[ { "id": "-0Cjhnl-dhK", "title": "Towards Uncertainties in Deep Learning that Are Accurate and Calibrated", "track": "main", "status": "Reject", "tldr": "", "abstract": "Predictive uncertainties can be characterized by two properties---calibration and sharpness. This paper introduces algorithms that ensure the calibration of any model while maintaining sharpness. They apply in both classification and regression and guarantee the strong property of distribution calibration, while being simpler and more broadly applicable than previous methods (especially in the context of neural networks, which are often miscalibrated). Importantly, these algorithms achieve a long-standing statistical principle that forecasts should maximize sharpness subject to being fully calibrated. Using our algorithms, machine learning models can under some assumptions be calibrated without sacrificing accuracy: in a sense, calibration can be a free lunch. Empirically, we find that our methods improve predictive uncertainties on several tasks with minimal computational and implementation overhead.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Volodymyr Kuleshov;Shachi Deshpande", "authorids": "~Volodymyr_Kuleshov1;~Shachi_Deshpande1", "gender": ";F", "homepage": "https://www.cs.cornell.edu/~kuleshov/;https://www.cs.cornell.edu/~shachi/", "dblp": "81/8612;201/2223.html", "google_scholar": "RY_t8XAAAAAJ;", "orcid": ";0000-0003-3223-4103", "linkedin": ";shachi-deshpande-b8bb22b0", "or_profile": "~Volodymyr_Kuleshov1;~Shachi_Deshpande1", "aff": "Cornell University;Amazon", "aff_domain": "cornell.edu;amazon.com", "position": "Assistant Professor;Intern", "bibtex": "@misc{\nkuleshov2022towards,\ntitle={Towards Uncertainties in Deep Learning that Are Accurate and Calibrated},\nauthor={Volodymyr Kuleshov and Shachi Deshpande},\nyear={2022},\nurl={https://openreview.net/forum?id=-0Cjhnl-dhK}\n}", "github": "", "project": "", "reviewers": "GeKX;fwct;EFrC;qXzz", "site": "https://openreview.net/forum?id=-0Cjhnl-dhK", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;2;3", "correctness": "2;3;1;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "116;52;86;73", "wc_summary_review": "67;49;92;55", "wc_main_review": "572;250;500;377", "wc_review": "755;351;678;505", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 81.75, 23.19886850689059 ], "wc_summary_review_avg": [ 65.75, 16.48294573187693 ], "wc_main_review_avg": [ 424.75, 122.6404806742048 ], "wc_review_avg": [ 572.25, 156.56847543487163 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ggJd72H16NgJ:scholar.google.com/&scioq=Towards+Uncertainties+in+Deep+Learning+that+Are+Accurate+and+Calibrated&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Cornell University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": 
"https://www.cornell.edu;https://www.amazon.com", "aff_unique_abbr": "Cornell;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "-0LuSWi6j4", "title": "Mind Your Bits and Errors: Prioritizing the Bits that Matter in Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Good likelihoods do not imply great sample quality. However, the precise manner in which models trained to achieve good likelihoods fail at sample quality remains poorly understood. In this work, we consider the task of image generative modeling with variational autoencoders and posit that the nature of high-dimensional image data distributions poses an intrinsic challenge. In particular, much of the entropy in these natural image distributions is attributable to visually imperceptible information. This signal dominates the training objective, giving models an easy way to achieve competitive likelihoods without successful modeling of the visually perceptible bits. Based on this hypothesis, we decompose the task of generative modeling explicitly into two steps: we first prioritize the modeling of visually perceptible information to achieve good sample quality, and then subsequently model the imperceptible information---the bulk of the likelihood signal---to achieve good likelihoods. Our work highlights the well-known adage that \"not all bits are created equal\" and demonstrates that this property can and should be exploited in the design of variational autoencoders.", "keywords": "deep generative models;variational autoencoders", "primary_area": "", "supplementary_material": "", "author": "Rui Shu;Stefano Ermon", "authorids": "~Rui_Shu1;~Stefano_Ermon1", "gender": "M;M", "homepage": "http://ruishu.github.io;http://cs.stanford.edu/~ermon/", "dblp": "146/0885;47/8135", "google_scholar": "UB7UZEYAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Rui_Shu1;~Stefano_Ermon1", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nshu2022mind,\ntitle={Mind Your Bits and Errors: Prioritizing the Bits that Matter in Variational Autoencoders},\nauthor={Rui Shu and Stefano Ermon},\nyear={2022},\nurl={https://openreview.net/forum?id=-0LuSWi6j4}\n}", "github": "", "project": "", "reviewers": "LBJj;sh3z;KKon;7Pio;Vnek", "site": "https://openreview.net/forum?id=-0LuSWi6j4", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;4;4;4;3", "correctness": "2;2;2;4;3", "technical_novelty": "3;2;3;2;3", "empirical_novelty": "2;2;4;2;4", "wc_summary_paper": "144;56;97;119;106", "wc_summary_review": "53;63;52;25;65", "wc_main_review": "510;503;569;336;342", "wc_review": "707;622;718;480;513", "wc_reply_reviewers": "248;200;258;0;0", "wc_reply_authors": "415;394;639;311;446", "reply_reviewers": "1;1;1;0;0", "reply_authors": "1;1;2;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.8 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.9797958971132712 ], "wc_summary_paper_avg": [ 104.4, 28.917814578560396 ], "wc_summary_review_avg": [ 51.6, 14.277254638059798 ], "wc_main_review_avg": [ 452.0, 95.08943158942533 ], "wc_review_avg": [ 608.0, 97.47409912381853 ], "wc_reply_reviewers_avg": [ 141.2, 116.94511533193679 ], "wc_reply_authors_avg": [ 441.0, 108.6406921921984 ], "reply_reviewers_avg": [ 0.6, 
0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6666666666666666, "corr_recommendation_correctness": 0.5833333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:e6dYESXIjmwJ:scholar.google.com/&scioq=Mind+Your+Bits+and+Errors:+Prioritizing+the+Bits+that+Matter+in+Variational+Autoencoders&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "-0qmvlqnVw4", "title": "How Frequency Effect Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph neural networks (GNNs) have demonstrated powerful expressiveness in graph representation with different message passing schemes, but fail to improve prediction performance by stacking layers because of over-smoothing. Research on the frequency principle in deep neural networks motivated us to explore the effect of frequency on the design of deep GNNs. In this work, we decompose input features into low-frequency and high-frequency signals and analyze the performance of different frequencies on GNNs as the depth increases. We prove that low-frequency signals are learned faster in GNNs, i.e., they are more prone to over-smoothing than high-frequency signals. Based on the frequency principle of GNNs, we present a novel, powerful GNN framework, Multi-Scale Frequency Enhanced Graph Neural Networks (MSF-GNNs), which considers multi-scale representations from wavelet decomposition. Specifically, we design an information propagation rule that considers the properties of different frequency signals and exploits their respective advantages for better node representations. To encourage consistent outputs across the multi-scale representations, we utilize a consistency-regularized loss. Extensive experiments demonstrate the effectiveness of the proposed MSF-GNNs on node classification compared to state-of-the-art methods. 
The theoretical study and experimental results further show the effectiveness of MSF-GNNs in relieving over-smoothing.", "keywords": "graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Xueqi Ma;Yubo Zhang;Weifeng Liu;Yue Gao", "authorids": "~Xueqi_Ma1;zhangyb17@mails.tsinghua.edu.cn;~Weifeng_Liu1;~Yue_Gao4", "gender": "F;;M;M", "homepage": ";;http://member.acm.org/~wfliu;http://www.gaoyue.org", "dblp": "194/4773;;23/1112-1;33/3099-2", "google_scholar": "https://scholar.google.com/citations?hl=en;;XHc6D58AAAAJ;UTDfWocAAAAJ", "orcid": ";;0000-0002-5388-9080;", "linkedin": ";;;", "or_profile": "~Xueqi_Ma1;zhangyb17@mails.tsinghua.edu.cn;~Weifeng_Liu1;~Yue_Gao4", "aff": "University of Melbourne;;China University of Petroleum (East China);Tsinghua University", "aff_domain": "unimelb.edu;;upc.edu.cn;tsinghua.edu.cn", "position": "PhD student;;Full Professor;Associate Professor", "bibtex": "@misc{\nma2022how,\ntitle={How Frequency Effect Graph Neural Networks},\nauthor={Xueqi Ma and Yubo Zhang and Weifeng Liu and Yue Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=-0qmvlqnVw4}\n}", "github": "", "project": "", "reviewers": "3k2q;CaWn;iA6y;34QC", "site": "https://openreview.net/forum?id=-0qmvlqnVw4", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;4;4", "correctness": "2;3;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "75;122;78;30", "wc_summary_review": "191;60;49;44", "wc_main_review": "754;267;395;364", "wc_review": "1020;449;522;438", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "17;66;41;34", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 76.25, 32.545160930620696 ], "wc_summary_review_avg": [ 86.0, 60.897454790820284 ], "wc_main_review_avg": [ 445.0, 184.544032685969 ], "wc_review_avg": [ 607.25, 240.47803953791706 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 39.5, 17.613914953808536 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Melbourne;China University of Petroleum;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.unimelb.edu.au;http://www.cup.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UniMelb;CUP;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";East China", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Australia;China" }, { "id": "-29uFS4FiDZ", "title": "Word Sense Induction with Knowledge Distillation from BERT", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-trained contextual language models are ubiquitously employed for language understanding tasks, but are unsuitable for resource-constrained systems. Noncontextual word embeddings are an efficient alternative in these settings. Such methods typically use one vector to encode multiple different meanings of a word, and incur errors due to polysemy. 
This paper proposes a two-stage method to distill multiple word senses from a pre-trained language model (BERT) by using attention over the senses of a word in a context and transferring this sense information to fit multi-sense embeddings in a skip-gram-like framework. We demonstrate an effective approach to training the sense disambiguation mechanism in our model with a distribution over word senses extracted from the output layer embeddings of BERT. Experiments on the contextual word similarity and sense induction tasks show that this method is superior to or competitive with state-of-the-art multi-sense embeddings on multiple benchmark data sets, and experiments with an embedding-based topic model (ETM) demonstrate the benefits of using this multi-sense embedding in a downstream application.\n", "keywords": "word embeddings;sense embeddings;word sense induction", "primary_area": "", "supplementary_material": "", "author": "Anik Saha;Alex Gittens;Bulent Yener", "authorids": "~Anik_Saha1;~Alex_Gittens1;~Bulent_Yener2", "gender": "M;M;", "homepage": ";http://www.cs.rpi.edu/~gittea;", "dblp": ";22/10359;", "google_scholar": ";qN1Ss8EAAAAJ;", "orcid": ";;", "linkedin": "https://linkedin.com/in/aniksh;;", "or_profile": "~Anik_Saha1;~Alex_Gittens1;~Bulent_Yener2", "aff": "Rensselaer Polytechnic Institute;Rensselaer Polytechnic Institute;", "aff_domain": "rpi.edu;rpi.edu;", "position": "PhD student;Assistant Professor;", "bibtex": "@misc{\nsaha2022word,\ntitle={Word Sense Induction with Knowledge Distillation from {BERT}},\nauthor={Anik Saha and Alex Gittens and Bulent Yener},\nyear={2022},\nurl={https://openreview.net/forum?id=-29uFS4FiDZ}\n}", "github": "", "project": "", "reviewers": "oDq7;2C3z;xBJd;pvxe;npDw", "site": "https://openreview.net/forum?id=-29uFS4FiDZ", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;5;4;3;3", "correctness": "2;3;3;3;4", "technical_novelty": "3;1;3;3;4", "empirical_novelty": "3;2;2;3;2", "wc_summary_paper": "46;80;95;109;40", "wc_summary_review": "43;16;60;138;53", "wc_main_review": "367;298;227;210;111", "wc_review": "456;394;382;457;204", "wc_reply_reviewers": "0;0;0;0;27", "wc_reply_authors": "263;330;278;93;139", "reply_reviewers": "0;0;0;0;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.8, 0.9797958971132712 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 74.0, 26.98888660171071 ], "wc_summary_review_avg": [ 62.0, 40.83625839863392 ], "wc_main_review_avg": [ 242.6, 86.2197193222061 ], "wc_review_avg": [ 378.6, 92.5971921820527 ], "wc_reply_reviewers_avg": [ 5.4, 10.8 ], "wc_reply_authors_avg": [ 220.6, 89.44406073071593 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.48795003647426666, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17224652787978676262&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Rensselaer Polytechnic Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.rpi.edu", "aff_unique_abbr": "RPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "-3Qj7Jl6UP5", "title": "The magnitude 
vector of images", "track": "main", "status": "Reject", "tldr": "", "abstract": "The magnitude of a finite metric space is a recently-introduced invariant quantity. Despite beneficial theoretical and practical properties, such as a general utility for outlier detection, and a close connection to Laplace radial basis kernels, magnitude has received little attention by the machine learning community so far. In this work, we investigate the properties of magnitude on individual images, with each image forming its own metric space. We show that the known properties of outlier detection translate to edge detection in images and we give supporting theoretical justifications. In addition, we provide a proof of concept of its utility by using a novel magnitude layer to defend against adversarial attacks. Since naive magnitude calculations may be computationally prohibitive, we introduce an algorithm that leverages the regular structure of images to dramatically reduce the computational cost.", "keywords": "magnitude;magnitude vector;edge detection;adversarial robustness;metric space;algebraic topology", "primary_area": "", "supplementary_material": "/attachment/73cf585db2ee40b4f210260fda4380309a258564.zip", "author": "Michael F Adamer;Leslie O'Bray;Edward De Brouwer;Bastian Rieck;Karsten Borgwardt", "authorids": "~Michael_F_Adamer1;~Leslie_O'Bray1;~Edward_De_Brouwer1;~Bastian_Rieck1;~Karsten_Borgwardt2", "gender": ";;M;M;", "homepage": ";https://www.leslieobray.com;https://edwarddebrouwer.xyz;https://bastian.rieck.me;https://www.biochem.mpg.de/borgwardt", "dblp": "203/8407.html;;;119/8860;11/3733.html", "google_scholar": "https://scholar.google.ch/citations?user=tgk7w18AAAAJ;;-Pm4XtAAAAAJ;https://scholar.google.ch/citations?user=La7zuKQAAAAJ;v3JsjMYAAAAJ", "orcid": "0000-0002-8996-7167;;;0000-0003-4335-0302;0000-0001-7221-2393", "linkedin": ";leslie-o-bray-722574a7/;edwarddebrouwer/;br-ml/;", "or_profile": "~Michael_F_Adamer1;~Leslie_O'Bray1;~Edward_De_Brouwer1;~Bastian_Rieck1;~Karsten_Borgwardt2", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;KU Leuven;Helmholtz Zentrum M\u00fcnchen;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;kuleuven.be;helmholtz-munich.de;ethz.ch", "position": "Postdoc;PhD student;PhD student;Principal Investigator;Full Professor", "bibtex": "@misc{\nadamer2022the,\ntitle={The magnitude vector of images},\nauthor={Michael F Adamer and Leslie O'Bray and Edward De Brouwer and Bastian Rieck and Karsten Borgwardt},\nyear={2022},\nurl={https://openreview.net/forum?id=-3Qj7Jl6UP5}\n}", "github": "", "project": "", "reviewers": "cxFw;f2ou;qYUC;WNxv", "site": "https://openreview.net/forum?id=-3Qj7Jl6UP5", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;3;3;4", "correctness": "3;3;3;2", "technical_novelty": "1;3;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "100;152;73;32", "wc_summary_review": "49;14;85;19", "wc_main_review": "236;445;343;204", "wc_review": "385;611;501;255", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.25, 43.573931426943794 ], "wc_summary_review_avg": [ 41.75, 28.331740151286155 ], "wc_main_review_avg": [ 307.0, 94.855152732996 ], "wc_review_avg": [ 438.0, 
132.47263868437136 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8072973606690950633&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "Swiss Federal Institute of Technology;Katholieke Universiteit Leuven;Helmholtz Zentrum M\u00fcnchen;ETH Zurich", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ethz.ch;https://www.kuleuven.be;https://www.helmholtz-muenchen.de;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;KU Leuven;;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0", "aff_country_unique": "Switzerland;Belgium;Germany" }, { "id": "-3yxxvDis3L", "title": "How to Improve Sample Complexity of SGD over Highly Dependent Data?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conventional machine learning applications typically assume that data samples are independently and identically distributed (i.i.d.). However, many practical scenarios naturally involve a data-generating process that produces highly dependent data samples, which are known to heavily bias the stochastic optimization process and slow down the convergence of learning. In this paper, we conduct a fundamental study on how to facilitate the convergence of SGD over highly dependent data using different popular update schemes. Specifically, with a $\\phi$-mixing model that captures both exponential and polynomial decay of the data dependence over time, we show that SGD with periodic data-subsampling achieves an improved sample complexity over the standard SGD in the full spectrum of the $\\phi$-mixing data dependence. Moreover, we show that by fully utilizing the data, mini-batch SGD can further substantially improve the sample complexity with highly dependent data. Numerical experiments validate our theory. 
", "keywords": "Dependent data sampling;SGD;sample complexity", "primary_area": "", "supplementary_material": "/attachment/84bbcbdcd719ef2e26ea0ea7b5cc8abc69926113.zip", "author": "Shaocong Ma;Ziyi Chen;Yi Zhou;Kaiyi Ji;Yingbin Liang", "authorids": "~Shaocong_Ma1;~Ziyi_Chen2;~Yi_Zhou2;~Kaiyi_Ji1;~Yingbin_Liang1", "gender": "M;M;M;M;F", "homepage": "https://mshaocong.github.io/;;https://sites.google.com/site/yizhouhomepage/home;https://cse.buffalo.edu/~kaiyiji/;https://sites.google.com/view/yingbinliang/home", "dblp": "270/3742;37/1439-2;;205/3164;51/332", "google_scholar": ";zjSBVOIAAAAJ;4fK8bYIAAAAJ;E0A3lSIAAAAJ;lGgLAiIAAAAJ", "orcid": ";;;;", "linkedin": ";ziyi-chen-84616184/;;;", "or_profile": "~Shaocong_Ma1;~Ziyi_Chen2;~Yi_Zhou2;~Kaiyi_Ji1;~Yingbin_Liang1", "aff": "Lawrence Livermore National Labs;University of Utah;University of Utah;University of Michigan - Ann Arbor;The Ohio State University", "aff_domain": "llnl.gov;utah.edu;utah.edu;umich.edu;osu.edu", "position": "Intern;PhD student;Assistant Professor;Postdoc;Professor", "bibtex": "@misc{\nma2022how,\ntitle={How to Improve Sample Complexity of {SGD} over Highly Dependent Data?},\nauthor={Shaocong Ma and Ziyi Chen and Yi Zhou and Kaiyi Ji and Yingbin Liang},\nyear={2022},\nurl={https://openreview.net/forum?id=-3yxxvDis3L}\n}", "github": "", "project": "", "reviewers": "RcE4;PELk;xaGe;J2CF", "site": "https://openreview.net/forum?id=-3yxxvDis3L", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "29;24;244;88", "wc_summary_review": "13;22;21;75", "wc_main_review": "162;251;404;238", "wc_review": "204;297;669;401", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.25, 88.9392348741544 ], "wc_summary_review_avg": [ 32.75, 24.641174890820444 ], "wc_main_review_avg": [ 263.75, 87.81906114278381 ], "wc_review_avg": [ 392.75, 174.05225508450042 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dk0PHg2CUfcJ:scholar.google.com/&scioq=How+to+Improve+Sample+Complexity+of+SGD+over+Highly+Dependent+Data%3F&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Lawrence Livermore National Laboratory;University of Utah;University of Michigan;Ohio State University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.llnl.gov;https://www.utah.edu;https://www.umich.edu;https://www.osu.edu", "aff_unique_abbr": "LLNL;Utah;UM;OSU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "-4hMlsXK4st", "title": "Improving Robustness with Optimal Transport based Adversarial Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep nets have proven to be brittle against crafted adversarial 
examples. One of the main reasons is that the representations of the adversarial examples gradually become more divergent from those of the benign examples when fed forward to higher layers of deep nets. To remedy susceptibility to adversarial examples, it is natural to mitigate this divergence. In this paper, leveraging the richness and rigor of optimal transport (OT) theory, we propose an OT-based adversarial generalization technique that helps strengthen the classifier for tackling adversarial examples. The main idea of our proposed method is to examine a specific Wasserstein (WS) distance between the adversarial and benign joint distributions on an intermediate layer of a deep net, which can further be interpreted from a clustering view of OT as a generalization technique. More specifically, by minimizing the WS distance of interest, an adversarial example is pushed toward the cluster of benign examples sharing the same label on the latent space, which helps to strengthen the generalization ability of the classifier on the adversarial examples. Our comprehensive experiments against state-of-the-art adversarial training and latent-space defense approaches indicate the significant superiority of our method under specific attacks of various distortion sizes. The results demonstrate improvements in robust accuracy of up to $5\\%$ against the PGD attack on CIFAR-100 over the SOTA methods.", "keywords": "Optimal Transport;Adversarial Machine Learning;Adversarial Training", "primary_area": "", "supplementary_material": "", "author": "Siqi Xia;Shijie Liu;Trung Le;Dinh Phung;Sarah Erfani;Benjamin I. P. Rubinstein;Christopher Leckie;Paul Montague", "authorids": "~Siqi_Xia1;~Shijie_Liu4;~Trung_Le2;~Dinh_Phung2;~Sarah_Erfani1;~Benjamin_I._P._Rubinstein1;~Christopher_Leckie1;~Paul_Montague1", "gender": "F;M;M;M;;M;M;", "homepage": ";https://github.com/shijiel2;;http://www.bipr.net/;;;https://research.monash.edu/en/persons/dinh-phung;https://people.eng.unimelb.edu.au/smonazam/", "dblp": ";;;90/1092;73/1139;50/805;71/5859;136/0170", "google_scholar": ";https://scholar.google.com.au/citations?user=lH5nxwMAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.au/citations?user=hMG_gR4AAAAJ;https://scholar.google.com.au/citations?user=wUsI0cAAAAAJ;;https://scholar.google.com.au/citations?user=OtA9SwIAAAAJ;https://scholar.google.com.au/citations?user=Jq9ocx4AAAAJ", "orcid": ";0009-0008-2980-6266;;0000-0002-2947-6980;;0000-0001-9461-7471;0000-0002-9977-8247;", "linkedin": "siqi-xia-16226286/;;;benjaminrubinstein/;;;https://linkedin.com/in/dinh-phung-6b537a6;", "or_profile": "~Siqi_Xia1;~Shijie_Liu4;~Trung_Le2;~Benjamin_I._P._Rubinstein1;~Christopher_Leckie1;~Paul_Montague1;~Dinh_Phung1;~Sarah_Monazam_Erfani1", "aff": "Deakin University;The University of Melbourne;Monash University;The University of Melbourne;The University of Melbourne;Defence Science and Technology Group;Monash University;University of Melbourne", "aff_domain": "deakin.edu.au;unimelb.edu.au;monash.edu;unimelb.edu.au;unimelb.edu.au;dst.defence.gov.au;monash.edu;unimelb.edu", "position": "PhD student;PhD student;Assistant Professor;Associate Professor;Full Professor;Researcher;Full Professor;Assistant Professor", "bibtex": "@misc{\nxia2022improving,\ntitle={Improving Robustness with Optimal Transport based Adversarial Generalization},\nauthor={Siqi Xia and Shijie Liu and Trung Le and Dinh Phung and Sarah Erfani and Benjamin I. P. 
Rubinstein and Christopher Leckie and Paul Montague},\nyear={2022},\nurl={https://openreview.net/forum?id=-4hMlsXK4st}\n}", "github": "", "project": "", "reviewers": "ocTv;jT46;VcaH;BqPb;De4n", "site": "https://openreview.net/forum?id=-4hMlsXK4st", "pdf_size": 0, "recommendation": "5;5;5;5;6", "confidence": "4;4;4;2;4", "correctness": "3;3;4;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;0;4;3", "wc_summary_paper": "59;70;140;95;80", "wc_summary_review": "74;22;143;29;25", "wc_main_review": "385;283;495;294;187", "wc_review": "518;375;778;418;292", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.2, 0.39999999999999997 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 1.32664991614216 ], "wc_summary_paper_avg": [ 88.8, 28.20921835145384 ], "wc_summary_review_avg": [ 58.6, 46.27137343974134 ], "wc_main_review_avg": [ 328.8, 104.1122471181945 ], "wc_review_avg": [ 476.2, 167.56419665310366 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.25, "corr_recommendation_correctness": -0.25000000000000006, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IKFr6Dc9yloJ:scholar.google.com/&scioq=Improving+Robustness+with+Optimal+Transport+based+Adversarial+Generalization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;1;3;2;1", "aff_unique_norm": "Deakin University;University of Melbourne;Monash University;Defence Science and Technology Group", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.deakin.edu.au;https://www.unimelb.edu.au;https://www.monash.edu;https://www.dst.defence.gov.au/", "aff_unique_abbr": "Deakin;UniMelb;Monash;DST Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "Australia" }, { "id": "-6me0AsJVdu", "title": "Model Validation Using Mutated Training Labels: An Exploratory Study", "track": "main", "status": "Reject", "tldr": "", "abstract": "For out-of-sample validation, the sample set may be too small to be representative of the data distribution; the accuracy can have a large variance across different runs; excessive reuse of a fixed set of samples can lead to overfitting even if the samples are held out and not used in the training process. This paper introduces an exploratory study on Mutation Validation (MV), a model validation method using mutated training labels for supervised learning. MV mutates training data labels, retrains the model against the mutated data, then uses the metamorphic relation capturing the consequent training performance changes to assess model fit. It uses neither validation nor test set. The intuition underpinning MV is that overfitted models tend to fit noise in the training data. We explore 8 different learning algorithms, 18 datasets, and 5 types of hyperparameter tuning tasks. Our results demonstrate that MV is accurate in model selection: the model recommendation hit rate is 92% for MV and less than 60% for out-of-sample validation. 
MV also provides more stable hyperparameter tuning results than out-of-sample validation across different runs.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/5562f13312e24430f8a65bc718650f05eca6662b.zip", "author": "Jie Zhang;Mark Harman;Benjamin Guedj;Earl Barr;John Shawe-Taylor", "authorids": "~Jie_Zhang21;~Mark_Harman1;~Benjamin_Guedj1;~Earl_Barr2;~John_Shawe-Taylor1", "gender": "F;M;M;M;M", "homepage": "https://sites.google.com/view/jie-zhang;http://www0.cs.ucl.ac.uk/staff/M.Harman/;https://bguedj.github.io;https://www.earlbarr.com;", "dblp": "84/6889-50;h/MarkHarman.html;177/7258;http://dblp.uni-trier.de/pers/hd/b/Barr:Earl_T=;59/41", "google_scholar": "rPWRqf8AAAAJ;https://scholar.google.com.tw/citations?user=IwSN8IgAAAAJ;https://scholar.google.fr/citations?user=q-JTC2sAAAAJ;https://scholar.google.co.uk/citations?user=lSvqKPkAAAAJ;", "orcid": ";0000-0002-5864-4488;0000-0003-1237-7430;0000-0003-0771-7891?lang=en;", "linkedin": "jie-zhang-5326aa187/;markharman/?originalSubdomain=uk;benjaminguedj/;earlbarr;", "or_profile": "~Jie_Zhang21;~Mark_Harman1;~Benjamin_Guedj1;~Earl_Barr2;~John_Shawe-Taylor1", "aff": "King's College London, University of London;University College London, University of London;University College London, University of London;University College London, University of London;University College London", "aff_domain": "kcl.ac.uk;ucl.ac.uk;ucl.ac.uk;ucl.ac.uk;ucl.ac.uk", "position": "Lecturer;Full Professor;Principal Researcher;Professor;Professor", "bibtex": "@misc{\nzhang2022model,\ntitle={Model Validation Using Mutated Training Labels: An Exploratory Study},\nauthor={Jie Zhang and Mark Harman and Benjamin Guedj and Earl Barr and John Shawe-Taylor},\nyear={2022},\nurl={https://openreview.net/forum?id=-6me0AsJVdu}\n}", "github": "", "project": "", "reviewers": "S95g;11Jy;1Tm6;FmyF", "site": "https://openreview.net/forum?id=-6me0AsJVdu", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;3;4", "correctness": "2;2;4;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;2;4;4", "wc_summary_paper": "56;140;197;75", "wc_summary_review": "53;39;77;74", "wc_main_review": "1880;454;360;179", "wc_review": "1989;633;634;328", "wc_reply_reviewers": "465;16;18;5", "wc_reply_authors": "1259;787;683;321", "reply_reviewers": "1;1;1;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 117.0, 55.70906568952669 ], "wc_summary_review_avg": [ 60.75, 15.594470173750693 ], "wc_main_review_avg": [ 718.25, 677.979488996533 ], "wc_review_avg": [ 896.0, 643.2507287209241 ], "wc_reply_reviewers_avg": [ 126.0, 195.7843201076123 ], "wc_reply_authors_avg": [ 762.5, 334.7965800303223 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.16012815380508713, "corr_recommendation_correctness": 0.5853694070049635, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3995372709600052368&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "King's College London;University College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.kcl.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "KCL;UCL", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Hyperparameter Tuning with Renyi Differential Privacy", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6746", "id": "-70L8lpp9DF", "poster": "", "openreview": "https://openreview.net/forum?id=-70L8lpp9DF", "slides": "https://iclr.cc/virtual/2022/poster/6746", "video": "https://iclr.cc/virtual/2022/poster/6746", "author_site": "Nicolas Papernot, Thomas Steinke", "tldr": "", "abstract": "For many differentially private algorithms, such as the prominent noisy stochastic gradient descent (DP-SGD), the analysis needed to bound the privacy leakage of a single training run is well understood. However, few studies have reasoned about the privacy leakage resulting from the multiple training runs needed to fine tune the value of the training algorithm\u2019s hyperparameters. In this work, we first illustrate how simply setting hyperparameters based on non-private training runs can leak private information. Motivated by this observation, we then provide privacy guarantees for hyperparameter search procedures within the framework of Renyi Differential Privacy. Our results improve and extend the work of Liu and Talwar (STOC 2019). Our analysis supports our previous observation that tuning hyperparameters does indeed leak private information, but we prove that, under certain assumptions, this leakage is modest, as long as each candidate training run needed to select hyperparameters is itself differentially private.", "keywords": "differential privacy;hyperparameter tuning", "primary_area": "", "supplementary_material": "", "author": "Nicolas Papernot;Thomas Steinke", "authorids": "~Nicolas_Papernot1;~Thomas_Steinke2", "gender": "M;M", "homepage": "https://www.papernot.fr;http://www.thomas-steinke.net/", "dblp": "162/1405;https://dblp.uni-trier.de/pid/73/4025-2.html", "google_scholar": "cGxq0cMAAAAJ;kwnwhrgAAAAJ", "orcid": ";", "linkedin": "nicolaspapernot;thomas-steinke-2841248/", "or_profile": "~Nicolas_Papernot1;~Thomas_Steinke2", "aff": "Google;Google", "aff_domain": "google.com;google.com", "position": "Research Scientist;Research Scientist", "bibtex": "@inproceedings{\npapernot2022hyperparameter,\ntitle={Hyperparameter Tuning with Renyi Differential Privacy},\nauthor={Nicolas Papernot and Thomas Steinke},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-70L8lpp9DF}\n}", "github": "", "project": "", "reviewers": "mxg8;4Ysq;CBQT;SYbz", "pdf_size": 0, "recommendation": "6;8;8;10", "confidence": "4;3;3;5", "correctness": "4;3;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "54;78;125;244", "wc_summary_review": "8;66;17;73", "wc_main_review": "296;337;155;926", "wc_review": "358;481;297;1243", "wc_reply_reviewers": "10;94;10;35", "wc_reply_authors": "287;358;458;887", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 8.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 125.25, 73.16206325685464 ], "wc_summary_review_avg": [ 41.0, 28.78367592924851 ], "wc_main_review_avg": [ 428.5, 295.0580451368849 ], "wc_review_avg": [ 594.75, 380.0897098054616 ], "wc_reply_reviewers_avg": [ 37.25, 
34.31745182847934 ], "wc_reply_authors_avg": [ 497.5, 232.93829655082482 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.4264014327112209, "corr_recommendation_correctness": 0.0, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=363210376404492572&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=-70L8lpp9DF", "email": "google.com;google.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "-7NOEQcD-xH", "title": "Deep Ensemble Policy Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Ensemble learning, which can consistently improve the prediction performance in supervised learning, has drawn increasing attention in reinforcement learning (RL). However, most related works focus on adopting ensemble methods in environment dynamics modeling and value function approximation, which are essentially supervised learning tasks within the RL regime. Moreover, considering the inevitable difference between RL and supervised learning, the conclusions or theories of existing ensemble supervised learning cannot be directly applied to policy learning in RL. Adapting ensemble methods to policy learning has not been well studied and remains an open problem. In this work, we propose to learn the ensemble policies under the same RL objective in an end-to-end manner, in which sub-policy training and policy ensemble are combined organically and optimized simultaneously. We further theoretically prove that ensemble policy learning can improve exploration efficacy by increasing the entropy of the action distribution. In addition, we incorporate a regularization of diversity enhancement over the policy space which retains the ability of the ensemble policy to generalize to unseen states. 
The experimental results on two complex grid-world environments and one real-world application demonstrate that our proposed method achieves significantly higher sample efficiency and better policy generalization performance.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengyu Yang;Kan Ren;Xufang Luo;Weiqing Liu;Jiang Bian;Weinan Zhang;Dongsheng Li", "authorids": "~Zhengyu_Yang1;~Kan_Ren1;~Xufang_Luo1;~Weiqing_Liu1;~Jiang_Bian1;~Weinan_Zhang1;~Dongsheng_Li2", "gender": "M;M;F;;M;M;M", "homepage": ";https://saying.ren;;;https://sites.google.com/view/jiangbian;http://wnzhang.net;http://recmind.cn", "dblp": "159/1188;28/7458;218/7350;;09/851-2.html;28/10261-1;254/0830-2.html", "google_scholar": ";USnQVWgAAAAJ;;;pZBEnY8AAAAJ;Qzss0GEAAAAJ;VNg5rA8AAAAJ", "orcid": ";;;;0000-0002-9472-600X;0000-0002-0127-2425;0000-0003-3103-8442", "linkedin": ";;;weiqing-liu-09646b91/;jbian/;;", "or_profile": "~Zhengyu_Yang1;~Kan_Ren1;~Xufang_Luo1;~Weiqing_Liu1;~Jiang_Bian1;~Weinan_Zhang1;~Dongsheng_Li2", "aff": "Shanghai Jiaotong University;Microsoft;Microsoft Research;;Microsoft;Shanghai Jiaotong University;Microsoft Research Asia", "aff_domain": "sjtu.edu.cn;microsoft.com;microsoft.com;;microsoft.com;sjtu.edu.cn;microsoft.com", "position": "MS student;Researcher;Researcher;;Partner Research Manager;Associate Professor;Principal Researcher", "bibtex": "@misc{\nyang2022deep,\ntitle={Deep Ensemble Policy Learning},\nauthor={Zhengyu Yang and Kan Ren and Xufang Luo and Weiqing Liu and Jiang Bian and Weinan Zhang and Dongsheng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=-7NOEQcD-xH}\n}", "github": "", "project": "", "reviewers": "L5en;qsDK;fjBk;mGqe", "site": "https://openreview.net/forum?id=-7NOEQcD-xH", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;3;4", "correctness": "4;4;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "30;166;109;29", "wc_summary_review": "17;48;51;13", "wc_main_review": "265;433;337;202", "wc_review": "312;647;497;244", "wc_reply_reviewers": "0;209;0;0", "wc_reply_authors": "902;814;652;764", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 83.5, 57.638962516686576 ], "wc_summary_review_avg": [ 32.25, 17.340343133859836 ], "wc_main_review_avg": [ 309.25, 85.94293164652926 ], "wc_review_avg": [ 425.0, 158.1123018616831 ], "wc_reply_reviewers_avg": [ 52.25, 90.49965469547384 ], "wc_reply_authors_avg": [ 783.0, 90.33825324855468 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QftX7uSog_oJ:scholar.google.com/&scioq=Deep+Ensemble+Policy+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "SJTU;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;1;0;0", "aff_country_unique": "China;United States" }, { 
"id": "-7UeX2KPqs", "title": "State-Action Joint Regularized Implicit Policy for Offline Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning enables learning from a fixed dataset, without further interactions with the environment. The lack of environmental interactions makes the policy training vulnerable to state-action pairs far from the training dataset and prone to missing rewarding actions. For training more effective agents, we propose a framework that supports learning a flexible and well-regularized policy, which consists of a fully implicit policy and a regularization through the state-action visitation frequency induced by the current policy and that induced by the data-collecting behavior policy. We theoretically show the equivalence between policy-matching and state-action-visitation matching, and thus the compatibility of many prior work with our framework. An effective instantiation of our framework through the GAN structure is provided, together with some techniques to explicitly smooth the state-action mapping for robust generalization beyond the static dataset. Extensive experiments and ablation study on the D4RL dataset validate our framework and the effectiveness of our algorithmic designs.", "keywords": "Implicit Policy;State-action Visitation;Distribution Matching;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Shentao Yang;Zhendong Wang;Huangjie Zheng;Mingyuan Zhou", "authorids": "~Shentao_Yang1;~Zhendong_Wang1;~Huangjie_Zheng1;~Mingyuan_Zhou1", "gender": "M;M;M;M", "homepage": ";https://zhendong-wang.github.io/;;http://mingyuanzhou.github.io", "dblp": ";;192/2170;", "google_scholar": "https://scholar.google.com/citations?hl=en;lRiIjhcAAAAJ;Vl5wCXsAAAAJ;LXwCIisAAAAJ", "orcid": "0009-0009-8058-3149;;0000-0003-0508-5034;", "linkedin": "shentaoyang/;;;", "or_profile": "~Shentao_Yang1;~Zhendong_Wang1;~Huangjie_Zheng1;~Mingyuan_Zhou1", "aff": "University of Texas at Austin;University of Texas at Austin;University of Texas, Austin;The University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;utexas.edu", "position": "PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nyang2022stateaction,\ntitle={State-Action Joint Regularized Implicit Policy for Offline Reinforcement Learning},\nauthor={Shentao Yang and Zhendong Wang and Huangjie Zheng and Mingyuan Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=-7UeX2KPqs}\n}", "github": "", "project": "", "reviewers": "Ht1b;aVY7;H1Xc", "site": "https://openreview.net/forum?id=-7UeX2KPqs", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "57;243;49", "wc_summary_review": "85;55;37", "wc_main_review": "479;260;318", "wc_review": "621;558;404", "wc_reply_reviewers": "958;0;0", "wc_reply_authors": "5294;390;1279", "reply_reviewers": "3;0;0", "reply_authors": "11;1;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 116.33333333333333, 89.62638500402038 ], "wc_summary_review_avg": [ 59.0, 19.79898987322333 ], "wc_main_review_avg": [ 352.3333333333333, 92.64388209098801 ], "wc_review_avg": [ 527.6666666666666, 91.14944992824817 ], 
"wc_reply_reviewers_avg": [ 319.3333333333333, 451.6055309178084 ], "wc_reply_authors_avg": [ 2321.0, 2133.3271354076633 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 4.666666666666667, 4.4969125210773475 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CaGqdZzJXBUJ:scholar.google.com/&scioq=State-Action+Joint+Regularized+Implicit+Policy+for+Offline+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "-7usTUgt7N", "title": "Implicit vs Unfolded Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "It has been observed that graph neural networks (GNN) sometimes struggle to maintain a healthy balance between modeling long-range dependencies across nodes while avoiding unintended consequences such as oversmoothed node representations. To address this issue (among other things), two separate strategies have recently been proposed, namely implicit and unfolded GNNs. The former treats node representations as the fixed points of a deep equilibrium model that can efficiently facilitate arbitrary implicit propagation across the graph with a fixed memory footprint. In contrast, the latter involves treating graph propagation as the unfolded descent iterations as applied to some graph-regularized energy function. While motivated differently, in this paper we carefully elucidate the similarity and differences of these methods, quantifying explicit situations where the solutions they produced may actually be equivalent and others where behavior diverges. This includes the analysis of convergence, representational capacity, and interpretability. 
We also provide empirical head-to-head comparisons across a variety of synthetic and public real-world benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Yongyi;Yangkun Wang;Zengfeng Huang;David Wipf", "authorids": "~Yang_Yongyi1;~Yangkun_Wang1;~Zengfeng_Huang1;~David_Wipf1", "gender": "M;;M;M", "homepage": "https://fftyyy.github.io;;https://zengfenghuang.github.io/;http://www.davidwipf.com/", "dblp": "05/3653;;97/9726;81/6421", "google_scholar": "EmL0jD0AAAAJ;;https://scholar.google.com.hk/citations?user=FwNBuXUAAAAJ;YJx1WSgAAAAJ", "orcid": ";;0000-0003-2671-7483;", "linkedin": "yongyi-yang-528922218/?originalSubdomain=cn;;;", "or_profile": "~Yang_Yongyi1;~Yangkun_Wang1;~Zengfeng_Huang1;~David_Wipf1", "aff": "Fudan University;;Fudan University;Amazon AI Research Lab", "aff_domain": "fudan.edu.cn;;fudan.edu.cn;amazon.com", "position": "Undergrad student;;Associate Professor;Principal Research Scientist", "bibtex": "@misc{\nyongyi2022implicit,\ntitle={Implicit vs Unfolded Graph Neural Networks},\nauthor={Yang Yongyi and Yangkun Wang and Zengfeng Huang and David Wipf},\nyear={2022},\nurl={https://openreview.net/forum?id=-7usTUgt7N}\n}", "github": "", "project": "", "reviewers": "Ht6n;ngBm;3stV;Ex9H", "site": "https://openreview.net/forum?id=-7usTUgt7N", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;3;3", "correctness": "2;3;4;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "30;54;76;42", "wc_summary_review": "58;20;1542;54", "wc_main_review": "446;222;74;296", "wc_review": "534;296;1692;392", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 50.5, 16.99264546796643 ], "wc_summary_review_avg": [ 418.5, 648.8210462061169 ], "wc_main_review_avg": [ 259.5, 134.09977628616687 ], "wc_review_avg": [ 728.5, 562.6835256163095 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9409368653193530198&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1", "aff_unique_norm": "Fudan University;Amazon", "aff_unique_dep": ";Amazon AI Research Lab", "aff_unique_url": "https://www.fudan.edu.cn;https://www.amazon.com", "aff_unique_abbr": "Fudan;Amazon AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "NETWORK INSENSITIVITY TO PARAMETER NOISE VIA PARAMETER ATTACK DURING TRAINING", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7062", "id": "-8sBpe7rDiV", "poster": "", "openreview": "https://openreview.net/forum?id=-8sBpe7rDiV", "slides": "https://iclr.cc/virtual/2022/poster/7062", "video": "https://iclr.cc/virtual/2022/poster/7062", "author_site": "Julian B\u00fcchel, Fynn Faber, Dylan R Muir", "tldr": "", "abstract": "Neuromorphic neural network processors, in the form of compute-in-memory 
crossbar arrays of memristors, or in the form of subthreshold analog and mixed-signal ASICs, promise enormous advantages in compute density and energy efficiency for NN-based ML tasks. However, these technologies are prone to computational non-idealities, due to process variation and intrinsic device physics. This degrades the task performance of networks deployed to the processor, by introducing parameter noise into the deployed model. While it is possible to calibrate each device, or train networks individually for each processor, these approaches are expensive and impractical for commercial deployment. Alternative methods are therefore needed to train networks that are inherently robust against parameter variation, as a consequence of network architecture and parameters. We present a new network training algorithm that attacks network parameters during training, and promotes robust performance during inference in the face of random parameter variation. Our approach introduces a loss regularization term that penalizes the susceptibility of a network to weight perturbation. We compare against previous approaches for producing parameter insensitivity such as dropout, weight smoothing and introducing parameter noise during training. We show that our approach produces models that are more robust to random mismatch-induced parameter variation as well as to targeted parameter variation. Our approach finds minima in flatter locations in the weight-loss landscape compared with other approaches, highlighting that the networks found by our technique are less sensitive to parameter perturbation. Our work provides an approach to deploy neural network architectures to inference devices that suffer from computational non-idealities, with minimal loss of performance. 
This method will enable deployment at scale to novel energy-efficient computational substrates, promoting cheaper and more prevalent edge inference.", "keywords": "parameter attack;adversarial attack;neural network;deep learning;optimisation;neuromorphic processor", "primary_area": "", "supplementary_material": "/attachment/7ffbeeb53e08296882cdec5ad8afc7050521ca29.zip", "author": "Julian B\u00fcchel;Fynn Firouz Faber;Dylan Richard Muir", "authorids": "~Julian_B\u00fcchel1;~Fynn_Firouz_Faber1;~Dylan_Richard_Muir1", "gender": "M;M;M", "homepage": "https://research.ibm.com/people/julian-buchel;https://github.com/faberf;http://dylan-muir.com", "dblp": ";;https://dblp.uni-trier.de/pid/12/6325", "google_scholar": "AXl8G8sAAAAJ;;CLP_WPsAAAAJ", "orcid": ";;0000-0003-3856-826X", "linkedin": ";;dylanmuir", "or_profile": "~Julian_B\u00fcchel1;~Fynn_Firouz_Faber1;~Dylan_Richard_Muir1", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;SynSense", "aff_domain": "ethz.ch;ethz.ch;synsense.ai", "position": "MS student;MS student;Director", "bibtex": "@inproceedings{\nb{\\\"u}chel2022network,\ntitle={{NETWORK} {INSENSITIVITY} {TO} {PARAMETER} {NOISE} {VIA} {PARAMETER} {ATTACK} {DURING} {TRAINING}},\nauthor={Julian B{\\\"u}chel and Fynn Firouz Faber and Dylan Richard Muir},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-8sBpe7rDiV}\n}", "github": "", "project": "", "reviewers": "vgst;BXMW;9GjF", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "2;4;3", "correctness": "4;4;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "112;80;123", "wc_summary_review": "38;59;112", "wc_main_review": "129;534;181", "wc_review": "279;673;416", "wc_reply_reviewers": "133;220;0", "wc_reply_authors": "348;862;180", "reply_reviewers": "1;1;0", "reply_authors": "2;3;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 105.0, 18.239152027072603 ], "wc_summary_review_avg": [ 69.66666666666667, 31.13768706175132 ], "wc_main_review_avg": [ 281.3333333333333, 179.91911763036435 ], "wc_review_avg": [ 456.0, 163.31768632535383 ], "wc_reply_reviewers_avg": [ 117.66666666666667, 90.46669123065251 ], "wc_reply_authors_avg": [ 463.3333333333333, 290.12334541631697 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14653117483419640685&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=-8sBpe7rDiV", "email": "ethz.ch;ethz.ch;synsense.ai", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;SynSense", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;", "aff_unique_abbr": "ETH Zurich;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland;" }, { "id": "-9ffJ9NQmal", "title": "VICE: Variational Inference for Concept Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we introduce 
Variational Inference for Concept Embeddings (VICE), a novel method for learning object concept embeddings from human behavior in an odd-one-out task. We use variational inference to obtain a sparse, non-negative solution, with uncertainty information about each embedding value. We leverage this information in a statistical procedure for selecting the dimensionality of the model, based on hypothesis-testing over a validation set. VICE performs as well as or better than previous methods on a variety of criteria: accuracy of predicting human behavior in an odd-one-out task, calibration to (empirical) human choice probabilities, reproducibility of object representations across different random initializations, and performance on small datasets. The latter is particularly important in cognitive science, where data collection is expensive. Finally, VICE yields highly interpretable object representations, allowing humans to describe the characteristics being represented by each latent dimension.", "keywords": "cognitive science;variational Bayes;category representation;sparse coding;representation learning;interpretable representations", "primary_area": "", "supplementary_material": "", "author": "Lukas Muttenthaler;Charles Yang Zheng;Patrick McClure;Martin N Hebart;Francisco Pereira", "authorids": "~Lukas_Muttenthaler1;~Charles_Yang_Zheng1;~Patrick_McClure1;~Martin_N_Hebart1;~Francisco_Pereira1", "gender": "M;M;;M;M", "homepage": "https://lukasmut.github.io/;;;http://www.martin-hebart.de;http://www.franciscopereira.org", "dblp": "245/4369;https://dblp.uni-trier.de/pers/hd/z/Zheng:Charles_Y=;67/10171;;73/5236", "google_scholar": "https://scholar.google.com/citations?hl=en;;8TKyUl4AAAAJ;Q-n9_FgAAAAJ;HpbSzssAAAAJ", "orcid": "0000-0002-0804-4687;http://orcid.org/0000-0003-3427-0845;;0000-0001-7257-428X;", "linkedin": "lukas-muttenthaler/;;;;francisco-pereira-35735a7/", "or_profile": "~Lukas_Muttenthaler1;~Charles_Yang_Zheng1;~Patrick_McClure1;~Martin_N_Hebart1;~Francisco_Pereira1", "aff": "Max-Planck Institute;National Institutes of Health;National Institutes of Health;Max Planck Institute for Human Cognitive and Brain Sciences;National Institute of Mental Health", "aff_domain": "mpg.de;nih.gov;nih.gov;cbs.mpg.de;nih.gov", "position": "Researcher;Research Scientist;Researcher;Assistant Professor;Staff Scientist", "bibtex": "@misc{\nmuttenthaler2022vice,\ntitle={{VICE}: Variational Inference for Concept Embeddings},\nauthor={Lukas Muttenthaler and Charles Yang Zheng and Patrick McClure and Martin N Hebart and Francisco Pereira},\nyear={2022},\nurl={https://openreview.net/forum?id=-9ffJ9NQmal}\n}", "github": "", "project": "", "reviewers": "pBzd;9iqf;CNX6;qv3e", "site": "https://openreview.net/forum?id=-9ffJ9NQmal", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;3;3", "correctness": "2;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "106;214;62;76", "wc_summary_review": "177;37;62;57", "wc_main_review": "535;595;339;360", "wc_review": "818;846;463;493", "wc_reply_reviewers": "0;0;99;0", "wc_reply_authors": "802;922;971;930", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 114.5, 59.60494945891658 ], "wc_summary_review_avg": [ 83.25, 54.92893135679958 ],
"wc_main_review_avg": [ 457.25, 110.06901244219465 ], "wc_review_avg": [ 655.0, 177.59363727341136 ], "wc_reply_reviewers_avg": [ 24.75, 42.868257487329714 ], "wc_reply_authors_avg": [ 906.25, 62.993551257251724 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lp2Npo4Lw6sJ:scholar.google.com/&scioq=VICE:+Variational+Inference+for+Concept+Embeddings&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.;National Institutes of Health;Max Planck Institute for Human Cognitive and Brain Sciences;National Institute of Mental Health", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.mpg.de;https://www.nih.gov;https://www.mpi-cbs.de;https://www.nimh.nih.gov", "aff_unique_abbr": "MPG;NIH;MPI CBS;NIMH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "Germany;United States" }, { "id": "-9uy3c7b_ks", "title": "Learning Controllable Elements Oriented Representations for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Reinforcement Learning (deep RL) has been successfully applied to solve various decision-making problems in recent years. However, the observations in many real-world tasks are often high dimensional and include much task-irrelevant information, limiting the applications of RL algorithms. To tackle this problem, we propose LCER, a representation learning method that aims to provide RL algorithms with compact and sufficient descriptions of the original observations. Specifically, LCER trains representations to retain the controllable elements of the environment, which can reflect the action-related environment dynamics and thus are likely to be task-relevant. We demonstrate the strength of LCER on the DMControl Suite, proving that it can achieve state-of-the-art performance. 
To the best of our knowledge, LCER is the first representation learning algorithm that enables the pixel-based SAC to outperform state-based SAC on the DMControl 100K benchmark, showing that the obtained representations can match the oracle descriptions ($i.e.$ the physical states) of the environment.", "keywords": "reinforcement learning;representation learning;mutual information", "primary_area": "", "supplementary_material": "", "author": "Qi Yi;Jiaming Guo;Rui Zhang;Shaohui Peng;Xing Hu;Xishan Zhang;Ke Tang;Zidong Du;Qi Guo;Yunji Chen", "authorids": "~Qi_Yi1;~Jiaming_Guo2;~Rui_Zhang1;~Shaohui_Peng2;~Xing_Hu3;~Xishan_Zhang1;~Ke_Tang2;~Zidong_Du1;~Qi_Guo4;~Yunji_Chen1", "gender": "M;M;F;F;;M;;M;M;M", "homepage": ";;;;;https://faculty.sustech.edu.cn/tangk3/;https://zidongdu.github.io/;http://novel.ict.ac.cn/qguo;;", "dblp": "295/8813;63/8512;60/2536-40;49/10052-1;133/6391;https://dblp.uni-trier.de/pers/hd/t/Tang:Ke.html;44/11216;67/398-1;48/474;246/8768", "google_scholar": "veu6_ykAAAAJ;;dse6jAsAAAAJ;Hc3iRxUAAAAJ;;mzLHFbAAAAAJ;https://scholar.google.com.sg/citations?user=8N9ym9YAAAAJ;;;", "orcid": ";;;;;0000-0002-6236-2002;0000-0002-7603-4210;;;", "linkedin": ";;;;;;;;;", "or_profile": "~Qi_Yi1;~Jiaming_Guo2;~Rui_Zhang1;~Xing_Hu3;~Xishan_Zhang1;~Ke_Tang2;~Zidong_Du1;~Qi_Guo4;~Yunji_Chen1;~shaohui_peng1", "aff": "University of Science and Technology of China;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, CAS;Institute of Computing Technology, Chinese Academy of Sciences;, Cambricon Techonologies;Southern University of Science and Technology;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Chinese Academy of Sciences", "aff_domain": "ustc.edu.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;cambricon.com;sustech.edu.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn", "position": "PhD student;PhD student;Assistant Professor;Associate Professor;Researcher;Full Professor;Associate Professor;Full Professor;Full Professor;PhD student", "bibtex": "@misc{\nyi2022learning,\ntitle={Learning Controllable Elements Oriented Representations for Reinforcement Learning },\nauthor={Qi Yi and Jiaming Guo and Rui Zhang and Shaohui Peng and Xing Hu and Xishan Zhang and Ke Tang and Zidong Du and Qi Guo and Yunji Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=-9uy3c7b_ks}\n}", "github": "", "project": "", "reviewers": "WDLZ;uWv6;pqPt;vvtm", "site": "https://openreview.net/forum?id=-9uy3c7b_ks", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;3", "correctness": "3;3;2;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "223;81;42;335", "wc_summary_review": "238;59;43;55", "wc_main_review": "755;490;589;183", "wc_review": "1216;630;674;573", "wc_reply_reviewers": "0;346;98;0", "wc_reply_authors": "828;1223;1511;487", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 170.25, 116.55336760471573 ], "wc_summary_review_avg": [ 98.75, 80.61133605145122 ], "wc_main_review_avg": [ 504.25, 208.24429764101586 ], "wc_review_avg": [ 773.25, 258.11758463924923 ], "wc_reply_reviewers_avg": [ 111.0, 141.45317246354003 ], 
"wc_reply_authors_avg": [ 1012.25, 388.26625851340725 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4301776429187130889&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;2;3;1;1;1;1", "aff_unique_norm": "University of Science and Technology of China;Chinese Academy of Sciences;Cambricon Technologies;Southern University of Science and Technology", "aff_unique_dep": ";Institute of Computing Technology;;", "aff_unique_url": "http://www.ustc.edu.cn;http://www.ict.ac.cn;https://www.cambricon.com;https://www.sustech.edu.cn", "aff_unique_abbr": "USTC;CAS;Cambricon;SUSTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Fast Differentiable Matrix Square Root", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6545", "id": "-AOEi-5VTU8", "poster": "", "openreview": "https://openreview.net/forum?id=-AOEi-5VTU8", "slides": "https://iclr.cc/virtual/2022/poster/6545", "video": "https://iclr.cc/virtual/2022/poster/6545", "author_site": "Yue Song, Nicu Sebe, Wei Wang", "tldr": "", "abstract": "Computing the matrix square root or its inverse in a differentiable manner is important in a variety of computer vision tasks. Previous methods either adopt the Singular Value Decomposition (SVD) to explicitly factorize the matrix or use the Newton-Schulz iteration (NS iteration) to derive the approximate solution. However, both methods are not computationally efficient enough in either the forward pass or in the backward pass. In this paper, we propose two more efficient variants to compute the differentiable matrix square root. For the forward propagation, one method is to use Matrix Taylor Polynomial (MTP), and the other method is to use Matrix Pad\\'e Approximants (MPA). The backward gradient is computed by iteratively solving the continuous-time Lyapunov equation using the matrix sign function. Both methods yield considerable speed-up compared with the SVD or the Newton-Schulz iteration. Experimental results on the de-correlated batch normalization and second-order vision transformer demonstrate that our methods can also achieve competitive and even slightly better performances. 
The code is available at \\href{https://github.com/KingJamesSong/FastDifferentiableMatSqrt}{https://github.com/KingJamesSong/FastDifferentiableMatSqrt}.", "keywords": "Differentiable Matrix Square Root;Differentiable Matrix Decomposition;Vision Transformers", "primary_area": "", "supplementary_material": "/attachment/fc3bf977f32fbcf2d828f8a8411abeaf42b59d1b.zip", "author": "Yue Song;Nicu Sebe;Wei Wang", "authorids": "~Yue_Song1;~Nicu_Sebe1;~Wei_Wang43", "gender": "M;M;M", "homepage": "https://kingjamessong.github.io/;http://disi.unitn.it/~sebe/;https://weiwangtrento.github.io/", "dblp": "11/1346;20/3519;35/7092-108", "google_scholar": "Uza2i10AAAAJ;https://scholar.google.it/citations?user=stFCYOAAAAAJ;https://scholar.google.com/citations?hl=en-US", "orcid": ";0000-0002-6597-7248;0000-0002-5477-1017", "linkedin": ";;", "or_profile": "~Yue_Song1;~Nicu_Sebe1;~Wei_Wang43", "aff": "University of Trento, Italy;University of Trento;University of Trento", "aff_domain": "unitn.it;unitn.it;unitn.it", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nsong2022fast,\ntitle={Fast Differentiable Matrix Square Root},\nauthor={Yue Song and Nicu Sebe and Wei Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-AOEi-5VTU8}\n}", "github": "", "project": "", "reviewers": "Jnyr;J9af;JTDk", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;5;4", "correctness": "4;3;4", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "161;106;75", "wc_summary_review": "173;58;31", "wc_main_review": "855;473;178", "wc_review": "1189;637;284", "wc_reply_reviewers": "169;0;0", "wc_reply_authors": "1321;1382;486", "reply_reviewers": "4;0;0", "reply_authors": "4;3;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 114.0, 35.56215216584433 ], "wc_summary_review_avg": [ 87.33333333333333, 61.57019480957396 ], "wc_main_review_avg": [ 502.0, 277.14376533969994 ], "wc_review_avg": [ 703.3333333333334, 372.43015750905863 ], "wc_reply_reviewers_avg": [ 56.333333333333336, 79.66736401368435 ], "wc_reply_authors_avg": [ 1063.0, 408.75991323351 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16011321219520846906&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=-AOEi-5VTU8", "email": "unitn.it;unitn.it;unitn.it", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Trento", "aff_unique_dep": "", "aff_unique_url": "https://www.unitn.it", "aff_unique_abbr": "UniTN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "id": "-AW3SFO63GO", "title": "Dissecting Local Properties of Adversarial Examples", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial examples have attracted significant attention over the years, yet a sufficient understanding is still lacking, especially when analyzing their
performance in combination with adversarial training. In this paper, we revisit some properties of adversarial examples from both frequency and spatial perspectives: 1) the special high-frequency components of adversarial examples tend to mislead naturally-trained models while having little impact on adversarially-trained ones, and 2) adversarial examples show disorderly perturbations on naturally-trained models and locally-consistent (image shape related) perturbations on adversarially-trained ones. Motivated by these observations, we analyze the fragile tendency of models with the generated adversarial perturbations, and propose a connection between model vulnerability and local intermediate response. That is, a smaller local intermediate response is associated with better adversarial robustness. To be specific, we demonstrate that: 1) DNNs are naturally fragile, at least for sufficiently large local response differences between adversarial/natural examples, and 2) smoother adversarially-trained models can alleviate local response differences with enhanced robustness. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lu Chen;Renjie Chen;Hang Guo;Yuan Luo;Quanshi Zhang;Yisen Wang", "authorids": "~Lu_Chen4;~Renjie_Chen2;~Hang_Guo2;~Yuan_Luo1;~Quanshi_Zhang1;~Yisen_Wang1", "gender": "F;M;;M;M;M", "homepage": ";http://crj1998.ml/;;https://www.cs.sjtu.edu.cn/en/PeopleDetail.aspx?id=155;http://qszhang.com;https://yisenwang.github.io/", "dblp": ";;;90/6959-3;http://dblp.uni-trier.de/pers/hd/z/Zhang:Quanshi;172/1346-1", "google_scholar": "https://scholar.google.com.hk/citations?view_op=list_works;;;;iFFhHK0AAAAJ;uMWPDboAAAAJ", "orcid": ";;;0000-0002-3910-5286;;", "linkedin": ";;%E8%88%AA-%E9%83%AD-97861b221/;;;", "or_profile": "~Lu_Chen4;~Renjie_Chen2;~Hang_Guo2;~Yuan_Luo1;~Quanshi_Zhang1;~Yisen_Wang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Peking University", "aff_domain": "cs.sjtu.edu.cn;sjtu.edu.cn;sjtu.edu;sjtu.edu.cn;sjtu.edu.cn;pku.edu.cn", "position": "PhD student;MS student;MS student;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@misc{\nchen2022dissecting,\ntitle={Dissecting Local Properties of Adversarial Examples},\nauthor={Lu Chen and Renjie Chen and Hang Guo and Yuan Luo and Quanshi Zhang and Yisen Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=-AW3SFO63GO}\n}", "github": "", "project": "", "reviewers": "CGeY;JTZC;Rh4T", "site": "https://openreview.net/forum?id=-AW3SFO63GO", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "4;4;3", "correctness": "2;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "47;153;59", "wc_summary_review": "37;79;32", "wc_main_review": "197;415;228", "wc_review": "281;647;319", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 86.33333333333333, 47.39432689913659 ], "wc_summary_review_avg": [ 49.333333333333336, 21.076579946049648 ], "wc_main_review_avg": [ 280.0, 96.29468659623264 ], "wc_review_avg": [ 415.6666666666667, 164.31135755699367 ],
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nZmbR4rkocoJ:scholar.google.com/&scioq=Dissecting+Local+Properties+of+Adversarial+Examples&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Peking University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "SJTU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "-AY7C3f26C_", "title": "Rethinking Deep Face Restoration", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A model that can authentically restore a low-quality face image to a high-quality one can benefit many applications.\nWhile existing approaches for face restoration make significant progress in generating high-quality faces, they often fail to preserve facial features and cannot authentically reconstruct the faces. Because the human visual system is very sensitive to faces, even minor facial changes may alter the identity and significantly degrade the perceptual quality. In this work, we argue the problems of existing models can be traced down to the two sub-tasks of the face restoration problem, i.e. face generation and face reconstruction,\nand the fragile balance between them. Based on the observation, we propose a new face restoration model that improves both generation and reconstruction by learning a stochastic model and enhancing the latent features respectively. Furthermore, we adapt the number of skip connections for a better balance between the two sub-tasks. Besides the model improvement, we also introduce a new evaluation metric for measuring models' ability to preserve the identity in the restored faces. Extensive experiments demonstrate that our model achieves state-of-the-art performance on multiple face restoration benchmarks. 
The user study shows that our model produces higher quality faces while better preserving the identity $86.4\\%$ of the time compared with the best performing baselines.", "keywords": "face restoration;generative model", "primary_area": "", "supplementary_material": "/attachment/362ff6ddc376f0d88e7a546c5b16e8bbe8b4cb1e.zip", "author": "Yang Zhao;Yu-Chuan Su;Chun-Te Chu;YANDONG LI;Marius Renn;Yukun Zhu;Changyou Chen;Xuhui Jia", "authorids": "~Yang_Zhao5;~Yu-Chuan_Su1;~Chun-Te_Chu1;~YANDONG_LI1;~Marius_Renn1;~Yukun_Zhu1;~Changyou_Chen1;~Xuhui_Jia1", "gender": "M;;;M;M;M;M;M", "homepage": "https://sites.google.com/view/zhao-yang/;http://sammy-su.github.io/;;https://cold-winter.github.io/;;;https://www.cse.buffalo.edu/~changyou/;https://scholar.google.com/citations?view_op=search_authors&mauthors=xuhui+jia&hl=en&oi=ao", "dblp": ";53/6299;;;;18/10777;65/2802;116/8360", "google_scholar": "9zmGBugAAAAJ;nrcJfPEAAAAJ;;kRLb6PkAAAAJ;;;LtEcKBcAAAAJ;https://scholar.google.com/citations?view_op=search_authors", "orcid": "0000-0003-2925-0137;0000-0002-2711-6738;;0000-0003-2448-1294;;;;", "linkedin": "yang-zhao-145a88153/;https://www.linkedin.com/pub/yu-chuan-su/52/38b/b82/;;;marius-renn/;;;", "or_profile": "~Yang_Zhao5;~Yu-Chuan_Su1;~Chun-Te_Chu1;~YANDONG_LI1;~Marius_Renn1;~Yukun_Zhu1;~Changyou_Chen1;~Xuhui_Jia1", "aff": "State University of New York, Buffalo;Google;;Google;;Google;State University of New York, Buffalo;Google", "aff_domain": "buffalo.edu;google.com;;google.com;;google.com;buffalo.edu;google.com", "position": "PhD student;Research Scientist;;Software Engineer;;SWE;Assistant Professor;Researcher", "bibtex": "@misc{\nzhao2022rethinking,\ntitle={Rethinking Deep Face Restoration},\nauthor={Yang Zhao and Yu-Chuan Su and Chun-Te Chu and YANDONG LI and Marius Renn and Yukun Zhu and Changyou Chen and Xuhui Jia},\nyear={2022},\nurl={https://openreview.net/forum?id=-AY7C3f26C_}\n}", "github": "", "project": "", "reviewers": "JfC2;71YX;JexD;CqSH", "site": "https://openreview.net/forum?id=-AY7C3f26C_", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;4;5;4", "correctness": "3;2;2;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "39;61;46;125", "wc_summary_review": "29;6;31;12", "wc_main_review": "134;130;267;107", "wc_review": "202;197;344;244", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.75, 33.995404101142846 ], "wc_summary_review_avg": [ 19.5, 10.735455276791944 ], "wc_main_review_avg": [ 159.5, 62.9146246909254 ], "wc_review_avg": [ 246.75, 59.03971121203084 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": -0.40824829046386296, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15164187975742337117&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1;0;1", "aff_unique_norm": "State University of New York at Buffalo;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.buffalo.edu;https://www.google.com", "aff_unique_abbr": "SUNY 
Buffalo;Google", "aff_campus_unique_index": "0;1;1;1;0;1", "aff_campus_unique": "Buffalo;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "SHINE: SHaring the INverse Estimate from the forward pass for bi-level optimization and implicit models", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6363", "id": "-ApAkox5mp", "poster": "", "openreview": "https://openreview.net/forum?id=-ApAkox5mp", "slides": "https://iclr.cc/virtual/2022/poster/6363", "video": "https://iclr.cc/virtual/2022/poster/6363", "author_site": "Zaccharie Ramzi, Florian Mannel, Shaojie Bai, Jean-Luc Starck, Philippe Ciuciu, Thomas Moreau", "tldr": "", "abstract": "In recent years, implicit deep learning has emerged as a method to increase the depth of deep neural networks. While their training is memory-efficient, they are still significantly slower to train than their explicit counterparts. In Deep Equilibrium Models~(DEQs), the training is performed as a bi-level problem, and its computational complexity is partially driven by the iterative inversion of a huge Jacobian matrix. In this paper, we propose a novel strategy to tackle this computational bottleneck from which many bi-level problems suffer. The main idea is to use the quasi-Newton matrices from the forward pass to efficiently approximate the inverse Jacobian matrix in the direction needed for the gradient computation. We provide a theorem that motivates using our method with the original forward algorithms. In addition, by modifying these forward algorithms, we further provide theoretical guarantees that our method asymptotically estimates the true implicit gradient. We empirically study this approach in many settings, ranging from hyperparameter optimization to large Multiscale DEQs applied to CIFAR and ImageNet. We show that it reduces the computational cost of the backward pass by up to two orders of magnitude. 
All this is achieved while retaining the excellent performance of the original models in hyperparameter optimization and on CIFAR, and giving encouraging and competitive results on ImageNet.", "keywords": "implicit models;bi-level optimization;quasi-newton methods", "primary_area": "", "supplementary_material": "/attachment/0a592ce985426dc970e28e82b2c98a393a0600eb.zip", "author": "Zaccharie Ramzi;Florian Mannel;Shaojie Bai;Jean-Luc Starck;Philippe Ciuciu;Thomas Moreau", "authorids": "~Zaccharie_Ramzi1;~Florian_Mannel1;~Shaojie_Bai1;~Jean-Luc_Starck1;~Philippe_Ciuciu1;~Thomas_Moreau2", "gender": "M;;M;M;M;M", "homepage": "https://zaccharieramzi.fr/;https://imsc.uni-graz.at/mannel/;https://jerrybai1995.github.io;http://jstarck.cosmostat.org;https://sites.google.com/site/philippeciuciu/;https://tommoral.github.io", "dblp": "266/7212;;;;;150/2391-1", "google_scholar": "rTgYLN8AAAAJ;;DLVP3PcAAAAJ;;;https://scholar.google.fr/citations?user=HEO_PsAAAAAJ", "orcid": "0000-0002-5888-8749;0000-0001-9042-0428;;http://orcid.org/0000-0003-2177-7794;;0000-0002-1523-3419", "linkedin": "zaccharie-ramzi-043476a5/;;;;;thomasmoreau2010", "or_profile": "~Zaccharie_Ramzi1;~Florian_Mannel1;~Shaojie_Bai1;~Jean-Luc_Starck1;~Philippe_Ciuciu1;~Thomas_Martin_Moreau1", "aff": "CEA;Universit\u00e4t zu L\u00fcbeck;School of Computer Science, Carnegie Mellon University;CEA;CEA;INRIA", "aff_domain": "cea.fr;uni-luebeck.de;cs.cmu.edu;cea.fr;cea.fr;inria.fr", "position": "PhD student;Assistant Professor;PhD student;Full Professor;Research Director;Researcher", "bibtex": "@inproceedings{\nramzi2022shine,\ntitle={{SHINE}: {SH}aring the {IN}verse Estimate from the forward pass for bi-level optimization and implicit models},\nauthor={Zaccharie Ramzi and Florian Mannel and Shaojie Bai and Jean-Luc Starck and Philippe Ciuciu and Thomas Moreau},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-ApAkox5mp}\n}", "github": "", "project": "", "reviewers": "H5f4;mXZ6;tSim", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;3;3", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "191;32;109", "wc_summary_review": "96;84;59", "wc_main_review": "351;238;163", "wc_review": "638;354;331", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "384;559;38", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 110.66666666666667, 64.92217563274423 ], "wc_summary_review_avg": [ 79.66666666666667, 15.412837362262522 ], "wc_main_review_avg": [ 250.66666666666666, 77.27152702573495 ], "wc_review_avg": [ 441.0, 139.61614042318556 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 327.0, 216.48248581967704 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16232312271642758583&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "pdf": "https://openreview.net/pdf?id=-ApAkox5mp", "email": "cea.fr;uni-luebeck.de;cs.cmu.edu;cea.fr;cea.fr;inria.fr", "author_num": 6, "aff_unique_index": 
"0;1;2;0;0;3", "aff_unique_norm": "Commissariat \u00e0 l'\u00c9nergie Atomique et aux \u00c9nergies Alternatives;University of L\u00fcbeck;Carnegie Mellon University;INRIA", "aff_unique_dep": ";;School of Computer Science;", "aff_unique_url": "https://www cea fr;https://www.uni-luebeck.de;https://www.cmu.edu;https://www.inria.fr", "aff_unique_abbr": "CEA;UzL;CMU;INRIA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;1;2;0;0;0", "aff_country_unique": "France;Germany;United States" }, { "id": "-BBL3b4Tqfo", "title": "Modeling Unknown Semantic Labels as Uncertainty in the Prediction: Evidential Deep Learning for Class-Incremental Semantic Segmentation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Class-Incremental Learning is an essential component for expanding the knowledge of previously trained neural networks. \nThis is especially useful if the system needs to be able to handle new objects but the original training data is unavailable.\nWhile the semantic segmentation problem has received less attention than classification, it is faced with its own set of challenges in terms of unlabeled classes in the images.\nIn this paper we address the problem of how to model unlabeled classes to avoid unnecessary feature clustering of uncorrelated classes. \nWe propose to use Evidential Deep Learning to model the evidence of the classes with a Dirichlet distribution. \nOur method factorizes the problem into a separate foreground class probability, calculated by the expected value of the Dirichlet distribution, and an unknown class probability corresponding to the uncertainty of the estimate.\nIn our novel formulation the background probability is implicitly modelled, avoiding the feature space clustering that comes from forcing the model to output a high background score for these pixels.\nExperiments on the incremental Pascal VOC and ADE20k show that our method is superior to state-of-the-art methods, especially when repeatedly learning new classes.", "keywords": "Class-Incremental Learning;Semantic Segmentation;Evidential Deep Learrning;Deep Neural Networks;Class-Incremental Semantic Segmentation;Continual-Learning", "primary_area": "", "supplementary_material": "", "author": "Karl Holmquist;Michael Felsberg;Lena Klasen", "authorids": "~Karl_Holmquist1;~Michael_Felsberg2;lena.klasen@liu.se", "gender": "M;;", "homepage": ";https://liu.se/en/employee/micfe03;", "dblp": ";00/78;", "google_scholar": "Q2AY_q4AAAAJ;https://scholar.google.se/citations?hl=en;", "orcid": "0000-0002-8677-8715;0000-0002-6096-3648;", "linkedin": ";https://linkedin.com/in/michael-felsberg-668a202;", "or_profile": "~Karl_Holmquist1;~Michael_Felsberg2;lena.klasen@liu.se", "aff": "Link\u00f6ping University;Link\u00f6ping University;", "aff_domain": "liu.se;liu.se;", "position": "PhD student;Full Professor;", "bibtex": "@misc{\nholmquist2022modeling,\ntitle={Modeling Unknown Semantic Labels as Uncertainty in the Prediction: Evidential Deep Learning for Class-Incremental Semantic Segmentation},\nauthor={Karl Holmquist and Michael Felsberg and Lena Klasen},\nyear={2022},\nurl={https://openreview.net/forum?id=-BBL3b4Tqfo}\n}", "github": "", "project": "", "reviewers": "nzN6;dzSV;K6h8;9j35", "site": "https://openreview.net/forum?id=-BBL3b4Tqfo", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;0;2", "wc_summary_paper": "84;62;63;130", "wc_summary_review": 
"22;33;9;79", "wc_main_review": "261;151;396;520", "wc_review": "367;246;468;729", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.75, 27.562429138230904 ], "wc_summary_review_avg": [ 35.75, 26.37588861062315 ], "wc_main_review_avg": [ 332.0, 138.96222508293397 ], "wc_review_avg": [ 452.5, 177.93608403019326 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6766852850735412299&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Link\u00f6ping University", "aff_unique_dep": "", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Sweden" }, { "id": "-BTmxCddppP", "title": "Revisiting Out-of-Distribution Detection: A Simple Baseline is Surprisingly Effective", "track": "main", "status": "Reject", "tldr": "", "abstract": "It is an important problem in trustworthy machine learning to recognize out-of-distribution (OOD) inputs which are inputs unrelated to the in-distribution task. Many out-of-distribution detection methods have been suggested in recent years. The goal of this paper is to recognize common objectives as well as to identify the implicit scoring functions of different OOD detection methods. In particular, we show that binary discrimination between in- and (different) out-distributions is equivalent to several different formulations of the OOD detection problem. When trained in a shared fashion with a standard classifier, this binary discriminator reaches an OOD detection performance similar to that of Outlier Exposure. Moreover, we show that the confidence loss which is used by Outlier Exposure has an implicit scoring function which differs in a non-trivial fashion from the theoretically optimal scoring function in the case where training and test out-distribution are the same, but is similar to the one used when training with an extra background class. 
In practice, when trained in exactly the same way, all these methods perform similarly and reach state-of-the-art OOD detection performance.", "keywords": "out-of-distribution detection;robustness;OOD", "primary_area": "", "supplementary_material": "", "author": "Julian Bitterwolf;Alexander Meinke;Maximilian Augustin;Matthias Hein", "authorids": "~Julian_Bitterwolf1;~Alexander_Meinke1;~Maximilian_Augustin1;~Matthias_Hein2", "gender": ";M;M;M", "homepage": "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/maschinelles-lernen/team/julian-bitterwolf-msc/;;https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/maschinelles-lernen/news/;https://uni-tuebingen.de/de/164260", "dblp": "232/1887;249/5767;210/2432;97/1213-1", "google_scholar": ";https://scholar.google.de/citations?user=PqHTP_AAAAAJ;https://scholar.google.de/citations?user=f82UrTYAAAAJ;0ZAb3tsAAAAJ", "orcid": ";;;", "linkedin": ";alexander-meinke-a32904173/;;", "or_profile": "~Julian_Bitterwolf1;~Alexander_Meinke1;~Maximilian_Augustin1;~Matthias_Hein2", "aff": "University of T\u00fcbingen;Max-Planck-Institute for Intelligent Systems, Max-Planck Institute;University of Tuebingen;University of T\u00fcbingen", "aff_domain": "uni-tuebingen.de;is.mpg.de;uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\nbitterwolf2022revisiting,\ntitle={Revisiting Out-of-Distribution Detection: A Simple Baseline is Surprisingly Effective},\nauthor={Julian Bitterwolf and Alexander Meinke and Maximilian Augustin and Matthias Hein},\nyear={2022},\nurl={https://openreview.net/forum?id=-BTmxCddppP}\n}", "github": "", "project": "", "reviewers": "YhZ7;iH61;vYWv;YRfA;Lwwq", "site": "https://openreview.net/forum?id=-BTmxCddppP", "pdf_size": 0, "recommendation": "3;6;6;8;10", "confidence": "2;4;4;2;3", "correctness": "2;3;3;4;4", "technical_novelty": "2;2;3;3;4", "empirical_novelty": "2;2;2;3;0", "wc_summary_paper": "68;84;70;74;90", "wc_summary_review": "23;290;32;24;149", "wc_main_review": "267;281;159;182;169", "wc_review": "358;655;261;280;408", "wc_reply_reviewers": "202;133;0;0;0", "wc_reply_authors": "1405;1133;391;287;138", "reply_reviewers": "2;1;0;0;0", "reply_authors": "4;3;1;1;1", "recommendation_avg": [ 6.6, 2.33238075793812 ], "confidence_avg": [ 3.0, 0.8944271909999159 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 1.8, 0.9797958971132712 ], "wc_summary_paper_avg": [ 77.2, 8.44748483277715 ], "wc_summary_review_avg": [ 103.6, 104.65677235611653 ], "wc_main_review_avg": [ 211.6, 51.658881133837966 ], "wc_review_avg": [ 392.4, 141.6228795075146 ], "wc_reply_reviewers_avg": [ 67.0, 84.90936344126011 ], "wc_reply_authors_avg": [ 670.8, 502.4230886414357 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 2.0, 1.2649110640673518 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0958706236059213, "corr_recommendation_correctness": 0.9625334218796219, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14580985648064181116&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of T\u00fcbingen;Max-Planck-Institute for Intelligent Systems;University of Tuebingen", "aff_unique_dep": ";Intelligent Systems;", 
"aff_unique_url": "https://www.uni-tuebingen.de/;https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;MPI-IS;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "-FP1-bBxOzv", "title": "Self Reward Design with Fine-grained Interpretability", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transparency and fairness issues in Deep Reinforcement Learning may stem from the black-box nature of deep neural networks used to learn its policy, value functions etc. This paper proposes a way to circumvent the issues through the bottom-up design of neural networks (NN) with detailed interpretability, where each neuron or layer has its own meaning and utility that corresponds to humanly understandable concept. With deliberate design, we show that lavaland problems can be solved using NN model with few parameters. Furthermore, we introduce the Self Reward Design (SRD), inspired by the Inverse Reward Design, so that our interpretable design can (1) solve the problem by pure design (although imperfectly) (2) be optimized via SRD (3) perform avoidance of unknown states by recognizing the inactivations of neurons aggregated as the activation in \\(w_{unknown}\\).", "keywords": "Reinforcement Learning;Interpretability;Explainable Artificial Intelligence;Neural Networks", "primary_area": "", "supplementary_material": "/attachment/ac365d9d4e575e633f7588b6929fee5c792068f8.zip", "author": "Erico Tjoa;Cuntai Guan", "authorids": "~Erico_Tjoa1;~Cuntai_Guan1", "gender": "M;M", "homepage": ";https://personal.ntu.edu.sg/ctguan/index.html", "dblp": ";95/7006", "google_scholar": "hh9WwAMAAAAJ;https://scholar.google.com.tw/citations?user=sg4vxPoAAAAJ", "orcid": ";0000-0002-0872-3276", "linkedin": ";", "or_profile": "~Erico_Tjoa1;~Cuntai_Guan1", "aff": "Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Full Professor", "bibtex": "@misc{\ntjoa2022self,\ntitle={Self Reward Design with Fine-grained Interpretability},\nauthor={Erico Tjoa and Cuntai Guan},\nyear={2022},\nurl={https://openreview.net/forum?id=-FP1-bBxOzv}\n}", "github": "", "project": "", "reviewers": "cYQC;sPTc;cBTu;NCWP;NiCV", "site": "https://openreview.net/forum?id=-FP1-bBxOzv", "pdf_size": 0, "recommendation": "1;1;1;3;3", "confidence": "3;4;4;2;4", "correctness": "2;2;1;2;2", "technical_novelty": "1;2;1;2;2", "empirical_novelty": "2;1;1;2;0", "wc_summary_paper": "40;33;61;35;88", "wc_summary_review": "20;44;15;29;38", "wc_main_review": "433;176;139;134;152", "wc_review": "493;253;215;198;278", "wc_reply_reviewers": "444;31;0;0;242", "wc_reply_authors": "851;281;354;57;478", "reply_reviewers": "1;1;0;0;1", "reply_authors": "2;1;1;1;2", "recommendation_avg": [ 1.8, 0.9797958971132713 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 1.8, 0.4 ], "technical_novelty_avg": [ 1.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.2, 0.7483314773547883 ], "wc_summary_paper_avg": [ 51.4, 20.828826179120128 ], "wc_summary_review_avg": [ 29.2, 10.796295661012625 ], "wc_main_review_avg": [ 206.8, 114.030522229796 ], "wc_review_avg": [ 287.4, 106.56190688984502 ], "wc_reply_reviewers_avg": [ 143.4, 175.40991990192575 ], "wc_reply_authors_avg": [ 404.2, 262.03923370365743 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 
0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5179672160450176510&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 12, "aff_unique_index": "0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "id": "-GU1sfGnM5K", "title": "A Reinforcement Learning Environment for Mathematical Reasoning via Program Synthesis", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "\tWe convert the DeepMind Mathematics Dataset into a reinforcement learning environment by interpreting it as a program synthesis problem. Each action taken in the environment adds an operator or an input into a discrete compute graph. Graphs which compute correct answers yield positive reward, enabling optimization of a policy to construct compute graphs conditioned on problem statements. Baseline models are trained using Double DQN on various subsets of problem types, demonstrating the capability to learn to correctly construct graphs despite the challenges of combinatorial explosion and noisy rewards.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joseph W. Palermo;Johnny Ye;Alok Singh", "authorids": "~Joseph_W._Palermo1;~Johnny_Ye2;~Alok_Singh1", "gender": "M;M;M", "homepage": ";https://alokssingh.github.io/;https://github.com/JohnnyYeeee", "dblp": ";;", "google_scholar": "un0omUsAAAAJ;K6ecfUwAAAAJ;https://scholar.google.ca/citations?user=WbQ7uRAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Joseph_W._Palermo1;~Alok_Singh1;~Johnny_Ye1", "aff": "Square;National Institute of Technology Silchar,;University of Toronto", "aff_domain": "squareup.com;nits.ac.in;utoronto.ca", "position": "Researcher;PhD student;Undergrad student", "bibtex": "@misc{\npalermo2022a,\ntitle={A Reinforcement Learning Environment for Mathematical Reasoning via Program Synthesis},\nauthor={Joseph W. 
Palermo and Johnny Ye and Alok Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=-GU1sfGnM5K}\n}", "github": "", "project": "", "reviewers": "5tFT;vQZz;wyEm", "site": "https://openreview.net/forum?id=-GU1sfGnM5K", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "4;4;4", "correctness": "2;2;3", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "89;92;76", "wc_summary_review": "23;41;85", "wc_main_review": "228;344;261", "wc_review": "340;477;422", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "206;0;0", "reply_reviewers": "0;0;0", "reply_authors": "1;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 85.66666666666667, 6.944222218666553 ], "wc_summary_review_avg": [ 49.666666666666664, 26.042699979499478 ], "wc_main_review_avg": [ 277.6666666666667, 48.80118395649388 ], "wc_review_avg": [ 413.0, 56.290911048469155 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 68.66666666666667, 97.10933128295251 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3310860925692030654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Square, Inc.;National Institute of Technology Silchar;University of Toronto", "aff_unique_dep": ";;", "aff_unique_url": "https://squareup.com;https://www.nits.ac.in/silchar/;https://www.utoronto.ca", "aff_unique_abbr": "Square;NIT Silchar;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;India;Canada" }, { "title": "Top-N: Equivariant Set and Graph Generation without Exchangeability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6635", "id": "-Gk_IPJWvk", "poster": "", "openreview": "https://openreview.net/forum?id=-Gk_IPJWvk", "slides": "https://iclr.cc/virtual/2022/poster/6635", "video": "https://iclr.cc/virtual/2022/poster/6635", "author_site": "Cl\u00e9ment Vignac, Pascal Frossard", "tldr": "", "abstract": "This work addresses one-shot set and graph generation, and, more specifically, the parametrization of probabilistic decoders that map a vector-shaped prior to a distribution over sets or graphs. Sets and graphs are most commonly generated by first sampling points i.i.d. from a normal distribution, and then processing these points along with the prior vector using Transformer layers or Graph Neural Networks. \nThis architecture is designed to generate exchangeable distributions, i.e., all permutations of the generated outputs are equally likely. We however show that it only optimizes a proxy to the evidence lower bound, which makes it hard to train. We then study equivariance in generative settings and show that non-exchangeable methods can still achieve permutation equivariance. Using this result, we introduce Top-n creation, a differentiable generation mechanism that uses the latent vector to select the most relevant points from a trainable reference set. Top-n can replace i.i.d. 
generation in any Variational Autoencoder or Generative Adversarial Network. Experimentally, our method outperforms i.i.d. generation by 15% at SetMNIST reconstruction, by 33% at object detection on CLEVR, generates sets that are 74% closer to the true distribution on a synthetic molecule-like dataset, and generates more valid molecules on QM9. ", "keywords": "set generation;graph generation;permutation equivariance;generative models;Top-N", "primary_area": "", "supplementary_material": "/attachment/7999da100924b2edc37b348a879c2d32838b2708.zip", "author": "Clement Vignac;Pascal Frossard", "authorids": "~Clement_Vignac1;~Pascal_Frossard1", "gender": ";", "homepage": "https://cvignac.github.io/;", "dblp": "254/1004.html;", "google_scholar": "eKJLfHQAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Clement_Vignac1;~Pascal_Frossard1", "aff": "Swiss Federal Institute of Technology Lausanne;", "aff_domain": "epfl.ch;", "position": "PhD student;", "bibtex": "@inproceedings{\nvignac2022topn,\ntitle={Top-N: Equivariant Set and Graph Generation without Exchangeability},\nauthor={Clement Vignac and Pascal Frossard},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-Gk_IPJWvk}\n}", "github": "", "project": "", "reviewers": "tgJk;xa3Q;tbNm;uN3i", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "331;93;87;136", "wc_summary_review": "37;15;23;67", "wc_main_review": "1334;283;449;262", "wc_review": "1702;391;559;465", "wc_reply_reviewers": "1629;28;40;103", "wc_reply_authors": "5872;694;1570;439", "reply_reviewers": "3;1;1;1", "reply_authors": "10;1;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 161.75, 99.52732036983615 ], "wc_summary_review_avg": [ 35.5, 19.817921182606415 ], "wc_main_review_avg": [ 582.0, 440.1687176526746 ], "wc_review_avg": [ 779.25, 536.0664021368995 ], "wc_reply_reviewers_avg": [ 450.0, 681.2917877092017 ], "wc_reply_authors_avg": [ 2143.75, 2192.999358755036 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.75, 3.6996621467371855 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10023385156817268910&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=-Gk_IPJWvk", "email": "epfl.ch;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0", "aff_country_unique": "Switzerland" }, { "id": "-H48S9ePSUC", "title": "Fundamental Limits of Transfer Learning in Binary Classifications", "track": "main", "status": "Reject", "tldr": "", "abstract": "A critical performance barrier in modern machine learning is scarcity of labeled data required for training state of the art massive models, especially in quickly emerging problems with lack of extensive 
data sets or scenarios where data collection and labeling is expensive/time consuming. Transfer learning is gaining traction as a promising technique to alleviate this barrier by utilizing the data of a related but different \\emph{source} task to compensate for the lack of data in a \\emph{target} task where there are few labeled training data. While there have been many recent algorithmic advances in this domain, a fundamental understanding of when and how much knowledge one can transfer from a related domain to reduce the amount of labeled training data is still lacking. We provide a precise answer to this question for binary classification problems by deriving a novel lower bound on the generalization error that can be achieved by \\emph{any} transfer learning algorithm (regardless of its computational complexity) as a function of the amount of source and target samples. Our lower bound depends on a natural notion of distance that can be easily computed on real-world data sets. Other key features of our lower bound are that it does not depend on the source/target data distributions and that it requires minimal assumptions, which enables its application to a broad range of problems. We also consider a more general setting where there is more than one source domain for knowledge transfer to the target task and develop new bounds on generalization error in this setting. We also corroborate our theoretical findings on real image classification and action recognition data sets. These experiments demonstrate that our natural notion of distance is indicative of the difficulty of knowledge transfer between different pairs of source/target tasks, allowing us to investigate the effect of different sources on the target generalization error. Furthermore, to evaluate the sharpness of our bounds, we compare our lower bounds with upper bounds achieved by transfer learning baselines that utilize weighted empirical risk minimization on the combination of source(s) and target data sets.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/90aaa1af3475a5f13cc34ef32c6fc46cfe5c9950.zip", "author": "Mohammadreza Mousavi Kalan;Salman Avestimehr;Mahdi Soltanolkotabi", "authorids": "~Mohammadreza_Mousavi_Kalan1;~Salman_Avestimehr1;~Mahdi_Soltanolkotabi1", "gender": "M;;M", "homepage": ";;http://www-bcf.usc.edu/~soltanol/", "dblp": "207/8487;;75/6691", "google_scholar": "UaxZ3xgAAAAJ;;narJyMAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mohammadreza_Mousavi_Kalan1;~Salman_Avestimehr1;~Mahdi_Soltanolkotabi1", "aff": "University of Southern California;;University of Southern California", "aff_domain": "usc.edu;;usc.edu", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\nkalan2022fundamental,\ntitle={Fundamental Limits of Transfer Learning in Binary Classifications},\nauthor={Mohammadreza Mousavi Kalan and Salman Avestimehr and Mahdi Soltanolkotabi},\nyear={2022},\nurl={https://openreview.net/forum?id=-H48S9ePSUC}\n}", "github": "", "project": "", "reviewers": "TfQT;WXNa;dGvD;pmYF;9V1Q", "site": "https://openreview.net/forum?id=-H48S9ePSUC", "pdf_size": 0, "recommendation": "3;5;6;6;6", "confidence": "4;3;2;2;4", "correctness": "2;3;3;3;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;1;3;2;3", "wc_summary_paper": "53;89;46;149;140", "wc_summary_review": "11;40;13;126;165", "wc_main_review": "248;451;80;274;638", "wc_review": "312;580;139;549;943", "wc_reply_reviewers": "368;323;0;0;0", "wc_reply_authors": "1322;1487;372;271;58",
"reply_reviewers": "1;1;0;0;0", "reply_authors": "2;3;1;1;1", "recommendation_avg": [ 5.2, 1.16619037896906 ], "confidence_avg": [ 3.0, 0.8944271909999159 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 95.4, 42.75792324236527 ], "wc_summary_review_avg": [ 71.0, 62.906279495770534 ], "wc_main_review_avg": [ 338.2, 190.54070431275306 ], "wc_review_avg": [ 504.6, 272.2532644432386 ], "wc_reply_reviewers_avg": [ 138.2, 169.85688093215416 ], "wc_reply_authors_avg": [ 702.0, 584.8114225970625 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5752237416355278, "corr_recommendation_correctness": 0.9432422182837988, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ICy7_E8Ke-8J:scholar.google.com/&scioq=Fundamental+Limits+of+Transfer+Learning+in+Binary+Classifications&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "PER-ETD: A Polynomially Efficient Emphatic Temporal Difference Learning Method", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6938", "id": "-HSOjDPfhBJ", "poster": "", "openreview": "https://openreview.net/forum?id=-HSOjDPfhBJ", "slides": "https://iclr.cc/virtual/2022/poster/6938", "video": "https://iclr.cc/virtual/2022/poster/6938", "author_site": "Ziwei Guan, Tengyu Xu, Yingbin Liang", "tldr": "", "abstract": "Emphatic temporal difference (ETD) learning (Sutton et al., 2016) is a successful method to conduct the off-policy value function evaluation with function approximation. Although ETD has been shown to converge asymptotically to a desirable value function, it is well-known that ETD often encounters a large variance so that its sample complexity can increase exponentially fast with the number of iterations. In this work, we propose a new ETD method, called PER-ETD (i.e., PEriodically Restarted-ETD), which restarts and updates the follow-on trace only for a finite period for each iteration of the evaluation parameter. Further, PER-ETD features a design of the logarithmical increase of the restart period with the number of iterations, which guarantees the best trade-off between the variance and bias and keeps both vanishing sublinearly. We show that PER-ETD converges to the same desirable fixed point as ETD, but improves the exponential sample complexity of ETD to be polynomials. 
Our experiments validate the superior performance of PER-ETD and its advantage over ETD.", "keywords": "emphatic temporal difference;finite-time analysis;off-policy evaluation;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ziwei Guan;Tengyu Xu;Yingbin Liang", "authorids": "~Ziwei_Guan1;~Tengyu_Xu1;~Yingbin_Liang1", "gender": ";;F", "homepage": ";;https://sites.google.com/view/yingbinliang/home", "dblp": ";;51/332", "google_scholar": ";;lGgLAiIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ziwei_Guan1;~Tengyu_Xu1;~Yingbin_Liang1", "aff": ";;The Ohio State University", "aff_domain": ";;osu.edu", "position": ";;Professor", "bibtex": "@inproceedings{\nguan2022peretd,\ntitle={{PER}-{ETD}: A Polynomially Efficient Emphatic Temporal Difference Learning Method},\nauthor={Ziwei Guan and Tengyu Xu and Yingbin Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-HSOjDPfhBJ}\n}", "github": "", "project": "", "reviewers": "M48z;RfbH;vCDh;TgfL", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;3;3;3", "correctness": "4;3;4;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "81;84;133;41", "wc_summary_review": "59;63;38;61", "wc_main_review": "380;58;248;282", "wc_review": "520;205;419;384", "wc_reply_reviewers": "11;0;0;16", "wc_reply_authors": "1142;310;418;430", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 84.75, 32.62188682464581 ], "wc_summary_review_avg": [ 55.25, 10.059199769365355 ], "wc_main_review_avg": [ 242.0, 116.76472069936193 ], "wc_review_avg": [ 382.0, 113.738735706003 ], "wc_reply_reviewers_avg": [ 6.75, 6.977642868476432 ], "wc_reply_authors_avg": [ 575.0, 330.6765791525006 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8816612848235247911&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=-HSOjDPfhBJ", "email": ";;osu.edu", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Ohio State University", "aff_unique_dep": "", "aff_unique_url": "https://www.osu.edu", "aff_unique_abbr": "OSU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "-JW-1Fg-v2", "title": "Language-Guided Image Clustering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Image clustering methods have rapidly improved their ability to discover object categories. However, unsupervised clustering methods struggle on other image attributes, e.g. age or activity. The reason is that most recent clustering methods learn deep features that are designed to be sensitive to object category, but less so to other image attributes. We propose to overcome this limitation by introducing the new setting of language-guided image clustering. In this setting, the model is provided with an exhaustive list of phrases describing all the possible values of a specific attribute, together with a shared image-language embedding (e.g. CLIP). 
Our method then computes the subset of K attribute phrases that form the best clustering of the images. Unlike standard clustering methods, ours can cluster according to image attributes other than the object category. We evaluate our method on attribute clustering tasks and demonstrate that it significantly outperforms methods that do not use language guidance. ", "keywords": "Clustering", "primary_area": "", "supplementary_material": "", "author": "Niv Cohen;Yedid Hoshen", "authorids": "~Niv_Cohen1;~Yedid_Hoshen3", "gender": "M;M", "homepage": "https://www.cs.huji.ac.il/w~nivc/;https://www.cs.huji.ac.il/~ydidh/", "dblp": "259/2291;136/0280", "google_scholar": "https://scholar.google.co.il/citations?user=ZMdC3OQAAAAJ;https://scholar.google.co.il/citations?user=6y1-qS4AAAAJ", "orcid": ";", "linkedin": "niv-cohen-39b49521/;", "or_profile": "~Niv_Cohen1;~Yedid_Hoshen3", "aff": "Hebrew University of Jerusalem;Hebrew University of Jerusalem", "aff_domain": "huji.ac.il;huji.ac.il", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\ncohen2022languageguided,\ntitle={Language-Guided Image Clustering},\nauthor={Niv Cohen and Yedid Hoshen},\nyear={2022},\nurl={https://openreview.net/forum?id=-JW-1Fg-v2}\n}", "github": "", "project": "", "reviewers": "oFLH;SG37;aoq8", "site": "https://openreview.net/forum?id=-JW-1Fg-v2", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "3;3;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "105;63;87", "wc_summary_review": "148;32;74", "wc_main_review": "744;639;81", "wc_review": "997;734;242", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "511;381;173", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.0, 17.204650534085253 ], "wc_summary_review_avg": [ 84.66666666666667, 47.95368135561185 ], "wc_main_review_avg": [ 488.0, 290.9673521204742 ], "wc_review_avg": [ 657.6666666666666, 312.9178095851298 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 355.0, 139.20727950314478 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.7559289460184545, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:t4TS81PL9ZwJ:scholar.google.com/&scioq=Language-Guided+Image+Clustering&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "", "aff_unique_url": "https://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "-NefWT-x2xE", "title": "DYNASHARE: DYNAMIC NEURAL NETWORKS FOR MULTI-TASK LEARNING", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Parameter sharing approaches for deep multi-task learning share a common intuition: for a single network to perform multiple prediction tasks, the network needs to support multiple specialized execution paths. However, previous parameter sharing approaches have relied on a static network structure for each task.
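Returning to the Language-Guided Image Clustering entry above: below is a hedged sketch of the selection step it describes, assuming L2-normalized image and phrase embeddings from a shared model such as CLIP. The greedy search is an illustrative stand-in for whatever optimization the paper actually uses.

```python
# Hedged sketch: score a candidate set of attribute phrases by how well the
# images cluster around them, and greedily keep the best K phrases.
import numpy as np

def cluster_score(img_emb, phrase_emb):
    # img_emb: (n, d), phrase_emb: (k, d); both assumed L2-normalized.
    sims = img_emb @ phrase_emb.T      # cosine similarities, shape (n, k)
    return sims.max(axis=1).mean()     # each image scored by its nearest phrase

def greedy_phrase_subset(img_emb, all_phrase_emb, K):
    chosen, remaining = [], list(range(len(all_phrase_emb)))
    for _ in range(K):
        best = max(remaining, key=lambda j: cluster_score(
            img_emb, all_phrase_emb[chosen + [j]]))
        chosen.append(best)
        remaining.remove(best)
    return chosen  # indices of the K phrases that best cluster the images
```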
In this paper, we propose to increase the capacity for a single network to support multiple tasks by radically increasing the space of possible specialized execution paths. DynaShare is a new approach to deep multi-task learning that learns from the training data a hierarchical gating policy consisting of a task-specific policy for coarse layer selection and gating units for individual input instances, which work together to determine the execution path at inference time. Experimental results on standard multi-task learning benchmark datasets demonstrate the potential of the proposed approach.", "keywords": "multi-task learning;dynamic networks;adaptive inference;neural network", "primary_area": "", "supplementary_material": "", "author": "Golara Javadi;Frederick Tung;Gabriel L. Oliveira", "authorids": "~Golara_Javadi1;~Frederick_Tung1;~Gabriel_L._Oliveira1", "gender": "F;M;M", "homepage": ";;https://sites.google.com/view/gabriel-leivas-oliveira/home", "dblp": ";10/7697;117/2073", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=T4EeZ9gAAAAJ;5anRZEcAAAAJ", "orcid": ";;0000-0003-0099-9873", "linkedin": "golara-javadi-76036546/;;", "or_profile": "~Golara_Javadi1;~Frederick_Tung1;~Gabriel_L._Oliveira1", "aff": "University of British Columbia;Borealis AI;Borealis AI", "aff_domain": "ubc.ca;borealisai.com;borealisai.com", "position": "PhD student;Researcher;Senior Machine Learning Researcher", "bibtex": "@misc{\njavadi2022dynashare,\ntitle={{DYNASHARE}: {DYNAMIC} {NEURAL} {NETWORKS} {FOR} {MULTI}-{TASK} {LEARNING}},\nauthor={Golara Javadi and Frederick Tung and Gabriel L. Oliveira},\nyear={2022},\nurl={https://openreview.net/forum?id=-NefWT-x2xE}\n}", "github": "", "project": "", "reviewers": "GsXt;R5f1;4hSD;CQ9i;xZvp", "site": "https://openreview.net/forum?id=-NefWT-x2xE", "pdf_size": 0, "recommendation": "3;3;3;3;3", "confidence": "4;4;4;4;5", "correctness": "2;3;2;3;2", "technical_novelty": "3;2;3;1;3", "empirical_novelty": "3;3;2;1;1", "wc_summary_paper": "90;68;120;51;50", "wc_summary_review": "421;59;134;14;34", "wc_main_review": "303;262;776;91;156", "wc_review": "814;389;1030;156;240", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.0, 0.8944271909999159 ], "wc_summary_paper_avg": [ 75.8, 26.445415481704952 ], "wc_summary_review_avg": [ 132.4, 149.92077908015287 ], "wc_main_review_avg": [ 317.6, 241.20414590135053 ], "wc_review_avg": [ 525.8, 338.9468394896167 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:L2GH1A4S0yEJ:scholar.google.com/&scioq=DYNASHARE:+DYNAMIC+NEURAL+NETWORKS+FOR+MULTI-TASK+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of British Columbia;Borealis AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.ubc.ca;https://www.borealisai.com", "aff_unique_abbr": "UBC;Borealis AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
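A hedged PyTorch sketch of a hierarchical gate in the spirit of the DynaShare description above: a task-specific (instance-independent) layer-selection logit combined with an instance-conditioned gate, with a layer executed only when both agree. Training tricks (e.g. Gumbel or straight-through estimators) and the paper's exact architecture are omitted; all module names are hypothetical.

```python
import torch
import torch.nn as nn

class GatedLayer(nn.Module):
    def __init__(self, dim, num_tasks):
        super().__init__()
        self.layer = nn.Linear(dim, dim)
        self.task_logit = nn.Parameter(torch.zeros(num_tasks))  # coarse per-task policy
        self.inst_gate = nn.Linear(dim, 1)                      # per-instance gate

    def forward(self, x, task_id):
        p_task = torch.sigmoid(self.task_logit[task_id])  # task-level keep probability
        p_inst = torch.sigmoid(self.inst_gate(x))         # instance-level keep probability
        gate = (p_task * p_inst > 0.5).float()            # hard decision at inference
        # run the layer when the gate is open, otherwise skip via the residual path
        return gate * torch.relu(self.layer(x)) + (1 - gate) * x
```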
"aff_country_unique": "Canada" }, { "id": "-Nf6TikpjQ", "title": "Multi-agent Performative Prediction: From Global Stability and Optimality to Chaos", "track": "main", "status": "Reject", "tldr": "", "abstract": "The recent framework of performative prediction is aimed at capturing settings where predictions influence the target/outcome they try to predict. In this paper, we introduce a natural multi-agent version of this framework, where multiple decision makers try to predict the same outcome. We showcase that such competition can result in interesting phenomena by proving the possibility of phase transitions from stability to instability and eventually chaos. Specifically, we present settings of multi-agent performative prediction where under sufficient conditions, their dynamics lead to global stability and optimality. In the opposite direction, when the agents are not sufficiently cautious in their learning/updates rates, we show that instability and in fact formal chaos is possible. We complement our theoretical predictions with simulations showcasing the predictive power of our results. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Georgios Piliouras;Fang-Yi Yu", "authorids": "~Georgios_Piliouras1;~Fang-Yi_Yu1", "gender": ";", "homepage": ";https://cs.gmu.edu/~fangyiyu/", "dblp": "62/1236;183/3719", "google_scholar": ";LhLBzWEAAAAJ", "orcid": ";0000-0002-3697-8807", "linkedin": ";", "or_profile": "~Georgios_Piliouras1;~Fang-Yi_Yu1", "aff": "Singapore University of Technology and Design;School of Engineering and Applied Sciences, Harvard University", "aff_domain": "sutd.edu.sg;seas.harvard.edu", "position": "Associate Professor;Postdoc", "bibtex": "@misc{\npiliouras2022multiagent,\ntitle={Multi-agent Performative Prediction: From Global Stability and Optimality to Chaos},\nauthor={Georgios Piliouras and Fang-Yi Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=-Nf6TikpjQ}\n}", "github": "", "project": "", "reviewers": "xAh2;jNt2;RUeS", "site": "https://openreview.net/forum?id=-Nf6TikpjQ", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;3", "correctness": "4;3;4", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "227;42;111", "wc_summary_review": "94;106;147", "wc_main_review": "235;549;388", "wc_review": "556;697;646", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "730;814;284", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 126.66666666666667, 76.33406113190159 ], "wc_summary_review_avg": [ 115.66666666666667, 22.69116323349001 ], "wc_main_review_avg": [ 390.6666666666667, 128.2038307626657 ], "wc_review_avg": [ 633.0, 58.29236656715869 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 609.3333333333334, 232.58737904041328 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11394781984431664594&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Singapore University of Technology 
and Design;Harvard University", "aff_unique_dep": ";School of Engineering and Applied Sciences", "aff_unique_url": "https://www.sutd.edu.sg;https://www.harvard.edu", "aff_unique_abbr": "SUTD;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1", "aff_country_unique": "Singapore;United States" }, { "id": "-O_9iYmcbZm", "title": "Zero-Round Active Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Active learning (AL) aims at reducing labeling efforts by identifying the most valuable unlabeled data points from a large pool. Traditional AL frameworks have two limitations: First, they perform data selection in a multi-round manner, which is time-consuming and impractical. Second, they usually assume that a small number of labeled data points are available \\emph{in the same domain as} the data in the unlabeled pool. \nOur paper investigates a new setting in active learning: how to conduct active learning without relying on pre-labeled data, a setting that is under-explored yet of great practical value.\nWe propose $D^2ULO$ as a solution to both issues; it leverages the idea of domain adaptation (DA) to train a data utility model that can effectively predict the utility of any given unlabeled data point in the target domain once it is labeled. The trained data utility model can then be used to select high-utility data and, at the same time, provide an estimate of the utility of the selected data. Our algorithm does not rely on any feedback from annotators in the target domain; hence, it can not only work standalone but also benefit existing multi-round active learning algorithms by providing a warm start. \nOur experiments show that $D^2ULO$ outperforms existing state-of-the-art AL strategies equipped with domain adaptation over various domain shift settings (e.g., real-to-real data and synthetic-to-real data). In particular, $D^2ULO$ is applicable to the scenario where source and target labels are mismatched, which is not supported by existing works. 
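A minimal sketch of the zero-round selection step just described; `utility_model` (standing in for the DA-trained data utility model) is a hypothetical callable, and only the single-round top-B selection is shown.

```python
# Hedged sketch: score the unlabeled target pool once with the utility
# model, then send the top-B points for labeling in a single round.
import numpy as np

def zero_round_select(utility_model, unlabeled_pool, budget):
    scores = np.array([utility_model(x) for x in unlabeled_pool])
    top = np.argsort(scores)[::-1][:budget]   # highest predicted utility first
    return top, scores[top]                   # indices to label, plus utility estimates
```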
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f2aa1f158a47726172e545b9f948fb3302b43508.zip", "author": "Si Chen;Tianhao Wang;Ruoxi Jia", "authorids": "~Si_Chen5;~Tianhao_Wang2;~Ruoxi_Jia1", "gender": ";M;", "homepage": ";https://tianhaowang.netlify.app/;https://ruoxijia.info/", "dblp": ";274/2144;147/5355-1", "google_scholar": ";nvQOtgkAAAAJ;JCrug-YAAAAJ", "orcid": ";;", "linkedin": ";tian-hao-wang/;", "or_profile": "~Si_Chen5;~Tianhao_Wang2;~Ruoxi_Jia1", "aff": ";Princeton University;Virginia Tech", "aff_domain": ";princeton.edu;vt.edu", "position": ";PhD student;Assistant Professor", "bibtex": "@misc{\nchen2022zeroround,\ntitle={Zero-Round Active Learning},\nauthor={Si Chen and Tianhao Wang and Ruoxi Jia},\nyear={2022},\nurl={https://openreview.net/forum?id=-O_9iYmcbZm}\n}", "github": "", "project": "", "reviewers": "3onk;GF2P;n2HN;pfJ3", "site": "https://openreview.net/forum?id=-O_9iYmcbZm", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;5", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "76;190;149;64", "wc_summary_review": "260;74;61;26", "wc_main_review": "334;367;494;488", "wc_review": "670;631;704;578", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 119.75, 51.99218691303531 ], "wc_summary_review_avg": [ 105.25, 91.05321246392134 ], "wc_main_review_avg": [ 420.75, 71.24385938451117 ], "wc_review_avg": [ 645.75, 46.874166659259124 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1800589458370307565&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Princeton University;Virginia Tech", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.vt.edu", "aff_unique_abbr": "Princeton;VT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "-RAFyM-YPj", "title": "Counting Substructures with Higher-Order Graph Neural Networks: Possibility and Impossibility Results", "track": "main", "status": "Reject", "tldr": "", "abstract": "While message passing Graph Neural Networks (GNNs) have become increasingly popular architectures for learning with graphs, recent works have revealed important shortcomings in their expressive power. In response, several higher-order GNNs have been proposed that substantially increase the expressive power, albeit at a large computational cost. Motivated by this gap, we explore alternative strategies and lower bounds. In particular, we analyze a new recursive pooling technique of local neighborhoods that allows different tradeoffs of computational cost and expressive power. First, we prove that this model can count subgraphs of size $k$, and thereby overcomes a known limitation of low-order GNNs. 
Second, we show how recursive pooling can exploit sparsity to reduce the computational complexity compared to the existing higher-order GNNs. More generally, we provide a (near) matching information-theoretic lower bound for counting subgraphs with graph representations that pool over representations of derived (sub-)graphs. We also discuss lower bounds on time complexity.", "keywords": "graph neural networks;expressive power;complexity", "primary_area": "", "supplementary_material": "/attachment/2c83a92504b358836c78c5c87ff8ea827884e3ff.zip", "author": "Behrooz Tahmasebi;Derek Lim;Stefanie Jegelka", "authorids": "~Behrooz_Tahmasebi1;~Derek_Lim1;~Stefanie_Jegelka3", "gender": "M;M;F", "homepage": "https://people.csail.mit.edu/bzt/;https://cptq.github.io/;http://people.csail.mit.edu/stefje/", "dblp": "223/0884;267/5433;38/7003", "google_scholar": "ZXCO3DMAAAAJ;y9YTBIsAAAAJ;gTWUZlsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Behrooz_Tahmasebi1;~Derek_Lim1;~Stefanie_Jegelka3", "aff": "Massachusetts Institute of Technology;Meta Facebook;Massachusetts Institute of Technology", "aff_domain": "mit.edu;fb.com;mit.edu", "position": "PhD student;Intern;Associate Professor", "bibtex": "@misc{\ntahmasebi2022counting,\ntitle={Counting Substructures with Higher-Order Graph Neural Networks: Possibility and Impossibility Results},\nauthor={Behrooz Tahmasebi and Derek Lim and Stefanie Jegelka},\nyear={2022},\nurl={https://openreview.net/forum?id=-RAFyM-YPj}\n}", "github": "", "project": "", "reviewers": "bVPW;vpVz;BQ8d;572g;T3yH", "site": "https://openreview.net/forum?id=-RAFyM-YPj", "pdf_size": 0, "recommendation": "5;5;6;6;6", "confidence": "3;3;4;3;5", "correctness": "3;3;3;3;4", "technical_novelty": "3;3;3;3;4", "empirical_novelty": "2;0;2;2;2", "wc_summary_paper": "141;87;95;67;210", "wc_summary_review": "22;18;125;41;76", "wc_main_review": "483;219;517;186;1505", "wc_review": "646;324;737;294;1791", "wc_reply_reviewers": "189;43;237;44;414", "wc_reply_authors": "1129;721;364;390;1054", "reply_reviewers": "1;1;1;1;1", "reply_authors": "2;1;1;1;2", "recommendation_avg": [ 5.6, 0.48989794855663565 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 3.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 1.6, 0.8000000000000002 ], "wc_summary_paper_avg": [ 120.0, 51.11555536233564 ], "wc_summary_review_avg": [ 56.4, 39.962982871652606 ], "wc_main_review_avg": [ 582.0, 480.52887530303525 ], "wc_review_avg": [ 758.4, 544.7430954128744 ], "wc_reply_reviewers_avg": [ 185.4, 137.9892749455551 ], "wc_reply_authors_avg": [ 731.6, 320.5623808247 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6123724356957945, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6648759520800597486&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://web.mit.edu;https://meta.com", "aff_unique_abbr": "MIT;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Non-Parallel Text Style Transfer with Self-Parallel Supervision", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6175", "id": "-TSe5o7STVR", "poster": "", "openreview": "https://openreview.net/forum?id=-TSe5o7STVR", "slides": "https://iclr.cc/virtual/2022/poster/6175", "video": "https://iclr.cc/virtual/2022/poster/6175", "author_site": "Ruibo Liu, CHONGYANG GAO, Chenyan Jia, Guangxuan Xu, Soroush Vosoughi", "tldr": "", "abstract": "The performance of existing text style transfer models is severely limited by the non-parallel datasets on which the models are trained. In non-parallel datasets, no direct mapping exists between sentences of the source and target style; the style transfer models thus only receive weak supervision of the target sentences during training, which often leads the model to discard too much style-independent information, or utterly fail to transfer the style.\n\nIn this work, we propose LaMer, a novel text style transfer framework based on large-scale language models. LaMer first mines the roughly parallel expressions in the non-parallel datasets with scene graphs, and then employs MLE training, followed by imitation learning refinement, to leverage the intrinsic parallelism within the data. On two benchmark tasks (sentiment & formality transfer) and a newly proposed challenging task (political stance transfer), our model achieves qualitative advances in transfer accuracy, content preservation, and fluency. Further empirical and human evaluations demonstrate that our model not only makes training more efficient, but also generates more readable and diverse expressions than previous models.", "keywords": "style transfer;non-parallel corpus;imitation learning;language models;political stance transfer", "primary_area": "", "supplementary_material": "/attachment/e2ae99e37d9af5e859e28d2d17d4dd76a85c092d.zip", "author": "Ruibo Liu;Chongyang Gao;Chenyan Jia;Guangxuan Xu;Soroush Vosoughi", "authorids": "~Ruibo_Liu1;~Chongyang_Gao1;~Chenyan_Jia1;~Guangxuan_Xu1;~Soroush_Vosoughi1", "gender": "M;;F;M;", "homepage": "https://www.cs.dartmouth.edu/~rbliu/;https://gcyzsl.github.io/;https://www.jiachenyan.com/;;https://www.cs.dartmouth.edu/~soroush/", "dblp": ";259/8515;278/8322;278/8544.html;01/1709", "google_scholar": "5lgfeo4AAAAJ;HEAgatAAAAAJ;S34REOAAAAAJ;ohsEWqsAAAAJ;45DAXkwAAAAJ", "orcid": ";0000-0002-2358-4710;0000-0002-8407-9224;;0000-0002-2564-8909", "linkedin": ";chongyang-gao-685597116/;;;", "or_profile": "~Ruibo_Liu1;~Chongyang_Gao1;~Chenyan_Jia1;~Guangxuan_Xu1;~Soroush_Vosoughi1", "aff": "Dartmouth College;Northwestern University;University of Texas, Austin;University of California, Los Angeles;Dartmouth College", "aff_domain": "dartmouth.edu;northwestern.edu;utexas.edu;ucla.edu;dartmouth.edu", "position": "PhD student;PhD student;PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nliu2022nonparallel,\ntitle={Non-Parallel Text Style Transfer with Self-Parallel Supervision},\nauthor={Ruibo Liu and Chongyang Gao and Chenyan Jia and Guangxuan Xu and Soroush Vosoughi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-TSe5o7STVR}\n}", "github": "", "project": "", "reviewers": "7agN;Hwf9;53GA;itTT;8fDY", "pdf_size": 0, "recommendation": "3;6;6;8;8", "confidence": "4;3;5;4;4", "correctness": "3;3;3;4;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "3;3;2;3;3", "wc_summary_paper": "74;110;58;262;93", "wc_summary_review": "22;79;31;119;116", "wc_main_review": "250;363;393;626;852", "wc_review": "346;552;482;1007;1061", "wc_reply_reviewers": 
"0;269;280;243;0", "wc_reply_authors": "557;1268;1398;1075;626", "reply_reviewers": "0;2;1;3;0", "reply_authors": "1;3;2;4;1", "recommendation_avg": [ 6.2, 1.8330302779823362 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 119.4, 73.41825386101198 ], "wc_summary_review_avg": [ 73.4, 40.90281163929932 ], "wc_main_review_avg": [ 496.8, 215.6139142077802 ], "wc_review_avg": [ 689.6, 289.4039391577108 ], "wc_reply_reviewers_avg": [ 158.4, 129.8901073985236 ], "wc_reply_authors_avg": [ 984.8, 337.88246477140535 ], "reply_reviewers_avg": [ 1.2, 1.1661903789690602 ], "reply_authors_avg": [ 2.2, 1.16619037896906 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4909902530309828, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14757482519407793869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=-TSe5o7STVR", "email": "dartmouth.edu;northwestern.edu;utexas.edu;ucla.edu;dartmouth.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Dartmouth College;Northwestern University;University of Texas at Austin;University of California, Los Angeles", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.dartmouth.edu;https://www.northwestern.edu;https://www.utexas.edu;https://www.ucla.edu", "aff_unique_abbr": "Dartmouth;NU;UT Austin;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Austin;Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "-Txy_1wHJ4f", "title": "Safe Deep RL in 3D Environments using Human Feedback", "track": "main", "status": "Reject", "tldr": "", "abstract": "Agents should avoid unsafe behaviour during both training and deployment. This typically requires a simulator and a procedural specification of unsafe behaviour. Unfortunately, a simulator is not always available, and procedurally specifying constraints can be difficult or impossible for many real-world tasks. A recently introduced technique, ReQueST, aims to solve this problem by learning a neural simulator of the environment from safe human trajectories, then using the learned simulator to efficiently learn a reward model from human feedback. However, it is yet unknown whether this approach is feasible in complex 3D environments with feedback obtained from real humans - whether sufficient pixel-based neural simulator quality can be achieved, and whether the human data requirements are viable in terms of both quantity and quality. In this paper we answer this question in the affirmative, using ReQueST to train an agent to perform a 3D first-person object collection task using data entirely from human contractors. 
We show that the resulting agent exhibits an order of magnitude reduction in unsafe behaviour compared to standard reinforcement learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthew Rahtz;Vikrant Varma;Ramana Kumar;Zachary Kenton;Shane Legg;Jan Leike", "authorids": "~Matthew_Rahtz1;~Vikrant_Varma1;~Ramana_Kumar1;~Zachary_Kenton2;~Shane_Legg1;~Jan_Leike1", "gender": "M;;;M;M;M", "homepage": "http://amid.fish/;;;https://zackenton.github.io/;http://www.vetta.org;https://jan.leike.name", "dblp": "175/1944;281/7099;;209/9980;36/5739;https://dblp.uni-trier.de/pers/hd/l/Leike:Jan", "google_scholar": ";EPYHbToAAAAJ;OyX1-qYAAAAJ;https://scholar.google.co.uk/citations?hl=en;;beiWcokAAAAJ", "orcid": ";;;;;", "linkedin": ";;;zac-kenton-824429124/;;", "or_profile": "~Matthew_Rahtz1;~Vikrant_Varma1;~Ramana_Kumar1;~Zachary_Kenton2;~Shane_Legg1;~Jan_Leike1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;OpenAI", "aff_domain": "deepmind.com;deepmind.com;deepmind.com;google.com;deepmind.com;openai.com", "position": "Researcher;Researcher;Researcher;Researcher;Chief Scientist;Alignment Team Lead", "bibtex": "@misc{\nrahtz2022safe,\ntitle={Safe Deep {RL} in 3D Environments using Human Feedback},\nauthor={Matthew Rahtz and Vikrant Varma and Ramana Kumar and Zachary Kenton and Shane Legg and Jan Leike},\nyear={2022},\nurl={https://openreview.net/forum?id=-Txy_1wHJ4f}\n}", "github": "", "project": "", "reviewers": "yf75;gqNf;rmJm", "site": "https://openreview.net/forum?id=-Txy_1wHJ4f", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;3;3", "correctness": "4;3;4", "technical_novelty": "1;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "107;52;425", "wc_summary_review": "36;32;43", "wc_main_review": "455;108;278", "wc_review": "598;192;746", "wc_reply_reviewers": "393;0;0", "wc_reply_authors": "578;152;738", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 194.66666666666666, 164.41073228013093 ], "wc_summary_review_avg": [ 37.0, 4.546060565661952 ], "wc_main_review_avg": [ 280.3333333333333, 141.67176461415622 ], "wc_review_avg": [ 512.0, 234.2021918485535 ], "wc_reply_reviewers_avg": [ 131.0, 185.26197667087544 ], "wc_reply_authors_avg": [ 489.3333333333333, 247.31266760025773 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8921262462306815837&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "Google;OpenAI", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://openai.com", "aff_unique_abbr": "DeepMind;OpenAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "-VsGCG_AQ69", "title": "MARNET: Backdoor Attacks against 
Value-Decomposition Multi-Agent Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent works have revealed that backdoor attacks against Deep Reinforcement Learning (DRL) could lead to abnormal action selection of the agent, which may result in failure or even catastrophe in crucial decision processes. However, existing attacks only consider single-agent RL systems, in which the only agent can observe the global state and have full control of the decision process. In this paper, we explore a new backdoor attack paradigm in cooperative multi-agent reinforcement learning (CMARL) scenarios, where a group of agents coordinate with each other to achieve a common goal, while each agent can only observe the local state, e.g., StarCraft II (Vinyals et al. (2017)). In the proposed MARNet attack framework, we carefully design a pipeline of trigger design, action poisoning and reward hacking modules to accommodate the cooperative multi-agent momentums. In particular, as only a subset of agents can observe the triggers in their local observations, we maneuver their actions to the worst actions suggested by an expert policy model. Since the global reward in CMARL is aggregated by individual rewards from all agents, we propose to modify the reward in a way that boosts the bad actions of poisoned agents (agents who observe the triggers) but mitigates the influence on non-poisoned agents. We conduct extensive experiments on two classical MARL algorithms VDN (Sunehag et al. (2018)) and QMIX (Rashid et al. (2018)), in two popular CMARL games Predator Prey (Boehmer et al. (2020)) and SMAC (Samvelyan et al. (2019)). The results show that MARNet outperforms baselines extended from single-agent DRL backdoor attacks TrojDRL (Kiourti et al. (2020)) and Multitasking learning (Ashcraft & Karra (2021)) by reducing the utility under attack by as much as 100%. 
We apply fine-tuning as a defense against MARNet, and demonstrate that fine-tuning cannot entirely eliminate the effect of the attack.", "keywords": "Backdoor Attacks;Multi-Agent Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Yanjiao Chen;Zhicong Zheng;Xueluan Gong", "authorids": "~Yanjiao_Chen1;~Zhicong_Zheng1;~Xueluan_Gong1", "gender": "F;M;F", "homepage": "https://person.zju.edu.cn/0020875;;", "dblp": ";;", "google_scholar": ";0stl8FoAAAAJ;8vwOEGcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yanjiao_Chen1;~Zhicong_Zheng1;~Xueluan_Gong1", "aff": "Zhejiang University, Tsinghua University;Wuhan University;Wuhan University", "aff_domain": "zju.edu.cn;whu.edu.cn;whu.edu.cn", "position": "Researcher;Undergrad student;PhD student", "bibtex": "@misc{\nchen2022marnet,\ntitle={{MARNET}: Backdoor Attacks against Value-Decomposition Multi-Agent Reinforcement Learning},\nauthor={Yanjiao Chen and Zhicong Zheng and Xueluan Gong},\nyear={2022},\nurl={https://openreview.net/forum?id=-VsGCG_AQ69}\n}", "github": "", "project": "", "reviewers": "ox3d;vKjZ;5vmH;cWNd", "site": "https://openreview.net/forum?id=-VsGCG_AQ69", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;3;2", "correctness": "2;3;3;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;0;1;3", "wc_summary_paper": "93;34;32;73", "wc_summary_review": "28;24;15;55", "wc_main_review": "576;240;220;154", "wc_review": "697;298;267;282", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 58.0, 25.990382836734053 ], "wc_summary_review_avg": [ 30.5, 14.908051515875574 ], "wc_main_review_avg": [ 297.5, 163.9107989120912 ], "wc_review_avg": [ 386.0, 179.89024431580498 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2889531734646497183&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Zhejiang University;Wuhan University", "aff_unique_dep": ";", "aff_unique_url": "http://www.zju.edu.cn;http://www.whu.edu.cn/", "aff_unique_abbr": "ZJU;WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "-YAqAIsxr7v", "title": "OVD-Explorer: A General Information-theoretic Exploration Approach for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many exploration strategies are built upon the optimism in the face of uncertainty (OFU) principle in reinforcement learning. However, without considering the aleatoric uncertainty, existing methods may over-explore state-action pairs with large randomness and hence are not robust. In this paper, we explicitly capture the aleatoric uncertainty from a distributional perspective and propose an information-theoretic exploration method named Optimistic Value Distribution Explorer (OVD-Explorer). 
OVD-Explorer follows the OFU principle, but more importantly, it avoids exploring the areas with high aleatoric uncertainty through maximizing the mutual information between policy and the upper bounds of policy's returns. Furthermore, to make OVD-Explorer tractable for continuous RL, we derive a closed form solution, and integrate it with SAC, which, to our knowledge, for the first time alleviates the negative impact on exploration caused by aleatoric uncertainty for continuous RL. Empirical evaluations on the commonly used Mujoco benchmark and a novel GridChaos task demonstrate that OVD-Explorer can alleviate over-exploration and outperform state-of-the-art methods.", "keywords": "Exploration;Uncertainty;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/5e46a7c71b711f0eb9592cb6a6a463eef016fa56.zip", "author": "Jinyi Liu;Zhi Wang;YAN ZHENG;Jianye HAO;Junjie Ye;Chenjia Bai;Pengyi Li", "authorids": "~Jinyi_Liu1;~Zhi_Wang4;~YAN_ZHENG1;~Jianye_HAO1;~Junjie_Ye1;~Chenjia_Bai2;~Pengyi_Li1", "gender": ";;M;M;;M;M", "homepage": ";;https://yanzzzzz.github.io;http://www.icdai.org/jianye.html;;https://baichenjia.github.io/;https://yeshenpy.github.io/", "dblp": "192/6688-2;;10/2381-2;21/7664.html;19/8588.html;247/1943;195/6948", "google_scholar": "kaQS7NAAAAAJ;VoB6-2cAAAAJ;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;;;Rm_1y2kAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;0000-0002-0422-8235;;;0009-0009-8546-2346", "linkedin": "\u91d1\u6bc5-\u5218-5b7447118;;;;;;", "or_profile": "~Jinyi_Liu1;~Zhi_Wang4;~YAN_ZHENG1;~Jianye_HAO1;~Junjie_Ye1;~Chenjia_Bai2;~Pengyi_Li1", "aff": "Tianjin University;Ant Group;Tianjin Unibersity, China;Tianjin University;Huawei Technologies Ltd.;Harbin institute of technology;Tianjin University", "aff_domain": "tju.edu.cn;antgroup.com;tju.edu.cn;tju.edu.cn;huawei.com;hit.edu.cn;tju.edu.cn", "position": "MS student;Researcher;Associate Professor;Associate Professor;Principal Researcher;PhD student;MS student", "bibtex": "@misc{\nliu2022ovdexplorer,\ntitle={{OVD}-Explorer: A General Information-theoretic Exploration Approach for Reinforcement Learning},\nauthor={Jinyi Liu and Zhi Wang and YAN ZHENG and Jianye HAO and Junjie Ye and Chenjia Bai and Pengyi Li},\nyear={2022},\nurl={https://openreview.net/forum?id=-YAqAIsxr7v}\n}", "github": "", "project": "", "reviewers": "xZHr;nxh5;Sfg5;YS7S", "site": "https://openreview.net/forum?id=-YAqAIsxr7v", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "3;3;3;3", "correctness": "2;2;2;3", "technical_novelty": "2;1;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "79;58;42;99", "wc_summary_review": "28;13;33;114", "wc_main_review": "466;441;471;219", "wc_review": "573;512;546;432", "wc_reply_reviewers": "0;0;141;0", "wc_reply_authors": "740;750;669;731", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.5, 21.5 ], "wc_summary_review_avg": [ 47.0, 39.37638886439436 ], "wc_main_review_avg": [ 399.25, 104.68613805084225 ], "wc_review_avg": [ 515.75, 52.964020806581516 ], "wc_reply_reviewers_avg": [ 35.25, 61.054790966802926 ], "wc_reply_authors_avg": [ 722.5, 31.610915836147488 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 
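A hedged illustration of the distributional perspective in the OVD-Explorer entry above, not the paper's derivation: with a quantile critic, an upper bound on an action's return can be read off the high quantiles, and an OFU-style score can be formed from it. The actual method maximizes the mutual-information objective described in the abstract.

```python
# Hedged illustration: an optimistic score built from the quantiles of a
# distributional (quantile) critic for a single action.
import numpy as np

def optimistic_score(quantiles, upper=0.9):
    """quantiles: sampled return quantiles for one action, shape (n,)."""
    mean = quantiles.mean()
    upside = np.quantile(quantiles, upper) - np.median(quantiles)  # upper-bound gap
    return mean + upside  # prefer actions whose return distribution has real upside
```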
12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZSLwed_T_g0J:scholar.google.com/&scioq=OVD-Explorer:+A+General+Information-theoretic+Exploration+Approach+for+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;1;0;0;2;3;0", "aff_unique_norm": "Tianjin University;Ant Group;Huawei;Harbin Institute of Technology", "aff_unique_dep": ";;Huawei Technologies;", "aff_unique_url": "http://www.tju.edu.cn;https://www.antgroup.com;https://www.huawei.com;http://www.hit.edu.cn/", "aff_unique_abbr": "TJU;Ant Group;Huawei;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "-_1NWqlnaGH", "title": "A Compositional Approach to Occlusion in Panoptic Segmentation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper concerns image segmentation, with emphasis on correctly classifying objects that are partially occluded. We present a novel approach based on compositional modeling that has proven to be effective at classifying separate instances of foreground objects. We demonstrate the efficacy of the approach by replacing the object detection pipeline in UPSNet with a compositional element that utilizes a mixture of distributions to model parts of objects. We also show extensive experimental results for the COCO and Cityscapes datasets. The results show an improvement of 2.6 points in panoptic quality for the top \u201cthing\u201d classes of COCO, and a 3.43% increase in overall recall, using standard UPSNet as a baseline. Moreover, we present qualitative results to demonstrate that improved metrics and datasets are needed for proper characterization of panoptic segmentation systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ajit Sarkaar;A. Lynn Abbott", "authorids": "~Ajit_Sarkaar1;~A._Lynn_Abbott2", "gender": ";", "homepage": "https://ajitsarkaar.github.io/;https://ece.vt.edu/people/profile/abbott", "dblp": ";47/6042", "google_scholar": ";UJVd84YAAAAJ", "orcid": ";0000-0003-4363-3118", "linkedin": ";", "or_profile": "~Ajit_Sarkaar1;~Lynn_Abbott1", "aff": ";Virginia Tech", "aff_domain": ";vt.edu", "position": ";Full Professor", "bibtex": "@misc{\nsarkaar2022a,\ntitle={A Compositional Approach to Occlusion in Panoptic Segmentation},\nauthor={Ajit Sarkaar and A. 
Lynn Abbott},\nyear={2022},\nurl={https://openreview.net/forum?id=-_1NWqlnaGH}\n}", "github": "", "project": "", "reviewers": "Mg3b;CLAb;fDoF;Fwx8", "site": "https://openreview.net/forum?id=-_1NWqlnaGH", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;3;5", "correctness": "3;2;1;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;1;1;3", "wc_summary_paper": "46;19;119;77", "wc_summary_review": "38;11;42;45", "wc_main_review": "170;49;137;285", "wc_review": "254;79;298;407", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 65.25, 37.204670405743414 ], "wc_summary_review_avg": [ 34.0, 13.509256086106296 ], "wc_main_review_avg": [ 160.25, 84.52033778919723 ], "wc_review_avg": [ 259.5, 118.16196511568349 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Pfd7CEFRhz0J:scholar.google.com/&scioq=A+Compositional+Approach+to+Occlusion+in+Panoptic+Segmentation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Virginia Tech", "aff_unique_dep": "", "aff_unique_url": "https://www.vt.edu", "aff_unique_abbr": "VT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "-bV96qRQuz", "title": "Ranking Convolutional Architectures by their Feature Extraction Capabilities", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The fundamental problem in Neural Architecture Search (NAS) is to efficiently find high-performing architectures from a given search space. We propose a simple but powerful method which we call FEAR, for ranking architectures in any search space. FEAR leverages the viewpoint that neural networks are powerful non-linear feature extractors. First, we train different architectures in the search space to the same training or validation error. Then, we compare the usefulness of the features extracted by each architecture. We do so with a quick training keeping most of the architecture frozen. This gives fast estimates of the relative performance. We validate FEAR on Natsbench topology search space on three different datasets against competing baselines and show strong ranking correlation especially compared to recently proposed zero-cost methods. FEAR particularly excels at ranking high-performance architectures in the search space. When used in the inner loop of discrete search algorithms like random search and local search, FEAR can cut down the search time by approximately 2.4X without losing accuracy. 
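A hedged sketch of the FEAR scoring step described above: freeze the candidate's features, quickly train only a lightweight head, and rank architectures by the resulting validation accuracy. Assumptions: `backbone` maps inputs to `feat_dim` features, and the loaders yield (input, label) batches; the paper's exact freezing and budget choices may differ.

```python
import torch
import torch.nn as nn

def fear_score(backbone, feat_dim, num_classes, train_loader, val_loader, steps=200):
    for p in backbone.parameters():
        p.requires_grad_(False)                  # freeze the candidate's features
    head = nn.Linear(feat_dim, num_classes)
    opt = torch.optim.SGD(head.parameters(), lr=0.1)
    step = 0
    while step < steps:                          # short, cheap head-only training
        for x, y in train_loader:
            if step == steps:
                break
            loss = nn.functional.cross_entropy(head(backbone(x)), y)
            opt.zero_grad(); loss.backward(); opt.step()
            step += 1
    correct = total = 0
    with torch.no_grad():
        for x, y in val_loader:
            correct += (head(backbone(x)).argmax(dim=1) == y).sum().item()
            total += y.numel()
    return correct / total                       # rank candidate architectures by this
```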
We additionally study very recently proposed zero-cost ranking measures empirically and find that their ranking performance breaks down as training proceeds, and that data-agnostic ranking scores, which ignore the dataset, do not generalize across dissimilar datasets.", "keywords": "AutoML;NAS;Neural Architecture Search;Ranking", "primary_area": "", "supplementary_material": "/attachment/443b9199902f7b3d89dee9d75a4bb4d1389c54fb.zip", "author": "Debadeepta Dey;Shital Shah;Sebastien Bubeck", "authorids": "~Debadeepta_Dey1;~Shital_Shah1;~Sebastien_Bubeck1", "gender": "M;M;", "homepage": "http://www.debadeepta.com;http://shital.com;http://sbubeck.com/", "dblp": "76/10090;188/5763;35/4292", "google_scholar": "uIBzJWIAAAAJ;1PEHzesAAAAJ;V2Y1L4sAAAAJ", "orcid": ";;", "linkedin": ";http://www.linkedin.com/in/shitals;", "or_profile": "~Debadeepta_Dey1;~Shital_Shah1;~Sebastien_Bubeck1", "aff": "Microsoft Research;Microsoft Research;Microsoft", "aff_domain": "microsoft.com;research.microsoft.com;microsoft.com", "position": "Principal Researcher;Principal Research Engineer;Researcher", "bibtex": "@misc{\ndey2022ranking,\ntitle={Ranking Convolutional Architectures by their Feature Extraction Capabilities},\nauthor={Debadeepta Dey and Shital Shah and Sebastien Bubeck},\nyear={2022},\nurl={https://openreview.net/forum?id=-bV96qRQuz}\n}", "github": "", "project": "", "reviewers": "cTsJ;QKSH;8ztp;EHhF", "site": "https://openreview.net/forum?id=-bV96qRQuz", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;5;4", "correctness": "3;2;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "139;64;27;80", "wc_summary_review": "33;419;13;28", "wc_main_review": "434;79;275;203", "wc_review": "606;562;315;311", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 77.5, 40.37635446644484 ], "wc_summary_review_avg": [ 123.25, 170.90988122399477 ], "wc_main_review_avg": [ 247.75, 128.36544511666682 ], "wc_review_avg": [ 448.5, 136.39739733587294 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R_4i9rLk7foJ:scholar.google.com/&scioq=Ranking+Convolutional+Architectures+by+their+Feature+Extraction+Capabilities&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "-cII-Vju5C", "title": "Orthogonalising gradients to speedup neural network optimisation", "track": "main", "status": "Reject", "tldr": "", "abstract": "The optimisation of neural networks can be sped up by orthogonalising the gradients before the optimisation step, ensuring the diversification of the learned representations. 
We hypothesize that components in the same layer learn the same representations at the beginning of learning. To prevent this, we orthogonalise the gradients of the components with respect to each other.\nOur method of orthogonalisation allows the weights to be used more flexibly, in contrast to restricting the weights to an orthogonalised sub-space. We tested this method on ImageNet and CIFAR-10, observing a large decrease in learning time, and also obtained a speed-up on the semi-supervised method BarlowTwins. We obtain accuracy similar to SGD without fine-tuning, and better accuracy for na\u00efvely chosen hyper-parameters.\n", "keywords": "machine learning;deep learning;orthogonalisation;optimisation;optimization", "primary_area": "", "supplementary_material": "", "author": "Mark Tuddenham;Adam Prugel-Bennett;Jonathon Hare", "authorids": "~Mark_Tuddenham1;~Adam_Prugel-Bennett1;~Jonathon_Hare1", "gender": ";M;M", "homepage": ";;http://users.soton.ac.uk/jsh2", "dblp": ";p/AdamPrugelBennett.html;13/905", "google_scholar": "3_SqM-EAAAAJ;https://scholar.google.co.uk/citations?user=oQgxYjkAAAAJ;https://scholar.google.co.uk/citations?user=UFeON5oAAAAJ", "orcid": "0000-0002-3428-4051;0000-0002-1329-5077;0000-0003-2921-4283", "linkedin": ";;jonathonhare/", "or_profile": "~Mark_Tuddenham1;~Adam_Prugel-Bennett1;~Jonathon_Hare1", "aff": "University of Southampton;University of Southampton;University of Southampton", "aff_domain": "soton.ac.uk;soton.ac.uk;soton.ac.uk", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@misc{\ntuddenham2022orthogonalising,\ntitle={Orthogonalising gradients to speedup neural network optimisation},\nauthor={Mark Tuddenham and Adam Prugel-Bennett and Jonathon Hare},\nyear={2022},\nurl={https://openreview.net/forum?id=-cII-Vju5C}\n}", "github": "", "project": "", "reviewers": "Pgpr;S4zJ;CSRa;snPa", "site": "https://openreview.net/forum?id=-cII-Vju5C", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;4;5", "correctness": "2;2;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "37;80;51;32", "wc_summary_review": "17;61;19;19", "wc_main_review": "204;457;401;237", "wc_review": "258;598;471;288", "wc_reply_reviewers": "50;0;0;25", "wc_reply_authors": "137;190;101;237", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 50.0, 18.66815470259447 ], "wc_summary_review_avg": [ 29.0, 18.49324200890693 ], "wc_main_review_avg": [ 324.75, 106.7529273603305 ], "wc_review_avg": [ 403.75, 138.65131625772617 ], "wc_reply_reviewers_avg": [ 18.75, 20.72890493972125 ], "wc_reply_authors_avg": [ 166.25, 51.67869483646041 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2501801593863183582&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southampton", "aff_unique_dep": "", "aff_unique_url": "https://www.southampton.ac.uk", "aff_unique_abbr": "Southampton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "-dzXGe2FyW6", 
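A minimal sketch of the core operation described in the gradient-orthogonalisation entry above: before the optimiser step, make the per-component gradients within a layer mutually orthogonal via Gram-Schmidt. The paper's exact procedure and any rescaling choices may differ.

```python
import torch

def orthogonalise_layer_grad(grad):
    """grad: weight gradient of shape (out_features, fan_in); rows are the
    per-component gradients that are orthogonalised against each other."""
    g = grad.reshape(grad.shape[0], -1).clone()
    for i in range(g.shape[0]):
        for j in range(i):
            denom = g[j].dot(g[j])
            if denom > 0:
                g[i] -= g[i].dot(g[j]) / denom * g[j]  # remove projection onto g_j
    return g.reshape(grad.shape)
```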
"title": "Equalized Robustness: Towards Sustainable Fairness Under Distributional Shifts", "track": "main", "status": "Reject", "tldr": "", "abstract": "Increasing concerns have been raised on deep learning fairness in recent years. Existing fairness metrics and algorithms mainly focus on the discrimination of model performance across different groups on in-distribution data. It remains unclear whether the fairness achieved on in-distribution data can be generalized to data with unseen distribution shifts, which are commonly encountered in real-world applications. In this paper, we first propose a new fairness goal, termed Equalized Robustness (ER), to impose fair model robustness against unseen distribution shifts across majority and minority groups. ER measures robustness disparity by the maximum mean discrepancy (MMD) distance between the loss curvature distributions of two groups of data. We show that previous fairness learning algorithms designed for in-distribution fairness fail to meet the new robust fairness goal. We further propose a novel fairness learning algorithm, termed Curvature Matching (CUMA), to simultaneously achieve both traditional in-distribution fairness and our new robust fairness. CUMA efficiently debiases the model robustness by minimizing the MMD distance between loss curvature distributions of two groups. Experiments on three popular datasets show CUMA achieves superior fairness in robustness against distribution shifts, without more sacrifice on either overall accuracies or the in-distribution fairness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haotao Wang;Junyuan Hong;Jiayu Zhou;Zhangyang Wang", "authorids": "~Haotao_Wang1;~Junyuan_Hong1;~Jiayu_Zhou1;~Zhangyang_Wang1", "gender": ";M;M;M", "homepage": ";https://jyhong.gitlab.io/;http://jiayuzhou.github.io/;https://vita-group.github.io", "dblp": "236/5090;185/1316;73/1353;119/4026", "google_scholar": "aMIJhlEAAAAJ;7Cbv6doAAAAJ;https://scholar.google.com.tw/citations?user=yQKlLTQAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0002-5718-5187;0000-0003-4336-6777;", "linkedin": ";;jiayuzhou/;", "or_profile": "~Haotao_Wang1;~Junyuan_Hong1;~Jiayu_Zhou1;~Zhangyang_Wang1", "aff": "University of Texas, Austin;Sony AI;Michigan State University;University of Texas, Austin", "aff_domain": "utexas.edu;sony.com;msu.edu;utexas.edu", "position": "PhD student;Intern;Associate Professor;Assistant Professor", "bibtex": "@misc{\nwang2022equalized,\ntitle={Equalized Robustness: Towards Sustainable Fairness Under Distributional Shifts},\nauthor={Haotao Wang and Junyuan Hong and Jiayu Zhou and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=-dzXGe2FyW6}\n}", "github": "", "project": "", "reviewers": "bgyC;cgsp;ebvu;xUAG", "site": "https://openreview.net/forum?id=-dzXGe2FyW6", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "4;4;2;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "23;64;107;142", "wc_summary_review": "8;68;68;98", "wc_main_review": "216;155;400;283", "wc_review": "247;287;575;523", "wc_reply_reviewers": "83;130;0;0", "wc_reply_authors": "1482;1954;756;504", "reply_reviewers": "1;1;0;0", "reply_authors": "4;6;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.0, 
44.760473634670134 ], "wc_summary_review_avg": [ 60.5, 32.69174207655505 ], "wc_main_review_avg": [ 263.5, 90.88591750100782 ], "wc_review_avg": [ 408.0, 142.89506639488994 ], "wc_reply_reviewers_avg": [ 53.25, 55.78250173665573 ], "wc_reply_authors_avg": [ 1174.0, 575.9531230924962 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 2.1213203435596424 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7106690545187014, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:upETkeT9sh0J:scholar.google.com/&scioq=Equalized+Robustness:+Towards+Sustainable+Fairness+Under+Distributional+Shifts&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Texas at Austin;Sony;Michigan State University", "aff_unique_dep": ";Sony AI;", "aff_unique_url": "https://www.utexas.edu;https://www.sony.com;https://www.msu.edu", "aff_unique_abbr": "UT Austin;Sony AI;MSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Japan" }, { "title": "Invariant Causal Representation Learning for Out-of-Distribution Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6685", "id": "-e4EXDWXnSn", "poster": "", "openreview": "https://openreview.net/forum?id=-e4EXDWXnSn", "slides": "https://iclr.cc/virtual/2022/poster/6685", "video": "https://iclr.cc/virtual/2022/poster/6685", "author_site": "Chaochao Lu, Yuhuai Wu, Jos\u00e9 Miguel Hern\u00e1ndez Lobato, Bernhard Schoelkopf", "tldr": "", "abstract": "Due to spurious correlations, machine learning systems often fail to generalize to environments whose distributions differ from the ones used at training time. Prior work addressing this, either explicitly or implicitly, attempted to find a data representation that has an invariant relationship with the target. This is done by leveraging a diverse set of training environments to reduce the effect of spurious features and build an invariant predictor. However, these methods have generalization guarantees only when both data representation and classifiers come from a linear model class. We propose invariant Causal Representation Learning (iCaRL), an approach that enables out-of-distribution (OOD) generalization in the nonlinear setting (i.e., nonlinear representations and nonlinear classifiers). It builds upon a practical and general assumption: the prior over the data representation (i.e., a set of latent variables encoding the data) given the target and the environment belongs to general exponential family distributions, i.e., a more flexible conditionally non-factorized prior that can actually capture complicated dependences between the latent variables. Based on this, we show that it is possible to identify the data representation up to simple transformations. We also show that all direct causes of the target can be fully discovered, which further enables us to obtain generalization guarantees in the nonlinear setting. Experiments on both synthetic and real-world datasets demonstrate that our approach outperforms a variety of baseline methods. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chaochao Lu;Yuhuai Wu;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;Bernhard Sch\u00f6lkopf", "authorids": "~Chaochao_Lu1;~Yuhuai_Wu1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1;~Bernhard_Sch\u00f6lkopf1", "gender": ";M;;", "homepage": "https://causallu.com/;http://www.cs.toronto.edu/~ywu/;;", "dblp": "142/2790;;;", "google_scholar": "C_Qxt0IAAAAJ;https://scholar.google.ca/citations?user=bOQGfFIAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Chaochao_Lu1;~Yuhuai_Wu1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1;~Bernhard_Sch\u00f6lkopf1", "aff": "University of Cambridge;Stanford University;;", "aff_domain": "cam.ac.uk;stanford.edu;;", "position": "PhD student;Postdoc;;", "bibtex": "@inproceedings{\nlu2022invariant,\ntitle={Invariant Causal Representation Learning for Out-of-Distribution Generalization},\nauthor={Chaochao Lu and Yuhuai Wu and Jos{\\'e} Miguel Hern{\\'a}ndez-Lobato and Bernhard Sch{\\\"o}lkopf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-e4EXDWXnSn}\n}", "github": "", "project": "", "reviewers": "Ciwc;rp1x;it5B", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;3", "correctness": "3;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;3;4", "wc_summary_paper": "47;55;57", "wc_summary_review": "216;63;29", "wc_main_review": "85;440;378", "wc_review": "348;558;464", "wc_reply_reviewers": "0;620;0", "wc_reply_authors": "1111;2456;666", "reply_reviewers": "0;4;0", "reply_authors": "2;8;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 53.0, 4.320493798938574 ], "wc_summary_review_avg": [ 102.66666666666667, 81.3319672016415 ], "wc_main_review_avg": [ 301.0, 154.8181729212261 ], "wc_review_avg": [ 456.6666666666667, 85.88881701879988 ], "wc_reply_reviewers_avg": [ 206.66666666666666, 292.27080289043965 ], "wc_reply_authors_avg": [ 1411.0, 760.9314467589486 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "reply_authors_avg": [ 3.6666666666666665, 3.091206165165235 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6237838274771137964&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=-e4EXDWXnSn", "email": "cam.ac.uk;stanford.edu;;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Cambridge;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.stanford.edu", "aff_unique_abbr": "Cambridge;Stanford", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Cambridge;Stanford", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "-e7awdzWsOc", "title": "Towards Structured Dynamic Sparse Pre-Training of BERT", "track": "main", "status": "Reject", "tldr": "", "abstract": "Identifying algorithms for computational efficient unsupervised training of large language models is an important and active area of research. 
\nIn this work, we develop and study a straightforward, dynamic always-sparse pre-training approach for BERT language modeling, which leverages periodic compression steps based on magnitude pruning followed by random parameter re-allocation. \nThis approach enables us to achieve Pareto improvements in terms of the number of floating-point operations (FLOPs) over statically sparse and dense models across a broad spectrum of network sizes. \nFurthermore, we demonstrate that training remains FLOP-efficient when using coarse-grained block sparsity, making it particularly promising for efficient execution on modern hardware accelerators.", "keywords": "sparsity;natural language processing;pre-training;computational efficiency", "primary_area": "", "supplementary_material": "", "author": "Anastasia S. D. Dietrich;Frithjof Gressmann;Douglas Orr;Ivan Chelombiev;Daniel Justus;Carlo Luschi", "authorids": "~Anastasia_S._D._Dietrich1;~Frithjof_Gressmann1;~Douglas_Orr1;~Ivan_Chelombiev1;~Daniel_Justus1;~Carlo_Luschi1", "gender": "F;M;M;M;M;M", "homepage": ";https://frthjf.com/;https://douglasorr.github.io/;;;", "dblp": ";200/0179;33/8535;;;72/10621", "google_scholar": ";https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;ZMKOlBcAAAAJ;", "orcid": "0000-0002-3839-0396;0009-0002-4155-7393;;;;", "linkedin": ";frithjof-gressmann-6a1606229/;;ivan-chelombiev-5a7790a9/;daniel-justus/;carlo-luschi-1908144/", "or_profile": "~Anastasia_S._D._Dietrich1;~Frithjof_Gressmann1;~Douglas_Orr1;~Ivan_Chelombiev1;~Daniel_Justus1;~Carlo_Luschi1", "aff": "Graphcore;University of Illinois, Urbana Champaign;Graphcore;Graphcore;Graphcore;Graphcore", "aff_domain": "graphcore.ai;illinois.edu;graphcore.ai;graphcore.ai;graphcore.ai;graphcore.ai", "position": "Researcher;PhD student;Researcher;Researcher;Researcher;Director of Research", "bibtex": "@misc{\ndietrich2022towards,\ntitle={Towards Structured Dynamic Sparse Pre-Training of {BERT}},\nauthor={Anastasia S. D. 
Dietrich and Frithjof Gressmann and Douglas Orr and Ivan Chelombiev and Daniel Justus and Carlo Luschi},\nyear={2022},\nurl={https://openreview.net/forum?id=-e7awdzWsOc}\n}", "github": "", "project": "", "reviewers": "idrP;KM7E;vEBH;KoFV;5K4c", "site": "https://openreview.net/forum?id=-e7awdzWsOc", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;4;3;4;2", "correctness": "3;3;3;2;3", "technical_novelty": "1;2;2;2;2", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "64;79;16;63;89", "wc_summary_review": "75;33;20;52;67", "wc_main_review": "267;173;239;308;346", "wc_review": "406;285;275;423;502", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "737;568;761;675;797", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 1.8, 0.4000000000000001 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 62.2, 25.055139193387053 ], "wc_summary_review_avg": [ 49.4, 20.51925924588897 ], "wc_main_review_avg": [ 266.6, 59.23039760123175 ], "wc_review_avg": [ 378.2, 86.53646630178517 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 707.6, 80.32334654382872 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5929270612815711, "corr_recommendation_correctness": -0.3952847075210474, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2009664423719562280&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "Graphcore;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.graphcore.ai;https://illinois.edu", "aff_unique_abbr": "Graphcore;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "-fORBF5k2ZB", "title": "Gating Mechanisms Underlying Sequence-to-Sequence Working Memory", "track": "main", "status": "Reject", "tldr": "", "abstract": "Working memory is the process by which a system temporarily stores information across a necessary duration. Memory retention and manipulation of discrete sequences are fundamental building blocks for the underlying computation required to perform working memory tasks. Recurrent neural networks (RNNs) have proven themselves to be powerful tools for such problems, as they, through training, give rise to the dynamical behavior necessary to enact these computations over many time-steps. As of yet, the means by which these learned internal structures of the network result in a desired set of outputs remains broadly elusive. Furthermore, what is known is often difficult to extrapolate from due to a task-specific formalism. In this work, we analyze an RNN, trained perfectly on a discrete sequence working memory task, in fine detail. We explain the learned mechanisms by which this network holds memory and extracts information from memory, and how gating is a natural architectural component to achieve these structures. A synthetic solution to a simplified variant of the working memory task is realized. We then explore how these results can be extrapolated to alternative tasks.
", "keywords": "Working Memory;RNN;Dynamical Systems;Slow Manifold;Gating", "primary_area": "", "supplementary_material": "/attachment/3d17805cd7a6c1d812402fb1259b500ae29e8f6d.zip", "author": "Ian D Jordan;Piotr A Sokol;Il Memming Park", "authorids": "~Ian_D_Jordan1;~Piotr_A_Sokol2;~Il_Memming_Park1", "gender": "M;M;M", "homepage": ";https://scholar.google.com/citations?user=MwySeOEAAAAJ&hl=en;http://catniplab.github.io/", "dblp": ";228/9266;00/4652-2", "google_scholar": ";MwySeOEAAAAJ;CsmltusAAAAJ", "orcid": ";;0000-0002-4255-7750", "linkedin": ";;memming/", "or_profile": "~Ian_D_Jordan1;~Piotr_A_Sokol2;~Il_Memming_Park1", "aff": "State University of New York, Stony Brook;State University of New York, Stony Brook;Stony Brook University", "aff_domain": "stonybrook.edu;stonybrook.edu;stonybrook.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\njordan2022gating,\ntitle={Gating Mechanisms Underlying Sequence-to-Sequence Working Memory},\nauthor={Ian D Jordan and Piotr A Sokol and Il Memming Park},\nyear={2022},\nurl={https://openreview.net/forum?id=-fORBF5k2ZB}\n}", "github": "", "project": "", "reviewers": "afLb;MToe;TN5R;AM8u", "site": "https://openreview.net/forum?id=-fORBF5k2ZB", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;4;3", "correctness": "4;3;3;4", "technical_novelty": "1;3;2;3", "empirical_novelty": "1;1;3;4", "wc_summary_paper": "63;83;46;25", "wc_summary_review": "14;38;32;60", "wc_main_review": "140;299;248;333", "wc_review": "217;420;326;418", "wc_reply_reviewers": "0;0;238;101", "wc_reply_authors": "311;661;691;274", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 54.25, 21.370248009791556 ], "wc_summary_review_avg": [ 36.0, 16.431676725154983 ], "wc_main_review_avg": [ 255.0, 72.96231904209186 ], "wc_review_avg": [ 345.25, 83.21470723375766 ], "wc_reply_reviewers_avg": [ 84.75, 97.61499628643132 ], "wc_reply_authors_avg": [ 484.25, 192.48814898585314 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": -0.14002800840280097, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qT3RXwqqGGUJ:scholar.google.com/&scioq=Gating+Mechanisms+Underlying+Sequence-to-Sequence+Working+Memory&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "State University of New York;Stony Brook University", "aff_unique_dep": ";", "aff_unique_url": "https://www.stonybrook.edu;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;SBU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "-geBFMKGlkq", "title": "Density-based Clustering with Kernel Diffusion", "track": "main", "status": "Reject", "tldr": "", "abstract": "Finding a suitable density function is essential for density-based clustering algorithms such as DBSCAN and DPC. A naive density corresponding to the indicator function of a unit $d$-dimensional Euclidean ball is commonly used in these algorithms. Such density suffers from capturing local features in complex datasets. 
To tackle this issue, we propose a new kernel diffusion density function, which is adaptive to data of varying local distributional characteristics and smoothness. Furthermore, we develop a surrogate that can be efficiently computed in linear time and space and prove that it is asymptotically equivalent to the kernel diffusion density function. Extensive empirical experiments on benchmark and large-scale face image datasets show that the proposed approach not only achieves a significant improvement over classic density-based clustering algorithms but also outperforms the state-of-the-art face clustering methods by a large margin.", "keywords": "density-based clustering;diffusion process;density function;face clustering", "primary_area": "", "supplementary_material": "/attachment/6f78f967d73466afe2e79d0fa15eb7221827e832.zip", "author": "Chao Zheng;Yingjie Chen;Chong Chen;Jianqiang Huang;Xian-Sheng Hua", "authorids": "~Chao_Zheng2;~Yingjie_Chen1;~Chong_Chen2;~Jianqiang_Huang2;~Xian-Sheng_Hua1", "gender": ";F;;M;M", "homepage": "http://www.personal.soton.ac.uk/cz1y20/;https://scholar.google.com/citations?hl=zh-CN&user=3fmmfg8AAAAJ;;https://scholar.google.com.hk/citations?user=UqAybqgAAAAJ&hl=en;", "dblp": ";;;;56/5807-1", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=UqAybqgAAAAJ;https://scholar.google.co.uk/citations?user=6G-l4o0AAAAJ", "orcid": ";;;0000-0001-5735-2910;", "linkedin": ";;;;xshua", "or_profile": "~Chao_Zheng2;~Yingjie_Chen1;~Chong_Chen2;~Jianqiang_Huang2;~Xian-Sheng_Hua1", "aff": "University of Southampton;Peking University;;Alibaba Group;Alibaba Group", "aff_domain": "soton.ac.uk;pku.edu;;alibaba-inc.com;alibaba-inc.com", "position": "Assistant Professor;PhD student;;Researcher;Distinguished Engineer", "bibtex": "@misc{\nzheng2022densitybased,\ntitle={Density-based Clustering with Kernel Diffusion},\nauthor={Chao Zheng and Yingjie Chen and Chong Chen and Jianqiang Huang and Xian-Sheng Hua},\nyear={2022},\nurl={https://openreview.net/forum?id=-geBFMKGlkq}\n}", "github": "", "project": "", "reviewers": "apj8;SSTf;BUbF;Nbci", "site": "https://openreview.net/forum?id=-geBFMKGlkq", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;5;4;4", "correctness": "2;3;2;2", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "61;70;99;103", "wc_summary_review": "32;30;77;47", "wc_main_review": "448;246;580;683", "wc_review": "541;346;756;833", "wc_reply_reviewers": "0;243;585;903", "wc_reply_authors": "1401;1213;1437;2306", "reply_reviewers": "0;2;3;5", "reply_authors": "3;3;5;5", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.25, 18.08832496390973 ], "wc_summary_review_avg": [ 46.5, 18.7949461292125 ], "wc_main_review_avg": [ 489.25, 163.28406995172554 ], "wc_review_avg": [ 619.0, 190.51115452907212 ], "wc_reply_reviewers_avg": [ 432.75, 341.9037693562328 ], "wc_reply_authors_avg": [ 1589.25, 422.4667886355092 ], "reply_reviewers_avg": [ 2.5, 1.8027756377319946 ], "reply_authors_avg": [ 4.0, 1.0 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:eiCFOBKMfn0J:scholar.google.com/&scioq=Density-based+Clustering+with+Kernel+Diffusion&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "University of Southampton;Peking University;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.southampton.ac.uk;http://www.pku.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "Southampton;Peking U;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;China" }, { "id": "-h5rboREox7", "title": "Double Descent in Adversarial Training: An Implicit Label Noise Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "Here, we show that the robust overfitting shall be viewed as the early part of an epoch-wise double descent --- the robust test error will start to decrease again after training the model for a considerable number of epochs. Inspired by our observations, we further advance the analyses of double descent to understand robust overfitting better. In standard training, double descent has been shown to be a result of label flipping noise. However, this reasoning is not applicable in our setting, since adversarial perturbations are believed not to change the label. Going beyond label flipping noise, we propose to measure the mismatch between the assigned and (unknown) true label distributions, denoted as \\emph{implicit label noise}. We show that the traditional labeling of adversarial examples inherited from their clean counterparts will lead to implicit label noise. Towards better labeling, we show that predicted distribution from a classifier, after scaling and interpolation, can provably reduce the implicit label noise under mild assumptions. 
In light of our analyses, we tailored the training objective accordingly to effectively mitigate the double descent and verified its effectiveness on three benchmark datasets.\n", "keywords": "Adversarial training;Robust overfitting;Double descent;Label noise", "primary_area": "", "supplementary_material": "/attachment/0b5a3f2f4900f694c26410369744e12594b33431.zip", "author": "Chengyu Dong;Liyuan Liu;Jingbo Shang", "authorids": "~Chengyu_Dong1;~Liyuan_Liu3;~Jingbo_Shang2", "gender": ";M;M", "homepage": "https://www.chengyu-dong.me/;https://shangjingbo1226.github.io/;https://liyuanlucasliu.github.io/", "dblp": "14/3155;151/3145.html;06/1624", "google_scholar": "Ppfi7j0AAAAJ;0SkFI4MAAAAJ;RmvbkzYAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chengyu_Dong1;~Jingbo_Shang2;~Liyuan_Liu1", "aff": "University of California, San Diego;University of California, San Diego;University of Illinois, Urbana Champaign", "aff_domain": "ucsd.edu;ucsd.edu;illinois.edu", "position": "PhD student;Assistant Professor;PhD student", "bibtex": "@misc{\ndong2022double,\ntitle={Double Descent in Adversarial Training: An Implicit Label Noise Perspective},\nauthor={Chengyu Dong and Liyuan Liu and Jingbo Shang},\nyear={2022},\nurl={https://openreview.net/forum?id=-h5rboREox7}\n}", "github": "", "project": "", "reviewers": "mzDh;JBGf;YNwA;nwPt", "site": "https://openreview.net/forum?id=-h5rboREox7", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;5", "correctness": "4;4;3;3", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;3;0;4", "wc_summary_paper": "80;108;70;100", "wc_summary_review": "76;75;81;31", "wc_main_review": "371;328;369;182", "wc_review": "527;511;520;313", "wc_reply_reviewers": "323;161;0;44", "wc_reply_authors": "2016;1200;517;351", "reply_reviewers": "1;1;0;1", "reply_authors": "4;3;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 89.5, 15.190457530963313 ], "wc_summary_review_avg": [ 65.75, 20.191272867256288 ], "wc_main_review_avg": [ 312.5, 77.27386362800814 ], "wc_review_avg": [ 467.75, 89.52478707039744 ], "wc_reply_reviewers_avg": [ 132.0, 124.989999599968 ], "wc_reply_authors_avg": [ 1021.0, 656.6928505778025 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12196791890303263889&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, San Diego;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://illinois.edu", "aff_unique_abbr": "UCSD;UIUC", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "San Diego;Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fairness in Representation for Multilingual NLP: Insights from Controlled Experiments on Conditional Language Modeling", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5935", "id": "-llS6TiOew", "poster": "", "openreview": "https://openreview.net/forum?id=-llS6TiOew", "slides": 
"https://iclr.cc/virtual/2022/poster/5935", "video": "https://iclr.cc/virtual/2022/poster/5935", "tldr": "", "abstract": "We perform systematically and fairly controlled experiments with the 6-layer Transformer to investigate the hardness in conditional-language-modeling languages which have been traditionally considered morphologically rich (AR and RU) and poor (ZH). We evaluate through statistical comparisons across 30 possible language directions from the 6 languages of the United Nations Parallel Corpus across 5 data sizes on 3 representation levels --- character, byte, and word. Results show that performance is relative to the representation granularity of each of the languages, not to the language as a whole. On the character and byte levels, we are able to eliminate statistically significant performance disparity, hence demonstrating that a language cannot be intrinsically hard. The disparity that mirrors the morphological complexity hierarchy is shown to be a byproduct of word segmentation. Evidence from data statistics, along with the fact that word segmentation is qualitatively indeterminate, renders a decades-long debate on morphological complexity (unless it is being intentionally modeled in a word-based, meaning-driven context) irrelevant in the context of computing. The intent of our work is to help effect more objectivity and adequacy in evaluation as well as fairness and inclusivity in experimental setup in the area of language and computing so to uphold diversity in Machine Learning and Artificial Intelligence research. Multilinguality is real and relevant in computing not due to canonical, structural linguistic concepts such as morphology or \"words\" in our minds, but rather standards related to internationalization and localization, such as character encoding --- something which has thus far been sorely overlooked in our discourse and curricula. 
", "keywords": "fairness;evaluation;multilingual NLP / multilinguality;representation learning for language data;statistical comparisons;Double Descent;conditional language modeling;data-centric approach;diversity in AI;morphology;Transformer;meta evaluation;visualization or interpretation of learned representations;character encoding;internationalization and localization;robustness;statistical science for NLP;science in the era of AI/DL (AIxScience);transdisciplinarity", "primary_area": "", "supplementary_material": "", "author": "Ada Wan", "authorids": "~Ada_Wan1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nwan2022fairness,\ntitle={Fairness in Representation for Multilingual {NLP}: Insights from Controlled Experiments on Conditional Language Modeling},\nauthor={Ada Wan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-llS6TiOew}\n}", "github": "", "project": "", "reviewers": "UwwP;UWb7;86m6;kpoN;y4Vs", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "3;5;4;3;4", "correctness": "3;3;3;3;3", "technical_novelty": "2;3;2;4;3", "empirical_novelty": "2;3;3;4;3", "wc_summary_paper": "93;219;108;202;53", "wc_summary_review": "110;422;141;232;301", "wc_main_review": "593;1757;355;271;61", "wc_review": "796;2398;604;705;415", "wc_reply_reviewers": "0;1686;52;102;438", "wc_reply_authors": "1923;5357;649;448;1944", "reply_reviewers": "0;6;1;1;5", "reply_authors": "4;13;1;1;8", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 135.0, 64.43911855387222 ], "wc_summary_review_avg": [ 241.2, 112.78546005580684 ], "wc_main_review_avg": [ 607.4, 599.5350198278662 ], "wc_review_avg": [ 983.6, 718.4345760053591 ], "wc_reply_reviewers_avg": [ 455.6, 633.9859935361349 ], "wc_reply_authors_avg": [ 2064.2, 1760.216509410135 ], "reply_reviewers_avg": [ 2.6, 2.4166091947189146 ], "reply_authors_avg": [ 5.4, 4.586937976471886 ], "replies_avg": [ 59, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.21821789023599233, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15237653959269888220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=-llS6TiOew", "email": "", "author_num": 1 }, { "title": "Representation-Agnostic Shape Fields", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6435", "id": "-ngwPqanCEZ", "poster": "", "openreview": "https://openreview.net/forum?id=-ngwPqanCEZ", "slides": "https://iclr.cc/virtual/2022/poster/6435", "video": "https://iclr.cc/virtual/2022/poster/6435", "author_site": "Xiaoyang Huang, Jiancheng Yang, Yanjun Wang, Ziyu Chen, Linguo Li, Teng Li, Bingbing Ni, Wenjun Zhang", "tldr": "", "abstract": "3D shape analysis has been widely explored in the era of deep learning. Numerous models have been developed for various 3D data representation formats, e.g., MeshCNN for meshes, PointNet for point clouds and VoxNet for voxels. In this study, we present Representation-Agnostic Shape Fields (RASF), a generalizable and computation-efficient shape embedding module for 3D deep learning. 
RASF is implemented with a learnable 3D grid with multiple channels to store local geometry. Based on RASF, shape embeddings for various 3D shape representations (point clouds, meshes and voxels) are retrieved by coordinate indexing. While there are multiple ways to optimize the learnable parameters of RASF, in this paper we provide two effective schemes for RASF pre-training: shape reconstruction and normal estimation. Once trained, RASF becomes a plug-and-play performance booster with negligible cost. Extensive experiments on diverse 3D representation formats, networks and applications validate the universal effectiveness of the proposed RASF. Code and pre-trained models are publicly available\\footnote{\\url{https://github.com/seanywang0408/RASF}}.", "keywords": "shape embedding;3D deep learning;shape classification and segmentation", "primary_area": "", "supplementary_material": "/attachment/77e2838b3b479e8a98a2b8f6f40ba11760b5e4dc.zip", "author": "Xiaoyang Huang;Jiancheng Yang;Yanjun Wang;Ziyu Chen;Linguo Li;Teng Li;Bingbing Ni;Wenjun Zhang", "authorids": "~Xiaoyang_Huang1;~Jiancheng_Yang3;~Yanjun_Wang1;~Ziyu_Chen2;~Linguo_Li1;~Teng_Li2;~Bingbing_Ni3;~Wenjun_Zhang3", "gender": "M;;M;M;;M;M;M", "homepage": ";;https://github.com/WYJSJTU;https://ziyc.github.io/;http://www.google.com;https://ai.ahu.edu.cn/2021/0705/c19213a302188/page.htm;;https://ee.sjtu.edu.cn/FacultyDetail.aspx?id=14&infoid=66&flag=66", "dblp": "06/6821;;;;;09/6669-1;64/831.html;", "google_scholar": "Svw7X6kAAAAJ;;X-WP6DYAAAAJ;u9Z__t0AAAAJ;;https://scholar.google.se/citations?user=fExBNYAAAAAJ;V9W87PYAAAAJ;", "orcid": ";;;;;0000-0003-1150-088X;;", "linkedin": "%E6%99%93%E9%98%B3-%E9%BB%84-73701a185/;;;;;;;", "or_profile": "~Xiaoyang_Huang1;~Jiancheng_Yang3;~Yanjun_Wang1;~Ziyu_Chen2;~Linguo_Li1;~Teng_Li2;~Bingbing_Ni3;~Wenjun_Zhang3", "aff": "Shanghai Jiaotong University;;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Anhui University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;ahu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;;Undergrad student;Undergrad student;MS student;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2022representationagnostic,\ntitle={Representation-Agnostic Shape Fields},\nauthor={Xiaoyang Huang and Bingbing Ni and Jiancheng Yang and Yanjun Wang and Ziyu Chen and Linguo Li and Teng Li and Wenjun Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-ngwPqanCEZ}\n}", "github": "", "project": "", "reviewers": "uSYH;ESBS;quxB;Y3fs", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "151;149;98;77", "wc_summary_review": "75;69;85;36", "wc_main_review": "244;249;99;454", "wc_review": "470;467;282;567", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "890;498;99;538", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 118.75, 32.12767498590584 ], "wc_summary_review_avg": [ 66.25, 18.376275465937052 ], "wc_main_review_avg": [ 261.5, 126.41696879770532 ], "wc_review_avg": [ 446.5,
103.14189255583786 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 506.25, 280.2644956108426 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3096162575637292302&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=-ngwPqanCEZ", "email": "sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;ahu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;1;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Anhui University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.ahu.edu.cn/", "aff_unique_abbr": "SJTU;AHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "-qg9k1ftTc", "title": "S$^3$ADNet: Sequential Anomaly Detection with Pessimistic Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Anomalies are commonly found in sequential data generated by real-world applications, such as cyberattacks in network traffic and human activity changes in wearable sensors. Thanks to the development of computing technology, many impressive results have been obtained from deep learning-based anomaly detection approaches in recent years. This paper proposes a simple neural network framework for detecting anomalies in sequential data, called $S$elf-$S$upervised $S$equential $A$nomaly $D$etection $N$etwork (S$^3$ADNet). S$^3$ADNet first extracts the representations from each data point by performing feature augmentation for contrastive learning; it then captures the contextual information from the sequential data points for estimating anomaly probabilities by optimizing the context-adaptive objective. Here, we design a novel loss function based on a pessimistic policy, considering that only anomalies can affect the contextual relationships in sequences.
Our proposed method outperformed other state-of-the-art approaches on the benchmark datasets in terms of F1-score, with a more straightforward architecture.", "keywords": "deep learning;anomaly detection;unsupervised learning;sequential data", "primary_area": "", "supplementary_material": "", "author": "Quexuan Zhang;Yukio Ohsawa", "authorids": "~Quexuan_Zhang1;ohsawa@sys.t.u-tokyo.ac.jp", "gender": "M;", "homepage": ";", "dblp": "175/5342.html;", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": ";", "or_profile": "~Quexuan_Zhang1;ohsawa@sys.t.u-tokyo.ac.jp", "aff": "School of Engineering, The University of Tokyo;", "aff_domain": "t.u-tokyo.ac.jp;", "position": "Postdoc;", "bibtex": "@misc{\nzhang2022sadnet,\ntitle={S\\${\\textasciicircum}3\\${ADN}et: Sequential Anomaly Detection with Pessimistic Contrastive Learning},\nauthor={Quexuan Zhang and Yukio Ohsawa},\nyear={2022},\nurl={https://openreview.net/forum?id=-qg9k1ftTc}\n}", "github": "", "project": "", "reviewers": "Gpfn;6q4k;kP8v;KQ2g;2onX", "site": "https://openreview.net/forum?id=-qg9k1ftTc", "pdf_size": 0, "recommendation": "1;3;5;5;5", "confidence": "4;4;4;3;4", "correctness": "2;3;3;3;3", "technical_novelty": "2;2;3;3;4", "empirical_novelty": "1;2;2;3;2", "wc_summary_paper": "80;58;25;59;84", "wc_summary_review": "24;97;12;40;48", "wc_main_review": "565;263;275;327;322", "wc_review": "669;418;312;426;454", "wc_reply_reviewers": "0;57;36;0;0", "wc_reply_authors": "434;320;197;231;54", "reply_reviewers": "0;1;1;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 3.8, 1.6 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 61.2, 20.970455407548975 ], "wc_summary_review_avg": [ 44.2, 29.205478938034894 ], "wc_main_review_avg": [ 350.4, 110.2099813991455 ], "wc_review_avg": [ 455.8, 117.00666647674397 ], "wc_reply_reviewers_avg": [ 18.6, 23.728463919942225 ], "wc_reply_authors_avg": [ 247.2, 126.73026473577653 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.37500000000000006, "corr_recommendation_correctness": 0.8750000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uaLqMYS2CbsJ:scholar.google.com/&scioq=S%24%5E3%24ADNet:+Sequential+Anomaly+Detection+with+Pessimistic+Contrastive+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "School of Engineering", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "id": "-r_OrYjUMJK", "title": "Reynolds Equivariant and Invariant Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Invariant and equivariant networks are useful in learning data with symmetry, including images, sets, point clouds, and graphs. In this paper, we consider invariant and equivariant networks for symmetries of finite groups. Invariant and equivariant networks have been constructed by various researchers using Reynolds operators.
However, Reynolds operators are computationally expensive when the order of the group is large because they use the sum over the whole group, which poses an implementation difficulty. To overcome this difficulty, we consider representing the Reynolds operator as a sum over a subset instead of a sum over the whole group. We call such a subset a Reynolds design, and an operator defined by a sum over a Reynolds design a reductive Reynolds operator. For example, in the case of a graph with $n$ nodes, the computational complexity of the reductive Reynolds operator is reduced to $O(n^2)$, while the computational complexity of the Reynolds operator is $O(n!)$. We construct a learning model based on the reductive Reynolds operator and prove that it has the universal approximation property. Reynolds designs for equivariant models are derived from combinatorial observations with Young diagrams, while Reynolds designs for invariant models are derived from invariants called Reynolds dimensions defined on the set of invariant polynomials. Numerical experiments show that the performance of our models is comparable to state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1c56f484558567893b6d8a4f9a4567a74d3e11c7.zip", "author": "Akiyoshi Sannai;Makoto Kawano;Wataru Kumagai", "authorids": "~Akiyoshi_Sannai1;~Makoto_Kawano1;~Wataru_Kumagai2", "gender": "M;M;M", "homepage": "https://sites.google.com/view/akiyoshisannai/%E3%83%9B%E3%83%BC%E3%83%A0;https://www.ht.sfc.keio.ac.jp/~makora/;https://sites.google.com/site/watarukumagaiswebpage/", "dblp": "220/5533;;", "google_scholar": "https://scholar.google.com/citations?hl=ja;https://scholar.google.com/citations?hl=ja;https://scholar.google.co.jp/citations?user=rd5MEO8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Akiyoshi_Sannai1;~Makoto_Kawano1;~Wataru_Kumagai2", "aff": "RIKEN;The University of Tokyo;Omron Sinic X", "aff_domain": "riken.jp;u-tokyo.ac.jp;sinicx.com", "position": "Researcher;Postdoc;Researcher", "bibtex": "@misc{\nsannai2022reynolds,\ntitle={Reynolds Equivariant and Invariant Networks},\nauthor={Akiyoshi Sannai and Makoto Kawano and Wataru Kumagai},\nyear={2022},\nurl={https://openreview.net/forum?id=-r_OrYjUMJK}\n}", "github": "", "project": "", "reviewers": "47pD;pjj4;YnXS", "site": "https://openreview.net/forum?id=-r_OrYjUMJK", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;2;3", "correctness": "3;4;3", "technical_novelty": "3;2;4", "empirical_novelty": "1;2;2", "wc_summary_paper": "69;14;131", "wc_summary_review": "47;12;250", "wc_main_review": "339;122;184", "wc_review": "455;148;565", "wc_reply_reviewers": "90;142;0", "wc_reply_authors": "674;320;553", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 71.33333333333333, 47.79353745248642 ], "wc_summary_review_avg": [ 103.0, 104.92219339427987 ], "wc_main_review_avg": [ 215.0, 91.26152895205442 ], "wc_review_avg": [ 389.3333333333333, 176.4583677685917 ], "wc_reply_reviewers_avg": [ 77.33333333333333, 58.6590904198905 ], "wc_reply_authors_avg": [ 515.6666666666666, 146.911159851418 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ],
"replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eaa1FRDvdmkJ:scholar.google.com/&scioq=Reynolds+Equivariant+and+Invariant+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "RIKEN;University of Tokyo;OMRON Corporation", "aff_unique_dep": ";;Sinic X Division", "aff_unique_url": "https://www.riken.jp;https://www.u-tokyo.ac.jp;https://www.omron.com", "aff_unique_abbr": "RIKEN;UTokyo;Omron", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "-spj8FZD4y2", "title": "Contextual Multi-Armed Bandit with Communication Constraints", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider a remote Contextual Multi-Armed Bandit (CMAB) problem, in which the decision-maker observes the context and the reward, but must communicate the actions to be taken by the agents over a rate-limited communication channel. This can model, for example, a personalized ad placement application, where the content owner observes the individual visitors to its website, and hence has the context information, but must convey the ads that must be shown to each visitor to a separate entity that manages the marketing content. In this Rate-Constrained CMAB (RC-CMAB) problem, the constraint on the communication rate between the decision-maker and the agents imposes a trade-off between the number of bits sent per agent and the acquired average reward. We are particularly interested in the scenario in which the number of agents and the number of possible actions are large, while the communication budget is limited. Consequently, it can be considered as a policy compression problem, where the distortion metric is induced by the learning objectives. We first consider the fundamental information theoretic limits of this problem by letting the number of agents go to infinity, and study the regret that can be achieved. 
Then, we propose a practical coding scheme, and provide numerical results for the achieved regret.", "keywords": "Machine Learning;Information Theory;Multi-Armed Bandits", "primary_area": "", "supplementary_material": "", "author": "Francesco Pase;Deniz Gunduz;Michele Zorzi", "authorids": "~Francesco_Pase1;~Deniz_Gunduz1;zorzi@dei.unipd.it", "gender": "M;;", "homepage": "https://sites.google.com/view/pasefrance;https://www.imperial.ac.uk/information-processing-and-communications-lab;", "dblp": "266/9860;05/6552;", "google_scholar": "https://scholar.google.com/citations?hl=it;https://scholar.google.co.uk/citations?user=MbmKROkAAAAJ;", "orcid": "0000-0003-0116-8852;0000-0002-7725-395X;", "linkedin": "francesco-pase-9714317b/;deniz-gunduz-33b2382/;", "or_profile": "~Francesco_Pase1;~Deniz_Gunduz1;zorzi@dei.unipd.it", "aff": "Imperial College London;Imperial College London;", "aff_domain": "imperial.ac.uk;imperial.ac.uk;", "position": "Visiting PhD;Full Professor;", "bibtex": "@misc{\npase2022contextual,\ntitle={Contextual Multi-Armed Bandit with Communication Constraints},\nauthor={Francesco Pase and Deniz Gunduz and Michele Zorzi},\nyear={2022},\nurl={https://openreview.net/forum?id=-spj8FZD4y2}\n}", "github": "", "project": "", "reviewers": "CLeX;Cm2b;vEGv;mXFx", "site": "https://openreview.net/forum?id=-spj8FZD4y2", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;3;2", "correctness": "4;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "18;124;63;53", "wc_summary_review": "67;54;68;65", "wc_main_review": "215;340;216;302", "wc_review": "300;518;347;420", "wc_reply_reviewers": "0;0;0;225", "wc_reply_authors": "638;293;853;1064", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.5, 38.20013088982812 ], "wc_summary_review_avg": [ 63.5, 5.5901699437494745 ], "wc_main_review_avg": [ 268.25, 54.43516786049254 ], "wc_review_avg": [ 396.25, 82.27507216648308 ], "wc_reply_reviewers_avg": [ 56.25, 97.42785792574935 ], "wc_reply_authors_avg": [ 712.0, 284.96578742017437 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12771127639672793736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "-u8EliRNW8k", "title": "Speech-MLP: a simple MLP architecture for speech processing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Overparameterized transformer-based architectures have shown remarkable performance in recent years, achieving state-of-the-art results in speech processing tasks such as speech recognition, speech synthesis, keyword spotting, and speech enhancement.
The main assumption is that with the underlying self-attention mechanism, transformers can ultimately capture the long-range temporal dependency from speech signals. In this paper, we propose a multi-layer perceptron (MLP) architecture, namely speech-MLP, useful for extracting information from speech signals. The model splits feature channels into non-overlapped chunks and processes each chunk individually. The processed chunks are then merged and processed to consolidate the output. By setting different numbers of chunks and focusing on different contextual window sizes, speech-MLP learns multiscale local temporal dependency. The proposed model is successfully evaluated on two tasks: keyword spotting and speech enhancement. In our experiments, we use two benchmark datasets for keyword spotting (Google speech command V2-35 and LibriWords) and the VoiceBank dataset for the speech enhancement task. In all experiments, speech-MLP surpassed transformer-based solutions, achieving state-of-the-art performance with fewer parameters and simpler training schemes. Such results indicate that oftentimes more complex models such as transformers are not necessary for speech processing tasks. Hence, they should not be considered the first option, as simpler and more compact models can offer optimal performance. ", "keywords": "MLP;transformers;speech signal processing", "primary_area": "", "supplementary_material": "", "author": "Chao Xing;Dong Wang;Lirong Dai;Qun Liu;Anderson Avila", "authorids": "~Chao_Xing1;wangdong99@mails.tsinghua.edu.cn;lrdai@ustc.edu.cn;~Qun_Liu1;~Anderson_Avila1", "gender": "M;;;M;M", "homepage": ";;;http://liuquncn.github.io/;https://lacsii.com", "dblp": "87/8540;;;75/4402-1;158/4090", "google_scholar": "https://scholar.google.ca/citations?user=487syywAAAAJ;;;2HhiGzcAAAAJ;Q0hJ-hAAAAAJ", "orcid": ";;;0000-0002-7000-1792;0000-0002-3088-5116", "linkedin": ";;;qunliu/;https://ca.linkedin.com/in/andersonavila", "or_profile": "~Chao_Xing1;wangdong99@mails.tsinghua.edu.cn;lrdai@ustc.edu.cn;~Qun_Liu1;~Anderson_Avila1", "aff": ";;;Huawei Noah's Ark Lab;Institut national de la recherche scientifique", "aff_domain": ";;;huawei.com;inrs.ca", "position": ";;;Chief Scientist of Speech and Language Computing;Assistant Professor", "bibtex": "@misc{\nxing2022speechmlp,\ntitle={Speech-{MLP}: a simple {MLP} architecture for speech processing},\nauthor={Chao Xing and Dong Wang and Lirong Dai and Qun Liu and Anderson Avila},\nyear={2022},\nurl={https://openreview.net/forum?id=-u8EliRNW8k}\n}", "github": "", "project": "", "reviewers": "EFsM;Qsws;cq7P;b8HX;zWqe", "site": "https://openreview.net/forum?id=-u8EliRNW8k", "pdf_size": 0, "recommendation": "3;5;5;5;8", "confidence": "4;4;3;4;4", "correctness": "3;3;3;3;4", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "0;2;2;3;3", "wc_summary_paper": "55;109;261;218;52", "wc_summary_review": "35;87;120;79;29", "wc_main_review": "347;617;382;115;152", "wc_review": "437;813;763;412;233", "wc_reply_reviewers": "0;0;0;0;45", "wc_reply_authors": "829;1272;781;600;262", "reply_reviewers": "0;0;0;0;1", "reply_authors": "2;3;2;1;1", "recommendation_avg": [ 5.2, 1.6 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 139.0, 85.61541917201598 ], "wc_summary_review_avg": [ 70.0, 33.988233257996804 ], "wc_main_review_avg": [ 322.6, 180.5531500694463 ],
"wc_review_avg": [ 531.6, 221.42592440814153 ], "wc_reply_reviewers_avg": [ 9.0, 18.0 ], "wc_reply_authors_avg": [ 748.8, 328.73174474029736 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.06250000000000001, "corr_recommendation_correctness": 0.875, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6358910007719552786&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Huawei;Institut National de la Recherche Scientifique", "aff_unique_dep": "Noah's Ark Lab;", "aff_unique_url": "https://www.huawei.com;https://www.inrs.ca", "aff_unique_abbr": "Huawei;INRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Canada" }, { "id": "-uPIaaZdMLF", "title": "Attentional meta-learners for few-shot polythetic classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Polythetic classifications, based on shared patterns of features that need neither be universal nor constant among members of a class, are common in the natural world and greatly outnumber monothetic classifications over a set of features. We show that threshold meta-learners, such as Prototypical Networks, require an embedding dimension that is exponential in the number of features to emulate these functions. In contrast, attentional classifiers, such as Matching Networks, are polythetic by default and able to solve these problems with a linear embedding dimension. However, we find that in the presence of task-irrelevant features, inherent to meta-learning problems, attentional models are susceptible to misclassification. To address this challenge, we propose a self-attention feature-selection mechanism that adaptively dilutes non-discriminative features. 
We demonstrate the effectiveness of our approach in meta-learning Boolean functions, and synthetic and real-world few-shot learning tasks.", "keywords": "Meta-learning;self-attention;feature-selection", "primary_area": "", "supplementary_material": "/attachment/73a87170b1693bdc5e18179a4f12974c1773a757.zip", "author": "Ben Day;Ramon Vi\u00f1as Torn\u00e9;Nikola Simidjievski;Pietro Lio", "authorids": "~Ben_Day1;~Ramon_Vi\u00f1as_Torn\u00e91;~Nikola_Simidjievski1;~Pietro_Lio1", "gender": ";Unspecified;M;M", "homepage": ";https://simidjievskin.github.io/;https://www.cst.cam.ac.uk/people/pl219;", "dblp": "217/4944;;l/PietroLio.html;228/8253", "google_scholar": "5Em0-BAAAAAJ;;https://scholar.google.co.uk/citations?user=3YrWf7EAAAAJ;MbTLV4wAAAAJ", "orcid": "0000-0003-2411-4478;;0000-0002-0540-5053;", "linkedin": "ramon-vinas/;;;benday50", "or_profile": "~Ramon_Vi\u00f1as_Torn\u00e91;~Nikola_Simidjievski1;~Pietro_Lio1;~Benjamin_Day1", "aff": "University of Cambridge;University of Cambridge;University of Cambridge;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk;cam.ac.uk;cam.ac.uk", "position": "PhD student;Principal Researcher;Full Professor;PhD student", "bibtex": "@misc{\nday2022attentional,\ntitle={Attentional meta-learners for few-shot polythetic classification},\nauthor={Ben Day and Ramon Vi{\\~n}as Torn{\\'e} and Nikola Simidjievski and Pietro Lio},\nyear={2022},\nurl={https://openreview.net/forum?id=-uPIaaZdMLF}\n}", "github": "", "project": "", "reviewers": "Hoh6;q98z;f8L4;PpwW", "site": "https://openreview.net/forum?id=-uPIaaZdMLF", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "41;89;79;49", "wc_summary_review": "14;46;156;50", "wc_main_review": "155;446;722;299", "wc_review": "210;581;957;398", "wc_reply_reviewers": "20;273;385;0", "wc_reply_authors": "685;673;1216;612", "reply_reviewers": "1;1;1;0", "reply_authors": "2;2;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 64.5, 20.0187412191676 ], "wc_summary_review_avg": [ 66.5, 53.52335938634644 ], "wc_main_review_avg": [ 405.5, 209.70515015134941 ], "wc_review_avg": [ 536.5, 275.94609980936497 ], "wc_reply_reviewers_avg": [ 169.5, 164.4939208603163 ], "wc_reply_authors_avg": [ 796.5, 243.77499871808018 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5360824455580624680&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "-uZp67PZ7p", "title": "Multi-Agent Reinforcement Learning with Shared Resource in Inventory Management", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the inventory management (IM) problem for a single store with a large number of SKUs (stock keeping
units), where we need to make replenishment decisions for each SKU to balance its supply and demand. The SKUs must cooperate with one another to maximize profits, while also competing for shared resources, e.g., warehouse space and budget. This co-existence of cooperative and competitive behaviors makes IM a complicated game; hence, IM can be naturally modelled as a multi-agent reinforcement learning (MARL) problem. In the IM problem, we find that agents interact with each other only indirectly, through shared resources such as warehouse space. To formally model MARL problems with this structure, we propose the shared-resource stochastic game, along with an efficient algorithm to learn policies, particularly for a large number of agents. By leveraging the shared-resource structure, our method can greatly reduce model complexity and accelerate the learning procedure compared with standard MARL algorithms, as shown by extensive experiments.", "keywords": "Multi-Agent Reinforcement Learning;Inventory Management;Shared Resource;Decentralized Training Paradigm;Model-based RL", "primary_area": "", "supplementary_material": "", "author": "Mingxiao Feng;Guozi Liu;Li Zhao;Lei Song;Jiang Bian;Tao Qin;Wengang Zhou;Houqiang Li;Tie-Yan Liu", "authorids": "~Mingxiao_Feng1;~Guozi_Liu1;~Li_Zhao1;~Lei_Song3;~Jiang_Bian1;~Tao_Qin1;~Wengang_Zhou1;~Houqiang_Li1;~Tie-Yan_Liu1", "gender": ";;F;M;M;M;M;M;M", "homepage": "https://fmxfranky.github.io/;;https://www.microsoft.com/en-us/research/people/lizo/;;https://sites.google.com/view/jiangbian;https://www.microsoft.com/en-us/research/people/taoqin/;http://staff.ustc.edu.cn/~zhwg/index.html;https://staff.ustc.edu.cn/~lihq/;http://member.acm.org/~tieyanliu", "dblp": ";;97/4708-7;76/893-1.html;09/851-2.html;14/6841;22/4544-1;59/7017.html;l/TieYanLiu", "google_scholar": ";;b-LJkLQAAAAJ;pXDSOocAAAAJ;pZBEnY8AAAAJ;Bl4SRU0AAAAJ;8s1JF8YAAAAJ;7sFMIKoAAAAJ;Nh832fgAAAAJ", "orcid": ";;;;0000-0002-9472-600X;;0000-0003-1690-9836;0000-0003-2188-3028;0000-0002-0476-8020", "linkedin": ";guozi-liu-939a30221/;;;jbian/;;;;", "or_profile": "~Mingxiao_Feng1;~Guozi_Liu1;~Li_Zhao1;~Lei_Song3;~Jiang_Bian1;~Tao_Qin1;~Wengang_Zhou1;~Houqiang_Li1;~Tie-Yan_Liu1", "aff": "University of Science and Technology of China;Automation, Tsinghua University, Tsinghua University;Microsoft;Microsoft;Microsoft;Microsoft Research Asia;University of Science and Technology of China;University of Science and Technology of China;Microsoft", "aff_domain": "ustc.edu.cn;mails.tsinghua.edu.cn;microsoft.com;microsoft.com;microsoft.com;microsoft.com;ustc.edu.cn;ustc.edu.cn;microsoft.com", "position": "PhD student;Undergrad student;Researcher;Principal Researcher;Partner Research Manager;Principal Researcher;Full Professor;Professor;Distinguished Scientist", "bibtex": "@misc{\nfeng2022multiagent,\ntitle={Multi-Agent Reinforcement Learning with Shared Resource in Inventory Management},\nauthor={Mingxiao Feng and Guozi Liu and Li Zhao and Lei Song and Jiang Bian and Tao Qin and Wengang Zhou and Houqiang Li and Tie-Yan Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=-uZp67PZ7p}\n}", "github": "", "project": "", "reviewers": "yMmz;Uk4L;xoVm;XZpy", "site": "https://openreview.net/forum?id=-uZp67PZ7p", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "136;169;84;20", "wc_summary_review": "81;14;208;52", "wc_main_review": "376;571;423;249", "wc_review": "593;754;715;321",
"wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1142;806;1019;883", "reply_reviewers": "0;0;0;0", "reply_authors": "3;2;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.25, 56.331052715176554 ], "wc_summary_review_avg": [ 88.75, 72.83328565978607 ], "wc_main_review_avg": [ 404.75, 115.17025440624849 ], "wc_review_avg": [ 595.75, 169.38030434498575 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 962.5, 128.6710923245777 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4861683893384653399&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;2;2;2;0;0;2", "aff_unique_norm": "University of Science and Technology of China;Tsinghua University;Microsoft", "aff_unique_dep": ";Automation;Microsoft Corporation", "aff_unique_url": "http://www.ustc.edu.cn;https://www.tsinghua.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "USTC;THU;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;1;1;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "GeneDisco: A Benchmark for Experimental Design in Drug Discovery", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6889", "id": "-w2oomO6qgc", "poster": "", "openreview": "https://openreview.net/forum?id=-w2oomO6qgc", "slides": "https://iclr.cc/virtual/2022/poster/6889", "video": "https://iclr.cc/virtual/2022/poster/6889", "author_site": "Arash Mehrjou, Ashkan Soleymani, Andrew Jesson, Pascal Notin, Yarin Gal, Stefan Bauer, Patrick Schwab", "tldr": "", "abstract": "In vitro cellular experimentation with genetic interventions, using for example CRISPR technologies, is an essential step in early-stage drug discovery and target validation that serves to assess initial hypotheses about causal associations between biological mechanisms and disease pathologies. With billions of potential hypotheses to test, the experimental design space for in vitro genetic experiments is extremely vast, and the available experimental capacity - even at the largest research institutions in the world - pales in relation to the size of this biological hypothesis space. Machine learning methods, such as active and reinforcement learning, could aid in optimally exploring the vast biological space by integrating prior knowledge from various information sources as well as extrapolating to yet unexplored areas of the experimental design space based on available data. However, there exist no standardised benchmarks and data sets for this challenging task and little research has been conducted in this area to date. Here, we introduce GeneDisco, a benchmark suite for evaluating active learning algorithms for experimental design in drug discovery. 
GeneDisco contains a curated set of multiple publicly available experimental data sets as well as open-source implementations of state-of-the-art active learning policies for experimental design and exploration.", "keywords": "batch active learning;drug discovery;benchmark", "primary_area": "", "supplementary_material": "/attachment/fb40e81982ddd0cf54a44b00407c812f769a14db.zip", "author": "Arash Mehrjou;Ashkan Soleymani;Andrew Jesson;Pascal Notin;Yarin Gal;Stefan Bauer;Patrick Schwab", "authorids": "~Arash_Mehrjou1;~Ashkan_Soleymani1;~Andrew_Jesson1;~Pascal_Notin1;~Yarin_Gal1;~Stefan_Bauer1;~Patrick_Schwab1", "gender": "M;M;M;;;;M", "homepage": "https://distantvantagepoint.com;https://ashkansoleymani.lids.mit.edu/;https://oatml.cs.ox.ac.uk/members/andrew_jesson/;http://www.cs.ox.ac.uk/people/yarin.gal/website//;https://cifar.ca/bios/stefan-bauer/;http://schwabpatrick.com;https://www.pascalnotin.com", "dblp": "174/1295;270/3353.html;;67/9076;;152/9378;270/9032", "google_scholar": "pnypNygAAAAJ;omHTV3MAAAAJ;ElJ_fC4AAAAJ;https://scholar.google.co.uk/citations?user=SIayDoQAAAAJ;O-oICE8AAAAJ;https://scholar.google.at/citations?hl=de;soxv0s0AAAAJ", "orcid": "0000-0002-3832-7784;;;;;;0000-0002-1877-8983", "linkedin": "arash-mehrjou/;;;;;;", "or_profile": "~Arash_Mehrjou1;~Ashkan_Soleymani1;~Andrew_Jesson1;~Yarin_Gal1;~Stefan_Bauer1;~Patrick_Schwab1;~Pascal_M_Notin1", "aff": "GlaxoSmithKlein;Max Planck Institute for Intelligent Systems;Department of Computer Science, University of Oxford;University of Oxford;KTH Royal Institute of Technology;GlaxoSmithKline plc;Department of Computer Science, University of Oxford", "aff_domain": "gsk.ai;tuebingen.mpg.de;cs.ox.ac.uk;ox.ac.uk;kth.se;gsk.com;cs.ox.ac.uk", "position": "Researcher;Research Assistant;PhD student;Associate Professor;Assistant Professor;Director;PhD student", "bibtex": "@inproceedings{\nmehrjou2022genedisco,\ntitle={GeneDisco: A Benchmark for Experimental Design in Drug Discovery},\nauthor={Arash Mehrjou and Ashkan Soleymani and Andrew Jesson and Pascal Notin and Yarin Gal and Stefan Bauer and Patrick Schwab},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=-w2oomO6qgc}\n}", "github": "", "project": "", "reviewers": "6EvA;ozjG;NhJz", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "1;3;2", "empirical_novelty": "3;4;2", "wc_summary_paper": "89;270;143", "wc_summary_review": "98;79;31", "wc_main_review": "145;319;108", "wc_review": "332;668;282", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "393;985;539", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 167.33333333333334, 75.86977146546721 ], "wc_summary_review_avg": [ 69.33333333333333, 28.193773938387338 ], "wc_main_review_avg": [ 190.66666666666666, 91.99396115446322 ], "wc_review_avg": [ 427.3333333333333, 171.39687537667916 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 639.0, 251.8147467220033 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 27, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=10686323109700882145&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=-w2oomO6qgc", "email": "gsk.ai;tuebingen.mpg.de;cs.ox.ac.uk;ox.ac.uk;kth.se;gsk.com;cs.ox.ac.uk", "author_num": 7, "aff_unique_index": "0;1;2;2;3;0;2", "aff_unique_norm": "GlaxoSmithKline;Max Planck Institute for Intelligent Systems;University of Oxford;KTH Royal Institute of Technology", "aff_unique_dep": ";Intelligent Systems;Department of Computer Science;", "aff_unique_url": "https://www.gsk.com;https://www.mpi-is.mpg.de;https://www.ox.ac.uk;https://www.kth.se", "aff_unique_abbr": "GSK;MPI-IS;Oxford;KTH", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;1;0;0;2;0;0", "aff_country_unique": "United Kingdom;Germany;Sweden" }, { "id": "-xhk0O7iAc0", "title": "A Topological View of Rule Learning in Knowledge Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Inductive relation prediction is an important learning task for knowledge graph completion. One can use the existence of rules, namely a sequence of relations, to predict the relation between two entities. Previous works view rules as paths and primarily focus on the searching of paths between entities. The space of paths is huge, and one has to sacrifice either efficiency or accuracy. In this paper, we consider rules in knowledge graphs as cycles and show that the space of cycles has a unique structure based on the theory of algebraic topology. By exploring the linear structure of the cycle space, we can improve the searching efficiency of rules. We propose to collect cycle bases that span the space of cycles. We build a novel GNN framework on the collected cycles to learn the representations of cycles, and to predict the existence/non-existence of a relation. 
Our method achieves state-of-the-art performance on benchmarks.", "keywords": "Inductive Relation Prediction;Topological Data Analysis;Cycle Basis;Homology", "primary_area": "", "supplementary_material": "", "author": "Zuoyu Yan;Tengfei Ma;Liangcai Gao;Zhi Tang;Chao Chen", "authorids": "~Zuoyu_Yan1;~Tengfei_Ma1;~Liangcai_Gao2;~Zhi_Tang2;~Chao_Chen1", "gender": "M;M;M;M;M", "homepage": "https://pkuyzy.github.io/;https://sites.google.com/site/matf0123/;;https://www.wict.pku.edu.cn/cpdp/kydw/ggcy/1297369.htm;https://chaochen.github.io/", "dblp": "203/8184;94/9023-1;23/7062;16/4222-1;66/3019-12", "google_scholar": "d-Ch_PgAAAAJ;9OvNakkAAAAJ;;https://scholar.google.com/citations?hl=en;J-iIIFAAAAAJ", "orcid": ";0000-0002-1086-529X;;0000-0002-6021-8357;0000-0003-1703-6483", "linkedin": ";;;;", "or_profile": "~Zuoyu_Yan1;~Tengfei_Ma1;~Liangcai_Gao2;~Zhi_Tang2;~Chao_Chen1", "aff": "Peking University;International Business Machines;Peking University;Peking University;State University of New York, Stony Brook", "aff_domain": "pku.edu.cn;ibm.com;pku.edu.cn;pku.edu.cn;stonybrook.edu", "position": "PhD student;Researcher;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nyan2022a,\ntitle={A Topological View of Rule Learning in Knowledge Graphs},\nauthor={Zuoyu Yan and Tengfei Ma and Liangcai Gao and Zhi Tang and Chao Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=-xhk0O7iAc0}\n}", "github": "", "project": "", "reviewers": "weC8;YAZM;pNaT;vHmE", "site": "https://openreview.net/forum?id=-xhk0O7iAc0", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;4;3;4", "correctness": "2;2;4;2", "technical_novelty": "2;3;4;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "64;68;45;96", "wc_summary_review": "41;71;32;53", "wc_main_review": "559;337;150;571", "wc_review": "664;476;227;720", "wc_reply_reviewers": "331;0;61;149", "wc_reply_authors": "2233;653;282;990", "reply_reviewers": "1;0;1;1", "reply_authors": "4;1;1;2", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.25, 18.226011631731172 ], "wc_summary_review_avg": [ 49.25, 14.600941750448838 ], "wc_main_review_avg": [ 404.25, 173.8668671714079 ], "wc_review_avg": [ 521.75, 192.6841651511613 ], "wc_reply_reviewers_avg": [ 135.25, 124.81260953926089 ], "wc_reply_authors_avg": [ 1039.5, 733.1577251860612 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AHdKd8ihdTYJ:scholar.google.com/&scioq=A+Topological+View+of+Rule+Learning+in+Knowledge+Graphs&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Peking University;International Business Machines Corporation;State University of New York", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.ibm.com;https://www.stonybrook.edu", "aff_unique_abbr": "Peking U;IBM;SUNY Stony Brook", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stony Brook", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "China;United States" }, { "id": "-ybZRQktdgc", "title": "LRN: Limitless Routing 
Networks for Effective Multi-task Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-task learning (MTL) is a field concerned with learning multiple tasks simultaneously, typically through shared model parameters. The shared representation enables generalized, task-invariant parameters and assists in learning tasks with sparse data. However, unforeseen task interference can cause one task to improve to the detriment of another. A recent paradigm constructed to tackle these types of problems is the routing network, which builds neural network architectures from a set of modules conditioned on the input instance, the task, and the previous outputs of other modules. This approach has many constraints, so we propose the Limitless Routing Network (LRN), which removes these constraints through the use of a transformer-based router and a reevaluation of the state and action spaces. We also provide a simple solution to the module collapse problem and demonstrate superior accuracy over several MTL benchmarks compared to the original routing network.", "keywords": "multi-task learning;MTL;reinforcement learning;machine learning;routing networks;modular networks", "primary_area": "", "supplementary_material": "", "author": "Ryan Wickman;Xiaofei Zhang;Weizi Li", "authorids": "~Ryan_Wickman1;~Xiaofei_Zhang2;~Weizi_Li1", "gender": "M;;M", "homepage": ";http://www.cs.memphis.edu/~xzhang12/;http://weizi-li.github.io/", "dblp": ";83/4809-2;60/7775", "google_scholar": ";IBy4k-4AAAAJ;", "orcid": ";0000-0002-5605-6295;", "linkedin": "ryan-wickman-771160144/;;", "or_profile": "~Ryan_Wickman1;~Xiaofei_Zhang2;~Weizi_Li1", "aff": "University of Memphis;University of Memphis;University of Memphis", "aff_domain": "memphis.edu;memphis.edu;memphis.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nwickman2022lrn,\ntitle={{LRN}: Limitless Routing Networks for Effective Multi-task Learning},\nauthor={Ryan Wickman and Xiaofei Zhang and Weizi Li},\nyear={2022},\nurl={https://openreview.net/forum?id=-ybZRQktdgc}\n}", "github": "", "project": "", "reviewers": "ewVv;v8R3;CZQn;Xh12", "site": "https://openreview.net/forum?id=-ybZRQktdgc", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;4;4", "correctness": "3;2;4;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;1;1;2", "wc_summary_paper": "69;70;96;79", "wc_summary_review": "34;19;49;51", "wc_main_review": "304;171;176;257", "wc_review": "407;260;321;387", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 78.5, 10.828203913853857 ], "wc_summary_review_avg": [ 38.25, 12.910751333675357 ], "wc_main_review_avg": [ 227.0, 56.049085630365106 ], "wc_review_avg": [ 343.75, 57.88512330469721 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:58zeS1zLtvgJ:scholar.google.com/&scioq=LRN:+Limitless+Routing+Networks+for+Effective+Multi-task+Learning&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Memphis", "aff_unique_dep": "", "aff_unique_url": "https://www.memphis.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "00UIZu1IRU", "title": "Learning mixture of neural temporal point processes for event sequence clustering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Event sequence clustering applies to many scenarios e.g. e-Commerce and electronic health. Traditional clustering models fail to characterize complex real-world processes due to the strong parametric assumption. While Neural Temporal Point Processes (NTPPs) mainly focus on modeling similar sequences instead of clustering. To fill the gap, we propose Mixture of Neural Temporal Point Processes (NTPP-MIX), a general framework that can utilize many existing NTPPs for event sequence clustering. In NTPP-MIX, the prior distribution of coefficients for cluster assignment is modeled by a Dirichlet distribution. When the assignment is given, the conditional probability of a sequence is modeled by the mixture of series of NTPPs. We combine variational EM algorithm and Stochastic Gradient Descent (SGD) to efficiently train the framework. Moreover, to further improve its capability, we propose a fully data-driven NTPP based on the attention mechanism named Fully Attentive Temporal Point Process (FATPP). Experiments on both synthetic and real-world datasets show the effectiveness of NTPP-MIX against state-of-the-arts, especially when using FATPP as a basic NTPP module.", "keywords": "temporal point process;event sequence clustering;deep learning", "primary_area": "", "supplementary_material": "", "author": "Yunhao Zhang;Junchi Yan;Zhenyu Ren;Jian Yin", "authorids": "~Yunhao_Zhang1;~Junchi_Yan2;~Zhenyu_Ren1;~Jian_Yin2", "gender": "M;M;;M", "homepage": ";;https://www.alibaba.com;http://thinklab.sjtu.edu.cn/", "dblp": "10/2569;;;60/7949.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;ga230VoAAAAJ", "orcid": ";;;0000-0001-9639-7679", "linkedin": ";%E9%9C%87%E5%AE%87-%E4%BB%BB-3a7718213/;;", "or_profile": "~Yunhao_Zhang1;~Zhenyu_Ren1;~Jian_Yin2;~Junchi_Yan1", "aff": "Alibaba Group;;Alibaba Group;Shanghai Jiaotong University", "aff_domain": "alibaba-inc.com;;alibaba-inc.com;sjtu.edu.cn", "position": "Intern;;\u7a0b\u5e8f\u5458;Associate Professor", "bibtex": "@misc{\nzhang2022learning,\ntitle={Learning mixture of neural temporal point processes for event sequence clustering},\nauthor={Yunhao Zhang and Junchi Yan and Zhenyu Ren and Jian Yin},\nyear={2022},\nurl={https://openreview.net/forum?id=00UIZu1IRU}\n}", "github": "", "project": "", "reviewers": "bcEp;mfeP;jeQM;H6cH", "site": "https://openreview.net/forum?id=00UIZu1IRU", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "72;48;147;98", "wc_summary_review": "30;51;56;29", "wc_main_review": "359;351;602;79", "wc_review": "461;450;805;206", "wc_reply_reviewers": "0;0;227;0", "wc_reply_authors": "957;833;508;311", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], 
"correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 91.25, 36.72448093574639 ], "wc_summary_review_avg": [ 41.5, 12.134661099511597 ], "wc_main_review_avg": [ 347.75, 185.07211432303896 ], "wc_review_avg": [ 480.5, 213.28443450003564 ], "wc_reply_reviewers_avg": [ 56.75, 98.29388332953378 ], "wc_reply_authors_avg": [ 652.25, 256.32145345249586 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15432170241418817995&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1", "aff_unique_norm": "Alibaba Group;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "Alibaba;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "00Vc1Ov5KZn", "title": "Vi-MIX FOR SELF-SUPERVISED VIDEO REPRESENTATION", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive representation learning of videos highly rely on exhaustive data aug- mentation strategies. Therefore, towards designing video augmentation for self- supervised learning, we first analyze the best strategy to mix videos to create a new augmented video sample. Then, the question remains, can we make use of the other modalities in videos for data mixing? To this end, we propose Cross-Modal Manifold Cutmix (CMMC) that inserts a video tesseract into an- other video tesseract in the feature space across two different modalities. We find that our video mixing strategy: Vi-Mix, i.e. preliminary mixing of videos followed by CMMC across different modalities in a video, improves the qual- ity of learned video representations. We exhaustively conduct experiments for two downstream tasks: action recognition and video retrieval on three popular video datasets UCF101, HMDB51, and NTU-60. 
We show that the performance of Vi-Mix on both downstream tasks is on par with other self-supervised approaches while requiring less training data.", "keywords": "data augmentation;self-supervision;video representation", "primary_area": "", "supplementary_material": "", "author": "Srijan Das;Michael S Ryoo", "authorids": "~Srijan_Das1;~Michael_S_Ryoo1", "gender": "M;M", "homepage": "https://srijandas07.github.io/;http://michaelryoo.com/", "dblp": "173/0062;r/MichaelSRyoo", "google_scholar": "ZDTF5AEAAAAJ;vcw0TJIAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Srijan_Das1;~Michael_S_Ryoo1", "aff": "State University of New York, Stony Brook;Google DeepMind", "aff_domain": "stonybrook.edu;google.com", "position": "Postdoc;Research Scientist", "bibtex": "@misc{\ndas2022vimix,\ntitle={Vi-{MIX} {FOR} {SELF}-{SUPERVISED} {VIDEO} {REPRESENTATION}},\nauthor={Srijan Das and Michael S Ryoo},\nyear={2022},\nurl={https://openreview.net/forum?id=00Vc1Ov5KZn}\n}", "github": "", "project": "", "reviewers": "gf3Y;xTtr;M6Bp;jkeU", "site": "https://openreview.net/forum?id=00Vc1Ov5KZn", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;5;3;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "75;87;71;53", "wc_summary_review": "107;48;73;55", "wc_main_review": "362;404;496;280", "wc_review": "544;539;640;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.5, 12.196310917650468 ], "wc_summary_review_avg": [ 70.75, 22.829531313629722 ], "wc_main_review_avg": [ 385.5, 77.83797273824646 ], "wc_review_avg": [ 527.75, 90.16755236780024 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17797670039815614833&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "State University of New York;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.stonybrook.edu;https://deepmind.com", "aff_unique_abbr": "SUNY Stony Brook;DeepMind", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Online Hyperparameter Meta-Learning with Hypergradient Distillation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6204", "id": "01AMRlen9wJ", "poster": "", "openreview": "https://openreview.net/forum?id=01AMRlen9wJ", "slides": "https://iclr.cc/virtual/2022/poster/6204", "video": "https://iclr.cc/virtual/2022/poster/6204", "author_site": "Hae Beom Lee, Hayeon Lee, JaeWoong Shin, Eunho Yang, Timothy Hospedales, Sung Ju Hwang", "tldr": "", "abstract": "Many gradient-based meta-learning methods assume a set of parameters that do not participate in inner-optimization, which can be considered hyperparameters.
Although such hyperparameters can be optimized using the existing gradient-based hyperparameter optimization (HO) methods, they suffer from the following issues. Unrolled differentiation methods do not scale well to high-dimensional hyperparameters or horizon length, Implicit Function Theorem (IFT) based methods are restrictive for online optimization, and short horizon approximations suffer from short horizon bias. In this work, we propose a novel HO method that can overcome these limitations, by approximating the second-order term with knowledge distillation. Specifically, we parameterize a single Jacobian-vector product (JVP) for each HO step and minimize the distance from the true second-order term. Our method allows online optimization and also is scalable to the hyperparameter dimension and the horizon length. We demonstrate the effectiveness of our method on three different meta-learning methods and two benchmark datasets.", "keywords": "Hyperparameter Optimization;Meta-learning", "primary_area": "", "supplementary_material": "/attachment/ddb5f727c5a4d99a2696b10da7fb220146875b4a.zip", "author": "Hae Beom Lee;Hayeon Lee;JaeWoong Shin;Eunho Yang;Timothy Hospedales;Sung Ju Hwang", "authorids": "~Hae_Beom_Lee1;~Hayeon_Lee1;~JaeWoong_Shin1;~Eunho_Yang1;~Timothy_Hospedales1;~Sung_Ju_Hwang1", "gender": "M;F;M;M;M;", "homepage": "https://haebeom-lee.github.io;https://hayeonlee.github.io/;;https://sites.google.com/site/hleehome2/;http://homepages.inf.ed.ac.uk/thospeda/;", "dblp": "326/7260;246/4987;267/5672;96/2621;32/3545;", "google_scholar": ";5DaLgBUAAAAJ;i_o_95kAAAAJ;;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ;", "orcid": ";;;;0000-0003-4867-7486;", "linkedin": ";;%EC%9E%AC%EC%9B%85-%EC%8B%A0-88662220a/;;timothyhospedales/;", "or_profile": "~Hae_Beom_Lee1;~Hayeon_Lee1;~JaeWoong_Shin1;~Eunho_Yang1;~Timothy_Hospedales1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Lunit Inc.;Korea Advanced Institute of Science & Technology;Samsung AI Research Centre;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;lunit.io;kaist.ac.kr;samsung.com;", "position": "PhD student;PhD student;Researcher;Associate Professor;Principal Researcher;", "bibtex": "@inproceedings{\nlee2022online,\ntitle={Online Hyperparameter Meta-Learning with Hypergradient Distillation},\nauthor={Hae Beom Lee and Hayeon Lee and JaeWoong Shin and Eunho Yang and Timothy Hospedales and Sung Ju Hwang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=01AMRlen9wJ}\n}", "github": "", "project": "", "reviewers": "GYz2;w7Q6;G8V6;zUNV", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;2;3", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "60;42;105;89", "wc_summary_review": "26;20;56;33", "wc_main_review": "61;144;322;284", "wc_review": "147;206;483;406", "wc_reply_reviewers": "0;0;0;25", "wc_reply_authors": "478;701;408;559", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 74.0, 24.525496936861444 ], "wc_summary_review_avg": [ 33.75, 13.645054048995188 ], "wc_main_review_avg": [ 202.75, 105.31708076090982 ], "wc_review_avg": [ 310.5, 138.31937680599924 ], 
"wc_reply_reviewers_avg": [ 6.25, 10.825317547305483 ], "wc_reply_authors_avg": [ 536.5, 108.973620661149 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4886164047549260989&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=01AMRlen9wJ", "email": "kaist.ac.kr;kaist.ac.kr;lunit.io;kaist.ac.kr;samsung.com;", "author_num": 6, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Lunit Inc.;Samsung", "aff_unique_dep": ";;AI Research", "aff_unique_url": "https://www.kaist.ac.kr;https://www.lunit.io;https://www.samsung.com/global/researchers/samsung-ai-research-centre/", "aff_unique_abbr": "KAIST;Lunit;SARC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "01CDUB3v6H", "title": "LARGE: Latent-Based Regression through GAN Semantics", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a novel method for solving regression tasks using few-shot or weak supervision. At the core of our method is the fundamental observation that GANs are incredibly successful at encoding semantic information within their latent space, even in a completely unsupervised setting. For modern generative frameworks, this semantic encoding manifests as smooth, linear directions which affect image attributes in a disentangled manner. These directions have been widely used in GAN-based image editing.\nWe show that such directions are not only linear, but that the magnitude of change induced on the respective attribute is approximately linear with respect to the distance traveled along them. By leveraging this observation, our method turns a pre-trained GAN into a regression model, using as few as two labeled samples. This enables solving regression tasks on datasets and attributes which are difficult to produce quality supervision for. Additionally, we show that the same latent-distances can be used to sort collections of images by the strength of given attributes, even in the absence of explicit supervision. 
\nExtensive experimental evaluations demonstrate that our method can be applied across a wide range of domains, leverage multiple latent direction discovery frameworks, and achieve state-of-the-art results in few-shot and low-supervision settings, even when compared to methods designed to tackle a single task.", "keywords": "GAN;Latent Space;Latent semantics;Regression;Few-Shot", "primary_area": "", "supplementary_material": "/attachment/4c47b1e18cc8a0ee00825394bd69cbdb8ea1f7b9.zip", "author": "Yotam Nitzan;Rinon Gal;Ofir Brenner;Daniel Cohen-or", "authorids": "~Yotam_Nitzan1;~Rinon_Gal1;~Ofir_Brenner1;~Daniel_Cohen-or2", "gender": "M;;M;", "homepage": "https://yotamnitzan.github.io/;;;", "dblp": "265/5979;;;", "google_scholar": "pTUX5wEAAAAJ;;iLLlWr8AAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yotam_Nitzan1;~Rinon_Gal1;~Ofir_Brenner1;~Daniel_Cohen-or2", "aff": "Tel Aviv University;;;", "aff_domain": "tau.ac.il;;;", "position": "PhD student;;;", "bibtex": "@misc{\nnitzan2022large,\ntitle={{LARGE}: Latent-Based Regression through {GAN} Semantics},\nauthor={Yotam Nitzan and Rinon Gal and Ofir Brenner and Daniel Cohen-or},\nyear={2022},\nurl={https://openreview.net/forum?id=01CDUB3v6H}\n}", "github": "", "project": "", "reviewers": "rXQu;p4Hs;n86w;g5a9", "site": "https://openreview.net/forum?id=01CDUB3v6H", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "113;168;106;97", "wc_summary_review": "83;102;20;41", "wc_main_review": "682;526;213;108", "wc_review": "878;796;339;246", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 121.0, 27.721832551258224 ], "wc_summary_review_avg": [ 61.5, 32.57683225852385 ], "wc_main_review_avg": [ 382.25, 231.50202482915782 ], "wc_review_avg": [ 564.75, 275.75657290443684 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18373168955695458400&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "title": "Crystal Diffusion Variational Autoencoder for Periodic Material Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7063", "id": "03RLpj-tc_", "poster": "", "openreview": "https://openreview.net/forum?id=03RLpj-tc_", "slides": "https://iclr.cc/virtual/2022/poster/7063", "video": "https://iclr.cc/virtual/2022/poster/7063", "author_site": "Tian Xie, Xiang Fu, Octavian Ganea, Regina Barzilay, Tommi Jaakkola", "tldr": "", "abstract": "Generating the periodic structure of stable materials is a long-standing challenge for the material design community. 
This task is difficult because stable materials only exist in a low-dimensional subspace of all possible periodic arrangements of atoms: 1) the coordinates must lie in the local energy minimum defined by quantum mechanics, and 2) global stability also requires the structure to follow the complex, yet specific bonding preferences between different atom types. Existing methods fail to incorporate these factors and often lack proper invariances. We propose a Crystal Diffusion Variational Autoencoder (CDVAE) that captures the physical inductive bias of material stability. By learning from the data distribution of stable materials, the decoder generates materials in a diffusion process that moves atomic coordinates towards a lower energy state and updates atom types to satisfy bonding preferences between neighbors. Our model also explicitly encodes interactions across periodic boundaries and respects permutation, translation, rotation, and periodic invariances. We significantly outperform past methods in three tasks: 1) reconstructing the input structure, 2) generating valid, diverse, and realistic materials, and 3) generating materials that optimize a specific property. We also provide several standard datasets and evaluation metrics for the broader machine learning community.", "keywords": "materials;graph neural networks;periodic;diffusion models;score matching;molecule;3D;generative", "primary_area": "", "supplementary_material": "", "author": "Tian Xie;Xiang Fu;Octavian-Eugen Ganea;Regina Barzilay;Tommi S. Jaakkola", "authorids": "~Tian_Xie2;~Xiang_Fu4;~Octavian-Eugen_Ganea1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "gender": "M;M;;female;", "homepage": "http://www.txie.me;https://xiangfu.co/;;https://www.regina.csail.mit.edu/;", "dblp": ";97/374-5.html;;b/ReginaBarzilay;", "google_scholar": "xFbOAf8AAAAJ;https://scholar.google.com/citations?view_op=list_works;;;", "orcid": ";;;;", "linkedin": "txie-93/;;;;", "or_profile": "~Tian_Xie2;~Xiang_Fu4;~Octavian-Eugen_Ganea1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;;Massachusetts Institute of Technology;", "aff_domain": "mit.edu;mit.edu;;mit.edu;", "position": "Postdoc;PhD student;;Professor;", "bibtex": "@inproceedings{\nxie2022crystal,\ntitle={Crystal Diffusion Variational Autoencoder for Periodic Material Generation},\nauthor={Tian Xie and Xiang Fu and Octavian-Eugen Ganea and Regina Barzilay and Tommi S. 
Jaakkola},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=03RLpj-tc_}\n}", "github": "", "project": "", "reviewers": "vK4U;Fg5a;oZsg;5P5h", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;5;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "61;44;61;36", "wc_summary_review": "95;71;28;15", "wc_main_review": "1066;257;252;129", "wc_review": "1222;372;341;180", "wc_reply_reviewers": "0;122;32;20", "wc_reply_authors": "2307;1114;332;236", "reply_reviewers": "0;1;1;1", "reply_authors": "4;2;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 50.5, 10.874281585465774 ], "wc_summary_review_avg": [ 52.25, 32.22867512014727 ], "wc_main_review_avg": [ 426.0, 373.04356313974915 ], "wc_review_avg": [ 528.75, 406.83004743995986 ], "wc_reply_reviewers_avg": [ 43.5, 46.74130935264865 ], "wc_reply_authors_avg": [ 997.25, 829.3272499441943 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.16012815380508713, "corr_recommendation_correctness": 0.8320502943378437, "gs_citation": 315, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10416305679920850993&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=03RLpj-tc_", "email": "mit.edu;mit.edu;;mit.edu;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Finite-Time Convergence and Sample Complexity of Multi-Agent Actor-Critic Reinforcement Learning with Average Reward", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6851", "id": "04pGUg0-pdZ", "poster": "", "openreview": "https://openreview.net/forum?id=04pGUg0-pdZ", "slides": "https://iclr.cc/virtual/2022/poster/6851", "video": "https://iclr.cc/virtual/2022/poster/6851", "author_site": "FNU Hairi, Jia Liu, Songtao Lu", "tldr": "", "abstract": "In this paper, we establish the first finite-time convergence result of the actor-critic algorithm for fully decentralized multi-agent reinforcement learning (MARL) problems with average reward. \nIn this problem, a set of $N$ agents work cooperatively to maximize the global average reward through interacting with their neighbors over a communication network.\nWe consider a practical MARL setting, where the rewards and actions of each agent are only known to itself, and the knowledge of joint actions of the agents is not assumed. \nToward this end, we propose a mini-batch Markovian sampled fully decentralized actor-critic algorithm and analyze its finite-time convergence and sample complexity.\nWe show that the sample complexity of this algorithm is $\\mathcal{O}(N^{2}/\\epsilon^{2}\\log(N/\\epsilon))$.\nInterestingly, this sample complexity bound matches that of the state-of-the-art single-agent actor-critic algorithms for reinforcement learning. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "FNU Hairi;Jia Liu;Songtao Lu", "authorids": "~FNU_Hairi1;~Jia_Liu1;~Songtao_Lu1", "gender": ";M;M", "homepage": ";https://kevinliu-osu.github.io/index.html;https://songtaogithub.github.io/", "dblp": ";;05/2887", "google_scholar": ";Ofx3dScAAAAJ;LRsjX7kAAAAJ", "orcid": "0000-0001-7457-9893;;", "linkedin": ";;", "or_profile": "~FNU_Hairi1;~Jia_Liu1;~Songtao_Lu1", "aff": "Ohio State University;The Ohio State University;IBM Thomas J. Watson Research Center", "aff_domain": "osu.edu;osu.edu;ibm.com", "position": "Postdoc;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nhairi2022finitetime,\ntitle={Finite-Time Convergence and Sample Complexity of Multi-Agent Actor-Critic Reinforcement Learning with Average Reward},\nauthor={FNU Hairi and Jia Liu and Songtao Lu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=04pGUg0-pdZ}\n}", "github": "", "project": "", "reviewers": "AYYd;avem;v83u;N673;F2da", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "3;3;4;3;4", "correctness": "4;3;3;4;4", "technical_novelty": "3;2;3;3;3", "empirical_novelty": "2;1;2;2;3", "wc_summary_paper": "30;66;61;105;126", "wc_summary_review": "24;54;46;26;43", "wc_main_review": "239;210;457;196;633", "wc_review": "293;330;564;327;802", "wc_reply_reviewers": "19;22;14;16;126", "wc_reply_authors": "1530;1231;1728;848;1690", "reply_reviewers": "1;1;1;1;1", "reply_authors": "2;2;3;2;3", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 77.6, 33.96822044205436 ], "wc_summary_review_avg": [ 38.6, 11.689311356961968 ], "wc_main_review_avg": [ 347.0, 171.54008277950666 ], "wc_review_avg": [ 463.2, 195.0419442068808 ], "wc_reply_reviewers_avg": [ 39.4, 43.384789961459994 ], "wc_reply_authors_avg": [ 1405.4, 329.15139373850445 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.4, 0.4898979485566356 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.16666666666666663, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=04pGUg0-pdZ", "email": "osu.edu;osu.edu;ibm.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Ohio State University;IBM", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.osu.edu;https://www.ibm.com/research", "aff_unique_abbr": "OSU;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Yorktown Heights", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "057dxuWpfx", "title": "Shaped Rewards Bias Emergent Language", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the primary characteristics of emergent phenomena is that they are determined by the basic properties of the system whence they emerge as opposed to explicitly designed constraints. Reinforcement learning is often used to elicit such phenomena which specifically arise from the pressure to maximize reward. We distinguish two types of rewards. The first is the base reward which is motivated directly by the task being solved. 
The second is shaped rewards which are designed specifically to make the task easier to learn by introducing biases in the learning process. The inductive bias which reward shaping introduces is problematic for emergent language experimentation because it biases the object of study: the emergent language. The fact that shaped rewards are intentionally designed conflicts with the basic premise of emergent phenomena arising from basic principles. In this paper, we use a simple sender-receiver navigation game to demonstrate how reward shaping can 1) explicitly bias the semantics of the learned language, 2) significantly change the entropy of the learned communication, and 3) mask the potential effects of other environmental variables of interest.", "keywords": "emergent language;reinforcement learning;neural networks", "primary_area": "", "supplementary_material": "/attachment/a8b92a2b39965c129cfeb3f65246c0868e7d1c51.zip", "author": "Brendon Boldt;Yonatan Bisk;David R Mortensen", "authorids": "~Brendon_Boldt1;~Yonatan_Bisk1;~David_R_Mortensen1", "gender": "M;M;M", "homepage": "http://brendonjboldt.xyz/;http://www.YonatanBisk.com;http://www.cs.cmu.edu/~dmortens/", "dblp": "207/4805;38/9282;180/5443", "google_scholar": "QEXlK3AAAAAJ;bWoGh8UAAAAJ;https://scholar.google.com/citations?authuser=1", "orcid": "0000-0002-5599-5581;0000-0002-2111-9081;0000-0002-3927-6851", "linkedin": ";yonatanbisk/;davidrmortensen/", "or_profile": "~Brendon_Boldt1;~Yonatan_Bisk1;~David_R_Mortensen1", "aff": "School of Computer Science, Carnegie Mellon University;Meta;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;meta.com;cmu.edu", "position": "PhD student;Visiting Professor;Systems Scientist", "bibtex": "@misc{\nboldt2022shaped,\ntitle={Shaped Rewards Bias Emergent Language},\nauthor={Brendon Boldt and Yonatan Bisk and David R Mortensen},\nyear={2022},\nurl={https://openreview.net/forum?id=057dxuWpfx}\n}", "github": "", "project": "", "reviewers": "5smG;2p5d;oZZ8;jcC3;uFB4", "site": "https://openreview.net/forum?id=057dxuWpfx", "pdf_size": 0, "recommendation": "1;3;3;3;6", "confidence": "4;4;3;3;4", "correctness": "2;3;3;3;3", "technical_novelty": "1;2;2;2;4", "empirical_novelty": "1;2;2;2;4", "wc_summary_paper": "59;265;119;83;285", "wc_summary_review": "54;2;251;56;46", "wc_main_review": "249;1520;130;234;473", "wc_review": "362;1787;500;373;804", "wc_reply_reviewers": "276;577;71;0;474", "wc_reply_authors": "690;2186;263;861;314", "reply_reviewers": "1;1;1;0;2", "reply_authors": "2;4;1;2;2", "recommendation_avg": [ 3.2, 1.6 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.9797958971132712 ], "empirical_novelty_avg": [ 2.2, 0.9797958971132712 ], "wc_summary_paper_avg": [ 162.2, 94.27279565176796 ], "wc_summary_review_avg": [ 81.8, 86.85251867389914 ], "wc_main_review_avg": [ 521.2, 511.7692448750706 ], "wc_review_avg": [ 765.2, 535.2201042561835 ], "wc_reply_reviewers_avg": [ 279.6, 222.68417096866136 ], "wc_reply_authors_avg": [ 862.8, 698.8079564515562 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 2.2, 0.9797958971132712 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.10206207261596574, "corr_recommendation_correctness": 0.6875000000000002, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HPOqAxzVvHEJ:scholar.google.com/&scioq=Shaped+Rewards+Bias+Emergent+Language&hl=en&as_sdt=0,33", "gs_version_total": 0, 
"aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Meta", "aff_unique_dep": "School of Computer Science;Meta Platforms, Inc.", "aff_unique_url": "https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "CMU;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Scenario Representation for Solving Two-stage Stochastic Integer Programs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7065", "id": "06Wy2BtxXrz", "poster": "", "openreview": "https://openreview.net/forum?id=06Wy2BtxXrz", "slides": "https://iclr.cc/virtual/2022/poster/7065", "video": "https://iclr.cc/virtual/2022/poster/7065", "author_site": "Yaoxin Wu, Wen Song, Zhiguang Cao, Jie Zhang", "tldr": "", "abstract": "Many practical combinatorial optimization problems under uncertainty can be modeled as stochastic integer programs (SIPs), which are extremely challenging to solve due to the high complexity. To solve two-stage SIPs efficiently, we propose a conditional variational autoencoder (CVAE) based method to learn scenario representation for a class of SIP instances. Specifically, we design a graph convolutional network based encoder to embed each scenario with the deterministic part of its instance (i.e. context) into a low-dimensional latent space, from which a decoder reconstructs the scenario from its latent representation conditioned on the context. Such a design effectively captures the dependencies of the scenarios on their corresponding instances. We apply the trained encoder to two tasks in typical SIP solving, i.e. scenario reduction and objective prediction. Experiments on two SIP problems show that the learned latent representation significantly boosts the solving performance to attain high-quality solutions in short computational time, and generalizes fairly well to problems of larger sizes or with more scenarios.", "keywords": "Conditional Variational Autoencoder;Stochastic Integer Programming;Scenario Reduction", "primary_area": "", "supplementary_material": "", "author": "Yaoxin Wu;Wen Song;Zhiguang Cao;Jie Zhang", "authorids": "~Yaoxin_Wu2;~Wen_Song1;~Zhiguang_Cao1;~Jie_Zhang9", "gender": "M;M;M;M", "homepage": "https://songwenas12.github.io/;https://zhiguangcaosg.github.io/;https://personal.ntu.edu.sg/zhangj/;https://research.tue.nl/en/persons/yaoxin-wu", "dblp": "50/5489;178/8621;84/6889-2;192/4964", "google_scholar": "s8Nz-xoAAAAJ;https://scholar.google.com.sg/citations?user=2R-cOkYAAAAJ;IFV_RdMAAAAJ;0qRnmK8AAAAJ", "orcid": "0000-0001-7624-1861;0000-0002-4499-759X;;0000-0002-3625-6599", "linkedin": ";;;", "or_profile": "~Wen_Song1;~Zhiguang_Cao1;~Jie_Zhang9;~YAOXIN_WU1", "aff": "Shandong University;Singapore Institute of Manufacturing Technology, A*STAR;Nanyang Technological University;Nanyang Technological University", "aff_domain": "sdu.edu.cn;simtech.a-star.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "Associate Professor;Scientist;Full Professor;PhD student", "bibtex": "@inproceedings{\nwu2022learning,\ntitle={Learning Scenario Representation for Solving Two-stage Stochastic Integer Programs},\nauthor={Yaoxin Wu and Wen Song and Zhiguang Cao and Jie Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=06Wy2BtxXrz}\n}", "github": "", "project": "", "reviewers": "2AQy;Qekv;tf4e", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;2;3", "correctness": 
"3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "89;73;103", "wc_summary_review": "48;40;68", "wc_main_review": "669;430;290", "wc_review": "806;543;461", "wc_reply_reviewers": "227;0;0", "wc_reply_authors": "3354;1062;750", "reply_reviewers": "4;0;0", "reply_authors": "10;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 88.33333333333333, 12.256517540566824 ], "wc_summary_review_avg": [ 52.0, 11.775681155103795 ], "wc_main_review_avg": [ 463.0, 156.47577022231482 ], "wc_review_avg": [ 603.3333333333334, 147.1650622789103 ], "wc_reply_reviewers_avg": [ 75.66666666666667, 107.00882621956418 ], "wc_reply_authors_avg": [ 1722.0, 1161.0064599303485 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "reply_authors_avg": [ 5.0, 3.559026084010437 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4991507840660372740&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=06Wy2BtxXrz", "email": "sdu.edu.cn;simtech.a-star.edu.sg;ntu.edu.sg;ntu.edu.sg", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Shandong University;Singapore Institute of Manufacturing Technology;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.sdu.edu.cn;https://www.simtech.a-star.edu.sg;https://www.ntu.edu.sg", "aff_unique_abbr": "SDU;SIMTech;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;Singapore" }, { "id": "06fUz_bJStS", "title": "Differentially Private SGD with Sparse Gradients", "track": "main", "status": "Reject", "tldr": "", "abstract": "A large number of recent studies reveal that networks and their optimization updates contain information about potentially private training data. To protect sensitive training data, differential privacy has been adopted in deep learning to provide rigorously defined and measurable privacy. However, differentially private stochastic gradient descent (DP-SGD) requires the injection of an amount of noise that scales with the number of gradient dimensions, while neural networks typically contain millions of parameters. As a result, networks trained with DP-SGD typically have large performance drops compared to non-private training. Recent works propose to first project gradients into a lower dimensional subspace, which is found by application of the power method, and then inject noise in this subspace. Although better performance has been achieved, the use of the power method leads to a significantly increased memory footprint by storing sample gradients, and more computational cost by projection. In this work, we mitigate these disadvantages through a sparse gradient representation. Specifically, we randomly freeze a progressively increasing subset of parameters, which results in sparse gradient updates while maintaining or increasing accuracy over differentially private baselines. Our experiment shows that we can reduce up to 40\\% of the gradient dimension while achieve the same performance within the same training epochs. 
Additionally, sparsity of the gradient updates is beneficial for decreasing communication overhead when deployed in collaborative training, e.g. federated learning. When we apply our approach across various DP-SGD frameworks, we maintain accuracy while achieving up to 70\\% representation sparsity, which shows that our approach is a safe and effective add-on to a variety of methods. We further observe that our approach improves accuracy, particularly for large networks. Importantly, the additional computational cost of our approach is negligible, and it reduces computation during training by lowering the cost of the power method iterations.", "keywords": "differential privacy;differentially private SGD;privacy-preserving training", "primary_area": "", "supplementary_material": "", "author": "Junyi Zhu;Matthew B. Blaschko", "authorids": "~Junyi_Zhu1;~Matthew_B._Blaschko1", "gender": "M;M", "homepage": "https://junyizhu-ai.github.io/;http://homes.esat.kuleuven.be/~mblaschk/", "dblp": "192/6828-2.html;12/5233", "google_scholar": "3LeC4cMAAAAJ;EmmO7LcAAAAJ", "orcid": "0000-0002-8980-5336;0000-0002-2640-181X", "linkedin": ";matthew-blaschko-5b7a51b0/", "or_profile": "~Junyi_Zhu1;~Matthew_Blaschko1", "aff": "KU Leuven;KU Leuven", "aff_domain": "kuleuven.be;esat.kuleuven.be", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nzhu2022differentially,\ntitle={Differentially Private {SGD} with Sparse Gradients},\nauthor={Junyi Zhu and Matthew B. Blaschko},\nyear={2022},\nurl={https://openreview.net/forum?id=06fUz_bJStS}\n}", "github": "", "project": "", "reviewers": "w3pG;A5Uk;omCe;rw2E", "site": "https://openreview.net/forum?id=06fUz_bJStS", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "40;30;34;66", "wc_summary_review": "58;17;218;19", "wc_main_review": "467;240;56;259", "wc_review": "565;287;308;344", "wc_reply_reviewers": "0;0;96;23", "wc_reply_authors": "742;809;582;588", "reply_reviewers": "0;0;1;1", "reply_authors": "2;2;2;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 42.5, 14.026760139105537 ], "wc_summary_review_avg": [ 78.0, 82.46514415193852 ], "wc_main_review_avg": [ 255.5, 145.58931966322254 ], "wc_review_avg": [ 376.0, 111.0067565511217 ], "wc_reply_reviewers_avg": [ 29.75, 39.38511774769754 ], "wc_reply_authors_avg": [ 680.25, 98.17427106935911 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8780409541333393423&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Katholieke Universiteit Leuven", "aff_unique_dep": "", "aff_unique_url": "https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Belgium" }, { "title": "Zero-Shot Self-Supervised Learning for MRI Reconstruction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6862", "id":
"085y6YPaYjP", "poster": "", "openreview": "https://openreview.net/forum?id=085y6YPaYjP", "slides": "https://iclr.cc/virtual/2022/poster/6862", "video": "https://iclr.cc/virtual/2022/poster/6862", "author_site": "Burhaneddin Yaman, Seyed Amir Hossein Hosseini, Mehmet Akcakaya", "tldr": "", "abstract": "Deep learning (DL) has emerged as a powerful tool for accelerated MRI reconstruction, but often necessitates a database of fully-sampled measurements for training. Recent self-supervised and unsupervised learning approaches enable training without fully-sampled data. However, a database of undersampled measurements may not be available in many scenarios, especially for scans involving contrast or translational acquisitions in development. Moreover, recent studies show that database-trained models may not generalize well when the unseen measurements differ in terms of sampling pattern, acceleration rate, SNR, image contrast, and anatomy. Such challenges necessitate a new methodology to enable subject-specific DL MRI reconstruction without external training datasets, since it is clinically imperative to provide high-quality reconstructions that can be used to identify lesions/disease for $\\textit{every individual}$. In this work, we propose a zero-shot self-supervised learning approach to perform subject-specific accelerated DL MRI reconstruction to tackle these issues. The proposed approach partitions the available measurements from a single scan into three disjoint sets. Two of these sets are used to enforce data consistency and define loss during training for self-supervision, while the last set serves to self-validate, establishing an early stopping criterion. In the presence of models pre-trained on a database with different image characteristics, we show that the proposed approach can be combined with transfer learning for faster convergence time and reduced computational complexity.", "keywords": "Zero-shot learning;Self-supervised learning;MRI Reconstruction;Transfer learning;Physics-guided deep learning", "primary_area": "", "supplementary_material": "", "author": "Burhaneddin Yaman;Seyed Amir Hossein Hosseini;Mehmet Akcakaya", "authorids": "~Burhaneddin_Yaman1;~Seyed_Amir_Hossein_Hosseini1;~Mehmet_Akcakaya1", "gender": ";M;M", "homepage": "https://yaman.umn.edu/;;http://z.umn.edu/akcakaya/", "dblp": "204/6339;;02/4471", "google_scholar": "0JS9ozcAAAAJ;iXb2KX0AAAAJ;x-q3XC4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Burhaneddin_Yaman1;~Seyed_Amir_Hossein_Hosseini1;~Mehmet_Akcakaya1", "aff": "University of Minnesota, Minneapolis;;University of Minnesota - Twin Cities", "aff_domain": "umn.edu;;umn.edu", "position": "PhD student;;Associate Professor", "bibtex": "@inproceedings{\nyaman2022zeroshot,\ntitle={Zero-Shot Self-Supervised Learning for {MRI} Reconstruction},\nauthor={Burhaneddin Yaman and Seyed Amir Hossein Hosseini and Mehmet Akcakaya},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=085y6YPaYjP}\n}", "github": "", "project": "", "reviewers": "mBMk;19v3;pt6r", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "5;4;4", "correctness": "4;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "122;133;168", "wc_summary_review": "73;90;95", "wc_main_review": "187;354;553", "wc_review": "382;577;816", "wc_reply_reviewers": "52;0;441", "wc_reply_authors": "1823;747;2440", "reply_reviewers": "1;0;2", "reply_authors": "3;1;4", "recommendation_avg": [ 
5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 141.0, 19.61292091114087 ], "wc_summary_review_avg": [ 86.0, 9.41629792788369 ], "wc_main_review_avg": [ 364.6666666666667, 149.60912033993412 ], "wc_review_avg": [ 591.6666666666666, 177.48301953207303 ], "wc_reply_reviewers_avg": [ 164.33333333333334, 196.78132251026489 ], "wc_reply_authors_avg": [ 1670.0, 699.5803504006288 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8560658023776593054&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=085y6YPaYjP", "email": "umn.edu;;umn.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Minnesota", "aff_unique_dep": "", "aff_unique_url": "https://www.minnesota.edu", "aff_unique_abbr": "UMN", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Minneapolis;Twin Cities", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "NAS-Bench-Suite: NAS Evaluation is (Now) Surprisingly Easy", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7126", "id": "0DLwqQLmqV", "poster": "", "openreview": "https://openreview.net/forum?id=0DLwqQLmqV", "slides": "https://iclr.cc/virtual/2022/poster/7126", "video": "https://iclr.cc/virtual/2022/poster/7126", "author_site": "Yash Mehta, Colin White, Arber Zela, Arjun Krishnakumar, Guri Zabergja, Shakiba Moradian, Mahmoud Safari, Kaicheng Yu, Frank Hutter", "tldr": "", "abstract": "The release of tabular benchmarks, such as NAS-Bench-101 and NAS-Bench-201, has significantly lowered the computational overhead for conducting scientific research in neural architecture search (NAS). Although they have been widely adopted and used to tune real-world NAS algorithms, these benchmarks are limited to small search spaces and focus solely on image classification. Recently, several new NAS benchmarks have been introduced that cover significantly larger search spaces over a wide range of tasks, including object detection, speech recognition, and natural language processing. However, substantial differences among these NAS benchmarks have so far prevented their widespread adoption, limiting researchers to using just a few benchmarks. In this work, we present an in-depth analysis of popular NAS algorithms and performance prediction methods across 25 different combinations of search spaces and datasets, finding that many conclusions drawn from a few NAS benchmarks do \\emph{not} generalize to other benchmarks. To help remedy this problem, we introduce NAS-Bench-Suite, a comprehensive and extensible collection of NAS benchmarks, accessible through a unified interface, created with the aim of facilitating reproducible, generalizable, and rapid NAS research.
Our code is available at https://github.com/automl/naslib.", "keywords": "neural architecture search;AutoML", "primary_area": "", "supplementary_material": "", "author": "Yash Mehta;Colin White;Arber Zela;Arjun Krishnakumar;Guri Zabergja;Shakiba Moradian;Mahmoud Safari;Kaicheng Yu;Frank Hutter", "authorids": "~Yash_Mehta1;~Colin_White1;~Arber_Zela1;~Arjun_Krishnakumar1;~Guri_Zabergja1;~Shakiba_Moradian1;~Mahmoud_Safari1;~Kaicheng_Yu1;~Frank_Hutter1", "gender": "M;M;M;M;M;F;M;M;M", "homepage": "https://yashsmehta.com/;https://crwhite.ml/;https://ml.informatik.uni-freiburg.de/people/zela/index.html;;https://github.com/gurizab;https://www.linkedin.com/in/shakiba-moradian-99828b133;https://ml.informatik.uni-freiburg.de/profile/safari/;https://www.yukaicheng.cn;http://ml.informatik.uni-freiburg.de/~hutter/", "dblp": ";136/9162;;312/6584;312/6886;;280/3542;;89/5383", "google_scholar": "zFqBbIkAAAAJ;LS6HY-gAAAAJ;hD_6YioAAAAJ;;essstzYAAAAJ;;https://scholar.google.it/citations?user=ntPjyLwAAAAJ;j9OguiIAAAAJ;https://scholar.google.de/citations?user=YUrxwrkAAAAJ", "orcid": "0000-0002-9610-7077;;;;;;;;0000-0002-2037-3694", "linkedin": "yashsmehta/;;https://de.linkedin.com/in/arber-zela-ba85a2145;arjun-krishnakumar-10235754/;guri-zab%C3%ABrgja-88b8a4215/;;;;frank-hutter-9190b24b/", "or_profile": "~Yash_Mehta1;~Colin_White1;~Arber_Zela1;~Arjun_Krishnakumar1;~Guri_Zabergja1;~Shakiba_Moradian1;~Mahmoud_Safari1;~Kaicheng_Yu1;~Frank_Hutter1", "aff": "HHMI Janelia Research Campus;Abacus.AI;University of Freiburg;University of Freiburg, Universit\u00e4t Freiburg;CS Department, University of Freiburg, Germany, Albert-Ludwigs-Universit\u00e4t Freiburg;Universit\u00e4t Freiburg;Universit\u00e4t Freiburg;Alibaba Group;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_domain": "janelia.hhmi.org;abacus.ai;uni-freiburg.de;cs.uni-freiburg.de;informatik.uni-freiburg.de;uni-freiburg.de;uni-freiburg.de;alibaba-inc.com;uni-freiburg.de", "position": "Researcher;Head of Research;PhD student;MS student;MS student;MS student;Postdoc;Researcher;Full Professor", "bibtex": "@inproceedings{\nmehta2022nasbenchsuite,\ntitle={{NAS}-Bench-Suite: {NAS} Evaluation is (Now) Surprisingly Easy},\nauthor={Yash Mehta and Colin White and Arber Zela and Arjun Krishnakumar and Guri Zabergja and Shakiba Moradian and Mahmoud Safari and Kaicheng Yu and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0DLwqQLmqV}\n}", "github": "", "project": "", "reviewers": "2xvD;Gku7;Ensh;xtz2", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;4;3", "correctness": "4;3;4;4", "technical_novelty": "2;1;3;3", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "49;48;38;57", "wc_summary_review": "18;49;15;27", "wc_main_review": "62;144;146;288", "wc_review": "129;241;199;372", "wc_reply_reviewers": "0;22;0;0", "wc_reply_authors": "273;767;117;510", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 48.0, 6.745368781616021 ], "wc_summary_review_avg": [ 27.25, 13.311179511974137 ], "wc_main_review_avg": [ 160.0, 81.30190649671138 ], "wc_review_avg": [ 235.25, 88.51094565080638 ], "wc_reply_reviewers_avg": [ 5.5, 9.526279441628825 ], "wc_reply_authors_avg": [ 416.75, 245.9089008149156 ], 
"reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4023865038521320162&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=0DLwqQLmqV", "email": "janelia.hhmi.org;abacus.ai;uni-freiburg.de;cs.uni-freiburg.de;informatik.uni-freiburg.de;uni-freiburg.de;uni-freiburg.de;alibaba-inc.com;uni-freiburg.de", "author_num": 9, "aff_unique_index": "0;1;2;2;2;2;2;3;4", "aff_unique_norm": "HHMI Janelia Research Campus;Abacus.AI;University of Freiburg;Alibaba Group;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.janelia.org;https://www.abacus.ai;https://www.uni-freiburg.de;https://www.alibaba.com;https://www.uni-freiburg.de", "aff_unique_abbr": "HHMI Janelia;Abacus.AI;UoF;Alibaba;Albert-Ludwigs-Universit\u00e4t", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Janelia;;Freiburg", "aff_country_unique_index": "0;0;1;1;1;1;1;2;1", "aff_country_unique": "United States;Germany;China" }, { "title": "Fast Model Editing at Scale", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6846", "id": "0DcZxeWfOPt", "poster": "", "openreview": "https://openreview.net/forum?id=0DcZxeWfOPt", "slides": "https://iclr.cc/virtual/2022/poster/6846", "video": "https://iclr.cc/virtual/2022/poster/6846", "author_site": "Eric Mitchell, Charles Lin, Antoine Bosselut, Chelsea Finn, Christopher Manning", "tldr": "", "abstract": "While large pre-trained models have enabled impressive results on a variety of downstream tasks, the largest existing models still make errors, and even accurate predictions may become outdated over time. Because detecting all such failures at training time is impossible, enabling both developers and end users of such models to correct inaccurate outputs while leaving the model otherwise intact is desirable. However, the distributed, black-box nature of the representations learned by large neural networks makes producing such targeted edits difficult. If presented with only a single problematic input and new desired output, fine-tuning approaches tend to overfit; other editing algorithms are either computationally infeasible or simply ineffective when applied to very large models. To enable easy post-hoc editing at scale, we propose Model Editor Networks using Gradient Decomposition (MEND), a collection of small auxiliary editing networks that use a single desired input-output pair to make fast, local edits to a pre-trained model's behavior. MEND learns to transform the gradient obtained by standard fine-tuning, using a low-rank decomposition of the gradient to make the parameterization of this transformation tractable. MEND can be trained on a single GPU in less than a day even for 10 billion+ parameter models; once trained MEND enables rapid application of new edits to the pre-trained model. Our experiments with T5, GPT, BERT, and BART models show that MEND is the only approach to model editing that effectively edits the behavior of models with more than 10 billion parameters. 
Code available at https://sites.google.com/view/mend-editing.", "keywords": "editing;transformers;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Eric Mitchell;Charles Lin;Antoine Bosselut;Chelsea Finn;Christopher D Manning", "authorids": "~Eric_Mitchell1;~Charles_Lin2;~Antoine_Bosselut1;~Chelsea_Finn1;~Christopher_D_Manning1", "gender": "M;;M;F;M", "homepage": "https://ericmitchell.ai;;https://atcbosselut.github.io/;https://ai.stanford.edu/~cbfinn/;https://nlp.stanford.edu/~manning/", "dblp": "238/0419;;184/3742;131/1783;m/ChristopherDManning", "google_scholar": "q77J4fgAAAAJ;;XD9hkJwAAAAJ;vfPE6hgAAAAJ;1zmDOdwAAAAJ", "orcid": "0000-0002-7487-1744;;;;0000-0001-6155-649X", "linkedin": ";charles-lin/;;;christopher-manning-011575/", "or_profile": "~Eric_Mitchell1;~Charles_Lin2;~Antoine_Bosselut1;~Chelsea_Finn1;~Christopher_D_Manning1", "aff": "Stanford University;Stanford University;Swiss Federal Institute of Technology Lausanne;Google;Computer Science Department, Stanford University", "aff_domain": "stanford.edu;stanford.edu;epfl.ch;google.com;cs.stanford.edu", "position": "PhD student;Undergrad student;Assistant Professor;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nmitchell2022fast,\ntitle={Fast Model Editing at Scale},\nauthor={Eric Mitchell and Charles Lin and Antoine Bosselut and Chelsea Finn and Christopher D Manning},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0DcZxeWfOPt}\n}", "github": "", "project": "", "reviewers": "WxF5;tLcY;vuAW;5ZVS", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "47;205;77;140", "wc_summary_review": "27;38;50;49", "wc_main_review": "365;143;668;1001", "wc_review": "439;386;795;1190", "wc_reply_reviewers": "616;0;56;50", "wc_reply_authors": "1856;95;793;485", "reply_reviewers": "3;0;1;1", "reply_authors": "3;1;1;2", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 117.25, 60.77160109788124 ], "wc_summary_review_avg": [ 41.0, 9.354143466934854 ], "wc_main_review_avg": [ 544.25, 322.9035266143744 ], "wc_review_avg": [ 702.5, 322.41936976552756 ], "wc_reply_reviewers_avg": [ 180.5, 252.37422610084414 ], "wc_reply_authors_avg": [ 807.25, 654.0689470537491 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.07053456158585983, "corr_recommendation_correctness": 0.49374193110101877, "gs_citation": 510, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16012977472608893653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=0DcZxeWfOPt", "email": "stanford.edu;stanford.edu;epfl.ch;google.com;cs.stanford.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Stanford University;Swiss Federal Institute of Technology Lausanne;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.stanford.edu;https://www.epfl.ch;https://www.google.com", "aff_unique_abbr": "Stanford;EPFL;Google", "aff_campus_unique_index": "0;0;1;2;0", "aff_campus_unique": "Stanford;Lausanne;Mountain View",
"aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Switzerland" }, { "id": "0DecTiJFbm", "title": "A New Perspective on Fluid Simulation: An Image-to-Image Translation Task via Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Standard numerical methods for creating simulation models in the field of fluid dynamics are designed to be close to perfection, which results in high computational effort and high computation times in many cases. Unfortunately, there is no mathematical way to decrease this correctness in cases where only approximate predictions are needed. For such cases, we developed an approach based on Neural Networks that is much less time-consuming but nearly as accurate as the numerical model for a human observer. We show that we can keep our results stable and nearly indistinguishable from their numerical counterparts over tenth to hundreds of time steps.", "keywords": "cGAN;CFD;Image-to-Image;Fluid Simulation", "primary_area": "", "supplementary_material": "/attachment/006c0a23c0099bd2c9dd0e0baac4c3eff7474557.zip", "author": "Roman Lehmann;Markus Hoffmann;Simon Leufen;Wolfgang Karl", "authorids": "~Roman_Lehmann1;markus.hoffmann@kit.edu;uevou@student.kit.edu;wolfgang.karl@kit.edu", "gender": "M;;;", "homepage": "https://capp.itec.kit.edu/61_162.php;;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Roman_Lehmann1;markus.hoffmann@kit.edu;uevou@student.kit.edu;wolfgang.karl@kit.edu", "aff": "Karlsruhe Institute of Technology;;;", "aff_domain": "kit.edu;;;", "position": "PhD student;;;", "bibtex": "@misc{\nlehmann2022a,\ntitle={A New Perspective on Fluid Simulation: An Image-to-Image Translation Task via Neural Networks},\nauthor={Roman Lehmann and Markus Hoffmann and Simon Leufen and Wolfgang Karl},\nyear={2022},\nurl={https://openreview.net/forum?id=0DecTiJFbm}\n}", "github": "", "project": "", "reviewers": "jvzR;9rKJ;BH97;1Jbg", "site": "https://openreview.net/forum?id=0DecTiJFbm", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "5;5;5;4", "correctness": "2;3;3;1", "technical_novelty": "1;1;2;1", "empirical_novelty": "1;1;2;0", "wc_summary_paper": "34;74;81;21", "wc_summary_review": "51;40;38;29", "wc_main_review": "138;441;722;259", "wc_review": "223;555;841;309", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 52.5, 25.53918557824427 ], "wc_summary_review_avg": [ 39.5, 7.826237921249264 ], "wc_main_review_avg": [ 390.0, 219.9374911196361 ], "wc_review_avg": [ 482.0, 240.42670400768714 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W9N5yMKSA4wJ:scholar.google.com/&scioq=A+New+Perspective+on+Fluid+Simulation:+An+Image-to-Image+Translation+Task+via+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Karlsruhe Institute of Technology", 
"aff_unique_dep": "", "aff_unique_url": "https://www.kit.edu", "aff_unique_abbr": "KIT", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "id": "0EL4vLgYKRW", "title": "Plan Better Amid Conservatism: Offline Multi-Agent Reinforcement Learning with Actor Rectification", "track": "main", "status": "Reject", "tldr": "", "abstract": "The idea of conservatism has led to significant progress in offline reinforcement learning (RL) where an agent learns from pre-collected datasets. However, it is still an open question to resolve offline RL in the more practical multi-agent setting as many real-world scenarios involve interaction among multiple agents. Given the recent success of transferring online RL algorithms to the multi-agent setting, one may expect that offline RL algorithms will also transfer to multi-agent settings directly. Surprisingly, when conservatism-based algorithms are applied to the multi-agent setting, the performance degrades significantly with an increasing number of agents. Towards mitigating the degradation, we identify that a key issue that the landscape of the value function can be non-concave and policy gradient improvements are prone to local optima. Multiple agents exacerbate the problem since the suboptimal policy by any agent could lead to uncoordinated global failure. Following this intuition, we propose a simple yet effective method, \\underline{O}ffline \\underline{M}ulti-Agent RL with \\underline{A}ctor \\underline{R}ectification (OMAR), to tackle this critical challenge via an effective combination of first-order policy gradient and zeroth-order optimization methods for the actor to better optimize the conservative value function. Despite the simplicity, OMAR significantly outperforms strong baselines with state-of-the-art performance in multi-agent continuous control benchmarks.", "keywords": "Multi-Agent Reinforcement Learning (MARL);Offline reinforcement learning (RL);Offline Multi-Agent Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Ling Pan;Longbo Huang;Tengyu Ma;Huazhe Xu", "authorids": "~Ling_Pan1;~Longbo_Huang2;~Tengyu_Ma1;~Huazhe_Xu1", "gender": "F;M;M;M", "homepage": "https://ling-pan.github.io/;http://people.iiis.tsinghua.edu.cn/~huang/;http://ai.stanford.edu/~tengyuma/;http://hxu.rocks", "dblp": "199/9303/;79/7077;54/9061;164/9006", "google_scholar": "qZ_zlacAAAAJ;;i38QlUwAAAAJ;t9HPFawAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ling_Pan1;~Longbo_Huang2;~Tengyu_Ma1;~Huazhe_Xu1", "aff": "Tsinghua University;Tsinghua University;Facebook AI Research;Stanford University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;fb.com;stanford.edu", "position": "PhD student;Full Professor;Visiting Scientist;Postdoc", "bibtex": "@misc{\npan2022plan,\ntitle={Plan Better Amid Conservatism: Offline Multi-Agent Reinforcement Learning with Actor Rectification},\nauthor={Ling Pan and Longbo Huang and Tengyu Ma and Huazhe Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=0EL4vLgYKRW}\n}", "github": "", "project": "", "reviewers": "pt8q;cYK5;6gYu;qETT", "site": "https://openreview.net/forum?id=0EL4vLgYKRW", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "75;40;84;105", "wc_summary_review": "30;34;31;78", "wc_main_review": "244;427;236;192", "wc_review": "349;501;351;375", "wc_reply_reviewers": "0;28;0;0", "wc_reply_authors": "443;1157;1059;685", "reply_reviewers": 
"0;1;0;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 76.0, 23.46273641330013 ], "wc_summary_review_avg": [ 43.25, 20.116846174288852 ], "wc_main_review_avg": [ 274.75, 90.10375963299201 ], "wc_review_avg": [ 394.0, 62.617888817813075 ], "wc_reply_reviewers_avg": [ 7.0, 12.12435565298214 ], "wc_reply_authors_avg": [ 836.0, 287.2368360778262 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14725479827547401552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Tsinghua University;Meta;Stanford University", "aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://research.facebook.com;https://www.stanford.edu", "aff_unique_abbr": "THU;FAIR;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Pyraformer: Low-Complexity Pyramidal Attention for Long-Range Time Series Modeling and Forecasting", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6827", "id": "0EXmFzUn5I", "poster": "", "openreview": "https://openreview.net/forum?id=0EXmFzUn5I", "slides": "https://iclr.cc/virtual/2022/poster/6827", "video": "https://iclr.cc/virtual/2022/poster/6827", "author_site": "Shi Zhan Liu, Hang Yu, Cong Liao, Jianguo Li, Weiyao Lin, Alex Liu \u00b7", "tldr": "", "abstract": "Accurate prediction of the future given the past based on time series data is of paramount importance, since it opens the door for decision making and risk management ahead of time. In practice, the challenge is to build a flexible but parsimonious model that can capture a wide range of temporal dependencies. In this paper, we propose Pyraformer by exploring the multiresolution representation of the time series. Specifically, we introduce the pyramidal attention module (PAM) in which the inter-scale tree structure summarizes features at different resolutions and the intra-scale neighboring connections model the temporal dependencies of different ranges. Under mild conditions, the maximum length of the signal traversing path in Pyraformer is a constant (i.e., $\\mathcal O(1)$) with regard to the sequence length $L$, while its time and space complexity scale linearly with $L$. Extensive numerical results show that Pyraformer typically achieves the highest prediction accuracy in both single-step and long-range forecasting tasks with the least amount of time and memory consumption, especially when the sequence is long.", "keywords": "sparse attention;pyramidal graph;Transformer;time series forecasting;long-range dependence;multiresolution", "primary_area": "", "supplementary_material": "", "author": "Shizhan Liu;Hang Yu;Cong Liao;Jianguo Li;Weiyao Lin;Alex X. 
Liu;Schahram Dustdar", "authorids": "~Shizhan_Liu1;~Hang_Yu1;~Cong_Liao1;~Jianguo_Li2;~Weiyao_Lin1;~Alex_X._Liu1;dustdar@dsg.tuwien.ac.at", "gender": ";M;;;M;M;", "homepage": ";;;;https://weiyaolin.github.io/;http://alexliucs.gitee.io/;", "dblp": ";74/2568-2;;;42/6095;https://dblp.uni-trier.de/pid/l/AlexXLiu.html;", "google_scholar": ";;;;S9g81n8AAAAJ;https://scholar.google.com.au/citations?hl=en;", "orcid": ";;;;;;", "linkedin": ";hang-yu-7ba38844/;;;;;", "or_profile": "~Shizhan_Liu1;~Hang_Yu1;~Cong_Liao1;~Jianguo_Li2;~Weiyao_Lin1;~Alex_X._Liu1;dustdar@dsg.tuwien.ac.at", "aff": ";Ant Group;;;Shanghai Jiaotong University;;", "aff_domain": ";antgroup.com;;;sjtu.edu.cn;;", "position": ";Senior Algorithm Expert;;;Full Professor;;", "bibtex": "@inproceedings{\nliu2022pyraformer,\ntitle={Pyraformer: Low-Complexity Pyramidal Attention for Long-Range Time Series Modeling and Forecasting},\nauthor={Shizhan Liu and Hang Yu and Cong Liao and Jianguo Li and Weiyao Lin and Alex X. Liu and Schahram Dustdar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0EXmFzUn5I}\n}", "github": "", "project": "", "reviewers": "WMVh;f9f9;17as;cdAe", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;3;4", "correctness": "4;3;4;4", "technical_novelty": "4;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "105;81;107;85", "wc_summary_review": "27;38;52;18", "wc_main_review": "238;358;691;82", "wc_review": "370;477;850;185", "wc_reply_reviewers": "31;35;16;0", "wc_reply_authors": "1273;1265;1997;659", "reply_reviewers": "1;1;1;0", "reply_authors": "3;3;4;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 94.5, 11.6081867662439 ], "wc_summary_review_avg": [ 33.75, 12.695963925594622 ], "wc_main_review_avg": [ 342.25, 223.87091704819542 ], "wc_review_avg": [ 470.5, 242.73081798568555 ], "wc_reply_reviewers_avg": [ 20.5, 13.793114224133722 ], "wc_reply_authors_avg": [ 1298.5, 473.98180344819144 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 984, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14372054485665578370&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=0EXmFzUn5I", "email": ";antgroup.com;;;sjtu.edu.cn;;", "author_num": 7, "aff_unique_index": "0;1", "aff_unique_norm": "Ant Group;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.antgroup.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "Ant Group;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "0GhVG1de-Iv", "title": "Stability and Generalisation in Batch Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Overfitting has been recently acknowledged as a key limiting factor in the capabilities of reinforcement learning algorithms, despite little theoretical characterisation. 
We provide a theoretical examination of overfitting in the context of batch reinforcement learning, through the fundamental relationship between algorithmic stability (Bousquet & Elisseeff, 2002)\u2013which characterises the effect of a change at a single data point\u2013and the generalisation gap\u2013which quantifies overfitting. Examining a popular fitted policy evaluation method with linear value function approximation, we characterise the dynamics of overfitting in the RL context. We provide finite sample, finite time, polynomial bounds on the generalisation gap in RL. In addition, our approach applies to a class of algorithms which only partially fit to temporal difference errors, as is common in deep RL, rather than perfectly optimising at each step. As such, our results characterise an unexplored bias-variance trade-off in the frequency of target network updates. To do so, our work extends the stochastic gradient-based approach of Hardt et al. (2016) to the iterative methods more common in RL. We find that under regimes where learning requires few iterations, the expected temporal difference error over the dataset is representative of the true performance on the MDP, indicating that, as is the case in supervised learning, good generalisation in RL can be ensured through the use of algorithms that learn quickly.\n", "keywords": "Reinforcement Learning;Algorithmic Stability;Generalisation;Overfitting;Target Network;Fitted TD;Off-Policy;Batch Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/247d2d0ab02940046f868c6cfc123989ccfd6076.zip", "author": "Matthew J. A. Smith;Shimon Whiteson", "authorids": "~Matthew_J._A._Smith2;~Shimon_Whiteson1", "gender": ";", "homepage": ";", "dblp": ";https://dblp.uni-trier.de/pers/w/Whiteson:Shimon.html", "google_scholar": "axVCqWIAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Matthew_J._A._Smith2;~Shimon_Whiteson1", "aff": "University of Oxford;University of Oxford", "aff_domain": "ox.ac.uk;ox.ac.uk", "position": "PhD student;Professor", "bibtex": "@misc{\nsmith2022stability,\ntitle={Stability and Generalisation in Batch Reinforcement Learning},\nauthor={Matthew J. A. 
Smith and Shimon Whiteson},\nyear={2022},\nurl={https://openreview.net/forum?id=0GhVG1de-Iv}\n}", "github": "", "project": "", "reviewers": "HjVF;P8Es;ZmZV", "site": "https://openreview.net/forum?id=0GhVG1de-Iv", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;2", "correctness": "3;2;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;0;0", "wc_summary_paper": "109;121;25", "wc_summary_review": "134;104;41", "wc_main_review": "1492;1505;209", "wc_review": "1735;1730;275", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "79;154;97", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.9428090415820634 ], "wc_summary_paper_avg": [ 85.0, 42.708313008125245 ], "wc_summary_review_avg": [ 93.0, 38.7556447501522 ], "wc_main_review_avg": [ 1068.6666666666667, 607.8992972158757 ], "wc_review_avg": [ 1246.6666666666667, 687.0751212365517 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 110.0, 31.96873472629156 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3223907155751598182&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "0HkFxvSRDSW", "title": "Role Diversity Matters: A Study of Cooperative Training Strategies for Multi-Agent RL", "track": "main", "status": "Reject", "tldr": "", "abstract": "Cooperative multi-agent reinforcement learning (MARL) is making rapid progress for solving tasks in a grid world and real-world scenarios, in which agents are given different attributes and goals. For example, in Starcraft II battle tasks, agents are initialized with various move, defense, and attack abilities according to their unit types. Current researchers tend to treat different agents equally and expect them to form a joint policy automatically. However, ignoring the differences between agents in these scenarios may cause policy degradation. Accordingly, in this study, we quantify the differences between agents and study the relationship between agent roles and model performance via {\\bf Role Diversity}, a metric that can describe MARL tasks. We define role diversity from three perspectives: policy-based, trajectory-based, and contribution-based, to fully describe the agents' differences. Through theoretical analysis, we find that the error bound in MARL can be decomposed into three parts that have a strong relation to role diversity. The decomposed factors can significantly impact policy optimization on parameter sharing, communication mechanism, and credit assignment strategy. Role diversity can therefore serve as a flag for selecting a suitable training strategy and for avoiding possible bottlenecks in current tasks.
The main experimental platforms are based on {\\bf Multiagent Particle Environment (MPE)} and {\\bf The StarCraft Multi-Agent Challenge (SMAC)}, with extensions to ensure the requirements of this study are met. Our experimental results clearly show that role diversity can serve as a robust description of the characteristics of a multi-agent cooperation task and help explain why the performance of different MARL training strategies is unstable. In addition, role diversity can help in finding a better training strategy and increasing performance in cooperative MARL.", "keywords": "Multi-agent Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Siyi Hu;Chuanlong Xie;Xiaodan Liang;Xiaojun Chang", "authorids": "~Siyi_Hu1;~Chuanlong_Xie1;~Xiaodan_Liang2;~Xiaojun_Chang1", "gender": "M;M;F;M", "homepage": "http://www.mmvg.org;;https://www.sysu-hcp.net/;https://www.xiaojun.ai", "dblp": ";;;116/8412", "google_scholar": ";_fgE3u8AAAAJ;voxznZAAAAAJ;https://scholar.google.co.uk/citations?user=8suupocAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Siyi_Hu1;~Chuanlong_Xie1;~Xiaodan_Liang2;~Xiaojun_Chang1", "aff": "Monash University;Huawei Technologies Ltd.;SUN YAT-SEN UNIVERSITY;Royal Melbourne Institute of Technology", "aff_domain": "monash.edu;huawei.com;sysu.edu.cn;rmit.edu.au", "position": "PhD student;Researcher;Associate Professor;Associate Professor", "bibtex": "@misc{\nhu2022role,\ntitle={Role Diversity Matters: A Study of Cooperative Training Strategies for Multi-Agent {RL}},\nauthor={Siyi Hu and Chuanlong Xie and Xiaodan Liang and Xiaojun Chang},\nyear={2022},\nurl={https://openreview.net/forum?id=0HkFxvSRDSW}\n}", "github": "", "project": "", "reviewers": "mf45;L6y7;SqGu;97k6", "site": "https://openreview.net/forum?id=0HkFxvSRDSW", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;2;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "34;63;87;69", "wc_summary_review": "28;989;30;26", "wc_main_review": "515;289;249;210", "wc_review": "577;1341;366;305", "wc_reply_reviewers": "238;385;0;112", "wc_reply_authors": "1461;2676;708;1181", "reply_reviewers": "1;1;0;1", "reply_authors": "3;5;1;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 63.25, 19.057478846898924 ], "wc_summary_review_avg": [ 268.25, 416.12760963435244 ], "wc_main_review_avg": [ 315.75, 118.37942177591509 ], "wc_review_avg": [ 647.25, 413.05591328535655 ], "wc_reply_reviewers_avg": [ 183.75, 143.4893288715227 ], "wc_reply_authors_avg": [ 1506.5, 726.8687983398379 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FlnC6pOq83YJ:scholar.google.com/&scioq=Role+Diversity+Matters:+A+Study+of+Cooperative+Training+Strategies+for+Multi-Agent+RL&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Monash University;Huawei;Sun Yat-sen University;Royal Melbourne Institute of Technology", "aff_unique_dep": ";Huawei Technologies;;", "aff_unique_url":
"https://www.monash.edu;https://www.huawei.com;http://www.sysu.edu.cn;https://www.rmit.edu.au", "aff_unique_abbr": "Monash;Huawei;SYSU;RMIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Australia;China" }, { "id": "0IqFsR9wJvI", "title": "Online graph nets", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Temporal graph neural networks (T-GNNs) sequentially update node states and use temporal message passing to predict events in continuous-time dynamic graphs. While node states rest in the memory, the message-passing operations must be computed on-demand for each prediction. In practice, these operations are the computational bottleneck of state-of-the-art T-GNNs as they require topologically exploring large temporal graphs. To circumvent this caveat, we propose Online Graph Nets (OGNs). To avoid temporal message passing, OGN maintains a summary of the temporal neighbors of each node in a latent variable and updates it as events unroll, in an online fashion. At prediction time, OGN simply combines node states and their latents to obtain node-level representations. Consequently, the memory cost of OGN is constant with respect to the number of previous events. Remarkably, OGN outperforms most existing T-GNNs on temporal link prediction benchmarks while running orders of magnitude faster. For instance, OGN performs similarly to the best-known T-GNN on Reddit, with a $374\\times$ speedup. Also, since OGNs do not explore temporal graphs at prediction time, they are well-suited for on-device predictions (e.g., on mobile phones).", "keywords": "continuous-time dynamic graphs;temporal graph neural networks;graph neural networks", "primary_area": "", "supplementary_material": "/attachment/6af30fc2f09610dfcebe42b0f282f0857b4baff1.zip", "author": "Hojin Kang;Jou-Hui Ho;Diego Mesquita;Jorge P\u00e9rez;Amauri H Souza", "authorids": "~Hojin_Kang1;~Jou-Hui_Ho1;~Diego_Mesquita1;~Jorge_P\u00e9rez2;~Amauri_H_Souza1", "gender": "M;F;M;M;M", "homepage": ";;https://weakly-informative.github.io;http://www.amauriholanda.org;https://users.dcc.uchile.cl/~jperez/", "dblp": ";;163/4293;131/3352;12/6407-1", "google_scholar": ";;;lP0LBI4AAAAJ;a6lUuiwAAAAJ", "orcid": ";;;;", "linkedin": "hojin-kang-kim-820046170;jou-hui-ho-519946151/;;;", "or_profile": "~Hojin_Kang1;~Jou-Hui_Ho1;~Diego_Mesquita1;~Amauri_H_Souza1;~Jorge_P\u00e9rez1", "aff": ";;Getulio Vargas Foundation;Federal Institute of Cear\u00e1;Universidad de Chile", "aff_domain": ";;fgv.br;ifce.edu.br;dcc.uchile.cl", "position": ";;Assistant Professor;Associate Professor;Associate Professor", "bibtex": "@misc{\nkang2022online,\ntitle={Online graph nets},\nauthor={Hojin Kang and Jou-Hui Ho and Diego Mesquita and Jorge P{\\'e}rez and Amauri H Souza},\nyear={2022},\nurl={https://openreview.net/forum?id=0IqFsR9wJvI}\n}", "github": "", "project": "", "reviewers": "HSHM;HGnR;oULy;5rxL", "site": "https://openreview.net/forum?id=0IqFsR9wJvI", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "5;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "169;165;169;143", "wc_summary_review": "86;55;106;34", "wc_main_review": "314;549;584;307", "wc_review": "569;769;859;484", "wc_reply_reviewers": "0;0;74;0", "wc_reply_authors": "775;640;942;751", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], 
"correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 161.5, 10.805091392487155 ], "wc_summary_review_avg": [ 70.25, 27.716195626384224 ], "wc_main_review_avg": [ 438.5, 128.62056600715144 ], "wc_review_avg": [ 670.25, 150.26539022675848 ], "wc_reply_reviewers_avg": [ 18.5, 32.04293994002423 ], "wc_reply_authors_avg": [ 777.0, 108.02083132433299 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11275998351788872629&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Getulio Vargas Foundation;Federal Institute of Cear\u00e1;Universidad de Chile", "aff_unique_dep": ";;", "aff_unique_url": "https://fgv.br;http://www.ifce.edu.br;https://www.uchile.cl", "aff_unique_abbr": "FGV;IFCE;UCH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Brazil;Chile" }, { "id": "0J98XyjlQ1", "title": "D$^2$-GCN: Data-Dependent GCNs for Boosting Both Efficiency and Scalability", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Convolutional Networks (GCNs) have gained an increasing attention thanks to their state-of-the-art (SOTA) performance in graph-based learning tasks. However, their sheer number of node features and large adjacency matrix limit their deployment into real-world applications, as they impose the following challenges: (1) prohibitive inference cost, especially for resource-constrained applications and (2) low trainability of deep GCNs. To this end, we aim to develop low-cost GCNs with improved trainability, as inspired by recent findings in deep neural network optimization which show that not all data/(model components) are equally important. Specifically, we propose a Data-Dependent GCN framework dubbed D$^2$-GCN which integrates data-dependent dynamic skipping at multiple granularities: (1) node-wise skipping to bypass aggregating features of unimportant neighbor nodes and their corresponding combinations; (2) edge-wise skipping to prune the unimportant edge connections of each node; and (3) bit-wise skipping to dynamically adapt the bit-precision of both the node features and weights. Our D$^2$-GCN is achieved by identifying the importance of node features via a low-cost indicator, and thus is simple and generally applicable to various graph-based learning tasks. 
Extensive experiments and ablation studies on 6 GCN model and dataset pairs consistently validate that the proposed D$^2$-GCN can (1) largely squeeze out unnecessary costs from both the aggregation and combination phases (e.g., reduce the inference FLOPs by $\\downarrow$1.1$\\times$ $\\sim$ $\\downarrow$37.0$\\times$ and shrink the energy cost of GCN inference by $\\downarrow$1.6$\\times$ $\\sim$ $\\downarrow$8.4$\\times$), while offering a comparable or an even better accuracy (e.g., $\\downarrow$ 0.5% $\\sim$ $\\uparrow$ 5.6%); and (2) help GCNs to go deeper by boosting their trainability (e.g., providing a $\\uparrow$ 0.8% $\\sim$ $\\uparrow$ 5.1% higher accuracy when increasing the model depth from 4 layers to 64 layers) and thus achieving a comparable or even better accuracy of GCNs with more layers over SOTA techniques (e.g., a $\\downarrow$0.4% $\\sim$ $\\uparrow$38.6% higher accuracy for models with 64 layers). All the codes and pretrained models will be released upon acceptance.", "keywords": "Graph Convolutional Networks;Efficient Networks", "primary_area": "", "supplementary_material": "", "author": "Chaojian Li;Xu Ouyang;Yang Zhao;Haoran You;Yonggan Fu;Yuchen Gu;Haonan Liu;Siyuan Miao;Yingyan Lin", "authorids": "~Chaojian_Li1;~Xu_Ouyang2;~Yang_Zhao1;~Haoran_You1;~Yonggan_Fu1;~Yuchen_Gu1;~Haonan_Liu1;~Siyuan_Miao1;~Yingyan_Lin1", "gender": ";M;F;M;M;F;M;M;F", "homepage": "https://licj15.github.io/;https://xo28.github.io/;https://www.yangkatiezhao.net/;http://haoranyou.com/;https://www.yongganfu.com/;;;https://www.chsi.com.cn/;https://eiclab.scs.gatech.edu/", "dblp": "249/5403;;50/2082-13;230/4247;244/8166;;;;120/6981", "google_scholar": "HvEBdf4AAAAJ;https://scholar.google.com/citations?view_op=list_works;HxeTq4MAAAAJ;z5Eku1sAAAAJ;https://scholar.google.com/citations?hl=en;;;;dio8IesAAAAJ", "orcid": ";0000-0003-2433-8180;0000-0001-8023-1551;0000-0002-2873-2153;;;;;", "linkedin": ";xu-ouyang-b0a2351a0/;;haoran-you-b4b958165/;yonggan-fu-b211831b0;yuchen-irene-gu;haonan-liu-842a381a6/;;yingyan-celine-lin-a281211a/", "or_profile": "~Chaojian_Li1;~Xu_Ouyang2;~Yang_Zhao1;~Haoran_You1;~Yonggan_Fu1;~Yuchen_Gu1;~Haonan_Liu1;~Siyuan_Miao1;~Yingyan_Lin1", "aff": "Rice University;Rice University;Rice University;Rice University;Rice University;Rice University;Zhejiang University;Xi'an Jiaotong University;Rice University", "aff_domain": "rice.edu;rice.edu;rice.edu;rice.edu;rice.edu;rice.edu;zju.edu.cn;xjtu.edu.cn;rice.edu", "position": "PhD student;Researcher;PhD student;PhD student;PhD student;Undergrad student;Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@misc{\nli2022dgcn,\ntitle={D\\${\\textasciicircum}2\\$-{GCN}: Data-Dependent {GCN}s for Boosting Both Efficiency and Scalability},\nauthor={Chaojian Li and Xu Ouyang and Yang Zhao and Haoran You and Yonggan Fu and Yuchen Gu and Haonan Liu and Siyuan Miao and Yingyan Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=0J98XyjlQ1}\n}", "github": "", "project": "", "reviewers": "AVzB;pLUm;gZU6;u2pj", "site": "https://openreview.net/forum?id=0J98XyjlQ1", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;4;2", "empirical_novelty": "3;3;4;2", "wc_summary_paper": "50;28;67;74", "wc_summary_review": "24;27;32;14", "wc_main_review": "328;317;206;147", "wc_review": "402;372;305;235", "wc_reply_reviewers": "89;0;31;0", "wc_reply_authors": "2002;1007;755;489", "reply_reviewers": "1;0;1;0", "reply_authors": "4;2;1;1", "recommendation_avg": [ 5.0, 
1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 54.75, 17.73943347460679 ], "wc_summary_review_avg": [ 24.25, 6.5717197140474575 ], "wc_main_review_avg": [ 249.5, 76.02137857208326 ], "wc_review_avg": [ 328.5, 64.39914595706996 ], "wc_reply_reviewers_avg": [ 30.0, 36.33868462121325 ], "wc_reply_authors_avg": [ 1063.25, 572.1006795136674 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.8164965809277259, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gNS7y_E4vmMJ:scholar.google.com/&scioq=D%24%5E2%24-GCN:+Data-Dependent+GCNs+for+Boosting+Both+Efficiency+and+Scalability&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;0;1;2;0", "aff_unique_norm": "Rice University;Zhejiang University;Xi'an Jiao Tong University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rice.edu;https://www.zju.edu.cn;https://www.xjtu.edu.cn", "aff_unique_abbr": "Rice;ZJU;XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1;1;0", "aff_country_unique": "United States;China" }, { "title": "KL Guided Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6322", "id": "0JzqUlIVVDd", "poster": "", "openreview": "https://openreview.net/forum?id=0JzqUlIVVDd", "slides": "https://iclr.cc/virtual/2022/poster/6322", "video": "https://iclr.cc/virtual/2022/poster/6322", "author_site": "A. Tuan Nguyen, Toan Tran, Yarin Gal, Philip Torr, Atilim Gunes Baydin", "tldr": "", "abstract": "Domain adaptation is an important problem and often needed for real-world applications. In this problem, instead of i.i.d. training and testing datapoints, we assume that the source (training) data and the target (testing) data have different distributions. With that setting, the empirical risk minimization training procedure often does not perform well, since it does not account for the change in the distribution. A common approach in the domain adaptation literature is to learn a representation of the input that has the same (marginal) distribution over the source and the target domain. However, these approaches often require additional networks and/or optimizing an adversarial (minimax) objective, which can be very expensive or unstable in practice. To improve upon these marginal alignment techniques, in this paper, we first derive a generalization bound for the target loss based on the training loss and the reverse Kullback-Leibler (KL) divergence between the source and the target representation distributions. Based on this bound, we derive an algorithm that minimizes the KL term to obtain a better generalization to the target domain. We show that with a probabilistic representation network, the KL term can be estimated efficiently via minibatch samples without any additional network or a minimax objective. This leads to a theoretically sound alignment method which is also very efficient and stable in practice. 
Experimental results also suggest that our method outperforms other representation-alignment approaches.", "keywords": "domain adaptation;invariant representation", "primary_area": "", "supplementary_material": "/attachment/1db7dd3c325b238240272c2a63bfa0ca9f38dbc8.zip", "author": "A. Tuan Nguyen;Toan Tran;Yarin Gal;Philip Torr;Atilim Gunes Baydin", "authorids": "~A._Tuan_Nguyen1;~Toan_Tran1;~Yarin_Gal1;~Philip_Torr1;~Atilim_Gunes_Baydin1", "gender": "M;M;;;", "homepage": "https://atuannguyen.com;;http://www.cs.ox.ac.uk/people/yarin.gal/website//;http://www.robots.ox.ac.uk/~tvg/;http://www.robots.ox.ac.uk/~gunes/", "dblp": ";207/8479-3;67/9076;;", "google_scholar": "V-guxukAAAAJ;https://scholar.google.com.au/citations?user=PnwSuNMAAAAJ;https://scholar.google.co.uk/citations?user=SIayDoQAAAAJ;;https://scholar.google.co.uk/citations?user=GWBSOj4AAAAJ", "orcid": ";0000-0001-7182-7548;;;", "linkedin": "a-tuan-nguyen/;;;;", "or_profile": "~A._Tuan_Nguyen1;~Toan_Tran1;~Yarin_Gal1;~Philip_Torr1;~Atilim_Gunes_Baydin1", "aff": "University of Oxford;Hanoi University of Science and Technology;University of Oxford;University of Oxford;University of Oxford", "aff_domain": "ox.ac.uk;hust.edu.vn;ox.ac.uk;ox.ac.uk;ox.ac.uk", "position": "PhD student;Lecturer;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nnguyen2022kl,\ntitle={{KL} Guided Domain Adaptation},\nauthor={A. Tuan Nguyen and Toan Tran and Yarin Gal and Philip Torr and Atilim Gunes Baydin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0JzqUlIVVDd}\n}", "github": "", "project": "", "reviewers": "FBaS;V8Y3;i12Z;g5v8", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "47;33;16;115", "wc_summary_review": "100;28;32;61", "wc_main_review": "412;210;266;145", "wc_review": "559;271;314;321", "wc_reply_reviewers": "448;13;0;0", "wc_reply_authors": "1150;936;548;364", "reply_reviewers": "1;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.75, 37.5790832778023 ], "wc_summary_review_avg": [ 55.25, 28.80429655450728 ], "wc_main_review_avg": [ 258.25, 98.55550466615246 ], "wc_review_avg": [ 366.25, 112.91894216649392 ], "wc_reply_reviewers_avg": [ 115.25, 192.18659552632695 ], "wc_reply_authors_avg": [ 749.5, 309.9979838644116 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.14002800840280097, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17961201142994065292&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "pdf": "https://openreview.net/pdf?id=0JzqUlIVVDd", "email": "ox.ac.uk;hust.edu.vn;ox.ac.uk;ox.ac.uk;ox.ac.uk", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Oxford;Hanoi University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.hust.edu.vn", "aff_unique_abbr": "Oxford;HUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hanoi", "aff_country_unique_index": "0;1;0;0;0", 
"aff_country_unique": "United Kingdom;Vietnam" }, { "id": "0Kj5mhn6sw", "title": "Gesture2Vec: Clustering Gestures using Representation Learning Methods for Co-speech Gesture Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Co-speech gestures are a principal component in conveying messages and enhancing interaction experiences between humans. Similarly, the co-speech gesture is a key ingredient in human-agent interaction including both virtual agents and robots. Existing machine learning approaches have yielded only marginal success in learning speech-to-motion at the frame level. Current methods generate repetitive gesture sequences that lack appropriateness with respect to the speech context. In this paper, we propose a Gesture2Vec model using representation learning methods to learn the relationship between semantic features and corresponding gestures. We propose a vector-quantized variational autoencoder structure as well as training techniques to learn a rigorous representation of gesture sequences. Furthermore, we use a machine translation model that takes input text and translates it into a discrete sequence of associated gesture chunks in the learned gesture space. Ultimately, we use translated quantized gestures from the input text as an input to the autoencoder\u2019s decoder to produce gesture sequences. The resulting gestures can be applied to both virtual agents and humanoid robots. Subjective and objective evaluations confirm the success of our approach in terms of appropriateness, human-likeness, and diversity. ", "keywords": "representation learning;gesture generation;vector quantization;machine translation", "primary_area": "", "supplementary_material": "/attachment/fdcc32cf23140ab32f7194158f20fccfad130514.zip", "author": "Payam Jome Yazdian;Mo Chen;Angelica Lim", "authorids": "~Payam_Jome_Yazdian1;~Mo_Chen1;angelica@sfu.ca", "gender": "M;M;", "homepage": ";http://www.sfu.ca/~mochen/;", "dblp": "206/8991;;", "google_scholar": "https://scholar.google.ca/citations?user=34A6QeoAAAAJ;https://scholar.google.ca/citations?user=19UAgLUAAAAJ;", "orcid": ";0000-0001-8506-3665;", "linkedin": "payam-jome-yazdian/;;", "or_profile": "~Payam_Jome_Yazdian1;~Mo_Chen1;angelica@sfu.ca", "aff": "Simon Fraser University;Simon Fraser University;", "aff_domain": "sfu.ca;sfu.ca;", "position": "PhD student;Assistant Professor;", "bibtex": "@misc{\nyazdian2022gesturevec,\ntitle={Gesture2Vec: Clustering Gestures using Representation Learning Methods for Co-speech Gesture Generation},\nauthor={Payam Jome Yazdian and Mo Chen and Angelica Lim},\nyear={2022},\nurl={https://openreview.net/forum?id=0Kj5mhn6sw}\n}", "github": "", "project": "", "reviewers": "mN3D;aN3z;RQVL", "site": "https://openreview.net/forum?id=0Kj5mhn6sw", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;4;5", "correctness": "2;3;3", "technical_novelty": "2;1;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "83;83;114", "wc_summary_review": "78;8;78", "wc_main_review": "1145;423;436", "wc_review": "1306;514;628", "wc_reply_reviewers": "0;43;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;1;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 93.33333333333333, 14.613540144521982 ], 
"wc_summary_review_avg": [ 54.666666666666664, 32.99831645537221 ], "wc_main_review_avg": [ 668.0, 337.3316864254923 ], "wc_review_avg": [ 816.0, 349.59405029262155 ], "wc_reply_reviewers_avg": [ 14.333333333333334, 20.27039439401436 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.18898223650461357, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11095467091697502908&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Simon Fraser University", "aff_unique_dep": "", "aff_unique_url": "https://www.sfu.ca", "aff_unique_abbr": "SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "0LHZ4UXEPOy", "title": "Generative Kernel Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Kernel continual learning by Derakhshani et al. (2021) has recently emerged as a strong continual learner due to its non-parametric ability to tackle task interference and catastrophic forgetting. Unfortunately its success comes at the expense of an explicit memory to store samples from past tasks, which hampers scalability to continual learning settings with a large number of tasks. In this paper, we introduce generative kernel continual learning, which explores and exploits the synergies between generative models and kernels for continual learning. The generative model is able to produce representative samples for kernel learning, which removes the dependence on memory in kernel continual learning. Moreover, as we replay only on the generative model, we avoid task interference while being computationally more efficient compared to previous methods that need replay on the entire model. We further introduce a supervised contrastive regularization, which enables our model to generate even more discriminative samples for better kernel-based classification performance. We conduct extensive experiments on three widely-used continual learning benchmarks that demonstrate the abilities and benefits of our contributions. Most notably, on the challenging SplitCIFAR100 benchmark, with just a simple linear kernel we obtain the same accuracy as kernel continual learning with variational random features for one tenth of the memory, or a 10.1% accuracy gain for the same memory budget.", "keywords": "kernel continual learning;generative learning;catastrophic forgetting", "primary_area": "", "supplementary_material": "", "author": "Mohammad Mahdi Derakhshani;Xiantong Zhen;Ling Shao;Cees G. M. 
Snoek", "authorids": "~Mohammad_Mahdi_Derakhshani2;~Xiantong_Zhen1;~Ling_Shao1;~Cees_G._M._Snoek1", "gender": "M;M;M;M", "homepage": "https://mmderakhshani.github.io/;;;http://www.ceessnoek.info", "dblp": "220/5737;78/10651;;s/CeesSnoek", "google_scholar": "n7GnOJoAAAAJ;https://scholar.google.ca/citations?user=DnBb3e0AAAAJ;z84rLjoAAAAJ;https://scholar.google.nl/citations?user=0uKdbscAAAAJ", "orcid": "0000-0003-0307-8439;;;0000-0001-9092-1556", "linkedin": "mmderakhshani/;;;cgmsnoek/", "or_profile": "~Mohammad_Mahdi_Derakhshani2;~Xiantong_Zhen1;~Ling_Shao1;~Cees_Snoek1", "aff": "Samsung;Inception Institute of Artificial Intelligence;Terminus Group;University of Amsterdam", "aff_domain": "samsung.com;inceptioniai.org;terminusgroup.com;uva.nl", "position": "Intern;Senior Scientist;Chief Scientist;Full Professor", "bibtex": "@misc{\nderakhshani2022generative,\ntitle={Generative Kernel Continual Learning},\nauthor={Mohammad Mahdi Derakhshani and Xiantong Zhen and Ling Shao and Cees G. M. Snoek},\nyear={2022},\nurl={https://openreview.net/forum?id=0LHZ4UXEPOy}\n}", "github": "", "project": "", "reviewers": "6SH2;wx32;p6X2;mCiT;qPfB", "site": "https://openreview.net/forum?id=0LHZ4UXEPOy", "pdf_size": 0, "recommendation": "3;3;3;6;6", "confidence": "4;4;4;4;4", "correctness": "3;3;2;4;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "1;2;2;3;2", "wc_summary_paper": "45;160;63;79;146", "wc_summary_review": "47;187;44;12;129", "wc_main_review": "526;1347;865;203;888", "wc_review": "618;1694;972;294;1163", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 1.469693845669907 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 98.6, 45.91557469965937 ], "wc_summary_review_avg": [ 83.8, 64.47759300718351 ], "wc_main_review_avg": [ 765.8, 383.87883505085296 ], "wc_review_avg": [ 948.2, 477.4793817538093 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6454972243679027, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O_nYcxXSbHoJ:scholar.google.com/&scioq=Generative+Kernel+Continual+Learning&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Samsung;Inception Institute of Artificial Intelligence;Terminus Group;University of Amsterdam", "aff_unique_dep": "Samsung;;;", "aff_unique_url": "https://www.samsung.com;https://www.inceptioniai.org;;https://www.uva.nl", "aff_unique_abbr": "Samsung;;;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;3", "aff_country_unique": "South Korea;United Arab Emirates;;Netherlands" }, { "id": "0Mo_5PkLpwc", "title": "Robust Cross-Modal Semi-supervised Few Shot Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semi-supervised learning has been successfully applied to few-shot\nlearning (FSL) due to its capability of leveraging the information\nof limited labeled data and massive unlabeled data. 
However, in many\nrealistic applications, the query and support sets provided for FSL\nare potentially noisy or unreadable where the noise exists in both\ncorrupted labels and outliers. Motivated by that, we propose to\nemploy a robust cross-modal semi-supervised few-shot learning\n(RCFSL) based on Bayesian deep learning. By placing the uncertainty\nprior on top of the parameters of infinite Gaussian mixture model\nfor noisy input, multi-modality information from image and text data\nare integrated into a robust heterogenous variational autoencoder.\nSubsequently, a robust divergence measure is employed to further\nenhance the robustness, where a novel variational lower bound is\nderived and optimized to infer the network parameters. Finally, a robust semi-supervised\ngenerative adversarial network is employed to generate robust\nfeatures to compensate data sparsity in few shot learning and a\njoint optimization is applied for training and inference. Our\napproach is more parameter-efficient, scalable and adaptable\ncompared to previous approaches. Superior performances over the\nstate-of-the-art on multiple benchmark multi-modal dataset are\ndemonstrated given the complicated noise for semi-supervised\nfew-shot learning.", "keywords": "few-shot learning;noisy labels;variational inference;cross-modality;uncertainty;robustness", "primary_area": "", "supplementary_material": "", "author": "Xu Chen", "authorids": "~Xu_Chen16", "gender": "M", "homepage": "https://scholar.google.com/citations?user=-lm4aPcAAAAJ&hl=en", "dblp": "83/6331-19.html", "google_scholar": "-lm4aPcAAAAJ", "orcid": "", "linkedin": "xuchen1/", "or_profile": "~Xu_Chen16", "aff": "Zoom", "aff_domain": "zoom.com", "position": "Principal Researcher", "bibtex": "@misc{\nchen2022robust,\ntitle={Robust Cross-Modal Semi-supervised Few Shot Learning},\nauthor={Xu Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=0Mo_5PkLpwc}\n}", "github": "", "project": "", "reviewers": "SEhY;JD5c;RfHj", "site": "https://openreview.net/forum?id=0Mo_5PkLpwc", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "2;2;3", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "32;73;121", "wc_summary_review": "23;13;108", "wc_main_review": "230;202;236", "wc_review": "285;288;465", "wc_reply_reviewers": "53;0;94", "wc_reply_authors": "1799;1166;1745", "reply_reviewers": "1;0;2", "reply_authors": "3;2;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 75.33333333333333, 36.37153954521157 ], "wc_summary_review_avg": [ 48.0, 42.62237284181474 ], "wc_main_review_avg": [ 222.66666666666666, 14.817407180595247 ], "wc_review_avg": [ 346.0, 84.15461959987698 ], "wc_reply_reviewers_avg": [ 49.0, 38.47943173523573 ], "wc_reply_authors_avg": [ 1570.0, 286.52050537439726 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.944911182523068, "corr_recommendation_correctness": 0.944911182523068, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:o8uZMfssbRsJ:scholar.google.com/&scioq=Robust+Cross-Modal+Semi-supervised+Few+Shot+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Zoom Video Communications Inc.", "aff_unique_dep": "", "aff_unique_url": "https://zoom.us", "aff_unique_abbr": "Zoom", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "0Q6BzWbvg0P", "title": "Less is More: Dimension Reduction Finds On-Manifold Adversarial Examples in Hard-Label Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Designing deep networks robust to adversarial examples remains an open problem. Likewise, recent zeroth-order hard-label attacks on image classification models have shown comparable performance to their first-order, gradient-level alternatives. It was recently shown in the gradient-level setting that regular adversarial examples leave the data manifold, while their on-manifold counterparts are in fact generalization errors. In this paper, we argue that query efficiency in the zeroth-order setting is connected to an adversary's traversal through the data manifold. To explain this behavior, we propose an information-theoretic argument based on a noisy manifold distance oracle, which leaks manifold information through the adversary's gradient estimate. Through numerical experiments of manifold-gradient mutual information, we show this behavior acts as a function of the effective problem dimensionality. On high-dimensional real-world datasets and multiple zeroth-order attacks using dimension reduction, we observe the same behavior to produce samples closer to the data manifold. This can result in up to 4x decrease in the manifold distance measure, regardless of the model robustness. Our results suggest that taking the manifold-gradient mutual information into account can thus inform better robust model design in the future, and avoid leakage of the sensitive data manifold information.", "keywords": "hard-label attacks;adversarial machine learning;generalization", "primary_area": "", "supplementary_material": "/attachment/37aa6e3b446a0ee07393ad48efe682f3b7140e0f.zip", "author": "Washington Garcia;Pin-Yu Chen;Somesh Jha;Hamilton Scott Clouse;Kevin R. B. Butler", "authorids": "~Washington_Garcia1;~Pin-Yu_Chen1;~Somesh_Jha1;~Hamilton_Scott_Clouse1;~Kevin_R._B._Butler1", "gender": "M;M;M;;M", "homepage": ";http://www.pinyuchen.com;;;https://www.cise.ufl.edu/~butler/", "dblp": "192/6857;39/8969;j/SomeshJha;158/9558;82/1935", "google_scholar": "https://scholar.google.com/citations?hl=en;jxwlCUUAAAAJ;BaI7l8QAAAAJ;Ry1l_7gAAAAJ;gvo0nMsAAAAJ", "orcid": ";0000-0003-1039-8369;;0000-0002-5819-2282;", "linkedin": ";pin-yu-chen-940062a2;;scott-clouse-74294210/;", "or_profile": "~Washington_Garcia1;~Pin-Yu_Chen1;~Somesh_Jha1;~Hamilton_Scott_Clouse1;~Kevin_R._B._Butler1", "aff": "University of Florida;International Business Machines;Department of Computer Science, University of Wisconsin, Madison;Air Force Research Laboratory;University of Florida", "aff_domain": "ufl.edu;ibm.com;cs.wisc.edu;us.af.mil;ufl.edu", "position": "PhD student;Research Staff Member;Full Professor;Principal Researcher;Associate Professor", "bibtex": "@misc{\ngarcia2022less,\ntitle={Less is More: Dimension Reduction Finds On-Manifold Adversarial Examples in Hard-Label Attacks},\nauthor={Washington Garcia and Pin-Yu Chen and Somesh Jha and Hamilton Scott Clouse and Kevin R. B. 
Butler},\nyear={2022},\nurl={https://openreview.net/forum?id=0Q6BzWbvg0P}\n}", "github": "", "project": "", "reviewers": "cq6A;iTBK;Wy2g;tKPt", "site": "https://openreview.net/forum?id=0Q6BzWbvg0P", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;2;3;2", "correctness": "1;3;2;3", "technical_novelty": "1;3;3;2", "empirical_novelty": "1;0;3;3", "wc_summary_paper": "41;37;63;100", "wc_summary_review": "204;80;8;117", "wc_main_review": "328;400;154;447", "wc_review": "573;517;225;664", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1139;804;121;295", "reply_reviewers": "0;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 60.25, 24.993749218554626 ], "wc_summary_review_avg": [ 102.25, 70.62002194845311 ], "wc_main_review_avg": [ 332.25, 111.29774256470793 ], "wc_review_avg": [ 494.75, 164.33863666222865 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 589.75, 404.40782324282503 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4337064150686390220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Florida;International Business Machines Corporation;University of Wisconsin-Madison;Air Force Research Laboratory", "aff_unique_dep": ";;Department of Computer Science;", "aff_unique_url": "https://www.ufl.edu;https://www.ibm.com;https://www.wisc.edu;https://www.afrl.af.mil/", "aff_unique_abbr": "UF;IBM;UW-Madison;AFRL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Towards a Unified View of Parameter-Efficient Transfer Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6524", "id": "0RDcd5Axok", "poster": "", "openreview": "https://openreview.net/forum?id=0RDcd5Axok", "slides": "https://iclr.cc/virtual/2022/poster/6524", "video": "https://iclr.cc/virtual/2022/poster/6524", "author_site": "Junxian He, Chunting Zhou, Xuezhe Ma, Taylor Berg-Kirkpatrick, Graham Neubig", "tldr": "", "abstract": "Fine-tuning large pretrained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the parameters of the pretrained model, which becomes prohibitive as the model size and the number of tasks grow. Recent work has proposed a variety of parameter-efficient transfer learning methods that only fine-tune a small number of (extra) parameters to attain strong performance. While effective, the critical ingredients for success and the connections among the various methods are poorly understood. In this paper, we break down the design of state-of-the-art parameter-efficient transfer learning methods and present a unified framework that establishes connections between them. 
Specifically, we re-frame them as modifications to specific hidden states in pretrained models, and define a set of design dimensions along which different methods vary, such as the function to compute the modification and the position to apply the modification. Through comprehensive empirical studies across machine translation, text summarization, language understanding, and text classification benchmarks, we utilize the unified view to identify important design choices in previous methods. Furthermore, our unified framework enables the transfer of design elements across different approaches, and as a result we are able to instantiate new parameter-efficient fine-tuning methods that tune fewer parameters than previous methods while being more effective, achieving comparable results to fine-tuning all parameters on all four tasks.", "keywords": "parameter-efficient transfer learning;unified view;natural language processing", "primary_area": "", "supplementary_material": "/attachment/3c1a22f013c8c48fb14354550635473a2a78399f.zip", "author": "Junxian He;Chunting Zhou;Xuezhe Ma;Taylor Berg-Kirkpatrick;Graham Neubig", "authorids": "~Junxian_He1;~Chunting_Zhou1;~Xuezhe_Ma1;~Taylor_Berg-Kirkpatrick1;~Graham_Neubig1", "gender": "M;F;M;M;M", "homepage": "https://jxhe.github.io;https://violet-zct.github.io/;https://xuezhemax.github.io/;https://cseweb.ucsd.edu/~tberg/;http://phontron.com", "dblp": "188/6127.html;161/2679;127/0230;22/8160;03/8155", "google_scholar": "BIFGeoUAAAAJ;mR5W7EgAAAAJ;6_MQLIcAAAAJ;mN6_BKAAAAAJ;wlosgkoAAAAJ", "orcid": ";;;;", "linkedin": ";;xuezhe-ma-b5354731;;", "or_profile": "~Junxian_He1;~Chunting_Zhou1;~Xuezhe_Ma1;~Taylor_Berg-Kirkpatrick1;~Graham_Neubig1", "aff": "Carnegie Mellon University;Language Technologies Institute, Carnegie Mellon University;USC/ISI;University of California, San Diego;Carnegie Mellon University", "aff_domain": "cmu.edu;cs.cmu.edu;isi.edu;ucsd.edu;cmu.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nhe2022towards,\ntitle={Towards a Unified View of Parameter-Efficient Transfer Learning},\nauthor={Junxian He and Chunting Zhou and Xuezhe Ma and Taylor Berg-Kirkpatrick and Graham Neubig},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0RDcd5Axok}\n}", "github": "", "project": "", "reviewers": "eByL;EtWw;JZ1E", "pdf_size": 0, "recommendation": "8;8;10", "confidence": "4;3;5", "correctness": "3;4;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;3;4", "wc_summary_paper": "160;150;94", "wc_summary_review": "53;47;24", "wc_main_review": "662;168;85", "wc_review": "875;365;203", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1186;153;8", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 8.666666666666666, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 134.66666666666666, 29.044027881055953 ], "wc_summary_review_avg": [ 41.333333333333336, 12.498888839501783 ], "wc_main_review_avg": [ 305.0, 254.7011320482629 ], "wc_review_avg": [ 481.0, 286.34245231889736 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 449.0, 524.4889576212894 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333,
0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 1066, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5204198989920297993&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=0RDcd5Axok", "email": "cmu.edu;cs.cmu.edu;isi.edu;ucsd.edu;cmu.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;University of Southern California;University of California, San Diego", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://isi.usc.edu;https://www.ucsd.edu", "aff_unique_abbr": "CMU;USC;UCSD", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Pittsburgh;ISI;San Diego", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "W-CTC: a Connectionist Temporal Classification Loss with Wild Cards", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6093", "id": "0RqDp8FCW5Z", "poster": "", "openreview": "https://openreview.net/forum?id=0RqDp8FCW5Z", "slides": "https://iclr.cc/virtual/2022/poster/6093", "video": "https://iclr.cc/virtual/2022/poster/6093", "author_site": "Xingyu Cai, Jiahong Yuan, Yuchen Bian, Guangxu Xun, Jiaji Huang, Kenneth Church", "tldr": "", "abstract": "Connectionist Temporal Classification (CTC) loss is commonly used in sequence learning applications. For example, in Automatic Speech Recognition (ASR) task, the training data consists of pairs of audio (input sequence) and text (output label),without temporal alignment information. Standard CTC computes a loss by aggregating over all possible alignment paths, that map the entire sequence to the entire label (full alignment). However, in practice, there are often cases where the label is incomplete. Specifically, we solve the partial alignment problem where the label only matches a middle part of the sequence. This paper proposes the wild-card CTC (W-CTC) to address this issue, by padding wild-cards at both ends of the labels. Consequently, the proposed W-CTC improves the standard CTC via aggregating over even more alignment paths. Evaluations on a number of tasks in speech and vision domains, show that the proposed W-CTC consistently outperforms the standard CTC by a large margin when label is incomplete. 
The effectiveness of the proposed method is further confirmed in an ablation study.", "keywords": "CTC;wild cards;dynamic programing;partial alignment", "primary_area": "", "supplementary_material": "/attachment/741e32015e59043afbe4bf9a9f3aaac23463ab63.zip", "author": "Xingyu Cai;Jiahong Yuan;Yuchen Bian;Guangxu Xun;Jiaji Huang;Kenneth Church", "authorids": "~Xingyu_Cai1;~Jiahong_Yuan1;~Yuchen_Bian1;~Guangxu_Xun1;~Jiaji_Huang1;~Kenneth_Church1", "gender": "M;M;M;;;M", "homepage": ";;https://yuchenbian.github.io/;;https://jiaji-huang.github.io/;https://kwchurch.github.io/", "dblp": "174/2232;78/3632;187/4068;127/0253;;c/KennethWardChurch", "google_scholar": "9DSi4YAAAAAJ;jbbXJOkAAAAJ;gU0icBEAAAAJ;HhyfdQYAAAAJ;zgqtvYUAAAAJ;E6aqGvYAAAAJ", "orcid": ";;0000-0002-0685-3771;0000-0002-7657-4305;;0000-0001-8378-6069", "linkedin": ";;yuchenbian/;;;ken-church-a902772/", "or_profile": "~Xingyu_Cai1;~Jiahong_Yuan1;~Yuchen_Bian1;~Guangxu_Xun1;~Jiaji_Huang1;~Kenneth_Ward_Church1", "aff": "Baidu;Baidu Research Institute, USA;Baidu Research USA;Baidu;Baidu;Baidu", "aff_domain": "baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com", "position": "Researcher;Principal Researcher;Researcher;Researcher;Research Scientist;Fellow", "bibtex": "@inproceedings{\ncai2022wctc,\ntitle={W-{CTC}: a Connectionist Temporal Classification Loss with Wild Cards},\nauthor={Xingyu Cai and Jiahong Yuan and Yuchen Bian and Guangxu Xun and Jiaji Huang and Kenneth Church},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0RqDp8FCW5Z}\n}", "github": "", "project": "", "reviewers": "K22X;pcpu;VmYt;sfbX", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "5;4;4;4", "correctness": "4;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "140;157;86;270", "wc_summary_review": "60;76;59;52", "wc_main_review": "168;393;632;174", "wc_review": "368;626;777;496", "wc_reply_reviewers": "136;564;0;0", "wc_reply_authors": "505;1263;1414;769", "reply_reviewers": "1;3;0;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 163.25, 66.97527528872128 ], "wc_summary_review_avg": [ 61.75, 8.78564169540279 ], "wc_main_review_avg": [ 341.75, 190.52608089182962 ], "wc_review_avg": [ 566.75, 151.84099413531249 ], "wc_reply_reviewers_avg": [ 175.0, 231.3503836175769 ], "wc_reply_authors_avg": [ 987.75, 366.86194610507096 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5081917496829121749&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=0RqDp8FCW5Z", "email": "baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Baidu", "aff_unique_dep": "Baidu, Inc.", "aff_unique_url": "https://www.baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "1", "aff_campus_unique": ";USA", "aff_country_unique_index": "0;1;1;0;0;0", "aff_country_unique": "China;United States" }, { "id": "0SiVrAfIxOe", "title": "Closed-Loop Control 
of Additive Manufacturing via Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Additive manufacturing suffers from imperfections in hardware control and material consistency. As a result, the deposition of a large range of materials requires on-the-fly adjustment of process parameters. Unfortunately, learning the in-process control is challenging. The deposition parameters are complex and highly coupled, artifacts occur after long time horizons, available simulators lack predictive power, and learning on hardware is intractable. In this work, we demonstrate the feasibility of learning a closed-loop control policy for additive manufacturing. To achieve this goal, we assume that the perception of a deposition device is limited and can capture the process only qualitatively. We leverage this assumption to formulate an efficient numerical model that explicitly includes printing imperfections. We further show that in combination with reinforcement learning, our model can be used to discover control policies that outperform state-of-the-art controllers. Furthermore, the recovered policies have a minimal sim-to-real gap. We showcase this by implementing a first-of-its-kind self-correcting printer.", "keywords": "additive manufacturing;closed-loop;reinforcement learning;in-process", "primary_area": "", "supplementary_material": "", "author": "Michal Piovarci;Michael Foshey;Timothy Erps;Jie Xu;Vahid Babaei;Piotr Didyk;Wojciech Matusik;Szymon Rusinkiewicz;Bernd Bickel", "authorids": "~Michal_Piovarci1;~Michael_Foshey1;~Timothy_Erps1;~Jie_Xu7;~Vahid_Babaei1;~Piotr_Didyk1;~Wojciech_Matusik2;~Szymon_Rusinkiewicz2;~Bernd_Bickel1", "gender": "M;;;M;;M;M;;M", "homepage": "https://misop.github.io/;;;https://people.csail.mit.edu/jiex;http://cam.mpi-inf.mpg.de/?view=people_vahid;https://www.pdf.inf.usi.ch/people/piotr/;https://cdfg.mit.edu/wojciech;https://www.cs.princeton.edu/~smr/;http://berndbickel.com/", "dblp": "149/4057.html;;;37/5126-28;124/2206;55/1484;;61/5465.html;", "google_scholar": "8G5ocVIAAAAJ;;https://scholar.google.com/scholar?hl=en;3Tj5lWEAAAAJ;;kH5VxAIAAAAJ;https://scholar.google.com/citations?hl=en;RaScARwAAAAJ;https://scholar.google.com.tw/citations?user=Bt-QKXYAAAAJ", "orcid": "0000-0002-5062-4474;;;;;0000-0003-0768-8939;0000-0003-0212-5643;0000-0002-4253-2588;0000-0001-6511-9385", "linkedin": ";michael-foshey/;;;;;wojciech-matusik-67238126/;;", "or_profile": "~Michal_Piovarci1;~Michael_Foshey1;~Timothy_Erps1;~Jie_Xu7;~Vahid_Babaei1;~Piotr_Didyk1;~Wojciech_Matusik2;~Szymon_Rusinkiewicz2;~Bernd_Bickel1", "aff": "Institute of Science and Technology Austria;Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Saarland Informatics Campus, Max-Planck Institute;Universit\u00e0 della Svizzera italiana;Massachusetts Institute of Technology;Princeton University;Institute of Science and Technology Austria", "aff_domain": "ist.ac.at;mit.edu;;mit.edu;mpi-inf.mpg.de;usi.ch;mit.edu;princeton.edu;ist.ac.at", "position": "Postdoc;Researcher;;PhD student;Researcher;Assistant Professor;Full Professor;Professor;Professor", "bibtex": "@misc{\npiovarci2022closedloop,\ntitle={Closed-Loop Control of Additive Manufacturing via Reinforcement Learning},\nauthor={Michal Piovarci and Michael Foshey and Timothy Erps and Jie Xu and Vahid Babaei and Piotr Didyk and Wojciech Matusik and Szymon Rusinkiewicz and Bernd Bickel},\nyear={2022},\nurl={https://openreview.net/forum?id=0SiVrAfIxOe}\n}", "github": "", "project": "", "reviewers": "WUp8;3pgj;wpL1", 
"site": "https://openreview.net/forum?id=0SiVrAfIxOe", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "3;3;4", "correctness": "3;4;2", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "97;52;244", "wc_summary_review": "49;61;77", "wc_main_review": "426;205;2081", "wc_review": "572;318;2402", "wc_reply_reviewers": "69;74;291", "wc_reply_authors": "943;436;1961", "reply_reviewers": "1;1;1", "reply_authors": "2;1;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 131.0, 81.98780397107853 ], "wc_summary_review_avg": [ 62.333333333333336, 11.469767022723502 ], "wc_main_review_avg": [ 904.0, 837.1407687280955 ], "wc_review_avg": [ 1097.3333333333333, 928.3481147835773 ], "wc_reply_reviewers_avg": [ 144.66666666666666, 103.493424375121 ], "wc_reply_authors_avg": [ 1113.3333333333333, 634.1221403553805 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10821596069241058638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;3;1;4;0", "aff_unique_norm": "Institute of Science and Technology Austria;Massachusetts Institute of Technology;Max-Planck Institute;Universit\u00e0 della Svizzera italiana;Princeton University", "aff_unique_dep": ";;Informatics;;", "aff_unique_url": "https://www.ist.ac.at;https://web.mit.edu;https://www.mpi-sws.org;https://www.usi.ch;https://www.princeton.edu", "aff_unique_abbr": "IST Austria;MIT;MPI-SWS;USI;Princeton", "aff_campus_unique_index": "1", "aff_campus_unique": ";Saarland", "aff_country_unique_index": "0;1;1;2;3;1;1;0", "aff_country_unique": "Austria;United States;Germany;Switzerland" }, { "id": "0Tnl8uBHfQw", "title": "Deep Classifiers with Label Noise Modeling and Distance Awareness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Uncertainty estimation in deep learning has recently emerged as a crucial area of interest to advance reliability and robustness in safety-critical applications. While there have been many proposed methods that either focus on distance-aware model uncertainties for out-of-distribution detection or on input-dependent label uncertainties for in-distribution calibration, both of these types of uncertainty are often necessary. In this work, we propose the HetSNGP method for jointly modeling the model and data uncertainty. We show that our proposed model affords a favorable combination between these two complementary types of uncertainty and thus outperforms the baseline methods on some challenging out-of-distribution datasets, including CIFAR-100C, Imagenet-C, and Imagenet-A. 
Moreover, we propose HetSNGP Ensemble, an ensembled version of our method which adds an additional type of uncertainty and also outperforms other ensemble baselines.", "keywords": "Deep learning;uncertainty estimation;out-of-distribution detection", "primary_area": "", "supplementary_material": "/attachment/fd3d36242fea7e0f8f86da03197537c7a77f85e5.zip", "author": "Vincent Fortuin;Mark Collier;Florian Wenzel;James Urquhart Allingham;Jeremiah Zhe Liu;Dustin Tran;Balaji Lakshminarayanan;Jesse Berent;Rodolphe Jenatton;Effrosyni Kokiopoulou", "authorids": "~Vincent_Fortuin1;~Mark_Collier1;~Florian_Wenzel1;~James_Urquhart_Allingham1;~Jeremiah_Zhe_Liu1;~Dustin_Tran1;~Balaji_Lakshminarayanan1;jberent@google.com;~Rodolphe_Jenatton3;~Effrosyni_Kokiopoulou1", "gender": "M;M;M;M;M;;M;;M;F", "homepage": "https://fortuin.github.io/;;;https://jamesallingham.com;;http://dustintran.com;http://www.gatsby.ucl.ac.uk/~balaji/;;http://rodolphejenatton.com/;", "dblp": "218/7489;;04/9709;;199/2301;;71/8324;;68/8398;05/960", "google_scholar": "https://scholar.google.ch/citations?user=XBlrYTIAAAAJ;U4rBrcgAAAAJ;;CIp9adkAAAAJ;9jrmcG4AAAAJ;wVazIm8AAAAJ;QYn8RbgAAAAJ;;QIR6rygAAAAJ;9om-fCsAAAAJ", "orcid": "0000-0002-0640-2671;;;;;;;;;", "linkedin": "vincent-fortuin-42426b134/;mark-collier-aa446032/;;;;;;;;", "or_profile": "~Vincent_Fortuin1;~Mark_Collier1;~Florian_Wenzel1;~James_Urquhart_Allingham1;~Jeremiah_Zhe_Liu1;~Dustin_Tran1;~Balaji_Lakshminarayanan1;jberent@google.com;~Rodolphe_Jenatton3;~Effrosyni_Kokiopoulou1", "aff": "University of Cambridge;Google;Amazon;University of Amsterdam;Google DeepMind;Google;Google Brain;;Google;Google DeepMind", "aff_domain": "cam.ac.uk;google.com;amazon.com;uva.nl;google.com;google.com;google.com;;google.com;google.com", "position": "Researcher;Researcher;Researcher;Researcher;Research Scientist;Research Scientist;Research Scientist;;Senior research scientist;Researcher", "bibtex": "@misc{\nfortuin2022deep,\ntitle={Deep Classifiers with Label Noise Modeling and Distance Awareness},\nauthor={Vincent Fortuin and Mark Collier and Florian Wenzel and James Urquhart Allingham and Jeremiah Zhe Liu and Dustin Tran and Balaji Lakshminarayanan and Jesse Berent and Rodolphe Jenatton and Effrosyni Kokiopoulou},\nyear={2022},\nurl={https://openreview.net/forum?id=0Tnl8uBHfQw}\n}", "github": "", "project": "", "reviewers": "m1FZ;tqBj;wQfZ;ru45", "site": "https://openreview.net/forum?id=0Tnl8uBHfQw", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "33;48;55;59", "wc_summary_review": "19;15;34;21", "wc_main_review": "259;110;290;107", "wc_review": "311;173;379;187", "wc_reply_reviewers": "218;48;40;0", "wc_reply_authors": "895;710;444;475", "reply_reviewers": "1;1;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 48.75, 9.908960591303208 ], "wc_summary_review_avg": [ 22.25, 7.119515432949071 ], "wc_main_review_avg": [ 191.5, 83.72723571216238 ], "wc_review_avg": [ 262.5, 86.0740959871203 ], "wc_reply_reviewers_avg": [ 76.5, 83.694384518915 ], "wc_reply_authors_avg": [ 631.0, 183.87631712648587 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 10, 0 ], 
"corr_recommendation_confidence": 0.8164965809277259, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6179253725373647958&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3;1;1;1;1;1", "aff_unique_norm": "University of Cambridge;Google;Amazon;University of Amsterdam", "aff_unique_dep": ";Google;Amazon.com, Inc.;", "aff_unique_url": "https://www.cam.ac.uk;https://www.google.com;https://www.amazon.com;https://www.uva.nl", "aff_unique_abbr": "Cambridge;Google;Amazon;UvA", "aff_campus_unique_index": "0;1;1;1;1", "aff_campus_unique": "Cambridge;Mountain View;", "aff_country_unique_index": "0;1;1;2;0;1;1;1;0", "aff_country_unique": "United Kingdom;United States;Netherlands" }, { "id": "0U0C2pXfTZl", "title": "SLASH: Embracing Probabilistic Circuits into Neural Answer Set Programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "The goal of combining the robustness of neural networks and the expressivity of symbolic methods has rekindled the interest in Neuro-Symbolic AI. Recent advancements in Neuro-Symbolic AI often consider specifically-tailored architectures consisting of disjoint neural and symbolic components, and thus do not exhibit desired gains that can be achieved by integrating them into a unifying framework. We introduce SLASH -- a novel deep probabilistic programming language (DPPL). At its core, SLASH consists of Neural-Probabilistic Predicates (NPPs) and logical programs which are united via answer set programming. The probability estimates resulting from NPPs act as the binding element between the logical program and raw input data, thereby allowing SLASH to answer task-dependent logical queries. This allows SLASH to elegantly integrate the symbolic and neural components in a unified framework. 
We evaluate SLASH on the benchmark data of MNIST addition as well as novel tasks for DPPLs such as missing data prediction and set prediction with state-of-the-art performance, thereby showing the effectiveness and generality of our method.", "keywords": "Deep Probabilistic Programming Languages;Probabilistic Circuits;Neuro-Symbolic Computations", "primary_area": "", "supplementary_material": "/attachment/c82dc03da02805a2eec34b4e792cc1b475cbebc4.zip", "author": "Arseny Skryagin;Wolfgang Stammer;Daniel Ochs;Devendra Singh Dhami;Kristian Kersting", "authorids": "~Arseny_Skryagin1;~Wolfgang_Stammer1;~Daniel_Ochs1;~Devendra_Singh_Dhami1;~Kristian_Kersting1", "gender": "M;M;M;M;M", "homepage": "https://ml-research.github.io/people/skryagin/;https://ml-research.github.io/people/wstammer/;https://d-ochs.github.io/;https://sites.google.com/view/devendradhami;http://www.ml.informatik.tu-darmstadt.de/", "dblp": ";256/5497;;201/2130;40/3793", "google_scholar": ";66-aU5AAAAAJ;hmL7GPQAAAAJ;aVlaHfkAAAAJ;QY-earAAAAAJ", "orcid": ";0000-0003-3793-8046;;;0000-0002-2873-9152", "linkedin": ";https://linkedin.com/in/wolfgang-stammer-7835a4207/en-us?trk=people-guest_people_search-card;daniel-ochs-086837b9/;;", "or_profile": "~Arseny_Skryagin1;~Wolfgang_Stammer1;~Daniel_Ochs1;~Devendra_Singh_Dhami1;~Kristian_Kersting1", "aff": "CS Department, TU Darmstadt, TU Darmstadt;CS Department, TU Darmstadt;CS Department, TU Darmstadt, Technische Universit\u00e4t Darmstadt;CS Department, TU Darmstadt, TU Darmstadt;TU Darmstadt", "aff_domain": "cs.tu-darmstadt.de;cs.tu-darmstadt.de;cs.tu-darmstadt.de;cs.tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;PhD student;PhD student;Postdoctoral researcher;Full Professor", "bibtex": "@misc{\nskryagin2022slash,\ntitle={{SLASH}: Embracing Probabilistic Circuits into Neural Answer Set Programming},\nauthor={Arseny Skryagin and Wolfgang Stammer and Daniel Ochs and Devendra Singh Dhami and Kristian Kersting},\nyear={2022},\nurl={https://openreview.net/forum?id=0U0C2pXfTZl}\n}", "github": "", "project": "", "reviewers": "bMeH;TutZ;qUza;8iux", "site": "https://openreview.net/forum?id=0U0C2pXfTZl", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;4;3;2", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "105;93;17;105", "wc_summary_review": "32;68;62;37", "wc_main_review": "827;412;307;100", "wc_review": "964;573;386;242", "wc_reply_reviewers": "259;291;0;0", "wc_reply_authors": "2786;1551;1418;224", "reply_reviewers": "3;2;0;0", "reply_authors": "7;5;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.0, 36.701498607005135 ], "wc_summary_review_avg": [ 49.75, 15.497983739828868 ], "wc_main_review_avg": [ 411.5, 264.8551490909701 ], "wc_review_avg": [ 541.25, 270.82224336268985 ], "wc_reply_reviewers_avg": [ 137.5, 137.9646693903914 ], "wc_reply_authors_avg": [ 1494.75, 907.0814117266432 ], "reply_reviewers_avg": [ 1.25, 1.299038105676658 ], "reply_authors_avg": [ 3.75, 2.384848003542364 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9922778767136676, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7571425051332941947&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Large-Scale Representation Learning on Graphs via Bootstrapping", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6390", "id": "0UXT6PpRpW", "poster": "", "openreview": "https://openreview.net/forum?id=0UXT6PpRpW", "slides": "https://iclr.cc/virtual/2022/poster/6390", "video": "https://iclr.cc/virtual/2022/poster/6390", "author_site": "Shantanu Thakoor, Corentin Tallec, Mohammad Gheshlaghi Azar, Mehdi Azabou, Eva Dyer, Remi Munos, Petar Veli\u010dkovi\u0107, Michal Valko", "tldr": "", "abstract": "Self-supervised learning provides a promising path towards eliminating the need for costly label information in representation learning on graphs. However, to achieve state-of-the-art performance, methods often need large numbers of negative examples and rely on complex augmentations. This can be prohibitively expensive, especially for large graphs. To address these challenges, we introduce Bootstrapped Graph Latents (BGRL) - a graph representation learning method that learns by predicting alternative augmentations of the input. BGRL uses only simple augmentations and alleviates the need for contrasting with negative examples, and thus is scalable by design. BGRL outperforms or matches prior methods on several established benchmarks, while achieving a 2-10x reduction in memory costs. Furthermore, we show that BGRL can be scaled up to extremely large graphs with hundreds of millions of nodes in the semi-supervised regime, achieving state-of-the-art performance and improving over supervised baselines where representations are shaped only through label information. 
In particular, our solution centered on BGRL constituted one of the winning entries to the Open Graph Benchmark Large-Scale Challenge at KDD Cup 2021, on a graph orders of magnitude larger than all previously available benchmarks, thus demonstrating the scalability and effectiveness of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/16a5c61c7ca86ff6547b499d5134965d271317fc.zip", "author": "Shantanu Thakoor;Corentin Tallec;Mohammad Gheshlaghi Azar;Mehdi Azabou;Eva L Dyer;Remi Munos;Petar Veli\u010dkovi\u0107;Michal Valko", "authorids": "~Shantanu_Thakoor5;~Corentin_Tallec2;~Mohammad_Gheshlaghi_Azar1;~Mehdi_Azabou2;~Eva_L_Dyer1;~Remi_Munos1;~Petar_Veli\u010dkovi\u01071;~Michal_Valko1", "gender": "M;M;M;M;M;M;F;M", "homepage": ";http://mgazar.net;https://www.mehai.dev;http://researchers.lille.inria.fr/~munos/;https://petar-v.com;https://misovalko.github.io/research.html;http://dyerlab.gatech.edu;", "dblp": ";;281/8371;69/6815;184/4786.html;03/5455;64/8509.html;218/7437", "google_scholar": "OPKX4GgLCxIC;;jXxyYCoAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=kcTK_FAAAAAJ;jrazNCQAAAAJ;Sb_jcHcAAAAJ;polyCecAAAAJ", "orcid": ";;;;0000-0002-2820-4692;;;", "linkedin": ";;;;petarvelickovic;michalvalko/;;", "or_profile": "~Corentin_Tallec2;~Mohammad_Gheshlaghi_Azar1;~Mehdi_Azabou2;~Remi_Munos1;~Petar_Veli\u010dkovi\u01071;~Michal_Valko1;~Eva_Dyer1;~Shantanu_Thakoor1", "aff": "Google DeepMind;Google DeepMind;Georgia Institute of Technology;Google DeepMind;Google DeepMind;Google DeepMind;Georgia Institute of Technology;Google", "aff_domain": "deepmind.com;google.com;gatech.edu;google.com;google.com;deepmind.com;gatech.edu;google.com", "position": "Research Scientist;Researcher;PhD student;Research scientist;Senior Staff Research Scientist;Senior Staff Research Scientist;Associate Professor;Research Engineer", "bibtex": "@inproceedings{\nthakoor2022largescale,\ntitle={Large-Scale Representation Learning on Graphs via Bootstrapping},\nauthor={Shantanu Thakoor and Corentin Tallec and Mohammad Gheshlaghi Azar and Mehdi Azabou and Eva L Dyer and Remi Munos and Petar Veli{\\v{c}}kovi{\\'c} and Michal Valko},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0UXT6PpRpW}\n}", "github": "", "project": "", "reviewers": "8P5B;8wKC;CbdK;GsyJ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;5;5;5", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "58;30;200;23", "wc_summary_review": "32;14;33;166", "wc_main_review": "139;256;206;54", "wc_review": "229;300;439;243", "wc_reply_reviewers": "0;0;64;36", "wc_reply_authors": "334;474;505;305", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.75, 71.78570540156306 ], "wc_summary_review_avg": [ 61.25, 60.94823623370901 ], "wc_main_review_avg": [ 163.75, 75.75082507801483 ], "wc_review_avg": [ 302.75, 83.03726573051404 ], "wc_reply_reviewers_avg": [ 25.0, 26.888659319497503 ], "wc_reply_authors_avg": [ 404.5, 86.31483070712703 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 8, 0 ],
"corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 324, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3168526433938319234&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=0UXT6PpRpW", "email": "deepmind.com;google.com;gatech.edu;google.com;google.com;deepmind.com;gatech.edu;google.com", "author_num": 8, "aff_unique_index": "0;0;1;0;0;0;1;0", "aff_unique_norm": "Google;Georgia Institute of Technology", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.gatech.edu", "aff_unique_abbr": "DeepMind;Georgia Tech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0;0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "0VezzBzLmBr", "title": "Online Tuning for Offline Decentralized Multi-Agent Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning could learn effective policies from a fixed dataset, which is promising in real-world applications. However, in offline decentralized multi-agent reinforcement learning, due to the discrepancy between the behavior policy and learned policy, the transition dynamics in offline experiences do not accord with the transition dynamics in online execution, which creates severe errors in value estimates, leading to uncoordinated and suboptimal policies. One way to overcome the transition bias is to bridge offline training and online tuning. However, considering both deployment efficiency and sample efficiency, we could only collect very limited online experiences, making it insufficient to use merely online data for updating the agent policy. To utilize both offline and online experiences to tune the policies of agents, we introduce online transition correction (OTC) to implicitly correct the biased transition dynamics by modifying sampling probabilities. We design two types of distances, i.e., embedding-based and value-based distance, to measure the similarity between transitions, and further propose an adaptive rank-based prioritization to sample transitions according to the transition similarity. OTC is simple yet effective to increase data efficiency and improve agent policies in online tuning. 
Empirically, we show that OTC outperforms baselines in a variety of tasks.", "keywords": "multi-agent reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Jiechuan Jiang;Zongqing Lu", "authorids": "~Jiechuan_Jiang1;~Zongqing_Lu2", "gender": ";", "homepage": ";", "dblp": "220/4026;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Jiechuan_Jiang1;~Zongqing_Lu2", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\njiang2022online,\ntitle={Online Tuning for Offline Decentralized Multi-Agent Reinforcement Learning},\nauthor={Jiechuan Jiang and Zongqing Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=0VezzBzLmBr}\n}", "github": "", "project": "", "reviewers": "ZgL6;2xuS;a2PP;T8NR", "site": "https://openreview.net/forum?id=0VezzBzLmBr", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;4;4", "correctness": "3;3;3;2", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "48;57;41;95", "wc_summary_review": "96;56;40;31", "wc_main_review": "139;110;285;210", "wc_review": "283;223;366;336", "wc_reply_reviewers": "0;0;76;0", "wc_reply_authors": "269;155;800;223", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 60.25, 20.849160654568326 ], "wc_summary_review_avg": [ 55.75, 24.903564001965663 ], "wc_main_review_avg": [ 186.0, 67.75322870535396 ], "wc_review_avg": [ 302.0, 54.43803817185186 ], "wc_reply_reviewers_avg": [ 19.0, 32.90896534380867 ], "wc_reply_authors_avg": [ 361.75, 256.2531707120909 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9118701908262332158&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "0WHn7Dj52cS", "title": "Vibration-based Uncertainty Estimation for Learning from Limited Supervision", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We investigate the problem of estimating uncertainty for training data, so that deep neural networks can make use of the results for learning from limited supervision. However, neither the prediction probability nor the entropy can accurately capture the uncertainty of out-of-distribution data. In this paper, we present a novel approach that measures the uncertainty from the vibration of sequential data, \\textit{e.g.}, the output probability during the training procedure. The key observation is that, a training sample that suffers heavier vibration often offers richer information when it is manually labeled. We make use of the Fourier Transformation to measure the extent of vibration, deriving a powerful tool that can be used for semi-supervised, active learning, and one-bit supervision. 
Experiments on the CIFAR10, CIFAR100, and mini-ImageNet datasets validate the effectiveness of our approach.", "keywords": "uncertainty estimation;semi-supervised learning;active learning", "primary_area": "", "supplementary_material": "", "author": "Hengtong Hu;Lingxi Xie;Yinquan Wang;Richang Hong;Meng Wang;Qi Tian", "authorids": "~Hengtong_Hu1;~Lingxi_Xie1;~Yinquan_Wang1;~Richang_Hong1;~Meng_Wang2;~Qi_Tian3", "gender": "M;M;M;M;;M", "homepage": ";http://lingxixie.com/;https://www.researchgate.net/profile/Yinquan_Wang4;https://sites.google.com/site/homeofrichanghong/;;https://www.qitian1987.com/index.html", "dblp": "232/0173;123/2869;;59/1501;;78/1467-1.html", "google_scholar": "tF5tWsMAAAAJ;EEMm7hwAAAAJ;;https://scholar.google.com/scholar?hl=en;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;0000-0002-7252-5047", "linkedin": ";;;;;", "or_profile": "~Hengtong_Hu1;~Lingxi_Xie1;~Yinquan_Wang1;~Richang_Hong1;~Meng_Wang2;~Qi_Tian3", "aff": "Hefei University of Technology;Huawei Technologies Ltd.;Academy of Mathematics and Systems Science, Chinese Academy of Sciences;Hefei University of Technology;;Huawei Technologies Ltd.", "aff_domain": "hfut.edu;huawei.com;amss.ac.cn;hfut.edu;;huawei.com", "position": "PhD student;Researcher;PhD student;Full Professor;;Principal Researcher", "bibtex": "@misc{\nhu2022vibrationbased,\ntitle={Vibration-based Uncertainty Estimation for Learning from Limited Supervision},\nauthor={Hengtong Hu and Lingxi Xie and Yinquan Wang and Richang Hong and Meng Wang and Qi Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=0WHn7Dj52cS}\n}", "github": "", "project": "", "reviewers": "Ftt8;bvuB;SVEK", "site": "https://openreview.net/forum?id=0WHn7Dj52cS", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;3", "correctness": "2;2;2", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;4", "wc_summary_paper": "116;80;90", "wc_summary_review": "35;21;99", "wc_main_review": "1030;328;372", "wc_review": "1181;429;561", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 95.33333333333333, 15.173075568988056 ], "wc_summary_review_avg": [ 51.666666666666664, 33.95421754199158 ], "wc_main_review_avg": [ 576.6666666666666, 321.057973304234 ], "wc_review_avg": [ 723.6666666666666, 327.84278074846924 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5484360292407498892&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "Hefei University of Technology;Huawei;Chinese Academy of Sciences", "aff_unique_dep": ";Huawei Technologies;Academy of Mathematics and Systems Science", "aff_unique_url": "http://www.hfut.edu.cn/;https://www.huawei.com;http://www.amss.cas.cn", "aff_unique_abbr": "HUT;Huawei;AMSS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "0WIM9dHzQBh", "title": 
"DP-InstaHide: Data Augmentations Provably Enhance Guarantees Against Dataset Manipulations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Data poisoning and backdoor attacks manipulate training data to induce security breaches in a victim model. These attacks can be provably deflected using differentially private (DP) training methods, although this comes with a sharp decrease in model performance. The InstaHide method has recently been proposed as an alternative to DP training that leverages supposed privacy properties of the mixup augmentation, although without rigorous guarantees. In this paper, we rigorously show that k-way mixup provably yields at least k times stronger DP guarantees than a naive DP mechanism, and we observe that this enhanced privacy guarantee is a strong foundation for building defenses against poisoning.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/4beda7a14dd681abceff52d259b7a35d23d8ca67.zip", "author": "Eitan Borgnia;Jonas Geiping;Valeriia Cherepanova;Liam H Fowl;Arjun Gupta;Amin Ghiasi;Furong Huang;Micah Goldblum;Tom Goldstein", "authorids": "~Eitan_Borgnia1;~Jonas_Geiping1;~Valeriia_Cherepanova1;~Liam_H_Fowl1;~Arjun_Gupta2;~Amin_Ghiasi1;~Furong_Huang1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "M;M;F;;M;M;F;;M", "homepage": "https://eitanborgnia.com;https://jonasgeiping.github.io/;https://www.vcherepanova.com/;;https://github.com/Arjung27;http://cs.umd.edu/~amin;https://furong-huang.com;;https://www.cs.umd.edu/~tomg/", "dblp": ";190/7229;;241/6940;;239/8313;72/8513;241/7231;25/8184", "google_scholar": ";https://scholar.google.de/citations?user=206vNCEAAAAJ;PySUqqUAAAAJ;IXv3ToAAAAAJ;5pcsbisAAAAJ;tNQWOxUAAAAJ;13yyuCcAAAAJ;pGDKzuUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;;;;;", "linkedin": ";;;;arjung27/;;;;", "or_profile": "~Eitan_Borgnia1;~Jonas_Geiping1;~Valeriia_Cherepanova1;~Liam_H_Fowl1;~Arjun_Gupta2;~Amin_Ghiasi1;~Furong_Huang1;~Micah_Goldblum1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;University of Maryland, College Park;Amazon;University of Maryland, College Park;Zipline International Inc;University of Maryland, College Park;University of Maryland;New York University;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;amazon.com;umd.edu;flyzipline.com;umd.edu;cs.umd.edu;nyu.edu;umd.edu", "position": "Researcher;Postdoc;Intern;PhD student;Professional;PhD student;Assistant Professor;Postdoc;Associate Professor", "bibtex": "@misc{\nborgnia2022dpinstahide,\ntitle={{DP}-InstaHide: Data Augmentations Provably Enhance Guarantees Against Dataset Manipulations},\nauthor={Eitan Borgnia and Jonas Geiping and Valeriia Cherepanova and Liam H Fowl and Arjun Gupta and Amin Ghiasi and Furong Huang and Micah Goldblum and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=0WIM9dHzQBh}\n}", "github": "", "project": "", "reviewers": "bGWL;U3LX;WuW9;jLWF", "site": "https://openreview.net/forum?id=0WIM9dHzQBh", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;5;4", "correctness": "2;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "85;26;112;142", "wc_summary_review": "79;45;29;147", "wc_main_review": "180;340;243;579", "wc_review": "344;411;384;868", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], 
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.25, 42.728064547788726 ], "wc_summary_review_avg": [ 75.0, 45.32107677449864 ], "wc_main_review_avg": [ 335.5, 151.69789055883408 ], "wc_review_avg": [ 501.75, 212.79376753091242 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:39vGf1qA2TUJ:scholar.google.com/&scioq=DP-InstaHide:+Data+Augmentations+Provably+Enhance+Guarantees+Against+Dataset+Manipulations&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0;1;0;2;0;0;3;0", "aff_unique_norm": "University of Maryland;Amazon;Zipline International Inc;New York University", "aff_unique_dep": ";Amazon.com, Inc.;;", "aff_unique_url": "https://www/umd.edu;https://www.amazon.com;https://www.zipline.com;https://www.nyu.edu", "aff_unique_abbr": "UMD;Amazon;;NYU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "0bXmbOt1oq", "title": "Towards Learning to Speak and Hear Through Multi-Agent Communication over a Continuous Acoustic Channel", "track": "main", "status": "Reject", "tldr": "", "abstract": "While multi-agent reinforcement learning has been used as an effective means to study emergent communication between agents, existing work has focused almost exclusively on communication with discrete symbols. Human communication often takes place (and emerged) over a continuous acoustic channel; human infants acquire language in large part through continuous signalling with their caregivers. We therefore ask: Are we able to observe emergent language between agents with a continuous communication channel trained through reinforcement learning? And if so, what is the impact of channel characteristics on the emerging language? We propose an environment and training methodology to serve as a means to carry out an initial exploration of these questions. We use a simple messaging environment where a \"speaker\" agent needs to convey a concept to a \"listener\". The Speaker is equipped with a vocoder that maps symbols to a continuous waveform, this is passed over a lossy continuous channel, and the Listener needs to map the continuous signal to the concept. Using deep Q-learning, we show that basic compositionality emerges in the learned language representations. We find that noise is essential in the communication channel when conveying unseen concept combinations. And we show that we can ground the emergent communication by introducing a caregiver predisposed to \"hearing\" or \"speaking\" English. 
Finally, we describe how our platform serves as a starting point for future work that uses a combination of deep reinforcement learning and multi-agent systems to study our questions of continuous signalling in language learning and emergence.", "keywords": "multi-agent reinforcement learning;language acquisition;emergent communication;acoustic communication;continuous signalling", "primary_area": "", "supplementary_material": "", "author": "Kevin Michael Eloff;Arnu Pretorius;Okko R\u00e4s\u00e4nen;Herman Arnold Engelbrecht;Herman Kamper", "authorids": "~Kevin_Michael_Eloff1;~Arnu_Pretorius1;okko.rasanen@tuni.fi;~Herman_Arnold_Engelbrecht1;~Herman_Kamper1", "gender": "M;M;;M;M", "homepage": ";;;;http://www.kamperh.com/", "dblp": ";188/4368;;;http://dblp.uni-trier.de/pers/hd/k/Kamper:Herman", "google_scholar": ";zZ6ydrAAAAAJ;;eFIxEXoAAAAJ;F3dhs4kAAAAJ", "orcid": ";;;0000-0001-8753-8994;", "linkedin": "kevin-eloff-a74943157/;arnupretorius/;;;kamperh/", "or_profile": "~Kevin_Michael_Eloff1;~Arnu_Pretorius1;okko.rasanen@tuni.fi;~Herman_Arnold_Engelbrecht1;~Herman_Kamper1", "aff": "Stellenbosch University;InstaDeep;;Stellenbosch University;Stellenbosch University", "aff_domain": "sun.ac.za;instadeep.com;;sun.ac.za;sun.ac.za", "position": "MS student;Researcher;;Full Professor;Assistant Professor", "bibtex": "@misc{\neloff2022towards,\ntitle={Towards Learning to Speak and Hear Through Multi-Agent Communication over a Continuous Acoustic Channel},\nauthor={Kevin Michael Eloff and Arnu Pretorius and Okko R{\\\"a}s{\\\"a}nen and Herman Arnold Engelbrecht and Herman Kamper},\nyear={2022},\nurl={https://openreview.net/forum?id=0bXmbOt1oq}\n}", "github": "", "project": "", "reviewers": "6auy;yr56;GZXE;42Xh", "site": "https://openreview.net/forum?id=0bXmbOt1oq", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;5;4", "correctness": "2;4;3;4", "technical_novelty": "1;3;2;2", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "92;162;94;276", "wc_summary_review": "86;55;89;83", "wc_main_review": "445;710;429;1020", "wc_review": "623;927;612;1379", "wc_reply_reviewers": "49;124;106;68", "wc_reply_authors": "643;298;276;279", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 156.0, 74.79304780526061 ], "wc_summary_review_avg": [ 78.25, 13.589977924926883 ], "wc_main_review_avg": [ 651.0, 240.50051974995813 ], "wc_review_avg": [ 885.25, 311.83839965597565 ], "wc_reply_reviewers_avg": [ 86.75, 29.72688177390962 ], "wc_reply_authors_avg": [ 374.0, 155.5361694269214 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2425294729562949479&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Stellenbosch University;InstaDeep", "aff_unique_dep": ";", "aff_unique_url": "https://www.sun.ac.za;https://www.instadeep.com", "aff_unique_abbr": "SU;InstaDeep", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "South Africa;United Kingdom" }, {
"title": "Efficient Learning of Safe Driving Policy via Human-AI Copilot Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6470", "id": "0cgU-BZp2ky", "poster": "", "openreview": "https://openreview.net/forum?id=0cgU-BZp2ky", "slides": "https://iclr.cc/virtual/2022/poster/6470", "video": "https://iclr.cc/virtual/2022/poster/6470", "author_site": "Quanyi Li, Zhenghao Peng, Bolei Zhou", "tldr": "", "abstract": "Human intervention is an effective way to inject human knowledge into the training loop of reinforcement learning, which can bring fast learning and ensured training safety. Given the very limited budget of human intervention, it remains challenging to design when and how human expert interacts with the learning agent in the training. In this work, we develop a novel human-in-the-loop learning method called Human-AI Copilot Optimization (HACO).To allow the agent's sufficient exploration in the risky environments while ensuring the training safety, the human expert can take over the control and demonstrate how to avoid probably dangerous situations or trivial behaviors. The proposed HACO then effectively utilizes the data both from the trial-and-error exploration and human's partial demonstration to train a high-performing agent. HACO extracts proxy state-action values from partial human demonstration and optimizes the agent to improve the proxy values meanwhile reduce the human interventions. The experiments show that HACO achieves a substantially high sample efficiency in the safe driving benchmark. HACO can train agents to drive in unseen traffic scenarios with a handful of human intervention budget and achieve high safety and generalizability, outperforming both reinforcement learning and imitation learning baselines with a large margin. 
Code and demo video are included in the supplementary materials.", "keywords": "Human in the Loop;Safe Reinforcement Learning;Autonomous Driving", "primary_area": "", "supplementary_material": "/attachment/38010520ff1a8ed332ecadbf1abc4201ea99de9a.zip", "author": "Quanyi Li;Zhenghao Peng;Bolei Zhou", "authorids": "~Quanyi_Li1;~Zhenghao_Peng1;~Bolei_Zhou5", "gender": "M;M;M", "homepage": "https://quanyili.github.io;https://pengzhenghao.github.io;https://boleizhou.github.io/", "dblp": "270/7691;220/3963;46/8066", "google_scholar": "Ty49X3UAAAAJ;JZ8ws6IAAAAJ;9D4aG8AAAAAJ", "orcid": ";;", "linkedin": "https://www.linkedin.com/mwlite/in/quanyi-li-2b7985183;;", "or_profile": "~Quanyi_Li1;~Zhenghao_Peng1;~Bolei_Zhou5", "aff": "The Chinese University of Hong Kong;The Chinese University of Hong Kong;University of California, Los Angeles", "aff_domain": "ie.cuhk.edu;ie.cuhk.edu;ucla.edu", "position": "Researcher;MS student;Assistant Professor", "bibtex": "@inproceedings{\nli2022efficient,\ntitle={Efficient Learning of Safe Driving Policy via Human-{AI} Copilot Optimization},\nauthor={Quanyi Li and Zhenghao Peng and Bolei Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0cgU-BZp2ky}\n}", "github": "", "project": "", "reviewers": "Mao2;ohEo;irh4;cwDu", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;3;3", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "62;153;100;126", "wc_summary_review": "18;6;95;50", "wc_main_review": "142;208;728;198", "wc_review": "222;367;923;374", "wc_reply_reviewers": "0;0;75;15", "wc_reply_authors": "471;255;795;162", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 110.25, 33.573613150806395 ], "wc_summary_review_avg": [ 42.25, 34.441072863660914 ], "wc_main_review_avg": [ 319.0, 237.4721036248258 ], "wc_review_avg": [ 471.5, 267.64201837529174 ], "wc_reply_reviewers_avg": [ 22.5, 30.923292192132454 ], "wc_reply_authors_avg": [ 420.75, 243.41977631244345 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2677899661233819599&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=0cgU-BZp2ky", "email": "ie.cuhk.edu;ie.cuhk.edu;ucla.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Chinese University of Hong Kong;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ucla.edu", "aff_unique_abbr": "CUHK;UCLA", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Hong Kong SAR;Los Angeles", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "id": "0d1mLPC2q2", "title": "Understanding the Success of Knowledge Distillation -- A Data Augmentation Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "Knowledge distillation (KD) is a general neural network training approach that uses a teacher model to guide a student model. 
Many works have explored the rationale for its success. However, its interplay with data augmentation (DA) has not been well understood so far. In this paper, we are motivated by an interesting observation in classification: KD loss can take more advantage of a DA method than cross-entropy loss \\emph{simply by training for more iterations}. We present a generic framework to explain this interplay between KD and DA. Inspired by it, we enhance KD via stronger data augmentation schemes named TLmixup and TLCutMix. Furthermore, an even stronger and more efficient DA approach is developed specifically for KD based on the idea of active learning. The findings and merits of our method are validated with extensive experiments on the CIFAR-100, Tiny ImageNet, and ImageNet datasets. We achieve new state-of-the-art accuracy by using the original KD loss armed with stronger augmentation schemes, compared to existing state-of-the-art methods that employ more advanced distillation losses. We also show that, by combining our approaches with the advanced distillation losses, we can advance the state of the art even further. In addition to very promising performance, this paper importantly sheds light on the success of knowledge distillation. The interaction of KD and DA methods we have discovered can inspire more powerful KD algorithms. ", "keywords": "knowledge distillation;data augmentation;mixup;cutmix;model compression;active learning", "primary_area": "", "supplementary_material": "", "author": "Huan Wang;Suhas Lohit;Michael Jeffrey Jones;Yun Fu", "authorids": "~Huan_Wang3;~Suhas_Lohit1;~Michael_Jeffrey_Jones1;~Yun_Fu1", "gender": "M;;M;M", "homepage": "https://huanwang.tech/;http://suhaslohit.github.io;;http://www1.ece.neu.edu/~yunfu/", "dblp": "70/6155-14;169/9097;49/1064;00/5815-1", "google_scholar": "0-On0y4AAAAJ;GMRYY5cAAAAJ;h-V4QaMAAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": "0000-0001-6951-901X;;0000-0001-5215-2346;0000-0002-5098-2853", "linkedin": "huanwang-zju/;;;furaymond/", "or_profile": "~Huan_Wang3;~Suhas_Lohit1;~Michael_Jeffrey_Jones1;~Yun_Fu1", "aff": "Northeastern University;Mitsubishi Electric Research Labs;MERL;Northeastern University", "aff_domain": "neu.edu;merl.com;merl.com;northeastern.edu", "position": "PhD student;Researcher;Principal Researcher;Full Professor", "bibtex": "@misc{\nwang2022understanding,\ntitle={Understanding the Success of Knowledge Distillation -- A Data Augmentation Perspective},\nauthor={Huan Wang and Suhas Lohit and Michael Jeffrey Jones and Yun Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=0d1mLPC2q2}\n}", "github": "", "project": "", "reviewers": "T234;zS3x;146T", "site": "https://openreview.net/forum?id=0d1mLPC2q2", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;4;4", "correctness": "3;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "143;162;202", "wc_summary_review": "71;529;44", "wc_main_review": "221;133;596", "wc_review": "435;824;842", "wc_reply_reviewers": "0;0;50", "wc_reply_authors": "515;788;560", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 169.0, 24.589970855343985 ], "wc_summary_review_avg": [
214.66666666666666, 222.54038335147675 ], "wc_main_review_avg": [ 316.6666666666667, 200.75911491691284 ], "wc_review_avg": [ 700.3333333333334, 187.76285279279523 ], "wc_reply_reviewers_avg": [ 16.666666666666668, 23.570226039551585 ], "wc_reply_authors_avg": [ 621.0, 119.50732195141852 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z0UK1VGEr7EJ:scholar.google.com/&scioq=Understanding+the+Success+of+Knowledge+Distillation+--+A+Data+Augmentation+Perspective&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Northeastern University;Mitsubishi Electric Research Laboratories", "aff_unique_dep": ";", "aff_unique_url": "https://www.northeastern.edu;https://www.merl.com", "aff_unique_abbr": "NEU;MERL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "0jFw-C30hm", "title": "Less is more: Selecting the right benchmarking set of data for time series classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we propose a new pipeline for landscape analysis of machine learning datasets that enables us to better understand a benchmarking problem landscape, select a diverse portfolio of benchmark datasets, and identify the presence of performance assessment bias via bootstrap evaluation. Combining a large multi-domain representation corpus of time-series-specific features and the results of a large empirical study of a time-series classification benchmark, we showcase the capability of the pipeline to point out issues with non-redundancy and representativeness in the benchmark. By observing the discrepancy between the empirical results of the bootstrap evaluation and recently adopted practices in the TSC literature when introducing novel methods, we warn of the potentially harmful effects of tuning methods on certain parts of the landscape (unless this is an explicit and desired goal of the study). Finally, we propose a set of datasets, uniformly distributed across the landscape space, that one should consider when benchmarking novel TSC methods. 
", "keywords": "benchmarking;time-series classification;landscape analysis", "primary_area": "", "supplementary_material": "", "author": "Tome Eftimov;Ga\u0161per Petelin;Gjorgjina Cenikj;Ana Kostovska;Gordana Ispirova;Peter Koro\u0161ec;Jasmin Bogatinovski", "authorids": "~Tome_Eftimov1;gasper.petelin@ijs.si;~Gjorgjina_Cenikj1;~Ana_Kostovska1;gordana.ispirova@ijs.si;peter.korosec@ijs.si;~Jasmin_Bogatinovski1", "gender": "M;;F;;;;M", "homepage": "http://cs.ijs.si/eftimov/;;;;;;https://bogatinovskijasmin.github.io/bogatinovskijasmin/aboutme/", "dblp": "145/4632;;;;;;", "google_scholar": "MJXKaMsAAAAJ;;;;;;", "orcid": "0000-0001-7330-1902;;;my-orcid?orcid=0000-0002-5983-7169;;;", "linkedin": ";;gjorgjina-cenikj-310693167/;;;;", "or_profile": "~Tome_Eftimov1;gasper.petelin@ijs.si;~Gjorgjina_Cenikj1;~Ana_Kostovska1;gordana.ispirova@ijs.si;peter.korosec@ijs.si;~Jasmin_Bogatinovski1", "aff": "Jozef Stefan Institute;;Jo\u017eef Stefan International Postgraduate School;Jozef Stefan Institute;;;TU Berlin", "aff_domain": "ijs.si;;ijs.si;kt.ijs.si;;;tu-berlin.de", "position": "Senior Researcher;;PhD student;PhD student;;;PhD student", "bibtex": "@misc{\neftimov2022less,\ntitle={Less is more: Selecting the right benchmarking set of data for time series classification},\nauthor={Tome Eftimov and Ga{\\v{s}}per Petelin and Gjorgjina Cenikj and Ana Kostovska and Gordana Ispirova and Peter Koro{\\v{s}}ec and Jasmin Bogatinovski},\nyear={2022},\nurl={https://openreview.net/forum?id=0jFw-C30hm}\n}", "github": "", "project": "", "reviewers": "YA6W;X5gA;H7Yi", "site": "https://openreview.net/forum?id=0jFw-C30hm", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;3;3", "correctness": "2;2;3", "technical_novelty": "1;1;2", "empirical_novelty": "2;1;3", "wc_summary_paper": "57;45;78", "wc_summary_review": "126;95;26", "wc_main_review": "354;140;450", "wc_review": "537;280;554", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 60.0, 13.638181696985855 ], "wc_summary_review_avg": [ 82.33333333333333, 41.79579989531111 ], "wc_main_review_avg": [ 314.6666666666667, 129.57708988174653 ], "wc_review_avg": [ 457.0, 125.3501761732574 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13031164003616939289&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Jozef Stefan Institute;Jo\u017eef Stefan International Postgraduate School;Technische Universit\u00e4t Berlin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ijs.si;https://www.joisepstefan.si;https://www.tu-berlin.de", "aff_unique_abbr": "JSI;;TU Berlin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berlin", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Slovenia;Germany" }, { "title": "Towards Training Billion Parameter Graph Neural Networks for Atomic Simulations", "status": "Poster", "track": 
"main", "site": "https://iclr.cc/virtual/2022/poster/6800", "id": "0jP2n0YFmKG", "poster": "", "openreview": "https://openreview.net/forum?id=0jP2n0YFmKG", "slides": "https://iclr.cc/virtual/2022/poster/6800", "video": "https://iclr.cc/virtual/2022/poster/6800", "author_site": "Anuroop Sriram, Abhishek Das, Brandon Wood, Siddharth Goyal, Larry Zitnick", "tldr": "", "abstract": "Recent progress in Graph Neural Networks (GNNs) for modeling atomic simulations has the potential to revolutionize catalyst discovery, which is a key step in making progress towards the energy breakthroughs needed to combat climate change. However, the GNNs that have proven most effective for this task are memory intensive as they model higher-order interactions in the graphs such as those between triplets or quadruplets of atoms, making it challenging to scale these models. In this paper, we introduce Graph Parallelism, a method to distribute input graphs across multiple GPUs, enabling us to train very large GNNs with hundreds of millions or billions of parameters. We empirically evaluate our method by scaling up the recently proposed DimeNet++ and GemNet models by over an order of magnitude in the number of parameters. On the large-scale Open Catalyst 2020 (OC20) dataset, these graph-parallelized models lead to relative improvements of 1) 15% on the force MAE metric on the S2EF task and 2) 21% on the AFbT metric on the IS2RS task, establishing new state-of-the-art results.", "keywords": "Graph Neural Networks;Atomic Simulations;Computational Chemistry", "primary_area": "", "supplementary_material": "", "author": "Anuroop Sriram;Abhishek Das;Brandon M Wood;Siddharth Goyal;C. Lawrence Zitnick", "authorids": "~Anuroop_Sriram1;~Abhishek_Das1;~Brandon_M_Wood1;~Siddharth_Goyal2;~C._Lawrence_Zitnick2", "gender": "M;M;M;M;", "homepage": "https://anuroopsriram.com;https://abhishekdas.com/;https://www.bmwood.org;;http://larryzitnick.org/", "dblp": "200/7951;40/5262;276/7546;;10/6888", "google_scholar": "D4uRc_UAAAAJ;t6exkOAAAAAJ;KbqboRgAAAAJ;vxjELqYAAAAJ;ZeJjFQMAAAAJ", "orcid": ";;0000-0002-7251-337X;;", "linkedin": "anuroopsriram/;;;;", "or_profile": "~Anuroop_Sriram1;~Abhishek_Das1;~Brandon_M_Wood1;~Siddharth_Goyal2;~Larry_Zitnick1", "aff": "Meta Facebook;Facebook AI Research (FAIR);FAIR at Meta;;Meta", "aff_domain": "meta.com;fb.com;meta.com;;meta.com", "position": "Principal Researcher;Research Scientist;Researcher;;Researcher", "bibtex": "@inproceedings{\nsriram2022towards,\ntitle={Towards Training Billion Parameter Graph Neural Networks for Atomic Simulations},\nauthor={Anuroop Sriram and Abhishek Das and Brandon M Wood and C. 
Lawrence Zitnick},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0jP2n0YFmKG}\n}", "github": "", "project": "", "reviewers": "ZcRk;BwW9;N2CL;4Syd", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "1;3;3;2", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "62;72;87;71", "wc_summary_review": "173;51;51;36", "wc_main_review": "173;140;163;123", "wc_review": "408;263;301;230", "wc_reply_reviewers": "0;0;29;51", "wc_reply_authors": "115;564;323;224", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 2.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.0, 8.972179222463181 ], "wc_summary_review_avg": [ 77.75, 55.33251756426776 ], "wc_main_review_avg": [ 149.75, 19.536824204563032 ], "wc_review_avg": [ 300.5, 66.95707580233773 ], "wc_reply_reviewers_avg": [ 20.0, 21.459263733874934 ], "wc_reply_authors_avg": [ 306.5, 165.874199319846 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5143219235404085170&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=0jP2n0YFmKG", "email": "meta.com;fb.com;meta.com;;meta.com", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "0kNbTghw7q", "title": "Improving Generative Adversarial Networks via Adversarial Learning in Latent Space", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative Adversarial Networks (GANs) have been widely studied as generative models, which map a latent distribution to the target distribution. Although many efforts have been made in terms of backbone architecture design, loss function, and training techniques, few results have been obtained on how the sampling in latent space can affect the final performance, and existing works on latent space mainly focus on controllability. We observe that, as the neural generator is a continuous function, two close samples in latent space would be mapped into two nearby images, while their quality can differ greatly, as quality is not a continuous function in pixel space. From the above continuous mapping function perspective, on the other hand, two distant latent samples may also be mapped into two close images. If the latent samples are mapped in aggregate into a limited set of modes or even a single mode, mode collapse occurs. Accordingly, we propose adding an implicit latent transform before the mapping function to improve latent $z$ from its initial distribution, e.g., Gaussian. This is achieved by using the iterative fast gradient sign method (I-FGSM). We further propose new GAN training strategies to obtain better generation mappings w.r.t. quality and diversity by introducing targeted latent transforms into the bi-level optimization of GAN. 
Experimental results on visual data show that our method can effectively improve both quality and diversity.", "keywords": "Generative Adversarial Networks;Adversarial Training;Latent Space", "primary_area": "", "supplementary_material": "", "author": "Yang Li;Yichuan Mo;Liangliang Shi;Junchi Yan;Xiaolu Zhang;JUN ZHOU", "authorids": "~Yang_Li32;~Yichuan_Mo1;~Liangliang_Shi1;~Junchi_Yan2;~Xiaolu_Zhang2;~JUN_ZHOU6", "gender": "M;M;M;F;M;M", "homepage": "https://yangco-le.github.io;https://www.linkedin.com/in/%E6%98%93%E5%B7%9D-%E8%8E%AB-446841212/;;https://scholar.google.com/citations?user=cAz9PToAAAAJ;https://scholar.google.com/citations?user=mCVvloEAAAAJ&hl=en;http://thinklab.sjtu.edu.cn/", "dblp": ";321/6790;89/8730;48/5176;99/3847-11;60/7949.html", "google_scholar": "ecE0xDIAAAAJ;xvSYG1gAAAAJ;Qf1k8lUAAAAJ;;mCVvloEAAAAJ;ga230VoAAAAJ", "orcid": "0000-0002-5249-3471;;0000-0001-7033-4207;0000-0001-8055-0245;0000-0001-6033-6102;0000-0001-9639-7679", "linkedin": ";;;;;", "or_profile": "~Yang_Li32;~Yichuan_Mo1;~Liangliang_Shi1;~Xiaolu_Zhang2;~JUN_ZHOU6;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Ant Group;Ant Group;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;antfin.com;antgroup.com;sjtu.edu.cn", "position": "Undergrad student;Undergrad student;PhD student;Researcher;Researcher;Associate Professor", "bibtex": "@misc{\nli2022improving,\ntitle={Improving Generative Adversarial Networks via Adversarial Learning in Latent Space},\nauthor={Yang Li and Yichuan Mo and Liangliang Shi and Junchi Yan and Xiaolu Zhang and JUN ZHOU},\nyear={2022},\nurl={https://openreview.net/forum?id=0kNbTghw7q}\n}", "github": "", "project": "", "reviewers": "ua7t;NCez;JVu6;uSqk", "site": "https://openreview.net/forum?id=0kNbTghw7q", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "64;168;79;93", "wc_summary_review": "128;60;6;48", "wc_main_review": "178;245;357;133", "wc_review": "370;473;442;274", "wc_reply_reviewers": "0;0;0;43", "wc_reply_authors": "1548;554;1654;880", "reply_reviewers": "0;0;0;1", "reply_authors": "3;2;3;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.0, 40.01874560752748 ], "wc_summary_review_avg": [ 60.5, 43.8263619297792 ], "wc_main_review_avg": [ 228.25, 84.34267899468216 ], "wc_review_avg": [ 389.75, 76.56492343103335 ], "wc_reply_reviewers_avg": [ 10.75, 18.619546181365433 ], "wc_reply_authors_avg": [ 1159.0, 458.3153935883018 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8145226379368778648&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Ant Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "SJTU;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Fast topological clustering with Wasserstein distance", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6706", "id": "0kPL3xO4R5", "poster": "", "openreview": "https://openreview.net/forum?id=0kPL3xO4R5", "slides": "https://iclr.cc/virtual/2022/poster/6706", "video": "https://iclr.cc/virtual/2022/poster/6706", "author_site": "Tananun Songdechakraiwut, Bryan Krause, Matthew Banks, Kirill Nourski, Barry Van Veen", "tldr": "", "abstract": "The topological patterns exhibited by many real-world networks motivate the development of topology-based methods for assessing the similarity of networks. However, extracting topological structure is difficult, especially for large and dense networks whose node degrees range over multiple orders of magnitude. In this paper, we propose a novel and computationally practical topological clustering method that clusters complex networks with intricate topology using principled theory from persistent homology and optimal transport. Such networks are aggregated into clusters through a centroid-based clustering strategy based on both their topological and geometric structure, preserving correspondence between nodes in different networks. The notions of topological proximity and centroid are characterized using a novel and efficient approach to computation of the Wasserstein distance and barycenter for persistence barcodes associated with connected components and cycles. The proposed method is demonstrated to be effective using both simulated networks and measured functional brain networks.", "keywords": "Topological data analysis;cluster analysis;persistent homology;Wasserstein distance;Wasserstein barycenter;brain networks;intracranial electrophysiology;consciousness", "primary_area": "", "supplementary_material": "", "author": "Tananun Songdechakraiwut;Bryan M Krause;Matthew I Banks;Kirill V Nourski;Barry D Van Veen", "authorids": "~Tananun_Songdechakraiwut1;~Bryan_M_Krause1;~Matthew_I_Banks1;~Kirill_V_Nourski1;~Barry_D_Van_Veen1", "gender": ";M;male;;", "homepage": ";;https://anesthesia.wisc.edu/research/researchers/banks-laboratory/;;", "dblp": "271/7516;;;;", "google_scholar": ";IMcRjFsAAAAJ;;https://scholar.google.com/citations?hl=en;M3xRqgYAAAAJ", "orcid": ";;0000-0002-1936-7529;;", "linkedin": ";;;;", "or_profile": "~Tananun_Songdechakraiwut1;~Bryan_M_Krause1;~Matthew_I_Banks1;~Barry_D_Van_Veen1;~Kirill_Nourski1", "aff": "Duke University;University of Wisconsin, Madison;University of Wisconsin, Madison;University of Wisconsin, Madison;The University of Iowa", "aff_domain": "duke.edu;wisc.edu;wisc.edu;wisc.edu;uiowa.edu", "position": "Assistant Professor;Researcher;Full Professor;Professor;Associate Professor", "bibtex": "@inproceedings{\nsongdechakraiwut2022fast,\ntitle={Fast topological clustering with Wasserstein distance},\nauthor={Tananun Songdechakraiwut and Bryan M Krause and Matthew I Banks and Kirill V Nourski and Barry D Van Veen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0kPL3xO4R5}\n}", "github": "", "project": "", "reviewers": "VKeM;FgNK;NiBH", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;5;5", "correctness": "3;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "160;78;105", "wc_summary_review": "26;73;125", "wc_main_review": "529;685;1014", "wc_review": "715;836;1244", "wc_reply_reviewers": 
"18;158;23", "wc_reply_authors": "1095;1718;1972", "reply_reviewers": "1;1;1", "reply_authors": "2;4;3", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 114.33333333333333, 34.120700787384514 ], "wc_summary_review_avg": [ 74.66666666666667, 40.43375927228247 ], "wc_main_review_avg": [ 742.6666666666666, 202.15560563310848 ], "wc_review_avg": [ 931.6666666666666, 226.31001352618543 ], "wc_reply_reviewers_avg": [ 66.33333333333333, 64.85025486114573 ], "wc_reply_authors_avg": [ 1595.0, 368.44628735633455 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9999999999999997, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8410024039474684977&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=0kPL3xO4R5", "email": "duke.edu;wisc.edu;wisc.edu;wisc.edu;uiowa.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Duke University;University of Wisconsin;University of Iowa", "aff_unique_dep": ";;", "aff_unique_url": "https://www.duke.edu;https://www.wisc.edu;https://www.uiowa.edu", "aff_unique_abbr": "Duke;UW;UIowa", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "0kwQV5SkHWW", "title": "Partially Relaxed Masks for Lightweight Knowledge Transfer without Forgetting in Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The existing research on continual learning (CL) has focused mainly on preventing catastrophic forgetting. In the task-incremental learning setting of CL, several approaches have achieved excellent results, with almost no forgetting. The goal of this work is to endow such systems with the additional ability to transfer knowledge among tasks when the tasks are similar and have shared knowledge to achieve higher accuracy. Since the existing system HAT is one of most effective task-incremental learning algorithms, this paper extends HAT with the aim of both objectives, i.e., overcoming catastrophic forgetting and transferring knowledge among tasks without introducing additional mechanisms into the architecture of HAT. The current study finds that task similarity, which indicates knowledge sharing and transfer, can be computed via the clustering of task embeddings optimized by HAT. Thus, we propose a new approach, named \u201cpartially relaxed masks\u201d (PRM), to exploit HAT\u2019s masks to not only keep some parameters from being modified in learning subsequent tasks as much as possible to prevent forgetting but also enable remaining parameters to be updated to facilitate knowledge transfer. 
Extensive experiments demonstrate that PRM performs competitively compared with the latest baselines while also requiring much less computation time.", "keywords": "Continual learning;Task similarity;Catastrophic forgetting;Knowledge transfer", "primary_area": "", "supplementary_material": "/attachment/64b06284c31e4d2e934c9c98cdb0e5aa7bbcb81e.zip", "author": "Tatsuya Konishi;Mori Kurokawa;Roberto Legaspi;Chihiro Ono;Zixuan Ke;Gyuhak Kim;Bing Liu", "authorids": "~Tatsuya_Konishi2;~Mori_Kurokawa1;~Roberto_Legaspi1;~Chihiro_Ono1;~Zixuan_Ke1;~Gyuhak_Kim1;~Bing_Liu1", "gender": "M;M;M;M;;M;M", "homepage": ";https://www.researchgate.net/profile/Roberto-Legaspi;;https://vincent950129.github.io/;https://k-gyuhak.github.io/;https://www.cs.uic.edu/~liub/;", "dblp": "25/2313;296/0450.html;28/3923;196/3817;317/0166;l/BingLiu1.html;185/3974.html", "google_scholar": "https://scholar.google.co.jp/citations?user=vmIprTMAAAAJ;zE7Zhk0AAAAJ;https://scholar.google.co.jp/citations?hl=ja;SZ4sFNEAAAAJ;https://scholar.google.com/citations?hl=en;Kt1bjZoAAAAJ;tx15SxoAAAAJ", "orcid": "0000-0003-4544-0643;0000-0001-8909-635X;0000-0002-6410-1359;;;;0000-0002-2255-0156", "linkedin": ";roberto-legaspi-5a3a4361/;chihiro-ono-03956251/;;;;ukaznil/", "or_profile": "~Mori_Kurokawa1;~Roberto_Legaspi1;~Chihiro_Ono1;~Zixuan_Ke1;~Gyuhak_Kim1;~Bing_Liu1;~Tatsuya_KONISHI1", "aff": "KDDI Research, Inc.;KDDI Research, Inc.;KDDI Research, Inc.;University of Illinois, Chicago;University of Illinois, Chicago;University of Illinois at Chicago;KDDI Research, Inc.", "aff_domain": "kddi-research.jp;kddi-research.jp;kddi-research.jp;uic.edu;uic.edu;uic.edu;kddi-research.jp", "position": "Researcher;Researcher;Researcher;PhD student;PhD student;Full Professor;Researcher", "bibtex": "@misc{\nkonishi2022partially,\ntitle={Partially Relaxed Masks for Lightweight Knowledge Transfer without Forgetting in Continual Learning},\nauthor={Tatsuya Konishi and Mori Kurokawa and Roberto Legaspi and Chihiro Ono and Zixuan Ke and Gyuhak Kim and Bing Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=0kwQV5SkHWW}\n}", "github": "", "project": "", "reviewers": "4wCo;gDWc;4u89;u84b", "site": "https://openreview.net/forum?id=0kwQV5SkHWW", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "2;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "98;66;95;74", "wc_summary_review": "89;12;36;265", "wc_main_review": "431;319;219;49", "wc_review": "618;397;350;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 83.25, 13.589977924926883 ], "wc_summary_review_avg": [ 100.5, 98.97600719366285 ], "wc_main_review_avg": [ 254.5, 140.3593602151278 ], "wc_review_avg": [ 438.25, 105.26721949400962 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:LTsIm_hjMBAJ:scholar.google.com/&scioq=Partially+Relaxed+Masks+for+Lightweight+Knowledge+Transfer+without+Forgetting+in+Continual+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;1;1;0", "aff_unique_norm": "KDDI Research;University of Illinois at Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.kddi-research.com;https://www.uic.edu", "aff_unique_abbr": "KDDI;UIC", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;1;1;1;0", "aff_country_unique": "Japan;United States" }, { "id": "0lGKTI1tho", "title": "POLAR: A Polynomial Arithmetic Framework for Verifying Neural-Network Controlled Systems", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose POLAR, a \\textbf{pol}ynomial \\textbf{ar}ithmetic framework that leverages polynomial overapproximations with interval remainders for bounded-time reachability analysis of neural network-controlled systems (NNCSs).\nCompared with existing arithmetic approaches that use standard Taylor models, our framework uses a novel approach to iteratively overapproximate the neuron output ranges layer-by-layer via a combination of Bernstein polynomial interpolation for continuous activation functions and Taylor model arithmetic for the other operations. This approach overcomes the main drawback in the standard Taylor model arithmetic, i.e. its inability to handle functions that cannot be well approximated by Taylor polynomials, and significantly improve the accuracy and efficiency of reachable states computation for NNCSs. To further tighten the overapproximation, our method keeps the Taylor model remainders symbolic under the linear mappings when propagating Taylor models across the neural-network controller. 
\nWe show that POLAR can be seamlessly integrated with existing Taylor model flowpipe construction techniques, and POLAR significantly outperforms the current state-of-the-art techniques on a suite of benchmarks.", "keywords": "Neural network controlled systems;safety;verification;Taylor model arithmetic", "primary_area": "", "supplementary_material": "/attachment/ccf7791f024e2033a06260f6d801ac0c3fc43263.zip", "author": "Chao Huang;Jiameng Fan;Xin Chen;Wenchao Li;Qi Zhu", "authorids": "~Chao_Huang5;~Jiameng_Fan1;~Xin_Chen19;~Wenchao_Li1;~Qi_Zhu2", "gender": "M;M;M;;", "homepage": "https://chaohuang2018.github.io/main/;https://www.jiamengf.com;https://shinchern.github.io/;http://sites.bu.edu/depend/;http://zhulab.ece.northwestern.edu/", "dblp": "18/4087-15;196/7836.html;24/1518-2;23/5721-1;66/5923-2.html", "google_scholar": "GbY72eIAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.de/citations?user=fDvCpnEAAAAJ;zwA5eokAAAAJ;TN09YMcAAAAJ", "orcid": "0000-0002-9300-1787;;0000-0002-2730-1511;;", "linkedin": ";;;;", "or_profile": "~Chao_Huang5;~Jiameng_Fan1;~Xin_Chen19;~Wenchao_Li1;~Qi_Zhu2", "aff": "University of Liverpool;Boston University;University of Dayton;Boston University;Northwestern University", "aff_domain": "liverpool.ac.uk;bu.edu;udayton.edu;bu.edu;northwestern.edu", "position": "Assistant Professor;PhD student;Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@misc{\nhuang2022polar,\ntitle={{POLAR}: A Polynomial Arithmetic Framework for Verifying Neural-Network Controlled Systems},\nauthor={Chao Huang and Jiameng Fan and Xin Chen and Wenchao Li and Qi Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=0lGKTI1tho}\n}", "github": "", "project": "", "reviewers": "Gf2A;zrk4;MWuh", "site": "https://openreview.net/forum?id=0lGKTI1tho", "pdf_size": 0, "recommendation": "1;3;6", "confidence": "5;4;3", "correctness": "2;2;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "25;104;87", "wc_summary_review": "39;69;48", "wc_main_review": "372;1067;141", "wc_review": "436;1240;276", "wc_reply_reviewers": "0;375;0", "wc_reply_authors": "2004;1787;253", "reply_reviewers": "0;1;0", "reply_authors": "4;4;1", "recommendation_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 72.0, 33.95094500402996 ], "wc_summary_review_avg": [ 52.0, 12.569805089976535 ], "wc_main_review_avg": [ 526.6666666666666, 393.53977633214606 ], "wc_review_avg": [ 650.6666666666666, 421.8098571104705 ], "wc_reply_reviewers_avg": [ 125.0, 176.7766952966369 ], "wc_reply_authors_avg": [ 1348.0, 779.3334759053191 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9933992677987827, "corr_recommendation_correctness": 0.9176629354822468, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=917613921601853247&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "University of Liverpool;Boston University;University of Dayton;Northwestern University", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.liverpool.ac.uk;https://www.bu.edu;https://www.udayton.edu;https://www.northwestern.edu", "aff_unique_abbr": "Liv Uni;BU;UD;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "0lSoIruExF", "title": "Incorporating User-Item Similarity in Hybrid Neighborhood-based Recommendation System", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern hybrid recommendation systems require a sufficient amount of data. However, several internet privacy issues make users skeptical about sharing their personal information with online service providers. This work introduces various novel methods utilizing the baseline estimate to learn user interests from their interactions. Subsequently, extracted user feature vectors are implemented to estimate the user-item correlations, providing an additional fine-tuning factor for neighborhood-based collaborative filtering systems. Comprehensive experiments show that utilizing the user-item similarity can boost the accuracy of hybrid neighborhood-based systems by at least $2.11\\%$ while minimizing the need for tracking users' digital footprints.", "keywords": "Recommendation system;Neighborhood-based;Collaborative filtering;Data mining", "primary_area": "", "supplementary_material": "", "author": "Nghia Duong Tan;Giang Do Truong;Nam Doan Nguyen;Nghia Cao Tuan;Hoang Tran Manh;Minh Nguyen Duc;Hieu Dang Quang", "authorids": "~Nghia_Duong_Tan1;giang.dt172524@sis.hust.edu.vn;nam.dn168746@sis.hust.edu.vn;nghia.ct182705@sis.hust.edu.vn;hoang.tranmanh@hust.edu.vn;minh.nguyenduc1@hust.edu.vn;hieu.dangquang@hust.edu.vn", "gender": "M;;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": "0000-0002-2442-6263;;;;;;", "linkedin": ";;;;;;", "or_profile": "~Nghia_Duong_Tan1;giang.dt172524@sis.hust.edu.vn;nam.dn168746@sis.hust.edu.vn;nghia.ct182705@sis.hust.edu.vn;hoang.tranmanh@hust.edu.vn;minh.nguyenduc1@hust.edu.vn;hieu.dangquang@hust.edu.vn", "aff": "Hanoi University of Science and Technology;;;;;;", "aff_domain": "hust.edu.vn;;;;;;", "position": "Lecturer;;;;;;", "bibtex": "@misc{\ntan2022incorporating,\ntitle={Incorporating User-Item Similarity in Hybrid Neighborhood-based Recommendation System},\nauthor={Nghia Duong Tan and Giang Do Truong and Nam Doan Nguyen and Nghia Cao Tuan and Hoang Tran Manh and Minh Nguyen Duc and Hieu Dang Quang},\nyear={2022},\nurl={https://openreview.net/forum?id=0lSoIruExF}\n}", "github": "", "project": "", "reviewers": "ef34;a2xk;kQ8s;7nd1;ZK3G", "site": "https://openreview.net/forum?id=0lSoIruExF", "pdf_size": 0, "recommendation": "1;1;3;3;5", "confidence": "4;4;5;4;3", "correctness": "3;2;2;4;3", "technical_novelty": "1;2;1;2;2", "empirical_novelty": "1;1;2;2;3", "wc_summary_paper": "64;58;24;114;52", "wc_summary_review": "15;14;31;35;14", "wc_main_review": "149;138;111;327;137", "wc_review": "228;210;166;476;203", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 2.6, 1.4966629547095767 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 1.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.8, 0.7483314773547883 ], "wc_summary_paper_avg": [ 62.4, 29.213695418416343 ], "wc_summary_review_avg": [ 21.8, 9.239047569960878 ], "wc_main_review_avg": [ 172.4, 78.30095784854743 ], "wc_review_avg": [ 256.6, 
111.54120314932953 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.42257712736425823, "corr_recommendation_correctness": 0.2857142857142857, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8YBCTYT7wSUJ:scholar.google.com/&scioq=Incorporating+User-Item+Similarity+in+Hybrid+Neighborhood-based+Recommendation+System&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Hanoi University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.hust.edu.vn", "aff_unique_abbr": "HUST", "aff_campus_unique_index": "0", "aff_campus_unique": "Hanoi", "aff_country_unique_index": "0", "aff_country_unique": "Vietnam" }, { "id": "0m4c9ZfDrDt", "title": "Generalizing Successor Features to continuous domains for Multi-task Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The deep reinforcement learning (RL) framework has shown great promise to tackle sequential decision-making problems, where the agent learns to behave optimally through interacting with the environment and receiving rewards.\nThe ability of an RL agent to learn different reward functions concurrently has many benefits, such as the decomposition of task rewards and skill reuse. One obstacle to achieving this is the amount of data required as well as the capacity of the model for solving multiple tasks. In this paper, we consider the problem of continuous control for various robot manipulation tasks with an explicit representation that promotes skill reuse while learning multiple tasks related through the reward function. Our approach relies on two key concepts: successor features (SF), a value function representation that decouples the dynamics of the environment from the rewards, and an actor-critic framework that incorporates the learned SF representations.\nWe propose a practical implementation of successor features in continuous action spaces. We first show how to learn the decomposable representation required by SF. Our proposed method is able to learn decoupled state and reward feature representations. 
We study this approach on non-trivial continuous control problems with compositional structure built into the reward functions of various tasks.", "keywords": "Reinforcement learning;multi-task learning;representation learning", "primary_area": "", "supplementary_material": "/attachment/af3190e5faf8a282ba235d900b5eaaf36d58ce1a.zip", "author": "Melissa Mozifian;Dieter Fox;David Meger;Fabio Ramos;Animesh Garg", "authorids": "~Melissa_Mozifian1;~Dieter_Fox1;~David_Meger2;~Fabio_Ramos1;~Animesh_Garg1", "gender": "F;M;M;M;M", "homepage": "https://melfm.github.io/about.html;https://homes.cs.washington.edu/~fox/;http://www.cim.mcgill.ca/~dmeger/;https://fabioramos.github.io/;http://animesh.garg.tech", "dblp": "211/6963;f/DieterFox;51/3415.html;22/2488;123/5728", "google_scholar": "sygJEU0AAAAJ;DqXsbPAAAAAJ;https://scholar.google.com.tw/citations?user=gFwEytkAAAAJ;https://scholar.google.com.au/citations?user=T_mJiHoAAAAJ;zp8V7ZMAAAAJ", "orcid": ";;;;0000-0003-0482-4296", "linkedin": ";;;fabio-ramos-3256b421/;animeshgarg/", "or_profile": "~Melissa_Mozifian1;~Dieter_Fox1;~David_Meger2;~Fabio_Ramos1;~Animesh_Garg1", "aff": "Mila;Department of Computer Science;McGill University;NVIDIA;University of Toronto", "aff_domain": "mila.quebec;cs.washington.edu;mcgill.ca;nvidia.com;toronto.edu", "position": "PhD student;Full Professor;Associate Professor;Principal Research Scientist;Assistant Professor", "bibtex": "@misc{\nmozifian2022generalizing,\ntitle={Generalizing Successor Features to continuous domains for Multi-task Learning},\nauthor={Melissa Mozifian and Dieter Fox and David Meger and Fabio Ramos and Animesh Garg},\nyear={2022},\nurl={https://openreview.net/forum?id=0m4c9ZfDrDt}\n}", "github": "", "project": "", "reviewers": "QCxi;f8Lk;Nsua", "site": "https://openreview.net/forum?id=0m4c9ZfDrDt", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;1", "empirical_novelty": "2;2;1", "wc_summary_paper": "131;82;54", "wc_summary_review": "67;38;17", "wc_main_review": "1192;446;191", "wc_review": "1390;566;262", "wc_reply_reviewers": "407;278;0", "wc_reply_authors": "773;408;266", "reply_reviewers": "1;2;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 89.0, 31.822423959633664 ], "wc_summary_review_avg": [ 40.666666666666664, 20.49932248202906 ], "wc_main_review_avg": [ 609.6666666666666, 424.72762513822386 ], "wc_review_avg": [ 739.3333333333334, 476.53564632902567 ], "wc_reply_reviewers_avg": [ 228.33333333333334, 169.8280175811858 ], "wc_reply_authors_avg": [ 482.3333333333333, 213.55145099535667 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9313838297600934426&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Mila;Unknown Institution;McGill University;NVIDIA;University of Toronto", "aff_unique_dep": "Quebec Artificial Intelligence Institute;Department of Computer Science;;NVIDIA Corporation;", "aff_unique_url": 
"https://mila.quebec;;https://www.mcgill.ca;https://www.nvidia.com;https://www.utoronto.ca", "aff_unique_abbr": "Mila;;McGill;NVIDIA;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2;0", "aff_country_unique": "Canada;;United States" }, { "id": "0n1UvVzW99x", "title": "Synthetic Reduced Nearest Neighbor Model for Regression", "track": "main", "status": "Reject", "tldr": "", "abstract": "Nearest neighbor models are among the most established and accurate approaches to machine learning. In this paper, we investigate Synthetic Reduced Nearest Neighbor (SRNN) as a novel approach to regression tasks. Existing prototype nearest neighbor models are initialized by training a k-means model over each class. However, such initialization is only applicable to classification tasks. In this work, we propose a novel initialization and expectation maximization approach for enabling the application of SRNN to regression. The proposed initialization approach is based on applying the k-means algorithm on the target responses of samples to create various clusters of targets. This is proceeded by learning several centroids in the input space for each cluster found over the targets. Essentially, the initialization consists of finding target clusters and running k-means in the space of feature vectors for the corresponding target cluster. The optimization procedure consists of applying an expectation maximization approach similar to the k-means algorithm that optimizes the centroids in the input space. This algorithm is comprised of two steps: (1) The assignment step, where assignments of the samples to each centroid is found and the target response (i.e., prediction) of each centroid is determined; and (2) the update/centroid step, where each centroid is updated such that the loss function of the entire model is minimized. We will show that the centroid step operates over all samples via solving a weighted binary classification. However, the centroid step is NP-hard and no surrogate objective function exists for solving this problem. Therefore, a new surrogate is proposed to approximate the solution for the centroid step. Furthermore, we consider the consistency of the model, and show that the model is consistent under mild assumptions. The bias-variance relationship in this model is also discussed. 
We report the empirical evaluation of the proposed SRNN regression model in comparison to several state-of-the-art techniques.", "keywords": "Regression;Nearest Neighbor;Prototype Learning;Prototype Nearest Neighbor", "primary_area": "", "supplementary_material": "/attachment/82fc463d37f2e5b06a6282b14f10605d26d13225.zip", "author": "Pooya Tavallali;Vahid Behzadan;Mukesh Singhal", "authorids": "~Pooya_Tavallali1;~Vahid_Behzadan2;~Mukesh_Singhal1", "gender": "M;M;M", "homepage": ";http://www.sail-lab.org;", "dblp": "231/7674;172/2715;s/MukeshSinghal", "google_scholar": "T2Pa1vQAAAAJ;MYMANOYAAAAJ;", "orcid": ";;", "linkedin": "pooya-tavallali-7b8949105/;vahid-behzadan/;", "or_profile": "~Pooya_Tavallali1;~Vahid_Behzadan2;~Mukesh_Singhal1", "aff": "University of California at Merced;University of New Haven;University of California at Merced", "aff_domain": "ucmerced.edu;newhaven.edu;ucmerced.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\ntavallali2022synthetic,\ntitle={Synthetic Reduced Nearest Neighbor Model for Regression},\nauthor={Pooya Tavallali and Vahid Behzadan and Mukesh Singhal},\nyear={2022},\nurl={https://openreview.net/forum?id=0n1UvVzW99x}\n}", "github": "", "project": "", "reviewers": "HzHJ;c48u;j59H;rWNi", "site": "https://openreview.net/forum?id=0n1UvVzW99x", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;4;3", "correctness": "3;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;0", "wc_summary_paper": "75;49;25;57", "wc_summary_review": "6;21;36;13", "wc_main_review": "245;365;382;757", "wc_review": "326;435;443;827", "wc_reply_reviewers": "0;32;0;0", "wc_reply_authors": "90;185;184;705", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 51.5, 17.96524422322168 ], "wc_summary_review_avg": [ 19.0, 11.157956802210698 ], "wc_main_review_avg": [ 437.25, 192.01090463825224 ], "wc_review_avg": [ 507.75, 190.0254917109807 ], "wc_reply_reviewers_avg": [ 8.0, 13.856406460551018 ], "wc_reply_authors_avg": [ 291.0, 242.11670739542117 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wIMyokvxsmMJ:scholar.google.com/&scioq=Synthetic+Reduced+Nearest+Neighbor+Model+for+Regression&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Merced;University of New Haven", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucmerced.edu;https://www.newhaven.edu", "aff_unique_abbr": "UC Merced;UNH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Merced;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "An Experimental Design Perspective on Model-Based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6697", "id": "0no8Motr-zO", "poster": "", "openreview": "https://openreview.net/forum?id=0no8Motr-zO", "slides": "https://iclr.cc/virtual/2022/poster/6697", "video": "https://iclr.cc/virtual/2022/poster/6697", "author_site": "Viraj Mehta, 
Biswajit Paria, Jeff Schneider, Stefano Ermon, Willie Neiswanger", "tldr": "", "abstract": "In many practical applications of RL, it is expensive to observe state transitions from the environment. For example, in the problem of plasma control for nuclear fusion, computing the next state for a given state-action pair requires querying an expensive transition function which can lead to many hours of computer simulation or dollars of scientific research. Such expensive data collection prohibits application of standard RL algorithms which usually require a large number of observations to learn. In this work, we address the problem of efficiently learning a policy while making a minimal number of state-action queries to the transition function. In particular, we leverage ideas from Bayesian optimal experimental design to guide the selection of state-action queries for efficient learning. We propose an \\emph{acquisition function} that quantifies how much information a state-action pair would provide about the optimal solution to a Markov decision process. At each iteration, our algorithm maximizes this acquisition function, to choose the most informative state-action pair to be queried, thus yielding a data-efficient RL approach. We experiment with a variety of simulated continuous control problems and show that our approach learns an optimal policy with up to $5$ -- $1,000\\times$ less data than model-based RL baselines and $10^3$ -- $10^5\\times$ less data than model-free RL baselines. We also provide several ablated comparisons which point to substantial improvements arising from the principled method of obtaining data.", "keywords": "reinforcement learning;acquisition function;information gain", "primary_area": "", "supplementary_material": "/attachment/0660f347dce9114bd8bb8595b1b333fdf13e182e.zip", "author": "Viraj Mehta;Biswajit Paria;Jeff Schneider;Stefano Ermon;Willie Neiswanger", "authorids": "~Viraj_Mehta1;~Biswajit_Paria1;~Jeff_Schneider1;~Stefano_Ermon1;~Willie_Neiswanger2", "gender": "M;M;;M;M", "homepage": "http://virajm.com;;https://www.cs.cmu.edu/~schneide;http://cs.stanford.edu/~ermon/;https://willieneis.github.io/", "dblp": "https://dblp.org/pers/m/Mehta:Viraj.html;166/5945;38/247;47/8135;120/7593.html", "google_scholar": "4pHjHBkAAAAJ;8tgfu84AAAAJ;3bSbb20AAAAJ;;QwKHApEAAAAJ", "orcid": "0000-0002-2021-9718;;0000-0002-5080-9073;;", "linkedin": "virajrmehta/;;jeff-schneider-1593b322/;;", "or_profile": "~Viraj_Mehta1;~Biswajit_Paria1;~Jeff_Schneider1;~Stefano_Ermon1;~Willie_Neiswanger2", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Stanford University;Stanford University", "aff_domain": "cmu.edu;cs.cmu.edu;cs.cmu.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Researcher;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nmehta2022an,\ntitle={An Experimental Design Perspective on Model-Based Reinforcement Learning},\nauthor={Viraj Mehta and Biswajit Paria and Jeff Schneider and Willie Neiswanger and Stefano Ermon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0no8Motr-zO}\n}", "github": "", "project": "", "reviewers": "AKFY;n6c6;X5up;gjNV", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;3;4;3", "correctness": "3;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "44;96;99;73", "wc_summary_review": "30;72;24;41", "wc_main_review": "949;574;185;232", "wc_review": "1023;742;308;346", 
"wc_reply_reviewers": "0;27;56;0", "wc_reply_authors": "952;711;353;397", "reply_reviewers": "0;1;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.0, 22.056745000112777 ], "wc_summary_review_avg": [ 41.75, 18.498310733685926 ], "wc_main_review_avg": [ 485.0, 307.09363392945806 ], "wc_review_avg": [ 604.75, 295.28915913050383 ], "wc_reply_reviewers_avg": [ 20.75, 23.14492384951828 ], "wc_reply_authors_avg": [ 603.25, 244.13149632933477 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3572099415190174238&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=0no8Motr-zO", "email": "cmu.edu;cs.cmu.edu;cs.cmu.edu;stanford.edu;stanford.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "Carnegie Mellon University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.stanford.edu", "aff_unique_abbr": "CMU;Stanford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "0oSM3TC9Z5a", "title": "Learning to Persuade", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the standard Bayesian persuasion model, an informed sender looks to design a signaling scheme to partially reveal the information to an uninformed receiver, so as to influence the behavior of the receiver. This kind of strategic interaction abounds in the real world. However, the standard model relies crucially on some stringent assumptions that usually do not hold in reality. For example, the sender knows the receiver's utility function and the receiver's behavior is completely rational.\n\nIn this paper, we aim to relax these assumptions using techniques from the AI domain. We put forward a framework that contains both a receiver model and a sender model. We first train a receiver model through interactions between the sender and the receiver. The model is used to predict the receiver's behavior when the sender's scheme changes. Then we update the sender model to obtain an approximately optimal scheme using the receiver model. Experiments show that our framework has comparable performance to the optimal scheme. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaodong Liu;Zhikang Fan;Xun Wang;Weiran Shen", "authorids": "~Xiaodong_Liu2;~Zhikang_Fan1;~Xun_Wang2;~Weiran_Shen1", "gender": "M;M;M;M", "homepage": ";https://fanzhikang.github.io/;;https://www.weiran-shen.info/", "dblp": ";319/0076-1;;159/2147", "google_scholar": ";https://scholar.google.com/citations?hl=en;;-lXgERkAAAAJ", "orcid": ";0000-0001-5026-2269;0000-0001-9093-7670;0000-0003-4366-9276", "linkedin": "%E6%99%93%E5%86%AC-%E5%88%98-036a75115/;;;", "or_profile": "~Xiaodong_Liu2;~Zhikang_Fan1;~Xun_Wang2;~Weiran_Shen1", "aff": "Renmin University of China;Renmin University of China;Tsinghua University;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;tsinghua.edu.cn;ruc.edu.cn", "position": "MS student;PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nliu2022learning,\ntitle={Learning to Persuade},\nauthor={Xiaodong Liu and Zhikang Fan and Xun Wang and Weiran Shen},\nyear={2022},\nurl={https://openreview.net/forum?id=0oSM3TC9Z5a}\n}", "github": "", "project": "", "reviewers": "yzkQ;oPdR;qN5Y;MTb8", "site": "https://openreview.net/forum?id=0oSM3TC9Z5a", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "5;4;4;3", "correctness": "2;4;3;4", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;1;3", "wc_summary_paper": "77;45;86;146", "wc_summary_review": "34;19;63;50", "wc_main_review": "489;283;268;419", "wc_review": "600;347;417;615", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "688;602;572;243", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 88.5, 36.52738698565776 ], "wc_summary_review_avg": [ 41.5, 16.560495161679196 ], "wc_main_review_avg": [ 364.75, 92.76953972075101 ], "wc_review_avg": [ 494.75, 115.55599292118086 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 526.25, 168.98576123448981 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:se2aQjuVo9EJ:scholar.google.com/&scioq=Learning+to+Persuade&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Renmin University of China;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "http://www.ruc.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "RUC;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "0q0REJNgtg", "title": "Retrieval-Augmented Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most deep reinforcement learning (RL) algorithms distill experience into parametric behavior policies or value functions via gradient updates. While effective, this approach has several disadvantages: (1) it is computationally expensive, (2) it can take many updates to integrate experiences into the parametric model, (3) experiences that are not fully integrated do not appropriately influence the agent's behavior, and (4) behavior is limited by the capacity of the model. 
In this paper we explore an alternative paradigm in which we train a network to map a dataset of past experiences to optimal behavior. Specifically, we augment an RL agent with a retrieval process (parameterized as a neural network) that has direct access to a dataset of experiences. This dataset can come from the agent's past experiences, expert demonstrations, or any other relevant source. The retrieval process is trained to retrieve information from the dataset that may be useful in the current context, to help the agent achieve its goal faster and more efficiently. We integrate our method into two different RL agents: an offline DQN agent and an online R2D2 agent. In offline multi-task problems, we show that the retrieval-augmented DQN agent avoids task interference and learns faster than the baseline DQN agent. On Atari, we show that retrieval-augmented R2D2 learns significantly faster than the baseline R2D2 agent and achieves higher scores. We run extensive ablations to measure the contributions of the components of our proposed method.\n", "keywords": "replay buffer;reinforcement learning;offline RL;attention", "primary_area": "", "supplementary_material": "", "author": "Anirudh Goyal;Abram L. Friesen;Theophane Weber;Andrea Banino;Nan Rosemary Ke;Adria Puigdomenech Badia;Ksenia Konyushkova;Michal Valko;Simon Osindero;Timothy P Lillicrap;Nicolas Heess;Charles Blundell", "authorids": "~Anirudh_Goyal1;~Abram_L._Friesen1;~Theophane_Weber1;~Andrea_Banino1;~Nan_Rosemary_Ke1;~Adria_Puigdomenech_Badia2;~Ksenia_Konyushkova1;~Michal_Valko1;~Simon_Osindero1;~Timothy_P_Lillicrap1;~Nicolas_Heess1;~Charles_Blundell1", "gender": "M;M;M;;F;;F;M;Non-Binary;M;;", "homepage": "https://anirudh9119.github.io/;http://www.abramfriesen.com;http://www.thphn.com/;;https://nke001.github.io/;;https://ksenia.konyushkova.com/;https://misovalko.github.io/research.html;;http://contrastiveconvergence.net/~timothylillicrap/index.php;;http://www.gatsby.ucl.ac.uk/~ucgtcbl/", "dblp": "172/1039;47/11107;;;120/5291;;127/6394;03/5455;05/5467;37/10849;76/9181;35/8396", "google_scholar": "krrh6OUAAAAJ;sfvCNiEAAAAJ;LZxqcX4AAAAJ;;https://scholar.google.ca/citations?user=dxwPYhQAAAAJ;;https://scholar.google.ch/citations?user=gTACuSgAAAAJ;jrazNCQAAAAJ;Jq8ZS5kAAAAJ;https://scholar.google.co.uk/citations?user=htPVdRMAAAAJ;79k7bGEAAAAJ;https://scholar.google.co.uk/citations?user=f31mvPsAAAAJ", "orcid": ";;;;;;;;;;;", "linkedin": ";;;;;;;michalvalko/;;;;", "or_profile": "~Anirudh_Goyal1;~Abram_L._Friesen1;~Theophane_Weber1;~Andrea_Banino1;~Nan_Rosemary_Ke1;~Adria_Puigdomenech_Badia2;~Ksenia_Konyushkova1;~Michal_Valko1;~Simon_Osindero1;~Timothy_P_Lillicrap1;~Nicolas_Heess1;~Charles_Blundell1", "aff": "University of Montreal;Google DeepMind;;;Mila;;Google DeepMind;Google DeepMind;Google;Google DeepMind;Google DeepMind;Google DeepMind", "aff_domain": "umontreal.ca;google.com;;;mila.quebec;;google.com;deepmind.com;google.com;deepmind.com;google.com;google.com", "position": "PhD student;Research Scientist;;;PhD student;;Research Scientist;Senior Staff Research Scientist;Scientist;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\ngoyal2022retrievalaugmented,\ntitle={Retrieval-Augmented Reinforcement Learning},\nauthor={Anirudh Goyal and Abram L. 
Friesen and Theophane Weber and Andrea Banino and Nan Rosemary Ke and Adria Puigdomenech Badia and Ksenia Konyushkova and Michal Valko and Simon Osindero and Timothy P Lillicrap and Nicolas Heess and Charles Blundell},\nyear={2022},\nurl={https://openreview.net/forum?id=0q0REJNgtg}\n}", "github": "", "project": "", "reviewers": "yAsc;cZ2E;LSKp;YWJ5", "site": "https://openreview.net/forum?id=0q0REJNgtg", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "137;122;48;60", "wc_summary_review": "25;108;90;71", "wc_main_review": "570;194;657;343", "wc_review": "732;424;795;474", "wc_reply_reviewers": "125;0;519;107", "wc_reply_authors": "1708;1484;2346;1020", "reply_reviewers": "1;0;3;1", "reply_authors": "5;4;7;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.75, 38.356062102358734 ], "wc_summary_review_avg": [ 73.5, 30.907118921051183 ], "wc_main_review_avg": [ 441.0, 182.9685765370655 ], "wc_review_avg": [ 606.25, 159.80046151372656 ], "wc_reply_reviewers_avg": [ 187.75, 197.12607006684834 ], "wc_reply_authors_avg": [ 1639.5, 477.439786779443 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 4.5, 1.8027756377319946 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 12, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11016479255634907533&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;1;1;1;1;1;1", "aff_unique_norm": "University of Montreal;Google;Mila", "aff_unique_dep": ";Google DeepMind;Quebec Artificial Intelligence Institute", "aff_unique_url": "https://wwwumontreal.ca;https://deepmind.com;https://mila.quebec", "aff_unique_abbr": "UM;DeepMind;Mila", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;1;1;2;1;1;1", "aff_country_unique": "Canada;United Kingdom;United States" }, { "id": "0qpEfoNObj", "title": "Weight Expansion: A New Perspective on Dropout and Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While dropout is known to be a successful regularization technique, insights into the mechanisms that lead to this success are still lacking. We introduce the concept of \u201cweight expansion\u201d, an increase in the signed volume of a parallelotope spanned by the column or row vectors of the weight covariance matrix, and show that weight expansion is an effective means of increasing the generalization in a PAC-Bayesian setting. We provide a theoretical argument that dropout leads to weight expansion and extensive experimental support for the correlation between dropout and weight expansion. To support our hypothesis that weight expansion should be regarded as the cause for the increased generalization capacity obtained by using dropout, and not just as a mere by-product, we have studied other methods that achieve weight expansion (resp. contraction), and found that they generally lead to an increased (resp. decreased) generalization ability. This suggests that dropout is an attractive regularizer because it is a computationally cheap method for obtaining weight expansion. 
This insight justifies the role of dropout as a regularizer, while paving the way for identifying regularizers that promise improved generalization through weight expansion.", "keywords": "dropout;generalization;PAC-Bayes", "primary_area": "", "supplementary_material": "/attachment/613eb9c23042befa98bc41f7a1e0694e472a6113.zip", "author": "Gaojie Jin;Xinping Yi;Pengfei Yang;Lijun Zhang;Sven Schewe;Xiaowei Huang", "authorids": "~Gaojie_Jin1;~Xinping_Yi1;~Pengfei_Yang2;~Lijun_Zhang2;~Sven_Schewe1;~Xiaowei_Huang1", "gender": "M;M;M;M;Not Specified;M", "homepage": "https://alexkael.github.io/;https://sites.google.com/site/xinpingyi00/;https://iscasmc.ios.ac.cn/?page_id=1181;;https://cgi.csc.liv.ac.uk/~sven/;https://cgi.csc.liv.ac.uk/~xiaowei/", "dblp": "276/5476;95/10043.html;;76/4015-1;38/5198.html;60/5414-1.html", "google_scholar": "n_cu7jwAAAAJ;wAcbI5kAAAAJ;;;https://scholar.google.co.uk/citations?user=CG0CxlEAAAAJ;https://scholar.google.co.uk/citations?user=X4fLCCIAAAAJ", "orcid": ";;;;0000-0002-9093-9518;", "linkedin": ";;;;;", "or_profile": "~Gaojie_Jin1;~Xinping_Yi1;~Pengfei_Yang2;~Lijun_Zhang2;~Sven_Schewe1;~Xiaowei_Huang1", "aff": "University of Liverpool;University of Liverpool;Chinese Academy of Sciences, Chinese Academy of Sciences;Chinese Academy of Sciences, Chinese Academy of Sciences;University of Liverpool;University of Liverpool", "aff_domain": "liverpool.ac.uk;liverpool.ac.uk;ios.ac.cn;ios.ac.cn;liv.ac.uk;liverpool.ac.uk", "position": "PhD student;Assistant Professor;Postdoc;Full Professor;Full Professor;Full Professor", "bibtex": "@misc{\njin2022weight,\ntitle={Weight Expansion: A New Perspective on Dropout and Generalization},\nauthor={Gaojie Jin and Xinping Yi and Pengfei Yang and Lijun Zhang and Sven Schewe and Xiaowei Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=0qpEfoNObj}\n}", "github": "", "project": "", "reviewers": "wW51;nB1c;iaGH;w6em", "site": "https://openreview.net/forum?id=0qpEfoNObj", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "128;36;49;74", "wc_summary_review": "30;27;215;30", "wc_main_review": "770;126;110;270", "wc_review": "928;189;374;374", "wc_reply_reviewers": "409;32;0;0", "wc_reply_authors": "3113;481;1427;516", "reply_reviewers": "2;1;0;0", "reply_authors": "8;2;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.75, 35.23049105533444 ], "wc_summary_review_avg": [ 75.5, 80.54967411479701 ], "wc_main_review_avg": [ 319.0, 267.7368110663904 ], "wc_review_avg": [ 466.25, 277.08335839598885 ], "wc_reply_reviewers_avg": [ 110.25, 172.97741904653336 ], "wc_reply_authors_avg": [ 1384.25, 1067.722195844968 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.5, 2.692582403567252 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1799565525421379353&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;1;0;0", "aff_unique_norm": "University of Liverpool;Chinese Academy of Sciences", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.liverpool.ac.uk;http://www.cas.cn", "aff_unique_abbr": "Liv Uni;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0", "aff_country_unique": "United Kingdom;China" }, { "title": "Neural Spectral Marked Point Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6311", "id": "0rcbOaoBXbg", "poster": "", "openreview": "https://openreview.net/forum?id=0rcbOaoBXbg", "slides": "https://iclr.cc/virtual/2022/poster/6311", "video": "https://iclr.cc/virtual/2022/poster/6311", "author_site": "Shixiang Zhu, Haoyun Wang, Zheng Dong, Xiuyuan Cheng, Yao Xie", "tldr": "", "abstract": "Self- and mutually-exciting point processes are popular models in machine learning and statistics for dependent discrete event data. To date, most existing models assume stationary kernels (including the classical Hawkes processes) and simple parametric models. Modern applications with complex event data require more general point process models that can incorporate contextual information of the events, called marks, besides the temporal and location information. Moreover, such applications often require non-stationary models to capture more complex spatio-temporal dependence. To tackle these challenges, a key question is to devise a versatile influence kernel in the point process model. In this paper, we introduce a novel and general neural network-based non-stationary influence kernel with high expressiveness for handling complex discrete events data while providing theoretical performance guarantees. We demonstrate the superior performance of our proposed method compared with the state-of-the-art on synthetic and real data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shixiang Zhu;Haoyun Wang;Zheng Dong;Xiuyuan Cheng;Yao Xie", "authorids": "~Shixiang_Zhu1;~Haoyun_Wang1;~Zheng_Dong3;~Xiuyuan_Cheng1;~Yao_Xie2", "gender": "M;F;M;;F", "homepage": "https://sites.google.com/view/woodyzhu;https://github.com/wanghy012;https://sites.google.com/view/zheng-dong/home;;http://www2.isye.gatech.edu/~yxie77", "dblp": "133/3853;;;79/9747;13/4242-2", "google_scholar": "v6_Gv6IAAAAJ;;iqZN-q4AAAAJ;I2gwdssAAAAJ;qvYp8ZQAAAAJ", "orcid": "0000-0002-2241-6096;;0000-0002-1505-8569;;", "linkedin": "shixiang-zhu-26b956a0/;;zheng-dong-23a264222/;;yaoxie/", "or_profile": "~Shixiang_Zhu1;~Haoyun_Wang1;~Zheng_Dong3;~Xiuyuan_Cheng1;~Yao_Xie2", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Duke University;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;duke.edu;gatech.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nzhu2022neural,\ntitle={Neural Spectral Marked Point Processes},\nauthor={Shixiang Zhu and Haoyun Wang and Xiuyuan Cheng and Yao Xie},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0rcbOaoBXbg}\n}", "github": "", "project": "", "reviewers": "q7Hf;CU9d;HJXe", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;0", "wc_summary_paper": "20;22;87", "wc_summary_review": "56;25;35", "wc_main_review": "144;127;152", "wc_review": "220;174;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "415;71;609", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 
5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 43.0, 31.123410267299864 ], "wc_summary_review_avg": [ 38.666666666666664, 12.918548250050733 ], "wc_main_review_avg": [ 141.0, 10.424330514074594 ], "wc_review_avg": [ 222.66666666666666, 40.868352330650936 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 365.0, 222.4649785172189 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.1147078669352809, "corr_recommendation_correctness": 0.8029550685469661, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15895838574872798573&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=0rcbOaoBXbg", "email": "gatech.edu;gatech.edu;gatech.edu;duke.edu;gatech.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Georgia Institute of Technology;Duke University", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.duke.edu", "aff_unique_abbr": "Georgia Tech;Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "0rjx6jy25R4", "title": "Classify and Generate Reciprocally: Simultaneous Positive-Unlabelled Learning and Conditional Generation with Extra Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "The scarcity of class-labeled data is a ubiquitous bottleneck in a wide range of machine learning problems. While abundant unlabeled data normally exist and provide a potential solution, it is extremely challenging to exploit them. In this paper, we address this problem by leveraging Positive-Unlabeled~(PU) classification and conditional generation with extra unlabeled data \emph{simultaneously}, both of which aim to make full use of agnostic unlabeled data to improve classification and generation performance. In particular, we present a novel training framework to jointly target both PU classification and conditional generation when exposed to extra data, especially out-of-distribution unlabeled data, by exploring the interplay between them: 1) enhancing the performance of PU classifiers with the assistance of a novel Conditional Generative Adversarial Network~(CGAN) that is robust to noisy labels, 2) leveraging extra data with predicted labels from a PU classifier to help the generation. Our key contribution is a Classifier-Noise-Invariant Conditional GAN~(CNI-CGAN) that can learn the clean data distribution from noisy labels predicted by a PU classifier. Theoretically, we prove the optimal condition of CNI-CGAN, and experimentally, we conduct extensive evaluations on diverse datasets, verifying simultaneous improvements in both classification and generation. 
", "keywords": "PU learning;Robust Generative Models;Lable noises", "primary_area": "", "supplementary_material": "", "author": "Bing Yu;Ke Sun;He Wang;Zhanxing Zhu;Zhouchen Lin", "authorids": "~Bing_Yu1;~Ke_Sun6;~He_Wang6;~Zhanxing_Zhu1;~Zhouchen_Lin1", "gender": ";M;M;M;M", "homepage": ";https://sites.google.com/view/kesun;http://drhewang.com/;https://zhanxingzhu.github.io/;https://zhouchenlin.github.io", "dblp": "47/2129;69/476-13;01/6368-2;87/7756.html;l/ZhouchenLin", "google_scholar": ";lYdNhFQAAAAJ;https://scholar.google.co.jp/citations?user=BaaPAVYAAAAJ;a2sHceIAAAAJ;https://scholar.google.com.tw/citations?user=TanjFwoAAAAJ", "orcid": ";;0000-0002-2281-5679;;0000-0003-1493-7569", "linkedin": ";;;;", "or_profile": "~Bing_Yu1;~Ke_Sun6;~He_Wang6;~Zhanxing_Zhu1;~Zhouchen_Lin1", "aff": "Peking University;University of Alberta;University of Leeds;Peking University;Peking University", "aff_domain": "pku.edu.cn;ualberta.ca;leeds.ac.uk;pku.edu.cn;pku.edu.cn", "position": "PhD student;PhD student;Associate Professor;Assistant Professor;Professor", "bibtex": "@misc{\nyu2022classify,\ntitle={Classify and Generate Reciprocally: Simultaneous Positive-Unlabelled Learning and Conditional Generation with Extra Data},\nauthor={Bing Yu and Ke Sun and He Wang and Zhanxing Zhu and Zhouchen Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=0rjx6jy25R4}\n}", "github": "", "project": "", "reviewers": "XLhs;jcSw;NqyD;EFMY", "site": "https://openreview.net/forum?id=0rjx6jy25R4", "pdf_size": 0, "recommendation": "1;5;5;5", "confidence": "4;4;3;5", "correctness": "3;3;3;4", "technical_novelty": "1;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "40;37;37;95", "wc_summary_review": "10;385;38;55", "wc_main_review": "70;54;92;210", "wc_review": "120;476;167;360", "wc_reply_reviewers": "0;124;110;0", "wc_reply_authors": "582;589;646;192", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.0, 1.7320508075688772 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.25, 24.71209218176397 ], "wc_summary_review_avg": [ 122.0, 152.69086416678635 ], "wc_main_review_avg": [ 106.5, 61.259693110560065 ], "wc_review_avg": [ 280.75, 144.20709933980365 ], "wc_reply_reviewers_avg": [ 58.5, 58.709028266528136 ], "wc_reply_authors_avg": [ 502.25, 180.8346966154449 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zTJ7IRRe_iUJ:scholar.google.com/&scioq=Classify+and+Generate+Reciprocally:+Simultaneous+Positive-Unlabelled+Learning+and+Conditional+Generation+with+Extra+Data&hl=en&as_sdt=0,5", "gs_version_total": 6, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Peking University;University of Alberta;University of Leeds", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.ualberta.ca;https://www.leeds.ac.uk", "aff_unique_abbr": "Peking U;UAlberta;Leeds", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0", "aff_country_unique": "China;Canada;United Kingdom" }, { "id": "0sEIBFb4cs", "title": "Practical Adversarial Attacks on Brain--Computer Interfaces", "track": "main", "status": 
"Reject", "tldr": "", "abstract": "Deep learning has been widely employed in brain--computer interfaces (BCIs) to decode a subject's intentions based on recorded brain activities enabling direct interaction with computers and machines. BCI systems play a crucial role in motor rehabilitation and have recently experienced a significant market boost as consumer-grade products. Recent studies have shown that deep learning-based BCIs are vulnerable to adversarial attacks. Failures in such systems might cause medical misdiagnoses, physical harm, and financial damages, hence it is of utmost importance to analyze and understand in-depth, potential malicious attacks to develop countermeasures. In this work, we present the first study that analyzes and models adversarial attacks based on physical domain constraints in EEG-based BCIs. Specifically, we assess the robustness of EEGNet which is the current state-of-the-art network for embedded BCIs. We propose new methods to induce denial-of-service attacks and incorporate domain-specific insights and constraints to accomplish two key goals: (i) create smooth adversarial attacks that are physiologically plausible; (ii) consider the realistic case where the attack happens at the origin of the signal acquisition and it propagates on the human head. Our results show that EEGNet is significantly vulnerable to adversarial attacks with an attack success rate of more than 50\\%. With our work, we want to raise awareness and incentivize future developments of proper countermeasures.", "keywords": "neuroscience;brain-computer interfaces;practical attacks;adversarial attacks;EEGNet;edge computing;embedded systems", "primary_area": "", "supplementary_material": "/attachment/20c20afa33ae5fb5df516aa69837cf85e8c2fd58.zip", "author": "Rodolfo Octavio Siller Quintanilla;Xiaying Wang;Michael Hersche;Luca Benini;Gagandeep Singh", "authorids": "~Rodolfo_Octavio_Siller_Quintanilla1;~Xiaying_Wang1;~Michael_Hersche1;~Luca_Benini1;~Gagandeep_Singh1", "gender": "M;F;M;M;M", "homepage": ";;https://research.ibm.com/people/michael-hersche--1;https://ggndpsngh.github.io/;https://ee.ethz.ch/the-department/people-a-z/person-detail.luca-benini.html", "dblp": ";229/8936;231/2484;64/3747-1;b/LucaBenini.html", "google_scholar": ";https://scholar.google.ch/citations?user=eyAg9tUAAAAJ;uhC6m3EAAAAJ;https://scholar.google.ch/citations?user=m4b2ruEAAAAJ;8riq3sYAAAAJ", "orcid": ";0000-0003-3467-5033;0000-0003-3065-7639;0000-0002-9299-2961;0000-0001-8068-3806", "linkedin": "siller-rodolfo/;;;gagandeep-singh-1bb01b49/;lubenini/", "or_profile": "~Rodolfo_Octavio_Siller_Quintanilla1;~Xiaying_Wang1;~Michael_Hersche1;~Gagandeep_Singh1;~Luca_Benini2", "aff": "University of Zurich;ETHZ - ETH Zurich;International Business Machines;University of Illinois, Urbana Champaign;University of Bologna", "aff_domain": "uzh.ch;ethz.ch;ibm.com;illinois.edu;unibo.it", "position": "MS student;PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nquintanilla2022practical,\ntitle={Practical Adversarial Attacks on Brain--Computer Interfaces},\nauthor={Rodolfo Octavio Siller Quintanilla and Xiaying Wang and Michael Hersche and Luca Benini and Gagandeep Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=0sEIBFb4cs}\n}", "github": "", "project": "", "reviewers": "n45G;VvTE;2xxQ;D9vr", "site": "https://openreview.net/forum?id=0sEIBFb4cs", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "5;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": 
"2;2;3;3", "wc_summary_paper": "89;46;56;71", "wc_summary_review": "66;210;58;74", "wc_main_review": "632;63;268;329", "wc_review": "787;319;382;474", "wc_reply_reviewers": "364;299;80;112", "wc_reply_authors": "1698;989;702;211", "reply_reviewers": "1;1;1;1", "reply_authors": "3;2;2;2", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.5, 16.224980739587952 ], "wc_summary_review_avg": [ 102.0, 62.609903369994115 ], "wc_main_review_avg": [ 323.0, 203.80260057222037 ], "wc_review_avg": [ 490.5, 179.83951178759355 ], "wc_reply_reviewers_avg": [ 213.75, 120.50388997870567 ], "wc_reply_authors_avg": [ 900.0, 538.2030286053767 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9428090415820635, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fENX7FtOFRcJ:scholar.google.com/&scioq=Practical+Adversarial+Attacks+on+Brain--Computer+Interfaces&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of Zurich;ETH Zurich;International Business Machines Corporation;University of Illinois Urbana-Champaign;University of Bologna", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.unizh.ch;https://www.ethz.ch;https://www.ibm.com;https://illinois.edu;https://www.unibo.it", "aff_unique_abbr": "UZH;ETHZ;IBM;UIUC;Unibo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;1;1;2", "aff_country_unique": "Switzerland;United States;Italy" }, { "title": "Learning Graphon Mean Field Games and Approximate Nash Equilibria", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7112", "id": "0sgntlpKDOz", "poster": "", "openreview": "https://openreview.net/forum?id=0sgntlpKDOz", "slides": "https://iclr.cc/virtual/2022/poster/7112", "video": "https://iclr.cc/virtual/2022/poster/7112", "author_site": "Kai Cui, Heinz Koeppl", "tldr": "", "abstract": "Recent advances at the intersection of dense large graph limits and mean field games have begun to enable the scalable analysis of a broad class of dynamical sequential games with large numbers of agents. So far, results have been largely limited to graphon mean field systems with continuous-time diffusive or jump dynamics, typically without control and with little focus on computational methods. We propose a novel discrete-time formulation for graphon mean field games as the limit of non-linear dense graph Markov games with weak interaction. On the theoretical side, we give extensive and rigorous existence and approximation properties of the graphon mean field solution in sufficiently large systems. On the practical side we provide general learning schemes for graphon mean field equilibria by either introducing agent equivalence classes or reformulating the graphon mean field system as a classical mean field system. By repeatedly finding a regularized optimal control solution and its generated mean field, we successfully obtain plausible approximate Nash equilibria in otherwise infeasible large dense graph games with many agents. 
Empirically, we are able to demonstrate on a number of examples that the finite-agent behavior comes increasingly close to the mean field behavior for our computed equilibria as the graph or system size grows, verifying our theory. More generally, we successfully apply policy gradient reinforcement learning in conjunction with sequential Monte Carlo methods.", "keywords": "Mean Field Games;Reinforcement Learning;Multi Agent Systems", "primary_area": "", "supplementary_material": "/attachment/2f6d5e026e1776237d60fe154e2222763248bb51.zip", "author": "Kai Cui;Heinz Koeppl", "authorids": "~Kai_Cui3;~Heinz_Koeppl1", "gender": ";M", "homepage": ";", "dblp": ";41/6084", "google_scholar": ";https://scholar.google.de/citations?user=WaPW80kAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Kai_Cui3;~Heinz_Koeppl1", "aff": ";TU Darmstadt", "aff_domain": ";tu-darmstadt.de", "position": ";Full Professor", "bibtex": "@inproceedings{\ncui2022learning,\ntitle={Learning Graphon Mean Field Games and Approximate Nash Equilibria},\nauthor={Kai Cui and Heinz Koeppl},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0sgntlpKDOz}\n}", "github": "", "project": "", "reviewers": "V2M1;GRFn;wpJP;B2QF", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;3", "correctness": "4;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "115;90;164;62", "wc_summary_review": "42;22;54;24", "wc_main_review": "297;507;493;217", "wc_review": "454;619;711;303", "wc_reply_reviewers": "0;345;0;0", "wc_reply_authors": "1311;2183;1610;358", "reply_reviewers": "0;1;0;0", "reply_authors": "2;4;3;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 107.75, 37.4991666574072 ], "wc_summary_review_avg": [ 35.5, 13.219304066402286 ], "wc_main_review_avg": [ 378.5, 124.84690624921389 ], "wc_review_avg": [ 521.75, 156.29679299333048 ], "wc_reply_reviewers_avg": [ 86.25, 149.38938215281567 ], "wc_reply_authors_avg": [ 1365.5, 660.7028454608017 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18310233350128597723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=0sgntlpKDOz", "email": ";tu-darmstadt.de", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0", "aff_campus_unique": "Darmstadt", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "id": "0uZu36la_y4", "title": "Protect the weak: Class focused online learning for adversarial training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial training promises a defense against adversarial perturbations in terms of average accuracy. In this work, we identify that the focus on the average accuracy metric can create vulnerabilities to the \"weakest\" class. 
For instance, on CIFAR10, where the average accuracy is 47%, the worst class accuracy can be as low as 14%. The performance sacrifice of the weakest class can be detrimental to real-world systems, if indeed the threat model can adversarially choose the class to attack. To this end, we propose to explicitly minimize the worst class error, which results in a min-max-max optimization formulation. We provide high-probability convergence guarantees of the worst class loss for our method, dubbed class focused online learning (CFOL), which can be plugged into existing training setups with virtually no overhead in computation. We observe significant improvements in the worst class accuracy of 30% on CIFAR10. We also observe consistent behavior across CIFAR100 and STL10. Intriguingly, we find that minimizing the worst case can sometimes even improve the average.", "keywords": "Adversarial training;Adversarial examples;Minimax;Robustness", "primary_area": "", "supplementary_material": "", "author": "Thomas Pethick;Grigorios Chrysos;Volkan Cevher", "authorids": "~Thomas_Pethick1;~Grigorios_Chrysos1;~Volkan_Cevher1", "gender": "M;M;M", "homepage": "https://pethick.dk;https://grigorisg9gr.github.io/;http://lions.epfl.ch", "dblp": "305/4521;75/6117-2;70/5301", "google_scholar": ";1bU041kAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Thomas_Pethick1;~Grigorios_Chrysos1;~Volkan_Cevher1", "aff": "Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology Lausanne;Swiss Institute of Technology", "aff_domain": "epfl.ch;epfl.ch;epfl.ch", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@misc{\npethick2022protect,\ntitle={Protect the weak: Class focused online learning for adversarial training},\nauthor={Thomas Pethick and Grigorios Chrysos and Volkan Cevher},\nyear={2022},\nurl={https://openreview.net/forum?id=0uZu36la_y4}\n}", "github": "", "project": "", "reviewers": "85vN;MFgG;mvco;wi3v", "site": "https://openreview.net/forum?id=0uZu36la_y4", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;3;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "84;87;161;74", "wc_summary_review": "80;37;117;37", "wc_main_review": "348;140;446;247", "wc_review": "512;264;724;358", "wc_reply_reviewers": "94;0;215;5", "wc_reply_authors": "1182;607;798;203", "reply_reviewers": "1;0;1;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.5, 34.6878941419049 ], "wc_summary_review_avg": [ 67.75, 33.41687447981932 ], "wc_main_review_avg": [ 295.25, 113.95037296999075 ], "wc_review_avg": [ 464.5, 174.02514186174363 ], "wc_reply_reviewers_avg": [ 78.5, 87.23101512650189 ], "wc_reply_authors_avg": [ 697.5, 352.69001970569 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sFbzKqPY0pgJ:scholar.google.com/&scioq=Protect+the+weak:+Class+focused+online+learning+for+adversarial+training&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": 
"Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.ethz.ch", "aff_unique_abbr": "EPFL;ETH Zurich", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Analytic-DPM: an Analytic Estimate of the Optimal Reverse Variance in Diffusion Probabilistic Models", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7166", "id": "0xiJLKH-ufZ", "poster": "", "openreview": "https://openreview.net/forum?id=0xiJLKH-ufZ", "slides": "https://iclr.cc/virtual/2022/poster/7166", "video": "https://iclr.cc/virtual/2022/poster/7166", "author_site": "Fan Bao, Chongxuan Li, Jun Zhu, Bo Zhang", "tldr": "", "abstract": "Diffusion probabilistic models (DPMs) represent a class of powerful generative models. Despite their success, the inference of DPMs is expensive since it generally needs to iterate over thousands of timesteps. A key problem in the inference is to estimate the variance in each timestep of the reverse process. In this work, we present a surprising result that both the optimal reverse variance and the corresponding optimal KL divergence of a DPM have analytic forms w.r.t. its score function. Building upon it, we propose \\textit{Analytic-DPM}, a training-free inference framework that estimates the analytic forms of the variance and KL divergence using the Monte Carlo method and a pretrained score-based model. Further, to correct the potential bias caused by the score-based model, we derive both lower and upper bounds of the optimal variance and clip the estimate for a better result. Empirically, our analytic-DPM improves the log-likelihood of various DPMs, produces high-quality samples, and meanwhile enjoys a $20\\times$ to $80\\times$ speed up.", "keywords": "diffusion probabilistic models;generative models", "primary_area": "", "supplementary_material": "/attachment/82cdf46f2e4104c5546cc812a2654ad6d79347f0.zip", "author": "Fan Bao;Chongxuan Li;Jun Zhu;Bo Zhang", "authorids": "~Fan_Bao1;~Chongxuan_Li1;~Jun_Zhu2;~Bo_Zhang2", "gender": "M;M;M;M", "homepage": "https://baofff.github.io/;http://ml.cs.tsinghua.edu.cn/~chongxuan;http://ml.cs.tsinghua.edu.cn/~jun;https://www.cs.tsinghua.edu.cn/csen/info/1059/4006.htm", "dblp": "71/3877;161/9965;50/2644-1;", "google_scholar": ";UKMcQn4AAAAJ;axsP38wAAAAJ;", "orcid": ";0000-0002-0912-9076;;", "linkedin": ";;;", "or_profile": "~Fan_Bao1;~Chongxuan_Li1;~Jun_Zhu2;~Bo_Zhang2", "aff": "Tsinghua University;Renmin University of China;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;ruc.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Assistant Professor;Professor;Full Professor", "bibtex": "@inproceedings{\nbao2022analyticdpm,\ntitle={Analytic-{DPM}: an Analytic Estimate of the Optimal Reverse Variance in Diffusion Probabilistic Models},\nauthor={Fan Bao and Chongxuan Li and Jun Zhu and Bo Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=0xiJLKH-ufZ}\n}", "github": "", "project": "", "reviewers": "ejLx;a5eg;Qdy8;Xtgn;FHw9", "pdf_size": 0, "recommendation": "8;8;8;8;8", "confidence": "4;4;3;3;4", "correctness": "4;4;3;4;3", "technical_novelty": "4;3;4;3;3", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "318;193;130;64;85", "wc_summary_review": "111;34;131;93;38", "wc_main_review": "477;295;212;110;398", 
"wc_review": "906;522;473;267;521", "wc_reply_reviewers": "156;157;0;0;14", "wc_reply_authors": "903;393;254;119;495", "reply_reviewers": "2;1;0;0;1", "reply_authors": "3;2;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 3.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 158.0, 91.40459507048865 ], "wc_summary_review_avg": [ 81.4, 38.99025519280426 ], "wc_main_review_avg": [ 298.4, 130.2222715206581 ], "wc_review_avg": [ 537.8, 206.70500719624573 ], "wc_reply_reviewers_avg": [ 65.4, 74.55896994996644 ], "wc_reply_authors_avg": [ 432.8, 267.20059880172425 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 383, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=799884416375929942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=0xiJLKH-ufZ", "email": "tsinghua.edu.cn;ruc.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Tsinghua University;Renmin University of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ruc.edu.cn", "aff_unique_abbr": "THU;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "0ze7XgWcYNV", "title": "Learning When and What to Ask: a Hierarchical Reinforcement Learning Framework", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reliable AI agents should be mindful of the limits of their knowledge and consult humans when sensing that they do not have sufficient knowledge to make sound decisions. We formulate a hierarchical reinforcement learning framework for learning to decide when to request additional information from humans and what type of information would be helpful to request. Our framework extends partially-observed Markov decision processes (POMDPs) by allowing an agent to interact with an assistant to leverage their knowledge in accomplishing tasks. Results on a simulated human-assisted navigation problem demonstrate the effectiveness of our framework: aided with an interaction policy learned by our method, a navigation policy achieves up to a 7\u00d7 improvement in task success rate compared to performing tasks only by itself. The interaction policy is also efficient: on average, only a quarter of all actions taken during a task execution are requests for information. 
We analyze benefits and challenges of learning with a hierarchical policy structure and suggest directions for future work.", "keywords": "human-agent interaction;reinforcement learning;navigation", "primary_area": "", "supplementary_material": "/attachment/67c73ff3486e292ca8e0574d65c60df69050edd3.zip", "author": "Khanh Xuan Nguyen;Yonatan Bisk;Hal Daum\u00e9 III", "authorids": "~Khanh_Xuan_Nguyen1;~Yonatan_Bisk1;~Hal_Daum\u00e9_III1", "gender": "M;M;M", "homepage": "http://machineslearner.com;http://www.YonatanBisk.com;http://hal3.name", "dblp": "53/6791;38/9282;77/2856.html", "google_scholar": "SmqouhIAAAAJ;bWoGh8UAAAAJ;PbEw81gAAAAJ", "orcid": ";0000-0002-2111-9081;", "linkedin": ";yonatanbisk/;", "or_profile": "~Khanh_Xuan_Nguyen1;~Yonatan_Bisk1;~Hal_Daum\u00e9_III1", "aff": "Department of Computer Science, University of Maryland, College Park;Meta;Microsoft", "aff_domain": "cs.umd.edu;meta.com;microsoft.com", "position": "PhD student;Visiting Professor;Senior Principle Researcher", "bibtex": "@misc{\nnguyen2022learning,\ntitle={Learning When and What to Ask: a Hierarchical Reinforcement Learning Framework},\nauthor={Khanh Xuan Nguyen and Yonatan Bisk and Hal Daum{\\'e} III},\nyear={2022},\nurl={https://openreview.net/forum?id=0ze7XgWcYNV}\n}", "github": "", "project": "", "reviewers": "LvSE;9mjj;rgjt", "site": "https://openreview.net/forum?id=0ze7XgWcYNV", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;3;4", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "1;1;3", "wc_summary_paper": "77;161;176", "wc_summary_review": "24;145;91", "wc_main_review": "678;633;398", "wc_review": "779;939;665", "wc_reply_reviewers": "0;0;51", "wc_reply_authors": "758;985;333", "reply_reviewers": "0;0;2", "reply_authors": "1;2;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "wc_summary_paper_avg": [ 138.0, 43.56604182158393 ], "wc_summary_review_avg": [ 86.66666666666667, 49.492984912566705 ], "wc_main_review_avg": [ 569.6666666666666, 122.76897907135263 ], "wc_review_avg": [ 794.3333333333334, 112.38426145249265 ], "wc_reply_reviewers_avg": [ 17.0, 24.041630560342615 ], "wc_reply_authors_avg": [ 692.0, 270.2381665617695 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8110710953313731815&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Maryland, College Park;Meta;Microsoft", "aff_unique_dep": "Department of Computer Science;Meta Platforms, Inc.;Microsoft Corporation", "aff_unique_url": "https://www/umd.edu;https://meta.com;https://www.microsoft.com", "aff_unique_abbr": "UMD;Meta;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "1-58A45OkER", "title": "Delving into Channels: Exploring Hyperparameter Space of Channel Bit Widths with Linear Complexity", "track": "main", "status": "Withdraw", "tldr": "", "abstract": 
"Allocating different bit widths to different channels and quantizing them independently bring higher quantization precision and accuracy. Most of prior works use equal bit width to quantize all layers or channels, which is sub-optimal. On the other hand, it is very challenging to explore the hyperparameter space of channel bit widths, as the search space increases exponentially as the number of channels, which could be tens of thousand in a deep neural network. In this paper, we address an important problem of efficiently exploring the hyperparameter space of channel bit widths. We formulate the quantization of deep neural networks as a rate-distortion optimization problem, and present an ultra-fast algorithm to search the bit allocation of channels. Our approach has only linear time complexity and can find the optimal bit allocation within a few minutes on CPU. In addition, we provide an effective way to improve the performance on target hardware platforms. We restrict the bit rate (size) of each layer to allow as many weights and activations as possible to be stored on-chip, and incorporate hardware-aware constraints into our objective function. The hardware-aware constraints do not cause additional overhead to optimization, and have very positive impact on hardware performance. Experimental results show that our approach achieves state-of-the-art quantization results on four deep neural networks, ResNet-18, ResNet-34, ResNet-50, and MobileNet-v2, on ImageNet. Hardware simulation results demonstrate that our approach is able to bring up to 3.5x and 3.0x speedup on two deep-learning accelerators, TPU and Eyeriss, respectively.", "keywords": "Deep Learning;Neural Network Compression;Rate-Distortion Theories", "primary_area": "", "supplementary_material": "", "author": "Zhe Wang;Jie Lin;Xue Geng;Mohamed M. Sabry Aly;Vijay Chandrasekhar", "authorids": "~Zhe_Wang12;~Jie_Lin1;~Xue_Geng1;~Mohamed_M._Sabry_Aly1;vijay.cmu@gmail.com", "gender": "M;M;F;M;", "homepage": "https://www.linkedin.com/in/wangzhemark/?originalSubdomain=sg;;;;", "dblp": ";88/6731;149/3281;;", "google_scholar": "Xqu6fAkAAAAJ;;ZYVZ1bgAAAAJ;eCo7XWkAAAAJ;", "orcid": ";;;;", "linkedin": ";;xue-geng-9963b95a/;mohamed-m-sabry-aly/;", "or_profile": "~Zhe_Wang12;~Jie_Lin1;~Xue_Geng1;~Mohamed_M._Sabry_Aly1;vijay.cmu@gmail.com", "aff": ", A*STAR;I2R, A*STAR;Institute for Infocomm Research, A*STAR;Nanyang Technological University;", "aff_domain": "i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;ntu.edu.sg;", "position": "Researcher;Research Scientist;Research Scientist;Assistant Professor;", "bibtex": "@misc{\nwang2022delving,\ntitle={Delving into Channels: Exploring Hyperparameter Space of Channel Bit Widths with Linear Complexity},\nauthor={Zhe Wang and Jie Lin and Xue Geng and Mohamed M. 
Sabry Aly and Vijay Chandrasekhar},\nyear={2022},\nurl={https://openreview.net/forum?id=1-58A45OkER}\n}", "github": "", "project": "", "reviewers": "QydC;x7PX;Yfjg;XY9L", "site": "https://openreview.net/forum?id=1-58A45OkER", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;5;5;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "0;1;2;0", "wc_summary_paper": "67;59;42;110", "wc_summary_review": "69;27;36;101", "wc_main_review": "362;328;327;186", "wc_review": "498;414;405;397", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 69.5, 25.064915718988566 ], "wc_summary_review_avg": [ 58.25, 29.217931138258233 ], "wc_main_review_avg": [ 300.75, 67.73247005683463 ], "wc_review_avg": [ 428.5, 40.574006457336694 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:loC_GvULl40J:scholar.google.com/&scioq=Delving+into+Channels:+Exploring+Hyperparameter+Space+of+Channel+Bit+Widths+with+Linear+Complexity&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "A*STAR;Institute for Infocomm Research;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.a-star.edu.sg;https://www.i2r.a-star.edu.sg;https://www.ntu.edu.sg", "aff_unique_abbr": "A*STAR;I2R;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "id": "1-YP2squpa7", "title": "Deep learning via message passing algorithms based on belief propagation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Message-passing algorithms based on the Belief Propagation (BP) equations constitute a well-known distributed computational scheme. It is exact on tree-like graphical models and has also proven to be effective in many problems defined on graphs with loops (from inference to optimization, from signal processing to clustering). \nThe BP-based scheme is fundamentally different from stochastic gradient descent (SGD), on which the current success of deep networks is based. In this paper, we present and adapt to mini-batch training on GPUs a family of BP-based message-passing algorithms with a reinforcement field that biases distributions towards locally entropic solutions.\nThese algorithms are capable of training multi-layer neural networks with discrete weights and activations with performance comparable to SGD-inspired heuristics (BinaryNet) and are naturally well-adapted to continual learning. 
Furthermore, using these algorithms to estimate the marginals of the weights allows us to make approximate Bayesian predictions that have higher accuracy than point-wise solutions.", "keywords": "belief propagation;neural networks;graphical models;gradient-free algorithms;discrete neural networks", "primary_area": "", "supplementary_material": "", "author": "Fabrizio Pittorino;Carlo Lucibello;Gabriele Perugini;Riccardo Zecchina", "authorids": "~Fabrizio_Pittorino1;~Carlo_Lucibello1;~Gabriele_Perugini1;~Riccardo_Zecchina1", "gender": "M;M;;M", "homepage": "http://didattica.unibocconi.it/docenti/cv.php?rif=214098;;https://www.artlab.unibocconi.eu/wps/wcm/connect/Cdr/Artlab/Home/;", "dblp": "241/6324.html;145/7395;;01/2463", "google_scholar": "https://scholar.google.it/citations?user=sT_qeloAAAAJ;;;https://scholar.google.it/citations?user=fNOReswAAAAJ", "orcid": ";0000-0003-0837-9783;;0000-0002-1221-5207", "linkedin": "fabrizio-pittorino-80045593/?originalSubdomain=it;;;", "or_profile": "~Fabrizio_Pittorino1;~Carlo_Lucibello1;~Gabriele_Perugini1;~Riccardo_Zecchina1", "aff": "Bocconi University;Bocconi University;;Bocconi University", "aff_domain": "unibocconi.it;unibocconi.it;;unibocconi.it", "position": "Postdoc;Assistant Professor;;Full Professor", "bibtex": "@misc{\npittorino2022deep,\ntitle={Deep learning via message passing algorithms based on belief propagation},\nauthor={Fabrizio Pittorino and Carlo Lucibello and Gabriele Perugini and Riccardo Zecchina},\nyear={2022},\nurl={https://openreview.net/forum?id=1-YP2squpa7}\n}", "github": "", "project": "", "reviewers": "jTGp;nNGL;tYY4;SnTy", "site": "https://openreview.net/forum?id=1-YP2squpa7", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "48;136;30;64", "wc_summary_review": "37;184;23;64", "wc_main_review": "322;1443;198;333", "wc_review": "407;1763;251;461", "wc_reply_reviewers": "0;792;0;0", "wc_reply_authors": "849;2620;207;829", "reply_reviewers": "0;1;0;0", "reply_authors": "2;4;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.5, 40.233692348577705 ], "wc_summary_review_avg": [ 77.0, 63.50984175700645 ], "wc_main_review_avg": [ 574.0, 504.51015847057033 ], "wc_review_avg": [ 720.5, 606.8070121546059 ], "wc_reply_reviewers_avg": [ 198.0, 342.9460598986377 ], "wc_reply_authors_avg": [ 1126.25, 900.213134485384 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8006407690254357, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15880817913380830804&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;0", "aff_unique_norm": "Bocconi University", "aff_unique_dep": "", "aff_unique_url": "https://www.bocconi.edu", "aff_unique_abbr": "Bocconi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "id": "1-lFH8oYTI", "title": "Calibration Regularized Training of Deep Neural Networks using Kernel Density Estimation", "track": "main", "status": "Reject", "tldr": 
"", "abstract": "Calibrated probabilistic classifiers are models whose predicted probabilities can directly be interpreted as uncertainty estimates. This property is particularly important in safety-critical applications such as medical diagnosis or autonomous driving. However, it has been shown recently that deep neural networks are poorly calibrated and tend to output overconfident predictions. As a remedy, we propose a trainable calibration error estimator based on Dirichlet kernel density estimates, which asymptotically converges to the true Lp calibration error. This novel estimator enables us to achieve the strongest notion of multiclass calibration, called canonical calibration, while other common calibration methods only allow for top-label and marginal calibration. The empirical results show that our estimator is competitive with the state-of-the-art, consistently yielding tradeoffs between calibration error and accuracy that are (near) Pareto optimal across a range of network architectures. The computational complexity of our estimator is O(n^2), matching that of the kernel maximum mean discrepancy, used in a previously considered trainable calibration estimator. By contrast, the proposed method has a natural choice of kernel, and can be used to generate consistent estimates of other quantities based on conditional expectation, such as the sharpness of an estimator.", "keywords": "calibration;dirichlet kernel density estimation", "primary_area": "", "supplementary_material": "", "author": "Teodora Popordanoska;Raphael Sayer;Matthew B. Blaschko", "authorids": "~Teodora_Popordanoska1;~Raphael_Sayer1;~Matthew_B._Blaschko1", "gender": ";;M", "homepage": "https://tpopordanoska.github.io/;https://www.esat.kuleuven.be/psi/members/00144708;http://homes.esat.kuleuven.be/~mblaschk/", "dblp": "270/8007;;12/5233", "google_scholar": "B2YV6zIAAAAJ;;EmmO7LcAAAAJ", "orcid": "0000-0002-1436-2286;;0000-0002-2640-181X", "linkedin": "tpopordanoska/;;matthew-blaschko-5b7a51b0/", "or_profile": "~Teodora_Popordanoska1;~Raphael_Sayer1;~Matthew_Blaschko1", "aff": "KU Leuven;KU Leuven;KU Leuven", "aff_domain": "kuleuven.be;kuleuven.be;esat.kuleuven.be", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\npopordanoska2022calibration,\ntitle={Calibration Regularized Training of Deep Neural Networks using Kernel Density Estimation},\nauthor={Teodora Popordanoska and Raphael Sayer and Matthew B. 
Blaschko},\nyear={2022},\nurl={https://openreview.net/forum?id=1-lFH8oYTI}\n}", "github": "", "project": "", "reviewers": "6Gih;HhAZ;JEjT;qGbY", "site": "https://openreview.net/forum?id=1-lFH8oYTI", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "5;4;3;3", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "38;53;73;92", "wc_summary_review": "32;59;13;106", "wc_main_review": "305;614;101;294", "wc_review": "375;726;187;492", "wc_reply_reviewers": "0;0;0;54", "wc_reply_authors": "455;934;346;764", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;2;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.0, 20.38381711063951 ], "wc_summary_review_avg": [ 52.5, 34.9463875100131 ], "wc_main_review_avg": [ 328.5, 183.71785433103665 ], "wc_review_avg": [ 445.0, 195.34200777098613 ], "wc_reply_reviewers_avg": [ 13.5, 23.382685902179844 ], "wc_reply_authors_avg": [ 624.75, 235.34163996199229 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13719601430952975869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Katholieke Universiteit Leuven", "aff_unique_dep": "", "aff_unique_url": "https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Belgium" }, { "id": "11PMuvv3tEO", "title": "Lagrangian Generative Adversarial Imitation Learning with Safety", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Imitation Learning (IL) concentrates merely on reproducing expert behaviors and could take dangerous actions, which is unacceptable in safety-critical scenarios. In this work, we first formalize a practical task of safe imitation learning (Safe IL), which has long been neglected. Taking safety into consideration, we augment Generative Adversarial Imitation Learning (GAIL) with safety constraints and then relax it into an unconstrained saddle point problem by utilizing a Lagrange multiplier, dubbed LGAIL. Then, we apply a two-stage optimization framework to solve LGAIL. Specifically, a discriminator is first optimized to measure the similarity between the agent-generated state-action pairs and the expert ones, and then forward reinforcement learning is employed to improve the similarity while considering safety concerns via a Lagrange multiplier. In addition, we provide a theoretical interpretation of LGAIL, which indicates that the proposed LGAIL can be guaranteed to learn a safe policy from unsafe expert data.
Finally, extensive experiments in OpenAI Safety Gym demonstrate the effectiveness of our approach.", "keywords": "safe imitation learning;inverse reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Zhihao Cheng;Li Shen;Meng Fang;Liu Liu;Dacheng Tao", "authorids": "~Zhihao_Cheng1;~Li_Shen1;~Meng_Fang1;~Liu_Liu8;~Dacheng_Tao1", "gender": "M;M;M;F;", "homepage": "https://www.researchgate.net/profile/Zhihao_Cheng5;https://sites.google.com/site/mathshenli/home;;;", "dblp": ";91/3680-8;67/463;74/7037-14;", "google_scholar": ";yVhgENIAAAAJ;IcNYP1oAAAAJ;FvGjCqEAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Zhihao_Cheng1;~Li_Shen1;~Meng_Fang1;~Liu_Liu8;~Dacheng_Tao1", "aff": "University of Sydney;JD Explore Academy;Eindhoven University of Technology;University of Sydney;", "aff_domain": "sydney.edu.au;jd.com;tue.nl;sydney.edu.au;", "position": "PhD student;Researcher;Assistant Professor;Postdoc;", "bibtex": "@misc{\ncheng2022lagrangian,\ntitle={Lagrangian Generative Adversarial Imitation Learning with Safety},\nauthor={Zhihao Cheng and Li Shen and Meng Fang and Liu Liu and Dacheng Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=11PMuvv3tEO}\n}", "github": "", "project": "", "reviewers": "2nXZ;WRX9;e1bo;o4nw", "site": "https://openreview.net/forum?id=11PMuvv3tEO", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;5;4", "correctness": "3;2;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "67;79;58;89", "wc_summary_review": "113;116;73;43", "wc_main_review": "51;2065;411;536", "wc_review": "231;2260;542;668", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "86;281;28;23", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.25, 11.755317945508747 ], "wc_summary_review_avg": [ 86.25, 30.194163343268844 ], "wc_main_review_avg": [ 765.75, 770.965425619074 ], "wc_review_avg": [ 925.25, 786.8606531654763 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 104.5, 104.86777388692867 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14531281765424829500&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Sydney;JD;Eindhoven University of Technology", "aff_unique_dep": ";JD Explore Academy;", "aff_unique_url": "https://www.sydney.edu.au;;https://www.tue.nl", "aff_unique_abbr": "USYD;;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0", "aff_country_unique": "Australia;;Netherlands" }, { "id": "11aY89G7YY4", "title": "Data-centric Semi-supervised Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study unsupervised data selection for semi-supervised learning (SSL), where large-scale unlabeled data is available and a small subset of data is budgeted for label acquisition.
Existing SSL methods focus on learning a model that effectively integrates information from given small labeled data and large unlabeled data, whereas we focus on selecting the right data for SSL without any label or task information, also in stark contrast to supervised data selection for active learning. Intuitively, instances to be labeled should collectively have maximum diversity and coverage for downstream tasks, and individually have maximum information propagation utility for SSL. We formalize these concepts in a three-step data-centric SSL method that improves FixMatch in stability and accuracy by 8% on CIFAR-10 (0.08% labeled) and 14% on ImageNet-1K (0.2% labeled). Our work demonstrates that a small amount of compute spent on careful labeled data selection brings large gains in annotation efficiency and model performance without changing the learning pipeline. Our completely unsupervised data selection can be easily extended to other weakly supervised learning settings.", "keywords": "Data-centric;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Xudong Wang;Long Lian;Stella Yu", "authorids": "~Xudong_Wang4;~Long_Lian1;~Stella_Yu2", "gender": "M;M;F", "homepage": "http://people.eecs.berkeley.edu/~xdwang/;https://github.com/TonyLianLong;http://www.eecs.umich.edu/~stellayu", "dblp": ";276/0012;58/5089", "google_scholar": "Azf07WcAAAAJ;eOLxyqUAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0001-6098-189X;", "linkedin": ";longlian/;", "or_profile": "~Xudong_Wang4;~Long_Lian1;~Stella_Yu2", "aff": "FAIR Labs, Meta;University of California, Berkeley;University of California, Berkeley", "aff_domain": "fb.com;berkeley.edu;berkeley.edu", "position": "Research Intern;Undergrad student;Director, ICSI Vision Group", "bibtex": "@misc{\nwang2022datacentric,\ntitle={Data-centric Semi-supervised Learning},\nauthor={Xudong Wang and Long Lian and Stella Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=11aY89G7YY4}\n}", "github": "", "project": "", "reviewers": "vCLz;cDfo;yvvq;E4Gx", "site": "https://openreview.net/forum?id=11aY89G7YY4", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "54;46;89;117", "wc_summary_review": "35;41;33;79", "wc_main_review": "316;297;263;300", "wc_review": "405;384;385;496", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 76.5, 28.429737951659 ], "wc_summary_review_avg": [ 47.0, 18.708286933869708 ], "wc_main_review_avg": [ 294.0, 19.300259065618782 ], "wc_review_avg": [ 417.5, 46.08958667638494 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;1", "aff_unique_norm": "Meta;University of California, Berkeley", "aff_unique_dep": "FAIR Labs;", "aff_unique_url": "https://meta.com;https://www.berkeley.edu", "aff_unique_abbr": "Meta;UC Berkeley", "aff_campus_unique_index": "1;1",
"aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Out-of-distribution Generalization in the Presence of Nuisance-Induced Spurious Correlations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6290", "id": "12RoR2o32T", "poster": "", "openreview": "https://openreview.net/forum?id=12RoR2o32T", "slides": "https://iclr.cc/virtual/2022/poster/6290", "video": "https://iclr.cc/virtual/2022/poster/6290", "author_site": "Aahlad Puli, Lily Zhang, Eric Oermann, Rajesh Ranganath", "tldr": "", "abstract": "In many prediction problems, spurious correlations are induced by a changing relationship between the label and a nuisance variable that is also correlated with the covariates. For example, in classifying animals in natural images, the background, which is a nuisance, can predict the type of animal. This nuisance-label relationship does not always hold, and the performance of a model trained under one such relationship may be poor on data with a different nuisance-label relationship. To build predictive models that perform well regardless of the nuisance-label relationship, we develop Nuisance-Randomized Distillation (NURD). We introduce the nuisance-randomized distribution, a distribution where the nuisance and the label are independent. Under this distribution, we define the set of representations such that conditioning on any member, the nuisance and the label remain independent. We prove that the representations in this set always perform better than chance, while representations outside of this set may not. NURD finds a representation from this set that is most informative of the label under the nuisance-randomized distribution, and we prove that this representation achieves the highest performance regardless of the nuisance-label relationship. 
We evaluate NURD on several tasks including chest X-ray classification where, using non-lung patches as the nuisance, NURD produces models that predict pneumonia under strong spurious correlations.", "keywords": "spurious correlations;out of distribution generalization;ml for health;representation learning", "primary_area": "", "supplementary_material": "", "author": "Aahlad Manas Puli;Lily H Zhang;Eric Karl Oermann;Rajesh Ranganath", "authorids": "~Aahlad_Manas_Puli1;~Lily_H_Zhang1;~Eric_Karl_Oermann1;~Rajesh_Ranganath2", "gender": "M;F;M;", "homepage": "http://aahladmanas.github.io;https://lhz1029.github.io/;https://www.nyuolab.org;", "dblp": "228/9272;267/6682;https://dblp.uni-trier.de/pers/hd/o/Oermann:Eric_K=;97/7057", "google_scholar": "xWmCmBQAAAAJ;fmCi9ZQAAAAJ;GQum-K4AAAAJ;", "orcid": ";;0000-0002-1876-5963;", "linkedin": ";;eric-oermann-b829528/;", "or_profile": "~Aahlad_Manas_Puli1;~Lily_H_Zhang1;~Eric_Karl_Oermann1;~Rajesh_Ranganath2", "aff": "New York University;New York University;New York University;New York University", "aff_domain": "nyu.edu;nyu.edu;nyu.edu;nyu.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\npuli2022outofdistribution,\ntitle={Out-of-distribution Generalization in the Presence of Nuisance-Induced Spurious Correlations},\nauthor={Aahlad Manas Puli and Lily H Zhang and Eric Karl Oermann and Rajesh Ranganath},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=12RoR2o32T}\n}", "github": "", "project": "", "reviewers": "Vct3;frmn;2F1e;mFHY;S6jJ", "pdf_size": 0, "recommendation": "5;5;6;8;8", "confidence": "5;4;3;3;3", "correctness": "2;2;3;4;4", "technical_novelty": "2;2;3;3;4", "empirical_novelty": "2;3;3;2;4", "wc_summary_paper": "60;49;92;97;68", "wc_summary_review": "31;26;27;90;116", "wc_main_review": "234;312;410;438;765", "wc_review": "325;387;529;625;949", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 6.4, 1.3564659966250536 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 73.2, 18.475930287809597 ], "wc_summary_review_avg": [ 58.0, 37.688194438046516 ], "wc_main_review_avg": [ 431.8, 181.66606727729865 ], "wc_review_avg": [ 563.0, 219.8253852492928 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7740702698132101, "corr_recommendation_correctness": 0.9890707100936805, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11021328735736547096&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=12RoR2o32T", "email": "nyu.edu;nyu.edu;nyu.edu;nyu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Generalized Weighted Optimization Method for Computational Learning and Inversion", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/7110", "id": "14F3fI6MGxX", "poster": "", "openreview": "https://openreview.net/forum?id=14F3fI6MGxX", "slides": "https://iclr.cc/virtual/2022/poster/7110", "video": "https://iclr.cc/virtual/2022/poster/7110", "author_site": "Kui Ren, Yunan Yang, Bj\u00f6rn Engquist", "tldr": "", "abstract": "The generalization capacity of various machine learning models exhibits different phenomena in the under- and over-parameterized regimes. In this paper, we focus on regression models such as feature regression and kernel regression and analyze a generalized weighted least-squares optimization method for computational learning and inversion with noisy data. The highlight of the proposed framework is that we allow weighting in both the parameter space and the data space. The weighting scheme encodes both a priori knowledge on the object to be learned and a strategy to weight the contribution of different data points in the loss function. Here, we characterize the impact of the weighting scheme on the generalization error of the learning method, where we derive explicit generalization errors for the random Fourier feature model in both the under- and over-parameterized regimes. For more general feature maps, error bounds are provided based on the singular values of the feature matrix. We demonstrate that appropriate weighting from prior knowledge can improve the generalization capability of the learned model.", "keywords": "weighted optimization;generalization error;feature regression;machine learning", "primary_area": "", "supplementary_material": "", "author": "Kui Ren;Yunan Yang;Bj\u00f6rn Engquist", "authorids": "~Kui_Ren2;~Yunan_Yang1;engquist@oden.utexas.edu", "gender": ";F;", "homepage": "http://www.columbia.edu/~kr2002/;https://yunany.github.io/;", "dblp": ";207/9063;", "google_scholar": ";Db_1B18AAAAJ;", "orcid": ";0000-0001-7238-7978;", "linkedin": ";;", "or_profile": "~Kui_Ren2;~Yunan_Yang1;engquist@oden.utexas.edu", "aff": "University of Texas at Austin;ETHZ - ETH Zurich;", "aff_domain": "utexas.edu;ethz.ch;", "position": "Assistant Professor;Postdoc;", "bibtex": "@inproceedings{\nren2022a,\ntitle={A Generalized Weighted Optimization Method for Computational Learning and Inversion},\nauthor={Kui Ren and Yunan Yang and Bj{\\\"o}rn Engquist},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=14F3fI6MGxX}\n}", "github": "", "project": "", "reviewers": "vfX6;WRNZ;muho;Koia;XgS8", "pdf_size": 0, "recommendation": "5;6;6;6;6", "confidence": "4;4;2;2;4", "correctness": "4;4;3;4;4", "technical_novelty": "3;4;3;3;3", "empirical_novelty": "3;4;0;0;0", "wc_summary_paper": "59;37;23;38;112", "wc_summary_review": "37;35;16;18;87", "wc_main_review": "298;122;97;113;146", "wc_review": "394;194;136;169;345", "wc_reply_reviewers": "113;0;0;0;0", "wc_reply_authors": "1158;824;386;461;636", "reply_reviewers": "1;0;0;0;0", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 5.8, 0.39999999999999997 ], "confidence_avg": [ 3.2, 0.9797958971132712 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 3.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 1.4, 1.7435595774162693 ], "wc_summary_paper_avg": [ 53.8, 31.288336485022658 ], "wc_summary_review_avg": [ 38.6, 25.663982543634962 ], "wc_main_review_avg": [ 155.2, 73.13931911085855 ], "wc_review_avg": [ 247.6, 102.39648431464823 ], "wc_reply_reviewers_avg": [ 22.6, 45.2 ], "wc_reply_authors_avg": [ 693.0, 277.38348905441364 ], 
"reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": -0.25000000000000006, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=330645139126087307&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=14F3fI6MGxX", "email": "utexas.edu;ethz.ch;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Texas at Austin;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.ethz.ch", "aff_unique_abbr": "UT Austin;ETHZ", "aff_campus_unique_index": "0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Switzerland" }, { "id": "14kbUbOaZUc", "title": "Metric Learning on Temporal Graphs via Few-Shot Examples", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph metric learning methods aim to learn the distance metric over graphs such that similar graphs are closer and dissimilar graphs are farther apart. This is of critical importance in many graph classification applications such as drug discovery and epidemics categorization. In many real-world applications, the graphs are typically evolving over time; labeling graph data is usually expensive and also requires background knowledge. However, state-of-the-art graph metric learning techniques consider the input graph as static, and largely ignore the intrinsic dynamics of temporal graphs; Furthermore, most of these techniques require abundant labeled examples for training in the representation learning process. To address the two aforementioned problems, we wish to learn a distance metric only over fewer temporal graphs, which metric could not only help accurately categorize seen temporal graphs but also be adapted smoothly to unseen temporal graphs. In this paper, we first propose the streaming-snapshot model to describe temporal graphs on different time scales. Then we propose the MetaTag framework: 1) to learn the metric over a limited number of streaming-snapshot modeled temporal graphs, 2) and adapt the learned metric to unseen temporal graphs via a few examples. 
Finally, we demonstrate the performance of MetaTag in comparison with state-of-the-art algorithms for temporal graph classification problems.", "keywords": "Metric Learning;Few-Shot Learning;Temporal Graph", "primary_area": "", "supplementary_material": "/attachment/6241528a4dbb04285173b36cc9f40052912ce823.zip", "author": "Dongqi Fu;Liri Fang;Ross Maciejewski;Vetle I Torvik;Jingrui He", "authorids": "~Dongqi_Fu1;~Liri_Fang1;~Ross_Maciejewski1;~Vetle_I_Torvik1;~Jingrui_He1", "gender": "M;;M;;F", "homepage": "https://dongqifu.github.io/;;http://rmaciejewski.faculty.asu.edu/;;https://www.hejingrui.org", "dblp": "273/0228;;81/5349.html;;34/2685", "google_scholar": "WByXZAcAAAAJ;;https://scholar.google.com.tw/citations?user=nChgOjEAAAAJ;;hXpZynkAAAAJ", "orcid": "0000-0002-8726-9234;;;;0000-0002-6429-6272", "linkedin": ";;;;", "or_profile": "~Dongqi_Fu1;~Liri_Fang1;~Ross_Maciejewski1;~Vetle_I_Torvik1;~Jingrui_He1", "aff": "University of Illinois, Urbana Champaign;;Arizona State University;;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;;asu.edu;;illinois.edu", "position": "PhD student;;Full Professor;;Associate Professor", "bibtex": "@misc{\nfu2022metric,\ntitle={Metric Learning on Temporal Graphs via Few-Shot Examples},\nauthor={Dongqi Fu and Liri Fang and Ross Maciejewski and Vetle I Torvik and Jingrui He},\nyear={2022},\nurl={https://openreview.net/forum?id=14kbUbOaZUc}\n}", "github": "", "project": "", "reviewers": "5yGt;jL2u;Qtrm;sLfA", "site": "https://openreview.net/forum?id=14kbUbOaZUc", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;3;3", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "95;326;22;68", "wc_summary_review": "49;91;39;73", "wc_main_review": "375;767;218;457", "wc_review": "519;1184;279;598", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 127.75, 117.39756172936472 ], "wc_summary_review_avg": [ 63.0, 20.346989949375804 ], "wc_main_review_avg": [ 454.25, 199.94671165087962 ], "wc_review_avg": [ 645.0, 332.6266676019829 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3dVjrnDt4aAJ:scholar.google.com/&scioq=Metric+Learning+on+Temporal+Graphs+via+Few-Shot+Examples&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Arizona State University", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.asu.edu", "aff_unique_abbr": "UIUC;ASU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Online Ad Hoc Teamwork under Partial Observability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7013", "id": "18Ys0-PzyPI", "poster": "", "openreview": "https://openreview.net/forum?id=18Ys0-PzyPI", "slides": 
"https://iclr.cc/virtual/2022/poster/7013", "video": "https://iclr.cc/virtual/2022/poster/7013", "author_site": "Pengjie Gu, Mengchen Zhao, Jianye HAO, Bo An", "tldr": "", "abstract": "Autonomous agents often need to work together as a team to accomplish complex cooperative tasks. Due to privacy and other realistic constraints, agents might need to collaborate with previously unknown teammates on the fly. This problem is known as ad hoc teamwork, which remains a core research challenge. Prior works usually rely heavily on strong assumptions like full observability, fixed and predefined teammates' types. This paper relaxes these assumptions with a novel reinforcement learning framework called ODITS, which allows the autonomous agent to adapt to arbitrary teammates in an online fashion. Instead of limiting teammates into a finite set of predefined types, ODITS automatically learns latent variables of teammates' behaviors to infer how to cooperate with new teammates effectively. To overcome partial observability, we introduce an information-based regularizer to derive proxy representations of the learned variables from local observations. Extensive experimental results show that ODITS significantly outperforms various baselines in widely used ad hoc teamwork tasks.", "keywords": "coordination;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Pengjie Gu;Mengchen Zhao;Jianye Hao;Bo An", "authorids": "~Pengjie_Gu1;~Mengchen_Zhao1;haojianye@huawei.com;~Bo_An2", "gender": "M;M;;M", "homepage": ";https://batmanzzmc.github.io/;;https://personal.ntu.edu.sg/boan/", "dblp": "226/1222;178/8719;;42/6178-1.html", "google_scholar": ";nLgORGMAAAAJ;;PEEpuNwAAAAJ", "orcid": ";;;0000-0002-7064-7438", "linkedin": ";;;", "or_profile": "~Pengjie_Gu1;~Mengchen_Zhao1;haojianye@huawei.com;~Bo_An2", "aff": "Nanyang Technological University;Huawei Noah's Ark Lab;;Nanyang Technological University", "aff_domain": "ntu.edu.sg;huawei.com;;ntu.edu.sg", "position": "PhD student;Research Scientist;;Full Professor", "bibtex": "@inproceedings{\ngu2022online,\ntitle={Online Ad Hoc Teamwork under Partial Observability},\nauthor={Pengjie Gu and Mengchen Zhao and Jianye Hao and Bo An},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=18Ys0-PzyPI}\n}", "github": "", "project": "", "reviewers": "v5Yz;7i5n;Z8fc;X2GL", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;2;4", "correctness": "3;4;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "101;115;80;323", "wc_summary_review": "25;20;74;36", "wc_main_review": "363;324;533;270", "wc_review": "489;459;687;629", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "714;354;471;133", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 154.75, 97.93460828532476 ], "wc_summary_review_avg": [ 38.75, 21.158627082114755 ], "wc_main_review_avg": [ 372.5, 98.37301459241758 ], "wc_review_avg": [ 566.0, 94.8525170989152 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 418.0, 209.61035279775663 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, 
"corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2657305940070693052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=18Ys0-PzyPI", "email": "ntu.edu.sg;huawei.com;;ntu.edu.sg", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Nanyang Technological University;Huawei", "aff_unique_dep": ";Noah's Ark Lab", "aff_unique_url": "https://www.ntu.edu.sg;https://www.huawei.com", "aff_unique_abbr": "NTU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;China" }, { "id": "1DUwCRNAbA", "title": "An Investigation into the Role of Author Demographics in ICLR Participation and Review", "track": "main", "status": "Reject", "tldr": "", "abstract": "As machine learning conferences grow rapidly, many are concerned that individuals will be left behind on the basis of traits such as gender and geography. We leverage historic ICLR submissions from 2017 to 2021 to investigate the impact of gender and country of origin both on representation and paper review outcomes at ICLR. We also study various hypotheses that could explain gender representation disparities at ICLR, with a focus on factors that impact the likelihood of an author returning to the conference in consecutive years. Finally, we probe the effects of paper topic on the review process and perform a study on how the inclusion of theorems and the number of co-authors impact the success of papers in the review process.", "keywords": "Conference Review;OpenReview;Gender;Bias;Fairness", "primary_area": "", "supplementary_material": "", "author": "Keshav Ganapathy;Emily Liu;Zain Zarger;Gowthami Somepalli;Micah Goldblum;Tom Goldstein", "authorids": "~Keshav_Ganapathy1;~Emily_Liu1;~Zain_Zarger1;~Gowthami_Somepalli1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "F;M;F;;M;M", "homepage": ";https://github.com/zzarger;https://somepago.github.io/;;https://www.cs.umd.edu/~tomg/;https://keshavganapathy.github.io/", "dblp": ";;286/5012;241/7231;25/8184;", "google_scholar": ";;T2ezBDsAAAAJ;pGDKzuUAAAAJ;KmSuVtgAAAAJ;", "orcid": ";;;;;", "linkedin": "emilyliu9;;;;;", "or_profile": "~Emily_Liu1;~Zain_Zarger1;~Gowthami_Somepalli1;~Micah_Goldblum1;~Tom_Goldstein1;~Keshav_R_Ganapathy1", "aff": "Montgomery Blair High School;University of Maryland, College Park;University of Maryland, College Park;New York University;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "mbhs.edu;umd.edu;umd.edu;nyu.edu;umd.edu;umd.edu", "position": "High School Student;Undergrad student;PhD student;Postdoc;Associate Professor;Undergrad student", "bibtex": "@misc{\nganapathy2022an,\ntitle={An Investigation into the Role of Author Demographics in {ICLR} Participation and Review},\nauthor={Keshav Ganapathy and Emily Liu and Zain Zarger and Gowthami Somepalli and Micah Goldblum and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=1DUwCRNAbA}\n}", "github": "", "project": "", "reviewers": "kcQ1;b44g;fQri;4yLt", "site": "https://openreview.net/forum?id=1DUwCRNAbA", "pdf_size": 0, "recommendation": "1;5;6;6", "confidence": "5;4;4;4", "correctness": "2;3;4;3", "technical_novelty": "1;1;2;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "46;61;82;84", "wc_summary_review": "70;43;68;78", "wc_main_review": "541;692;752;316", "wc_review": "657;796;902;478", "wc_reply_reviewers": "424;227;26;0", "wc_reply_authors": 
"484;687;244;212", "reply_reviewers": "2;1;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.5, 2.0615528128088303 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 68.25, 15.690363284513205 ], "wc_summary_review_avg": [ 64.75, 13.102957681378658 ], "wc_main_review_avg": [ 575.25, 168.2667153658144 ], "wc_review_avg": [ 708.25, 158.80865058302084 ], "wc_reply_reviewers_avg": [ 169.25, 171.31750494330694 ], "wc_reply_authors_avg": [ 406.75, 192.95255245785168 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9801960588196067, "corr_recommendation_correctness": 0.8574929257125441, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6572922764001081831&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;1;1", "aff_unique_norm": "Montgomery Blair High School;University of Maryland;New York University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.montgomeryblairhs.org;https://www/umd.edu;https://www.nyu.edu", "aff_unique_abbr": ";UMD;NYU", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Unifying Likelihood-free Inference with Black-box Optimization and Beyond", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6503", "id": "1HxTO6CTkz", "poster": "", "openreview": "https://openreview.net/forum?id=1HxTO6CTkz", "slides": "https://iclr.cc/virtual/2022/poster/6503", "video": "https://iclr.cc/virtual/2022/poster/6503", "author_site": "Dinghuai Zhang, Jie Fu, Yoshua Bengio, Aaron Courville", "tldr": "", "abstract": "Black-box optimization formulations for biological sequence design have drawn recent attention due to their promising potential impact on the pharmaceutical industry. In this work, we propose to unify two seemingly distinct worlds: likelihood-free inference and black-box optimization, under one probabilistic framework. In tandem, we provide a recipe for constructing various sequence design methods based on this framework. We show how previous optimization approaches can be \"reinvented\" in our framework, and further propose new probabilistic black-box optimization algorithms. 
Extensive experiments on sequence design applications illustrate the benefits of the proposed methodology.", "keywords": "biological sequence design;black-box optimization;likelihood-free inference;Bayesian inference", "primary_area": "", "supplementary_material": "/attachment/621cd50b3f74abec8bcd856fb3e5add524e31a94.zip", "author": "Dinghuai Zhang;Jie Fu;Yoshua Bengio;Aaron Courville", "authorids": "~Dinghuai_Zhang1;~Jie_Fu2;~Yoshua_Bengio1;~Aaron_Courville3", "gender": ";;M;", "homepage": ";;http://yoshuabengio.org;", "dblp": ";;56/953;56/1688", "google_scholar": ";;kukA0LcAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ", "orcid": ";;;", "linkedin": ";;yoshuabengio/?originalSubdomain=ca;", "or_profile": "~Dinghuai_Zhang1;~Jie_Fu2;~Yoshua_Bengio1;~Aaron_Courville3", "aff": ";;University of Montreal;Universit\u00e9 de Montr\u00e9al", "aff_domain": ";;umontreal.ca; ", "position": ";;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2022unifying,\ntitle={Unifying Likelihood-free Inference with Black-box Sequence Design and Beyond},\nauthor={Dinghuai Zhang and Jie Fu and Yoshua Bengio and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1HxTO6CTkz}\n}", "github": "", "project": "", "reviewers": "1dZH;JsdK;LzqG;5Uhg", "pdf_size": 0, "recommendation": "6;6;8;10", "confidence": "4;4;3;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;4;4", "empirical_novelty": "2;0;2;3", "wc_summary_paper": "22;82;49;84", "wc_summary_review": "90;50;70;28", "wc_main_review": "707;335;305;145", "wc_review": "819;467;424;257", "wc_reply_reviewers": "220;78;0;0", "wc_reply_authors": "2348;1355;185;64", "reply_reviewers": "1;1;0;0", "reply_authors": "5;4;1;1", "recommendation_avg": [ 7.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 59.25, 25.606395685453272 ], "wc_summary_review_avg": [ 59.5, 23.038012067016545 ], "wc_main_review_avg": [ 373.0, 205.91745919178393 ], "wc_review_avg": [ 491.75, 204.57440577941318 ], "wc_reply_reviewers_avg": [ 74.5, 89.83735303313428 ], "wc_reply_authors_avg": [ 988.0, 933.1229822483208 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.7853571071357126 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5950905850445809311&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=1HxTO6CTkz", "email": ";;umontreal.ca; ", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Montreal;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": ";", "aff_unique_url": "https://www.umontreal.ca;https://www.umontreal.ca", "aff_unique_abbr": "UM;UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "1IiJQTDpuG", "title": "ImaginE: An Imagination-Based Automatic Evaluation Metric for Natural Language Generation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Automatic evaluations for natural language generation conventionally rely on token-level or embedding-level comparisons with the text references.
This differs from the way humans evaluate text, in which people also form pictures of the text content in their minds while reading. In this work, we propose ImaginE, an imagination-based automatic evaluation metric for natural language generation. With the help of CLIP and DALL-E, two cross-modal models pre-trained on large-scale image-text pairs, we automatically generate an image as the embodied imagination for the text snippet, and compute the imagination similarity using contextual embeddings. Experiments spanning several text generation tasks demonstrate that adding imagination with our ImaginE shows great potential for introducing multi-modal information into NLG evaluation, and improves existing automatic metrics\u2019 correlations with human similarity judgments in many circumstances.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wanrong Zhu;Xin Eric Wang;An Yan;Miguel Eckstein;William Yang Wang", "authorids": "~Wanrong_Zhu1;~Xin_Eric_Wang2;~An_Yan1;~Miguel_Eckstein1;~William_Yang_Wang2", "gender": ";M;;M;", "homepage": ";https://eric-xw.github.io;https://zzxslp.github.io;https://psych.ucsb.edu/people/faculty/miguel-eckstein;", "dblp": ";10/5630-61;37/10133-3;56/975;", "google_scholar": ";YjqluE0AAAAJ;7I_zqNoAAAAJ;G5dQztgAAAAJ;", "orcid": ";0000-0003-2605-5504;;;", "linkedin": ";;;;", "or_profile": "~Wanrong_Zhu1;~Xin_Eric_Wang2;~An_Yan1;~Miguel_Eckstein1;~William_Yang_Wang2", "aff": ";University of California, Santa Cruz;University of California, San Diego;;", "aff_domain": ";ucsc.edu;ucsd.edu;;", "position": ";Assistant Professor;PhD student;;", "bibtex": "@misc{\nzhu2022imagine,\ntitle={ImaginE: An Imagination-Based Automatic Evaluation Metric for Natural Language Generation},\nauthor={Wanrong Zhu and Xin Eric Wang and An Yan and Miguel Eckstein and William Yang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=1IiJQTDpuG}\n}", "github": "", "project": "", "reviewers": "oVWd;Lu5x;HnYH;mCxc", "site": "https://openreview.net/forum?id=1IiJQTDpuG", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "192;88;42;36", "wc_summary_review": "64;45;9;27", "wc_main_review": "717;399;66;150", "wc_review": "973;532;117;213", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 89.5, 62.503999872008194 ], "wc_summary_review_avg": [ 36.25, 20.461854754640402 ], "wc_main_review_avg": [ 333.0, 253.27356751149537 ], "wc_review_avg": [ 458.75, 334.29206915510275 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=292962099060002374&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Santa Cruz;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsc.edu;https://www.ucsd.edu",
"aff_unique_abbr": "UCSC;UCSD", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Santa Cruz;San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Source-Free Adaptation to Measurement Shift via Bottom-Up Feature Restoration", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6285", "id": "1JDiK_TbV4S", "poster": "", "openreview": "https://openreview.net/forum?id=1JDiK_TbV4S", "slides": "https://iclr.cc/virtual/2022/poster/6285", "video": "https://iclr.cc/virtual/2022/poster/6285", "author_site": "Cian Eastwood, Ian Mason, Chris Williams, Bernhard Schoelkopf", "tldr": "", "abstract": "Source-free domain adaptation (SFDA) aims to adapt a model trained on labelled data in a source domain to unlabelled data in a target domain without access to the source-domain data during adaptation. Existing methods for SFDA leverage entropy-minimization techniques which: (i) apply only to classification; (ii) destroy model calibration; and (iii) rely on the source model achieving a good level of feature-space class-separation in the target domain. We address these issues for a particularly pervasive type of domain shift called measurement shift which can be resolved by restoring the source features rather than extracting new ones. In particular, we propose Feature Restoration (FR) wherein we: (i) store a lightweight and flexible approximation of the feature distribution under the source data; and (ii) adapt the feature-extractor such that the approximate feature distribution under the target data realigns with that saved on the source. We additionally propose a bottom-up training scheme which boosts performance, which we call Bottom-Up Feature Restoration (BUFR). On real and synthetic data, we demonstrate that BUFR outperforms existing SFDA methods in terms of accuracy, calibration, and data efficiency, while being less reliant on the performance of the source model in the target domain.\n", "keywords": "Transfer learning;dataset shift;unsupervised domain adaptation;source-free domain adaptation", "primary_area": "", "supplementary_material": "/attachment/0c82fd39eca730bb05e176302ee6db836cf32787.zip", "author": "Cian Eastwood;Ian Mason;Chris Williams;Bernhard Sch\u00f6lkopf", "authorids": "~Cian_Eastwood1;~Ian_Mason1;~Chris_Williams1;~Bernhard_Sch\u00f6lkopf1", "gender": "M;;;", "homepage": "https://cianeastwood.github.io/;https://www.ianxmason.com;http://homepages.inf.ed.ac.uk/ckiw/;", "dblp": "238/2792;145/5797;w/ChristopherKIWilliams;", "google_scholar": "https://scholar.google.com/citations?hl=en;m5ZV5wsAAAAJ;https://scholar.google.co.uk/citations?user=rvKJDbIAAAAJ;", "orcid": ";0000-0003-3091-9045;;", "linkedin": ";;;", "or_profile": "~Cian_Eastwood1;~Ian_Mason1;~Chris_Williams1;~Bernhard_Sch\u00f6lkopf1", "aff": "University of Edinburgh;University of Edinburgh;University of Edinburgh;", "aff_domain": "ed.ac.uk;ed.ac.uk;ed.ac.uk;", "position": "PhD student;PhD student;Full Professor;", "bibtex": "@inproceedings{\neastwood2022sourcefree,\ntitle={Source-Free Adaptation to Measurement Shift via Bottom-Up Feature Restoration},\nauthor={Cian Eastwood and Ian Mason and Chris Williams and Bernhard Sch{\\\"o}lkopf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1JDiK_TbV4S}\n}", "github": "", "project": "", "reviewers": "Fzmz;gJxM;uMHx;V9Lg;HPoa", "pdf_size": 0, "recommendation": "6;8;8;8;8", "confidence": "4;4;4;4;5", "correctness": "2;4;4;4;4", 
"technical_novelty": "3;3;3;3;3", "empirical_novelty": "3;3;3;4;3", "wc_summary_paper": "189;101;125;126;63", "wc_summary_review": "198;58;25;59;34", "wc_main_review": "722;212;686;277;241", "wc_review": "1109;371;836;462;338", "wc_reply_reviewers": "0;31;134;77;0", "wc_reply_authors": "1663;470;1289;971;393", "reply_reviewers": "0;1;1;1;0", "reply_authors": "3;1;2;2;1", "recommendation_avg": [ 7.6, 0.7999999999999999 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.8000000000000002 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 120.8, 41.058007745140294 ], "wc_summary_review_avg": [ 74.8, 63.015553635590635 ], "wc_main_review_avg": [ 427.6, 226.9031511460341 ], "wc_review_avg": [ 623.2, 300.73736049915715 ], "wc_reply_reviewers_avg": [ 48.4, 51.250756091983654 ], "wc_reply_authors_avg": [ 957.2, 482.51855922855447 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.2500000000000001, "corr_recommendation_correctness": 1.0, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13912921237099843796&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=1JDiK_TbV4S", "email": "ed.ac.uk;ed.ac.uk;ed.ac.uk;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "1JN7MepVDFv", "title": "On the relationship between disentanglement and multi-task learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the main arguments behind studying disentangled representations is the assumption that they can be easily reused in different tasks. At the same time finding a joint, adaptable representation of data is one of the key challenges in the multi-task learning setting. In this paper, we take a closer look at the relationship between disentanglement and multi-task learning based on hard parameter sharing. We perform a thorough empirical study of the representations obtained by neural networks trained on automatically generated supervised tasks. 
Using a set of standard metrics we show that disentanglement appears in a natural way during the process of multi-task neural network training.", "keywords": "disentanglement representations;multi-task learning", "primary_area": "", "supplementary_material": "/attachment/fc25ea9267e9cc648fc6244d5cb41dd0e7552a72.zip", "author": "Lukasz Maziarka;Aleksandra Nowak;Maciej Wolczyk;Andrzej Bedychaj", "authorids": "~Lukasz_Maziarka1;~Aleksandra_Nowak1;~Maciej_Wolczyk1;~Andrzej_Bedychaj1", "gender": "M;F;M;M", "homepage": ";;;", "dblp": ";34/10106;236/5956;", "google_scholar": "https://scholar.google.pl/citations?user=2dkp8z4AAAAJ;2A-eZhQAAAAJ;;", "orcid": "0000-0001-6947-8131;0000-0002-2830-6613;;", "linkedin": "lukasz-maziarka-161749b3/;;;andrzej-bedychaj-a248483b/", "or_profile": "~Lukasz_Maziarka1;~Aleksandra_Nowak1;~Maciej_Wolczyk1;~Andrzej_Bedychaj1", "aff": "Jagiellonian University;University of Twente;Jagiellonian University Cracow;Jagiellonian University", "aff_domain": "uj.edu.pl;utwente.nl;uj.edu.pl;uj.edu.pl", "position": "PhD student;Intern;PhD student;PhD student", "bibtex": "@misc{\nmaziarka2022on,\ntitle={On the relationship between disentanglement and multi-task learning},\nauthor={Lukasz Maziarka and Aleksandra Nowak and Maciej Wolczyk and Andrzej Bedychaj},\nyear={2022},\nurl={https://openreview.net/forum?id=1JN7MepVDFv}\n}", "github": "", "project": "", "reviewers": "ZyfG;zzap;r7PG;NYHm", "site": "https://openreview.net/forum?id=1JN7MepVDFv", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "99;71;52;75", "wc_summary_review": "35;50;29;145", "wc_main_review": "415;457;358;160", "wc_review": "549;578;439;380", "wc_reply_reviewers": "277;148;106;0", "wc_reply_authors": "932;1286;1226;503", "reply_reviewers": "1;1;1;0", "reply_authors": "3;3;3;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 74.25, 16.723860200324566 ], "wc_summary_review_avg": [ 64.75, 46.959423974320636 ], "wc_main_review_avg": [ 347.5, 113.81234555178976 ], "wc_review_avg": [ 486.5, 80.43164800002546 ], "wc_reply_reviewers_avg": [ 132.75, 99.22039860835069 ], "wc_reply_authors_avg": [ 986.75, 309.75907977006904 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1184269708812417316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Jagiellonian University;University of Twente", "aff_unique_dep": ";", "aff_unique_url": "https://www.uj.edu.pl;https://www.utwente.nl", "aff_unique_abbr": "UJ;UT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cracow", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Poland;Netherlands" }, { "title": "Filtered-CoPhy: Unsupervised Learning of Counterfactual Physics in Pixel Space", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6541", "id": "1L0C5ROtFp", "poster": "", "openreview": "https://openreview.net/forum?id=1L0C5ROtFp", "slides": 
"https://iclr.cc/virtual/2022/poster/6541", "video": "https://iclr.cc/virtual/2022/poster/6541", "author_site": "Steeven Janny, Fabien Baradel, Natalia Neverova, Madiha Nadri, Greg Mori, Christian Wolf", "tldr": "", "abstract": "Learning causal relationships in high-dimensional data (images, videos) is a hard task, as they are often defined on low dimensional manifolds and must be extracted from complex signals dominated by appearance, lighting, textures and also spurious correlations in the data. We present a method for learning counterfactual reasoning of physical processes in pixel space, which requires the prediction of the impact of interventions on initial conditions. Going beyond the identification of structural relationships, we deal with the challenging problem of forecasting raw video over long horizons. Our method does not require the knowledge or supervision of any ground truth positions or other object or scene properties. Our model learns and acts on a suitable hybrid latent representation based on a combination of dense features, sets of 2D keypoints and an additional latent vector per keypoint. We show that this better captures the dynamics of physical processes than purely dense or sparse representations. We introduce a new challenging and carefully designed counterfactual benchmark for predictions in pixel space and outperform strong baselines in physics-inspired ML and video prediction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Steeven JANNY;Fabien Baradel;Natalia Neverova;Madiha Nadri;Greg Mori;Christian Wolf", "authorids": "~Steeven_JANNY2;~Fabien_Baradel1;~Natalia_Neverova1;madiha.nadri-wolf@univ-lyon1.fr;~Greg_Mori2;~Christian_Wolf1", "gender": "M;M;F;;;", "homepage": "https://steevenjanny.github.io/;https://fabienbaradel.github.io;https://nneverova.github.io/;;;", "dblp": "228/8300;198/1352;119/1495;;;", "google_scholar": "IC0ceIgAAAAJ;https://scholar.google.fr/citations?user=egECWaEAAAAJ;https://scholar.google.fr/citations?user=cLPaHcIAAAAJ;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Steeven_JANNY2;~Fabien_Baradel1;~Natalia_Neverova1;madiha.nadri-wolf@univ-lyon1.fr;~Greg_Mori2;~Christian_Wolf1", "aff": ";Naver Labs Europe;Meta GenAI;;;", "aff_domain": ";naverlabs.com;meta.com;;;", "position": ";Research Scientist;Principal Researcher;;;", "bibtex": "@inproceedings{\njanny2022filteredcophy,\ntitle={Filtered-CoPhy: Unsupervised Learning of Counterfactual Physics in Pixel Space},\nauthor={Steeven JANNY and Fabien Baradel and Natalia Neverova and Madiha Nadri and Greg Mori and Christian Wolf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1L0C5ROtFp}\n}", "github": "", "project": "", "reviewers": "sqvy;v4mS;tY4H", "pdf_size": 0, "recommendation": "8;8;10", "confidence": "4;5;3", "correctness": "4;4;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;4;4", "wc_summary_paper": "86;144;236", "wc_summary_review": "107;73;39", "wc_main_review": "271;215;165", "wc_review": "464;432;440", "wc_reply_reviewers": "0;78;166", "wc_reply_authors": "814;1230;886", "reply_reviewers": "0;2;2", "reply_authors": "1;2;2", "recommendation_avg": [ 8.666666666666666, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 155.33333333333334, 
61.759389749431804 ], "wc_summary_review_avg": [ 73.0, 27.760883751542686 ], "wc_main_review_avg": [ 217.0, 43.29742101634538 ], "wc_review_avg": [ 445.3333333333333, 13.59738536958076 ], "wc_reply_reviewers_avg": [ 81.33333333333333, 67.8101926130742 ], "wc_reply_authors_avg": [ 976.6666666666666, 181.52930586792007 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16343781972156996080&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=1L0C5ROtFp", "email": ";naverlabs.com;meta.com;;;", "author_num": 6, "aff_unique_index": "0;1", "aff_unique_norm": "NAVER LABS;Meta", "aff_unique_dep": ";Meta GenAI", "aff_unique_url": "https://labs.naver.com;https://meta.com", "aff_unique_abbr": "NLE;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Unknown;United States" }, { "id": "1LVeBXpLohL", "title": "Network calibration by weight scaling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Calibrating neural networks is crucial in applications where the decision making depends on the predicted probabilities. Modern neural networks are not well calibrated and they tend to overestimate probabilities when compared to the expected accuracy. This results in a misleading reliability that corrupts our decision policy. We define a weight scaling calibration method that computes a convex combination of the network output class distribution and the uniform distribution. The weight controls the confidence of the calibrated prediction. Since the goal of calibration is making the confidence prediction more accurate, the most suitable weight is found as a function of the given confidence. We derive an optimization method that is based on a closed-form solution for the optimal weight scaling in each bin of a discretized value of the prediction confidence. We report extensive experiments on a variety of image datasets and network architectures.
This approach achieves state-of-the-art calibration with a guarantee that the classification accuracy is not altered.", "keywords": "network calibration;temperature scaling;Expected Calibration Error (ECE)", "primary_area": "", "supplementary_material": "", "author": "Lior Frenkel;Jacob Goldberger", "authorids": "~Lior_Frenkel1;~Jacob_Goldberger1", "gender": "M;M", "homepage": ";http://www.eng.biu.ac.il/goldbej/", "dblp": ";65/6574", "google_scholar": ";https://scholar.google.co.il/citations?user=vgzrOK4AAAAJ", "orcid": ";", "linkedin": "lior-frenkel-0b248811a/;", "or_profile": "~Lior_Frenkel1;~Jacob_Goldberger1", "aff": "Bar Ilan University, Technion;Bar-Ilan University", "aff_domain": "biu.ac.il;biu.ac.il", "position": "MS student;Full Professor", "bibtex": "@misc{\nfrenkel2022network,\ntitle={Network calibration by weight scaling},\nauthor={Lior Frenkel and Jacob Goldberger},\nyear={2022},\nurl={https://openreview.net/forum?id=1LVeBXpLohL}\n}", "github": "", "project": "", "reviewers": "aSTq;enkr;wLV8;vd9o", "site": "https://openreview.net/forum?id=1LVeBXpLohL", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;4;3", "correctness": "3;2;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "56;158;74;81", "wc_summary_review": "34;34;126;19", "wc_main_review": "593;450;595;253", "wc_review": "683;642;795;353", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 92.25, 39.04084399702445 ], "wc_summary_review_avg": [ 53.25, 42.446289590493066 ], "wc_main_review_avg": [ 472.75, 139.83271255325056 ], "wc_review_avg": [ 618.25, 163.06037992105868 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8uQcTSSJYFUJ:scholar.google.com/&scioq=Network+calibration+by+weight+scaling&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Bar-Ilan University", "aff_unique_dep": "", "aff_unique_url": "https://www.biu.ac.il", "aff_unique_abbr": "BIU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Map Induction: Compositional spatial submap learning for efficient exploration in novel environments", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6636", "id": "1NUsBU-7HAL", "poster": "", "openreview": "https://openreview.net/forum?id=1NUsBU-7HAL", "slides": "https://iclr.cc/virtual/2022/poster/6636", "video": "https://iclr.cc/virtual/2022/poster/6636", "author_site": "Sugandha Sharma, Aidan Curtis, Marta Kryven, Joshua B Tenenbaum, Ila Fiete", "tldr": "", "abstract": "Humans are expert explorers and foragers. Understanding the computational cognitive mechanisms that support this capability can advance the study of the human mind and enable more efficient exploration algorithms. 
We hypothesize that humans explore new environments by inferring the structure of unobserved spaces through re-use of spatial information collected from previously explored spaces. Taking inspiration from the neuroscience of repeating map fragments and ideas about program induction, we present a novel ``Map Induction'' framework, which involves the generation of novel map proposals for unseen environments based on compositions of already-seen spaces in a Hierarchical Bayesian framework. The model thus explicitly reasons about unseen spaces through a distribution of strong spatial priors. We introduce a new behavioral Map Induction Task (MIT) that involves foraging for rewards to compare human performance with state-of-the-art existing models and Map Induction. We show that Map Induction better predicts human behavior than the non-inductive baselines. We also show that Map Induction, when used to augment state-of-the-art approximate planning algorithms, improves their performance.\n", "keywords": "Cognitive Science;Bayesian Framework;Program Induction;Spatial Navigation;Planning;Map Learning", "primary_area": "", "supplementary_material": "", "author": "Sugandha Sharma;Aidan Curtis;Marta Kryven;Joshua B. Tenenbaum;Ila R Fiete", "authorids": "~Sugandha_Sharma1;~Aidan_Curtis2;~Marta_Kryven1;~Joshua_B._Tenenbaum1;~Ila_R_Fiete1", "gender": "F;M;;;F", "homepage": "https://www.sugandhasharma.com/;;https://marta-kryven.github.io;;https://fietelab.mit.edu/", "dblp": ";;134/5551;t/JoshuaBTenenbaum;", "google_scholar": "FsXCQc8AAAAJ;tRJf4Q8AAAAJ;https://scholar.google.com/citations?hl=en;;uE-CihIAAAAJ", "orcid": ";;0000-0002-2764-8611;;0000-0003-4738-2539", "linkedin": "sugandhasharma17/;;marta-kryven/;;", "or_profile": "~Sugandha_Sharma1;~Aidan_Curtis2;~Marta_Kryven1;~Joshua_B._Tenenbaum1;~Ila_R_Fiete1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;Postdoc;Professor;Professor", "bibtex": "@inproceedings{\nsharma2022map,\ntitle={Map Induction: Compositional spatial submap learning for efficient exploration in novel environments},\nauthor={Sugandha Sharma and Aidan Curtis and Marta Kryven and Joshua B. 
Tenenbaum and Ila R Fiete},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1NUsBU-7HAL}\n}", "github": "", "project": "", "reviewers": "8xU7;M48j;toMF;vNyW", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "4;2;4;3", "empirical_novelty": "4;3;2;4", "wc_summary_paper": "112;224;99;75", "wc_summary_review": "115;83;60;38", "wc_main_review": "377;498;323;314", "wc_review": "604;805;482;427", "wc_reply_reviewers": "0;0;0;214", "wc_reply_authors": "744;766;709;741", "reply_reviewers": "0;0;0;2", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 127.5, 57.273466806192204 ], "wc_summary_review_avg": [ 74.0, 28.521921393903323 ], "wc_main_review_avg": [ 378.0, 73.35189159115122 ], "wc_review_avg": [ 579.5, 145.09738109283708 ], "wc_reply_reviewers_avg": [ 53.5, 92.66471820493493 ], "wc_reply_authors_avg": [ 740.0, 20.334699407662754 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15462260189293500047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=1NUsBU-7HAL", "email": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Networks as Kernel Learners: The Silent Alignment Effect", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7005", "id": "1NvflqAdoom", "poster": "", "openreview": "https://openreview.net/forum?id=1NvflqAdoom", "slides": "https://iclr.cc/virtual/2022/poster/7005", "video": "https://iclr.cc/virtual/2022/poster/7005", "author_site": "Alexander Atanasov, Blake Bordelon, Cengiz Pehlevan", "tldr": "", "abstract": "Neural networks in the lazy training regime converge to kernel machines. Can neural networks in the rich feature learning regime learn a kernel machine with a data-dependent kernel? We demonstrate that this can indeed happen due to a phenomenon we term silent alignment, which requires that the tangent kernel of a network evolves in eigenstructure while small and before the loss appreciably decreases, and grows only in overall scale afterwards. We show that such an effect takes place in homogenous neural networks with small initialization and whitened data. We provide an analytical treatment of this effect in the linear network case. In general, we find that the kernel develops a low-rank contribution in the early phase of training, and then evolves in overall scale, yielding a function equivalent to a kernel regression solution with the final network's tangent kernel. The early spectral learning of the kernel depends on the depth. 
We also demonstrate that non-whitened data can weaken the silent alignment effect.", "keywords": "Neural Tangent Kernel;Feature Learning;Inductive Bias of Neural Networks", "primary_area": "", "supplementary_material": "/attachment/ea777ffc3162711e4eed957df7469affa245766a.zip", "author": "Alexander Atanasov;Blake Bordelon;Cengiz Pehlevan", "authorids": "~Alexander_Atanasov1;~Blake_Bordelon1;~Cengiz_Pehlevan2", "gender": "M;M;", "homepage": "http://abatanasov.com/;https://blakebordelon.github.io/;https://pehlevan.seas.harvard.edu/", "dblp": "305/3785.html;228/6993;145/3480", "google_scholar": "abMQRYIAAAAJ;yeQ8_pgAAAAJ;veDLTPEAAAAJ", "orcid": "0000-0002-3338-0324;0000-0003-0455-9445;0000-0001-9767-6063", "linkedin": "alexatanasov/;;", "or_profile": "~Alexander_Atanasov1;~Blake_Bordelon1;~Cengiz_Pehlevan2", "aff": "Harvard University;Harvard University;School of Engineering and Applied Sciences, Harvard University", "aff_domain": "harvard.edu;harvard.edu;seas.harvard.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\natanasov2022neural,\ntitle={Neural Networks as Kernel Learners: The Silent Alignment Effect},\nauthor={Alexander Atanasov and Blake Bordelon and Cengiz Pehlevan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1NvflqAdoom}\n}", "github": "", "project": "", "reviewers": "puin;XmeY;dMqU", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "4;2;4", "technical_novelty": "3;4;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "95;105;174", "wc_summary_review": "40;46;36", "wc_main_review": "226;265;588", "wc_review": "361;416;798", "wc_reply_reviewers": "267;100;32", "wc_reply_authors": "1855;955;509", "reply_reviewers": "2;1;1", "reply_authors": "5;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 124.66666666666667, 35.122009560324926 ], "wc_summary_review_avg": [ 40.666666666666664, 4.109609335312651 ], "wc_main_review_avg": [ 359.6666666666667, 162.2391924152594 ], "wc_review_avg": [ 525.0, 194.34162360818812 ], "wc_reply_reviewers_avg": [ 133.0, 98.73533646403736 ], "wc_reply_authors_avg": [ 1106.3333333333333, 559.8245756980981 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.18898223650461365, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11955245405333483637&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=1NvflqAdoom", "email": "harvard.edu;harvard.edu;seas.harvard.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "1O5UK-zoK8g", "title": "Adaptive Generalization for Semantic Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Out-of-distribution 
robustness remains a salient weakness of current state-of-the-art models for semantic segmentation. Until recently, research on generalization followed a restrictive assumption that the model parameters remain fixed after the training process. In this work, we empirically study an adaptive inference strategy for semantic segmentation that adjusts the model to the test sample before producing the final prediction. We achieve this with two complementary techniques. Using Instance-adaptive Batch Normalization (IaBN), we modify normalization layers by combining the feature statistics acquired at training time with those of the test sample. We next introduce a test-time training (TTT) approach for semantic segmentation, Seg-TTT, which adapts the model parameters to the test sample using a self-supervised loss. Relying on a more rigorous evaluation protocol compared to previous work on generalization in semantic segmentation, our study shows that these techniques consistently and significantly outperform the baseline and attain a new state of the art, substantially improving in accuracy over previous generalization methods.", "keywords": "domain generalization;semantic segmentation;test-time training", "primary_area": "", "supplementary_material": "/attachment/20b9656290406527a48d541b1fbb2c125589d45b.zip", "author": "Sherwin Bahmani;Oliver Hahn;Eduard Sebastian Zamfir;Nikita Araslanov;Stefan Roth", "authorids": "~Sherwin_Bahmani1;~Oliver_Hahn2;~Eduard_Sebastian_Zamfir1;~Nikita_Araslanov1;~Stefan_Roth1", "gender": "M;M;M;M;M", "homepage": "https://sherwinbahmani.github.io;https://olvrhhn.github.io/;https://eduardzamfir.github.io;https://arnike.github.io;https://www.visinf.tu-darmstadt.de/visual_inference/people_vi/stefan_roth.en.jsp", "dblp": "307/5156;63/44451;326/5425;173/7854;24/3452", "google_scholar": "XPD09yEAAAAJ;https://scholar.google.com/citations?hl=de;5-FIWKoAAAAJ;RdMFioAAAAAJ;0yDoR0AAAAAJ", "orcid": ";0009-0008-6164-1035;;;0000-0001-9002-9832", "linkedin": "sherwin-bahmani-a2b5691a9/;;eduard-zamfir-167660161/;;stefanroth13", "or_profile": "~Sherwin_Bahmani1;~Oliver_Hahn2;~Eduard_Sebastian_Zamfir1;~Nikita_Araslanov1;~Stefan_Roth1", "aff": "Stanford University;Technische Universit\u00e4t Darmstadt;TU Darmstadt;Technische Universit\u00e4t M\u00fcnchen;Technische Universit\u00e4t Darmstadt", "aff_domain": "stanford.edu;tu-darmstadt.de;tu-darmstadt.de;tum.de;tu-darmstadt.de", "position": "Intern;MS student;MS student;Postdoc;Full Professor", "bibtex": "@misc{\nbahmani2022adaptive,\ntitle={Adaptive Generalization for Semantic Segmentation},\nauthor={Sherwin Bahmani and Oliver Hahn and Eduard Sebastian Zamfir and Nikita Araslanov and Stefan Roth},\nyear={2022},\nurl={https://openreview.net/forum?id=1O5UK-zoK8g}\n}", "github": "", "project": "", "reviewers": "FDdf;YkXo;tfFH;Dswx", "site": "https://openreview.net/forum?id=1O5UK-zoK8g", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;3;3", "correctness": "4;3;2;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "93;44;115;95", "wc_summary_review": "60;33;55;40", "wc_main_review": "339;221;147;262", "wc_review": "492;298;317;397", "wc_reply_reviewers": "44;0;0;0", "wc_reply_authors": "644;582;602;736", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], 
"wc_summary_paper_avg": [ 86.75, 26.13785568863674 ], "wc_summary_review_avg": [ 47.0, 10.931605554537724 ], "wc_main_review_avg": [ 242.25, 69.41676670661059 ], "wc_review_avg": [ 376.0, 76.58655234439007 ], "wc_reply_reviewers_avg": [ 11.0, 19.05255888325765 ], "wc_reply_authors_avg": [ 641.0, 59.23681287847955 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14642247274808313548&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Stanford University;Technische Universit\u00e4t Darmstadt;Technische Universit\u00e4t M\u00fcnchen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.tu-darmstadt.de;https://www.tum.de", "aff_unique_abbr": "Stanford;TUD;TUM", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Darmstadt", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Germany" }, { "id": "1OHZX4YDqhT", "title": "FedNAS: Federated Deep Learning via Neural Architecture Search", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Learning (FL) is an effective learning framework used when data cannotbe centralized due to privacy, communication costs, and regulatory restrictions.While there have been many algorithmic advances in FL, significantly less effort hasbeen made on model development, and most works in FL employ predefined modelarchitectures discovered in the centralized environment. However, these predefinedarchitectures may not be the optimal choice for the FL setting since the user datadistribution at FL users is often non-identical and independent distribution (non-IID). This well-known challenge in FL has often been studied at the optimizationlayer. Instead, we advocate for a different (and complementary) approach. Wepropose Federated Neural Architecture Search (FedNAS) for automating the modeldesign process in FL. More specifically, FedNAS enables scattered workers tosearch for a better architecture in a collaborative fashion to achieve higher accuracy. Beyond automating and improving FL model design, FedNAS also provides anew paradigm for personalized FL via customizing not only the model weightsbut also the neural architecture of each user. As such, we also compare FedNASwith representative personalized FL methods, including perFedAvg (based on meta-learning), Ditto (bi-level optimization), and local fine-tuning. Our experiments ona non-IID dataset show that the architecture searched by FedNAS can outperformthe manually predefined architecture as well as existing personalized FL methods.To facilitate further research and real-world deployment, we also build a realisticdistributed training system for FedNAS, which will be publicly available andmaintained regularly. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/7a9845a1beec272fb8b4c16e962123e5f1433d94.zip", "author": "Chaoyang He;Erum Mushtaq;Jie Ding;Salman Avestimehr", "authorids": "~Chaoyang_He1;~Erum_Mushtaq1;~Jie_Ding2;~Salman_Avestimehr1", "gender": "M;;M;", "homepage": "http://chaoyanghe.com;https://scholar.google.com/citations?user=C5IpcRYAAAAJ&hl=en;http://jding.org;", "dblp": "222/6721-1.html;;94/1825-2;", "google_scholar": "2z2camUAAAAJ;;ZyqvoqcAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Chaoyang_He1;~Erum_Mushtaq1;~Jie_Ding2;~Salman_Avestimehr1", "aff": "University of Southern California;University of Southern California;University of Minnesota, Minneapolis;", "aff_domain": "usc.edu;usc.edu;umn.edu;", "position": "PhD student;PhD student;Assistant Professor;", "bibtex": "@misc{\nhe2022fednas,\ntitle={Fed{NAS}: Federated Deep Learning via Neural Architecture Search},\nauthor={Chaoyang He and Erum Mushtaq and Jie Ding and Salman Avestimehr},\nyear={2022},\nurl={https://openreview.net/forum?id=1OHZX4YDqhT}\n}", "github": "", "project": "", "reviewers": "3t92;y4xh;sG5T;xdiq", "site": "https://openreview.net/forum?id=1OHZX4YDqhT", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "26;57;56;53", "wc_summary_review": "1;60;29;77", "wc_main_review": "155;207;134;216", "wc_review": "182;324;219;346", "wc_reply_reviewers": "85;0;0;0", "wc_reply_authors": "655;288;229;205", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 48.0, 12.786711852544421 ], "wc_summary_review_avg": [ 41.75, 29.14939965076468 ], "wc_main_review_avg": [ 178.0, 34.46012188022556 ], "wc_review_avg": [ 267.75, 68.95061638593234 ], "wc_reply_reviewers_avg": [ 21.25, 36.80607966083864 ], "wc_reply_authors_avg": [ 344.25, 181.9359433976695 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2718816908435673883&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Southern California;University of Minnesota", "aff_unique_dep": ";", "aff_unique_url": "https://www.usc.edu;https://www.minnesota.edu", "aff_unique_abbr": "USC;UMN", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Los Angeles;Minneapolis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "1QxveKM654", "title": "Genome Sequence Reconstruction Using Gated Graph Convolutional Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "A quest to determine the human DNA sequence from telomere to telomere started three decades ago and was finally finished in 2021. This accomplishment was a result of a tremendous effort of numerous experts with an abundance of data, various tools, and often included manual inspection during genome reconstruction. 
Therefore, such method could hardly be used as a general approach to assembling genomes, especially when the assembly speed is important. Motivated by this achievement and aspiring to make it more accessible, we investigate a previously untaken path of applying geometric deep learning to the central part of the genome assembly---untangling a large assembly graph from which a genomic sequence needs to be reconstructed. A graph convolutional network is trained on a dataset generated from human genomic data to reconstruct the genome by finding a path through the assembly graph. We show that our model can compute scores from the lengths of the overlaps between the sequences and the graph topology which, when traversed with a greedy search algorithm, outperforms the greedy search over the overlap lengths only. Moreover, our method reconstructs the correct path through the graph in the fraction of time required for the state-of-the-art de novo assemblers. This favourable result paves the way for the development of powerful graph machine learning algorithms that can solve the de novo genome assembly problem much quicker and possibly more accurately than human handcrafted techniques.", "keywords": "genome assembly;graph neural networks;assembly graph;path finding", "primary_area": "", "supplementary_material": "/attachment/39238b1e8f903d8973fbebb2c6925d0dac925ce0.zip", "author": "Lovro Vr\u010dek;Robert Vaser;Thomas Laurent;Mile Sikic;Xavier Bresson", "authorids": "~Lovro_Vr\u010dek1;~Robert_Vaser1;~Thomas_Laurent1;~Mile_Sikic1;~Xavier_Bresson6", "gender": "M;M;M;;M", "homepage": ";;http://thomaslaurent.lmu.build/homepage.html;;https://www.comp.nus.edu.sg/cs/people/xaviercs/", "dblp": ";;47/8889-1;;95/378", "google_scholar": ";N8jaWLUAAAAJ;_Ag_9uAAAAAJ;https://scholar.google.hr/citations?user=EK7apmcAAAAJ;https://scholar.google.com.sg/citations?hl=en", "orcid": ";;;;", "linkedin": "https://hr.linkedin.com/in/lovro-vr%C4%8Dek-5464a7172;;;;", "or_profile": "~Lovro_Vr\u010dek1;~Robert_Vaser1;~Thomas_Laurent1;~Mile_Sikic1;~Xavier_Bresson6", "aff": "Faculty of Electrical Engineering and Computing, University of Zagreb;Genome Institute of Singapore;Loyola Marymount University;UniZg-FER, University of Zagreb;National University of Singapore", "aff_domain": "unizg.hr;a-star.gis.edu.sg;lmu.edu;fer.unizg.hr;nus.edu.sg", "position": "PhD student;Postdoc;Full Professor;Full Professor;Associate Professor", "bibtex": "@misc{\nvr{\\v{c}}ek2022genome,\ntitle={Genome Sequence Reconstruction Using Gated Graph Convolutional Network},\nauthor={Lovro Vr{\\v{c}}ek and Robert Vaser and Thomas Laurent and Mile Sikic and Xavier Bresson},\nyear={2022},\nurl={https://openreview.net/forum?id=1QxveKM654}\n}", "github": "", "project": "", "reviewers": "c7uL;xJLL;GFZQ;Av24", "site": "https://openreview.net/forum?id=1QxveKM654", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;2;5", "correctness": "4;2;3;3", "technical_novelty": "3;2;4;2", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "119;188;86;142", "wc_summary_review": "19;29;103;84", "wc_main_review": "256;368;397;286", "wc_review": "394;585;586;512", "wc_reply_reviewers": "40;62;0;56", "wc_reply_authors": "393;430;538;311", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 133.75, 
37.11047695732298 ], "wc_summary_review_avg": [ 58.75, 35.56947427219019 ], "wc_main_review_avg": [ 326.75, 57.66877404627222 ], "wc_review_avg": [ 519.25, 78.29232082394799 ], "wc_reply_reviewers_avg": [ 39.5, 24.181604578687494 ], "wc_reply_authors_avg": [ 418.0, 81.5751187556598 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.22941573387056177, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yXKkdnYmt7UJ:scholar.google.com/&scioq=Genome+Sequence+Reconstruction+Using+Gated+Graph+Convolutional+Network&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "University of Zagreb;Genome Institute of Singapore;Loyola Marymount University;National University of Singapore", "aff_unique_dep": "Faculty of Electrical Engineering and Computing;;;", "aff_unique_url": "https://www.unizg.hr;https://www.genome-institute-of-singapore.org;https://www.lmu.edu;https://www.nus.edu.sg", "aff_unique_abbr": "UNIZG;GIS;LMU;NUS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Zagreb", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "Croatia;Singapore;United States" }, { "id": "1R_PRbQK2eu", "title": "Dual Training of Energy-Based Models with Overparametrized Shallow Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Energy-based models (EBMs) are generative models that are usually trained via maximum likelihood estimation. This approach becomes challenging in generic situations where the trained energy is nonconvex, due to the need to sample the Gibbs distribution associated with this energy. Using general Fenchel duality results, we derive variational principles dual to maximum likelihood EBMs with shallow overparametrized neural network energies, both in the active (aka feature-learning) and lazy regimes. In the active regime, this dual formulation leads to a training algorithm in which one updates concurrently the particles in the sample space and the neurons in the parameter space of the energy at a faster rate. We also consider a variant of this algorithm in which the particles are sometimes restarted at random samples drawn from the data set, and show that performing these restarts at every iteration step corresponds to score matching training. Using intermediate parameter setups in our dual algorithm thereby gives a way to interpolate between maximum likelihood and score matching training. 
These results are illustrated in simple numerical experiments.", "keywords": "energy-based models;generative modeling;neural networks;duality;Fenchel;maximum mean discrepancy;maximum likelihood;active;lazy;score matching;measure;feature", "primary_area": "", "supplementary_material": "/attachment/495b0b460e8e8336dbd5f2c21fcfab46bebf0a31.zip", "author": "Carles Domingo-Enrich;Alberto Bietti;Marylou Gabri\u00e9;Joan Bruna;Eric Vanden-Eijnden", "authorids": "~Carles_Domingo-Enrich1;~Alberto_Bietti1;~Marylou_Gabri\u00e91;~Joan_Bruna1;~Eric_Vanden-Eijnden1", "gender": "M;M;F;M;M", "homepage": "https://cdenrich.github.io;http://alberto.bietti.me;https://marylou-gabrie.github.io/;http://cims.nyu.edu/~bruna;https://wp.nyu.edu/courantinstituteofmathematicalsciences-eve2/", "dblp": "216/7444.html;166/6461;164/5772;44/8776;88/7927", "google_scholar": "1ZHcGwIAAAAJ;iT7Tp70AAAAJ;5m1DvLwAAAAJ;L4bNmsMAAAAJ;A5Gx65gAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Carles_Domingo-Enrich1;~Alberto_Bietti1;~Marylou_Gabri\u00e91;~Joan_Bruna1;~Eric_Vanden-Eijnden1", "aff": "New York University;New York University;\u00c9cole Polytechnique;New York University;New York University", "aff_domain": "nyu.edu;nyu.edu;polytechnique.edu;nyu.edu;nyu.edu", "position": "PhD student;Postdoc;Assistant Professor;Associate Professor;Full Professor", "bibtex": "@misc{\ndomingo-enrich2022dual,\ntitle={Dual Training of Energy-Based Models with Overparametrized Shallow Neural Networks},\nauthor={Carles Domingo-Enrich and Alberto Bietti and Marylou Gabri{\\'e} and Joan Bruna and Eric Vanden-Eijnden},\nyear={2022},\nurl={https://openreview.net/forum?id=1R_PRbQK2eu}\n}", "github": "", "project": "", "reviewers": "hfDM;pG2E;RMY3;aaPp", "site": "https://openreview.net/forum?id=1R_PRbQK2eu", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;3", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "65;515;70;74", "wc_summary_review": "21;61;38;39", "wc_main_review": "331;504;300;153", "wc_review": "417;1080;408;266", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 181.0, 192.8613491604785 ], "wc_summary_review_avg": [ 39.75, 14.201672436723781 ], "wc_main_review_avg": [ 322.0, 124.7497494987465 ], "wc_review_avg": [ 542.75, 315.9108853775064 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9709468191284215625&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "New York University;Ecole Polytechnique", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://www.polytechnique.edu", "aff_unique_abbr": "NYU;X", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;France" }, { "id": "1RqyBxJU_Wy", "title": "A Flexible Measurement of Diversity in Datasets with Random 
Network Distillation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Generative models are increasingly able to produce remarkably high quality images and text. The community has developed numerous evaluation metrics for comparing generative models. However, these metrics do not always effectively quantify data diversity. We develop a new, more flexible diversity metric that can readily be applied to data, both synthetic and natural, of any type. Our method employs random network distillation, a technique introduced in reinforcement learning. We validate and deploy this metric on both images and text. We further explore diversity in few-shot image generation, a setting which was previously difficult to evaluate.\n", "keywords": "Diversity;Metric;Random Network Distillation;Generative", "primary_area": "", "supplementary_material": "/attachment/23423ace6a337437b68e0ee055e0184ce4e2e2d4.zip", "author": "Liam H Fowl;Micah Goldblum;Arjun Gupta;Amr Sharaf;Tom Goldstein", "authorids": "~Liam_H_Fowl1;~Micah_Goldblum1;~Arjun_Gupta2;~Amr_Sharaf1;~Tom_Goldstein1", "gender": ";;M;M;M", "homepage": ";;https://github.com/Arjung27;http://cs.umd.edu/~amr;https://www.cs.umd.edu/~tomg/", "dblp": "241/6940;241/7231;;159/1156;25/8184", "google_scholar": "IXv3ToAAAAAJ;pGDKzuUAAAAJ;5pcsbisAAAAJ;It3Gm1EAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;", "linkedin": ";;arjung27/;amrsharaf/;", "or_profile": "~Liam_H_Fowl1;~Micah_Goldblum1;~Arjun_Gupta2;~Amr_Sharaf1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;New York University;Zipline International Inc;Microsoft;University of Maryland, College Park", "aff_domain": "umd.edu;nyu.edu;flyzipline.com;microsoft.com;umd.edu", "position": "PhD student;Postdoc;Professional;Researcher;Associate Professor", "bibtex": "@misc{\nfowl2022a,\ntitle={A Flexible Measurement of Diversity in Datasets with Random Network Distillation},\nauthor={Liam H Fowl and Micah Goldblum and Arjun Gupta and Amr Sharaf and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=1RqyBxJU_Wy}\n}", "github": "", "project": "", "reviewers": "jzjC;DaYa;byU6;PMKt", "site": "https://openreview.net/forum?id=1RqyBxJU_Wy", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;3;4", "correctness": "4;2;2;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;1;2;2", "wc_summary_paper": "95;158;44;71", "wc_summary_review": "157;33;14;71", "wc_main_review": "409;266;205;383", "wc_review": "661;457;263;525", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 92.0, 42.16040796766559 ], "wc_summary_review_avg": [ 68.75, 54.92893135679958 ], "wc_main_review_avg": [ 315.75, 83.60435096333205 ], "wc_review_avg": [ 476.5, 143.4878043598131 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:pNFU-BhLZj4J:scholar.google.com/&scioq=A+Flexible+Measurement+of+Diversity+in+Datasets+with+Random+Network+Distillation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Maryland;New York University;Zipline International Inc;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu;https://www.zipline.com;https://www.microsoft.com", "aff_unique_abbr": "UMD;NYU;;Microsoft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "1T5FmILBsq2", "title": "SGORNN: Combining Scalar Gates and Orthogonal Constraints in Recurrent Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recurrent Neural Network (RNN) models have been applied in different domains, producing high accuracies on time-dependent data. However, RNNs have long suffered from exploding gradients during training, mainly due to their recurrent process. In this context, we propose a variant of the scalar gated FastRNN architecture, called Scalar Gated Orthogonal Recurrent Neural Networks (SGORNN). SGORNN utilizes orthogonal linear transformations at the recurrent step. In our experiments, SGORNN forms its recurrent weights through a strategy inspired by Volume Preserving RNNs (VPRNN), though our architecture allows the use of any orthogonal constraint mechanism. We present a simple constraint on the scalar gates of SGORNN, which is easily enforced at training time to provide a theoretical generalization ability for SGORNN similar to that of FastRNN. Our constraint is further motivated by success in experimental settings. Next, we provide bounds on the gradients of SGORNN, to show the impossibility of (exponentially) exploding gradients. Our experimental results on the addition problem confirm that our combination of orthogonal and scalar gated RNNs are able to outperform both predecessor models on long sequences using only a single RNN cell. We further evaluate SGORNN on the HAR-2 classification task, where it improves slightly upon the accuracy of both FastRNN and VPRNN using far fewer parameters than FastRNN. Finally, we evaluate SGORNN on the Penn Treebank word-level language modelling task, where it again outperforms its predecessor architectures. 
Overall, this architecture shows higher representation capacity than VPRNN, suffers from less overfitting than the other two models in our experiments, benefits from a decrease in parameter count, and alleviates exploding gradients when compared with FastRNN on the addition problem.", "keywords": "Deep Learning;Recurrent Neural Networks;Exploding Gradient Problem;Deep Learning Generalization;Orthogonal RNNs", "primary_area": "", "supplementary_material": "/attachment/12248e3559b9162910a27fcfaedf9a45ccf96bbc.zip", "author": "William Keith Taylor-Melanson;Martha Dais Ferreira;Stan Matwin", "authorids": "~William_Keith_Taylor-Melanson1;~Martha_Dais_Ferreira2;~Stan_Matwin2", "gender": ";;M", "homepage": "https://github.com/wtaylor17;;https://web.cs.dal.ca/~stan/", "dblp": ";;", "google_scholar": ";https://scholar.google.com/citations?hl=pt-BR;https://scholar.google.com.tw/citations?user=rCoJeuYAAAAJ", "orcid": ";0000-0002-3078-9634;", "linkedin": "william-taylor-melanson-6a1ab41a8/;martha-dais-ferreira-1373bbb6/;", "or_profile": "~William_Keith_Taylor-Melanson1;~Martha_Dais_Ferreira2;~Stan_Matwin2", "aff": "Dalhousie University;Dalhousie University;", "aff_domain": "dal.ca;dal.ca;", "position": "MS student;Postdoc;", "bibtex": "@misc{\ntaylor-melanson2022sgornn,\ntitle={{SGORNN}: Combining Scalar Gates and Orthogonal Constraints in Recurrent Networks},\nauthor={William Keith Taylor-Melanson and Martha Dais Ferreira and Stan Matwin},\nyear={2022},\nurl={https://openreview.net/forum?id=1T5FmILBsq2}\n}", "github": "", "project": "", "reviewers": "W7nS;815o;aPoP;NMkH", "site": "https://openreview.net/forum?id=1T5FmILBsq2", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;4;4", "correctness": "3;4;4;3", "technical_novelty": "1;1;3;3", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "86;87;83;48", "wc_summary_review": "34;19;37;25", "wc_main_review": "208;76;438;118", "wc_review": "328;182;558;191", "wc_reply_reviewers": "128;31;155;0", "wc_reply_authors": "539;218;472;105", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 1.0 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 76.0, 16.232683080747925 ], "wc_summary_review_avg": [ 28.75, 7.1545440106270926 ], "wc_main_review_avg": [ 210.0, 140.0071426749364 ], "wc_review_avg": [ 314.75, 151.8903798797014 ], "wc_reply_reviewers_avg": [ 78.5, 64.65485287277359 ], "wc_reply_authors_avg": [ 333.5, 178.16074202809102 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": -0.19245008972987526, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12366349520167809320&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Dalhousie University", "aff_unique_dep": "", "aff_unique_url": "https://www.dal.ca", "aff_unique_abbr": "Dal", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Benchmarking the Spectrum of Agent Capabilities", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6250", "id": "1W0z96MFEoH", "poster": "", "openreview":
"https://openreview.net/forum?id=1W0z96MFEoH", "slides": "https://iclr.cc/virtual/2022/poster/6250", "video": "https://iclr.cc/virtual/2022/poster/6250", "tldr": "", "abstract": "Evaluating the general abilities of intelligent agents requires complex simulation environments. Existing benchmarks typically evaluate only one narrow task per environment, requiring researchers to perform expensive training runs on many different environments. We introduce Crafter, an open world survival game with visual inputs that evaluates a wide range of general abilities within a single environment. Agents either learn from the provided reward signal or through intrinsic objectives and are evaluated by semantically meaningful achievements that can be unlocked during each episode, such as discovering resources and crafting tools. Consistently unlocking all achievements requires strong generalization, deep exploration, and long-term reasoning. We experimentally verify that Crafter is of appropriate difficulty to drive future research and provide baselines scores of reward agents and unsupervised agents. Furthermore, we observe sophisticated behaviors emerging from maximizing the reward signal, such as building tunnel systems, bridges, houses, and plantations. We hope that Crafter will accelerate research progress by quickly evaluating a wide spectrum of abilities.", "keywords": "Evaluation;Reinforcement Learning;Environment;Benchmark;Unsupervised Reinforcement Learning;Exploration", "primary_area": "", "supplementary_material": "", "author": "Danijar Hafner", "authorids": "~Danijar_Hafner1", "gender": "", "homepage": "https://danijar.com", "dblp": "184/8088", "google_scholar": "VINmGpYAAAAJ", "orcid": "0000-0002-9534-7271", "linkedin": "", "or_profile": "~Danijar_Hafner1", "aff": "University of Toronto", "aff_domain": "cs.toronto", "position": "PhD student", "bibtex": "@inproceedings{\nhafner2022benchmarking,\ntitle={Benchmarking the Spectrum of Agent Capabilities},\nauthor={Danijar Hafner},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1W0z96MFEoH}\n}", "github": "", "project": "", "reviewers": "ix7A;cMdw;7FaV;Mm58", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "107;100;70;62", "wc_summary_review": "140;43;112;105", "wc_main_review": "532;284;601;534", "wc_review": "779;427;783;701", "wc_reply_reviewers": "401;54;404;0", "wc_reply_authors": "1971;1318;1357;742", "reply_reviewers": "2;1;3;0", "reply_authors": "6;6;5;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 84.75, 19.122957407263137 ], "wc_summary_review_avg": [ 100.0, 35.41892149684968 ], "wc_main_review_avg": [ 487.75, 120.86847190231207 ], "wc_review_avg": [ 672.5, 145.4604757313821 ], "wc_reply_reviewers_avg": [ 214.75, 188.72118985423973 ], "wc_reply_authors_avg": [ 1347.0, 434.8396256092584 ], "reply_reviewers_avg": [ 1.5, 1.118033988749895 ], "reply_authors_avg": [ 4.5, 2.0615528128088303 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 151, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=402017763422030578&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=1W0z96MFEoH", "email": "cs.toronto", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "1XdUvpaTNlM", "title": "BWCP: Probabilistic Learning-to-Prune Channels for ConvNets via Batch Whitening", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work presents a probabilistic channel pruning method to accelerate Convolutional Neural Networks (CNNs). Previous pruning methods often zero out unimportant channels in training in a deterministic manner, which reduces CNN's learning capacity and results in suboptimal performance. To address this problem, we develop a probability-based pruning algorithm, called batch whitening channel pruning (BWCP), which can stochastically discard unimportant channels by modeling the probability of a channel being activated. BWCP has several merits. (1) It simultaneously trains and prunes CNNs from scratch in a probabilistic way, exploring larger network space than deterministic methods. (2) BWCP is empowered by the proposed batch whitening tool, which is able to empirically and theoretically increase the activation probability of useful channels while reducing the probability of unimportant channels without adding any extra parameters and computational cost in inference. (3) Extensive experiments on CIFAR-10, CIFAR-100, and ImageNet with various network architectures show that BWCP outperforms its counterparts by achieving better accuracy given limited computational budgets. 
For example, ResNet50 pruned by BWCP has only 0.58% Top-1 accuracy drop on ImageNet, while reducing 42.9% FLOPs of the plain ResNet50.", "keywords": "Classification;Normalization", "primary_area": "", "supplementary_material": "", "author": "Wenqi Shao;Hang Yu;Zhaoyang Zhang;Hang Xu;Zhenguo Li;Ping Luo", "authorids": "~Wenqi_Shao2;~Hang_Yu6;~Zhaoyang_Zhang1;~Hang_Xu1;~Zhenguo_Li1;~Ping_Luo2", "gender": "M;M;M;M;M;", "homepage": "https://wqshao126.github.io/;;https://zzyfd.github.io/#/;;http://www.ee.columbia.edu/~zgli/;http://luoping.me/", "dblp": "227/3122;;;;23/6479;54/4989-2.html", "google_scholar": "Bs9mrwwAAAAJ;;Pf6o7uAAAAAJ;https://scholar.google.com.hk/citations?user=J_8TX6sAAAAJ;XboZC1AAAAAJ;https://scholar.google.com.hk/citations?hl=en", "orcid": ";0000-0001-7858-8789;;0000-0003-3645-8972;;0000-0002-6685-7950", "linkedin": ";;;;;", "or_profile": "~Wenqi_Shao2;~Hang_Yu6;~Zhaoyang_Zhang1;~Hang_Xu1;~Zhenguo_Li1;~Luo_Ping2", "aff": "The Chinese University of Hong Kong;University of Sydney;The Chinese University of Hong Kong;Huawei Noah\u2018s Ark Lab;Huawei Noah's Ark Lab;The University of Hong Kong", "aff_domain": "cuhk.edu.hk;usyd.edu.au;cuhk.edu.hk;huawei.com;huawei.com;hku.hk", "position": "PhD student;PhD student;PhD student;Researcher;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nshao2022bwcp,\ntitle={{BWCP}: Probabilistic Learning-to-Prune Channels for ConvNets via Batch Whitening},\nauthor={Wenqi Shao and Hang Yu and Zhaoyang Zhang and Hang Xu and Zhenguo Li and Ping Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=1XdUvpaTNlM}\n}", "github": "", "project": "", "reviewers": "PUWD;qyNE;kp3m;8cV6", "site": "https://openreview.net/forum?id=1XdUvpaTNlM", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;5", "correctness": "3;4;2;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "35;62;17;82", "wc_summary_review": "19;36;31;18", "wc_main_review": "377;215;341;246", "wc_review": "431;313;389;346", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 49.0, 24.889756929307286 ], "wc_summary_review_avg": [ 26.0, 7.713624310270756 ], "wc_main_review_avg": [ 294.75, 66.40924257962892 ], "wc_review_avg": [ 369.75, 44.45995389111419 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.2294157338705618, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10996797699356303893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;2;2;3", "aff_unique_norm": "Chinese University of Hong Kong;University of Sydney;Huawei;University of Hong Kong", "aff_unique_dep": ";;Noah's Ark Lab;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sydney.edu.au;https://www.huawei.com;https://www.hku.hk", "aff_unique_abbr": "CUHK;USYD;Huawei;HKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "China;Australia" }, { "id": "1Z3h4rCLvo-", "title": 
"Improving Long-Horizon Imitation Through Language Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Complex, long-horizon planning and its combinatorial nature pose steep challenges for learning-based agents. Difficulties in such settings are exacerbated in low data regimes where over-fitting stifles generalization and compounding errors hurt accuracy. In this work, we explore the use of an often unused source of auxiliary supervision: language. Inspired by recent advances in transformer-based models, we train agents with an instruction prediction loss that encourages learning temporally extended representations that operate at a high level of abstraction. Concretely, we demonstrate that instruction modeling significantly improves performance in planning environments when training with a limited number of demonstrations on the BabyAI and Crafter benchmarks. In further analysis we find that instruction modeling is most important for tasks that require complex reasoning, while understandably offering smaller gains in environments that require simple plans. Our benchmarks and code will be publicly released.", "keywords": "imitation learning;language;planning", "primary_area": "", "supplementary_material": "/attachment/e7f39c10af402fa493ca2e3b4594ac3a0193fa06.zip", "author": "Donald Joseph Hejna III;Pieter Abbeel;Lerrel Pinto", "authorids": "~Donald_Joseph_Hejna_III1;~Pieter_Abbeel2;~Lerrel_Pinto1", "gender": "M;M;M", "homepage": "https://joeyhejna.com;https://people.eecs.berkeley.edu/~pabbeel/;https://www.lerrelpinto.com/", "dblp": "336/3297;;168/8304", "google_scholar": "y_sLoXoAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;pmVPj94AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Donald_Joseph_Hejna_III1;~Pieter_Abbeel2;~Lerrel_Pinto1", "aff": "Stanford University;Covariant;New York University", "aff_domain": "stanford.edu;covariant.ai;cs.nyu.edu", "position": "PhD student;Founder;Assistant Professor", "bibtex": "@misc{\niii2022improving,\ntitle={Improving Long-Horizon Imitation Through Language Prediction},\nauthor={Donald Joseph Hejna III and Pieter Abbeel and Lerrel Pinto},\nyear={2022},\nurl={https://openreview.net/forum?id=1Z3h4rCLvo-}\n}", "github": "", "project": "", "reviewers": "hiJp;z2VY;EYpA;cgzc", "site": "https://openreview.net/forum?id=1Z3h4rCLvo-", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;5;5;4", "correctness": "2;3;3;3", "technical_novelty": "3;1;2;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "31;66;344;72", "wc_summary_review": "25;56;79;40", "wc_main_review": "616;149;234;430", "wc_review": "672;271;657;542", "wc_reply_reviewers": "414;0;0;336", "wc_reply_authors": "1522;771;746;1410", "reply_reviewers": "2;0;0;1", "reply_authors": "3;2;1;3", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 128.25, 125.54356813473161 ], "wc_summary_review_avg": [ 50.0, 20.0124960961895 ], "wc_main_review_avg": [ 357.25, 180.83331413210342 ], "wc_review_avg": [ 535.5, 160.7770195021664 ], "wc_reply_reviewers_avg": [ 187.5, 189.51714961976396 ], "wc_reply_authors_avg": [ 1112.25, 356.06907686571157 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 
-0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-4F4oo3p-p4J:scholar.google.com/&scioq=Improving+Long-Horizon+Imitation+Through+Language+Prediction&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Stanford University;Covariant;New York University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;;https://www.nyu.edu", "aff_unique_abbr": "Stanford;;NYU", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States;" }, { "id": "1Z5P--ntu8", "title": "On the Global Convergence of Gradient Descent for multi-layer ResNets in the mean-field regime", "track": "main", "status": "Reject", "tldr": "", "abstract": "Finding the optimal configuration of parameters in ResNet is a nonconvex minimization problem, but first-order methods nevertheless find the global optimum in the overparameterized regime. We study this phenomenon with mean-field analysis, by translating the training process of ResNet to a gradient-flow partial differential equation (PDE) and examining the convergence properties of this limiting process. \nThe activation function is assumed to be $2$-homogeneous or partially $1$-homogeneous; the regularized ReLU satisfies the latter condition. We show that if the ResNet is sufficiently large, with depth and width depending algebraically on the accuracy and confidence levels, first-order optimization methods can find global minimizers that fit the training data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyan Ding;Shi Chen;Qin Li;Stephen Wright", "authorids": "~Zhiyan_Ding1;~Shi_Chen2;~Qin_Li1;~Stephen_Wright1", "gender": "M;;;M", "homepage": "https://math.berkeley.edu/~zding.m/;;http://www.math.wisc.edu/~qinli/;https://wrightstephen.github.io/sw_proj/", "dblp": "244/9654;;;75/2677", "google_scholar": "jpvAJFkAAAAJ;;;VFQRIOwAAAAJ", "orcid": "0000-0001-8863-403X;;;", "linkedin": ";;;", "or_profile": "~Zhiyan_Ding1;~Shi_Chen2;~Qin_Li1;~Stephen_Wright1", "aff": "University of Wisconsin, Madison;;University of Wisconsin, Madison;University of Wisconsin, Madison", "aff_domain": "wisc.edu;;wisc.edu;wisc.edu", "position": "PhD student;;Associate Professor;Full Professor", "bibtex": "@misc{\nding2022on,\ntitle={On the Global Convergence of Gradient Descent for multi-layer ResNets in the mean-field regime},\nauthor={Zhiyan Ding and Shi Chen and Qin Li and Stephen Wright},\nyear={2022},\nurl={https://openreview.net/forum?id=1Z5P--ntu8}\n}", "github": "", "project": "", "reviewers": "jaQt;Q3Rw;pGat;YcBa", "site": "https://openreview.net/forum?id=1Z5P--ntu8", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;3;5", "correctness": "4;4;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "48;55;58;71", "wc_summary_review": "49;21;28;58", "wc_main_review": "592;292;209;1025", "wc_review": "689;368;295;1154", "wc_reply_reviewers": "0;0;0;292", "wc_reply_authors": "1445;707;444;831", "reply_reviewers": "0;0;0;4", "reply_authors": "2;1;1;4", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 58.0, 8.336666000266533 ], "wc_summary_review_avg": [ 39.0, 15.049916943292411 ], "wc_main_review_avg": [ 
529.5, 319.5907539338396 ], "wc_review_avg": [ 626.5, 338.7023029151116 ], "wc_reply_reviewers_avg": [ 73.0, 126.43970895252804 ], "wc_reply_authors_avg": [ 856.75, 367.2494894482496 ], "reply_reviewers_avg": [ 1.0, 1.7320508075688772 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4181210050035454, "corr_recommendation_correctness": -0.8320502943378437, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12818488739115663068&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Wisconsin", "aff_unique_dep": "", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "1Zxv7TdLquI", "title": "YOUR AUTOREGRESSIVE GENERATIVE MODEL CAN BE BETTER IF YOU TREAT IT AS AN ENERGY-BASED ONE", "track": "main", "status": "Reject", "tldr": "", "abstract": "Autoregressive generative models are commonly used, especially for those tasks involving sequential data. They have, however, been plagued by a slew of inherent flaws due to the intrinsic characteristics of chain-style conditional modeling (e.g., exposure bias or lack of long-range coherence), severely limiting their ability to model distributions properly. In this paper, we propose a unique method for training the autoregressive generative model that takes advantage of a well-designed energy-based learning objective. We show that our method is capable of alleviating the exposure bias problem and increasing temporal coherence by imposing a constraint which fits joint distributions at each time step. Besides, unlike former energy-based models, we estimate energy scores based on the underlying autoregressive network itself, which does not require any extra network. Finally, thanks to importance sampling, we can train the entire model efficiently without requiring an MCMC process. 
Extensive empirical results, covering benchmarks like language modeling, neural machine translation, and image generation, demonstrate the effectiveness of the proposed approach.", "keywords": "autoregressive generative model;exposure bias;energy-based model", "primary_area": "", "supplementary_material": "", "author": "Yezhen Wang;Tong Che;Bo Li;Kaitao Song;Hengzhi Pei;Yoshua Bengio;Dongsheng Li", "authorids": "~Yezhen_Wang1;~Tong_Che1;~Bo_Li23;~Kaitao_Song1;~Hengzhi_Pei1;~Yoshua_Bengio1;~Dongsheng_Li2", "gender": "M;M;M;M;M;M;M", "homepage": ";;https://www.brianboli.com/;;;http://yoshuabengio.org;http://recmind.cn", "dblp": ";125/0738;50/3402-80;222/2082;243/7002;56/953;254/0830-2.html", "google_scholar": "g-VEnLEAAAAJ;7b5tlJkAAAAJ;1_zc1-IAAAAJ;https://scholar.google.com.hk/citations?user=LLk9dR8AAAAJ;Qgc5qxYAAAAJ;kukA0LcAAAAJ;VNg5rA8AAAAJ", "orcid": ";;;;;;0000-0003-3103-8442", "linkedin": ";;brianbo1121/;;;yoshuabengio/?originalSubdomain=ca;", "or_profile": "~Yezhen_Wang1;~Tong_Che1;~Bo_Li23;~Kaitao_Song1;~Hengzhi_Pei1;~Yoshua_Bengio1;~Dongsheng_Li2", "aff": ";NVIDIA;Nanyang Technological University;Microsoft;University of Illinois, Urbana Champaign;University of Montreal;Microsoft Research Asia", "aff_domain": ";nvidia.com;ntu.edu.sg;microsoft.com;illinois.edu;umontreal.ca;microsoft.com", "position": ";Researcher;PhD student;Researcher;MS student;Full Professor;Principal Researcher", "bibtex": "@misc{\nwang2022your,\ntitle={{YOUR} {AUTOREGRESSIVE} {GENERATIVE} {MODEL} {CAN} {BE} {BETTER} {IF} {YOU} {TREAT} {IT} {AS} {AN} {ENERGY}-{BASED} {ONE}},\nauthor={Yezhen Wang and Tong Che and Bo Li and Kaitao Song and Hengzhi Pei and Yoshua Bengio and Dongsheng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=1Zxv7TdLquI}\n}", "github": "", "project": "", "reviewers": "FnWE;SjXn;AQxn;DZsJ;in11", "site": "https://openreview.net/forum?id=1Zxv7TdLquI", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;2;3;4;5", "correctness": "3;3;3;3;3", "technical_novelty": "2;2;4;3;3", "empirical_novelty": "1;3;3;3;3", "wc_summary_paper": "85;113;119;106;62", "wc_summary_review": "45;25;24;45;45", "wc_main_review": "146;341;372;438;302", "wc_review": "276;479;515;589;409", "wc_reply_reviewers": "49;92;138;401;35", "wc_reply_authors": "1158;1544;1315;878;1188", "reply_reviewers": "1;1;1;1;1", "reply_authors": "3;3;3;2;2", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.6, 0.8000000000000002 ], "wc_summary_paper_avg": [ 97.0, 20.92844953645635 ], "wc_summary_review_avg": [ 36.8, 10.047885349664377 ], "wc_main_review_avg": [ 319.8, 97.64097500537365 ], "wc_review_avg": [ 453.6, 106.09354363013803 ], "wc_reply_reviewers_avg": [ 143.0, 133.9328189802634 ], "wc_reply_authors_avg": [ 1216.6, 217.15211258470407 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.6, 0.4898979485566356 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.17902871850985824, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10438817638902637153&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;4;2", "aff_unique_norm": "NVIDIA;Nanyang Technological University;Microsoft;University of Illinois Urbana-Champaign;University of Montreal", "aff_unique_dep": "NVIDIA Corporation;;Microsoft Corporation;;", 
"aff_unique_url": "https://www.nvidia.com;https://www.ntu.edu.sg;https://www.microsoft.com;https://illinois.edu;https://wwwumontreal.ca", "aff_unique_abbr": "NVIDIA;NTU;Microsoft;UIUC;UM", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Urbana-Champaign;Asia", "aff_country_unique_index": "0;1;0;0;2;3", "aff_country_unique": "United States;Singapore;Canada;China" }, { "id": "1_s0_W2V7R", "title": "Amortized Posterior on Latent Variables in Gaussian Process", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks have achieved impressive performance on a variety of domains. However, performing tasks in partially observed, dynamic environments is still an open problem. Gaussian Process (GPs) is well-known for capturing uncertainty in model parameters. However, it simply assumes a fixed Gaussian prior on latent variables. Thus, agents are not able to update their beliefs about latent variables as observing data points. Instead, in this paper, we propose to replace the prior with an amortized posterior, which enables quick adaptation, especially to abrupt changes. Experiments show that our proposed method can adjust behaviors on the fly (e.g., blind \u201cPredator\u201d take 56% more chance to approach \u201cPrey\u201d), correct mistakes to escape bad situations (e.g., 25% \u2191 on avoiding repeating to hit objects with negative rewards), and update beliefs quickly (e.g., 9% faster convergence rate on learning new concepts).", "keywords": "meta-learning;uncertainty estimation", "primary_area": "", "supplementary_material": "/attachment/92e41f6d753106f80c7ea405b12297cda609d228.zip", "author": "Qing Sun", "authorids": "~Qing_Sun2", "gender": "F", "homepage": "https://computing.ece.vt.edu/~sunqing/", "dblp": "https://dblp.uni-trier.de/pers/hd/s/Sun:Qing", "google_scholar": "sSlAO5sAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Qing_Sun2", "aff": "Amazon", "aff_domain": "amazon.com", "position": "Researcher", "bibtex": "@misc{\nsun2022amortized,\ntitle={Amortized Posterior on Latent Variables in Gaussian Process},\nauthor={Qing Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=1_s0_W2V7R}\n}", "github": "", "project": "", "reviewers": "HkWu;Agxi;wwjZ;9ZQ7", "site": "https://openreview.net/forum?id=1_s0_W2V7R", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;3;3;4", "correctness": "2;3;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;2;3;2", "wc_summary_paper": "32;50;52;28", "wc_summary_review": "79;74;40;57", "wc_main_review": "686;821;582;542", "wc_review": "797;945;674;627", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 40.5, 10.618380290797651 ], "wc_summary_review_avg": [ 62.5, 15.337861650177967 ], "wc_main_review_avg": [ 657.75, 107.91750321426085 ], "wc_review_avg": [ 760.75, 123.16325547824725 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:y7vhpV7mNiMJ:scholar.google.com/&scioq=Amortized+Posterior+on+Latent+Variables+in+Gaussian+Process&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon.com, Inc.", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "1bEaEzGwfhP", "title": "Learning to Model Editing Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most existing sequence generation models produce outputs in one pass, usually left-to-right. However, this is in contrast with a more natural approach that humans use in generating content; iterative refinement and editing. Recent work has introduced edit-based models for various tasks (such as neural machine translation and text style transfer), but these generally model a single edit. In this work, we propose to model editing processes, modeling the whole process of iteratively generating sequences. We form a conceptual framework to describe the likelihood of multi-step edits, and describe neural models that can learn a generative model of sequences based on these multi-step edits. We introduce baseline results and metrics on this task, finding that modeling editing processes improves performance on a variety of axes on both our proposed task and related downstream tasks compared to previous single-step models of edits.", "keywords": "Edit;Representation Learning;Source-code;natural language editing", "primary_area": "", "supplementary_material": "", "author": "Machel Reid;Graham Neubig", "authorids": "~Machel_Reid1;~Graham_Neubig1", "gender": ";M", "homepage": "https://machelreid.github.io/;http://phontron.com", "dblp": "260/6668;03/8155", "google_scholar": "N8ctPiIAAAAJ;wlosgkoAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Machel_Reid1;~Graham_Neubig1", "aff": "The University of Tokyo;Carnegie Mellon University", "aff_domain": "u-tokyo.ac.jp;cmu.edu", "position": "Researcher;Associate Professor", "bibtex": "@misc{\nreid2022learning,\ntitle={Learning to Model Editing Processes},\nauthor={Machel Reid and Graham Neubig},\nyear={2022},\nurl={https://openreview.net/forum?id=1bEaEzGwfhP}\n}", "github": "", "project": "", "reviewers": "eci3;DCPz;EhJP;vM6c", "site": "https://openreview.net/forum?id=1bEaEzGwfhP", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;5", "correctness": "3;2;2;2", "technical_novelty": "1;3;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "79;131;160;275", "wc_summary_review": "17;25;168;53", "wc_main_review": "127;354;429;330", "wc_review": "223;510;757;658", "wc_reply_reviewers": "0;0;0;40", "wc_reply_authors": "349;498;824;682", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 161.25, 71.79963440018341 ], "wc_summary_review_avg": [ 65.75, 60.528402423986044 ], "wc_main_review_avg": [ 310.0, 111.78774530332025 ], "wc_review_avg": [ 537.0, 201.47332329616248 ], "wc_reply_reviewers_avg": [ 10.0, 17.320508075688775 ], "wc_reply_authors_avg": [ 588.25, 180.10604515118308 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], 
"corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -1.0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15615331016221008896&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "University of Tokyo;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.cmu.edu", "aff_unique_abbr": "UTokyo;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Japan;United States" }, { "id": "1ch9DLxqF-", "title": "Dominant Datapoints and the Block Structure Phenomenon in Neural Network Hidden Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent work has uncovered a striking phenomenon in large-capacity neural networks: they contain blocks of contiguous hidden layers with highly similar representations. This block structure has two seemingly contradictory properties: on the one hand, its constituent layers have highly similar dominant first principal components (PCs), but on the other hand, their representations, and their common first PC, are highly dissimilar across different random seeds. Our work seeks to reconcile these discrepant properties by investigating the origin of the block structure in relation to the data and training methods. By analyzing properties of the dominant PCs, we find that the block structure arises from dominant datapoints \u2014 a small group of examples that share similar image statistics (e.g. background color). However, the set of dominant datapoints, and the precise shared image statistic, can vary across random seeds. Thus, the block structure reflects meaningful dataset statistics, but is simultaneously unique to each model. Through studying hidden layer activations and creating synthetic datapoints, we demonstrate that these simple image statistics dominate the representational geometry of the layers inside the block structure. We also explore how the phenomenon evolves through training, finding that the block structure takes shape early in training, but the underlying representations and the corresponding dominant datapoints continue to change substantially. Finally, we study the interplay between the block structure and different training mechanisms, introducing a targeted intervention to eliminate the block structure, as well as examining the effects of pretraining and Shake-Shake regularization. 
", "keywords": "representation learning;representational similarity;distributed representations", "primary_area": "", "supplementary_material": "", "author": "Thao Nguyen;Maithra Raghu;Simon Kornblith", "authorids": "~Thao_Nguyen3;~Maithra_Raghu1;~Simon_Kornblith1", "gender": "F;F;M", "homepage": "https://thaonguyen19.github.io/;http://maithraraghu.com/;", "dblp": "77/2922;;220/4059", "google_scholar": "DvJG-_8AAAAJ;tiE4g64AAAAJ;1O3RPmsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Thao_Nguyen3;~Maithra_Raghu1;~Simon_Kornblith1", "aff": "University of Washington, Seattle;Google Brain;Google", "aff_domain": "uw.edu;cornell.edu;google.com", "position": "PhD student;Senior Research Scientist;Research Scientist", "bibtex": "@misc{\nnguyen2022dominant,\ntitle={Dominant Datapoints and the Block Structure Phenomenon in Neural Network Hidden Representations},\nauthor={Thao Nguyen and Maithra Raghu and Simon Kornblith},\nyear={2022},\nurl={https://openreview.net/forum?id=1ch9DLxqF-}\n}", "github": "", "project": "", "reviewers": "9NUZ;1vN3;yvqE;83My;4Z4y", "site": "https://openreview.net/forum?id=1ch9DLxqF-", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "3;4;4;3;3", "correctness": "2;4;2;4;4", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;2;3;3;2", "wc_summary_paper": "78;99;171;36;31", "wc_summary_review": "23;93;134;17;34", "wc_main_review": "362;92;708;58;152", "wc_review": "463;284;1013;111;217", "wc_reply_reviewers": "0;0;549;180;0", "wc_reply_authors": "1079;352;668;650;270", "reply_reviewers": "0;0;1;1;0", "reply_authors": "2;1;2;2;1", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.9797958971132712 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 83.0, 50.86845781031699 ], "wc_summary_review_avg": [ 60.2, 45.735762812048954 ], "wc_main_review_avg": [ 274.4, 241.1485849015084 ], "wc_review_avg": [ 417.6, 318.9530372954614 ], "wc_reply_reviewers_avg": [ 145.8, 213.3132907251679 ], "wc_reply_authors_avg": [ 603.8, 285.2706784792296 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.6, 0.4898979485566356 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3611575592573077, "corr_recommendation_correctness": 0.3611575592573077, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VucjtuBqniUJ:scholar.google.com/&scioq=Dominant+Datapoints+and+the+Block+Structure+Phenomenon+in+Neural+Network+Hidden+Representations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Washington;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": "https://www.washington.edu;https://brain.google.com", "aff_unique_abbr": "UW;Google Brain", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Seattle;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "1eGFH6yYAJn", "title": "Modality Laziness: Everybody's Business is Nobody's Business", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Models fusing multiple modalities receive more information and can outperform their uni-modal counterparts. However, existing multi-modal training approaches often suffer from learning insufficient representations of each modality. 
We theoretically analyze this phenomenon and prove that with more modalities, the models quickly saturate and ignore the features that are hard-to-learn but important. We name this problem of multi-modal training, \\emph{Modality Laziness}. The solution to this problem depends on a notion called paired feature. If there exist no paired features in the data, one may simply run independent training on each modality. Otherwise, we propose Uni-Modal Teacher (UMT), which distills the pre-trained uni-modal features to the corresponding parts in multi-modal models, as a pushing force to tackle the laziness problem. We empirically verify that we can achieve competitive performance on various multi-modal datasets in light of this dichotomy.", "keywords": "multi-modal learning", "primary_area": "", "supplementary_material": "/attachment/bc37afb6a0fc59859388285e4aa6008d6f3b559f.zip", "author": "Chenzhuang Du;Jiaye Teng;Tingle Li;Yichen Liu;Yue Wang;Yang Yuan;Hang Zhao", "authorids": "~Chenzhuang_Du1;~Jiaye_Teng2;~Tingle_Li1;~Yichen_Liu1;~Yue_Wang2;~Yang_Yuan4;~Hang_Zhao1", "gender": "M;M;M;M;M;M;M", "homepage": "https://scholar.google.com/citations?hl=zh-CN&user=VoF-UAEAAAAJ;http://www.tengjiaye.com;https://tinglok.netlify.app/;;https://yuewang.xyz;http://people.iiis.tsinghua.edu.cn/~yuanyang/index.html;http://www.mit.edu/~hangzhao/", "dblp": "275/3182;266/8187;248/9136;;33/4822-41;;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;NGqfK2wAAAAJ;UGpC1zgAAAAJ;;v-AEFIEAAAAJ;;DmahiOYAAAAJ", "orcid": ";0000-0002-4385-5792;;;;;", "linkedin": ";;;yichen-liu1996/;;;", "or_profile": "~Chenzhuang_Du1;~Jiaye_Teng2;~Tingle_Li1;~Yichen_Liu1;~Yue_Wang2;~Yang_Yuan4;~Hang_Zhao1", "aff": "Tsinghua University;Tsinghua University;University of California, Berkeley;Tsinghua University;Massachusetts Institute of Technology;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;iiis.tsinghua.edu.cn;eecs.berkeley.edu;tsinghua.edu.cn;mit.edu;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;MS student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\ndu2022modality,\ntitle={Modality Laziness: Everybody's Business is Nobody's Business},\nauthor={Chenzhuang Du and Jiaye Teng and Tingle Li and Yichen Liu and Yue Wang and Yang Yuan and Hang Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=1eGFH6yYAJn}\n}", "github": "", "project": "", "reviewers": "QHtV;gGWV;6zEA;zMqd", "site": "https://openreview.net/forum?id=1eGFH6yYAJn", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;3;3", "correctness": "2;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;3;0", "wc_summary_paper": "44;175;101;130", "wc_summary_review": "52;25;46;60", "wc_main_review": "476;1121;247;205", "wc_review": "572;1321;394;395", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "20;23;20;24", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 112.5, 47.53156845718433 ], "wc_summary_review_avg": [ 45.75, 12.968712349342937 ], "wc_main_review_avg": [ 512.25, 366.28225113974605 ], "wc_review_avg": [ 670.5, 382.4934639964453 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 21.75, 1.7853571071357126 ], "reply_reviewers_avg": [ 0, 0 ], 
"reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6840977171097183200&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;2;0;0", "aff_unique_norm": "Tsinghua University;University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "THU;UC Berkeley;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;1;0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "1gEb_H1DEqZ", "title": "Logic Pre-Training of Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-trained language models (PrLMs) have been shown useful for enhancing a broad range of natural language understanding (NLU) tasks. However, the capacity for capturing logic relations in challenging NLU still remains a bottleneck even for state-of-the-art PrLM enhancement, which greatly stalled their reasoning abilities. Thus we propose logic pre-training of language models, leading to the logic reasoning ability equipped PrLM, \\textsc{Prophet}. To let logic pre-training perform on a clear, accurate, and generalized knowledge basis, we introduce \\textit{fact} instead of the plain language unit in previous PrLMs. The \\textit{fact} is extracted through syntactic parsing in avoidance of unnecessary complex knowledge injection. Meanwhile, it enables training logic-aware models to be conducted on a more general language text. To explicitly guide the PrLM to capture logic relations, three pre-training objectives are introduced: 1) logical connectives masking to capture sentence-level logics, 2) logical structure completion to accurately capture facts from the original context, 3) logical path prediction on a logical graph to uncover global logic relationships among facts. We evaluate our model on a broad range of NLP and NLU tasks, including natural language inference, relation extraction, and machine reading comprehension with logical reasoning. Results show that the extracted fact and the newly introduced pre-training tasks can help \\textsc{Prophet} achieve significant performance in all the downstream tasks, especially in logic reasoning related tasks. 
", "keywords": "Language Models;Pre-training;Logical Reasoning;Natural Language Understanding", "primary_area": "", "supplementary_material": "/attachment/d946766912e5f06d18400aa46beaa3940dd7a6ac.zip", "author": "Siru Ouyang;Zhuosheng Zhang;hai zhao", "authorids": "~Siru_Ouyang1;~Zhuosheng_Zhang1;~hai_zhao1", "gender": "F;M;M", "homepage": "https://ozyyshr.github.io;https://bcmi.sjtu.edu.cn/~zhangzs/;http://bcmi.sjtu.edu.cn/~zhaohai/", "dblp": "https://dblp.org/search/pid/api?q=author:Siru_Ouyang:;06/9708;25/1145-1.html", "google_scholar": "fetoihAAAAAJ;https://scholar.google.co.jp/citations?user=63LTQhgAAAAJ;https://scholar.google.com.tw/citations?user=4dU5KS0AAAAJ", "orcid": "0009-0001-1331-424X;0000-0002-4183-3645;", "linkedin": ";;", "or_profile": "~Siru_Ouyang1;~Zhuosheng_Zhang1;~hai_zhao1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "Undergrad student;PhD student;Full Professor", "bibtex": "@misc{\nouyang2022logic,\ntitle={Logic Pre-Training of Language Models},\nauthor={Siru Ouyang and Zhuosheng Zhang and hai zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=1gEb_H1DEqZ}\n}", "github": "", "project": "", "reviewers": "K1pp;DQM2;rHDy;cXhg", "site": "https://openreview.net/forum?id=1gEb_H1DEqZ", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;4;4", "correctness": "2;2;3;3", "technical_novelty": "1;2;2;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "45;64;49;140", "wc_summary_review": "52;75;32;64", "wc_main_review": "353;1301;305;405", "wc_review": "450;1440;386;609", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.5, 38.47401720642127 ], "wc_summary_review_avg": [ 55.75, 15.943258763502524 ], "wc_main_review_avg": [ 591.0, 411.4413688485882 ], "wc_review_avg": [ 721.25, 422.8388434143675 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pFn9dv3uhyoJ:scholar.google.com/&scioq=Logic+Pre-Training+of+Language+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "1hw-h1C8bch", "title": "Practical Adversarial Training with Differential Privacy for Deep Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep learning models are often vulnerable to privacy risks and adversarial attacks, rendering them un-trustworthy on crowd-sourced tasks. However, these risks are rarely resolved jointly, despite the fact that there are separate solutions in the security community and the privacy community. 
In this work, we propose practical adversarial training with differential privacy (DP-Adv) to combine the backbones from both communities and deliver robust and private models with high accuracy. Our algorithm is significantly more concise in design than previous approaches, and is capable of incorporating technical advances from both communities. To be specific, DP-Adv can work with all existing DP optimizers and attacking methods off-the-shelf. In particular, DP-Adv is as private as non-robust DP training, and as efficient as non-DP adversarial training. Our experiments on multiple image datasets show that DP-Adv outperforms state-of-the-art methods that preserve robustness and privacy. Furthermore, we observe that adversarial training and DP can notably worsen calibration, but this miscalibration can be mitigated by pre-training. ", "keywords": "adversarial robustness;differential privacy;adversarial training;calibration;deep learning", "primary_area": "", "supplementary_material": "/attachment/f3a59dc628799e1534ad0e3fd5c9633cc4467028.zip", "author": "Zhiqi Bu;Ping Li;Weijie Zhao", "authorids": "~Zhiqi_Bu1;~Ping_Li3;~Weijie_Zhao2", "gender": "M;M;", "homepage": "https://sites.google.com/view/zhiqi-bu;http://www.stat.rutgers.edu/home/pingli/;https://www.cs.rit.edu/~wjz/", "dblp": "245/2573;62/5860-1;135/6597-1", "google_scholar": "MEvTLxIAAAAJ;;c-gzOhwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhiqi_Bu1;~Ping_Li3;~Weijie_Zhao2", "aff": "Amazon;LinkedIn;Rochester Institute of Technology", "aff_domain": "amazon.com;linkedin.com;rit.edu", "position": "Researcher;Engineer;Assistant Professor", "bibtex": "@misc{\nbu2022practical,\ntitle={Practical Adversarial Training with Differential Privacy for Deep Learning},\nauthor={Zhiqi Bu and Ping Li and Weijie Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=1hw-h1C8bch}\n}", "github": "", "project": "", "reviewers": "ECKA;Dcou;43Go;KNYv", "site": "https://openreview.net/forum?id=1hw-h1C8bch", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;3;5", "correctness": "2;1;3;4", "technical_novelty": "1;1;2;3", "empirical_novelty": "0;1;1;3", "wc_summary_paper": "35;65;52;109", "wc_summary_review": "15;26;50;26", "wc_main_review": "311;433;402;186", "wc_review": "361;524;504;321", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 65.25, 27.407799984675894 ], "wc_summary_review_avg": [ 29.25, 12.794041581923986 ], "wc_main_review_avg": [ 333.0, 95.9869782835151 ], "wc_review_avg": [ 427.5, 87.93321329281673 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.7745966692414834, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4035426861056733846&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Amazon;LinkedIn Corporation;Rochester Institute of Technology", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": 
"https://www.amazon.com;https://www.linkedin.com;https://www.rit.edu", "aff_unique_abbr": "Amazon;LinkedIn;RIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "1iDVz-khM4P", "title": "Neural Networks Playing Dough: Investigating Deep Cognition With a Gradient-Based Adversarial Attack", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discovering adversarial examples has shaken our trust in the reliability of deep learning. Even though brilliant works have been devoted to understanding and fixing this vulnerability, fundamental questions (e.g. the mysterious generalization of adversarial examples across models and training sets) remain unanswered. This paper tests the hypothesis that it is not the neural networks failing in learning that causes adversarial vulnerability, but their different perception of the presented data. And therefore, adversarial examples should be semantic-sensitive signals which can provide us with an exceptional opening to understanding neural network learning. To investigate this hypothesis, I performed a gradient-based attack on fully connected feed-forward and convolutional neural networks, instructing them to minimally evolve controlled inputs into adversarial examples for all the classes of the MNIST and Fashion-MNIST datasets. Then I abstracted adversarial perturbations from these examples. The perturbations unveiled vivid and recurring visual structures, unique to each class and persistent over parameters of abstraction methods, model architectures, and training configurations. Furthermore, these patterns proved to be explainable and derivable from the corresponding dataset. This finding explains the generalizability of adversarial examples by, semantically, tying them to the datasets. 
In conclusion, this experiment not only resists the interpretation of adversarial examples as a failure of deep learning but, on the contrary, demystifies them as supporting evidence for the authentic learning capacity of neural networks.", "keywords": "Deep Learning;Adversarial Perturbation;Adversarial Example;Categorical Learning", "primary_area": "", "supplementary_material": "/attachment/0c25b1903ef70150fd59ce4b5a1eb0e81125f9ad.zip", "author": "Sahar Niknam", "authorids": "~Sahar_Niknam1", "gender": "F", "homepage": "http://shrnkm.com", "dblp": "", "google_scholar": "https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-8021-1021", "linkedin": "", "or_profile": "~Sahar_Niknam1", "aff": "University of Luxembourg", "aff_domain": "uni.lu", "position": "PhD student", "bibtex": "@misc{\nniknam2022neural,\ntitle={Neural Networks Playing Dough: Investigating Deep Cognition With a Gradient-Based Adversarial Attack},\nauthor={Sahar Niknam},\nyear={2022},\nurl={https://openreview.net/forum?id=1iDVz-khM4P}\n}", "github": "", "project": "", "reviewers": "P1SQ;feru;9iri;8a9h", "site": "https://openreview.net/forum?id=1iDVz-khM4P", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;2;4", "correctness": "3;2;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "55;136;67;67", "wc_summary_review": "25;101;84;101", "wc_main_review": "131;465;124;762", "wc_review": "211;702;275;930", "wc_reply_reviewers": "0;18;0;0", "wc_reply_authors": "21;52;58;75", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.25, 31.98730216820418 ], "wc_summary_review_avg": [ 77.75, 31.235996862594284 ], "wc_main_review_avg": [ 370.5, 264.7286346431001 ], "wc_review_avg": [ 529.5, 298.4832491112357 ], "wc_reply_reviewers_avg": [ 4.5, 7.794228634059948 ], "wc_reply_authors_avg": [ 51.5, 19.525624189766635 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wK5bODA2A4IJ:scholar.google.com/&scioq=Neural+Networks+Playing+Dough:+Investigating+Deep+Cognition+With+a+Gradient-Based+Adversarial+Attack&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Luxembourg", "aff_unique_dep": "", "aff_unique_url": "https://wwwen.uniluxembourg.lu", "aff_unique_abbr": "Uni Lu", "aff_country_unique_index": "0", "aff_country_unique": "Luxembourg" }, { "id": "1kqWZlj4QYJ", "title": "Learning Two-Step Hybrid Policy for Graph-Based Interpretable Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a two-step hybrid reinforcement learning (RL) policy that is designed to generate interpretable and robust hierarchical policies on the RL problem with graph-based input. Unlike prior deep reinforcement learning policies parameterized by an end-to-end black-box graph neural network, our approach disentangles the decision-making process into two steps. 
The first step is a simplified classification problem that maps the graph input to an action group where all actions share a similar semantic meaning. The second step implements a sophisticated rule-miner that conducts explicit one-hop reasoning over the graph and identifies decisive edges in the graph input without requiring heavy domain knowledge. This two-step hybrid policy presents human-friendly interpretations and achieves better performance in terms of generalization and robustness. Extensive experimental studies on four levels of complex text-based games have demonstrated the superiority of the proposed method compared to the state-of-the-art. ", "keywords": "Interpretable Reinforcement Learning;Generalization;Robustness", "primary_area": "", "supplementary_material": "", "author": "Tongzhou Mu;Kaixiang Lin;Feiyang Niu;Govind Thattai", "authorids": "~Tongzhou_Mu1;~Kaixiang_Lin1;~Feiyang_Niu1;~Govind_Thattai1", "gender": "M;;M;M", "homepage": "http://cseweb.ucsd.edu/~t3mu/;http://kaixianglin.github.io;;", "dblp": "183/0943;;;279/2880", "google_scholar": "uVsZydYAAAAJ;egq785sAAAAJ;;ZiagaFYAAAAJ", "orcid": ";;;", "linkedin": ";;feiyangniu/;govind-thattai-aaa5326/", "or_profile": "~Tongzhou_Mu1;~Kaixiang_Lin1;~Feiyang_Niu1;~Govind_Thattai1", "aff": "University of California, San Diego;Amazon;Amazon;Amazon", "aff_domain": "ucsd.edu;amazon.com;amazon.com;amazon.com", "position": "PhD student;Applied Scientist;Applied Scientist;Principal Scientist", "bibtex": "@misc{\nmu2022learning,\ntitle={Learning Two-Step Hybrid Policy for Graph-Based Interpretable Reinforcement Learning},\nauthor={Tongzhou Mu and Kaixiang Lin and Feiyang Niu and Govind Thattai},\nyear={2022},\nurl={https://openreview.net/forum?id=1kqWZlj4QYJ}\n}", "github": "", "project": "", "reviewers": "3CUH;eXdP;5Hfh;WZ15", "site": "https://openreview.net/forum?id=1kqWZlj4QYJ", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;4", "correctness": "4;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "50;127;88;47", "wc_summary_review": "54;64;90;47", "wc_main_review": "673;215;499;272", "wc_review": "777;406;677;366", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 78.0, 32.58066911528982 ], "wc_summary_review_avg": [ 63.75, 16.315253599009733 ], "wc_main_review_avg": [ 414.75, 183.07699882836184 ], "wc_review_avg": [ 556.5, 174.7004579272762 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15086852676010529738&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of California, San Diego;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.ucsd.edu;https://www.amazon.com", "aff_unique_abbr": "UCSD;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "1nlRIagHDUB", "title": "Coresets for Kernel Clustering", "track": "main", "status": "Reject", "tldr": "", "abstract": "We devise the first coreset for kernel $k$-Means, and use it to obtain new, more efficient, algorithms. Kernel $k$-Means has superior clustering capability compared to classical $k$-Means particularly when clusters are separable non-linearly, but it also introduces significant computational challenges. We address this computational issue by constructing a coreset, which is a reduced dataset that accurately preserves the clustering costs.\n\nOur main result is the first coreset for kernel $k$-Means, whose size is independent of the number of input points $n$, and moreover is constructed in time near-linear in $n$. This result immediately implies new algorithms for kernel $k$-Means, such as a $(1+\\epsilon)$-approximation in time near-linear in $n$, and a streaming algorithm using space and update time $\\mathrm{poly}(k \\epsilon^{-1} \\log n)$. \n\nWe validate our coreset on various datasets with different kernels. Our coreset performs consistently well, achieving small errors while using very few points. We show that our coresets can speed up kernel $k$-Means++ (the kernelized version of the widely used $k$-Means++ algorithm), and we further use this faster kernel $k$-Means++ for spectral clustering. In both applications, we achieve up to 1000x speedup while the error is comparable to baselines that do not use coresets.", "keywords": "coreset;clustering;kernel;PTAS;streaming;spectral clustering;k-means", "primary_area": "", "supplementary_material": "/attachment/c72d7658b71292a62f723f41b1c164116e9260c5.zip", "author": "Shaofeng H.-C. Jiang;Robert Krauthgamer;Jianing Lou;Yubo Zhang", "authorids": "~Shaofeng_H.-C._Jiang1;~Robert_Krauthgamer1;~Jianing_Lou1;~Yubo_Zhang4", "gender": "M;M;M;M", "homepage": "https://shaofengjiang.cn;http://www.wisdom.weizmann.ac.il/~robi/;https://jianinglou.github.io/;http://saigyouji.github.io/", "dblp": "157/6062;k/RobertKrauthgamer;304/2105;", "google_scholar": ";;;", "orcid": "0000-0001-7972-827X;;;", "linkedin": ";;;", "or_profile": "~Shaofeng_H.-C._Jiang1;~Robert_Krauthgamer1;~Jianing_Lou1;~Yubo_Zhang4", "aff": "Peking University;Weizmann Institute of Science;School of EECS, Peking University;Peking University", "aff_domain": "pku.edu.cn;weizmann.ac.il;pku.edu.cn;pku.edu.cn", "position": "Assistant Professor;Full Professor;Undergrad student;Undergrad student", "bibtex": "@misc{\njiang2022coresets,\ntitle={Coresets for Kernel Clustering},\nauthor={Shaofeng H.-C. 
Jiang and Robert Krauthgamer and Jianing Lou and Yubo Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=1nlRIagHDUB}\n}", "github": "", "project": "", "reviewers": "BBxb;AgHC;w9ag", "site": "https://openreview.net/forum?id=1nlRIagHDUB", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;4;4", "correctness": "4;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "14;62;32", "wc_summary_review": "11;53;16", "wc_main_review": "149;231;311", "wc_review": "174;346;359", "wc_reply_reviewers": "0;0;84", "wc_reply_authors": "616;399;331", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 36.0, 19.79898987322333 ], "wc_summary_review_avg": [ 26.666666666666668, 18.732028424302822 ], "wc_main_review_avg": [ 230.33333333333334, 66.13790306792484 ], "wc_review_avg": [ 293.0, 84.31290925277497 ], "wc_reply_reviewers_avg": [ 28.0, 39.59797974644666 ], "wc_reply_authors_avg": [ 448.6666666666667, 121.53554578348216 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15491169655484414638&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Peking University;Weizmann Institute of Science", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.weizmann.org.il", "aff_unique_abbr": "Peking U;Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;Israel" }, { "id": "1oEvY1a67c1", "title": "If your data distribution shifts, use self-learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we demonstrate that self-learning techniques like entropy minimization or pseudo-labeling are simple, yet effective techniques for increasing test performance under domain shifts. Our results show that self-learning consistently increases performance under distribution shifts, irrespective of the model architecture, the pre-training technique or the type of distribution shift. At the same time, self-learning is simple to use in practice because it does not require knowledge or access to the original training data or scheme, is robust to hyperparameter choices, is straight-forward to implement and requires only a few training epochs. This makes self-learning techniques highly attractive for any practitioner who applies machine learning algorithms in the real world. 
We present state-of-the-art adaptation results on CIFAR10-C (8.5% error), ImageNet-C (22.0% mCE), ImageNet-R (17.4% error) and ImageNet-A (14.8% error), theoretically study the dynamics of self-supervised adaptation methods and propose a new classification dataset (ImageNet-D) which is challenging even with adaptation.", "keywords": "Self-Learning;Domain Adaptation;Robustness;Pseudolabeling;Entropy Minimization;Corruption Robustness", "primary_area": "", "supplementary_material": "", "author": "Evgenia Rusak;Steffen Schneider;George Pachitariu;Luisa Eck;Peter Vincent Gehler;Oliver Bringmann;Wieland Brendel;Matthias Bethge", "authorids": "~Evgenia_Rusak1;~Steffen_Schneider1;~George_Pachitariu2;~Luisa_Eck1;~Peter_Vincent_Gehler1;~Oliver_Bringmann1;~Wieland_Brendel1;~Matthias_Bethge1", "gender": "F;;;F;M;M;M;M", "homepage": "https://evgeniarusak.github.io/;https://stes.io;;https://luisaeck.de;https://www.embedded.uni-tuebingen.de;;https://bethgelab.org;http://gehler.io", "dblp": "245/2556;16/8643.html;;;06/6843;37/11107;77/3005;78/1502", "google_scholar": "https://scholar.google.de/citations?user=XKc19kkAAAAJ;https://scholar.google.de/citations?user=KR5dj44AAAAJ;;;pk53ZkAAAAAJ;v-JL-hsAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.de/citations?user=mlSE-YwAAAAJ", "orcid": ";0000-0003-2327-6459;;;0000-0002-1615-507X;;;", "linkedin": ";https://linkedin.com/in/steffen-schneider;george-pachitariu-4b4b8785;;;;;peter-gehler-53550692?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3BbTsmC%2B4qS3SOQfOagpaxaw%3D%3D", "or_profile": "~Evgenia_Rusak1;~Steffen_Schneider1;~George_Pachitariu2;~Luisa_Eck1;~Oliver_Bringmann1;~Wieland_Brendel1;~Matthias_Bethge1;~Peter_Vincent1", "aff": "FAIR;Meta;Tuebingen AI Center;University of Munich, Fakult\u00e4t f\u00fcr Physik;University of T\u00fcbingen;University of Tuebingen;University of Tuebingen;Amazon", "aff_domain": "meta.com;meta.com;tuebingen.ai;campus.lmu.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;amazon.com", "position": "Intern;Intern;Software Engineer;MS student;Full Professor;Group Leader;Full Professor;Researcher", "bibtex": "@misc{\nrusak2022if,\ntitle={If your data distribution shifts, use self-learning},\nauthor={Evgenia Rusak and Steffen Schneider and George Pachitariu and Luisa Eck and Peter Vincent Gehler and Oliver Bringmann and Wieland Brendel and Matthias Bethge},\nyear={2022},\nurl={https://openreview.net/forum?id=1oEvY1a67c1}\n}", "github": "", "project": "", "reviewers": "PUq6;b5KJ;s85k", "site": "https://openreview.net/forum?id=1oEvY1a67c1", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "66;134;94", "wc_summary_review": "147;60;31", "wc_main_review": "776;228;471", "wc_review": "989;422;596", "wc_reply_reviewers": "168;0;378", "wc_reply_authors": "2466;910;502", "reply_reviewers": "1;0;1", "reply_authors": "5;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 98.0, 27.9045993819418 ], "wc_summary_review_avg": [ 79.33333333333333, 49.29052196473025 ], "wc_main_review_avg": [ 491.6666666666667, 224.1968381182532 ], "wc_review_avg": [ 669.0, 237.1623916222806 ], "wc_reply_reviewers_avg": [ 182.0,
154.63505424062166 ], "wc_reply_authors_avg": [ 1292.6666666666667, 846.2266573179762 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5941215717305244351&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;2;3;4;4;5", "aff_unique_norm": "Meta;Tuebingen AI Center;University of Munich;University of T\u00fcbingen;University of Tuebingen;Amazon", "aff_unique_dep": "Facebook AI Research;AI Center;Fakult\u00e4t f\u00fcr Physik;;;Amazon.com, Inc.", "aff_unique_url": "https://research.facebook.com;https://www.uni-tuebingen.de/en/faculties/ai-center.html;https://www.lmu.de;https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/;https://www.amazon.com", "aff_unique_abbr": "FAIR;T\u00fcbingen AI Center;LMU;Uni T\u00fcbingen;Uni T\u00fcbingen;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1;1;1;0", "aff_country_unique": "United States;Germany" }, { "id": "1saVY0lW1x", "title": "Machine Learning Applications in Forecasting of COVID-19 Based on Patients' Individual Symptoms", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Predicting the COVID-19 outbreak has been studied by many researchers in recent years. Many machine learning models have been used to predict transmission in a country or region, but few studies aim to predict whether an individual has been infected by COVID-19. Due to the gravity of this global pandemic, prediction at an individual level is critical. The objective of this paper is to predict whether an individual has COVID-19 based on their symptoms and features. The prediction results can help the government better allocate medical resources during this pandemic. Data for this study on COVID-19 were taken on June 18th from the Israeli Ministry of Health. The purpose of this study is to compare and analyze different models: Support Vector Machine (SVM), Logistic Regression (LR), Naive Bayes (NB), Decision Tree (DT), Random Forest (RF) and Neural Network (NN).", "keywords": "COVID-19;Machine learning;Classification", "primary_area": "", "supplementary_material": "/attachment/d1aaaa01f2b7ae73c126f04f36a0f86e1e7425d7.zip", "author": "Zhanyang Sun;Rui Ding;Xinyu Zhou", "authorids": "~Zhanyang_Sun1;1602022728@qq.com;1084237276@qq.com", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "zhanyang-sun-90b1411b6/;;", "or_profile": "~Zhanyang_Sun1;1602022728@qq.com;1084237276@qq.com", "aff": "Wake Forest University;;", "aff_domain": "wfu.edu;;", "position": "Undergrad student;;", "bibtex": "@misc{\nsun2022machine,\ntitle={Machine Learning Applications in Forecasting of {COVID}-19 Based on Patients' Individual Symptoms},\nauthor={Zhanyang Sun and Rui Ding and Xinyu Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=1saVY0lW1x}\n}", "github": "", "project": "", "reviewers": "onWc;LCNJ;a1G8", "site": "https://openreview.net/forum?id=1saVY0lW1x", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "5;5;5", "correctness": "2;3;2", "technical_novelty": "1;1;1", "empirical_novelty": "1;2;2", "wc_summary_paper": "57;47;68", "wc_summary_review": "60;20;47", "wc_main_review": "123;201;329", "wc_review": "240;268;444", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 57.333333333333336, 8.576453553512405 ], "wc_summary_review_avg": [ 42.333333333333336, 16.659998666133067 ], "wc_main_review_avg": [ 217.66666666666666, 84.92087820763251 ], "wc_review_avg": [ 317.3333333333333, 90.29334908446407 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14822540877341006994&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Wake Forest University", "aff_unique_dep": "", "aff_unique_url": "https://www.wfu.edu", "aff_unique_abbr": "WFU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "1sx0Drq4jfT", "title": "Training Meta-Surrogate Model for Transferable Adversarial Attack", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We consider adversarial attacks on a black-box model when no queries are allowed. In this setting, many methods directly attack surrogate models and transfer the obtained adversarial examples to fool the target model. Many previous works have investigated what kinds of attacks on the surrogate model generate more transferable adversarial examples, but their performance is still limited due to the mismatch between surrogate models and the target model.
In this paper, we tackle this problem from a novel angle---instead of using the original surrogate models, can we obtain a Meta-Surrogate Model (MSM) such that attacks on this model can be more easily transferred to other models? We show that this goal can be mathematically formulated as a well-posed (bi-level-like) optimization problem and design a differentiable attacker to make training feasible. Given one or a set of surrogate models, our method can thus obtain an MSM such that adversarial examples generated on the MSM enjoy excellent transferability. Comprehensive experiments on CIFAR-10 and ImageNet demonstrate that by attacking the MSM, we can obtain stronger transferable adversarial examples to fool black-box models, including adversarially trained ones, with much higher success rates than existing methods. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/0ed2db2c6d5b63ea3aa747a8341930f8aafe8e0a.zip", "author": "Yunxiao Qin;Yuanhao Xiong;Jinfeng Yi;Cho-Jui Hsieh", "authorids": "~Yunxiao_Qin1;~Yuanhao_Xiong1;~Jinfeng_Yi1;~Cho-Jui_Hsieh1", "gender": "M;M;M;M", "homepage": "https://qyxqyx.github.io/homepage;https://xyh97.github.io/;http://jinfengyi.net/;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": "230/4075;232/1248;117/4898;14/2770", "google_scholar": "EMEy3gwAAAAJ;DVKxiMkAAAAJ;lZxRZ84AAAAJ;Wy89g4IAAAAJ", "orcid": "0000-0003-3209-020X;;;", "linkedin": ";;https://www.linkedin.com/nhome/?trk=;", "or_profile": "~Yunxiao_Qin1;~Yuanhao_Xiong1;~Jinfeng_Yi1;~Cho-Jui_Hsieh1", "aff": "Communication University of China;University of California, Los Angeles;JD AI Research;University of California, Los Angeles", "aff_domain": "cuc.edu.cn;cs.ucla.edu;jd.com;ucla.edu", "position": "Lecturer;PhD student;Senior Director;Assistant Professor", "bibtex": "@misc{\nqin2022training,\ntitle={Training Meta-Surrogate Model for Transferable Adversarial Attack},\nauthor={Yunxiao Qin and Yuanhao Xiong and Jinfeng Yi and Cho-Jui Hsieh},\nyear={2022},\nurl={https://openreview.net/forum?id=1sx0Drq4jfT}\n}", "github": "", "project": "", "reviewers": "Ci39;cXab;d2tZ;V2aQ", "site": "https://openreview.net/forum?id=1sx0Drq4jfT", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;3;3;5", "correctness": "3;2;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "51;84;60;44", "wc_summary_review": "17;216;13;24", "wc_main_review": "160;281;175;365", "wc_review": "228;581;248;433", "wc_reply_reviewers": "0;279;0;63", "wc_reply_authors": "732;1371;889;526", "reply_reviewers": "0;3;0;1", "reply_authors": "5;7;4;3", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 59.75, 15.105876340020794 ], "wc_summary_review_avg": [ 67.5, 85.82686059736777 ], "wc_main_review_avg": [ 245.25, 83.3977667566704 ], "wc_review_avg": [ 372.5, 144.4930794190504 ], "wc_reply_reviewers_avg": [ 85.5, 114.63965282571297 ], "wc_reply_authors_avg": [ 879.5, 311.60110718673644 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 4.75, 1.479019945774904 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10812596193748787198&as_sdt=2005&sciodt=0,5&hl=en",
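[Editor's note: as a rough illustration of the meta-surrogate idea in the abstract above, the inner step attacks the trainable MSM with a differentiable attacker, and the outer step updates the MSM so the attack transfers to held-out models. This is a sketch of the general recipe, not the authors' formulation; `msm`, `msm_opt`, and `surrogates` are assumed to exist, and `tanh` is a smooth stand-in for the non-differentiable `sign`.]

```python
# Rough sketch of one meta-surrogate training step (not the authors' code).
import torch
import torch.nn.functional as F

def msm_training_step(msm, msm_opt, surrogates, x, y, eps=8 / 255):
    x = x.clone().requires_grad_(True)
    # Inner step: FGSM-style attack on the MSM, kept in the autograd graph
    # (create_graph=True) so gradients reach the MSM parameters.
    loss_inner = F.cross_entropy(msm(x), y)
    grad = torch.autograd.grad(loss_inner, x, create_graph=True)[0]
    x_adv = (x + eps * torch.tanh(100.0 * grad)).clamp(0.0, 1.0)
    # Outer step: update the MSM so that x_adv also fools held-out models,
    # i.e. maximizes (here: minimizes the negative of) their loss.
    loss_outer = -sum(F.cross_entropy(s(x_adv), y) for s in surrogates)
    msm_opt.zero_grad()
    loss_outer.backward()
    msm_opt.step()
    return loss_outer.item()
```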
"gs_version_total": 6, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Communication University of China;University of California, Los Angeles;JD", "aff_unique_dep": ";;JD AI Research", "aff_unique_url": "http://www.cuc.edu.cn/;https://www.ucla.edu;https://www.jd.com", "aff_unique_abbr": "CUC;UCLA;JD AI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "1uf_kj0GUF-", "title": "Nonparametric Learning of Two-Layer ReLU Residual Units", "track": "main", "status": "Reject", "tldr": "", "abstract": "We describe an algorithm that learns two-layer residual units using rectified linear unit (ReLU) activation: suppose the input $\\mathbf{x}$ is from a distribution with support space $\\mathbb{R}^d$ and the ground-truth generative model is a residual unit of this type, given by $\\mathbf{y} = \\boldsymbol{B}^\\ast\\left[\\left(\\boldsymbol{A}^\\ast\\mathbf{x}\\right)^+ + \\mathbf{x}\\right]$, where ground-truth network parameters $\\boldsymbol{A}^\\ast \\in \\mathbb{R}^{d\\times d}$ represent a nonnegative full-rank matrix and $\\boldsymbol{B}^\\ast \\in \\mathbb{R}^{m\\times d}$ is full-rank with $m \\geq d$ and for $\\boldsymbol{c} \\in \\mathbb{R}^d$, $[\\boldsymbol{c}^{+}]_i = \\max\\{0, c_i\\}$. We design layer-wise objectives as functionals whose analytic minimizers express the exact ground-truth network in terms of its parameters and nonlinearities. Following this objective landscape, learning residual units from finite samples can be formulated using convex optimization of a nonparametric function: for each layer, we first formulate the corresponding empirical risk minimization (ERM) as a positive semi-definite quadratic program (QP), then we show the solution space of the QP can be equivalently determined by a set of linear inequalities, which can then be efficiently solved by linear programming (LP). 
We further prove the statistical strong consistency of our algorithm, and demonstrate its robustness and sample efficiency through experimental results.", "keywords": "neural network learning;nonparametric methods;convex optimization", "primary_area": "", "supplementary_material": "/attachment/e4739d2debeb60b6924a4206e877735462b3f6ae.zip", "author": "Zhunxuan Wang;Linyun He;Chunchuan Lyu;Shay B Cohen", "authorids": "~Zhunxuan_Wang1;~Linyun_He1;~Chunchuan_Lyu1;~Shay_B_Cohen1", "gender": "M;;M;M", "homepage": "https://uuzeeex.github.io/;;;http://homepages.inf.ed.ac.uk/scohen", "dblp": "251/3234;;172/1054;04/5629", "google_scholar": ";UK0VCxsAAAAJ;;", "orcid": ";;;0000-0003-4753-8353", "linkedin": ";;;", "or_profile": "~Zhunxuan_Wang1;~Linyun_He1;~Chunchuan_Lyu1;~Shay_B_Cohen1", "aff": "Amazon;Pennsylvania State University;;University of Edinburgh", "aff_domain": "amazon.com;psu.edu;;ed.ac.uk", "position": "Software Engineer;PhD student;;Reader", "bibtex": "@misc{\nwang2022nonparametric,\ntitle={Nonparametric Learning of Two-Layer Re{LU} Residual Units},\nauthor={Zhunxuan Wang and Linyun He and Chunchuan Lyu and Shay B Cohen},\nyear={2022},\nurl={https://openreview.net/forum?id=1uf_kj0GUF-}\n}", "github": "", "project": "", "reviewers": "3xzu;T6Yf;1xAR;UrKE", "site": "https://openreview.net/forum?id=1uf_kj0GUF-", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;4", "correctness": "3;4;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "43;126;59;301", "wc_summary_review": "43;308;47;54", "wc_main_review": "285;287;109;119", "wc_review": "371;721;215;474", "wc_reply_reviewers": "0;185;0;27", "wc_reply_authors": "531;571;111;229", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 132.25, 102.2823909575837 ], "wc_summary_review_avg": [ 113.0, 112.65211937642363 ], "wc_main_review_avg": [ 200.0, 86.07554821202127 ], "wc_review_avg": [ 445.25, 183.97876915557404 ], "wc_reply_reviewers_avg": [ 53.0, 77.00324668479895 ], "wc_reply_authors_avg": [ 360.5, 195.52685237583097 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15022705893377420931&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2", "aff_unique_norm": "Amazon;Pennsylvania State University;University of Edinburgh", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.psu.edu;https://www.ed.ac.uk", "aff_unique_abbr": "Amazon;PSU;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Cold Brew: Distilling Graph Node Representations with Incomplete or Missing Neighborhoods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7094", "id": "1ugNpm7W6E", "poster": "", "openreview": "https://openreview.net/forum?id=1ugNpm7W6E", "slides": "https://iclr.cc/virtual/2022/poster/7094", "video": "https://iclr.cc/virtual/2022/poster/7094", 
"author_site": "Wenqing Zheng, Edward Huang, Nikhil Rao, Sumeet Katariya, Zhangyang Wang, Karthik Subbian", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have achieved state-of-the-art performance in node classification, regression, and recommendation tasks. GNNs work well when rich and high-quality connections are available. However, their effectiveness is often jeopardized in many real-world graphs in which node degrees have power-law distributions. The extreme case of this situation, where a node may have no neighbors, is called Strict Cold Start (SCS). SCS forces the prediction to rely completely on the node's own features. We propose Cold Brew, a teacher-student distillation approach to address the SCS and noisy-neighbor challenges for GNNs. We also introduce feature contribution ratio (FCR), a metric to quantify the behavior of inductive GNNs to solve SCS. We experimentally show that FCR disentangles the contributions of different graph data components and helps select the best architecture for SCS generalization. We further demonstrate the superior performance of Cold Brew on several public benchmark and proprietary e-commerce datasets, where many nodes have either very few or noisy connections. Our source code is available at https://github.com/amazon-research/gnn-tail-generalization.", "keywords": "Graph Neural Networks;Cold Start;Knowledge Distillation", "primary_area": "", "supplementary_material": "", "author": "Wenqing Zheng;Edward W Huang;Nikhil Rao;Sumeet Katariya;Zhangyang Wang;Karthik Subbian", "authorids": "~Wenqing_Zheng1;~Edward_W_Huang1;~Nikhil_Rao1;~Sumeet_Katariya1;~Zhangyang_Wang1;~Karthik_Subbian1", "gender": "M;M;M;;M;M", "homepage": "https://wenqing-zheng.github.io;;;;https://vita-group.github.io;http://mailtosuka.googlepages.com", "dblp": ";192/2417.html;57/9513.html;72/9639;119/4026;32/5843", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;EqvdkCAAAAAJ;GhqD_rwAAAAJ;;pxFyKAIAAAAJ;", "orcid": "0000-0002-8283-7511;0000-0002-4461-8545;;;;", "linkedin": ";ewhuang/;nikhil-rao-012068a1/;;;", "or_profile": "~Wenqing_Zheng1;~Edward_W_Huang1;~Nikhil_Rao1;~Sumeet_Katariya1;~Zhangyang_Wang1;~Karthik_Subbian1", "aff": "University of Texas, Austin;Amazon;Amazon;Amazon;University of Texas, Austin;Amazon", "aff_domain": "utexas.edu;amazon.com;amazon.com;amazon.com;utexas.edu;amazon.com", "position": "PhD student;Applied Scientist;Scientist;Applied Scientist;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nzheng2022cold,\ntitle={Cold Brew: Distilling Graph Node Representations with Incomplete or Missing Neighborhoods},\nauthor={Wenqing Zheng and Edward W Huang and Nikhil Rao and Sumeet Katariya and Zhangyang Wang and Karthik Subbian},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1ugNpm7W6E}\n}", "github": "", "project": "", "reviewers": "cztX;svEE;YxwQ;kTEC", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;3;4", "correctness": "3;4;3;3", "technical_novelty": "4;3;3;2", "empirical_novelty": "0;2;3;2", "wc_summary_paper": "84;83;105;59", "wc_summary_review": "35;37;82;26", "wc_main_review": "210;208;360;260", "wc_review": "329;328;547;345", "wc_reply_reviewers": "19;24;18;5", "wc_reply_authors": "995;479;1530;944", "reply_reviewers": "1;1;1;1", "reply_authors": "4;3;5;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], 
"empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 82.75, 16.284578594486256 ], "wc_summary_review_avg": [ 45.0, 21.760055146988943 ], "wc_main_review_avg": [ 259.5, 61.65022303284879 ], "wc_review_avg": [ 387.25, 92.47803793333853 ], "wc_reply_reviewers_avg": [ 16.5, 7.0178344238090995 ], "wc_reply_authors_avg": [ 987.0, 372.4332154897036 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6445832848440992452&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "pdf": "https://openreview.net/pdf?id=1ugNpm7W6E", "email": "utexas.edu;amazon.com;amazon.com;amazon.com;utexas.edu;amazon.com", "author_num": 6, "aff_unique_index": "0;1;1;1;0;1", "aff_unique_norm": "University of Texas at Austin;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.utexas.edu;https://www.amazon.com", "aff_unique_abbr": "UT Austin;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "1v1N7Zhmgcx", "title": "Maximum Likelihood Training of Parametrized Diffusion Model", "track": "main", "status": "Reject", "tldr": "", "abstract": "Whereas the diverse variations of the diffusion model exist in image synthesis, the previous variations have not innovated the diffusing mechanism by maintaining the static linear diffusion. Meanwhile, it is intuitive that there would be more promising diffusion pattern adapted to the data distribution. This paper introduces such adaptive and nonlinear diffusion method for the score-based diffusion models. Unlike the static and linear VE-or-VP SDEs of the previous diffusion models, our parameterized diffusion model (PDM) learns the optimal diffusion process by combining the normalizing flow ahead of the diffusion process. Specifically, PDM utilizes the flow to non-linearly transform a data variable into a latent variable, and PDM applies the diffusion process to the transformed latent distribution with the linear diffusing mechanism. Subsequently, PDM enjoys the nonlinear and learned diffusion from the perspective of the data variable. This model structure is feasible because of the invertibility of the flow. We train PDM with the variational proxy of the log-likelihood, and we prove that the variational gap between the variational bound and the log-likelihood becomes tight when the normalizing flow becomes the optimal. 
", "keywords": "Score-based Diffusion Model;Normalizing Flow Model;Variational Inference;Variational Gap;Stochastic Calculus", "primary_area": "", "supplementary_material": "", "author": "Dongjun Kim;Byeonghu Na;Se Jung Kwon;Dongsoo Lee;Wanmo Kang;Il-chul Moon", "authorids": "~Dongjun_Kim1;~Byeonghu_Na1;~Se_Jung_Kwon1;~Dongsoo_Lee1;~Wanmo_Kang1;~Il-chul_Moon1", "gender": "M;M;M;M;M;", "homepage": "https://sites.google.com/view/dongjun-kim?pli=1;https://sites.google.com/view/byeonghu-na;;;https://sites.google.com/site/wanmokang/;", "dblp": "03/4394;276/5100;119/5676;11/9680;;", "google_scholar": "https://scholar.google.com/citations?hl=ko;https://scholar.google.co.kr/citations?user=mJoqpmEAAAAJ;https://scholar.google.co.kr/citations?user=8eTxKOkAAAAJ;ALiieEkAAAAJ;;", "orcid": ";0000-0003-3463-2674;;;;", "linkedin": ";byeonghu-na-17942120b/;se-jung-kwon-305503175/;;;", "or_profile": "~Dongjun_Kim1;~Byeonghu_Na1;~Se_Jung_Kwon1;~Dongsoo_Lee1;~Wanmo_Kang1;~Il-chul_Moon1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;NAVER Cloud;NAVER CLOVA;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;navercorp.com;navercorp.com;kaist.ac.kr;", "position": "PhD student;PhD student;AI Researcher;Executive Officer;Professor;", "bibtex": "@misc{\nkim2022maximum,\ntitle={Maximum Likelihood Training of Parametrized Diffusion Model},\nauthor={Dongjun Kim and Byeonghu Na and Se Jung Kwon and Dongsoo Lee and Wanmo Kang and Il-chul Moon},\nyear={2022},\nurl={https://openreview.net/forum?id=1v1N7Zhmgcx}\n}", "github": "", "project": "", "reviewers": "wDxP;4sPP;qGLk;LLCZ", "site": "https://openreview.net/forum?id=1v1N7Zhmgcx", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;3", "correctness": "4;4;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "54;126;106;41", "wc_summary_review": "10;121;178;32", "wc_main_review": "536;1197;438;277", "wc_review": "600;1444;722;350", "wc_reply_reviewers": "0;483;0;63", "wc_reply_authors": "482;3326;2414;2807", "reply_reviewers": "0;2;0;1", "reply_authors": "2;5;5;5", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 81.75, 35.27304211433995 ], "wc_summary_review_avg": [ 85.25, 67.78412424749618 ], "wc_main_review_avg": [ 612.0, 350.17923981869626 ], "wc_review_avg": [ 779.0, 406.68046424680887 ], "wc_reply_reviewers_avg": [ 136.5, 201.69841347913473 ], "wc_reply_authors_avg": [ 2257.25, 1074.7714582645 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 4.25, 1.299038105676658 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13136649681273310602&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;NAVER Corporation", "aff_unique_dep": ";Cloud Division", "aff_unique_url": "https://www.kaist.ac.kr;https://www.naver.com", "aff_unique_abbr": "KAIST;NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Simple GNN Regularisation for 3D 
Molecular Property Prediction and Beyond", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5968", "id": "1wVvweK3oIb", "poster": "", "openreview": "https://openreview.net/forum?id=1wVvweK3oIb", "slides": "https://iclr.cc/virtual/2022/poster/5968", "video": "https://iclr.cc/virtual/2022/poster/5968", "author_site": "Jonathan Godwin, Michael Schaarschmidt, Alexander Gaunt, Alvaro Sanchez Gonzalez, Yulia Rubanova, Petar Veli\u010dkovi\u0107, James Kirkpatrick, Peter Battaglia", "tldr": "", "abstract": "In this paper we show that simple noisy regularisation can be an effective way to address oversmoothing. We first argue that regularisers addressing oversmoothing should both penalise node latent similarity and encourage meaningful node representations. From this observation we derive \u201cNoisy Nodes\u201d, a simple technique in which we corrupt the input graph with noise and add a noise-correcting node-level loss. The diverse node-level loss encourages latent node diversity, and the denoising objective encourages graph manifold learning. Our regulariser applies well-studied methods in simple, straightforward ways which allow even generic architectures to overcome oversmoothing and achieve state-of-the-art results on quantum chemistry tasks such as QM9 and Open Catalyst, and improve results significantly on Open Graph Benchmark (OGB) datasets. Our results suggest Noisy Nodes can serve as a complementary building block in the GNN toolkit.", "keywords": "Graph Neural Networks;GNNs;Deep Learning;Molecular Property Prediction", "primary_area": "", "supplementary_material": "", "author": "Jonathan Godwin;Michael Schaarschmidt;Alexander L Gaunt;Alvaro Sanchez-Gonzalez;Yulia Rubanova;Petar Veli\u010dkovi\u0107;James Kirkpatrick;Peter Battaglia", "authorids": "~Jonathan_Godwin1;~Michael_Schaarschmidt1;~Alexander_L_Gaunt1;~Alvaro_Sanchez-Gonzalez1;~Yulia_Rubanova2;~Petar_Veli\u010dkovi\u01071;~James_Kirkpatrick1;~Peter_Battaglia1", "gender": ";M;M;M;M;;M;F", "homepage": "https://www.linkedin.com/in/jonathan-godwin-12907638/;;;;https://petar-v.com;;;https://yuliarubanova.github.io/", "dblp": ";52/4278;185/1083;222/1889;184/4786.html;124/5973;41/3400;222/3085", "google_scholar": ";;;https://scholar.google.co.uk/citations?user=d1oQ8NcAAAAJ;https://scholar.google.co.uk/citations?user=kcTK_FAAAAAJ;;https://scholar.google.co.uk/citations?user=nQ7Ij30AAAAJ;u_HzE9wAAAAJ", "orcid": ";;;;0000-0002-2820-4692;;;", "linkedin": ";;;;petarvelickovic;;;https://linkedin.com/in/yulia-rubanova-031702100", "or_profile": "~Jonathan_Godwin1;~Michael_Schaarschmidt1;~Alexander_L_Gaunt1;~Alvaro_Sanchez-Gonzalez1;~Petar_Veli\u010dkovi\u01071;~James_Kirkpatrick1;~Peter_Battaglia1;~Yulia_Rubanova1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google;Google DeepMind;Google DeepMind", "aff_domain": "deepmind.com;deepmind.com;google.com;google.com;google.com;google.com;google.com;deepmind.com", "position": "Researcher;Research scientist;Researcher;Senior Research Engineer;Senior Staff Research Scientist;Researcher;Researcher;Research Scientist", "bibtex": "@inproceedings{\ngodwin2022simple,\ntitle={Simple {GNN} Regularisation for 3D Molecular Property Prediction and Beyond},\nauthor={Jonathan Godwin and Michael Schaarschmidt and Alexander L Gaunt and Alvaro Sanchez-Gonzalez and Yulia Rubanova and Petar Veli{\\v{c}}kovi{\\'c} and James Kirkpatrick and Peter Battaglia},\nbooktitle={International Conference on Learning
Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1wVvweK3oIb}\n}", "github": "", "project": "", "reviewers": "7LGq;Y3os;yQtf;kA11", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;2", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "46;78;77;41", "wc_summary_review": "37;49;76;15", "wc_main_review": "236;437;350;111", "wc_review": "319;564;503;167", "wc_reply_reviewers": "0;22;14;11", "wc_reply_authors": "1377;1508;846;497", "reply_reviewers": "0;1;1;1", "reply_authors": "3;3;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 60.5, 17.09532099727876 ], "wc_summary_review_avg": [ 44.25, 22.01561945528674 ], "wc_main_review_avg": [ 283.5, 122.47142523870619 ], "wc_review_avg": [ 388.25, 156.36715607825064 ], "wc_reply_reviewers_avg": [ 11.75, 7.8859051477937525 ], "wc_reply_authors_avg": [ 1057.0, 407.40704461263306 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14174083926130472805&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=1wVvweK3oIb", "email": "deepmind.com;deepmind.com;google.com;google.com;google.com;google.com;google.com;deepmind.com", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Learning Multimodal VAEs through Mutual Supervision", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6307", "id": "1xXvPrAshao", "poster": "", "openreview": "https://openreview.net/forum?id=1xXvPrAshao", "slides": "https://iclr.cc/virtual/2022/poster/6307", "video": "https://iclr.cc/virtual/2022/poster/6307", "author_site": "Tom Joy, Yuge Shi, Philip Torr, Tom Rainforth, Sebastian Schmon, Siddharth N", "tldr": "", "abstract": "Multimodal VAEs seek to model the joint distribution over heterogeneous data (e.g.\\ vision, language), whilst also capturing a shared representation across such modalities. Prior work has typically combined information from the modalities by reconciling idiosyncratic representations directly in the recognition model through explicit products, mixtures, or other such factorisations. Here we introduce a novel alternative, the MEME, that avoids such explicit combinations by repurposing semi-supervised VAEs to combine information between modalities implicitly through mutual supervision. This formulation naturally allows learning from partially-observed data where some modalities can be entirely missing---something that most existing approaches either cannot handle, or do so to a limited extent. We demonstrate that MEME outperforms baselines on standard metrics across both partial and complete observation schemes on the MNIST-SVHN (image--image) and CUB (image--text) datasets. 
We also contrast the quality of the representations learnt by mutual supervision against standard approaches and observe interesting trends in its ability to capture relatedness between data.", "keywords": "Multimodal Variational Autoencoder;Variational Autoencoder", "primary_area": "", "supplementary_material": "", "author": "Tom Joy;Yuge Shi;Philip Torr;Tom Rainforth;Sebastian M Schmon;Siddharth N", "authorids": "~Tom_Joy1;~Yuge_Shi1;~Philip_Torr1;~Tom_Rainforth1;~Sebastian_M_Schmon1;~Siddharth_N1", "gender": "M;F;;M;M;M", "homepage": "https://thwjoy.github.io;https://yugeten.github.io/;http://www.robots.ox.ac.uk/~tvg/;http://www.robots.ox.ac.uk/~twgr;https://schmons.github.io/;https://homepages.inf.ed.ac.uk/snaraya3/", "dblp": ";227/4684;;166/1198;242/3324;67/8366", "google_scholar": "thwjoy;https://scholar.google.co.uk/citations?user=t6B_Z7MAAAAJ;;https://scholar.google.co.uk/citations?user=ieLRNKMAAAAJ;https://scholar.google.de/citations?user=hs2WrYYAAAAJ;V7D7hxMAAAAJ", "orcid": ";;;;;0000-0003-4911-7333", "linkedin": ";;;;;", "or_profile": "~Tom_Joy1;~Yuge_Shi1;~Philip_Torr1;~Tom_Rainforth1;~Sebastian_M_Schmon1;~Siddharth_N1", "aff": "University of Oxford, University of Oxford;University of Oxford;University of Oxford;;Improbable;University of Edinburgh", "aff_domain": "robots.ox.ac.uk;ox.ac.uk;ox.ac.uk;ox.ac.uk;improbable.io;ed.ac.uk", "position": "PhD student;PhD student;Full Professor;Postdoc;Researcher;Reader (Associate Professor)", "bibtex": "@inproceedings{\njoy2022learning,\ntitle={Learning Multimodal {VAE}s through Mutual Supervision},\nauthor={Tom Joy and Yuge Shi and Philip Torr and Tom Rainforth and Sebastian M Schmon and Siddharth N},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1xXvPrAshao}\n}", "github": "", "project": "", "reviewers": "ELno;WKus;ag7M;1avA", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;5;4", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "73;106;62;107", "wc_summary_review": "46;56;48;78", "wc_main_review": "441;403;391;290", "wc_review": "560;565;501;475", "wc_reply_reviewers": "0;37;0;0", "wc_reply_authors": "275;161;414;139", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 87.0, 19.887181801351343 ], "wc_summary_review_avg": [ 57.0, 12.68857754044952 ], "wc_main_review_avg": [ 381.25, 55.82282239371277 ], "wc_review_avg": [ 525.25, 38.408169703853375 ], "wc_reply_reviewers_avg": [ 9.25, 16.021469970012117 ], "wc_reply_authors_avg": [ 247.25, 109.23912989400822 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3458572319330373, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8935371068156409953&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=1xXvPrAshao", "email": "robots.ox.ac.uk;ox.ac.uk;ox.ac.uk;ox.ac.uk;improbable.io;ed.ac.uk", "author_num": 6, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Oxford;Improbable;University of Edinburgh", 
"aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://improbable.io;https://www.ed.ac.uk", "aff_unique_abbr": "Oxford;Improbable;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Imitation Learning by Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6902", "id": "1zwleytEpYx", "poster": "", "openreview": "https://openreview.net/forum?id=1zwleytEpYx", "slides": "https://iclr.cc/virtual/2022/poster/6902", "video": "https://iclr.cc/virtual/2022/poster/6902", "tldr": "", "abstract": "Imitation learning algorithms learn a policy from demonstrations of expert behavior. We show that, for deterministic experts, imitation learning can be done by reduction to reinforcement learning with a stationary reward. Our theoretical analysis both certifies the recovery of expert reward and bounds the total variation distance between the expert and the imitation learner, showing a link to adversarial imitation learning. We conduct experiments which confirm that our reduction works well in practice for continuous control tasks.", "keywords": "reinforcement learning;imitation learning;Markov Decision Process;continuous control", "primary_area": "", "supplementary_material": "", "author": "Kamil Ciosek", "authorids": "~Kamil_Ciosek1", "gender": "", "homepage": "http://www.ciosek.net", "dblp": "82/12", "google_scholar": "", "orcid": "", "linkedin": "ciosek/", "or_profile": "~Kamil_Ciosek1", "aff": "Spotify", "aff_domain": "spotify.com", "position": "Researcher", "bibtex": "@inproceedings{\nciosek2022imitation,\ntitle={Imitation Learning by Reinforcement Learning},\nauthor={Kamil Ciosek},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=1zwleytEpYx}\n}", "github": "", "project": "", "reviewers": "TFjU;VY5r;7ezW;XK3g", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;4", "correctness": "4;3;4;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "118;55;69;88", "wc_summary_review": "41;94;26;96", "wc_main_review": "477;333;186;157", "wc_review": "636;482;281;341", "wc_reply_reviewers": "156;371;0;0", "wc_reply_authors": "681;813;239;347", "reply_reviewers": "1;3;0;0", "reply_authors": "2;4;1;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.5, 23.606143268225754 ], "wc_summary_review_avg": [ 64.25, 31.21197686786276 ], "wc_main_review_avg": [ 288.25, 127.77983995920484 ], "wc_review_avg": [ 435.0, 137.07844469499938 ], "wc_reply_reviewers_avg": [ 131.75, 152.10584308303214 ], "wc_reply_authors_avg": [ 520.0, 234.8723057322851 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5663632794147354936&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=1zwleytEpYx", "email": "spotify.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Spotify", 
"aff_unique_dep": "", "aff_unique_url": "https://www.spotify.com", "aff_unique_abbr": "Spotify", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "title": "Stein Latent Optimization for Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6332", "id": "2-mkiUs9Jx7", "poster": "", "openreview": "https://openreview.net/forum?id=2-mkiUs9Jx7", "slides": "https://iclr.cc/virtual/2022/poster/6332", "video": "https://iclr.cc/virtual/2022/poster/6332", "author_site": "Uiwon Hwang, Heeseung Kim, Dahuin Jung, Hyemi Jang, Hyungyu Lee, Sungroh Yoon", "tldr": "", "abstract": "Generative adversarial networks (GANs) with clustered latent spaces can perform conditional generation in a completely unsupervised manner. In the real world, the salient attributes of unlabeled data can be imbalanced. However, most of existing unsupervised conditional GANs cannot cluster attributes of these data in their latent spaces properly because they assume uniform distributions of the attributes. To address this problem, we theoretically derive Stein latent optimization that provides reparameterizable gradient estimations of the latent distribution parameters assuming a Gaussian mixture prior in a continuous latent space. Structurally, we introduce an encoder network and novel unsupervised conditional contrastive loss to ensure that data generated from a single mixture component represent a single attribute. We confirm that the proposed method, named Stein Latent Optimization for GANs (SLOGAN), successfully learns balanced or imbalanced attributes and achieves state-of-the-art unsupervised conditional generation performance even in the absence of attribute information (e.g., the imbalance ratio). Moreover, we demonstrate that the attributes to be learned can be manipulated using a small amount of probe data.", "keywords": "Generative Adversarial Networks;Unsupervised Conditional GANs", "primary_area": "", "supplementary_material": "", "author": "Uiwon Hwang;Heeseung Kim;Dahuin Jung;Hyemi Jang;Hyungyu Lee;Sungroh Yoon", "authorids": "~Uiwon_Hwang1;~Heeseung_Kim1;~Dahuin_Jung2;~Hyemi_Jang1;~Hyungyu_Lee1;~Sungroh_Yoon1", "gender": "M;M;F;;;M", "homepage": "https://sites.google.com/view/uiwon-hwang;https://gmltmd789.github.io;https://hai.ssu.ac.kr/;http://data.snu.ac.kr;http://ailab.snu.ac.kr;https://snu.ac.kr", "dblp": "207/8512;294/8710;224/0158;224/0270;99/1474;", "google_scholar": "https://scholar.google.co.kr/citations?user=CJ8-pGIAAAAJ;4ojbJpoAAAAJ;https://scholar.google.co.kr/citations?user=wleS-UQAAAAJ;;Bphl_fIAAAAJ;", "orcid": "0000-0001-5054-2236;;;0000-0002-7736-0528;0000-0002-2367-197X;", "linkedin": "uiwon-hwang/;gmltmd789/;;;;", "or_profile": "~Uiwon_Hwang1;~Heeseung_Kim1;~Dahuin_Jung2;~Hyemi_Jang1;~Sungroh_Yoon1;~Hyungyu_Lee2", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\nhwang2022stein,\ntitle={Stein Latent Optimization for Generative Adversarial Networks},\nauthor={Uiwon Hwang and Heeseung Kim and Dahuin Jung and Hyemi Jang and Hyungyu Lee and Sungroh Yoon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2-mkiUs9Jx7}\n}", "github": "", "project": 
"", "reviewers": "9K6Q;5GSd;bra9;VThV", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;3;4", "correctness": "4;3;4;3", "technical_novelty": "3;4;3;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "114;55;70;50", "wc_summary_review": "98;23;31;77", "wc_main_review": "250;213;101;280", "wc_review": "462;291;202;407", "wc_reply_reviewers": "71;0;33;86", "wc_reply_authors": "711;1070;788;1118", "reply_reviewers": "1;0;1;2", "reply_authors": "3;3;4;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.25, 25.202926417382564 ], "wc_summary_review_avg": [ 57.25, 31.275989192989563 ], "wc_main_review_avg": [ 211.0, 67.79749257900325 ], "wc_review_avg": [ 340.5, 101.01608782763269 ], "wc_reply_reviewers_avg": [ 47.5, 33.544746235439014 ], "wc_reply_authors_avg": [ 921.75, 175.21183607279502 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.5, 0.5 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14809143039614633477&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=2-mkiUs9Jx7", "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "24N4XH2NaYq", "title": "Sparse Hierarchical Table Ensemble", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning for tabular data is drawing increasing attention, with recent work attempting to boost the accuracy of neuron-based networks. However, when computational capacity is low as in Internet of Things (IoT), drone, or Natural User Interface (NUI) applications, such deep learning methods are deserted. We offer to enable deep learning capabilities using ferns (oblivious decision trees) instead of neurons, by constructing a Sparse Hierarchical Table Ensemble (S-HTE). S-HTE inference is dense at the beginning of the training process and becomes gradually sparse using an annealing mechanism, leading to an efficient final predictor. Unlike previous work with ferns, S-HTE learns useful internal representations, and it earns from increasing depth. Using a standard classification and regression benchmark, we show its accuracy is comparable to alternatives while having an order of magnitude lower computational complexity. 
Our PyTorch implementation is available at https://anonymous.4open.science/r/HTE_CTE-60EB/", "keywords": "tabular data;DL alternative;architecture", "primary_area": "", "supplementary_material": "", "author": "Guy Farjon;Aharon Bar HIllel", "authorids": "~Guy_Farjon2;~Aharon_Bar_HIllel1", "gender": "M;M", "homepage": "https://scholar.google.com/citations?view_op=list_works&hl=iw&user=YkAIj7sAAAAJ;http://www.bgu.ac.il/~barhille/", "dblp": ";72/1301", "google_scholar": ";x4GlT3IAAAAJ", "orcid": ";", "linkedin": ";aharon-bar-hillel-4a76684/", "or_profile": "~Guy_Farjon2;~Aharon_Bar_HIllel1", "aff": "Ben Gurion University of the Negev;Ben Gurion University", "aff_domain": "bgu.ac.il;bgu.ac.il", "position": "PhD student;Senior Lecturer", "bibtex": "@misc{\nfarjon2022sparse,\ntitle={Sparse Hierarchical Table Ensemble},\nauthor={Guy Farjon and Aharon Bar HIllel},\nyear={2022},\nurl={https://openreview.net/forum?id=24N4XH2NaYq}\n}", "github": "", "project": "", "reviewers": "vA5d;dw2A;M9FK;2rMh", "site": "https://openreview.net/forum?id=24N4XH2NaYq", "pdf_size": 0, "recommendation": "1;5;5;5", "confidence": "5;4;3;2", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "112;122;57;55", "wc_summary_review": "103;17;25;47", "wc_main_review": "282;313;227;173", "wc_review": "497;452;309;275", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.7320508075688772 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.5, 30.712375355872428 ], "wc_summary_review_avg": [ 48.0, 33.60059523282288 ], "wc_main_review_avg": [ 248.75, 53.49006917176309 ], "wc_review_avg": [ 383.25, 93.40335914730262 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7745966692414834, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UwtlXA6U2owJ:scholar.google.com/&scioq=Sparse+Hierarchical+Table+Ensemble&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Ben Gurion University of the Negev;Ben-Gurion University of the Negev", "aff_unique_dep": ";", "aff_unique_url": "https://www.bgu.ac.il;https://www.bgu.ac.il", "aff_unique_abbr": "BGU;BGU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "25HMCfbzOC", "title": "Learning Complex Geometric Structures from Data with Deep Riemannian Manifolds", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present Deep Riemannian Manifolds, a new class of neural network parameterized Riemannian manifolds that can represent and learn complex geometric structures. To do this, we first construct a neural network which outputs symmetric positive definite matrices and show that the induced metric can universally approximate all geometries. We then develop differentiable solvers for core manifold operations like the Riemannian exponential and logarithmic map, allowing us to train the manifold parameters in an end-to-end machine learning system. 
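[Editor's note: the first ingredient described in the Deep Riemannian Manifolds abstract above, a network outputting symmetric positive definite matrices, has a standard construction: produce a square matrix L and return L L^T + eps*I. A minimal PyTorch sketch; the architecture details are illustrative, not the authors' code.]

```python
# Sketch of a neural network that parameterizes an SPD-valued metric field.
import torch
import torch.nn as nn

class SPDMetric(nn.Module):
    def __init__(self, dim, hidden=64, eps=1e-4):
        super().__init__()
        self.dim, self.eps = dim, eps
        self.net = nn.Sequential(nn.Linear(dim, hidden), nn.Tanh(),
                                 nn.Linear(hidden, dim * dim))

    def forward(self, x):
        L = self.net(x).view(-1, self.dim, self.dim)
        # L L^T is positive semi-definite; adding eps*I makes it SPD.
        G = L @ L.transpose(-1, -2)
        return G + self.eps * torch.eye(self.dim, device=x.device)

g = SPDMetric(dim=3)
G = g(torch.randn(2, 3))  # batch of 2 SPD 3x3 metric tensors
```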
We apply our method to learn 1) low-distortion manifold graph embeddings and 2) the underlying manifold of geodesic data. In addition to improving upon the baselines, our ability to directly optimize the Riemannian manifold brings to light new perspectives with which to view these tasks.", "keywords": "manifold;geometry;graph embedding;geodesic;differential equations;BVP;differentiable programming", "primary_area": "", "supplementary_material": "", "author": "Aaron Lou;Maximilian Nickel;Mustafa Mukadam;Brandon Amos", "authorids": "~Aaron_Lou1;~Maximilian_Nickel1;~Mustafa_Mukadam1;~Brandon_Amos1", "gender": "M;M;M;", "homepage": "https://aaronlou.com;https://mnick.github.io/;http://www.mustafamukadam.com;http://bamos.github.io", "dblp": "232/3858;83/10622;;133/4801.html", "google_scholar": ";KDqGTIUAAAAJ;yYpm9LoAAAAJ;d8gdZR4AAAAJ", "orcid": ";0000-0001-5006-0827;;", "linkedin": ";;mhmukadam/;bdamos", "or_profile": "~Aaron_Lou1;~Maximilian_Nickel1;~Mustafa_Mukadam1;~Brandon_Amos1", "aff": "Stanford University;Meta Facebook;Meta AI;Meta", "aff_domain": "stanford.edu;fb.com;meta.com;meta.com", "position": "PhD student;Research Scientist;Researcher;Research Scientist", "bibtex": "@misc{\nlou2022learning,\ntitle={Learning Complex Geometric Structures from Data with Deep Riemannian Manifolds},\nauthor={Aaron Lou and Maximilian Nickel and Mustafa Mukadam and Brandon Amos},\nyear={2022},\nurl={https://openreview.net/forum?id=25HMCfbzOC}\n}", "github": "", "project": "", "reviewers": "q3XR;UrLH;woWg;HN7w", "site": "https://openreview.net/forum?id=25HMCfbzOC", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "2;3;4;3", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "86;185;39;30", "wc_summary_review": "32;64;84;49", "wc_main_review": "143;796;776;462", "wc_review": "261;1045;899;541", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "87;188;699;502", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 85.0, 61.526417090547376 ], "wc_summary_review_avg": [ 57.25, 19.149086140074676 ], "wc_main_review_avg": [ 544.25, 266.8579912612699 ], "wc_review_avg": [ 686.5, 306.55301335984285 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 369.0, 244.37368925479683 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.39605901719066966, "corr_recommendation_correctness": 0.8866206949335731, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10290896462808615471&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Stanford University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.stanford.edu;https://meta.com", "aff_unique_abbr": "Stanford;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Direct then Diffuse: Incremental Unsupervised Skill Discovery for State Covering and Goal Reaching", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6977", "id": "25kzAhUB1lz", "poster": "", 
"openreview": "https://openreview.net/forum?id=25kzAhUB1lz", "slides": "https://iclr.cc/virtual/2022/poster/6977", "video": "https://iclr.cc/virtual/2022/poster/6977", "author_site": "Pierre-Alexandre Kamienny, Jean Tarbouriech, sylvain lamprier, Alessandro Lazaric, Ludovic Denoyer", "tldr": "", "abstract": "Learning meaningful behaviors in the absence of reward is a difficult problem in reinforcement learning. A desirable and challenging unsupervised objective is to learn a set of diverse skills that provide a thorough coverage of the state space while being directed, i.e., reliably reaching distinct regions of the environment. In this paper, we build on the mutual information framework for skill discovery and introduce UPSIDE, which addresses the coverage-directedness trade-off in the following ways: 1) We design policies with a decoupled structure of a directed skill, trained to reach a specific region, followed by a diffusing part that induces a local coverage. 2) We optimize policies by maximizing their number under the constraint that each of them reaches distinct regions of the environment (i.e., they are sufficiently discriminable) and prove that this serves as a lower bound to the original mutual information objective. 3) Finally, we compose the learned directed skills into a growing tree that adaptively covers the environment. We illustrate in several navigation and control environments how the skills learned by UPSIDE solve sparse-reward downstream tasks better than existing baselines.", "keywords": "unsupervised reinforcement learning;skill discovery;mutual information", "primary_area": "", "supplementary_material": "", "author": "Pierre-Alexandre Kamienny;Jean Tarbouriech;sylvain lamprier;Alessandro Lazaric;Ludovic Denoyer", "authorids": "~Pierre-Alexandre_Kamienny1;~Jean_Tarbouriech1;~sylvain_lamprier1;~Alessandro_Lazaric2;~Ludovic_Denoyer1", "gender": ";;M;M;M", "homepage": ";;https://scholar.google.com/citations?user=NuGN8SUAAAAJ&hl=fr&oi=ao;;", "dblp": ";;28/4095.html;36/321;54/5551", "google_scholar": ";;NuGN8SUAAAAJ;6JZ3R6wAAAAJ;9PLqulwAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Pierre-Alexandre_Kamienny1;~Jean_Tarbouriech1;~sylvain_lamprier1;~Alessandro_Lazaric2;~Ludovic_Denoyer1", "aff": ";;Universit\u00e9 d'Angers;Meta Facebook;Meta Facebook", "aff_domain": ";;univ-angers.fr;fb.com;fb.com", "position": ";;Full Professor;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nkamienny2022direct,\ntitle={Direct then Diffuse: Incremental Unsupervised Skill Discovery for State Covering and Goal Reaching},\nauthor={Pierre-Alexandre Kamienny and Jean Tarbouriech and Alessandro Lazaric and Ludovic Denoyer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=25kzAhUB1lz}\n}", "github": "", "project": "", "reviewers": "wdBX;zwRd;H3Jm;d1m1", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "143;66;126;97", "wc_summary_review": "69;65;36;26", "wc_main_review": "315;416;426;346", "wc_review": "527;547;588;469", "wc_reply_reviewers": "473;59;18;0", "wc_reply_authors": "1502;554;611;282", "reply_reviewers": "2;1;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], 
"wc_summary_paper_avg": [ 108.0, 29.300170647967224 ], "wc_summary_review_avg": [ 49.0, 18.398369492974098 ], "wc_main_review_avg": [ 375.75, 46.69247798093393 ], "wc_review_avg": [ 532.75, 42.87408891160254 ], "wc_reply_reviewers_avg": [ 137.5, 194.8775256411062 ], "wc_reply_authors_avg": [ 737.25, 458.6978171955912 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3819638393383796982&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=25kzAhUB1lz", "email": ";;univ-angers.fr;fb.com;fb.com", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Universit\u00e9 d'Angers;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.univ-angers.fr;https://meta.com", "aff_unique_abbr": "UA;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "France;United States" }, { "title": "Adversarial Support Alignment", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6100", "id": "26gKg6x-ie", "poster": "", "openreview": "https://openreview.net/forum?id=26gKg6x-ie", "slides": "https://iclr.cc/virtual/2022/poster/6100", "video": "https://iclr.cc/virtual/2022/poster/6100", "author_site": "Shangyuan Tong, Timur Garipov, Yang Zhang, Shiyu Chang, Tommi Jaakkola", "tldr": "", "abstract": "We study the problem of aligning the supports of distributions. Compared to the existing work on distribution alignment, support alignment does not require the densities to be matched. We propose symmetric support difference as a divergence measure to quantify the mismatch between supports. We show that select discriminators (e.g. discriminator trained for Jensen-Shannon divergence) are able to map support differences as support differences in their one-dimensional output space. Following this result, our method aligns supports by minimizing a symmetrized relaxed optimal transport cost in the discriminator 1D space via an adversarial process. Furthermore, we show that our approach can be viewed as a limit of existing notions of alignment by increasing transportation assignment tolerance. We quantitatively evaluate the method across domain adaptation tasks with shifts in label distributions. Our experiments show that the proposed method is more robust against these shifts than other alignment-based baselines.", "keywords": "support alignment;distribution alignment;optimal transport;domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Shangyuan Tong;Timur Garipov;Yang Zhang;Shiyu Chang;Tommi S. 
Jaakkola", "authorids": "~Shangyuan_Tong1;~Timur_Garipov1;~Yang_Zhang3;~Shiyu_Chang2;~Tommi_S._Jaakkola1", "gender": "M;M;M;Unspecified;", "homepage": ";https://timgaripov.github.io/;;http://people.csail.mit.edu/chang87/;", "dblp": "241/1025;190/7045;06/6785-1;28/9988;", "google_scholar": "VDs38-UAAAAJ;gWQzBQMAAAAJ;_-5PSgQAAAAJ;r21asW4AAAAJ;", "orcid": "0000-0002-1455-6259;;;;", "linkedin": ";timur-garipov-5a133a24b/;;;", "or_profile": "~Shangyuan_Tong1;~Timur_Garipov1;~Yang_Zhang3;~Shiyu_Chang2;~Tommi_S._Jaakkola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;International Business Machines;University of California, Santa Barbara;", "aff_domain": "mit.edu;mit.edu;ibm.com;ucsb.edu;", "position": "PhD student;PhD student;Research Staff Employee;Assistant Professor;", "bibtex": "@inproceedings{\ntong2022adversarial,\ntitle={Adversarial Support Alignment},\nauthor={Shangyuan Tong and Timur Garipov and Yang Zhang and Shiyu Chang and Tommi S. Jaakkola},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=26gKg6x-ie}\n}", "github": "", "project": "", "reviewers": "nH3D;FsUp;BDME;ukYK", "pdf_size": 0, "recommendation": "3;8;8;8", "confidence": "3;4;4;3", "correctness": "3;4;3;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "44;144;63;50", "wc_summary_review": "55;69;58;15", "wc_main_review": "207;576;717;149", "wc_review": "306;789;838;214", "wc_reply_reviewers": "169;76;250;0", "wc_reply_authors": "1591;3330;2118;141", "reply_reviewers": "1;1;1;0", "reply_authors": "5;7;4;1", "recommendation_avg": [ 6.75, 2.165063509461097 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.25, 40.28259549731124 ], "wc_summary_review_avg": [ 49.25, 20.44963324854507 ], "wc_main_review_avg": [ 412.25, 240.3719773600908 ], "wc_review_avg": [ 536.75, 279.19292165096164 ], "wc_reply_reviewers_avg": [ 123.75, 94.31430167265196 ], "wc_reply_authors_avg": [ 1795.0, 1144.3214146383873 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 4.25, 2.165063509461097 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18158530648839344635&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=26gKg6x-ie", "email": "mit.edu;mit.edu;ibm.com;ucsb.edu;", "author_num": 5, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;International Business Machines Corporation;University of California, Santa Barbara", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com;https://www.ucsb.edu", "aff_unique_abbr": "MIT;IBM;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "27aftiBeius", "title": "$$Research on fusion algorithm of multi-attribute decision making and reinforcement learning based on intuitionistic fuzzy number in wargame environment$$", "track": "main", "status": "Reject", "tldr": "", "abstract": "Intelligent games have seen an increasing interest within the research community on artificial intelligence. 
The article proposes an algorithm that combines multi-attribute decision making and reinforcement learning methods and applies their joint effect to wargaming; it solves the problem of the agent\u2019s low win rate against specific rule-based opponents and its inability to converge quickly during intelligent wargame training. At the same time, this paper studies a multi-attribute decision making and reinforcement learning algorithm in a wargame simulation environment, yielding data on the conflict between the red and blue sides. We calculate the weight of each attribute using intuitionistic fuzzy number weight calculations and then determine the threat posed by each of the opponent\u2019s game agents. Using the red side\u2019s reinforcement learning reward function, the AC framework is trained on this reward function, and an algorithm combining multi-attribute decision making with reinforcement learning is obtained. A simulation experiment confirms that the algorithm combining multi-attribute decision making with reinforcement learning presented in this paper is significantly more intelligent than a pure reinforcement learning algorithm. By addressing the shortcomings of the agent\u2019s neural network and the sparse rewards in large-map combat games, this robust algorithm effectively reduces the difficulty of convergence. It is also the first time in this field that an algorithm design for intelligent wargaming combines multi-attribute decision making with reinforcement learning. Finally, another novelty of this research is its interdisciplinary nature, spanning the design of intelligent wargames and the improvement of reinforcement learning algorithms.", "keywords": "Wargame;Reinforcement learning;Multiple attribute decision making;Intelligent game", "primary_area": "", "supplementary_material": "", "author": "Yuxiang Sun;Bo Yuan;Yufan Xue;Jiawei Zhou;Leonardo Stella;Xianzhong Zhou", "authorids": "~Yuxiang_Sun2;~Bo_Yuan6;yufanxue1@163.com;jiaweizhou163@163.com;l.stella@derby.ac.uk;~Xianzhong_Zhou1", "gender": "M;M;;;;M", "homepage": "https://sme.nju.edu.cn/zxz/list.htm;https://www.derby.ac.uk/staff/bo-yuan/;;;;https://sme.nju.edu.cn/zxz/list.htm", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";0000-0001-8401-321X;;;;", "linkedin": ";;;;;", "or_profile": "~Yuxiang_Sun2;~Bo_Yuan6;yufanxue1@163.com;jiaweizhou163@163.com;l.stella@derby.ac.uk;~Xianzhong_Zhou1", "aff": "Nanjing University;University of Derby;;;;Nanjing University", "aff_domain": "nju.edu;derby.ac.uk;;;;nju.edu.cn", "position": "PhD student;Lecturer;;;;Full Professor", "bibtex": "@misc{\nsun2022research,\ntitle={\\$\\$Research on fusion algorithm of multi-attribute decision making and reinforcement learning based on intuitionistic fuzzy number in wargame environment\\$\\$},\nauthor={Yuxiang Sun and Bo Yuan and Yufan Xue and Jiawei Zhou and Leonardo Stella and Xianzhong Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=27aftiBeius}\n}", "github": "", "project": "", "reviewers": "4Tzn;VXJT;7o41;Hi7P", "site": "https://openreview.net/forum?id=27aftiBeius", "pdf_size": 0, "recommendation": "1;1;3;5", "confidence": "4;4;4;2", "correctness": "1;1;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;1;0", "wc_summary_paper": "108;39;28;45", "wc_summary_review": "174;66;60;19", "wc_main_review": "1909;676;297;85", "wc_review": "2191;781;385;149", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 1.6583123951777 ], "confidence_avg": [ 3.5,
0.8660254037844386 ], "correctness_avg": [ 1.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 55.0, 31.20096152364539 ], "wc_summary_review_avg": [ 79.75, 57.34271967739235 ], "wc_main_review_avg": [ 741.75, 706.3849428604775 ], "wc_review_avg": [ 876.5, 791.8110570079203 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MkAcfBzsZL8J:scholar.google.com/&scioq=%24%24Research+on+fusion+algorithm+of+multi-attribute+decision+making+and+reinforcement+learning+based+on+intuitionistic+fuzzy+number+in+wargame+environment%24%24&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Nanjing University;University of Derby", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.derby.ac.uk", "aff_unique_abbr": "Nanjing U;UD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Patch-Fool: Are Vision Transformers Always Robust Against Adversarial Perturbations?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6624", "id": "28ib9tf6zhr", "poster": "", "openreview": "https://openreview.net/forum?id=28ib9tf6zhr", "slides": "https://iclr.cc/virtual/2022/poster/6624", "video": "https://iclr.cc/virtual/2022/poster/6624", "author_site": "Yonggan Fu, Shunyao Zhang, Shang Wu, Cheng Wan, Yingyan Lin", "tldr": "", "abstract": "Vision transformers (ViTs) have recently set off a new wave in neural architecture design thanks to their record-breaking performance in various vision tasks. In parallel, to fulfill the goal of deploying ViTs into real-world vision applications, their robustness against potential malicious attacks has gained increasing attention. In particular, recent works show that ViTs are more robust against adversarial attacks as compared with convolutional neural networks (CNNs), and conjecture that this is because ViTs focus more on capturing global interactions among different input/feature patches, leading to their improved robustness to local perturbations imposed by adversarial attacks. In this work, we ask an intriguing question: \"Under what kinds of perturbations do ViTs become more vulnerable learners compared to CNNs?\" Driven by this question, we first conduct a comprehensive experiment regarding the robustness of both ViTs and CNNs under various existing adversarial attacks to understand the underlying reason favoring their robustness. Based on the drawn insights, we then propose a dedicated attack framework, dubbed Patch-Fool, that fools the self-attention mechanism by attacking its basic component (i.e., a single patch) with a series of attention-aware optimization techniques. Interestingly, our Patch-Fool framework shows for the first time that ViTs are not necessarily more robust than CNNs against adversarial perturbations. 
In particular, we find that ViTs are more vulnerable learners than CNNs against our Patch-Fool attack, a finding that is consistent across extensive experiments, and the observations from Sparse/Mild Patch-Fool, two variants of Patch-Fool, indicate the intriguing insight that the perturbation density and strength on each patch seem to be the key factors that influence the robustness ranking between ViTs and CNNs. It can be expected that our Patch-Fool framework will shed light on both future architecture designs and training schemes for robustifying ViTs towards their real-world deployment. Our code is available at https://github.com/RICE-EIC/Patch-Fool.", "keywords": "Vision transformer;adversarial examples;robustness", "primary_area": "", "supplementary_material": "", "author": "Yonggan Fu;Shunyao Zhang;Shang Wu;Cheng Wan;Yingyan Lin", "authorids": "~Yonggan_Fu1;sz74@rice.edu;sw99@rice.edu;~Cheng_Wan2;~Yingyan_Lin1", "gender": "M;;;M;F", "homepage": "https://www.yongganfu.com/;;;http://cc.gatech.edu/~cwan39;https://eiclab.scs.gatech.edu/", "dblp": "244/8166;;;;120/6981", "google_scholar": "https://scholar.google.com/citations?hl=en;;;JZCbRO0AAAAJ;dio8IesAAAAJ", "orcid": ";;;0000-0002-2295-3481;", "linkedin": "yonggan-fu-b211831b0;;;cheng-wan/;yingyan-celine-lin-a281211a/", "or_profile": "~Yonggan_Fu1;sz74@rice.edu;sw99@rice.edu;~Cheng_Wan2;~Yingyan_Lin1", "aff": "Rice University;;;Rice University;Rice University", "aff_domain": "rice.edu;;;rice.edu;rice.edu", "position": "PhD student;;;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nfu2022patchfool,\ntitle={Patch-Fool: Are Vision Transformers Always Robust Against Adversarial Perturbations?},\nauthor={Yonggan Fu and Shunyao Zhang and Shang Wu and Cheng Wan and Yingyan Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=28ib9tf6zhr}\n}", "github": "", "project": "", "reviewers": "L4v1;gMrG;r33g;hTTd", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "70;69;95;53", "wc_summary_review": "36;35;28;19", "wc_main_review": "399;157;139;191", "wc_review": "505;261;262;263", "wc_reply_reviewers": "55;43;41;50", "wc_reply_authors": "961;626;496;999", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 71.75, 15.022899187573616 ], "wc_summary_review_avg": [ 29.5, 6.800735254367722 ], "wc_main_review_avg": [ 221.5, 104.166933332992 ], "wc_review_avg": [ 322.75, 105.22446246001925 ], "wc_reply_reviewers_avg": [ 47.25, 5.584576975922169 ], "wc_reply_authors_avg": [ 770.5, 214.90288504345398 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1831846608432102028&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=28ib9tf6zhr", "email": "rice.edu;;;rice.edu;rice.edu", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Rice University",
"aff_unique_dep": "", "aff_unique_url": "https://www.rice.edu", "aff_unique_abbr": "Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "2DJn3E7lXu", "title": "What to expect of hardware metric predictors in NAS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern Neural Architecture Search (NAS) focuses on finding the best performing architectures in hardware-aware settings; e.g., those with an optimal tradeoff of accuracy and latency. Due to many advantages of prediction models over live measurements, the search process is often guided by estimates of how well each considered network architecture performs on the desired metrics. Typical predic-\ntion models range from operation-wise lookup tables over gradient-boosted trees and neural networks, with little known information on how they compare. We evaluate 18 different performance predictors on ten combinations of metrics, devices, network types, and training tasks, and find that MLP models are the most promising. We then simulate and evaluate how the guidance of such prediction models affects the subsequent architecture selection. Due to inaccurate predictions, the selected architectures are generally suboptimal, which we quantify as\nan expected reduction in accuracy and hypervolume. We show that simply verifying the predictions of just the selected architectures can lead to substantially improved results. Under a time budget, we find it preferable to use a fast and inaccurate prediction model over accurate but slow live measurements.", "keywords": "Neural Architecture Search;Hardware-Aware;Predictors;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/0c1e9e5a2dba10579193dd4e7d4d420c0ead75f9.zip", "author": "Kevin Alexander Laube;Maximus Mutschler;Andreas Zell", "authorids": "~Kevin_Alexander_Laube1;~Maximus_Mutschler1;~Andreas_Zell1", "gender": "M;;M", "homepage": ";https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/kognitive-systeme/the-chair/staff/maximus-mutschler/;https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/kognitive-systeme/", "dblp": "232/1731;;05/4192", "google_scholar": ";;", "orcid": ";;", "linkedin": "laubeke/;;", "or_profile": "~Kevin_Alexander_Laube1;~Maximus_Mutschler1;~Andreas_Zell1", "aff": "Bosch;University of Tuebingen;Eberhard-Karls-Universit\u00e4t T\u00fcbingen", "aff_domain": "bosch.com;uni-tuebingen.de;uni-tuebingen.de", "position": "Research Engineer;PhD student;Full Professor", "bibtex": "@misc{\nlaube2022what,\ntitle={What to expect of hardware metric predictors in {NAS}},\nauthor={Kevin Alexander Laube and Maximus Mutschler and Andreas Zell},\nyear={2022},\nurl={https://openreview.net/forum?id=2DJn3E7lXu}\n}", "github": "", "project": "", "reviewers": "bBS4;GUHu;bHy6;7VPE", "site": "https://openreview.net/forum?id=2DJn3E7lXu", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "2;4;5;4", "correctness": "4;3;4;4", "technical_novelty": "1;1;2;1", "empirical_novelty": "2;4;3;2", "wc_summary_paper": "143;116;44;64", "wc_summary_review": "253;47;35;70", "wc_main_review": "76;581;284;277", "wc_review": "472;744;363;411", "wc_reply_reviewers": "0;152;0;41", "wc_reply_authors": "881;1698;734;1220", "reply_reviewers": "0;1;0;1", "reply_authors": "2;3;1;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 
1.0897247358851685 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.75, 39.57508686029634 ], "wc_summary_review_avg": [ 101.25, 88.51094565080638 ], "wc_main_review_avg": [ 304.5, 180.1672833785313 ], "wc_review_avg": [ 497.5, 147.46609779878222 ], "wc_reply_reviewers_avg": [ 48.25, 62.19475460197588 ], "wc_reply_authors_avg": [ 1133.25, 370.64091449811633 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10153443581081341401&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Robert Bosch GmbH;University of Tuebingen;Eberhard Karls University of T\u00fcbingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.bosch.com;https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Bosch;Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "2DJwuD-elOt", "title": "Hybrid Cloud-Edge Networks for Efficient Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although deep neural networks (DNNs) achieve state-of-the-art accuracy on large-scale and fine-grained prediction tasks, they are high capacity models and often cannot be deployed on edge devices. As such, two distinct paradigms have emerged in parallel: 1) edge device inference for low-level tasks, 2) cloud-based inference for large-scale tasks. We propose a novel hybrid option, which marries these extremes and seeks to bring the latency and computational cost benefits of edge device inference to tasks currently deployed in the cloud. Our proposed method is an end-to-end approach and involves architecting and training two networks in tandem. The first network is a low-capacity network that can be deployed on an edge device, whereas the second is a high-capacity network deployed in the cloud. When the edge device encounters challenging inputs, these inputs are transmitted to and processed in the cloud. Empirically, on the ImageNet classification dataset, our proposed method leads to a substantial decrease in the number of floating point operations (FLOPs) used compared to a well-designed high-capacity network, while suffering no excess classification loss.
A novel aspect of our method is that, by allowing abstentions on a small fraction of examples ($<20\\%$), we can increase accuracy without increasing the edge device memory and FLOPs substantially (up to $7$\\% higher accuracy and $3$X fewer FLOPs on ImageNet with $80$\\% coverage), relative to MobileNetV3 architectures.\n", "keywords": "low-capacity model;large-scale prediction;efficient inference;hybrid networks;routing nets;coverage and latency;FLOPs", "primary_area": "", "supplementary_material": "", "author": "Anil Kag;Igor Fedorov;Aditya Gangrade;Paul Whatmough;Venkatesh Saligrama", "authorids": "~Anil_Kag1;igor.fedorov@arm.com;~Aditya_Gangrade1;~Paul_Whatmough1;~Venkatesh_Saligrama1", "gender": "M;;;M;", "homepage": "https://anilkagak2.github.io/;;;;https://venkatesh-saligrama.github.io/", "dblp": "213/9132;;;87/9432;67/4721", "google_scholar": "bZdVsMkAAAAJ;;;hu3x-LoAAAAJ;S4z3uzMAAAAJ", "orcid": ";;;;0000-0002-0675-2268", "linkedin": ";;;paul-whatmough-2062729/;venkatesh-saligrama-91175a16/", "or_profile": "~Anil_Kag1;igor.fedorov@arm.com;~Aditya_Gangrade1;~Paul_Whatmough1;~Venkatesh_Saligrama1", "aff": "Boston University;;;Arm Inc;Boston University", "aff_domain": "bu.edu;;;arm.com;bu.edu", "position": "PhD student;;;Senior Principal Research Engineer;Full Professor", "bibtex": "@misc{\nkag2022hybrid,\ntitle={Hybrid Cloud-Edge Networks for Efficient Inference},\nauthor={Anil Kag and Igor Fedorov and Aditya Gangrade and Paul Whatmough and Venkatesh Saligrama},\nyear={2022},\nurl={https://openreview.net/forum?id=2DJwuD-elOt}\n}", "github": "", "project": "", "reviewers": "EcwK;37D1;p5u2;Jnva", "site": "https://openreview.net/forum?id=2DJwuD-elOt", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "57;53;84;117", "wc_summary_review": "24;60;44;67", "wc_main_review": "340;327;607;331", "wc_review": "421;440;735;515", "wc_reply_reviewers": "106;90;0;56", "wc_reply_authors": "351;388;1097;366", "reply_reviewers": "1;1;0;2", "reply_authors": "1;1;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 77.75, 25.606395685453272 ], "wc_summary_review_avg": [ 48.75, 16.543503256565703 ], "wc_main_review_avg": [ 401.25, 118.88308332138766 ], "wc_review_avg": [ 527.75, 124.7104145610943 ], "wc_reply_reviewers_avg": [ 63.0, 40.607881008493905 ], "wc_reply_authors_avg": [ 550.5, 315.7962159367968 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vlX8qY5DvO4J:scholar.google.com/&scioq=Hybrid+Cloud-Edge+Networks+for+Efficient+Inference&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Boston University;Arm Limited", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://www.arm.com", "aff_unique_abbr": "BU;Arm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "2DT7DptUiXv", "title": "ConVAEr: Convolutional Variational AutoEncodeRs for 
incremental similarity learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Due to catastrophic forgetting, incremental similarity learning in neural networks remains an open challenge. Previous work has shown that keeping image exemplars during incremental similarity learning is effective for preserving base knowledge (past learnt features and embeddings). It is also generally accepted that the output layers learn more task-specific feature embeddings during the later training stages compared to the input layers\u2019 general features earlier on. Building on these insights, we start by freezing the input layers of a neural network. We then investigate the viability of generating \u201cembedding\u201d exemplars from a VAE that can protect base knowledge in the intermediate to output layers of the neural networks. These generated exemplars replace the necessity for retaining images from previously learned classes. We experimented with three metric learning loss functions on the CUB-200 and CARS-196 in an incremental similarity learning setup. We train different VAEs to generate exemplars from the intermediate convolution layers and linear output layers. We use these generated exemplars to rep-resent base knowledge. We compared our work to a previous technique that stores image exemplars. The comparison is done for base knowledge, new knowledge and average knowledge preservation as metrics. The results show that generating exemplars from the linear and convolutional layers retained the highest ratio of base knowledge. We note that using embeddings from the linear layers leads to better performance on new knowledge than convolutional embeddings. Overall our methods yield better average knowledge performance across all experiments. These results support the view that for incremental similarity learning to overcome catastrophic forgetting, emphasis can be placed on learning embedding exemplars for intermediate to output layers. Further, we note that most incremental similarity learning for new classes depends on the linear layers rather than the convolutions. 
Further investigation is required into the relationship between transfer learning and similarity learning, and into protecting the intermediate-layer embedding space against catastrophic forgetting.", "keywords": "catastrophic forgetting;incremental similarity learning", "primary_area": "", "supplementary_material": "", "author": "Jiahao Huo;Terence L van Zyl", "authorids": "~Jiahao_Huo1;tvanzyl@uj.ac.za", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "LdDS2c0AAAAJ;", "orcid": "0000-0001-6686-2576;", "linkedin": "jiahao-huo-a07265198/;", "or_profile": "~Jiahao_Huo1;tvanzyl@uj.ac.za", "aff": "University of the Witwatersrand;", "aff_domain": "wits.ac.za;", "position": "MS student;", "bibtex": "@misc{\nhuo2022convaer,\ntitle={Con{VAE}r: Convolutional Variational AutoEncodeRs for incremental similarity learning},\nauthor={Jiahao Huo and Terence L van Zyl},\nyear={2022},\nurl={https://openreview.net/forum?id=2DT7DptUiXv}\n}", "github": "", "project": "", "reviewers": "5yMK;EmCV;Ygcr", "site": "https://openreview.net/forum?id=2DT7DptUiXv", "pdf_size": 0, "recommendation": "1;1;5", "confidence": "4;3;3", "correctness": "2;2;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;1;0", "wc_summary_paper": "150;85;42", "wc_summary_review": "4;37;17", "wc_main_review": "226;243;143", "wc_review": "380;365;202", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 1.8856180831641267 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.4714045207910317 ], "wc_summary_paper_avg": [ 92.33333333333333, 44.39469437769438 ], "wc_summary_review_avg": [ 19.333333333333332, 13.572848714334887 ], "wc_main_review_avg": [ 204.0, 43.688289811649376 ], "wc_review_avg": [ 315.6666666666667, 80.60741625654276 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cNyxlJVcvfMJ:scholar.google.com/&scioq=ConVAEr:+Convolutional+Variational+AutoEncodeRs+for+incremental+similarity+learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of the Witwatersrand", "aff_unique_dep": "", "aff_unique_url": "https://www.wits.ac.za", "aff_unique_abbr": "Wits", "aff_country_unique_index": "0", "aff_country_unique": "South Africa" }, { "id": "2I1wy0y6xo", "title": "Stability analysis of SGD through the normalized loss function", "track": "main", "status": "Reject", "tldr": "", "abstract": "We prove new generalization bounds for stochastic gradient descent for both the convex and non-convex cases. Our analysis is based on the stability framework. We analyze stability with respect to the normalized version of the loss function used for training. This leads to investigating a form of angle-wise stability instead of Euclidean stability in weights. For neural networks, the measure of distance we consider is invariant to rescaling the weights of each layer.
Furthermore, we exploit the notion of on-average stability in order to obtain a data-dependent quantity in the bound. This data-dependent quantity is seen to be more favorable when training with larger learning rates in our numerical experiments. This might help to shed some light on why larger learning rates can lead to better generalization in some practical scenarios.", "keywords": "Generalization bounds;deep neural networks;stability", "primary_area": "", "supplementary_material": "", "author": "Alexandre Lemire Paquin;Brahim Chaib-draa;Philippe Gigu\u00e8re", "authorids": "~Alexandre_Lemire_Paquin1;~Brahim_Chaib-draa1;~Philippe_Gigu\u00e8re1", "gender": ";M;M", "homepage": ";http://www2.ift.ulaval.ca/~chaib/;https://norlab.ulaval.ca", "dblp": ";c/BChaibdraa.html;93/4851", "google_scholar": ";JycXWO0AAAAJ;https://scholar.google.ca/citations?user=tgZPkzkAAAAJ", "orcid": ";;0000-0002-7520-8290", "linkedin": ";;", "or_profile": "~Alexandre_Lemire_Paquin1;~Brahim_Chaib-draa1;~Philippe_Gigu\u00e8re1", "aff": "Laval university;Universite Laval, Laval university;Universit\u00e9 Laval", "aff_domain": "ulaval.ca;ift.ulaval.ca;ulaval.ca", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@misc{\npaquin2022stability,\ntitle={Stability analysis of {SGD} through the normalized loss function},\nauthor={Alexandre Lemire Paquin and Brahim Chaib-draa and Philippe Gigu{\\`e}re},\nyear={2022},\nurl={https://openreview.net/forum?id=2I1wy0y6xo}\n}", "github": "", "project": "", "reviewers": "hQwz;MY7B;WveE", "site": "https://openreview.net/forum?id=2I1wy0y6xo", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;4;3", "correctness": "2;4;4", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "69;150;39", "wc_summary_review": "10;28;19", "wc_main_review": "304;327;90", "wc_review": "383;505;148", "wc_reply_reviewers": "0;78;0", "wc_reply_authors": "499;765;191", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 86.0, 46.88283267892417 ], "wc_summary_review_avg": [ 19.0, 7.3484692283495345 ], "wc_main_review_avg": [ 240.33333333333334, 106.71561376959897 ], "wc_review_avg": [ 345.3333333333333, 148.1583237246186 ], "wc_reply_reviewers_avg": [ 26.0, 36.76955262170047 ], "wc_reply_authors_avg": [ 485.0, 234.54352829840917 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9176629354822472, "corr_recommendation_correctness": 0.8029550685469663, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JrLaPl9XgGsJ:scholar.google.com/&scioq=Stability+analysis+of+SGD+through+the+normalized+loss+function&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;1;2", "aff_unique_norm": "Laval University;Universite Laval;Universit\u00e9 Laval", "aff_unique_dep": ";;", "aff_unique_url": "https://www.laval.ca;https://www.ulaval.ca;https://www.ulaval.ca", "aff_unique_abbr": "Laval;UL;ULaval", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "2JFVnWuvrvV", 
"title": "A Closer Look at Distribution Shifts and Out-of-Distribution Generalization on Graphs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Distribution shifts, in which the training distribution differs from the testing distribution, can significantly degrade the performance of Graph Neural Networks (GNNs). Although some existing graph classification benchmarks consider distribution shifts, we are far from understanding the effects of distribution shifts on graphs, and more specifically how they differ from distribution shifts in tensor data like images. We ask: (1) how useful are existing domain generalization methods for tackling distribution shifts on graph data? (2) are GNNs capable of generalizing to test graphs from unseen distributions? As a first step to answering these questions, we curate GDS, a benchmark of 8 datasets reflecting a diverse range of distribution shifts across graphs. We observe that in most cases, we need both a suitable domain generalization algorithm and a strong GNN backbone model to optimize out-of-distribution test performance. However, even if we carefully pick such combinations of models and algorithms, the out-of-distribution performance is still much lower than the in-distribution performance. This large gap emphasizes the need for domain generalization algorithms specifically tailored for graphs and strong GNNs that generalize well to out-of-distribution graphs. To facilitate further research, we provide an open-source package that administers the GDS benchmark with modular combinations of popular domain generalization algorithms and GNN backbone models.", "keywords": "Graph Neural Networks;Distribution Shifts;Out-of-Distribution Generalization", "primary_area": "", "supplementary_material": "", "author": "Mucong Ding;Kezhi Kong;Jiuhai Chen;John Kirchenbauer;Micah Goldblum;David Wipf;Furong Huang;Tom Goldstein", "authorids": "~Mucong_Ding1;~Kezhi_Kong1;~Jiuhai_Chen1;~John_Kirchenbauer1;~Micah_Goldblum1;~David_Wipf1;~Furong_Huang1;~Tom_Goldstein1", "gender": "M;M;M;M;;M;F;M", "homepage": "http://www.cs.umd.edu/~mcding/;https://devnkong.github.io;https://www.linkedin.com/in/jiuhai-chen-6a486715a/;https://jwkirchenbauer.notion.site/;;http://www.davidwipf.com/;https://furong-huang.com;https://www.cs.umd.edu/~tomg/", "dblp": "232/1754.html;228/8866;;321/0678;241/7231;81/6421;72/8513;25/8184", "google_scholar": "_bVao2MAAAAJ;;;48GJrbsAAAAJ;pGDKzuUAAAAJ;YJx1WSgAAAAJ;13yyuCcAAAAJ;KmSuVtgAAAAJ", "orcid": "0000-0002-6173-8055;;;;;;;", "linkedin": "mucong-ding-489296104;;;johnkirchenbauer/;;;;", "or_profile": "~Mucong_Ding1;~Kezhi_Kong1;~Jiuhai_Chen1;~John_Kirchenbauer1;~Micah_Goldblum1;~David_Wipf1;~Furong_Huang1;~Tom_Goldstein1", "aff": "Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;New York University;Amazon AI Research Lab;University of Maryland;University of Maryland, College Park", "aff_domain": "cs.umd.edu;umd.edu;umd.edu;umd.edu;nyu.edu;amazon.com;cs.umd.edu;umd.edu", "position": "PhD student;PhD student;PhD student;PhD student;Postdoc;Principal Research Scientist;Assistant Professor;Associate Professor", "bibtex": "@misc{\nding2022a,\ntitle={A Closer Look at Distribution Shifts and Out-of-Distribution Generalization on Graphs},\nauthor={Mucong Ding and Kezhi Kong and Jiuhai Chen and John Kirchenbauer and Micah Goldblum and David Wipf and Furong Huang and Tom 
Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=2JFVnWuvrvV}\n}", "github": "", "project": "", "reviewers": "Wtbd;3nzi;DAsA", "site": "https://openreview.net/forum?id=2JFVnWuvrvV", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;4;4", "correctness": "3;2;3", "technical_novelty": "1;2;1", "empirical_novelty": "2;3;3", "wc_summary_paper": "21;71;45", "wc_summary_review": "20;27;36", "wc_main_review": "237;538;269", "wc_review": "278;636;350", "wc_reply_reviewers": "38;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "1;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 45.666666666666664, 20.417857108151406 ], "wc_summary_review_avg": [ 27.666666666666668, 6.548960901462833 ], "wc_main_review_avg": [ 348.0, 134.9839496631606 ], "wc_review_avg": [ 421.3333333333333, 154.61205932555484 ], "wc_reply_reviewers_avg": [ 12.666666666666666, 17.913371790059205 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4882391150064300131&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;1;1;2;3;1;1", "aff_unique_norm": "University of Maryland, College Park;University of Maryland;New York University;Amazon", "aff_unique_dep": "Department of Computer Science;;;Amazon AI Research Lab", "aff_unique_url": "https://www.umd.edu;https://www.umd.edu;https://www.nyu.edu;https://www.amazon.com", "aff_unique_abbr": "UMD;UMD;NYU;Amazon AI", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "2M0WXSP6Qi", "title": "Information-theoretic stochastic contrastive conditional GAN: InfoSCC-GAN", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conditional generation is a subclass of generative problems in which the output of generation is conditioned on class attribute information. In this paper, we present a new stochastic contrastive conditional generative adversarial network (InfoSCC-GAN) with explorable latent space. The InfoSCC-GAN architecture is based on an unsupervised contrastive encoder built on the InfoNCE paradigm, an attributes' classifier, and a stochastic EigenGAN generator. \nWe propose two approaches for selecting the class attributes: external attributes from the dataset annotations and internal attributes from the clustered latent space of the encoder. We propose a novel training method based on regularizing the generator every $n$-th iteration with external or internal attributes, using the pre-trained contrastive encoder and the pre-trained attributes\u2019 classifier. The proposed InfoSCC-GAN is derived from an information-theoretic formulation of mutual information maximization between the input data and latent space representation for the encoder and the latent space and generated data for the decoder. Thus, we demonstrate a link between the training objective functions and the above information-theoretic formulation.
The experimental results show that InfoSCC-GAN outperforms vanilla EigenGAN in image generation on several popular datasets, while providing an interpretable latent space. In addition, we investigate the impact of regularization techniques and each part of the system by performing an ablation study. Finally, we demonstrate that, thanks to the stochastic EigenGAN generator, the proposed framework enjoys truly stochastic generation, in contrast to vanilla deterministic GANs, yet with independent training of the encoder, classifier, and generator. \nThe code, supplementary materials, and demos are available at \\url{https://anonymous.4open.science/r/InfoSCC-GAN-D113}", "keywords": "GANs;Generative adversarial networks;Contrastive learning;Conditional image generation", "primary_area": "", "supplementary_material": "", "author": "Vitaliy Kinakh;Mariia Drozdova;Guillaume Qu\u00e9tant;Svyatoslav Voloshynovskyy;Tobias GOLLING", "authorids": "~Vitaliy_Kinakh1;mariia.drozdova@unige.ch;guillaume.quetant@unige.ch;svyatoslav.voloshynovskyy@unige.ch;tobias.golling@unige.ch", "gender": "M;;;;", "homepage": "https://vkinakh.github.io/;;;;", "dblp": "300/9004;;;;", "google_scholar": "tGCmfh0AAAAJ;;;;", "orcid": "0000-0001-5301-9141;;;;", "linkedin": "vitaliy-kinakh-606022a4/;;;;", "or_profile": "~Vitaliy_Kinakh1;mariia.drozdova@unige.ch;guillaume.quetant@unige.ch;svyatoslav.voloshynovskyy@unige.ch;tobias.golling@unige.ch", "aff": "University of Geneva;;;;", "aff_domain": "unige.ch;;;;", "position": "PhD student;;;;", "bibtex": "@misc{\nkinakh2022informationtheoretic,\ntitle={Information-theoretic stochastic contrastive conditional {GAN}: Info{SCC}-{GAN}},\nauthor={Vitaliy Kinakh and Mariia Drozdova and Guillaume Qu{\\'e}tant and Svyatoslav Voloshynovskyy and Tobias GOLLING},\nyear={2022},\nurl={https://openreview.net/forum?id=2M0WXSP6Qi}\n}", "github": "", "project": "", "reviewers": "wdWF;atoU;bPdp", "site": "https://openreview.net/forum?id=2M0WXSP6Qi", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "4;4;4", "correctness": "2;2;3", "technical_novelty": "1;1;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "90;60;82", "wc_summary_review": "9;48;28", "wc_main_review": "148;375;245", "wc_review": "247;483;355", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 77.33333333333333, 12.684198393626966 ], "wc_summary_review_avg": [ 28.333333333333332, 15.923427883328248 ], "wc_main_review_avg": [ 256.0, 92.99820786803726 ], "wc_review_avg": [ 361.6666666666667, 96.46185198765825 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16423896841725988747&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0", "aff_unique_norm": "University of Geneva", "aff_unique_dep": "", "aff_unique_url": "https://www.unige.ch", "aff_unique_abbr": "UNIGE", "aff_country_unique_index": "0", "aff_country_unique": "Switzerland" }, { "id":
"2NqIV8dzR7N", "title": "Automatic Termination for Hyperparameter Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian optimization (BO) is a widely popular approach for the hyperparameter optimization (HPO) of machine learning algorithms. At its core, BO iteratively evaluates promising configurations until a user-defined budget, such as wall-clock time or number of iterations, is exhausted. While the final performance after tuning heavily depends on the provided budget, it is hard to pre-specify an optimal value in advance. In this work, we propose an effective and intuitive termination criterion for BO that automatically stops the procedure if it is sufficiently close to the global optima. Across an extensive range of real-world HPO problems, we show that our termination criterion achieves better test performance compared to existing baselines from the literature, such as stopping when the probability of improvement drops below a fixed threshold. We also provide evidence that these baselines are, compared to our method, highly sensitive to the choices of their own hyperparameters. Additionally, we find that overfitting might occur in the context of HPO, which is arguably an overlooked problem in the literature, and show that our termination criterion mitigates this phenomenon on both small and large datasets.", "keywords": "Bayesian optimization;hyperparameter optimization;automatic termination", "primary_area": "", "supplementary_material": "/attachment/35c9593680decde0990a04531ea84838e55f23e2.zip", "author": "Anastasia Makarova;Huibin Shen;Valerio Perrone;Aaron Klein;Jean Baptiste Faddoul;Andreas Krause;Matthias Seeger;Cedric Archambeau", "authorids": "~Anastasia_Makarova1;~Huibin_Shen1;~Valerio_Perrone1;~Aaron_Klein1;~Jean_Baptiste_Faddoul2;~Andreas_Krause1;~Matthias_Seeger2;~Cedric_Archambeau1", "gender": "F;M;M;M;;M;M;M", "homepage": "https://avidereta.github.io/;;https://sites.google.com/view/valerioperrone/;https://aaronkl.github.io/;;https://las.inf.ethz.ch/krausea;https://mseeger.github.io/;http://www0.cs.ucl.ac.uk/staff/c.archambeau/", "dblp": "244/2207;118/6073;202/1297;178/3281;;87/1831-1.html;43/5832;59/1878", "google_scholar": "skAF5s8AAAAJ;https://scholar.google.de/citations?view_op=list_works;https://scholar.google.de/citations?user=gXqr8A4AAAAJ;usl__skAAAAJ;aQRuXncAAAAJ;https://scholar.google.ch/citations?user=eDHv58AAAAAJ;V-lc8A8AAAAJ;pPx5WWIAAAAJ", "orcid": ";;;;;0000-0001-7260-9673;;", "linkedin": "anastasia-makarova-957331a7/;;valerio-perrone-391731132/;;;krausea/;matthias-seeger-3010b765/?locale=de_DE;carchambeau/", "or_profile": "~Anastasia_Makarova1;~Huibin_Shen1;~Valerio_Perrone1;~Aaron_Klein1;~Jean_Baptiste_Faddoul2;~Andreas_Krause1;~Matthias_Seeger2;~Cedric_Archambeau1", "aff": "Swiss Federal Institute of Technology;Amazon;Amazon;Amazon Berlin;Amazon;ETH Zurich;Amazon Development Center Germany;Amazon Web Services", "aff_domain": "ethz.ch;amazon.com;amazon.com;amazon.com;amazon.com;ethz.ch;amazon.de;amazon.com", "position": "PhD student;Machine Learning Scientist;Senior Machine Learning Scientist;Scientist;Applied Science Manager;Full Professor;Principal Applied Scientist;Principal Researcher", "bibtex": "@misc{\nmakarova2022automatic,\ntitle={Automatic Termination for Hyperparameter Optimization},\nauthor={Anastasia Makarova and Huibin Shen and Valerio Perrone and Aaron Klein and Jean Baptiste Faddoul and Andreas Krause and Matthias Seeger and Cedric 
Archambeau},\nyear={2022},\nurl={https://openreview.net/forum?id=2NqIV8dzR7N}\n}", "github": "", "project": "", "reviewers": "QxQD;MRmf;yUir", "site": "https://openreview.net/forum?id=2NqIV8dzR7N", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;3;4", "correctness": "3;3;2", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "129;112;56", "wc_summary_review": "23;35;71", "wc_main_review": "269;205;546", "wc_review": "421;352;673", "wc_reply_reviewers": "0;0;256", "wc_reply_authors": "333;301;1117", "reply_reviewers": "0;0;2", "reply_authors": "3;1;4", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 99.0, 31.187604375242845 ], "wc_summary_review_avg": [ 43.0, 20.396078054371138 ], "wc_main_review_avg": [ 340.0, 147.9887383102737 ], "wc_review_avg": [ 482.0, 137.96376335835436 ], "wc_reply_reviewers_avg": [ 85.33333333333333, 120.67955732250411 ], "wc_reply_authors_avg": [ 583.6666666666666, 377.3498229612529 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15717074894357393858&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;1;1;1;2;1;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Amazon;ETH Zurich", "aff_unique_dep": ";Amazon.com, Inc.;", "aff_unique_url": "https://www.ethz.ch;https://www.amazon.com;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;Amazon;ETHZ", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berlin", "aff_country_unique_index": "0;1;1;2;1;0;2;1", "aff_country_unique": "Switzerland;United States;Germany" }, { "id": "2O_pIShVl-", "title": "Polygonal Unadjusted Langevin Algorithms: Creating stable and efficient adaptive algorithms for neural networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a new class of Langevin based algorithms, which overcomes many of the known shortcomings of popular adaptive optimizers that are currently used for the fine-tuning of deep learning models. Its underpinning theory relies on recent advances of Euler's polygonal approximations for stochastic differential equations (SDEs) with monotone coefficients. As a result, it inherits the stability properties of tamed algorithms, while it addresses other known issues, e.g. vanishing gradients in neural networks. In particular, we provide a nonasymptotic analysis and full theoretical guarantees for the convergence properties of an algorithm of this novel class, which we named TH$\\varepsilon$O POULA (or, simply, TheoPouLa). 
Finally, several experiments are presented with different types of deep learning models, which show the superior performance of TheoPouLa over many popular adaptive optimization algorithms.", "keywords": "nonconvex optimization;Langevin based algorithm", "primary_area": "", "supplementary_material": "/attachment/7cac50fcc041fb19f6161243ea2fedef249a1339.zip", "author": "Dong-Young Lim;Sotirios Sabanis", "authorids": "~Dong-Young_Lim1;~Sotirios_Sabanis1", "gender": "M;M", "homepage": "https://sites.google.com/view/dlim;https://maths.ed.ac.uk/people/academic-staff?person=22", "dblp": ";183/3256", "google_scholar": ";https://scholar.google.co.uk/citations?user=z_nY3CkAAAAJ", "orcid": ";0000-0002-3991-362X", "linkedin": "dong-young-lim-406a55128/;", "or_profile": "~Dong-Young_Lim1;~Sotirios_Sabanis1", "aff": "University of Edinburgh;National Technical University of Athens", "aff_domain": "ed.ac.uk;ntua.gr", "position": "Postdoc;Full Professor", "bibtex": "@misc{\nlim2022polygonal,\ntitle={Polygonal Unadjusted Langevin Algorithms: Creating stable and efficient adaptive algorithms for neural networks},\nauthor={Dong-Young Lim and Sotirios Sabanis},\nyear={2022},\nurl={https://openreview.net/forum?id=2O_pIShVl-}\n}", "github": "", "project": "", "reviewers": "2Zad;KLBe;5h9s", "site": "https://openreview.net/forum?id=2O_pIShVl-", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;0;3", "wc_summary_paper": "58;137;97", "wc_summary_review": "57;58;90", "wc_main_review": "309;275;750", "wc_review": "424;470;937", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 97.33333333333333, 32.25247621845836 ], "wc_summary_review_avg": [ 68.33333333333333, 15.326085243430198 ], "wc_main_review_avg": [ 444.6666666666667, 216.34899789203763 ], "wc_review_avg": [ 610.3333333333334, 231.75034460000748 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16440981027473902275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "University of Edinburgh;National Technical University of Athens", "aff_unique_dep": ";", "aff_unique_url": "https://www.ed.ac.uk;https://www.ntua.gr", "aff_unique_abbr": "Edinburgh;NTUA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Greece" }, { "id": "2PSrjVtj6gU", "title": "Graph Attention Multi-layer Perceptron", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, graph neural networks (GNNs) have achieved a stride of success in many graph-based applications. 
However, most GNNs suffer from a critical issue: the learned representation is constructed from a fixed k-hop neighborhood and is insensitive to the individual needs of each node, which greatly hampers the performance of GNNs. To satisfy the unique needs of each node, we propose a new architecture -- Graph Attention Multi-Layer Perceptron (GAMLP). This architecture combines multi-scale knowledge and learns to capture the underlying correlations between different scales of knowledge with two novel attention mechanisms: Recursive attention and Jumping Knowledge (JK) attention. Instead of using node features only, the knowledge within node labels is also exploited to reinforce the performance of GAMLP. Extensive experiments on 12 real-world datasets demonstrate that GAMLP achieves state-of-the-art performance while enjoying high scalability and efficiency.", "keywords": "Graph Neural Network;Attention;Scalability", "primary_area": "", "supplementary_material": "/attachment/77df8d4fd969371affa833c69a8da262ab25ba18.zip", "author": "Wentao Zhang;Ziqi Yin;Zeang Sheng;Yang Li;Wen Ouyang;Xiaosen Li;Yangyu Tao;Zhi Yang;Bin CUI", "authorids": "~Wentao_Zhang1;~Ziqi_Yin1;~Zeang_Sheng1;~Yang_Li36;~Wen_Ouyang1;~Xiaosen_Li1;~Yangyu_Tao2;~Zhi_Yang4;~Bin_CUI2", "gender": ";;M;M;M;M;M;M;M", "homepage": ";;https://scholar.google.com/citations?user=cIaU0iIAAAAJ&hl=en;https://thomas-young-2013.github.io/;https://github.com/ouyangwen-it;https://github.com/xs-li;;https://yangzhihome.github.io/;https://cuibinpku.github.io/index.html", "dblp": ";;298/0674;37/4190-106;;266/6200;47/208.html;90/5587-1;55/5031.html", "google_scholar": ";;cIaU0iIAAAAJ;_4s8hFYAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;;IJAU8KoAAAAJ", "orcid": ";;0009-0002-4427-3038;;;;;;0000-0003-1681-4677", "linkedin": ";;;yang-thomas-li-b75554107/;;;yangyutao/;;", "or_profile": "~Wentao_Zhang1;~Ziqi_Yin1;~Zeang_Sheng1;~Yang_Li36;~Wen_Ouyang1;~Xiaosen_Li1;~Yangyu_Tao2;~Zhi_Yang4;~Bin_CUI2", "aff": ";;Peking University;Peking University;Tencent big data;;;Peking University;Peking University", "aff_domain": ";;pku.edu.cn;pku.edu.cn;tencent.com;;;pku.edu.cn;pku.edu.cn", "position": ";;Undergrad student;PhD student;Researcher;;;Associate Professor;Full Professor", "bibtex": "@misc{\nzhang2022graph,\ntitle={Graph Attention Multi-layer Perceptron},\nauthor={Wentao Zhang and Ziqi Yin and Zeang Sheng and Yang Li and Wen Ouyang and Xiaosen Li and Yangyu Tao and Zhi Yang and Bin CUI},\nyear={2022},\nurl={https://openreview.net/forum?id=2PSrjVtj6gU}\n}", "github": "", "project": "", "reviewers": "eUi3;XqaP;Cwzz;Z4vM", "site": "https://openreview.net/forum?id=2PSrjVtj6gU", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;3;4", "correctness": "4;4;3;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "51;37;24;79", "wc_summary_review": "6;47;23;144", "wc_main_review": "207;196;170;273", "wc_review": "264;280;217;496", "wc_reply_reviewers": "0;0;35;0", "wc_reply_authors": "704;1024;910;1609", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;2;3", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 47.75, 20.41292482717751 ], "wc_summary_review_avg": [ 55.0, 53.40880077290633 ], "wc_main_review_avg": [ 211.5, 37.963798545456434 ], "wc_review_avg": [ 314.25, 107.45784057015105 ], "wc_reply_reviewers_avg": [ 8.75,
15.155444566227676 ], "wc_reply_authors_avg": [ 1061.75, 336.125255671156 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14386424806051137152&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Peking University;Tencent", "aff_unique_dep": ";big data", "aff_unique_url": "http://www.pku.edu.cn;https://www.tencent.com", "aff_unique_abbr": "Peking U;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "2RNpZ8S4alJ", "title": "KINet: Keypoint Interaction Networks for Unsupervised Forward Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Object-centric representation is an essential abstraction for physical reasoning and forward prediction. Most existing approaches learn this representation through extensive supervision (e.g, object class and bounding box) although such ground-truth information is not readily accessible in reality. To address this, we introduce KINet (Keypoint Interaction Network)---an end-to-end unsupervised framework to reason about object interactions in complex systems based on a keypoint representation. Using visual observations, our model learns to associate objects with keypoint coordinates and discovers a graph representation of the system as a set of keypoint embeddings and their relations. It then learns an action-conditioned forward model using contrastive estimation to predict future keypoint states. By learning to perform physical reasoning in the keypoint space, our model automatically generalizes to scenarios with a different number of objects, and novel object geometries. 
Experiments demonstrate the effectiveness of our model in accurately performing forward prediction and learning plannable object-centric representations, which can also be used in downstream model-based control tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alireza Rezazadeh;Changhyun Choi", "authorids": "~Alireza_Rezazadeh1;cchoi@umn.edu", "gender": ";", "homepage": "https://www.alireza.page/;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Alireza_Rezazadeh1;cchoi@umn.edu", "aff": "University of Minnesota - Twin Cities;", "aff_domain": "umn.edu;", "position": "PhD student;", "bibtex": "@misc{\nrezazadeh2022kinet,\ntitle={{KIN}et: Keypoint Interaction Networks for Unsupervised Forward Modeling},\nauthor={Alireza Rezazadeh and Changhyun Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=2RNpZ8S4alJ}\n}", "github": "", "project": "", "reviewers": "58w4;YtC3;QJbD;nqXG", "site": "https://openreview.net/forum?id=2RNpZ8S4alJ", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "3;3;2;2", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "54;72;84;30", "wc_summary_review": "29;47;24;130", "wc_main_review": "292;290;424;1145", "wc_review": "375;409;532;1305", "wc_reply_reviewers": "57;0;86;0", "wc_reply_authors": "499;566;859;1687", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;2;3", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 60.0, 20.346989949375804 ], "wc_summary_review_avg": [ 57.5, 42.72294465506796 ], "wc_main_review_avg": [ 537.75, 354.77624990971424 ], "wc_review_avg": [ 655.25, 379.6527196004264 ], "wc_reply_reviewers_avg": [ 35.75, 37.19122880465231 ], "wc_reply_authors_avg": [ 902.75, 472.5930463940408 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3xAVqL-vSR8J:scholar.google.com/&scioq=KINet:+Keypoint+Interaction+Networks+for+Unsupervised+Forward+Modeling&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Minnesota", "aff_unique_dep": "", "aff_unique_url": "https://www.minnesota.edu", "aff_unique_abbr": "UMN", "aff_campus_unique_index": "0", "aff_campus_unique": "Twin Cities", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "2RYOwBOFesi", "title": "An Empirical Study of Pre-trained Models on Out-of-distribution Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generalizing to out-of-distribution (OOD) data -- that is, data from domains unseen during training -- is a key challenge in modern machine learning, which has only recently received much attention. Some existing approaches propose leveraging larger models and pre-training on larger datasets. In this paper, we provide new insights into applying these approaches. Concretely, we show that larger models and larger datasets need to be simultaneously leveraged to improve OOD performance.
Moreover, we show that using smaller learning rates during fine-tuning is critical to achieving good results, contrary to popular intuition that larger learning rates generalize better when training from scratch. We also show that strategies that improve in-distribution accuracy may, counter-intuitively, lead to poor OOD performance. Our insights culminate in a method that achieves state-of-the-art results on a number of OOD generalization benchmark tasks, often by a significant margin.", "keywords": "out-of-distribution generalization;domain generalization;pre-training", "primary_area": "", "supplementary_material": "", "author": "Yaodong Yu;Heinrich Jiang;Dara Bahri;Hossein Mobahi;Seungyeon Kim;Ankit Singh Rawat;Andreas Veit;Yi Ma", "authorids": "~Yaodong_Yu4;~Heinrich_Jiang1;~Dara_Bahri1;~Hossein_Mobahi2;~Seungyeon_Kim1;~Ankit_Singh_Rawat1;~Andreas_Veit1;~Yi_Ma4", "gender": "M;M;M;;M;;M;M", "homepage": "https://yaodongyu.github.io;;http://www.dara.run;https://www.seungyeon.ai;https://ankitsrawat.github.io/home/;http://andreasveit.eu/;http://people.eecs.berkeley.edu/~yima/;http://people.csail.mit.edu/hmobahi/", "dblp": ";182/2472;231/7656;74/7997-1.html;https://dblp.org/pers/hd/r/Rawat:Ankit_Singh;133/1801;;94/1490", "google_scholar": "bZ9oyW8AAAAJ;;j5PpTOwAAAAJ;zbcN_QIAAAAJ;http://scholar.google.com/citations?user=U0_ab4cAAAAJ;UA9Hb2EAAAAJ;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ;GSHmKZkAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Yaodong_Yu4;~Heinrich_Jiang1;~Dara_Bahri1;~Seungyeon_Kim1;~Ankit_Singh_Rawat1;~Andreas_Veit1;~Yi_Ma4;~Hossein_Mobahi1", "aff": "Electrical Engineering & Computer Science Department, University of California Berkeley;Google;Google Research;Google;Google;Google;University of California, Berkeley;Google", "aff_domain": "eecs.berkeley.edu;google.com;google.com;google.com;google.com;google.com;berkeley.edu;google.com", "position": "PhD student;Research scientist;Research Scientist;Researcher;Research Scientist;Senior Research Scientist;Full Professor;Research Scientist", "bibtex": "@misc{\nyu2022an,\ntitle={An Empirical Study of Pre-trained Models on Out-of-distribution Generalization},\nauthor={Yaodong Yu and Heinrich Jiang and Dara Bahri and Hossein Mobahi and Seungyeon Kim and Ankit Singh Rawat and Andreas Veit and Yi Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=2RYOwBOFesi}\n}", "github": "", "project": "", "reviewers": "jtZq;5Ldh;mKe9;1Vts", "site": "https://openreview.net/forum?id=2RYOwBOFesi", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;2;3", "correctness": "2;3;3;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "76;65;88;108", "wc_summary_review": "49;15;124;82", "wc_main_review": "416;292;426;423", "wc_review": "541;372;638;613", "wc_reply_reviewers": "0;105;0;91", "wc_reply_authors": "965;845;617;559", "reply_reviewers": "0;1;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.25, 15.943258763502524 ], "wc_summary_review_avg": [ 67.5, 40.31438949060249 ], "wc_main_review_avg": [ 389.25, 56.264442590325196 ], "wc_review_avg": [ 541.0, 103.86770431659689 ], "wc_reply_reviewers_avg": [ 49.0, 49.24936547814601 ], "wc_reply_authors_avg": [ 746.5, 165.35643319810694
], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.7894736842105263, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9630344427725157738&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1;1;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": "Electrical Engineering & Computer Science Department;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;1;1;1;1;1;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "2_dQlkDHnvN", "title": "Defending Backdoor Data Poisoning Attacks by Using Noisy Label Defense Algorithm", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training deep neural networks with data corruption is a challenging problem. One example of such corruption is the backdoor data poisoning attack, in which an adversary strategically injects a backdoor trigger to a small fraction of the training data to subtly compromise the training process. Consequently, the trained deep neural network would misclassify testing examples that have been corrupted by the same trigger. While the label of the data could be changed to arbitrary values by an adversary, the extent of corruption injected to the feature values are strictly limited in order to keep the backdoor attack in disguise, which leads to a resemblance between the backdoor attack and a milder attack that involves only noisy labels. In this paper, we investigate an intriguing question: Can we leverage algorithms that defend against noisy labels corruptions to defend against general backdoor attacks? We first discuss the limitations of directly using the noisy-label defense algorithms to defend against backdoor attacks. Next, we propose a meta-algorithm that transforms an existing noisy label defense algorithm to one that protects against backdoor attacks. 
Extensive experiments on different types of backdoor attacks show that, by introducing a lightweight alteration for minimax optimization to the existing noisy-label defense algorithms, the robustness against backdoor attacks can be substantially improved, while the intial form of those algorithms would fail in presence of a backdoor attacks.", "keywords": "Backdoor Attack;Data Poisoning;Noisy Label", "primary_area": "", "supplementary_material": "/attachment/938820f91d9ea5bf2b3014f754563a977b29d303.zip", "author": "Boyang Liu;Zhuangdi Zhu;Pang-Ning Tan;Jiayu Zhou", "authorids": "~Boyang_Liu1;~Zhuangdi_Zhu1;ptan@msu.edu;~Jiayu_Zhou1", "gender": "M;F;;M", "homepage": ";;;http://jiayuzhou.github.io/", "dblp": "165/8466;185/5271;;73/1353", "google_scholar": ";eG_hZ9MAAAAJ;;https://scholar.google.com.tw/citations?user=yQKlLTQAAAAJ", "orcid": ";;;0000-0003-4336-6777", "linkedin": ";zhuangdi-zhu-9a4b26103/;;jiayuzhou/", "or_profile": "~Boyang_Liu1;~Zhuangdi_Zhu1;ptan@msu.edu;~Jiayu_Zhou1", "aff": "Michigan State University;Michigan State University;;Michigan State University", "aff_domain": "msu.edu;msu.edu;;msu.edu", "position": "PhD student;PhD student;;Associate Professor", "bibtex": "@misc{\nliu2022defending,\ntitle={Defending Backdoor Data Poisoning Attacks by Using Noisy Label Defense Algorithm},\nauthor={Boyang Liu and Zhuangdi Zhu and Pang-Ning Tan and Jiayu Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=2_dQlkDHnvN}\n}", "github": "", "project": "", "reviewers": "tutw;62pH;2ape;MBRR", "site": "https://openreview.net/forum?id=2_dQlkDHnvN", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;4", "correctness": "3;2;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "54;70;18;58", "wc_summary_review": "96;53;10;33", "wc_main_review": "514;329;487;165", "wc_review": "664;452;515;256", "wc_reply_reviewers": "121;77;0;0", "wc_reply_authors": "652;260;778;345", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 50.0, 19.390719429665317 ], "wc_summary_review_avg": [ 48.0, 31.614869919074472 ], "wc_main_review_avg": [ 373.75, 139.70929639791333 ], "wc_review_avg": [ 471.75, 146.43151129452977 ], "wc_reply_reviewers_avg": [ 49.5, 51.88689622631132 ], "wc_reply_authors_avg": [ 508.75, 213.13537364782974 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=617075383417506046&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Michigan State University", "aff_unique_dep": "", "aff_unique_url": "https://www.msu.edu", "aff_unique_abbr": "MSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Escaping limit cycles: Global convergence for constrained nonconvex-nonconcave minimax problems", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6528", "id": "2_vhkAMARk", "poster": "", "openreview": "https://openreview.net/forum?id=2_vhkAMARk", "slides": 
"https://iclr.cc/virtual/2022/poster/6528", "video": "https://iclr.cc/virtual/2022/poster/6528", "author_site": "Thomas Pethick, Puya Latafat, Panos Patrinos, Olivier Fercoq, Volkan Cevher", "tldr": "", "abstract": "This paper introduces a new extragradient-type algorithm for a class of nonconvex-nonconcave minimax problems. It is well-known that finding a local solution for general minimax problems is computationally intractable. This observation has recently motivated the study of structures sufficient for convergence of first order methods in the more general setting of variational inequalities when the so-called weak Minty variational inequality (MVI) holds. This problem class captures non-trivial structures as we demonstrate with examples, for which a large family of existing algorithms provably converge to limit cycles. Our results require a less restrictive parameter range in the weak MVI compared to what is previously known, thus extending the applicability of our scheme. The proposed algorithm is applicable to constrained and regularized problems, and involves an adaptive stepsize allowing for potentially larger stepsizes. Our scheme also converges globally even in settings where the underlying operator exhibits limit cycles.", "keywords": "Minimax;Nonconvex-Nonconcave;Variational inequilities;Saddle point problem;First-order methods;Limit cycles", "primary_area": "", "supplementary_material": "/attachment/9565873500530ddbed5595f34938b390ceec5c9d.zip", "author": "Thomas Pethick;Puya Latafat;Panos Patrinos;Olivier Fercoq;Volkan Cevher", "authorids": "~Thomas_Pethick1;~Puya_Latafat1;panos.patrinos@kuleuven.be;~Olivier_Fercoq1;~Volkan_Cevher1", "gender": "M;M;;M;M", "homepage": "https://pethick.dk;https://github.com/pylat;;;http://lions.epfl.ch", "dblp": "305/4521;;;48/8772;70/5301", "google_scholar": ";TaTK05QAAAAJ;;;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";0000-0002-7969-8565;;;", "linkedin": ";;;;", "or_profile": "~Thomas_Pethick1;~Puya_Latafat1;panos.patrinos@kuleuven.be;~Olivier_Fercoq1;~Volkan_Cevher1", "aff": "Swiss Federal Institute of Technology Lausanne;KU Leuven;;Telecom Paris;Swiss Institute of Technology", "aff_domain": "epfl.ch;kuleuven.be;;telecom-paris.fr;epfl.ch", "position": "PhD student;Postdoc;;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\npethick2022escaping,\ntitle={Escaping limit cycles: Global convergence for constrained nonconvex-nonconcave minimax problems},\nauthor={Thomas Pethick and Puya Latafat and Panos Patrinos and Olivier Fercoq and Volkan Cevher},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2_vhkAMARk}\n}", "github": "", "project": "", "reviewers": "wk1L;crvQ;DVU2;4TM5", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "3;3;3;4", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "243;108;125;101", "wc_summary_review": "109;33;26;61", "wc_main_review": "569;709;206;491", "wc_review": "921;850;357;653", "wc_reply_reviewers": "0;114;9;38", "wc_reply_authors": "643;774;166;230", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 144.25, 57.67744359799592 ], "wc_summary_review_avg": [ 57.25, 
32.62188682464581 ], "wc_main_review_avg": [ 493.75, 183.57747002287618 ], "wc_review_avg": [ 695.25, 218.57993389147137 ], "wc_reply_reviewers_avg": [ 40.25, 44.83511458667191 ], "wc_reply_authors_avg": [ 453.25, 260.4029329711937 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10525786915599690905&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "pdf": "https://openreview.net/pdf?id=2_vhkAMARk", "email": "epfl.ch;kuleuven.be;;telecom-paris.fr;epfl.ch", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Katholieke Universiteit Leuven;Telecom Paris;Swiss Federal Institute of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.epfl.ch;https://www.kuleuven.be;https://www.telecom-paris.fr;https://www.ethz.ch", "aff_unique_abbr": "EPFL;KU Leuven;Telecom Paris;ETH Zurich", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "Switzerland;Belgium;France" }, { "id": "2aC0_RxkBL_", "title": "Where is the bottleneck in long-tailed classification?", "track": "main", "status": "Reject", "tldr": "", "abstract": "A commonly held belief in deep-learning based long-tailed classification is that the representations learned from long-tailed data are \u201cgood enough\u201d and the performance bottleneck is the classification head atop the representation learner. We design experiments to investigate this folk wisdom, and find that representations learned from long-tailed data distributions substantially differ from the representations learned from \u201cnormal\u201d data distributions. We show that the long-tailed representations are volatile and brittle with respect to the true data distribution. Compared to the representations learned from the true, balanced distributions, long-tailed representations fail to localize tail classes and display vastly worse inter-class separation and intra-class compactness when unseen samples from the true data distribution are embedded into the feature space.
We provide an explanation for why data augmentation helps long-tailed classification despite leaving the dataset imbalance unchanged \u2014 it promotes inter-class separation, intra-class compactness, and improves localization of tail classes w.r.t. the true data distribution.", "keywords": "fairness;bias;long tailed learning;imbalanced learning", "primary_area": "", "supplementary_material": "", "author": "Zaid Khan;Yun Fu", "authorids": "~Zaid_Khan1;~Yun_Fu1", "gender": "Not Specified;M", "homepage": "https://zaidkhan.me;http://www1.ece.neu.edu/~yunfu/", "dblp": "259/1127-1;00/5815-1", "google_scholar": "uXXocfgAAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": ";0000-0002-5098-2853", "linkedin": "https://linkedin.com/in/khan-zaid;furaymond/", "or_profile": "~Zaid_Khan1;~Yun_Fu1", "aff": ";Northeastern University", "aff_domain": ";northeastern.edu", "position": ";Full Professor", "bibtex": "@misc{\nkhan2022where,\ntitle={Where is the bottleneck in long-tailed classification?},\nauthor={Zaid Khan and Yun Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=2aC0_RxkBL_}\n}", "github": "", "project": "", "reviewers": "B7am;jXk1;gifg;eREq", "site": "https://openreview.net/forum?id=2aC0_RxkBL_", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;4;5;5", "correctness": "2;2;3;4", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "55;58;75;45", "wc_summary_review": "69;29;16;37", "wc_main_review": "514;223;361;345", "wc_review": "638;310;452;427", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "436;624;513;406", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 58.25, 10.80219885023415 ], "wc_summary_review_avg": [ 37.75, 19.536824204563032 ], "wc_main_review_avg": [ 360.75, 103.33047711106342 ], "wc_review_avg": [ 456.75, 117.57417871284494 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 494.75, 84.2121576733431 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.49374193110101877, "corr_recommendation_correctness": 0.9945577827230725, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bo2J7kA3LlYJ:scholar.google.com/&scioq=Where+is+the+bottleneck+in+long-tailed+classification%3F&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Should We Be Pre-training?
An Argument for End-task Aware Training as an Alternative", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6249", "id": "2bO2x8NAIMB", "poster": "", "openreview": "https://openreview.net/forum?id=2bO2x8NAIMB", "slides": "https://iclr.cc/virtual/2022/poster/6249", "video": "https://iclr.cc/virtual/2022/poster/6249", "author_site": "Lucio Dery, Paul Michel, Ameet Talwalkar, Graham Neubig", "tldr": "", "abstract": "In most settings of practical concern, machine learning practitioners know in advance what end-task they wish to boost with auxiliary tasks. However, widely used methods for leveraging auxiliary data like pre-training and its continued-pretraining variant are end-task agnostic: they rarely, if ever, exploit knowledge of the target task. We study replacing end-task agnostic continued training of pre-trained language models with end-task aware training of said models. We argue that for sufficiently important end-tasks, the benefits of leveraging auxiliary data in a task-aware fashion can justify forgoing the traditional approach of obtaining generic, end-task agnostic representations as with (continued) pre-training. On three different low-resource NLP tasks from two domains, we demonstrate that multi-tasking the end-task and auxiliary objectives results in significantly better downstream task performance than the widely-used task-agnostic continued pre-training paradigm of Gururangan et al. (2020).\nWe next introduce an online meta-learning algorithm that learns a set of multi-task weights to better balance among our multiple auxiliary objectives, achieving further improvements on end-task performance and data efficiency.", "keywords": "pre-training;multitask learning;meta-learning;deeplearning;end-task aware training;NLP", "primary_area": "", "supplementary_material": "/attachment/95d6f185a3830744171e2f6cf96e2bd4b79f1b9e.zip", "author": "Lucio M. Dery;Paul Michel;Ameet Talwalkar;Graham Neubig", "authorids": "~Lucio_M._Dery1;~Paul_Michel1;~Ameet_Talwalkar1;~Graham_Neubig1", "gender": "M;M;M;M", "homepage": "https://ldery.github.io/;https://pmichel31415.github.io/;http://www.cs.cmu.edu/~atalwalk/;http://phontron.com", "dblp": "211/7773;185/1024;56/5528;03/8155", "google_scholar": "ggFzw0MAAAAJ;oyyIf0YAAAAJ;https://scholar.google.com.tw/citations?user=TW7U1W0AAAAJ;wlosgkoAAAAJ", "orcid": ";;;", "linkedin": ";paul-michel-4954b799/;;", "or_profile": "~Lucio_M._Dery1;~Paul_Michel1;~Ameet_Talwalkar1;~Graham_Neubig1", "aff": "Carnegie Mellon University;Ecole Normale Sup\u00e9rieure de Paris;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;ens.fr;cmu.edu;cmu.edu", "position": "PhD student;Postdoc;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\ndery2022should,\ntitle={Should We Be Pre-training? An Argument for End-task Aware Training as an Alternative},\nauthor={Lucio M. 
Dery and Paul Michel and Ameet Talwalkar and Graham Neubig},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2bO2x8NAIMB}\n}", "github": "", "project": "", "reviewers": "nvTM;HAVV;ZMKG;uWQC", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "211;60;33;63", "wc_summary_review": "51;31;32;60", "wc_main_review": "189;435;470;235", "wc_review": "451;526;535;358", "wc_reply_reviewers": "0;142;0;0", "wc_reply_authors": "334;634;1038;188", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 91.75, 69.83328361175636 ], "wc_summary_review_avg": [ 43.5, 12.419742348374221 ], "wc_main_review_avg": [ 332.25, 121.97412635473148 ], "wc_review_avg": [ 467.5, 71.13543420827625 ], "wc_reply_reviewers_avg": [ 35.5, 61.48780366869514 ], "wc_reply_authors_avg": [ 548.5, 325.15034983834784 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18049548390488755873&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=2bO2x8NAIMB", "email": "cmu.edu;ens.fr;cmu.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Carnegie Mellon University;Ecole Normale Sup\u00e9rieure de Paris", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.ens.fr", "aff_unique_abbr": "CMU;ENS Paris", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;France" }, { "id": "2big50UF39", "title": "Active Deep Multiple Instance Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "State-of-the-art multiple instance learning (MIL) models achieve competitive performance at the bag level. However, instance-level prediction, which is essential for many important applications, remains largely unsatisfactory. We propose a novel active deep multiple instance learning (ADMIL) model that samples a small subset of informative instances for annotation, aiming to significantly boost the instance-level prediction. A variance regularized loss function is designed to properly balance the bias and variance of instance-level predictions, aiming to effectively accommodate the highly imbalanced instance distribution in MIL and other fundamental challenges. Instead of directly minimizing the variance regularized loss that is non-convex, we optimize a distributionally robust bag level likelihood as its convex surrogate. The robust bag likelihood provides a good approximation of the variance based MIL loss with a strong theoretical guarantee. It also automatically balances bias and variance, making it effective to identify the potentially positive instances to support active sampling. The robust bag likelihood can be naturally integrated with a deep architecture to support deep model training using mini-batches of positive-negative bag pairs. 
Finally, a novel P-F sampling function is developed that combines a probability vector and predicted instance scores, obtained by optimizing the robust bag likelihood. By leveraging the key MIL assumption, the sampling function can explore the most challenging bags and effectively detect their positive instances for annotation, which can significantly improve the instance-level prediction. Experiments conducted over multiple real-world datasets clearly demonstrate the state-of-the-art instance-level prediction achieved by the proposed ADMIL model. ", "keywords": "multiple instance learning;active learning", "primary_area": "", "supplementary_material": "", "author": "Hitesh Sapkota;Qi Yu", "authorids": "~Hitesh_Sapkota1;~Qi_Yu1", "gender": "M;M", "homepage": "https://hiteshsapkota.github.io/;https://www.rit.edu/mining/", "dblp": "251/4284;58/6957-1", "google_scholar": "0FKsBXYAAAAJ;L3gWdfEAAAAJ", "orcid": ";0000-0002-0426-5407", "linkedin": "hitesh-sapkota-2226051ba/;", "or_profile": "~Hitesh_Sapkota1;~Qi_Yu1", "aff": "Rochester Institute of Technology;Rochester Institute of Technology", "aff_domain": "rit.edu;rit.edu", "position": "PhD student;Professor", "bibtex": "@misc{\nsapkota2022active,\ntitle={Active Deep Multiple Instance Learning},\nauthor={Hitesh Sapkota and Qi Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=2big50UF39}\n}", "github": "", "project": "", "reviewers": "8Nkn;6dit;bDio;j7WH", "site": "https://openreview.net/forum?id=2big50UF39", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "3;5;5;3", "correctness": "2;2;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "66;38;59;91", "wc_summary_review": "33;12;76;31", "wc_main_review": "481;204;323;114", "wc_review": "580;254;458;236", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.5, 18.9274932307477 ], "wc_summary_review_avg": [ 38.0, 23.42007685726074 ], "wc_main_review_avg": [ 280.5, 137.459994180125 ], "wc_review_avg": [ 382.0, 143.77065069060515 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0", "aff_unique_norm": "Rochester Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.rit.edu", "aff_unique_abbr": "RIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "2cpsEstmH1", "title": "Beyond Examples: Constructing Explanation Space for Explaining Prototypes", "track": "main", "status": "Reject", "tldr": "", "abstract": "As deep learning has been successfully deployed in diverse applications, there is an ever-increasing need for explaining its decisions. Most of the existing methods produce explanations with a second model that explains the first black-box model, but we propose an inherently interpretable model for more faithful explanations.
Using a variational autoencoder, our method constructs an explanation space in which images that are similar in terms of human-interpretable features share similar latent representations. This explanation space provides additional explanations of the relationships, going beyond previous classification networks that provide explanations by distances and learned prototypes. In addition, our distance carries more intrinsic meaning owing to VAE training techniques that regularize the latent space. With a user study, we validate the quality of the explanation space and the additional explanations.", "keywords": "Interpretable machine learning;XAI;Uncertainty;Prototype-based classification", "primary_area": "", "supplementary_material": "/attachment/639b3e9f02f9cb13c4ce356b43b1dd370a55cbe3.zip", "author": "Hyungjun Joo;Seokhyeon Ha;Jae Myung Kim;Seungyub Han;Jungwoo Lee", "authorids": "~Hyungjun_Joo1;~Seokhyeon_Ha1;~Jae_Myung_Kim1;~Seungyub_Han1;~Jungwoo_Lee1", "gender": "M;M;M;M;M", "homepage": ";;https://jaemyung-kim.github.io;;https://cml.snu.ac.kr", "dblp": "289/8846;;51/1888;347/8731;34/516-1", "google_scholar": ";PL-b34sAAAAJ;eP6FHFAAAAAJ;ot1-XNAAAAAJ;j98IWfoAAAAJ", "orcid": ";;;0009-0001-8704-8968;0000-0002-6804-980X", "linkedin": "hyungjun-joo-817a81212/;;;;", "or_profile": "~Hyungjun_Joo1;~Seokhyeon_Ha1;~Jae_Myung_Kim1;~Seungyub_Han1;~Jungwoo_Lee1", "aff": "Seoul National University;Seoul National University;University of Tuebingen;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;uni-tuebingen.de;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\njoo2022beyond,\ntitle={Beyond Examples: Constructing Explanation Space for Explaining Prototypes},\nauthor={Hyungjun Joo and Seokhyeon Ha and Jae Myung Kim and Seungyub Han and Jungwoo Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=2cpsEstmH1}\n}", "github": "", "project": "", "reviewers": "RHQ4;HXZp;QGcu;bDHd", "site": "https://openreview.net/forum?id=2cpsEstmH1", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "3;3;2;3", "correctness": "2;3;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "80;45;70;61", "wc_summary_review": "46;39;48;30", "wc_main_review": "261;77;1279;243", "wc_review": "387;161;1397;334", "wc_reply_reviewers": "0;0;769;0", "wc_reply_authors": "1158;937;3856;601", "reply_reviewers": "0;0;2;0", "reply_authors": "3;2;8;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.0, 12.864680330268607 ], "wc_summary_review_avg": [ 40.75, 7.048936089935842 ], "wc_main_review_avg": [ 465.0, 475.405090422894 ], "wc_review_avg": [ 569.75, 484.86976344168954 ], "wc_reply_reviewers_avg": [ 192.25, 332.98676775511666 ], "wc_reply_authors_avg": [ 1638.0, 1295.8292711619074 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.5, 2.692582403567252 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": 0.8866206949335731, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ELZDunWTlJAJ:scholar.google.com/&scioq=Beyond+Examples:+Constructing+Explanation+Space+for+Explaining+Prototypes&hl=en&as_sdt=0,5", "gs_version_total": 0,
"aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Seoul National University;University of Tuebingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.uni-tuebingen.de/", "aff_unique_abbr": "SNU;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "South Korea;Germany" }, { "id": "2d4riGOpmU8", "title": "Sequential Covariate Shift Detection Using Classifier Two-Sample Tests", "track": "main", "status": "Reject", "tldr": "", "abstract": "A standard assumption in supervised learning is that the training data and test data are from the same distribution. However, this assumption often fails to hold in practice, which can cause the learned model to perform poorly. We consider the problem of detecting covariate shift, where the covariate distribution shifts but the conditional distribution of labels given covariates remains the same. This problem can naturally be solved using a two-sample test--- i.e., test whether the current test distribution of covariates equals the training distribution of covariates. Our algorithm builds on classifier tests, which train a discriminator to distinguish train and test covariates, and then use the accuracy of this discriminator as a test statistic. A key challenge is that classifier tests assume they are given a fixed set of test covariates. In practice, test covariates often arrive sequentially over time---e.g., a self-driving car observes a stream of images while driving. Furthermore, covariate shift can occur multiple times--- i.e., shift and then shift back later or gradually shift over time. To address these challenges, our algorithm trains the discriminator online. Furthermore, it evaluates test accuracy using each new covariate before taking a gradient step; this strategy avoids constructing a held-out test set, which would reduce sample efficiency. We prove that this optimization preserves the correctness---i.e., our algorithm achieves a desired bound on the false positive rate.
In our experiments, we show that our algorithm efficiently detects covariate shifts on ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/715af7f0a3f6b550b3d38b1dc4ed7da99cc7e6ac.zip", "author": "Sooyong Jang;Sangdon Park;Insup Lee;Osbert Bastani", "authorids": "~Sooyong_Jang1;~Sangdon_Park1;~Insup_Lee1;~Osbert_Bastani1", "gender": ";M;;M", "homepage": ";https://sangdon.github.io/;https://www.cis.upenn.edu/~lee/;http://obastani.github.io", "dblp": ";119/1530-1;l/InsupLee.html;21/11275", "google_scholar": ";Vi2E2F4AAAAJ;qPlUgrgAAAAJ;cxYepGkAAAAJ", "orcid": ";;0000-0003-2672-1132;", "linkedin": ";;;", "or_profile": "~Sooyong_Jang1;~Sangdon_Park1;~Insup_Lee1;~Osbert_Bastani1", "aff": ";Georgia Institute of Technology;University of Pennsylvania;University of Pennsylvania", "aff_domain": ";gatech.edu;upenn.edu;upenn.edu", "position": ";Postdoc;Full Professor;Assistant Professor", "bibtex": "@misc{\njang2022sequential,\ntitle={Sequential Covariate Shift Detection Using Classifier Two-Sample Tests},\nauthor={Sooyong Jang and Sangdon Park and Insup Lee and Osbert Bastani},\nyear={2022},\nurl={https://openreview.net/forum?id=2d4riGOpmU8}\n}", "github": "", "project": "", "reviewers": "yyGB;PsoX;Wc7g;X4Zb", "site": "https://openreview.net/forum?id=2d4riGOpmU8", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;5;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;0;3;3", "wc_summary_paper": "80;157;56;155", "wc_summary_review": "60;170;70;18", "wc_main_review": "135;946;207;219", "wc_review": "275;1273;333;392", "wc_reply_reviewers": "0;0;37;90", "wc_reply_authors": "260;936;445;468", "reply_reviewers": "0;0;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 112.0, 44.81629168059312 ], "wc_summary_review_avg": [ 79.5, 55.774097930849585 ], "wc_main_review_avg": [ 376.75, 330.2229360598685 ], "wc_review_avg": [ 568.25, 408.98494776702967 ], "wc_reply_reviewers_avg": [ 31.75, 36.867160183556315 ], "wc_reply_authors_avg": [ 527.25, 249.38662253617375 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.24618298195866545, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17014091212404389073&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Georgia Institute of Technology;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.upenn.edu", "aff_unique_abbr": "Georgia Tech;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "2e7Bf6b-v_P", "title": "ES-ENAS: Blackbox Optimization over Hybrid Spaces via Combinatorial and Continuous Evolution", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We consider the problem of efficient blackbox optimization over a large hybrid search space, consisting of a mixture of a high dimensional continuous space and a complex combinatorial space. 
Such examples arise commonly in evolutionary computation, and also, more recently, in neuroevolution and architecture search for Reinforcement Learning (RL) policies. Unfortunately, previous mutation-based approaches suffer in high dimensional continuous spaces both theoretically and practically. We thus instead propose ES-ENAS, a simple joint optimization procedure by combining Evolutionary Strategies (ES) and combinatorial optimization techniques in a highly scalable and intuitive way, inspired by the \textit{one-shot} or \textit{supernet} paradigm introduced in Efficient Neural Architecture Search (ENAS). Through this relatively simple marriage between two different lines of research, we are able to gain the best of both worlds, and empirically demonstrate our approach by optimizing BBOB functions over hybrid spaces as well as combinatorial neural network architectures via edge pruning and quantization on popular RL benchmarks. Due to the modularity of the algorithm, we are also able to incorporate a wide variety of popular techniques, ranging from the use of different continuous and combinatorial optimizers to constrained optimization.", "keywords": "ES;ENAS;hybrid;search;space;blackbox;combinatorial;optimization;reinforcement;learning;mujoco;policies;evolutionary;computation;neuroevolution;high;dimension;supernet;one-shot;nas;neural;architecture;search;efficient", "primary_area": "", "supplementary_material": "", "author": "Xingyou Song;Krzysztof Marcin Choromanski;Jack Parker-Holder;Yunhao Tang;Daiyi Peng;Deepali Jain;Wenbo Gao;Aldo Pacchiano;Tamas Sarlos;Yuxiang Yang", "authorids": "~Xingyou_Song1;~Krzysztof_Marcin_Choromanski1;~Jack_Parker-Holder1;~Yunhao_Tang1;~Daiyi_Peng1;~Deepali_Jain1;~Wenbo_Gao1;~Aldo_Pacchiano1;~Tamas_Sarlos1;~Yuxiang_Yang2", "gender": "M;;M;M;M;F;;M;M;M", "homepage": "https://xingyousong.github.io/;;https://jparkerholder.github.io/;https://robintyh1.github.io;http://www.daiyip.org;;http://www.columbia.edu/~wg2279;https://www.aldopacchiano.ai;https://sites.google.com/site/stamas/;https://yxyang.github.io", "dblp": "211/7623;78/11411;237/9793.html;210/2229;;84/8010;203/4481;129/6338;48/959;", "google_scholar": "GnpHmO8AAAAJ;;;;_8Egwg8AAAAJ;;;no_BfYgAAAAJ;c4YtO-MAAAAJ;2NQKmzIAAAAJ", "orcid": ";;;;;;;;;", "linkedin": "xingyou-song-355629a1/;;;;;;;;;", "or_profile": "~Xingyou_Song1;~Krzysztof_Marcin_Choromanski1;~Jack_Parker-Holder1;~Yunhao_Tang1;~Daiyi_Peng1;~Deepali_Jain1;~Wenbo_Gao1;~Aldo_Pacchiano1;~Tamas_Sarlos1;~Yuxiang_Yang2", "aff": "Google DeepMind;Google Brain Robotics & Columbia University;University of Oxford;Google DeepMind;;Google;;Microsoft;Google Research;Google", "aff_domain": "google.com;columbia.edu;ox.ac.uk;deepmind.com;;google.com;;microsoft.com;google.com;google.com", "position": "Senior Research Scientist;research scientist & adjunct assistant professor;PhD student;Research Scientist;;Researcher;;Postdoc;Staff Research Scientist;Researcher", "bibtex": "@misc{\nsong2022esenas,\ntitle={{ES}-{ENAS}: Blackbox Optimization over Hybrid Spaces via Combinatorial and Continuous Evolution},\nauthor={Xingyou Song and Krzysztof Marcin Choromanski and Jack Parker-Holder and Yunhao Tang and Daiyi Peng and Deepali Jain and Wenbo Gao and Aldo Pacchiano and Tamas Sarlos and Yuxiang Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=2e7Bf6b-v_P}\n}", "github": "", "project": "", "reviewers": "XvDG;qjiu;4unB", "site": "https://openreview.net/forum?id=2e7Bf6b-v_P", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;3;3", "correctness":
"3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "51;97;174", "wc_summary_review": "59;36;38", "wc_main_review": "257;150;405", "wc_review": "367;283;617", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 107.33333333333333, 50.74336300334152 ], "wc_summary_review_avg": [ 44.333333333333336, 10.402991022884823 ], "wc_main_review_avg": [ 270.6666666666667, 104.55089138256493 ], "wc_review_avg": [ 422.3333333333333, 141.85751850203624 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=605574112780373165&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;2;0;0", "aff_unique_norm": "Google;University of Oxford;Microsoft", "aff_unique_dep": "Google DeepMind;;Microsoft Corporation", "aff_unique_url": "https://deepmind.com;https://www.ox.ac.uk;https://www.microsoft.com", "aff_unique_abbr": "DeepMind;Oxford;Microsoft", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "R5: Rule Discovery with Reinforced and Recurrent Relational Reasoning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7053", "id": "2eXhNpHeW6E", "poster": "", "openreview": "https://openreview.net/forum?id=2eXhNpHeW6E", "slides": "https://iclr.cc/virtual/2022/poster/7053", "video": "https://iclr.cc/virtual/2022/poster/7053", "author_site": "Shengyao Lu, Bang Liu, Keith G Mills, SHANGLING JUI, Di Niu", "tldr": "", "abstract": "Systematicity, i.e., the ability to recombine known parts and rules to form new sequences while reasoning over relational data, is critical to machine intelligence. A model with strong systematicity is able to train on small-scale tasks and generalize to large-scale tasks. In this paper, we propose R5, a relational reasoning framework based on reinforcement learning that reasons over relational graph data and explicitly mines underlying compositional logical rules from observations. R5 has strong systematicity and is robust to noisy data. It consists of a policy value network equipped with Monte Carlo Tree Search to perform recurrent relational prediction and a backtrack rewriting mechanism for rule mining. By alternately applying the two components, R5 progressively learns a set of explicit rules from data and performs explainable and generalizable relation prediction. We conduct extensive evaluations on multiple datasets. Experimental results show that R5 outperforms various embedding-based and rule induction baselines on relation prediction tasks while achieving a high recall rate in discovering ground truth rules. ", "keywords": "systematicity;graph reasoning", "primary_area": "", "supplementary_material": "/attachment/c6b012eaca2260ea3246a34633c60cb211b98152.zip", "author": "Shengyao Lu;Bang Liu;Keith G.
Mills;SHANGLING JUI;Di Niu", "authorids": "~Shengyao_Lu1;~Bang_Liu1;~Keith_G._Mills1;~SHANGLING_JUI1;~Di_Niu1", "gender": "F;M;M;M;M", "homepage": "https://sluxsr.github.io/;http://www-labs.iro.umontreal.ca/~liubang/;https://kgmills.github.io/;;https://www.ualberta.ca/~dniu", "dblp": "320/4184;;299/5864;;82/4953", "google_scholar": "https://scholar.google.ca/citations?user=MSsab9EAAAAJ;lmfAnP4AAAAJ;CBOD_ngAAAAJ;;https://scholar.google.ca/citations?user=3kC5OogAAAAJ", "orcid": ";0000-0002-9483-8984;0000-0001-6054-1798;0000-0002-1047-4264;0000-0002-5250-7327", "linkedin": ";bang-liu-12b66789/?originalSubdomain=ca;kgmills/;;", "or_profile": "~Shengyao_Lu1;~Bang_Liu1;~Keith_G._Mills1;~SHANGLING_JUI1;~Di_Niu1", "aff": "University of Alberta;University of Montreal;Huawei Technologies Ltd.;Huawei Technologies Ltd.;University of Alberta", "aff_domain": "ualberta.ca;umontreal.ca;huawei.com;huawei.com;ualberta.ca", "position": "PhD student;Assistant Professor;Research Intern;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nlu2022r,\ntitle={R5: Rule Discovery with Reinforced and Recurrent Relational Reasoning},\nauthor={Shengyao Lu and Bang Liu and Keith G. Mills and SHANGLING JUI and Di Niu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2eXhNpHeW6E}\n}", "github": "", "project": "", "reviewers": "TLJz;dUTD;Mm5p", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "3;5;3", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "65;79;110", "wc_summary_review": "91;53;11", "wc_main_review": "253;453;182", "wc_review": "409;585;303", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1806;420;701", "reply_reviewers": "0;0;0", "reply_authors": "3;2;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 84.66666666666667, 18.803073034893938 ], "wc_summary_review_avg": [ 51.666666666666664, 32.67346867958093 ], "wc_main_review_avg": [ 296.0, 114.73738129601297 ], "wc_review_avg": [ 432.3333333333333, 116.3022881211811 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 975.6666666666666, 598.2364824121719 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13510369297360682676&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=2eXhNpHeW6E", "email": "ualberta.ca;umontreal.ca;huawei.com;huawei.com;ualberta.ca", "author_num": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Alberta;University of Montreal;Huawei", "aff_unique_dep": ";;Huawei Technologies", "aff_unique_url": "https://www.ualberta.ca;https://www.umontreal.ca;https://www.huawei.com", "aff_unique_abbr": "UAlberta;UM;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "Canada;China" }, { "title": "Critical Points in Quantum Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6777", "id":
"2f1z55GVQN", "poster": "", "openreview": "https://openreview.net/forum?id=2f1z55GVQN", "slides": "https://iclr.cc/virtual/2022/poster/6777", "video": "https://iclr.cc/virtual/2022/poster/6777", "author_site": "Eric Anschuetz", "tldr": "", "abstract": "One of the most important properties of neural networks is the clustering of local minima of the loss function near the global minimum, enabling efficient training. Though generative models implemented on quantum computers are known to be more expressive than their traditional counterparts, it has empirically been observed that these models experience a transition in the quality of their local minima. Namely, below some critical number of parameters, all local minima are far from the global minimum in function value; above this critical parameter count, all local minima are good approximators of the global minimum. Furthermore, for a certain class of quantum generative models, this transition has empirically been observed to occur at parameter counts exponentially large in the problem size, meaning practical training of these models is out of reach. Here, we give the first proof of this transition in trainability, specializing to this latter class of quantum generative model. We use techniques inspired by those used to study the loss landscapes of classical neural networks. We also verify that our analytic results hold experimentally even at modest model sizes.", "keywords": "loss landscapes;quantum;Wishart spin-glass model", "primary_area": "", "supplementary_material": "", "author": "Eric Ricardo Anschuetz", "authorids": "~Eric_Ricardo_Anschuetz1", "gender": "M", "homepage": "https://eanschuetz.github.io/", "dblp": "59/1076", "google_scholar": "dCjnZaUAAAAJ", "orcid": "0000-0002-9825-3692", "linkedin": "eanschuetz/", "or_profile": "~Eric_Ricardo_Anschuetz1", "aff": "Massachusetts Institute of Technology", "aff_domain": "mit.edu", "position": "PhD student", "bibtex": "@inproceedings{\nanschuetz2022critical,\ntitle={Critical Points in Quantum Generative Models},\nauthor={Eric Ricardo Anschuetz},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2f1z55GVQN}\n}", "github": "", "project": "", "reviewers": "zxWF;F6sD;5aN5", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;3;3", "correctness": "2;4;4", "technical_novelty": "3;4;4", "empirical_novelty": "2;3;2", "wc_summary_paper": "116;143;86", "wc_summary_review": "243;39;38", "wc_main_review": "301;236;323", "wc_review": "660;418;447", "wc_reply_reviewers": "125;0;0", "wc_reply_authors": "1043;68;66", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 115.0, 23.280893453645632 ], "wc_summary_review_avg": [ 106.66666666666667, 96.40308893160818 ], "wc_main_review_avg": [ 286.6666666666667, 36.935379004718804 ], "wc_review_avg": [ 508.3333333333333, 107.89604049989765 ], "wc_reply_reviewers_avg": [ 41.666666666666664, 58.92556509887896 ], "wc_reply_authors_avg": [ 392.3333333333333, 460.0915367861294 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], 
"authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 1.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4238856895681826797&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=2f1z55GVQN", "email": "mit.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "2g9m74He1Ky", "title": "Spatio-temporal Disentangled representation learning for mobility prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Spatio-temporal (ST) prediction task like mobility forecasting is of great significance to traffic management and public safety. \nThere is an increasing number of works proposed for mobility forecasting problems recently, and they typically focus on better extraction of the features from the spatial and temporal domains. Although prior works show promising results on more accurate predictions, they still suffer in characterising and separating the dynamic and static components, making it difficult to make further improvements. Disentangled representation learning separates the learnt latent representation into independent variables associated with semantic factors. It offers a better separation of the spatial and temporal features, which could improve the performance of mobility forecasting models. In this work, we propose a VAE-based architecture for learning the disentangled representation from real spatio-temporal data for mobility forecasting. Our deep generative model learns a latent representation that (i) separates the temporal dynamics of the data from the spatially varying component and generates effective reconstructions; (ii) is able to achieve state-of-the-art performance across multiple spatio-temporal datasets. Moreover, we investigate the effectiveness of our method by eliminating the non-informative features from the learnt representations, and the results show that models can benefit from this operation.", "keywords": "Disentangled representation learning;Spatio-temporal data;principle of relevant information", "primary_area": "", "supplementary_material": "", "author": "Sichen Zhao;Wei Shao;Jeffrey Chan;Flora D. Salim", "authorids": "~Sichen_Zhao1;wei.shao@rmit.edu.au;jeffrey.chan@rmit.edu.au;~Flora_D._Salim1", "gender": "M;;;", "homepage": "http://www.sichenzhao.com;;;", "dblp": ";;;", "google_scholar": "hxa94MMAAAAJ;;;", "orcid": "0000-0002-0697-7299;;;", "linkedin": ";;;", "or_profile": "~Sichen_Zhao1;wei.shao@rmit.edu.au;jeffrey.chan@rmit.edu.au;~Flora_D._Salim1", "aff": "Royal Melbourne Institute of Technology;;;", "aff_domain": "rmit.edu.au;;;", "position": "PhD student;;;", "bibtex": "@misc{\nzhao2022spatiotemporal,\ntitle={Spatio-temporal Disentangled representation learning for mobility prediction},\nauthor={Sichen Zhao and Wei Shao and Jeffrey Chan and Flora D. 
Salim},\nyear={2022},\nurl={https://openreview.net/forum?id=2g9m74He1Ky}\n}", "github": "", "project": "", "reviewers": "MSHW;z9DA;ZJKW;f7db", "site": "https://openreview.net/forum?id=2g9m74He1Ky", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;5;4;3", "correctness": "3;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "90;52;79;98", "wc_summary_review": "143;57;65;43", "wc_main_review": "1006;238;482;229", "wc_review": "1239;347;626;370", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "617;230;430;169", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 79.75, 17.383541066192468 ], "wc_summary_review_avg": [ 77.0, 38.91015291668744 ], "wc_main_review_avg": [ 488.75, 315.4119330336124 ], "wc_review_avg": [ 645.5, 359.7308021284805 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 361.5, 176.29591600488084 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3377421083583102709&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Royal Melbourne Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.rmit.edu.au", "aff_unique_abbr": "RMIT", "aff_country_unique_index": "0", "aff_country_unique": "Australia" }, { "title": "Partial Wasserstein Adversarial Network for Non-rigid Point Set Registration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7018", "id": "2ggNjUisGyr", "poster": "", "openreview": "https://openreview.net/forum?id=2ggNjUisGyr", "slides": "https://iclr.cc/virtual/2022/poster/7018", "video": "https://iclr.cc/virtual/2022/poster/7018", "author_site": "Ziming Wang, Nan Xue, Ling Lei, Gui-Song Xia", "tldr": "", "abstract": "Given two point sets, the problem of registration is to recover a transformation that matches one set to the other. This task is challenging due to the presence of a large number of outliers, the unknown non-rigid deformations, and the large sizes of point sets. To obtain strong robustness against outliers, we formulate the registration problem as a partial distribution matching (PDM) problem, where the goal is to partially match the distributions represented by point sets in a metric space. To handle large point sets, we propose a scalable PDM algorithm by utilizing the efficient partial Wasserstein-1 (PW) discrepancy. Specifically, we derive the Kantorovich-Rubinstein duality for the PW discrepancy, and show its gradient can be explicitly computed. Based on these results, we propose a partial Wasserstein adversarial network (PWAN), which is able to approximate the PW discrepancy by a neural network, and minimize it by gradient descent. In addition,\nit incorporates an efficient coherence regularizer for non-rigid transformations to avoid unrealistic deformations. 
We evaluate PWAN on practical point set registration tasks, and show that the proposed PWAN is robust, scalable and performs more favorably than the state-of-the-art methods.\n", "keywords": "partial Wasserstein discrepancy;partial distribution matching;point set registration", "primary_area": "", "supplementary_material": "", "author": "Ziming Wang;Nan Xue;Ling Lei;Gui-Song Xia", "authorids": "~Ziming_Wang1;~Nan_Xue1;~Ling_Lei1;~Gui-Song_Xia3", "gender": "M;M;F;M", "homepage": ";https://xuenan.net;http://www.captain-whu.com/leiling.html;http://www.captain-whu.com/xia_En.html", "dblp": ";153/8762-1;;97/594", "google_scholar": "Z93qEesAAAAJ;CKTrWqYAAAAJ;;SAUCVsEAAAAJ", "orcid": "0000-0002-1739-307X;;;0000-0001-7660-6090", "linkedin": ";;;guisongxia/", "or_profile": "~Ziming_Wang1;~Nan_Xue1;~Ling_Lei1;~Gui-song_Xia1", "aff": "Wuhan University;Wuhan University;;Wuhan University", "aff_domain": "whu.edu.cn;whu.edu.cn;;whu.edu.cn", "position": "PhD student;Research Associate Professor;;Full Professor", "bibtex": "@inproceedings{\nwang2022partial,\ntitle={Partial Wasserstein Adversarial Network for Non-rigid Point Set Registration},\nauthor={Ziming Wang and Nan Xue and Ling Lei and Gui-Song Xia},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2ggNjUisGyr}\n}", "github": "", "project": "", "reviewers": "dwHk;U9Y7;y2JC;7kLR;x4db", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "3;4;3;4;3", "correctness": "3;4;4;4;3", "technical_novelty": "3;3;2;3;2", "empirical_novelty": "3;2;3;3;3", "wc_summary_paper": "49;68;84;104;90", "wc_summary_review": "35;46;13;26;81", "wc_main_review": "169;153;205;95;747", "wc_review": "253;267;302;225;918", "wc_reply_reviewers": "0;0;0;0;314", "wc_reply_authors": "225;148;796;173;2183", "reply_reviewers": "0;0;0;0;3", "reply_authors": "2;2;2;2;6", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 79.0, 18.93145530591877 ], "wc_summary_review_avg": [ 40.2, 23.094588110637524 ], "wc_main_review_avg": [ 273.8, 239.24748692515038 ], "wc_review_avg": [ 393.0, 263.6687315553363 ], "wc_reply_reviewers_avg": [ 62.8, 125.6 ], "wc_reply_authors_avg": [ 705.0, 776.7159068797291 ], "reply_reviewers_avg": [ 0.6, 1.2 ], "reply_authors_avg": [ 2.8, 1.6 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17661046507900526159&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=2ggNjUisGyr", "email": "whu.edu.cn;whu.edu.cn;;whu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn/", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "2hMEdc35xZ6", "title": "Defect Transfer GAN: Diverse Defect Synthesis for Data Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large amounts of data are a common requirement for many deep learning approaches. However, data is not always equally available at large scale for all classes. 
For example, on highly optimized production lines, defective samples are hard to acquire while non-defective samples come almost for free. The defects, however, often seem to resemble each other, e.g., scratches on different products may only differ in a few characteristics. In this work, we propose to make use of the shared characteristics by transferring a stylized defect-specific content from one type of background product to another. Moreover, the stochastic variations of the shared characteristics are captured, which also allows generating novel defects from random noise. These synthetic defective samples enlarge the dataset and increase the diversity of defects on the target product. Experiments demonstrate that our model is able to disentangle the defect-specific content from the background of an image without pixel-level labels. We present convincing results on images from real industrial production lines. Furthermore, we show consistent gains from using our method to enlarge training sets in classification tasks.", "keywords": "Defect synthesis;Generative Adversarial Networks;Content transfer;Automated visual inspection;Data augmentation", "primary_area": "", "supplementary_material": "", "author": "Ruyu Wang;Sabrina Hoppe;Eduardo Monari;Marco Huber", "authorids": "~Ruyu_Wang1;~Sabrina_Hoppe2;eduardo.monari@de.bosch.com;~Marco_Huber2", "gender": "F;F;;M", "homepage": ";;;https://www.ipa.fraunhofer.de/ai", "dblp": "231/3091;;;76/4332", "google_scholar": "dPlp4OAAAAAJ;https://scholar.google.co.il/citations?user=i3TvWsoAAAAJ;;https://scholar.google.de/citations?user=SUU9998AAAAJ", "orcid": ";;;0000-0002-8250-2092", "linkedin": "ruyu-wang/?locale=de_DE;;;marco-huber-78a1a151/", "or_profile": "~Ruyu_Wang1;~Sabrina_Hoppe2;eduardo.monari@de.bosch.com;~Marco_Huber2", "aff": "Robert Bosch GmbH, Bosch;Robert Bosch GmbH, Bosch;;University of Stuttgart", "aff_domain": "de.bosch.com;de.bosch.com;;uni-stuttgart.de", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@misc{\nwang2022defect,\ntitle={Defect Transfer {GAN}: Diverse Defect Synthesis for Data Augmentation},\nauthor={Ruyu Wang and Sabrina Hoppe and Eduardo Monari and Marco Huber},\nyear={2022},\nurl={https://openreview.net/forum?id=2hMEdc35xZ6}\n}", "github": "", "project": "", "reviewers": "swLC;Ks5k;GYiR;GkV1;NBex", "site": "https://openreview.net/forum?id=2hMEdc35xZ6", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "5;4;4;4;4", "correctness": "3;3;3;3;2", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;3;2;2;2", "wc_summary_paper": "55;49;82;60;66", "wc_summary_review": "72;35;35;29;34", "wc_main_review": "138;127;146;227;185", "wc_review": "265;211;263;316;285", "wc_reply_reviewers": "0;41;32;40;38", "wc_reply_authors": "1128;1011;993;1539;1643", "reply_reviewers": "0;1;1;1;1", "reply_authors": "3;3;3;3;4", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 62.4, 11.288932633336069 ], "wc_summary_review_avg": [ 41.0, 15.658863304850707 ], "wc_main_review_avg": [ 164.6, 36.816300737580896 ], "wc_review_avg": [ 268.0, 34.28119017770533 ], "wc_reply_reviewers_avg": [ 30.2, 15.419468213917106 ], "wc_reply_authors_avg": [ 1262.8, 273.93605093159977 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 3.2, 0.39999999999999997 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], 
"corr_recommendation_confidence": -0.25, "corr_recommendation_correctness": -1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15804666820328791444&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1", "aff_unique_norm": "Robert Bosch GmbH;University of Stuttgart", "aff_unique_dep": ";", "aff_unique_url": "https://www.bosch.com;https://www.uni-stuttgart.de", "aff_unique_abbr": "Bosch;USTuttgart", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "2hnbGJBFsv", "title": "Domain Adaptation via Maximizing Surrogate Mutual Information", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Unsupervised domain adaptation (UDA), which is an important topic in transfer learning, aims to predict unlabeled data from target domain with access to labeled data from the source domain. In this work, we propose a novel framework called SIDA (Surrogate Mutual Information Maximization Domain Adaptation) with strong theoretical guarantees. To be specific, SIDA implements adaptation by maximizing mutual information (MI) between features. In the framework, a surrogate joint distribution models the underlying joint distribution of the unlabeled target domain. Our theoretical analysis validates SIDA by bounding the expected risk on target domain with MI and surrogate distribution bias. Experiments show that our approach is comparable with state-of-the-art unsupervised adaptation methods on standard UDA tasks.\n", "keywords": "Domain Adaptation;Transfer Learning;Information Theory", "primary_area": "", "supplementary_material": "/attachment/929d7df02611eb03489895e1e41c3ae3439af28f.zip", "author": "Haiteng Zhao;Ma Chang;Qinyu Chen;Zhi-Hong Deng", "authorids": "~Haiteng_Zhao1;~Ma_Chang1;~Qinyu_Chen2;~Zhi-Hong_Deng1", "gender": "M;F;M;M", "homepage": "https://zhao-ht.github.io/haitengzhao/;https://github.com/chang-github-00;https://morganchen.site;http://www.cis.pku.edu.cn/jzyg/szdw/dzh.htm", "dblp": "304/8330;;;161/4814-1", "google_scholar": "ZQlZN10AAAAJ;8OOpuiIAAAAJ;;https://scholar.google.com.tw/citations?user=tRoAxlsAAAAJ", "orcid": ";;;0000-0002-0263-8142", "linkedin": ";;;", "or_profile": "~Haiteng_Zhao1;~Ma_Chang1;~Qinyu_Chen2;~Zhi-Hong_Deng1", "aff": "Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;Undergrad student;Undergrad student;Full Professor", "bibtex": "@misc{\nzhao2022domain,\ntitle={Domain Adaptation via Maximizing Surrogate Mutual Information},\nauthor={Haiteng Zhao and Ma Chang and Qinyu Chen and Zhi-Hong Deng},\nyear={2022},\nurl={https://openreview.net/forum?id=2hnbGJBFsv}\n}", "github": "", "project": "", "reviewers": "cPYn;nkSv;yY5T", "site": "https://openreview.net/forum?id=2hnbGJBFsv", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "3;4;4", "correctness": "3;4;3", "technical_novelty": "4;3;3", "empirical_novelty": "0;2;0", "wc_summary_paper": "69;45;50", "wc_summary_review": "29;49;26", "wc_main_review": "125;313;304", "wc_review": "223;407;380", "wc_reply_reviewers": "39;31;86", "wc_reply_authors": "104;417;285", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], 
"empirical_novelty_avg": [ 0.6666666666666666, 0.9428090415820634 ], "wc_summary_paper_avg": [ 54.666666666666664, 10.338708279513883 ], "wc_summary_review_avg": [ 34.666666666666664, 10.208928554075703 ], "wc_main_review_avg": [ 247.33333333333334, 86.58072662101088 ], "wc_review_avg": [ 336.6666666666667, 81.12678691748833 ], "wc_reply_reviewers_avg": [ 52.0, 24.26245384677046 ], "wc_reply_authors_avg": [ 268.6666666666667, 128.30259372107625 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6422415684930918135&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "2jYxq9_TkpG", "title": "Network Pruning Optimization by Simulated Annealing Algorithm", "track": "main", "status": "Reject", "tldr": "", "abstract": "One critical problem of large neural networks is over-parameterization with a large number of weight parameters. This becomes an obstacle to implement networks in edge devices as well as limiting the development of industrial applications by engineers for machine learning problems. Plenty of papers have shown that the redundant branches can be erased strategically in a fully connected network. In this work, we reduce network complexity by pruning and structure optimization. We propose to do network optimization by Simulated Annealing, a heuristic based non-convex optimization method which can potentially solve this NP-hard problem and find the global minimum for a given percentage of branch pruning given sufficient amount of time. 
Our results have shown that Simulated Annealing can significantly reduce the complexity of a fully connected neural network with only limited loss of performance.", "keywords": "optimization;network pruning", "primary_area": "", "supplementary_material": "/attachment/40fff44bea68ddeb35da0620748d3cc1abad8999.zip", "author": "Chun Lin Kuo;Ercan Engin Kuruoglu;Wai Kin Victor Chan", "authorids": "~Chun_Lin_Kuo1;kuruoglu@sz.tsinghua.edu.cn;chanw@sz.tsinghua.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "jackkuotw/;;", "or_profile": "~Chun_Lin_Kuo1;kuruoglu@sz.tsinghua.edu.cn;chanw@sz.tsinghua.edu.cn", "aff": "Tsinghua University;;", "aff_domain": "tsinghua.edu.cn;;", "position": "MS student;;", "bibtex": "@misc{\nkuo2022network,\ntitle={Network Pruning Optimization by Simulated Annealing Algorithm},\nauthor={Chun Lin Kuo and Ercan Engin Kuruoglu and Wai Kin Victor Chan},\nyear={2022},\nurl={https://openreview.net/forum?id=2jYxq9_TkpG}\n}", "github": "", "project": "", "reviewers": "1vEQ;EBdG;t58P;inN3", "site": "https://openreview.net/forum?id=2jYxq9_TkpG", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;4;4;4", "correctness": "2;3;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "66;48;39;44", "wc_summary_review": "36;45;30;49", "wc_main_review": "160;141;182;178", "wc_review": "262;234;251;271", "wc_reply_reviewers": "49;31;0;56", "wc_reply_authors": "81;196;196;168", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.25, 10.18270592720815 ], "wc_summary_review_avg": [ 40.0, 7.44983221287567 ], "wc_main_review_avg": [ 165.25, 16.269219403523945 ], "wc_review_avg": [ 254.5, 13.793114224133722 ], "wc_reply_reviewers_avg": [ 34.0, 21.644860821913362 ], "wc_reply_authors_avg": [ 160.25, 47.16129239111244 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:80O83jjYmh0J:scholar.google.com/&scioq=Network+Pruning+Optimization+by+Simulated+Annealing+Algorithm&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "2p_5F9sHN9", "title": "The Geometry of Adversarial Subspaces", "track": "main", "status": "Reject", "tldr": "", "abstract": "Artificial neural networks (ANNs) are constructed using well-understood mathematical operations, and yet their high-dimensional, non-linear, and compositional nature has hindered our ability to provide an intuitive description of how and why they produce any particular output. A striking example of this lack of understanding is our inability to design networks that are robust to adversarial input perturbations, which are often imperceptible to a human observer but cause significant undesirable changes in the network\u2019s response. 
The primary contribution of this work is to further our understanding of the decision boundary geometry of ANN classifiers by utilizing such adversarial perturbations. For this purpose, we define adversarial subspaces, which are spanned by orthogonal directions of minimal perturbation to the decision boundary from any given input sample. We find that the decision boundary lies close to input samples in a large subspace, where the distance to the boundary grows smoothly and sub-linearly as one increases the dimensionality of the subspace. We undertake analysis to characterize the geometry of the boundary, which is more curved within the adversarial subspace than within a random subspace of equal dimensionality. To date, the most widely used defense against test-time adversarial attacks is adversarial training, where one incorporates adversarial attacks into the training procedure. Using our analysis, we provide new insight into the consequences of adversarial training by quantifying the increase in boundary distance within adversarial subspaces, the redistribution of proximal class labels, and the decrease in boundary curvature.", "keywords": "adversarial attack;decision boundary;riemannian geometry;differential geometry;interpretability;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Dylan M. Paiton;David Schultheiss;Matthias Kuemmerer;Zac Cranko;Matthias Bethge", "authorids": "~Dylan_M._Paiton1;david.schultheiss@student.uni-tuebingen.de;~Matthias_Kuemmerer1;~Zac_Cranko1;~Matthias_Bethge1", "gender": "M;;;M;M", "homepage": "http://dylan.vision;;;;https://bethgelab.org", "dblp": "139/5790;;151/6291.html;203/8760;77/3005", "google_scholar": "HYQYE3gAAAAJ;;https://scholar.google.de/citations?user=y5Ej2qYAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0001-9644-4703;;", "linkedin": "dpaiton/;;;;", "or_profile": "~Dylan_M._Paiton1;david.schultheiss@student.uni-tuebingen.de;~Matthias_Kuemmerer1;~Zac_Cranko1;~Matthias_Bethge1", "aff": "University of Tuebingen;;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of Tuebingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "position": "Postdoc;;Postdoc;Postdoc;Full Professor", "bibtex": "@misc{\npaiton2022the,\ntitle={The Geometry of Adversarial Subspaces},\nauthor={Dylan M. 
Paiton and David Schultheiss and Matthias Kuemmerer and Zac Cranko and Matthias Bethge},\nyear={2022},\nurl={https://openreview.net/forum?id=2p_5F9sHN9}\n}", "github": "", "project": "", "reviewers": "xfHm;Pjpz;Dw9Y;kBnb", "site": "https://openreview.net/forum?id=2p_5F9sHN9", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;4;4", "correctness": "3;4;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "127;122;95;55", "wc_summary_review": "44;50;108;21", "wc_main_review": "490;429;279;188", "wc_review": "661;601;482;264", "wc_reply_reviewers": "127;147;0;0", "wc_reply_authors": "1215;948;1002;486", "reply_reviewers": "1;2;0;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 99.75, 28.560243346302215 ], "wc_summary_review_avg": [ 55.75, 32.04976598978532 ], "wc_main_review_avg": [ 346.5, 119.45396602875938 ], "wc_review_avg": [ 502.0, 151.76132577175252 ], "wc_reply_reviewers_avg": [ 68.5, 68.86399639869879 ], "wc_reply_authors_avg": [ 912.75, 265.83770895040453 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:y6iKlMGuPk8J:scholar.google.com/&scioq=The+Geometry+of+Adversarial+Subspaces&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Tuebingen;Eberhard Karls University of T\u00fcbingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "2s4sNT11IcH", "title": "On the Convergence and Calibration of Deep Learning with Differential Privacy", "track": "main", "status": "Reject", "tldr": "", "abstract": "In deep learning with differential privacy (DP), the neural network achieves privacy, usually at the cost of slower convergence (and thus lower performance) than its non-private counterpart. This work gives the first convergence analysis of DP deep learning, through the lens of training dynamics and the neural tangent kernel (NTK) matrix. Our convergence theory successfully characterizes the effects of two key components in DP training: the per-sample clipping and the noise addition. We initiate a general principled framework to understand DP deep learning with any network architecture, loss function, and various optimizers, including DP-Adam. Our analysis also motivates a new clipping method, the 'global clipping', which significantly improves convergence while preserving the same DP guarantee and computational efficiency as the existing method, which we term 'local clipping'. In addition, our global clipping is surprisingly effective at learning calibrated classifiers, in contrast to existing DP classifiers, which are oftentimes over-confident and unreliable. 
Implementation-wise, the new clipping can be realized by inserting one line of code into the Pytorch Opacus library.", "keywords": "Deep Learning;Differential Privacy;Optimization Algorithms;Convergence Theory;Calibration", "primary_area": "", "supplementary_material": "/attachment/fa3b4ed560016ca0ec3e9336cafab2c38ff34d99.zip", "author": "Zhiqi Bu;Hua Wang;Qi Long;Weijie J Su", "authorids": "~Zhiqi_Bu1;~Hua_Wang7;~Qi_Long1;~Weijie_J_Su1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/zhiqi-bu;https://statistics.wharton.upenn.edu/profile/wanghua/;https://www.med.upenn.edu/long-lab/;http://stat.wharton.upenn.edu/~suw/", "dblp": "245/2573;;47/7320;228/9127", "google_scholar": "MEvTLxIAAAAJ;;gfklepYAAAAJ;Uhf4nBkAAAAJ", "orcid": ";;0000-0003-0660-5230;", "linkedin": ";;qi-long-9652a0125/;", "or_profile": "~Zhiqi_Bu1;~Hua_Wang7;~Qi_Long1;~Weijie_J_Su1", "aff": "Amazon;The Wharton School, University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "amazon.com;wharton.upenn.edu;upenn.edu;upenn.edu", "position": "Researcher;PhD student;Professor;Assistant Professor", "bibtex": "@misc{\nbu2022on,\ntitle={On the Convergence and Calibration of Deep Learning with Differential Privacy},\nauthor={Zhiqi Bu and Hua Wang and Qi Long and Weijie J Su},\nyear={2022},\nurl={https://openreview.net/forum?id=2s4sNT11IcH}\n}", "github": "", "project": "", "reviewers": "b1Rw;LL2P;Aa4t;Yf7v", "site": "https://openreview.net/forum?id=2s4sNT11IcH", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;3;3;4", "correctness": "2;4;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;4;4", "wc_summary_paper": "192;388;83;29", "wc_summary_review": "71;23;120;86", "wc_main_review": "912;392;321;13", "wc_review": "1175;803;524;128", "wc_reply_reviewers": "0;0;21;0", "wc_reply_authors": "1008;789;745;78", "reply_reviewers": "0;0;1;0", "reply_authors": "3;2;3;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 173.0, 137.31533053523194 ], "wc_summary_review_avg": [ 75.0, 34.878360053190576 ], "wc_main_review_avg": [ 409.5, 323.20929751478377 ], "wc_review_avg": [ 657.5, 383.1347674121992 ], "wc_reply_reviewers_avg": [ 5.25, 9.093266739736606 ], "wc_reply_authors_avg": [ 655.0, 347.7046160176767 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3665083330689157, "corr_recommendation_correctness": 0.47886115464444223, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3243055467196602428&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Amazon;University of Pennsylvania", "aff_unique_dep": "Amazon.com, Inc.;The Wharton School", "aff_unique_url": "https://www.amazon.com;https://www.wharton.upenn.edu", "aff_unique_abbr": "Amazon;UPenn Wharton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "ZeroFL: Efficient On-Device Training for Federated Learning with Local Sparsity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6036", "id": "2sDQwC_hmnM", "poster": "", "openreview": 
"https://openreview.net/forum?id=2sDQwC_hmnM", "slides": "https://iclr.cc/virtual/2022/poster/6036", "video": "https://iclr.cc/virtual/2022/poster/6036", "author_site": "Xinchi Qiu, Javier Fernandez-Marques, Pedro Porto Buarque de Gusmao, Yan Gao, Titouan Parcollet, Nicholas Lane", "tldr": "", "abstract": "When the available hardware cannot meet the memory and compute requirements to efficiently train high performing machine learning models, a compromise in either the training quality or the model complexity is needed. In Federated Learning (FL), nodes are orders of magnitude more constrained than traditional server-grade hardware and are often battery powered, severely limiting the sophistication of models that can be trained under this paradigm. While most research has focused on designing better aggregation strategies to improve convergence rates and in alleviating the communication costs of FL, fewer efforts have been devoted to accelerating on-device training. Such stage, which repeats hundreds of times (i.e. every round) and can involve thousands of devices, accounts for the majority of the time required to train federated models and, the totality of the energy consumption at the client side. In this work, we present the first study on the unique aspects that arise when introducing sparsity at training time in FL workloads. We then propose ZeroFL, a framework that relies on highly sparse operations to accelerate on-device training. Models trained with ZeroFL and 95% sparsity achieve up to 2.3% higher accuracy compared to competitive baselines obtained from adapting a state-of-the-art sparse training framework to the FL setting.", "keywords": "Federated Learning;sparse training", "primary_area": "", "supplementary_material": "", "author": "Xinchi Qiu;Javier Fernandez-Marques;Pedro PB Gusmao;Yan Gao;Titouan Parcollet;Nicholas Donald Lane", "authorids": "~Xinchi_Qiu1;~Javier_Fernandez-Marques1;pedropgusmao@gmail.com;~Yan_Gao4;~Titouan_Parcollet1;~Nicholas_Donald_Lane1", "gender": "F;;;M;M;", "homepage": ";;;https://www.cst.cam.ac.uk/people/yg381;http://www.darnault-parcollet.fr/Parcollet/index.html;", "dblp": "265/6559;;;;https://dblp.org/pers/hd/p/Parcollet:Titouan;", "google_scholar": "yW6vsS8AAAAJ;;;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;;;", "linkedin": "xinchi-qiu-686a7394/;;;;titouan-parcollet-b233a698;", "or_profile": "~Xinchi_Qiu1;~Javier_Fernandez-Marques1;pedropgusmao@gmail.com;~Yan_Gao4;~Titouan_Parcollet1;~Nicholas_Donald_Lane1", "aff": "University of Cambridge;;;University of Cambridge;Avignon University;", "aff_domain": "cam.ac.uk;;;cam.ac.uk;univ-avignon.fr;", "position": "PhD student;;;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nqiu2022zerofl,\ntitle={Zero{FL}: Efficient On-Device Training for Federated Learning with Local Sparsity},\nauthor={Xinchi Qiu and Javier Fernandez-Marques and Pedro PB Gusmao and Yan Gao and Titouan Parcollet and Nicholas Donald Lane},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2sDQwC_hmnM}\n}", "github": "", "project": "", "reviewers": "Fw5w;sb7x;1bwd", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;3", "correctness": "4;4;2", "technical_novelty": "2;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "312;47;135", "wc_summary_review": "59;75;96", "wc_main_review": "481;243;428", "wc_review": "852;365;659", "wc_reply_reviewers": "295;31;207", "wc_reply_authors": "2633;677;1068", "reply_reviewers": "1;1;2", 
"reply_authors": "5;1;3", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 164.66666666666666, 110.2008267764912 ], "wc_summary_review_avg": [ 76.66666666666667, 15.15109090315135 ], "wc_main_review_avg": [ 384.0, 102.02287325235781 ], "wc_review_avg": [ 625.3333333333334, 200.2370817028876 ], "wc_reply_reviewers_avg": [ 177.66666666666666, 109.75528334536894 ], "wc_reply_authors_avg": [ 1459.3333333333333, 845.1194524378722 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12244144857052708879&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=2sDQwC_hmnM", "email": "cam.ac.uk;;;cam.ac.uk;univ-avignon.fr;", "author_num": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Cambridge;Avignon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.univ-avignon.fr", "aff_unique_abbr": "Cambridge;U. Avignon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;France" }, { "title": "ToM2C: Target-oriented Multi-agent Communication and Cooperation with Theory of Mind", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6578", "id": "2t7CkQXNpuq", "poster": "", "openreview": "https://openreview.net/forum?id=2t7CkQXNpuq", "slides": "https://iclr.cc/virtual/2022/poster/6578", "video": "https://iclr.cc/virtual/2022/poster/6578", "author_site": "Yuanfei Wang, Fangwei Zhong, Jing Xu, Yizhou Wang", "tldr": "", "abstract": "Being able to predict the mental states of others is a key factor to effective social interaction. It is also crucial for distributed multi-agent systems, where agents are required to communicate and cooperate. In this paper, we introduce such an important social-cognitive skill, i.e. Theory of Mind (ToM), to build socially intelligent agents who are able to communicate and cooperate effectively to accomplish challenging tasks. With ToM, each agent is capable of inferring the mental states and intentions of others according to its (local) observation. Based on the inferred states, the agents decide \"when'' and with \"whom'' to share their intentions. With the information observed, inferred, and received, the agents decide their sub-goals and reach a consensus among the team. In the end, the low-level executors independently take primitive actions to accomplish the sub-goals. We demonstrate the idea in two typical target-oriented multi-agent tasks: cooperative navigation and multi-sensor target coverage. 
The experiments show that the proposed model not only outperforms the state-of-the-art methods on reward and communication efficiency, but also shows good generalization across different scales of the environment.\n", "keywords": "Theory of Mind;Target-oriented Multi-Agent Cooperation;Multi-agent Communication", "primary_area": "", "supplementary_material": "/attachment/7d57aba638cf68329bdd4dd937db8435cef956d5.zip", "author": "Yuanfei Wang;fangwei zhong;Jing Xu;Yizhou Wang", "authorids": "~Yuanfei_Wang1;~fangwei_zhong1;~Jing_Xu2;~Yizhou_Wang1", "gender": "M;M;F;M", "homepage": "https://yuanfei-wang.github.io/;https://fangweizhong.xyz/;;https://cfcs.pku.edu.cn/wangyizhou/", "dblp": "47/10626;207/1900;;71/3387-1", "google_scholar": ";ejDz1bYAAAAJ;;831z_VcAAAAJ", "orcid": "0009-0008-8908-1981;0000-0002-0428-4552;;", "linkedin": ";;;", "or_profile": "~Yuanfei_Wang1;~fangwei_zhong1;~Jing_Xu2;~Yizhou_Wang1", "aff": "Peking University;Peking University;;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;;pku.edu.cn", "position": "PhD student;Postdoc;;Full Professor", "bibtex": "@inproceedings{\nwang2022tomc,\ntitle={ToM2C: Target-oriented Multi-agent Communication and Cooperation with Theory of Mind},\nauthor={Yuanfei Wang and fangwei zhong and Jing Xu and Yizhou Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=2t7CkQXNpuq}\n}", "github": "", "project": "", "reviewers": "V4LB;jxPG;YARZ", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;4", "correctness": "4;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "78;97;97", "wc_summary_review": "44;56;48", "wc_main_review": "818;574;388", "wc_review": "940;727;533", "wc_reply_reviewers": "101;49;54", "wc_reply_authors": "1126;735;1023", "reply_reviewers": "1;1;1", "reply_authors": "2;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 90.66666666666667, 8.956685895029603 ], "wc_summary_review_avg": [ 49.333333333333336, 4.988876515698588 ], "wc_main_review_avg": [ 593.3333333333334, 176.07826542635965 ], "wc_review_avg": [ 733.3333333333334, 166.21739446346228 ], "wc_reply_reviewers_avg": [ 68.0, 23.423634787681152 ], "wc_reply_authors_avg": [ 961.3333333333334, 165.47373071142002 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13700850065152438149&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=2t7CkQXNpuq", "email": "pku.edu.cn;pku.edu.cn;;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "2wiaitACS_O", "title": "CUP: A Conservative Update Policy Algorithm for Safe Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Safe 
reinforcement learning (RL) is still very challenging since it requires the agent to consider both return maximization and safe exploration.\nIn this paper, we propose CUP, a \textbf{C}onservative \textbf{U}pdate \textbf{P}olicy algorithm with a theoretical safety guarantee.\nThe derivation of CUP is based on surrogate functions w.r.t. our new proposed bounds.\nAlthough using bounds as surrogate functions to design safe RL algorithms has appeared in some existing works, we develop it in at least three aspects: \n\textbf{(i)} We provide a rigorous theoretical analysis to extend the bounds w.r.t. the generalized advantage estimator (GAE).\nGAE significantly reduces variance while maintaining a tolerable level of bias, which is a key step in our design of CUP;\n\textbf{(ii)} The proposed bounds are more compact than those in existing works, i.e., using the proposed bounds as surrogate functions yields better local approximations to the objective and constraints.\n\textbf{(iii)} The bound on the worst-case safety constraint violation of CUP is more compact than those of existing safe RL algorithms, which explains why CUP performs so well in practice.\nFinally, extensive experiments on continuous control tasks show the effectiveness of CUP, where the agent satisfies safety constraints.\n", "keywords": "reinforcement learning;constrained Markov decision processes;safety learning", "primary_area": "", "supplementary_material": "/attachment/c2b976e001cf511ed7bd67b78cac0783ebc7a3bc.zip", "author": "Long Yang;Yu Zhang;Jiaming Ji;Juntao Dai;Weidong Zhang", "authorids": "~Long_Yang4;~Yu_Zhang33;~Jiaming_Ji1;~Juntao_Dai1;~Weidong_Zhang2", "gender": "M;M;M;M;M", "homepage": "https://person.zju.edu.cn/longyang;https://person.zju.edu.cn/en/yuzhang1;https://github.com/jijiaming-bit;https://person.zju.edu.cn/jtdai;https://www.linkedin.com/in/weidongz/", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Long_Yang4;~Yu_Zhang33;~Jiaming_Ji1;~Juntao_Dai1;~Weidong_Zhang2", "aff": "Peking University;;Zhejiang University;Zhejiang University;Netease Games AI Lab", "aff_domain": "pku.edu.cn;;zju.edu.cn;zju.edu.cn;corp.netease.com", "position": "Postdoc;;MS student;PhD student;Director", "bibtex": "@misc{\nyang2022cup,\ntitle={{CUP}: A Conservative Update Policy Algorithm for Safe Reinforcement Learning},\nauthor={Long Yang and Yu Zhang and Jiaming Ji and Juntao Dai and Weidong Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=2wiaitACS_O}\n}", "github": "", "project": "", "reviewers": "mcQ6;TJT1;nTtg;eL62", "site": "https://openreview.net/forum?id=2wiaitACS_O", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;4;2", "correctness": "4;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "12;86;48;667", "wc_summary_review": "8;12;48;212", "wc_main_review": "56;281;530;1014", "wc_review": "76;379;626;1893", "wc_reply_reviewers": "0;0;0;72", "wc_reply_authors": "713;548;715;1935", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;2;7", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 203.25, 269.02172310057045 ], "wc_summary_review_avg": [ 70.0, 83.45058418010026 ], "wc_main_review_avg": [ 470.25, 355.89772056027556 ], "wc_review_avg": [ 743.5, 691.6597790821727 ], "wc_reply_reviewers_avg": [ 18.0, 31.176914536239792 ], 
"wc_reply_authors_avg": [ 977.75, 556.8084836817773 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 2.345207879911715 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.20751433915982243, "corr_recommendation_correctness": -0.9271726499455306, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6017589196848028504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Peking University;Zhejiang University;NetEase Games", "aff_unique_dep": ";;AI Lab", "aff_unique_url": "http://www.pku.edu.cn;https://www.zju.edu.cn;https://game.netease.com", "aff_unique_abbr": "Peking U;ZJU;NGAL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "2yITmG7YIFT", "title": "HD-cos Networks: Efficient Neural Architechtures for Secure Multi-Party Computation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-party computation (MPC) is a branch of cryptography where multiple non-colluding parties execute a well designed protocol to securely compute a function. With the non-colluding party assumption, MPC has a cryptographic guarantee that the parties will not learn sensitive information from the computation process, making it an appealing framework for applications that involve privacy-sensitive user data.\nIn this paper, we study training and inference of neural networks under the MPC setup. This is challenging because the elementary operations of neural networks such as the ReLU activation function and matrix-vector multiplications are very expensive to compute due to the added multi-party communication overhead. \nTo address this, we propose the HD-cos network that uses 1) cosine as activation function, 2) the Hadamard-Diagonal transformation to replace the unstructured linear transformations. We show that both of the approaches enjoy strong theoretical motivations and efficient computation under the MPC setup. We demonstrate on multiple public datasets that HD-cos matches the quality of the more expensive baselines. 
", "keywords": "multi-party computation;privacy;cryptography;privacy-preserving machine learning", "primary_area": "", "supplementary_material": "", "author": "Wittawat Jitkrittum;Michal Lukasik;Ananda Theertha Suresh;Felix Yu;Gang Wang", "authorids": "~Wittawat_Jitkrittum1;~Michal_Lukasik1;~Ananda_Theertha_Suresh1;~Felix_Yu1;wanggang@google.com", "gender": "M;;M;M;", "homepage": "http://wittawat.com;https://mlukasik.github.io/;https://theertha.info;http://felixyu.org;", "dblp": "95/3398.html;72/11338;119/3884;23/10574;", "google_scholar": "https://scholar.google.co.uk/citations?hl=en;https://scholar.google.co.uk/citations?user=cLZLZCQAAAAJ;K6ef57QAAAAJ;lYvF6cUAAAAJ;", "orcid": "0000-0002-9400-9262;;;;", "linkedin": "wittawat-jitkrittum/;;;;", "or_profile": "~Wittawat_Jitkrittum1;~Michal_Lukasik1;~Ananda_Theertha_Suresh1;~Felix_Yu1;wanggang@google.com", "aff": "Google Research;Google Research;Google;Google;", "aff_domain": "google.com;google.com;google.com;google.com;", "position": "Research Scientist;Research Scientist;Research Scientist;Research Scientist;", "bibtex": "@misc{\njitkrittum2022hdcos,\ntitle={{HD}-cos Networks: Efficient Neural Architechtures for Secure Multi-Party Computation},\nauthor={Wittawat Jitkrittum and Michal Lukasik and Ananda Theertha Suresh and Felix Yu and Gang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=2yITmG7YIFT}\n}", "github": "", "project": "", "reviewers": "pKsC;g4Ma;w59J;7c3H", "site": "https://openreview.net/forum?id=2yITmG7YIFT", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;3;4;4", "correctness": "1;2;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "36;36;75;63", "wc_summary_review": "16;6;46;30", "wc_main_review": "43;150;135;159", "wc_review": "95;192;256;252", "wc_reply_reviewers": "27;37;0;55", "wc_reply_authors": "64;211;250;361", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 52.5, 17.03672503740082 ], "wc_summary_review_avg": [ 24.5, 15.058220346375597 ], "wc_main_review_avg": [ 121.75, 46.267564232408 ], "wc_review_avg": [ 198.75, 65.04373528634406 ], "wc_reply_reviewers_avg": [ 29.75, 19.891895334532606 ], "wc_reply_authors_avg": [ 221.5, 106.28852242834125 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.42640143271122083, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Kw8Ry2hpfAkJ:scholar.google.com/&scioq=HD-cos+Networks:+Efficient+Neural+Architechtures+for+Secure+Multi-Party+Computation&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "2z5h4hY-LQ", "title": "GAETS: A Graph Autoencoder Time Series Approach Towards Battery Parameter Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": " Lithium-ion batteries are 
powering the ongoing transportation electrification revolution. Lithium-ion batteries possess higher energy density and favourable electrochemical properties which make them a preferable energy source for electric vehicles. Precise estimation of battery parameters (charge capacity, voltage, etc.) is vital for estimating the available range of an electric vehicle. Graph-based estimation techniques enable us to understand the variable dependencies underpinning them to improve estimates. In this paper we employ Graph Neural Networks for battery parameter estimation and introduce a unique graph autoencoder time series estimation approach. Variables in battery measurements are known to have an underlying relationship with each other in a certain causal structure. Therefore, we include ideas from the field of causal structure learning as a regularisation to our learned adjacency matrix technique. We use a graph autoencoder based on a non-linear version of NOTEARS (Zheng et al., 2018), as this allows us to perform gradient descent when learning the structure (instead of treating it as a combinatorial optimisation problem). The proposed architecture outperforms the state-of-the-art Graph Time Series (GTS) architecture of Shang et al. (2021a) for battery parameter estimation. We call our method GAETS (Graph AutoEncoder Time Series).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Edward Elson Kosasih;Rucha Bhalchandra Joshi;Janamejaya Channegowda", "authorids": "eek31@cam.ac.uk;rucha.joshi@niser.ac.in;~Janamejaya_Channegowda1", "gender": ";;M", "homepage": ";;", "dblp": ";;276/0113", "google_scholar": ";;", "orcid": ";;0000-0002-3634-4214", "linkedin": ";;https://in.linkedin.com/in/janamejaya-channegowda", "or_profile": "eek31@cam.ac.uk;rucha.joshi@niser.ac.in;~Janamejaya_Channegowda1", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkosasih2022gaets,\ntitle={{GAETS}: A Graph Autoencoder Time Series Approach Towards Battery Parameter Estimation},\nauthor={Edward Elson Kosasih and Rucha Bhalchandra Joshi and Janamejaya Channegowda},\nyear={2022},\nurl={https://openreview.net/forum?id=2z5h4hY-LQ}\n}", "github": "", "project": "", "reviewers": "61UC;LsgQ;L9LB;5kWi", "site": "https://openreview.net/forum?id=2z5h4hY-LQ", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;4;3", "correctness": "2;2;4;2", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "62;138;35;46", "wc_summary_review": "44;89;17;48", "wc_main_review": "193;386;132;221", "wc_review": "299;613;184;315", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 70.25, 40.2763888649417 ], "wc_summary_review_avg": [ 49.5, 25.734218464915543 ], "wc_main_review_avg": [ 233.0, 94.01329693187023 ], "wc_review_avg": [ 352.75, 158.52503745465572 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9033176969628331952&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "30SXt3-vvnM", "title": "Model-Efficient Deep Learning with Kernelized Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "We investigate the possibility of using the embeddings produced by a lightweight network more effectively with a nonlinear classification layer. Although conventional deep networks use an abundance of nonlinearity for representation (embedding) learning, they almost universally use a linear classifier on the learned embeddings. This is suboptimal since better nonlinear classifiers could exist in the same embedding vector space. We advocate a nonlinear kernelized classification layer for deep networks to tackle this problem. We theoretically show that our classification layer optimizes over all possible kernel functions on the space of embeddings to learn an optimal nonlinear classifier. We then demonstrate the usefulness of this layer in learning more model-efficient classifiers in a number of computer vision and natural language processing tasks.", "keywords": "deep networks;kernels on the sphere;nonlinear classification", "primary_area": "", "supplementary_material": "", "author": "Sadeep Jayasumana;Srikumar Ramalingam;Sanjiv Kumar", "authorids": "~Sadeep_Jayasumana1;~Srikumar_Ramalingam2;~Sanjiv_Kumar1", "gender": ";M;", "homepage": ";https://www.cs.utah.edu/~srikumar/;http://www.sanjivk.com/", "dblp": ";17/4216;", "google_scholar": ";6m1ptOgAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";srikumar-ramalingam-17728b22/;", "or_profile": "~Sadeep_Jayasumana1;~Srikumar_Ramalingam2;~Sanjiv_Kumar1", "aff": ";Google;Google", "aff_domain": ";google.com;google.com", "position": ";Research Scientist;Research Scientist", "bibtex": "@misc{\njayasumana2022modelefficient,\ntitle={Model-Efficient Deep Learning with Kernelized Classification},\nauthor={Sadeep Jayasumana and Srikumar Ramalingam and Sanjiv Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=30SXt3-vvnM}\n}", "github": "", "project": "", "reviewers": "ZEPc;pWF3;PtK6;WDU4", "site": "https://openreview.net/forum?id=30SXt3-vvnM", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;3;4", "correctness": "3;2;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "104;73;82;87", "wc_summary_review": "54;24;51;28", "wc_main_review": "532;96;166;297", "wc_review": "690;193;299;412", "wc_reply_reviewers": "46;0;0;37", "wc_reply_authors": "1511;393;338;746", "reply_reviewers": "1;0;0;1", "reply_authors": "3;1;1;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.5, 11.280514172678478 ], "wc_summary_review_avg": [ 39.25, 13.36740438529485 ], "wc_main_review_avg": [ 272.75, 166.1586215036704 ], "wc_review_avg": [ 398.5, 185.25995249918424 ], "wc_reply_reviewers_avg": [ 20.75, 20.99255820523073 ], "wc_reply_authors_avg": [ 747.0, 468.05288162770665 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4516588113967378178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "30nbp1eV0dJ", "title": "Tight lower bounds for Differentially Private ERM", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the lower bounds of differentially private ERM for general convex functions. For approximate-DP, the well-known upper bound of DP-ERM is $O(\\frac{\\sqrt{p\\log(1/\\delta)}}{\\epsilon n})$, which is believed to be tight. However, current lower bounds are off by some logarithmic terms, in particular $\\Omega(\\frac{\\sqrt{p}}{\\epsilon n})$ for constrained case and $\\Omega(\\frac{\\sqrt{p}}{\\epsilon n \\log p})$ for unconstrained case.\n\nWe achieve tight $\\Omega(\\frac{\\sqrt{p \\log(1/\\delta)}}{\\epsilon n})$ lower bounds for both cases by introducing a novel biased mean property for fingerprinting codes. As for pure-DP, we utilize a novel $\\ell_2$ loss function instead of linear functions considered by previous papers, and achieve the first (tight) $\\Omega(\\frac{p}{\\epsilon n})$ lower bound. We also introduce an auxiliary dimension to simplify the computation brought by $\\ell_2$ loss. Our results close a gap in our understanding of DP-ERM by presenting the fundamental limits. Our techniques may be of independent interest, which help enrich the tools so that it readily applies to problems that are not (easily) reducible from one-way marginals.", "keywords": "Differential Privacy;Empirical Risk Minimization;Lower bounds", "primary_area": "", "supplementary_material": "/attachment/ee943d753a2cfaf40898567b87dc730ce3af1e60.zip", "author": "Daogao Liu;Zhou Lu", "authorids": "~Daogao_Liu1;~Zhou_Lu1", "gender": "M;", "homepage": "https://daogaoliu.github.io/;https://leozoroaster.github.io/", "dblp": "245/4078;68/11524", "google_scholar": "auA3AaQAAAAJ;17_nX_kAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Daogao_Liu1;~Zhou_Lu1", "aff": "University of Washington, Seattle;Princeton University", "aff_domain": "uw.edu;princeton.edu", "position": "PhD student;PhD student", "bibtex": "@misc{\nliu2022tight,\ntitle={Tight lower bounds for Differentially Private {ERM}},\nauthor={Daogao Liu and Zhou Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=30nbp1eV0dJ}\n}", "github": "", "project": "", "reviewers": "rseh;E1tY;FRcE;bqs4", "site": "https://openreview.net/forum?id=30nbp1eV0dJ", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;2;3", "correctness": "4;2;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;0;3;1", "wc_summary_paper": "85;417;62;70", "wc_summary_review": "45;86;48;42", "wc_main_review": "216;553;266;74", "wc_review": "346;1056;376;186", "wc_reply_reviewers": "136;95;0;0", "wc_reply_authors": "644;870;72;25", "reply_reviewers": "2;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 158.5, 149.47324175249562 ], "wc_summary_review_avg": [ 55.25, 17.879807045938723 ], "wc_main_review_avg": [ 277.25, 
174.08816013732812 ], "wc_review_avg": [ 491.0, 334.1032774457623 ], "wc_reply_reviewers_avg": [ 57.75, 59.541477139889636 ], "wc_reply_authors_avg": [ 402.75, 363.52948642441646 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.39605901719066966, "corr_recommendation_correctness": 0.08084520834544431, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6112598734384419642&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Washington;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://www.washington.edu;https://www.princeton.edu", "aff_unique_abbr": "UW;Princeton", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Tale of Two Flows: Cooperative Learning of Langevin Flow and Normalizing Flow Toward Energy-Based Model", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6153", "id": "31d5RLCUuXC", "poster": "", "openreview": "https://openreview.net/forum?id=31d5RLCUuXC", "slides": "https://iclr.cc/virtual/2022/poster/6153", "video": "https://iclr.cc/virtual/2022/poster/6153", "author_site": "Jianwen Xie, yaxuan zhu, Jun Li, Ping Li", "tldr": "", "abstract": "This paper studies the cooperative learning of two generative flow models, in which the two models are iteratively updated based on the jointly synthesized examples. The first flow model is a normalizing flow that transforms an initial simple density to a target density by applying a sequence of invertible transformations. The second flow model is a Langevin flow that runs finite steps of gradient-based MCMC toward an energy-based model. We start from proposing a generative framework that trains an energy-based model with a normalizing flow as an amortized sampler to initialize the MCMC chains of the energy-based model. In each learning iteration, we generate synthesized examples by using a normalizing flow initialization followed by a short-run Langevin flow revision toward the current energy-based model. Then we treat the synthesized examples as fair samples from the energy-based model and update the model parameters with the maximum likelihood learning gradient, while the normalizing flow directly learns from the synthesized examples by maximizing the tractable likelihood. Under the short-run non-mixing MCMC scenario, the estimation of the energy-based model is shown to follow the perturbation of maximum likelihood, and the short-run Langevin flow and the normalizing flow form a two-flow generator that we call CoopFlow. We provide an understating of the CoopFlow algorithm by information geometry and show that it is a valid generator as it converges to a moment matching estimator. 
We demonstrate that the trained CoopFlow is capable of synthesizing realistic images, reconstructing images, and interpolating between images.", "keywords": "Langevin dynamics;energy-based model;normalizing flow;cooperative learning;short-run MCMC", "primary_area": "", "supplementary_material": "", "author": "Jianwen Xie;Yaxuan Zhu;Jun Li;Ping Li", "authorids": "~Jianwen_Xie1;~Yaxuan_Zhu1;~Jun_Li13;~Ping_Li3", "gender": ";M;M;M", "homepage": ";;https://junli-galios.github.io/;http://www.stat.rutgers.edu/home/pingli/", "dblp": ";289/6018;116/1011-98;62/5860-1", "google_scholar": ";EptgCGsAAAAJ;fyQZYz8AAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jianwen_Xie1;~Yaxuan_Zhu1;~Jun_Li13;~Ping_Li3", "aff": ";University of California, Los Angeles;Baidu;LinkedIn", "aff_domain": ";ucla.edu;baidu.com;linkedin.com", "position": ";PhD student;Postdoc;Engineer", "bibtex": "@inproceedings{\nxie2022a,\ntitle={A Tale of Two Flows: Cooperative Learning of Langevin Flow and Normalizing Flow Toward Energy-Based Model},\nauthor={Jianwen Xie and Yaxuan Zhu and Jun Li and Ping Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=31d5RLCUuXC}\n}", "github": "", "project": "", "reviewers": "QoKP;DUki;bAHo;1uN8", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "197;77;54;112", "wc_summary_review": "43;131;35;22", "wc_main_review": "495;383;179;197", "wc_review": "735;591;268;331", "wc_reply_reviewers": "379;297;49;24", "wc_reply_authors": "6290;2674;2977;1137", "reply_reviewers": "1;2;1;1", "reply_authors": "14;7;6;4", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 110.0, 54.30929938785806 ], "wc_summary_review_avg": [ 57.75, 42.949825377991935 ], "wc_main_review_avg": [ 313.5, 131.7526090823252 ], "wc_review_avg": [ 481.25, 190.0557484003049 ], "wc_reply_reviewers_avg": [ 187.25, 153.7666657634222 ], "wc_reply_authors_avg": [ 3269.5, 1878.2407327070723 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 7.75, 3.766629793329841 ], "replies_avg": [ 42, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13555643758738744317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=31d5RLCUuXC", "email": ";ucla.edu;baidu.com;linkedin.com", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Los Angeles;Baidu;LinkedIn Corporation", "aff_unique_dep": ";Baidu, Inc.;", "aff_unique_url": "https://www.ucla.edu;https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "UCLA;Baidu;LinkedIn", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "327eol9Xgyi", "title": "Trident Pyramid Networks: The importance of processing at the feature pyramid level for better object detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Feature pyramids have become ubiquitous in multi-scale computer vision 
tasks such as object detection. Based on their importance, we divide a computer vision network into three parts: a backbone (generating a feature pyramid), a core (refining the feature pyramid) and a head (generating the final output). Most existing networks operating on feature pyramids, named cores, are shallow and mostly focus on communication-based processing in the form of top-down and bottom-up operations. We present a new core architecture called Trident Pyramid Network (TPN), which allows for a deeper design and for a better balance between communication-based processing and self-processing. We show consistent improvements when using our TPN core on the COCO object detection benchmark, outperforming the popular BiFPN baseline by 1.5 AP. Additionally, we empirically show that it is more beneficial to put additional computation into the TPN core, rather than into the backbone, by outperforming a ResNet-101+FPN baseline with our ResNet-50+TPN network by 1.7 AP, while operating under similar computation budgets. This emphasizes the importance of performing computation at the feature pyramid level in modern-day object detection systems. Code will be released.", "keywords": "feature pyramid;network architecture;object detection;deep learning", "primary_area": "", "supplementary_material": "", "author": "C\u00e9dric Picron;Tinne Tuytelaars", "authorids": "~C\u00e9dric_Picron1;~Tinne_Tuytelaars1", "gender": "M;", "homepage": "https://www.kuleuven.be/wieiswie/nl/person/00123384;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~C\u00e9dric_Picron1;~Tinne_Tuytelaars1", "aff": "Department of Electrical Engineering, KU Leuven, Belgium, KU Leuven;", "aff_domain": "esat.kuleuven.be;", "position": "PhD student;", "bibtex": "@misc{\npicron2022trident,\ntitle={Trident Pyramid Networks: The importance of processing at the feature pyramid level for better object detection},\nauthor={C{\\'e}dric Picron and Tinne Tuytelaars},\nyear={2022},\nurl={https://openreview.net/forum?id=327eol9Xgyi}\n}", "github": "", "project": "", "reviewers": "EG97;MSxk;YsZ7;NXJS", "site": "https://openreview.net/forum?id=327eol9Xgyi", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;5;4;4", "correctness": "4;4;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "46;73;209;78", "wc_summary_review": "25;26;124;57", "wc_main_review": "268;263;959;296", "wc_review": "339;362;1292;431", "wc_reply_reviewers": "1155;125;17;0", "wc_reply_authors": "1573;374;1224;287", "reply_reviewers": "4;1;1;0", "reply_authors": "4;1;3;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 101.5, 63.24752959602454 ], "wc_summary_review_avg": [ 58.0, 40.21815510438041 ], "wc_main_review_avg": [ 446.5, 296.1591632889315 ], "wc_review_avg": [ 606.0, 397.5066037187307 ], "wc_reply_reviewers_avg": [ 324.25, 482.0235341764964 ], "wc_reply_authors_avg": [ 864.5, 548.9328283132646 ], "reply_reviewers_avg": [ 1.5, 1.5 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": -0.8164965809277259, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17261488724596259658&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, 
"aff_unique_index": "0", "aff_unique_norm": "KU Leuven", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven", "aff_country_unique_index": "0", "aff_country_unique": "Belgium" }, { "id": "32KyhxmvmO", "title": "Learning Representations of Partial Subgraphs by Subgraph InfoMax", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Subgraphs are important substructures of graphs, but learning their representations has not been studied well. Particularly, when we have partial subgraphs, existing node- or subgraph-level message-passing is likely to produce suboptimal representations. In this paper, we propose Intra- and Inter-Subgraph InfoMax, a model that learns subgraph representations under incomplete observation. Our model employs subgraph summaries at two different levels while maximizing the mutual information between the subgraph summaries and the node representations. By doing so, we reconstruct the representation of the underlying subgraph and improve its expressiveness from different angles of the local-global structure. We conduct experiments on three real-world datasets under training and evaluation protocols designed for this problem. Experimental results show that our model outperforms baselines in all settings.", "keywords": "Graph Neural Network;Subgraph;Mutual Information Maximization", "primary_area": "", "supplementary_material": "/attachment/e901f812a85f092161197695ccd0484477c65078.zip", "author": "Dongkwan Kim;Jiho Jin;Jaimeen Ahn;Alice Oh", "authorids": "~Dongkwan_Kim1;~Jiho_Jin1;~Jaimeen_Ahn1;~Alice_Oh1", "gender": "M;;M;F", "homepage": "https://dongkwan-kim.github.io/;https://jinjh0123.github.io/;https://jaimeenahn.github.io;http://uilab.kr", "dblp": "62/10307-1.html;320/5607;;50/7562", "google_scholar": "KgjSE64AAAAJ;-I0ahKwAAAAJ;-UW9YckAAAAJ;https://scholar.google.co.kr/citations?user=B88-xMEAAAAJ", "orcid": ";0000-0002-1767-3733;;", "linkedin": "dongkwan-kim/;jiho-jin;;alice-oh-4677544/", "or_profile": "~Dongkwan_Kim1;~Jiho_Jin1;~Jaimeen_Ahn1;~Alice_Oh1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;MS student;MS student;Full Professor", "bibtex": "@misc{\nkim2022learning,\ntitle={Learning Representations of Partial Subgraphs by Subgraph InfoMax},\nauthor={Dongkwan Kim and Jiho Jin and Jaimeen Ahn and Alice Oh},\nyear={2022},\nurl={https://openreview.net/forum?id=32KyhxmvmO}\n}", "github": "", "project": "", "reviewers": "38CS;z3wS;NYN3", "site": "https://openreview.net/forum?id=32KyhxmvmO", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "3;3;2", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "39;28;9", "wc_summary_review": "17;13;11", "wc_main_review": "515;84;237", "wc_review": "571;125;257", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "603;135;216", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 25.333333333333332, 12.39175353029407 ], "wc_summary_review_avg": [ 13.666666666666666, 
2.494438257849294 ], "wc_main_review_avg": [ 278.6666666666667, 178.40465863374257 ], "wc_review_avg": [ 317.6666666666667, 187.06386312938395 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 318.0, 204.22046910141012 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RM1cPMXnCqUJ:scholar.google.com/&scioq=Learning+Representations+of+Partial+Subgraphs+by+Subgraph+InfoMax&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "32OdIHsu1_", "title": "DL-based prediction of optimal actions of human experts", "track": "main", "status": "Reject", "tldr": "", "abstract": "Expert systems have been developed to emulate human experts\u2019 decision-making. Once developed properly, expert systems can assist or substitute human experts, but they require overly expensive knowledge engineering/acquisition. Notably, deep learning (DL) can train highly efficient computer vision systems only from examples instead of relying on carefully selected feature sets by human experts. Thus, we hypothesize that DL can be used to build expert systems that can learn human experts\u2019 decision-making from examples only without relying on overly expensive knowledge engineering. To address this hypothesis, we train DL agents to predict optimal strategies (actions or action sequences) for the popular game `Angry Birds\u2019, which requires complex problem-solving skills. In our experiments, after being trained with screenshots of different levels and pertinent 3-star guides, DL agents can predict strategies for unseen levels. This raises the possibility of building DL-based expert systems that do not require overly expensive knowledge engineering.", "keywords": "deep learning;expert system;sequential learning", "primary_area": "", "supplementary_material": "", "author": "Jung H. Lee;Ryan S Butner;Elise Saxon;Nathan Oken Hodas", "authorids": "~Jung_H._Lee2;ryan.butner@pnnl.gov;elise.saxon@pnnl.gov;~Nathan_Oken_Hodas1", "gender": "M;;;M", "homepage": ";;;", "dblp": "155/2694;;;", "google_scholar": "Qa4tx8sAAAAJ;;;", "orcid": "0000-0001-8568-8163;;;", "linkedin": ";;;", "or_profile": "~Jung_H._Lee2;ryan.butner@pnnl.gov;elise.saxon@pnnl.gov;~Nathan_Oken_Hodas1", "aff": "Allen Institute for Brain Science;;;", "aff_domain": "alleninstitute.org;;;", "position": "Researcher;;;", "bibtex": "@misc{\nlee2022dlbased,\ntitle={{DL}-based prediction of optimal actions of human experts},\nauthor={Jung H. 
Lee and Ryan S Butner and Elise Saxon and Nathan Oken Hodas},\nyear={2022},\nurl={https://openreview.net/forum?id=32OdIHsu1_}\n}", "github": "", "project": "", "reviewers": "QBNS;6Hog;7AXP;iZWN", "site": "https://openreview.net/forum?id=32OdIHsu1_", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;4;3;4", "correctness": "3;1;1;1", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "65;58;38;129", "wc_summary_review": "10;28;31;66", "wc_main_review": "157;166;227;578", "wc_review": "232;252;296;773", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "106;128;221;469", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 1.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.5, 34.09178786746157 ], "wc_summary_review_avg": [ 33.75, 20.27775875189366 ], "wc_main_review_avg": [ 282.0, 173.00433520579767 ], "wc_review_avg": [ 388.25, 223.33872816867208 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 231.0, 144.02951086496128 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vMHGrFD38mAJ:scholar.google.com/&scioq=DL-based+prediction+of+optimal+actions+of+human+experts&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Allen Institute for Brain Science", "aff_unique_dep": "", "aff_unique_url": "https://www.alleninstitute.org", "aff_unique_abbr": "Allen Institute", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "33nhOe3cTd", "title": "Spending Thinking Time Wisely: Accelerating MCTS with Virtual Expansions", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the most important AI research questions is how to trade off computation versus performance, since \"perfect rationality\" exists in theory but is impossible to achieve in practice. Recently, Monte-Carlo tree search (MCTS) has attracted considerable attention due to significant performance improvements in a variety of challenging domains. However, the expensive time cost of search severely restricts its range of applications. This paper proposes Virtual MCTS (V-MCTS), a variant of MCTS that mimics the human behavior of spending adequate amounts of time thinking about different questions. Inspired by this, we propose a strategy that converges to the ground truth MCTS search results with much less computation. We give theoretical bounds for V-MCTS and evaluate the performance in $9 \\times 9$ Go board games and Atari games. Experiments show that our method can achieve performance similar to that of the original search algorithm while requiring less than $50\\%$ of the searches on average.\nWe believe that this approach is a viable alternative for tasks with limited time and resources. 
", "keywords": "Computer Go;Monte-Carlo Tree Search;Reinforcement learning;Adaptive;Acceleration", "primary_area": "", "supplementary_material": "", "author": "Weirui Ye;Pieter Abbeel;Yang Gao", "authorids": "~Weirui_Ye1;~Pieter_Abbeel2;~Yang_Gao1", "gender": "M;M;M", "homepage": "https://yewr.github.io/;https://people.eecs.berkeley.edu/~pabbeel/;http://yang-gao.weebly.com", "dblp": "245/3595;;89/4402-29", "google_scholar": "_GgST9AAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;yang-gao-45245348/", "or_profile": "~Weirui_Ye1;~Pieter_Abbeel2;~Yang_Gao1", "aff": "Tsinghua University;Covariant;Tsinghua University", "aff_domain": "tsinghua.edu.cn;covariant.ai;tsinghua.edu.cn", "position": "PhD student;Founder;Assistant Professor", "bibtex": "@misc{\nye2022spending,\ntitle={Spending Thinking Time Wisely: Accelerating {MCTS} with Virtual Expansions},\nauthor={Weirui Ye and Pieter Abbeel and Yang Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=33nhOe3cTd}\n}", "github": "", "project": "", "reviewers": "i2Lg;xM7t;d2cw", "site": "https://openreview.net/forum?id=33nhOe3cTd", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "5;4;4", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "101;64;220", "wc_summary_review": "52;66;59", "wc_main_review": "209;323;749", "wc_review": "362;453;1028", "wc_reply_reviewers": "0;0;131", "wc_reply_authors": "427;301;491", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 128.33333333333334, 66.55490632219052 ], "wc_summary_review_avg": [ 59.0, 5.715476066494082 ], "wc_main_review_avg": [ 427.0, 232.39621339428058 ], "wc_review_avg": [ 614.3333333333334, 294.85627383900487 ], "wc_reply_reviewers_avg": [ 43.666666666666664, 61.753992223625154 ], "wc_reply_authors_avg": [ 406.3333333333333, 78.93175674101155 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.8029550685469661, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14923053732579049360&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tsinghua University;Covariant", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;", "aff_unique_abbr": "THU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China;" }, { "id": "34k1OWJWtDW", "title": "Sample-specific and Context-aware Augmentation for Long Tail Image Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent long-tail classification methods generally adopt the two-stage pipeline and focus on learning the classifier to tackle the imbalanced data in the second stage via re-sampling or re-weighting, but the classifier is easily prone to overconfidence in head classes. Data augmentation is a natural way to tackle this issue. 
Existing augmentation methods either perform low-level transformations or apply the same semantic transformation for all samples. However, meaningful augmentations for different samples should be different. In this paper, we propose a novel sample-specific and context-aware augmentation learning method for long-tail image classification. We model the semantic within-class transformation range for each sample by a specific Gaussian distribution and design a semantic transformation generator (STG) to predict the distribution from the sample itself. To encode the context information accurately, STG is equipped with a memory-based structure. We train STG by constructing ground-truth distributions for samples of head classes in the feature space. We apply STG to samples of tail classes for augmentation in the classifier-tuning stage. Extensive experiments on four imbalanced datasets show the effectiveness of our method.", "keywords": "Long-tail image classification;Semantic augmentation", "primary_area": "", "supplementary_material": "", "author": "Jiahao Chen;Bing Su", "authorids": "~Jiahao_Chen4;~Bing_Su1", "gender": "M;M", "homepage": "https://jiahaochen1.github.io/;https://gsai.ruc.edu.cn/bingsu", "dblp": ";41/5270-1", "google_scholar": "https://scholar.google.com.hk/citations?user=Af4IREwAAAAJ;https://scholar.google.com.sg/citations?user=d3g2VJQAAAAJ", "orcid": ";0000-0001-8560-1910", "linkedin": ";", "or_profile": "~Jiahao_Chen4;~Bing_Su1", "aff": "Harbin Institute of Technology;Renmin University of China", "aff_domain": "hit.edu.cn;ruc.edu.cn", "position": "MS student;Associate Professor", "bibtex": "@misc{\nchen2022samplespecific,\ntitle={Sample-specific and Context-aware Augmentation for Long Tail Image Classification},\nauthor={Jiahao Chen and Bing Su},\nyear={2022},\nurl={https://openreview.net/forum?id=34k1OWJWtDW}\n}", "github": "", "project": "", "reviewers": "QNmc;BZZD;xaeX;Wd9x", "site": "https://openreview.net/forum?id=34k1OWJWtDW", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "3;2;1;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "58;142;105;84", "wc_summary_review": "53;40;38;57", "wc_main_review": "285;304;343;401", "wc_review": "396;486;486;542", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.25, 30.73576906472327 ], "wc_summary_review_avg": [ 47.0, 8.154753215150045 ], "wc_main_review_avg": [ 333.25, 44.35298749802543 ], "wc_review_avg": [ 477.5, 52.313956072925706 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3898261487234927247&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Harbin Institute of Technology;Renmin University of China", "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;http://www.ruc.edu.cn", "aff_unique_abbr": "HIT;RUC", "aff_campus_unique_index": "0", 
"aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "34mWBCWMxh9", "title": "Blur Is an Ensemble: Spatial Smoothings to Improve Accuracy, Uncertainty, and Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian neural networks (BNNs) have shown success in the areas of uncertainty estimation and robustness. However, a crucial challenge prohibits their use in practice. Bayesian NNs require a large number of predictions to produce reliable results, leading to a significant increase in computational cost. To alleviate this issue, we propose spatial smoothing, a method that ensembles neighboring feature map points of CNNs. By simply adding a few blur layers to the models, we empirically show that spatial smoothing improves accuracy, uncertainty estimation, and robustness of BNNs across a whole range of ensemble sizes. In particular, BNNs incorporating spatial smoothing achieve high predictive performance merely with a handful of ensembles. Moreover, this method also can be applied to canonical deterministic neural networks to improve the performances. A number of evidences suggest that the improvements can be attributed to the stabilized feature maps and the flattening of the loss landscape. In addition, we provide a fundamental explanation for prior works\u2014namely, global average pooling, pre-activation, and ReLU6\u2014by addressing them as special cases of spatial smoothing. These not only enhance accuracy, but also improve uncertainty estimation and robustness by making the loss landscape smoother in the same manner as spatial smoothing. \n", "keywords": "bayesian neural network;uncertainty;uncertainty estimation;uncertainty quantification", "primary_area": "", "supplementary_material": "/attachment/864eff245ae7bfde3137e1ee1787ba114b553631.zip", "author": "Namuk Park;Songkuk Kim", "authorids": "~Namuk_Park1;~Songkuk_Kim1", "gender": ";M", "homepage": "http://namukpark.com/;", "dblp": "244/9940;78/2018", "google_scholar": "c2vdTRAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Namuk_Park1;~Songkuk_Kim1", "aff": "Yonsei University;Yonsei University", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\npark2022blur,\ntitle={Blur Is an Ensemble: Spatial Smoothings to Improve Accuracy, Uncertainty, and Robustness},\nauthor={Namuk Park and Songkuk Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=34mWBCWMxh9}\n}", "github": "", "project": "", "reviewers": "6pZ5;RJkD;vPUa;3UhV", "site": "https://openreview.net/forum?id=34mWBCWMxh9", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;3;4;2", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "17;125;31;91", "wc_summary_review": "82;80;41;45", "wc_main_review": "96;440;68;93", "wc_review": "195;645;140;229", "wc_reply_reviewers": "82;429;0;0", "wc_reply_authors": "1093;1498;243;154", "reply_reviewers": "2;1;0;0", "reply_authors": "4;3;1;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 66.0, 43.965895873961216 ], "wc_summary_review_avg": [ 62.0, 19.06567596493762 ], "wc_main_review_avg": [ 174.25, 153.81543323086927 ], "wc_review_avg": [ 302.25, 200.41878030763485 ], 
"wc_reply_reviewers_avg": [ 127.75, 177.11913363609253 ], "wc_reply_authors_avg": [ 747.0, 567.7547886191714 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14900968091861004409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "35-QqyfmjfP", "title": "AnoSeg: Anomaly Segmentation Network Using Self-Supervised Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Anomaly segmentation, which localizes defective areas, is an important component in large-scale industrial manufacturing. However, most recent researches have focused on anomaly detection. This paper proposes a novel anomaly segmentation network (AnoSeg) that can directly generate an accurate anomaly map using self-supervised learning. For highly accurate anomaly segmentation, the proposed AnoSeg considers three novel techniques: Anomaly data generation based on hard augmentation, self-supervised learning with pixel-wise and adversarial losses, and coordinate channel concatenation. First, to generate synthetic anomaly images and reference masks for normal data, the proposed method uses hard augmentation to change the normal sample distribution. Then, the proposed AnoSeg is trained in a self-supervised learning manner from the synthetic anomaly data and normal data. Finally, the coordinate channel, which represents the pixel location information, is concatenated to an input of AnoSeg to consider the positional relationship of each pixel in the image. The estimated anomaly map can also be utilized to improve the performance of anomaly detection. Our experiments show that the proposed method outperforms the state-of-the-art anomaly detection and anomaly segmentation methods for the MVTec AD dataset. 
In addition, we compared the proposed method with the existing methods through the intersection over union (IoU) metric commonly used in segmentation tasks and demonstrated the superiority of our method for anomaly segmentation.", "keywords": "Anomaly detection;Anomaly segmentation;Self-Supervised learning", "primary_area": "", "supplementary_material": "/attachment/f62ec278ec8ed27fd0648e25d132b957a531a437.zip", "author": "Jouwon Song;Kyeongbo Kong;Ye-In Park;Seong-Gyun Kim;Suk-Ju Kang", "authorids": "~Jouwon_Song1;~Kyeongbo_Kong1;~Ye-In_Park1;~Seong-Gyun_Kim1;~Suk-Ju_Kang1", "gender": "M;M;F;M;M", "homepage": ";https://www.pnu-cvsp.com/;;http://www.lgdisplay.com;http://vds.sogang.ac.kr/", "dblp": "285/1055.html;218/1547;285/1287.html;212/9288.html;99/7096", "google_scholar": "H5vn5JIAAAAJ;O9QSF7UAAAAJ;k9Zt3UoAAAAJ;;", "orcid": ";0000-0002-1135-7502;;;", "linkedin": ";;;;", "or_profile": "~Jouwon_Song1;~Kyeongbo_Kong1;~Ye-In_Park1;~Seong-Gyun_Kim1;~Suk-Ju_Kang1", "aff": "Sogang University;Pukyoung National University;Sogang University;LG Display;Sogang University", "aff_domain": "sogang.ac.kr;pknu.ac.kr;sogang.ac.kr;lgdisplay.com;sogang.ac.kr", "position": "MS student;Assistant Professor;MS student;Principal Researcher;Full Professor", "bibtex": "@misc{\nsong2022anoseg,\ntitle={AnoSeg: Anomaly Segmentation Network Using Self-Supervised Learning},\nauthor={Jouwon Song and Kyeongbo Kong and Ye-In Park and Seong-Gyun Kim and Suk-Ju Kang},\nyear={2022},\nurl={https://openreview.net/forum?id=35-QqyfmjfP}\n}", "github": "", "project": "", "reviewers": "FgYo;CL9y;zWkb;42zs", "site": "https://openreview.net/forum?id=35-QqyfmjfP", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;3;5", "correctness": "1;3;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "1;2;4;4", "wc_summary_paper": "19;37;125;194", "wc_summary_review": "15;29;65;59", "wc_main_review": "119;44;318;301", "wc_review": "153;110;508;554", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 93.75, 70.417948706278 ], "wc_summary_review_avg": [ 42.0, 20.71231517720798 ], "wc_main_review_avg": [ 195.5, 117.19748290812393 ], "wc_review_avg": [ 331.25, 200.98678439141216 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.20751433915982243, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18265733496785355154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Sogang University;Pukyoung National University;LG", "aff_unique_dep": ";;LG Display", "aff_unique_url": "https://www.sogang.ac.kr;http://www.pukyong.ac.kr;https://www.lgdisplay.com", "aff_unique_abbr": "Sogang;PKNU;LG Display", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "35jJIcBiEyj", "title": "From Biased Data to Unbiased Models: a Meta-Learning Approach", "track": "main", "status": "Withdraw", "tldr": "", 
"abstract": "It is well known that large deep architectures are powerful models when adequately trained, but may exhibit undesirable behavior leading to confident incorrect predictions, even when evaluated on slightly different test examples. Test data characterized by distribution shifts (from training data distribution), outliers, and adversarial samples are among the types of data affected by this problem.\nThis situation worsens whenever data are biased, meaning that predictions are mostly based on spurious correlations present in the data. Unfortunately, since such correlations occur in the most of data, a model is prevented from correctly generalizing the considered classes.\nIn this work, we tackle this problem from a meta-learning perspective. \nConsidering the dataset as composed of unknown biased and unbiased samples, we first identify these two subsets by a pseudo-labeling algorithm, even if coarsely. \nSubsequently, we apply a bi-level optimization algorithm in which, in the inner loop, we look for the best parameters guiding the training of the two subsets, while in the outer loop, we train the final model taking benefit from augmented data generated using Mixup.\nProperly tuning the contributions of biased and unbiased data, followed by the regularization introduced by the mixed data has proved to be an effective training strategy to learn unbiased models, which show superior generalization capabilities. Experimental results on synthetically and realistically biased datasets surpass state-of-the-art performance, as compared to existing methods.", "keywords": "dataset bias;out of distribution;meta-learning;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Ruggero Ragonesi;Valentina Sanguineti;Jacopo Cavazza;Vittorio Murino", "authorids": "~Ruggero_Ragonesi1;~Valentina_Sanguineti1;~Jacopo_Cavazza1;~Vittorio_Murino1", "gender": ";F;M;M", "homepage": ";;;https://www.iit.it/people-details/-/people/vittorio-murino", "dblp": ";;179/2139;62/6790", "google_scholar": ";;;yV3_PTkAAAAJ", "orcid": ";;;0000-0002-8645-2328", "linkedin": ";valentina-sanguineti-97038b164/;;vittorio-murino-514b26/", "or_profile": "~Ruggero_Ragonesi1;~Valentina_Sanguineti1;~Jacopo_Cavazza1;~Vittorio_Murino1", "aff": ";Universit\u00e0 degli Studi di Genova, Istituto Italiano di Tecnologia;;University of Verona", "aff_domain": ";iit.it;;univr.it", "position": ";PhD student;;Professor", "bibtex": "@misc{\nragonesi2022from,\ntitle={From Biased Data to Unbiased Models: a Meta-Learning Approach},\nauthor={Ruggero Ragonesi and Valentina Sanguineti and Jacopo Cavazza and Vittorio Murino},\nyear={2022},\nurl={https://openreview.net/forum?id=35jJIcBiEyj}\n}", "github": "", "project": "", "reviewers": "LA4A;DkSZ;sFyz;yyNb", "site": "https://openreview.net/forum?id=35jJIcBiEyj", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;3", "correctness": "3;2;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "146;182;128;135", "wc_summary_review": "56;77;52;39", "wc_main_review": "160;378;685;143", "wc_review": "362;637;865;317", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 147.75, 20.78911975048487 ], 
"wc_summary_review_avg": [ 56.0, 13.656500283747663 ], "wc_main_review_avg": [ 341.5, 218.90009136590143 ], "wc_review_avg": [ 545.25, 221.54951478168488 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.3244428422615251, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XYhbUUDl_yYJ:scholar.google.com/&scioq=From+Biased+Data+to+Unbiased+Models:+a+Meta-Learning+Approach&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Universit\u00e0 degli Studi di Genova;University of Verona", "aff_unique_dep": ";", "aff_unique_url": "https://www.unige.it;https://www.univr.it", "aff_unique_abbr": "UniGe;UniVR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "id": "36SHWj0Gp1", "title": "GenTAL: Generative Denoising Skip-gram Transformer for Unsupervised Binary Code Similarity Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Binary code similarity detection serves a critical role in cybersecurity. It alleviates the huge manual effort required in the reverse engineering process for malware analysis and vulnerability detection, where often the original source code is not available for analysis. Most of the existing solutions focus on a manual feature engineering process and customized code matching algorithms that are inefficient and inaccurate. Recent deep-learning-based solutions embed the semantics of binary code into a latent space through supervised contrastive learning. However, one cannot cover all its possible forms in the training set to learn the variance of the same semantic. In this paper, we propose an unsupervised model aiming to learn the intrinsic representation of assembly code semantics. Specifically, we propose a Transformer-based auto-encoder-like language model for the low-level assembly code grammar to capture the abstract semantic representation. By coupling a Transformer encoder and a skip-gram-style loss design, it can learn a compact representation that is robust against different compilation options. We conduct experiments on four different block-level code similarity tasks. 
It shows that our method is more robust compared to the state-of-the-art.", "keywords": "Representation Learning;Transformer;Autoencoder;Binary Code Similarity Detection", "primary_area": "", "supplementary_material": "", "author": "Litao Li;Steven Ding;Philippe Charland;Hanbo Yu;Christopher James Molloy", "authorids": "~Litao_Li1;~Steven_Ding1;~Philippe_Charland1;~Hanbo_Yu1;~Christopher_James_Molloy1", "gender": "M;;;M;M", "homepage": "https://github.com/lxdragoon;https://www.l1nna.com;;;https://github.com/ChrisJMolloy", "dblp": ";;30/1570;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;hanboyu/;", "or_profile": "~Litao_Li1;~Steven_Ding1;~Philippe_Charland1;~Hanbo_Yu1;~Christopher_James_Molloy1", "aff": "Queens University;Queen's University;Defence R&D Canada;Queen's University;Queens University", "aff_domain": "queensu.ca;queensu.ca;drdc-rddc.gc.ca;queensu.ca;queensu.ca", "position": "PhD student;Assistant Professor;Defence Scientist;Undergrad student;MS student", "bibtex": "@inproceedings{\nli2022gental,\ntitle={Gen{TAL}: Generative Denoising Skip-gram Transformer for Unsupervised Binary Code Similarity Detection},\nauthor={Litao Li and Steven Ding and Philippe Charland and Hanbo Yu and Christopher James Molloy},\nbooktitle={Submitted to The Tenth International Conference on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=36SHWj0Gp1},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "19LT;Kjjd;G9WQ", "site": "https://openreview.net/forum?id=36SHWj0Gp1", "pdf_size": 0, "recommendation": "1;3;6", "confidence": "5;5;2", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "37;38;80", "wc_summary_review": "39;11;32", "wc_main_review": "136;355;261", "wc_review": "212;404;373", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 51.666666666666664, 20.038851153585515 ], "wc_summary_review_avg": [ 27.333333333333332, 11.897712198383164 ], "wc_main_review_avg": [ 250.66666666666666, 89.70445300479174 ], "wc_review_avg": [ 329.6666666666667, 84.15990071815014 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.917662935482247, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1496542297062300411&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Queens University;Queen's University;Defence Research and Development Canada", "aff_unique_dep": ";;", "aff_unique_url": "https://www.queensu.ca;https://www.queensu.ca;https://www.drdc-rddc.gc.ca", "aff_unique_abbr": "Queen's U;Queen's;DRDC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "id": "36rU1ecTFvR", "title": "Can standard training with clean images outperform adversarial one in robust accuracy?", "track": "main", "status": "Reject", "tldr": "", "abstract": "The deep learning network has 
achieved great success in almost every field. Unfortunately, it is very vulnerable to adversarial attacks. A lot of researchers have devoted themselves to making networks robust. The most effective approach is adversarial training, where malicious examples are generated and fed to train the network. However, this incurs a heavy computational load. In this work, we ask: \u201cCan standard training with clean images outperform adversarial one in robust accuracy?\u201d Surprisingly, the answer is YES. This success stems from two innovations. The first is a novel loss function that combines the traditional cross-entropy with the feature smoothing loss that encourages the features in an intermediate layer to be uniform. The collaboration between these terms sets up the grounds for our second innovation, namely Active Defense. When a clean or adversarial image is fed into the network, the defender first adds some random noise, then induces this sample toward a new, smoother one by promoting feature smoothing. At that point, it can be classified correctly with high probability. Thus the perturbations carefully generated by the attacker can be diminished. While there is an inevitable drop in clean accuracy, it is still comparable with other methods. The great benefit is that the robust accuracy outperforms that of most existing methods and is quite resilient to increases in the perturbation budget. Moreover, adaptive attackers also fail to generate effective adversarial samples as the induced perturbations outweigh the initial ones imposed by an adversary.", "keywords": "Adversarial Training;Robust Accuracy", "primary_area": "", "supplementary_material": "", "author": "Jing Wang;Jiahao Hu;Guanrong Li", "authorids": "~Jing_Wang4;hujiahao@mail.nankai.edu.cn;liguanrong@mail.nankai.edu.cn", "gender": ";;", "homepage": "https://cs.nankai.edu.cn/;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jing_Wang4;hujiahao@mail.nankai.edu.cn;liguanrong@mail.nankai.edu.cn", "aff": "Nankai University;;", "aff_domain": "nankai.edu.cn;;", "position": "Associate Professor;;", "bibtex": "@misc{\nwang2022can,\ntitle={Can standard training with clean images outperform adversarial one in robust accuracy?},\nauthor={Jing Wang and Jiahao Hu and Guanrong Li},\nyear={2022},\nurl={https://openreview.net/forum?id=36rU1ecTFvR}\n}", "github": "", "project": "", "reviewers": "tMiw;R7ZE;qx2w;9ZHX", "site": "https://openreview.net/forum?id=36rU1ecTFvR", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;4;4", "correctness": "2;2;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;4;2", "wc_summary_paper": "24;45;73;108", "wc_summary_review": "32;16;60;82", "wc_main_review": "182;135;362;332", "wc_review": "238;196;495;522", "wc_reply_reviewers": "177;54;46;37", "wc_reply_authors": "191;176;281;373", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 62.5, 31.5 ], "wc_summary_review_avg": [ 47.5, 25.391927851189244 ], "wc_main_review_avg": [ 252.75, 96.28960224240205 ], "wc_review_avg": [ 362.75, 146.815147379281 ], "wc_reply_reviewers_avg": [ 78.5, 57.18609970963224 ], "wc_reply_authors_avg": [ 255.25, 78.95687113861592 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 20, 0 ], 
"authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": 0.7543365091413573, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ux1TzuFFlKsJ:scholar.google.com/&scioq=Can+standard+training+with+clean+images+outperform+adversarial+one+in+robust+accuracy%3F&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Nankai University", "aff_unique_dep": "", "aff_unique_url": "http://www.nankai.edu.cn", "aff_unique_abbr": "NKU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "39Q__qgCpAH", "title": "Achieving Small-Batch Accuracy with Large-Batch Scalability via Adaptive Learning Rate Adjustment", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We consider synchronous data-parallel neural network training with fixed large batch sizes. While the large batch size provides a high degree of parallelism, it likely degrades the generalization performance due to the low gradient noise scale. We propose a two-phase adaptive learning rate adjustment framework that tackles the poor generalization issue in large-batch training. Our empirical study shows that the number of training epochs before decaying the learning rate strongly affects the final accuracy. The framework performs extra epochs using the large learning rate even after the loss is flattened. After sufficient training under the noisy condition, the framework decays the learning rate based on the observed loss landscape at run-time. Our experimental results demonstrate that the proposed heuristics and algorithm enable to use an extremely large batch size while maintaining the model accuracy. For CIFAR-10 classification with ResNet20, our method achieves $92.66\\%$ accuracy using $8,192$ batch size, which is close to $92.83\\%$ achieved using $128$ batch size, at a negligible extra computational cost.", "keywords": "deep learning;large-batch training", "primary_area": "", "supplementary_material": "/attachment/bdecba2c0f8e79e03a70980434e05fcd26bf32d6.zip", "author": "Sunwoo Lee;Salman Avestimehr", "authorids": "~Sunwoo_Lee1;~Salman_Avestimehr1", "gender": "M;", "homepage": "https://sites.google.com/view/sunwoolee;", "dblp": "56/7811-1;", "google_scholar": "WA9KNNcAAAAJ;", "orcid": "0000-0001-6334-3068;", "linkedin": "sunwoo-lee-90a7308a;", "or_profile": "~Sunwoo_Lee1;~Salman_Avestimehr1", "aff": "University of Southern California;", "aff_domain": "usc.edu;", "position": "Postdoc;", "bibtex": "@misc{\nlee2022achieving,\ntitle={Achieving Small-Batch Accuracy with Large-Batch Scalability via Adaptive Learning Rate Adjustment},\nauthor={Sunwoo Lee and Salman Avestimehr},\nyear={2022},\nurl={https://openreview.net/forum?id=39Q__qgCpAH}\n}", "github": "", "project": "", "reviewers": "8Lwn;Yb65;DeC9;xQdC", "site": "https://openreview.net/forum?id=39Q__qgCpAH", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;5", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "61;30;49;90", "wc_summary_review": "30;52;48;34", "wc_main_review": "225;518;539;397", "wc_review": "316;600;636;521", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], 
"wc_summary_paper_avg": [ 57.5, 21.777281740382566 ], "wc_summary_review_avg": [ 41.0, 9.219544457292887 ], "wc_main_review_avg": [ 419.75, 124.81861840286489 ], "wc_review_avg": [ 518.25, 123.95639354224534 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3599783379490469553&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "3AkuJOgL_X", "title": "Federated Robustness Propagation: Sharing Adversarial Robustness in Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) emerges as a popular distributed learning schema that learns a model from a set of participating users without requiring raw data to be shared. One major challenge of FL comes from heterogeneity in users, which may have distributionally different (or \\emph{non-iid}) data and varying computation resources. Just like in centralized learning, FL users also desire model robustness against malicious attackers at test time. Whereas adversarial training (AT) provides a sound solution for centralized learning, extending its usage for FL users has imposed significant challenges, as many users may have very limited training data as well as tight computational budgets, to afford the data-hungry and costly AT. In this paper, we study a novel learning setting that propagates adversarial robustness from high-resource users that can afford AT, to those low-resource users that cannot afford it, during the FL process. We show that existing FL techniques cannot effectively propagate adversarial robustness among \\emph{non-iid} users, and propose a simple yet effective propagation approach that transfers robustness through carefully designed batch-normalization statistics. We demonstrate the rationality and effectiveness of our method through extensive experiments. 
Especially, the proposed method is shown to grant FL remarkable robustness even when only a small portion of users afford AT during learning.", "keywords": "federated learning;data heterogeneity;hardware heterogeneity;security heterogeneity;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Junyuan Hong;Haotao Wang;Zhangyang Wang;Jiayu Zhou", "authorids": "~Junyuan_Hong1;~Haotao_Wang1;~Zhangyang_Wang1;~Jiayu_Zhou1", "gender": "M;;M;M", "homepage": "https://jyhong.gitlab.io/;;https://vita-group.github.io;http://jiayuzhou.github.io/", "dblp": "185/1316;236/5090;119/4026;73/1353", "google_scholar": "7Cbv6doAAAAJ;aMIJhlEAAAAJ;pxFyKAIAAAAJ;https://scholar.google.com.tw/citations?user=yQKlLTQAAAAJ", "orcid": "0000-0002-5718-5187;;;0000-0003-4336-6777", "linkedin": ";;;jiayuzhou/", "or_profile": "~Junyuan_Hong1;~Haotao_Wang1;~Zhangyang_Wang1;~Jiayu_Zhou1", "aff": "Sony AI;University of Texas, Austin;University of Texas, Austin;Michigan State University", "aff_domain": "sony.com;utexas.edu;utexas.edu;msu.edu", "position": "Intern;PhD student;Assistant Professor;Associate Professor", "bibtex": "@misc{\nhong2022federated,\ntitle={Federated Robustness Propagation: Sharing Adversarial Robustness in Federated Learning},\nauthor={Junyuan Hong and Haotao Wang and Zhangyang Wang and Jiayu Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=3AkuJOgL_X}\n}", "github": "", "project": "", "reviewers": "udUC;p9LS;B9Be;bKE4;dgjR", "site": "https://openreview.net/forum?id=3AkuJOgL_X", "pdf_size": 0, "recommendation": "3;3;6;8;8", "confidence": "4;4;3;4;4", "correctness": "2;4;3;4;3", "technical_novelty": "3;2;3;3;3", "empirical_novelty": "3;2;2;3;3", "wc_summary_paper": "90;77;54;64;116", "wc_summary_review": "23;6;76;45;43", "wc_main_review": "430;528;686;382;426", "wc_review": "543;611;816;491;585", "wc_reply_reviewers": "0;0;89;0;15", "wc_reply_authors": "837;1198;1541;479;809", "reply_reviewers": "0;0;1;0;1", "reply_authors": "2;3;4;1;1", "recommendation_avg": [ 5.6, 2.244994432064365 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 80.2, 21.618510586994656 ], "wc_summary_review_avg": [ 38.6, 23.516802503741875 ], "wc_main_review_avg": [ 490.4, 108.82940779035785 ], "wc_review_avg": [ 609.2, 111.09347415577568 ], "wc_reply_reviewers_avg": [ 20.8, 34.59132839311032 ], "wc_reply_authors_avg": [ 972.8, 364.0540619193803 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 2.2, 1.16619037896906 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.08908708063747481, "corr_recommendation_correctness": 0.28571428571428575, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Sony;University of Texas at Austin;Michigan State University", "aff_unique_dep": "Sony AI;;", "aff_unique_url": "https://www.sony.com;https://www.utexas.edu;https://www.msu.edu", "aff_unique_abbr": "Sony AI;UT Austin;MSU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Japan;United States" }, { "id": "3ByLvyOSyan", "title": "A Robust Initialization of Residual Blocks for Effective ResNet Training without Batch Normalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Batch Normalization is an 
essential component of all state-of-the-art neural network architectures. However, it introduces many practical issues, so in recent years a lot of research has been devoted to designing normalization-free architectures. In this paper, we show that weight initialization is key to training ResNet-like normalization-free networks. In particular, we propose a slight modification of the summation operation of a block output with the skip-connection branch so that the whole network is correctly initialized. We show that this modified architecture achieves competitive results on CIFAR-10 without additional regularization or algorithmic modifications.", "keywords": "Normalization-Free ResNets;Weights Initialization;Exploding Gradient;Residual Blocks", "primary_area": "", "supplementary_material": "/attachment/7e6b38779a1004ac6a00893fb10dc641a25beed1.zip", "author": "Enrico Civitelli;Alessio Sortino;Matteo Lapucci;Francesco Bagattini;Giulio Galvan", "authorids": "~Enrico_Civitelli1;~Alessio_Sortino1;~Matteo_Lapucci1;~Francesco_Bagattini1;~Giulio_Galvan1", "gender": "M;M;M;;M", "homepage": ";;https://webgol.dinfo.unifi.it/matteo-lapucci/;https://www.flair-tech.com;https://www.flair-tech.com/en/people/", "dblp": ";;;;", "google_scholar": "1dFdU5MAAAAJ;;eOflrC8AAAAJ;;https://scholar.google.it/citations?hl=it", "orcid": "0000-0001-5322-4831;0000-0002-3971-3441;0000-0002-2488-5486;;0000-0002-0384-0334", "linkedin": "enrico-civitelli-898a0314b/;;https://it.linkedin.com/in/matteo-lapucci-6384721b4;;giulio-galvan", "or_profile": "~Enrico_Civitelli1;~Alessio_Sortino1;~Matteo_Lapucci1;~Francesco_Bagattini1;~Giulio_Galvan1", "aff": "University of Florence;;Universit\u00e0 degli Studi di Firenze;;", "aff_domain": "unifi.it;;unifi.it;;", "position": "PhD student;;PhD student;;", "bibtex": "@misc{\ncivitelli2022a,\ntitle={A Robust Initialization of Residual Blocks for Effective ResNet Training without Batch Normalization},\nauthor={Enrico Civitelli and Alessio Sortino and Matteo Lapucci and Francesco Bagattini and Giulio Galvan},\nyear={2022},\nurl={https://openreview.net/forum?id=3ByLvyOSyan}\n}", "github": "", "project": "", "reviewers": "od4T;AbSm;tBHM;uxfs", "site": "https://openreview.net/forum?id=3ByLvyOSyan", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;3;4;5", "correctness": "3;3;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "139;105;66;101", "wc_summary_review": "125;44;41;113", "wc_main_review": "408;148;322;857", "wc_review": "672;297;429;1071", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "183;153;205;139", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.75, 25.849323008543184 ], "wc_summary_review_avg": [ 80.75, 38.499188303131795 ], "wc_main_review_avg": [ 433.75, 261.6986578108493 ], "wc_review_avg": [ 617.25, 294.4846133501715 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 170.0, 25.709920264364882 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -1.0, "gs_citation": 3, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=8001007387312283901&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "University of Florence", "aff_unique_dep": "", "aff_unique_url": "https://www.unifi.it", "aff_unique_abbr": "UNIFI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "id": "3CRkJ9GRs3I", "title": "Understanding ResNet from a Discrete Dynamical System Perspective", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Residual network (ResNet) is one of popular networks proposed in recently years. Discussion about its theoretical properties is helpful for the understanding of networks with convolution modules. In this paper, we formulate the learning process of ResNet as a iterative system, then we may apply tools in discrete dynamical systems to explain its stability and accuracy. Due to the backward propagation of learning process, the module operations vary with the change of different layers. So we introduce the condition number of modules to describe the perturbation of output data, which can demonstrate the robustness of ResNet. In addition, the inter-class and intra-class median principal angle is defined to analyze the classification efficiency of ResNet. Mathematical description of the learning process of ResNet is given in a modular manner so that our research framework can be applied to other networks. In order to verify the feasibility of our idea, several experiments are carried out on the Dogs vs. Cats dataset, Kaggle: Animals 10 dataset, and ImageNet 2012 dataset. Simulation results are accordance with the theoretical analysis and prove the validity of our theory.", "keywords": "the condition number of modules;median principal angle;network mathematical description", "primary_area": "", "supplementary_material": "", "author": "Lijuan Zhang", "authorids": "~Lijuan_Zhang2", "gender": "", "homepage": "http://lwss.ncut.edu.cn/TutorServlet?action=queryDs&teacherid=gajhl8GQFuvQhFEXHHDKHQ==", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Lijuan_Zhang2", "aff": "North China University of Technology", "aff_domain": "ncut.edu.cn", "position": "MS student", "bibtex": "@misc{\nzhang2022understanding,\ntitle={Understanding ResNet from a Discrete Dynamical System Perspective},\nauthor={Lijuan Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=3CRkJ9GRs3I}\n}", "github": "", "project": "", "reviewers": "63ry;5JK9;GQyt", "site": "https://openreview.net/forum?id=3CRkJ9GRs3I", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "3;3;5", "correctness": "1;2;1", "technical_novelty": "1;1;1", "empirical_novelty": "1;1;2", "wc_summary_paper": "38;58;56", "wc_summary_review": "29;18;15", "wc_main_review": "167;201;171", "wc_review": "234;277;242", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 1.3333333333333333, 0.4714045207910317 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 50.666666666666664, 8.993825042154693 ], "wc_summary_review_avg": [ 20.666666666666668, 6.018490028422596 ], "wc_main_review_avg": [ 179.66666666666666, 15.173075568988057 ], "wc_review_avg": [ 251.0, 18.672618098881223 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.5000000000000001, "corr_recommendation_correctness": 0.5000000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4oTD0I6OywcJ:scholar.google.com/&scioq=Understanding+ResNet+from+a+Discrete+Dynamical+System+Perspective&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "North China University of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.ncut.edu.cn", "aff_unique_abbr": "NCUT", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "3FvF1db-bKT", "title": "Local Augmentation for Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation has been widely used in image data and linguistic data but remains under-explored on graph-structured data. Existing methods focus on augmenting the graph data from a global perspective and largely fall into two genres: structural manipulation and adversarial training with feature noise injection. However, the structural manipulation approach suffers information loss issues while the adversarial training approach may downgrade the feature quality by injecting noise. In this work, we introduce the local augmentation, which enhances node features by its local subgraph structures. Specifically, we model the data argumentation as a feature generation process. Given the central node's feature, our local augmentation approach learns the conditional distribution of its neighbors' features and generates the neighbors' optimal feature to boost the performance of downstream tasks. Based on the local augmentation, we further design a novel framework: LA-GNN, which can apply to any GNN models in a plug-and-play manner. 
Extensive experiments and analyses show that local augmentation consistently yields performance improvement for various GNN architectures across a diverse set of benchmarks.", "keywords": "Local Augmentation;Graph Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Songtao Liu;Hanze Dong;Lanqing Li;Tingyang Xu;Yu Rong;Peilin Zhao;Junzhou Huang;Dinghao Wu", "authorids": "~Songtao_Liu2;~Hanze_Dong1;~Lanqing_Li1;~Tingyang_Xu1;~Yu_Rong1;~Peilin_Zhao2;~Junzhou_Huang2;~Dinghao_Wu1", "gender": "M;M;M;M;M;;M;", "homepage": "https://songtaoliu0823.github.io/;https://hendrydong.github.io/;https://lanqingli1993.github.io/;;https://royrong.me/;;http://ranger.uta.edu/~huang/;", "dblp": ";228/7798;275/9979;157/0940;24/10036-1;84/8411;22/1170.html;", "google_scholar": "https://scholar.google.com.tw/citations?hl=zh-CN;g9WLzWoAAAAJ;n8IjgKkAAAAJ;6gIs5YMAAAAJ;https://scholar.google.com.hk/citations?user=itezhEMAAAAJ;https://scholar.google.com.hk/citations?user=HPeX_YcAAAAJ;https://scholar.google.com.tw/citations?user=X7KrguAAAAAJ;", "orcid": ";;0000-0003-1998-4022;0009-0002-0106-8376;0000-0001-7387-302X;0000-0001-8543-3953;0000-0002-9548-1227;", "linkedin": ";hanze-dong/;lanqing-li-%EF%BC%88%E6%9D%8E%E8%93%9D%E9%9D%92%EF%BC%89-49209a83/;;;;;", "or_profile": "~Songtao_Liu2;~Hanze_Dong1;~Lanqing_Li1;~Tingyang_Xu1;~Yu_Rong1;~Peilin_Zhao2;~Junzhou_Huang2;~Dinghao_Wu1", "aff": "Pennsylvania State University;Hong Kong University of Science and Technology;Tencent AI Lab;Tencent AI Lab;Tencent AI Lab;Tencent;University of Texas, Arlington;", "aff_domain": "psu.edu;ust.hk;tencent.com;tencent.com;tencent.com;tencent.com;uta.edu;", "position": "PhD student;PhD student;Research Scientist;Researcher;Senior Researcher;Researcher;Full Professor;", "bibtex": "@misc{\nliu2022local,\ntitle={Local Augmentation for Graph Neural Networks},\nauthor={Songtao Liu and Hanze Dong and Lanqing Li and Tingyang Xu and Yu Rong and Peilin Zhao and Junzhou Huang and Dinghao Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=3FvF1db-bKT}\n}", "github": "", "project": "", "reviewers": "E5PW;cVVh;Dob2;XsGb", "site": "https://openreview.net/forum?id=3FvF1db-bKT", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;3;2", "correctness": "2;3;3;2", "technical_novelty": "3;4;2;2", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "70;46;48;80", "wc_summary_review": "59;49;36;27", "wc_main_review": "371;348;211;152", "wc_review": "500;443;295;259", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1638;1155;1269;1713", "reply_reviewers": "0;0;0;0", "reply_authors": "3;4;3;3", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 61.0, 14.45683229480096 ], "wc_summary_review_avg": [ 42.75, 12.214233500306108 ], "wc_main_review_avg": [ 270.5, 91.77281732626497 ], "wc_review_avg": [ 374.25, 100.12835512480967 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1443.75, 236.71858292073313 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.25, 0.4330127018922193 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1477899180662383839&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, 
"aff_unique_index": "0;1;2;2;2;2;3", "aff_unique_norm": "Pennsylvania State University;Hong Kong University of Science and Technology;Tencent;University of Texas at Arlington", "aff_unique_dep": ";;Tencent AI Lab;", "aff_unique_url": "https://www.psu.edu;https://www.ust.hk;https://ai.tencent.com;https://www.uta.edu", "aff_unique_abbr": "PSU;HKUST;Tencent AI Lab;UTA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Arlington", "aff_country_unique_index": "0;1;1;1;1;1;0", "aff_country_unique": "United States;China" }, { "id": "3GHHpYrYils", "title": "On Anytime Learning at Macroscale", "track": "main", "status": "Reject", "tldr": "", "abstract": " Classical machine learning frameworks assume access to a possibly large dataset in order to train a predictive model. In many practical applications however, data does not arrive all at once, but in batches over time. This creates a natural trade-off between accuracy of a model and time to obtain such a model. A greedy predictor could produce non-trivial predictions by immediately training on batches as soon as these become available but, it may also make sub-optimal use of future data. On the other hand, a tardy predictor could wait for a long time to aggregate several batches into a larger dataset, but ultimately deliver a much better performance. In this work, we consider such a streaming learning setting, which we dub {\\em anytime learning at macroscale} (ALMA). It is an instance of anytime learning applied not at the level of a single chunk of data, but at the level of the entire sequence of large batches. We first formalize this learning setting, we then introduce metrics to assess how well learners perform on the given task for a given memory and compute budget, and finally we test about thirty baseline approaches on three standard benchmarks repurposed for anytime learning at macroscale. Our findings indicate that no model strikes the best trade-off across the board. While replay-based methods attain the lowest error rate, they also incur in a 5 to 10 times increase of compute. Approaches that grow capacity over time do offer better scaling in terms of training flops, but they also underperform simpler ensembling methods in terms of error rate. 
Overall, ALMA offers both a good abstraction of the typical learning setting faced every day by practitioners, and a set of unsolved modeling problems for those interested in efficient learning of dynamic models.", "keywords": "Anytime Learning;Mixture of experts;growing architectures", "primary_area": "", "supplementary_material": "/attachment/6b42e0bce4a796e826357d1b4ddbfcb3fe04121a.zip", "author": "Lucas Caccia;Jing Xu;Myle Ott;MarcAurelio Ranzato;Ludovic Denoyer", "authorids": "~Lucas_Caccia1;~Jing_Xu5;~Myle_Ott1;~MarcAurelio_Ranzato1;~Ludovic_Denoyer1", "gender": "M;F;;M;M", "homepage": "https://www.cs.mcgill.ca/~lpagec/;;http://myleott.com;https://ranzato.github.io/;", "dblp": ";;92/9767;28/1732;54/5551", "google_scholar": "fuvIITUAAAAJ;https://scholar.google.com/citations?hl=en;;NbXF7T8AAAAJ;9PLqulwAAAAJ", "orcid": ";0000-0001-8289-1852;;;", "linkedin": ";jing-xu-818022a1;;;", "or_profile": "~Lucas_Caccia1;~Jing_Xu5;~Myle_Ott1;~MarcAurelio_Ranzato1;~Ludovic_Denoyer1", "aff": "McGill University;FAIR;Meta Facebook;Google DeepMind;Meta Facebook", "aff_domain": "mcgill.ca;meta.com;fb.com;deepmind.com;fb.com", "position": "PhD student;Researcher;Research Engineer;Researcher;Research Scientist", "bibtex": "@misc{\ncaccia2022on,\ntitle={On Anytime Learning at Macroscale},\nauthor={Lucas Caccia and Jing Xu and Myle Ott and MarcAurelio Ranzato and Ludovic Denoyer},\nyear={2022},\nurl={https://openreview.net/forum?id=3GHHpYrYils}\n}", "github": "", "project": "", "reviewers": "D3ZG;x4p7;4ezA;KMi2", "site": "https://openreview.net/forum?id=3GHHpYrYils", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;3", "correctness": "1;3;3;4", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "61;83;85;36", "wc_summary_review": "15;35;47;11", "wc_main_review": "119;295;276;231", "wc_review": "195;413;408;278", "wc_reply_reviewers": "133;0;148;161", "wc_reply_authors": "665;646;592;957", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.25, 19.84155991851447 ], "wc_summary_review_avg": [ 27.0, 14.696938456699069 ], "wc_main_review_avg": [ 230.25, 68.30583796426188 ], "wc_review_avg": [ 323.5, 91.83272837066315 ], "wc_reply_reviewers_avg": [ 110.5, 64.5619857191521 ], "wc_reply_authors_avg": [ 715.0, 142.26208208795484 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 1.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3969866147643018629&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "McGill University;Meta;Google", "aff_unique_dep": ";Facebook AI Research;Google DeepMind", "aff_unique_url": "https://www.mcgill.ca;https://research.facebook.com;https://deepmind.com", "aff_unique_abbr": "McGill;FAIR;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2;1", "aff_country_unique": "Canada;United States;United Kingdom" }, { "title": "Toward Efficient Low-Precision Training: Data Format Optimization and Hysteresis Quantization", "status": 
"Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6023", "id": "3HJOA-1hb0e", "poster": "", "openreview": "https://openreview.net/forum?id=3HJOA-1hb0e", "slides": "https://iclr.cc/virtual/2022/poster/6023", "video": "https://iclr.cc/virtual/2022/poster/6023", "author_site": "SunWoo Lee, Jeongwoo Park, Dongsuk Jeon", "tldr": "", "abstract": "As the complexity and size of deep neural networks continue to increase, low-precision training has been extensively studied in the last few years to reduce hardware overhead. Training performance is largely affected by the numeric formats representing different values in low-precision training, but finding an optimal format typically requires numerous training runs, which is a very time-consuming process. In this paper, we propose a method to efficiently find an optimal format for activations and errors without actual training. We employ this method to determine an 8-bit format suitable for training various models. In addition, we propose hysteresis quantization to suppress undesired fluctuation in quantized weights during training. This scheme enables deeply quantized training using 4-bit weights, exhibiting only 0.2% degradation for ResNet-18 trained on ImageNet.", "keywords": "low-precision training;quantized training;logarithmic weight;data format optimization;hysteresis quantization", "primary_area": "", "supplementary_material": "/attachment/483c369adb5fb87c20db91b838816460ccc91150.zip", "author": "Sunwoo Lee;Jeongwoo Park;Dongsuk Jeon", "authorids": "~Sunwoo_Lee2;~Jeongwoo_Park1;~Dongsuk_Jeon1", "gender": "M;M;M", "homepage": ";;http://mms.snu.ac.kr", "dblp": ";;28/9878", "google_scholar": ";hD7SL-cAAAAJ;_7GzTD4AAAAJ", "orcid": "0000-0001-7760-0168;;", "linkedin": ";;", "or_profile": "~Sunwoo_Lee2;~Jeongwoo_Park1;~Dongsuk_Jeon1", "aff": "Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nlee2022toward,\ntitle={Toward Efficient Low-Precision Training: Data Format Optimization and Hysteresis Quantization},\nauthor={Sunwoo Lee and Jeongwoo Park and Dongsuk Jeon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3HJOA-1hb0e}\n}", "github": "", "project": "", "reviewers": "mUcf;i9MY;mPme;AC1G", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;5;4;5", "correctness": "3;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "76;86;77;50", "wc_summary_review": "61;76;26;36", "wc_main_review": "349;336;286;314", "wc_review": "486;498;389;400", "wc_reply_reviewers": "235;894;26;0", "wc_reply_authors": "3150;5866;710;516", "reply_reviewers": "1;3;1;0", "reply_authors": "5;12;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.25, 13.423393758658799 ], "wc_summary_review_avg": [ 49.75, 19.803724397193573 ], "wc_main_review_avg": [ 321.25, 23.889066536807167 ], "wc_review_avg": [ 443.25, 49.088567915554435 ], "wc_reply_reviewers_avg": [ 288.75, 361.12004582963823 ], "wc_reply_authors_avg": [ 2560.5, 2172.450862505295 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 4.75, 4.493050188902857 ], "replies_avg": [ 
29, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6956488233570535159&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=3HJOA-1hb0e", "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Learning Continuous Environment Fields via Implicit Functions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6783", "id": "3ILxkQ7yElm", "poster": "", "openreview": "https://openreview.net/forum?id=3ILxkQ7yElm", "slides": "https://iclr.cc/virtual/2022/poster/6783", "video": "https://iclr.cc/virtual/2022/poster/6783", "author_site": "Xueting Li, Sifei Liu, Shalini De Mello, Xiaolong Wang, Ming-Hsuan Yang, Jan Kautz", "tldr": "", "abstract": " We propose a novel scene representation that encodes reaching distance -- the distance between any position in the scene to a goal along a feasible trajectory. We demonstrate that this environment field representation can directly guide the dynamic behaviors of agents in 2D mazes or 3D indoor scenes. Our environment field is a continuous representation and learned via a neural implicit function using discretely sampled training data. We showcase its application for agent navigation in 2D mazes, and human trajectory prediction in 3D indoor environments. To produce physically plausible and natural trajectories for humans, we additionally learn a generative model that predicts regions where humans commonly appear, and enforce the environment field to be defined within such regions. 
Extensive experiments demonstrate that the proposed method can generate both feasible and plausible trajectories efficiently and accurately.", "keywords": "Continuous Scene Representation;Implicit Neural Networks", "primary_area": "", "supplementary_material": "/attachment/0417ca1a8105d1321ff51501f619f30abbe9550b.zip", "author": "Xueting Li;Shalini De Mello;Xiaolong Wang;Ming-Hsuan Yang;Jan Kautz;Sifei Liu", "authorids": "~Xueting_Li1;~Shalini_De_Mello1;~Xiaolong_Wang3;~Ming-Hsuan_Yang1;~Jan_Kautz1;~Sifei_Liu2", "gender": "F;Not Specified;M;M;;F", "homepage": "https://sunshineatnoon.github.io/;https://research.nvidia.com/person/shalini-de-mello;https://xiaolonw.github.io/;https://faculty.ucmerced.edu/mhyang/;http://jankautz.com;https://www.sifeiliu.net", "dblp": ";206/7364;91/952-4;79/3711.html;48/6214;118/1301", "google_scholar": "nfXdXswAAAAJ;xQM4BlMAAAAJ;Y8O9N_0AAAAJ;p9-ohHsAAAAJ;P9FclNEAAAAJ;j4pcHV4AAAAJ", "orcid": ";;;0000-0003-4848-2304;;", "linkedin": ";shalini-de-mello-02b8251/;;minghsuanyang/;;", "or_profile": "~Xueting_Li1;~Shalini_De_Mello1;~Xiaolong_Wang3;~Ming-Hsuan_Yang1;~Jan_Kautz1;~Sifei_Liu2", "aff": "University of California, Merced;NVIDIA;University of California, San Diego;University of California at Merced;NVIDIA;NVIDIA", "aff_domain": "ucmerced.edu;nvidia.com;ucsd.edu;umcerced.edu;nvidia.com;nvidia.com", "position": "PhD student;Principal Researcher;Assistant Professor;Professor;VP Research;Researcher", "bibtex": "@inproceedings{\nli2022learning,\ntitle={Learning Continuous Environment Fields via Implicit Functions},\nauthor={Xueting Li and Sifei Liu and Shalini De Mello and Xiaolong Wang and Ming-Hsuan Yang and Jan Kautz},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3ILxkQ7yElm}\n}", "github": "", "project": "", "reviewers": "z3Y4;AhgQ;fAEP", "pdf_size": 0, "recommendation": "1;6;8", "confidence": "5;4;2", "correctness": "1;3;4", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "43;135;105", "wc_summary_review": "26;42;65", "wc_main_review": "504;731;291", "wc_review": "573;908;461", "wc_reply_reviewers": "541;271;11", "wc_reply_authors": "2428;1136;613", "reply_reviewers": "1;1;1", "reply_authors": "5;4;1", "recommendation_avg": [ 5.0, 2.943920288775949 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 94.33333333333333, 38.30868772948971 ], "wc_summary_review_avg": [ 44.333333333333336, 16.006942938057293 ], "wc_main_review_avg": [ 508.6666666666667, 179.6595545902181 ], "wc_review_avg": [ 647.3333333333334, 189.9058246137338 ], "wc_reply_reviewers_avg": [ 274.3333333333333, 216.38443156156643 ], "wc_reply_authors_avg": [ 1392.3333333333333, 762.8177297945355 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.3333333333333335, 1.699673171197595 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9078412990032039, "corr_recommendation_correctness": 0.9986254289035241, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12112202077903180116&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=3ILxkQ7yElm", "email": "ucmerced.edu;nvidia.com;ucsd.edu;umcerced.edu;nvidia.com;nvidia.com", "author_num": 6, "aff_unique_index": "0;1;2;0;1;1", 
"aff_unique_norm": "University of California, Merced;NVIDIA;University of California, San Diego", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://www.ucmerced.edu;https://www.nvidia.com;https://www.ucsd.edu", "aff_unique_abbr": "UC Merced;NVIDIA;UCSD", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Merced;;San Diego", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "3JvRnAzw_0", "title": "Robust Weight Perturbation for Adversarial Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Overfitting widely exists in adversarial robust training of deep networks. An effective and promising remedy is adversarial weight perturbation, which injects the worst-case weight perturbation during network training by maximizing the classification loss on adversarial examples. Adversarial weight perturbation helps reduce the robust generalization gap; however, it also undermines the robustness enhancement. A criterion that regulates the weight perturbation is therefore crucial for adversarial training. In this paper, we propose such a criterion, namely Loss Stationary Condition (LSC) for constrained perturbation. With LSC, we find that deep network first overfits the adversarial examples with small loss, and then gradually develops to overfit all adversarial examples in the later stage of training. Following this, we find that it is essential to conduct weight perturbation on adversarial data with small classification loss to eliminate overfitting in adversarial training. Weight perturbation on adversarial data with large classification loss is not necessary and may even lead to poor robustness. Based on these observations, we propose a robust perturbation strategy to constrain the extent of weight perturbation. The perturbation strategy prevents deep networks from overfitting while avoiding the side effect of excessive weight perturbation, significantly improving the robustness of adversarial training. 
Extensive experiments demonstrate the superiority of the proposed method over the state-of-the-art adversarial training methods.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/670677adef4678b58dbacf4c2a04a0da6c812223.zip", "author": "Chaojian Yu;Bo Han;Mingming Gong;Li Shen;Shiming Ge;Bo Du;Tongliang Liu", "authorids": "~Chaojian_Yu1;~Bo_Han1;~Mingming_Gong1;~Li_Shen1;~Shiming_Ge1;~Bo_Du1;~Tongliang_Liu1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://mingming-gong.github.io/;https://sites.google.com/site/mathshenli/home;;;https://tongliang-liu.github.io/;https://bhanml.github.io/", "dblp": "223/9872;98/8479;91/3680-8;93/8104.html;70/6443-1.html;150/6667;241/0472-3", "google_scholar": "b3ltuG8AAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;yVhgENIAAAAJ;;Shy1gnMAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ", "orcid": ";0000-0001-7147-5589;;;;;", "linkedin": ";;;;;;", "or_profile": "~Chaojian_Yu1;~Mingming_Gong1;~Li_Shen1;~Shiming_Ge1;~Bo_Du1;~Tongliang_Liu1;~bo_han2", "aff": "The University of Sydney;University of Melbourne;JD Explore Academy;Institute of Information Engineering, Chinese Academy of Sciences;Wuhan University;University of Sydney;Microsoft Research", "aff_domain": "uni.sydney.edu.au;unimelb.edu.au;jd.com;iie.ac.cn;whu.edu.cn;sydney.edu.au;microsoft.com", "position": "PhD student;Assistant Professor;Researcher;Researcher;Full Professor;Lecturer;Researcher", "bibtex": "@misc{\nyu2022robust,\ntitle={Robust Weight Perturbation for Adversarial Training},\nauthor={Chaojian Yu and Bo Han and Mingming Gong and Li Shen and Shiming Ge and Bo Du and Tongliang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=3JvRnAzw_0}\n}", "github": "", "project": "", "reviewers": "Z3Cr;8qVH;PMmu;nnPq", "site": "https://openreview.net/forum?id=3JvRnAzw_0", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "2;1;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "30;88;35;122", "wc_summary_review": "121;89;64;74", "wc_main_review": "294;566;734;228", "wc_review": "445;743;833;424", "wc_reply_reviewers": "109;0;121;0", "wc_reply_authors": "797;813;1180;720", "reply_reviewers": "1;0;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 68.75, 38.23202191880518 ], "wc_summary_review_avg": [ 87.0, 21.552262062252307 ], "wc_main_review_avg": [ 455.5, 204.7016121089426 ], "wc_review_avg": [ 611.25, 179.74478434714038 ], "wc_reply_reviewers_avg": [ 57.5, 57.65630928181234 ], "wc_reply_authors_avg": [ 877.5, 178.1523224659168 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5361625954178402118&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;4;0;5", "aff_unique_norm": "University of Sydney;University of Melbourne;JD;Chinese Academy of Sciences;Wuhan University;Microsoft", "aff_unique_dep": ";;JD Explore Academy;Institute of Information Engineering;;Microsoft Research", "aff_unique_url": 
"https://www.sydney.edu.au;https://www.unimelb.edu.au;;http://www.cas.cn;http://www.whu.edu.cn/;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "USYD;UniMelb;;CAS;WHU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2;2;0;3", "aff_country_unique": "Australia;;China;United States" }, { "id": "3Li0OPkhQU", "title": "Provable Learning of Convolutional Neural Networks with Data Driven Features", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional networks (CNN) are computationally hard to learn. In practice, however, CNNs are learned successfully on natural image data. In this work, we study a semi-supervised algorithm, that learns a linear classifier over data-dependent features which were obtained from unlabeled data. We show that the algorithm provably learns CNNs, under some natural distributional assumptions. Specifically, it efficiently learns CNNs, assuming the distribution of patches in the input images has low-dimensional structure (e.g., when the patches are sampled from a low-dimensional manifold). We complement our result with a lower bound, showing that the dependence of our algorithm on the dimension of the patch distribution is essentially optimal.", "keywords": "Deep learning theory;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Alon Brutzkus;Amir Globerson;eran malach;Shai Shalev-Shwartz", "authorids": "~Alon_Brutzkus1;~Amir_Globerson1;~eran_malach1;~Shai_Shalev-Shwartz1", "gender": "M;M;M;M", "homepage": ";http://www.cs.tau.ac.il/~gamir/;;http://www.cs.huji.ac.il/~shais/", "dblp": "161/7411;08/4162.html;202/2566;95/2750", "google_scholar": "m1wmXdgAAAAJ;https://scholar.google.com.tw/citations?user=5JserkUAAAAJ;I15dUOwAAAAJ;https://scholar.google.co.il/citations?user=uYVc9koAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Alon_Brutzkus1;~Amir_Globerson1;~eran_malach1;~Shai_Shalev-Shwartz1", "aff": "Tel Aviv University;Tel Aviv University;Hebrew University of Jerusalem, Israel;Hebrew University, Hebrew University of Jerusalem", "aff_domain": "tau.ac.il;tau.ac.il;huji.ac.il;cs.huji", "position": "PhD student;Associate Professor;PhD student;Full Professor", "bibtex": "@misc{\nbrutzkus2022provable,\ntitle={Provable Learning of Convolutional Neural Networks with Data Driven Features},\nauthor={Alon Brutzkus and Amir Globerson and eran malach and Shai Shalev-Shwartz},\nyear={2022},\nurl={https://openreview.net/forum?id=3Li0OPkhQU}\n}", "github": "", "project": "", "reviewers": "g4Er;nV5M;WV6X;vjtz", "site": "https://openreview.net/forum?id=3Li0OPkhQU", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;5;3;3", "correctness": "3;1;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "0;1;2;4", "wc_summary_paper": "57;79;314;130", "wc_summary_review": "35;83;101;111", "wc_main_review": "241;145;1097;213", "wc_review": "333;307;1512;454", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.479019945774904 ], "wc_summary_paper_avg": [ 145.0, 101.10143421336811 ], "wc_summary_review_avg": [ 82.5, 29.201883500897676 ], "wc_main_review_avg": [ 424.0, 390.1217758597948 ], "wc_review_avg": [ 651.5, 499.89723944026736 ], "wc_reply_reviewers_avg": [ 0, 0 
], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3iTs9-Yh2awJ:scholar.google.com/&scioq=Provable+Learning+of+Convolutional+Neural+Networks+with+Data+Driven+Features&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Tel Aviv University;Hebrew University of Jerusalem", "aff_unique_dep": ";", "aff_unique_url": "https://www.tau.ac.il;https://www.huji.ac.il", "aff_unique_abbr": "TAU;HUJI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "id": "3M3t3tUbA2Y", "title": "DreamerPro: Reconstruction-Free Model-Based Reinforcement Learning with Prototypical Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "In model-based reinforcement learning (MBRL) such as Dreamer, the approaches based on observation reconstruction\noften fail to discard task-irrelevant details, thus struggling to handle visual distractions or generalize to unseen distractions. To address this issue, previous work has proposed to contrastively learn the latent representations and its temporal dynamics, but showed inconsistent performance, often worse than Dreamer. Although, in computer vision, an alternative prototypical approach has often shown to be more accurate and robust, it is elusive how this approach can be combined best with the temporal dynamics learning in MBRL. In this work, we propose a reconstruction-free MBRL agent, called DreamerPro, to achieve this goal. Similar to SwAV, by encouraging uniform cluster assignment across the batch, we implicitly push apart the embeddings of different observations. Additionally, we let the temporal latent state to 'reconstruct' the cluster assignment of the observation, thereby relieving the world model from modeling low-level details. We evaluate our model on the standard setting of DeepMind Control Suite, and also on a natural background setting, where the background is replaced by natural videos irrelevant to the task. 
The results show that the proposed model is consistently better than the previous models.", "keywords": "model-based reinforcement learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Fei Deng;Ingook Jang;Sungjin Ahn", "authorids": "~Fei_Deng1;~Ingook_Jang1;~Sungjin_Ahn1", "gender": "M;M;", "homepage": ";;", "dblp": "46/10037-1;70/7891;", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.co.kr/citations?hl=ko;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Fei_Deng1;~Ingook_Jang1;~Sungjin_Ahn1", "aff": "Rutgers University;Electronics and Telecommunications Research Institute;", "aff_domain": "rutgers.edu;etri.re.kr;", "position": "PhD student;Researcher;", "bibtex": "@misc{\ndeng2022dreamerpro,\ntitle={DreamerPro: Reconstruction-Free Model-Based Reinforcement Learning with Prototypical Representations},\nauthor={Fei Deng and Ingook Jang and Sungjin Ahn},\nyear={2022},\nurl={https://openreview.net/forum?id=3M3t3tUbA2Y}\n}", "github": "", "project": "", "reviewers": "pipU;VAKJ;FvUb;xtkN", "site": "https://openreview.net/forum?id=3M3t3tUbA2Y", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;4", "correctness": "2;4;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "76;40;125;51", "wc_summary_review": "68;62;51;61", "wc_main_review": "482;150;322;273", "wc_review": "626;252;498;385", "wc_reply_reviewers": "81;76;17;0", "wc_reply_authors": "405;34;70;222", "reply_reviewers": "1;1;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 73.0, 32.73377460666582 ], "wc_summary_review_avg": [ 60.5, 6.103277807866851 ], "wc_main_review_avg": [ 306.75, 119.01129148110275 ], "wc_review_avg": [ 440.25, 138.13829121572337 ], "wc_reply_reviewers_avg": [ 43.5, 35.556293395122054 ], "wc_reply_authors_avg": [ 182.75, 146.43663305334496 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11064573461444670693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1", "aff_unique_norm": "Rutgers University;Electronics and Telecommunications Research Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.rutgers.edu;http://www.etri.re.kr", "aff_unique_abbr": "Rutgers;ETRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;South Korea" }, { "id": "3MjOIZ2CF9", "title": "An evaluation of quality and robustness of smoothed explanations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Explanation methods play a crucial role in helping to understand the decisions of deep neural networks (DNNs) and in developing the trust that is critical for the adoption of predictive models. However, explanation methods are easily manipulated through visually imperceptible perturbations that generate misleading explanations. 
The geometry of the decision surface of the DNNs has been identified as the main cause of this phenomenon and several \\emph{smoothing} approaches have been proposed to build more robust explanations.\nIn this work, we provide a thorough evaluation of the quality and robustness of the explanations derived by smoothing approaches. Their different properties are evaluated with extensive experiments, which reveal the settings where the smoothed explanations are better, and also those where they are worse, than the explanations derived by the common Gradient method. By making the connection with the literature on adversarial attacks, we further show that such smoothed explanations are robust primarily against additive $\\ell_p$-norm attacks. However, a combination of additive and non-additive attacks can still manipulate these explanations, which reveals shortcomings in their robustness properties.", "keywords": "Explanation methods;Interpretability;Robustness;Adversarial attacks", "primary_area": "", "supplementary_material": "/attachment/57b492e43983c2ace16029400e16e8d7170bda24.zip", "author": "Ahmad Ajalloeian;Seyed-Mohsen Moosavi-Dezfooli;Michalis Vlachos;Pascal Frossard", "authorids": "~Ahmad_Ajalloeian1;~Seyed-Mohsen_Moosavi-Dezfooli1;~Michalis_Vlachos1;~Pascal_Frossard1", "gender": "M;M;;", "homepage": "https://hecnet.unil.ch/hec/recherche/fiche?pnom=aajalloeian&dyn_lang=en;;;", "dblp": ";;;", "google_scholar": "EeUciD8AAAAJ;https://scholar.google.ch/citations?user=qosS83IAAAAJ;;", "orcid": ";;;", "linkedin": "ahmad-ajalloeian-3652a067/;;;", "or_profile": "~Ahmad_Ajalloeian1;~Seyed-Mohsen_Moosavi-Dezfooli1;~Michalis_Vlachos1;~Pascal_Frossard1", "aff": "University of Lausanne;Swiss Federal Institute of Technology;;", "aff_domain": "unil.ch;ethz.ch;;", "position": "PhD student;Postdoc;;", "bibtex": "@misc{\najalloeian2022an,\ntitle={An evaluation of quality and robustness of smoothed explanations},\nauthor={Ahmad Ajalloeian and Seyed-Mohsen Moosavi-Dezfooli and Michalis Vlachos and Pascal Frossard},\nyear={2022},\nurl={https://openreview.net/forum?id=3MjOIZ2CF9}\n}", "github": "", "project": "", "reviewers": "xHez;Sfxr;Aokc;dEQe", "site": "https://openreview.net/forum?id=3MjOIZ2CF9", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "142;43;75;17", "wc_summary_review": "59;52;82;13", "wc_main_review": "284;511;486;172", "wc_review": "485;606;643;202", "wc_reply_reviewers": "0;265;381;0", "wc_reply_authors": "461;1230;1339;286", "reply_reviewers": "0;1;2;0", "reply_authors": "2;3;3;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.25, 46.756684014159944 ], "wc_summary_review_avg": [ 51.5, 24.84451649760969 ], "wc_main_review_avg": [ 363.25, 141.20441742381857 ], "wc_review_avg": [ 484.0, 172.98121285272572 ], "wc_reply_reviewers_avg": [ 161.5, 166.626078391109 ], "wc_reply_authors_avg": [ 829.0, 461.295458464529 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11753354310912343305&as_sdt=400005&sciodt=0,14&hl=en", 
"gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Lausanne;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.unil.ch;https://www.ethz.ch", "aff_unique_abbr": "UNIL;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "3Od_-TkEdnG", "title": "Domain-wise Adversarial Training for Out-of-Distribution Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the impressive success on many tasks, deep learning models are shown to rely on spurious features, which will catastrophically fail when generalized to out-of-distribution (OOD) data. To alleviate this issue, Invariant Risk Minimization (IRM) is proposed to extract domain-invariant features for OOD generalization. Nevertheless, recent work shows that IRM is only effective for a certain type of distribution shift (e.g., correlation shift) while fails for other cases (e.g., diversity shift). Meanwhile, another thread of method, Adversarial Training (AT), has shown better domain transfer performance, suggesting that it is potential to be an effective candidate for extracting domain-invariant features. In this paper, we investigate this possibility by exploring the similarity between the IRM and AT objectives. Inspired by this connection, we propose Domain-wise Adversarial Training (DAT), an AT-inspired method for alleviating distribution shift by domain-specific perturbations. Extensive experiments show that our proposed DAT can effectively remove the domain-varying features and improve OOD generalization on both correlation shift and diversity shift tasks.", "keywords": "Domain Generalization;IRM;Adversarial Training", "primary_area": "", "supplementary_material": "", "author": "Shiji Xin;Yifei Wang;Jingtong Su;Yisen Wang", "authorids": "~Shiji_Xin1;~Yifei_Wang1;~Jingtong_Su1;~Yisen_Wang1", "gender": ";M;M;M", "homepage": ";https://yifeiwang77.com;https://cims.nyu.edu/~js12196/;https://yisenwang.github.io/", "dblp": ";00/555-1;275/3776;172/1346-1", "google_scholar": ";-CLy6YsAAAAJ;i0OY_LAAAAAJ;uMWPDboAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Shiji_Xin1;~Yifei_Wang1;~Jingtong_Su1;~Yisen_Wang1", "aff": ";Peking University;New York University;Peking University", "aff_domain": ";pku.edu.cn;nyu.edu;pku.edu.cn", "position": ";PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nxin2022domainwise,\ntitle={Domain-wise Adversarial Training for Out-of-Distribution Generalization},\nauthor={Shiji Xin and Yifei Wang and Jingtong Su and Yisen Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=3Od_-TkEdnG}\n}", "github": "", "project": "", "reviewers": "Gexh;LGzX;HCbA;Ywim", "site": "https://openreview.net/forum?id=3Od_-TkEdnG", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;5;4;3", "correctness": "4;3;4;3", "technical_novelty": "3;2;2;4", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "64;66;53;98", "wc_summary_review": "147;30;107;69", "wc_main_review": "553;1008;187;284", "wc_review": "764;1104;347;451", "wc_reply_reviewers": "294;297;46;53", "wc_reply_authors": "1434;2489;757;873", "reply_reviewers": "1;3;1;1", "reply_authors": "3;5;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], 
"wc_summary_paper_avg": [ 70.25, 16.768646337734005 ], "wc_summary_review_avg": [ 88.25, 43.49353400219393 ], "wc_main_review_avg": [ 508.0, 318.2931039152435 ], "wc_review_avg": [ 666.5, 295.56429080658575 ], "wc_reply_reviewers_avg": [ 172.5, 123.02946801478092 ], "wc_reply_authors_avg": [ 1388.25, 685.1464715664819 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": -0.6882472016116854, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15502005908742976748&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Peking University;New York University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.nyu.edu", "aff_unique_abbr": "Peking U;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "Amortized Implicit Differentiation for Stochastic Bilevel Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6785", "id": "3PN4iyXBeF", "poster": "", "openreview": "https://openreview.net/forum?id=3PN4iyXBeF", "slides": "https://iclr.cc/virtual/2022/poster/6785", "video": "https://iclr.cc/virtual/2022/poster/6785", "author_site": "Michael Arbel, Julien Mairal", "tldr": "", "abstract": "We study a class of algorithms for solving bilevel optimization problems in both stochastic and deterministic settings when the inner-level objective is strongly convex. Specifically, we consider algorithms based on inexact implicit differentiation and we exploit a warm-start strategy to amortize the estimation of the exact gradient. We then introduce a unified theoretical framework inspired by the study of singularly perturbed systems to analyze such amortized algorithms. By using this framework, our analysis shows these algorithms to match the computational complexity of oracle methods that have access to an unbiased estimate of the gradient, thus outperforming many existing results for bilevel optimization.\nWe illustrate these findings on synthetic experiments and demonstrate the efficiency of these algorithms on hyper-parameter optimization experiments involving several thousands of variables. 
", "keywords": "bilevel optimization;stochastic optimization", "primary_area": "", "supplementary_material": "", "author": "Michael Arbel;Julien Mairal", "authorids": "~Michael_Arbel1;~Julien_Mairal1", "gender": "M;", "homepage": "https://michaelarbel.github.io/;http://julien.mairal.org", "dblp": "200/8609;49/6555", "google_scholar": "NsOqVtkAAAAJ;https://scholar.google.fr/citations?user=Bx9WGD6lBFEC", "orcid": ";", "linkedin": "michael-arbel-0a38a655/;", "or_profile": "~Michael_Arbel1;~Julien_Mairal1", "aff": "INRIA;Inria", "aff_domain": "inria.fr;inria.fr", "position": "Postdoc;Research Scientist", "bibtex": "@inproceedings{\narbel2022amortized,\ntitle={Amortized Implicit Differentiation for Stochastic Bilevel Optimization},\nauthor={Michael Arbel and Julien Mairal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3PN4iyXBeF}\n}", "github": "", "project": "", "reviewers": "QCfa;Xg4d;2dfc;ikrt;6zLQ", "pdf_size": 0, "recommendation": "3;6;6;6;8", "confidence": "3;3;3;3;2", "correctness": "2;4;3;4;3", "technical_novelty": "2;3;2;3;3", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "17;50;39;68;92", "wc_summary_review": "25;25;64;73;114", "wc_main_review": "227;141;221;210;384", "wc_review": "269;216;324;351;590", "wc_reply_reviewers": "0;32;52;0;34", "wc_reply_authors": "1083;671;778;286;666", "reply_reviewers": "0;1;1;0;1", "reply_authors": "3;1;2;1;2", "recommendation_avg": [ 5.8, 1.6 ], "confidence_avg": [ 2.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 53.2, 25.4825430442097 ], "wc_summary_review_avg": [ 60.2, 33.31906361229259 ], "wc_main_review_avg": [ 236.6, 79.8864193715052 ], "wc_review_avg": [ 350.0, 128.68100092865302 ], "wc_reply_reviewers_avg": [ 23.6, 20.489997559785117 ], "wc_reply_authors_avg": [ 696.8, 255.38786188854004 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6875000000000001, "corr_recommendation_correctness": 0.5345224838248488, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16172665780204804021&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=3PN4iyXBeF", "email": "inria.fr;inria.fr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Rethinking Network Design and Local Geometry in Point Cloud: A Simple Residual MLP Framework", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6005", "id": "3Pbra-_u76D", "poster": "", "openreview": "https://openreview.net/forum?id=3Pbra-_u76D", "slides": "https://iclr.cc/virtual/2022/poster/6005", "video": "https://iclr.cc/virtual/2022/poster/6005", "author_site": "Xu Ma, Can Qin, Haoxuan You, Haoxi Ran, Yun Fu", "tldr": "", "abstract": "Point cloud analysis is challenging due to irregularity and unordered data structure. To capture the 3D geometries, prior works mainly rely on exploring sophisticated local geometric extractors, using convolution, graph, or attention mechanisms. 
These methods, however, incur unfavorable latency during inference, and their performance has saturated over the past few years. In this paper, we present a novel perspective on this task. We find that detailed local geometrical information probably is not the key to point cloud analysis \u2013 we introduce a pure residual MLP network, called PointMLP, which integrates no local geometrical extractors but still performs very competitively. Equipped with a proposed lightweight geometric-affine module to stabilize the training, PointMLP delivers the new state-of-the-art on multiple datasets. On the real-world ScanObjectNN dataset, our method even surpasses the prior best method by 3.3% accuracy. We emphasize that PointMLP achieves this strong performance without any sophisticated operations, hence leading to superior inference speed. Compared to the recent CurveNet, PointMLP trains 2\u00d7 faster, tests 7\u00d7 faster, and is more accurate on the ModelNet40 benchmark. We hope our PointMLP may help the community towards a better understanding of point cloud analysis. The code is available at https://github.com/ma-xu/pointMLP-pytorch.", "keywords": "point cloud representation;local relation;mlp", "primary_area": "", "supplementary_material": "", "author": "Xu Ma;Can Qin;Haoxuan You;Haoxi Ran;Yun Fu", "authorids": "~Xu_Ma2;~Can_Qin1;~Haoxuan_You1;~Haoxi_Ran1;~Yun_Fu1", "gender": "M;M;M;M;M", "homepage": "https://ma-xu.github.io/;http://canqin.tech;https://hxyou.github.io/;https://haoxiran.com;http://www1.ece.neu.edu/~yunfu/", "dblp": "77/9370-5;214/2488;210/2628;279/3600;00/5815-1", "google_scholar": "Ya7frcEAAAAJ;QCik-YcAAAAJ;BhysChMAAAAJ;FxBvRNUAAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": ";;;;0000-0002-5098-2853", "linkedin": ";;;haoxi-ran/;furaymond/", "or_profile": "~Xu_Ma2;~Can_Qin1;~Haoxuan_You1;~Haoxi_Ran1;~Yun_Fu1", "aff": "Northeastern University;Northeastern University;Columbia University;Carnegie Mellon University;Northeastern University", "aff_domain": "northeastern.edu;neu.edu;columbia.edu;cmu.edu;northeastern.edu", "position": "PhD student;PhD student;PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nma2022rethinking,\ntitle={Rethinking Network Design and Local Geometry in Point Cloud: A Simple Residual {MLP} Framework},\nauthor={Xu Ma and Can Qin and Haoxuan You and Haoxi Ran and Yun Fu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3Pbra-_u76D}\n}", "github": "", "project": "", "reviewers": "vGXQ;7Y1Q;8anY", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "2;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;0", "wc_summary_paper": "49;73;119", "wc_summary_review": "35;60;46", "wc_main_review": "284;151;429", "wc_review": "368;284;594", "wc_reply_reviewers": "73;0;131", "wc_reply_authors": "1030;457;1215", "reply_reviewers": "1;0;1", "reply_authors": "3;2;4", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 80.33333333333333, 29.044027881055953 ], "wc_summary_review_avg": [ 47.0, 10.23067283548187 ], "wc_main_review_avg": [ 288.0, 113.5282637349249 ], "wc_review_avg": [ 415.3333333333333, 130.90794051121912 ], "wc_reply_reviewers_avg": [ 68.0, 53.59726361174297 ], 
"wc_reply_authors_avg": [ 900.6666666666666, 322.6828508337904 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 825, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10170039268493179331&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=3Pbra-_u76D", "email": "northeastern.edu;neu.edu;columbia.edu;cmu.edu;northeastern.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Northeastern University;Columbia University;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northeastern.edu;https://www.columbia.edu;https://www.cmu.edu", "aff_unique_abbr": "NEU;Columbia;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "3Qh8ezpsca", "title": "Towards simple time-to-event modeling: optimizing neural networks via rank regression", "track": "main", "status": "Reject", "tldr": "", "abstract": "Time-to-event analysis, also known as survival analysis, aims to predict the first occurred event time, conditional on a set of features.\nHowever, the presence of censorship brings much complexity in learning algorithms due to data incompleteness.\nHazard-based models (e.g. Cox's proportional hazards) and accelerated failure time (AFT) models are two popular tools in time-to-event modeling, requiring the proportional hazards and linearity assumptions, respectively. \nIn addition, AFT models require pre-specified parametric distributional assumptions in most cases. \nTo alleviate such strict assumptions and improve predictive performance, there have been many deep learning approaches for hazard-based models in recent years. \nHowever, compared to hazard-based methods, AFT-based representation learning has received limited attention in neural network literature, despite its model simplicity and interpretability. \nIn this work, we introduce a Deep AFT Rank-regression for Time-to-event prediction model (DART), which is a deep learning-based semiparametric AFT model, and propose a $l_1$-type rank loss function that is more suitable for optimizing neural networks. \nUnlike existing neural network-based AFT models, the proposed model is semiparametric in that any distributional assumption is not imposed for the survival time distribution without requiring further hyperparameters or complicated model architectures. \nWe verify the usefulness of DART via quantitative analysis upon various benchmark datasets. 
\nThe results show that our method has considerable potential to model high-throughput censored time-to-event data.", "keywords": "time-to-event analysis;survival analysis;semiparametric method;accelerated failure time", "primary_area": "", "supplementary_material": "/attachment/a6d41084aa995a2541600574678b07a782286d12.zip", "author": "Hyunjun Lee;Junhyun Lee;Taehwa Choi;Jaewoo Kang;Sangbum Choi", "authorids": "~Hyunjun_Lee3;~Junhyun_Lee1;~Taehwa_Choi2;~Jaewoo_Kang1;~Sangbum_Choi1", "gender": "M;M;M;M;M", "homepage": "https://junhyunlee.com;;https://dmis.korea.ac.kr;https://sites.google.com/site/bumtoss/;", "dblp": "155/8661;;k/JaewooKang;;", "google_scholar": "kyZHNxYAAAAJ;https://scholar.google.co.kr/citations?user=ufX6ZREAAAAJ;https://scholar.google.co.kr/citations?user=RaBZafQAAAAJ;pofnQN0AAAAJ;", "orcid": "0000-0002-2385-4047;;0000-0001-6798-9106;;", "linkedin": ";;;;hyunjun-lee-5959b41aa/", "or_profile": "~Junhyun_Lee1;~Taehwa_Choi2;~Jaewoo_Kang1;~Sangbum_Choi1;~HYUNJUN_LEE2", "aff": "Korea University;Korea University;Korea University;Korea University;SK Inc. C&C", "aff_domain": "korea.ac.kr;korea.ac.kr;korea.ac.kr;korea.ac.kr;sk.com", "position": "PhD student;PhD student;Full Professor;Associate Professor;Researcher", "bibtex": "@misc{\nlee2022towards,\ntitle={Towards simple time-to-event modeling: optimizing neural networks via rank regression},\nauthor={Hyunjun Lee and Junhyun Lee and Taehwa Choi and Jaewoo Kang and Sangbum Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=3Qh8ezpsca}\n}", "github": "", "project": "", "reviewers": "7ymN;qkV5;PtQZ;6g2w;Rqgw", "site": "https://openreview.net/forum?id=3Qh8ezpsca", "pdf_size": 0, "recommendation": "1;3;3;5;6", "confidence": "5;4;4;2;4", "correctness": "2;2;2;3;3", "technical_novelty": "2;1;2;2;3", "empirical_novelty": "2;2;1;1;3", "wc_summary_paper": "44;59;25;66;124", "wc_summary_review": "30;53;54;27;124", "wc_main_review": "585;280;205;88;563", "wc_review": "659;392;284;181;811", "wc_reply_reviewers": "138;0;0;0;0", "wc_reply_authors": "809;891;782;486;689", "reply_reviewers": "1;0;0;0;0", "reply_authors": "2;2;1;1;1", "recommendation_avg": [ 3.6, 1.7435595774162693 ], "confidence_avg": [ 3.8, 0.9797958971132712 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 1.8, 0.7483314773547883 ], "wc_summary_paper_avg": [ 63.6, 33.31426121047861 ], "wc_summary_review_avg": [ 57.6, 35.046255149445 ], "wc_main_review_avg": [ 344.2, 197.4815434414062 ], "wc_review_avg": [ 465.4, 234.98816991499802 ], "wc_reply_reviewers_avg": [ 27.6, 55.2 ], "wc_reply_authors_avg": [ 731.4, 138.61688208872684 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6321954228176435, "corr_recommendation_correctness": 0.8897565210026093, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QhqsgXWnNA4J:scholar.google.com/&scioq=Towards+simple+time-to-event+modeling:+optimizing+neural+networks+via+rank+regression&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Korea University;SK Inc.", "aff_unique_dep": ";C&C", "aff_unique_url": "https://www.korea.ac.kr;https://www.skinc.co.kr", "aff_unique_abbr": "KU;SK Inc.", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": 
"3SUToIxuIT3", "title": "Efficient Point Transformer for Large-scale 3D Scene Understanding", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The recent success of neural networks has enabled a better interpretation of 3D point clouds, but processing a large-scale 3D scene remains a challenging problem. Most approaches divide a large-scale 3D scene into multiple regions and combine the local predictions, but this inevitably increases inference time and involves preprocessing stages, such as k-nearest neighbor search. An alternative is to quantize the point cloud to voxels and process them with sparse convolution. Although sparse convolution is efficient and scalable for large 3D scenes, the quantization artifacts impair geometric details and degrade prediction accuracy. This paper proposes an Efficient Point Transformer (EPT) that effectively relieves the quantization artifacts and avoids expensive resource requirements. Each layer of EPT implements the local self-attention mechanism for analyzing continuous 3D coordinates and offers fast inference time using a voxel hashing-based architecture. The proposed method can be adopted for various 3D vision applications, such as 3D semantic segmentation and 3D detection. In experiments, the proposed EPT model outperforms the state-of-the-art on large-scale 3D semantic segmentation benchmarks and also shows better performance on 3D detection benchmarks than point-based or voxel-based baseline methods. ", "keywords": "3D scene understanding;self-attention;transformer", "primary_area": "", "supplementary_material": "/attachment/1c5c0e23f2237f4b4e488dee2e9b1b5d866943d4.zip", "author": "Chunghyun Park;Yoonwoo Jeong;Minsu Cho;Jaesik Park", "authorids": "~Chunghyun_Park1;~Yoonwoo_Jeong1;~Minsu_Cho1;~Jaesik_Park3", "gender": "M;M;M;M", "homepage": "https://chrockey.github.io/;https://jeongyw12382.github.io;http://cvlab.postech.ac.kr/~mcho/;http://jaesik.info", "dblp": "307/2929;;;00/10336", "google_scholar": "5ABvjQcAAAAJ;HQ1PMggAAAAJ;5TyoF5QAAAAJ;_3q6KBIAAAAJ", "orcid": ";;;", "linkedin": "chunghyun-park-7a50b0170/;yoonwoo-jeong-6994ab185/;minsu-cho-062b3750/;", "or_profile": "~Chunghyun_Park1;~Yoonwoo_Jeong1;~Minsu_Cho1;~Jaesik_Park3", "aff": "Pohang University of Science and Technology;POSTECH;POSTECH;Pohang University of Science and Technology", "aff_domain": "postech.ac.kr;postech.ac.kr;postech.ac.kr;postech.edu", "position": "MS student;MS student;Associate Professor;Assistant Professor", "bibtex": "@misc{\npark2022efficient,\ntitle={Efficient Point Transformer for Large-scale 3D Scene Understanding},\nauthor={Chunghyun Park and Yoonwoo Jeong and Minsu Cho and Jaesik Park},\nyear={2022},\nurl={https://openreview.net/forum?id=3SUToIxuIT3}\n}", "github": "", "project": "", "reviewers": "gXUq;PKxx;aiRe;Utsv", "site": "https://openreview.net/forum?id=3SUToIxuIT3", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;4", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;0;2", "wc_summary_paper": "80;75;54;58", "wc_summary_review": "40;47;40;79", "wc_main_review": "177;617;239;331", "wc_review": "297;739;333;468", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], 
"wc_summary_paper_avg": [ 66.75, 10.985786271359915 ], "wc_summary_review_avg": [ 51.5, 16.132265804901678 ], "wc_main_review_avg": [ 341.0, 168.50519279832298 ], "wc_review_avg": [ 459.25, 173.63809345877993 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10192284546056478460&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "3Skn65dgAr4", "title": "Differentiable Self-Adaptive Learning Rate", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adaptive learning rate has been studied for a long time. In the training session of neural networks, learning rate controls update stride and direction in a multi-dimensional space. A large learning rate may cause failure to converge, while a small learning rate will make the convergence too slow.\nEven though some optimizers make learning rate adaptive to the training, e.g., using first-order and second-order momentum to adapt learning rate, their network's parameters are still unstable during training and converges too slowly in many occasions.\nTo solve this problem, we propose a novel optimizer which makes learning rate differentiable with the goal of minimizing loss function and thereby realize an optimizer with truly self-adaptive learning rate. We conducted extensive experiments on multiple network models compared with various benchmark optimizers. 
It is shown that our optimizer achieves fast, high-quality convergence within extremely few epochs, far faster than state-of-the-art optimizers.", "keywords": "self-adaptive learning rate", "primary_area": "", "supplementary_material": "", "author": "Bozhou Chen;Hongzhi Wang;Chenmin Ba", "authorids": "~Bozhou_Chen1;~Hongzhi_Wang2;~Chenmin_Ba1", "gender": "M;M;M", "homepage": ";http://homepage.hit.edu.cn/wang;", "dblp": "259/9940;81/940;https://dblp.uni-trier.de/pid/259/9983.html", "google_scholar": "avQkdTsAAAAJ;;", "orcid": ";0000-0002-7521-2871;", "linkedin": ";;", "or_profile": "~Bozhou_Chen1;~Hongzhi_Wang2;~Chenmin_Ba1", "aff": "Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology", "aff_domain": "hit.edu.cn;hit.edu.cn;hit.edu.cn", "position": "MS student;Full Professor;MS student", "bibtex": "@misc{\nchen2022differentiable,\ntitle={Differentiable Self-Adaptive Learning Rate},\nauthor={Bozhou Chen and Hongzhi Wang and Chenmin Ba},\nyear={2022},\nurl={https://openreview.net/forum?id=3Skn65dgAr4}\n}", "github": "", "project": "", "reviewers": "bLyu;thhp;57Xy;piNT", "site": "https://openreview.net/forum?id=3Skn65dgAr4", "pdf_size": 0, "recommendation": "1;3;3;8", "confidence": "5;5;5;5", "correctness": "3;3;2;4", "technical_novelty": "1;2;1;3", "empirical_novelty": "1;2;1;3", "wc_summary_paper": "17;47;60;36", "wc_summary_review": "16;73;133;6", "wc_main_review": "196;464;357;88", "wc_review": "229;584;550;130", "wc_reply_reviewers": "0;329;0;0", "wc_reply_authors": "103;359;74;5", "reply_reviewers": "0;2;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 3.75, 2.5860201081971503 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 40.0, 15.763882770434446 ], "wc_summary_review_avg": [ 57.0, 50.77893263943227 ], "wc_main_review_avg": [ 276.25, 144.61046815497141 ], "wc_review_avg": [ 373.25, 197.25285169041283 ], "wc_reply_reviewers_avg": [ 82.25, 142.46117892254017 ], "wc_reply_authors_avg": [ 135.25, 133.9969682492854 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6835859270246631, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13162144457820460672&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harbin Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.hit.edu.cn/", "aff_unique_abbr": "HIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Harbin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "3UeYAgzUe3", "title": "Encouraging Disentangled and Convex Representation with Controllable Interpolation Regularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We focus on controllable disentangled representation learning (C-Dis-RL), where users can control the partition of the disentangled latent space to factorize dataset attributes (concepts) for downstream tasks. Two general problems remain underexplored in current methods: (1) They lack comprehensive disentanglement constraints, especially missing the minimization of mutual information between different attributes across latent and observation domains.
(2) They lack convexity constraints in disentangled latent space, which is important for meaningfully manipulating specific attributes for downstream tasks. To encourage both comprehensive C-Dis-RL and convexity simultaneously, we propose a simple yet efficient method: Controllable Interpolation Regularization (CIR), which creates a positive loop\nwhere the disentanglement and convexity can help each other. Specifically, we conduct controlled interpolation in latent space during training and \u2019reuse\u2019 the encoder to help form a \u2019perfect disentanglement\u2019 regularization. In that case, (a) disentanglement loss implicitly enlarges the potential \u2019understandable\u2019 distribution to encourage convexity; (b) convexity can in turn improve robust and precise disentanglement. CIR is a general module and we merge CIR with three different algorithms: ELEGANT, I2I-Dis, and GZS-Net to show the compatibility and effectiveness. Qualitative and quantitative experiments show improvement in C-Dis-RL and latent convexity by CIR. This further improves downstream tasks: controllable image synthesis, cross-modality image translation and zero-shot synthesis. More\nexperiments demonstrate CIR can also improve other downstream tasks, such as new attribute value mining, data augmentation, and eliminating bias for fairness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunhao Ge;Zhi Xu;Yao Xiao;Gan Xin;Yunkui Pang;Laurent Itti", "authorids": "~Yunhao_Ge1;~Zhi_Xu2;~Yao_Xiao3;~Gan_Xin1;~Yunkui_Pang1;~Laurent_Itti1", "gender": "M;M;;M;M;M", "homepage": "https://gyhandy.github.io/;https://github.com/zhix9767;;;https://github.com/Pangyk;http://ilab.usc.edu", "dblp": "204/1908;;;;278/2962;31/3256", "google_scholar": "https://scholar.google.ca/citations?user=QhjGr4oAAAAJ;https://scholar.google.com/citations?hl=en;;;;xhUvqK8AAAAJ", "orcid": ";;;;;0000-0002-0168-2977", "linkedin": "yunhao-ge-720727135/;;;ganxin/;;", "or_profile": "~Yunhao_Ge1;~Zhi_Xu2;~Yao_Xiao3;~Gan_Xin1;~Yunkui_Pang1;~Laurent_Itti1", "aff": "University of Southern California;Northeastern University;;;University of North Carolina at Chapel Hill;University of Southern California", "aff_domain": "usc.edu;neu.edu;;;unc.edu;usc.edu", "position": "PhD student;PhD student;;;PhD student;Professor", "bibtex": "@misc{\nge2022encouraging,\ntitle={Encouraging Disentangled and Convex Representation with Controllable Interpolation Regularization},\nauthor={Yunhao Ge and Zhi Xu and Yao Xiao and Gan Xin and Yunkui Pang and Laurent Itti},\nyear={2022},\nurl={https://openreview.net/forum?id=3UeYAgzUe3}\n}", "github": "", "project": "", "reviewers": "yDXi;Vv5k;m9jP", "site": "https://openreview.net/forum?id=3UeYAgzUe3", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;2", "correctness": "2;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "99;56;64", "wc_summary_review": "47;46;30", "wc_main_review": "332;248;174", "wc_review": "478;350;268", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 73.0, 18.672618098881223 ], "wc_summary_review_avg": [ 41.0, 7.788880963698615 ], "wc_main_review_avg": [ 
251.33333333333334, 64.54627969311804 ], "wc_review_avg": [ 365.3333333333333, 86.4150192707006 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7559289460184546, "corr_recommendation_correctness": 0.9819805060619659, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=238752421869131052&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Southern California;Northeastern University;University of North Carolina", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usc.edu;https://www.northeastern.edu;https://www.unc.edu", "aff_unique_abbr": "USC;NEU;UNC", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Los Angeles;;Chapel Hill", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "3Uk9_JRVwiF", "title": "AID-PURIFIER: A LIGHT AUXILIARY NETWORK FOR BOOSTING ADVERSARIAL DEFENSE", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this study, we propose AID-Purifier, which can boost the robustness of adversarially-trained networks by purifying their inputs. AID-purifier is an auxiliary network that works as an add-on to an already trained main classifier. To keep it computationally light, it is trained as a discriminator with a binary cross-entropy loss. To obtain additional useful information from the adversarial examples, the architecture design is closely related to the information maximization principle, where two layers of the main classification network are piped into the auxiliary network. To assist the iterative optimization procedure of purification, the auxiliary network is trained with AVmixup. AID-purifier can also be used together with other purifiers such as PixelDefend for an extra enhancement. Because input purification has been studied relatively less than adversarial training or gradient masking, we conduct extensive attack experiments to validate AID-purifier\u2019s robustness.
The overall results indicate that the best performing adversarially-trained networks can be enhanced further with AID-purifier.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/17ac8674871640edfd036bd0df3bd62cf05890e1.zip", "author": "Duhun Hwang;Eunjung Lee;Wonjong Rhee", "authorids": "~Duhun_Hwang1;~Eunjung_Lee1;~Wonjong_Rhee1", "gender": "M;F;", "homepage": "https://www.linkedin.com/in/duhun-hwang-751752130/;https://sites.google.com/site/ejleelily;http://drl.snu.ac.kr", "dblp": ";;37/711", "google_scholar": "https://scholar.google.com/citations?hl=ko;https://scholar.google.co.kr/citations?user=j9Zw-7UAAAAJ;https://scholar.google.co.kr/citations?user=htFuYWsAAAAJ", "orcid": ";;0000-0002-2590-8774", "linkedin": ";;wonjong/", "or_profile": "~Duhun_Hwang1;~Eunjung_Lee1;~Wonjong_Rhee1", "aff": "Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@misc{\nhwang2022aidpurifier,\ntitle={{AID}-{PURIFIER}: A {LIGHT} {AUXILIARY} {NETWORK} {FOR} {BOOSTING} {ADVERSARIAL} {DEFENSE}},\nauthor={Duhun Hwang and Eunjung Lee and Wonjong Rhee},\nyear={2022},\nurl={https://openreview.net/forum?id=3Uk9_JRVwiF}\n}", "github": "", "project": "", "reviewers": "E2aH;p83T;vyXJ", "site": "https://openreview.net/forum?id=3Uk9_JRVwiF", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;3;4", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "112;38;56", "wc_summary_review": "55;52;19", "wc_main_review": "333;263;165", "wc_review": "500;353;240", "wc_reply_reviewers": "888;0;0", "wc_reply_authors": "1168;310;80", "reply_reviewers": "3;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 68.66666666666667, 31.510139461590597 ], "wc_summary_review_avg": [ 42.0, 16.30950643030009 ], "wc_main_review_avg": [ 253.66666666666666, 68.90250761442253 ], "wc_review_avg": [ 364.3333333333333, 106.44664808667716 ], "wc_reply_reviewers_avg": [ 296.0, 418.60721446243616 ], "wc_reply_authors_avg": [ 519.3333333333334, 468.18894571410897 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.18898223650461363, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9353499660620897060&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "3Wybo29gGlx", "title": "Should we Replace CNNs with Transformers for Medical Images?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional Neural Networks (CNNs) have reigned for a decade as the de facto approach to automated medical image diagnosis, pushing the state-of-the-art in 
classification, detection and segmentation tasks. Recently, vision transformers (ViTs) have appeared as a competitive alternative to CNNs, yielding impressive levels of performance in the natural image domain, while possessing several interesting properties that could prove beneficial for medical imaging tasks. In this work, we explore whether it is feasible to switch to transformer-based models in the medical imaging domain as well, or if we should keep working with CNNs - can we trivially replace CNNs with transformers? We consider this question in a series of experiments on several standard medical image benchmark datasets and tasks. Our findings show that, while CNNs perform better if trained from scratch, off-the-shelf vision transformers are on par with CNNs when pretrained on ImageNet in both classification and segmentation tasks. Further, ViTs often outperform their CNN counterparts when pretrained using self-supervision.", "keywords": "vision transformers;medical image analysis", "primary_area": "", "supplementary_material": "/attachment/698dd8b5f5bb75ee743c1d77fec2dc638b2f5764.zip", "author": "Christos Matsoukas;Johan Fredin Haslum;Moein Sorkhei;Magnus Soderberg;Kevin Smith", "authorids": "~Christos_Matsoukas1;~Johan_Fredin_Haslum1;~Moein_Sorkhei1;~Magnus_Soderberg1;~Kevin_Smith1", "gender": "M;;;;", "homepage": ";;;;", "dblp": "235/5347;;;;", "google_scholar": "3dezSAEAAAAJ;;;;", "orcid": "0000-0003-1401-3497;;;;", "linkedin": "chrismats/;;;magnus-s%C3%B6derberg-b8867447/;", "or_profile": "~Christos_Matsoukas1;~Johan_Fredin_Haslum1;~Moein_Sorkhei1;~Magnus_Soderberg1;~Kevin_Smith1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;;;Clinical Pharmacology & Safety Sciences;", "aff_domain": "kth.se;;;astrazeneca.com;", "position": "PhD student;;;Instructor;", "bibtex": "@misc{\nmatsoukas2022should,\ntitle={Should we Replace {CNN}s with Transformers for Medical Images?},\nauthor={Christos Matsoukas and Johan Fredin Haslum and Moein Sorkhei and Magnus Soderberg and Kevin Smith},\nyear={2022},\nurl={https://openreview.net/forum?id=3Wybo29gGlx}\n}", "github": "", "project": "", "reviewers": "LKZi;aKib;hd46;T9mB;taLm", "site": "https://openreview.net/forum?id=3Wybo29gGlx", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;4;4;4;5", "correctness": "4;2;2;2;3", "technical_novelty": "1;1;1;2;1", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "87;68;101;92;46", "wc_summary_review": "105;14;7;65;63", "wc_main_review": "165;1622;194;359;527", "wc_review": "357;1704;302;516;636", "wc_reply_reviewers": "0;0;0;0;26", "wc_reply_authors": "572;2117;547;362;810", "reply_reviewers": "0;0;0;0;1", "reply_authors": "1;4;1;1;2", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.8 ], "technical_novelty_avg": [ 1.2, 0.4 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 78.8, 19.630588376307013 ], "wc_summary_review_avg": [ 50.8, 36.223749115738975 ], "wc_main_review_avg": [ 573.4, 540.0772537332043 ], "wc_review_avg": [ 703.0, 514.1937378070643 ], "wc_reply_reviewers_avg": [ 5.2, 10.4 ], "wc_reply_authors_avg": [ 881.6, 633.8979728631415 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.8, 1.1661903789690602 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7905694150420948, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=10106858810591139940&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "KTH Royal Institute of Technology;Clinical Pharmacology & Safety Sciences", "aff_unique_dep": ";Clinical Pharmacology & Safety Sciences", "aff_unique_url": "https://www.kth.se;", "aff_unique_abbr": "KTH;", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm;", "aff_country_unique_index": "0", "aff_country_unique": "Sweden;" }, { "id": "3XD_rnM97s", "title": "Understanding Knowledge Integration in Language Models with Graph Convolutions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pretrained language models (LMs) are not very good at robustly capturing factual knowledge. This has led to the development of a number of knowledge integration (KI) methods which aim to incorporate external knowledge into pretrained LMs. Even though KI methods show some performance gains over base LMs, the efficacy and limitations of these methods are not well-understood. For instance, it is unclear how and what kind of knowledge is effectively integrated into LMs and if such integration may lead to catastrophic forgetting of already learned knowledge. In this paper, we revisit the KI process from the view of graph signal processing and show that KI could be interpreted using a graph convolution operation. We propose a simple probe model called Graph Convolution Simulator (GCS) for interpreting knowledge-enhanced LMs and exposing what kind of knowledge is integrated into these models. We conduct experiments to verify that our GCS model can indeed be used to correctly interpret the KI process, and we use it to analyze two typical knowledge-enhanced LMs: K-Adapter and ERNIE. We find that only a small amount of factual knowledge is captured in these models during integration. While K-Adapter is better at integrating simple relational knowledge, complex relational knowledge is integrated better in ERNIE. We further find that while K-Adapter struggles to integrate time-related knowledge, it successfully integrates knowledge of unpopular entities and relations. Our analysis also show some challenges in KI. 
In particular, we find simply increasing the size of the KI corpus may not lead to better KI and more fundamental advances may be needed.", "keywords": "knowledge integration;graph convolution;language model;interpretation;knowledge graph;mutual information", "primary_area": "", "supplementary_material": "", "author": "Yifan Hou;Guoji Fu;MRINMAYA SACHAN", "authorids": "~Yifan_Hou1;~Guoji_Fu1;~MRINMAYA_SACHAN2", "gender": "M;;M", "homepage": "https://yifan-h.github.io/;;https://sites.google.com/site/mrinsachan/", "dblp": ";;86/10440.html", "google_scholar": "Bm23WyIAAAAJ;;Tpp9ZjoAAAAJ", "orcid": "0000-0002-3197-4460;;", "linkedin": "yifanhou;;", "or_profile": "~Yifan_Hou1;~Guoji_Fu1;~MRINMAYA_SACHAN2", "aff": "Department of Computer Science, Swiss Federal Institute of Technology;;Swiss Federal Institute of Technology", "aff_domain": "inf.ethz.ch;;ethz.ch", "position": "PhD student;;Assistant Professor", "bibtex": "@misc{\nhou2022understanding,\ntitle={Understanding Knowledge Integration in Language Models with Graph Convolutions},\nauthor={Yifan Hou and Guoji Fu and MRINMAYA SACHAN},\nyear={2022},\nurl={https://openreview.net/forum?id=3XD_rnM97s}\n}", "github": "", "project": "", "reviewers": "4QcM;WDUZ;eLPV;Lpu3;pRFd", "site": "https://openreview.net/forum?id=3XD_rnM97s", "pdf_size": 0, "recommendation": "3;5;6;6;8", "confidence": "4;4;3;3;3", "correctness": "2;3;3;2;4", "technical_novelty": "3;2;3;3;4", "empirical_novelty": "2;3;4;3;3", "wc_summary_paper": "324;152;255;96;63", "wc_summary_review": "257;25;43;112;42", "wc_main_review": "972;250;257;562;234", "wc_review": "1553;427;555;770;339", "wc_reply_reviewers": "1124;0;47;0;0", "wc_reply_authors": "4110;904;688;994;583", "reply_reviewers": "2;0;1;0;0", "reply_authors": "8;3;3;3;2", "recommendation_avg": [ 5.6, 1.624807680927192 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 178.0, 97.84681905918046 ], "wc_summary_review_avg": [ 95.8, 85.95673330228412 ], "wc_main_review_avg": [ 455.0, 285.93985381544843 ], "wc_review_avg": [ 728.8, 436.8608016290773 ], "wc_reply_reviewers_avg": [ 234.2, 445.2722313371899 ], "wc_reply_authors_avg": [ 1455.8, 1335.2016177341907 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 3.8, 2.1354156504062622 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8040302522073698, "corr_recommendation_correctness": 0.7566444492037343, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10390178175970120797&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Swiss Federal Institute of Technology", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "3XcEQTRyxhp", "title": "Object-Aware Cropping for Self-Supervised Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A core component of the recent success of self-supervised learning is cropping data augmentation, which selects sub-regions of an image to be used as positive views in the self-supervised loss. 
The underlying assumption is that randomly cropped and resized regions of a given image share information about the objects of interest, which the learned representation will capture. This assumption is mostly satisfied in datasets such as ImageNet where there is a large, centered object, which is highly likely to be present in random crops of the full image. However, in other datasets such as OpenImages or COCO, which are more representative of real world uncurated data, there are typically multiple small objects in an image. In this work, we show that self-supervised learning based on the usual random cropping performs poorly on such datasets. We propose replacing one or both of the random crops with crops obtained from an object proposal algorithm. This encourages the model to learn both object and scene level semantic representations. Using this approach, which we call object-aware cropping, results in significant improvements over scene cropping on classification and object detection benchmarks. For example, on OpenImages, our approach achieves an improvement of 8.8% mAP over random scene-level cropping using MoCo-v2 based pre-training. We also show significant improvements on COCO and PASCAL-VOC object detection and segmentation tasks over the state-of-the-art self-supervised learning approaches.\nOur approach is efficient, simple and general, and can be used in most existing contrastive and non-contrastive self-supervised learning frameworks. ", "keywords": "Object cropping;Self-Supervised learning for multi-object dataset.", "primary_area": "", "supplementary_material": "", "author": "Shlok Kumar Mishra;Anshul Shah;Ankan Bansal;Abhyuday Narayan Jagannatha;Abhishek Sharma;David Jacobs;Dilip Krishnan", "authorids": "~Shlok_Kumar_Mishra1;~Anshul_Shah1;~Ankan_Bansal1;~Abhyuday_Narayan_Jagannatha1;~Abhishek_Sharma6;~David_Jacobs1;~Dilip_Krishnan1", "gender": "M;M;;M;M;;M", "homepage": "https://shlokk.github.io/shlokmishra.github.io/;;;https://people.cs.umass.edu/~abhyuday/;;;http://dilipkay.wordpress.com", "dblp": "173/6664;250/5430;;;;;08/2316", "google_scholar": "6XJ-4S0AAAAJ;akf8VG8AAAAJ;;v79KSqMAAAAJ;18fTep8AAAAJ;;_MEuWIMAAAAJ", "orcid": ";;;;;;", "linkedin": "shlokk/;;;;abhishek-sharma-a1204921;;", "or_profile": "~Shlok_Kumar_Mishra1;~Anshul_Shah1;~Ankan_Bansal1;~Abhyuday_Narayan_Jagannatha1;~Abhishek_Sharma6;~David_Jacobs1;~Dilip_Krishnan1", "aff": "University of Maryland, College Park;Microsoft;;;;;Google", "aff_domain": "umd.edu;microsoft.com;;;;;google.com", "position": "PhD student;Research intern;;;;;Research Scientist", "bibtex": "@misc{\nmishra2022objectaware,\ntitle={Object-Aware Cropping for Self-Supervised Learning},\nauthor={Shlok Kumar Mishra and Anshul Shah and Ankan Bansal and Abhyuday Narayan Jagannatha and Abhishek Sharma and David Jacobs and Dilip Krishnan},\nyear={2022},\nurl={https://openreview.net/forum?id=3XcEQTRyxhp}\n}", "github": "", "project": "", "reviewers": "x6Kd;N5yA;W5Fm;zJgS", "site": "https://openreview.net/forum?id=3XcEQTRyxhp", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "5;5;4;4", "correctness": "1;4;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "45;109;173;54", "wc_summary_review": "215;24;217;35", "wc_main_review": "1144;177;616;163", "wc_review": "1404;310;1006;252", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 4.5, 0.5 ], 
"correctness_avg": [ 2.75, 1.299038105676658 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.25, 51.13890397730479 ], "wc_summary_review_avg": [ 122.75, 93.33374255862667 ], "wc_main_review_avg": [ 525.0, 401.1203061426833 ], "wc_review_avg": [ 743.0, 483.3890772452352 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.7815036806726284, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6520287571289148702&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Maryland;Microsoft;Google", "aff_unique_dep": ";Microsoft Corporation;Google", "aff_unique_url": "https://www/umd.edu;https://www.microsoft.com;https://www.google.com", "aff_unique_abbr": "UMD;Microsoft;Google", "aff_campus_unique_index": "0;2", "aff_campus_unique": "College Park;;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "GDA-AM: ON THE EFFECTIVENESS OF SOLVING MIN-IMAX OPTIMIZATION VIA ANDERSON MIXING", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6099", "id": "3YqeuCVwy1d", "poster": "", "openreview": "https://openreview.net/forum?id=3YqeuCVwy1d", "slides": "https://iclr.cc/virtual/2022/poster/6099", "video": "https://iclr.cc/virtual/2022/poster/6099", "author_site": "Huan He, Shifan Zhao, Yuanzhe Xi, Joyce Ho, Yousef Saad", "tldr": "", "abstract": "Many modern machine learning algorithms such as generative adversarial networks (GANs) and adversarial training can be formulated as minimax optimization.Gradient descent ascent (GDA) is the most commonly used algorithm due to its simplicity. However, GDA can converge to non-optimal minimax points. We propose a new minimax optimization framework,GDA-AM, that views the GDA dynamics as a fixed-point iteration and solves it using Anderson Mixing to converge to the local minimax. It addresses the diverging issue of simultaneous GDA and accelerates the convergence of alternating GDA. We show theoretically that the algorithm can achieve global convergence for bilinear problems under mildconditions. 
We also empirically show that GDA-AM solves a variety of minimax problems and improves GAN training on several datasets", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c83c91081ac65ee2f2fd8403d50f994af4c99227.zip", "author": "Huan He;Shifan Zhao;Yuanzhe Xi;Joyce Ho;Yousef Saad", "authorids": "~Huan_He2;~Shifan_Zhao1;~Yuanzhe_Xi1;~Joyce_Ho1;~Yousef_Saad2", "gender": "M;M;M;F;M", "homepage": "https://hehuannb.github.io/;;http://www.math.emory.edu/~yxi26/;http://joyceho.github.io/;https://www.cs.umn.edu/~saad", "dblp": ";244/1502;;144/4961;", "google_scholar": "https://scholar.google.com/citations?hl=en;;;DrUBb5sAAAAJ;", "orcid": ";;;;", "linkedin": "huanheemory/;%E4%B8%96%E5%87%A1-%E8%B5%B5-b35742ba/;;;", "or_profile": "~Huan_He2;~Shifan_Zhao1;~Yuanzhe_Xi1;~Joyce_Ho1;~Yousef_Saad2", "aff": "Emory University;Emory University;;Emory University;, University of Minnesota, Minneapolis", "aff_domain": "emory.edu;emory.edu;;emory.edu;cs.umn.edu", "position": "PhD student;PhD student;;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhe2022gdaam,\ntitle={{GDA}-{AM}: {ON} {THE} {EFFECTIVENESS} {OF} {SOLVING} {MIN}-{IMAX} {OPTIMIZATION} {VIA} {ANDERSON} {MIXING}},\nauthor={Huan He and Shifan Zhao and Yuanzhe Xi and Joyce Ho and Yousef Saad},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3YqeuCVwy1d}\n}", "github": "", "project": "", "reviewers": "gBHU;KCYs;VjNk;t7gv", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;5;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;1;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "70;48;56;89", "wc_summary_review": "2;58;98;155", "wc_main_review": "202;397;442;219", "wc_review": "274;503;596;463", "wc_reply_reviewers": "0;139;279;0", "wc_reply_authors": "1022;1933;2253;546", "reply_reviewers": "0;1;2;0", "reply_authors": "2;5;5;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.75, 15.562374497485916 ], "wc_summary_review_avg": [ 78.25, 55.91231975155386 ], "wc_main_review_avg": [ 315.0, 105.87492621012777 ], "wc_review_avg": [ 459.0, 117.20281566583628 ], "wc_reply_reviewers_avg": [ 104.5, 115.62979719778116 ], "wc_reply_authors_avg": [ 1438.5, 685.1950452243507 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.5, 1.5 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.20751433915982243, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14668679221362678458&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=3YqeuCVwy1d", "email": "emory.edu;emory.edu;;emory.edu;cs.umn.edu", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Emory University;University of Minnesota", "aff_unique_dep": ";", "aff_unique_url": "https://www.emory.edu;https://www.minnesota.edu", "aff_unique_abbr": "Emory;UMN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "3ZuLmU7zBpy", "title": "Sanitizer: Sanitizing data for anonymizing sensitive information", "track": "main", "status": "Withdraw", "tldr": "", "abstract": 
"We propose a framework that protects against sensitive information leakage to facilitate data release with untrusted parties. Sanitization concerns with transforming a data sample to remove sensitive attribute information while retaining every other information with a goal of keeping its utility high for unknown downstream tasks. This is done in a two-step process: first, we develop a method that encodes unstructured image-like modality into a structured representation bifurcated by sensitive and non-sensitive representation. Second, we design mechanisms that transform the sensitive features such that the data obtained from projecting features back to the image protects from the leakage of sensitive information. Instead of removing sensitive information from the unstructured data, we replace the sensitive features by sampling synthetic features from the joint distribution of the sensitive features in its structured representation. Hence, using this method one can share a sanitized dataset that preserves distribution with the original dataset resulting in a good utility-privacy trade-off. We compare our technique against state-of-the-art baselines and demonstrate competitive empirical results quantitatively and qualitatively.", "keywords": "privacy preserving machine learning;private data release;privacy for computer vision", "primary_area": "", "supplementary_material": "", "author": "Abhishek Singh;Ethan Garza;Ayush Chopra;Praneeth Vepakomma;Vivek Sharma;Ramesh Raskar", "authorids": "~Abhishek_Singh5;ezg@mit.edu;~Ayush_Chopra1;~Praneeth_Vepakomma2;~Vivek_Sharma1;~Ramesh_Raskar1", "gender": "M;;M;;M;M", "homepage": "https://tremblerz.github.io/;;https://www.media.mit.edu/people/ayushc/overview/;https://praneeth.mit.edu/;https://vivoutlaw.github.io/;https://www.media.mit.edu/people/raskar/overview/", "dblp": "27/2328-5;;;131/6694;;r/RameshRaskar", "google_scholar": "https://scholar.google.co.in/citations?user=3QygpzAAAAAJ;;BVeYLpcAAAAJ;T_mPgZIAAAAJ;fNbVXwQAAAAJ;", "orcid": "0000-0003-0217-9801;;;;;0000-0002-3254-3224", "linkedin": "tremblerz/;;;;vivoutlaw/;", "or_profile": "~Abhishek_Singh5;ezg@mit.edu;~Ayush_Chopra1;~Praneeth_Vepakomma2;~Vivek_Sharma1;~Ramesh_Raskar1", "aff": "Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;;mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;;PhD student;PhD student;PhD Student and Research Scientist;Associate Professor", "bibtex": "@misc{\nsingh2022sanitizer,\ntitle={Sanitizer: Sanitizing data for anonymizing sensitive information},\nauthor={Abhishek Singh and Ethan Garza and Ayush Chopra and Praneeth Vepakomma and Vivek Sharma and Ramesh Raskar},\nyear={2022},\nurl={https://openreview.net/forum?id=3ZuLmU7zBpy}\n}", "github": "", "project": "", "reviewers": "27v9;7GLi;EDSg;7zWM", "site": "https://openreview.net/forum?id=3ZuLmU7zBpy", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;4;4;3", "correctness": "2;2;2;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "47;60;26;204", "wc_summary_review": "23;13;27;52", "wc_main_review": "683;392;180;771", "wc_review": "753;465;233;1027", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.25, 
0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 84.25, 70.19392780005974 ], "wc_summary_review_avg": [ 28.75, 14.359230480774379 ], "wc_main_review_avg": [ 506.5, 234.9601029962321 ], "wc_review_avg": [ 619.5, 298.8021920936993 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_k0RlQciNjoJ:scholar.google.com/&scioq=Sanitizer:+Sanitizing+data+for+anonymizing+sensitive+information&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "3aZMdP1BdSm", "title": "Identifying Interactions among Categorical Predictors with Monte-Carlo Tree Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Identifying interpretable interactions among categorical predictors for predictive modeling is crucial in various research fields. Recent studies have examined interpretable interactions using decision tree (DT) learning methods, which construct DTs by greedy rules due to the high memory and time complexity of building and evaluating DTs, resulting in a local optimal solution. This paper formulates the selection of quadratic and higher order interactive terms into a LASSO problem and then relaxes it into multiple DT learning problems. A Monte Carlo Tree Search-based interaction selection (MCTs-IS) method is proposed to identify the optimal DT in an online learning manner. A DT pruning strategy is developed based on LASSO that can easily be applied to MCTs. We prove that MCTs-IS converges with high probability to the optimal solution of the DT learning problem. 
Extensive experiments have been conducted to demonstrate the effectiveness of the proposed algorithm on real-world datasets.", "keywords": "interaction identification;Monte Carlo tree search;decision tree", "primary_area": "", "supplementary_material": "", "author": "Tan Zhu;Fei Do;Chloe Becquey;Jinbo Bi", "authorids": "~Tan_Zhu1;fei.dou@uconn.edu;~Chloe_Becquey1;~Jinbo_Bi1", "gender": ";;F;F", "homepage": "http://tanzhu.info;;;https://jinbo-bi.uconn.edu/", "dblp": "170/5347;;;26/3430", "google_scholar": "n4wTgx4AAAAJ;;;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-0062-947X;0000-0001-6996-4092", "linkedin": ";;;", "or_profile": "~Tan_Zhu1;fei.dou@uconn.edu;~Chloe_Becquey1;~Jinbo_Bi1", "aff": "University of Connecticut;;University of Connecticut;University of Connecticut", "aff_domain": "uconn.edu;;uconn.edu;uconn.edu", "position": "PhD student;;Undergrad student;Professor", "bibtex": "@misc{\nzhu2022identifying,\ntitle={Identifying Interactions among Categorical Predictors with Monte-Carlo Tree Search},\nauthor={Tan Zhu and Fei Do and Chloe Becquey and Jinbo Bi},\nyear={2022},\nurl={https://openreview.net/forum?id=3aZMdP1BdSm}\n}", "github": "", "project": "", "reviewers": "bdKu;RXyb;FrEv;SnRi", "site": "https://openreview.net/forum?id=3aZMdP1BdSm", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;3;3", "correctness": "2;2;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "62;249;113;26", "wc_summary_review": "46;144;2;72", "wc_main_review": "802;770;229;88", "wc_review": "910;1163;344;186", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 112.5, 84.65370635713477 ], "wc_summary_review_avg": [ 66.0, 51.51698748956503 ], "wc_main_review_avg": [ 472.25, 317.887067210983 ], "wc_review_avg": [ 650.75, 399.9058482943204 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FzciP07b7DoJ:scholar.google.com/&scioq=Identifying+Interactions+among+Categorical+Predictors+with+Monte-Carlo+Tree+Search&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Connecticut", "aff_unique_dep": "", "aff_unique_url": "https://www.uconn.edu", "aff_unique_abbr": "UConn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On the Importance of Difficulty Calibration in Membership Inference Attacks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7213", "id": "3eIrli0TwQ", "poster": "", "openreview": "https://openreview.net/forum?id=3eIrli0TwQ", "slides": "https://iclr.cc/virtual/2022/poster/7213", "video": "https://iclr.cc/virtual/2022/poster/7213", "author_site": "Lauren Watson, Chuan Guo, Graham Cormode, Alexandre Sablayrolles", "tldr": "", "abstract": "The vulnerability of machine learning models to membership inference attacks has 
received much attention in recent years. However, existing attacks mostly remain impractical due to having high false positive rates, where non-member samples are often erroneously predicted as members. This type of error makes the predicted membership signal unreliable, especially since most samples are non-members in real world applications. In this work, we argue that membership inference attacks can benefit drastically from difficulty calibration, where an attack's predicted membership score is adjusted to the difficulty of correctly classifying the target sample. We show that difficulty calibration can significantly reduce the false positive rate of a variety of existing attacks without a loss in accuracy.", "keywords": "membership inference attack;privacy", "primary_area": "", "supplementary_material": "", "author": "Lauren Watson;Chuan Guo;Graham Cormode;Alexandre Sablayrolles", "authorids": "~Lauren_Watson1;~Chuan_Guo1;~Graham_Cormode1;~Alexandre_Sablayrolles1", "gender": "F;M;M;", "homepage": "https://laurenwatson.github.io/;https://sites.google.com/view/chuanguo;http://dimacs.rutgers.edu/~graham/;", "dblp": ";;c/GrahamCormode;186/7749", "google_scholar": "BSSfZbMAAAAJ;0gp5M-kAAAAJ;https://scholar.google.co.uk/citations?user=gpLVKmEAAAAJ;Wy8wM-cAAAAJ", "orcid": ";;0000-0002-0698-0922;", "linkedin": ";;;", "or_profile": "~Lauren_Watson1;~Chuan_Guo1;~Graham_Cormode1;~Alexandre_Sablayrolles1", "aff": "University of Edinburgh;Meta;The university of Warwick;Meta Facebook", "aff_domain": "ed.ac.uk;meta.com;warwick.ac.uk;fb.com", "position": "PhD student;Researcher;Full Professor;Researcher", "bibtex": "@inproceedings{\nwatson2022on,\ntitle={On the Importance of Difficulty Calibration in Membership Inference Attacks},\nauthor={Lauren Watson and Chuan Guo and Graham Cormode and Alexandre Sablayrolles},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3eIrli0TwQ}\n}", "github": "", "project": "", "reviewers": "32Pd;QoUg;QzDd;2TZc", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "2;4;4;5", "correctness": "3;3;3;4", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "78;75;137;17", "wc_summary_review": "15;47;131;17", "wc_main_review": "69;156;211;411", "wc_review": "162;278;479;445", "wc_reply_reviewers": "0;0;93;0", "wc_reply_authors": "86;338;741;278", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 76.75, 42.440399385491176 ], "wc_summary_review_avg": [ 52.5, 47.06113045816048 ], "wc_main_review_avg": [ 211.75, 125.68487379155854 ], "wc_review_avg": [ 341.0, 128.32575735213877 ], "wc_reply_reviewers_avg": [ 23.25, 40.2701812759764 ], "wc_reply_authors_avg": [ 360.75, 238.454791312735 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.662266178532522, "corr_recommendation_correctness": 1.0, "gs_citation": 148, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2933122838404146328&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=3eIrli0TwQ", "email": "ed.ac.uk;meta.com;warwick.ac.uk;fb.com", "author_num": 4, 
"aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Edinburgh;Meta;University of Warwick", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.ed.ac.uk;https://meta.com;https://warwick.ac.uk", "aff_unique_abbr": "Edinburgh;Meta;Warwick", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "3iH9ewU_KJT", "title": "MT-GBM: A Multi-Task Gradient Boosting Machine with Shared Decision Trees", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the success of deep learning in computer vision and natural language processing, Gradient Boosted Decision Tree (GBDT) is yet one of the most powerful tools for applications with tabular data such as e-commerce and FinTech. However, applying GBDT to multi-task learning is still a challenge. Unlike deep models that can jointly learn a shared latent representation across multiple tasks, GBDT can hardly learn a shared tree structure.\n\nIn this paper, we propose Multi-Task Gradient Boosting Machine (MT-GBM), a GBDT-based method for multi-task learning. The MT-GBM can find the shared tree structures and split branches according to multi-task losses. First, it assigns multiple outputs to each leaf node. Next, it computes the gradient corresponding to each output (task). Then, we also propose an algorithm to combine the gradients of all tasks and update the tree. Finally, we apply MT-GBM to LightGBM. Experiments show that our MT-GBM improves the performance of the main task significantly, which means the proposed MT-GBM is efficient and effective.", "keywords": "Multi-Task learning;Classification", "primary_area": "", "supplementary_material": "", "author": "Zhenzhe Ying;Zhuoer Xu;LANQING XUE;Changhua Meng;Weiqiang Wang", "authorids": "~Zhenzhe_Ying3;~Zhuoer_Xu1;~LANQING_XUE2;~Changhua_Meng1;weiqiang.wwq@antgroup.com", "gender": "M;M;F;M;", "homepage": "https://www.linkedin.com/in/zhenzhe-ying-4446bb111/;https://github.com/Unkrible;;https://www.linkedin.com/in/changhua-meng-04826021/;", "dblp": "311/5504;276/7035;255/5215.html;295/9441;", "google_scholar": ";na24qQoAAAAJ;;;", "orcid": ";;;;", "linkedin": "zhenzhe-ying-4446bb111/;;;;", "or_profile": "~Zhenzhe_Ying3;~Zhuoer_Xu1;~LANQING_XUE2;~Changhua_Meng1;weiqiang.wwq@antgroup.com", "aff": "Alibaba Group;Nanjing University;Ant Group;Ant Group;", "aff_domain": "antgroup.com;nju.edu.cn;antgroup.com;antgroup.com;", "position": "MS student;MS student;Researcher;Researcher;", "bibtex": "@misc{\nying2022mtgbm,\ntitle={{MT}-{GBM}: A Multi-Task Gradient Boosting Machine with Shared Decision Trees},\nauthor={Zhenzhe Ying and Zhuoer Xu and LANQING XUE and Changhua Meng and Weiqiang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=3iH9ewU_KJT}\n}", "github": "", "project": "", "reviewers": "UURx;8oMi;tQ17;CBKT", "site": "https://openreview.net/forum?id=3iH9ewU_KJT", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;3;3", "correctness": "1;2;2;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "297;180;35;84", "wc_summary_review": "133;62;20;48", "wc_main_review": "1448;1024;225;108", "wc_review": "1878;1266;280;240", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], 
"empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 149.0, 100.1074422807815 ], "wc_summary_review_avg": [ 65.75, 41.66758332325022 ], "wc_main_review_avg": [ 701.25, 556.9027630565322 ], "wc_review_avg": [ 916.0, 690.9080980854111 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10324054203079629125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Alibaba Group;Nanjing University;Ant Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.alibaba.com;https://www.nju.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "Alibaba;Nanjing U;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "FlexConv: Continuous Kernel Convolutions With Differentiable Kernel Sizes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6763", "id": "3jooF27-0Wy", "poster": "", "openreview": "https://openreview.net/forum?id=3jooF27-0Wy", "slides": "https://iclr.cc/virtual/2022/poster/6763", "video": "https://iclr.cc/virtual/2022/poster/6763", "author_site": "David W. Romero, Robert-Jan Bruintjes, Jakub Tomczak, Erik Bekkers, Mark Hoogendoorn, Jan Gemert", "tldr": "", "abstract": "When designing Convolutional Neural Networks (CNNs), one must select the size of the convolutional kernels before training. Recent works show CNNs benefit from different kernel sizes at different layers, but exploring all possible combinations is unfeasible in practice. A more efficient approach is to learn the kernel size during training. However, existing works that learn the kernel size have a limited bandwidth. These approaches scale kernels by dilation, and thus the detail they can describe is limited. In this work, we propose FlexConv, a novel convolutional operation with which high bandwidth convolutional kernels of learnable kernel size can be learned at a fixed parameter cost. FlexNets model long-term dependencies without the use of pooling, achieve state-of-the-art performance on several sequential datasets, outperform recent works with learned kernel sizes, and are competitive with much deeper ResNets on image benchmark datasets. Additionally, FlexNets can be deployed at higher resolutions than those seen during training. To avoid aliasing, we propose a novel kernel parameterization with which the frequency of the kernels can be analytically controlled. Our novel kernel parameterization shows higher descriptive power and faster convergence speed than existing parameterizations. This leads to important improvements in classification accuracy.", "keywords": "Convolutional neural networks;learnable kernel size;continuous convolutional kernels;alias-free convolutional networks;implicit neural representations;resolution-agnostic representations;time series;sequential data;computer vision", "primary_area": "", "supplementary_material": "/attachment/04b4fb892858d3bdb2302dae5548c62f30e6e7ab.zip", "author": "David W. 
Romero;Robert-Jan Bruintjes;Jakub Mikolaj Tomczak;Erik J Bekkers;Mark Hoogendoorn;Jan van Gemert", "authorids": "~David_W._Romero1;~Robert-Jan_Bruintjes1;~Jakub_Mikolaj_Tomczak1;~Erik_J_Bekkers1;~Mark_Hoogendoorn2;~Jan_van_Gemert1", "gender": "M;M;M;;M;M", "homepage": "https://davidwromero.xyz/;https://rjbruintjes.nl;https://jmtomczak.github.io/;https://erikbekkers.bitbucket.io/;http://www.cs.vu.nl/~mhoogen;https://jvgemert.github.io/", "dblp": "254/1396;166/3241;80/8238;43/5596;19/1103.html;25/3153", "google_scholar": "7tdzmVoAAAAJ;RXVnqgcAAAAJ;https://scholar.google.pl/citations?user=XB99pR4AAAAJ;https://scholar.google.nl/citations?user=yeWrfR4AAAAJ;3s4lqHkAAAAJ;JUdMRGcAAAAJ", "orcid": ";0000-0002-9798-0214;0000-0001-8634-6878;;;0000-0002-3913-2786", "linkedin": "david-w-romero-05893567/;;jakub-tomczak-04305314a/;;;jan-van-gemert-1628b94/", "or_profile": "~David_W._Romero1;~Robert-Jan_Bruintjes1;~Jakub_Mikolaj_Tomczak1;~Erik_J_Bekkers1;~Mark_Hoogendoorn2;~Jan_C_van_Gemert1", "aff": "Vrije Universiteit Amsterdam;Delft University of Technology;Vrije Universiteit Amsterdam;University of Amsterdam;VU University Amsterdam;Delft University of Technology", "aff_domain": "vu.nl;tudelft.nl;vu.nl;uva.nl;vu.nl;tudelft.nl", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nromero2022flexconv,\ntitle={FlexConv: Continuous Kernel Convolutions With Differentiable Kernel Sizes},\nauthor={David W. Romero and Robert-Jan Bruintjes and Jakub Mikolaj Tomczak and Erik J Bekkers and Mark Hoogendoorn and Jan van Gemert},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3jooF27-0Wy}\n}", "github": "", "project": "", "reviewers": "hUMj;T41g;APZa;ub8r", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "97;82;47;58", "wc_summary_review": "18;26;22;45", "wc_main_review": "293;194;373;729", "wc_review": "408;302;442;832", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1721;951;757;1346", "reply_reviewers": "0;0;0;0", "reply_authors": "3;3;2;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.0, 19.63415391607186 ], "wc_summary_review_avg": [ 27.75, 10.353139620424328 ], "wc_main_review_avg": [ 397.25, 201.75774458493532 ], "wc_review_avg": [ 496.0, 200.7436175822285 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1193.75, 371.0966013048355 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1024278192039187692&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=3jooF27-0Wy", "email": "vu.nl;tudelft.nl;vu.nl;uva.nl;vu.nl;tudelft.nl", "author_num": 6, "aff_unique_index": "0;1;0;2;3;1", "aff_unique_norm": "Vrije Universiteit Amsterdam;Delft University of Technology;University of Amsterdam;VU University Amsterdam", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.vu.nl;https://www.tudelft.nl;https://www.uva.nl;https://www.vu.nl", "aff_unique_abbr": "VU Amsterdam;TU Delft;UvA;VU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amsterdam", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Netherlands" }, { "id": "3kK8x_92hnD", "title": "Topological Vanilla Transfer Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper we investigate the connection of topological similarity between source and target tasks with the efficiency of vanilla transfer learning (i.e., transfer learning without retraining) between them. We discuss that while it is necessary to have strong topological similarity between the source and target tasks, the other direction does not hold (i.e., it is not a sufficient condition). To this extent, we further investigate what can be done in order guarantee efficient feature representation transfer that is needed for such vanilla transfer learning. To answer this, we provide a matrix transformation based homeomorphism (i.e., topology preserving mapping) that significantly improves the transferability measures while keeping the topological properties of the source and target models intact. We prove that while finding such optimal matrix transformation is typically APX-hard, there exists an efficient randomised algorithm that achieves probably correct approximation guarantees. To demonstrate the effectiveness of our approach, we run a number of experiments on transferring features between ImageNet and a number of other datasets (CIFAR-10, CIFAR-100, MNIST, and ISIC 2019) with a variety of pre-trained models (ResNet50, EfficientNetB3, and InceptionV3). These numerical results show that our matrix transformation can increase the performance (measured by F-score) by up to 3-fold.", "keywords": "vanilla transfer learning;topological machine learning;linear homeomorphism", "primary_area": "", "supplementary_material": "/attachment/2e304f137f40ac6fb759fffcf0159d31ff9395dd.zip", "author": "Nicholas George Bishop;Lau Truong Nguyen;Hieu Trung Thai;Thomas Davies;Long Tran-Thanh", "authorids": "~Nicholas_George_Bishop1;~Lau_Truong_Nguyen1;~Hieu_Trung_Thai1;~Thomas_Davies1;~Long_Tran-Thanh1", "gender": "M;M;M;;", "homepage": "http://www.nickbishop.net;https://github.com/nguyentruonglau;;;https://warwick.ac.uk/fac/sci/dcs/people/long_tran-thanh/", "dblp": "294/1920.html;;;;46/8333", "google_scholar": "https://scholar.google.co.uk/citations?hl=en;;zRoycO4AAAAJ;;https://scholar.google.co.uk/citations?user=YBQai3gAAAAJ", "orcid": ";;;;", "linkedin": ";lautruongnguyen/;;;", "or_profile": "~Nicholas_George_Bishop1;~Lau_Truong_Nguyen1;~Hieu_Trung_Thai1;~Thomas_Davies1;~Long_Tran-Thanh1", "aff": "University of Southampton;FPT Software;FPT software;;", "aff_domain": "soton.ac.uk;fsoft.com.vn;fpt.com.vn;;", "position": "PhD student;Researcher;Researcher;;", "bibtex": "@misc{\nbishop2022topological,\ntitle={Topological Vanilla Transfer Learning},\nauthor={Nicholas George Bishop and Lau Truong Nguyen and Hieu Trung Thai and Thomas Davies and Long Tran-Thanh},\nyear={2022},\nurl={https://openreview.net/forum?id=3kK8x_92hnD}\n}", "github": "", "project": "", "reviewers": "DrXS;4YRa;uXFM", "site": "https://openreview.net/forum?id=3kK8x_92hnD", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;3;4", "correctness": "2;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "81;52;97", "wc_summary_review": "71;91;24", "wc_main_review": "514;359;185", 
"wc_review": "666;502;306", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 76.66666666666667, 18.62495339293199 ], "wc_summary_review_avg": [ 62.0, 28.083209693100727 ], "wc_main_review_avg": [ 352.6666666666667, 134.38832621259266 ], "wc_review_avg": [ 491.3333333333333, 147.1627972311239 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Hm6oV0QrvSAJ:scholar.google.com/&scioq=Topological+Vanilla+Transfer+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Southampton;FPT Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.southampton.ac.uk;https://www.fpt-software.com", "aff_unique_abbr": "Southampton;FPT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;Vietnam" }, { "id": "3kTt_W1_tgw", "title": "$f$-Mutual Information Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Self-supervised contrastive learning is an emerging field due to its power in providing good data representations. Such learning paradigm widely adopts the InfoNCE loss, which is closely connected with maximizing the mutual information. In this work, we propose the $f$-Mutual Information Contrastive Learning framework ($f$-MICL) , which directly maximizes the $f$-divergence-based generalization of mutual information. We theoretically prove that, under mild assumptions, our $f$-MICL naturally attains the alignment for positive pairs and the uniformity for data representations, the two main factors for the success of contrastive learning. We further provide theoretical guidance on designing the similarity function and choosing the effective $f$-divergences for $f$-MICL. 
Using several benchmark tasks from both vision and natural text, we empirically verify that our novel method outperforms or performs on par with state-of-the-art strategies.", "keywords": "contrastive learning;f-divergence;mutual information", "primary_area": "", "supplementary_material": "/attachment/b76736db8e7c4104843b0349a8aae13498f2deed.zip", "author": "Guojun Zhang;Yiwei Lu;Sun Sun;Hongyu Guo;Yaoliang Yu", "authorids": "~Guojun_Zhang1;~Yiwei_Lu1;~Sun_Sun1;~Hongyu_Guo1;~Yaoliang_Yu1", "gender": "M;M;F;M;M", "homepage": "https://gordon-guojun-zhang.github.io/;https://cs.uwaterloo.ca/~y485lu/;;https://hongyuharryguo.github.io/;https://cs.uwaterloo.ca/~y328yu/", "dblp": "56/4451;;;;90/4989", "google_scholar": "https://scholar.google.ca/citations?user=p8Y0xJEAAAAJ;ke0k9PkAAAAJ;2X_jP6kAAAAJ;https://scholar.google.ca/citations?user=bZUqlakAAAAJ;https://scholar.google.ca/citations?user=zbXIQMsAAAAJ", "orcid": ";;;;0000-0002-3823-0720", "linkedin": "guojun-zhang-bbb009a4/;;;harry-h-y-guo-a582087/;", "or_profile": "~Guojun_Zhang1;~Yiwei_Lu1;~Sun_Sun1;~Hongyu_Guo1;~Yaoliang_Yu1", "aff": "Huawei Technologies Ltd.;University of Waterloo;National Research Council Canada;National Research Council Canada;University of Waterloo", "aff_domain": "huawei.com;uwaterloo.ca;nrc-cnrc.gc.ca;nrc-cnrc.gc.ca;uwaterloo.ca", "position": "Researcher;PhD student;Researcher;Senior Research Officer;Associate Professor", "bibtex": "@misc{\nzhang2022fmutual,\ntitle={\\$f\\$-Mutual Information Contrastive Learning},\nauthor={Guojun Zhang and Yiwei Lu and Sun Sun and Hongyu Guo and Yaoliang Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=3kTt_W1_tgw}\n}", "github": "", "project": "", "reviewers": "qEh5;HSE4;fZBZ;Q5hx", "site": "https://openreview.net/forum?id=3kTt_W1_tgw", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;4;4", "correctness": "2;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "38;58;24;60", "wc_summary_review": "41;48;331;89", "wc_main_review": "742;101;72;165", "wc_review": "821;207;427;314", "wc_reply_reviewers": "0;0;19;0", "wc_reply_authors": "1156;128;599;239", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 45.0, 14.866068747318506 ], "wc_summary_review_avg": [ 127.25, 119.05539676973909 ], "wc_main_review_avg": [ 270.0, 274.57876829791485 ], "wc_review_avg": [ 442.25, 232.0962892852878 ], "wc_reply_reviewers_avg": [ 4.75, 8.227241335952167 ], "wc_reply_authors_avg": [ 530.5, 400.91177333672806 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13917018134924602661&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "Huawei;University of Waterloo;National Research Council Canada", "aff_unique_dep": "Huawei Technologies;;", "aff_unique_url": "https://www.huawei.com;https://uwaterloo.ca;https://www.nrc-cnrc.gc.ca", "aff_unique_abbr": "Huawei;UW;NRC-CNRC", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;Canada" }, { "id": "3mgYqlH60Uj", "title": "Learning Symmetric Locomotion using Cumulative Fatigue for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern deep reinforcement learning (DRL) methods allow simulated characters to learn complex skills such as locomotion from scratch. However, without further exploitation of domain-specific knowledge, such as motion capture data, finite state machines or morphological specifications, physics-based locomotion generation with DRL often results in unrealistic motions. One explanation for this is that present RL models do not estimate biomechanical effort; instead, they minimize instantaneous squared joint actuation torques as a proxy for the actual subjective cost of actions. To mitigate this discrepancy in a computationally efficient manner, we propose a method for mapping actuation torques to subjective effort without simulating muscles and their energy expenditure. Our approach is based on the Three Compartment Controller model, in which the relationships of variables such as maximum voluntary joint torques, recovery, and cumulative fatigue are present. We extend this method for sustained symmetric locomotion tasks for deep reinforcement learning using a Normalized Cumulative Fatigue (NCF) model.\nIn summary, in this paper we present the first RL model to use biomechanical cumulative effort for full-body movement generation without the use of any finite state machines, morphological specification or motion capture data. Our results show that the learned policies are more symmetric, periodic and robust compared to methods found in previous literature.", "keywords": "reinforcement learning;biomechanical model;cumulative fatigue;animation;bioinspired models;physics-based simulation;locomotion", "primary_area": "", "supplementary_material": "/attachment/cc42d9b6cf85abaeb2ffd4289a16456c98eaf33b.zip", "author": "Rui Xu;Noshaba Cheema;Erik Herrmann;Perttu H\u00e4m\u00e4l\u00e4inen;Philipp Slusallek", "authorids": "rui.xu@dfki.de;~Noshaba_Cheema1;~Erik_Herrmann1;~Perttu_H\u00e4m\u00e4l\u00e4inen1;philipp.slusallek@dfki.de", "gender": ";F;M;M;", "homepage": ";https://people.mpi-inf.mpg.de/~ncheema/;;http://perttu.info;", "dblp": ";;;;", "google_scholar": ";DnNYl1IAAAAJ;;https://scholar.google.fi/citations?user=i90uqXUAAAAJ;", "orcid": ";0000-0003-1275-4080;0000-0003-1052-9883;;", "linkedin": ";;;;", "or_profile": "rui.xu@dfki.de;~Noshaba_Cheema1;~Erik_Herrmann1;~Perttu_H\u00e4m\u00e4l\u00e4inen1;philipp.slusallek@dfki.de", "aff": ";German Research Center for AI;Saarland University;Aalto University;", "aff_domain": ";dfki.de;uni-saarland.de;aalto.fi;", "position": ";Researcher;PhD student;Assistant Professor;", "bibtex": "@misc{\nxu2022learning,\ntitle={Learning Symmetric Locomotion using Cumulative Fatigue for Reinforcement Learning},\nauthor={Rui Xu and Noshaba Cheema and Erik Herrmann and Perttu H{\\\"a}m{\\\"a}l{\\\"a}inen and Philipp Slusallek},\nyear={2022},\nurl={https://openreview.net/forum?id=3mgYqlH60Uj}\n}", "github": "", "project": "", "reviewers": "cgvV;rhvx;fKKh;VCud", "site": "https://openreview.net/forum?id=3mgYqlH60Uj", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "5;3;4;3", "correctness": "3;3;3;1", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;2;2;0", "wc_summary_paper": "93;43;107;75", "wc_summary_review": "72;56;85;94", "wc_main_review": "223;125;426;214", "wc_review": "388;224;618;383", 
"wc_reply_reviewers": "181;0;41;0", "wc_reply_authors": "766;171;563;90", "reply_reviewers": "1;0;1;0", "reply_authors": "3;2;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 79.5, 23.93219588754864 ], "wc_summary_review_avg": [ 76.75, 14.306903927824496 ], "wc_main_review_avg": [ 247.0, 110.21569761154714 ], "wc_review_avg": [ 403.25, 140.43748609256718 ], "wc_reply_reviewers_avg": [ 55.5, 74.36565067287451 ], "wc_reply_authors_avg": [ 397.5, 277.95728089042746 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J_LPGzFECzsJ:scholar.google.com/&scioq=Learning+Symmetric+Locomotion+using+Cumulative+Fatigue+for+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "German Research Center for Artificial Intelligence;Saarland University;Aalto University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.dfki.de/;https://www.uni-saarland.de;https://www.aalto.fi", "aff_unique_abbr": "DFKI;UdS;Aalto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Germany;Finland" }, { "id": "3mm5rjb7nR8", "title": "Learning Global Spatial Information for Multi-View Object-Centric Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, several studies have been working on multi-view object-centric models, which predict unobserved views of a scene and infer object-centric representations from several observation views. In general, multi-object scenes can be uniquely determined if both the properties of individual objects and the spatial arrangement of objects are specified; however, existing multi-view object-centric models only infer object-level representations and lack spatial information. This insufficient modeling can degrade novel-view synthesis quality and make it difficult to generate novel scenes. We can model both spatial information and object representations by introducing hierarchical probabilistic model, which contains a global latent variable on top of object-level latent variables. However, how to execute inference and training with that hierarchical multi-view object-centric model is unclear. Therefore, we introduce several crucial components which help inference and training with the proposed model. 
We show that the proposed method achieves good inference quality and can also generate novel scenes.", "keywords": "deep generative models;object-centric representation learning;segmentation", "primary_area": "", "supplementary_material": "/attachment/f9922126fc53b22f9e62fd24c750520fac25982d.zip", "author": "Yuya Kobayashi;Masahiro Suzuki;Yutaka Matsuo", "authorids": "~Yuya_Kobayashi1;~Masahiro_Suzuki1;~Yutaka_Matsuo1", "gender": ";M;M", "homepage": ";;http://ymatsuo.com", "dblp": ";;m/YMatsuo.html", "google_scholar": ";r2nt5kUAAAAJ;Dy8iau4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yuya_Kobayashi1;~Masahiro_Suzuki1;~Yutaka_Matsuo1", "aff": ";The University of Tokyo, Tokyo Institute of Technology;The University of Tokyo", "aff_domain": ";u-tokyo.ac.jp;u-tokyo.ac.jp", "position": ";Assistant Professor;Associate Professor", "bibtex": "@misc{\nkobayashi2022learning,\ntitle={Learning Global Spatial Information for Multi-View Object-Centric Models},\nauthor={Yuya Kobayashi and Masahiro Suzuki and Yutaka Matsuo},\nyear={2022},\nurl={https://openreview.net/forum?id=3mm5rjb7nR8}\n}", "github": "", "project": "", "reviewers": "TFws;Pvdr;RJ1B;KiHp", "site": "https://openreview.net/forum?id=3mm5rjb7nR8", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;5;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "73;135;70;112", "wc_summary_review": "51;54;45;18", "wc_main_review": "475;867;221;230", "wc_review": "599;1056;336;360", "wc_reply_reviewers": "80;133;48;164", "wc_reply_authors": "196;1386;212;533", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.5, 27.26261176043117 ], "wc_summary_review_avg": [ 42.0, 14.230249470757707 ], "wc_main_review_avg": [ 448.25, 262.36556081162786 ], "wc_review_avg": [ 587.75, 289.23725123157976 ], "wc_reply_reviewers_avg": [ 106.25, 45.090880452703516 ], "wc_reply_authors_avg": [ 581.75, 483.4027177209495 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3Ox3cYEhr_sJ:scholar.google.com/&scioq=Learning+Global+Spatial+Information+for+Multi-View+Object-Centric+Models&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "3pZTPQjeQDR", "title": "How BPE Affects Memorization in Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training data memorization in NLP can both be beneficial (e.g., closed-book QA) and undesirable (personal data extraction). In any case, successful model training requires a non-trivial amount of memorization to store word spellings, various linguistic idiosyncrasies and common knowledge. 
However, little is known about what affects the memorization behavior of NLP models, as the field tends to focus on the equally important question of generalization.\nIn this work, we demonstrate that the size of the subword vocabulary learned by Byte-Pair Encoding (BPE) greatly affects both the ability and the tendency of standard Transformer models to memorize training data, even when we control for the number of learned parameters. We find that with a large subword vocabulary size, Transformer models fit random mappings more easily and are more vulnerable to membership inference attacks. Similarly, given a prompt, Transformer-based language models with large subword vocabularies reproduce the training data more often. We conjecture that this effect is caused by the reduction in sequence length that occurs as the BPE vocabulary grows. Our findings allow a more informed choice of hyper-parameters, better tailored to a particular use case.", "keywords": "training data memorization;Byte-Pair Encoding;Transformers", "primary_area": "", "supplementary_material": "", "author": "Eugene Kharitonov;Marco Baroni;Dieuwke Hupkes", "authorids": "~Eugene_Kharitonov1;~Marco_Baroni1;~Dieuwke_Hupkes1", "gender": ";M;", "homepage": ";http://marcobaroni.org;https://github.com/google/BIG-bench", "dblp": "117/4229;http://dblp.uni-trier.de/pers/hd/b/Baroni:Marco;184/8838", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.nl/citations?user=tAtSMTcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Eugene_Kharitonov1;~Marco_Baroni1;~Dieuwke_Hupkes1", "aff": "Meta Facebook;Universitat Pompeu Fabra;Meta Facebook", "aff_domain": "fb.com;upf.edu;facebook.com", "position": "Research Engineer;Full Professor;Research Scientist", "bibtex": "@misc{\nkharitonov2022how,\ntitle={How {BPE} Affects Memorization in Transformers},\nauthor={Eugene Kharitonov and Marco Baroni and Dieuwke Hupkes},\nyear={2022},\nurl={https://openreview.net/forum?id=3pZTPQjeQDR}\n}", "github": "", "project": "", "reviewers": "WXGB;saPE;KAZC", "site": "https://openreview.net/forum?id=3pZTPQjeQDR", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "5;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "72;126;144", "wc_summary_review": "31;66;27", "wc_main_review": "234;639;133", "wc_review": "337;831;304", "wc_reply_reviewers": "38;83;0", "wc_reply_authors": "841;1568;139", "reply_reviewers": "1;1;0", "reply_authors": "2;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 114.0, 30.59411708155671 ], "wc_summary_review_avg": [ 41.333333333333336, 17.518244457961217 ], "wc_main_review_avg": [ 335.3333333333333, 218.6478650453484 ], "wc_review_avg": [ 490.6666666666667, 241.0288133997446 ], "wc_reply_reviewers_avg": [ 40.333333333333336, 33.9247533357118 ], "wc_reply_authors_avg": [ 849.3333333333334, 583.4165654906354 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5783421637508370193&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 
3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Meta;Universitat Pompeu Fabra", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.upf.edu/", "aff_unique_abbr": "Meta;UPF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Spain" }, { "title": "Practical Conditional Neural Process Via Tractable Dependent Predictions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6738", "id": "3pugbNqOh5m", "poster": "", "openreview": "https://openreview.net/forum?id=3pugbNqOh5m", "slides": "https://iclr.cc/virtual/2022/poster/6738", "video": "https://iclr.cc/virtual/2022/poster/6738", "author_site": "Stratis Markou, James Requeima, Wessel Bruinsma, Anna Vaughan, Richard E Turner", "tldr": "", "abstract": "Conditional Neural Processes (CNPs; Garnelo et al., 2018a) are meta-learning models which leverage the flexibility of deep learning to produce well-calibrated predictions and naturally handle off-the-grid and missing data. CNPs scale to large datasets and train with ease. Due to these features, CNPs appear well-suited to tasks from environmental sciences or healthcare. Unfortunately, CNPs do not produce correlated predictions, making them fundamentally inappropriate for many estimation and decision making tasks. Predicting heat waves or floods, for example, requires modelling dependencies in temperature or precipitation over time and space. Existing approaches which model output dependencies, such as Neural Processes (NPs; Garnelo et al., 2018b) or the FullConvGNP (Bruinsma et al., 2021), are either complicated to train or prohibitively expensive. What is needed is an approach which provides dependent predictions, but is simple to train and computationally tractable. In this work, we present a new class of Neural Process models that make correlated predictions and support exact maximum likelihood training that is simple and scalable. We extend the proposed models by using invertible output transformations, to capture non-Gaussian output distributions. Our models can be used in downstream estimation tasks which require dependent function samples. 
By accounting for output dependencies, our models show improved predictive performance on a range of experiments with synthetic and real data.", "keywords": "conditional neural processes;neural processes;meta-learning;convolutional conditional neural processes;Gaussian neural processes", "primary_area": "", "supplementary_material": "", "author": "Stratis Markou;James Requeima;Wessel Bruinsma;Anna Vaughan;Richard E Turner", "authorids": "~Stratis_Markou1;~James_Requeima1;~Wessel_Bruinsma1;av555@cam.ac.uk;~Richard_E_Turner1", "gender": "M;M;;;M", "homepage": ";http://jamesr.info;https://wessel.ai;;https://rich-turner-group.github.io/", "dblp": "300/3941;;242/3348.html;;40/5352", "google_scholar": ";https://scholar.google.ca/citations?hl=en;QRQwz3cAAAAJ;;https://scholar.google.co.uk/citations?user=DgLEyZgAAAAJ", "orcid": ";;;;", "linkedin": "stratos-m-85884b94/;;;;", "or_profile": "~Stratis_Markou1;~James_Requeima1;~Wessel_Bruinsma1;av555@cam.ac.uk;~Richard_E_Turner1", "aff": "University of Cambridge;University of Cambridge;Invenia Labs;;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk;invenialabs.co.uk;;cam.ac.uk", "position": "PhD student;MS student;Researcher;;Professor", "bibtex": "@inproceedings{\nmarkou2022practical,\ntitle={Practical Conditional Neural Process Via Tractable Dependent Predictions},\nauthor={Stratis Markou and James Requeima and Wessel Bruinsma and Anna Vaughan and Richard E Turner},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3pugbNqOh5m}\n}", "github": "", "project": "", "reviewers": "Z7Bg;WnVi;g5Qp", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;4;4", "correctness": "4;4;4", "technical_novelty": "2;3;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "54;138;61", "wc_summary_review": "108;30;37", "wc_main_review": "216;223;178", "wc_review": "378;391;276", "wc_reply_reviewers": "0;60;0", "wc_reply_authors": "2233;3530;553", "reply_reviewers": "0;1;0", "reply_authors": "5;7;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 84.33333333333333, 38.055515004033545 ], "wc_summary_review_avg": [ 58.333333333333336, 35.235714205271265 ], "wc_main_review_avg": [ 205.66666666666666, 19.770910168449223 ], "wc_review_avg": [ 348.3333333333333, 51.42200134399888 ], "wc_reply_reviewers_avg": [ 20.0, 28.284271247461902 ], "wc_reply_authors_avg": [ 2105.3333333333335, 1218.7032270227053 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 4.666666666666667, 2.0548046676563256 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13206280508408819392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=3pugbNqOh5m", "email": "cam.ac.uk;cam.ac.uk;invenialabs.co.uk;;cam.ac.uk", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Cambridge;Invenia Labs", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.invenia.ca", "aff_unique_abbr": "Cambridge;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": 
"Cambridge;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United Kingdom;Canada" }, { "id": "3r034NfDKnL", "title": "The Role of Learning Regime, Architecture and Dataset Structure on Systematic Generalization in Simple Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Humans often systematically generalize in situations where standard deep neural networks do not. Empirical studies have shown that the learning procedure and network architecture can influence systematicity in deep networks, but the underlying reasons for this influence remain unclear. Here we theoretically study the acquisition of systematic knowledge by simple neural networks. We introduce a minimal space of datasets with systematic and non-systematic features in both the input and output. For shallow and deep linear networks, we derive learning trajectories for all datasets in this space. The solutions reveal that both shallow and deep networks rely on non-systematic inputs to the same extent throughout learning, such that even with early stopping, no networks learn a fully systematic mapping. Turning to the impact of architecture, we show that modularity improves extraction of systematic structure, but only achieves perfect systematicity in the trivial setting where systematic mappings are fully segregated from non-systematic information. Finally, we analyze iterated learning, a procedure in which generations of networks learn from languages generated by earlier learners. Here we find that networks with output modularity successfully converge over generations to a fully systematic `language\u2019 starting from any dataset in our space. Our results contribute to clarifying the role of learning regime, architecture, and dataset structure in promoting systematic generalization, and provide theoretical support for empirical observations that iterated learning can improve systematicity.", "keywords": "Systematic Generalization;Iterated Learning;Linear Neural Networks", "primary_area": "", "supplementary_material": "/attachment/7c83e38f6d4ce7c4d61bf061858e116954ecb763.zip", "author": "Devon Jarvis;Richard Klein;Benjamin Rosman;Andrew M Saxe", "authorids": "~Devon_Jarvis1;~Richard_Klein1;~Benjamin_Rosman1;~Andrew_M_Saxe1", "gender": "M;M;M;M", "homepage": "https://jarvisdevon.github.io/;https://www.wits.ac.za/staff/academic-a-z-listing/k/richardkleinwitsacza/;http://www.raillab.org;https://www.saxelab.org", "dblp": "320/3650;26/8293;45/4591;39/6894", "google_scholar": "https://scholar.google.co.za/citations?user=MJjN5nEAAAAJ;https://scholar.google.co.za/citations?user=QZ_MjosAAAAJ;https://scholar.google.co.za/citations?user=pWJ0SocAAAAJ;h0Al1fcAAAAJ", "orcid": "0000-0003-2362-7538;0000-0003-0783-2072;;0000-0002-9831-8812", "linkedin": "devon-jarvis-6b059a139;;;", "or_profile": "~Devon_Jarvis1;~Richard_Klein1;~Benjamin_Rosman1;~Andrew_M_Saxe1", "aff": "University of Witwatersrand;University of the Witwatersrand;University of the Witwatersrand;Facebook AI", "aff_domain": "wits.ac.za;wits.ac.za;wits.ac.za;fb.com", "position": "PhD student;Associate Professor;Full Professor;Researcher", "bibtex": "@misc{\njarvis2022the,\ntitle={The Role of Learning Regime, Architecture and Dataset Structure on Systematic Generalization in Simple Neural Networks},\nauthor={Devon Jarvis and Richard Klein and Benjamin Rosman and Andrew M Saxe},\nyear={2022},\nurl={https://openreview.net/forum?id=3r034NfDKnL}\n}", "github": "", "project": "", "reviewers": "2WAW;9QCY;u1Hs;ZgRW", "site": 
"https://openreview.net/forum?id=3r034NfDKnL", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;3;4", "correctness": "4;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "51;82;128;70", "wc_summary_review": "54;48;41;23", "wc_main_review": "228;393;425;766", "wc_review": "333;523;594;859", "wc_reply_reviewers": "196;453;218;0", "wc_reply_authors": "1106;1986;1278;2096", "reply_reviewers": "1;1;1;0", "reply_authors": "3;4;3;4", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 82.75, 28.367014294775544 ], "wc_summary_review_avg": [ 41.5, 11.629703349613008 ], "wc_main_review_avg": [ 453.0, 195.56201062578592 ], "wc_review_avg": [ 577.25, 188.58999840924756 ], "wc_reply_reviewers_avg": [ 216.75, 160.64459997148987 ], "wc_reply_authors_avg": [ 1616.5, 430.5934857844461 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 0.5 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16034080553373169930&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of the Witwatersrand;Meta", "aff_unique_dep": ";Facebook AI", "aff_unique_url": "https://www.wits.ac.za;https://www.facebook.com", "aff_unique_abbr": "Wits;Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "South Africa;United States" }, { "title": "Unraveling Model-Agnostic Meta-Learning via The Adaptation Learning Rate", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6474", "id": "3rULBvOJ8D2", "poster": "", "openreview": "https://openreview.net/forum?id=3rULBvOJ8D2", "slides": "https://iclr.cc/virtual/2022/poster/6474", "video": "https://iclr.cc/virtual/2022/poster/6474", "author_site": "Yingtian Zou, Fusheng Liu, Qianxiao Li", "tldr": "", "abstract": "Model-Agnostic Meta-Learning (MAML) aims to find initial weights that allow fast adaptation to new tasks. The adaptation (inner loop) learning rate in MAML plays a central role in enabling such fast adaptation. However, how to choose this value in practice and how this choice affects the adaptation error remains less explored. In this paper, we study the effect of the adaptation learning rate in meta-learning with mixed linear regression. First, we present a principled way to estimate optimal adaptation learning rates that minimize the population risk of MAML. Second, we interpret the underlying dependence between the optimal adaptation learning rate and the input data. Finally, we prove that compared with empirical risk minimization (ERM), MAML produces an initialization with a smaller average distance to the task optima, consistent with previous practical findings. 
These results are corroborated with numerical experiments.", "keywords": "Meta-Learning;Learning rate;Optimization", "primary_area": "", "supplementary_material": "/attachment/6f15c6cd58af940add5e3ca0c1fe98353cde88c9.zip", "author": "Yingtian Zou;Fusheng Liu;Qianxiao Li", "authorids": "~Yingtian_Zou1;~Fusheng_Liu1;~Qianxiao_Li1", "gender": "M;;M", "homepage": ";https://mathematicallfs.github.io;https://blog.nus.edu.sg/qianxiaoli/", "dblp": "223/4047;;172/0930.html", "google_scholar": "APA-glsAAAAJ;;https://scholar.google.com.sg/citations?user=zLgReYoAAAAJ", "orcid": ";;0000-0002-3903-3737", "linkedin": ";;", "or_profile": "~Yingtian_Zou1;~Fusheng_Liu1;~Qianxiao_Li1", "aff": "National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "nus.edu.sg;u.nus.edu;nus.edu.sg", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzou2022unraveling,\ntitle={Unraveling Model-Agnostic Meta-Learning via The Adaptation Learning Rate},\nauthor={Yingtian Zou and Fusheng Liu and Qianxiao Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3rULBvOJ8D2}\n}", "github": "", "project": "", "reviewers": "ui6t;6tyw;pMe3;9gLM;iGwz", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "4;2;4;1;2", "correctness": "3;3;4;3;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;3;2;0;0", "wc_summary_paper": "58;109;114;143;137", "wc_summary_review": "55;37;46;54;16", "wc_main_review": "177;258;128;139;263", "wc_review": "290;404;288;336;416", "wc_reply_reviewers": "0;0;0;0;96", "wc_reply_authors": "412;479;304;73;806", "reply_reviewers": "0;0;0;0;1", "reply_authors": "2;1;1;1;2", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 2.6, 1.2 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 1.4, 1.2000000000000002 ], "wc_summary_paper_avg": [ 112.2, 30.049292836937113 ], "wc_summary_review_avg": [ 41.6, 14.347125147568763 ], "wc_main_review_avg": [ 193.0, 57.483910792499145 ], "wc_review_avg": [ 346.8, 54.51752011968262 ], "wc_reply_reviewers_avg": [ 19.2, 38.4 ], "wc_reply_authors_avg": [ 414.8, 239.27841524048927 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7484551991837489, "corr_recommendation_correctness": -0.408248290463863, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4381794533005845903&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=3rULBvOJ8D2", "email": "nus.edu.sg;u.nus.edu;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "3t0ZcNhBs5", "title": "Beyond Message Passing Paradigm: Training Graph Data with Consistency Constraints", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent years have witnessed great success in handling graph-related tasks with Graph Neural Networks (GNNs). However, most existing GNNs are based on powerful message passing to guide feature aggregation among neighbors. 
Despite their success, there still exist three weaknesses that limit their capacity to train graph data: weak generalization with severely limited labeled data, poor robustness to label noise and structure perturbation, and the high computation and memory burden of keeping the entire graph. In this paper, we propose a simple yet effective Graph Consistency Learning (GCL) framework, which is based purely on multilayer perceptrons, where structure information is only implicitly incorporated as prior knowledge in the computation of supervision signals but is not explicitly involved in the forward pass. Specifically, the GCL framework is optimized with three well-designed consistency constraints: neighborhood consistency, label consistency, and class-center consistency. More importantly, we provide a theoretical analysis of the connections between message passing and consistency constraints. Extensive experiments show that GCL produces encouraging performance with better generalization and robustness compared with other leading methods.", "keywords": "Graph Learning;Multilayer Perceptrons;Consistency Constraints", "primary_area": "", "supplementary_material": "", "author": "Lirong Wu;Stan Z. Li", "authorids": "~Lirong_Wu1;~Stan_Z._Li2", "gender": ";M", "homepage": ";https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "15/10330;l/StanZLi", "google_scholar": "Tk7TrCoAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";", "linkedin": ";stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Lirong_Wu1;~Stan_Z._Li1", "aff": "Westlake University;Westlake University", "aff_domain": "westlake.edu.cn;westlake.edu.cn", "position": "PhD student;Chair Professor", "bibtex": "@misc{\nwu2022beyond,\ntitle={Beyond Message Passing Paradigm: Training Graph Data with Consistency Constraints},\nauthor={Lirong Wu and Stan Z. 
Li},\nyear={2022},\nurl={https://openreview.net/forum?id=3t0ZcNhBs5}\n}", "github": "", "project": "", "reviewers": "TzsL;pLNU;UhuD;eQ4u", "site": "https://openreview.net/forum?id=3t0ZcNhBs5", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;5", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "126;92;61;105", "wc_summary_review": "74;54;71;19", "wc_main_review": "474;392;134;166", "wc_review": "674;538;266;290", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.0, 23.569047498785352 ], "wc_summary_review_avg": [ 54.5, 21.86892772862904 ], "wc_main_review_avg": [ 291.5, 144.88184841449257 ], "wc_review_avg": [ 442.0, 171.1139970896595 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.7777777777777777, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sVP1J1fTpAQJ:scholar.google.com/&scioq=Beyond+Message+Passing+Paradigm:+Training+Graph+Data+with+Consistency+Constraints&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Westlake University", "aff_unique_dep": "", "aff_unique_url": "https://www.westlake.edu.cn", "aff_unique_abbr": "WU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Large Learning Rate Tames Homogeneity: Convergence and Balancing Effect", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6679", "id": "3tbDrs77LJ5", "poster": "", "openreview": "https://openreview.net/forum?id=3tbDrs77LJ5", "slides": "https://iclr.cc/virtual/2022/poster/6679", "video": "https://iclr.cc/virtual/2022/poster/6679", "author_site": "Yuqing Wang, Minshuo Chen, Tuo Zhao, Molei Tao", "tldr": "", "abstract": "Recent empirical advances show that training deep models with large learning rate often improves generalization performance. However, theoretical justifications on the benefits of large learning rate are highly limited, due to challenges in analysis. In this paper, we consider using Gradient Descent (GD) with a large learning rate on a homogeneous matrix factorization problem, i.e., $\\min_{X, Y} \\|A - XY^\\top\\|_{\\sf F}^2$. We prove a convergence theory for constant large learning rates well beyond $2/L$, where $L$ is the largest eigenvalue of Hessian at the initialization. Moreover, we rigorously establish an implicit bias of GD induced by such a large learning rate, termed `balancing', meaning that magnitudes of $X$ and $Y$ at the limit of GD iterations will be close even if their initialization is significantly unbalanced. 
Numerical experiments are provided to support our theory.", "keywords": "large learning rate;gradient descent;matrix factorization;implicit regularization;convergence;balancing;alignment", "primary_area": "", "supplementary_material": "/attachment/7ba3e0b83326fa16426f54f6138a7355f584d7ce.zip", "author": "Yuqing Wang;Minshuo Chen;Tuo Zhao;Molei Tao", "authorids": "~Yuqing_Wang3;~Minshuo_Chen1;~Tuo_Zhao1;~Molei_Tao1", "gender": ";M;M;", "homepage": "https://yzwangyuqing.github.io;https://minshuochen.github.io;http://www2.isye.gatech.edu/~tzhao80;http://people.math.gatech.edu/~mtao8/", "dblp": ";217/1509;;56/9263", "google_scholar": "c7Bi9RUAAAAJ;qU9WvTgAAAAJ;EJXN6tYAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yuqing_Wang3;~Minshuo_Chen1;~Tuo_Zhao1;~Molei_Tao1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2022large,\ntitle={Large Learning Rate Tames Homogeneity: Convergence and Balancing Effect},\nauthor={Yuqing Wang and Minshuo Chen and Tuo Zhao and Molei Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3tbDrs77LJ5}\n}", "github": "", "project": "", "reviewers": "VWDy;Xmww;MuC1;C6VY", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;3;3;4", "correctness": "4;3;4;3", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;0;0", "wc_summary_paper": "104;45;146;117", "wc_summary_review": "20;29;78;24", "wc_main_review": "550;114;266;200", "wc_review": "674;188;490;341", "wc_reply_reviewers": "0;230;152;0", "wc_reply_authors": "1526;598;909;485", "reply_reviewers": "0;2;2;0", "reply_authors": "3;2;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 103.0, 36.776351096866584 ], "wc_summary_review_avg": [ 37.75, 23.45607597190971 ], "wc_main_review_avg": [ 282.5, 163.57490638848 ], "wc_review_avg": [ 423.25, 179.8879859801649 ], "wc_reply_reviewers_avg": [ 95.5, 99.40196175126525 ], "wc_reply_authors_avg": [ 879.5, 404.26012665114524 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.19245008972987526, "corr_recommendation_correctness": -0.19245008972987526, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4593169389761188484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=3tbDrs77LJ5", "email": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Uncanny Similarity of Recurrence and Depth", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6385", "id": "3wNcr5nq56", "poster": "", "openreview": "https://openreview.net/forum?id=3wNcr5nq56", "slides": 
"https://iclr.cc/virtual/2022/poster/6385", "video": "https://iclr.cc/virtual/2022/poster/6385", "author_site": "Avi Schwarzschild, Arjun Gupta, Amin Ghiasi, Micah Goldblum, Tom Goldstein", "tldr": "", "abstract": "It is widely believed that deep neural networks contain layer specialization, wherein networks extract hierarchical features representing edges and patterns in shallow layers and complete objects in deeper layers. Unlike common feed-forward models that have distinct filters at each layer, recurrent networks reuse the same parameters at various depths. In this work, we observe that recurrent models exhibit the same hierarchical behaviors and the same performance benefits as depth despite reusing the same filters at every recurrence. By training models of various feed-forward and recurrent architectures on several datasets for image classification as well as maze solving, we show that recurrent networks have the ability to closely emulate the behavior of non-recurrent deep models, often doing so with far fewer parameters.", "keywords": "Deep learning;recurrent networks;depth", "primary_area": "", "supplementary_material": "/attachment/0bbe14b33ac1c42b6c5e40aa0c3cee1208e971a0.zip", "author": "Avi Schwarzschild;Arjun Gupta;Amin Ghiasi;Micah Goldblum;Tom Goldstein", "authorids": "~Avi_Schwarzschild1;~Arjun_Gupta2;~Amin_Ghiasi1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "M;M;M;;M", "homepage": "https://cs.umd.edu/~avi1;https://github.com/Arjung27;http://cs.umd.edu/~amin;;https://www.cs.umd.edu/~tomg/", "dblp": "249/9334.html;;239/8313;241/7231;25/8184", "google_scholar": "WNvQ7AcAAAAJ;5pcsbisAAAAJ;tNQWOxUAAAAJ;pGDKzuUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;", "linkedin": ";arjung27/;;;", "or_profile": "~Avi_Schwarzschild1;~Arjun_Gupta2;~Amin_Ghiasi1;~Micah_Goldblum1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;Zipline International Inc;University of Maryland, College Park;New York University;University of Maryland, College Park", "aff_domain": "umd.edu;flyzipline.com;umd.edu;nyu.edu;umd.edu", "position": "PhD student;Professional;PhD student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nschwarzschild2022the,\ntitle={The Uncanny Similarity of Recurrence and Depth},\nauthor={Avi Schwarzschild and Arjun Gupta and Amin Ghiasi and Micah Goldblum and Tom Goldstein},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3wNcr5nq56}\n}", "github": "", "project": "", "reviewers": "9nUr;UD8X;Ej1w;opFT", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;4;5;4", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "245;194;69;93", "wc_summary_review": "360;97;351;59", "wc_main_review": "1071;840;502;635", "wc_review": "1676;1131;922;787", "wc_reply_reviewers": "218;0;114;0", "wc_reply_authors": "232;345;377;100", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 150.25, 72.06030460662791 ], "wc_summary_review_avg": [ 216.75, 139.43524482712397 ], "wc_main_review_avg": [ 762.0, 215.22894786714915 ], "wc_review_avg": [ 1129.0, 338.75728774448527 ], "wc_reply_reviewers_avg": [ 83.0, 90.77995373429091 ], "wc_reply_authors_avg": [ 263.5, 108.68417548106991 ], "reply_reviewers_avg": [ 0.5, 0.5 ], 
"reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15030809144030367999&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=3wNcr5nq56", "email": "umd.edu;flyzipline.com;umd.edu;nyu.edu;umd.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "University of Maryland;Zipline International Inc;New York University", "aff_unique_dep": ";;", "aff_unique_url": "https://www/umd.edu;https://www.zipline.com;https://www.nyu.edu", "aff_unique_abbr": "UMD;;NYU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Information Geometry of Unsupervised Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6206", "id": "3wU2UX0voE", "poster": "", "openreview": "https://openreview.net/forum?id=3wU2UX0voE", "slides": "https://iclr.cc/virtual/2022/poster/6206", "video": "https://iclr.cc/virtual/2022/poster/6206", "author_site": "Benjamin Eysenbach, Ruslan Salakhutdinov, Sergey Levine", "tldr": "", "abstract": "How can a reinforcement learning (RL) agent prepare to solve downstream tasks if those tasks are not known a priori? One approach is unsupervised skill discovery, a class of algorithms that learn a set of policies without access to a reward function. Such algorithms bear a close resemblance to representation learning algorithms (e.g., contrastive learning) in supervised learning, in that both are pretraining algorithms that maximize some approximation to a mutual information objective. While prior work has shown that the set of skills learned by such methods can accelerate downstream RL tasks, prior work offers little analysis into whether these skill learning algorithms are optimal, or even what notion of optimality would be appropriate to apply to them. In this work, we show that unsupervised skill discovery algorithms based on mutual information maximization do not learn skills that are optimal for every possible reward function. However, we show that the distribution over skills provides an optimal initialization minimizing regret against adversarially-chosen reward functions, assuming a certain type of adaptation procedure. 
Our analysis also provides a geometric perspective on these skill learning methods.", "keywords": "unsupervised skill learning;reward-free RL;mutual information;DIAYN", "primary_area": "", "supplementary_material": "/attachment/70a8d05ec738c35895783c0cb17f52610301f6df.zip", "author": "Benjamin Eysenbach;Ruslan Salakhutdinov;Sergey Levine", "authorids": "~Benjamin_Eysenbach1;~Ruslan_Salakhutdinov1;~Sergey_Levine1", "gender": "M;M;M", "homepage": "https://ben-eysenbach.github.io/;https://people.eecs.berkeley.edu/~svlevine/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "192/1863;80/7594;", "google_scholar": "DRnOvU8AAAAJ;8R35rCwAAAAJ;", "orcid": "0009-0000-7136-6307;;", "linkedin": "benjamin-eysenbach-a7235775/;;", "or_profile": "~Benjamin_Eysenbach1;~Sergey_Levine1;~Russ_Salakhutdinov1", "aff": "Carnegie Mellon University;Google;School of Computer Science, Carnegie Mellon University", "aff_domain": "cmu.edu;google.com;cs.cmu.edu", "position": "PhD student;Research Scientist;Full Professor", "bibtex": "@inproceedings{\neysenbach2022the,\ntitle={The Information Geometry of Unsupervised Reinforcement Learning},\nauthor={Benjamin Eysenbach and Ruslan Salakhutdinov and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=3wU2UX0voE}\n}", "github": "", "project": "", "reviewers": "tCVb;Xkef;r6uL", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;3;4", "correctness": "4;4;4", "technical_novelty": "4;4;4", "empirical_novelty": "0;0;0", "wc_summary_paper": "192;69;165", "wc_summary_review": "42;35;70", "wc_main_review": "73;147;840", "wc_review": "307;251;1075", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 4.0, 0.0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 142.0, 52.78257288158659 ], "wc_summary_review_avg": [ 49.0, 15.121728296285006 ], "wc_main_review_avg": [ 353.3333333333333, 345.4488223874687 ], "wc_review_avg": [ 544.3333333333334, 375.9338003189155 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1840572653029125797&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=3wU2UX0voE", "email": "cmu.edu;google.com;cs.cmu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "3z9RnbAS49", "title": "A Theoretical and Empirical Model of the Generalization Error under Time-Varying Learning Rate", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stochastic gradient descent is commonly employed as the most principled optimization algorithm for deep learning, and the dependence of the generalization error of neural networks on the given hyperparameters is crucial. 
\nHowever, the case in which the batch size and learning rate vary with time has not yet been analyzed, nor the dependence of them on the generalization error as a functional form for both the constant and time-varying cases has been expressed. \nIn this study, we analyze the generalization bound for the time-varying case by applying PAC-Bayes and experimentally show that the theoretical functional form for the batch size and learning rate approximates the generalization error well for both cases. \nWe also experimentally show that hyperparameter optimization based on the proposed model outperforms the existing libraries.", "keywords": "deep learning;generalization error;stochastic gradient descent;functional form;hyperparameter;batch size;learning rate", "primary_area": "", "supplementary_material": "", "author": "Toru Makuuchi;YUSUKE Mukuta;Tatsuya Harada", "authorids": "~Toru_Makuuchi1;~YUSUKE_Mukuta1;~Tatsuya_Harada1", "gender": "M;;M", "homepage": "https://www.mi.t.u-tokyo.ac.jp/members/;https://www.mi.t.u-tokyo.ac.jp/mukuta/;https://www.mi.t.u-tokyo.ac.jp/harada/", "dblp": ";153/5464;14/5849", "google_scholar": ";https://scholar.google.co.jp/citations?user=emo91rIAAAAJ;https://scholar.google.com/citations?hl=ja", "orcid": ";;", "linkedin": ";;", "or_profile": "~Toru_Makuuchi1;~YUSUKE_Mukuta1;~Tatsuya_Harada1", "aff": "The University of Tokyo, The University of Tokyo;The University of Tokyo;The University of Tokyo", "aff_domain": "t.u-tokyo.ac.jp;u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "MS student;Lecturer;Full Professor", "bibtex": "@misc{\nmakuuchi2022a,\ntitle={A Theoretical and Empirical Model of the Generalization Error under Time-Varying Learning Rate},\nauthor={Toru Makuuchi and YUSUKE Mukuta and Tatsuya Harada},\nyear={2022},\nurl={https://openreview.net/forum?id=3z9RnbAS49}\n}", "github": "", "project": "", "reviewers": "8e8R;GHoM;P32Z;sp7W", "site": "https://openreview.net/forum?id=3z9RnbAS49", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;4;4", "correctness": "2;2;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "131;63;98;140", "wc_summary_review": "53;52;71;68", "wc_main_review": "185;480;292;256", "wc_review": "369;595;461;464", "wc_reply_reviewers": "0;0;0;111", "wc_reply_authors": "679;808;307;586", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 108.0, 30.32325840011261 ], "wc_summary_review_avg": [ 61.0, 8.573214099741124 ], "wc_main_review_avg": [ 303.25, 109.06735304388752 ], "wc_review_avg": [ 472.25, 80.5027173454412 ], "wc_reply_reviewers_avg": [ 27.75, 48.064409910036346 ], "wc_reply_authors_avg": [ 595.0, 184.01766219577945 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7EVbm-T06KwJ:scholar.google.com/&scioq=A+Theoretical+and+Empirical+Model+of+the+Generalization+Error+under+Time-Varying+Learning+Rate&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": 
"https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "3zJVXU311-Q", "title": "Hopular: Modern Hopfield Networks for Tabular Data", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While Deep Learning excels in structured data as encountered in vision and natural language processing, it failed to meet its expectations on tabular data. In real world, however, most machine learning applications face tabular data with less than 10,000 samples. For tabular data, Support Vector Machines (SVMs), Random Forests, and Gradient Boosting are the best performing techniques, where Gradient Boosting has the lead. Recently, we saw a surge of Deep Learning methods that were tailored to tabular data. However, these methods still underperform compared to Gradient Boosting. We suggest \"Hopular\" to learn from tabular data with hundreds or thousands of samples. Hopular is a Deep Learning architecture, where each layer is equipped with continuous modern Hopfield networks. The modern Hopfield networks can store two types of data: (i) the whole training set and (ii) the feature embedding vectors of the actual input. The stored data allow the identification of feature-feature, feature-target, sample-sample, and sample-target dependencies. The stored training set enables to find similarities across input vectors and targets, while the stored actual input enables to determine dependencies between features and targets. Hopular's novelty is that the original training set and the original input are provided at each layer. Therefore, Hopular can improve the current prediction at every layer by re-accessing the original training set like standard iterative learning algorithms. In experiments on small-sized tabular datasets with less than 1,000 samples, Hopular surpasses Gradient Boosting, Random Forests, SVMs, and in particular several Deep Learning methods. In experiments on medium-sized tabular data with about 10,000 samples, Hopular outperforms XGBoost and a state-of-the art Deep Learning method designed for tabular data. 
Although Hopular needs more training time than Gradient Boosting, Random Forests, and SVMs, it is a strong alternative to these methods on small-sized and medium-sized tabular datasets as it yields higher performance.", "keywords": "deep learning;tabular data;gradient boosting;Hopfield networks;associative memory", "primary_area": "", "supplementary_material": "", "author": "Bernhard Sch\u00e4fl;Lukas Gruber;Angela Bitto-Nemling;Sepp Hochreiter", "authorids": "~Bernhard_Sch\u00e4fl1;~Lukas_Gruber2;~Angela_Bitto-Nemling1;~Sepp_Hochreiter1", "gender": ";Not Specified;;M", "homepage": ";https://www.jku.at/en/institute-for-machine-learning/;https://www.jku.at/institut-fuer-machine-learning/ueber-uns/team/di-dr-angela-bitto-nemling;https://www.jku.at/en/institute-for-machine-learning/about-us/team/sepp-hochreiter/", "dblp": "271/0980;18/7703;;h/SeppHochreiter.html", "google_scholar": ";;;https://scholar.google.at/citations?user=tvUH3WMAAAAJ", "orcid": "0000-0002-6008-1290;;;0000-0001-7449-2528", "linkedin": ";;;https://linkedin.com/in/sepp-hochreiter-41514846", "or_profile": "~Bernhard_Sch\u00e4fl1;~Lukas_Gruber2;~Angela_Bitto-Nemling1;~Sepp_Hochreiter1", "aff": "Johannes Kepler University Linz;Johannes Kepler University Linz;Johannes Kepler University Linz;Johannes Kepler University Linz", "aff_domain": "jku.at;jku.at;jku.at;jku.at", "position": "PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@misc{\nsch{\\\"a}fl2022hopular,\ntitle={Hopular: Modern Hopfield Networks for Tabular Data},\nauthor={Bernhard Sch{\\\"a}fl and Lukas Gruber and Angela Bitto-Nemling and Sepp Hochreiter},\nyear={2022},\nurl={https://openreview.net/forum?id=3zJVXU311-Q}\n}", "github": "", "project": "", "reviewers": "6m2C;wdX3;wQkj;ZM7R", "site": "https://openreview.net/forum?id=3zJVXU311-Q", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;3;4", "correctness": "2;3;2;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "168;82;108;46", "wc_summary_review": "95;449;51;29", "wc_main_review": "274;1868;431;435", "wc_review": "537;2399;590;510", "wc_reply_reviewers": "0;236;41;166", "wc_reply_authors": "466;1656;580;655", "reply_reviewers": "0;2;1;1", "reply_authors": "1;3;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 101.0, 44.50842616853577 ], "wc_summary_review_avg": [ 156.0, 170.82447131485583 ], "wc_main_review_avg": [ 752.0, 647.5859016377673 ], "wc_review_avg": [ 1009.0, 803.0326892474552 ], "wc_reply_reviewers_avg": [ 110.75, 94.69787484415899 ], "wc_reply_authors_avg": [ 839.25, 476.3283400134827 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14791030849099418064&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Johannes Kepler University", "aff_unique_dep": "", "aff_unique_url": "https://www.jku.at", "aff_unique_abbr": "JKU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Linz", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Austria" }, { "title": "Value Gradient weighted Model-Based 
Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6704", "id": "4-D6CZkRXxI", "poster": "", "openreview": "https://openreview.net/forum?id=4-D6CZkRXxI", "slides": "https://iclr.cc/virtual/2022/poster/6704", "video": "https://iclr.cc/virtual/2022/poster/6704", "author_site": "Claas Voelcker, Victor Liao, Animesh Garg, Amir-massoud Farahmand", "tldr": "", "abstract": "Model-based reinforcement learning (MBRL) is a sample efficient technique to obtain control policies, yet unavoidable modeling errors often lead performance deterioration. The model in MBRL is often solely fitted to reconstruct dynamics, state observations in particular, while the impact of model error on the policy is not captured by the training objective. This leads to a mismatch between the intended goal of MBRL, enabling good policy and value learning, and the target of the loss function employed in practice, future state prediction. Naive intuition would suggest that value-aware model learning would fix this problem and, indeed, several solutions to this objective mismatch problem have been proposed based on theoretical analysis. However, they tend to be inferior in practice to commonly used maximum likelihood (MLE) based approaches. In this paper we propose the Value-gradient weighted Model Learning (VaGraM), a novel method for value-aware model learning which improves the performance of MBRL in challenging settings, such as small model capacity and the presence of distracting state dimensions. We analyze both MLE and value-aware approaches and demonstrate how they fail to account for exploration and the behavior of function approximation when learning value-aware models and highlight the additional goals that must be met to stabilize optimization in the deep learning setting. 
We verify our analysis by showing that our loss function is able to achieve high returns on the Mujoco benchmark suite while being more robust than maximum likelihood based approaches.\n", "keywords": "model-based reinforcement learning;reinforcment learning;objective mismatch;value function;sensitivity", "primary_area": "", "supplementary_material": "/attachment/1a681542947089632a1944c54ae8b9062ca8e0cd.zip", "author": "Claas A Voelcker;Victor Liao;Animesh Garg;Amir-massoud Farahmand", "authorids": "~Claas_A_Voelcker1;~Victor_Liao1;~Animesh_Garg1;~Amir-massoud_Farahmand1", "gender": "M;;M;M", "homepage": ";https://victorliao.com;http://animesh.garg.tech;http://academic.sologen.net/", "dblp": "250/2358;;123/5728;17/671", "google_scholar": "UZq8qZ8AAAAJ;;zp8V7ZMAAAAJ;https://scholar.google.ca/citations?user=G5SAV7gAAAAJ", "orcid": ";;0000-0003-0482-4296;", "linkedin": ";;animeshgarg/;amir-massoud-farahmand/", "or_profile": "~Claas_A_Voelcker1;~Victor_Liao1;~Animesh_Garg1;~Amir-massoud_Farahmand1", "aff": "Toronto University;University of Waterloo;University of Toronto;Vector Institute", "aff_domain": "utoronto.ca;uwaterloo.ca;toronto.edu;vectorinstitute.ai", "position": "PhD student;Undergrad student;Assistant Professor;Faculty Member", "bibtex": "@inproceedings{\nvoelcker2022value,\ntitle={Value Gradient weighted Model-Based Reinforcement Learning},\nauthor={Claas A Voelcker and Victor Liao and Animesh Garg and Amir-massoud Farahmand},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4-D6CZkRXxI}\n}", "github": "", "project": "", "reviewers": "kWH6;VsMY;dE85;WgYP", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;5;5;5", "correctness": "3;4;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "4;2;3;2", "wc_summary_paper": "144;76;47;142", "wc_summary_review": "53;55;39;42", "wc_main_review": "991;154;1142;499", "wc_review": "1188;285;1228;683", "wc_reply_reviewers": "0;0;275;212", "wc_reply_authors": "1052;367;2131;1013", "reply_reviewers": "0;0;4;2", "reply_authors": "2;1;3;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 102.25, 42.02603359823527 ], "wc_summary_review_avg": [ 47.25, 6.869315832017043 ], "wc_main_review_avg": [ 696.5, 393.2279873050747 ], "wc_review_avg": [ 846.0, 388.6444390442246 ], "wc_reply_reviewers_avg": [ 121.75, 123.77070533854123 ], "wc_reply_authors_avg": [ 1140.75, 633.1431019129877 ], "reply_reviewers_avg": [ 1.5, 1.6583123951777 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8520603595405446141&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=4-D6CZkRXxI", "email": "utoronto.ca;uwaterloo.ca;toronto.edu;vectorinstitute.ai", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Toronto;University of Waterloo;Vector Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utoronto.ca;https://uwaterloo.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "U of T;UW;Vector Institute", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": 
"Canada" }, { "title": "GreaseLM: Graph REASoning Enhanced Language Models", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5938", "id": "41e9o6cQPj", "poster": "", "openreview": "https://openreview.net/forum?id=41e9o6cQPj", "slides": "https://iclr.cc/virtual/2022/poster/5938", "video": "https://iclr.cc/virtual/2022/poster/5938", "author_site": "Xikun Zhang, Antoine Bosselut, Michihiro Yasunaga, Hongyu Ren, Percy Liang, Christopher Manning, Jure Leskovec", "tldr": "", "abstract": "Answering complex questions about textual narratives requires reasoning over both stated context and the world knowledge that underlies it. However, pretrained language models (LM), the foundation of most modern QA systems, do not robustly represent latent relationships between concepts, which is necessary for reasoning. While knowledge graphs (KG) are often used to augment LMs with structured representations of world knowledge, it remains an open question how to effectively fuse and reason over the KG representations and the language context, which provides situational constraints and nuances. In this work, we propose GreaseLM, a new model that fuses encoded representations from pretrained LMs and graph neural networks over multiple layers of modality interaction operations. Information from both modalities propagates to the other, allowing language context representations to be grounded by structured world knowledge, and allowing linguistic nuances (e.g., negation, hedging) in the context to inform the graph representations of knowledge. Our results on three benchmarks in the commonsense reasoning (i.e., CommonsenseQA, OpenbookQA) and medical question answering (i.e., MedQA-USMLE) domains demonstrate that GreaseLM can more reliably answer questions that require reasoning over both situational constraints and structured knowledge, even outperforming models 8x larger.", "keywords": "language models;commonsense;question answering;knowledge graphs;KG augmentation", "primary_area": "", "supplementary_material": "", "author": "Xikun Zhang;Antoine Bosselut;Michihiro Yasunaga;Hongyu Ren;Percy Liang;Christopher D Manning;Jure Leskovec", "authorids": "~Xikun_Zhang1;~Antoine_Bosselut1;~Michihiro_Yasunaga1;~Hongyu_Ren1;~Percy_Liang1;~Christopher_D_Manning1;~Jure_Leskovec1", "gender": "M;M;;;;M;", "homepage": "https://xikunzhang.github.io/;https://atcbosselut.github.io/;;;https://cs.stanford.edu/~pliang/;https://nlp.stanford.edu/~manning/;http://cs.stanford.edu/~jure/", "dblp": "38/326-1;184/3742;202/1809;30/10885;04/1701;m/ChristopherDManning;l/JureLeskovec", "google_scholar": "EA_bUQMAAAAJ;XD9hkJwAAAAJ;SieJYoEAAAAJ;;pouyVyUAAAAJ;1zmDOdwAAAAJ;Q_kKkIUAAAAJ", "orcid": "0000-0002-8346-8594;;;;;0000-0001-6155-649X;0000-0002-5411-923X", "linkedin": "xikun/;;;;;christopher-manning-011575/;leskovec/", "or_profile": "~Xikun_Zhang1;~Antoine_Bosselut1;~Michihiro_Yasunaga1;~Hongyu_Ren1;~Percy_Liang1;~Christopher_D_Manning1;~Jure_Leskovec1", "aff": "Stanford University;Swiss Federal Institute of Technology Lausanne;Stanford University;Computer Science Department, Stanford University;Stanford University;Computer Science Department, Stanford University;Kumo.AI", "aff_domain": "stanford.edu;epfl.ch;stanford.edu;cs.stanford.edu;stanford.edu;cs.stanford.edu;kumo.ai", "position": "PhD student;Assistant Professor;PhD student;PhD student;Associate Professor;Full Professor;Chief Scientist", "bibtex": "@inproceedings{\nzhang2022greaselm,\ntitle={Grease{LM}: Graph {REAS}oning Enhanced Language 
Models},\nauthor={Xikun Zhang and Antoine Bosselut and Michihiro Yasunaga and Hongyu Ren and Percy Liang and Christopher D Manning and Jure Leskovec},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=41e9o6cQPj}\n}", "github": "", "project": "", "reviewers": "RNQN;99q1;ZyPu;NQbJ", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;3", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "35;106;154;147", "wc_summary_review": "47;46;13;85", "wc_main_review": "299;282;135;491", "wc_review": "381;434;302;723", "wc_reply_reviewers": "173;19;12;73", "wc_reply_authors": "777;672;182;675", "reply_reviewers": "1;1;1;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 110.5, 47.28900506460249 ], "wc_summary_review_avg": [ 47.75, 25.488968201949643 ], "wc_main_review_avg": [ 301.75, 126.50963402049663 ], "wc_review_avg": [ 460.0, 158.94181325252333 ], "wc_reply_reviewers_avg": [ 69.25, 64.38313055451715 ], "wc_reply_authors_avg": [ 576.5, 231.6532969763219 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=41e9o6cQPj", "email": "stanford.edu;epfl.ch;stanford.edu;cs.stanford.edu;stanford.edu;cs.stanford.edu;kumo.ai", "author_num": 7, "aff_unique_index": "0;1;0;0;0;0;2", "aff_unique_norm": "Stanford University;Swiss Federal Institute of Technology Lausanne;Kumo.AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.epfl.ch;https://www.kumo.ai", "aff_unique_abbr": "Stanford;EPFL;Kumo.AI", "aff_campus_unique_index": "0;1;0;0;0;0", "aff_campus_unique": "Stanford;Lausanne;", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Graph-Augmented Normalizing Flows for Anomaly Detection of Multiple Time Series", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6770", "id": "45L_dgP48Vd", "poster": "", "openreview": "https://openreview.net/forum?id=45L_dgP48Vd", "slides": "https://iclr.cc/virtual/2022/poster/6770", "video": "https://iclr.cc/virtual/2022/poster/6770", "author_site": "Enyan Dai, Jie Chen", "tldr": "", "abstract": "Anomaly detection is a widely studied task for a broad variety of data types; among them, multiple time series appear frequently in applications, including for example, power grids and traffic networks. Detecting anomalies for multiple time series, however, is a challenging subject, owing to the intricate interdependencies among the constituent series. We hypothesize that anomalies occur in low density regions of a distribution and explore the use of normalizing flows for unsupervised anomaly detection, because of their superior quality in density estimation. Moreover, we propose a novel flow model by imposing a Bayesian network among constituent series. 
A Bayesian network is a directed acyclic graph (DAG) that models causal relationships; it factorizes the joint probability of the series into the product of easy-to-evaluate conditional probabilities. We call such a graph-augmented normalizing flow approach GANF and propose joint estimation of the DAG with flow parameters. We conduct extensive experiments on real-world datasets and demonstrate the effectiveness of GANF for density estimation, anomaly detection, and identification of time series distribution drift.", "keywords": "Anomaly Detection;Normalizing Flow;DAG;Multiple Time Series", "primary_area": "", "supplementary_material": "", "author": "Enyan Dai;Jie Chen", "authorids": "~Enyan_Dai1;~Jie_Chen1", "gender": "M;", "homepage": "https://enyandai.github.io/;https://jiechenjiechen.github.io", "dblp": "250/2886;92/6289-7", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;Z-lkme8AAAAJ", "orcid": "0000-0001-9715-0280;", "linkedin": ";", "or_profile": "~Enyan_Dai1;~Jie_Chen1", "aff": "Pennsylvania State University;International Business Machines", "aff_domain": "psu.edu;ibm.com", "position": "PhD student;Research Staff Member", "bibtex": "@inproceedings{\ndai2022graphaugmented,\ntitle={Graph-Augmented Normalizing Flows for Anomaly Detection of Multiple Time Series},\nauthor={Enyan Dai and Jie Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=45L_dgP48Vd}\n}", "github": "", "project": "", "reviewers": "ofqA;tq1S;J4SV;mBgF", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "1;4;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "52;174;123;65", "wc_summary_review": "18;168;35;83", "wc_main_review": "441;722;445;286", "wc_review": "511;1064;603;434", "wc_reply_reviewers": "259;226;41;0", "wc_reply_authors": "876;942;826;290", "reply_reviewers": "2;1;1;0", "reply_authors": "3;2;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 103.5, 48.69548233666035 ], "wc_summary_review_avg": [ 76.0, 58.21941257003544 ], "wc_main_review_avg": [ 473.5, 157.14404220332375 ], "wc_review_avg": [ 653.0, 244.71718370396468 ], "wc_reply_reviewers_avg": [ 131.5, 112.54887827073178 ], "wc_reply_authors_avg": [ 733.5, 259.3390637755909 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.22941573387056177, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14229520023541942069&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=45L_dgP48Vd", "email": "psu.edu;ibm.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.ibm.com", "aff_unique_abbr": "PSU;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Explanations of Black-Box Models based on Directional Feature Interactions", "status": "Spotlight", "track": 
"main", "site": "https://iclr.cc/virtual/2022/poster/6731", "id": "45Mr7LeKR9", "poster": "", "openreview": "https://openreview.net/forum?id=45Mr7LeKR9", "slides": "https://iclr.cc/virtual/2022/poster/6731", "video": "https://iclr.cc/virtual/2022/poster/6731", "author_site": "Aria Masoomi, Davin Hill, Zhonghui Xu, Craig Hersh, Edwin Silverman, Peter Castaldi, Stratis Ioannidis, Jennifer Dy", "tldr": "", "abstract": "As machine learning algorithms are deployed ubiquitously to a variety of domains, it is imperative to make these often black-box models transparent. Several recent works explain black-box models by capturing the most influential features for prediction per instance; such explanation methods are univariate, as they characterize importance per feature. We extend univariate explanation to a higher-order; this enhances explainability, as bivariate methods can capture feature interactions in black-box models, represented as a directed graph. Analyzing this graph enables us to discover groups of features that are equally important (i.e., interchangeable), while the notion of directionality allows us to identify the most influential features. We apply our bivariate method on Shapley value explanations, and experimentally demonstrate the ability of directional explanations to discover feature interactions. We show the superiority of our method against state-of-the-art on CIFAR10, IMDB, Census, Divorce, Drug, and gene data. ", "keywords": "Explainability;Shapley values;Interpretability;Directional interaction;feature interaction", "primary_area": "", "supplementary_material": "/attachment/18c104288296fe3b066533e675c6996c37bcc392.zip", "author": "Aria Masoomi;Davin Hill;Zhonghui Xu;Craig P Hersh;Edwin K. Silverman;Peter J. Castaldi;Stratis Ioannidis;Jennifer Dy", "authorids": "~Aria_Masoomi1;~Davin_Hill1;~Zhonghui_Xu1;~Craig_P_Hersh1;~Edwin_K._Silverman1;~Peter_J._Castaldi1;~Stratis_Ioannidis1;~Jennifer_Dy1", "gender": "M;;M;;M;;M;", "homepage": ";;;https://connects.catalyst.harvard.edu/Profiles/display/Person/626;;;https://ece.northeastern.edu/fac-ece/ioannidis/;https://mllabneu.github.io/", "dblp": "242/9324;;;;81/5419;72/9576;42/6940;24/6000", "google_scholar": "KXcX8coAAAAJ;;;;;;GPIB5kUAAAAJ;6h7b0fAAAAAJ", "orcid": ";;0000-0002-6469-9178;;;;0000-0001-8355-4751;", "linkedin": "aria-masoomi-779a02232;;;;;;stratis-ioannidis-87b826110;", "or_profile": "~Aria_Masoomi1;~Davin_Hill1;~Zhonghui_Xu1;~Craig_P_Hersh1;~Edwin_K._Silverman1;~Peter_J._Castaldi1;~Stratis_Ioannidis1;~Jennifer_Dy1", "aff": "Northeastern University;;Harvard University;Brigham and Women's Hospital;Brigham and Women's Hospital;;Northeastern University;Northeastern University", "aff_domain": "northeastern.edu;;harvard.edu;bwh.harvard.edu;harvard.edu;;northeastern.edu;northeastern.edu", "position": "PhD student;;Information System Project Manager;Associate Professor;Full Professor;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nmasoomi2022explanations,\ntitle={Explanations of Black-Box Models based on Directional Feature Interactions},\nauthor={Aria Masoomi and Davin Hill and Zhonghui Xu and Craig P Hersh and Edwin K. Silverman and Peter J. 
Castaldi and Stratis Ioannidis and Jennifer Dy},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=45Mr7LeKR9}\n}", "github": "", "project": "", "reviewers": "5zsk;GZtH;k52n;WTCZ", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;3;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "86;85;172;49", "wc_summary_review": "94;18;61;35", "wc_main_review": "521;140;455;176", "wc_review": "701;243;688;260", "wc_reply_reviewers": "27;13;24;21", "wc_reply_authors": "934;263;1208;420", "reply_reviewers": "1;1;1;1", "reply_authors": "3;1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 98.0, 45.249309386995066 ], "wc_summary_review_avg": [ 52.0, 28.679260799399973 ], "wc_main_review_avg": [ 323.0, 167.12719706858007 ], "wc_review_avg": [ 473.0, 221.6291948277573 ], "wc_reply_reviewers_avg": [ 21.25, 5.2141634036535525 ], "wc_reply_authors_avg": [ 706.25, 381.45535453051383 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5981770379829691043&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=45Mr7LeKR9", "email": "northeastern.edu;;harvard.edu;bwh.harvard.edu;harvard.edu;;northeastern.edu;northeastern.edu", "author_num": 8, "aff_unique_index": "0;1;2;2;0;0", "aff_unique_norm": "Northeastern University;Harvard University;Brigham and Women's Hospital", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northeastern.edu;https://www.harvard.edu;https://www.brighamandwomens.org", "aff_unique_abbr": "NEU;Harvard;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "46lmrnVBHBL", "title": "Explanatory Learning: Beyond Empiricism in Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Explanatory Learning (EL), an explanation-driven machine learning framework to use existing knowledge buried in symbolic sequences expressed in an unknown language. In EL, the burden of interpreting explanations is not left to humans or human-coded compilers, as done in Program Synthesis. Rather, EL calls for a learned interpreter, built upon existing explanations paired with observations of several phenomena. This interpreter can then be used to make predictions on novel phenomena, and even find an explanation for them. We formulate the EL problem as a simple binary classification task, so that common end-to-end approaches aligned with the dominant empiricist view of machine learning could, in principle, solve it. To these models, we oppose Critical Rationalist Networks (CRNs), which instead embrace a rationalist view on the acquisition of knowledge. 
CRNs express several desired properties by construction, they are truly explainable, can adjust their processing at test-time for harder inferences, and can offer strong confidence guarantees on their predictions.", "keywords": "explainability;rationalism;deep learning", "primary_area": "", "supplementary_material": "/attachment/696e1f193a8b9c2fec38c331e3af16000a4e47a9.zip", "author": "Antonio Norelli;Giorgio Mariani;Luca Moschella;Andrea Santilli;Giambattista Parascandolo;Simone Melzi;Emanuele Rodol\u00e0", "authorids": "~Antonio_Norelli2;~Giorgio_Mariani1;~Luca_Moschella1;~Andrea_Santilli1;~Giambattista_Parascandolo1;~Simone_Melzi2;~Emanuele_Rodol\u00e01", "gender": "M;M;M;M;M;M;M", "homepage": "https://phd.uniroma1.it/web/ANTONIO-NORELLI_nP1612487_EN.aspx;;https://luca.moschella.dev;https://santilli.xyz/;https://sites.google.com/site/melzismn/;;https://sites.google.com/view/giambattista-parascandolo/home", "dblp": "261/9526;312/5858;205/3639;179/7048;160/2770;54/8401;179/2714", "google_scholar": ";;4byA-nefJJMC;j2Y_XBIAAAAJ;https://scholar.google.it/citations?user=hkrUTqEAAAAJ;-EH4wBYAAAAJ;https://scholar.google.it/citations?user=1zCDX_UAAAAJ", "orcid": ";;0000-0002-0550-7498;;0000-0003-2790-9591;0000-0003-0091-7241;", "linkedin": ";giorgio-mariani/;lucamoschella/;andreasantilli/;;;", "or_profile": "~Antonio_Norelli2;~Giorgio_Mariani1;~Luca_Moschella1;~Andrea_Santilli1;~Simone_Melzi2;~Emanuele_Rodol\u00e01;~Giambattista_Parascandolo2", "aff": "Sapienza University of Rome;University of Roma \"La Sapienza\";NVIDIA;Sapienza University of Rome;Sapienza University of Rome;Sapienza University of Rome;OpenAI", "aff_domain": "uniroma1.it;uniroma1.it;nvidia.com;uniroma1.it;uniroma1.it;uniroma1.it;openai.com", "position": "PhD student;PhD student;Intern;PhD student;Postdoc;Full Professor;Principal Researcher", "bibtex": "@misc{\nnorelli2022explanatory,\ntitle={Explanatory Learning: Beyond Empiricism in Neural Networks},\nauthor={Antonio Norelli and Giorgio Mariani and Luca Moschella and Andrea Santilli and Giambattista Parascandolo and Simone Melzi and Emanuele Rodol{\\`a}},\nyear={2022},\nurl={https://openreview.net/forum?id=46lmrnVBHBL}\n}", "github": "", "project": "", "reviewers": "VRSC;ZgwB;PMcp;H4ra", "site": "https://openreview.net/forum?id=46lmrnVBHBL", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;2;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "153;106;91;80", "wc_summary_review": "70;6;49;60", "wc_main_review": "148;694;917;243", "wc_review": "371;806;1057;383", "wc_reply_reviewers": "0;1033;0;0", "wc_reply_authors": "884;2481;3075;1148", "reply_reviewers": "0;2;0;0", "reply_authors": "2;5;5;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 107.5, 27.84331158465171 ], "wc_summary_review_avg": [ 46.25, 24.39646490785089 ], "wc_main_review_avg": [ 500.5, 316.81106356944036 ], "wc_review_avg": [ 654.25, 291.1368879067027 ], "wc_reply_reviewers_avg": [ 258.25, 447.30212105466256 ], "wc_reply_authors_avg": [ 1897.0, 910.4820151985431 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.5, 1.5 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.16012815380508713, "corr_recommendation_correctness": 0.0, "gs_citation": 4, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=14688931554357870205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;0;0;3", "aff_unique_norm": "Sapienza University of Rome;University of Rome La Sapienza;NVIDIA;OpenAI", "aff_unique_dep": ";;NVIDIA Corporation;", "aff_unique_url": "https://www.uniroma1.it;https://www.uniroma1.it;https://www.nvidia.com;https://openai.com", "aff_unique_abbr": "Sapienza;La Sapienza;NVIDIA;OpenAI", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Rome;", "aff_country_unique_index": "0;0;1;0;0;0;1", "aff_country_unique": "Italy;United States" }, { "title": "CrossMatch: Cross-Classifier Consistency Regularization for Open-Set Single Domain Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6139", "id": "48RBsJwGkJf", "poster": "", "openreview": "https://openreview.net/forum?id=48RBsJwGkJf", "slides": "https://iclr.cc/virtual/2022/poster/6139", "video": "https://iclr.cc/virtual/2022/poster/6139", "author_site": "Ronghang Zhu, Sheng Li", "tldr": "", "abstract": "Single domain generalization (SDG) is a challenging scenario of domain generalization, where only one source domain is available to train the model. Typical SDG methods are based on the adversarial data augmentation strategy, which complements the diversity of source domain to learn a robust model. Existing SDG methods require the source and target domains to have the same label space. However, as target domains may contain novel categories unseen in source label space, this assumption is not practical in many real-world applications. In this paper, we propose a challenging and untouched problem: \\textit{Open-Set Single Domain Generalization} (OS-SDG), where target domains include unseen categories out of source label space. The goal of OS-SDG is to learn a model, with only one source domain, to classify a target sample with correct class if it belongs to source label space, or assign it to unknown classes. We design a \\textit{CrossMatch} approach to improve the performance of SDG methods on identifying unknown classes by leveraging a multi-binary classifier. CrossMatch generates auxiliary samples out of source label space by using an adversarial data augmentation strategy. We also adopt a consistency regularization on generated auxiliary samples between multi-binary classifiers and the model trained by SDG methods, to improve the model\u2019s capability on unknown class identification. 
Experimental results on benchmark datasets prove the effectiveness of CrossMatch on enhancing the performance of SDG methods in the OS-SDG setting.", "keywords": "Single Domain Generalization;Open-Set Recognition", "primary_area": "", "supplementary_material": "", "author": "Ronghang Zhu;Sheng Li", "authorids": "~Ronghang_Zhu2;~Sheng_Li3", "gender": ";M", "homepage": ";http://sheng-li.org", "dblp": ";23/3439-1", "google_scholar": ";DEncVcYAAAAJ", "orcid": ";0000-0003-1205-8632", "linkedin": ";sheng-li-15a70022/", "or_profile": "~Ronghang_Zhu2;~Sheng_Li3", "aff": ";University of Georgia", "aff_domain": ";uga.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nzhu2022crossmatch,\ntitle={CrossMatch: Cross-Classifier Consistency Regularization for Open-Set Single Domain Generalization},\nauthor={Ronghang Zhu and Sheng Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=48RBsJwGkJf}\n}", "github": "", "project": "", "reviewers": "46jP;3bup;zAkR;N4tv", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;5;4", "correctness": "4;2;4;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "92;82;105;214", "wc_summary_review": "73;46;14;45", "wc_main_review": "153;420;106;447", "wc_review": "318;548;225;706", "wc_reply_reviewers": "0;0;0;31", "wc_reply_authors": "1078;1458;342;1734", "reply_reviewers": "0;0;0;1", "reply_authors": "2;3;1;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 123.25, 53.0253477122027 ], "wc_summary_review_avg": [ 44.5, 20.886598574205422 ], "wc_main_review_avg": [ 281.5, 153.2032963091852 ], "wc_review_avg": [ 449.25, 189.2001255284996 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 1153.0, 522.9560210954646 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9847985198643782295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=48RBsJwGkJf", "email": ";uga.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Georgia", "aff_unique_dep": "", "aff_unique_url": "https://www.uga.edu", "aff_unique_abbr": "UGA", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Linking Emergent and Natural Languages via Corpus Transfer", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6044", "id": "49A1Y6tRhaq", "poster": "", "openreview": "https://openreview.net/forum?id=49A1Y6tRhaq", "slides": "https://iclr.cc/virtual/2022/poster/6044", "video": "https://iclr.cc/virtual/2022/poster/6044", "author_site": "Shunyu Yao, Mo Yu, Yang Zhang, Karthik Narasimhan, Joshua B Tenenbaum, Chuang Gan", "tldr": "", "abstract": "The study of language emergence aims to understand how human languages are shaped by perceptual grounding and communicative intent. 
Computational approaches to emergent communication (EC) predominantly consider referential games in limited domains and analyze the learned protocol within the game framework. As a result, it remains unclear how the emergent languages from these settings connect to natural languages or provide benefits in real-world language processing tasks, where statistical models trained on large text corpora dominate. In this work, we propose a novel way to establish such a link by corpus transfer, i.e. pretraining on a corpus of emergent language for downstream natural language tasks, which is in contrast to prior work that directly transfers speaker and listener parameters. Our approach showcases non-trivial transfer benefits for two different tasks \u2013 language modeling and image captioning. For example, in a low-resource setup (modeling 2 million natural language tokens), pre-training on an emergent language corpus with just 2 million tokens reduces model perplexity by 24.6% on average across ten natural languages. We also introduce a novel metric to predict the transferability of an emergent language by translating emergent messages to natural language captions grounded on the same images. We find that our translation-based metric highly correlates with the downstream performance on modeling natural languages (for instance $\\rho = 0.83$ on Hebrew), while topographic similarity, a popular metric in previous works, shows surprisingly low correlation ($\\rho = 0.003$), hinting that simple properties like attribute disentanglement from synthetic domains might not capture the full complexities of natural language. Our findings also indicate potential benefits of moving language emergence forward with natural language resources and models.", "keywords": "Emergent Language;Emergent Communication;Transfer Learning", "primary_area": "", "supplementary_material": "/attachment/d12c42aee122b091709b735cf313ee180c667fcf.zip", "author": "Shunyu Yao;Mo Yu;Yang Zhang;Karthik R Narasimhan;Joshua B. Tenenbaum;Chuang Gan", "authorids": "~Shunyu_Yao1;~Mo_Yu1;~Yang_Zhang3;~Karthik_R_Narasimhan1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "gender": "M;M;M;M;;M", "homepage": "https://ysymyth.github.io;http://researcher.ibm.com/researcher/view.php?person=us-yum;;http://www.karthiknarasimhan.com;;http://people.csail.mit.edu/ganchuang/", "dblp": "156/1038;32/7445.html;06/6785-1;147/0322;t/JoshuaBTenenbaum;139/6993", "google_scholar": "qJBXk9cAAAAJ;vC8DssQAAAAJ;_-5PSgQAAAAJ;euc0GX4AAAAJ;;PTeSCbIAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Shunyu_Yao1;~Mo_Yu1;~Yang_Zhang3;~Karthik_R_Narasimhan1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "aff": "Princeton University;International Business Machines;International Business Machines;Princeton University;Massachusetts Institute of Technology;MIT-IBM Watson AI Lab", "aff_domain": "princeton.edu;ibm.com;ibm.com;princeton.edu;mit.edu;ibm.com", "position": "PhD student;Research Staff Member;Research Staff Employee;Assistant Professor;Professor;PhD student", "bibtex": "@inproceedings{\nyao2022linking,\ntitle={Linking Emergent and Natural Languages via Corpus Transfer},\nauthor={Shunyu Yao and Mo Yu and Yang Zhang and Karthik R Narasimhan and Joshua B. 
Tenenbaum and Chuang Gan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=49A1Y6tRhaq}\n}", "github": "", "project": "", "reviewers": "wvqW;QkKj;gtt4;G942", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;3;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "111;131;130;398", "wc_summary_review": "149;52;45;113", "wc_main_review": "1312;177;702;1194", "wc_review": "1572;360;877;1705", "wc_reply_reviewers": "860;0;0;345", "wc_reply_authors": "3353;674;961;1559", "reply_reviewers": "1;0;0;1", "reply_authors": "7;2;2;3", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 192.5, 118.91278316480529 ], "wc_summary_review_avg": [ 89.75, 43.239883209833025 ], "wc_main_review_avg": [ 846.25, 449.0425230420834 ], "wc_review_avg": [ 1128.5, 543.803503114866 ], "wc_reply_reviewers_avg": [ 301.25, 352.00097656114536 ], "wc_reply_authors_avg": [ 1636.75, 1041.0433168221196 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.5, 2.0615528128088303 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.07053456158585983, "corr_recommendation_correctness": 0.9169493006161777, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1456115068072297417&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=49A1Y6tRhaq", "email": "princeton.edu;ibm.com;ibm.com;princeton.edu;mit.edu;ibm.com", "author_num": 6, "aff_unique_index": "0;1;1;0;2;2", "aff_unique_norm": "Princeton University;International Business Machines Corporation;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://www.ibm.com;https://web.mit.edu", "aff_unique_abbr": "Princeton;IBM;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "How to Train Your MAML to Excel in Few-Shot Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6294", "id": "49h_IkpJtaE", "poster": "", "openreview": "https://openreview.net/forum?id=49h_IkpJtaE", "slides": "https://iclr.cc/virtual/2022/poster/6294", "video": "https://iclr.cc/virtual/2022/poster/6294", "author_site": "Han-Jia Ye, Wei-Lun Chao", "tldr": "", "abstract": "Model-agnostic meta-learning (MAML) is arguably one of the most popular meta-learning algorithms nowadays.\nNevertheless, its performance on few-shot classification is far behind many recent algorithms dedicated to the problem. In this paper, we point out several key facets of how to train MAML to excel in few-shot classification. First, we find that MAML needs a large number of gradient steps in its inner loop update, which contradicts its common usage in few-shot classification. Second, we find that MAML is sensitive to the class label assignments during meta-testing. Concretely, MAML meta-trains the initialization of an $N$-way classifier. These $N$ ways, during meta-testing, then have \"$N!$\" different permutations to be paired with a few-shot task of $N$ novel classes. 
We find that these permutations lead to a huge variance of accuracy, making MAML unstable in few-shot classification. Third, we investigate several approaches to make MAML permutation-invariant, among which meta-training a single vector to initialize all the $N$ weight vectors in the classification head performs the best. On benchmark datasets like MiniImageNet and TieredImageNet, our approach, which we name UNICORN-MAML, performs on a par with or even outperforms many recent few-shot classification algorithms, without sacrificing MAML's simplicity.", "keywords": "meta-learning;few-shot learning;classification;MAML", "primary_area": "", "supplementary_material": "", "author": "Han-Jia Ye;Wei-Lun Chao", "authorids": "~Han-Jia_Ye1;~Wei-Lun_Chao1", "gender": "M;M", "homepage": "http://www.lamda.nju.edu.cn/yehj;https://sites.google.com/view/wei-lun-harry-chao", "dblp": "165/3014;64/8842", "google_scholar": "mgOYhtoAAAAJ;PGKakWwAAAAJ", "orcid": ";0000-0003-1269-7231", "linkedin": ";", "or_profile": "~Han-Jia_Ye1;~Wei-Lun_Chao1", "aff": "Nanjing University;Ohio State University", "aff_domain": "nju.edu.cn;osu.edu", "position": "Associate Researcher;Assistant Professor", "bibtex": "@inproceedings{\nye2022how,\ntitle={How to Train Your {MAML} to Excel in Few-Shot Classification},\nauthor={Han-Jia Ye and Wei-Lun Chao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=49h_IkpJtaE}\n}", "github": "", "project": "", "reviewers": "eimD;zwrd;Qepb;mgm6", "pdf_size": 0, "recommendation": "3;8;8;8", "confidence": "5;4;4;5", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;4;3;4", "wc_summary_paper": "72;109;47;85", "wc_summary_review": "60;35;65;47", "wc_main_review": "782;203;231;143", "wc_review": "914;347;343;275", "wc_reply_reviewers": "205;0;0;0", "wc_reply_authors": "1847;219;288;22", "reply_reviewers": "2;0;0;0", "reply_authors": "4;1;1;1", "recommendation_avg": [ 6.75, 2.165063509461097 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 78.25, 22.398381637966615 ], "wc_summary_review_avg": [ 51.75, 11.691342951089922 ], "wc_main_review_avg": [ 339.75, 257.3046589162349 ], "wc_review_avg": [ 469.75, 258.0788396982596 ], "wc_reply_reviewers_avg": [ 51.25, 88.76760388790495 ], "wc_reply_authors_avg": [ 594.0, 729.9749995719031 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3274682944038978071&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=49h_IkpJtaE", "email": "nju.edu.cn;osu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Nanjing University;Ohio State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.osu.edu", "aff_unique_abbr": "Nanjing U;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Self-supervised Learning is More Robust to Dataset Imbalance", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6413", "id": 
"4AZz9osqrar", "poster": "", "openreview": "https://openreview.net/forum?id=4AZz9osqrar", "slides": "https://iclr.cc/virtual/2022/poster/6413", "video": "https://iclr.cc/virtual/2022/poster/6413", "author_site": "Hong Liu, Jeff Z. HaoChen, Adrien Gaidon, Tengyu Ma", "tldr": "", "abstract": "Self-supervised learning (SSL) is a scalable way to learn general visual representations since it learns without labels. However, large-scale unlabeled datasets in the wild often have long-tailed label distributions, where we know little about the behavior of SSL. In this work, we systematically investigate self-supervised learning under dataset imbalance. First, we find via extensive experiments that off-the-shelf self-supervised representations are already more robust to class imbalance than supervised representations. The performance gap between balanced and imbalanced pre-training with SSL is significantly smaller than the gap with supervised learning, across sample sizes, for both in-domain and, especially, out-of-domain evaluation. Second, towards understanding the robustness of SSL, we hypothesize that SSL learns richer features from frequent data: it may learn label-irrelevant-but-transferable features that help classify the rare classes and downstream tasks. In contrast, supervised learning has no incentive to learn features irrelevant to the labels from frequent examples. We validate this hypothesis with semi-synthetic experiments as well as rigorous mathematical analyses on a simplified setting. Third, inspired by the theoretical insights, we devise a re-weighted regularization technique that consistently improves the SSL representation quality on imbalanced datasets with several evaluation criteria, closing the small gap between balanced and imbalanced datasets with the same number of examples.", "keywords": "self-supervised learning;dataset imbalance;representation learning;long-tailed recognition", "primary_area": "", "supplementary_material": "", "author": "Hong Liu;Jeff Z. HaoChen;Adrien Gaidon;Tengyu Ma", "authorids": "~Hong_Liu5;~Jeff_Z._HaoChen1;~Adrien_Gaidon1;~Tengyu_Ma1", "gender": "M;;;M", "homepage": ";https://cs.stanford.edu/~jhaochen/;https://adriengaidon.com/;http://ai.stanford.edu/~tengyuma/", "dblp": ";267/5319;06/7548.html;54/9061", "google_scholar": "BUc2uq0AAAAJ;SWQxcO8AAAAJ;https://scholar.google.fr/citations?user=2StUgf4AAAAJ;i38QlUwAAAAJ", "orcid": ";;;", "linkedin": ";;adrien-gaidon-63ab2358/;", "or_profile": "~Hong_Liu5;~Jeff_Z._HaoChen1;~Adrien_Gaidon1;~Tengyu_Ma1", "aff": "Stanford University;Stanford University;Toyota Research Institute (TRI);Facebook AI Research", "aff_domain": "stanford.edu;stanford.edu;tri.global;fb.com", "position": "PhD student;PhD student;Head of ML;Visiting Scientist", "bibtex": "@inproceedings{\nliu2022selfsupervised,\ntitle={Self-supervised Learning is More Robust to Dataset Imbalance},\nauthor={Hong Liu and Jeff Z. 
HaoChen and Adrien Gaidon and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4AZz9osqrar}\n}", "github": "", "project": "", "reviewers": "8MoR;qJjn;Jyb6;Kf27", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "2;4;4;3", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;4;3;2", "wc_summary_paper": "20;114;159;354", "wc_summary_review": "50;32;33;40", "wc_main_review": "121;220;299;77", "wc_review": "191;366;491;471", "wc_reply_reviewers": "0;40;17;108", "wc_reply_authors": "365;361;542;936", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 161.75, 121.79978448256794 ], "wc_summary_review_avg": [ 38.75, 7.189401922274203 ], "wc_main_review_avg": [ 179.25, 86.38395394979325 ], "wc_review_avg": [ 379.75, 118.868362064933 ], "wc_reply_reviewers_avg": [ 41.25, 41.06930118713977 ], "wc_reply_authors_avg": [ 551.0, 233.98824756812039 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8703882797784892, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 204, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16670147468471464490&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=4AZz9osqrar", "email": "stanford.edu;stanford.edu;tri.global;fb.com", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Stanford University;Toyota Research Institute;Meta", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.stanford.edu;https://www.tri.toyota.com/;https://research.facebook.com", "aff_unique_abbr": "Stanford;TRI;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "4ApXq4y81kY", "title": "Co-variance: Tackling Noisy Labels with Sample Selection by Emphasizing High-variance Examples", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The sample selection approach is popular in learning with noisy labels, which tends to select potentially clean data out of noisy data for robust training. The state-of-the-art methods train two deep networks simultaneously for sample selection, which aims to employ their different learning abilities. To prevent two networks from converging to a consensus, their divergence should be maintained during training. Typically, the divergence is kept by first locating the disagreement data on which the prediction labels of two networks are different, and then selecting clean data out of such data. However, this procedure is sample-inefficient for network weight updates, which means that only a few clean examples can be utilized in training. In this paper, to address these issues, we propose a simple yet effective method called Co-variance. In particular, we select possibly clean data that simultaneously have high-variance prediction probabilities between two networks. As selected data have high variances, the divergence of two networks can be maintained by training on such data.
Additionally, the condition of high variances is milder than the condition of disagreement in sample selection, which allows more data to be considered for training, and makes our method more sample-efficient. Moreover, we show that the proposed method enables mining enough hard clean examples to help generalization. A series of empirical results show that Co-variance is superior to multiple baselines in the robustness of trained models, especially on class-imbalanced and real-world noisy datasets.", "keywords": "noisy labels;sample selection;high variances", "primary_area": "", "supplementary_material": "/attachment/9ebf6a31da000c719066f8b390447eb56bd72074.zip", "author": "Xiaobo Xia;Bo Han;Yibing Zhan;Jun Yu;Mingming Gong;Chen Gong;Tongliang Liu", "authorids": "~Xiaobo_Xia1;~Bo_Han1;~Yibing_Zhan2;~Jun_Yu3;~Mingming_Gong1;~Chen_Gong5;~Tongliang_Liu1", "gender": "M;M;M;M;M;M;M", "homepage": "https://xiaoboxia.github.io/;https://faculty.ustc.edu.cn/yujun_AI/en/index.htm;https://mingming-gong.github.io/;http://www.escience.cn/people/chengong/index.html;https://tongliang-liu.github.io/;https://bhanml.github.io/;", "dblp": "242/8072;50/5754-1.html;98/8479;21/8587-2;150/6667;241/0472-3;142/8486", "google_scholar": "jRsugY0AAAAJ;efZyqyQAAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;https://scholar.google.com.hk/citations?user=guttoBwAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-3197-8103;0000-0001-7147-5589;;;;", "linkedin": ";;;;;;", "or_profile": "~Xiaobo_Xia1;~Jun_Yu3;~Mingming_Gong1;~Chen_Gong5;~Tongliang_Liu1;~bo_han2;~Yibing_Zhan1", "aff": "The University of Sydney;University of Science and Technology of China;University of Melbourne;Nanjing University of Science and Technology;University of Sydney;Microsoft Research;JD Explore Academy", "aff_domain": "sydney.edu.au;ustc.edu.cn;unimelb.edu.au;njust.edu.cn;sydney.edu.au;microsoft.com;jd.com", "position": "PhD student;Associate Professor;Assistant Professor;Full Professor;Lecturer;Researcher;Researcher", "bibtex": "@misc{\nxia2022covariance,\ntitle={Co-variance: Tackling Noisy Labels with Sample Selection by Emphasizing High-variance Examples},\nauthor={Xiaobo Xia and Bo Han and Yibing Zhan and Jun Yu and Mingming Gong and Chen Gong and Tongliang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=4ApXq4y81kY}\n}", "github": "", "project": "", "reviewers": "nXvC;Srdw;nc32", "site": "https://openreview.net/forum?id=4ApXq4y81kY", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;5", "correctness": "3;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;4;4", "wc_summary_paper": "44;68;101", "wc_summary_review": "18;91;66", "wc_main_review": "158;61;349", "wc_review": "220;220;516", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 71.0, 23.366642891095847 ], "wc_summary_review_avg": [ 58.333333333333336, 30.291179500896884 ], "wc_main_review_avg": [ 189.33333333333334, 119.64484480699069 ], "wc_review_avg": [ 318.6666666666667, 139.53573815414538 ], "wc_reply_reviewers_avg": [ 0, 0 ],
"wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.7559289460184544, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-1ehVzgyYM8J:scholar.google.com/&scioq=Co-variance:+Tackling+Noisy+Labels+with+Sample+Selection+by+Emphasizing+High-variance+Examples&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0;4;5", "aff_unique_norm": "University of Sydney;University of Science and Technology of China;University of Melbourne;Nanjing University of Science and Technology;Microsoft;JD", "aff_unique_dep": ";;;;Microsoft Research;JD Explore Academy", "aff_unique_url": "https://www.sydney.edu.au;http://www.ustc.edu.cn;https://www.unimelb.edu.au;http://www.nust.edu.cn/;https://www.microsoft.com/en-us/research;", "aff_unique_abbr": "USYD;USTC;UniMelb;NUST;MSR;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;2", "aff_country_unique": "Australia;China;United States;" }, { "title": "MCMC Should Mix: Learning Energy-Based Model with Neural Transport Latent Space MCMC", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6004", "id": "4C93Qvn-tz", "poster": "", "openreview": "https://openreview.net/forum?id=4C93Qvn-tz", "slides": "https://iclr.cc/virtual/2022/poster/6004", "video": "https://iclr.cc/virtual/2022/poster/6004", "author_site": "Erik Nijkamp, Ruiqi Gao, Pavel Sountsov, Srinivas Vasudevan, Bo Pang, Song-Chun Zhu, Yingnian Wu", "tldr": "", "abstract": "Learning energy-based model (EBM) requires MCMC sampling of the learned model as an inner loop of the learning algorithm. However, MCMC sampling of EBMs in high-dimensional data space is generally not mixing, because the energy function, which is usually parametrized by deep network, is highly multi-modal in the data space. This is a serious handicap for both theory and practice of EBMs. In this paper, we propose to learn EBM with a flow-based model (or in general latent variable model) serving as a backbone, so that the EBM is a correction or an exponential tilting of the flow-based model. We show that the model has a particularly simple form in the space of the latent variables of the generative model, and MCMC sampling of the EBM in the latent space mixes well and traverses modes in the data space. 
This enables proper sampling and learning of EBMs.", "keywords": "Generative models;energy-based models;MCMC", "primary_area": "", "supplementary_material": "", "author": "Erik Nijkamp;Ruiqi Gao;Pavel Sountsov;Srinivas Vasudevan;Bo Pang;Song-Chun Zhu;Ying Nian Wu", "authorids": "~Erik_Nijkamp2;~Ruiqi_Gao1;~Pavel_Sountsov2;~Srinivas_Vasudevan1;~Bo_Pang1;~Song-Chun_Zhu1;~Ying_Nian_Wu1", "gender": "M;F;;M;M;M;M", "homepage": "https://eriknijkamp.com/;http://www.stat.ucla.edu/~ruiqigao/;http://people.brandeis.edu/~sl157/;;;https://zhusongchun.net/;http://www.stat.ucla.edu/~ywu/", "dblp": ";206/7084;;210/2593;16/6344;10/10313;18/568.html", "google_scholar": ";VdlgOXoAAAAJ;;;s9fNEVEAAAAJ;https://scholar.google.com.tw/citations?user=Al8dyb4AAAAJ;7k_1QFIAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Erik_Nijkamp2;~Ruiqi_Gao1;~Pavel_Sountsov2;~Srinivas_Vasudevan1;~Bo_Pang1;~Song-Chun_Zhu1;~Yingnian_Wu1", "aff": "University of California, Los Angeles;Google;Google;;University of California, Los Angeles;Peking University;UCLA", "aff_domain": "ucla.edu;google.com;google.com;;ucla.edu;pku.edu.cn;stat.ucla.edu", "position": "PhD student;Researcher;Researcher;;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nnijkamp2022mcmc,\ntitle={{MCMC} Should Mix: Learning Energy-Based Model with Neural Transport Latent Space {MCMC}},\nauthor={Erik Nijkamp and Ruiqi Gao and Pavel Sountsov and Srinivas Vasudevan and Bo Pang and Song-Chun Zhu and Ying Nian Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4C93Qvn-tz}\n}", "github": "", "project": "", "reviewers": "17Wo;tv7r;Kukr;Wtka", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;2;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "46;122;35;195", "wc_summary_review": "16;2;54;190", "wc_main_review": "310;290;226;554", "wc_review": "372;414;315;939", "wc_reply_reviewers": "24;471;23;552", "wc_reply_authors": "561;1283;631;936", "reply_reviewers": "1;2;1;1", "reply_authors": "2;3;2;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 99.5, 64.5155020130821 ], "wc_summary_review_avg": [ 65.5, 74.3555646875202 ], "wc_main_review_avg": [ 345.0, 124.591331961738 ], "wc_review_avg": [ 510.0, 250.16294689661777 ], "wc_reply_reviewers_avg": [ 267.5, 245.675090312388 ], "wc_reply_authors_avg": [ 852.75, 285.6294583897116 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3721500918064175514&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=4C93Qvn-tz", "email": "ucla.edu;google.com;google.com;;ucla.edu;pku.edu.cn;stat.ucla.edu", "author_num": 7, "aff_unique_index": "0;1;1;0;2;0", "aff_unique_norm": "University of California, Los Angeles;Google;Peking University", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.ucla.edu;https://www.google.com;http://www.pku.edu.cn", "aff_unique_abbr": "UCLA;Google;Peking U", "aff_campus_unique_index": "0;1;1;0;0", "aff_campus_unique": 
"Los Angeles;Mountain View;", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "4GBHVfEcmoS", "title": "Propagating Distributions through Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a new approach to propagating probability distributions through neural networks. To handle non-linearities, we use local linearization and show this to be an optimal approximation in terms of total variation for ReLUs. We demonstrate the advantages of our method over the moment matching approach popularized in prior works. In addition, we formulate new loss functions for training neural networks based on distributions. To demonstrate the utility of propagating distributions, we apply it to quantifying prediction uncertainties. In regression tasks we obtain calibrated confidence intervals, and in a classification setting we improve selective prediction on out-of-distribution data. We also show empirically that training with our uncertainty aware losses improve robustness to random and adversarial noise.", "keywords": "propagating distributions;uncertainty quantification", "primary_area": "", "supplementary_material": "", "author": "Felix Petersen;Christian Borgelt;Mikhail Yurochkin;Hilde Kuehne;Oliver Deussen", "authorids": "~Felix_Petersen1;~Christian_Borgelt1;~Mikhail_Yurochkin1;~Hilde_Kuehne5;~Oliver_Deussen1", "gender": "Not Specified;M;M;F;M", "homepage": "http://www.petersen.ai/;https://www.borgelt.net/;https://moonfolk.github.io/;https://hildekuehne.github.io;https://graphics.uni-konstanz.de", "dblp": "230/3983;b/ChristianBorgelt.html;191/6719;45/4963;48/2158", "google_scholar": "v8Kat6YAAAAJ;https://scholar.google.de/citations?user=T50Bxb8AAAAJ;QjBF9sUAAAAJ;pxhCcH0AAAAJ;https://scholar.google.de/scholar?hl=en", "orcid": ";;;0000-0003-1079-4441;0000-0001-5803-2185", "linkedin": ";christian-borgelt-a2429071/;mikhail-yurochkin-a45659114/;hilde-kuehne-8b9aa661;", "or_profile": "~Felix_Petersen1;~Christian_Borgelt1;~Mikhail_Yurochkin1;~Hilde_Kuehne5;~Oliver_Deussen1", "aff": "University of Konstanz;Paris-Lodron-University of Salzburg;IBM Research;Goethe University Frankfurt;University of Konstanz", "aff_domain": "uni-konstanz.de;sbg.ac.at;ibm.com;uni-frankfurt.de;uni-konstanz.de", "position": "PhD student;Full Professor;Researcher;Assistant Professor;Full Professor", "bibtex": "@misc{\npetersen2022propagating,\ntitle={Propagating Distributions through Neural Networks},\nauthor={Felix Petersen and Christian Borgelt and Mikhail Yurochkin and Hilde Kuehne and Oliver Deussen},\nyear={2022},\nurl={https://openreview.net/forum?id=4GBHVfEcmoS}\n}", "github": "", "project": "", "reviewers": "bo8J;rdMQ;cdSE;H6dp", "site": "https://openreview.net/forum?id=4GBHVfEcmoS", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;4;2", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "61;102;56;84", "wc_summary_review": "57;43;85;34", "wc_main_review": "398;251;872;138", "wc_review": "516;396;1013;256", "wc_reply_reviewers": "0;0;25;0", "wc_reply_authors": "789;846;2553;394", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;4;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.75, 18.471261462065875 ], "wc_summary_review_avg": [ 54.75, 
19.292161620720474 ], "wc_main_review_avg": [ 414.75, 279.62597787044035 ], "wc_review_avg": [ 545.25, 285.3010471414362 ], "wc_reply_reviewers_avg": [ 6.25, 10.825317547305483 ], "wc_reply_authors_avg": [ 1145.5, 831.0536986260274 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8631605287940412501&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Konstanz;Paris-Lodron-University of Salzburg;IBM;Goethe University Frankfurt", "aff_unique_dep": ";;IBM Research;", "aff_unique_url": "https://www.uni-konstanz.de;https://www.uni-salzburg.at;https://www.ibm.com/research;https://www.uni-frankfurt.de", "aff_unique_abbr": "Uni Konstanz;PLUS;IBM;GU Frankfurt", "aff_campus_unique_index": "1", "aff_campus_unique": ";Frankfurt", "aff_country_unique_index": "0;1;2;0;0", "aff_country_unique": "Germany;Austria;United States" }, { "id": "4JlwgTbmzXQ", "title": "EqR: Equivariant Representations for Data-Efficient Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study different notions of equivariance as an inductive bias in Reinforcement Learning (RL) and propose new mechanisms for recovering representations that are equivariant to both an agent\u2019s action, and symmetry transformations of the state-action pairs. Whereas prior work on exploiting symmetries in deep RL can only incorporate predefined linear transformations, our approach allows for non-linear symmetry transformations of state-action pairs to be learned from the data itself. This is achieved through an equivariant Lie algebraic parameterization of state and action encodings, equivariant latent transition models, and the use of symmetry-based losses. We demonstrate the advantages of our learned equivariant representations for Atari games, in a data-efficient setting limited to 100k steps of interactions with the environment. 
Our method, which we call Equivariant representations for RL (EqR), outperforms many previous methods in a similar setting by achieving a median human-normalized score of 0.418, and surpassing human-level performance on 8 out of the 26 games.", "keywords": "Equivariance;Invariance;Representation learning;Reinforcement learning;Symmetric MDPs;MDP homomorphism;Lie parameterization.", "primary_area": "", "supplementary_material": "/attachment/8421acf84dfb847308d9d2fc64888f923ab7e70e.zip", "author": "Arnab Kumar Mondal;Vineet Jain;Kaleem Siddiqi;Siamak Ravanbakhsh", "authorids": "~Arnab_Kumar_Mondal1;~Vineet_Jain1;~Kaleem_Siddiqi1;~Siamak_Ravanbakhsh1", "gender": "M;;M;", "homepage": "https://arnab39.github.io;;http://www.cim.mcgill.ca/~siddiqi/;", "dblp": ";92/3653;s/KaleemSiddiqi;", "google_scholar": "NhWR4yIAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=1jEAOVcAAAAJ;", "orcid": ";;;", "linkedin": "arnab-mondal-01b522a9/;;;", "or_profile": "~Arnab_Kumar_Mondal1;~Vineet_Jain1;~Kaleem_Siddiqi1;~Siamak_Ravanbakhsh1", "aff": "ServiceNow Inc;McGill University;McGill University;", "aff_domain": "servicenow.com;mcgill.ca;mcgill.ca;", "position": "Intern;PhD student;Full Professor;", "bibtex": "@misc{\nmondal2022eqr,\ntitle={EqR: Equivariant Representations for Data-Efficient Reinforcement Learning},\nauthor={Arnab Kumar Mondal and Vineet Jain and Kaleem Siddiqi and Siamak Ravanbakhsh},\nyear={2022},\nurl={https://openreview.net/forum?id=4JlwgTbmzXQ}\n}", "github": "", "project": "", "reviewers": "dVfB;c4oF;1BRN;pfpn", "site": "https://openreview.net/forum?id=4JlwgTbmzXQ", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;3;3;3", "correctness": "2;3;4;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;2;4;2", "wc_summary_paper": "62;40;129;272", "wc_summary_review": "65;37;28;38", "wc_main_review": "822;279;212;218", "wc_review": "949;356;369;528", "wc_reply_reviewers": "396;155;0;48", "wc_reply_authors": "2527;1392;414;405", "reply_reviewers": "1;2;0;1", "reply_authors": "5;3;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 125.75, 90.57697003101838 ], "wc_summary_review_avg": [ 42.0, 13.838352503098047 ], "wc_main_review_avg": [ 382.75, 254.95232397450312 ], "wc_review_avg": [ 550.5, 239.8337966175743 ], "wc_reply_reviewers_avg": [ 149.75, 152.84367013389857 ], "wc_reply_authors_avg": [ 1184.5, 872.7332066559632 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.5, 1.6583123951777 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.986440050415621, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15795267808339174229&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "ServiceNow;McGill University", "aff_unique_dep": ";", "aff_unique_url": "https://www.servicenow.com;https://www.mcgill.ca", "aff_unique_abbr": "ServiceNow;McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Canada" }, { "id": "4KOJ5XJ_z5W", "title": "Improving State-of-the-Art in One-Class Classification by Leveraging Unlabeled Data", "track": "main", "status": "Reject", "tldr": 
"", "abstract": "Recent advances in One-Class (OC) classification combine the ability to learn exclusively from positive examples with the expressive power of deep neural networks. A cornerstone of OC methods is to make assumptions regarding negative distribution, e.g., that negative data are scattered uniformly or concentrated in the origin. An alternative approach employed in Positive-Unlabeled (PU) learning is to additionally leverage unlabeled data to approximate negative distribution more precisely. In this paper, our goal is to find the best ways to utilize unlabeled data on top of positive data in different settings. While it is reasonable to expect that PU algorithms outperform OC algorithms due to access to more data, we find that the opposite can be true if unlabeled data is unreliable, i.e. contain negative examples that are either too few or sampled from a different distribution. As an alternative to using existing PU algorithms, we propose to modify OC algorithms to incorporate unlabeled data. We find that such PU modifications can consistently benefit even from unreliable unlabeled data if they satisfy a crucial property: when unlabeled data consists exclusively of positive examples, the PU modification becomes equivalent to the original OC algorithm. Our main practical recommendation is to use state-of-the-art PU algorithms when unlabeled data is reliable and to use PU modifications of state-of-the-art OC algorithms that satisfy the formulated property otherwise. Additionally, we make a progress towards distinguishing the cases of reliable and unreliable unlabeled data using statistical tests.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/3fcff1bc231286900f224d1ccbdc5fe0423fbea2.zip", "author": "Farid Bagirov;Dmitry Ivanov;Aleksei Shpilman", "authorids": "~Farid_Bagirov1;~Dmitry_Ivanov1;~Aleksei_Shpilman1", "gender": ";Not Specified;", "homepage": ";;", "dblp": ";122/1439;213/1764", "google_scholar": ";https://scholar.google.ru/citations?user=G9szMAwAAAAJ;https://scholar.google.ru/citations?user=9I8tzzkAAAAJ", "orcid": ";0000-0002-6974-8397;0000-0002-3892-4829", "linkedin": "kraalfar;;", "or_profile": "~Farid_Bagirov1;~Dmitry_Ivanov1;~Aleksei_Shpilman1", "aff": "Higher School of Economics, Higher School of Economics;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Higher School of Economics", "aff_domain": "edu.hse.ru;campus.technion.ac.il;hse.ru", "position": "PhD student;Postdoc;Lecturer", "bibtex": "@misc{\nbagirov2022improving,\ntitle={Improving State-of-the-Art in One-Class Classification by Leveraging Unlabeled Data},\nauthor={Farid Bagirov and Dmitry Ivanov and Aleksei Shpilman},\nyear={2022},\nurl={https://openreview.net/forum?id=4KOJ5XJ_z5W}\n}", "github": "", "project": "", "reviewers": "kMPP;9LXT;99kf;R2RK", "site": "https://openreview.net/forum?id=4KOJ5XJ_z5W", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;1", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "39;48;43;79", "wc_summary_review": "37;33;22;50", "wc_main_review": "137;195;273;245", "wc_review": "213;276;338;374", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], 
"wc_summary_paper_avg": [ 52.25, 15.769828787910159 ], "wc_summary_review_avg": [ 35.5, 10.012492197250394 ], "wc_main_review_avg": [ 212.5, 51.7759596724194 ], "wc_review_avg": [ 300.25, 61.36927162676774 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BAJx8FE6NmkJ:scholar.google.com/&scioq=Improving+State-of-the-Art+in+One-Class+Classification+by+Leveraging+Unlabeled+Data&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Higher School of Economics;Technion - Israel Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.hse.ru;https://www.technion.ac.il/en/", "aff_unique_abbr": "HSE;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Russian Federation;Israel" }, { "title": "Learning a subspace of policies for online adaptation in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6408", "id": "4Muj-t_4o4", "poster": "", "openreview": "https://openreview.net/forum?id=4Muj-t_4o4", "slides": "https://iclr.cc/virtual/2022/poster/6408", "video": "https://iclr.cc/virtual/2022/poster/6408", "author_site": "Jean-Baptiste Gaya, Laure Soulier, Ludovic Denoyer", "tldr": "", "abstract": "Deep Reinforcement Learning (RL) is mainly studied in a setting where the training and the testing environments are similar. But in many practical applications, these environments may differ. For instance, in control systems, the robot(s) on which a policy is learned might differ from the robot(s) on which a policy will run. It can be caused by different internal factors (e.g., calibration issues, system attrition, defective modules) or also by external changes (e.g., weather conditions). There is a need to develop RL methods that generalize well to variations of the training conditions. In this article, we consider the simplest yet hard to tackle generalization setting where the test environment is unknown at train time, forcing the agent to adapt to the system's new dynamics. This online adaptation process can be computationally expensive (e.g., fine-tuning) and cannot rely on meta-RL techniques since there is just a single train environment. To do so, we propose an approach where we learn a subspace of policies within the parameter space. This subspace contains an infinite number of policies that are trained to solve the training environment while having different parameter values. As a consequence, two policies in that subspace process information differently and exhibit different behaviors when facing variations of the train environment. Our experiments carried out over a large variety of benchmarks compare our approach with baselines, including diversity-based methods. 
In comparison, our approach is simple to tune, does not need any extra component (e.g., discriminator) and learns policies able to gather a high reward on unseen environments.", "keywords": "Deep Reinforcement Learning;Online adaptation", "primary_area": "", "supplementary_material": "", "author": "Jean-Baptiste Gaya;Laure Soulier;Ludovic Denoyer", "authorids": "~Jean-Baptiste_Gaya1;~Laure_Soulier1;~Ludovic_Denoyer1", "gender": ";;M", "homepage": "https://twitter.com/jb_gaya;;", "dblp": "304/2605;;54/5551", "google_scholar": ";;9PLqulwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jean-Baptiste_Gaya1;~Laure_Soulier1;~Ludovic_Denoyer1", "aff": "Meta Facebook;;Meta Facebook", "aff_domain": "fb.com;;fb.com", "position": "PhD student;;Research Scientist", "bibtex": "@inproceedings{\ngaya2022learning,\ntitle={Learning a subspace of policies for online adaptation in Reinforcement Learning},\nauthor={Jean-Baptiste Gaya and Laure Soulier and Ludovic Denoyer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4Muj-t_4o4}\n}", "github": "", "project": "", "reviewers": "6h1N;D4JT;UNwx;qrCv", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;4;4;5", "correctness": "2;3;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "32;60;57;77", "wc_summary_review": "21;50;85;19", "wc_main_review": "294;422;663;523", "wc_review": "347;532;805;619", "wc_reply_reviewers": "0;0;232;421", "wc_reply_authors": "629;463;1125;1134", "reply_reviewers": "0;0;1;3", "reply_authors": "2;2;3;4", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 56.5, 16.070158679988197 ], "wc_summary_review_avg": [ 43.75, 26.789690181112583 ], "wc_main_review_avg": [ 475.5, 135.29320012476606 ], "wc_review_avg": [ 575.75, 164.8232007333919 ], "wc_reply_reviewers_avg": [ 163.25, 176.396393103714 ], "wc_reply_authors_avg": [ 837.75, 297.61163871730554 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.14002800840280097, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8112991031910355476&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=4Muj-t_4o4", "email": "fb.com;;fb.com", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Associated Learning: an Alternative to End-to-End Backpropagation that Works on CNN, RNN, and Transformer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6458", "id": "4N-17dske79", "poster": "", "openreview": "https://openreview.net/forum?id=4N-17dske79", "slides": "https://iclr.cc/virtual/2022/poster/6458", "video": "https://iclr.cc/virtual/2022/poster/6458", "author_site": "Dennis Wu, Di-Nan Lin, Vincent Chen, Hung-Hsuan Chen", "tldr": "", "abstract": "This paper studies Associated Learning (AL), an alternative methodology to
end-to-end backpropagation (BP). We introduce the workflow to convert a neural network into a proper structure such that AL can be used to learn the weights for various types of neural networks. We compared AL and BP on some of the most successful types of neural networks -- Convolutional Neural Network (CNN), Recurrent Neural Network (RNN), and Transformer. Experimental results show that AL consistently outperforms BP on various open datasets. We discuss possible reasons for AL's success and its limitations.", "keywords": "pipeline training;parallel training;backpropagation;associated learning", "primary_area": "", "supplementary_material": "/attachment/00037c7af16d786121a99947b26e8fd4a8a9fede.zip", "author": "Dennis Y.H. Wu;Dinan Lin;Vincent Chen;Hung-Hsuan Chen", "authorids": "~Dennis_Y.H._Wu1;~Dinan_Lin1;~Vincent_Chen4;~Hung-Hsuan_Chen1", "gender": "M;;M;", "homepage": "https://hibb-bb.github.io;https://github.com/4hfly;;", "dblp": "40/149;;;13/1892", "google_scholar": "rmm9zw0AAAAJ;;yo6BlgIAAAAJ;", "orcid": ";;;", "linkedin": ";dinan-lin/;;", "or_profile": "~Dennis_Y.H._Wu1;~Dinan_Lin1;~Vincent_Chen4;~Hung-Hsuan_Chen1", "aff": ", Academia Sinica;National Cheng Kung University;Academia Sinica;National Central University, Taiwan", "aff_domain": "iis.sinica.edu.tw;ncku.edu.tw;iis.sinica.edu.tw;ncu.edu.tw", "position": "Research Assistant;MS student;Researcher;Associa", "bibtex": "@inproceedings{\nwu2022associated,\ntitle={Associated Learning: an Alternative to End-to-End Backpropagation that Works on {CNN}, {RNN}, and Transformer},\nauthor={Dennis Y.H. Wu and Dinan Lin and Vincent Chen and Hung-Hsuan Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4N-17dske79}\n}", "github": "", "project": "", "reviewers": "pfre;L9xu;dmxT;9Led", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "101;18;47;52", "wc_summary_review": "79;30;48;46", "wc_main_review": "383;217;100;220", "wc_review": "563;265;195;318", "wc_reply_reviewers": "144;0;0;91", "wc_reply_authors": "1290;576;647;1087", "reply_reviewers": "1;0;0;1", "reply_authors": "3;2;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 54.5, 29.82029510249689 ], "wc_summary_review_avg": [ 50.75, 17.73943347460679 ], "wc_main_review_avg": [ 230.0, 100.71990865762339 ], "wc_review_avg": [ 335.25, 138.53947993261704 ], "wc_reply_reviewers_avg": [ 58.75, 61.66593468034033 ], "wc_reply_authors_avg": [ 900.0, 298.3513029969871 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3776186422319133337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=4N-17dske79", "email": "iis.sinica.edu.tw;ncku.edu.tw;iis.sinica.edu.tw;ncu.edu.tw", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Academia Sinica;National Cheng Kung University;National Central University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sinica.edu.tw;https://www.ncku.edu.tw;https://www.ncu.edu.tw", 
"aff_unique_abbr": "Academia Sinica;NCKU;NCU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "4PzEuW0JxAB", "title": "Disentangled Representations using Trained Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a novel method to learn disentangled representations. The ability to compute a disentangled representation is useful for many tasks because it contains information about samples from a dataset in an interpretable and compact structure. Thus development of a method that learns disentangled representations is an active area of research. In contrast to previously proposed methods, we neither require access to the values of the interpretable factors, nor to information about groups of data samples which share the values of some interpretable factors. Our proposed algorithm uses only a set of models which already have been trained on the data. With the help of the implicit function theorem we show how, using a diverse set of models that have already been trained on the data, to select a pair of data points that have a common value of interpretable factors. We prove that such an auxiliary sampler is sufficient to obtain a disentangled representation. Based on this theoretical result, we propose a loss function that the method should optimize to compute the disentangled representation. Our approach is easy to implement and shows promising results in simulations.", "keywords": "disentangled representation", "primary_area": "", "supplementary_material": "", "author": "Anna Sepliarskaia;Thomas G\u00e4rtner;Patrick Forr\u00e9", "authorids": "~Anna_Sepliarskaia1;~Thomas_G\u00e4rtner2;~Patrick_Forr\u00e91", "gender": "F;M;", "homepage": ";https://thomasgaertner.org/;", "dblp": ";https://dblp.uni-trier.de/pers/hd/g/G=auml=rtner_0001:Thomas;", "google_scholar": "qrYGApkAAAAJ;sOI8QyoAAAAJ;", "orcid": ";0000-0001-5985-9213;", "linkedin": ";;", "or_profile": "~Anna_Sepliarskaia1;~Thomas_G\u00e4rtner2;~Patrick_Forr\u00e91", "aff": "TU Wien Vienna University of Technology;TU Wien;", "aff_domain": "tuwien.ac.at;tuwien.ac.at;", "position": "Postdoc;Full Professor;", "bibtex": "@misc{\nsmit2022disentangled,\ntitle={Disentangled Representations using Trained Models},\nauthor={Eva Smit and Thomas G{\\\"a}rtner and Patrick Forr{\\'e}},\nyear={2022},\nurl={https://openreview.net/forum?id=4PzEuW0JxAB}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=4PzEuW0JxAB", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:MBH2LIwZEa4J:scholar.google.com/&scioq=Disentangled+Representations+using+Trained+Models&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Vienna University of Technology;Technische Universit\u00e4t Wien", "aff_unique_dep": ";", "aff_unique_url": "https://www.tuwien.ac.at;https://www.tuwien.ac.at", "aff_unique_abbr": "TU Wien;TU Wien", "aff_campus_unique_index": "0", "aff_campus_unique": "Vienna;", "aff_country_unique_index": "0;0", "aff_country_unique": "Austria" }, { "id": "4QUoBU27oXN", "title": "Cognitively Inspired Learning of Incremental Drifting Concepts", "track": "main", "status": "Reject", "tldr": "", "abstract": " Humans continually expand their learned knowledge to new domains and learn new concepts without any interference with past learned experiences. In contrast, machine learning models perform poorly in a continual learning setting, where input data distribution changes over time. Inspired by the nervous system learning mechanisms, we develop a computational model that enables a deep neural network to learn new concepts and expand its learned knowledge to new domains incrementally in a continual learning setting. We rely on the Parallel Distributed Processing theory to encode abstract concepts in an embedding space in terms of a multimodal distribution. This embedding space is modeled by internal data representations in a hidden network layer. We also leverage the Complementary Learning Systems theory to equip the model with a memory mechanism to overcome catastrophic forgetting through implementing pseudo-rehearsal. Our model can generate pseudo-data points for experience replay and accumulate new experiences to past learned experiences without causing cross-task interference.", "keywords": "Complementary Learning Systems;continual learning;Parallel Distributed Processing", "primary_area": "", "supplementary_material": "/attachment/5438f9922f85c9ce20bbd24d599dadd9da887a97.zip", "author": "Mohammad Rostami;Aram Galstyan", "authorids": "~Mohammad_Rostami1;~Aram_Galstyan1", "gender": "M;M", "homepage": "https://viterbi.usc.edu/directory/faculty/Rostami/Mohammad;http://www.isi.edu/~galstyan", "dblp": "83/9890;16/3411", "google_scholar": "Uzx8nLoAAAAJ;rJTwW0MAAAAJ", "orcid": ";", "linkedin": ";aram-galstyan-4a01373/", "or_profile": "~Mohammad_Rostami1;~Aram_Galstyan1", "aff": "USC/ISI;Amazon Alexa", "aff_domain": "isi.edu;amazon.com", "position": "Research Scientist;Scholar", "bibtex": "@misc{\nrostami2022cognitively,\ntitle={Cognitively Inspired Learning of Incremental Drifting Concepts},\nauthor={Mohammad Rostami and Aram Galstyan},\nyear={2022},\nurl={https://openreview.net/forum?id=4QUoBU27oXN}\n}", "github": "", "project": "", "reviewers": "WRij;vhwe;kyrh;QevD", "site": "https://openreview.net/forum?id=4QUoBU27oXN", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;3", "correctness": "2;3;3;4", "technical_novelty": "1;4;3;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "46;55;141;48", "wc_summary_review": "23;128;48;75", "wc_main_review": "391;474;747;272", "wc_review": "460;657;936;395", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], 
"wc_summary_paper_avg": [ 72.5, 39.68941924493227 ], "wc_summary_review_avg": [ 68.5, 38.96472763923803 ], "wc_main_review_avg": [ 471.0, 174.77557037526725 ], "wc_review_avg": [ 612.0, 210.47208841079143 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=243659177067009192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "University of Southern California;Amazon", "aff_unique_dep": ";Amazon Alexa", "aff_unique_url": "https://isi.usc.edu;https://www.amazon.com/alexa", "aff_unique_abbr": "USC;Amazon Alexa", "aff_campus_unique_index": "0", "aff_campus_unique": "ISI;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "4Stc6i97dVN", "title": "Sharper Utility Bounds for Differentially Private Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, by introducing Generalized Bernstein condition, we propose the first $\\mathcal{O}\\big(\\frac{\\sqrt{p}}{n\\epsilon}\\big)$ high probability excess population risk bound for differentially private algorithms under the assumptions $G$-Lipschitz, $L$-smooth, and Polyak-{\\L}ojasiewicz condition, based on gradient perturbation method. If we replace the properties $G$-Lipschitz and $L$-smooth by $\\alpha$-H{\\\"o}lder smoothness (which can be used in non-smooth setting), the high probability bound comes to $\\mathcal{O}\\big(n^{-\\frac{2\\alpha}{1+2\\alpha}}\\big)$ w.r.t $n$, which cannot achieve $\\mathcal{O}\\left(1/n\\right)$ when $\\alpha\\in(0,1]$. %and only better than previous results when $\\alpha\\in[1/2,1]$. To solve this problem, we propose a variant of gradient perturbation method, \\textbf{max$\\{1,g\\}$-Normalized Gradient Perturbation} (m-NGP). We further show that by normalization, the high probability excess population risk bound under assumptions $\\alpha$-H{\\\"o}lder smooth and Polyak-{\\L}ojasiewicz condition can achieve $\\mathcal{O}\\big(\\frac{\\sqrt{p}}{n\\epsilon}\\big)$, which is the first $\\mathcal{O}\\left(1/n\\right)$ high probability utility bound w.r.t $n$ for differentially private algorithms under non-smooth conditions. Moreover, we evaluate the performance of the new proposed algorithm m-NGP, the experimental results show that m-NGP improves the performance (measured by accuracy) of the DP model over real datasets. 
It demonstrates that m-NGP improves the excess population risk bound and the accuracy of the DP model on real datasets simultaneously.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/568ab13e0298ea53844bde7999088806c8d25834.zip", "author": "Yilin Kang;Yong Liu;Jian Li;Weiping Wang", "authorids": "~Yilin_Kang2;~Yong_Liu7;~Jian_Li15;~Weiping_Wang4", "gender": ";M;M;M", "homepage": "https://yilinkang.github.io/;https://iie-liuyong.github.io;https://lijian.ac.cn/;https://teacher.ucas.ac.cn/~0012246", "dblp": ";29/4867-18;33/5448-40.html;72/4134-5.html", "google_scholar": ";vVhmzbAAAAAJ;IAJpTqYAAAAJ;zH_wmdwAAAAJ", "orcid": ";0000-0002-6739-621X;0000-0003-4977-1802;0000-0002-8618-4992", "linkedin": ";;;", "or_profile": "~Yilin_Kang2;~Yong_Liu7;~Jian_Li15;~Weiping_Wang4", "aff": "School of Cyber Security, University of Chinese Academy of Sciences;Renmin University of China;Institute of Information Engineering;IIE", "aff_domain": "ucas.edu.cn;ruc.edu.cn;iie.ac.cn;iie.ac.cn", "position": "PhD student;Associate Professor;Postdoc;Full Professor", "bibtex": "@misc{\nkang2022sharper,\ntitle={Sharper Utility Bounds for Differentially Private Models},\nauthor={Yilin Kang and Yong Liu and Jian Li and Weiping Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=4Stc6i97dVN}\n}", "github": "", "project": "", "reviewers": "AguJ;wceT;9ZFZ;n773", "site": "https://openreview.net/forum?id=4Stc6i97dVN", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;3;5", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;1;3", "wc_summary_paper": "109;45;62;133", "wc_summary_review": "63;28;36;62", "wc_main_review": "174;309;120;487", "wc_review": "346;382;218;682", "wc_reply_reviewers": "80;45;0;176", "wc_reply_authors": "1027;839;590;380", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 87.25, 35.31554190437972 ], "wc_summary_review_avg": [ 47.25, 15.514106484100203 ], "wc_main_review_avg": [ 272.5, 141.68715538114245 ], "wc_review_avg": [ 407.0, 170.0676336049867 ], "wc_reply_reviewers_avg": [ 75.25, 64.71234426289934 ], "wc_reply_authors_avg": [ 709.0, 245.16627011071486 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Chinese Academy of Sciences;Renmin University of China;Institute of Information Engineering;Institute of Industrial Engineers", "aff_unique_dep": "School of Cyber Security;;;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ruc.edu.cn;;https://www.iie.org", "aff_unique_abbr": "UCAS;RUC;;IIE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2", "aff_country_unique": "China;;United States" }, { "id": "4V4TZG7i7L_", "title": "Hierarchical Multimodal Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Humans find structure in natural phenomena by absorbing stimuli from multiple input sources such as vision, text, and speech. 
We study the use of deep generative models that generate multimodal data from latent representations. Existing approaches generate samples using a single shared latent variable, sometimes with marginally independent latent variables to capture modality-specific variations. However, there are cases where modality-specific variations depend on the kind of structure shared across modalities. To capture such heterogeneity, we propose a hierarchical multimodal VAE (HMVAE) that represents modality-specific variations using latent variables dependent on a shared top-level variable. Our experiments on the CUB and the Oxford Flower datasets show that the HMVAE can represent multimodal heterogeneity and outperform existing methods in sample generation quality and quantitative measures such as the held-out log-likelihood.", "keywords": "hierarchical vae;variational inference;multimodal learning", "primary_area": "", "supplementary_material": "/attachment/f90aa9ab6b008796f6600908a32c6249a72760bd.zip", "author": "Jannik Wolff;Rahul G Krishnan;Lukas Ruff;Jan Nikolas Morshuis;Tassilo Klein;Shinichi Nakajima;Moin Nabi", "authorids": "~Jannik_Wolff1;~Rahul_G_Krishnan1;~Lukas_Ruff1;~Jan_Nikolas_Morshuis2;~Tassilo_Klein1;~Shinichi_Nakajima2;~Moin_Nabi1", "gender": ";M;M;;M;M;M", "homepage": ";http://www.cs.toronto.edu/~rahulgk/index.html;;;https://tjklein.github.io;https://web.ml.tu-berlin.de/author/dr.-shinichi-nakajima/;http://moinnabi.github.io/", "dblp": ";172/0880;222/9848;;65/4695.html;97/6115.html;167/0748", "google_scholar": ";ilJgXHkAAAAJ;https://scholar.google.de/citations?user=40QzNXMAAAAJ;;z7-L4ywAAAAJ;hXSvID4AAAAJ;31seHAMAAAAJ", "orcid": ";;0000-0002-9707-297X;;;0000-0003-3970-4569;", "linkedin": ";rahulgk/;lukasruff/;;tassiloklein;;", "or_profile": "~Jannik_Wolff1;~Rahul_G_Krishnan1;~Lukas_Ruff1;~Jan_Nikolas_Morshuis2;~Tassilo_Klein1;~Shinichi_Nakajima2;~Moin_Nabi1", "aff": ";Department of Computer Science, University of Toronto;Aignostics GmbH;;SAP SE;BIFOLD, TU Berlin;", "aff_domain": ";cs.toronto.edu;aignostics.com;;sap.com;tu-berlin.de;", "position": ";Assistant Professor;Researcher;;Principal Researcher;Postdoc;", "bibtex": "@misc{\nwolff2022hierarchical,\ntitle={Hierarchical Multimodal Variational Autoencoders},\nauthor={Jannik Wolff and Rahul G Krishnan and Lukas Ruff and Jan Nikolas Morshuis and Tassilo Klein and Shinichi Nakajima and Moin Nabi},\nyear={2022},\nurl={https://openreview.net/forum?id=4V4TZG7i7L_}\n}", "github": "", "project": "", "reviewers": "j3q4;vGFa;Gac5;XV1Z", "site": "https://openreview.net/forum?id=4V4TZG7i7L_", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;1;3;0", "wc_summary_paper": "127;85;42;30", "wc_summary_review": "33;52;65;41", "wc_main_review": "335;296;164;120", "wc_review": "495;433;271;191", "wc_reply_reviewers": "0;0;18;12", "wc_reply_authors": "1044;1045;578;398", "reply_reviewers": "0;0;1;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 71.0, 38.255718526777144 ], "wc_summary_review_avg": [ 47.75, 12.028611723719408 ], "wc_main_review_avg": [ 228.75, 89.20587144353223 ], "wc_review_avg": [ 347.5, 121.87185893388187 ], "wc_reply_reviewers_avg": [ 7.5, 7.794228634059948 ], "wc_reply_authors_avg": [ 766.25, 285.43508456389867 ],
"reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5ZnrwLLzq8AJ:scholar.google.com/&scioq=Hierarchical+Multimodal+Variational+Autoencoders&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Toronto;Aignostics;SAP SE;Technische Universit\u00e4t Berlin", "aff_unique_dep": "Department of Computer Science;;;Berlin Institute for Foundations of Learning and Data (BIFOLD)", "aff_unique_url": "https://www.utoronto.ca;;https://www.sap.com;https://www.tu-berlin.de", "aff_unique_abbr": "U of T;Aignostics;SAP;TU Berlin", "aff_campus_unique_index": "0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;Germany" }, { "id": "4XtpgPsvxE8", "title": "Multi-Objective Model Selection for Time Series Forecasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Research on time series forecasting has predominantly focused on developing methods that improve accuracy. However, other criteria such as training time or latency are critical in many real-world applications. We therefore address the question of how to choose an appropriate forecasting model for a given dataset among the plethora of available forecasting methods when accuracy is only one of many criteria. For this, our contributions are two-fold. First, we present a comprehensive benchmark, evaluating 7 classical and 6 deep learning forecasting methods on 44 heterogeneous, publicly available datasets. The benchmark code is open-sourced along with evaluations and forecasts for all methods. These evaluations enable us to answer open questions such as the amount of data required for deep learning models to outperform classical ones. Second, we leverage the benchmark evaluations to learn good defaults that consider multiple objectives such as accuracy and latency. By learning a mapping from forecasting models to performance metrics, we show that our method ParetoSelect is able to accurately select models from the Pareto front \u2014 alleviating the need to train or evaluate many forecasting models for model selection. 
To the best of our knowledge, ParetoSelect constitutes the first method to learn default models in a multi-objective setting.", "keywords": "time series;forecasting;model selection;multiobjective optimization;transfer-learning;tabular dataset.", "primary_area": "", "supplementary_material": "/attachment/0e115f81a056171dec7ffdc28fcd91aca7aa70d0.zip", "author": "Oliver Borchert;David Salinas;Valentin Flunkert;Tim Januschowski;Stephan G\u00fcnnemann", "authorids": "~Oliver_Borchert1;~David_Salinas2;~Valentin_Flunkert2;~Tim_Januschowski2;~Stephan_G\u00fcnnemann1", "gender": "M;M;M;M;M", "homepage": ";;http://www.daml.in.tum.de;https://geoalgo.github.io/;", "dblp": ";;43/3011;99/7083.html;54/8909", "google_scholar": ";https://scholar.google.ca/citations?user=DzlwsFwAAAAJ;;https://scholar.google.fr/citations?user=D0WjJlsAAAAJ;https://scholar.google.de/citations?user=EFdp8UMAAAAJ", "orcid": ";;;;", "linkedin": "https://linkedin.com/in/borchero;;;david-salinas-184a7582/;", "or_profile": "~Oliver_Borchert1;~Valentin_Flunkert2;~Stephan_G\u00fcnnemann1;~David_Salinas1;~Tim_Januschowski1", "aff": ";Amazon;Technical University Munich;Amazon;", "aff_domain": ";amazon.com;tum.de;amazon.com;", "position": ";Principal Researcher;Professor;Researcher;", "bibtex": "@misc{\nborchert2022multiobjective,\ntitle={Multi-Objective Model Selection for Time Series Forecasting},\nauthor={Oliver Borchert and David Salinas and Valentin Flunkert and Tim Januschowski and Stephan G{\\\"u}nnemann},\nyear={2022},\nurl={https://openreview.net/forum?id=4XtpgPsvxE8}\n}", "github": "", "project": "", "reviewers": "rQb3;FZ2f;XSBL", "site": "https://openreview.net/forum?id=4XtpgPsvxE8", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "3;3;4", "wc_summary_paper": "103;66;110", "wc_summary_review": "139;34;76", "wc_main_review": "345;763;599", "wc_review": "587;863;785", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "787;1493;571", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 93.0, 19.30457631409368 ], "wc_summary_review_avg": [ 83.0, 43.15089802078283 ], "wc_main_review_avg": [ 569.0, 171.96123594190252 ], "wc_review_avg": [ 745.0, 116.17228585166085 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 950.3333333333334, 393.72522849768666 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16804466631850558624&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Amazon;Technical University of Munich", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.tum.de", "aff_unique_abbr": "Amazon;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Germany" }, { "id": "4YOOO4ZNKM", "title": "Self-supervised Learning for Sequential Recommendation with Model Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "The 
sequential recommendation task aims to predict the next items in a user's behavior sequence, which can be addressed by characterizing item relationships in sequences. Due to data sparsity and noise issues in sequences, a new self-supervised learning (SSL) paradigm has been proposed to improve performance, which employs contrastive learning between positive and negative views of sequences. \nHowever, existing methods all construct views by adopting augmentation from the data perspective, while we argue that 1) optimal data augmentation methods are hard to devise, 2) data augmentation methods destroy sequential correlations, and 3) data augmentation fails to incorporate comprehensive self-supervised signals. \nTherefore, we investigate the possibility of model augmentation to construct view pairs. We propose three levels of model augmentation methods: neuron masking, layer dropping, and encoder complementing. \nThis work opens up a novel direction in constructing views for contrastive SSL. Experiments verify the efficacy of model augmentation for SSL in sequential recommendation. \n", "keywords": "Sequential Recommendation;Self-supervised Learning;Contrastive Learning;Model Augmentation", "primary_area": "", "supplementary_material": "", "author": "Zhiwei Liu;Yongjun Chen;Jia Li;Man Luo;Philip S. Yu;Caiming Xiong", "authorids": "~Zhiwei_Liu3;yongjun.chen@salesforce.com;~Jia_Li8;~Man_Luo2;~Philip_S._Yu1;~Caiming_Xiong1", "gender": ";;F;;M;M", "homepage": "https://sites.google.com/view/zhiwei-jim;;http://linkedin.com/in/venali;;https://cs.uic.edu/profiles/philip-yu/;http://cmxiong.com/", "dblp": "90/9499-1.html;;;;y/PhilipSYu;80/7282", "google_scholar": "https://scholar.google.com/citations?;;1f7xJcAAAAAJ;;D0lL1r0AAAAJ;vaSdahkAAAAJ", "orcid": "0000-0003-1525-1067;;;;0000-0002-3491-5968;", "linkedin": ";;;;;caiming-xiong-150a1417", "or_profile": "~Zhiwei_Liu3;yongjun.chen@salesforce.com;~Jia_Li8;~Man_Luo2;~Philip_S._Yu1;~Caiming_Xiong1", "aff": "University of Illinois, Chicago;;Salesforce Research;;University of Illinois Chicago;Salesforce Research", "aff_domain": "uic.edu;;salesforce.com;;uic.edu;salesforce.com", "position": "PhD student;;Research Engineer;;Full Professor;Research Scientist", "bibtex": "@misc{\nliu2022selfsupervised,\ntitle={Self-supervised Learning for Sequential Recommendation with Model Augmentation},\nauthor={Zhiwei Liu and Yongjun Chen and Jia Li and Man Luo and Philip S.
Yu and Caiming Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=4YOOO4ZNKM}\n}", "github": "", "project": "", "reviewers": "D5iZ;fdBG;1XwB", "site": "https://openreview.net/forum?id=4YOOO4ZNKM", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "46;53;39", "wc_summary_review": "35;169;65", "wc_main_review": "94;77;105", "wc_review": "175;299;209", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "22;30;51", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 46.0, 5.715476066494082 ], "wc_summary_review_avg": [ 89.66666666666667, 57.418541333691934 ], "wc_main_review_avg": [ 92.0, 11.51810169544733 ], "wc_review_avg": [ 227.66666666666666, 52.315283511502535 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 34.333333333333336, 12.229290885229428 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3676158799315621888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Illinois at Chicago;Salesforce", "aff_unique_dep": ";Salesforce Research", "aff_unique_url": "https://www.uic.edu;https://research.salesforce.com", "aff_unique_abbr": "UIC;Salesforce", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "When, Why, and Which Pretrained GANs Are Useful?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6642", "id": "4Ycr8oeCoIh", "poster": "", "openreview": "https://openreview.net/forum?id=4Ycr8oeCoIh", "slides": "https://iclr.cc/virtual/2022/poster/6642", "video": "https://iclr.cc/virtual/2022/poster/6642", "author_site": "Timofey Grigoryev, Andrey Voynov, Artem Babenko", "tldr": "", "abstract": "The literature has proposed several methods to finetune pretrained GANs on new datasets, which typically results in higher performance compared to training from scratch, especially in the limited-data regime. However, despite the apparent empirical benefits of GAN pretraining, its inner mechanisms were not analyzed in-depth, and understanding of its role is not entirely clear. Moreover, the essential practical details, e.g., selecting a proper pretrained GAN checkpoint, currently do not have rigorous grounding and are typically determined by trial and error. \n\nThis work aims to dissect the process of GAN finetuning. First, we show that initializing the GAN training process by a pretrained checkpoint primarily affects the model's coverage rather than the fidelity of individual samples. Second, we explicitly describe how pretrained generators and discriminators contribute to the finetuning process and explain the previous evidence on the importance of pretraining both of them. 
Finally, as an immediate practical benefit of our analysis, we describe a simple recipe to choose an appropriate GAN checkpoint that is the most suitable for finetuning to a particular target task. Importantly, for most of the target tasks, Imagenet-pretrained GAN, despite having poor visual quality, appears to be an excellent starting point for finetuning, resembling the typical pretraining scenario of discriminative computer vision models.", "keywords": "GAN;pretraining", "primary_area": "", "supplementary_material": "", "author": "Timofey Grigoryev;Andrey Voynov;Artem Babenko", "authorids": "~Timofey_Grigoryev1;~Andrey_Voynov1;~Artem_Babenko1", "gender": "M;M;M", "homepage": "https://www.hse.ru/org/persons/208494742;https://anvoynov.github.io/anvoynov/;", "dblp": ";255/6107;117/4834", "google_scholar": ";imBjSgUAAAAJ;q885d1wAAAAJ", "orcid": ";;0000-0002-1830-8252", "linkedin": ";;", "or_profile": "~Timofey_Grigoryev1;~Andrey_Voynov1;~Artem_Babenko1", "aff": "Moscow Institute of Physics and Technology;Yandex;Yandex", "aff_domain": "phystech.edu;yandex-team.ru;yandex-team.ru", "position": "PhD student;Researcher;Researcher", "bibtex": "@inproceedings{\ngrigoryev2022when,\ntitle={When, Why, and Which Pretrained {GAN}s Are Useful?},\nauthor={Timofey Grigoryev and Andrey Voynov and Artem Babenko},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4Ycr8oeCoIh}\n}", "github": "", "project": "", "reviewers": "iExS;BE8y;8amM", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;4;4", "correctness": "4;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "13;65;48", "wc_summary_review": "60;31;15", "wc_main_review": "120;403;398", "wc_review": "193;499;461", "wc_reply_reviewers": "0;102;0", "wc_reply_authors": "182;1023;297", "reply_reviewers": "0;2;0", "reply_authors": "1;3;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 42.0, 21.64871050817269 ], "wc_summary_review_avg": [ 35.333333333333336, 18.624953392931992 ], "wc_main_review_avg": [ 307.0, 132.24472264202709 ], "wc_review_avg": [ 384.3333333333333, 136.17961994692484 ], "wc_reply_reviewers_avg": [ 34.0, 48.08326112068523 ], "wc_reply_authors_avg": [ 500.6666666666667, 372.31736760755894 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": -0.9999999999999998, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1749765519247284522&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=4Ycr8oeCoIh", "email": "phystech.edu;yandex-team.ru;yandex-team.ru", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Moscow Institute of Physics and Technology;Yandex", "aff_unique_dep": ";", "aff_unique_url": "https://www.mipt.ru/en;https://yandex.com", "aff_unique_abbr": "MIPT;Yandex", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Russian Federation" }, { "id": "4ZEJ_Z18NH", 
"title": "Learning Perceptual Compression of Facial Video", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose in this paper a new paradigm for facial video compression. We leverage the generative capacity of GANs such as StyleGAN to represent and compress each video frame (intra compression), as well as the successive differences between frames (inter compression). Each frame is inverted in the latent space of StyleGAN, where the optimal compression is learned. To do so, a diffeomorphic latent representation is learned using a normalizing flows model, where an entropy model can be optimized for image coding. In addition, we propose a new perceptual loss that is more efficient than other counterparts (LPIPS, VGG16). Finally, an entropy model for inter coding with residual is also learned in the previously constructed latent space. Our method (SGANC) is simple, faster to train, and achieves competitive results for image and video coding compared to state-of-the-art codecs such as VTM, AV1, and recent deep learning techniques.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c7327f182ec3fd33b191f7c9c1e0c94618601e43.zip", "author": "Mustafa Shukor;Xu YAO;Bharath Bhushan Damodaran;Pierre Hellier", "authorids": "~Mustafa_Shukor1;~Xu_YAO1;~Bharath_Bhushan_Damodaran1;~Pierre_Hellier1", "gender": "M;F;M;M", "homepage": "https://twitter.com/MustafaShukor1;https://xu-yao.github.io/;;", "dblp": ";;189/3814;", "google_scholar": "lhp9mRgAAAAJ;4DjVtSgAAAAJ;DarhRtEAAAAJ;https://scholar.google.fr/citations?user=U2BX6Q8AAAAJ", "orcid": ";;;0000-0003-3603-2381", "linkedin": ";xu-yao-05a303b5;bbdamodaran/;", "or_profile": "~Mustafa_Shukor1;~Xu_YAO1;~Bharath_Bhushan_Damodaran1;~Pierre_Hellier1", "aff": "Universit\u00e9 Pierre et Marie Curie - Paris 6, Sorbonne Universit\u00e9 - Facult\u00e9 des Sciences (Paris VI);T\u00e9l\u00e9com Paris;Interdigital R&D;Interdigital", "aff_domain": "isir.upmc.fr;telecom-paris.fr;interdigital.com;interdigital.com", "position": "Researcher;PhD student;Researcher;principal scientist", "bibtex": "@misc{\nshukor2022learning,\ntitle={Learning Perceptual Compression of Facial Video},\nauthor={Mustafa Shukor and Xu YAO and Bharath Bhushan Damodaran and Pierre Hellier},\nyear={2022},\nurl={https://openreview.net/forum?id=4ZEJ_Z18NH}\n}", "github": "", "project": "", "reviewers": "jUoe;wd3h;4GZc", "site": "https://openreview.net/forum?id=4ZEJ_Z18NH", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "3;4;4", "correctness": "4;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "102;117;147", "wc_summary_review": "27;35;110", "wc_main_review": "60;433;534", "wc_review": "189;585;791", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 122.0, 18.708286933869708 ], "wc_summary_review_avg": [ 57.333333333333336, 37.38389433373087 ], "wc_main_review_avg": [ 342.3333333333333, 203.85343318723434 ], "wc_review_avg": [ 521.6666666666666, 249.81237403744078 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], 
"replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9449111825230683, "corr_recommendation_correctness": -0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BDa7RHtfRq4J:scholar.google.com/&scioq=Learning+Perceptual+Compression+of+Facial+Video&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Universit\u00e9 Pierre et Marie Curie - Paris 6;T\u00e9l\u00e9com Paris;InterDigital", "aff_unique_dep": "Facult\u00e9 des Sciences;;R&D", "aff_unique_url": "https://www.upmc.fr;https://www.telecom-paris.fr;https://www.interdigital.com", "aff_unique_abbr": "UPMC;T\u00e9l\u00e9com Paris;Interdigital", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "France;United States" }, { "id": "4j4qVy8OQA1", "title": "A Koopman Approach to Understanding Sequence Neural Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning models are often treated as \"black boxes\". Existing approaches for understanding the decision mechanisms of neural networks provide limited explanations or depend on local theories. Recently, a data-driven framework based on Koopman theory was developed for the analysis of nonlinear dynamical systems. In this paper, we introduce a new approach to understanding trained sequence neural models: the Koopman Analysis of Neural Networks (KANN) method. At the core of our method lies the Koopman operator, which is linear, yet it encodes the dominant features of the network latent dynamics. Moreover, its eigenvectors and eigenvalues facilitate understanding: in the sentiment analysis problem, the eigenvectors highlight positive and negative n-grams; and, in the ECG classification challenge, the eigenvectors capture the dominant features of the normal beat signal.", "keywords": "Koopman methods;sequence neural models;understanding deep learning", "primary_area": "", "supplementary_material": "/attachment/2abde8d63d3722368a734e2d1390ccc7f768db63.zip", "author": "Ilan Naiman;Omri Azencot", "authorids": "~Ilan_Naiman1;~Omri_Azencot1", "gender": "M;Unspecified", "homepage": "https://www.linkedin.com/in/ilan-naiman-80071a190;http://omriazencot.com", "dblp": "285/4824;132/3985.html", "google_scholar": "Fglytk8AAAAJ;https://scholar.google.co.il/citations?user=MEGuRmAAAAAJ", "orcid": ";", "linkedin": "ilan-naiman-80071a190;omri-azencot-a8812417/", "or_profile": "~Ilan_Naiman1;~Omri_Azencot1", "aff": "Ben Gurion University of the Negev, Technion;Ben-Gurion University of the Negev", "aff_domain": "bgu.ac.il;bgu.ac.il", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nnaiman2022a,\ntitle={A Koopman Approach to Understanding Sequence Neural Models},\nauthor={Ilan Naiman and Omri Azencot},\nyear={2022},\nurl={https://openreview.net/forum?id=4j4qVy8OQA1}\n}", "github": "", "project": "", "reviewers": "Dka9;XyzC;6ZvH;Ts2w", "site": "https://openreview.net/forum?id=4j4qVy8OQA1", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;4;3", "correctness": "4;2;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "40;56;72;79", "wc_summary_review": "59;42;32;92", "wc_main_review": "115;951;157;271", "wc_review": "214;1049;261;442", "wc_reply_reviewers": "1471;332;0;0", "wc_reply_authors": "651;946;337;600", "reply_reviewers": "2;1;0;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], 
"confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 61.75, 15.07274029498286 ], "wc_summary_review_avg": [ 56.25, 22.78568629644497 ], "wc_main_review_avg": [ 373.5, 338.27023221087603 ], "wc_review_avg": [ 491.5, 332.9388081915354 ], "wc_reply_reviewers_avg": [ 450.75, 604.4341878980705 ], "wc_reply_authors_avg": [ 633.5, 216.21574873260272 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.4061811972299616, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6025283231881023854&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Ben Gurion University of the Negev;Ben-Gurion University of the Negev", "aff_unique_dep": ";", "aff_unique_url": "https://www.bgu.ac.il;https://www.bgu.ac.il", "aff_unique_abbr": "BGU;BGU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "4jUmjIoTz2", "title": "Collaborate to Defend Against Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarially robust learning methods require invariant predictions to a small neighborhood of its natural inputs, thus often encountering insufficient model capacity. Learning multiple models in an ensemble can mitigate this insufficiency, further improving both generalization and robustness. However, an ensemble still wastes the limited capacity of multiple models. To optimally utilizing the limited capacity, this paper proposes to learn a collaboration among multiple sub-models. Compared with the ensemble, the collaboration enables the possibility of correct predictions even if there exists a single correct sub-model. Besides, learning a collaboration could enable every sub-model to fit its own vulnerability area and reserve the rest of the sub-models to fit other vulnerability areas. To implement the idea, we propose a collaboration framework---CDA$^2$ the abbreviation for Collaborate to Defend against Adversarial Attacks. CDA$^2$ could effectively minimize the vulnerability overlap of all sub-models and then choose a representative sub-model to make correct predictions. 
Empirical experiments verify that CDA$^2$ outperforms various ensemble methods against black-box and white-box adversarial attacks.", "keywords": "adversarial defense;collaboration;ensemble.", "primary_area": "", "supplementary_material": "/attachment/83770bd8f56280e88b2dc738358246cda958b492.zip", "author": "Sen Cui;Jingfeng Zhang;Jian Liang;Masashi Sugiyama;Changshui Zhang", "authorids": "~Sen_Cui1;~Jingfeng_Zhang1;~Jian_Liang3;~Masashi_Sugiyama1;~Changshui_Zhang2", "gender": "M;M;M;M;M", "homepage": ";https://zjfheart.github.io;;http://www.ms.k.u-tokyo.ac.jp/sugi/;http://bigeye.au.tsinghua.edu.cn/english/Introduction.html", "dblp": "267/5483;227/2664.html;19/2208;35/1228;z/ChangshuiZhang", "google_scholar": "UzQuG1UAAAAJ;NS0P1FkAAAAJ;mrunnpoAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;GL9M37YAAAAJ", "orcid": ";0000-0003-3491-8074;;0000-0001-6658-6743;", "linkedin": ";;;;", "or_profile": "~Sen_Cui1;~Jingfeng_Zhang1;~Jian_Liang3;~Masashi_Sugiyama1;~Changshui_Zhang2", "aff": "Tsinghua University;RIKEN;Alibaba Group;The University of Tokyo;Tsinghua University", "aff_domain": "tsinghua.edu.cn;riken.jp;alibaba-inc.com;u-tokyo.ac.jp;mail.tsinghua.edu.cn", "position": "PhD student;Postdoc;Senior Algorithm Engineer;Full Professor;Full Professor", "bibtex": "@misc{\ncui2022collaborate,\ntitle={Collaborate to Defend Against Adversarial Attacks},\nauthor={Sen Cui and Jingfeng Zhang and Jian Liang and Masashi Sugiyama and Changshui Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=4jUmjIoTz2}\n}", "github": "", "project": "", "reviewers": "g3Dc;qeVf;Koh8", "site": "https://openreview.net/forum?id=4jUmjIoTz2", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "5;3;4", "correctness": "1;4;3", "technical_novelty": "3;3;4", "empirical_novelty": "2;3;4", "wc_summary_paper": "149;52;103", "wc_summary_review": "30;42;13", "wc_main_review": "456;62;476", "wc_review": "635;156;592", "wc_reply_reviewers": "281;0;0", "wc_reply_authors": "2147;295;1670", "reply_reviewers": "1;0;0", "reply_authors": "5;1;3", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 101.33333333333333, 39.617616732402716 ], "wc_summary_review_avg": [ 28.333333333333332, 11.897712198383164 ], "wc_main_review_avg": [ 331.3333333333333, 190.62237247733773 ], "wc_review_avg": [ 461.0, 216.38083710593844 ], "wc_reply_reviewers_avg": [ 93.66666666666667, 132.4646703422799 ], "wc_reply_authors_avg": [ 1370.6666666666667, 785.1438650902705 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5960395606792698, "corr_recommendation_correctness": 0.7370434740955019, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RCkb2ZzcO2IJ:scholar.google.com/&scioq=Collaborate+to+Defend+Against+Adversarial+Attacks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Tsinghua University;RIKEN;Alibaba Group;University of Tokyo", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.riken.jp;https://www.alibaba.com;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "THU;RIKEN;Alibaba;UTokyo", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "China;Japan" }, { "id": "4l5iO9eoh3f", "title": "Supervised Permutation Invariant Networks for solving the CVRP with bounded fleet size", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning to solve combinatorial optimization problems, such as the vehicle routing problem, offers great computational advantages over classical operation research solvers and heuristics. The recently developed deep reinforcement learning approaches either improve an initially given solution iteratively or sequentially construct a set of individual tours. \nHowever, all existing learning-based approaches are not able to work for a fixed number of vehicles and thus bypass the NP-hardness of the original problem. On the other hand, this makes them less suitable for real applications, as many logistic service providers rely on solutions provided for a specific bounded fleet size and cannot accommodate short term changes to the number of vehicles.\nIn contrast we propose a powerful supervised deep learning framework that constructs a complete tour plan from scratch while respecting an apriori fixed number of vehicles. \nIn combination with an efficient post-processing scheme, our supervised approach is not only much faster and easier to train but also achieves competitive results that incorporate the practical aspect of vehicle costs.\nIn thorough controlled experiments we re-evaluate and compare our method to multiple state-of-the-art approaches where we demonstrate stable performance and shed some light on existent inconsistencies in the experimentation protocols of the related work.", "keywords": "Deep Learning;Combinatorial Optimization;Vehicle Routing", "primary_area": "", "supplementary_material": "", "author": "Daniela Thyssens;Jonas Falkner;Lars Schmidt-Thieme", "authorids": "~Daniela_Thyssens1;~Jonas_Falkner1;~Lars_Schmidt-Thieme1", "gender": ";;M", "homepage": "https://www.ismll.uni-hildesheim.de/personen/thyssensd.html;;https://www.ismll.uni-hildesheim.de/personen/lst_en.html", "dblp": "274/2865;;s/LarsSchmidtThieme", "google_scholar": "Jg5ZIaoAAAAJ;;https://scholar.google.de/citations?user=l3taTdYAAAAJ", "orcid": ";;0000-0001-5729-6023", "linkedin": ";;", "or_profile": "~Daniela_Thyssens1;~Jonas_Falkner1;~Lars_Schmidt-Thieme1", "aff": "Information Systems and Machine Learning Lab, University of Hildesheim;;University of Hildesheim", "aff_domain": "ismll.uni-hildesheim.de;;uni-hildesheim.de", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nthyssens2022supervised,\ntitle={Supervised Permutation Invariant Networks for solving the {CVRP} with bounded fleet size},\nauthor={Daniela Thyssens and Jonas Falkner and Lars Schmidt-Thieme},\nyear={2022},\nurl={https://openreview.net/forum?id=4l5iO9eoh3f}\n}", "github": "", "project": "", "reviewers": "gFqt;DMGQ;e8jg;hmYm", "site": "https://openreview.net/forum?id=4l5iO9eoh3f", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;4;3", "correctness": "2;3;2;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "84;131;54;91", "wc_summary_review": "47;120;176;42", "wc_main_review": "322;284;623;303", "wc_review": "453;535;853;436", "wc_reply_reviewers": "54;165;374;102", "wc_reply_authors": "359;484;730;217", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 
0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.0, 27.44995446262161 ], "wc_summary_review_avg": [ 96.25, 55.43633736097651 ], "wc_main_review_avg": [ 383.0, 139.21386425209235 ], "wc_review_avg": [ 569.25, 168.04519481377622 ], "wc_reply_reviewers_avg": [ 173.75, 122.13184474165614 ], "wc_reply_authors_avg": [ 447.5, 188.48143144617723 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.2294157338705618, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7625909024395928317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Hildesheim", "aff_unique_dep": "Information Systems and Machine Learning Lab", "aff_unique_url": "https://www.uni-hildesheim.de", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "4l9eWfCM3Jb", "title": "Jointly Learning Identification and Control for Few-Shot Policy Adaptation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Complex dynamical systems are challenging to model and control. Especially when not deployed in controlled conditions, they might be subject to disturbances that cannot be predicted in advance, \\emph{e.g.} wind, a payload, or environment-specific forces. Adapting to such disturbances with a limited sample budget is difficult, especially for systems with many degrees of freedom. This paper introduces a theoretical framework to model this problem. We show that the expected error of a sensorimotor controller can be bounded by two components: the optimality of the controller and the domain gap between training and testing due to unmodelled dynamic effects. These components are usually minimized separately; the former with online or offline optimization, the latter with system identification. Motivated by this observation, we propose a differentiable programming approach to \\emph{jointly} minimize model and control errors with gradient descent. Similar to model-based methods, our algorithm learns from prior knowledge about the system, but \\emph{grounds} the model to account for observed disturbances, thereby favouring sample efficiency. Yet, it maintains the flexibility of model-free methods, which can be applied to generic systems with arbitrary inputs. 
We evaluate our approach on several complex systems and tasks, and experimentally analyze the advantages over model-free and model-based methods in terms of performance and sample efficiency.", "keywords": "policy learning;control;system identification;few-shot domain adaptation", "primary_area": "", "supplementary_material": "/attachment/90ad2763481437a618066f1c70d35e91f5c97e05.zip", "author": "Nina Wiedemann;Antonio Loquercio;Matthias M\u00fcller;Rene Ranftl;Davide Scaramuzza", "authorids": "~Nina_Wiedemann1;~Antonio_Loquercio1;~Matthias_M\u00fcller1;~Rene_Ranftl1;~Davide_Scaramuzza1", "gender": "F;M;;;", "homepage": ";https://antonilo.github.io/;https://matthias.pw;;", "dblp": "247/3414;203/5131;169/4686-1;;", "google_scholar": "qC1JKzoAAAAJ;pbmjtZsAAAAJ;AeMLOMEAAAAJ;;", "orcid": "0000-0002-8160-7634;;;;", "linkedin": "https://ch.linkedin.com/in/nina-wiedemann-79866917b;;;;", "or_profile": "~Nina_Wiedemann1;~Antonio_Loquercio1;~Matthias_M\u00fcller1;~Rene_Ranftl1;~Davide_Scaramuzza1", "aff": "Department of Informatics, University of Zurich, University of Zurich;University of California, Berkeley;Intel;;", "aff_domain": "ifi.uzh.ch;berkeley.edu;intel.com;;", "position": "Intern;Postdoc;Researcher;;", "bibtex": "@misc{\nwiedemann2022jointly,\ntitle={Jointly Learning Identification and Control for Few-Shot Policy Adaptation},\nauthor={Nina Wiedemann and Antonio Loquercio and Matthias M{\\\"u}ller and Rene Ranftl and Davide Scaramuzza},\nyear={2022},\nurl={https://openreview.net/forum?id=4l9eWfCM3Jb}\n}", "github": "", "project": "", "reviewers": "pCFN;vGB4;SDWs", "site": "https://openreview.net/forum?id=4l9eWfCM3Jb", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;3;4", "correctness": "2;3;2", "technical_novelty": "2;1;1", "empirical_novelty": "2;2;1", "wc_summary_paper": "43;74;73", "wc_summary_review": "26;846;37", "wc_main_review": "385;200;742", "wc_review": "454;1120;852", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 63.333333333333336, 14.38363267359428 ], "wc_summary_review_avg": [ 303.0, 383.98524277199334 ], "wc_main_review_avg": [ 442.3333333333333, 224.9538224218967 ], "wc_review_avg": [ 808.6666666666666, 273.61448954485013 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:w11-BoIIh0kJ:scholar.google.com/&scioq=Jointly+Learning+Identification+and+Control+for+Few-Shot+Policy+Adaptation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Zurich;University of California, Berkeley;Intel", "aff_unique_dep": "Department of Informatics;;Intel Corporation", "aff_unique_url": "https://www.uzh.ch;https://www.berkeley.edu;https://www.intel.com", "aff_unique_abbr": "UZH;UC Berkeley;Intel", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Switzerland;United States" }, { 
"id": "4lLyoISm9M", "title": "Range-Net: A High Precision Neural SVD", "track": "main", "status": "Reject", "tldr": "", "abstract": "For Big Data applications, computing a rank-$r$ Singular Value Decomposition (SVD) is restrictive due to the main memory requirements. Recently introduced streaming Randomized SVD schemes work under the restrictive assumption that the singular value spectrum of the data has an exponential decay. This is seldom true for any practical data. Further, the approximation errors in the singular vectors and values are high due to the randomized projection. We present Range-Net as a low memory alternative to rank-$r$ SVD that satisfies the lower bound on tail-energy given by Eckart-Young-Mirsky (EYM) theorem at machine precision. Range-Net is a deterministic two-stage neural optimization approach with random initialization, where the memory requirement depends explicitly on the feature dimension and desired rank, independent of the sample dimension. The data samples are read in a streaming manner with the network minimization problem converging to the desired rank-$r$ approximation. Range-Net is fully interpretable where all the network outputs and weights have a specific meaning. We provide theoretical guarantees that Range-Net extracted SVD factors satisfy EYM tail-energy lower bound with numerical experiments on real datasets at various scales that confirm these bounds. A comparison against the state-of-the-art streaming Randomized SVD shows that Range-Net is six orders of magnitude more accurate in terms of tail energy while correctly extracting the singular values and vectors.", "keywords": "SVD;Eigen;Interpretable;Neural Nets;Streaming;Big Data", "primary_area": "", "supplementary_material": "/attachment/fb5954a4a370ae9a2d625d2789ccb3b324dc1fa1.zip", "author": "Soumyajit Gupta;Gurpreet Singh;Clint N. Dawson", "authorids": "~Soumyajit_Gupta1;~Gurpreet_Singh3;~Clint_N._Dawson1", "gender": "M;M;", "homepage": ";;", "dblp": ";;", "google_scholar": ";ou3UDckAAAAJ;", "orcid": ";;", "linkedin": "soumyajit-gupta/;grpt-singh;", "or_profile": "~Soumyajit_Gupta1;~Gurpreet_Singh3;~Clint_N._Dawson1", "aff": "University of Texas, Austin;University of Texas at Austin;", "aff_domain": "utexas.edu;utexas.edu;", "position": "PhD student;Research Scientist;", "bibtex": "@misc{\ngupta2022rangenet,\ntitle={Range-Net: A High Precision Neural {SVD}},\nauthor={Soumyajit Gupta and Gurpreet Singh and Clint N. 
Dawson},\nyear={2022},\nurl={https://openreview.net/forum?id=4lLyoISm9M}\n}", "github": "", "project": "", "reviewers": "MNXm;YsEC;uKtm;vVFi", "site": "https://openreview.net/forum?id=4lLyoISm9M", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;2;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "47;61;101;78", "wc_summary_review": "31;69;19;45", "wc_main_review": "194;192;467;367", "wc_review": "272;322;587;490", "wc_reply_reviewers": "0;0;0;39", "wc_reply_authors": "796;663;750;828", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.75, 20.141685629559408 ], "wc_summary_review_avg": [ 41.0, 18.601075237738275 ], "wc_main_review_avg": [ 305.0, 117.44998935717278 ], "wc_review_avg": [ 417.75, 126.76429899620791 ], "wc_reply_reviewers_avg": [ 9.75, 16.887495373796554 ], "wc_reply_authors_avg": [ 759.25, 62.102234259324355 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7181848464596079, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AewUd60esHQJ:scholar.google.com/&scioq=Range-Net:+A+High+Precision+Neural+SVD&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "4o1xPXaS4X", "title": "Fooling Adversarial Training with Induction Noise", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial training is widely believed to be a reliable approach to improve model robustness against adversarial attacks. However, in this paper, we show that when trained on one type of poisoned data, adversarial training can also be fooled into catastrophic behavior, e.g., $<1\\%$ robust test accuracy with $>90\\%$ robust training accuracy on the CIFAR-10 dataset. Previously, other types of noise poisoned into the training data have successfully fooled standard training ($15.8\\%$ standard test accuracy with $99.9\\%$ standard training accuracy on the CIFAR-10 dataset), but their poisonings can be easily removed when adopting adversarial training. Therefore, we aim to design a new type of inducing noise, named ADVIN, which is an irremovable poisoning of training data. ADVIN can not only degrade the robustness of adversarial training by a large margin, for example, from $51.7\\%$ to $0.57\\%$ on the CIFAR-10 dataset, but also be effective for fooling standard training ($13.1\\%$ standard test accuracy with $100\\%$ standard training accuracy). Additionally, ADVIN can be applied to preventing personal data (like selfies) from being exploited without authorization under either standard or adversarial training.
", "keywords": "Data poisoning;adversarial training;data privacy", "primary_area": "", "supplementary_material": "", "author": "Zhirui Wang;Yifei Wang;Yisen Wang", "authorids": "~Zhirui_Wang1;~Yifei_Wang1;~Yisen_Wang1", "gender": "M;M;M", "homepage": "https://github.com/jeferay;https://yifeiwang77.com;https://yisenwang.github.io/", "dblp": ";00/555-1;172/1346-1", "google_scholar": ";-CLy6YsAAAAJ;uMWPDboAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhirui_Wang1;~Yifei_Wang1;~Yisen_Wang1", "aff": "Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;PhD student;Assistant Professor", "bibtex": "@misc{\nwang2022fooling,\ntitle={Fooling Adversarial Training with Induction Noise},\nauthor={Zhirui Wang and Yifei Wang and Yisen Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=4o1xPXaS4X}\n}", "github": "", "project": "", "reviewers": "H39o;eute;r3Un;Hbey", "site": "https://openreview.net/forum?id=4o1xPXaS4X", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "277;46;45;48", "wc_summary_review": "45;40;63;45", "wc_main_review": "211;359;237;375", "wc_review": "533;445;345;468", "wc_reply_reviewers": "95;559;32;45", "wc_reply_authors": "914;2484;245;892", "reply_reviewers": "2;2;1;1", "reply_authors": "3;6;1;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 104.0, 99.88743664745833 ], "wc_summary_review_avg": [ 48.25, 8.757139944068497 ], "wc_main_review_avg": [ 295.5, 72.31009611388993 ], "wc_review_avg": [ 447.75, 67.53286237084876 ], "wc_reply_reviewers_avg": [ 182.75, 218.49756863635807 ], "wc_reply_authors_avg": [ 1133.75, 824.588495857176 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 3.5, 1.8027756377319946 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9373800430483313870&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Graph-less Neural Networks: Teaching Old MLPs New Tricks Via Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6711", "id": "4p6_5HBWPCw", "poster": "", "openreview": "https://openreview.net/forum?id=4p6_5HBWPCw", "slides": "https://iclr.cc/virtual/2022/poster/6711", "video": "https://iclr.cc/virtual/2022/poster/6711", "author_site": "Shichang Zhang, Yozen Liu, Yizhou Sun, Neil Shah", "tldr": "", "abstract": "Graph Neural Networks (GNNs) are popular for graph machine learning and have shown great results on wide node classification tasks. Yet, they are less popular for practical deployments in the industry owing to their scalability challenges incurred by data dependency. 
Namely, GNN inference depends on neighbor nodes multiple hops away from the target, and fetching them burdens latency-constrained applications. Existing inference acceleration methods like pruning and quantization can speed up GNNs by reducing Multiplication-and-ACcumulation (MAC) operations, but the improvements are limited given the data dependency is not resolved. Conversely, multi-layer perceptrons (MLPs) have no graph dependency and infer much faster than GNNs, even though they are less accurate than GNNs for node classification in general. Motivated by these complementary strengths and weaknesses, we bring GNNs and MLPs together via knowledge distillation (KD). Our work shows that the performance of MLPs can be improved by large margins with GNN KD. We call the distilled MLPs Graph-less Neural Networks (GLNNs) as they have no inference graph dependency. We show that GLNNs with competitive accuracy infer faster than GNNs by 146X-273X and faster than other acceleration methods by 14X-27X. Under a production setting involving both transductive and inductive predictions across 7 datasets, GLNN accuracies improve over stand-alone MLPs by 12.36% on average and match GNNs on 6/7 datasets. Comprehensive analysis shows when and why GLNNs can achieve competitive accuracies to GNNs and suggests GLNN as a handy choice for latency-constrained applications. ", "keywords": "Graph Neural Networks;Distillation;Node Classification;Model Inference Acceleration", "primary_area": "", "supplementary_material": "/attachment/8f427cff78ced924a327c553f9c963658ed20a3f.zip", "author": "Shichang Zhang;Yozen Liu;Yizhou Sun;Neil Shah", "authorids": "~Shichang_Zhang2;~Yozen_Liu1;~Yizhou_Sun1;~Neil_Shah2", "gender": "M;;F;M", "homepage": "https://shichangzh.github.io/;https://www.linkedin.com/in/yozen-liu-531a67130/;http://web.cs.ucla.edu/~yzsun/;http://nshah.net", "dblp": "234/4118;242/8056.html;37/3868;71/7771", "google_scholar": "TYqG0x4AAAAJ;i3U2JjEAAAAJ;https://scholar.google.com.tw/citations?user=TQgOjK0AAAAJ;Qut69OgAAAAJ", "orcid": "0000-0003-0954-5018;;;0000-0003-3261-8430", "linkedin": "shichang-zhang-4430a4106/;;;", "or_profile": "~Shichang_Zhang2;~Yozen_Liu1;~Yizhou_Sun1;~Neil_Shah2", "aff": "University of California, Los Angeles;Snap Inc.;University of California, Los Angeles;Snap Inc.", "aff_domain": "cs.ucla.edu;snapchat.com;ucla.edu;snap.com", "position": "PhD student;Researcher;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nzhang2022graphless,\ntitle={Graph-less Neural Networks: Teaching Old {MLP}s New Tricks Via Distillation},\nauthor={Shichang Zhang and Yozen Liu and Yizhou Sun and Neil Shah},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4p6_5HBWPCw}\n}", "github": "", "project": "", "reviewers": "5DSq;tKyT;UK5z;fkQB", "pdf_size": 0, "recommendation": "3;8;8;10", "confidence": "5;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "1;3;3;4", "empirical_novelty": "1;3;3;4", "wc_summary_paper": "68;312;72;28", "wc_summary_review": "73;77;20;32", "wc_main_review": "579;585;312;208", "wc_review": "720;974;404;268", "wc_reply_reviewers": "663;36;32;35", "wc_reply_authors": "3135;868;991;580", "reply_reviewers": "1;1;1;1", "reply_authors": "5;2;3;2", "recommendation_avg": [ 7.25, 2.5860201081971503 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.75, 1.0897247358851685 ], 
"wc_summary_paper_avg": [ 120.0, 112.17842929904127 ], "wc_summary_review_avg": [ 50.5, 24.904818810824544 ], "wc_main_review_avg": [ 421.0, 165.15901428623266 ], "wc_review_avg": [ 591.5, 275.0577212150206 ], "wc_reply_reviewers_avg": [ 191.5, 272.22463150861273 ], "wc_reply_authors_avg": [ 1393.5, 1016.4596647186744 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6835859270246631, "corr_recommendation_correctness": 0.6767155423319645, "gs_citation": 258, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14166973652994088038&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=4p6_5HBWPCw", "email": "cs.ucla.edu;snapchat.com;ucla.edu;snap.com", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of California, Los Angeles;Snap Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.snapinc.com", "aff_unique_abbr": "UCLA;Snap", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "4pijrj4H_B", "title": "Fair Node Representation Learning via Adaptive Data Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Node representation learning has demonstrated its efficacy for various applications on graphs, which leads to increasing attention towards the area. However, fairness is a largely under-explored territory within the field, which may lead to biased results towards underrepresented groups in ensuing tasks. To this end, this work theoretically explains the sources of bias in node representations obtained via Graph Neural Networks (GNNs). Our analysis reveals that both nodal features and graph structure lead to bias in the obtained representations. Building upon the analysis, fairness-aware data augmentation frameworks on nodal features and graph structure are developed to reduce the intrinsic bias. Our analysis and proposed schemes can be readily employed to enhance the fairness of various GNN-based learning mechanisms. Extensive experiments on node classification and link prediction are carried out over real networks in the context of graph contrastive learning. Comparison with multiple benchmarks demonstrates that the proposed augmentation strategies can improve fairness in terms of statistical parity and equal opportunity, while providing comparable utility to state-of-the-art contrastive methods. 
", "keywords": "Fair node representations;fairness-aware graph data augmentations;unsupervised node representation learning;graph contrastive learning", "primary_area": "", "supplementary_material": "/attachment/442df7782d66c8400dd29d781ec50378f96360a5.zip", "author": "Oyku Deniz Kose;Yanning Shen", "authorids": "~Oyku_Deniz_Kose1;~Yanning_Shen1", "gender": "F;F", "homepage": ";https://sites.google.com/uci.edu/yanning-shen/home", "dblp": "263/4808;120/7392.html", "google_scholar": "mIURm58AAAAJ;MfzntAIAAAAJ", "orcid": "0000-0002-8685-2161;", "linkedin": ";", "or_profile": "~Oyku_Deniz_Kose1;~Yanning_Shen1", "aff": "University of California, Irvine;University of California, Irvine", "aff_domain": "uci.edu;uci.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nkose2022fair,\ntitle={Fair Node Representation Learning via Adaptive Data Augmentation},\nauthor={Oyku Deniz Kose and Yanning Shen},\nyear={2022},\nurl={https://openreview.net/forum?id=4pijrj4H_B}\n}", "github": "", "project": "", "reviewers": "xMz4;FSrq;gszv;LHQM", "site": "https://openreview.net/forum?id=4pijrj4H_B", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;3;4;2", "correctness": "2;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "53;50;63;86", "wc_summary_review": "31;35;31;23", "wc_main_review": "600;140;409;348", "wc_review": "684;225;503;457", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "2785;1206;1616;728", "reply_reviewers": "0;0;0;0", "reply_authors": "5;2;4;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 14.124446891825535 ], "wc_summary_review_avg": [ 30.0, 4.358898943540674 ], "wc_main_review_avg": [ 374.25, 164.11333736171474 ], "wc_review_avg": [ 467.25, 163.59152636979704 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1583.75, 761.420506356376 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9393364366277244, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17422208843441512950&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Irvine", "aff_unique_dep": "", "aff_unique_url": "https://www.uci.edu", "aff_unique_abbr": "UCI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Query Embedding on Hyper-Relational Knowledge Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6321", "id": "4rLw09TgRw9", "poster": "", "openreview": "https://openreview.net/forum?id=4rLw09TgRw9", "slides": "https://iclr.cc/virtual/2022/poster/6321", "video": "https://iclr.cc/virtual/2022/poster/6321", "author_site": "Dimitrios Alivanistos, Max Berrendorf, Michael Cochez, Mikhail Galkin", "tldr": "", "abstract": "Multi-hop logical reasoning is an established problem in the field of representation learning on knowledge graphs (KGs). It subsumes both one-hop link prediction as well as other more complex types of logical queries. 
Existing algorithms operate only on classical, triple-based graphs, whereas modern KGs often employ a hyper-relational modeling paradigm. In this paradigm, typed edges may have several key-value pairs known as qualifiers that provide fine-grained context for facts. In queries, this context modifies the meaning of relations, and usually reduces the answer set. Hyper-relational queries are often observed in real-world KG applications, and existing approaches for approximate query answering cannot make use of qualifier pairs. In this work, we bridge this gap and extend the multi-hop reasoning problem to hyper-relational KGs allowing to tackle this new type of complex queries. Building upon recent advancements in Graph Neural Networks and query embedding techniques, we study how to embed and answer hyper-relational conjunctive queries. Besides that, we propose a method to answer such queries and demonstrate in our experiments that qualifiers improve query answering on a diverse set of query patterns.", "keywords": "Query embedding;Approximate Query Answering;Graph Neural Network;Hyper-relational Graph;Knowledge Graph", "primary_area": "", "supplementary_material": "", "author": "Dimitrios Alivanistos;Max Berrendorf;Michael Cochez;Mikhail Galkin", "authorids": "~Dimitrios_Alivanistos1;~Max_Berrendorf1;~Michael_Cochez2;~Mikhail_Galkin1", "gender": "M;;M;M", "homepage": "https://dimitrisalivas.github.io/;https://www.dbs.ifi.lmu.de/cms/personen/mitarbeiter/berrendorf/index.html;https://www.cochez.nl;https://migalkin.github.io/", "dblp": ";https://dblp.uni-trier.de/pers/hd/b/Berrendorf:Max;83/11448;160/8154", "google_scholar": "snZnIqEAAAAJ;h25eyTIAAAAJ;https://scholar.google.fi/citations?user=JuZrOtoAAAAJ;yfYRbG4AAAAJ", "orcid": ";0000-0001-9724-4009;0000-0001-5726-4638;", "linkedin": ";;michaelcochez/;", "or_profile": "~Dimitrios_Alivanistos1;~Max_Berrendorf1;~Michael_Cochez2;~Mikhail_Galkin1", "aff": "Vrije Universiteit Amsterdam;Institut f\u00fcr Informatik;VU Amsterdam;Mila & McGill University", "aff_domain": "vu.nl;lmu.de;vu.nl;mila.quebec", "position": "PhD student;PhD student;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nalivanistos2022query,\ntitle={Query Embedding on Hyper-Relational Knowledge Graphs},\nauthor={Dimitrios Alivanistos and Max Berrendorf and Michael Cochez and Mikhail Galkin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=4rLw09TgRw9}\n}", "github": "", "project": "", "reviewers": "GQAR;iDXR;frRt;wJVm;Bute", "pdf_size": 0, "recommendation": "5;5;6;6;8", "confidence": "2;4;4;3;4", "correctness": "3;3;3;3;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "53;83;97;72;90", "wc_summary_review": "72;23;86;0;136", "wc_main_review": "146;315;266;156;157", "wc_review": "271;421;449;228;383", "wc_reply_reviewers": "0;0;0;12;137", "wc_reply_authors": "539;1036;821;441;671", "reply_reviewers": "0;0;0;1;1", "reply_authors": "1;3;2;1;1", "recommendation_avg": [ 6.0, 1.0954451150103321 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 79.0, 15.401298646542765 ], "wc_summary_review_avg": [ 63.4, 47.97332592180784 ], "wc_main_review_avg": [ 208.0, 69.2271622992016 ], "wc_review_avg": [ 350.4, 86.08739745165956 ], "wc_reply_reviewers_avg": [ 29.8, 53.80111523007679 ], "wc_reply_authors_avg": [ 701.6, 
210.41254715439382 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4564354645876385, "corr_recommendation_correctness": 0.912870929175277, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4690980256531947393&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=4rLw09TgRw9", "email": "vu.nl;lmu.de;vu.nl;mila.quebec", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Vrije Universiteit Amsterdam;Institut f\u00fcr Informatik;McGill University", "aff_unique_dep": ";Department of Computer Science;Mila", "aff_unique_url": "https://www.vu.nl;;https://www.mcgill.ca", "aff_unique_abbr": "VU Amsterdam;;McGill", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amsterdam", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "Netherlands;Germany;Canada" }, { "id": "4sz0AcJ8HUB", "title": "SERCNN: Stacked Embedding Recurrent Convolutional Neural Network in Depression Detection on Twitter", "track": "main", "status": "Reject", "tldr": "", "abstract": "The conventional approach of self-reporting-based screening for depression is not scalable, is expensive, and requires one to be fully aware of one's mental health. Motivated by previous studies that demonstrated great potential for using social media posts to monitor and predict one's mental health status, this study utilizes natural language processing and machine learning techniques on social media data to predict one's risk of depression. Most existing works utilize handcrafted features, and the adoption of deep learning in this domain is still lacking. Social media texts are often unstructured, ill-formed, and contain typos, making handcrafted features and conventional feature extraction methods inefficient. Moreover, prediction models built on these features often require a high number of posts per individual for accurate predictions. Therefore, this study proposes a Stacked Embedding Recurrent Convolutional Neural Network (SERCNN) for a more optimized prediction that has a better trade-off between the number of posts and accuracy. Feature vectors of two widely available pretrained embeddings trained on two distinct datasets are stacked, forming a meta-embedding vector that has a more robust and richer representation for any given word. We adapt the RCNN approach of Lai et al. (2015), which incorporates both the embedding vector and context learned from the neural network to form the final user representation before performing classification. We conducted our experiments on the Shen et al. (2017) depression Twitter dataset, the largest ground truth dataset used in this domain.
Using SERCNN, our proposed model achieves a prediction accuracy of 78% when using only ten posts from each user, and the accuracy increases to 90% with an F1-measure of 0.89 when five hundred posts are analyzed.", "keywords": "Social Media;Twitter;NLP;Depression;Mental Health", "primary_area": "", "supplementary_material": "", "author": "Heng Ee Tay;Mei Kuan Lim;Chun Yong Chong", "authorids": "~Heng_Ee_Tay1;lim.meikuan@monash.edu;chong.chunyong@monash.edu", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "d3aries/;;", "or_profile": "~Heng_Ee_Tay1;lim.meikuan@monash.edu;chong.chunyong@monash.edu", "aff": "Monash University Malaysia;;", "aff_domain": "monash.edu;;", "position": "PhD student;;", "bibtex": "@misc{\ntay2022sercnn,\ntitle={{SERCNN}: Stacked Embedding Recurrent Convolutional Neural Network in Depression Detection on Twitter},\nauthor={Heng Ee Tay and Mei Kuan Lim and Chun Yong Chong},\nyear={2022},\nurl={https://openreview.net/forum?id=4sz0AcJ8HUB}\n}", "github": "", "project": "", "reviewers": "4NmL;pXvC;pWBG;XSZ1", "site": "https://openreview.net/forum?id=4sz0AcJ8HUB", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;5;2;4", "correctness": "3;2;4;2", "technical_novelty": "1;1;2;1", "empirical_novelty": "1;2;1;0", "wc_summary_paper": "48;81;27;123", "wc_summary_review": "10;41;2;76", "wc_main_review": "211;339;88;489", "wc_review": "269;461;117;688", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 69.75, 36.27240686803124 ], "wc_summary_review_avg": [ 32.25, 29.157974895386683 ], "wc_main_review_avg": [ 281.75, 148.97545938845096 ], "wc_review_avg": [ 383.75, 213.80993311817858 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:b58Besf0Lg4J:scholar.google.com/&scioq=SERCNN:+Stacked+Embedding+Recurrent+Convolutional+Neural+Network+in+Depression+Detection+on+Twitter&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Monash University", "aff_unique_dep": "", "aff_unique_url": "https://www.monash.edu.my", "aff_unique_abbr": "Monash", "aff_campus_unique_index": "0", "aff_campus_unique": "Malaysia", "aff_country_unique_index": "0", "aff_country_unique": "Malaysia" }, { "id": "4tOrvK-fFOR", "title": "Sound Source Detection from Raw Waveforms with Multi-Scale Synperiodic Filterbanks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Accurately estimating sound sources' temporal location, spatial location and semantic identity label from raw multi-channel sound waveforms is crucial for an agent to understand the 3D environment acoustically. Multiple sounds form a complex waveform mixture in time, frequency and space, so accurately detecting them requires a representation that can achieve high resolutions across all these dimensions.
Existing methods fail to do so because they either extract hand-engineered features (e.g., STFT, LogMel) that require a great deal of parameter tuning work (e.g., filter length, window size), or propose to learn a single filter bank that processes sound waveforms at a single scale, which often leads to a limited time-frequency resolution capability. In this paper, we tackle this issue by proposing to learn a group of parameterized synperiodic filter banks. Each synperiodic filter's length and frequency response are inversely related; hence, each filter is capable of maintaining a better time-frequency resolution trade-off. By varying the periodicity term, we can easily obtain a group of synperiodic filter banks, where each bank differs in its temporal length. Convolution of the proposed filterbanks with the raw waveform helps to achieve multi-scale perception in the time domain. Moreover, applying the synperiodic filter bank to recursively process a downsampled waveform enables us to also achieve multi-scale perception in the frequency domain. Benefiting from multi-scale perception in both the time and frequency domains, our proposed synperiodic filter bank groups learn a data-dependent time-frequency resolution map. Following the learnable synperiodic filter bank group front-end, we add a Transformer-like backbone with two parallel soft-stitched branches to learn the semantic identity label and spatial location representations semi-independently. Experiments on both the direction of arrival estimation task and the physical location estimation task show that our framework outperforms existing methods by a large margin. Replacing existing methods' front-ends with the synperiodic filter bank also helps to improve their performance.", "keywords": "speech processing;object detection;deep neural network;sound object;detection and localization;filter bank design", "primary_area": "", "supplementary_material": "/attachment/1a934b65c9159a79497a50fa0bc7652bfcc4ffe6.zip", "author": "Yuhang He", "authorids": "~Yuhang_He3", "gender": "M", "homepage": "https://yuhanghe01.github.io/", "dblp": "", "google_scholar": "H1p3ve8AAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Yuhang_He3", "aff": "University of Oxford", "aff_domain": "ox.ac.uk", "position": "PhD student", "bibtex": "@misc{\nhe2022sound,\ntitle={Sound Source Detection from Raw Waveforms with Multi-Scale Synperiodic Filterbanks},\nauthor={Yuhang He},\nyear={2022},\nurl={https://openreview.net/forum?id=4tOrvK-fFOR}\n}", "github": "", "project": "", "reviewers": "TyRd;dYuW;DURm;sfZq", "site": "https://openreview.net/forum?id=4tOrvK-fFOR", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;5;2;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "134;77;42;75", "wc_summary_review": "37;112;65;37", "wc_main_review": "653;713;81;208", "wc_review": "824;902;188;320", "wc_reply_reviewers": "152;0;0;0", "wc_reply_authors": "1467;705;249;602", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 82.0, 33.08322837934654 ], "wc_summary_review_avg": [ 62.75, 30.646166155002163 ], "wc_main_review_avg": [ 413.75, 273.79132108231624 ], "wc_review_avg": [ 558.5, 309.2874876227618 ], "wc_reply_reviewers_avg": [ 38.0, 65.81793068761733 ],
"wc_reply_authors_avg": [ 755.75, 444.096484449044 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.2581988897471611, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6345460990222129898&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "4x50D2_CMVA", "title": "Automatic Tuning of Federated Learning Hyper-Parameters from System Perspective", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "\nFederated Learning (FL) is a distributed model training paradigm that preserves clients' data privacy. \nFL hyper-parameters significantly affect the training overheads in terms of time, computation, and communication.\nHowever, the current practice of manually selecting FL hyper-parameters puts a high burden on FL practitioners since various applications prefer different training preferences. In this paper, we propose FedTuning, an automatic FL hyper-parameter tuning algorithm tailored to applications' diverse system requirements of FL training. FedTuning is lightweight and flexible, achieving an average of 41% improvement for different training preferences on time, computation, and communication compared to fixed FL hyper-parameters. ", "keywords": "federated learning;training preference;system perspective;hyper-parameter tuning", "primary_area": "", "supplementary_material": "", "author": "Huanle Zhang;Mi Zhang;Xin Liu;Prasant Mohapatra;Michael Delucia", "authorids": "~Huanle_Zhang1;~Mi_Zhang1;~Xin_Liu6;~Prasant_Mohapatra1;michael.j.delucia2.civ@mail.mil", "gender": "M;M;F;M;", "homepage": "https://datasystech.github.io/;https://mi-zhang.github.io/;https://xinliu.engineering.ucdavis.edu/;https://faculty.engineering.ucdavis.edu/mohapatra/;", "dblp": ";84/2519-2.html;76/1820-2;m/Prasant_Mohapatra2.html;", "google_scholar": "Xm4NYnsAAAAJ;https://scholar.google.com.tw/citations?user=r3A90uAAAAAJ;4MV5BkQAAAAJ;;", "orcid": ";;;0000-0002-2768-5308;", "linkedin": ";mizhang/;;;", "or_profile": "~Huanle_Zhang1;~Mi_Zhang1;~Xin_Liu6;~Prasant_Mohapatra1;michael.j.delucia2.civ@mail.mil", "aff": "University of California, Davis;Michigan State University;University of California, Davis;University of California, Davis;", "aff_domain": "ucdavis.edu;msu.edu;ucdavis.edu;ucdavis.edu;", "position": "Postdoc;Associate Professor;Full Professor;Full Professor;", "bibtex": "@misc{\nzhang2022automatic,\ntitle={Automatic Tuning of Federated Learning Hyper-Parameters from System Perspective},\nauthor={Huanle Zhang and Mi Zhang and Xin Liu and Prasant Mohapatra and Michael Delucia},\nyear={2022},\nurl={https://openreview.net/forum?id=4x50D2_CMVA}\n}", "github": "", "project": "", "reviewers": "i8oS;8REv;yZL2;ChAj;nsED", "site": "https://openreview.net/forum?id=4x50D2_CMVA", "pdf_size": 0, "recommendation": "3;3;3;3;6", "confidence": "4;4;4;3;4", "correctness": "3;2;2;3;3", "technical_novelty": "1;2;2;2;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "70;23;85;35;87", "wc_summary_review": "25;88;54;40;49", "wc_main_review": "412;484;651;178;931", "wc_review": "507;595;790;253;1067", "wc_reply_reviewers": "31;63;220;0;97", "wc_reply_authors": 
"382;772;523;456;350", "reply_reviewers": "1;2;1;0;1", "reply_authors": "1;1;2;1;2", "recommendation_avg": [ 3.6, 1.2 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 60.0, 26.260236099471765 ], "wc_summary_review_avg": [ 51.2, 20.87486526902629 ], "wc_main_review_avg": [ 531.2, 251.14888014880734 ], "wc_review_avg": [ 642.4, 273.5994152040534 ], "wc_reply_reviewers_avg": [ 82.2, 76.09835740671411 ], "wc_reply_authors_avg": [ 496.6, 150.2099863524393 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.25000000000000006, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3822861498243381206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of California, Davis;Michigan State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucdavis.edu;https://www.msu.edu", "aff_unique_abbr": "UC Davis;MSU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Davis;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sqrt(d) Dimension Dependence of Langevin Monte Carlo", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6903", "id": "5-2mX9_U5i", "poster": "", "openreview": "https://openreview.net/forum?id=5-2mX9_U5i", "slides": "https://iclr.cc/virtual/2022/poster/6903", "video": "https://iclr.cc/virtual/2022/poster/6903", "author_site": "Ruilin Li, Hongyuan Zha, Molei Tao", "tldr": "", "abstract": "This article considers the popular MCMC method of unadjusted Langevin Monte Carlo (LMC) and provides a non-asymptotic analysis of its sampling error in 2-Wasserstein distance. The proof is based on a refinement of mean-square analysis in Li et al. (2019), and this refined framework automates the analysis of a large class of sampling algorithms based on discretizations of contractive SDEs. Using this framework, we establish an $\\tilde{O}(\\sqrt{d}/\\epsilon)$ mixing time bound for LMC, without warm start, under the common log-smooth and log-strongly-convex conditions, plus a growth condition on the 3rd-order derivative of the potential of target measures. This bound improves the best previously known $\\tilde{O}(d/\\epsilon)$ result and is optimal (in terms of order) in both dimension $d$ and accuracy tolerance $\\epsilon$ for target measures satisfying the aforementioned assumptions. 
Our theoretical analysis is further validated by numerical experiments.", "keywords": "unadjusted Langevin algorithm / Langevin Monte Carlo;non-asymptotic sampling error in Wasserstein-2 distance;optimal dimension dependence;mean square analysis", "primary_area": "", "supplementary_material": "/attachment/521f7ed39d0e85dd0af2fa732a35179f57a82703.zip", "author": "Ruilin Li;Hongyuan Zha;Molei Tao", "authorids": "~Ruilin_Li1;~Hongyuan_Zha1;~Molei_Tao1", "gender": "M;;", "homepage": ";;http://people.math.gatech.edu/~mtao8/", "dblp": ";z/HongyuanZha;56/9263", "google_scholar": "lLjVU_cAAAAJ;n1DQMIsAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ruilin_Li1;~Hongyuan_Zha1;~Molei_Tao1", "aff": ";The Chinese University of Hong Kong, Shenzhen;Georgia Institute of Technology", "aff_domain": ";cuhk.edu.cn;gatech.edu", "position": ";Full Professor;Associate Professor", "bibtex": "@inproceedings{\nli2022sqrtd,\ntitle={Sqrt(d) Dimension Dependence of Langevin Monte Carlo},\nauthor={Ruilin Li and Hongyuan Zha and Molei Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5-2mX9_U5i}\n}", "github": "", "project": "", "reviewers": "XyDg;LMTg;WTWj;uCPh", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "24;60;157;48", "wc_summary_review": "13;31;92;26", "wc_main_review": "276;102;784;43", "wc_review": "313;193;1033;117", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "154;35;504;50", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 72.25, 50.61805507919087 ], "wc_summary_review_avg": [ 40.5, 30.450779957170226 ], "wc_main_review_avg": [ 301.25, 291.58135657136927 ], "wc_review_avg": [ 414.0, 364.1469483601366 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 185.75, 189.3705032469418 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4310330770060386860&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=5-2mX9_U5i", "email": ";cuhk.edu.cn;gatech.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Chinese University of Hong Kong;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.gatech.edu", "aff_unique_abbr": "CUHK;Georgia Tech", "aff_campus_unique_index": "0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Acceleration of Federated Learning with Alleviated Forgetting in Local Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6422", "id": "541PxiEKN3F", "poster": "", "openreview": "https://openreview.net/forum?id=541PxiEKN3F", "slides": "https://iclr.cc/virtual/2022/poster/6422", "video": "https://iclr.cc/virtual/2022/poster/6422", "author_site": "Chencheng Xu, Zhiwei Hong, Minlie Huang, Tao Jiang", "tldr": "", "abstract": 
"Federated learning (FL) enables distributed optimization of machine learning models while protecting privacy by independently training local models on each client and then aggregating parameters on a central server, thereby producing an effective global model. Although a variety of FL algorithms have been proposed, their training efficiency remains low when the data are not independently and identically distributed (non-i.i.d.) across different clients. We observe that the slow convergence rates of the existing methods are (at least partially) caused by the catastrophic forgetting issue during the local training stage on each individual client, which leads to a large increase in the loss function concerning the previous training data provided at other clients. Here, we propose FedReg, an algorithm to accelerate FL with alleviated knowledge forgetting in the local training stage by regularizing locally trained parameters with the loss on generated pseudo data, which encode the knowledge of previous training data learned by the global model. Our comprehensive experiments demonstrate that FedReg not only significantly improves the convergence rate of FL, especially when the neural network architecture is deep and the clients' data are extremely non-i.i.d., but is also able to protect privacy better in classification problems and more robust against gradient inversion attacks.", "keywords": "Federated learning;non-i.i.d. data", "primary_area": "", "supplementary_material": "/attachment/178dd540d239e2ae7938e477519234f589258051.zip", "author": "Chencheng Xu;Zhiwei Hong;Minlie Huang;Tao Jiang", "authorids": "~Chencheng_Xu1;~Zhiwei_Hong1;~Minlie_Huang1;~Tao_Jiang2", "gender": "F;M;M;", "homepage": ";;http://coai.cs.tsinghua.edu.cn/hml;", "dblp": "261/4841.html;266/4877.html;;", "google_scholar": "gthKG_8AAAAJ;uPBztC0AAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": ";;;", "linkedin": ";%E5%BF%97%E4%BC%9F-%E6%B4%AA-33222a154/;;", "or_profile": "~Chencheng_Xu1;~Zhiwei_Hong1;~Minlie_Huang1;~Tao_Jiang2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;University of California-Riverside", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;", "position": "PhD student;PhD student;Full Professor;", "bibtex": "@inproceedings{\nxu2022acceleration,\ntitle={Acceleration of Federated Learning with Alleviated Forgetting in Local Training},\nauthor={Chencheng Xu and Zhiwei Hong and Minlie Huang and Tao Jiang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=541PxiEKN3F}\n}", "github": "", "project": "", "reviewers": "6gnX;YGd2;9rQY;UTZk", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "2;4;4;3", "correctness": "3;3;2;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "64;174;26;78", "wc_summary_review": "14;37;27;37", "wc_main_review": "461;452;143;106", "wc_review": "539;663;196;221", "wc_reply_reviewers": "590;189;59;70", "wc_reply_authors": "1758;1106;496;355", "reply_reviewers": "3;1;1;1", "reply_authors": "5;2;2;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 85.5, 54.52293095569973 ], "wc_summary_review_avg": [ 28.75, 9.443913383762052 ], "wc_main_review_avg": [ 290.5, 166.54503895343146 ], "wc_review_avg": [ 
404.75, 201.2813640156485 ], "wc_reply_reviewers_avg": [ 227.0, 215.68843269864985 ], "wc_reply_authors_avg": [ 928.75, 555.7730539527803 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": 0.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=637540214191418314&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=541PxiEKN3F", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Tsinghua University;University of California, Riverside", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucr.edu", "aff_unique_abbr": "THU;UCR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Riverside", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Continuous-Time Meta-Learning with Forward Mode Differentiation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6055", "id": "57PipS27Km", "poster": "", "openreview": "https://openreview.net/forum?id=57PipS27Km", "slides": "https://iclr.cc/virtual/2022/poster/6055", "video": "https://iclr.cc/virtual/2022/poster/6055", "author_site": "Tristan Deleu, David Kanaa, Leo Feng, Giancarlo Kerg, Yoshua Bengio, Guillaume Lajoie, Pierre-Luc Bacon", "tldr": "", "abstract": "Drawing inspiration from gradient-based meta-learning methods with infinitely small gradient steps, we introduce Continuous-Time Meta-Learning (COMLN), a meta-learning algorithm where adaptation follows the dynamics of a gradient vector field. Specifically, representations of the inputs are meta-learned such that a task-specific linear classifier is obtained as a solution of an ordinary differential equation (ODE). Treating the learning process as an ODE offers the notable advantage that the length of the trajectory is now continuous, as opposed to a fixed and discrete number of gradient steps. As a consequence, we can optimize the amount of adaptation necessary to solve a new task using stochastic gradient descent, in addition to learning the initial conditions as is standard practice in gradient-based meta-learning. Importantly, in order to compute the exact meta-gradients required for the outer-loop updates, we devise an efficient algorithm based on forward mode differentiation, whose memory requirements do not scale with the length of the learning trajectory, thus allowing longer adaptation in constant memory. 
We provide analytical guarantees for the stability of COMLN, we show empirically its efficiency in terms of runtime and memory usage, and we illustrate its effectiveness on a range of few-shot image classification problems.", "keywords": "meta-learning;few-shot learning;dynamical systems", "primary_area": "", "supplementary_material": "/attachment/85675731a0dbac5f83cca025b5b88c23416c44e3.zip", "author": "Tristan Deleu;David Kanaa;Leo Feng;Giancarlo Kerg;Yoshua Bengio;Guillaume Lajoie;Pierre-Luc Bacon", "authorids": "~Tristan_Deleu1;~David_Kanaa1;~Leo_Feng1;~Giancarlo_Kerg1;~Yoshua_Bengio1;~Guillaume_Lajoie1;~Pierre-Luc_Bacon1", "gender": ";;M;M;M;M;", "homepage": "https://tristandeleu.github.io/;;https://leofeng-ca.github.io/;;http://yoshuabengio.org;https://dms.umontreal.ca/~lajoie/;", "dblp": "192/1896;;255/9367;;56/953;31/10384;", "google_scholar": "nLNwh-wAAAAJ;;WsRunnEAAAAJ;;kukA0LcAAAAJ;;", "orcid": ";;;;;;", "linkedin": ";;leo-feng/;;yoshuabengio/?originalSubdomain=ca;;", "or_profile": "~Tristan_Deleu1;~David_Kanaa1;~Leo_Feng1;~Giancarlo_Kerg1;~Yoshua_Bengio1;~Guillaume_Lajoie1;~Pierre-Luc_Bacon1", "aff": "University of Montreal;;Mila - Quebec Artificial Intelligence Institute;University of Montreal;University of Montreal;Mila - Quebec Artificial Intelligence Institute;", "aff_domain": "umontreal.ca;;mila.quebec;umontreal.ca;umontreal.ca;mila.quebec;", "position": "PhD student;;PhD student;PhD student;Full Professor;Associate Professor;", "bibtex": "@inproceedings{\ndeleu2022continuoustime,\ntitle={Continuous-Time Meta-Learning with Forward Mode Differentiation},\nauthor={Tristan Deleu and David Kanaa and Leo Feng and Giancarlo Kerg and Yoshua Bengio and Guillaume Lajoie and Pierre-Luc Bacon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=57PipS27Km}\n}", "github": "", "project": "", "reviewers": "nRbR;Bwjf;Us8H;PyBo", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "161;143;56;100", "wc_summary_review": "56;54;13;55", "wc_main_review": "737;399;155;339", "wc_review": "954;596;224;494", "wc_reply_reviewers": "0;47;26;0", "wc_reply_authors": "1720;1002;168;1198", "reply_reviewers": "0;1;1;0", "reply_authors": "3;3;1;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 115.0, 40.6386515524322 ], "wc_summary_review_avg": [ 44.5, 18.200274723201296 ], "wc_main_review_avg": [ 407.5, 210.41090751194434 ], "wc_review_avg": [ 567.0, 261.5282011562042 ], "wc_reply_reviewers_avg": [ 18.25, 19.702474463883973 ], "wc_reply_authors_avg": [ 1022.0, 558.5463275324618 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2167930181404098181&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=57PipS27Km", "email": "umontreal.ca;;mila.quebec;umontreal.ca;umontreal.ca;mila.quebec;", "author_num": 7, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "University of 
Montreal;Quebec Artificial Intelligence Institute", "aff_unique_dep": ";Artificial Intelligence", "aff_unique_url": "https://www.umontreal.ca;https://mila.quebec", "aff_unique_abbr": "UM;Mila", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "id": "57T1ctyxtP", "title": "Structured Stochastic Gradient MCMC", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stochastic gradient Markov Chain Monte Carlo (SGMCMC) is considered the gold standard for Bayesian inference in large-scale models, such as Bayesian neural networks. Since practitioners face speed versus accuracy tradeoffs in these models, variational inference (VI) is often the preferable option. Unfortunately, VI makes strong assumptions on both the factorization and functional form of the posterior. In this work, we propose a new non-parametric variational approximation that makes no assumptions about the approximate posterior's functional form and allows practitioners to specify the exact dependencies the algorithm should respect or break. The approach relies on a new Langevin-type algorithm that operates on a modified energy function, where parts of the latent variables are averaged over samples from earlier iterations of the Markov chain. This way, statistical dependencies can be broken in a controlled way, allowing the chain to mix faster. This scheme can be further modified in a \"dropout\" manner, leading to even more scalability. By implementing the scheme on a ResNet-20 architecture, we obtain better predictive likelihoods and faster mixing time than full SGMCMC.", "keywords": "Approximate MCMC;Langevin Dynamics;Stochastic Gradient MCMC", "primary_area": "", "supplementary_material": "/attachment/837aeeb1cdc76daf1335d160193b96c6397486a9.zip", "author": "Antonios Alexos;Alex James Boyd;Stephan Mandt", "authorids": "~Antonios_Alexos1;~Alex_James_Boyd1;~Stephan_Mandt1", "gender": "M;M;M", "homepage": "https://antonyalexos.github.io;;https://www.stephanmandt.com", "dblp": "267/9750;https://dblp.uni-trier.de/pers/hd/b/Boyd:Alex;147/5018", "google_scholar": "G33kETkAAAAJ;;HOrGe7wAAAAJ", "orcid": ";;", "linkedin": "antonios-alexos-861446122/;ajboyd2;stephan-mandt-8702795a/", "or_profile": "~Antonios_Alexos1;~Alex_James_Boyd1;~Stephan_M_Mandt1", "aff": "University of California, Irvine;University of California, Irvine;University of California, Irvine", "aff_domain": "uci.edu;uci.edu;uci.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nalexos2022structured,\ntitle={Structured Stochastic Gradient {MCMC}},\nauthor={Antonios Alexos and Alex James Boyd and Stephan Mandt},\nyear={2022},\nurl={https://openreview.net/forum?id=57T1ctyxtP}\n}", "github": "", "project": "", "reviewers": "zQoE;AW5Q;NCgf;qXUG", "site": "https://openreview.net/forum?id=57T1ctyxtP", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;4;3;5", "correctness": "1;3;2;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "24;133;61;149", "wc_summary_review": "25;125;101;22", "wc_main_review": "102;415;508;238", "wc_review": "151;673;670;409", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75,
0.4330127018922193 ], "wc_summary_paper_avg": [ 91.75, 51.27072751580574 ], "wc_summary_review_avg": [ 68.25, 45.55971356362988 ], "wc_main_review_avg": [ 315.75, 156.9591905560168 ], "wc_review_avg": [ 475.75, 215.96223628217967 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5940885257860046, "corr_recommendation_correctness": 0.9393364366277244, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8097612641869986343&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Irvine", "aff_unique_dep": "", "aff_unique_url": "https://www.uci.edu", "aff_unique_abbr": "UCI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "5ALGcXpmFyC", "title": "Training Data Size Induced Double Descent For Denoising Neural Networks and the Role of Training Noise Level", "track": "main", "status": "Reject", "tldr": "", "abstract": "When training a denoising neural network, we show that more data is not always beneficial. In fact, the generalization error versus the number of training data points follows a double descent curve.\nTraining a network to denoise noisy inputs is the most widely used technique for pre-training deep neural networks. Hence one important question is the effect of scaling the number of training data points. We formalize the question of how many data points should be used by looking at the generalization error for denoising noisy test data. Prior work on computing the generalization error focuses on adding noise to target outputs. However, adding noise to the input is more in line with current pre-training practices. In the linear (in the inputs) regime, we provide an asymptotically exact formula for the generalization error for rank 1 data and an approximation for the generalization error for rank r data. We show, using our formulas, that the generalization error versus number of data points follows a double descent curve.
From this, we derive a formula for the amount of noise that needs to be added to the training data to minimize the denoising error and see that this follows a double descent curve as well.", "keywords": "Double Descent;Denoising Neural Networks;High Dimensional Statistics.", "primary_area": "", "supplementary_material": "/attachment/6c0b8bb32d2d554412bdf533fe67d788029cb48e.zip", "author": "Rishi Sonthalia;Raj Rao Nadakuditi", "authorids": "~Rishi_Sonthalia1;rajnrao@umich.edu", "gender": "M;", "homepage": "https://sites.google.com/umich.edu/rsonthal/home;", "dblp": "223/5758;", "google_scholar": "HYozgRsAAAAJ;", "orcid": ";", "linkedin": "rishi-sonthalia-53b44795/;", "or_profile": "~Rishi_Sonthalia1;rajnrao@umich.edu", "aff": "University of California, Los Angeles;", "aff_domain": "ucla.edu;", "position": "Postdoc;", "bibtex": "@misc{\nsonthalia2022training,\ntitle={Training Data Size Induced Double Descent For Denoising Neural Networks and the Role of Training Noise Level},\nauthor={Rishi Sonthalia and Raj Rao Nadakuditi},\nyear={2022},\nurl={https://openreview.net/forum?id=5ALGcXpmFyC}\n}", "github": "", "project": "", "reviewers": "yBGu;f7hC;dPcd;5V96", "site": "https://openreview.net/forum?id=5ALGcXpmFyC", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;2;3;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;4", "empirical_novelty": "3;3;4;2", "wc_summary_paper": "91;83;138;60", "wc_summary_review": "45;57;20;175", "wc_main_review": "382;245;248;493", "wc_review": "518;385;406;728", "wc_reply_reviewers": "196;0;10;53", "wc_reply_authors": "716;80;285;336", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 93.0, 28.36370920736567 ], "wc_summary_review_avg": [ 74.25, 59.67987516742976 ], "wc_main_review_avg": [ 342.0, 103.25453985176632 ], "wc_review_avg": [ 509.25, 136.0392866050098 ], "wc_reply_reviewers_avg": [ 64.75, 78.34977664294902 ], "wc_reply_authors_avg": [ 354.25, 229.7850898121982 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.28867513459481287, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0hir1DFcRsEJ:scholar.google.com/&scioq=Training+Data+Size+Induced+Double+Descent+For+Denoising+Neural+Networks+and+the+Role+of+Training+Noise+Level&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "5Bw_CZer00j", "title": "Self-supervised Discovery of Human Actons from Long Kinematic Videos", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "For human action understanding, a popular research direction is to analyze short video clips with unambiguous semantic content, such as jumping and drinking.
However, methods for understanding short semantic actions cannot be directly translated to long kinematic sequences such as dancing, where it becomes challenging even to semantically label the human movements. To promote analysis of long videos of complex human motions, we propose a self-supervised method for learning a representation of such motion sequences that is similar to words in a sentence, where videos are segmented and clustered into recurring temporal patterns, called actons. Our approach first obtains a frame-wise representation by contrasting two augmented views of video frames conditioned on their temporal context. The frame-wise representations across a collection of videos are then clustered by K-means. Actons are then automatically extracted by forming a continuous motion sequence from frames within the same cluster. We evaluate the self-supervised representation by temporal alignment metrics, and the clustering results by normalized mutual information and language entropy. We also study an application of this tokenization by using it to classify dance genres. On AIST++ and PKU-MMD datasets, actons are shown to bring significant performance improvements compared to several baselines.", "keywords": "Self-supervised Learning;Video Analysis", "primary_area": "", "supplementary_material": "/attachment/8f160d831cf4a5dc7fad5de93a6a498982f04c2b.zip", "author": "Kenneth Li;Xiao Sun;Zhirong Wu;Fangyun Wei;Stephen Lin", "authorids": "~Kenneth_Li1;~Xiao_Sun2;~Zhirong_Wu1;~Fangyun_Wei1;~Stephen_Lin1", "gender": ";M;M;M;", "homepage": "https://likenneth.github.io/;https://jimmysuen.github.io/;https://www.microsoft.com/en-us/research/people/wuzhiron/;;https://www.microsoft.com/en-us/research/people/stevelin/", "dblp": "75/6627-12;151/8845;147/5025;161/2636;55/4755-1.html", "google_scholar": "v0GItgwAAAAJ;wYIe0tYAAAAJ;lH4zgcIAAAAJ;-ncz2s8AAAAJ;c3PYmxUAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Kenneth_Li1;~Xiao_Sun2;~Zhirong_Wu1;~Fangyun_Wei1;~Stephen_Lin1", "aff": "Harvard University;Microsoft;Microsoft Research;Microsoft Research;Microsoft Research", "aff_domain": "harvard.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com", "position": "PhD student;Researcher;Researcher;Researcher;Researcher", "bibtex": "@misc{\nli2022selfsupervised,\ntitle={Self-supervised Discovery of Human Actons from Long Kinematic Videos},\nauthor={Kenneth Li and Xiao Sun and Zhirong Wu and Fangyun Wei and Stephen Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=5Bw_CZer00j}\n}", "github": "", "project": "", "reviewers": "y2rp;xpKv;JGwk", "site": "https://openreview.net/forum?id=5Bw_CZer00j", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;4", "correctness": "4;4;3", "technical_novelty": "2;1;2", "empirical_novelty": "0;2;0", "wc_summary_paper": "47;99;46", "wc_summary_review": "30;92;73", "wc_main_review": "73;324;276", "wc_review": "150;515;395", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.9428090415820634 ], "wc_summary_paper_avg": [ 64.0, 24.752104287649296 ], "wc_summary_review_avg": [ 65.0, 25.93581821856921 ], "wc_main_review_avg": [ 224.33333333333334, 108.78827551206467 ], 
"wc_review_avg": [ 353.3333333333333, 151.89543186752596 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zTxnHADW7VQJ:scholar.google.com/&scioq=Self-supervised+Discovery+of+Human+Actons+from+Long+Kinematic+Videos&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Harvard University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.harvard.edu;https://www.microsoft.com", "aff_unique_abbr": "Harvard;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Resonance in Weight Space: Covariate Shift Can Drive Divergence of SGD with Momentum", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7051", "id": "5ECQL05ub0J", "poster": "", "openreview": "https://openreview.net/forum?id=5ECQL05ub0J", "slides": "https://iclr.cc/virtual/2022/poster/7051", "video": "https://iclr.cc/virtual/2022/poster/7051", "author_site": "Kirby Banman, Garnet Liam Peet-Pare, Nidhi Hegde, Alona Fyshe, Martha White", "tldr": "", "abstract": "Most convergence guarantees for stochastic gradient descent with momentum (SGDm) rely on iid sampling. Yet, SGDm is often used outside this regime, in settings with temporally correlated input samples such as continual learning and reinforcement learning. Existing work has shown that SGDm with a decaying step-size can converge under Markovian temporal correlation. In this work, we show that SGDm under covariate shift with a fixed step-size can be unstable and diverge. In particular, we show SGDm under covariate shift is a parametric oscillator, and so can suffer from a phenomenon known as resonance. We approximate the learning system as a time varying system of ordinary differential equations, and leverage existing theory to characterize the system's divergence/convergence as resonant/nonresonant modes. 
The theoretical result is limited to the linear setting with periodic covariate shift, so we empirically supplement this result to show that resonance phenomena persist even under non-periodic covariate shift, nonlinear dynamics with neural networks, and optimizers other than SGDm.", "keywords": "optimization;momentum;stochastic gradient descent;non-iid sampling", "primary_area": "", "supplementary_material": "/attachment/0945c42d80cb1110d6da871961508649b30acbe7.zip", "author": "Kirby Banman;Garnet Liam Peet-Pare;Nidhi Hegde;Alona Fyshe;Martha White", "authorids": "~Kirby_Banman1;~Garnet_Liam_Peet-Pare1;~Nidhi_Hegde1;~Alona_Fyshe1;~Martha_White1", "gender": ";M;F;F;F", "homepage": "https://kdbanman.com;https://lpeetpare.github.io/;https://sites.ualberta.ca/~nidhih/;http://webdocs.cs.ualberta.ca/~alona/;http://marthawhite.ca", "dblp": ";;15/6158;30/3660;60/7057", "google_scholar": "https://scholar.google.com/citations?hl=en;;RyhZW8EAAAAJ;https://scholar.google.ca/citations?user=Vw8z7qwAAAAJ;t5zdD_IAAAAJ", "orcid": ";;0000-0001-7385-3416;0000-0003-4367-0306;0000-0002-5356-2950", "linkedin": ";liam-peet-pare-498a721ab/?originalSubdomain=ca;nidhihegde/;;", "or_profile": "~Kirby_Banman1;~Garnet_Liam_Peet-Pare1;~Nidhi_Hegde1;~Alona_Fyshe1;~Martha_White1", "aff": ";;University of Alberta;University of Alberta;University of Alberta", "aff_domain": ";;ualberta.ca;ualberta.ca;ualberta.ca", "position": ";;Associate Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nbanman2022resonance,\ntitle={Resonance in Weight Space: Covariate Shift Can Drive Divergence of {SGD} with Momentum},\nauthor={Kirby Banman and Garnet Liam Peet-Pare and Nidhi Hegde and Alona Fyshe and Martha White},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5ECQL05ub0J}\n}", "github": "", "project": "", "reviewers": "6uKo;Vbeb;Vijg;Z17i", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "3;4;4;2", "correctness": "3;4;4;4", "technical_novelty": "2;2;4;4", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "171;137;108;110", "wc_summary_review": "68;50;14;30", "wc_main_review": "759;384;455;179", "wc_review": "998;571;577;319", "wc_reply_reviewers": "500;0;0;18", "wc_reply_authors": "2215;742;531;266", "reply_reviewers": "1;0;0;1", "reply_authors": "3;1;1;2", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 1.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 131.5, 25.51960031034969 ], "wc_summary_review_avg": [ 40.5, 20.3654118544163 ], "wc_main_review_avg": [ 444.25, 208.0689489087692 ], "wc_review_avg": [ 616.25, 243.7615381884517 ], "wc_reply_reviewers_avg": [ 129.5, 214.03445984233474 ], "wc_reply_authors_avg": [ 938.5, 756.0385241507207 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.03683547343418787, "corr_recommendation_correctness": 0.9169493006161777, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HXoFm2fztRkJ:scholar.google.com/&scioq=Resonance+in+Weight+Space:+Covariate+Shift+Can+Drive+Divergence+of+SGD+with+Momentum&hl=en&as_sdt=0,5", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=5ECQL05ub0J", "email": ";;ualberta.ca;ualberta.ca;ualberta.ca", "author_num": 5, 
"aff_unique_index": "0;0;0", "aff_unique_norm": "University of Alberta", "aff_unique_dep": "", "aff_unique_url": "https://www.ualberta.ca", "aff_unique_abbr": "UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Understanding Latent Correlation-Based Multiview Learning and Self-Supervision: An Identifiability Perspective", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6061", "id": "5FUq05QRc5b", "poster": "", "openreview": "https://openreview.net/forum?id=5FUq05QRc5b", "slides": "https://iclr.cc/virtual/2022/poster/6061", "video": "https://iclr.cc/virtual/2022/poster/6061", "author_site": "Qi Lyu, Xiao Fu, Weiran Wang, Songtao Lu", "tldr": "", "abstract": "Multiple views of data, both naturally acquired (e.g., image and audio) and artificially produced (e.g., via adding different noise to data samples), have proven useful in enhancing representation learning. Natural views are often handled by multiview analysis tools, e.g., (deep) canonical correlation analysis [(D)CCA], while the artificial ones are frequently used in self-supervised learning (SSL) paradigms, e.g., BYOL and Barlow Twins. Both types of approaches often involve learning neural feature extractors such that the embeddings of data exhibit high cross-view correlations. Although intuitive, the effectiveness of correlation-based neural embedding is mostly empirically validated. \nThis work aims to understand latent correlation maximization-based deep multiview learning from a latent component identification viewpoint. An intuitive generative model of multiview data is adopted, where the views are different nonlinear mixtures of shared and private components. Since the shared components are view/distortion-invariant, representing the data using such components is believed to reveal the identity of the samples effectively and robustly. Under this model, latent correlation maximization is shown to guarantee the extraction of the shared components across views (up to certain ambiguities). In addition, it is further shown that the private information in each view can be provably disentangled from the shared using proper regularization design. A finite sample analysis, which has been rare in nonlinear mixture identifiability study, is also presented. The theoretical results and newly designed regularization are tested on a series of tasks. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/2dcad8168bd71de571a245b23b3e5d651bcffb8b.zip", "author": "Qi Lyu;Xiao Fu;Weiran Wang;Songtao Lu", "authorids": "~Qi_Lyu2;~Xiao_Fu1;~Weiran_Wang1;~Songtao_Lu1", "gender": "M;M;M;M", "homepage": "http://web.engr.oregonstate.edu/~lyuqi/;https://web.engr.oregonstate.edu/~fuxia/;https://sites.google.com/corp/ttic.edu/weiranwang/home;https://songtaogithub.github.io/", "dblp": "165/3049;60/4601-1;;05/2887", "google_scholar": ";pDnpH1MAAAAJ;O9djN1AAAAAJ;LRsjX7kAAAAJ", "orcid": ";;;", "linkedin": ";;weiran-wang-12ab8b16b/;", "or_profile": "~Qi_Lyu2;~Xiao_Fu1;~Weiran_Wang1;~Songtao_Lu1", "aff": "Oregon State University;Oregon State University;Google;IBM Thomas J. 
Watson Research Center", "aff_domain": "oregonstate.edu;oregonstate.edu;google.com;ibm.com", "position": "PhD student;Assistant Professor;Researcher;Research Scientist", "bibtex": "@inproceedings{\nlyu2022understanding,\ntitle={Understanding Latent Correlation-Based Multiview Learning and Self-Supervision: An Identifiability Perspective},\nauthor={Qi Lyu and Xiao Fu and Weiran Wang and Songtao Lu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5FUq05QRc5b}\n}", "github": "", "project": "", "reviewers": "hUwt;u537;zhWW", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "2;2;4", "correctness": "4;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "59;53;181", "wc_summary_review": "44;48;256", "wc_main_review": "121;387;915", "wc_review": "224;488;1352", "wc_reply_reviewers": "0;4;450", "wc_reply_authors": "562;524;3281", "reply_reviewers": "0;1;2", "reply_authors": "1;1;7", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 97.66666666666667, 58.97645481225726 ], "wc_summary_review_avg": [ 116.0, 99.00841715059718 ], "wc_main_review_avg": [ 474.3333333333333, 329.9791239188052 ], "wc_review_avg": [ 688.0, 481.7302149543871 ], "wc_reply_reviewers_avg": [ 151.33333333333334, 211.1955386734188 ], "wc_reply_authors_avg": [ 1455.6666666666667, 1290.7988052192934 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.0, 2.8284271247461903 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11446943291269931498&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=5FUq05QRc5b", "email": "oregonstate.edu;oregonstate.edu;google.com;ibm.com", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Oregon State University;Google;IBM", "aff_unique_dep": ";Google;Research", "aff_unique_url": "https://oregonstate.edu;https://www.google.com;https://www.ibm.com/research", "aff_unique_abbr": "OSU;Google;IBM", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Yorktown Heights", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "switch-GLAT: Multilingual Parallel Machine Translation Via Code-Switch Decoder", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7033", "id": "5HvpvYd68b", "poster": "", "openreview": "https://openreview.net/forum?id=5HvpvYd68b", "slides": "https://iclr.cc/virtual/2022/poster/7033", "video": "https://iclr.cc/virtual/2022/poster/7033", "author_site": "Zhenqiao Song, Hao Zhou, Lihua Qian, Jingjing Xu, Shanbo Cheng, Mingxuan Wang, Lei Li", "tldr": "", "abstract": "Multilingual machine translation aims to develop a single model for multiple language directions. However, existing multilingual models based on Transformer are limited in terms of both translation performance and inference speed. In this paper, we propose switch-GLAT, a non-autoregressive multilingual machine translation model with a code-switch decoder. 
It can generate contextual code-switched translations for a given source sentence, and perform code-switch back-translation, greatly boosting multilingual translation performance. In addition, its inference is highly efficient thanks to its parallel decoder. Experiments show that our proposed switch-GLAT outperforms the multilingual Transformer with as much as a 0.74 BLEU improvement and 6.2x faster decoding speed in inference.\n", "keywords": "multilingual non-autoregressive machine translation;contextualized code-switching;back-translation", "primary_area": "", "supplementary_material": "", "author": "Zhenqiao Song;Hao Zhou;Lihua Qian;Jingjing Xu;Shanbo Cheng;Mingxuan Wang;Lei Li", "authorids": "~Zhenqiao_Song1;~Hao_Zhou5;~Lihua_Qian1;~Jingjing_Xu1;~Shanbo_Cheng1;~Mingxuan_Wang1;~Lei_Li11", "gender": "F;M;;F;M;M;M", "homepage": "https://jocelynsong.github.io/;https://zhouh.github.io/;https://to.be.done;;https://sites.google.com/view/chengshanbo/home;https://mingxuan.github.io/;https://www.cs.cmu.edu/~leili", "dblp": "227/7889;63/778-12;167/5564;25/624;185/5589;43/11214;13/7007-5.html", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;;;CYUBKN0AAAAJ;hOQ6G6EAAAAJ;BYXqAlwAAAAJ", "orcid": ";;;;;;0000-0003-3095-9776", "linkedin": ";;;;;;", "or_profile": "~Zhenqiao_Song1;~Hao_Zhou5;~Lihua_Qian1;~Jingjing_Xu1;~Shanbo_Cheng1;~Mingxuan_Wang1;~Lei_Li11", "aff": "ByteDance AI Lab;Bytedance;ByteDance;;ByteDance Inc.;ByteDance Inc.;Computer Science Department, UC Santa Barbara", "aff_domain": "bytedance.com;bytedance.com;bytedance.com;;bytedance.com;bytedance.com;cs.ucsb.edu", "position": "NLP Researcher;Researcher;Researcher;;Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nsong2022switchglat,\ntitle={switch-{GLAT}: Multilingual Parallel Machine Translation Via Code-Switch Decoder},\nauthor={Zhenqiao Song and Hao Zhou and Lihua Qian and Jingjing Xu and Shanbo Cheng and Mingxuan Wang and Lei Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5HvpvYd68b}\n}", "github": "", "project": "", "reviewers": "np9D;cdKo;ip8L", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "75;109;152", "wc_summary_review": "36;53;57", "wc_main_review": "409;361;397", "wc_review": "520;523;606", "wc_reply_reviewers": "496;106;131", "wc_reply_authors": "2247;652;828", "reply_reviewers": "2;1;1", "reply_authors": "5;2;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 112.0, 31.506613062445584 ], "wc_summary_review_avg": [ 48.666666666666664, 9.104333522498441 ], "wc_main_review_avg": [ 389.0, 20.396078054371138 ], "wc_review_avg": [ 549.6666666666666, 39.852505846210256 ], "wc_reply_reviewers_avg": [ 244.33333333333334, 178.2476429640017 ], "wc_reply_authors_avg": [ 1242.3333333333333, 714.0309672711837 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=5095064953806118031&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=5HvpvYd68b", "email": "bytedance.com;bytedance.com;bytedance.com;;bytedance.com;bytedance.com;cs.ucsb.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "ByteDance;University of California, Santa Barbara", "aff_unique_dep": "AI Lab;Computer Science Department", "aff_unique_url": "https://www.bytedance.com;https://www.ucsb.edu", "aff_unique_abbr": "ByteDance;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Generative Modeling with Optimal Transport Maps", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6183", "id": "5JdLZg346Lw", "poster": "", "openreview": "https://openreview.net/forum?id=5JdLZg346Lw", "slides": "https://iclr.cc/virtual/2022/poster/6183", "video": "https://iclr.cc/virtual/2022/poster/6183", "author_site": "Litu Rout, Alexander Korotin, Evgeny Burnaev", "tldr": "", "abstract": "With the discovery of Wasserstein GANs, Optimal Transport (OT) has become a powerful tool for large-scale generative modeling tasks. In these tasks, OT cost is typically used as the loss for training GANs. In contrast to this approach, we show that the OT map itself can be used as a generative model, providing comparable performance. Previous analogous approaches consider OT maps as generative models only in the latent spaces due to their poor performance in the original high-dimensional ambient space. In contrast, we apply OT maps directly in the ambient space, e.g., a space of high-dimensional images. First, we derive a min-max optimization algorithm to efficiently compute OT maps for the quadratic cost (Wasserstein-2 distance). Next, we extend the approach to the case when the input and output distributions are located in the spaces of different dimensions and derive error bounds for the computed OT map. We evaluate the algorithm on image generation and unpaired image restoration tasks. 
In particular, we consider denoising, colorization, and inpainting, where the optimality of the restoration map is a desired attribute, since the output (restored) image is expected to be close to the input (degraded) one.", "keywords": "Optimal Transport Map;Generative Modeling;Unpaired Image Restoration", "primary_area": "", "supplementary_material": "/attachment/eebc7b2dde2be3ef527556c77b4d711a02cb4ac5.zip", "author": "Litu Rout;Alexander Korotin;Evgeny Burnaev", "authorids": "~Litu_Rout1;~Alexander_Korotin2;~Evgeny_Burnaev1", "gender": "M;M;M", "homepage": "https://liturout.github.io/;http://faculty.skoltech.ru/people/evgenyburnaev;https://akorotin.netlify.app", "dblp": "206/6445;144/7845;209/9906", "google_scholar": "https://scholar.google.co.in/citations?hl=en;https://scholar.google.ru/citations?user=pCRdcOwAAAAJ;https://scholar.google.ru/citations?user=1rIIvjAAAAAJ", "orcid": ";0000-0001-8424-0690;0000-0003-4286-925X", "linkedin": "litu-rout-sac-isro/;;", "or_profile": "~Litu_Rout1;~Evgeny_Burnaev1;~Alexander_Andreevich_Korotin1", "aff": "Indian Space Research Organisation;Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology", "aff_domain": "isro.gov.in;skoltech.ru;skoltech.ru", "position": "Scientist;Associate Professor;PhD student", "bibtex": "@inproceedings{\nrout2022generative,\ntitle={Generative Modeling with Optimal Transport Maps},\nauthor={Litu Rout and Alexander Korotin and Evgeny Burnaev},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5JdLZg346Lw}\n}", "github": "", "project": "", "reviewers": "vEgs;4T3P;qNeK;fFFr", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;5;3;4", "correctness": "2;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "53;115;235;96", "wc_summary_review": "23;66;116;19", "wc_main_review": "212;93;512;212", "wc_review": "288;274;863;327", "wc_reply_reviewers": "309;0;210;0", "wc_reply_authors": "1185;513;815;255", "reply_reviewers": "2;0;1;0", "reply_authors": "3;2;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 124.75, 67.49953703544936 ], "wc_summary_review_avg": [ 56.0, 39.2364626336269 ], "wc_main_review_avg": [ 257.25, 154.89573105802498 ], "wc_review_avg": [ 438.0, 246.1412196280826 ], "wc_reply_reviewers_avg": [ 129.75, 134.3881970263758 ], "wc_reply_authors_avg": [ 692.0, 346.83857916904225 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7494071659521623034&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=5JdLZg346Lw", "email": "isro.gov.in;skoltech.ru;skoltech.ru", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Indian Space Research Organisation;Skolkovo Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.isro.gov.in;https://www.skoltech.ru", "aff_unique_abbr": "ISRO;Skoltech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", 
"aff_country_unique": "India;Russian Federation" }, { "id": "5Jj1qMVtS9W", "title": "Universally rank consistent ordinal regression in neural networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite the pervasiveness of ordinal labels in supervised learning, it remains common practice in deep learning to treat such problems as categorical classification using the categorical cross entropy loss. Recent methods attempting to address this issue while respecting the ordinal structure of the labels have resorted to converting ordinal regression into a series of extended binary classification subtasks. However, the adoption of such methods remains inconsistent due to theoretical and practical limitations. Here we address these limitations by demonstrating that the subtask probabilities form a Markov chain. We show how to straightforwardly modify neural network architectures to exploit this fact and thereby constrain predictions to be universally rank consistent. We furthermore prove that all rank consistent solutions can be represented within this formulation. Using diverse benchmarks and the real-world application of a specialized recurrent neural network for COVID-19 prognosis, we demonstrate the practical superiority of this method versus the current state-of-the-art. The method is open sourced as user-friendly PyTorch and TensorFlow packages.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/d751a36dbd52454303c53b0de6ac10e273f499fb.zip", "author": "Garrett Jenkinson;Kia Khezeli;Gavin Robert Oliver;John Kalantari;Eric W Klee", "authorids": "~Garrett_Jenkinson1;~Kia_Khezeli1;~Gavin_Robert_Oliver1;~John_Kalantari1;~Eric_W_Klee1", "gender": ";;;M;M", "homepage": "https://www.mayo.edu/research/faculty/jenkinson-william-garrett-g-ph-d/bio-20507427;;https://www.mayo.edu/research/faculty/oliver-gavin-r-m-s/bio-20460117;;", "dblp": ";;;;", "google_scholar": "5y22LmQAAAAJ;;L9XLKR0AAAAJ;;", "orcid": "0000-0003-2548-098X;;;;0000-0003-2946-5795", "linkedin": ";kia-khezeli;;jkalantari;", "or_profile": "~Garrett_Jenkinson1;~Kia_Khezeli1;~Gavin_Robert_Oliver1;~John_Kalantari1;~Eric_W_Klee1", "aff": "Mayo Clinic;Mayo Clinic;Mayo Clinic;Mayo Clinic;", "aff_domain": "mayo.edu;mayo.edu;mayo.edu;mayo.edu;", "position": "Data Scientist;ML Research Scientist;Principal Bioinformatician;Assistant Professor;", "bibtex": "@misc{\njenkinson2022universally,\ntitle={Universally rank consistent ordinal regression in neural networks},\nauthor={Garrett Jenkinson and Kia Khezeli and Gavin Robert Oliver and John Kalantari and Eric W Klee},\nyear={2022},\nurl={https://openreview.net/forum?id=5Jj1qMVtS9W}\n}", "github": "", "project": "", "reviewers": "CSBz;Yebt;JZvW;vP2h", "site": "https://openreview.net/forum?id=5Jj1qMVtS9W", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "4;3;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "20;44;64;124", "wc_summary_review": "14;18;46;39", "wc_main_review": "209;658;207;778", "wc_review": "243;720;317;941", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 38.50973902793941 ], "wc_summary_review_avg": [ 29.25, 13.5531361684298 ], 
"wc_main_review_avg": [ 463.0, 258.5062861904909 ], "wc_review_avg": [ 555.25, 287.3189647412784 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.7071067811865475, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6472425174054481373&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Mayo Clinic", "aff_unique_dep": "", "aff_unique_url": "https://www.mayoclinic.org", "aff_unique_abbr": "Mayo Clinic", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Multiset-Equivariant Set Prediction with Approximate Implicit Differentiation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6515", "id": "5K7RRqZEjoS", "poster": "", "openreview": "https://openreview.net/forum?id=5K7RRqZEjoS", "slides": "https://iclr.cc/virtual/2022/poster/6515", "video": "https://iclr.cc/virtual/2022/poster/6515", "author_site": "Yan Zhang, David Zhang, Simon Lacoste-Julien, Gertjan J Burghouts, Cees G Snoek", "tldr": "", "abstract": "Most set prediction models in deep learning use set-equivariant operations, but they actually operate on multisets. We show that set-equivariant functions cannot represent certain functions on multisets, so we introduce the more appropriate notion of multiset-equivariance. We identify that the existing Deep Set Prediction Network (DSPN) can be multiset-equivariant without being hindered by set-equivariance and improve it with approximate implicit differentiation, allowing for better optimization while being faster and saving memory. In a range of toy experiments, we show that the perspective of multiset-equivariance is beneficial and that our changes to DSPN achieve better results in most cases. On CLEVR object property prediction, we substantially improve over the state-of-the-art Slot Attention from 8% to 77% in one of the strictest evaluation metrics because of the benefits made possible by implicit differentiation.", "keywords": "set prediction;permutation equivariance;implicit differentiation", "primary_area": "", "supplementary_material": "/attachment/fa63763b8da525fc375314aa5c3de0fc4248f164.zip", "author": "Yan Zhang;David W Zhang;Simon Lacoste-Julien;Gertjan J. Burghouts;Cees G. M. 
Snoek", "authorids": "~Yan_Zhang1;~David_W_Zhang1;~Simon_Lacoste-Julien1;~Gertjan_J._Burghouts1;~Cees_G._M._Snoek1", "gender": "M;M;M;M;M", "homepage": "https://www.cyanogenoid.com;https://davzha.netlify.app/;http://www.iro.umontreal.ca/~slacoste/;https://gertjanburghouts.github.io/;http://www.ceessnoek.info", "dblp": "04/3348-67;119/0960;94/446.html;84/2061;s/CeesSnoek", "google_scholar": "https://scholar.google.co.uk/citations?user=XtCqbfEAAAAJ;https://scholar.google.nl/citations?user=MG3oLzUAAAAJ;oejm5IUAAAAJ;zN6afwwAAAAJ;https://scholar.google.nl/citations?user=0uKdbscAAAAJ", "orcid": "0000-0003-3470-3663;0000-0002-2137-1738;0000-0001-6485-6180;0000-0001-6265-7276;0000-0001-9092-1556", "linkedin": ";david-zhang-1b86b314a;simon-lacoste-julien-355b9a3;gertjanburghouts/;cgmsnoek/", "or_profile": "~Yan_Zhang1;~David_W_Zhang1;~Simon_Lacoste-Julien1;~Gertjan_J._Burghouts1;~Cees_Snoek1", "aff": "Mila - Quebec Artificial Intelligence Institute;University of Amsterdam;Samsung - SAIT AI Lab, Montreal;TNO;University of Amsterdam", "aff_domain": "mila.quebec;uva.nl;samsung.com;tno.nl;uva.nl", "position": "Industrial Partner;PhD student;VP Lab Director;Researcher;Full Professor", "bibtex": "@inproceedings{\nzhang2022multisetequivariant,\ntitle={Multiset-Equivariant Set Prediction with Approximate Implicit Differentiation},\nauthor={Yan Zhang and David W Zhang and Simon Lacoste-Julien and Gertjan J. Burghouts and Cees G. M. Snoek},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5K7RRqZEjoS}\n}", "github": "", "project": "", "reviewers": "PV3V;KAp5;MvMj;4sjy", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;4;4;4", "correctness": "2;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;4;3;4", "wc_summary_paper": "105;48;76;121", "wc_summary_review": "43;61;40;61", "wc_main_review": "504;297;172;393", "wc_review": "652;406;288;575", "wc_reply_reviewers": "0;17;0;24", "wc_reply_authors": "1405;468;335;342", "reply_reviewers": "0;1;0;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 87.5, 27.932955446926844 ], "wc_summary_review_avg": [ 51.25, 9.807522622966516 ], "wc_main_review_avg": [ 341.5, 122.23849639127602 ], "wc_review_avg": [ 480.25, 142.25746904820147 ], "wc_reply_reviewers_avg": [ 10.25, 10.54454835448157 ], "wc_reply_authors_avg": [ 637.5, 446.26589607542275 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7777777777777777, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=473656227219457535&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=5K7RRqZEjoS", "email": "mila.quebec;uva.nl;samsung.com;tno.nl;uva.nl", "author_num": 5, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Quebec Artificial Intelligence Institute;University of Amsterdam;Samsung;TNO", "aff_unique_dep": "Artificial Intelligence;;SAIT AI Lab;", "aff_unique_url": "https://mila.quebec;https://www.uva.nl;https://www.samsung.com;https://www.tno.nl", "aff_unique_abbr": "Mila;UvA;Samsung;TNO", "aff_campus_unique_index": "1", "aff_campus_unique": 
";Montreal", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "Canada;Netherlands" }, { "title": "Learning Hierarchical Structures with Differentiable Nondeterministic Stacks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6931", "id": "5LXw_QplBiF", "poster": "", "openreview": "https://openreview.net/forum?id=5LXw_QplBiF", "slides": "https://iclr.cc/virtual/2022/poster/6931", "video": "https://iclr.cc/virtual/2022/poster/6931", "author_site": "Brian DuSell, David Chiang", "tldr": "", "abstract": "Learning hierarchical structures in sequential data -- from simple algorithmic patterns to natural language -- in a reliable, generalizable way remains a challenging problem for neural language models. Past work has shown that recurrent neural networks (RNNs) struggle to generalize on held-out algorithmic or syntactic patterns without supervision or some inductive bias. To remedy this, many papers have explored augmenting RNNs with various differentiable stacks, by analogy with finite automata and pushdown automata (PDAs). In this paper, we improve the performance of our recently proposed Nondeterministic Stack RNN (NS-RNN), which uses a differentiable data structure that simulates a nondeterministic PDA, with two important changes. First, the model now assigns unnormalized positive weights instead of probabilities to stack actions, and we provide an analysis of why this improves training. Second, the model can directly observe the state of the underlying PDA. Our model achieves lower cross-entropy than all previous stack RNNs on five context-free language modeling tasks (within 0.05 nats of the information-theoretic lower bound), including a task on which the NS-RNN previously failed to outperform a deterministic stack RNN baseline. 
Finally, we propose a restricted version of the NS-RNN that incrementally processes infinitely long sequences, and we present language modeling results on the Penn Treebank.", "keywords": "RNN;pushdown automata;nondeterminism;formal languages;language modeling", "primary_area": "", "supplementary_material": "/attachment/fbc0f04369e9911a9ace7abc5a8f3be541ece0f8.zip", "author": "Brian DuSell;David Chiang", "authorids": "~Brian_DuSell1;~David_Chiang1", "gender": ";M", "homepage": ";https://nd.edu/~dchiang", "dblp": ";https://dblp.org/pers/hd/c/Chiang_0001:David", "google_scholar": ";dok0514AAAAJ", "orcid": ";0000-0002-0435-4864", "linkedin": ";", "or_profile": "~Brian_DuSell1;~David_Chiang1", "aff": ";University of Notre Dame", "aff_domain": ";nd.edu", "position": ";Associate Professor", "bibtex": "@inproceedings{\ndusell2022learning,\ntitle={Learning Hierarchical Structures with Differentiable Nondeterministic Stacks},\nauthor={Brian DuSell and David Chiang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5LXw_QplBiF}\n}", "github": "", "project": "", "reviewers": "THSf;cb1a;Cbcr;vA7w", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;2;3", "correctness": "3;3;3;4", "technical_novelty": "2;4;4;3", "empirical_novelty": "2;4;4;2", "wc_summary_paper": "68;53;186;318", "wc_summary_review": "21;45;50;67", "wc_main_review": "143;133;150;920", "wc_review": "232;231;386;1305", "wc_reply_reviewers": "40;38;0;25", "wc_reply_authors": "384;251;208;1457", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;2;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 156.25, 106.649835911735 ], "wc_summary_review_avg": [ 45.75, 16.452583383772897 ], "wc_main_review_avg": [ 336.5, 336.93805068587903 ], "wc_review_avg": [ 538.5, 447.01146517734867 ], "wc_reply_reviewers_avg": [ 25.75, 15.943258763502524 ], "wc_reply_authors_avg": [ 575.0, 513.3395562393375 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=881169137976073427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=5LXw_QplBiF", "email": ";nd.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Notre Dame", "aff_unique_dep": "", "aff_unique_url": "https://www.nd.edu", "aff_unique_abbr": "Notre Dame", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "5LYsQ7kkb57", "title": "Dynamically Decoding Source Domain Knowledge For Unseen Domain Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Domain generalization is an important problem that has gained much attention recently. While most existing studies focus on learning domain-invariant feature representations, some researchers explore ensemble learning over multiple experts and demonstrate promising performance. However, in existing multi-expert learning frameworks, the source domain knowledge has not yet been much explored, resulting in sub-optimal performance.
In this paper, we propose to adapt Transformers for the purpose of dynamically decoding source domain knowledge for domain generalization. Specifically, we build one domain-specific local expert per source domain, and one domain-agnostic feature branch as query. All local-domain features are then encoded by Transformer encoders as source domain knowledge in memory. In the Transformer decoders, the domain-agnostic query interacts with this memory in the cross-attention module, where source domains similar to the input contribute more to the attention output. In this way, source domain knowledge is dynamically decoded for inference on the current input from an unseen domain. This mechanism therefore makes the proposed method generalize well to unseen domains. The proposed method is evaluated on three benchmarks in the domain generalization field. The comparison with the state-of-the-art methods shows that the proposed method achieves the best performance, outperforming the others by a clear margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cuicui Kang;Karthik Nandakumar", "authorids": "~Cuicui_Kang3;~Karthik_Nandakumar3", "gender": "F;", "homepage": ";", "dblp": "06/10699;", "google_scholar": "aLO9l5MAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Cuicui_Kang3;~Karthik_Nandakumar3", "aff": "MBZUAI;", "aff_domain": "mbzuai.ac.ae;", "position": "Principal Researcher;", "bibtex": "@misc{\nkang2022dynamically,\ntitle={Dynamically Decoding Source Domain Knowledge For Unseen Domain Generalization},\nauthor={Cuicui Kang and Karthik Nandakumar},\nyear={2022},\nurl={https://openreview.net/forum?id=5LYsQ7kkb57}\n}", "github": "", "project": "", "reviewers": "4rug;mhFy;CJcE;K47e", "site": "https://openreview.net/forum?id=5LYsQ7kkb57", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;2;3", "correctness": "4;4;3;2", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "86;69;87;83", "wc_summary_review": "37;40;46;39", "wc_main_review": "315;125;123;388", "wc_review": "438;234;256;510", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.25, 7.224091638399945 ], "wc_summary_review_avg": [ 40.5, 3.3541019662496847 ], "wc_main_review_avg": [ 237.75, 116.64342030307581 ], "wc_review_avg": [ 359.5, 117.55317945508747 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8944271909999159, "corr_recommendation_correctness": -0.9045340337332909, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3240109506488828052&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": "", "aff_unique_url": "https://www.mbzuai.ac.ae", "aff_unique_abbr": "MBZUAI", "aff_country_unique_index": "0", "aff_country_unique": "United Arab Emirates" }, { "title": "Adaptive Wavelet Transformer Network for 3D Shape Representation Learning", "status": "Poster", "track": "main",
"site": "https://iclr.cc/virtual/2022/poster/6423", "id": "5MLb3cLCJY", "poster": "", "openreview": "https://openreview.net/forum?id=5MLb3cLCJY", "slides": "https://iclr.cc/virtual/2022/poster/6423", "video": "https://iclr.cc/virtual/2022/poster/6423", "author_site": "Hao Huang, Yi Fang", "tldr": "", "abstract": "We present a novel method for 3D shape representation learning using multi-scale wavelet decomposition. Previous works often decompose 3D shapes into complementary components in spatial domain at a single scale. In this work, we study to decompose 3D shapes into sub-bands components in frequency domain at multiple scales, resulting in a hierarchical decomposition tree in a principled manner rooted in multi-resolution wavelet analysis. Specifically, we propose Adaptive Wavelet Transformer Network (AWT-Net) that firstly generates approximation or detail wavelet coefficients per point, classifying each point into high or low sub-bands components, using lifting scheme at multiple scales recursively and hierarchically. Then, AWT-Net exploits Transformer to enhance the original shape features by querying and fusing features from different but integrated sub-bands. The wavelet coefficients can be learned without direct supervision on coefficients, and AWT-Net is fully differentiable and can be learned in an end-to-end fashion. Extensive experiments demonstrate that AWT-Net achieves competitive performance on 3D shape classification and segmentation benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Huang;Yi Fang", "authorids": "~Hao_Huang3;~Yi_Fang2", "gender": "M;M", "homepage": "https://nyuair.github.io/website/;http://mmvc.engineering.nyu.edu/", "dblp": "04/5616-3;96/361-6", "google_scholar": "SsZ7BooAAAAJ;j-cyhzwAAAAJ", "orcid": "0000-0002-9131-5854;", "linkedin": ";", "or_profile": "~Hao_Huang3;~Yi_Fang2", "aff": "New York University;New York University", "aff_domain": "nyu.edu;nyu.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nhuang2022adaptive,\ntitle={Adaptive Wavelet Transformer Network for 3D Shape Representation Learning},\nauthor={Hao Huang and Yi Fang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5MLb3cLCJY}\n}", "github": "", "project": "", "reviewers": "tf1v;Dnca;2wxQ;fDxy", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;4;4", "correctness": "3;4;3;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "99;34;101;146", "wc_summary_review": "131;29;155;64", "wc_main_review": "381;89;401;622", "wc_review": "611;152;657;832", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "846;446;932;1007", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 95.0, 39.91866731242415 ], "wc_summary_review_avg": [ 94.75, 50.529075788104414 ], "wc_main_review_avg": [ 373.25, 189.41010400715163 ], "wc_review_avg": [ 563.0, 251.20808108020728 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 807.75, 216.48599839250573 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, 
"gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4302314314594437925&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=5MLb3cLCJY", "email": "nyu.edu;nyu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "5MbRzxoCAql", "title": "Fight fire with fire: countering bad shortcuts in imitation learning with good shortcuts", "track": "main", "status": "Reject", "tldr": "", "abstract": "When operating in partially observed settings, it is important for a control policy to fuse information from a history of observations. However, a naive implementation of this approach has been observed repeatedly to fail for imitation-learned policies, often in surprising ways, and to the point of sometimes performing worse than when using instantaneous observations alone. We observe that behavioral cloning policies acting on single observations and observation histories each have their strengths and drawbacks, and combining them optimally could achieve the best of both worlds. Motivated by this, we propose a simple model combination approach inspired by human decision making: we first compute a coarse action based on the instantaneous observation, and then refine it into a final action using historical information. Our experiments show that this outperforms all baselines on CARLA autonomous driving from images and various MuJoCo continuous control tasks.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chuan Wen;Jianing Qian;Jierui Lin;Dinesh Jayaraman;Yang Gao", "authorids": "~Chuan_Wen1;~Jianing_Qian2;~Jierui_Lin1;~Dinesh_Jayaraman2;~Yang_Gao1", "gender": "M;F;M;M;M", "homepage": "https://alvinwen428.github.io/;;;https://www.seas.upenn.edu/~dineshj/;http://yang-gao.weebly.com", "dblp": "239/8286;;255/9244;145/3870;89/4402-29", "google_scholar": "G5M9nYwAAAAJ;o67NTxYAAAAJ;Nr_yTQgAAAAJ;QxLpghAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-6888-3095;", "linkedin": ";;jieruilin/;dinesh-jayaraman-44b31539/;yang-gao-45245348/", "or_profile": "~Chuan_Wen1;~Jianing_Qian2;~Jierui_Lin1;~Dinesh_Jayaraman2;~Yang_Gao1", "aff": "Tsinghua University;School of Engineering and Applied Science, University of Pennsylvania;University of Texas at Austin;University of Pennsylvania;Tsinghua University", "aff_domain": "tsinghua.edu.cn;seas.upenn.edu;utexas.edu;upenn.edu;tsinghua.edu.cn", "position": "PhD student;PhD student;MS student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nwen2022fight,\ntitle={Fight fire with fire: countering bad shortcuts in imitation learning with good shortcuts},\nauthor={Chuan Wen and Jianing Qian and Jierui Lin and Dinesh Jayaraman and Yang Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=5MbRzxoCAql}\n}", "github": "", "project": "", "reviewers": "Lt5X;AJ5m;Dq7x;ESVr", "site": "https://openreview.net/forum?id=5MbRzxoCAql", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;2;4", "correctness": "3;2;3;3", "technical_novelty": "2;3;2;4", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "101;88;63;110", "wc_summary_review": "48;90;20;53", "wc_main_review": "105;428;230;277", "wc_review": "254;606;313;440", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": 
"88;400;108;381", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.5, 17.698870020427858 ], "wc_summary_review_avg": [ 52.75, 24.913600703230355 ], "wc_main_review_avg": [ 260.0, 115.58330329247387 ], "wc_review_avg": [ 403.25, 134.98032264000557 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 244.25, 146.5748528909376 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z6C7t23shrkJ:scholar.google.com/&scioq=Fight+fire+with+fire:+countering+bad+shortcuts+in+imitation+learning+with+good+shortcuts&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Tsinghua University;University of Pennsylvania;University of Texas at Austin", "aff_unique_dep": ";School of Engineering and Applied Science;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.upenn.edu;https://www.utexas.edu", "aff_unique_abbr": "THU;UPenn;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "China;United States" }, { "id": "5N4bCRdqHAw", "title": "MFE-NER: Multi-feature Fusion Embedding for Chinese Named Entity Recognition", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pre-trained language models lead Named Entity Recognition (NER) into a new era, while some more knowledge is needed to improve their performance in specific problems. In Chinese NER, character substitution is a complicated linguistic phenomenon. Some Chinese characters are quite similar for sharing the same components or having similar pronunciations. People replace characters in a named entity with similar characters to generate a new collocation but referring to the same object. It becomes even more common in the Internet age and is often used to avoid Internet censorship or just for fun. Such character substitution is not friendly to those pre-trained language models because the new collocations are occasional. As a result, it always leads to unrecognizable or recognition errors in the NER task. In this paper, we propose a new method, Multi-Feature Fusion Embedding (MFE) for Chinese Named Entity Recognition, to strengthen the language pattern of Chinese and handle the character substitution problem in Chinese Named Entity Recognition. MFE fuses semantic, glyph, and phonetic features together. In the glyph domain, we disassemble Chinese characters into components to denote structure features so that characters with similar structures can have close embedding space representation. Meanwhile, an improved phonetic system is also proposed in our work, making it reasonable to calculate phonetic similarity among Chinese characters. 
Experiments demonstrate that our method improves the overall performance of Chinese NER and especially performs well in informal language environments.", "keywords": "Named Entity Recognition;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Jiatong LI;Kui Meng", "authorids": "~Jiatong_LI4;mengkui@sjtu.edu.cn", "gender": "M;", "homepage": "https://phenixace.github.io/;", "dblp": ";", "google_scholar": "ml9hh18AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Jiatong_LI4;mengkui@sjtu.edu.cn", "aff": "The University of Melbourne;", "aff_domain": "unimelb.edu.au;", "position": "MS student;", "bibtex": "@misc{\nli2022mfener,\ntitle={{MFE}-{NER}: Multi-feature Fusion Embedding for Chinese Named Entity Recognition},\nauthor={Jiatong LI and Kui Meng},\nyear={2022},\nurl={https://openreview.net/forum?id=5N4bCRdqHAw}\n}", "github": "", "project": "", "reviewers": "XR8s;hUd8;c2De;Jpgc", "site": "https://openreview.net/forum?id=5N4bCRdqHAw", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "45;35;96;98", "wc_summary_review": "14;26;59;38", "wc_main_review": "131;290;251;156", "wc_review": "190;351;406;292", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.5, 28.72716484444645 ], "wc_summary_review_avg": [ 34.25, 16.618889854620253 ], "wc_main_review_avg": [ 207.0, 65.57819759645731 ], "wc_review_avg": [ 309.75, 80.03241530779887 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.8528028654224417, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8556997999409357383&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Melbourne", "aff_unique_dep": "", "aff_unique_url": "https://www.unimelb.edu.au", "aff_unique_abbr": "UniMelb", "aff_country_unique_index": "0", "aff_country_unique": "Australia" }, { "title": "The Convex Geometry of Backpropagation: Neural Network Gradient Flows Converge to Extreme Points of the Dual Convex Program", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7123", "id": "5QhUE1qiVC6", "poster": "", "openreview": "https://openreview.net/forum?id=5QhUE1qiVC6", "slides": "https://iclr.cc/virtual/2022/poster/7123", "video": "https://iclr.cc/virtual/2022/poster/7123", "author_site": "Yifei Wang, Mert Pilanci", "tldr": "", "abstract": "We study non-convex subgradient flows for training two-layer ReLU neural networks from a convex geometry and duality perspective. We characterize the implicit bias of unregularized non-convex gradient flow as convex regularization of an equivalent convex model. We then show that the limit points of non-convex subgradient flows can be identified via primal-dual correspondence in this convex optimization problem. 
Moreover, we derive a sufficient condition on the dual variables which ensures that the stationary points of the non-convex objective are the KKT points of the convex objective, thus proving convergence of non-convex gradient flows to the global optimum. For a class of regular training data distributions such as orthogonal separable data, we show that this sufficient condition holds. Therefore, non-convex gradient flows in fact converge to optimal solutions of a convex optimization problem. We present numerical results verifying the predictions of our theory for non-convex subgradient descent.", "keywords": "Two-layer ReLU networks;convex optimization;convex duality;gradient flow", "primary_area": "", "supplementary_material": "/attachment/04bee031dbd55f17f67a6d239aea255d5ec906d5.zip", "author": "Yifei Wang;Mert Pilanci", "authorids": "~Yifei_Wang2;~Mert_Pilanci3", "gender": "M;M", "homepage": "http://web.stanford.edu/~wangyf18/;https://stanford.edu/~pilanci/", "dblp": ";45/8056", "google_scholar": ";aSAS-aAAAAAJ", "orcid": ";", "linkedin": ";mert-pilanci-ba615743/", "or_profile": "~Yifei_Wang2;~Mert_Pilanci3", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2022the,\ntitle={The Convex Geometry of Backpropagation: Neural Network Gradient Flows Converge to Extreme Points of the Dual Convex Program},\nauthor={Yifei Wang and Mert Pilanci},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5QhUE1qiVC6}\n}", "github": "", "project": "", "reviewers": "uQbv;pBSm;8kmN", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "2;3;3", "correctness": "3;4;4", "technical_novelty": "3;3;4", "empirical_novelty": "0;3;0", "wc_summary_paper": "136;51;40", "wc_summary_review": "25;28;20", "wc_main_review": "225;39;118", "wc_review": "386;118;178", "wc_reply_reviewers": "18;0;0", "wc_reply_authors": "172;8;125", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 75.66666666666667, 42.89781139198388 ], "wc_summary_review_avg": [ 24.333333333333332, 3.299831645537222 ], "wc_main_review_avg": [ 127.33333333333333, 76.22044053635189 ], "wc_review_avg": [ 227.33333333333334, 114.83708266738387 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 101.66666666666667, 68.9557023667288 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1638476532596439892&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=5QhUE1qiVC6", "email": "stanford.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", 
"aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On the role of population heterogeneity in emergent communication", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6605", "id": "5Qkd7-bZfI", "poster": "", "openreview": "https://openreview.net/forum?id=5Qkd7-bZfI", "slides": "https://iclr.cc/virtual/2022/poster/6605", "video": "https://iclr.cc/virtual/2022/poster/6605", "author_site": "Mathieu Rita, Florian Strub, Jean-Bastien Grill, Olivier Pietquin, Emmanuel Dupoux", "tldr": "", "abstract": "Populations have often been perceived as a structuring component for language to emerge and evolve: the larger the population, the more systematic the language. While this observation is widespread in the sociolinguistic literature, it has not been reproduced in computer simulations with neural agents. In this paper, we thus aim to clarify this apparent contradiction. We explore emergent language properties by varying agent population size in the speaker-listener Lewis Game. After reproducing the experimental paradox, we challenge the simulation assumption that the agent community is homogeneous. We first investigate how speaker-listener asymmetry alters language structure to examine two potential diversity factors: training speed and network capacity. We find out that emergent language properties are only altered by the relative difference of factors between speaker and listener, and not by their absolute values. From then, we leverage this observation to control population heterogeneity without introducing confounding factors. We finally show that introducing such training speed heterogeneities naturally sort out the initial paradox: larger simulated communities start developing more systematic and structured languages.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mathieu Rita;Florian Strub;Jean-Bastien Grill;Olivier Pietquin;Emmanuel Dupoux", "authorids": "~Mathieu_Rita1;~Florian_Strub1;~Jean-Bastien_Grill2;~Olivier_Pietquin1;emmanuel.dupoux@gmail.com", "gender": "M;M;M;M;", "homepage": ";http://www.florian-strub.com;https://dblp.uni-trier.de/pid/178/3291.html;http://www.cristal.univ-lille.fr/~pietquin/;", "dblp": ";;;58/6269;", "google_scholar": "https://scholar.google.fr/citations?hl=fr;zxO5kccAAAAJ;;8K8-LdwAAAAJ;", "orcid": ";;;;", "linkedin": ";florian-strub-64443527/;;opietquin/;", "or_profile": "~Mathieu_Rita1;~Florian_Strub1;~Jean-Bastien_Grill2;~Olivier_Pietquin1;emmanuel.dupoux@gmail.com", "aff": "INRIA;Google DeepMind;Google DeepMind;Google Brain;", "aff_domain": "inria.fr;google.com;deepmind.com;google.com;", "position": "PhD student;Research Scientist;Researcher;Staff Research Scientist;", "bibtex": "@inproceedings{\nrita2022on,\ntitle={On the role of population heterogeneity in emergent communication},\nauthor={Mathieu Rita and Florian Strub and Jean-Bastien Grill and Olivier Pietquin and Emmanuel Dupoux},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5Qkd7-bZfI}\n}", "github": "", "project": "", "reviewers": "grQ9;5n9H;UKkL;zxeP", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;2;4;4", "correctness": "3;3;3;2", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "153;65;186;258", "wc_summary_review": "119;20;40;141", "wc_main_review": "477;440;818;366", "wc_review": "749;525;1044;765", "wc_reply_reviewers": "0;71;49;323", "wc_reply_authors": "818;780;1357;1830", 
"reply_reviewers": "0;1;1;2", "reply_authors": "1;1;2;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 165.5, 69.34154598795732 ], "wc_summary_review_avg": [ 80.0, 51.09305236526782 ], "wc_main_review_avg": [ 525.25, 173.67984195064204 ], "wc_review_avg": [ 770.75, 184.09559337474647 ], "wc_reply_reviewers_avg": [ 110.75, 125.2085759842352 ], "wc_reply_authors_avg": [ 1196.25, 431.2240571906906 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9738620591444184168&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=5Qkd7-bZfI", "email": "inria.fr;google.com;deepmind.com;google.com;", "author_num": 5, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "INRIA;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.inria.fr;https://deepmind.com", "aff_unique_abbr": "INRIA;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "France;United Kingdom;United States" }, { "id": "5SgoJKayTvs", "title": "Intervention Adversarial Auto-Encoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we propose a new method to stabilize the training process of the latent variables of adversarial auto-encoders, which we name Intervention Adversarial auto-encoder (IVAAE). The main idea is to introduce a sequence of distributions that bridge the distribution of the learned latent variable and its prior distribution. We theoretically and heuristically demonstrate that such bridge-like distributions, realized by a multi-output discriminator, have an effect on guiding the initial latent distribution towards the target one and hence stabilizing the training process. Several different types of the bridge distributions are proposed. We also apply a novel use of Stein variational gradient descent (SVGD), by which point assemble develops in a smooth and gradual fashion. We conduct experiments on multiple real-world datasets. 
The results show that IVAAE enjoys a more stable training process and achieves better generative performance than the vanilla adversarial auto-encoder (AAE).", "keywords": "Deep Learning;Generative Models;Adversarial Training", "primary_area": "", "supplementary_material": "", "author": "Yang Hu;Cheng Zhang", "authorids": "~Yang_Hu8;~Cheng_Zhang3", "gender": "M;M", "homepage": "https://www.researchgate.net/profile/Yang-Hu-6;https://zcrabbit.github.io", "dblp": ";", "google_scholar": ";PddDrLgAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yang_Hu8;~Cheng_Zhang3", "aff": "Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "position": "Undergrad student;Assistant Professor", "bibtex": "@misc{\nhu2022intervention,\ntitle={Intervention Adversarial Auto-Encoder},\nauthor={Yang Hu and Cheng Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=5SgoJKayTvs}\n}", "github": "", "project": "", "reviewers": "Nq3D;tnv3;LAu5", "site": "https://openreview.net/forum?id=5SgoJKayTvs", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;5;3", "correctness": "2;2;2", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "86;14;25", "wc_summary_review": "29;8;16", "wc_main_review": "163;104;392", "wc_review": "278;126;433", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 41.666666666666664, 31.668421004036322 ], "wc_summary_review_avg": [ 17.666666666666668, 8.65383665716478 ], "wc_main_review_avg": [ 219.66666666666666, 124.21576210592421 ], "wc_review_avg": [ 279.0, 125.3342198550207 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:piChd2nsszwJ:scholar.google.com/&scioq=Intervention+Adversarial+Auto-Encoder&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Modular Lifelong Reinforcement Learning via Neural Composition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6937", "id": "5XmLzdslFNN", "poster": "", "openreview": "https://openreview.net/forum?id=5XmLzdslFNN", "slides": "https://iclr.cc/virtual/2022/poster/6937", "video": "https://iclr.cc/virtual/2022/poster/6937", "author_site": "Jorge Mendez, Harm van Seijen, ERIC EATON", "tldr": "", "abstract": "Humans commonly solve complex problems by decomposing them into easier subproblems and then combining the subproblem solutions. This type of compositional reasoning permits reuse of the subproblem solutions when tackling future tasks that share part of the underlying compositional structure.
In a continual or lifelong reinforcement learning (RL) setting, this ability to decompose knowledge into reusable components would enable agents to quickly learn new RL tasks by leveraging accumulated compositional structures. We explore a particular form of composition based on neural modules and present a set of RL problems that intuitively admit compositional solutions. Empirically, we demonstrate that neural composition indeed captures the underlying structure of this space of problems. We further propose a compositional lifelong RL method that leverages accumulated neural components to accelerate the learning of future tasks while retaining performance on previous tasks via off-line RL over replayed experiences.", "keywords": "lifelong learning;continual learning;reinforcement learning;composition;modularity;compositionality", "primary_area": "", "supplementary_material": "", "author": "Jorge A Mendez;Harm van Seijen;ERIC EATON", "authorids": "~Jorge_A_Mendez1;~Harm_van_Seijen1;~ERIC_EATON1", "gender": "M;;M", "homepage": "https://www.microsoft.com/en-us/research/people/havansei/;;https://www.seas.upenn.edu/~mendezme/", "dblp": "33/7770;22/2336;255/6609", "google_scholar": ";QIZWnnQAAAAJ;87sQtnsAAAAJ", "orcid": ";;0000-0002-2537-598X", "linkedin": ";;", "or_profile": "~Harm_van_Seijen1;~ERIC_EATON1;~Jorge_Armando_Mendez_Mendez1", "aff": "Microsoft Research;University of Pennsylvania;University of Pennsylvania", "aff_domain": "microsoft.com;upenn.edu;upenn.edu", "position": "Research manager;Faculty;PhD student", "bibtex": "@inproceedings{\nmendez2022modular,\ntitle={Modular Lifelong Reinforcement Learning via Neural Composition},\nauthor={Jorge A Mendez and Harm van Seijen and ERIC EATON},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5XmLzdslFNN}\n}", "github": "", "project": "", "reviewers": "HvzU;77pf;52Uy;wUbR", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;4;3;2", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "82;326;65;682", "wc_summary_review": "90;52;77;357", "wc_main_review": "837;860;477;696", "wc_review": "1009;1238;619;1735", "wc_reply_reviewers": "237;232;31;990", "wc_reply_authors": "1647;1171;884;2301", "reply_reviewers": "2;1;1;5", "reply_authors": "3;3;2;5", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 288.75, 249.42070383189926 ], "wc_summary_review_avg": [ 144.0, 123.73156428333071 ], "wc_main_review_avg": [ 717.5, 152.3884838168554 ], "wc_review_avg": [ 1150.25, 403.67398665259566 ], "wc_reply_reviewers_avg": [ 372.5, 366.0700069658808 ], "wc_reply_authors_avg": [ 1500.75, 536.4011442008676 ], "reply_reviewers_avg": [ 2.25, 1.6393596310755 ], "reply_authors_avg": [ 3.25, 1.0897247358851685 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17042814609795844207&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=5XmLzdslFNN", "email": "microsoft.com;upenn.edu;upenn.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": 
"Microsoft;University of Pennsylvania", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.upenn.edu", "aff_unique_abbr": "MSR;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "5_zwnS5oJDp", "title": "Bayesian Learning with Information Gain Provably Bounds Risk for a Robust Adversarial Defense", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we present a novel method to learn a Bayesian neural network robust against adversarial attacks. Previous algorithms have shown an adversarially trained Bayesian Neural Network (BNN) provides improved robustness against attacks. However, the learning approach for approximating the multi-modal Bayesian posterior leads to mode collapse with consequential sub-par robustness and under performance of an adversarially trained BNN. Instead, we propose approximating the multi-modal posterior of a BNN to prevent mode collapse and encourage diversity over learned posterior distributions of models to develop a novel adversarial training method for BNNs. Importantly, we conceptualize and formulate information gain (IG) in the adversarial Bayesian learning context and prove, training a BNN with IG bounds the difference between the conventional empirical risk with the risk obtained from adversarial training---our intuition is that information gain from benign and adversarial examples should be the same for a robust BNN. Extensive experimental results demonstrate our proposed algorithm to achieve state-of-the-art performance under strong adversarial attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bao Gia Doan;Ehsan M Abbasnejad;Damith C. Ranasinghe", "authorids": "~Bao_Gia_Doan1;~Ehsan_M_Abbasnejad1;damith.ranasinghe@adelaide.edu.au", "gender": "M;M;", "homepage": "https://researchers.adelaide.edu.au/profile/bao.doan;https://ehsanabb.github.io/;", "dblp": "247/1144;30/11191;", "google_scholar": ";https://scholar.google.com/citations?hl=en;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Bao_Gia_Doan1;~Ehsan_M_Abbasnejad1;damith.ranasinghe@adelaide.edu.au", "aff": "The University of Adelaide;University of Adelaide;", "aff_domain": "adelaide.edu.au;adelaide.edu.au;", "position": "PhD student;Assistant Professor;", "bibtex": "@misc{\ndoan2022bayesian,\ntitle={Bayesian Learning with Information Gain Provably Bounds Risk for a Robust Adversarial Defense},\nauthor={Bao Gia Doan and Ehsan M Abbasnejad and Damith C. 
Ranasinghe},\nyear={2022},\nurl={https://openreview.net/forum?id=5_zwnS5oJDp}\n}", "github": "", "project": "", "reviewers": "UwZA;qjBg;RZv2;CwGg", "site": "https://openreview.net/forum?id=5_zwnS5oJDp", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;5", "correctness": "2;2;2;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "87;69;105;65", "wc_summary_review": "80;155;60;21", "wc_main_review": "397;129;703;201", "wc_review": "564;353;868;287", "wc_reply_reviewers": "0;21;0;0", "wc_reply_authors": "801;252;1685;921", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;3;2", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 15.898113095584646 ], "wc_summary_review_avg": [ 79.0, 48.73910134583936 ], "wc_main_review_avg": [ 357.5, 222.28079089296043 ], "wc_review_avg": [ 518.0, 226.4961368323972 ], "wc_reply_reviewers_avg": [ 5.25, 9.093266739736606 ], "wc_reply_authors_avg": [ 914.75, 511.24865525495517 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3769346561128501605&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "University of Adelaide", "aff_unique_dep": "", "aff_unique_url": "https://www.adelaide.edu.au", "aff_unique_abbr": "Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "5alVAdi6wW4", "title": "Generating Unobserved Alternatives with Tower Implicit Model (TIM)", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We consider problems where multiple predictions can be considered correct, but only one of them is given as supervision. This setting differs from both the regression and class-conditional generative modelling settings: in the former, there is a unique observed output for each input, which is provided as supervision; in the latter, there are many observed outputs for each input, and many are provided as supervision. Applying either regression methods or conditional generative models to the present setting often results in a model that can only make a single prediction for each input. 
We explore several problems that have this property and develop an approach, TIM, that can generate multiple high-quality predictions given the same input, achieving a reduction in the Fr\u00e9chet Inception Distance (FID) of 19.6% on average compared to the baseline.", "keywords": "one-to-many prediction;inverse problem;implicit maximum likelihood estimation", "primary_area": "", "supplementary_material": "/attachment/454600569697e38f4130cb11a8a5a806edcf2b25.zip", "author": "Shichong Peng;Seyed Alireza Moazenipourasil;Ke Li", "authorids": "~Shichong_Peng1;~Seyed_Alireza_Moazenipourasil1;~Ke_Li1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/niopeng/home;https://amoazeni75.github.io/;http://www.sfu.ca/~keli/", "dblp": "221/4790;;75/6627-11", "google_scholar": ";KjSsypYAAAAJ;vQc8tI4AAAAJ", "orcid": ";;", "linkedin": ";s-alireza-moazeni/;", "or_profile": "~Shichong_Peng1;~Seyed_Alireza_Moazenipourasil1;~Ke_Li1", "aff": "Simon Fraser University;Simon Fraser University;Simon Fraser University", "aff_domain": "sfu.ca;sfu.ca;sfu.ca", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\npeng2022generating,\ntitle={Generating Unobserved Alternatives with Tower Implicit Model ({TIM})},\nauthor={Shichong Peng and Seyed Alireza Moazenipourasil and Ke Li},\nyear={2022},\nurl={https://openreview.net/forum?id=5alVAdi6wW4}\n}", "github": "", "project": "", "reviewers": "kEQP;xS3B;nuX3;hLdY", "site": "https://openreview.net/forum?id=5alVAdi6wW4", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "67;131;84;53", "wc_summary_review": "42;64;83;49", "wc_main_review": "283;243;738;221", "wc_review": "392;438;905;323", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.75, 29.40556920040828 ], "wc_summary_review_avg": [ 59.5, 15.724185193516387 ], "wc_main_review_avg": [ 371.25, 212.90652291557439 ], "wc_review_avg": [ 514.5, 229.14024090063273 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:k4gXDqFRw5IJ:scholar.google.com/&scioq=Generating+Unobserved+Alternatives+with+Tower+Implicit+Model+(TIM)&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Simon Fraser University", "aff_unique_dep": "", "aff_unique_url": "https://www.sfu.ca", "aff_unique_abbr": "SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "5fbUEUTZEn7", "title": "Graph Kernel Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The convolution operator at the core of many modern neural architectures can effectively be seen as performing a dot product between an input matrix and a filter. 
While this is readily applicable to data such as images, which can be represented as regular grids in the Euclidean space, extending the convolution operator to work on graphs proves more challenging, due to their irregular structure. In this paper, we propose to use graph kernels, i.e., kernel functions that compute an inner product on graphs, to extend the standard convolution operator to the graph domain. This allows us to define an entirely structural model that does not require computing the embedding of the input graph. Our architecture allows us to plug in any type and number of graph kernels and has the added benefit of providing some interpretability in terms of the structural masks that are learned during the training process, similarly to what happens for convolutional masks in traditional convolutional neural networks. We perform an extensive ablation study to investigate the impact of the model hyper-parameters and we show that our model achieves competitive performance on standard graph classification datasets.", "keywords": "graph neural network;graph kernel;deep learning", "primary_area": "", "supplementary_material": "/attachment/36f2444d6bc506f0ecc6b80a34a62a8724700b61.zip", "author": "Luca Cosmo;Giorgia Minello;Michael M. Bronstein;Emanuele Rodol\u00e0;Luca Rossi;Andrea Torsello", "authorids": "~Luca_Cosmo2;~Giorgia_Minello1;~Michael_M._Bronstein1;~Emanuele_Rodol\u00e01;~Luca_Rossi1;~Andrea_Torsello3", "gender": "M;F;M;M;;M", "homepage": ";http://www.dais.unive.it/~minello;http://www.inf.usi.ch/bronstein/;;https://blextar.github.io/luca-rossi;http://www.dsi.unive.it/~atorsell", "dblp": "122/8728;168/0171;07/2668;54/8401;61/7974-4;11/2491", "google_scholar": "https://scholar.google.it/citations?hl=it;eK0cNpcAAAAJ;UU3N6-UAAAAJ;-EH4wBYAAAAJ;NykUU_YAAAAJ;https://scholar.google.it/citations?user=emE_ITIAAAAJ", "orcid": "0000-0001-7729-4666;;;0000-0003-0091-7241;;0000-0001-9189-4924", "linkedin": ";;mbronstein/;;;", "or_profile": "~Luca_Cosmo2;~Giorgia_Minello1;~Michael_M._Bronstein1;~Emanuele_Rodol\u00e01;~Luca_Rossi1;~Andrea_Torsello3", "aff": "University of Venice;University Ca' Foscari of Venice;Twitter;Sapienza University of Rome;Queen Mary University London;University Ca' Foscari of Venice", "aff_domain": "unive.it;unive.it;twitter.com;uniroma1.it;qmul.ac.uk;unive.it", "position": "Assistant Professor;Postdoc;Head of Graph ML;Full Professor;Lecturer;Full Professor", "bibtex": "@misc{\ncosmo2022graph,\ntitle={Graph Kernel Neural Networks},\nauthor={Luca Cosmo and Giorgia Minello and Michael M. 
Bronstein and Emanuele Rodol{\\`a} and Luca Rossi and Andrea Torsello},\nyear={2022},\nurl={https://openreview.net/forum?id=5fbUEUTZEn7}\n}", "github": "", "project": "", "reviewers": "6pRJ;55BS;ovv9", "site": "https://openreview.net/forum?id=5fbUEUTZEn7", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;3", "correctness": "4;3;3", "technical_novelty": "3;4;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "117;111;42", "wc_summary_review": "85;40;29", "wc_main_review": "826;888;376", "wc_review": "1028;1039;447", "wc_reply_reviewers": "80;25;0", "wc_reply_authors": "1455;1235;1090", "reply_reviewers": "1;1;0", "reply_authors": "3;2;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 90.0, 34.02939905434711 ], "wc_summary_review_avg": [ 51.333333333333336, 24.225789747475496 ], "wc_main_review_avg": [ 696.6666666666666, 228.1539441303223 ], "wc_review_avg": [ 838.0, 276.5152195931838 ], "wc_reply_reviewers_avg": [ 35.0, 33.4165627596057 ], "wc_reply_authors_avg": [ 1260.0, 150.05554527129834 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.9999999999999997, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13695278743289408015&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff_unique_index": "0;1;2;3;4;1", "aff_unique_norm": "University of Venice;Ca' Foscari University of Venice;Twitter, Inc.;Sapienza University of Rome;Queen Mary University of London", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.unive.it;https://www.unive.it;https://twitter.com;https://www.uniroma1.it;https://www.qmul.ac.uk", "aff_unique_abbr": "Unive;Ca' Foscari;Twitter;Sapienza;QMUL", "aff_campus_unique_index": "1;2;3;1", "aff_campus_unique": ";Venice;Rome;London", "aff_country_unique_index": "0;0;1;0;2;0", "aff_country_unique": "Italy;United States;United Kingdom" }, { "id": "5fmBRf5rrC", "title": "Knothe-Rosenblatt transport for Unsupervised Domain Adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": " Unsupervised domain adaptation (UDA) aims at exploiting related but different data sources in order to tackle a common task in a target domain. UDA remains a central yet challenging problem in machine learning.\n In this paper, we present an approach based on the Knothe-Rosenblatt transport: we exploit autoregressive density estimation algorithms to accurately model the different sources by an autoregressive model using a mixture of Gaussians.\n Our Knothe-Rosenblatt Domain Adaptation (KRDA) then takes advantage of the triangularity of the autoregressive models to build an explicit mapping of the source samples into the target domain. 
We show that the transfer map built by KRDA preserves the quantiles of each component of the observations, hence aligning the representations of the different data sets in the same target domain.\n Finally, we show that KRDA has state-of-the-art performance on both synthetic and real-world UDA problems.", "keywords": "domain adaptation;transfer learning;deep learning;density estimation;transport", "primary_area": "", "supplementary_material": "", "author": "Aladin Virmaux;Illyyne Saffar;Jianfeng Zhang;Bal\u00e1zs K\u00e9gl", "authorids": "~Aladin_Virmaux1;~Illyyne_Saffar1;~Jianfeng_Zhang2;~Bal\u00e1zs_K\u00e9gl2", "gender": ";F;M;M", "homepage": "https://avirmaux.github.io;;;https://scholar.google.com/citations?user=s0njcGgAAAAJ&hl=en&oi=ao", "dblp": "192/8303;;74/5065;k/BalazsKegl.html", "google_scholar": "5FxvLvwAAAAJ;https://scholar.google.com/citations?hl=fr;_Wzsb6YAAAAJ;s0njcGgAAAAJ", "orcid": ";;;", "linkedin": ";illyynesaffar/;;balazskegl", "or_profile": "~Aladin_Virmaux1;~Illyyne_Saffar1;~Jianfeng_Zhang2;~Balazs_Kegl1", "aff": "Huawei Technologies Ltd.;Ericsson;Huawei Technologies Ltd.;CNRS (on leave)", "aff_domain": "huawei.com;ericsson.com;huawei.com;in2p3.fr", "position": "Researcher;Researcher;Researcher;Principal Researcher", "bibtex": "@misc{\nvirmaux2022knotherosenblatt,\ntitle={Knothe-Rosenblatt transport for Unsupervised Domain Adaptation},\nauthor={Aladin Virmaux and Illyyne Saffar and Jianfeng Zhang and Bal{\\'a}zs K{\\'e}gl},\nyear={2022},\nurl={https://openreview.net/forum?id=5fmBRf5rrC}\n}", "github": "", "project": "", "reviewers": "HHVi;kupY;paqc;KT59", "site": "https://openreview.net/forum?id=5fmBRf5rrC", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;3;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;0;1;2", "wc_summary_paper": "89;64;55;89", "wc_summary_review": "43;3;47;103", "wc_main_review": "498;196;280;348", "wc_review": "630;263;382;540", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 74.25, 15.08931741332258 ], "wc_summary_review_avg": [ 49.0, 35.608987629529715 ], "wc_main_review_avg": [ 330.5, 110.68310620867125 ], "wc_review_avg": [ 453.75, 141.45383522548974 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IiFPf7xqNf0J:scholar.google.com/&scioq=Knothe-Rosenblatt+transport+for+Unsupervised+Domain+Adaptation&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Huawei;Ericsson;CNRS", "aff_unique_dep": "Huawei Technologies;;", "aff_unique_url": "https://www.huawei.com;https://www.ericsson.com;https://www.cnrs.fr", "aff_unique_abbr": "Huawei;Ericsson;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "China;Sweden;France" }, { "title": "Open-Set Recognition: A Good Closed-Set Classifier is All You Need", "status": "Oral", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6727", "id": "5hLP5JY9S2d", "poster": "", "openreview": "https://openreview.net/forum?id=5hLP5JY9S2d", "slides": "https://iclr.cc/virtual/2022/poster/6727", "video": "https://iclr.cc/virtual/2022/poster/6727", "author_site": "Sagar Vaze, Kai Han, Andrea Vedaldi, Andrew Zisserman", "tldr": "", "abstract": "The ability to identify whether or not a test sample belongs to one of the semantic classes in a classifier's training set is critical to practical deployment of the model. This task is termed open-set recognition (OSR) and has received significant attention in recent years. In this paper, we first demonstrate that the ability of a classifier to make the 'none-of-above' decision is highly correlated with its accuracy on the closed-set classes. We find that this relationship holds across loss objectives and architectures, and further demonstrate the trend both on the standard OSR benchmarks as well as on a large-scale ImageNet evaluation. Second, we use this correlation to boost the performance of the maximum softmax probability OSR 'baseline' by improving its closed-set accuracy, and with this strong baseline achieve state-of-the-art on a number of OSR benchmarks. Similarly, we boost the performance of the existing state-of-the-art method by improving its closed-set accuracy, but the resulting discrepancy with the strong baseline is marginal. Our third contribution is to present the 'Semantic Shift Benchmark' (SSB), which better respects the task of detecting semantic novelty, as opposed to low-level distributional shifts as tackled by neighbouring machine learning fields. On this new evaluation, we again demonstrate that there is negligible difference between the strong baseline and the existing state-of-the-art. 
Code available at: https://github.com/sgvaze/osr_closed_set_all_you_need.", "keywords": "open set recognition;image recognition;computer vision", "primary_area": "", "supplementary_material": "/attachment/77b537399e49faa19e972ac00c03fc55fab82141.zip", "author": "Sagar Vaze;Kai Han;Andrea Vedaldi;Andrew Zisserman", "authorids": "~Sagar_Vaze1;~Kai_Han1;~Andrea_Vedaldi1;~Andrew_Zisserman1", "gender": "M;M;M;", "homepage": "https://sgvaze.github.io/;http://www.kaihan.org/;https://www.robots.ox.ac.uk/~vedaldi/;", "dblp": "226/4705;51/4757-1.html;99/2825;", "google_scholar": "lvuOknUAAAAJ;tG8S_vMAAAAJ;bRT7t28AAAAJ;", "orcid": "0000-0003-2920-9345;0000-0002-7995-9999;0000-0003-1374-2858;", "linkedin": "sagar-vaze-2356ab171/;kaihancs/;;", "or_profile": "~Sagar_Vaze1;~Kai_Han1;~Andrea_Vedaldi1;~Andrew_Zisserman1", "aff": "University of Oxford;Google Research;Meta;", "aff_domain": "ox.ac.uk;google.com;meta.com;", "position": "PhD student;Visiting Faculty Researcher;Researcher;", "bibtex": "@inproceedings{\nvaze2022openset,\ntitle={Open-Set Recognition: A Good Closed-Set Classifier is All You Need},\nauthor={Sagar Vaze and Kai Han and Andrea Vedaldi and Andrew Zisserman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5hLP5JY9S2d}\n}", "github": "", "project": "", "reviewers": "PPFy;dw7J;HAFU", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;4", "correctness": "4;4;4", "technical_novelty": "2;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "82;187;70", "wc_summary_review": "103;64;60", "wc_main_review": "100;509;469", "wc_review": "285;760;599", "wc_reply_reviewers": "31;108;171", "wc_reply_authors": "89;1035;850", "reply_reviewers": "1;1;1", "reply_authors": "1;2;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 113.0, 52.55473337388365 ], "wc_summary_review_avg": [ 75.66666666666667, 19.39644870130154 ], "wc_main_review_avg": [ 359.3333333333333, 184.10202485457773 ], "wc_review_avg": [ 548.0, 197.24265934798856 ], "wc_reply_reviewers_avg": [ 103.33333333333333, 57.24993934979805 ], "wc_reply_authors_avg": [ 658.0, 409.3710623220291 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 556, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15114021291138625040&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=5hLP5JY9S2d", "email": "ox.ac.uk;google.com;meta.com;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Oxford;Google;Meta", "aff_unique_dep": ";Google Research;Meta Platforms, Inc.", "aff_unique_url": "https://www.ox.ac.uk;https://research.google;https://meta.com", "aff_unique_abbr": "Oxford;Google Research;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Privacy Implications of Shuffling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6854", "id": "5i2f-aR6B8H", "poster": "", "openreview": "https://openreview.net/forum?id=5i2f-aR6B8H", 
"slides": "https://iclr.cc/virtual/2022/poster/6854", "video": "https://iclr.cc/virtual/2022/poster/6854", "author_site": "Casey Meehan, Amrita Roy Chowdhury, Kamalika Chaudhuri, Somesh Jha", "tldr": "", "abstract": "\\ldp deployments are vulnerable to inference attacks as an adversary can link the noisy responses to their identity and subsequently, auxiliary information using the \\textit{order} of the data. An alternative model, shuffle \\textsf{DP}, prevents this by shuffling the noisy responses uniformly at random. However, this limits the data learnability -- only symmetric functions (input order agnostic) can be learned. In this paper, we strike a balance and show that systematic shuffling of the noisy responses can thwart specific inference attacks while retaining some meaningful data learnability. To this end, we propose a novel privacy guarantee, \\name-privacy, that captures the privacy of the order of a data sequence. \\name-privacy allows tuning the granularity at which the ordinal information is maintained, which formalizes the degree the resistance to inference attacks trading it off with data learnability. Additionally, we propose a novel shuffling mechanism that can achieve \\name-privacy and demonstrate the practicality of our mechanism via evaluation on real-world datasets. ", "keywords": "local differential privacy;shuffle DP model", "primary_area": "", "supplementary_material": "/attachment/58c62f90ba8a0ca1711113b01bdfb0417ab0e931.zip", "author": "Casey Meehan;Amrita Roy Chowdhury;Kamalika Chaudhuri;Somesh Jha", "authorids": "~Casey_Meehan1;~Amrita_Roy_Chowdhury1;~Kamalika_Chaudhuri1;~Somesh_Jha1", "gender": "M;F;F;M", "homepage": "https://casey-meehan.github.io/;https://sites.google.com/wisc.edu/amrita-roy-chowdhury/;http://cseweb.ucsd.edu/users/kamalika;", "dblp": "255/5544;147/6281.html;56/6435;j/SomeshJha", "google_scholar": "s-lqUEUAAAAJ;lWWAZ4YAAAAJ;I-DJ7EsAAAAJ;BaI7l8QAAAAJ", "orcid": ";;;", "linkedin": "casey-meehan-ucsd/;;;", "or_profile": "~Casey_Meehan1;~Amrita_Roy_Chowdhury1;~Kamalika_Chaudhuri1;~Somesh_Jha1", "aff": "University of California, San Diego;;University of California, San Diego;Department of Computer Science, University of Wisconsin, Madison", "aff_domain": "ucsd.edu;;ucsd.edu;cs.wisc.edu", "position": "PhD student;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nmeehan2022privacy,\ntitle={Privacy Implications of Shuffling},\nauthor={Casey Meehan and Amrita Roy Chowdhury and Kamalika Chaudhuri and Somesh Jha},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5i2f-aR6B8H}\n}", "github": "", "project": "", "reviewers": "f7JZ;FmCK;ayWA", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;3", "correctness": "3;3;3", "technical_novelty": "3;2;4", "empirical_novelty": "3;3;4", "wc_summary_paper": "73;110;103", "wc_summary_review": "79;27;11", "wc_main_review": "384;226;94", "wc_review": "536;363;208", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "805;713;27", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 95.33333333333333, 16.048537489614297 ], "wc_summary_review_avg": [ 39.0, 29.028721409436322 ], "wc_main_review_avg": [ 234.66666666666666, 118.55050494292391 ], 
"wc_review_avg": [ 369.0, 133.97263402152944 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 515.0, 347.1061317042191 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3610310146939368322&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=5i2f-aR6B8H", "email": "ucsd.edu;;ucsd.edu;cs.wisc.edu", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, San Diego;University of Wisconsin-Madison", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.ucsd.edu;https://www.wisc.edu", "aff_unique_abbr": "UCSD;UW-Madison", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "San Diego;Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning by Directional Gradient Descent", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7001", "id": "5i7lJLuhTm", "poster": "", "openreview": "https://openreview.net/forum?id=5i7lJLuhTm", "slides": "https://iclr.cc/virtual/2022/poster/7001", "video": "https://iclr.cc/virtual/2022/poster/7001", "author_site": "David Silver, Anirudh Goyal, Ivo Danihelka, Matteo Hessel, Hado van Hasselt", "tldr": "", "abstract": "How should state be constructed from a sequence of observations, so as to best achieve some objective? Most deep learning methods update the parameters of the state representation by gradient descent. However, no prior method for computing the gradient is fully satisfactory, for example consuming too much memory, introducing too much variance, or adding too much bias. In this work, we propose a new learning algorithm that addresses these limitations. The basic idea is to update the parameters of the representation by using the directional derivative along a candidate direction, a quantity that may be computed online with the same computational cost as the representation itself. We consider several different choices of candidate direction, including random selection and approximations to the true gradient, and investigate their performance on several synthetic tasks. 
\n", "keywords": "credit assignment;directional derivative;recurrent networks", "primary_area": "", "supplementary_material": "", "author": "David Silver;Anirudh Goyal;Ivo Danihelka;Matteo Hessel;Hado van Hasselt", "authorids": "~David_Silver1;~Anirudh_Goyal1;~Ivo_Danihelka1;~Matteo_Hessel1;~Hado_van_Hasselt1", "gender": ";M;M;M;M", "homepage": ";https://anirudh9119.github.io/;;;http://hadovanhasselt.com", "dblp": "34/3601;172/1039;26/2791;162/3167;https://dblp.uni-trier.de/pers/h/Hasselt:Hado_van.html", "google_scholar": ";krrh6OUAAAAJ;https://scholar.google.co.uk/citations?user=1TTFBEkAAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~David_Silver1;~Anirudh_Goyal1;~Ivo_Danihelka1;~Matteo_Hessel1;~Hado_van_Hasselt1", "aff": "Google DeepMind;University of Montreal;Google DeepMind;;Google DeepMind", "aff_domain": "deepmind.com;umontreal.ca;deepmind.com;;google.com", "position": "Full Professor;PhD student;Research Scientist;;Research scientist", "bibtex": "@inproceedings{\nsilver2022learning,\ntitle={Learning by Directional Gradient Descent},\nauthor={David Silver and Anirudh Goyal and Ivo Danihelka and Matteo Hessel and Hado van Hasselt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5i7lJLuhTm}\n}", "github": "", "project": "", "reviewers": "AjYD;GYWv;brku;gsh8", "pdf_size": 0, "recommendation": "1;5;6;6", "confidence": "5;3;3;3", "correctness": "2;3;4;3", "technical_novelty": "1;1;3;3", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "11;61;175;115", "wc_summary_review": "27;71;37;54", "wc_main_review": "17;254;321;402", "wc_review": "55;386;533;571", "wc_reply_reviewers": "0;141;187;0", "wc_reply_authors": "0;721;574;140", "reply_reviewers": "0;1;1;0", "reply_authors": "0;4;2;1", "recommendation_avg": [ 4.5, 2.0615528128088303 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 1.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 90.5, 61.096235563248904 ], "wc_summary_review_avg": [ 47.25, 16.768646337734005 ], "wc_main_review_avg": [ 248.5, 143.56270407038173 ], "wc_review_avg": [ 386.25, 203.34376680882056 ], "wc_reply_reviewers_avg": [ 82.0, 83.59724875855665 ], "wc_reply_authors_avg": [ 358.75, 297.5360272303171 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 1.479019945774904 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9801960588196067, "corr_recommendation_correctness": 0.8574929257125441, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15939678127630533507&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=5i7lJLuhTm", "email": "deepmind.com;umontreal.ca;deepmind.com;;google.com", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;University of Montreal", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://wwwumontreal.ca", "aff_unique_abbr": "DeepMind;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "IGLU: Efficient GCN Training via Lazy Updates", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6235", "id": "5kq11Tl1z4", "poster": "", "openreview": "https://openreview.net/forum?id=5kq11Tl1z4", "slides": 
"https://iclr.cc/virtual/2022/poster/6235", "video": "https://iclr.cc/virtual/2022/poster/6235", "author_site": "S Deepak Narayanan, Aditya Sinha, Prateek Jain, Purushottam Kar, SUNDARARAJAN SELLAMANICKAM", "tldr": "", "abstract": "Training multi-layer Graph Convolution Networks (GCN) using standard SGD techniques scales poorly as each descent step ends up updating node embeddings for a large portion of the graph. Recent attempts to remedy this sub-sample the graph that reduces compute but introduce additional variance and may offer suboptimal performance. This paper develops the IGLU method that caches intermediate computations at various GCN layers thus enabling lazy updates that significantly reduce the compute cost of descent. IGLU introduces bounded bias into the gradients but nevertheless converges to a first-order saddle point under standard assumptions such as objective smoothness. Benchmark experiments show that IGLU offers up to 1.2% better accuracy despite requiring up to 88% less compute.", "keywords": "Graph convolutional networks;Graph neural networks;Optimization;Lazy updates", "primary_area": "", "supplementary_material": "", "author": "S Deepak Narayanan;Aditya Sinha;Prateek Jain;Purushottam Kar;SUNDARARAJAN SELLAMANICKAM", "authorids": "~S_Deepak_Narayanan1;~Aditya_Sinha1;~Prateek_Jain1;~Purushottam_Kar1;~SUNDARARAJAN_SELLAMANICKAM2", "gender": ";M;M;M;", "homepage": ";https://adityaasinha28.github.io/;http://prateekjain.org;https://www.cse.iitk.ac.in/users/purushot/;", "dblp": ";;https://dblp.uni-trier.de/pers/j/Jain_0002:Prateek.html;https://dblp.uni-trier.de/pers/hd/k/Kar:Purushottam;", "google_scholar": ";5letoXIAAAAJ;qYhRbJoAAAAJ;;https://scholar.google.co.in/citations?user=JOk66doAAAAJ", "orcid": ";;;0000-0003-2096-5267;", "linkedin": ";adityaasinha28/;;purushottam-kar-74774421/;", "or_profile": "~S_Deepak_Narayanan1;~Aditya_Sinha1;~Prateek_Jain1;~Purushottam_Kar1;~SUNDARARAJAN_SELLAMANICKAM2", "aff": ";Department of Computer Science;Google;Indian Institute of Technology, Kanpur;Microsoft", "aff_domain": ";cs.illinois.edu;google.com;iitk.ac.in;microsoft.com", "position": ";MS student;Researcher;Associate Professor;Principal Researcher", "bibtex": "@inproceedings{\nnarayanan2022iglu,\ntitle={{IGLU}: Efficient {GCN} Training via Lazy Updates},\nauthor={S Deepak Narayanan and Aditya Sinha and Prateek Jain and Purushottam Kar and SUNDARARAJAN SELLAMANICKAM},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5kq11Tl1z4}\n}", "github": "", "project": "", "reviewers": "jiPr;mKbp;tDRJ", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;3;3", "correctness": "4;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;3;3", "wc_summary_paper": "56;54;43", "wc_summary_review": "45;59;35", "wc_main_review": "304;169;111", "wc_review": "405;282;189", "wc_reply_reviewers": "0;31;0", "wc_reply_authors": "1376;1991;227", "reply_reviewers": "0;1;0", "reply_authors": "4;5;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 51.0, 5.715476066494082 ], "wc_summary_review_avg": [ 46.333333333333336, 9.843215373488933 ], "wc_main_review_avg": [ 194.66666666666666, 80.85515169459244 ], "wc_review_avg": [ 292.0, 88.46468221838589 ], "wc_reply_reviewers_avg": [ 10.333333333333334, 14.613540144521982 
], "wc_reply_authors_avg": [ 1198.0, 731.0663444585587 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 4.0, 0.816496580927726 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16548699335588161367&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=5kq11Tl1z4", "email": ";cs.illinois.edu;google.com;iitk.ac.in;microsoft.com", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Unknown Institution;Google;Indian Institute of Technology Kanpur;Microsoft", "aff_unique_dep": "Department of Computer Science;Google;;Microsoft Corporation", "aff_unique_url": ";https://www.google.com;https://www.iitk.ac.in;https://www.microsoft.com", "aff_unique_abbr": ";Google;IIT Kanpur;Microsoft", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Kanpur", "aff_country_unique_index": "1;2;1", "aff_country_unique": ";United States;India" }, { "id": "5n7kJBpTSU4", "title": "ABC: Attention with Bounded-memory Control", "track": "main", "status": "Withdraw", "tldr": "", "abstract": " Transformer architectures have achieved state-of-the-art results on a variety of sequence modeling tasks. However, their attention mechanism comes with a quadratic complexity in sequence lengths, making the computational overhead prohibitive, especially for long sequences. Attention context can be seen as a random-access memory with each token taking a slot. Under this perspective, the memory size grows linearly with the sequence length, and so does the overhead of reading from it. One way to improve the efficiency is to bound the memory size. We show that disparate approaches can be subsumed into one abstraction, attention with bounded-memory control (ABC), and they vary in their organization of the memory. ABC reveals new, unexplored possibilities. First, it connects several efficient attention variants that would otherwise seem apart. Second, this abstraction gives new insights\u2014an established approach (Wang et al., 2020b) previously thought to be not applicable in causal attention, actually is. Last, we present a new instance of ABC, which draws inspiration from existing ABC approaches, but replaces their heuristic memory-organizing functions with a learned, contextualized one. 
Our experiments on language modeling, machine translation, and masked language model finetuning show that our approach outperforms previous efficient attention models; compared to the strong transformer baselines, it significantly improves the inference time and space efficiency with no or negligible accuracy loss.", "keywords": "Attention;transformers;efficiency;machine translation;language modeling", "primary_area": "", "supplementary_material": "/attachment/d44e3d98ca6845e61d1223225ed4f978faff57d5.zip", "author": "Hao Peng;Jungo Kasai;Nikolaos Pappas;Dani Yogatama;Zhaofeng Wu;Lingpeng Kong;Roy Schwartz;Noah Smith", "authorids": "~Hao_Peng4;~Jungo_Kasai1;~Nikolaos_Pappas1;~Dani_Yogatama2;~Zhaofeng_Wu1;~Lingpeng_Kong1;~Roy_Schwartz1;~Noah_Smith1", "gender": "M;M;;M;M;M;;M", "homepage": "https://homes.cs.washington.edu/~jkasai/;http://nik0spapp.github.io/;https://zhaofengwu.github.io/;https://ikekonglp.github.io/;https://schwartz-lab-huji.github.io/;https://homes.cs.washington.edu/~nasmith/;;https://haopeng-nlp.github.io/", "dblp": "205/9020;36/8968-2.html;168/7994.html;144/7656;19/376-1;90/5204.html;08/8178;", "google_scholar": "nHCLoIwAAAAJ;https://scholar.google.ch/citations?user=daiFj_cAAAAJ;53baCywAAAAJ;f1hBi5wAAAAJ;wvfWo9IAAAAJ;https://scholar.google.com/citations?hl=en;;6Y37nm0AAAAJ", "orcid": ";0000-0002-2004-8111;;;;0000-0002-2310-6380;;", "linkedin": ";nik0spapp/;zhaofengwu/;;;;;", "or_profile": "~Jungo_Kasai1;~Nikolaos_Pappas1;~Zhaofeng_Wu1;~Lingpeng_Kong1;~Roy_Schwartz1;~Noah_Smith1;~Dani_Yogatama1;~Hao_Peng1", "aff": "Paul G. Allen School of Computer Science & Engineering, University of Washington;AWS AI Labs;Allen Institute for Artificial Intelligence;Department of Computer Science, The University of Hong Kong;Hebrew University, Hebrew University of Jerusalem;Allen Institute for Artificial Intelligence;Google DeepMind;Department of Computer Science, University of Washington", "aff_domain": "cs.washington.edu;amazon.com;allenai.org;cs.hku.hk;cs.huji.ac.il;allenai.org;google.com;cs.washington.edu", "position": "PhD student;Researcher;Researcher;Assistant Professor;Assistant Professor;Senior Director of NLP Research;Research Scientist;PhD student", "bibtex": "@misc{\npeng2022abc,\ntitle={{ABC}: Attention with Bounded-memory Control},\nauthor={Hao Peng and Jungo Kasai and Nikolaos Pappas and Dani Yogatama and Zhaofeng Wu and Lingpeng Kong and Roy Schwartz and Noah Smith},\nyear={2022},\nurl={https://openreview.net/forum?id=5n7kJBpTSU4}\n}", "github": "", "project": "", "reviewers": "RM28;HjYQ;QDQZ;hwc8", "site": "https://openreview.net/forum?id=5n7kJBpTSU4", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "2;3;4;2", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;4;4", "wc_summary_paper": "129;65;173;55", "wc_summary_review": "40;26;101;16", "wc_main_review": "226;97;314;144", "wc_review": "395;188;588;215", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 105.5, 48.21566135603659 ], "wc_summary_review_avg": [ 45.75, 33.01798752195536 ], "wc_main_review_avg": [ 195.25, 82.65402289060104 ], "wc_review_avg": [ 346.5, 160.53737882499516 ], "wc_reply_reviewers_avg": [ 0, 0 ], 
"wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.4923659639173309, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10127078228209742188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;3;4;2;5;0", "aff_unique_norm": "University of Washington;Amazon;Allen Institute for Artificial Intelligence;University of Hong Kong;Hebrew University of Jerusalem;Google", "aff_unique_dep": "Paul G. Allen School of Computer Science & Engineering;AWS AI Labs;;Department of Computer Science;;Google DeepMind", "aff_unique_url": "https://www.washington.edu;https://aws.amazon.com;https://allenai.org;https://www.hku.hk;https://www.huji.ac.il;https://deepmind.com", "aff_unique_abbr": "UW;AWS;AI2;HKU;HUJI;DeepMind", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Seattle;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;1;2;0;3;0", "aff_country_unique": "United States;China;Israel;United Kingdom" }, { "id": "5o7lEUYRvM", "title": "Function-Space Variational Inference for Deep Bayesian Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian deep learning approaches assume model parameters to be latent random variables and infer posterior predictive distributions to quantify uncertainty, increase safety and trust, and prevent overconfident and unpredictable behavior. However, weight-space priors are model-specific, can be difficult to interpret and hard to choose. Instead of weight-space priors, we leverage function-space variational inference to apply a Dirichlet predictive prior in function space, resulting in a variational Dirichlet posterior which facilitates easier specification of epistemic uncertainty. This is achieved through the perspective of stochastic neural network classifiers as variational implicit processes, which can be trained using function-space variational inference by devising a novel Dirichlet KL estimator. 
Experiments on small- and large-scale image classification tasks demonstrate that our function-space inference scales to large-scale tasks and models, improves adversarial robustness, and boosts uncertainty quantification across models, without influencing the in-distribution performance, architecture, or model size.", "keywords": "Bayesian deep learning;image classification;functional variational inference;Dirichlet distribution", "primary_area": "", "supplementary_material": "/attachment/936d7d24f623774b7e0b4476b16e780ab68f65fd.zip", "author": "Jihao Andreas Lin;Joe Watson;Pascal Klink;Jan Peters", "authorids": "~Jihao_Andreas_Lin1;~Joe_Watson1;~Pascal_Klink2;~Jan_Peters3", "gender": "M;M;M;M", "homepage": "https://jandylin.github.io;http://joemwatson.github.io/;;https://www.jan-peters.net", "dblp": "279/2864;143/2943;;p/JanPeters1", "google_scholar": "Bn1GyeEAAAAJ;https://scholar.google.co.uk/citations?user=xLtXIZAAAAAJ;https://scholar.google.de/citations?user=ZjqU_KwAAAAJ;https://scholar.google.de/citations?user=-kIVAcAAAAAJ", "orcid": ";;;0000-0002-5266-8091", "linkedin": "jihao-andreas-lin/;;;janrpeters/", "or_profile": "~Jihao_Andreas_Lin1;~Joe_Watson1;~Pascal_Klink2;~Jan_Peters3", "aff": "University of Cambridge;TU Darmstadt;TU Darmstadt;TU Darmstadt", "aff_domain": "cam.ac.uk;tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\nlin2022functionspace,\ntitle={Function-Space Variational Inference for Deep Bayesian Classification},\nauthor={Jihao Andreas Lin and Joe Watson and Pascal Klink and Jan Peters},\nyear={2022},\nurl={https://openreview.net/forum?id=5o7lEUYRvM}\n}", "github": "", "project": "", "reviewers": "rF5E;7nPR;why2;FSJj", "site": "https://openreview.net/forum?id=5o7lEUYRvM", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;3", "correctness": "2;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "96;121;97;127", "wc_summary_review": "40;201;37;96", "wc_main_review": "396;730;586;794", "wc_review": "532;1052;720;1017", "wc_reply_reviewers": "0;197;40;283", "wc_reply_authors": "543;612;539;1543", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;3", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 110.25, 13.91716565971678 ], "wc_summary_review_avg": [ 93.5, 66.36452365533863 ], "wc_main_review_avg": [ 626.5, 152.92073109948174 ], "wc_review_avg": [ 830.25, 215.1492214719821 ], "wc_reply_reviewers_avg": [ 130.0, 114.99782606640875 ], "wc_reply_authors_avg": [ 809.25, 424.6235833064386 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W9j5hQVUBPUJ:scholar.google.com/&scioq=Function-Space+Variational+Inference+for+Deep+Bayesian+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Cambridge;Technische Universit\u00e4t Darmstadt", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.tu-darmstadt.de", "aff_unique_abbr": "Cambridge;TU Darmstadt", 
"aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Cambridge;Darmstadt", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;Germany" }, { "id": "5qwA7LLbgP0", "title": "Disentangling Sources of Risk for Distributional Multi-Agent Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In cooperative multi-agent reinforcement learning, state transitions, rewards, and actions can all induce randomness (or uncertainty) in the observed long-term returns. These randomnesses are reflected from two risk sources: (a) agent-wise risk (i.e., how cooperative our teammates act for a given agent) and (b) environment-wise risk (i.e., transition stochasticity). Although these two sources are both important factors for learning robust policies of agents, prior works do not separate them or deal with only a single risk source, which could lead to suboptimal equilibria. In this paper, we propose Disentangled RIsk-sensitive Multi-Agent reinforcement learning (DRIMA), a novel framework being capable of disentangling risk sources. Our main idea is to separate risk-level leverages (i.e., quantiles) in both centralized training and decentralized execution with a hierarchical quantile structure and quantile regression. Our experiments demonstrate that DRIMA significantly outperforms prior-arts across various scenarios in StarCraft Multi-agent Challenge. Notably, DRIMA shows robust performance regardless of reward shaping, exploration schedule, where prior methods learn only a suboptimal policy.", "keywords": "multi-agent reinforcement learning;risk-sensitive reinforcement learning;reinforcement learning;distributional reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/353d35246c658429ca17efb9dd137d16415b96f0.zip", "author": "Kyunghwan Son;Junsu Kim;Yung Yi;Jinwoo Shin", "authorids": "~Kyunghwan_Son1;~Junsu_Kim1;~Yung_Yi1;~Jinwoo_Shin1", "gender": "M;M;M;M", "homepage": "http://lanada.kaist.ac.kr/;https://sites.google.com/view/junsu-kim;;https://sites.google.com/site/mijirim/", "dblp": "206/9135;;01/66;31/7062", "google_scholar": ";1o9cS8UAAAAJ;;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ", "orcid": ";;;", "linkedin": ";junsu-kim-b170b3168/;;", "or_profile": "~Kyunghwan_Son1;~Junsu_Kim1;~Yung_Yi1;~Jinwoo_Shin1", "aff": ";Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": ";kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": ";Ph.D. 
student;Full Professor;Associate Professor", "bibtex": "@misc{\nson2022disentangling,\ntitle={Disentangling Sources of Risk for Distributional Multi-Agent Reinforcement Learning},\nauthor={Kyunghwan Son and Junsu Kim and Yung Yi and Jinwoo Shin},\nyear={2022},\nurl={https://openreview.net/forum?id=5qwA7LLbgP0}\n}", "github": "", "project": "", "reviewers": "LvSz;opi1;DnPu;xk2m;5LTK", "site": "https://openreview.net/forum?id=5qwA7LLbgP0", "pdf_size": 0, "recommendation": "3;3;6;6;6", "confidence": "4;4;4;4;3", "correctness": "2;3;4;3;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;2;3;3;2", "wc_summary_paper": "22;65;68;123;73", "wc_summary_review": "24;38;113;110;66", "wc_main_review": "265;309;439;600;327", "wc_review": "311;412;620;833;466", "wc_reply_reviewers": "20;0;63;280;112", "wc_reply_authors": "1624;2519;1068;2439;1558", "reply_reviewers": "1;0;1;2;1", "reply_authors": "3;4;3;6;3", "recommendation_avg": [ 4.8, 1.469693845669907 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 70.2, 32.09610568277716 ], "wc_summary_review_avg": [ 70.2, 36.34501341312175 ], "wc_main_review_avg": [ 388.0, 120.52883472430986 ], "wc_review_avg": [ 528.4, 182.14126385857764 ], "wc_reply_reviewers_avg": [ 95.0, 100.1878236114549 ], "wc_reply_authors_avg": [ 1841.6, 555.3365105951526 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 3.8, 1.16619037896906 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.7637626158259733, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8857655750744467124&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "5qz8nIzTkml", "title": "$L_q$ regularization for Fairness AI robust to sampling bias", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "It is well recognized that historical biases exist in training data against a certain sensitive group (e.g., non-white, women), which is socially unacceptable, and these unfair biases are inherited by trained AI models. Various learning algorithms have been proposed to remove or alleviate unfair biases in trained AI models. In this paper, we consider another type of bias in training data, so-called sampling bias, in view of fairness AI. Here, sampling bias means that training data do not represent the population of interest well. Sampling bias occurs when special sampling designs (e.g., stratified sampling) are used when collecting training data, or when the population from which training data are collected differs from the population of interest. When sampling bias exists, fair AI models on training data may not be fair in test data. To ensure fairness on test data, we develop computationally efficient learning algorithms robust to sampling bias. In particular, we propose a robust fairness constraint based on the $L_q$ norm, which yields a generic algorithm that can be applied to various fairness AI problems without much difficulty. 
By analyzing multiple benchmark data sets, we show that our proposed robust fairness AI algorithm substantially improves on existing fair AI algorithms in terms of robustness to sampling bias and has significant computational advantages compared to other robust fair AI algorithms.\n", "keywords": "Fairness AI;Sampling bias;Robustness", "primary_area": "", "supplementary_material": "", "author": "Yongdai Kim;Sara Kim;Seonghyeon Kim;Kunwoong Kim", "authorids": "~Yongdai_Kim1;~Sara_Kim1;~Seonghyeon_Kim2;~Kunwoong_Kim1", "gender": "M;F;;M", "homepage": ";https://stat.snu.ac.kr;http://nohomepage.com;", "dblp": "93/734;;;296/1715", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yongdai_Kim1;~Sara_Kim1;~Seonghyeon_Kim2;~Kun_woong_Kim1", "aff": "Seoul National University;;Seoul National univ.;Seoul National University", "aff_domain": "snu.ac.kr;;stat.snu.ac.kr;snu.ac.kr", "position": "Full Professor;;PhD student;PhD student", "bibtex": "@misc{\nkim2022lq,\ntitle={\\$L\\_q\\$ regularization for Fairness {AI} robust to sampling bias},\nauthor={Yongdai Kim and Sara Kim and Seonghyeon Kim and Kunwoong Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=5qz8nIzTkml}\n}", "github": "", "project": "", "reviewers": "x4Wu;1UrS;BgiQ;z8f3", "site": "https://openreview.net/forum?id=5qz8nIzTkml", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;5;4", "correctness": "3;4;2;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "110;56;70;84", "wc_summary_review": "39;36;59;54", "wc_main_review": "636;303;459;266", "wc_review": "785;395;588;404", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.0, 19.949937343260004 ], "wc_summary_review_avg": [ 47.0, 9.72111104761179 ], "wc_main_review_avg": [ 416.0, 146.2173040375181 ], "wc_review_avg": [ 543.0, 159.54153064327795 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BMnU5yZcfWwJ:scholar.google.com/&scioq=%24L_q%24+regularization+for+Fairness+AI+robust+to+sampling+bias&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "5sP_PUUS78v", "title": "SeqPATE: Differentially Private Text Generation via Knowledge Distillation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Protecting the privacy of user data is crucial when training neural text generation models, which may leak sensitive user information during generation. Differentially private (DP) learning algorithms provide guarantees against identifying the existence of a training sample from model outputs. PATE is a DP learning algorithm that is well suited to large models such as GPT. 
In this paper, we propose SeqPATE, which adapts PATE to text generation while satisfying DP. There are two key challenges in adapting PATE to text generation: (i) obtaining sequence-level supervision for text generation, and (ii) reducing the noise required to protect privacy given the large output space (i.e. vocabulary size). For (i), we generate pseudo inputs and reduce the sequence generation problem to next-word prediction. For (ii), we reduce the output space with top-$k$ and top-$p$ selection strategies that dynamically filter the candidate words, and we refine the teacher aggregation mechanism of PATE to avoid the low agreement rates due to voting over the large output space. To limit the privacy loss, we design an efficient knowledge distillation scheme to reduce the time spent distilling from the private data. We apply SeqPATE to a simple text generation task (sentence completion) and achieve 39\\% and 28\\% gains in Bleu4 on two datasets.", "keywords": "Natural Language Generation;Text Generation;Privacy Protection", "primary_area": "", "supplementary_material": "/attachment/77ce990d4a5d6aeeb48bdf78c1536deb72ffcd1f.zip", "author": "Zhiliang Tian;Yingxiu Zhao;Ziyue Huang;Yu-Xiang Wang;Nevin Zhang;He He", "authorids": "~Zhiliang_Tian2;~Yingxiu_Zhao1;~Ziyue_Huang1;~Yu-Xiang_Wang1;~Nevin_Zhang1;~He_He2", "gender": "M;F;M;;M;F", "homepage": "https://scholar.google.com.hk/citations?hl=en&user=ClvGvccAAAAJ#;;;http://www.cs.ucsb.edu/~yuxiangw/publications.html;https://cse.hkust.edu.hk/~lzhang/teach/courses.html;http://hhexiy.github.io", "dblp": "203/9265;;160/3232;62/1637-3.html;https://dblp.uni-trier.de/pid/z/NevinLianwenZhang.html;08/8618-1", "google_scholar": "https://scholar.google.com.hk/citations?hl=en;https://scholar.google.com/citations?hl=en;Eq2jI2gAAAAJ;HGNZ1fkAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-5218-9920;;;;", "linkedin": ";;;;;", "or_profile": "~Zhiliang_Tian2;~Yingxiu_Zhao1;~Ziyue_Huang1;~Yu-Xiang_Wang1;~Nevin_Zhang1;~He_He1", "aff": "Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;UC Santa Barbara;Hong Kong University of Science and Technology;New York University", "aff_domain": "ust.hk;ust.hk;hkust.edu;ucsb.edu;ust.hk;nyu.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\ntian2022seqpate,\ntitle={Seq{PATE}: Differentially Private Text Generation via Knowledge Distillation},\nauthor={Zhiliang Tian and Yingxiu Zhao and Ziyue Huang and Yu-Xiang Wang and Nevin Zhang and He He},\nyear={2022},\nurl={https://openreview.net/forum?id=5sP_PUUS78v}\n}", "github": "", "project": "", "reviewers": "wFLx;CVpe;386M;pnSu", "site": "https://openreview.net/forum?id=5sP_PUUS78v", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;4;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "132;74;154;159", "wc_summary_review": "31;193;39;45", "wc_main_review": "1103;365;342;176", "wc_review": "1266;632;535;380", "wc_reply_reviewers": "0;131;351;0", "wc_reply_authors": "1926;1271;1346;307", "reply_reviewers": "0;1;3;0", "reply_authors": "4;4;6;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 129.75, 33.751851801049376 ], 
"wc_summary_review_avg": [ 77.0, 67.15653356152326 ], "wc_main_review_avg": [ 496.5, 357.6747824490846 ], "wc_review_avg": [ 703.25, 337.10634449680714 ], "wc_reply_reviewers_avg": [ 120.5, 143.42332446293386 ], "wc_reply_authors_avg": [ 1212.5, 581.0027968951613 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 3.75, 1.7853571071357126 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.08084520834544431, "corr_recommendation_correctness": -0.14002800840280097, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3885591952876642497&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0;1;0;2", "aff_unique_norm": "Hong Kong University of Science and Technology;University of California, Santa Barbara;New York University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ust.hk;https://www.ucsb.edu;https://www.nyu.edu", "aff_unique_abbr": "HKUST;UCSB;NYU", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Hong Kong SAR;Santa Barbara;", "aff_country_unique_index": "0;0;0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "5ueTHF0yAlZ", "title": "Improving greedy core-set configurations for active learning with uncertainty-scaled distances", "track": "main", "status": "Reject", "tldr": "", "abstract": "We scale perceived distances of the core-set algorithm by a factor of uncertainty and search for low-confidence configurations, finding significant improvements in sample efficiency across CIFAR10/100 and SVHN image classification, especially in larger acquisition sizes. We show the necessity of our modifications and explain how the improvement is due to a probabilistic quadratic speed-up in the convergence of core-set loss, under assumptions about the relationship of model uncertainty and misclassification.", "keywords": "Active learning", "primary_area": "", "supplementary_material": "/attachment/b7b0169e79611f3fcf63c5f163c3d6b4c00b6da0.zip", "author": "Yuchen Li;Frank Rudzicz", "authorids": "~Yuchen_Li1;~Frank_Rudzicz2", "gender": "M;M", "homepage": ";http://www.cs.toronto.edu/~frank", "dblp": "143/0258;36/6505", "google_scholar": ";https://scholar.google.ca/citations?user=elXOB1sAAAAJ", "orcid": ";0000-0002-1139-3423", "linkedin": ";", "or_profile": "~Yuchen_Li1;~Frank_Rudzicz2", "aff": ";Vector Institute for Artificial Intelligence", "aff_domain": ";vectorinstitute.ai", "position": ";Faculty", "bibtex": "@misc{\nli2022improving,\ntitle={Improving greedy core-set configurations for active learning with uncertainty-scaled distances},\nauthor={Yuchen Li and Frank Rudzicz},\nyear={2022},\nurl={https://openreview.net/forum?id=5ueTHF0yAlZ}\n}", "github": "", "project": "", "reviewers": "4LSV;Y5dH;FMtn;qzud", "site": "https://openreview.net/forum?id=5ueTHF0yAlZ", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "20;106;52;56", "wc_summary_review": "7;58;26;137", "wc_main_review": "111;344;322;306", "wc_review": "138;508;400;499", "wc_reply_reviewers": "0;0;37;0", "wc_reply_authors": "188;60;293;29", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 58.5, 
30.769302884530873 ], "wc_summary_review_avg": [ 57.0, 49.65380146574882 ], "wc_main_review_avg": [ 270.75, 93.21312944000968 ], "wc_review_avg": [ 386.25, 149.4596517458809 ], "wc_reply_reviewers_avg": [ 9.25, 16.021469970012117 ], "wc_reply_authors_avg": [ 142.5, 105.36721501491819 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9169493006161777, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:apMABwLkNfgJ:scholar.google.com/&scioq=Improving+greedy+core-set+configurations+for+active+learning+with+uncertainty-scaled+distances&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Vector Institute for Artificial Intelligence", "aff_unique_dep": "", "aff_unique_url": "https://vectorinstitute.ai/", "aff_unique_abbr": "Vector Institute", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "5vjyt5JHmaU", "title": "Safe Exploration in Linear Equality Constraint", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "With the extensive research and application, some shortcomings of reinforcement learning methods are gradually revealed. One of the considerable problems is that it is difficult for reinforcement learning methods to strictly satisfy the constraints. In this paper, a Singular Value Decomposition-based non-training method called 'Action Decomposition Regular' is proposed to achieve safe exploration. By adopting linear dynamics model, our method decomposes the action space into a constraint dimension and a free dimension for separate control, making policy strictly satisfy the linear equality constraint without limiting the exploration region. In addition, we show how our method should be used when the action space is limited and convex, which makes the method more suitable for real-world scenarios. 
Finally, we show the effectiveness of our method in a physically-based environment and prevail where reward shaping fails.", "keywords": "Reinforcement Learning;safe exploration;Singular Value Decomposition;strictly satisfy constraint;Model-based", "primary_area": "", "supplementary_material": "", "author": "Xiaohu Jia;Zijia Niu;Wang Yao;Jinwei Liu", "authorids": "~Xiaohu_Jia1;~Zijia_Niu1;~Wang_Yao1;~Jinwei_Liu1", "gender": "M;M;M;M", "homepage": ";;http://iai.buaa.edu.cn/info/1013/1223.htm;https://github.com/sddzljw", "dblp": ";;;", "google_scholar": ";;;", "orcid": "0000-0003-2043-1863;0000-0003-3769-2859;;", "linkedin": ";;;", "or_profile": "~Xiaohu_Jia1;~Zijia_Niu1;~Wang_Yao1;~Jinwei_Liu1", "aff": "Beihang University;Beihang University;Beihang University;Beihang University", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "position": "MS student;PhD student;Lecturer;PhD student", "bibtex": "@misc{\njia2022safe,\ntitle={Safe Exploration in Linear Equality Constraint},\nauthor={Xiaohu Jia and Zijia Niu and Wang Yao and Jinwei Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=5vjyt5JHmaU}\n}", "github": "", "project": "", "reviewers": "28nL;NCoG;LMZL;6Uzv", "site": "https://openreview.net/forum?id=5vjyt5JHmaU", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "2;2;3;5", "correctness": "3;3;3;4", "technical_novelty": "3;2;4;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "51;605;63;43", "wc_summary_review": "41;66;36;13", "wc_main_review": "337;824;290;101", "wc_review": "429;1495;389;157", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 1.224744871391589 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 190.5, 239.41752233284848 ], "wc_summary_review_avg": [ 39.0, 18.828170383762732 ], "wc_main_review_avg": [ 388.0, 266.77237488165827 ], "wc_review_avg": [ 617.5, 517.158341323042 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TX1OF4UFdNoJ:scholar.google.com/&scioq=Safe+Exploration+in+Linear+Equality+Constraint&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "", "aff_unique_url": "http://www.buaa.edu.cn/", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "5x7J3WXasqy", "title": "Spatially and Seamlessly Hierarchical Reinforcement Learning for State Space and Policy Space in Autonomous Driving", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite advances in hierarchical reinforcement learning, its applications to path planning in autonomous driving on highways are challenging. 
One reason is that conventional hierarchical reinforcement learning approaches are not amenable to autonomous driving due to the riskiness of driving: the agent must move while avoiding multiple highly unpredictable obstacles, such as other agents, so safe regions are small, scattered, and change over time. To overcome this challenge, we propose a spatially hierarchical reinforcement learning method for state space and policy space. The high-level policy selects not only a behavioral sub-policy but also regions to attend to in state space and to outline in policy space. Subsequently, the low-level policy elaborates the short-term goal position of the agent within the outline of the region selected by the high-level command. The network structure and optimization suggested in our method are as concise as those of single-level methods. Experiments in environments with various road shapes showed that our method finds nearly optimal policies from early episodes, outperforming a baseline hierarchical reinforcement learning method, especially on narrow and complex roads. The resulting trajectories on the roads were similar to those of human strategies at the behavioral planning level.", "keywords": "Hierarchical Reinforcement Learning;Spatial Hierarchy;Autonomous Driving;Path Planning", "primary_area": "", "supplementary_material": "", "author": "Jaehyun Kim;Jaeseung Jeong", "authorids": "~Jaehyun_Kim1;~Jaeseung_Jeong2", "gender": ";", "homepage": ";http://raphe.kaist.ac.kr/people1.htm", "dblp": ";", "google_scholar": ";https://scholar.google.co.kr/citations?user=ZfNxZ3sAAAAJ", "orcid": "0000-0001-7469-9043;", "linkedin": ";", "or_profile": "~Jaehyun_Kim1;~Jaeseung_Jeong2", "aff": "Ajou University;Korea Advanced Institute of Science & Technology", "aff_domain": "ajou.ac.kr;kaist.ac.kr", "position": "Undergrad student;Full Professor", "bibtex": "@misc{\nkim2022spatially,\ntitle={Spatially and Seamlessly Hierarchical Reinforcement Learning for State Space and Policy Space in Autonomous Driving},\nauthor={Jaehyun Kim and Jaeseung Jeong},\nyear={2022},\nurl={https://openreview.net/forum?id=5x7J3WXasqy}\n}", "github": "", "project": "", "reviewers": "BQnq;BBWG;xkgg", "site": "https://openreview.net/forum?id=5x7J3WXasqy", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "4;4;3", "correctness": "2;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "135;83;38", "wc_summary_review": "40;40;11", "wc_main_review": "419;92;78", "wc_review": "594;215;127", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 85.33333333333333, 39.634440691006205 ], "wc_summary_review_avg": [ 30.333333333333332, 13.670731102939918 ], "wc_main_review_avg": [ 196.33333333333334, 157.55281301483922 ], "wc_review_avg": [ 312.0, 202.61457663916153 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:HXqlLjGTafQJ:scholar.google.com/&scioq=Spatially+and+Seamlessly+Hierarchical+Reinforcement+Learning+for+State+Space+and+Policy+Space+in+Autonomous+Driving&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Ajou University;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ajou.ac.kr;https://www.kaist.ac.kr", "aff_unique_abbr": "Ajou;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "BiBERT: Accurate Fully Binarized BERT", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6006", "id": "5xEgrl_5FAJ", "poster": "", "openreview": "https://openreview.net/forum?id=5xEgrl_5FAJ", "slides": "https://iclr.cc/virtual/2022/poster/6006", "video": "https://iclr.cc/virtual/2022/poster/6006", "author_site": "Haotong Qin, Yifu Ding, Mingyuan Zhang, Qinghua YAN, Aishan Liu, Qingqing Dang, Ziwei Liu, Xianglong Liu", "tldr": "", "abstract": "The large pre-trained BERT has achieved remarkable performance on Natural Language Processing (NLP) tasks but is also computation and memory expensive. As one of the powerful compression approaches, binarization extremely reduces the computation and memory consumption by utilizing 1-bit parameters and bitwise operations. Unfortunately, the full binarization of BERT (i.e., 1-bit weight, embedding, and activation) usually suffer a significant performance drop, and there is rare study addressing this problem. In this paper, with the theoretical justification and empirical analysis, we identify that the severe performance drop can be mainly attributed to the information degradation and optimization direction mismatch respectively in the forward and backward propagation, and propose BiBERT, an accurate fully binarized BERT, to eliminate the performance bottlenecks. Specifically, BiBERT introduces an efficient Bi-Attention structure for maximizing representation information statistically and a Direction-Matching Distillation (DMD) scheme to optimize the full binarized BERT accurately. Extensive experiments show that BiBERT outperforms both the straightforward baseline and existing state-of-the-art quantized BERTs with ultra-low bit activations by convincing margins on the NLP benchmark. 
As the first fully binarized BERT, our method yields impressive 56.3 times and 31.2 times saving on FLOPs and model size, demonstrating the vast advantages and potential of the fully binarized BERT model in real-world resource-constrained scenarios.", "keywords": "Network Binarization;Model Compression;BERT;NLP", "primary_area": "", "supplementary_material": "/attachment/251338709ffa2ac9f911a7383a2772d80a7b5b6d.zip", "author": "Haotong Qin;Yifu Ding;Mingyuan Zhang;Qinghua YAN;Aishan Liu;Qingqing Dang;Ziwei Liu;Xianglong Liu", "authorids": "~Haotong_Qin1;~Yifu_Ding2;~Mingyuan_Zhang1;~Qinghua_YAN1;~Aishan_Liu1;dangqingqing@baidu.com;~Ziwei_Liu1;~Xianglong_Liu2", "gender": "M;F;M;F;M;;M;M", "homepage": "https://htqin.github.io/;https://yifu-ding.github.io/;https://mingyuan-zhang.github.io/;https://github.com/yan-qh;https://liuaishan.github.io/;;https://liuziwei7.github.io/;http://www.nlsde.buaa.edu.cn/~xlliu", "dblp": "262/3626.html;;;;177/5658;;05/6300-2;55/7901", "google_scholar": "mK6n-KgAAAAJ;RCEI1r0AAAAJ;2QLD4fAAAAAJ;;88tzr_sAAAAJ;;https://scholar.google.com.hk/citations?user=lc45xlcAAAAJ;https://scholar.google.com.hk/citations?user=8VY7ZDcAAAAJ", "orcid": ";0000-0002-3612-8757;;;;;;", "linkedin": ";yifu-ding-253614186/;;;;;;", "or_profile": "~Haotong_Qin1;~Yifu_Ding2;~Mingyuan_Zhang1;~Qinghua_YAN1;~Aishan_Liu1;dangqingqing@baidu.com;~Ziwei_Liu1;~Xianglong_Liu2", "aff": "Beihang University;Beihang University;Nanyang Technological University;Beihang University;Beihang University;;Nanyang Technological University;Beihang University", "aff_domain": "buaa.edu.cn;buaa.edu.cn;ntu.edu.sg;buaa.edu.cn;buaa.edu.cn;;ntu.edu.sg;buaa.edu.cn", "position": "PhD student;PhD student;PhD student;MS student;Assistant Professor;;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nqin2022bibert,\ntitle={Bi{BERT}: Accurate Fully Binarized {BERT}},\nauthor={Haotong Qin and Yifu Ding and Mingyuan Zhang and Qinghua YAN and Aishan Liu and Qingqing Dang and Ziwei Liu and Xianglong Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=5xEgrl_5FAJ}\n}", "github": "", "project": "", "reviewers": "tpUQ;TtMP;84Py;T1kK;yAoP", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "4;4;3;3;3", "correctness": "3;3;3;4;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "3;2;2;2;3", "wc_summary_paper": "91;160;82;130;97", "wc_summary_review": "21;53;91;40;11", "wc_main_review": "247;388;412;179;146", "wc_review": "359;601;585;349;254", "wc_reply_reviewers": "117;48;81;0;201", "wc_reply_authors": "2400;1309;1352;532;1933", "reply_reviewers": "1;1;1;0;2", "reply_authors": "5;3;3;1;4", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 112.0, 28.962044126753206 ], "wc_summary_review_avg": [ 43.2, 28.00285699709942 ], "wc_main_review_avg": [ 274.4, 107.86769674003426 ], "wc_review_avg": [ 429.6, 138.45085770770797 ], "wc_reply_reviewers_avg": [ 89.4, 67.81327303706848 ], "wc_reply_authors_avg": [ 1505.2, 631.3101931697286 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 3.2, 1.32664991614216 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5833333333333334, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 134, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=5828841794097016283&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=5xEgrl_5FAJ", "email": "buaa.edu.cn;buaa.edu.cn;ntu.edu.sg;buaa.edu.cn;buaa.edu.cn;;ntu.edu.sg;buaa.edu.cn", "author_num": 8, "aff_unique_index": "0;0;1;0;0;1;0", "aff_unique_norm": "Beihang University;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.ntu.edu.sg", "aff_unique_abbr": "BUAA;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;1;0", "aff_country_unique": "China;Singapore" }, { "id": "5y35LXrRMMz", "title": "Exploiting Minimum-Variance Policy Evaluation for Policy Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Off-policy methods are the basis of a large number of effective Policy Optimization (PO) algorithms. In this setting, Importance Sampling (IS) is typically employed as a what-if analysis tool, with the goal of estimating the performance of a target policy, given samples collected with a different behavioral policy. However, in Monte Carlo simulation, IS represents a variance minimization approach. In this field, a suitable behavioral distribution is employed for sampling, allowing diminishing the variance of the estimator below the one achievable when sampling from the target distribution. In this paper, we analyze IS in these two guises, showing the connections between the two objectives. We illustrate that variance minimization can be used as a performance improvement tool, with the advantage, compared with direct off-policy learning, of implicitly enforcing a trust region. We make use of these theoretical findings to build a PO algorithm, Policy Optimization via Optimal Policy Evaluation (PO2PE), that employs variance minimization as an inner loop. 
Finally, we present empirical evaluations on continuous RL benchmarks, with a particular focus on the robustness to small batch sizes.", "keywords": "Reinforcement Learning;Policy Optimization;Importance Sampling;Variance Reduction", "primary_area": "", "supplementary_material": "/attachment/45839cd5a1b4c88a69332c909f58a6cf0e22df97.zip", "author": "Alberto Maria Metelli;Samuele Meta;Marcello Restelli", "authorids": "~Alberto_Maria_Metelli2;~Samuele_Meta1;~Marcello_Restelli1", "gender": "M;;M", "homepage": "https://albertometelli.github.io/;;http://home.deib.polimi.it/restelli/", "dblp": "209/4941;;64/1011", "google_scholar": "R31IsPwAAAAJ;;https://scholar.google.com.tw/citations?user=xdgxRiEAAAAJ", "orcid": "0000-0002-3424-5212;;0000-0002-6322-1076", "linkedin": ";metasamuele/;", "or_profile": "~Alberto_Maria_Metelli2;~Samuele_Meta1;~Marcello_Restelli1", "aff": "Politecnico di Milano;Politecnico di Milano;Politecnico di Milano", "aff_domain": "polimi.it;polimi.it;polimi.it", "position": "Postdoc;MS student;Associate Professor", "bibtex": "@misc{\nmetelli2022exploiting,\ntitle={Exploiting Minimum-Variance Policy Evaluation for Policy Optimization},\nauthor={Alberto Maria Metelli and Samuele Meta and Marcello Restelli},\nyear={2022},\nurl={https://openreview.net/forum?id=5y35LXrRMMz}\n}", "github": "", "project": "", "reviewers": "pKkx;Ruth;P7sd;9jPT", "site": "https://openreview.net/forum?id=5y35LXrRMMz", "pdf_size": 0, "recommendation": "3;5;6;10", "confidence": "4;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "90;71;104;586", "wc_summary_review": "80;113;43;121", "wc_main_review": "790;790;612;500", "wc_review": "960;974;759;1207", "wc_reply_reviewers": "353;405;0;38", "wc_reply_authors": "2032;816;410;147", "reply_reviewers": "1;1;0;1", "reply_authors": "4;2;1;1", "recommendation_avg": [ 6.0, 2.5495097567963922 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 212.75, 215.81401136163518 ], "wc_summary_review_avg": [ 89.25, 30.808886704975237 ], "wc_main_review_avg": [ 673.0, 123.51922927220684 ], "wc_review_avg": [ 975.0, 158.67104335700324 ], "wc_reply_reviewers_avg": [ 199.0, 181.43456120596207 ], "wc_reply_authors_avg": [ 851.25, 722.1638924094724 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.22645540682891915, "corr_recommendation_correctness": 0.7844645405527362, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bYv8fr7crCgJ:scholar.google.com/&scioq=Exploiting+Minimum-Variance+Policy+Evaluation+for+Policy+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "id": "5ziLr3pWz77", "title": "Neural network architectures for disentangling the multimodal structure of data ensembles", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce neural network architectures that model the mechanism that generates data and address the difficult problem of disentangling the multimodal 
structure of data ensembles. We provide (i) an autoencoder-decoder architecture that implements the $M$-mode SVD and (ii) a generalized autoencoder that employs a kernel activation and implements the doubly nonlinear Kernel-MPCA. The neural network projection architecture decomposes unlabeled data given an estimated forward model and a set of observations that constrain the solution set.", "keywords": "autoencoders;SEMs", "primary_area": "", "supplementary_material": "", "author": "M. Alex O. Vasilescu", "authorids": "~M._Alex_O._Vasilescu1", "gender": "F", "homepage": "http://www.cs.ucla.edu/~maov", "dblp": "", "google_scholar": "https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-6581-6930", "linkedin": "", "or_profile": "~M._Alex_O._Vasilescu1", "aff": "University of California, Los Angeles", "aff_domain": "ucla.edu", "position": "Assoc Director Computer Vision and Graphics Lab", "bibtex": "@misc{\nvasilescu2022neural,\ntitle={Neural network architectures for disentangling the multimodal structure of data ensembles},\nauthor={M. Alex O. Vasilescu},\nyear={2022},\nurl={https://openreview.net/forum?id=5ziLr3pWz77}\n}", "github": "", "project": "", "reviewers": "oC53;4HKq;GQkp;Bzxf", "site": "https://openreview.net/forum?id=5ziLr3pWz77", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;3;2;3", "correctness": "3;1;2;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;1;0;0", "wc_summary_paper": "45;43;33;87", "wc_summary_review": "63;19;20;76", "wc_main_review": "275;82;209;183", "wc_review": "383;144;262;346", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 52.0, 20.71231517720798 ], "wc_summary_review_avg": [ 44.5, 25.42144763777232 ], "wc_main_review_avg": [ 187.25, 69.40596155950871 ], "wc_review_avg": [ 283.75, 91.826943213852 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JfkxGY8NdMIJ:scholar.google.com/&scioq=Neural+network+architectures+for+disentangling+the+multimodal+structure+of+data+ensembles&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "6-lLt2zxbZR", "title": "An Application of Pseudo-log-likelihoods to Natural Language Scoring", "track": "main", "status": "Reject", "tldr": "", "abstract": "Language models built using semi-supervised machine learning on large corpora of natural language have very quickly enveloped the fields of natural language generation and understanding.
In this paper we apply a zero-shot approach independently developed by several researchers now gaining recognition as a significant alternative to fine-tuning for evaluation on common sense tasks. A language model with relatively few parameters and training steps (albert-xxlarge-v2) compared to a more recent language model (T5) can outperform it on a recent large data set (TimeDial), while displaying robustness in its performance across a similar class of language tasks. Surprisingly, this result is achieved by using a hyperparameter-free zero-shot method with the smaller model, compared to fine-tuning of the larger model. We argue that the robustness of the smaller model ought to be understood in terms of compositionality, in a sense that we draw from recent literature on a class of similar models. We identify a practical cost for our method and model: high GPU-time for natural language evaluation. The zero-shot measurement technique that produces remarkable stability, both for ALBERT and other BERT variants, is an application of pseudo-log-likelihoods to masked language models for the relative measurement of probability for substitution alternatives in forced choice language tasks such as the Winograd Schema Challenge, Winogrande, CommonsenseQA, and others. One contribution of this paper is to bring together a number of similar, but independent strands of research. We produce some absolute state-of-the-art (SOTA) results for common sense reasoning in binary choice tasks, performing better than any published result in the literature, including fine-tuned efforts. In others our results are SOTA relative to published methods similar to our own \u2013 in some cases by wide margins, but below SOTA absolute for fine-tuned alternatives. In addition, we show a remarkable consistency of the model\u2019s performance under adversarial settings, which we argue is best explained by the model\u2019s compositionality of representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Darren Abramson;Ali Emami", "authorids": "~Darren_Abramson1;~Ali_Emami3", "gender": ";M", "homepage": "https://www.darrenabramson.com;http://cosc.brocku.ca/~aemami/", "dblp": ";75/10772", "google_scholar": "ZVwYzY0AAAAJ;https://scholar.google.ca/citations?user=Pjdq8cUAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Darren_Abramson1;~Ali_Emami3", "aff": "Dalhousie University;Brock University", "aff_domain": "dal.ca;brocku.ca", "position": "Associate Professor;Assistant Professor", "bibtex": "@misc{\nabramson2022an,\ntitle={An Application of Pseudo-log-likelihoods to Natural Language Scoring},\nauthor={Darren Abramson and Ali Emami},\nyear={2022},\nurl={https://openreview.net/forum?id=6-lLt2zxbZR}\n}", "github": "", "project": "", "reviewers": "wa7Y;zjM9;T3k9;X2TK", "site": "https://openreview.net/forum?id=6-lLt2zxbZR", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;4;3;4", "correctness": "1;2;2;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "23;36;71;117", "wc_summary_review": "2;95;35;84", "wc_main_review": "123;461;330;885", "wc_review": "148;592;436;1086", "wc_reply_reviewers": "0;0;0;499", "wc_reply_authors": "47;554;718;872", "reply_reviewers": "0;0;0;2", "reply_authors": "1;1;1;2", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5,
0.8660254037844386 ], "wc_summary_paper_avg": [ 61.75, 36.409991760504425 ], "wc_summary_review_avg": [ 54.0, 37.5699347883384 ], "wc_main_review_avg": [ 449.75, 278.69102515151076 ], "wc_review_avg": [ 565.5, 340.10990870599466 ], "wc_reply_reviewers_avg": [ 124.75, 216.07333824421744 ], "wc_reply_authors_avg": [ 547.75, 310.20668513105903 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7598708898348659652&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Dalhousie University;Brock University", "aff_unique_dep": ";", "aff_unique_url": "https://www.dal.ca;https://www.brocku.ca", "aff_unique_abbr": "Dal;Brock", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "62r41yOG5m", "title": "Inducing Reusable Skills From Demonstrations with Option-Controller Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "Humans can decompose previous experiences into skills and reuse them to enable fast learning in the future. Inspired by this process, we propose a new model called Option-Controller Network (OCN), which is a bi-level recurrent policy network composed of a high-level controller and a pool of low-level options. The options are disconnected from any task-specific information to model task-agnostic skills.\nThe controller uses options to solve a given task, calling one option at a time and waiting until the option returns. With the isolation of information and the synchronous calling mechanism, we can impose a division of work between the controller and the options in an end-to-end training regime. In experiments, we first perform behavior cloning from unstructured demonstrations coming from different tasks. We then freeze the learned options and learn a new controller with an RL algorithm to solve a new task. Extensive results on discrete and continuous environments show that OCN can jointly learn to decompose unstructured demonstrations into skills and model each skill with separate options. The learned options provide a good temporal abstraction, allowing OCN to quickly transfer to tasks with a novel combination of learned skills even with sparse reward, while previous methods either suffer from the delayed reward problem due to the lack of temporal\nabstraction or rely on a complicated option-controlling mechanism that increases the complexity of exploration.", "keywords": "Reusable Skill;Option-Controller Network", "primary_area": "", "supplementary_material": "", "author": "Siyuan Zhou;Yikang Shen;Yuchen Lu;Aaron Courville;Joshua B.
Tenenbaum;Chuang Gan", "authorids": "~Siyuan_Zhou2;~Yikang_Shen1;~Yuchen_Lu1;~Aaron_Courville3;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "gender": ";M;M;;;M", "homepage": "https://scholar.google.com/citations?user=WjUmtm0AAAAJ&hl=zh-CN;;http://jackhaha363.github.io/;;;http://people.csail.mit.edu/ganchuang/", "dblp": ";152/8226;223/4762;56/1688;t/JoshuaBTenenbaum;139/6993", "google_scholar": "WjUmtm0AAAAJ;qff5rRYAAAAJ;https://scholar.google.ca/citations?hl=en;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;;PTeSCbIAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Siyuan_Zhou2;~Yikang_Shen1;~Yuchen_Lu1;~Aaron_Courville3;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "aff": "Peking University;International Business Machines;University of Montreal;Universit\u00e9 de Montr\u00e9al;Massachusetts Institute of Technology;MIT-IBM Watson AI Lab", "aff_domain": "pku.edu.cn;ibm.com;umontreal.ca; ;mit.edu;ibm.com", "position": "Undergrad student;Researcher;PhD student;Assistant Professor;Professor;PhD student", "bibtex": "@misc{\nzhou2022inducing,\ntitle={Inducing Reusable Skills From Demonstrations with Option-Controller Network},\nauthor={Siyuan Zhou and Yikang Shen and Yuchen Lu and Aaron Courville and Joshua B. Tenenbaum and Chuang Gan},\nyear={2022},\nurl={https://openreview.net/forum?id=62r41yOG5m}\n}", "github": "", "project": "", "reviewers": "TfQH;2XVq;RMbd;XgEE", "site": "https://openreview.net/forum?id=62r41yOG5m", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;5;4", "correctness": "3;3;3;3", "technical_novelty": "2;1;3;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "63;75;109;75", "wc_summary_review": "81;42;81;52", "wc_main_review": "1115;383;489;204", "wc_review": "1259;500;679;331", "wc_reply_reviewers": "45;25;22;27", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "1;1;1;1", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.5, 17.168284713389397 ], "wc_summary_review_avg": [ 64.0, 17.363755354185336 ], "wc_main_review_avg": [ 547.75, 342.9762200211554 ], "wc_review_avg": [ 692.25, 349.58645211163434 ], "wc_reply_reviewers_avg": [ 29.75, 8.98262211161084 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CPS3lS5STA8J:scholar.google.com/&scioq=Inducing+Reusable+Skills+From+Demonstrations+with+Option-Controller+Network&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;4", "aff_unique_norm": "Peking University;International Business Machines Corporation;University of Montreal;Universit\u00e9 de Montr\u00e9al;Massachusetts Institute of Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.ibm.com;https://wwwumontreal.ca;https://www.umontreal.ca;https://web.mit.edu", "aff_unique_abbr": "Peking U;IBM;UM;UdeM;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;1;1", "aff_country_unique": "China;United States;Canada" }, { "id": "63PjP_UEKe", "title": "Cross Domain Ensemble Distillation for Domain Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "For domain 
generalization, the task of learning a model that generalizes to unseen target domains utilizing multiple source domains, many approaches explicitly align the distribution of the domains. However, the optimization for domain alignment has a risk of overfitting since the target domain is not available. To address the issue, this paper proposes a method for domain generalization by employing self-distillation. The proposed method aims to train a model robust to domain shift by allowing meaningful erroneous predictions in multiple domains. Specifically, our method matches the ensemble of predictive distributions of data with the same class label but different domains with each predictive distribution. We also propose a de-stylization method that standardizes feature maps of images to help produce consistent predictions. Image classification experiments on two benchmarks demonstrated that the proposed method greatly improves performance in both single-source and multi-source settings. We also show that the proposed method works effectively in person-reID experiments. In all experiments, our method significantly improves the performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kyungmoon Lee;Sungyeon Kim;Suha Kwak", "authorids": "~Kyungmoon_Lee2;~Sungyeon_Kim1;~Suha_Kwak3", "gender": "M;M;M", "homepage": ";https://github.com/tjddus9597;https://suhakwak.github.io/", "dblp": "243/9276;69/8198;65/6173", "google_scholar": "https://scholar.google.co.kr/citations?user=1KZgL5IAAAAJ;https://scholar.google.com/citations?hl=ko;-gscDIEAAAAJ", "orcid": ";;", "linkedin": "kyungmoon-lee-07b747186/;sungyeonkim-b47b0a242/;", "or_profile": "~Kyungmoon_Lee2;~Sungyeon_Kim1;~Suha_Kwak3", "aff": "Nalbi;POSTECH;POSTECH", "aff_domain": "nalbi.ai;postech.ac.kr;postech.ac.kr", "position": "Researcher;PhD student;Associate Professor", "bibtex": "@misc{\nlee2022cross,\ntitle={Cross Domain Ensemble Distillation for Domain Generalization},\nauthor={Kyungmoon Lee and Sungyeon Kim and Suha Kwak},\nyear={2022},\nurl={https://openreview.net/forum?id=63PjP_UEKe}\n}", "github": "", "project": "", "reviewers": "NhFe;ZbEu;TAdx", "site": "https://openreview.net/forum?id=63PjP_UEKe", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;4;4", "correctness": "3;4;3", "technical_novelty": "1;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "151;63;96", "wc_summary_review": "49;49;39", "wc_main_review": "439;284;756", "wc_review": "639;396;891", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 103.33333333333333, 36.298148100909444 ], "wc_summary_review_avg": [ 45.666666666666664, 4.714045207910317 ], "wc_main_review_avg": [ 493.0, 196.4399823525411 ], "wc_review_avg": [ 642.0, 202.09403751719148 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7614016061271891852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, 
"aff_unique_index": "0;1;1", "aff_unique_norm": "Nalbi;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": ";https://www.postech.ac.kr", "aff_unique_abbr": ";POSTECH", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "1;1", "aff_country_unique": ";South Korea" }, { "title": "HyAR: Addressing Discrete-Continuous Action Reinforcement Learning via Hybrid Action Representation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6794", "id": "64trBbOhdGU", "poster": "", "openreview": "https://openreview.net/forum?id=64trBbOhdGU", "slides": "https://iclr.cc/virtual/2022/poster/6794", "video": "https://iclr.cc/virtual/2022/poster/6794", "author_site": "Boyan Li, Hongyao Tang, YAN ZHENG, Jianye HAO, Pengyi Li, Zhen Wang, Zhaopeng Meng, LI Wang", "tldr": "", "abstract": "Discrete-continuous hybrid action space is a natural setting in many practical problems, such as robot control and game AI. However, most previous Reinforcement Learning (RL) works only demonstrate the success in controlling with either discrete or continuous action space, while seldom take into account the hybrid action space. One naive way to address hybrid action RL is to convert the hybrid action space into a unified homogeneous action space by discretization or continualization, so that conventional RL algorithms can be applied. However, this ignores the underlying structure of hybrid action space and also induces the scalability issue and additional approximation difficulties, thus leading to degenerated results. In this paper, we propose Hybrid Action Representation (HyAR) to learn a compact and decodable latent representation space for the original hybrid action space. HyAR constructs the latent space and embeds the dependence between discrete action and continuous parameter via an embedding table and conditional Variantional Auto-Encoder (VAE). To further improve the effectiveness, the action representation is trained to be semantically smooth through unsupervised environmental dynamics prediction. Finally, the agent then learns its policy with conventional DRL algorithms in the learned representation space and interacts with the environment by decoding the hybrid action embeddings to the original action space. We evaluate HyAR in a variety of environments with discrete-continuous action space. 
The results demonstrate the superiority of HyAR when compared with previous baselines, especially for high-dimensional action spaces.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/46f1b346494c57a2ebf9ab7a03b64c208bdc0c18.zip", "author": "Boyan Li;Hongyao Tang;YAN ZHENG;Jianye HAO;Pengyi Li;Zhen Wang;Zhaopeng Meng;LI Wang", "authorids": "~Boyan_Li1;~Hongyao_Tang1;~YAN_ZHENG1;~Jianye_HAO1;~Pengyi_Li1;w-zhen@nwpu.edu.cn;~Zhaopeng_Meng1;~LI_Wang13", "gender": ";M;M;M;M;;;M", "homepage": "http://www.icdai.org/team.html;https://bluecontra.github.io/;https://yanzzzzz.github.io;http://www.icdai.org/jianye.html;https://yeshenpy.github.io/;;http://cic.tju.edu.cn/info/1104/1205.htm;https://cic.tju.edu.cn/info/1072/3162.htm", "dblp": ";220/4275;10/2381-2;21/7664.html;195/6948;;67/8175;", "google_scholar": ";yIqzRH4AAAAJ;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;;", "orcid": ";;;0000-0002-0422-8235;0009-0009-8546-2346;;;", "linkedin": ";;;;;;;", "or_profile": "~Boyan_Li1;~Hongyao_Tang1;~YAN_ZHENG1;~Jianye_HAO1;~Pengyi_Li1;w-zhen@nwpu.edu.cn;~Zhaopeng_Meng1;~LI_Wang13", "aff": "University of tianjin of china;College of Intelligence and Computing, Tianjin University;Tianjin Unibersity, China;Tianjin University;Tianjin University;;Tianjin University;Tianjin University", "aff_domain": "tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;;tju.edu.cn;tju.edu.cn", "position": "MS student;PhD student;Associate Professor;Associate Professor;MS student;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nli2022hyar,\ntitle={Hy{AR}: Addressing Discrete-Continuous Action Reinforcement Learning via Hybrid Action Representation},\nauthor={Boyan Li and Hongyao Tang and YAN ZHENG and Jianye HAO and Pengyi Li and Zhen Wang and Zhaopeng Meng and LI Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=64trBbOhdGU}\n}", "github": "", "project": "", "reviewers": "Am1g;JLuH;TSgb;9YW6", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;1;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "186;111;141;176", "wc_summary_review": "262;15;84;35", "wc_main_review": "721;155;168;118", "wc_review": "1169;281;393;329", "wc_reply_reviewers": "18;0;15;49", "wc_reply_authors": "1892;142;627;501", "reply_reviewers": "1;0;1;1", "reply_authors": "4;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 153.5, 29.685855217594792 ], "wc_summary_review_avg": [ 99.0, 97.39866528859623 ], "wc_main_review_avg": [ 290.5, 249.22529967882474 ], "wc_review_avg": [ 543.0, 363.59867986559027 ], "wc_reply_reviewers_avg": [ 20.5, 17.811513130556875 ], "wc_reply_authors_avg": [ 790.5, 660.378111387711 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3712433755498095777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=64trBbOhdGU", "email": 
"tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;;tju.edu.cn;tju.edu.cn", "author_num": 8, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "University of Tianjin;Tianjin University", "aff_unique_dep": ";College of Intelligence and Computing", "aff_unique_url": "http://www.tju.edu.cn/;http://www.tju.edu.cn", "aff_unique_abbr": "Tianjin University;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "66kgCIYQW3", "title": "Automatic Concept Extraction for Concept Bottleneck-based Video Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent efforts in interpretable deep learning models have shown that concept-based explanation methods achieve competitive accuracy with standard end-to-end models and enable reasoning and intervention about extracted high-level visual concepts from images, e.g., identifying the wing color and beak length for bird-species classification. However, these concept bottleneck models rely on a domain expert providing a necessary and sufficient set of concepts--which is intractable for complex tasks such as video classification. For complex tasks, the labels and the relationship between visual elements span many frames, e.g., identifying a bird flying or catching prey--necessitating concepts with various levels of abstraction. To this end, we present CoDEx, an automatic Concept Discovery and Extraction module that rigorously composes a necessary and sufficient set of concept abstractions for concept-based video classification. CoDEx identifies a rich set of complex concept abstractions from natural language explanations of videos--obviating the need to predefine the amorphous set of concepts. To demonstrate our method\u2019s viability, we construct two new public datasets that combine existing complex video classification datasets with short, crowd-sourced natural language explanations for their labels. Our method elicits inherent complex concept abstractions in natural language to generalize concept-bottleneck methods to complex tasks.", "keywords": "Explainable AI;Video Classification;Concept Extraction", "primary_area": "", "supplementary_material": "/attachment/a8cdbb5282747af74e9a9ce4b0241b7044052442.zip", "author": "Jeya Vikranth Jeyakumar;Luke Dickens;Yu-Hsi Cheng;Joseph Noor;Luis Antonio Garcia;Diego Ramirez Echavarria;Alessandra Russo;Lance M. 
Kaplan;Mani Srivastava", "authorids": "~Jeya_Vikranth_Jeyakumar1;~Luke_Dickens1;~Yu-Hsi_Cheng1;~Joseph_Noor1;~Luis_Antonio_Garcia1;~Diego_Ramirez_Echavarria1;~Alessandra_Russo1;~Lance_M._Kaplan1;~Mani_Srivastava2", "gender": "M;M;;;M;M;F;M;M", "homepage": ";http://www.ucl.ac.uk/dis/people/dickens;;;https://lagarcia.us;https://github.com/drelso;http://www.imperial.ac.uk/people/a.russo/;;https://samueli.ucla.edu/people/mani-srivastava/", "dblp": "216/5565;30/6365;;;;;79/683;47/4107;s/ManiBSrivastava.html", "google_scholar": "cOox9kYAAAAJ;;;2EkZjfgAAAAJ;F6Gzg9gAAAAJ;;https://scholar.google.com.tw/citations?user=_6zceo4AAAAJ;obew8e0AAAAJ;X2Qs7XYAAAAJ", "orcid": ";0000-0003-0896-1407;;;;;0000-0002-3318-8711;0000-0002-3627-4471;0000-0002-3782-9192", "linkedin": "jeyavikranth/;;;;;diego-ramirez-echavarria/;alessandra-russo-422b6219/?originalSubdomain=uk;;msrivastava/", "or_profile": "~Jeya_Vikranth_Jeyakumar1;~Luke_Dickens1;~Yu-Hsi_Cheng1;~Joseph_Noor1;~Luis_Antonio_Garcia1;~Diego_Ramirez_Echavarria1;~Alessandra_Russo1;~Lance_M._Kaplan1;~Mani_Srivastava2", "aff": "University of California, Los Angeles;University College London, University of London;;;University of Southern California Information Sciences Institute;;Imperial College London;US DEVCOM Army Research Laboratory ;University of California, Los Angeles", "aff_domain": "ucla.edu;ucl.ac.uk;;;isi.edu;;imperial.ac.uk;army.mil;ucla.edu", "position": "PhD student;Lecturer;;;Research Lead;;Full Professor;Principal Researcher;Full Professor", "bibtex": "@misc{\njeyakumar2022automatic,\ntitle={Automatic Concept Extraction for Concept Bottleneck-based Video Classification},\nauthor={Jeya Vikranth Jeyakumar and Luke Dickens and Yu-Hsi Cheng and Joseph Noor and Luis Antonio Garcia and Diego Ramirez Echavarria and Alessandra Russo and Lance M. 
Kaplan and Mani Srivastava},\nyear={2022},\nurl={https://openreview.net/forum?id=66kgCIYQW3}\n}", "github": "", "project": "", "reviewers": "LdBL;9TvQ;uphf;zzha", "site": "https://openreview.net/forum?id=66kgCIYQW3", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;3;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "143;46;107;78", "wc_summary_review": "87;104;104;68", "wc_main_review": "260;223;427;200", "wc_review": "490;373;638;346", "wc_reply_reviewers": "38;0;84;0", "wc_reply_authors": "36;36;36;36", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 93.5, 35.80851853958776 ], "wc_summary_review_avg": [ 90.75, 14.85555451674558 ], "wc_main_review_avg": [ 277.5, 88.9283419388892 ], "wc_review_avg": [ 461.75, 115.25704967593089 ], "wc_reply_reviewers_avg": [ 30.5, 34.565155865408734 ], "wc_reply_authors_avg": [ 36.0, 0.0 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5590078037382210986&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;4;0", "aff_unique_norm": "University of California, Los Angeles;University College London;University of Southern California;Imperial College London;US Army Research Laboratory", "aff_unique_dep": ";;Information Sciences Institute;;DEVCOM", "aff_unique_url": "https://www.ucla.edu;https://www.ucl.ac.uk;https://isi.usc.edu;https://www.imperial.ac.uk;https://www.arl.army.mil", "aff_unique_abbr": "UCLA;UCL;USC ISI;ICL;ARL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "66miN107dRS", "title": "Contrastive Attraction and Contrastive Repulsion for Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contrastive learning (CL) methods effectively learn data representations without label supervision, where the encoder needs to contrast each positive sample over multiple negative samples via a one-vs-many softmax cross-entropy loss. By leveraging large amounts of unlabeled image data, recent CL methods have achieved promising results when pretrained on ImageNet, a well-curated dataset with balanced image classes. However, they tend to yield worse performance when pretrained on images in the wild. In this paper, to further improve the performance of CL and enhance its robustness on uncurated datasets, we propose a doubly CL strategy that contrasts positive samples and negative ones within themselves separately. We realize this strategy with contrastive attraction and contrastive repulsion (CACR), which makes the query not only exert a greater force to attract more distant positive samples but also do so to repel closer negative samples. Theoretical analysis reveals that CACR generalizes CL's behavior by positive attraction and negative repulsion. 
It further considers the intra-contrastive relation within the positive and negative pairs to narrow the gap between the sampled and true distribution, which is important when datasets are less curated. Extensive large-scale experiments on standard vision tasks show that CACR not only consistently outperforms existing CL methods on benchmark datasets in representation learning, but also shows better robustness when generalized to pretraining on large image datasets in the wild.", "keywords": "contrastive learning;Bayesian methods;conditional distribution;label imbalance;doubly contrastive", "primary_area": "", "supplementary_material": "/attachment/62461a9a45e9622dc7e6e7b49fe8b945f0d5fdcf.zip", "author": "Huangjie Zheng;Xu Chen;Jiangchao Yao;Hongxia Yang;Chunyuan Li;Ya Zhang;Hao Zhang;Ivor Tsang;Jingren Zhou;Mingyuan Zhou", "authorids": "~Huangjie_Zheng1;~Xu_Chen2;~Jiangchao_Yao1;~Hongxia_Yang2;~Chunyuan_Li1;~Ya_Zhang1;~Hao_Zhang1;~Ivor_Tsang1;~Jingren_Zhou2;~Mingyuan_Zhou1", "gender": "M;M;M;F;;F;M;M;M;M", "homepage": ";https://xuchensjtu.github.io/xuchen.github.io/;https://sunarker.github.io/;https://www4.comp.polyu.edu.hk/~hongxyang/;http://chunyuan.li/;https://annzhanglion.github.io/;https://haozhangxidian.github.io/;http://mingyuanzhou.github.io;https://www.a-star.edu.sg/cfar/about-cfar/management/prof-ivor-tsang;", "dblp": "192/2170;xxxxxx;166/5900;;64/9590;85/3714-2;55/2270-50;;35/5873;84/2644", "google_scholar": "Vl5wCXsAAAAJ;6Qa2JCwAAAAJ;w8oDh9QAAAAJ;iJlC5mMAAAAJ;Zd7WmXUAAAAJ;pbjw9sMAAAAJ;Eo8e5icAAAAJ;LXwCIisAAAAJ;rJMOlVsAAAAJ;", "orcid": "0000-0003-0508-5034;0000-0001-5299-7074;;;;0000-0002-5390-9053;;;;", "linkedin": ";;;;;;;;;", "or_profile": "~Huangjie_Zheng1;~Xu_Chen2;~Jiangchao_Yao1;~Hongxia_Yang2;~Chunyuan_Li1;~Ya_Zhang1;~Hao_Zhang1;~Mingyuan_Zhou1;~Ivor_W_Tsang1;~Jingren_Zhou1", "aff": "University of Texas, Austin;Alibaba Group;Alibaba Group;Alibaba Group;Microsoft Research;Shanghai Jiaotong University;Cornell University;The University of Texas at Austin;University of Technology Sydney;Alibaba Group", "aff_domain": "utexas.edu;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;microsoft.com;sjtu.edu.cn;med.cornell.edu;utexas.edu;uts.edu.au;alibaba-inc.com", "position": "PhD student;Researcher;Researcher;Principal Researcher;Principal Researcher;Professor;Postdoc;Associate Professor;Full Professor;Researcher", "bibtex": "@misc{\nzheng2022contrastive,\ntitle={Contrastive Attraction and Contrastive Repulsion for Representation Learning},\nauthor={Huangjie Zheng and Xu Chen and Jiangchao Yao and Hongxia Yang and Chunyuan Li and Ya Zhang and Hao Zhang and Ivor Tsang and Jingren Zhou and Mingyuan Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=66miN107dRS}\n}", "github": "", "project": "", "reviewers": "MXU1;qbmC;edUG;WK7k", "site": "https://openreview.net/forum?id=66miN107dRS", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;5;4;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "61;105;75;50", "wc_summary_review": "28;50;83;30", "wc_main_review": "80;100;305;116", "wc_review": "169;255;463;196", "wc_reply_reviewers": "0;12;44;37", "wc_reply_authors": "286;182;306;106", "reply_reviewers": "0;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 72.75,
20.620075169601105 ], "wc_summary_review_avg": [ 47.75, 22.094965489902897 ], "wc_main_review_avg": [ 150.25, 90.25069251811867 ], "wc_review_avg": [ 270.75, 115.27006332955665 ], "wc_reply_reviewers_avg": [ 23.25, 17.93564885918544 ], "wc_reply_authors_avg": [ 220.0, 80.91971329657564 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": -0.14002800840280097, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14291759390905652286&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1;2;3;4;0;5;1", "aff_unique_norm": "University of Texas at Austin;Alibaba Group;Microsoft;Shanghai Jiao Tong University;Cornell University;University of Technology Sydney", "aff_unique_dep": ";;Microsoft Research;;;", "aff_unique_url": "https://www.utexas.edu;https://www.alibaba.com;https://www.microsoft.com/en-us/research;https://www.sjtu.edu.cn;https://www.cornell.edu;https://www.uts.edu.au", "aff_unique_abbr": "UT Austin;Alibaba;MSR;SJTU;Cornell;UTS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;1;1;0;1;0;0;2;1", "aff_country_unique": "United States;China;Australia" }, { "id": "67T66kchK_7", "title": "SPLID: Self-Imitation Policy Learning through Iterative Distillation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Goal-conditioned continuous control tasks remain challenging due to sparse reward signals. To address this issue, many relabelling methods such as Hindsight Experience Replay have been developed and bring significant improvements. Though relabelling methods provide an alternative to expert demonstrations, the majority of the relabelled data are not optimal. If we can improve the quality of the relabelled data, the sample efficiency, as well as the agent performance, should improve. To this end, we propose a novel meta-algorithm, Self-Imitation Policy Learning through Iterative Distillation (SPLID), which relies on the concept of a $\\delta$-distilled policy to iteratively level up the quality of the target data, which the agent then mimics. Under certain assumptions, we show that SPLID has good theoretical properties of performance improvement and a local convergence guarantee. Specifically, in the deterministic environment, we develop a practical implementation of SPLID, which imposes the $\\delta$-distilled policy by discriminating First Hit Time (FHT).
Experiments show that SPLID outperforms previous goal-conditioned RL methods by a substantial margin.", "keywords": "Multi-goal RL;RL with sparse reward.", "primary_area": "", "supplementary_material": "/attachment/566ee71028ce5033be0a6c6e4e581a8c03ff6d1d.zip", "author": "Zhihan Liu;Hao Sun;Bolei Zhou", "authorids": "~Zhihan_Liu1;~Hao_Sun3;~Bolei_Zhou5", "gender": "M;M;M", "homepage": ";https://boleizhou.github.io/;https://holarissun.github.io", "dblp": ";46/8066;SunLLZL19", "google_scholar": "0VVg_R4AAAAJ;9D4aG8AAAAAJ;7ZNoHJkAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhihan_Liu1;~Bolei_Zhou5;~Hao_Sun1", "aff": "University of Science and Technology of China;University of California, Los Angeles;University of Cambridge", "aff_domain": "ustc.edu.cn;ucla.edu;cam.ac.uk", "position": "Undergrad student;Assistant Professor;PhD student", "bibtex": "@misc{\nliu2022splid,\ntitle={{SPLID}: Self-Imitation Policy Learning through Iterative Distillation},\nauthor={Zhihan Liu and Hao Sun and Bolei Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=67T66kchK_7}\n}", "github": "", "project": "", "reviewers": "xVN8;FXhj;Uax1;JnUJ", "site": "https://openreview.net/forum?id=67T66kchK_7", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "4;4;3;3", "correctness": "3;2;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "120;64;70;126", "wc_summary_review": "52;32;36;75", "wc_main_review": "360;141;169;264", "wc_review": "532;237;275;465", "wc_reply_reviewers": "107;86;0;0", "wc_reply_authors": "669;435;298;72", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.0, 28.160255680657446 ], "wc_summary_review_avg": [ 48.75, 16.90229274388537 ], "wc_main_review_avg": [ 233.5, 86.09442490661053 ], "wc_review_avg": [ 377.25, 124.27062203111402 ], "wc_reply_reviewers_avg": [ 48.25, 48.81790142970097 ], "wc_reply_authors_avg": [ 368.5, 216.56696423970115 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Qauj7CZ8XI4J:scholar.google.com/&scioq=SPLID:+Self-Imitation+Policy+Learning+through+Iterative+Distillation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Science and Technology of China;University of California, Los Angeles;University of Cambridge", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ucla.edu;https://www.cam.ac.uk", "aff_unique_abbr": "USTC;UCLA;Cambridge", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Angeles;Cambridge", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;United States;United Kingdom" }, { "title": "Offline Reinforcement Learning with Implicit Q-Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5941", "id": "68n2s9ZJWF8", "poster": "", "openreview": "https://openreview.net/forum?id=68n2s9ZJWF8", "slides": "https://iclr.cc/virtual/2022/poster/5941", "video": "https://iclr.cc/virtual/2022/poster/5941", "author_site": "Ilya
Kostrikov, Ashvin Nair, Sergey Levine", "tldr": "", "abstract": "Offline reinforcement learning requires reconciling two conflicting aims: learning a policy that improves over the behavior policy that collected the dataset, while at the same time minimizing the deviation from the behavior policy so as to avoid errors due to distributional shift. This tradeoff is critical, because most current offline reinforcement learning methods need to query the value of unseen actions during training to improve the policy, and therefore need to either constrain these actions to be in-distribution, or else regularize their values. We propose a new offline RL method that never needs to evaluate actions outside of the dataset, but still enables the learned policy to improve substantially over the best behavior in the data through generalization. The main insight in our work is that, instead of evaluating unseen actions from the latest policy, we can approximate the policy improvement step implicitly by treating the state value function as a random variable, with randomness determined by the action (while still integrating over the dynamics to avoid excessive optimism), and then taking a state conditional upper expectile of this random variable to estimate the value of the best actions in that state. This leverages the generalization capacity of the function approximator to estimate the value of the best available action at a given state without ever directly querying a Q-function with this unseen action. Our algorithm alternates between fitting this upper expectile value function and backing it up into a Q-function, without any explicit policy. Then, we extract the policy via advantage-weighted behavioral cloning, which also avoids querying out-of-sample actions. We dub our method Implicit Q-learning (IQL). IQL is easy to implement, computationally efficient, and only requires fitting an additional critic with an asymmetric L2 loss. IQL demonstrates the state-of-the-art performance on D4RL, a standard benchmark for offline reinforcement learning. 
We also demonstrate that IQL achieves strong performance when fine-tuning using online interaction after offline initialization.", "keywords": "Deep Reinforcement Learning;Offline Reinforcement Learning;Batch Reinforcement Learning;Continuous Control", "primary_area": "", "supplementary_material": "/attachment/552bdbad7a77b911d3142c29adfb9c9c08a5f121.zip", "author": "Ilya Kostrikov;Ashvin Nair;Sergey Levine", "authorids": "~Ilya_Kostrikov1;~Ashvin_Nair1;~Sergey_Levine1", "gender": "M;M;M", "homepage": ";http://ashvin.me/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "https://dblp.org/pers/k/Kostrikov:Ilya.html;182/2436;80/7594", "google_scholar": "PTS2AOgAAAAJ;BsOkXDsAAAAJ;8R35rCwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ilya_Kostrikov1;~Ashvin_Nair1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "berkeley.edu;berkeley.edu;google.com", "position": "Postdoc;PhD student;Research Scientist", "bibtex": "@inproceedings{\nkostrikov2022offline,\ntitle={Offline Reinforcement Learning with Implicit Q-Learning},\nauthor={Ilya Kostrikov and Ashvin Nair and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=68n2s9ZJWF8}\n}", "github": "", "project": "", "reviewers": "SYQw;eQvF;DeQA;Sf4r", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;5;2;4", "correctness": "4;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "62;67;42;134", "wc_summary_review": "16;43;37;32", "wc_main_review": "311;507;165;251", "wc_review": "389;617;244;417", "wc_reply_reviewers": "439;349;66;0", "wc_reply_authors": "1816;1672;589;0", "reply_reviewers": "1;2;1;0", "reply_authors": "4;5;3;0", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 76.25, 34.629286738250904 ], "wc_summary_review_avg": [ 32.0, 10.024968827881711 ], "wc_main_review_avg": [ 308.5, 125.80441168734903 ], "wc_review_avg": [ 416.75, 132.95934529020516 ], "wc_reply_reviewers_avg": [ 213.5, 184.76268562672496 ], "wc_reply_authors_avg": [ 1019.25, 755.7907696578465 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.0, 1.8708286933869707 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.18731716231633877, "corr_recommendation_correctness": 0.0, "gs_citation": 1021, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13007981603513280935&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=68n2s9ZJWF8", "email": "berkeley.edu;berkeley.edu;google.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "6A7zcZ43m1S", "title": "R-GSN: The Relation-based Graph Similar Network for Heterogeneous Graph", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Heterogeneous graphs are a kind of data structure widely found in real life.
Nowadays, research on graph neural networks for heterogeneous graphs has become more and more popular. Existing heterogeneous graph neural network algorithms mainly follow two ideas: one is based on meta-paths and the other is not. Meta-path-based approaches often require a lot of manual preprocessing and are difficult to extend to large-scale graphs. In this paper, we propose a general heterogeneous message passing paradigm and design R-GSN, which does not need meta-paths and is much improved compared to the baseline R-GCN. Experiments have shown that our R-GSN algorithm achieves state-of-the-art performance on the ogbn-mag large-scale heterogeneous graph dataset.", "keywords": "Heterogeneous graph;graph neural network.", "primary_area": "", "supplementary_material": "", "author": "Xinliang Wu;Guizhong Liu;Mengying Jiang", "authorids": "~Xinliang_Wu1;~Guizhong_Liu1;~Mengying_Jiang2", "gender": "M;M;F", "homepage": ";https://gr.xjtu.edu.cn/web/liugz;", "dblp": "00/4153.html;;206/6312.html", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xinliang_Wu1;~Guizhong_Liu1;~Mengying_Jiang2", "aff": ";Xi'an Jiaotong University;Xi'an Jiaotong University", "aff_domain": ";xjtu.edu.cn;xjtu.edu", "position": ";Full Professor;PhD student", "bibtex": "@misc{\nwu2022rgsn,\ntitle={R-{GSN}: The Relation-based Graph Similar Network for Heterogeneous Graph},\nauthor={Xinliang Wu and Guizhong Liu and Mengying Jiang},\nyear={2022},\nurl={https://openreview.net/forum?id=6A7zcZ43m1S}\n}", "github": "", "project": "", "reviewers": "tkMz;XBpr;LRkv", "site": "https://openreview.net/forum?id=6A7zcZ43m1S", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;3;4", "correctness": "2;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "43;60;51", "wc_summary_review": "46;13;39", "wc_main_review": "190;329;224", "wc_review": "279;402;314", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 51.333333333333336, 6.944222218666553 ], "wc_summary_review_avg": [ 32.666666666666664, 14.1970262926979 ], "wc_main_review_avg": [ 247.66666666666666, 59.162675921751735 ], "wc_review_avg": [ 331.6666666666667, 51.74510175422941 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7033381672493514137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Xi'an Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "6CrZzjpjWdk", "title": "KG-FiD: Infusing Knowledge Graph in Fusion-in-Decoder for Open-Domain Question Answering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The current Open-Domain Question Answering (ODQA) paradigm
typically contains a retrieving module and a reading module. Given an input question, the reading module predicts the answer from the relevant passages retrieved by the retriever. The recently proposed Fusion-in-Decoder (FiD), which is built on top of the pretrained generative model T5, achieves state-of-the-art performance in the reading module. Although effective, it remains constrained by inefficient attention over all retrieved passages, which contain a lot of noise. In this work, we propose a novel method, KG-FiD, which filters noisy passages by leveraging the structural relationships among the retrieved passages with a knowledge graph. We initialize the passage node embeddings from the FiD encoder and then use a graph neural network (GNN) to update the representations for reranking. To improve efficiency, we build the GNN on top of the intermediate layer output of the FiD encoder and only pass a few top-reranked passages into the higher layers of the encoder and decoder for answer generation. We also apply the proposed GNN-based reranking method to enhance the passage retrieval results in the retrieving module. Extensive experiments on common ODQA benchmark datasets (Natural Questions and TriviaQA) demonstrate that KG-FiD can improve vanilla FiD by up to 1.5% on answer exact match score and achieve comparable performance with FiD at only 40% of the computation cost.", "keywords": "open-domain question answering;knowledge graph;pre-trained language model", "primary_area": "", "supplementary_material": "", "author": "Donghan Yu;Chenguang Zhu;Yuwei Fang;Wenhao Yu;Shuohang Wang;Yichong Xu;Xiang Ren;Yiming Yang;Michael Zeng", "authorids": "~Donghan_Yu2;~Chenguang_Zhu1;~Yuwei_Fang1;~Wenhao_Yu2;~Shuohang_Wang1;~Yichong_Xu1;~Xiang_Ren1;~Yiming_Yang1;~Michael_Zeng1", "gender": "M;M;M;M;M;M;M;F;M", "homepage": ";;https://yuwfan.github.io/;https://wyu97.github.io/;;http://xycking.wixsite.com/yichongxu;https://shanzhenren.github.io/;http://www.cs.cmu.edu/~yiming/;https://www.microsoft.com/en-us/research/people/nzeng/", "dblp": "204/0106;48/7536-1.html;227/2871.html;159/8117-2.html;173/5469.html;154/6421;36/360-1;25/1666;232/1866-1.html", "google_scholar": "KlwvYcEAAAAJ;1b2kKWoAAAAJ;Om_-hHsAAAAJ;z4qSdX8AAAAJ;mN-IO6wAAAAJ;sYza2XwAAAAJ;_moJlrIAAAAJ;MlZq4XwAAAAJ;", "orcid": ";;;0000-0002-4075-5980;;;;0000-0001-8322-607X;", "linkedin": ";;yuwei-fang-79220192/;;;;xren7;yiming-yang-24100924/;michaelnanshanzeng/", "or_profile": "~Donghan_Yu2;~Chenguang_Zhu1;~Yuwei_Fang1;~Wenhao_Yu2;~Shuohang_Wang1;~Yichong_Xu1;~Xiang_Ren1;~Yiming_Yang1;~Michael_Zeng1", "aff": "Carnegie Mellon University;Zoom;Microsoft;University of Notre Dame;Microsoft;Microsoft;University of Southern California;School of Computer Science, Carnegie Mellon University;Microsoft", "aff_domain": "cmu.edu;zoom.us;microsoft.com;nd.edu;microsoft.com;microsoft.com;usc.edu;cs.cmu.edu;microsoft.com", "position": "PhD student;Principal Researcher;Senior Applied Scientist;PhD student;Researcher;Senior Researcher;Associate Professor;Full Professor;Partner Research Manager", "bibtex": "@misc{\nyu2022kgfid,\ntitle={{KG}-FiD: Infusing Knowledge Graph in Fusion-in-Decoder for Open-Domain Question Answering},\nauthor={Donghan Yu and Chenguang Zhu and Yuwei Fang and Wenhao Yu and Shuohang Wang and Yichong Xu and Xiang Ren and Yiming Yang and Michael Zeng},\nyear={2022},\nurl={https://openreview.net/forum?id=6CrZzjpjWdk}\n}", "github": "", "project": "", "reviewers": "YjZ7;DdPW;8H62;mXHm", "site": "https://openreview.net/forum?id=6CrZzjpjWdk", "pdf_size":
0, "recommendation": "3;5;5;6", "confidence": "5;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "397;255;249;292", "wc_summary_review": "29;80;57;68", "wc_main_review": "373;414;496;277", "wc_review": "799;749;802;637", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 298.25, 59.343807596075266 ], "wc_summary_review_avg": [ 58.5, 18.874586088176873 ], "wc_main_review_avg": [ 390.0, 78.85112554681766 ], "wc_review_avg": [ 746.75, 66.7696600260927 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 130, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2492411683198541373&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3;2;2;4;0;2", "aff_unique_norm": "Carnegie Mellon University;Zoom Video Communications Inc.;Microsoft;University of Notre Dame;University of Southern California", "aff_unique_dep": ";;Microsoft Corporation;;", "aff_unique_url": "https://www.cmu.edu;https://zoom.us;https://www.microsoft.com;https://www.nd.edu;https://www.usc.edu", "aff_unique_abbr": "CMU;Zoom;Microsoft;Notre Dame;USC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Angeles;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "6Dz7RiRiMFd", "title": "3D-Transformer: Molecular Representation with Transformer in 3D Space", "track": "main", "status": "Reject", "tldr": "", "abstract": "Spatial structures in the 3D space are important to determine molecular properties. Recent papers use geometric deep learning to represent molecules and predict properties. These papers, however, are computationally expensive in capturing long-range dependencies of input atoms; and more importantly, they have not considered the non-uniformity of interatomic distances, thus failing to learn context-dependent representations at different scales. To deal with such issues, we introduce 3D-Transformer, a variant of the Transformer for molecular representations that incorporates 3D spatial information. 3D-Transformer operates on a fully-connected graph with direct connections between atoms. To cope with the non-uniformity of interatomic distances, we develop a multi-scale self-attention module that exploits local fine-grained patterns with increasing contextual scales. As molecules of different sizes rely on different kinds of spatial features, we design an adaptive position encoding module that adopts different position encoding methods for small and large molecules. Finally, to attain the molecular representation from atom embeddings, we propose an attentive farthest point sampling algorithm that selects a portion of atoms with the assistance of attention scores, overcoming handicaps of the virtual node and previous distance-dominant downsampling methods. 
We validate 3D-Transformer across three important scientific domains: quantum chemistry, materials science, and proteomics. Our experiments show significant improvements over state-of-the-art models on the crystal property prediction task and the protein-ligand binding affinity prediction task, and show better or competitive performance on quantum chemistry molecular datasets. This work provides clear evidence that biochemical tasks can gain consistent benefits from 3D molecular representations and that different tasks require different position encoding methods.", "keywords": "structural biology;self-attention;transformer;proteins;small molecules;crystals;geometric deep learning", "primary_area": "", "supplementary_material": "", "author": "Fang Wu;Qiang Zhang;Dragomir Radev;Jiyu Cui;Wen Zhang;Huabin Xing;Ningyu Zhang;Huajun Chen", "authorids": "~Fang_Wu1;~Qiang_Zhang6;~Dragomir_Radev2;~Jiyu_Cui1;~Wen_Zhang4;xinghb@zju.edu.cn;~Ningyu_Zhang1;~Huajun_Chen1", "gender": ";M;;M;;;M;M", "homepage": ";https://qiangairesearcher.github.io;;https://www.researchgate.net/profile/Jiyu_Cui3;https://person.zju.edu.cn/en/wenzhang;;https://person.zju.edu.cn/en/ningyu;", "dblp": ";72/3527-26;;;43/2368-15;;139/4181-1.html;94/5089", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;;Ig9ho4kAAAAJ;;xQDOPvsAAAAJ;", "orcid": ";;;;;;0000-0002-1970-0678;", "linkedin": ";;;;;;ningyuzhang/;", "or_profile": "~Fang_Wu1;~Qiang_Zhang6;~Dragomir_Radev2;~Jiyu_Cui1;~Wen_Zhang4;xinghb@zju.edu.cn;~Ningyu_Zhang1;~Huajun_Chen1", "aff": ";Zhejiang University;;Zhejiang University;Zhejiang University;;Zhejiang University;Zhejiang University", "aff_domain": ";zju.edu.cn;;zju.edu.cn;zju.edu.cn;;zju.edu.cn;zju.edu.cn", "position": ";Principal Researcher;;PhD student;Assistant Professor;;Associate Professor;Full Professor", "bibtex": "@misc{\nwu2022dtransformer,\ntitle={3D-Transformer: Molecular Representation with Transformer in 3D Space },\nauthor={Fang Wu and Qiang Zhang and Dragomir Radev and Jiyu Cui and Wen Zhang and Huabin Xing and Ningyu Zhang and Huajun Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=6Dz7RiRiMFd}\n}", "github": "", "project": "", "reviewers": "iK2o;2LpV;E2hi;aCJM", "site": "https://openreview.net/forum?id=6Dz7RiRiMFd", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;2;4", "correctness": "3;2;3;2", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "47;136;41;52", "wc_summary_review": "26;19;60;47", "wc_main_review": "217;301;283;523", "wc_review": "290;456;384;622", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "609;423;504;609", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.0, 38.878014352587506 ], "wc_summary_review_avg": [ 38.0, 16.355427233796124 ], "wc_main_review_avg": [ 331.0, 115.17812292271479 ], "wc_review_avg": [ 438.0, 121.4495780149112 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 536.25, 78.18367796413776 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1693650108792909492&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total":
0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Understanding Intrinsic Robustness Using Label Uncertainty", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6167", "id": "6ET9SzlgNX", "poster": "", "openreview": "https://openreview.net/forum?id=6ET9SzlgNX", "slides": "https://iclr.cc/virtual/2022/poster/6167", "video": "https://iclr.cc/virtual/2022/poster/6167", "author_site": "Xiao Zhang, David Evans", "tldr": "", "abstract": "A fundamental question in adversarial machine learning is whether a robust classifier exists for a given task. A line of research has made some progress towards this goal by studying the concentration of measure, but we argue standard concentration fails to fully characterize the intrinsic robustness of a classification problem since it ignores data labels which are essential to any classification task. Building on a novel definition of label uncertainty, we empirically demonstrate that error regions induced by state-of-the-art models tend to have much higher label uncertainty than randomly-selected subsets. This observation motivates us to adapt a concentration estimation algorithm to account for label uncertainty, resulting in more accurate intrinsic robustness measures for benchmark image classification problems.", "keywords": "Concentration of Measure;Intrinsic Adversarial Robustness;Label Uncertainty", "primary_area": "", "supplementary_material": "", "author": "Xiao Zhang;David Evans", "authorids": "~Xiao_Zhang2;~David_Evans1", "gender": "M;Not Specified", "homepage": "https://xiao-zhang.net;https://www.cs.virginia.edu/evans/", "dblp": ";https://dblp.uni-trier.de/pid/e/DavidEvans", "google_scholar": "L-lz7CUAAAAJ;DsR4PucAAAAJ", "orcid": "0009-0008-1837-7670;", "linkedin": ";", "or_profile": "~Xiao_Zhang2;~David_Evans1", "aff": "University of Virginia;University of Virginia", "aff_domain": "cs.virginia.edu;virginia.edu", "position": "PhD student;Professor", "bibtex": "@inproceedings{\nzhang2022understanding,\ntitle={Understanding Intrinsic Robustness Using Label Uncertainty},\nauthor={Xiao Zhang and David Evans},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6ET9SzlgNX}\n}", "github": "", "project": "", "reviewers": "jR8K;zzwx;rWcj;HNrN", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;4;2", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "59;111;121;48", "wc_summary_review": "44;41;73;15", "wc_main_review": "149;541;225;33", "wc_review": "252;693;419;96", "wc_reply_reviewers": "31;0;0;0", "wc_reply_authors": "375;1739;223;0", "reply_reviewers": "1;0;0;0", "reply_authors": "1;3;1;0", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.75, 31.68891762114951 ], "wc_summary_review_avg": [ 43.25, 20.54720175595694 ], "wc_main_review_avg": [ 237.0, 188.36135484753765 ], "wc_review_avg": [ 365.0, 221.15040131096302 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 584.25, 
679.9049106308911 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 1.0897247358851685 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7215793248994812724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=6ET9SzlgNX", "email": "cs.virginia.edu;virginia.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "6EVxJKlpGR", "title": "Surprise Minimizing Multi-Agent Learning with Energy-based Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-Agent Reinforcement Learning (MARL) has demonstrated significant success by virtue of collaboration across agents. Recent work, on the other hand, introduces surprise, which quantifies the degree of change in an agent's environment. Surprise-based learning has received significant attention in the case of single-agent entropic settings but remains an open problem for fast-paced dynamics in multi-agent scenarios. A potential alternative to address surprise may be realized through the lens of free-energy minimization. We explore surprise minimization in multi-agent learning by utilizing the free energy across all agents in a multi-agent system. A temporal Energy-Based Model (EBM) represents an estimate of surprise, which is minimized over the joint agent distribution. Our formulation of the EBM is theoretically akin to the minimum conjugate entropy objective and highlights suitable convergence towards minimally surprising states.
We further validate our theoretical claims in an empirical study of multi-agent tasks demanding collaboration in the presence of fast-paced dynamics.", "keywords": "Multi-Agent Learning;Reinforcement Learning;Energy-based Models.", "primary_area": "", "supplementary_material": "", "author": "Karush Suri", "authorids": "~Karush_Suri1", "gender": "M", "homepage": "https://karush17.github.io/", "dblp": "252/3260", "google_scholar": "https://scholar.google.co.in/citations?user=ZFCHp9gAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Karush_Suri1", "aff": "Google", "aff_domain": "google.com", "position": "Researcher", "bibtex": "@misc{\nsuri2022surprise,\ntitle={Surprise Minimizing Multi-Agent Learning with Energy-based Models},\nauthor={Karush Suri},\nyear={2022},\nurl={https://openreview.net/forum?id=6EVxJKlpGR}\n}", "github": "", "project": "", "reviewers": "g9cM;8Wiu;zFND;doix", "site": "https://openreview.net/forum?id=6EVxJKlpGR", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;1;3", "correctness": "3;3;2;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;0;2", "wc_summary_paper": "32;143;112;59", "wc_summary_review": "60;42;131;43", "wc_main_review": "148;532;498;430", "wc_review": "240;717;741;532", "wc_reply_reviewers": "135;12;51;1173", "wc_reply_authors": "1364;1401;1045;1715", "reply_reviewers": "1;1;1;2", "reply_authors": "3;4;4;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.0, 1.224744871391589 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 86.5, 43.5 ], "wc_summary_review_avg": [ 69.0, 36.50342449688796 ], "wc_main_review_avg": [ 402.0, 151.17539482336403 ], "wc_review_avg": [ 557.5, 200.35530938809683 ], "wc_reply_reviewers_avg": [ 342.75, 481.40127492560714 ], "wc_reply_authors_avg": [ 1381.25, 237.24499467849685 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 0.4330127018922193 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7793845198063905177&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Uncertainty Modeling for Out-of-Distribution Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7084", "id": "6HN7LHyzGgC", "poster": "", "openreview": "https://openreview.net/forum?id=6HN7LHyzGgC", "slides": "https://iclr.cc/virtual/2022/poster/7084", "video": "https://iclr.cc/virtual/2022/poster/7084", "author_site": "Xiaotong Li, Yongxing Dai, Yixiao Ge, Jun Liu, Ying Shan, LINGYU DUAN", "tldr": "", "abstract": "Though remarkable progress has been achieved in various vision tasks, deep neural networks still suffer obvious performance degradation when tested in out-of-distribution scenarios. We argue that the feature statistics (mean and standard deviation), which carry the domain characteristics of the training data, can be properly manipulated to improve the generalization ability of deep learning models. 
Common methods often consider the feature statistics as deterministic values measured from the learned features and do not explicitly consider the uncertain statistics discrepancy caused by potential domain shifts during testing. In this paper, we improve the network generalization ability by modeling the uncertainty of domain shifts with synthesized feature statistics during training. Specifically, we hypothesize that the feature statistic, after considering the potential uncertainties, follows a multivariate Gaussian distribution. Hence, each feature statistic is no longer a deterministic value, but a probabilistic point with diverse distribution possibilities. With the uncertain feature statistics, the models can be trained to alleviate the domain perturbations and achieve better robustness against potential domain shifts. Our method can be readily integrated into networks without additional parameters. Extensive experiments demonstrate that our proposed method consistently improves the network generalization ability on multiple vision tasks, including image classification, semantic segmentation, and instance retrieval.", "keywords": "domain generalization;uncertainty modeling", "primary_area": "", "supplementary_material": "", "author": "Xiaotong Li;Yongxing Dai;Yixiao Ge;Jun Liu;Ying Shan;LINGYU DUAN", "authorids": "~Xiaotong_Li2;~Yongxing_Dai1;~Yixiao_Ge2;~Jun_Liu8;~Ying_Shan2;~LINGYU_DUAN1", "gender": "M;M;F;M;M;M", "homepage": "https://github.com/lixiaotong97;https://sikastar.github.io/;https://geyixiao.com/;;;http://eecs.pku.edu.cn/EN/People/Faculty/Detail/?ID=6096", "dblp": ";269/4603.html;228/6649;95/3736-36;68/5910;d/LingyuDuan", "google_scholar": "cpCE_T4AAAAJ;https://scholar.google.com.hk/citations?user=j8slWLIAAAAJ;TtU74NAAAAAJ;Q5Ild8UAAAAJ;4oXBp9UAAAAJ;", "orcid": ";;;;0000-0001-7673-8325;", "linkedin": ";;;;YingShanProfile/;", "or_profile": "~Xiaotong_Li2;~Yongxing_Dai1;~Yixiao_Ge2;~Jun_Liu8;~Ying_Shan2;~LINGYU_DUAN1", "aff": "Peking University;Peking University;Tencent;Singapore University of Technology and Design;Tencent PCG ARC Lab;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;tencent.com;sutd.edu.sg;arc.tencent.com;pku.edu.cn", "position": "PhD student;PhD student;Researcher;Assistant Professor;Director;Full Professor", "bibtex": "@inproceedings{\nli2022uncertainty,\ntitle={Uncertainty Modeling for Out-of-Distribution Generalization},\nauthor={Xiaotong Li and Yongxing Dai and Yixiao Ge and Jun Liu and Ying Shan and LINGYU DUAN},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6HN7LHyzGgC}\n}", "github": "", "project": "", "reviewers": "jq8e;xdeV;SDxo", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;3", "correctness": "4;3;4", "technical_novelty": "3;2;4", "empirical_novelty": "3;2;4", "wc_summary_paper": "56;37;73", "wc_summary_review": "27;13;14", "wc_main_review": "234;84;127", "wc_review": "317;134;214", "wc_reply_reviewers": "32;24;0", "wc_reply_authors": "1461;621;120", "reply_reviewers": "1;1;0", "reply_authors": "3;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 55.333333333333336, 14.704496666741854 ], "wc_summary_review_avg": [ 18.0, 6.377042156569663 ], "wc_main_review_avg": [ 
148.33333333333334, 63.067864682067324 ], "wc_review_avg": [ 221.66666666666666, 74.90586685226258 ], "wc_reply_reviewers_avg": [ 18.666666666666668, 13.59738536958076 ], "wc_reply_authors_avg": [ 734.0, 553.2612402834668 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 252, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18401330697518830514&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=6HN7LHyzGgC", "email": "pku.edu.cn;pku.edu.cn;tencent.com;sutd.edu.sg;arc.tencent.com;pku.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;2;1;0", "aff_unique_norm": "Peking University;Tencent;Singapore University of Technology and Design", "aff_unique_dep": ";Tencent Holdings Limited;", "aff_unique_url": "http://www.pku.edu.cn;https://www.tencent.com;https://www.sutd.edu.sg", "aff_unique_abbr": "Peking U;Tencent;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Singapore" }, { "title": "CADDA: Class-wise Automatic Differentiable Data Augmentation for EEG Signals", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7154", "id": "6IYp-35L-xJ", "poster": "", "openreview": "https://openreview.net/forum?id=6IYp-35L-xJ", "slides": "https://iclr.cc/virtual/2022/poster/7154", "video": "https://iclr.cc/virtual/2022/poster/7154", "author_site": "C\u00e9dric Rommel, Thomas Moreau, Joseph Paillard, Alexandre Gramfort", "tldr": "", "abstract": "Data augmentation is a key element of deep learning pipelines, as it informs the network during training about transformations of the input data that keep the label unchanged. Manually finding adequate augmentation methods and parameters for a given pipeline, however, quickly becomes cumbersome. In particular, while intuition can guide this decision for images, the design and choice of augmentation policies remain unclear for more complex types of data, such as neuroscience signals. Besides, class-dependent augmentation strategies have been surprisingly unexplored in the literature, although the idea is quite intuitive: changing the color of a car image does not change the object class to be predicted, but doing the same to the picture of an orange does. This paper investigates gradient-based automatic data augmentation algorithms amenable to class-wise policies with exponentially larger search spaces. Motivated by supervised learning applications using EEG signals for which good augmentation policies are mostly unknown, we propose a new differentiable relaxation of the problem. In the class-agnostic setting, results show that our new relaxation leads to optimal performance with faster training than competing gradient-based methods, while also outperforming gradient-free methods in the class-wise setting.
This work also proposes novel differentiable augmentation operations relevant for sleep stage classification.", "keywords": "Neuroscience;EEG;Sleep staging;Automatic data augmentation", "primary_area": "", "supplementary_material": "/attachment/e693f48db22ca4fa82f893bfea1e6f1f0ca42b51.zip", "author": "C\u00e9dric Rommel;Thomas Moreau;Joseph Paillard;Alexandre Gramfort", "authorids": "~C\u00e9dric_Rommel1;~Thomas_Moreau2;~Joseph_Paillard1;~Alexandre_Gramfort1", "gender": "M;M;M;M", "homepage": "https://cedricrommel.github.io/;;http://alexandre.gramfort.net;https://tommoral.github.io", "dblp": "295/9766;;15/7980;150/2391-1", "google_scholar": "GBv4KYwAAAAJ;;fhxshS0AAAAJ;https://scholar.google.fr/citations?user=HEO_PsAAAAAJ", "orcid": ";;0000-0001-9791-4404;0000-0002-1523-3419", "linkedin": "cedric-rommel/;joseph-paillard-73a2a214b;alexandregramfort/;thomasmoreau2010", "or_profile": "~C\u00e9dric_Rommel1;~Joseph_Paillard1;~Alexandre_Gramfort1;~Thomas_Martin_Moreau1", "aff": "INRIA;Mines ParisTech;INRIA;INRIA", "aff_domain": "inria.fr;mines-paristech.fr;inria.fr;inria.fr", "position": "Postdoc;MS student;Full Professor;Researcher", "bibtex": "@inproceedings{\nrommel2022cadda,\ntitle={{CADDA}: Class-wise Automatic Differentiable Data Augmentation for {EEG} Signals},\nauthor={C{\\'e}dric Rommel and Thomas Moreau and Joseph Paillard and Alexandre Gramfort},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6IYp-35L-xJ}\n}", "github": "", "project": "", "reviewers": "NPRR;EdsK;jp1B;18fi", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;2;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "32;53;60;47", "wc_summary_review": "24;73;34;36", "wc_main_review": "422;536;151;71", "wc_review": "478;662;245;154", "wc_reply_reviewers": "0;64;0;0", "wc_reply_authors": "645;1137;396;20", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 48.0, 10.319883720275147 ], "wc_summary_review_avg": [ 41.75, 18.606114586339622 ], "wc_main_review_avg": [ 295.0, 190.4744077297525 ], "wc_review_avg": [ 384.75, 198.95900959745452 ], "wc_reply_reviewers_avg": [ 16.0, 27.712812921102035 ], "wc_reply_authors_avg": [ 549.5, 405.65040367291635 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6488856845230502, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7439818644438694869&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=6IYp-35L-xJ", "email": "inria.fr;mines-paristech.fr;inria.fr;inria.fr", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "INRIA;MINES ParisTech", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.mines-paristech.fr", "aff_unique_abbr": "INRIA;Mines ParisTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "id": "6Jf6HX4MoLH", "title": "Motion Planning Transformers: One Model to Plan them All",
"track": "main", "status": "Reject", "tldr": "", "abstract": "Transformers have become the powerhouse of natural language processing and have recently found use in computer vision tasks. Their effective use of attention can be applied in other contexts as well, and in this paper, we propose a transformer-based approach for efficiently solving complex motion planning problems. Traditional neural network-based motion planning uses convolutional networks to encode the planning space, but these methods are limited to fixed map sizes, which is often not realistic in the real world. Our approach first identifies regions on the map using transformers to provide attention to map areas likely to include the best path and then applies traditional planners to generate the final collision-free path. We validate our method on a variety of randomly generated environments with different map sizes, demonstrating a reduction in planning complexity and achieving accuracy comparable to traditional planners.\n", "keywords": "Motion Planning;Attention Networks", "primary_area": "", "supplementary_material": "/attachment/f37171ff4cfc2d3ad2134e5b18f51168a1566bb3.zip", "author": "Jacob John Johnson;Linjun Li;Ahmed Qureshi;Michael C. Yip", "authorids": "~Jacob_John_Johnson1;~Linjun_Li1;~Ahmed_Qureshi1;~Michael_C._Yip1", "gender": "M;M;M;", "homepage": ";https://github.com/LeeLinJun;https://qureshiahmed.github.io/;http://www.ucsdarclab.com", "dblp": ";;222/2796;", "google_scholar": ";;https://scholar.google.com/citations?hl=en;gSYxbCYAAAAJ", "orcid": ";;;", "linkedin": ";;;michael-yip-43913421/", "or_profile": "~Jacob_John_Johnson1;~Linjun_Li1;~Ahmed_Qureshi1;~Michael_C._Yip1", "aff": "University of California San Diego;University of California, San Diego;Purdue University;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;purdue.edu;ucsd.edu", "position": "PhD student;MS student;Assistant Professor;Associate Professor", "bibtex": "@misc{\njohnson2022motion,\ntitle={Motion Planning Transformers: One Model to Plan them All},\nauthor={Jacob John Johnson and Linjun Li and Ahmed Qureshi and Michael C.
Yip},\nyear={2022},\nurl={https://openreview.net/forum?id=6Jf6HX4MoLH}\n}", "github": "", "project": "", "reviewers": "jDfv;WoFc;J9BR;JNUy", "site": "https://openreview.net/forum?id=6Jf6HX4MoLH", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "102;80;140;56", "wc_summary_review": "32;75;32;48", "wc_main_review": "350;356;379;340", "wc_review": "484;511;551;444", "wc_reply_reviewers": "0;47;74;54", "wc_reply_authors": "733;704;286;429", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 94.5, 30.89902911096075 ], "wc_summary_review_avg": [ 46.75, 17.5695048308141 ], "wc_main_review_avg": [ 356.25, 14.324367350776788 ], "wc_review_avg": [ 497.5, 39.01602234979881 ], "wc_reply_reviewers_avg": [ 43.75, 27.13277538328875 ], "wc_reply_authors_avg": [ 538.0, 187.72719568565446 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18013940973623221165&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of California, San Diego;Purdue University", "aff_unique_dep": ";", "aff_unique_url": "https://ucsd.edu;https://www.purdue.edu", "aff_unique_abbr": "UCSD;Purdue", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "6LHiNULIeiC", "title": "SOInter: A Novel Deep Energy-Based Interpretation Method for Explaining Structured Output Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a novel interpretation technique to explain the behavior of structured output models, which learn mappings from an input vector to a set of output variables simultaneously. Because of the complex relationships among the computational paths of output variables in structured models, a feature can affect the value of the target output through other output variables. We focus on one of the outputs as the target and try to find the most important features utilized by the structured model to decide on the target in each locality of the input space. In this paper, we assume an arbitrary structured output model is available as a black box and argue that considering the correlations between output variables can improve the explanation performance. The goal is to train a function as an interpreter for the target output variable over the input space. We introduce an energy-based training process for the interpreter function, which effectively considers the structural information incorporated into the model to be explained. The effectiveness of the proposed method is confirmed using a variety of simulated and real data sets.", "keywords": "Structured Output Model;Interpretable Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Seyyede Fatemeh Seyyedsalehi;Mahdieh Soleymani Baghshah;Hamid R.
Rabiee", "authorids": "seyyedsalehi@ce.sharif.edu;~Mahdieh_Soleymani_Baghshah1;~Hamid_R._Rabiee1", "gender": ";;M", "homepage": ";;http://sharif.edu/~rabiee", "dblp": ";;01/4547", "google_scholar": ";;rKDtrNgAAAAJ", "orcid": ";;0000-0002-9835-4493", "linkedin": ";;hrabiee/", "or_profile": "seyyedsalehi@ce.sharif.edu;~Mahdieh_Soleymani_Baghshah1;~Hamid_R._Rabiee1", "aff": ";;Sharif University of Technology", "aff_domain": ";;sharif.edu", "position": ";;Full Professor", "bibtex": "@misc{\nseyyedsalehi2022sointer,\ntitle={{SOI}nter: A Novel Deep Energy-Based Interpretation Method for Explaining Structured Output Models},\nauthor={Seyyede Fatemeh Seyyedsalehi and Mahdieh Soleymani Baghshah and Hamid R. Rabiee},\nyear={2022},\nurl={https://openreview.net/forum?id=6LHiNULIeiC}\n}", "github": "", "project": "", "reviewers": "bvqh;KTxH;iN3y;JCYy", "site": "https://openreview.net/forum?id=6LHiNULIeiC", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "2;4;3;4", "correctness": "2;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;2;1", "wc_summary_paper": "67;50;93;122", "wc_summary_review": "58;14;11;9", "wc_main_review": "229;114;1136;636", "wc_review": "354;178;1240;767", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 83.0, 27.230497608380205 ], "wc_summary_review_avg": [ 23.0, 20.285462775100793 ], "wc_main_review_avg": [ 528.75, 400.66281022825166 ], "wc_review_avg": [ 634.75, 409.6519101627625 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VA4larE8tjQJ:scholar.google.com/&scioq=SOInter:+A+Novel+Deep+Energy-Based+Interpretation+Method+for+Explaining+Structured+Output+Models&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Sharif University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.sharif.edu", "aff_unique_abbr": "SUT", "aff_country_unique_index": "0", "aff_country_unique": "Iran" }, { "id": "6LNPEcJAGWe", "title": "Federated Contrastive Representation Learning with Feature Fusion and Neighborhood Matching", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated learning (FL) enables distributed clients to learn a shared model for prediction while keeping the training data local on each client. However, existing FL requires fully-labeled data for training, which is inconvenient or sometimes infeasible to obtain due to the high labeling cost and the requirement of expertise. The lack of labels makes FL impractical in many realistic settings. Self-supervised learning can address this challenge by learning from unlabeled data such that FL can be widely used. Contrastive learning (CL), a self-supervised learning approach, can effectively learn data representations from unlabeled data. 
However, the distributed data collected on clients are usually not independent and identically distributed (non-IID) among clients, and each client may only have a few classes of data, which degrades the performance of CL and the learned representations. To tackle this problem, we propose a federated contrastive learning framework consisting of two approaches: feature fusion and neighborhood matching, by which a unified feature space among clients is learned for better data representations. Feature fusion provides remote features as accurate contrastive information to each client for better local learning. Neighborhood matching further aligns each client\u2019s local features to the remote features such that well-clustered features among clients can be learned. Extensive experiments show the effectiveness of the proposed framework. It outperforms other methods by 11% on IID data and matches the performance of centralized learning.", "keywords": "Federated Learning;Contrastive Learning;Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Yawen Wu;Zhepeng Wang;Dewen Zeng;Meng Li;Yiyu Shi;Jingtong Hu", "authorids": "~Yawen_Wu1;~Zhepeng_Wang1;~Dewen_Zeng1;~Meng_Li1;~Yiyu_Shi1;~Jingtong_Hu1", "gender": "M;M;M;M;M;M", "homepage": "https://sites.google.com/view/yawenwu;;https://scholar.google.com/citations?user=RpJ5nSsAAAAJ&hl=en&authuser=1;https://mengli.me;;http://www.pitt.edu/~jthu/index.html", "dblp": "230/8649;242/8456;;70/1726-4;94/5536;37/3401", "google_scholar": "73k09jEAAAAJ;JyPU5aEAAAAJ;RpJ5nSsAAAAJ;lvdRkEkAAAAJ;;OcWo8CYAAAAJ", "orcid": ";;;;;0000-0003-4029-4034", "linkedin": "yawenwu06/;zhepeng-wang/;;;;", "or_profile": "~Yawen_Wu1;~Zhepeng_Wang1;~Dewen_Zeng1;~Meng_Li1;~Yiyu_Shi1;~Jingtong_Hu1", "aff": "University of Pittsburgh;George Mason University;University of Notre Dame;Meta Facebook;University of Notre Dame;University of Pittsburgh", "aff_domain": "pitt.edu;gmu.edu;nd.edu;fb.com;nd.edu;pitt.edu", "position": "PhD student;PhD student;PhD student;Researcher;Full Professor;Associate Professor", "bibtex": "@misc{\nwu2022federated,\ntitle={Federated Contrastive Representation Learning with Feature Fusion and Neighborhood Matching},\nauthor={Yawen Wu and Zhepeng Wang and Dewen Zeng and Meng Li and Yiyu Shi and Jingtong Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=6LNPEcJAGWe}\n}", "github": "", "project": "", "reviewers": "zrs7;F5ip;C5BU;jD4M", "site": "https://openreview.net/forum?id=6LNPEcJAGWe", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "5;5;4;2", "correctness": "3;2;2;4", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;3;1;3", "wc_summary_paper": "38;53;96;69", "wc_summary_review": "14;38;77;51", "wc_main_review": "133;258;108;145", "wc_review": "185;349;281;265", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 64.0, 21.482551058940835 ], "wc_summary_review_avg": [ 45.0, 22.74862633215465 ], "wc_main_review_avg": [ 161.0, 57.57169443398379 ], "wc_review_avg": [ 270.0, 58.33523806414096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ],
"corr_recommendation_confidence": -0.914659120760047, "corr_recommendation_correctness": 0.5488604301969737, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16601127213489118856&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;2;0", "aff_unique_norm": "University of Pittsburgh;George Mason University;University of Notre Dame;Meta", "aff_unique_dep": ";;;Meta Platforms, Inc.", "aff_unique_url": "https://www.pitt.edu;https://www.gmu.edu;https://www.nd.edu;https://meta.com", "aff_unique_abbr": "Pitt;GMU;Notre Dame;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "6MFWE6u2b6R", "title": "Bandits for Black-box Attacks to Graph Neural Networks with Structure Perturbation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph neural networks (GNNs) have achieved state-of-the-art performance in many graph-based tasks such as node classification and graph classification. However, many recent works have demonstrated that an attacker can mislead GNN models by slightly perturbing the graph structure. Existing attacks to GNNs are either under the less practical threat model where the attacker is assumed to access the GNN model parameters, or under the practical black-box threat model but consider perturbing node features, which has been shown to be not effective enough. In this paper, we aim to bridge this gap and consider black-box attacks to GNNs with structure perturbation. Motivated by bandits, we address this challenge by formulating our attack as an online optimization problem with bandit feedback. This problem is essentially NP-hard because perturbing the graph structure is a binary optimization problem. We then propose a bandit convex optimization based attack, which is proven to be sublinear in the query number $T$, i.e., $\\mathcal{O}(\\sqrt{N}T^{3/4})$ where $N$ is the number of nodes in the graph. Finally, we evaluate our proposed attack algorithm by conducting experiments over multiple datasets and GNN models. The experimental results on various citation graphs and image graphs show that our attack is both effective and efficient.
", "keywords": "Graph neural networks;Black-box attacks;Structure perturbation;Bandits;Sublinear query complexity", "primary_area": "", "supplementary_material": "/attachment/cf670c0423eedc58b2dad53efa9e2e363821b0ee.zip", "author": "Binghui Wang;Youqi Li;Pan Zhou", "authorids": "~Binghui_Wang2;~Youqi_Li3;~Pan_Zhou5", "gender": "M;M;M", "homepage": "https://wangbinghui.net;https://lyouqi.github.io/;http://faculty.hust.edu.cn/pzhou/zh_CN/index.htm", "dblp": "123/7149;https://dblp.uni-trier.de/pid/194/1801.html;84/6614-1", "google_scholar": "SoOztcEAAAAJ;;cTpFPJgAAAAJ", "orcid": "0000-0001-5616-060X;;", "linkedin": ";;", "or_profile": "~Binghui_Wang2;~Youqi_Li3;~Pan_Zhou5", "aff": "Illinois Institute of Technology;Beijing Institute of Technology;Huazhong University of Science and Technology", "aff_domain": "iit.edu;bit.edu.cn;hust.edu.cn", "position": "Assistant Professor;Postdoc;Professor", "bibtex": "@misc{\nwang2022bandits,\ntitle={Bandits for Black-box Attacks to Graph Neural Networks with Structure Perturbation},\nauthor={Binghui Wang and Youqi Li and Pan Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=6MFWE6u2b6R}\n}", "github": "", "project": "", "reviewers": "wTee;SQ7N;X17a;UdbB", "site": "https://openreview.net/forum?id=6MFWE6u2b6R", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;3;3;3", "correctness": "2;3;3;4", "technical_novelty": "2;3;4;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "53;159;71;34", "wc_summary_review": "52;52;36;50", "wc_main_review": "373;472;422;178", "wc_review": "478;683;529;262", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.25, 47.8663503935698 ], "wc_summary_review_avg": [ 47.5, 6.689544080129826 ], "wc_main_review_avg": [ 361.25, 111.43916501840813 ], "wc_review_avg": [ 488.0, 150.73320801999805 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3235060737974099335&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Illinois Institute of Technology;Beijing Institute of Technology;Huazhong University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.iit.edu;http://www.bit.edu.cn/;http://www.hust.edu.cn", "aff_unique_abbr": "IIT;BIT;HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "title": "When Can We Learn General-Sum Markov Games with a Large Number of Players Sample-Efficiently?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7074", "id": "6MmiS0HUJHR", "poster": "", "openreview": "https://openreview.net/forum?id=6MmiS0HUJHR", "slides": "https://iclr.cc/virtual/2022/poster/7074", "video": "https://iclr.cc/virtual/2022/poster/7074", "author_site": "Ziang Song, Song Mei, Yu Bai", "tldr": "", "abstract": "Multi-agent reinforcement learning has made 
substantial empirical progress in solving games with a large number of players. However, theoretically, the best known sample complexity for finding a Nash equilibrium in general-sum games scales exponentially in the number of players due to the size of the joint action space, and there is a matching exponential lower bound. This paper investigates what learning goals admit better sample complexities in the setting of $m$-player general-sum Markov games with $H$ steps, $S$ states, and $A_i$ actions per player. First, we design algorithms for learning an $\\epsilon$-Coarse Correlated Equilibrium (CCE) in $\\widetilde{\\mathcal{O}}(H^5S\\max_{i\\le m} A_i / \\epsilon^2)$ episodes, and an $\\epsilon$-Correlated Equilibrium (CE) in $\\widetilde{\\mathcal{O}}(H^6S\\max_{i\\le m} A_i^2 / \\epsilon^2)$ episodes. This is the first line of results for learning CCE and CE with sample complexities polynomial in $\\max_{i\\le m} A_i$. Our algorithm for learning CE integrates an adversarial bandit subroutine which minimizes a weighted swap regret, along with several novel designs in the outer loop. Second, we consider the important special case of Markov Potential Games, and design an algorithm that learns an $\\epsilon$-approximate Nash equilibrium within $\\widetilde{\\mathcal{O}}(S\\sum_{i\\le m} A_i / \\epsilon^3)$ episodes (when only highlighting the dependence on $S$, $A_i$, and $\\epsilon$), which depends only linearly on $\\sum_{i\\le m} A_i$ and significantly improves over the existing efficient algorithm in the $\\epsilon$ dependence. Overall, our results shed light on what equilibria or structural assumptions on the game may enable sample-efficient learning with many players.", "keywords": "reinforcement learning theory;multi-agent RL;Markov games;general-sum games", "primary_area": "", "supplementary_material": "", "author": "Ziang Song;Song Mei;Yu Bai", "authorids": "~Ziang_Song1;~Song_Mei1;~Yu_Bai1", "gender": "M;M;", "homepage": ";https://www.stat.berkeley.edu/~songmei/;https://yubai.org", "dblp": ";https://dblp.org/pers/hd/m/Mei:Song;03/6325-17.html", "google_scholar": "P_-O-wcAAAAJ;https://scholar.google.com.hk/citations?hl=en;owqhKD8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ziang_Song1;~Song_Mei1;~Yu_Bai1", "aff": "Peking University;University of California, Berkeley;Salesforce Research", "aff_domain": "pku.edu.cn;berkeley.edu;salesforce.com", "position": "Undergrad student;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nsong2022when,\ntitle={When Can We Learn General-Sum Markov Games with a Large Number of Players Sample-Efficiently?},\nauthor={Ziang Song and Song Mei and Yu Bai},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6MmiS0HUJHR}\n}", "github": "", "project": "", "reviewers": "q1AK;bdjf;Qf6T;6yah", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;0;0;0", "wc_summary_paper": "80;51;58;126", "wc_summary_review": "255;39;13;15", "wc_main_review": "92;205;87;70", "wc_review": "427;295;158;211", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "715;230;36;48", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.75, 1.299038105676658 ], 
"wc_summary_paper_avg": [ 78.75, 29.303370113350443 ], "wc_summary_review_avg": [ 80.5, 101.26573951737083 ], "wc_main_review_avg": [ 113.5, 53.45325060274632 ], "wc_review_avg": [ 272.75, 101.573557090416 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 257.25, 275.23387782030034 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4735644162579825552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=6MmiS0HUJHR", "email": "pku.edu.cn;berkeley.edu;salesforce.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Peking University;University of California, Berkeley;Salesforce", "aff_unique_dep": ";;Salesforce Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.berkeley.edu;https://research.salesforce.com", "aff_unique_abbr": "Peking U;UC Berkeley;Salesforce", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "id": "6NT1a56mNim", "title": "Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents", "track": "main", "status": "Reject", "tldr": "", "abstract": "Can world knowledge learned by large language models (LLMs) be used to act in interactive environments? In this paper, we investigate the possibility of grounding high-level tasks, expressed in natural language (i.e. \"make breakfast\"), to a fixed set of actionable steps (i.e. \"open fridge\"). While prior work focused on learning from explicit step-by-step examples of how to act, we surprisingly find that if pre-trained LMs are large enough and prompted appropriately, they can effectively decompose high-level tasks into low-level plans without any further training. However, the plans produced naively by LLMs often cannot map precisely to admissible actions. We propose a procedure that conditions on existing demonstrations and semantically translates the plans to admissible actions. Our evaluation in the recent VirtualHome environment shows that the resulting method substantially improves executability over the LLM baseline. The conducted human evaluation reveals a trade-off between executability and correctness but shows a promising sign towards extracting actionable knowledge from language models. 
Videos at https://sites.google.com/view/language-model-as-planner", "keywords": "GPT-3;Codex;LLMs;Language Models;Knowledge Extraction;Embodied Agents;Action Planning", "primary_area": "", "supplementary_material": "/attachment/8a9ed1aa10485005b09f05020d42c4f3405d9aba.zip", "author": "Wenlong Huang;Pieter Abbeel;Deepak Pathak;Igor Mordatch", "authorids": "~Wenlong_Huang1;~Pieter_Abbeel2;~Deepak_Pathak1;~Igor_Mordatch5", "gender": "M;M;M;", "homepage": "https://wenlong.page;https://people.eecs.berkeley.edu/~pabbeel/;https://www.cs.cmu.edu/~dpathak/;", "dblp": "82/2872;;155/9860;", "google_scholar": "hYVMrzsAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ;Vzr1RukAAAAJ", "orcid": ";;;", "linkedin": ";;pathak22/;", "or_profile": "~Wenlong_Huang1;~Pieter_Abbeel2;~Deepak_Pathak1;~Igor_Mordatch5", "aff": "Google;Covariant;Carnegie Mellon University;Research, Google", "aff_domain": "google.com;covariant.ai;cmu.edu;research.google.com", "position": "Intern;Founder;Assistant Professor;Researcher", "bibtex": "@misc{\nhuang2022language,\ntitle={Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents},\nauthor={Wenlong Huang and Pieter Abbeel and Deepak Pathak and Igor Mordatch},\nyear={2022},\nurl={https://openreview.net/forum?id=6NT1a56mNim}\n}", "github": "", "project": "", "reviewers": "KhzN;mJtc;VpdM;gxEx;28Tw", "site": "https://openreview.net/forum?id=6NT1a56mNim", "pdf_size": 0, "recommendation": "3;5;5;6;8", "confidence": "4;4;4;4;5", "correctness": "3;3;3;4;3", "technical_novelty": "2;2;2;4;3", "empirical_novelty": "2;2;3;4;4", "wc_summary_paper": "67;86;57;185;136", "wc_summary_review": "28;24;62;106;98", "wc_main_review": "450;325;425;392;300", "wc_review": "545;435;544;683;534", "wc_reply_reviewers": "169;177;0;185;101", "wc_reply_authors": "1351;1729;1273;1385;527", "reply_reviewers": "1;1;0;2;2", "reply_authors": "3;3;2;3;2", "recommendation_avg": [ 5.4, 1.624807680927192 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 3.0, 0.8944271909999159 ], "wc_summary_paper_avg": [ 106.2, 47.880685041047606 ], "wc_summary_review_avg": [ 63.6, 34.11509929635263 ], "wc_main_review_avg": [ 378.4, 57.41289053862382 ], "wc_review_avg": [ 548.2, 79.01240408948458 ], "wc_reply_reviewers_avg": [ 126.4, 69.90164518807836 ], "wc_reply_authors_avg": [ 1253.0, 395.24169820503505 ], "reply_reviewers_avg": [ 1.2, 0.7483314773547883 ], "reply_authors_avg": [ 2.6, 0.4898979485566356 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8000946913656628, "corr_recommendation_correctness": 0.1846372364689991, "gs_citation": 1268, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11998123682359381476&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Google;Covariant;Carnegie Mellon University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;;https://www.cmu.edu", "aff_unique_abbr": "Google;;CMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "Goal-Directed Planning via Hindsight Experience Replay", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6831", "id": "6NePxZwfae", "poster": "", "openreview": 
"https://openreview.net/forum?id=6NePxZwfae", "slides": "https://iclr.cc/virtual/2022/poster/6831", "video": "https://iclr.cc/virtual/2022/poster/6831", "author_site": "Lorenzo Moro, Amarildo Likmeta, Enrico Prati, Marcello Restelli", "tldr": "", "abstract": "We consider the problem of goal-directed planning under a deterministic transition model. Monte Carlo Tree Search has shown remarkable performance in solving deterministic control problems. It has been extended from complex continuous domains through function approximators to bias the search of the planning tree in AlphaZero. Nonetheless, these algorithms still struggle with control problems with sparse rewards, such as goal-directed domains, where a positive reward is awarded only when reaching a goal state. In this work, we recast AlphaZero with Hindsight Experience Replay to tackle complex goal-directed planning tasks. We perform a thorough empirical evaluation in several simulated domains, including a novel application to a quantum compiling domain.", "keywords": "Reinforcement Learning;Goal-Directed Planning;Monte Carlo Tree Search", "primary_area": "", "supplementary_material": "", "author": "Lorenzo Moro;Amarildo Likmeta;Enrico Prati;Marcello Restelli", "authorids": "~Lorenzo_Moro1;~Amarildo_Likmeta1;~Enrico_Prati1;~Marcello_Restelli1", "gender": "M;M;;M", "homepage": ";https://www.unibo.it/sitoweb/amarildo.likmeta2;https://www2.mi.ifn.cnr.it/eprati/;http://home.deib.polimi.it/restelli/", "dblp": ";255/6990;;64/1011", "google_scholar": ";AfEypgsAAAAJ;;https://scholar.google.com.tw/citations?user=xdgxRiEAAAAJ", "orcid": "0000-0001-9818-8854;0000-0002-4227-0741;;0000-0002-6322-1076", "linkedin": ";amarildolikmeta;;", "or_profile": "~Lorenzo_Moro1;~Amarildo_Likmeta1;~Enrico_Prati1;~Marcello_Restelli1", "aff": "Politecnico di Milano;Universita' di Bologna;Consiglio Nazionale delle Ricerche;Politecnico di Milano", "aff_domain": "polimi.it;unibo.it;cnr.it;polimi.it", "position": "PhD student;PhD student;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nmoro2022goaldirected,\ntitle={Goal-Directed Planning via Hindsight Experience Replay},\nauthor={Lorenzo Moro and Amarildo Likmeta and Enrico Prati and Marcello Restelli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6NePxZwfae}\n}", "github": "", "project": "", "reviewers": "1k2M;mGXi;Eiby;37RQ", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;5;4", "correctness": "4;4;4;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "28;131;62;339", "wc_summary_review": "20;67;47;154", "wc_main_review": "349;564;194;334", "wc_review": "397;762;303;827", "wc_reply_reviewers": "0;78;0;0", "wc_reply_authors": "673;776;570;736", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 140.0, 120.73731817462239 ], "wc_summary_review_avg": [ 72.0, 50.19462122578474 ], "wc_main_review_avg": [ 360.25, 132.25803378245118 ], "wc_review_avg": [ 572.25, 225.89308865036134 ], "wc_reply_reviewers_avg": [ 19.5, 33.77499074759311 ], "wc_reply_authors_avg": [ 688.75, 77.77330840333333 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 
15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.49374193110101877, "corr_recommendation_correctness": -0.49374193110101877, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16780903185907417649&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=6NePxZwfae", "email": "polimi.it;unibo.it;cnr.it;polimi.it", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Politecnico di Milano;University of Bologna;Consiglio Nazionale delle Ricerche", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polimi.it;https://www.unibo.it;https://www.cnr.it", "aff_unique_abbr": "Polimi;Unibo;CNR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "id": "6P6-N1gLQDC", "title": "Structural Causal Interpretation Theorem", "track": "main", "status": "Reject", "tldr": "", "abstract": "Human mental processes allow for qualitative reasoning about causality in terms of mechanistic relations of the variables of interest, which we argue are naturally described by structural causal model (SCM). Since interpretations are being derived from mental models, the same applies for SCM. By defining a metric space on SCM, we provide a theoretical perspective on the comparison of mental models and thereby conclude that interpretations can be used for guiding a learning system towards true causality. To this effect, we present a theoretical analysis from first principles that results in a human-readable interpretation scheme consistent with the provided causality that we name structural causal interpretations (SCI). Going further, we prove that any existing neural induction method (NIM) is in fact interpretable. Our first experiment (E1) assesses the quality of such NIM-based SCI. In (E2) we observe evidence for our conjecture on improved sample-efficiency for SCI-based learning. After conducting a small user study, in (E3) we observe superiority in human-based over NIM-based SCI in support of our initial hypothesis.", "keywords": "causality;interpretations;neural causal models;induction", "primary_area": "", "supplementary_material": "", "author": "Matej Zecevic;Devendra Singh Dhami;Constantin A. Rothkopf;Kristian Kersting", "authorids": "~Matej_Zecevic1;~Devendra_Singh_Dhami1;~Constantin_A._Rothkopf1;~Kristian_Kersting1", "gender": "M;M;M;M", "homepage": "https://www.matej-zecevic.de;https://sites.google.com/view/devendradhami;http://www.ml.informatik.tu-darmstadt.de/;https://www.pip.tu-darmstadt.de", "dblp": "286/1847;201/2130;40/3793;71/5555", "google_scholar": "gzJZcPUAAAAJ;aVlaHfkAAAAJ;QY-earAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-2873-9152;", "linkedin": ";;;", "or_profile": "~Matej_Zecevic1;~Devendra_Singh_Dhami1;~Kristian_Kersting1;~Constantin_Rothkopf1", "aff": "TU Darmstadt;CS Department, TU Darmstadt, TU Darmstadt;TU Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": "tu-darmstadt.de;cs.tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;Postdoctoral researcher;Full Professor;Full Professor", "bibtex": "@misc{\nzecevic2022structural,\ntitle={Structural Causal Interpretation Theorem},\nauthor={Matej Zecevic and Devendra Singh Dhami and Constantin A. 
Rothkopf and Kristian Kersting},\nyear={2022},\nurl={https://openreview.net/forum?id=6P6-N1gLQDC}\n}", "github": "", "project": "", "reviewers": "bMJP;T5c7;2Fy5", "site": "https://openreview.net/forum?id=6P6-N1gLQDC", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "3;3;4", "correctness": "2;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;2;4", "wc_summary_paper": "30;86;59", "wc_summary_review": "65;47;88", "wc_main_review": "359;515;898", "wc_review": "454;648;1045", "wc_reply_reviewers": "0;0;250", "wc_reply_authors": "1122;1221;2386", "reply_reviewers": "0;0;1", "reply_authors": "4;3;5", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 58.333333333333336, 22.866763848189994 ], "wc_summary_review_avg": [ 66.66666666666667, 16.779617264870957 ], "wc_main_review_avg": [ 590.6666666666666, 226.45725620718997 ], "wc_review_avg": [ 715.6666666666666, 245.97334995663428 ], "wc_reply_reviewers_avg": [ 83.33333333333333, 117.85113019775793 ], "wc_reply_authors_avg": [ 1576.3333333333333, 573.945603307104 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 4.0, 0.816496580927726 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8029550685469661, "corr_recommendation_correctness": 0.9176629354822472, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:q4fsf7LlWQoJ:scholar.google.com/&scioq=Structural+Causal+Interpretation+Theorem&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darmstadt;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "6PTUd_zPdHL", "title": "Differentiable Top-k Classification Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The top-k classification accuracy is one of the core metrics in machine learning. Here, k is conventionally a positive integer, such as 1 or 5. In this work, we relax this assumption and propose to draw k from a probability distribution for training. Combining this with recent advances in differentiable sorting and ranking, we propose a new family of differentiable top-k cross-entropy classification losses. We find that relaxing k does not only produce better top-5 accuracies, but also makes models more robust, which leads to top-1 accuracy improvements. When fine-tuning publicly available ImageNet models, we achieve a new state-of-the-art on ImageNet for publicly available models with an 88.36% top-1 and a 98.71% top-5 accuracy. 
", "keywords": "top-k;top-5;imagenet", "primary_area": "", "supplementary_material": "", "author": "Felix Petersen;Hilde Kuehne;Christian Borgelt;Oliver Deussen", "authorids": "~Felix_Petersen1;~Hilde_Kuehne5;~Christian_Borgelt1;~Oliver_Deussen1", "gender": "Not Specified;F;M;M", "homepage": "http://www.petersen.ai/;https://hildekuehne.github.io;https://www.borgelt.net/;https://graphics.uni-konstanz.de", "dblp": "230/3983;45/4963;b/ChristianBorgelt.html;48/2158", "google_scholar": "v8Kat6YAAAAJ;pxhCcH0AAAAJ;https://scholar.google.de/citations?user=T50Bxb8AAAAJ;https://scholar.google.de/scholar?hl=en", "orcid": ";0000-0003-1079-4441;;0000-0001-5803-2185", "linkedin": ";hilde-kuehne-8b9aa661;christian-borgelt-a2429071/;", "or_profile": "~Felix_Petersen1;~Hilde_Kuehne5;~Christian_Borgelt1;~Oliver_Deussen1", "aff": "University of Konstanz;Goethe University Frankfurt;Paris-Lodron-University of Salzburg;University of Konstanz", "aff_domain": "uni-konstanz.de;uni-frankfurt.de;sbg.ac.at;uni-konstanz.de", "position": "PhD student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@misc{\npetersen2022differentiable,\ntitle={Differentiable Top-k Classification Learning},\nauthor={Felix Petersen and Hilde Kuehne and Christian Borgelt and Oliver Deussen},\nyear={2022},\nurl={https://openreview.net/forum?id=6PTUd_zPdHL}\n}", "github": "", "project": "", "reviewers": "575y;FWmd;RKLW;Abaq", "site": "https://openreview.net/forum?id=6PTUd_zPdHL", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;5;4;2", "correctness": "3;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "36;108;57;100", "wc_summary_review": "36;36;48;28", "wc_main_review": "106;385;336;101", "wc_review": "178;529;441;229", "wc_reply_reviewers": "0;0;68;208", "wc_reply_authors": "396;1175;852;847", "reply_reviewers": "0;0;1;2", "reply_authors": "2;2;2;3", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.25, 29.82762980861872 ], "wc_summary_review_avg": [ 37.0, 7.14142842854285 ], "wc_main_review_avg": [ 232.0, 129.67459273119002 ], "wc_review_avg": [ 344.25, 145.27108280728137 ], "wc_reply_reviewers_avg": [ 69.0, 84.91760712596653 ], "wc_reply_authors_avg": [ 817.5, 277.2764865616989 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.37463432463267754, "corr_recommendation_correctness": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2888939572667326983&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Konstanz;Goethe University Frankfurt;Paris-Lodron-University of Salzburg", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-konstanz.de;https://www.uni-frankfurt.de;https://www.uni-salzburg.at", "aff_unique_abbr": "Uni Konstanz;GU Frankfurt;PLUS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Frankfurt", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Germany;Austria" }, { "id": "6PahjGFjVG-", "title": "Secure Distributed Training at Scale", "track": "main", "status": "Reject", "tldr": "", "abstract": "Some of the hardest problems in deep learning can be solved via pooling together 
computational resources of many independent parties, as is the case for scientific collaborations and volunteer computing. Unfortunately, any single participant in such systems can jeopardize the entire training run by sending incorrect updates, whether deliberately or by mistake. Training in presence of such peers requires specialized distributed training algorithms with Byzantine tolerance. These algorithms often sacrifice efficiency by introducing redundant communication or passing all updates through a trusted server. As a result, it can be infeasible to apply such algorithms to large-scale distributed deep learning, where models can have billions of parameters. In this work, we propose a novel protocol for secure (Byzantine-tolerant) decentralized training that emphasizes communication efficiency. We rigorously analyze this protocol: in particular, we provide theoretical bounds for its resistance against Byzantine and Sybil attacks and show that it has a marginal communication overhead. To demonstrate its practical effectiveness, we conduct large-scale experiments on image classification and language modeling in presence of Byzantine attackers.", "keywords": "distributed training;byzantine tolerance;volunteer computing", "primary_area": "", "supplementary_material": "", "author": "Eduard Gorbunov;Alexander Borzunov;Michael Diskin;Max Ryabinin", "authorids": "~Eduard_Gorbunov1;~Alexander_Borzunov1;~Michael_Diskin1;~Max_Ryabinin1", "gender": "M;M;M;Not Specified", "homepage": "https://eduardgorbunov.github.io;https://github.com/borzunov;;https://mryab.github.io/", "dblp": "215/5512.html;295/8854;295/8914.html;276/0192", "google_scholar": "https://scholar.google.ru/citations?user=85j2RqQAAAAJ;https://scholar.google.ru/citations?user=HdwzsCMAAAAJ;LRKQhcYAAAAJ;930PERsAAAAJ", "orcid": ";;0000-0001-8902-513X;", "linkedin": ";;https://www.linkedin.com/m/in/yhn112/;", "or_profile": "~Eduard_Gorbunov1;~Alexander_Borzunov1;~Michael_Diskin1;~Max_Ryabinin1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;HSE University;Yandex;Yandex", "aff_domain": "mbzuai.ac.ae;hse.ru;yandex-team.ru;yandex-team.ru", "position": "Postdoc;Instructor;Researcher;Research Scientist", "bibtex": "@misc{\ngorbunov2022secure,\ntitle={Secure Distributed Training at Scale},\nauthor={Eduard Gorbunov and Alexander Borzunov and Michael Diskin and Max Ryabinin},\nyear={2022},\nurl={https://openreview.net/forum?id=6PahjGFjVG-}\n}", "github": "", "project": "", "reviewers": "ojVo;63QJ;yAgj;38p5;w1BH", "site": "https://openreview.net/forum?id=6PahjGFjVG-", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "2;4;3;4;3", "correctness": "2;2;3;4;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;1;2;3;2", "wc_summary_paper": "103;86;81;72;123", "wc_summary_review": "6;95;35;23;16", "wc_main_review": "217;1821;600;325;305", "wc_review": "326;2002;716;420;444", "wc_reply_reviewers": "141;462;818;42;0", "wc_reply_authors": "1255;5364;3754;960;761", "reply_reviewers": "1;2;3;1;0", "reply_authors": "2;8;5;2;1", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 93.0, 18.077610461562667 ], "wc_summary_review_avg": [ 35.0, 31.45155004129367 ], "wc_main_review_avg": [ 653.6, 597.6311906184281 ], "wc_review_avg": [ 781.6, 623.8556243234488 ], "wc_reply_reviewers_avg": [ 292.6, 
308.59008409214965 ], "wc_reply_authors_avg": [ 2418.8, 1826.8985084016026 ], "reply_reviewers_avg": [ 1.4, 1.019803902718557 ], "reply_authors_avg": [ 3.6, 2.5768197453450252 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.2758386421836853, "corr_recommendation_correctness": 0.9063269671749656, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4900391981513067748&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Higher School of Economics;Yandex", "aff_unique_dep": ";;", "aff_unique_url": "https://mbzuai.ac.ae;https://hse.ru;https://yandex.com", "aff_unique_abbr": "MBZUAI;HSE;Yandex", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Arab Emirates;Russian Federation" }, { "title": "Learning Value Functions from Undirected State-only Experience", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6973", "id": "6Pe99Juo9gd", "poster": "", "openreview": "https://openreview.net/forum?id=6Pe99Juo9gd", "slides": "https://iclr.cc/virtual/2022/poster/6973", "video": "https://iclr.cc/virtual/2022/poster/6973", "author_site": "Matthew Chang, Arjun Gupta, Saurabh Gupta", "tldr": "", "abstract": "This paper tackles the problem of learning value functions from undirected state-only experience (state transitions without action labels i.e. (s,s',r) tuples). We first theoretically characterize the applicability of Q-learning in this setting. We show that tabular Q-learning in discrete Markov decision processes (MDPs) learns the same value function under any arbitrary refinement of the action space. This theoretical result motivates the design of Latent Action Q-learning or LAQ, an offline RL method that can learn effective value functions from state-only experience. Latent Action Q-learning (LAQ) learns value functions using Q-learning on discrete latent actions obtained through a latent-variable future prediction model. We show that LAQ can recover value functions that have high correlation with value functions learned using ground truth actions. Value functions learned using LAQ lead to sample efficient acquisition of goal-directed behavior, can be used with domain-specific low-level controllers, and facilitate transfer across embodiments. 
Our experiments in 5 environments ranging from 2D grid world to 3D visual navigation in realistic environments demonstrate the benefits of LAQ over simpler alternatives, imitation learning oracles, and competing methods.", "keywords": "Reinforcement Learning;Offline RL;Offline RL without actions", "primary_area": "", "supplementary_material": "", "author": "Matthew Chang;Arjun Gupta;Saurabh Gupta", "authorids": "~Matthew_Chang1;~Arjun_Gupta1;~Saurabh_Gupta1", "gender": "M;;", "homepage": "https://matthewchang.github.io/;https://arjung128.github.io;http://saurabhg.web.illinois.edu", "dblp": "56/2174;251/4823;06/5843-1", "google_scholar": "lx-5mjUAAAAJ;qrqim7kAAAAJ;1HO5UacAAAAJ", "orcid": ";;", "linkedin": "matthew-chang-1976b8136/;;", "or_profile": "~Matthew_Chang1;~Arjun_Gupta1;~Saurabh_Gupta1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu;illinois.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nchang2022learning,\ntitle={Learning Value Functions from Undirected State-only Experience},\nauthor={Matthew Chang and Arjun Gupta and Saurabh Gupta},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6Pe99Juo9gd}\n}", "github": "", "project": "", "reviewers": "kJe7;cKHF;YoH4;Fa3L", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "63;89;83;162", "wc_summary_review": "66;71;131;15", "wc_main_review": "307;747;803;626", "wc_review": "436;907;1017;803", "wc_reply_reviewers": "0;0;224;0", "wc_reply_authors": "613;1166;1457;487", "reply_reviewers": "0;0;2;0", "reply_authors": "1;2;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 99.25, 37.485830656396026 ], "wc_summary_review_avg": [ 70.75, 41.111890007636475 ], "wc_main_review_avg": [ 620.75, 192.10722917162695 ], "wc_review_avg": [ 790.75, 218.34648497285227 ], "wc_reply_reviewers_avg": [ 56.0, 96.99484522385713 ], "wc_reply_authors_avg": [ 930.75, 396.9133249211974 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.20751433915982243, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12984165017700511290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=6Pe99Juo9gd", "email": "illinois.edu;illinois.edu;illinois.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "6PlIkYUK9As", "title": "Less data is more: Selecting informative and diverse subsets with balancing constraints", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning has yielded extraordinary results in vision and natural 
language processing, but this achievement comes at a cost. Most models require enormous resources during training, both in terms of computation and in human labeling effort. We show that we can identify informative and diverse subsets of data that lead to deep learning models with similar performance as the ones trained with the original dataset. Prior methods have exploited diversity and uncertainty in submodular objective functions for choosing subsets. In addition to these measures, we show that balancing constraints on predicted class labels and decision boundaries are beneficial. We propose a novel formulation of these constraints using matroids, an algebraic structure that generalizes linear independence in vector spaces, and present an efficient greedy algorithm with constant approximation guarantees. We outperform competing baselines on standard classification datasets such as CIFAR-10, CIFAR-100, ImageNet, as well as long-tailed datasets such as CIFAR-100-LT.", "keywords": "subset selection;approximation algorithm;active learning;efficient training", "primary_area": "", "supplementary_material": "", "author": "Srikumar Ramalingam;Daniel Glasner;Kaushal Patel;Raviteja Vemulapalli;Sadeep Jayasumana;Sanjiv Kumar", "authorids": "~Srikumar_Ramalingam2;~Daniel_Glasner2;khpatel@google.com;~Raviteja_Vemulapalli1;~Sadeep_Jayasumana1;~Sanjiv_Kumar1", "gender": "M;M;;M;;", "homepage": "https://www.cs.utah.edu/~srikumar/;https://sites.google.com/site/dglasner/;;http://ravitejav.weebly.com/;;http://www.sanjivk.com/", "dblp": "17/4216;28/1971;;135/4940;;", "google_scholar": "6m1ptOgAAAAJ;w0OodaEAAAAJ;;0OFqm7YAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": "srikumar-ramalingam-17728b22/;;;raviteja-vemulapalli-85146113?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=ios_app;;", "or_profile": "~Srikumar_Ramalingam2;~Daniel_Glasner2;khpatel@google.com;~Raviteja_Vemulapalli1;~Sadeep_Jayasumana1;~Sanjiv_Kumar1", "aff": "Google;Google;;Google;;Google", "aff_domain": "google.com;google.com;;google.com;;google.com", "position": "Research Scientist;Research Software Engineer;;Research Scientist;;Research Scientist", "bibtex": "@misc{\nramalingam2022less,\ntitle={Less data is more: Selecting informative and diverse subsets with balancing constraints},\nauthor={Srikumar Ramalingam and Daniel Glasner and Kaushal Patel and Raviteja Vemulapalli and Sadeep Jayasumana and Sanjiv Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=6PlIkYUK9As}\n}", "github": "", "project": "", "reviewers": "yngU;4ATq;oEcZ;Cp2k", "site": "https://openreview.net/forum?id=6PlIkYUK9As", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "101;127;61;80", "wc_summary_review": "76;91;38;45", "wc_main_review": "372;847;287;555", "wc_review": "549;1065;386;680", "wc_reply_reviewers": "141;0;48;0", "wc_reply_authors": "661;1156;774;665", "reply_reviewers": "1;0;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 92.25, 24.5496944991175 ], "wc_summary_review_avg": [ 62.5, 21.80022935659164 ], "wc_main_review_avg": [ 515.25, 214.62569161216464 ], "wc_review_avg": [ 670.0, 250.7099918232219 ], "wc_reply_reviewers_avg": [ 47.25, 
57.56463758246029 ], "wc_reply_authors_avg": [ 814.0, 202.59195443057456 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4639359172999782488&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Exposing the Implicit Energy Networks behind Masked Language Models via Metropolis--Hastings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6637", "id": "6PvWo1kEvlT", "poster": "", "openreview": "https://openreview.net/forum?id=6PvWo1kEvlT", "slides": "https://iclr.cc/virtual/2022/poster/6637", "video": "https://iclr.cc/virtual/2022/poster/6637", "author_site": "Kartik Goyal, Chris Dyer, Taylor Berg-Kirkpatrick", "tldr": "", "abstract": "While recent work has shown that scores from models trained by the ubiquitous masked language modeling (MLM) objective effectively discriminate probable from improbable sequences, it is still an open question if these MLMs specify a principled probability distribution over the space of possible sequences. In this paper, we interpret MLMs as energy-based sequence models and propose two energy parametrizations derivable from the trained MLMs. In order to draw samples correctly from these models, we develop a tractable sampling scheme based on the Metropolis--Hastings Monte Carlo algorithm. In our approach, samples are proposed from the same masked conditionals used for training the masked language models, and they are accepted or rejected based on their energy values according to the target distribution. We validate the effectiveness of the proposed parametrizations by exploring the quality of samples drawn from these energy-based models for both open-ended unconditional generation and a conditional generation task of machine translation. 
We theoretically and empirically justify our sampling algorithm by showing that the masked conditionals on their own do not yield a Markov chain whose stationary distribution is that of our target distribution, and our approach generates higher quality samples than other recently proposed undirected generation approaches (Wang et al., 2019, Ghazvininejad et al., 2019).", "keywords": "Masked Language Models;Energy-based models;Metropolis Hastings Monte Carlo;Bidirectional Sequence models", "primary_area": "", "supplementary_material": "", "author": "Kartik Goyal;Chris Dyer;Taylor Berg-Kirkpatrick", "authorids": "~Kartik_Goyal1;~Chris_Dyer1;~Taylor_Berg-Kirkpatrick1", "gender": "M;M;M", "homepage": "https://kartikgo.github.io/;http://www.cs.cmu.edu/~cdyer/;https://cseweb.ucsd.edu/~tberg/", "dblp": "136/8676;41/6895;22/8160", "google_scholar": "Lz6-_iIAAAAJ;W2DsnAkAAAAJ;mN6_BKAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Kartik_Goyal1;~Chris_Dyer1;~Taylor_Berg-Kirkpatrick1", "aff": "Toyota Technological Institute at Chicago;Google DeepMind;University of California, San Diego", "aff_domain": "ttic.edu;google.com;ucsd.edu", "position": "Postdoc;Research scientist;Assistant Professor", "bibtex": "@inproceedings{\ngoyal2022exposing,\ntitle={Exposing the Implicit Energy Networks behind Masked Language Models via Metropolis--Hastings},\nauthor={Kartik Goyal and Chris Dyer and Taylor Berg-Kirkpatrick},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6PvWo1kEvlT}\n}", "github": "", "project": "", "reviewers": "cuUN;zwFH;dvua;dqpQ", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;4;5", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "72;107;65;442", "wc_summary_review": "2;40;21;75", "wc_main_review": "395;216;112;554", "wc_review": "469;363;198;1071", "wc_reply_reviewers": "717;0;0;0", "wc_reply_authors": "2395;333;65;331", "reply_reviewers": "4;0;0;0", "reply_authors": "7;1;1;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 171.5, 156.98168683002487 ], "wc_summary_review_avg": [ 34.5, 26.9675731203236 ], "wc_main_review_avg": [ 319.25, 169.15876418323705 ], "wc_review_avg": [ 525.25, 329.5545288719304 ], "wc_reply_reviewers_avg": [ 179.25, 310.47010725672123 ], "wc_reply_authors_avg": [ 781.0, 938.1972074143048 ], "reply_reviewers_avg": [ 1.0, 1.7320508075688772 ], "reply_authors_avg": [ 2.5, 2.598076211353316 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49374193110101877, "corr_recommendation_correctness": 0.9169493006161777, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15443775486703005042&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=6PvWo1kEvlT", "email": "ttic.edu;google.com;ucsd.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Toyota Technological Institute at Chicago;Google;University of California, San Diego", "aff_unique_dep": ";Google DeepMind;", "aff_unique_url": "https://www.tti-chicago.org;https://deepmind.com;https://www.ucsd.edu", "aff_unique_abbr": "TTI Chicago;DeepMind;UCSD", "aff_campus_unique_index": "0;2", "aff_campus_unique": 
"Chicago;;San Diego", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Pseudo-Labeled Auto-Curriculum Learning for Semi-Supervised Keypoint Localization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6943", "id": "6Q52pZ-Th7N", "poster": "", "openreview": "https://openreview.net/forum?id=6Q52pZ-Th7N", "slides": "https://iclr.cc/virtual/2022/poster/6943", "video": "https://iclr.cc/virtual/2022/poster/6943", "author_site": "Can Wang, Sheng Jin, Yingda Guan, Wentao Liu, Chen Qian, Ping Luo, Wanli Ouyang", "tldr": "", "abstract": "Localizing keypoints of an object is a basic visual problem. However, supervised learning of a keypoint localization network often requires a large amount of data, which is expensive and time-consuming to obtain. To remedy this, there is an ever-growing interest in semi-supervised learning (SSL), which leverages a small set of labeled data along with a large set of unlabeled data. Among these SSL approaches, pseudo-labeling (PL) is one of the most popular. PL approaches apply pseudo-labels to unlabeled data, and then train the model with a combination of the labeled and pseudo-labeled data iteratively. The key to the success of PL is the selection of high-quality pseudo-labeled samples. Previous works mostly select training samples by manually setting a single confidence threshold. We propose to automatically select reliable pseudo-labeled samples with a series of dynamic thresholds, which constitutes a learning curriculum.Extensive experiments on five keypoint localization benchmark datasets demonstrate that the proposed approach significantly outperforms the previous state-of-the-art SSL approaches. ", "keywords": "Keypoint Localization;Semi-Supervised Learning;Curriculum Learning", "primary_area": "", "supplementary_material": "", "author": "Can Wang;Sheng Jin;Yingda Guan;Wentao Liu;Chen Qian;Ping Luo;Wanli Ouyang", "authorids": "~Can_Wang6;~Sheng_Jin1;~Yingda_Guan1;~Wentao_Liu1;~Chen_Qian1;~Ping_Luo2;~Wanli_Ouyang1", "gender": "M;M;F;M;M;;", "homepage": ";https://jin-s13.github.io/;;;;;", "dblp": ";70/6780-7;;30/3943-2;;;", "google_scholar": "y3_URHYAAAAJ;wrNd--oAAAAJ;s5VDmlQAAAAJ;KZn9NWEAAAAJ;AerkT0YAAAAJ;;", "orcid": ";0000-0001-5736-7434;;;;;", "linkedin": ";;;;;;", "or_profile": "~Can_Wang6;~Sheng_Jin1;~Yingda_Guan1;~Wentao_Liu1;~Chen_Qian1;~Ping_Luo2;~Wanli_Ouyang1", "aff": ";The University of Hong Kong;;Sensetime;Tsinghua University;;", "aff_domain": ";hku.hk;;sensetime.com;mails.tsinghua.edu.cn;;", "position": ";PhD student;;Senior Researcher;PhD student;;", "bibtex": "@inproceedings{\nwang2022pseudolabeled,\ntitle={Pseudo-Labeled Auto-Curriculum Learning for Semi-Supervised Keypoint Localization},\nauthor={Can Wang and Sheng Jin and Yingda Guan and Wentao Liu and Chen Qian and Ping Luo and Wanli Ouyang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6Q52pZ-Th7N}\n}", "github": "", "project": "", "reviewers": "UzKw;6dEq;QPpV", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;3;4", "correctness": "4;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "153;65;65", "wc_summary_review": "65;36;83", "wc_main_review": "317;458;222", "wc_review": "535;559;370", "wc_reply_reviewers": "5;0;9", "wc_reply_authors": "1032;1252;689", "reply_reviewers": "1;0;1", "reply_authors": "2;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], 
"confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 94.33333333333333, 41.48359782961079 ], "wc_summary_review_avg": [ 61.333333333333336, 19.362047641943477 ], "wc_main_review_avg": [ 332.3333333333333, 96.95474316516042 ], "wc_review_avg": [ 488.0, 84.01190391843289 ], "wc_reply_reviewers_avg": [ 4.666666666666667, 3.6817870057290873 ], "wc_reply_authors_avg": [ 991.0, 231.6649880035105 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.18898223650461363, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6851653134571170152&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=6Q52pZ-Th7N", "email": ";hku.hk;;sensetime.com;mails.tsinghua.edu.cn;;", "author_num": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Hong Kong;SenseTime;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hku.hk;https://www.sensetime.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HKU;SenseTime;THU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "6Q5RdltG3L", "title": "Human imperceptible attacks and applications to improve fairness", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Modern neural networks are able to perform at least as well as humans in numerous tasks involving object classification and image generation. However, there is also evidence that perturbations which are imperceptible to humans may significantly degrade the performance of well-trained deep neural networks. We provide a Distributionally Robust Optimization (DRO) framework which integrates human-based image quality assessment methods to design optimal attacks that are imperceptible to humans but significantly damaging to deep neural networks. Our attack algorithm can generate better-quality (less perceptible to humans) attacks than other state-of-the-art human imperceptible attack methods. We provide an algorithmic implementation of independent interest which can speed up DRO training significantly. 
Finally, we demonstrate how the use of optimally designed human imperceptible attacks can improve group fairness in image classification while maintaining a similar accuracy.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/7e535abc9df925a9a3e77b6e6954af7de4ca2d6d.zip", "author": "Xinru Hua;Huanzhong Xu;Jose Blanchet;Viet Anh Nguyen", "authorids": "~Xinru_Hua1;~Huanzhong_Xu1;~Jose_Blanchet1;~Viet_Anh_Nguyen2", "gender": "F;M;M;M", "homepage": ";https://icme.stanford.edu/person/huanzhong-xu;https://web.stanford.edu/~jblanche/;http://www.vietanhnguyen.net", "dblp": ";;75/5093.html;", "google_scholar": "6V5aaYEAAAAJ;;https://scholar.google.co.in/citations?user=O24CcQQAAAAJ;3iyf-EoAAAAJ", "orcid": ";;;", "linkedin": ";;jose-blanchet;", "or_profile": "~Xinru_Hua1;~Huanzhong_Xu1;~Jose_Blanchet1;~Viet_Anh_Nguyen2", "aff": "Stanford University;Stanford University;Stanford University;VinAI Research, Vietnam", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;vinai.io", "position": "PhD student;PhD student;Professor;Research Scientist", "bibtex": "@misc{\nhua2022human,\ntitle={Human imperceptible attacks and applications to improve fairness},\nauthor={Xinru Hua and Huanzhong Xu and Jose Blanchet and Viet Anh Nguyen},\nyear={2022},\nurl={https://openreview.net/forum?id=6Q5RdltG3L}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=6Q5RdltG3L", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12104209345417038011&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Stanford University;VinAI Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.vin.ai", "aff_unique_abbr": "Stanford;VinAI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Vietnam" }, { "id": "6Qvjzr2VGLl", "title": "Towards Generative Latent Variable Models for Speech", "track": "main", "status": "Reject", "tldr": "", "abstract": "While stochastic latent variable models (LVMs) now achieve state-of-the-art performance on natural image generation, they are still inferior to deterministic models on speech. On natural images, these models have been parameterised with very deep hierarchies of latent variables, but research shows that these model constructs are not directly applicable to sequence data. In this paper, we benchmark popular temporal LVMs against state-of-the-art deterministic models on speech. 
We report the likelihood, which is a much used metric in the image domain but rarely, and often incomparably, reported for speech models. This is prerequisite work needed for the research community to improve LVMs on speech. We adapt Clockwork VAE, a state-of-the-art temporal LVM for video generation, to the speech domain, similar to how WaveNet adapted PixelCNN from images to speech. Despite being autoregressive only in latent space, we find that the Clockwork VAE outperforms previous LVMs and reduces the gap to deterministic models by using a hierarchy of latent variables.", "keywords": "hierarchical temporal latent variable models;generative speech modelling;variational autoencoder", "primary_area": "", "supplementary_material": "", "author": "Jakob Drachmann Havtorn;Lasse Borgholt;Jes Frellsen;S\u00f8ren Hauberg;Lars Maal\u00f8e", "authorids": "~Jakob_Drachmann_Havtorn1;~Lasse_Borgholt1;~Jes_Frellsen1;~S\u00f8ren_Hauberg1;~Lars_Maal\u00f8e1", "gender": "M;M;M;M;M", "homepage": "https://github.com/JakobHavtorn;;https://frellsen.org;http://www2.compute.dtu.dk/~sohau/;https://www.linkedin.com/in/larsmaaloe/?originalSubdomain=dk", "dblp": "264/5083;169/3233;83/8247;39/7226;157/8377", "google_scholar": "azrs0T8AAAAJ;;Yj2sBWkAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.dk/citations?user=bNmv8vsAAAAJ", "orcid": "0000-0002-4849-0817;0000-0002-3562-8442;0000-0001-9224-1271;;", "linkedin": "jakobhavtorn/;lasse-borgholt-14771133/;frellsen/;;", "or_profile": "~Jakob_Drachmann_Havtorn1;~Lasse_Borgholt1;~Jes_Frellsen1;~S\u00f8ren_Hauberg1;~Lars_Maal\u00f8e1", "aff": "Corti;University of Copenhagen;Technical University of Denmark;Technical University of Denmark;Technical University of Denmark", "aff_domain": "corti.ai;diku.dk;dtu.dk;dtu.dk;dtu.dk", "position": "Principal Researcher;PhD student;Associate Professor;Professor;Associate Professor", "bibtex": "@misc{\nhavtorn2022towards,\ntitle={Towards Generative Latent Variable Models for Speech},\nauthor={Jakob Drachmann Havtorn and Lasse Borgholt and Jes Frellsen and S{\\o}ren Hauberg and Lars Maal{\\o}e},\nyear={2022},\nurl={https://openreview.net/forum?id=6Qvjzr2VGLl}\n}", "github": "", "project": "", "reviewers": "3vU6;5Ydq;q4Q4", "site": "https://openreview.net/forum?id=6Qvjzr2VGLl", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;3", "correctness": "2;4;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "55;46;146", "wc_summary_review": "14;30;114", "wc_main_review": "130;208;179", "wc_review": "199;284;439", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "849;439;393", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 82.33333333333333, 45.16881913690264 ], "wc_summary_review_avg": [ 52.666666666666664, 43.8583578757294 ], "wc_main_review_avg": [ 172.33333333333334, 32.190405748020986 ], "wc_review_avg": [ 307.3333333333333, 99.35905707192587 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 560.3333333333334, 204.9802158475029 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7559289460184545, 
"corr_recommendation_correctness": 0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kmt-iNV1EDwJ:scholar.google.com/&scioq=Towards+Generative+Latent+Variable+Models+for+Speech&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Corti;University of Copenhagen;Technical University of Denmark", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.ku.dk;https://www.tek.dk", "aff_unique_abbr": ";UCPH;DTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1;1", "aff_country_unique": ";Denmark" }, { "title": "Programmatic Reinforcement Learning without Oracles", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6744", "id": "6Tk2noBdvxt", "poster": "", "openreview": "https://openreview.net/forum?id=6Tk2noBdvxt", "slides": "https://iclr.cc/virtual/2022/poster/6744", "video": "https://iclr.cc/virtual/2022/poster/6744", "author_site": "Wenjie Qiu, He Zhu", "tldr": "", "abstract": "Deep reinforcement learning (RL) has led to encouraging successes in many challenging control tasks. However, a deep RL model lacks interpretability due to the difficulty of identifying how the model's control logic relates to its network structure. Programmatic policies structured in more interpretable representations emerge as a promising solution. Yet two shortcomings remain: First, synthesizing programmatic policies requires optimizing over the discrete and non-differentiable search space of program architectures. Previous works are suboptimal because they only enumerate program architectures greedily guided by a pretrained RL oracle. Second, these works do not exploit compositionality, an important programming concept, to reuse and compose primitive functions to form a complex function for new tasks. Our first contribution is a programmatically interpretable RL framework that conducts program architecture search on top of a continuous relaxation of the architecture space defined by programming language grammar rules. Our algorithm allows policy architectures to be learned with policy parameters via bilevel optimization using efficient policy-gradient methods, and thus does not require a pretrained oracle. Our second contribution is improving programmatic policies to support compositionality by integrating primitive functions learned to grasp task-agnostic skills as a composite program to solve novel RL problems. Experiment results demonstrate that our algorithm excels in discovering optimal programmatic policies that are highly interpretable. 
The code of this work is available at https://github.com/RU-Automated-Reasoning-Group/pi-PRL.", "keywords": "Reinforcement Learning;Programmatic Reinforcement Learning;Compositional Reinforcement Learning;Program Synthesis;Differentiable Architecture Search", "primary_area": "", "supplementary_material": "", "author": "Wenjie Qiu;He Zhu", "authorids": "~Wenjie_Qiu1;~He_Zhu4", "gender": "M;M", "homepage": "https://github.com/Roadsong;https://herowanzhu.github.io", "dblp": "120/1151-2;59/2802-1", "google_scholar": "sc4btRMAAAAJ;3X9GC2gAAAAJ", "orcid": "0000-0002-2271-6443;", "linkedin": "qiuwenjie/;", "or_profile": "~Wenjie_Qiu1;~He_Zhu4", "aff": "Rutgers University;Rutgers University", "aff_domain": "rutgers.edu;rutgers.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nqiu2022programmatic,\ntitle={Programmatic Reinforcement Learning without Oracles},\nauthor={Wenjie Qiu and He Zhu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6Tk2noBdvxt}\n}", "github": "", "project": "", "reviewers": "xiEe;iSsz;R2Gf", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "5;4;4", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "32;151;148", "wc_summary_review": "72;112;37", "wc_main_review": "932;477;707", "wc_review": "1036;740;892", "wc_reply_reviewers": "65;0;49", "wc_reply_authors": "2241;1749;2035", "reply_reviewers": "2;0;1", "reply_authors": "4;3;4", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 110.33333333333333, 55.40356988097026 ], "wc_summary_review_avg": [ 73.66666666666667, 30.64129385141706 ], "wc_main_review_avg": [ 705.3333333333334, 185.75671066089524 ], "wc_review_avg": [ 889.3333333333334, 120.85620473199637 ], "wc_reply_reviewers_avg": [ 38.0, 27.65260686927485 ], "wc_reply_authors_avg": [ 2008.3333333333333, 201.74130850065276 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15044719869409050316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=6Tk2noBdvxt", "email": "rutgers.edu;rutgers.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learnability Lock: Authorized Learnability Control Through Adversarial Invertible Transformations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6516", "id": "6VpeS27viTq", "poster": "", "openreview": "https://openreview.net/forum?id=6VpeS27viTq", "slides": "https://iclr.cc/virtual/2022/poster/6516", "video": "https://iclr.cc/virtual/2022/poster/6516", "author_site": "Weiqi Peng, Jinghui Chen", "tldr": "", "abstract": "Owing much to the revolution of information technology, recent progress of deep learning benefits 
incredibly from the vastly enhanced access to data available in various digital formats. Yet this publicly accessible information also raises a fundamental issue concerning Intellectual Property, that is, how to precisely control legal or illegal exploitation of a dataset for training commercial models. To tackle this issue, this paper introduces and investigates a new concept called ''learnability lock'' for securing the process of data authorization. In particular, we propose an adversarial invertible transformation, which can be viewed as a mapping from image to image, to encrypt data samples so that they become ''unlearnable'' by machine learning models with negligible loss of visual features. Meanwhile, authorized clients can use a specific key to unlock the learnability of the protected dataset and train models normally. The proposed learnability lock leverages class-wise perturbation that applies a universal transformation function on data samples of the same label. This ensures that the learnability can be easily restored with a simple inverse transformation while remaining difficult to detect or reverse-engineer. We empirically demonstrate the success and practicability of our method on visual classification tasks. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weiqi Peng;Jinghui Chen", "authorids": "~Weiqi_Peng1;~Jinghui_Chen1", "gender": "M;M", "homepage": "https://rickypeng012.com/;https://jinghuichen.github.io/", "dblp": ";67/5633", "google_scholar": ";mKia7Y4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Weiqi_Peng1;~Jinghui_Chen1", "aff": "Yale University;Pennsylvania State University", "aff_domain": "yale.edu;psu.edu", "position": "MS student;Assistant Professor", "bibtex": "@inproceedings{\npeng2022learnability,\ntitle={Learnability Lock: Authorized Learnability Control Through Adversarial Invertible Transformations},\nauthor={Weiqi Peng and Jinghui Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6VpeS27viTq}\n}", "github": "", "project": "", "reviewers": "BPLq;nYcf;ZETA;ZNq5", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "140;108;131;120", "wc_summary_review": "74;32;46;56", "wc_main_review": "539;256;228;542", "wc_review": "753;396;405;718", "wc_reply_reviewers": "11;122;15;223", "wc_reply_authors": "1068;664;384;920", "reply_reviewers": "1;1;1;2", "reply_authors": "4;3;2;5", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 124.75, 11.986972094736853 ], "wc_summary_review_avg": [ 52.0, 15.297058540778355 ], "wc_main_review_avg": [ 391.25, 149.58170844057105 ], "wc_review_avg": [ 568.0, 167.9866066089794 ], "wc_reply_reviewers_avg": [ 92.75, 87.39100354155455 ], "wc_reply_authors_avg": [ 759.0, 260.3132728079765 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5140896885149581095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf":
"https://openreview.net/pdf?id=6VpeS27viTq", "email": "yale.edu;psu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Yale University;Pennsylvania State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.psu.edu", "aff_unique_abbr": "Yale;PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Demystifying Batch Normalization in ReLU Networks: Equivalent Convex Optimization Models and Implicit Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7099", "id": "6XGgutacQ0B", "poster": "", "openreview": "https://openreview.net/forum?id=6XGgutacQ0B", "slides": "https://iclr.cc/virtual/2022/poster/7099", "video": "https://iclr.cc/virtual/2022/poster/7099", "author_site": "Tolga Ergen, Arda Sahiner, Batu Ozturkler, John M Pauly, Morteza Mardani, Mert Pilanci", "tldr": "", "abstract": "Batch Normalization (BN) is a commonly used technique to accelerate and stabilize training of deep neural networks. Despite its empirical success, a full theoretical understanding of BN is yet to be developed. In this work, we analyze BN through the lens of convex optimization. We introduce an analytic framework based on convex duality to obtain exact convex representations of weight-decay regularized ReLU networks with BN, which can be trained in polynomial-time. Our analyses also show that optimal layer weights can be obtained as simple closed-form formulas in the high-dimensional and/or overparameterized regimes. Furthermore, we find that Gradient Descent provides an algorithmic bias effect on the standard non-convex BN network, and we design an approach to explicitly encode this implicit regularization into the convex objective. Experiments with CIFAR image classification highlight the effectiveness of this explicit regularization for mimicking and substantially improving the performance of standard BN networks. ", "keywords": "batch normalization;ReLU networks;deep networks;convex optimization;whitening;implicit regularization;algorithmic bias", "primary_area": "", "supplementary_material": "/attachment/b7b0f3ee814f6eaac63a4651b0807b370e6a625e.zip", "author": "Tolga Ergen;Arda Sahiner;Batu Ozturkler;John M. 
Pauly;Morteza Mardani;Mert Pilanci", "authorids": "~Tolga_Ergen1;~Arda_Sahiner1;~Batu_Ozturkler1;~John_M._Pauly1;~Morteza_Mardani1;~Mert_Pilanci3", "gender": "M;M;;M;M;M", "homepage": "https://tolgaergen.github.io/;http://web.stanford.edu/~sahiner/;https://batuozt.github.io;http://www.stanford.edu/~pauly;http://web.stanford.edu/~morteza/;https://stanford.edu/~pilanci/", "dblp": "202/7477.html;264/6371;281/6970;95/6728;74/258;45/8056", "google_scholar": "https://scholar.google.com.tr/citations?user=T1pWaCsAAAAJ;723GIZQAAAAJ;O_tiFfoAAAAJ;Fc6GIIQAAAAJ;H7edsyEAAAAJ;aSAS-aAAAAAJ", "orcid": "0000-0003-4806-0224;;;;;", "linkedin": ";;;john-pauly-69805911/;;mert-pilanci-ba615743/", "or_profile": "~Tolga_Ergen1;~Arda_Sahiner1;~Batu_Ozturkler1;~John_M._Pauly1;~Morteza_Mardani1;~Mert_Pilanci3", "aff": "Stanford University;Stanford University;Microsoft;;NVIDIA;Stanford University", "aff_domain": "stanford.edu;stanford.edu;microsoft.com;;nvidia.com;stanford.edu", "position": "PhD student;PhD student;Intern;;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nergen2022demystifying,\ntitle={Demystifying Batch Normalization in Re{LU} Networks: Equivalent Convex Optimization Models and Implicit Regularization},\nauthor={Tolga Ergen and Arda Sahiner and Batu Ozturkler and John M. Pauly and Morteza Mardani and Mert Pilanci},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6XGgutacQ0B}\n}", "github": "", "project": "", "reviewers": "syyN;YR4A;pM4k", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "3;3;4", "correctness": "2;3;4", "technical_novelty": "4;3;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "45;184;27", "wc_summary_review": "71;17;50", "wc_main_review": "247;256;299", "wc_review": "363;457;376", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 85.33333333333333, 70.15379929523101 ], "wc_summary_review_avg": [ 46.0, 22.22611077089287 ], "wc_main_review_avg": [ 267.3333333333333, 22.69116323349001 ], "wc_review_avg": [ 398.6666666666667, 41.58792559812951 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4409880585671897334&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=6XGgutacQ0B", "email": "stanford.edu;stanford.edu;microsoft.com;;nvidia.com;stanford.edu", "author_num": 6, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Stanford University;Microsoft;NVIDIA", "aff_unique_dep": ";Microsoft Corporation;NVIDIA Corporation", "aff_unique_url": "https://www.stanford.edu;https://www.microsoft.com;https://www.nvidia.com", "aff_unique_abbr": "Stanford;Microsoft;NVIDIA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": 
"Multi-Mode Deep Matrix and Tensor Factorization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6972", "id": "6YVIk0sAkF_", "poster": "", "openreview": "https://openreview.net/forum?id=6YVIk0sAkF_", "slides": "https://iclr.cc/virtual/2022/poster/6972", "video": "https://iclr.cc/virtual/2022/poster/6972", "tldr": "", "abstract": "Recently, deep linear and nonlinear matrix factorizations gain increasing attention in the area of machine learning. Existing deep nonlinear matrix factorization methods can only exploit partial nonlinearity of the data and are not effective in handling matrices of which the number of rows is comparable to the number of columns. On the other hand, there is still a gap between deep learning and tensor decomposition. This paper presents a framework of multi-mode deep matrix and tensor factorizations to explore and exploit the full nonlinearity of the data in matrices and tensors. We use the factorization methods to solve matrix and tensor completion problems and prove that our methods have tighter generalization error bounds than conventional matrix and tensor factorization methods. The experiments on synthetic data and real datasets showed that the proposed methods have much higher recovery accuracy than many baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jicong Fan", "authorids": "~Jicong_Fan2", "gender": "M", "homepage": "https://jicongfan.github.io/", "dblp": "139/1570", "google_scholar": "vdJsnhIAAAAJ", "orcid": "0000-0001-9665-0355", "linkedin": "", "or_profile": "~Jicong_Fan2", "aff": "The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn", "position": "Research Assistant Professor", "bibtex": "@inproceedings{\nfan2022multimode,\ntitle={Multi-Mode Deep Matrix and Tensor Factorization},\nauthor={Jicong Fan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6YVIk0sAkF_}\n}", "github": "", "project": "", "reviewers": "26KD;jdoi;8BDW;BPJG", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;2", "correctness": "4;3;4;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "105;89;84;97", "wc_summary_review": "124;45;23;48", "wc_main_review": "130;224;145;275", "wc_review": "359;358;252;420", "wc_reply_reviewers": "0;108;0;0", "wc_reply_authors": "0;1186;372;678", "reply_reviewers": "0;1;0;0", "reply_authors": "0;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 93.75, 7.980444849756184 ], "wc_summary_review_avg": [ 60.0, 38.19031290785662 ], "wc_main_review_avg": [ 193.5, 59.06987387831465 ], "wc_review_avg": [ 347.25, 60.45401144671874 ], "wc_reply_reviewers_avg": [ 27.0, 46.76537180435969 ], "wc_reply_authors_avg": [ 559.0, 434.37886688926295 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.7071067811865476 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": -0.6882472016116854, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=95029945055266339&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=6YVIk0sAkF_", "email": 
"cuhk.edu.cn", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.cn", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "6g4VoBTaq6I", "title": "A Variance Reduction Method for Neural-based Divergence Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "A central problem in machine learning is the computation of similarity or closeness between two (data) distributions. The applications span from generative modelling via adversarial training, representation learning, and robustness in out-of-distribution settings, to name a few. A palette of divergences, mutual information, and integral probability metrics are indispensable tools for measuring the ``distance'' between distributions and these are made tractable in high dimensional settings through variational representation formulas. Indeed, such formulas transform an estimation problem into an optimization problem. Unfortunately, the approximation of expectations that are inherent in variational formulas by statistical averages can be problematic due to high statistical variance, e.g., exponential for the Kullback-Leibler divergence and certain estimators. In this paper, we propose a new variance penalty term that acts directly on the variance of each component of the statistical estimator. The power of the variance penalty is controlled by a penalty coefficient which trades off bias and variance. We tested the proposed approach on several variational formulas and synthetic examples and showed that the overall error is decreased about an order of magnitude relative to the baseline statistical estimator. Impressive results are obtained for R\\'enyi divergence with large order values due to the improved stability of the proposed estimator. Furthermore, in real biological datasets we are able to detect very rare sub-populations with a moderate sample size. Finally, we obtain improved (in terms of objective measures) disentangled representation of speech signals into text, speaker, and style components via variance-penalized mutual information minimization.", "keywords": "Divergence estimation;Variational formulas;Variance reduction;Representation learning", "primary_area": "", "supplementary_material": "/attachment/915474d137235f665a81e6706d73e68200c4a411.zip", "author": "Jeremiah Birrell;Markos A. 
Katsoulakis;Yannis Pantazis;Dipjyoti Paul;Anastasios Tsourtis", "authorids": "~Jeremiah_Birrell1;markos@math.umass.edu;~Yannis_Pantazis1;~Dipjyoti_Paul1;tsourtis@iacm.forth.gr", "gender": "M;;M;M;", "homepage": "https://www.researchgate.net/profile/Jeremiah-Birrell;;https://sites.google.com/site/yannispantazis/;https://dipjyoti92.github.io/;", "dblp": ";;;;", "google_scholar": "R60hJGUAAAAJ;;https://scholar.google.gr/citations?user=MypIGOYAAAAJ;https://scholar.google.co.in/citations?user=l1U9lcwAAAAJ;", "orcid": ";;0000-0002-2009-7562;;", "linkedin": ";;;dipjyoti-paul-85221096/;", "or_profile": "~Jeremiah_Birrell1;markos@math.umass.edu;~Yannis_Pantazis1;~Dipjyoti_Paul1;tsourtis@iacm.forth.gr", "aff": "University of Massachusetts, Amherst;;Foundation for Research and Technology - Hellas;University of Crete, University of Crete;", "aff_domain": "umass.edu;;forth.gr;csd.uoc.gr;", "position": "Postdoc;;Researcher;PhD student;", "bibtex": "@misc{\nbirrell2022a,\ntitle={A Variance Reduction Method for Neural-based Divergence Estimation},\nauthor={Jeremiah Birrell and Markos A. Katsoulakis and Yannis Pantazis and Dipjyoti Paul and Anastasios Tsourtis},\nyear={2022},\nurl={https://openreview.net/forum?id=6g4VoBTaq6I}\n}", "github": "", "project": "", "reviewers": "3SLV;JtE9;KwzF;gwrQ", "site": "https://openreview.net/forum?id=6g4VoBTaq6I", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "4;4;3;3", "correctness": "2;3;4;1", "technical_novelty": "2;2;3;1", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "66;107;76;14", "wc_summary_review": "33;77;33;6", "wc_main_review": "388;577;214;102", "wc_review": "487;761;323;122", "wc_reply_reviewers": "0;44;5;16", "wc_reply_authors": "584;604;110;91", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.75, 33.48413803579241 ], "wc_summary_review_avg": [ 37.25, 25.459526704163217 ], "wc_main_review_avg": [ 320.25, 179.88381667064994 ], "wc_review_avg": [ 423.25, 233.95552461953105 ], "wc_reply_reviewers_avg": [ 16.25, 17.03489066592445 ], "wc_reply_authors_avg": [ 347.25, 246.94268059612537 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vy5swEJfK6EJ:scholar.google.com/&scioq=A+Variance+Reduction+Method+for+Neural-based+Divergence+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Massachusetts Amherst;Foundation for Research and Technology - Hellas;University of Crete", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umass.edu;https://www.forth.gr;https://www.uoc.gr", "aff_unique_abbr": "UMass Amherst;FORTH;UoC", "aff_campus_unique_index": "0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Greece" }, { "id": "6gLEKETxUWp", "title": "Interpreting Molecule Generative Models for Interactive Molecule Discovery", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discovering novel molecules with desired properties is crucial for advancing drug discovery and chemical science. 
Recently, deep generative models have become able to synthesize new molecules by sampling random vectors from latent space and then decoding them to a molecule structure. However, through the feedforward generation pipeline, it is difficult to reveal the underlying connections between latent space and molecular properties, as well as to customize the output molecule with desired properties. In this work, we develop a simple yet effective method to interpret the latent space of the learned generative models with various molecular properties for more interactive molecule generation and discovery. This method, called Molecular Space Explorer (MolSpacE), is model-agnostic and can work with any pre-trained molecule generative model in an off-the-shelf manner. It first identifies latent directions that govern certain molecular properties via the property separation hyperplane and then moves molecules along the directions for smooth change of molecular structures and properties. This method achieves interactive molecule discovery through identifying interpretable and steerable concepts that emerge in the representations of generative models. Experiments show that MolSpacE can manipulate the output molecule toward desired properties with a high success rate. We further quantify and compare the interpretability of multiple state-of-the-art molecule generative models. An interface and a demo video are developed to illustrate the promising application of interactive molecule discovery.", "keywords": "Molecule Generation;Controllable Molecule Generation;Interpretable Molecule Generation;Molecule Manipulation", "primary_area": "", "supplementary_material": "", "author": "Yuanqi Du;Xian Liu;Shengchao Liu;Bolei Zhou", "authorids": "~Yuanqi_Du1;~Xian_Liu1;~Shengchao_Liu1;~Bolei_Zhou5", "gender": "M;M;M;M", "homepage": "https://yuanqidu.github.io/;https://alvinliu0.github.io/;https://chao1224.github.io/;https://boleizhou.github.io/", "dblp": "266/2837;;;46/8066", "google_scholar": "fAc_zZMAAAAJ;https://scholar.google.com/citations?hl=en-us;F1ws3XUAAAAJ;9D4aG8AAAAAJ", "orcid": ";0000-0001-9817-7418;0000-0003-2030-2367;", "linkedin": ";xian-liu-9840b52a3/;;", "or_profile": "~Yuanqi_Du1;~Xian_Liu1;~Shengchao_Liu1;~Bolei_Zhou5", "aff": "University of Amsterdam;The Chinese University of Hong Kong;MILA-UdeM;University of California, Los Angeles", "aff_domain": "uva.nl;cuhk.edu.hk;mila.quebec;ucla.edu", "position": "Researcher;PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\ndu2022interpreting,\ntitle={Interpreting Molecule Generative Models for Interactive Molecule Discovery},\nauthor={Yuanqi Du and Xian Liu and Shengchao Liu and Bolei Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=6gLEKETxUWp}\n}", "github": "", "project": "", "reviewers": "Pgt6;bacV;5bTH;MABC", "site": "https://openreview.net/forum?id=6gLEKETxUWp", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;4;4", "correctness": "3;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "105;97;55;14", "wc_summary_review": "14;123;66;60", "wc_main_review": "292;1992;1364;253", "wc_review": "411;2212;1485;327", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.75, 36.38251640554842 ],
"wc_summary_review_avg": [ 65.75, 38.6935072072822 ], "wc_main_review_avg": [ 975.25, 737.1198596564876 ], "wc_review_avg": [ 1108.75, 783.6952133961263 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14081890840919363257&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Amsterdam;Chinese University of Hong Kong;Mila;University of California, Los Angeles", "aff_unique_dep": ";;Montreal Institute for Learning Algorithms;", "aff_unique_url": "https://www.uva.nl;https://www.cuhk.edu.hk;https://mila.quebec;https://www.ucla.edu", "aff_unique_abbr": "UvA;CUHK;MILA;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Los Angeles", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Netherlands;China;Canada;United States" }, { "id": "6hTObFz_nB", "title": "Do Androids Dream of Electric Fences? Safety-Aware Reinforcement Learning with Latent Shielding", "track": "main", "status": "Reject", "tldr": "", "abstract": "The growing trend of fledgling reinforcement learning systems making their way into real-world applications has been accompanied by growing concerns for their safety and robustness. In recent years, a variety of approaches have been put forward to address the challenges of safety-aware reinforcement learning; however, these methods often either require a handcrafted model of the environment to be provided beforehand, or that the environment is relatively simple and low-dimensional. We present a novel approach to safety-aware deep reinforcement learning in high-dimensional environments called latent shielding. Latent shielding leverages internal representations of the environment learnt by model-based agents to \"imagine\" future trajectories and avoid those deemed unsafe. We experimentally demonstrate that this approach leads to improved adherence to formally-defined safety specifications.", "keywords": "Model-Based Reinforcement Learning;Safety-Aware Reinforcement Learning;Shielding;World Models", "primary_area": "", "supplementary_material": "", "author": "Chloe He;Borja G. Le\u00f3n;Francesco Belardinelli", "authorids": "~Chloe_He2;~Borja_G._Le\u00f3n1;~Francesco_Belardinelli1", "gender": ";M;M", "homepage": ";https://www.doc.ic.ac.uk/~bg19/;https://www.doc.ic.ac.uk/~fbelard/", "dblp": ";259/1299;59/2916", "google_scholar": ";https://scholar.google.es/citations?user=sJiadiMAAAAJ;https://scholar.google.fr/citations?user=Mr35r1EAAAAJ", "orcid": ";;0000-0002-7768-1794", "linkedin": ";borja-gonzalez-leon/;", "or_profile": "~Chloe_He2;~Borja_G._Le\u00f3n1;~Francesco_Belardinelli1", "aff": ";Meta Facebook;Imperial College London", "aff_domain": ";fb.com;imperial.ac.uk", "position": ";Intern;Lecturer", "bibtex": "@misc{\nhe2022do,\ntitle={Do Androids Dream of Electric Fences? Safety-Aware Reinforcement Learning with Latent Shielding},\nauthor={Chloe He and Borja G. 
Le{\\'o}n and Francesco Belardinelli},\nyear={2022},\nurl={https://openreview.net/forum?id=6hTObFz_nB}\n}", "github": "", "project": "", "reviewers": "eXDd;WV6o;Lq99;HtqJ", "site": "https://openreview.net/forum?id=6hTObFz_nB", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;3;4;3", "correctness": "3;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "61;88;123;59", "wc_summary_review": "75;10;17;38", "wc_main_review": "668;220;64;257", "wc_review": "804;318;204;354", "wc_reply_reviewers": "254;0;60;85", "wc_reply_authors": "1673;1102;951;651", "reply_reviewers": "1;0;1;1", "reply_authors": "3;2;2;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 82.75, 25.907286619790966 ], "wc_summary_review_avg": [ 35.0, 25.288337232803585 ], "wc_main_review_avg": [ 302.25, 223.24244108143952 ], "wc_review_avg": [ 420.0, 228.51258171050452 ], "wc_reply_reviewers_avg": [ 99.75, 94.26127253543737 ], "wc_reply_authors_avg": [ 1094.25, 371.48443776287587 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4069571580902890220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Meta;Imperial College London", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.imperial.ac.uk", "aff_unique_abbr": "Meta;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "6iEcgoZ1Aek", "title": "Crossformer: Transformer with Alternated Cross-Layer Guidance", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformers with stacked attention layers have achieved state-of-the-art results on a wide range of tasks related to discrete sequences. Significant work has been done to better understand or interpret the capabilities of Transformer, which is often massively over-parameterized and prone to overfitting. There exist intensive interactions between Transformer layers, where the information from higher layers can and does distill the information from lower layers. This motivates us to inject a cross-layer inductive bias that not only uses higher layers, which are closer to the training objective, to guide lower ones, but also provides regularization customized to the stacked structure of Transformer. To this end, we propose Crossformer, which either regularizes the differences between specific states of two adjacent layers or directly imposes alternated states sharing between all adjacent layers. Crossformer with states sharing not only provides the desired cross-layer guidance and regularization but also reduces the memory requirement. It is simple to convert a Transformer-based model to a Crossformer-based one. On a variety of neural machine translation tasks, we show that our method outperforms Transformer models while being more memory-efficient.
We further demonstrate the general applicability and stability of Crossformer on visual question answering, graph node classification, and significantly deeper models, showing the great potential of incorporating our method into various Transformer-related tasks.", "keywords": "attention;language model;machine translation;deep learning", "primary_area": "", "supplementary_material": "", "author": "Shujian Zhang;Zhibin Duan;Huangjie Zheng;Pengcheng He;Bo Chen;Weizhu Chen;Mingyuan Zhou", "authorids": "~Shujian_Zhang1;~Zhibin_Duan1;~Huangjie_Zheng1;~Pengcheng_He2;~Bo_Chen1;~Weizhu_Chen1;~Mingyuan_Zhou1", "gender": ";M;M;M;M;M;M", "homepage": "https://www.utexas.edu/;;;;http://web.xidian.edu.cn/bchen/en/index.html;https://www.microsoft.com/en-us/research/people/wzchen/;http://mingyuanzhou.github.io", "dblp": "84/3190.html;268/2560;192/2170;116/8665;89/5615-1;79/2536;", "google_scholar": "7RmLVQkAAAAJ;https://scholar.google.com.hk/citations?user=bITyHaEAAAAJ;Vl5wCXsAAAAJ;https://scholar.google.com/citations?hl=en;;LG_E-4EAAAAJ;LXwCIisAAAAJ", "orcid": ";;0000-0003-0508-5034;;0000-0001-5151-9388;;", "linkedin": ";;;;;;", "or_profile": "~Shujian_Zhang1;~Zhibin_Duan1;~Huangjie_Zheng1;~Pengcheng_He2;~Bo_Chen1;~Weizhu_Chen1;~Mingyuan_Zhou1", "aff": "University of Texas, Austin;Xidian University;University of Texas, Austin;Microsoft;Xidian University;Microsoft GenAI;The University of Texas at Austin", "aff_domain": "utexas.edu;xidian.edu;utexas.edu;microsoft.com;xidian.edu.cn;microsoft.com;utexas.edu", "position": "PhD student;MS student;PhD student;Principal Researcher;Full Professor;Vice President;Associate Professor", "bibtex": "@misc{\nzhang2022crossformer,\ntitle={Crossformer: Transformer with Alternated Cross-Layer Guidance},\nauthor={Shujian Zhang and Zhibin Duan and Huangjie Zheng and Pengcheng He and Bo Chen and Weizhu Chen and Mingyuan Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=6iEcgoZ1Aek}\n}", "github": "", "project": "", "reviewers": "B4WQ;GV4s;wvVX;2aTF", "site": "https://openreview.net/forum?id=6iEcgoZ1Aek", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;3;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "27;48;64;62", "wc_summary_review": "22;46;6;35", "wc_main_review": "231;210;239;104", "wc_review": "280;304;309;201", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "334;353;344;365", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 50.25, 14.771171246722448 ], "wc_summary_review_avg": [ 27.25, 14.922717580923388 ], "wc_main_review_avg": [ 196.0, 54.161794652688535 ], "wc_review_avg": [ 273.5, 43.26950427263987 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 349.0, 11.423659658795863 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KdurjKzYpi4J:scholar.google.com/&scioq=Crossformer:+Transformer+with+Alternated+Cross-Layer+Guidance&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;1;2;0", "aff_unique_norm": "University of Texas at 
Austin;Xidian University;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.utexas.edu;http://www.xidian.edu.cn/;https://www.microsoft.com", "aff_unique_abbr": "UT Austin;Xidian;Microsoft", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;0;0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "6j9YOwh8itH", "title": "Unified Recurrence Modeling for Video Action Anticipation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Forecasting future events based on evidence of current conditions is an innate skill of human beings, and key for predicting the outcome of any decision making. In artificial vision for example, we would like to predict the next human action before it is actually performed, without observing the future video frames associated to it. Computer vision models for action anticipation are expected to collect the subtle evidence in the preamble of the target actions. In prior studies recurrence modeling often leads to better performance, and the strong temporal inference is assumed to be a key element for reasonable prediction. To this end, we propose a unified recurrence modeling for video action anticipation by generalizing the recurrence mechanism from sequence into graph representation via message passing. The information flow in space-time can be described by the interaction between vertices and edges, and the changes of vertices for each incoming frame reflects the underlying dynamics. Our model leverages self-attention for all building blocks in the graph modeling, and we introduce different edge learning strategies can be end-to-end optimized while updating the vertices. Our experimental results demonstrate that our modeling method is light-weight, efficient, and outperforms all previous works on the large-scale EPIC-Kitchen dataset. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/674f0348c328194dc51e14b3fa7dec4ab5d5c29f.zip", "author": "Tsung-Ming Tai;Giuseppe Fiameni;Cheng-Kuang Lee;Simon See;Oswald Lanz", "authorids": "~Tsung-Ming_Tai1;~Giuseppe_Fiameni1;~Cheng-Kuang_Lee1;~Simon_See1;~Oswald_Lanz3", "gender": ";M;M;M;M", "homepage": ";https://www.linkedin.com/in/giuseppefiameni/;;;https://www.unibz.it/en/faculties/engineering/academic-staff/person/46208-oswald-lanz", "dblp": "206/9805;96/10002;;62/6547;02/1449.html", "google_scholar": "T5_GvzMAAAAJ;Se2mLvIAAAAJ;WWvSfXkAAAAJ;ebIHTEoAAAAJ;https://scholar.google.it/citations?user=vpmV4xcAAAAJ", "orcid": ";0000-0001-8687-6609;;0000-0002-4958-9237;", "linkedin": ";giuseppefiameni/;cheng-kuang-ck-lee-b97258157/?originalSubdomain=tw;simonsee/;", "or_profile": "~Tsung-Ming_Tai1;~Giuseppe_Fiameni1;~Cheng-Kuang_Lee1;~Simon_See1;~Oswald_Lanz3", "aff": "NVIDIA;NVIDIA;;NVIDIA;Free University of Bozen-Bolzano", "aff_domain": "nvidia.com;nvidia.com;;nvidia.com;unibz.it", "position": "Researcher;Data Scientist;;Associate Professor;Professor", "bibtex": "@misc{\ntai2022unified,\ntitle={Unified Recurrence Modeling for Video Action Anticipation},\nauthor={Tsung-Ming Tai and Giuseppe Fiameni and Cheng-Kuang Lee and Simon See and Oswald Lanz},\nyear={2022},\nurl={https://openreview.net/forum?id=6j9YOwh8itH}\n}", "github": "", "project": "", "reviewers": "HJA6;iyEe;yqiW;hNXw", "site": "https://openreview.net/forum?id=6j9YOwh8itH", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;5;3", "correctness": "3;3;2;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "89;39;122;120", "wc_summary_review": "52;42;205;26", "wc_main_review": "748;194;823;96", "wc_review": "889;275;1150;242", "wc_reply_reviewers": "0;78;181;0", "wc_reply_authors": "216;170;314;79", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 92.5, 33.544746235439014 ], "wc_summary_review_avg": [ 81.25, 72.04642600434806 ], "wc_main_review_avg": [ 465.25, 323.2084273344369 ], "wc_review_avg": [ 639.0, 391.70333161718196 ], "wc_reply_reviewers_avg": [ 64.75, 74.28786913083455 ], "wc_reply_authors_avg": [ 194.75, 84.67991202168317 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.058025885318565944, "corr_recommendation_correctness": 0.2721655269759087, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7742796337533608198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "NVIDIA;Free University of Bozen-Bolzano", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://www.unibz.it", "aff_unique_abbr": "NVIDIA;UNIBZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Italy" }, { "id": "6jZo9g3MiVV", "title": "Contrastive Quant: Quantization Makes Stronger Contrastive Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive learning, which learns visual representations by enforcing feature consistency under different augmented views, has emerged as one of the most 
effective unsupervised learning methods. In this work, we explore contrastive learning from a new perspective, inspired by recent works showing that properly designed weight perturbations or quantization help models learn a smoother loss landscape. Interestingly, we find that quantization, when properly engineered, can enhance the effectiveness of contrastive learning. To this end, we propose a novel contrastive learning framework, dubbed Contrastive Quant, to encourage the feature consistency under both (1) differently augmented inputs via various data transformations and (2) differently augmented weights/activations via various quantization levels, where the feature consistency under injected noises via quantization can be viewed as augmentations on both model weights and intermediate activations as a complement to the input augmentations. Extensive experiments, built on top of two state-of-the-art contrastive learning methods, SimCLR and BYOL, show that Contrastive Quant consistently improves the learned visual representation, especially with limited labeled data under semi-supervised scenarios. For example, our Contrastive Quant achieves 8.69% and 10.27% higher accuracy on ResNet-18 and ResNet-34, respectively, on ImageNet when fine-tuning with 10% labeled data. We believe this work has opened up a new perspective for future contrastive learning innovations. All code will be released upon acceptance.", "keywords": "Contrastive learning;quantization", "primary_area": "", "supplementary_material": "", "author": "Yonggan Fu;Qixuan Yu;Meng Li;Xu Ouyang;Vikas Chandra;Yingyan Lin", "authorids": "~Yonggan_Fu1;~Qixuan_Yu1;~Meng_Li1;~Xu_Ouyang2;~Vikas_Chandra2;~Yingyan_Lin1", "gender": "M;;M;M;M;F", "homepage": "https://www.yongganfu.com/;http://www.makiy.org;https://mengli.me;https://xo28.github.io/;https://v-chandra.github.io/;https://eiclab.scs.gatech.edu/", "dblp": "244/8166;;70/1726-4;;57/5163;120/6981", "google_scholar": "https://scholar.google.com/citations?hl=en;;lvdRkEkAAAAJ;https://scholar.google.com/citations?view_op=list_works;p-h_BvcAAAAJ;dio8IesAAAAJ", "orcid": ";;;0000-0003-2433-8180;;", "linkedin": "yonggan-fu-b211831b0;;;xu-ouyang-b0a2351a0/;vchandra/;yingyan-celine-lin-a281211a/", "or_profile": "~Yonggan_Fu1;~Qixuan_Yu1;~Meng_Li1;~Xu_Ouyang2;~Vikas_Chandra2;~Yingyan_Lin1", "aff": "Rice University;Rice University;Meta Facebook;Rice University;Meta;Rice University", "aff_domain": "rice.edu;rice.edu;fb.com;rice.edu;meta.com;rice.edu", "position": "PhD student;Undergrad student;Researcher;Researcher;Director, AI;Assistant Professor", "bibtex": "@misc{\nfu2022contrastive,\ntitle={Contrastive Quant: Quantization Makes Stronger Contrastive Learning},\nauthor={Yonggan Fu and Qixuan Yu and Meng Li and Xu Ouyang and Vikas Chandra and Yingyan Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=6jZo9g3MiVV}\n}", "github": "", "project": "", "reviewers": "Dbeq;xQz1;LVpP;PEx5", "site": "https://openreview.net/forum?id=6jZo9g3MiVV", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;5;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "45;62;54;76", "wc_summary_review": "19;29;31;75", "wc_main_review": "227;668;388;504", "wc_review": "291;759;473;655", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ],
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 59.25, 11.388041973930374 ], "wc_summary_review_avg": [ 38.5, 21.558061137310098 ], "wc_main_review_avg": [ 446.75, 161.2224782714867 ], "wc_review_avg": [ 544.5, 178.60221163244313 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6971484127699216655&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;1;0", "aff_unique_norm": "Rice University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.rice.edu;https://meta.com", "aff_unique_abbr": "Rice;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Few-shot Learning via Dirichlet Tessellation Ensemble", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6124", "id": "6kCiVaoQdx9", "poster": "", "openreview": "https://openreview.net/forum?id=6kCiVaoQdx9", "slides": "https://iclr.cc/virtual/2022/poster/6124", "video": "https://iclr.cc/virtual/2022/poster/6124", "author_site": "Chunwei Ma, Ziyun Huang, Mingchen Gao, Jinhui Xu", "tldr": "", "abstract": "Few-shot learning (FSL) is the process of rapid generalization from abundant base samples to inadequate novel samples. Despite extensive research in recent years, FSL is still not yet able to generate satisfactory solutions for a wide range of real-world applications. To confront this challenge, we study the FSL problem from a geometric point of view in this paper. One observation is that the widely embraced ProtoNet model is essentially a Voronoi Diagram (VD) in the feature space. We retrofit it by making use of a recent advance in computational geometry called Cluster-induced Voronoi Diagram (CIVD). Starting from the simplest nearest neighbor model, CIVD gradually incorporates cluster-to-point and then cluster-to-cluster relationships for space subdivision, which is used to improve the accuracy and robustness at multiple stages of FSL. Specifically, we use CIVD (1) to integrate parametric and nonparametric few-shot classifiers; (2) to combine feature representation and surrogate representation; (3) and to leverage feature-level, transformation-level, and geometry-level heterogeneities for a better ensemble. Our CIVD-based workflow enables us to achieve new state-of-the-art results on mini-ImageNet, CUB, and tiered-ImagenNet datasets, with ${\\sim}2\\%{-}5\\%$ improvements upon the next best. To summarize, CIVD provides a mathematically elegant and geometrically interpretable framework that compensates for extreme data insufficiency, prevents overfitting, and allows for fast geometric ensemble for thousands of individual VD. 
These together make FSL stronger.", "keywords": "Few-shot Learning;Computational Geometry;Dirichlet Tessellation;Voronoi Diagram;Ensemble Learning", "primary_area": "", "supplementary_material": "", "author": "Chunwei Ma;Ziyun Huang;Mingchen Gao;Jinhui Xu", "authorids": "~Chunwei_Ma1;~Ziyun_Huang1;~Mingchen_Gao1;~Jinhui_Xu1", "gender": ";M;F;M", "homepage": ";;http://engineering.buffalo.edu/computer-science-engineering/people/faculty-directory/mingchen-gao.html;https://www.cse.buffalo.edu/~jinhui/", "dblp": ";;11/9613;24/6437-1.html", "google_scholar": ";1MPrmtEAAAAJ;1KUHms8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-5488-8514;", "linkedin": ";;;", "or_profile": "~Chunwei_Ma1;~Ziyun_Huang1;~Mingchen_Gao1;~Jinhui_Xu1", "aff": ";Pennsylvania State University, Erie;University at Buffalo, SUNY;University at Buffalo, State University of New York", "aff_domain": ";psu.edu;buffalo.edu;buffalo.edu", "position": ";Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nma2022fewshot,\ntitle={Few-shot Learning via Dirichlet Tessellation Ensemble},\nauthor={Chunwei Ma and Ziyun Huang and Mingchen Gao and Jinhui Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6kCiVaoQdx9}\n}", "github": "", "project": "", "reviewers": "uC53;Qc9m;JRwP;bCqa", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;5;2;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;2;4", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "45;70;58;65", "wc_summary_review": "53;42;49;47", "wc_main_review": "276;113;199;253", "wc_review": "374;225;306;365", "wc_reply_reviewers": "110;79;26;46", "wc_reply_authors": "1178;457;457;850", "reply_reviewers": "1;2;1;1", "reply_authors": "3;2;1;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 59.5, 9.394147114027968 ], "wc_summary_review_avg": [ 47.75, 3.960744879438715 ], "wc_main_review_avg": [ 210.25, 62.71911590575875 ], "wc_review_avg": [ 317.5, 59.44955845084133 ], "wc_reply_reviewers_avg": [ 65.25, 32.02635633349507 ], "wc_reply_authors_avg": [ 735.5, 301.6790513111575 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.2581988897471611, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9255876308024965347&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=6kCiVaoQdx9", "email": ";psu.edu;buffalo.edu;buffalo.edu", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Pennsylvania State University;University at Buffalo", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.buffalo.edu", "aff_unique_abbr": "PSU;UB", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Erie;Buffalo", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "6kruvdT0yfY", "title": "C+1 Loss: Learn to Classify C Classes of Interest and the Background Class Differentially", "track": "main", "status": "Reject", "tldr": "", "abstract": "There is one kind of problem all around the classification area, where we want to classify 
C+1 classes of samples, including C semantically deterministic classes, which we call classes of interest, and the (C+1)th semantically undeterministic class, which we call the background class. Although most classification algorithms use a softmax-based cross-entropy loss to supervise the classifier training process without differentiating the background class from the classes of interest, this is unreasonable: each of the classes of interest has its own inherent characteristics, but the background class doesn\u2019t. We argue that the background class should be treated differently from the classes of interest during training. Motivated by this, we first define the C+1 classification problem. Then, we propose three properties that a good C+1 classifier should have: basic discriminability, compactness, and background margin. Based on these, we define a uniform general C+1 loss, composed of three parts, that drives the C+1 classifier to satisfy those properties. Finally, we instantiate a C+1 loss and evaluate it on semantic segmentation, human parsing, and object detection tasks. The proposed approach shows its superiority over the traditional cross-entropy loss.", "keywords": "C+1 loss;classes of interest;background class", "primary_area": "", "supplementary_material": "", "author": "Changhuai Chen;Xile Shen;Mengyu Ye;Yi Lu;Jun Che;Shiliang Pu", "authorids": "~Changhuai_Chen1;~Xile_Shen1;~Mengyu_Ye1;~Yi_Lu5;~Jun_Che1;~Shiliang_Pu1", "gender": "M;;M;;M;M", "homepage": ";;;;;https://github.com/luuuyi", "dblp": "258/2736;;;;155/3173;", "google_scholar": ";ZIXqHtAAAAAJ;;;https://scholar.google.com.hk/citations?user=NWR_wpoAAAAJ;", "orcid": "0000-0003-2428-5031;;;;;", "linkedin": ";;http://linkedin.com/in/mengyu-ye-482ba2148;;;", "or_profile": "~Changhuai_Chen1;~Xile_Shen1;~Mengyu_Ye1;~Jun_Che1;~Shiliang_Pu1;~Lu_Yi1", "aff": "Hikvision Research Institute;Tsinghua University;;;;", "aff_domain": "hikvision.com;tsinghua.edu.cn;;;;", "position": "Researcher;MS student;;;;", "bibtex": "@misc{\nchen2022c,\ntitle={C+1 Loss: Learn to Classify C Classes of Interest and the Background Class Differentially},\nauthor={Changhuai Chen and Xile Shen and Mengyu Ye and Yi Lu and Jun Che and Shiliang Pu},\nyear={2022},\nurl={https://openreview.net/forum?id=6kruvdT0yfY}\n}", "github": "", "project": "", "reviewers": "NYF4;dnQ2;74g9;ZXUD", "site": "https://openreview.net/forum?id=6kruvdT0yfY", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "5;4;5;4", "correctness": "3;2;3;4", "technical_novelty": "1;1;1;2", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "39;68;84;77", "wc_summary_review": "39;42;31;123", "wc_main_review": "91;187;187;200", "wc_review": "169;297;302;400", "wc_reply_reviewers": "0;143;0;0", "wc_reply_authors": "67;168;85;110", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 67.0, 17.131841699011815 ], "wc_summary_review_avg": [ 58.75, 37.31202889149825 ], "wc_main_review_avg": [ 166.25, 43.76856748855279 ], "wc_review_avg": [ 292.0, 82.03352973022677 ], "wc_reply_reviewers_avg": [ 35.75, 61.92081637058736 ], "wc_reply_authors_avg": [ 107.5, 38.12151623427379 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ],
"corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.5940885257860046, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TTIU1oY9tT4J:scholar.google.com/&scioq=C%2B1+Loss:+Learn+to+Classify+C+Classes+of+Interest+and+the+Background+Class+Differentially&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Hikvision Research Institute;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.hikvision.com/cn/;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Hikvision;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "6lcE6GdcHyQ", "title": "Will a Blind Model Hear Better? Advanced Audiovisual Recognition System with Brain-Like Compensating and Gating", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multi-modal data (e.g., audio-visual inputs, various medical images) fusion neural networks has draw more attention recently with growing number of models and training techniques being proposed. Despite the success of the multi-modal fusion neural network, we find a interesting \"low single-modality robustness\" phenomenon. Specifically, a multi-modal trained model may achieve worse performance than single-modal trained model if another modal data are masked. This is like a born blind or deaf person (single-modal trained) surpass the healthy one (multi-modal trained) with only one modality data input, and the multi-modal experience becomes a bias causing negative transfer. It shows that the existing neural networks have lower robustness than the human brain in terms of modal-missing problem. To overcome the defect, in this paper we design a brain-like neural network modeling the processing of audio and visual signals by training it to perform audiovisual speech recognition tasks. Our results demonstrate the computational model's vulnerability to sensory deprivation while promoting this adaption can help in multi-modal processing. Besides, we propose modality mix and gated fusion techniques to get a more robust model with better generalization ability. We ask for more attention on the interaction of signals of different modalities and hope our work will inspire more researchers to study the cross-modal complementary.", "keywords": "brain-inspired computing;multi-modal neural network;audio-visual speech recognition", "primary_area": "", "supplementary_material": "", "author": "Zhengliang Wu;gen Shi", "authorids": "~Zhengliang_Wu1;~gen_Shi1", "gender": "M;M", "homepage": "https://blog.csdn.net/weixin_42939529;", "dblp": ";301/5413", "google_scholar": ";K6awU5kAAAAJ", "orcid": ";0000-0002-1717-4053", "linkedin": ";", "or_profile": "~Zhengliang_Wu1;~gen_Shi1", "aff": "Beijing Institute of Technology;Beijing Institute of Technology", "aff_domain": "bit.edu.cn;bit.edu.cn", "position": "MS student;MS student", "bibtex": "@misc{\nwu2022will,\ntitle={Will a Blind Model Hear Better? 
Advanced Audiovisual Recognition System with Brain-Like Compensating and Gating},\nauthor={Zhengliang Wu and gen Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=6lcE6GdcHyQ}\n}", "github": "", "project": "", "reviewers": "qkhp;wmCC;27rH", "site": "https://openreview.net/forum?id=6lcE6GdcHyQ", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "5;5;4", "correctness": "1;3;2", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "21;37;55", "wc_summary_review": "1;100;55", "wc_main_review": "182;477;422", "wc_review": "204;614;532", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 37.666666666666664, 13.888444437333106 ], "wc_summary_review_avg": [ 52.0, 40.47221268969612 ], "wc_main_review_avg": [ 360.3333333333333, 128.08417371227233 ], "wc_review_avg": [ 450.0, 177.1402457564815 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lDPvhTIAtPsJ:scholar.google.com/&scioq=Will+a+Blind+Model+Hear+Better%3F+Advanced+Audiovisual+Recognition+System+with+Brain-Like+Compensating+and+Gating&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Beijing Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.bit.edu.cn/", "aff_unique_abbr": "BIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "6ooiNCGZa5K", "title": "On-Target Adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Domain adaptation seeks to mitigate the shift between training on the source data and testing on the target data. Most adaptation methods rely on the source data by joint optimization over source and target. Source-free methods replace the source data with source parameters by fine-tuning the model on target. Either way, the majority of the parameter updates for the model representation and the classifier are derived from the source, and not the target. However, target accuracy is the goal, and so we argue for optimizing as much as possible on target. We show significant improvement by on-target adaptation, which learns the representation purely on target data, with only source predictions for supervision (without source data or parameter fine-tuning). In the long-tailed classification setting, we demonstrate on-target class distribution learning, which learns the (im)balance of classes on target data. On-target adaptation achieves state-of-the-art accuracy and computational efficiency on VisDA-C and ImageNet-Sketch. 
Learning more on target can deliver better models for target.\n", "keywords": "domain adaptation;source-free adaptation;unsupervised domain adaptation", "primary_area": "", "supplementary_material": "/attachment/d41975f59b8c17d1d74377b99b9a425da37c6bbe.zip", "author": "Dequan Wang;Shaoteng Liu;Sayna Ebrahimi;Evan Shelhamer;Trevor Darrell", "authorids": "~Dequan_Wang1;~Shaoteng_Liu1;~Sayna_Ebrahimi1;~Evan_Shelhamer2;~Trevor_Darrell2", "gender": ";M;F;;", "homepage": ";https://www.shaotengliu.com/;https://saynaebrahimi.github.io/;;", "dblp": ";02/10511;207/7584;;", "google_scholar": ";v4JMf6kAAAAJ;wRyjJfMAAAAJ;;", "orcid": ";;;;", "linkedin": ";;saynaebrahimi/;;", "or_profile": "~Dequan_Wang1;~Shaoteng_Liu1;~Sayna_Ebrahimi1;~Evan_Shelhamer2;~Trevor_Darrell2", "aff": ";The Chinese University of Hong Kong;Google;;", "aff_domain": ";cuhk.edu.hk;google.com;;", "position": ";PhD student;Research Scientist;;", "bibtex": "@misc{\nwang2022ontarget,\ntitle={On-Target Adaptation},\nauthor={Dequan Wang and Shaoteng Liu and Sayna Ebrahimi and Evan Shelhamer and Trevor Darrell},\nyear={2022},\nurl={https://openreview.net/forum?id=6ooiNCGZa5K}\n}", "github": "", "project": "", "reviewers": "Lx65;pNRq;QwsH", "site": "https://openreview.net/forum?id=6ooiNCGZa5K", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "3;4;3", "correctness": "3;2;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "28;52;156", "wc_summary_review": "15;96;79", "wc_main_review": "363;656;321", "wc_review": "406;804;556", "wc_reply_reviewers": "0;71;56", "wc_reply_authors": "446;184;301", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.66666666666667, 55.553777749332426 ], "wc_summary_review_avg": [ 63.333333333333336, 34.874377732401506 ], "wc_main_review_avg": [ 446.6666666666667, 149.0108124339379 ], "wc_review_avg": [ 588.6666666666666, 164.11648979375053 ], "wc_reply_reviewers_avg": [ 42.333333333333336, 30.554141381415967 ], "wc_reply_authors_avg": [ 310.3333333333333, 107.16446342991794 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.18898223650461363, "corr_recommendation_correctness": -0.18898223650461363, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1", "aff_unique_norm": "Chinese University of Hong Kong;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.google.com", "aff_unique_abbr": "CUHK;Google", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Hong Kong SAR;Mountain View", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "6p8D4V_Wmyp", "title": "RainNet: A Large-Scale Imagery Dataset for Spatial Precipitation Downscaling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contemporary deep learning frameworks have been applied to solve meteorological problems (\\emph{e.g.}, front detection, synthetic radar generation, precipitation nowcasting, \\emph{etc.}) and have achieved highly promising results. 
Spatial precipitation downscaling is one of the most important meteorological problems. However, the lack of a well-organized and annotated large-scale dataset hinders the training and verification of more effective and advanced deep-learning models for precipitation downscaling. To alleviate these obstacles, we present the first large-scale spatial precipitation downscaling dataset named \emph{RainNet}, which contains more than $62,400$ pairs of high-quality low/high-resolution precipitation maps spanning over $17$ years, ready to help the evolution of deep models in precipitation downscaling. Specifically, the precipitation maps carefully collected in RainNet cover various meteorological phenomena (\emph{e.g.}, hurricane, squall, \emph{etc.}), which is of great help in improving model generalization ability. In addition, the map pairs in RainNet are organized in the form of image sequences ($720$ maps per month or 1 map/hour), showing complex physical properties, \emph{e.g.}, temporal misalignment, temporal sparsity, and fluid properties. Two machine-learning-oriented metrics are specifically introduced to evaluate or verify the comprehensive performance of the trained model (\emph{e.g.}, prediction-map reconstruction accuracy). To illustrate the applications of RainNet, 14 state-of-the-art models, including deep models and traditional approaches, are evaluated. To fully explore potential downscaling solutions, we propose an implicit physical estimation framework to learn the above characteristics. Extensive experiments demonstrate the value of RainNet in training and evaluating downscaling models.", "keywords": "Dataset;Downscaling;Computer Vision;Physics;Weather Forecast", "primary_area": "", "supplementary_material": "/attachment/bf4aeea1c3bd22682e5f7bb1ed357072605fd277.zip", "author": "Xuanhong Chen;Kairui Feng;Naiyuan Liu;Yifan Lu;Bingbing Ni;Ziang Liu;Maofeng Liu", "authorids": "~Xuanhong_Chen1;~Kairui_Feng1;~Naiyuan_Liu1;~Yifan_Lu1;~Bingbing_Ni3;~Ziang_Liu1;~Maofeng_Liu1", "gender": "M;M;M;M;M;M;M", "homepage": "https://github.com/neuralchen;https://cee.princeton.edu/people/kairui-kelvin-feng;https://github.com/NNNNAI;https://yifanlu0227.github.io;;;", "dblp": "255/6337;;;;64/831.html;;", "google_scholar": "UuCqlfEAAAAJ;;;hiXGPH8AAAAJ;V9W87PYAAAAJ;;1JAPVtkAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;yifan-lu-65ab69229/;;%E5%AD%90%E6%98%82-%E5%88%98-8aaa36186/;", "or_profile": "~Xuanhong_Chen1;~Kairui_Feng1;~Naiyuan_Liu1;~Yifan_Lu1;~Bingbing_Ni3;~Ziang_Liu1;~Maofeng_Liu1", "aff": "Shanghai Jiaotong University;Princeton University;University of Technology Sydney;Shanghai Jiaotong University;Shanghai Jiaotong University;;University of Miami", "aff_domain": "sjtu.edu.cn;princeton.edu;uts.edu.au;sjtu.edu.cn;sjtu.edu.cn;;miami.edu", "position": "PhD student;Postdoc;MS student;Undergrad student;Full Professor;;Postdoc", "bibtex": "@misc{\nchen2022rainnet,\ntitle={RainNet: A Large-Scale Imagery Dataset for Spatial Precipitation Downscaling},\nauthor={Xuanhong Chen and Kairui Feng and Naiyuan Liu and Yifan Lu and Bingbing Ni and Ziang Liu and Maofeng Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=6p8D4V_Wmyp}\n}", "github": "", "project": "", "reviewers": "5pVg;D3tQ;szBD;ggKX", "site": "https://openreview.net/forum?id=6p8D4V_Wmyp", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;4;4", "correctness": "2;3;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "101;51;22;67", "wc_summary_review": "41;40;37;50", 
"wc_main_review": "818;430;92;122", "wc_review": "960;521;151;239", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 60.25, 28.525208149985513 ], "wc_summary_review_avg": [ 42.0, 4.847679857416329 ], "wc_main_review_avg": [ 365.5, 292.8357047902458 ], "wc_review_avg": [ 467.75, 315.35803065721984 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:syI_3sMmLDkJ:scholar.google.com/&scioq=RainNet:+A+Large-Scale+Imagery+Dataset+for+Spatial+Precipitation+Downscaling&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "Shanghai Jiao Tong University;Princeton University;University of Technology Sydney;University of Miami", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.princeton.edu;https://www.uts.edu.au;https://www.miami.edu", "aff_unique_abbr": "SJTU;Princeton;UTS;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0;1", "aff_country_unique": "China;United States;Australia" }, { "title": "TRAIL: Near-Optimal Imitation Learning with Suboptimal Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6774", "id": "6q_2b6u0BnJ", "poster": "", "openreview": "https://openreview.net/forum?id=6q_2b6u0BnJ", "slides": "https://iclr.cc/virtual/2022/poster/6774", "video": "https://iclr.cc/virtual/2022/poster/6774", "author_site": "Sherry Yang, Sergey Levine, Ofir Nachum", "tldr": "", "abstract": "In imitation learning, one aims to learn task-solving policies using access to near-optimal expert trajectories collected from the task environment. However, high-quality trajectories -- e.g., from human experts -- can be expensive to obtain in practical settings. On the contrary, it is often much easier to obtain large amounts of suboptimal trajectories which can nevertheless provide insight into the structure of the environment, showing what \\emph{could} be done in the environment even if not what \\emph{should} be done. Is it possible to formalize these conceptual benefits and devise algorithms to use offline datasets to yield \\emph{provable} improvements to the sample-efficiency of imitation learning? In this work, we answer this question affirmatively and present training objectives which use an offline dataset to learn an approximate \\emph{factored} dynamics model whose structure enables the extraction of a \\emph{latent action space}. Our theoretical analysis shows that the learned latent action space can boost the sample-efficiency of downstream imitation learning, effectively reducing the need for large near-optimal expert datasets through the use of auxiliary non-expert data. We evaluate the practicality of our objective through experiments on a set of navigation and locomotion tasks. 
Our results verify the benefits suggested by our theory and show that our algorithm is able to recover near-optimal policies with fewer expert trajectories.", "keywords": "Imitation Learning;Action Representations;Latent Dynamics Model;Offline Datasets", "primary_area": "", "supplementary_material": "", "author": "Mengjiao Yang;Sergey Levine;Ofir Nachum", "authorids": "~Mengjiao_Yang1;~Sergey_Levine1;~Ofir_Nachum1", "gender": "M;M;F", "homepage": "https://people.eecs.berkeley.edu/~svlevine/;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;https://sherryy.github.io", "dblp": "80/7594;;", "google_scholar": "8R35rCwAAAAJ;C-ZlBWMAAAAJ;7c1B_fIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sergey_Levine1;~Ofir_Nachum1;~Sherry_Yang1", "aff": "Google;OpenAI;University of California, Berkeley", "aff_domain": "google.com;openai.com;berkeley.edu", "position": "Research Scientist;Researcher;Student", "bibtex": "@inproceedings{\nyang2022trail,\ntitle={{TRAIL}: Near-Optimal Imitation Learning with Suboptimal Data},\nauthor={Mengjiao Yang and Sergey Levine and Ofir Nachum},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6q_2b6u0BnJ}\n}", "github": "", "project": "", "reviewers": "xchs;nwvX;M6PX", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;3", "correctness": "4;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "96;94;175", "wc_summary_review": "163;13;97", "wc_main_review": "434;391;241", "wc_review": "693;498;513", "wc_reply_reviewers": "504;102;23", "wc_reply_authors": "887;740;424", "reply_reviewers": "1;1;1", "reply_authors": "2;3;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 121.66666666666667, 37.72119946248911 ], "wc_summary_review_avg": [ 91.0, 61.38403701289123 ], "wc_main_review_avg": [ 355.3333333333333, 82.7298145907642 ], "wc_review_avg": [ 568.0, 88.60022573334675 ], "wc_reply_reviewers_avg": [ 209.66666666666666, 210.60916936881506 ], "wc_reply_authors_avg": [ 683.6666666666666, 193.17062118471557 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13031874054704232682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=6q_2b6u0BnJ", "email": "google.com;openai.com;berkeley.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Google;OpenAI;University of California, Berkeley", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://openai.com;https://www.berkeley.edu", "aff_unique_abbr": "Google;OpenAI;UC Berkeley", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Mountain View;;Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "6res1KC1Z3Z", "title": "Batch-Softmax Contrastive Loss for Pairwise Sentence Scoring Tasks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The use of contrastive loss for representation learning has become prominent in computer vision, and it is 
now getting attention in natural language processing (NLP). Here, we explore the idea of using a batch-softmax contrastive loss when fine-tuning large-scale pre-trained transformer models to learn better task-specific sentence embeddings for pairwise sentence scoring tasks. We introduce and study a number of variations in the calculation of the loss as well as in the overall training procedure; in particular, we find that data shuffling can be quite important. Our experimental results show sizable improvements on a number of datasets and pairwise sentence scoring tasks including classification, ranking, and regression. Finally, we offer detailed analysis and discussion, which should be useful for researchers aiming to explore the utility of the contrastive loss in NLP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anton Chernyavskiy;Dmitry Ilvovsky;Pavel Kalinin;Preslav Nakov", "authorids": "~Anton_Chernyavskiy1;~Dmitry_Ilvovsky1;~Pavel_Kalinin1;~Preslav_Nakov2", "gender": "M;M;M;M", "homepage": ";;;https://mbzuai.ac.ae/study/faculty/preslav-nakov/", "dblp": "272/4246;270/9528;;https://dblp.uni-trier.de/pid/19/1947", "google_scholar": ";https://scholar.google.com/citations?hl=ru;;DfXsKZ4AAAAJ", "orcid": ";;;0000-0002-3600-1510", "linkedin": ";;pavel-kalinin-51265782/;preslavnakov/", "or_profile": "~Anton_Chernyavskiy1;~Dmitry_Ilvovsky1;~Pavel_Kalinin1;~Preslav_Nakov2", "aff": "Higher School of Economics;Higher School of Economics;Yandex;Qatar Computing Research Institute, HBKU", "aff_domain": "hse.ru;hse.ru;yandex-team.ru;hbku.edu.qa", "position": "PhD student;Associate Professor;Software Developer;Principal Scientist", "bibtex": "@misc{\nchernyavskiy2022batchsoftmax,\ntitle={Batch-Softmax Contrastive Loss for Pairwise Sentence Scoring Tasks},\nauthor={Anton Chernyavskiy and Dmitry Ilvovsky and Pavel Kalinin and Preslav Nakov},\nyear={2022},\nurl={https://openreview.net/forum?id=6res1KC1Z3Z}\n}", "github": "", "project": "", "reviewers": "rjYM;5avs;EYaq", "site": "https://openreview.net/forum?id=6res1KC1Z3Z", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;4;4", "correctness": "1;1;3", "technical_novelty": "1;2;2", "empirical_novelty": "2;1;2", "wc_summary_paper": "99;75;46", "wc_summary_review": "65;29;14", "wc_main_review": "376;679;181", "wc_review": "540;783;241", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "758;865;567", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 1.6666666666666667, 0.9428090415820634 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 73.33333333333333, 21.66923061752668 ], "wc_summary_review_avg": [ 36.0, 21.400934559032695 ], "wc_main_review_avg": [ 412.0, 204.89509510966826 ], "wc_review_avg": [ 521.3333333333334, 221.66390975729198 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 730.0, 123.25853587750694 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 1.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17313257762518138450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": 
"0;0;1;2", "aff_unique_norm": "Higher School of Economics;Yandex;Qatar Computing Research Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hse.ru;https://yandex.com;https://www.qcri.org", "aff_unique_abbr": "HSE;Yandex;QCRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Russian Federation;Qatar" }, { "title": "Chemical-Reaction-Aware Molecule Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6246", "id": "6sh3pIzKS-", "poster": "", "openreview": "https://openreview.net/forum?id=6sh3pIzKS-", "slides": "https://iclr.cc/virtual/2022/poster/6246", "video": "https://iclr.cc/virtual/2022/poster/6246", "author_site": "Hongwei Wang, Weijiang Li, Xiaomeng Jin, Kyunghyun Cho, Heng Ji, Jiawei Han, Martin Burke", "tldr": "", "abstract": "Molecule representation learning (MRL) methods aim to embed molecules into a real vector space. However, existing SMILES-based (Simplified Molecular-Input Line-Entry System) or GNN-based (Graph Neural Networks) MRL methods either take SMILES strings as input that have difficulty in encoding molecule structure information, or over-emphasize the importance of GNN architectures but neglect their generalization ability. Here we propose using chemical reactions to assist learning molecule representation. The key idea of our approach is to preserve the equivalence of molecules with respect to chemical reactions in the embedding space, i.e., forcing the sum of reactant embeddings and the sum of product embeddings to be equal for each chemical equation. This constraint is proven effective to 1) keep the embedding space well-organized and 2) improve the generalization ability of molecule embeddings. Moreover, our model can use any GNN as the molecule encoder and is thus agnostic to GNN architectures. Experimental results demonstrate that our method achieves state-of-the-art performance in a variety of downstream tasks, e.g., reaction product prediction, molecule property prediction, reaction classification, and graph-edit-distance prediction. The code is available at https://github.com/hwwang55/MolR.", "keywords": "molecule representation learning;graph neural networks;chemical reaction", "primary_area": "", "supplementary_material": "/attachment/a26121fc79d0df8f5fafc3f2477fb793fae69ff7.zip", "author": "Hongwei Wang;Weijiang Li;Xiaomeng Jin;Kyunghyun Cho;Heng Ji;Jiawei Han;Martin D. 
Burke", "authorids": "~Hongwei_Wang1;~Weijiang_Li1;xjin17@illinois.edu;~Kyunghyun_Cho1;~Heng_Ji3;~Jiawei_Han1;~Martin_D._Burke1", "gender": "M;F;;M;F;M;M", "homepage": "https://hongweiw.net;;;http://kyunghyuncho.me;http://blender.cs.illinois.edu/hengji.html;http://hanj.cs.illinois.edu/;https://chemistry.illinois.edu/mdburke", "dblp": "https://dblp.org/pers/hd/w/Wang_0004:Hongwei;;;41/9736;;h/JiaweiHan.html;", "google_scholar": "3C__4wsAAAAJ;RZLYsFwAAAAJ;;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ;z7GCqT4AAAAJ;https://scholar.google.com.tw/citations?user=Kv9AbjMAAAAJ;u4hmNXAAAAAJ", "orcid": "0000-0001-7474-8271;;;;;0000-0002-3629-2696;", "linkedin": "hongwei-wang-730a7b72/;;;;;;", "or_profile": "~Hongwei_Wang1;~Weijiang_Li1;xjin17@illinois.edu;~Kyunghyun_Cho1;~Heng_Ji3;~Jiawei_Han1;~Martin_D._Burke1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;;New York University;University of Illinois, Urbana-Champaign;University of Illinois at Urbana-Champaign (UIUC);University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu;;nyu.edu;uiuc.edu;illinois.edu;illinois.edu", "position": "Postdoc;Undergrad student;;Associate Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwang2022chemicalreactionaware,\ntitle={Chemical-Reaction-Aware Molecule Representation Learning},\nauthor={Hongwei Wang and Weijiang Li and Xiaomeng Jin and Kyunghyun Cho and Heng Ji and Jiawei Han and Martin D. Burke},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6sh3pIzKS-}\n}", "github": "", "project": "", "reviewers": "3hy7;LLpm;17vx;h7pt", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;5;4", "correctness": "3;4;4;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "54;79;45;55", "wc_summary_review": "33;18;18;26", "wc_main_review": "354;159;335;134", "wc_review": "441;256;398;215", "wc_reply_reviewers": "0;42;25;40", "wc_reply_authors": "597;766;688;265", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 58.25, 12.597122687344122 ], "wc_summary_review_avg": [ 23.75, 6.2599920127744575 ], "wc_main_review_avg": [ 245.5, 99.62053001264348 ], "wc_review_avg": [ 327.5, 94.36763216272834 ], "wc_reply_reviewers_avg": [ 26.75, 16.78354849249705 ], "wc_reply_authors_avg": [ 579.0, 190.89918805484743 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16867974973581425308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=6sh3pIzKS-", "email": "illinois.edu;illinois.edu;;nyu.edu;uiuc.edu;illinois.edu;illinois.edu", "author_num": 7, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;New York University;University of Illinois", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.nyu.edu;https://illinois.edu", "aff_unique_abbr": "UIUC;NYU;UIUC", "aff_campus_unique_index": 
"0;0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Stability Regularization for Discrete Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6990", "id": "6tmjoym9LR6", "poster": "", "openreview": "https://openreview.net/forum?id=6tmjoym9LR6", "slides": "https://iclr.cc/virtual/2022/poster/6990", "video": "https://iclr.cc/virtual/2022/poster/6990", "author_site": "Adeel Pervez, Efstratios Gavves", "tldr": "", "abstract": "We present a method for training neural network models with discrete stochastic variables.\nThe core of the method is \\emph{stability regularization}, which is a regularization procedure based on the idea of noise stability developed in Gaussian isoperimetric theory in the analysis of Gaussian functions.\nStability regularization is method to make the output of continuous functions of Gaussian random variables close to discrete, that is binary or categorical, without the need for significant manual tuning.\nThe method allows control over the extent to which a Gaussian function's output is close to discrete, thus allowing for continued flow of gradient.\nThe method can be used standalone or in combination with existing continuous relaxation methods.\nWe validate the method in a broad range of experiments using discrete variables including neural relational inference, generative modeling, clustering and conditional computing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adeel Pervez;Efstratios Gavves", "authorids": "~Adeel_Pervez1;~Efstratios_Gavves1", "gender": ";M", "homepage": ";https://www.egavves.com", "dblp": "225/4821;03/8693", "google_scholar": ";https://scholar.google.nl/citations?user=QqfCvsgAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Adeel_Pervez1;~Efstratios_Gavves1", "aff": "University of Amsterdam;University of Amsterdam", "aff_domain": "uva.nl;uva.nl", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\npervez2022stability,\ntitle={Stability Regularization for Discrete Representation Learning},\nauthor={Adeel Pervez and Efstratios Gavves},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6tmjoym9LR6}\n}", "github": "", "project": "", "reviewers": "btCU;k6BN;ZtpK;jxs7", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "0;2;3;2", "wc_summary_paper": "47;114;74;41", "wc_summary_review": "57;20;60;12", "wc_main_review": "838;285;211;246", "wc_review": "942;419;345;299", "wc_reply_reviewers": "279;0;0;0", "wc_reply_authors": "1399;724;224;590", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 69.0, 28.801041647829337 ], "wc_summary_review_avg": [ 37.25, 21.46363203188128 ], "wc_main_review_avg": [ 395.0, 257.10211978900526 ], "wc_review_avg": [ 501.25, 258.042995448433 ], "wc_reply_reviewers_avg": [ 69.75, 120.8105438279292 ], "wc_reply_authors_avg": [ 734.25, 425.1942938234237 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], 
"corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10409829449547338888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=6tmjoym9LR6", "email": "uva.nl;uva.nl", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Bootstrapping Semantic Segmentation with Regional Contrast", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6375", "id": "6u6N8WWwYSM", "poster": "", "openreview": "https://openreview.net/forum?id=6u6N8WWwYSM", "slides": "https://iclr.cc/virtual/2022/poster/6375", "video": "https://iclr.cc/virtual/2022/poster/6375", "author_site": "Shikun Liu, Shuaifeng Zhi, Edward Johns, Andrew Davison", "tldr": "", "abstract": "We present ReCo, a contrastive learning framework designed at a regional level to assist learning in semantic segmentation. ReCo performs pixel-level contrastive learning on a sparse set of hard negative pixels, with minimal additional memory footprint. ReCo is easy to implement, being built on top of off-the-shelf segmentation networks, and consistently improves performance, achieving more accurate segmentation boundaries and faster convergence. The strongest effect is in semi-supervised learning with very few labels. With ReCo, we achieve high quality semantic segmentation model, requiring only 5 examples of each semantic class. ", "keywords": "semi-supervised learning;semantic segmentation;contrastive learning", "primary_area": "", "supplementary_material": "/attachment/dff06f48fbfad44993e5565640c139a38bfa3b3f.zip", "author": "Shikun Liu;Shuaifeng Zhi;Edward Johns;Andrew Davison", "authorids": "~Shikun_Liu1;~Shuaifeng_Zhi2;~Edward_Johns1;~Andrew_Davison1", "gender": "M;M;M;M", "homepage": "http://shikun.io;https://shuaifengzhi.com/;https://www.robot-learning.uk;http://www.doc.ic.ac.uk/~ajd/", "dblp": "https://dblp.uni-trier.de/pers/hd/l/Liu:Shikun;209/3436;68/9968;d/AndrewJDavison", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=5ls6RgQAAAAJ;https://scholar.google.co.uk/citations?user=sMIUkiQAAAAJ;https://scholar.google.co.uk/citations?user=A0ae1agAAAAJ", "orcid": ";;0000-0002-8914-8786;", "linkedin": ";;https://uk.linkedin.com/in/edward-johns-1b24845a;", "or_profile": "~Shikun_Liu1;~Shuaifeng_Zhi2;~Edward_Johns1;~Andrew_Davison1", "aff": "Imperial College London;National University of Defense Technology;Imperial College London;Imperial College London", "aff_domain": "imperial.ac.uk;nudt.edu.cn;imperial.ac.uk;imperial.ac.uk", "position": "Ph.D. 
Student;Lecturer;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nliu2022bootstrapping,\ntitle={Bootstrapping Semantic Segmentation with Regional Contrast},\nauthor={Shikun Liu and Shuaifeng Zhi and Edward Johns and Andrew Davison},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6u6N8WWwYSM}\n}", "github": "", "project": "", "reviewers": "qBDp;uAhS;cwn7;qZuF", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;5;3", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "38;38;127;89", "wc_summary_review": "48;187;101;23", "wc_main_review": "329;125;208;663", "wc_review": "415;350;436;775", "wc_reply_reviewers": "65;0;0;45", "wc_reply_authors": "445;865;459;468", "reply_reviewers": "1;0;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 73.0, 37.489998666310996 ], "wc_summary_review_avg": [ 89.75, 62.81470767264622 ], "wc_main_review_avg": [ 331.25, 204.81256675311698 ], "wc_review_avg": [ 494.0, 165.30426491775702 ], "wc_reply_reviewers_avg": [ 27.5, 28.394541729001368 ], "wc_reply_authors_avg": [ 559.25, 176.71498946043033 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12918707374441736964&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=6u6N8WWwYSM", "email": "imperial.ac.uk;nudt.edu.cn;imperial.ac.uk;imperial.ac.uk", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Imperial College London;National University of Defense Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;http://www.nudt.edu.cn/", "aff_unique_abbr": "ICL;NUDT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;China" }, { "id": "6uu1t8jQ-M", "title": "Generating Novel Scene Compositions from Single Images and Videos", "track": "main", "status": "Reject", "tldr": "", "abstract": "Given a large dataset for training, GANs can achieve remarkable performance for the image synthesis task. However, training GANs in extremely low data regimes remains a challenge, as overfitting often occurs, leading to memorization or training divergence. In this work, we introduce SIV-GAN, an unconditional generative model that can generate new scene compositions from a single training image or a single video clip. We propose a two-branch discriminator architecture, with content and layout branches designed to judge internal content and scene layout realism separately from each other. This discriminator design enables synthesis of visually plausible, novel compositions of a scene, with varying content and layout, while preserving the context of the original sample. Compared to previous single image GANs, our model generates more diverse, higher quality images, while not being restricted to a single image setting. 
We show that SIV-GAN successfully deals with a new challenging task of learning from a single video, for which prior GAN models fail to achieve synthesis of both high quality and diversity.", "keywords": "GANs;Image Generation;Deep Learning;Image Synthesis;Generative Models", "primary_area": "", "supplementary_material": "", "author": "Vadim Sushko;Dan Zhang;Juergen Gall;Anna Khoreva", "authorids": "~Vadim_Sushko1;~Dan_Zhang1;~Juergen_Gall1;~Anna_Khoreva1", "gender": "M;;;F", "homepage": ";;https://pages.iai.uni-bonn.de/gall_juergen/;", "dblp": "281/6637;21/802-17;13/6920;152/5005", "google_scholar": "-3S_ucEAAAAJ;https://scholar.google.de/citations?user=yazO-mMAAAAJ;1CLaPMEAAAAJ;https://scholar.google.de/citations?user=ILgZT7MAAAAJ", "orcid": ";0000-0003-0930-9162;0000-0002-9447-3399;", "linkedin": "vadim-sushko/;;;", "or_profile": "~Vadim_Sushko1;~Dan_Zhang1;~Juergen_Gall1;~Anna_Khoreva1", "aff": "Robert Bosch GmbH, Bosch;Robert Bosch GmbH, Bosch;University of Bonn;Bosch Center for Artificial Intelligence", "aff_domain": "de.bosch.com;de.bosch.com;uni-bonn.de;bosch.com", "position": "PhD student;Research Scientist;Professor;Research Group Leader", "bibtex": "@misc{\nsushko2022generating,\ntitle={Generating Novel Scene Compositions from Single Images and Videos},\nauthor={Vadim Sushko and Dan Zhang and Juergen Gall and Anna Khoreva},\nyear={2022},\nurl={https://openreview.net/forum?id=6uu1t8jQ-M}\n}", "github": "", "project": "", "reviewers": "TwJf;LNdN;DXh5;inBU", "site": "https://openreview.net/forum?id=6uu1t8jQ-M", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "60;66;48;82", "wc_summary_review": "57;55;41;45", "wc_main_review": "248;437;358;284", "wc_review": "365;558;447;411", "wc_reply_reviewers": "182;76;0;0", "wc_reply_authors": "2277;2226;2775;2662", "reply_reviewers": "1;1;0;0", "reply_authors": "4;5;4;4", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 64.0, 12.24744871391589 ], "wc_summary_review_avg": [ 49.5, 6.689544080129826 ], "wc_main_review_avg": [ 331.75, 72.56161175166935 ], "wc_review_avg": [ 445.25, 71.2894627557257 ], "wc_reply_reviewers_avg": [ 64.5, 74.59725196010909 ], "wc_reply_authors_avg": [ 2485.0, 237.57840810982802 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 4.25, 0.4330127018922193 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16984627309417788046&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Robert Bosch GmbH;University of Bonn;Bosch Center for Artificial Intelligence", "aff_unique_dep": ";;Center for Artificial Intelligence", "aff_unique_url": "https://www.bosch.com;https://www.uni-bonn.de/;https://www.bosch-ai.com", "aff_unique_abbr": "Bosch;UBonn;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "6vSDzn-4FlW", "title": "Synaptic Diversity in ANNs Can Facilitate Faster Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Various advancements in artificial neural 
networks (ANNs) are inspired by biological concepts, e.g., the artificial neuron, an efficient model of biological nerve cells demonstrating learning capabilities on large amounts of data. More recent inspirations with promising results are advanced regularization techniques, e.g., synaptic scaling, and backpropagation alternatives, e.g., Targetprop. While neuroscience continuously discovers and better understands the mechanisms of biological neural networks (BNNs), new opportunities arise for a transfer of these concepts towards ANNs. However, only a few concepts are readily applicable, and improvements for ANNs are far from guaranteed. In this paper, we focus on the inhomogeneous and dynamically changing structures of BNNs, in contrast to the mostly homogeneous and fixed topologies of ANNs. More specifically, we transfer concepts of synaptic diversity, namely spontaneous synaptic remodeling, diversity in synaptic plasticity, and multi-synaptic connectivity, to ANNs. We observe that ANNs enhanced by synaptic diversity concepts learn faster, predict with higher accuracy, and are more resilient to gradient inversion attacks. Our proposed methods are easily applicable to existing ANN topologies and should therefore stimulate the adoption of, and further research into, these mechanisms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Martin Hofmann;Moritz F. P. Becker;Christian Tetzlaff;Patrick M\u00e4der", "authorids": "~Martin_Hofmann2;~Moritz_F._P._Becker1;~Christian_Tetzlaff1;~Patrick_M\u00e4der1", "gender": ";;;M", "homepage": ";;https://tetzlab.com/home;http://www.tu-ilmenau.de/dAI.sy", "dblp": "159/7821-19;;;m/PatrickMader", "google_scholar": ";;;https://scholar.google.de/citations?user=LPzoSrsAAAAJ", "orcid": "0000-0002-4440-3317;0000-0002-4287-5732;;0000-0001-6871-2707", "linkedin": "martin-hofmann-484a10b7/;;;", "or_profile": "~Martin_Hofmann2;~Moritz_F._P._Becker1;~Christian_Tetzlaff1;~Patrick_M\u00e4der1", "aff": ";University of Goettingen;University of G\u00f6ttingen;Technische Universit\u00e4t Ilmenau", "aff_domain": ";uni-goettingen.de;uni-goettingen.de;tu-ilmenau.de", "position": ";PhD student;Principal Researcher;Full Professor", "bibtex": "@misc{\nhofmann2022synaptic,\ntitle={Synaptic Diversity in {ANN}s Can Facilitate Faster Learning},\nauthor={Martin Hofmann and Moritz F. P. 
Becker and Christian Tetzlaff and Patrick M{\\\"a}der},\nyear={2022},\nurl={https://openreview.net/forum?id=6vSDzn-4FlW}\n}", "github": "", "project": "", "reviewers": "6oyw;X8C4;1vZx;xNv6", "site": "https://openreview.net/forum?id=6vSDzn-4FlW", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;4;3;4", "correctness": "2;4;3;2", "technical_novelty": "2;4;3;3", "empirical_novelty": "1;3;3;2", "wc_summary_paper": "119;49;96;103", "wc_summary_review": "54;52;29;83", "wc_main_review": "233;360;173;613", "wc_review": "406;461;298;799", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.75, 26.05163142684158 ], "wc_summary_review_avg": [ 54.5, 19.1637678967368 ], "wc_main_review_avg": [ 344.75, 168.95025155352684 ], "wc_review_avg": [ 491.0, 187.2418222513336 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-HcGqXN5jTwJ:scholar.google.com/&scioq=Synaptic+Diversity+in+ANNs+Can+Facilitate+Faster+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of G\u00f6ttingen;Technische Universit\u00e4t Ilmenau", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-goettingen.de;https://www.tu-ilmenau.de/", "aff_unique_abbr": "UG;TU Ilmenau", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Training Transition Policies via Distribution Matching for Complex Tasks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6407", "id": "6vkzF28Hur8", "poster": "", "openreview": "https://openreview.net/forum?id=6vkzF28Hur8", "slides": "https://iclr.cc/virtual/2022/poster/6407", "video": "https://iclr.cc/virtual/2022/poster/6407", "author_site": "Ju-Seung Byun, Andrew Perrault", "tldr": "", "abstract": "Humans decompose novel complex tasks into simpler ones to exploit previously learned skills. Analogously, hierarchical reinforcement learning seeks to leverage lower-level policies for simple tasks to solve complex ones. However, because each lower-level policy induces a different distribution of states, transitioning from one lower-level policy to another may fail due to an unexpected starting state. We introduce transition policies that smoothly connect lower-level policies by producing a distribution of states and actions that matches what is expected by the next policy. Training transition policies is challenging because the natural reward signal---whether the next policy can execute its subtask successfully---is sparse. By training transition policies via adversarial inverse reinforcement learning to match the distribution of expected states and actions, we avoid relying on task-based reward. 
To further improve performance, we use deep Q-learning with a binary action space to determine when to switch from a transition policy to the next pre-trained policy, using the success or failure of the next subtask as the reward. Although the reward is still sparse, the problem is less severe due to the simple binary action space. We demonstrate our method on continuous bipedal locomotion and arm manipulation tasks that require diverse skills. We show that it smoothly connects the lower-level policies, achieving higher success rates than previous methods that search for successful trajectories based on a reward function, but do not match the state distribution.\n", "keywords": "Reinforcement Learning;Hierarchical Reinforcement Learning;Inverse Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/c993990fcc0e79ec9cdf23e22d7e1a058e99b0af.zip", "author": "JU-SEUNG BYUN;Andrew Perrault", "authorids": "~JU-SEUNG_BYUN1;~Andrew_Perrault1", "gender": "M;M", "homepage": "https://shashacks.github.io/;https://aperrault.github.io", "dblp": "277/0662;151/3622", "google_scholar": "yKcK_BMAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-5062-7958", "linkedin": "ju-seung-byun-1a76b01b9/;andrew-perrault-2b956733/", "or_profile": "~JU-SEUNG_BYUN1;~Andrew_Perrault1", "aff": "Google;Ohio State University", "aff_domain": "google.com;osu.edu", "position": "Intern;Assistant Professor", "bibtex": "@inproceedings{\nbyun2022training,\ntitle={Training Transition Policies via Distribution Matching for Complex Tasks},\nauthor={JU-SEUNG BYUN and Andrew Perrault},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6vkzF28Hur8}\n}", "github": "", "project": "", "reviewers": "1ZCq;M3VJ;CNo5", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "2;2;3", "correctness": "4;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "133;135;57", "wc_summary_review": "37;44;63", "wc_main_review": "206;96;483", "wc_review": "376;275;603", "wc_reply_reviewers": "28;0;0", "wc_reply_authors": "231;100;259", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 108.33333333333333, 36.307330144506935 ], "wc_summary_review_avg": [ 48.0, 10.98483803552272 ], "wc_main_review_avg": [ 261.6666666666667, 162.82164748241829 ], "wc_review_avg": [ 418.0, 137.15927481095352 ], "wc_reply_reviewers_avg": [ 9.333333333333334, 13.199326582148887 ], "wc_reply_authors_avg": [ 196.66666666666666, 69.3028779264533 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11055212331250216883&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=6vkzF28Hur8", "email": "google.com;osu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Google;Ohio State University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.osu.edu", "aff_unique_abbr": "Google;OSU", "aff_campus_unique_index": "0", 
"aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "6w2zSI9RAnf", "title": "Reasoning With Hierarchical Symbols: Reclaiming Symbolic Policies For Visual Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep vision models are nowadays widely integrated into visual reinforcement learning (RL) to parameterize the policy networks. However, the learned policies are overparameterized black boxes that lack interpretability, and are usually brittle under input distribution shifts. This work revisits this end-to-end learning pipeline, and proposes an alternative stage-wise approach that features hierarchical reasoning. Specifically, our approach progressively converts a policy network into the interpretable symbolic policy, composed from geometric and numerical symbols and operators. A policy regression algorithm called RoundTourMix is proposed to distill the symbolic rules as teacher-student. The symbolic policy can be treated as discrete and abstracted representations of the policy network, but are found to be more interpretable, robust and transferable. The proposed symbolic distillation approach is experimentally demonstrated to maintain the performance and ``de-noise\" the CNN policy: on six specific environments, our distilled symbolic policy achieved compelling or even higher scores than the CNN based RL agents. Our codes will be fully released upon acceptance.\n", "keywords": "Symbolic Regression;Reinforcement Learning;Convolutional Neural Networks;Interpretability", "primary_area": "", "supplementary_material": "/attachment/bd4ffbdfd4672939ae8d6084d70ecca954326777.zip", "author": "Wenqing Zheng;S P Sharan;Zhiwen Fan;Zhangyang Wang", "authorids": "~Wenqing_Zheng1;~S_P_Sharan1;~Zhiwen_Fan2;~Zhangyang_Wang1", "gender": "M;M;;M", "homepage": "https://wenqing-zheng.github.io;https://spsharan.com/;;https://vita-group.github.io", "dblp": ";324/6204;;119/4026", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;1NtGcNIAAAAJ;;pxFyKAIAAAAJ", "orcid": "0000-0002-8283-7511;0000-0002-6298-6464;;", "linkedin": ";;;", "or_profile": "~Wenqing_Zheng1;~S_P_Sharan1;~Zhiwen_Fan2;~Zhangyang_Wang1", "aff": "University of Texas, Austin;National Institute of Technology Tiruchirappalli;;University of Texas, Austin", "aff_domain": "utexas.edu;nitt.edu;;utexas.edu", "position": "PhD student;Undergrad student;;Assistant Professor", "bibtex": "@misc{\nzheng2022reasoning,\ntitle={Reasoning With Hierarchical Symbols: Reclaiming Symbolic Policies For Visual Reinforcement Learning},\nauthor={Wenqing Zheng and S P Sharan and Zhiwen Fan and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=6w2zSI9RAnf}\n}", "github": "", "project": "", "reviewers": "AL2N;wezQ;GGAU;cXsw;VSj4", "site": "https://openreview.net/forum?id=6w2zSI9RAnf", "pdf_size": 0, "recommendation": "3;3;6;6;8", "confidence": "4;4;4;4;4", "correctness": "2;2;4;3;4", "technical_novelty": "2;2;3;4;4", "empirical_novelty": "2;2;2;4;4", "wc_summary_paper": "56;98;195;99;60", "wc_summary_review": "79;78;39;140;42", "wc_main_review": "742;359;250;998;338", "wc_review": "877;535;484;1237;440", "wc_reply_reviewers": "781;0;235;713;0", "wc_reply_authors": "1243;1169;631;2212;414", "reply_reviewers": "1;0;2;4;0", "reply_authors": "3;3;2;6;1", "recommendation_avg": [ 5.2, 1.9390719429665317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 3.0, 0.8944271909999159 ], 
"empirical_novelty_avg": [ 2.8, 0.9797958971132712 ], "wc_summary_paper_avg": [ 101.6, 50.10628703067111 ], "wc_summary_review_avg": [ 75.6, 36.42306961254089 ], "wc_main_review_avg": [ 537.4, 285.7268625803321 ], "wc_review_avg": [ 714.6, 303.3523364010899 ], "wc_reply_reviewers_avg": [ 345.8, 339.31307077682703 ], "wc_reply_authors_avg": [ 1133.8, 623.9549342700961 ], "reply_reviewers_avg": [ 1.4, 1.4966629547095764 ], "reply_authors_avg": [ 3.0, 1.6733200530681511 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9225312080288851, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1dd_NZvglTYJ:scholar.google.com/&scioq=Reasoning+With+Hierarchical+Symbols:+Reclaiming+Symbolic+Policies+For+Visual+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Texas at Austin;National Institute of Technology Tiruchirappalli", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.nitt.edu", "aff_unique_abbr": "UT Austin;NIT Trichy", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Austin;Tiruchirappalli", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;India" }, { "title": "Revisiting flow generative models for Out-of-distribution detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6013", "id": "6y2KBh-0Fd9", "poster": "", "openreview": "https://openreview.net/forum?id=6y2KBh-0Fd9", "slides": "https://iclr.cc/virtual/2022/poster/6013", "video": "https://iclr.cc/virtual/2022/poster/6013", "author_site": "Dihong Jiang, Sun Sun, Yaoliang Yu", "tldr": "", "abstract": "Deep generative models have been widely used in practical applications such as the detection of out-of-distribution (OOD) data. In this work, we aim to re-examine the potential of generative flow models in OOD detection. We first propose a simple combination of univariate one-sample statistical test (e.g., Kolmogorov-Smirnov) and random projections in the latent space of flow models to perform OOD detection. Then, we propose a two-sample version of our test to account for imperfect flow models. Quite distinctly, our method does not pose parametric assumptions on OOD data and is capable of exploiting any flow model. 
Experimentally, we first confirm the efficacy of our method against state-of-the-art baselines through extensive experiments on several image datasets; second, we investigate the relationship between model accuracy (e.g., the generation quality) and the OOD detection performance, and find, surprisingly, that they are not always positively correlated; and third, we show that detection in the latent space of flow models generally outperforms detection in the sample space across various OOD datasets, hence highlighting the benefits of training a flow model.", "keywords": "flow models;out-of-distribution detection;random projection;distribution comparison", "primary_area": "", "supplementary_material": "", "author": "Dihong Jiang;Sun Sun;Yaoliang Yu", "authorids": "~Dihong_Jiang1;~Sun_Sun1;~Yaoliang_Yu1", "gender": "M;F;M", "homepage": "https://dihjiang.github.io/;;https://cs.uwaterloo.ca/~y328yu/", "dblp": "234/8064;;90/4989", "google_scholar": "Cen4GYkAAAAJ;2X_jP6kAAAAJ;https://scholar.google.ca/citations?user=zbXIQMsAAAAJ", "orcid": ";;0000-0002-3823-0720", "linkedin": ";;", "or_profile": "~Dihong_Jiang1;~Sun_Sun1;~Yaoliang_Yu1", "aff": "University of Waterloo;National Research Council Canada;University of Waterloo", "aff_domain": "uwaterloo.ca;nrc-cnrc.gc.ca;uwaterloo.ca", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@inproceedings{\njiang2022revisiting,\ntitle={Revisiting flow generative models for Out-of-distribution detection},\nauthor={Dihong Jiang and Sun Sun and Yaoliang Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6y2KBh-0Fd9}\n}", "github": "", "project": "", "reviewers": "R73v;XXZp;Kij3;zvSD", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;3;4", "correctness": "4;4;3;4", "technical_novelty": "3;2;3;1", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "118;104;84;121", "wc_summary_review": "11;64;19;40", "wc_main_review": "235;274;325;153", "wc_review": "364;442;428;314", "wc_reply_reviewers": "81;19;25;0", "wc_reply_authors": "961;598;628;546", "reply_reviewers": "1;1;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 106.75, 14.618053906043718 ], "wc_summary_review_avg": [ 33.5, 20.54872258803452 ], "wc_main_review_avg": [ 246.75, 62.83460431959447 ], "wc_review_avg": [ 387.0, 51.39066063011839 ], "wc_reply_reviewers_avg": [ 31.25, 30.169313880166385 ], "wc_reply_authors_avg": [ 683.25, 163.02051251299636 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3271879045267934988&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=6y2KBh-0Fd9", "email": "uwaterloo.ca;nrc-cnrc.gc.ca;uwaterloo.ca", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Waterloo;National Research Council Canada", "aff_unique_dep": ";", "aff_unique_url": "https://uwaterloo.ca;https://www.nrc-cnrc.gc.ca", "aff_unique_abbr": "UW;NRC-CNRC", "aff_campus_unique_index": "",
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "On Non-Random Missing Labels in Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6177", "id": "6yVvwR9H9Oj", "poster": "", "openreview": "https://openreview.net/forum?id=6yVvwR9H9Oj", "slides": "https://iclr.cc/virtual/2022/poster/6177", "video": "https://iclr.cc/virtual/2022/poster/6177", "author_site": "Xinting Hu, Yulei Niu, Chunyan Miao, Xian-Sheng Hua, Hanwang Zhang", "tldr": "", "abstract": "Semi-Supervised Learning (SSL) is fundamentally a missing label problem, in which the label Missing Not At Random (MNAR) problem is more realistic and challenging, compared to the widely-adopted yet naive Missing Completely At Random assumption where both labeled and unlabeled data share the same class distribution. Different from existing SSL solutions that overlook the role of ''class'' in causing the non-randomness, e.g., users are more likely to label popular classes, we explicitly incorporate ''class'' into SSL. Our method is three-fold: 1) We propose Class-Aware Propensity (CAP) that exploits the unlabeled data to train an improved classifier using the biased labeled data. 2) To encourage rare class training, whose model is low-recall but high-precision that discards too many pseudo-labeled data, we propose Class-Aware Imputation (CAI) that dynamically decreases (or increases) the pseudo-label assignment threshold for rare (or frequent) classes. 3) Overall, we integrate CAP and CAI into a Class-Aware Doubly Robust (CADR) estimator for training an unbiased SSL model. Under various MNAR settings and ablations, our method not only significantly outperforms existing baselines, but also surpasses other label bias removal SSL methods.\n", "keywords": "Semi-Supervised Learning;Missing Not At Random;Image Classification", "primary_area": "", "supplementary_material": "/attachment/b5903a5ec9797f00763d55740ae6bbf0d2d7ff86.zip", "author": "Xinting Hu;Yulei Niu;Chunyan Miao;Xian-Sheng Hua;Hanwang Zhang", "authorids": "~Xinting_Hu1;~Yulei_Niu1;~Chunyan_Miao1;~Xian-Sheng_Hua1;~Hanwang_Zhang3", "gender": "M;F;M;M;", "homepage": "https://yuleiniu.github.io;http://www.ntulily.org/ascymiao/;;https://mreallab.github.io/index.html;https://joyhuyy1412.github.io/", "dblp": "165/2982;m/ChunyanMiao;56/5807-1;79/8116.html;222/7753", "google_scholar": "WXd3dDwAAAAJ;https://scholar.google.com.tw/citations?user=fmXGRJgAAAAJ;https://scholar.google.co.uk/citations?user=6G-l4o0AAAAJ;YG0DFyYAAAAJ;", "orcid": ";0000-0002-0300-3448;;;", "linkedin": ";;xshua;;", "or_profile": "~Yulei_Niu1;~Chunyan_Miao1;~Xian-Sheng_Hua1;~Hanwang_Zhang3;~Joy_Hu1", "aff": "Columbia University;School of Computer Science and Engineering, Nanyang Technological University;Alibaba Group;;Nanyang Technological University", "aff_domain": "columbia.edu;scse.ntu.edu.sg;alibaba-inc.com;;ntu.edu.sg", "position": "Postdoc;Full Professor;Distinguished Engineer;;PhD student", "bibtex": "@inproceedings{\nhu2022on,\ntitle={On Non-Random Missing Labels in Semi-Supervised Learning},\nauthor={Xinting Hu and Yulei Niu and Chunyan Miao and Xian-Sheng Hua and Hanwang Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=6yVvwR9H9Oj}\n}", "github": "", "project": "", "reviewers": "9j41;Fenu;D3Mi", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "2;3;4", "correctness": "3;4;4", "technical_novelty": "3;3;3", 
"empirical_novelty": "3;2;3", "wc_summary_paper": "85;71;77", "wc_summary_review": "5;50;37", "wc_main_review": "55;185;324", "wc_review": "145;306;438", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "350;655;647", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 77.66666666666667, 5.734883511361751 ], "wc_summary_review_avg": [ 30.666666666666668, 18.909139471577113 ], "wc_main_review_avg": [ 188.0, 109.83927652104536 ], "wc_review_avg": [ 296.3333333333333, 119.81188959735545 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 550.6666666666666, 141.93034285247896 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2589366795376584946&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=6yVvwR9H9Oj", "email": "columbia.edu;scse.ntu.edu.sg;alibaba-inc.com;;ntu.edu.sg", "author_num": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Columbia University;Nanyang Technological University;Alibaba Group", "aff_unique_dep": ";School of Computer Science and Engineering;", "aff_unique_url": "https://www.columbia.edu;https://www.ntu.edu.sg;https://www.alibaba.com", "aff_unique_abbr": "Columbia;NTU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "United States;Singapore;China" }, { "id": "6ya8C6sCiD", "title": "Multi-Agent Language Learning: Symbolic Mapping", "track": "main", "status": "Reject", "tldr": "", "abstract": "The study of emergent communication has long been devoted to coax neural network agents to learn a language sharing similar properties with human language. In this paper, we try to find a natural way to help agents learn a compositional and symmetric language in complex settings like dialog games. Inspired by the theory that human language was originated from simple interactions, we hypothesize that language may evolve from simple tasks to difficult tasks. We propose a novel architecture called symbolic mapping as a basic component of the communication system of agent. We find that symbolic mapping learned in simple referential games can notably promote language learning in difficult tasks. Further, we explore vocabulary expansion, and show that with the help of symbolic mapping, agents can easily learn to use new symbols when the environment becomes more complex. All in all, we probe into how symbolic mapping helps language learning and find that a process from simplicity to complexity can serve as a natural way to help multi-agent language learning. 
", "keywords": "emergent communication;multi-agent reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Yicheng Feng;Zongqing Lu", "authorids": "~Yicheng_Feng1;~Zongqing_Lu2", "gender": "M;", "homepage": "https://takenpeanut.github.io/;", "dblp": "340/4016;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Yicheng_Feng1;~Zongqing_Lu2", "aff": "Peking University;", "aff_domain": "pku.edu.cn;", "position": "PhD student;", "bibtex": "@misc{\nfeng2022multiagent,\ntitle={Multi-Agent Language Learning: Symbolic Mapping},\nauthor={Yicheng Feng and Zongqing Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=6ya8C6sCiD}\n}", "github": "", "project": "", "reviewers": "rYPh;ZcF5;CC9y;Litw;Z2BE", "site": "https://openreview.net/forum?id=6ya8C6sCiD", "pdf_size": 0, "recommendation": "3;5;6;6;6", "confidence": "2;3;4;4;4", "correctness": "2;4;3;2;3", "technical_novelty": "1;2;3;2;4", "empirical_novelty": "1;0;2;3;3", "wc_summary_paper": "97;123;167;343;87", "wc_summary_review": "108;27;53;72;70", "wc_main_review": "447;381;1147;968;250", "wc_review": "652;531;1367;1383;407", "wc_reply_reviewers": "0;0;135;22;0", "wc_reply_authors": "549;326;1312;498;74", "reply_reviewers": "0;0;1;1;0", "reply_authors": "1;1;3;3;1", "recommendation_avg": [ 5.2, 1.16619037896906 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 1.019803902718557 ], "empirical_novelty_avg": [ 1.8, 1.1661903789690602 ], "wc_summary_paper_avg": [ 163.4, 93.96509990416654 ], "wc_summary_review_avg": [ 66.0, 26.48018126826174 ], "wc_main_review_avg": [ 638.6, 352.43529902664403 ], "wc_review_avg": [ 868.0, 421.18214587040603 ], "wc_reply_reviewers_avg": [ 31.4, 52.49609509287333 ], "wc_reply_authors_avg": [ 551.8, 414.7492736581946 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 0.9797958971132713 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9861168645694258, "corr_recommendation_correctness": 0.2750095491084634, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12606348266319538306&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "QUERY EFFICIENT DECISION BASED SPARSE ATTACKS AGAINST BLACK-BOX DEEP LEARNING MODELS", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6170", "id": "73MEhZ0anV", "poster": "", "openreview": "https://openreview.net/forum?id=73MEhZ0anV", "slides": "https://iclr.cc/virtual/2022/poster/6170", "video": "https://iclr.cc/virtual/2022/poster/6170", "author_site": "Viet Vo, Ehsan Abbasnejad, Damith Ranasinghe", "tldr": "", "abstract": "Despite our best efforts, deep learning models remain highly vulnerable to even tiny adversarial perturbations applied to the inputs. The ability to extract information from solely the output of a machine learning model to craft adversarial perturbations to black-box models is a practical threat against real-world systems, such as Machine Learning as a Service (MLaaS), particularly $sparse~attacks$. The realization of sparse attacks in black-box settings demonstrates that machine learning models are more vulnerable than we believe. 
This is because these attacks aim to $minimize~the~number~of~perturbed~pixels$\u2014measured by the $l_0$ norm\u2014required to mislead a model by $solely$ observing the decision ($the~predicted~label$) returned to a model query; the so-called $decision-based~setting$. But such an attack leads to an NP-hard optimization problem. We develop an evolution-based algorithm\u2014$SparseEvo$\u2014for the problem and evaluate it against both convolutional deep neural networks and $vision~transformers$. Notably, vision transformers are yet to be investigated under a decision-based attack setting. SparseEvo requires significantly fewer queries than the state-of-the-art sparse attack $Pointwise$ for both untargeted and targeted attacks. The attack algorithm, although conceptually simple, is competitive with only a limited query budget against the state-of-the-art gradient-based $white-box$ attacks in standard computer vision tasks such as $ImageNet$. Importantly, the query-efficient SparseEvo, along with decision-based attacks in general, raises new questions regarding the safety of deployed systems and poses new directions to study and understand the robustness of machine learning models.", "keywords": "decision-based attacks;sparse attacks;evolution algorithms;vision transformer;convolutional neural network", "primary_area": "", "supplementary_material": "", "author": "Viet Vo;Ehsan M Abbasnejad;Damith Ranasinghe", "authorids": "~Viet_Vo1;~Ehsan_M_Abbasnejad1;damith.ranasinghe@adelaide.edu.au", "gender": "M;M;", "homepage": "https://vietvo89.github.io/;https://ehsanabb.github.io/;", "dblp": "308/6866;30/11191;", "google_scholar": "https://scholar.google.com.au/citations?user=u5H3IooAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0001-6413-8875;;", "linkedin": "viet-vo-75097835/;;", "or_profile": "~Viet_Vo1;~Ehsan_M_Abbasnejad1;damith.ranasinghe@adelaide.edu.au", "aff": "The University of Adelaide;University of Adelaide;", "aff_domain": "adelaide.edu.au;adelaide.edu.au;", "position": "PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nvo2022query,\ntitle={{QUERY} {EFFICIENT} {DECISION} {BASED} {SPARSE} {ATTACKS} {AGAINST} {BLACK}-{BOX} {DEEP} {LEARNING} {MODELS}},\nauthor={Viet Vo and Ehsan M Abbasnejad and Damith Ranasinghe},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=73MEhZ0anV}\n}", "github": "", "project": "", "reviewers": "HSPi;8jjY;DgSu;t1fV", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;3;2;5", "correctness": "3;3;4;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "43;64;59;87", "wc_summary_review": "37;71;9;29", "wc_main_review": "371;144;53;81", "wc_review": "451;279;121;197", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1905;601;76;451", "reply_reviewers": "0;0;0;0", "reply_authors": "4;2;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.25, 15.75396775418815 ], "wc_summary_review_avg": [ 36.5, 22.377444000600246 ], "wc_main_review_avg": [ 162.25, 124.94673865291563 ], "wc_review_avg": [ 262.0, 122.59282197584001 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 758.25, 689.1354638240583 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 
], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10824573856366104653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=73MEhZ0anV", "email": "adelaide.edu.au;adelaide.edu.au;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Adelaide", "aff_unique_dep": "", "aff_unique_url": "https://www.adelaide.edu.au", "aff_unique_abbr": "Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "74cDdRwm4NV", "title": "Learning to Shape Rewards using a Game of Two Partners", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reward shaping (RS) is a powerful method in reinforcement learning (RL) for overcoming the problem of sparse or uninformative rewards. However, RS typically relies on manually engineered shaping-reward functions whose construction is time consuming and error-prone. It also requires domain knowledge which runs contrary to the goal of autonomous learning. We introduce Reinforcement Learning Optimal Shaping Algorithm (ROSA), an automated RS framework in which the shaping reward function is constructed in a novel Markov game between two agents. A reward-shaping agent (Shaper) uses switching controls to determine which states to add shaping rewards and their optimal values while the other agent (Controller) learns the optimal policy for the task using these shaped rewards. We prove that ROSA, which easily adopts existing RL algorithms, learns to construct a shaping reward function that is tailored to the task thus ensuring efficient convergence to high performance policies. 
We demonstrate ROSA\u2019s congenial properties in three carefully designed experiments and show its superior performance against state-of-the-art RS algorithms in challenging sparse reward environments.", "keywords": "Reinforcement learning;Reward Shaping;Markov game;Sparse rewards", "primary_area": "", "supplementary_material": "", "author": "David Henry Mguni;Jianhong Wang;Taher Jafferjee;Nicolas Perez-Nieves;Wenbin Song;Feifei Tong;Hui Chen;Jiangcheng Zhu;Yaodong Yang;Jun Wang", "authorids": "~David_Henry_Mguni1;~Jianhong_Wang1;~Taher_Jafferjee1;~Nicolas_Perez-Nieves1;~Wenbin_Song1;~Feifei_Tong1;~Hui_Chen5;~Jiangcheng_Zhu1;~Yaodong_Yang1;~Jun_Wang2", "gender": "M;M;Not Specified;;M;M;;M;M;M", "homepage": ";https://hsvgbkhgbv.github.io/;https://atlashugs.github.io/;;https://github.com/cherrymilk;http://huaweicloud.com;https://argmax.me/;;https://www.yangyaodong.com;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "217/2369;;267/1551;;;;;202/5904.html;170/1496-1;w/JunWang12", "google_scholar": "K-_yzBsAAAAJ;K1FKF3IAAAAJ;;OqOeYNoAAAAJ;;;;ZosT8hcAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;;0000-0003-1586-0399;;;;;0000-0001-8132-5613;", "linkedin": ";jianhong-wang-45995b100/;;;;;;https://cn.linkedin.com/in/%E7%96%86%E6%88%90-%E6%9C%B1-85672b169;yaodong-yang;", "or_profile": "~David_Henry_Mguni1;~Jianhong_Wang1;~Taher_Jafferjee1;~Nicolas_Perez-Nieves1;~Wenbin_Song1;~Feifei_Tong1;~Hui_Chen5;~Jiangcheng_Zhu1;~Yaodong_Yang1;~Jun_Wang2", "aff": "Queen Mary University, London;Imperial College London;Huawei Technologies Ltd.;Imperial College London;Shanghaitech University;Huawei Technologies Ltd.;;Huawei Technologies Ltd.;King's College London;University College London", "aff_domain": "qmul.ac.uk;ic.ac.uk;huawei.com;ic.ac.uk;shanghaitech.edu.cn;huawei.com;;huawei.com;kcl.ac.uk;ucl.ac.uk", "position": "Lecturer;PhD student;Researcher;PhD student;MS student;Engineer;;Researcher;Assistant Professor;Professor", "bibtex": "@misc{\nmguni2022learning,\ntitle={Learning to Shape Rewards using a Game of Two Partners},\nauthor={David Henry Mguni and Jianhong Wang and Taher Jafferjee and Nicolas Perez-Nieves and Wenbin Song and Feifei Tong and Hui Chen and Jiangcheng Zhu and Yaodong Yang and Jun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=74cDdRwm4NV}\n}", "github": "", "project": "", "reviewers": "g7fZ;XHiG;uREp;sEbz", "site": "https://openreview.net/forum?id=74cDdRwm4NV", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;4", "correctness": "2;2;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "111;93;217;134", "wc_summary_review": "20;62;64;26", "wc_main_review": "535;696;1064;405", "wc_review": "666;851;1345;565", "wc_reply_reviewers": "0;363;243;130", "wc_reply_authors": "787;1123;1048;724", "reply_reviewers": "0;1;1;2", "reply_authors": "1;2;2;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 138.75, 47.457217575412066 ], "wc_summary_review_avg": [ 43.0, 20.12461179749811 ], "wc_main_review_avg": [ 675.0, 247.11434600200775 ], "wc_review_avg": [ 856.75, 299.96864419468915 ], "wc_reply_reviewers_avg": [ 184.0, 134.43771792172018 ], "wc_reply_authors_avg": [ 920.5, 168.59492874935472 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], 
"reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2724759105135043079&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;1;3;2;2;4;5", "aff_unique_norm": "Queen Mary University of London;Imperial College London;Huawei;ShanghaiTech University;King's College London;University College London", "aff_unique_dep": ";;Huawei Technologies;;;", "aff_unique_url": "https://www.qmul.ac.uk;https://www.imperial.ac.uk;https://www.huawei.com;https://www.shanghaitech.edu.cn;https://www.kcl.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "QMUL;ICL;Huawei;ShanghaiTech;KCL;UCL", "aff_campus_unique_index": "0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;1;0;1;1;1;0;0", "aff_country_unique": "United Kingdom;China" }, { "title": "Evolutionary Diversity Optimization with Clustering-based Selection for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5983", "id": "74x5BXs4bWD", "poster": "", "openreview": "https://openreview.net/forum?id=74x5BXs4bWD", "slides": "https://iclr.cc/virtual/2022/poster/5983", "video": "https://iclr.cc/virtual/2022/poster/5983", "author_site": "Yutong Wang, Ke Xue, Chao Qian", "tldr": "", "abstract": "Reinforcement Learning (RL) has achieved significant successes, which aims to obtain a single policy maximizing the expected cumulative rewards for a given task. However, in many real-world scenarios, e.g., navigating in complex environments and controlling robots, one may need to find a set of policies having both high rewards and diverse behaviors, which can bring better exploration and robust few-shot adaptation. Recently, some methods have been developed by using evolutionary techniques, including iterative reproduction and selection of policies. However, due to the inefficient selection mechanisms, these methods cannot fully guarantee both high quality and diversity. In this paper, we propose EDO-CS, a new Evolutionary Diversity Optimization algorithm with Clustering-based Selection. In each iteration, the policies are divided into several clusters based on their behaviors, and a high-quality policy is selected from each cluster for reproduction. EDO-CS also adaptively balances the importance between quality and diversity in the reproduction process. 
Experiments on various (i.e., deceptive and multi-modal) continuous control tasks show the superior performance of EDO-CS over previous methods; that is, EDO-CS can efficiently achieve a set of policies with both high quality and diversity while previous methods cannot.", "keywords": "Reinforcement learning;Quality-Diversity;Evolutionary algorithms", "primary_area": "", "supplementary_material": "", "author": "Yutong Wang;Ke Xue;Chao Qian", "authorids": "~Yutong_Wang2;~Ke_Xue1;~Chao_Qian1", "gender": "F;M;M", "homepage": "http://www.lamda.nju.edu.cn/wangyt/;http://www.lamda.nju.edu.cn/xuek/;http://www.lamda.nju.edu.cn/qianc/", "dblp": ";93/2469-1;84/8508-1", "google_scholar": ";78bZVOwAAAAJ;", "orcid": ";0000-0001-6789-2670;", "linkedin": ";;", "or_profile": "~Yutong_Wang2;~Ke_Xue1;~Chao_Qian1", "aff": "Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu;nju.edu.cn;nju.edu", "position": "MS student;MS student;Associate Professor", "bibtex": "@inproceedings{\nwang2022evolutionary,\ntitle={Evolutionary Diversity Optimization with Clustering-based Selection for Reinforcement Learning},\nauthor={Yutong Wang and Ke Xue and Chao Qian},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=74x5BXs4bWD}\n}", "github": "", "project": "", "reviewers": "5sru;SvUL;oa8b;RfKF", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;4;4;3", "correctness": "3;4;3;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "89;155;111;71", "wc_summary_review": "53;131;76;74", "wc_main_review": "740;567;507;338", "wc_review": "882;853;694;483", "wc_reply_reviewers": "399;170;104;0", "wc_reply_authors": "1719;978;1092;0", "reply_reviewers": "1;2;1;0", "reply_authors": "3;3;3;0", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 106.5, 31.38072656902641 ], "wc_summary_review_avg": [ 83.5, 28.86607004772212 ], "wc_main_review_avg": [ 538.0, 143.7062977047283 ], "wc_review_avg": [ 728.0, 158.52602310030994 ], "wc_reply_reviewers_avg": [ 168.25, 146.35978785171835 ], "wc_reply_authors_avg": [ 947.25, 615.3817412793461 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9733285267845754, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13889751550076505766&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=74x5BXs4bWD", "email": "nju.edu;nju.edu.cn;nju.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "77_zstKV8HQ", "title": "SimMER: Simple Maximization of Entropy and Rank for Self-supervised Representation Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Consistency regularization, which enforces consistency across a model's responses to different views of the same input, is widely used for self-supervised 
image representation learning. However, consistency regularization can be trivially achieved by collapsing the model into a constant mapping. To prevent this, existing methods often use negative pairs (contrastive learning) or ad hoc architecture constructs. Inspired by SimSiam's alternating optimization hypothesis, we propose a novel optimization target, SimMER, for self-supervised learning that explicitly avoids model collapse by balancing consistency (total variance minimization) and entropy of inputs' representations (entropy maximization). Combining consistency regularization with entropy maximization alone, the method can achieve performance on par with the state-of-the-art. Furthermore, we introduce a linear independence loss to further increase the performance by removing linear dependency along the feature dimension of the batch representation matrix (rank maximization), which has both anti-collapsing and redundancy-removal effects. With both entropy and rank maximization, our method surpasses the state-of-the-art on CIFAR-10 and Mini-ImageNet under the standard linear evaluation protocol.\n", "keywords": "self-supervised learning;representation learning;computer vision;image classification", "primary_area": "", "supplementary_material": "", "author": "Zhengyu Yang;Zijian Hu;Xuefeng Hu;Ram Nevatia", "authorids": "~Zhengyu_Yang2;~Zijian_Hu2;~Xuefeng_Hu1;~Ram_Nevatia1", "gender": ";;M;M", "homepage": "https://zhengyuyang.com;https://www.zijianhu.com/;https://xuefenghu.me;http://iris.usc.edu/people/nevatia/", "dblp": "159/1188-3;218/6669-1;;n/RamakantNevatia", "google_scholar": ";jQ9GwCoAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=EUMYhUvzt6IC", "orcid": ";0000-0001-9490-944X;;", "linkedin": ";;xuefeng-hu-137b9485/;", "or_profile": "~Zhengyu_Yang2;~Zijian_Hu2;~Xuefeng_Hu1;~Ram_Nevatia1", "aff": "Meta;ByteDance Inc.;University of Southern California;University of Southern California", "aff_domain": "meta.com;bytedance.com;usc.edu;usc.edu", "position": "Researcher;Researcher;PhD student;Full Professor", "bibtex": "@misc{\nyang2022simmer,\ntitle={Sim{MER}: Simple Maximization of Entropy and Rank for Self-supervised Representation Learning},\nauthor={Zhengyu Yang and Zijian Hu and Xuefeng Hu and Ram Nevatia},\nyear={2022},\nurl={https://openreview.net/forum?id=77_zstKV8HQ}\n}", "github": "", "project": "", "reviewers": "mp7H;3gyD;CaG3;nJPh", "site": "https://openreview.net/forum?id=77_zstKV8HQ", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;0;2;2", "wc_summary_paper": "165;53;51;59", "wc_summary_review": "32;28;46;21", "wc_main_review": "183;172;194;178", "wc_review": "380;253;291;258", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 82.0, 48.010415536631214 ], "wc_summary_review_avg": [ 31.75, 9.12071817347735 ], "wc_main_review_avg": [ 181.75, 8.073877630977572 ], "wc_review_avg": [ 295.5, 50.92396292512986 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, 
"corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R-E51lvez2YJ:scholar.google.com/&scioq=SimMER:+Simple+Maximization+of+Entropy+and+Rank+for+Self-supervised+Representation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Meta;ByteDance;University of Southern California", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.bytedance.com;https://www.usc.edu", "aff_unique_abbr": "Meta;ByteDance;USC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "7ADMMyZpeY", "title": "A theoretically grounded characterization of feature representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "A large body of work has explored how learned feature representations can be useful for a variety of downstream tasks. This is true even when the downstream tasks differ greatly from the actual objective used to (pre)train the feature representation. This observation underlies the success of, e.g., few-shot learning, transfer learning and self-supervised learning, among others. However, very little is understood about why such transfer is successful, and more importantly, how one should choose the pre-training task. As a first step towards this understanding, we ask: what makes a feature representation good for a target task? We present simple, intuitive measurements of the feature space that are good predictors of downstream task performance. We present theoretical results showing how these measurements can be used to bound the error of the downstream classifiers, and show empirically that these bounds correlate well with actual downstream performance. 
Finally, we show that our bounds are practically useful for choosing the right pre-trained representation for a target task.", "keywords": "features;analysis;generalization;transfer;few-shot", "primary_area": "", "supplementary_material": "/attachment/407a93ef572da1be142b840b636b110a32ac96cf.zip", "author": "Bharath Hariharan;Cheng Perng Phoo", "authorids": "~Bharath_Hariharan3;~Cheng_Perng_Phoo1", "gender": "M;M", "homepage": "https://cpphoo.github.io/;http://home.bharathh.info", "dblp": "226/0521;05/8412", "google_scholar": "kt9D2usAAAAJ;TpglobcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Cheng_Perng_Phoo1;~Bharath_Hariharan2", "aff": "Meta Facebook;Cornell University", "aff_domain": "fb.com;cornell.edu", "position": "Intern;Assistant Professor", "bibtex": "@misc{\nhariharan2022a,\ntitle={A theoretically grounded characterization of feature representations },\nauthor={Bharath Hariharan and Cheng Perng Phoo},\nyear={2022},\nurl={https://openreview.net/forum?id=7ADMMyZpeY}\n}", "github": "", "project": "", "reviewers": "sCcc;tEiy;PXiU;DEHm;J9Rs", "site": "https://openreview.net/forum?id=7ADMMyZpeY", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "3;3;3;3;2", "correctness": "4;4;3;3;3", "technical_novelty": "2;2;3;3;4", "empirical_novelty": "3;2;3;2;4", "wc_summary_paper": "55;69;74;72;76", "wc_summary_review": "197;21;92;31;60", "wc_main_review": "124;1019;1267;194;326", "wc_review": "376;1109;1433;297;462", "wc_reply_reviewers": "376;198;0;33;284", "wc_reply_authors": "718;222;374;255;349", "reply_reviewers": "2;1;0;1;1", "reply_authors": "3;2;1;1;2", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 2.8, 0.39999999999999997 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 69.2, 7.467261881037787 ], "wc_summary_review_avg": [ 80.2, 63.4268082123009 ], "wc_main_review_avg": [ 586.0, 466.03819585952397 ], "wc_review_avg": [ 735.4, 452.17987571319446 ], "wc_reply_reviewers_avg": [ 178.2, 143.90886004690609 ], "wc_reply_authors_avg": [ 383.6, 176.50223794615184 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9185586535436918, "corr_recommendation_correctness": -0.5833333333333334, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:q8Mxm0QV0TQJ:scholar.google.com/&scioq=A+theoretically+grounded+characterization+of+feature+representations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Meta;Cornell University", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.cornell.edu", "aff_unique_abbr": "Meta;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "7AssAnH5vyJ", "title": "A Game-Theoretic Approach for Improving Generalization Ability of TSP Solvers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we shed new light on the study of how to improve the generalization ability of deep learning-based solvers for the Traveling Salesman Problem (TSP). 
We build a two-player zero-sum game between a trainable solver and a task generator, where the solver aims to solve instances provided by the generator, and the generator aims to generate increasingly difficult instances for the solver. Grounded in the \textsl{Policy Space Response Oracle} (PSRO) framework, our two-player game allows us to obtain a behaviourally diverse population of powerful solvers, which we combine via a model mixing method to achieve strong generalization ability on various tasks. Experimentally, we achieve state-of-the-art results on a general TSP instance generation method on which the performance of other deep learning-based methods degrades sharply. On realistic instances from TSPLib, we attain an approximately \textbf{12\%} improvement over the base model. Furthermore, we empirically illustrate that, as the solvers' performance improves, the obtained strategy's exploitability keeps decreasing, showing gradual convergence to the Nash equilibrium.", "keywords": "Combinatorial Optimization Problem;Policy Space Response Oracle;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Chenguang Wang;Yaodong Yang;Congying Han;Tiande Guo;Haifeng Zhang;Jun Wang", "authorids": "~Chenguang_Wang2;~Yaodong_Yang1;~Congying_Han1;~Tiande_Guo1;~Haifeng_Zhang3;~Jun_Wang2", "gender": "M;M;F;M;;M", "homepage": "https://github.com/Wastedzz/cgwang;https://www.yangyaodong.com;http://people.ucas.edu.cn/~hancy;https://people.ucas.ac.cn/~tdguo?language=en;https://pkuzhf.github.io;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "62/3432-11;170/1496-1;07/2808;;93/7133-2;w/JunWang12", "google_scholar": "Ptf3uO0AAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;;;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": "0009-0008-4097-1174;0000-0001-8132-5613;0000-0002-3445-4620;0000-0002-3804-9163;;", "linkedin": ";yaodong-yang;;;;", "or_profile": "~Chenguang_Wang2;~Yaodong_Yang1;~Congying_Han1;~Tiande_Guo1;~Haifeng_Zhang3;~Jun_Wang2", "aff": "Chinese Academy of Sciences;King's College London;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;University College London", "aff_domain": "ucas.ac.cn;kcl.ac.uk;ucas.ac.cn;ucas.ac.cn;ia.ac.cn;ucl.ac.uk", "position": "MS student;Assistant Professor;Full Professor;Full Professor;Associate Professor;Professor", "bibtex": "@misc{\nwang2022a,\ntitle={A Game-Theoretic Approach for Improving Generalization Ability of {TSP} Solvers},\nauthor={Chenguang Wang and Yaodong Yang and Congying Han and Tiande Guo and Haifeng Zhang and Jun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=7AssAnH5vyJ}\n}", "github": "", "project": "", "reviewers": "o8q8;1pfj;KDpS;79Pz", "site": "https://openreview.net/forum?id=7AssAnH5vyJ", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "51;100;250;310", "wc_summary_review": "44;112;74;123", "wc_main_review": "177;1035;685;214", "wc_review": "272;1247;1009;647", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], 
"wc_summary_paper_avg": [ 177.75, 105.85455823912355 ], "wc_summary_review_avg": [ 88.25, 31.355820831226854 ], "wc_main_review_avg": [ 527.75, 354.78681979464795 ], "wc_review_avg": [ 793.75, 369.29891348337327 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5699178640657750054&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;2;0;3", "aff_unique_norm": "Chinese Academy of Sciences;King's College London;University of Chinese Academy of Sciences;University College London", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cas.cn;https://www.kcl.ac.uk;http://www.ucas.ac.cn;https://www.ucl.ac.uk", "aff_unique_abbr": "CAS;KCL;UCAS;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1", "aff_country_unique": "China;United Kingdom" }, { "id": "7AzOUBeajwl", "title": "Text Style Transfer with Confounders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing methods for style transfer operate either with paired sentences or distributionally matched corpora which differ only in the desired style. In this paper, we relax this restriction and consider data sources with additional confounding differences, from which the desired style needs to be inferred. Specifically, we first learn an invariant style classifier that takes out nuisance variation, and then introduce an orthogonal classifier that highlights the confounding cues. The resulting pair of classifiers guide us to transfer text in the specified direction, creating sentences of the type not seen during training. Experiments show that using positive and negative review datasets from different categories, we can successfully transfer the sentiment without changing the category.", "keywords": "style transfer;confounder;invariance", "primary_area": "", "supplementary_material": "", "author": "Tianxiao Shen;Regina Barzilay;Tommi S. Jaakkola", "authorids": "~Tianxiao_Shen1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "gender": ";female;", "homepage": "https://shentianxiao.github.io/;https://www.regina.csail.mit.edu/;", "dblp": "185/5533;b/ReginaBarzilay;", "google_scholar": "aYtllNgAAAAJ;;", "orcid": ";;", "linkedin": "shentianxiao/;;", "or_profile": "~Tianxiao_Shen1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;", "aff_domain": "mit.edu;mit.edu;", "position": "PhD student;Professor;", "bibtex": "@misc{\nshen2022text,\ntitle={Text Style Transfer with Confounders},\nauthor={Tianxiao Shen and Regina Barzilay and Tommi S. 
Jaakkola},\nyear={2022},\nurl={https://openreview.net/forum?id=7AzOUBeajwl}\n}", "github": "", "project": "", "reviewers": "6xK3;9yNQ;Lb6o;fSZh", "site": "https://openreview.net/forum?id=7AzOUBeajwl", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "88;100;131;244", "wc_summary_review": "18;34;35;40", "wc_main_review": "310;227;252;534", "wc_review": "416;361;418;818", "wc_reply_reviewers": "0;0;79;466", "wc_reply_authors": "562;563;481;1198", "reply_reviewers": "0;0;1;2", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 140.75, 61.64160526787082 ], "wc_summary_review_avg": [ 31.75, 8.257572282456872 ], "wc_main_review_avg": [ 330.75, 121.14737925353565 ], "wc_review_avg": [ 503.25, 183.15481839143627 ], "wc_reply_reviewers_avg": [ 136.25, 193.0937272414617 ], "wc_reply_authors_avg": [ 701.0, 288.8658858363168 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IxCArSlPeqMJ:scholar.google.com/&scioq=Text+Style+Transfer+with+Confounders&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Optimal ANN-SNN Conversion for High-accuracy and Ultra-low-latency Spiking Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5899", "id": "7B3IJMM1k_M", "poster": "", "openreview": "https://openreview.net/forum?id=7B3IJMM1k_M", "slides": "https://iclr.cc/virtual/2022/poster/5899", "video": "https://iclr.cc/virtual/2022/poster/5899", "author_site": "Tong Bu, Wei Fang, Jianhao Ding, PENGLIN DAI, Zhaofei Yu, Tiejun Huang", "tldr": "", "abstract": "Spiking Neural Networks (SNNs) have attracted great attention due to their distinctive properties of low power consumption and fast inference on neuromorphic hardware. As the most effective method to obtain deep SNNs, ANN-SNN conversion has achieved performance comparable to ANNs on large-scale datasets. Despite this, it requires long time-steps to match the firing rates of SNNs to the activation of ANNs. As a result, the converted SNN suffers severe performance degradation problems with short time-steps, which hamper the practical application of SNNs. In this paper, we theoretically analyze ANN-SNN conversion error and derive the estimated activation function of SNNs. Then we propose the quantization clip-floor-shift activation function to replace the ReLU activation function in source ANNs, which can better approximate the activation function of SNNs. We prove that the expected conversion error between SNNs and ANNs is zero, enabling us to achieve high-accuracy and ultra-low-latency SNNs. 
We evaluate our method on the CIFAR-10/100 and ImageNet datasets, and show that it outperforms state-of-the-art ANN-SNN conversion and directly trained SNNs in both accuracy and time-steps. To the best of our knowledge, this is the first work to explore high-performance ANN-SNN conversion with ultra-low latency (4 time-steps). Code is available at https://github.com/putshua/SNN_conversion_QCFS", "keywords": "Spiking Neural Networks;ANN-SNN Conversion;Ultra-low Latency;Quantization Clip-floor-shift Activation", "primary_area": "", "supplementary_material": "/attachment/8cca47a6a2fc40304aad64e6c22b5fd8bae3f00e.zip", "author": "Tong Bu;Wei Fang;Jianhao Ding;PENGLIN DAI;Zhaofei Yu;Tiejun Huang", "authorids": "~Tong_Bu1;~Wei_Fang2;~Jianhao_Ding1;~PENGLIN_DAI1;~Zhaofei_Yu1;~Tiejun_Huang1", "gender": ";;M;;M;M", "homepage": ";https://fangwei123456.github.io/;https://dingjianhao.github.io/;;https://yuzhaofei.github.io;https://idm.pku.edu.cn/~tjhuang/", "dblp": ";;128/2534;;166/0573;h/TiejunHuang", "google_scholar": ";https://scholar.google.com.hk/citations?user=e2lED2gAAAAJ;4rDfCSsAAAAJ;;qaUgD50AAAAJ;https://scholar.google.com.tw/citations?user=knvEK4AAAAAJ", "orcid": ";;;;;0000-0002-4234-6099", "linkedin": ";;;;;", "or_profile": "~Tong_Bu1;~Wei_Fang2;~Jianhao_Ding1;~PENGLIN_DAI1;~Zhaofei_Yu1;~Tiejun_Huang1", "aff": ";School of Computer Science, Peking University;Institute of Automation, Chinese Academy of Sciences;;Peking University;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": ";pku.edu.cn;ia.ac.cn;;pku.edu.cn;ict.ac.cn", "position": ";PhD student;Intern;;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nbu2022optimal,\ntitle={Optimal {ANN}-{SNN} Conversion for High-accuracy and Ultra-low-latency Spiking Neural Networks},\nauthor={Tong Bu and Wei Fang and Jianhao Ding and PENGLIN DAI and Zhaofei Yu and Tiejun Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7B3IJMM1k_M}\n}", "github": "", "project": "", "reviewers": "hn22;LAGa;d5di;5DgR", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;5;5", "correctness": "3;3;2;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "12;53;50;64", "wc_summary_review": "19;45;43;22", "wc_main_review": "189;614;164;223", "wc_review": "220;712;257;309", "wc_reply_reviewers": "18;89;217;33", "wc_reply_authors": "523;1687;1288;332", "reply_reviewers": "1;1;2;1", "reply_authors": "1;4;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 44.75, 19.613452016409553 ], "wc_summary_review_avg": [ 32.25, 11.818946653572814 ], "wc_main_review_avg": [ 297.5, 183.9272954185974 ], "wc_review_avg": [ 374.5, 197.40377402673943 ], "wc_reply_reviewers_avg": [ 89.25, 78.35934851694468 ], "wc_reply_authors_avg": [ 957.5, 552.5941096320155 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 280, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8156890759052445683&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=7B3IJMM1k_M", "email": 
";pku.edu.cn;ia.ac.cn;;pku.edu.cn;ict.ac.cn", "author_num": 6, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Peking University;Chinese Academy of Sciences", "aff_unique_dep": "School of Computer Science;Institute of Automation", "aff_unique_url": "http://www.pku.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "PKU;CAS", "aff_campus_unique_index": "0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "7Bc2U-dLJ6N", "title": "SGDEM: stochastic gradient descent with energy and momentum", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose SGDEM, Stochastic Gradient Descent with Energy and Momentum to solve a large class of general nonconvex stochastic optimization problems, based on the AEGD method that originated in the work [AEGD: Adaptive Gradient Descent with Energy. arXiv: 2010.05109]. SGDEM incorporates both energy and momentum at the same time so as to inherit their dual advantages. We show that SGDEM features an unconditional energy stability property, and derive energy-dependent convergence rates in the general nonconvex stochastic setting, as well as a regret bound in the online convex setting. A lower threshold for the energy variable is also provided. Our experimental results show that SGDEM converges faster than AEGD and generalizes better or at least as well as SGDM in training some deep neural networks.", "keywords": "stochastic optimization;energy stability;momentum", "primary_area": "", "supplementary_material": "", "author": "Hailiang Liu;Xuping Tian", "authorids": "~Hailiang_Liu1;~Xuping_Tian1", "gender": "M;", "homepage": "https://faculty.sites.iastate.edu/hliu/;https://math.iastate.edu/directory/xuping-tian/", "dblp": ";", "google_scholar": "Wq7IGEIAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Hailiang_Liu1;~Xuping_Tian1", "aff": "Iowa State University;Iowa State University", "aff_domain": "iastate.edu;iastate.edu", "position": "Full Professor;PhD student", "bibtex": "@misc{\nliu2022sgdem,\ntitle={{SGDEM}: stochastic gradient descent with energy and momentum},\nauthor={Hailiang Liu and Xuping Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=7Bc2U-dLJ6N}\n}", "github": "", "project": "", "reviewers": "Su6j;e7GR;TrUv;64qc", "site": "https://openreview.net/forum?id=7Bc2U-dLJ6N", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;5;5", "correctness": "3;4;2;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "60;49;97;59", "wc_summary_review": "28;58;67;284", "wc_main_review": "231;147;441;87", "wc_review": "319;254;605;430", "wc_reply_reviewers": "0;83;0;49", "wc_reply_authors": "205;313;335;52", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.25, 18.267115262131565 ], "wc_summary_review_avg": [ 109.25, 101.92000539638919 ], "wc_main_review_avg": [ 226.5, 133.9878725855441 ], "wc_review_avg": [ 402.0, 133.02819250068762 ], "wc_reply_reviewers_avg": [ 33.0, 35.12121865767189 ], "wc_reply_authors_avg": [ 226.25, 111.98967586344735 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 
0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FtNlRjkX-CUJ:scholar.google.com/&scioq=SGDEM:+stochastic+gradient+descent+with+energy+and+momentum&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neural Markov Controlled SDE: Stochastic Optimization for Continuous-Time Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6440", "id": "7DI6op61AY", "poster": "", "openreview": "https://openreview.net/forum?id=7DI6op61AY", "slides": "https://iclr.cc/virtual/2022/poster/6440", "video": "https://iclr.cc/virtual/2022/poster/6440", "author_site": "Sung Woo Park, Kyungjae Lee, Junseok Kwon", "tldr": "", "abstract": "We propose a novel probabilistic framework for modeling stochastic dynamics with the rigorous use of stochastic optimal control theory. The proposed model called the neural Markov controlled stochastic differential equation (CSDE) overcomes the fundamental and structural limitations of conventional dynamical models by introducing the following two components: (1) Markov dynamic programming to efficiently train the proposed CSDE and (2) multi-conditional forward-backward losses to provide rich information for accurate inference and to assure theoretical optimality. We demonstrate that our dynamical model efficiently generates a complex time series in the data space without extra networks while showing comparable performance against existing model-based methods on several datasets.", "keywords": "controlled stochastic differential equation;time-series prediction", "primary_area": "", "supplementary_material": "", "author": "Sung Woo Park;Kyungjae Lee;Junseok Kwon", "authorids": "~Sung_Woo_Park2;~Kyungjae_Lee1;~Junseok_Kwon5", "gender": "M;M;M", "homepage": ";https://sites.google.com/view/kyungjaelee;https://sites.google.com/view/cau-cvml/", "dblp": "92/6585;13/7265-1;04/425", "google_scholar": "B1xpjO8AAAAJ;https://scholar.google.co.kr/citations?user=OZZJagIAAAAJ;lwsaTnEAAAAJ", "orcid": ";0000-0003-0147-2715;", "linkedin": ";;", "or_profile": "~Sung_Woo_Park2;~Kyungjae_Lee1;~Junseok_Kwon5", "aff": "ChungAng University;ChungAng University;Chung-Ang University", "aff_domain": "cau.ac.kr;cau.ac.kr;cau.ac.kr", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\npark2022neural,\ntitle={Neural Markov Controlled {SDE}: Stochastic Optimization for Continuous-Time Data},\nauthor={Sung Woo Park and Kyungjae Lee and Junseok Kwon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7DI6op61AY}\n}", "github": "", "project": "", "reviewers": "poqQ;hDHJ;WGcS;NnV6", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;2;3;4", "correctness": "4;3;3;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;0;2;4", "wc_summary_paper": "141;106;49;81", "wc_summary_review": "132;25;38;47", "wc_main_review": "699;236;888;258", "wc_review": "972;367;975;386", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1875;679;1422;166", "reply_reviewers": "0;0;0;0", "reply_authors": "4;1;3;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 
0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 94.25, 33.71479645496915 ], "wc_summary_review_avg": [ 60.5, 42.01487831709144 ], "wc_main_review_avg": [ 520.25, 281.40928822624176 ], "wc_review_avg": [ 675.0, 298.5774606362644 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1035.5, 659.0267445255921 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.18417736717093933, "corr_recommendation_correctness": -0.3665083330689157, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6164729609389350545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=7DI6op61AY", "email": "cau.ac.kr;cau.ac.kr;cau.ac.kr", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Chungang University;Chung-Ang University", "aff_unique_dep": ";", "aff_unique_url": "http://www.cau.ac.kr;http://www.cau.ac.kr", "aff_unique_abbr": "CAU;CAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "$\\mathrm{SO}(2)$-Equivariant Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6798", "id": "7F9cOhdvfk_", "poster": "", "openreview": "https://openreview.net/forum?id=7F9cOhdvfk_", "slides": "https://iclr.cc/virtual/2022/poster/6798", "video": "https://iclr.cc/virtual/2022/poster/6798", "author_site": "Dian Wang, Robin Walters, Robert Platt", "tldr": "", "abstract": "Equivariant neural networks enforce symmetry within the structure of their convolutional layers, resulting in a substantial improvement in sample efficiency when learning an equivariant or invariant function. Such models are applicable to robotic manipulation learning which can often be formulated as a rotationally symmetric problem. This paper studies equivariant model architectures in the context of $Q$-learning and actor-critic reinforcement learning. We identify equivariant and invariant characteristics of the optimal $Q$-function and the optimal policy and propose equivariant DQN and SAC algorithms that leverage this structure. 
We present experiments that demonstrate that our equivariant versions of DQN and SAC can be significantly more sample efficient than competing algorithms on an important class of robotic manipulation problems.", "keywords": "Reinforcement Learning;Equivariance;Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/3a0036825585896b12b650624b8c513f3c7d408e.zip", "author": "Dian Wang;Robin Walters;Robert Platt", "authorids": "~Dian_Wang1;~Robin_Walters1;~Robert_Platt1", "gender": "M;M;", "homepage": "https://pointw.github.io/;http://www.robinwalters.com;http://www.ccs.neu.edu/home/rplatt/", "dblp": "191/1369-1;258/3416;39/5434", "google_scholar": "CckjtfQAAAAJ;fnprJmUAAAAJ;Z4Y5S2oAAAAJ", "orcid": ";;", "linkedin": "dianwang1007;;", "or_profile": "~Dian_Wang1;~Robin_Walters1;~Robert_Platt1", "aff": "Northeastern University;Northeastern University ;Northeastern University", "aff_domain": "northeastern.edu;northeastern.edu;neu.edu", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2022mathrmsoequivariant,\ntitle={\\${\\textbackslash}mathrm\\{{SO}\\}(2)\\$-Equivariant Reinforcement Learning},\nauthor={Dian Wang and Robin Walters and Robert Platt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7F9cOhdvfk_}\n}", "github": "", "project": "", "reviewers": "PtaK;nGwx;27S2;pbdx;cinR", "pdf_size": 0, "recommendation": "5;6;8;8;8", "confidence": "4;3;3;4;3", "correctness": "3;3;3;4;3", "technical_novelty": "2;3;2;2;3", "empirical_novelty": "2;3;3;4;3", "wc_summary_paper": "46;60;31;69;57", "wc_summary_review": "3;129;24;2;27", "wc_main_review": "230;397;268;225;138", "wc_review": "279;586;323;296;222", "wc_reply_reviewers": "0;0;40;0;0", "wc_reply_authors": "629;638;542;508;234", "reply_reviewers": "0;0;1;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 7.0, 1.2649110640673518 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 52.6, 13.062924634246347 ], "wc_summary_review_avg": [ 37.0, 47.146579939588406 ], "wc_main_review_avg": [ 251.6, 84.2486795148743 ], "wc_review_avg": [ 341.2, 126.79022044306099 ], "wc_reply_reviewers_avg": [ 8.0, 16.0 ], "wc_reply_authors_avg": [ 510.2, 146.79836511351206 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3227486121839514, "corr_recommendation_correctness": 0.3952847075210474, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=7F9cOhdvfk_", "email": "northeastern.edu;northeastern.edu;neu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "7HhX4mbern", "title": "Randomized Signature Layers for Signal Extraction in Time Series Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Time series analysis is a widespread task in Natural Sciences, Social Sciences, and Engineering. 
A fundamental problem is finding an expressive yet efficient-to-compute representation of the input time series to use as a starting point to perform arbitrary downstream tasks. \nIn this paper, we build upon recent work using the signature of a path as a feature map and investigate a computationally efficient technique to approximate these features based on linear random projections. We present several theoretical results to justify our approach and empirically validate that our random projections can effectively retrieve the underlying signature of a path.\nWe show the surprising performance of the proposed random features on several tasks, including (1) mapping the controls of Stochastic Differential Equations to the corresponding solutions and (2) using the random signatures as time series representation for classification tasks. Besides providing a new tool to extract signatures and further validating the high level of expressiveness of such features, we believe our results provide interesting conceptual links between several existing research areas, suggesting new intriguing directions for future investigations.", "keywords": "signature;random features;time series;SDE;differential equations;rough path", "primary_area": "", "supplementary_material": "", "author": "Enea Monzio Compagnoni;Luca Biggio;Antonio Orvieto;Thomas Hofmann;Josef Teichmann", "authorids": "~Enea_Monzio_Compagnoni1;~Luca_Biggio1;~Antonio_Orvieto3;~Thomas_Hofmann1;~Josef_Teichmann2", "gender": "M;M;M;M;M", "homepage": "https://eneamc.github.io/;;http://orvi.altervista.org/;http://www.da.inf.ethz.ch/;https://people.math.ethz.ch/~jteichma/", "dblp": "310/1851;279/2333;;h/ThHofmann;99/9501", "google_scholar": "6qKgak8AAAAJ;6HtmuegAAAAJ;xkuLyHoAAAAJ;T3hAyLkAAAAJ;6quAJUEAAAAJ", "orcid": "0009-0004-7094-2586;;;;0000-0003-0125-7484", "linkedin": "eneamc/;;antonio-orvieto-947ab0130/;thomas-hofmann-1ab2402/;", "or_profile": "~Enea_Monzio_Compagnoni1;~Luca_Biggio1;~Antonio_Orvieto3;~Thomas_Hofmann1;~Josef_Teichmann2", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "MS student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@misc{\ncompagnoni2022randomized,\ntitle={Randomized Signature Layers for Signal Extraction in Time Series Data},\nauthor={Enea Monzio Compagnoni and Luca Biggio and Antonio Orvieto and Thomas Hofmann and Josef Teichmann},\nyear={2022},\nurl={https://openreview.net/forum?id=7HhX4mbern}\n}", "github": "", "project": "", "reviewers": "g484;KwmV;5Jna;iWAG", "site": "https://openreview.net/forum?id=7HhX4mbern", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;5;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;1", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "62;146;16;36", "wc_summary_review": "51;33;30;20", "wc_main_review": "329;347;202;224", "wc_review": "442;526;248;280", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "302;419;571;753", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 65.0, 49.52776998815917 ], "wc_summary_review_avg": [ 33.5, 11.191514642799696 ], 
"wc_main_review_avg": [ 275.5, 63.30284353802758 ], "wc_review_avg": [ 374.0, 114.49890829173874 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 511.25, 169.0478852278253 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17428957616973271333&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Differentially Private Fractional Frequency Moments Estimation with Polylogarithmic Space", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6971", "id": "7I8LPkcx8V", "poster": "", "openreview": "https://openreview.net/forum?id=7I8LPkcx8V", "slides": "https://iclr.cc/virtual/2022/poster/6971", "video": "https://iclr.cc/virtual/2022/poster/6971", "author_site": "Lun Wang, Iosif Pinelis, Dawn Song", "tldr": "", "abstract": "We prove that $\\mathbb{F}_p$ sketch, a well-celebrated streaming algorithm for frequency moments estimation, is differentially private as is when $p\\in(0, 1]$. $\\mathbb{F}_p$ sketch uses only polylogarithmic space, exponentially better than existing DP baselines and only worse than the optimal non-private baseline by a logarithmic factor. The evaluation shows that $\\mathbb{F}_p$ sketch can achieve reasonable accuracy with strong privacy guarantees. 
The code for evaluation is included in the supplementary material.", "keywords": "Differential Privacy;Fractional Frequency Moments", "primary_area": "", "supplementary_material": "/attachment/a35eb16208dbfcc38520e52c573e4e5bdf9f05c6.zip", "author": "Lun Wang;Iosif Pinelis;Dawn Song", "authorids": "~Lun_Wang1;~Iosif_Pinelis1;~Dawn_Song2", "gender": ";;F", "homepage": "https://wanglun1996.github.io/;;http://people.eecs.berkeley.edu/~dawnsong/", "dblp": ";;", "google_scholar": ";https://scholar.google.com/citations?hl=en;84WzBlYAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lun_Wang1;~Iosif_Pinelis1;~Dawn_Song2", "aff": "University of California, Berkeley;Michigan Technological University;University of California, Berkeley", "aff_domain": "berkeley.edu;mtu.edu;berkeley.edu", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwang2022differentially,\ntitle={Differentially Private Fractional Frequency Moments Estimation with Polylogarithmic Space},\nauthor={Lun Wang and Iosif Pinelis and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7I8LPkcx8V}\n}", "github": "", "project": "", "reviewers": "Pbv7;P1Tg;PGRe;49oY", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "129;69;268;65", "wc_summary_review": "107;41;127;11", "wc_main_review": "448;432;638;41", "wc_review": "684;542;1033;117", "wc_reply_reviewers": "0;47;225;0", "wc_reply_authors": "538;970;1324;97", "reply_reviewers": "0;1;3;0", "reply_authors": "1;3;4;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 132.75, 82.09864493400606 ], "wc_summary_review_avg": [ 71.5, 47.25198408532704 ], "wc_main_review_avg": [ 389.75, 217.04420632672966 ], "wc_review_avg": [ 594.0, 328.27351400927853 ], "wc_reply_reviewers_avg": [ 68.0, 92.65257686648548 ], "wc_reply_authors_avg": [ 732.25, 460.4260934178253 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1896989640929672149&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=7I8LPkcx8V", "email": "berkeley.edu;mtu.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Berkeley;Michigan Technological University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.mtu.edu", "aff_unique_abbr": "UC Berkeley;MTU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Constructing a Good Behavior Basis for Transfer using Generalized Policy Updates", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6493", "id": "7IWGzQ6gZ1D", "poster": "", "openreview": "https://openreview.net/forum?id=7IWGzQ6gZ1D", "slides": "https://iclr.cc/virtual/2022/poster/6493", "video": "https://iclr.cc/virtual/2022/poster/6493", "author_site": "Safa Alver, Doina Precup", "tldr": 
"", "abstract": "We study the problem of learning a good set of policies, so that when combined together, they can solve a wide variety of unseen reinforcement learning tasks with no or very little new data. Specifically, we consider the framework of generalized policy evaluation and improvement, in which the rewards for all tasks of interest are assumed to be expressible as a linear combination of a fixed set of features. We show theoretically that, under certain assumptions, having access to a specific set of diverse policies, which we call a set of independent policies, can allow for instantaneously achieving high-level performance on all possible downstream tasks which are typically more complex than the ones on which the agent was trained. Based on this theoretical analysis, we propose a simple algorithm that iteratively constructs this set of policies. In addition to empirically validating our theoretical results, we compare our approach with recently proposed diverse policy set construction methods and show that, while others fail, our approach is able to build a behavior basis that enables instantaneous transfer to all possible downstream tasks. We also show empirically that having access to a set of independent policies can better bootstrap the learning process on downstream tasks where the new reward function cannot be described as a linear combination of the features. Finally, we demonstrate how this policy set can be useful in a lifelong reinforcement learning setting.", "keywords": "reinforcement learning;lifelong learning;transfer learning;successor features", "primary_area": "", "supplementary_material": "", "author": "Safa Alver;Doina Precup", "authorids": "~Safa_Alver1;~Doina_Precup1", "gender": ";F", "homepage": ";http://cs.mcgill.ca/~dprecup/", "dblp": "247/1013;p/DoinaPrecup", "google_scholar": ";https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ", "orcid": ";", "linkedin": "https://linkedin.com/in/safa-alver;", "or_profile": "~Safa_Alver1;~Doina_Precup1", "aff": "McGill University;McGill University", "aff_domain": "mcgill.ca;mcgill.ca", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nalver2022constructing,\ntitle={Constructing a Good Behavior Basis for Transfer using Generalized Policy Updates},\nauthor={Safa Alver and Doina Precup},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7IWGzQ6gZ1D}\n}", "github": "", "project": "", "reviewers": "8NxE;3QcK;4b5c", "pdf_size": 0, "recommendation": "6;6;10", "confidence": "3;4;4", "correctness": "4;4;4", "technical_novelty": "2;3;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "64;95;109", "wc_summary_review": "2;263;15", "wc_main_review": "157;1816;167", "wc_review": "223;2174;291", "wc_reply_reviewers": "64;664;73", "wc_reply_authors": "811;6857;230", "reply_reviewers": "1;1;1", "reply_authors": "2;12;1", "recommendation_avg": [ 7.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 89.33333333333333, 18.803073034893938 ], "wc_summary_review_avg": [ 93.33333333333333, 120.08978122869388 ], "wc_main_review_avg": [ 713.3333333333334, 779.7137651443352 ], "wc_review_avg": [ 896.0, 904.108769267651 ], "wc_reply_reviewers_avg": [ 267.0, 280.74543629416314 ], "wc_reply_authors_avg": [ 
2632.6666666666665, 2996.4572785578343 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 5.0, 4.96655480858378 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4084155692587483810&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=7IWGzQ6gZ1D", "email": "mcgill.ca;mcgill.ca", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "McGill University", "aff_unique_dep": "", "aff_unique_url": "https://www.mcgill.ca", "aff_unique_abbr": "McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Transfer RL across Observation Feature Spaces via Model-Based Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6740", "id": "7KdAoOsI81C", "poster": "", "openreview": "https://openreview.net/forum?id=7KdAoOsI81C", "slides": "https://iclr.cc/virtual/2022/poster/6740", "video": "https://iclr.cc/virtual/2022/poster/6740", "author_site": "Yanchao Sun, Ruijie Zheng, Xiyao Wang, Andrew Cohen, Furong Huang", "tldr": "", "abstract": "In many reinforcement learning (RL) applications, the observation space is specified by human developers and restricted by physical realizations, and may thus be subject to dramatic changes over time (e.g. increased number of observable features). However, when the observation space changes, the previous policy will likely fail due to the mismatch of input features, and another policy must be trained from scratch, which is inefficient in terms of computation and sample complexity. Following theoretical insights, we propose a novel algorithm which extracts the latent-space dynamics in the source task, and transfers the dynamics model to the target task to use as a model-based regularizer. Our algorithm works for drastic changes of observation space (e.g. from vector-based observation to image-based observation), without any inter-task mapping or any prior knowledge of the target task. 
Empirical results show that our algorithm significantly improves the efficiency and stability of learning in the target task.", "keywords": "transfer reinforcement learning;representation learning;observation space change;latent dynamics model", "primary_area": "", "supplementary_material": "/attachment/dccc4bf7b5ed9bbb43e86e6aa370597dd1891b6c.zip", "author": "Yanchao Sun;Ruijie Zheng;Xiyao Wang;Andrew E Cohen;Furong Huang", "authorids": "~Yanchao_Sun1;~Ruijie_Zheng1;~Xiyao_Wang1;~Andrew_E_Cohen1;~Furong_Huang1", "gender": "F;;M;M;F", "homepage": "https://ycsun2017.github.io/home/index.html;http://www.ruijiezheng.com;;;https://furong-huang.com", "dblp": "132/6840;294/8474;;;72/8513", "google_scholar": "bloBY_QAAAAJ;;puVqfbwAAAAJ;v1Frtb0AAAAJ;13yyuCcAAAAJ", "orcid": "0000-0002-1137-9939;;;;", "linkedin": ";;;;", "or_profile": "~Yanchao_Sun1;~Ruijie_Zheng1;~Xiyao_Wang1;~Andrew_E_Cohen1;~Furong_Huang1", "aff": "University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park;Unity Technologies;University of Maryland", "aff_domain": "umd.edu;cs.umd.edu;umd.edu;unity3d.com;cs.umd.edu", "position": "PhD student;Undergrad student;PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nsun2022transfer,\ntitle={Transfer {RL} across Observation Feature Spaces via Model-Based Regularization},\nauthor={Yanchao Sun and Ruijie Zheng and Xiyao Wang and Andrew E Cohen and Furong Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7KdAoOsI81C}\n}", "github": "", "project": "", "reviewers": "igV9;gWD8;RpCP;L4Qx", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "71;104;131;68", "wc_summary_review": "76;262;68;61", "wc_main_review": "691;868;571;369", "wc_review": "838;1234;770;498", "wc_reply_reviewers": "0;0;204;37", "wc_reply_authors": "2971;1827;1767;629", "reply_reviewers": "0;0;1;1", "reply_authors": "6;5;5;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 93.5, 25.85053190942113 ], "wc_summary_review_avg": [ 116.75, 84.0278971532669 ], "wc_main_review_avg": [ 624.75, 181.56042382633942 ], "wc_review_avg": [ 835.0, 263.15584736045673 ], "wc_reply_reviewers_avg": [ 60.25, 84.35749818480868 ], "wc_reply_authors_avg": [ 1798.5, 828.2950863068065 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 4.5, 1.5 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13099881676078059682&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=7KdAoOsI81C", "email": "umd.edu;cs.umd.edu;umd.edu;unity3d.com;cs.umd.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "University of Maryland;University of Maryland, College Park;Unity Technologies", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu;https://unity.com", "aff_unique_abbr": "UMD;UMD;Unity", 
"aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "7KgeqhkbZab", "title": "Contrastive Learning for Source Code with Structural and Functional Properties", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pre-trained transformer models have recently shown promises for understanding the source code. Most existing works expect to understand code from the textual features and limited structural knowledge of code. However, the program functionalities sometimes cannot be fully revealed by the code sequence, even with structure information. Programs can contain very different tokens and structures while sharing the same functionality, but changing only one or a few code tokens can introduce unexpected or malicious program behaviors while preserving the syntax and most tokens. In this work, we present BOOST, a novel self-supervised model to focus pre-training based on the characteristics of source code. We first employ automated, structure-guided code transformation algorithms that generate (i.) functionally equivalent code that looks drastically different from the original one, and (ii.) textually and syntactically very similar code that is functionally distinct from the original. We train our model in a way that brings the functionally equivalent code closer and distinct code further through a contrastive learning objective. To encode the structure information, we introduce a new node-type masked language model objective that helps the model learn about structural context. We pre-train BOOST with a much smaller dataset than the state-of-the-art models, but our small models can still match or outperform these large models in code understanding and generation tasks.", "keywords": "Source Code Modeling;Contrastive Learning;Pre-trained Models", "primary_area": "", "supplementary_material": "", "author": "Yangruibo Ding;Luca Buratti;Saikat Chakraborty;Saurabh Pujar;Alessandro Morari;Baishakhi Ray", "authorids": "~Yangruibo_Ding1;~Luca_Buratti1;~Saikat_Chakraborty1;~Saurabh_Pujar1;amorari@us.ibm.com;~Baishakhi_Ray2", "gender": ";;M;M;;F", "homepage": ";;https://saikatc.info;;;http://rayb.info/", "dblp": ";;137/5220;252/5520.html;;74/1969", "google_scholar": ";;Hl_6OwwAAAAJ;-NuBDksAAAAJ;;https://scholar.google.com.tw/citations?user=VaAEb5YAAAAJ", "orcid": ";;0000-0002-6889-7171;0000-0002-9772-3162;;", "linkedin": ";;saikatch107/;saurabh-pujar-63597040/;;", "or_profile": "~Yangruibo_Ding1;~Luca_Buratti1;~Saikat_Chakraborty1;~Saurabh_Pujar1;amorari@us.ibm.com;~Baishakhi_Ray2", "aff": ";;Columbia University;International Business Machines;;Columbia University", "aff_domain": ";;columbia.edu;ibm.com;;columbia.edu", "position": ";;PhD student;Researcher;;Assistant Professor", "bibtex": "@misc{\nding2022contrastive,\ntitle={Contrastive Learning for Source Code with Structural and Functional Properties},\nauthor={Yangruibo Ding and Luca Buratti and Saikat Chakraborty and Saurabh Pujar and Alessandro Morari and Baishakhi Ray},\nyear={2022},\nurl={https://openreview.net/forum?id=7KgeqhkbZab}\n}", "github": "", "project": "", "reviewers": "aC2u;PPxs;Gk2T;mfVa", "site": "https://openreview.net/forum?id=7KgeqhkbZab", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;5;4", "correctness": "2;3;4;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "81;68;62;149", "wc_summary_review": "58;49;18;5", "wc_main_review": "455;333;296;340", 
"wc_review": "594;450;376;494", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 90.0, 34.74910070778811 ], "wc_summary_review_avg": [ 32.5, 21.73131381210073 ], "wc_main_review_avg": [ 356.0, 59.552497848536966 ], "wc_review_avg": [ 478.5, 78.89708486376414 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8045237906759573064&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0", "aff_unique_norm": "Columbia University;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.columbia.edu;https://www.ibm.com", "aff_unique_abbr": "Columbia;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "7MLeqJrHNa", "title": "Continual Learning of Neural Networks for Realtime Wireline Cable Position Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the oil fields, Wireline cable is spooled onto a drum where computer vision techniques based on convolutional neural networks (CNNs) are applied to estimate the cable position in real time for automated spooling control. However, as new training data keeps arriving to continuously improve the network, the re-training procedure faces challenges. Online learning fashion with no memory to historical data leads to catastrophic forgetting. Meanwhile, saving all data will cause the disk space and training time to increase without bounds. In this paper, we proposed a method called the modified-REMIND (mREMIND) network. It is a replay-based continual learning method with longer memory to historical data and no memory overflow issues. Information of old data are kept for multiple iterations using a new dictionary update rule. Additionally, by dynamically partitioning the dataset, the method can be applied on devices with limited memory. 
In our experiments, we compared the proposed method with multiple state-of-the-art continual learning methods and the mREMIND network outperformed others both in accuracy and in disk space usage.", "keywords": "Continual Learning;Wireline Automation", "primary_area": "", "supplementary_material": "", "author": "Jun Wang;Tianxiang Su", "authorids": "~Jun_Wang27;tsu@slb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Jun_Wang27;tsu@slb.com", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwang2022continual,\ntitle={Continual Learning of Neural Networks for Realtime Wireline Cable Position Inference},\nauthor={Jun Wang and Tianxiang Su},\nyear={2022},\nurl={https://openreview.net/forum?id=7MLeqJrHNa}\n}", "github": "", "project": "", "reviewers": "kEWr;MCzH;cXMu;jrD1", "site": "https://openreview.net/forum?id=7MLeqJrHNa", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;4;5;4", "correctness": "1;2;4;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "1;2;1;3", "wc_summary_paper": "61;51;49;73", "wc_summary_review": "17;5;20;67", "wc_main_review": "481;107;102;175", "wc_review": "559;163;171;315", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 58.5, 9.526279441628825 ], "wc_summary_review_avg": [ 27.25, 23.62599204266352 ], "wc_main_review_avg": [ 216.25, 155.54963034350163 ], "wc_review_avg": [ 302.0, 160.23420358962065 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7071067811865476, "corr_recommendation_correctness": 0.6324555320336759, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8lYLkKoczZ0J:scholar.google.com/&scioq=Continual+Learning+of+Neural+Networks+for+Realtime+Wireline+Cable+Position+Inference&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Conditional Image Generation by Conditioning Variational Auto-Encoders", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6145", "id": "7MV6uLzOChW", "poster": "", "openreview": "https://openreview.net/forum?id=7MV6uLzOChW", "slides": "https://iclr.cc/virtual/2022/poster/6145", "video": "https://iclr.cc/virtual/2022/poster/6145", "author_site": "William Harvey, Saeid Naderiparizi, Frank Wood", "tldr": "", "abstract": "We present a conditional variational auto-encoder (VAE) which, to avoid the substantial cost of training from scratch, uses an architecture and training objective capable of leveraging a foundation model in the form of a pretrained unconditional VAE. To train the conditional VAE, we only need to train an artifact to perform amortized inference over the unconditional VAE's latent variables given a conditioning input. We demonstrate our approach on tasks including image inpainting, for which it outperforms state-of-the-art GAN-based approaches at faithfully representing the inherent uncertainty. 
We conclude by describing a possible application of our inpainting model, in which it is used to perform Bayesian experimental design for the purpose of guiding a sensor.", "keywords": "variational auto-encoders;Bayesian inference;variational inference;amortized inference;image completion", "primary_area": "", "supplementary_material": "/attachment/69ea66a9ba7d75f74d81259485691d1cae872a28.zip", "author": "William Harvey;Saeid Naderiparizi;Frank Wood", "authorids": "~William_Harvey1;~Saeid_Naderiparizi1;~Frank_Wood2", "gender": "M;M;M", "homepage": "https://www.cs.ubc.ca/~wsgh/;https://www.cs.ubc.ca/~saeidnp/;http://www.robots.ox.ac.uk/~fwood/", "dblp": "26/8210-2;244/9611;44/4750", "google_scholar": "https://scholar.google.co.uk/citations?user=kDd7nBkAAAAJ;Ubt0dYYAAAAJ;d4yNzXIAAAAJ", "orcid": ";;", "linkedin": ";saeidnp;frank-wood-43529114?trk=hp-identity-name", "or_profile": "~William_Harvey1;~Saeid_Naderiparizi1;~Frank_Wood2", "aff": "University of British Columbia;University of British Columbia;University of British Columbia", "aff_domain": "cs.ubc.ca;ubc.ca;ubc.ca", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nharvey2022conditional,\ntitle={Conditional Image Generation by Conditioning Variational Auto-Encoders},\nauthor={William Harvey and Saeid Naderiparizi and Frank Wood},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7MV6uLzOChW}\n}", "github": "", "project": "", "reviewers": "gc2C;EhZn;Fk3K;67T5", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;5", "correctness": "4;3;4;3", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "74;48;95;59", "wc_summary_review": "78;58;95;40", "wc_main_review": "170;231;172;430", "wc_review": "322;337;362;529", "wc_reply_reviewers": "0;0;0;235", "wc_reply_authors": "763;312;429;1941", "reply_reviewers": "0;0;0;2", "reply_authors": "2;1;1;4", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 69.0, 17.621010186706094 ], "wc_summary_review_avg": [ 67.75, 20.69269194667528 ], "wc_main_review_avg": [ 250.75, 106.35171601812544 ], "wc_review_avg": [ 387.5, 82.93521568067452 ], "wc_reply_reviewers_avg": [ 58.75, 101.75798494467153 ], "wc_reply_authors_avg": [ 861.25, 644.9861917126599 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2137944836024750208&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=7MV6uLzOChW", "email": "cs.ubc.ca;ubc.ca;ubc.ca", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of British Columbia", "aff_unique_dep": "", "aff_unique_url": "https://www.ubc.ca", "aff_unique_abbr": "UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "7N-6ZLyFUXz", "title": "Thompson Sampling for (Combinatorial) Pure Exploration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pure exploration plays an important role in 
online learning. Existing work mainly focuses on the UCB approach that uses confidence bounds of all the arms to decide which one is optimal. However, the UCB approach faces some challenges when looking for the best arm set under some specific combinatorial structures. It uses the sum of upper confidence bounds within arm set $S$ to judge whether $S$ is optimal. This sum can be much larger than the exact upper confidence bound of $S$, since the empirical means of different arms in $S$ are independent. Because of this, the UCB approach requires much higher complexity than necessary. To deal with this challenge, we explore the idea of Thompson Sampling (TS) that uses independent random samples instead of the upper confidence bounds to make decisions, and design the first TS-based algorithm framework TS-Verify for (combinatorial) pure exploration. In TS-Verify, the sum of independent random samples within arm set $S$ will not exceed the exact upper confidence bound of $S$ with high probability. Hence it solves the above challenge, and behaves better than existing UCB-based algorithms under the general combinatorial pure exploration setting. As for pure exploration of the classic multi-armed bandit, we show that TS-Verify achieves an asymptotically optimal complexity upper bound.", "keywords": "pure exploration;(combinatorial) multi-armed bandit;Thompson Sampling", "primary_area": "", "supplementary_material": "", "author": "Siwei Wang;Jun Zhu", "authorids": "~Siwei_Wang2;~Jun_Zhu2", "gender": "M;M", "homepage": "https://www.microsoft.com/en-us/research/people/siweiwang/publications/;http://ml.cs.tsinghua.edu.cn/~jun", "dblp": "51/8279-2;50/2644-1", "google_scholar": ";axsP38wAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Siwei_Wang2;~Jun_Zhu2", "aff": "Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mail.tsinghua.edu.cn", "position": "Postdoc;Professor", "bibtex": "@misc{\nwang2022thompson,\ntitle={Thompson Sampling for (Combinatorial) Pure Exploration},\nauthor={Siwei Wang and Jun Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=7N-6ZLyFUXz}\n}", "github": "", "project": "", "reviewers": "6x9v;YozJ;TkoG;MaqF", "site": "https://openreview.net/forum?id=7N-6ZLyFUXz", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;2;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "0;2;0;0", "wc_summary_paper": "67;230;186;166", "wc_summary_review": "46;11;131;47", "wc_main_review": "235;133;708;503", "wc_review": "348;374;1025;716", "wc_reply_reviewers": "0;0;157;0", "wc_reply_authors": "0;0;116;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;1;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 0.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 162.25, 59.66730679358672 ], "wc_summary_review_avg": [ 58.75, 44.16092730004659 ], "wc_main_review_avg": [ 394.75, 225.7635654838929 ], "wc_review_avg": [ 615.75, 277.3394806009415 ], "wc_reply_reviewers_avg": [ 39.25, 67.98299419707844 ], "wc_reply_authors_avg": [ 29.0, 50.22947341949744 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 7, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=1750323348048628899&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "7QDPaL-Yl8U", "title": "LPRules: Rule Induction in Knowledge Graphs Using Linear Programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "Knowledge graph (KG) completion is a well-studied problem in AI. Rule-based methods and embedding-based methods form two of the solution techniques. Rule-based methods learn first-order logic rules that capture existing facts in an input graph and then use these rules for reasoning about missing facts. A major drawback of such methods is the lack of scalability to large datasets. In this paper, we present a simple linear programming (LP) model to choose rules from a list of candidate rules and assign weights to them. For smaller KGs, we use simple heuristics to create the candidate list. For larger KGs, we start with a small initial candidate list, and then use standard column generation ideas to add more rules in order to improve the LP model objective value. To foster interpretability and generalizability, we limit the complexity of the set of chosen rules via explicit constraints, and tune the complexity hyperparameter for individual datasets. We show that our method can obtain state-of-the-art results for three out of four widely used KG datasets, while taking significantly less computing time than other popular rule learners including some based on neuro-symbolic methods. The improved scalability of our method allows us to tackle large datasets such as YAGO3-10.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sanjeeb Dash;Joao Goncalves", "authorids": "~Sanjeeb_Dash1;~Joao_Goncalves1", "gender": ";", "homepage": "https://researcher.watson.ibm.com/researcher/view.php?person=us-sanjeebd;https://ibm.com", "dblp": "09/294;", "google_scholar": "NJV8UUoAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Sanjeeb_Dash1;~Joao_Goncalves1", "aff": ";International Business Machines", "aff_domain": ";ibm.com", "position": ";Engineer", "bibtex": "@misc{\ndash2022lprules,\ntitle={{LPR}ules: Rule Induction in Knowledge Graphs Using Linear Programming},\nauthor={Sanjeeb Dash and Joao Goncalves},\nyear={2022},\nurl={https://openreview.net/forum?id=7QDPaL-Yl8U}\n}", "github": "", "project": "", "reviewers": "wtBS;PFXs;N4fv;3BGR", "site": "https://openreview.net/forum?id=7QDPaL-Yl8U", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "5;3;5;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "26;49;52;93", "wc_summary_review": "33;30;74;11", "wc_main_review": "331;221;71;172", "wc_review": "390;300;197;276", "wc_reply_reviewers": "967;0;0;206", "wc_reply_authors": "2254;0;88;228", "reply_reviewers": "2;0;0;1", "reply_authors": "3;0;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 55.0, 24.13503677229434 ], "wc_summary_review_avg": [ 37.0, 22.967368155711704 ], "wc_main_review_avg": [ 198.75, 93.56915891467658 ], "wc_review_avg": [ 290.75, 68.81633163719205 ], 
"wc_reply_reviewers_avg": [ 293.25, 397.97699368179565 ], "wc_reply_authors_avg": [ 642.5, 933.9457960716992 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 1.0897247358851685 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17744263279077883503&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "International Business Machines Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com", "aff_unique_abbr": "IBM", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Energy-Inspired Molecular Conformation Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6692", "id": "7QfLW-XZTl", "poster": "", "openreview": "https://openreview.net/forum?id=7QfLW-XZTl", "slides": "https://iclr.cc/virtual/2022/poster/6692", "video": "https://iclr.cc/virtual/2022/poster/6692", "author_site": "Jiaqi Guan, Wei Qian, Qiang Liu, Wei-Ying Ma, Jianzhu Ma, Jian Peng", "tldr": "", "abstract": "This paper studies an important problem in computational chemistry: predicting a molecule's spatial atom arrangements, or a molecular conformation. We propose a neural energy minimization formulation that casts the prediction problem into an unrolled optimization process, where a neural network is parametrized to learn the gradient fields of an implicit conformational energy landscape. Assuming different forms of the underlying potential energy function, we can not only reinterpret and unify many of the existing models but also derive new variants of SE(3)-equivariant neural networks in a principled manner. In our experiments, these new variants show superior performance in molecular conformation optimization comparing to existing SE(3)-equivariant neural networks. 
Moreover, our energy-inspired formulation is also suitable for molecular conformation generation, where we can generate more diverse and accurate conformers compared to existing baselines.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c711b93785348fbed6d765b5dda99d8211395e85.zip", "author": "Jiaqi Guan;Wesley Wei Qian;qiang liu;Wei-Ying Ma;Jianzhu Ma;Jian Peng", "authorids": "~Jiaqi_Guan1;~Wesley_Wei_Qian1;~qiang_liu4;~Wei-Ying_Ma2;~Jianzhu_Ma2;~Jian_Peng1", "gender": "M;M;M;M;M;M", "homepage": "http://jiaqi.web.illinois.edu/;https://DrQ.ai/;https://air.tsinghua.edu.cn/en/info/1046/1189.htm;https://majianzhu.com/;http://jianpeng.web.engr.illinois.edu/;https://www.cs.utexas.edu/~lqiang/", "dblp": "207/7593;263/2040;m/WYMa.html;24/9080.html;29/4181-1;61/3234-1", "google_scholar": "On-ONT4AAAAJ;SZ5EErcAAAAJ;SToCbu8AAAAJ;;https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ;https://scholar.google.com.tw/citations?user=2qDh4WUAAAAJ", "orcid": ";0000-0003-0726-575X;;;;", "linkedin": ";wesleychin0919;wei-ying-ma-16a0171/;;;", "or_profile": "~Jiaqi_Guan1;~Wesley_Wei_Qian1;~Wei-Ying_Ma2;~Jianzhu_Ma2;~Jian_Peng1;~Qiang_Liu1", "aff": "Bytedance AI Lab;University of Illinois, Urbana Champaign;Tsinghua University;Peking University;University of Illinois, Urbana Champaign;University of Texas, Austin", "aff_domain": "bytedance.com;illinois.edu;tsinghua.edu.cn;pku.edu.cn;illinois.edu;utexas.edu", "position": "Intern;PhD student;Full Professor;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nguan2022energyinspired,\ntitle={Energy-Inspired Molecular Conformation Optimization},\nauthor={Jiaqi Guan and Wesley Wei Qian and qiang liu and Wei-Ying Ma and Jianzhu Ma and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7QfLW-XZTl}\n}", "github": "", "project": "", "reviewers": "nNjh;h2Mm;83EH;jLPL", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;4;3", "correctness": "3;3;2;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "0;3;3;2", "wc_summary_paper": "32;80;14;160", "wc_summary_review": "11;51;119;262", "wc_main_review": "422;292;850;614", "wc_review": "465;423;983;1036", "wc_reply_reviewers": "315;128;517;254", "wc_reply_authors": "2089;867;1721;1425", "reply_reviewers": "3;1;2;2", "reply_authors": "6;3;7;4", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 71.5, 56.50442460551209 ], "wc_summary_review_avg": [ 110.75, 95.47872799739217 ], "wc_main_review_avg": [ 544.5, 210.3110791185286 ], "wc_review_avg": [ 726.75, 283.7590306932979 ], "wc_reply_reviewers_avg": [ 303.5, 140.50355867379304 ], "wc_reply_authors_avg": [ 1525.5, 447.0668294561787 ], "reply_reviewers_avg": [ 2.0, 0.7071067811865476 ], "reply_authors_avg": [ 5.0, 1.5811388300841898 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.39605901719066966, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14935852302007314627&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=7QfLW-XZTl", "email": "bytedance.com;illinois.edu;tsinghua.edu.cn;pku.edu.cn;illinois.edu;utexas.edu", "author_num": 6,
"aff_unique_index": "0;1;2;3;1;4", "aff_unique_norm": "ByteDance;University of Illinois Urbana-Champaign;Tsinghua University;Peking University;University of Texas at Austin", "aff_unique_dep": "AI Lab;;;;", "aff_unique_url": "https://www.bytedance.com;https://illinois.edu;https://www.tsinghua.edu.cn;http://www.pku.edu.cn;https://www.utexas.edu", "aff_unique_abbr": "Bytedance AI Lab;UIUC;THU;Peking U;UT Austin", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Urbana-Champaign;Austin", "aff_country_unique_index": "0;1;0;0;1;1", "aff_country_unique": "China;United States" }, { "id": "7Rnf1F7rQhR", "title": "Best Practices in Pool-based Active Learning for Image Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "The recent popularity of active learning (AL) methods for image classification using deep-learning has led to a large number of publications that lead to significant progress in the field. Benchmarking the latest works in an exhaustive and unified way and evaluating the improvements made by the novel methods is of key importance to advance the research in AL. Reproducing state-of-the-art AL methods is often cumbersome, since the results and the ranking order of different strategies are highly dependent on several factors, such as training settings, used data type, network architectures, loss function and more. With our work we highlight the main factors that should be considered when proposing new AL strategies. In addition, we provide solid benchmarks to compare new with existing methods. We therefore conduct a comprehensive study on the influence of these key aspects, providing best practices in pool-based AL for image classification. We emphasize aspects such as the importance of using data augmentation, the need of separating the contribution of a classification network and the acquisition strategy to the overall performance, the advantages that a proper initialization of the network can bring to AL. 
Moreover, we make a new codebase available that enables state-of-the-art performance for the investigated methods, which we hope will serve the AL community as a new starting point when proposing new AL strategies.", "keywords": "Active Learning;Deep Learning;Image classification", "primary_area": "", "supplementary_material": "", "author": "Adrian Lang;Christoph Mayer;Radu Timofte", "authorids": "~Adrian_Lang2;~Christoph_Mayer1;~Radu_Timofte1", "gender": "M;;M", "homepage": ";https://2006pmach.github.io/;https://www.informatik.uni-wuerzburg.de/computervision/", "dblp": ";163/0032-7;24/8616", "google_scholar": ";wgt4-t0AAAAJ;https://scholar.google.ch/citations?user=u3MwH5kAAAAJ", "orcid": ";;0000-0002-1478-0402", "linkedin": "adrian-lang-2a8486187/;;https://ch.linkedin.com/in/radutimofte", "or_profile": "~Adrian_Lang2;~Christoph_Mayer1;~Radu_Timofte1", "aff": ";Swiss Federal Institute of Technology;ETH Zurich", "aff_domain": ";ethz.ch;vision.ee.ethz.ch", "position": ";PhD student;Group Leader & Lecturer", "bibtex": "@misc{\nlang2022best,\ntitle={Best Practices in Pool-based Active Learning for Image Classification},\nauthor={Adrian Lang and Christoph Mayer and Radu Timofte},\nyear={2022},\nurl={https://openreview.net/forum?id=7Rnf1F7rQhR}\n}", "github": "", "project": "", "reviewers": "k1Y4;ow4N;3Uts;Zkvo", "site": "https://openreview.net/forum?id=7Rnf1F7rQhR", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;5;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;1;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "38;67;44;106", "wc_summary_review": "48;38;45;118", "wc_main_review": "186;348;118;255", "wc_review": "272;453;207;479", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "607;686;332;177", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.75, 26.686841326766267 ], "wc_summary_review_avg": [ 62.25, 32.39116391857508 ], "wc_main_review_avg": [ 226.75, 85.12747793750265 ], "wc_review_avg": [ 352.75, 115.9231965570308 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 450.5, 205.42456036219232 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3458572319330373, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14222628859294989780&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "7TFcl1Xkr7", "title": "Interactive Model with Structural Loss for Language-based Abductive Reasoning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The abductive natural language inference task ($\alpha$NLI) is proposed to infer the most plausible explanation between the cause and the event. 
In the $\\alpha$NLI task, two observations are given, and the most plausible hypothesis is asked to pick out from the candidates. Existing methods model the relation between each candidate hypothesis separately and penalize the inference network uniformly. In this paper, we argue that it is unnecessary to distinguish the reasoning abilities among correct hypotheses; and similarly, all wrong hypotheses contribute the same when explaining the reasons of the observations. Therefore, we propose to group instead of ranking the hypotheses and design a structural loss called \"joint softmax focal loss\" in this paper. Based on the observation that the hypotheses are generally semantically related, we have designed a novel interactive language model aiming at exploiting the rich interaction among competing hypotheses. We name this new model for $\\alpha$NLI: Interactive Model with Structural Loss (IMSL). The experimental results show that our IMSL has achieved the highest performance on the RoBERTa-large pretrained model, with ACC and AUC results increased by about 1% and 5% respectively.", "keywords": "abductive natural language;abductive reasoning;BiLSTM;joint loss function.", "primary_area": "", "supplementary_material": "", "author": "Linhao Li;Ming Xu;Yongfeng Dong;Xin Li;Jianhua Tao;Qinghua Hu", "authorids": "~Linhao_Li1;~Ming_Xu4;dongyf@hebut.edu.cn;~Xin_Li2;~Jianhua_Tao2;~Qinghua_Hu1", "gender": "M;;;M;M;M", "homepage": ";;;;http://people.ucas.ac.cn/~0001573?language=en;http://cic.tju.edu.cn/faculty/huqinghua/index.html", "dblp": ";;;09/1365-5;;", "google_scholar": ";;;gMBvzGoAAAAJ;;TVSNq_wAAAAJ", "orcid": ";;;0000-0003-2067-2763;;0000-0001-7765-8095", "linkedin": ";;;;;", "or_profile": "~Linhao_Li1;~Ming_Xu4;dongyf@hebut.edu.cn;~Xin_Li2;~Jianhua_Tao2;~Qinghua_Hu1", "aff": "Hebei University of Technolog;;;West Virginia University;, Institute of automation, Chinese academy of science;Tianjin University", "aff_domain": "hebut.edu.cn;;;wvu.edu;nlpr.ia.ac.cn;tju.edu.cn", "position": "Assistant Professor;;;Full Professor;Full Professor;Professor", "bibtex": "@misc{\nli2022interactive,\ntitle={Interactive Model with Structural Loss for Language-based Abductive Reasoning},\nauthor={Linhao Li and Ming Xu and Yongfeng Dong and Xin Li and Jianhua Tao and Qinghua Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=7TFcl1Xkr7}\n}", "github": "", "project": "", "reviewers": "W1Sp;3Vwo;imYG;hk5y;cRzc", "site": "https://openreview.net/forum?id=7TFcl1Xkr7", "pdf_size": 0, "recommendation": "3;3;3;3;3", "confidence": "3;4;4;3;4", "correctness": "2;2;3;2;3", "technical_novelty": "1;2;2;3;2", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "64;59;182;188;109", "wc_summary_review": "22;63;52;102;157", "wc_main_review": "198;268;324;802;792", "wc_review": "284;390;558;1092;1058", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 120.4, 55.57913277481037 ], "wc_summary_review_avg": [ 79.2, 46.576388868180835 ], "wc_main_review_avg": [ 476.8, 264.4922683179983 ], "wc_review_avg": [ 676.4, 337.1537334807373 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], 
"authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14079666666373929754&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Hebei University of Technology;West Virginia University;Chinese Academy of Sciences;Tianjin University", "aff_unique_dep": ";;Institute of Automation;", "aff_unique_url": "http://www.hbut.edu.cn;https://www.wvu.edu;http://www.ia.cas.cn;http://www.tju.edu.cn", "aff_unique_abbr": "HUT;WVU;CAS;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Collapse by Conditioning: Training Class-conditional GANs with Limited Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6764", "id": "7TZeCsNOUB_", "poster": "", "openreview": "https://openreview.net/forum?id=7TZeCsNOUB_", "slides": "https://iclr.cc/virtual/2022/poster/6764", "video": "https://iclr.cc/virtual/2022/poster/6764", "author_site": "Mohamad Shahbazi, Martin Danelljan, Danda Paudel, Luc Van Gool", "tldr": "", "abstract": "Class-conditioning offers a direct means to control a Generative Adversarial Network (GAN) based on a discrete input variable. While necessary in many applications, the additional information provided by the class labels could even be expected to benefit the training of the GAN itself. On the contrary, we observe that class-conditioning causes mode collapse in limited data settings, where unconditional learning leads to satisfactory generative ability. Motivated by this observation, we propose a training strategy for class-conditional GANs (cGANs) that effectively prevents the observed mode-collapse by leveraging unconditional learning. Our training strategy starts with an unconditional GAN and gradually injects the class conditioning into the generator and the objective function. The proposed method for training cGANs with limited data results not only in stable training but also in generating high-quality images, thanks to the early-stage exploitation of the shared information across classes. We analyze the observed mode collapse problem in comprehensive experiments on four datasets. Our approach demonstrates outstanding results compared with state-of-the-art methods and established baselines. 
The code is available at https://github.com/mshahbazi72/transitional-cGAN", "keywords": "Generative Adversarial Network;GAN;Conditional GAN;limited data", "primary_area": "", "supplementary_material": "", "author": "Mohamad Shahbazi;Martin Danelljan;Danda Pani Paudel;Luc Van Gool", "authorids": "~Mohamad_Shahbazi1;~Martin_Danelljan4;~Danda_Pani_Paudel1;~Luc_Van_Gool1", "gender": ";M;M;", "homepage": ";https://martin-danelljan.github.io/;https://people.ee.ethz.ch/~paudeld/;", "dblp": ";151/8848;;61/5017", "google_scholar": ";NCSSpMkAAAAJ;https://scholar.google.ch/citations?user=W43pvPkAAAAJ;https://scholar.google.be/citations?user=TwMib_QAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Mohamad_Shahbazi1;~Martin_Danelljan4;~Danda_Pani_Paudel1;~Luc_Van_Gool1", "aff": ";ETH Zurich;ETHZ - ETH Zurich;KU Leuven", "aff_domain": ";vision.ee.ethz.ch;ethz.ch;kuleuven.be", "position": ";Principal Researcher;Lecturer;Emeritus", "bibtex": "@inproceedings{\nshahbazi2022collapse,\ntitle={Collapse by Conditioning: Training Class-conditional {GAN}s with Limited Data},\nauthor={Mohamad Shahbazi and Martin Danelljan and Danda Pani Paudel and Luc Van Gool},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7TZeCsNOUB_}\n}", "github": "", "project": "", "reviewers": "tZA6;juAa;iwi9;F98y", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;2;4;2", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "64;77;56;203", "wc_summary_review": "45;47;38;60", "wc_main_review": "306;203;440;317", "wc_review": "415;327;534;580", "wc_reply_reviewers": "13;362;49;22", "wc_reply_authors": "785;1057;899;491", "reply_reviewers": "1;2;1;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.0, 59.93746741396403 ], "wc_summary_review_avg": [ 47.5, 7.952986860293433 ], "wc_main_review_avg": [ 316.5, 84.0312441892895 ], "wc_review_avg": [ 464.0, 99.40573424103863 ], "wc_reply_reviewers_avg": [ 111.5, 145.23171141317587 ], "wc_reply_authors_avg": [ 808.0, 206.94202086574876 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2177449249574403992&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=7TZeCsNOUB_", "email": ";vision.ee.ethz.ch;ethz.ch;kuleuven.be", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "ETH Zurich;Katholieke Universiteit Leuven", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;Belgium" }, { "id": "7U-rmW7TPHM", "title": "EfficientPhys: Enabling Simple, Fast, and Accurate Camera-Based Vitals Measurement", "track": "main", "status": "Reject", "tldr": "", "abstract": "Camera-based physiological measurement is a growing field with neural models providing state-of-the-art performance.
Prior research has explored various \"end-to-end'' models; however, these methods still require several preprocessing steps. These additional operations are often non-trivial to implement, making replication and deployment difficult, and can even have a higher computational budget than the \"core'' network itself. In this paper, we propose two novel and efficient neural models for camera-based physiological measurement called EfficientPhys that remove the need for face detection, segmentation, normalization, color space transformation, or any other preprocessing steps. Using an input of raw video frames, our models achieve state-of-the-art accuracy on three public datasets. We show that this is the case whether using a transformer or convolutional backbone. We further evaluate the latency of the proposed networks and show that our most lightweight network also achieves a 33\\% improvement in efficiency.\n", "keywords": "Computer Vision;Healthcare;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/250178cd031656628978d3d673f1eee3b3661ce8.zip", "author": "Xin Liu;Brian L. Hill;Ziheng Jiang;Shwetak Patel;Daniel McDuff", "authorids": "~Xin_Liu8;~Brian_L._Hill1;~Ziheng_Jiang1;~Shwetak_Patel1;~Daniel_McDuff1", "gender": "M;M;;M;M", "homepage": "https://homes.cs.washington.edu/~xliu0/;https://www.brianlhill.info;http://www.ziheng.org/;http://abstract.cs.washington.edu/~shwetak/;http://alumni.media.mit.edu/~djmcduff/", "dblp": "76/1820-61;;14/8980;p/ShwetakNPatel;63/9606", "google_scholar": "p9F83HoAAAAJ;UnyYursAAAAJ;tuRCeekAAAAJ;https://scholar.google.com.tw/citations?user=z4S5rC0AAAAJ;m7Jr-b4AAAAJ", "orcid": ";0000-0002-6881-5770;;;", "linkedin": ";brianhill11/;;;", "or_profile": "~Xin_Liu8;~Brian_L._Hill1;~Ziheng_Jiang1;~Shwetak_Patel1;~Daniel_McDuff1", "aff": "Department of Computer Science, University of Washington;;ByteDance;University of Washington;Microsoft", "aff_domain": "cs.washington.edu;;bytedance.com;u.washington.edu;microsoft.com", "position": "PhD student;;Research Scientist;Full Professor;Principal Researcher", "bibtex": "@misc{\nliu2022efficientphys,\ntitle={EfficientPhys: Enabling Simple, Fast, and Accurate Camera-Based Vitals Measurement},\nauthor={Xin Liu and Brian L.
Hill and Ziheng Jiang and Shwetak Patel and Daniel McDuff},\nyear={2022},\nurl={https://openreview.net/forum?id=7U-rmW7TPHM}\n}", "github": "", "project": "", "reviewers": "kqwX;B8NS;anoh;S4mf", "site": "https://openreview.net/forum?id=7U-rmW7TPHM", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "64;91;145;108", "wc_summary_review": "32;54;178;130", "wc_main_review": "168;811;1629;532", "wc_review": "264;956;1952;770", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.0, 29.368350311176826 ], "wc_summary_review_avg": [ 98.5, 58.555529200921754 ], "wc_main_review_avg": [ 785.0, 537.984665208963 ], "wc_review_avg": [ 985.5, 612.7795280523004 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.49374193110101877, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6717455567973223289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Washington;ByteDance;Microsoft", "aff_unique_dep": "Department of Computer Science;;Microsoft Corporation", "aff_unique_url": "https://www.washington.edu;https://www.bytedance.com;https://www.microsoft.com", "aff_unique_abbr": "UW;ByteDance;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Understanding over-squashing and bottlenecks on graphs via curvature", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6849", "id": "7UmjRGzp-A", "poster": "", "openreview": "https://openreview.net/forum?id=7UmjRGzp-A", "slides": "https://iclr.cc/virtual/2022/poster/6849", "video": "https://iclr.cc/virtual/2022/poster/6849", "author_site": "Jake Topping, Francesco Di Giovanni, Benjamin Chamberlain, Xiaowen Dong, Michael Bronstein", "tldr": "", "abstract": "Most graph neural networks (GNNs) use the message passing paradigm, in which node features are propagated on the input graph. Recent works pointed to the distortion of information flowing from distant nodes as a factor limiting the efficiency of message passing for tasks relying on long-distance interactions. This phenomenon, referred to as 'over-squashing', has been heuristically attributed to graph bottlenecks where the number of $k$-hop neighbors grows rapidly with $k$. We provide a precise description of the over-squashing phenomenon in GNNs and analyze how it arises from bottlenecks in the graph. For this purpose, we introduce a new edge-based combinatorial curvature and prove that negatively curved edges are responsible for the over-squashing issue. 
We also propose and experimentally test a curvature-based graph rewiring method to alleviate the over-squashing.", "keywords": "Graph neural networks;Geometric deep learning;Differential geometry;Ricci curvature", "primary_area": "", "supplementary_material": "", "author": "Jake Topping;Francesco Di Giovanni;Benjamin Paul Chamberlain;Xiaowen Dong;Michael M. Bronstein", "authorids": "~Jake_Topping1;~Francesco_Di_Giovanni1;~Benjamin_Paul_Chamberlain1;~Xiaowen_Dong1;~Michael_M._Bronstein1", "gender": "M;M;M;;M", "homepage": ";https://francescodgv.github.io/;;https://web.media.mit.edu/~xdong/;http://www.inf.usi.ch/bronstein/", "dblp": ";;;91/9827-1;07/2668", "google_scholar": "https://scholar.google.com/citations?hl=en;yzjjeqsAAAAJ;https://scholar.google.co.uk/citations?user=Tr8LSOEAAAAJ;_8tUq8kAAAAJ;UU3N6-UAAAAJ", "orcid": ";;;;", "linkedin": "jake-topping/;;;;mbronstein/", "or_profile": "~Jake_Topping1;~Francesco_Di_Giovanni1;~Benjamin_Paul_Chamberlain1;~Xiaowen_Dong1;~Michael_M._Bronstein1", "aff": "University of Oxford;Twitter;Twitter;Massachusetts Institute of Technology;Twitter", "aff_domain": "ox.ac.uk;twitter.com;twitter.com;mit.edu;twitter.com", "position": "PhD student;Postdoc;ML Researcher;Research Affiliate;Head of Graph ML", "bibtex": "@inproceedings{\ntopping2022understanding,\ntitle={Understanding over-squashing and bottlenecks on graphs via curvature},\nauthor={Jake Topping and Francesco Di Giovanni and Benjamin Paul Chamberlain and Xiaowen Dong and Michael M. Bronstein},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7UmjRGzp-A}\n}", "github": "", "project": "", "reviewers": "LDyA;idLR;Za5a;kUJT", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "3;3;4;4", "correctness": "4;3;4;4", "technical_novelty": "4;3;4;4", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "364;51;45;101", "wc_summary_review": "37;59;23;119", "wc_main_review": "116;141;347;1308", "wc_review": "517;251;415;1528", "wc_reply_reviewers": "8;0;52;210", "wc_reply_authors": "306;761;798;2186", "reply_reviewers": "1;0;1;2", "reply_authors": "2;1;2;5", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 140.25, 130.9988072464784 ], "wc_summary_review_avg": [ 59.5, 36.670833096617805 ], "wc_main_review_avg": [ 478.0, 487.512563940664 ], "wc_review_avg": [ 677.75, 499.97968708738557 ], "wc_reply_reviewers_avg": [ 67.5, 84.62121483410647 ], "wc_reply_authors_avg": [ 1012.75, 704.5400538649311 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 584, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13989740203838615686&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=7UmjRGzp-A", "email": "ox.ac.uk;twitter.com;twitter.com;mit.edu;twitter.com", "author_num": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "University of Oxford;Twitter, Inc.;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://twitter.com;https://web.mit.edu", "aff_unique_abbr": "Oxford;Twitter;MIT", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "7VH_ZMpwZXa", "title": "No Shifted Augmentations (NSA): strong baselines for self-supervised Anomaly Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Unsupervised Anomaly detection (AD) requires building a notion of normalcy, distinguishing in-distribution (ID) and out-of-distribution (OOD) data, using only available ID samples. Recently, large gains were made on this task for the domain of natural images using self-supervised contrastive feature learning as a first step followed by kNN or traditional one-class classifiers for feature scoring.\nLearned representations that are non-uniformly distributed on the unit hypersphere have been shown to be beneficial for this task. We go a step further and investigate how the \\emph {geometrical compactness} of the ID feature distribution makes isolating and detecting outliers easier, especially in the realistic situation when ID training data is polluted (i.e. ID data contains some OOD data that is used for learning the feature extractor parameters).\n\nWe propose novel architectural modifications to the self-supervised feature learning step, that enable such compact ID distributions to be learned. We show that the proposed modifications can be effectively applied to most existing self-supervised learning objectives with large gains in performance. Furthermore, this improved OOD performance is obtained without resorting to tricks such as using strongly augmented ID images (e.g. by 90 degree rotations) as proxies for the unseen OOD data, which imposes overly prescriptive assumptions about ID data and its invariances.\n\nWe perform extensive studies on benchmark datasets for one-class OOD detection and show state-of-the-art performance in the presence of pollution in the ID data, and comparable performance otherwise. We also propose and extensively evaluate a novel feature scoring technique based on the angular Mahalanobis distance, and propose a simple and novel technique for feature ensembling during evaluation that enables a big boost in performance at nearly zero run-time cost compared to the standard use of model ensembling or test time augmentations. 
Code for all models and experiments will be made open-source.", "keywords": "self-supervised learning;anomaly detection", "primary_area": "", "supplementary_material": "", "author": "Mohamed Yousef;Tom Bishop;Unmesh Kurup", "authorids": "~Mohamed_Yousef1;~Tom_Bishop1;~Unmesh_Kurup1", "gender": "M;M;", "homepage": "https://sites.google.com/site/mybmcv/;;", "dblp": ";https://dblp.org/pers/hd/b/Bishop:Tom_E=;53/4178", "google_scholar": ";https://scholar.google.com/citations?authuser=1;", "orcid": ";;", "linkedin": ";tom-bishop-97b2b125/;", "or_profile": "~Mohamed_Yousef1;~Tom_Bishop1;~Unmesh_Kurup1", "aff": ";Intuition Machines, Inc.;", "aff_domain": ";imachines.com;", "position": ";Director of ML Research;", "bibtex": "@misc{\nyousef2022no,\ntitle={No Shifted Augmentations ({NSA}): strong baselines for self-supervised Anomaly Detection},\nauthor={Mohamed Yousef and Tom Bishop and Unmesh Kurup},\nyear={2022},\nurl={https://openreview.net/forum?id=7VH_ZMpwZXa}\n}", "github": "", "project": "", "reviewers": "ft5T;UoZi;p72E;9Mbn", "site": "https://openreview.net/forum?id=7VH_ZMpwZXa", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;4;3;3", "correctness": "3;4;4;3", "technical_novelty": "2;2;4;2", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "117;182;77;69", "wc_summary_review": "52;17;52;64", "wc_main_review": "322;802;168;302", "wc_review": "491;1001;297;435", "wc_reply_reviewers": "83;0;0;47", "wc_reply_authors": "947;722;73;812", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 111.25, 44.712274601053345 ], "wc_summary_review_avg": [ 46.25, 17.583728273605686 ], "wc_main_review_avg": [ 398.5, 240.36794711441874 ], "wc_review_avg": [ 556.0, 266.4451162997738 ], "wc_reply_reviewers_avg": [ 32.5, 34.90343822605446 ], "wc_reply_authors_avg": [ 638.5, 336.16848454309337 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2823572719654664256&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Intuition Machines, Inc.", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "7VYh_3ZD84", "title": "Sharpness-Aware Minimization in Large-Batch Training: Training Vision Transformer In Minutes", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Large-batch training is an important direction for distributed machine learning, which can improve the utilization of large-scale clusters and therefore accelerate the training process. However, recent work illustrates that large-batch training is prone to converge to sharp minima and cause a huge generalization gap. Sharpness-Aware Minimization (SAM) tries to narrow the generalization gap by seeking parameters that lie in a flat region. However, it requires two sequential gradient calculations, which doubles the computational overhead. In this paper, we propose a novel algorithm, LookSAM, to significantly reduce this additional training cost.
We further propose a layer-wise modification for adapting LookSAM to the large-batch training setting (Look-LayerSAM). Equipped with our enhanced training algorithm, we are the first to successfully scale up the batch size when training Vision Transformers (ViTs). With a 64k batch size, we are able to train ViTs from scratch within an hour while maintaining competitive performance.", "keywords": "Distributed Machine Learning;Large-Batch Training", "primary_area": "", "supplementary_material": "", "author": "Yong Liu;Siqi Mai;Xiangning Chen;Cho-Jui Hsieh;Yang You", "authorids": "~Yong_Liu13;siqimai@comp.nus.edu.sg;~Xiangning_Chen1;~Cho-Jui_Hsieh1;~Yang_You1", "gender": "M;;M;M;M", "homepage": "https://ai.comp.nus.edu.sg/people/yong;;;http://web.cs.ucla.edu/~chohsieh/index.html;https://www.comp.nus.edu.sg/~youy/", "dblp": "29/4867;;56/7393;14/2770;33/8167-1.html", "google_scholar": "2ejuK8UAAAAJ;;vNcBx1sAAAAJ;Wy89g4IAAAAJ;jF4dPZwAAAAJ", "orcid": ";;;;", "linkedin": ";;;;yang-you-0b92914b/", "or_profile": "~Yong_Liu13;siqimai@comp.nus.edu.sg;~Xiangning_Chen1;~Cho-Jui_Hsieh1;~Yang_You1", "aff": "National University of Singapore;;University of California, Los Angeles;University of California, Los Angeles;National University of Singapore", "aff_domain": "nus.edu.sg;;cs.ucla.edu;ucla.edu;nus.edu.sg", "position": "PhD student;;PhD student;Assistant Professor;Professor", "bibtex": "@misc{\nliu2022sharpnessaware,\ntitle={Sharpness-Aware Minimization in Large-Batch Training: Training Vision Transformer In Minutes},\nauthor={Yong Liu and Siqi Mai and Xiangning Chen and Cho-Jui Hsieh and Yang You},\nyear={2022},\nurl={https://openreview.net/forum?id=7VYh_3ZD84}\n}", "github": "", "project": "", "reviewers": "1Hu3;xDAN;USHV;dsYy", "site": "https://openreview.net/forum?id=7VYh_3ZD84", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "2;2;2;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "83;63;138;47", "wc_summary_review": "60;32;104;150", "wc_main_review": "422;383;1238;129", "wc_review": "565;478;1480;326", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 82.75, 34.35385713424331 ], "wc_summary_review_avg": [ 86.5, 44.75209492303126 ], "wc_main_review_avg": [ 543.0, 416.7319282224485 ], "wc_review_avg": [ 712.25, 451.4379110132422 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7801211729332557368&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "National University of Singapore;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.ucla.edu", "aff_unique_abbr": "NUS;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Singapore;United States" }, { "id": "7WVAI3dRwhR", "title": "Adversarial twin neural 
networks: maximizing physics recovery for physical system", "track": "main", "status": "Reject", "tldr": "", "abstract": "The exact modeling of modern physical systems is challenging due to the expanding system territory and insufficient sensors. To tackle this problem, existing methods utilize sparse regression to find physical parameters or add another virtual learning model like a Neural Network (NN) to universally approximate the unobserved physical quantities. However, the two models can't perfectly play their own roles in joint learning without proper restrictions. Thus, we propose (1) sparsity regularization for the physical model and (2) physical superiority over the virtual model. They together define output boundaries for the physical and virtual models. Further, even if the two models output properly, the joint model still can't guarantee learning maximal physical knowledge. For example, if the data of an observed node can linearly represent those of an unobserved node, these two nodes can be aggregated. Therefore, we propose (3) to seek the dissimilarity of physical and virtual outputs to obtain maximal physics. To achieve goals (1)-(3), we design a twin structure of the Physical Neural Network (PNN) and Virtual Neural Network (VNN), where sparse regularization and skip-connections are utilized to guarantee (1) and (2). Then, we propose an adversarial learning scheme to maximize output dissimilarity, achieving (3). We denote the model as the Adversarial Twin Neural Network (ATN). Finally, we conduct extensive experiments over various systems to demonstrate the superior performance of ATN over other state-of-the-art methods.", "keywords": "Physical Equation Learning;Incomplete Observability;Twin Neural Network;Mini-Max Game", "primary_area": "", "supplementary_material": "", "author": "Haoran Li;Erik Blasch;Jingyi Yuan;Yang Weng", "authorids": "~Haoran_Li6;~Erik_Blasch1;~Jingyi_Yuan1;~Yang_Weng1", "gender": "M;M;F;", "homepage": ";https://sites.google.com/site/erikblasch/;;", "dblp": ";01/4960;248/7785;", "google_scholar": "https://scholar.google.com/citations?hl=en;Po7s1TsAAAAJ;1k_2PUwAAAAJ;", "orcid": ";0000-0001-6894-6108;0000-0002-2850-1582;", "linkedin": ";erik-blasch-76a0429/;jingyi-yuan-7a1757171/;", "or_profile": "~Haoran_Li6;~Erik_Blasch1;~Jingyi_Yuan1;~Yang_Weng1", "aff": "Arizona State University;Air Force Research Laboratory;Arizona State University;", "aff_domain": "asu.edu;us.af.mil;asu.edu;", "position": "PhD student;Principal Researcher;PhD student;", "bibtex": "@misc{\nli2022adversarial,\ntitle={Adversarial twin neural networks: maximizing physics recovery for physical system},\nauthor={Haoran Li and Erik Blasch and Jingyi Yuan and Yang Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=7WVAI3dRwhR}\n}", "github": "", "project": "", "reviewers": "8dT9;dem1;FaHY", "site": "https://openreview.net/forum?id=7WVAI3dRwhR", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;2;3", "correctness": "3;3;2", "technical_novelty": "2;3;2", "empirical_novelty": "2;0;3", "wc_summary_paper": "101;42;180", "wc_summary_review": "31;20;125", "wc_main_review": "964;258;431", "wc_review": "1096;320;736", "wc_reply_reviewers": "314;32;96", "wc_reply_authors": "771;83;441", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ],
"empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 107.66666666666667, 56.53514148051831 ], "wc_summary_review_avg": [ 58.666666666666664, 47.11923410054776 ], "wc_main_review_avg": [ 551.0, 300.4541007652694 ], "wc_review_avg": [ 717.3333333333334, 317.0755255280497 ], "wc_reply_reviewers_avg": [ 147.33333333333334, 120.71269840226236 ], "wc_reply_authors_avg": [ 431.6666666666667, 280.95234866827906 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.9999999999999997, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9464099676175901176&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Arizona State University;Air Force Research Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.afrl.af.mil/", "aff_unique_abbr": "ASU;AFRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Continual Learning with Recursive Gradient Optimization", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6560", "id": "7YDLgf9_zgm", "poster": "", "openreview": "https://openreview.net/forum?id=7YDLgf9_zgm", "slides": "https://iclr.cc/virtual/2022/poster/6560", "video": "https://iclr.cc/virtual/2022/poster/6560", "author_site": "Hao Liu, Huaping Liu", "tldr": "", "abstract": "Learning multiple tasks sequentially without forgetting previous knowledge, called Continual Learning(CL), remains a long-standing challenge for neural networks. Most existing methods rely on additional network capacity or data replay. In contrast, we introduce a novel approach which we refer to as Recursive Gradient Optimization(RGO). RGO is composed of an iteratively updated optimizer that modifies the gradient to minimize forgetting without data replay and a virtual Feature Encoding Layer(FEL) that represents different long-term structures with only task descriptors. Experiments demonstrate that RGO has significantly better performance on popular continual classification benchmarks when compared to the baselines and achieves new state-of-the-art performance on 20-split-CIFAR100(82.22%) and 20-split-miniImageNet(72.63%). 
With higher average accuracy than Single-Task Learning (STL), this method is flexible and reliable in providing continual learning capabilities for learning models that rely on gradient descent.", "keywords": "continual learning;lifelong learning", "primary_area": "", "supplementary_material": "/attachment/904ae61e1113f88a24f7ca54704a06406dfe99de.zip", "author": "Hao Liu;Huaping Liu", "authorids": "~Hao_Liu18;~Huaping_Liu1", "gender": "M;M", "homepage": ";https://sites.google.com/site/thuliuhuaping/", "dblp": ";", "google_scholar": ";", "orcid": "0000-0001-7080-3392;", "linkedin": ";", "or_profile": "~Hao_Liu18;~Huaping_Liu1", "aff": "Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;Associate Professor", "bibtex": "@inproceedings{\nliu2022continual,\ntitle={Continual Learning with Recursive Gradient Optimization},\nauthor={Hao Liu and Huaping Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7YDLgf9_zgm}\n}", "github": "", "project": "", "reviewers": "DhW7;Ztre;CQxg;oYYz", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;4;4;4", "correctness": "3;4;4;3", "technical_novelty": "3;4;4;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "40;241;281;83", "wc_summary_review": "27;102;55;84", "wc_main_review": "160;392;281;774", "wc_review": "227;735;617;941", "wc_reply_reviewers": "0;29;0;585", "wc_reply_authors": "554;228;124;853", "reply_reviewers": "0;1;0;2", "reply_authors": "1;1;1;3", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 161.25, 101.88811265304702 ], "wc_summary_review_avg": [ 67.0, 28.53944638566067 ], "wc_main_review_avg": [ 401.75, 230.04822863912688 ], "wc_review_avg": [ 630.0, 259.9634589706792 ], "wc_reply_reviewers_avg": [ 153.5, 249.40779859499182 ], "wc_reply_authors_avg": [ 439.75, 286.51559730667367 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9414094872676506368&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=7YDLgf9_zgm", "email": "tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "7Z7u2z1Ornl", "title": "Pruning Edges and Gradients to Learn Hypergraphs from Larger Sets", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper aims for set-to-hypergraph prediction, where the goal is to infer the set of relations for a given set of entities. This is a common abstraction for applications in particle physics, biological systems and combinatorial optimization. We address two common scaling problems encountered in set-to-hypergraph tasks that limit the size of the input set: the exponentially growing number of hyperedges and the run-time complexity, both leading to higher memory requirements.
We make three contributions. First, we propose to predict and supervise the \\emph{positive} edges only, which changes the asymptotic memory scaling from exponential to linear. Second, we introduce a training method that encourages iterative refinement of the predicted hypergraph, which allows us to skip iterations in the backward pass for improved efficiency and constant memory usage. Third, we combine both contributions in a single set-to-hypergraph model that enables us to address problems with larger input set sizes. We provide ablations for our main technical contributions and show that our model outperforms prior state-of-the-art, especially for larger sets.", "keywords": "set-to-hypergraph", "primary_area": "", "supplementary_material": "/attachment/da2df53692e93a9b9c2a16899ad8734564ef6804.zip", "author": "David W Zhang;Gertjan J. Burghouts;Cees G. M. Snoek", "authorids": "~David_W_Zhang1;~Gertjan_J._Burghouts1;~Cees_G._M._Snoek1", "gender": "M;M;M", "homepage": "https://davzha.netlify.app/;https://gertjanburghouts.github.io/;http://www.ceessnoek.info", "dblp": "119/0960;84/2061;s/CeesSnoek", "google_scholar": "https://scholar.google.nl/citations?user=MG3oLzUAAAAJ;zN6afwwAAAAJ;https://scholar.google.nl/citations?user=0uKdbscAAAAJ", "orcid": "0000-0002-2137-1738;0000-0001-6265-7276;0000-0001-9092-1556", "linkedin": "david-zhang-1b86b314a;gertjanburghouts/;cgmsnoek/", "or_profile": "~David_W_Zhang1;~Gertjan_J._Burghouts1;~Cees_Snoek1", "aff": "University of Amsterdam;TNO;University of Amsterdam", "aff_domain": "uva.nl;tno.nl;uva.nl", "position": "PhD student;Researcher;Full Professor", "bibtex": "@misc{\nzhang2022pruning,\ntitle={Pruning Edges and Gradients to Learn Hypergraphs from Larger Sets},\nauthor={David W Zhang and Gertjan J. Burghouts and Cees G. M. 
Snoek},\nyear={2022},\nurl={https://openreview.net/forum?id=7Z7u2z1Ornl}\n}", "github": "", "project": "", "reviewers": "ezwC;am1Q;d3zQ;hMC3", "site": "https://openreview.net/forum?id=7Z7u2z1Ornl", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "2;3;3;3", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "103;52;67;63", "wc_summary_review": "45;29;49;25", "wc_main_review": "781;362;265;213", "wc_review": "929;443;381;301", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 71.25, 19.13602623325961 ], "wc_summary_review_avg": [ 37.0, 10.198039027185569 ], "wc_main_review_avg": [ 405.25, 223.43273596319767 ], "wc_review_avg": [ 513.5, 245.1137491043699 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7642057874894819591&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Amsterdam;TNO", "aff_unique_dep": ";", "aff_unique_url": "https://www.uva.nl;https://www.tno.nl", "aff_unique_abbr": "UvA;TNO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "The Effects of Invertibility on the Representational Complexity of Encoders in Variational Autoencoders", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6975", "id": "7_JR7WpwKV1", "poster": "", "openreview": "https://openreview.net/forum?id=7_JR7WpwKV1", "slides": "https://iclr.cc/virtual/2022/poster/6975", "video": "https://iclr.cc/virtual/2022/poster/6975", "author_site": "Divyansh Pareek, Andrej Risteski", "tldr": "", "abstract": "Training and using modern neural-network based latent-variable generative models (like Variational Autoencoders) often require simultaneously training a generative direction along with an inferential (encoding) direction, which approximates the posterior distribution over the latent variables. Thus, the question arises: how complex does the inferential model need to be, in order to be able to accurately model the posterior distribution of a given generative model? In this paper, we identify an important property of the generative map impacting the required size of the encoder. We show that if the generative map is ``strongly invertible\" (in a sense we suitably formalize), the inferential model need not be much more complex. Conversely, we prove that there exist non-invertible generative maps, for which the encoding direction needs to be exponentially larger (under standard assumptions in computational complexity). Importantly, we do not require the generative model to be layerwise invertible, which a lot of the related literature assumes and isn't satisfied by many architectures used in practice (e.g. convolution and pooling based networks). 
Thus, we provide theoretical support for the empirical wisdom that learning deep generative models is harder when data lies on a low-dimensional manifold.", "keywords": "variational autoencoders;encoder;representational complexity;Langevin;invertibility;deep learning theory", "primary_area": "", "supplementary_material": "", "author": "Divyansh Pareek;Andrej Risteski", "authorids": "~Divyansh_Pareek1;~Andrej_Risteski2", "gender": "M;M", "homepage": ";", "dblp": "224/5879.html;63/11143", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Divyansh_Pareek1;~Andrej_Risteski2", "aff": ";Carnegie Mellon University", "aff_domain": ";cmu.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\npareek2022the,\ntitle={The Effects of Invertibility on the Representational Complexity of Encoders in Variational Autoencoders },\nauthor={Divyansh Pareek and Andrej Risteski},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7_JR7WpwKV1}\n}", "github": "", "project": "", "reviewers": "agVY;8CZc;hddw", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;3;3", "correctness": "4;3;4", "technical_novelty": "2;2;4", "empirical_novelty": "3;2;0", "wc_summary_paper": "45;87;65", "wc_summary_review": "33;200;32", "wc_main_review": "190;709;252", "wc_review": "268;996;349", "wc_reply_reviewers": "121;118;110", "wc_reply_authors": "728;996;532", "reply_reviewers": "1;1;2", "reply_authors": "2;4;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 65.66666666666667, 17.15290710702481 ], "wc_summary_review_avg": [ 88.33333333333333, 78.96131260870028 ], "wc_main_review_avg": [ 383.6666666666667, 231.43369580844438 ], "wc_review_avg": [ 537.6666666666666, 325.7732681209774 ], "wc_reply_reviewers_avg": [ 116.33333333333333, 4.642796092394706 ], "wc_reply_authors_avg": [ 752.0, 190.1858739934874 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fjsaVQmuG-YJ:scholar.google.com/&scioq=The+Effects+of+Invertibility+on+the+Representational+Complexity+of+Encoders+in+Variational+Autoencoders&hl=en&as_sdt=0,5", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=7_JR7WpwKV1", "email": ";cmu.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Possibility Before Utility: Learning And Using Hierarchical Affordances", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6681", "id": "7b4zxUnrO2N", "poster": "", "openreview": "https://openreview.net/forum?id=7b4zxUnrO2N", "slides": "https://iclr.cc/virtual/2022/poster/6681", "video": "https://iclr.cc/virtual/2022/poster/6681", "author_site": "Robert S Costales, Shariq Iqbal, Fei Sha", "tldr": "", "abstract": "Reinforcement learning algorithms struggle on tasks with complex hierarchical 
dependency structures. Humans and other intelligent agents do not waste time assessing the utility of every high-level action in existence, but instead only consider ones they deem possible in the first place. By focusing only on what is feasible, or \"afforded'', at the present moment, an agent can spend more time both evaluating the utility of and acting on what matters. To this end, we present Hierarchical Affordance Learning (HAL), a method that learns a model of hierarchical affordances in order to prune impossible subtasks for more effective learning. Existing works in hierarchical reinforcement learning provide agents with structural representations of subtasks but are not affordance-aware, and by grounding our definition of hierarchical affordances in the present state, our approach is more flexible than the multitude of approaches that ground their subtask dependencies in a symbolic history. While these logic-based methods often require complete knowledge of the subtask hierarchy, our approach is able to utilize incomplete and varying symbolic specifications. Furthermore, we demonstrate that relative to non-affordance-aware methods, HAL agents are better able to efficiently learn complex tasks, navigate environment stochasticity, and acquire diverse skills in the absence of extrinsic supervision---all of which are hallmarks of human learning.", "keywords": "RL;HRL;reinforcement learning;hierarchical reinforcement learning;affordances;hierarchical affordances", "primary_area": "", "supplementary_material": "", "author": "Robby Costales;Shariq Iqbal;Fei Sha", "authorids": "~Robby_Costales1;~Shariq_Iqbal1;~Fei_Sha3", "gender": "M;M;M", "homepage": "https://robbycostales.com/;https://shariqiqbal2810.github.io/;http://feisha.org", "dblp": "263/7351;195/5885;13/3601", "google_scholar": "BgpME38AAAAJ;pRb9yXEAAAAJ;HDHOS0QAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Robby_Costales1;~Shariq_Iqbal1;~Fei_Sha2", "aff": "Google;University of Southern California;Google", "aff_domain": "google.com;usc.edu;google.com", "position": "Intern;PhD student;research scientist", "bibtex": "@inproceedings{\ncostales2022possibility,\ntitle={Possibility Before Utility: Learning And Using Hierarchical Affordances},\nauthor={Robby Costales and Shariq Iqbal and Fei Sha},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7b4zxUnrO2N}\n}", "github": "", "project": "", "reviewers": "m66y;3eiv;QUoN;LfMQ", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;3;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "147;171;88;69", "wc_summary_review": "40;64;65;13", "wc_main_review": "1025;584;417;385", "wc_review": "1212;819;570;467", "wc_reply_reviewers": "0;0;24;65", "wc_reply_authors": "1737;904;630;1147", "reply_reviewers": "0;0;1;1", "reply_authors": "3;2;1;3", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 118.75, 41.6795813318704 ], "wc_summary_review_avg": [ 45.5, 21.266170318136737 ], "wc_main_review_avg": [ 602.75, 255.22771695096125 ], "wc_review_avg": [ 767.0, 287.0270022140774 ], "wc_reply_reviewers_avg": [ 22.25, 26.55536668924005 ], "wc_reply_authors_avg": [ 1104.5, 408.41553594348 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 
0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1637590798034368393&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=7b4zxUnrO2N", "email": "google.com;usc.edu;google.com", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;University of Southern California", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.usc.edu", "aff_unique_abbr": "Google;USC", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Mountain View;Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "7d_GchF1e7", "title": "Variance Pruning: Pruning Language Models via Temporal Neuron Variance", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As language models become larger, different pruning methods have been proposed to reduce model size. However, the typical sparsity patterns that are formed by common pruning regimes do not fully exploit the properties of modern hardware devices on which these models are being trained and deployed. Most known unstructured, or even structured, pruning regimes usually introduce requirements for additional hardware components to make these sparsity patterns useful. Here we propose a simple pruning algorithm based on variance analysis of output neurons that correspond to entire rows of weights. Our algorithm facilitates the construction of row-sparse matrices, allowing an extremely convenient way of exploiting this sparsity on existing hardware architectures. Empirical experiments with natural language understanding tasks show that our method leads to little to no accuracy degradation, and at times even better accuracy, using a 50\\% sparse BERT\textsubscript{LARGE} model.
", "keywords": "Natural language processing;Transformers;Language Models;Pruning", "primary_area": "", "supplementary_material": "/attachment/4a0500695cae74ac18ca7783c8f059d2ef5a2583.zip", "author": "Berry Weinstein;Yonatan Belinkov", "authorids": "~Berry_Weinstein1;~Yonatan_Belinkov1", "gender": "M;M", "homepage": ";https://www.belinkov.com", "dblp": ";136/8705", "google_scholar": ";https://scholar.google.com/citations?authorid=K-6ujU4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Berry_Weinstein1;~Yonatan_Belinkov1", "aff": ";Technion, Technion", "aff_domain": ";technion.ac.il", "position": ";Assistant Professor", "bibtex": "@misc{\nweinstein2022variance,\ntitle={Variance Pruning: Pruning Language Models via Temporal Neuron Variance},\nauthor={Berry Weinstein and Yonatan Belinkov},\nyear={2022},\nurl={https://openreview.net/forum?id=7d_GchF1e7}\n}", "github": "", "project": "", "reviewers": "c47M;3AcL;7Y1c;3RVV", "site": "https://openreview.net/forum?id=7d_GchF1e7", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "227;64;97;51", "wc_summary_review": "82;40;26;49", "wc_main_review": "615;351;143;338", "wc_review": "924;455;266;438", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 109.75, 69.7401426726387 ], "wc_summary_review_avg": [ 49.25, 20.60794749605113 ], "wc_main_review_avg": [ 361.75, 167.82934040268407 ], "wc_review_avg": [ 520.75, 244.2737961796148 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TOfzeyIQv5cJ:scholar.google.com/&scioq=Variance+Pruning:+Pruning+Language+Models+via+Temporal+Neuron+Variance&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "title": "Variational Neural Cellular Automata", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6756", "id": "7fFO4cMBx_9", "poster": "", "openreview": "https://openreview.net/forum?id=7fFO4cMBx_9", "slides": "https://iclr.cc/virtual/2022/poster/6756", "video": "https://iclr.cc/virtual/2022/poster/6756", "author_site": "Rasmus Berg Palm, Miguel Gonzalez-Duque, Shyam Sudhakaran, Sebastian Risi", "tldr": "", "abstract": "In nature, the process of cellular growth and differentiation has lead to an amazing diversity of organisms --- algae, starfish, giant sequoia, tardigrades, and orcas are all created by the same generative process.\nInspired by the incredible diversity of this biological generative process, we propose a generative model, the Variational Neural Cellular Automata (VNCA), which is loosely inspired by the biological 
processes of cellular growth and differentiation. Unlike previous related works, the VNCA is a proper probabilistic generative model, and we evaluate it according to best practices. We find that the VNCA learns to reconstruct samples well and that despite its relatively few parameters and simple local-only communication, the VNCA can learn to generate a large variety of output from information encoded in a common vector format. While there is a significant gap to the current state-of-the-art in terms of generative modeling performance, we show that the VNCA can learn a purely self-organizing generative process of data. Additionally, the self-organizing nature bestows the VNCA with some inherent robustness against perturbations in the early stages of growth.", "keywords": "Neural Cellular Automata;Cellular Automata;Self-Organization;Generative Models", "primary_area": "", "supplementary_material": "", "author": "Rasmus Berg Palm;Miguel Gonz\u00e1lez Duque;Shyam Sudhakaran;Sebastian Risi", "authorids": "~Rasmus_Berg_Palm1;~Miguel_Gonz\u00e1lez_Duque1;~Shyam_Sudhakaran1;~Sebastian_Risi1", "gender": "M;M;M;M", "homepage": ";https://www.miguelgondu.com/;https://shyamsn97.github.io/;http://www.sebastianrisi.com", "dblp": "203/8376;244/9609.html;;81/7183", "google_scholar": "https://scholar.google.dk/citations?user=smoQomYAAAAJ;eje0FAYAAAAJ;;Tf8winBIYUsC", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Rasmus_Berg_Palm1;~Miguel_Gonz\u00e1lez_Duque1;~Shyam_Sudhakaran1;~Sebastian_Risi1", "aff": "IT University of Copenhagen;IT University;IT University;IT University of Copenhagen", "aff_domain": "itu.dk;itu.dk;itu.dk;itu.dk", "position": "Postdoc;PhD student;Researcher;Professor", "bibtex": "@inproceedings{\npalm2022variational,\ntitle={Variational Neural Cellular Automata},\nauthor={Rasmus Berg Palm and Miguel Gonz{\\'a}lez Duque and Shyam Sudhakaran and Sebastian Risi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7fFO4cMBx_9}\n}", "github": "", "project": "", "reviewers": "cp9d;AnwX;rDYM;kwgv", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;4;4;5", "correctness": "3;4;4;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;1;3;0", "wc_summary_paper": "47;45;57;144", "wc_summary_review": "44;68;33;114", "wc_main_review": "247;350;344;855", "wc_review": "338;463;434;1113", "wc_reply_reviewers": "0;152;58;92", "wc_reply_authors": "479;568;597;1375", "reply_reviewers": "0;1;1;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 73.25, 41.09972627646077 ], "wc_summary_review_avg": [ 64.75, 31.123744954616242 ], "wc_main_review_avg": [ 449.0, 237.94221987701133 ], "wc_review_avg": [ 587.0, 307.189680816267 ], "wc_reply_reviewers_avg": [ 75.5, 55.070409477322755 ], "wc_reply_authors_avg": [ 754.75, 360.73146175513995 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8036499533836302391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=7fFO4cMBx_9", "email": 
"itu.dk;itu.dk;itu.dk;itu.dk", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "IT University of Copenhagen;IT University", "aff_unique_dep": ";", "aff_unique_url": "https://itu.dk;", "aff_unique_abbr": "ITU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Denmark;" }, { "title": "Exploring Memorization in Adversarial Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6359", "id": "7gE9V9GBZaI", "poster": "", "openreview": "https://openreview.net/forum?id=7gE9V9GBZaI", "slides": "https://iclr.cc/virtual/2022/poster/6359", "video": "https://iclr.cc/virtual/2022/poster/6359", "author_site": "Yinpeng Dong, Ke Xu, Xiao Yang, Tianyu Pang, Zhijie Deng, Hang Su, Jun Zhu", "tldr": "", "abstract": "Deep learning models have a propensity for fitting the entire training set even with random labels, which requires memorization of every training sample. In this paper, we explore the memorization effect in adversarial training (AT) for promoting a deeper understanding of model capacity, convergence, generalization, and especially robust overfitting of the adversarially trained models. We first demonstrate that deep networks have sufficient capacity to memorize adversarial examples of training data with completely random labels, but not all AT algorithms can converge under the extreme circumstance. Our study of AT with random labels motivates further analyses on the convergence and generalization of AT. We find that some AT approaches suffer from a gradient instability issue and the recently suggested complexity measures cannot explain robust generalization by considering models trained on random labels. Furthermore, we identify a significant drawback of memorization in AT that it could result in robust overfitting. We then propose a new mitigation algorithm motivated by detailed memorization analyses. Extensive experiments on various datasets validate the effectiveness of the proposed method. 
", "keywords": "Adversarial examples;adversarial training;memorization;robust overfitting", "primary_area": "", "supplementary_material": "/attachment/8a1e51f3193fe605cadb87716a17c30a7f849603.zip", "author": "Yinpeng Dong;Ke Xu;Xiao Yang;Tianyu Pang;Zhijie Deng;Hang Su;Jun Zhu", "authorids": "~Yinpeng_Dong2;~Ke_Xu7;~Xiao_Yang4;~Tianyu_Pang1;~Zhijie_Deng1;~Hang_Su3;~Jun_Zhu2", "gender": "M;M;M;M;M;M;M", "homepage": "https://dongyp13.github.io;;https://ml.cs.tsinghua.edu.cn/~xiaoyang/;https://p2333.github.io/;https://thudzj.github.io/;http://ml.cs.tsinghua.edu.cn/~jun;", "dblp": "183/0980;181/2626;57/33851;202/2550;209/4959;50/2644-1;26/5371-6", "google_scholar": "6_4ad84AAAAJ;;bwkwp0MAAAAJ;wYDbtFsAAAAJ;J3dR0sUAAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ", "orcid": ";;0000-0001-9502-9962;0000-0003-0639-6176;0000-0002-0932-1631;;", "linkedin": ";;;%E5%A4%A9%E5%AE%87-%E5%BA%9E-b3999017a/;;;", "or_profile": "~Yinpeng_Dong2;~Ke_Xu7;~Xiao_Yang4;~Tianyu_Pang1;~Zhijie_Deng1;~Jun_Zhu2;~Hang_Su2", "aff": "Tsinghua University;Carnegie Mellon University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;cmu.edu;mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Undergrad student;PhD student;PhD student;PhD student;Professor;Associate Professor", "bibtex": "@inproceedings{\ndong2022exploring,\ntitle={Exploring Memorization in Adversarial Training},\nauthor={Yinpeng Dong and Ke Xu and Xiao Yang and Tianyu Pang and Zhijie Deng and Hang Su and Jun Zhu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7gE9V9GBZaI}\n}", "github": "", "project": "", "reviewers": "x9Ej;erPY;g5ke;c7wV", "pdf_size": 0, "recommendation": "3;6;8;10", "confidence": "4;4;2;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "110;90;71;112", "wc_summary_review": "27;56;32;69", "wc_main_review": "351;698;219;749", "wc_review": "488;844;322;930", "wc_reply_reviewers": "444;0;0;149", "wc_reply_authors": "1067;828;86;711", "reply_reviewers": "1;0;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 6.75, 2.5860201081971503 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 95.75, 16.67895380412093 ], "wc_summary_review_avg": [ 46.0, 17.219175357722563 ], "wc_main_review_avg": [ 504.25, 224.88594331349393 ], "wc_review_avg": [ 646.0, 249.899979991996 ], "wc_reply_reviewers_avg": [ 148.25, 181.26275817166635 ], "wc_reply_authors_avg": [ 673.0, 362.3789453044975 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.27907278609297376, "corr_recommendation_correctness": 0.8700628401410974, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13986529616809382017&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=7gE9V9GBZaI", "email": "tsinghua.edu.cn;cmu.edu;mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;0;0;0;0", "aff_unique_norm": "Tsinghua University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.tsinghua.edu.cn;https://www.cmu.edu", "aff_unique_abbr": "THU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "id": "7gRvcAulxa", "title": "A Frequency Perspective of Adversarial Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial examples pose a unique challenge for deep learning systems. Despite recent advances in both attacks and defenses, there is still a lack of clarity and consensus in the community about the true nature and underlying properties of adversarial examples. A deep understanding of these examples can provide new insights towards the development of more effective attacks and defenses. Driven by the common misconception that adversarial examples are high-frequency noise, we present a frequency-based understanding of adversarial examples, supported by theoretical and empirical findings. Our analysis shows that adversarial examples are neither in high-frequency nor in low-frequency components, but are simply dataset dependent. Particularly, we highlight the glaring disparities between models trained on CIFAR-10 and ImageNet-derived datasets. Utilizing this framework, we analyze many intriguing properties of training robust models with frequency constraints, and propose a frequency-based explanation for the commonly observed accuracy vs. robustness trade-off.", "keywords": "Adversarial examples;Frequency analysis;Adversarial Robustness;Adversarial training", "primary_area": "", "supplementary_material": "/attachment/e0ee12ccbaf5951207cb6a75bcbe7909a43e5d59.zip", "author": "Shishira Maiya;Max Ehrlich;Vatsal Agarwal;Ser-Nam Lim;Tom Goldstein;Abhinav Shrivastava", "authorids": "~Shishira_Maiya1;~Max_Ehrlich1;~Vatsal_Agarwal1;~Ser-Nam_Lim3;~Tom_Goldstein1;~Abhinav_Shrivastava2", "gender": "M;M;;M;M;M", "homepage": "http://cs.umd.edu/~shishira/;https://maxlikelihood.ai;;https://www.cs.umd.edu/~tomg/;http://abhinavsh.info;https://sites.google.com/site/sernam", "dblp": "230/4408;177/8998;;25/8184;65/10572;04/6633", "google_scholar": "43zd4zIAAAAJ;q-WSy3AAAAAJ;;KmSuVtgAAAAJ;mIF9BowAAAAJ;HX0BfLYAAAAJ", "orcid": ";;;;0000-0001-8928-8554;", "linkedin": "shishira-r-maiya/;;vatsal-agarwal929;;;", "or_profile": "~Shishira_Maiya1;~Max_Ehrlich1;~Vatsal_Agarwal1;~Tom_Goldstein1;~Abhinav_Shrivastava2;~Ser-Nam_Lim1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;Meta Facebook", "aff_domain": "umd.edu;umd.edu;umd.edu;umd.edu;cs.umd.edu;facebook.com", "position": "PhD student;PhD student;Undergrad student;Associate Professor;Assistant Professor;Research Scientist Manager", "bibtex": "@misc{\nmaiya2022a,\ntitle={A Frequency Perspective of Adversarial Robustness},\nauthor={Shishira Maiya and Max Ehrlich and Vatsal Agarwal and Ser-Nam Lim and Tom Goldstein and Abhinav Shrivastava},\nyear={2022},\nurl={https://openreview.net/forum?id=7gRvcAulxa}\n}", "github": "", "project": "", "reviewers": "pNyu;axh9;hy96;dU9j", "site": "https://openreview.net/forum?id=7gRvcAulxa", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "3;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "53;54;80;60", "wc_summary_review": "25;65;43;19", "wc_main_review": "280;441;62;423", "wc_review": "358;560;185;502", 
"wc_reply_reviewers": "394;0;114;0", "wc_reply_authors": "901;771;413;150", "reply_reviewers": "1;0;1;0", "reply_authors": "3;2;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 61.75, 10.871407452579449 ], "wc_summary_review_avg": [ 38.0, 17.916472867168917 ], "wc_main_review_avg": [ 301.5, 151.6945944982879 ], "wc_review_avg": [ 401.25, 144.90233780032673 ], "wc_reply_reviewers_avg": [ 127.0, 161.0248428038357 ], "wc_reply_authors_avg": [ 558.75, 296.0172081146635 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.8006407690254357, "corr_recommendation_correctness": 0.9198662110077999, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1567869082805871992&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;1;2", "aff_unique_norm": "University of Maryland;University of Maryland, College Park;Meta", "aff_unique_dep": ";Department of Computer Science;Meta Platforms, Inc.", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu;https://meta.com", "aff_unique_abbr": "UMD;UMD;Meta", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A General Analysis of Example-Selection for Stochastic Gradient Descent", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7168", "id": "7gWSJrP3opB", "poster": "", "openreview": "https://openreview.net/forum?id=7gWSJrP3opB", "slides": "https://iclr.cc/virtual/2022/poster/7168", "video": "https://iclr.cc/virtual/2022/poster/7168", "author_site": "Yucheng Lu, Si Yi Meng, Christopher De Sa", "tldr": "", "abstract": "Training example order in SGD has long been known to affect convergence rate. Recent results show that accelerated rates are possible in a variety of cases for permutation-based sample orders, in which each example from the training set is used once before any example is reused. In this paper, we develop a broad condition on the sequence of examples used by SGD that is sufficient to prove tight convergence rates in both strongly convex and non-convex settings. We show that our approach suffices to recover, and in some cases improve upon, previous state-of-the-art analyses for four known example-selection schemes: (1) shuffle once, (2) random reshuffling, (3) random reshuffling with data echoing, and (4) Markov Chain Gradient Descent. Motivated by our theory, we propose two new example-selection approaches. First, using quasi-Monte-Carlo methods, we achieve unprecedented accelerated convergence rates for learning with data augmentation. Second, we greedily choose a fixed scan-order to minimize the metric used in our condition and show that we can obtain more accurate solutions from the same number of epochs of SGD. We conclude by empirically demonstrating the utility of our approach for both convex linear-model and deep learning tasks. 
Our code is available at: https://github.com/EugeneLYC/qmc-ordering.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yucheng Lu;Si Yi Meng;Christopher De Sa", "authorids": "~Yucheng_Lu1;~Si_Yi_Meng2;~Christopher_De_Sa2", "gender": "M;F;M", "homepage": "https://www.yucheng-lu.me/;https://www.cs.cornell.edu/~siyimeng/;http://cs.cornell.edu/~cdesa", "dblp": ";250/9468;154/6336", "google_scholar": "FsBgPhQAAAAJ;https://scholar.google.ca/citations?user=Fey3yDgAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yucheng_Lu1;~Si_Yi_Meng2;~Christopher_De_Sa1", "aff": "Cornell University;Flatiron Institute;Cornell University", "aff_domain": "cornell.edu;flatironinstitute.org;cornell.edu", "position": "PhD student;Intern;Assistant Professor", "bibtex": "@inproceedings{\nlu2022a,\ntitle={A General Analysis of Example-Selection for Stochastic Gradient Descent},\nauthor={Yucheng Lu and Si Yi Meng and Christopher De Sa},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7gWSJrP3opB}\n}", "github": "", "project": "", "reviewers": "SC9E;nwty;XLAG;rjuV", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "136;122;40;66", "wc_summary_review": "34;27;231;17", "wc_main_review": "54;725;732;247", "wc_review": "224;874;1003;330", "wc_reply_reviewers": "0;0;234;0", "wc_reply_authors": "18;728;931;350", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 91.0, 39.408120990476064 ], "wc_summary_review_avg": [ 77.25, 88.97295937530683 ], "wc_main_review_avg": [ 439.5, 296.95664666748917 ], "wc_review_avg": [ 607.75, 335.9764686700543 ], "wc_reply_reviewers_avg": [ 58.5, 101.32497224277932 ], "wc_reply_authors_avg": [ 506.75, 350.8513752288852 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8962356431705257873&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=7gWSJrP3opB", "email": "cornell.edu;flatironinstitute.org;cornell.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Cornell University;Flatiron Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://flatironinstitute.org", "aff_unique_abbr": "Cornell;Flatiron", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Generalization Through the Lens of Leave-One-Out Error", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6753", "id": "7grkzyj89A_", "poster": "", "openreview": "https://openreview.net/forum?id=7grkzyj89A_", "slides": "https://iclr.cc/virtual/2022/poster/6753", "video": "https://iclr.cc/virtual/2022/poster/6753", "author_site": "Gregor Bachmann, Thomas Hofmann, Aurelien Lucchi", "tldr": "", "abstract": "Despite the tremendous empirical success of deep learning models in solving various learning tasks,
our theoretical understanding of their generalization ability is very limited. Classical generalization bounds based on tools such as the VC dimension or Rademacher complexity are so far unsuitable for deep models and it is doubtful that these techniques can yield tight bounds even in the most idealistic settings~\\citep{nagarajan2019uniform}. In this work, we instead revisit the concept of leave-one-out (LOO) error to measure the generalization ability of deep models in the so-called kernel regime. While popular in statistics, the LOO error has been largely overlooked in the context of deep learning. By building upon the recently established connection between neural networks and kernel learning, we leverage the closed-form expression for the leave-one-out error, giving us access to an efficient proxy for the test error. We show both theoretically and empirically that the leave-one-out error is capable of capturing various phenomena in generalization theory, such as double descent, random labels or transfer learning.\nOur work therefore demonstrates that the leave-one-out error provides a tractable way to estimate the generalization ability of deep neural networks in the kernel regime, opening the door to potential new research directions in the field of generalization.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/55ab265d30781917ebea7e446995bd0d61e4ac9d.zip", "author": "Gregor Bachmann;Thomas Hofmann;Aurelien Lucchi", "authorids": "~Gregor_Bachmann1;~Thomas_Hofmann1;~Aurelien_Lucchi1", "gender": "M;M;M", "homepage": "http://www.da.inf.ethz.ch/people/GregorBachmann;http://www.da.inf.ethz.ch/;http://people.inf.ethz.ch/alucchi/", "dblp": ";h/ThHofmann;14/5780", "google_scholar": "bbGqqloAAAAJ;T3hAyLkAAAAJ;https://scholar.google.ch/citations?user=V1ONSgIAAAAJ", "orcid": ";;", "linkedin": ";thomas-hofmann-1ab2402/;", "or_profile": "~Gregor_Bachmann1;~Thomas_Hofmann1;~Aurelien_Lucchi1", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;University of Basel", "aff_domain": "ethz.ch;ethz.ch;unibas.ch", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nbachmann2022generalization,\ntitle={Generalization Through the Lens of Leave-One-Out Error},\nauthor={Gregor Bachmann and Thomas Hofmann and Aurelien Lucchi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7grkzyj89A_}\n}", "github": "", "project": "", "reviewers": "dZ2X;6oHc;k266", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;2", "wc_summary_paper": "95;107;102", "wc_summary_review": "49;86;85", "wc_main_review": "561;691;189", "wc_review": "705;884;376", "wc_reply_reviewers": "0;0;20", "wc_reply_authors": "1084;816;399", "reply_reviewers": "0;0;1", "reply_authors": "2;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 101.33333333333333, 4.921607686744467 ], "wc_summary_review_avg": [ 73.33333333333333, 17.21110752456745 ], "wc_main_review_avg": [ 480.3333333333333, 212.73039797410763 ], "wc_review_avg": [ 655.0, 210.3821918952901 ], "wc_reply_reviewers_avg": [ 6.666666666666667, 9.428090415820632 ], "wc_reply_authors_avg": [
766.3333333333334, 281.8466880336818 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17232289047191270815&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=7grkzyj89A_", "email": "ethz.ch;ethz.ch;unibas.ch", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;University of Basel", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.unibas.ch", "aff_unique_abbr": "ETH Zurich;UniBas", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Learning Neural Contextual Bandits through Perturbed Rewards", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6714", "id": "7inCJ3MhXt3", "poster": "", "openreview": "https://openreview.net/forum?id=7inCJ3MhXt3", "slides": "https://iclr.cc/virtual/2022/poster/6714", "video": "https://iclr.cc/virtual/2022/poster/6714", "author_site": "Yiling Jia, Weitong ZHANG, Dongruo Zhou, Quanquan Gu, Hongning Wang", "tldr": "", "abstract": "Thanks to the power of representation learning, neural contextual bandit algorithms demonstrate remarkable performance improvements over their classical counterparts. But because their exploration has to be performed in the entire neural network parameter space to obtain nearly optimal regret, the resulting computational cost is prohibitively high. \nWe propose to perturb the rewards when updating the neural network to eliminate the need for explicit exploration and the corresponding computational overhead. We prove that a $\\tilde{O}(\\tilde{d}\\sqrt{T})$ regret upper bound is still achievable under standard regularity conditions, where $T$ is the number of rounds of interactions and $\\tilde{d}$ is the effective dimension of a neural tangent kernel matrix.
\nExtensive comparisons with several benchmark contextual bandit algorithms, including two recent neural contextual bandit models, demonstrate the effectiveness and computational efficiency of our proposed neural bandit algorithm.", "keywords": "contextual bandit;neural bandit", "primary_area": "", "supplementary_material": "", "author": "Yiling Jia;Weitong ZHANG;Dongruo Zhou;Quanquan Gu;Hongning Wang", "authorids": "~Yiling_Jia1;~Weitong_ZHANG1;~Dongruo_Zhou1;~Quanquan_Gu1;~Hongning_Wang1", "gender": "F;M;M;M;M", "homepage": "https://yilingjia.github.io;https://web.cs.ucla.edu/~weightzero;;http://web.cs.ucla.edu/~qgu/;http://www.cs.virginia.edu/~hw5x/", "dblp": "218/7475;96/4158;215/3401;50/4597;05/6545", "google_scholar": "6-pZivMAAAAJ;Ec6bzmcAAAAJ;1780wr0AAAAJ;GU9HgNAAAAAJ;qkdvKNoAAAAJ", "orcid": ";;;;0000-0002-6524-9195", "linkedin": "yiling-jia-793b2228/;;;;", "or_profile": "~Yiling_Jia1;~Weitong_ZHANG1;~Dongruo_Zhou1;~Quanquan_Gu1;~Hongning_Wang1", "aff": "University of Virginia;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of Virginia", "aff_domain": "virginia.edu;ucla.edu;cs.ucla.edu;cs.ucla.edu;virginia.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\njia2022learning,\ntitle={Learning Neural Contextual Bandits through Perturbed Rewards},\nauthor={Yiling Jia and Weitong ZHANG and Dongruo Zhou and Quanquan Gu and Hongning Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7inCJ3MhXt3}\n}", "github": "", "project": "", "reviewers": "8GZW;ZmVR;ha3h;NMyc", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;3;3;3", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "98;33;97;113", "wc_summary_review": "39;143;64;30", "wc_main_review": "190;203;425;514", "wc_review": "327;379;586;657", "wc_reply_reviewers": "0;50;50;205", "wc_reply_authors": "339;569;742;1526", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;2;3", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.25, 30.825111516424396 ], "wc_summary_review_avg": [ 69.0, 44.50280890011326 ], "wc_main_review_avg": [ 333.0, 140.1552710389445 ], "wc_review_avg": [ 487.25, 137.80851751615356 ], "wc_reply_reviewers_avg": [ 76.25, 77.08558555268293 ], "wc_reply_authors_avg": [ 794.0, 446.14403503801327 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": -0.5555555555555555, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12880021207599903542&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=7inCJ3MhXt3", "email": "virginia.edu;ucla.edu;cs.ucla.edu;cs.ucla.edu;virginia.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "University of Virginia;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.virginia.edu;https://www.ucla.edu", "aff_unique_abbr": "UVA;UCLA", "aff_campus_unique_index": "1;1;1", 
"aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "7kOsYRp4EmB", "title": "Improving Meta-Continual Learning Representations with Representation Replay", "track": "main", "status": "Reject", "tldr": "", "abstract": "Continual learning often suffers from catastrophic forgetting. Recently, meta-continual learning algorithms use meta-learning to learn how to continually learn. A recent state-of-the-art is online aware meta-learning (OML). This can be further improved by incorporating experience replay (ER) into its meta-testing. However, the use of ER only in meta-testing but not in meta-training suggests that the model may not be optimally meta-trained. In this paper, we remove this inconsistency in the use of ER and improve continual learning representations by integrating ER also into meta-training. We propose to store the samples' representations, instead of the samples themselves, into the replay buffer. This ensures the batch nature of ER does not conflict with the online-aware nature of OML. Moreover, we introduce a meta-learned sample selection scheme to replace the widely used reservoir sampling to populate the replay buffer. This allows the most significant samples to be stored, rather than relying on randomness. Class-balanced modifiers are further added to the sample selection scheme to ensure each class has sufficient samples stored in the replay buffer. Experimental results on a number of real-world meta-continual learning benchmark data sets demonstrate that the proposed method outperforms the state-of-the-art. Moreover, the learned representations have better clustering structures and are more discriminative.", "keywords": "meta-learning;continual learning;meta-continual learning;replay", "primary_area": "", "supplementary_material": "", "author": "Lawrence Ki-On Chan;James Kwok", "authorids": "~Lawrence_Ki-On_Chan1;~James_Kwok1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "lawrence-chan-71402b119/;", "or_profile": "~Lawrence_Ki-On_Chan1;~James_Kwok1", "aff": "Hong Kong University of Science and Technology;", "aff_domain": "hkust.edu;", "position": "Researcher;", "bibtex": "@misc{\nchan2022improving,\ntitle={Improving Meta-Continual Learning Representations with Representation Replay},\nauthor={Lawrence Ki-On Chan and James Kwok},\nyear={2022},\nurl={https://openreview.net/forum?id=7kOsYRp4EmB}\n}", "github": "", "project": "", "reviewers": "Quoa;oDEb;XC4V;iSAe", "site": "https://openreview.net/forum?id=7kOsYRp4EmB", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "66;47;53;107", "wc_summary_review": "26;20;15;13", "wc_main_review": "231;44;211;361", "wc_review": "323;111;279;481", "wc_reply_reviewers": "23;0;49;33", "wc_reply_authors": "320;139;310;503", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.25, 23.40272420040026 ], "wc_summary_review_avg": [ 18.5, 5.024937810560445 ], "wc_main_review_avg": [ 211.75, 112.67957889520177 ], "wc_review_avg": [ 298.5, 131.76019884623733 ], "wc_reply_reviewers_avg": [ 26.25, 17.76759691123141 ], "wc_reply_authors_avg": [ 318.0, 
128.77693892929744 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OS3xkP_LslsJ:scholar.google.com/&scioq=Improving+Meta-Continual+Learning+Representations+with+Representation+Replay&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "7kqWcX_r2w", "title": "Meta Attention For Off-Policy Actor-Critic", "track": "main", "status": "Reject", "tldr": "", "abstract": "Off-Policy Actor-Critic methods can effectively exploit past experiences and thus they have achieved great success in various reinforcement learning tasks. In many image-based and multi-source tasks, attention mechanisms have been employed in Actor-Critic methods to improve their sampling efficiency. In this paper, we propose a meta attention method for state-based reinforcement learning tasks, which combines the attention mechanism and meta-learning based on the Off-Policy Actor-Critic framework. Unlike previous attention-based work, our meta attention method introduces attention in the actor and the critic of the typical Actor-Critic framework rather than in multiple pixels of an image or multiple information sources. In contrast to existing meta-learning methods, the proposed meta-attention approach is able to function in both the gradient-based training phase and the agent's decision-making process.
The experimental results demonstrate the superiority of our meta-attention method on various continuous control tasks, building on Off-Policy Actor-Critic methods including DDPG, TD3, and SAC.", "keywords": "reinforcement learning;meta learning;Attention Mechanism", "primary_area": "", "supplementary_material": "", "author": "Jiateng Huang;Wanrong Huang;Long Lan;Dan Wu", "authorids": "~Jiateng_Huang1;~Wanrong_Huang1;lanlong@nudt.edu.cn;~Dan_Wu3", "gender": "M;M;;F", "homepage": "https://github.com/Gideon-K-Marx;;;https://xueshu.baidu.com/scholarID/CN-B1HAE3QK", "dblp": ";184/0874;;", "google_scholar": ";;;", "orcid": ";0000-0001-5778-9055;;", "linkedin": ";;;", "or_profile": "~Jiateng_Huang1;~Wanrong_Huang1;lanlong@nudt.edu.cn;~Dan_Wu3", "aff": "National University of Defense Technology;National Innovation Institute of Defense Technology;;National University of Defense Technology", "aff_domain": "nudt.edu.cn;nudt.edu.cn;;nudt.edu.cn", "position": "MS student;Assistant Research Fellow;;Researcher", "bibtex": "@misc{\nhuang2022meta,\ntitle={Meta Attention For Off-Policy Actor-Critic},\nauthor={Jiateng Huang and Wanrong Huang and Long Lan and Dan Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=7kqWcX_r2w}\n}", "github": "", "project": "", "reviewers": "ihtm;qn99;R4EG;v8k3", "site": "https://openreview.net/forum?id=7kqWcX_r2w", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;3;4", "correctness": "2;3;2;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "134;22;49;114", "wc_summary_review": "73;53;55;39", "wc_main_review": "728;403;385;424", "wc_review": "935;478;489;577", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.75, 45.81689099011412 ], "wc_summary_review_avg": [ 55.0, 12.083045973594572 ], "wc_main_review_avg": [ 485.0, 140.97340174657063 ], "wc_review_avg": [ 619.75, 186.00991237028202 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10835720677294923844&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Defense Technology;National Innovation Institute of Defense Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.nudt.edu.cn/;", "aff_unique_abbr": "NUDT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "7ktHTjV9FHw", "title": "Relative Molecule Self-Attention Transformer", "track": "main", "status": "Reject", "tldr": "", "abstract": "Self-supervised learning holds promise to revolutionize molecule property prediction - a task central to drug discovery and many other industries - by enabling data-efficient learning from scarce experimental data. Despite significant progress, non-pretrained methods can still be competitive in certain settings. We reason that architecture might be a key bottleneck.
In particular, enriching the backbone architecture with domain-specific inductive biases has been key for the success of self-supervised learning in other domains. In this spirit, we methodologically explore the design space of the self-attention mechanism tailored to molecular data. We identify a novel variant of self-attention adapted to processing molecules, inspired by the relative self-attention layer, which involves fusing embedded graph and distance relationships between atoms. Our main contribution is Relative Molecule Attention Transformer (R-MAT): a novel Transformer-based model based on the developed self-attention layer that achieves state-of-the-art or very competitive results across a wide range of molecule property prediction tasks. ", "keywords": "molecular property prediction;transformer-based methods;graph neural networks;self-supervised learning", "primary_area": "", "supplementary_material": "/attachment/7ba8f7e4b2c98ad7a17ea60368580c763c5b9560.zip", "author": "Lukasz Maziarka;Dawid Majchrowski;Tomasz Danel;Piotr Gai\u0144ski;Jacek Tabor;Igor T. Podolak;Pawel Morkisz;Stanislaw Kamil Jastrzebski", "authorids": "~Lukasz_Maziarka1;~Dawid_Majchrowski1;~Tomasz_Danel1;~Piotr_Gai\u0144ski1;~Jacek_Tabor1;~Igor_T._Podolak1;~Pawel_Morkisz1;~Stanislaw_Kamil_Jastrzebski1", "gender": "M;M;M;M;M;M;M;", "homepage": ";https://www.researchgate.net/profile/Dawid-Majchrowski;;https://github.com/panpiort8/;;http://www.gmum.net;http://home.agh.edu.pl/~morkiszp/;http://sjastrzebski.com", "dblp": ";;248/8081;;31/5172;99/2808;173/3108.html;139/0187", "google_scholar": "https://scholar.google.pl/citations?user=2dkp8z4AAAAJ;;WZq_OCsAAAAJ;;https://scholar.google.pl/citations?user=zSKYziUAAAAJ;30LH850AAAAJ;E8gToekAAAAJ;wbJxGQ8AAAAJ", "orcid": "0000-0001-6947-8131;;0000-0001-6053-0028;;0000-0001-6652-7727;0000-0001-9082-6735;0000-0002-4734-966X;0000-0003-4138-1818", "linkedin": "lukasz-maziarka-161749b3/;;;;;igor-podolak-58866447/;pawel-morkisz/;sjastrzebski/", "or_profile": "~Lukasz_Maziarka1;~Dawid_Majchrowski1;~Tomasz_Danel1;~Piotr_Gai\u0144ski1;~Jacek_Tabor1;~Igor_T._Podolak1;~Pawel_Morkisz1;~Stanislaw_Kamil_Jastrzebski1", "aff": "Jagiellonian University;NVIDIA;Jagiellonian University;Jagiellonian University;Jagiellonian University;Jagiellonian University;AGH University of Science and Technology, Krakow, Poland;Molecule.one", "aff_domain": "uj.edu.pl;nvidia.com;uj.edu.pl;uj.edu.pl;uj.edu.pl;uj.edu.pl;agh.edu.pl;molecule.one", "position": "PhD student;AI developer technology engineer;PhD student;MS student;Full Professor;Associate Professor;Assistant Professor;Chief Scientific Officer", "bibtex": "@misc{\nmaziarka2022relative,\ntitle={Relative Molecule Self-Attention Transformer},\nauthor={Lukasz Maziarka and Dawid Majchrowski and Tomasz Danel and Piotr Gai{\\'n}ski and Jacek Tabor and Igor T.
Podolak and Pawel Morkisz and Stanislaw Kamil Jastrzebski},\nyear={2022},\nurl={https://openreview.net/forum?id=7ktHTjV9FHw}\n}", "github": "", "project": "", "reviewers": "DtcY;GZ33;Lgqf", "site": "https://openreview.net/forum?id=7ktHTjV9FHw", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;2;4", "correctness": "3;4;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "71;62;31", "wc_summary_review": "51;18;76", "wc_main_review": "133;131;357", "wc_review": "255;211;464", "wc_reply_reviewers": "21;0;0", "wc_reply_authors": "339;800;1023", "reply_reviewers": "1;0;0", "reply_authors": "2;1;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 54.666666666666664, 17.13346303452853 ], "wc_summary_review_avg": [ 48.333333333333336, 23.753362335093154 ], "wc_main_review_avg": [ 207.0, 106.06915982823031 ], "wc_review_avg": [ 310.0, 110.36605758414434 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 720.6666666666666, 284.82080136269695 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11619803213292921980&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 18, "aff_unique_index": "0;1;0;0;0;0;2;3", "aff_unique_norm": "Jagiellonian University;NVIDIA;AGH University of Science and Technology;Molecule.one", "aff_unique_dep": ";NVIDIA Corporation;;", "aff_unique_url": "https://www.uj.edu.pl;https://www.nvidia.com;https://www.agh.edu.pl;https://molecule.one", "aff_unique_abbr": "UJ;NVIDIA;AGH;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Krakow", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "Poland;United States;" }, { "title": "Improving Federated Learning Face Recognition via Privacy-Agnostic Clusters", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6471", "id": "7l1IjZVddDW", "poster": "", "openreview": "https://openreview.net/forum?id=7l1IjZVddDW", "slides": "https://iclr.cc/virtual/2022/poster/6471", "video": "https://iclr.cc/virtual/2022/poster/6471", "author_site": "Qiang Meng, Feng Zhou, Hainan Ren, Tianshu Feng, Guochao Liu, Yuanqing Lin", "tldr": "", "abstract": "The growing public concerns about data privacy in face recognition can be partly relieved by the federated learning (FL) paradigm. However, conventional FL methods usually perform poorly due to the particularity of the task, \\textit{i.e.}, broadcasting class centers among clients is essential for recognition performance but leads to privacy leakage. To resolve the privacy-utility paradox, this work proposes PrivacyFace, a framework that largely improves federated learning face recognition by communicating auxiliary and privacy-agnostic information among clients. PrivacyFace mainly consists of two components: First, a practical Differentially Private Local Clustering (DPLC) mechanism is proposed to distill sanitized clusters from local class centers.
Second, a consensus-aware recognition loss subsequently encourages global consensuses among clients, which in turn leads to more discriminative features. The proposed schemes are mathematically proved to be differentially private, introduce only lightweight overhead, and yield prominent performance boosts (\\textit{e.g.}, +9.63\\% and +10.26\\% for TAR@FAR=1e-4 on IJB-B and IJB-C respectively). Extensive experiments and ablation studies on a large-scale dataset have demonstrated the efficacy and practicability of our method. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiang Meng;Feng Zhou;Hainan Ren;Tianshu Feng;Guochao Liu;Yuanqing Lin", "authorids": "~Qiang_Meng1;~Feng_Zhou8;~Hainan_Ren1;~Tianshu_Feng1;~Guochao_Liu1;~Yuanqing_Lin3", "gender": "M;M;M;;M;M", "homepage": "http://irvingmeng.github.io;http://www.f-zhou.com;https://github.com/sycophant-stone;;https://scholar.google.com/citations?hl=zh-CN&user=cpuwV68AAAAJ;https://www.aibee.com", "dblp": "131/7173.html;;312/6433.html;;208/7935;", "google_scholar": "LdCZhUIAAAAJ;zHpew00AAAAJ;;;https://scholar.google.com/citations?hl=zh-CN;", "orcid": "0009-0008-5508-5183;;;;;", "linkedin": ";;;;;", "or_profile": "~Qiang_Meng1;~Feng_Zhou8;~Hainan_Ren1;~Tianshu_Feng1;~Guochao_Liu1;~Yuanqing_Lin3", "aff": "Aibee Inc.;;Aibee;;Aibee Inc.;", "aff_domain": "aibee.com;;aibee.com;;aibee.com;", "position": "Researcher;;Researcher;;Researcher;", "bibtex": "@inproceedings{\nmeng2022improving,\ntitle={Improving Federated Learning Face Recognition via Privacy-Agnostic Clusters},\nauthor={Qiang Meng and Feng Zhou and Hainan Ren and Tianshu Feng and Guochao Liu and Yuanqing Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7l1IjZVddDW}\n}", "github": "", "project": "", "reviewers": "ZPue;7nj6;ytHU;VU44", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "5;5;4;3", "correctness": "2;4;3;3", "technical_novelty": "2;4;3;4", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "117;94;73;65", "wc_summary_review": "122;27;247;34", "wc_main_review": "286;104;241;363", "wc_review": "525;225;561;462", "wc_reply_reviewers": "0;16;18;0", "wc_reply_authors": "439;234;488;454", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 87.25, 20.17888748172208 ], "wc_summary_review_avg": [ 107.5, 88.81582066276256 ], "wc_main_review_avg": [ 248.5, 94.14483522743029 ], "wc_review_avg": [ 443.25, 130.8938023742912 ], "wc_reply_reviewers_avg": [ 8.5, 8.52936105461599 ], "wc_reply_authors_avg": [ 403.75, 99.60013805211317 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6368198407478012142&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=7l1IjZVddDW", "email": "aibee.com;;aibee.com;;aibee.com;", "author_num": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Aibee Inc.;AIBEE", "aff_unique_dep": ";", "aff_unique_url": "https://www.aibee.ai;https://www.aibee.cn", "aff_unique_abbr":
"Aibee;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "7oyVOECcrt", "title": "Local Permutation Equivariance For Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work we develop a new method, named {\\it locally permutation-equivariant graph neural networks}, which provides a framework for building graph neural networks that operate on local node neighbourhoods, through sub-graphs, while using permutation equivariant update functions. The potential benefits of learning on graph-structured data are vast, and relevant to many application domains. However, one of the challenges, is that graphs are not always of the same size, and often each node in a graph does not have the same connectivity. This necessitates that the update function must be flexible to the input size, which is not the case in most other domains.\n\nUsing our locally permutation-equivariant graph neural networks ensures an expressive update function through using permutation representations, while operating on a lower-dimensional space than that utilised in global permutation equivariance. Furthermore, the use of local update functions offers a significant improvement in GPU memory over global methods. We demonstrate that our method can outperform competing methods on a set of widely used graph benchmark classification tasks.", "keywords": "Graphs;Equivariance;Permutation Equivariance;Graph Neural Networks;Representations", "primary_area": "", "supplementary_material": "", "author": "Joshua Mitton;Roderick Murray-Smith", "authorids": "~Joshua_Mitton1;~Roderick_Murray-Smith1", "gender": ";M", "homepage": "https://github.com/JoshuaMitton;http://www.dcs.gla.ac.uk/~rod/", "dblp": ";78/604", "google_scholar": "https://scholar.google.co.uk/citations?user=OHIUJkkAAAAJ;https://scholar.google.co.uk/citations?user=laX7LzQAAAAJ", "orcid": ";", "linkedin": ";rodms/", "or_profile": "~Joshua_Mitton1;~Roderick_Murray-Smith1", "aff": "University of Glasgow;University of Glasgow", "aff_domain": "gla.ac.uk;gla.ac.uk", "position": "PhD student;Professor", "bibtex": "@misc{\nmitton2022local,\ntitle={Local Permutation Equivariance For Graph Neural Networks},\nauthor={Joshua Mitton and Roderick Murray-Smith},\nyear={2022},\nurl={https://openreview.net/forum?id=7oyVOECcrt}\n}", "github": "", "project": "", "reviewers": "zCkV;Y3RZ;auCg;KECi", "site": "https://openreview.net/forum?id=7oyVOECcrt", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;2", "correctness": "2;2;2;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "0;2;1;3", "wc_summary_paper": "76;124;56;100", "wc_summary_review": "28;51;22;69", "wc_main_review": "185;374;150;286", "wc_review": "289;549;228;455", "wc_reply_reviewers": "104;0;19;0", "wc_reply_authors": "810;779;943;781", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 89.0, 25.514701644346147 ], "wc_summary_review_avg": [ 42.5, 18.741664813991314 ], "wc_main_review_avg": [ 248.75, 87.87882281869734 ], "wc_review_avg": [ 380.25, 128.0339310495464 ], "wc_reply_reviewers_avg": [ 30.75, 42.99636612552275 ], "wc_reply_authors_avg": [ 828.25, 67.37720311796862 ], 
"reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8164037642962661729&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Glasgow", "aff_unique_dep": "", "aff_unique_url": "https://www.gla.ac.uk", "aff_unique_abbr": "Glasgow", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "7pZiaojaVGU", "title": "An Equivalence Between Data Poisoning and Byzantine Gradient Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "To address the resilience of distributed learning, the ``Byzantine\" literature considers a strong threat model where workers can report arbitrary gradients to the parameter server. While this model helped generate several fundamental results, it has however sometimes been considered unrealistic, when the workers are mostly trustworthy machines. In this paper, we show a surprising equivalence between this model and data poisoning, a threat considered much more realistic. More specifically, we prove that any gradient attack can be reduced to data poisoning in a personalized federated learning system that provides PAC guarantees (which we show are both desirable and realistic in various personalized federated learning contexts such as linear regression and classification). Maybe most importantly, we derive a simple and practical attack that may be constructed against classical personalized federated learning models, and we show both theoretically and empirically the effectiveness of this attack.", "keywords": "Federated learning;PAC learning;Byzantine attack;Data poisoning;Personalized learning", "primary_area": "", "supplementary_material": "", "author": "Sadegh Farhadkhani;Rachid Guerraoui;L\u00ea-Nguy\u00ean Hoang;Oscar Villemaud", "authorids": "~Sadegh_Farhadkhani1;~Rachid_Guerraoui1;~L\u00ea-Nguy\u00ean_Hoang1;~Oscar_Villemaud1", "gender": "M;M;M;M", "homepage": "https://sadeghfarhadkhani.github.io/;https://lpdwww.epfl.ch/rachid/;http://epfl.ch/le.hoang;", "dblp": "281/6141;g/RachidGuerraoui;;314/5758", "google_scholar": "X4axFjgAAAAJ;;https://scholar.google.ch/scholar?hl=en;", "orcid": ";;0000-0002-9236-5837;0000-0003-0516-5528", "linkedin": ";;l%C3%AA-nguy%C3%AAn-hoang;oscar-villemaud-64b730190/", "or_profile": "~Sadegh_Farhadkhani1;~Rachid_Guerraoui1;~L\u00ea-Nguy\u00ean_Hoang1;~Oscar_Villemaud1", "aff": "EPFL;;Swiss Federal Institute of Technology Lausanne;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;;epfl.ch;epfl.ch", "position": "PhD student;;Postdoc;PhD student", "bibtex": "@misc{\nfarhadkhani2022an,\ntitle={An Equivalence Between Data Poisoning and Byzantine Gradient Attacks},\nauthor={Sadegh Farhadkhani and Rachid Guerraoui and L{\\^e}-Nguy{\\^e}n Hoang and Oscar Villemaud},\nyear={2022},\nurl={https://openreview.net/forum?id=7pZiaojaVGU}\n}", "github": "", "project": "", "reviewers": "bDXf;a13A;oiiD;gjXp;xXNa", "site": "https://openreview.net/forum?id=7pZiaojaVGU", "pdf_size": 0, "recommendation": "3;5;5;5;6", "confidence": "4;3;4;3;3", "correctness": "3;4;4;3;4", "technical_novelty": "3;2;3;2;3", "empirical_novelty": "2;3;2;1;4", "wc_summary_paper": "33;42;26;53;79", "wc_summary_review": "37;34;109;39;305", "wc_main_review": 
"282;249;883;414;48", "wc_review": "352;325;1018;506;432", "wc_reply_reviewers": "0;0;134;567;74", "wc_reply_authors": "566;650;1681;2275;541", "reply_reviewers": "0;0;1;2;1", "reply_authors": "1;1;3;3;2", "recommendation_avg": [ 4.8, 0.9797958971132712 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 1.019803902718557 ], "wc_summary_paper_avg": [ 46.6, 18.55370582929459 ], "wc_summary_review_avg": [ 104.8, 103.95845323974379 ], "wc_main_review_avg": [ 375.2, 279.6565035896716 ], "wc_review_avg": [ 526.6, 253.7617780517783 ], "wc_reply_reviewers_avg": [ 155.0, 212.03584602609058 ], "wc_reply_authors_avg": [ 1142.6, 708.4136080002982 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 2.0, 0.8944271909999159 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6666666666666667, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15814948581438408162&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "id": "7qaCQiuOVf", "title": "Interpreting Reinforcement Policies through Local Behaviors", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many works in explainable AI have focused on explaining black-box classification models. Explaining deep reinforcement learning (RL) policies in a manner that could be understood by domain users has received much less attention. In this paper, we propose a novel perspective to understanding RL policies based on identifying important states from automatically learned meta-states. The key conceptual difference between our approach and many previous ones is that we form meta-states based on locality governed by the expert policy dynamics rather than based on similarity of actions, and that we do not assume any particular knowledge of the underlying topology of the state space. Theoretically, we show that our algorithm to find meta-states converges and the objective that selects important states from each meta-state is submodular leading to efficient high quality greedy selection. Experiments on three domains (four rooms, door-key and minipacman) and a carefully conducted user study illustrate that our perspective leads to better understanding of the policy. 
We conjecture that this is a result of our meta-states being more intuitive in that the corresponding important states are strong indicators of tractable intermediate goals that are easier for humans to interpret and follow.", "keywords": "Reinforcement Learning;Explainability", "primary_area": "", "supplementary_material": "/attachment/9837a6c7caca0a69702d8ef8de0db7a3e8d1b6d3.zip", "author": "Ronny Luss;Amit Dhurandhar;Miao Liu", "authorids": "~Ronny_Luss1;~Amit_Dhurandhar1;~Miao_Liu1", "gender": ";M;M", "homepage": ";https://researcher.watson.ibm.com/researcher/view.php?person=us-adhuran;https://sites.google.com/view/miaoliuhome", "dblp": "80/75;66/3289;", "google_scholar": "lBPWZdAAAAAJ;km9vIPEAAAAJ;7QHvAEYAAAAJ", "orcid": ";;", "linkedin": ";;miao-liu-3273a32b", "or_profile": "~Ronny_Luss1;~Amit_Dhurandhar1;~Miao_Liu1", "aff": "IBM;International Business Machines;International Business Machines", "aff_domain": "us.ibm.com;ibm.com;ibm.com", "position": "Research Scientist;Principal Researcher;Research Staff Member", "bibtex": "@misc{\nluss2022interpreting,\ntitle={Interpreting Reinforcement Policies through Local Behaviors},\nauthor={Ronny Luss and Amit Dhurandhar and Miao Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=7qaCQiuOVf}\n}", "github": "", "project": "", "reviewers": "GrkK;53LC;3j9K;u9SV", "site": "https://openreview.net/forum?id=7qaCQiuOVf", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "65;50;10;154", "wc_summary_review": "52;114;100;89", "wc_main_review": "888;402;31;747", "wc_review": "1005;566;141;990", "wc_reply_reviewers": "652;51;0;0", "wc_reply_authors": "4066;1196;668;1699", "reply_reviewers": "2;1;0;0", "reply_authors": "8;4;2;5", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.75, 52.63257071433999 ], "wc_summary_review_avg": [ 88.75, 22.993205518152532 ], "wc_main_review_avg": [ 517.0, 331.6481569374387 ], "wc_review_avg": [ 675.5, 355.3733951775231 ], "wc_reply_reviewers_avg": [ 175.75, 275.7502266544853 ], "wc_reply_authors_avg": [ 1907.25, 1298.57486788402 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 4.75, 2.165063509461097 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6225430174794673, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16274081595150825517&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "International Business Machines Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Latent Image Animator: Learning to Animate Images via Latent Space Navigation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6122", "id": "7r6kDq0mK_", "poster": "", "openreview": "https://openreview.net/forum?id=7r6kDq0mK_", "slides": "https://iclr.cc/virtual/2022/poster/6122", "video": "https://iclr.cc/virtual/2022/poster/6122", "author_site": "Yaohui Wang, Di Yang, Francois 
Bremond, Antitza Dantcheva", "tldr": "", "abstract": "Due to the remarkable progress of deep generative models, animating images has become increasingly efficient, and the associated results have become increasingly realistic. Current animation approaches commonly exploit structure representations extracted from driving videos. Such structure representations are instrumental in transferring motion from driving videos to still images. However, such approaches fail when the source image and driving video exhibit large appearance variation. Moreover, the extraction of structure information requires additional modules that endow the animation model with increased complexity. Deviating from such models, we here introduce the Latent Image Animator (LIA), a self-supervised autoencoder that avoids the need for structure representation. LIA is streamlined to animate images by linear navigation in the latent space. Specifically, motion in a generated video is constructed by linear displacement of codes in the latent space. Towards this, we learn a set of orthogonal motion directions simultaneously and use their linear combination to represent any displacement in the latent space. Extensive quantitative and qualitative analysis suggests that our model systematically and significantly outperforms state-of-the-art methods on the VoxCeleb, Taichi and TED-talk datasets with respect to generated quality.", "keywords": "Video generation;Generative Adversarial Network", "primary_area": "", "supplementary_material": "/attachment/3d1b0f87e508f09c25d26e1adc283749ac89a25c.zip", "author": "Yaohui Wang;Di Yang;Francois Bremond;Antitza Dantcheva", "authorids": "~Yaohui_Wang1;~Di_Yang4;~Francois_Bremond1;antitza.dantcheva@inria.fr", "gender": "M;M;M;", "homepage": "https://wyhsirius.github.io/;;http://www-sop.inria.fr/members/Francois.Bremond/;", "dblp": "168/6263-1.html;;90/6418;", "google_scholar": "R7LyAb4AAAAJ;Q-koPr0AAAAJ;h-oGBzsAAAAJ;", "orcid": ";0000-0002-8124-532X;0000-0003-2988-2142;", "linkedin": ";;francois-bremond-05263a5/;", "or_profile": "~Yaohui_Wang1;~Di_Yang4;~Francois_Bremond1;antitza.dantcheva@inria.fr", "aff": "Shanghai AI Laboratory;INRIA;inria;", "aff_domain": "pjlab.org.cn;inria.fr;inria.fr;", "position": "Research Scientist;PhD student;Researcher;", "bibtex": "@inproceedings{\nwang2022latent,\ntitle={Latent Image Animator: Learning to Animate Images via Latent Space Navigation},\nauthor={Yaohui Wang and Di Yang and Francois Bremond and Antitza Dantcheva},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7r6kDq0mK_}\n}", "github": "", "project": "", "reviewers": "gSih;PbBc;jq2i;UntW;EXdL", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "4;4;4;4;3", "correctness": "4;4;3;4;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;3;2;3;3", "wc_summary_paper": "161;73;83;173;96", "wc_summary_review": "18;23;38;133;52", "wc_main_review": "225;244;195;660;370", "wc_review": "404;340;316;966;518", "wc_reply_reviewers": "90;0;116;56;67", "wc_reply_authors": "1059;1003;856;747;974", "reply_reviewers": "1;0;2;1;1", "reply_authors": "2;2;2;1;2", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 117.2, 41.48445491988536 ], "wc_summary_review_avg": [ 52.8, 41.83491364876949 ], "wc_main_review_avg":
[ 338.8, 171.32355354708235 ], "wc_review_avg": [ 508.8, 239.06685257475576 ], "wc_reply_reviewers_avg": [ 65.8, 38.81443030626625 ], "wc_reply_authors_avg": [ 927.8, 112.1131571226143 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 1.8, 0.4 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6123724356957946, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 178, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7803625712624198725&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=7r6kDq0mK_", "email": "pjlab.org.cn;inria.fr;inria.fr;", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Shanghai AI Laboratory;INRIA", "aff_unique_dep": ";", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.inria.fr", "aff_unique_abbr": "SAIL;INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;France" }, { "id": "7sz69eztw9", "title": "Context-invariant, multi-variate time series representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern time series corpora, in particular those coming from sensor-based data, exhibit characteristics that have so far not been adequately addressed in the literature on representation learning for time series. In particular, such corpora often allow one to distinguish between \emph{exogenous} signals that describe a context that influences a given appliance and \emph{endogenous} signals that describe the internal state of the appliance. We propose a temporal convolution network based embedding that improves on the state-of-the-art by bringing recent advances in contrastive learning to the time series domain and by adopting a multi-resolution approach. Employing techniques borrowed from domain-adversarial learning, we make the embeddings invariant to the context provided by the exogenous signal.
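A minimal sketch of the domain-adversarial ingredient mentioned in the preceding sentence, assuming a standard gradient-reversal layer; the module names (GradReverse, context_head) and all shapes are our own illustrative choices, not code from the paper.

import torch
import torch.nn as nn

class GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, lambda_):
        ctx.lambda_ = lambda_
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Reverse (and scale) gradients flowing back into the encoder,
        # so the encoder learns to fool the context discriminator.
        return -ctx.lambda_ * grad_output, None

encoder = nn.Sequential(nn.Conv1d(1, 16, 5, padding=2), nn.ReLU(),
                        nn.AdaptiveAvgPool1d(1), nn.Flatten())
context_head = nn.Linear(16, 4)          # predicts the exogenous context class

x = torch.randn(8, 1, 128)               # a batch of endogenous signals
ctx_labels = torch.randint(0, 4, (8,))   # toy context labels
z = encoder(x)
ctx_logits = context_head(GradReverse.apply(z, 1.0))
adv_loss = nn.functional.cross_entropy(ctx_logits, ctx_labels)
adv_loss.backward()                      # encoder receives reversed gradients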
To show the effectiveness of our approach, we contribute new data sets to the research community and use both new as well as existing data sets to empirically verify that we can separate normal from abnormal internal appliance behaviour independent of the external signals in data sets from IoT and DevOps.", "keywords": "time series;representation learning;contrastive learning;context invariance;anomaly detection;domain adversarial learning", "primary_area": "", "supplementary_material": "/attachment/f6eb64f7b94213357103229c85440182aa60a0d9.zip", "author": "Stephan Rabanser;Tim Januschowski;Kashif Rasul;Oliver Borchert;Richard Kurle;Jan Gasthaus;Michael Bohlke-Schneider;Nicolas Papernot;Valentin Flunkert", "authorids": "~Stephan_Rabanser1;~Tim_Januschowski2;~Kashif_Rasul1;~Oliver_Borchert1;~Richard_Kurle1;~Jan_Gasthaus2;~Michael_Bohlke-Schneider1;~Nicolas_Papernot1;~Valentin_Flunkert2", "gender": "M;;M;M;M;M;M;M;M", "homepage": "https://rabanser.dev;;;;http://www.gatsby.ucl.ac.uk/~ucabjga/;;https://www.papernot.fr;;", "dblp": "210/2399;80/5769;;210/2562.html;11/5155;242/8809;162/1405;;54/8909", "google_scholar": "https://scholar.google.com/citations?hl=en;cfIrwmAAAAAJ;;q2YBN34AAAAJ;sSAJdVwAAAAJ;https://scholar.google.de/citations?user=19k2WQEAAAAJ;cGxq0cMAAAAJ;https://scholar.google.ca/citations?user=DzlwsFwAAAAJ;https://scholar.google.de/citations?user=EFdp8UMAAAAJ", "orcid": ";;;;;0000-0002-4969-2218;;;", "linkedin": ";;https://linkedin.com/in/borchero;;jan-gasthaus/;michael-bohlke-schneider-16a4ab93/;nicolaspapernot;;", "or_profile": "~Stephan_Rabanser1;~Kashif_Rasul1;~Oliver_Borchert1;~Richard_Kurle1;~Jan_Gasthaus2;~Michael_Bohlke-Schneider1;~Nicolas_Papernot1;~Valentin_Flunkert2;~Tim_Januschowski1", "aff": "University of Toronto;Zalando SE;;Amazon AWS AI;Amazon Development Center Germany;Amazon Development Center Germany;Google;Amazon;", "aff_domain": "toronto.edu;zalando.de;;amazon.de;amazon.de;amazon.de;google.com;amazon.com;", "position": "PhD student;Researcher;;Applied Scientist;Researcher;Researcher;Research Scientist;Principal Researcher;", "bibtex": "@misc{\nrabanser2022contextinvariant,\ntitle={Context-invariant, multi-variate time series representations},\nauthor={Stephan Rabanser and Tim Januschowski and Kashif Rasul and Oliver Borchert and Richard Kurle and Jan Gasthaus and Michael Bohlke-Schneider and Nicolas Papernot and Valentin Flunkert},\nyear={2022},\nurl={https://openreview.net/forum?id=7sz69eztw9}\n}", "github": "", "project": "", "reviewers": "PwXK;krgP;N1hN", "site": "https://openreview.net/forum?id=7sz69eztw9", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;4;3", "correctness": "2;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "120;78;38", "wc_summary_review": "52;39;34", "wc_main_review": "278;131;298", "wc_review": "450;248;370", "wc_reply_reviewers": "45;0;0", "wc_reply_authors": "1851;557;1123", "reply_reviewers": "1;0;0", "reply_authors": "4;2;2", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 78.66666666666667, 33.47967874530592 ], "wc_summary_review_avg": [ 41.666666666666664, 7.586537784494029 ], "wc_main_review_avg": [ 235.66666666666666, 74.45953412573989 ], "wc_review_avg": [ 356.0, 83.05821251788812 ], "wc_reply_reviewers_avg": 
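As a hedged illustration of the neural-network/factorization-machine connection invoked by the F2NN abstract above: the classic second-order FM interaction can be computed from low-rank field embeddings in linear time. This is the textbook identity, not necessarily the exact F2NN parameterization; fm_interaction is our own name.

import torch

def fm_interaction(emb):
    # emb: (batch, n_fields, k) field embeddings with feature values folded in.
    # Returns sum_{i<j} <v_i, v_j> per example, computed in O(n_fields * k)
    # via the identity 0.5 * (||sum_i v_i||^2 - sum_i ||v_i||^2).
    sum_sq = emb.sum(dim=1).pow(2).sum(dim=1)
    sq_sum = emb.pow(2).sum(dim=(1, 2))
    return 0.5 * (sum_sq - sq_sum)

emb = torch.randn(32, 10, 8, requires_grad=True)  # 32 rows, 10 fields, rank 8
logits = fm_interaction(emb)                      # shape (32,)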
[ 15.0, 21.213203435596427 ], "wc_reply_authors_avg": [ 1177.0, 529.6514577216481 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lB0FC6Fj838J:scholar.google.com/&scioq=Context-invariant,+multi-variate+time+series+representations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;2;3;2", "aff_unique_norm": "University of Toronto;Zalando SE;Amazon;Google", "aff_unique_dep": ";;Amazon Web Services AI;Google", "aff_unique_url": "https://www.utoronto.ca;https://www.zalando.de;https://aws.amazon.com;https://www.google.com", "aff_unique_abbr": "U of T;Zalando;AWS;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;2;1;1;2;2", "aff_country_unique": "Canada;Germany;United States" }, { "id": "7t_6BiC69a", "title": "Fieldwise Factorized Networks for Tabular Data Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Tabular data is one of the most common data-types in machine learning, however, deep neural networks have not yet convincingly outperformed classical baselines on such datasets. In this paper, we first investigate the theoretical connection between neural network and factorization machine techniques, and present fieldwise factorized neural networks (F2NN), a neural network architecture framework that is aimed for tabular classification. Our framework learns high-dimensional field representations by a low-rank factorization, and handles both categorical and numerical fields. Furthermore, we show that simply by changing our penultimate activation function, the framework recovers a range of popular tabular classification methods. We evaluate our method against state-of-the-art tabular baselines, including tree-based and deep neural network methods, on a range of tasks. 
Our findings suggest that our theoretically grounded but simple and shallow neural network architecture achieves as strong or better results than more complex methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen Almagor;Yedid Hoshen", "authorids": "~Chen_Almagor1;~Yedid_Hoshen3", "gender": ";M", "homepage": ";https://www.cs.huji.ac.il/~ydidh/", "dblp": ";136/0280", "google_scholar": ";https://scholar.google.co.il/citations?user=6y1-qS4AAAAJ", "orcid": ";", "linkedin": "chen-almagor/;", "or_profile": "~Chen_Almagor1;~Yedid_Hoshen3", "aff": "Hebrew University in Jerusalem;Hebrew University of Jerusalem", "aff_domain": "huji.il;huji.ac.il", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nalmagor2022fieldwise,\ntitle={Fieldwise Factorized Networks for Tabular Data Classification},\nauthor={Chen Almagor and Yedid Hoshen},\nyear={2022},\nurl={https://openreview.net/forum?id=7t_6BiC69a}\n}", "github": "", "project": "", "reviewers": "EGGP;jCBE;J2FM;c4rf", "site": "https://openreview.net/forum?id=7t_6BiC69a", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;3;2", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "109;68;81;56", "wc_summary_review": "64;250;38;53", "wc_main_review": "518;266;546;118", "wc_review": "691;584;665;227", "wc_reply_reviewers": "386;158;92;49", "wc_reply_authors": "1115;738;758;519", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.5, 19.704060495238032 ], "wc_summary_review_avg": [ 101.25, 86.37527134544933 ], "wc_main_review_avg": [ 362.0, 178.14600753314681 ], "wc_review_avg": [ 541.75, 185.95614402326157 ], "wc_reply_reviewers_avg": [ 171.25, 129.92185151082168 ], "wc_reply_authors_avg": [ 782.5, 213.64046901277857 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.28867513459481287, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pFPsuPZ5W58J:scholar.google.com/&scioq=Fieldwise+Factorized+Networks+for+Tabular+Data+Classification&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Hebrew University;Hebrew University of Jerusalem", "aff_unique_dep": ";", "aff_unique_url": "https://www.huji.ac.il;https://www.huji.ac.il", "aff_unique_abbr": "HUJI;HUJI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Learning Causal Models from Conditional Moment Restrictions by Importance Weighting", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6841", "id": "7twQI5VnC8", "poster": "", "openreview": "https://openreview.net/forum?id=7twQI5VnC8", "slides": "https://iclr.cc/virtual/2022/poster/6841", "video": "https://iclr.cc/virtual/2022/poster/6841", "author_site": "Masahiro Kato, Masaaki Imaizumi, Kenichiro McAlinn, Shota Yasui, Haruo Kakehi", "tldr": "", "abstract": "We consider learning causal relationships under conditional moment restrictions. 
Unlike causal inference under unconditional moment restrictions, causal inference under conditional moment restrictions poses serious challenges. To address this issue, we propose a method that transforms conditional moment restrictions to unconditional moment restrictions through importance weighting using a conditional density ratio estimator. Then, using this transformation, we propose a method that successfully estimates parametric or nonparametric functions defined under the conditional moment restrictions. We analyze the estimation error and derive a bound on the structural function, providing theoretical support for our proposed method. In experiments, we confirm the soundness of our proposed method.", "keywords": "Causal inference;Conditional moment restrictions", "primary_area": "", "supplementary_material": "", "author": "Masahiro Kato;Masaaki Imaizumi;Kenichiro McAlinn;Shota Yasui;Haruo Kakehi", "authorids": "~Masahiro_Kato1;~Masaaki_Imaizumi1;~Kenichiro_McAlinn2;~Shota_Yasui1;~Haruo_Kakehi2", "gender": "M;M;M;M;M", "homepage": "https://masakat0.github.io/;https://sites.google.com/view/mimaizumi/home;https://yasui-salmon.github.io/;;http://mcalinn.com/", "dblp": ";;227/2734;;", "google_scholar": "https://scholar.google.co.jp/schhp?hl=ja;https://scholar.google.co.jp/citations?user=6c0Ljd4AAAAJ;https://scholar.google.co.jp/citations?user=47E8oVcAAAAJ;;", "orcid": ";;;;", "linkedin": ";masaaki-imaizumi-38600b157/;;%E6%82%A0%E5%A4%AB-%E7%AD%A7-b45008174/;", "or_profile": "~Masahiro_Kato1;~Masaaki_Imaizumi1;~Shota_Yasui1;~Haruo_Kakehi2;~Kenichiro_McAlinn1", "aff": "Cyberagent;The University of Tokyo;;;Temple University", "aff_domain": "cyberagent.co.jp;u-tokyo.ac.jp;;;temple.edu", "position": "Researcher;Associate Professor;;;Assistant Professor", "bibtex": "@inproceedings{\nkato2022learning,\ntitle={Learning Causal Models from Conditional Moment Restrictions by Importance Weighting},\nauthor={Masahiro Kato and Masaaki Imaizumi and Kenichiro McAlinn and Shota Yasui and Haruo Kakehi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7twQI5VnC8}\n}", "github": "", "project": "", "reviewers": "S3Qv;FAMR;ThQb", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;3;2", "correctness": "3;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "51;110;89", "wc_summary_review": "31;136;23", "wc_main_review": "230;826;111", "wc_review": "312;1072;223", "wc_reply_reviewers": "25;418;0", "wc_reply_authors": "331;1651;155", "reply_reviewers": "1;2;0", "reply_authors": "1;3;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 83.33333333333333, 24.417662095749915 ], "wc_summary_review_avg": [ 63.333333333333336, 51.486783633170255 ], "wc_main_review_avg": [ 389.0, 312.80132139533345 ], "wc_review_avg": [ 535.6666666666666, 380.981481031413 ], "wc_reply_reviewers_avg": [ 147.66666666666666, 191.42680643583392 ], "wc_reply_authors_avg": [ 712.3333333333334, 667.6153250354745 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.49999999999999983,
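To make the importance-weighting transformation above concrete, one way it can be written, assuming $r$ denotes the conditional density ratio (our notation; the paper's exact estimator may differ), is:

\[
\mathbb{E}\left[\psi(X;\theta) \mid Z = z\right]
= \int \psi(x;\theta)\,\frac{p(x \mid z)}{p(x)}\,p(x)\,dx
= \mathbb{E}_{X \sim p(x)}\!\left[\psi(X;\theta)\,r(X,z)\right],
\qquad r(x,z) = \frac{p(x \mid z)}{p(x)},
\]

so the conditional restriction $\mathbb{E}[\psi(X;\theta) \mid Z] = 0$ almost surely becomes a family of unconditional, reweighted restrictions that are estimable from samples once $r$ has been estimated.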
"corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12640037247333067558&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=7twQI5VnC8", "email": "cyberagent.co.jp;u-tokyo.ac.jp;;;temple.edu", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "CyberAgent Inc.;University of Tokyo;Temple University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cyberagent.co.jp;https://www.u-tokyo.ac.jp;https://www.temple.edu", "aff_unique_abbr": "CyberAgent;UTokyo;Temple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Japan;United States" }, { "id": "7uSajQt2ki", "title": "Zero-shot Cross-lingual Conversational Semantic Role Labeling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While conversational semantic role labeling (CSRL) has shown its usefulness on Chinese conversational tasks, it is still under-explored in non-Chinese languages due to the lack of multilingual CSRL annotations for the parser training. To avoid expensive data collection and error-propagation of translation-based methods, we present a simple but effective approach to perform zero-shot cross-lingual CSRL. Our model implicitly learns language-agnostic, conversational structure-aware and semantically rich representations with the hierarchical encoders and elaborately designed pre-training objectives. Through comprehensive experiments, we find that, our cross-lingual model not only outperforms baselines by large margins but it is also robust to low-resource scenarios. More impressively, we attempt to use CSRL information to help downstream English conversational tasks, including question-in-context rewriting and multi-turn dialogue response generation. 
Although we have obtained competitive performance on these tasks without CSRL information, substantial improvements are further achieved after introducing CSRL information, which indicates the effectiveness of our cross-lingual CSRL model and the usefulness of CSRL to English dialogue tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Han Wu;Haochen Tan;Kun Xu;Shuqi LIU;Lianwei Wu;Linqi Song", "authorids": "~Han_Wu5;~Haochen_Tan1;~Kun_Xu4;~Shuqi_LIU1;~Lianwei_Wu2;~Linqi_Song1", "gender": "M;M;;F;M;M", "homepage": "https://hahahawu.com/;https://scholars.cityu.edu.hk/en/persons/haochen-tan(6f087d1a-f724-44a4-83b4-9f3064fc52b7)/publications.html;https://sites.google.com/view/kunxu2/home;;;https://sites.google.com/site/aisquaredlab/", "dblp": "13/1864-4;269/9939;;;214/2499;137/7963.html", "google_scholar": "https://scholar.google.com.hk/citations?user=1SHXVAIAAAAJ;;;;ZVWnMrYAAAAJ;UcGN3MoAAAAJ", "orcid": "0000-0002-8008-064X;;;0000-0002-3479-000X;0000-0003-1451-9295;0000-0003-2756-4984", "linkedin": ";;;shuqi-liu-b652961b9/;;", "or_profile": "~Han_Wu5;~Haochen_Tan1;~Kun_Xu4;~Shuqi_LIU1;~Lianwei_Wu2;~Linqi_Song1", "aff": "City University of Hong Kong;City University of Hong Kong;Tencent AI Lab;City University of Hong Kong;Northwestern Polytechnical University;City University of Hong Kong", "aff_domain": "cityu.edu.hk;cityu.edu.hk;tencent.com;cityu.edu.hk;nwpu.edu.cn;cityu.edu.hk", "position": "PhD student;PhD student;Researcher;PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nwu2022zeroshot,\ntitle={Zero-shot Cross-lingual Conversational Semantic Role Labeling},\nauthor={Han Wu and Haochen Tan and Kun Xu and Shuqi LIU and Lianwei Wu and Linqi Song},\nyear={2022},\nurl={https://openreview.net/forum?id=7uSajQt2ki}\n}", "github": "", "project": "", "reviewers": "5fcP;mWgD;LZfP;VbfE", "site": "https://openreview.net/forum?id=7uSajQt2ki", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;5;4;4", "correctness": "3;2;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "82;61;42;144", "wc_summary_review": "16;21;20;35", "wc_main_review": "169;621;636;340", "wc_review": "267;703;698;519", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 82.25, 38.356062102358734 ], "wc_summary_review_avg": [ 23.0, 7.176350047203662 ], "wc_main_review_avg": [ 441.5, 196.60175482431484 ], "wc_review_avg": [ 546.75, 177.70815259857946 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6701067975739320915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "City University of Hong Kong;Tencent;Northwestern Polytechnical University", "aff_unique_dep": ";Tencent AI Lab;", "aff_unique_url": "https://www.cityu.edu.hk;https://ai.tencent.com;https://www.nwpu.edu.cn", "aff_unique_abbr": "CityU;Tencent AI Lab;NWPU", 
"aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "VC dimension of partially quantized neural networks in the overparametrized regime", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6203", "id": "7udZAsEzd60", "poster": "", "openreview": "https://openreview.net/forum?id=7udZAsEzd60", "slides": "https://iclr.cc/virtual/2022/poster/6203", "video": "https://iclr.cc/virtual/2022/poster/6203", "author_site": "Yutong Wang, Clayton Scott", "tldr": "", "abstract": "Vapnik-Chervonenkis (VC) theory has so far been unable to explain the small generalization error of overparametrized neural networks. Indeed, existing applications of VC theory to large networks obtain upper bounds on VC dimension that are proportional to the number of weights, and for a large class of networks, these upper bound are known to be tight. In this work, we focus on a class of partially quantized networks that we refer to as hyperplane arrangement neural networks (HANNs). Using a sample compression analysis, we show that HANNs can have VC dimension significantly smaller than the number of weights, while being highly expressive. In particular, empirical risk minimization over HANNs in the overparametrized regime achieves the minimax rate for classification with Lipschitz posterior class probability. We further demonstrate the expressivity of HANNs empirically. On a panel of 121 UCI datasets, overparametrized HANNs are able to match the performance of state-of-the-art full-precision models.", "keywords": "VC dimension;quantized neural networks;classification;minimax theory;overparametrization", "primary_area": "", "supplementary_material": "/attachment/0a8e41070fc65e398f4f8fb821807fb71ff3378e.zip", "author": "Yutong Wang;Clayton Scott", "authorids": "~Yutong_Wang1;~Clayton_Scott1", "gender": "M;M", "homepage": "https://yutongwang.me/;http://web.eecs.umich.edu/~cscott/", "dblp": "90/3631;96/8859.html", "google_scholar": "GH7ryE4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-7472-6750;", "linkedin": ";", "or_profile": "~Yutong_Wang1;~Clayton_D._Scott1", "aff": "University of Michigan - Ann Arbor;University of Michigan", "aff_domain": "umich.edu;umich.edu", "position": "PhD student;Professor", "bibtex": "@inproceedings{\nwang2022vc,\ntitle={{VC} dimension of partially quantized neural networks in the overparametrized regime},\nauthor={Yutong Wang and Clayton Scott},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=7udZAsEzd60}\n}", "github": "", "project": "", "reviewers": "QZye;UWBZ;W4WM", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "2;3;2", "correctness": "4;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "128;84;51", "wc_summary_review": "33;28;23", "wc_main_review": "223;284;196", "wc_review": "384;396;270", "wc_reply_reviewers": "65;58;15", "wc_reply_authors": "374;765;383", "reply_reviewers": "1;1;1", "reply_authors": "1;1;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 87.66666666666667, 31.541859312489635 ], 
"wc_summary_review_avg": [ 28.0, 4.08248290463863 ], "wc_main_review_avg": [ 234.33333333333334, 36.80881536926839 ], "wc_review_avg": [ 350.0, 56.78027826631356 ], "wc_reply_reviewers_avg": [ 46.0, 22.105806175452337 ], "wc_reply_authors_avg": [ 507.3333333333333, 182.23489115851433 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": -0.9999999999999998, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11387455269961935968&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=7udZAsEzd60", "email": "umich.edu;umich.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "7vXQJ2QW8hR", "title": "Max-Affine Spline Insights Into Deep Network Pruning", "track": "main", "status": "Reject", "tldr": "", "abstract": "State-of-the-art (SOTA) approaches to deep network (DN) training overparametrize the model and then prune a posteriori to obtain a \"winning ticket'' subnetwork that can be trained from scratch to achieve high accuracy. To date, the literature has remained largely empirical and hence provides little insights into how pruning affects a DN's decision boundary and no guidance regarding how to design a principled pruning technique. Using a recently developed spline interpretation of DNs, we develop new theory and visualization tools that provide new insights into how pruning DN nodes affects the decision boundary. We discover that a DN's spline mappings exhibit an early-bird (EB) phenomenon whereby the spline's partition converges at early training stages, bridging the recently developed max-affine spline theory and lottery ticket hypothesis of DNs. We leverage this new insight to develop a principled and efficient pruning strategy that focuses on a tiny fraction of DN nodes whose corresponding spline partition regions actually contribute to the final decision boundary. Extensive experiments on four networks and three datasets validate that our new spline-based DN pruning approach reduces training FLOPs by up to 3.5x while achieving similar or even better accuracy than state-of-the-art methods. 
All the codes will be released publicly upon acceptance.", "keywords": "DNN Interpretability;Network pruning;Max-affine spline theory;Visualization", "primary_area": "", "supplementary_material": "", "author": "Randall Balestriero;Haoran You;Zhihan Lu;Yutong Kou;Huihong Shi;Yingyan Lin;Richard Baraniuk", "authorids": "~Randall_Balestriero1;~Haoran_You1;~Zhihan_Lu1;~Yutong_Kou1;~Huihong_Shi1;~Yingyan_Lin1;~Richard_Baraniuk1", "gender": "M;M;M;M;F;F;", "homepage": "https://randallbalestriero.github.io/;http://haoranyou.com/;;https://kou-99.github.io/;https://shihuihong214.github.io/huihong.shi/;https://eiclab.scs.gatech.edu/;http://richb.rice.edu/", "dblp": "175/5364;230/4247;;247/4139;253/3178;120/6981;32/2804", "google_scholar": "S1x_xqcAAAAJ;z5Eku1sAAAAJ;;;https://scholar.google.com/citations?hl=en;dio8IesAAAAJ;https://scholar.google.com.tw/citations?user=N-BBA20AAAAJ", "orcid": ";0000-0002-2873-2153;;;0000-0002-7845-0154;;", "linkedin": "randallbalestriero/;haoran-you-b4b958165/;zhihan-lu/;;;yingyan-celine-lin-a281211a/;richard-baraniuk", "or_profile": "~Randall_Balestriero1;~Haoran_You1;~Zhihan_Lu1;~Yutong_Kou1;~Huihong_Shi1;~Yingyan_Lin1;~Richard_Baraniuk1", "aff": "Meta Facebook;Rice University;Rice University;Institute of Automation, Chinese Academy of Sciences;Nanjing University;Rice University;William Marsh Rice University", "aff_domain": "facebook.com;rice.edu;rice.edu;ia.ac.cn;nju.edu.cn;rice.edu;rice.edu", "position": "Postdoc;PhD student;Undergrad student;PhD student;PhD student;Assistant Professor;C. Sidney Burrus Professor", "bibtex": "@misc{\nbalestriero2022maxaffine,\ntitle={Max-Affine Spline Insights Into Deep Network Pruning},\nauthor={Randall Balestriero and Haoran You and Zhihan Lu and Yutong Kou and Huihong Shi and Yingyan Lin and Richard Baraniuk},\nyear={2022},\nurl={https://openreview.net/forum?id=7vXQJ2QW8hR}\n}", "github": "", "project": "", "reviewers": "haRu;pmay;2J1h;yjUc", "site": "https://openreview.net/forum?id=7vXQJ2QW8hR", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;5;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "93;70;178;89", "wc_summary_review": "34;52;191;18", "wc_main_review": "201;174;693;270", "wc_review": "328;296;1062;377", "wc_reply_reviewers": "42;0;42;29", "wc_reply_authors": "495;545;2652;266", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;4;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.5, 41.620307543313515 ], "wc_summary_review_avg": [ 73.75, 68.75454530429244 ], "wc_main_review_avg": [ 334.5, 209.9196274768036 ], "wc_review_avg": [ 515.75, 316.694154508731 ], "wc_reply_reviewers_avg": [ 28.25, 17.151894939043906 ], "wc_reply_authors_avg": [ 989.5, 965.59165800042 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12609309598852783522&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;3;1;1", "aff_unique_norm": "Meta;Rice University;Chinese Academy of Sciences;Nanjing University", "aff_unique_dep": 
"Meta Platforms, Inc.;;Institute of Automation;", "aff_unique_url": "https://meta.com;https://www.rice.edu;http://www.ia.cas.cn;https://www.nju.edu.cn", "aff_unique_abbr": "Meta;Rice;CAS;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0;0", "aff_country_unique": "United States;China" }, { "id": "7vcKot39bsv", "title": "Adaptive Inertia: Disentangling the Effects of Adaptive Learning Rate and Momentum", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adaptive Momentum Estimation (Adam), which combines Adaptive Learning Rate and Momentum, would be the most popular stochastic optimizer for accelerating the training of deep neural networks. However, empirically Adam often generalizes worse than Stochastic Gradient Descent (SGD). We unveil the mystery of this behavior in the diffusion theoretical framework. Specifically, we disentangle the effects of Adaptive Learning Rate and Momentum of the Adam dynamics on saddle-point escaping and flat minima selection. We prove that Adaptive Learning Rate can escape saddle points efficiently, but cannot select flat minima as SGD does. In contrast, Momentum provides a drift effect to help the training process pass through saddle points, and almost does not affect flat minima selection. This partly explains why SGD (with Momentum) generalizes better, while Adam generalizes worse but converges faster. Furthermore, motivated by the analysis, we design a novel adaptive optimization framework named Adaptive Inertia, which uses parameter-wise adaptive inertia to accelerate the training and provably favors flat minima as well as SGD. Our extensive experiments demonstrate that the proposed adaptive inertia method can generalize significantly better than SGD and conventional adaptive gradient methods.", "keywords": "Learning Dynamics;Diffusion;Stochastic Optimization;Momentum", "primary_area": "", "supplementary_material": "/attachment/a72130f20fd98f63d4a5fbe9b2a7106c1aaec0cc.zip", "author": "Zeke Xie;Xinrui Wang;Huishuai Zhang;Issei Sato;Masashi Sugiyama", "authorids": "~Zeke_Xie1;~Xinrui_Wang1;~Huishuai_Zhang3;~Issei_Sato1;~Masashi_Sugiyama1", "gender": "M;Not Specified;M;M;M", "homepage": "https://sites.google.com/view/zeke-xie;https://github.com/SystemErrorWang;;http://www.ms.k.u-tokyo.ac.jp/sugi/;https://huishuai-git.github.io", "dblp": "210/1039;;13/2665;35/1228;144/7537", "google_scholar": "https://scholar.google.co.jp/citations?user=ysXmZCMAAAAJ;VtS-BpQAAAAJ;i4t2aUEAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;w1srHyIAAAAJ", "orcid": ";;;0000-0001-6658-6743;", "linkedin": ";;;;", "or_profile": "~Zeke_Xie1;~Xinrui_Wang1;~Issei_Sato1;~Masashi_Sugiyama1;~Huishuai_Zhang2", "aff": "Baidu;ByteDance Inc.;the University of Tokyo;The University of Tokyo;Microsoft Research Asia", "aff_domain": "baidu.com;bytedance.com;u-tokyo.ac.jp;u-tokyo.ac.jp;microsoft.com", "position": "Researcher;Machine Learning Engineer;Associate Professor;Full Professor;Researcher", "bibtex": "@misc{\nxie2022adaptive,\ntitle={Adaptive Inertia: Disentangling the Effects of Adaptive Learning Rate and Momentum},\nauthor={Zeke Xie and Xinrui Wang and Huishuai Zhang and Issei Sato and Masashi Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=7vcKot39bsv}\n}", "github": "", "project": "", "reviewers": "EqDN;1sdV;A3Wq;MYxM", "site": "https://openreview.net/forum?id=7vcKot39bsv", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;3;4", "correctness": "2;2;3;4", "technical_novelty": 
"2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "63;97;110;85", "wc_summary_review": "17;58;43;36", "wc_main_review": "589;397;212;187", "wc_review": "669;552;365;308", "wc_reply_reviewers": "0;511;0;0", "wc_reply_authors": "1607;1770;280;224", "reply_reviewers": "0;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.75, 17.297037318569906 ], "wc_summary_review_avg": [ 38.5, 14.739402972983676 ], "wc_main_review_avg": [ 346.25, 161.9311196157181 ], "wc_review_avg": [ 473.5, 144.52075975443805 ], "wc_reply_reviewers_avg": [ 127.75, 221.26949066692407 ], "wc_reply_authors_avg": [ 970.25, 720.8302071223153 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.16012815380508713, "corr_recommendation_correctness": 0.9198662110077999, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13927749267930732278&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "Baidu;ByteDance;University of Tokyo;Microsoft", "aff_unique_dep": "Baidu, Inc.;;;Research", "aff_unique_url": "https://www.baidu.com;https://www.bytedance.com;https://www.u-tokyo.ac.jp;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Baidu;ByteDance;UTokyo;MSR Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "China;Japan" }, { "id": "7x_47XJULn", "title": "Federated Learning with Heterogeneous Architectures using Graph HyperNetworks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Standard Federated Learning (FL) techniques are limited to clients with identical network architectures. As a result, inter-organizational collaboration is severely restricted when both data privacy and architectural proprietary are required. In this work, we propose a new FL framework that removes this limitation by adopting a graph hypernetwork as a shared knowledge aggregator. A property of the graph hyper network is that it can adapt to various computational graphs, thereby allowing meaningful parameter sharing across models. Unlike existing solutions, our framework makes no use of external data and does not require clients to disclose their model architecture. Compared with distillation-based and non-graph hypernetwork baselines, our method performs notably better on standard benchmarks. 
We additionally show encouraging generalization performance to unseen architectures.", "keywords": "federated learning;graph neural networks;hypernetworks", "primary_area": "", "supplementary_material": "/attachment/57289ef119d5fcc539bfb1c841df2bdbaee4a03a.zip", "author": "Or Litany;Haggai Maron;David Acuna;Jan Kautz;Gal Chechik;Sanja Fidler", "authorids": "~Or_Litany1;~Haggai_Maron1;~David_Acuna1;~Jan_Kautz1;~Gal_Chechik1;~Sanja_Fidler1", "gender": "M;M;M;;;F", "homepage": "http://orlitany.github.io;https://haggaim.github.io/;http://www.cs.toronto.edu/~davidj/;http://jankautz.com;https://chechiklab.biu.ac.il/~gal/;http://www.cs.toronto.edu/~fidler/", "dblp": "119/1476;181/6629;217/2906;48/6214;c/GalChechik;08/6607", "google_scholar": "https://scholar.google.co.il/citations?user=Ihs8dwsAAAAJ;https://scholar.google.co.il/citations?user=4v8uJrIAAAAJ;https://scholar.google.ca/citations?user=9aFd9dEAAAAJ;P9FclNEAAAAJ;Wk2gAZUAAAAJ;CUlqK5EAAAAJ", "orcid": ";;;;0000-0001-9164-5303;", "linkedin": ";;;;;sanja-fidler-2846a1a?trk=hp-identity-name", "or_profile": "~Or_Litany1;~Haggai_Maron1;~David_Acuna1;~Jan_Kautz1;~Gal_Chechik1;~Sanja_Fidler1", "aff": "NVIDIA;NVIDIA;Department of Computer Science, University of Toronto;NVIDIA;NVIDIA;Department of Computer Science, University of Toronto", "aff_domain": "nvidia.com;nvidia.com;cs.toronto.edu;nvidia.com;nvidia.com;cs.toronto.edu", "position": "Research Scientist;Research Scientist;PhD student;VP Research;Principal Researcher;Associate Professor", "bibtex": "@misc{\nlitany2022federated,\ntitle={Federated Learning with Heterogeneous Architectures using Graph HyperNetworks},\nauthor={Or Litany and Haggai Maron and David Acuna and Jan Kautz and Gal Chechik and Sanja Fidler},\nyear={2022},\nurl={https://openreview.net/forum?id=7x_47XJULn}\n}", "github": "", "project": "", "reviewers": "7WPJ;Tk9o;jPp2;U48w", "site": "https://openreview.net/forum?id=7x_47XJULn", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;3;3", "correctness": "3;2;3;2", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "28;75;42;91", "wc_summary_review": "559;35;54;54", "wc_main_review": "140;237;141;495", "wc_review": "727;347;237;640", "wc_reply_reviewers": "0;86;72;151", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;1;1;1", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 59.0, 25.149552679918582 ], "wc_summary_review_avg": [ 175.5, 221.54965583363023 ], "wc_main_review_avg": [ 253.25, 145.02823001057413 ], "wc_review_avg": [ 487.75, 201.93238348516564 ], "wc_reply_reviewers_avg": [ 77.25, 53.64408168661292 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15826700912321042339&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;0;1", "aff_unique_norm": "NVIDIA;University of Toronto", "aff_unique_dep": "NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;https://www.utoronto.ca", "aff_unique_abbr": "NVIDIA;U of T", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Toronto", 
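A hedged sketch of the shared-aggregator idea in the abstract above: a hypernetwork runs message passing over a client's computation graph and decodes per-node weight tensors, so clients with different architectures can share one set of hypernetwork parameters. All names and shapes here are illustrative assumptions, not the paper's design.

import torch
import torch.nn as nn

class GraphHyperNet(nn.Module):
    def __init__(self, node_feat_dim, hidden, max_params_per_node):
        super().__init__()
        self.embed = nn.Linear(node_feat_dim, hidden)
        self.msg = nn.Linear(hidden, hidden)
        self.decode = nn.Linear(hidden, max_params_per_node)

    def forward(self, node_feats, adj):
        # node_feats: (n_nodes, node_feat_dim); adj: (n_nodes, n_nodes)
        h = torch.relu(self.embed(node_feats))
        for _ in range(2):  # two rounds of adjacency-weighted message passing
            h = torch.relu(self.msg(adj @ h) + h)
        # One flat weight vector per graph node (i.e., per layer/op);
        # the client reshapes each row into its layer's weight tensor.
        return self.decode(h)

ghn = GraphHyperNet(node_feat_dim=8, hidden=32, max_params_per_node=1024)
adj = torch.eye(5)                          # toy adjacency (self-loops only)
flat_weights = ghn(torch.randn(5, 8), adj)  # (5, 1024)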
"aff_country_unique_index": "0;0;1;0;0;1", "aff_country_unique": "United States;Canada" }, { "id": "7xzVpAP5Cm", "title": "Non-reversible Parallel Tempering for Uncertainty Approximation in Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Parallel tempering (PT), also known as replica exchange, is the go-to workhorse for simulations of multi-modal distributions. The key to the success of PT is to adopt efficient swap schemes. The popular deterministic even-odd (DEO) scheme exploits the non-reversibility property and has successfully reduced the communication cost from $O(P^2)$ to $O(P)$ given sufficient many $P$ chains. However, such an innovation largely disappears given limited chains in big data problems due to the extremely few bias-corrected swaps. To handle this issue, we generalize the DEO scheme to promote the non-reversibility and obtain an optimal communication cost $O(P\\log P)$. In addition, we also analyze the bias when we adopt stochastic gradient descent (SGD) with large and constant learning rates as exploration kernels. Such a user-friendly nature enables us to conduct large-scale uncertainty approximation tasks without much tuning costs.", "keywords": "replica exchange;parallel tempering;non-reversibility;stochastic approximation;round trip rate;deep learning", "primary_area": "", "supplementary_material": "/attachment/c2b695490cd071f5fe49c54f3e13b396dd5a96b9.zip", "author": "Wei Deng;Qian Zhang;Qi Feng;Faming Liang;Guang Lin", "authorids": "~Wei_Deng1;~Qian_Zhang10;~Qi_Feng3;~Faming_Liang1;~Guang_Lin1", "gender": "M;M;M;M;M", "homepage": "https://waynedw.github.io/;;https://sites.google.com/site/qifengmath/;https://www.stat.purdue.edu/~fmliang/;http://www.math.purdue.edu/~lin491/", "dblp": "69/508-2;04/2024-67.html;;29/1122;", "google_scholar": "IYiyxssAAAAJ;https://scholar.google.com/citations?hl=en;bNZM-X4AAAAJ;TboqoPIAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;0000-0002-0976-1987", "linkedin": ";;;;", "or_profile": "~Wei_Deng1;~Qian_Zhang10;~Qi_Feng3;~Faming_Liang1;~Guang_Lin1", "aff": "Morgan Stanley;Purdue University;University of Michigan - Ann Arbor;Purdue University;Purdue University", "aff_domain": "morganstanley.com;purdue.edu;umich.edu;purdue.edu;purdue.edu", "position": "Researcher;PhD student;Postdoc;Full Professor;Associate Professor", "bibtex": "@misc{\ndeng2022nonreversible,\ntitle={Non-reversible Parallel Tempering for Uncertainty Approximation in Deep Learning},\nauthor={Wei Deng and Qian Zhang and Qi Feng and Faming Liang and Guang Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=7xzVpAP5Cm}\n}", "github": "", "project": "", "reviewers": "mbau;kvcG;ofJx;pLfL", "site": "https://openreview.net/forum?id=7xzVpAP5Cm", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;3;2", "correctness": "2;3;3;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "4;3;3;0", "wc_summary_paper": "117;44;73;103", "wc_summary_review": "26;38;92;18", "wc_main_review": "333;229;292;227", "wc_review": "476;311;457;348", "wc_reply_reviewers": "1268;0;0;0", "wc_reply_authors": "1841;569;345;500", "reply_reviewers": "8;0;0;0", "reply_authors": "8;1;1;2", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 84.25, 28.154706533721853 ], "wc_summary_review_avg": [ 43.5, 28.892040426387332 ], 
"wc_main_review_avg": [ 270.25, 44.67311831515682 ], "wc_review_avg": [ 398.0, 70.06068797835202 ], "wc_reply_reviewers_avg": [ 317.0, 549.0601059993342 ], "wc_reply_authors_avg": [ 813.75, 598.6047840604016 ], "reply_reviewers_avg": [ 2.0, 3.4641016151377544 ], "reply_authors_avg": [ 3.0, 2.9154759474226504 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9901475429766743, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AkgDRu_9IhIJ:scholar.google.com/&scioq=Non-reversible+Parallel+Tempering+for+Uncertainty+Approximation+in+Deep+Learning&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Morgan Stanley;Purdue University;University of Michigan", "aff_unique_dep": ";;", "aff_unique_url": "https://www.morganstanley.com;https://www.purdue.edu;https://www.umich.edu", "aff_unique_abbr": "Morgan Stanley;Purdue;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "7y0AmECNwE", "title": "Parameter Estimation for the SEIR Model Using Recurrent Nets", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The standard way to estimate the parameters $\\Theta_\\text{SEIR}$ (e.g., the transmission rate $\\beta$) of an SEIR model is to use grid search, where simulations are performed on each set of parameters, and the parameter set leading to the least $L_2$ distance between predicted number of infections and observed infections is selected. This brute-force strategy is not only time consuming, as simulations are slow when the population is large, but also inaccurate, since it is impossible to enumerate all parameter combinations. To address these issues, in this paper, we propose to transform the non-differentiable problem of finding optimal $\\Theta_\\text{SEIR}$ to a differentiable one, where we first train a recurrent net to fit a small number of simulation data. Next, based on this recurrent net that is able to generalize SEIR simulations, we are able to transform the objective to a differentiable one with respect to $\\Theta_\\text{SEIR}$, and straightforwardly obtain its optimal value. The proposed strategy is both time efficient as it only relies on a small number of SEIR simulations, and accurate as we are able to find the optimal $\\Theta_\\text{SEIR}$ based on the differentiable objective. On two COVID-19 datasets, we observe that the proposed strategy leads to significantly better parameter estimations with a smaller number of simulations. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chun Fan;Yuxian Meng;Xiaofei Sun;Fei Wu;Tianwei Zhang;Jiwei Li", "authorids": "~Chun_Fan1;~Yuxian_Meng1;~Xiaofei_Sun1;~Fei_Wu1;~Tianwei_Zhang1;~Jiwei_Li1", "gender": "M;M;M;M;M;M", "homepage": ";https://yuxianmeng.github.io/;;https://person.zju.edu.cn/wufei;https://personal.ntu.edu.sg/tianwei.zhang/index.html;https://nlp.stanford.edu/~bdlijiwei/", "dblp": ";234/8585;;84/3254-1;77/7902-4;73/5746-1", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;hIokU_IAAAAJ;XJLn4MYAAAAJ;9vpiYDIAAAAJ;PwU16JEAAAAJ", "orcid": ";;;;;", "linkedin": "chunfan/;;;;;", "or_profile": "~Chun_Fan1;~Yuxian_Meng1;~Xiaofei_Sun1;~Fei_Wu1;~Tianwei_Zhang1;~Jiwei_Li1", "aff": "Peking University;Shannon.AI;;Zhejiang University;Nanyang Technological University;Zhejiang University", "aff_domain": "pku.edu.cn;shannon.ai;;zju.edu.cn;ntu.edu.sg;zju.edu.cn", "position": "Researcher;Researcher;;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nfan2022parameter,\ntitle={Parameter Estimation for the {SEIR} Model Using Recurrent Nets},\nauthor={Chun Fan and Yuxian Meng and Xiaofei Sun and Fei Wu and Tianwei Zhang and Jiwei Li},\nyear={2022},\nurl={https://openreview.net/forum?id=7y0AmECNwE}\n}", "github": "", "project": "", "reviewers": "oCDA;kBoX;TVP6;U3XU", "site": "https://openreview.net/forum?id=7y0AmECNwE", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "3;4;4;5", "correctness": "3;4;3;4", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "120;66;67;120", "wc_summary_review": "29;21;18;20", "wc_main_review": "990;187;305;73", "wc_review": "1139;274;390;213", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 93.25, 26.75233634656981 ], "wc_summary_review_avg": [ 22.0, 4.183300132670378 ], "wc_main_review_avg": [ 388.75, 356.6920064986038 ], "wc_review_avg": [ 504.0, 372.08937098498257 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.7071067811865476, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16811922191612862512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Peking University;Shannon.AI;Zhejiang University;Nanyang Technological University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.shannon.ai;https://www.zju.edu.cn;https://www.ntu.edu.sg", "aff_unique_abbr": "Peking U;Shannon.AI;ZJU;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0", "aff_country_unique": "China;United States;Singapore" }, { "id": "7yuU9VeIpde", "title": "Memory-Constrained Policy Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a new constrained optimization method for policy gradient reinforcement learning, which uses two trust regions to regulate each policy update. 
In addition to using the proximity to a single old policy as the first trust region, as done in prior work, we propose to form a second trust region through the construction of another virtual policy that represents a wide range of past policies. We then constrain the new policy to stay close to the virtual policy, which is beneficial when the old policy performs badly. More importantly, we propose a mechanism to automatically build the virtual policy from a memory buffer of past policies, providing a new capability for dynamically selecting appropriate trust regions during the optimization process. Our proposed method, dubbed Memory-Constrained Policy Optimization (MCPO), is examined on a diverse suite of environments, including robotic locomotion control, navigation with sparse rewards, and Atari games, consistently demonstrating competitive performance against recent on-policy constrained policy gradient methods.", "keywords": "reinforcement learning;trust region policy optimization;on-policy;policy gradient;neural network", "primary_area": "", "supplementary_material": "", "author": "Hung Le;Thommen Karimpanal George;Majid Abdolshah;Dung Nguyen;Kien Do;Sunil Gupta;Svetha Venkatesh", "authorids": "~Hung_Le1;~Thommen_Karimpanal_George1;~Majid_Abdolshah1;~Dung_Nguyen1;~Kien_Do1;~Sunil_Gupta2;~Svetha_Venkatesh1", "gender": "M;M;;F;M;M;M", "homepage": "https://www.thommengk.com/;http://majid.website;https://www.deakin.edu.au/about-deakin/people/dung-nguyen;https://www.deakin.edu.au/about-deakin/people/svetha-venkatesh;https://thaihungle.github.io/;https://clarken92.github.io/;https://personal-sites.deakin.edu.au/~sunilg/", "dblp": "133/3358;190/6649;;81/1984;45/466-2;185/0836;47/333-1", "google_scholar": "v3-hy24AAAAJ;https://scholar.google.com.au/citations?user=RKC-MCUAAAAJ;https://scholar.google.com.au/citations?user=O5OU_kUAAAAJ;AEkRUQcAAAAJ;https://scholar.google.com.au/citations?user=q2HbxngAAAAJ;aD6y8joAAAAJ;https://scholar.google.com.au/citations?user=bXeL2t8AAAAJ", "orcid": "0000-0001-8918-3314;;0000-0002-7726-7841;;0000-0002-3126-184X;0000-0002-0119-122X;0000-0002-3308-1930", "linkedin": "thommen-george-karimpanal-762451149/;;;;;kien-duc-do-b45846a4/;", "or_profile": "~Thommen_Karimpanal_George1;~Majid_Abdolshah1;~Dung_Nguyen1;~Svetha_Venkatesh1;~Hung_Thai_Le1;~Kien_Duc_Do1;~Sunil_Kumar_Gupta1", "aff": "Deakin University;Amazon;Deakin University;Deakin University;Deakin University;Deakin University;Deakin University", "aff_domain": "deakin.edu.au;amazon.com;deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au", "position": "Postdoc;Machine Learning Scientist;PhD student;Full Professor;Lecturer;Research Fellow;Associate Professor", "bibtex": "@misc{\nle2022memoryconstrained,\ntitle={Memory-Constrained Policy Optimization},\nauthor={Hung Le and Thommen Karimpanal George and Majid Abdolshah and Dung Nguyen and Kien Do and Sunil Gupta and Svetha Venkatesh},\nyear={2022},\nurl={https://openreview.net/forum?id=7yuU9VeIpde}\n}", "github": "", "project": "", "reviewers": "HBno;RkGG;9u6G;JAMC", "site": "https://openreview.net/forum?id=7yuU9VeIpde", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "3;4;5;4", "correctness": "2;4;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "66;41;116;83", "wc_summary_review": "11;18;24;31", "wc_main_review": "155;173;517;212", "wc_review": "232;232;657;326", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "245;354;740;260", "reply_reviewers": "0;0;0;0", "reply_authors":
"1;1;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.5, 27.26261176043117 ], "wc_summary_review_avg": [ 21.0, 7.3824115301167 ], "wc_main_review_avg": [ 264.25, 147.37261448450997 ], "wc_review_avg": [ 361.75, 174.72889715213108 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 399.75, 200.83622058782126 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.39605901719066966, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2k0djjRnJmwJ:scholar.google.com/&scioq=Memory-Constrained+Policy+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;1;0;0;0;0;0", "aff_unique_norm": "Deakin University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.deakin.edu.au;https://www.amazon.com", "aff_unique_abbr": "Deakin;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "Australia;United States" }, { "id": "7zFokR7k_86", "title": "Learning Symbolic Rules for Reasoning in Quasi-Natural Language", "track": "main", "status": "Reject", "tldr": "", "abstract": "Symbolic reasoning, rule-based symbol manipulation, is a hallmark of human intelligence. However, rule-based systems have had limited success competing with learning-based systems outside formalized domains such as automated theorem proving. We hypothesize that this is due to the manual construction of rules in past attempts. In this work, we ask how we can build a rule-based system that can reason with natural language input but without the manual construction of rules. We propose MetaQNL, a \"Quasi-Natural\" language that can express both formal logic and natural language sentences, and MetaInduce, a learning algorithm that induces MetaQNL rules from training data consisting of questions and answers, with or without intermediate reasoning steps. Our approach achieves state-of-the-art accuracy on multiple reasoning benchmarks; it learns compact models with much less data and produces not only answers but also checkable proofs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaiyu Yang;Jia Deng", "authorids": "~Kaiyu_Yang1;~Jia_Deng1", "gender": "M;M", "homepage": "https://yangky11.github.io;", "dblp": "177/9276;07/6526-1.html", "google_scholar": "FciCu4EAAAAJ;U3Eub-EAAAAJ", "orcid": "0000-0002-2777-612X;", "linkedin": "kaiyuy;", "or_profile": "~Kaiyu_Yang1;~Jia_Deng1", "aff": "Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu", "position": "Ph.D. 
student;Assistant Professor", "bibtex": "@misc{\nyang2022learning,\ntitle={Learning Symbolic Rules for Reasoning in Quasi-Natural Language},\nauthor={Kaiyu Yang and Jia Deng},\nyear={2022},\nurl={https://openreview.net/forum?id=7zFokR7k_86}\n}", "github": "", "project": "", "reviewers": "v7TB;CPsz;9uxa;7F3Z", "site": "https://openreview.net/forum?id=7zFokR7k_86", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "1;3;5;3", "correctness": "2;4;3;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "168;82;205;340", "wc_summary_review": "38;20;131;7", "wc_main_review": "357;166;344;177", "wc_review": "563;268;680;524", "wc_reply_reviewers": "62;47;70;28", "wc_reply_authors": "353;766;648;43", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 198.75, 92.96067717051119 ], "wc_summary_review_avg": [ 49.0, 48.605555238058955 ], "wc_main_review_avg": [ 261.0, 89.7022853666505 ], "wc_review_avg": [ 508.75, 150.38513058145077 ], "wc_reply_reviewers_avg": [ 51.75, 16.005858302509115 ], "wc_reply_authors_avg": [ 452.5, 280.2200028548997 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5883484054145521, "corr_recommendation_correctness": 0.7526178090063818, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7184720471075402704&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "7zc05Ua_HOK", "title": "Learning Sample Reweighting for Adversarial Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "There has been great interest in enhancing the robustness of neural network classifiers to defend against adversarial perturbations through adversarial training, while balancing the trade-off between robust accuracy and standard accuracy. We propose a novel adversarial training framework that learns to reweight the loss associated with individual training samples based on a notion of class-conditioned margin, with the goal of improving robust generalization. Inspired by MAML-based approaches, we formulate weighted adversarial training as a bilevel optimization problem where the upper-level task corresponds to learning a robust classifier, and the lower-level task corresponds to learning a parametric function that maps from a sample's \\textit{multi-class margin} to an importance weight. Extensive experiments demonstrate that our approach improves both clean and robust accuracy compared to related techniques and state-of-the-art baselines. 
", "keywords": "deep learning;adversarial attack;robust training", "primary_area": "", "supplementary_material": "", "author": "Chester Holtz;Tsui-Wei Weng;Gal Mishne", "authorids": "~Chester_Holtz1;~Tsui-Wei_Weng1;~Gal_Mishne1", "gender": "M;F;F", "homepage": "https://cseweb.ucsd.edu/~chholtz/;https://lilywenglab.github.io;http://mishne.ucsd.edu/", "dblp": "161/9916;177/9197;125/3214", "google_scholar": "YqC8p9sAAAAJ;v8GM4xoAAAAJ;KrwpdXYAAAAJ", "orcid": "0000-0002-8548-4539;;0000-0002-5287-3626", "linkedin": "choltz95/;;", "or_profile": "~Chester_Holtz1;~Tsui-Wei_Weng1;~Gal_Mishne1", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nholtz2022learning,\ntitle={Learning Sample Reweighting for Adversarial Robustness},\nauthor={Chester Holtz and Tsui-Wei Weng and Gal Mishne},\nyear={2022},\nurl={https://openreview.net/forum?id=7zc05Ua_HOK}\n}", "github": "", "project": "", "reviewers": "oJtx;fjzH;3Xwo;E9Gz;K5MU;tCDo", "site": "https://openreview.net/forum?id=7zc05Ua_HOK", "pdf_size": 0, "recommendation": "3;3;6;6;8;8", "confidence": "4;4;4;4;4;4", "correctness": "3;3;3;3;4;4", "technical_novelty": "2;2;3;2;2;3", "empirical_novelty": "2;2;3;3;3;3", "wc_summary_paper": "52;48;47;91;54;64", "wc_summary_review": "39;59;60;22;46;51", "wc_main_review": "242;422;258;137;453;235", "wc_review": "333;529;365;250;553;350", "wc_reply_reviewers": "0;0;607;37;73;48", "wc_reply_authors": "446;767;524;496;431;140", "reply_reviewers": "0;0;1;1;1;1", "reply_authors": "1;1;2;1;2;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 59.333333333333336, 15.205992970609392 ], "wc_summary_review_avg": [ 46.166666666666664, 13.005340783266268 ], "wc_main_review_avg": [ 291.1666666666667, 110.86540889244439 ], "wc_review_avg": [ 396.6666666666667, 108.54901606596391 ], "wc_reply_reviewers_avg": [ 127.5, 215.99131927001142 ], "wc_reply_authors_avg": [ 467.3333333333333, 183.82751577376973 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8029550685469661, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14851282094905618777&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "80GQMJCj5oD", "title": "Gradient-based Hyperparameter Optimization without Validation Data for Learning fom Limited Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Optimizing hyperparameters of machine learning algorithms especially for limited labeled data is important but difficult, because then obtaining enough validation data is practically impossible. 
Bayesian model selection enables hyperparameter optimization \\emph{without validation data}, but it requires Hessian log determinants, which are computationally demanding for deep neural networks. We study methods to efficiently approximate Hessian log determinants and empirically demonstrate that approximated Bayesian model selection can effectively tune hyperparameters of algorithms for deep semi-supervised learning and learning from noisy labels.", "keywords": "hyperparameter optimization;learning from limited labels;bayesian model selection", "primary_area": "", "supplementary_material": "", "author": "Ryuichiro Hataya;Hideki Nakayama", "authorids": "~Ryuichiro_Hataya1;~Hideki_Nakayama1", "gender": "Unspecified;M", "homepage": "https://mosko.tokyo;https://www.nlab.ci.i.u-tokyo.ac.jp/index-e.html", "dblp": "238/1068;09/1592", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;lZAYGJoAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Ryuichiro_Hataya1;~Hideki_Nakayama1", "aff": "The University of Tokyo;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nhataya2022gradientbased,\ntitle={Gradient-based Hyperparameter Optimization without Validation Data for Learning fom Limited Labels},\nauthor={Ryuichiro Hataya and Hideki Nakayama},\nyear={2022},\nurl={https://openreview.net/forum?id=80GQMJCj5oD}\n}", "github": "", "project": "", "reviewers": "unH9;wRfN;czv2;z3aF", "site": "https://openreview.net/forum?id=80GQMJCj5oD", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;3;2", "correctness": "4;1;3;4", "technical_novelty": "2;2;2;1", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "167;174;78;54", "wc_summary_review": "33;28;36;45", "wc_main_review": "306;229;439;313", "wc_review": "506;431;553;412", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 118.25, 52.992334351300286 ], "wc_summary_review_avg": [ 35.5, 6.18465843842649 ], "wc_main_review_avg": [ 321.75, 75.29068667504633 ], "wc_review_avg": [ 475.5, 56.89683646741706 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14059542594742475694&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "On-Policy Model Errors in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7133", "id": "81e1aeOt-sd", "poster": "", "openreview": "https://openreview.net/forum?id=81e1aeOt-sd", "slides": "https://iclr.cc/virtual/2022/poster/7133", "video": "https://iclr.cc/virtual/2022/poster/7133", "author_site": "Lukas
Fr\u00f6hlich, Maksym Lefarov, Melanie Zeilinger, Felix Berkenkamp", "tldr": "", "abstract": "Model-free reinforcement learning algorithms can compute policy gradients given sampled environment transitions, but require large amounts of data. In contrast, model-based methods can use the learned model to generate new data, but model errors and bias can render learning unstable or suboptimal. In this paper, we present a novel method that combines real-world data and a learned model in order to get the best of both worlds. The core idea is to exploit the real-world data for on-policy predictions and use the learned model only to generalize to different actions. Specifically, we use the data as time-dependent on-policy correction terms on top of a learned model, to retain the ability to generate data without accumulating errors over long prediction horizons. We motivate this method theoretically and show that it counteracts an error term for model-based policy improvement. Experiments on MuJoCo- and PyBullet-benchmarks show that our method can drastically improve existing model-based approaches without introducing additional tuning parameters.", "keywords": "Model-based reinforcement learning;reinforcement learning;model learning", "primary_area": "", "supplementary_material": "", "author": "Lukas Froehlich;Maksym Lefarov;Melanie Zeilinger;Felix Berkenkamp", "authorids": "~Lukas_Froehlich1;~Maksym_Lefarov1;~Melanie_Zeilinger1;~Felix_Berkenkamp1", "gender": "M;M;F;M", "homepage": ";;;https://berkenkamp.me", "dblp": ";289/5854;41/7142;168/8558", "google_scholar": "6sU3uXUAAAAJ;fVwJDucAAAAJ;;https://scholar.google.ch/citations?user=N_tCEl8AAAAJ", "orcid": ";0009-0005-2550-8315;0000-0003-4570-7571;", "linkedin": ";mlefarov/;;berkenkamp/", "or_profile": "~Lukas_Froehlich1;~Maksym_Lefarov1;~Melanie_Zeilinger1;~Felix_Berkenkamp1", "aff": "Swiss Federal Institute of Technology;Bosch;ETHZ - ETH Zurich;Bosch", "aff_domain": "ethz.ch;bosch.com;ethz.ch;bosch.com", "position": "PhD student;Machine Learning Engineer;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nfroehlich2022onpolicy,\ntitle={On-Policy Model Errors in Reinforcement Learning},\nauthor={Lukas Froehlich and Maksym Lefarov and Melanie Zeilinger and Felix Berkenkamp},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=81e1aeOt-sd}\n}", "github": "", "project": "", "reviewers": "A4Tf;8GLt;XAKT;xLi5", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;3", "correctness": "4;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "65;71;20;132", "wc_summary_review": "27;33;63;69", "wc_main_review": "170;543;720;175", "wc_review": "262;647;803;376", "wc_reply_reviewers": "4;88;103;27", "wc_reply_authors": "442;874;882;61", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 72.0, 39.85599076675927 ], "wc_summary_review_avg": [ 48.0, 18.24828759089466 ], "wc_main_review_avg": [ 402.0, 237.885476647903 ], "wc_review_avg": [ 522.0, 214.18566712084169 ], "wc_reply_reviewers_avg": [ 55.5, 41.16126820203673 ], "wc_reply_authors_avg": [ 564.75, 340.9966092206783 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 20, 0 ], 
"authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8081725540215247086&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=81e1aeOt-sd", "email": "ethz.ch;bosch.com;ethz.ch;bosch.com", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Robert Bosch GmbH;ETH Zurich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.bosch.com;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;Bosch;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Switzerland;Germany" }, { "id": "827jG3ahxL", "title": "REFACTOR: Learning to Extract Theorems from Proofs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Human mathematicians are often good at recognizing modular and reusable theorems that make complex mathematical results within reach. In this paper, we propose a novel method called theoREm-from-prooF extrACTOR (REFACTOR) for training neural networks to mimic this ability in formal mathematical theorem proving. We show on a set of unseen proofs, REFACTOR is able to extract $19.6\\%$ of the theorems that humans would use to write the proofs. When applying the model to the existing Metamath library, REFACTOR extracted $16$ new theorems. With newly extracted theorems, we show that the existing proofs in the MetaMath database can be refactored. The new theorems are used very frequently after refactoring, with an average usage of $733.5$ times, and help to shorten the proof lengths. Lastly, we demonstrate that the prover trained on the new-theorem refactored dataset proves relatively $14$-$30\\%$ more test theorems by frequently leveraging a diverse set of newly extracted theorems.", "keywords": "theorem extraction;mathematical reasoning;theorem proving;reasoning", "primary_area": "", "supplementary_material": "/attachment/1dc83dd1e21af4ba5e06bf4bd54348766ddab4ae.zip", "author": "Jin Peng Zhou;Yuhuai Wu;Qiyang Li;Roger Baker Grosse", "authorids": "~Jin_Peng_Zhou1;~Yuhuai_Wu1;~Qiyang_Li1;~Roger_Baker_Grosse1", "gender": "M;M;M;M", "homepage": ";http://www.cs.toronto.edu/~ywu/;https://colinqiyangli.github.io/;http://www.cs.toronto.edu/~rgrosse/", "dblp": "255/1107;;;26/7058", "google_scholar": "Nf48jqcAAAAJ;https://scholar.google.ca/citations?user=bOQGfFIAAAAJ;qlwwdfEAAAAJ;xgQd1qgAAAAJ", "orcid": ";;;", "linkedin": "https://ca.linkedin.com/in/jinpeng-zhou;;;", "or_profile": "~Jin_Peng_Zhou1;~Yuhuai_Wu1;~Qiyang_Li1;~Roger_Baker_Grosse1", "aff": "Department of Computer Science, Cornell University;Stanford University;University of California, Berkeley;Department of Computer Science, University of Toronto", "aff_domain": "cs.cornell.edu;stanford.edu;berkeley.edu;cs.toronto.edu", "position": "PhD student;Postdoc;PhD student;Assistant Professor", "bibtex": "@misc{\nzhou2022refactor,\ntitle={{REFACTOR}: Learning to Extract Theorems from Proofs},\nauthor={Jin Peng Zhou and Yuhuai Wu and Qiyang Li and Roger Baker Grosse},\nyear={2022},\nurl={https://openreview.net/forum?id=827jG3ahxL}\n}", "github": "", "project": "", "reviewers": "c1Bf;pVkF;C6qG;7SqK;Xews", "site": "https://openreview.net/forum?id=827jG3ahxL", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "3;4;4;5;4", "correctness": "3;4;4;3;3", "technical_novelty": "2;2;2;3;3", 
"empirical_novelty": "3;3;2;2;3", "wc_summary_paper": "103;80;40;91;27", "wc_summary_review": "64;69;67;6;39", "wc_main_review": "213;172;400;328;236", "wc_review": "380;321;507;425;302", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "320;210;483;559;374", "reply_reviewers": "0;0;0;0;0", "reply_authors": "2;2;2;2;2", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 68.2, 29.53912659507725 ], "wc_summary_review_avg": [ 49.0, 24.074883177286655 ], "wc_main_review_avg": [ 269.8, 82.7922701705902 ], "wc_review_avg": [ 387.0, 74.12691818765974 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 389.2, 122.2806607767557 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5270462766947298, "corr_recommendation_correctness": -0.2721655269759087, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12202321198796428388&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Cornell University;Stanford University;University of California, Berkeley;University of Toronto", "aff_unique_dep": "Department of Computer Science;;;Department of Computer Science", "aff_unique_url": "https://www.cornell.edu;https://www.stanford.edu;https://www.berkeley.edu;https://www.utoronto.ca", "aff_unique_abbr": "Cornell;Stanford;UC Berkeley;U of T", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Stanford;Berkeley;Toronto", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Canada" }, { "id": "83grvoIJRnb", "title": "SGTR: Generating Scene Graph by Learning Compositional Triplets with Transformer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this work, we propose an end-to-end framework for the scene graph generation. Motivated by the recently introduced DETR, our method, termed SGTR, generating scene graphs by learning compositional queries with Transformers. We develop a decoding-and-assembling paradigm for the end-to-end scene graph generation. Based on a shared backbone, the overall structure first consists of two parallel branches: entity detector and triplet constructor, followed by a newly designed assembling mechanism. Specifically, each triplet is constructed by a set of the compositional queries in the triplet constructor. The predicate queries and entity queries are learned simultaneously with explicit information exchange. In the training phase, the grouping mechanism is learned by matching the decoded triplets with the outcome of the entity detector. Extensive experimental results show that SGTR can achieve state-of-the-art performance, surpassing most of the existing approaches. Moreover, the sparse queries significantly improving the efficiency of scene graph generation. 
We hope our SGTR can serve as a strong baseline for Transformer-based scene graph generation.", "keywords": "Computer Vision;Scene graph Generation;Scene Understanding", "primary_area": "", "supplementary_material": "/attachment/be990678c99ff4fd241515620da94bd7fc876ffa.zip", "author": "Rongjie Li;Songyang Zhang;Xuming He", "authorids": "~Rongjie_Li2;~Songyang_Zhang1;~Xuming_He3", "gender": "M;M;M", "homepage": "https://github.com/Scarecrow0;https://www.zhangsongyang.com/;https://faculty.sist.shanghaitech.edu.cn/faculty/hexm/index.html", "dblp": "73/8077;;03/4230", "google_scholar": "Sy5GMAgAAAAJ;8XQPi7YAAAAJ;0KyeZ2QAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Rongjie_Li2;~Songyang_Zhang1;~Xuming_He3", "aff": "SIST ,ShanghaiTech University;ShanghaiTech University;ShanghaiTech University", "aff_domain": "shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nli2022sgtr,\ntitle={{SGTR}: Generating Scene Graph by Learning Compositional Triplets with Transformer},\nauthor={Rongjie Li and Songyang Zhang and Xuming He},\nyear={2022},\nurl={https://openreview.net/forum?id=83grvoIJRnb}\n}", "github": "", "project": "", "reviewers": "2LAw;BnDZ;uWso;9jjk", "site": "https://openreview.net/forum?id=83grvoIJRnb", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;5;4;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "59;48;96;95", "wc_summary_review": "35;2;64;49", "wc_main_review": "460;557;292;575", "wc_review": "554;607;452;719", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.5, 21.360009363293827 ], "wc_summary_review_avg": [ 37.5, 22.91833327273168 ], "wc_main_review_avg": [ 471.0, 112.2207645669909 ], "wc_review_avg": [ 583.0, 96.27304918823336 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:66DH4Zo3rIAJ:scholar.google.com/&scioq=SGTR:+Generating+Scene+Graph+by+Learning+Compositional+Triplets+with+Transformer&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "ShanghaiTech University", "aff_unique_dep": "School of Information Science and Technology", "aff_unique_url": "http://www.shanghaitech.edu.cn", "aff_unique_abbr": "ShanghaiTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "844kbKgwDL", "title": "Predicting subscriber usage: Analyzing multi-dimensional time-series using Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Companies operating under the subscription model typically invest significant resources attempting to predict customers' feature usage.
These predictions can be used to fuel growth: they may allow these companies to target individual customers -- for example, to convert non-paying consumers to begin paying for enhanced services -- or to identify customers not maximizing their subscription product.\nThis assistance can help avoid an increase in the churn rate and, for some consumers, may increase their usage.\n\nIn this work, we develop a deep learning model to predict the product usage of a given consumer, based on historical usage. We adapt a Convolutional Neural Network to time-series data, followed by an auxiliary output, and demonstrate that this enhanced model effectively predicts future changes in usage.", "keywords": "Convolutional Neural Network;time series;usage prediction", "primary_area": "", "supplementary_material": "/attachment/306b1a4ea31333ee7793b0c43cab7297123ce44f.zip", "author": "Benjamin Azaria;Lee-Ad Gottlieb", "authorids": "benji@cloudinary.com;~Lee-Ad_Gottlieb1", "gender": ";F", "homepage": ";https://www.ariel.ac.il/wp/lee-ad-gottlieb/", "dblp": ";09/1539", "google_scholar": ";https://scholar.google.co.il/citations?user=HV6OPfcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "benji@cloudinary.com;~Lee-Ad_Gottlieb1", "aff": ";Ariel University", "aff_domain": ";ariel.ac.il", "position": ";Associate Professor", "bibtex": "@misc{\nazaria2022predicting,\ntitle={Predicting subscriber usage: Analyzing multi-dimensional time-series using Convolutional Neural Networks},\nauthor={Benjamin Azaria and Lee-Ad Gottlieb},\nyear={2022},\nurl={https://openreview.net/forum?id=844kbKgwDL}\n}", "github": "", "project": "", "reviewers": "Cb5i;ANiP;BJUT;Y6ay;4yDn", "site": "https://openreview.net/forum?id=844kbKgwDL", "pdf_size": 0, "recommendation": "3;3;3;3;3", "confidence": "3;4;5;5;5", "correctness": "2;2;3;3;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;1;3;2;2", "wc_summary_paper": "20;134;84;49;50", "wc_summary_review": "17;162;17;13;64", "wc_main_review": "149;939;193;342;369", "wc_review": "186;1235;294;404;483", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.4, 0.7999999999999999 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 67.4, 38.98512536852998 ], "wc_summary_review_avg": [ 54.6, 56.88796006186195 ], "wc_main_review_avg": [ 398.4, 283.08415709820287 ], "wc_review_avg": [ 520.4, 371.12833359903954 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12719508625204176756&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Ariel University", "aff_unique_dep": "", "aff_unique_url": "https://www.ariel.ac.il", "aff_unique_abbr": "Ariel U", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "id": "847CwJv9Vx", "title": "Benchmarking person re-identification approaches and training datasets for practical real-world implementations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Person Re-Identification (Re-ID) is receiving a lot of attention recently.
Large datasets containing labeled images of various individuals have been released, allowing researchers to develop and test many successful approaches. However, when such Re-ID models are deployed in a new city or environment, the task of searching for people within a network of security cameras is likely to face a significant domain shift, thus resulting in decreased performance. Indeed, while most public datasets were collected in a limited geographic area, images from a new city present different features (e.g., people's ethnicities and clothing style, weather, architecture, etc.). In addition, the whole frames of the video streams must be converted into cropped images of people using pedestrian detection models, which behave differently from the human annotators who created the dataset used for training. To better understand the extent of this issue, this paper introduces a complete methodology to evaluate Re-ID approaches and training datasets with respect to their suitability for deployment in live operations. This method is used to benchmark four Re-ID approaches and three datasets and provides interesting insight and guidelines that can help design better Re-ID pipelines in the future.", "keywords": "person re-identification;benchmark study;practical deployment", "primary_area": "", "supplementary_material": "", "author": "Jose Miguel Huaman Cruz;Felix Oliver Sumari Huayta;Luigy Alex Machaca Arcana;Esteban GONZALEZ CLUA;Joris Guerin", "authorids": "~Jose_Miguel_Huaman_Cruz2;~Felix_Oliver_Sumari_Huayta1;~Luigy_Alex_Machaca_Arcana1;~Esteban_GONZALEZ_CLUA1;~Joris_Guerin1", "gender": ";M;M;;M", "homepage": ";;;http://lattes.cnpq.br/4791589931798048;https://espace-dev.pages.ird.fr/personnels/guerin/homepage/", "dblp": ";;;;194/2302", "google_scholar": "yYiRCEUAAAAJ;;https://scholar.google.com/citations?hl=en;;https://scholar.google.fr/citations?user=gO-31VYAAAAJ", "orcid": "my-orcid?orcid=0000-0003-3776-7865;0000-0003-3115-2878;;;0000-0002-8048-8960", "linkedin": "jmhuamanc/;felix-oliver-sumari-h/;luigymachaca/;;", "or_profile": "~Jose_Miguel_Huaman_Cruz2;~Felix_Oliver_Sumari_Huayta1;~Luigy_Alex_Machaca_Arcana1;~Esteban_GONZALEZ_CLUA1;~Joris_Guerin1", "aff": ";Intituo de Computa\u00e7\u00e3o - UFF;;Universidade federal fluminense;LAAS / CNRS", "aff_domain": ";ic.uff.br;;ic.uff.br;laas.fr", "position": ";PhD student;;Assistant Professor;Postdoc", "bibtex": "@misc{\ncruz2022benchmarking,\ntitle={Benchmarking person re-identification approaches and training datasets for practical real-world implementations},\nauthor={Jose Miguel Huaman Cruz and Felix Oliver Sumari Huayta and Luigy Alex Machaca Arcana and Esteban GONZALEZ CLUA and Joris Guerin},\nyear={2022},\nurl={https://openreview.net/forum?id=847CwJv9Vx}\n}", "github": "", "project": "", "reviewers": "mAbU;xCZG;fPDX", "site": "https://openreview.net/forum?id=847CwJv9Vx", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "56;89;61", "wc_summary_review": "41;44;184", "wc_main_review": "524;393;288", "wc_review": "621;526;533", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "723;658;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ],
"wc_summary_paper_avg": [ 68.66666666666667, 14.522013940527977 ], "wc_summary_review_avg": [ 89.66666666666667, 66.71498249185778 ], "wc_main_review_avg": [ 401.6666666666667, 96.54129801397028 ], "wc_review_avg": [ 560.0, 43.22807729551092 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 460.3333333333333, 326.5846835899619 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4196671714462344461&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Universidade Federal Fluminense;LAAS", "aff_unique_dep": "Instituto de Computa\u00e7\u00e3o;", "aff_unique_url": "https://www.uff.br;https://www.laas.fr/", "aff_unique_abbr": "UFF;LAAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Brazil;France" }, { "title": "Evidential Turing Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6361", "id": "84NMXTHYe-", "poster": "", "openreview": "https://openreview.net/forum?id=84NMXTHYe-", "slides": "https://iclr.cc/virtual/2022/poster/6361", "video": "https://iclr.cc/virtual/2022/poster/6361", "author_site": "Melih Kandemir, Abdullah Akg\u00fcl, Manuel Haussmann, Gozde Unal", "tldr": "", "abstract": "A probabilistic classifier with reliable predictive uncertainties i) fits successfully to the target domain data, ii) provides calibrated class probabilities in difficult regions of the target domain (e.g. class overlap), and iii) accurately identifies queries coming out of the target domain and reject them. We introduce an original combination of Evidential Deep Learning, Neural Processes, and Neural Turing Machines capable of providing all three essential properties mentioned above for total uncertainty quantification. We observe our method on three image classification benchmarks to consistently improve the in-domain uncertainty quantification, out-of-domain detection, and robustness against input perturbations with one single model. 
Our unified solution delivers an implementation-friendly and computationally efficient recipe for safety clearance and provides intellectual economy to an investigation of algorithmic roots of epistemic awareness in deep neural nets.", "keywords": "Evidential Deep Learning;Neural Processes;Attention;Neural Turing Machines", "primary_area": "", "supplementary_material": "/attachment/9dc5cd09409d8a8028f022028e94cd36b7dac563.zip", "author": "Melih Kandemir;Abdullah Akg\u00fcl;Manuel Haussmann;Gozde Unal", "authorids": "~Melih_Kandemir1;~Abdullah_Akg\u00fcl1;~Manuel_Haussmann1;~Gozde_Unal1", "gender": "M;M;;F", "homepage": "https://imada.sdu.dk/~kandemir/;https://aportekila.github.io/;https://manuelhaussmann.github.io/;https://ituvisionlab.github.io/", "dblp": "95/7056;294/5457;198/2433;", "google_scholar": "Jxm1UeYAAAAJ;FZeaKPoAAAAJ;https://scholar.google.com/citations?hl=de;soanB6MAAAAJ", "orcid": "0000-0001-6293-3656;0000-0002-0489-9493;;", "linkedin": "melih-kandemir-64681a16/;abdullahakgul70/;;", "or_profile": "~Melih_Kandemir1;~Abdullah_Akg\u00fcl1;~Manuel_Haussmann1;~Gozde_Unal1", "aff": "University of Southern Denmark;Istanbul Technical University;Aalto University;Istanbul Technical University", "aff_domain": "sdu.dk;itu.edu.tr;aalto.fi;itu.edu.tr", "position": "Associate Professor;MS student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nkandemir2022evidential,\ntitle={Evidential Turing Processes },\nauthor={Melih Kandemir and Abdullah Akg{\\\"u}l and Manuel Haussmann and Gozde Unal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=84NMXTHYe-}\n}", "github": "", "project": "", "reviewers": "zTRM;L3yb;UsNy;cnXy", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;2;2;3", "correctness": "2;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "63;40;44;99", "wc_summary_review": "67;57;19;32", "wc_main_review": "321;155;98;439", "wc_review": "451;252;161;570", "wc_reply_reviewers": "216;0;0;164", "wc_reply_authors": "280;118;96;774", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.5, 23.32916629457641 ], "wc_summary_review_avg": [ 43.75, 19.149086140074676 ], "wc_main_review_avg": [ 253.25, 134.95253795316336 ], "wc_review_avg": [ 358.5, 160.96350518052222 ], "wc_reply_reviewers_avg": [ 95.0, 96.76259607927021 ], "wc_reply_authors_avg": [ 317.0, 273.2489707208428 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.2294157338705618, "corr_recommendation_correctness": 0.20751433915982243, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=552603888094876415&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=84NMXTHYe-", "email": "sdu.dk;itu.edu.tr;aalto.fi;itu.edu.tr", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Southern Denmark;Istanbul Technical University;Aalto University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sdu.dk;https://www.itu.edu.tr;https://www.aalto.fi", "aff_unique_abbr": "SDU;ITU;Aalto", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "Denmark;T\u00fcrkiye;Finland" }, { "id": "86sEVRfeGYS", "title": "Continual Backprop: Stochastic Gradient Descent with Persistent Randomness", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Backprop algorithm for learning in neural networks utilizes two mechanisms: first, stochastic gradient descent and second, initialization with small random weights, where the latter is essential to the effectiveness of the former. We show that in continual learning setups, Backprop performs well initially, but over time its performance degrades. Stochastic gradient descent alone is insufficient to learn continually; the initial randomness enables only initial learning but not continual learning. To the best of our knowledge, ours is the first result showing this degradation in Backprop\u2019s ability to learn. To address this issue, we propose an algorithm that continually injects random features alongside gradient descent using a new generate-and-test process. We call this the Continual Backprop algorithm. We show that, unlike Backprop, Continual Backprop is able to continually adapt in both supervised and reinforcement learning problems. We expect that as continual learning becomes more common in future applications, a method like Continual Backprop will be essential where the advantages of random initialization are present throughout learning.", "keywords": "Continual Adaptation;Reinforcement Learning;Continual Learning;Online Learning", "primary_area": "", "supplementary_material": "", "author": "Shibhansh Dohare;Richard S. Sutton;A. Rupam Mahmood", "authorids": "~Shibhansh_Dohare1;~Richard_S._Sutton1;~A._Rupam_Mahmood1", "gender": "M;;M", "homepage": "https://shibhansh.github.io/;;http://richsutton.com", "dblp": "202/1730;120/6935;48/6070", "google_scholar": "mqkvfUkAAAAJ;https://scholar.google.ca/citations?user=YwB8XM4AAAAJ;https://scholar.google.ca/citations?user=6m4wv6gAAAAJ", "orcid": "0000-0002-3796-9347;;0000-0002-3679-3415", "linkedin": ";;richard-sutton-0653545/", "or_profile": "~Shibhansh_Dohare1;~Rupam_Mahmood1;~Richard_S_Sutton1", "aff": "University of Alberta;University of Alberta;Google DeepMind", "aff_domain": "ualberta.ca;ualberta.ca;deepmind.com", "position": "PhD student;Assistant Professor;Research Scientist", "bibtex": "@misc{\ndohare2022continual,\ntitle={Continual Backprop: Stochastic Gradient Descent with Persistent Randomness},\nauthor={Shibhansh Dohare and Richard S. Sutton and A. 
Rupam Mahmood},\nyear={2022},\nurl={https://openreview.net/forum?id=86sEVRfeGYS}\n}", "github": "", "project": "", "reviewers": "kzUn;Evj3;gcKs;jf4G", "site": "https://openreview.net/forum?id=86sEVRfeGYS", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;5;4", "correctness": "3;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "62;115;60;137", "wc_summary_review": "45;92;30;277", "wc_main_review": "568;258;216;334", "wc_review": "675;465;306;748", "wc_reply_reviewers": "407;0;0;349", "wc_reply_authors": "1040;188;359;940", "reply_reviewers": "1;0;0;2", "reply_authors": "2;1;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 93.5, 33.425289826716536 ], "wc_summary_review_avg": [ 111.0, 98.53172078067043 ], "wc_main_review_avg": [ 344.0, 136.06616037795732 ], "wc_review_avg": [ 548.5, 174.34233565029464 ], "wc_reply_reviewers_avg": [ 189.0, 190.1091791576619 ], "wc_reply_authors_avg": [ 631.75, 365.031762316651 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15936479234983580396&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Alberta;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.ualberta.ca;https://deepmind.com", "aff_unique_abbr": "UAlberta;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United Kingdom" }, { "id": "87Ks7PvYVJi", "title": "Offline Decentralized Multi-Agent Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In many real-world multi-agent cooperative tasks, due to high cost and risk, agents cannot continuously interact with the environment and collect experiences during learning, but have to learn from offline datasets. However, the transition probabilities calculated from the dataset can be very different from the transition probabilities induced by the learned policies of other agents, creating large errors in value estimates. Moreover, the experience distributions of agents' datasets may vary wildly due to diverse behavior policies, causing large differences in value estimates between agents. Consequently, agents will learn uncoordinated suboptimal policies. In this paper, we propose MABCQ, which exploits value deviation and transition normalization to modify the transition probabilities. Value deviation optimistically increases the transition probabilities of high-value next states, and transition normalization normalizes the biased transition probabilities of next states. Together, they encourage agents to discover potentially optimal and coordinated policies. Mathematically, we prove the convergence of Q-learning under the non-stationary transition probabilities after modification. Empirically, we show that MABCQ greatly outperforms baselines and reduces the difference in value estimates between agents.
", "keywords": "multi-agent reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Jiechuan Jiang;Zongqing Lu", "authorids": "~Jiechuan_Jiang1;~Zongqing_Lu2", "gender": ";", "homepage": ";", "dblp": "220/4026;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Jiechuan_Jiang1;~Zongqing_Lu2", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\njiang2022offline,\ntitle={Offline Decentralized Multi-Agent Reinforcement Learning},\nauthor={Jiechuan Jiang and Zongqing Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=87Ks7PvYVJi}\n}", "github": "", "project": "", "reviewers": "WD4d;zmVr;N2SH;Hcfz", "site": "https://openreview.net/forum?id=87Ks7PvYVJi", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;3;2;4", "correctness": "2;3;3;2", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "37;62;41;57", "wc_summary_review": "43;30;74;23", "wc_main_review": "165;384;275;128", "wc_review": "245;476;390;208", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "650;327;403;386", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 49.25, 10.497023387608508 ], "wc_summary_review_avg": [ 42.5, 19.551214796017153 ], "wc_main_review_avg": [ 238.0, 100.14239861317483 ], "wc_review_avg": [ 329.75, 108.4259539962642 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 441.5, 123.63757519459851 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.2581988897471611, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14801125201743115114&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "87ULMOeCnE-", "title": "From Graph Local Embedding to Deep Metric Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep metric learning continues to play a crucial role in many computer vision applications, while its various mining and weighting strategies have been extensively investigated. Techniques based on pairwise learning often use excessive random sampling and end up in slow convergence and model degradation. Further, neural network approaches mostly employ MLP layers for metric learning. The tactic can indeed be thought of as graph convolutions with only self-connections, indicating that local neighborhood relationships are neglected. We comprehensively identify the missing neighborhood relationships issue of conventional embedding and propose a novel approach, termed as Graph Local Embedding (GLE), to deep metric learning. Our method explores the local relationships and draws on the graph convolution networks to construct a discriminative mapping for embedding learning. The strategy can enhance metric learning by exploring the manifold-to-manifold relationships. By focusing on an essential variety of neighboring relations within GLE, burdens of redundant pairs can be substantially eased, and the context of each encoded data is greatly enriched. 
We demonstrate in the experiments that coupling GLE with existing metric learning techniques can yield impressive performance gains on popular benchmark datasets for fine-grained retrieval.", "keywords": "metric learning;fine-grained retrieval", "primary_area": "", "supplementary_material": "", "author": "Bing-Jhang Lin;Ding-Jie Chen;He-Yen Hsieh;Tyng-Luh Liu", "authorids": "~Bing-Jhang_Lin2;~Ding-Jie_Chen1;~He-Yen_Hsieh1;~Tyng-Luh_Liu1", "gender": "M;M;;", "homepage": ";;;https://www.iis.sinica.edu.tw/pages/liutyng/index_en.html", "dblp": ";123/2959;209/1822.html;68/2368", "google_scholar": ";6nxRMzEAAAAJ;;https://scholar.google.com.tw/citations?user=20N2rlkAAAAJ", "orcid": ";;;0000-0002-8366-5213", "linkedin": "bjlin/;;;", "or_profile": "~Bing-Jhang_Lin2;~Ding-Jie_Chen1;~He-Yen_Hsieh1;~Tyng-Luh_Liu1", "aff": ";Academia Sinica;IIS, Academia Sinica;Academia Sinica", "aff_domain": ";sinica.edu.tw;iis.sinica.edu.tw;sinica.edu.tw", "position": ";Postdoc;Research Assistant;Principal Researcher", "bibtex": "@misc{\nlin2022from,\ntitle={From Graph Local Embedding to Deep Metric Learning},\nauthor={Bing-Jhang Lin and Ding-Jie Chen and He-Yen Hsieh and Tyng-Luh Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=87ULMOeCnE-}\n}", "github": "", "project": "", "reviewers": "bj7o;RAGX;zhSo;NzsP", "site": "https://openreview.net/forum?id=87ULMOeCnE-", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "2;3;2;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "73;73;105;80", "wc_summary_review": "70;73;81;46", "wc_main_review": "1067;217;632;247", "wc_review": "1210;363;818;373", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 82.75, 13.160072188251856 ], "wc_summary_review_avg": [ 67.5, 13.047988350699889 ], "wc_main_review_avg": [ 540.75, 345.09735945092365 ], "wc_review_avg": [ 691.0, 351.49608817168934 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rt1sAfFmNdcJ:scholar.google.com/&scioq=From+Graph+Local+Embedding+to+Deep+Metric+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Academia Sinica", "aff_unique_dep": "", "aff_unique_url": "https://www.sinica.edu.tw", "aff_unique_abbr": "Academia Sinica", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "89W18gW0-6o", "title": "Provably Improved Context-Based Offline Meta-RL with Attention and Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Meta-learning for offline reinforcement learning (OMRL) is an understudied problem with tremendous potential impact by enabling RL algorithms in many real-world applications. A popular solution to the problem is to infer task identity as augmented state using a context-based encoder, for which efficient learning of robust task representations remains an open challenge. 
In this work, we provably improve upon one of the SOTA OMRL algorithms, FOCAL, by incorporating an intra-task attention mechanism and inter-task contrastive learning objectives to robustify task representation learning against sparse rewards and distribution shift. Theoretical analysis and experiments are presented to demonstrate the superior performance and robustness of our end-to-end and model-free framework compared to prior algorithms across multiple meta-RL benchmarks.", "keywords": "Reinforcement Learning;Representation learning for planning;Meta-RL;Attention Mechanism;Contrastive Learning;Offline RL", "primary_area": "", "supplementary_material": "/attachment/bff2dd054d6abd10b71d5e132113f4326e5e25b5.zip", "author": "Lanqing Li;Yuanhao HUANG;Mingzhe Chen;siteng luo;Dijun Luo;Junzhou Huang", "authorids": "~Lanqing_Li1;~Yuanhao_HUANG1;~Mingzhe_Chen1;~siteng_luo1;~Dijun_Luo1;~Junzhou_Huang2", "gender": "M;M;;M;M;M", "homepage": "https://lanqingli1993.github.io/;;https://github.com/Asuka20;https://idonthave.one;https://sites.google.com/site/dijunluo/;http://ranger.uta.edu/~huang/", "dblp": "275/9979;;;;;22/1170.html", "google_scholar": "n8IjgKkAAAAJ;;;;y_1aniIAAAAJ;https://scholar.google.com.tw/citations?user=X7KrguAAAAAJ", "orcid": "0000-0003-1998-4022;;;;;0000-0002-9548-1227", "linkedin": "lanqing-li-%EF%BC%88%E6%9D%8E%E8%93%9D%E9%9D%92%EF%BC%89-49209a83/;%E5%9B%AD%E8%B1%AA-%E9%BB%84-3a7a13164/;;;;", "or_profile": "~Lanqing_Li1;~Yuanhao_HUANG1;~Mingzhe_Chen1;~siteng_luo1;~Dijun_Luo1;~Junzhou_Huang2", "aff": "Tencent AI Lab;;Tsinghua University;Peking University;Tencent AI Lab;University of Texas, Arlington", "aff_domain": "tencent.com;;mails.tsinghua.edu.cn;pku.edu.cn;tencent.com;uta.edu", "position": "Research Scientist;;MS student;MS student;Researcher;Full Professor", "bibtex": "@misc{\nli2022provably,\ntitle={Provably Improved Context-Based Offline Meta-{RL} with Attention and Contrastive Learning},\nauthor={Lanqing Li and Yuanhao HUANG and Mingzhe Chen and siteng luo and Dijun Luo and Junzhou Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=89W18gW0-6o}\n}", "github": "", "project": "", "reviewers": "UknQ;yZzz;oBW4;uEZk", "site": "https://openreview.net/forum?id=89W18gW0-6o", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;2;3", "correctness": "4;3;2;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;3;1;3", "wc_summary_paper": "174;92;107;68", "wc_summary_review": "33;102;25;44", "wc_main_review": "457;603;397;116", "wc_review": "664;797;529;228", "wc_reply_reviewers": "80;390;171;0", "wc_reply_authors": "1093;1258;883;291", "reply_reviewers": "1;1;1;0", "reply_authors": "3;3;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 110.25, 39.347013863824536 ], "wc_summary_review_avg": [ 51.0, 30.20761493398643 ], "wc_main_review_avg": [ 393.25, 176.7347942539895 ], "wc_review_avg": [ 554.5, 210.97926438396738 ], "wc_reply_reviewers_avg": [ 160.25, 145.79158926357857 ], "wc_reply_authors_avg": [ 881.25, 365.7788778756915 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 18, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=9215854471633851569&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Tencent;Tsinghua University;Peking University;University of Texas at Arlington", "aff_unique_dep": "Tencent AI Lab;;;", "aff_unique_url": "https://ai.tencent.com;https://www.tsinghua.edu.cn;http://www.pku.edu.cn;https://www.uta.edu", "aff_unique_abbr": "Tencent AI Lab;THU;Peking U;UTA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Arlington", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "8CEJlHbKoP4", "title": "Learning a metacognition for object detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "In contrast to object recognition models, humans do not blindly trust their perception when building representations of the world, instead recruiting metacognition to detect percepts that are unreliable or false, such as when we realize that we mistook one object for another. We propose METAGEN, an unsupervised model that enhances object recognition models through a metacognition. Given noisy output from an object-detection model, METAGEN learns a meta-representation of how its perceptual system works and uses it to infer the objects in the world responsible for the detections. METAGEN achieves this by conditioning its inference on basic principles of objects that even human infants understand (known as Spelke principles: object permanence, cohesion, and spatiotemporal continuity). We test METAGEN on a variety of state-of-the-art object detection neural networks. We find that METAGEN quickly learns an accurate metacognitive representation of the neural network, and that this improves detection accuracy by filling in objects that the detection model missed and removing hallucinated objects. 
This approach enables generalization to out-of-sample data and outperforms comparison models that lack a metacognition.", "keywords": "metacognition;object detection;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Marlene Berke;Mario Belledonne;Zhangir Azerbayev;Julian Jara-Ettinger", "authorids": "~Marlene_Berke1;~Mario_Belledonne1;~Zhangir_Azerbayev1;~Julian_Jara-Ettinger1", "gender": "F;;M;M", "homepage": "https://compdevlab.yale.edu/;;;http://compdevlab.yale.edu", "dblp": ";;;", "google_scholar": ";YTsCpJoAAAAJ;;", "orcid": ";;;", "linkedin": ";;zhangir-azerbayev-314ab21b8/;", "or_profile": "~Marlene_Berke1;~Mario_Belledonne1;~Zhangir_Azerbayev1;~Julian_Jara-Ettinger1", "aff": "Yale University;Yale University;Yale University;Yale University", "aff_domain": "yale.edu;yale.edu;yale.edu;yale.edu", "position": "PhD student;PhD student;Undergrad student;Assistant Professor", "bibtex": "@misc{\nberke2022learning,\ntitle={Learning a metacognition for object detection},\nauthor={Marlene Berke and Mario Belledonne and Zhangir Azerbayev and Julian Jara-Ettinger},\nyear={2022},\nurl={https://openreview.net/forum?id=8CEJlHbKoP4}\n}", "github": "", "project": "", "reviewers": "RHiF;1uYX;RfQP;Rg2j", "site": "https://openreview.net/forum?id=8CEJlHbKoP4", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;4", "correctness": "3;2;3;2", "technical_novelty": "2;3;2;2", "empirical_novelty": "0;3;2;4", "wc_summary_paper": "77;30;169;100", "wc_summary_review": "49;44;63;41", "wc_main_review": "436;267;798;278", "wc_review": "562;341;1030;419", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 94.0, 50.114868053303304 ], "wc_summary_review_avg": [ 49.25, 8.437268515343103 ], "wc_main_review_avg": [ 444.75, 214.62918603955055 ], "wc_review_avg": [ 588.0, 267.2124622842281 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": -0.6882472016116854, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10695616395053753189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Yale University", "aff_unique_dep": "", "aff_unique_url": "https://www.yale.edu", "aff_unique_abbr": "Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "8Dhw-NmmwT3", "title": "Lifting Imbalanced Regression with Self-Supervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Imbalanced regression, a new and influential task inspired by imbalanced classification and originating from both the imbalance and regression worlds, has recently received a great deal of attention. Yet we are still at a fairly preliminary stage in the exploration of this task, so more attempts are needed. In this paper, we work on a seamless marriage of imbalanced regression and self-supervised learning.
The first question this raises is how similarity and dissimilarity should be measured in the regression setting, where, unlike in classification, no clear-cut definition exists. To address this, we give a formal definition of similarity for regression tasks. Building on it, experiments with a simple neural network show that self-supervised learning can help alleviate the imbalance problem. A second problem arises when scaling to deep networks: adding random noise to the input does not guarantee that the noisy samples remain similar to the original ones. We therefore propose to bound the magnitude of the noise at the output and, in doing so, to find meaningful noise at the input via backpropagation. Experimental results show that our approach achieves state-of-the-art performance.", "keywords": "Imbalanced Regression;Self-Supervised Learning;Long-Tailed Learning", "primary_area": "", "supplementary_material": "", "author": "Weiguo Pian;Hanyu Peng;Mingming Sun;Ping Li", "authorids": "~Weiguo_Pian1;~Hanyu_Peng1;~Mingming_Sun1;~Ping_Li3", "gender": ";;M;M", "homepage": ";;;http://www.stat.rutgers.edu/home/pingli/", "dblp": ";;87/8665-1.html;62/5860-1", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Weiguo_Pian1;~Hanyu_Peng1;~Mingming_Sun1;~Ping_Li3", "aff": ";;Baidu;LinkedIn", "aff_domain": ";;baidu.com;linkedin.com", "position": ";;Principal Researcher;Engineer", "bibtex": "@misc{\npian2022lifting,\ntitle={Lifting Imbalanced Regression with Self-Supervised Learning},\nauthor={Weiguo Pian and Hanyu Peng and Mingming Sun and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=8Dhw-NmmwT3}\n}", "github": "", "project": "", "reviewers": "QHt8;TyLX;hPS1;cjZs", "site": "https://openreview.net/forum?id=8Dhw-NmmwT3", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;5;3", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "72;99;45;27", "wc_summary_review": "15;33;38;48", "wc_main_review": "179;334;434;515", "wc_review": "266;466;517;590", "wc_reply_reviewers": "0;0;80;0", "wc_reply_authors": "712;757;431;649", "reply_reviewers": "0;0;2;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 60.75, 27.279800219209818 ], "wc_summary_review_avg": [ 33.5, 11.968709203585824 ], "wc_main_review_avg": [ 365.5, 125.31659906014048 ], "wc_review_avg": [ 459.75, 120.2297280209849 ], "wc_reply_reviewers_avg": [ 20.0, 34.64101615137755 ], "wc_reply_authors_avg": [ 637.25, 125.10470614649155 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:76cV1tqYz-IJ:scholar.google.com/&scioq=Lifting+Imbalanced+Regression+with+Self-Supervised+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "",
"aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "CKConv: Continuous Kernel Convolution For Sequential Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6393", "id": "8FhxBtXSl0", "poster": "", "openreview": "https://openreview.net/forum?id=8FhxBtXSl0", "slides": "https://iclr.cc/virtual/2022/poster/6393", "video": "https://iclr.cc/virtual/2022/poster/6393", "author_site": "David W. Romero, Anna Kuzina, Erik Bekkers, Jakub Tomczak, Mark Hoogendoorn", "tldr": "", "abstract": "Conventional neural architectures for sequential data present important limitations. Recurrent neural networks suffer from exploding and vanishing gradients, small effective memory horizons, and must be trained sequentially. Convolutional neural networks cannot handle sequences of unknown size and their memory horizon must be defined a priori. In this work, we show that these problems can be solved by formulating the convolutional kernels of CNNs as continuous functions. The resulting Continuous Kernel Convolution (CKConv) handles arbitrarily long sequences in a parallel manner, within a single operation, and without relying on any form of recurrence. We show that Continuous Kernel Convolutional Networks (CKCNNs) obtain state-of-the-art results in multiple datasets, e.g., permuted MNIST, and, thanks to their continuous nature, are able to handle non-uniformly sampled datasets and irregularly-sampled data natively. CKCNNs match or perform better than neural ODEs designed for these purposes in a faster and simpler manner.", "keywords": "Convolutional Networks;Continuous kernel Convolutions;Continuous Convolutional Kernels;Implicit Neural Representations;Sequential Data.", "primary_area": "", "supplementary_material": "", "author": "David W. Romero;Anna Kuzina;Erik J Bekkers;Jakub Mikolaj Tomczak;Mark Hoogendoorn", "authorids": "~David_W._Romero1;~Anna_Kuzina1;~Erik_J_Bekkers1;~Jakub_Mikolaj_Tomczak1;~Mark_Hoogendoorn2", "gender": "M;F;;M;M", "homepage": "https://davidwromero.xyz/;;https://erikbekkers.bitbucket.io/;https://jmtomczak.github.io/;http://www.cs.vu.nl/~mhoogen", "dblp": "254/1396;;43/5596;80/8238;19/1103.html", "google_scholar": "7tdzmVoAAAAJ;IMoc7ioAAAAJ;https://scholar.google.nl/citations?user=yeWrfR4AAAAJ;https://scholar.google.pl/citations?user=XB99pR4AAAAJ;3s4lqHkAAAAJ", "orcid": ";;;0000-0001-8634-6878;", "linkedin": "david-w-romero-05893567/;;;jakub-tomczak-04305314a/;", "or_profile": "~David_W._Romero1;~Anna_Kuzina1;~Erik_J_Bekkers1;~Jakub_Mikolaj_Tomczak1;~Mark_Hoogendoorn2", "aff": "Vrije Universiteit Amsterdam;VU Amsterdam;University of Amsterdam;Vrije Universiteit Amsterdam;VU University Amsterdam", "aff_domain": "vu.nl;vu.nl;uva.nl;vu.nl;vu.nl", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nromero2022ckconv,\ntitle={{CKC}onv: Continuous Kernel Convolution For Sequential Data},\nauthor={David W. 
Romero and Anna Kuzina and Erik J Bekkers and Jakub Mikolaj Tomczak and Mark Hoogendoorn},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8FhxBtXSl0}\n}", "github": "", "project": "", "reviewers": "Lwkz;DZHd;NgYq;nLYG", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;4;4", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "121;55;97;305", "wc_summary_review": "51;29;46;120", "wc_main_review": "76;166;247;932", "wc_review": "248;250;390;1357", "wc_reply_reviewers": "0;15;0;194", "wc_reply_authors": "560;516;351;1272", "reply_reviewers": "0;1;0;1", "reply_authors": "2;1;1;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 144.5, 95.62818622142741 ], "wc_summary_review_avg": [ 61.5, 34.74550330618338 ], "wc_main_review_avg": [ 355.25, 338.4356475018552 ], "wc_review_avg": [ 561.25, 463.01910057793515 ], "wc_reply_reviewers_avg": [ 52.25, 82.06818811208153 ], "wc_reply_authors_avg": [ 674.75, 353.5147627751916 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 153, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13572212513025696836&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=8FhxBtXSl0", "email": "vu.nl;vu.nl;uva.nl;vu.nl;vu.nl", "author_num": 5, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Vrije Universiteit Amsterdam;University of Amsterdam;VU University Amsterdam", "aff_unique_dep": ";;", "aff_unique_url": "https://www.vu.nl;https://www.uva.nl;https://www.vu.nl", "aff_unique_abbr": "VU Amsterdam;UvA;VU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Amsterdam", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Netherlands" }, { "title": "AdaRL: What, Where, and How to Adapt in Transfer Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6988", "id": "8H5bpVwvt5", "poster": "", "openreview": "https://openreview.net/forum?id=8H5bpVwvt5", "slides": "https://iclr.cc/virtual/2022/poster/6988", "video": "https://iclr.cc/virtual/2022/poster/6988", "author_site": "Biwei Huang, Fan Feng, Chaochao Lu, Sara Magliacane, Kun Zhang", "tldr": "", "abstract": "One practical challenge in reinforcement learning (RL) is how to make quick adaptations when faced with new environments. In this paper, we propose a principled framework for adaptive RL, called AdaRL, that adapts reliably and efficiently to changes across domains with a few samples from the target domain, even in partially observable environments. Specifically, we leverage a parsimonious graphical representation that characterizes structural relationships over variables in the RL system. Such graphical representations provide a compact way to encode what and where the changes across domains are, and furthermore inform us with a minimal set of changes that one has to consider for the purpose of policy adaptation. 
We show that by explicitly leveraging this compact representation to encode changes, we can efficiently adapt the policy to the target domain, in which only a few samples are needed and further policy optimization is avoided. We illustrate the efficacy of AdaRL through a series of experiments that vary factors in the observation, transition and reward functions for Cartpole and Atari games.", "keywords": "Transfer RL;Graphical models;Efficient adaptation", "primary_area": "", "supplementary_material": "", "author": "Biwei Huang;Fan Feng;Chaochao Lu;Sara Magliacane;Kun Zhang", "authorids": "~Biwei_Huang1;~Fan_Feng2;~Chaochao_Lu1;~Sara_Magliacane1;~Kun_Zhang1", "gender": "F;;;F;M", "homepage": ";;https://causallu.com/;http://saramagliacane.github.io;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "165/3288;;142/2790;120/5256;96/3115-1", "google_scholar": ";;C_Qxt0IAAAAJ;https://scholar.google.nl/citations?user=H3j_zQ4AAAAJ;RGoypN4AAAAJ", "orcid": ";;;;", "linkedin": ";;;magliacane/;", "or_profile": "~Biwei_Huang1;~Fan_Feng2;~Chaochao_Lu1;~Sara_Magliacane1;~Kun_Zhang1", "aff": "Carnegie Mellon University;;University of Cambridge;MIT-IBM Watson AI Lab;Carnegie Mellon University", "aff_domain": "cmu.edu;;cam.ac.uk;mit.edu;cmu.edu", "position": "PhD student;;PhD student;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nhuang2022adarl,\ntitle={Ada{RL}: What, Where, and How to Adapt in Transfer Reinforcement Learning},\nauthor={Biwei Huang and Fan Feng and Chaochao Lu and Sara Magliacane and Kun Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8H5bpVwvt5}\n}", "github": "", "project": "", "reviewers": "i8j9;6qyY;awgy;DmF1", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;4;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "132;65;76;138", "wc_summary_review": "28;75;15;39", "wc_main_review": "498;737;404;228", "wc_review": "658;877;495;405", "wc_reply_reviewers": "76;20;0;0", "wc_reply_authors": "1470;1200;819;0", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;1;0", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.75, 32.5528416578338 ], "wc_summary_review_avg": [ 39.25, 22.320114247019436 ], "wc_main_review_avg": [ 466.75, 183.67821727140102 ], "wc_review_avg": [ 608.75, 179.4691825913296 ], "wc_reply_reviewers_avg": [ 24.0, 31.11269837220809 ], "wc_reply_authors_avg": [ 872.25, 554.1616979727127 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 1.118033988749895 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6560728254700453684&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=8H5bpVwvt5", "email": "cmu.edu;;cam.ac.uk;mit.edu;cmu.edu", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;University of Cambridge;Massachusetts Institute of Technology", "aff_unique_dep": ";;IBM Watson AI Lab", "aff_unique_url": "https://www.cmu.edu;https://www.cam.ac.uk;https://www.mitibmwatsonailab.org", "aff_unique_abbr": "CMU;Cambridge;MIT-IBM AI Lab", 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "8IXBbFjkMat", "title": "Bag-of-Vectors Autoencoders for Unsupervised Conditional Text Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Text autoencoders are often used for unsupervised conditional text generation by applying mappings in the latent space to change attributes to the desired values. Recently, Mai et al. (2020) proposed $\\operatorname{Emb2Emb}$, a method to $\\textit{learn}$ these mappings in the embedding space of an autoencoder. However, their method is restricted to autoencoders with a single-vector embedding, which limits how much information can be retained. We address this issue by extending their method to $\\textit{Bag-of-Vectors Autoencoders}$ (BoV-AEs), which encode the text into a variable-size bag of vectors that grows with the size of the text, as in attention-based models. This allows to encode and reconstruct much longer texts than standard autoencoders. Analogous to conventional autoencoders, we propose regularization techniques that facilitate learning meaningful operations in the latent space. Finally, we adapt $\\operatorname{Emb2Emb}$ for a training scheme that learns to map an input bag to an output bag, including a novel loss function and neural architecture. Our experimental evaluations on unsupervised sentiment transfer and sentence summarization show that our method performs substantially better than a standard autoencoder.", "keywords": "autoencoders;latent space learning;variable-size;natural language processing", "primary_area": "", "supplementary_material": "/attachment/cbd08d39be18585fe5b3a3b0ee46c55ade675d2e.zip", "author": "Florian Mai;James Henderson", "authorids": "~Florian_Mai1;~James_Henderson1", "gender": "Non-Binary;M", "homepage": ";http://idiap.ch/~jhenderson/", "dblp": "200/7899;h/JamesHenderson.html", "google_scholar": "MfETM20AAAAJ;CSib0ooAAAAJ", "orcid": ";0000-0003-3714-4799", "linkedin": ";james-henderson-3b68346b/", "or_profile": "~Florian_Mai1;~James_Henderson1", "aff": "Idiap Research Institute;Idiap Research Institute", "aff_domain": "idiap.ch;idiap.ch", "position": "PhD student;Senior Researcher", "bibtex": "@misc{\nmai2022bagofvectors,\ntitle={Bag-of-Vectors Autoencoders for Unsupervised Conditional Text Generation},\nauthor={Florian Mai and James Henderson},\nyear={2022},\nurl={https://openreview.net/forum?id=8IXBbFjkMat}\n}", "github": "", "project": "", "reviewers": "hEPH;jHbb;74E4;kHZy", "site": "https://openreview.net/forum?id=8IXBbFjkMat", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;3", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "80;36;171;73", "wc_summary_review": "24;15;16;21", "wc_main_review": "228;234;247;509", "wc_review": "332;285;434;603", "wc_reply_reviewers": "0;250;0;36", "wc_reply_authors": "378;1391;373;278", "reply_reviewers": "0;2;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.0, 49.663870167356066 ], "wc_summary_review_avg": [ 19.0, 3.6742346141747673 ], "wc_main_review_avg": [ 304.5, 118.26770480566536 ], "wc_review_avg": [ 413.5, 121.94773470630768 ], "wc_reply_reviewers_avg": [ 
71.5, 104.09971181516306 ], "wc_reply_authors_avg": [ 605.0, 455.5430824850708 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2309257038700767049&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Idiap Research Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.idiap.ch", "aff_unique_abbr": "Idiap", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "8KD0wdSF2NE", "title": "A composable autoencoder-based algorithm for accelerating numerical simulations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Numerical simulations for engineering applications solve partial differential equations (PDE) to model various physical processes. Traditional PDE solvers are very accurate but computationally costly. On the other hand, Machine Learning (ML) methods offer a significant computational speedup but face challenges with accuracy and generalization to different PDE conditions, such as geometry, boundary conditions, initial conditions and PDE source terms. In this work, we propose a novel ML-based approach, CoAE-MLSim, which is an unsupervised, lower-dimensional, local method that is motivated by key ideas used in commercial PDE solvers. This allows our approach to learn effectively from relatively few samples of PDE solutions. The proposed ML approach is compared against commercial solvers, for stronger benchmarks, as well as against the latest ML approaches for solving PDEs. It is tested for a variety of complex engineering cases to demonstrate its computational speed, accuracy, scalability, and generalization across different PDE conditions.
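As a heavily hedged toy sketch of the general pattern the CoAE-MLSim abstract describes — encoding solutions into a lower-dimensional space and iterating there — the following uses hypothetical encoder, decoder, and latent-update modules; the actual method is local and composable across subdomains, which this sketch does not capture.

```python
import torch
import torch.nn as nn

# Toy sketch of "solve in a learned lower-dimensional space". All module
# shapes are hypothetical and chosen only to make the example runnable.
enc = nn.Sequential(nn.Linear(64, 8), nn.Tanh())   # encode a local solution patch
dec = nn.Sequential(nn.Linear(8, 64))              # decode back to the patch
step = nn.Sequential(nn.Linear(8, 8), nn.Tanh())   # learned update in latent space

u = torch.randn(1, 64)        # flattened solution values on one subdomain
z = enc(u)
for _ in range(10):           # iterate toward a steady state in latent space
    z = z + step(z)           # residual latent update
u_new = dec(z)                # decoded approximate solution
print(u_new.shape)            # torch.Size([1, 64])
```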
The results show that our approach captures physics accurately across all metrics of comparison (including measures such as results on section cuts and lines).", "keywords": "numerical simulations;machine learning", "primary_area": "", "supplementary_material": "", "author": "Rishikesh Ranade;Derek Christopher Hill;Haiyang He;Amir Maleki;Norman Chang;Jay Pathak", "authorids": "~Rishikesh_Ranade1;~Derek_Christopher_Hill1;~Haiyang_He2;~Amir_Maleki1;norman.chang@ansys.com;~Jay_Pathak1", "gender": "M;M;;M;;M", "homepage": ";https://ansys.com;https://www.linkedin.com/in/he-haiyang-25a344a3/;;;https://www.linkedin.com/in/jayppathak/", "dblp": ";;;;;", "google_scholar": "Gb7qV4oAAAAJ;;;Rc2jOxoAAAAJ;;WKsh_bMAAAAJ", "orcid": ";;;;;", "linkedin": ";;;amaleki2/;;jayppathak/", "or_profile": "~Rishikesh_Ranade1;~Derek_Christopher_Hill1;~Haiyang_He2;~Amir_Maleki1;norman.chang@ansys.com;~Jay_Pathak1", "aff": "Ansys Inc;Ansys, Inc.;ANSYS;Ansys Inc;;Ansys Inc.", "aff_domain": "ansys.com;ansys.com;ansys.com;ansys.com;;ansys.com", "position": "Researcher;Chief Technologist, Fluids;Researcher;Machine Learning R&D;;Director, Research & Software Development", "bibtex": "@misc{\nranade2022a,\ntitle={A composable autoencoder-based algorithm for accelerating numerical simulations},\nauthor={Rishikesh Ranade and Derek Christopher Hill and Haiyang He and Amir Maleki and Norman Chang and Jay Pathak},\nyear={2022},\nurl={https://openreview.net/forum?id=8KD0wdSF2NE}\n}", "github": "", "project": "", "reviewers": "WDpc;dyhE;ji7S;hE8v", "site": "https://openreview.net/forum?id=8KD0wdSF2NE", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;2;3;3", "correctness": "2;3;2;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;0;3", "wc_summary_paper": "52;34;49;12", "wc_summary_review": "22;133;58;30", "wc_main_review": "595;341;1553;103", "wc_review": "669;508;1660;145", "wc_reply_reviewers": "93;93;396;0", "wc_reply_authors": "1587;1066;2682;870", "reply_reviewers": "1;1;1;0", "reply_authors": "3;2;5;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.5 ], "wc_summary_paper_avg": [ 36.75, 15.833114033569013 ], "wc_summary_review_avg": [ 60.75, 43.80282525134652 ], "wc_main_review_avg": [ 648.0, 550.7059106274419 ], "wc_review_avg": [ 745.5, 561.0634990800952 ], "wc_reply_reviewers_avg": [ 145.5, 149.52675345903822 ], "wc_reply_authors_avg": [ 1551.25, 703.4633519238938 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 1.0897247358851685 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XhENFD-Egy8J:scholar.google.com/&scioq=A+composable+autoencoder-based+algorithm+for+accelerating+numerical+simulations&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Ansys;Ansys Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.ansys.com;https://www.ansys.com", "aff_unique_abbr": "Ansys;Ansys", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "8MN_GH4Ckp4", "title": "Model Compression via Symmetries of the Parameter Space", "track": "main", "status": "Reject", "tldr": "", "abstract": "We provide a theoretical 
framework for neural networks in terms of the representation theory of quivers, thus revealing symmetries of the parameter space of neural networks. An exploitation of these symmetries leads to a model compression algorithm for radial neural networks based on an analogue of the QR decomposition. The algorithm is lossless; the compressed model has the same feedforward function as the original model. If applied before training, optimization of the compressed model by gradient descent is equivalent to a projected version of gradient descent on the original model. ", "keywords": "symmetry;orthogonal group;quiver representation;representation theory;model compression;parameter optimization;projected gradient descent", "primary_area": "", "supplementary_material": "/attachment/7b9adfd8270731269e4dc637895baf643dce81ee.zip", "author": "Iordan Ganev;Robin Walters", "authorids": "~Iordan_Ganev1;~Robin_Walters1", "gender": ";M", "homepage": "https://ivganev.github.io/;http://www.robinwalters.com", "dblp": ";258/3416", "google_scholar": ";fnprJmUAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Iordan_Ganev1;~Robin_Walters1", "aff": "Institute for Computing and Information Sciences, Radboud University Nijmegen, Radboud University;Northeastern University ", "aff_domain": "cs.ru.nl;northeastern.edu", "position": "Postdoc;Assistant Professor", "bibtex": "@misc{\nganev2022model,\ntitle={Model Compression via Symmetries of the Parameter Space},\nauthor={Iordan Ganev and Robin Walters},\nyear={2022},\nurl={https://openreview.net/forum?id=8MN_GH4Ckp4}\n}", "github": "", "project": "", "reviewers": "EGB6;6ipT;uaQ7", "site": "https://openreview.net/forum?id=8MN_GH4Ckp4", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;3;4", "correctness": "3;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "56;74;199", "wc_summary_review": "67;29;104", "wc_main_review": "218;550;548", "wc_review": "341;653;851", "wc_reply_reviewers": "0;0;211", "wc_reply_authors": "517;700;823", "reply_reviewers": "0;0;2", "reply_authors": "1;1;3", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 109.66666666666667, 63.594199595735326 ], "wc_summary_review_avg": [ 66.66666666666667, 30.619528989773105 ], "wc_main_review_avg": [ 438.6666666666667, 156.03703264147123 ], "wc_review_avg": [ 615.0, 209.93332274796205 ], "wc_reply_reviewers_avg": [ 70.33333333333333, 99.46635388690768 ], "wc_reply_authors_avg": [ 680.0, 125.72191535289303 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10416459817545960462&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Radboud University;Northeastern University", "aff_unique_dep": "Institute for Computing and Information Sciences;", "aff_unique_url": "https://www.ru.nl;https://www.northeastern.edu", "aff_unique_abbr": "RU;NEU", "aff_campus_unique_index": "0", "aff_campus_unique": "Nijmegen;", "aff_country_unique_index": "0;1", 
"aff_country_unique": "Netherlands;United States" }, { "title": "Relational Multi-Task Learning: Modeling Relations between Data and Tasks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5979", "id": "8Py-W8lSUgy", "poster": "", "openreview": "https://openreview.net/forum?id=8Py-W8lSUgy", "slides": "https://iclr.cc/virtual/2022/poster/5979", "video": "https://iclr.cc/virtual/2022/poster/5979", "author_site": "Kaidi Cao, Jiaxuan You, Jure Leskovec", "tldr": "", "abstract": "A key assumption in multi-task learning is that at the inference time the multi-task model only has access to a given data point but not to the data point\u2019s labels from other tasks. This presents an opportunity to extend multi-task learning to utilize data point\u2019s labels from other auxiliary tasks, and this way improves performance on the new task. Here we introduce a novel relational multi-task learning setting where we leverage data point labels from auxiliary tasks to make more accurate predictions on the new task. We develop MetaLink, where our key innovation is to build a knowledge graph that connects data points and tasks and thus allows us to leverage labels from auxiliary tasks. The knowledge graph consists of two types of nodes: (1) data nodes, where node features are data embeddings computed by the neural network, and (2) task nodes, with the last layer\u2019s weights for each task as node features. The edges in this knowledge graph capture data-task relationships, and the edge label captures the label of a data point on a particular task. Under MetaLink, we reformulate the new task as a link label prediction problem between a data node and a task node. The MetaLink framework provides flexibility to model knowledge transfer from auxiliary task labels to the task of interest. We evaluate MetaLink on 6 benchmark datasets in both biochemical and vision domains. 
Experiments demonstrate that MetaLink can successfully utilize the relations among different tasks, outperforming the state-of-the-art methods under the proposed relational multi-task learning setting, with up to 27% improvement in ROC AUC.", "keywords": "Graph Neural Networks;Relational Representation Learning;Multi-task Learning;Meta Learning", "primary_area": "", "supplementary_material": "", "author": "Kaidi Cao;Jiaxuan You;Jure Leskovec", "authorids": "~Kaidi_Cao1;~Jiaxuan_You2;~Jure_Leskovec1", "gender": "M;;M", "homepage": "https://ai.stanford.edu/~kaidicao/;http://cs.stanford.edu/~jure/;https://cs.stanford.edu/~jiaxuan/", "dblp": "203/8207;l/JureLeskovec;192/4727", "google_scholar": "https://scholar.google.com.hk/citations?user=4Zw1PJ8AAAAJ;Q_kKkIUAAAAJ;NDbMl7oAAAAJ", "orcid": ";0000-0002-5411-923X;", "linkedin": ";leskovec/;jiaxuan-you-5859b37b/", "or_profile": "~Kaidi_Cao1;~Jure_Leskovec1;~Jiaxuan_You1", "aff": "Stanford University;Kumo.AI;Stanford University", "aff_domain": "stanford.edu;kumo.ai;stanford.edu", "position": "PhD student;Chief Scientist;PhD student", "bibtex": "@inproceedings{\ncao2022relational,\ntitle={Relational Multi-Task Learning: Modeling Relations between Data and Tasks},\nauthor={Kaidi Cao and Jiaxuan You and Jure Leskovec},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8Py-W8lSUgy}\n}", "github": "", "project": "", "reviewers": "QALx;2M5P;jGkV;ijJF", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "13;59;47;83", "wc_summary_review": "21;51;162;32", "wc_main_review": "153;327;62;157", "wc_review": "187;437;271;272", "wc_reply_reviewers": "0;14;0;0", "wc_reply_authors": "743;614;396;228", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 50.5, 25.233905761891084 ], "wc_summary_review_avg": [ 66.5, 56.171612047367844 ], "wc_main_review_avg": [ 174.75, 95.7610959628178 ], "wc_review_avg": [ 291.75, 90.67903561463366 ], "wc_reply_reviewers_avg": [ 3.5, 6.06217782649107 ], "wc_reply_authors_avg": [ 495.25, 197.96132829418983 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1131898779823896273&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=8Py-W8lSUgy", "email": "stanford.edu;kumo.ai;stanford.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Stanford University;Kumo.AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.kumo.ai", "aff_unique_abbr": "Stanford;Kumo.AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "8QE3pwEVc8P", "title": "Zero-Cost Operation Scoring in Differentiable Architecture Search", "track": "main", "status": "Reject", "tldr": "", "abstract": "Differentiable neural architecture search (NAS) has attracted 
significant attention in recent years due to its ability to quickly discover promising architectures of deep neural networks even in very large search spaces. Despite its success, many differentiable NAS methods lack robustness and may degenerate to trivial architectures with excessive parameter-free operations such as skip connections, thus leading to inferior performance. In fact, selecting operations based on the magnitude of architectural parameters was recently proven to be fundamentally wrong, showcasing the need to rethink how operation scoring and selection occur in differentiable NAS. To this end, we formalize and analyze a fundamental component of differentiable NAS: local \"operation scoring\" that occurs at each choice of operation.\nWhen comparing existing operation scoring functions, we find that they can be viewed as inexact proxies for accuracy.\nWe also find that existing methods perform poorly when analyzed empirically on NAS benchmarks. From this perspective, we introduce new training-free proxies to the context of differentiable NAS, and show that we can significantly speed up the search process while improving accuracy on multiple search spaces. We take inspiration from zero-cost proxies that were recently studied in the context of sample-based NAS but shown to degrade significantly for larger search spaces like DARTS. Our novel \"perturbation-based zero-cost operation scoring\" (Zero-Cost-PT) improves search time and accuracy compared to the best available differentiable architecture search methods across many search-space sizes, including very large ones. Specifically, we are able to improve accuracy compared to the best current method (DARTS-PT) on the DARTS CNN search space while being over 40x faster (a total search time of 25 minutes on a single GPU).
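As an illustrative stand-in for the training-free scoring described above — a generic gradient-norm zero-cost proxy, not necessarily the paper's exact Zero-Cost-PT scoring or its perturbation-based selection — candidate operations could be ranked like this:

```python
import torch
import torch.nn as nn

def grad_norm_score(op, x, y):
    """Generic zero-cost proxy: score an operation by the gradient norm of a
    one-batch loss through its parameters. A stand-in for the proxies used
    in zero-cost NAS, not the paper's specific scoring function."""
    loss = nn.functional.mse_loss(op(x), y)
    grads = torch.autograd.grad(loss, list(op.parameters()))
    return sum(g.abs().sum().item() for g in grads)

x, y = torch.randn(8, 16), torch.randn(8, 16)
candidates = {"linear": nn.Linear(16, 16),
              "mlp": nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16))}
scores = {name: grad_norm_score(op, x, y) for name, op in candidates.items()}
print(max(scores, key=scores.get))  # pick the highest-scoring operation
```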
Our code is available at: https://github.com/avail-upon-acceptance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lichuan Xiang;\u0141ukasz Dudziak;Mohamed S Abdelfattah;Thomas Chun Pong Chau;Nicholas Donald Lane;Hongkai Wen", "authorids": "~Lichuan_Xiang1;~\u0141ukasz_Dudziak1;~Mohamed_S_Abdelfattah1;~Thomas_Chun_Pong_Chau1;~Nicholas_Donald_Lane1;~Hongkai_Wen1", "gender": "M;M;M;M;;", "homepage": ";;https://mohsaied.github.io/;http://www.thomaschau.uk/;;", "dblp": "294/8850;228/7987;124/7095;https://dblp.uni-trier.de/pid/18/5848;;", "google_scholar": ";R47NvpoAAAAJ;https://scholar.google.ca/citations?user=q4wBpWAAAAAJ;S1ijDzAAAAAJ;;", "orcid": ";;;0000-0001-5419-3029;;", "linkedin": "lichuan-xiang-17ab43101/;;mabdelfattah/;thomasccp;;", "or_profile": "~Lichuan_Xiang1;~\u0141ukasz_Dudziak1;~Mohamed_S_Abdelfattah1;~Thomas_Chun_Pong_Chau1;~Nicholas_Donald_Lane1;~Hongkai_Wen1", "aff": "The university of Warwick;Samsung;Samsung AI Center;Samsung;;", "aff_domain": "warwick.ac.uk;samsung.com;samsung.com;samsung.com;;", "position": "PhD student;Software Engineer;Principal Scientist;Researcher;;", "bibtex": "@misc{\nxiang2022zerocost,\ntitle={Zero-Cost Operation Scoring in Differentiable Architecture Search},\nauthor={Lichuan Xiang and {\\L}ukasz Dudziak and Mohamed S Abdelfattah and Thomas Chun Pong Chau and Nicholas Donald Lane and Hongkai Wen},\nyear={2022},\nurl={https://openreview.net/forum?id=8QE3pwEVc8P}\n}", "github": "", "project": "", "reviewers": "86cZ;q4ge;G1Xa;podk", "site": "https://openreview.net/forum?id=8QE3pwEVc8P", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "26;95;55;112", "wc_summary_review": "71;39;51;44", "wc_main_review": "125;170;103;470", "wc_review": "222;304;209;626", "wc_reply_reviewers": "132;109;32;205", "wc_reply_authors": "1299;1178;327;1641", "reply_reviewers": "1;1;1;1", "reply_authors": "2;2;1;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.0, 33.66749173906485 ], "wc_summary_review_avg": [ 51.25, 12.173228823939851 ], "wc_main_review_avg": [ 217.0, 148.05235560436046 ], "wc_review_avg": [ 340.25, 168.95025155352684 ], "wc_reply_reviewers_avg": [ 119.5, 61.71102008555684 ], "wc_reply_authors_avg": [ 1111.25, 483.5774886199729 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13670892211175721729&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Warwick;Samsung", "aff_unique_dep": ";Samsung", "aff_unique_url": "https://warwick.ac.uk;https://www.samsung.com", "aff_unique_abbr": "Warwick;Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;South Korea" }, { "id": "8TnLOVrNRNp", "title": "Visual TransforMatcher: Efficient Match-to-Match Attention for Visual Correspondence", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Establishing correspondences between images remains a 
challenging task, especially under large appearance changes due to different viewpoints and intra-class variations. In this work, we introduce a strong image matching learner, dubbed \\textit{Visual Transformatcher}, which builds on the success of the Transformers in vision domains. Unlike previous self-attention schemes over image matches, it performs match-to-match attention for precise match localization and dynamically updates matching scores in a global context. \nTo handle a large number of candidate matches in a dense correlation map, we develop a light-weight architecture with an effective positional encoding technique for matching. In experiments, our method achieves the new state of the art on the SPair-71k dataset, while performing on par with existing state-of-the-art models on the PF-PASCAL and PF-WILLOW datasets, showing the effectiveness of the proposed approach. We also provide the results of extensive ablation studies to justify the design choices of our model. The code and trained weights will be released upon acceptance.", "keywords": "image matching;semantic correspondence;visual transformer;4D attention", "primary_area": "", "supplementary_material": "", "author": "Seung Wook Kim;Juhong Min;Minsu Cho", "authorids": "~Seung_Wook_Kim2;~Juhong_Min1;~Minsu_Cho1", "gender": "M;;M", "homepage": "https://wookiekim.github.io;;http://cvlab.postech.ac.kr/~mcho/", "dblp": "07/10150-5;;", "google_scholar": "kZ4AN54AAAAJ;;5TyoF5QAAAAJ", "orcid": ";;", "linkedin": "seung-wook-kim-77b9bb117/;;minsu-cho-062b3750/", "or_profile": "~Seung_Wook_Kim2;~Juhong_Min1;~Minsu_Cho1", "aff": "POSTECH;;POSTECH", "aff_domain": "postech.ac.kr;;postech.ac.kr", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\nkim2022visual,\ntitle={Visual TransforMatcher: Efficient Match-to-Match Attention for Visual Correspondence},\nauthor={Seung Wook Kim and Juhong Min and Minsu Cho},\nyear={2022},\nurl={https://openreview.net/forum?id=8TnLOVrNRNp}\n}", "github": "", "project": "", "reviewers": "ARbp;HnVv;YJJy;rKA6", "site": "https://openreview.net/forum?id=8TnLOVrNRNp", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "2;4;3;4", "correctness": "3;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "46;82;125;20", "wc_summary_review": "12;126;89;13", "wc_main_review": "192;774;853;196", "wc_review": "250;982;1067;229", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 68.25, 39.47388377142538 ], "wc_summary_review_avg": [ 60.0, 49.26966612430005 ], "wc_main_review_avg": [ 503.75, 311.00994759010524 ], "wc_review_avg": [ 632.0, 393.7188082883519 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5Hu0rrWu9_sJ:scholar.google.com/&scioq=Visual+TransforMatcher:+Efficient+Match-to-Match+Attention+for+Visual+Correspondence&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Pohang University of 
Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "8WRYT8QAcj", "title": "COMBO: Conservative Offline Model-Based Policy Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Model-based reinforcement learning (RL) algorithms, which learn a dynamics model from logged experience and perform conservative planning under the learned model, have emerged as a promising paradigm for offline reinforcement learning (offline RL). However, practical variants of such model-based algorithms rely on explicit uncertainty quantification for incorporating conservatism. Uncertainty estimation with complex models, such as deep neural networks, can be difficult and unreliable. We empirically find that uncertainty estimation is not accurate and leads to poor performance in certain scenarios in offline model-based RL. We overcome this limitation by developing a new model-based offline RL algorithm, COMBO, that trains a value function using both the offline dataset and data generated using rollouts under the model while also additionally regularizing the value function on out-of-support state-action tuples generated via model rollouts. This results in a conservative estimate of the value function for out-of-support state-action tuples, without requiring explicit uncertainty estimation. Through extensive experiments, we find that COMBO attains greater performance compared to prior offline RL on problems that demand generalization to related but previously unseen tasks, and also consistently matches or outperforms prior offline RL methods on widely studied offline RL benchmarks, including image-based tasks.", "keywords": "offline reinforcement learning;model-based reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/03ba21e0cf7ade022b07576c9776b5828260ab90.zip", "author": "Tianhe Yu;Aviral Kumar;Rafael Rafailov;Aravind Rajeswaran;Sergey Levine;Chelsea Finn", "authorids": "~Tianhe_Yu1;~Aviral_Kumar2;~Rafael_Rafailov1;~Aravind_Rajeswaran1;~Sergey_Levine1;~Chelsea_Finn1", "gender": "M;M;M;M;M;F", "homepage": "https://cs.stanford.edu/~tianheyu/;https://aviralkumar2907.github.io/;https://rmrafailov.github.io/;http://aravindr93.github.io/;https://people.eecs.berkeley.edu/~svlevine/;https://ai.stanford.edu/~cbfinn/", "dblp": "192/1797;202/7961;272/5358;164/5778;80/7594;131/1783", "google_scholar": ";;TwABcRgAAAAJ;_EJrRVAAAAAJ;8R35rCwAAAAJ;vfPE6hgAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Tianhe_Yu1;~Aviral_Kumar2;~Rafael_Rafailov1;~Aravind_Rajeswaran1;~Sergey_Levine1;~Chelsea_Finn1", "aff": "Stanford University;University of California, Berkeley;Stanford University;Meta Facebook;Google;Google", "aff_domain": "stanford.edu;berkeley.edu;stanford.edu;meta.com;google.com;google.com", "position": "PhD student;PhD student;MS student;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nyu2022combo,\ntitle={{COMBO}: Conservative Offline Model-Based Policy Optimization},\nauthor={Tianhe Yu and Aviral Kumar and Rafael Rafailov and Aravind Rajeswaran and Sergey Levine and Chelsea Finn},\nyear={2022},\nurl={https://openreview.net/forum?id=8WRYT8QAcj}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=8WRYT8QAcj", "pdf_size": 0, "recommendation": "", "confidence": "", 
"correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 502, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14724540879421190856&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0;2;3;3", "aff_unique_norm": "Stanford University;University of California, Berkeley;Meta;Google", "aff_unique_dep": ";;Meta Platforms, Inc.;Google", "aff_unique_url": "https://www.stanford.edu;https://www.berkeley.edu;https://meta.com;https://www.google.com", "aff_unique_abbr": "Stanford;UC Berkeley;Meta;Google", "aff_campus_unique_index": "0;1;0;3;3", "aff_campus_unique": "Stanford;Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Label Encoding for Regression Networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6164", "id": "8WawVDdKqlL", "poster": "", "openreview": "https://openreview.net/forum?id=8WawVDdKqlL", "slides": "https://iclr.cc/virtual/2022/poster/6164", "video": "https://iclr.cc/virtual/2022/poster/6164", "author_site": "Deval Shah, Zi Yu Xue, Tor Aamodt", "tldr": "", "abstract": "Deep neural networks are used for a wide range of regression problems. However, there exists a significant gap in accuracy between specialized approaches and generic direct regression in which a network is trained by minimizing the squared or absolute error of output labels. Prior work has shown that solving a regression problem with a set of binary classifiers can improve accuracy by utilizing well-studied binary classification algorithms. We introduce binary-encoded labels (BEL), which generalizes the application of binary classification to regression by providing a framework for considering arbitrary multi-bit values when encoding target values. We identify desirable properties of suitable encoding and decoding functions used for the conversion between real-valued and binary-encoded labels based on theoretical and empirical study. These properties highlight a tradeoff between classification error probability and error-correction capabilities of label encodings. BEL can be combined with off-the-shelf task-specific feature extractors and trained end-to-end. We propose a series of sample encoding, decoding, and training loss functions for BEL and demonstrate they result in lower error than direct regression and specialized approaches while being suitable for a diverse set of regression problems, network architectures, and evaluation metrics. BEL achieves state-of-the-art accuracies for several regression benchmarks. 
Code is available at https://github.com/ubc-aamodt-group/BEL_regression.\n", "keywords": "Regression;Label encoding;Output codes", "primary_area": "", "supplementary_material": "/attachment/19a93cadaf0cf76f826b30f92d720dbe009aa393.zip", "author": "Deval Shah;Zi Yu Xue;Tor Aamodt", "authorids": "~Deval_Shah1;~Zi_Yu_Xue1;~Tor_Aamodt1", "gender": "F;M;M", "homepage": ";;http://www.ece.ubc.ca/~aamodt/", "dblp": "217/0997;;a/TorMAamodt", "google_scholar": "https://scholar.google.ca/citations?hl=en;;https://scholar.google.ca/citations?user=zCsB5XsAAAAJ", "orcid": ";;0000-0003-1161-692X", "linkedin": "deval-shah-91485867/;fisherxue;tor-aamodt-2811564/", "or_profile": "~Deval_Shah1;~Zi_Yu_Xue1;~Tor_Aamodt1", "aff": "University of British Columbia;University of British Columbia;University of British Columbia", "aff_domain": "ubc.ca;ubc.ca;ubc.ca", "position": "PhD student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nshah2022label,\ntitle={Label Encoding for Regression Networks},\nauthor={Deval Shah and Zi Yu Xue and Tor Aamodt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8WawVDdKqlL}\n}", "github": "", "project": "", "reviewers": "qL8F;HgS5;zpsJ;t2zZ", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "50;83;77;66", "wc_summary_review": "19;91;56;68", "wc_main_review": "185;328;667;470", "wc_review": "254;502;800;604", "wc_reply_reviewers": "0;23;25;16", "wc_reply_authors": "972;1270;1704;697", "reply_reviewers": "0;1;1;1", "reply_authors": "2;2;3;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 69.0, 12.549900398011133 ], "wc_summary_review_avg": [ 58.5, 26.043233286210835 ], "wc_main_review_avg": [ 412.5, 178.16635484849544 ], "wc_review_avg": [ 540.0, 196.80955261368794 ], "wc_reply_reviewers_avg": [ 16.0, 9.82344135219425 ], "wc_reply_authors_avg": [ 1160.75, 373.4122219478093 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17134575941397611216&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=8WawVDdKqlL", "email": "ubc.ca;ubc.ca;ubc.ca", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of British Columbia", "aff_unique_dep": "", "aff_unique_url": "https://www.ubc.ca", "aff_unique_abbr": "UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "8Wdj6IJsSyJ", "title": "Fully differentiable model discovery", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model discovery aims at autonomously discovering differential equations underlying a dataset. Approaches based on Physics Informed Neural Networks (PINNs) have shown great promise, but a fully-differentiable model which explicitly learns the equation has remained elusive. 
In this paper we propose such an approach by integrating neural network-based surrogates with Sparse Bayesian Learning (SBL). This combination yields a robust model discovery algorithm, which we showcase on various datasets. We then identify a connection with multitask learning, and build on it to construct Physics Informed Normalizing Flows (PINFs). We present a proof-of-concept using a PINF to directly learn a density model from single particle data. Our work expands PINNs to various types of neural network architectures, and connects neural network-based surrogates to the rich field of Bayesian parameter inference.", "keywords": "Model discovery;Sparse Bayesian Learning;Normalizing Flows", "primary_area": "", "supplementary_material": "", "author": "Gert-Jan Both;Remy Kusters", "authorids": "~Gert-Jan_Both1;~Remy_Kusters1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "w8oKI8wAAAAJ;https://scholar.google.fr/citations?user=442FIp8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Gert-Jan_Both1;~Remy_Kusters1", "aff": "Pasqal;International Business Machines", "aff_domain": "pasqal.com;ibm.com", "position": "Researcher;Researcher", "bibtex": "@misc{\nboth2022fully,\ntitle={Fully differentiable model discovery},\nauthor={Gert-Jan Both and Remy Kusters},\nyear={2022},\nurl={https://openreview.net/forum?id=8Wdj6IJsSyJ}\n}", "github": "", "project": "", "reviewers": "MdGD;F5HG;rgZy;VxhG", "site": "https://openreview.net/forum?id=8Wdj6IJsSyJ", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;3;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "21;64;79;120", "wc_summary_review": "56;51;72;72", "wc_main_review": "375;526;299;611", "wc_review": "452;641;450;803", "wc_reply_reviewers": "338;0;0;0", "wc_reply_authors": "681;941;730;1056", "reply_reviewers": "1;0;0;0", "reply_authors": "1;2;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 71.0, 35.40480193420096 ], "wc_summary_review_avg": [ 62.75, 9.41740410091868 ], "wc_main_review_avg": [ 452.75, 122.5691131566187 ], "wc_review_avg": [ 586.5, 147.1096529803534 ], "wc_reply_reviewers_avg": [ 84.5, 146.35829323957014 ], "wc_reply_authors_avg": [ 852.0, 153.0212403557101 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2833834271673705755&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Pasqal;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.pasqal.com;https://www.ibm.com", "aff_unique_abbr": ";IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "France;United States" }, { "id": "8XM-AXMnAk_", "title": "Deep Active Learning by Leveraging Training Dynamics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Active learning theories and methods have been extensively studied in classical statistical learning settings.
However, deep active learning, i.e., active learning with deep learning models, is usually based on empirical criteria without solid theoretical justification, and such criteria attract heavy doubt when they fail to provide benefits in applications. In this paper, by exploring the connection between generalization performance and training dynamics, we propose a theory-driven deep active learning method (dynamicAL) which selects samples to maximize training dynamics. In particular, we prove that the convergence speed of training and the generalization performance are positively correlated under the ultra-wide condition, and we show that maximizing the training dynamics leads to better generalization performance. Furthermore, to scale up to large deep neural networks and datasets, we introduce two relaxations of the subset selection problem and reduce the time complexity from polynomial to constant. Empirical results show that dynamicAL not only consistently outperforms the other baselines but also scales well to large deep learning models. We hope our work inspires more attempts at bridging theoretical findings on deep networks with practical impact in deep active learning applications.", "keywords": "deep learning;active learning;neural tangent kernel", "primary_area": "", "supplementary_material": "", "author": "Haonan Wang;Wei Huang;Hanghang Tong;Andrew J Margenot;Jingrui He", "authorids": "~Haonan_Wang1;~Wei_Huang6;~Hanghang_Tong3;~Andrew_J_Margenot1;~Jingrui_He1", "gender": "M;M;;;F", "homepage": "http://charles-haonan-wang.me/;https://weihuang05.github.io/;http://tonghanghang.org;https://margenot.cropsciences.illinois.edu/;https://www.hejingrui.org", "dblp": ";81/6685-34;58/1757;;34/2685", "google_scholar": "cLziVZMAAAAJ;RZfDh4MAAAAJ;RaINcuUAAAAJ;;hXpZynkAAAAJ", "orcid": "0009-0006-6963-8987;0000-0001-5674-7021;0000-0003-4405-3887;;0000-0002-6429-6272", "linkedin": ";;htong/;;", "or_profile": "~Haonan_Wang1;~Wei_Huang6;~Hanghang_Tong3;~Andrew_J_Margenot1;~Jingrui_He1", "aff": ";RIKEN AIP;University of Illinois, Urbana Champaign;University of Illinois, Urbana-Champaign;University of Illinois, Urbana Champaign", "aff_domain": ";riken.jp;illinois.edu;uiuc.edu;illinois.edu", "position": ";Postdoc;Associate Professor;Assistant Professor;Associate Professor", "bibtex": "@misc{\nwang2022deep,\ntitle={Deep Active Learning by Leveraging Training Dynamics},\nauthor={Haonan Wang and Wei Huang and Hanghang Tong and Andrew J Margenot and Jingrui He},\nyear={2022},\nurl={https://openreview.net/forum?id=8XM-AXMnAk_}\n}", "github": "", "project": "", "reviewers": "YgGb;j1Xn;p3z9;8dDv", "site": "https://openreview.net/forum?id=8XM-AXMnAk_", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;3;3;3", "correctness": "3;4;3;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "21;98;31;213", "wc_summary_review": "9;76;29;46", "wc_main_review": "798;259;141;481", "wc_review": "828;433;201;740", "wc_reply_reviewers": "1336;0;145;30", "wc_reply_authors": "2280;563;865;1080", "reply_reviewers": "3;0;1;1", "reply_authors": "5;1;2;3", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 90.75, 76.53879735140865 ], "wc_summary_review_avg": [ 40.0, 24.566236993076494 ], "wc_main_review_avg": [ 419.75, 250.18330779650347 ], "wc_review_avg": [ 550.5,
249.43586350001877 ], "wc_reply_reviewers_avg": [ 377.75, 555.8868477487123 ], "wc_reply_authors_avg": [ 1197.0, 651.6820543792809 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7740339318824531883&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "RIKEN;University of Illinois Urbana-Champaign;University of Illinois", "aff_unique_dep": "Advanced Institute for Computational Science;;", "aff_unique_url": "https://www.aip.riken.jp;https://illinois.edu;https://illinois.edu", "aff_unique_abbr": "RIKEN AIP;UIUC;UIUC", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Japan;United States" }, { "id": "8Z7-NG11HY", "title": "Constrained Density Matching and Modeling for Effective Contextualized Alignment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multilingual representations pre-trained with monolingual data offer mismatched task performances between languages. While this has been tackled through the lens of contextualized alignments, these techniques require large parallel data, thereby leaving under-represented language communities behind. In this work, we analyze the limitations that make previous alignments very resource-intensive, viz., (i) the inability to sufficiently leverage data and (ii) the fact that alignments are not trained properly. To address them, we present density-based approaches to perform alignments, and we complement them with our validation criteria accounting for downstream task performances. Our experiments encompass 16 alignment techniques (including ours), evaluated across 6 language pairs, synthetic data, and 4 NLP tasks. We demonstrate that our solutions are particularly effective in the scenarios of limited and no parallel data. More importantly, we show, both theoretically and empirically, the advantages of our bootstrapping procedures, by which unsupervised approaches rival supervised counterparts.
\n\n", "keywords": "Cross-lingual Alignment;Word Embeddings;NLP", "primary_area": "", "supplementary_material": "", "author": "Wei Zhao;Steffen Eger", "authorids": "~Wei_Zhao7;~Steffen_Eger1", "gender": "M;M", "homepage": "https://www.abdn.ac.uk/people/wei.zhao;https://steffeneger.github.io/", "dblp": "181/2852-33;69/9271", "google_scholar": "https://scholar.google.de/citations?user=vQgXoPUAAAAJ;https://scholar.google.de/citations?user=TnuqAW0AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Wei_Zhao7;~Steffen_Eger1", "aff": "Technische Universit\u00e4t Darmstadt;TU Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;Lecturer", "bibtex": "@misc{\nzhao2022constrained,\ntitle={Constrained Density Matching and Modeling for Effective Contextualized Alignment},\nauthor={Wei Zhao and Steffen Eger},\nyear={2022},\nurl={https://openreview.net/forum?id=8Z7-NG11HY}\n}", "github": "", "project": "", "reviewers": "UA47;NmM5;FFEj;hsXL", "site": "https://openreview.net/forum?id=8Z7-NG11HY", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;3;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "52;29;81;110", "wc_summary_review": "30;63;23;35", "wc_main_review": "256;594;129;146", "wc_review": "338;686;233;291", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.0, 30.454884665682123 ], "wc_summary_review_avg": [ 37.75, 15.188400179084036 ], "wc_main_review_avg": [ 281.25, 187.03124738930657 ], "wc_review_avg": [ 387.0, 176.58850472213643 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9198662110077999, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_iTeVJDi6gQJ:scholar.google.com/&scioq=Constrained+Density+Matching+and+Modeling+for+Effective+Contextualized+Alignment&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "8apIRxHxZC", "title": "Learning to Actively Learn: A Robust Approach", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This work proposes a procedure for designing algorithms for specific adaptive data collection tasks like active learning and pure-exploration multi-armed bandits. Unlike the design of traditional adaptive algorithms that rely on concentration of measure and careful analysis to justify the correctness and sample complexity of the procedure, our adaptive algorithm is learned via adversarial training over equivalence classes of problems derived from information theoretic lower bounds. In particular, a single adaptive learning algorithm is learned that competes with the best adaptive algorithm learned for each equivalence class. 
Our procedure takes as input just the available queries, the set of hypotheses, the loss function, and the total query budget. This is in contrast to existing meta-learning work that learns an adaptive algorithm relative to an explicit, user-defined subset or prior distribution over problems, which can be challenging to define and may be mismatched to the instance encountered at test time. This work is particularly focused on the regime where the total query budget is very small, such as a few dozen, which is much smaller than the budgets typically considered by theoretically derived algorithms. We perform synthetic experiments to justify the stability and effectiveness of the training procedure, and then evaluate the method on tasks derived from real data, including a noisy 20 Questions game and a joke recommendation task.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/94fad99433c636325899bf8240e3815e9f52690e.zip", "author": "Jifan Zhang;Lalit K Jain;Kevin Jamieson", "authorids": "~Jifan_Zhang1;~Lalit_K_Jain1;~Kevin_Jamieson1", "gender": "M;;M", "homepage": "https://jifanz.github.io/;http://www.lalitjain.com;", "dblp": "277/6616;178/3228;85/10260", "google_scholar": "ZUOsJWcAAAAJ;hGMSFu4AAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jifan_Zhang1;~Lalit_K_Jain1;~Kevin_Jamieson1", "aff": "University of Wisconsin, Madison;University of Washington;University of Washington", "aff_domain": "wisc.edu;uw.edu;washington.edu", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@misc{\nzhang2022learning,\ntitle={Learning to Actively Learn: A Robust Approach},\nauthor={Jifan Zhang and Lalit K Jain and Kevin Jamieson},\nyear={2022},\nurl={https://openreview.net/forum?id=8apIRxHxZC}\n}", "github": "", "project": "", "reviewers": "xkAv;D4Tj;uEuR", "site": "https://openreview.net/forum?id=8apIRxHxZC", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;2;3", "correctness": "3;3;3", "technical_novelty": "2;4;3", "empirical_novelty": "2;4;2", "wc_summary_paper": "15;168;109", "wc_summary_review": "20;51;50", "wc_main_review": "189;905;425", "wc_review": "224;1124;584", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 97.33333333333333, 63.004409016794654 ], "wc_summary_review_avg": [ 40.333333333333336, 14.38363267359428 ], "wc_main_review_avg": [ 506.3333333333333, 297.90975516458377 ], "wc_review_avg": [ 644.0, 369.8648401781386 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10137045339804779730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Wisconsin;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.washington.edu", "aff_unique_abbr": "UW;UW", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0",
"aff_country_unique": "United States" }, { "title": "Diffusion-Based Voice Conversion with Fast Maximum Likelihood Sampling Scheme", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6240", "id": "8c50f-DoWAu", "poster": "", "openreview": "https://openreview.net/forum?id=8c50f-DoWAu", "slides": "https://iclr.cc/virtual/2022/poster/6240", "video": "https://iclr.cc/virtual/2022/poster/6240", "author_site": "Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, Mikhail Kudinov, Jiansheng Wei", "tldr": "", "abstract": "Voice conversion is a common speech synthesis task which can be solved in different ways depending on a particular real-world scenario. The most challenging one often referred to as one-shot many-to-many voice conversion consists in copying target voice from only one reference utterance in the most general case when both source and target speakers do not belong to the training dataset. We present a scalable high-quality solution based on diffusion probabilistic modeling and demonstrate its superior quality compared to state-of-the-art one-shot voice conversion approaches. Moreover, focusing on real-time applications, we investigate general principles which can make diffusion models faster while keeping synthesis quality at a high level. As a result, we develop a novel Stochastic Differential Equations solver suitable for various diffusion model types and generative tasks as shown through empirical studies and justify it by theoretical analysis.", "keywords": "speech;voice conversion;diffusion models;stochastic differential equations", "primary_area": "", "supplementary_material": "", "author": "Vadim Popov;Ivan Vovk;Vladimir Gogoryan;Tasnima Sadekova;Mikhail Sergeevich Kudinov;Jiansheng Wei", "authorids": "~Vadim_Popov1;~Ivan_Vovk1;~Vladimir_Gogoryan1;~Tasnima_Sadekova1;~Mikhail_Sergeevich_Kudinov1;~Jiansheng_Wei1", "gender": "M;M;M;F;M;M", "homepage": "https://nopersonallinkxdfrergwgweg.com;https://github.com/ivanvovk;https://www.facebook.com/vladimir.gogoryan;https://www.facebook.com/123;;http://www.huawei.com", "dblp": ";;;;;", "google_scholar": ";mRAPOFkAAAAJ;;;;", "orcid": ";;;;;", "linkedin": ";;vsgogoryan/;;kudinov-mikhail-304a2362;", "or_profile": "~Vadim_Popov1;~Ivan_Vovk1;~Vladimir_Gogoryan1;~Tasnima_Sadekova1;~Mikhail_Sergeevich_Kudinov1;~Jiansheng_Wei1", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Co. 
Ltd.", "aff_domain": "huawei.com;huawei.com;huawei.com;huawei.com;huawei.com;huawei.com", "position": "Researcher;Researcher;Researcher;Engeneer;Engineer;Researcher", "bibtex": "@inproceedings{\npopov2022diffusionbased,\ntitle={Diffusion-Based Voice Conversion with Fast Maximum Likelihood Sampling Scheme},\nauthor={Vadim Popov and Ivan Vovk and Vladimir Gogoryan and Tasnima Sadekova and Mikhail Sergeevich Kudinov and Jiansheng Wei},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8c50f-DoWAu}\n}", "github": "", "project": "", "reviewers": "ifWN;QC5G;Gokd", "pdf_size": 0, "recommendation": "8;8;10", "confidence": "4;4;3", "correctness": "4;3;4", "technical_novelty": "4;3;4", "empirical_novelty": "3;3;4", "wc_summary_paper": "57;246;135", "wc_summary_review": "40;146;28", "wc_main_review": "157;567;138", "wc_review": "254;959;301", "wc_reply_reviewers": "19;0;0", "wc_reply_authors": "17;831;24", "reply_reviewers": "1;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 8.666666666666666, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 146.0, 77.54998388136518 ], "wc_summary_review_avg": [ 71.33333333333333, 53.02410353372847 ], "wc_main_review_avg": [ 287.3333333333333, 197.90626288444963 ], "wc_review_avg": [ 504.6666666666667, 321.8346711520615 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 8.956685895029603 ], "wc_reply_authors_avg": [ 290.6666666666667, 382.0840512534149 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17487782166390673105&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=8c50f-DoWAu", "email": "huawei.com;huawei.com;huawei.com;huawei.com;huawei.com;huawei.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "8dF_13D2SmD", "title": "RoDesigner: Variation-Aware Optimization for Robust Analog Design with Multi-Task RL", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Analog/mixed-signal circuit design is one of the most complex and time-consuming stages in the chip design process. Due to various process, voltage, and temperature (PVT) variations from chip manufacturing, analog circuits inevitably suffer from performance degradations. Although there has been plenty of work on automating analog circuit design under the typical condition, limited research has been done on exploring robust designs under the real and unpredictable silicon variations. To address these challenges, we present RoDesigner, a robust circuit design framework that involves the variation information in the optimization process. 
Specifically, circuit optimizations under different variations are considered as a set of tasks. Similarities among tasks are leveraged and competitions are alleviated to realize sample-efficient multi-task training. Moreover, RoDesigner prunes the task space before multi-task training to reduce simulation costs. In this way, RoDesigner can rapidly produce a set of circuit parameters that satisfies diverse constraints (e.g., gain, bandwidth, noise...) across variations. We compare our method with Bayesian optimization, an evolutionary algorithm, and Deep Deterministic Policy Gradient (DDPG), and demonstrate that RoDesigner can significantly reduce the required optimization time by 14\u00d7-30\u00d7. We also show that RoDesigner\u2019s circuit performance is as good as a state-of-the-art human design, while the design time is reduced from several days for a human expert to an hour.", "keywords": "multi-task reinforcement learning;circuit automation", "primary_area": "", "supplementary_material": "/attachment/df057dc1e0ad840f67787177705dbbd451c3d073.zip", "author": "Wei Shi;Hanrui Wang;Jiaqi Gu;Mingjie Liu;David Z. Pan;Song Han;Nan Sun", "authorids": "~Wei_Shi5;~Hanrui_Wang1;~Jiaqi_Gu3;~Mingjie_Liu2;~David_Z._Pan1;~Song_Han5;~Nan_Sun1", "gender": ";M;M;M;M;;", "homepage": ";https://hanruiwang.me;https://scopex-asu.github.io;;http://users.ece.utexas.edu/~dpan/;;", "dblp": ";214/9819-2;;;p/DavidZhigangPan.html;;", "google_scholar": ";https://scholar.google.com/citations?hl=en;FeIV12MAAAAJ;-v5DbrMAAAAJ;3aLlroEAAAAJ;;", "orcid": ";0000-0002-7229-4015;;;0000-0002-5705-2501;;", "linkedin": ";hanrui-wang-34458217a/;;jayliu940712/;davidzpan/;;", "or_profile": "~Wei_Shi5;~Hanrui_Wang1;~Jiaqi_Gu3;~Mingjie_Liu2;~David_Z._Pan1;~Song_Han5;~Nan_Sun1", "aff": ";Massachusetts Institute of Technology;University of Texas, Austin;University of Texas, Austin;University of Texas, Austin;;University of Texas at Austin", "aff_domain": ";mit.edu;utexas.edu;utexas.edu;utexas.edu;;", "position": ";PhD student;PhD student;PhD student;Professor;;", "bibtex": "@misc{\nshi2022rodesigner,\ntitle={RoDesigner: Variation-Aware Optimization for Robust Analog Design with Multi-Task {RL}},\nauthor={Wei Shi and Hanrui Wang and Jiaqi Gu and Mingjie Liu and David Z.
Pan and Song Han and Nan Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=8dF_13D2SmD}\n}", "github": "", "project": "", "reviewers": "np4B;cfiF;p4Tw;GNQf", "site": "https://openreview.net/forum?id=8dF_13D2SmD", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;3;2", "correctness": "3;3;4;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "96;101;110;74", "wc_summary_review": "11;38;37;28", "wc_main_review": "380;127;82;217", "wc_review": "487;266;229;319", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "490;20;19;18", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 95.25, 13.254716141811564 ], "wc_summary_review_avg": [ 28.5, 10.828203913853857 ], "wc_main_review_avg": [ 201.5, 113.94406522500415 ], "wc_review_avg": [ 325.25, 98.71265116488362 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 136.75, 203.95020838430148 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9733285267845754, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16352005161269808431&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.utexas.edu", "aff_unique_abbr": "MIT;UT Austin", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "8e2vrVvvaeQ", "title": "Indiscriminate Poisoning Attacks Are Shortcuts", "track": "main", "status": "Reject", "tldr": "", "abstract": "Indiscriminate data poisoning attacks, which add imperceptible perturbations to training data to maximize the test error of trained models, have become a trendy topic because they are thought to be capable of preventing unauthorized use of data. In this work, we investigate why these perturbations work in principle. We find that the perturbations of advanced poisoning attacks are almost linearly separable when assigned the target labels of the corresponding samples. This is an important population property of various perturbations that was not unveiled before. Moreover, we further confirm that linear separability is indeed the workhorse for poisoning attacks. We synthesize linearly separable data as perturbations and show that such synthetic perturbations are as powerful as the deliberately crafted attacks. Our finding also suggests that the shortcut learning problem is more serious than previously believed, as deep learning heavily relies on shortcuts even if they are of an imperceptible scale and mixed together with the normal features.
It also suggests that pre-trained feature extractors can be a powerful defense.", "keywords": "Data Security;Data Poisoning;Shortcuts", "primary_area": "", "supplementary_material": "/attachment/fe5e3de5e38cab0606bfbd098d123ed5d646a431.zip", "author": "Da Yu;Huishuai Zhang;Wei Chen;Jian Yin;Tie-Yan Liu", "authorids": "~Da_Yu1;~Huishuai_Zhang3;~Wei_Chen1;~Jian_Yin3;~Tie-Yan_Liu1", "gender": "M;F;M;M;M", "homepage": ";https://weichen-cas.github.io/;http://sai.sysu.edu.cn/teacher/teacher01/1385356.htm;http://member.acm.org/~tieyanliu;https://huishuai-git.github.io", "dblp": "48/8545;;95/578-1;l/TieYanLiu;144/7537", "google_scholar": "FcRGdiwAAAAJ;https://scholar.google.com/citations?hl=en;;Nh832fgAAAAJ;w1srHyIAAAAJ", "orcid": ";;;0000-0002-0476-8020;", "linkedin": ";;;;", "or_profile": "~Da_Yu1;~Wei_Chen1;~Jian_Yin3;~Tie-Yan_Liu1;~Huishuai_Zhang2", "aff": "Microsoft; Chinese Academy of Sciences;SUN YAT-SEN UNIVERSITY;Microsoft;Microsoft Research Asia", "aff_domain": "microsoft.com;ict.ac.cn;sysu.edu.cn;microsoft.com;microsoft.com", "position": "Research intern;Full Professor;Full Professor;Distinguished Scientist;Researcher", "bibtex": "@misc{\nyu2022indiscriminate,\ntitle={Indiscriminate Poisoning Attacks Are Shortcuts},\nauthor={Da Yu and Huishuai Zhang and Wei Chen and Jian Yin and Tie-Yan Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=8e2vrVvvaeQ}\n}", "github": "", "project": "", "reviewers": "baCa;sUUe;ngSk;nWuM", "site": "https://openreview.net/forum?id=8e2vrVvvaeQ", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "5;4;5;5", "correctness": "4;3;3;3", "technical_novelty": "1;2;4;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "106;80;45;49", "wc_summary_review": "32;122;64;28", "wc_main_review": "160;555;474;237", "wc_review": "298;757;583;314", "wc_reply_reviewers": "234;93;91;75", "wc_reply_authors": "607;1144;431;424", "reply_reviewers": "2;1;1;1", "reply_authors": "3;3;2;3", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 70.0, 24.809272460110556 ], "wc_summary_review_avg": [ 61.5, 37.613162589710534 ], "wc_main_review_avg": [ 356.5, 162.86574225416467 ], "wc_review_avg": [ 488.0, 192.1991155026474 ], "wc_reply_reviewers_avg": [ 123.25, 64.32097247399172 ], "wc_reply_authors_avg": [ 651.5, 293.64647111790737 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.2721655269759087, "corr_recommendation_correctness": -0.8164965809277261, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11965882524330509087&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Microsoft;Chinese Academy of Sciences;Sun Yat-sen University", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.cas.cn;http://www.sysu.edu.cn", "aff_unique_abbr": "Microsoft;CAS;SYSU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "United States;China" }, { "title": "The Role of Pretrained Representations for the OOD Generalization of RL Agents", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/7156", "id": "8eb12UQYxrG", "poster": "", "openreview": "https://openreview.net/forum?id=8eb12UQYxrG", "slides": "https://iclr.cc/virtual/2022/poster/7156", "video": "https://iclr.cc/virtual/2022/poster/7156", "author_site": "Frederik Tr\u00e4uble, Andrea Dittadi, Manuel Wuthrich, Felix Widmaier, Peter Gehler, Ole Winther, Francesco Locatello, Olivier Bachem, Bernhard Schoelkopf, Stefan Bauer", "tldr": "", "abstract": "Building sample-efficient agents that generalize out-of-distribution (OOD) in real-world settings remains a fundamental unsolved problem on the path towards achieving higher-level cognition. One particularly promising approach is to begin with low-dimensional, pretrained representations of our world, which should facilitate efficient downstream learning and generalization. By training 240 representations and over 10,000 reinforcement learning (RL) policies on a simulated robotic setup, we evaluate to what extent different properties of pretrained VAE-based representations affect the OOD generalization of downstream agents. We observe that many agents are surprisingly robust to realistic distribution shifts, including the challenging sim-to-real case. In addition, we find that the generalization performance of a simple downstream proxy task reliably predicts the generalization performance of our RL agents under a wide range of OOD settings. Such proxy tasks can thus be used to select pretrained representations that will lead to agents that generalize.", "keywords": "representations;out-of-distribution;generalization;deep learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Frederik Tr\u00e4uble;Andrea Dittadi;Manuel Wuthrich;Felix Widmaier;Peter Vincent Gehler;Ole Winther;Francesco Locatello;Olivier Bachem;Bernhard Sch\u00f6lkopf;Stefan Bauer", "authorids": "~Frederik_Tr\u00e4uble1;~Andrea_Dittadi1;~Manuel_Wuthrich1;~Felix_Widmaier1;~Peter_Vincent_Gehler1;~Ole_Winther1;~Francesco_Locatello1;~Olivier_Bachem1;~Bernhard_Sch\u00f6lkopf1;~Stefan_Bauer1", "gender": "M;M;M;;;M;M;M;;", "homepage": "https://ei.is.tuebingen.mpg.de/person/ftraeuble;https://addtt.github.io;;https://is.tuebingen.mpg.de/person/felixwidmaier;;https://olewinther.github.io/;https://twitter.com/FrancescoLocat8;http://www.olivierbachem.ch/;;https://cifar.ca/bios/stefan-bauer/", "dblp": ";;https://dblp.uni-trier.de/pers/hd/w/W=uuml=thrich:Manuel;;;36/1568;195/6074;https://dblp.org/pers/hd/b/Bachem:Olivier;;", "google_scholar": "https://scholar.google.de/citations?user=oc2OOyMAAAAJ;PrvuuaAAAAAJ;;;;7VAwhzUAAAAJ;;https://scholar.google.ch/citations?user=mW9BcgsAAAAJ;;O-oICE8AAAAJ", "orcid": ";;;;;0000-0002-1966-3205;;;;", "linkedin": ";;;;;owinther/;;olivier-bachem-10257756/;;", "or_profile": "~Frederik_Tr\u00e4uble1;~Andrea_Dittadi1;~Manuel_Wuthrich1;~Felix_Widmaier1;~Peter_Vincent_Gehler1;~Ole_Winther1;~Francesco_Locatello1;~Olivier_Bachem1;~Bernhard_Sch\u00f6lkopf1;~Stefan_Bauer1", "aff": "Max Planck Institute for Intelligent Systems;Technical University of Denmark;Max Planck Institute for Intelligent Systems;, Max Planck Institute for Intelligent Systems;;Technical University of Denmark;Amazon;Google Brain;;KTH Royal Institute of Technology", "aff_domain": "is.tuebingen.mpg.de;dtu.dk;mpg.tuebingen.de;is.tuebingen.mpg.de;;dtu.dk;amazon.com;google.com;;kth.se", "position": "PhD student;PhD student;Postdoc;Research Engineer;;Full Professor;Senior Applied Scientist;Research scientist;;Assistant Professor", "bibtex": 
"@inproceedings{\ntr{\\\"a}uble2022the,\ntitle={The Role of Pretrained Representations for the {OOD} Generalization of {RL} Agents},\nauthor={Frederik Tr{\\\"a}uble and Andrea Dittadi and Manuel Wuthrich and Felix Widmaier and Peter Vincent Gehler and Ole Winther and Francesco Locatello and Olivier Bachem and Bernhard Sch{\\\"o}lkopf and Stefan Bauer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8eb12UQYxrG}\n}", "github": "", "project": "", "reviewers": "BEPU;YvcV;AwPM;vE69", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;3", "correctness": "3;4;4;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "75;56;79;34", "wc_summary_review": "40;45;49;12", "wc_main_review": "189;314;410;526", "wc_review": "304;415;538;572", "wc_reply_reviewers": "0;516;72;197", "wc_reply_authors": "704;1104;845;649", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 61.0, 17.84656829757475 ], "wc_summary_review_avg": [ 36.5, 14.5 ], "wc_main_review_avg": [ 359.75, 123.90797996900764 ], "wc_review_avg": [ 457.25, 106.01739244105185 ], "wc_reply_reviewers_avg": [ 196.25, 197.60614236404697 ], "wc_reply_authors_avg": [ 825.5, 175.96661615204175 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.8006407690254357, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17821119916891342018&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=8eb12UQYxrG", "email": "is.tuebingen.mpg.de;dtu.dk;mpg.tuebingen.de;is.tuebingen.mpg.de;;dtu.dk;amazon.com;google.com;;kth.se", "author_num": 10, "aff_unique_index": "0;1;0;0;1;2;3;4", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Technical University of Denmark;Amazon;Google;KTH Royal Institute of Technology", "aff_unique_dep": "Intelligent Systems;;Amazon.com, Inc.;Google Brain;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.tek.dk;https://www.amazon.com;https://brain.google.com;https://www.kth.se", "aff_unique_abbr": "MPI-IS;DTU;Amazon;Google Brain;KTH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;1;2;2;3", "aff_country_unique": "Germany;Denmark;United States;Sweden" }, { "id": "8f95ajHrIFc", "title": "On Reward Maximization and Distribution Matching for Fine-Tuning Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "The availability of large pre-trained models is changing the landscape of Machine Learning research and practice, moving from a \"training from scratch\" to a \"fine-tuning'' paradigm. While in some applications the goal is to \"nudge'' the pre-trained distribution towards preferred outputs, in others it is to steer it towards a different distribution over the sample space. Two main paradigms have emerged to tackle this challenge: Reward Maximization (RM) and, more recently, Distribution Matching (DM). 
RM applies standard Reinforcement Learning (RL) techniques, such as Policy Gradients, to gradually increase the reward signal. DM prescribes first making explicit the target distribution that the model is fine-tuned to approximate. Here we explore the intimate connections between the two paradigms and show that methods such as KL-control developed in the RM paradigm can also be construed as belonging to DM. We further observe that while DM differs from RM, it can suffer from similar training difficulties, such as high gradient variance. We leverage connections between the two paradigms to import the concept of a baseline into DM methods. We empirically validate the benefits of adding a baseline on an array of controllable language generation tasks, such as constraining topic, sentiment, and gender distributions in texts sampled from a language model. We observe superior performance in terms of constraint satisfaction, stability, and sample efficiency.", "keywords": "Reinforcement Learning;Language Models;Reward Maximization;Distribution Matching;Energy Based Models;Controlled Text Generation", "primary_area": "", "supplementary_material": "", "author": "Tomasz Korbak;Hady Elsahar;Germ\u00e1n Kruszewski;Marc Dymetman", "authorids": "~Tomasz_Korbak1;~Hady_Elsahar2;~Germ\u00e1n_Kruszewski1;~Marc_Dymetman1", "gender": "M;Unspecified;M;M", "homepage": "https://tomekkorbak.com;http://hadyelsahar.io;https://germank.github.io;https://europe.naverlabs.com/people_user/marc-dymetman/", "dblp": "209/9713.html;144/6739;117/4112;74/1221", "google_scholar": "YQ5rrk4AAAAJ;SbcM6bsAAAAJ;uU3rQI8AAAAJ;bTXN9_0AAAAJ", "orcid": "0000-0002-6258-2013;;;", "linkedin": "tomaszkorbak/;hadyelsahar/;;", "or_profile": "~Tomasz_Korbak1;~Hady_Elsahar2;~Germ\u00e1n_Kruszewski1;~Marc_Dymetman1", "aff": "University of Sussex;Naver Labs Europe;Naver Labs Europe;Naver Labs Europe", "aff_domain": "sussex.ac.uk;naverlabs.com;naverlabs.com;naverlabs.com", "position": "PhD student;Researcher;Senior Scientist;Principal Researcher", "bibtex": "@misc{\nkorbak2022on,\ntitle={On Reward Maximization and Distribution Matching for Fine-Tuning Language Models},\nauthor={Tomasz Korbak and Hady Elsahar and Germ{\\'a}n Kruszewski and Marc Dymetman},\nyear={2022},\nurl={https://openreview.net/forum?id=8f95ajHrIFc}\n}", "github": "", "project": "", "reviewers": "Up1T;ifjN;qjvo;Yzso", "site": "https://openreview.net/forum?id=8f95ajHrIFc", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "91;59;96;46", "wc_summary_review": "59;23;98;50", "wc_main_review": "254;82;424;108", "wc_review": "404;164;618;204", "wc_reply_reviewers": "279;0;290;36", "wc_reply_authors": "770;221;797;362", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 73.0, 21.083168642308014 ], "wc_summary_review_avg": [ 57.5, 26.874709300753377 ], "wc_main_review_avg": [ 217.0, 136.31214179228496 ], "wc_review_avg": [ 347.5, 180.71178710864436 ], "wc_reply_reviewers_avg": [ 151.25, 133.9129848072994 ], "wc_reply_authors_avg": [ 537.5, 251.18170713648715 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ],
"corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8MdITdVh3t8J:scholar.google.com/&scioq=On+Reward+Maximization+and+Distribution+Matching+for+Fine-Tuning+Language+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Sussex;NAVER LABS", "aff_unique_dep": ";", "aff_unique_url": "https://www.sussex.ac.uk;https://labs.naver.com", "aff_unique_abbr": "Sussex;NLE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;Unknown" }, { "id": "8gX3bY78aCb", "title": "Molecular Graph Representation Learning via Heterogeneous Motif Graph Construction", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider feature representation learning of molecular graphs. Graph Neural Networks have been widely used in feature representation learning of molecular graphs. However, most proposed methods focus on the individual molecular graph while neglecting their connections, such as motif-level relationships. We propose a novel molecular graph representation learning method by constructing a Heterogeneous Motif graph (HM-graph) to address this issue. In particular, we build an HM-graph that contains motif nodes and molecular nodes. Each motif node corresponds to a motif extracted from molecules. Then, we propose a Heterogeneous Motif Graph Neural Network (HM-GNN) to learn feature representations for each node in the HM-graph. Our HM-graph also enables effective multi-task learning, especially for small molecular datasets. To address the potential efficiency issue, we propose an edge sampler, which significantly reduces computational resources usage. The experimental results show that our model consistently outperforms previous state-of-the-art models. Under multi-task settings, the promising performances of our methods on combined datasets shed light on a new learning paradigm for small molecular datasets. 
Finally, we show that our model achieves similar performance with significantly fewer computational resources by using our edge sampler.", "keywords": "Molecular Graph Representation;Graph Neural Networks;Heterogeneous", "primary_area": "", "supplementary_material": "/attachment/52ad5a4405891c3bdaf245d1bfdcff4e061b5445.zip", "author": "Zhaoning Yu;Hongyang Gao", "authorids": "~Zhaoning_Yu2;~Hongyang_Gao1", "gender": "M;M", "homepage": "https://faculty.sites.iastate.edu/hygao/;https://zhaoningyu1996.github.io/", "dblp": "200/7985;313/1914", "google_scholar": "jGmq0aEAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-9020-9080;0000-0001-6813-779X", "linkedin": "hongyang-gao-74924690/;zhaoning-yu-112773168/", "or_profile": "~Hongyang_Gao1;~ZHAONING_YU1", "aff": "Iowa State University;Iowa State University", "aff_domain": "iastate.edu;iastate.edu", "position": "Assistant Professor;PhD student", "bibtex": "@misc{\nyu2022molecular,\ntitle={Molecular Graph Representation Learning via Heterogeneous Motif Graph Construction},\nauthor={Zhaoning Yu and Hongyang Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=8gX3bY78aCb}\n}", "github": "", "project": "", "reviewers": "swsR;VxSx;FtFN;6PpB", "site": "https://openreview.net/forum?id=8gX3bY78aCb", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;5", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "57;56;55;61", "wc_summary_review": "56;57;53;51", "wc_main_review": "148;764;496;356", "wc_review": "261;877;604;468", "wc_reply_reviewers": "0;82;0;0", "wc_reply_authors": "333;1138;380;635", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 57.25, 2.277608394786075 ], "wc_summary_review_avg": [ 54.25, 2.384848003542364 ], "wc_main_review_avg": [ 441.0, 223.84592915664112 ], "wc_review_avg": [ 552.5, 223.6431309027845 ], "wc_reply_reviewers_avg": [ 20.5, 35.50704155516198 ], "wc_reply_authors_avg": [ 621.5, 319.57354396132354 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4646420710071600137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Discrete Representations Strengthen Vision Transformer Robustness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6647", "id": "8hWs60AZcWk", "poster": "", "openreview": "https://openreview.net/forum?id=8hWs60AZcWk", "slides": "https://iclr.cc/virtual/2022/poster/6647", "video": "https://iclr.cc/virtual/2022/poster/6647", "author_site": "Chengzhi Mao, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, Irfan Essa", "tldr": "", "abstract": "Vision Transformer (ViT) is emerging as the state-of-the-art architecture
for image recognition. While recent studies suggest that ViTs are more robust than their convolutional counterparts, our experiments find that ViTs are overly reliant on local features (e.g., nuisances and texture) and fail to make adequate use of global context (e.g., shape and structure). As a result, ViTs fail to generalize to out-of-distribution, real-world data. To address this deficiency, we present a simple and effective architecture modification to ViT's input layer by adding discrete tokens produced by a vector-quantized encoder. Different from the standard continuous pixel tokens, discrete tokens are invariant under small perturbations and contain less information individually, which encourages ViTs to learn global information that is invariant. Experimental results demonstrate that adding discrete representations to four architecture variants strengthens ViT robustness by up to 12% across seven ImageNet robustness benchmarks while maintaining the performance on ImageNet.", "keywords": "vision transformer;robustness;image recognition", "primary_area": "", "supplementary_material": "", "author": "Chengzhi Mao;Lu Jiang;Mostafa Dehghani;Carl Vondrick;Rahul Sukthankar;Irfan Essa", "authorids": "~Chengzhi_Mao2;~Lu_Jiang1;~Mostafa_Dehghani1;~Carl_Vondrick2;~Rahul_Sukthankar1;~Irfan_Essa1", "gender": "M;M;M;M;M;M", "homepage": "http://www.cs.columbia.edu/~mcz/;http://www.lujiang.info/;http://mostafadehghani.com/;http://www.cs.columbia.edu/~vondrick/;http://www.cs.cmu.edu/~rahuls;http://www.irfanessa.com/", "dblp": ";22/752-4;125/4062;26/8610;;e/IrfanAEssa", "google_scholar": "pTTEiHUAAAAJ;jIKjjSYAAAAJ;https://scholar.google.nl/citations?user=MiHOX3QAAAAJ;3MzhkFIAAAAJ;bmZbi_UNs-oC;https://scholar.google.com.tw/citations?user=XM97iScAAAAJ", "orcid": ";0000-0003-0286-8439;;;;0000-0002-6236-2969", "linkedin": ";roadjiang/;;;rahulsukthankar/;irfanessa/", "or_profile": "~Chengzhi_Mao2;~Lu_Jiang1;~Mostafa_Dehghani1;~Carl_Vondrick2;~Rahul_Sukthankar1;~Irfan_Essa1", "aff": "Columbia University;Google Research;Google DeepMind;Columbia University;Google Research;Georgia Institute of Technology", "aff_domain": "columbia.edu;google.com;google.com;columbia.edu;google.com;gatech.edu", "position": "PhD student;Researcher;Research Scientist;Assistant Professor;Distinguished Scientist;Full Professor", "bibtex": "@inproceedings{\nmao2022discrete,\ntitle={Discrete Representations Strengthen Vision Transformer Robustness},\nauthor={Chengzhi Mao and Lu Jiang and Mostafa Dehghani and Carl Vondrick and Rahul Sukthankar and Irfan Essa},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8hWs60AZcWk}\n}", "github": "", "project": "", "reviewers": "rENv;a2pv;mNoS;YfcP", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "4;4;4;3", "correctness": "2;3;3;2", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;2;4;0", "wc_summary_paper": "54;38;166;89", "wc_summary_review": "102;69;73;72", "wc_main_review": "137;302;552;137", "wc_review": "293;409;791;298", "wc_reply_reviewers": "0;0;0;18", "wc_reply_authors": "1172;1247;286;294", "reply_reviewers": "0;0;0;1", "reply_authors": "3;3;1;2", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 86.75, 49.33241834737073 ], "wc_summary_review_avg": [ 79.0, 13.360389215887388 ],
"wc_main_review_avg": [ 282.0, 169.81607697741694 ], "wc_review_avg": [ 447.75, 203.52810002552474 ], "wc_reply_reviewers_avg": [ 4.5, 7.794228634059948 ], "wc_reply_authors_avg": [ 749.75, 460.52273288079925 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5443310539518174, "corr_recommendation_correctness": 0.23570226039551587, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6549701583607853490&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=8hWs60AZcWk", "email": "columbia.edu;google.com;google.com;columbia.edu;google.com;gatech.edu", "author_num": 6, "aff_unique_index": "0;1;1;0;1;2", "aff_unique_norm": "Columbia University;Google;Georgia Institute of Technology", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.columbia.edu;https://research.google;https://www.gatech.edu", "aff_unique_abbr": "Columbia;Google Research;Georgia Tech", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Triangle and Four Cycle Counting with Predictions in Graph Streams", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6712", "id": "8in_5gN9I0", "poster": "", "openreview": "https://openreview.net/forum?id=8in_5gN9I0", "slides": "https://iclr.cc/virtual/2022/poster/6712", "video": "https://iclr.cc/virtual/2022/poster/6712", "author_site": "Justin Chen, Talya Eden, Piotr Indyk, Honghao Lin, Shyam Narayanan, Ronitt Rubinfeld, Sandeep Silwal, Tal Wagner, David Woodruff, Michael Zhang", "tldr": "", "abstract": "We propose data-driven one-pass streaming algorithms for estimating the number of triangles and four cycles, two fundamental problems in graph analytics that are widely studied in the graph data stream literature. Recently, Hsu et al. (2019) and Jiang et al. (2020) applied machine learning techniques in other data stream problems, using a trained oracle that can predict certain properties of the stream elements to improve on prior \u201cclassical\u201d algorithms that did not use oracles. In this paper, we explore the power of a \u201cheavy edge\u201d oracle in multiple graph edge streaming models. In the adjacency list model, we present a one-pass triangle counting algorithm improving upon the previous space upper bounds without such an oracle. In the arbitrary order model, we present algorithms for both triangle and four cycle estimation with fewer passes and the same space complexity as in previous algorithms, and we show several of these bounds are optimal. We analyze our algorithms under several noise models, showing that the algorithms perform well even when the oracle errs. Our methodology expands upon prior work on \u201cclassical\u201d streaming algorithms, as previous multi-pass and random order streaming algorithms can be seen as special cases of our algorithms, where the first pass or random order was used to implement the heavy edge oracle. 
Lastly, our experiments demonstrate advantages of the proposed method compared to state-of-the-art streaming algorithms.", "keywords": "learning augmented;streaming;graph streaming;data driven;cycle counting;triangle counting", "primary_area": "", "supplementary_material": "/attachment/14b95db6a480d2864710b7276908d21056779dac.zip", "author": "Justin Y Chen;Talya Eden;Piotr Indyk;Honghao Lin;Shyam Narayanan;Ronitt Rubinfeld;Sandeep Silwal;Tal Wagner;David Woodruff;Michael Zhang", "authorids": "~Justin_Y_Chen1;~Talya_Eden1;~Piotr_Indyk1;~Honghao_Lin1;~Shyam_Narayanan1;~Ronitt_Rubinfeld1;~Sandeep_Silwal1;~Tal_Wagner1;~David_Woodruff1;~Michael_Zhang8", "gender": "M;;;M;M;F;M;M;M;M", "homepage": "https://people.csail.mit.edu/justc/;;https://people.csail.mit.edu/indyk/;https://honghlin.github.io;https://sites.google.com/view/shyamnarayanan/home;http://people.csail.mit.edu/ronitt/;https://sandeepsilwal.com;http://www.mit.edu/~talw/;http://www.cs.cmu.edu/~dwoodruf/;", "dblp": "254/0805.html;161/3999;i/PiotrIndyk;https://dblp.uni-trier.de/pid/264/2663.html;222/2805;;225/4637;https://dblp.org/pers/hd/w/Wagner:Tal;w/DPWoodruff;", "google_scholar": "X_myU1YAAAAJ;;oOwNKsAAAAAJ;;CTT44Y0AAAAJ;https://scholar.google.com.tw/citations?user=pZhZndYAAAAJ;MnDnUvcAAAAJ;gV4dPToAAAAJ;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;tal-wagner-22645857/;;michael-zhang-a4366716b/", "or_profile": "~Justin_Y_Chen1;~Talya_Eden1;~Piotr_Indyk1;~Honghao_Lin1;~Shyam_Narayanan1;~Ronitt_Rubinfeld1;~Sandeep_Silwal1;~Tal_Wagner1;~David_Woodruff1;~Jinyao_Zhang1", "aff": "Massachusetts Institute of Technology;Boston University, MIT;Massachusetts Institute of Technology;Carnegie Mellon University;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Microsoft;Carnegie Mellon University;", "aff_domain": "mit.edu;bu.edu;mit.edu;cmu.edu;mit.edu;mit.edu;mit.edu;microsoft.com;cmu.edu;", "position": "PhD student;Postdoc;Full Professor;PhD student;PhD student;Full Professor;PhD student;Postdoc;Associate Professor;", "bibtex": "@inproceedings{\nchen2022triangle,\ntitle={Triangle and Four Cycle Counting with Predictions in Graph Streams},\nauthor={Justin Y Chen and Talya Eden and Piotr Indyk and Honghao Lin and Shyam Narayanan and Ronitt Rubinfeld and Sandeep Silwal and Tal Wagner and David Woodruff and Michael Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8in_5gN9I0}\n}", "github": "", "project": "", "reviewers": "FczQ;oLhV;tiRU", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;4", "correctness": "4;3;4", "technical_novelty": "4;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "53;68;260", "wc_summary_review": "107;267;12", "wc_main_review": "584;415;217", "wc_review": "744;750;489", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 127.0, 94.2443632266673 ], "wc_summary_review_avg": [ 128.66666666666666, 105.22462745109732 ], "wc_main_review_avg": [ 405.3333333333333, 149.98296199531762 ], "wc_review_avg": [ 661.0, 
121.64703037887936 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2078605368294023456&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=8in_5gN9I0", "email": "mit.edu;bu.edu;mit.edu;cmu.edu;mit.edu;mit.edu;mit.edu;microsoft.com;cmu.edu;", "author_num": 10, "aff_unique_index": "0;1;0;2;0;0;0;3;2", "aff_unique_norm": "Massachusetts Institute of Technology;Boston University;Carnegie Mellon University;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://web.mit.edu;https://www.bu.edu;https://www.cmu.edu;https://www.microsoft.com", "aff_unique_abbr": "MIT;BU;CMU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "8kVP8m93VqN", "title": "Task-oriented Dialogue System for Automatic Disease Diagnosis via Hierarchical Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we focus on automatic disease diagnosis with reinforcement learning (RL) methods in the task-oriented dialogue setting. Different from conventional RL tasks, the action space for disease diagnosis (i.e., symptoms) is inevitably large, especially when the number of diseases increases. However, existing approaches to this problem typically work well in simple tasks but face significant challenges in complex scenarios. Inspired by the offline consultation process, we propose to integrate a two-level hierarchical policy into dialogue policy learning. The high-level policy consists of a master model that is responsible for triggering a low-level model; the low-level policy consists of several symptom checkers and a disease classifier.
Experimental results on both self-constructed real-world and synthetic datasets demonstrate that our hierarchical framework achieves higher accuracy and symptom recall in disease diagnosis compared with existing systems.\n", "keywords": "Dialogue System;Automatic Disease Diagnosis;Hierarchical Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Kangenbei Liao;CHENG ZHONG;Wei Chen;Qianlong Liu;zhongyu wei;Baolin Peng;Xuanjing Huang", "authorids": "~Kangenbei_Liao1;~CHENG_ZHONG2;~Wei_Chen28;~Qianlong_Liu1;~zhongyu_wei1;~Baolin_Peng2;~Xuanjing_Huang1", "gender": "M;M;M;M;M;M;F", "homepage": ";https://github.com/Guardianzc;http://faculty.hust.edu.cn/CHENWEI/zh_CN/index.htm;;http://www.sdspeople.fudan.edu.cn/zywei/;;https://xuanjing-huang.github.io/", "dblp": "https://dblp.uni-trier.de/search?q=kangenbei+liao;;181/2832-88;22/10360;31/10489;144/2759;05/6735-1", "google_scholar": ";;1Rmwv_MAAAAJ;;AjLDxxgAAAAJ;u1CNjgwAAAAJ;RGsMgZA4H78C", "orcid": ";;0000-0001-9431-9247;;;;0000-0001-9197-9426", "linkedin": ";;;;;;", "or_profile": "~Kangenbei_Liao1;~CHENG_ZHONG2;~Wei_Chen28;~Qianlong_Liu1;~zhongyu_wei1;~Baolin_Peng2;~Xuanjing_Huang1", "aff": ";Fudan University;Fudan University;;Fudan University;;Fudan University", "aff_domain": ";fudan.edu.cn;fudan.edu.cn;;fudan.edu.cn;;fudan.edu.cn", "position": ";MS student;PhD student;;Associate Professor;;Full Professor", "bibtex": "@misc{\nliao2022taskoriented,\ntitle={Task-oriented Dialogue System for Automatic Disease Diagnosis via Hierarchical Reinforcement Learning},\nauthor={Kangenbei Liao and CHENG ZHONG and Wei Chen and Qianlong Liu and zhongyu wei and Baolin Peng and Xuanjing Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=8kVP8m93VqN}\n}", "github": "", "project": "", "reviewers": "WycQ;J3rx;eBvJ;65Z2", "site": "https://openreview.net/forum?id=8kVP8m93VqN", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;4;3", "correctness": "3;1;4;3", "technical_novelty": "1;1;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "93;58;58;54", "wc_summary_review": "28;52;38;36", "wc_main_review": "143;287;121;219", "wc_review": "264;397;217;309", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 65.75, 15.81731646013318 ], "wc_summary_review_avg": [ 38.5, 8.645808232895291 ], "wc_main_review_avg": [ 192.5, 65.56485338960196 ], "wc_review_avg": [ 296.75, 66.39418272710343 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11099100498235173383&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "8kpSWDgzsh0", "title": "Network Learning in Quadratic Games from 
Fictitious Plays", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the ability of an adversary learning the underlying interaction network from repeated fictitious plays in linear-quadratic games. The adversary may strategically perturb the decisions for a set of action-compromised players, and observe the sequential decisions from a set of action-leaked players. Then the question lies in whether such an adversary can fully re-construct, or effectively estimate the underlying interaction structure among the players. First of all, by drawing connections between this network learning problem in games and classical system identification theory, we establish a series of results characterizing the learnability of the interaction graph from the adversary's point of view. Next, in view of the inherent stability and sparsity constraints for the network interaction structure, we propose a stable and sparse system identification framework for learning the interaction graph from full player action observations. We also propose a stable and sparse subspace identification framework for learning the interaction graph from partially observed player actions. Finally, the effectiveness of the proposed learning frameworks is demonstrated in numerical examples. \n", "keywords": "network learning;game theory", "primary_area": "", "supplementary_material": "/attachment/f88e5a42c580df25773dc60fc68102cd2768ab31.zip", "author": "KEMI DING;Yijun Chen;Lei Wang;Xiaoqiang Ren;Guodong Shi", "authorids": "~KEMI_DING1;yche7598@uni.sydney.edu.au;~Lei_Wang26;~Xiaoqiang_Ren1;~Guodong_Shi1", "gender": "F;;M;;M", "homepage": "https://kemiding.github.io/;;;;", "dblp": ";;;;https://dblp.org/pers/hd/s/Shi:Guodong.html", "google_scholar": ";;https://scholar.google.fi/citations?user=b2LyJzsAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=gD553TwAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~KEMI_DING1;yche7598@uni.sydney.edu.au;~Lei_Wang26;~Xiaoqiang_Ren1;~Guodong_Shi1", "aff": ";;Zhejiang University;Shanghai University;The University of Sydney", "aff_domain": ";;zju.edu.cn;shu.edu.cn;sydney.edu.au", "position": ";;Researcher;Full Professor;Assistant Professor", "bibtex": "@misc{\nding2022network,\ntitle={Network Learning in Quadratic Games from Fictitious Plays},\nauthor={KEMI DING and Yijun Chen and Lei Wang and Xiaoqiang Ren and Guodong Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=8kpSWDgzsh0}\n}", "github": "", "project": "", "reviewers": "Zoba;YLLb;2igb", "site": "https://openreview.net/forum?id=8kpSWDgzsh0", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;2", "correctness": "2;2;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;0", "wc_summary_paper": "114;294;142", "wc_summary_review": "56;120;17", "wc_main_review": "397;713;130", "wc_review": "567;1127;289", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 183.33333333333334, 79.08364067379689 ], "wc_summary_review_avg": [ 64.33333333333333, 42.460439103816256 ], "wc_main_review_avg": [ 413.3333333333333, 238.28880702952785 ], "wc_review_avg": [ 661.0, 
348.50920599987984 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:z6qTQ2RLogQJ:scholar.google.com/&scioq=Network+Learning+in+Quadratic+Games+from+Fictitious+Plays&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Zhejiang University;Shanghai University;University of Sydney", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.shu.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "ZJU;SHU;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;Australia" }, { "title": "Prototypical Contrastive Predictive Coding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6453", "id": "8la28hZOwug", "poster": "", "openreview": "https://openreview.net/forum?id=8la28hZOwug", "slides": "https://iclr.cc/virtual/2022/poster/6453", "video": "https://iclr.cc/virtual/2022/poster/6453", "tldr": "", "abstract": "Transferring representational knowledge of a model to another is a wide-ranging topic in machine learning. Those applications include the distillation of a large supervised or self-supervised teacher model to a smaller student model or self-supervised learning via self-distillation. Knowledge distillation is an original method to solve these problems, which minimizes a cross-entropy loss between the prototypical probabilistic outputs of teacher and student networks. On the other hand, contrastive learning has shown its competency in transferring representations as they allow students to capture the information of teacher representations. In this paper, we amalgamate the advantages of knowledge distillation and contrastive learning by modeling the critic of a contrastive objective by the prototypical probabilistic discrepancy between two features. We refer to it as prototypical contrastive predictive coding and present efficient implementation using the proposed objective for three distillation tasks: supervised model compression, self-supervised model compression, and self-supervised learning via self-distillation. Through extensive experiments, we validate the effectiveness of our method and show that our method achieves state-of-the-art performance in supervised / self-supervised model compression. 
", "keywords": "Knowledge distillation;contrastive learning;self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Kyungmin Lee", "authorids": "~Kyungmin_Lee1", "gender": "M", "homepage": "https://kyungmnlee.github.io/", "dblp": "57/5118", "google_scholar": "6dpime0AAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Kyungmin_Lee1", "aff": "Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr", "position": "PhD student", "bibtex": "@inproceedings{\nlee2022prototypical,\ntitle={Prototypical Contrastive Predictive Coding},\nauthor={Kyungmin Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8la28hZOwug}\n}", "github": "", "project": "", "reviewers": "dY6T;hGXe;EXkK;tvAr", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "62;36;132;160", "wc_summary_review": "35;24;73;22", "wc_main_review": "234;85;337;135", "wc_review": "331;145;542;317", "wc_reply_reviewers": "0;0;144;0", "wc_reply_authors": "767;70;339;687", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.5, 50.34630075785112 ], "wc_summary_review_avg": [ 38.5, 20.524375751773793 ], "wc_main_review_avg": [ 197.75, 96.63688478008798 ], "wc_review_avg": [ 333.75, 140.78596343385942 ], "wc_reply_reviewers_avg": [ 36.0, 62.353829072479584 ], "wc_reply_authors_avg": [ 465.75, 279.45784565833895 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9824176755890183387&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=8la28hZOwug", "email": "kaist.ac.kr", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "8p5qvzrmMj", "title": "Confidence Score Weighting Adaptation for Source-Free Unsupervised Domain Adaptation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Unsupervised domain adaptation (UDA) aims to achieve high performance within the unlabeled target domain by leveraging the labeled source domain.\nSource-free UDA, which is a more challenging UDA task, can access the pre-trained model within the source domain.\nThe pre-trained model, however, provides a noisy pseudo-label; thus, source-free UDA requires robust training.\nIn this study, we propose a Confidence score Weighting Adaptation (CoWA), which is a simple yet effective source-free UDA method. 
\nCoWA utilizes the Joint Model-Data Structure (JMDS) confidence score designed for source-free UDA as a sample-wise weight.\nAs components of CoWA, we introduce Suppressed Cross Entropy (SCE) loss and a weight mixup to robustly leverage the low-confidence samples.\nExperiment results show that CoWA achieves a superior performance compared to other source-free UDA methods on various UDA benchmarks including open-set and partial-set domain adaptation.\nFurthermore, on several benchmarks, CoWA surpasses state-of-the-art conventional UDA methods that use labeled source domain data. ", "keywords": "Unsupervised Domain Adaptation;Source-free Unsupervised Domain Adaptation;Confidence score", "primary_area": "", "supplementary_material": "", "author": "Jonghyun Lee;Dahuin Jung;Junho Yim;Sungroh Yoon", "authorids": "~Jonghyun_Lee1;~Dahuin_Jung2;~Junho_Yim3;~Sungroh_Yoon1", "gender": "M;F;M;", "homepage": ";https://hai.ssu.ac.kr/;;http://ailab.snu.ac.kr", "dblp": ";224/0158;;99/1474", "google_scholar": ";https://scholar.google.co.kr/citations?user=wleS-UQAAAAJ;1xvJRjYAAAAJ;Bphl_fIAAAAJ", "orcid": ";;;0000-0002-2367-197X", "linkedin": "jonghyun-lee-0886061a3/;;;", "or_profile": "~Jonghyun_Lee1;~Dahuin_Jung2;~Junho_Yim3;~Sungroh_Yoon1", "aff": "Seoul National University;Seoul National University;;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;;snu.ac.kr", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@misc{\nlee2022confidence,\ntitle={Confidence Score Weighting Adaptation for Source-Free Unsupervised Domain Adaptation},\nauthor={Jonghyun Lee and Dahuin Jung and Junho Yim and Sungroh Yoon},\nyear={2022},\nurl={https://openreview.net/forum?id=8p5qvzrmMj}\n}", "github": "", "project": "", "reviewers": "Yw3t;oGoq;1uNZ;Nr66", "site": "https://openreview.net/forum?id=8p5qvzrmMj", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;5", "correctness": "4;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "66;45;97;64", "wc_summary_review": "27;49;65;45", "wc_main_review": "292;415;259;404", "wc_review": "385;509;421;513", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 68.0, 18.641351882307248 ], "wc_summary_review_avg": [ 46.5, 13.518505834595775 ], "wc_main_review_avg": [ 342.5, 68.11938050217427 ], "wc_review_avg": [ 457.0, 55.49774770204643 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10940154677915438762&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "8qQ48aMXR_g", "title": "On Locality in Graph Learning via Graph Neural Network", "track": "main", "status": "Reject", "tldr": 
"", "abstract": "Theoretical understanding on the learning process of graph neural network (GNN) has been lacking. The common practice in GNN training is to adapt strategies from other machine learning families, despite the striking differences between learning non-graph and graph-structured data. This results in unstable learning performance (e.g., accuracy) for GNN. In this paper, we study how the training set in the input graph effects the performance of GNN. Combining the topology awareness of GNN and the dependence (topology) of data samples, we formally derive a structural relation between the performance of GNN and the coverage of the training set in the graph. More specifically, the distance of the training set to the rest of the vertexes in the graph is negatively correlated to the learning outcome of GNN. We further validate our theory on different graph data sets with extensive experiments. Using the derived result as a guidance, we also investigate the initial data labelling problem in active learning of GNN, and show that locality-aware data labelling substantially outperforms the prevailing random sampling approach. ", "keywords": "Graph Neural Network;Structural Behavior;Learning Process", "primary_area": "", "supplementary_material": "", "author": "Junwei Su;Jiaqi Han;Chuan Wu", "authorids": "~Junwei_Su1;~Jiaqi_Han2;~Chuan_Wu1", "gender": "M;M;", "homepage": ";https://hanjq17.github.io;https://i.cs.hku.hk/~cwu/", "dblp": "226/0880;235/0412;34/3772-1", "google_scholar": "https://scholar.google.ca/citations?user=jtWS-OMAAAAJ;AKppgMAAAAAJ;mY7MWXMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Junwei_Su1;~Jiaqi_Han2;~Chuan_Wu1", "aff": "The University of Hong Kong;;The University of Hong Kong", "aff_domain": "hku.hk;;hku.hk", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nsu2022on,\ntitle={On Locality in Graph Learning via Graph Neural Network},\nauthor={Junwei Su and Jiaqi Han and Chuan Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=8qQ48aMXR_g}\n}", "github": "", "project": "", "reviewers": "RjRF;1RTm;te3z;sY4o", "site": "https://openreview.net/forum?id=8qQ48aMXR_g", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "4;4;4;4", "correctness": "3;2;2;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "41;61;30;71", "wc_summary_review": "93;86;61;39", "wc_main_review": "375;464;283;592", "wc_review": "509;611;374;702", "wc_reply_reviewers": "0;383;223;40", "wc_reply_authors": "660;1131;811;672", "reply_reviewers": "0;2;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 50.75, 16.13032857693854 ], "wc_summary_review_avg": [ 69.75, 21.370248009791556 ], "wc_main_review_avg": [ 428.5, 114.04494727957044 ], "wc_review_avg": [ 549.0, 121.94055929017219 ], "wc_reply_reviewers_avg": [ 161.5, 153.04329452805177 ], "wc_reply_authors_avg": [ 818.5, 189.93222475398954 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.14002800840280097, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:VNWqrqeJWyUJ:scholar.google.com/&scioq=On+Locality+in+Graph+Learning+via+Graph+Neural+Network&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "8qWazUd8Jm", "title": "How Faithful is your Synthetic Data? Sample-level Metrics for Evaluating and Auditing Generative Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Devising domain- and model-agnostic evaluation metrics for generative models is an important and as yet unresolved problem. Most existing metrics, which were tailored solely to the image synthesis setup, exhibit a limited capacity for diagnosing the modes of failure of generative models across broader application domains. In this paper, we introduce a 3-dimensional evaluation metric, ($\\alpha$-Precision, $\\beta$-Recall, Authenticity), that characterizes the fidelity, diversity and generalization performance of any generative model in a domain-agnostic fashion. Our metric unifies statistical divergence measures with precision-recall analysis, enabling sample-level and distribution-level diagnoses of model fidelity and diversity. We introduce generalization as an additional dimension for model performance that quantifies the extent to which a model copies training data---a crucial performance indicator when modeling sensitive data with requirements on privacy. The three metric components correspond to (interpretable) probabilistic quantities, and are estimated via sample-level binary classification. The sample-level nature of our metric inspires a novel use case which we call model auditing, wherein we judge the quality of individual samples generated by a (black-box) model, discarding low-quality samples and hence improving the overall model performance in a post-hoc manner.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ahmed Alaa;Boris van Breugel;Evgeny Saveliev;Mihaela van der Schaar", "authorids": "~Ahmed_Alaa1;~Boris_van_Breugel2;es583@cam.ac.uk;~Mihaela_van_der_Schaar2", "gender": "M;;;F", "homepage": "https://alaalab.berkeley.edu/;;;https://www.vanderschaar-lab.com", "dblp": "140/7324;284/0835;;", "google_scholar": "https://scholar.google.com.eg/citations?user=_pv1sEcAAAAJ;https://scholar.google.com/citations?hl=en;;DZ3S--MAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ahmed_Alaa1;~Boris_van_Breugel2;es583@cam.ac.uk;~Mihaela_van_der_Schaar2", "aff": "University of California, Berkeley;University of Cambridge;;University of California, Los Angeles", "aff_domain": "berkeley.edu;cam.ac.uk;;ucla.edu", "position": "Assistant Professor;PhD student;;Full Professor", "bibtex": "@misc{\nalaa2022how,\ntitle={How Faithful is your Synthetic Data? 
Sample-level Metrics for Evaluating and Auditing Generative Models},\nauthor={Ahmed Alaa and Boris van Breugel and Evgeny Saveliev and Mihaela van der Schaar},\nyear={2022},\nurl={https://openreview.net/forum?id=8qWazUd8Jm}\n}", "github": "", "project": "", "reviewers": "AgxQ;caMn;hx4h;qovd", "site": "https://openreview.net/forum?id=8qWazUd8Jm", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;3;4;4", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "60;93;100;31", "wc_summary_review": "4;30;60;22", "wc_main_review": "212;217;272;245", "wc_review": "276;340;432;298", "wc_reply_reviewers": "495;0;0;22", "wc_reply_authors": "675;313;496;747", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.0, 27.595289453093258 ], "wc_summary_review_avg": [ 29.0, 20.223748416156685 ], "wc_main_review_avg": [ 236.5, 24.046829312822098 ], "wc_review_avg": [ 336.5, 59.7390157267426 ], "wc_reply_reviewers_avg": [ 129.25, 211.3567777479587 ], "wc_reply_authors_avg": [ 557.75, 168.2845432593261 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": -0.08084520834544431, "gs_citation": 266, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15840878488291944826&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Berkeley;University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "UC Berkeley;Cambridge;UCLA", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Berkeley;Cambridge;Los Angeles", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "8r1wpu__y3S", "title": "Efficient Regularization for Adversarially Robustness Deep ReLU Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a regularization functional for deep neural networks with ReLU activations, and propose regularizers that encourage networks which are smooth not only in their predictions but also their decision boundaries. We evaluate the stability of our networks against the standard set of $\\ell_2$ and $\\ell_\\infty$ norm-bounded adversaries, as well as several recently proposed perception-based adversaries, including spatial, recoloring, JPEG, and a learned neural threat model. Crucially, our models are simultaneously robust against multiple state-of-the-art adversaries, suggesting that the robustness generalizes well to \\textit{unseen} adversaries. Furthermore, our techniques do not rely on adversarial training and are thus very efficient, incurring overhead on par with two additional parallel passes through the network. On CIFAR-10, we obtain our results after training for only 4 hours, while the next-best performing baseline requires nearly 25 hours of training. 
To the best of our knowledge, this work presents the first technique to achieve robustness against adversarial perturbations \\textit{without} adversarial training.", "keywords": "adversarial robustness;regularization", "primary_area": "", "supplementary_material": "", "author": "Charles Jin;Martin Rinard", "authorids": "~Charles_Jin1;~Martin_Rinard1", "gender": ";Not Specified", "homepage": "https://charlesjin.com;http://people.csail.mit.edu/rinard/", "dblp": "245/5611;", "google_scholar": "WC99LxgAAAAJ;https://scholar.google.com.tw/citations?user=hxlxVEUAAAAJ", "orcid": "0000-0001-6871-5764;", "linkedin": ";", "or_profile": "~Charles_Jin1;~Martin_Rinard1", "aff": "Research, Google;Massachusetts Institute of Technology", "aff_domain": "research.google.com;mit.edu", "position": "Intern;Full Professor", "bibtex": "@misc{\njin2022efficient,\ntitle={Efficient Regularization for Adversarially Robustness Deep Re{LU} Networks},\nauthor={Charles Jin and Martin Rinard},\nyear={2022},\nurl={https://openreview.net/forum?id=8r1wpu__y3S}\n}", "github": "", "project": "", "reviewers": "mKyt;HyPF;cBXr;iRUY", "site": "https://openreview.net/forum?id=8r1wpu__y3S", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;3;3;2", "correctness": "3;3;4;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "67;24;74;60", "wc_summary_review": "8;33;59;55", "wc_main_review": "193;351;411;422", "wc_review": "268;408;544;537", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 56.25, 19.266226926930972 ], "wc_summary_review_avg": [ 38.75, 20.327014045353536 ], "wc_main_review_avg": [ 344.25, 91.40944973032055 ], "wc_review_avg": [ 439.25, 112.72837930175346 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7071067811865476, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:j-HDlXr8ZDQJ:scholar.google.com/&scioq=Efficient+Regularization+for+Adversarially+Robustness+Deep+ReLU+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Google;Massachusetts Institute of Technology", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://web.mit.edu", "aff_unique_abbr": "Google;MIT", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "8rCMq0yJMG", "title": "Source-Target Unified Knowledge Distillation for Memory-Efficient Federated Domain Adaptation on Edge Devices", "track": "main", "status": "Reject", "tldr": "", "abstract": "To support local inference on an edge device, it is necessary to deploy a compact machine learning model on such a device.\nWhen such a compact model is applied to a new environment, its inference accuracy can be degraded if the target data from the new environment have a different distribution from the source data that are used for model training.\nTo ensure high inference accuracy in the new 
environment, it is indispensable to adapt the compact model to the target data.\nHowever, to protect users' privacy, the target data cannot be sent to a centralized server for joint training with the source data. Furthermore, utilizing the target data to directly train the compact model cannot achieve sufficient inference accuracy due to its low model capacity.\nTo this end, a scheme called source-target unified knowledge distillation (STU-KD) is developed in this paper. It starts with a large pretrained model loaded onto the edge device, and then the knowledge of the large model is transferred to the compact model via knowledge distillation.\nSince training the large model leads to large memory consumption, a domain adaptation method called lite-residual hypothesis transfer is designed to achieve memory-efficient adaptation to the target data on the edge device. Moreover, to prevent the compact model from forgetting the knowledge of the source data during knowledge distillation, a collaborative knowledge distillation (Co-KD) method is developed to unify the source data on the server and the target data on the edge device to train the compact model. STU-KD can be easily integrated with secure aggregation so that the server cannot obtain the true model parameters of the compact model. Extensive experiments conducted upon several objective recognition tasks show that STU-KD can improve the inference accuracy by up to $14.7\\%$, as compared to the state-of-the-art schemes. Results also reveal that the inference accuracy of the compact model is not impacted by incorporating secure aggregation into STU-KD.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/4f6f6828e2ee11a2361bf6efe54c985905153f08.zip", "author": "Xiaochen Zhou;Yuchuan Tian;Xudong Wang", "authorids": "~Xiaochen_Zhou2;~Yuchuan_Tian1;~Xudong_Wang5", "gender": "M;M;M", "homepage": "http://wanglab.sjtu.edu.cn/ShowPeople.aspx?info_lb=533&flag=103&info_id=1491;;http://wanglab.sjtu.edu.cn/en/Default.aspx", "dblp": ";193/6675;", "google_scholar": ";;oG2PlTsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xiaochen_Zhou2;~Yuchuan_Tian1;~Xudong_Wang5", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiao Tong University, Tsinghua University", "aff_domain": "sjtu.edu;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Undergrad student;Full Professor", "bibtex": "@misc{\nzhou2022sourcetarget,\ntitle={Source-Target Unified Knowledge Distillation for Memory-Efficient Federated Domain Adaptation on Edge Devices},\nauthor={Xiaochen Zhou and Yuchuan Tian and Xudong Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=8rCMq0yJMG}\n}", "github": "", "project": "", "reviewers": "uoT4;99KT;uaQn;wCzy", "site": "https://openreview.net/forum?id=8rCMq0yJMG", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;3;4", "correctness": "3;4;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "165;81;90;37", "wc_summary_review": "16;34;15;41", "wc_main_review": "471;129;181;108", "wc_review": "652;244;286;186", "wc_reply_reviewers": "347;0;0;0", "wc_reply_authors": "3091;854;830;250", "reply_reviewers": "3;0;0;0", "reply_authors": "7;2;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 93.25, 46.023771031935226 ], "wc_summary_review_avg": [ 
26.5, 11.280514172678478 ], "wc_main_review_avg": [ 222.25, 146.05371443410812 ], "wc_review_avg": [ 342.0, 182.46643526961336 ], "wc_reply_reviewers_avg": [ 86.75, 150.25540755660012 ], "wc_reply_authors_avg": [ 1256.25, 1086.547370113241 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 2.75, 2.48746859276655 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6248020858946415634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "8rR8bIZnzMA", "title": "Dynamic Graph Representation Learning via Graph Transformer Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Dynamic graph representation learning is an important task with widespread applications. Previous methods on dynamic graph learning are usually sensitive to noisy graph information such as missing or spurious connections, which can yield degenerated performance and generalization. To overcome this challenge, we propose a Transformer-based dynamic graph learning method named Dynamic Graph Transformer (DGT) with spatial-temporal encoding to effectively learn graph topology and capture implicit links. To improve the generalization ability, we introduce two complementary self-supervised pre-training tasks and show that jointly optimizing the two pre-training tasks results in a smaller Bayesian error rate via an information-theoretic analysis. We also propose a temporal-union graph structure and a target-context node sampling strategy for efficient and scalable training. 
Extensive experiments on real-world datasets illustrate that DGT presents superior performance compared with several state-of-the-art baselines.", "keywords": "dynamic graphs;graph neural networks;graph representation learning;transformers;graph transformers", "primary_area": "", "supplementary_material": "", "author": "Weilin Cong;Yanhong Wu;Yuandong Tian;Mengting Gu;Yinglong Xia;Mehrdad Mahdavi;Chun-cheng Jason Chen", "authorids": "~Weilin_Cong1;~Yanhong_Wu1;~Yuandong_Tian1;~Mengting_Gu1;~Yinglong_Xia1;~Mehrdad_Mahdavi2;~Chun-cheng_Jason_Chen1", "gender": "M;M;M;F;M;M;M", "homepage": "https://congweilin.github.io/CongWeilin.io/;http://yhwu.me;http://yuandong-tian.com;;;http://www.cse.psu.edu/~mzm616/;", "dblp": "203/8227;;t/YuandongTian;;61/3251;88/4321;91/6415.html", "google_scholar": "yYHxZ6MAAAAJ;https://scholar.google.com.hk/citations?user=cR2dd0oAAAAJ;0mgEF28AAAAJ;EB0q5vgAAAAJ;;HzxnwocAAAAJ;", "orcid": ";;0000-0003-4202-4847;;0000-0002-8155-5440;;", "linkedin": ";yanhongwu/;yuandongtian;;;;", "or_profile": "~Weilin_Cong1;~Yanhong_Wu1;~Yuandong_Tian1;~Mengting_Gu1;~Yinglong_Xia1;~Mehrdad_Mahdavi2;~Chun-cheng_Jason_Chen1", "aff": "Meta;Meta;Meta AI (FAIR);;Meta;Toyota Technological Institute at Chicago;", "aff_domain": "meta.com;meta.com;meta.com;;meta.com;ttic.edu;", "position": "Intern;Researcher;Research Scientist;;Researcher;Researcher;", "bibtex": "@misc{\ncong2022dynamic,\ntitle={Dynamic Graph Representation Learning via Graph Transformer Networks},\nauthor={Weilin Cong and Yanhong Wu and Yuandong Tian and Mengting Gu and Yinglong Xia and Mehrdad Mahdavi and Chun-cheng Jason Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=8rR8bIZnzMA}\n}", "github": "", "project": "", "reviewers": "8DU1;uub9;SMmY;G1X6", "site": "https://openreview.net/forum?id=8rR8bIZnzMA", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "38;37;29;85", "wc_summary_review": "112;44;24;41", "wc_main_review": "336;110;252;327", "wc_review": "486;191;305;453", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "450;217;530;479", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 47.25, 22.072324299900995 ], "wc_summary_review_avg": [ 55.25, 33.640563312762765 ], "wc_main_review_avg": [ 256.25, 90.51622782683776 ], "wc_review_avg": [ 358.75, 118.432206346078 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 419.0, 120.08954992004925 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17502211205827052051&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Meta;Toyota Technological Institute at Chicago", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.tti-chicago.org", "aff_unique_abbr": "Meta;TTI Chicago", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "8rpv8g3zfF", "title": "Federated Learning with GAN-based Data Synthesis for Non-IID Clients", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) has recently emerged as a popular privacy-preserving collaborative learning paradigm. However, it suffers from the non-IID (independent and identically distributed) data among clients. In this paper, we propose a novel framework, namely Synthetic Data Aided Federated Learning (SDA-FL), to resolve the non-IID issue by sharing differentially private synthetic data. Specifically, each client pretrains a local generative adversarial network (GAN) to generate synthetic data, which are uploaded to the parameter server (PS) to construct a global shared synthetic dataset. The PS is responsible for generating and updating high-quality labels for the global dataset via pseudo labeling with a confident threshold before each global aggregation. A combination of the local private dataset and labeled synthetic dataset leads to nearly identical data distributions among clients, which improves the consistency among local models and benefits the global aggregation. To ensure privacy, the local GANs are trained with differential privacy by adding artificial noise to the local model gradients before being uploaded to the PS. Extensive experiments evidence that the proposed framework outperforms the baseline methods by a large margin in several benchmark datasets under both the supervised and semi-supervised settings.", "keywords": "federated learning;non-IID;generative model;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Zijian Li;Jiawei Shao;Yuyi Mao;Jessie Hui Wang;Jun Zhang", "authorids": "zijian1997.li@connect.polyu.hk;~Jiawei_Shao1;yuyi-eie.mao@polyu.edu.hk;jessiewang@tsinghua.edu.cn;eejzhang@ust.hk", "gender": ";;;;", "homepage": ";https://shaojiawei07.github.io/;;;", "dblp": ";251/9479;;;", "google_scholar": ";p26zthIAAAAJ;;;", "orcid": ";0000-0001-8836-1430;;;", "linkedin": ";;;;", "or_profile": "zijian1997.li@connect.polyu.hk;~Jiawei_Shao1;yuyi-eie.mao@polyu.edu.hk;jessiewang@tsinghua.edu.cn;eejzhang@ust.hk", "aff": ";Hong Kong University of Science and Technology;;;", "aff_domain": ";ust.hk;;;", "position": ";PhD student;;;", "bibtex": "@misc{\nli2022federated,\ntitle={Federated Learning with {GAN}-based Data Synthesis for Non-{IID} Clients},\nauthor={Zijian Li and Jiawei Shao and Yuyi Mao and Jessie Hui Wang and Jun Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=8rpv8g3zfF}\n}", "github": "", "project": "", "reviewers": "Ena1;UMsE;t36G;ex8V", "site": "https://openreview.net/forum?id=8rpv8g3zfF", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "88;134;99;159", "wc_summary_review": "85;42;108;40", "wc_main_review": "400;264;242;526", "wc_review": "573;440;449;725", "wc_reply_reviewers": "85;66;0;128", "wc_reply_authors": "1100;1064;486;1010", "reply_reviewers": "1;1;0;1", "reply_authors": "3;2;1;3", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 120.0, 28.20460955234091 ], "wc_summary_review_avg": [ 68.75, 28.925550988702014 ], "wc_main_review_avg": [ 358.0, 114.3241006962224 ], "wc_review_avg": [ 546.75, 115.55599292118086 ], 
"wc_reply_reviewers_avg": [ 69.75, 46.11060073345391 ], "wc_reply_authors_avg": [ 915.0, 249.74587083673677 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11034608282152287772&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "8svLJL54sj8", "title": "Automatic prior selection for meta Bayesian optimization with a case study on tuning deep neural network optimizers", "track": "main", "status": "Reject", "tldr": "", "abstract": "The performance of deep neural networks can be highly sensitive to the choice of a variety of meta-parameters, such as optimizer parameters and model hyperparameters. Tuning these well, however, often requires extensive and costly experimentation. Bayesian optimization (BO) is a principled approach to solve such expensive hyperparameter tuning problems efficiently. Key to the performance of BO is specifying and refining a distribution over functions, which is used to reason about the optima of the underlying function being optimized. In this work, we consider the scenario where we have data from similar functions that allows us to specify a tighter distribution a priori. Specifically, we focus on the common but potentially costly task of tuning optimizer parameters for training neural networks. Building on the meta BO method from Wang et al. (2018), we develop practical improvements that (a) boost its performance by leveraging tuning results on multiple tasks without requiring observations for the same meta-parameter points across all tasks, and (b) retain its regret bound for a special case of our method. As a result, we provide a coherent BO solution for iterative optimization of continuous optimizer parameters. To verify our approach in realistic model training setups, we collected a large multi-task hyperparameter tuning dataset by training tens of thousands of configurations of near-state-of-the-art models on popular image and text datasets, as well as a protein sequence dataset. 
Our results show that on average, our method is able to locate good hyperparameters at least 3 times more efficiently than the best competing methods.", "keywords": "Bayesian optimization;Gaussian process;hyperparameter tuning;meta learning;transfer learning;multi task", "primary_area": "", "supplementary_material": "", "author": "Zi Wang;George Edward Dahl;Kevin Swersky;Chansoo Lee;Zelda E Mariet;Zachary Nado;Justin Gilmer;Jasper Snoek;Zoubin Ghahramani", "authorids": "~Zi_Wang1;~George_Edward_Dahl1;~Kevin_Swersky1;~Chansoo_Lee1;~Zelda_E_Mariet1;~Zachary_Nado1;~Justin_Gilmer1;~Jasper_Snoek1;~Zoubin_Ghahramani1", "gender": "F;M;M;M;F;M;M;M;M", "homepage": "http://zi-wang.com/;https://www.cs.toronto.edu/~gdahl;http://www.cs.toronto.edu/~kswersky;;https://zelda.lids.mit.edu/;http://zna.do;;;http://mlg.eng.cam.ac.uk/zoubin/", "dblp": "78/8711-4;10/7998;35/9381;137/3219;164/7319;228/7785;;95/6097;g/ZoubinGhahramani", "google_scholar": "U0egIsIAAAAJ;ghbWy-0AAAAJ;https://scholar.google.ca/citations?user=IrixA8MAAAAJ;;twuEPEEAAAAJ;tazGc34AAAAJ;Ml_vQ8MAAAAJ;FM2DTXwAAAAJ;https://scholar.google.co.uk/citations?user=0uTu7fYAAAAJ", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Zi_Wang1;~George_Edward_Dahl1;~Kevin_Swersky1;~Chansoo_Lee1;~Zelda_E_Mariet1;~Zachary_Nado1;~Justin_Gilmer1;~Jasper_Snoek1;~Zoubin_Ghahramani1", "aff": "Google DeepMind;Google;Google Deepmind;Google;Google;Google;Google Brain;Google;University of Cambridge", "aff_domain": "google.com;google.com;google.com;google.com;google.com;google.com;google.com;google.com;cam.ac.uk", "position": "Research scientist;Research Scientist;Research Scientist;Researcher;Research Scientist;Research Engineer;Researcher;Research Scientist;Full Professor", "bibtex": "@misc{\nwang2022automatic,\ntitle={Automatic prior selection for meta Bayesian optimization with a case study on tuning deep neural network optimizers},\nauthor={Zi Wang and George Edward Dahl and Kevin Swersky and Chansoo Lee and Zelda E Mariet and Zachary Nado and Justin Gilmer and Jasper Snoek and Zoubin Ghahramani},\nyear={2022},\nurl={https://openreview.net/forum?id=8svLJL54sj8}\n}", "github": "", "project": "", "reviewers": "7A1X;dwbG;Neri;jdBZ", "site": "https://openreview.net/forum?id=8svLJL54sj8", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;3;4", "correctness": "3;2;4;3", "technical_novelty": "2;1;3;4", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "96;123;82;77", "wc_summary_review": "59;17;86;24", "wc_main_review": "551;403;267;201", "wc_review": "706;543;435;302", "wc_reply_reviewers": "179;0;0;111", "wc_reply_authors": "1383;899;562;534", "reply_reviewers": "1;0;0;1", "reply_authors": "3;2;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 94.5, 17.867568385205637 ], "wc_summary_review_avg": [ 46.5, 27.807373122968663 ], "wc_main_review_avg": [ 355.5, 134.3307485276547 ], "wc_review_avg": [ 496.5, 148.04137935050457 ], "wc_reply_reviewers_avg": [ 72.5, 76.38226233884409 ], "wc_reply_authors_avg": [ 844.5, 342.4795614339635 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.07053456158585983, "corr_recommendation_correctness": 0.34554737023254406, "gs_citation": 6, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=18252301191455760903&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;0;0;0;2", "aff_unique_norm": "Google;DeepMind;University of Cambridge", "aff_unique_dep": "Google DeepMind;DeepMind;", "aff_unique_url": "https://deepmind.com;https://deepmind.com;https://www.cam.ac.uk", "aff_unique_abbr": "DeepMind;DeepMind;Cambridge", "aff_campus_unique_index": "1;1;1;1;1;1;2", "aff_campus_unique": ";Mountain View;Cambridge", "aff_country_unique_index": "0;1;0;1;1;1;1;1;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "8uqOMUHgW4M", "title": "Learning shared neural manifolds from multi-subject FMRI data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Functional magnetic resonance imaging (fMRI) is a notoriously noisy measurement of brain activity because of the large variations between individuals, signals marred by environmental differences during collection, and spatiotemporal averaging required by the measurement resolution. In addition, the data is extremely high dimensional, with the space of the activity typically having much lower intrinsic dimension. In order to understand the connection between stimuli of interest and brain activity, and analyze differences and commonalities between subjects, it becomes important to learn a meaningful embedding of the data that denoises, and reveals its intrinsic structure. Specifically, we assume that while noise varies significantly between individuals, true responses to stimuli will share common, low-dimensional features between subjects which are jointly discoverable. Similar approaches have been exploited previously but they have mainly used linear methods such as PCA and shared response modeling (SRM). In contrast, we propose a neural network called MRMD-AE (manifold-regularized multiple-decoder, autoencoder), that learns a common embedding from multiple subjects in an experiment while retaining the ability to decode to individual raw fMRI signals. We show that our learned common space represents an extensible manifold (where new points not seen during training can be mapped), improves the classification accuracy of stimulus features of unseen timepoints, as well as improves cross-subject translation of fMRI signals. We believe this framework can be used for many downstream applications such as guided BCI training in the future. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/58c8155f2468f78553e9a4350f559c7220448f96.zip", "author": "Jessie Huang;Erica Lindsey Busch;Tom Wallenstein;Michal Gerasimiuk;Guillaume Lajoie;Guy Wolf;Nicholas Turk-Browne;Smita Krishnaswamy", "authorids": "~Jessie_Huang1;~Erica_Lindsey_Busch1;tom.wallenstein@yale.edu;michal.gerasimiuk@yale.edu;~Guillaume_Lajoie1;~Guy_Wolf1;~Nicholas_Turk-Browne1;~Smita_Krishnaswamy1", "gender": "F;F;;;M;M;;F", "homepage": "https://sites.google.com/view/jessiehuang;;;;https://dms.umontreal.ca/~lajoie/;http://guywolf.org;https://ntblab.yale.edu;http://www.krishnaswamylab.org", "dblp": "220/3825;291/6172;;;31/10384;120/1308;;74/2457", "google_scholar": "-Uf7kNYAAAAJ;https://scholar.google.com/citations?hl=en;;;;g0k3SjcAAAAJ;;l2Pr9m8AAAAJ", "orcid": ";0000-0002-5906-9622;;;;0000-0002-6740-059X;;", "linkedin": ";;;;;;;", "or_profile": "~Jessie_Huang1;~Erica_Lindsey_Busch1;tom.wallenstein@yale.edu;michal.gerasimiuk@yale.edu;~Guillaume_Lajoie1;~Guy_Wolf1;~Nicholas_Turk-Browne1;~Smita_Krishnaswamy1", "aff": "Yale University;Yale University;;;Mila - Quebec Artificial Intelligence Institute;University of Montreal;Yale University;Yale University", "aff_domain": "yale.edu;yale.edu;;;mila.quebec;umontreal.ca;yale.edu;yale.edu", "position": "Postdoc;PhD student;;;Associate Professor;Associate Professor;Full Professor;Associate Professor", "bibtex": "@misc{\nhuang2022learning,\ntitle={Learning shared neural manifolds from multi-subject {FMRI} data},\nauthor={Jessie Huang and Erica Lindsey Busch and Tom Wallenstein and Michal Gerasimiuk and Guillaume Lajoie and Guy Wolf and Nicholas Turk-Browne and Smita Krishnaswamy},\nyear={2022},\nurl={https://openreview.net/forum?id=8uqOMUHgW4M}\n}", "github": "", "project": "", "reviewers": "cq7U;h5JS;nPEU;gwm6;Rq5W", "site": "https://openreview.net/forum?id=8uqOMUHgW4M", "pdf_size": 0, "recommendation": "3;5;6;6;8", "confidence": "5;3;4;5;4", "correctness": "4;3;2;4;3", "technical_novelty": "1;3;3;3;3", "empirical_novelty": "1;2;3;3;3", "wc_summary_paper": "43;123;66;129;109", "wc_summary_review": "18;34;175;26;157", "wc_main_review": "202;438;971;274;1536", "wc_review": "263;595;1212;429;1802", "wc_reply_reviewers": "0;141;0;0;1007", "wc_reply_authors": "381;905;880;497;3351", "reply_reviewers": "0;1;0;0;3", "reply_authors": "1;2;2;1;5", "recommendation_avg": [ 5.6, 1.624807680927192 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.8000000000000002 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 94.0, 33.69272918598314 ], "wc_summary_review_avg": [ 82.0, 69.00724599634447 ], "wc_main_review_avg": [ 684.2, 503.82790712702683 ], "wc_review_avg": [ 860.2, 569.8531038785346 ], "wc_reply_reviewers_avg": [ 229.6, 392.5173117201329 ], "wc_reply_authors_avg": [ 1202.8, 1093.7236213961917 ], "reply_reviewers_avg": [ 0.8, 1.1661903789690602 ], "reply_authors_avg": [ 2.2, 1.469693845669907 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.26318067798390754, "corr_recommendation_correctness": -0.4276686017238498, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16040359753092397013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Yale University;Quebec Artificial Intelligence Institute;University of Montreal", "aff_unique_dep": ";Artificial Intelligence;", "aff_unique_url": 
"https://www.yale.edu;https://mila.quebec;https://wwwumontreal.ca", "aff_unique_abbr": "Yale;Mila;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0", "aff_country_unique": "United States;Canada" }, { "title": "On the Pitfalls of Analyzing Individual Neurons in Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6897", "id": "8uz0EWPQIMu", "poster": "", "openreview": "https://openreview.net/forum?id=8uz0EWPQIMu", "slides": "https://iclr.cc/virtual/2022/poster/6897", "video": "https://iclr.cc/virtual/2022/poster/6897", "author_site": "Omer Antverg, Yonatan Belinkov", "tldr": "", "abstract": "While many studies have shown that linguistic information is encoded in hidden word representations, few have studied individual neurons, to show how and in which neurons it is encoded.\nAmong these, the common approach is to use an external probe to rank neurons according to their relevance to some linguistic attribute, and to evaluate the obtained ranking using the same probe that produced it.\nWe show two pitfalls in this methodology:\n 1. It confounds distinct factors: probe quality and ranking quality.\n We separate them and draw conclusions on each.\n 2. It focuses on encoded information, rather than information that is used by the model.\n We show that these are not the same.\nWe compare two recent ranking methods and a simple one we introduce, and evaluate them with regard to both of these aspects.", "keywords": "NLP;interpretability;multilingual;individual neurons", "primary_area": "", "supplementary_material": "", "author": "Omer Antverg;Yonatan Belinkov", "authorids": "~Omer_Antverg1;~Yonatan_Belinkov1", "gender": ";M", "homepage": ";https://www.belinkov.com", "dblp": ";136/8705", "google_scholar": ";https://scholar.google.com/citations?authorid=K-6ujU4AAAAJ", "orcid": ";", "linkedin": "omer-antverg-8427a2153/;", "or_profile": "~Omer_Antverg1;~Yonatan_Belinkov1", "aff": "Technion, Technion;Technion, Technion", "aff_domain": "technion.ac.il;technion.ac.il", "position": "MS student;Assistant Professor", "bibtex": "@inproceedings{\nantverg2022on,\ntitle={On the Pitfalls of Analyzing Individual Neurons in Language Models},\nauthor={Omer Antverg and Yonatan Belinkov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=8uz0EWPQIMu}\n}", "github": "", "project": "", "reviewers": "C78M;LRS7;RqKU;zE55", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "26;114;278;40", "wc_summary_review": "27;95;59;18", "wc_main_review": "415;766;376;58", "wc_review": "468;975;713;116", "wc_reply_reviewers": "0;290;101;0", "wc_reply_authors": "944;1814;707;15", "reply_reviewers": "0;1;1;0", "reply_authors": "4;4;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 114.5, 100.14364682794411 ], "wc_summary_review_avg": [ 49.75, 30.243801017729236 ], "wc_main_review_avg": [ 403.75, 250.83099389828203 ], "wc_review_avg": [ 568.0, 316.6141184470459 ], "wc_reply_reviewers_avg": [ 97.75, 118.40687268904622 ], "wc_reply_authors_avg": [ 870.0, 643.0796995707452 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], 
"replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11409897775551892386&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=8uz0EWPQIMu", "email": "technion.ac.il;technion.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "8wI4UUN5RxC", "title": "Variational Inference via Resolution of Singularities", "track": "main", "status": "Reject", "tldr": "", "abstract": "Predicated on the premise that neural networks are best viewed as singular statistical models, we set out to propose a new variational approximation for Bayesian neural networks. The approximation relies on a central result from singular learning theory according to which the posterior distribution over the parameters of a singular model, following an algebraic-geometrical transformation known as a desingularization map, is asymptotically a mixture of standard forms. From here we proceed to demonstrate that a generalized gamma mean-field variational family, following desingularization, can recover the leading order term of the model evidence. Affine coupling layers are employed to learn the unknown desingularization map, effectively rendering the proposed methodology a normalizing flow with the generalized gamma as the source distribution.", "keywords": "singular learning theory;Bayesian neural networks;variational inference;normalizing flow", "primary_area": "", "supplementary_material": "/attachment/908fa0f088be85e76baac4d91ed538b53b98c55e.zip", "author": "Susan Wei", "authorids": "~Susan_Wei1", "gender": "F", "homepage": "https://www.suswei.com/", "dblp": "203/8878", "google_scholar": "Udv9jsIAAAAJ", "orcid": "0000-0002-6842-2352", "linkedin": "", "or_profile": "~Susan_Wei1", "aff": "The University of Melbourne", "aff_domain": "unimelb.edu.au", "position": "Assistant Professor", "bibtex": "@misc{\nwei2022variational,\ntitle={Variational Inference via Resolution of Singularities},\nauthor={Susan Wei},\nyear={2022},\nurl={https://openreview.net/forum?id=8wI4UUN5RxC}\n}", "github": "", "project": "", "reviewers": "32H4;jfja;PvQE;kf3F", "site": "https://openreview.net/forum?id=8wI4UUN5RxC", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;3;2;2", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "70;113;113;100", "wc_summary_review": "15;45;77;37", "wc_main_review": "275;266;852;289", "wc_review": "360;424;1042;426", "wc_reply_reviewers": "185;88;432;443", "wc_reply_authors": "915;715;1748;585", "reply_reviewers": "3;2;2;1", "reply_authors": "3;2;5;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 99.0, 17.564168070250297 ], "wc_summary_review_avg": [ 43.5, 22.242976419535225 ], "wc_main_review_avg": [ 420.5, 249.26140896657068 ], "wc_review_avg": [ 563.0, 277.82188538702275 ], "wc_reply_reviewers_avg": [ 287.0, 154.40692989629707 ], 
"wc_reply_authors_avg": [ 990.75, 452.7241847968805 ], "reply_reviewers_avg": [ 2.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6Dg7DGVtRg8J:scholar.google.com/&scioq=Variational+Inference+via+Resolution+of+Singularities&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Melbourne", "aff_unique_dep": "", "aff_unique_url": "https://www.unimelb.edu.au", "aff_unique_abbr": "UniMelb", "aff_country_unique_index": "0", "aff_country_unique": "Australia" }, { "title": "Givens Coordinate Descent Methods for Rotation Matrix Learning in Trainable Embedding Indexes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5974", "id": "9-Rfew334N", "poster": "", "openreview": "https://openreview.net/forum?id=9-Rfew334N", "slides": "https://iclr.cc/virtual/2022/poster/5974", "video": "https://iclr.cc/virtual/2022/poster/5974", "author_site": "Yunjiang Jiang, Han Zhang, Yiming Qiu, Yun Xiao, Bo Long, Wen-Yun Yang", "tldr": "", "abstract": "Product quantization (PQ) coupled with a space rotation, is widely used in modern approximate nearest neighbor (ANN) search systems to significantly compress the disk storage for embeddings and speed up the inner product computation. Existing rotation learning methods, however, minimize quantization distortion for fixed embeddings, which are not applicable to an end-to-end training scenario where embeddings are updated constantly. In this paper, based on geometric intuitions from Lie group theory, in particular the special orthogonal groupSO(n), we propose a family of block Givens coordinate descent algorithms to learn rotation matrix that are provably convergent on any convex objectives. Compared to the state-of-the-art SVD method, the Givens algorithms are much more parallelizable, reducing runtime by orders of magnitude on modern GPUs, and converge more stably according to experimental studies. 
They further improve upon vanilla product quantization significantly in an end-to-end training scenario.", "keywords": "Search index;Product quantization;Block coordinate descent", "primary_area": "", "supplementary_material": "", "author": "Yunjiang Jiang;Han Zhang;Yiming Qiu;Yun Xiao;Bo Long;Wen-Yun Yang", "authorids": "~Yunjiang_Jiang1;~Han_Zhang10;~Yiming_Qiu1;~Yun_Xiao2;~Bo_Long1;~Wen-Yun_Yang1", "gender": "M;F;M;M;M;M", "homepage": "https://sites.google.com/view/jyj3m/home;;https://www.linkedin.com/in/yiming-qiu-0645a487/;;https://www.linkedin.com/in/bolonglinkedin/;", "dblp": ";;;;96/6993.html;53/864", "google_scholar": ";_JA9U-EAAAAJ;;;;WyvbFd8AAAAJ", "orcid": ";;;;;", "linkedin": ";;;yun-xiao-75581326/;bolonglinkedin/;wen-yun-yang-31b48740/", "or_profile": "~Yunjiang_Jiang1;~Han_Zhang10;~Yiming_Qiu1;~Yun_Xiao2;~Bo_Long1;~Wen-Yun_Yang1", "aff": ";;;JD.COM Silicon Valley Research Center;Meta;", "aff_domain": ";;;jd.com;meta.com;", "position": ";;;Principal Scientist;Principal Researcher;", "bibtex": "@inproceedings{\njiang2022givens,\ntitle={Givens Coordinate Descent Methods for Rotation Matrix Learning in Trainable Embedding Indexes},\nauthor={Yunjiang Jiang and Han Zhang and Yiming Qiu and Yun Xiao and Bo Long and Wen-Yun Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9-Rfew334N}\n}", "github": "", "project": "", "reviewers": "Qvs3;HiTR;rUSS;kyJu", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "103;72;92;214", "wc_summary_review": "90;44;74;47", "wc_main_review": "661;215;405;157", "wc_review": "854;331;571;418", "wc_reply_reviewers": "505;0;130;0", "wc_reply_authors": "955;419;732;265", "reply_reviewers": "3;0;1;0", "reply_authors": "3;1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 120.25, 55.2556558191105 ], "wc_summary_review_avg": [ 63.75, 19.13602623325961 ], "wc_main_review_avg": [ 359.5, 196.76064138948115 ], "wc_review_avg": [ 543.5, 198.7919766992622 ], "wc_reply_reviewers_avg": [ 158.75, 206.83251074238788 ], "wc_reply_authors_avg": [ 592.75, 268.4328361061664 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=499998354450948390&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=9-Rfew334N", "email": ";;;jd.com;meta.com;", "author_num": 6, "aff_unique_index": "0;1", "aff_unique_norm": "JD.com;Meta", "aff_unique_dep": "Research Center;Meta Platforms, Inc.", "aff_unique_url": "https://www.jd.com;https://meta.com", "aff_unique_abbr": "JD.COM;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Silicon Valley;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "91muTwt1_t5", "title": "Knowledge Guided Geometric Editing for Unsupervised Drug Design", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning models have been widely used in automatic drug design. 
Current deep approaches always represent and generate candidate molecules as 1D strings or 2D graphs, and rely on large measurement data from lab experiments for training. However, many disease targets, in particular newly discovered ones, do not have such data available. In this paper, we propose GEKO, which incorporates physicochemical knowledge into deep models, leading to unsupervised drug design. Specifically, GEKO directly models drug molecules in the geometric (3D) space and performs geometric editing with knowledge guidance by self-training and simulated annealing, in a purely training-data-free fashion. Our experimental results demonstrate that GEKO outperforms baselines on all 12 targets with and without prior drug-target measurement data.", "keywords": "drug discovery;3D molecular generation;monte carlo sampling", "primary_area": "", "supplementary_material": "", "author": "Yuwei Yang;Siqi Ouyang;Meihua Dang;Mingyue Zheng;Lei Li;Hao Zhou", "authorids": "~Yuwei_Yang2;~Siqi_Ouyang2;~Meihua_Dang1;~Mingyue_Zheng1;~Lei_Li11;~Hao_Zhou6", "gender": ";M;F;M;M;M", "homepage": ";https://owaski.github.io/;https://cs.stanford.edu/~mhdang/;https://www.researchgate.net/profile/Mingyue-Zheng;https://www.cs.cmu.edu/~leili;https://zhouh.github.io/", "dblp": ";224/0162;270/9145;;13/7007-5.html;63/778-12", "google_scholar": ";https://scholar.google.com/citations?hl=en;TiZrG7IAAAAJ;vzBQN8EAAAAJ;BYXqAlwAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;0000-0002-3323-3092;0000-0003-3095-9776;", "linkedin": "yuwei-yang-89780179/;;;;;", "or_profile": "~Yuwei_Yang2;~Siqi_Ouyang2;~Meihua_Dang1;~Mingyue_Zheng1;~Lei_Li11;~Hao_Zhou5", "aff": "ByteDance AI Lab;UC Santa Barbara;University of California, Los Angeles;Shanghai Institute of Materia Medica;Computer Science Department, UC Santa Barbara;Bytedance", "aff_domain": "bytedance.com;ucsb.edu;ucla.edu;simm.ac.cn;cs.ucsb.edu;bytedance.com", "position": "Researcher;PhD student;MS student;Full Professor;Assistant Professor;Researcher", "bibtex": "@misc{\nyang2022knowledge,\ntitle={Knowledge Guided Geometric Editing for Unsupervised Drug Design},\nauthor={Yuwei Yang and Siqi Ouyang and Meihua Dang and Mingyue Zheng and Lei Li and Hao Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=91muTwt1_t5}\n}", "github": "", "project": "", "reviewers": "gg4D;X9Jm;ExEq;ZrRD", "site": "https://openreview.net/forum?id=91muTwt1_t5", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "110;148;69;92", "wc_summary_review": "61;24;121;22", "wc_main_review": "407;449;220;332", "wc_review": "578;621;410;446", "wc_reply_reviewers": "190;125;118;63", "wc_reply_authors": "1216;528;1272;788", "reply_reviewers": "1;1;1;1", "reply_authors": "2;2;3;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 104.75, 28.89095879336648 ], "wc_summary_review_avg": [ 57.0, 40.08116764766216 ], "wc_main_review_avg": [ 352.0, 86.97413408594535 ], "wc_review_avg": [ 513.75, 88.01242809967238 ], "wc_reply_reviewers_avg": [ 124.0, 45.03887209955418 ], "wc_reply_authors_avg": [ 951.0, 307.719027685972 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ],
"corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12247859298535042419&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;1;0", "aff_unique_norm": "ByteDance;University of California, Santa Barbara;University of California, Los Angeles;Shanghai Institute of Materia Medica", "aff_unique_dep": "AI Lab;;;", "aff_unique_url": "https://www.bytedance.com;https://www.ucsb.edu;https://www.ucla.edu;http://www.simm.ac.cn", "aff_unique_abbr": "ByteDance;UCSB;UCLA;", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Santa Barbara;Los Angeles", "aff_country_unique_index": "0;1;1;0;1;0", "aff_country_unique": "China;United States" }, { "id": "92awwjGxIZI", "title": "Self-GenomeNet: Self-supervised Learning with Reverse-Complement Context Prediction for Nucleotide-level Genomics Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Self-GenomeNet, a novel contrastive self-supervised learning method for nucleotide-level genomic data, which substantially improves the quality of the learned representations and performance compared to the current state-of-the-art deep learning frameworks. To the best of our knowledge, Self-GenomeNet is the first self-supervised framework that learns a representation of nucleotide-level genome data, using domain-specific characteristics. Our proposed method learns and parametrizes the latent space by leveraging the reverse-complement of genomic sequences. During the training procedure, we force our framework to capture semantic representations with a novel context network on top of intermediate features extracted by an encoder network. The network is trained with an unsupervised contrastive loss. Extensive experiments show that our method with self-supervised and semi-supervised settings is able to considerably outperform previous deep learning methods on different datasets and a public bioinformatics benchmark. Moreover, the learned representations generalize well when transferred to new datasets and tasks. The source code of the method and all the experiments are available at supplementary.", "keywords": "Genome Sequence Analysis;Self-supervised Learning;Representation Learning;Application in Computational Biology", "primary_area": "", "supplementary_material": "/attachment/4614665a25936dc0ab1ccf0382e2acbdf942f7e9.zip", "author": "H\u00fcseyin Anil G\u00fcnd\u00fcz;Martin Binder;Xiao-Yin To;Ren\u00e9 Mreches;Philipp C. 
M\u00fcnch;Alice C McHardy;Bernd Bischl;Mina Rezaei", "authorids": "~H\u00fcseyin_Anil_G\u00fcnd\u00fcz1;~Martin_Binder1;~Xiao-Yin_To1;~Ren\u00e9_Mreches1;~Philipp_C._M\u00fcnch1;~Alice_C_McHardy1;~Bernd_Bischl1;~Mina_Rezaei1", "gender": ";;F;;M;F;M;F", "homepage": "https://www.slds.stat.uni-muenchen.de/people/guenduez/;https://www.slds.stat.uni-muenchen.de/people/binder/;https://www.slds.stat.uni-muenchen.de/people/to/;https://www.helmholtz-hzi.de/en/nc/research/research-topics/bacterial-and-viral-pathogens/computational-biology-of-infection-research/team/;;https://www.helmholtz-hzi.de/en/research/research-topics/bacterial-and-viral-pathogens/computational-biology-of-infection-research/alice-mchardy/;https://www.slds.stat.uni-muenchen.de/;https://www.compstat.statistik.uni-muenchen.de/people/minar/", "dblp": "321/9907.html;255/7243;;;;;48/5326;205/2767", "google_scholar": ";;https://scholar.google.com/citations?hl=de;;sJ_p3qwAAAAJ;https://scholar.google.de/citations?user=zJaGqmAAAAAJ;https://scholar.google.de/citations?user=s34UckkAAAAJ;https://scholar.google.de/citations?hl=en", "orcid": ";;;;;;0000-0001-6002-6980;0000-0001-6994-6345", "linkedin": ";;;;;;;mina-rezaei-b88a3a69/", "or_profile": "~H\u00fcseyin_Anil_G\u00fcnd\u00fcz1;~Martin_Binder1;~Xiao-Yin_To1;~Ren\u00e9_Mreches1;~Philipp_C._M\u00fcnch1;~Alice_C_McHardy1;~Bernd_Bischl1;~Mina_Rezaei1", "aff": "LMU Munich;Department of Statistics;University of Munich, Department of Statistics;Helmholtz Centre for Infection Research;Harvard University;;LMU;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen", "aff_domain": "lmu.de;lmu.de;campus.lmu.de;helmholtz-hzi.de;harvard.edu;;uni-muenchen.de;lmu.de", "position": "PhD student;PhD student;MS student;Software Developer;Researcher;;Full Professor;Principal Researcher", "bibtex": "@misc{\ng{\\\"u}nd{\\\"u}z2022selfgenomenet,\ntitle={Self-GenomeNet: Self-supervised Learning with Reverse-Complement Context Prediction for Nucleotide-level Genomics Data},\nauthor={H{\\\"u}seyin Anil G{\\\"u}nd{\\\"u}z and Martin Binder and Xiao-Yin To and Ren{\\'e} Mreches and Philipp C. 
M{\\\"u}nch and Alice C McHardy and Bernd Bischl and Mina Rezaei},\nyear={2022},\nurl={https://openreview.net/forum?id=92awwjGxIZI}\n}", "github": "", "project": "", "reviewers": "9TG8;ERw2;ifj3;4DkC", "site": "https://openreview.net/forum?id=92awwjGxIZI", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;4", "correctness": "4;2;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "38;95;170;73", "wc_summary_review": "67;38;44;22", "wc_main_review": "114;525;420;430", "wc_review": "219;658;634;525", "wc_reply_reviewers": "16;0;33;12", "wc_reply_authors": "765;1934;1957;758", "reply_reviewers": "1;0;1;1", "reply_authors": "2;3;3;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 94.0, 48.358039662500794 ], "wc_summary_review_avg": [ 42.75, 16.145819892467525 ], "wc_main_review_avg": [ 372.25, 154.6291935567149 ], "wc_review_avg": [ 509.0, 174.77270954013386 ], "wc_reply_reviewers_avg": [ 15.25, 11.818946653572814 ], "wc_reply_authors_avg": [ 1353.5, 592.061018814784 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8624269881258958297&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;0;5", "aff_unique_norm": "Ludwig Maximilian University of Munich;University Affiliation Not Specified;University of Munich;Helmholtz Centre for Infection Research;Harvard University;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen", "aff_unique_dep": ";Department of Statistics;Department of Statistics;;;", "aff_unique_url": "https://www.lmu.de;;https://www.lmu.de;https://www.helmholtz-hzi.de;https://www.harvard.edu;https://www.lmu.de", "aff_unique_abbr": "LMU;;LMU;HZI;Harvard;LMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Munich;", "aff_country_unique_index": "0;0;0;2;0;0", "aff_country_unique": "Germany;;United States" }, { "title": "Learning Transferable Reward for Query Object Localization with Policy Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6128", "id": "92tYQiil17", "poster": "", "openreview": "https://openreview.net/forum?id=92tYQiil17", "slides": "https://iclr.cc/virtual/2022/poster/6128", "video": "https://iclr.cc/virtual/2022/poster/6128", "author_site": "Tingfeng Li, Shaobo Han, Martin Min, Dimitris Metaxas", "tldr": "", "abstract": "We propose a reinforcement learning based approach to query object localization, for which an agent is trained to localize objects of interest specified by a small exemplary set. We learn a transferable reward signal formulated using the exemplary set by ordinal metric learning. Our proposed method enables test-time policy adaptation to new environments where the reward signals are not readily available, and outperforms fine-tuning approaches that are limited to annotated images. In addition, the transferable reward allows repurposing the trained agent from one specific class to another class. 
Experiments on corrupted MNIST, CU-Birds, and COCO datasets demonstrate the effectiveness of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tingfeng Li;Shaobo Han;Martin Renqiang Min;Dimitris N. Metaxas", "authorids": "~Tingfeng_Li3;~Shaobo_Han1;~Martin_Renqiang_Min1;~Dimitris_N._Metaxas1", "gender": "M;M;M;F", "homepage": "https://shaobohan.net/;http://www.cs.toronto.edu/~cuty;https://www.cs.rutgers.edu/~dnm/;https://litingfeng.github.io/", "dblp": ";29/7048;m/DNMetaxas;", "google_scholar": "3L333oYAAAAJ;T2M4JjEAAAAJ;https://scholar.google.com.tw/citations?user=a7VNhCIAAAAJ;kDoYCPMAAAAJ", "orcid": ";0000-0002-8563-6133;;", "linkedin": ";martin-renqiang-min-955a8766;dimitris-metaxas-1bb74914/;tingfeng-li-30098ab8/", "or_profile": "~Shaobo_Han1;~Martin_Renqiang_Min1;~Dimitris_Metaxas1;~TINGFENG_LI2", "aff": "NEC Labs America;NEC Laboratories America;Rutgers University;Rutgers University", "aff_domain": "nec-labs.com;nec-labs.com;cs.rutgers.edu;cs.rutgers.edu", "position": "Researcher;Researcher;Full Professor;PhD student", "bibtex": "@inproceedings{\nli2022learning,\ntitle={Learning Transferable Reward for Query Object Localization with Policy Adaptation},\nauthor={Tingfeng Li and Shaobo Han and Martin Renqiang Min and Dimitris N. Metaxas},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=92tYQiil17}\n}", "github": "", "project": "", "reviewers": "pHvg;tkLp;DiRh;yBs4", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;2;3", "correctness": "3;4;4;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "181;86;73;94", "wc_summary_review": "31;69;103;79", "wc_main_review": "169;504;327;566", "wc_review": "381;659;503;739", "wc_reply_reviewers": "38;58;30;235", "wc_reply_authors": "324;689;407;1013", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.5, 42.523522902036234 ], "wc_summary_review_avg": [ 70.5, 25.937424698685874 ], "wc_main_review_avg": [ 391.5, 155.54179502628867 ], "wc_review_avg": [ 570.5, 138.46569972379442 ], "wc_reply_reviewers_avg": [ 90.25, 84.19137426126265 ], "wc_reply_authors_avg": [ 608.25, 270.0197909413308 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6915912044091990536&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=92tYQiil17", "email": "nec-labs.com;nec-labs.com;cs.rutgers.edu;cs.rutgers.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "NEC Labs America;NEC Laboratories America;Rutgers University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nec-labs.com;https://www.nec-labs.com;https://www.rutgers.edu", "aff_unique_abbr": "NEC LA;NEC Labs America;Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "93SVBUB1r5C", "title": "Learning with convolution and pooling operations in kernel methods", "track": 
"main", "status": "Reject", "tldr": "", "abstract": "Recent empirical work has shown that hierarchical convolutional kernels inspired by convolutional neural networks (CNNs) significantly improve the performance of kernel methods in image classification tasks. A widely accepted explanation for the success of these architectures is that they encode hypothesis classes that are suitable for natural images. However, understanding the precise interplay between approximation and generalization in convolutional architectures remains a challenge. In this paper, we consider the stylized setting of covariates (image pixels) uniformly distributed on the hypercube, and fully characterize the RKHS of kernels composed of single layers of convolution, pooling, and downsampling operations. We then study the gain in sample efficiency of kernel methods using these kernels over standard inner-product kernels. In particular, we show that 1) the convolution layer breaks the curse of dimensionality by restricting the RKHS to `local' functions; 2) local pooling biases learning towards low-frequency functions, which are stable by small translations; 3) downsampling may modify the high-frequency eigenspaces but leaves the low-frequency part approximately unchanged. Notably, our results quantify how choosing an architecture adapted to the target function leads to a large improvement in the sample complexity.\n\n", "keywords": "convolutional kernels;inductive bias;average pooling;downsampling;kernel ridge regression;generalization error;neural tangent kernel", "primary_area": "", "supplementary_material": "", "author": "Theodor Misiakiewicz;Song Mei", "authorids": "~Theodor_Misiakiewicz1;~Song_Mei1", "gender": ";M", "homepage": "https://misiakie.github.io;https://www.stat.berkeley.edu/~songmei/", "dblp": "168/8360;https://dblp.org/pers/hd/m/Mei:Song", "google_scholar": "E8Jst30AAAAJ;https://scholar.google.com.hk/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Theodor_Misiakiewicz1;~Song_Mei1", "aff": "Stanford University;University of California, Berkeley", "aff_domain": "stanford.edu;berkeley.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nmisiakiewicz2022learning,\ntitle={Learning with convolution and pooling operations in kernel methods},\nauthor={Theodor Misiakiewicz and Song Mei},\nyear={2022},\nurl={https://openreview.net/forum?id=93SVBUB1r5C}\n}", "github": "", "project": "", "reviewers": "HeDj;nYey;oiTT", "site": "https://openreview.net/forum?id=93SVBUB1r5C", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;3", "correctness": "3;4;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "179;56;156", "wc_summary_review": "105;84;34", "wc_main_review": "508;131;179", "wc_review": "792;271;369", "wc_reply_reviewers": "216;107;0", "wc_reply_authors": "1304;226;427", "reply_reviewers": "1;1;0", "reply_authors": "3;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 130.33333333333334, 53.39371581833286 ], "wc_summary_review_avg": [ 74.33333333333333, 29.780679792256066 ], "wc_main_review_avg": [ 272.6666666666667, 167.55562923664752 ], "wc_review_avg": [ 477.3333333333333, 226.07127686245818 ], "wc_reply_reviewers_avg": [ 107.66666666666667, 88.18289075678015 
], "wc_reply_authors_avg": [ 652.3333333333334, 468.04724357934447 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7229234146494475932&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.berkeley.edu", "aff_unique_abbr": "Stanford;UC Berkeley", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Stanford;Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "97WDkHzofx", "title": "Interventional Black-Box Explanations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Neural Networks (DNNs) are powerful systems able to freely evolve on their own from training data. However, like any highly parametrized mathematical model, capturing the explanation of any prediction of such models is rather difficult. We believe that there exist relevant mechanisms inside the structure of post-hoc DNNs that supports transparency and interpretability. To capture these mechanisms, we quantify the effects of parameters (pieces of knowledge) on models' predictions using the framework of causality. We introduce a general formalism of the causal diagram to express cause-effect relations inside the DNN's architecture. Then, we develop a novel algorithm to construct explanations of DNN's predictions using the $do$-operator. We call our method, Interventional Black-Box Explanations. On image classification tasks, we explain the behaviour of the model and extract visual explanations from the effects of the causal filters in convolution layers. We qualitatively demonstrate that our method captures more informative concepts compared to traditional attribution-based methods. 
\nFinally, we believe that our method is orthogonal to logic-based explanation methods and can be leveraged to improve their explanations.", "keywords": "Causal Inference;Interventions;black-Box Models;Explanations;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Ola Ahmad;Simon Corbeil;Vahid Hashemi;Freddy Lecue", "authorids": "~Ola_Ahmad1;~Simon_Corbeil1;~Vahid_Hashemi1;~Freddy_Lecue1", "gender": "F;M;M;", "homepage": ";;;http://www-sop.inria.fr/members/Freddy.Lecue/", "dblp": "99/10407;;56/10094.html;02/3657.html", "google_scholar": "https://scholar.google.fr/citations?user=MbXCWNMAAAAJ;;;https://scholar.google.ca/citations?user=GLByS4gAAAAJ", "orcid": ";;0000-0002-9167-7417;", "linkedin": ";simoncorbeilletourneau/;vahid-hashemi-152aa450/;freddylecue/", "or_profile": "~Ola_Ahmad1;~Simon_Corbeil1;~Vahid_Hashemi1;~Freddy_Lecue1", "aff": "Thales Canada;;AUDI AG;INRIA", "aff_domain": "thalesdigital.io;;audi.de;inria.fr", "position": "Researcher;;Technical Team Lead;Full Professor", "bibtex": "@misc{\nahmad2022interventional,\ntitle={Interventional Black-Box Explanations},\nauthor={Ola Ahmad and Simon Corbeil and Vahid Hashemi and Freddy Lecue},\nyear={2022},\nurl={https://openreview.net/forum?id=97WDkHzofx}\n}", "github": "", "project": "", "reviewers": "TDuY;GBET;Acmr;NpLg", "site": "https://openreview.net/forum?id=97WDkHzofx", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;3;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "49;44;65;34", "wc_summary_review": "45;55;69;12", "wc_main_review": "288;611;634;429", "wc_review": "382;710;768;475", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 48.0, 11.20267825120404 ], "wc_summary_review_avg": [ 45.25, 21.00446381129497 ], "wc_main_review_avg": [ 490.5, 141.33382468468048 ], "wc_review_avg": [ 583.75, 160.01308540241325 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:U0eLNiCN2JsJ:scholar.google.com/&scioq=Interventional+Black-Box+Explanations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Thales;AUDI AG;INRIA", "aff_unique_dep": ";;", "aff_unique_url": "https://www.thalesgroup.com/ca-en;https://www.audi.com;https://www.inria.fr", "aff_unique_abbr": "Thales;AUDI;INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Canada;Germany;France" }, { "id": "97r5Y5DrJTo", "title": "The Effect of diversity in Meta-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Few-shot learning aims to learn representations that can tackle novel tasks given a small number of examples. Recent studies show that task distribution plays a vital role in the performance of the model. Conventional wisdom is that task diversity should improve the performance of meta-learning. 
In this work, we find evidence to the contrary; we study different task distributions on a myriad of models and datasets to evaluate the effect of task diversity on meta-learning algorithms. For this experiment, we train on multiple datasets with three broad classes of meta-learning models - Metric-based (e.g., Protonet, Matching Networks), Optimization-based (e.g., MAML, Reptile, and MetaOptNet), and Bayesian meta-learning models (e.g., CNAPs). Our experiments demonstrate that the effect of task diversity on all these algorithms follows a similar trend, and task diversity does not seem to offer any benefits to the learning of the model. Furthermore, we also demonstrate that even a handful of tasks, repeated over multiple batches, would be sufficient to achieve performance similar to uniform sampling, which calls into question the need for additional tasks to create better models.", "keywords": "Meta-Learning;Few-shot learning", "primary_area": "", "supplementary_material": "/attachment/da378409b28eb5d80ee793d2cabe3f0634e3539c.zip", "author": "Ramnath Kumar;Tristan Deleu;Yoshua Bengio", "authorids": "~Ramnath_Kumar1;~Tristan_Deleu1;~Yoshua_Bengio1", "gender": "M;;M", "homepage": "https://ramnathkumar181.github.io/;https://tristandeleu.github.io/;http://yoshuabengio.org", "dblp": ";192/1896;56/953", "google_scholar": "csZjvdEAAAAJ;nLNwh-wAAAAJ;kukA0LcAAAAJ", "orcid": ";;", "linkedin": ";;yoshuabengio/?originalSubdomain=ca", "or_profile": "~Ramnath_Kumar1;~Tristan_Deleu1;~Yoshua_Bengio1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;University of Montreal;University of Montreal", "aff_domain": "mila.umontreal.ca;umontreal.ca;umontreal.ca", "position": "Consultant;PhD student;Full Professor", "bibtex": "@misc{\nkumar2022the,\ntitle={The Effect of diversity in Meta-Learning},\nauthor={Ramnath Kumar and Tristan Deleu and Yoshua Bengio},\nyear={2022},\nurl={https://openreview.net/forum?id=97r5Y5DrJTo}\n}", "github": "", "project": "", "reviewers": "wVFn;MHHW;UoR2", "site": "https://openreview.net/forum?id=97r5Y5DrJTo", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;3", "wc_summary_paper": "42;47;52", "wc_summary_review": "24;40;56", "wc_main_review": "215;406;270", "wc_review": "281;493;378", "wc_reply_reviewers": "0;0;325", "wc_reply_authors": "375;841;2117", "reply_reviewers": "0;0;1", "reply_authors": "1;1;3", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 47.0, 4.08248290463863 ], "wc_summary_review_avg": [ 40.0, 13.063945294843617 ], "wc_main_review_avg": [ 297.0, 80.27868127134792 ], "wc_review_avg": [ 384.0, 86.65256295497939 ], "wc_reply_reviewers_avg": [ 108.33333333333333, 153.2064692570853 ], "wc_reply_authors_avg": [ 1111.0, 736.3495546726884 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12213071059734001817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", 
"aff_unique_norm": "University of Montreal", "aff_unique_dep": "Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "97ru13Fdmbt", "title": "Monotonicity as a requirement and as a regularizer: efficient methods and applications", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the setting where risk minimization is performed over general classes of models and consider two cases where monotonicity is treated as either a requirement to be satisfied everywhere or a useful property. We specifically consider cases where point-wise gradient penalties are used alongside the empirical risk during training. In our first contribution, we show that different choices of penalties define the regions of the input space where the property is observed. As such, previous methods result in models that are monotonic only in a small volume of the input space. We thus propose an approach that uses mixtures of training instances and random points to populate the space and enforce the penalty in a much larger region. As a second contribution, we introduce the notion of monotonicity as a regularization bias for convolutional models. In this case, we consider applications, such as image classification and generative modeling, where monotonicity is not a hard constraint but can help improve some aspects of the model. Namely, we show that using group monotonicity can be beneficial in several applications such as: (1) defining strategies to detect anomalous data, (2) allowing for controllable data generation, and (3) generating explanations for predictions. 
Our proposed approaches do not introduce significant computational overhead while leading to efficient procedures that provide extra benefits over baseline models.", "keywords": "Monotonic neural networks;Gradient penalties;Structural risk minimization", "primary_area": "", "supplementary_material": "/attachment/59cd0f58561201d4a1641c658eba212a2f3bb0ad.zip", "author": "Joao Monteiro;Mohamed Osama Ahmed;Hossein Hajimirsadeghi;Greg Mori", "authorids": "~Joao_Monteiro1;~Mohamed_Osama_Ahmed1;~Hossein_Hajimirsadeghi1;~Greg_Mori2", "gender": "M;M;M;M", "homepage": ";;http://www.cs.sfu.ca/~mori/;", "dblp": "215/5354-2;64/8131;m/GregMori;https://dblp.org/pers/hd/a/Ahmed:Mohamed_Osama", "google_scholar": "https://scholar.google.ca/citations?hl=en;;Bl9FSL0AAAAJ;https://scholar.google.ca/citations?user=jyVyVj4AAAAJ", "orcid": ";;;0000-0001-6758-1178", "linkedin": "joao-monteiro-47180256/;;;mohamed-osama-ahmed-91439a154/", "or_profile": "~Joao_Monteiro1;~Hossein_Hajimirsadeghi1;~Greg_Mori1;~Mohamed_Osama_Ahmed2", "aff": "ServiceNow Research;Borealis AI;Simon Fraser University;", "aff_domain": "servicenow.com;borealisai.com;sfu.ca;", "position": "Researcher;Principal Researcher;Full Professor;", "bibtex": "@misc{\nmonteiro2022monotonicity,\ntitle={Monotonicity as a requirement and as a regularizer: efficient methods and applications},\nauthor={Joao Monteiro and Mohamed Osama Ahmed and Hossein Hajimirsadeghi and Greg Mori},\nyear={2022},\nurl={https://openreview.net/forum?id=97ru13Fdmbt}\n}", "github": "", "project": "", "reviewers": "P7xG;onzS;KKi2;Sj4Q", "site": "https://openreview.net/forum?id=97ru13Fdmbt", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;3", "correctness": "3;2;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "36;112;120;56", "wc_summary_review": "184;2;48;71", "wc_main_review": "415;422;141;327", "wc_review": "635;536;309;454", "wc_reply_reviewers": "111;596;0;0", "wc_reply_authors": "614;1857;595;993", "reply_reviewers": "1;3;0;0", "reply_authors": "2;4;1;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 81.0, 35.81898937714463 ], "wc_summary_review_avg": [ 76.25, 66.98647251497873 ], "wc_main_review_avg": [ 326.25, 113.31675736624305 ], "wc_review_avg": [ 483.5, 119.40372691000897 ], "wc_reply_reviewers_avg": [ 176.75, 246.25939068388843 ], "wc_reply_authors_avg": [ 1014.75, 511.52926358127354 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YyNFDHgtgZsJ:scholar.google.com/&scioq=Monotonicity+as+a+requirement+and+as+a+regularizer:+efficient+methods+and+applications&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "ServiceNow;Borealis AI;Simon Fraser University", "aff_unique_dep": "Research;;", "aff_unique_url": "https://www.servicenow.com;https://www.borealisai.com;https://www.sfu.ca", "aff_unique_abbr": "ServiceNow;Borealis AI;SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Canada" }, { "id": 
"99v8tgOhZH", "title": "Multi-objective optimization for Hardware-aware Neural Architecture Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Hardware-aware Neural Architecture Search (HW-NAS) has been drawing increasing attention since it can automatically design deep neural networks optimized in a resource-constrained device. However, existing methods may be not optimal in terms of multi-object (accuracy, hardware-metrics). Thus, we propose a new multi-objective optimization method for searching promising architectures in HW-NAS. Our method addresses the architecture selection process in NAS. An architecture population is divided to small cells by given hardware-cost metrics, then, top-ranked architecture is selected within each cell. The selected ones play knobs to guide the direction of evolution in NAS. Despite its simplicity, this method leads to promising results, improving both accuracy and hardware metrics. Using latency as a hardware metric, we demonstrated that the optimized architecture extended its top accuracy to a much lower inference latency regime. We can also significantly reduce computing cost of search using both accuracy predictor and latency estimator and sharing pre-trained weights of super-network. This makes HW-NAS research more reproducible and accessible to the public. For a target hardware, we experimented on both CPU and Field Programmable Gate Array (FPGA). The codes are available at https://anonymous.4open.science/r/multi-objective-optimization-0E27/README.md.", "keywords": "multi-objective optimization;neural architecture search;evolutionary algorithm;hardware-aware;accuracy predictor;latency estimator;FPGA", "primary_area": "", "supplementary_material": "", "author": "Taehee Jeong;Elliott Delaye", "authorids": "~Taehee_Jeong1;elliott@xilinx.com", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": "taehee-jeong-173a5020/;", "or_profile": "~Taehee_Jeong1;elliott@xilinx.com", "aff": "Xilinx;", "aff_domain": "xilinx.com;", "position": "Researcher;", "bibtex": "@misc{\njeong2022multiobjective,\ntitle={Multi-objective optimization for Hardware-aware Neural Architecture Search},\nauthor={Taehee Jeong and Elliott Delaye},\nyear={2022},\nurl={https://openreview.net/forum?id=99v8tgOhZH}\n}", "github": "", "project": "", "reviewers": "5iCG;YWz5;5ZqX;yPDt", "site": "https://openreview.net/forum?id=99v8tgOhZH", "pdf_size": 0, "recommendation": "1;1;1;3", "confidence": "4;5;5;4", "correctness": "1;2;2;3", "technical_novelty": "1;1;1;1", "empirical_novelty": "1;1;1;1", "wc_summary_paper": "135;34;10;19", "wc_summary_review": "64;34;17;37", "wc_main_review": "1865;462;36;222", "wc_review": "2064;530;63;278", "wc_reply_reviewers": "758;144;0;0", "wc_reply_authors": "552;507;201;552", "reply_reviewers": "1;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 1.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.0, 0.0 ], "wc_summary_paper_avg": [ 49.5, 50.102395152327794 ], "wc_summary_review_avg": [ 38.0, 16.837458240482736 ], "wc_main_review_avg": [ 646.25, 719.6688040341891 ], "wc_review_avg": [ 733.75, 785.6037089398191 ], "wc_reply_reviewers_avg": [ 225.5, 313.0091851687423 ], "wc_reply_authors_avg": [ 453.0, 146.6475366312029 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 
0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16696026947862709539&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Xilinx", "aff_unique_dep": "", "aff_unique_url": "https://www.xilinx.com", "aff_unique_abbr": "Xilinx", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "9AuUv3LKWe2", "title": "MGA-VQA: Multi-Granularity Alignment for Visual Question Answering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning to answer visual questions is a challenging task since the multimodal inputs are within two feature spaces. Moreover, reasoning in visual question answering requires the model to understand both image and question, and align them in the same space, rather than simply memorize statistics about the question-answer pairs. Thus, it is essential to find component connections between different modalities and within each modality to achieve better attention. Previous works learned attention weights directly on the features. However, the improvement is limited since these two modality features are in two domains: image features are highly diverse, lacking structure and grammatical rules as language, and natural language features have a higher probability of missing detailed information. To better learn the attention between visual and text, we focus on how to construct input stratification and embed structural information to improve the alignment between different level components. We propose Multi-Granularity Alignment architecture for Visual Question Answering task (MGA-VQA), which learns intra- and inter-modality correlations by multi-granularity alignment, and outputs the final result by the decision fusion module. In contrast to previous works, our model splits alignment into different levels to achieve learning better correlations without needing additional data and annotations. The experiments on the VQA-v2 and GQA datasets demonstrate that our model significantly outperforms non-pretrained state-of-the-art methods on both datasets without extra pretraining data and annotations. 
Moreover, it even achieves better results than the pre-trained methods on GQA.", "keywords": "Computer Vision;Visual Question Answering", "primary_area": "", "supplementary_material": "", "author": "Peixi Xiong;Yilin Shen;Hongxia Jin", "authorids": "~Peixi_Xiong1;~Yilin_Shen1;~Hongxia_Jin1", "gender": "F;M;", "homepage": ";;", "dblp": "238/0337;30/383;", "google_scholar": "IbqmMp8AAAAJ;9PSFMzAAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Peixi_Xiong1;~Yilin_Shen1;~Hongxia_Jin1", "aff": "Northwestern University;Samsung Research America;", "aff_domain": "northwestern.edu;gmail.com;", "position": "PhD student;Principal Researcher;", "bibtex": "@misc{\nxiong2022mgavqa,\ntitle={{MGA}-{VQA}: Multi-Granularity Alignment for Visual Question Answering},\nauthor={Peixi Xiong and Yilin Shen and Hongxia Jin},\nyear={2022},\nurl={https://openreview.net/forum?id=9AuUv3LKWe2}\n}", "github": "", "project": "", "reviewers": "n6tZ;9yyn;Uy9R;tG2C", "site": "https://openreview.net/forum?id=9AuUv3LKWe2", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "91;44;42;62", "wc_summary_review": "56;22;36;24", "wc_main_review": "225;206;142;285", "wc_review": "372;272;220;371", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 59.75, 19.651653874419832 ], "wc_summary_review_avg": [ 34.5, 13.518505834595775 ], "wc_main_review_avg": [ 214.5, 51.01225342993583 ], "wc_review_avg": [ 308.75, 65.38874138565446 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13725719942019743820&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Northwestern University;Samsung", "aff_unique_dep": ";Samsung Research America", "aff_unique_url": "https://www.northwestern.edu;https://www.samsung.com/us/careers/research/", "aff_unique_abbr": "NU;SRA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "9BIN1yr5Gp", "title": "Parallel Deep Neural Networks Have Zero Duality Gap", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training deep neural networks is a well-known highly non-convex problem. In recent works, it has been shown that there is no duality gap for regularized two-layer neural networks with ReLU activation, which enables global optimization via convex programs. For multi-layer linear networks with vector outputs, we formulate convex dual problems and demonstrate that the duality gap is non-zero for depth three and deeper networks. However, by modifying the deep networks into more powerful parallel architectures, we show that the duality gap is exactly zero. Therefore, strong convex duality holds, and hence there exist equivalent convex programs that enable training deep networks to global optimality. 
We also demonstrate that the weight decay regularization in the parameters explicitly encourages low-rank solutions via closed-form expressions. For three-layer non-parallel ReLU networks, we show that strong duality holds for rank-1 data matrices; however, the duality gap is non-zero for whitened data matrices. Similarly, by transforming the neural network architecture into a corresponding parallel version, the duality gap vanishes.", "keywords": "Deep neural networks;convex duality;convex optimization", "primary_area": "", "supplementary_material": "", "author": "Yifei Wang;Tolga Ergen;Mert Pilanci", "authorids": "~Yifei_Wang2;~Tolga_Ergen1;~Mert_Pilanci3", "gender": "M;M;M", "homepage": "http://web.stanford.edu/~wangyf18/;https://tolgaergen.github.io/;https://stanford.edu/~pilanci/", "dblp": ";202/7477.html;45/8056", "google_scholar": ";https://scholar.google.com.tr/citations?user=T1pWaCsAAAAJ;aSAS-aAAAAAJ", "orcid": ";0000-0003-4806-0224;", "linkedin": ";;mert-pilanci-ba615743/", "or_profile": "~Yifei_Wang2;~Tolga_Ergen1;~Mert_Pilanci3", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nwang2022parallel,\ntitle={Parallel Deep Neural Networks Have Zero Duality Gap},\nauthor={Yifei Wang and Tolga Ergen and Mert Pilanci},\nyear={2022},\nurl={https://openreview.net/forum?id=9BIN1yr5Gp}\n}", "github": "", "project": "", "reviewers": "Pya1;c3BS;3piH;eU6M", "site": "https://openreview.net/forum?id=9BIN1yr5Gp", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;3", "correctness": "2;2;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;0;0;0", "wc_summary_paper": "64;76;126;49", "wc_summary_review": "57;25;75;24", "wc_main_review": "373;511;361;127", "wc_review": "494;612;562;200", "wc_reply_reviewers": "47;103;0;0", "wc_reply_authors": "501;195;350;315", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.75, 28.908260065247788 ], "wc_summary_review_avg": [ 45.25, 21.706853756359994 ], "wc_main_review_avg": [ 343.0, 137.93476719087178 ], "wc_review_avg": [ 467.0, 159.7404144229005 ], "wc_reply_reviewers_avg": [ 37.5, 42.40577790820492 ], "wc_reply_authors_avg": [ 340.25, 109.16816156737275 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9697613792468524971&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "9Cwxjd6nRh", "title": "High Fidelity Visualization of What Your Self-Supervised Representation Knows About", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discovering what is learned by neural networks remains a challenge. 
In self-supervised learning, classification is the most common task used to evaluate how good a representation is. However, relying only on such a downstream task can limit our understanding of how much information is contained in the representation of a given input. In this work, we study how to visualize representations learned with self-supervised models. We investigate a simple gradient descent based method to match a target representation and show the limitations of such techniques. We overcome these limitations by developing a representation-conditioned diffusion model (RCDM) that is able to generate high-quality inputs that share commonalities with a given representation. We further demonstrate how our model's generation quality is on par with state-of-the-art generative models and how the representation conditioning brings new avenues to analyze and improve self-supervised models.", "keywords": "self-supervised learning;visualization;diffusion model;conditional generative model;representation", "primary_area": "", "supplementary_material": "", "author": "Florian Bordes;Randall Balestriero;Pascal Vincent", "authorids": "~Florian_Bordes1;~Randall_Balestriero1;~Pascal_Vincent1", "gender": "M;M;M", "homepage": ";https://randallbalestriero.github.io/;http://www.iro.umontreal.ca/~vincentp", "dblp": "194/9862;175/5364;43/861", "google_scholar": "OADfWhUAAAAJ;S1x_xqcAAAAJ;WBCKQMsAAAAJ", "orcid": ";;", "linkedin": "florianbordes;randallbalestriero/;", "or_profile": "~Florian_Bordes1;~Randall_Balestriero1;~Pascal_Vincent1", "aff": "University of Montreal;Meta Facebook;Facebook A.I. Research", "aff_domain": "umontreal.ca;facebook.com;fb.com", "position": "PhD student;Postdoc;Research Scientist", "bibtex": "@misc{\nbordes2022high,\ntitle={High Fidelity Visualization of What Your Self-Supervised Representation Knows About},\nauthor={Florian Bordes and Randall Balestriero and Pascal Vincent},\nyear={2022},\nurl={https://openreview.net/forum?id=9Cwxjd6nRh}\n}", "github": "", "project": "", "reviewers": "YLR1;MCTj;rQaS;mFTx", "site": "https://openreview.net/forum?id=9Cwxjd6nRh", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;3", "correctness": "4;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;2;2;4", "wc_summary_paper": "81;55;91;94", "wc_summary_review": "147;44;129;23", "wc_main_review": "773;399;607;261", "wc_review": "1001;498;827;378", "wc_reply_reviewers": "0;0;29;68", "wc_reply_authors": "1213;1008;1517;203", "reply_reviewers": "0;0;1;1", "reply_authors": "2;2;3;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 80.25, 15.35211711784404 ], "wc_summary_review_avg": [ 85.75, 53.15719612620666 ], "wc_main_review_avg": [ 510.0, 195.51214796017152 ], "wc_review_avg": [ 676.0, 249.4564090176879 ], "wc_reply_reviewers_avg": [ 24.25, 27.896012259819503 ], "wc_reply_authors_avg": [ 985.25, 486.58523148570794 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11708986819601945526&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "University 
of Montreal;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://wwwumontreal.ca;https://meta.com", "aff_unique_abbr": "UM;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Canada;United States" }, { "id": "9FfAEgUYGON", "title": "Mismatched No More: Joint Model-Policy Optimization for Model-Based RL", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many model-based reinforcement learning (RL) methods follow a similar template: fit a model to previously observed data, and then use data from that model for RL or planning. However, models that achieve better training performance (e.g., lower MSE) are not necessarily better for control: an RL agent may seek out the small fraction of states where an accurate model makes mistakes, or it might act in ways that do not expose the errors of an inaccurate model. As noted in prior work, there is an objective mismatch: models are useful if they yield good policies, but they are trained to maximize their accuracy, rather than the performance of the policies that result from them. In this work we propose a single objective for jointly training the model and the policy, such that updates to either component increases a lower bound on expected return. This joint optimization mends the objective mismatch in prior work. Our objective is a global lower bound on expected return, and this bound becomes tight under certain assumptions. The resulting algorithm (MnM) is conceptually similar to a GAN: a classifier distinguishes between real and fake transitions, the model is updated to produce transitions that look realistic, and the policy is updated to avoid states where the model predictions are unrealistic.", "keywords": "reinforcement learning;model-based RL;joint optimization", "primary_area": "", "supplementary_material": "", "author": "Benjamin Eysenbach;Alexander Khazatsky;Sergey Levine;Ruslan Salakhutdinov", "authorids": "~Benjamin_Eysenbach1;~Alexander_Khazatsky1;~Sergey_Levine1;~Ruslan_Salakhutdinov1", "gender": "M;M;M;M", "homepage": "https://ben-eysenbach.github.io/;https://www.linkedin.com/in/alexander-khazatsky-b98841149/;https://people.eecs.berkeley.edu/~svlevine/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "192/1863;;80/7594;", "google_scholar": "DRnOvU8AAAAJ;;8R35rCwAAAAJ;", "orcid": "0009-0000-7136-6307;;;", "linkedin": "benjamin-eysenbach-a7235775/;;;", "or_profile": "~Benjamin_Eysenbach1;~Alexander_Khazatsky1;~Sergey_Levine1;~Russ_Salakhutdinov1", "aff": "Carnegie Mellon University;University of California, Berkeley;Google;School of Computer Science, Carnegie Mellon University", "aff_domain": "cmu.edu;berkeley.edu;google.com;cs.cmu.edu", "position": "PhD student;Undergraduate Researcher;Research Scientist;Full Professor", "bibtex": "@misc{\neysenbach2022mismatched,\ntitle={Mismatched No More: Joint Model-Policy Optimization for Model-Based {RL}},\nauthor={Benjamin Eysenbach and Alexander Khazatsky and Sergey Levine and Ruslan Salakhutdinov},\nyear={2022},\nurl={https://openreview.net/forum?id=9FfAEgUYGON}\n}", "github": "", "project": "", "reviewers": "D2vZ;yCzC;uKVv;7QkN", "site": "https://openreview.net/forum?id=9FfAEgUYGON", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;3;4", "correctness": "2;4;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "131;137;61;116", "wc_summary_review": "29;21;27;46", "wc_main_review": "957;391;132;447", "wc_review": "1117;549;220;609", 
"wc_reply_reviewers": "174;28;94;0", "wc_reply_authors": "2055;535;331;1250", "reply_reviewers": "1;1;1;0", "reply_authors": "5;2;3;4", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 111.25, 30.003124837256536 ], "wc_summary_review_avg": [ 30.75, 9.283722313813572 ], "wc_main_review_avg": [ 481.75, 299.011182901242 ], "wc_review_avg": [ 623.75, 320.9808210781448 ], "wc_reply_reviewers_avg": [ 74.0, 67.06713054842886 ], "wc_reply_authors_avg": [ 1042.75, 676.7571111558415 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8013275836022497815&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;University of California, Berkeley;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.cmu.edu;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "CMU;UC Berkeley;Google", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Berkeley;Mountain View;Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "9HXfisrWl1", "title": "DeepDebug: Fixing Python Bugs Using Stack Traces, Backtranslation, and Code Skeletons", "track": "main", "status": "Reject", "tldr": "", "abstract": "The joint task of bug localization and program repair is an integral part of the software development process. In this work we present DeepDebug, an approach to automated debugging using large, pretrained transformers. We begin by training a bug-creation model on reversed commit data for the purpose of generating synthetic bugs. We apply these synthetic bugs toward two ends. First, we directly train a backtranslation model on all functions from 200K repositories. Next, we focus on 10K repositories for which we can execute tests, and create buggy versions of all functions in those repositories that are covered by passing tests. This provides us with rich debugging information such as stack traces and print statements, which we use to finetune our model which was pretrained on raw source code. Finally, we strengthen all our models by expanding the context window beyond the buggy function itself, and adding a skeleton consisting of that function's parent class, imports, signatures, docstrings, and method bodies, in order of priority. On the QuixBugs benchmark, we increase the total number of fixes found by over 50%, while also decreasing the false positive rate from 35% to 5% and decreasing the timeout from six hours to one minute. 
On our own benchmark of executable tests, our model fixes 68% of all bugs on its first attempt without using traces, and after adding traces it fixes 75% on the first attempt.", "keywords": "program repair;bugpatching;backtranslation;transformers", "primary_area": "", "supplementary_material": "", "author": "Dawn Drain;Colin Clement;Guillermo Serrato Castilla;Neel Sundaresan", "authorids": "~Dawn_Drain1;~Colin_Clement1;~Guillermo_Serrato_Castilla1;~Neel_Sundaresan3", "gender": "F;;;", "homepage": ";https://cbclement.com;;https://www.linkedin.com/in/neel-sundaresan-a964a2/", "dblp": "274/2078.html;;;s/NeelSundaresan.html", "google_scholar": "NIrBaDIAAAAJ;J2aZLEYAAAAJ;;", "orcid": "0000-0002-6606-4141;0000-0002-3727-7308;;", "linkedin": "dawn-drain-414601a8;colin-b-clement/;guillermo-serrato;neel-sundaresan-a964a2/", "or_profile": "~Dawn_Drain1;~Colin_Clement1;~Guillermo_Serrato_Castilla1;~Neel_Sundaresan3", "aff": "Microsoft;Microsoft;Microsoft;University of California, Santa Cruz", "aff_domain": "microsoft.com;microsoft.com;microsoft.com;ucsc.edu", "position": "Researcher;Senior Research Manager;Researcher;Full Professor (adjunct)", "bibtex": "@misc{\ndrain2022deepdebug,\ntitle={DeepDebug: Fixing Python Bugs Using Stack Traces, Backtranslation, and Code Skeletons},\nauthor={Dawn Drain and Colin Clement and Guillermo Serrato Castilla and Neel Sundaresan},\nyear={2022},\nurl={https://openreview.net/forum?id=9HXfisrWl1}\n}", "github": "", "project": "", "reviewers": "K9aQ;sVvX;vC4c;Rbe2", "site": "https://openreview.net/forum?id=9HXfisrWl1", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "64;184;80;35", "wc_summary_review": "88;69;64;2", "wc_main_review": "195;889;296;256", "wc_review": "347;1142;440;293", "wc_reply_reviewers": "0;0;7;85", "wc_reply_authors": "374;807;97;370", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.75, 56.202201914159915 ], "wc_summary_review_avg": [ 55.75, 32.29841327371981 ], "wc_main_review_avg": [ 409.0, 279.45214259332494 ], "wc_review_avg": [ 555.5, 342.67367859233076 ], "wc_reply_reviewers_avg": [ 23.0, 35.90960874195095 ], "wc_reply_authors_avg": [ 412.0, 254.1938236857851 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7813102027202328433&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Microsoft;University of California, Santa Cruz", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.ucsc.edu", "aff_unique_abbr": "Microsoft;UCSC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "9HmtMeHmyR4", "title": "Self-Supervision is All You Need for Solving Rubik's Cube", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While combinatorial problems are of great academic and practical importance, 
previous approaches like explicit heuristics and reinforcement learning have been complex and costly. To address this, we developed a simple and robust method to train a Deep Neural Network (DNN) through self-supervised learning for solving a goal-predefined combinatorial problem. Assuming that, along a path of random moves connecting two problem states, more optimal moves occur more frequently, the DNN can approximate an optimal solver by learning to predict the last move of a random scramble based on the problem state. Tested on 1,000 scrambled Rubik's Cube instances, a Transformer-based model could solve all of them near-optimally using a breadth-first search; with a maximum breadth of $10^3$, the mean solution length was $20.5$ moves. The proposed method may apply to other goal-predefined combinatorial problems, though it has a few constraints.", "keywords": "Rubik's Cube;self-supervised learning;combinatorial search;pathfinding;planning", "primary_area": "", "supplementary_material": "", "author": "Kyo Takano", "authorids": "~Kyo_Takano1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ntakano2022selfsupervision,\ntitle={Self-Supervision is All You Need for Solving Rubik's Cube},\nauthor={Kyo Takano},\nyear={2022},\nurl={https://openreview.net/forum?id=9HmtMeHmyR4}\n}", "github": "", "project": "", "reviewers": "rL6t;DxXr;vZZv;RsVr", "site": "https://openreview.net/forum?id=9HmtMeHmyR4", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;4;4;5", "correctness": "2;2;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "16;92;56;159", "wc_summary_review": "22;85;258;229", "wc_main_review": "215;474;613;584", "wc_review": "253;651;927;972", "wc_reply_reviewers": "0;502;0;0", "wc_reply_authors": "273;1224;290;559", "reply_reviewers": "0;2;0;0", "reply_authors": "1;3;1;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 80.75, 52.57078561330428 ], "wc_summary_review_avg": [ 148.5, 98.11345473481198 ], "wc_main_review_avg": [ 471.5, 156.90522617172445 ], "wc_review_avg": [ 700.75, 286.2344973968023 ], "wc_reply_reviewers_avg": [ 125.5, 217.3723763498941 ], "wc_reply_authors_avg": [ 586.5, 385.14834804267304 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6961073818161935467&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Representational Continuity for Unsupervised Continual Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7119", "id": "9Hrka5PA7LW", "poster": "", "openreview": "https://openreview.net/forum?id=9Hrka5PA7LW", "slides": "https://iclr.cc/virtual/2022/poster/7119", "video": "https://iclr.cc/virtual/2022/poster/7119", "author_site": "Divyam Madaan, Jaehong Yoon, Yuanchun Li, Yunxin Liu, Sung Ju Hwang", "tldr": "", "abstract": "Continual learning (CL) aims to learn a sequence of tasks without forgetting the previously acquired knowledge. 
However, recent CL advances are restricted to supervised continual learning (SCL) scenarios. Consequently, they are not scalable to real-world applications where the data distribution is often biased and unannotated. In this work, we focus on unsupervised continual learning (UCL), where we learn the feature representations on an unlabelled sequence of tasks and show that reliance on annotated data is not necessary for continual learning. We conduct a systematic study analyzing the learned feature representations and show that unsupervised visual representations are surprisingly more robust to catastrophic forgetting, consistently achieve better performance, and generalize better to out-of-distribution tasks than SCL. Furthermore, through qualitative analysis of the learned representations, we find that UCL achieves a smoother loss landscape and learns meaningful feature representations. Additionally, we propose Lifelong Unsupervised Mixup (LUMP), a simple yet effective technique that interpolates between the current task and previous tasks' instances to alleviate catastrophic forgetting for unsupervised representations. ", "keywords": "Continual Learning;Representational Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Divyam Madaan;Jaehong Yoon;Yuanchun Li;Yunxin Liu;Sung Ju Hwang", "authorids": "~Divyam_Madaan1;~Jaehong_Yoon1;~Yuanchun_Li1;~Yunxin_Liu2;~Sung_Ju_Hwang1", "gender": "M;M;M;;", "homepage": "https://dmadaan.com/;https://jaehong31.github.io/;http://yuanchun-li.github.io/;;", "dblp": "239/4899;203/4449;;;", "google_scholar": "DNk4dZkAAAAJ;-5comoUAAAAJ;B3hg7BgAAAAJ;;", "orcid": ";;0000-0002-1591-2526;;", "linkedin": ";jaehongyoon/;;;", "or_profile": "~Divyam_Madaan1;~Jaehong_Yoon1;~Yuanchun_Li1;~Yunxin_Liu2;~Sung_Ju_Hwang1", "aff": "New York University;Korea Advanced Institute of Science and Technology (KAIST);Institute for AI Industry Research, Tsinghua University;;", "aff_domain": "nyu.edu;kaist.ac.kr;tsinghua.edu.cn;;", "position": "PhD student;PhD student;Assistant Professor;;", "bibtex": "@inproceedings{\nmadaan2022representational,\ntitle={Representational Continuity for Unsupervised Continual Learning},\nauthor={Divyam Madaan and Jaehong Yoon and Yuanchun Li and Yunxin Liu and Sung Ju Hwang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9Hrka5PA7LW}\n}", "github": "", "project": "", "reviewers": "u1jR;QCJy;tbme;synL", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;4", "correctness": "3;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "59;79;157;62", "wc_summary_review": "90;42;80;23", "wc_main_review": "308;256;216;229", "wc_review": "457;377;453;314", "wc_reply_reviewers": "17;21;0;12", "wc_reply_authors": "1149;609;687;425", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.25, 39.85207020971433 ], "wc_summary_review_avg": [ 58.75, 27.325583250865844 ], "wc_main_review_avg": [ 252.25, 35.27304211433995 ], "wc_review_avg": [ 400.25, 59.12433931977591 ], "wc_reply_reviewers_avg": [ 12.5, 7.88986691902975 ], "wc_reply_authors_avg": [ 717.5, 266.6697395656283 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193
], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 174, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=524440573047821971&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=9Hrka5PA7LW", "email": "nyu.edu;kaist.ac.kr;tsinghua.edu.cn;;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "New York University;Korea Advanced Institute of Science and Technology;Tsinghua University", "aff_unique_dep": ";;Institute for AI Industry Research", "aff_unique_url": "https://www.nyu.edu;https://www.kaist.ac.kr;https://www.tsinghua.edu.cn", "aff_unique_abbr": "NYU;KAIST;Tsinghua", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;South Korea;China" }, { "id": "9KVfvieKho6", "title": "LPMARL: Linear Programming based Implicit Task Assigment for Hiearchical Multi-Agent Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Training a multi-agent reinforcement learning (MARL) model with a sparse reward is notoriously difficult because the final outcome (i.e., success or failure) is induced by numerous combinations of interactions among agents. Earlier studies have tried to resolve this issue by using hierarchical MARL to decompose the main task into subproblems or employing an intrinsic reward to induce interactions for learning an effective policy. However, none of the methodologies have shown significant success. In this study, we employ a hierarchically structured policy to induce effective coordination among agents. At every step, LPMARL conducts the two hierarchical decision-makings: (1) solving an agent-task assignment problem and (2) solving a local cooperative game among agents assigned to the same task. For the first step, LPMARL formulates the agent-task assignment problem into a resource assignment problem, a type of linear programming (LP). For this, LPMARL uses a graph neural network to generate state-dependent cost coefficients for the LP problem. The solution of the formulated LP is the assignments of agents to tasks, which decompose agents into tasks to accomplish the sub-goals among the agents in the group. For the lower-level decision, LPMARL employs a general MARL strategy to solve each sub-task. We train the GNN generating the state-dependent LP for high-level decisions and the low-level cooperative MARL policy together end-to-end using implicit function theorem. 
We empirically demonstrate that our algorithm outperforms existing algorithms in various mixed cooperative-competitive environments.\n", "keywords": "Multi-Agent Reinforcement Learning;Hierarchical Multi-Agent Reinforcement Learning;Implicit Deep Learning;Differentiable Optimization", "primary_area": "", "supplementary_material": "/attachment/42a574b53dde27e47eae4a2db7984ab63e1d4952.zip", "author": "Kyuree Ahn;Jinkyoo Park", "authorids": "~Kyuree_Ahn1;~Jinkyoo_Park1", "gender": "F;M", "homepage": ";http://silab.kaist.ac.kr/", "dblp": ";156/7535", "google_scholar": "q0d5bq4AAAAJ;sH2a0nkAAAAJ", "orcid": ";0000-0003-2620-1479", "linkedin": ";", "or_profile": "~Kyuree_Ahn1;~Jinkyoo_Park1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nahn2022lpmarl,\ntitle={{LPMARL}: Linear Programming based Implicit Task Assigment for Hiearchical Multi-Agent Reinforcement Learning},\nauthor={Kyuree Ahn and Jinkyoo Park},\nyear={2022},\nurl={https://openreview.net/forum?id=9KVfvieKho6}\n}", "github": "", "project": "", "reviewers": "C9Ff;85J9;ZBdT;sw9Q;tszB", "site": "https://openreview.net/forum?id=9KVfvieKho6", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "5;2;5;3;3", "correctness": "3;3;3;4;3", "technical_novelty": "2;3;2;3;2", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "41;35;55;54;60", "wc_summary_review": "32;94;47;21;39", "wc_main_review": "251;507;476;262;214", "wc_review": "324;636;578;337;313", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 3.6, 1.2000000000000002 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 49.0, 9.402127418834526 ], "wc_summary_review_avg": [ 46.6, 25.19206224190469 ], "wc_main_review_avg": [ 342.0, 123.48765120448279 ], "wc_review_avg": [ 437.6, 139.73202925600128 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4082482904638631, "corr_recommendation_correctness": 0.6123724356957946, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7UcrYH5CHGYJ:scholar.google.com/&scioq=LPMARL:+Linear+Programming+based+Implicit+Task+Assigment+for+Hiearchical+Multi-Agent+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Adversarially Robust Conformal Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6450", "id": "9L1BsI4wP1H", "poster": "", "openreview": "https://openreview.net/forum?id=9L1BsI4wP1H", "slides": "https://iclr.cc/virtual/2022/poster/6450", "video": "https://iclr.cc/virtual/2022/poster/6450", "author_site": "Asaf Gendler, Tsui-Wei Weng, Luca Daniel, Yaniv Romano", "tldr": "", "abstract": "Conformal prediction is a 
model-agnostic tool for constructing prediction sets that are valid under the common i.i.d. assumption; it has been applied to quantify the prediction uncertainty of deep net classifiers. In this paper, we generalize this framework to the case where adversaries exist during inference time, under which the i.i.d. assumption is grossly violated. By combining conformal prediction with randomized smoothing, our proposed method forms a prediction set with a finite-sample coverage guarantee that holds for any data distribution with $\\ell_2$-norm bounded adversarial noise, generated by any adversarial attack algorithm. The core idea is to bound the Lipschitz constant of the non-conformity score by smoothing it with Gaussian noise and leverage this knowledge to account for the effect of the unknown adversarial perturbation. We demonstrate the necessity of our method in the adversarial setting and the validity of our theoretical guarantee on three widely used benchmark data sets: CIFAR10, CIFAR100, and ImageNet.", "keywords": "Conformal Prediction;Adversarial Robustness;Randomized Smoothing;Uncertainty Estimation;Calibration", "primary_area": "", "supplementary_material": "/attachment/dd1d8b8cd038c5f5af21205164d7a36773f76009.zip", "author": "Asaf Gendler;Tsui-Wei Weng;Luca Daniel;Yaniv Romano", "authorids": "~Asaf_Gendler1;~Tsui-Wei_Weng1;~Luca_Daniel1;~Yaniv_Romano1", "gender": "M;F;;M", "homepage": ";https://lilywenglab.github.io;https://www.mit.edu/~dluca/;https://sites.google.com/view/yaniv-romano/", "dblp": "308/4674;177/9197;35/5202;142/0021", "google_scholar": "O7q6DkEAAAAJ;v8GM4xoAAAAJ;;L_m67ywAAAAJ", "orcid": ";;0000-0002-5880-3151;", "linkedin": "asaf-gendler-7b9051163/;;;", "or_profile": "~Asaf_Gendler1;~Tsui-Wei_Weng1;~Luca_Daniel1;~Yaniv_Romano1", "aff": "Technion;University of California, San Diego;;Technion, Technion", "aff_domain": "ac.il;ucsd.edu;;technion.ac.il", "position": "MS student;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\ngendler2022adversarially,\ntitle={Adversarially Robust Conformal Prediction},\nauthor={Asaf Gendler and Tsui-Wei Weng and Luca Daniel and Yaniv Romano},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9L1BsI4wP1H}\n}", "github": "", "project": "", "reviewers": "m5fr;oSbg;t8Jp;Cuc8", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "67;220;66;99", "wc_summary_review": "41;125;49;34", "wc_main_review": "567;409;193;247", "wc_review": "675;754;308;380", "wc_reply_reviewers": "970;0;0;0", "wc_reply_authors": "1463;145;203;145", "reply_reviewers": "2;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 113.0, 63.186232677696495 ], "wc_summary_review_avg": [ 62.25, 36.615399765672365 ], "wc_main_review_avg": [ 354.0, 146.42745644174798 ], "wc_review_avg": [ 529.25, 189.0652995660494 ], "wc_reply_reviewers_avg": [ 242.5, 420.0223208354527 ], "wc_reply_authors_avg": [ 489.0, 562.8374543329539 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 
-0.5555555555555555, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7245917339471935490&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=9L1BsI4wP1H", "email": "ac.il;ucsd.edu;;technion.ac.il", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Technion - Israel Institute of Technology;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.ucsd.edu", "aff_unique_abbr": "Technion;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Israel;United States" }, { "id": "9LJkfH5rtc", "title": "Patchwise Sparse Dictionary Learning from pre-trained Neural Network Activation Maps for Anomaly Detection in Images", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this work, we investigate a methodology to perform anomaly detection and localization on images. The method leverages both sparse representation learning and the adoption of a pre-trained neural network for classification purposes. The objective is to assess the effectiveness of the K-SVD sparse dictionary learning algorithm and understand the role of neural network activation maps as data descriptors. We extract meaningful representation features and build a sparse dictionary of the most expressive ones. The dictionary is built only over features coming from images without anomalies. Thus, images containing anomalies will either have a non-sparse representation as linear combinations of the dictionary elements or a high reconstruction error. We show that the proposed pipeline achieves state-of-the-art performance in terms of AUC-ROC score over benchmarks such as MVTec Anomaly Detection, Rd-MVTec Anomaly Detection, Magnetic Tiles Defect, and BeanTech Anomaly Detection Datasets. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/ac70bd3911e69dd64ff2fda195759ade1eda4355.zip", "author": "Stefano Samele;Matteo Matteucci", "authorids": "~Stefano_Samele1;~Matteo_Matteucci1", "gender": "M;M", "homepage": ";https://www.deib.polimi.it/eng/people/details/267262", "dblp": "297/0148;19/2200", "google_scholar": "9eINCUAAAAAJ;https://scholar.google.com.tw/citations?user=PdbEg5YAAAAJ", "orcid": "0000-0002-3325-9225;0000-0002-8306-6739", "linkedin": ";matteo-matteucci-a5b59717/", "or_profile": "~Stefano_Samele1;~Matteo_Matteucci1", "aff": "Politecnico di Milano;Politecnico di Milano", "aff_domain": "polimi.it;polimi.it", "position": "PhD student;Full Professor", "bibtex": "@misc{\nsamele2022patchwise,\ntitle={Patchwise Sparse Dictionary Learning from pre-trained Neural Network Activation Maps for Anomaly Detection in Images},\nauthor={Stefano Samele and Matteo Matteucci},\nyear={2022},\nurl={https://openreview.net/forum?id=9LJkfH5rtc}\n}", "github": "", "project": "", "reviewers": "hCgd;xLeb;Az8x;dADN", "site": "https://openreview.net/forum?id=9LJkfH5rtc", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;5;5;4", "correctness": "3;3;4;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "62;96;71;92", "wc_summary_review": "71;110;19;20", "wc_main_review": "207;325;133;444", "wc_review": "340;531;223;556", "wc_reply_reviewers": "79;17;0;0", "wc_reply_authors": "388;588;453;594", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 80.25, 14.184057952504283 ], "wc_summary_review_avg": [ 55.0, 38.08543028508409 ], "wc_main_review_avg": [ 277.25, 118.14054130568388 ], "wc_review_avg": [ 412.5, 137.65990701725758 ], "wc_reply_reviewers_avg": [ 24.0, 32.50384592629001 ], "wc_reply_authors_avg": [ 505.75, 88.31867016661879 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6927222596407594634&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "title": "Distributionally Robust Fair Principal Components via Geodesic Descents", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6615", "id": "9NVd-DMtThY", "poster": "", "openreview": "https://openreview.net/forum?id=9NVd-DMtThY", "slides": "https://iclr.cc/virtual/2022/poster/6615", "video": "https://iclr.cc/virtual/2022/poster/6615", "author_site": "Hieu Vu, Toan Tran, Man-Chung Yue, Viet Anh Nguyen", "tldr": "", "abstract": "Principal component analysis is a simple yet useful dimensionality reduction technique in modern machine learning pipelines. 
In consequential domains such as college admission, healthcare and credit approval, it is imperative to take into account emerging criteria such as the fairness and the robustness of the learned projection. In this paper, we propose a distributionally robust optimization problem for principal component analysis which internalizes a fairness criterion in the objective function. The learned projection thus balances the trade-off between the total reconstruction error and the reconstruction error gap between subgroups, taken in the min-max sense over all distributions in a moment-based ambiguity set. The resulting optimization problem over the Stiefel manifold can be efficiently solved by a Riemannian subgradient descent algorithm with a sub-linear convergence rate. Our experimental results on real-world datasets show the merits of our proposed method over state-of-the-art baselines. ", "keywords": "fair principal component analysis;distributionally robust optimization;manifold optimization", "primary_area": "", "supplementary_material": "/attachment/5acab76e829bf9742c11d96850e8882322444cbe.zip", "author": "Hieu Vu;Toan Tran;Man-Chung Yue;Viet Anh Nguyen", "authorids": "~Hieu_Vu1;~Toan_Tran1;~Man-Chung_Yue1;~Viet_Anh_Nguyen2", "gender": "M;M;;M", "homepage": "https://github.com/hieuvt29;;;http://www.vietanhnguyen.net", "dblp": ";207/8479-3;;", "google_scholar": ";https://scholar.google.com.au/citations?user=PnwSuNMAAAAJ;;3iyf-EoAAAAJ", "orcid": ";0000-0001-7182-7548;;", "linkedin": ";;;", "or_profile": "~Hieu_Vu1;~Toan_Tran1;~Man-Chung_Yue1;~Viet_Anh_Nguyen2", "aff": "University of Iowa;Hanoi University of Science and Technology;;VinAI Research, Vietnam", "aff_domain": "cs.uiowa.edu;hust.edu.vn;;vinai.io", "position": "PhD student;Lecturer;;Research Scientist", "bibtex": "@inproceedings{\nvu2022distributionally,\ntitle={Distributionally Robust Fair Principal Components via Geodesic Descents},\nauthor={Hieu Vu and Toan Tran and Man-Chung Yue and Viet Anh Nguyen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9NVd-DMtThY}\n}", "github": "", "project": "", "reviewers": "oUxn;8Bm5;wgFH;Hi3C", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "5;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "57;51;80;37", "wc_summary_review": "264;33;34;46", "wc_main_review": "107;204;402;146", "wc_review": "428;288;516;229", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1127;891;1709;752", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 56.25, 15.514106484100203 ], "wc_summary_review_avg": [ 94.25, 98.1386137053097 ], "wc_main_review_avg": [ 214.75, 113.48430508224475 ], "wc_review_avg": [ 365.25, 113.13128435583148 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1119.75, 365.66198530883685 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9660473860857986539&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=9NVd-DMtThY", "email": "cs.uiowa.edu;hust.edu.vn;;vinai.io", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Iowa;Hanoi University of Science and Technology;VinAI Research", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uiowa.edu;https://www.hust.edu.vn;https://www.vin.ai", "aff_unique_abbr": "UIowa;HUST;VinAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hanoi", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Vietnam" }, { "title": "Audio Lottery: Speech Recognition Made Ultra-Lightweight, Noise-Robust, and Transferable", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6428", "id": "9Nk6AJkVYB", "poster": "", "openreview": "https://openreview.net/forum?id=9Nk6AJkVYB", "slides": "https://iclr.cc/virtual/2022/poster/6428", "video": "https://iclr.cc/virtual/2022/poster/6428", "author_site": "Shaojin Ding, Tianlong Chen, Zhangyang Wang", "tldr": "", "abstract": "Lightweight speech recognition models have seen explosive demands owing to a growing amount of speech-interactive features on mobile devices. Since designing such systems from scratch is non-trivial, practitioners typically choose to compress large (pre-trained) speech models. Recently, lottery ticket hypothesis reveals the existence of highly sparse subnetworks that can be trained in isolation without sacrificing the performance of the full models. In this paper, we investigate the tantalizing possibility of using lottery ticket hypothesis to discover lightweight speech recognition models, that are (1) robust to various noise existing in speech; (2) transferable to fit the open-world personalization; and 3) compatible with structured sparsity. We conducted extensive experiments on CNN-LSTM, RNN-Transducer, and Transformer models, and verified the existence of highly sparse winning tickets that can match the full model performance across those backbones. We obtained winning tickets that have less than 20% of full model weights on all backbones, while the most lightweight one only keeps 4.4% weights. Those winning tickets generalize to structured sparsity with no performance loss, and transfer exceptionally from large source datasets to various target datasets. Perhaps most surprisingly, when the training utterances have high background noises, the winning tickets even substantially outperform the full models, showing the extra bonus of noise robustness by inducing sparsity. 
Codes are available at https://github.com/VITA-Group/Audio-Lottery.", "keywords": "Speech Recognition;Lottery Ticket Hypothesis", "primary_area": "", "supplementary_material": "", "author": "Shaojin Ding;Tianlong Chen;Zhangyang Wang", "authorids": "~Shaojin_Ding1;~Tianlong_Chen1;~Zhangyang_Wang1", "gender": "M;M;M", "homepage": ";https://tianlong-chen.github.io;https://vita-group.github.io", "dblp": "226/1807;;119/4026", "google_scholar": "7dnqDRAAAAAJ;LE3ctn0AAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0001-7774-8197;", "linkedin": ";tianlong-chen-783862167/;", "or_profile": "~Shaojin_Ding1;~Tianlong_Chen1;~Zhangyang_Wang1", "aff": "Google;University of Texas, Austin;University of Texas, Austin", "aff_domain": "google.com;utexas.edu;utexas.edu", "position": "Software Engineer;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nding2022audio,\ntitle={Audio Lottery: Speech Recognition Made Ultra-Lightweight, Noise-Robust, and Transferable},\nauthor={Shaojin Ding and Tianlong Chen and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9Nk6AJkVYB}\n}", "github": "", "project": "", "reviewers": "5kPn;etjM;5xPW;j7YG", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "414;61;87;207", "wc_summary_review": "34;62;19;111", "wc_main_review": "71;263;327;241", "wc_review": "519;386;433;559", "wc_reply_reviewers": "237;222;121;99", "wc_reply_authors": "1419;1374;969;1139", "reply_reviewers": "2;2;1;2", "reply_authors": "5;4;2;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 192.25, 139.36889000060236 ], "wc_summary_review_avg": [ 56.5, 35.04639781775011 ], "wc_main_review_avg": [ 225.5, 94.62954084217043 ], "wc_review_avg": [ 474.25, 68.32779449096832 ], "wc_reply_reviewers_avg": [ 169.75, 60.4870853984551 ], "wc_reply_authors_avg": [ 1225.25, 182.18723198951128 ], "reply_reviewers_avg": [ 1.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 1.0897247358851685 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2898311547497608739&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=9Nk6AJkVYB", "email": "google.com;utexas.edu;utexas.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;University of Texas at Austin", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.utexas.edu", "aff_unique_abbr": "Google;UT Austin", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Mountain View;Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Visual Representation Learning Does Not Generalize Strongly Within the Same Domain", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6276", "id": "9RUHPlladgh", "poster": "", "openreview": "https://openreview.net/forum?id=9RUHPlladgh", "slides": "https://iclr.cc/virtual/2022/poster/6276", "video": "https://iclr.cc/virtual/2022/poster/6276", "author_site": "Lukas Schott, Julius von 
Kuegelgen, Frederik Tr\u00e4uble, Peter Gehler, Chris Russell, Matthias Bethge, Bernhard Schoelkopf, Francesco Locatello, Wieland Brendel", "tldr": "", "abstract": "An important component for generalization in machine learning is to uncover underlying latent factors of variation as well as the mechanism through which each factor acts in the world.\nIn this paper, we test whether 17 unsupervised, weakly supervised, and fully supervised representation learning approaches correctly infer the generative factors of variation in simple datasets (dSprites, Shapes3D, MPI3D) from controlled environments, and on our contributed CelebGlow dataset. \nIn contrast to prior robustness work that introduces novel factors of variation during test time, such as blur or other (un)structured noise, we here recompose, interpolate, or extrapolate only existing factors of variation from the training data set (e.g., small and medium-sized objects during training and large objects during testing). Models that learn the correct mechanism should be able to generalize to this benchmark.\nIn total, we train and test 2000+ models and observe that all of them struggle to learn the underlying mechanism regardless of supervision signal and architectural bias. Moreover, the generalization capabilities of all tested models drop significantly as we move from artificial datasets towards more realistic real-world datasets.\nDespite their inability to identify the correct mechanism, the models are quite modular as their ability to infer other in-distribution factors remains fairly stable, providing only a single factor is out-of-distribution. These results point to an important yet understudied problem of learning mechanistic models of observations that can facilitate generalization.", "keywords": "Generalization;Composition;Out of distribution;Disentanglement", "primary_area": "", "supplementary_material": "/attachment/1afb08ed0c165f2166e4314f0d76693796a29268.zip", "author": "Lukas Schott;Julius Von K\u00fcgelgen;Frederik Tr\u00e4uble;Peter Vincent Gehler;Chris Russell;Matthias Bethge;Bernhard Sch\u00f6lkopf;Francesco Locatello;Wieland Brendel", "authorids": "~Lukas_Schott2;~Julius_Von_K\u00fcgelgen1;~Frederik_Tr\u00e4uble1;~Peter_Vincent_Gehler1;~Chris_Russell3;~Matthias_Bethge1;~Bernhard_Sch\u00f6lkopf1;~Francesco_Locatello1;~Wieland_Brendel1", "gender": ";M;M;;M;M;;M;M", "homepage": ";https://sites.google.com/view/julius-von-kuegelgen/home;https://ei.is.tuebingen.mpg.de/person/ftraeuble;;https://www.oii.ox.ac.uk/people/profiles/chris-russell/;https://bethgelab.org;;https://twitter.com/FrancescoLocat8;", "dblp": ";223/5666;;;57/9988-1;77/3005;;195/6074;37/11107", "google_scholar": ";6EOl3hAAAAAJ;https://scholar.google.de/citations?user=oc2OOyMAAAAJ;;https://scholar.google.co.uk/citations?user=RM2sHhYAAAAJ;https://scholar.google.com/citations?hl=en;;;v-JL-hsAAAAJ", "orcid": ";0000-0001-6469-4118;;;0000-0003-1665-1759;;;;", "linkedin": ";julius-von-k%C3%BCgelgen/;;;;;;;", "or_profile": "~Lukas_Schott2;~Julius_Von_K\u00fcgelgen1;~Frederik_Tr\u00e4uble1;~Peter_Vincent_Gehler1;~Chris_Russell3;~Matthias_Bethge1;~Bernhard_Sch\u00f6lkopf1;~Francesco_Locatello1;~Wieland_Brendel1", "aff": ";, Max Planck Institute for Intelligent Systems;Max Planck Institute for Intelligent Systems;;Amazon;University of Tuebingen;;Amazon;University of Tuebingen", "aff_domain": ";is.tuebingen.mpg.de;is.tuebingen.mpg.de;;amazon.com;uni-tuebingen.de;;amazon.com;uni-tuebingen.de", "position": ";PhD student;PhD student;;Researcher;Full Professor;;Senior 
Applied Scientist;Group Leader", "bibtex": "@inproceedings{\nschott2022visual,\ntitle={Visual Representation Learning Does Not Generalize Strongly Within the Same Domain},\nauthor={Lukas Schott and Julius Von K{\\\"u}gelgen and Frederik Tr{\\\"a}uble and Peter Vincent Gehler and Chris Russell and Matthias Bethge and Bernhard Sch{\\\"o}lkopf and Francesco Locatello and Wieland Brendel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9RUHPlladgh}\n}", "github": "", "project": "", "reviewers": "Yd4H;Cjpj;ZKCQ;g7qs", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "5;3;4;3", "correctness": "4;4;4;2", "technical_novelty": "3;4;2;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "69;86;154;35", "wc_summary_review": "89;36;42;57", "wc_main_review": "386;285;741;283", "wc_review": "544;407;937;375", "wc_reply_reviewers": "46;0;120;78", "wc_reply_authors": "679;420;401;579", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.0, 43.34166586553867 ], "wc_summary_review_avg": [ 56.0, 20.530465167647808 ], "wc_main_review_avg": [ 423.75, 187.83952592572203 ], "wc_review_avg": [ 565.75, 223.54348011069345 ], "wc_reply_reviewers_avg": [ 61.0, 43.9203825119955 ], "wc_reply_authors_avg": [ 519.75, 115.02472560280246 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=827943787586075996&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=9RUHPlladgh", "email": ";is.tuebingen.mpg.de;is.tuebingen.mpg.de;;amazon.com;uni-tuebingen.de;;amazon.com;uni-tuebingen.de", "author_num": 9, "aff_unique_index": "0;0;1;2;1;2", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Amazon;University of Tuebingen", "aff_unique_dep": ";Amazon.com, Inc.;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.amazon.com;https://www.uni-tuebingen.de/", "aff_unique_abbr": "MPI-IS;Amazon;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1;0", "aff_country_unique": "Germany;United States" }, { "title": "DARA: Dynamics-Aware Reward Augmentation in Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5934", "id": "9SDQB3b68K", "poster": "", "openreview": "https://openreview.net/forum?id=9SDQB3b68K", "slides": "https://iclr.cc/virtual/2022/poster/5934", "video": "https://iclr.cc/virtual/2022/poster/5934", "author_site": "Jinxin Liu, Hongyin Zhang, Donglin Wang", "tldr": "", "abstract": "Offline reinforcement learning algorithms promise to be applicable in settings where a fixed dataset is available and no new experience can be acquired. However, such formulation is inevitably offline-data-hungry and, in practice, collecting a large offline dataset for one specific task over one specific environment is also costly and laborious. 
In this paper, we thus 1) formulate the offline dynamics adaptation problem, using (source) offline data collected from another dynamics to relax the requirement for extensive (target) offline data, 2) characterize the dynamics shift problem, under which prior offline methods do not scale well, and 3) derive a simple dynamics-aware reward augmentation (DARA) framework for both model-free and model-based offline settings. Specifically, DARA emphasizes learning from those source transition pairs that are adaptive for the target environment and mitigates the offline dynamics shift by characterizing state-action-next-state pairs instead of the typical state-action distribution sketched by prior offline RL methods. The experimental evaluation demonstrates that DARA, by augmenting rewards in the source offline dataset, can acquire an adaptive policy for the target environment and yet significantly reduce the requirement of target offline data. With only modest amounts of target offline data, our method consistently outperforms prior offline RL methods in both simulated and real-world tasks. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/b6a590b91295ea32030d827354e895948b286ddc.zip", "author": "Jinxin Liu;Zhang Hongyin;Donglin Wang", "authorids": "~Jinxin_Liu1;~Zhang_Hongyin1;~Donglin_Wang1", "gender": ";M;M", "homepage": ";https://scholar.google.com/citations?hl=zh-CN&user=PXrMYi8AAAAJ&view_op=list_works&gmla=AJsN-F6UhafqIyL09MBmrkWxqB3SxOpNbHGfO8SBUz22mE8MMD8VxFY5Vn_AdAhmuLaKSTkaolVu14nZjyrbciEt6jfya1O4AmXcbjqdEtOLS3LdgYkXDX7u1rliVxOQrxg28Dmqga8Q_MaIzFPzAQ7IX26p0cABzA;https://milab.westlake.edu.cn/", "dblp": ";216/9018;", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.ca/citations?user=-fo6wdwAAAAJ", "orcid": ";;0000-0002-8188-3735", "linkedin": ";;", "or_profile": "~Jinxin_Liu1;~Zhang_Hongyin1;~Donglin_Wang1", "aff": ";Westlake University;Westlake University", "aff_domain": ";westlake.edu.cn;westlake.edu.cn", "position": ";PhD student;Associate Professor", "bibtex": "@inproceedings{\nliu2022dara,\ntitle={{DARA}: Dynamics-Aware Reward Augmentation in Offline Reinforcement Learning},\nauthor={Jinxin Liu and Zhang Hongyin and Donglin Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9SDQB3b68K}\n}", "github": "", "project": "", "reviewers": "73ni;i5hn;QyWv;wSxQ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;3;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "90;91;99;148", "wc_summary_review": "33;61;71;68", "wc_main_review": "234;306;303;366", "wc_review": "357;458;473;582", "wc_reply_reviewers": "88;0;29;38", "wc_reply_authors": "1101;382;368;526", "reply_reviewers": "1;0;1;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 107.0, 23.926972228010797 ], "wc_summary_review_avg": [ 58.25, 15.022899187573616 ], "wc_main_review_avg": [ 302.25, 46.73529180394619 ], "wc_review_avg": [ 467.5, 79.75117553992543 ], "wc_reply_reviewers_avg": [ 38.75, 31.712576369636068 ], "wc_reply_authors_avg": [ 594.25, 299.0371005410533 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 16, 0 
], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.20751433915982243, "corr_recommendation_correctness": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5218051358875104342&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=9SDQB3b68K", "email": ";westlake.edu.cn;westlake.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Westlake University", "aff_unique_dep": "", "aff_unique_url": "https://www.westlake.edu.cn", "aff_unique_abbr": "WU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "9Sf8fbue1br", "title": "Improving Mini-batch Optimal Transport via Partial Transportation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Mini-batch optimal transport (m-OT) has been widely used recently to deal with the memory issue of OT in large-scale applications. Despite their practicality, m-OT suffers from misspecified mappings, namely, mappings that are optimal on the mini-batch level but are partially wrong in the comparison with the optimal transportation plan between the original measures. To address the misspecified mappings issue, we propose a novel mini-batch method by using partial optimal transport (POT) between mini-batch empirical measures, which we refer to as mini-batch partial optimal transport (m-POT). Leveraging the insight from the partial transportation, we explain the source of misspecified mappings from the m-OT and motivate why limiting the amount of transported masses among mini-batches via POT can alleviate the incorrect mappings. Finally, we carry out extensive experiments on various applications to compare m-POT with m-OT and recently proposed mini-batch method, mini-batch unbalanced optimal transport (m-UOT). We observe that m-POT is better than m-OT in deep domain adaptation applications while having comparable performance with m-UOT. On other applications, such as deep generative model and color transfer, m-POT yields more favorable performance than m-OT while m-UOT is non-trivial to apply. 
", "keywords": "Deep Domain Adaptation;Deep Generative Models;Color Transfer;Optimal Transport", "primary_area": "", "supplementary_material": "/attachment/6e2c34a9dc8122f2b93ae19474c228ece0385096.zip", "author": "Khai Nguyen;Dang Nguyen;Tung Pham;Nhat Ho", "authorids": "~Khai_Nguyen1;~Dang_Nguyen2;~Tung_Pham1;~Nhat_Ho1", "gender": "M;M;M;M", "homepage": "https://khainb.com;https://hsgser.github.io/;;https://nhatptnk8912.github.io/", "dblp": "120/4308;;38/10862-1;203/4479", "google_scholar": "im5fNaQAAAAJ;https://scholar.google.co.jp/citations?user=WIqAtrcAAAAJ;KcUuEKsAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ", "orcid": ";;;", "linkedin": ";dang-nguyen-50b7a7a0/;;nhat-pham-minh-ho-267b8164/", "or_profile": "~Khai_Nguyen1;~Dang_Nguyen2;~Tung_Pham1;~Nhat_Ho1", "aff": "University of Texas, Austin;;VinAI Research;University of Texas, Austin", "aff_domain": "utexas.edu;;vinai.io;utexas.edu", "position": "PhD student;;Researcher;Assistant Professor", "bibtex": "@misc{\nnguyen2022improving,\ntitle={Improving Mini-batch Optimal Transport via Partial Transportation},\nauthor={Khai Nguyen and Dang Nguyen and Tung Pham and Nhat Ho},\nyear={2022},\nurl={https://openreview.net/forum?id=9Sf8fbue1br}\n}", "github": "", "project": "", "reviewers": "YS9Q;Gub4;XKw1", "site": "https://openreview.net/forum?id=9Sf8fbue1br", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "3;4;2", "correctness": "2;2;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "67;18;66", "wc_summary_review": "59;53;19", "wc_main_review": "647;972;36", "wc_review": "773;1043;121", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 50.333333333333336, 22.866763848189994 ], "wc_summary_review_avg": [ 43.666666666666664, 17.613126418163876 ], "wc_main_review_avg": [ 551.6666666666666, 388.0209043624096 ], "wc_review_avg": [ 645.6666666666666, 387.0239728434862 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 1.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13182138160286154979&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Texas at Austin;VinAI Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.vinai.io/", "aff_unique_abbr": "UT Austin;VinAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Vietnam" }, { "id": "9TdCcMlmsLm", "title": "Text Generation with Efficient (Soft) $Q$-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Maximum likelihood estimation (MLE) is the predominant algorithm for training text generation models. This paradigm relies on direct supervision examples, which is not applicable to many emerging applications, such as generating adversarial attacks or generating prompts to control language models. 
Reinforcement learning (RL) on the other hand offers a more flexible solution by allowing users to plug in arbitrary task metrics as reward. Yet previous RL algorithms for text generation, such as policy gradient (on-policy RL) and Q-learning (off-policy RL), are often notoriously inefficient or unstable to train due to the large sequence space and the sparse reward received only at the end of sequences. In this paper, we introduce a new RL formulation for text generation from the soft Q-learning (SQL) perspective. It enables us to draw from the latest RL advances, such as path consistency learning, to combine the best of on-/off-policy updates, and learn effectively from sparse reward. We apply the approach to a wide range of text generation tasks, including learning from noisy/negative examples, adversarial attacks, and prompt generation. Experiments show our approach consistently outperforms both task-specialized algorithms and the previous RL methods.", "keywords": "text generation;reinforcement learning for text generation", "primary_area": "", "supplementary_material": "/attachment/bb0ee2e3cc04bc74b308a80ab21ac490df96c5d5.zip", "author": "Han Guo;Bowen Tan;Zhengzhong Liu;Eric Xing;Zhiting Hu", "authorids": "~Han_Guo1;~Bowen_Tan2;~Zhengzhong_Liu1;~Eric_Xing1;~Zhiting_Hu3", "gender": ";M;M;M;M", "homepage": ";https://bowentan.me;https://hunterhector.github.io/;http://www.cs.cmu.edu/~epxing/;http://zhiting.ucsd.edu", "dblp": ";;166/0352;36/3855;134/4031", "google_scholar": ";;S9E-hMwAAAAJ;https://scholar.google.com.tw/citations?user=5pKTRxEAAAAJ;N7_xhHoAAAAJ", "orcid": ";;;;", "linkedin": ";;hunterhector/;;", "or_profile": "~Han_Guo1;~Bowen_Tan2;~Zhengzhong_Liu1;~Eric_Xing1;~Zhiting_Hu3", "aff": ";Carnegie Mellon University;;School of Computer Science, Carnegie Mellon University;Amazon", "aff_domain": ";cmu.edu;;cs.cmu.edu;amazon.com", "position": ";PhD student;;Full Professor;Researcher", "bibtex": "@misc{\nguo2022text,\ntitle={Text Generation with Efficient (Soft) \\$Q\\$-Learning},\nauthor={Han Guo and Bowen Tan and Zhengzhong Liu and Eric Xing and Zhiting Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=9TdCcMlmsLm}\n}", "github": "", "project": "", "reviewers": "2uap;FPpH;jb7t", "site": "https://openreview.net/forum?id=9TdCcMlmsLm", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;4", "correctness": "3;3;4", "technical_novelty": "2;1;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "31;28;146", "wc_summary_review": "58;79;48", "wc_main_review": "474;283;288", "wc_review": "563;390;482", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "484;483;409", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 68.33333333333333, 54.932281543328926 ], "wc_summary_review_avg": [ 61.666666666666664, 12.918548250050733 ], "wc_main_review_avg": [ 348.3333333333333, 88.88319426203259 ], "wc_review_avg": [ 478.3333333333333, 70.67452786463373 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 458.6666666666667, 35.122009560324926 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.4999999999999999, 
"corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7547615031220290523&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1", "aff_unique_norm": "Carnegie Mellon University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.cmu.edu;https://www.amazon.com", "aff_unique_abbr": "CMU;Amazon", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "9Vimsa_gGG5", "title": "Initializing ReLU networks in an expressive subspace of weights", "track": "main", "status": "Reject", "tldr": "", "abstract": "Using a mean-field theory of signal propagation, we analyze the evolution of correlations between two signals propagating forward through a deep ReLU network with correlated weights. Signals become highly correlated in deep ReLU networks with uncorrelated weights. We show that ReLU networks with anti-correlated weights can avoid this fate and have a chaotic phase where the signal correlations saturate below unity. Consistent with this analysis, we find that networks initialized with anti-correlated weights can train faster by taking advantage of the increased expressivity in the chaotic phase. An initialization scheme combining this with a previously proposed strategy of using an asymmetric initialization to reduce dead node probability shows consistently lower training times compared to various other initializations on synthetic and real-world datasets. Our study suggests that use of initial distributions with correlations in them can help in reducing training time.", "keywords": "Signal propagation;deep ReLU networks;mean-field theory;improved initialization", "primary_area": "", "supplementary_material": "", "author": "Dayal Singh;G J Sreejith", "authorids": "~Dayal_Singh2;~G_J_Sreejith1", "gender": ";M", "homepage": ";http://www.iiserpune.ac.in/~sreejith/index.html", "dblp": ";", "google_scholar": ";G4n2WsAAAAAJ", "orcid": ";0000-0002-2068-1670", "linkedin": ";sreejith-g-j-98b496117", "or_profile": "~Dayal_Singh2;~G_J_Sreejith1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsingh2022initializing,\ntitle={Initializing Re{LU} networks in an expressive subspace of weights},\nauthor={Dayal Singh and G J Sreejith},\nyear={2022},\nurl={https://openreview.net/forum?id=9Vimsa_gGG5}\n}", "github": "", "project": "", "reviewers": "kn8X;bmuJ;QqGU;NAFJ", "site": "https://openreview.net/forum?id=9Vimsa_gGG5", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "3;3;3;4", "correctness": "3;2;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "82;85;28;72", "wc_summary_review": "46;39;23;78", "wc_main_review": "313;129;251;281", "wc_review": "441;253;302;431", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.75, 22.884219453588535 ], "wc_summary_review_avg": [ 46.5, 20.006249023742555 ], "wc_main_review_avg": [ 243.5, 69.64732586395547 ], "wc_review_avg": [ 356.75, 81.19844518215851 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], 
"reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.42640143271122083, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3283625068625201289&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Multitask Prompted Training Enables Zero-Shot Task Generalization", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7101", "id": "9Vrb9D0WI4", "poster": "", "openreview": "https://openreview.net/forum?id=9Vrb9D0WI4", "slides": "https://iclr.cc/virtual/2022/poster/7101", "video": "https://iclr.cc/virtual/2022/poster/7101", "author_site": "Victor Sanh, Albert Webson, Colin Raffel, Stephen Bach, Lintang Sutawika, Zaid Alyafeai, Antoine Chaffin, Arnaud Stiegler, Arun Raja, Manan Dey, M Saiful Bari, Canwen Xu, Urmish Thakker, Shanya Sharma, Eliza Szczechla, Taewoon Kim, Gunjan Chhablani, Nihal Nayak, Debajyoti Datta, Jonathan Chang, Mike Tian-Jian Jiang, Han Wang, Matteo Manica, Sheng Shen, Zheng Xin Yong, Harshit Pandey, Rachel Bawden, Thomas Wang, Trishala Neeraj, Jos Rozen, Abheesht Sharma, Andrea Santilli, Thibault Fevry, Jason Fries, Ryan Teehan, Teven Le Scao, Stella R Biderman, Leo Gao, Thomas Wolf, Alexander M Rush", "tldr": "", "abstract": "Large language models have recently been shown to attain reasonable zero-shot generalization on a diverse set of tasks (Brown et al., 2020). It has been hypothesized that this is a consequence of implicit multitask learning in language models\u2019 pretraining (Radford et al., 2019). Can zero-shot generalization instead be directly induced by explicit multitask learning? To test this question at scale, we develop a system for easily mapping any natural language tasks into a human-readable prompted form. We convert a large set of supervised datasets, each with multiple prompts with diverse wording. These prompted datasets allow for benchmarking the ability of a model to perform completely unseen tasks specified in natural language. We fine-tune a pretrained encoder-decoder model (Raffel et al., 2020; Lester et al., 2021) on this multitask mixture covering a wide variety of tasks. The model attains strong zero-shot performance on several datasets, often outperforming models 16\u00d7 its size. Further, our model attains strong performance on a subset of tasks from the BIG-Bench benchmark, outperforming models 6\u00d7 its size. 
All trained models are available at https://github.com/bigscience-workshop/t-zero, and all prompts are available at https://github.com/bigscience-workshop/promptsource.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Victor Sanh;Albert Webson;Colin Raffel;Stephen Bach;Lintang Sutawika;Zaid Alyafeai;Antoine Chaffin;Arnaud Stiegler;Arun Raja;Manan Dey;M Saiful Bari;Canwen Xu;Urmish Thakker;Shanya Sharma Sharma;Eliza Szczechla;Taewoon Kim;Gunjan Chhablani;Nihal Nayak;Debajyoti Datta;Jonathan Chang;Mike Tian-Jian Jiang;Han Wang;Matteo Manica;Sheng Shen;Zheng Xin Yong;Harshit Pandey;Rachel Bawden;Thomas Wang;Trishala Neeraj;Jos Rozen;Abheesht Sharma;Andrea Santilli;Thibault Fevry;Jason Alan Fries;Ryan Teehan;Teven Le Scao;Stella Biderman;Leo Gao;Thomas Wolf;Alexander M Rush", "authorids": "~Victor_Sanh1;~Albert_Webson1;~Colin_Raffel1;~Stephen_Bach1;~Lintang_Sutawika1;~Zaid_Alyafeai1;~Antoine_Chaffin1;~Arnaud_Stiegler1;~Arun_Raja1;~Manan_Dey3;~M_Saiful_Bari2;~Canwen_Xu1;~Urmish_Thakker1;~Shanya_Sharma_Sharma1;~Eliza_Szczechla1;~Taewoon_Kim1;~Gunjan_Chhablani1;~Nihal_Nayak1;~Debajyoti_Datta1;~Jonathan_Chang2;~Mike_Tian-Jian_Jiang1;~Han_Wang9;~Matteo_Manica1;~Sheng_Shen2;~Zheng_Xin_Yong1;~Harshit_Pandey1;~Rachel_Bawden1;~Thomas_Wang1;~Trishala_Neeraj1;~Jos_Rozen1;~Abheesht_Sharma1;~Andrea_Santilli1;~Thibault_Fevry1;~Jason_Alan_Fries1;~Ryan_Teehan1;~Teven_Le_Scao1;~Stella_Biderman1;~Leo_Gao1;~Thomas_Wolf1;~Alexander_M_Rush1", "gender": ";;;M;M;;M;M;;M;;;M;F;;;M;;;;;M;M;M;M;M;F;;;M;M;M;M;M;M;;F;;M;M", "homepage": ";https://representations.ai;http://colinraffel.com;http://stephenbach.net;https://lintang.sutawika.com;https://zaidalyafeai.github.io/;https://antoine.chaffin.fr;;;https://www.linkedin.com/in/manandey;;;https://urmish.github.io/;https://www.shanyasharma.com/;https://github.com/elsanns;https://taewoon.kim/;https://gchhablani.github.io;;;;;https://hannight.github.io/;https://ibm.biz/matteomanica;https://sincerass.github.io;https://yongzx.github.io;https://harsh4799.github.io/;https://rbawden.github.io;;https://trishalaneeraj.github.io/;https://europe.naverlabs.com/people_user/jos-rozen/;;https://santilli.xyz/;http://thibaultfevry.com/;https://web.stanford.edu/~jfries/;https://rteehas.github.io/;;http://www.stellabiderman.com;https://leogao.dev;https://thomwolf.io;http://rush.seas.harvard.edu/", "dblp": "230/4101;276/1456;149/0082;90/1077;304/3270.html;206/0089;298/9225.html;;;;;;239/4246;https://dblp.uni-trier.de/pers/hd/s/Sharma:Shanya;;00/3896-2;286/5210.html;;147/8345;;;;194/3100;138/5764-1.html;266/0855;;146/4432;;;264/0099;;179/7048;;182/2122;304/2042;;239/5641;279/3125;;http://dblp.uni-trier.de/pers/hd/r/Rush:Alexander_M=", "google_scholar": "6STg_7IAAAAJ;3OQplr0AAAAJ;I66ZBYwAAAAJ;hs6pGXoAAAAJ;pVgdC6wAAAAJ;vb6w6j4AAAAJ;GQ_tVhwAAAAJ;jBbxaSsAAAAJ;https://scholar.google.com/citations?view_op=list_works;;;;-GPPICQAAAAJ;ypCFnQ8AAAAJ;;dJ4ksGoAAAAJ;IgPFpZYAAAAJ;;L6lx408AAAAJ;;;xA8AYqkAAAAJ;-20KQZQAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;https://scholar.google.co.uk/citations?user=G3DS5GAAAAAJ;;X2lSWUQAAAAJ;OxEDKogAAAAJ;gSLkLTMAAAAJ;j2Y_XBIAAAAJ;QjeNAUEAAAAJ;wywWmwoAAAAJ;;;bO7H0DAAAAAJ;r6mBY50AAAAJ;D2H5EFEAAAAJ;LIjnUGgAAAAJ", "orcid": 
";;;0000-0003-3857-3560;;0009-0003-1774-5236;0000-0003-3605-4097;;;;;;;;;0000-0003-2892-0194;0000-0001-8445-9051;;;;0000-0002-2496-2699;;0000-0002-8872-0269;;;;0000-0001-9553-1768;;;0000-0001-5707-2134;;;;0000-0001-9316-5768;0000-0002-1426-6964;0000-0002-7052-3048;0000-0001-8228-1042;;;0000-0002-9900-1606", "linkedin": "victor-sanh/;;;;;zaid-alyafeai-a8801a278/;antoine-chaffin/;arnaudstiegler/;;;;;urmishthakker/;shanya-sharma-thechange/;;tae898/;gchhablani;;;;;;matteo-manica-drugilsberg/;sheng-s-ab198a174/;;;;thomas-w-394479109/;trishalaneeraj/;jos-rozen-7b649743/;https://in.linkedin.com/in/abheesht-sharma-567303156;andreasantilli/;;jason-fries/;;;stellabiderman;;;sasha-rush-a69b6917/", "or_profile": "~Victor_Sanh1;~Albert_Webson1;~Colin_Raffel1;~Stephen_Bach1;~Lintang_Sutawika1;~Zaid_Alyafeai1;~Antoine_Chaffin1;~Arnaud_Stiegler1;~Arun_Raja1;~Manan_Dey3;~M_Saiful_Bari2;~Canwen_Xu1;~Urmish_Thakker1;~Shanya_Sharma_Sharma1;~Eliza_Szczechla1;~Taewoon_Kim1;~Gunjan_Chhablani1;~Nihal_Nayak1;~Debajyoti_Datta1;~Jonathan_Chang2;~Mike_Tian-Jian_Jiang1;~Han_Wang9;~Matteo_Manica1;~Sheng_Shen2;~Zheng_Xin_Yong1;~Harshit_Pandey1;~Rachel_Bawden1;~Thomas_Wang1;~Trishala_Neeraj1;~Jos_Rozen1;~Abheesht_Sharma1;~Andrea_Santilli1;~Thibault_Fevry1;~Jason_Alan_Fries1;~Ryan_Teehan1;~Teven_Le_Scao1;~Stella_Biderman1;~Leo_Gao1;~Thomas_Wolf1;~Alexander_M_Rush1", "aff": "Hugging Face;Brown University;University of North Carolina, Chapel Hill;Snorkel AI;Datasaur Inc;KFUPM;IRISA;;Institute for Infocomm Research, A*STAR;;;;SambaNova Systems;Walmart Labs;Scott Tiger;Vrije Universiteit Amsterdam;Georgia Institute of Technology;;University of Virginia;;;New York University;International Business Machines;University of California, Berkeley;Brown University;Northeastern University;Inria;Hugging Face;;Naver Labs Europe;BITS Pilani;Sapienza University of Rome;;Stanford University;New York University;Hugging Face;Georgia Institute of Technology;OpenAI;Hugging Face;School of Engineering and Applied Sciences, Harvard University", "aff_domain": "huggingface.co;brown.edu;unc.edu;snorkel.ai;datasaur.ai;kfupm.edu.sa;imatag.com;;i2r.a-star.edu.sg;;;;sambanova.com;walmartlabs.com;tiger.com.pl;vu.nl;gatech.edu;;virginia.edu;;;nyu.edu;ibm.com;berkeley.edu;brown.edu;neu.edu;inria.fr;huggingface.co;;naverlabs.com;bits-pilani.ac.in;uniroma1.it;;stanford.edu;nyu.edu;huggingface.co;gatech.edu;openai.com;huggingface.co;seas.harvard.edu", "position": "Researcher;PhD student;Assistant Professor;Researcher;Researcher;PhD student;PhD student;;Researcher;;;;Principal Engineer;SDE-3;Data Scientist;PhD student;MS student;;PhD student;;;MS student;Research Scientist;PhD student;PhD student;MS student;Researcher;Researcher;;Researcher;MS student;PhD student;;Research Scientist;PhD student;Researcher;MS student;Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nsanh2022multitask,\ntitle={Multitask Prompted Training Enables Zero-Shot Task Generalization},\nauthor={Victor Sanh and Albert Webson and Colin Raffel and Stephen Bach and Lintang Sutawika and Zaid Alyafeai and Antoine Chaffin and Arnaud Stiegler and Arun Raja and Manan Dey and M Saiful Bari and Canwen Xu and Urmish Thakker and Shanya Sharma Sharma and Eliza Szczechla and Taewoon Kim and Gunjan Chhablani and Nihal Nayak and Debajyoti Datta and Jonathan Chang and Mike Tian-Jian Jiang and Han Wang and Matteo Manica and Sheng Shen and Zheng Xin Yong and Harshit Pandey and Rachel Bawden and Thomas Wang and Trishala Neeraj and Jos Rozen and Abheesht Sharma and Andrea Santilli and 
Thibault Fevry and Jason Alan Fries and Ryan Teehan and Teven Le Scao and Stella Biderman and Leo Gao and Thomas Wolf and Alexander M Rush},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9Vrb9D0WI4}\n}", "github": "", "project": "", "reviewers": "qyUt;Ju6C;n8Ae;zBjM", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "5;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "1;3;3;2", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "166;22;153;66", "wc_summary_review": "221;90;80;41", "wc_main_review": "753;417;701;263", "wc_review": "1140;529;934;370", "wc_reply_reviewers": "153;48;18;0", "wc_reply_authors": "2831;426;631;294", "reply_reviewers": "1;1;1;0", "reply_authors": "5;1;1;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 101.75, 59.98489393172251 ], "wc_summary_review_avg": [ 108.0, 67.76060802560733 ], "wc_main_review_avg": [ 533.5, 201.85328830613585 ], "wc_review_avg": [ 743.25, 307.8208691755645 ], "wc_reply_reviewers_avg": [ 54.75, 59.259492910419006 ], "wc_reply_authors_avg": [ 1045.5, 1037.8286226540488 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 40, 0 ], "corr_recommendation_confidence": -0.9169493006161777, "corr_recommendation_correctness": 0.49374193110101877, "gs_citation": 1949, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1498498644397448654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "pdf": "https://openreview.net/pdf?id=9Vrb9D0WI4", "email": "huggingface.co;brown.edu;unc.edu;snorkel.ai;datasaur.ai;kfupm.edu.sa;imatag.com;;i2r.a-star.edu.sg;;;;sambanova.com;walmartlabs.com;tiger.com.pl;vu.nl;gatech.edu;;virginia.edu;;;nyu.edu;ibm.com;berkeley.edu;brown.edu;neu.edu;inria.fr;huggingface.co;;naverlabs.com;bits-pilani.ac.in;uniroma1.it;;stanford.edu;nyu.edu;huggingface.co;gatech.edu;openai.com;huggingface.co;seas.harvard.edu", "author_num": 40, "aff_unique_index": "0;1;2;3;4;5;6;7;8;9;11;12;13;14;15;16;1;17;18;0;19;20;21;22;14;0;12;23;0;24", "aff_unique_norm": "Hugging Face;Brown University;University of North Carolina;Snorkel AI;Datasaur Inc;King Fahd University of Petroleum & Minerals;Institut de Recherche en Informatique et Automatique;Institute for Infocomm Research;SambaNova Systems;Walmart;;Vrije Universiteit Amsterdam;Georgia Institute of Technology;University of Virginia;New York University;International Business Machines Corporation;University of California, Berkeley;Northeastern University;INRIA;NAVER LABS;Birla Institute of Technology and Science, Pilani;Sapienza University of Rome;Stanford University;OpenAI;Harvard University", "aff_unique_dep": ";;;;;;;;;Walmart Labs;;;;;;;;;;;;;;;School of Engineering and Applied Sciences", "aff_unique_url": 
"https://huggingface.co;https://www.brown.edu;https://www.unc.edu;https://www.snorkelai.com;https://www.datasaur.io;https://www.kfupm.edu.sa;https://www.irisa.fr;https://www.i2r.a-star.edu.sg;https://www.sambanova.com;https://www.walmart.com;;https://www.vu.nl;https://www.gatech.edu;https://www.virginia.edu;https://www.nyu.edu;https://www.ibm.com;https://www.berkeley.edu;https://www.northeastern.edu;https://www.inria.fr;https://labs.naver.com;https://www.bits-pilani.ac.in;https://www.uniroma1.it;https://www.stanford.edu;https://openai.com;https://www.harvard.edu", "aff_unique_abbr": "Hugging Face;Brown;UNC;Snorkel AI;Datasaur;KFUPM;IRISA;I2R;;Walmart Labs;;VU Amsterdam;Georgia Tech;UVA;NYU;IBM;UC Berkeley;NEU;Inria;NLE;BITS Pilani;Sapienza;Stanford;OpenAI;Harvard", "aff_campus_unique_index": "1;2;3;4;5;6", "aff_campus_unique": ";Chapel Hill;Berkeley;Pilani;Rome;Stanford;Cambridge", "aff_country_unique_index": "0;0;0;0;0;1;2;3;0;0;5;0;0;0;0;0;0;0;2;0;6;7;8;0;0;0;0;0;0;0", "aff_country_unique": "United States;Saudi Arabia;France;Singapore;;Netherlands;Unknown;India;Italy" }, { "id": "9W2KnHqm_xN", "title": "Successive POI Recommendation via Brain-inspired Spatiotemporal Aware Representation", "track": "main", "status": "Reject", "tldr": "", "abstract": "POI vector representation (embedding) is the core of successive POI recommendation. However, existing approaches only rely on basic discretization and interval analyses and fail to fully exploit complicated spatiotemporal attributes of POIs. Neuroscience research has shown that the mammalian brain entorhinal-hippocampal system provides efficient graph representations for general knowledge. Moreover, entorhinal grid cells present concise spatial representations, while hippocampal place cells represent perception conjunctions effectively. Thus, the entorhinal-hippocampal system provides a novel angle for spatiotemporal aware representation, which inspires us to propose the SpatioTemporal aware Embedding framework (STE) and apply to POIs (STEP). STEP considers two types of POI-specific representations: sequential representation and spatiotemporal conjunctive representation, learned using sparse unlabeled data based on the proposed graph-building policies. Notably, the spatiotemporal conjunctive representation represents POIs from spatial and temporal aspects jointly and precisely. Furthermore, we introduce a user privacy secure successive POI recommendation method using STEP. Experimental results on two datasets demonstrate that STEP captures POI-specific spatiotemporal information more accurately and achieves the state-of-the-art successive POI recommendation performance. 
Therefore, this work provides a novel solution to spatiotemporal aware representation and paves the way for spatiotemporal modeling-related tasks.", "keywords": "Neuroscience;spatiotemporal aware modeling;successive POI recommendation", "primary_area": "", "supplementary_material": "", "author": "Gehua Ma;Jingyuan Zhao;Huajin Tang", "authorids": "~Gehua_Ma1;~Jingyuan_Zhao1;~Huajin_Tang1", "gender": "M;F;M", "homepage": "https://genema.github.io;;https://person.zju.edu.cn/htang", "dblp": "https://dblp.uni-trier.de/pid/348/6861;;18/434", "google_scholar": ";;U041O4QAAAAJ", "orcid": ";;", "linkedin": ";jingyuan-zhao-phd-24347853/;", "or_profile": "~Gehua_Ma1;~Jingyuan_Zhao1;~Huajin_Tang1", "aff": "Zhejiang University;Capgemini;Zhejiang University", "aff_domain": "zju.edu.cn;capgemini.com;zju.edu.cn", "position": "PhD student;VP, AI & Analytics APAC;Full Professor", "bibtex": "@misc{\nma2022successive,\ntitle={Successive {POI} Recommendation via Brain-inspired Spatiotemporal Aware Representation},\nauthor={Gehua Ma and Jingyuan Zhao and Huajin Tang},\nyear={2022},\nurl={https://openreview.net/forum?id=9W2KnHqm_xN}\n}", "github": "", "project": "", "reviewers": "a1wE;qsPw;sU9T;F7HP", "site": "https://openreview.net/forum?id=9W2KnHqm_xN", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;5;4", "correctness": "3;3;2;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "36;12;57;38", "wc_summary_review": "14;14;73;19", "wc_main_review": "246;59;477;206", "wc_review": "296;85;607;263", "wc_reply_reviewers": "0;0;330;0", "wc_reply_authors": "247;208;761;256", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 35.75, 15.974589196595948 ], "wc_summary_review_avg": [ 30.0, 24.9098374141623 ], "wc_main_review_avg": [ 247.0, 149.9383206521935 ], "wc_review_avg": [ 312.75, 187.8887636342312 ], "wc_reply_reviewers_avg": [ 82.5, 142.89419162443238 ], "wc_reply_authors_avg": [ 368.0, 227.614806196785 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=952973751240232324&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Zhejiang University;Capgemini", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.capgemini.com", "aff_unique_abbr": "ZJU;Capgemini", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;France" }, { "id": "9WJ-fT_92Hp", "title": "Efficient Reinforcement Learning Experimentation in PyTorch", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep reinforcement learning (RL) has proved successful at solving challenging environments but often requires long training times and a large number of samples. Furthermore, advancing artificial intelligence requires the ability to easily prototype new methods while avoiding impractically slow experimental turnaround times.
To this end, we present a PyTorch-based library for RL with a modular design that allows composing agents based on three component types: actors, storages and algorithms. Additionally, synchronous and asynchronous architectures can be defined flexibly and independently of these components. We present several standard use-cases of the library and showcase its potential by obtaining the highest test performance to date on the Obstacle Tower Unity3D challenge environment. In summary, we believe that this work helps accelerate the experimentation of new ideas, simplifying research and enabling researchers to tackle more challenging RL problems.\n", "keywords": "Deep Reinforcement Learning;Python library", "primary_area": "", "supplementary_material": "/attachment/7d869b3841f75a95f2a77929378b1d13ec90f76b.zip", "author": "Albert Bou;Sebastian Dittert;Gianni De Fabritiis", "authorids": "~Albert_Bou1;sebastian.dittert@upf.edu;~Gianni_De_Fabritiis1", "gender": "M;;M", "homepage": "https://www.linkedin.com/in/albert-bou-873b8b98/;;https://www.compscience.org", "dblp": ";;29/605", "google_scholar": ";;-_kX4kMAAAAJ", "orcid": ";;", "linkedin": ";;gdefabritiis/", "or_profile": "~Albert_Bou1;sebastian.dittert@upf.edu;~Gianni_De_Fabritiis1", "aff": "Universitat Pompeu Fabra;;Universitat Pompeu Fabra", "aff_domain": "upf.edu;;upf.edu", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nbou2022efficient,\ntitle={Efficient Reinforcement Learning Experimentation in PyTorch},\nauthor={Albert Bou and Sebastian Dittert and Gianni De Fabritiis},\nyear={2022},\nurl={https://openreview.net/forum?id=9WJ-fT_92Hp}\n}", "github": "", "project": "", "reviewers": "wBHe;pHwp;KMwD;ozqt;gKp1", "site": "https://openreview.net/forum?id=9WJ-fT_92Hp", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;4;4;4;4", "correctness": "3;2;3;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "266;31;113;51;80", "wc_summary_review": "84;62;71;24;123", "wc_main_review": "589;307;224;188;1107", "wc_review": "939;400;408;263;1310", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 108.2, 83.60717672544625 ], "wc_summary_review_avg": [ 72.8, 32.083640691168455 ], "wc_main_review_avg": [ 483.0, 342.25546014636495 ], "wc_review_avg": [ 664.0, 397.2339361132178 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5833333333333335, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5IjfHm5NU5AJ:scholar.google.com/&scioq=Efficient+Reinforcement+Learning+Experimentation+in+PyTorch&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Universitat Pompeu Fabra", "aff_unique_dep": "", "aff_unique_url": "https://www.upf.edu/", "aff_unique_abbr": "UPF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Spain" }, { "title": "SGD Can Converge to Local Maxima", "status": "Spotlight", "track": "main", "site":
"https://iclr.cc/virtual/2022/poster/6576", "id": "9XhPLAjjRB", "poster": "", "openreview": "https://openreview.net/forum?id=9XhPLAjjRB", "slides": "https://iclr.cc/virtual/2022/poster/6576", "video": "https://iclr.cc/virtual/2022/poster/6576", "author_site": "Liu Ziyin, Botao Li, James Simon, Masahito Ueda", "tldr": "", "abstract": "Previous works on stochastic gradient descent (SGD) often focus on its success. In this work, we construct worst-case optimization problems illustrating that, when not in the regimes that the previous works often assume, SGD can exhibit many strange and potentially undesirable behaviors. Specifically, we construct landscapes and data distributions such that (1) SGD converges to local maxima, (2) SGD escapes saddle points arbitrarily slowly, (3) SGD prefers sharp minima over flat ones, and (4) AMSGrad converges to local maxima. We also realize results in a minimal neural network-like example. Our results highlight the importance of simultaneously analyzing the minibatch sampling, discrete-time updates rules, and realistic landscapes to understand the role of SGD in deep learning.", "keywords": "stochastic gradient descent;saddle points;convergence;amsgrad;deep learning", "primary_area": "", "supplementary_material": "", "author": "Liu Ziyin;Botao Li;James B Simon;Masahito Ueda", "authorids": "~Liu_Ziyin1;botao.li95@gmail.com;~James_B_Simon1;~Masahito_Ueda1", "gender": ";;M;M", "homepage": "https://www.mit.edu/~ziyinl/;;https://james-simon.github.io/;http://cat.phys.s.u-tokyo.ac.jp/index-e.html", "dblp": ";;294/5406;", "google_scholar": "NpN9oRMAAAAJ;;zjGfh3sAAAAJ;https://scholar.google.co.jp/citations?user=Xpjx9CwAAAAJ", "orcid": ";;;0000-0002-5367-1436", "linkedin": ";;;", "or_profile": "~Liu_Ziyin1;botao.li95@gmail.com;~James_B_Simon1;~Masahito_Ueda1", "aff": "The University of Tokyo;;University of California, Berkeley;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;;berkeley.edu;u-tokyo.ac.jp", "position": "PhD student;;PhD student;Full Professor", "bibtex": "@inproceedings{\nziyin2022sgd,\ntitle={{SGD} Can Converge to Local Maxima},\nauthor={Liu Ziyin and Botao Li and James B Simon and Masahito Ueda},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9XhPLAjjRB}\n}", "github": "", "project": "", "reviewers": "MX9i;DCLz;bMKE;k2vd;Jqcm", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "4;5;3;4;4", "correctness": "4;3;3;3;4", "technical_novelty": "2;4;4;3;4", "empirical_novelty": "2;2;2;2;4", "wc_summary_paper": "98;248;75;45;215", "wc_summary_review": "4;147;93;30;29", "wc_main_review": "286;1777;500;174;204", "wc_review": "388;2172;668;249;448", "wc_reply_reviewers": "0;2365;20;0;0", "wc_reply_authors": "599;5039;690;98;232", "reply_reviewers": "0;6;1;0;0", "reply_authors": "1;9;1;1;1", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 3.4, 0.8 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 136.2, 80.2879816660003 ], "wc_summary_review_avg": [ 60.6, 52.25552602356999 ], "wc_main_review_avg": [ 588.2, 605.2306667709428 ], "wc_review_avg": [ 785.0, 706.5312448858862 ], "wc_reply_reviewers_avg": [ 477.0, 944.0317791261054 ], "wc_reply_authors_avg": [ 1331.6, 1866.764966459356 ], "reply_reviewers_avg": [ 1.4, 2.33238075793812 ], "reply_authors_avg": [ 2.6, 3.2000000000000006 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 4, 0 ], 
"corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": -0.16666666666666663, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=9XhPLAjjRB", "email": "u-tokyo.ac.jp;;berkeley.edu;u-tokyo.ac.jp", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Tokyo;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.berkeley.edu", "aff_unique_abbr": "UTokyo;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Japan;United States" }, { "title": "miniF2F: a cross-system benchmark for formal Olympiad-level mathematics", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6258", "id": "9ZPegFuFTFv", "poster": "", "openreview": "https://openreview.net/forum?id=9ZPegFuFTFv", "slides": "https://iclr.cc/virtual/2022/poster/6258", "video": "https://iclr.cc/virtual/2022/poster/6258", "author_site": "Kunhao Zheng, Jesse Han, Stanislas Polu", "tldr": "", "abstract": "We present $\\textsf{miniF2F}$, a dataset of formal Olympiad-level mathematics problems statements intended to provide a unified cross-system benchmark for neural theorem proving. The $\\textsf{miniF2F}$ benchmark currently targets Metamath, Lean, Isabelle (partially) and HOL Light (partially) and consists of 488 problem statements drawn from the AIME, AMC, and the International Mathematical Olympiad (IMO), as well as material from high-school and undergraduate mathematics courses. We report baseline results using GPT-f, a neural theorem prover based on GPT-3 and provide an analysis of its performance. We intend for $\\textsf{miniF2F}$ to be a community-driven effort and hope that our benchmark will help spur advances in neural theorem proving.", "keywords": "Neural theorem proving;Benchmark dataset", "primary_area": "", "supplementary_material": "", "author": "Kunhao Zheng;Jesse Michael Han;Stanislas Polu", "authorids": "~Kunhao_Zheng1;~Jesse_Michael_Han1;~Stanislas_Polu1", "gender": "M;M;M", "homepage": "https://dyekuu.github.io/;https://jesse-michael-han.github.io;", "dblp": "301/7847;;", "google_scholar": "zDy4jSYAAAAJ;;", "orcid": "0000-0003-1548-1890;;", "linkedin": "kunhao-zheng-x18/;;", "or_profile": "~Kunhao_Zheng1;~Jesse_Michael_Han1;~Stanislas_Polu1", "aff": "Ecole polytechnique;University of Pittsburgh;OpenAI", "aff_domain": "polytechnique.edu;pitt.edu;openai.com", "position": "MS student;PhD student;Research Engineer", "bibtex": "@inproceedings{\nzheng2022miniff,\ntitle={miniF2F: a cross-system benchmark for formal Olympiad-level mathematics},\nauthor={Kunhao Zheng and Jesse Michael Han and Stanislas Polu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9ZPegFuFTFv}\n}", "github": "", "project": "", "reviewers": "Jr4E;jbLg;wdpj;uyH2", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;3;4;5", "correctness": "4;4;4;4", "technical_novelty": "2;1;3;4", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "30;109;46;47", "wc_summary_review": "50;28;26;45", "wc_main_review": "341;129;196;172", "wc_review": "421;266;268;264", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "790;92;365;407", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 
4.0, 0.0 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 58.0, 30.20761493398643 ], "wc_summary_review_avg": [ 37.25, 10.425329730996522 ], "wc_main_review_avg": [ 209.5, 79.62568681022475 ], "wc_review_avg": [ 304.75, 67.13186650168458 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 413.5, 248.75138190570922 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8703882797784892, "corr_recommendation_correctness": 0.0, "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11007110813493819221&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=9ZPegFuFTFv", "email": "polytechnique.edu;pitt.edu;openai.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Ecole Polytechnique;University of Pittsburgh;OpenAI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polytechnique.edu;https://www.pitt.edu;https://openai.com", "aff_unique_abbr": "X;Pitt;OpenAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "France;United States" }, { "id": "9dn7CjyTFoS", "title": "One Thing to Fool them All: Generating Interpretable, Universal, and Physically-Realizable Adversarial Features", "track": "main", "status": "Reject", "tldr": "", "abstract": "It is well understood that modern deep networks are vulnerable to adversarial attacks. However, conventional methods fail to produce adversarial perturbations that are intelligible to humans, and they pose limited threats in the physical world. To study feature-class associations in networks and better understand the real-world threats they face, we develop feature-level adversarial perturbations using deep image generators and a novel optimization objective. We term these feature-fool attacks. We show that they are versatile and use them to generate targeted feature-level attacks at the ImageNet scale that are simultaneously interpretable, universal to any source image, and physically-realizable. 
These attacks can also reveal spurious, semantically-describable feature/class associations, and we use them to guide the design of ``copy/paste'' adversaries in which one natural image is pasted into another to cause a targeted misclassification.", "keywords": "adversaries;interpretability;generative modeling", "primary_area": "", "supplementary_material": "/attachment/f5c04d600b5614f98e14dfdf9daff8fca4bb710b.zip", "author": "Stephen Casper;Max Nadeau;Gabriel Kreiman", "authorids": "~Stephen_Casper1;mnadeau@college.harvard.edu;~Gabriel_Kreiman1", "gender": "M;;M", "homepage": "https://stephencasper.com/;;http://klab.tch.harvard.edu", "dblp": "255/5295.html;;12/1367", "google_scholar": "N4aglP4AAAAJ;;WxZ_6nsAAAAJ", "orcid": "0000-0003-0084-1937;;0000-0003-3505-8475", "linkedin": ";;kreiman/", "or_profile": "~Stephen_Casper1;mnadeau@college.harvard.edu;~Gabriel_Kreiman1", "aff": "Massachusetts Institute of Technology;;Harvard Medical School", "aff_domain": "mit.edu;;harvard.edu", "position": "Graduate Student;;Full Professor", "bibtex": "@misc{\ncasper2022one,\ntitle={One Thing to Fool them All: Generating Interpretable, Universal, and Physically-Realizable Adversarial Features},\nauthor={Stephen Casper and Max Nadeau and Gabriel Kreiman},\nyear={2022},\nurl={https://openreview.net/forum?id=9dn7CjyTFoS}\n}", "github": "", "project": "", "reviewers": "zy7u;6FCU;k6Su;1mBx", "site": "https://openreview.net/forum?id=9dn7CjyTFoS", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;4", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "62;51;63;81", "wc_summary_review": "34;49;18;47", "wc_main_review": "833;201;368;152", "wc_review": "929;301;449;280", "wc_reply_reviewers": "0;40;30;30", "wc_reply_authors": "839;144;493;75", "reply_reviewers": "0;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 64.25, 10.755812382149477 ], "wc_summary_review_avg": [ 37.0, 12.389511693363866 ], "wc_main_review_avg": [ 388.5, 268.83498656238925 ], "wc_review_avg": [ 489.75, 261.8314104533679 ], "wc_reply_reviewers_avg": [ 25.0, 15.0 ], "wc_reply_authors_avg": [ 387.75, 304.930627356453 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9520058058739532626&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University", "aff_unique_dep": ";Medical School", "aff_unique_url": "https://web.mit.edu;https://hms.harvard.edu", "aff_unique_abbr": "MIT;HMS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Boston", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "9gz8qakpyhG", "title": "Test-time Batch Statistics Calibration for Covariate Shift", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks suffer a clear performance degradation when applied to an unseen environment due to covariate shift.
Conventional approaches like domain adaptation require pre-collected target data for iterative training, which is impractical in real-world applications. In this paper, we propose to adapt the deep models to the novel environment during inference. A previous solution is test time normalization, which substitutes the source statistics in BN layers with the target batch statistics. However, we show that test time normalization may potentially deteriorate the discriminative structures due to the mismatch between target batch statistics and source parameters. To this end, we present a general formulation $\\alpha$-BN to calibrate the batch statistics by mixing up the source and target statistics, both alleviating the domain shift and preserving the discriminative structures. Based on $\\alpha$-BN, we further present a novel loss function to form a unified test time adaptation framework Core, which performs online optimization of pairwise class correlations. Extensive experiments show that our approaches achieve state-of-the-art performance on twelve datasets in total from three topics, including model robustness to corruptions, domain generalization on image classification, and semantic segmentation. Particularly, our $\\alpha$-BN improves performance from 28.4\\% to 43.9\\% on GTA5 $\\rightarrow$ Cityscapes without any training, even outperforming the latest source-free domain adaptation method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "fuming you;Jingjing Li;Zhou Zhao", "authorids": "~fuming_you2;~Jingjing_Li1;~Zhou_Zhao2", "gender": "M;;M", "homepage": ";https://lijin118.github.io/;https://dblp.uni-trier.de/pid/75/7785.html?", "dblp": "277/1388;65/4699-1;75/7785", "google_scholar": "W6Nf_CAAAAAJ;https://scholar.google.ca/citations?view_op=list_works;https://scholar.google.com.hk/citations?user=IIoFY90AAAAJ", "orcid": ";;0000-0001-6121-0384", "linkedin": ";;", "or_profile": "~fuming_you2;~Jingjing_Li1;~Zhou_Zhao2", "aff": "University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;Zhejiang University", "aff_domain": "uestc.edu.cn;uestc.edu.cn;zju.edu.cn", "position": "Undergrad student;Full Professor;Associate Professor", "bibtex": "@misc{\nyou2022testtime,\ntitle={Test-time Batch Statistics Calibration for Covariate Shift},\nauthor={fuming you and Jingjing Li and Zhou Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=9gz8qakpyhG}\n}", "github": "", "project": "", "reviewers": "H2Ln;A9FS;8DqH;cBnx", "site": "https://openreview.net/forum?id=9gz8qakpyhG", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;5", "correctness": "3;3;2;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "94;76;49;56", "wc_summary_review": "162;97;155;32", "wc_main_review": "583;214;540;374", "wc_review": "839;387;744;462", "wc_reply_reviewers": "132;0;78;0", "wc_reply_authors": "1266;470;1360;725", "reply_reviewers": "1;0;2;0", "reply_authors": "3;2;3;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 17.62632973707232 ], "wc_summary_review_avg": [ 111.5, 52.376044142336674 ], "wc_main_review_avg": [ 427.75, 146.01434004918832 ], "wc_review_avg": [ 608.0, 188.42372462086615 ], "wc_reply_reviewers_avg": [ 52.5, 55.863673348608216 ], "wc_reply_authors_avg": [ 955.25,
370.42905866035943 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11468043647659657155&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Electronic Science and Technology of China;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uestc.edu.cn;https://www.zju.edu.cn", "aff_unique_abbr": "UESTC;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "PoNet: Pooling Network for Efficient Token Mixing in Long Sequences", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6219", "id": "9jInD9JjicF", "poster": "", "openreview": "https://openreview.net/forum?id=9jInD9JjicF", "slides": "https://iclr.cc/virtual/2022/poster/6219", "video": "https://iclr.cc/virtual/2022/poster/6219", "author_site": "Chao-Hong Tan, Qian Chen, Wen Wang, Qinglin Zhang, Siqi Zheng, Zhen-Hua Ling", "tldr": "", "abstract": "Transformer-based models have achieved great success in various NLP, vision, and speech tasks. However, the core of Transformer, the self-attention mechanism, has a quadratic time and memory complexity with respect to the sequence length, which hinders applications of Transformer-based models to long sequences. Many approaches have been proposed to mitigate this problem, such as sparse attention mechanisms, low-rank matrix approximations and scalable kernels, and token mixing alternatives to self-attention. We propose a novel Pooling Network (PoNet) for token mixing in long sequences with linear complexity. We design multi-granularity pooling and pooling fusion to capture different levels of contextual information and combine their interactions with tokens. On the Long Range Arena benchmark, PoNet significantly outperforms Transformer and achieves competitive accuracy, while being only slightly slower than the fastest model, FNet, across all sequence lengths measured on GPUs. We also conduct systematic studies on the transfer learning capability of PoNet and observe that PoNet achieves 95.7 percent of the accuracy of BERT on the GLUE benchmark, outperforming FNet by 4.5 percent relative. 
Comprehensive ablation analysis demonstrates the effectiveness of the designed multi-granularity pooling and pooling fusion for token mixing in long sequences and the efficacy of the designed pre-training tasks for PoNet to learn transferable contextualized language representations.", "keywords": "Transformer;Efficient Transformers;Token Mixing;Pooling;Linear;Long Range Arena;Transfer Learning;BERT;GLUE", "primary_area": "", "supplementary_material": "", "author": "Chao-Hong Tan;Qian Chen;Wen Wang;Qinglin Zhang;Siqi Zheng;Zhen-Hua Ling", "authorids": "~Chao-Hong_Tan1;~Qian_Chen1;~Wen_Wang6;~Qinglin_Zhang1;~Siqi_Zheng1;~Zhen-Hua_Ling1", "gender": ";M;;M;M;M", "homepage": ";https://scholar.google.com/citations?user=8eosmSQAAAAJ&hl=en;https://scholar.google.com/citations?user=85Tj1OwAAAAJ&hl=en;;;http://staff.ustc.edu.cn/~zhling/", "dblp": "282/0435;11/1394-3;29/4680-1;67/4963;;70/5210", "google_scholar": "FkWdcrcAAAAJ;8eosmSQAAAAJ;85Tj1OwAAAAJ;6Q7NBaEAAAAJ;https://scholar.google.com.hk/citations?user=BsrS95gAAAAJ;f8jRR3EAAAAJ", "orcid": ";0000-0001-6939-7438;0000-0002-0356-1968;;;", "linkedin": ";;wen-wang-414b548/;;;", "or_profile": "~Chao-Hong_Tan1;~Qian_Chen1;~Wen_Wang6;~Qinglin_Zhang1;~Siqi_Zheng1;~Zhen-Hua_Ling1", "aff": "University of Science and Technology of China;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;ustc.edu.cn", "position": "PhD student;Researcher;Senior Staff Algorithm Engineer;Researcher;Researcher;Professor", "bibtex": "@inproceedings{\ntan2022ponet,\ntitle={PoNet: Pooling Network for Efficient Token Mixing in Long Sequences},\nauthor={Chao-Hong Tan and Qian Chen and Wen Wang and Qinglin Zhang and Siqi Zheng and Zhen-Hua Ling},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9jInD9JjicF}\n}", "github": "", "project": "", "reviewers": "ssBH;Xvc7;vM28;zcvh", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "60;126;222;59", "wc_summary_review": "51;25;129;62", "wc_main_review": "149;392;160;127", "wc_review": "260;543;511;248", "wc_reply_reviewers": "0;314;54;0", "wc_reply_authors": "1851;4518;1459;618", "reply_reviewers": "0;1;2;0", "reply_authors": "3;7;3;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 116.75, 66.55589755987069 ], "wc_summary_review_avg": [ 66.75, 38.36909563698368 ], "wc_main_review_avg": [ 207.0, 107.46860006532141 ], "wc_review_avg": [ 390.5, 137.0337549657018 ], "wc_reply_reviewers_avg": [ 92.0, 130.05383500689243 ], "wc_reply_authors_avg": [ 2111.5, 1459.0580009033226 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.5, 2.179449471770337 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12721480032939252557&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=9jInD9JjicF", "email":
"ustc.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;ustc.edu.cn", "author_num": 6, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "University of Science and Technology of China;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "USTC;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Unified Visual Transformer Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6958", "id": "9jsZiUgkCZP", "poster": "", "openreview": "https://openreview.net/forum?id=9jsZiUgkCZP", "slides": "https://iclr.cc/virtual/2022/poster/6958", "video": "https://iclr.cc/virtual/2022/poster/6958", "author_site": "Shixing Yu, Tianlong Chen, Jiayi Shen, Huan Yuan, Jianchao Tan, Sen Yang, Ji Liu, Zhangyang Wang", "tldr": "", "abstract": "Vision transformers (ViTs) have gained popularity recently. Even without customized image operators such as convolutions, ViTs can yield competitive performance when properly trained on massive data. However, the computational overhead of ViTs remains prohibitive, due to stacking multi-head self-attention modules and else. Compared to the vast literature and prevailing success in compressing convolutional neural networks, the study of Vision Transformer compression has also just emerged, and existing works focused on one or two aspects of compression. This paper proposes a unified ViT compression framework that seamlessly assembles three effective techniques: pruning, layer skipping, and knowledge distillation. We formulate a budget-constrained, end-to-end optimization framework, targeting jointly learning model weights, layer-wise pruning ratios/masks, and skip configurations, under a distillation loss. The optimization problem is then solved using the primal-dual algorithm. Experiments are conducted with several ViT variants, e.g. DeiT and T2T-ViT backbones on the ImageNet dataset, and our approach consistently outperforms recent competitors. For example, DeiT-Tiny can be trimmed down to 50\\% of the original FLOPs almost without losing accuracy. 
Codes are available online:~\\url{https://github.com/VITA-Group/UVC}.", "keywords": "Vision Transformer;Model Compression;Pruning;Layer Skipping;Distillation", "primary_area": "", "supplementary_material": "", "author": "Shixing Yu;Tianlong Chen;Jiayi Shen;Huan Yuan;Jianchao Tan;Sen Yang;Ji Liu;Zhangyang Wang", "authorids": "~Shixing_Yu1;~Tianlong_Chen1;~Jiayi_Shen1;yuanhuan9412@163.com;~Jianchao_Tan1;~Sen_Yang4;~Ji_Liu1;~Zhangyang_Wang1", "gender": "M;M;;;M;M;M;M", "homepage": "https://billysx.github.io/;https://tianlong-chen.github.io;https://jiayishen.netlify.app/;;https://jianchaotan.github.io/;http://senyang.info;http://jiliu-ml.org;https://vita-group.github.io", "dblp": ";;;;165/9938;90/4655-4.html;51/4433-2.html;119/4026", "google_scholar": ";LE3ctn0AAAAJ;;;1Gywy80AAAAJ;zit9APUAAAAJ;RRzVwKkAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0001-7774-8197;;;;0009-0000-1472-5537;;", "linkedin": "%E4%B8%96%E5%85%B4-%E4%BA%8E-029401182/;tianlong-chen-783862167/;;;jianchao-tan-b58a96a7/;;;", "or_profile": "~Shixing_Yu1;~Tianlong_Chen1;~Jiayi_Shen1;yuanhuan9412@163.com;~Jianchao_Tan1;~Sen_Yang4;~Ji_Liu1;~Zhangyang_Wang1", "aff": "Cornell University;University of Texas, Austin;Texas A&M;;Kuaishou;Kuaishou Technology;Meta Facebook;University of Texas, Austin", "aff_domain": "cornell.edu;utexas.edu;tamu.edu;;kuaishou.com;kuaishou.com;facebook.com;utexas.edu", "position": "PhD student;PhD student;PhD student;;Researcher;Principal Researcher;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nyu2022unified,\ntitle={Unified Visual Transformer Compression},\nauthor={Shixing Yu and Tianlong Chen and Jiayi Shen and Huan Yuan and Jianchao Tan and Sen Yang and Ji Liu and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9jsZiUgkCZP}\n}", "github": "", "project": "", "reviewers": "iRBq;K4Gq;UdwX", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "5;4;4", "correctness": "2;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;4;4", "wc_summary_paper": "74;18;35", "wc_summary_review": "49;43;32", "wc_main_review": "157;336;474", "wc_review": "280;397;541", "wc_reply_reviewers": "0;0;32", "wc_reply_authors": "1437;1509;1258", "reply_reviewers": "0;0;1", "reply_authors": "5;4;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 42.333333333333336, 23.442601296689656 ], "wc_summary_review_avg": [ 41.333333333333336, 7.039570693980959 ], "wc_main_review_avg": [ 322.3333333333333, 129.7750189965525 ], "wc_review_avg": [ 406.0, 106.74268124794318 ], "wc_reply_reviewers_avg": [ 10.666666666666666, 15.084944665313014 ], "wc_reply_authors_avg": [ 1401.3333333333333, 105.52830057488003 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 4.0, 0.816496580927726 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.7559289460184544, "corr_recommendation_correctness": 0.7559289460184546, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1947517498926990042&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=9jsZiUgkCZP", "email": 
"cornell.edu;utexas.edu;tamu.edu;;kuaishou.com;kuaishou.com;facebook.com;utexas.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;3;4;1", "aff_unique_norm": "Cornell University;University of Texas at Austin;Texas A&M University;Kuaishou Technology;Meta", "aff_unique_dep": ";;;;Meta Platforms, Inc.", "aff_unique_url": "https://www.cornell.edu;https://www.utexas.edu;https://www.tamu.edu;https://www.kuaishou.com;https://meta.com", "aff_unique_abbr": "Cornell;UT Austin;TAMU;Kuaishou;Meta", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;1;1;0;0", "aff_country_unique": "United States;China" }, { "id": "9kBDWEmA6i", "title": "When high-performing models behave poorly in practice: periodic sampling can help", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training a deep neural network (DNN) for breast cancer detection from medical images suffers from the (hopefully) low prevalence of the pathology.\nFor a sensible amount of positive cases, images must be collected from numerous places resulting in large heterogeneous datasets with different acquisition devices, populations, cancer incidences.\nWithout precaution, this heterogeneity may result in a DNN biased by latent variables a priori independent of the pathology.\nThis may be dramatic if this DNN is used inside a software to help radiologists to detect cancers.\nThis work mitigates this issue by acting on how mini-batches for Stochastic Gradient Descent (SGD) algorithms are constructed.\nThe dataset is divided into homogeneous subsets sharing some attributes (\\textit{e.g.} acquisition device, source) called Data Segments (DSs).\nBatches are built by sampling each DS periodically with a frequency proportional to the rarest label in the DS and by simultaneously preserving an overall balance between positive and negative labels within the batch.\nPeriodic sampling is compared to balanced sampling (equal amount of labels within a batch, independently of DS) and to balanced sampling within DS (equal amount of labels within a batch and each DS).\nWe show, on breast cancer prediction from mammography images of various devices and origins, that periodic sampling leads to better generalization than other sampling strategies.", "keywords": "Periodic sampling;deep learning;computer vision;mammography", "primary_area": "", "supplementary_material": "", "author": "Stanislas Chambon;Julien GUILLAUMIN;Luis Montero;Yaroslav Nikulin;Paul Wambergue;Pierre Fillard", "authorids": "~Stanislas_Chambon1;~Julien_GUILLAUMIN1;~Luis_Montero1;~Yaroslav_Nikulin1;~Paul_Wambergue1;~Pierre_Fillard3", "gender": ";;M;M;;M", "homepage": ";https://github.com/JGuillaumin;;https://www.linkedin.com/in/yaroslav-nikulin/;https://fr.linkedin.com/in/paulwambergue;", "dblp": ";;;;;", "google_scholar": ";;;;;rb-PARAAAAAJ", "orcid": ";;;;;0000-0002-2848-6394", "linkedin": ";;luis-montero-/;;;", "or_profile": "~Stanislas_Chambon1;~Julien_GUILLAUMIN1;~Luis_Montero1;~Yaroslav_Nikulin1;~Paul_Wambergue1;~Pierre_Fillard3", "aff": ";;;Therapixel;Therapixel;Therapixel", "aff_domain": ";;;therapixel.com;therapixel.com;therapixel.com", "position": ";;;Senior Research Scientist;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nchambon2022when,\ntitle={When high-performing models behave poorly in practice: periodic sampling can help},\nauthor={Stanislas Chambon and Julien GUILLAUMIN and Luis Montero and Yaroslav Nikulin and Paul Wambergue and Pierre Fillard},\nbooktitle={Submitted to The Tenth International Conference 
on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=9kBDWEmA6i},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "G6RS;KyWh;ycfR;Je4d;nScu", "site": "https://openreview.net/forum?id=9kBDWEmA6i", "pdf_size": 0, "recommendation": "3;5;5;5;6", "confidence": "4;4;4;4;4", "correctness": "3;4;4;2;4", "technical_novelty": "1;2;3;2;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "67;172;59;95;336", "wc_summary_review": "70;85;50;69;187", "wc_main_review": "168;516;309;475;354", "wc_review": "305;773;418;639;877", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.4, 0.8 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 145.8, 103.13757802081645 ], "wc_summary_review_avg": [ 92.2, 48.684289046878355 ], "wc_main_review_avg": [ 364.4, 124.06869065159027 ], "wc_review_avg": [ 602.4, 213.68163234120055 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3572172541558801, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15076103384217638329&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Therapixel", "aff_unique_dep": "", "aff_unique_url": "https://www.therapixel.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Huber Additive Models for Non-stationary Time Series Analysis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6601", "id": "9kpuB2bgnim", "poster": "", "openreview": "https://openreview.net/forum?id=9kpuB2bgnim", "slides": "https://iclr.cc/virtual/2022/poster/6601", "video": "https://iclr.cc/virtual/2022/poster/6601", "author_site": "Yingjie Wang, Xianrui Zhong, Fengxiang He, Hong Chen, Dacheng Tao", "tldr": "", "abstract": "Sparse additive models have shown promising flexibility and interpretability in processing time series data. However, existing methods usually assume the time series data to be stationary and the innovation is sampled from a Gaussian distribution. Both assumptions are too stringent for heavy-tailed and non-stationary time series data that frequently arise in practice, such as finance and medical fields. To address these problems, we propose an adaptive sparse Huber additive model for robust forecasting in both non-Gaussian data and (non)stationary data. In theory, the generalization bounds of our estimator are established for both stationary and nonstationary time series data, which are independent of the widely used mixing conditions in learning theory of dependent observations. Moreover, the error bound for non-stationary time series contains a discrepancy measure for the shifts of the data distributions over time. Such a discrepancy measure can be estimated empirically and used as a penalty in our method. Experimental results on both synthetic and real-world benchmark datasets validate the effectiveness of the proposed method.
The code is available at https://github.com/xianruizhong/SpHAM.", "keywords": "Sparse additive models;variable selection;Huber;non-stationary;robust forecasting", "primary_area": "", "supplementary_material": "", "author": "Yingjie Wang;Xianrui Zhong;Fengxiang He;Hong Chen;Dacheng Tao", "authorids": "~Yingjie_Wang1;~Xianrui_Zhong1;~Fengxiang_He1;~Hong_Chen1;~Dacheng_Tao1", "gender": "M;M;;;", "homepage": "https://www.researchgate.net/profile/Yingjie-Wang-37;https://xianruizhong.github.io;https://fengxianghe.github.io/;https://chenhongml.github.io/;", "dblp": "33/6297-7;326/7308;225/4682;https://dblp.uni-trier.de/pers/hd/c/Chen_0004:Hong;", "google_scholar": "https://scholar.google.com/citations?hl=en;F8izpj0AAAAJ;QSx-Yu0AAAAJ;;", "orcid": ";0000-0002-2100-6474;;;", "linkedin": ";xianrui-zhong/;fengxiang-he-35b173122;;", "or_profile": "~Yingjie_Wang1;~Xianrui_Zhong1;~Fengxiang_He1;~Hong_Chen1;~Dacheng_Tao1", "aff": ";University of Illinois, Urbana Champaign;JD.com, Inc.;Huazhong Agricultural University;", "aff_domain": ";illinois.edu;jd.com;hzau.edu.cn;", "position": ";Undergrad student;Algorithm Scientist;Full Professor;", "bibtex": "@inproceedings{\nwang2022huber,\ntitle={Huber Additive Models for Non-stationary Time Series Analysis},\nauthor={Yingjie Wang and Xianrui Zhong and Fengxiang He and Hong Chen and Dacheng Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9kpuB2bgnim}\n}", "github": "", "project": "", "reviewers": "4Rek;es4A;KeVD;3C4P", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;2;3;2", "correctness": "3;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;0;2;3", "wc_summary_paper": "102;151;41;86", "wc_summary_review": "45;15;46;20", "wc_main_review": "301;99;331;264", "wc_review": "448;265;418;370", "wc_reply_reviewers": "137;0;0;0", "wc_reply_authors": "2499;781;1540;1124", "reply_reviewers": "1;0;0;0", "reply_authors": "5;1;3;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 95.0, 39.31284777270657 ], "wc_summary_review_avg": [ 31.5, 14.115594213493104 ], "wc_main_review_avg": [ 248.75, 89.655939568999 ], "wc_review_avg": [ 375.25, 69.46716850426537 ], "wc_reply_reviewers_avg": [ 34.25, 59.322740159234044 ], "wc_reply_authors_avg": [ 1486.0, 643.6524683398643 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6715135542145661993&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=9kpuB2bgnim", "email": ";illinois.edu;jd.com;hzau.edu.cn;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;JD.com;Huazhong Agricultural University", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.jd.com;http://www.hzau.edu.cn/", "aff_unique_abbr": "UIUC;JD.com;HAU", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "id": "9mls_1dBQS", "title": "Model-based Reinforcement 
Learning with Ensembled Model-value Expansion", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Model-based reinforcement learning (MBRL) methods are often more data-efficient and quicker to converge than their model-free counterparts, but typically rely crucially on accurate modeling of the environment dynamics and associated uncertainty in order to perform well. Recent approaches have used ensembles of dynamics models within MBRL to separately capture aleatoric and epistemic uncertainty of the learned dynamics, but many MBRL algorithms are still limited because they treat these dynamics models as a \"black box\" without fully exploiting the uncertainty modeling.\nIn this paper, we propose a simple but effective approach to improving the performance of MBRL by directly incorporating the ensemble prediction \\emph{into} the RL method itself: we propose constructing multiple value roll-outs using different members of the dynamics ensemble, and aggregating the separate estimates to form a joint estimate of the state value. Despite its simplicity, we show that this method substantially improves the performance of MBRL methods: we comprehensively evaluate this technique on common locomotion benchmarks, with ablative experiments to show the added value of our proposed components.", "keywords": "MBRL;Model-based Reinforcement Learning;Ensemble;Dynamics;Neural Network;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/86a26434119e66dca5c9fbd24c95243a8f392627.zip", "author": "Gaurav Manek;J Zico Kolter", "authorids": "~Gaurav_Manek1;~J_Zico_Kolter1", "gender": ";M", "homepage": "https://www.gauravmanek.com/;http://www.zicokolter.com", "dblp": "200/8866;67/2526", "google_scholar": "C8Mdr2UAAAAJ;UXh1I6UAAAAJ", "orcid": ";", "linkedin": "https://sg.linkedin.com/in/gauravmanek;", "or_profile": "~Gaurav_Manek1;~Zico_Kolter1", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nmanek2022modelbased,\ntitle={Model-based Reinforcement Learning with Ensembled Model-value Expansion},\nauthor={Gaurav Manek and J Zico Kolter},\nyear={2022},\nurl={https://openreview.net/forum?id=9mls_1dBQS}\n}", "github": "", "project": "", "reviewers": "EueN;P6yV;t2eH;DYrX", "site": "https://openreview.net/forum?id=9mls_1dBQS", "pdf_size": 0, "recommendation": "1;1;1;3", "confidence": "5;4;4;4", "correctness": "1;1;4;3", "technical_novelty": "1;2;1;1", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "55;80;54;46", "wc_summary_review": "100;55;24;23", "wc_main_review": "774;731;194;94", "wc_review": "929;866;272;163", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "130;187;74;51", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 1.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 1.299038105676658 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 58.75, 12.754901018824098 ], "wc_summary_review_avg": [ 50.5, 31.34086788842964 ], "wc_main_review_avg": [ 448.25, 306.6743998119178 ], "wc_review_avg": [ 557.5, 342.9012248447066 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 110.5, 52.690131903421914 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, 
"corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QZrxNnG03DEJ:scholar.google.com/&scioq=Model-based+Reinforcement+Learning+with+Ensembled+Model-value+Expansion&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Plant 'n' Seek: Can You Find the Winning Ticket?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7073", "id": "9n9c8sf0xm", "poster": "", "openreview": "https://openreview.net/forum?id=9n9c8sf0xm", "slides": "https://iclr.cc/virtual/2022/poster/7073", "video": "https://iclr.cc/virtual/2022/poster/7073", "author_site": "Jonas Fischer, Rebekka Burkholz", "tldr": "", "abstract": "The lottery ticket hypothesis has sparked the rapid development of pruning algorithms that aim to reduce the computational costs associated with deep learning during training and model deployment. Currently, such algorithms are primarily evaluated on imaging data, for which we lack ground truth information and thus the understanding of how sparse lottery tickets could be. To fill this gap, we develop a framework that allows us to plant and hide winning tickets with desirable properties in randomly initialized neural networks. To analyze the ability of state-of-the-art pruning to identify tickets of extreme sparsity, we design and hide such tickets solving four challenging tasks. In extensive experiments, we observe similar trends as in imaging studies, indicating that our framework can provide transferable insights into realistic problems. Additionally, we can now see beyond such relative trends and highlight limitations of current pruning methods. Based on our results, we conclude that the current limitations in ticket sparsity are likely of algorithmic rather than fundamental nature. 
We anticipate that comparisons to planted tickets will facilitate future developments of efficient pruning algorithms.", "keywords": "lottery tickets;ground truth;planting;LTH", "primary_area": "", "supplementary_material": "/attachment/f014e12cb4e7b4689e497c94223bb971eedd6287.zip", "author": "Jonas Fischer;Rebekka Burkholz", "authorids": "~Jonas_Fischer1;~Rebekka_Burkholz1", "gender": ";F", "homepage": ";https://sites.google.com/view/rebekkaburkholz/startseite", "dblp": ";194/3172", "google_scholar": ";https://scholar.google.ch/citations?user=vkWBb2wAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jonas_Fischer1;~Rebekka_Burkholz1", "aff": ";Helmholtz Center CISPA for Information Security", "aff_domain": ";cispa.saarland", "position": ";Associate Professor", "bibtex": "@inproceedings{\nfischer2022plant,\ntitle={Plant 'n' Seek: Can You Find the Winning Ticket?},\nauthor={Jonas Fischer and Rebekka Burkholz},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9n9c8sf0xm}\n}", "github": "", "project": "", "reviewers": "RZKn;2wJr;HJLz;vW7Q;kf7j", "pdf_size": 0, "recommendation": "5;5;6;6;6", "confidence": "3;3;3;3;4", "correctness": "3;3;4;4;4", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "2;2;4;2;3", "wc_summary_paper": "46;42;222;43;150", "wc_summary_review": "30;13;183;33;171", "wc_main_review": "351;482;649;293;383", "wc_review": "427;537;1054;369;704", "wc_reply_reviewers": "139;62;278;0;0", "wc_reply_authors": "862;1040;826;484;464", "reply_reviewers": "1;1;1;0;0", "reply_authors": "3;3;2;1;1", "recommendation_avg": [ 5.6, 0.48989794855663565 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 100.6, 73.36375126723006 ], "wc_summary_review_avg": [ 86.0, 74.71010641138186 ], "wc_main_review_avg": [ 431.6, 124.79679483063657 ], "wc_review_avg": [ 618.2, 245.9816253300234 ], "wc_reply_reviewers_avg": [ 95.8, 104.46128469437852 ], "wc_reply_authors_avg": [ 735.2, 225.33388560090114 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.0, 0.8944271909999159 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.408248290463863, "corr_recommendation_correctness": 1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17211251383675114459&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "pdf": "https://openreview.net/pdf?id=9n9c8sf0xm", "email": ";cispa.saarland", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Helmholtz Center CISPA", "aff_unique_dep": "Information Security", "aff_unique_url": "https://www.cispa.de/", "aff_unique_abbr": "CISPA", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Multi-Task Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6487", "id": "9otKVlgrpZG", "poster": "", "openreview": "https://openreview.net/forum?id=9otKVlgrpZG", "slides": "https://iclr.cc/virtual/2022/poster/6487", "video": "https://iclr.cc/virtual/2022/poster/6487", "author_site": "Donggyun Kim, Seongwoong Cho, Wonkwang Lee, Seunghoon Hong", "tldr": "", "abstract": "Neural Processes (NPs) consider a task as a function realized from a stochastic process and flexibly adapt to unseen tasks through inference on functions. 
However, naive NPs can model data from only a single stochastic process and are designed to infer each task independently. Since many real-world data represent a set of correlated tasks from multiple sources (e.g., multiple attributes and multi-sensor data), it is beneficial to infer them jointly and exploit the underlying correlation to improve the predictive performance.\nTo this end, we propose Multi-Task Neural Processes (MTNPs), an extension of NPs designed to jointly infer tasks realized from multiple stochastic processes. We build MTNPs in a hierarchical way such that inter-task correlation is considered by conditioning all per-task latent variables on a single global latent variable. In addition, we further design our MTNPs so that they can address multi-task settings with incomplete data (i.e., not all tasks share the same set of input points), which has high practical demands in various applications.\nExperiments demonstrate that MTNPs can successfully model multiple tasks jointly by discovering and exploiting their correlations in various real-world data such as time series of weather attributes and pixel-aligned visual modalities. We release our code at https://github.com/GitGyun/multi_task_neural_processes.", "keywords": "stochastic processes;neural processes;multi-task learning;incomplete data", "primary_area": "", "supplementary_material": "", "author": "Donggyun Kim;Seongwoong Cho;Wonkwang Lee;Seunghoon Hong", "authorids": "~Donggyun_Kim1;~Seongwoong_Cho1;~Wonkwang_Lee2;~Seunghoon_Hong2", "gender": ";M;M;M", "homepage": ";https://www.github.com/seongwoongcho;https://www.github.com/1Konny;https://maga33.github.io/", "dblp": ";;256/4988;142/3014.html", "google_scholar": "g_CtB50AAAAJ;;y2p6gTEAAAAJ;hvr3ALkAAAAJ", "orcid": ";;;", "linkedin": "%EB%8F%99%EA%B7%A0-%EA%B9%80-37a890187/;;;seunghoon-hong-194489a4/", "or_profile": "~Donggyun_Kim1;~Seongwoong_Cho1;~Wonkwang_Lee2;~Seunghoon_Hong1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;", "position": "MS student;Undergrad student;MS student;", "bibtex": "@inproceedings{\nkim2022multitask,\ntitle={Multi-Task Processes},\nauthor={Donggyun Kim and Seongwoong Cho and Wonkwang Lee and Seunghoon Hong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9otKVlgrpZG}\n}", "github": "", "project": "", "reviewers": "eyrZ;ACBa;F6YH;GYAF", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "39;21;67;79", "wc_summary_review": "24;11;44;41", "wc_main_review": "353;184;689;375", "wc_review": "416;216;800;495", "wc_reply_reviewers": "254;157;116;22", "wc_reply_authors": "4429;3289;3481;1988", "reply_reviewers": "1;1;3;1", "reply_authors": "9;5;8;4", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 51.5, 22.819947414488052 ], "wc_summary_review_avg": [ 30.0, 13.360389215887388 ], "wc_main_review_avg": [ 400.25, 182.35319437838209 ], "wc_review_avg": [ 481.75, 210.00282736191912 ], "wc_reply_reviewers_avg": [ 137.25, 83.2987845049374 ], "wc_reply_authors_avg": [ 
3296.75, 870.1759520349893 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 6.5, 2.0615528128088303 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9733285267845754, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4585505321678747721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=9otKVlgrpZG", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Embedded-model flows: Combining the inductive biases of model-free deep learning and explicit probabilistic modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6910", "id": "9pEJSVfDbba", "poster": "", "openreview": "https://openreview.net/forum?id=9pEJSVfDbba", "slides": "https://iclr.cc/virtual/2022/poster/6910", "video": "https://iclr.cc/virtual/2022/poster/6910", "author_site": "Gianluigi Silvestri, Emily Fertig, Dave Moore, Luca Ambrogioni", "tldr": "", "abstract": "Normalizing flows have shown great success as general-purpose density estimators. However, many real world applications require the use of domain-specific knowledge, which normalizing flows cannot readily incorporate. We propose embedded-model flows (EMF), which alternate general-purpose transformations with structured layers that embed domain-specific inductive biases. These layers are automatically constructed by converting user-specified differentiable probabilistic models into equivalent bijective transformations. We also introduce gated structured layers, which allow bypassing the parts of the models that fail to capture the statistics of the data. We demonstrate that EMFs can be used to induce desirable properties such as multimodality and continuity. Furthermore, we show that EMFs enable a high performance form of variational inference where the structure of the prior model is embedded in the variational architecture. 
In our experiments, we show that this approach outperforms a large number of alternative methods in common structured inference problems.", "keywords": "Normalizing Flows;Probabilistic model;Probabilistic programming;Generative modeling;Variational Inference", "primary_area": "", "supplementary_material": "/attachment/7be181156cc235233c5e50f654984623293f2e89.zip", "author": "Gianluigi Silvestri;Emily Fertig;Dave Moore;Luca Ambrogioni", "authorids": "~Gianluigi_Silvestri1;~Emily_Fertig1;~Dave_Moore1;~Luca_Ambrogioni1", "gender": "M;F;M;M", "homepage": ";;http://www.cs.berkeley.edu/~dmoore;https://scholar.google.nl/citations?user=J9IABpQAAAAJ&hl=en", "dblp": "217/3077;;133/6838;151/9813", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.no/citations?user=NKbk2KcAAAAJ;;https://scholar.google.nl/citations?user=J9IABpQAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Gianluigi_Silvestri1;~Emily_Fertig1;~Dave_Moore1;~Luca_Ambrogioni1", "aff": "OnePlanet Research Center;Google;Google Research;Radboud University Nijmegen", "aff_domain": "imec.nl;google.com;google.com;ru.nl", "position": "PhD student;Researcher;Research Engineer;Assistant Professor", "bibtex": "@inproceedings{\nsilvestri2022embeddedmodel,\ntitle={Embedded-model flows: Combining the inductive biases of model-free deep learning and explicit probabilistic modeling},\nauthor={Gianluigi Silvestri and Emily Fertig and Dave Moore and Luca Ambrogioni},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9pEJSVfDbba}\n}", "github": "", "project": "", "reviewers": "ihM2;g1yH;2pw8;zyZ8", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "74;94;100;103", "wc_summary_review": "81;25;29;156", "wc_main_review": "593;188;356;825", "wc_review": "748;307;485;1084", "wc_reply_reviewers": "399;0;21;326", "wc_reply_authors": "788;280;224;2111", "reply_reviewers": "2;0;1;3", "reply_authors": "4;1;1;6", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 92.75, 11.299889379989523 ], "wc_summary_review_avg": [ 72.75, 52.8978969336211 ], "wc_main_review_avg": [ 490.5, 240.82825830869598 ], "wc_review_avg": [ 656.0, 292.6986504922768 ], "wc_reply_reviewers_avg": [ 186.5, 178.03721521075306 ], "wc_reply_authors_avg": [ 850.75, 760.055713418431 ], "reply_reviewers_avg": [ 1.5, 1.118033988749895 ], "reply_authors_avg": [ 3.0, 2.1213203435596424 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13875622113438069976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=9pEJSVfDbba", "email": "imec.nl;google.com;google.com;ru.nl", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "OnePlanet Research Center;Google;Radboud University", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.oneplanetresearchcenter.nl;https://www.google.com;https://www.ru.nl/", "aff_unique_abbr": ";Google;RU", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Mountain View;Nijmegen", 
"aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Netherlands;United States" }, { "id": "9poQ2m0R--", "title": "Green CWS: Extreme Distillation and Efficient Decode Method Towards Industrial Application", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Benefiting from the strong ability of the pre-trained model, the research on Chinese Word Segmentation (CWS) has made great progress in recent years. However, due to massive computation, large and complex models are incapable of empowering their ability for industrial use. On the other hand, for low-resource scenarios, the prevalent decode method, such as Conditional Random Field (CRF), fails to exploit the full information of the training data. This work proposes a fast and accurate CWS framework that incorporates a light-weighted model and an upgraded decode method (PCRF) towards industrially low-resource CWS scenarios. First, we distill a Transformer-based student model as an encoder, which not only accelerates the inference speed but also combines open knowledge and domain-specific knowledge. Second, the perplexity score to evaluate the language model is fused into the CRF module to better identify the word boundaries. Experiments show that our work obtains relatively high performance on multiple datasets with as low as 14% of time consumption compared with the original BERT-based model. Moreover, under the low-resource setting, we get superior results in comparison with the traditional decoding methods.", "keywords": "Chinese Word Segmentation;Knowledge Distillation;Decoding", "primary_area": "", "supplementary_material": "", "author": "Yulan Hu;Yong Liu", "authorids": "~Yulan_Hu1;~Yong_Liu7", "gender": "M;M", "homepage": ";https://iie-liuyong.github.io", "dblp": "68/7653;29/4867-18", "google_scholar": ";vVhmzbAAAAAJ", "orcid": ";0000-0002-6739-621X", "linkedin": "%E7%BE%BD%E8%93%9D-%E8%83%A1-78855b15a/?originalSubdomain=cn;", "or_profile": "~Yulan_Hu1;~Yong_Liu7", "aff": "Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nlan2022green,\ntitle={Green {CWS}: Extreme Distillation and Efficient Decode Method Towards Industrial Application},\nauthor={Yulan Hu and Yong Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=9poQ2m0R--}\n}", "github": "", "project": "", "reviewers": "YrQ9;M1Dc;n8pf;kBku", "site": "https://openreview.net/forum?id=9poQ2m0R--", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "72;106;98;71", "wc_summary_review": "27;83;92;20", "wc_main_review": "635;241;499;219", "wc_review": "734;430;689;310", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.75, 15.514106484100203 ], "wc_summary_review_avg": [ 55.5, 32.25290684574028 ], "wc_main_review_avg": [ 398.5, 175.39883123897948 ], "wc_review_avg": [ 540.75, 176.65980725677247 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], 
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:n1EQEYQAcN0J:scholar.google.com/&scioq=Green+CWS:+Extreme+Distillation+and+Efficient+Decode+Method+Towards+Industrial+Application&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "9q3g_5gQbbA", "title": "Towards Understanding Data Values: Empirical Results on Synthetic Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding the influence of data on machine learning models is an emerging research field. Inspired by recent work in data valuation, we perform several experiments to get an intuition for this influence on a multi-layer perceptron. We generate a synthetic two-dimensional data set to visualize how different valuation methods value data points on a mesh grid spanning the relevant feature space. In this setting, individual data values can be derived directly from the impact of the respective data points on the decision boundary. Our results show that the most important data points are the miss-classified ones. Furthermore, despite performance differences on real world data sets, all investigated methods except one qualitatively agree on the data values derived from our experiments. Finally, we place our results into the recent literature and discuss data values and their relationship to other methods.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f3900a5e8356f98cda2569a713c049efacbbd30f.zip", "author": "Danilo Brajovic;Omar De Mitri;Alex Windberger;Marco Huber", "authorids": "~Danilo_Brajovic1;~Omar_De_Mitri1;~Alex_Windberger1;~Marco_Huber2", "gender": ";;M;M", "homepage": ";;;https://www.ipa.fraunhofer.de/ai", "dblp": ";296/3805;;76/4332", "google_scholar": ";;LhjPLPAAAAAJ;https://scholar.google.de/citations?user=SUU9998AAAAJ", "orcid": ";0000-0002-4011-143X;;0000-0002-8250-2092", "linkedin": ";https://de.linkedin.com/in/omar-demitri;robert-alexander-windberger-966b8b141;marco-huber-78a1a151/", "or_profile": "~Danilo_Brajovic1;~Omar_De_Mitri1;~Alex_Windberger1;~Marco_Huber2", "aff": ";Universit\u00e0 del Salento;IDS Imaging Development Systems GmbH;University of Stuttgart", "aff_domain": ";unisalento.it;ids-imaging.com;uni-stuttgart.de", "position": ";PhD student;AI Specialist;Full Professor", "bibtex": "@misc{\nbrajovic2022towards,\ntitle={Towards Understanding Data Values: Empirical Results on Synthetic Data},\nauthor={Danilo Brajovic and Omar De Mitri and Alex Windberger and Marco Huber},\nyear={2022},\nurl={https://openreview.net/forum?id=9q3g_5gQbbA}\n}", "github": "", "project": "", "reviewers": "hZMc;roaw;vtgh;Dtr6", "site": "https://openreview.net/forum?id=9q3g_5gQbbA", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;5;4;3", "correctness": "2;3;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "86;20;87;51", "wc_summary_review": "62;91;45;46", "wc_main_review": "185;187;154;169", "wc_review": "333;298;286;266", "wc_reply_reviewers": "51;0;0;0", "wc_reply_authors": "515;433;0;45", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;0;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 
0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.0, 27.75788176356402 ], "wc_summary_review_avg": [ 61.0, 18.587630295441105 ], "wc_main_review_avg": [ 173.75, 13.36740438529485 ], "wc_review_avg": [ 295.75, 24.355440870573457 ], "wc_reply_reviewers_avg": [ 12.75, 22.083647796503186 ], "wc_reply_authors_avg": [ 248.25, 228.15934672942944 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:d2Wq-h4mkAoJ:scholar.google.com/&scioq=Towards+Understanding+Data+Values:+Empirical+Results+on+Synthetic+Data&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e0 del Salento;IDS Imaging Development Systems GmbH;University of Stuttgart", "aff_unique_dep": ";;", "aff_unique_url": "https://www.unisalento.it;https://www.ids-imaging.com;https://www.uni-stuttgart.de", "aff_unique_abbr": "UNISALENTO;IDS;USTuttgart", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Italy;Germany" }, { "id": "9qKAGxS1Tq2", "title": "From SCAN to Real Data: Systematic Generalization via Meaningful Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Humans can systematically generalize to novel compositions of existing concepts. There has been extensive conjecture about the extent to which neural networks can do the same. Recent arguments supported by evidence on the SCAN dataset claim that neural networks are inherently ineffective in such cognitive capacity. In this paper, we revisit systematic generalization from the perspective of meaningful learning, an exceptional capability of humans to learn new concepts by connecting them with other previously known knowledge. We propose to augment a training dataset in either an inductive or deductive manner to build semantic links between new and old concepts. Our observations on SCAN suggest that, following the meaningful learning principle, modern sequence-to-sequence models, including RNNs, CNNs, and Transformers, can successfully generalize to compositions of new concepts. We further validate our findings on two real-world datasets on semantic parsing, and consistent compositional generalization is also observed. Moreover, our experiments demonstrate that both prior knowledge and semantic linking play a key role in achieving systematic generalization. Meanwhile, inductive learning generally works better than deductive learning in our experiments. Finally, we provide an explanation for data augmentation techniques by categorizing them as either inductive-based or deductive-based meaningful learning.
We hope our findings will encourage further exploration of existing neural networks' potential for systematic generalization through more advanced learning schemes.", "keywords": "systematic generalization;meaningful learning;inductive learning;deductive learning;data augmentation", "primary_area": "", "supplementary_material": "/attachment/a51dd77882e88f9098a09a7210c94ed7ef3a4902.zip", "author": "Ning Shi;Boxin Wang;Wei Wang;Xiangyu Liu;Rong Zhang;Hui Xue';Xinbing Wang;Zhouhan Lin", "authorids": "~Ning_Shi1;~Boxin_Wang1;~Wei_Wang54;~Xiangyu_Liu3;~Rong_Zhang2;~Hui_Xue'1;~Xinbing_Wang1;~Zhouhan_Lin1", "gender": "M;;M;M;M;M;M;M", "homepage": "https://sites.google.com/ualberta.ca/shining;https://wbx.life;;;;http://www.alibaba.com;http://www.cs.sjtu.edu.cn/~wang-xb/;https://hantek.github.io", "dblp": "67/3378;236/6319;35/7092;;13/5366-2;;96/1149.html;121/7919.html", "google_scholar": "qaqVNMQAAAAJ;YOf2ATIAAAAJ;https://scholar.google.com/citations?hl=en;TLx5GG4AAAAJ;;;https://scholar.google.com.tw/citations?user=CT5yZbwAAAAJ;https://scholar.google.ca/citations?user=LNZ4efwAAAAJ", "orcid": ";;;;;;0000-0002-0357-8356;0009-0009-7204-0689", "linkedin": "stshining/;;%E7%BB%B4-%E6%B1%AA-028818131/;;;;;https://ca.linkedin.com/in/zhouhan-lin-34b98975", "or_profile": "~Ning_Shi1;~Boxin_Wang1;~Wei_Wang54;~Xiangyu_Liu3;~Rong_Zhang2;~Hui_Xue'1;~Xinbing_Wang1;~Zhouhan_Lin1", "aff": "Georgia Institute of Technology;NVIDIA;;Alibaba Group;;Alibaba Group;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "gatech.edu;nvidia.com;;alibaba.com;;alibaba-inc.com;cs.sjtu.edu.cn;sjtu.edu.cn", "position": "MS student;Research Intern;;Researcher;;Principal Researcher;Full Professor;Assistant Professor", "bibtex": "@misc{\nshi2022from,\ntitle={From {SCAN} to Real Data: Systematic Generalization via Meaningful Learning},\nauthor={Ning Shi and Boxin Wang and Wei Wang and Xiangyu Liu and Rong Zhang and Hui Xue' and Xinbing Wang and Zhouhan Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=9qKAGxS1Tq2}\n}", "github": "", "project": "", "reviewers": "NJJw;yQN2;LcKh;PBxU", "site": "https://openreview.net/forum?id=9qKAGxS1Tq2", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;4;4;2", "correctness": "4;3;3;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "60;55;48;99", "wc_summary_review": "29;29;25;54", "wc_main_review": "441;330;219;176", "wc_review": "530;414;292;329", "wc_reply_reviewers": "0;50;0;0", "wc_reply_authors": "736;671;381;477", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 65.5, 19.80530232033836 ], "wc_summary_review_avg": [ 34.25, 11.519006033508273 ], "wc_main_review_avg": [ 291.5, 102.99150450401237 ], "wc_review_avg": [ 391.25, 91.50785485410529 ], "wc_reply_reviewers_avg": [ 12.5, 21.650635094610966 ], "wc_reply_authors_avg": [ 566.25, 143.23996474448043 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": -1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10648692548614332612&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0,
"aff_unique_index": "0;1;2;2;3;3", "aff_unique_norm": "Georgia Institute of Technology;NVIDIA;Alibaba Group;Shanghai Jiao Tong University", "aff_unique_dep": ";NVIDIA Corporation;;", "aff_unique_url": "https://www.gatech.edu;https://www.nvidia.com;https://www.alibaba.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "Georgia Tech;NVIDIA;Alibaba;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1;1", "aff_country_unique": "United States;China" }, { "id": "9r4_7GxTLnS", "title": "PolyViT: Co-training Vision Transformers on Images, Videos and Audio", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Can we train a single transformer model capable of processing multiple modalities and datasets, whilst sharing the majority of its learnable parameters? We present PolyViT, a model trained on image, audio and video which answers this question. By co-training different tasks on a single modality we are able to improve the accuracy of each individual task and achieve state-of-the-art results on 5 standard video- and audio-classification datasets. Co-training PolyViT on multiple modalities and tasks leads to a model that is even more parameter-efficient, and learns representations that generalize across multiple domains. Finally, we show that co-training is simple and practical to implement, as we do not need to tune hyperparameters for each combination of datasets, but can simply adapt those from standard, single-task training.", "keywords": "transformers;multi-task learning;image classification;video;audio;co-training", "primary_area": "", "supplementary_material": "", "author": "Valerii Likhosherstov;Mostafa Dehghani;Anurag Arnab;Krzysztof Marcin Choromanski;Mario Lucic;Yi Tay;Adrian Weller", "authorids": "~Valerii_Likhosherstov2;~Mostafa_Dehghani1;~Anurag_Arnab1;~Krzysztof_Marcin_Choromanski1;~Mario_Lucic1;~Yi_Tay1;~Adrian_Weller1", "gender": ";M;;;M;M;M", "homepage": "https://valerytyumen.github.io/;http://mostafadehghani.com/;;;http://lucic.ai;http://yitay.net;http://mlg.eng.cam.ac.uk/adrian/", "dblp": "232/4391.html;125/4062;;78/11411;155/1945;;73/8324", "google_scholar": "iiVVfxUAAAAJ;https://scholar.google.nl/citations?user=MiHOX3QAAAAJ;;;SzZRlcMAAAAJ;VBclY_cAAAAJ;https://scholar.google.co.uk/citations?user=Ek4hM10AAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Valerii_Likhosherstov2;~Mostafa_Dehghani1;~Anurag_Arnab1;~Krzysztof_Marcin_Choromanski1;~Mario_Lucic1;~Yi_Tay1;~Adrian_Weller1", "aff": ";Google DeepMind;;Google Brain Robotics & Columbia University;Google;Google;University of Cambridge", "aff_domain": ";google.com;;columbia.edu;deepmind.com;google.com;cam.ac.uk", "position": ";Research Scientist;;research scientist & adjunct assistant professor;Senior Staff Research Scientist;Research Scientist;Principal Researcher", "bibtex": "@misc{\nlikhosherstov2022polyvit,\ntitle={PolyViT: Co-training Vision Transformers on Images, Videos and Audio},\nauthor={Valerii Likhosherstov and Mostafa Dehghani and Anurag Arnab and Krzysztof Marcin Choromanski and Mario Lucic and Yi Tay and Adrian Weller},\nyear={2022},\nurl={https://openreview.net/forum?id=9r4_7GxTLnS}\n}", "github": "", "project": "", "reviewers": "uwSQ;opPh;Qh8y;UxMd", "site": "https://openreview.net/forum?id=9r4_7GxTLnS", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "2;3;2;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "104;157;295;77", "wc_summary_review": "41;49;107;50", 
"wc_main_review": "749;394;470;328", "wc_review": "894;600;872;455", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 158.25, 84.03384734736355 ], "wc_summary_review_avg": [ 61.75, 26.356925086208367 ], "wc_main_review_avg": [ 485.25, 160.35176176144745 ], "wc_review_avg": [ 705.25, 185.1585469266812 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2433441885724580400&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Google;University of Cambridge", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.cam.ac.uk", "aff_unique_abbr": "DeepMind;Cambridge", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Mountain View;Cambridge", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "9rKTy4oZAQt", "title": "A Risk-Sensitive Policy Gradient Method", "track": "main", "status": "Reject", "tldr": "", "abstract": "Standard deep reinforcement learning (DRL) agents aim to maximize expected reward, considering collected experiences equally in formulating a policy. This differs from human decision-making, where gains and losses are valued differently and outlying outcomes are given increased consideration. It also wastes an opportunity for the agent to modulate behavior based on distributional context. Several approaches to distributional DRL have been investigated, with one popular strategy being to evaluate the projected distribution of returns for possible actions. We propose a more direct approach, whereby the distribution of full-episode outcomes is optimized to maximize a chosen function of its cumulative distribution function (CDF). This technique allows for outcomes to be weighed based on relative quality, does not require modification of the reward function to modulate agent behavior, and may be used for both continuous and discrete action spaces. We show how to achieve an unbiased estimate of the policy gradient for a broad class of CDF-based objectives via sampling, subsequently incorporating variance reduction measures to facilitate effective on-policy learning. We use the resulting approach to train agents with different \u201crisk profiles\u201d in penalty-based formulations of six OpenAI Safety Gym environments, finding that moderate emphasis on improvement in training scenarios where the agent performs poorly generally improves agent behavior. 
We interpret and explore this observation, which leads to improved performance over the widely-used Proximal Policy Optimization algorithm in all environments tested.", "keywords": "deep reinforcement learning;policy gradient;risk-sensitive;ai safety", "primary_area": "", "supplementary_material": "/attachment/8697db10f6f7e862f09f51f0e38392adc532b5d1.zip", "author": "Jared Markowitz;Ryan Gardner;Ashley Llorens;Raman Arora;I-Jeng Wang", "authorids": "~Jared_Markowitz1;ryan.gardner@jhuapl.edu;allorens@microsoft.com;~Raman_Arora1;~I-Jeng_Wang1", "gender": "M;;;M;M", "homepage": ";;;http://www.cs.jhu.edu/~raman/Home.html;", "dblp": ";;;;", "google_scholar": "KHedducAAAAJ;;;Spe0xdkAAAAJ;vPoTuLQAAAAJ", "orcid": ";;;;", "linkedin": "jared-markowitz;;;;", "or_profile": "~Jared_Markowitz1;ryan.gardner@jhuapl.edu;allorens@microsoft.com;~Raman_Arora1;~I-Jeng_Wang1", "aff": "Johns Hopkins University Applied Physics Laboratory;;;Johns Hopkins University;Johns Hopkins University Applied Physics Laboratory", "aff_domain": "jhuapl.edu;;;jhu.edu;jhuapl.edu", "position": "Researcher;;;Associate Professor;Researcher", "bibtex": "@misc{\nmarkowitz2022a,\ntitle={A Risk-Sensitive Policy Gradient Method},\nauthor={Jared Markowitz and Ryan Gardner and Ashley Llorens and Raman Arora and I-Jeng Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=9rKTy4oZAQt}\n}", "github": "", "project": "", "reviewers": "ugec;apyr;VVvx;SaK7", "site": "https://openreview.net/forum?id=9rKTy4oZAQt", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;2;3", "correctness": "3;2;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "163;141;126;87", "wc_summary_review": "104;220;90;72", "wc_main_review": "391;708;232;278", "wc_review": "658;1069;448;437", "wc_reply_reviewers": "0;41;0;0", "wc_reply_authors": "510;1125;630;408", "reply_reviewers": "0;1;0;0", "reply_authors": "1;4;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 129.25, 27.716195626384224 ], "wc_summary_review_avg": [ 121.5, 57.98922313671739 ], "wc_main_review_avg": [ 402.25, 185.7637949117104 ], "wc_review_avg": [ 653.0, 255.81340856178747 ], "wc_reply_reviewers_avg": [ 10.25, 17.75352077758099 ], "wc_reply_authors_avg": [ 668.25, 275.16211130895186 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Applied Physics Laboratory", "aff_unique_url": "https://www.jhuapl.edu", "aff_unique_abbr": "JHU APL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "9rr7pFeIEuq", "title": "A general sample complexity analysis of vanilla policy gradient", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We adapt recent tools developed for the analysis of Stochastic Gradient Descent (SGD) in non-convex optimization to obtain convergence guarantees and sample complexities for the vanilla policy gradient (PG) -- REINFORCE and GPOMDP. 
Our only assumptions are that the expected return is smooth w.r.t. the policy parameters and that the second moment of its gradient satisfies a certain ABC assumption. The ABC assumption allows for the second moment of the gradient to be bounded by $A\\geq 0$ times the suboptimality gap, $B \\geq 0$ times the norm of the full batch gradient and an additive constant $C \\geq 0$, or any combination of the aforementioned. We show that the ABC assumption is more general than the commonly used assumptions on the policy space to prove convergence to a stationary point. We provide a single convergence theorem under the ABC assumption, and show that, despite the generality of the ABC assumption, we recover the $\\widetilde{\\mathcal{O}}(\\epsilon^{-4})$ sample complexity of PG. Our convergence theorem also affords greater flexibility in the choice of hyperparameters such as the step size and places no restriction on the batch size $m$. Even the single trajectory case (i.e., $m=1$) fits within our analysis. We believe that the generality of the ABC assumption may provide theoretical guarantees for PG to a much broader range of problems that have not been previously considered.", "keywords": "Reinforcement learning;policy optimization;policy gradient;sample complexity", "primary_area": "", "supplementary_material": "", "author": "Rui Yuan;Robert M. Gower;Alessandro Lazaric", "authorids": "~Rui_Yuan1;~Robert_M._Gower1;~Alessandro_Lazaric2", "gender": "M;M;M", "homepage": "https://rui-yuan91.github.io/;https://gowerrobert.github.io/;", "dblp": ";143/0056;36/321", "google_scholar": "4QZgrj0AAAAJ;okKw87MAAAAJ;6JZ3R6wAAAAJ", "orcid": "0000-0002-1768-9639;;", "linkedin": "rui-yuan-phd-55135537/;;", "or_profile": "~Rui_Yuan1;~Robert_M._Gower1;~Alessandro_Lazaric2", "aff": "Meta;Flatiron Institute;Meta Facebook", "aff_domain": "fb.com;simonsfoundation.org;fb.com", "position": "PhD student;Researcher;Research Scientist", "bibtex": "@misc{\nyuan2022a,\ntitle={A general sample complexity analysis of vanilla policy gradient},\nauthor={Rui Yuan and Robert M.
Gower and Alessandro Lazaric},\nyear={2022},\nurl={https://openreview.net/forum?id=9rr7pFeIEuq}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=9rr7pFeIEuq", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14382420974325591554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;0", "aff_unique_norm": "Meta;Flatiron Institute", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://flatironinstitute.org", "aff_unique_abbr": "Meta;Flatiron", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "9u5E8AFudRx", "title": "Help Me Explore: Minimal Social Interventions for Graph-Based Autotelic Agents", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the quest for autonomous agents learning open-ended repertoires of skills, most works take a Piagetian perspective: learning trajectories are the results of interactions between developmental agents and their physical environment. The Vygotskian perspective, on the other hand, emphasizes the centrality of the socio-cultural environment: higher cognitive functions emerge from transmissions of socio-cultural processes internalized by the agent. This paper acknowledges these two perspectives and presents GANGSTR, a hybrid agent engaging in both individual and social goal-directed exploration. In a 5-block manipulation domain, GANGSTR discovers and learns to master tens of thousands of configurations. In individual phases, the agent engages in autotelic learning; it generates, pursues and makes progress towards its own goals. To this end, it builds a graph to represent the network of discovered configurations and to navigate between them. In social phases, a simulated social partner suggests goal configurations at the frontier of the agent\u2019s current capabilities. This paper makes two contributions: 1) a minimal social interaction protocol called Help Me Explore (HME); 2) GANGSTR, a graph-based autotelic agent. As this paper shows, coupling individual and social exploration enables the GANGSTR agent to discover and master the most complex configurations (e.g.
stacks of 5 blocks) with only minimal intervention.", "keywords": "Deep reinforcement learning;intrinsic motivations;autonomous learning;social learning", "primary_area": "", "supplementary_material": "", "author": "Ahmed Akakzia;Olivier Serris;Olivier Sigaud;C\u00e9dric Colas", "authorids": "~Ahmed_Akakzia1;~Olivier_Serris1;~Olivier_Sigaud1;~C\u00e9dric_Colas1", "gender": "M;;M;M", "homepage": ";;http://people.isir.upmc.fr/sigaud;https://cedriccolas.com", "dblp": ";;50/5522;215/3872", "google_scholar": "U2CTuQUAAAAJ;;https://scholar.google.fr/citations?user=elLfDv0AAAAJ;https://scholar.google.fr/citations?user=VBz8gZ4AAAAJ", "orcid": ";;0000-0002-8544-0229;0000-0003-0212-427X", "linkedin": ";https://fr.linkedin.com/in/olivier-serris-4575631b9;;", "or_profile": "~Ahmed_Akakzia1;~Olivier_Serris1;~Olivier_Sigaud1;~C\u00e9dric_Colas1", "aff": "ISIR, UMR 7222;;Sorbonne Universit\u00e9;Massachusetts Institute of Technology", "aff_domain": "sorbonne-universite.fr;;upmc.fr;mit.edu", "position": "PhD student;;Full Professor;Postdoc", "bibtex": "@misc{\nakakzia2022help,\ntitle={Help Me Explore: Minimal Social Interventions for Graph-Based Autotelic Agents},\nauthor={Ahmed Akakzia and Olivier Serris and Olivier Sigaud and C{\\'e}dric Colas},\nyear={2022},\nurl={https://openreview.net/forum?id=9u5E8AFudRx}\n}", "github": "", "project": "", "reviewers": "a8j4;Pnco;SdCn", "site": "https://openreview.net/forum?id=9u5E8AFudRx", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;3;3", "correctness": "2;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "60;49;79", "wc_summary_review": "51;24;20", "wc_main_review": "418;98;203", "wc_review": "529;171;302", "wc_reply_reviewers": "108;0;0", "wc_reply_authors": "1459;926;469", "reply_reviewers": "1;0;0", "reply_authors": "3;2;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 62.666666666666664, 12.39175353029407 ], "wc_summary_review_avg": [ 31.666666666666668, 13.767917618708921 ], "wc_main_review_avg": [ 239.66666666666666, 133.18742016004697 ], "wc_review_avg": [ 334.0, 147.8941062607522 ], "wc_reply_reviewers_avg": [ 36.0, 50.91168824543142 ], "wc_reply_authors_avg": [ 951.3333333333334, 404.5625895814996 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9097552172937161861&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Institut des Sciences de l'Ing\u00e9nierie de Robotique;Sorbonne Universit\u00e9;Massachusetts Institute of Technology", "aff_unique_dep": "UMR 7222;;", "aff_unique_url": "https://www.isir.upmc.fr;https://www.sorbonne-universite.fr;https://web.mit.edu", "aff_unique_abbr": "ISIR;Sorbonne U;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "France;United States" }, { "id": "9vsRT9mc7U", "title": "Generative Adversarial Training for Neural Combinatorial Optimization Models", "track": 
"main", "status": "Reject", "tldr": "", "abstract": "Recent studies show that deep neural networks can be trained to learn good heuristics for various Combinatorial Optimization Problems (COPs). However, it remains a great challenge for the trained deep optimization models to generalize to distributions different from the training one. To address this issue, we propose a general framework, Generative Adversarial Neural Combinatorial Optimization (GANCO) which is equipped with another deep model to generate training instances for the optimization model, so as to improve its generalization ability. The two models are trained alternatively in an adversarial way, where the generation model is trained by reinforcement learning to find instance distributions hard for the optimization model. We apply the GANCO framework to two recent deep combinatorial optimization models, i.e., Attention Model (AM) and Policy Optimization with Multiple Optima (POMO). Extensive experiments on various problems such as Traveling Salesman Problem, Capacitated Vehicle Routing Problem, and 0-1 Knapsack Problem show that GANCO can significantly improve the generalization ability of optimization models on various instance distributions, with little sacrifice of performance on the original training distribution.", "keywords": "Deep Learning;Combinatorial Optimization Problem", "primary_area": "", "supplementary_material": "", "author": "Liang Xin;Wen Song;Zhiguang Cao;Jie Zhang", "authorids": "~Liang_Xin1;~Wen_Song1;~Zhiguang_Cao1;~Jie_Zhang9", "gender": "M;M;M;M", "homepage": "https://www.researchgate.net/profile/Liang-Xin;https://songwenas12.github.io/;https://zhiguangcaosg.github.io/;https://personal.ntu.edu.sg/zhangj/", "dblp": ";50/5489;178/8621;84/6889-2", "google_scholar": ";s8Nz-xoAAAAJ;https://scholar.google.com.sg/citations?user=2R-cOkYAAAAJ;IFV_RdMAAAAJ", "orcid": ";0000-0001-7624-1861;0000-0002-4499-759X;", "linkedin": ";;;", "or_profile": "~Liang_Xin1;~Wen_Song1;~Zhiguang_Cao1;~Jie_Zhang9", "aff": "Nanyang Technological University;Shandong University;Singapore Institute of Manufacturing Technology, A*STAR;Nanyang Technological University", "aff_domain": "ntu.edu.sg;sdu.edu.cn;simtech.a-star.edu.sg;ntu.edu.sg", "position": "PhD student;Associate Professor;Scientist;Full Professor", "bibtex": "@misc{\nxin2022generative,\ntitle={Generative Adversarial Training for Neural Combinatorial Optimization Models},\nauthor={Liang Xin and Wen Song and Zhiguang Cao and Jie Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=9vsRT9mc7U}\n}", "github": "", "project": "", "reviewers": "mumN;tjCH;N945;sEuD", "site": "https://openreview.net/forum?id=9vsRT9mc7U", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;3;4;4", "correctness": "2;2;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "62;132;67;117", "wc_summary_review": "58;73;83;225", "wc_main_review": "683;288;157;890", "wc_review": "803;493;307;1232", "wc_reply_reviewers": "628;0;32;207", "wc_reply_authors": "2264;304;331;1587", "reply_reviewers": "1;0;1;1", "reply_authors": "4;1;2;3", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 94.5, 30.516389039334257 ], "wc_summary_review_avg": [ 109.75, 67.13186650168458 ], "wc_main_review_avg": [ 504.5, 295.00042372850925 ], "wc_review_avg": [ 708.75, 350.22305392420986 
], "wc_reply_reviewers_avg": [ 216.75, 250.16731900869866 ], "wc_reply_authors_avg": [ 1121.5, 838.9268442480547 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10271380620761897748&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Nanyang Technological University;Shandong University;Singapore Institute of Manufacturing Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;http://www.sdu.edu.cn;https://www.simtech.a-star.edu.sg", "aff_unique_abbr": "NTU;SDU;SIMTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Singapore;China" }, { "title": "Differentiable DAG Sampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7113", "id": "9wOQOgNe-w", "poster": "", "openreview": "https://openreview.net/forum?id=9wOQOgNe-w", "slides": "https://iclr.cc/virtual/2022/poster/7113", "video": "https://iclr.cc/virtual/2022/poster/7113", "author_site": "Bertrand Charpentier, Simon Kibler, Stephan G\u00fcnnemann", "tldr": "", "abstract": "We propose a new differentiable probabilistic model over DAGs (DP-DAG). DP-DAG allows fast and differentiable DAG sampling suited to continuous optimization. To this end, DP-DAG samples a DAG by successively (1) sampling a linear ordering of the node and (2) sampling edges consistent with the sampled linear ordering. We further propose VI-DP-DAG, a new method for DAG learning from observational data which combines DP-DAG with variational inference. Hence,VI-DP-DAG approximates the posterior probability over DAG edges given the observed data. VI-DP-DAG is guaranteed to output a valid DAG at any time during training and does not require any complex augmented Lagrangian optimization scheme in contrast to existing differentiable DAG learning approaches. In our extensive experiments, we compare VI-DP-DAG to other differentiable DAG learning baselines on synthetic and real datasets. 
VI-DP-DAG significantly improves DAG structure and causal mechanism learning while training faster than competitors.", "keywords": "DAG;Differentiable;Sampling;Probabilistic model", "primary_area": "", "supplementary_material": "", "author": "Bertrand Charpentier;Simon Kibler;Stephan G\u00fcnnemann", "authorids": "~Bertrand_Charpentier2;~Simon_Kibler1;~Stephan_G\u00fcnnemann1", "gender": ";M;M", "homepage": "https://sharpenb.github.io/;https://www.linkedin.com/in/simon-kibler-0904bb109/;http://www.daml.in.tum.de", "dblp": "222/1875;;43/3011", "google_scholar": "0rqI-ycAAAAJ;;", "orcid": ";;", "linkedin": "bertrand-charpentier-76995ab6/;;", "or_profile": "~Bertrand_Charpentier2;~Simon_Kibler1;~Stephan_G\u00fcnnemann1", "aff": "Technical University Munich;;Technical University Munich", "aff_domain": "tum.de;;tum.de", "position": "PhD student;;Professor", "bibtex": "@inproceedings{\ncharpentier2022differentiable,\ntitle={Differentiable {DAG} Sampling},\nauthor={Bertrand Charpentier and Simon Kibler and Stephan G{\\\"u}nnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9wOQOgNe-w}\n}", "github": "", "project": "", "reviewers": "oBzg;PQoz;cRyc", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;4;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "36;91;106", "wc_summary_review": "36;78;29", "wc_main_review": "396;161;293", "wc_review": "468;330;428", "wc_reply_reviewers": "88;89;0", "wc_reply_authors": "827;933;594", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 77.66666666666667, 30.09245014211298 ], "wc_summary_review_avg": [ 47.666666666666664, 21.63844315615664 ], "wc_main_review_avg": [ 283.3333333333333, 96.18154131063241 ], "wc_review_avg": [ 408.6666666666667, 57.97317387282577 ], "wc_reply_reviewers_avg": [ 59.0, 41.72129751897305 ], "wc_reply_authors_avg": [ 784.6666666666666, 141.59645318847348 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10667986307237653289&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=9wOQOgNe-w", "email": "tum.de;;tum.de", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Is High Variance Unavoidable in RL? 
A Case Study in Continuous Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6507", "id": "9xhgmsNVHu", "poster": "", "openreview": "https://openreview.net/forum?id=9xhgmsNVHu", "slides": "https://iclr.cc/virtual/2022/poster/6507", "video": "https://iclr.cc/virtual/2022/poster/6507", "author_site": "Johan Bjorck, Carla Gomes, Kilian Weinberger", "tldr": "", "abstract": "Reinforcement learning (RL) experiments have notoriously high variance, and minor details can have disproportionately large effects on measured outcomes. This is problematic for creating reproducible research and also serves as an obstacle when applying RL to sensitive real-world applications. In this paper, we investigate causes for this perceived instability. To allow for an in-depth analysis, we focus on a specifically popular setup with high variance -- continuous control from pixels with an actor-critic agent. In this setting, we demonstrate that poor outlier runs which completely fail to learn are an important source of variance, but that weight initialization and initial exploration are not at fault. We show that one cause for these outliers is unstable network parametrization which leads to saturating nonlinearities. We investigate several fixes to this issue and find that simply normalizing penultimate features is surprisingly effective. For sparse tasks, we also find that partially disabling clipped double Q-learning decreases variance. By combining fixes we significantly decrease variances, lowering the average standard deviation across 21 tasks by a factor >3 for a state-of-the-art agent. This demonstrates that the perceived variance is not necessarily inherent to RL. Instead, it may be addressed via simple modifications and we argue that developing low-variance agents is an important goal for the RL community.", "keywords": "reinforcement learning;continuous control", "primary_area": "", "supplementary_material": "", "author": "Johan Bjorck;Carla P Gomes;Kilian Q Weinberger", "authorids": "~Johan_Bjorck2;~Carla_P_Gomes1;~Kilian_Q_Weinberger1", "gender": "M;;M", "homepage": "https://nilsjohanbjorck.github.io/;;http://www.cs.cornell.edu/~kilian/", "dblp": "188/6399;;88/4801", "google_scholar": "https://scholar.google.com/citations?hl=en;;jsxk8vsAAAAJ", "orcid": ";;0009-0008-9313-7239", "linkedin": ";;", "or_profile": "~Johan_Bjorck2;~Carla_P_Gomes1;~Kilian_Q_Weinberger1", "aff": "Microsoft;;ASAPP Inc.", "aff_domain": "microsoft.com;;asapp.com", "position": "Researcher;;Principal Researcher", "bibtex": "@inproceedings{\nbjorck2022is,\ntitle={Is High Variance Unavoidable in {RL}? 
A Case Study in Continuous Control},\nauthor={Johan Bjorck and Carla P Gomes and Kilian Q Weinberger},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=9xhgmsNVHu}\n}", "github": "", "project": "", "reviewers": "J8Mk;ti7w;N91S;cHfn", "pdf_size": 0, "recommendation": "6;6;6;10", "confidence": "3;3;4;4", "correctness": "4;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "4;2;3;3", "wc_summary_paper": "51;65;53;136", "wc_summary_review": "23;147;56;175", "wc_main_review": "232;811;1062;205", "wc_review": "306;1023;1171;516", "wc_reply_reviewers": "24;717;0;277", "wc_reply_authors": "308;1969;770;1841", "reply_reviewers": "1;2;0;4", "reply_authors": "1;4;1;4", "recommendation_avg": [ 7.0, 1.7320508075688772 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 76.25, 34.90970495435331 ], "wc_summary_review_avg": [ 100.25, 62.647326359550256 ], "wc_main_review_avg": [ 577.5, 369.92870934816614 ], "wc_review_avg": [ 754.0, 354.8231954086429 ], "wc_reply_reviewers_avg": [ 254.5, 288.23297868217645 ], "wc_reply_authors_avg": [ 1222.0, 703.7169175172642 ], "reply_reviewers_avg": [ 1.75, 1.479019945774904 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1950919355997491562&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=9xhgmsNVHu", "email": "microsoft.com;;asapp.com", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;ASAPP Inc.", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.asapp.com", "aff_unique_abbr": "Microsoft;ASAPP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "9zcjXdavnX", "title": "Sampling from Discrete Energy-Based Models with Quality/Efficiency Trade-offs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Energy-Based Models (EBMs) allow for extremely flexible specifications of probability distributions. However, they do not provide a mechanism for obtaining exact samples from these distributions. Monte Carlo techniques can aid us in obtaining samples if some proposal distribution that we can easily sample from is available. For instance, rejection sampling can provide exact samples but is often difficult or impossible to apply due to the need to find a proposal distribution that upper-bounds the target distribution everywhere. Approximate Markov chain Monte Carlo sampling techniques like Metropolis-Hastings are usually easier to design, exploiting a local proposal distribution that performs local edits on an evolving sample. However, these techniques can be inefficient due to the local nature of the proposal distribution and do not provide an estimate of the quality of their samples. In this work, we propose a new approximate sampling technique, Quasi Rejection Sampling (QRS), that allows for a trade-off between sampling efficiency and sampling quality, while providing explicit convergence bounds and diagnostics. 
QRS capitalizes on the availability of high-quality global proposal distributions obtained from deep learning models. We demonstrate the effectiveness of QRS sampling for discrete EBMs over text for the tasks of controlled text generation with distributional constraints and paraphrase generation. We show that we can sample from such EBMs with arbitrary precision at the cost of sampling efficiency.", "keywords": "Monte Carlo;sampling;rejection sampling;Energy-Based Models;EBMs;controlled text generation;language models", "primary_area": "", "supplementary_material": "", "author": "Bryan Eikema;Germ\u00e1n Kruszewski;Hady Elsahar;Marc Dymetman", "authorids": "~Bryan_Eikema1;~Germ\u00e1n_Kruszewski1;~Hady_Elsahar2;~Marc_Dymetman1", "gender": "M;M;Unspecified;M", "homepage": "https://roxot.github.io/;https://germank.github.io;http://hadyelsahar.io;https://europe.naverlabs.com/people_user/marc-dymetman/", "dblp": "https://dblp.uni-trier.de/pid/223/9865;117/4112;144/6739;74/1221", "google_scholar": "https://scholar.google.nl/citations?user=LDgbm1IAAAAJ;uU3rQI8AAAAJ;SbcM6bsAAAAJ;bTXN9_0AAAAJ", "orcid": ";;;", "linkedin": ";;hadyelsahar/;", "or_profile": "~Bryan_Eikema1;~Germ\u00e1n_Kruszewski1;~Hady_Elsahar2;~Marc_Dymetman1", "aff": "University of Amsterdam;Naver Labs Europe;Naver Labs Europe;Naver Labs Europe", "aff_domain": "uva.nl;naverlabs.com;naverlabs.com;naverlabs.com", "position": "PhD student;Senior Scientist;Researcher;Principal Researcher", "bibtex": "@misc{\neikema2022sampling,\ntitle={Sampling from Discrete Energy-Based Models with Quality/Efficiency Trade-offs},\nauthor={Bryan Eikema and Germ{\\'a}n Kruszewski and Hady Elsahar and Marc Dymetman},\nyear={2022},\nurl={https://openreview.net/forum?id=9zcjXdavnX}\n}", "github": "", "project": "", "reviewers": "RQGP;rWrS;d6gm", "site": "https://openreview.net/forum?id=9zcjXdavnX", "pdf_size": 0, "recommendation": "1;5;6", "confidence": "5;3;4", "correctness": "3;2;4", "technical_novelty": "1;2;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "58;52;321", "wc_summary_review": "13;32;113", "wc_main_review": "262;224;666", "wc_review": "333;308;1100", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "662;126;115", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 2.160246899469287 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 143.66666666666666, 125.41752491400695 ], "wc_summary_review_avg": [ 52.666666666666664, 43.36152928832449 ], "wc_main_review_avg": [ 384.0, 200.00666655555926 ], "wc_review_avg": [ 580.3333333333334, 367.6015354822241 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 301.0, 255.30504630082552 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7559289460184544, "corr_recommendation_correctness": 0.1889822365046136, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9546349876448023394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Amsterdam;NAVER LABS", "aff_unique_dep": ";", "aff_unique_url": "https://www.uva.nl;https://labs.naver.com", "aff_unique_abbr": "UvA;NLE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", 
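For the Quasi Rejection Sampling (QRS) entry above, here is a minimal sketch of one plausible instantiation of the quality/efficiency trade-off: accept a proposal x with probability min(1, p(x) / (beta * q(x))), where p is the unnormalized EBM score and q a global proposal. Unlike exact rejection sampling, beta need not upper-bound p/q everywhere; raising it improves sample quality at the cost of acceptance rate. The acceptance rule and all names are our assumptions, not code from the paper.

```python
import numpy as np

def quasi_rejection_sample(log_p, sample_q, log_q, beta, n_draws, rng=None):
    """log_p: unnormalized log-score of the EBM target.
    sample_q / log_q: draw from and score the global proposal (e.g. an LM).
    beta: trade-off constant (larger -> higher quality, fewer acceptances)."""
    rng = rng if rng is not None else np.random.default_rng()
    accepted = []
    for _ in range(n_draws):
        x = sample_q()
        log_ratio = log_p(x) - log_q(x)          # log p(x) - log q(x)
        accept_prob = min(1.0, np.exp(log_ratio - np.log(beta)))
        if rng.random() < accept_prob:
            accepted.append(x)
    return accepted
```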
"aff_country_unique": "Netherlands;Unknown" }, { "title": "The Geometry of Memoryless Stochastic Policy Optimization in Infinite-Horizon POMDPs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7080", "id": "A05I5IvrdL-", "poster": "", "openreview": "https://openreview.net/forum?id=A05I5IvrdL-", "slides": "https://iclr.cc/virtual/2022/poster/7080", "video": "https://iclr.cc/virtual/2022/poster/7080", "author_site": "Johannes M\u00fcller, Guido Montufar", "tldr": "", "abstract": "We consider the problem of finding the best memoryless stochastic policy for an infinite-horizon partially observable Markov decision process (POMDP) with finite state and action spaces with respect to either the discounted or mean reward criterion. We show that the (discounted) state-action frequencies and the expected cumulative reward are rational functions of the policy, whereby the degree is determined by the degree of partial observability. We then describe the optimization problem as a linear optimization problem in the space of feasible state-action frequencies subject to polynomial constraints that we characterize explicitly. This allows us to address the combinatorial and geometric complexity of the optimization problem using recent tools from polynomial optimization. In particular, we demonstrate how the partial observability constraints can lead to multiple smooth and non-smooth local optimizers and we estimate the number of critical points.", "keywords": "POMDPs;Memoryless Policies;Critical points;State-action frequencies;Algebraic degree", "primary_area": "", "supplementary_material": "/attachment/b420d3d6f7b1e76b532b6293961b7b88578c5de1.zip", "author": "Johannes M\u00fcller;Guido Montufar", "authorids": "~Johannes_M\u00fcller1;~Guido_Montufar1", "gender": "M;M", "homepage": "http://www.math.ucla.edu/~montufar/;https://muellerjohannes.github.io/", "dblp": ";", "google_scholar": "https://scholar.google.de/citations?user=pDIuuVwAAAAJ;https://scholar.google.de/citations?user=Wfww-P8AAAAJ", "orcid": "0000-0002-0131-2669;0000-0001-8729-0466", "linkedin": ";", "or_profile": "~Guido_Montufar1;~Johannes_Christoph_M\u00fcller1", "aff": "UCLA;Max Planck Institute for Mathematics in the Sciences", "aff_domain": "math.ucla.edu;mis.mpg.de", "position": "Assistant Professor;PhD student", "bibtex": "@inproceedings{\nm{\\\"u}ller2022the,\ntitle={The Geometry of Memoryless Stochastic Policy Optimization in Infinite-Horizon {POMDP}s},\nauthor={Johannes M{\\\"u}ller and Guido Montufar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=A05I5IvrdL-}\n}", "github": "", "project": "", "reviewers": "ejas;JQge;4HAL;TytX", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "2;3;3;2", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;0;0;0", "wc_summary_paper": "49;89;121;46", "wc_summary_review": "57;140;41;36", "wc_main_review": "99;120;202;111", "wc_review": "205;349;364;193", "wc_reply_reviewers": "0;0;0;27", "wc_reply_authors": "581;1069;617;1008", "reply_reviewers": "0;0;0;1", "reply_authors": "1;2;1;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 0.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 76.25, 30.914195768287424 ], "wc_summary_review_avg": [ 68.5, 42.00297608503474 ], "wc_main_review_avg": [ 133.0, 40.527768258318886 ], 
"wc_review_avg": [ 277.75, 79.04231461691896 ], "wc_reply_reviewers_avg": [ 6.75, 11.691342951089922 ], "wc_reply_authors_avg": [ 818.75, 221.17230274155034 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2565752681531472620&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=A05I5IvrdL-", "email": "math.ucla.edu;mis.mpg.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Los Angeles;Max Planck Institute for Mathematics in the Sciences", "aff_unique_dep": ";Mathematics", "aff_unique_url": "https://www.ucla.edu;https://www.mis.mpg.de", "aff_unique_abbr": "UCLA;MPI MIS", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Germany" }, { "id": "A209HjoI2fq", "title": "POI-Transformers: POI Entity Matching through POI Embeddings by Incorporating Semantic and Geographic Information", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Point of Interest (POI) data is crucial to location-based applications and various user-oriented services. However, three problems are existing in POI entity matching. First, traditional approaches to general entity matching are designed without geographic location information, which ignores the geographic features when performing POI entity matching. Second, existing POI matching methods for feature design are heavily dependent on the experts\u2019 knowledge. Third, current deep learning-based entity matching approaches require a high computational complexity since all the potential POI entity pairs need input to the network. A general and robust POI embedding framework, the POI-Transformers, is initially proposed in this study to address these problems of POI entity matching. The POI-Transformers can generate semantically meaningful POI embeddings through aggregating the text attributes and geographic location, and minimize the inconsistency of a POI entity by measuring the distance between the newly generated POI embeddings. Moreover, the POI entities are matched by the similarity of POI embeddings instead of directly comparing the POI entities, which can greatly reduce the complexity of computation. The implementation of the POI-Transformers achieves a high F1 score of 95.8% on natural scenes data sets (from the Gaode Map and the Tencent Map) in POI entity matching and is comparable to the state-of-the-art (SOTA) entity matching methods of DeepER, DeepMatcher, and Ditto (in entity matching benchmark data set). Compared with the existing deep learning methods, our method reduces the effort for identifying one million pairs from about 20 hours to 228 seconds. 
These demonstrate that the proposed POI-Transformers framework significantly outstrips traditional methods both in accuracy and efficiency.", "keywords": "POI entity matching;POI entity embedding;transformer-based model;POI-Transformers", "primary_area": "", "supplementary_material": "", "author": "Jinbao Zhang;Changwang Zhang;Xiaojuan Liu;Xia Li;Weilin Liao;Penghua Liu;Yao Yao;Jihong Zhang", "authorids": "~Jinbao_Zhang1;changwzhang@tencent.com;liuxj58@mail2.sysu.edu.cn;lixia@geo.ecnu.edu.cn;liaoweilin@mail.sysu.edu.cn;liuphhhh@foxmail.com;yaoy@cug.edu.cn;jihongzhang@tencent.com", "gender": "M;;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Jinbao_Zhang1;changwzhang@tencent.com;liuxj58@mail2.sysu.edu.cn;lixia@geo.ecnu.edu.cn;liaoweilin@mail.sysu.edu.cn;liuphhhh@foxmail.com;yaoy@cug.edu.cn;jihongzhang@tencent.com", "aff": "SUN YAT-SEN UNIVERSITY;;;;;;;", "aff_domain": "sysu.edu.cn;;;;;;;", "position": "PhD student;;;;;;;", "bibtex": "@misc{\nzhang2022poitransformers,\ntitle={{POI}-Transformers: {POI} Entity Matching through {POI} Embeddings by Incorporating Semantic and Geographic Information},\nauthor={Jinbao Zhang and Changwang Zhang and Xiaojuan Liu and Xia Li and Weilin Liao and Penghua Liu and Yao Yao and Jihong Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=A209HjoI2fq}\n}", "github": "", "project": "", "reviewers": "8Kqf;3Eij;g3td;1Sbq", "site": "https://openreview.net/forum?id=A209HjoI2fq", "pdf_size": 0, "recommendation": "1;5;5;5", "confidence": "2;3;4;4", "correctness": "1;4;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "41;43;30;17", "wc_summary_review": "12;44;36;24", "wc_main_review": "92;232;214;115", "wc_review": "145;319;280;156", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.7320508075688772 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 32.75, 10.353139620424328 ], "wc_summary_review_avg": [ 29.0, 12.12435565298214 ], "wc_main_review_avg": [ 163.25, 60.63569493293534 ], "wc_review_avg": [ 225.0, 75.86501169841075 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12632931004778117061&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Sun Yat-sen University", "aff_unique_dep": "", "aff_unique_url": "http://www.sysu.edu.cn", "aff_unique_abbr": "SYSU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Task Relatedness-Based Generalization Bounds for Meta Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6485", "id": "A3HHaEdqAJL", "poster": "", "openreview": "https://openreview.net/forum?id=A3HHaEdqAJL", "slides": "https://iclr.cc/virtual/2022/poster/6485", "video": "https://iclr.cc/virtual/2022/poster/6485", "author_site": "Jiechao Guan, Zhiwu Lu", 
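For the POI-Transformers entry above, a toy sketch of matching by embedding similarity rather than pairwise classification, which is where the reported speedup comes from: each POI is embedded once and candidates are scored with a single dot-product sweep. The geographic encoding and fusion below are illustrative stand-ins; the actual model learns the embedding jointly with a Transformer.

```python
import numpy as np

def poi_embedding(text_vec, lon, lat):
    """Fuse a text embedding with a simple geographic encoding
    (a stand-in for the learned POI-Transformers embedding)."""
    geo = np.array([np.sin(np.radians(lat)), np.cos(np.radians(lat)),
                    np.sin(np.radians(lon)), np.cos(np.radians(lon))])
    v = np.concatenate([text_vec, geo])
    return v / np.linalg.norm(v)                 # unit-normalize for cosine sim

def match(query_emb, index_embs, threshold=0.9):
    """One dot-product sweep over the index instead of running a pair
    classifier on every candidate POI pair."""
    sims = index_embs @ query_emb                # cosine similarity
    best = int(np.argmax(sims))
    return best if sims[best] >= threshold else None
```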
"tldr": "", "abstract": "Supposing the $n$ training tasks and the new task are sampled from the same environment, traditional meta learning theory derives an error bound on the expected loss over the new task in terms of the empirical training loss, uniformly over the set of all hypothesis spaces. However, there is still little research on how the relatedness of these tasks can affect the full utilization of all $mn$ training data (with $m$ examples per task). In this paper, we propose to address this problem by defining a new notion of task relatedness according to the existence of the bijective transformation between two tasks. A novel generalization bound of $\\mathcal{O}(\\frac{1}{\\sqrt{mn}})$ for meta learning is thus derived by exploiting the proposed task relatedness. Moreover, when investigating a special branch of meta learning that involves representation learning with deep neural networks, we establish spectrally-normalized bounds for both classification and regression problems. Finally, we demonstrate that the relatedness requirement between two tasks is satisfied when the sample space possesses the completeness and separability properties, validating the rationality and applicability of our proposed task-relatedness measure.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiechao Guan;Zhiwu Lu", "authorids": "~Jiechao_Guan2;~Zhiwu_Lu1", "gender": "M;M", "homepage": "https://gsai.ruc.edu.cn/luzhiwu;", "dblp": "53/5234;228/8337", "google_scholar": "OUXS8doAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";", "linkedin": ";", "or_profile": "~Zhiwu_Lu1;~Jiechao_Guan1", "aff": "Renmin University of China;School of Information, Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "Full Professor;PhD student", "bibtex": "@inproceedings{\nguan2022task,\ntitle={Task Relatedness-Based Generalization Bounds for Meta Learning},\nauthor={Jiechao Guan and Zhiwu Lu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=A3HHaEdqAJL}\n}", "github": "", "project": "", "reviewers": "VEYq;bHHr;7ER7;fpV1", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;3;2;1", "correctness": "4;3;3;4", "technical_novelty": "4;3;4;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "123;72;91;104", "wc_summary_review": "37;59;23;50", "wc_main_review": "218;266;115;59", "wc_review": "378;397;229;213", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 2.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 97.5, 18.607794065928395 ], "wc_summary_review_avg": [ 42.25, 13.589977924926883 ], "wc_main_review_avg": [ 164.5, 81.76949308880421 ], "wc_review_avg": [ 304.25, 83.71193164656995 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2796030644023192828&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=A3HHaEdqAJL", "email": "ruc.edu.cn;ruc.edu.cn", "author_num": 2, "aff_unique_index": "0;0", 
"aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "A4-dkBuXbA", "title": "Deep convolutional recurrent neural network for short-interval EEG motor imagery classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, a high-performance short-interval motor imagery classifier is presented that has good potential for use in real-time EEG-based brain-computer interfaces (BCIs). A hybrid deep Convolutional Recurrent Neural Network with Temporal Attention (CRNN-TA) is described that achieves state-of-art performance in four-class classification (73% accuracy, 60% kappa, 3% higher than the winner of the BCI IV 2A competition). An adaptation of the guided grad-CAM method is proposed for decision visualization. A novel EEG data augmentation technique, shuffled-crossover, is introduced that leads to a 3% increase in classification accuracy (relative to a comparable baseline). Classification accuracies for different windows sizes and time intervals are evaluated. An attention mechanism is also proposed that could serve as a feedback loop during data capture for the rejection of bad trials (e.g., those in which participants were inattentive).", "keywords": "Attention;Brain-Computer Interface (BCI);Electroencephalography (EEG);Convolutional Neural Networks (CNN);Motor Imagery (MI);Recurrent Neural Networks (RNN);grad-CAM", "primary_area": "", "supplementary_material": "", "author": "Ahmed Bahaa Selim;Ian van der Linde", "authorids": "~Ahmed_Bahaa_Selim1;~Ian_van_der_Linde1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "nRSFXjsAAAAJ;", "orcid": ";", "linkedin": "ahmedbselim/;", "or_profile": "~Ahmed_Bahaa_Selim1;~Ian_van_der_Linde1", "aff": "Sensyne Health;", "aff_domain": "sensynehealth.com;", "position": "Researcher;", "bibtex": "@misc{\nselim2022deep,\ntitle={Deep convolutional recurrent neural network for short-interval {EEG} motor imagery classification},\nauthor={Ahmed Bahaa Selim and Ian van der Linde},\nyear={2022},\nurl={https://openreview.net/forum?id=A4-dkBuXbA}\n}", "github": "", "project": "", "reviewers": "tCsE;M3gg;uzaX", "site": "https://openreview.net/forum?id=A4-dkBuXbA", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "5;5;2", "correctness": "2;3;2", "technical_novelty": "1;2;2", "empirical_novelty": "1;0;2", "wc_summary_paper": "17;18;100", "wc_summary_review": "1;30;50", "wc_main_review": "165;239;101", "wc_review": "183;287;251", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 45.0, 38.89301565405628 ], "wc_summary_review_avg": [ 27.0, 20.11632835948615 ], "wc_main_review_avg": [ 168.33333333333334, 56.387547876774676 ], "wc_review_avg": [ 240.33333333333334, 43.12256434345661 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9999999999999999, 
"corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:I6dL_EZPdu0J:scholar.google.com/&scioq=Deep+convolutional+recurrent+neural+network+for+short-interval+EEG+motor+imagery+classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Sensyne Health", "aff_unique_dep": "", "aff_unique_url": "https://www.sensynehealth.com", "aff_unique_abbr": "", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "A89KIvRYooT", "title": "FoxInst: A Frustratingly Simple Baseline for Weakly Few-shot Instance Segmentation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose the first weakly-supervised few-shot instance segmentation task and a frustratingly simple but strong baseline model, FoxInst. Our work is distinguished from other approaches in that our method is trained with weak annotations, i.e., class and box annotations, during all phases, which leads to further data efficiency and practicality. Considering the challenging regime of our problem, we design the network to be an anchor-free architecture to avoid anchor box restriction, and train the network in a simple and stable way that first trains the whole network on the base classes, and then only fine-tunes the heads partially with few novel class data. To establish the foundation as a strong baseline, we carefully design evaluation setups by correcting the existing problems in the evaluation metric and test set, so that the effects of each component are well revealed. We show that FoxInst achieves comparable or even higher performance with the prior fully-supervised FSIS networks on COCO and PASCAL VOC datasets. We will release the code if accepted for reproduction.", "keywords": "few-shot learning;instance segmentation;weakly supervised learning", "primary_area": "", "supplementary_material": "", "author": "Dongmin Choi;Moon Ye-Bin;Junsik Kim;Tae-Hyun Oh", "authorids": "~Dongmin_Choi1;~Moon_Ye-Bin1;~Junsik_Kim1;~Tae-Hyun_Oh3", "gender": "M;F;M;M", "homepage": ";https://sites.google.com/g.postech.edu/moon-ye-bin/\ud648;https://sites.google.com/site/jskimcv/;https://ami.kaist.ac.kr", "dblp": ";299/7654;89/6937-1;119/1450", "google_scholar": "mVf5dHQAAAAJ;Nwq4vPAAAAAJ;https://scholar.google.co.kr/citations?user=p5tuyxwAAAAJ;dMCBjeIAAAAJ", "orcid": ";0000-0002-0390-6567;0000-0003-2555-5232;0000-0003-0468-1571", "linkedin": ";moon-ye-bin-451b5a245/;;tae-hyun-oh-at-mit/", "or_profile": "~Dongmin_Choi1;~Moon_Ye-Bin1;~Junsik_Kim1;~Tae-Hyun_Oh3", "aff": "Yonsei University;Pohang University of Science and Technology;Harvard University;POSTECH", "aff_domain": "yonsei.ac.kr;postech.edu;harvard.edu;postech.ac.kr", "position": "Undergrad student;MS student;Postdoctoral fellow;Assistant Professor", "bibtex": "@misc{\nchoi2022foxinst,\ntitle={FoxInst: A Frustratingly Simple Baseline for Weakly Few-shot Instance Segmentation},\nauthor={Dongmin Choi and Moon Ye-Bin and Junsik Kim and Tae-Hyun Oh},\nyear={2022},\nurl={https://openreview.net/forum?id=A89KIvRYooT}\n}", "github": "", "project": "", "reviewers": "RR32;2SQm;KYwy;zDFf", "site": "https://openreview.net/forum?id=A89KIvRYooT", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;4;3;5", "correctness": "3;2;3;2", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "104;66;137;36", "wc_summary_review": "39;44;39;31", "wc_main_review": "265;332;211;293", "wc_review": "408;442;387;360", 
"wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.75, 38.160024895170075 ], "wc_summary_review_avg": [ 38.25, 4.656984002549289 ], "wc_main_review_avg": [ 275.25, 44.070256409510485 ], "wc_review_avg": [ 399.25, 29.978117018918983 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12887517199413130045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Yonsei University;Pohang University of Science and Technology;Harvard University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.postech.ac.kr;https://www.harvard.edu", "aff_unique_abbr": "Yonsei;POSTECH;Harvard", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "South Korea;United States" }, { "id": "A9PXKSUYrJe", "title": "Robust Learning with Adaptive Sample Credibility Modeling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Training deep neural network (DNN) with noisy labels is practically challenging since inaccurate labels severely degrade the generalization ability of DNN. Previous efforts tend to handle part or full data in a unified denoising flow to mitigate the noisy label problem, while they lack the consideration of intrinsic difference among difficulties of various noisy samples. In this paper, a novel and adaptive end-to-end robust learning method, called CREMA, is proposed. The insight behind is that the credibility of a training sample can be estimated by the joint distribution of its data-label pair, thus to roughly separate clean and noisy samples from original samples, which will be processed with different denoising process in a divide-and-conquer manner. For the clean set, we deliberately design a memory-based modulation scheme to dynamically adjust the contribution of each sample in terms of its historical credibility sequence during training, thus to alleviate the effect from potential hard noisy samples in clean set. Meanwhile, for those samples categorized into noisy set, we try to correct their labels in a selective manner to maximize data utilization and further boost performance. 
Extensive experiments on mainstream benchmarks, including synthetic (noisy versions of MNIST, CIFAR-10 and CIFAR-100) and real-world (Clothing1M and Animal-10N) noisy datasets demonstrate superiority of the proposed method.", "keywords": "robust learning;label noise;divide-and-conquer", "primary_area": "", "supplementary_material": "", "author": "Boshen Zhang;Yuxi Li;Yuanpeng Tu;Yabiao Wang;Yang Xiao;Cai Rong Zhao;Chengjie Wang", "authorids": "~Boshen_Zhang1;~Yuxi_Li2;~Yuanpeng_Tu1;~Yabiao_Wang1;~Yang_Xiao1;~Cai_Rong_Zhao1;~Chengjie_Wang1", "gender": "M;M;M;;M;M;M", "homepage": "https://openreview.net/profile?id=~Boshen_Zhang1;https://github.com/lyxok1;https://github.com/helloTongji;;;https://vill.tongji.edu.cn/;", "dblp": "145/3980;;;;181/1848-7;81/8614;", "google_scholar": "GOnKOMcAAAAJ;-24oYQoAAAAJ;https://scholar.google.com.sg/citations?user=a70oH2wAAAAJ;;NeKBuXEAAAAJ;z-XzWZcAAAAJ;fqte5H4AAAAJ", "orcid": "0000-0001-9204-5676;;;;;0000-0001-6745-9674;0000-0003-4216-8090", "linkedin": ";;;;;;", "or_profile": "~Boshen_Zhang1;~Yuxi_Li2;~Yuanpeng_Tu1;~Yabiao_Wang1;~Yang_Xiao1;~Cai_Rong_Zhao1;~Chengjie_Wang1", "aff": "Tencent;Tencent Youtu Lab;Tongji University;;Huazhong University of Science and Technology;Tongji University;Tencent YouTu Lab", "aff_domain": "tencent.com;tencent.com;tongji.edu.cn;;hust.edu.cn;tongji.edu.cn;tencent.com", "position": "Researcher;Researcher;MS student;;Associate Professor;Full Professor;Researcher", "bibtex": "@misc{\nzhang2022robust,\ntitle={Robust Learning with Adaptive Sample Credibility Modeling},\nauthor={Boshen Zhang and Yuxi Li and Yuanpeng Tu and Yabiao Wang and Yang Xiao and Cai Rong Zhao and Chengjie Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=A9PXKSUYrJe}\n}", "github": "", "project": "", "reviewers": "QR5V;MnUN;2NuP", "site": "https://openreview.net/forum?id=A9PXKSUYrJe", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;5;3", "correctness": "3;2;3", "technical_novelty": "3;2;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "99;56;110", "wc_summary_review": "66;39;58", "wc_main_review": "378;396;395", "wc_review": "543;491;563", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 88.33333333333333, 23.299976156401723 ], "wc_summary_review_avg": [ 54.333333333333336, 11.323525167642018 ], "wc_main_review_avg": [ 389.6666666666667, 8.259674462242577 ], "wc_review_avg": [ 532.3333333333334, 30.34615113797611 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:P54LKBxVUVQJ:scholar.google.com/&scioq=Robust+Learning+with+Adaptive+Sample+Credibility+Modeling&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;1;0", "aff_unique_norm": "Tencent;Tongji University;Huazhong University of Science and Technology", "aff_unique_dep": "Tencent Holdings Limited;;", "aff_unique_url": 
"https://www.tencent.com;https://www.tongji.edu.cn;http://www.hust.edu.cn", "aff_unique_abbr": "Tencent;Tongji;HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "AAHL45-O7tV", "title": "TransTCN: An Attention-based TCN Framework for Sequential Modeling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Among the sequential modeling issues, the ability to model the long-term dependency remains a significant issue yet to be overcome. Although, recurrent networks extract this information via a recurrent connection, the training step also considers the temporal connection, which reduces the efficiency. However, a temporal connection network (TCN) exploits the benefit of parallelization of convolution and consequently models the sequential information via causal-dilated connection of layers. Moreover, Transformer has exhibited great ability to capture long-term dependency. Thus, in this study, based on the TCN model, the attention blocks in Transformer were introduced to form a model called TransTCN. TransTCN models the sequential information considering attention modules. The model was evaluated across a wide range of the tasks in time series, which are commonly used to the benchmark of TCN and recurrent networks. To the best of our knowledge, TransTCN is the first framework to combine the attention in transformer with TCN to achieve a SOTA performance. The experimental results showed that the perplexity of the word-level prediction on PennTreebank reached only $1.33$ while TCN achieved $87.90$, which is $66$ times of the original TCN. In addition, nearly all loss and perplexity/bpc was improved on other datasets that are commonly used in TCN, except for several datasets wherein our approach maintained performance similar to the original TCN. 
Furthermore, the training process of TransTCN converges faster than that of TCN.", "keywords": "sequential modeling;multi-head attention;TCN;Transformer", "primary_area": "", "supplementary_material": "/attachment/41ee4d0ab80ce333ca48cde62bdde00ceb641276.zip", "author": "Yuan Chai;Liang He;Yang Zhao;Xueyan Li;Zhenxin Wang", "authorids": "~Yuan_Chai2;~Liang_He6;zhaoyang@xiangyun.com;leexy@jlu.edu.cn;wangzhenxin@jilinxiangyun.com", "gender": ";M;;;", "homepage": "https://zhihu.com/people/chai-yuan-96-45;https://orcid.org/my-orcid?orcid=0000-0002-9867-7508;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yuan_Chai2;~Liang_He6;zhaoyang@xiangyun.com;leexy@jlu.edu.cn;wangzhenxin@jilinxiangyun.com", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchai2022transtcn,\ntitle={Trans{TCN}: An Attention-based {TCN} Framework for Sequential Modeling},\nauthor={Yuan Chai and Liang He and Yang Zhao and Xueyan Li and Zhenxin Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=AAHL45-O7tV}\n}", "github": "", "project": "", "reviewers": "xoiC;T4or;aNrd", "site": "https://openreview.net/forum?id=AAHL45-O7tV", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "3;3;4", "correctness": "1;2;3", "technical_novelty": "1;3;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "212;36;43", "wc_summary_review": "15;27;29", "wc_main_review": "212;562;160", "wc_review": "439;625;232", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 97.0, 81.36747917114471 ], "wc_summary_review_avg": [ 23.666666666666668, 6.182412330330469 ], "wc_main_review_avg": [ 311.3333333333333, 178.51486088153993 ], "wc_review_avg": [ 432.0, 160.51791177311023 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IHgCjo1HlAwJ:scholar.google.com/&scioq=TransTCN:+An+Attention-based+TCN+Framework+for+Sequential+Modeling&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Conditional Contrastive Learning with Kernel", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6076", "id": "AAJLBoGt0XM", "poster": "", "openreview": "https://openreview.net/forum?id=AAJLBoGt0XM", "slides": "https://iclr.cc/virtual/2022/poster/6076", "video": "https://iclr.cc/virtual/2022/poster/6076", "author_site": "Yao-Hung Hubert Tsai, Tianqin Li, Martin Ma, Han Zhao, Kun Zhang, Louis-Philippe Morency, Ruslan Salakhutdinov", "tldr": "", "abstract": "Conditional contrastive learning frameworks consider the conditional sampling procedure that constructs positive or negative data pairs conditioned on specific variables. 
Fair contrastive learning constructs negative pairs, for example, from the same gender (conditioning on sensitive information), which in turn reduces undesirable information from the learned representations; weakly supervised contrastive learning constructs positive pairs with similar annotative attributes (conditioning on auxiliary information), which in turn are incorporated into the representations. Although conditional contrastive learning enables many applications, the conditional sampling procedure can be challenging if we cannot obtain sufficient data pairs for some values of the conditioning variable. This paper presents Conditional Contrastive Learning with Kernel (CCL-K) that converts existing conditional contrastive objectives into alternative forms that mitigate the insufficient data problem. Instead of sampling data according to the value of the conditioning variable, CCL-K uses the Kernel Conditional Embedding Operator that samples data from all available data and assigns weights to each sampled data given the kernel similarity between the values of the conditioning variable. We conduct experiments using weakly supervised, fair, and hard negatives contrastive learning, showing CCL-K outperforms state-of-the-art baselines.\n", "keywords": "Contrastive Learning;Conditional Sampling;Kernel methods", "primary_area": "", "supplementary_material": "", "author": "Yao-Hung Hubert Tsai;Tianqin Li;Martin Q. Ma;Han Zhao;Kun Zhang;Louis-Philippe Morency;Ruslan Salakhutdinov", "authorids": "~Yao-Hung_Hubert_Tsai1;~Tianqin_Li2;~Martin_Q._Ma1;~Han_Zhao1;~Kun_Zhang1;~Louis-Philippe_Morency1;~Ruslan_Salakhutdinov1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://github.com/Crazy-Jack;https://hanzhaoml.github.io/;http://www.andrew.cmu.edu/user/kunz1/;https://www.cs.cmu.edu/~morency/;http://www.cs.cmu.edu/~qianlim/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "154/3702;294/5434;03/3520-2;96/3115-1;31/739;251/5669.html;", "google_scholar": ";sQjEQEUAAAAJ;x942ipYAAAAJ;RGoypN4AAAAJ;https://scholar.google.com.tw/citations?user=APgaFK0AAAAJ;TFCtuaQAAAAJ;", "orcid": ";0000-0003-2567-8283;0000-0002-8579-1600;;0000-0001-6376-7696;;", "linkedin": ";tianqin-li-b16299170/;;;morency?challengeId=AQELGK_OvMa0vwAAAY72L-VV4X9hW8juuY80VHVeeSGHZ1PJHeeEa5LTFoeTmDGU0t1OL07MXJTYC9EAi6qgPDd2z9ztnbdFYA&submissionId=09a0ff34-04ac-c717-bef7-8c9c8811b463&challengeSource=AgFhxWkU3q7v4wAAAY72L-1xRE0eG-BnZUNE9e3eAG95pgOCZ9u1nxEg-1dK2Dw&challegeType=AgHMzV0lqKgEFwAAAY72L-11X6DHMd3V_A3Iur8XZeyYF2-oBzoufs8&memberId=AgH4yz7pZ_riCgAAAY72L-146jmR2pdr3dmhy2icxBtEQzQ&recognizeDevice=AgFDCNyrhKiFSAAAAY72L-16m7z2EH2t0ueWmMKjyk1_ZJAkfFVe;;", "or_profile": "~Yao-Hung_Hubert_Tsai1;~Tianqin_Li2;~Han_Zhao1;~Kun_Zhang1;~Louis-Philippe_Morency1;~Martin_Ma2;~Russ_Salakhutdinov1", "aff": "Apple;Carnegie Mellon University;University of Illinois, Urbana Champaign;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University", "aff_domain": "apple.com;andrew.cmu.edu;illinois.edu;cmu.edu;cmu.edu;cs.cmu.edu;cs.cmu.edu", "position": "Principal Researcher;PhD student;Assistant Professor;Associate Professor;Associate Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\ntsai2022conditional,\ntitle={Conditional Contrastive Learning with Kernel},\nauthor={Yao-Hung Hubert Tsai and Tianqin Li and Martin Q. 
Ma and Han Zhao and Kun Zhang and Louis-Philippe Morency and Ruslan Salakhutdinov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AAJLBoGt0XM}\n}", "github": "", "project": "", "reviewers": "F3Tg;imU8;oJk7;kZnA", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "3;4;2;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "55;92;86;30", "wc_summary_review": "48;95;79;41", "wc_main_review": "247;379;395;371", "wc_review": "350;566;560;442", "wc_reply_reviewers": "0;647;0;26", "wc_reply_authors": "629;1943;1106;803", "reply_reviewers": "0;2;0;1", "reply_authors": "1;5;2;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.75, 24.963723680572976 ], "wc_summary_review_avg": [ 65.75, 22.128883839904805 ], "wc_main_review_avg": [ 348.0, 58.94913061275798 ], "wc_review_avg": [ 479.5, 89.63676700997198 ], "wc_reply_reviewers_avg": [ 168.25, 276.61017244490483 ], "wc_reply_authors_avg": [ 1120.25, 504.75111441184555 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.48420012470625223, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14273339449801655874&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=AAJLBoGt0XM", "email": "apple.com;andrew.cmu.edu;illinois.edu;cmu.edu;cmu.edu;cs.cmu.edu;cs.cmu.edu", "author_num": 7, "aff_unique_index": "0;1;2;1;1;1;1", "aff_unique_norm": "Apple;Carnegie Mellon University;University of Illinois Urbana-Champaign", "aff_unique_dep": "Apple Inc.;;", "aff_unique_url": "https://www.apple.com;https://www.cmu.edu;https://illinois.edu", "aff_unique_abbr": "Apple;CMU;UIUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Urbana-Champaign;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "AAeMQz0x4nA", "title": "Learning Explicit Credit Assignment for Multi-agent Joint Q-learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-agent joint Q-learning based on Centralized Training with Decentralized Execution (CTDE) has become an effective technique for multi-agent cooperation. During centralized training, these methods are essentially addressing the multi-agent credit assignment problem. However, most of the existing methods \\emph{implicitly} learn the credit assignment just by ensuring that the joint Q-value satisfies the Bellman optimality equation. In contrast, we formulate an \\emph{explicit} credit assignment problem where each agent gives its suggestion about how to weight individual Q-values to explicitly maximize the joint Q-value, besides guaranteeing the Bellman optimality of the joint Q-value. In this way, we can conduct credit assignment among multiple agents and along the time horizon. Theoretically, we give a gradient ascent solution for this problem. Empirically, we instantiate the core idea with deep neural networks and propose Explicit Credit Assignment joint Q-learning (ECAQ) to facilitate multi-agent cooperation in complex problems. 
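For the ECAQ entry above, a toy gradient-ascent sketch of the explicit credit assignment idea, assuming the joint Q-value is a softmax-weighted mixture of individual Q-values and the weights are optimized to maximize it. The real method learns these weights with deep networks alongside the Bellman objective; this only illustrates the gradient-ascent core.

```python
import torch

def explicit_credit_weights(q_values, steps=50, lr=0.1):
    """Learn softmax weights over individual Q-values by gradient ascent
    on the weighted joint Q-value (illustrative only).

    q_values: tensor of shape (n_agents,) with individual Q-values.
    """
    logits = torch.zeros_like(q_values, requires_grad=True)
    opt = torch.optim.SGD([logits], lr=lr)
    for _ in range(steps):
        w = torch.softmax(logits, dim=0)     # credit assigned to each agent
        joint_q = (w * q_values).sum()       # joint Q as a weighted mixture
        opt.zero_grad()
        (-joint_q).backward()                # ascend on the joint Q-value
        opt.step()
    return torch.softmax(logits, dim=0).detach()

w = explicit_credit_weights(torch.tensor([1.0, 3.0, 2.0]))  # weight shifts to agent 1
```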
Extensive experiments justify that ECAQ achieves interpretable credit assignment and superior performance compared to several advanced baselines.", "keywords": "Mutli-agent Credit Assignment;Mutli-agent Joint Q-learning", "primary_area": "", "supplementary_material": "", "author": "Hangyu Mao;Jianye HAO;Dong Li;Jun Wang;Weixun Wang;Xiaotian Hao;Bin Wang;Kun Shao;Zhen Xiao;Wulong Liu", "authorids": "~Hangyu_Mao2;~Jianye_HAO1;~Dong_Li10;~Jun_Wang2;~Weixun_Wang1;~Xiaotian_Hao1;~Bin_Wang12;~Kun_Shao1;~Zhen_Xiao1;~Wulong_Liu1", "gender": ";M;M;M;;M;M;;;M", "homepage": ";http://www.icdai.org/jianye.html;;http://www0.cs.ucl.ac.uk/staff/jun.wang/;http://n.musk.ndu.com;;http://binwang.top;;;", "dblp": ";21/7664.html;47/4826-16;w/JunWang12;84/998;144/3359;13/1898-34;;39/3878;36/9257.html", "google_scholar": ";;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;;xgk9NPwAAAAJ;KWZG_YsAAAAJ;;;https://scholar.google.ca/citations?user=od00FfIAAAAJ", "orcid": ";0000-0002-0422-8235;;;;;0000-0002-0267-3749;;;", "linkedin": ";;;;;;;;;wulong-liu-28006155/", "or_profile": "~Hangyu_Mao2;~Jianye_HAO1;~Dong_Li10;~Jun_Wang2;~Weixun_Wang1;~Xiaotian_Hao1;~Bin_Wang12;~Kun_Shao1;~Zhen_Xiao1;~Wulong_Liu1", "aff": ";Tianjin University;Huawei Technologies Ltd.;University College London;Tianjin University;university of tianjin of china, Tianjin University;Huawei Noah's Ark Lab;;;Huawei Noah's Ark Lab", "aff_domain": ";tju.edu.cn;huawei.com;ucl.ac.uk;tju.edu.cn;tju.edu.cn;huawei.com;;;huawei.com", "position": ";Associate Professor;Principal Researcher;Professor;PhD student;PhD student;Senior Researcher;;;Researcher", "bibtex": "@misc{\nmao2022learning,\ntitle={Learning Explicit Credit Assignment for Multi-agent Joint Q-learning},\nauthor={Hangyu Mao and Jianye HAO and Dong Li and Jun Wang and Weixun Wang and Xiaotian Hao and Bin Wang and Kun Shao and Zhen Xiao and Wulong Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=AAeMQz0x4nA}\n}", "github": "", "project": "", "reviewers": "trcZ;wZYm;49nJ;56iQ", "site": "https://openreview.net/forum?id=AAeMQz0x4nA", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;3;4", "correctness": "1;2;3;4", "technical_novelty": "3;1;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "132;43;87;82", "wc_summary_review": "18;28;36;110", "wc_main_review": "498;158;619;219", "wc_review": "648;229;742;411", "wc_reply_reviewers": "1225;1592;28;64", "wc_reply_authors": "1494;2340;238;248", "reply_reviewers": "3;5;1;1", "reply_authors": "4;6;2;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.0, 31.551545128567 ], "wc_summary_review_avg": [ 48.0, 36.359317925395686 ], "wc_main_review_avg": [ 373.5, 191.10272106906274 ], "wc_review_avg": [ 507.5, 201.0006218895852 ], "wc_reply_reviewers_avg": [ 727.25, 693.6135000848816 ], "wc_reply_authors_avg": [ 1080.0, 888.8453183766003 ], "reply_reviewers_avg": [ 2.5, 1.6583123951777 ], "reply_authors_avg": [ 3.5, 1.6583123951777 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.19245008972987526, "corr_recommendation_correctness": 0.9467292624062574, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12273631906755010429&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0;1;1", "aff_unique_norm": "Tianjin 
University;Huawei;University College London", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "http://www.tju.edu.cn;https://www.huawei.com;https://www.ucl.ac.uk", "aff_unique_abbr": "TJU;Huawei;UCL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tianjin", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "China;United Kingdom" }, { "id": "AB2r0YKBSpD", "title": "Data Scaling Laws in NMT: The Effect of Noise and Architecture", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we empirically study the data scaling properties of neural machine translation (NMT). We first establish that the test loss of encoder-decoder transformer models scales as a power law in the number of training samples, with a dependence on the model size. We then systematically vary various aspects of the training setup to understand how they impact the data scaling laws. In particular, we change the (1) Architecture and task setup, to a Transformer-LSTM Hybrid as well as a Decoder-only transformer with language modeling loss (2) Noise level in the training distribution, starting with noisy data with filtering applied as well as clean data corrupted with synthetic iid noise. In all the above cases, we find that the data scaling exponents are minimally impacted, suggesting that marginally worse architectures or training data quality can be compensated for by adding more data. Lastly, we find that changing the training distribution to use back-translated data instead of parallel data, can impact the scaling exponent.", "keywords": "Scaling laws;Neural Machine Translation", "primary_area": "", "supplementary_material": "", "author": "Yamini Bansal;Behrooz Ghorbani;Ankush Garg;Biao Zhang;Colin Cherry;Maxim Krikun;Behnam Neyshabur;Orhan Firat", "authorids": "~Yamini_Bansal1;~Behrooz_Ghorbani1;~Ankush_Garg1;~Biao_Zhang2;~Colin_Cherry1;~Maxim_Krikun1;~Behnam_Neyshabur1;~Orhan_Firat1", "gender": "F;;M;M;M;;M;M", "homepage": ";;;;https://sites.google.com/site/colinacherry/;;https://www.neyshabur.net;", "dblp": ";162/0166;86/7221;https://dblp.uni-trier.de/pers/hd/z/Zhang_0002:Biao;99/6601;05/1775;131/9898;120/2225", "google_scholar": "uj1OljkAAAAJ;;https://scholar.google.com/citations?hl=en;gqPKjaIAAAAJ;TNr_OWMAAAAJ;;e1ucbCYAAAAJ;https://scholar.google.com.tr/citations?user=dLaR9lgAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;agbgarg/;;colincherry/;;;", "or_profile": "~Yamini_Bansal1;~Behrooz_Ghorbani1;~Ankush_Garg1;~Biao_Zhang2;~Colin_Cherry1;~Maxim_Krikun1;~Behnam_Neyshabur1;~Orhan_Firat1", "aff": "Harvard University;Google;Google;University of Edinburgh;Google;Google;Google;Google", "aff_domain": "harvard.edu;google.com;google.com;ed.ac.uk;google.com;google.com;google.com;google.com", "position": "PhD student;Researcher;research engineer;PhD student;Researcher;Software Engineer;Research Scientist;Research Scientist", "bibtex": "@misc{\nbansal2022data,\ntitle={Data Scaling Laws in {NMT}: The Effect of Noise and Architecture},\nauthor={Yamini Bansal and Behrooz Ghorbani and Ankush Garg and Biao Zhang and Colin Cherry and Maxim Krikun and Behnam Neyshabur and Orhan Firat},\nyear={2022},\nurl={https://openreview.net/forum?id=AB2r0YKBSpD}\n}", "github": "", "project": "", "reviewers": "Pwfc;fUfX;Nuk2;XWAu", "site": "https://openreview.net/forum?id=AB2r0YKBSpD", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "56;56;178;77", 
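For the data-scaling-laws entry above, a minimal sketch of how a power-law scaling exponent can be estimated: a law of the form L = C * D**(-alpha) is a straight line in log-log space, so the exponent is the slope of a linear fit. The data below is synthetic and the functional form is simplified; the paper fits richer forms with a model-size dependence.

```python
import numpy as np

# Hypothetical (dataset size, test loss) pairs following L = 8 * D**-0.3
# plus a little noise; purely illustrative.
D = np.array([1e5, 3e5, 1e6, 3e6, 1e7])
L = 8.0 * D ** -0.3 + np.random.default_rng(0).normal(0, 0.001, 5)

slope, intercept = np.polyfit(np.log(D), np.log(L), 1)
alpha = -slope                    # estimated data-scaling exponent
C = np.exp(intercept)
print(f"L(D) ~= {C:.2f} * D^(-{alpha:.3f})")
```

Comparing exponents fitted under different noise levels or architectures is exactly the kind of question the abstract investigates.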
"wc_summary_review": "14;104;44;28", "wc_main_review": "354;153;531;145", "wc_review": "424;313;753;250", "wc_reply_reviewers": "0;0;0;30", "wc_reply_authors": "508;414;347;144", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 91.75, 50.529075788104414 ], "wc_summary_review_avg": [ 47.5, 34.30378987808781 ], "wc_main_review_avg": [ 295.75, 159.5609209675101 ], "wc_review_avg": [ 435.0, 193.87753866809842 ], "wc_reply_reviewers_avg": [ 7.5, 12.99038105676658 ], "wc_reply_authors_avg": [ 353.25, 133.66258825864475 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13927416013286476769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;2;1;1;1;1", "aff_unique_norm": "Harvard University;Google;University of Edinburgh", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.harvard.edu;https://www.google.com;https://www.ed.ac.uk", "aff_unique_abbr": "Harvard;Google;Edinburgh", "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "ABv1puMlSgp", "title": "On Neurons Invariant to Sentence Structural Changes in Neural Machine Translation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "To gain insight into the role neurons play, we study the activation patterns corresponding to meaning-preserving paraphrases (e.g., active-passive). We compile a dataset of controlled syntactic paraphrases in English with their reference German translations and demonstrate our model-agnostic approach with the Transformer translation model. First, we identify neurons that correlate across paraphrases and dissect the observed correlation into possible confounds. Although lower-level components are found as the cause of similar activations, no sentence-level semantics or syntax are detected locally. Later, we manipulate neuron activations to influence translation towards a particular syntactic form. We find that a simple value shift is effective, and more so when many neurons are modified. These suggest that complex syntactic constructions are indeed encoded in the model. 
We conclude by discussing how to better manipulate it using the correlations we first obtained.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gal Patel;Leshem Choshen;Omri Abend", "authorids": "~Gal_Patel1;~Leshem_Choshen1;~Omri_Abend1", "gender": ";Not Specified;M", "homepage": ";https://ktilana.wixsite.com/leshem-choshen;http://www.cs.huji.ac.il/~oabend/", "dblp": ";218/5237;30/8159", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=BD_hRzYAAAAJ", "orcid": ";0000-0002-0085-6496;", "linkedin": "gal-patel/;leshemchoshen/;", "or_profile": "~Gal_Patel1;~Leshem_Choshen1;~Omri_Abend1", "aff": ";hebrew university jerusalem israel;Hebrew University of Jerusalem", "aff_domain": ";huji.ac.il;huji.ac.il", "position": ";PhD student;Associate Professor", "bibtex": "@misc{\npatel2022on,\ntitle={On Neurons Invariant to Sentence Structural Changes in Neural Machine Translation},\nauthor={Gal Patel and Leshem Choshen and Omri Abend},\nyear={2022},\nurl={https://openreview.net/forum?id=ABv1puMlSgp}\n}", "github": "", "project": "", "reviewers": "aCQ3;t6Go;Wued;MeZD;Z7ka", "site": "https://openreview.net/forum?id=ABv1puMlSgp", "pdf_size": 0, "recommendation": "1;3;5;5;5", "confidence": "4;3;3;4;4", "correctness": "2;3;3;3;3", "technical_novelty": "1;2;3;1;2", "empirical_novelty": "2;2;3;2;2", "wc_summary_paper": "63;75;81;92;183", "wc_summary_review": "26;19;21;64;18", "wc_main_review": "432;140;171;242;643", "wc_review": "521;234;273;398;844", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.8, 1.6 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 1.8, 0.7483314773547883 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 98.8, 43.129572221388884 ], "wc_summary_review_avg": [ 29.6, 17.41952927033334 ], "wc_main_review_avg": [ 325.6, 188.36199191981382 ], "wc_review_avg": [ 454.0, 219.4930522818433 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.10206207261596574, "corr_recommendation_correctness": 0.8750000000000001, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8130366420556632435&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "", "aff_unique_url": "https://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "AEa_UepnMDX", "title": "Resolving label uncertainty with implicit generative models", "track": "main", "status": "Reject", "tldr": "", "abstract": "In prediction problems, coarse and imprecise sources of input can provide rich information about labels, but are not readily used by discriminative learners. In this work, we propose a method for jointly inferring labels across a collection of data samples, where each sample consists of an observation and a prior belief about the label. 
By implicitly assuming the existence of a generative model for which a differentiable predictor is the posterior, we derive a training objective that allows learning under weak beliefs. This formulation unifies various machine learning settings: the weak beliefs can come in the form of noisy or incomplete labels, likelihoods given by a different prediction mechanism on auxiliary input, or common-sense priors reflecting knowledge about the structure of the problem at hand. We demonstrate the proposed algorithms on diverse problems: classification with negative training examples, learning from rankings, weakly and self-supervised aerial imagery segmentation, co-segmentation of video frames, and coarsely supervised text classification.", "keywords": "weakly supervised learning;generative models;image segmentation", "primary_area": "", "supplementary_material": "/attachment/af8af06e026c34a933685bddcfbc1bdd10068e91.zip", "author": "Esther Rolf;Nikolay Malkin;Alexandros Graikos;Ana Jojic;Caleb Robinson;Nebojsa Jojic", "authorids": "esther.g.rolf@gmail.com;~Nikolay_Malkin1;~Alexandros_Graikos1;jojica@uw.edu;~Caleb_Robinson1;~Nebojsa_Jojic1", "gender": ";;;;M;", "homepage": ";;https://alexgraikos.github.io/;;http://calebrob.com;www.research.microsoft.com/~jojic", "dblp": ";;269/5950;;194/7729;20/1944", "google_scholar": ";;1J7ZAUAAAAAJ;;cjYgLT0AAAAJ;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "esther.g.rolf@gmail.com;~Nikolay_Malkin1;~Alexandros_Graikos1;jojica@uw.edu;~Caleb_Robinson1;~Nebojsa_Jojic1", "aff": ";;Stony Brook University;;Microsoft;Microsoft Research", "aff_domain": ";;cs.stonybrook.edu;;microsoft.com; ", "position": ";;PhD student;;Principal Researcher;Researcher", "bibtex": "@misc{\nrolf2022resolving,\ntitle={Resolving label uncertainty with implicit generative models},\nauthor={Esther Rolf and Nikolay Malkin and Alexandros Graikos and Ana Jojic and Caleb Robinson and Nebojsa Jojic},\nyear={2022},\nurl={https://openreview.net/forum?id=AEa_UepnMDX}\n}", "github": "", "project": "", "reviewers": "5Mem;Da9G;HHeb;cjDc", "site": "https://openreview.net/forum?id=AEa_UepnMDX", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "44;108;73;56", "wc_summary_review": "71;43;42;24", "wc_main_review": "436;396;170;141", "wc_review": "551;547;285;221", "wc_reply_reviewers": "78;0;0;0", "wc_reply_authors": "701;796;409;290", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 70.25, 24.107830678018296 ], "wc_summary_review_avg": [ 45.0, 16.80773631397161 ], "wc_main_review_avg": [ 285.75, 131.4160853929229 ], "wc_review_avg": [ 401.0, 149.72641717479252 ], "wc_reply_reviewers_avg": [ 19.5, 33.77499074759311 ], "wc_reply_authors_avg": [ 549.0, 206.63615366145393 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6163264529445679111&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Stony Brook 
University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.stonybrook.edu;https://www.microsoft.com", "aff_unique_abbr": "SBU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "AFH3FnBksHT", "title": "Model Fusion of Heterogeneous Neural Networks via Cross-Layer Alignment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Layer-wise model fusion via optimal transport, named OTFusion, applies soft neuron association for unifying different pre-trained networks to save computational resources. While enjoying its success, OTFusion requires the input networks to have the same number of layers. To address this issue, we propose a novel model fusion framework, named CLAFusion, to fuse neural networks with a different number of layers, which we refer to as heterogeneous neural networks, via cross-layer alignment. The cross-layer alignment problem, which is an unbalanced assignment problem, can be solved efficiently using dynamic programming. Based on the cross-layer alignment, our framework balances the number of layers of neural networks before applying layer-wise model fusion. Our synthetic experiments indicate that the fused network from CLAFusion achieves a more favorable performance compared to the individual networks trained on heterogeneous data without the need for any retraining. With an extra finetuning process, it improves the accuracy of residual networks on the CIFAR10 dataset. Finally, we explore its application for model compression and knowledge distillation when applying to the teacher-student setting.", "keywords": "Model Fusion;Cross-layer Alignment;Knowledge Distillation;Model Compression;Model Transfer", "primary_area": "", "supplementary_material": "/attachment/8215ea67e2eb614c0ca7d2ac317492ff3aae1baf.zip", "author": "Dang Nguyen;Khai Nguyen;Nhat Ho;Dinh Phung;Hung Bui", "authorids": "~Dang_Nguyen2;~Khai_Nguyen1;~Nhat_Ho1;~Dinh_Phung2;~Hung_Bui1", "gender": "M;M;M;M;M", "homepage": "https://hsgser.github.io/;https://khainb.com;https://nhatptnk8912.github.io/;https://sites.google.com/site/buihhung/home;https://research.monash.edu/en/persons/dinh-phung", "dblp": ";120/4308;203/4479;;71/5859", "google_scholar": "https://scholar.google.co.jp/citations?user=WIqAtrcAAAAJ;im5fNaQAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ;mDLwSZAAAAAJ;https://scholar.google.com.au/citations?user=OtA9SwIAAAAJ", "orcid": ";;;;0000-0002-9977-8247", "linkedin": "dang-nguyen-50b7a7a0/;;nhat-pham-minh-ho-267b8164/;;https://linkedin.com/in/dinh-phung-6b537a6", "or_profile": "~Dang_Nguyen2;~Khai_Nguyen1;~Nhat_Ho1;~Hung_Bui1;~Dinh_Phung1", "aff": ";University of Texas, Austin;University of Texas, Austin;VinAI Research;Monash University", "aff_domain": ";utexas.edu;utexas.edu;vinai.io;monash.edu", "position": ";PhD student;Assistant Professor;Principal Researcher;Full Professor", "bibtex": "@misc{\nnguyen2022model,\ntitle={Model Fusion of Heterogeneous Neural Networks via Cross-Layer Alignment},\nauthor={Dang Nguyen and Khai Nguyen and Nhat Ho and Dinh Phung and Hung Bui},\nyear={2022},\nurl={https://openreview.net/forum?id=AFH3FnBksHT}\n}", "github": "", "project": "", "reviewers": "1twh;HQUA;pWLE;LJwg", "site": "https://openreview.net/forum?id=AFH3FnBksHT", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": 
"44;97;78;122", "wc_summary_review": "33;36;33;76", "wc_main_review": "325;216;185;602", "wc_review": "402;349;296;800", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "489;792;811;1081", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;3", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.25, 28.472574523565655 ], "wc_summary_review_avg": [ 44.5, 18.227726133558185 ], "wc_main_review_avg": [ 332.0, 164.32741706726847 ], "wc_review_avg": [ 461.75, 198.85217499439125 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 793.25, 209.57382350856702 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14301102199534018558&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Texas at Austin;VinAI Research;Monash University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utexas.edu;https://www.vinai.io/;https://www.monash.edu", "aff_unique_abbr": "UT Austin;VinAI;Monash", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "United States;Vietnam;Australia" }, { "title": "Prospect Pruning: Finding Trainable Weights at Initialization using Meta-Gradients", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6758", "id": "AIgn9uwfcD1", "poster": "", "openreview": "https://openreview.net/forum?id=AIgn9uwfcD1", "slides": "https://iclr.cc/virtual/2022/poster/6758", "video": "https://iclr.cc/virtual/2022/poster/6758", "author_site": "Milad Alizadeh, Shyam Tailor, Luisa Zintgraf, Joost van Amersfoort, Sebastian Farquhar, Nicholas Lane, Yarin Gal", "tldr": "", "abstract": "Pruning neural networks at initialization would enable us to find sparse models that retain the accuracy of the original network while consuming fewer computational resources for training and inference. However, current methods are insufficient to enable this optimization and lead to a large degradation in model performance. In this paper, we identify a fundamental limitation in the formulation of current methods, namely that their saliency criteria look at a single step at the start of training without taking into account the trainability of the network. While pruning iteratively and gradually has been shown to improve pruning performance, explicit consideration of the training stage that will immediately follow pruning has so far been absent from the computation of the saliency criterion. To overcome the short-sightedness of existing methods, we propose Prospect Pruning (ProsPr), which uses meta-gradients through the first few steps of optimization to determine which weights to prune. ProsPr combines an estimate of the higher-order effects of pruning on the loss and the optimization trajectory to identify the trainable sub-network. 
Our method achieves state-of-the-art pruning performance on a variety of vision classification tasks, with less data and in a single shot compared to existing pruning-at-initialization methods.", "keywords": "pruning;lottery ticket hypothesis;pruning at initialization", "primary_area": "", "supplementary_material": "", "author": "Milad Alizadeh;Shyam A. Tailor;Luisa M Zintgraf;Joost van Amersfoort;Sebastian Farquhar;Nicholas Donald Lane;Yarin Gal", "authorids": "~Milad_Alizadeh1;~Shyam_A._Tailor1;~Luisa_M_Zintgraf1;~Joost_van_Amersfoort1;~Sebastian_Farquhar1;~Nicholas_Donald_Lane1;~Yarin_Gal1", "gender": "M;F;M;;;M;M", "homepage": "https://mil.ad;;;https://sebastianfarquhar.com/;http://www.cs.ox.ac.uk/people/yarin.gal/website//;http://niclane.org;https://www.shyamt.com", "dblp": ";177/9360;;215/5432;67/9076;03/2663.html;256/9384", "google_scholar": "YmGyDhcAAAAJ;lEzcLFwAAAAJ;https://scholar.google.co.uk/citations?user=C0LaV8IAAAAJ;bvShhTEAAAAJ;https://scholar.google.co.uk/citations?user=SIayDoQAAAAJ;https://scholar.google.co.uk/citations?hl=en;aJVp0DsAAAAJ", "orcid": ";;;;;0000-0002-2728-8273;", "linkedin": "miladalizadeh/;;;;;niclane;", "or_profile": "~Milad_Alizadeh1;~Luisa_M_Zintgraf1;~Joost_van_Amersfoort1;~Sebastian_Farquhar1;~Yarin_Gal1;~Nic_Lane2;~Shyam_Anil_Tailor1", "aff": "University of Oxford;University of Oxford;Google DeepMind;University of Oxford;University of Oxford;Samsung;Computer Laboratory", "aff_domain": "oxford.ac.uk;ox.ac.uk;deepmind.com;ox.ac.uk;ox.ac.uk;samsung.com;cl.cam.ac.uk", "position": "PhD student;PhD student;Intern;PhD student;Associate Professor;Laboratory Director;PhD student", "bibtex": "@inproceedings{\nalizadeh2022prospect,\ntitle={Prospect Pruning: Finding Trainable Weights at Initialization using Meta-Gradients},\nauthor={Milad Alizadeh and Shyam A. 
Tailor and Luisa M Zintgraf and Joost van Amersfoort and Sebastian Farquhar and Nicholas Donald Lane and Yarin Gal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AIgn9uwfcD1}\n}", "github": "", "project": "", "reviewers": "9cLN;MHq7;fSju;D88v", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;4;4", "correctness": "4;3;3;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "77;35;74;35", "wc_summary_review": "28;26;88;4", "wc_main_review": "124;199;524;92", "wc_review": "229;260;686;131", "wc_reply_reviewers": "0;0;166;0", "wc_reply_authors": "697;814;1665;471", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;4;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.25, 20.27775875189366 ], "wc_summary_review_avg": [ 36.5, 31.188940347501386 ], "wc_main_review_avg": [ 234.75, 171.45462227656623 ], "wc_review_avg": [ 326.5, 212.948937541374 ], "wc_reply_reviewers_avg": [ 41.5, 71.88010851410841 ], "wc_reply_authors_avg": [ 911.75, 452.0284144829836 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8783006285808358460&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=AIgn9uwfcD1", "email": "oxford.ac.uk;ox.ac.uk;deepmind.com;ox.ac.uk;ox.ac.uk;samsung.com;cl.cam.ac.uk", "author_num": 7, "aff_unique_index": "0;0;1;0;0;2;3", "aff_unique_norm": "University of Oxford;Google;Samsung;University of Cambridge", "aff_unique_dep": ";Google DeepMind;Samsung;Computer Laboratory", "aff_unique_url": "https://www.ox.ac.uk;https://deepmind.com;https://www.samsung.com;https://www.cl.cam.ac.uk", "aff_unique_abbr": "Oxford;DeepMind;Samsung;CL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "United Kingdom;South Korea" }, { "title": "DEPTS: Deep Expansion Learning for Periodic Time Series Forecasting", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7105", "id": "AJAR-JgNw__", "poster": "", "openreview": "https://openreview.net/forum?id=AJAR-JgNw__", "slides": "https://iclr.cc/virtual/2022/poster/7105", "video": "https://iclr.cc/virtual/2022/poster/7105", "author_site": "Wei Fan, Shun Zheng, Xiaohan Yi, Wei Cao, Yanjie Fu, Jiang Bian, Tie-Yan Liu", "tldr": "", "abstract": "Periodic time series (PTS) forecasting plays a crucial role in a variety of industries to foster critical tasks, such as early warning, pre-planning, resource scheduling, etc. However, the complicated dependencies of the PTS signal on its inherent periodicity as well as the sophisticated composition of various periods hinder the performance of PTS forecasting. In this paper, we introduce a deep expansion learning framework, DEPTS, for PTS forecasting. DEPTS starts with a decoupled formulation by introducing the periodic state as a hidden variable, which stimulates us to make two dedicated modules to tackle the aforementioned two challenges. 
First, we develop an expansion module on top of residual learning to perform a layer-by-layer expansion of those complicated dependencies. Second, we introduce a periodicity module with a parameterized periodic function that holds sufficient capacity to capture diversified periods. Moreover, our two customized modules also have certain interpretable capabilities, such as attributing the forecasts to either local momenta or global periodicity and characterizing certain core periodic properties, e.g., amplitudes and frequencies. Extensive experiments on both synthetic data and real-world data demonstrate the effectiveness of DEPTS on handling PTS. In most cases, DEPTS achieves significant improvements over the best baseline. Specifically, the error reduction can even reach up to 20% for a few cases. All codes for this paper are publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Fan;Shun Zheng;Xiaohan Yi;Wei Cao;Yanjie Fu;Jiang Bian;Tie-Yan Liu", "authorids": "~Wei_Fan6;~Shun_Zheng1;xiaohan.yi@microsoft.com;~Wei_Cao1;~Yanjie_Fu2;~Jiang_Bian1;~Tie-Yan_Liu1", "gender": "M;M;;M;;M;M", "homepage": "https://weifan.site/;;;;;https://sites.google.com/view/jiangbian;http://member.acm.org/~tieyanliu", "dblp": "54/3488-10;179/2615.html;;54/6265;;09/851-2.html;l/TieYanLiu", "google_scholar": "cQ8zLJ4AAAAJ;21Q9To4AAAAJ;;;;pZBEnY8AAAAJ;Nh832fgAAAAJ", "orcid": "0000-0001-7656-445X;0009-0005-7355-7090;;;;0000-0002-9472-600X;0000-0002-0476-8020", "linkedin": ";;;;;jbian/;", "or_profile": "~Wei_Fan6;~Shun_Zheng1;xiaohan.yi@microsoft.com;~Wei_Cao1;~Yanjie_Fu2;~Jiang_Bian1;~Tie-Yan_Liu1", "aff": "University of Central Florida;Microsoft;;;;Microsoft;Microsoft", "aff_domain": "ucf.edu;microsoft.com;;;;microsoft.com;microsoft.com", "position": "PhD student;Senior Researcher;;;;Partner Research Manager;Distinguished Scientist", "bibtex": "@inproceedings{\nfan2022depts,\ntitle={{DEPTS}: Deep Expansion Learning for Periodic Time Series Forecasting},\nauthor={Wei Fan and Shun Zheng and Xiaohan Yi and Wei Cao and Yanjie Fu and Jiang Bian and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AJAR-JgNw__}\n}", "github": "", "project": "", "reviewers": "SZrb;qo99;xtYj;3Zt1", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;4;4", "correctness": "4;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "43;48;115;97", "wc_summary_review": "16;250;21;28", "wc_main_review": "145;82;253;484", "wc_review": "204;380;389;609", "wc_reply_reviewers": "0;0;0;12", "wc_reply_authors": "808;568;1032;723", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.75, 30.96267914764483 ], "wc_summary_review_avg": [ 78.75, 98.96306129056437 ], "wc_main_review_avg": [ 241.0, 153.0441112882165 ], "wc_review_avg": [ 395.5, 143.6462669198194 ], "wc_reply_reviewers_avg": [ 3.0, 5.196152422706632 ], "wc_reply_authors_avg": [ 782.75, 167.66838551140165 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, 
"gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17674123888632220585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=AJAR-JgNw__", "email": "ucf.edu;microsoft.com;;;;microsoft.com;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Central Florida;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.ucf.edu;https://www.microsoft.com", "aff_unique_abbr": "UCF;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "AJO2mBSTOHl", "title": "Analytically Tractable Bayesian Deep Q-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement learning (RL) has gained increasing interest since the demonstration it was able to reach human performance on video game benchmarks using deep Q-learning (DQN). The current consensus of DQN for training neural networks (NNs) on such complex environments is to rely on gradient-descent optimization (GD). This consensus ignores the uncertainty of the NN's parameters which is a key aspect for the selection of an optimal action given a state. Although alternative Bayesian deep learning methods exist, most of them still rely on GD and numerical approximations, and they typically do not scale on complex benchmarks such as the Atari game environment. In this paper, we present how we can adapt the temporal difference Q-learning framework to make it compatible with the tractable approximate Gaussian inference (TAGI) which allows estimating the posterior distribution of NN's parameters using a closed-form analytical method. Throughout the experiments with on- and off-policy reinforcement learning approaches, we demonstrate that TAGI can reach a performance comparable to backpropagation-trained networks while using only half the number of hyperparameters, and without relying on GD or numerical approximations. ", "keywords": "Bayesian Learning;Probabilistic Methods;Uncertainty Quantification;Reinforcement Learning;Deep Q-learning", "primary_area": "", "supplementary_material": "", "author": "Luong-Ha Nguyen;James-A. Goulet", "authorids": "~Luong-Ha_Nguyen1;~James-A._Goulet1", "gender": "M;", "homepage": "http://profs.polymtl.ca/jagoulet/Site/Goulet_web_page_LHNGUYEN.html;http://bayes.works", "dblp": ";", "google_scholar": ";https://scholar.google.ch/citations?user=yP9fGUkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Luong-Ha_Nguyen1;~James-A._Goulet1", "aff": ";Polytechnique Montreal", "aff_domain": ";polymtl.ca", "position": ";Associate Professor", "bibtex": "@misc{\nnguyen2022analytically,\ntitle={Analytically Tractable Bayesian Deep Q-Learning},\nauthor={Luong-Ha Nguyen and James-A. 
Goulet},\nyear={2022},\nurl={https://openreview.net/forum?id=AJO2mBSTOHl}\n}", "github": "", "project": "", "reviewers": "Aorz;H4dH;6wux;vNfB", "site": "https://openreview.net/forum?id=AJO2mBSTOHl", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;2;3;2", "correctness": "2;4;3;2", "technical_novelty": "3;2;3;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "66;24;46;53", "wc_summary_review": "89;35;63;46", "wc_main_review": "1545;396;434;302", "wc_review": "1700;455;543;401", "wc_reply_reviewers": "204;0;558;0", "wc_reply_authors": "1371;168;331;249", "reply_reviewers": "1;0;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 47.25, 15.22128443988877 ], "wc_summary_review_avg": [ 58.25, 20.363877332178173 ], "wc_main_review_avg": [ 669.25, 507.8923975607432 ], "wc_review_avg": [ 774.75, 536.5921985083272 ], "wc_reply_reviewers_avg": [ 190.5, 227.9358462374885 ], "wc_reply_authors_avg": [ 529.75, 489.10294161863305 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3526690336998439170&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Polytechnique Montreal", "aff_unique_dep": "", "aff_unique_url": "https://www.polymtl.ca", "aff_unique_abbr": "PolyMTL", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "AJg35fkqOPA", "title": "Text-Driven Image Manipulation via Semantic-Aware Knowledge Transfer", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semantic-level facial attribute transfer is a special task to edit facial attributes, where reference images are viewed as conditions to control the image editing. In order to achieve better performance, semantic-level facial attribute transfer needs to fulfil two requirements: (1) specific attributes extracted from the reference face should be precisely transferred to the target face; (2) irrelevant information should be completely retained after transferring. Some existing methods locate and modify local support regions of facial images, which are not effective when editing global attributes; other methods disentangle the latent code into different attribute-relevant parts, which may transfer redundant knowledge to target faces. In this paper, we first propose a novel text-driven directional latent mapping network with a semantic direction consistency (SDC) constraint to explore the latent semantic space for effective attribute editing, leveraging the semantic-aware knowledge of the Contrastive Language-Image Pre-training (CLIP) model as guidance. This latent space manipulation strategy is designed to disentangle the facial attributes, removing the redundant knowledge in the transfer process. On this basis, a novel attribute transfer method, named semantic directional decomposition network (SDD-Net), is proposed to achieve semantic-level facial attribute transfer by latent semantic direction decomposition, improving the interpretability and editability of our method. 
Extensive experiments on CelebA-HQ dataset show that our method achieves impressive performance over the state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1b598645b6bc0b70d2b0eee5f15bfdf7d260d2d2.zip", "author": "Ziqi Zhang;Cheng Deng;Kun Wei;Xu Yang", "authorids": "~Ziqi_Zhang4;~Cheng_Deng2;~Kun_Wei1;~Xu_Yang6", "gender": ";M;M;M", "homepage": "https://github.com/ZZZ9116;https://scholar.google.com.hk/citations?user=fju0aWQAAAAJ&hl=zh-CN;https://xdxuyang.github.io/;http://see.xidian.edu.cn/faculty/chdeng/", "dblp": ";;63/1534-19;", "google_scholar": "https://scholar.google.com.hk/citations?user=JITPPf8AAAAJ;https://scholar.google.com.hk/citations?user=fju0aWQAAAAJ;https://scholar.google.com.hk/citations?user=Uc3piAIAAAAJ;OROjmc8AAAAJ", "orcid": "0000-0001-7000-3539;;;0000-0003-2620-3247", "linkedin": ";;;", "or_profile": "~Ziqi_Zhang4;~Kun_Wei1;~Xu_Yang6;~Cheng_Deng1", "aff": "Xidian University;Xidian University;Xidian University;Xidian University", "aff_domain": "xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;xidian.edu.cn", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nzhang2022textdriven,\ntitle={Text-Driven Image Manipulation via Semantic-Aware Knowledge Transfer},\nauthor={Ziqi Zhang and Cheng Deng and Kun Wei and Xu Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=AJg35fkqOPA}\n}", "github": "", "project": "", "reviewers": "nZQ1;ksFh;wMtJ;V8mm", "site": "https://openreview.net/forum?id=AJg35fkqOPA", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;3", "correctness": "3;3;2;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;0;0;2", "wc_summary_paper": "30;46;87;43", "wc_summary_review": "13;16;248;117", "wc_main_review": "296;142;32;84", "wc_review": "339;204;367;244", "wc_reply_reviewers": "0;0;42;0", "wc_reply_authors": "0;0;194;56", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 51.5, 21.360009363293827 ], "wc_summary_review_avg": [ 98.5, 95.92835868501035 ], "wc_main_review_avg": [ 138.5, 98.90778533563473 ], "wc_review_avg": [ 288.5, 66.77012805139735 ], "wc_reply_reviewers_avg": [ 10.5, 18.186533479473212 ], "wc_reply_authors_avg": [ 62.5, 79.2890282447704 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.3244428422615251, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DKiWrQJxaVsJ:scholar.google.com/&scioq=Text-Driven+Image+Manipulation+via+Semantic-Aware+Knowledge+Transfer&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Xidian University", "aff_unique_dep": "", "aff_unique_url": "http://www.xidian.edu.cn/", "aff_unique_abbr": "Xidian", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "POETREE: Interpretable Policy Learning with Adaptive Decision Trees", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7015", "id": "AJsI-ymaKn_", "poster": "", "openreview": "https://openreview.net/forum?id=AJsI-ymaKn_", 
"slides": "https://iclr.cc/virtual/2022/poster/7015", "video": "https://iclr.cc/virtual/2022/poster/7015", "author_site": "Aliz\u00e9e Pace, Alex Chan, Mihaela van der Schaar", "tldr": "", "abstract": "Building models of human decision-making from observed behaviour is critical to better understand, diagnose and support real-world policies such as clinical care. As established policy learning approaches remain focused on imitation performance, they fall short of explaining the demonstrated decision-making process. Policy Extraction through decision Trees (POETREE) is a novel framework for interpretable policy learning, compatible with fully-offline and partially-observable clinical decision environments -- and builds probabilistic tree policies determining physician actions based on patients' observations and medical history. Fully-differentiable tree architectures are grown incrementally during optimization to adapt their complexity to the modelling task, and learn a representation of patient history through recurrence, resulting in decision tree policies that adapt over time with patient information. This policy learning method outperforms the state-of-the-art on real and synthetic medical datasets, both in terms of understanding, quantifying and evaluating observed behaviour as well as in accurately replicating it -- with potential to improve future decision support systems.", "keywords": "Imitation Learning;Interpretable ML;Clinical Decision Support;Sequential Decision-Making", "primary_area": "", "supplementary_material": "", "author": "Aliz\u00e9e Pace;Alex Chan;Mihaela van der Schaar", "authorids": "~Aliz\u00e9e_Pace1;~Alex_Chan2;~Mihaela_van_der_Schaar2", "gender": "F;F;M", "homepage": "https://alizeepace.com/;https://www.vanderschaar-lab.com;https://alexjchan.com", "dblp": "317/0381;;268/6948", "google_scholar": "p6gHZiUAAAAJ;DZ3S--MAAAAJ;yfy_BGIAAAAJ", "orcid": "0000-0002-8328-8817;;", "linkedin": "aliz%C3%A9e-pace-516b4314b/;;alex-chan-040081131/", "or_profile": "~Aliz\u00e9e_Pace1;~Mihaela_van_der_Schaar2;~Alex_James_Chan1", "aff": "Swiss Federal Institute of Technology;University of California, Los Angeles;University of Cambridge", "aff_domain": "ethz.ch;ucla.edu;cam.ac.uk", "position": "PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\npace2022poetree,\ntitle={{POETREE}: Interpretable Policy Learning with Adaptive Decision Trees},\nauthor={Aliz{\\'e}e Pace and Alex Chan and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AJsI-ymaKn_}\n}", "github": "", "project": "", "reviewers": "wNRz;FrRV;kv5i;Kc79", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;3;3;3", "correctness": "2;4;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "58;139;101;139", "wc_summary_review": "182;47;55;33", "wc_main_review": "193;643;1294;209", "wc_review": "433;829;1450;381", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "2450;1974;2988;1084", "reply_reviewers": "0;0;0;0", "reply_authors": "6;6;7;3", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 109.25, 33.40939239196068 ], "wc_summary_review_avg": [ 79.25, 59.84302382065933 ], "wc_main_review_avg": [ 584.75, 447.5166896329119 ], "wc_review_avg": [ 773.25, 427.41336841516784 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 2124.0, 699.4412055348183 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 5.5, 1.5 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6622214805938721915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=AJsI-ymaKn_", "email": "ethz.ch;ucla.edu;cam.ac.uk", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Swiss Federal Institute of Technology;University of California, Los Angeles;University of Cambridge", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.ucla.edu;https://www.cam.ac.uk", "aff_unique_abbr": "ETH Zurich;UCLA;Cambridge", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Angeles;Cambridge", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Switzerland;United States;United Kingdom" }, { "id": "AK9LrCKUp9", "title": "Towards Robust Point Cloud Models with Context-Consistency Network and Adaptive Augmentation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "3D point cloud models based on deep neural networks were proven to be vulnerable to adversarial examples, with a quantity of novel attack techniques proposed by researchers recently. It is of paramount importance to preserve the robustness of 3D models under adversarial environments, considering their broad application in safety- and security-critical tasks. Unfortunately, defenses for 3D models are much less studied compared to 2D image models. In this paper, we reason about the vulnerability of 3D models based on the mutual information theory. Furthermore, we design an effective defense methodology, consisting of two innovations. (1) We introduce CCDGN, a novel 3D DNN architecture which includes robust and light-weight modules to alleviate adversarial examples. (2) We propose AA-AMS a novel data augmentation strategy to adaptively balance the model usability and robustness. Extensive evaluations indicate the integration of the two techniques provides much more robustness than existing defense solutions for 3D models. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guanlin Li;Guowen Xu;Han Qiu;Ruan He;Jiwei Li;Tianwei Zhang", "authorids": "~Guanlin_Li2;~Guowen_Xu1;qiuhan@tsinghua.edu.cn;ruanhe@tencent.com;~Jiwei_Li1;~Tianwei_Zhang1", "gender": "M;M;;;M;M", "homepage": "https://guanlinlee.github.io/;https://guowen-xu.github.io/;;;https://nlp.stanford.edu/~bdlijiwei/;https://personal.ntu.edu.sg/tianwei.zhang/index.html", "dblp": ";87/10142;;;73/5746-1;77/7902-4", "google_scholar": "3LB0_wMAAAAJ;https://scholar.google.com.hk/citations?user=MDKdG80AAAAJ;;;PwU16JEAAAAJ;9vpiYDIAAAAJ", "orcid": ";0000-0002-9764-9345;;;;", "linkedin": ";guowen-xu-92b7201b1/?originalSubdomain=hk;;;;", "or_profile": "~Guanlin_Li2;~Guowen_Xu1;qiuhan@tsinghua.edu.cn;ruanhe@tencent.com;~Jiwei_Li1;~Tianwei_Zhang1", "aff": "Nanyang Technological University;Nanyang Technological University;;;Zhejiang University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;;;zju.edu.cn;ntu.edu.sg", "position": "PhD student;Postdoc;;;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nli2022towards,\ntitle={Towards Robust Point Cloud Models with Context-Consistency Network and Adaptive Augmentation},\nauthor={Guanlin Li and Guowen Xu and Han Qiu and Ruan He and Jiwei Li and Tianwei Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=AK9LrCKUp9}\n}", "github": "", "project": "", "reviewers": "vPF8;Hmn7;TvUj;LuUW", "site": "https://openreview.net/forum?id=AK9LrCKUp9", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;4;4", "correctness": "3;2;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "45;91;98;55", "wc_summary_review": "35;51;42;20", "wc_main_review": "406;356;192;131", "wc_review": "486;498;332;206", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.25, 22.664675157610354 ], "wc_summary_review_avg": [ 37.0, 11.335784048754634 ], "wc_main_review_avg": [ 271.25, 113.23730613185745 ], "wc_review_avg": [ 380.5, 120.14470441929599 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": 0.28867513459481287, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rCvagVkCdA4J:scholar.google.com/&scioq=Towards+Robust+Point+Cloud+Models+with+Context-Consistency+Network+and+Adaptive+Augmentation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Nanyang Technological University;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.zju.edu.cn", "aff_unique_abbr": "NTU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Singapore;China" }, { "id": "AKIlm8fp1b", "title": "Generating Realistic Physical Adversarial Examplesby Patch Transformer Network", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Physical adversarial attacks apply carefully crafted adversarial 
perturbations onto real objects to maliciously alter the prediction of object classifiers or detectors. The current standard method for designing physical adversarial patches, i.e., Expectation over Transformations (EoT), simulates real-world environments by random physical transformations, resulting in adversarial examples that are far from satisfactory. To tackle this issue, we propose and develop a novel network to learn real-world physical transformations from data, including geometric transformation, printer color transformation and illumination adaptation. Our approach produces realistic-looking adversarial examples and can be integrated into existing attack generation frameworks to generate adversarial patches effectively. We apply our approach to design adversarial T-shirts worn by moving people, one of the most challenging settings for physical attacks. Experiments show that our approach significantly outperforms the state of the art when attacking DL-based object detectors in real life. Moreover, we build a first-of-its-kind adversarial T-shirt dataset to enable effective training of our approach and facilitate fair comparison on physical-world attacks by considering a standard patch size, environment changes and object variances. Our code will be made publicly available.", "keywords": "Adversarial attack;Physical Adversarial Examples;object detection", "primary_area": "", "supplementary_material": "/attachment/f00dde299d35fc7b9ac8e6c17145a1d622a745e0.zip", "author": "Quanfu Fan;Kaidi Xu;Chun-Fu Chen;Sijia Liu;Gaoyuan Zhang;David Daniel Cox;Xue Lin", "authorids": "~Quanfu_Fan1;~Kaidi_Xu1;~Chun-Fu_Chen1;~Sijia_Liu1;~Gaoyuan_Zhang1;~David_Daniel_Cox1;~Xue_Lin1", "gender": "M;M;M;M;M;;F", "homepage": ";https://kaidixu.com/;;https://lsjxjtu.github.io/;;;https://coe.northeastern.edu/people/lin-xue/", "dblp": "66/3950;195/8175;48/915;128/6972-1;;48/7659;", "google_scholar": "kCxHiwUAAAAJ;lYK0wlsAAAAJ;9gqd5cYAAAAJ;C7dO_UgAAAAJ;;;p87KNLIAAAAJ", "orcid": ";;;;;;0000-0001-6210-8883", "linkedin": ";;;;;;", "or_profile": "~Quanfu_Fan1;~Kaidi_Xu1;~Chun-Fu_Chen1;~Sijia_Liu1;~Gaoyuan_Zhang1;~David_Daniel_Cox1;~Xue_Lin1", "aff": "MIT-IBM Watson AI Lab;Drexel University;JPMorganChase, GTAR;Michigan State University;International Business Machines;International Business Machines;Northeastern University", "aff_domain": "us.ibm.com;drexel.edu;jpmchase.com;msu.edu;ibm.com;ibm.com;neu.edu", "position": "Researcher;Assistant Professor;Executive Director;Assistant Professor;Research engineer;IBM Director, MIT-IBM Watson AI Lab;Assistant Professor", "bibtex": "@misc{\nfan2022generating,\ntitle={Generating Realistic Physical Adversarial Examples by Patch Transformer Network},\nauthor={Quanfu Fan and Kaidi Xu and Chun-Fu Chen and Sijia Liu and Gaoyuan Zhang and David Daniel Cox and Xue Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=AKIlm8fp1b}\n}", "github": "", "project": "", "reviewers": "ePTW;XmKG;pY1u;513H", "site": "https://openreview.net/forum?id=AKIlm8fp1b", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "73;98;77;85", "wc_summary_review": "16;90;55;66", "wc_main_review": "245;501;173;153", "wc_review": "334;689;305;304", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], 
"technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 83.25, 9.54921462739214 ], "wc_summary_review_avg": [ 56.75, 26.714930282521795 ], "wc_main_review_avg": [ 268.0, 138.80561948278606 ], "wc_review_avg": [ 408.0, 162.6822055419707 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9398834665165106014&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;4;5", "aff_unique_norm": "Massachusetts Institute of Technology;Drexel University;JPMorgan Chase;Michigan State University;International Business Machines Corporation;Northeastern University", "aff_unique_dep": "IBM Watson AI Lab;;Global Technology, Analytics, and Research (GTAR);;;", "aff_unique_url": "https://www.mitibmwatsonailab.org;https://www.drexel.edu;https://www.jpmorganchase.com;https://www.msu.edu;https://www.ibm.com;https://www.northeastern.edu", "aff_unique_abbr": "MIT-IBM AI Lab;Drexel;JPM;MSU;IBM;NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Nonlinear ICA Using Volume-Preserving Transformations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6091", "id": "AMpki9kp8Cn", "poster": "", "openreview": "https://openreview.net/forum?id=AMpki9kp8Cn", "slides": "https://iclr.cc/virtual/2022/poster/6091", "video": "https://iclr.cc/virtual/2022/poster/6091", "author_site": "Xiaojiang Yang, Yi Wang, Jiacheng Sun, Xing Zhang, Shifeng Zhang, Zhenguo Li, Junchi Yan", "tldr": "", "abstract": "Nonlinear ICA is a fundamental problem in machine learning, aiming to identify the underlying independent components (sources) from data which is assumed to be a nonlinear function (mixing function) of these sources. Recent works prove that if the sources have some particular structures (e.g. temporal structure), they are theoretically identifiable even if the mixing function is arbitrary. However, in many cases such restrictions on the sources are difficult to satisfy or even verify, hence it inhibits the applicability of the proposed methods. Different from these works, we propose a general framework for nonlinear ICA, in which the mixing function is assumed to be a volume-preserving transformation, and meanwhile the conditions on the sources can be much looser. We provide an insightful proof of the identifiability of the proposed framework. We implement the framework by volume-preserving Flow-based models, and verify our theory by experiments on artificial data and synthesized images. 
Moreover, results on real-world images indicate that our framework can disentangle interpretable features.", "keywords": "Independent Component Analysis;Nonlinear ICA;Identifiability", "primary_area": "", "supplementary_material": "", "author": "Xiaojiang Yang;Yi Wang;Jiacheng Sun;Xing Zhang;Shifeng Zhang;Zhenguo Li;Junchi Yan", "authorids": "~Xiaojiang_Yang1;~Yi_Wang24;~Jiacheng_Sun1;~Xing_Zhang6;~Shifeng_Zhang5;~Zhenguo_Li1;~Junchi_Yan2", "gender": "M;F;M;;M;M;M", "homepage": "https://thinklab.sjtu.edu.cn/;https://github.com/yiwang334;;;https://github.com/zsffq999;http://www.ee.columbia.edu/~zgli/;http://thinklab.sjtu.edu.cn/", "dblp": ";;165/5350;;;23/6479;60/7949.html", "google_scholar": ";;;5HlbQhkAAAAJ;;XboZC1AAAAAJ;ga230VoAAAAJ", "orcid": ";;;;;;0000-0001-9639-7679", "linkedin": ";;https://www.linkedin.cn/incareer/in/jiacheng-sun-ab622b131;;;;", "or_profile": "~Xiaojiang_Yang1;~Yi_Wang24;~Jiacheng_Sun1;~Xing_Zhang6;~Shifeng_Zhang5;~Zhenguo_Li1;~Junchi_Yan1", "aff": "Noah's Ark Lab, Huawei;Shanghai Jiaotong University;Huawei Noah's Ark Lab;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Noah's Ark Lab;Shanghai Jiaotong University", "aff_domain": "huawei.com;sjtu.edu.cn;huawei.com;huawei.com;huawei.com;huawei.com;sjtu.edu.cn", "position": "Intern;Undergrad student;Senior Researcher;AI Researcher;Researcher;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nyang2022nonlinear,\ntitle={Nonlinear {ICA} Using Volume-Preserving Transformations},\nauthor={Xiaojiang Yang and Yi Wang and Jiacheng Sun and Xing Zhang and Shifeng Zhang and Zhenguo Li and Junchi Yan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AMpki9kp8Cn}\n}", "github": "", "project": "", "reviewers": "7dbX;xbzu;kNre;kWud;eqBB", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "3;4;3;3;3", "correctness": "4;3;3;4;3", "technical_novelty": "3;2;3;3;2", "empirical_novelty": "3;2;3;2;2", "wc_summary_paper": "35;180;66;194;222", "wc_summary_review": "20;20;34;38;86", "wc_main_review": "61;122;103;306;158", "wc_review": "116;322;203;538;466", "wc_reply_reviewers": "0;0;21;394;10", "wc_reply_authors": "81;602;114;1408;496", "reply_reviewers": "0;0;1;2;1", "reply_authors": "1;1;1;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 139.4, 74.48382374717345 ], "wc_summary_review_avg": [ 39.6, 24.311314238436392 ], "wc_main_review_avg": [ 150.0, 84.04046644325578 ], "wc_review_avg": [ 329.0, 157.31751332893614 ], "wc_reply_reviewers_avg": [ 85.0, 154.69453771869257 ], "wc_reply_authors_avg": [ 540.2, 479.8668148559556 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 1.4, 0.8 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8951364843584293405&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=AMpki9kp8Cn", "email": "huawei.com;sjtu.edu.cn;huawei.com;huawei.com;huawei.com;huawei.com;sjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;0;0;0;1", "aff_unique_norm": "Huawei;Shanghai Jiao Tong University", "aff_unique_dep": "Noah's Ark Lab;", "aff_unique_url": 
"https://www.huawei.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "Huawei;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "AOn-gHymcx", "title": "A neural network framework for learning Green's function", "track": "main", "status": "Reject", "tldr": "", "abstract": "Green's function plays a significant role in both theoretical analysis and numerical computing of partial differential equations (PDEs). However, in most cases, Green's function is difficult to compute. The troubles arise in the following three folds. Firstly, compared with the original PDE, the dimension of Green's function is doubled, making it impossible to be handled by traditional mesh-based methods. Secondly, Green's function usually contains singularities which increase the difficulty to get a good approximation. Lastly, the computational domain may be very complex or even unbounded. To override these problems, we leverage the fundamental solution, boundary integral method and neural networks to develop a new method for computing Green's function with high accuracy in this paper. We focus on Green's function of Poisson and Helmholtz equations in bounded domains, unbounded domains and domains with interfaces. Extensive experiments illustrate the efficiency and the accuracy of our method for solving Green's function. In addition, we also use the Green's function calculated by our method to solve a class of PDE, and also obtain high-precision solutions, which shows the good generalization ability of our method on solving PDEs.", "keywords": "Green's function;partial differential equation;boundary integral;neural network", "primary_area": "", "supplementary_material": "", "author": "Guochang Lin;Fukai Chen;Pipi Hu;Xiang Chen;Junqing Chen;Jun Wang;Zuoqiang Shi", "authorids": "~Guochang_Lin1;~Fukai_Chen1;~Pipi_Hu1;~Xiang_Chen8;~Junqing_Chen1;~Jun_Wang2;~Zuoqiang_Shi1", "gender": "M;M;;;;M;M", "homepage": ";;;;http://www.math.tsinghua.edu.cn/publish/math/2566/2019/20190705100358974271818/20190705100358974271818_.html;http://www0.cs.ucl.ac.uk/staff/jun.wang/;https://shizqi.github.io/", "dblp": ";;;;;w/JunWang12;18/1960", "google_scholar": ";;;2cj3OTIAAAAJ;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;", "orcid": "0000-0003-0865-1732;;0000-0001-8129-0027;;;;0000-0002-9122-0302", "linkedin": "linguochang/;%E4%BB%98%E6%81%BA-%E9%99%88-810331213/;;;;;", "or_profile": "~Guochang_Lin1;~Fukai_Chen1;~Pipi_Hu1;~Xiang_Chen8;~Junqing_Chen1;~Jun_Wang2;~Zuoqiang_Shi1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Huawei Technologies Ltd.;Tsinghua University;University College London;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;huawei.com;tsinghua.edu.cn;ucl.ac.uk;tsinghua.edu.cn", "position": "PhD student;PhD student;Postdoc;Researcher;Associate Professor;Professor;Associate Professor", "bibtex": "@misc{\nlin2022a,\ntitle={A neural network framework for learning Green's function},\nauthor={Guochang Lin and Fukai Chen and Pipi Hu and Xiang Chen and Junqing Chen and Jun Wang and Zuoqiang Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=AOn-gHymcx}\n}", "github": "", "project": "", "reviewers": "mJPD;YLib;Rxs8;dXB8", "site": "https://openreview.net/forum?id=AOn-gHymcx", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;5;3", "correctness": "4;2;1;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "0;1;2;1", "wc_summary_paper": 
"44;21;89;195", "wc_summary_review": "22;68;28;76", "wc_main_review": "110;125;401;284", "wc_review": "176;214;518;555", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 87.25, 66.84450239174498 ], "wc_summary_review_avg": [ 48.5, 23.76446927663229 ], "wc_main_review_avg": [ 230.0, 119.98124853492732 ], "wc_review_avg": [ 365.75, 171.77656272029662 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": -0.7745966692414834, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ELApcqb32W8J:scholar.google.com/&scioq=A+neural+network+framework+for+learning+Green%27s+function&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0;2;0", "aff_unique_norm": "Tsinghua University;Huawei;University College London", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com;https://www.ucl.ac.uk", "aff_unique_abbr": "THU;Huawei;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Should I Run Offline Reinforcement Learning or Behavioral Cloning?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6654", "id": "AP1MKT37rJ", "poster": "", "openreview": "https://openreview.net/forum?id=AP1MKT37rJ", "slides": "https://iclr.cc/virtual/2022/poster/6654", "video": "https://iclr.cc/virtual/2022/poster/6654", "author_site": "Aviral Kumar, Joey Hong, Anikait Singh, Sergey Levine", "tldr": "", "abstract": "Offline reinforcement learning (RL) algorithms can acquire effective policies by utilizing only previously collected experience, without any online interaction. While it is widely understood that offline RL is able to extract good policies even from highly suboptimal data, in practice offline RL is often used with data that resembles demonstrations. In this case, one can also use behavioral cloning (BC) algorithms, which mimic a subset of the dataset via supervised learning. It seems natural to ask: When should we prefer offline RL over BC? In this paper, our goal is to characterize environments and dataset compositions where offline RL leads to better performance than BC. In particular, we characterize the properties of environments that allow offline RL methods to perform better than BC methods even when only provided with expert data. Additionally, we show that policies trained on suboptimal data that is sufficiently noisy can attain better performance than even BC algorithms with expert data, especially on long-horizon problems. We validate our theoretical results via extensive experiments on both diagnostic and high-dimensional domains including robot manipulation, maze navigation and Atari games, when learning from a variety of data sources. 
We observe that modern offline RL methods trained on suboptimal, noisy data in sparse reward domains outperform cloning the expert data in several practical problems.", "keywords": "offline RL", "primary_area": "", "supplementary_material": "", "author": "Aviral Kumar;Joey Hong;Anikait Singh;Sergey Levine", "authorids": "~Aviral_Kumar2;~Joey_Hong2;~Anikait_Singh1;~Sergey_Levine1", "gender": "M;M;M;M", "homepage": "https://aviralkumar2907.github.io/;;https://asap7772.github.io/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "202/7961;188/6056.html;302/3876;80/7594", "google_scholar": ";SiBVfPUAAAAJ;lPaISmIAAAAJ;8R35rCwAAAAJ", "orcid": ";;;", "linkedin": ";;asap7772/;", "or_profile": "~Aviral_Kumar2;~Joey_Hong2;~Anikait_Singh1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;google.com", "position": "PhD student;PhD student;Undergrad student;Research Scientist", "bibtex": "@inproceedings{\nkumar2022should,\ntitle={Should I Run Offline Reinforcement Learning or Behavioral Cloning?},\nauthor={Aviral Kumar and Joey Hong and Anikait Singh and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AP1MKT37rJ}\n}", "github": "", "project": "", "reviewers": "b23W;QvCh;4pq1;9b7d", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;3;5", "correctness": "3;4;4;3", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "40;99;242;127", "wc_summary_review": "87;60;51;46", "wc_main_review": "457;562;270;369", "wc_review": "584;721;563;542", "wc_reply_reviewers": "171;735;0;23", "wc_reply_authors": "2734;3238;261;623", "reply_reviewers": "1;2;0;1", "reply_authors": "4;6;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 127.0, 73.44725998973685 ], "wc_summary_review_avg": [ 61.0, 15.827191791344413 ], "wc_main_review_avg": [ 414.5, 107.83436372511315 ], "wc_review_avg": [ 602.5, 70.008928002077 ], "wc_reply_reviewers_avg": [ 232.25, 297.58811720228346 ], "wc_reply_authors_avg": [ 1714.0, 1290.7813525148247 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.0, 2.1213203435596424 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9072300047394514637&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=AP1MKT37rJ", "email": "berkeley.edu;berkeley.edu;berkeley.edu;google.com", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "APS9U4pNiI8", "title": "Secure Byzantine-Robust Federated Learning with Dimension-free Error", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In the present work, we propose a federated 
learning protocol with bi-directional security guarantees. First, our protocol is Byzantine-robust against malicious clients. Additionally, it is the first federated learning protocol with a per-round mean estimation error that is independent of the update size (e.g., the size of the model being trained). Second, our protocol is secure against a semi-honest server, as it only reveals sums of the updates. The code for evaluation is provided in the supplementary material.", "keywords": "Federated Learning;Robust Mean Estimator;Secure Aggregation", "primary_area": "", "supplementary_material": "/attachment/f075ce7e4de362a435d8b69cdc9ced06598e6354.zip", "author": "Lun Wang;Qi Pang;Shuai Wang;Dawn Song", "authorids": "~Lun_Wang1;~Qi_Pang1;~Shuai_Wang7;~Dawn_Song2", "gender": ";;M;F", "homepage": "https://wanglun1996.github.io/;;https://home.cse.ust.hk/~shuaiw/;http://people.eecs.berkeley.edu/~dawnsong/", "dblp": ";;42/1503-11;", "google_scholar": ";;;84WzBlYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Lun_Wang1;~Qi_Pang1;~Shuai_Wang7;~Dawn_Song2", "aff": "University of California, Berkeley;;;University of California, Berkeley", "aff_domain": "berkeley.edu;;;berkeley.edu", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\nwang2022secure,\ntitle={Secure Byzantine-Robust Federated Learning with Dimension-free Error},\nauthor={Lun Wang and Qi Pang and Shuai Wang and Dawn Song},\nyear={2022},\nurl={https://openreview.net/forum?id=APS9U4pNiI8}\n}", "github": "", "project": "", "reviewers": "dsjT;ULVU;mHLm", "site": "https://openreview.net/forum?id=APS9U4pNiI8", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "3;4;4", "correctness": "2;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;0", "wc_summary_paper": "84;35;138", "wc_summary_review": "46;10;78", "wc_main_review": "524;161;471", "wc_review": "654;206;687", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 85.66666666666667, 42.06608557443342 ], "wc_summary_review_avg": [ 44.666666666666664, 27.776888874666213 ], "wc_main_review_avg": [ 385.3333333333333, 160.096498677794 ], "wc_review_avg": [ 515.6666666666666, 219.3814536879137 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9E4QtzxjNVgJ:scholar.google.com/&scioq=Secure+Byzantine-Robust+Federated+Learning+with+Dimension-free+Error&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "AQV2-jDKEt2", "title": "PRNet: A Progressive Regression Network for No-Reference User-Generated-Content Video Quality Assessment", "track": "main", 
"status": "Withdraw", "tldr": "", "abstract": "Non-professional video, commonly known as User Generated Content (UGC) has become very popular in today\u2019s video sharing applications. However, objectively perceptual quality assessment of UGC-videos is still a challenge problem, which is arose from many reasons. First, the pristine sources of UGC-videos are not available, which makes the appropriate technique is the no-reference NR video quality assessment VQA (NR-VQA). Another factor leads the NR-UGC-VQA to a challenge is that subjective mean option scores (MOS) of all the UGC-datasets are not uniformly distributed. The largest UGC video dataset---YouTube-UGC still faces a problem that the database has right-skewed MOS distribution. In addition, authentic degradations occurred in the videos are not unique, therefore, not predicable. For example, an over- or under-exposure image/video, brightness and contrast static information is important for evaluation. Only employing verified priori statistic knowledge or generalized learning knowledge may not cover all possible distortions. To solve these problems, we introduce a novel NR-VQA framework---Progressive Regress Network (PRNet) in this paper. For the skewed MOS problem, a progressive regression model is proposed, which utilizes the coarse-to-fine strategy during the training process. This strategy can turn sparse subjective human rating scores into integers with denser samples, which can solve the in-balanced sample problem and make the training progress smoother. For the unpredictable distortions problem, a wide and deep model based on our PRNet is developed, which employs both low-level features generated from natural scene statistics (NSS) and high-level semantic features extracted by deep neural networks, to fuse memorizing priori knowledge and generalizing learning features. 
Our experimental results demonstrate that our proposed method PRNet achieves state-of-the-art performance in currently three main popular UGC-VQA datasets (KoNVid-1K, LIVE-VQC, and YouTube-UGC).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang YangR;Bo Jiang;Kailin Wu", "authorids": "~Yang_YangR1;~Bo_Jiang10;~Kailin_Wu1", "gender": "F;M;F", "homepage": "https://blog.csdn.net/weixin_42699651;http://expire7.blogcn.com/index.shtml;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;%E5%87%AF%E7%90%B3-%E5%90%B4-400204119/", "or_profile": "~Yang_YangR1;~Bo_Jiang10;~Kailin_Wu1", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyangr2022prnet,\ntitle={{PRN}et: A Progressive Regression Network for No-Reference User-Generated-Content Video Quality Assessment},\nauthor={Yang YangR and Bo Jiang and Kailin Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=AQV2-jDKEt2}\n}", "github": "", "project": "", "reviewers": "EUTL;zNgE;c4DP", "site": "https://openreview.net/forum?id=AQV2-jDKEt2", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "3;5;4", "correctness": "2;2;2", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "64;10;45", "wc_summary_review": "23;6;20", "wc_main_review": "175;160;252", "wc_review": "262;176;317", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 39.666666666666664, 22.365648262955006 ], "wc_summary_review_avg": [ 16.333333333333332, 7.408703590297623 ], "wc_main_review_avg": [ 195.66666666666666, 40.30164044083345 ], "wc_review_avg": [ 251.66666666666666, 58.02489887013065 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fkCeYKgh3XYJ:scholar.google.com/&scioq=PRNet:+A+Progressive+Regression+Network+for+No-Reference+User-Generated-Content+Video+Quality+Assessment&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "ARw4igiN2Qm", "title": "A stepped sampling method for video detection using LSTM", "track": "main", "status": "Reject", "tldr": "", "abstract": "Artificial neural networks that simulate human achieves great successes. From the perspective of simulating human memory method, we propose a stepped sampler based on the \u201crepeated input\u201d. We repeatedly inputted data to the LSTM model stepwise in a batch. The stepped sampler is used to strengthen the ability of fusing the temporal information in LSTM. We tested the stepped sampler on the LSTM built-in in PyTorch. Compared with the traditional sampler of PyTorch, such as sequential sampler, batch sampler, the training loss of the proposed stepped sampler converges faster in the training of the model, and the training loss after convergence is more stable. Meanwhile, it can maintain a higher test accuracy. We quantified the algorithm of the stepped sampler. We assume that, the artificial neural networks have human-like characteristics, and human learning method could be used for machine learning. 
Our code will be available online soon.", "keywords": "stepped sampler;LSTM;video detection;psychology;human memory rule", "primary_area": "", "supplementary_material": "", "author": "Dengshan Li", "authorids": "~Dengshan_Li1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "0000-0002-4653-5137", "linkedin": "", "or_profile": "~Dengshan_Li1", "aff": "University of Science and Technology of China", "aff_domain": "ustc.edu.cn", "position": "PhD student", "bibtex": "@misc{\nli2022a,\ntitle={A stepped sampling method for video detection using {LSTM}},\nauthor={Dengshan Li},\nyear={2022},\nurl={https://openreview.net/forum?id=ARw4igiN2Qm}\n}", "github": "", "project": "", "reviewers": "hB9n;JLJ8;BB9f", "site": "https://openreview.net/forum?id=ARw4igiN2Qm", "pdf_size": 0, "recommendation": "1;1;5", "confidence": "5;5;5", "correctness": "1;1;2", "technical_novelty": "1;1;2", "empirical_novelty": "1;1;3", "wc_summary_paper": "53;34;102", "wc_summary_review": "54;27;104", "wc_main_review": "134;515;261", "wc_review": "241;576;467", "wc_reply_reviewers": "0;388;130", "wc_reply_authors": "513;2811;1278", "reply_reviewers": "0;2;1", "reply_authors": "2;7;6", "recommendation_avg": [ 2.3333333333333335, 1.8856180831641267 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 1.3333333333333333, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "wc_summary_paper_avg": [ 63.0, 28.647280266487194 ], "wc_summary_review_avg": [ 61.666666666666664, 31.899146627387317 ], "wc_main_review_avg": [ 303.3333333333333, 158.3968293734302 ], "wc_review_avg": [ 428.0, 139.51582944837 ], "wc_reply_reviewers_avg": [ 172.66666666666666, 161.24791126984834 ], "wc_reply_authors_avg": [ 1534.0, 955.4590519744946 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 5.0, 2.160246899469287 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EX7lHXX_JnAJ:scholar.google.com/&scioq=A+stepped+sampling+method+for+video+detection+using+LSTM&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn", "aff_unique_abbr": "USTC", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "ARyEf6Z77Y", "title": "Collaboration of Experts: Achieving 80% Top-1 Accuracy on ImageNet with 100M FLOPs", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose a Collaboration of Experts (CoE) framework to pool together the expertise of multiple networks towards a common aim. Each expert is an individual network with expertise on a unique portion of the dataset, which enhances the collective capacity. Given a sample, an expert is selected by the delegator, which simultaneously outputs a rough prediction to support early termination. To make each model in CoE play its role, we propose a novel training algorithm that consists of three components: weight generation module (WGM), label generation module (LGM) and selection reweighting module (SRM). Our method achieves the state-of-the-art performance on ImageNet, 80.7% top-1 accuracy with 194M FLOPs. 
Combined with the PWLU activation function and CondConv, CoE further achieves 80.0% accuracy with only 100M FLOPs for the first time. More importantly, CoE is hardware-friendly, achieving a 3~6x speedup compared with some existing conditional computation approaches. Experimental results on a translation task also show the strong generalizability of CoE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yikang Zhang;Zhuo Chen;Zhao Zhong", "authorids": "~Yikang_Zhang1;~Zhuo_Chen5;~Zhao_Zhong1", "gender": "M;;M", "homepage": "https://mail.google.com/mail/u/0/#inbox;;https://github.com/kenchen1024", "dblp": "233/3090;128/6801;", "google_scholar": ";igtXP_kAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yikang_Zhang1;~Zhao_Zhong1;~Ken_Chen2", "aff": "Huawei Technologies Ltd.;Huawei;Huawei Technologies Ltd.", "aff_domain": "huawei.com;huawei.com;huawei.com", "position": "Researcher;Principal Researcher;Postdoc", "bibtex": "@misc{\nzhang2022collaboration,\ntitle={Collaboration of Experts: Achieving 80\\% Top-1 Accuracy on ImageNet with 100M {FLOP}s},\nauthor={Yikang Zhang and Zhuo Chen and Zhao Zhong},\nyear={2022},\nurl={https://openreview.net/forum?id=ARyEf6Z77Y}\n}", "github": "", "project": "", "reviewers": "u2in;u2o7;dWQ7;SUb5", "site": "https://openreview.net/forum?id=ARyEf6Z77Y", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;3;3;5", "correctness": "4;3;3;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "52;80;86;108", "wc_summary_review": "11;53;25;57", "wc_main_review": "103;249;243;849", "wc_review": "166;382;354;1014", "wc_reply_reviewers": "0;0;58;468", "wc_reply_authors": "1541;1273;1269;2009", "reply_reviewers": "0;0;1;1", "reply_authors": "3;2;4;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 19.96872554771586 ], "wc_summary_review_avg": [ 36.5, 19.20286436967152 ], "wc_main_review_avg": [ 361.0, 287.7394654891817 ], "wc_review_avg": [ 479.0, 319.85465449169254 ], "wc_reply_reviewers_avg": [ 131.5, 195.71599321465786 ], "wc_reply_authors_avg": [ 1523.0, 301.4697331408246 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3356202487479523066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "AS0dhAKIYA0", "title": "Interpretable Semantic Role Relation Table for Supporting Facts Recognition of Reading Comprehension", "track": "main", "status": "Reject", "tldr": "", "abstract": "Current Machine Reading Comprehension (MRC) models have poor interpretability. Interpretable semantic features can enhance the interpretability of a model. 
Semantic role labeling (SRL) captures predicate-argument relations, such as \"who did what to whom,\" which are critical to comprehension and interpretation. To enhance the interpretability of the model, we propose the semantic role relation table, which represents the semantic relations within a sentence as well as the semantic relations among sentences. We integrate the names of entities into the semantic role relation table to establish the semantic relations between sentences. This paper makes the first attempt to utilize explicit contextual semantic relations for supporting-facts recognition in reading comprehension. We establish nine semantic relation tables between the target sentence, the question, and the article. We then take the overall semantic role relevance of each semantic relation table, as well as the relevance of each individual semantic role, as important judgment information. In detailed experiments on HotpotQA, a challenging multi-hop MRC dataset, our method achieves better performance. Even with few training examples, the model performance remains stable.", "keywords": "interpretability;semantic role relation table;supporting facts", "primary_area": "", "supplementary_material": "/attachment/ed426a1e5511cb03578afdf2f8e015636be7e560.zip", "author": "YanQing Bai;Zhichang Zhang;HaoYuan Chen;Xiaohui Qin;Yanglong Qiu", "authorids": "~YanQing_Bai1;zzc@nwnu.edu.cn;347087659@qq.com;qinxh_qj@163.com;lankyqiu@163.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~YanQing_Bai1;zzc@nwnu.edu.cn;347087659@qq.com;qinxh_qj@163.com;lankyqiu@163.com", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbai2022interpretable,\ntitle={Interpretable Semantic Role Relation Table for Supporting Facts Recognition of Reading Comprehension},\nauthor={YanQing Bai and Zhichang Zhang and HaoYuan Chen and Xiaohui Qin and Yanglong Qiu},\nyear={2022},\nurl={https://openreview.net/forum?id=AS0dhAKIYA0}\n}", "github": "", "project": "", "reviewers": "aAdr;GpZ5;dcWb;uT9c", "site": "https://openreview.net/forum?id=AS0dhAKIYA0", "pdf_size": 0, "recommendation": "1;1;3;5", "confidence": "4;4;4;3", "correctness": "1;2;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "56;72;78;83", "wc_summary_review": "21;21;39;38", "wc_main_review": "193;598;591;233", "wc_review": "270;691;708;354", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 72.25, 10.158124826955023 ], "wc_summary_review_avg": [ 29.75, 8.757139944068497 ], "wc_main_review_avg": [ 403.75, 191.28953839664103 ], "wc_review_avg": [ 505.75, 196.10504200555374 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.8528028654224417, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:GDaOC7WnW7YJ:scholar.google.com/&scioq=Interpretable+Semantic+Role+Relation+Table+for+Supporting+Facts+Recognition+of+Reading+Comprehension&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "AT0K-SZ3QGq", "title": "On Heterogeneously Distributed Data, Sparsity Matters", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) is particularly vulnerable to heterogeneously distributed data, since a common global model in FL may not adapt to the heterogeneous data distribution of each user. To counter this issue, personalized FL (PFL) was proposed to produce dedicated local models for each individual user. However, PFL is far from its maturity, because existing PFL solutions either demonstrate unsatisfactory generalization towards different model architectures or cost enormous extra computation and memory. In this work, we propose federated learning with personalized sparse mask (FedSpa), a novel personalized federated learning scheme that employs personalized sparse masks to customize sparse local models on the edge. Instead of training fully dense PFL models, FedSpa only maintains a fixed number of active parameters throughout training (aka sparse-to-sparse training), which enables users' models to achieve personalization with consistently cheap communication, computation, and memory cost. We theoretically show that with the rise of data heterogeneity, setting a higher sparsity of FedSpa may potentially result in a smaller error bound on its personalized models, which also coincides with our empirical observations. Comprehensive experiments demonstrate that FedSpa significantly saves communication and computation costs, while simultaneously achieves higher model accuracy and faster convergence speed against several state-of-the-art PFL methods.", "keywords": "Federated Learning;Dynamic Sparse Training;Communication-efficient Personalized Federated Learning", "primary_area": "", "supplementary_material": "/attachment/957e5c49e90be246686095ed24b30525757e9d4e.zip", "author": "Tiansheng Huang;Shiwei Liu;Li Shen;Fengxiang He;Weiwei Lin;Dacheng Tao", "authorids": "~Tiansheng_Huang1;~Shiwei_Liu2;~Li_Shen1;~Fengxiang_He1;~Weiwei_Lin1;~Dacheng_Tao1", "gender": "M;M;M;;M;", "homepage": "https://huangtiansheng.github.io/;https://shiweiliuiiiiiii.github.io/;https://sites.google.com/site/mathshenli/home;https://fengxianghe.github.io/;https://www.scholat.com/linweiwei;", "dblp": "249/2114;234/8697-3.html;91/3680-8;225/4682;53/282-1;", "google_scholar": "zz6Oq8wAAAAJ;73IbXtsAAAAJ;yVhgENIAAAAJ;QSx-Yu0AAAAJ;IWsha94AAAAJ;", "orcid": "0000-0002-4557-1865;;;;0000-0001-6876-1795;", "linkedin": ";;;fengxiang-he-35b173122;;", "or_profile": "~Tiansheng_Huang1;~Shiwei_Liu2;~Li_Shen1;~Fengxiang_He1;~Weiwei_Lin1;~Dacheng_Tao1", "aff": "South China University of Technology;Eindhoven University of Technology;JD Explore Academy;JD.com, Inc.;South China University of Technology;", "aff_domain": "scut.edu.cn;tue.nl;jd.com;jd.com;scut.edu.cn;", "position": "MS student;PhD student;Researcher;Algorithm Scientist;Full Professor;", "bibtex": "@misc{\nhuang2022on,\ntitle={On Heterogeneously Distributed Data, Sparsity Matters},\nauthor={Tiansheng Huang and Shiwei Liu and Li Shen and Fengxiang He and Weiwei Lin and Dacheng Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=AT0K-SZ3QGq}\n}", "github": "", "project": "", "reviewers": "ECdR;Zg2F;PGV4;mQ15", "site": "https://openreview.net/forum?id=AT0K-SZ3QGq", "pdf_size": 0, "recommendation": 
"5;5;6;6", "confidence": "4;3;3;3", "correctness": "2;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "110;50;38;54", "wc_summary_review": "307;21;39;24", "wc_main_review": "655;358;148;117", "wc_review": "1072;429;225;195", "wc_reply_reviewers": "0;550;38;0", "wc_reply_authors": "2970;2811;1490;343", "reply_reviewers": "0;1;1;0", "reply_authors": "5;5;4;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 27.76688675382964 ], "wc_summary_review_avg": [ 97.75, 121.00284087574143 ], "wc_main_review_avg": [ 319.5, 214.74461576486615 ], "wc_review_avg": [ 480.25, 353.3110350668374 ], "wc_reply_reviewers_avg": [ 147.0, 233.18876473792642 ], "wc_reply_authors_avg": [ 1903.5, 1068.5411784297319 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.75, 1.6393596310755 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14152016886187725187&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "South China University of Technology;Eindhoven University of Technology;JD;JD.com", "aff_unique_dep": ";;JD Explore Academy;", "aff_unique_url": "https://www.scut.edu.cn;https://www.tue.nl;;https://www.jd.com", "aff_unique_abbr": "SCUT;TU/e;;JD.com", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;Netherlands;" }, { "title": "Graph Auto-Encoder via Neighborhood Wasserstein Reconstruction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6434", "id": "ATUh28lnSuW", "poster": "", "openreview": "https://openreview.net/forum?id=ATUh28lnSuW", "slides": "https://iclr.cc/virtual/2022/poster/6434", "video": "https://iclr.cc/virtual/2022/poster/6434", "author_site": "Mingyue Tang, Pan Li, Carl Yang", "tldr": "", "abstract": "Graph neural networks (GNNs) have drawn significant research attention recently, mostly under the setting of semi-supervised learning. When task-agnostic representations are preferred or supervision is simply unavailable, the auto-encoder framework comes in handy with a natural graph reconstruction objective for unsupervised GNN training. However, existing graph auto-encoders are designed to reconstruct the direct links, so GNNs trained in this way are only optimized towards proximity-oriented graph mining tasks, and will fall short when the topological structures matter. In this work, we revisit the graph encoding process of GNNs which essentially learns to encode the neighborhood information of each node into an embedding vector, and propose a novel graph decoder to reconstruct the entire neighborhood information regarding both proximity and structure via Neighborhood Wasserstein Reconstruction (NWR). Specifically, from the GNN embedding of each node, NWR jointly predicts its node degree and neighbor feature distribution, where the distribution prediction adopts an optimal-transport loss based on the Wasserstein distance. 
Extensive experiments on both synthetic and real-world network datasets show that the unsupervised node representations learned with NWR are much more advantageous in structure-oriented graph mining tasks, while also achieving competitive performance in proximity-oriented ones.", "keywords": "graph representation learning;unsupervised learning;autoencoder;wasserstein distance", "primary_area": "", "supplementary_material": "/attachment/84a6ffeded2cf63f97ae0f08c294e37de61a18c3.zip", "author": "Mingyue Tang;Pan Li;Carl Yang", "authorids": "~Mingyue_Tang1;~Pan_Li2;~Carl_Yang1", "gender": "F;;M", "homepage": ";;https://cs.emory.edu/~jyang71/", "dblp": ";https://dblp.org/pers/hd/l/Li_0005:Pan;305/0254", "google_scholar": ";IroP0EwAAAAJ;mOINlwcAAAAJ", "orcid": ";;0000-0001-9145-4531", "linkedin": "mingyue-tang-353238167/;pan-li-b951105a/;", "or_profile": "~Mingyue_Tang1;~Pan_Li2;~Carl_Yang1", "aff": "University of Virginia;Purdue University;Emory University", "aff_domain": "virginia.edu;purdue.edu;emory.edu", "position": "MS student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ntang2022graph,\ntitle={Graph Auto-Encoder via Neighborhood Wasserstein Reconstruction},\nauthor={Mingyue Tang and Pan Li and Carl Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ATUh28lnSuW}\n}", "github": "", "project": "", "reviewers": "dc6B;mWmk;hC5R;W8Ew", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;4", "correctness": "4;4;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "41;104;76;32", "wc_summary_review": "23;32;37;35", "wc_main_review": "175;206;272;287", "wc_review": "239;342;385;354", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "111;351;454;318", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.25, 28.699956445959984 ], "wc_summary_review_avg": [ 31.75, 5.356071321407137 ], "wc_main_review_avg": [ 235.0, 46.13566949768909 ], "wc_review_avg": [ 330.0, 54.83156025502101 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 308.5, 124.57226818196737 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": -0.6882472016116854, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13928177988336343335&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ATUh28lnSuW", "email": "virginia.edu;purdue.edu;emory.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Virginia;Purdue University;Emory University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.virginia.edu;https://www.purdue.edu;https://www.emory.edu", "aff_unique_abbr": "UVA;Purdue;Emory", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Emergent Communication at Scale", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6459", "id": "AUGBfDIV9rL", "poster": "", "openreview": 
"https://openreview.net/forum?id=AUGBfDIV9rL", "slides": "https://iclr.cc/virtual/2022/poster/6459", "video": "https://iclr.cc/virtual/2022/poster/6459", "author_site": "Rahma Chaabouni, Florian Strub, Florent Altch\u00e9, Eugene Tarassov, Corentin Tallec, Elnaz Davoodi, Kory Mathewson, Olivier Tieleman, Angeliki Lazaridou, Bilal Piot", "tldr": "", "abstract": "Emergent communication aims for a better understanding of human language evolution and building more efficient representations. We posit that reaching these goals will require scaling up, in contrast to a significant amount of literature that focuses on setting up small-scale problems to tease out desired properties of the emergent languages. We focus on three independent aspects to scale up, namely the dataset, task complexity, and population size. We provide a first set of results for large populations solving complex tasks on realistic large-scale datasets, as well as an easy-to-use codebase to enable further experimentation. In more complex tasks and datasets, we find that RL training can become unstable, but responds well to established stabilization techniques.\nWe also identify the need for a different metric than topographic similarity, which does not correlate with the generalization performances when working with natural images. In this context, we probe ease-of-learnability and transfer methods to assess emergent languages. Finally, we observe that larger populations do not induce robust emergent protocols with high generalization performance, leading us to explore different ways to leverage population, through voting and imitation learning. ", "keywords": "emergent communication;multi-agent reinforcement learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Rahma Chaabouni;Florian Strub;Florent Altch\u00e9;Eugene Tarassov;Corentin Tallec;Elnaz Davoodi;Kory Wallace Mathewson;Olivier Tieleman;Angeliki Lazaridou;Bilal Piot", "authorids": "~Rahma_Chaabouni1;~Florian_Strub1;~Florent_Altch\u00e91;~Eugene_Tarassov1;~Corentin_Tallec2;~Elnaz_Davoodi2;~Kory_Wallace_Mathewson1;~Olivier_Tieleman1;~Angeliki_Lazaridou1;~Bilal_Piot1", "gender": "F;M;;;M;F;M;Unspecified;M;F", "homepage": ";http://www.florian-strub.com;;http://deepmind.com;;;https://korymathewson.com/;;;", "dblp": ";;177/8921.html;;;59/11078;182/1971;;;79/9656", "google_scholar": "https://scholar.google.com/citations?hl=fr;zxO5kccAAAAJ;;;OPKX4GgLCxIC;https://scholar.google.ca/citations?hl=en;K8MFvX4AAAAJ;;https://scholar.google.fr/citations?user=fqxNUREAAAAJ;BMgUIC0AAAAJ", "orcid": ";;;;;;0000-0002-5688-6221;;;", "linkedin": ";florian-strub-64443527/;;;;;korymath/?originalSubdomain=ca;;;", "or_profile": "~Rahma_Chaabouni1;~Florian_Strub1;~Florent_Altch\u00e91;~Eugene_Tarassov1;~Corentin_Tallec2;~Elnaz_Davoodi2;~Kory_Wallace_Mathewson1;~Olivier_Tieleman1;~Bilal_Piot1;~Angeliki_Lazaridou2", "aff": "Google;Google DeepMind;Google DeepMind;;Google DeepMind;Google;;Google DeepMind;University Lille;", "aff_domain": "google.com;google.com;deepmind.com;;deepmind.com;google.com;;google.com;univ-lille1.fr;unitn.it", "position": "Researcher;Research Scientist;Researcher;;Research Scientist;Research Engineer;;Research Scientist;Associate Professor;PhD student", "bibtex": "@inproceedings{\nchaabouni2022emergent,\ntitle={Emergent Communication at Scale},\nauthor={Rahma Chaabouni and Florian Strub and Florent Altch{\\'e} and Eugene Tarassov and Corentin Tallec and Elnaz Davoodi and Kory Wallace Mathewson and Olivier Tieleman and Angeliki 
Lazaridou and Bilal Piot},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AUGBfDIV9rL}\n}", "github": "", "project": "", "reviewers": "HRb2;MhrQ;zVAw;YDs6", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "4;4;4;4", "wc_summary_paper": "106;151;142;21", "wc_summary_review": "35;70;51;17", "wc_main_review": "143;392;551;213", "wc_review": "284;613;744;251", "wc_reply_reviewers": "0;85;15;0", "wc_reply_authors": "0;379;850;567", "reply_reviewers": "0;1;1;0", "reply_authors": "0;1;2;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 4.0, 0.0 ], "wc_summary_paper_avg": [ 105.0, 51.337121072378025 ], "wc_summary_review_avg": [ 43.25, 19.57517560585345 ], "wc_main_review_avg": [ 324.75, 159.08547230969899 ], "wc_review_avg": [ 473.0, 210.9774869506223 ], "wc_reply_reviewers_avg": [ 25.0, 35.17811819867572 ], "wc_reply_authors_avg": [ 449.0, 308.71750841181654 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4797610842429518149&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=AUGBfDIV9rL", "email": "google.com;google.com;deepmind.com;;deepmind.com;google.com;;google.com;univ-lille1.fr;unitn.it", "author_num": 10, "aff_unique_index": "0;0;0;0;0;0;1", "aff_unique_norm": "Google;University of Lille", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.univ-lille.fr", "aff_unique_abbr": "Google;ULille", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;1;0;1;2", "aff_country_unique": "United States;United Kingdom;France" }, { "id": "AUszBTiYBB6", "title": "FEDERATED LEARNING FRAMEWORK BASED ON TRIMMED MEAN AGGREGATION RULES", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper studies the problem of information security in the distributed learning framework. In particular, we consider the setting where clients in federated learning may be attacked by Byzantine nodes and poisoning. Typically, aggregation rules are utilized to protect the model from such attacks in federated learning. The classical aggregation methods are Krum(\u00b7) and Mean(\u00b7), which, however, are not capable enough to deal with Byzantine attacks involving general deviations or multiple clients attacked at the same time. We propose new aggregation rules, Tmean(\u00b7), for the federated learning algorithm, and a federated learning framework based on this Byzantine-resilient aggregation algorithm. Our novel Tmean(\u00b7) rules are derived from Mean(\u00b7) by appropriately trimming some of the values before averaging them. Theoretically, we provide a rigorous proof and understanding of Tmean(\u00b7). 
Extensive experiments validate the effectiveness of our approaches.", "keywords": "Tmean;Byzantine attack;Byzantine-resilient", "primary_area": "", "supplementary_material": "", "author": "Wang Tian Xiang;Meiyue Shao;Yanwei Fu;Riheng Jia;Feilong Lin;Zhonglong Zheng", "authorids": "~Wang_Tian_Xiang1;myshao@fudan.edu.cn;~Yanwei_Fu2;~Riheng_Jia1;bruce_lin@zjnu.cn;zhonglong@zjnu.edu.cn", "gender": "M;;M;M;;", "homepage": ";;http://yanweifu.github.io;http://mypage.zjnu.edu.cn/JRH1/zh_CN/index.htm;;", "dblp": ";;63/9065;;;", "google_scholar": ";;https://scholar.google.co.uk/citations?user=Vg54TcsAAAAJ;;;", "orcid": "0000-0003-2220-5501;;0000-0002-6595-6893;;;", "linkedin": ";;;;;", "or_profile": "~Wang_Tian_Xiang1;myshao@fudan.edu.cn;~Yanwei_Fu2;~Riheng_Jia1;bruce_lin@zjnu.cn;zhonglong@zjnu.edu.cn", "aff": "zhejiang normal university;;Fudan University,;Zhejiang Normal University;;", "aff_domain": "zjnu.edu.cn;;fudan.edu.cn;zjnu.edu.cn;;", "position": "PhD student;;Professor;Associate Professor;;", "bibtex": "@misc{\nxiang2022federated,\ntitle={{FEDERATED} {LEARNING} {FRAMEWORK} {BASED} {ON} {TRIMMED} {MEAN} {AGGREGATION} {RULES}},\nauthor={Wang Tian Xiang and Meiyue Shao and Yanwei Fu and Riheng Jia and Feilong Lin and Zhonglong Zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=AUszBTiYBB6}\n}", "github": "", "project": "", "reviewers": "VRXh;j3Cv;AKLN", "site": "https://openreview.net/forum?id=AUszBTiYBB6", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;5;4", "correctness": "2;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "38;49;26", "wc_summary_review": "36;7;40", "wc_main_review": "428;185;253", "wc_review": "502;241;319", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "169;73;112", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 37.666666666666664, 9.392668535736913 ], "wc_summary_review_avg": [ 27.666666666666668, 14.70449666674185 ], "wc_main_review_avg": [ 288.6666666666667, 102.35993139679 ], "wc_review_avg": [ 354.0, 109.38921336219582 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 118.0, 39.42080668885405 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8204930796020891122&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Zhejiang Normal University;Fudan University", "aff_unique_dep": ";", "aff_unique_url": "http://www.zjnu.edu.cn;https://www.fudan.edu.cn", "aff_unique_abbr": "ZJNU;Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Shallow and Deep Networks are Near-Optimal Approximators of Korobov Functions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6865", "id": "AV8FPoMTTa", "poster": "", "openreview": "https://openreview.net/forum?id=AV8FPoMTTa", "slides": "https://iclr.cc/virtual/2022/poster/6865", "video": "https://iclr.cc/virtual/2022/poster/6865", 
"author_site": "Moise Blanchard, Mohammed Amine Bennouna", "tldr": "", "abstract": "In this paper, we analyze the number of neurons and training parameters that a neural network needs to approximate multivariate functions of bounded second mixed derivatives --- Korobov functions. We prove upper bounds on these quantities for shallow and deep neural networks, drastically lessening the curse of dimensionality. Our bounds hold for general activation functions, including ReLU. We further prove that these bounds nearly match the minimal number of parameters any continuous function approximator needs to approximate Korobov functions, showing that neural networks are near-optimal function approximators.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Moise Blanchard;Mohammed Amine Bennouna", "authorids": "~Moise_Blanchard1;~Mohammed_Amine_Bennouna1", "gender": "M;M", "homepage": "https://moiseb.github.io/;https://www.mit.edu/~amineben/", "dblp": "304/2559;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;-acf4MQAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Moise_Blanchard1;~Mohammed_Amine_Bennouna1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;PhD student", "bibtex": "@inproceedings{\nblanchard2022shallow,\ntitle={Shallow and Deep Networks are Near-Optimal Approximators of Korobov Functions},\nauthor={Moise Blanchard and Mohammed Amine Bennouna},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AV8FPoMTTa}\n}", "github": "", "project": "", "reviewers": "X13n;nqre;G4AZ;bQiP", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;4;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;0;0;0", "wc_summary_paper": "119;105;60;205", "wc_summary_review": "50;69;31;21", "wc_main_review": "177;255;237;255", "wc_review": "346;429;328;481", "wc_reply_reviewers": "18;15;0;0", "wc_reply_authors": "1016;1721;436;753", "reply_reviewers": "1;1;0;0", "reply_authors": "3;4;1;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 122.25, 52.5136886916164 ], "wc_summary_review_avg": [ 42.75, 18.38987493160299 ], "wc_main_review_avg": [ 231.0, 32.03123475609393 ], "wc_review_avg": [ 396.0, 62.12487424534556 ], "wc_reply_reviewers_avg": [ 8.25, 8.317902379807062 ], "wc_reply_authors_avg": [ 981.5, 473.7702502268373 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6074949531950043851&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=AV8FPoMTTa", "email": "mit.edu;mit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { 
"id": "AVPSfvFXqJy", "title": "Self-supervised Models are Good Teaching Assistants for Vision Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformers have shown remarkable progress on computer vision tasks in the past year. Compared to their CNN counterparts, transformers usually need the help of distillation to achieve comparable results on middle or small sized datasets. Meanwhile, recent researches discover that when transformers are trained with supervised and self-supervised manner respectively, the captured patterns are quite different both qualitatively and quantitatively. These findings motivate us to introduce an self-supervised teaching assistant (SSTA) besides the commonly used supervised teacher to improve the performance of transformers. Specifically, we propose a head-level knowledge distillation method that selects the most important head of the supervised teacher and self-supervised teaching assistant, and let the student mimic the attention distribution of these two heads, so as to make the student focus on the relationship between tokens deemed by the teacher and the teacher assistant. Extensive experiments verify the effectiveness of SSTA and demonstrate that the proposed SSTA is a good compensation to the supervised teacher. Meanwhile, some analytical experiments towards multiple perspectives (e.g. prediction, shape bias, robustness, and transferability to downstream tasks) with supervised teachers, self-supervised teaching assistants and students are inductive and may inspire future researches.", "keywords": "Transformer;Knowledge Distillation;Self-supervised Learning", "primary_area": "", "supplementary_material": "/attachment/0ab6c14d482900710ba217bf1996bbd8bc6938d2.zip", "author": "Haiyan Wu;Yuting Gao;Ke Li;Yinqi Zhang;Shaohui Lin;Yuan Xie;Xing Sun", "authorids": "~Haiyan_Wu3;~Yuting_Gao1;~Ke_Li4;~Yinqi_Zhang1;~Shaohui_Lin1;~Yuan_Xie5;~Xing_Sun1", "gender": "F;F;M;M;M;;M", "homepage": "https://scholar.google.com/citations?user=8_Npkp4AAAAJ&hl=zh-CN&oi=ao;https://scholar.google.com/citations?user=uk8ckNYAAAAJ&hl=zh-TW&oi=ao;http://keli.info;https://inchzhang.com/;https://sites.google.com/site/shaohuilin007/home;;https://www.sunxing.org", "dblp": ";;;74/2371;183/0917.html;;", "google_scholar": ";;mfWsFM0AAAAJ;HcxTFQQAAAAJ;k8AMa1kAAAAJ;;IUtix9IAAAAJ", "orcid": ";;0000-0001-7998-0731;0000-0003-4775-5147;0000-0003-0284-9940;;0000-0001-8132-9083", "linkedin": ";;;inchcheung;;;sunxings/", "or_profile": "~Haiyan_Wu3;~Yuting_Gao1;~Ke_Li4;~Yinqi_Zhang1;~Shaohui_Lin1;~Yuan_Xie5;~Xing_Sun1", "aff": "East China Normal University, Tsinghua University;Tencent Youtu Lab;Tencent;Jinan University;East China Normal University;;Tencent YouTu Lab", "aff_domain": "ecnu.edu.cn;tencent.com;tencent.com;jnu.edu.cn;ecnu.edu.cn;;tencent.com", "position": "MS student;Researcher;Principal Researcher;Undergrad student;Researcher;;Principal Researcher", "bibtex": "@misc{\nwu2022selfsupervised,\ntitle={Self-supervised Models are Good Teaching Assistants for Vision Transformers},\nauthor={Haiyan Wu and Yuting Gao and Ke Li and Yinqi Zhang and Shaohui Lin and Yuan Xie and Xing Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=AVPSfvFXqJy}\n}", "github": "", "project": "", "reviewers": "uB1e;FN8W;Vuzf;nKd7", "site": "https://openreview.net/forum?id=AVPSfvFXqJy", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "5;4;5;4", "correctness": "2;4;4;4", "technical_novelty": "3;1;4;4", "empirical_novelty": "2;1;4;3", "wc_summary_paper": "36;52;64;86", 
"wc_summary_review": "49;33;52;77", "wc_main_review": "255;210;150;329", "wc_review": "340;295;266;492", "wc_reply_reviewers": "95;220;0;0", "wc_reply_authors": "912;555;207;259", "reply_reviewers": "2;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.0, 1.224744871391589 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 59.5, 18.2414363469547 ], "wc_summary_review_avg": [ 52.75, 15.75396775418815 ], "wc_main_review_avg": [ 236.0, 65.3490627323759 ], "wc_review_avg": [ 348.25, 87.08149918323639 ], "wc_reply_reviewers_avg": [ 78.75, 90.30330835578506 ], "wc_reply_authors_avg": [ 483.25, 280.8810913892211 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10892407507586795078&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;0;1", "aff_unique_norm": "East China Normal University;Tencent;Jinan University", "aff_unique_dep": ";Youtu Lab;", "aff_unique_url": "http://www.ecnu.edu.cn;https://www.tencent.com;https://www.jnu.edu.cn", "aff_unique_abbr": "ECNU;Tencent;JNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "AVShGWiL9z", "title": "Tractable Dendritic RNNs for Identifying Unknown Nonlinear Dynamical Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "In many scientific disciplines, we are interested in inferring the nonlinear dynamical system underlying a set of observed time series, a challenging task in the face of chaotic behavior and noise. Previous deep learning approaches toward this goal often suffered from a lack of interpretability and tractability. In particular, the high-dimensional latent spaces often required for a faithful embedding, even when the underlying dynamics lives on a lower-dimensional manifold, can hamper theoretical analysis. Motivated by the emerging principles of dendritic computation, we augment a dynamically interpretable and mathematically tractable piecewise-linear (PL) recurrent neural network (RNN) by a linear spline basis expansion. We show that this approach retains all the theoretically appealing properties of the simple PLRNN, yet boosts its capacity for approximating arbitrary nonlinear dynamical systems in comparatively low dimensions. We introduce two frameworks for training the system, one based on fast and scalable variational inference, and another combining BPTT with teacher forcing. 
We show that the dendritically expanded PLRNN achieves better reconstructions with fewer parameters and dimensions on various dynamical systems benchmarks and compares favorably to other methods, while retaining a tractable and interpretable structure.", "keywords": "chaos;dendritic computation;piecewise linear;recurrent neural network;variational inference;interpretability;tractability;basis expansion", "primary_area": "", "supplementary_material": "", "author": "Manuel Brenner;Leonard Bereska;Jonas Magdy Mikhaeil;Florian Hess;Zahra Monfared;Po-Chen Kuo;Daniel Durstewitz", "authorids": "~Manuel_Brenner1;~Leonard_Bereska1;~Jonas_Magdy_Mikhaeil1;~Florian_Hess1;~Zahra_Monfared1;~Po-Chen_Kuo1;~Daniel_Durstewitz1", "gender": "M;Not Specified;M;M;F;M;", "homepage": ";https://leonardbereska.github.io/;;https://www.zi-mannheim.de/forschung/abteilungen-ags-institute/theoret-neurowissenschaften/infos-theor-neurowiss.html;;;https://durstewitzlab.github.io", "dblp": "323/8935;238/0441;;;;143/6904;98/2120", "google_scholar": "HCUeyg8AAAAJ;HAnhujQAAAAJ;;nOZM-1AAAAAJ;https://scholar.google.pl/citations?user=OPUIwIoAAAAJ;1-XqW4IAAAAJ;https://scholar.google.de/citations?user=2bcbKU0AAAAJ", "orcid": ";;0000-0001-6745-7505;;;0000-0001-6151-6404;0000-0002-9340-3786", "linkedin": "manuel-brenner-772261191/;leonard-bereska/;;;;;", "or_profile": "~Manuel_Brenner1;~Leonard_Bereska1;~Jonas_Magdy_Mikhaeil1;~Florian_Hess1;~Zahra_Monfared1;~Po-Chen_Kuo1;~Daniel_Durstewitz1", "aff": "Heidelberg University;University of Amsterdam;Heidelberg University;Ruprecht-Karls-Universit\u00e4t Heidelberg;Heidelberg University(STRUCTURES);University of Washington;Heidelberg University", "aff_domain": "uni-heidelberg.de;uva.nl;uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de;uw.edu;uni-heidelberg.de", "position": "PhD student;PhD student;MS student;MS student;Postdoc;PhD student;Full Professor", "bibtex": "@misc{\nbrenner2022tractable,\ntitle={Tractable Dendritic {RNN}s for Identifying Unknown Nonlinear Dynamical Systems},\nauthor={Manuel Brenner and Leonard Bereska and Jonas Magdy Mikhaeil and Florian Hess and Zahra Monfared and Po-Chen Kuo and Daniel Durstewitz},\nyear={2022},\nurl={https://openreview.net/forum?id=AVShGWiL9z}\n}", "github": "", "project": "", "reviewers": "caaB;B5hP;VV4w", "site": "https://openreview.net/forum?id=AVShGWiL9z", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "3;2;4", "technical_novelty": "2;2;4", "empirical_novelty": "1;2;4", "wc_summary_paper": "44;53;54", "wc_summary_review": "14;56;33", "wc_main_review": "299;271;127", "wc_review": "357;380;214", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "518;971;223", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_summary_paper_avg": [ 50.333333333333336, 4.4969125210773475 ], "wc_summary_review_avg": [ 34.333333333333336, 17.172329163188344 ], "wc_main_review_avg": [ 232.33333333333334, 75.35397947170556 ], "wc_review_avg": [ 317.0, 73.43477831835993 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 570.6666666666666, 307.63217574817423 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], 
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3273268353539886, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16908768724542254919&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;0;3;0", "aff_unique_norm": "Heidelberg University;University of Amsterdam;Ruprecht-Karls-Universit\u00e4t Heidelberg;University of Washington", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uni-heidelberg.de;https://www.uva.nl;https://www.uni-heidelberg.de/;https://www.washington.edu", "aff_unique_abbr": "Uni Heidelberg;UvA;Uni Heidelberg;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;2;0", "aff_country_unique": "Germany;Netherlands;United States" }, { "title": "Retriever: Learning Content-Style Representation as a Token-Level Bipartite Graph", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5926", "id": "AXWygMvuT6Q", "poster": "", "openreview": "https://openreview.net/forum?id=AXWygMvuT6Q", "slides": "https://iclr.cc/virtual/2022/poster/5926", "video": "https://iclr.cc/virtual/2022/poster/5926", "author_site": "Dacheng Yin, Xuanchi Ren, Chong Luo, Yuwang Wang, Zhiwei Xiong, Wenjun Zeng", "tldr": "", "abstract": "This paper addresses the unsupervised learning of content-style decomposed representation. We first give a definition of style and then model the content-style representation as a token-level bipartite graph. An unsupervised framework, named Retriever, is proposed to learn such representations. First, a cross-attention module is employed to retrieve permutation invariant (P.I.) information, defined as style, from the input data. Second, a vector quantization (VQ) module is used, together with man-induced constraints, to produce interpretable content tokens. Last, an innovative link attention module serves as the decoder to reconstruct data from the decomposed content and style, with the help of the linking keys. Being modal-agnostic, the proposed Retriever is evaluated in both speech and image domains. The state-of-the-art zero-shot voice conversion performance confirms the disentangling ability of our framework. Top performance is also achieved in the part discovery task for images, verifying the interpretability of our representation. In addition, the vivid part-based style transfer quality demonstrates the potential of Retriever to support various fascinating generative tasks. 
Project page at https://ydcustc.github.io/retriever-demo/.", "keywords": "Content-style decomposed representation;Zero-shot voice conversion;Style transfer;Transformer;Unsupervised learning", "primary_area": "", "supplementary_material": "/attachment/24e2b696432ca001c85230322a935ce35db1d5dc.zip", "author": "Dacheng Yin;Xuanchi Ren;Chong Luo;Yuwang Wang;Zhiwei Xiong;Wenjun Zeng", "authorids": "~Dacheng_Yin1;~Xuanchi_Ren1;~Chong_Luo1;~Yuwang_Wang3;~Zhiwei_Xiong1;~Wenjun_Zeng3", "gender": "M;M;F;M;M;M", "homepage": ";https://xuanchiren.com/;https://www.microsoft.com/en-us/research/people/cluo/;;;https://www.eias.ac.cn/h-col-187.html", "dblp": "254/0985;255/5432;79/3712;161/2633;54/6827;57/145", "google_scholar": "https://scholar.google.com/citations?hl=en;fDHUk18AAAAJ;01iBf38AAAAJ;;Snl0HPEAAAAJ;_cUfvYQAAAAJ", "orcid": ";;0000-0003-0939-474X;;;", "linkedin": ";;;;;", "or_profile": "~Dacheng_Yin1;~Xuanchi_Ren1;~Chong_Luo1;~Yuwang_Wang3;~Zhiwei_Xiong1;~Wenjun_Zeng3", "aff": "University of Science and Technology of China;Hong Kong University of Science and Technology;Microsoft Research Asia;Microsoft Research Asia;USTC;Eastern Institute for Advanced Study", "aff_domain": "ustc.edu;hkust.edu;microsoft.com;microsoft.com;ustc.edu.cn;eias.ac.cn", "position": "PhD student;Undergrad student;Principal Researcher;Researcher;Professor;Full Professor", "bibtex": "@inproceedings{\nyin2022retriever,\ntitle={Retriever: Learning Content-Style Representation as a Token-Level Bipartite Graph},\nauthor={Dacheng Yin and Xuanchi Ren and Chong Luo and Yuwang Wang and Zhiwei Xiong and Wenjun Zeng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AXWygMvuT6Q}\n}", "github": "", "project": "", "reviewers": "c2Ut;1d43;WCnU", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;4", "correctness": "3;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "0;2;4", "wc_summary_paper": "67;47;53", "wc_summary_review": "59;21;38", "wc_main_review": "132;272;473", "wc_review": "258;340;564", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "543;656;679", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.632993161855452 ], "wc_summary_paper_avg": [ 55.666666666666664, 8.379870059984356 ], "wc_summary_review_avg": [ 39.333333333333336, 15.542057635833023 ], "wc_main_review_avg": [ 292.3333333333333, 139.95316676977657 ], "wc_review_avg": [ 387.3333333333333, 129.32989686156185 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 626.0, 59.43624034767565 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17348549797304685480&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=AXWygMvuT6Q", "email": "ustc.edu;hkust.edu;microsoft.com;microsoft.com;ustc.edu.cn;eias.ac.cn", "author_num": 6, "aff_unique_index": "0;1;2;2;0;3", "aff_unique_norm": "University of Science and Technology of China;Hong Kong University of Science and Technology;Microsoft;Eastern Institute for Advanced Study", 
"aff_unique_dep": ";;Research;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ust.hk;https://www.microsoft.com/en-us/research/group/asia;", "aff_unique_abbr": "USTC;HKUST;MSR Asia;", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Hong Kong SAR;Asia", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China;" }, { "id": "AXXohj2qWlw", "title": "Discovering Novel Customer Features with Recurrent Neural Networks for Personality Based Financial Services", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The micro-segmentation of customers in the finance sector is a non-trivial task and has been an atypical omission from recent scientific literature. Where traditional segmentation classifies customers based on coarse features such as demographics, micro-segmentation depicts more nuanced differences between individuals, bringing forth several advantages including the potential for improved personalization in financial services. AI and representation learning offer a unique opportunity to solve the problem of micro-segmentation. Although ubiquitous in many industries, the proliferation of AI in sensitive industries such as finance has become contingent on the imperatives of responsible AI. We had previously solved the micro-segmentation problem by extracting temporal features from the state space of a recurrent neural network (RNN). However, due to the inherent opacity of RNNs our solution lacked an explanation - one of the imperatives of responsible AI. In this study, we address this issue by extracting an explanation for and providing an interpretation of our temporal features. We investigate the state space of our RNN and through a linear regression model reconstruct the trajectories in the state space with high fidelity. We show that our linear regression coefficients have not only learned the rules used to create the RNN\u2019s output data but have also learned the relationships that were not directly evident in the raw data.", "keywords": "AI in finance;micro-segmentation;personality traits;explainability;recurrent neural networks;trajectory clustering;attractors", "primary_area": "", "supplementary_material": "", "author": "Charl Maree;Christian W. Omlin", "authorids": "~Charl_Maree2;~Christian_W._Omlin1", "gender": ";", "homepage": ";", "dblp": ";25/2392", "google_scholar": "hqar1hgAAAAJ;", "orcid": "my-orcid?orcid=0000-0002-7282-4661;", "linkedin": "charlmaree/;", "or_profile": "~Charl_Maree2;~Christian_W._Omlin1", "aff": "University of Agder;", "aff_domain": "uia.no;", "position": "PhD student;", "bibtex": "@misc{\nmaree2022discovering,\ntitle={Discovering Novel Customer Features with Recurrent Neural Networks for Personality Based Financial Services},\nauthor={Charl Maree and Christian W. 
Omlin},\nyear={2022},\nurl={https://openreview.net/forum?id=AXXohj2qWlw}\n}", "github": "", "project": "", "reviewers": "sbVx;qCF2;Dnr2;Sccv", "site": "https://openreview.net/forum?id=AXXohj2qWlw", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;4;4;4", "correctness": "2;2;3;2", "technical_novelty": "2;1;1;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "47;22;65;62", "wc_summary_review": "26;32;16;19", "wc_main_review": "132;270;172;241", "wc_review": "205;324;253;322", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.0, 17.014699527173555 ], "wc_summary_review_avg": [ 23.25, 6.219927652312364 ], "wc_main_review_avg": [ 203.75, 54.61856369404087 ], "wc_review_avg": [ 276.0, 49.97499374687305 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8371593937482940483&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Agder", "aff_unique_dep": "", "aff_unique_url": "https://www.uia.no", "aff_unique_abbr": "UiA", "aff_country_unique_index": "0", "aff_country_unique": "Norway" }, { "id": "AawMbgacl0t", "title": "Image Functions In Neural Networks: A Perspective On Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we show that training with SGD on ReLU neural networks gives rise to a natural set of functions for each image that are not perfectly correlated until later in training. Furthermore, we show experimentally that the intersection of paths for different images also changes during the course of training. We hypothesize that this lack of correlation and changing intersection may be a factor in explaining generalization, because it encourages the model to use different features at different times, and pass the same image through different functions during training. This may improve generalization in two ways. 1) By encouraging the model to learn the same image in different ways, and learn different commonalities between images, comparable to model ensembling. 
2) By improving algorithmic stability, as for a particular feature, the model is not always reliant on the same set of images, so the removal of an image may not adversely affect the loss.", "keywords": "generalization;ensembling;algorithmic stability", "primary_area": "", "supplementary_material": "/attachment/65366a952fa3995b970febaca54ab1738540ec74.zip", "author": "Arushi Gupta", "authorids": "~Arushi_Gupta1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Arushi_Gupta1", "aff": "Department of Computer Science, Princeton University", "aff_domain": "cs.princeton.edu", "position": "PhD student", "bibtex": "@misc{\ngupta2022image,\ntitle={Image Functions In Neural Networks: A Perspective On Generalization},\nauthor={Arushi Gupta},\nyear={2022},\nurl={https://openreview.net/forum?id=AawMbgacl0t}\n}", "github": "", "project": "", "reviewers": "fEgL;ec11;M7Dz;ypTe", "site": "https://openreview.net/forum?id=AawMbgacl0t", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;2;4;3", "correctness": "2;2;2;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "35;75;41;123", "wc_summary_review": "39;21;17;124", "wc_main_review": "402;484;377;493", "wc_review": "476;580;435;740", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 68.5, 34.96784236981173 ], "wc_summary_review_avg": [ 50.25, 43.37842205521081 ], "wc_main_review_avg": [ 439.0, 50.38352905464245 ], "wc_review_avg": [ 557.75, 117.75053078436632 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:D-C3MhHAS7kJ:scholar.google.com/&scioq=Image+Functions+In+Neural+Networks:+A+Perspective+On+Generalization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "Ab0o8YMJ8a", "title": "Automated Channel Pruning with Learned Importance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network pruning allows for significant reduction of model size and latency. However, most of the current network pruning methods do not consider channel interdependencies and a lot of manual adjustments are required before they can be applied to new network architectures. Moreover, these algorithms are often based on hand-picked, sometimes complicated heuristics and can require thousands of GPU computation hours. In this paper, we introduce a simple neural network pruning and fine-tuning framework that requires no manual heuristics, is highly efficient to train (2-6 times speed up compared to NAS-based competitors) and produces comparable performance. 
The framework contains 1) an automatic channel detection algorithm that groups the interdependent blocks of channels; 2) a non-iterative pruning algorithm that learns channel importance directly from feature maps while masking the coupled computational blocks using Gumbel-Softmax sampling; and 3) a hierarchical knowledge distillation approach to fine-tune the pruned neural networks. We validate our pipeline on ImageNet classification, human segmentation and image denoising, creating lightweight and low-latency models, easy to deploy on mobile devices. Using our pruning algorithm and hierarchical knowledge distillation for fine-tuning, we are able to prune EfficientNet B0, EfficientNetV2 B0 and MobileNetV2 to 75% of their original FLOPs with no loss of accuracy on ImageNet. We release a set of pruned backbones as Keras models - all of them proved beneficial when deployed in other projects.", "keywords": "channel pruning;knowledge distillation", "primary_area": "", "supplementary_material": "", "author": "\u0141ukasz Treszczotko;Pawel Kubik", "authorids": "~\u0141ukasz_Treszczotko1;~Pawel_Kubik1", "gender": "M;M", "homepage": ";", "dblp": "263/3986.html;", "google_scholar": ";", "orcid": ";my-orcid?orcid=0000-0002-1229-5286", "linkedin": "%C5%82ukasz-treszczotko-77653b14b/;pkubik/", "or_profile": "~\u0141ukasz_Treszczotko1;~Pawel_Kubik1", "aff": "TCL Research Europe;Warsaw University of Technology", "aff_domain": "tcl-research.pl;pw.edu.pl", "position": "Researcher;PhD student", "bibtex": "@misc{\ntreszczotko2022automated,\ntitle={Automated Channel Pruning with Learned Importance},\nauthor={{\\L}ukasz Treszczotko and Pawel Kubik},\nyear={2022},\nurl={https://openreview.net/forum?id=Ab0o8YMJ8a}\n}", "github": "", "project": "", "reviewers": "b957;Www2;Gsnx", "site": "https://openreview.net/forum?id=Ab0o8YMJ8a", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;5;3", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "57;89;39", "wc_summary_review": "31;54;4", "wc_main_review": "421;209;92", "wc_review": "509;352;135", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 61.666666666666664, 20.677416559027762 ], "wc_summary_review_avg": [ 29.666666666666668, 20.43417616532547 ], "wc_main_review_avg": [ 240.66666666666666, 136.1673806590828 ], "wc_review_avg": [ 332.0, 153.3384057132024 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0f9ysA119YQJ:scholar.google.com/&scioq=Automated+Channel+Pruning+with+Learned+Importance&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "TCL Communication;Warsaw University of Technology", "aff_unique_dep": "TCL Research;", "aff_unique_url": "https://www.tclcommunication.com/;https://www.pw.edu.pl", "aff_unique_abbr": "TCL;WUT", "aff_campus_unique_index": "", "aff_campus_unique": "",
"aff_country_unique_index": "0;1", "aff_country_unique": "Unknown;Poland" }, { "title": "A Reduction-Based Framework for Conservative Bandits and Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7122", "id": "AcrlgZ9BKed", "poster": "", "openreview": "https://openreview.net/forum?id=AcrlgZ9BKed", "slides": "https://iclr.cc/virtual/2022/poster/7122", "video": "https://iclr.cc/virtual/2022/poster/7122", "author_site": "Yunchang Yang, Tianhao Wu, Han Zhong, Evrard Garcelon, Matteo Pirotta, Alessandro Lazaric, Liwei Wang, Simon Du", "tldr": "", "abstract": "We study bandits and reinforcement learning (RL) subject to a conservative constraint where the agent is asked to perform at least as well as a given baseline policy. This setting is particular relevant in real-world domains including digital marketing, healthcare, production, finance, etc. In this paper, we present a reduction-based framework for conservative bandits and RL, in which our core technique is to calculate the necessary and sufficient budget obtained from running the baseline policy. For lower bounds, we improve the existing lower bound for conservative multi-armed bandits and obtain new lower bounds for conservative linear bandits, tabular RL and low-rank MDP, through a black-box reduction that turns a certain lower bound in the nonconservative setting into a new lower bound in the conservative setting. For upper bounds, in multi-armed bandits, linear bandits and tabular RL, our new upper bounds tighten or match existing ones with significantly simpler analyses. We also obtain a new upper bound for conservative low-rank MDP.", "keywords": "bandits;lower bound;reinforcement learning theory", "primary_area": "", "supplementary_material": "/attachment/45ba5a4be1d2fc184a6934b192f04e6bb712b358.zip", "author": "Yunchang Yang;Tianhao Wu;Han Zhong;Evrard Garcelon;Matteo Pirotta;Alessandro Lazaric;Liwei Wang;Simon Shaolei Du", "authorids": "~Yunchang_Yang2;~Tianhao_Wu1;~Han_Zhong1;~Evrard_Garcelon1;~Matteo_Pirotta1;~Alessandro_Lazaric2;~Liwei_Wang1;~Simon_Shaolei_Du1", "gender": "M;M;;M;;M;M;M", "homepage": ";https://thwu1.github.io/tianhaowu/;https://hanzhong-ml.github.io/;;;;http://www.liweiwang-pku.com/;http://simonshaoleidu.com", "dblp": "249/8267;;137/8096.html;;137/3249;36/321;;176/5602", "google_scholar": "https://scholar.google.com.hk/citations?user=m8m9nD0AAAAJ;df-THM0AAAAJ;Bk5q_pAAAAAJ;cKtU3eAAAAAJ;https://scholar.google.ca/citations?user=6qWcDTAAAAAJ;6JZ3R6wAAAAJ;VZHxoh8AAAAJ;OttawxUAAAAJ", "orcid": ";;;;;;;", "linkedin": ";tianhao-wu-b069296/;;evrard-garcelon-51219412a/;;;;", "or_profile": "~Yunchang_Yang2;~Tianhao_Wu1;~Han_Zhong1;~Evrard_Garcelon1;~Matteo_Pirotta1;~Alessandro_Lazaric2;~Liwei_Wang1;~Simon_Shaolei_Du1", "aff": "Peking University;University of California, Berkeley;Peking University;Ensae ParisTech;Meta;Meta Facebook;Peking University;Meta Facebook", "aff_domain": "pku.edu.cn;berkeley.edu;stu.pku.edu.cn;ensae.fr;meta.com;fb.com;pku.edu.cn;fb.com", "position": "PhD student;PhD student;PhD student;Phd;Research Scientist;Research Scientist;Full Professor;Visiting Professor", "bibtex": "@inproceedings{\nyang2022a,\ntitle={A Reduction-Based Framework for Conservative Bandits and Reinforcement Learning},\nauthor={Yunchang Yang and Tianhao Wu and Han Zhong and Evrard Garcelon and Matteo Pirotta and Alessandro Lazaric and Liwei Wang and Simon Shaolei Du},\nbooktitle={International Conference on Learning 
Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AcrlgZ9BKed}\n}", "github": "", "project": "", "reviewers": "nbyJ;wDiu;g4nC;yivy", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "1;0;0;3", "wc_summary_paper": "52;66;217;66", "wc_summary_review": "71;51;13;125", "wc_main_review": "523;69;219;219", "wc_review": "646;186;449;410", "wc_reply_reviewers": "13;0;0;0", "wc_reply_authors": "691;430;389;475", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 100.25, 67.64752397538287 ], "wc_summary_review_avg": [ 65.0, 40.422765862815474 ], "wc_main_review_avg": [ 257.5, 165.06589593250328 ], "wc_review_avg": [ 422.75, 163.3575449742068 ], "wc_reply_reviewers_avg": [ 3.25, 5.629165124598851 ], "wc_reply_authors_avg": [ 496.25, 116.480416809007 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3224136827881645269&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=AcrlgZ9BKed", "email": "pku.edu.cn;berkeley.edu;stu.pku.edu.cn;ensae.fr;meta.com;fb.com;pku.edu.cn;fb.com", "author_num": 8, "aff_unique_index": "0;1;0;2;3;3;0;3", "aff_unique_norm": "Peking University;University of California, Berkeley;ENSAE ParisTech;Meta", "aff_unique_dep": ";;;Meta Platforms, Inc.", "aff_unique_url": "http://www.pku.edu.cn;https://www.berkeley.edu;https://www.ensae.fr;https://meta.com", "aff_unique_abbr": "Peking U;UC Berkeley;Ensae;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;2;1;1;0;1", "aff_country_unique": "China;United States;France" }, { "id": "AdEM_SzfSd", "title": "Assessing two novel distance-based loss functions for few-shot image classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Few-shot learning is a challenging area of research which aims to learn new concepts with only a few labeled samples of data. Recent works based on metric-learning approaches benefit from the meta-learning process, in which we have episodic tasks composed of a support set (training) and a query set (test), and the objective is to learn a similarity comparison metric between those sets. Due to the lack of data, the learning process of the embedding network becomes an important part of the few-shot task. In this work, we propose two different loss functions which consider the importance of the embedding vectors by looking at the intra-class and inter-class distances among the few available data. The first loss function is the Proto-Triplet Loss, which is based on the original triplet loss with the modifications needed to work better in few-shot scenarios. The second loss function is based on an inter- and intra-class nearest neighbors score, which helps us to assess the quality of embeddings obtained from the trained network.
Extensive experimental results on the miniImageNet benchmark show an accuracy improvement of $2\\%$ over other metric-based few-shot learning methods, demonstrating the capability of these loss functions to help the network generalize better to previously unseen classes.", "keywords": "Metric learning;few-shot learning;image classification", "primary_area": "", "supplementary_material": "", "author": "Mauricio Mendez Ruiz;Gilberto Ochoa Ruiz;Andres Mendez Vazquez;Jorge Gonzalez-Zapata", "authorids": "~Mauricio_Mendez_Ruiz1;~Gilberto_Ochoa_Ruiz1;andres.mendez@cinvestav.mx;jorge.gonzalezzapata@cinvestav.mx", "gender": "M;M;;", "homepage": "https://github.com/maumruiz;http://beton-ochoa.github.io;;", "dblp": ";77/7138;;", "google_scholar": ";DDtiliwAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Mauricio_Mendez_Ruiz1;~Gilberto_Ochoa_Ruiz1;andres.mendez@cinvestav.mx;jorge.gonzalezzapata@cinvestav.mx", "aff": ";Tecnologico de Monterrey;;", "aff_domain": ";tec.mx;;", "position": ";Assistant Professor;;", "bibtex": "@misc{\nruiz2022assessing,\ntitle={Assessing two novel distance-based loss functions for few-shot image classification},\nauthor={Mauricio Mendez Ruiz and Gilberto Ochoa Ruiz and Andres Mendez Vazquez and Jorge Gonzalez-Zapata},\nyear={2022},\nurl={https://openreview.net/forum?id=AdEM_SzfSd}\n}", "github": "", "project": "", "reviewers": "TCLP;dCJD;noXn;6rdQ", "site": "https://openreview.net/forum?id=AdEM_SzfSd", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;5;4;4", "correctness": "3;2;2;2", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "112;80;48;70", "wc_summary_review": "41;20;41;32", "wc_main_review": "173;209;144;146", "wc_review": "326;309;233;248", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.5, 23.038012067016545 ], "wc_summary_review_avg": [ 33.5, 8.616843969807043 ], "wc_main_review_avg": [ 168.0, 26.296387584609413 ], "wc_review_avg": [ 279.0, 39.32556420447137 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pbT-KICHFakJ:scholar.google.com/&scioq=Assessing+two+novel+distance-based+loss+functions+for+few-shot+image+classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Tecnologico de Monterrey", "aff_unique_dep": "", "aff_unique_url": "https://www.tec.mx", "aff_unique_abbr": "Tec de Monterrey", "aff_country_unique_index": "0", "aff_country_unique": "Mexico" }, { "id": "AfeeU77SUx", "title": "Learning with Few-Shot Complementary Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Complementary-label (CL) learning deals with the weak supervision scenario where each training instance is associated with one complementary label, which specifies the class label that the instance does not belong to.
Since these CL algorithms rely on the assumption of a large amount of labeled/unlabeled training data, they cannot be applied to few-shot scenarios while still performing well. To bridge the gap, we propose a Few-shot Complementary-Label (FsCL) training pattern with three kinds of surrogate loss, based on Model-Agnostic Meta-Learning (MAML) and bilevel optimization. In the outer loop, FsCL first adjusts the inductive bias of the meta-learner to guard against the misleading signal of complementary labels and the insufficient sample diversity. Next, the inner loop is used to solve the target FsCL classification problem with the base learner initialized from the meta-learner. Accordingly, an unseen example can be precisely classified via the maximum-probability output of the base learner. We demonstrate the effectiveness of our approach in an extensive empirical study and theoretical analysis.", "keywords": "Complementary-label learning;Few-shot learning", "primary_area": "", "supplementary_material": "/attachment/63411b8f6d77e4245144df338959936acb79ef4a.zip", "author": "Qiufeng Wang", "authorids": "~Qiufeng_Wang3", "gender": "M", "homepage": "http://palm.seu.edu.cn/homepage/wangqiufeng/demo/index.html", "dblp": "", "google_scholar": "HQYQkTwAAAAJ", "orcid": "0000-0001-7680-6607", "linkedin": "", "or_profile": "~Qiufeng_Wang3", "aff": "Southeast University", "aff_domain": "seu.edu.cn", "position": "PhD student", "bibtex": "@misc{\nwang2022learning,\ntitle={Learning with Few-Shot Complementary Labels},\nauthor={Qiufeng Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=AfeeU77SUx}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=AfeeU77SUx", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jfLv04QAe-cJ:scholar.google.com/&scioq=Learning+with+Few-Shot+Complementary+Labels&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "AgDwZa1AiJt", "title": "When in Doubt, Summon the Titans: A Framework for Efficient Inference with Large Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Scaling neural networks to \"large\" sizes, with billions of parameters, has been shown to yield impressive results on many challenging problems. However, the inference cost incurred by such large models often prevents their application in most real-world settings.
In this paper, we propose a two-stage framework based on distillation that realizes the modelling benefits of the large models, while largely preserving the computational benefits of inference with more lightweight models. In a nutshell, we use the large teacher models to guide the lightweight student models to only make correct predictions on a subset of \"easy\" examples; for the \"hard\" examples, we fall back to the teacher. Such an approach allows us to efficiently employ large models in practical scenarios where easy examples are much more frequent than rare hard examples. Our proposed use of distillation to only handle easy instances allows for a more aggressive trade-off in the student size, thereby reducing the amortized cost of inference and achieving better accuracy than standard distillation. Empirically, we demonstrate the benefits of our approach on both image classification and natural language processing benchmarks.", "keywords": "Distillation;Large models;Efficient inference", "primary_area": "", "supplementary_material": "", "author": "Ankit Singh Rawat;Manzil Zaheer;Aditya Krishna Menon;Amr Ahmed;Sanjiv Kumar", "authorids": "~Ankit_Singh_Rawat1;~Manzil_Zaheer1;~Aditya_Krishna_Menon1;~Amr_Ahmed1;~Sanjiv_Kumar1", "gender": "M;M;M;;M", "homepage": "https://ankitsrawat.github.io/home/;https://www.aclweb.org/anthology/people/m/manzil-zaheer/;https://research.google/people/AmrAhmed/;http://www.sanjivk.com/;https://akmenon.github.io/", "dblp": "https://dblp.org/pers/hd/r/Rawat:Ankit_Singh;40/10701;49/2951;;89/3514", "google_scholar": "http://scholar.google.com/citations?user=U0_ab4cAAAAJ;A33FhJMAAAAJ;ivUi2T0AAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;", "linkedin": ";;amr-ahmed-b998965/;;", "or_profile": "~Ankit_Singh_Rawat1;~Manzil_Zaheer1;~Amr_Ahmed1;~Sanjiv_Kumar1;~Aditya_Menon1", "aff": "Google;Google DeepMind;;Google;Google", "aff_domain": "google.com;deepmind.com;;google.com;google.com", "position": "Research Scientist;Researcher;;Research Scientist;Research Scientist", "bibtex": "@misc{\nrawat2022when,\ntitle={When in Doubt, Summon the Titans: A Framework for Efficient Inference with Large Models},\nauthor={Ankit Singh Rawat and Manzil Zaheer and Aditya Krishna Menon and Amr Ahmed and Sanjiv Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=AgDwZa1AiJt}\n}", "github": "", "project": "", "reviewers": "npvb;wETK;sKmp;nhWf", "site": "https://openreview.net/forum?id=AgDwZa1AiJt", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;3;4", "correctness": "2;3;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "51;45;184;33", "wc_summary_review": "29;24;38;132", "wc_main_review": "154;169;262;78", "wc_review": "234;238;484;243", "wc_reply_reviewers": "0;0;20;8", "wc_reply_authors": "618;731;782;224", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.25, 61.39778090452455 ], "wc_summary_review_avg": [ 55.75, 44.30787176112163 ], "wc_main_review_avg": [ 165.75, 65.40785503286284 ], "wc_review_avg": [ 299.75, 106.42456248441898 ], "wc_reply_reviewers_avg": [ 7.0, 8.18535277187245 ], "wc_reply_authors_avg": [ 588.75, 218.7914246491393 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ],
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xrqZ_AsDROMJ:scholar.google.com/&scioq=When+in+Doubt,+Summon+the+Titans:+A+Framework+for+Efficient+Inference+with+Large+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Learning Efficient Image Super-Resolution Networks via Structure-Regularized Pruning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5963", "id": "AjGC97Aofee", "poster": "", "openreview": "https://openreview.net/forum?id=AjGC97Aofee", "slides": "https://iclr.cc/virtual/2022/poster/5963", "video": "https://iclr.cc/virtual/2022/poster/5963", "author_site": "Yulun Zhang, Huan Wang, Can Qin, Yun Fu", "tldr": "", "abstract": "Several image super-resolution (SR) networks have been proposed of late for efficient SR, achieving promising results. However, they are still not lightweight enough and neglect to be extended to larger networks. At the same time, model compression techniques, like neural architecture search and knowledge distillation, typically consume considerable computation resources. In contrast, network pruning is a cheap and effective model compression technique. However, it is hard to be applied to SR networks directly because filter pruning for residual blocks is well-known tricky. To address the above issues, we propose structure-regularized pruning (SRP), which imposes regularization on the pruned structure to ensure the locations of pruned filters are aligned across different layers. Specifically, for the layers connected by the same residual, we select the filters of the same indices as unimportant filters. To transfer the expressive power in the unimportant filters to the rest of the network, we employ $L_2$ regularization to drive the weights towards zero so that eventually, their absence will cause minimal performance degradation. We apply SRP to train efficient image SR networks, resulting in a lightweight network SRPN-Lite and a very deep one SRPN. We conduct extensive comparisons with both lightweight and larger networks. 
SRPN-Lite and SRPN perform favorably against other recent efficient SR approaches quantitatively and visually.", "keywords": "image super-resolution", "primary_area": "", "supplementary_material": "", "author": "Yulun Zhang;Huan Wang;Can Qin;Yun Fu", "authorids": "~Yulun_Zhang1;~Huan_Wang3;~Can_Qin1;~Yun_Fu1", "gender": "M;M;M;M", "homepage": "http://yulunzhang.com/;https://huanwang.tech/;http://canqin.tech;http://www1.ece.neu.edu/~yunfu/", "dblp": "166/2763-1.html;70/6155-14;214/2488;00/5815-1", "google_scholar": "ORmLjWoAAAAJ;0-On0y4AAAAJ;QCik-YcAAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": "0000-0002-2288-5079;0000-0001-6951-901X;;0000-0002-5098-2853", "linkedin": "yulun-zhang-1116b5b9/;huanwang-zju/;;furaymond/", "or_profile": "~Yulun_Zhang1;~Huan_Wang3;~Can_Qin1;~Yun_Fu1", "aff": "Swiss Federal Institute of Technology;Northeastern University;Northeastern University;Northeastern University", "aff_domain": "ethz.ch;neu.edu;neu.edu;northeastern.edu", "position": "Postdoc;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhang2022learning,\ntitle={Learning Efficient Image Super-Resolution Networks via Structure-Regularized Pruning},\nauthor={Yulun Zhang and Huan Wang and Can Qin and Yun Fu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AjGC97Aofee}\n}", "github": "", "project": "", "reviewers": "N1G5;xzfp;t8M1;yDGk", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;4;5", "correctness": "3;1;4;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "68;56;200;34", "wc_summary_review": "32;15;11;63", "wc_main_review": "237;253;153;316", "wc_review": "337;324;364;413", "wc_reply_reviewers": "55;0;0;98", "wc_reply_authors": "1061;1262;393;391", "reply_reviewers": "1;0;0;1", "reply_authors": "4;2;1;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 89.5, 64.9519052838329 ], "wc_summary_review_avg": [ 30.25, 20.48627589387588 ], "wc_main_review_avg": [ 239.75, 58.14367979411004 ], "wc_review_avg": [ 359.5, 34.09178786746157 ], "wc_reply_reviewers_avg": [ 38.25, 41.16050898616294 ], "wc_reply_authors_avg": [ 776.75, 391.2584663620712 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": 0.4856618642571827, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3162888533545125217&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=AjGC97Aofee", "email": "ethz.ch;neu.edu;neu.edu;northeastern.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.northeastern.edu", "aff_unique_abbr": "ETH Zurich;NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Switzerland;United States" }, { "id": "AkJyAE46GA", "title": "Pretrained models are active learners", "track": "main", "status": "Reject", "tldr": "", "abstract": "An important barrier to the 
safe deployment of machine learning systems is the risk of \\emph{task ambiguity}, where multiple behaviors are consistent with the provided examples. We investigate whether pretrained models are better active learners, capable of asking for example labels that \\textit{disambiguate} between the possible tasks a user may be trying to specify. Across a range of image and text datasets with spurious correlations, latent minority groups, or domain shifts, finetuning pretrained models with data acquired through simple uncertainty sampling achieves the same accuracy with \\textbf{up to 6$\\times$ fewer labels} compared to random sampling. Moreover, the examples chosen by these models are preferentially minority classes or informative examples where the spurious feature and class label are decorrelated. Notably, gains from active learning are not seen in unpretrained models, which do not select such examples, suggesting that the ability to actively learn is an emergent property of the pretraining process.", "keywords": "pretraining;active learning;alignment;safety", "primary_area": "", "supplementary_material": "", "author": "Alex Tamkin;Dat Nguyen;Salil Deshpande;Jesse Mu;Noah Goodman", "authorids": "~Alex_Tamkin1;datpn2@gmail.com;salil512@stanford.edu;~Jesse_Mu1;~Noah_Goodman1", "gender": ";;;;", "homepage": ";;;https://www.jesse.mu/;https://cocolab.stanford.edu/", "dblp": ";;;205/9022;96/1216", "google_scholar": ";;;djLcGEQAAAAJ;OUpIbcQAAAAJ", "orcid": ";;;0000-0002-0812-2710;", "linkedin": ";;;jayelm;", "or_profile": "~Alex_Tamkin1;datpn2@gmail.com;salil512@stanford.edu;~Jesse_Mu1;~Noah_Goodman1", "aff": ";;;Stanford University;Stanford University", "aff_domain": ";;;stanford.edu;stanford.edu", "position": ";;;PhD student;Full Professor", "bibtex": "@misc{\ntamkin2022pretrained,\ntitle={Pretrained models are active learners},\nauthor={Alex Tamkin and Dat Nguyen and Salil Deshpande and Jesse Mu and Noah Goodman},\nyear={2022},\nurl={https://openreview.net/forum?id=AkJyAE46GA}\n}", "github": "", "project": "", "reviewers": "Gazs;bL9o;uqL6;KGpV", "site": "https://openreview.net/forum?id=AkJyAE46GA", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;5;3", "correctness": "4;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "58;139;70;152", "wc_summary_review": "45;103;23;48", "wc_main_review": "594;484;344;100", "wc_review": "697;726;437;300", "wc_reply_reviewers": "0;171;0;0", "wc_reply_authors": "359;403;395;143", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 104.75, 41.227266462864115 ], "wc_summary_review_avg": [ 54.75, 29.4819860253681 ], "wc_main_review_avg": [ 380.5, 184.5988894874506 ], "wc_review_avg": [ 540.0, 178.5035013662197 ], "wc_reply_reviewers_avg": [ 42.75, 74.0451720235695 ], "wc_reply_authors_avg": [ 325.0, 106.3766891757776 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5183210553488161, "corr_recommendation_correctness": 0.3665083330689157, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R5D2ltfMDYIJ:scholar.google.com/&scioq=Pretrained+models+are+active+learners&hl=en&as_sdt=0,33", 
"gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "AlPBx2zq7Jt", "title": "Align-RUDDER: Learning From Few Demonstrations by Reward Redistribution", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement Learning algorithms require a large number of samples to solve complex tasks with sparse and delayed rewards. Complex tasks are often hierarchically composed of sub-tasks. Solving a sub-task increases the return expectation and leads to a step in the $Q$-function. RUDDER identifies these steps and then redistributes reward to them, thus immediately giving reward if sub-tasks are solved. Since the delay of rewards is reduced, learning is considerably sped up. However, for complex tasks, current exploration strategies struggle with discovering episodes with high rewards. Therefore, we assume that episodes with high rewards are given as demonstrations and do not have to be discovered by exploration. Unfortunately, the number of demonstrations is typically small and RUDDER's LSTM as a deep learning model does not learn well on these few training samples. Hence, we introduce Align-RUDDER, which is RUDDER with two major modifications. First, Align-RUDDER assumes that episodes with high rewards are given as demonstrations, replacing RUDDER\u2019s safe exploration and lessons replay buffer. Second, we substitute RUDDER\u2019s LSTM model by a profile model that is obtained from multiple sequence alignment of demonstrations. Profile models can be constructed from as few as two demonstrations. Align-RUDDER uses reward redistribution to speed up learning by reducing the delay of rewards. Align-RUDDER outperforms competitors on complex artificial tasks with delayed rewards and few demonstrations. On the MineCraft ObtainDiamond task, Align-RUDDER is able to mine a diamond, though not frequently. 
", "keywords": "RUDDER;reinforcement learning;reward redistribution;return decomposition;delayed reward;sparse reward;episodic reward;minecraft", "primary_area": "", "supplementary_material": "/attachment/5500bc943d6247c427f68a63e4b93dad4d29ffd8.zip", "author": "Vihang Prakash Patil;Markus Hofmarcher;Marius-Constantin Dinu;Matthias Dorfer;Patrick M Blies;Johannes Brandstetter;Jose Arjona-Medina;Sepp Hochreiter", "authorids": "~Vihang_Prakash_Patil1;~Markus_Hofmarcher1;~Marius-Constantin_Dinu1;~Matthias_Dorfer1;~Patrick_M_Blies1;~Johannes_Brandstetter1;~Jose_Arjona-Medina1;~Sepp_Hochreiter1", "gender": "M;M;;M;M;M;M;", "homepage": "https://vihangp.github.io;;https://www.jku.at/en/institut-fuer-computational-perception/ueber-uns/ehemalige-mitarbeiter/matthias-dorfer/;;https://www.jku.at/en/institute-for-machine-learning/about-us/team/sepp-hochreiter/;http://www.arjonamedina.com;https://www.dinu.at;", "dblp": "https://dblp.uni-trier.de/pid/275/2942;224/9960;;251/8691;h/SeppHochreiter.html;;275/3387;", "google_scholar": "1iwYpk0AAAAJ;FD27EMIAAAAJ;;KiRvOHcAAAAJ;https://scholar.google.at/citations?user=tvUH3WMAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.at/citations?user=1PmRuNsAAAAJ", "orcid": ";;;;0000-0001-7449-2528;0000-0002-5033-4725;;", "linkedin": ";;;;https://linkedin.com/in/sepp-hochreiter-41514846;;mariusconstantindinu/;", "or_profile": "~Vihang_Prakash_Patil1;~Markus_Hofmarcher1;~Matthias_Dorfer1;~Johannes_Brandstetter1;~Sepp_Hochreiter1;~Jos\u00e9_Arjona-Medina1;~Marius-Constantin_Dinu_Dinu1;~Patrick_Matthias_Blies1", "aff": "Johannes Kepler University Linz;Johannes Kepler Universit\u00e4t Linz;;Microsoft;Johannes Kepler University Linz;Johannes Kepler Universit\u00e4t Linz;Johannes Kepler University Linz;EnliteAI GmbH", "aff_domain": "jku.at;jku.at;;microsoft.com;jku.at;jku.at;jku.at;enlite.ai", "position": "PhD student;PhD student;;Researcher;Full Professor;Lecturer;PhD student;Research Engineer", "bibtex": "@misc{\npatil2022alignrudder,\ntitle={Align-{RUDDER}: Learning From Few Demonstrations by Reward Redistribution},\nauthor={Vihang Prakash Patil and Markus Hofmarcher and Marius-Constantin Dinu and Matthias Dorfer and Patrick M Blies and Johannes Brandstetter and Jose Arjona-Medina and Sepp Hochreiter},\nyear={2022},\nurl={https://openreview.net/forum?id=AlPBx2zq7Jt}\n}", "github": "", "project": "", "reviewers": "nk2L;mK3T;YcqX;bJpP", "site": "https://openreview.net/forum?id=AlPBx2zq7Jt", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;4", "correctness": "2;3;4;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "78;73;26;84", "wc_summary_review": "56;43;50;60", "wc_main_review": "179;473;640;91", "wc_review": "313;589;716;235", "wc_reply_reviewers": "119;0;262;68", "wc_reply_authors": "404;690;682;185", "reply_reviewers": "1;0;2;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.25, 22.993205518152532 ], "wc_summary_review_avg": [ 52.25, 6.417748826496718 ], "wc_main_review_avg": [ 345.75, 221.06489431838787 ], "wc_review_avg": [ 463.25, 196.44894374875116 ], "wc_reply_reviewers_avg": [ 112.25, 96.2142790857989 ], "wc_reply_authors_avg": [ 490.25, 210.5259782069662 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 
], "replies_avg": [ 16, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17099796649634976721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;2;0;1;0;3", "aff_unique_norm": "Johannes Kepler University;Johannes Kepler University Linz;Microsoft;enliteAI", "aff_unique_dep": ";;Microsoft Corporation;", "aff_unique_url": "https://www.jku.at;https://www.jku.at;https://www.microsoft.com;", "aff_unique_abbr": "JKU;JKU;Microsoft;EnliteAI", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Linz;", "aff_country_unique_index": "0;0;1;0;0;0;2", "aff_country_unique": "Austria;United States;Germany" }, { "title": "Trans-Encoder: Unsupervised sentence-pair modelling through self- and mutual-distillations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6242", "id": "AmUhwTOHgm", "poster": "", "openreview": "https://openreview.net/forum?id=AmUhwTOHgm", "slides": "https://iclr.cc/virtual/2022/poster/6242", "video": "https://iclr.cc/virtual/2022/poster/6242", "author_site": "Fangyu Liu, Yunlong Jiao, Jordan Massiah, Emine Yilmaz, Serhii Havrylov", "tldr": "", "abstract": "In NLP, a large volume of tasks involve pairwise comparison between two sequences (e.g. sentence similarity and paraphrase identification). Predominantly, two formulations are used for sentence-pair tasks: bi-encoders and cross-encoders. Bi-encoders produce fixed-dimensional sentence representations and are computationally efficient, however, they usually underperform cross-encoders. Cross-encoders can leverage their attention heads to exploit inter-sentence interactions for better performance but they require task fine-tuning and are computationally more expensive. In this paper, we present a completely unsupervised sentence representation model termed as Trans-Encoder that combines the two learning paradigms into an iterative joint framework to simultaneously learn enhanced bi- and cross-encoders. Specifically, on top of a pre-trained Language Model (PLM), we start with converting it to an unsupervised bi-encoder, and then alternate between the bi- and cross-encoder task formulations. In each alternation, one task formulation will produce pseudo-labels which are used as learning signals for the other task formulation. We then propose an extension to conduct such self-distillation approach on multiple PLMs in parallel and use the average of their pseudo-labels for mutual distillation. Trans-Encoder creates, to the best of our knowledge, the first completely unsupervised cross-encoder and also a state-of-the-art unsupervised bi-encoder for sentence similarity. 
Both the bi-encoder and cross-encoder formulations of Trans-Encoder outperform recently proposed state-of-the-art unsupervised sentence encoders such as Mirror-BERT and SimCSE by up to 5% on the sentence similarity benchmarks.", "keywords": "self-supervised learning;sentence embeddings;sentence representations;knowledge distillation", "primary_area": "", "supplementary_material": "", "author": "Fangyu Liu;Yunlong Jiao;Jordan Massiah;Emine Yilmaz;Serhii Havrylov", "authorids": "~Fangyu_Liu1;~Yunlong_Jiao1;~Jordan_Massiah1;~Emine_Yilmaz1;~Serhii_Havrylov1", "gender": "M;M;;F;M", "homepage": "http://fangyuliu.me/about;https://yunlongjiao.github.io/;;https://sites.google.com/site/emineyilmaz/;http://serhii-havrylov.github.io", "dblp": "84/11483-1;164/7317;;36/3270;https://dblp.uni-trier.de/pers/hd/h/Havrylov:Serhii", "google_scholar": "https://scholar.google.ch/citations?user=d19PiS0AAAAJ;https://scholar.google.co.uk/citations?user=NgTM33MAAAAJ;;https://scholar.google.com.tw/citations?user=ocmAN4YAAAAJ;MC4vtXAAAAAJ", "orcid": "0000-0001-7038-3623;0000-0002-0776-0550;;;", "linkedin": "fangyu-liu-48a003b0/;yunlong-jiao/;jordan-massiah-562862136/;;sergii-gavrylov-666796a7/", "or_profile": "~Fangyu_Liu1;~Yunlong_Jiao1;~Jordan_Massiah1;~Emine_Yilmaz1;~Serhii_Havrylov1", "aff": "Microsoft Research;Amazon;Amazon;Department of Computer Science, University College London;University of Edinburgh", "aff_domain": "research.microsoft.com;amazon.com;amazon.com;cs.ucl.ac.uk;ed.ac.uk", "position": "Researcher Intern;Machine Learning Scientist;Research Engineer;Full Professor;PhD student", "bibtex": "@inproceedings{\nliu2022transencoder,\ntitle={Trans-Encoder: Unsupervised sentence-pair modelling through self- and mutual-distillations},\nauthor={Fangyu Liu and Yunlong Jiao and Jordan Massiah and Emine Yilmaz and Serhii Havrylov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AmUhwTOHgm}\n}", "github": "", "project": "", "reviewers": "53d8;z3e4;DZEu;ofUo", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "74;53;171;211", "wc_summary_review": "44;26;65;117", "wc_main_review": "149;185;201;346", "wc_review": "267;264;437;674", "wc_reply_reviewers": "0;0;37;0", "wc_reply_authors": "641;682;368;724", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 127.25, 65.72052571305254 ], "wc_summary_review_avg": [ 63.0, 34.09545424246464 ], "wc_main_review_avg": [ 220.25, 75.00458319329559 ], "wc_review_avg": [ 410.5, 167.47313217349225 ], "wc_reply_reviewers_avg": [ 9.25, 16.021469970012117 ], "wc_reply_authors_avg": [ 603.75, 139.2378809807159 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14228123078269305039&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=AmUhwTOHgm", "email": 
"research.microsoft.com;amazon.com;amazon.com;cs.ucl.ac.uk;ed.ac.uk", "author_num": 5, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Microsoft;Amazon;University College London;University of Edinburgh", "aff_unique_dep": "Microsoft Research;Amazon.com, Inc.;Department of Computer Science;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.amazon.com;https://www.ucl.ac.uk;https://www.ed.ac.uk", "aff_unique_abbr": "MSR;Amazon;UCL;Edinburgh", "aff_campus_unique_index": "1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "Aot3sKdraW", "title": "AA-PINN: ATTENTION AUGMENTED PHYSICS INFORMED NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Physics Informed Neural Networks has been quite successful in modelling the complex nature of fluid flow. Computational Fluid Dynamics using parallel processing\nalgorithms on GPUs have considerably reduced the time to solve the Navier Stokes\nEquations. CFD based approaches uses approximates to make the modelling easy\nbut it comes at the cost of decrease in accuracy. In this paper, we propose an\nattention based network architecture named AA-PINN to model PDEs behind fluid\nflow. We use a combination of channel and spatial attention module. We propose a\nnovel loss function which is more robust in handling the initial as well as boundary\nconditions imposed. Using evaluation metrics like RMSE, divergence and thermal\nkinetic energy, our network outperforms previous PINNs for modelling Navier\nStokes and Burgers Equation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhinav Sagar", "authorids": "~Abhinav_Sagar1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "5ntkLcgAAAAJ", "orcid": "", "linkedin": "https://linkedin.com/in/abhinavsagar4", "or_profile": "~Abhinav_Sagar1", "aff": "University of Maryland, College Park", "aff_domain": "umd.edu", "position": "MS student", "bibtex": "@misc{\nsagar2022aapinn,\ntitle={{AA}-{PINN}: {ATTENTION} {AUGMENTED} {PHYSICS} {INFORMED} {NEURAL} {NETWORKS}},\nauthor={Abhinav Sagar},\nyear={2022},\nurl={https://openreview.net/forum?id=Aot3sKdraW}\n}", "github": "", "project": "", "reviewers": "Ck1m;oeTZ;kPHW;qXxf", "site": "https://openreview.net/forum?id=Aot3sKdraW", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;4;4;3", "correctness": "3;2;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;1;2;2", "wc_summary_paper": "50;77;64;37", "wc_summary_review": "74;133;39;38", "wc_main_review": "263;631;215;251", "wc_review": "387;841;318;326", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 57.0, 14.983324063771697 ], "wc_summary_review_avg": [ 71.0, 38.61994303465504 ], "wc_main_review_avg": [ 340.0, 168.93489870361304 ], "wc_review_avg": [ 468.0, 216.99884792320904 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:s1WiyF8Rx4sJ:scholar.google.com/&scioq=AA-PINN:+ATTENTION+AUGMENTED+PHYSICS+INFORMED+NEURAL+NETWORKS&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "ArY-zkyHI_l", "title": "Resilience to Multiple Attacks via Adversarially Trained MIMO Ensembles", "track": "main", "status": "Reject", "tldr": "", "abstract": "While ensemble methods have been widely used for robustness against random perturbations (\\ie the average case), ensemble approaches for robustness against adversarial perturbations (\\ie the worst case) have remained elusive despite multiple prior attempts. We show that ensemble methods can improve adversarial robustness to multiple attacks if the ensemble is \\emph{adversarially diverse}, which is defined by two properties: 1) the sub-models are adversarially robust themselves and yet 2) adversarial attacks do not transfer easily between sub-models. While at first glance, creating such an ensemble would seem computationally expensive, we demonstrate that an adversarially diverse ensemble can be trained with minimal computational overhead via a Multiple-Input Multiple-Output (MIMO) model. Specifically, we propose to train a MIMO model with adversarial training ({\\emph{MAT}}), where each sub-model can be trained on a different attack type. When computing gradients for generating adversarial examples during training, we use the gradient with respect to the ensemble objective. This has a two-fold benefit: 1) it only requires 1 backward pass and 2) the cross-gradient information between the models promotes robustness against transferable attacks. We empirically demonstrate that {\\emph{MAT}} produces an ensemble of models that is adversarially diverse and significantly improves performance over single models or vanilla ensembles while being comparable to previous state-of-the-art methods. On MNIST, we obtain $99.5\\%$ clean accuracy and ($88.6\\%, 57.1\\%,71.6\\%$) against $(\\ell_\\infty, \\ell_2, \\ell_1)$ attacks, and on CIFAR10, we achieve $79.7\\%$ clean accuracy and ($47.9\\%, 61.8\\%,47.6\\%$) against $(\\ell_\\infty, \\ell_2, \\ell_1)$ attacks, which are comparable to previous state-of-the-art methods.", "keywords": "Adversarial Example;Adversarial Attack;Evasion Attack;Defense;Adversarial Training;Security in Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Ruqi Bai;David I. Inouye;Saurabh Bagchi", "authorids": "~Ruqi_Bai1;~David_I._Inouye1;~Saurabh_Bagchi1", "gender": "M;M;M", "homepage": "https://ruqibai.netlify.app/;https://saurabhbagchi.us;http://davidinouye.com", "dblp": ";57/95.html;76/10817", "google_scholar": ";https://scholar.google.com.tw/citations?user=3EfsOvYAAAAJ;SVMQ_g4AAAAJ", "orcid": ";;", "linkedin": "ruqi-bai/;;", "or_profile": "~Ruqi_Bai1;~Saurabh_Bagchi1;~David_I_Inouye1", "aff": "Purdue University;Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu;purdue.edu", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nbai2022resilience,\ntitle={Resilience to Multiple Attacks via Adversarially Trained {MIMO} Ensembles},\nauthor={Ruqi Bai and David I. 
Inouye and Saurabh Bagchi},\nyear={2022},\nurl={https://openreview.net/forum?id=ArY-zkyHI_l}\n}", "github": "", "project": "", "reviewers": "U6VC;YRoM;zrcs;7nra", "site": "https://openreview.net/forum?id=ArY-zkyHI_l", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;5", "correctness": "3;3;4;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "46;29;86;86", "wc_summary_review": "30;54;102;101", "wc_main_review": "304;130;255;595", "wc_review": "380;213;443;782", "wc_reply_reviewers": "0;0;200;310", "wc_reply_authors": "1025;375;1390;2572", "reply_reviewers": "0;0;1;1", "reply_authors": "2;1;3;4", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 61.75, 24.983744715314394 ], "wc_summary_review_avg": [ 71.75, 30.93844695520446 ], "wc_main_review_avg": [ 321.0, 170.44207227090382 ], "wc_review_avg": [ 454.5, 206.91846220190212 ], "wc_reply_reviewers_avg": [ 127.5, 133.29947486768282 ], "wc_reply_authors_avg": [ 1340.5, 798.5569798079533 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MnxKxCLn2AkJ:scholar.google.com/&scioq=Resilience+to+Multiple+Attacks+via+Adversarially+Trained+MIMO+Ensembles&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "AsDSpwXYGeT", "title": "Back to Basics: Efficient Network Compression via IMP", "track": "main", "status": "Reject", "tldr": "", "abstract": "Network pruning is a widely used technique for effectively compressing Deep Neural Networks with little to no degradation in performance during inference. Iterative Magnitude Pruning (IMP) (Han et al., 2015) is one of the most established approaches for network pruning, consisting of several iterative training and pruning steps, where a significant amount of the network\u2019s performance is lost after pruning and then recovered in the subsequent retraining phase. While commonly used as a benchmark reference, it is often argued that a) it reaches suboptimal states by not incorporating sparsification into the training phase, b) its global selection criterion fails to properly determine optimal layer-wise pruning rates and c) its iterative nature makes it slow and non-competitive. In light of recently proposed retraining techniques, we investigate these claims through rigorous and consistent experiments where we compare IMP to pruning-during-training algorithms, evaluate proposed modifications of its selection criterion and study the number of iterations and total training time actually required. 
We find that IMP with SLR (Le & Hua, 2021) for retraining can outperform state-of-the-art pruning-during-training approaches with little or no computational overhead, that the global magnitude selection criterion is largely competitive with more complex approaches, and that only a few retraining epochs are needed in practice to achieve most of the sparsity-vs.-performance trade-off of IMP. Our goals are both to demonstrate that basic IMP can already provide state-of-the-art pruning results on par with or outperforming more complex or heavily parameterized approaches and also to establish a more realistic yet easily realisable baseline for future research.", "keywords": "Magnitude pruning;Sparsity;IMP;Model Compression", "primary_area": "", "supplementary_material": "/attachment/0e060a4e764075cc2a7531cdc4cabbed507f97e1.zip", "author": "Max Zimmer;Sebastian Pokutta;Christoph Spiegel", "authorids": "~Max_Zimmer1;~Sebastian_Pokutta1;~Christoph_Spiegel1", "gender": ";M;", "homepage": ";http://www.pokutta.com;", "dblp": ";75/7718;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Max_Zimmer1;~Sebastian_Pokutta1;~Christoph_Spiegel1", "aff": ";TU Berlin;", "aff_domain": ";tu-berlin.de;", "position": ";Full Professor;", "bibtex": "@misc{\nzimmer2022back,\ntitle={Back to Basics: Efficient Network Compression via {IMP}},\nauthor={Max Zimmer and Sebastian Pokutta and Christoph Spiegel},\nyear={2022},\nurl={https://openreview.net/forum?id=AsDSpwXYGeT}\n}", "github": "", "project": "", "reviewers": "WMwJ;dL1d;fSGB", "site": "https://openreview.net/forum?id=AsDSpwXYGeT", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;4", "wc_summary_paper": "97;58;57", "wc_summary_review": "68;92;55", "wc_main_review": "465;291;181", "wc_review": "630;441;293", "wc_reply_reviewers": "268;0;0", "wc_reply_authors": "1448;1067;342", "reply_reviewers": "1;0;0", "reply_authors": "3;2;2", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 70.66666666666667, 18.624953392931992 ], "wc_summary_review_avg": [ 71.66666666666667, 15.326085243430198 ], "wc_main_review_avg": [ 312.3333333333333, 116.91972554801102 ], "wc_review_avg": [ 454.6666666666667, 137.91865557478275 ], "wc_reply_reviewers_avg": [ 89.33333333333333, 126.33641157199649 ], "wc_reply_authors_avg": [ 952.3333333333334, 458.74490611764713 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CxSuf93UmWkJ:scholar.google.com/&scioq=Back+to+Basics:+Efficient+Network+Compression+via+IMP&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Technische Universit\u00e4t Berlin", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-berlin.de", "aff_unique_abbr": "TU Berlin", "aff_campus_unique_index": "0", "aff_campus_unique": "Berlin", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "id": "AsQz_GFFDQp", "title": "Agnostic Personalized Federated Learning with Kernel
Factorization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Considering the futuristic scenarios of federated learning at a worldwide scale, it is highly probable that local participants can have their own personalized labels, which might not be compatible with each other even for the same class, and can be also possibly from a variety of multiple domains. Nevertheless, they should be benefited from others while selectively taking helpful knowledge. Toward such extreme scenarios of federated learning, however, most existing approaches are limited in that they often assume: (1) labeling schemes are all synchronized amongst clients; (2) the local data is from the same single dataset (domain). In this sense, we introduce an intensively realistic problem of federated learning, namely Agnostic Personalized Federated Learning (APFL), where any clients, regardless of what they have learned with their personalized labels, can collaboratively learn while benefiting each other. We then study two essential challenges of the agnostic personalized federated learning, which are (1) Label Heterogeneity where local clients learn from the same single domain but labeling schemes are not synchronized with each other and (2) Domain Heterogeneity where the clients learn from the different datasets which can be semantically similar or dissimilar for each other. To tackle these problems, we propose our novel method, namely Similarity Matching and Kernel Factorization (SimFed). Our method measures semantic similarity/dissimilarity between locally learned knowledge and matches/aggregates the relevant ones that are beneficial to each other. Furthermore, we factorize our model parameters into two basis vectors and the sparse masks to effectively capture permutation-robust representations and reduce information loss when aggregating the heterogeneous knowledge. We exhaustively validate our method on both single- and multi-domain datasets, showing that our method outperforms the current state-of-the-art federated learning methods.", "keywords": "Federated Learning;Personalized Federated Learning", "primary_area": "", "supplementary_material": "", "author": "Wonyong Jeong;Sung Ju Hwang", "authorids": "~Wonyong_Jeong1;~Sung_Ju_Hwang1", "gender": "M;", "homepage": "https://wyjeong.github.io/;", "dblp": ";", "google_scholar": "0PC5-GEAAAAJ;", "orcid": ";", "linkedin": "wyjeong/;", "or_profile": "~Wonyong_Jeong1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;", "position": "Ph.D. 
student;", "bibtex": "@misc{\njeong2022agnostic,\ntitle={Agnostic Personalized Federated Learning with Kernel Factorization},\nauthor={Wonyong Jeong and Sung Ju Hwang},\nyear={2022},\nurl={https://openreview.net/forum?id=AsQz_GFFDQp}\n}", "github": "", "project": "", "reviewers": "o4KB;85V7;3krS;EkKK;9bEM", "site": "https://openreview.net/forum?id=AsQz_GFFDQp", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "4;4;4;3;3", "correctness": "2;3;3;3;3", "technical_novelty": "2;1;3;3;3", "empirical_novelty": "2;1;2;3;2", "wc_summary_paper": "81;71;121;51;57", "wc_summary_review": "79;40;51;21;46", "wc_main_review": "442;117;308;330;184", "wc_review": "602;228;480;402;287", "wc_reply_reviewers": "0;60;0;0;0", "wc_reply_authors": "981;1002;1301;615;657", "reply_reviewers": "0;2;0;0;0", "reply_authors": "3;3;3;1;1", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 76.2, 24.741867350707384 ], "wc_summary_review_avg": [ 47.4, 18.78935869049287 ], "wc_main_review_avg": [ 276.2, 114.22854284284642 ], "wc_review_avg": [ 399.8, 133.8363179409834 ], "wc_reply_reviewers_avg": [ 12.0, 24.0 ], "wc_reply_authors_avg": [ 911.2, 251.95745672632913 ], "reply_reviewers_avg": [ 0.4, 0.8000000000000002 ], "reply_authors_avg": [ 2.2, 0.9797958971132712 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8427009716003845, "corr_recommendation_correctness": 0.5897678246195884, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PFNguAaA8lQJ:scholar.google.com/&scioq=Agnostic+Personalized+Federated+Learning+with+Kernel+Factorization&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "AsyICRrQ7Lp", "title": "Bootstrapped Hindsight Experience replay with Counterintuitive Prioritization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Goal-conditioned environments are known as sparse rewards tasks, in which the agent gains a positive reward only when it achieves the goal. Such an setting results in much difficulty for the agent to explore successful trajectories. Hindsight experience replay (HER) replaces the goal in failed experiences with any practically achieved one, so that the agent has a much higher chance to see successful trajectories even if they are fake. Comprehensive results have demonstrated the effectiveness of HER in the literature. However, the importance of the fake trajectories differs in terms of exploration and exploitation, and it is usually inefficient to learn with a fixed proportion of fake and original data as HER did. In this paper, inspired by Bootstrapped DQN, we use multiple heads in DDPG and take advantage of the diversity and uncertainty among multiple heads to improve the data efficiency with relabeled goals. The method is referred to as Bootstrapped HER (BHER). Specifically, in addition to the benefit from the Bootstrapped version, we explicitly leverage the uncertainty measured by the variance of estimated Q-values from multiple heads. 
It is common knowledge that higher uncertainty promotes exploration, and hence that maximizing the uncertainty via a bonus term induces better performance in Q-learning. However, in this paper, we reveal a counterintuitive conclusion: for hindsight experiences, exploiting lower-uncertainty data samples significantly improves performance. The explanation behind this fact is that hindsight relabeling largely promotes exploration, and then exploiting lower-uncertainty data (whose goals are generated by hindsight relabeling) provides a good trade-off between exploration and exploitation, resulting in further improved data efficiency. Comprehensive experiments demonstrate that our method can achieve state-of-the-art results in many goal-conditioned tasks.", "keywords": "Reinforcement learning;hindsight experience replay;counterintuitive prioritization", "primary_area": "", "supplementary_material": "/attachment/e729822e62b74f3aec3aadf0a3e56b428d94b600.zip", "author": "Jiawei Xu;Shuxing Li;Chun Yuan;Zhengyou Zhang;Lei Han", "authorids": "~Jiawei_Xu1;~Shuxing_Li1;~Chun_Yuan1;~Zhengyou_Zhang2;~Lei_Han1", "gender": "M;M;M;M;M", "homepage": "https://github.com/jiawei415;;https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml;;https://www.leihan.org", "dblp": ";;;;75/2307-1", "google_scholar": ";;https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ;1I-DKy8AAAAJ;Tz4_zi8AAAAJ", "orcid": ";;;;", "linkedin": ";%E8%88%92%E5%85%B4-%E6%9D%8E-739678133/;;;", "or_profile": "~Jiawei_Xu1;~Shuxing_Li1;~Chun_Yuan1;~Zhengyou_Zhang2;~Lei_Han1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tencent AI Lab;Tencent Robotics X", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tencent.com;tencent.com", "position": "MS student;MS student;Full Professor;Director;Principal Researcher", "bibtex": "@misc{\nxu2022bootstrapped,\ntitle={Bootstrapped Hindsight Experience replay with Counterintuitive Prioritization},\nauthor={Jiawei Xu and Shuxing Li and Chun Yuan and Zhengyou Zhang and Lei Han},\nyear={2022},\nurl={https://openreview.net/forum?id=AsyICRrQ7Lp}\n}", "github": "", "project": "", "reviewers": "nX4W;BnLV;PtA1;yX9d", "site": "https://openreview.net/forum?id=AsyICRrQ7Lp", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;4;4;2", "correctness": "2;3;4;4", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "102;34;224;89", "wc_summary_review": "201;128;108;86", "wc_main_review": "1301;492;611;214", "wc_review": "1604;654;943;389", "wc_reply_reviewers": "0;83;188;0", "wc_reply_authors": "235;432;464;195", "reply_reviewers": "0;2;1;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 112.25, 69.38434621728449 ], "wc_summary_review_avg": [ 130.75, 43.19360485071835 ], "wc_main_review_avg": [ 654.5, 400.0940514429076 ], "wc_review_avg": [ 897.5, 452.514364412888 ], "wc_reply_reviewers_avg": [ 67.75, 77.25404520152973 ], "wc_reply_authors_avg": [ 331.5, 117.89932145691085 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:Y6k621T11TIJ:scholar.google.com/&scioq=Bootstrapped+Hindsight+Experience+replay+with+Counterintuitive+Prioritization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "Tsinghua University;Tencent", "aff_unique_dep": ";Tencent AI Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;https://ai.tencent.com", "aff_unique_abbr": "THU;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Understanding the Role of Self Attention for Efficient Speech Recognition", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6116", "id": "AvcfxqRy4Y", "poster": "", "openreview": "https://openreview.net/forum?id=AvcfxqRy4Y", "slides": "https://iclr.cc/virtual/2022/poster/6116", "video": "https://iclr.cc/virtual/2022/poster/6116", "author_site": "Kyuhong Shim, Jungwook Choi, Wonyong Sung", "tldr": "", "abstract": "Self-attention (SA) is a critical component of Transformer neural networks that have succeeded in automatic speech recognition (ASR). In this paper, we analyze the role of SA in Transformer-based ASR models for not only understanding the mechanism of improved recognition accuracy but also lowering the computational complexity. We reveal that SA performs two distinct roles: phonetic and linguistic localization. Especially, we show by experiments that phonetic localization in the lower layers extracts phonologically meaningful features from speech and reduces the phonetic variance in the utterance for proper linguistic localization in the upper layers. From this understanding, we discover that attention maps can be reused as long as their localization capability is preserved. 
To evaluate this idea, we implement layer-wise attention map reuse on real GPU platforms and achieve up to a 1.96x speedup in inference and 33% savings in training time, with noticeably improved ASR performance on the challenging LibriSpeech dev/test-other benchmark.", "keywords": "transformer;self attention;speech recognition", "primary_area": "", "supplementary_material": "/attachment/e5bec4a4f19f93bf6a75046f866f76da9f215624.zip", "author": "Kyuhong Shim;Jungwook Choi;Wonyong Sung", "authorids": "~Kyuhong_Shim1;~Jungwook_Choi1;~Wonyong_Sung1", "gender": "M;M;", "homepage": "https://sites.google.com/view/khshim;;", "dblp": "209/4981;97/4140;22/1975", "google_scholar": "https://scholar.google.co.kr/citations?user=msFkCLEAAAAJ;YPT98zwAAAAJ;https://scholar.google.co.kr/citations?user=1IfNFz4AAAAJ", "orcid": "0000-0002-0123-3100;;0000-0001-8801-210X", "linkedin": ";jungwook-choi-5854996b/;", "or_profile": "~Kyuhong_Shim1;~Jungwook_Choi1;~Wonyong_Sung1", "aff": "Seoul National University;Hanyang University;Seoul National University", "aff_domain": "snu.ac.kr;hanyang.ac.kr;snu.ac.kr", "position": "PhD student;Assistant Professor;Emeritus", "bibtex": "@inproceedings{\nshim2022understanding,\ntitle={Understanding the Role of Self Attention for Efficient Speech Recognition},\nauthor={Kyuhong Shim and Jungwook Choi and Wonyong Sung},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AvcfxqRy4Y}\n}", "github": "", "project": "", "reviewers": "ki2s;mbFx;kKif;Vyqx", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;4;4", "correctness": "3;4;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "123;146;99;137", "wc_summary_review": "97;40;88;74", "wc_main_review": "894;203;602;190", "wc_review": "1114;389;789;401", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "784;340;592;427", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 126.25, 17.73943347460679 ], "wc_summary_review_avg": [ 74.75, 21.672274915199836 ], "wc_main_review_avg": [ 472.25, 294.47782174554334 ], "wc_review_avg": [ 673.25, 301.0717314860364 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 535.75, 169.51161464631267 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2771784824933237823&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=AvcfxqRy4Y", "email": "snu.ac.kr;hanyang.ac.kr;snu.ac.kr", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Seoul National University;Hanyang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.hanyang.ac.kr", "aff_unique_abbr": "SNU;HYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Domain Adversarial Training: A Game Perspective", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6466", "id":
"AwgtcUAhBq", "poster": "", "openreview": "https://openreview.net/forum?id=AwgtcUAhBq", "slides": "https://iclr.cc/virtual/2022/poster/6466", "video": "https://iclr.cc/virtual/2022/poster/6466", "author_site": "David Acuna, Marc T Law, Guojun Zhang, Sanja Fidler", "tldr": "", "abstract": "The dominant line of work in domain adaptation has focused on learning invariant representations using domain-adversarial training. In this paper, we interpret this approach from a game theoretical perspective. Defining optimal solutions in domain-adversarial training as a local Nash equilibrium, we show that gradient descent in domain-adversarial training can violate the asymptotic convergence guarantees of the optimizer, oftentimes hindering the transfer performance. Our analysis leads us to replace gradient descent with high-order ODE solvers (i.e., Runge\u2013Kutta), for which we derive asymptotic convergence guarantees. This family of optimizers is significantly more stable and allows more aggressive learning rates, leading to high performance gains when used as a drop-in replacement over standard optimizers. Our experiments show that in conjunction with state-of-the-art domain-adversarial methods, we achieve up to 3.5% improvement with less than of half training iterations. Our optimizers are easy to implement, free of additional parameters, and can be plugged into any domain-adversarial framework.", "keywords": "Domain Adversarial Training;Domain Adaptation;Neural Networks Optimization;Game Theory", "primary_area": "", "supplementary_material": "", "author": "David Acuna;Marc T Law;Guojun Zhang;Sanja Fidler", "authorids": "~David_Acuna1;~Marc_T_Law1;~Guojun_Zhang1;~Sanja_Fidler1", "gender": "M;M;M;F", "homepage": "http://www.cs.toronto.edu/~davidj/;http://www.cs.toronto.edu/~law/;https://gordon-guojun-zhang.github.io/;http://www.cs.toronto.edu/~fidler/", "dblp": "217/2906;117/7668;56/4451;08/6607", "google_scholar": "https://scholar.google.ca/citations?user=9aFd9dEAAAAJ;https://scholar.google.fr/citations?user=_7QgnUcAAAAJ;https://scholar.google.ca/citations?user=p8Y0xJEAAAAJ;CUlqK5EAAAAJ", "orcid": ";;;", "linkedin": ";;guojun-zhang-bbb009a4/;sanja-fidler-2846a1a?trk=hp-identity-name", "or_profile": "~David_Acuna1;~Marc_T_Law1;~Guojun_Zhang1;~Sanja_Fidler1", "aff": "Department of Computer Science, University of Toronto;NVIDIA;Huawei Technologies Ltd.;Department of Computer Science, University of Toronto", "aff_domain": "cs.toronto.edu;nvidia.com;huawei.com;cs.toronto.edu", "position": "PhD student;Research Scientist;Researcher;Associate Professor", "bibtex": "@inproceedings{\nacuna2022domain,\ntitle={Domain Adversarial Training: A Game Perspective},\nauthor={David Acuna and Marc T Law and Guojun Zhang and Sanja Fidler},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=AwgtcUAhBq}\n}", "github": "", "project": "", "reviewers": "uzT5;LuBk;1YN1;tiyE", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;3;3;2", "correctness": "3;4;4;4", "technical_novelty": "4;2;3;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "105;136;120;81", "wc_summary_review": "45;14;24;37", "wc_main_review": "86;63;137;237", "wc_review": "236;213;281;355", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "216;303;179;188", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 
3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 110.5, 20.254629100529094 ], "wc_summary_review_avg": [ 30.0, 11.895377253370318 ], "wc_main_review_avg": [ 130.75, 66.9342027665976 ], "wc_review_avg": [ 271.25, 54.18659889677521 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 221.5, 48.99234634103576 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2851038308510792381&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=AwgtcUAhBq", "email": "cs.toronto.edu;nvidia.com;huawei.com;cs.toronto.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Toronto;NVIDIA;Huawei", "aff_unique_dep": "Department of Computer Science;NVIDIA Corporation;Huawei Technologies", "aff_unique_url": "https://www.utoronto.ca;https://www.nvidia.com;https://www.huawei.com", "aff_unique_abbr": "U of T;NVIDIA;Huawei", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "Canada;United States;China" }, { "id": "AypVMhFfuc5", "title": "FrugalMCT: Efficient Online ML API Selection for Multi-Label Classification Tasks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-label classification tasks such as OCR and multi-object recognition are a major focus of the growing machine learning as a service industry. While many multi-label APIs are available, it is challenging for users to decide which API to use for their own data and budget, due to the heterogeneity in their prices and performance. Recent work has shown how to efficiently select and combine single-label APIs to optimize performance and cost. However, its computation cost is exponential in the number of labels, making it unsuitable for settings like OCR. In this work, we propose FrugalMCT, a principled framework that adaptively selects the APIs to use for different data in an online fashion while respecting the user\u2019s budget. It allows combining ML APIs\u2019 predictions for any single data point, and selects the best combination based on an accuracy estimator. We run systematic experiments using ML APIs from Google, Microsoft, Amazon, IBM, Tencent, and other providers for tasks including multi-label image classification, scene text recognition, and named entity recognition.
Across these tasks, FrugalMCT can achieve over 90% cost reduction while matching the accuracy of the best single API, or up to 8% better accuracy while matching the best API\u2019s cost.", "keywords": "ML as a Service;Multi-label classifications;ML systems;adaptive learning", "primary_area": "", "supplementary_material": "/attachment/16f53bffe8ff79ab31c7f158a46e0cd28f71af58.zip", "author": "Lingjiao Chen;Matei Zaharia;James Zou", "authorids": "~Lingjiao_Chen1;~Matei_Zaharia1;~James_Zou1", "gender": ";M;", "homepage": ";https://cs.stanford.edu/~matei/;", "dblp": "131/6638.html;36/2133;", "google_scholar": ";I1EvjZsAAAAJ;23ZXZvEAAAAJ", "orcid": ";0000-0002-7547-7204;", "linkedin": ";mateizaharia/;", "or_profile": "~Lingjiao_Chen1;~Matei_Zaharia1;~James_Zou1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nchen2022frugalmct,\ntitle={Frugal{MCT}: Efficient Online {ML} {API} Selection for Multi-Label Classification Tasks},\nauthor={Lingjiao Chen and Matei Zaharia and James Zou},\nyear={2022},\nurl={https://openreview.net/forum?id=AypVMhFfuc5}\n}", "github": "", "project": "", "reviewers": "S7of;rXPL;py8t;R1P4", "site": "https://openreview.net/forum?id=AypVMhFfuc5", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;3;3;4", "correctness": "2;3;4;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "1;3;4;4", "wc_summary_paper": "32;83;57;66", "wc_summary_review": "208;41;25;42", "wc_main_review": "234;192;157;226", "wc_review": "474;316;239;334", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "695;198;130;11", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 3.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 59.5, 18.418740456393863 ], "wc_summary_review_avg": [ 79.0, 74.78301946297702 ], "wc_main_review_avg": [ 202.25, 30.515364982251153 ], "wc_review_avg": [ 340.75, 84.80381772066633 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 258.5, 260.74940076632964 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3665083330689157, "corr_recommendation_correctness": 0.9945577827230725, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2899218346894551510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Relational Learning with Variational Bayes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6522", "id": "Az-7gJc6lpr", "poster": "", "openreview": "https://openreview.net/forum?id=Az-7gJc6lpr", "slides": "https://iclr.cc/virtual/2022/poster/6522", "video": "https://iclr.cc/virtual/2022/poster/6522", "tldr": "", "abstract": "In psychology, relational learning refers to the ability to recognize and respond to relationships among objects irrespective of the nature of those
objects. Relational learning has long been recognized as a hallmark of human cognition and a key question in artificial intelligence research. In this work, we propose an unsupervised learning method for addressing the relational learning problem where we learn the underlying relationship between a pair of data irrespective of the nature of those data. The central idea of the proposed method is to encapsulate the relational learning problem with a probabilistic graphical model in which we perform inference to learn about data relationships and other relational processing tasks.", "keywords": "Relational learning;psychology;unsupervised learning;variational inference;probabilistic graphical model.", "primary_area": "", "supplementary_material": "", "author": "Kuang-Hung Liu", "authorids": "~Kuang-Hung_Liu1", "gender": "", "homepage": "https://scholar.google.com/citations?user=eaxkzLcAAAAJ&hl=en", "dblp": "", "google_scholar": "eaxkzLcAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Kuang-Hung_Liu1", "aff": "ExxonMobil", "aff_domain": "exxonmobil.com", "position": "Research Scientist", "bibtex": "@inproceedings{\nliu2022relational,\ntitle={Relational Learning with Variational Bayes},\nauthor={Kuang-Hung Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Az-7gJc6lpr}\n}", "github": "", "project": "", "reviewers": "Znwn;2C7q;Jew5;ahUC;7vpw", "pdf_size": 0, "recommendation": "5;6;6;6;6", "confidence": "4;3;3;3;3", "correctness": "3;3;3;3;3", "technical_novelty": "3;3;3;3;3", "empirical_novelty": "3;3;3;2;3", "wc_summary_paper": "115;40;49;41;72", "wc_summary_review": "132;7;42;49;47", "wc_main_review": "62;120;488;267;140", "wc_review": "309;167;579;357;259", "wc_reply_reviewers": "150;0;280;0;0", "wc_reply_authors": "1583;398;1617;609;316", "reply_reviewers": "1;0;2;0;0", "reply_authors": "3;1;4;1;1", "recommendation_avg": [ 5.8, 0.39999999999999997 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 63.4, 28.26021939051429 ], "wc_summary_review_avg": [ 55.4, 41.23396658096331 ], "wc_main_review_avg": [ 215.4, 151.8493990768485 ], "wc_review_avg": [ 334.2, 137.61744075516012 ], "wc_reply_reviewers_avg": [ 86.0, 113.06635220082056 ], "wc_reply_authors_avg": [ 904.6, 575.8842244757187 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 2.0, 1.2649110640673518 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5420984774213966581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=Az-7gJc6lpr", "email": "exxonmobil.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "ExxonMobil Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.exxonmobil.com", "aff_unique_abbr": "ExxonMobil", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Heteroscedastic Temporal Variational Autoencoder For Irregularly Sampled Time Series", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6271", "id": "Az7opqbQE-3", "poster": "", "openreview": "https://openreview.net/forum?id=Az7opqbQE-3", "slides": "https://iclr.cc/virtual/2022/poster/6271", "video":
"https://iclr.cc/virtual/2022/poster/6271", "author_site": "Satya Narayan Shukla, Benjamin M Marlin", "tldr": "", "abstract": "Irregularly sampled time series commonly occur in several domains where they present a significant challenge to standard deep learning models. In this paper, we propose a new deep learning framework for probabilistic interpolation of irregularly sampled time series that we call the Heteroscedastic Temporal Variational Autoencoder (HeTVAE). HeTVAE includes a novel input layer to encode information about input observation sparsity, a temporal VAE architecture to propagate uncertainty due to input sparsity, and a heteroscedastic output layer to enable variable uncertainty in the output interpolations. Our results show that the proposed architecture is better able to reflect variable uncertainty through time due to sparse and irregular sampling than a range of baseline and traditional models, as well as recently proposed deep latent variable models that use homoscedastic output layers.", "keywords": "irregular sampling;uncertainty;imputation;interpolation;multivariate time series;missing data;variational autoencoder", "primary_area": "", "supplementary_material": "", "author": "Satya Narayan Shukla;Benjamin Marlin", "authorids": "~Satya_Narayan_Shukla1;~Benjamin_Marlin1", "gender": "M;M", "homepage": "https://satyanshukla.github.io/;https://groups.cs.umass.edu/marlin/", "dblp": "161/3356;03/7058.html", "google_scholar": "l1tsmesAAAAJ;ey960FIAAAAJ", "orcid": ";0000-0002-2626-3410", "linkedin": ";", "or_profile": "~Satya_Narayan_Shukla1;~Benjamin_Marlin1", "aff": "Meta;University of Massachusetts at Amherst", "aff_domain": "fb.com;umass.edu", "position": "Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nshukla2022heteroscedastic,\ntitle={Heteroscedastic Temporal Variational Autoencoder For Irregularly Sampled Time Series},\nauthor={Satya Narayan Shukla and Benjamin Marlin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Az7opqbQE-3}\n}", "github": "", "project": "", "reviewers": "7Ard;i1vq;oEK3;EXpk", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;3;4", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "66;83;57;99", "wc_summary_review": "34;155;17;69", "wc_main_review": "386;737;56;495", "wc_review": "486;975;130;663", "wc_reply_reviewers": "82;308;0;0", "wc_reply_authors": "1545;1575;624;680", "reply_reviewers": "1;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 76.25, 16.11482236948332 ], "wc_summary_review_avg": [ 68.75, 53.20890433000852 ], "wc_main_review_avg": [ 418.5, 244.82493745531724 ], "wc_review_avg": [ 563.5, 305.4345265355572 ], "wc_reply_reviewers_avg": [ 97.5, 126.05851815724314 ], "wc_reply_authors_avg": [ 1106.0, 454.555277166595 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.2294157338705618, "corr_recommendation_correctness": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3281039634260173349&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Az7opqbQE-3", "email": "fb.com;umass.edu", 
"author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Meta;University of Massachusetts Amherst", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.umass.edu", "aff_unique_abbr": "Meta;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Reducing Excessive Margin to Achieve a Better Accuracy vs. Robustness Trade-off", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5907", "id": "Azh9QBQ4tR7", "poster": "", "openreview": "https://openreview.net/forum?id=Azh9QBQ4tR7", "slides": "https://iclr.cc/virtual/2022/poster/5907", "video": "https://iclr.cc/virtual/2022/poster/5907", "author_site": "Rahul Rade, Seyed-Mohsen Moosavi-Dezfooli", "tldr": "", "abstract": "While adversarial training has become the de facto approach for training robust classifiers, it leads to a drop in accuracy. This has led to prior works postulating that accuracy is inherently at odds with robustness. Yet, the phenomenon remains inexplicable. In this paper, we closely examine the changes induced in the decision boundary of a deep network during adversarial training. We find that adversarial training leads to unwarranted increase in the margin along certain adversarial directions, thereby hurting accuracy. Motivated by this observation, we present a novel algorithm, called Helper-based Adversarial Training (HAT), to reduce this effect by incorporating additional wrongly labelled examples during training. Our proposed method provides a notable improvement in accuracy without compromising robustness. It achieves a better trade-off between accuracy and robustness in comparison to existing defenses. Code is available at https://github.com/imrahulr/hat.", "keywords": "adversarial training;robustness", "primary_area": "", "supplementary_material": "/attachment/a0c1b26035db8ee49a4270005489c63c6a99ddea.zip", "author": "Rahul Rade;Seyed-Mohsen Moosavi-Dezfooli", "authorids": "~Rahul_Rade1;~Seyed-Mohsen_Moosavi-Dezfooli1", "gender": "Not Specified;M", "homepage": "https://imrahulr.github.io/;", "dblp": ";", "google_scholar": "https://scholar.google.co.in/citations?user=Uesh1YIAAAAJ;https://scholar.google.ch/citations?user=qosS83IAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Rahul_Rade1;~Seyed-Mohsen_Moosavi-Dezfooli1", "aff": ";Swiss Federal Institute of Technology", "aff_domain": ";ethz.ch", "position": ";Postdoc", "bibtex": "@inproceedings{\nrade2022reducing,\ntitle={Reducing Excessive Margin to Achieve a Better Accuracy vs. 
Robustness Trade-off},\nauthor={Rahul Rade and Seyed-Mohsen Moosavi-Dezfooli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Azh9QBQ4tR7}\n}", "github": "", "project": "", "reviewers": "vhey;fdwg;P4Wc;dhm9", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;5;4;5", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;4;4;4", "wc_summary_paper": "30;127;99;187", "wc_summary_review": "17;38;132;33", "wc_main_review": "182;591;416;315", "wc_review": "229;756;647;535", "wc_reply_reviewers": "0;395;0;83", "wc_reply_authors": "436;3139;1216;364", "reply_reviewers": "0;2;0;1", "reply_authors": "1;6;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 110.75, 56.42860533452869 ], "wc_summary_review_avg": [ 55.0, 45.1275968781853 ], "wc_main_review_avg": [ 376.0, 149.31677735606272 ], "wc_review_avg": [ 541.75, 196.74777635338094 ], "wc_reply_reviewers_avg": [ 119.5, 162.62917942362003 ], "wc_reply_authors_avg": [ 1288.75, 1119.2701584068075 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 2.0615528128088303 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15441955675965962376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=Azh9QBQ4tR7", "email": ";ethz.ch", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Swiss Federal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich", "aff_country_unique_index": "0", "aff_country_unique": "Switzerland" }, { "id": "B0JH7vR2iGh", "title": "PMIC: Improving Multi-Agent Reinforcement Learning with Progressive Mutual Information Collaboration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning to collaborate is critical in multi-agent reinforcement learning (MARL). A branch of previous works proposes to promote collaboration by maximizing the correlation of agents\u2019 behaviors, which is typically characterised by mutual information (MI) in different forms. However, simply maximizing the MI of agents\u2019 behaviors cannot guarantee achieving better collaboration because suboptimal collaboration can also lead to high MI. In this paper, we first propose a new collaboration criterion to evaluate collaboration from three perspectives, which arrives at a form of the mutual information between global state and joint policy. This bypasses the introduction of explicit additional policy inputs and meanwhile mitigates the scalability issue. Moreover, to better leverage MI-based collaboration signals, we propose a novel MARL framework, called Progressive Mutual Information Collaboration (PMIC), which contains two main components. The first component is the Dual Progressive Collaboration Buffer (DPCB), which separately stores superior and inferior trajectories in a progressive manner. The second component is the Dual Mutual Information Estimator (DMIE), comprising two neural estimators of our newly designed MI based on separate samples in DPCB.
We then make use of the neural MI estimates to improve agents' policies: to maximize the MI lower bound associated with superior collaboration to facilitate better collaboration, and to minimize the MI upper bound associated with inferior collaboration to avoid falling into local optima. PMIC is general and can be combined with existing MARL algorithms. Experiments on a wide range of MARL benchmarks show the superior performance of PMIC compared with other MARL algorithms.", "keywords": "Multi-Agent Reinforcement Learning;Collaboration", "primary_area": "", "supplementary_material": "/attachment/017ff6b0f4079b9d5c4bb68c3ce730c84af45051.zip", "author": "Pengyi Li;Hongyao Tang;Tianpei Yang;Xiaotian Hao;Sang Tong;YAN ZHENG;Jianye HAO;Matthew E. Taylor;Jinyi Liu", "authorids": "~Pengyi_Li1;~Hongyao_Tang1;~Tianpei_Yang1;~Xiaotian_Hao1;~Sang_Tong1;~YAN_ZHENG1;~Jianye_HAO1;~Matthew_E._Taylor2;~Jinyi_Liu1", "gender": "M;M;F;M;F;M;M;;M", "homepage": "https://yeshenpy.github.io/;https://bluecontra.github.io/;https://tianpeiyang.github.io/;;https://github.com/SuiJiGuoChengSuiJiGuo;https://yanzzzzz.github.io;http://www.icdai.org/jianye.html;;https://irll.ca", "dblp": "195/6948;220/4275;184/8221;144/3359;;10/2381-2;21/7664.html;192/6688-2;46/4287.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;yIqzRH4AAAAJ;https://scholar.google.com/citations?hl=zh-CN;xgk9NPwAAAAJ;;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;;kaQS7NAAAAAJ;edQgLXcAAAAJ", "orcid": "0009-0009-8546-2346;;0000-0002-5497-7146;;;;0000-0002-0422-8235;;0000-0001-8946-0211", "linkedin": ";;tianpei-yang/;;;;;\u91d1\u6bc5-\u5218-5b7447118;", "or_profile": "~Pengyi_Li1;~Hongyao_Tang1;~Tianpei_Yang1;~Xiaotian_Hao1;~Sang_Tong1;~YAN_ZHENG1;~Jianye_HAO1;~Jinyi_Liu1;~Matthew_Taylor1", "aff": "Tianjin University;College of Intelligence and Computing, Tianjin University;University of Alberta;university of tianjin of china, Tianjin University;Inspir.ai;Tianjin University, China;Tianjin University;Tianjin University;Washington State University, Pullman", "aff_domain": "tju.edu.cn;tju.edu.cn;ualberta.ca;tju.edu.cn;inspirai.com;tju.edu.cn;tju.edu.cn;tju.edu.cn;wsu.edu", "position": "MS student;PhD student;Postdoc;PhD student;Researcher;Associate Professor;Associate Professor;MS student;Adjunct Professor", "bibtex": "@misc{\nli2022pmic,\ntitle={{PMIC}: Improving Multi-Agent Reinforcement Learning with Progressive Mutual Information Collaboration},\nauthor={Pengyi Li and Hongyao Tang and Tianpei Yang and Xiaotian Hao and Sang Tong and YAN ZHENG and Jianye HAO and Matthew E.
Taylor and Jinyi Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=B0JH7vR2iGh}\n}", "github": "", "project": "", "reviewers": "QvKR;iVbL;5fQS;vXVk", "site": "https://openreview.net/forum?id=B0JH7vR2iGh", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "66;265;85;90", "wc_summary_review": "26;78;33;126", "wc_main_review": "292;349;73;1511", "wc_review": "384;692;191;1727", "wc_reply_reviewers": "252;50;0;0", "wc_reply_authors": "1694;839;223;1709", "reply_reviewers": "2;2;0;0", "reply_authors": "4;2;1;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 126.5, 80.46272428895259 ], "wc_summary_review_avg": [ 65.75, 40.10221315588455 ], "wc_main_review_avg": [ 556.25, 560.7715109561112 ], "wc_review_avg": [ 748.5, 592.5202528184163 ], "wc_reply_reviewers_avg": [ 75.5, 103.92665683067074 ], "wc_reply_authors_avg": [ 1116.25, 624.4819352871626 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2755470732694105502&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0;2;0;0;0;3", "aff_unique_norm": "Tianjin University;University of Alberta;inspir.ai;Washington State University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.tju.edu.cn;https://www.ualberta.ca;;https://wsu.edu", "aff_unique_abbr": "TJU;UAlberta;;WSU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Tianjin;Pullman", "aff_country_unique_index": "0;0;1;0;0;0;0;3", "aff_country_unique": "China;Canada;;United States" }, { "title": "Neural Deep Equilibrium Solvers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5919", "id": "B0oHOwT5ENL", "poster": "", "openreview": "https://openreview.net/forum?id=B0oHOwT5ENL", "slides": "https://iclr.cc/virtual/2022/poster/5919", "video": "https://iclr.cc/virtual/2022/poster/5919", "author_site": "Shaojie Bai, Vladlen Koltun, Zico Kolter", "tldr": "", "abstract": "A deep equilibrium (DEQ) model abandons traditional depth by solving for the fixed point of a single nonlinear layer $f_\\theta$. This structure enables decoupling the internal structure of the layer (which controls representational capacity) from how the fixed point is actually computed (which impacts inference-time efficiency), which is usually via classic techniques such as Broyden's method or Anderson acceleration. In this paper, we show that one can exploit such decoupling and substantially enhance this fixed point computation using a custom neural solver. Specifically, our solver uses a parameterized network to both guess an initial value of the optimization and perform iterative updates, in a method that generalizes a learnable form of Anderson acceleration and can be trained end-to-end in an unsupervised manner. 
Such a solution is particularly well suited to the implicit model setting, because inference in these models requires repeatedly solving for a fixed point of the same nonlinear layer for different inputs, a task at which our network excels. Our experiments show that these neural equilibrium solvers are fast to train (only taking an extra 0.9-1.1% over the original DEQ's training time), require few additional parameters (1-3% of the original model size), yet lead to a $2\\times$ speedup in DEQ network inference without any degradation in accuracy across numerous domains and tasks.", "keywords": "Deep learning;Implicit models;Deep equilibrium models", "primary_area": "", "supplementary_material": "/attachment/fbaca583f2e35426db627e36a8de0d15aee37d4a.zip", "author": "Shaojie Bai;Vladlen Koltun;J Zico Kolter", "authorids": "~Shaojie_Bai1;~Vladlen_Koltun1;~J_Zico_Kolter1", "gender": "M;M;M", "homepage": "https://jerrybai1995.github.io;http://vladlen.info/;http://www.zicokolter.com", "dblp": ";66/5458.html;67/2526", "google_scholar": "DLVP3PcAAAAJ;kg4bCpgAAAAJ;UXh1I6UAAAAJ", "orcid": ";0000-0003-0858-0970;", "linkedin": ";vladlenkoltun/;", "or_profile": "~Shaojie_Bai1;~Vladlen_Koltun1;~Zico_Kolter1", "aff": "School of Computer Science, Carnegie Mellon University;Apple;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;apple.com;cmu.edu", "position": "PhD student;Distinguished Scientist;Full Professor", "bibtex": "@inproceedings{\nbai2022neural,\ntitle={Neural Deep Equilibrium Solvers},\nauthor={Shaojie Bai and Vladlen Koltun and J Zico Kolter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=B0oHOwT5ENL}\n}", "github": "", "project": "", "reviewers": "CYHx;eZVG;YWYD", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;4", "correctness": "4;3;4", "technical_novelty": "3;2;4", "empirical_novelty": "4;4;4", "wc_summary_paper": "107;104;105", "wc_summary_review": "44;92;31", "wc_main_review": "423;443;133", "wc_review": "574;639;269", "wc_reply_reviewers": "0;0;24", "wc_reply_authors": "665;989;454", "reply_reviewers": "0;0;1", "reply_authors": "1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 4.0, 0.0 ], "wc_summary_paper_avg": [ 105.33333333333333, 1.247219128924647 ], "wc_summary_review_avg": [ 55.666666666666664, 26.233989826601334 ], "wc_main_review_avg": [ 333.0, 141.65686240583852 ], "wc_review_avg": [ 494.0, 161.29682782580278 ], "wc_reply_reviewers_avg": [ 8.0, 11.313708498984761 ], "wc_reply_authors_avg": [ 702.6666666666666, 220.03080592397865 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11628830115303196547&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=B0oHOwT5ENL", "email": "cs.cmu.edu;apple.com;cmu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Apple", "aff_unique_dep": "School of Computer Science;Apple Inc.", "aff_unique_url": "https://www.cmu.edu;https://www.apple.com", "aff_unique_abbr": "CMU;Apple", "aff_campus_unique_index": "0", 
"aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "B2pZkS2urk_", "title": "Do What Nature Did To Us: Evolving Plastic Recurrent Neural Networks For Generalized Tasks", "track": "main", "status": "Reject", "tldr": "", "abstract": "While artificial neural networks (ANNs) have been widely adopted in machine learning, researchers are increasingly obsessed by the gaps between ANNs and natural neural networks (NNNs). In this paper, we propose a framework named as Evolutionary Plastic Recurrent Neural Networks (EPRNN). Inspired by NNN, EPRNN composes Evolution Strategies, Plasticity Rules, and Recursion-based Learning all in one meta learning framework for generalization to different tasks. More specifically, EPRNN incorporates with nested loops for meta learning --- an outer loop searches for optimal initial parameters of the neural network and learning rules; an inner loop adapts to specific tasks. In the inner loop of EPRNN, we effectively attain both long term memory and short term memory by forging plasticity with recursion-based learning mechanisms, both of which are believed to be responsible for the formation of memories in NNNs. The inner-loop setting closely simulate that of NNNs, which neither query from any gradient oracle for optimization nor require the exact forms of learning objectives. To evaluate the performance of EPRNN, we carry out extensive experiments in two groups of tasks: Sequence Predicting, and Wheeled Robot Navigating. The experiment results demonstrate the unique advantage of EPRNN compared to state-of-the-arts based on plasticity and recursion while yielding comparably good performance against deep learning based approaches in the tasks. The experiment results suggest the potential of EPRNN to generalize to variety of tasks and encourage more efforts in plasticity and recursion based learning mechanisms.", "keywords": "Evolving Plasticity;Learning to learn", "primary_area": "", "supplementary_material": "", "author": "Fan Wang;Hao Tian;Haoyi Xiong;hua wu;Yang Cao;Yu Kang;Haifeng Wang", "authorids": "~Fan_Wang4;tianhao@baidu.com;~Haoyi_Xiong1;~hua_wu1;~Yang_Cao5;kangduyu@ustc.edu.cn;~Haifeng_Wang3", "gender": "M;;M;F;M;;M", "homepage": ";;https://sites.google.com/site/haoyixiongshomepage/;https://wuhuanlp.github.io/;;;https://haifengwang.net/", "dblp": ";;06/2700;27/6045-3;25/7045-10;;10/5209-1.html", "google_scholar": "vgFErZQAAAAJ;;f_Kcie0AAAAJ;9X2ThuAAAAAJ;K7rTHNcAAAAJ;;jgy4jCAAAAAJ", "orcid": ";;;0000-0001-8254-1561;;;0000-0002-0672-7468", "linkedin": ";;;;;;", "or_profile": "~Fan_Wang4;tianhao@baidu.com;~Haoyi_Xiong1;~hua_wu1;~Yang_Cao5;kangduyu@ustc.edu.cn;~Haifeng_Wang3", "aff": "Baidu Inc.;;Baidu;Baidu;University of Science and Technology of China;;Baidu", "aff_domain": "baidu.com;;baidu.com;baidu.com;ustc.edu.cn;;baidu.com", "position": "Principal Researcher;;Principal Researcher;Principal Researcher;Associate Professor;;CTO", "bibtex": "@misc{\nwang2022do,\ntitle={Do What Nature Did To Us: Evolving Plastic Recurrent Neural Networks For Generalized Tasks},\nauthor={Fan Wang and Hao Tian and Haoyi Xiong and hua wu and Yang Cao and Yu Kang and Haifeng Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=B2pZkS2urk_}\n}", "github": "", "project": "", "reviewers": "byce;RBhw;mN1j;oHzr", "site": "https://openreview.net/forum?id=B2pZkS2urk_", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "2;4;3;3", "technical_novelty": "2;1;2;2", "empirical_novelty": 
"1;1;3;2", "wc_summary_paper": "31;46;72;33", "wc_summary_review": "59;16;27;27", "wc_main_review": "1183;69;305;143", "wc_review": "1273;131;404;203", "wc_reply_reviewers": "212;0;46;0", "wc_reply_authors": "239;0;184;52", "reply_reviewers": "1;0;1;0", "reply_authors": "1;0;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 45.5, 16.347782724271816 ], "wc_summary_review_avg": [ 32.25, 16.08376510646683 ], "wc_main_review_avg": [ 425.0, 445.87666456095235 ], "wc_review_avg": [ 502.75, 455.81924871597954 ], "wc_reply_reviewers_avg": [ 64.5, 87.20521773380307 ], "wc_reply_authors_avg": [ 118.75, 96.5333491597593 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5ZBq_IYOytMJ:scholar.google.com/&scioq=Do+What+Nature+Did+To+Us:+Evolving+Plastic+Recurrent+Neural+Networks+For+Generalized+Tasks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Baidu;University of Science and Technology of China", "aff_unique_dep": "Baidu Inc.;", "aff_unique_url": "https://www.baidu.com;http://www.ustc.edu.cn", "aff_unique_abbr": "Baidu;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "B31WdoD2VXQ", "title": "IDENTIFYING CONCEALED OBJECTS FROM VIDEOS", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Concealed objects are often hard to identify from still images, as often camouflaged objects exhibit patterns seamless to the background. In this work, we propose a novel video concealed object detection (VCOD) framework, called \\textbf{\\Ourmodel}, as the concealed state is likely to break when the object moves. The proposed SLT-Net leverages on both short-term dynamics and long-term temporal consistency to detect concealed objects in continuous video frames. Unlike previous methods that often utilize homography or optical flows to explicitly represent motions, we build a dense correlation volume to implicitly capture motions between neighbouring frames. To enforce the temporal consistency within a video sequence, we utilize a spatial-temporal transformer to jointly refine the short-term predictions. Extensive experiments on existing image and VCOD benchmarks demonstrate the architectural effectiveness of our approach. We further collect a large-scale VCOD dataset named MoCA-Mask with pixel-level handcrafted ground-truth masks and construct a comprehensive VCOD benchmark with previous methods. Videos and codes can be found at: https://anonymous.4open.science/r/long-short-vcod-C0AF/README.md. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/229813ebaa6b632bbbb008db3dcc7b3f93540fe9.zip", "author": "Xuelian Cheng;Huan Xiong;Deng-Ping Fan;Yiran Zhong;Mehrtash Harandi;Tom Drummond;Zongyuan Ge", "authorids": "~Xuelian_Cheng2;~Huan_Xiong1;~Deng-Ping_Fan1;~Yiran_Zhong1;~Mehrtash_Harandi2;~Tom_Drummond1;~Zongyuan_Ge1", "gender": "F;M;M;M;M;M;M", "homepage": "https://xueliancheng.github.io/;https://scholar.google.com/citations?user=l4hm14MAAAAJ&hl=en;https://dengpingfan.github.io/;;;https://research.monash.edu/en/persons/zongyuan-ge;https://sites.google.com/site/mehrtashharandi/", "dblp": "199/2381;;205/3148;158/9624;50/1633;147/2757;92/5921", "google_scholar": "gQ5kHH8AAAAJ;l4hm14MAAAAJ;kakwJ5QAAAAJ;https://scholar.google.com.sg/citations?user=E9NVOBUAAAAJ;https://scholar.google.com.au/citations?user=6sWGL5wAAAAJ;https://scholar.google.com.au/citations?user=Q0gUrcIAAAAJ;--M1XEkAAAAJ", "orcid": ";;0000-0002-5245-7518;;0000-0001-8204-5904;0000-0002-5880-8673;0000-0002-6937-6300", "linkedin": ";;deng-ping-fan-584b25198/;;;;mehrtash-harandi-b99358155/", "or_profile": "~Xuelian_Cheng2;~Huan_Xiong1;~Deng-Ping_Fan1;~Yiran_Zhong1;~Tom_Drummond1;~Zongyuan_Ge1;~Mehrtash_T._Harandi1", "aff": "Monash University;;ETHZ - ETH Zurich;SenseTime;University of Melbourne;Monash University;Monash University", "aff_domain": "monash.edu;;ethz.ch;sensetime.com;unimelb.edu.au;monash.edu;monash.edu", "position": "PhD student;;Postdoc;Researcher;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@misc{\ncheng2022identifying,\ntitle={{IDENTIFYING} {CONCEALED} {OBJECTS} {FROM} {VIDEOS}},\nauthor={Xuelian Cheng and Huan Xiong and Deng-Ping Fan and Yiran Zhong and Mehrtash Harandi and Tom Drummond and Zongyuan Ge},\nyear={2022},\nurl={https://openreview.net/forum?id=B31WdoD2VXQ}\n}", "github": "", "project": "", "reviewers": "raPk;ayMr;Fh9N;6mkd", "site": "https://openreview.net/forum?id=B31WdoD2VXQ", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;3;3", "correctness": "2;3;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "95;129;94;86", "wc_summary_review": "57;67;58;55", "wc_main_review": "579;124;210;133", "wc_review": "731;320;362;274", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.0, 16.537835408541227 ], "wc_summary_review_avg": [ 59.25, 4.602988159880492 ], "wc_main_review_avg": [ 261.5, 186.3310226451838 ], "wc_review_avg": [ 421.75, 181.2379306326355 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XlhIJGKabOkJ:scholar.google.com/&scioq=IDENTIFYING+CONCEALED+OBJECTS+FROM+VIDEOS&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0;0", "aff_unique_norm": "Monash University;ETH Zurich;SenseTime;University of Melbourne", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.monash.edu;https://www.ethz.ch;https://www.sensetime.com;https://www.unimelb.edu.au", "aff_unique_abbr": "Monash;ETHZ;SenseTime;UniMelb", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0;0", "aff_country_unique": "Australia;Switzerland;China" }, { "title": "Eliminating Sharp Minima from SGD with Truncated Heavy-tailed Noise", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6392", "id": "B3Nde6lvab", "poster": "", "openreview": "https://openreview.net/forum?id=B3Nde6lvab", "slides": "https://iclr.cc/virtual/2022/poster/6392", "video": "https://iclr.cc/virtual/2022/poster/6392", "author_site": "Xingyu Wang, Sewoong Oh, Chang-Han Rhee", "tldr": "", "abstract": "The empirical success of deep learning is often attributed to SGD\u2019s mysterious ability to avoid sharp local minima in the loss landscape, as sharp minima are known to lead to poor generalization. Recently, empirical evidence of heavy-tailed gradient noise was reported in many deep learning tasks; and it was shown in (Simsekli et al., 2019a;b) that SGD can escape sharp local minima under the presence of such heavy-tailed gradient noise, providing a partial solution to the mystery. In this work, we analyze a popular variant of SGD where gradients are truncated above a fixed threshold. We show that it achieves a stronger notion of avoiding sharp minima: it can effectively eliminate sharp local minima entirely from its training trajectory. We characterize the dynamics of truncated SGD driven by heavy-tailed noises. First, we show that the truncation threshold and width of the attraction field dictate the order of the first exit time from the associated local minimum. Moreover, when the objective function satisfies appropriate structural conditions, we prove that as the learning rate decreases, the dynamics of the heavy-tailed truncated SGD closely resemble those of a continuous-time Markov chain that never visits any sharp minima. 
Real data experiments on deep learning confirm our theoretical prediction that heavy-tailed SGD with gradient clipping finds flatter local minima and achieves better generalization.", "keywords": "Stochastic Gradient Descent;SGD;Heavy-Tails;Generalization", "primary_area": "", "supplementary_material": "/attachment/ae4cd522622a243461f59a82a126e19e97edc66b.zip", "author": "Xingyu Wang;Sewoong Oh;Chang-Han Rhee", "authorids": "~Xingyu_Wang2;~Sewoong_Oh1;~Chang-Han_Rhee1", "gender": "M;M;M", "homepage": ";https://homes.cs.washington.edu/~sewoong/;https://chrhee.github.io", "dblp": ";80/4366;", "google_scholar": ";55TAOdgAAAAJ;", "orcid": ";;0000-0002-1651-4677", "linkedin": "xingyu-wang-01483a128/;;", "or_profile": "~Xingyu_Wang2;~Sewoong_Oh1;~Chang-Han_Rhee1", "aff": "Northwestern University, Northwestern University;University of Washington;Northwestern University", "aff_domain": "u.northwestern.edu;uw.edu;northwestern.edu", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2022eliminating,\ntitle={Eliminating Sharp Minima from {SGD} with Truncated Heavy-tailed Noise},\nauthor={Xingyu Wang and Sewoong Oh and Chang-Han Rhee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=B3Nde6lvab}\n}", "github": "", "project": "", "reviewers": "HGyL;QXyo;kP7Y", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "3;3;5", "correctness": "3;3;3", "technical_novelty": "2;3;4", "empirical_novelty": "2;3;3", "wc_summary_paper": "51;64;84", "wc_summary_review": "73;32;23", "wc_main_review": "366;350;482", "wc_review": "490;446;589", "wc_reply_reviewers": "454;208;0", "wc_reply_authors": "6835;2287;2848", "reply_reviewers": "2;1;0", "reply_authors": "11;5;5", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 66.33333333333333, 13.572848714334887 ], "wc_summary_review_avg": [ 42.666666666666664, 21.761331658599286 ], "wc_main_review_avg": [ 399.3333333333333, 58.817986666967414 ], "wc_review_avg": [ 508.3333333333333, 59.801523577767 ], "wc_reply_reviewers_avg": [ 220.66666666666666, 185.56101123050846 ], "wc_reply_authors_avg": [ 3990.0, 2024.7138069366742 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 7.0, 2.8284271247461903 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.944911182523068, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1787999645865129435&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=B3Nde6lvab", "email": "u.northwestern.edu;uw.edu;northwestern.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Northwestern University;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.northwestern.edu;https://www.washington.edu", "aff_unique_abbr": "NU;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "B4uS3efOEW", "title": "Confidence Adaptive Regularization for Deep Learning with Noisy Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent studies on the memorization effects
of deep neural networks on noisy labels show that the networks first fit the correctly labeled training samples before memorizing the mislabeled samples. Motivated by this early-learning phenomenon, we propose a novel method to prevent memorization of the mislabeled samples. Unlike the existing approaches, which use confidence (captured by the winning score from the model prediction) to identify or ignore the mislabeled samples, we introduce an indicator branch to the original model and enable the model to produce a new confidence (i.e., indicating whether a sample is clean or mislabeled) for each sample. The confidence values are incorporated into the proposed loss function, which is learned to assign large values to correctly labeled samples and small values to mislabeled ones. We also discuss the limitation of our approach and propose an auxiliary regularization term to enhance the robustness of the model in challenging cases. Our empirical analysis shows that the model predicts correctly for both clean and mislabeled samples in the early learning phase. Based on the predictions in each iteration, we correct the noisy labels to steer the model towards corrected targets. Further, we provide a theoretical analysis and conduct numerous experiments on synthetic and real-world datasets, demonstrating that our approach achieves comparable or even better results than state-of-the-art methods.", "keywords": "noisy labels;regularization;label correction", "primary_area": "", "supplementary_material": "/attachment/30c3e552b1204a45a648e3e57f743b0bc0f6fa98.zip", "author": "Yangdi Lu;Yang Bo;Wenbo He", "authorids": "~Yangdi_Lu1;~Yang_Bo1;~Wenbo_He2", "gender": "M;;F", "homepage": "https://maclll.github.io/;;http://www.cas.mcmaster.ca/wenbohe/", "dblp": "202/7639;;", "google_scholar": "https://scholar.google.ca/citations?user=LJ0x1FAAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yangdi_Lu1;~Yang_Bo1;~Wenbo_He2", "aff": "McMaster University;;McMaster University", "aff_domain": "mcmaster.ca;;mcmaster.ca", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\nlu2022confidence,\ntitle={Confidence Adaptive Regularization for Deep Learning with Noisy Labels},\nauthor={Yangdi Lu and Yang Bo and Wenbo He},\nyear={2022},\nurl={https://openreview.net/forum?id=B4uS3efOEW}\n}", "github": "", "project": "", "reviewers": "RxaW;j8bG;fyLz;711k", "site": "https://openreview.net/forum?id=B4uS3efOEW", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;5;4;5", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "68;61;86;106", "wc_summary_review": "32;58;72;118", "wc_main_review": "239;419;314;339", "wc_review": "339;538;472;563", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.25, 17.440971876589906 ], "wc_summary_review_avg": [ 70.0, 31.20897306865447 ], "wc_main_review_avg": [ 327.75, 64.26264467013476 ], "wc_review_avg": [ 478.0, 86.86483753510393 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ],
"corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12281756243711133634&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "McMaster University", "aff_unique_dep": "", "aff_unique_url": "https://www.mcmaster.ca", "aff_unique_abbr": "McMaster", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Data Poisoning Won\u2019t Save You From Facial Recognition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7006", "id": "B5XahNLmna", "poster": "", "openreview": "https://openreview.net/forum?id=B5XahNLmna", "slides": "https://iclr.cc/virtual/2022/poster/7006", "video": "https://iclr.cc/virtual/2022/poster/7006", "author_site": "Evani Radiya-Dixit, Sanghyun Hong, Nicholas Carlini, Florian Tramer", "tldr": "", "abstract": "Data poisoning has been proposed as a compelling defense against facial recognition models trained on Web-scraped pictures. Users can perturb images they post online, so that models will misclassify future (unperturbed) pictures.\n \n We demonstrate that this strategy provides a false sense of security, as it ignores an inherent asymmetry between the parties: users' pictures are perturbed once and for all before being published (at which point they are scraped) and must thereafter fool all future models---including models trained adaptively against the users' past attacks, or models that use new technologies discovered after the attack.\n \nWe evaluate two systems for poisoning attacks against large-scale facial recognition, Fawkes (500,000+ downloads) and LowKey. We demonstrate how an \"oblivious\" model trainer can simply wait for future developments in computer vision to nullify the protection of pictures collected in the past. We further show that an adversary with black-box access to the attack can (i) train a robust model that resists the perturbations of collected pictures and (ii) detect poisoned pictures uploaded online.\n \nWe caution that facial recognition poisoning will not admit an \"arms race\" between attackers and defenders. 
Once perturbed pictures are scraped, the attack cannot be changed so any future successful defense irrevocably undermines users' privacy.", "keywords": "Poisoning attacks;adversarial examples;facial recognition;arms race;defenses", "primary_area": "", "supplementary_material": "", "author": "Evani Radiya-Dixit;Sanghyun Hong;Nicholas Carlini;Florian Tramer", "authorids": "~Evani_Radiya-Dixit1;~Sanghyun_Hong1;~Nicholas_Carlini1;~Florian_Tramer1", "gender": "F;M;;M", "homepage": ";http://www.sanghyun-hong.com;http://nicholas.carlini.com;http://floriantramer.com", "dblp": ";135/8991;145/1806;158/7224", "google_scholar": ";https://scholar.google.com/citations?hl=en;;https://scholar.google.ch/citations?user=ijH0-a8AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Evani_Radiya-Dixit1;~Sanghyun_Hong1;~Nicholas_Carlini1;~Florian_Tramer1", "aff": ";Oregon State University;Google;Google", "aff_domain": ";oregonstate.edu;google.com;google.com", "position": ";Assistant Professor;Researcher;Visiting Researcher", "bibtex": "@inproceedings{\nradiya-dixit2022data,\ntitle={Data Poisoning Won{\\textquoteright}t Save You From Facial Recognition},\nauthor={Evani Radiya-Dixit and Sanghyun Hong and Nicholas Carlini and Florian Tramer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=B5XahNLmna}\n}", "github": "", "project": "", "reviewers": "6s7m;8c4m;3JiV;6oYM", "pdf_size": 0, "recommendation": "1;6;8;8", "confidence": "5;4;3;5", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "63;96;152;32", "wc_summary_review": "42;71;61;47", "wc_main_review": "59;109;194;199", "wc_review": "164;276;407;278", "wc_reply_reviewers": "405;110;13;0", "wc_reply_authors": "995;285;395;185", "reply_reviewers": "2;1;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.75, 2.8613807855648994 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.75, 44.443081576326364 ], "wc_summary_review_avg": [ 55.25, 11.453711188955307 ], "wc_main_review_avg": [ 140.25, 58.988876069984585 ], "wc_review_avg": [ 281.25, 86.0214362818943 ], "wc_reply_reviewers_avg": [ 132.0, 163.24674575623246 ], "wc_reply_authors_avg": [ 465.0, 314.88092987667574 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5005202012851406, "corr_recommendation_correctness": 0.050443327230531826, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12334665611277654156&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=B5XahNLmna", "email": ";oregonstate.edu;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Oregon State University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://oregonstate.edu;https://www.google.com", "aff_unique_abbr": "OSU;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Object-Oriented Dynamics for Planning from Text", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7132", "id": "B6EIcyp-Rb7", "poster": "", 
"openreview": "https://openreview.net/forum?id=B6EIcyp-Rb7", "slides": "https://iclr.cc/virtual/2022/poster/7132", "video": "https://iclr.cc/virtual/2022/poster/7132", "author_site": "Guiliang Liu, Ashutosh Adhikari, Amir-massoud Farahmand, Pascal Poupart", "tldr": "", "abstract": "The advancement of dynamics models enables model-based planning in complex environments. Existing dynamics models commonly study image-based games with fully observable states. Generalizing these models to Text-Based Games (TBGs), which commonly describe the partially observable states with noisy text observations, is challenging. In this work, we propose an Object-Oriented Text Dynamics (OOTD) model that enables planning algorithms to solve decision-making problems in text domains. OOTD predicts a memory graph that dynamically remembers the history of object observations and filters object-irrelevant information. To facilitate the robustness of dynamics, our OOTD model identifies the objects influenced by input actions and predicts the belief of object states with independently parameterized transition layers. We develop variational objectives under the object-supervised and self-supervised settings to model the stochasticity of predicted dynamics. Empirical results show OOTD-based planner significantly outperforms model-free baselines in terms of sample efficiency and running scores.", "keywords": "Object Oriented Markov Decision Process;Reinforcement Learning;Model-Based Planning;Text-Based Games;Knowledge Extraction", "primary_area": "", "supplementary_material": "/attachment/b968a74e2830048aef8553379efb22c9302edb3f.zip", "author": "Guiliang Liu;Ashutosh Adhikari;Amir-massoud Farahmand;Pascal Poupart", "authorids": "~Guiliang_Liu1;~Ashutosh_Adhikari1;~Amir-massoud_Farahmand1;~Pascal_Poupart2", "gender": "M;;M;M", "homepage": "http://guiliang.me/;https://ashutosh-adhikari.github.io/;http://academic.sologen.net/;https://cs.uwaterloo.ca/~ppoupart", "dblp": "220/5411;230/3772;17/671;26/2122", "google_scholar": "CuMylvEAAAAJ;;https://scholar.google.ca/citations?user=G5SAV7gAAAAJ;https://scholar.google.ca/citations?user=KhAJWroAAAAJ", "orcid": ";;;", "linkedin": ";;amir-massoud-farahmand/;", "or_profile": "~Guiliang_Liu1;~Ashutosh_Adhikari1;~Amir-massoud_Farahmand1;~Pascal_Poupart2", "aff": "University of Waterloo / Vector Institute;Microsoft;Vector Institute;University of Waterloo", "aff_domain": "uwaterloo.ca;microsoft.com;vectorinstitute.ai;uwaterloo.ca", "position": "Postdoc;Applied Scientist;Faculty Member;Full Professor", "bibtex": "@inproceedings{\nliu2022learning,\ntitle={Learning Object-Oriented Dynamics for Planning from Text},\nauthor={Guiliang Liu and Ashutosh Adhikari and Amir-massoud Farahmand and Pascal Poupart},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=B6EIcyp-Rb7}\n}", "github": "", "project": "", "reviewers": "W88S;DWsm;enCT;cFdN", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;2;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "123;52;69;312", "wc_summary_review": "56;2;28;119", "wc_main_review": "710;199;324;577", "wc_review": "889;253;421;1008", "wc_reply_reviewers": "59;9;0;359", "wc_reply_authors": "1177;70;415;1386", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;3", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], 
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 139.0, 103.26422420180185 ], "wc_summary_review_avg": [ 51.25, 43.52800822459029 ], "wc_main_review_avg": [ 452.5, 201.60667151659442 ], "wc_review_avg": [ 642.75, 314.2947462176229 ], "wc_reply_reviewers_avg": [ 106.75, 147.360739343965 ], "wc_reply_authors_avg": [ 762.0, 538.7193146713787 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5443310539518174, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11040334031426121165&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=B6EIcyp-Rb7", "email": "uwaterloo.ca;microsoft.com;vectorinstitute.ai;uwaterloo.ca", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Waterloo;Microsoft;Vector Institute", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://uwaterloo.ca;https://www.microsoft.com;https://vectorinstitute.ai/", "aff_unique_abbr": "UW;Microsoft;Vector Institute", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Canada;United States" }, { "id": "B6YDcqpMk30", "title": "PRIMA: Planner-Reasoner Inside a Multi-task Reasoning Agent", "track": "main", "status": "Reject", "tldr": "", "abstract": "In multi-task reasoning (MTR), an agent can solve multiple tasks via (first-order) logic reasoning. This capability is essential for human-like intelligence due to its strong generalizability and simplicity for handling multiple tasks. However, a major challenge in developing effective MTR is the intrinsic conflict between reasoning capability and efficiency. An MTR-capable agent must master a large set of \"skills'' to perform diverse tasks, but executing a particular task at the inference stage requires only a small subset of immediately relevant skills. How can we maintain broad reasoning capability yet efficient specific-task performance? To address this problem, we propose a Planner-Reasoner framework capable of state-of-the-art MTR capability and high efficiency. The Reasoner models shareable (first-order) logic deduction rules, from which the Planner selects a subset to compose into efficient reasoning paths. 
The entire model is trained in an end-to-end manner using deep reinforcement learning, and experimental studies over various domains validate its effectiveness.", "keywords": "inductive logic programming;logic reasoning;first-order logic;reinforcement learning;Monte Carlo tree search", "primary_area": "", "supplementary_material": "", "author": "Daoming Lyu;Bo Liu;Jianshu Chen", "authorids": "~Daoming_Lyu1;~Bo_Liu2;~Jianshu_Chen1", "gender": "M;M;M", "homepage": "http://webhome.auburn.edu/~dzl0053/;https://liubo-cs.github.io/;https://chenjianshu.github.io/", "dblp": "199/2378;58/2670-6.html;11/3124", "google_scholar": "rnQv_ggAAAAJ;https://scholar.google.com/citations?hl=en;jQeFWdoAAAAJ", "orcid": ";0000-0003-2519-6196;", "linkedin": ";bo-liu-8b2b8118/;", "or_profile": "~Daoming_Lyu1;~Bo_Liu2;~Jianshu_Chen1", "aff": "Auburn University;Auburn University;Tencent AI Lab", "aff_domain": "auburn.edu;auburn.edu;tencent.com", "position": "PhD student;Assistant Professor;Principal Researcher", "bibtex": "@misc{\nlyu2022prima,\ntitle={{PRIMA}: Planner-Reasoner Inside a Multi-task Reasoning Agent},\nauthor={Daoming Lyu and Bo Liu and Jianshu Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=B6YDcqpMk30}\n}", "github": "", "project": "", "reviewers": "z1ve;Z6sN;eedL;Phsj", "site": "https://openreview.net/forum?id=B6YDcqpMk30", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;3;3;3", "correctness": "2;3;4;3", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "47;66;103;324", "wc_summary_review": "74;31;47;92", "wc_main_review": "198;178;610;521", "wc_review": "319;275;760;937", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 135.0, 110.96170510586073 ], "wc_summary_review_avg": [ 61.0, 23.590252224170897 ], "wc_main_review_avg": [ 376.75, 191.4854759505274 ], "wc_review_avg": [ 572.75, 283.1893138873711 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1406013220258911503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Auburn University;Tencent", "aff_unique_dep": ";Tencent AI Lab", "aff_unique_url": "https://www.auburn.edu;https://ai.tencent.com", "aff_unique_abbr": "Auburn;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;China" }, { "title": "Taming Sparsely Activated Transformer with Stochastic Experts", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6179", "id": "B72HXs80q4", "poster": "", "openreview": "https://openreview.net/forum?id=B72HXs80q4", "slides": "https://iclr.cc/virtual/2022/poster/6179", "video": "https://iclr.cc/virtual/2022/poster/6179", "author_site": "Simiao Zuo, Xiaodong Liu, Jian Jiao, Young Jin Kim, Hany Hassan Awadalla, Ruofei Zhang, Jianfeng Gao, Tuo Zhao", "tldr": "", "abstract": "Sparsely activated models 
(SAMs), such as Mixture-of-Experts (MoE), can easily scale to have outrageously large amounts of parameters without a significant increase in computational cost. However, SAMs are reported to be parameter-inefficient, such that larger models do not always lead to better performance. While most ongoing research focuses on improving SAMs by exploring methods of routing inputs to experts, our analysis reveals that such research might not lead to the solution we expect, i.e., the commonly-used routing methods based on gating mechanisms do not work better than randomly routing inputs to experts. In this paper, we propose a new expert-based model, THOR ($\\underline{\\textbf{T}}$ransformer wit$\\underline{\\textbf{H}}$ St$\\underline{\\textbf{O}}$chastic Expe$\\underline{\\textbf{R}}$ts). Unlike classic expert-based models, such as the Switch Transformer, experts in THOR are randomly activated for each input during training and inference. THOR models are trained using a consistency regularized loss, where experts learn not only from training data but also from other experts as teachers, such that all the experts make consistent predictions. We validate the effectiveness of THOR on machine translation tasks. Results show that THOR models are more parameter-efficient in that they significantly outperform the Transformer and MoE models across various settings. For example, in multilingual translation, THOR outperforms the Switch Transformer by 2 BLEU points, and obtains the same BLEU score as that of a state-of-the-art MoE model that is 18 times larger. Our code is publicly available at: https://github.com/microsoft/Stochastic-Mixture-of-Experts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simiao Zuo;Xiaodong Liu;Jian Jiao;Young Jin Kim;Hany Hassan;Ruofei Zhang;Jianfeng Gao;Tuo Zhao", "authorids": "~Simiao_Zuo1;~Xiaodong_Liu1;~Jian_Jiao2;~Young_Jin_Kim1;~Hany_Hassan1;~Ruofei_Zhang1;~Jianfeng_Gao1;~Tuo_Zhao1", "gender": ";;M;M;M;M;M;M", "homepage": ";;;https://www.microsoft.com/en-us/research/people/youki/;;;https://www.microsoft.com/en-us/research/people/jfgao/;http://www2.isye.gatech.edu/~tzhao80", "dblp": "232/2089;65/622;29/265-7.html;00/8110-1.html;83/64;36/2351.html;92/5339;", "google_scholar": "J8TSTXMAAAAJ;NIewcxMAAAAJ;D6KwmF8AAAAJ;;;;https://scholar.google.com/citations?hl=en;EJXN6tYAAAAJ", "orcid": ";;0000-0003-4779-9588;;;;;", "linkedin": ";;jian-jiao-82897810/;ykim362/;;;;", "or_profile": "~Simiao_Zuo1;~Xiaodong_Liu1;~Jian_Jiao2;~Young_Jin_Kim1;~Hany_Hassan1;~Ruofei_Zhang1;~Jianfeng_Gao1;~Tuo_Zhao1", "aff": "Georgia Institute of Technology;Microsoft Research;Microsoft;Microsoft;Microsoft;Microsoft;Microsoft Research;Georgia Institute of Technology", "aff_domain": "gatech.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;gatech.edu", "position": "PhD student;Researcher;Principal Researcher;Principal Researcher;Research Scientist;Researcher;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nzuo2022taming,\ntitle={Taming Sparsely Activated Transformer with Stochastic Experts},\nauthor={Simiao Zuo and Xiaodong Liu and Jian Jiao and Young Jin Kim and Hany Hassan and Ruofei Zhang and Jianfeng Gao and Tuo Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=B72HXs80q4}\n}", "github": "", "project": "", "reviewers": "zyWV;PXVZ;PQMm;hGSm", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;5;3;4", "correctness":
"3;3;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;4;3;4", "wc_summary_paper": "73;89;55;59", "wc_summary_review": "33;41;17;35", "wc_main_review": "991;984;75;248", "wc_review": "1097;1114;147;342", "wc_reply_reviewers": "31;0;0;95", "wc_reply_authors": "798;1030;151;368", "reply_reviewers": "1;0;0;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 69.0, 13.341664064126334 ], "wc_summary_review_avg": [ 31.5, 8.874119674649425 ], "wc_main_review_avg": [ 574.5, 417.5119758761418 ], "wc_review_avg": [ 675.0, 436.0269487084485 ], "wc_reply_reviewers_avg": [ 31.5, 38.78466191679386 ], "wc_reply_authors_avg": [ 586.75, 345.98654236834125 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2351258339090586276&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=B72HXs80q4", "email": "gatech.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;gatech.edu", "author_num": 8, "aff_unique_index": "0;1;1;1;1;1;1;0", "aff_unique_norm": "Georgia Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.gatech.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Georgia Tech;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "B7O85qTDgU4", "title": "Meta-Learning Dynamics Forecasting Using Task Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Current deep learning models for dynamics forecasting struggle with generalization. They can only forecast in a specific domain and fail when applied to systems with different parameters, external forces, or boundary conditions. We propose a model-based meta-learning method called DyAd which can generalize across heterogeneous domains by partitioning them into different tasks. DyAd has two parts: an encoder which infers the time-invariant hidden features of the task with weak supervision, and a forecaster which learns the shared dynamics of the entire domain. The encoder adapts and controls the forecaster during inference using adaptive instance normalization and adaptive padding. Theoretically, we prove that the generalization error of such procedure is related to the task relatedness in the source domain, as well as the domain differences between source and target. Experimentally, we demonstrate that our model outperforms state-of-the-art approaches on both turbulent flow and real-world ocean data forecasting tasks. 
", "keywords": "meta-learning;generalizability;dynamical systems", "primary_area": "", "supplementary_material": "/attachment/ab2d57fd0c6bb557ed7514defec467fc10b94430.zip", "author": "Rui Wang;Robin Walters;Rose Yu", "authorids": "~Rui_Wang11;~Robin_Walters1;~Rose_Yu1", "gender": "M;M;F", "homepage": "https://rui1521.github.io/online-cv/;http://www.robinwalters.com;http://roseyu.com", "dblp": "06/2293-86;258/3416;164/7314", "google_scholar": "lEmjtfIAAAAJ;fnprJmUAAAAJ;", "orcid": ";;", "linkedin": "rui-ray-wang-41a398149/;;", "or_profile": "~Rui_Wang11;~Robin_Walters1;~Rose_Yu1", "aff": "University of California, San Diego;Northeastern University ;University of California, San Diego", "aff_domain": "ucsd.edu;northeastern.edu;ucsd.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nwang2022metalearning,\ntitle={Meta-Learning Dynamics Forecasting Using Task Inference},\nauthor={Rui Wang and Robin Walters and Rose Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=B7O85qTDgU4}\n}", "github": "", "project": "", "reviewers": "SrGv;nhax;cEq7;4kg2", "site": "https://openreview.net/forum?id=B7O85qTDgU4", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "88;104;55;54", "wc_summary_review": "231;57;22;22", "wc_main_review": "400;1095;178;160", "wc_review": "719;1256;255;236", "wc_reply_reviewers": "142;185;0;0", "wc_reply_authors": "1383;1586;351;266", "reply_reviewers": "1;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.25, 21.510172012329424 ], "wc_summary_review_avg": [ 83.0, 86.63428882376769 ], "wc_main_review_avg": [ 458.25, 379.58422978306146 ], "wc_review_avg": [ 616.5, 416.81200798441495 ], "wc_reply_reviewers_avg": [ 81.75, 83.1515934904437 ], "wc_reply_authors_avg": [ 896.5, 593.1258298202836 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3740432482666416329&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, San Diego;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.northeastern.edu", "aff_unique_abbr": "UCSD;NEU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Recycling Model Updates in Federated Learning: Are Gradient Subspaces Low-Rank?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6546", "id": "B7ZbqNLDn-_", "poster": "", "openreview": "https://openreview.net/forum?id=B7ZbqNLDn-_", "slides": "https://iclr.cc/virtual/2022/poster/6546", "video": "https://iclr.cc/virtual/2022/poster/6546", "author_site": "Sheikh Shams Azam, Seyyedali Hosseinalipour, Qiang Qiu, Christopher Brinton", "tldr": "", "abstract": "In this paper, we question the rationale behind propagating large numbers of parameters through a distributed 
system during federated learning. We start by examining the rank characteristics of the subspace spanned by gradients (i.e., the gradient-space) in centralized model training, and observe that the gradient-space often consists of a few leading principal components accounting for an overwhelming majority (95-99%) of the explained variance. Motivated by this, we propose the \"Look-back Gradient Multiplier\" (LBGM) algorithm, which utilizes this low-rank property of the gradient-space in federated learning. Operationally, LBGM recycles the gradients between model update rounds to significantly reduce the number of parameters to be propagated through the system. We analytically characterize the convergence behavior of LBGM, revealing the nature of the trade-off between communication savings and model performance. Our subsequent experimental results demonstrate the improvement LBGM obtains on communication overhead compared to federated learning baselines. Additionally, we show that LBGM is a general plug-and-play algorithm that can be used standalone or stacked on top of existing sparsification techniques for distributed model training.", "keywords": "Distributed Machine Learning;Federated Learning;Gradient Subspace;SGD", "primary_area": "", "supplementary_material": "", "author": "Sheikh Shams Azam;Seyyedali Hosseinalipour;Qiang Qiu;Christopher Brinton", "authorids": "~Sheikh_Shams_Azam1;~Seyyedali_Hosseinalipour1;~Qiang_Qiu1;~Christopher_Brinton1", "gender": "M;;;", "homepage": "https://shams.pairml.com/;;https://web.ics.purdue.edu/~qqiu/;https://www.cbrinton.net/", "dblp": "218/6739;;97/360;", "google_scholar": "nLEZZDwAAAAJ;;jdLtt_YAAAAJ;vWmHA5MAAAAJ", "orcid": "0000-0001-7678-5092;;;", "linkedin": "sshamsazam/;;;", "or_profile": "~Sheikh_Shams_Azam1;~Seyyedali_Hosseinalipour1;~Qiang_Qiu1;~Christopher_Brinton1", "aff": "Purdue University;;Purdue University;Purdue University", "aff_domain": "purdue.edu;;purdue.edu;purdue.edu", "position": "PhD student;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nazam2022recycling,\ntitle={Recycling Model Updates in Federated Learning: Are Gradient Subspaces Low-Rank?},\nauthor={Sheikh Shams Azam and Seyyedali Hosseinalipour and Qiang Qiu and Christopher Brinton},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=B7ZbqNLDn-_}\n}", "github": "", "project": "", "reviewers": "X8h7;FQxe;LE1r;X6Uf", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "85;43;224;115", "wc_summary_review": "26;33;80;42", "wc_main_review": "563;613;1105;192", "wc_review": "674;689;1409;349", "wc_reply_reviewers": "217;0;53;174", "wc_reply_authors": "2394;539;1917;1092", "reply_reviewers": "1;0;1;2", "reply_authors": "5;1;4;3", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 116.75, 66.99393629277205 ], "wc_summary_review_avg": [ 45.25, 20.849160654568326 ], "wc_main_review_avg": [ 618.25, 324.6901407496076 ], "wc_review_avg": [ 780.25, 387.59474648142486 ], "wc_reply_reviewers_avg": [ 111.0, 87.87775600230128 ], "wc_reply_authors_avg": [ 1485.5, 718.0301177527305 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.25, 
1.479019945774904 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11357128239739448107&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=B7ZbqNLDn-_", "email": "purdue.edu;;purdue.edu;purdue.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "B7abCaIiN_v", "title": "Triangular Dropout: Variable Network Width without Retraining", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the most fundamental design choices in neural networks is layer width: it affects what a network has the capacity to learn and determines the complexity of the solution. This latter property is often exploited when introducing information bottlenecks, forcing a network to learn compressed representations. However, such an architecture decision is typically immutable once training begins; switching to a more compressed architecture requires retraining. In this paper we present a new layer design, called Triangular Dropout, which does not have this limitation. After training, the layer can be arbitrarily reduced in width to exchange performance for narrowness. We demonstrate the construction and potential use cases of such a mechanism in three areas. Firstly, we describe the formulation of Triangular Dropout in autoencoders, creating an MNIST autoencoder with selectable compression after training. Secondly, we add Triangular Dropout to VGG19 on ImageNet, creating a powerful network which, without retraining, can be significantly reduced in parameters with only small changes to classification accuracy.
Lastly, we explore the application of Triangular Dropout to reinforcement learning (RL) policies on selected control problems, showing that it can be used to characterize the complexity of RL tasks, a critical measurement in multitask learning and lifelong-learning domains.", "keywords": "architecture;compression;variable network;neural network design;deep learning", "primary_area": "", "supplementary_material": "", "author": "Edward W Staley;Corban G Rivera;Neil Joshi", "authorids": "~Edward_W_Staley1;corban.rivera@jhuapl.edu;neil.joshi@jhuapl.edu", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "edward-staley-7284874b/;;", "or_profile": "~Edward_W_Staley1;corban.rivera@jhuapl.edu;neil.joshi@jhuapl.edu", "aff": "Johns Hopkins University Applied Physics Laboratory;;", "aff_domain": "jhuapl.edu;;", "position": "Researcher;;", "bibtex": "@misc{\nstaley2022triangular,\ntitle={Triangular Dropout: Variable Network Width without Retraining},\nauthor={Edward W Staley and Corban G Rivera and Neil Joshi},\nyear={2022},\nurl={https://openreview.net/forum?id=B7abCaIiN_v}\n}", "github": "", "project": "", "reviewers": "yaJC;Te1U;CzhB;8MmW", "site": "https://openreview.net/forum?id=B7abCaIiN_v", "pdf_size": 0, "recommendation": "1;5;5;6", "confidence": "5;4;3;5", "correctness": "3;3;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "71;60;67;95", "wc_summary_review": "33;65;35;101", "wc_main_review": "445;888;223;601", "wc_review": "549;1013;325;797", "wc_reply_reviewers": "0;0;70;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.920286436967152 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 73.25, 13.160072188251856 ], "wc_summary_review_avg": [ 58.5, 27.617928959282953 ], "wc_main_review_avg": [ 539.25, 242.04170611694175 ], "wc_review_avg": [ 671.0, 258.57300709857554 ], "wc_reply_reviewers_avg": [ 17.5, 30.31088913245535 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3532809023904868, "corr_recommendation_correctness": 0.5261522196019801, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18371491451793468641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Applied Physics Laboratory", "aff_unique_url": "https://www.jhuapl.edu", "aff_unique_abbr": "JHU APL", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Relating transformers to models and neural representations of the hippocampal formation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6743", "id": "B8DVo9B1YE0", "poster": "", "openreview": "https://openreview.net/forum?id=B8DVo9B1YE0", "slides": "https://iclr.cc/virtual/2022/poster/6743", "video": "https://iclr.cc/virtual/2022/poster/6743", "author_site": "James Whittington, Joseph Warren, Timothy Behrens", "tldr": "", "abstract": "Many deep neural network architectures loosely based on brain networks have recently been shown to replicate neural firing patterns observed in the brain. 
One of the most exciting and promising novel architectures, the Transformer neural network, was developed without the brain in mind. In this work, we show that transformers, when equipped with recurrent position encodings, replicate the precisely tuned spatial representations of the hippocampal formation; most notably place and grid cells. Furthermore, we show that this result is no surprise since it is closely related to current hippocampal models from neuroscience. We additionally show the transformer version offers dramatic performance gains over the neuroscience version. This work continues to bind computations of artificial and brain networks, offers a novel understanding of the hippocampal-cortical interaction, and suggests how wider cortical areas may perform complex tasks beyond current neuroscience models such as language comprehension.", "keywords": "Neuroscience;representation learning;hippocampus;cortex;transformers", "primary_area": "", "supplementary_material": "", "author": "James C. R. Whittington;Joseph Warren;Tim E.J. Behrens", "authorids": "~James_C._R._Whittington1;joseph.warren@ucl.ac.uk;behrens@fmrib.ox.ac.uk", "gender": ";;", "homepage": "http://www.jcrwhittington.com;;", "dblp": "198/7308;;", "google_scholar": "https://scholar.google.co.uk/citations?user=zUu0JKYAAAAJ;;", "orcid": "0000-0001-5680-5586;;", "linkedin": ";;", "or_profile": "~James_C._R._Whittington1;joseph.warren@ucl.ac.uk;behrens@fmrib.ox.ac.uk", "aff": "University of Oxford;;", "aff_domain": "oxford.ac.uk;;", "position": "Postdoc;;", "bibtex": "@inproceedings{\nwhittington2022relating,\ntitle={Relating transformers to models and neural representations of the hippocampal formation},\nauthor={James C. R. Whittington and Joseph Warren and Tim E.J. Behrens},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=B8DVo9B1YE0}\n}", "github": "", "project": "", "reviewers": "hKkg;skR7;RQx3;K1VL", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;4;4", "correctness": "3;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "134;81;58;173", "wc_summary_review": "195;72;25;27", "wc_main_review": "656;215;502;376", "wc_review": "985;368;585;576", "wc_reply_reviewers": "266;85;95;89", "wc_reply_authors": "2973;507;1709;978", "reply_reviewers": "3;2;1;2", "reply_authors": "7;3;5;4", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 111.5, 44.947191235938206 ], "wc_summary_review_avg": [ 79.75, 69.14251586397475 ], "wc_main_review_avg": [ 437.25, 162.16561750260132 ], "wc_review_avg": [ 628.5, 223.3836386130372 ], "wc_reply_reviewers_avg": [ 133.75, 76.43747444807423 ], "wc_reply_authors_avg": [ 1541.75, 930.7215950540742 ], "reply_reviewers_avg": [ 2.0, 0.7071067811865476 ], "reply_authors_avg": [ 4.75, 1.479019945774904 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1471152261845071335&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=B8DVo9B1YE0", "email": "oxford.ac.uk;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", 
"aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "B9LUI0pZFGc", "title": "The KFIoU Loss for Rotated Object Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As a fundamental building block for visual analysis across aerial images, scene text etc., rotated object detection has established itself an emerging area, which is more general than classic horizontal object detection. Differing from the horizontal detection case whereby the alignment between final detection performance and regression loss is well kept thanks to the differentiable IoU loss, rotation detection involves the so-called SkewIoU that is undifferentiable. In this paper, we design a novel approximate SkewIoU loss based on Kalman filter, namely KFIoU loss. To avoid the standing and well-known boundary discontinuity and square-like problems, we convert the rotating bounding box into a Gaussian distribution, in line with recent Gaussian-based rotation detection works. Then we use the center loss to narrow the distance between the center of the two Gaussian distributions, followed by calculating the overlap area under the new position through Kalman filter. We qualitatively show the value consistency between KFIoU loss and the SkewIoU loss for rotation detection in different cases. We further extend our technique to the 3-D case which also suffers from the same issues as 2-D object detection. Extensive experimental results on various public datasets (2-D/3-D, aerial/text images) with different base detectors show the effectiveness of our approach. The source code will be made public available.", "keywords": "Rotation Detection;SkewIoU loss;Kalman Filter", "primary_area": "", "supplementary_material": "", "author": "Xue Yang;Gefan Zhang;Jirui Yang;Junchi Yan", "authorids": "~Xue_Yang2;~Gefan_Zhang1;~Jirui_Yang1;~Junchi_Yan2", "gender": "M;M;M;M", "homepage": "https://yangxue.site/;https://github.com/zhanggefan;;http://thinklab.sjtu.edu.cn/", "dblp": "13/1779-5;;;60/7949.html", "google_scholar": "2xTlvV0AAAAJ;;https://scholar.google.com/citations?hl=zh-CN;ga230VoAAAAJ", "orcid": "0000-0002-7084-9101;;;0000-0001-9639-7679", "linkedin": ";;;", "or_profile": "~Xue_Yang2;~Gefan_Zhang1;~Jirui_Yang1;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Alibaba Group;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;alibaba-inc.com;sjtu.edu.cn", "position": "PhD student;MS student;Researcher;Associate Professor", "bibtex": "@misc{\nyang2022the,\ntitle={The {KFI}oU Loss for Rotated Object Detection},\nauthor={Xue Yang and Gefan Zhang and Jirui Yang and Junchi Yan},\nyear={2022},\nurl={https://openreview.net/forum?id=B9LUI0pZFGc}\n}", "github": "", "project": "", "reviewers": "qdSN;qnDZ;L9fE;Kacf", "site": "https://openreview.net/forum?id=B9LUI0pZFGc", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "3;4;3;4", "correctness": "3;2;2;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "47;62;62;104", "wc_summary_review": "36;12;94;54", "wc_main_review": "78;66;286;211", "wc_review": "161;140;442;369", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "449;355;345;381", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": 
[ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 21.25294097295713 ], "wc_summary_review_avg": [ 49.0, 29.949958263743873 ], "wc_main_review_avg": [ 160.25, 92.24525733066172 ], "wc_review_avg": [ 278.0, 130.29773597419106 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 382.5, 40.58016756988566 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 249, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4612145705765974530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "SJTU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "BAtutOziapg", "title": "Can Stochastic Gradient Langevin Dynamics Provide Differential Privacy for Deep Learning?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian learning via Stochastic Gradient Langevin Dynamics (SGLD) has been suggested for differentially private learning. While previous research provides differential privacy bounds for SGLD when close to convergence or at the initial steps of the algorithm, the question of what differential privacy guarantees can be made in between remains unanswered. This interim region is essential, especially for Bayesian neural networks, as it is hard to guarantee convergence to the posterior. This paper will show that using SGLD might result in unbounded privacy loss for this interim region, even when sampling from the posterior is as differentially private as desired.", "keywords": "differential privacy;bayesian inference;sgld", "primary_area": "", "supplementary_material": "", "author": "Guy Heller;Ethan Fetaya", "authorids": "~Guy_Heller1;~Ethan_Fetaya1", "gender": ";M", "homepage": "https://www.linkedin.com/in/guy-heller-891327134;http://www.cs.toronto.edu/~ethanf/", "dblp": ";01/10046", "google_scholar": ";zLuqh-0AAAAJ", "orcid": ";0000-0003-3125-1665", "linkedin": ";", "or_profile": "~Guy_Heller1;~Ethan_Fetaya1", "aff": "Bar Ilan University;Bar Ilan University", "aff_domain": "biu.ac.il;biu.ac.il", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nheller2022can,\ntitle={Can Stochastic Gradient Langevin Dynamics Provide Differential Privacy for Deep Learning?},\nauthor={Guy Heller and Ethan Fetaya},\nyear={2022},\nurl={https://openreview.net/forum?id=BAtutOziapg}\n}", "github": "", "project": "", "reviewers": "1F53;gB6E;AyXE;48Es", "site": "https://openreview.net/forum?id=BAtutOziapg", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;3", "correctness": "3;4;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "0;2;0;0", "wc_summary_paper": "37;42;73;57", "wc_summary_review": "54;114;53;53", "wc_main_review": "309;127;184;380", "wc_review": "400;283;310;490", "wc_reply_reviewers": "0;0;0;34", "wc_reply_authors": "267;336;442;495", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.5, 0.8660254037844386 ], 
"wc_summary_paper_avg": [ 52.25, 14.060138690638865 ], "wc_summary_review_avg": [ 68.5, 26.272609310839304 ], "wc_main_review_avg": [ 250.0, 99.83235948328579 ], "wc_review_avg": [ 370.75, 81.34302367136348 ], "wc_reply_reviewers_avg": [ 8.5, 14.722431864335457 ], "wc_reply_authors_avg": [ 385.0, 88.9859539478001 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2947612589870667797&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Bar-Ilan University", "aff_unique_dep": "", "aff_unique_url": "https://www.biu.ac.il", "aff_unique_abbr": "BIU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Scalable Sampling for Nonsymmetric Determinantal Point Processes", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7097", "id": "BB4e8Atc1eR", "poster": "", "openreview": "https://openreview.net/forum?id=BB4e8Atc1eR", "slides": "https://iclr.cc/virtual/2022/poster/7097", "video": "https://iclr.cc/virtual/2022/poster/7097", "author_site": "Insu Han, Mike Gartrell, Jennifer Gillenwater, Elvis Dohmatob, Amin Karbasi", "tldr": "", "abstract": "A determinantal point process (DPP) on a collection of $M$ items is a model, parameterized by a symmetric kernel matrix, that assigns a probability to every subset of those items. Recent work shows that removing the kernel symmetry constraint, yielding nonsymmetric DPPs (NDPPs), can lead to significant predictive performance gains for machine learning applications. However, existing work leaves open the question of scalable NDPP sampling. There is only one known DPP sampling algorithm, based on Cholesky decomposition, that can directly apply to NDPPs as well. Unfortunately, its runtime is cubic in $M$, and thus does not scale to large item collections. In this work, we first note that this algorithm can be transformed into a linear-time one for kernels with low-rank structure. Furthermore, we develop a scalable sublinear-time rejection sampling algorithm by constructing a novel proposal distribution. Additionally, we show that imposing certain structural constraints on the NDPP kernel enables us to bound the rejection rate in a way that depends only on the kernel rank. 
In our experiments we compare the speed of all of these samplers for a variety of real-world tasks.", "keywords": "determinantal point processes;sampling", "primary_area": "", "supplementary_material": "/attachment/7b1db943c21151769505bd80e41b0039fb0352e8.zip", "author": "Insu Han;Mike Gartrell;Jennifer Gillenwater;Elvis Dohmatob;amin karbasi", "authorids": "~Insu_Han1;~Mike_Gartrell1;~Jennifer_Gillenwater1;~Elvis_Dohmatob1;~amin_karbasi1", "gender": "M;M;F;M;M", "homepage": "https://insuhan.github.io/;https://cgartrel.github.io;http://jgillenw.com;http://dohmatob.github.io/;http://seas.yale.edu/faculty-research/faculty-directory/amin-karbasi", "dblp": "160/8272;75/3021;73/3828;134/9794;49/7411", "google_scholar": "0w39xsoAAAAJ;NX6eiWYAAAAJ;5lUnZgsAAAAJ;https://scholar.google.fr/citations?user=FDWgJY8AAAAJ;https://scholar.google.com.tw/citations?user=VusVB38AAAAJ", "orcid": ";;;;", "linkedin": ";mikegartrell/;;;", "or_profile": "~Insu_Han1;~Mike_Gartrell1;~Jennifer_Gillenwater1;~Elvis_Dohmatob1;~amin_karbasi1", "aff": ";Criteo AI Lab;Google;Meta Facebook;Google", "aff_domain": ";criteo.com;google.com;facebook.com;google.com", "position": ";Senior Researcher;Research Scientist;Researcher;Researcher", "bibtex": "@inproceedings{\nhan2022scalable,\ntitle={Scalable Sampling for Nonsymmetric Determinantal Point Processes},\nauthor={Insu Han and Mike Gartrell and Jennifer Gillenwater and Elvis Dohmatob and amin karbasi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BB4e8Atc1eR}\n}", "github": "", "project": "", "reviewers": "ccEk;uhQa;MEEx;Tzr3", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;3;4;3", "correctness": "4;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "293;89;75;149", "wc_summary_review": "22;55;45;43", "wc_main_review": "228;152;256;284", "wc_review": "543;296;376;476", "wc_reply_reviewers": "26;110;45;23", "wc_reply_authors": "119;333;193;143", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 151.5, 86.29455370995322 ], "wc_summary_review_avg": [ 41.25, 12.007809958522827 ], "wc_main_review_avg": [ 230.0, 49.193495504995376 ], "wc_review_avg": [ 422.75, 94.26922880770798 ], "wc_reply_reviewers_avg": [ 51.0, 35.092734290733176 ], "wc_reply_authors_avg": [ 197.0, 82.93370846646133 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15995544588787166682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=BB4e8Atc1eR", "email": ";criteo.com;google.com;facebook.com;google.com", "author_num": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Criteo;Google;Meta", "aff_unique_dep": "Criteo AI Lab;Google;Meta Platforms, Inc.", "aff_unique_url": "https://www.criteo.com;https://www.google.com;https://meta.com", "aff_unique_abbr": "Criteo;Google;Meta", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "France;United States" }, { "title": "Lipschitz-constrained Unsupervised Skill 
Discovery", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6488", "id": "BGvt0ghNgA", "poster": "", "openreview": "https://openreview.net/forum?id=BGvt0ghNgA", "slides": "https://iclr.cc/virtual/2022/poster/6488", "video": "https://iclr.cc/virtual/2022/poster/6488", "author_site": "Seohong Park, Jongwook Choi, Jaekyeom Kim, Honglak Lee, Gunhee Kim", "tldr": "", "abstract": "We study the problem of unsupervised skill discovery, whose goal is to learn a set of diverse and useful skills with no external reward. There have been a number of skill discovery methods based on maximizing the mutual information (MI) between skills and states. However, we point out that their MI objectives usually prefer static skills to dynamic ones, which may hinder the application for downstream tasks. To address this issue, we propose Lipschitz-constrained Skill Discovery (LSD), which encourages the agent to discover more diverse, dynamic, and far-reaching skills. Another benefit of LSD is that its learned representation function can be utilized for solving goal-following downstream tasks even in a zero-shot manner \u2014 i.e., without further training or complex planning. Through experiments on various MuJoCo robotic locomotion and manipulation environments, we demonstrate that LSD outperforms previous approaches in terms of skill diversity, state space coverage, and performance on seven downstream tasks including the challenging task of following multiple goals on Humanoid. Our code and videos are available at https://shpark.me/projects/lsd/.", "keywords": "Reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Seohong Park;Jongwook Choi;Jaekyeom Kim;Honglak Lee;Gunhee Kim", "authorids": "~Seohong_Park1;~Jongwook_Choi1;~Jaekyeom_Kim1;~Honglak_Lee2;~Gunhee_Kim1", "gender": ";M;M;M;M", "homepage": "https://seohong.me/;https://wook.kr;https://jaekyeom.github.io/;http://vision.snu.ac.kr/gunhee/;http://web.eecs.umich.edu/~honglak", "dblp": "227/6308;131/0227;228/6696;45/115;58/2562", "google_scholar": ";UX-H08cAAAAJ;8PR-AaoAAAAJ;https://scholar.google.co.kr/citations?user=CiSdOV0AAAAJ;fmSHtE8AAAAJ", "orcid": ";;;0000-0002-9543-7453;", "linkedin": ";;jaekyeom-kim-14157428;;", "or_profile": "~Seohong_Park1;~Jongwook_Choi1;~Jaekyeom_Kim1;~Gunhee_Kim1;~Honglak_Lee1", "aff": "Seoul National University;University of Michigan;Seoul National University;Seoul National University;University of Michigan", "aff_domain": "snu.ac.kr;umich.edu;snu.ac.kr;snu.ac.kr;umich.edu", "position": "Undergrad student;PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\npark2022lipschitzconstrained,\ntitle={Lipschitz-constrained Unsupervised Skill Discovery},\nauthor={Seohong Park and Jongwook Choi and Jaekyeom Kim and Honglak Lee and Gunhee Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BGvt0ghNgA}\n}", "github": "", "project": "", "reviewers": "8wK8;QUmk;an2L;wpgt", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;3;3", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "4;4;3;4", "wc_summary_paper": "102;96;102;280", "wc_summary_review": "79;45;37;29", "wc_main_review": "366;290;241;216", "wc_review": "547;431;380;525", "wc_reply_reviewers": "413;0;0;0", "wc_reply_authors": "2773;392;555;437", "reply_reviewers": "2;0;0;0", "reply_authors": "6;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 
3.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 145.0, 77.9807668595276 ], "wc_summary_review_avg": [ 47.5, 19.04599695474091 ], "wc_main_review_avg": [ 278.25, 57.229253882957444 ], "wc_review_avg": [ 470.75, 68.14093850248909 ], "wc_reply_reviewers_avg": [ 103.25, 178.83424588148657 ], "wc_reply_authors_avg": [ 1039.25, 1002.7493143852057 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 2.165063509461097 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10843211345776336790&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=BGvt0ghNgA", "email": "snu.ac.kr;umich.edu;snu.ac.kr;snu.ac.kr;umich.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "Seoul National University;University of Michigan", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.umich.edu", "aff_unique_abbr": "SNU;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "South Korea;United States" }, { "id": "BIpTWmO_BY", "title": "Sleeper Agent: Scalable Hidden Trigger Backdoors for Neural Networks Trained from Scratch", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As the curation of data for machine learning becomes increasingly automated, dataset tampering is a mounting threat. Backdoor attackers tamper with training data to embed a vulnerability in models that are trained on that data. This vulnerability is then activated at inference time by placing a \"trigger'' into the model's input. Typical backdoor attacks insert the trigger directly into the training data, although the presence of such an attack may be visible upon inspection. In contrast, the Hidden Trigger Backdoor Attack achieves poisoning without placing a trigger into the training data at all. However, this hidden trigger attack is ineffective at poisoning neural networks trained from scratch. We develop a new hidden trigger attack, Sleeper Agent, which employs gradient matching, data selection, and target model re-training during the crafting process. Sleeper Agent is the first hidden trigger backdoor attack to be effective against neural networks trained from scratch. 
We demonstrate its effectiveness on ImageNet and in black-box settings.", "keywords": "Backdoor attacks;data poisoning;clean labels;adversarial examples;security", "primary_area": "", "supplementary_material": "/attachment/f3597c82013d3f71d247a8a570e5405c5b464e12.zip", "author": "Hossein Souri;Liam H Fowl;Rama Chellappa;Micah Goldblum;Tom Goldstein", "authorids": "~Hossein_Souri1;~Liam_H_Fowl1;~Rama_Chellappa1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "M;;;;M", "homepage": "https://hsouri.github.io/;;;;https://www.cs.umd.edu/~tomg/", "dblp": "250/2286;241/6940;;241/7231;25/8184", "google_scholar": "rurbhy0AAAAJ;IXv3ToAAAAAJ;;pGDKzuUAAAAJ;KmSuVtgAAAAJ", "orcid": "0000-0001-5264-798X;;;;", "linkedin": "hossein-souri-b7574795/;;;;", "or_profile": "~Hossein_Souri1;~Liam_H_Fowl1;~Rama_Chellappa1;~Micah_Goldblum1;~Tom_Goldstein1", "aff": "Johns Hopkins University;University of Maryland, College Park;;New York University;University of Maryland, College Park", "aff_domain": "jhu.edu;umd.edu;;nyu.edu;umd.edu", "position": "PhD student;PhD student;;Postdoc;Associate Professor", "bibtex": "@misc{\nsouri2022sleeper,\ntitle={Sleeper Agent: Scalable Hidden Trigger Backdoors for Neural Networks Trained from Scratch},\nauthor={Hossein Souri and Liam H Fowl and Rama Chellappa and Micah Goldblum and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=BIpTWmO_BY}\n}", "github": "", "project": "", "reviewers": "ebjw;Mdau;2hGu;Cnrx", "site": "https://openreview.net/forum?id=BIpTWmO_BY", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;3;4;4", "correctness": "2;3;4;3", "technical_novelty": "3;2;2;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "129;41;90;66", "wc_summary_review": "15;43;83;37", "wc_main_review": "611;228;1018;354", "wc_review": "755;312;1191;457", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "354;246;667;187", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 32.438403166617185 ], "wc_summary_review_avg": [ 44.5, 24.550967394381836 ], "wc_main_review_avg": [ 552.75, 301.99948261545086 ], "wc_review_avg": [ 678.75, 336.1148427249234 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 363.5, 185.17626737786892 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 155, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17766312751155997039&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Johns Hopkins University;University of Maryland;New York University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.jhu.edu;https://www.umd.edu;https://www.nyu.edu", "aff_unique_abbr": "JHU;UMD;NYU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "BJ-NSus8wXk", "title": "Towards Unknown-aware Deep Q-Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep reinforcement learning (RL) has achieved remarkable success in known environments where the agents are trained, yet the agents do not necessarily know what they
don\u2019t know. In particular, RL agents deployed in the open world are naturally subject to environmental shifts and encounter unknown out-of-distribution (OOD) states---i.e., states from outside the training environment. Currently, the study of handling OOD states in the RL environment remains underexplored. This paper bridges this critical gap by proposing and exploring an unknown-aware RL framework, which improves the safety and reliability of deep Q-learning. Our key idea is to regularize the training of Q-learning so that OOD states will have higher OOD uncertainty, while in-distribution states will have lower OOD uncertainty; therefore making them distinguishable. This is in contrast with vanilla Q-learning which does not take into account unknowns during training. Furthermore, we provide theoretical guarantees that our method can improve OOD uncertainty estimation while ensuring the convergence performance of the in-distribution environment. Empirically, we demonstrate state-of-the-art performance on six diverse environments, achieving near-optimal OOD detection performance. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ying Fan;Sharon Li", "authorids": "~Ying_Fan2;~Sharon_Li1", "gender": ";F", "homepage": "https://yingfan-bot.github.io/;http://pages.cs.wisc.edu/~sharonli/", "dblp": ";144/6087-1", "google_scholar": "1aj4dZcAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": "ying-fan-5b7b07147/;liyixuan", "or_profile": "~Ying_Fan2;~Yixuan_Li1", "aff": "University of Wisconsin-Madison;Cornell University", "aff_domain": "cs.wisc.edu;cornell.edu", "position": "Graduate student;Graduate Student", "bibtex": "@misc{\nfan2022towards,\ntitle={Towards Unknown-aware Deep Q-Learning},\nauthor={Ying Fan and Sharon Li},\nyear={2022},\nurl={https://openreview.net/forum?id=BJ-NSus8wXk}\n}", "github": "", "project": "", "reviewers": "2EQA;s6L8;KbRi;udTy", "site": "https://openreview.net/forum?id=BJ-NSus8wXk", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;5;4;3", "correctness": "4;4;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;1;3;3", "wc_summary_paper": "47;79;79;68", "wc_summary_review": "24;41;110;106", "wc_main_review": "373;588;874;810", "wc_review": "444;708;1063;984", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.0 ], "wc_summary_paper_avg": [ 68.25, 13.06474263045392 ], "wc_summary_review_avg": [ 70.25, 38.25163395202877 ], "wc_main_review_avg": [ 661.25, 197.38461819503564 ], "wc_review_avg": [ 799.75, 244.04136432170674 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": -0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BgpNzL48S0kJ:scholar.google.com/&scioq=Towards+Unknown-aware+Deep+Q-Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.cornell.edu", 
"aff_unique_abbr": "UW-Madison;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "No One Representation to Rule Them All: Overlapping Features of Training Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6754", "id": "BK-4qbGgIE3", "poster": "", "openreview": "https://openreview.net/forum?id=BK-4qbGgIE3", "slides": "https://iclr.cc/virtual/2022/poster/6754", "video": "https://iclr.cc/virtual/2022/poster/6754", "author_site": "Raphael Gontijo Lopes, Yann Dauphin, Ekin Cubuk", "tldr": "", "abstract": "Despite being able to capture a range of features of the data, high accuracy models trained with supervision tend to make similar predictions. This seemingly implies that high-performing models share similar biases regardless of training methodology, which would limit ensembling benefits and render low-accuracy models as having little practical use. Against this backdrop, recent work has developed quite different training techniques, such as large-scale contrastive learning, yielding competitively high accuracy on generalization and robustness benchmarks. This motivates us to revisit the assumption that models necessarily learn similar functions. We conduct a large-scale empirical study of models across hyper-parameters, architectures, frameworks, and datasets. We find that model pairs that diverge more in training methodology display categorically different generalization behavior, producing increasingly uncorrelated errors. We show these models specialize in subdomains of the data, leading to higher ensemble performance: with just 2 models (each with ImageNet accuracy \\~76.5\\%), we can create ensembles with 83.4\\% (+7\\% boost). Surprisingly, we find that even significantly low-accuracy models can be used to improve high-accuracy models. 
Finally, we show that diverging training methodologies yield representations that capture overlapping (but not supersetting) feature sets which, when combined, lead to increased downstream performance.", "keywords": "Representation Learning;Understanding Deep Learning;Deep Phenomena;Diversity;Novelty;Features;Training Methodologies;Contrastive Learning", "primary_area": "", "supplementary_material": "", "author": "Raphael Gontijo-Lopes;Yann Dauphin;Ekin Dogus Cubuk", "authorids": "~Raphael_Gontijo-Lopes1;~Yann_Dauphin1;~Ekin_Dogus_Cubuk1", "gender": "M;M;M", "homepage": "https://www.dauphin.io;;https://raphagl.com", "dblp": "22/9988;83/7734;", "google_scholar": "XSforroAAAAJ;Mu_8iOEAAAAJ;-wpZQY0AAAAJ", "orcid": ";;", "linkedin": ";ekin-dogus-cubuk-9148b8114/;raphaelgontijolopes/", "or_profile": "~Yann_Dauphin1;~Ekin_Dogus_Cubuk1;~Raphael_Gontijo_Lopes1", "aff": "Google;Google;Google Brain", "aff_domain": "google.com;google.com;google.com", "position": "Researcher;Staff Research Scientist;Research Associate", "bibtex": "@inproceedings{\ngontijo-lopes2022no,\ntitle={No One Representation to Rule Them All: Overlapping Features of Training Methods},\nauthor={Raphael Gontijo-Lopes and Yann Dauphin and Ekin Dogus Cubuk},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BK-4qbGgIE3}\n}", "github": "", "project": "", "reviewers": "7RNN;yy7C;ZeBH;iemd", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;3", "correctness": "4;4;3;3", "technical_novelty": "3;2;4;4", "empirical_novelty": "3;4;4;4", "wc_summary_paper": "89;62;61;164", "wc_summary_review": "23;46;28;60", "wc_main_review": "598;235;481;324", "wc_review": "710;343;570;548", "wc_reply_reviewers": "224;124;84;0", "wc_reply_authors": "697;473;1017;640", "reply_reviewers": "1;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 94.0, 41.94639436232869 ], "wc_summary_review_avg": [ 39.25, 14.720309100015529 ], "wc_main_review_avg": [ 409.5, 140.00446421453853 ], "wc_review_avg": [ 542.75, 130.9988072464784 ], "wc_reply_reviewers_avg": [ 108.0, 80.54812226240907 ], "wc_reply_authors_avg": [ 706.75, 197.13241108452968 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2831227418630330599&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=BK-4qbGgIE3", "email": "google.com;google.com;google.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "BKOiqcdpml3", "title": "Low Entropy Deep Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The movement of data between processes and memory, not arithmetic operations, dominates the energy costs of deep learning inference calculations.
This work focuses on reducing these data movement costs by reducing the number of unique weights in a network. The thinking goes that if the number of unique weights is kept small enough, then the entire network can be distributed and stored on processing elements (PEs) within accelerator designs, and the data movement costs for weight reads substantially reduced. To this end, we investigate the merits of a method, which we call Weight Fixing Networks (WFN). We design the approach to realise four model outcome objectives: i) very few unique weights, ii) low-entropy weight encodings, iii) unique weight values which are amenable to energy-saving versions of hardware multiplication, and iv) lossless task-performance. Some of these goals are conflicting. To best balance these conflicts, we combine a few novel (and some well-trodden) tricks: a novel regularisation term (i, ii), a view of clustering cost as relative distance change (i, ii, iv), and a focus on whole-network re-use of weights (i, iii). Our ImageNet experiments demonstrate lossless compression using 56x fewer unique weights and a 1.9x lower weight-space entropy than SOTA quantisation approaches. ", "keywords": "Quantisation;Compression;AI Accelerators", "primary_area": "", "supplementary_material": "", "author": "Chris Subia-Waud;Srinandan Dasmahapatra", "authorids": "~Chris_Subia-Waud1;sd@ecs.soton.ac.uk", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": "https://uk.linkedin.com/in/chris-subia-waud;", "or_profile": "~Chris_Subia-Waud1;sd@ecs.soton.ac.uk", "aff": "University of Southampton;", "aff_domain": "soton.ac.uk;", "position": "PhD student;", "bibtex": "@misc{\nsubia-waud2022low,\ntitle={Low Entropy Deep Networks},\nauthor={Chris Subia-Waud and Srinandan Dasmahapatra},\nyear={2022},\nurl={https://openreview.net/forum?id=BKOiqcdpml3}\n}", "github": "", "project": "", "reviewers": "vtcG;kvqy;wcJb;geXX", "site": "https://openreview.net/forum?id=BKOiqcdpml3", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;3;3;4", "correctness": "3;2;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "84;68;86;76", "wc_summary_review": "26;38;80;37", "wc_main_review": "137;302;383;175", "wc_review": "247;408;549;288", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.5, 7.123903424387503 ], "wc_summary_review_avg": [ 45.25, 20.60794749605113 ], "wc_main_review_avg": [ 249.25, 98.4692210794825 ], "wc_review_avg": [ 373.0, 117.58188636010225 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sSIGCtfxOMoJ:scholar.google.com/&scioq=Low+Entropy+Deep+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0,
"aff_unique_index": "0", "aff_unique_norm": "University of Southampton", "aff_unique_dep": "", "aff_unique_url": "https://www.southampton.ac.uk", "aff_unique_abbr": "Southampton", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "BKmoW5K4sS", "title": "On Adversarial Bias and the Robustness of Fair Machine Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimizing prediction accuracy can come at the expense of fairness. Towards minimizing discrimination against a group, fair machine learning algorithms strive to equalize the error of a model across different groups, through imposing fairness constraints on the learning algorithm. But, are decisions made by fair models trustworthy? How sensitive are fair models to changes in their training data? By giving equal importance to groups of different sizes and distributions in the training set, we show that fair models become more fragile to outliers. We study the trade-off between fairness and robustness, by analyzing the adversarial (worst-case) bias against group fairness in machine learning and by comparing it with the effect of similar adversarial manipulations on regular models. We show that the adversarial bias introduced in training data, via the sampling or labeling processes, can significantly reduce the test accuracy on fair models, compared with regular models. Our results demonstrate that adversarial bias can also worsen a model's fairness gap on test data, even though the model satisfies the fairness constraint on training data. We analyze the robustness of multiple fair machine learning algorithms that satisfy equalized odds (and equal opportunity) notion of fairness.", "keywords": "Robustness;Algorithmic fairness", "primary_area": "", "supplementary_material": "", "author": "Hongyan Chang;Ta Duy Nguyen;Sasi Kumar Murakonda;Ehsan Kazemi;Reza Shokri", "authorids": "~Hongyan_Chang1;~Ta_Duy_Nguyen1;~Sasi_Kumar_Murakonda1;~Ehsan_Kazemi4;~Reza_Shokri1", "gender": "F;;;M;", "homepage": "https://www.comp.nus.edu.sg/~hongyan/;https://nguyentaduy.github.io/;;https://sites.google.com/view/ekazemi;", "dblp": "152/5447.html;;241/9846.html;https://dblp.org/pers/k/Kazemi_0001:Ehsan.html;", "google_scholar": "5d1AHgIAAAAJ;;;kdyalCwAAAAJ;", "orcid": ";;;;", "linkedin": ";;;ehsankazemi/;", "or_profile": "~Hongyan_Chang1;~Ta_Duy_Nguyen1;~Sasi_Kumar_Murakonda1;~Ehsan_Kazemi4;~Reza_Shokri1", "aff": "National University of Singapore;Boston University;Privitar;Google;", "aff_domain": "nus.edu.sg;bu.edu;privitar.com;google.com;", "position": "PhD student;PhD student;Researcher;Researcher;", "bibtex": "@misc{\nchang2022on,\ntitle={On Adversarial Bias and the Robustness of Fair Machine Learning},\nauthor={Hongyan Chang and Ta Duy Nguyen and Sasi Kumar Murakonda and Ehsan Kazemi and Reza Shokri},\nyear={2022},\nurl={https://openreview.net/forum?id=BKmoW5K4sS}\n}", "github": "", "project": "", "reviewers": "bW76;tNTd;oMrk;iCR1", "site": "https://openreview.net/forum?id=BKmoW5K4sS", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;4;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;1;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "139;61;52;46", "wc_summary_review": "68;29;32;60", "wc_main_review": "374;367;167;209", "wc_review": "581;457;251;315", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], 
"correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 74.5, 37.61980861195336 ], "wc_summary_review_avg": [ 47.25, 17.020208576865326 ], "wc_main_review_avg": [ 279.25, 92.4834444644013 ], "wc_review_avg": [ 401.0, 127.89839717525783 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.34554737023254406, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2413206652262271751&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "National University of Singapore;Boston University;Privitar;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.nus.edu.sg;https://www.bu.edu;https://www.privitar.com;https://www.google.com", "aff_unique_abbr": "NUS;BU;;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "Singapore;United States;United Kingdom" }, { "id": "BM7RjuhAK7W", "title": "Model-Invariant State Abstractions for Model-Based Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Accuracy and generalization of dynamics models is key to the success of model-based reinforcement learning (MBRL). As the complexity of tasks increases, learning accurate dynamics models becomes increasingly sample inefficient. However, many complex tasks also exhibit sparsity in dynamics, i.e., actions have only a local effect on the system dynamics. In this paper, we exploit this property with a causal invariance perspective in the single-task setting, introducing a new type of state abstraction called \\textit{model-invariance}. Unlike previous forms of state abstractions, a model-invariance state abstraction leverages causal sparsity over state variables. This allows for compositional generalization to unseen states, something that non-factored forms of state abstractions cannot do. We prove that an optimal policy can be learned over this model-invariance state abstraction and show improved generalization in a simple toy domain. Next, we propose a practical method to approximately learn a model-invariant representation for complex domains and validate our approach by showing improved modelling performance over standard maximum likelihood approaches on challenging tasks, such as the MuJoCo-based Humanoid. Finally, within the MBRL setting we show strong performance gains with respect to sample efficiency across a host of continuous control tasks. ", "keywords": "Reinforcement Learning;Model-based RL;State Abstractions;Generalization in RL", "primary_area": "", "supplementary_material": "/attachment/f2afcdd43e2b6b6e2db3f2741d2139e1b81851ef.zip", "author": "Manan Tomar;Amy Zhang;Roberto Calandra;Matthew E. 
Taylor;Joelle Pineau", "authorids": "~Manan_Tomar1;~Amy_Zhang1;~Roberto_Calandra1;~Matthew_E._Taylor2;~Joelle_Pineau1", "gender": "M;M;F;F;M", "homepage": "https://manantomar.github.io/;https://www.robertocalandra.com;http://www.cs.mcgill.ca/~jpineau;;https://irll.ca", "dblp": "241/6227;118/8239;p/JoellePineau;43/2754;46/4287.html", "google_scholar": ";FdE3LOEAAAAJ;https://scholar.google.ca/citations?user=CEt6_mMAAAAJ;;edQgLXcAAAAJ", "orcid": ";0000-0001-9430-8433;;;0000-0001-8946-0211", "linkedin": ";rcalandra;;;", "or_profile": "~Manan_Tomar1;~Roberto_Calandra1;~Joelle_Pineau1;~Amy_Zhang2;~Matthew_Taylor1", "aff": "Microsoft;Meta Facebook;Meta Facebook;University of California, Berkeley;Washington State University, Pullman", "aff_domain": "microsoft.com;fb.com;fb.com;berkeley.edu;wsu.edu", "position": "Intern;Research Scientist;Researcher Manager;Postdoc;Adjunct Professor", "bibtex": "@misc{\ntomar2022modelinvariant,\ntitle={Model-Invariant State Abstractions for Model-Based Reinforcement Learning},\nauthor={Manan Tomar and Amy Zhang and Roberto Calandra and Matthew E. Taylor and Joelle Pineau},\nyear={2022},\nurl={https://openreview.net/forum?id=BM7RjuhAK7W}\n}", "github": "", "project": "", "reviewers": "5dqM;hNYg;avfm;9B8W", "site": "https://openreview.net/forum?id=BM7RjuhAK7W", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;4;4", "correctness": "4;2;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "91;176;62;42", "wc_summary_review": "52;76;54;54", "wc_main_review": "635;842;521;316", "wc_review": "778;1094;637;412", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 92.75, 51.12423593561081 ], "wc_summary_review_avg": [ 59.0, 9.848857801796104 ], "wc_main_review_avg": [ 578.5, 190.28728281206813 ], "wc_review_avg": [ 730.25, 247.27148541633343 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5070637192183186264&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Microsoft;Meta;University of California, Berkeley;Washington State University", "aff_unique_dep": "Microsoft Corporation;Meta Platforms, Inc.;;", "aff_unique_url": "https://www.microsoft.com;https://meta.com;https://www.berkeley.edu;https://wsu.edu", "aff_unique_abbr": "Microsoft;Meta;UC Berkeley;WSU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Pullman", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "BNIt2myzSzS", "title": "IA-MARL: Imputation Assisted Multi-Agent Reinforcement Learning for Missing Training Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, multi-agent reinforcement learning (MARL) adopts the centralized training with decentralized execution (CTDE) framework that trains agents using the data from all agents at a centralized server while each agent takes an action from its 
observation. In the real world, however, the training data from some agents can be unavailable at the centralized server due to practical reasons including communication failures and security attacks (e.g., data modification), which can slow down training and harm performance. Therefore, we consider the missing training data problem in MARL and propose imputation-assisted multi-agent reinforcement learning (IA-MARL). IA-MARL consists of two steps: 1) the imputation of missing training data, which uses generative adversarial imputation networks (GAIN), and 2) the mask-based update of the networks, which trains each agent using only the training data of the corresponding agent that has not been missed over consecutive steps. In the experiments, we explore the effects of the data missing probability, the number of agents, and the number of pre-training episodes for GAIN on the performance of IA-MARL. We show that IA-MARL outperforms a decentralized approach and can even achieve the performance of MARL without missing training data when the imputation is sufficiently accurate. Our ablation study also shows that both the mask-based update and the imputation accuracy play important roles in achieving the high performance of IA-MARL. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/05d6eafc2317daa95257c8faa231f9569af78924.zip", "author": "Dongsun Kim;Sinwoong Yun;Jemin Lee;Eunbyung Park", "authorids": "~Dongsun_Kim1;~Sinwoong_Yun1;~Jemin_Lee2;~Eunbyung_Park1", "gender": "M;;F;M", "homepage": ";;;https://silverbottlep.github.io/", "dblp": ";;;92/9727", "google_scholar": ";;Mp8mflYAAAAJ;iPyuJmQAAAAJ", "orcid": "0000-0003-2733-0301;0000-0001-6382-6713;;", "linkedin": ";;;eunbyung-park-286384b4/", "or_profile": "~Dongsun_Kim1;~Sinwoong_Yun1;~Jemin_Lee2;~Eunbyung_Park1", "aff": "DGIST;DGIST;SKKU;Microsoft", "aff_domain": "dgist.ac.kr;dgist.ac.kr;skku.edu;microsoft.com", "position": "PhD student;PhD student;Associate Professor;Applied Scientist", "bibtex": "@misc{\nkim2022iamarl,\ntitle={{IA}-{MARL}: Imputation Assisted Multi-Agent Reinforcement Learning for Missing Training Data},\nauthor={Dongsun Kim and Sinwoong Yun and Jemin Lee and Eunbyung Park},\nyear={2022},\nurl={https://openreview.net/forum?id=BNIt2myzSzS}\n}", "github": "", "project": "", "reviewers": "vfgR;SVEY;oFfr;XnkH", "site": "https://openreview.net/forum?id=BNIt2myzSzS", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;4;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "52;36;53;68", "wc_summary_review": "49;40;51;60", "wc_main_review": "633;375;264;603", "wc_review": "734;451;368;731", "wc_reply_reviewers": "524;0;0;0", "wc_reply_authors": "411;495;282;675", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 52.25, 11.321991874224253 ], "wc_summary_review_avg": [ 50.0, 7.106335201775948 ], "wc_main_review_avg": [ 468.75, 154.6873863635946 ], "wc_review_avg": [ 571.0, 164.1477992542087 ], "wc_reply_reviewers_avg": [ 131.0, 226.8986557915229 ], "wc_reply_authors_avg": [ 465.75, 142.65583584277232 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ],
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9233314262118260519&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Daegu Gyeongbuk Institute of Science and Technology;Seoul National University;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.dgist.ac.kr;https://www.snu.ac.kr;https://www.microsoft.com", "aff_unique_abbr": "DGIST;SNU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "South Korea;United States" }, { "title": "LOSSY COMPRESSION WITH DISTRIBUTION SHIFT AS ENTROPY CONSTRAINED OPTIMAL TRANSPORT", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6136", "id": "BRFWxcZfAdC", "poster": "", "openreview": "https://openreview.net/forum?id=BRFWxcZfAdC", "slides": "https://iclr.cc/virtual/2022/poster/6136", "video": "https://iclr.cc/virtual/2022/poster/6136", "author_site": "Huan Liu, George Zhang, Jun Chen, Ashish Khisti", "tldr": "", "abstract": "We study an extension of lossy compression where the reconstruction distribution is different from the source distribution in order to account for distributional shift due to processing. We formulate this as a generalization of optimal transport with an entropy bottleneck to account for the rate constraint due to compression. We provide expressions for the tradeoff between compression rate and the achievable distortion with and without shared common randomness between the encoder and decoder. We study the examples of binary, uniform and Gaussian sources (in an asymptotic setting) in detail and demonstrate that shared randomness can strictly improve the tradeoff. For the case without common randomness and squared-Euclidean distortion, we show that the optimal solution partially decouples into the problem of optimal compression and transport and also characterize the penalty associated with fully decoupling them. 
We provide experimental results by training end-to-end deep learning compression systems to perform denoising on SVHN and super-resolution on MNIST, suggesting consistency with our theoretical results.", "keywords": "Image Compression;Image Restoration;Optimal Transport;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Huan Liu;George Zhang;Jun Chen;Ashish J Khisti", "authorids": "~Huan_Liu4;~George_Zhang1;~Jun_Chen8;~Ashish_J_Khisti1", "gender": "M;;M;M", "homepage": ";https://Maytide.github.io/;https://www.ece.mcmaster.ca/~junchen/;https://www.comm.utoronto.ca/~akhisti/", "dblp": ";;85/5901-5.html;84/5679.html", "google_scholar": "6QbMAhkAAAAJ;https://scholar.google.ca/citations?user=iHcxnFsAAAAJ;https://scholar.google.ca/citations?user=XI79Mw0AAAAJ;https://scholar.google.ca/citations?user=jiGeAg4AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Huan_Liu4;~George_Zhang1;~Jun_Chen8;~Ashish_J_Khisti1", "aff": "McMaster University;University of Toronto;McMaster University;Toronto University", "aff_domain": "mcmaster.ca;utoronto.ca;mcmaster.ca;utoronto.ca", "position": "PhD student;MS student;Full Professor;Professor", "bibtex": "@inproceedings{\nliu2022lossy,\ntitle={{LOSSY} {COMPRESSION} {WITH} {DISTRIBUTION} {SHIFT} {AS} {ENTROPY} {CONSTRAINED} {OPTIMAL} {TRANSPORT}},\nauthor={Huan Liu and George Zhang and Jun Chen and Ashish J Khisti},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BRFWxcZfAdC}\n}", "github": "", "project": "", "reviewers": "LfvG;mTA9;SKN1;GKce;YvyS", "pdf_size": 0, "recommendation": "3;6;6;8;8", "confidence": "4;2;3;3;3", "correctness": "1;4;4;4;4", "technical_novelty": "1;3;3;3;3", "empirical_novelty": "1;3;3;3;3", "wc_summary_paper": "23;148;52;105;74", "wc_summary_review": "27;35;23;37;26", "wc_main_review": "223;212;157;931;81", "wc_review": "273;395;232;1073;181", "wc_reply_reviewers": "218;0;0;0;0", "wc_reply_authors": "703;1255;296;823;125", "reply_reviewers": "1;0;0;0;0", "reply_authors": "3;3;1;2;1", "recommendation_avg": [ 6.2, 1.8330302779823362 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 3.4, 1.2000000000000002 ], "technical_novelty_avg": [ 2.6, 0.8000000000000002 ], "empirical_novelty_avg": [ 2.6, 0.8000000000000002 ], "wc_summary_paper_avg": [ 80.4, 43.167580427909094 ], "wc_summary_review_avg": [ 29.6, 5.425863986500215 ], "wc_main_review_avg": [ 320.8, 309.2186281581367 ], "wc_review_avg": [ 430.8, 328.7931872773523 ], "wc_reply_reviewers_avg": [ 43.6, 87.19999999999999 ], "wc_reply_authors_avg": [ 640.4, 399.8107552330227 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 2.0, 0.8944271909999159 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5175491695067655, "corr_recommendation_correctness": 0.8728715609439694, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5464977269991302093&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=BRFWxcZfAdC", "email": "mcmaster.ca;utoronto.ca;mcmaster.ca;utoronto.ca", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "McMaster University;University of Toronto", "aff_unique_dep": ";", "aff_unique_url": "https://www.mcmaster.ca;https://www.utoronto.ca", "aff_unique_abbr": "McMaster;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, {
"title": "GNN-LM: Language Modeling based on Global Contexts via GNN", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6501", "id": "BS49l-B5Bql", "poster": "", "openreview": "https://openreview.net/forum?id=BS49l-B5Bql", "slides": "https://iclr.cc/virtual/2022/poster/6501", "video": "https://iclr.cc/virtual/2022/poster/6501", "author_site": "Yuxian Meng, Shi Zong, Xiaoya Li, Xiaofei Sun, Tianwei Zhang, Fei Wu, Jiwei Li", "tldr": "", "abstract": "Inspired by the notion that \"it to copy is easier than to memorize\", in this work, we introduce GNN-LM, which extends vanilla neural language model (LM) by allowing to reference similar contexts in the entire training corpus. We build a directed heterogeneous graph between an input context and its semantically related neighbors selected from the training corpus, where nodes are tokens in the input context and retrieved neighbor contexts, and edges represent connections between nodes. Graph neural networks (GNNs) are constructed upon the graph to aggregate information from similar contexts to decode the token. This learning paradigm provides direct access to the reference contexts and helps improve a model's generalization ability. We conduct comprehensive experiments to validate the effectiveness of the GNN-LM: GNN-LM achieves a new state-of-the-art perplexity of 14.8 on WikiText-103 (a 3.9 point improvement over its counterpart of the vanilla LM model), and shows substantial improvement on One Billion Word and Enwiki8 datasets against strong baselines. In-depth ablation studies are performed to understand the mechanics of GNN-LM. The code can be found at https://github.com/ShannonAI/GNN-LM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxian Meng;Shi Zong;Xiaoya Li;Xiaofei Sun;Tianwei Zhang;Fei Wu;Jiwei Li", "authorids": "~Yuxian_Meng1;~Shi_Zong1;~Xiaoya_Li2;~Xiaofei_Sun1;~Tianwei_Zhang1;~Fei_Wu1;~Jiwei_Li1", "gender": "M;;;M;M;M;M", "homepage": "https://yuxianmeng.github.io/;;;;https://personal.ntu.edu.sg/tianwei.zhang/index.html;https://person.zju.edu.cn/wufei;https://nlp.stanford.edu/~bdlijiwei/", "dblp": "234/8585;;;;77/7902-4;84/3254-1;73/5746-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;hIokU_IAAAAJ;9vpiYDIAAAAJ;XJLn4MYAAAAJ;PwU16JEAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Yuxian_Meng1;~Shi_Zong1;~Xiaoya_Li2;~Xiaofei_Sun1;~Tianwei_Zhang1;~Fei_Wu1;~Jiwei_Li1", "aff": "Shannon.AI;;;;Nanyang Technological University;Zhejiang University;Zhejiang University", "aff_domain": "shannon.ai;;;;ntu.edu.sg;zju.edu.cn;zju.edu.cn", "position": "Researcher;;;;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nmeng2022gnnlm,\ntitle={{GNN}-{LM}: Language Modeling based on Global Contexts via {GNN}},\nauthor={Yuxian Meng and Shi Zong and Xiaoya Li and Xiaofei Sun and Tianwei Zhang and Fei Wu and Jiwei Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BS49l-B5Bql}\n}", "github": "", "project": "", "reviewers": "BmDz;54po;eytL", "pdf_size": 0, "recommendation": "6;8;10", "confidence": "3;3;4", "correctness": "3;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;4;3", "wc_summary_paper": "59;68;64", "wc_summary_review": "52;76;29", "wc_main_review": "334;114;133", "wc_review": "445;258;226", "wc_reply_reviewers": "0;0;12", "wc_reply_authors": "750;329;258", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", 
"recommendation_avg": [ 8.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 63.666666666666664, 3.6817870057290873 ], "wc_summary_review_avg": [ 52.333333333333336, 19.189117286165672 ], "wc_main_review_avg": [ 193.66666666666666, 99.53335565974298 ], "wc_review_avg": [ 309.6666666666667, 96.58272217235452 ], "wc_reply_reviewers_avg": [ 4.0, 5.656854249492381 ], "wc_reply_authors_avg": [ 445.6666666666667, 217.13948410078615 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7267447337261309550&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=BS49l-B5Bql", "email": "shannon.ai;;;;ntu.edu.sg;zju.edu.cn;zju.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Shannon.AI;Nanyang Technological University;Zhejiang University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.shannon.ai;https://www.ntu.edu.sg;https://www.zju.edu.cn", "aff_unique_abbr": "Shannon.AI;NTU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "United States;Singapore;China" }, { "id": "BZbUtxOy3R", "title": "Character Generation through Self-Supervised Vectorization", "track": "main", "status": "Reject", "tldr": "", "abstract": "The prevalent approach in self-supervised image generation is to operate on pixel level representations. While this approach can produce high quality images, it cannot benefit from the simplicity and innate quality of vectorization. Here we present a drawing agent that operates on stroke-level representation of images. At each time step, the agent first assesses the current canvas and decides whether to stop or keep drawing. When a `draw\u2019 decision is made, the agent outputs a program indicating the stroke to be drawn. As a result, it produces a final raster image by drawing the strokes on a canvas, using a minimal number of strokes and dynamically deciding when to stop. We train our agent through reinforcement learning on MNIST and Omniglot datasets for unconditional generation and parsing (reconstruction) tasks. We utilize our parsing agent for exemplar generation and type conditioned concept generation in Omniglot challenge without any further training. We present successful results on all three generation tasks and the parsing task. Crucially, we do not need any stroke-level or vector supervision; we only use raster images for training. Code will be made available upon acceptance. 
\n", "keywords": "character generation;parsing;reconstruction;self-supervised;omniglot", "primary_area": "", "supplementary_material": "", "author": "Gokcen Gokceoglu;Emre Akbas", "authorids": "~Gokcen_Gokceoglu1;~Emre_Akbas1", "gender": "F;M", "homepage": ";http://user.ceng.metu.edu.tr/~emre/", "dblp": ";78/1103", "google_scholar": ";HeXAdnEAAAAJ", "orcid": ";0000-0002-3760-6722", "linkedin": "gokcen-gokceoglu-620772135/;", "or_profile": "~Gokcen_Gokceoglu1;~Emre_Akbas1", "aff": "METU;Middle East Technical University", "aff_domain": "metu.edu.tr;metu.edu.tr", "position": "MS student;Associate Professor", "bibtex": "@misc{\ngokceoglu2022character,\ntitle={Character Generation through Self-Supervised Vectorization},\nauthor={Gokcen Gokceoglu and Emre Akbas},\nyear={2022},\nurl={https://openreview.net/forum?id=BZbUtxOy3R}\n}", "github": "", "project": "", "reviewers": "c9oL;hg9W;aFpU", "site": "https://openreview.net/forum?id=BZbUtxOy3R", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;3;4", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "1;1;2", "wc_summary_paper": "67;71;67", "wc_summary_review": "50;36;48", "wc_main_review": "283;319;144", "wc_review": "400;426;259", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 68.33333333333333, 1.8856180831641267 ], "wc_summary_review_avg": [ 44.666666666666664, 6.182412330330469 ], "wc_main_review_avg": [ 248.66666666666666, 75.45565290656198 ], "wc_review_avg": [ 361.6666666666667, 73.36817354199906 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15940143651723776101&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Middle East Technical University", "aff_unique_dep": "", "aff_unique_url": "https://www.metu.edu.tr", "aff_unique_abbr": "METU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "T\u00fcrkiye" }, { "title": "Promoting Saliency From Depth: Deep Unsupervised RGB-D Saliency Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5929", "id": "BZnnMbt0pW", "poster": "", "openreview": "https://openreview.net/forum?id=BZnnMbt0pW", "slides": "https://iclr.cc/virtual/2022/poster/5929", "video": "https://iclr.cc/virtual/2022/poster/5929", "author_site": "Wei Ji, Jingjing Li, Qi Bi, chuan guo, Jie Liu, Li Cheng", "tldr": "", "abstract": "Growing interests in RGB-D salient object detection (RGB-D SOD) have been witnessed in recent years, owing partly to the popularity of depth sensors and the rapid progress of deep learning techniques. Unfortunately, existing RGB-D SOD methods typically demand large quantity of training images being thoroughly annotated at pixel-level. 
The laborious and time-consuming manual annotation has become a real bottleneck in various practical scenarios. On the other hand, current unsupervised RGB-D SOD methods still heavily rely on handcrafted feature representations. This inspires us to propose in this paper a deep unsupervised RGB-D saliency detection approach, which requires no manual pixel-level annotation during training. It is realized by two key ingredients in our training pipeline. First, a depth-disentangled saliency update (DSU) framework is designed to automatically produce pseudo-labels with iterative follow-up refinements, which provides more trustworthy supervision signals for training the saliency network. Second, an attentive training strategy is introduced to tackle the issue of noisy pseudo-labels, by properly re-weighting to highlight the more reliable pseudo-labels. Extensive experiments demonstrate the superior efficiency and effectiveness of our approach in tackling the challenging unsupervised RGB-D SOD scenarios. Moreover, our approach can also be adapted to work in the fully-supervised setting. Empirical studies show the incorporation of our approach gives rise to notable performance improvements in existing supervised RGB-D SOD models.", "keywords": "RGB-D saliency detection;salient object detection;deep learning;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Wei Ji;Jingjing Li;Qi Bi;chuan guo;Jie Liu;Li Cheng", "authorids": "~Wei_Ji2;~Jingjing_Li5;~Qi_Bi1;~chuan_guo2;~Jie_Liu8;~Li_Cheng1", "gender": ";F;;M;M;Not Specified", "homepage": ";;;https://ericguo5513.github.io/;https://github.com/Roudgers;https://www.ece.ualberta.ca/~lcheng5/", "dblp": ";;;147/5346-2;;13/4938-1", "google_scholar": ";1QYsOAUAAAAJ;;eCdqvJoAAAAJ;;https://scholar.google.ca/citations?user=9IRFiEQAAAAJ", "orcid": ";;;0000-0002-4539-0634;;0000-0003-3261-3533", "linkedin": ";;;chuan-guo-59b6a810a/;;", "or_profile": "~Wei_Ji2;~Jingjing_Li5;~Qi_Bi1;~chuan_guo2;~Jie_Liu8;~Li_Cheng1", "aff": ";University of Alberta;;University of Alberta;;University of Alberta", "aff_domain": ";ualberta.ca;;ualberta.ca;;ualberta.ca", "position": ";PhD student;;PhD student;;Full Professor", "bibtex": "@inproceedings{\nji2022promoting,\ntitle={Promoting Saliency From Depth: Deep Unsupervised {RGB}-D Saliency Detection},\nauthor={Wei Ji and Jingjing Li and Qi Bi and chuan guo and Jie Liu and Li Cheng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BZnnMbt0pW}\n}", "github": "", "project": "", "reviewers": "kVSL;WhBE;xcAY", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;4", "correctness": "2;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "0;3;3", "wc_summary_paper": "50;83;85", "wc_summary_review": "15;40;46", "wc_main_review": "309;189;352", "wc_review": "374;312;483", "wc_reply_reviewers": "183;0;0", "wc_reply_authors": "954;143;1472", "reply_reviewers": "2;0;0", "reply_authors": "2;1;3", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 72.66666666666667, 16.048537489614297 ], "wc_summary_review_avg": [ 33.666666666666664, 13.424687043734844 ], "wc_main_review_avg": [ 283.3333333333333, 68.97503574160405 ], "wc_review_avg": [ 389.6666666666667,
70.68396014812853 ], "wc_reply_reviewers_avg": [ 61.0, 86.2670273047588 ], "wc_reply_authors_avg": [ 856.3333333333334, 546.939566029821 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15400914580495629111&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=BZnnMbt0pW", "email": ";ualberta.ca;;ualberta.ca;;ualberta.ca", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Alberta", "aff_unique_dep": "", "aff_unique_url": "https://www.ualberta.ca", "aff_unique_abbr": "UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "Bc4fwa76mRp", "title": "Head2Toe: Utilizing Intermediate Representations for Better OOD Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transfer-learning methods aim to improve performance in a data-scarce target domain using a model pretrained on a data-rich source domain. A cost-efficient strategy, linear probing, involves freezing the source model and training a new classification head for the target domain. This strategy is outperformed by a more costly but state-of-the-art method---fine-tuning all parameters of the source model to the target domain---possibly because fine-tuning allows the model to leverage useful information from intermediate layers which is otherwise discarded by the later pretrained layers. We explore the hypothesis that these intermediate layers might be directly exploited by linear probing. We propose a method, Head-to-Toe probing (Head2Toe), that selects features from all layers of the source model to train a classification head for the target-domain. 
In evaluations on the VTAB, Head2Toe matches performance obtained with fine-tuning on average, but critically, for out-of-distribution transfer, Head2Toe outperforms fine-tuning.", "keywords": "efficient training;transfer learning;efficient transfer;fine tuning;computer vision;linear probe", "primary_area": "", "supplementary_material": "", "author": "Utku Evci;Vincent Dumoulin;Hugo Larochelle;Michael Curtis Mozer", "authorids": "~Utku_Evci1;~Vincent_Dumoulin1;~Hugo_Larochelle1;~Michael_Curtis_Mozer1", "gender": ";M;M;M", "homepage": "http://evcu.github.io;;https://mila.quebec/en/directory/hugo-larochelle;https://www.cs.colorado.edu/~mozer", "dblp": "179/8146;133/8606;86/3862.html;m/MichaelCMozer", "google_scholar": "8yGMMwcAAAAJ;https://scholar.google.ca/citations?user=mZfgLA4AAAAJ;https://scholar.google.ca/citations?user=U89FHq4AAAAJ;lmjR_qMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Utku_Evci1;~Vincent_Dumoulin1;~Hugo_Larochelle1;~Michael_Curtis_Mozer1", "aff": "Google;Google;Universit\u00e9 de Sherbrooke;Google DeepMind", "aff_domain": "google.com;google.com;usherbrooke.ca;google.com", "position": "Researcher;Research Scientist;Adjunct Professor;Research Scientist", "bibtex": "@misc{\nevci2022headtoe,\ntitle={Head2Toe: Utilizing Intermediate Representations for Better {OOD} Generalization},\nauthor={Utku Evci and Vincent Dumoulin and Hugo Larochelle and Michael Curtis Mozer},\nyear={2022},\nurl={https://openreview.net/forum?id=Bc4fwa76mRp}\n}", "github": "", "project": "", "reviewers": "VhK8;nxiA;kz8L;yqBZ", "site": "https://openreview.net/forum?id=Bc4fwa76mRp", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;5", "correctness": "3;2;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "140;84;136;61", "wc_summary_review": "12;51;65;45", "wc_main_review": "139;274;561;466", "wc_review": "291;409;762;572", "wc_reply_reviewers": "0;66;140;744", "wc_reply_authors": "92;581;1357;1782", "reply_reviewers": "0;1;3;3", "reply_authors": "1;3;6;6", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 105.25, 33.77406549410361 ], "wc_summary_review_avg": [ 43.25, 19.447043477094404 ], "wc_main_review_avg": [ 360.0, 164.22088783099426 ], "wc_review_avg": [ 508.5, 177.1306015345739 ], "wc_reply_reviewers_avg": [ 237.5, 296.5918913254373 ], "wc_reply_authors_avg": [ 953.0, 657.6781127572971 ], "reply_reviewers_avg": [ 1.75, 1.299038105676658 ], "reply_authors_avg": [ 4.0, 2.1213203435596424 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:f0YECdcN2mUJ:scholar.google.com/&scioq=Head2Toe:+Utilizing+Intermediate+Representations+for+Better+OOD+Generalization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Google;Universit\u00e9 de Sherbrooke", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.usherbrooke.ca", "aff_unique_abbr": "Google;UdeS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "United States;Canada;United Kingdom" }, { "id": "Bd8JSwLVWQ5", "title": "Equivalence of State 
Equations from Different Methods in High-dimensional Regression", "track": "main", "status": "Reject", "tldr": "", "abstract": "State equations were first introduced in approximate message passing (AMP) to describe the mean square error (MSE) in compressed sensing. Since then, state equations have appeared in studies of logistic regression, robust estimators, and other high-dimensional statistics problems. Recently, a convex Gaussian min-max theorem (CGMT) approach was proposed to study high-dimensional statistics problems, accompanied by another set of state equations. This paper provides a unified viewpoint on these methods and shows the equivalence of their reduction forms, which implies that the resulting state equations are essentially equivalent and can be converted into the same expression through parameter transformations. Combining these results, we show that these different state equations are derived from several equivalent reduction forms. We believe this equivalence sheds light on discovering a deeper structure in high-dimensional statistics.", "keywords": "Approximate message passing;Lasso;High dimensional statistics", "primary_area": "", "supplementary_material": "", "author": "Saidi Luo;Songtao Tian;Qian Lin", "authorids": "~Saidi_Luo1;~Songtao_Tian1;~Qian_Lin2", "gender": "M;M;M", "homepage": ";;https://sites.google.com/site/qianlincd/", "dblp": ";;79/3108", "google_scholar": ";https://scholar.google.com.hk/citations?user=SqdVztsAAAAJ;kHPrqdgAAAAJ", "orcid": ";;", "linkedin": "%E8%B5%9B%E8%BF%AA-%E7%BD%97-4b8214222/;%E6%9D%BE%E6%B6%9B-%E7%94%B0-824210222/;", "or_profile": "~Saidi_Luo1;~Songtao_Tian1;~Qian_Lin2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nluo2022equivalence,\ntitle={Equivalence of State Equations from Different Methods in High-dimensional Regression},\nauthor={Saidi Luo and Songtao Tian and Qian Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=Bd8JSwLVWQ5}\n}", "github": "", "project": "", "reviewers": "rXB2;12zP;U2rr;e98M", "site": "https://openreview.net/forum?id=Bd8JSwLVWQ5", "pdf_size": 0, "recommendation": "1;3;3;8", "confidence": "5;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "1;2;2;4", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "50;100;42;101", "wc_summary_review": "151;46;11;111", "wc_main_review": "514;157;287;194", "wc_review": "715;303;340;406", "wc_reply_reviewers": "15;101;0;0", "wc_reply_authors": "717;372;569;572", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 2.5860201081971503 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 73.25, 27.39867697535777 ], "wc_summary_review_avg": [ 79.75, 54.58651390224512 ], "wc_main_review_avg": [ 288.0, 138.81102261708182 ], "wc_review_avg": [ 441.0, 162.4392194022121 ], "wc_reply_reviewers_avg": [ 29.0, 42.017853348308975 ], "wc_reply_authors_avg": [ 557.5, 122.67130878897477 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6139601294045424, "corr_recommendation_correctness": 0.7287050466613612, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:R-80-i8t-8AJ:scholar.google.com/&scioq=Equivalence+of+State+Equations+from+Different+Methods+in+High-dimensional+Regression&hl=en&as_sdt=0,5", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "BdPhV0Y6qkk", "title": "InterTrain: Accelerating DNN Training using Input Interpolation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Training Deep Neural Networks (DNNs) places immense compute requirements on the underlying hardware platforms, expending large amounts of time and energy. An important factor contributing to the long training times is the increasing dataset complexity required to reach state-of-the-art performance in real-world applications. To this end, we propose to reduce training runtimes by combining a subset of inputs in the training dataset via an interpolation operation. The goal is for training on the interpolated input to achieve a similar effect as training separately on each the constituent inputs that it represents. This results in a lower number of inputs (or mini-batches) to be processed in each epoch. However, we find that naively interpolating inputs leads to a considerable drop in learning performance and model accuracy. This is because the efficacy of learning on interpolated inputs is reduced by the interference between the forward/backward propagation of their constituent inputs. We propose two strategies to address this challenge and realize training speedups with minimal impact on accuracy. First, we reduce the impact of interference by exploiting the spatial separation between the features of the constituent inputs in the network\u2019s intermediate representations. We also adaptively vary the weightage of constituent inputs based on their loss in previous epochs. Second, we propose loss-based metrics to automatically identify the subset of the training dataset that is subject to interpolation in each epoch. For ResNets of varying depth and MobileNetV2, we obtain upto 1.6x and 1.8x speed-ups in training for the ImageNet and Cifar10 datasets, respectively, on an Nvidia RTX 2080Ti GPU, with negligible loss in classification accuracy. 
\n\n\n", "keywords": "Efficient DNN Training", "primary_area": "", "supplementary_material": "", "author": "Sarada Krithivasan;Swagath Venkataramani;Sanchari Sen;Anand Raghunathan", "authorids": "~Sarada_Krithivasan1;~Swagath_Venkataramani2;sanchari.sen@ibm.com;~Anand_Raghunathan1", "gender": "F;;;", "homepage": ";;;https://engineering.purdue.edu/~araghu/", "dblp": ";;;74/3747.html", "google_scholar": ";l2RQ_S8AAAAJ;;OP7F8jEAAAAJ", "orcid": ";;;", "linkedin": "https://www.linkedin.com/public-profile/in/sarada-krithivasan-07534a140?challengeId=AQFDneomrWAQDwAAAXTHrOpLW0SbE1JOenD8VUJsG7ck5nE43Oc2dQgYO7K62iHBUunN5QpK0LjFU9zlb7Toh-1Kxm7ou2Ti7A&submissionId=92b67a2a-ce2b-3816-5e0e-1676a0a76d6d;;;", "or_profile": "~Sarada_Krithivasan1;~Swagath_Venkataramani2;sanchari.sen@ibm.com;~Anand_Raghunathan1", "aff": "Purdue University;;;Purdue University", "aff_domain": "purdue.edu;;;purdue.edu", "position": "Graduate Research Assistant;;;Full Professor", "bibtex": "@misc{\nkrithivasan2022intertrain,\ntitle={InterTrain: Accelerating {DNN} Training using Input Interpolation},\nauthor={Sarada Krithivasan and Swagath Venkataramani and Sanchari Sen and Anand Raghunathan},\nyear={2022},\nurl={https://openreview.net/forum?id=BdPhV0Y6qkk}\n}", "github": "", "project": "", "reviewers": "V2BX;LecF;fcVp;URmj", "site": "https://openreview.net/forum?id=BdPhV0Y6qkk", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;1;1", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "64;141;67;61", "wc_summary_review": "65;148;7;43", "wc_main_review": "448;496;584;222", "wc_review": "577;785;658;326", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 83.25, 33.40939239196068 ], "wc_summary_review_avg": [ 65.75, 51.80431931798738 ], "wc_main_review_avg": [ 437.5, 133.6366341988603 ], "wc_review_avg": [ 586.5, 167.6790088234064 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6jxhNqoQ2twJ:scholar.google.com/&scioq=InterTrain:+Accelerating+DNN+Training+using+Input+Interpolation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "BduNVoPyXBK", "title": "Task-driven Discovery of Perceptual Schemas for Generalization in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep reinforcement learning (Deep RL) has recently seen significant progress in developing algorithms for generalization. However, most algorithms target a single type of generalization setting. 
In this work, we study generalization across three disparate task structures: (a) tasks composed of spatial and temporal compositions of regularly occurring object motions; (b) tasks composed of active perception of and navigation towards regularly occurring 3D objects; and (c) tasks composed of navigating through sequences of regularly occurring object-configurations. These diverse task structures all share an underlying idea of compositionality: task completion always involves combining reoccurring segments of task-oriented perception and behavior. We hypothesize that an agent can generalize within a task structure if it can discover representations that capture these reoccurring task-segments. For our tasks, this corresponds to representations for recognizing individual object motions, for navigation towards 3D objects, and for navigating through object-configurations. Taking inspiration from cognitive science, we term representations for reoccurring segments of an agent's experience, \"perceptual schemas\". We propose Composable Perceptual Schemas (CPS), which learns a composable state representation where perceptual schemas are distributed across multiple, relatively small recurrent \"subschema\" modules. Our main technical novelty is an expressive attention function that enables subschemas to dynamically attend to features shared across all positions in the agent's observation. Our experiments indicate our feature-attention mechanism enables CPS to generalize better than recurrent architectures that attend to observations with spatial attention.", "keywords": "deep reinforcement learning;reinforcement learning;deep learning;compositional generalization;generalization;recurrent architecture", "primary_area": "", "supplementary_material": "/attachment/26aa36884123289a127950d77be9a92e526b5c00.zip", "author": "Wilka Torrico Carvalho;Andrew Kyle Lampinen;Kyriacos Nikiforou;Felix Hill;Murray Shanahan", "authorids": "~Wilka_Torrico_Carvalho1;~Andrew_Kyle_Lampinen1;~Kyriacos_Nikiforou1;~Felix_Hill1;~Murray_Shanahan1", "gender": "M;M;M;;M", "homepage": "https://wcarvalho.github.io/;https://github.com/google/BIG-bench;;https://fh295.github.io/;https://www.doc.ic.ac.uk/~mpsha/", "dblp": "230/3919;https://dblp.uni-trier.de/pers/hd/l/Lampinen:Andrew_K=;;116/0509;11/5268", "google_scholar": "tvJTXwoAAAAJ;_N44XxAAAAAJ;;https://scholar.google.co.uk/citations?user=4HLUnhIAAAAJ;https://scholar.google.co.uk/citations?user=00bnGpAAAAAJ", "orcid": ";;0000-0002-1504-5725;;0000-0001-5984-2964", "linkedin": "wilkacarvalho;;;;", "or_profile": "~Wilka_Torrico_Carvalho1;~Andrew_Kyle_Lampinen1;~Kyriacos_Nikiforou1;~Felix_Hill1;~Murray_Shanahan1", "aff": "Google;Google DeepMind;Google DeepMind;Google;Imperial College London", "aff_domain": "google.com;google.com;deepmind.com;google.com;", "position": "Research Scientist Intern;Research Scientist;Researcher;Researcher;Full Professor", "bibtex": "@misc{\ncarvalho2022taskdriven,\ntitle={Task-driven Discovery of Perceptual Schemas for Generalization in Reinforcement Learning},\nauthor={Wilka Torrico Carvalho and Andrew Kyle Lampinen and Kyriacos Nikiforou and Felix Hill and Murray Shanahan},\nyear={2022},\nurl={https://openreview.net/forum?id=BduNVoPyXBK}\n}", "github": "", "project": "", "reviewers": "4R1W;Y5Kp;jU4K", "site": "https://openreview.net/forum?id=BduNVoPyXBK", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;3", "wc_summary_paper": "67;334;268", 
"wc_summary_review": "115;343;72", "wc_main_review": "759;712;1044", "wc_review": "941;1389;1384", "wc_reply_reviewers": "16;228;99", "wc_reply_authors": "593;1006;927", "reply_reviewers": "1;1;1", "reply_authors": "1;2;2", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 223.0, 113.55175031676086 ], "wc_summary_review_avg": [ 176.66666666666666, 118.91827258901618 ], "wc_main_review_avg": [ 838.3333333333334, 146.6886347184251 ], "wc_review_avg": [ 1238.0, 210.02063390692513 ], "wc_reply_reviewers_avg": [ 114.33333333333333, 87.2251238017019 ], "wc_reply_authors_avg": [ 842.0, 178.9990688988819 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3Tyk5oL4a0EJ:scholar.google.com/&scioq=Task-driven+Discovery+of+Perceptual+Schemas+for+Generalization+in+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Google;Imperial College London", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.imperial.ac.uk", "aff_unique_abbr": "Google;ICL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "BefW4ttKMFt", "title": "Meta Learning with Minimax Regularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Even though meta-learning has attracted research wide attention in recent years, the generalization problem of meta-learning is still not well addressed. Existing works focus on meta-generalization to unseen tasks at the meta-level, while ignoring that adapted-models may not be generalized to the tasks domain at the adaptation-level, which can not be solved trivially. To this end, we propose a new regularization mechanism for meta-learning -- Minimax-Meta Regularization. Especially, we maximize the regularizer in the inner-loop to encourage the adapted-model to be more sensitive to the new task, and minimize the regularizer in the outer-loop to resist overfitting of the meta-model. This adversarial regularization forces the meta-algorithm to maintain generality at the meta-level while it is easy to learn specific assumptions at the task-specific level, thereby improving the generalization of meta-learning. We conduct extensive experiments on the representative meta-learning scenarios to verify our proposed method, including few-shot learning and robust reweighting. 
The results show that our method consistently improves the performance of the meta-learning algorithms and demonstrates the effectiveness of Minimax-Meta Regularization.", "keywords": "meta learning;generalization;minimax regularization", "primary_area": "", "supplementary_material": "/attachment/4f5b25651eafe5c8e79109f268f636cde02ef24d.zip", "author": "Lianzhe Wang;Shiji Zhou;Shanghang Zhang;Wenpeng Zhang;Heng Chang;Wenwu Zhu", "authorids": "~Lianzhe_Wang1;~Shiji_Zhou1;~Shanghang_Zhang4;~Wenpeng_Zhang1;~Heng_Chang2;~Wenwu_Zhu1", "gender": ";M;;M;M;M", "homepage": ";https://arnoldshijizhou.github.io;;;https://hchang95.github.io;http://media.cs.tsinghua.edu.cn/en/zww", "dblp": ";294/8684;;203/4474.html;79/5668;97/6308-1.html", "google_scholar": ";Do5jf8oAAAAJ;;EMMkuFMAAAAJ;e9NeskoAAAAJ;https://scholar.google.com.tw/citations?user=7t2jzpgAAAAJ", "orcid": ";0009-0000-0677-7396;;;0000-0002-4978-8041;0000-0003-2236-9290", "linkedin": ";shiji-zhou-05b766ba/;;;;", "or_profile": "~Lianzhe_Wang1;~Shiji_Zhou1;~Shanghang_Zhang4;~Wenpeng_Zhang1;~Heng_Chang2;~Wenwu_Zhu1", "aff": ";Tsinghua University;;Ant Group;Tsinghua-Berkeley Shenzhen Institute;Tsinghua University", "aff_domain": ";mails.tsinghua.edu.cn;;ant.com;mails.tsinghua.edu.cn;tsinghua.edu.cn", "position": ";PhD student;;Researcher;PhD student;Full Professor", "bibtex": "@misc{\nwang2022meta,\ntitle={Meta Learning with Minimax Regularization},\nauthor={Lianzhe Wang and Shiji Zhou and Shanghang Zhang and Wenpeng Zhang and Heng Chang and Wenwu Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=BefW4ttKMFt}\n}", "github": "", "project": "", "reviewers": "FMen;TQeK;Ehei;Q8fH", "site": "https://openreview.net/forum?id=BefW4ttKMFt", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;3;3", "correctness": "1;3;3;4", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "68;13;19;83", "wc_summary_review": "63;14;39;88", "wc_main_review": "343;190;335;160", "wc_review": "474;217;393;331", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 45.75, 30.293357357678268 ], "wc_summary_review_avg": [ 51.0, 27.504545078950134 ], "wc_main_review_avg": [ 257.0, 82.73149339882606 ], "wc_review_avg": [ 353.75, 93.83329632918158 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WCt226ZRRB4J:scholar.google.com/&scioq=Meta+Learning+with+Minimax+Regularization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Tsinghua University;Ant Group;Tsinghua-Berkeley Shenzhen Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.antgroup.com;http://www.tbsi.edu.cn/", "aff_unique_abbr": "THU;Ant Group;TBSI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "Bel1Do_eZC", "title": "Inductive 
Lottery Ticket Learning for Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep graph neural networks (GNNs) have gained increasing popularity, while usually suffering from unaffordable computations for real-world large-scale applications. Hence, pruning GNNs is greatly needed but largely unexplored. A recent work, UGS, studies lottery ticket learning for GNNs, aiming to find a subset of model parameters and graph structure that can best maintain the GNN performance. However, it is tailored to the transductive setting, failing to generalize to unseen graphs, which are common in inductive tasks like graph classification. In this work, we propose a simple and effective learning paradigm, Inductive Co-Pruning of GNNs (ICPG), to endow graph lottery tickets with inductive pruning capacity. To prune the input graphs, we design a generative probabilistic model to generate importance scores for each edge based on the input; to prune the model parameters, it views the weights' magnitudes as their importance scores. Then we design an iterative co-pruning strategy to trim the graph edges and GNN weights based on their importance scores. Although it might be strikingly simple, ICPG surpasses the existing pruning method and is universally applicable in both inductive and transductive learning settings. On ten graph-classification and two node-classification benchmarks, ICPG achieves the same performance level with $14.26\\%\\sim43.12\\%$ sparsity for graphs and $48.80\\%\\sim91.41\\%$ sparsity for the model.", "keywords": "Lottery Ticket Hypothesis;Graph Neural Networks;Neural Network Pruning", "primary_area": "", "supplementary_material": "/attachment/12612318398f07d5facab3214a3f28abc05323d8.zip", "author": "Yongduo Sui;Xiang Wang;Tianlong Chen;Xiangnan He;Tat-Seng Chua", "authorids": "~Yongduo_Sui1;~Xiang_Wang6;~Tianlong_Chen1;~Xiangnan_He1;~Tat-Seng_Chua2", "gender": "M;M;M;M;M", "homepage": "https://yongduosui.github.io/;https://github.com/xiangwang1223;https://tianlong-chen.github.io;http://staff.ustc.edu.cn/~hexn;http://www.comp.nus.edu.sg/~chuats/", "dblp": "277/5175;31/2864-10;;59/1007;", "google_scholar": "VD9g6ogAAAAJ;https://scholar.google.com.sg/citations?user=HdhaQB0AAAAJ;LE3ctn0AAAAJ;https://scholar.google.com.sg/citations?user=X45Go24AAAAJ;https://scholar.google.com.tw/citations?user=Z9DWCBEAAAAJ", "orcid": "0000-0003-4492-147X;0000-0002-6148-6329;0000-0001-7774-8197;0000-0001-8472-7992;0000-0001-6097-7807", "linkedin": "yongduosui/;;tianlong-chen-783862167/;;", "or_profile": "~Yongduo_Sui1;~Xiang_Wang6;~Tianlong_Chen1;~Xiangnan_He1;~Tat-seng_Chua1", "aff": "University of Science and Technology of China;National University of Singapore;University of Texas, Austin;University of Science and Technology of China;National University of Singapore", "aff_domain": "ustc.edu.cn;nus.edu.sg;utexas.edu;ustc.edu.cn;nus.edu.sg", "position": "PhD student;Postdoc;PhD student;Professor;Full Professor", "bibtex": "@misc{\nsui2022inductive,\ntitle={Inductive Lottery Ticket Learning for Graph Neural Networks},\nauthor={Yongduo Sui and Xiang Wang and Tianlong Chen and Xiangnan He and Tat-Seng Chua},\nyear={2022},\nurl={https://openreview.net/forum?id=Bel1Do_eZC}\n}", "github": "", "project": "", "reviewers": "HhcH;FXPK;4F4b;6qg1", "site": "https://openreview.net/forum?id=Bel1Do_eZC", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper":
"58;58;143;40", "wc_summary_review": "45;127;61;57", "wc_main_review": "433;195;132;183", "wc_review": "536;380;336;280", "wc_reply_reviewers": "201;0;0;0", "wc_reply_authors": "3094;3012;1492;468", "reply_reviewers": "2;0;0;0", "reply_authors": "14;7;4;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 74.75, 40.08350658313217 ], "wc_summary_review_avg": [ 72.5, 32.01171660501823 ], "wc_main_review_avg": [ 235.75, 116.31288621644637 ], "wc_review_avg": [ 383.0, 95.17877914745492 ], "wc_reply_reviewers_avg": [ 50.25, 87.03555308033609 ], "wc_reply_authors_avg": [ 2016.5, 1098.2917417517078 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 6.75, 4.548351349665063 ], "replies_avg": [ 36, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12475572580303512720&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "University of Science and Technology of China;National University of Singapore;University of Texas at Austin", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.nus.edu.sg;https://www.utexas.edu", "aff_unique_abbr": "USTC;NUS;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "China;Singapore;United States" }, { "title": "EViT: Expediting Vision Transformers via Token Reorganizations", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6168", "id": "BjyvwnXXVn_", "poster": "", "openreview": "https://openreview.net/forum?id=BjyvwnXXVn_", "slides": "https://iclr.cc/virtual/2022/poster/6168", "video": "https://iclr.cc/virtual/2022/poster/6168", "author_site": "Youwei Liang, Chongjian GE, Zhan Tong, Yibing Song, Jue Wang, Pengtao Xie", "tldr": "", "abstract": "Vision Transformers (ViTs) take all the image patches as tokens and construct multi-head self-attention (MHSA) among them. Complete leverage of these image tokens brings redundant computations since not all the tokens are attentive in MHSA. Examples include that tokens containing semantically meaningless or distractive image backgrounds do not positively contribute to the ViT predictions. In this work, we propose to reorganize image tokens during the feed-forward process of ViT models, which is integrated into ViT during training. For each forward inference, we identify the attentive image tokens between MHSA and FFN (i.e., feed-forward network) modules, which is guided by the corresponding class token attention. Then, we reorganize image tokens by preserving attentive image tokens and fusing inattentive ones to expedite subsequent MHSA and FFN computations. To this end, our method EViT improves ViTs from two perspectives. First, under the same amount of input image tokens, our method reduces MHSA and FFN computation for efficient inference. For instance, the inference speed of DeiT-S is increased by 50% while its recognition accuracy is decreased by only 0.3% for ImageNet classification. 
Second, by maintaining the same computational cost, our method empowers ViTs to take more image tokens as input to improve recognition accuracy, where the image tokens come from higher-resolution images. For example, we improve the recognition accuracy of DeiT-S by 1% for ImageNet classification at the same computational cost as a vanilla DeiT-S. Meanwhile, our method introduces no additional parameters to ViTs. Experiments on standard benchmarks show the effectiveness of our method. The code is available at https://github.com/youweiliang/evit", "keywords": "Vision Transformers;multi-head self-attention;efficient inference", "primary_area": "", "supplementary_material": "", "author": "Youwei Liang;Chongjian GE;Zhan Tong;Yibing Song;Jue Wang;Pengtao Xie", "authorids": "~Youwei_Liang1;~Chongjian_GE1;~Zhan_Tong1;~Yibing_Song1;~Jue_Wang2;~Pengtao_Xie3", "gender": "M;M;M;;M;M", "homepage": "https://youweiliang.github.io/;https://chongjiange.github.io;https://github.com/yztongzhan;https://ybsong00.github.io/;https://juewang725.github.io/;https://pengtaoxie.github.io/", "dblp": "257/5626;287/4197;236/0753;77/2117;;133/1998", "google_scholar": "zMofZR4AAAAJ;https://scholar.google.com.hk/citations?user=7DA_vcUAAAAJ;6FsgWBMAAAAJ;oRhJHmIAAAAJ;Bt4uDWMAAAAJ;cnncomYAAAAJ", "orcid": ";;0000-0002-3169-0599;;;", "linkedin": ";chongjian-ge-%EF%BC%88%E8%91%9B%E5%B4%87%E5%89%91%EF%BC%89-3b393310b/;;;;", "or_profile": "~Youwei_Liang1;~Chongjian_GE1;~Zhan_Tong1;~Yibing_Song1;~Jue_Wang2;~Pengtao_Xie3", "aff": "Meta AI;The University of Hong Kong;Nanjing University;Tencent AI Lab;Tencent AI Lab;Carnegie Mellon University", "aff_domain": "meta.com;hku.hk;nju.edu.cn;tencent.com;tencent.com; ", "position": "Research intern;PhD student;MS student;Senior Researcher;Director;Graduate Student", "bibtex": "@inproceedings{\nliang2022evit,\ntitle={{EV}iT: Expediting Vision Transformers via Token Reorganizations},\nauthor={Youwei Liang and Chongjian GE and Zhan Tong and Yibing Song and Jue Wang and Pengtao Xie},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BjyvwnXXVn_}\n}", "github": "", "project": "", "reviewers": "soMj;WtKh;y9ec;SSye", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "5;5;4;4", "correctness": "3;3;4;4", "technical_novelty": "4;4;4;4", "empirical_novelty": "4;3;4;4", "wc_summary_paper": "65;31;48;76", "wc_summary_review": "36;34;38;20", "wc_main_review": "372;357;235;203", "wc_review": "473;422;321;299", "wc_reply_reviewers": "41;92;0;21", "wc_reply_authors": "657;1401;397;899", "reply_reviewers": "1;1;0;1", "reply_authors": "1;2;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 4.0, 0.0 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.0, 17.073371078963874 ], "wc_summary_review_avg": [ 32.0, 7.0710678118654755 ], "wc_main_review_avg": [ 291.75, 73.81522539422338 ], "wc_review_avg": [ 378.75, 71.4995629357271 ], "wc_reply_reviewers_avg": [ 38.5, 34.121107836645635 ], "wc_reply_authors_avg": [ 838.5, 370.11180743121395 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=BjyvwnXXVn_", "email":
"meta.com;hku.hk;nju.edu.cn;tencent.com;tencent.com; ", "author_num": 6, "aff_unique_index": "0;1;2;3;3;4", "aff_unique_norm": "Meta;University of Hong Kong;Nanjing University;Tencent;Carnegie Mellon University", "aff_unique_dep": "Meta AI;;;Tencent AI Lab;", "aff_unique_url": "https://meta.com;https://www.hku.hk;https://www.nju.edu.cn;https://ai.tencent.com;https://www.cmu.edu", "aff_unique_abbr": "Meta;HKU;Nanjing U;Tencent AI Lab;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1;1;1;0", "aff_country_unique": "United States;China" }, { "id": "BkIV7EOXkSs", "title": "Implicit Regularization of Bregman Proximal Point Algorithm and Mirror Descent on Separable Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bregman proximal point algorithm (BPPA), as one of the centerpieces in the optimization toolbox, has been witnessing emerging applications. With a simple and easy-to-implement update rule, the algorithm bears several compelling intuitions for empirical successes, yet rigorous justifications are still largely unexplored. We study the computational properties of BPPA through classification tasks with separable data, and demonstrate provable algorithmic regularization effects associated with BPPA. We show that BPPA attains a non-trivial margin, which closely depends on the condition number of the distance-generating function inducing the Bregman divergence. We further demonstrate that the dependence on the condition number is tight for a class of problems, thus showing the importance of divergence in affecting the quality of the obtained solutions. In addition, we extend our findings to mirror descent (MD), for which we establish similar connections between the margin and Bregman divergence. We demonstrate through a concrete example, and show BPPA/MD converges in direction to the maximal margin solution with respect to the squared Mahalanobis distance. Our theoretical findings are among the first to demonstrate the benign learning properties of BPPA/MD, and also provide strong corroborations \nfor a careful choice of divergence in the algorithmic design. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/5fad29a9208538c308fd959dcb0a748c97f25774.zip", "author": "Yan Li;Caleb Ju;Ethan Fang;Tuo Zhao", "authorids": "~Yan_Li9;~Caleb_Ju2;~Ethan_Fang1;~Tuo_Zhao1", "gender": "M;M;M;M", "homepage": "https://gzliyan113.github.io/;https://www.jucaleb4.github.io;;http://www2.isye.gatech.edu/~tzhao80", "dblp": ";https://dblp.uni-trier.de/pid/251/9040.html;223/3184;", "google_scholar": "wLfoeakAAAAJ;;;EJXN6tYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yan_Li9;~Caleb_Ju2;~Ethan_Fang1;~Tuo_Zhao1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Pennsylvania State University;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;psu.edu;gatech.edu", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@misc{\nli2022implicit,\ntitle={Implicit Regularization of Bregman Proximal Point Algorithm and Mirror Descent on Separable Data},\nauthor={Yan Li and Caleb Ju and Ethan Fang and Tuo Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=BkIV7EOXkSs}\n}", "github": "", "project": "", "reviewers": "ZTvX;nntq;Mehh;yiCx", "site": "https://openreview.net/forum?id=BkIV7EOXkSs", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;2;2", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "72;75;42;130", "wc_summary_review": "34;39;51;50", "wc_main_review": "556;208;173;150", "wc_review": "662;322;266;330", "wc_reply_reviewers": "342;0;62;70", "wc_reply_authors": "2707;642;918;301", "reply_reviewers": "1;0;1;1", "reply_authors": "6;1;3;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.75, 31.751968442917047 ], "wc_summary_review_avg": [ 43.5, 7.22841614740048 ], "wc_main_review_avg": [ 271.75, 165.40612896745998 ], "wc_review_avg": [ 395.0, 156.1121391820636 ], "wc_reply_reviewers_avg": [ 118.5, 131.8512419357512 ], "wc_reply_authors_avg": [ 1142.0, 929.607712962839 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 2.0463381929681126 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14123735841473761504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Georgia Institute of Technology;Pennsylvania State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.psu.edu", "aff_unique_abbr": "Georgia Tech;PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "cosFormer: Rethinking Softmax In Attention", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6040", "id": "Bl8CQrx2Up4", "poster": "", "openreview": "https://openreview.net/forum?id=Bl8CQrx2Up4", "slides": "https://iclr.cc/virtual/2022/poster/6040", "video": "https://iclr.cc/virtual/2022/poster/6040", "author_site": "Qin Zhen, Weixuan Sun, Hui Deng, DONGXU LI, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, Yiran Zhong", "tldr": "", "abstract": 
"Transformer has shown great successes in natural language processing, computer vision, and audio processing. As one of its core components, the softmax attention helps to capture long-range dependencies yet prohibits its scale-up due to the quadratic space and time complexity to the sequence length. Kernel methods are often adopted to reduce the complexity by approximating the softmax operator. Nevertheless, due to the approximation errors, their performances vary in different tasks/corpus and suffer crucial performance drops when compared with the vanilla softmax attention. In this paper, we propose a linear transformer called cosFormer that can achieve comparable or better accuracy to the vanilla transformer in both casual and cross attentions. cosFormer is based on two key properties of softmax attention: i). non-negativeness of the attention matrix; ii). a non-linear re-weighting scheme that can concentrate the distribution of the attention matrix. As its linear substitute, cosFormer fulfills these properties with a linear operator and a cosine-based distance re-weighting mechanism. Extensive experiments on language modeling and text understanding tasks demonstrate the effectiveness of our method. We further examine our method on long sequences and achieve state-of-the-art performance on the Long-Range Arena benchmark. The source code is available at https://github.com/OpenNLPLab/cosFormer.", "keywords": "Linear Transformer;softmax attention", "primary_area": "", "supplementary_material": "", "author": "Zhen Qin;Weixuan Sun;Hui Deng;Dongxu Li;Yunshen Wei;Baohong Lv;Junjie Yan;Lingpeng Kong;Yiran Zhong", "authorids": "~Zhen_Qin6;~Weixuan_Sun1;~Hui_Deng2;~Dongxu_Li1;~Yunshen_Wei1;~Baohong_Lv1;~Junjie_Yan3;~Lingpeng_Kong1;~Yiran_Zhong1", "gender": ";M;M;M;M;M;;M;M", "homepage": "https://github.com/Doraemonzzz;https://weixuansun.github.io/weixuansun-github.io/;https://www.researchgate.net/profile/Hui-Deng-24;https://sites.google.com/view/dongxu-li/home;https://www.zhihu.com/people/wei-yun-shen;https://github.com/lkjx82;;https://ikekonglp.github.io/;", "dblp": ";186/6724;88/2704;;;;http://dblp.uni-trier.de/pers/hd/y/Yan:Junjie;144/7656;158/9624", "google_scholar": "https://scholar.google.com.sg/citations?user=IcBRtycAAAAJ;vIS56AoAAAAJ;;https://scholar.google.com/citations?view_op=list_works;;;https://scholar.google.com.hk/citations?user=rEYarG0AAAAJ;f1hBi5wAAAAJ;https://scholar.google.com.sg/citations?user=E9NVOBUAAAAJ", "orcid": ";;0009-0009-5985-3976;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Zhen_Qin6;~Weixuan_Sun1;~Hui_Deng2;~Dongxu_Li1;~Yunshen_Wei1;~Baohong_Lv1;~Junjie_Yan3;~Lingpeng_Kong1;~Yiran_Zhong1", "aff": "Sensetime;Australian National University;Northwestern Polytechnical University;Australian National University;sensetime;SenseTime;SenseTime Group Limited;Department of Computer Science, The University of Hong Kong;SenseTime", "aff_domain": "sensetime.com;anu.edu.au;nwpu.edu.cn;anu.edu.au;sensetime.com;sensetime.com; ;cs.hku.hk;sensetime.com", "position": "Researcher;PhD student;MS student;PhD student;Senior R&D Manager;Deputy Director of R&D;Research Scientist;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nzhen2022cosformer,\ntitle={cosFormer: Rethinking Softmax In Attention},\nauthor={Zhen Qin and Weixuan Sun and Hui Deng and Dongxu Li and Yunshen Wei and Baohong Lv and Junjie Yan and Lingpeng Kong and Yiran Zhong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Bl8CQrx2Up4}\n}", 
"github": "", "project": "", "reviewers": "M72S;ZpU1;iBUe;yo75", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;5;5;3", "correctness": "3;2;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "148;53;59;108", "wc_summary_review": "7;11;43;62", "wc_main_review": "317;109;330;210", "wc_review": "472;173;432;380", "wc_reply_reviewers": "94;394;0;0", "wc_reply_authors": "487;735;310;263", "reply_reviewers": "2;3;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 92.0, 38.73628789649313 ], "wc_summary_review_avg": [ 30.75, 22.807619340913245 ], "wc_main_review_avg": [ 241.5, 89.55584849690165 ], "wc_review_avg": [ 364.25, 115.13551797772918 ], "wc_reply_reviewers_avg": [ 122.0, 161.6601373251922 ], "wc_reply_authors_avg": [ 448.75, 185.17339846749047 ], "reply_reviewers_avg": [ 1.25, 1.299038105676658 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 290, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11701536560712216954&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Bl8CQrx2Up4", "email": "sensetime.com;anu.edu.au;nwpu.edu.cn;anu.edu.au;sensetime.com;sensetime.com; ;cs.hku.hk;sensetime.com", "author_num": 9, "aff_unique_index": "0;1;2;1;0;0;3;4;0", "aff_unique_norm": "SenseTime;Australian National University;Northwestern Polytechnical University;SenseTime Group Limited;University of Hong Kong", "aff_unique_dep": ";;;;Department of Computer Science", "aff_unique_url": "https://www.sensetime.com;https://www.anu.edu.au;https://www.nwpu.edu.cn;https://www.sensetime.com;https://www.hku.hk", "aff_unique_abbr": "SenseTime;ANU;NWPU;SenseTime;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;1;0;0;0;0;0", "aff_country_unique": "China;Australia" }, { "id": "BlyXYc4wF2-", "title": "Multi-Agent Constrained Policy Optimisation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Developing reinforcement learning algorithms that satisfy safety constraints is becoming increasingly important in real-world applications. In multi-agent reinforcement learning (MARL) settings, policy optimisation with safety awareness is particularly challenging because each individual agent has to not only meet its own safety constraints, but also consider those of others so that their joint behaviour can be guaranteed safe. Despite its importance, the problem of safe multi-agent learning has not been rigorously studied; very few solutions have been proposed, nor a sharable testing environment or benchmarks. To fill these gaps, in this work, we formulate the safe MARL problem as a constrained Markov game and solve it with policy optimisation methods. Our solutions---Multi-Agent Constrained Policy Optimisation (MACPO) and MAPPO-Lagrangian---leverage the theories from both constrained policy optimisation and multi-agent trust region learning. Crucially, our methods enjoy theoretical guarantees of both monotonic improvement in reward and satisfaction of safety constraints at every iteration. 
To examine the effectiveness of our methods, we develop the Safe Multi-Agent MuJoCo benchmark suite, which includes a variety of MARL baselines. Experimental results show that MACPO/MAPPO-Lagrangian consistently satisfy safety constraints while achieving performance comparable to strong baselines.", "keywords": "Safe Multi-Agent Reinforcement Learning;Safe Multi-Agent Trust Region Policy Optimisation;Safe Multi-Agent Proximal Policy Optimisation;Constrained Policy Optimisation", "primary_area": "", "supplementary_material": "", "author": "Shangding Gu;Jakub Grudzien Kuba;Muning Wen;Ruiqing Chen;Ziyan Wang;Zheng Tian;Jun Wang;Alois Knoll;Yaodong Yang", "authorids": "~Shangding_Gu1;~Jakub_Grudzien_Kuba1;~Muning_Wen2;~Ruiqing_Chen1;~Ziyan_Wang3;~Zheng_Tian1;~Jun_Wang2;~Alois_Knoll1;~Yaodong_Yang1", "gender": "M;;M;;M;M;M;M;M", "homepage": "https://people.eecs.berkeley.edu/~shangding.gu/index.html;;https://github.com/morning9393;;https://ziyan-wang98.github.io/;;http://www0.cs.ucl.ac.uk/staff/jun.wang/;https://www.in.tum.de/i06/people/prof-dr-ing-habil-alois-knoll/;https://www.yangyaodong.com", "dblp": "268/8183;;295/0261;;;17/2752-2.html;w/JunWang12;k/AloisKnoll;170/1496-1", "google_scholar": "E1GCDXUAAAAJ;;Zt1WFtQAAAAJ;;1Yu8JFIAAAAJ;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;https://scholar.google.de/citations?user=-CA8QgwAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ", "orcid": ";;0009-0000-7868-1262;;;;;0000-0003-4840-076X;0000-0001-8132-5613", "linkedin": ";;;;;;;alois-knoll-505480166;yaodong-yang", "or_profile": "~Shangding_Gu1;~Jakub_Grudzien_Kuba1;~Muning_Wen2;~Ruiqing_Chen1;~Ziyan_Wang3;~Zheng_Tian1;~Jun_Wang2;~Alois_Knoll1;~Yaodong_Yang1", "aff": ";;Shanghai Jiaotong University;;King's College London;ShanghaiTech University;University College London;Technical University Munich;King's College London", "aff_domain": ";;sjtu.edu.cn;;kcl.ac.uk;shanghaitech.edu.cn;ucl.ac.uk;tum.de;kcl.ac.uk", "position": ";;PhD student;;PhD student;Assistant Professor;Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\ngu2022multiagent,\ntitle={Multi-Agent Constrained Policy Optimisation },\nauthor={Shangding Gu and Jakub Grudzien Kuba and Muning Wen and Ruiqing Chen and Ziyan Wang and Zheng Tian and Jun Wang and Alois Knoll and Yaodong Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=BlyXYc4wF2-}\n}", "github": "", "project": "", "reviewers": "Aw6n;rBTj;MyQj", "site": "https://openreview.net/forum?id=BlyXYc4wF2-", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "3;3;3", "correctness": "3;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "2;3;2", "wc_summary_paper": "24;61;81", "wc_summary_review": "17;46;17", "wc_main_review": "171;214;240", "wc_review": "212;321;338", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "590;567;322", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 55.333333333333336, 23.612614331233114 ], "wc_summary_review_avg": [ 26.666666666666668, 13.670731102939918 ], "wc_main_review_avg": [ 208.33333333333334, 28.4526897771644 ], "wc_review_avg": [ 290.3333333333333, 55.82313339666829 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 493.0, 121.27929199441537 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [
1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14028582225757657685&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;4;1", "aff_unique_norm": "Shanghai Jiao Tong University;King's College London;ShanghaiTech University;University College London;Technical University of Munich", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.kcl.ac.uk;https://www.shanghaitech.edu.cn;https://www.ucl.ac.uk;https://www.tum.de", "aff_unique_abbr": "SJTU;KCL;ShanghaiTech;UCL;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;2;1", "aff_country_unique": "China;United Kingdom;Germany" }, { "title": "Towards Understanding the Robustness Against Evasion Attack on Categorical Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6787", "id": "BmJV7kyAmg", "poster": "", "openreview": "https://openreview.net/forum?id=BmJV7kyAmg", "slides": "https://iclr.cc/virtual/2022/poster/6787", "video": "https://iclr.cc/virtual/2022/poster/6787", "author_site": "Hongyan Bao, Yufei Han, Yujun Zhou, Yun Shen, Xiangliang Zhang", "tldr": "", "abstract": "Characterizing and assessing the adversarial vulnerability of classification models with categorical input has been a practically important, while rarely explored research problem. Our work echoes the challenge by first unveiling the impact factors of adversarial vulnerability of classification models with categorical data based on an information-theoretic adversarial risk analysis about the targeted classifier. Though certifying the robustness of such classification models is intrinsically an NP-hard combinatorial problem, our study shows that the robustness certification can be solved via an efficient greedy exploration of the discrete attack space for any measurable classifiers with a mild smoothness constraint. Our proposed robustness certification framework is instantiated with deep neural network models applied on real-world safety-critic data sources. 
Our empirical observations confirm the impact of the key adversarial risk factors with categorical input.", "keywords": "robustness certification;adversarial learning;categorical data", "primary_area": "", "supplementary_material": "", "author": "Hongyan Bao;Yufei Han;Yujun Zhou;Yun Shen;Xiangliang Zhang", "authorids": "~Hongyan_Bao1;~Yufei_Han1;~Yujun_Zhou1;~Yun_Shen2;~Xiangliang_Zhang1", "gender": ";M;M;F;M", "homepage": "https://mine.kaust.edu.sa/Pages/Hongyan.aspx;;https://yujunzhou.github.io/;https://sites.nd.edu/xiangliang-zhang/;https://uk.linkedin.com/in/yun-shen-24336257", "dblp": "234/6902;74/2507;162/3265-2;74/1890-1;", "google_scholar": ";xdCvBg0AAAAJ;t0c7rQQAAAAJ;BhRJe4wAAAAJ;Gx_JJ6cAAAAJ", "orcid": ";;0000-0003-1376-5187;0000-0002-3574-5665;", "linkedin": ";;yujun-zhou-zyj/;;", "or_profile": "~Hongyan_Bao1;~Yufei_Han1;~Yujun_Zhou1;~Xiangliang_Zhang1;~Yun_Shen3", "aff": "KAUST;INRIA;KAUST;University of Notre Dame;NortonLifeLock", "aff_domain": "kaust.edu.sa;inria.fr;kaust.edu.sa;nd.edu;nortonlifelock.com", "position": "PhD student;Researcher;MS student;Associate Professor;Technical Director", "bibtex": "@inproceedings{\nbao2022towards,\ntitle={Towards Understanding the Robustness Against Evasion Attack on Categorical Data},\nauthor={Hongyan Bao and Yufei Han and Yujun Zhou and Yun Shen and Xiangliang Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BmJV7kyAmg}\n}", "github": "", "project": "", "reviewers": "W7KQ;SsEb;FWFM", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;4", "correctness": "4;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "65;55;84", "wc_summary_review": "14;49;84", "wc_main_review": "311;172;317", "wc_review": "390;276;485", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 68.0, 12.027745701779143 ], "wc_summary_review_avg": [ 49.0, 28.577380332470412 ], "wc_main_review_avg": [ 266.6666666666667, 66.98424358674873 ], "wc_review_avg": [ 383.6666666666667, 85.44133790046959 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=537947518440614327&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=BmJV7kyAmg", "email": "kaust.edu.sa;inria.fr;kaust.edu.sa;nd.edu;nortonlifelock.com", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "King Abdullah University of Science and Technology;INRIA;University of Notre Dame;NortonLifeLock", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.kaust.edu.sa;https://www.inria.fr;https://www.nd.edu;https://www.nortonlifelock.com", "aff_unique_abbr": "KAUST;INRIA;Notre Dame;NortonLifeLock", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;2", "aff_country_unique": "Saudi Arabia;France;United States" }, { "title": "How to 
Inject Backdoors with Better Consistency: Logit Anchoring on Clean Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6256", "id": "Bn09TnDngN", "poster": "", "openreview": "https://openreview.net/forum?id=Bn09TnDngN", "slides": "https://iclr.cc/virtual/2022/poster/6256", "video": "https://iclr.cc/virtual/2022/poster/6256", "author_site": "Zhiyuan Zhang, Lingjuan Lyu, Weiqiang Wang, Lichao Sun, Xu Sun", "tldr": "", "abstract": "Since training a large-scale backdoored model from scratch requires a large training dataset, several recent attacks have considered injecting backdoors into a trained clean model without altering model behaviors on the clean data. Previous work finds that backdoors can be injected into a trained clean model with Adversarial Weight Perturbation (AWP), which means the variations of parameters are small in backdoor learning. In this work, we observe an interesting phenomenon that the variations of parameters are always AWPs when tuning the trained clean model to inject backdoors. We further provide theoretical analysis to explain this phenomenon. We are the first to formulate the behavior of maintaining accuracy on clean data as the consistency of backdoored models, which includes both global consistency and instance-wise consistency. We extensively analyze the effects of AWPs on the consistency of backdoored models. In order to achieve better consistency, we propose a novel anchoring loss to anchor or freeze the model behaviors on the clean data, with a theoretical guarantee. ", "keywords": "backdoor learning;weight perturbation;consistency", "primary_area": "", "supplementary_material": "", "author": "Zhiyuan Zhang;Lingjuan Lyu;Weiqiang Wang;Lichao Sun;Xu Sun", "authorids": "~Zhiyuan_Zhang1;~Lingjuan_Lyu1;~Weiqiang_Wang4;~Lichao_Sun1;~Xu_Sun1", "gender": "M;F;M;M;M", "homepage": "https://pkuzzy.github.io/;https://sites.google.com/view/lingjuan-lyu;https://www.linkedin.com/in/weiqiang-wang-489b925/;https://lichao-sun.github.io/;https://xusun.org/", "dblp": "https://dblp.uni-trier.de/pid/72/1760-1;178/9876;;121/0780-1.html;37/1971-1", "google_scholar": "gSEzCUkAAAAJ;;;WhGUE7AAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-6159-619X;;", "linkedin": ";;weiqiang-wang-489b925/;lichao-sun-b273a290/;", "or_profile": "~Zhiyuan_Zhang1;~Lingjuan_Lyu1;~Weiqiang_Wang4;~Lichao_Sun1;~Xu_Sun1", "aff": "Peking University;Sony;Ant Group;Lehigh University;Peking University", "aff_domain": "pku.edu.cn;sony.com;antgroup.com;lehigh.edu;pku.edu.cn", "position": "PhD student;scientist;Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2022how,\ntitle={How to Inject Backdoors with Better Consistency: Logit Anchoring on Clean Data},\nauthor={Zhiyuan Zhang and Lingjuan Lyu and Weiqiang Wang and Lichao Sun and Xu Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Bn09TnDngN}\n}", "github": "", "project": "", "reviewers": "oesM;RJ1F;GfFx;rNcw", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;4;4", "correctness": "4;3;4;4", "technical_novelty": "2;4;4;4", "empirical_novelty": "3;4;4;4", "wc_summary_paper": "45;113;74;134", "wc_summary_review": "30;86;109;98", "wc_main_review": "212;484;289;166", "wc_review": "287;683;472;398", "wc_reply_reviewers": "0;26;22;83", "wc_reply_authors": "426;287;334;171", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ],
"confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 91.5, 34.41293361513953 ], "wc_summary_review_avg": [ 80.75, 30.408674749156695 ], "wc_main_review_avg": [ 287.75, 121.52854602931774 ], "wc_review_avg": [ 460.0, 144.60809105994034 ], "wc_reply_reviewers_avg": [ 32.75, 30.65432269680738 ], "wc_reply_authors_avg": [ 304.5, 91.87083323884681 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3698228710800290266&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Bn09TnDngN", "email": "pku.edu.cn;sony.com;antgroup.com;lehigh.edu;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Peking University;Sony Corporation;Ant Group;Lehigh University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.sony.com;https://www.antgroup.com;https://www.lehigh.edu", "aff_unique_abbr": "Peking U;Sony;Ant Group;Lehigh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0", "aff_country_unique": "China;Japan;United States" }, { "title": "Probabilistic Implicit Scene Completion", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5910", "id": "BnQhMqDfcKG", "poster": "", "openreview": "https://openreview.net/forum?id=BnQhMqDfcKG", "slides": "https://iclr.cc/virtual/2022/poster/5910", "video": "https://iclr.cc/virtual/2022/poster/5910", "author_site": "Dongsu Zhang, Changwoon Choi, Inbum Park, Young Min Kim", "tldr": "", "abstract": "We propose a probabilistic shape completion method extended to the continuous geometry of large-scale 3D scenes. Real-world scans of 3D scenes suffer from a considerable amount of missing data cluttered with unsegmented objects. The problem of shape completion is inherently ill-posed, and high-quality result requires scalable solutions that consider multiple possible outcomes. We employ the Generative Cellular Automata that learns the multi-modal distribution and transform the formulation to process large-scale continuous geometry. The local continuous shape is incrementally generated as a sparse voxel embedding, which contains the latent code for each occupied cell. We formally derive that our training objective for the sparse voxel embedding maximizes the variational lower bound of the complete shape distribution and therefore our progressive generation constitutes a valid generative model. Experiments show that our model successfully generates diverse plausible scenes faithful to the input, especially when the input suffers from a significant amount of missing data. 
We also demonstrate that our approach outperforms deterministic models even in less ambiguous cases with a small amount of missing data, which suggests that a probabilistic formulation is crucial for high-quality geometry completion on input scans exhibiting any level of completeness.", "keywords": "3D shape completion;3D generative model", "primary_area": "", "supplementary_material": "", "author": "Dongsu Zhang;Changwoon Choi;Inbum Park;Young Min Kim", "authorids": "~Dongsu_Zhang1;~Changwoon_Choi1;~Inbum_Park1;~Young_Min_Kim1", "gender": "M;M;M;", "homepage": "https://dszhang.me/about;http://changwoon.info;https://inbumpark.github.io/;https://3d.snu.ac.kr", "dblp": "256/0992;287/1062;272/0987;61/9605-1", "google_scholar": "ydEYx7QAAAAJ;DmPZo4QAAAAJ;Xa-ETLoAAAAJ;TjYQs-AAAAAJ", "orcid": ";0000-0001-5748-6003;;", "linkedin": ";changwoon-choi-0bbb311a1/;;", "or_profile": "~Dongsu_Zhang1;~Changwoon_Choi1;~Inbum_Park1;~Young_Min_Kim1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "MS student;PhD student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2022probabilistic,\ntitle={Probabilistic Implicit Scene Completion},\nauthor={Dongsu Zhang and Changwoon Choi and Inbum Park and Young Min Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BnQhMqDfcKG}\n}", "github": "", "project": "", "reviewers": "8mES;pTw1;iZhE;yCG6;mswD", "pdf_size": 0, "recommendation": "8;8;8;8;8", "confidence": "3;4;4;4;3", "correctness": "3;3;4;4;3", "technical_novelty": "3;3;3;3;3", "empirical_novelty": "2;3;3;3;0", "wc_summary_paper": "68;175;106;171;103", "wc_summary_review": "90;43;70;44;80", "wc_main_review": "562;260;303;384;189", "wc_review": "720;478;479;599;372", "wc_reply_reviewers": "249;24;28;0;26", "wc_reply_authors": "2335;476;1183;610;600", "reply_reviewers": "1;1;1;0;1", "reply_authors": "5;1;2;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.2, 1.16619037896906 ], "wc_summary_paper_avg": [ 124.6, 41.73535671346299 ], "wc_summary_review_avg": [ 65.4, 18.969449122207003 ], "wc_main_review_avg": [ 339.6, 127.89777167722666 ], "wc_review_avg": [ 529.6, 119.27212582996917 ], "wc_reply_reviewers_avg": [ 65.4, 92.35929839490987 ], "wc_reply_authors_avg": [ 1040.8, 691.9634094372332 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 2.0, 1.5491933384829668 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9910483105138162817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=BnQhMqDfcKG", "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "BpUXKoZM0J", "title": "Rethinking Rehearsal in Lifelong Learning: Does An Example Contribute the Plasticity or Stability?", "track": "main", "status":
"Withdraw", "tldr": "", "abstract": "Lifelong Learning (LL) is the sequential transformation of Multi-Task Learning, which learns new tasks in order like human-beings.\nTraditionally, the primary goal of LL is to achieve the trade-off between the Stability (remembering past tasks) and Plasticity (adapting to new tasks). Rehearsal, seeking to remind the model by storing examples from old tasks in LL, is one of the most effective ways to get such trade-off. However, the Stability and Plasticity (SP) are only evaluated when a model is trained well, and it is still unknown what leads to the final SP in rehearsal-based LL. In this paper, we study the cause of SP from the perspective of example difference. First, we theoretically analyze the example-level SP via the influence function and deduce the influence of each example on the final SP. Moreover, to avoid the calculation burden of Hessian for each example, we propose a simple yet effective MetaSP algorithm to simulate the acquisition of example-level SP. Last but not least, we find that by adjusting the weights of each training example, a solution on the SP Pareto front can be obtained, resulting in a better SP trade-off for LL. Empirical results show that our algorithm significantly outperforms state-of-the-art methods on benchmark LL datasets.", "keywords": "Lifelong Learning;Rehearsal", "primary_area": "", "supplementary_material": "", "author": "Qing Sun;Fan Lyu;Fanhua Shang;Wei Feng;Liang Wan", "authorids": "~Qing_Sun3;~Fan_Lyu1;~Fanhua_Shang2;~Wei_Feng1;~Liang_Wan1", "gender": "F;;M;M;F", "homepage": "https://github.com/SUN3015218123;;https://sites.google.com/site/fanhua217/home;;http://cic.tju.edu.cn/faculty/lwan/index.html", "dblp": ";;66/9057;17/1152-5;", "google_scholar": ";;rk_HZTkAAAAJ;https://scholar.google.co.jp/citations?user=7ory1i8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-1040-352X;;", "linkedin": ";;;;", "or_profile": "~Qing_Sun3;~Fan_Lyu1;~Fanhua_Shang2;~Wei_Feng1;~Liang_Wan1", "aff": "Tianjin University;;The Chinese University of Hong Kong;Tianjin University;Tianjin University", "aff_domain": "tju.edu.cn;;cuhk.edu.hk;tju.edu.cn;tju.edu.cn", "position": "MS student;;Associate Professor;Full Professor;Full Professor", "bibtex": "@misc{\nsun2022rethinking,\ntitle={Rethinking Rehearsal in Lifelong Learning: Does An Example Contribute the Plasticity or Stability?},\nauthor={Qing Sun and Fan Lyu and Fanhua Shang and Wei Feng and Liang Wan},\nyear={2022},\nurl={https://openreview.net/forum?id=BpUXKoZM0J}\n}", "github": "", "project": "", "reviewers": "G5Ae;mD7C;wonC", "site": "https://openreview.net/forum?id=BpUXKoZM0J", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;3", "correctness": "3;2;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "116;109;47", "wc_summary_review": "62;108;107", "wc_main_review": "438;1225;500", "wc_review": "616;1442;654", "wc_reply_reviewers": "0;530;265", "wc_reply_authors": "768;1229;1648", "reply_reviewers": "0;1;1", "reply_authors": "2;2;3", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 90.66666666666667, 31.008959278820623 ], "wc_summary_review_avg": [ 92.33333333333333, 21.452790546272116 ], "wc_main_review_avg": [ 721.0, 357.2795357513031 ], "wc_review_avg": [ 904.0, 
380.7396310691424 ], "wc_reply_reviewers_avg": [ 265.0, 216.3715939458474 ], "wc_reply_authors_avg": [ 1215.0, 359.3948617699851 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yLhjoe2sRbMJ:scholar.google.com/&scioq=Rethinking+Rehearsal+in+Lifelong+Learning:+Does+An+Example+Contribute+the+Plasticity+or+Stability%3F&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Tianjin University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "http://www.tju.edu.cn;https://www.cuhk.edu.hk", "aff_unique_abbr": "TJU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Fair Normalizing Flows", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7045", "id": "BrFIKuxrZE", "poster": "", "openreview": "https://openreview.net/forum?id=BrFIKuxrZE", "slides": "https://iclr.cc/virtual/2022/poster/7045", "video": "https://iclr.cc/virtual/2022/poster/7045", "author_site": "Mislav Balunovic, Anian Ruoss, Martin Vechev", "tldr": "", "abstract": "Fair representation learning is an attractive approach that promises fairness of downstream predictors by encoding sensitive data. Unfortunately, recent work has shown that strong adversarial predictors can still exhibit unfairness by recovering sensitive attributes from these representations. In this work, we present Fair Normalizing Flows (FNF), a new approach offering more rigorous fairness guarantees for learned representations. Specifically, we consider a practical setting where we can estimate the probability density for sensitive groups. The key idea is to model the encoder as a normalizing flow trained to minimize the statistical distance between the latent representations of different groups. The main advantage of FNF is that its exact likelihood computation allows us to obtain guarantees on the maximum unfairness of any potentially adversarial downstream predictor. 
We experimentally demonstrate the effectiveness of FNF in enforcing various group fairness notions, as well as other attractive properties such as interpretability and transfer learning, on a variety of challenging real-world datasets.", "keywords": "fairness;fair representation learning;adversarial fairness;trustworthy machine learning", "primary_area": "", "supplementary_material": "/attachment/51959f7876af18e13ea9452aec7febd2d8e30434.zip", "author": "Mislav Balunovic;Anian Ruoss;Martin Vechev", "authorids": "~Mislav_Balunovic1;~Anian_Ruoss1;~Martin_Vechev1", "gender": "M;M;M", "homepage": "https://www.sri.inf.ethz.ch/people/mislav;;https://www.sri.inf.ethz.ch/people/martin", "dblp": "231/7686;259/2083;93/2189.html", "google_scholar": "fxkgmGwAAAAJ;gFkwD3kAAAAJ;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";;", "linkedin": ";anian-ruoss;", "or_profile": "~Mislav_Balunovic1;~Anian_Ruoss1;~Martin_Vechev1", "aff": "Swiss Federal Institute of Technology;Google DeepMind;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;deepmind.com;ethz.ch", "position": "PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nbalunovic2022fair,\ntitle={Fair Normalizing Flows},\nauthor={Mislav Balunovic and Anian Ruoss and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BrFIKuxrZE}\n}", "github": "", "project": "", "reviewers": "Vaqy;BBgE;xsbe;My3D;yCUe", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "4;4;3;2;4", "correctness": "3;4;3;3;3", "technical_novelty": "2;4;3;3;2", "empirical_novelty": "2;4;3;2;3", "wc_summary_paper": "142;133;56;48;74", "wc_summary_review": "108;37;35;8;311", "wc_main_review": "385;294;210;105;1207", "wc_review": "635;464;301;161;1592", "wc_reply_reviewers": "1217;0;10;0;2008", "wc_reply_authors": "2002;295;265;97;1965", "reply_reviewers": "4;0;1;0;7", "reply_authors": "4;1;1;1;4", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 90.6, 39.31208465599351 ], "wc_summary_review_avg": [ 99.8, 110.67321265780622 ], "wc_main_review_avg": [ 440.2, 394.3994929002825 ], "wc_review_avg": [ 630.6, 506.20651912040796 ], "wc_reply_reviewers_avg": [ 647.0, 827.0680745863668 ], "wc_reply_authors_avg": [ 924.8, 867.134683887111 ], "reply_reviewers_avg": [ 2.4, 2.727636339397171 ], "reply_authors_avg": [ 2.2, 1.469693845669907 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.15309310892394865, "corr_recommendation_correctness": -0.10206207261596578, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12495034483324120127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=BrFIKuxrZE", "email": "ethz.ch;deepmind.com;ethz.ch", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Swiss Federal Institute of Technology;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.ethz.ch;https://deepmind.com", "aff_unique_abbr": "ETH Zurich;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;United Kingdom" }, { "title": "DemoDICE: Offline Imitation Learning with Supplementary Imperfect Demonstrations", "status": "Poster", 
"track": "main", "site": "https://iclr.cc/virtual/2022/poster/6025", "id": "BrPdX1bDZkQ", "poster": "", "openreview": "https://openreview.net/forum?id=BrPdX1bDZkQ", "slides": "https://iclr.cc/virtual/2022/poster/6025", "video": "https://iclr.cc/virtual/2022/poster/6025", "author_site": "Geon-Hyeong Kim, Seokin Seo, Jongmin Lee, Wonseok Jeon, HyeongJoo Hwang, Hongseok Yang, Kee-Eung Kim", "tldr": "", "abstract": "We consider offline imitation learning (IL), which aims to mimic the expert's behavior from its demonstration without further interaction with the environment. One of the main challenges in offline IL is to deal with the narrow support of the data distribution exhibited by the expert demonstrations that cover only a small fraction of the state and the action spaces. As a result, offline IL algorithms that rely only on expert demonstrations are very unstable since the situation easily deviates from those in the expert demonstrations. In this paper, we assume additional demonstration data of unknown degrees of optimality, which we call imperfect demonstrations. Under this setting, we propose DemoDICE, which effectively utilizes imperfect demonstrations by matching the stationary distribution of a policy with experts' distribution while penalizing its deviation from the overall demonstrations. Compared with the recent IL algorithms that adopt adversarial minimax training objectives, we substantially stabilize overall learning process by reducing minimax optimization to a direct convex optimization in a principled manner. Using extensive tasks, we show that DemoDICE achieves promising results in the offline IL from expert and imperfect demonstrations.", "keywords": "imitation learning;offline imitation learning;imperfect demonstration;non-expert demonstration", "primary_area": "", "supplementary_material": "", "author": "Geon-Hyeong Kim;Seokin Seo;Jongmin Lee;Wonseok Jeon;HyeongJoo Hwang;Hongseok Yang;Kee-Eung Kim", "authorids": "~Geon-Hyeong_Kim2;~Seokin_Seo1;~Jongmin_Lee1;~Wonseok_Jeon1;~HyeongJoo_Hwang1;~Hongseok_Yang2;~Kee-Eung_Kim2", "gender": "M;;M;M;M;M;M", "homepage": "https://sites.google.com/view/ghkim;https://sites.google.com/view/siseo0;https://www.jmlee.kr;;https://github.com/gr8joo;http://ailab.kaist.ac.kr;https://sites.google.com/view/hongseokyang/home", "dblp": "231/7707;231/7699;68/222-4.html;;;35/6703;82/5808", "google_scholar": "https://scholar.google.co.kr/citations?user=IJL0uXoAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.kr/citations?user=rFcK8EEAAAAJ;https://scholar.google.com/citations?hl=en;IK5bNo0AAAAJ;https://scholar.google.com/citations?hl=ko;cLuwH14AAAAJ", "orcid": ";;;;;;", "linkedin": ";seokin-seo-026ab4150/;jmlee123/;;;;", "or_profile": "~Geon-Hyeong_Kim2;~Seokin_Seo1;~Jongmin_Lee1;~Wonseok_Jeon1;~HyeongJoo_Hwang1;~Kee-Eung_Kim2;~Hongseok_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Qualcomm AI Research;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Institute for Basic Science", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;qualcomm.com;kaist.ac.kr;kaist.ac.kr;ibs.re.kr", "position": "PhD student;PhD student;PhD student;Staff Machine Learning Engineer;PhD student;Full Professor;Visiting Research Fellow", "bibtex": "@inproceedings{\nkim2022demodice,\ntitle={Demo{DICE}: Offline Imitation Learning with Supplementary Imperfect 
Demonstrations},\nauthor={Geon-Hyeong Kim and Seokin Seo and Jongmin Lee and Wonseok Jeon and HyeongJoo Hwang and Hongseok Yang and Kee-Eung Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BrPdX1bDZkQ}\n}", "github": "", "project": "", "reviewers": "SEq7;ERsJ;5YKV", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;3;3", "correctness": "4;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "162;142;123", "wc_summary_review": "25;72;75", "wc_main_review": "235;491;373", "wc_review": "422;705;571", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "447;536;688", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 142.33333333333334, 15.92342788332825 ], "wc_summary_review_avg": [ 57.333333333333336, 22.89589968143253 ], "wc_main_review_avg": [ 366.3333333333333, 104.61782299822956 ], "wc_review_avg": [ 566.0, 115.58835004734112 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 557.0, 99.50209378031532 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16971346574304729154&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=BrPdX1bDZkQ", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;qualcomm.com;kaist.ac.kr;kaist.ac.kr;ibs.re.kr", "author_num": 7, "aff_unique_index": "0;0;0;1;0;0;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Qualcomm;Institute for Basic Science", "aff_unique_dep": ";Qualcomm AI Research;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.qualcomm.com/research;https://www.ibs.re.kr", "aff_unique_abbr": "KAIST;QAI;IBS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "South Korea;United States" }, { "id": "BrfHcL-99sy", "title": "Defending Graph Neural Networks via Tensor-Based Robust Graph Aggregation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have achieved outstanding success in a wide variety of domains and applications. However, they are still vulnerable to unnoticeable perturbations of graphs specially designed by attackers, causing significant performance drops. Developing algorithms to defend GNNs with robust graphs that vaccinate them against adversarial attacks remains a challenging issue. Existing methods treat each edge individually and regularize it by specific robust properties, which ignores the structural relationships among edges and the correlations among different properties. In this paper, we propose a tensor-based framework for GNNs to learn robust graphs from adversarial graphs by aggregating predefined robust graphs to enhance the robustness of GNNs via tensor approximation. All the predefined robust graphs are linearly compressed into and recovered from a low-rank space, which aggregates the robust graphs and the structural information in a balanced manner.
Extensive experiments on real-world graph datasets show that the proposed framework effectively mitigates the adverse effects of adversarial attacks and outperforms state-of-the-art defense methods.", "keywords": "Graph Neural Networks;Adversarial Attacks;Robustness;Tensor Decomposition", "primary_area": "", "supplementary_material": "/attachment/ec364b5ae4a7e8e125e69843df2cb79353a331c7.zip", "author": "Jianfu Zhang;Yan Hong;Dawei Cheng;Liqing Zhang;Qibin Zhao", "authorids": "~Jianfu_Zhang2;~Yan_Hong1;~Dawei_Cheng2;~Liqing_Zhang2;~Qibin_Zhao1", "gender": "M;F;M;M;M", "homepage": "https://matt-sjtu.github.io/;https://github.com/hy-zpg;http://bcmi.sjtu.edu.cn/~zhangliqing/;https://qibinzhao.github.io;http://cs1.tongji.edu.cn/~dawei/", "dblp": "78/3993-3;68/974-2.html;20/4627-1.html;13/1193;135/6864", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=ztq5-xcAAAAJ;1smFmxAAAAAJ;https://scholar.google.co.jp/citations?hl=en;4UD20ukAAAAJ", "orcid": "0000-0002-2673-5860;0000-0001-6401-0812;;0000-0002-4442-3182;0000-0002-5877-7387", "linkedin": ";;;;", "or_profile": "~Jianfu_Zhang2;~Yan_Hong1;~Liqing_Zhang2;~Qibin_Zhao1;~Dawei_Cheng1", "aff": "RIKEN;Shanghai Jiaotong University;Shanghai Jiaotong University;RIKEN;Tongji University", "aff_domain": "riken.jp;sjtu.edu;sjtu.edu.cn;riken.jp;tongji.edu.cn", "position": "Postdoc;PhD student;Full Professor;Team Leader;Assistant Professor", "bibtex": "@misc{\nzhang2022defending,\ntitle={Defending Graph Neural Networks via Tensor-Based Robust Graph Aggregation},\nauthor={Jianfu Zhang and Yan Hong and Dawei Cheng and Liqing Zhang and Qibin Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=BrfHcL-99sy}\n}", "github": "", "project": "", "reviewers": "xxhm;WHUo;5TVQ;RnpN", "site": "https://openreview.net/forum?id=BrfHcL-99sy", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;5;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;4;2;3", "wc_summary_paper": "60;32;61;67", "wc_summary_review": "72;56;23;36", "wc_main_review": "282;167;217;271", "wc_review": "414;255;301;374", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 55.0, 13.546217184144066 ], "wc_summary_review_avg": [ 46.75, 18.726652130052504 ], "wc_main_review_avg": [ 234.25, 45.963980462966866 ], "wc_review_avg": [ 336.0, 61.87487373724491 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:89i21yr08UwJ:scholar.google.com/&scioq=Defending+Graph+Neural+Networks+via+Tensor-Based+Robust+Graph+Aggregation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;0;2", "aff_unique_norm": "RIKEN;Shanghai Jiao Tong University;Tongji University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.riken.jp;https://www.sjtu.edu.cn;https://www.tongji.edu.cn", "aff_unique_abbr": "RIKEN;SJTU;Tongji", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "Japan;China" }, { "id": "BsDYmsrCjr", "title": "Scalable Robust Federated Learning with Provable Security Guarantees", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated averaging, the most popular aggregation approach in federated learning, is known to be vulnerable to failures and adversarial updates from clients that wish to disrupt training. While median aggregation remains one of the most popular alternatives to improve training robustness, the naive combination of median and secure multi-party computation (MPC) is unscalable. To this end, we propose an efficient approximate median aggregation with MPC privacy guarantees on the multi-silo setting, e.g., across hospitals, with two semi-honest non-colluding servers. The proposed method protects the confidentiality of client gradient updates against both semi-honest clients and servers. Asymptotically, the cost of our approach scales only linearly with the number of clients, whereas the naive MPC median scales quadratically. Moreover, we prove that the convergence of the proposed federated learning method is robust to a wide range of failures and attacks. Empirically, we show that our method inherits the robustness properties of the median while converging faster than the naive MPC median for even a small number of clients.", "keywords": "Federated Learning;Robustness;MPC;Privacy Preserving ML", "primary_area": "", "supplementary_material": "", "author": "Andrew Liu;Jacky Y. Zhang;Nishant Kumar;Dakshita Khurana;Oluwasanmi O Koyejo", "authorids": "~Andrew_Liu4;~Jacky_Y._Zhang1;nkumar13@illinois.edu;dakshita@illinois.edu;~Oluwasanmi_O_Koyejo1", "gender": "M;;;;M", "homepage": "https://liu-xin.github.io;;;;https://cs.stanford.edu/~sanmi/", "dblp": ";;;;14/8885", "google_scholar": ";;;;EaaOeJwAAAAJ", "orcid": ";;;;0000-0002-4023-419X", "linkedin": ";;;;sanmi-koyejo-984754/", "or_profile": "~Andrew_Liu4;~Jacky_Y._Zhang1;nkumar13@illinois.edu;dakshita@illinois.edu;~Oluwasanmi_O_Koyejo1", "aff": "University of Illinois, Urbana Champaign;;;;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;;;;illinois.edu", "position": "MS student;;;;Associate Professor", "bibtex": "@misc{\nliu2022scalable,\ntitle={Scalable Robust Federated Learning with Provable Security Guarantees},\nauthor={Andrew Liu and Jacky Y. 
Zhang and Nishant Kumar and Dakshita Khurana and Oluwasanmi O Koyejo},\nyear={2022},\nurl={https://openreview.net/forum?id=BsDYmsrCjr}\n}", "github": "", "project": "", "reviewers": "6dep;SNQM;moZd;mthn", "site": "https://openreview.net/forum?id=BsDYmsrCjr", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;3;3", "correctness": "2;3;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;3;1", "wc_summary_paper": "36;65;77;70", "wc_summary_review": "42;20;102;33", "wc_main_review": "347;273;377;750", "wc_review": "425;358;556;853", "wc_reply_reviewers": "0;0;0;142", "wc_reply_authors": "220;689;724;1128", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 62.0, 15.604486534327235 ], "wc_summary_review_avg": [ 49.25, 31.443401533549135 ], "wc_main_review_avg": [ 436.75, 184.77334088011722 ], "wc_review_avg": [ 548.0, 189.94604497067056 ], "wc_reply_reviewers_avg": [ 35.5, 61.48780366869514 ], "wc_reply_authors_avg": [ 690.25, 321.6755935721577 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1bSUXZauFYMJ:scholar.google.com/&scioq=Scalable+Robust+Federated+Learning+with+Provable+Security+Guarantees&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Bvk8Dn6Cfy", "title": "DaSeGAN: Domain Adaptation for Segmentation Tasks via Generative Adversarial Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A weakness of deep learning methods is that they can fail when there is a mismatch between source and target data domains. In medical image applications, this is a common situation when data from new vendor devices or different hospitals is available. Domain adaptation techniques aim to fill this gap by generating mappings between image domains when unlabeled data from the new target domain is available. In other cases, no target domain data (labeled or unlabeled) is available during training. In this latter case, domain generalization methods focus on learning domain-invariant representations which are more robust to new domains. In this paper, a combination of domain adaptation and generalization techniques is proposed by leveraging domain-invariant image translations for image segmentation problems. This is achieved by adversarially training a generator that transforms source images to a universal domain. To preserve the semantic consistency between the source and universal domains a segmentation consistency loss between the source and universal predictions is used. Our method was validated on the M&Ms dataset, a multi-source unsupervised domain adaptation, and generalization problem, outperforming previous methods. 
Particularly, our method significantly boosts the test results over the unlabeled and unseen domains without hurting source-labeled domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mario Parre\u00f1o Lara;Roberto Paredes;Alberto Albiol", "authorids": "~Mario_Parre\u00f1o_Lara1;~Roberto_Paredes1;~Alberto_Albiol1", "gender": "M;;M", "homepage": "https://maparla.es/;;http://personales.upv.es/alalbiol/index-english.html", "dblp": ";http://dblp.uni-trier.de/pers/hd/p/Paredes:Roberto;", "google_scholar": "https://scholar.google.es/citations?user=HxF4CrwAAAAJ;https://scholar.google.es/citations?user=I815O2UAAAAJ;goKzK8AAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mario_Parre\u00f1o_Lara1;~Roberto_Paredes1;~Alberto_Albiol1", "aff": "Universidad Polit\u00e9cnica de Valencia (UPV);UPV;Universitat Politecnica Valencia", "aff_domain": "upv.es;dsic.upv.es;upv.es", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nlara2022dasegan,\ntitle={DaSe{GAN}: Domain Adaptation for Segmentation Tasks via Generative Adversarial Networks},\nauthor={Mario Parre{\\~n}o Lara and Roberto Paredes and Alberto Albiol},\nyear={2022},\nurl={https://openreview.net/forum?id=Bvk8Dn6Cfy}\n}", "github": "", "project": "", "reviewers": "zDyp;THEM;2fkp;o2f4", "site": "https://openreview.net/forum?id=Bvk8Dn6Cfy", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;5;3;3", "correctness": "2;3;4;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "58;59;63;130", "wc_summary_review": "15;83;33;43", "wc_main_review": "247;517;146;205", "wc_review": "320;659;242;378", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.5, 30.36856927812043 ], "wc_summary_review_avg": [ 43.5, 24.914855006601986 ], "wc_main_review_avg": [ 278.75, 142.15550464192373 ], "wc_review_avg": [ 399.75, 157.2647051947766 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865476, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gkoVwXzsHmUJ:scholar.google.com/&scioq=DaSeGAN:+Domain+Adaptation+for+Segmentation+Tasks+via+Generative+Adversarial+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Universidad Polit\u00e9cnica de Valencia;Universitat Polit\u00e8cnica de Val\u00e8ncia", "aff_unique_dep": ";", "aff_unique_url": "https://www.upv.es;https://www.upv.es", "aff_unique_abbr": "UPV;UPV", "aff_campus_unique_index": "1", "aff_campus_unique": ";Valencia", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Spain" }, { "id": "BvowzJp_Yl6", "title": "Homogeneous Learning: Self-Attention Decentralized Deep Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated learning (FL) has been facilitating privacy-preserving deep learning in many walks of life such as medical image classification, network intrusion detection, and so forth. 
However, it necessitates a central parameter server for model aggregation, which brings about delayed model communication and vulnerability to adversarial attacks. A fully decentralized architecture like Swarm Learning allows peer-to-peer communication among distributed nodes, without the central server. One of the most challenging issues in decentralized deep learning is that data owned by each node are usually non-independent and identically distributed (non-IID), causing time-consuming convergence of model training. To this end, we propose a decentralized learning model called Homogeneous Learning (HL) for tackling non-IID data with a self-attention mechanism. In HL, training is performed on the node selected in each round, and the trained model of a node is sent to the next selected node at the end of each round. Notably, for the selection, the self-attention mechanism leverages reinforcement learning to observe a node\u2019s inner state and its surrounding environment\u2019s state, and to determine which node should be selected to optimize the training. We evaluate our method with various scenarios for an image classification task. The results suggest that HL can produce better performance than standalone learning and, compared with random policy-based decentralized learning for training on non-IID data, greatly reduce the total training rounds by 50.8% and the communication cost by 74.6%.", "keywords": "supervised representation learning;reinforcement learning;privacy;decentralized learning;data heterogeneity;communication efficiency", "primary_area": "", "supplementary_material": "/attachment/9bd5b5d09de748b26dee8241345a36bc3974b098.zip", "author": "Yuwei Sun;Hideya Ochiai", "authorids": "~Yuwei_Sun1;ochiai@elab.ic.i.u-tokyo.ac.jp", "gender": "M;", "homepage": "https://yuweisunn.github.io/;", "dblp": "69/6154;", "google_scholar": "CG_AluYAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Yuwei_Sun1;ochiai@elab.ic.i.u-tokyo.ac.jp", "aff": "RIKEN AIP;", "aff_domain": "riken.jp;", "position": "Researcher;", "bibtex": "@misc{\nsun2022homogeneous,\ntitle={Homogeneous Learning: Self-Attention Decentralized Deep Learning},\nauthor={Yuwei Sun and Hideya Ochiai},\nyear={2022},\nurl={https://openreview.net/forum?id=BvowzJp_Yl6}\n}", "github": "", "project": "", "reviewers": "Pyuc;mrxm;k33V", "site": "https://openreview.net/forum?id=BvowzJp_Yl6", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;3", "correctness": "2;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "115;82;72", "wc_summary_review": "79;41;73", "wc_main_review": "281;119;549", "wc_review": "475;242;694", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "502;380;1230", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 89.66666666666667, 18.372685039360892 ], "wc_summary_review_avg": [ 64.33333333333333, 16.679994670929073 ], "wc_main_review_avg": [ 316.3333333333333, 177.31578860577784 ], "wc_review_avg": [ 470.3333333333333, 184.55772960121598 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 704.0, 375.25813337843414 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg":
[ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=296693499620563319&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff_unique_index": "0", "aff_unique_norm": "RIKEN", "aff_unique_dep": "Advanced Institute for Computational Science", "aff_unique_url": "https://www.aip.riken.jp", "aff_unique_abbr": "RIKEN AIP", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "Provable Learning-based Algorithm For Sparse Recovery", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6689", "id": "BwPaPxwgyQb", "poster": "", "openreview": "https://openreview.net/forum?id=BwPaPxwgyQb", "slides": "https://iclr.cc/virtual/2022/poster/6689", "video": "https://iclr.cc/virtual/2022/poster/6689", "author_site": "Xinshi Chen, Haoran Sun, Le Song", "tldr": "", "abstract": "Recovering sparse parameters from observational data is a fundamental problem in machine learning with wide applications. Many classic algorithms can solve this problem with theoretical guarantees, but their performances rely on choosing the correct hyperparameters. Besides, hand-designed algorithms do not fully exploit the particular problem distribution of interest. In this work, we propose a deep learning method for algorithm learning called PLISA (Provable Learning-based Iterative Sparse recovery Algorithm). PLISA is designed by unrolling a classic path-following algorithm for sparse recovery, with some components being more flexible and learnable. We theoretically show the improved recovery accuracy achievable by PLISA. Furthermore, we analyze the empirical Rademacher complexity of PLISA to characterize its generalization ability to solve new problems outside the training set. This paper contains novel theoretical contributions to the area of learning-based algorithms in the sense that (i) PLISA is generically applicable to a broad class of sparse estimation problems, (ii) generalization analysis has received less attention so far, and (iii) our analysis makes novel connections between the generalization ability and algorithmic properties such as stability and convergence of the unrolled algorithm, which leads to a tighter bound that can explain the empirical observations. 
The techniques could potentially be applied to analyze other learning-based algorithms in the literature.", "keywords": "learning to learn;sparse parameter estimation;learning to optimize;algorithm unrolling;generalization bound", "primary_area": "", "supplementary_material": "", "author": "Xinshi Chen;Haoran Sun;Le Song", "authorids": "~Xinshi_Chen1;~Haoran_Sun2;~Le_Song1", "gender": "F;M;M", "homepage": "https://xinshi-chen.github.io/;;http://www.cc.gatech.edu/~lsong", "dblp": "232/3197;;94/3481", "google_scholar": ";p7of_yoAAAAJ;Xl4E0CsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xinshi_Chen1;~Haoran_Sun2;~Le_Song1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;College of Computing, Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;cc.gatech.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nchen2022provable,\ntitle={Provable Learning-based Algorithm For Sparse Recovery},\nauthor={Xinshi Chen and Haoran Sun and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=BwPaPxwgyQb}\n}", "github": "", "project": "", "reviewers": "ZD71;AZru;uCY5;7xPS", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;4;2;3", "wc_summary_paper": "49;54;66;98", "wc_summary_review": "54;27;196;20", "wc_main_review": "423;363;324;387", "wc_review": "526;444;586;505", "wc_reply_reviewers": "516;0;265;0", "wc_reply_authors": "3311;1493;2079;511", "reply_reviewers": "4;0;2;0", "reply_authors": "7;3;4;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 66.75, 19.070592544543548 ], "wc_summary_review_avg": [ 74.25, 71.4295982069058 ], "wc_main_review_avg": [ 374.25, 36.02342987556848 ], "wc_review_avg": [ 515.25, 50.751231512151506 ], "wc_reply_reviewers_avg": [ 195.25, 214.47071478409353 ], "wc_reply_authors_avg": [ 1848.5, 1013.3265761836112 ], "reply_reviewers_avg": [ 1.5, 1.6583123951777 ], "reply_authors_avg": [ 4.0, 1.8708286933869707 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14019998184383758148&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=BwPaPxwgyQb", "email": "gatech.edu;gatech.edu;cc.gatech.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Atlanta", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "By5Uwd_xzNF", "title": "Neural Structure Mapping For Learning Abstract Visual Analogies", "track": "main", "status": "Reject", "tldr": "", "abstract": "Building conceptual abstractions from sensory information and then reasoning about them is central to human intelligence. Abstract reasoning both relies on, and is facilitated by, our ability to make analogies about concepts from known domains to novel domains. 
Structure Mapping Theory of human analogical reasoning posits that analogical mappings rely on (higher-order) relations and not on the sensory content of the domain. This enables humans to reason systematically about novel domains, a problem with which machine learning (ML) models tend to struggle. We introduce a two-stage neural framework, which we label Neural Structure Mapping (NSM), to learn visual analogies from Raven's Progressive Matrices, an abstract visual reasoning test of fluid intelligence. Our framework uses (1) a multi-task visual relationship encoder to extract constituent concepts from raw visual input in the source domain, and (2) a neural module net analogy inference engine to reason compositionally about the inferred relation in the target domain. Our NSM approach (a) isolates the relational structure from the source domain with high accuracy, and (b) successfully utilizes this structure for analogical reasoning in the target domain.", "keywords": "cognitive science;analogy;psychology;cognitive theory;cognition;abstraction;abstract reasoning;generalization;systematic generalization", "primary_area": "", "supplementary_material": "", "author": "Shashank Shekhar;Graham W. Taylor", "authorids": "~Shashank_Shekhar2;~Graham_W._Taylor1", "gender": "M;", "homepage": "http://shashankshekhar.com;https://www.gwtaylor.ca", "dblp": "18/6368-6;17/1633", "google_scholar": "https://scholar.google.fr/citations?hl=en;https://scholar.google.ca/citations?user=PUeKU8kAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Shashank_Shekhar2;~Graham_W_Taylor1", "aff": "Meta Facebook;University of Guelph", "aff_domain": "fb.com;uoguelph.ca", "position": "AI Resident;Full Professor", "bibtex": "@misc{\nshekhar2022neural,\ntitle={Neural Structure Mapping For Learning Abstract Visual Analogies},\nauthor={Shashank Shekhar and Graham W. 
Taylor},\nyear={2022},\nurl={https://openreview.net/forum?id=By5Uwd_xzNF}\n}", "github": "", "project": "", "reviewers": "opcn;z4sn;QLDy;zpNi;Kq5G", "site": "https://openreview.net/forum?id=By5Uwd_xzNF", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "5;5;4;4;4", "correctness": "2;2;3;3;2", "technical_novelty": "1;2;2;3;3", "empirical_novelty": "1;2;2;2;2", "wc_summary_paper": "34;86;82;211;249", "wc_summary_review": "18;89;98;48;92", "wc_main_review": "170;397;523;892;1024", "wc_review": "222;572;703;1151;1365", "wc_reply_reviewers": "0;0;67;0;382", "wc_reply_authors": "514;1066;1176;799;1267", "reply_reviewers": "0;0;1;0;1", "reply_authors": "1;2;3;1;2", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 1.8, 0.4000000000000001 ], "wc_summary_paper_avg": [ 132.4, 82.64284603037338 ], "wc_summary_review_avg": [ 69.0, 31.02257242718598 ], "wc_main_review_avg": [ 601.2, 315.3001110053722 ], "wc_review_avg": [ 802.6, 409.3578385715852 ], "wc_reply_reviewers_avg": [ 89.8, 148.38652229902823 ], "wc_reply_authors_avg": [ 964.4, 274.54733653779994 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6123724356957946, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4349562106018137597&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Meta;University of Guelph", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.uoguelph.ca", "aff_unique_abbr": "Meta;U of G", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Canada" }, { "title": "An Autoregressive Flow Model for 3D Molecular Geometry Generation from Scratch", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7066", "id": "C03Ajc-NS5W", "poster": "", "openreview": "https://openreview.net/forum?id=C03Ajc-NS5W", "slides": "https://iclr.cc/virtual/2022/poster/7066", "video": "https://iclr.cc/virtual/2022/poster/7066", "author_site": "Youzhi Luo, Shuiwang Ji", "tldr": "", "abstract": "We consider the problem of generating 3D molecular geometries from scratch. While multiple methods have been developed for generating molecular graphs, generating 3D molecular geometries from scratch is largely under-explored. In this work, we propose G-SphereNet, a novel autoregressive flow model for generating 3D molecular geometries. G-SphereNet employs a flexible sequential generation scheme by placing atoms in 3D space step-by-step. Instead of generating 3D coordinates directly, we propose to determine 3D positions of atoms by generating distances, angles and torsion angles, thereby ensuring both invariance and equivariance properties. In addition, we propose to use spherical message passing and attention mechanism for conditional information extraction. Experimental results show that G-SphereNet outperforms previous methods on random molecular geometry generation and targeted molecule discovery tasks. 
Our code is publicly available as part of the DIG package (https://github.com/divelab/DIG).", "keywords": "3D molecular geometry generation;flow models;SphereNet", "primary_area": "", "supplementary_material": "/attachment/bf572e50a365a0ae0a5fbe28ee06f1cd4cbe6ca7.zip", "author": "Youzhi Luo;Shuiwang Ji", "authorids": "~Youzhi_Luo1;~Shuiwang_Ji1", "gender": "M;M", "homepage": "https://lyzustc.github.io/;http://people.tamu.edu/~sji", "dblp": "280/0590;84/6405", "google_scholar": "3lqQFIoAAAAJ;BZGj6sAAAAAJ", "orcid": "0000-0002-3763-0239;0000-0002-4205-4563", "linkedin": "youzhi-luo-139981172/;shuiwang-ji-9a040715/", "or_profile": "~Youzhi_Luo1;~Shuiwang_Ji1", "aff": "Texas A&M University;Texas A&M University", "aff_domain": "tamu.edu;tamu.edu", "position": "PhD student;Professor", "bibtex": "@inproceedings{\nluo2022an,\ntitle={An Autoregressive Flow Model for 3D Molecular Geometry Generation from Scratch},\nauthor={Youzhi Luo and Shuiwang Ji},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=C03Ajc-NS5W}\n}", "github": "", "project": "", "reviewers": "UA8J;euwK;1x5L;JRzq", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "50;67;83;106", "wc_summary_review": "30;58;22;78", "wc_main_review": "370;253;153;344", "wc_review": "450;378;258;528", "wc_reply_reviewers": "0;39;0;0", "wc_reply_authors": "1719;716;416;469", "reply_reviewers": "0;1;0;0", "reply_authors": "5;2;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 76.5, 20.645822822062577 ], "wc_summary_review_avg": [ 47.0, 22.338307903688676 ], "wc_main_review_avg": [ 280.0, 85.22616968983178 ], "wc_review_avg": [ 403.5, 99.35164819971534 ], "wc_reply_reviewers_avg": [ 9.75, 16.887495373796554 ], "wc_reply_authors_avg": [ 830.0, 525.602987053917 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1292131869179489903&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=C03Ajc-NS5W", "email": "tamu.edu;tamu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Synthetic Environments and Reward Networks for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6495", "id": "C1_esHN6AVn", "poster": "", "openreview": "https://openreview.net/forum?id=C1_esHN6AVn", "slides": "https://iclr.cc/virtual/2022/poster/6495", "video": "https://iclr.cc/virtual/2022/poster/6495", "author_site": "Fabio Ferreira, Thomas Nierhoff, Andreas S\u00e4linger, Frank Hutter", "tldr": "", "abstract": "We introduce Synthetic Environments (SEs) and Reward Networks (RNs), represented by neural networks, as proxy environment models for training 
Reinforcement Learning (RL) agents. We show that an agent, after being trained exclusively on the SE, is able to solve the corresponding real environment. While an SE acts as a full proxy to a real environment by learning about its state dynamics and rewards, an RN is a partial proxy that learns to augment or replace rewards. We use bi-level optimization to evolve SEs and RNs: the inner loop trains the RL agent, and the outer loop trains the parameters of the SE / RN via an evolution strategy. We evaluate our proposed new concept on a broad range of RL algorithms and classic control environments. In a one-to-one comparison, learning an SE proxy requires more interactions with the real environment than training agents only on the real environment. However, once such an SE has been learned, we do not need any interactions with the real environment to train new agents. Moreover, the learned SE proxies allow us to train agents with fewer interactions while maintaining the original task performance. Our empirical results suggest that SEs achieve this result by learning informed representations that bias the agents towards relevant states. Moreover, we find that these proxies are robust against hyperparameter variation and can also transfer to unseen agents.", "keywords": "Synthetic Environments;Synthetic Data;Meta-Learning;Reinforcement Learning;Evolution Strategies;Reward Shaping", "primary_area": "", "supplementary_material": "", "author": "Fabio Ferreira;Thomas Nierhoff;Andreas S\u00e4linger;Frank Hutter", "authorids": "~Fabio_Ferreira1;~Thomas_Nierhoff1;~Andreas_S\u00e4linger1;~Frank_Hutter1", "gender": "M;;M;M", "homepage": "http://ferreirafabio.github.io;http://www.google.de;https://ml.informatik.uni-freiburg.de/index.html;http://ml.informatik.uni-freiburg.de/~hutter/", "dblp": "128/6466;;;89/5383", "google_scholar": "https://scholar.google.de/citations?user=LFtEAeYAAAAJ;;;https://scholar.google.de/citations?user=YUrxwrkAAAAJ", "orcid": "0000-0002-0816-2042;;;0000-0002-2037-3694", "linkedin": ";;;frank-hutter-9190b24b/", "or_profile": "~Fabio_Ferreira1;~Thomas_Nierhoff1;~Andreas_S\u00e4linger1;~Frank_Hutter1", "aff": "Universit\u00e4t Freiburg;;University of Freiburg, Universit\u00e4t Freiburg;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_domain": "uni-freiburg.de;;cs.uni-freiburg.de;uni-freiburg.de", "position": "PhD student;;MS student;Full Professor", "bibtex": "@inproceedings{\nferreira2022learning,\ntitle={Learning Synthetic Environments and Reward Networks for Reinforcement Learning},\nauthor={Fabio Ferreira and Thomas Nierhoff and Andreas S{\\\"a}linger and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=C1_esHN6AVn}\n}", "github": "", "project": "", "reviewers": "Q4vZ;ktr8;1Zys;pG56", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "3;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "140;42;62;81", "wc_summary_review": "95;104;58;45", "wc_main_review": "759;268;819;577", "wc_review": "994;414;939;703", "wc_reply_reviewers": "101;0;485;0", "wc_reply_authors": "2916;574;2702;915", "reply_reviewers": "2;0;4;0", "reply_authors": "5;1;6;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 81.25, 
36.615399765672365 ], "wc_summary_review_avg": [ 75.5, 24.642443060703215 ], "wc_main_review_avg": [ 605.75, 214.39609954474452 ], "wc_review_avg": [ 762.5, 228.985261534449 ], "wc_reply_reviewers_avg": [ 146.5, 199.73545003328778 ], "wc_reply_authors_avg": [ 1776.75, 1042.0171243794414 ], "reply_reviewers_avg": [ 1.5, 1.6583123951777 ], "reply_authors_avg": [ 3.5, 2.0615528128088303 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.14002800840280097, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7208756410872374371&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=C1_esHN6AVn", "email": "uni-freiburg.de;;cs.uni-freiburg.de;uni-freiburg.de", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Freiburg;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-freiburg.de;https://www.uni-freiburg.de", "aff_unique_abbr": "Uni Freiburg;Albert-Ludwigs-Universit\u00e4t", "aff_campus_unique_index": "1", "aff_campus_unique": ";Freiburg", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "C1lXY_T1LTs", "title": "Shap-CAM: Visual Explanations for Convolutional Neural Networks based on Shapley Value", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Explaining deep convolutional neural networks has recently been drawing increasing attention since it helps to understand the networks' internal operations and why they make certain decisions. Saliency maps, which emphasize salient regions largely connected to the network's decision-making, are one of the most common ways for visualizing and analyzing deep networks in the computer vision community. However, saliency maps generated by existing methods cannot represent authentic information in images due to unproven proposals about the weights of activation maps, which lack a solid theoretical foundation and fail to consider the relations between pixels. In this paper, we develop a novel post-hoc visual explanation method called Shap-CAM based on class activation mapping. Unlike previous class activation mapping based approaches, Shap-CAM gets rid of the dependence on gradients by obtaining the importance of each pixel through its Shapley value. We demonstrate that Shap-CAM achieves better visual performance and fairness for interpreting the decision-making process.
Our approach outperforms previous methods on both recognition and localization tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Quan Zheng;Ziwei Wang;Jiwen Lu;Jie Zhou", "authorids": "~Quan_Zheng1;~Ziwei_Wang2;~Jiwen_Lu1;~Jie_Zhou3", "gender": "M;M;M;M", "homepage": "https://github.com/Howard-zq;https://ziweiwangthu.github.io/;http://ivg.au.tsinghua.edu.cn/Jiwen_Lu/;https://www.tsinghua.edu.cn/publish/auen/1713/2011/20110506105532098625469/20110506105532098625469_.html", "dblp": ";136/5574-1;http://dblp.uni-trier.de/pers/hd/l/Lu:Jiwen;00/5012-1", "google_scholar": ";cMTW09EAAAAJ;TN8uDQoAAAAJ;", "orcid": ";0000-0001-9225-8495;0000-0002-6121-5529;", "linkedin": ";;;", "or_profile": "~Quan_Zheng1;~Ziwei_Wang2;~Jiwen_Lu1;~Jie_Zhou3", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nzheng2022shapcam,\ntitle={Shap-{CAM}: Visual Explanations for Convolutional Neural Networks based on Shapley Value},\nauthor={Quan Zheng and Ziwei Wang and Jiwen Lu and Jie Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=C1lXY_T1LTs}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=C1lXY_T1LTs", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16454528957993282988&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "C4o-EEUx-6", "title": "Flashlight: Enabling Innovation in Tools for Machine Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "As the computational requirements for machine learning systems and the size and complexity of machine learning frameworks increase, essential framework innovation has become challenging. While computational needs have driven recent compiler, networking, and hardware advancements, utilization of those advancements by machine learning tools is occurring at a slower pace. This is in part due to the difficulties involved in prototyping new computational paradigms with existing frameworks.
Large frameworks prioritize machine learning researchers and practitioners as end users and pay comparatively little attention to systems researchers who can push frameworks forward --- we argue that both are equally-important stakeholders. We introduce Flashlight, an open source library built to spur innovation in machine learning tools and systems by prioritizing open, modular, customizable internals and state-of-the-art, research-ready models and training setups across a variety of domains. Flashlight enables systems researchers to rapidly prototype and experiment with novel ideas in machine learning computation and has low overhead, competing with and often outperforming other popular machine learning frameworks. We see Flashlight as a tool enabling research that can benefit widely-used libraries downstream and bring machine learning and systems researchers closer together.", "keywords": "machine learning;deep learning;systems;frameworks;autograd library;tensor library", "primary_area": "", "supplementary_material": "", "author": "Jacob Kahn;Vineel Pratap;Tatiana Likhomanenko;Qiantong Xu;Awni Hannun;Jeff Cai;Paden Tomasello;Ann Lee;Edouard Grave;Gilad Avidov;Benoit Steiner;Vitaliy Liptchinsky;Gabriel Synnaeve;Ronan Collobert", "authorids": "~Jacob_Kahn1;vineelkpratap@fb.com;~Tatiana_Likhomanenko1;~Qiantong_Xu1;~Awni_Hannun1;jcai@fb.com;padentomasello@fb.com;annl@fb.com;~Edouard_Grave1;avidov@fb.com;~Benoit_Steiner1;~Vitaliy_Liptchinsky3;~Gabriel_Synnaeve1;~Ronan_Collobert1", "gender": "M;;F;M;M;;;;;;;;M;M", "homepage": "https://jacobkahn.me/;;https://github.com/tlikhomanenko/tlikhomanenko;;https://www.awnihannun.com/;;;;;;https://bsteiner.info;;;http://ronan.collobert.com", "dblp": "232/2341;;202/2094;;https://dblp.uni-trier.de/pers/hd/h/Hannun:Awni;;;;50/10261;;177/9377;;http://dblp.uni-trier.de/pers/hd/s/Synnaeve:Gabriel;03/4032", "google_scholar": "_-pugt8AAAAJ;;https://scholar.google.ru/citations?user=x7Z3ysQAAAAJ;EONOwy4AAAAJ;3-mdTUAAAAAJ;;;;7UV4ET4AAAAJ;;rT11mdcAAAAJ;;wN9rBkcAAAAJ;32w7x1cAAAAJ", "orcid": "0000-0003-2911-2500;;0000-0003-0351-9839;;;;;;;;;;;", "linkedin": "jacobdavidkahn/;;;;;;;;edouard-grave-63099823/;;;;;", "or_profile": "~Jacob_Kahn1;vineelkpratap@fb.com;~Tatiana_Likhomanenko1;~Qiantong_Xu1;~Awni_Hannun1;jcai@fb.com;padentomasello@fb.com;annl@fb.com;~Edouard_Grave1;avidov@fb.com;~Benoit_Steiner1;~Vitaliy_Liptchinsky3;~Gabriel_Synnaeve1;~Ronan_Collobert1", "aff": "Meta AI;;Apple;Sambanova Systems;Zoom;;;;Meta Facebook;;Meta Facebook;;Meta Facebook;Meta Facebook", "aff_domain": "meta.com;;apple.com;sambanovasystems.ai;zoom.us;;;;fb.com;;fb.com;;fb.com;fb.com", "position": "Research Engineer;;Research Scientist;Engineer;Distinguished Scientist;;;;Research Scientist;;Researcher;;Research Scientist;Research Scientist", "bibtex": "@misc{\nkahn2022flashlight,\ntitle={Flashlight: Enabling Innovation in Tools for Machine Learning},\nauthor={Jacob Kahn and Vineel Pratap and Tatiana Likhomanenko and Qiantong Xu and Awni Hannun and Jeff Cai and Paden Tomasello and Ann Lee and Edouard Grave and Gilad Avidov and Benoit Steiner and Vitaliy Liptchinsky and Gabriel Synnaeve and Ronan Collobert},\nyear={2022},\nurl={https://openreview.net/forum?id=C4o-EEUx-6}\n}", "github": "", "project": "", "reviewers": "1j8s;DYUa;KzrA;gStW", "site": "https://openreview.net/forum?id=C4o-EEUx-6", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;2;3;4", "correctness": "4;3;2;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "63;96;56;26", 
"wc_summary_review": "25;53;57;93", "wc_main_review": "164;292;416;434", "wc_review": "252;441;529;553", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "120;366;670;728", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 60.25, 24.883478454589103 ], "wc_summary_review_avg": [ 57.0, 24.166091947189145 ], "wc_main_review_avg": [ 326.5, 108.58521998872591 ], "wc_review_avg": [ 443.75, 118.29914412200961 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 471.0, 244.88568761771276 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 14, 0 ], "corr_recommendation_confidence": -0.20751433915982243, "corr_recommendation_correctness": -0.6488856845230502, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13806487547053815832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;3;0;0;0;0", "aff_unique_norm": "Meta;Apple;SambaNova Systems;Zoom Video Communications Inc.", "aff_unique_dep": "Meta AI;Apple Inc.;;", "aff_unique_url": "https://meta.com;https://www.apple.com;https://www.sambanova.com;https://zoom.us", "aff_unique_abbr": "Meta;Apple;;Zoom", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "MonoDistill: Learning Spatial Features for Monocular 3D Object Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6677", "id": "C54V-xTWfi", "poster": "", "openreview": "https://openreview.net/forum?id=C54V-xTWfi", "slides": "https://iclr.cc/virtual/2022/poster/6677", "video": "https://iclr.cc/virtual/2022/poster/6677", "author_site": "Zhiyu Chong, Xinzhu Ma, Hong Zhang, Yuxin Yue, Haojie Li, Zhihui Wang, Wanli Ouyang", "tldr": "", "abstract": "3D object detection is a fundamental and challenging task for 3D scene understanding, and the monocular-based methods can serve as an economical alternative to the stereo-based or LiDAR-based methods. However, accurately locating objects in the 3D space from a single image is extremely difficult due to the lack of spatial cues. To mitigate this issue, we propose a simple and effective scheme to introduce the spatial information from LiDAR signals to the monocular 3D detectors, without introducing any extra cost in the inference phase. In particular, we first project the LiDAR signals into the image plane and align them with the RGB images. After that, we use the resulting data to train a 3D detector (LiDAR Net) using the same architecture as the baseline model. Finally, this LiDAR Net can serve as the teacher to transfer the learned knowledge to the baseline model. Experimental results show that the proposed method can significantly boost the performance of the baseline model and ranks the $1^{st}$ place among all monocular-based methods on the KITTI benchmark. 
Besides, extensive ablation studies are conducted, which further prove the effectiveness of each part of our designs and illustrate what the baseline model has learned from the LiDAR Net.", "keywords": "3D object detection;monocular images", "primary_area": "", "supplementary_material": "", "author": "Zhiyu Chong;Xinzhu Ma;Hong Zhang;Yuxin Yue;Haojie Li;Zhihui Wang;Wanli Ouyang", "authorids": "~Zhiyu_Chong1;~Xinzhu_Ma1;~Hong_Zhang8;~Yuxin_Yue2;~Haojie_Li2;~Zhihui_Wang4;~Wanli_Ouyang1", "gender": "M;M;F;F;M;F;", "homepage": ";https://github.com/xinzhuma;;;https://cise.sdust.edu.cn/home/Page/teacher_detail/catId/20/id/1913.html;http://dlutir.dlut.edu.cn/Scholar/Detail/6293;", "dblp": ";191/3902;;;;65/2749-1.html;", "google_scholar": ";8PuKa_8AAAAJ;;;pMnlgVMAAAAJ;;", "orcid": "0000-0002-3684-0517;;0000-0001-9790-0555;0000-0002-4238-786X;0000-0003-3882-2205;;", "linkedin": ";;;;;;", "or_profile": "~Zhiyu_Chong1;~Xinzhu_Ma1;~Hong_Zhang8;~Yuxin_Yue2;~Haojie_Li2;~Zhihui_Wang4;~Wanli_Ouyang1", "aff": "Dalian University of Technology;University of Sydney;;Dalian University of Technology;Dalian University of Technology;Dalian University of Technology;", "aff_domain": "edu.cn;sydney.edu.au;;dlut.edu.cn;dlut.edu.cn;dlut.edu.cn;", "position": "Undergrad student;PhD student;;Undergrad student;Full Professor;Full Professor;", "bibtex": "@inproceedings{\nchong2022monodistill,\ntitle={MonoDistill: Learning Spatial Features for Monocular 3D Object Detection},\nauthor={Zhiyu Chong and Xinzhu Ma and Hong Zhang and Yuxin Yue and Haojie Li and Zhihui Wang and Wanli Ouyang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=C54V-xTWfi}\n}", "github": "", "project": "", "reviewers": "J4VF;ZFtD;mGRP;nFhM;2Mbm", "pdf_size": 0, "recommendation": "5;6;8;8;8", "confidence": "5;5;4;4;3", "correctness": "3;4;3;3;3", "technical_novelty": "3;3;2;2;2", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "28;87;90;85;86", "wc_summary_review": "31;82;60;30;103", "wc_main_review": "274;322;439;270;765", "wc_review": "333;491;589;385;954", "wc_reply_reviewers": "0;264;76;0;45", "wc_reply_authors": "454;1444;771;83;760", "reply_reviewers": "0;2;1;0;1", "reply_authors": "1;3;1;1;1", "recommendation_avg": [ 7.0, 1.2649110640673518 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 75.2, 23.659247663440187 ], "wc_summary_review_avg": [ 61.2, 28.519467035693356 ], "wc_main_review_avg": [ 414.0, 185.80957994678315 ], "wc_review_avg": [ 550.4, 220.2413221900014 ], "wc_reply_reviewers_avg": [ 77.0, 97.8284212281891 ], "wc_reply_authors_avg": [ 702.4, 447.9627663098798 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 1.4, 0.8000000000000002 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.8451542547285165, "corr_recommendation_correctness": -0.39528470752104744, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14851758558366902605&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=C54V-xTWfi", "email": "edu.cn;sydney.edu.au;;dlut.edu.cn;dlut.edu.cn;dlut.edu.cn;", "author_num": 7, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Dalian University of Technology;University of Sydney", "aff_unique_dep": ";", "aff_unique_url": 
"http://www.dlut.edu.cn/;https://www.sydney.edu.au", "aff_unique_abbr": "DUT;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;Australia" }, { "id": "C5Q04gnc4f", "title": "An object-centric sensitivity analysis of deep learning based instance segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this study we establish a comprehensive baseline regarding the object-centric robustness of deep learning models for instance segmentation. Our approach is motivated by the work of Geirhos et al. (2019) on texture bias in CNNs. However, we do not compare against human performance but instead incorporate ideas from object-centric representation learning. In addition, we analyze and control the effect of strong stylization that can lead to disappearing objects. The result is a stylized and object-centric version of MS COCO on which we perform an extensive sensitivity analysis regarding visual feature corruptions. We evaluate a broad range of frameworks including Cascade and Mask R-CNN, Swin Transformer, YOLACT(++), DETR, SOTR and SOLOv2. We find that framework choice, data augmentation and dynamic architectures improve robustness whereas supervised and self supervised pre-training does surprisingly not. In summary we evaluate 63 models on 61 versions of COCO for a total of 3843 evaluations.", "keywords": "robust vision;instance segmentation;deep learning;object-centric;robustness;sensitivity analysis", "primary_area": "", "supplementary_material": "", "author": "Johannes Theodoridis;Jessica Hofmann;Johannes Maucher;Andreas Schilling", "authorids": "~Johannes_Theodoridis1;~Jessica_Hofmann1;~Johannes_Maucher1;~Andreas_Schilling1", "gender": "M;F;M;M", "homepage": ";;http://maucher.pages.mi.hdm-stuttgart.de/ai/;", "dblp": ";;52/5980;14/2967", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";jessica-hofmann-69b0a2209/;dr-johannes-maucher-0047213/?msgControlName=reply_to_sender&msgConversationId=2-ZmFlNTM5NDQtNzYwZS00YzM1LWE5OGUtOTFkZWU2YzRkNDdlXzAxMw%3D%3D&msgOverlay=true;", "or_profile": "~Johannes_Theodoridis1;~Jessica_Hofmann1;~Johannes_Maucher1;~Andreas_Schilling1", "aff": ";Hochschule der Medien Stuttgart;HdM Stuttgart;University of Tuebingen", "aff_domain": ";hdm-stuttgart.de;hdm-stuttgart.de;uni-tuebingen.de", "position": ";MS student;Full Professor;Full Professor", "bibtex": "@misc{\ntheodoridis2022an,\ntitle={An object-centric sensitivity analysis of deep learning based instance segmentation},\nauthor={Johannes Theodoridis and Jessica Hofmann and Johannes Maucher and Andreas Schilling},\nyear={2022},\nurl={https://openreview.net/forum?id=C5Q04gnc4f}\n}", "github": "", "project": "", "reviewers": "GTkh;uJDe;Hfwx;kn4q", "site": "https://openreview.net/forum?id=C5Q04gnc4f", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "5;2;5;3", "correctness": "1;4;3;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "213;83;45;50", "wc_summary_review": "18;64;97;59", "wc_main_review": "312;100;357;180", "wc_review": "543;247;499;289", "wc_reply_reviewers": "644;0;179;49", "wc_reply_authors": "2456;1055;2231;776", "reply_reviewers": "2;0;2;1", "reply_authors": "4;3;5;3", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 1.299038105676658 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 97.75, 68.12259170055114 ], 
"wc_summary_review_avg": [ 59.5, 28.0579756931964 ], "wc_main_review_avg": [ 237.25, 102.52164405626746 ], "wc_review_avg": [ 394.5, 128.3150419865107 ], "wc_reply_reviewers_avg": [ 218.0, 254.50049115866162 ], "wc_reply_authors_avg": [ 1629.5, 725.1580862129306 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.1924500897298753, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0Ehy79i4TgwJ:scholar.google.com/&scioq=An+object-centric+sensitivity+analysis+of+deep+learning+based+instance+segmentation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Hochschule der Medien;Stuttgart Media University;University of Tuebingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hdm-stuttgart.de;https://www.hdm-stuttgart.de;https://www.uni-tuebingen.de/", "aff_unique_abbr": "HdM;HdM;Uni T\u00fcbingen", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stuttgart;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "C5u6Z9voQ1", "title": "Evaluating the Robustness of Time Series Anomaly and Intrusion Detection Methods against Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Time series anomaly and intrusion detection are extensively studied in statistics, economics, and computer science. Over the years, numerous methods have been proposed for time series anomaly and intrusion detection using deep learning-based methods. Many of these methods demonstrate state-of-the-art performance on benchmark datasets, giving the false impression that these systems are robust and deployable in practical and industrial scenarios. In this paper, we demonstrate that state-of-the-art anomaly and intrusion detection methods can be easily fooled by adding adversarial perturbations to the sensor data. We use different scoring metrics such as prediction errors, anomaly, and classification scores over several public and private datasets belong to aerospace applications, automobiles, server machines, and cyber-physical systems. We evaluate state-of-the-art deep neural networks (DNNs) and graph neural networks (GNNs) methods, which claim to be robust against anomalies and intrusions, and find their performance can drop to as low as 0\\% under adversarial attacks from Fast Gradient Sign Method (FGSM) and Projected Gradient Descent (PGD) methods. To the best of our knowledge, we are the first to demonstrate the vulnerabilities of anomaly and intrusion detection systems against adversarial attacks. Our code is available here: https://anonymous.4open.science/r/ICLR298", "keywords": "Time series;Anomaly Detection;Intrusion Detection;Adversarial Attack", "primary_area": "", "supplementary_material": "", "author": "Shahroz Tariq;Simon S. 
Woo", "authorids": "~Shahroz_Tariq1;~Simon_S._Woo1", "gender": "M;M", "homepage": "https://sites.google.com/view/shahroztariq/;", "dblp": "194/9281;53/2716", "google_scholar": "Kpmz_BgAAAAJ;mHnj60cAAAAJ", "orcid": "0000-0001-9090-0579;", "linkedin": "shahroztariq/;", "or_profile": "~Shahroz_Tariq1;~Simon_S._Woo1", "aff": "Sungkyunkwan University (Suwon Campus);Sungkyunkwan University", "aff_domain": "skku.edu;skku.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\ntariq2022evaluating,\ntitle={Evaluating the Robustness of Time Series Anomaly and Intrusion Detection Methods against Adversarial Attacks},\nauthor={Shahroz Tariq and Simon S. Woo},\nyear={2022},\nurl={https://openreview.net/forum?id=C5u6Z9voQ1}\n}", "github": "", "project": "", "reviewers": "iy4H;VZJZ;hY71", "site": "https://openreview.net/forum?id=C5u6Z9voQ1", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "5;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "138;43;52", "wc_summary_review": "159;34;48", "wc_main_review": "878;579;234", "wc_review": "1175;656;334", "wc_reply_reviewers": "0;0;50", "wc_reply_authors": "767;764;587", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 77.66666666666667, 42.82003684673281 ], "wc_summary_review_avg": [ 80.33333333333333, 55.91859162111371 ], "wc_main_review_avg": [ 563.6666666666666, 263.13536862653456 ], "wc_review_avg": [ 721.6666666666666, 346.46243984337207 ], "wc_reply_reviewers_avg": [ 16.666666666666668, 23.570226039551585 ], "wc_reply_authors_avg": [ 706.0, 84.15461959987698 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10422929135248889565&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Sungkyunkwan University", "aff_unique_dep": "", "aff_unique_url": "https://www.sungkyunkwan.ac.kr", "aff_unique_abbr": "SKKU", "aff_campus_unique_index": "0", "aff_campus_unique": "Suwon;", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "C7LB5_Zt_Vp", "title": "ZeroLiers: Diminishing Large Outliers in ReLU-like Activations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As the number of learnable parameters is getting bigger and bigger, overfitting is still one of the main challenges in training DNNs. Even though DNNs with billions or even a few hundred billions of parameters are proposed and used, it is still hard to determine the appropriate training set size that prevents overfitting. In this work, we propose a new activation function, called ZeroLiers, to prevent overfitting. It eliminates the need to use Dropout and leads to better generalization when training DNNs with fully connected layers. ZeroLiers can be easily implemented by replacing large outliers in ReLU-like activations with zeros. We perform an empirical evaluation of ZeroLiers' regularization effect against Dropout. 
Interestingly, the validation loss decreases much faster with ZeroLiers than with Dropout, and the generalization performance improves. Moreover, we train several recent DNNs with fully connected layers and investigate the effect of ZeroLiers. Specifically, we find that ZeroLiers accelerates the convergence speed of both their training and validation losses.", "keywords": "Deep Neural Network;Rectified Linear Units;Generalization;Regularization;Dropout", "primary_area": "", "supplementary_material": "/attachment/09981bf81bebc452d43ebef945c14181b6c65c5b.zip", "author": "Yeha Kim;Wookeun Jung;Jaejin Lee", "authorids": "~Yeha_Kim1;wookeun.jung@moreh.io;~Jaejin_Lee1", "gender": ";;M", "homepage": "https://github.com/yeha-adry;;https://sites.google.com/view/jaejinlee", "dblp": ";;30/880.html", "google_scholar": ";;6JaKru0AAAAJ", "orcid": ";;0000-0003-4638-8170", "linkedin": ";;", "or_profile": "~Yeha_Kim1;wookeun.jung@moreh.io;~Jaejin_Lee1", "aff": "Seoul National University;;Seoul National University", "aff_domain": "snu.ac.kr;;snu.ac.kr", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nkim2022zeroliers,\ntitle={ZeroLiers: Diminishing Large Outliers in Re{LU}-like Activations},\nauthor={Yeha Kim and Wookeun Jung and Jaejin Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=C7LB5_Zt_Vp}\n}", "github": "", "project": "", "reviewers": "mCps;zeke;redf;FWxu", "site": "https://openreview.net/forum?id=C7LB5_Zt_Vp", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "3;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "84;65;77;78", "wc_summary_review": "36;29;29;28", "wc_main_review": "332;478;618;301", "wc_review": "452;572;724;407", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.0, 6.892024376045111 ], "wc_summary_review_avg": [ 30.5, 3.2015621187164243 ], "wc_main_review_avg": [ 432.25, 126.36529389037166 ], "wc_review_avg": [ 538.75, 122.78716341702825 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h9urvDwkpa0J:scholar.google.com/&scioq=ZeroLiers:+Diminishing+Large+Outliers+in+ReLU-like+Activations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "C7ViqmpuBl", "title": "On Learning the Transformer Kernel", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this work we introduce the Kernelized Transformer, a generic, scalable, data-driven framework for learning the kernel function in Transformers. Our framework approximates the Transformer kernel as a dot product between spectral feature maps and learns the kernel by learning the spectral distribution.
This not only helps in learning a generic kernel end-to-end, but also reduces the time and space complexity of Transformers from quadratic to linear. We show that Kernelized Transformers achieve performance comparable to existing efficient Transformer architectures, in terms of both accuracy and computational efficiency. Our study also demonstrates that the choice of the kernel has a substantial impact on performance, and kernel learning variants are competitive alternatives to fixed kernel Transformers, in both long and short sequence tasks.", "keywords": "Transformers;Kernel learning", "primary_area": "", "supplementary_material": "/attachment/5d51e636597c977e385709fc93110190ae37d630.zip", "author": "Sankalan Pal Chowdhury;Adamos Solomou;Avinava Dubey;MRINMAYA SACHAN", "authorids": "~Sankalan_Pal_Chowdhury1;~Adamos_Solomou1;~Avinava_Dubey1;~MRINMAYA_SACHAN2", "gender": "M;;M;M", "homepage": ";;https://sites.google.com/site/mrinsachan/;https://sites.google.com/site/kumaravinavadubey/", "dblp": "250/9552.html;;86/10440.html;10/7789", "google_scholar": ";;Tpp9ZjoAAAAJ;tBbUAfsAAAAJ", "orcid": ";;;", "linkedin": "sankalan-palchowdhury-343a7760/;adamossolomou/;;", "or_profile": "~Sankalan_Pal_Chowdhury1;~Adamos_Solomou1;~MRINMAYA_SACHAN2;~Kumar_A_Dubey1", "aff": "Swiss Federal Institute of Technology;;Swiss Federal Institute of Technology;Google Research", "aff_domain": "ethz.ch;;ethz.ch;google.com", "position": "MS student;;Assistant Professor;Research Scientist", "bibtex": "@misc{\nchowdhury2022on,\ntitle={On Learning the Transformer Kernel},\nauthor={Sankalan Pal Chowdhury and Adamos Solomou and Avinava Dubey and MRINMAYA SACHAN},\nyear={2022},\nurl={https://openreview.net/forum?id=C7ViqmpuBl}\n}", "github": "", "project": "", "reviewers": "v5xT;uHV9;mNN5;8nc9", "site": "https://openreview.net/forum?id=C7ViqmpuBl", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "42;19;56;35", "wc_summary_review": "32;27;19;20", "wc_main_review": "182;107;220;169", "wc_review": "256;153;295;224", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 38.0, 13.322912594474229 ], "wc_summary_review_avg": [ 24.5, 5.315072906367325 ], "wc_main_review_avg": [ 169.5, 40.660177077823946 ], "wc_review_avg": [ 232.0, 52.08166663999915 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9345975744176007716&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.ethz.ch;https://research.google", "aff_unique_abbr": "ETH Zurich;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique":
"Switzerland;United States" }, { "id": "C81udlH5yMv", "title": "Invariant Causal Mechanisms through Distribution Matching", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning representations that capture the underlying data generating process is akey problem for data efficient and robust use of neural networks. One key property for robustness which the learned representation should capture and which recently received a lot of attention is described by the notion of invariance. In this work we provide a causal perspective and new algorithm for learning invariant representations. Empirically we show that this algorithm works well on a diverse set of tasks and in particular we observe state-of-the-art performance on domain generalization, where we are able to significantly boost the score of existing models.", "keywords": "representation learning;causality;invariance;distribution matching", "primary_area": "", "supplementary_material": "", "author": "Mathieu Chevalley;Charlotte Bunne;Andreas Krause;Stefan Bauer", "authorids": "~Mathieu_Chevalley1;~Charlotte_Bunne1;~Andreas_Krause1;~Stefan_Bauer1", "gender": ";F;M;", "homepage": ";https://aimm.epfl.ch;https://las.inf.ethz.ch/krausea;https://cifar.ca/bios/stefan-bauer/", "dblp": ";217/2348;87/1831-1.html;", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.ch/citations?user=eDHv58AAAAAJ;O-oICE8AAAAJ", "orcid": ";0000-0003-1431-103X;0000-0001-7260-9673;", "linkedin": "mathieu-chevalley/?locale=en_US;bunnech/;krausea/;", "or_profile": "~Mathieu_Chevalley1;~Charlotte_Bunne1;~Andreas_Krause1;~Stefan_Bauer1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;ETH Zurich;KTH Royal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;kth.se", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nchevalley2022invariant,\ntitle={Invariant Causal Mechanisms through Distribution Matching},\nauthor={Mathieu Chevalley and Charlotte Bunne and Andreas Krause and Stefan Bauer},\nyear={2022},\nurl={https://openreview.net/forum?id=C81udlH5yMv}\n}", "github": "", "project": "", "reviewers": "sMkT;H74w;PYVV", "site": "https://openreview.net/forum?id=C81udlH5yMv", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;3", "wc_summary_paper": "60;56;49", "wc_summary_review": "28;44;25", "wc_main_review": "395;128;265", "wc_review": "483;228;339", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "485;182;295", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 55.0, 4.546060565661952 ], "wc_summary_review_avg": [ 32.333333333333336, 8.339997335464536 ], "wc_main_review_avg": [ 262.6666666666667, 109.0147798338474 ], "wc_review_avg": [ 350.0, 104.39348638684312 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 320.6666666666667, 125.02355333651691 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7466224660352818120&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 3, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "ETH Zurich;KTH Royal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.kth.se", "aff_unique_abbr": "ETHZ;KTH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Switzerland;Sweden" }, { "id": "C8L4I381u2C", "title": "On The Transferability of Deep-Q Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transfer Learning (TL) is an efficient machine learning paradigm that allows overcoming some of the hurdles that characterize the successful training of deep neural networks, ranging from long training times to the needs of large datasets. While exploiting TL is a well established and successful training practice in Supervised Learning (SL), its applicability in Deep Reinforcement Learning (DRL) is rarer. In this paper, we study the level of transferability of three different variants of Deep-Q Networks on popular DRL benchmarks as well as on a set of novel, carefully designed control tasks. Our results show that transferring neural networks in a DRL context can be particularly challenging and is a process which in most cases results in negative transfer. In the attempt of understanding why Deep-Q Networks transfer so poorly, we gain novel insights into the training dynamics that characterizes this family of algorithms.", "keywords": "Transfer Learning;Deep-Q Networks;Model-Free Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Matthia Sabatelli;Pierre Geurts", "authorids": "~Matthia_Sabatelli1;~Pierre_Geurts1", "gender": "M;M", "homepage": "https://paintception.github.io/;https://people.montefiore.uliege.be/geurts/", "dblp": "160/6434;69/4142", "google_scholar": "https://scholar.google.nl/citations?user=YO2Php8AAAAJ;tyFTsmIAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Matthia_Sabatelli1;~Pierre_Geurts1", "aff": "University of Groningen;University of Li\u00e8ge", "aff_domain": "rug.nl;uliege.be", "position": "Lecturer;Full Professor", "bibtex": "@misc{\nsabatelli2022on,\ntitle={On The Transferability of Deep-Q Networks},\nauthor={Matthia Sabatelli and Pierre Geurts},\nyear={2022},\nurl={https://openreview.net/forum?id=C8L4I381u2C}\n}", "github": "", "project": "", "reviewers": "4U1f;S71i;KheK;gE2H", "site": "https://openreview.net/forum?id=C8L4I381u2C", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;3;4;4", "correctness": "3;3;2;2", "technical_novelty": "3;2;1;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "65;50;84;63", "wc_summary_review": "43;51;71;23", "wc_main_review": "305;268;355;636", "wc_review": "413;369;510;722", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.5, 12.134661099511597 ], "wc_summary_review_avg": [ 47.0, 17.204650534085253 ], "wc_main_review_avg": [ 391.0, 144.78086890193745 ], "wc_review_avg": [ 503.5, 136.07442816341356 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 
0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2890667076261939355&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "University of Groningen;University of Li\u00e8ge", "aff_unique_dep": ";", "aff_unique_url": "https://www.rug.nl;https://www.ulg.ac.be", "aff_unique_abbr": "RUG;ULi\u00e8ge", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Netherlands;Belgium" }, { "title": "Distributional Reinforcement Learning with Monotonic Splines", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6650", "id": "C8Ltz08PtBp", "poster": "", "openreview": "https://openreview.net/forum?id=C8Ltz08PtBp", "slides": "https://iclr.cc/virtual/2022/poster/6650", "video": "https://iclr.cc/virtual/2022/poster/6650", "author_site": "Yudong Luo, Guiliang Liu, Haonan Duan, Oliver Schulte, Pascal Poupart", "tldr": "", "abstract": "Distributional Reinforcement Learning (RL) differs from traditional RL by estimating the distribution over returns to capture the intrinsic uncertainty of MDPs. One key challenge in distributional RL lies in how to parameterize the quantile function when minimizing the Wasserstein metric of temporal differences. Existing algorithms use step functions or piecewise linear functions. In this paper, we propose to learn smooth continuous quantile functions represented by monotonic rational-quadratic splines, which also naturally solve the quantile crossing problem. Experiments in stochastic environments show that a dense estimation for quantile functions enhances distributional RL in terms of faster empirical convergence and higher rewards in most cases.", "keywords": "Distributional RL", "primary_area": "", "supplementary_material": "/attachment/da455063f7268429bbaa204c0f3826a1de46e40f.zip", "author": "Yudong Luo;Guiliang Liu;Haonan Duan;Oliver Schulte;Pascal Poupart", "authorids": "~Yudong_Luo1;~Guiliang_Liu1;~Haonan_Duan2;~Oliver_Schulte1;~Pascal_Poupart2", "gender": ";M;M;M;M", "homepage": "http://miyunluo.com;http://guiliang.me/;https://www.cs.toronto.edu/~haonand/;http://www.cs.sfu.ca/~oschulte/;https://cs.uwaterloo.ca/~ppoupart", "dblp": "161/8157;220/5411;273/7767;s/OliverSchulte;26/2122", "google_scholar": ";CuMylvEAAAAJ;5WVNRqoAAAAJ;;https://scholar.google.ca/citations?user=KhAJWroAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yudong_Luo1;~Guiliang_Liu1;~Haonan_Duan2;~Oliver_Schulte1;~Pascal_Poupart2", "aff": "Shanghai Jiaotong University;University of Waterloo / Vector Institute;Department of Computer Science, University of Toronto;Simon Fraser University;University of Waterloo", "aff_domain": "sjtu.edu.cn;uwaterloo.ca;cs.toronto.edu;sfu.ca;uwaterloo.ca", "position": "Undergrad student;Postdoc;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nluo2022distributional,\ntitle={Distributional Reinforcement Learning with Monotonic Splines},\nauthor={Yudong Luo and Guiliang Liu and Haonan Duan and Oliver Schulte and Pascal Poupart},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=C8Ltz08PtBp}\n}", "github": "", "project": "", "reviewers": "uP3K;zyuH;Gqyh;2kiV", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;5;5;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "154;75;41;50", "wc_summary_review": "87;117;32;10", "wc_main_review": 
"421;311;317;99", "wc_review": "662;503;390;159", "wc_reply_reviewers": "0;97;0;0", "wc_reply_authors": "719;1109;358;8", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 80.0, 44.50280890011326 ], "wc_summary_review_avg": [ 61.5, 42.58227330709341 ], "wc_main_review_avg": [ 287.0, 117.02136557056579 ], "wc_review_avg": [ 428.5, 183.15635397113581 ], "wc_reply_reviewers_avg": [ 24.25, 42.00223208354527 ], "wc_reply_authors_avg": [ 548.5, 409.7746331826801 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6225430174794673, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15038416806754316577&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=C8Ltz08PtBp", "email": "sjtu.edu.cn;uwaterloo.ca;cs.toronto.edu;sfu.ca;uwaterloo.ca", "author_num": 5, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Shanghai Jiao Tong University;University of Waterloo;University of Toronto;Simon Fraser University", "aff_unique_dep": ";;Department of Computer Science;", "aff_unique_url": "https://www.sjtu.edu.cn;https://uwaterloo.ca;https://www.utoronto.ca;https://www.sfu.ca", "aff_unique_abbr": "SJTU;UW;U of T;SFU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;Canada" }, { "id": "CA51pvZJ0xX", "title": "Robust Feature Selection using Sparse Centroid-Encoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "We develop a sparse optimization problem for the determination of the total set of features that discriminate two or more classes. This is a sparse implementa- tion of the centroid-encoder for nonlinear data reduction and visualization called Sparse Centroid-Encoder (SCE). We also provide an iterative feature selection al- gorithm that first ranks each feature by its occurrence, and the optimal number of features is chosen using a validation set. The algorithm is applied to a wide vari- ety of data sets including, single-cell biological data, high dimensional infectious disease data, hyperspectral data, image data, and GIS data. We compared our method to various state-of-the-art feature selection techniques, including three neural network-based models (DFS, SG-L1-NN, G-L1-NN), Sparse SVM, and Random Forest. 
We empirically showed that SCE features produced better classification accuracy on the unseen test data, often with fewer features.", "keywords": "Feature Selection;Sparse Centroid-encoder;Non-linear feature Selection;Deep Feature Selection;Multi-class Feature Selection;Iterative Feature Selection", "primary_area": "", "supplementary_material": "/attachment/01f135905c69bb33e981936a5ab5e52a82bda383.zip", "author": "Tomojit Ghosh;Michael Kirby", "authorids": "~Tomojit_Ghosh1;~Michael_Kirby1", "gender": "M;", "homepage": "https://web1.cs.wright.edu/tomojit.ghosh/;https://www.math.colostate.edu/~kirby/", "dblp": "192/2727;", "google_scholar": "6AHKhcUAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Tomojit_Ghosh1;~Michael_Kirby1", "aff": "Colorado State University;Colorado State University", "aff_domain": "colostate.edu;colostate.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nghosh2022robust,\ntitle={Robust Feature Selection using Sparse Centroid-Encoder},\nauthor={Tomojit Ghosh and Michael Kirby},\nyear={2022},\nurl={https://openreview.net/forum?id=CA51pvZJ0xX}\n}", "github": "", "project": "", "reviewers": "zAAP;mFQD;p4kd;gY6z", "site": "https://openreview.net/forum?id=CA51pvZJ0xX", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;4;3;5", "correctness": "4;3;4;3", "technical_novelty": "1;2;2;1", "empirical_novelty": "1;1;2;1", "wc_summary_paper": "53;23;73;105", "wc_summary_review": "32;58;21;320", "wc_main_review": "251;217;64;192", "wc_review": "336;298;158;617", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.5, 29.845435161846776 ], "wc_summary_review_avg": [ 107.75, 123.27687333802719 ], "wc_main_review_avg": [ 181.0, 70.72128392499673 ], "wc_review_avg": [ 352.25, 166.60788546764525 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KqxJlbvSQ-sJ:scholar.google.com/&scioq=Robust+Feature+Selection+using+Sparse+Centroid-Encoder&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Colorado State University", "aff_unique_dep": "", "aff_unique_url": "https://www.colostate.edu", "aff_unique_abbr": "CSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Dynamics-Aware Comparison of Learned Reward Functions", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7041", "id": "CALFyKVs87", "poster": "", "openreview": "https://openreview.net/forum?id=CALFyKVs87", "slides": "https://iclr.cc/virtual/2022/poster/7041", "video": "https://iclr.cc/virtual/2022/poster/7041", "author_site": "Blake W Wulfe, Logan Ellis, Jean Mercat, Rowan T McAllister, Adrien Gaidon", "tldr": "", "abstract": "The ability to learn reward functions plays an important role in enabling the deployment of intelligent agents in the real world.
However, $\\textit{comparing}$ reward functions, for example as a means of evaluating reward learning methods, presents a challenge. Reward functions are typically compared by considering the behavior of optimized policies, but this approach conflates deficiencies in the reward function with those of the policy search algorithm used to optimize it. To address this challenge, Gleave et al. (2020) propose the Equivalent-Policy Invariant Comparison (EPIC) distance. EPIC avoids policy optimization, but in doing so requires computing reward values at transitions that may be impossible under the system dynamics. This is problematic for learned reward functions because it entails evaluating them outside of their training distribution, resulting in inaccurate reward values that we show can render EPIC ineffective at comparing rewards. To address this problem, we propose the Dynamics-Aware Reward Distance (DARD), a new reward pseudometric. DARD uses an approximate transition model of the environment to transform reward functions into a form that allows for comparisons that are invariant to reward shaping while only evaluating reward functions on transitions close to their training distribution. Experiments in simulated physical domains demonstrate that DARD enables reliable reward comparisons without policy optimization and is significantly more predictive than baseline methods of downstream policy performance when dealing with learned reward functions.", "keywords": "Reward Learning;Inverse Reinforcement Learning;Reinforcement Learning;Comparing Reward Functions", "primary_area": "", "supplementary_material": "/attachment/66f4d83cb036753660d3553547f09c84531de4b2.zip", "author": "Blake Wulfe;Logan Michael Ellis;Jean Mercat;Rowan Thomas McAllister;Adrien Gaidon", "authorids": "~Blake_Wulfe1;logan.ellis@tri.global;jean.mercat@tri.global;~Rowan_Thomas_McAllister1;~Adrien_Gaidon1", "gender": ";;;M;", "homepage": ";;;https://rowanmcallister.github.io/;https://adriengaidon.com/", "dblp": ";;;123/6416;06/7548.html", "google_scholar": ";;;https://scholar.google.co.uk/citations?user=6uIhh6MAAAAJ;https://scholar.google.fr/citations?user=2StUgf4AAAAJ", "orcid": ";;;0000-0002-9519-2345;", "linkedin": ";;;rowantmcallister;adrien-gaidon-63ab2358/", "or_profile": "~Blake_Wulfe1;logan.ellis@tri.global;jean.mercat@tri.global;~Rowan_Thomas_McAllister1;~Adrien_Gaidon1", "aff": ";;;Toyota Research Institute;Toyota Research Institute (TRI)", "aff_domain": ";;;tri.global;tri.global", "position": ";;;Machine Learning Scientist;Head of ML", "bibtex": "@inproceedings{\nwulfe2022dynamicsaware,\ntitle={Dynamics-Aware Comparison of Learned Reward Functions},\nauthor={Blake Wulfe and Logan Michael Ellis and Jean Mercat and Rowan Thomas McAllister and Adrien Gaidon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CALFyKVs87}\n}", "github": "", "project": "", "reviewers": "4Nse;vDTh;6Mwn;LfGd", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;3;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "52;140;95;129", "wc_summary_review": "21;73;40;64", "wc_main_review": "162;252;271;267", "wc_review": "235;465;406;460", "wc_reply_reviewers": "0;86;102;0", "wc_reply_authors": "907;676;739;403", "reply_reviewers": "0;1;1;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": 
[ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 104.0, 34.30014577228499 ], "wc_summary_review_avg": [ 49.5, 20.402205763103165 ], "wc_main_review_avg": [ 238.0, 44.44659717008716 ], "wc_review_avg": [ 391.5, 93.26977002223175 ], "wc_reply_reviewers_avg": [ 47.0, 47.3392015141785 ], "wc_reply_authors_avg": [ 681.25, 181.48605318315785 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8954346332205578648&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=CALFyKVs87", "email": ";;;tri.global;tri.global", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Toyota Research Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.tri.global", "aff_unique_abbr": "TRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Generalized Decision Transformer for Offline Hindsight Information Matching", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6141", "id": "CAjxVodl_v", "poster": "", "openreview": "https://openreview.net/forum?id=CAjxVodl_v", "slides": "https://iclr.cc/virtual/2022/poster/6141", "video": "https://iclr.cc/virtual/2022/poster/6141", "author_site": "Hiroki Furuta, Yutaka Matsuo, Shixiang Gu", "tldr": "", "abstract": "How to extract as much learning signal from each trajectory data has been a key problem in reinforcement learning (RL), where sample inefficiency has posed serious challenges for practical applications. Recent works have shown that using expressive policy function approximators and conditioning on future trajectory information -- such as future states in hindsight experience replay (HER) or returns-to-go in Decision Transformer (DT) -- enables efficient learning of multi-task policies, where at times online RL is fully replaced by offline behavioral cloning (BC), e.g. sequence modeling. We demonstrate that all these approaches are doing hindsight information matching (HIM) -- training policies that can output the rest of trajectory that matches some statistics of future state information. We present Generalized Decision Transformer (GDT) for solving any HIM problem, and show how different choices for the feature function and the anti-causal aggregator not only recover DT as a special case, but also lead to novel Categorical DT (CDT) and Bi-directional DT (BDT) for matching different statistics of the future. For evaluating CDT and BDT, we define offline multi-task state-marginal matching (SMM) and imitation learning (IL) as two generic HIM problems, propose a Wasserstein distance loss as a metric for both, and empirically study them on MuJoCo continuous control benchmarks. Categorical DT, which simply replaces anti-causal summation with anti-causal binning in DT, enables arguably the first effective offline multi-task SMM algorithm that generalizes well to unseen (and even synthetic) multi-modal reward or state-feature distributions. Bi-directional DT, which uses an anti-causal second transformer as the aggregator, can learn to model any statistics of the future and outperforms DT variants in offline multi-task IL, i.e. one-shot IL. 
Our generalized formulations from HIM and GDT greatly expand the role of powerful sequence modeling architectures in modern RL.", "keywords": "Hindsight Information Matching;Decision Transformer;State-Marginal Matching;Hindsight Experience Replay;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/f9a4c7bcae4543507a50a18992eba14bd3d7eab0.zip", "author": "Hiroki Furuta;Yutaka Matsuo;Shixiang Shane Gu", "authorids": "~Hiroki_Furuta1;~Yutaka_Matsuo1;~Shixiang_Shane_Gu1", "gender": "M;M;M", "homepage": "https://github.com/frt03;http://ymatsuo.com;https://sites.google.com/view/gugurus/home", "dblp": "267/2065;m/YMatsuo.html;121/0550", "google_scholar": "M0OhM1UAAAAJ;Dy8iau4AAAAJ;B8wslVsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hiroki_Furuta1;~Yutaka_Matsuo1;~Shixiang_Gu1", "aff": "The University of Tokyo;The University of Tokyo;Google", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp;google.com", "position": "MS student;Associate Professor;Senior Research Scientist", "bibtex": "@inproceedings{\nfuruta2022generalized,\ntitle={Generalized Decision Transformer for Offline Hindsight Information Matching},\nauthor={Hiroki Furuta and Yutaka Matsuo and Shixiang Shane Gu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CAjxVodl_v}\n}", "github": "", "project": "", "reviewers": "cUy5;Ffah;5xwG", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "39;105;122", "wc_summary_review": "67;143;126", "wc_main_review": "903;268;945", "wc_review": "1009;516;1193", "wc_reply_reviewers": "588;0;407", "wc_reply_authors": "1752;678;1484", "reply_reviewers": "3;0;2", "reply_authors": "4;1;3", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 88.66666666666667, 35.798820588890294 ], "wc_summary_review_avg": [ 112.0, 32.567877834864625 ], "wc_main_review_avg": [ 705.3333333333334, 309.71635769236053 ], "wc_review_avg": [ 906.0, 285.8192902284006 ], "wc_reply_reviewers_avg": [ 331.6666666666667, 245.8893156596186 ], "wc_reply_authors_avg": [ 1304.6666666666667, 456.4276746892351 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.247219128924647 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4011968196773384178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=CAjxVodl_v", "email": "u-tokyo.ac.jp;u-tokyo.ac.jp;google.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Tokyo;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.google.com", "aff_unique_abbr": "UTokyo;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Japan;United States" }, { "id": "CBchIgBBrwj", "title": "Objective Evaluation of Deep Visual Interpretations on Time Series Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "The correct 
interpretation and understanding of deep learning models is essential in many applications.\n(Explanatory) visual interpretation approaches for image and natural language processing allow domain experts to validate and understand almost any deep learning model. However, they fall short when generalizing to arbitrary time series data that is less intuitive and more diverse. Whether a visualization explains the true reasoning or captures the real features is more difficult to judge. Hence, instead of blind trust, we need an objective evaluation to obtain reliable quality metrics. This paper proposes a framework of six orthogonal quality metrics for gradient- or perturbation-based post-hoc visual interpretation methods designed for time series classification and segmentation tasks. This comprehensive set is either based on \"human perception\" or on \"functional properties\". An extensive experimental study includes commonly used neural network architectures for time series and nine visual interpretation methods. We evaluate the visual interpretation methods with diverse datasets from the UCR repository as well as another complex real-world dataset. We show that none of the existing methods consistently outperforms any of the others on all metrics while some of them are ahead in either functional or human-based metrics. Our results allow experts to make an informed choice of suitable visualization techniques for the model and task at hand.", "keywords": "explainable ai;deep learning;time series;visual interpretation;evaluation metrics;classification;segmentation", "primary_area": "", "supplementary_material": "/attachment/4647bf4d38a1c943070a1835e69ddf8b97b3a5a5.zip", "author": "Christoffer L\u00f6ffler;Wei-Cheng Lai;Lukas M Schmidt;Dario Zanca;Bjoern Eskofier;Christopher Mutschler", "authorids": "~Christoffer_L\u00f6ffler1;~Wei-Cheng_Lai2;~Lukas_M_Schmidt1;~Dario_Zanca1;~Bjoern_Eskofier1;~Christopher_Mutschler1", "gender": "M;;M;M;;M", "homepage": "https://christofferloeffler.com;;;https://www.mad.tf.fau.de/person/dario-zanca/;;https://www.cmutschler.de", "dblp": "141/5637;;;198/1401;;118/7748", "google_scholar": "bIaHh6gAAAAJ;;;KjwaSXkAAAAJ;;https://scholar.google.de/citations?user=gKDSp8YAAAAJ", "orcid": "0000-0003-1834-8323;;;0000-0001-5886-0597;;0000-0001-8108-0230", "linkedin": ";;lukas-schmidt-41a94395/;dariozanca/;;christopher-mutschler-28431576/", "or_profile": "~Christoffer_L\u00f6ffler1;~Wei-Cheng_Lai2;~Lukas_M_Schmidt1;~Dario_Zanca1;~Bjoern_Eskofier1;~Christopher_Mutschler1", "aff": "Fraunhofer IIS;;Fraunhofer-Institut f\u00fcr Integrierte Schaltungen;Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg;;Fraunhofer IIS", "aff_domain": "fraunhofer.de;;iis.fraunhofer.de;fau.de;;fraunhofer.de", "position": "Scientific Associate;;PhD student;Postdoc;;Principal Researcher", "bibtex": "@misc{\nl{\\\"o}ffler2022objective,\ntitle={Objective Evaluation of Deep Visual Interpretations on Time Series Data},\nauthor={Christoffer L{\\\"o}ffler and Wei-Cheng Lai and Lukas M Schmidt and Dario Zanca and Bjoern Eskofier and Christopher Mutschler},\nyear={2022},\nurl={https://openreview.net/forum?id=CBchIgBBrwj}\n}", "github": "", "project": "", "reviewers": "9Qic;cLNa;oySW;aJfy", "site": "https://openreview.net/forum?id=CBchIgBBrwj", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;2;2;4", "correctness": "2;4;4;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "35;66;65;34", "wc_summary_review": "42;22;18;79", "wc_main_review": "337;104;292;80",
"wc_review": "414;192;375;193", "wc_reply_reviewers": "494;51;15;29", "wc_reply_authors": "1368;115;249;145", "reply_reviewers": "2;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 50.0, 15.508062419270823 ], "wc_summary_review_avg": [ 40.25, 24.149275351446885 ], "wc_main_review_avg": [ 203.25, 112.70176351770189 ], "wc_review_avg": [ 293.5, 101.93748083997367 ], "wc_reply_reviewers_avg": [ 147.25, 200.6070474833823 ], "wc_reply_authors_avg": [ 469.25, 521.2707429925451 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.7385489458759963, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LY7q4bAYStkJ:scholar.google.com/&scioq=Objective+Evaluation+of+Deep+Visual+Interpretations+on+Time+Series+Data&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Fraunhofer Institute for Integrated Circuits;Fraunhofer-Institut f\u00fcr Integrierte Schaltungen;Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg", "aff_unique_dep": ";;", "aff_unique_url": "https://www.iis.fraunhofer.de/;https://www.iais.fraunhofer.de/;https://www fau.de", "aff_unique_abbr": "Fraunhofer IIS;Fraunhofer IIS;FAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "CC-BbehJKTe", "title": "Building the Building Blocks: From Simplification to Winning Trees in Genetic Programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "Genetic Programming (GP) represents a powerful paradigm in diverse real-world applications. While GP can reach optimal (or at least ``good-enough'') solutions for many problems, such solutions are not without deficiencies. A frequent issue stems from the representation perspective where GP evolves solutions that contain unnecessary parts, known as program bloat.\nThis paper first investigates a combination of deterministic and random simplification to simplify the solutions while having a (relatively) small influence on the solution fitness. Afterward, we use the solutions to extract their subtrees, which we denote as winning trees. The winning trees can be used to initialize the population for the new GP run and result in improved convergence and fitness, provided some conditions on the size of solutions and winning trees are fulfilled. 
To experimentally validate our approach, we consider several synthetic benchmark problems and real-world symbolic regression problems.", "keywords": "Genetic Programming;Building Blocks;Regression;Bloat;Experimental evaluation", "primary_area": "", "supplementary_material": "", "author": "Lucija Planini\u0107;Marko \u0110urasevi\u0107;Stjepan Picek;Domagoj Jakobovic", "authorids": "~Lucija_Planini\u01071;~Marko_\u0110urasevi\u01071;stjepan@computer.org;~Domagoj_Jakobovic1", "gender": "F;M;;M", "homepage": ";http://www.zemris.fer.hr/~idurasevic/;;http://www.fer.unizg.hr/domagoj.jakobovic", "dblp": ";;;82/5852.html", "google_scholar": "https://scholar.google.no/citations?user=0Yf_H0cAAAAJ;;;https://scholar.google.hr/citations?user=C0YLdbwAAAAJ", "orcid": ";;;0000-0002-9201-2994", "linkedin": ";;;", "or_profile": "~Lucija_Planini\u01071;~Marko_\u0110urasevi\u01071;stjepan@computer.org;~Domagoj_Jakobovic1", "aff": "UniZg-FER, University of Zagreb;Faculty of Electrical Engineering and Computing, University of Zagreb;;University of Zagreb", "aff_domain": "fer.unizg.hr;fer.hr;;unizg.hr", "position": "PhD student;Assistant Professor;;Full Professor", "bibtex": "@misc{\nplanini{\\'c}2022building,\ntitle={Building the Building Blocks: From Simplification to Winning Trees in Genetic Programming},\nauthor={Lucija Planini{\\'c} and Marko {\\DJ}urasevi{\\'c} and Stjepan Picek and Domagoj Jakobovic},\nyear={2022},\nurl={https://openreview.net/forum?id=CC-BbehJKTe}\n}", "github": "", "project": "", "reviewers": "Q7WW;77nm;mNLH;W75E", "site": "https://openreview.net/forum?id=CC-BbehJKTe", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;5;4;3", "correctness": "1;2;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "106;41;66;86", "wc_summary_review": "60;151;46;76", "wc_main_review": "498;820;389;158", "wc_review": "664;1012;501;320", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.75, 24.076700355322778 ], "wc_summary_review_avg": [ 83.25, 40.53008142108772 ], "wc_main_review_avg": [ 466.25, 238.29223130433775 ], "wc_review_avg": [ 624.25, 254.7983271138176 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FGMVHzw-7sAJ:scholar.google.com/&scioq=Building+the+Building+Blocks:+From+Simplification+to+Winning+Trees+in+Genetic+Programming&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Zagreb", "aff_unique_dep": "Faculty of Electrical Engineering and Computing", "aff_unique_url": "https://www.fer.unizg.hr", "aff_unique_abbr": "UniZg-FER", "aff_campus_unique_index": "0", "aff_campus_unique": "Zagreb;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Croatia" }, { "title": "Neural Link Prediction with Walk Pooling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7079", "id": 
"CCu6RcUMwK0", "poster": "", "openreview": "https://openreview.net/forum?id=CCu6RcUMwK0", "slides": "https://iclr.cc/virtual/2022/poster/7079", "video": "https://iclr.cc/virtual/2022/poster/7079", "author_site": "Liming Pan, Cheng Shi, Ivan Dokmanic", "tldr": "", "abstract": "Graph neural networks achieve high accuracy in link prediction by jointly leveraging graph topology and node attributes. Topology, however, is represented indirectly; state-of-the-art methods based on subgraph classification label nodes with distance to the target link, so that, although topological information is present, it is tempered by pooling. This makes it challenging to leverage features like loops and motifs associated with network formation mechanisms. We propose a link prediction algorithm based on a new pooling scheme called WalkPool. WalkPool combines the expressivity of topological heuristics with the feature-learning ability of neural networks. It summarizes a putative link by random walk probabilities of adjacent paths. Instead of extracting transition probabilities from the original graph, it computes the transition matrix of a ``predictive'' latent graph by applying attention to learned features; this may be interpreted as feature-sensitive topology fingerprinting. WalkPool can leverage unsupervised node features or be combined with GNNs and trained end-to-end. It outperforms state-of-the-art methods on all common link prediction benchmarks, both homophilic and heterophilic, with and without node attributes. Applying WalkPool to a set of unsupervised GNNs significantly improves prediction accuracy, suggesting that it may be used as a general-purpose graph pooling scheme. ", "keywords": "Graph neural network;Link prediction;Random walk;Graph topology.", "primary_area": "", "supplementary_material": "/attachment/5a654633d2e6d28cb2b30509b34926fd851a1c48.zip", "author": "Liming Pan;Cheng Shi;Ivan Dokmani\u0107", "authorids": "~Liming_Pan1;~Cheng_Shi2;~Ivan_Dokmani\u01071", "gender": "M;M;M", "homepage": "http://schools.njnu.edu.cn/computer/person/liming-pan;https://dmi.unibas.ch/de/personen/cheng-shi/;http://dokmanic.ece.illinois.edu", "dblp": "142/2952;;52/8859", "google_scholar": "j_k7KJAAAAAJ;;0SQnwL4AAAAJ", "orcid": "0000-0001-7617-1939;;", "linkedin": ";;", "or_profile": "~Liming_Pan1;~Cheng_Shi2;~Ivan_Dokmanic1", "aff": "Nanjing Normal University;University of Basel;University of Basel", "aff_domain": "njnu.edu.cn;unibas.ch;unibas.ch", "position": "Assistant Professor;PhD student;Associate Professor", "bibtex": "@inproceedings{\npan2022neural,\ntitle={Neural Link Prediction with Walk Pooling},\nauthor={Liming Pan and Cheng Shi and Ivan Dokmani{\\'c}},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CCu6RcUMwK0}\n}", "github": "", "project": "", "reviewers": "yAGw;ye2u;C9p9;MHUH", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "1;3;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "47;108;97;95", "wc_summary_review": "38;133;144;45", "wc_main_review": "126;43;96;326", "wc_review": "211;284;337;466", "wc_reply_reviewers": "0;0;0;59", "wc_reply_authors": "1418;520;891;995", "reply_reviewers": "0;0;0;1", "reply_authors": "3;2;3;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 
0.82915619758885 ], "wc_summary_paper_avg": [ 86.75, 23.47738273317535 ], "wc_summary_review_avg": [ 90.0, 48.71857961804716 ], "wc_main_review_avg": [ 147.75, 107.11763393578109 ], "wc_review_avg": [ 324.5, 93.14102211163457 ], "wc_reply_reviewers_avg": [ 14.75, 25.54774941164094 ], "wc_reply_authors_avg": [ 956.0, 319.8773202338672 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11799693892452603057&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=CCu6RcUMwK0", "email": "njnu.edu.cn;unibas.ch;unibas.ch", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Nanjing Normal University;University of Basel", "aff_unique_dep": ";", "aff_unique_url": "http://www.njnu.edu.cn;https://www.unibas.ch", "aff_unique_abbr": "NNU;UniBas", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;Switzerland" }, { "id": "CD_gGnX9RnD", "title": "Early-Stopping for Meta-Learning: Estimating Generalization from the Activation Dynamics", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Early-stopping, a fundamental element of machine learning practice, aims to halt the training of a model when it reaches optimal generalization to unseen examples, right before the overfitting regime on the training data. Meta-Learning algorithms for few-shot learning aim to train neural networks capable of adapting to novel tasks using only a few labelled examples, in order to achieve good generalization. However, current early-stopping practices in meta-learning are problematic since there may be an arbitrarily large distributional shift between the meta-validation set coming from the training data, and the meta-test set. This is even more critical in few-shot transfer learning where the meta-test set comes from a different target dataset. To this end, we empirically show that as meta-training progresses, a model's generalization to a target distribution of novel tasks can be estimated by analysing the dynamics of its neural activations.
We propose a method for estimating optimal early-stopping time from the neural activation dynamics of just a few unlabelled support examples from the target distribution, and we demonstrate its performance with various meta-learning algorithms, few-shot datasets and transfer regimes.", "keywords": "Meta-Learning;Generalization;Early-Stopping;Out-of-Distribution", "primary_area": "", "supplementary_material": "", "author": "Simon Guiroy;Christopher Pal;Sarath Chandar", "authorids": "~Simon_Guiroy1;~Christopher_Pal1;~Sarath_Chandar1", "gender": "Not Specified;;M", "homepage": "https://simonguiroy.github.io/;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao;http://sarathchandar.in/", "dblp": ";45/1217;45/8542", "google_scholar": ";https://scholar.google.ca/citations?user=1ScWJOoAAAAJ;https://scholar.google.co.in/citations?user=yxWtZLAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Simon_Guiroy1;~Christopher_Pal1;~Sarath_Chandar1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Polytechnique Montreal;\u00c9cole Polytechnique de Montr\u00e9al", "aff_domain": "mila.umontreal.ca;polymtl.ca;polymtl.ca", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nguiroy2022earlystopping,\ntitle={Early-Stopping for Meta-Learning: Estimating Generalization from the Activation Dynamics},\nauthor={Simon Guiroy and Christopher Pal and Sarath Chandar},\nyear={2022},\nurl={https://openreview.net/forum?id=CD_gGnX9RnD}\n}", "github": "", "project": "", "reviewers": "kBMs;eCkK;BsK1;uucr", "site": "https://openreview.net/forum?id=CD_gGnX9RnD", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;4;4", "correctness": "3;2;3;4", "technical_novelty": "2;1;3;1", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "52;90;102;67", "wc_summary_review": "23;39;40;89", "wc_main_review": "276;830;202;697", "wc_review": "351;959;344;853", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 77.75, 19.472737352514155 ], "wc_summary_review_avg": [ 47.75, 24.752525123712125 ], "wc_main_review_avg": [ 501.25, 267.7138164159631 ], "wc_review_avg": [ 626.75, 281.7644184420737 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zI5thhzpdQ8J:scholar.google.com/&scioq=Early-Stopping+for+Meta-Learning:+Estimating+Generalization+from+the+Activation+Dynamics&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Montreal;Polytechnique Montreal;\u00c9cole Polytechnique de Montr\u00e9al", "aff_unique_dep": "Montreal Institute for Learning Algorithms;;", "aff_unique_url": "https://www.umontreal.ca;https://www.polymtl.ca;https://www.polymtl.ca", "aff_unique_abbr": "UM;PolyMTL;Polytechnique Montr\u00e9al", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Montreal;Montr\u00e9al", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "Canada" }, { "id": "CES-KyrKcTM", "title": "The weighted mean trick \u2013 optimization strategies for robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "We prove that minimizing a weighted mean results in optimizing the higher-order moments of the loss distribution such as the variance, skewness, and kurtosis. By optimizing the higher-order moments, one can tighten the upper bound of the loss mean deviating from the true expectation and improve the robustness against outliers. Such types of optimization problems often lead to non-convex objectives, therefore, we explore the extent to which the proposed weighted mean trick preserves convexity, albeit at times at a decrease in efficiency. Experimental results show that the weighted mean trick exhibits similar performance with other specialized robust loss functions when training on noisy datasets while providing a stronger theoretical background. The proposed weighted mean trick is a simple yet powerful optimization framework that is easy to integrate into existing works.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/d3fb280849c8e3103e6d46c6d18fa39eafa3122c.zip", "author": "Valeriu Balaban;Paul Bogdan", "authorids": "~Valeriu_Balaban1;~Paul_Bogdan1", "gender": "M;M", "homepage": ";https://cps.usc.edu/", "dblp": ";05/5539", "google_scholar": ";Xw_v8-gAAAAJ", "orcid": "0000-0001-8752-1703;0000-0003-2118-0816", "linkedin": ";paul-bogdan-4b098a6/", "or_profile": "~Valeriu_Balaban1;~Paul_Bogdan1", "aff": ";University of Southern California", "aff_domain": ";usc.edu", "position": ";Jack Munushian Early Career Chair associate professor", "bibtex": "@misc{\nbalaban2022the,\ntitle={The weighted mean trick {\\textendash} optimization strategies for robustness},\nauthor={Valeriu Balaban and Paul Bogdan},\nyear={2022},\nurl={https://openreview.net/forum?id=CES-KyrKcTM}\n}", "github": "", "project": "", "reviewers": "ndU8;i96S;Mbr3;fmnG", "site": "https://openreview.net/forum?id=CES-KyrKcTM", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "3;4;3;4", "correctness": "2;4;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "80;101;40;131", "wc_summary_review": "65;33;30;61", "wc_main_review": "482;987;276;65", "wc_review": "627;1121;346;257", "wc_reply_reviewers": "277;121;0;0", "wc_reply_authors": "763;1868;806;269", "reply_reviewers": "2;1;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 88.0, 33.1134413795969 ], "wc_summary_review_avg": [ 47.25, 15.848895860595462 ], "wc_main_review_avg": [ 452.5, 342.00475142898233 ], "wc_review_avg": [ 587.75, 336.7991797792863 ], "wc_reply_reviewers_avg": [ 99.5, 113.76401012622577 ], "wc_reply_authors_avg": [ 926.5, 583.0911163789069 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.39056673294247163, "corr_recommendation_correctness": 0.6673083711820306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:byeV0a28f_AJ:scholar.google.com/&scioq=The+weighted+mean+trick+%E2%80%93+optimization+strategies+for+robustness&hl=en&as_sdt=0,33", "gs_version_total": 0, 
"aff_unique_index": "0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "On Distributed Adaptive Optimization with Gradient Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6171", "id": "CI-xXX9dg9l", "poster": "", "openreview": "https://openreview.net/forum?id=CI-xXX9dg9l", "slides": "https://iclr.cc/virtual/2022/poster/6171", "video": "https://iclr.cc/virtual/2022/poster/6171", "author_site": "Xiaoyun Li, Belhal Karimi, Ping Li", "tldr": "", "abstract": "We study COMP-AMS, a distributed optimization framework based on gradient averaging and adaptive AMSGrad algorithm. Gradient compression with error feedback is applied to reduce the communication cost in the gradient transmission process. Our convergence analysis of COMP-AMS shows that such compressed gradient averaging strategy yields same convergence rate as standard AMSGrad, and also exhibits the linear speedup effect w.r.t. the number of local workers. Compared with recently proposed protocols on distributed adaptive methods, COMP-AMS is simple and convenient. Numerical experiments are conducted to justify the theoretical findings, and demonstrate that the proposed method can achieve same test accuracy as the full-gradient AMSGrad with substantial communication savings. With its simplicity and efficiency, COMP-AMS can serve as a useful distributed training framework for adaptive methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoyun Li;Belhal Karimi;Ping Li", "authorids": "~Xiaoyun_Li1;~Belhal_Karimi1;~Ping_Li3", "gender": "M;M;M", "homepage": "https://lixiaoyun0239.github.io/cv/;http://belhalk.github.io;http://www.stat.rutgers.edu/home/pingli/", "dblp": ";;62/5860-1", "google_scholar": ";https://scholar.google.fr/citations?user=Xh_OIWkAAAAJ;", "orcid": ";;", "linkedin": ";belhal-karimi-2baa71a5/;", "or_profile": "~Xiaoyun_Li1;~Belhal_Karimi1;~Ping_Li3", "aff": "Baidu;Baidu Research;LinkedIn", "aff_domain": "baidu.com;baidu.com;linkedin.com", "position": "Researcher;Postdoc;Engineer", "bibtex": "@inproceedings{\nli2022on,\ntitle={On Distributed Adaptive Optimization with Gradient Compression},\nauthor={Xiaoyun Li and Belhal Karimi and Ping Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CI-xXX9dg9l}\n}", "github": "", "project": "", "reviewers": "7SPd;c4KT;ou6Y", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "3;2;3", "correctness": "4;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "63;62;77", "wc_summary_review": "39;33;29", "wc_main_review": "280;164;134", "wc_review": "382;259;240", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "767;73;186", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 67.33333333333333, 6.847546194724712 ], "wc_summary_review_avg": [ 33.666666666666664, 4.109609335312651 ], "wc_main_review_avg": [ 192.66666666666666, 62.95677529508286 ], "wc_review_avg": [ 293.6666666666667, 62.94088937690312 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 342.0, 304.0405674686631 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10565420961049511001&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=CI-xXX9dg9l", "email": "baidu.com;baidu.com;linkedin.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "Learning to Generalize across Domains on Single Test Samples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5984", "id": "CIaQKbTBwtU", "poster": "", "openreview": "https://openreview.net/forum?id=CIaQKbTBwtU", "slides": "https://iclr.cc/virtual/2022/poster/5984", "video": "https://iclr.cc/virtual/2022/poster/5984", "author_site": "Zehao Xiao, Xiantong Zhen, Ling Shao, Cees G Snoek", "tldr": "", "abstract": "We strive to learn a model from a set of source domains that generalizes well to unseen target domains. The main challenge in such a domain generalization scenario is the unavailability of any target domain data during training, resulting in the learned model not being explicitly adapted to the unseen target domains. We propose learning to generalize across domains on single test samples. We leverage a meta-learning paradigm to learn our model to acquire the ability of adaptation with single samples at training time so as to further adapt itself to each single test sample at test time. We formulate the adaptation to the single test sample as a variational Bayesian inference problem, which incorporates the test sample as a conditional into the generation of model parameters. The adaptation to each test sample requires only one feed-forward computation at test time without any fine-tuning or self-supervised training on additional data from the unseen domains. Extensive ablation studies demonstrate that our model learns the ability to adapt models to each single sample by mimicking domain shifts during training. Further, our model achieves at least comparable -- and often better -- performance than state-of-the-art methods on multiple benchmarks for domain generalization.", "keywords": "domain generalization;single test sample generalization;meta learning;variational inference", "primary_area": "", "supplementary_material": "", "author": "Zehao Xiao;Xiantong Zhen;Ling Shao;Cees G. M. 
Snoek", "authorids": "~Zehao_Xiao1;~Xiantong_Zhen1;~Ling_Shao1;~Cees_G._M._Snoek1", "gender": "M;M;M;M", "homepage": "https://zzzx1224.github.io/;;;http://www.ceessnoek.info", "dblp": "225/5426;78/10651;;s/CeesSnoek", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.ca/citations?user=DnBb3e0AAAAJ;z84rLjoAAAAJ;https://scholar.google.nl/citations?user=0uKdbscAAAAJ", "orcid": ";;;0000-0001-9092-1556", "linkedin": ";;;cgmsnoek/", "or_profile": "~Zehao_Xiao1;~Xiantong_Zhen1;~Ling_Shao1;~Cees_Snoek1", "aff": "University of Amsterdam;Inception Institute of Artificial Intelligence;Terminus Group;University of Amsterdam", "aff_domain": "uva.nl;inceptioniai.org;terminusgroup.com;uva.nl", "position": "PhD student;Senior Scientist;Chief Scientist;Full Professor", "bibtex": "@inproceedings{\nxiao2022learning,\ntitle={Learning to Generalize across Domains on Single Test Samples},\nauthor={Zehao Xiao and Xiantong Zhen and Ling Shao and Cees G. M. Snoek},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CIaQKbTBwtU}\n}", "github": "", "project": "", "reviewers": "xA1m;6n8V;n6z3;dNDg;wnp4", "pdf_size": 0, "recommendation": "5;5;8;8;8", "confidence": "3;3;4;4;4", "correctness": "3;4;3;3;4", "technical_novelty": "3;3;2;3;3", "empirical_novelty": "3;2;2;2;3", "wc_summary_paper": "65;53;91;75;277", "wc_summary_review": "359;39;21;109;224", "wc_main_review": "244;239;818;253;602", "wc_review": "668;331;930;437;1103", "wc_reply_reviewers": "390;0;373;138;606", "wc_reply_authors": "691;712;556;296;272", "reply_reviewers": "1;0;1;1;2", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 6.8, 1.469693845669907 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 112.2, 83.33642660925653 ], "wc_summary_review_avg": [ 150.4, 126.33226032965611 ], "wc_main_review_avg": [ 431.2, 237.70856105744278 ], "wc_review_avg": [ 693.8, 290.3345656307564 ], "wc_reply_reviewers_avg": [ 301.4, 211.32874863586355 ], "wc_reply_authors_avg": [ 505.4, 188.68767845304578 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.16666666666666669, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10799367073706985191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=CIaQKbTBwtU", "email": "uva.nl;inceptioniai.org;terminusgroup.com;uva.nl", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Amsterdam;Inception Institute of Artificial Intelligence;Terminus Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uva.nl;https://www.inceptioniai.org;", "aff_unique_abbr": "UvA;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Netherlands;United Arab Emirates;" }, { "title": "Connectome-constrained Latent Variable Model of Whole-Brain Neural Activity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6591", "id": "CJzi3dRlJE-", "poster": "", "openreview": "https://openreview.net/forum?id=CJzi3dRlJE-", "slides": "https://iclr.cc/virtual/2022/poster/6591", "video": 
"https://iclr.cc/virtual/2022/poster/6591", "author_site": "Lu Mi, Richard Xu, Sridhama Prakhya, Albert Lin, Nir Shavit, Aravinthan Samuel, Srinivas C Turaga", "tldr": "", "abstract": "The availability of both anatomical connectivity and brain-wide neural activity measurements in C. elegans make the worm a promising system for learning detailed, mechanistic models of an entire nervous system in a data-driven way. However, one faces several challenges when constructing such a model. We often do not have direct experimental access to important modeling details such as single-neuron dynamics and the signs and strengths of the synaptic connectivity. Further, neural activity can only be measured in a subset of neurons, often indirectly via calcium imaging, and significant trial-to-trial variability has been observed. To address these challenges, we introduce a connectome-constrained latent variable model (CC-LVM) of the unobserved voltage dynamics of the entire C. elegans nervous system and the observed calcium signals. We used the framework of variational autoencoders to fit parameters of the mechanistic simulation constituting the generative model of the LVM to calcium imaging observations. A variational approximate posterior distribution over latent voltage traces for all neurons is efficiently inferred using an inference network, and constrained by a prior distribution given by the biophysical simulation of neural dynamics. We applied this model to an experimental whole-brain dataset, and found that connectomic constraints enable our LVM to predict the activity of neurons whose activity were withheld significantly better than models unconstrained by a connectome. We explored models with different degrees of biophysical detail, and found that models with realistic conductance-based synapses provide markedly better predictions than current-based synapses for this system.", "keywords": "connectome;latent-variable model;variational autoencoder;biophysics;whole-brain;neural activity;calcium imaging;caenorhabditis elegans;voltage;generative model;inference network", "primary_area": "", "supplementary_material": "", "author": "Lu Mi;Richard Xu;Sridhama Prakhya;Albert Lin;Nir Shavit;Aravinthan Samuel;Srinivas C Turaga", "authorids": "~Lu_Mi1;~Richard_Xu2;~Sridhama_Prakhya1;~Albert_Lin2;~Nir_Shavit1;~Aravinthan_Samuel1;~Srinivas_C_Turaga1", "gender": "F;M;M;M;M;M;M", "homepage": "https://lumimim.github.io;https://sridhama.com;;http://people.csail.mit.edu/shanir/;https://scholar.harvard.edu/aravisamuel;https://www.janelia.org/lab/turaga-lab;", "dblp": "185/3258;https://dblp.uni-trier.de/pid/242/9030;;s/NirShavit;;91/747;", "google_scholar": "vokCG-MAAAAJ;f1X0cXMAAAAJ;https://scholar.google.com/citations?view_op=list_works;;;V_NdI3sAAAAJ;", "orcid": ";;0000-0002-4541-5889;;;0000-0003-3247-6487;", "linkedin": "lu-mi-698899172/;;;;;srini-turaga-4934923/;richard-xu-8100ba20b/", "or_profile": "~Lu_Mi1;~Sridhama_Prakhya1;~Albert_Lin2;~Nir_Shavit1;~Aravinthan_Samuel1;~Srinivas_C_Turaga1;~Richard_Fan_Xu1", "aff": "Massachusetts Institute of Technology;Research, Google;Princeton University;Massachusetts Institute of Technology;Harvard University;HHMI Janelia Research Campus;Sanofi", "aff_domain": "mit.edu;research.google.com;princeton.edu;mit.edu;harvard.edu;janelia.hhmi.org;sanofi.com", "position": "PhD student;Researcher;Postdoc;;Full Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\nmi2022connectomeconstrained,\ntitle={Connectome-constrained Latent Variable Model of Whole-Brain Neural 
Activity},\nauthor={Lu Mi and Richard Xu and Sridhama Prakhya and Albert Lin and Nir Shavit and Aravinthan Samuel and Srinivas C Turaga},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CJzi3dRlJE-}\n}", "github": "", "project": "", "reviewers": "aiwW;iieb;UhCh;5GJS", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "41;156;66;160", "wc_summary_review": "34;141;84;71", "wc_main_review": "214;257;396;671", "wc_review": "289;554;546;902", "wc_reply_reviewers": "0;0;166;0", "wc_reply_authors": "308;441;684;985", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 105.75, 53.0112016464445 ], "wc_summary_review_avg": [ 82.5, 38.43501008195523 ], "wc_main_review_avg": [ 384.5, 178.5644141479483 ], "wc_review_avg": [ 572.75, 217.93734764835511 ], "wc_reply_reviewers_avg": [ 41.5, 71.88010851410841 ], "wc_reply_authors_avg": [ 604.5, 257.7523035784549 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.49374193110101877, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8263950356253362857&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=CJzi3dRlJE-", "email": "mit.edu;research.google.com;princeton.edu;mit.edu;harvard.edu;janelia.hhmi.org;sanofi.com", "author_num": 7, "aff_unique_index": "0;1;2;0;3;4;5", "aff_unique_norm": "Massachusetts Institute of Technology;Google;Princeton University;Harvard University;HHMI Janelia Research Campus;Sanofi", "aff_unique_dep": ";Google Research;;;;", "aff_unique_url": "https://web.mit.edu;https://research.google;https://www.princeton.edu;https://www.harvard.edu;https://www.janelia.org;https://www.sanofi.com", "aff_unique_abbr": "MIT;Google;Princeton;Harvard;HHMI Janelia;Sanofi", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Janelia", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "United States;France" }, { "title": "Learning State Representations via Retracing in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7072", "id": "CLpxpXqqBV", "poster": "", "openreview": "https://openreview.net/forum?id=CLpxpXqqBV", "slides": "https://iclr.cc/virtual/2022/poster/7072", "video": "https://iclr.cc/virtual/2022/poster/7072", "author_site": "Changmin Yu, Dong Li, Jianye HAO, Jun Wang, Neil Burgess", "tldr": "", "abstract": "We propose learning via retracing, a novel self-supervised approach for learning the state representation (and the associated dynamics model) for reinforcement learning tasks. In addition to the predictive (reconstruction) supervision in the forward direction, we propose to include \"retraced\" transitions for representation/model learning, by enforcing the cycle-consistency constraint between the original and retraced states, hence improving the sample efficiency of learning.
Moreover, learning via retracing explicitly propagates information about future transitions backward for inferring previous states, thus facilitating stronger representation learning for the downstream reinforcement learning tasks. We introduce Cycle-Consistency World Model (CCWM), a concrete model-based instantiation of learning via retracing. Additionally, we propose a novel adaptive \"truncation\" mechanism for counteracting the negative impacts brought by \"irreversible\" transitions such that learning via retracing can be maximally effective. Through extensive empirical studies on visual-based continuous control benchmarks, we demonstrate that CCWM achieves state-of-the-art performance in terms of sample efficiency and asymptotic performance, whilst exhibiting behaviours that are indicative of stronger representation learning. ", "keywords": "Representation learning;model-based reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/e39030f397b3ff1339770b1966f14b3366dd2a78.zip", "author": "Changmin Yu;Dong Li;Jianye HAO;Jun Wang;Neil Burgess", "authorids": "~Changmin_Yu1;~Dong_Li10;~Jianye_HAO1;~Jun_Wang2;~Neil_Burgess1", "gender": "M;M;M;M;M", "homepage": "https://changmin-yu.github.io;;http://www.icdai.org/jianye.html;http://www0.cs.ucl.ac.uk/staff/jun.wang/;https://www.ucl.ac.uk/icn/people/neil-burgess", "dblp": "266/9733;47/4826-16;21/7664.html;w/JunWang12;54/4203", "google_scholar": ";;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;", "orcid": ";;0000-0002-0422-8235;;0000-0003-0646-6584", "linkedin": ";;;;", "or_profile": "~Changmin_Yu1;~Dong_Li10;~Jianye_HAO1;~Jun_Wang2;~Neil_Burgess1", "aff": "University College London;Huawei Technologies Ltd.;Tianjin University;University College London;University College London", "aff_domain": "ucl.ac.uk;huawei.com;tju.edu.cn;ucl.ac.uk;ucl.ac.uk", "position": "PhD student;Principal Researcher;Associate Professor;Professor;Full Professor", "bibtex": "@inproceedings{\nyu2022learning,\ntitle={Learning State Representations via Retracing in Reinforcement Learning},\nauthor={Changmin Yu and Dong Li and Jianye HAO and Jun Wang and Neil Burgess},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CLpxpXqqBV}\n}", "github": "", "project": "", "reviewers": "NEVM;dZvG;HLp2;VSAG", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;3;3", "correctness": "2;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "96;58;105;70", "wc_summary_review": "33;31;48;45", "wc_main_review": "312;159;179;407", "wc_review": "441;248;332;522", "wc_reply_reviewers": "175;0;0;9", "wc_reply_authors": "2100;917;1188;834", "reply_reviewers": "1;0;0;1", "reply_authors": "3;2;2;3", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.25, 19.00493356999703 ], "wc_summary_review_avg": [ 39.25, 7.361215932167728 ], "wc_main_review_avg": [ 264.25, 101.2456789201396 ], "wc_review_avg": [ 385.75, 104.26019134837611 ], "wc_reply_reviewers_avg": [ 46.0, 74.56876021498547 ], "wc_reply_authors_avg": [ 1259.75, 502.47108125741926 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8006407690254357,
"corr_recommendation_correctness": 0.5883484054145521, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5497480692580123615&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=CLpxpXqqBV", "email": "ucl.ac.uk;huawei.com;tju.edu.cn;ucl.ac.uk;ucl.ac.uk", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "University College London;Huawei;Tianjin University", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.huawei.com;http://www.tju.edu.cn", "aff_unique_abbr": "UCL;Huawei;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", "aff_country_unique": "United Kingdom;China" }, { "id": "CNY9h3uyfiO", "title": "Reward Shifting for Optimistic Exploration and Conservative Exploitation", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we study the simple yet universally applicable case of reward shaping, the linear transformation, in value-based Deep Reinforcement Learning. We show that reward shifting, as the simplest linear reward transformation, is equivalent to changing initialization of the $Q$-function in function approximation. Based on such an equivalence, we bring the key insight that a positive reward shifting leads to conservative exploitation, while a negative reward shifting leads to curiosity-driven exploration. In this case, a conservative exploitation improves offline RL value estimation, and the optimistic value estimation benefits the exploration of online RL. We verify our insight on a range of tasks: (1) In offline RL, the conservative exploitation leads to improved learning performance based on off-the-shelf algorithms; (2) In online continuous control, multiple value functions with different shifting constants can be used to trade-off between exploration and exploitation thus improving learning efficiency; (3) In online RL with discrete action space, a negative reward shifting brings an improvement over the previous curiosity-based exploration method.", "keywords": "Reward Shift;Reinforcement Learning;Batch RL;Offline RL;Online RL;Curiosity-Driven Method", "primary_area": "", "supplementary_material": "/attachment/3debede7b8c5647bea2c83bf0b31f04422d5e337.zip", "author": "Hao Sun;Lei Han;Jian Guo;Bolei Zhou", "authorids": "~Hao_Sun3;~Lei_Han1;~Jian_Guo2;~Bolei_Zhou5", "gender": "M;M;M;M", "homepage": "https://www.leihan.org;https://idea.edu.cn/person/guojian/;https://boleizhou.github.io/;https://holarissun.github.io", "dblp": "75/2307-1;96/2596-2;46/8066;SunLLZL19", "google_scholar": "Tz4_zi8AAAAJ;;9D4aG8AAAAAJ;7ZNoHJkAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Lei_Han1;~Jian_Guo2;~Bolei_Zhou5;~Hao_Sun1", "aff": "Tencent Robotics X;International Digital Economy Academy, International Digital Economy Academy;University of California, Los Angeles;University of Cambridge", "aff_domain": "tencent.com;idea.edu.cn;ucla.edu;cam.ac.uk", "position": "Principal Researcher;Researcher;Assistant Professor;PhD student", "bibtex": "@misc{\nsun2022reward,\ntitle={Reward Shifting for Optimistic Exploration and Conservative Exploitation},\nauthor={Hao Sun and Lei Han and Jian Guo and Bolei Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=CNY9h3uyfiO}\n}", "github": "", "project": "", "reviewers": "fFaW;4ENc;dgHr;Mgm9", "site": "https://openreview.net/forum?id=CNY9h3uyfiO", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;3;4", "correctness": 
"3;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "60;171;54;102", "wc_summary_review": "57;114;124;105", "wc_main_review": "437;257;365;684", "wc_review": "554;542;543;891", "wc_reply_reviewers": "0;0;60;192", "wc_reply_authors": "493;425;838;805", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.75, 46.687123492457744 ], "wc_summary_review_avg": [ 100.0, 25.71964229922337 ], "wc_main_review_avg": [ 435.75, 156.99263517757768 ], "wc_review_avg": [ 632.5, 149.31928877408973 ], "wc_reply_reviewers_avg": [ 63.0, 78.40280607223188 ], "wc_reply_authors_avg": [ 640.25, 183.20940887410777 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.19245008972987526, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:seMep6ZtOC0J:scholar.google.com/&scioq=Reward+Shifting+for+Optimistic+Exploration+and+Conservative+Exploitation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Tencent;International Digital Economy Academy;University of California, Los Angeles;University of Cambridge", "aff_unique_dep": "Tencent Robotics X;;;", "aff_unique_url": "https://www.tencent.com;;https://www.ucla.edu;https://www.cam.ac.uk", "aff_unique_abbr": "Tencent Robotics X;;UCLA;Cambridge", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Angeles;Cambridge", "aff_country_unique_index": "0;2;3", "aff_country_unique": "China;;United States;United Kingdom" }, { "id": "CO0ZuH5vaMu", "title": "Using Document Similarity Methods to create Parallel Datasets for Code Translation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Translating source code from one programming language to another is a critical, time-consuming task in modernizing legacy applications and codebases. Recent work in this space has drawn inspiration from the software naturalness hypothesis by applying natural language processing techniques towards automating the code translation task. However, due to the paucity of parallel data in this domain, supervised techniques have only been applied to a limited set of popular programming languages. To bypass this limitation, unsupervised neural machine translation techniques have been proposed to learn code translation using only monolingual corpora. In this work, we propose to use document similarity methods to create noisy parallel datasets of code, thus enabling supervised techniques to be applied for automated code translation without having to rely on the availability or expensive curation of parallel code datasets. We explore the noise tolerance of models trained on such automatically-created datasets and show that these models perform comparably to models trained on ground truth for reasonable levels of noise. 
Finally, we exhibit the practical utility of the proposed method by creating parallel datasets for languages beyond the ones explored in prior work, thus expanding the set of programming languages for automated code translation.", "keywords": "code translation;machine translation;document similarity", "primary_area": "", "supplementary_material": "/attachment/93f0048a54c39fdaf553ed1f9740f03cc9f18f38.zip", "author": "Mayank Agarwal;Kartik Talamadupula;Fernando Martinez;Stephanie Houde;Michael Muller;John Richards;Steven I Ross;Justin D. Weisz", "authorids": "~Mayank_Agarwal1;~Kartik_Talamadupula1;~Fernando_Martinez2;~Stephanie_Houde1;~Michael_Muller1;ajtr@us.ibm.com;~Steven_I_Ross1;~Justin_D._Weisz1", "gender": "M;M;M;F;M;;M;M", "homepage": ";http://www.ktalamad.com/;;;https://researcher.watson.ibm.com/researcher/view.php?person=us-michael_muller;;https://researcher.watson.ibm.com/researcher/view.php?person=us-steven_ross;", "dblp": "38/5693;https://dblp.org/pers/hd/t/Talamadupula:Kartik;;;;;;02/133", "google_scholar": ";OGEfX0UAAAAJ;;;G3Rfeg8AAAAJ;;;", "orcid": ";;;;;;;0000-0003-2228-2398", "linkedin": "https://linkedin.com/in/mayank312/;ktalamad/;fernando-martinez-9170582a/;stephanie-houde-82b73a3/;;;;", "or_profile": "~Mayank_Agarwal1;~Kartik_Talamadupula1;~Fernando_Martinez2;~Stephanie_Houde1;~Michael_Muller1;ajtr@us.ibm.com;~Steven_I_Ross1;~Justin_D._Weisz1", "aff": "International Business Machines;IBM, International Business Machines;;International Business Machines;;;International Business Machines;International Business Machines", "aff_domain": "ibm.com;us.ibm.com;;ibm.com;;;ibm.com;ibm.com", "position": "Research Engineeer;Senior Research Scientist;;UX Design Lead;;;Researcher;Researcher", "bibtex": "@misc{\nagarwal2022using,\ntitle={Using Document Similarity Methods to create Parallel Datasets for Code Translation},\nauthor={Mayank Agarwal and Kartik Talamadupula and Fernando Martinez and Stephanie Houde and Michael Muller and John Richards and Steven I Ross and Justin D. 
Weisz},\nyear={2022},\nurl={https://openreview.net/forum?id=CO0ZuH5vaMu}\n}", "github": "", "project": "", "reviewers": "3acE;rSze;xtd8;nRDP", "site": "https://openreview.net/forum?id=CO0ZuH5vaMu", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;5", "correctness": "4;2;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "28;135;92;42", "wc_summary_review": "56;39;10;39", "wc_main_review": "122;292;233;161", "wc_review": "206;466;335;242", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "774;1223;578;1377", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.25, 42.381452311123084 ], "wc_summary_review_avg": [ 36.0, 16.537835408541227 ], "wc_main_review_avg": [ 202.0, 65.4637304161625 ], "wc_review_avg": [ 312.25, 100.47481027600898 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 988.0, 324.2074952865834 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14982305784357820711&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "International Business Machines Corporation;International Business Machines", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.ibm.com", "aff_unique_abbr": "IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "CQzlxFVcmw1", "title": "Message Function Search for Hyper-relational Knowledge Graph", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, the hyper-relational knowledge graph (HKG) has attracted much attention due to its widespread existence and potential applications. Pioneering works have adapted powerful graph neural networks (GNNs) to embed HKGs by proposing domain-specific message functions. These message functions for HKG embedding are utilized to learn relational representations and capture the correlation between entities and relations of HKGs. However, these works often manually design and fix structures and operators of message functions, which makes it difficult for them to handle complex and diverse relational patterns in various HKGs (i.e., data patterns). To overcome these shortcomings, we plan to develop a method to dynamically search suitable message functions that can adapt to patterns of the given HKG. Unfortunately, it is not trivial to design an expressive search space and an efficient search algorithm to make the search effective and efficient. In this paper, we first unify a search space of message functions that enables both structures and operators to be searchable. In particular, the classic KG/HKG models and message functions of existing GNNs can be instantiated as special cases in the proposed search space. Then, we design an efficient search algorithm to search the message function and other GNN components for any given HKG. 
Through empirical study, we show that the searched message functions are data-dependent, and can achieve leading performance in link/relation prediction tasks on benchmark data sets.", "keywords": "Graph Neural Network;Hyper-relational Knowledge Graph;Knowledge Base Embedding", "primary_area": "", "supplementary_material": "", "author": "Shimin Di;Lei Chen", "authorids": "~Shimin_Di1;~Lei_Chen7", "gender": "M;M", "homepage": "https://sdiaa.github.io;http://www.cs.ust.hk/~leichen/", "dblp": "223/3142;c/LeiChen0002", "google_scholar": "zLAZJLMAAAAJ;gtglwgYAAAAJ", "orcid": "0000-0002-7394-0082;0000-0002-8257-5806", "linkedin": ";", "or_profile": "~Shimin_Di1;~Lei_Chen7", "aff": "Hong Kong University of Science and Technology;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;hkust.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\ndi2022message,\ntitle={Message Function Search for Hyper-relational Knowledge Graph},\nauthor={Shimin Di and Lei Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=CQzlxFVcmw1}\n}", "github": "", "project": "", "reviewers": "y4dP;gMZD;WuXj", "site": "https://openreview.net/forum?id=CQzlxFVcmw1", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;3;3", "correctness": "3;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "68;66;54", "wc_summary_review": "63;41;23", "wc_main_review": "308;130;195", "wc_review": "439;237;272", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1407;460;719", "reply_reviewers": "0;0;0", "reply_authors": "3;1;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 62.666666666666664, 6.182412330330469 ], "wc_summary_review_avg": [ 42.333333333333336, 16.35712552851373 ], "wc_main_review_avg": [ 211.0, 73.5436378394941 ], "wc_review_avg": [ 316.0, 88.14004008773009 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 862.0, 399.6156486759079 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12123359088177631582&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Geometric Transformers for Protein Interface Contact Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6202", "id": "CS4463zx6Hi", "poster": "", "openreview": "https://openreview.net/forum?id=CS4463zx6Hi", "slides": "https://iclr.cc/virtual/2022/poster/6202", "video": "https://iclr.cc/virtual/2022/poster/6202", "author_site": "Alex Morehead, Chen Chen, Jianlin Cheng", "tldr": "", "abstract": "Computational methods for predicting the interface contacts between proteins are highly sought after for drug discovery as they can significantly improve the accuracy of 
alternative approaches, such as protein-protein docking, protein function analysis tools, and other computational methods for protein bioinformatics. In this work, we present the Geometric Transformer, a novel geometry-evolving graph transformer for rotation and translation-invariant protein interface contact prediction, packaged within DeepInteract, an end-to-end prediction pipeline. DeepInteract predicts partner-specific protein interface contacts (i.e., inter-protein residue-residue contacts) given the 3D tertiary structures of two proteins as input. In rigorous benchmarks, DeepInteract, on challenging protein complex targets from the 13th and 14th CASP-CAPRI experiments as well as Docking Benchmark 5, achieves 14% and 1.1% top L/5 precision (L: length of a protein unit in a complex), respectively. In doing so, DeepInteract, with the Geometric Transformer as its graph-based backbone, outperforms existing methods for interface contact prediction in addition to other graph-based neural network backbones compatible with DeepInteract, thereby validating the effectiveness of the Geometric Transformer for learning rich relational-geometric features for downstream tasks on 3D protein structures.", "keywords": "Geometric Deep Learning;Graph Transformers;Protein Bioinformatics;Invariance", "primary_area": "", "supplementary_material": "/attachment/aa045d03845c89621853ad3cf7ca962d8d47ad60.zip", "author": "Alex Morehead;Chen Chen;Jianlin Cheng", "authorids": "~Alex_Morehead1;~Chen_Chen31;~Jianlin_Cheng1", "gender": "M;M;M", "homepage": "https://amorehead.github.io/;;http://calla.rnet.missouri.edu/cheng/", "dblp": "259/6116;;11/5762.html", "google_scholar": "IYHJU5EAAAAJ;;t9MY6lwAAAAJ", "orcid": "0000-0002-0586-6191;0000-0002-2973-461X;0000-0003-0305-2853", "linkedin": "alexmorehead;;jianlin-cheng-26b3135/", "or_profile": "~Alex_Morehead1;~Chen_Chen31;~Jianlin_Cheng1", "aff": "University of Missouri, Columbia;;University of Missouri - Columbia", "aff_domain": "missouri.edu;;missouri.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nmorehead2022geometric,\ntitle={Geometric Transformers for Protein Interface Contact Prediction},\nauthor={Alex Morehead and Chen Chen and Jianlin Cheng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CS4463zx6Hi}\n}", "github": "", "project": "", "reviewers": "Qnz5;rPiF;HqpY;uZ6F", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "2;5;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "65;72;37;48", "wc_summary_review": "26;17;33;258", "wc_main_review": "208;288;154;385", "wc_review": "299;377;224;691", "wc_reply_reviewers": "91;18;0;967", "wc_reply_authors": "1423;937;504;3864", "reply_reviewers": "1;1;0;3", "reply_authors": "4;3;1;7", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 55.5, 13.793114224133722 ], "wc_summary_review_avg": [ 83.5, 100.90713552569015 ], "wc_main_review_avg": [ 258.75, 87.09585236967372 ], "wc_review_avg": [ 397.75, 177.74050607557075 ], "wc_reply_reviewers_avg": [ 269.0, 404.4286092748632 ], "wc_reply_authors_avg": [ 1682.0, 1301.0489998458936 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 3.75, 2.165063509461097 ], "replies_avg": [ 28, 0 
], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11431746960941491092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=CS4463zx6Hi", "email": "missouri.edu;;missouri.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Missouri", "aff_unique_dep": "", "aff_unique_url": "https://www.missouri.edu", "aff_unique_abbr": "MU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Columbia", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Recursive Disentanglement Network", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6663", "id": "CSfcOznpDY", "poster": "", "openreview": "https://openreview.net/forum?id=CSfcOznpDY", "slides": "https://iclr.cc/virtual/2022/poster/6663", "video": "https://iclr.cc/virtual/2022/poster/6663", "author_site": "Yixuan Chen, Yubin Shi, Dongsheng Li, Yujiang Wang, Mingzhi Dong, Yingying Zhao, Robert Dick, Qin Lv, Fan Yang, Li Shang", "tldr": "", "abstract": "Disentangled feature representation is essential for data-efficient learning. The feature space of deep models is inherently compositional. Existing $\\beta$-VAE-based methods, which only apply disentanglement regularization to the resulting embedding space of deep models, cannot effectively regularize such compositional feature space, resulting in unsatisfactory disentangled results. In this paper, we formulate the compositional disentanglement learning problem from an information-theoretic perspective and propose a recursive disentanglement network (RecurD) that propagates regulatory inductive bias recursively across the compositional feature space during disentangled representation learning. \nExperimental studies demonstrate that RecurD outperforms $\\beta$-VAE and several of its state-of-the-art variants on disentangled representation learning and enables more data-efficient downstream machine learning tasks.", "keywords": "disentanglement;representation learning;compositional", "primary_area": "", "supplementary_material": "", "author": "Yixuan Chen;Yubin Shi;Dongsheng Li;Yujiang Wang;Mingzhi Dong;Yingying Zhao;Robert P. 
Dick;Qin Lv;Fan Yang;Li Shang", "authorids": "~Yixuan_Chen1;~Yubin_Shi1;~Dongsheng_Li2;~Yujiang_Wang1;~Mingzhi_Dong7;~Yingying_Zhao1;~Robert_P._Dick1;~Qin_Lv1;~Fan_Yang31;~Li_Shang3", "gender": "F;M;M;M;;F;M;F;M;", "homepage": ";;http://recmind.cn;;;;http://robertdick.org/;https://home.cs.colorado.edu/~lv/;https://ephonic.github.io;https://cscw.fudan.edu.cn/lishang/list.htm", "dblp": "30/7103-3;221/2003;254/0830-2.html;125/0429-1;;;84/523.html;11/808;;", "google_scholar": "cmdWHrIAAAAJ;IyLkK_kAAAAJ;VNg5rA8AAAAJ;https://scholar.google.co.uk/citations?user=3xxDPJUAAAAJ;;;;dTkWR0MAAAAJ;;AnBUn0QAAAAJ", "orcid": ";;0000-0003-3103-8442;;;0000-0001-5902-1306;;0000-0002-9437-1376;;", "linkedin": ";;;;;;;;;", "or_profile": "~Yixuan_Chen1;~Yubin_Shi1;~Dongsheng_Li2;~Yujiang_Wang1;~Mingzhi_Dong7;~Yingying_Zhao1;~Robert_P._Dick1;~Qin_Lv1;~Fan_Yang31;~Li_Shang3", "aff": "Fudan University;Fudan University;Microsoft Research Asia;University of Oxford;;Fudan University;University of Michigan;University of Colorado at Boulder;Fudan University;Fudan University", "aff_domain": "fudan.edu;fudan.edu.cn;microsoft.com;ox.ac.uk;;fudan.edu.cn;umich.edu;colorado.edu;fudan.edu.cn;fudan.edu.cn", "position": "PhD student;MS student;Principal Researcher;Postdoc;;Postdoc;Full Professor;Full Professor;Professor;Full Professor", "bibtex": "@inproceedings{\nchen2022recursive,\ntitle={Recursive Disentanglement Network},\nauthor={Yixuan Chen and Yubin Shi and Dongsheng Li and Yujiang Wang and Mingzhi Dong and Yingying Zhao and Robert P. Dick and Qin Lv and Fan Yang and Li Shang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CSfcOznpDY}\n}", "github": "", "project": "", "reviewers": "p3Zz;1v1o;VLG5;yTV3", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "5;4;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "31;104;42;130", "wc_summary_review": "16;111;26;90", "wc_main_review": "593;228;241;360", "wc_review": "640;443;309;580", "wc_reply_reviewers": "87;438;9;0", "wc_reply_authors": "1831;1884;658;1105", "reply_reviewers": "4;2;1;0", "reply_authors": "7;5;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 76.75, 41.46911501346514 ], "wc_summary_review_avg": [ 60.75, 40.59171713539598 ], "wc_main_review_avg": [ 355.5, 146.45221063541513 ], "wc_review_avg": [ 493.0, 127.99804686009861 ], "wc_reply_reviewers_avg": [ 133.5, 179.0286289954766 ], "wc_reply_authors_avg": [ 1369.5, 513.2945061073614 ], "reply_reviewers_avg": [ 1.75, 1.479019945774904 ], "reply_authors_avg": [ 3.75, 2.384848003542364 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2911472738585722871&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=CSfcOznpDY", "email": "fudan.edu;fudan.edu.cn;microsoft.com;ox.ac.uk;;fudan.edu.cn;umich.edu;colorado.edu;fudan.edu.cn;fudan.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;2;0;3;4;0;0", "aff_unique_norm": "Fudan University;Microsoft;University of Oxford;University of Michigan;University of Colorado", "aff_unique_dep": ";Research;;;", "aff_unique_url": 
"https://www.fudan.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.ox.ac.uk;https://www.umich.edu;https://www.colorado.edu", "aff_unique_abbr": "Fudan;MSR Asia;Oxford;UM;CU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Asia;Boulder", "aff_country_unique_index": "0;0;0;1;0;2;2;0;0", "aff_country_unique": "China;United Kingdom;United States" }, { "id": "CSw5zgTjXyb", "title": "Learning to Collaborate", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we focus on effective learning over a collaborative research network involving multiple clients. Each client has its own sample population which may not be shared with other clients due to privacy concerns. The goal is to learn a model for each client, which behaves better than the one learned from its own data, through secure collaborations with other clients in the network. Due to the discrepancies of the sample distributions across different clients, it is not necessarily that collaborating with everyone will lead to the best local models. We propose a learning to collaborate framework, where each client can choose to collaborate with certain members in the network to achieve a ``collaboration equilibrium\", where smaller collaboration coalitions are formed within the network so that each client can obtain the model with the best utility. We propose the concept of benefit graph which describes how each client can benefit from collaborating with other clients and develop a Pareto optimization approach to obtain it. Finally the collaboration coalitions can be derived from it based on graph operations. Our framework provides a new way of setting up collaborations in a research network. Experiments on both synthetic and real world data sets are provided to demonstrate the effectiveness of our method.", "keywords": "collaboration equilibrium", "primary_area": "", "supplementary_material": "", "author": "Sen Cui;Jian Liang;Weishen Pan;Kun Chen;Changshui Zhang;Fei Wang", "authorids": "~Sen_Cui1;~Jian_Liang3;~Weishen_Pan1;~Kun_Chen1;~Changshui_Zhang2;~Fei_Wang3", "gender": "M;M;M;M;M;", "homepage": ";;https://scholar.google.com/citations?user=PtTBMhUAAAAJ;https://kun-chen.uconn.edu;http://bigeye.au.tsinghua.edu.cn/english/Introduction.html;https://wcm-wanglab.github.io/index.html", "dblp": "267/5483;19/2208;161/2032;;z/ChangshuiZhang;52/3194-9.html", "google_scholar": "UzQuG1UAAAAJ;mrunnpoAAAAJ;PtTBMhUAAAAJ;;GL9M37YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0009-0006-0431-5642;;;", "linkedin": ";;;;;fei-wang-50682425/", "or_profile": "~Sen_Cui1;~Jian_Liang3;~Weishen_Pan1;~Kun_Chen1;~Changshui_Zhang2;~Fei_Wang3", "aff": "Tsinghua University;Alibaba Group;Tsinghua University;University of Connecticut;Tsinghua University;Cornell University", "aff_domain": "tsinghua.edu.cn;alibaba-inc.com;tsinghua.edu.cn;uconn.edu;mail.tsinghua.edu.cn;cornell.edu", "position": "PhD student;Senior Algorithm Engineer;PhD student;Associate Professor;Full Professor;Full Professor", "bibtex": "@misc{\ncui2022learning,\ntitle={Learning to Collaborate},\nauthor={Sen Cui and Jian Liang and Weishen Pan and Kun Chen and Changshui Zhang and Fei Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=CSw5zgTjXyb}\n}", "github": "", "project": "", "reviewers": "yALe;Mxha;Psa6;eyZ6", "site": "https://openreview.net/forum?id=CSw5zgTjXyb", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "2;3;4;2", "correctness": "4;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", 
"wc_summary_paper": "41;70;188;244", "wc_summary_review": "55;31;63;70", "wc_main_review": "177;302;431;345", "wc_review": "273;403;682;659", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "751;1180;1749;462", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;4;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 135.75, 83.28978028545879 ], "wc_summary_review_avg": [ 54.75, 14.703315952532613 ], "wc_main_review_avg": [ 313.75, 91.60069595805481 ], "wc_review_avg": [ 504.25, 172.67798788496466 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1035.5, 484.7177013479083 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.1266600992762247, "corr_recommendation_correctness": 0.08084520834544431, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15379684258966074950&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2;0;3", "aff_unique_norm": "Tsinghua University;Alibaba Group;University of Connecticut;Cornell University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.alibaba.com;https://www.uconn.edu;https://www.cornell.edu", "aff_unique_abbr": "THU;Alibaba;UConn;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "CTOJRqLMsl", "title": "On the Convergence of Nonconvex Continual Learning with Adaptive Learning Rate", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the objectives of continual learning is to prevent catastrophic forgetting in learning multiple tasks sequentially.\nThe memory based continual learning stores a small subset of the data for previous tasks and applies various methods such as quadratic programming and sample selection.\nSome memory-based approaches are formulated as a constrained optimization problem and rephrase constraints on the objective for memory as the inequalities on gradients.\nHowever, there have been little theoretical results on the convergence of continual learning.\nIn this paper, we propose a theoretical convergence analysis of memory-based continual learning with stochastic gradient descent.\nThe proposed method called nonconvex continual learning (NCCL) adapts the learning rates of both previous and current tasks with the gradients.\nThe proposed method can achieve the same convergence rate as the SGD method for a single task when the catastrophic forgetting term which we define in the paper is suppressed at each iteration.\nIt is also shown that memory-based approaches inherently overfit to memory, which degrades the performance on previously learned tasks. Experiments show that the proposed algorithm improves the performance of continual learning over existing methods for several image classification tasks. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/3717d5151c1f87a45a329acc516499769688fc40.zip", "author": "Seungyub Han;Yeongmo Kim;Jungwoo Lee", "authorids": "~Seungyub_Han1;~Yeongmo_Kim1;~Jungwoo_Lee1", "gender": "M;;M", "homepage": ";https://cml.snu.ac.kr;https://cml.snu.ac.kr", "dblp": "347/8731;;34/516-1", "google_scholar": "ot1-XNAAAAAJ;;j98IWfoAAAAJ", "orcid": "0009-0001-8704-8968;;0000-0002-6804-980X", "linkedin": ";;", "or_profile": "~Seungyub_Han1;~Yeongmo_Kim1;~Jungwoo_Lee1", "aff": "Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nhan2022on,\ntitle={On the Convergence of Nonconvex Continual Learning with Adaptive Learning Rate},\nauthor={Seungyub Han and Yeongmo Kim and Jungwoo Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=CTOJRqLMsl}\n}", "github": "", "project": "", "reviewers": "UMAT;TD2U;jqxZ;qew6", "site": "https://openreview.net/forum?id=CTOJRqLMsl", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "71;181;68;36", "wc_summary_review": "77;46;182;19", "wc_main_review": "228;393;225;84", "wc_review": "376;620;475;139", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "391;786;561;88", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.0, 54.85890994177701 ], "wc_summary_review_avg": [ 81.0, 61.818282085480185 ], "wc_main_review_avg": [ 232.5, 109.4177773490213 ], "wc_review_avg": [ 402.5, 175.14065775827154 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 456.5, 254.74153567881308 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7276068751089989, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:voYI6i1OYuMJ:scholar.google.com/&scioq=On+the+Convergence+of+Nonconvex+Continual+Learning+with+Adaptive+Learning+Rate&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "CTvr5sjVi2_", "title": "Training with Worst-Case Distributional Shift causes Overestimation and Inaccuracies in State-Action Value Functions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The utilization of deep neural networks as function approximators for the state-action value function created a new research area for self learning systems, and made it possible to learn optimal policies from high dimensional state representations. While this initial success led deep neural policies to be employed in many diverse disciplines with manifold applications, the issues related to their resilience with respect to specifically crafted imperceptible adversarial perturbations remains a concern. 
To eliminate these concerns, several studies have focused on building deep neural policies resilient to these perturbations via training in the presence of such perturbations (i.e. adversarial training). In this paper we focus on conducting an investigation of the state-action value function learned by state-of-the-art adversarially trained deep neural policies and vanilla trained deep neural policies. We theoretically motivate that the idea behind the state-of-the-art adversarial training method causes overestimation bias and inaccuracies in the state-action value function. We perform several experiments in the Arcade Learning Environment (ALE) and show that indeed adversarially trained deep neural policies suffer from overestimation bias. Furthermore, the state-action value functions learned by vanilla trained deep neural policies have more accurate estimates for the non-optimal actions than state-of-the-art adversarially trained deep neural policies. We believe our study lays out intriguing properties of adversarial training and could be a critical step towards obtaining robust and reliable policies.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/5ed53646a06a557c04bc7d5d86d00e68cb93dee9.zip", "author": "Ezgi Korkmaz", "authorids": "~Ezgi_Korkmaz2", "gender": "", "homepage": "https://ezgikorkmaz.github.io/", "dblp": "300/7830.html", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Ezgi_Korkmaz2", "aff": "University College London, University of London", "aff_domain": "ucl.ac.uk", "position": "PhD student", "bibtex": "@misc{\nkorkmaz2022training,\ntitle={Training with Worst-Case Distributional Shift causes Overestimation and Inaccuracies in State-Action Value Functions},\nauthor={Ezgi Korkmaz},\nyear={2022},\nurl={https://openreview.net/forum?id=CTvr5sjVi2_}\n}", "github": "", "project": "", "reviewers": "i5oi;puh6;A1yk", "site": "https://openreview.net/forum?id=CTvr5sjVi2_", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;5;4", "correctness": "2;4;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "74;30;38", "wc_summary_review": "83;39;34", "wc_main_review": "548;123;296", "wc_review": "705;192;368", "wc_reply_reviewers": "293;0;0", "wc_reply_authors": "193;0;0", "reply_reviewers": "3;0;0", "reply_authors": "3;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 47.333333333333336, 19.136933459209764 ], "wc_summary_review_avg": [ 52.0, 22.015146301277824 ], "wc_main_review_avg": [ 322.3333333333333, 174.50183061758662 ], "wc_review_avg": [ 421.6666666666667, 212.84162082533473 ], "wc_reply_reviewers_avg": [ 97.66666666666667, 138.12152459177227 ], "wc_reply_authors_avg": [ 64.33333333333333, 90.98107251266912 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 1.0, 1.4142135623730951 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HgNTf_FTSxgJ:scholar.google.com/&scioq=Training+with+Worst-Case+Distributional+Shift+causes+Overestimation+and+Inaccuracies+in+State-Action+Value+Functions&hl=en&as_sdt=0,5", 
"gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "ARTEMIS: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6121", "id": "CVfLvQq9gLo", "poster": "", "openreview": "https://openreview.net/forum?id=CVfLvQq9gLo", "slides": "https://iclr.cc/virtual/2022/poster/6121", "video": "https://iclr.cc/virtual/2022/poster/6121", "author_site": "Ginger Delmas, Rafael S Rezende, Gabriela Csurka, Diane Larlus", "tldr": "", "abstract": "An intuitive way to search for images is to use queries composed of an example image and a complementary text. While the first provides rich and implicit context for the search, the latter explicitly calls for new traits, or specifies how some elements of the example image should be changed to retrieve the desired target image. Current approaches typically combine the features of each of the two elements of the query into a single representation, which can then be compared to the ones of the potential target images. Our work aims at shedding new light on the task by looking at it through the prism of two familiar and related frameworks: text-to-image and image-to-image retrieval. Taking inspiration from them, we exploit the specific relation of each query element with the targeted image and derive light-weight attention mechanisms which enable to mediate between the two complementary modalities. We validate our approach on several retrieval benchmarks, querying with images and their associated free-form text modifiers. Our method obtains state-of-the-art results without resorting to side information, multi-level features, heavy pre-training nor large architectures as in previous works.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ginger Delmas;Rafael S. Rezende;Gabriela Csurka;Diane Larlus", "authorids": "ginger.delmas@naverlabs.com;~Rafael_S._Rezende1;~Gabriela_Csurka2;~Diane_Larlus1", "gender": ";M;F;F", "homepage": ";https://europe.naverlabs.com/people_user/rafael-sampaio-de-rezende/;https://europe.naverlabs.com/people_user/gabriela-csurka-khedari;https://dlarlus.github.io/", "dblp": ";200/8093;c/GabrielaCsurka;48/4033", "google_scholar": ";https://scholar.google.com/citations?hl=fr;https://scholar.google.fr/citations?user=PXm1lPAAAAAJ;https://scholar.google.fr/citations?user=nI2oJqkAAAAJ", "orcid": ";;;", "linkedin": ";;gabriela-csurka-0387bb2a/;", "or_profile": "ginger.delmas@naverlabs.com;~Rafael_S._Rezende1;~Gabriela_Csurka2;~Diane_Larlus1", "aff": ";Naver Labs Europe;Naver Labs Europe;NAVER LABS Europe", "aff_domain": ";naverlabs.com;naverlabs.com;naverlabs.com", "position": ";Research Scientist;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\ndelmas2022artemis,\ntitle={{ARTEMIS}: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity},\nauthor={Ginger Delmas and Rafael S. 
Rezende and Gabriela Csurka and Diane Larlus},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CVfLvQq9gLo}\n}", "github": "", "project": "", "reviewers": "pgma;L8FA;5uBZ", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;4", "correctness": "3;4;4", "technical_novelty": "2;3;2", "empirical_novelty": "0;4;2", "wc_summary_paper": "94;100;144", "wc_summary_review": "39;30;34", "wc_main_review": "189;157;140", "wc_review": "322;287;318", "wc_reply_reviewers": "28;37;26", "wc_reply_authors": "886;445;608", "reply_reviewers": "1;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.632993161855452 ], "wc_summary_paper_avg": [ 112.66666666666667, 22.29100466306732 ], "wc_summary_review_avg": [ 34.333333333333336, 3.6817870057290873 ], "wc_main_review_avg": [ 162.0, 20.314198646923455 ], "wc_review_avg": [ 309.0, 15.641824275533422 ], "wc_reply_reviewers_avg": [ 30.333333333333332, 4.784233364802441 ], "wc_reply_authors_avg": [ 646.3333333333334, 182.0665324056627 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15218636624672765176&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=CVfLvQq9gLo", "email": ";naverlabs.com;naverlabs.com;naverlabs.com", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "NAVER LABS", "aff_unique_dep": "", "aff_unique_url": "https://labs.naver.com", "aff_unique_abbr": "NLE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Unknown;France" }, { "id": "CZZ7KWOP0-M", "title": "ShiftAddNAS: Hardware-Inspired Search for More Accurate and Efficient Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks (NNs) with intensive multiplications (e.g., convolutions and transformers) are powerful yet power hungry, impeding their more extensive deployment into resource-constrained edge devices. As such, multiplication-free networks, which follow a common practice in energy-efficient hardware implementation to parameterize NNs with more efficient operators (e.g., bitwise shifts and additions), have gained growing attention. However, multiplication-free networks in general under-perform their vanilla counterparts in terms of the achieved accuracy. To this end, this work advocates hybrid NNs that consist of both powerful yet costly multiplications and efficient yet less powerful operators for marrying the best of both worlds, and proposes ShiftAddNAS, which can automatically search for more accurate and more efficient NNs. Our ShiftAddNAS highlights two enablers. 
Specifically, it integrates (1) the first hybrid search space that incorporates both multiplication-based and multiplication-free operators for facilitating the development of both accurate and efficient hybrid NNs; and (2) a novel weight sharing strategy that enables effective weight sharing among different operators that follow heterogeneous distributions (e.g., Gaussian for convolutions vs. Laplacian for add operators) and simultaneously leads to a largely reduced supernet size and much better searched networks. Extensive experiments and ablation studies on various models, datasets, and tasks consistently validate the effectiveness of ShiftAddNAS, e.g., achieving up to a +7.7% higher accuracy or a +4.9 better BLEU score as compared to state-of-the-art expert-designed and neural architecture searched NNs, while leading to up to 93% or 69% energy and latency savings, respectively. All the codes will be released upon acceptance.", "keywords": "Neural Architecture Search;Bit-wise Shift and Add;Hardware Acceleration;Multiplication-Reduced Networks", "primary_area": "", "supplementary_material": "", "author": "Haoran You;Baopu Li;Huihong Shi;Yingyan Lin", "authorids": "~Haoran_You1;~Baopu_Li1;~Huihong_Shi1;~Yingyan_Lin1", "gender": "M;;F;F", "homepage": "http://haoranyou.com/;;https://shihuihong214.github.io/huihong.shi/;https://eiclab.scs.gatech.edu/", "dblp": "230/4247;;253/3178;120/6981", "google_scholar": "z5Eku1sAAAAJ;;https://scholar.google.com/citations?hl=en;dio8IesAAAAJ", "orcid": "0000-0002-2873-2153;;0000-0002-7845-0154;", "linkedin": "haoran-you-b4b958165/;;;yingyan-celine-lin-a281211a/", "or_profile": "~Haoran_You1;~Baopu_Li1;~Huihong_Shi1;~Yingyan_Lin1", "aff": "Rice University;;Nanjing University;Rice University", "aff_domain": "rice.edu;;nju.edu.cn;rice.edu", "position": "PhD student;;PhD student;Assistant Professor", "bibtex": "@misc{\nyou2022shiftaddnas,\ntitle={ShiftAdd{NAS}: Hardware-Inspired Search for More Accurate and Efficient Neural Networks},\nauthor={Haoran You and Baopu Li and Huihong Shi and Yingyan Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=CZZ7KWOP0-M}\n}", "github": "", "project": "", "reviewers": "1Due;evhx;KyCd;PKFY", "site": "https://openreview.net/forum?id=CZZ7KWOP0-M", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;5;4;3", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "39;30;38;74", "wc_summary_review": "39;20;28;37", "wc_main_review": "297;206;86;237", "wc_review": "375;256;152;348", "wc_reply_reviewers": "327;90;0;0", "wc_reply_authors": "2259;794;182;468", "reply_reviewers": "2;2;0;0", "reply_authors": "5;3;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 45.25, 16.96135312998347 ], "wc_summary_review_avg": [ 31.0, 7.582875444051551 ], "wc_main_review_avg": [ 206.5, 76.87815034195347 ], "wc_review_avg": [ 282.75, 87.43390360723922 ], "wc_reply_reviewers_avg": [ 104.25, 133.75046728890334 ], "wc_reply_authors_avg": [ 925.75, 799.6269051876632 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 2.5, 1.6583123951777 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 19, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=17026416337828414455&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Rice University;Nanjing University", "aff_unique_dep": ";", "aff_unique_url": "https://www.rice.edu;https://www.nju.edu.cn", "aff_unique_abbr": "Rice;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "C_RTGckbu-A", "title": "Multi-Subspace Structured Meta-Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Meta-learning aims to extract meta-knowledge from historical tasks to accelerate learning on new tasks. A critical challenge in meta-learning is to handle task heterogeneity, i.e., tasks lie in different distributions. Unlike typical meta-learning algorithms that learn a globally shared initialization, recent structured meta-learning algorithms formulate tasks into multiple groups and learn an initialization for tasks in each group using centroid-based clustering. However, those algorithms still require task models in the same group to be close together and fail to take advantage of negative correlations between tasks. In this paper, task models are formulated into a subspace structure. We propose a MUlti-Subspace structured Meta-Learning (MUSML) algorithm to learn the subspace bases. We establish the convergence and analyze the generalization performance. Experimental results confirm the effectiveness of the proposed MUSML algorithm.", "keywords": "meta-learning", "primary_area": "", "supplementary_material": "/attachment/b2a1c1891b51477c7f9fffba3cd2eb08093f4b33.zip", "author": "Weisen Jiang;James Kwok;Yu Zhang", "authorids": "~Weisen_Jiang1;~James_Kwok1;~Yu_Zhang3", "gender": "M;;M", "homepage": "https://wayson-ust.github.io/;;http://cse.sustech.edu.cn/faculty/~zhangy/", "dblp": "302/7625;;50/671-6", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com.hk/citations?user=jaRS5w4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Weisen_Jiang1;~James_Kwok1;~Yu_Zhang3", "aff": "Hong Kong University of Science and Technology;;Southern University of Science and Technology", "aff_domain": "ust.hk;;sustc.edu.cn", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\njiang2022multisubspace,\ntitle={Multi-Subspace Structured Meta-Learning},\nauthor={Weisen Jiang and James Kwok and Yu Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=C_RTGckbu-A}\n}", "github": "", "project": "", "reviewers": "5VBv;dNxy;npx5;xuka", "site": "https://openreview.net/forum?id=C_RTGckbu-A", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;4", "correctness": "2;2;2;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "36;40;56;74", "wc_summary_review": "27;28;93;39", "wc_main_review": "421;49;201;235", "wc_review": "484;117;350;348", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 51.5, 14.99166435056495 ], "wc_summary_review_avg": [ 46.75, 27.11434122378783 ], "wc_main_review_avg": [ 226.5, 132.3433035706756 ], "wc_review_avg": [ 324.75, 132.00260414097897 ], "wc_reply_reviewers_avg": [ 
0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gwiTx_yWy3UJ:scholar.google.com/&scioq=Multi-Subspace+Structured+Meta-Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.sustech.edu.cn", "aff_unique_abbr": "HKUST;SUSTech", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Trivial or Impossible --- dichotomous data difficulty masks model differences (on ImageNet and beyond)", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5966", "id": "C_vsGwEIjAr", "poster": "", "openreview": "https://openreview.net/forum?id=C_vsGwEIjAr", "slides": "https://iclr.cc/virtual/2022/poster/5966", "video": "https://iclr.cc/virtual/2022/poster/5966", "author_site": "Kristof Meding, Luca Schulze Buschoff, Robert Geirhos, Felix Wichmann", "tldr": "", "abstract": "\"The power of a generalization system follows directly from its biases\" (Mitchell 1980). Today, CNNs are incredibly powerful generalisation systems---but to what degree have we understood how their inductive bias influences model decisions? We here attempt to disentangle the various aspects that determine how a model decides. In particular, we ask: what makes one model decide differently from another? In a meticulously controlled setting, we find that (1.) irrespective of the network architecture or objective (e.g. self-supervised, semi-supervised, vision transformers, recurrent models) all models end up with a similar decision boundary. (2.) To understand these findings, we analysed model decisions on the ImageNet validation set from epoch to epoch and image by image. We find that the ImageNet validation set, among others, suffers from dichotomous data difficulty (DDD): For the range of investigated models and their accuracies, it is dominated by 46.0% \"trivial\" and 11.5% \"impossible\" images (beyond label errors). Only 42.5% of the images could possibly be responsible for the differences between two models' decision boundaries. (3.) Only removing the \"impossible\" and \"trivial\" images allows us to see pronounced differences between models. (4.) Humans are highly accurate at predicting which images are \"trivial\" and \"impossible\" for CNNs (81.4%). This implies that in future comparisons of brains, machines and behaviour, much may be gained from investigating the decisive role of images and the distribution of their difficulties.", "keywords": "CNNs;Cognitive Science;Vision Science;Psychophysics;Neuroscience;Visual perception;Inductive bias;ImageNet;CIFAR;RSA;Representation similarity analysis;Error consistency;Datasets", "primary_area": "", "supplementary_material": "/attachment/644316362c11915e9e9592575bd35d82d8635092.zip", "author": "Kristof Meding;Luca M. Schulze Buschoff;Robert Geirhos;Felix A. 
Wichmann", "authorids": "~Kristof_Meding1;~Luca_M._Schulze_Buschoff1;~Robert_Geirhos1;~Felix_A._Wichmann1", "gender": ";M;M;M", "homepage": "https://kmeding.com/;https://robertgeirhos.com/;http://www.wichmannlab.org;https://github.com/lsbuschoff", "dblp": "201/7024;176/0076;42/5049;", "google_scholar": "XUezFPYAAAAJ;w3kGtMIAAAAJ;NxrQ794AAAAJ;K0uLclsAAAAJ", "orcid": "0000-0001-5073-2347;0000-0001-7698-3187;0000-0002-2592-634X;", "linkedin": ";rgeirhos/;;", "or_profile": "~Kristof_Meding1;~Robert_Geirhos1;~Felix_A._Wichmann1;~Luca_Maximilian_Schulze_Buschoff1", "aff": "Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of T\u00fcbingen;University of T\u00fcbingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "position": "Postdoc;Postdoc;Full Professor;MS student", "bibtex": "@inproceedings{\nmeding2022trivial,\ntitle={Trivial or Impossible --- dichotomous data difficulty masks model differences (on ImageNet and beyond)},\nauthor={Kristof Meding and Luca M. Schulze Buschoff and Robert Geirhos and Felix A. Wichmann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=C_vsGwEIjAr}\n}", "github": "", "project": "", "reviewers": "3xZg;rhYo;MrqK;oQTc", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;5;5;3", "correctness": "2;4;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "71;82;101;127", "wc_summary_review": "34;99;52;23", "wc_main_review": "360;380;635;125", "wc_review": "465;561;788;275", "wc_reply_reviewers": "127;443;606;0", "wc_reply_authors": "865;1072;1631;33", "reply_reviewers": "2;2;2;0", "reply_authors": "3;4;4;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.25, 21.241174637952582 ], "wc_summary_review_avg": [ 52.0, 29.04307146291521 ], "wc_main_review_avg": [ 375.0, 180.52008198535697 ], "wc_review_avg": [ 522.25, 184.75304462985176 ], "wc_reply_reviewers_avg": [ 294.0, 241.8005376338109 ], "wc_reply_authors_avg": [ 900.25, 573.7723307201211 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=704065869120035263&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=C_vsGwEIjAr", "email": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;University of T\u00fcbingen;University of Tuebingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "0", "aff_campus_unique": "T\u00fcbingen;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "CdBDMQkx3hU", "title": "DAAS: Differentiable Architecture and Augmentation Policy Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural architecture search (NAS) 
has been an active direction of automatic machine learning (Auto-ML), aiming to explore efficient network structures. The searched architecture is evaluated by training on datasets with fixed data augmentation policies. However, recent works on auto-augmentation show that the best-suited augmentation policies can vary across different structures. Therefore, this work considers the possible coupling between neural architectures and data augmentation and proposes an effective algorithm that jointly searches for both. Specifically, 1) for the NAS task, we adopt a single-path-based differentiable method with a Gumbel-softmax reparameterization strategy due to its memory efficiency; 2) for the auto-augmentation task, we introduce a novel search method based on the policy gradient algorithm, which can significantly reduce the computational complexity. Our approach achieves 97.91% accuracy on CIFAR-10 and 76.6% Top-1 accuracy on the ImageNet dataset, showing the outstanding performance of our search algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoxing Wang;Xiangxiang Chu;Junchi Yan;Xiaokang Yang", "authorids": "~Xiaoxing_Wang1;~Xiangxiang_Chu1;~Junchi_Yan2;~Xiaokang_Yang1", "gender": "M;M;M;M", "homepage": "https://cxxgtxy.github.io/;https://icne.sjtu.edu.cn/info/1064/1078.htm;http://thinklab.sjtu.edu.cn/;https://scholar.google.com/citations?user=n2ewxUIAAAAJ&hl=zh-CN", "dblp": "207/8002;06/3071-1.html;60/7949.html;78/885", "google_scholar": "jn21pUsAAAAJ;yDEavdMAAAAJ;ga230VoAAAAJ;n2ewxUIAAAAJ", "orcid": "0000-0003-2548-0605;0000-0003-4029-3322;0000-0001-9639-7679;0000-0002-7830-9521", "linkedin": ";;;", "or_profile": "~Xiangxiang_Chu1;~Xiaokang_Yang1;~Junchi_Yan1;~Victor_Wang1", "aff": "MeiTuan;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "meituan.com;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "Senior Engineer;Full Professor;Associate Professor;PhD student", "bibtex": "@misc{\nwang2022daas,\ntitle={{DAAS}: Differentiable Architecture and Augmentation Policy Search},\nauthor={Xiaoxing Wang and Xiangxiang Chu and Junchi Yan and Xiaokang Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=CdBDMQkx3hU}\n}", "github": "", "project": "", "reviewers": "m7yy;ARL5;Evxf;3vfq", "site": "https://openreview.net/forum?id=CdBDMQkx3hU", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;5;5;4", "correctness": "2;3;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "35;50;45;65", "wc_summary_review": "32;26;16;22", "wc_main_review": "300;135;114;377", "wc_review": "367;211;175;464", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 48.75, 10.825317547305483 ], "wc_summary_review_avg": [ 24.0, 5.830951894845301 ], "wc_main_review_avg": [ 231.5, 110.65825771265332 ], "wc_review_avg": [ 304.25, 117.10972419060681 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=16820103358408629605&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Meituan;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.meituan.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "MeiTuan;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "CdNRpVj215", "title": "Towards Non-Parametric Models for Confidence Aware Video Prediction on Smooth Dynamics", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The ability to envision future states is crucial to informed decision making while interacting with dynamic environments. \nWith vision providing an information rich sensing modality, the field of video prediction has garnered a lot of attention in pursuit of this ability. Current state of the art methods rely on neural network based models for prediction. Though often accurate, these methods require large amounts of training data that are often unavailable when encountering unknown environments. Furthermore, the predictive accuracy of such methods breaks down without warning, when tested on data far outside their training distribution. This problem is exacerbated by the fact that these networks can be prohibitively expensive to update with recent data acquired online. To overcome these drawbacks we use non-parametric models to take a probabilistic approach to video prediction for problems with little training data. We generate probability distributions over sequentially predicted images and propagate our uncertainty through time to generate a confidence metric for our predictions. We use non-parametric Gaussian Process models for their data efficiency and ability to readily incorporate new training data online. To showcase our method we successfully predict future frames of a smooth fluid simulation environment. In this paper we propose a non-parametric method using Gaussian Process models to propagate probability distributions over sequentially predicted images for confidence aware video prediction with little training. \n", "keywords": "Video Prediction;Non-Parametric Models;Gaussian Processes;Confidence Aware", "primary_area": "", "supplementary_material": "/attachment/facd1c47d8a32adaaeb4ede3168e004983cf15dd.zip", "author": "Nikhil Uday Shinde;Florian Richter;Michael C. Yip", "authorids": "~Nikhil_Uday_Shinde1;~Florian_Richter2;~Michael_C._Yip1", "gender": "M;;", "homepage": ";;http://www.ucsdarclab.com", "dblp": "183/4792;;", "google_scholar": ";HP3e2owAAAAJ;gSYxbCYAAAAJ", "orcid": ";0000-0002-7669-1923;", "linkedin": "nikhil-uday-shinde/;;michael-yip-43913421/", "or_profile": "~Nikhil_Uday_Shinde1;~Florian_Richter2;~Michael_C._Yip1", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nshinde2022towards,\ntitle={Towards Non-Parametric Models for Confidence Aware Video Prediction on Smooth Dynamics},\nauthor={Nikhil Uday Shinde and Florian Richter and Michael C. 
Yip},\nyear={2022},\nurl={https://openreview.net/forum?id=CdNRpVj215}\n}", "github": "", "project": "", "reviewers": "b9Q9;MgK8;zDWA", "site": "https://openreview.net/forum?id=CdNRpVj215", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "3;2;4", "correctness": "3;3;4", "technical_novelty": "3;1;1", "empirical_novelty": "2;1;1", "wc_summary_paper": "66;46;45", "wc_summary_review": "217;15;16", "wc_main_review": "478;147;313", "wc_review": "761;208;374", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 52.333333333333336, 9.672412085697939 ], "wc_summary_review_avg": [ 82.66666666666667, 94.98888823904029 ], "wc_main_review_avg": [ 312.6666666666667, 135.13038970646915 ], "wc_review_avg": [ 447.6666666666667, 231.69280428091753 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MYb5TaJ5D6EJ:scholar.google.com/&scioq=Towards+Non-Parametric+Models+for+Confidence+Aware+Video+Prediction+on+Smooth+Dynamics&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "CdqsSPLNx-", "title": "Deep Dynamic Attention Model with Gate Mechanism for Solving Time-dependent Vehicle Routing Problems", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Vehicle routing problems (VRPs) are a class of classical combinatorial optimization problems that arise widely in logistics and transportation operations. There has been increasing interest in using deep reinforcement learning (DRL) techniques to tackle VRPs, and previous DRL-based studies assumed time-independent travel times between customers. However, travel times in real-world road networks are time-varying, which need to be considered in practical VRPs. We thus propose a Deep Dynamic Attention Model with Gate Mechanism (DDAM-GM) to learn heuristics for time-dependent VRPs (TDVRPs) in real-world road networks. It extracts information on node location, node demand, and time-varying travel times between nodes to obtain enhanced node embeddings through a dimension-reducing MHA layer and a synchronous encoder. In addition, we use a gate mechanism to obtain a better context embedding. On the basis of a 110-day travel time dataset with 240 time periods per day from an urban road network with 408 nodes and 1250 directed links, we conduct a series of experiments to validate the effectiveness of the proposed model on TDVRPs both without and with time windows.
Experimental results show that our model significantly outperforms two state-of-the-art DRL-based models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feng Guo;Qu Wei;Miao Wang;Zhaoxia Guo", "authorids": "~Feng_Guo7;~Qu_Wei1;~Miao_Wang2;~Zhaoxia_Guo1", "gender": "M;M;M;M", "homepage": ";;;http://bs.scu.edu.cn/gongye/201904/5203.html", "dblp": ";;;", "google_scholar": ";-u07w_8AAAAJ;;https://scholar.google.com.au/citations?user=PH45-qEAAAAJ", "orcid": "0000-0003-1760-8488;;0000-0002-7949-938X;", "linkedin": ";;;", "or_profile": "~Feng_Guo7;~Qu_Wei1;~Miao_Wang2;~Zhaoxia_Guo1", "aff": "Sichuan University;Politecnico di Torino;Sichuan University;Sichuan University", "aff_domain": "scu.edu.cn;polito.it;scu.edu.cn;scu.edu.cn", "position": "PhD student;PhD student;MS student;Full Professor", "bibtex": "@misc{\nguo2022deep,\ntitle={Deep Dynamic Attention Model with Gate Mechanism for Solving Time-dependent Vehicle Routing Problems},\nauthor={Feng Guo and Qu Wei and Miao Wang and Zhaoxia Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=CdqsSPLNx-}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=CdqsSPLNx-", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HDqhMkuZeu0J:scholar.google.com/&scioq=Deep+Dynamic+Attention+Model+with+Gate+Mechanism+for+Solving+Time-dependent+Vehicle+Routing+Problems&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Sichuan University;Politecnico di Torino", "aff_unique_dep": ";", "aff_unique_url": "https://www.scu.edu.cn;https://www.polito.it", "aff_unique_abbr": "SCU;Polito", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;Italy" }, { "title": "ADAVI: Automatic Dual Amortized Variational Inference Applied To Pyramidal Bayesian Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6309", "id": "CgIEctmcXx1", "poster": "", "openreview": "https://openreview.net/forum?id=CgIEctmcXx1", "slides": "https://iclr.cc/virtual/2022/poster/6309", "video": "https://iclr.cc/virtual/2022/poster/6309", "author_site": "Louis Rouillard, Demian Wassermann", "tldr": "", "abstract": "Frequently, population studies feature pyramidally-organized data represented using Hierarchical Bayesian Models (HBM) enriched with plates.
These models can become prohibitively large in settings such as neuroimaging, where a sample is composed of a functional MRI signal measured on 300 brain locations, across 4 measurement sessions, and 30 subjects, resulting in around 1 million latent parameters.\n\nSuch high dimensionality hampers the use of modern, expressive flow-based techniques.\n\nTo infer parameter posterior distributions in this challenging class of problems, we designed a novel methodology that automatically produces a variational family dual to a target HBM. This variational family, represented as a neural network, consists of the combination of an attention-based hierarchical encoder feeding summary statistics to a set of normalizing flows. Our automatically-derived neural network exploits exchangeability in the plate-enriched HBM and factorizes its parameter space. The resulting architecture reduces by orders of magnitude its parameterization with respect to that of a typical flow-based representation, while maintaining expressivity.\n\nOur method performs inference on the specified HBM in an amortized setup: once trained, it can readily be applied to a new data sample to compute the parameters' full posterior.\n\nWe demonstrate the capability and scalability of our method on simulated data, as well as a challenging high-dimensional brain parcellation experiment. We also open up several questions that lie at the intersection of normalizing flows, SBI, structured Variational Inference, and inference amortization.", "keywords": "Bayesian inference;Hierarchical Bayesian Models;structured Variational Inference;Simulation Based Inference;Inference amortization;Neuroimaging", "primary_area": "", "supplementary_material": "/attachment/316d3e7273e81b94cd40e69922ff34ca6863bc8f.zip", "author": "Louis Rouillard;Demian Wassermann", "authorids": "~Louis_Rouillard1;~Demian_Wassermann1", "gender": "M;M", "homepage": ";https://pages.saclay.inria.fr/demian.wassermann/", "dblp": ";27/2827", "google_scholar": ";https://scholar.google.fr/citations?user=TX-vLhIAAAAJ", "orcid": "0000-0002-8446-3509;0000-0001-5194-6056", "linkedin": "louis-rouillard/;", "or_profile": "~Louis_Rouillard1;~Demian_Wassermann1", "aff": "INRIA;INRIA", "aff_domain": "inria.fr;inria.fr", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nrouillard2022adavi,\ntitle={{ADAVI}: Automatic Dual Amortized Variational Inference Applied To Pyramidal Bayesian Models},\nauthor={Louis Rouillard and Demian Wassermann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CgIEctmcXx1}\n}", "github": "", "project": "", "reviewers": "bt4m;bbMw;KUup;4eiB", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;2;5", "correctness": "3;3;4;4", "technical_novelty": "3;3;4;2", "empirical_novelty": "2;2;0;0", "wc_summary_paper": "80;85;49;63", "wc_summary_review": "55;132;7;103", "wc_main_review": "464;850;113;269", "wc_review": "599;1067;169;435", "wc_reply_reviewers": "0;0;0;70", "wc_reply_authors": "2426;1925;127;1735", "reply_reviewers": "0;0;0;1", "reply_authors": "4;3;1;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 69.25, 14.254385290148432 ], "wc_summary_review_avg": [ 74.25, 47.57822506147114 ], "wc_main_review_avg": [ 424.0, 275.60025399117467 ], "wc_review_avg": [ 567.5,
326.66917516043657 ], "wc_reply_reviewers_avg": [ 17.5, 30.31088913245535 ], "wc_reply_authors_avg": [ 1553.25, 861.2654570456195 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.22941573387056177, "corr_recommendation_correctness": 1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2913763519956847949&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=CgIEctmcXx1", "email": "inria.fr;inria.fr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "CgV7NVOgDJZ", "title": "Guided-TTS: Text-to-Speech with Untranscribed Speech", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most neural text-to-speech (TTS) models require $\\langle$speech, transcript$\\rangle$ paired data from the desired speaker for high-quality speech synthesis, which limits the use of large amounts of untranscribed data for training. In this work, we present Guided-TTS, a high-quality TTS model that learns to generate speech from untranscribed speech data. Guided-TTS combines an unconditional diffusion probabilistic model with a separately trained phoneme classifier for text-to-speech. By modeling the unconditional distribution for speech, our model can utilize the untranscribed data for training. For text-to-speech synthesis, we guide the generative process of the unconditional DDPM via phoneme classification to produce mel-spectrograms from the conditional distribution given the transcript. We show that Guided-TTS achieves performance comparable to existing methods without any transcript for LJSpeech.
Our results further show that a single speaker-independent phoneme classifier trained on multi-speaker large-scale data can guide unconditional DDPMs for various speakers to perform TTS.", "keywords": "Text-to-Speech;Speech Synthesis;DDPM;TTS;Untranscribed speech", "primary_area": "", "supplementary_material": "", "author": "Heeseung Kim;Sungwon Kim;Sungroh Yoon", "authorids": "~Heeseung_Kim1;~Sungwon_Kim2;~Sungroh_Yoon1", "gender": "M;M;", "homepage": "https://gmltmd789.github.io;;http://ailab.snu.ac.kr", "dblp": "294/8710;;99/1474", "google_scholar": "4ojbJpoAAAAJ;6qGppvkAAAAJ;Bphl_fIAAAAJ", "orcid": ";;0000-0002-2367-197X", "linkedin": "gmltmd789/;sungwon-kim-dsail/;", "or_profile": "~Heeseung_Kim1;~Sungwon_Kim2;~Sungroh_Yoon1", "aff": "Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nkim2022guidedttstexttospeech,\ntitle={Guided-{TTS}: Text-to-Speech with Untranscribed Speech},\nauthor={Heeseung Kim and Sungwon Kim and Sungroh Yoon},\nyear={2022},\nurl={https://openreview.net/forum?id=CgV7NVOgDJZ}\n}", "github": "", "project": "", "reviewers": "6P3J;MW7v;XRxC;T6xj", "site": "https://openreview.net/forum?id=CgV7NVOgDJZ", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;5;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "54;123;44;129", "wc_summary_review": "27;156;21;28", "wc_main_review": "378;350;101;337", "wc_review": "459;629;166;494", "wc_reply_reviewers": "125;0;0;0", "wc_reply_authors": "1937;728;548;728", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 87.5, 38.720149793098685 ], "wc_summary_review_avg": [ 58.0, 56.64362276549762 ], "wc_main_review_avg": [ 291.5, 110.97860154101781 ], "wc_review_avg": [ 437.0, 168.84756438871128 ], "wc_reply_reviewers_avg": [ 31.25, 54.12658773652741 ], "wc_reply_authors_avg": [ 985.25, 554.3849632701089 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.08084520834544431, "corr_recommendation_correctness": 0.08084520834544431, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4665420343112439520&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "ChKNCDB0oYj", "title": "Mistake-driven Image Classification with FastGAN and SpinalNet", "track": "main", "status": "Reject", "tldr": "", "abstract": "Image classification with classes of varying difficulty can cause performance disparity in deep learning models and reduce the overall performance and reliability of the predictions. In this paper, we address the problem of imbalanced performance in image classification, where the trained model has performance deficits in some of the dataset's classes.
By employing Generative Adversarial Networks (GANs) to augment these deficit classes, we fine-tune the model towards a balanced performance among the different classes and an overall better performance on the whole dataset. Specifically, we combine a light-weight GAN method, FastGAN (Liu et al., 2021), for class-wise data augmentation with Progressive SpinalNet (Chopra, 2021) and Sharpness-Aware Minimization (SAM) (Foret et al., 2020) for training. Unlike earlier works, during training, our method focuses on those classes with the lowest accuracy after the initial training phase, which leads to better performance. Only these classes are augmented to boost the accuracy. Due to the use of a light-weight GAN method, the GAN-based augmentation is viable and effective for mistake-driven training even for datasets with only a few images per class, while simultaneously requiring less computation than other, more complex GAN methods. Our extensive experiments, including ablation studies on all key components, show competitive or better accuracy than the previous state-of-the-art on five datasets with different sizes and image resolutions.", "keywords": "Deep Learning;Data Augmentation;Image Classification;Supervised Learning;Generative models", "primary_area": "", "supplementary_material": "", "author": "Mohit Kumar Ahuja;Sahil Sahil;Helge Spieker", "authorids": "~Mohit_Kumar_Ahuja1;~Sahil_Sahil1;~Helge_Spieker1", "gender": "M;M;", "homepage": "https://mohitkumarahuja.com;;https://hspieker.de/", "dblp": "241/2555.html;;169/5121", "google_scholar": "8vnZq_4AAAAJ;;SMvVsioAAAAJ", "orcid": "0000-0002-1418-0909;;0000-0003-2494-4279", "linkedin": "mohit-kumar-ahuja-211b4183;sahiliiserb/;helge-spieker-3b4a20208/", "or_profile": "~Mohit_Kumar_Ahuja1;~Sahil_Sahil1;~Helge_Spieker1", "aff": "Simula Research Laboratory;Indian Institute of Science Education and Research Bhopal;Simula Research Laboratory", "aff_domain": "simula.no;iiserb.ac.in;simula.no", "position": "PhD student;MS student;Researcher", "bibtex": "@misc{\nahuja2022mistakedriven,\ntitle={Mistake-driven Image Classification with Fast{GAN} and SpinalNet},\nauthor={Mohit Kumar Ahuja and Sahil Sahil and Helge Spieker},\nyear={2022},\nurl={https://openreview.net/forum?id=ChKNCDB0oYj}\n}", "github": "", "project": "", "reviewers": "nk5o;N1Vg;hoDS", "site": "https://openreview.net/forum?id=ChKNCDB0oYj", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "5;5;4", "correctness": "3;3;4", "technical_novelty": "2;1;1", "empirical_novelty": "2;2;1", "wc_summary_paper": "76;93;77", "wc_summary_review": "56;36;17", "wc_main_review": "126;153;139", "wc_review": "258;282;233", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "335;445;376", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 82.0, 7.788880963698615 ], "wc_summary_review_avg": [ 36.333333333333336, 15.923427883328248 ], "wc_main_review_avg": [ 139.33333333333334, 11.025223605694151 ], "wc_review_avg": [ 257.6666666666667, 20.005554784164875 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 385.3333333333333, 45.38967087589667 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ],
"authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2518109341995611924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Simula Research Laboratory;Indian Institute of Science Education and Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.simula.no;https://www.iiserbhopal.ac.in", "aff_unique_abbr": "Simula;IISER Bhopal", "aff_campus_unique_index": "1", "aff_campus_unique": ";Bhopal", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Norway;India" }, { "title": "How many degrees of freedom do we need to train deep networks: a loss landscape perspective", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6283", "id": "ChMLTGRjFcU", "poster": "", "openreview": "https://openreview.net/forum?id=ChMLTGRjFcU", "slides": "https://iclr.cc/virtual/2022/poster/6283", "video": "https://iclr.cc/virtual/2022/poster/6283", "author_site": "Brett Larsen, Stanislav Fort, Nic Becker, Surya Ganguli", "tldr": "", "abstract": "A variety of recent works, spanning pruning, lottery tickets, and training within random subspaces, have shown that deep neural networks can be trained using far fewer degrees of freedom than the total number of parameters. We analyze this phenomenon for random subspaces by first examining the success probability of hitting a training loss sublevel set when training within a random subspace of a given training dimensionality. We find a sharp phase transition in the success probability from $0$ to $1$ as the training dimension surpasses a threshold. This threshold training dimension increases as the desired final loss decreases, but decreases as the initial loss decreases. We then theoretically explain the origin of this phase transition, and its dependence on initialization and final desired loss, in terms of properties of the high-dimensional geometry of the loss landscape. In particular, we show via Gordon's escape theorem, that the training dimension plus the Gaussian width of the desired loss sublevel set, projected onto a unit sphere surrounding the initialization, must exceed the total number of parameters for the success probability to be large. In several architectures and datasets, we measure the threshold training dimension as a function of initialization and demonstrate that it is a small fraction of the total parameters, implying by our theory that successful training with so few dimensions is possible precisely because the Gaussian width of low loss sublevel sets is very large. Moreover, we compare this threshold training dimension to more sophisticated ways of reducing training degrees of freedom, including lottery tickets as well as a new, analogous method: lottery subspaces. 
", "keywords": "loss landscape;high-dimensional geometry;random hyperplanes;optimization", "primary_area": "", "supplementary_material": "/attachment/54854bf5a8a6f71e2bc06b9d150b440470715d92.zip", "author": "Brett W Larsen;Stanislav Fort;Nic Becker;Surya Ganguli", "authorids": "~Brett_W_Larsen1;~Stanislav_Fort1;~Nic_Becker1;~Surya_Ganguli1", "gender": "M;M;;M", "homepage": "http://www.bwlarsen.com;http://stanford.edu/~sfort1/;http://npb.space;http://ganguli-gang.stanford.edu/surya.html", "dblp": "268/6684;205/3072;;56/10453", "google_scholar": "qzNuoRoAAAAJ;https://scholar.google.cz/citations?user=eu2Kzn0AAAAJ;;", "orcid": ";;;", "linkedin": ";stanislav-fort-38199a58/;;", "or_profile": "~Brett_W_Larsen1;~Stanislav_Fort1;~Nic_Becker1;~Surya_Ganguli1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;@stanford.edu", "position": "PhD student;PhD student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nlarsen2022how,\ntitle={How many degrees of freedom do we need to train deep networks: a loss landscape perspective},\nauthor={Brett W Larsen and Stanislav Fort and Nic Becker and Surya Ganguli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ChMLTGRjFcU}\n}", "github": "", "project": "", "reviewers": "L6aD;fVom;17kR;krRB", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "235;92;83;120", "wc_summary_review": "35;67;47;19", "wc_main_review": "199;311;460;153", "wc_review": "469;470;590;292", "wc_reply_reviewers": "0;58;38;34", "wc_reply_authors": "361;781;339;468", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 132.5, 60.730964095755965 ], "wc_summary_review_avg": [ 42.0, 17.52141546793523 ], "wc_main_review_avg": [ 280.75, 118.37308604577309 ], "wc_review_avg": [ 455.25, 106.31880125358826 ], "wc_reply_reviewers_avg": [ 32.5, 20.850659461993043 ], "wc_reply_authors_avg": [ 487.25, 176.47715857866706 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11943963795167204430&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=ChMLTGRjFcU", "email": "stanford.edu;stanford.edu;stanford.edu;@stanford.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Ck_iw4jMC4l", "title": "Logical Activation Functions: Logit-space equivalents of Boolean Operators", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neuronal representations within artificial neural networks are commonly understood as logits, representing the log-odds score of 
presence (versus absence) of features within the stimulus. Under this interpretation, we can derive the probability $P(x_0 \\cap x_1)$ that a pair of independent features are both present in the stimulus from their logits. By converting the resulting probability back into a logit, we obtain a logit-space equivalent of the AND operation. However, since this function involves taking multiple exponents and logarithms, it is not well suited to be directly used within neural networks. We thus constructed an efficient approximation named $\\text{AND}_\\text{AIL}$ (the AND operator Approximate for Independent Logits) utilizing only comparison and addition operations, which can be deployed as an activation function in neural networks. Like MaxOut, $\\text{AND}_\\text{AIL}$ is a generalization of ReLU to two dimensions. Additionally, we constructed efficient approximations of the logit-space equivalents to the OR and XNOR operators. We deployed these new activation functions, both in isolation and in conjunction, and demonstrated their effectiveness on a variety of tasks including image classification, transfer learning, abstract reasoning, and compositional zero-shot learning.", "keywords": "activation functions;logits", "primary_area": "", "supplementary_material": "", "author": "Scott C Lowe;Robert Earle;Jason d'Eon;Thomas Trappenberg;Sageev Oore", "authorids": "~Scott_C_Lowe1;robearle11@gmail.com;~Jason_d'Eon1;~Thomas_Trappenberg1;~Sageev_Oore1", "gender": ";;M;M;M", "homepage": "https://scottclowe.com/;;;https://projects.cs.dal.ca/hallab;", "dblp": "245/0038;;;t/ThomasTrappenberg;67/4980", "google_scholar": "https://scholar.google.ca/citations?user=ZFPhxuAAAAAJ;;https://scholar.google.ca/citations?hl=en;https://scholar.google.com.tw/citations?user=EwkaTYEAAAAJ;https://scholar.google.ca/citations?user=cI0dYX4AAAAJ", "orcid": "0000-0002-5237-3867;;;;", "linkedin": "scottclowe/;;;;", "or_profile": "~Scott_C_Lowe1;robearle11@gmail.com;~Jason_d'Eon1;~Thomas_Trappenberg1;~Sageev_Oore1", "aff": "Dalhousie University;;Dalhousie University;Dalhousie University;Vector Institute", "aff_domain": "dal.ca;;dal.ca;dal.ca;vectorinstitute.ai", "position": "Postdoc;;PhD student;Full Professor;Researcher", "bibtex": "@misc{\nlowe2022logical,\ntitle={Logical Activation Functions: Logit-space equivalents of Boolean Operators},\nauthor={Scott C Lowe and Robert Earle and Jason d'Eon and Thomas Trappenberg and Sageev Oore},\nyear={2022},\nurl={https://openreview.net/forum?id=Ck_iw4jMC4l}\n}", "github": "", "project": "", "reviewers": "k3Hn;M59f;d99Z;FLuV", "site": "https://openreview.net/forum?id=Ck_iw4jMC4l", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "125;70;113;61", "wc_summary_review": "38;23;49;80", "wc_main_review": "333;201;402;312", "wc_review": "496;294;564;453", "wc_reply_reviewers": "0;0;224;0", "wc_reply_authors": "965;2192;953;887", "reply_reviewers": "0;0;2;0", "reply_authors": "2;3;2;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.25, 27.270634389394026 ], "wc_summary_review_avg": [ 47.5, 20.910523666326483 ], "wc_main_review_avg": [ 312.0, 72.21841870326433 ], "wc_review_avg": [ 451.75, 99.30351202248589 ],
"wc_reply_reviewers_avg": [ 56.0, 96.99484522385713 ], "wc_reply_authors_avg": [ 1249.25, 545.1065836146175 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=256544908089473868&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Dalhousie University;Vector Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.dal.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "Dal;Vector Institute", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "id": "Clre-Prt128", "title": "Complex-valued deep learning with differential privacy", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present $\\zeta$-DP, an extension of differential privacy (DP) to complex-valued functions. After introducing the complex Gaussian mechanism, whose properties we characterise in terms of $(\\varepsilon, \\delta)$-DP and R\u00e9nyi-DP, we present $\\zeta$-DP stochastic gradient descent ($\\zeta$-DP-SGD), a variant of DP-SGD for training complex-valued neural networks. We experimentally evaluate $\\zeta$-DP-SGD on three complex-valued tasks, i.e. electrocardiogram classification, speech classification and magnetic resonance imaging (MRI) reconstruction. Moreover, we provide $\\zeta$-DP-SGD benchmarks for a large variety of complex-valued activation functions and on a complex-valued variant of the MNIST dataset. Our experiments demonstrate that DP training of complex-valued neural networks is possible with rigorous privacy guarantees and excellent utility.", "keywords": "Differential privacy;complex-valued deep learning", "primary_area": "", "supplementary_material": "", "author": "Alexander Ziller;Dmitrii Usynin;Moritz Knolle;Kerstin Hammernik;Daniel Rueckert;Georgios Kaissis", "authorids": "~Alexander_Ziller1;~Dmitrii_Usynin1;~Moritz_Knolle1;~Kerstin_Hammernik1;~Daniel_Rueckert2;~Georgios_Kaissis1", "gender": "M;;M;;M;", "homepage": ";https://www.dmitrii.usyn.in/;;;https://aim-lab.io/author/daniel-ruckert/;", "dblp": "179/1249;281/7105;;;69/2478;", "google_scholar": "https://scholar.google.de/citations?user=Ir90mU4AAAAJ;https://scholar.google.com/citations?hl=en;;IIqyUmAAAAAJ;https://scholar.google.co.uk/citations?user=H0O0WnQAAAAJ;", "orcid": "0000-0002-3242-0195;0000-0003-0179-6138;0000-0002-3065-2363;;;", "linkedin": "a1302z/;dusynin/;;;;", "or_profile": "~Alexander_Ziller1;~Dmitrii_Usynin1;~Moritz_Knolle1;~Kerstin_Hammernik1;~Daniel_Rueckert2;~Georgios_Kaissis1", "aff": "Technical University Munich;Imperial College London;;Imperial College London;Imperial College London;", "aff_domain": "tum.de;ic.ac.uk;;imperial.ac.uk;imperial.ac.uk;", "position": "PhD student;PhD student;;Postdoc;Full Professor;", "bibtex": "@misc{\nziller2022complexvalued,\ntitle={Complex-valued deep learning with differential privacy},\nauthor={Alexander Ziller and Dmitrii Usynin and Moritz Knolle and Kerstin Hammernik and Daniel Rueckert and Georgios Kaissis},\nyear={2022},\nurl={https://openreview.net/forum?id=Clre-Prt128}\n}", "github": "", "project": "", "reviewers": "n9FY;kQQG;SGmD", "site": "https://openreview.net/forum?id=Clre-Prt128", "pdf_size": 0, "recommendation": "5;6;8", "confidence": 
"4;3;4", "correctness": "4;4;4", "technical_novelty": "2;2;4", "empirical_novelty": "2;4;4", "wc_summary_paper": "50;82;55", "wc_summary_review": "57;57;34", "wc_main_review": "88;349;256", "wc_review": "195;488;345", "wc_reply_reviewers": "0;110;0", "wc_reply_authors": "641;1029;509", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 62.333333333333336, 14.055445761538676 ], "wc_summary_review_avg": [ 49.333333333333336, 10.842303978193728 ], "wc_main_review_avg": [ 231.0, 108.00925886237717 ], "wc_review_avg": [ 342.6666666666667, 119.6281274988825 ], "wc_reply_reviewers_avg": [ 36.666666666666664, 51.85449728701349 ], "wc_reply_authors_avg": [ 726.3333333333334, 220.69788298838654 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.18898223650461363, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6089936314788194494&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Technical University of Munich;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.imperial.ac.uk", "aff_unique_abbr": "TUM;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Germany;United Kingdom" }, { "id": "Cm08egNmrl3", "title": "BLOOD: Bi-level Learning Framework for Out-of-distribution Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Empirical risk minimization (ERM) based machine learning algorithms have suffered from weak generalization performance on the out-of-distribution (OOD) data when the training data are collected from separate environments with unknown spurious correlations. To address this problem, previous works either exploit prior human knowledge for biases in the dataset or apply the two-stage process, which re-weights spuriously correlated samples after they were identified by the biased classifier. However, most of them fail to remove multiple types of spurious correlations that exist in training data. In this paper, we propose a novel bi-level learning framework for OOD generalization, which can effectively remove multiple unknown types of biases without any prior bias information or separate re-training steps of a model. In our bi-level learning framework, we uncover spurious correlations in the inner-loop with shallow model-based predictions and dynamically re-group the data to leverage the group distributionally robust optimization method in the outer-loop, minimizing the worst-case risk across all batches. Our main idea applies the unknown bias discovering process to the group construction method of the group DRO algorithm in a bi-level optimization setting and provides a unified de-biasing framework that can handle multiple types of biases in data. In empirical evaluations on both synthetic and real-world datasets, our framework shows superior OOD performance compared to all other state-of-the-art OOD methods by a large margin. 
Furthermore, it successfully removes multiple types of biases in the training data groups that most other OOD models fail to remove.", "keywords": "Out-of-Distribution Generalization;Generalization;Spurious Correlations;Bi-level Optimization", "primary_area": "", "supplementary_material": "/attachment/16d2ecfd1d4198ab01b3e668eb135d8521258a9e.zip", "author": "Jun-Hyun Bae;Inchul Choi;Minho Lee", "authorids": "~Jun-Hyun_Bae1;~Inchul_Choi1;~Minho_Lee2", "gender": ";M;M", "homepage": ";;https://www.knu.ac.kr/", "dblp": ";;", "google_scholar": ";JUEWM6QAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jun-Hyun_Bae1;~Inchul_Choi1;~Minho_Lee2", "aff": ";;Kyungpook National University", "aff_domain": ";;knu.ac.kr", "position": ";;Full Professor", "bibtex": "@misc{\nbae2022blood,\ntitle={{BLOOD}: Bi-level Learning Framework for Out-of-distribution Generalization},\nauthor={Jun-Hyun Bae and Inchul Choi and Minho Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=Cm08egNmrl3}\n}", "github": "", "project": "", "reviewers": "t2w7;9XTF;r5uG;gQqn", "site": "https://openreview.net/forum?id=Cm08egNmrl3", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;4", "correctness": "2;3;2;3", "technical_novelty": "1;1;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "66;40;109;50", "wc_summary_review": "19;142;49;65", "wc_main_review": "156;565;282;256", "wc_review": "241;747;440;371", "wc_reply_reviewers": "0;0;0;35", "wc_reply_authors": "973;474;1161;863", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 1.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.25, 26.366408553308887 ], "wc_summary_review_avg": [ 68.75, 45.40030286242593 ], "wc_main_review_avg": [ 314.75, 151.9463309856477 ], "wc_review_avg": [ 449.75, 185.89698087919555 ], "wc_reply_reviewers_avg": [ 8.75, 15.155444566227676 ], "wc_reply_authors_avg": [ 867.75, 251.065106098 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5UIrFjLlOhsJ:scholar.google.com/&scioq=BLOOD:+Bi-level+Learning+Framework+for+Out-of-distribution+Generalization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Kyungpook National University", "aff_unique_dep": "", "aff_unique_url": "https://www.knu.ac.kr", "aff_unique_abbr": "KNU", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "Reinforcement Learning in Presence of Discrete Markovian Context Evolution", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6691", "id": "CmsfC7u054S", "poster": "", "openreview": "https://openreview.net/forum?id=CmsfC7u054S", "slides": "https://iclr.cc/virtual/2022/poster/6691", "video": "https://iclr.cc/virtual/2022/poster/6691", "author_site": "Hang Ren, Aivar Sootla, Taher Jafferjee, Junxiao Shen, Jun Wang, Haitham Bou Ammar", "tldr": "", "abstract": "We consider a context-dependent Reinforcement Learning (RL) setting, which is characterized by: a) an unknown finite number of not directly observable contexts; b) abrupt (discontinuous) context changes occurring during an
episode; and c) Markovian context evolution. We argue that this challenging case is often met in applications and we tackle it using a Bayesian model-based approach and variational inference. We adapt a sticky Hierarchical Dirichlet Process (HDP) prior for model learning, which is arguably best-suited for infinite Markov chain modeling. We then derive a context distillation procedure, which identifies and removes spurious contexts in an unsupervised fashion. We argue that the combination of these two components allows inferring the number of contexts from data thus dealing with the context cardinality assumption. We then find the representation of the optimal policy enabling efficient policy learning using off-the-shelf RL algorithms. Finally, we demonstrate empirically (using gym environments cart-pole swing-up, drone, intersection) that our approach succeeds where state-of-the-art methods of other frameworks fail and elaborate on the reasons for such failures.", "keywords": "context-dependent Reinforcement Learning;model-based reinforcement learning;hierarchical Dirichlet process", "primary_area": "", "supplementary_material": "", "author": "Hang Ren;Aivar Sootla;Taher Jafferjee;Junxiao Shen;Jun Wang;Haitham Bou Ammar", "authorids": "~Hang_Ren2;~Aivar_Sootla1;~Taher_Jafferjee1;~Junxiao_Shen1;~Jun_Wang2;~Haitham_Bou_Ammar1", "gender": "M;M;Not Specified;M;M;M", "homepage": ";;https://atlashugs.github.io/;https://shawnshenjx.github.io/;http://www0.cs.ucl.ac.uk/staff/jun.wang/;", "dblp": ";66/9184;267/1551;293/8962;w/JunWang12;", "google_scholar": "6nY_XbwAAAAJ;https://scholar.google.co.uk/citations?hl=en;;0-qmmqkAAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;https://scholar.google.co.uk/citations?user=AE5suDoAAAAJ", "orcid": ";;;0000-0002-1552-4689;;", "linkedin": ";;;shawn-shen-oix/;;", "or_profile": "~Hang_Ren2;~Aivar_Sootla1;~Taher_Jafferjee1;~Junxiao_Shen1;~Jun_Wang2;~Haitham_Bou_Ammar1", "aff": "Huawei Technologies Ltd.;Huawei R&D UK;Huawei Technologies Ltd.;University of Cambridge;University College London;Huawei R&D UK", "aff_domain": "huawei.com;huawei.com;huawei.com;cam.ac.uk;ucl.ac.uk;huawei.com", "position": "Researcher;Research scientist;Researcher;PhD student;Professor;Principal Researcher", "bibtex": "@inproceedings{\nren2022reinforcement,\ntitle={Reinforcement Learning in Presence of Discrete Markovian Context Evolution },\nauthor={Hang Ren and Aivar Sootla and Taher Jafferjee and Junxiao Shen and Jun Wang and Haitham Bou Ammar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CmsfC7u054S}\n}", "github": "", "project": "", "reviewers": "w49w;w8qw;q8Tn;HPaX;D986", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "4;4;3;3;2", "correctness": "4;3;3;3;4", "technical_novelty": "3;3;2;4;3", "empirical_novelty": "2;2;2;3;0", "wc_summary_paper": "111;91;76;61;102", "wc_summary_review": "52;64;75;38;25", "wc_main_review": "680;316;462;154;283", "wc_review": "843;471;613;253;410", "wc_reply_reviewers": "146;0;145;0;0", "wc_reply_authors": "1975;1039;2363;692;284", "reply_reviewers": "1;0;2;0;0", "reply_authors": "5;3;5;2;2", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 1.8, 0.9797958971132712 ], "wc_summary_paper_avg": [ 88.2, 17.92651667223725 ], "wc_summary_review_avg": [ 50.8, 17.837040113202637 ], 
"wc_main_review_avg": [ 379.0, 179.61069010501575 ], "wc_review_avg": [ 518.0, 199.37301723152007 ], "wc_reply_reviewers_avg": [ 58.2, 71.2808529690828 ], "wc_reply_authors_avg": [ 1270.6, 781.193087527021 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 3.4, 1.3564659966250538 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7637626158259733, "corr_recommendation_correctness": 0.16666666666666663, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1037235936590031861&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=CmsfC7u054S", "email": "huawei.com;huawei.com;huawei.com;cam.ac.uk;ucl.ac.uk;huawei.com", "author_num": 6, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Huawei;University of Cambridge;University College London", "aff_unique_dep": "Huawei Technologies;;", "aff_unique_url": "https://www.huawei.com;https://www.cam.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "Huawei;Cambridge;UCL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;0;1;1;1", "aff_country_unique": "China;United Kingdom" }, { "id": "CoMOKHYWf2", "title": "AdaFocal: Calibration-aware Adaptive Focal Loss", "track": "main", "status": "Reject", "tldr": "", "abstract": "Much recent work has been devoted to the problem of ensuring that a neural network's confidence scores match the true probability of being correct, i.e. the calibration problem. Of note, it was found that training with Focal loss leads to better calibrated deep networks than cross-entropy loss, while achieving the same level of accuracy \\cite{mukhoti2020}. This success stems from Focal loss regularizing the entropy of the network's prediction (controlled by the hyper-parameter $\\gamma$), thereby reining in the network's overconfidence. Further improvements in calibration can be achieved if $\\gamma$ is selected independently for each training sample. However, the proposed strategy (named FLSD-53) is based on simple heuristics which, when selecting the $\\gamma$, does not take into account any knowledge of whether the network is under or over confident about such samples and by how much. As a result, in most cases, this strategy performs only slightly better. In this paper, we propose a calibration-aware sample-dependent Focal loss called AdaFocal that adaptively modifies $\\gamma$ from one training step to the next based on the information about the network's current calibration behaviour. At each training step $t$, AdaFocal adjusts the $\\gamma_t$ based on (1) $\\gamma_{t-1}$ of the previous training step (2) the magnitude of the network's under/over-confidence. We evaluate our proposed method on various image recognition and NLP tasks, covering a variety of network architectures, and confirm that AdaFocal consistently achieves significantly better calibration than the competing state-of-the-art methods without loss of accuracy.", "keywords": "neural networks;uncertainity calibration;out of distribution detection", "primary_area": "", "supplementary_material": "/attachment/829d0640d358dd3f62e9b6cf025d685212cee9e1.zip", "author": "Arindam Ghosh;Thomas Schaaf;Matthew R. 
Gormley", "authorids": "~Arindam_Ghosh3;~Thomas_Schaaf2;~Matthew_R._Gormley1", "gender": "M;M;M", "homepage": "https://arndmghosh.github.io/;;http://www.cs.cmu.edu/~mgormley/", "dblp": "62/10648-3;75/5600;116/0475", "google_scholar": "qk39CdYAAAAJ;gyDVO1IAAAAJ;GU0SZmYAAAAJ", "orcid": "0000-0002-4545-0879;0000-0002-9569-4759;", "linkedin": "arndm-ghosh/;thomasschaaf/;", "or_profile": "~Arindam_Ghosh3;~Thomas_Schaaf2;~Matthew_R._Gormley1", "aff": "3M/Solventum Healthcare;3M | M*Modal;Carnegie Mellon University", "aff_domain": "solventum.com;mmm.com;cs.cmu.edu", "position": "Researcher;Principal Researcher;Assistant Teaching Professor", "bibtex": "@misc{\nghosh2022adafocal,\ntitle={AdaFocal: Calibration-aware Adaptive Focal Loss},\nauthor={Arindam Ghosh and Thomas Schaaf and Matthew R. Gormley},\nyear={2022},\nurl={https://openreview.net/forum?id=CoMOKHYWf2}\n}", "github": "", "project": "", "reviewers": "u2LZ;veY9;5y1y;3Y6N", "site": "https://openreview.net/forum?id=CoMOKHYWf2", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "89;64;41;48", "wc_summary_review": "29;81;80;38", "wc_main_review": "352;256;391;137", "wc_review": "470;401;512;223", "wc_reply_reviewers": "59;0;213;0", "wc_reply_authors": "1501;930;1271;254", "reply_reviewers": "1;0;2;0", "reply_authors": "3;2;3;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 60.5, 18.445866745696716 ], "wc_summary_review_avg": [ 57.0, 23.717082451262844 ], "wc_main_review_avg": [ 284.0, 98.06375477208692 ], "wc_review_avg": [ 401.5, 110.41399367833772 ], "wc_reply_reviewers_avg": [ 68.0, 87.11199687758283 ], "wc_reply_authors_avg": [ 989.0, 470.47157193607353 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11259339387617997995&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1", "aff_unique_norm": "3M;Carnegie Mellon University", "aff_unique_dep": "Solventum Healthcare;", "aff_unique_url": "https://www.3m.com;https://www.cmu.edu", "aff_unique_abbr": "3M;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "LIGS: Learnable Intrinsic-Reward Generation Selection for Multi-Agent Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6617", "id": "CpTuR2ECuW", "poster": "", "openreview": "https://openreview.net/forum?id=CpTuR2ECuW", "slides": "https://iclr.cc/virtual/2022/poster/6617", "video": "https://iclr.cc/virtual/2022/poster/6617", "author_site": "David Mguni, Taher Jafferjee, Jianhong Wang, Nicolas Perez-Nieves, Oliver Slumbers, Feifei Tong, , Jiangcheng Zhu, Yaodong Yang, Jun Wang", "tldr": "", "abstract": "Efficient exploration is important for reinforcement learners (RL) to achieve high rewards. In multi-agent systems, coordinated exploration and behaviour is critical for agents to jointly achieve optimal outcomes. 
In this paper, we introduce a new general framework for improving coordination and performance of multi-agent reinforcement learners (MARL). Our framework, named Learnable Intrinsic-Reward Generation Selection algorithm (LIGS) introduces an adaptive learner, Generator that observes the agents and learns to construct intrinsic rewards online that coordinate the agents\u2019 joint exploration and joint behaviour. Using a novel combination of reinforcement learning (RL) and switching controls, LIGS determines the best states to learn to add intrinsic rewards which leads to a highly efficient learning process. LIGS can subdivide complex tasks making them easier to solve and enables systems of RL agents to quickly solve environments with sparse rewards. LIGS can seamlessly adopt existing multi-agent RL algorithms and our theory shows that it ensures convergence to joint policies that deliver higher system performance. We demonstrate the superior performance of the LIGS framework in challenging tasks in Foraging and StarCraft II and show LIGS is capable of tackling tasks previously unsolvable by MARL methods.", "keywords": "multi-agent;reinforcement learning;intrinsic rewards;exploration", "primary_area": "", "supplementary_material": "/attachment/b823113899439f18f37f06f2b8037d3fb3001610.zip", "author": "David Henry Mguni;Taher Jafferjee;Jianhong Wang;Nicolas Perez-Nieves;Oliver Slumbers;Feifei Tong;Yang Li;Jiangcheng Zhu;Yaodong Yang;Jun Wang", "authorids": "~David_Henry_Mguni1;~Taher_Jafferjee1;~Jianhong_Wang1;~Nicolas_Perez-Nieves1;~Oliver_Slumbers1;~Feifei_Tong1;liyang2@shanghaitech.edu.cn;~Jiangcheng_Zhu1;~Yaodong_Yang1;~Jun_Wang2", "gender": "M;Not Specified;M;;;M;;M;M;M", "homepage": ";https://atlashugs.github.io/;https://hsvgbkhgbv.github.io/;;;http://huaweicloud.com;;;https://www.yangyaodong.com;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "217/2369;267/1551;;;285/5044;;;202/5904.html;170/1496-1;w/JunWang12", "google_scholar": "K-_yzBsAAAAJ;;K1FKF3IAAAAJ;OqOeYNoAAAAJ;obYGSVIAAAAJ;;;ZosT8hcAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;;0000-0003-1586-0399;;;;;0000-0001-8132-5613;", "linkedin": ";;jianhong-wang-45995b100/;;;;;https://cn.linkedin.com/in/%E7%96%86%E6%88%90-%E6%9C%B1-85672b169;yaodong-yang;", "or_profile": "~David_Henry_Mguni1;~Taher_Jafferjee1;~Jianhong_Wang1;~Nicolas_Perez-Nieves1;~Oliver_Slumbers1;~Feifei_Tong1;liyang2@shanghaitech.edu.cn;~Jiangcheng_Zhu1;~Yaodong_Yang1;~Jun_Wang2", "aff": "Queen Mary University, London;Huawei Technologies Ltd.;Imperial College London;Imperial College London;University College London;Huawei Technologies Ltd.;;Huawei Technologies Ltd.;King's College London;University College London", "aff_domain": "qmul.ac.uk;huawei.com;ic.ac.uk;ic.ac.uk;ucl.ac.uk;huawei.com;;huawei.com;kcl.ac.uk;ucl.ac.uk", "position": "Lecturer;Researcher;PhD student;PhD student;PhD student;Engineer;;Researcher;Assistant Professor;Professor", "bibtex": "@inproceedings{\nmguni2022ligs,\ntitle={{LIGS}: Learnable Intrinsic-Reward Generation Selection for Multi-Agent Learning },\nauthor={David Henry Mguni and Taher Jafferjee and Jianhong Wang and Nicolas Perez-Nieves and Oliver Slumbers and Feifei Tong and Yang Li and Jiangcheng Zhu and Yaodong Yang and Jun Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CpTuR2ECuW}\n}", "github": "", "project": "", "reviewers": "crmy;vU2u;W87Z;nTCC", "pdf_size": 0, 
"recommendation": "5;5;6;8", "confidence": "4;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;2;0;4", "wc_summary_paper": "54;55;87;115", "wc_summary_review": "29;60;131;57", "wc_main_review": "248;365;669;425", "wc_review": "331;480;887;597", "wc_reply_reviewers": "0;0;0;26", "wc_reply_authors": "371;520;815;378", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 77.75, 25.272267409158207 ], "wc_summary_review_avg": [ 69.25, 37.64555086593899 ], "wc_main_review_avg": [ 426.75, 153.66583061956226 ], "wc_review_avg": [ 573.75, 203.95020838430148 ], "wc_reply_reviewers_avg": [ 6.5, 11.258330249197702 ], "wc_reply_authors_avg": [ 521.0, 179.85132749023566 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5525262951349473167&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=CpTuR2ECuW", "email": "qmul.ac.uk;huawei.com;ic.ac.uk;ic.ac.uk;ucl.ac.uk;huawei.com;;huawei.com;kcl.ac.uk;ucl.ac.uk", "author_num": 10, "aff_unique_index": "0;1;2;2;3;1;1;4;3", "aff_unique_norm": "Queen Mary University of London;Huawei;Imperial College London;University College London;King's College London", "aff_unique_dep": ";Huawei Technologies;;;", "aff_unique_url": "https://www.qmul.ac.uk;https://www.huawei.com;https://www.imperial.ac.uk;https://www.ucl.ac.uk;https://www.kcl.ac.uk", "aff_unique_abbr": "QMUL;Huawei;ICL;UCL;KCL", "aff_campus_unique_index": "0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1;0;0;0;1;1;0;0", "aff_country_unique": "United Kingdom;China" }, { "id": "CpgtwW8GBxe", "title": "Label Refining: a semi-supervised method to extract voice characteristics without ground truth", "track": "main", "status": "Reject", "tldr": "", "abstract": "A characteristic is a distinctive trait shared by a group of observations which may be used to identify them. In the context of voice casting for audiovisual productions, characteristic extraction has an important role since it can help explaining the decisions of a voice recommendation system, or give modalities to the user with the aim to express a voice search request. Unfortunately, the lack of standard taxonomy to describe comedian voices prevents the implementation of an annotation protocol. To address this problem, we propose a new semi-supervised learning method entitled Label Refining that consists in extracting refined labels (e.g. vocal characteristics) from known initial labels (e.g. character played in a recording). Our proposed method first suggests using a representation extractor based on the initial labels, then computing refined labels using a clustering algorithm to finally train a refined representation extractor. The method is validated by applying Label Refining on recordings from the video game MassEffect 3. 
Experiments show that, using a subsidiary corpus, it is possible to bring out interesting voice characteristics without any a priori knowledge.", "keywords": "characteristics;characteristics extraction;characteristics evaluation;voice characteristics;label refining;refined labels;semi-supervision", "primary_area": "", "supplementary_material": "", "author": "Mathias Quillot;Richard Dufour;Jean-fran\u00e7ais Bonastre", "authorids": "~Mathias_Quillot1;richard.dufour@univ-avignon.fr;jean-francois.bonastre@univ-avignon.fr", "gender": "M;;", "homepage": "https://www.researchgate.net/profile/Mathias-Quillot;;", "dblp": ";;", "google_scholar": ";;", "orcid": "0000-0002-5858-2416;;", "linkedin": "mathias-quillot-43958894/;;", "or_profile": "~Mathias_Quillot1;richard.dufour@univ-avignon.fr;jean-francois.bonastre@univ-avignon.fr", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nquillot2022label,\ntitle={Label Refining: a semi-supervised method to extract voice characteristics without ground truth},\nauthor={Mathias Quillot and Richard Dufour and Jean-fran{\\c{c}}ais Bonastre},\nyear={2022},\nurl={https://openreview.net/forum?id=CpgtwW8GBxe}\n}", "github": "", "project": "", "reviewers": "LKP9;eoTk;dJJ2;acRU;AFut", "site": "https://openreview.net/forum?id=CpgtwW8GBxe", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "3;4;3;3;3", "correctness": "3;2;2;2;3", "technical_novelty": "1;2;2;1;3", "empirical_novelty": "1;2;2;1;2", "wc_summary_paper": "65;70;208;75;40", "wc_summary_review": "39;75;44;48;18", "wc_main_review": "62;248;426;263;144", "wc_review": "166;393;678;386;202", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 1.8, 0.7483314773547883 ], "empirical_novelty_avg": [ 1.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 91.6, 59.43265095887949 ], "wc_summary_review_avg": [ 44.8, 18.301912468373352 ], "wc_main_review_avg": [ 228.6, 122.83256897093702 ], "wc_review_avg": [ 365.0, 181.85928626275867 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.25, "corr_recommendation_correctness": 0.6123724356957946, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cXsH62iB5ZcJ:scholar.google.com/&scioq=Label+Refining:+a+semi-supervised+method+to+extract+voice+characteristics+without+ground+truth&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Explainable GNN-Based Models over Knowledge Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5985", "id": "CrCvGNHAIrz", "poster": "", "openreview": "https://openreview.net/forum?id=CrCvGNHAIrz", "slides": "https://iclr.cc/virtual/2022/poster/5985", "video": "https://iclr.cc/virtual/2022/poster/5985", "author_site": "David Jaime Tena Cucala, Bernardo Grau, Egor Kostylev, Boris Motik", "tldr": "", "abstract": "Graph Neural Networks (GNNs) are often used to learn transformations of graph data. While effective in practice, such approaches make predictions via numeric manipulations so their output cannot be easily explained symbolically. 
We propose a new family of GNN-based transformations of graph data that can be trained effectively, but where all predictions can be explained symbolically as logical inferences in Datalog\u2014a well-known rule-based formalism. In particular, we show how to encode an input knowledge graph into a graph with numeric feature vectors, process this graph using a GNN, and decode the result into an output knowledge graph. We use a new class of monotonic GNNs (MGNNs) to ensure that this process is equivalent to a round of application of a set of Datalog rules. We also show that, given an arbitrary MGNN, we can automatically extract rules that completely characterise the transformation. We evaluate our approach by applying it to classification tasks in knowledge graph completion.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/e3d778f6167712f6cecccb920f51e85ce3ccba69.zip", "author": "David Jaime Tena Cucala;Bernardo Cuenca Grau;Egor V. Kostylev;Boris Motik", "authorids": "~David_Jaime_Tena_Cucala1;~Bernardo_Cuenca_Grau1;~Egor_V._Kostylev2;~Boris_Motik1", "gender": "Non-Binary;;M;M", "homepage": "https://www.cs.ox.ac.uk/people/david.tenacucala/;https://www.cs.ox.ac.uk/people/bernardo.cuencagrau/;https://www.mn.uio.no/ifi/english/people/aca/egork/index.html;https://www.cs.ox.ac.uk/people/boris.motik/", "dblp": ";71/6448;;56/1508", "google_scholar": ";THu1uZMAAAAJ;Y2gdVmIAAAAJ;gKlqqSEAAAAJ", "orcid": ";;0000-0002-8886-6129;0000-0003-2506-4118", "linkedin": ";;egor-kostylev/;", "or_profile": "~David_Jaime_Tena_Cucala1;~Bernardo_Cuenca_Grau1;~Egor_V._Kostylev2;~Boris_Motik1", "aff": "University of Oxford;University of Oxford;University of Oslo, Norway;Department of Computer Science, University of Oxford", "aff_domain": "ox.ac.uk;cs.ox.ac.uk;uio.no;cs.ox.ac.uk", "position": "Postdoc;Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ncucala2022explainable,\ntitle={Explainable {GNN}-Based Models over Knowledge Graphs},\nauthor={David Jaime Tena Cucala and Bernardo Cuenca Grau and Egor V. 
Kostylev and Boris Motik},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CrCvGNHAIrz}\n}", "github": "", "project": "", "reviewers": "PNwA;EWub;AzED;Eofg", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "89;26;174;145", "wc_summary_review": "25;44;60;274", "wc_main_review": "540;355;694;2306", "wc_review": "654;425;928;2725", "wc_reply_reviewers": "35;194;0;2902", "wc_reply_authors": "1348;1570;1345;5849", "reply_reviewers": "1;1;0;8", "reply_authors": "3;2;2;13", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.5, 56.58842637854493 ], "wc_summary_review_avg": [ 100.75, 100.79031451483819 ], "wc_main_review_avg": [ 973.75, 778.4826186242054 ], "wc_review_avg": [ 1183.0, 907.9088610648098 ], "wc_reply_reviewers_avg": [ 782.75, 1225.7318987037909 ], "wc_reply_authors_avg": [ 2528.0, 1919.5503379698068 ], "reply_reviewers_avg": [ 2.5, 3.2015621187164243 ], "reply_authors_avg": [ 5.0, 4.636809247747852 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9041168522961299201&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=CrCvGNHAIrz", "email": "ox.ac.uk;cs.ox.ac.uk;uio.no;cs.ox.ac.uk", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Oxford;University of Oslo", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.uio.no", "aff_unique_abbr": "Oxford;UiO", "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United Kingdom;Norway" }, { "id": "CrXLp_yeA-K", "title": "Conversational Artificial Intelligence in Natural Language Processing Application with Lifelong Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A conversational AI bot, known as a chatbot, has many roles in human life today, such as answering uncomplicated questions on e-commerce website pages or even helping as an assistant like Siri and Google Assistant. However, chatbots are currently limited by the insufficient knowledge available in a predetermined training dataset. So when a chatbot receives an unfamiliar question, it may fail to understand the question. Therefore, we built a chatbot using the lifelong learning method, so that the chatbot can train on unfamiliar incoming data without retraining the model from scratch. The performance benchmark used for the system is the average confidence score of the tests carried out.
The system has an average confidence score of 0.80952.", "keywords": "Conversational AI;Lifelong Learning;Machine Learning;Natural Language Processing;Neural Network.", "primary_area": "", "supplementary_material": "/attachment/818863ca66622bceab38842ba9c76a4701935f84.zip", "author": "Ade Oktavianus Kurniawan;Kevin Natio Banjarnahor", "authorids": "~Ade_Oktavianus_Kurniawan1;~Kevin_Natio_Banjarnahor1", "gender": "M;M", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "ade-oktavianus-kurniawan-9959a71b3/;kevin-natio-b-782628107/", "or_profile": "~Ade_Oktavianus_Kurniawan1;~Kevin_Natio_Banjarnahor1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkurniawan2022conversational,\ntitle={Conversational Artificial Intelligence in Natural Language Processing Application with Lifelong Learning},\nauthor={Ade Oktavianus Kurniawan and Kevin Natio Banjarnahor},\nyear={2022},\nurl={https://openreview.net/forum?id=CrXLp_yeA-K}\n}", "github": "", "project": "", "reviewers": "AyHu;eFye;swzn;1QsS;pknj", "site": "https://openreview.net/forum?id=CrXLp_yeA-K", "pdf_size": 0, "recommendation": "1;1;1;1;3", "confidence": "5;5;4;4;4", "correctness": "2;1;1;2;2", "technical_novelty": "1;1;1;1;2", "empirical_novelty": "1;1;1;1;1", "wc_summary_paper": "184;23;77;38;104", "wc_summary_review": "21;7;114;16;40", "wc_main_review": "129;37;294;53;111", "wc_review": "334;67;485;107;255", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 1.4, 0.8 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 1.6, 0.4898979485566356 ], "technical_novelty_avg": [ 1.2, 0.4 ], "empirical_novelty_avg": [ 1.0, 0.0 ], "wc_summary_paper_avg": [ 85.2, 57.05926743308224 ], "wc_summary_review_avg": [ 39.6, 38.73293172482558 ], "wc_main_review_avg": [ 124.8, 91.32447645620532 ], "wc_review_avg": [ 249.6, 152.47504713886795 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.4082482904638631, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vfZRUi5evuIJ:scholar.google.com/&scioq=Conversational+Artificial+Intelligence+in+Natural+Language+Processing+Application+with+Lifelong+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Ctjb37IOldV", "title": "A Variance Principle Explains why Dropout Finds Flatter Minima", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although dropout has achieved great success in deep learning, little is known about how it helps the training find a good generalization solution in the high-dimensional parameter space. In this work, we show that training with dropout finds a neural network with a flatter minimum compared with standard gradient descent training. We further study, through experiments, the underlying mechanism of why dropout finds flatter minima. We propose a Variance Principle: the variance of the noise is larger along sharper directions of the loss landscape. Existing works show that SGD satisfies the variance principle, which leads the training to flatter minima. Our work shows that the noise induced by dropout also satisfies the variance principle, which explains why dropout finds flatter minima.
In general, our work points out that the variance principle is an important similarity between dropout and SGD that leads the training to find flatter minima and obtain good generalization.", "keywords": "dropout;stochastic gradient descent;loss landscape;flatness;neural network", "primary_area": "", "supplementary_material": "/attachment/270d480c7abbf3c27ac59076eeb50fb7535adb5f.zip", "author": "Zhongwang Zhang;Hanxu Zhou;Zhiqin Xu", "authorids": "~Zhongwang_Zhang1;~Hanxu_Zhou1;~Zhiqin_Xu1", "gender": ";;M", "homepage": "https://sjtuzzw.github.io/;;https://ins.sjtu.edu.cn/people/xuzhiqin/", "dblp": "293/9763;;223/4493.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=ypD3aL8AAAAJ;EjLvG5cAAAAJ", "orcid": ";;0000-0002-0122-0879", "linkedin": ";;", "or_profile": "~Zhongwang_Zhang1;~Hanxu_Zhou1;~Zhiqin_Xu1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nzhang2022a,\ntitle={A Variance Principle Explains why Dropout Finds Flatter Minima},\nauthor={Zhongwang Zhang and Hanxu Zhou and Zhiqin Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=Ctjb37IOldV}\n}", "github": "", "project": "", "reviewers": "Y6pQ;dtVh;HpHx;mVLJ", "site": "https://openreview.net/forum?id=Ctjb37IOldV", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;2;3;2", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "39;143;41;27", "wc_summary_review": "5;82;61;3", "wc_main_review": "81;272;228;76", "wc_review": "125;497;330;106", "wc_reply_reviewers": "0;0;279;54", "wc_reply_authors": "308;541;1450;564", "reply_reviewers": "0;0;2;1", "reply_authors": "3;3;5;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 1.224744871391589 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.5, 46.78407848830626 ], "wc_summary_review_avg": [ 37.75, 34.56425176392511 ], "wc_main_review_avg": [ 164.25, 87.16758285050699 ], "wc_review_avg": [ 264.5, 160.41274886990746 ], "wc_reply_reviewers_avg": [ 83.25, 115.14637423731587 ], "wc_reply_authors_avg": [ 715.75, 435.58832342017615 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.5, 0.8660254037844386 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13657794825539774053&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Scarf: Self-Supervised Contrastive Learning using Random Feature Corruption", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6296", "id": "CuV_qYkmKb3", "poster": "", "openreview": "https://openreview.net/forum?id=CuV_qYkmKb3", "slides": "https://iclr.cc/virtual/2022/poster/6296", "video": "https://iclr.cc/virtual/2022/poster/6296", "author_site": "Dara Bahri, Heinrich Jiang, Yi Tay, Donald
Metzler", "tldr": "", "abstract": "Self-supervised contrastive representation learning has proved incredibly successful in the vision and natural language domains, enabling state-of-the-art performance with orders of magnitude less labeled data. However, such methods are domain-specific and little has been done to leverage this technique on real-world \\emph{tabular} datasets. We propose \\textsc{Scarf}, a simple, widely-applicable technique for contrastive learning, where views are formed by corrupting a random subset of features. When applied to pre-train deep neural networks on the 69 real-world, tabular classification datasets from the OpenML-CC18 benchmark, \\textsc{Scarf} not only improves classification accuracy in the fully-supervised setting but does so also in the presence of label noise and in the semi-supervised setting where only a fraction of the available training data is labeled. We show that \\textsc{Scarf} complements existing strategies and outperforms alternatives like autoencoders. We conduct comprehensive ablations, detailing the importance of a range of factors.", "keywords": "self-supervised learning;tabular data;pre-training;contrastive learning;openML", "primary_area": "", "supplementary_material": "", "author": "Dara Bahri;Heinrich Jiang;Yi Tay;Donald Metzler", "authorids": "~Dara_Bahri1;~Heinrich_Jiang1;~Yi_Tay1;~Donald_Metzler1", "gender": "M;M;M;M", "homepage": "http://www.dara.run;;http://yitay.net;https://research.google/people/DonaldMetzler/", "dblp": "231/7656;182/2472;;95/2272", "google_scholar": "j5PpTOwAAAAJ;;VBclY_cAAAAJ;bmXpOd8AAAAJ", "orcid": ";;;0000-0003-4276-6269", "linkedin": ";;;donmetzler/", "or_profile": "~Dara_Bahri1;~Heinrich_Jiang1;~Yi_Tay1;~Donald_Metzler1", "aff": "Google Research;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com", "position": "Research Scientist;Research scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nbahri2022scarf,\ntitle={Scarf: Self-Supervised Contrastive Learning using Random Feature Corruption},\nauthor={Dara Bahri and Heinrich Jiang and Yi Tay and Donald Metzler},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CuV_qYkmKb3}\n}", "github": "", "project": "", "reviewers": "7Frm;swwj;6ZpH;y3t1", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;4", "correctness": "3;4;4;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "41;82;105;93", "wc_summary_review": "34;69;47;40", "wc_main_review": "285;241;145;385", "wc_review": "360;392;297;518", "wc_reply_reviewers": "0;281;17;27", "wc_reply_authors": "450;920;42;388", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.25, 24.076700355322778 ], "wc_summary_review_avg": [ 47.5, 13.238202294873727 ], "wc_main_review_avg": [ 264.0, 86.27282306729043 ], "wc_review_avg": [ 391.75, 80.50582277077851 ], "wc_reply_reviewers_avg": [ 81.25, 115.72893976875447 ], "wc_reply_authors_avg": [ 450.0, 312.733113053287 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 215, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=14781334777712117363&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=CuV_qYkmKb3", "email": "google.com;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "CxebB5Psl1", "title": "Graph Similarities and Dual Approach for Sequential Text-to-Image Retrieval", "track": "main", "status": "Reject", "tldr": "", "abstract": "Sequential text-to-image retrieval, a.k.a. Story-to-images task, requires semantic alignment with a given story and maintaining global coherence in drawn image sequence simultaneously. Most of the previous works have only focused on modeling how to follow the content of a given story faithfully. This kind of overfitting tendency hinders matching structural similarity between images, causing an inconsistency in global visual information such as backgrounds. To handle this imbalanced problem, we propose a novel image sequence retrieval framework that utilizes scene graph similarities of the images and a dual learning scheme. Scene graph describes high-level information of visual groundings and adjacency relations of the key entities in a visual scene. In our proposed retriever, the graph encoding head learns to maximize graph embedding similarities among sampled images, giving a strong signal that forces the retriever to also consider morphological relevance with previously sampled images. We set a video captioning as a dual learning task that reconstructs the input story from the sampled image sequence. This inverse mapping gives informative feedback for our proposed retrieval system to maintain global contextual information of a given story. We also suggest a new contextual sentence encoding architecture to embed a sentence in consideration of the surrounding context. 
Through extensive experiments, Our proposed framework shows better qualitative and quantitative performance with Visual Storytelling benchmark compared to conventional story-to-image models.", "keywords": "sequential text-to-image retrieval;story-to-image retrieval;scene graph embedding;dual learning", "primary_area": "", "supplementary_material": "", "author": "Keonwoo Kim;Sihyeon Jo;Seong-Woo Kim", "authorids": "~Keonwoo_Kim1;sihyeonjo@snu.ac.kr;~Seong-Woo_Kim1", "gender": "M;;M", "homepage": ";;https://arisnu.squarespace.com/", "dblp": ";;00/653", "google_scholar": ";;VlVqpq8AAAAJ", "orcid": ";;", "linkedin": "keonwoo-kim-b08688201;;", "or_profile": "~Keonwoo_Kim1;sihyeonjo@snu.ac.kr;~Seong-Woo_Kim1", "aff": "Seoul National University;;Seoul National University", "aff_domain": "snu.ac.kr;;snu.ac.kr", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\nkim2022graph,\ntitle={Graph Similarities and Dual Approach for Sequential Text-to-Image Retrieval},\nauthor={Keonwoo Kim and Sihyeon Jo and Seong-Woo Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=CxebB5Psl1}\n}", "github": "", "project": "", "reviewers": "1pAQ;6mhj;iXYw", "site": "https://openreview.net/forum?id=CxebB5Psl1", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "5;5;3", "correctness": "2;2;4", "technical_novelty": "1;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "235;100;94", "wc_summary_review": "104;30;92", "wc_main_review": "994;274;477", "wc_review": "1333;404;663", "wc_reply_reviewers": "11;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "1;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 143.0, 65.09992319503918 ], "wc_summary_review_avg": [ 75.33333333333333, 32.42769735204082 ], "wc_main_review_avg": [ 581.6666666666666, 303.1131068686334 ], "wc_review_avg": [ 800.0, 391.43922474206215 ], "wc_reply_reviewers_avg": [ 3.6666666666666665, 5.185449728701348 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RLRQFdJkrcgJ:scholar.google.com/&scioq=Graph+Similarities+and+Dual+Approach+for+Sequential+Text-to-Image+Retrieval&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "Cy0n0WCvLPU", "title": "Topic Aware Neural Language Model: Domain Adaptation of Unconditional Text Generation Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Our goal is to adapt pre-trained neural language models (NLMs) to the unconditional text generation task within the target domain.\nBecause many Transformer based NLMs are trained on more massive and heterogeneous corpora than this target domain,\nthe difference between these corpora and the target domain raises the question of whether these NLMs 
can provide their benefits to this task even after the fine-tuning.\nTo tackle these problems, our approach focuses on topics to bridge the semantic gap between these corpora and the target domain corpus,\nand relates them at a topic level.\nThat is, this approach injects topics into these NLMs and trains them via topics behind these dependencies over segments,\nintroducing both topic alignment (TA) and training tasks (TDM and TEM),\nwhile previous Transformer based NLMs are better at learning from the predefined segment length such as the context.\nExperiments show that this approach contributes to resolve the imbalance between these corpora,\nand can tailor previous pre-trained NLMs to generate coherent and semantically valid text reflecting a given small fine-tuning corpus.", "keywords": "Neural language models;Unconditional Text Generation;Transformer", "primary_area": "", "supplementary_material": "", "author": "Noriaki Kawamae", "authorids": "~Noriaki_Kawamae1", "gender": "", "homepage": "", "dblp": "70/1704", "google_scholar": "https://scholar.google.co.jp/citations?user=ylU8pzwAAAAJ", "orcid": "0000-0002-0746-9624", "linkedin": "", "or_profile": "~Noriaki_Kawamae1", "aff": "NTT Comware", "aff_domain": "nttcom.co.jp", "position": "Evangelist", "bibtex": "@misc{\nkawamae2022topic,\ntitle={Topic Aware Neural Language Model: Domain Adaptation of Unconditional Text Generation Models},\nauthor={Noriaki Kawamae},\nyear={2022},\nurl={https://openreview.net/forum?id=Cy0n0WCvLPU}\n}", "github": "", "project": "", "reviewers": "zi8M;ifRk;Lp9h", "site": "https://openreview.net/forum?id=Cy0n0WCvLPU", "pdf_size": 0, "recommendation": "1;5;5", "confidence": "2;3;2", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "135;54;64", "wc_summary_review": "18;18;15", "wc_main_review": "265;179;208", "wc_review": "418;251;287", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 1.8856180831641267 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 84.33333333333333, 36.058594290712755 ], "wc_summary_review_avg": [ 17.0, 1.4142135623730951 ], "wc_main_review_avg": [ 217.33333333333334, 35.72425257751689 ], "wc_review_avg": [ 318.6666666666667, 71.76040381405024 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kQLvqH5BrJ4J:scholar.google.com/&scioq=Topic+Aware+Neural+Language+Model:+Domain+Adaptation+of+Unconditional+Text+Generation+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "NTT Comware", "aff_unique_dep": "", "aff_unique_url": "https://www.ntt-comware.co.jp", "aff_unique_abbr": "", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "Transition to Linearity of Wide Neural Networks is an Emerging Property of Assembling Weak Models", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7200", "id": 
"CyKHoKyvgnp", "poster": "", "openreview": "https://openreview.net/forum?id=CyKHoKyvgnp", "slides": "https://iclr.cc/virtual/2022/poster/7200", "video": "https://iclr.cc/virtual/2022/poster/7200", "author_site": "Chaoyue Liu, Libin Zhu, Mikhail Belkin", "tldr": "", "abstract": "Wide neural networks with linear output layer have been shown to be near-linear, and to have near-constant neural tangent kernel (NTK), in a region containing the optimization path of gradient descent. These findings seem counter-intuitive since in general neural networks are highly complex models. Why does a linear structure emerge when the neural networks become wide? \nIn this work, we provide a new perspective on this \"transition to linearity\" by considering a neural network as an assembly model recursively built from a set of sub-models corresponding to individual neurons. In this view, we show that the linearity of wide neural networks is, in fact, an emerging property of assembling a large number of diverse ``weak'' sub-models, none of which dominate the assembly. ", "keywords": "Assembling;linearity;Transition to linearity;wide neural networks", "primary_area": "", "supplementary_material": "", "author": "Chaoyue Liu;Libin Zhu;Misha Belkin", "authorids": "~Chaoyue_Liu2;~Libin_Zhu1;~Misha_Belkin1", "gender": "M;M;", "homepage": "https://cliu212.github.io/;;http://misha.belkin-wang.org/", "dblp": "191/6684-1;260/0355;", "google_scholar": "sRjoMX0AAAAJ;hyTGiUcAAAAJ;Iwd9DdkAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chaoyue_Liu2;~Libin_Zhu1;~Misha_Belkin1", "aff": "Meta Facebook;University of California, San Diego;University of California, San Diego", "aff_domain": "fb.com;ucsd.edu;ucsd.edu", "position": "Researcher;PhD student;Professor", "bibtex": "@inproceedings{\nliu2022transition,\ntitle={Transition to Linearity of Wide Neural Networks is an Emerging Property of Assembling Weak Models},\nauthor={Chaoyue Liu and Libin Zhu and Misha Belkin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CyKHoKyvgnp}\n}", "github": "", "project": "", "reviewers": "nTFG;rv6p;r126", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "2;3;3", "correctness": "3;4;4", "technical_novelty": "2;4;3", "empirical_novelty": "0;0;0", "wc_summary_paper": "69;88;105", "wc_summary_review": "25;93;95", "wc_main_review": "123;167;310", "wc_review": "217;348;510", "wc_reply_reviewers": "0;48;76", "wc_reply_authors": "168;537;453", "reply_reviewers": "0;1;1", "reply_authors": "1;2;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 87.33333333333333, 14.70449666674185 ], "wc_summary_review_avg": [ 71.0, 32.53715824509981 ], "wc_main_review_avg": [ 200.0, 79.82898387594989 ], "wc_review_avg": [ 358.3333333333333, 119.83970775813648 ], "wc_reply_reviewers_avg": [ 41.333333333333336, 31.38293945583952 ], "wc_reply_authors_avg": [ 386.0, 157.91770008456936 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 6, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=14454290545063106484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=CyKHoKyvgnp", "email": "fb.com;ucsd.edu;ucsd.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Meta;University of California, San Diego", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.ucsd.edu", "aff_unique_abbr": "Meta;UCSD", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "CyKQiiCPBEv", "title": "Stepping Back to SMILES Transformers for Fast Molecular Representation Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the intersection of molecular science and deep learning, tasks like virtual screening have driven the need for a high-throughput molecular representation generator on large chemical databases. However, as SMILES strings are the most common storage format for molecules, using deep graph models to extract molecular feature from raw SMILES data requires an SMILES-to-graph conversion, which significantly decelerates the whole process. Directly deriving molecular representations from SMILES is feasible, yet there exists a large performance gap between the existing SMILES-based models and graph-based models at benchmark results. To address this issue, we propose ST-KD, an end-to-end SMILES Transformer for molecular representation learning boosted by Knowledge Distillation. In order to conduct knowledge transfer from graph Transformers to ST-KD, we have redesigned the attention layers and introduced a pre-transformation step to tokenize the SMILES strings and inject structure-based positional embeddings. 
ST-KD shows competitive results on latest standard molecular datasets PCQM4M-LSC and QM9, with $3\\text{-}14\\times$ inference speed compared with existing graph models.", "keywords": "molecular representation learning;knowledge distillation", "primary_area": "", "supplementary_material": "/attachment/a5f1eea9fca5080ff67eaa361132ab8bd3a35eaa.zip", "author": "Wenhao Zhu;Ziyao Li;Lingsheng Cai;Guojie Song", "authorids": "~Wenhao_Zhu2;~Ziyao_Li1;~Lingsheng_Cai1;~Guojie_Song1", "gender": "M;M;M;M", "homepage": ";;https://github.com/cailingsheng;http://sai.pku.edu.cn/info/1022/2212.htm", "dblp": ";230/4058;;37/2900", "google_scholar": "https://scholar.google.com/citations?hl=en;KzJYwbMAAAAJ;;https://scholar.google.com.tw/citations?user=a832IIMAAAAJ", "orcid": ";;;0000-0001-8295-2520", "linkedin": ";ziyao-li-3a4594146/;;", "or_profile": "~Wenhao_Zhu2;~Ziyao_Li1;~Lingsheng_Cai1;~Guojie_Song1", "aff": "Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;PhD student;Undergrad student;Associate Professor", "bibtex": "@misc{\nzhu2022stepping,\ntitle={Stepping Back to {SMILES} Transformers for Fast Molecular Representation Inference},\nauthor={Wenhao Zhu and Ziyao Li and Lingsheng Cai and Guojie Song},\nyear={2022},\nurl={https://openreview.net/forum?id=CyKQiiCPBEv}\n}", "github": "", "project": "", "reviewers": "PfwA;Maqb;2H4F;xrsP", "site": "https://openreview.net/forum?id=CyKQiiCPBEv", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;4;4", "correctness": "4;3;2;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;0;4", "wc_summary_paper": "33;54;96;110", "wc_summary_review": "42;13;99;71", "wc_main_review": "227;260;195;222", "wc_review": "302;327;390;403", "wc_reply_reviewers": "0;20;0;16", "wc_reply_authors": "541;625;411;136", "reply_reviewers": "0;1;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 73.25, 31.059418861272984 ], "wc_summary_review_avg": [ 56.25, 32.08874413248359 ], "wc_main_review_avg": [ 226.0, 23.097618924902193 ], "wc_review_avg": [ 355.5, 42.19300889957956 ], "wc_reply_reviewers_avg": [ 9.0, 9.1104335791443 ], "wc_reply_authors_avg": [ 428.25, 185.15584651854772 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": 0.1266600992762247, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11549184015787014851&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "CyhUPn9RDT3", "title": "IQNAS: Interpretable Integer Quadratic programming Neural Architecture Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Realistic use of neural networks often requires adhering to multiple constraints on latency, energy and memory among others. 
A popular approach to find fitting networks is through constrained Neural Architecture Search (NAS). However, previous methods use complicated predictors for the accuracy of the network. Those predictors are hard to interpret and sensitive to many hyperparameters to be tuned, hence, the resulting accuracy of the generated models is often harmed. In this work we resolve this by introducing Interpretable Integer Quadratic programming Neural Architecture Search (IQNAS), that is based on an accurate and simple quadratic formulation of both the accuracy predictor and the expected resource requirement, together with a scalable search method with theoretical guarantees. The simplicity of our proposed predictor together with the intuitive way it is constructed bring interpretability through many insights about the contribution of different design choices. For example, we find that in the examined search space, adding depth and width is more effective at deeper stages of the network and at the beginning of each resolution stage. Our experiments show that IQNAS generates comparable to or better architectures than other state-of-the-art NAS methods within a reduced search cost for each additional generated network, while strictly satisfying the resource constraints.", "keywords": "Neural Architecture Search", "primary_area": "", "supplementary_material": "", "author": "Niv Nayman;Yonathan Aflalo;Asaf Noy;Rong Jin;Lihi Zelnik-Manor", "authorids": "~Niv_Nayman1;~Yonathan_Aflalo2;~Asaf_Noy2;~Rong_Jin1;~Lihi_Zelnik-Manor1", "gender": "M;M;M;F;M", "homepage": ";;;https://lihi.net.technion.ac.il/;https://www.cse.msu.edu/~rongjin/", "dblp": "239/5936.html;52/10673;145/9804.html;z/LihiZelnikManor;j/RongJin", "google_scholar": "2tGCONsAAAAJ;;https://scholar.google.co.il/citations?user=dPZI69EAAAAJ;https://scholar.google.com.tw/citations?user=E_ejWvYAAAAJ;", "orcid": ";;;;", "linkedin": "niv-nayman-1ab29bb3/?originalSubdomain=il;;;;", "or_profile": "~Niv_Nayman1;~Yonathan_Aflalo2;~Asaf_Noy2;~Lihi_Zelnik-Manor1;~Rong_Jin3", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;;Alibaba Group;Technion - Israel Institute of Technology, Technion;Alibaba Group", "aff_domain": "campus.technion.ac.il;;alibaba-inc.com;technion.ac.il;alibaba-inc.com", "position": "PhD student;;reseaecher;Full Professor;Researcher", "bibtex": "@misc{\nnayman2022iqnas,\ntitle={{IQNAS}: Interpretable Integer Quadratic programming Neural Architecture Search},\nauthor={Niv Nayman and Yonathan Aflalo and Asaf Noy and Rong Jin and Lihi Zelnik-Manor},\nyear={2022},\nurl={https://openreview.net/forum?id=CyhUPn9RDT3}\n}", "github": "", "project": "", "reviewers": "SHia;B5Jb;82xa;7X6j;31pM", "site": "https://openreview.net/forum?id=CyhUPn9RDT3", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;4;3;4;4", "correctness": "3;3;3;3;4", "technical_novelty": "2;3;3;2;2", "empirical_novelty": "2;3;3;2;2", "wc_summary_paper": "54;33;81;67;112", "wc_summary_review": "28;48;46;89;42", "wc_main_review": "141;88;226;251;172", "wc_review": "223;169;353;407;326", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "883;471;740;1481;944", "reply_reviewers": "0;0;0;0;0", "reply_authors": "2;1;1;3;2", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 69.4, 26.522443326360413 ], 
"wc_summary_review_avg": [ 50.6, 20.431348462595412 ], "wc_main_review_avg": [ 175.6, 58.496495621532745 ], "wc_review_avg": [ 295.6, 87.09443150971249 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 903.8, 331.4678868306853 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.25000000000000006, "corr_recommendation_correctness": 0.2500000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4JjrILkq-aQJ:scholar.google.com/&scioq=IQNAS:+Interpretable+Integer+Quadratic+programming+Neural+Architecture+Search&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Technion - Israel Institute of Technology;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.alibaba.com", "aff_unique_abbr": "Technion;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Israel;China" }, { "title": "Score-Based Generative Modeling with Critically-Damped Langevin Diffusion", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6687", "id": "CzceR82CYc", "poster": "", "openreview": "https://openreview.net/forum?id=CzceR82CYc", "slides": "https://iclr.cc/virtual/2022/poster/6687", "video": "https://iclr.cc/virtual/2022/poster/6687", "author_site": "Tim Dockhorn, Arash Vahdat, Karsten Kreis", "tldr": "", "abstract": "Score-based generative models (SGMs) have demonstrated remarkable synthesis quality. SGMs rely on a diffusion process that gradually perturbs the data towards a tractable distribution, while the generative model learns to denoise. The complexity of this denoising task is, apart from the data distribution itself, uniquely determined by the diffusion process. We argue that current SGMs employ overly simplistic diffusions, leading to unnecessarily complex denoising processes, which limit generative modeling performance. Based on connections to statistical mechanics, we propose a novel critically-damped Langevin diffusion (CLD) and show that CLD-based SGMs achieve superior performance. CLD can be interpreted as running a joint diffusion in an extended space, where the auxiliary variables can be considered \"velocities\" that are coupled to the data variables as in Hamiltonian dynamics. We derive a novel score matching objective for CLD and show that the model only needs to learn the score function of the conditional distribution of the velocity given data, an easier task than learning scores of the data directly. We also derive a new sampling scheme for efficient synthesis from CLD-based diffusion models. We find that CLD outperforms previous SGMs in synthesis quality for similar network architectures and sampling compute budgets. We show that our novel sampler for CLD significantly outperforms solvers such as Euler\u2013Maruyama. Our framework provides new insights into score-based denoising diffusion models and can be readily used for high-resolution image synthesis. 
Project page and code: https://nv-tlabs.github.io/CLD-SGM.", "keywords": "Score-based generative modeling;denoising diffusion models;image synthesis", "primary_area": "", "supplementary_material": "", "author": "Tim Dockhorn;Arash Vahdat;Karsten Kreis", "authorids": "~Tim_Dockhorn1;~Arash_Vahdat3;~Karsten_Kreis1", "gender": ";M;", "homepage": "https://timudk.github.io/;http://latentspace.cc/;https://karstenkreis.github.io/", "dblp": "239/4951;92/8108;238/6834", "google_scholar": "EtPn_v4AAAAJ;https://scholar.google.ca/citations?user=p9-nlRIAAAAJ;https://scholar.google.de/citations?user=rFd-DiAAAAAJ", "orcid": ";;", "linkedin": ";;karstenkreis", "or_profile": "~Tim_Dockhorn1;~Arash_Vahdat3;~Karsten_Kreis1", "aff": "University of Waterloo;NVIDIA;NVIDIA", "aff_domain": "uwaterloo.ca;nvidia.com;nvidia.com", "position": "PhD student;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\ndockhorn2022scorebased,\ntitle={Score-Based Generative Modeling with Critically-Damped Langevin Diffusion},\nauthor={Tim Dockhorn and Arash Vahdat and Karsten Kreis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=CzceR82CYc}\n}", "github": "", "project": "", "reviewers": "F5kX;95SQ;tK3A;AJW8", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "3;4;4;5", "correctness": "4;3;3;4", "technical_novelty": "4;4;4;4", "empirical_novelty": "4;3;3;4", "wc_summary_paper": "138;102;534;186", "wc_summary_review": "45;60;111;76", "wc_main_review": "108;608;913;354", "wc_review": "291;770;1558;616", "wc_reply_reviewers": "0;0;167;0", "wc_reply_authors": "260;1177;1890;175", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;4;1", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 4.0, 0.0 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 240.0, 172.33687939614086 ], "wc_summary_review_avg": [ 73.0, 24.525496936861444 ], "wc_main_review_avg": [ 495.75, 298.80627085119886 ], "wc_review_avg": [ 808.75, 465.85801216679744 ], "wc_reply_reviewers_avg": [ 41.75, 72.31312121600062 ], "wc_reply_authors_avg": [ 875.5, 705.2753008577572 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 268, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1032753694243444141&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=CzceR82CYc", "email": "uwaterloo.ca;nvidia.com;nvidia.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Waterloo;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://uwaterloo.ca;https://www.nvidia.com", "aff_unique_abbr": "UW;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Generating Videos with Dynamics-aware Implicit Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5954", "id": "Czsdv-S4-w9", "poster": "", "openreview": "https://openreview.net/forum?id=Czsdv-S4-w9", "slides": "https://iclr.cc/virtual/2022/poster/5954", "video": "https://iclr.cc/virtual/2022/poster/5954", "author_site": "Sihyun Yu, 
Jihoon Tack, Sangwoo Mo, Hyunsu Kim, Junho Kim, Jung-Woo Ha, Jinwoo Shin", "tldr": "", "abstract": "In the deep learning era, generating long videos of high quality still remains challenging due to the spatio-temporal complexity and continuity of videos. Existing prior works have attempted to model the video distribution by representing videos as 3D grids of RGB values, which impedes the scale of generated videos and neglects continuous dynamics. In this paper, we find that the recently emerging paradigm of implicit neural representations (INRs), which encodes a continuous signal into a parameterized neural network, effectively mitigates the issue. By utilizing INRs of video, we propose the dynamics-aware implicit generative adversarial network (DIGAN), a novel generative adversarial network for video generation. Specifically, we introduce (a) an INR-based video generator that improves the motion dynamics by manipulating the space and time coordinates differently and (b) a motion discriminator that efficiently identifies the unnatural motions without observing the entire long frame sequences. We demonstrate the superiority of DIGAN under various datasets, along with multiple intriguing properties, e.g., long video synthesis, video extrapolation, and non-autoregressive video generation. For example, DIGAN improves the previous state-of-the-art FVD score on UCF-101 by 30.7% and can be trained on 128-frame videos of 128x128 resolution, 80 frames longer than the 48 frames of the previous state-of-the-art method.", "keywords": "video generation;implicit neural representations;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Sihyun Yu;Jihoon Tack;Sangwoo Mo;Hyunsu Kim;Junho Kim;Jung-Woo Ha;Jinwoo Shin", "authorids": "~Sihyun_Yu2;~Jihoon_Tack1;~Sangwoo_Mo1;~Hyunsu_Kim1;~Junho_Kim3;~Jung-Woo_Ha1;~Jinwoo_Shin1", "gender": "M;M;M;M;M;M;M", "homepage": "https://sihyun-yu.github.io;https://jihoontack.github.io;https://sites.google.com/view/sangwoomo;https://github.com/blandocs;http://bit.ly/jhkim_resume;https://aidljwha.wordpress.com/;https://sites.google.com/site/mijirim/", "dblp": "287/4627;267/5487;198/0432;239/8447;;66/867-1;31/7062", "google_scholar": "https://scholar.google.com/citations?hl=en;eW8-OT4AAAAJ;https://scholar.google.co.kr/citations?user=Sq9y3NMAAAAJ;VY5PodkAAAAJ;WtjDugkAAAAJ;https://scholar.google.co.kr/citations?user=eGj3ay4AAAAJ;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ", "orcid": ";;;;0000-0003-3712-8510;0000-0002-7400-7681;", "linkedin": ";;;blandocs/;taki0112/;jung-woo-ha-b2782862?trk=hp-identity-name;", "or_profile": "~Sihyun_Yu2;~Jihoon_Tack1;~Sangwoo_Mo1;~Hyunsu_Kim1;~Junho_Kim3;~Jung-Woo_Ha1;~Jinwoo_Shin1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST;NAVER;NAVER;NAVER AI Lab;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;navercorp.com;navercorp.com;navercorp.com;kaist.ac.kr", "position": "PhD student;PhD student;PhD student;Researcher;Research Scientist;Head (Executive Director);Associate Professor", "bibtex": "@inproceedings{\nyu2022generating,\ntitle={Generating Videos with Dynamics-aware Implicit Generative Adversarial Networks},\nauthor={Sihyun Yu and Jihoon Tack and Sangwoo Mo and Hyunsu Kim and Junho Kim and Jung-Woo Ha and Jinwoo Shin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Czsdv-S4-w9}\n}", "github": "", "project":
"", "reviewers": "3svg;ErkP;Tj68;vhpn", "pdf_size": 0, "recommendation": "5;6;8;10", "confidence": "4;4;4;5", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "53;119;76;96", "wc_summary_review": "178;15;93;116", "wc_main_review": "455;513;413;446", "wc_review": "686;647;582;658", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1844;1104;1101;964", "reply_reviewers": "0;0;0;0", "reply_authors": "5;3;3;4", "recommendation_avg": [ 7.25, 1.920286436967152 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.0, 24.38237068047322 ], "wc_summary_review_avg": [ 100.5, 58.337380811963094 ], "wc_main_review_avg": [ 456.75, 36.044243645830605 ], "wc_review_avg": [ 643.25, 38.114137796885814 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1253.25, 345.7263187840926 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.8268106308031117, "corr_recommendation_correctness": 0.676481425202546, "gs_citation": 234, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5205772214796442228&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=Czsdv-S4-w9", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;navercorp.com;navercorp.com;navercorp.com;kaist.ac.kr", "author_num": 7, "aff_unique_index": "0;0;0;1;1;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;NAVER Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.naver.com", "aff_unique_abbr": "KAIST;NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "D1TYemnoRN", "title": "Short optimization paths lead to good generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimization and generalization are two essential aspects of machine learning. In this paper, we propose a framework to connect optimization with generalization by analyzing the generalization error based on the length of optimization trajectory under the gradient flow algorithm after convergence. Through our approach, we show that, with a proper initialization, gradient flow converges following a short path with an explicit length estimate. Such an estimate induces a length-based generalization bound, showing that short optimization paths after convergence indicate good generalization. Our framework can be applied to broad settings. 
For example, we use it to obtain generalization estimates on three distinct machine learning models: underdetermined $\\ell_p$ linear regression, kernel regression, and overparameterized two-layer ReLU neural networks.", "keywords": "optimization;generalization;machine learning theory", "primary_area": "", "supplementary_material": "/attachment/93d9c0dec480d42f91653624e26c432b388f42e5.zip", "author": "Fusheng Liu;Haizhao Yang;Qianxiao Li", "authorids": "~Fusheng_Liu1;~Haizhao_Yang1;~Qianxiao_Li1", "gender": ";M;M", "homepage": "https://mathematicallfs.github.io;https://haizhaoyang.github.io;https://blog.nus.edu.sg/qianxiaoli/", "dblp": ";139/1215;172/0930.html", "google_scholar": ";p4mxTIwAAAAJ;https://scholar.google.com.sg/citations?user=zLgReYoAAAAJ", "orcid": ";;0000-0002-3903-3737", "linkedin": ";;", "or_profile": "~Fusheng_Liu1;~Haizhao_Yang1;~Qianxiao_Li1", "aff": "National University of Singapore;Purdue University;National University of Singapore", "aff_domain": "u.nus.edu;purdue.edu;nus.edu.sg", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nliu2022short,\ntitle={Short optimization paths lead to good generalization},\nauthor={Fusheng Liu and Haizhao Yang and Qianxiao Li},\nyear={2022},\nurl={https://openreview.net/forum?id=D1TYemnoRN}\n}", "github": "", "project": "", "reviewers": "shcV;8Scj;xkEt;ZB5L", "site": "https://openreview.net/forum?id=D1TYemnoRN", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;3", "correctness": "3;4;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "72;117;204;64", "wc_summary_review": "28;55;46;46", "wc_main_review": "271;552;265;305", "wc_review": "371;724;515;415", "wc_reply_reviewers": "153;0;33;46", "wc_reply_authors": "1031;1011;414;665", "reply_reviewers": "1;0;1;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 114.25, 55.616431924387236 ], "wc_summary_review_avg": [ 43.75, 9.807522622966516 ], "wc_main_review_avg": [ 348.25, 118.61992876409933 ], "wc_review_avg": [ 506.25, 136.11644830805716 ], "wc_reply_reviewers_avg": [ 58.0, 57.35416288291548 ], "wc_reply_authors_avg": [ 780.25, 256.68207475396486 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wkhD7VrCGxwJ:scholar.google.com/&scioq=Short+optimization+paths+lead+to+good+generalization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;Purdue University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.purdue.edu", "aff_unique_abbr": "NUS;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;United States" }, { "id": "D1hTwPPmMVv", "title": "Enhanced countering adversarial attacks via input denoising and feature restoring", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite the fact that deep neural networks (DNNs) have achieved prominent performance in various applications, it is well known that DNNs are vulnerable to adversarial examples/samples 
(AEs) with imperceptible perturbations in clean/original samples. Existing defense methods against adversarial attacks damage the information in the original samples, which decreases the accuracy of the target classifier. To overcome this weakness, this paper presents an enhanced method for countering adversarial attacks, IDFR (via Input Denoising and Feature Restoring). The proposed IDFR consists of an enhanced input denoiser (ID) and a hidden lossy feature restorer (FR) based on convex hull optimization. Extensive experiments conducted on benchmark datasets show that the proposed IDFR outperforms various state-of-the-art defense methods and is highly effective at protecting target models against various black-box or white-box adversarial attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanni Li;Zhang Wenhui;Jiawei Liu;Xiaoli Kou;Hui Li;Jiangtao Cui", "authorids": "~Yanni_Li2;~Zhang_Wenhui1;~Jiawei_Liu4;~Xiaoli_Kou1;~Hui_Li10;~Jiangtao_Cui1", "gender": "F;M;M;;M;M", "homepage": ";;;https://nbefe.com/facultyxidiannbefecom/KXL1/zh_CN/jxcg/347894/list/index.htm;https://lihuixidian.github.io;https://web.xidian.edu.cn/cuijt/", "dblp": ";;;;l/HuiLi5;00/1740.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;Tm8B3BIAAAAJ;;;u9IRSacAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Yanni_Li2;~Zhang_Wenhui1;~Jiawei_Liu4;~Xiaoli_Kou1;~Hui_Li10;~Jiangtao_Cui1", "aff": "Xidian University;;Xidian University;Xidian University;Xidian University;Xidian University", "aff_domain": "xidian.edu;;xidian.edu;xidian.edu;xidian.edu.cn;xidian.edu.cn", "position": "Full Professor;;MS student;Associate Professor;Full Professor;Full Professor", "bibtex": "@misc{\nli2022enhanced,\ntitle={Enhanced countering adversarial attacks via input denoising and feature restoring},\nauthor={Yanni Li and Zhang Wenhui and Jiawei Liu and Xiaoli Kou and Hui Li and Jiangtao Cui},\nyear={2022},\nurl={https://openreview.net/forum?id=D1hTwPPmMVv}\n}", "github": "", "project": "", "reviewers": "hZ2y;QPMx;3U7M;JSpM", "site": "https://openreview.net/forum?id=D1hTwPPmMVv", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "69;22;49;59", "wc_summary_review": "30;32;26;28", "wc_main_review": "487;548;318;242", "wc_review": "586;602;393;329", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 49.75, 17.512495538900218 ], "wc_summary_review_avg": [ 29.0, 2.23606797749979 ], "wc_main_review_avg": [ 398.75, 123.64743224183833 ], "wc_review_avg": [ 477.5, 118.81182601071326 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7900521910416401278&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Xidian University", 
"aff_unique_dep": "", "aff_unique_url": "http://www.xidian.edu.cn/", "aff_unique_abbr": "Xidian", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "D637S6zBRLD", "title": "Learning Symmetric Representations for Equivariant World Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Encoding known symmetries into world models can improve generalization. However, identifying how latent symmetries manifest in the input space can be difficult. As an example, rotations of objects are equivariant with respect to their orientation, but extracting this orientation from an image is difficult in absence of supervision. In this paper, we use equivariant transition models as an inductive bias to learn symmetric latent representations in a self-supervised manner. This allows us to train non-equivariant networks to encode input data, for which the underlying symmetry may be non-obvious, into a latent space where symmetries may be used to reason about outcomes of actions in a data-efficient manner. Our method is agnostic to the type of latent symmetry; we demonstrate its usefulness over $C_4 \\times S_5$ using $G$-convolutions and GNNs, over $D_4 \\ltimes (\\mathbb{R}^2,+)$ using $E(2)$-steerable CNNs, and over $\\mathrm{SO}(3)$ using tensor field networks. In all three cases, we demonstrate improvements relative to both fully-equivariant and non-equivariant baselines. ", "keywords": "equivariant;symmetry;contrastive loss;world models;transition;representation theory;generalization", "primary_area": "", "supplementary_material": "", "author": "Jung Yeon Park;Ondrej Biza;Linfeng Zhao;Jan-Willem van de Meent;Robin Walters", "authorids": "~Jung_Yeon_Park1;~Ondrej_Biza1;~Linfeng_Zhao1;~Jan-Willem_van_de_Meent1;~Robin_Walters1", "gender": "M;M;;M;M", "homepage": ";https://sites.google.com/view/obiza;http://lfzhao.com;https://jwvdm.github.io/;http://www.robinwalters.com", "dblp": "240/2704;230/8616.html;221/4652;137/3263;258/3416", "google_scholar": "LZSRm9sAAAAJ;Gi9Xq8YAAAAJ;;CX9Lu38AAAAJ;fnprJmUAAAAJ", "orcid": ";0000-0003-3390-8050;;0000-0001-9465-5398;", "linkedin": ";ond%C5%99ej-b%C3%AD%C5%BEa-a9405353/;;;", "or_profile": "~Jung_Yeon_Park1;~Ondrej_Biza1;~Linfeng_Zhao1;~Jan-Willem_van_de_Meent1;~Robin_Walters1", "aff": "Northeastern University;Google Brain;Northeastern University;Northeastern University;Northeastern University ", "aff_domain": "northeastern.edu;google.com;northeastern.edu;northeastern.edu;northeastern.edu", "position": "PhD student;Intern;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\npark2022learning,\ntitle={Learning Symmetric Representations for Equivariant World Models},\nauthor={Jung Yeon Park and Ondrej Biza and Linfeng Zhao and Jan-Willem van de Meent and Robin Walters},\nyear={2022},\nurl={https://openreview.net/forum?id=D637S6zBRLD}\n}", "github": "", "project": "", "reviewers": "LG1G;Uu3z;5ro3;hasv", "site": "https://openreview.net/forum?id=D637S6zBRLD", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;3;2", "correctness": "2;3;3;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "124;110;102;48", "wc_summary_review": "120;2;82;11", "wc_main_review": "341;416;281;197", "wc_review": "585;528;465;256", "wc_reply_reviewers": "201;688;0;0", "wc_reply_authors": "693;1953;437;539", "reply_reviewers": "1;3;0;0", "reply_authors": "1;4;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 
0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 96.0, 28.809720581775867 ], "wc_summary_review_avg": [ 53.75, 49.225882419719 ], "wc_main_review_avg": [ 308.75, 80.31305933657364 ], "wc_review_avg": [ 458.5, 124.37945971903882 ], "wc_reply_reviewers_avg": [ 222.25, 281.14264617805674 ], "wc_reply_authors_avg": [ 905.5, 611.6017903832526 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6567940547407534925&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Northeastern University;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": "https://www.northeastern.edu;https://brain.google.com", "aff_unique_abbr": "NEU;Google Brain", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Improving Adversarial Transferability of Vision Transformers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6341", "id": "D6nH3719vZy", "poster": "", "openreview": "https://openreview.net/forum?id=D6nH3719vZy", "slides": "https://iclr.cc/virtual/2022/poster/6341", "video": "https://iclr.cc/virtual/2022/poster/6341", "author_site": "Muzammal Naseer, Kanchana Ranasinghe, Salman Khan, Fahad Khan, Fatih Porikli", "tldr": "", "abstract": "Vision transformers (ViTs) process input images as sequences of patches via self-attention; a radically different architecture from convolutional neural networks (CNNs). This makes it interesting to study the adversarial feature space of ViT models and their transferability. In particular, we observe that adversarial patterns found via conventional adversarial attacks show very \\emph{low} black-box transferability even for large ViT models. We show that this phenomenon is only due to the sub-optimal attack procedures that do not leverage the true representation potential of ViTs. A deep ViT is composed of multiple blocks, with a consistent architecture comprising self-attention and feed-forward layers, where each block is capable of independently producing a class token. Formulating an attack using only the last class token (conventional approach) does not directly leverage the discriminative information stored in the earlier tokens, leading to poor adversarial transferability of ViTs. Using the compositional nature of ViT models, we enhance the transferability of existing attacks by introducing two novel strategies specific to the architecture of ViT models. \\emph{(i) Self-Ensemble:} We propose a method to find multiple discriminative pathways by dissecting a single ViT model into an ensemble of networks. This allows explicitly utilizing class-specific information at each ViT block. \\emph{(ii) Token Refinement:} We then propose to refine the tokens to further enhance the discriminative capacity at each block of the ViT. Our token refinement systematically combines the class tokens with structural information preserved within the patch tokens. 
An adversarial attack when applied to such refined tokens within the ensemble of classifiers found in a single vision transformer has significantly higher transferability and thereby brings out the true generalization potential of the ViT's adversarial space. Code: https://t.ly/hBbW.", "keywords": "Vision Transformers;Adversarial Perturbations", "primary_area": "", "supplementary_material": "", "author": "Muzammal Naseer;Kanchana Ranasinghe;Salman Khan;Fahad Khan;Fatih Porikli", "authorids": "~Muzammal_Naseer1;~Kanchana_Ranasinghe1;~Salman_Khan4;~Fahad_Khan1;~Fatih_Porikli2", "gender": "M;M;M;M;M", "homepage": "https://muzammal-naseer.com/;https://salman-h-khan.github.io/;https://sites.google.com/view/fahadkhans/home;https://www.porikli.com;http://kahnchana.github.io/", "dblp": ";32/11535-1;05/8618;p/FatihMuratPorikli;211/4048", "google_scholar": "https://scholar.google.ch/citations?user=tM9xKA8AAAAJ;https://scholar.google.es/citations?user=M59O9lkAAAAJ;zvaeYnUAAAAJ;https://scholar.google.com.tw/citations?user=VpB8NZ8AAAAJ;K2WBZTwAAAAJ", "orcid": "0000-0001-7663-7161;0000-0002-9502-1749;;0000-0002-1520-4466;0000-0003-2374-7804", "linkedin": "muzammalnaseer/;;;fatih-porikli-a95643/;", "or_profile": "~Muzammal_Naseer1;~Salman_Khan4;~Fahad_Khan1;~Fatih_Porikli2;~Kanchana_Nisal_Ranasinghe1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Link\u00f6ping University;QualComm;Apple", "aff_domain": "mbzuai.ac.ae;anu.edu.au;liu.se;qualcomm.com;apple.com", "position": "Researcher;Lecturer;Associate Professor;Senior Director;Intern", "bibtex": "@inproceedings{\nnaseer2022on,\ntitle={On Improving Adversarial Transferability of Vision Transformers },\nauthor={Muzammal Naseer and Kanchana Ranasinghe and Salman Khan and Fahad Khan and Fatih Porikli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=D6nH3719vZy}\n}", "github": "", "project": "", "reviewers": "aX8L;HKsh;NMLw;U2B6", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;3;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "210;84;44;78", "wc_summary_review": "48;53;57;37", "wc_main_review": "584;178;117;177", "wc_review": "842;315;218;292", "wc_reply_reviewers": "0;22;0;24", "wc_reply_authors": "509;502;605;381", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 104.0, 63.071388124885914 ], "wc_summary_review_avg": [ 48.75, 7.495832175282475 ], "wc_main_review_avg": [ 264.0, 186.39608364984497 ], "wc_review_avg": [ 416.75, 248.12030851987913 ], "wc_reply_reviewers_avg": [ 11.5, 11.521718621802913 ], "wc_reply_authors_avg": [ 499.25, 79.48073665989766 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9927551479269048485&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=D6nH3719vZy", "email": "mbzuai.ac.ae;anu.edu.au;liu.se;qualcomm.com;apple.com", "author_num": 5, 
"aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Link\u00f6ping University;Qualcomm Incorporated;Apple", "aff_unique_dep": ";;;;Apple Inc.", "aff_unique_url": "https://mbzuai.ac.ae;https://www.anu.edu.au;https://www.liu.se;https://www.qualcomm.com;https://www.apple.com", "aff_unique_abbr": "MBZUAI;ANU;LiU;Qualcomm;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3;3", "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States" }, { "title": "How Do Vision Transformers Work?", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6017", "id": "D78Go4hVcxO", "poster": "", "openreview": "https://openreview.net/forum?id=D78Go4hVcxO", "slides": "https://iclr.cc/virtual/2022/poster/6017", "video": "https://iclr.cc/virtual/2022/poster/6017", "author_site": "Namuk Park, Songkuk Kim", "tldr": "", "abstract": "The success of multi-head self-attentions (MSAs) for computer vision is now indisputable. However, little is known about how MSAs work. We present fundamental explanations to help better understand the nature of MSAs. In particular, we demonstrate the following properties of MSAs and Vision Transformers (ViTs): (1) MSAs improve not only accuracy but also generalization by flattening the loss landscapes. Such improvement is primarily attributable to their data specificity, not long-range dependency. On the other hand, ViTs suffer from non-convex losses. Large datasets and loss landscape smoothing methods alleviate this problem; (2) MSAs and Convs exhibit opposite behaviors. For example, MSAs are low-pass filters, but Convs are high-pass filters. Therefore, MSAs and Convs are complementary; (3) Multi-stage neural networks behave like a series connection of small individual models. In addition, MSAs at the end of a stage play a key role in prediction. Based on these insights, we propose AlterNet, a model in which Conv blocks at the end of a stage are replaced with MSA blocks. AlterNet outperforms CNNs not only in large data regimes but also in small data regimes. 
The code is available at https://github.com/xxxnell/how-do-vits-work.", "keywords": "vision transformer;self-attention;multi-head self-attention;loss landscape", "primary_area": "", "supplementary_material": "/attachment/c639dac30096d33d22a89d8688679b49c5737d4e.zip", "author": "Namuk Park;Songkuk Kim", "authorids": "~Namuk_Park1;~Songkuk_Kim1", "gender": ";M", "homepage": "http://namukpark.com/;", "dblp": "244/9940;78/2018", "google_scholar": "c2vdTRAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Namuk_Park1;~Songkuk_Kim1", "aff": "Yonsei University;Yonsei University", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\npark2022how,\ntitle={How Do Vision Transformers Work?},\nauthor={Namuk Park and Songkuk Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=D78Go4hVcxO}\n}", "github": "", "project": "", "reviewers": "GbX6;aDUi;DmHc;Kvf7", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;3;4;3", "correctness": "2;4;4;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;4;4;4", "wc_summary_paper": "58;84;65;117", "wc_summary_review": "127;37;56;150", "wc_main_review": "287;233;148;269", "wc_review": "472;354;269;536", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "772;720;122;67", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 81.0, 22.85825890132492 ], "wc_summary_review_avg": [ 92.5, 47.193749586147526 ], "wc_main_review_avg": [ 234.25, 53.45734280713923 ], "wc_review_avg": [ 407.75, 103.34015434476571 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 420.25, 326.8473458665375 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 687, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8029612233773990665&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=D78Go4hVcxO", "email": "yonsei.ac.kr;yonsei.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "D7hX1d3ov2c", "title": "Ensembles and Encoders for Task-Free Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present an architecture that is effective for continual learning in an especially demanding setting, where task boundaries do not exist or are unknown, and where classes have to be learned online (with each presented only once). To obtain good performance under these constraints, while mitigating catastrophic forgetting, we exploit recent advances in contrastive, self-supervised learning, allowing us to use a pre-trained, general purpose image encoder whose weights can be frozen, which precludes forgetting. 
The pre-trained encoder also greatly simplifies the downstream task of classification, which we solve with an ensemble of very simple classifiers. Collectively, the ensemble exhibits much better performance than any individual classifier, an effect which is amplified through specialisation and competitive selection. We assess the performance of the encoders-and-ensembles architecture on standard continual learning benchmarks, where it outperforms the prior state of the art by a large margin on the hardest problems, as well as in less familiar settings where the data distribution changes gradually or the classes are presented one at a time.", "keywords": "continual learning;task-free continual learning;self-supervised learning;pre-training;ensemble methods", "primary_area": "", "supplementary_material": "", "author": "Murray Shanahan;Christos Kaplanis;Jovana Mitrovi\u0107", "authorids": "~Murray_Shanahan1;kaplanis@deepmind.com;mitrovic@deepmind.com", "gender": "M;;", "homepage": "https://www.doc.ic.ac.uk/~mpsha/;;", "dblp": "11/5268;;", "google_scholar": "https://scholar.google.co.uk/citations?user=00bnGpAAAAAJ;;", "orcid": "0000-0001-5984-2964;;", "linkedin": ";;", "or_profile": "~Murray_Shanahan1;kaplanis@deepmind.com;mitrovic@deepmind.com", "aff": "Imperial College London;;", "aff_domain": ";;", "position": "Full Professor;;", "bibtex": "@misc{\nshanahan2022ensembles,\ntitle={Ensembles and Encoders for Task-Free Continual Learning},\nauthor={Murray Shanahan and Christos Kaplanis and Jovana Mitrovi{\\'c}},\nyear={2022},\nurl={https://openreview.net/forum?id=D7hX1d3ov2c}\n}", "github": "", "project": "", "reviewers": "EtAk;fzpG;DzWF;z27W", "site": "https://openreview.net/forum?id=D7hX1d3ov2c", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;4;5", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "40;182;60;134", "wc_summary_review": "59;51;99;5", "wc_main_review": "235;488;498;541", "wc_review": "334;721;657;680", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 104.0, 57.043842787806646 ], "wc_summary_review_avg": [ 53.5, 33.3878720495931 ], "wc_main_review_avg": [ 440.5, 120.3048211835253 ], "wc_review_avg": [ 598.0, 154.13468136665415 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3317207659025807808&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "D8njK_Ix5dJ", "title": "Maximum Mean Discrepancy for Generalization in the Presence of Distribution and Missingness Shift", "track": "main", "status": "Reject", "tldr": "", "abstract": "Covariate shifts are a common problem in real-world predictive modeling. 
This paper proposes addressing the covariate shift problem by minimizing Maximum Mean Discrepancy (MMD) statistics between the training and test sets in either the feature input space, the feature representation space, or both. We designed three techniques that we call MMD Representation, MMD Mask, and MMD Hybrid to deal with the scenarios where only a distribution shift exists, only a missingness shift exists, or both types of shift exist, respectively. We find that integrating an MMD loss component helps models use the best features for generalization and avoid dangerous extrapolation as much as possible for each test sample. Models treated with this MMD approach show better performance, calibration, and extrapolation on the test set.", "keywords": "maximum mean discrepancy;data shift;covariate shift;representation learning;missing data", "primary_area": "", "supplementary_material": "", "author": "Liwen Ouyang;Aaron Key", "authorids": "~Liwen_Ouyang1;~Aaron_Key1", "gender": ";M", "homepage": "https://www.linkedin.com/in/liwenouyang/;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";aaron-key-b982a080", "or_profile": "~Liwen_Ouyang1;~Aaron_Key1", "aff": "Bloomberg L.P.;", "aff_domain": "bloomberg.net;", "position": "Quant Researcher;", "bibtex": "@misc{\nouyang2022maximum,\ntitle={Maximum Mean Discrepancy for Generalization in the Presence of Distribution and Missingness Shift},\nauthor={Liwen Ouyang and Aaron Key},\nyear={2022},\nurl={https://openreview.net/forum?id=D8njK_Ix5dJ}\n}", "github": "", "project": "", "reviewers": "jTXA;UfTS;x6L8;4NFR", "site": "https://openreview.net/forum?id=D8njK_Ix5dJ", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;5;4;2", "correctness": "3;4;2;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "366;61;44;61", "wc_summary_review": "78;42;33;57", "wc_main_review": "658;137;163;227", "wc_review": "1102;240;240;345", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 133.0, 134.70152189192223 ], "wc_summary_review_avg": [ 52.5, 17.03672503740082 ], "wc_main_review_avg": [ 296.25, 211.4088160413373 ], "wc_review_avg": [ 481.75, 360.65799242495655 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13382418150538314222&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Bloomberg", "aff_unique_dep": "", "aff_unique_url": "https://www.bloomberg.com", "aff_unique_abbr": "Bloomberg", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "D8pn0BlHaGe", "title": "Single-Cell Capsule Attention : an interpretable method of cell type classification for single-cell RNA-sequencing data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Single-cell RNA-sequencing can obtain the expression level of every gene in each cell. 
Cell type classification (also known as cell type annotation) on single-cell RNA-seq data helps to explore cellular heterogeneity and diversity. Previous methods for cell type classification are based either on statistical hypotheses of gene expression or on deep neural networks. However, the hypotheses may not reflect the true expression level. Deep neural networks lack interpretation of their results. Here we present an interpretable neural-network-based method, single-cell capsule attention (scCA), which assigns cells to different cell types based on their different feature patterns. In our model, we first generate capsules which extract different features of the cells. Then we obtain compound features which combine a set of features\u2019 information through an LSTM model. Finally, we train attention weights and apply them to the compound features. scCA provides a strong interpretation of the cell type classification results. Cells from the same cell type share a similar pattern of capsule relationships and a similar distribution of attention weights over compound features. Compared with previous methods for cell type classification on nine datasets, scCA shows high accuracy on all datasets, with robustness and reliable interpretation.", "keywords": "Single-Cell RNA-sequencing;cell type classification;capsule network;attention;interpretable model", "primary_area": "", "supplementary_material": "", "author": "Tianxu Wang;Xiuli Ma", "authorids": "~Tianxu_Wang2;~Xiuli_Ma1", "gender": "F;M", "homepage": "http://sai.pku.edu.cn/info/1362/2239.htm;", "dblp": ";", "google_scholar": ";", "orcid": ";0000-0002-6642-7838", "linkedin": ";", "or_profile": "~Xiuli_Ma1;~tianxu_Wang1", "aff": "Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "position": "Assistant Professor;MS student", "bibtex": "@misc{\nwang2022singlecell,\ntitle={Single-Cell Capsule Attention : an interpretable method of cell type classification for single-cell {RNA}-sequencing data},\nauthor={Tianxu Wang and Xiuli Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=D8pn0BlHaGe}\n}", "github": "", "project": "", "reviewers": "mrct;YcRA;khq7;HMhu", "site": "https://openreview.net/forum?id=D8pn0BlHaGe", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "3;4;4;5", "correctness": "4;2;3;2", "technical_novelty": "2;1;2;1", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "38;48;91;75", "wc_summary_review": "68;37;66;64", "wc_main_review": "230;185;785;414", "wc_review": "336;270;942;553", "wc_reply_reviewers": "0;0;235;0", "wc_reply_authors": "557;477;851;780", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 63.0, 21.083168642308014 ], "wc_summary_review_avg": [ 58.75, 12.636751956100111 ], "wc_main_review_avg": [ 403.5, 236.37734663034018 ], "wc_review_avg": [ 525.25, 262.40176733398727 ], "wc_reply_reviewers_avg": [ 58.75, 101.75798494467153 ], "wc_reply_authors_avg": [ 666.25, 153.96651421656594 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:ub4ScFBpw-QJ:scholar.google.com/&scioq=Single-Cell+Capsule+Attention+:+an+interpretable+method+of+cell+type+classification+for+single-cell+RNA-sequencing+data&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "D9E8MKsfhw", "title": "An Empirical Investigation of the Role of Pre-training in Lifelong Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The lifelong learning paradigm in machine learning is an attractive alternative to the more prominent isolated learning scheme not only due to its resemblance to biological learning, but also its potential to reduce energy waste by obviating excessive model re-training. A key challenge to this paradigm is the phenomenon of catastrophic forgetting. With the increasing popularity and success of pre-trained models in machine learning, we pose the question: What role does pre-training play in lifelong learning, specifically with respect to catastrophic forgetting? We investigate existing methods in the context of large, pre-trained models and evaluate their performance on a variety of text and image classification tasks, including a large-scale study using a novel dataset of 15 diverse NLP tasks. Across all settings, we observe that generic pre-training implicitly alleviates the effects of catastrophic forgetting when learning multiple tasks sequentially compared to randomly initialized models. We then further investigate why pre-training alleviates forgetting in this setting. We study this phenomenon by analyzing the loss landscape, finding that pre-trained weights appear to ease forgetting by leading to wider minima. Based on this insight, we propose jointly optimizing for current task loss and loss basin sharpness in order to explicitly encourage wider basins during sequential fine-tuning. 
We show that this optimization approach leads to performance comparable to the state-of-the-art in task-sequential continual learning across multiple settings, without retaining a memory that scales in size with the number of tasks.", "keywords": "Lifelong Learning;Continual Learning;Catastrophic Forgetting;Pre-training", "primary_area": "", "supplementary_material": "/attachment/8869e4ce52b68e4e377651de6aceb60f521bcc46.zip", "author": "Sanket Vaibhav Mehta;Darshan Patil;Sarath Chandar;Emma Strubell", "authorids": "~Sanket_Vaibhav_Mehta2;~Darshan_Patil1;~Sarath_Chandar1;~Emma_Strubell1", "gender": "M;M;M;Non-Binary", "homepage": "https://sanketvmehta.github.io;http://www.darshanpatil.com/;http://sarathchandar.in/;http://strubell.github.io", "dblp": "225/7804;211/8734;45/8542;153/2253", "google_scholar": "H4pn-ogAAAAJ;https://scholar.google.ca/citations?user=X3HJD0AAAAAJ;https://scholar.google.co.in/citations?user=yxWtZLAAAAAJ;UCDMtM0AAAAJ", "orcid": "0000-0003-1809-4685;;;", "linkedin": "sanketvmehta/;;;", "or_profile": "~Sanket_Vaibhav_Mehta2;~Darshan_Patil1;~Sarath_Chandar1;~Emma_Strubell1", "aff": "Carnegie Mellon University;Universit\u00e9 de Montr\u00e9al;\u00c9cole Polytechnique de Montr\u00e9al;Google", "aff_domain": "cmu.edu;umontreal.ca;polymtl.ca;google.com", "position": "PhD student;PhD student;Assistant Professor;Research Scientist", "bibtex": "@misc{\nmehta2022an,\ntitle={An Empirical Investigation of the Role of Pre-training in Lifelong Learning},\nauthor={Sanket Vaibhav Mehta and Darshan Patil and Sarath Chandar and Emma Strubell},\nyear={2022},\nurl={https://openreview.net/forum?id=D9E8MKsfhw}\n}", "github": "", "project": "", "reviewers": "UX5R;6rM5;xCMa", "site": "https://openreview.net/forum?id=D9E8MKsfhw", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;3;4", "technical_novelty": "2;3;2", "empirical_novelty": "2;3;2", "wc_summary_paper": "63;60;78", "wc_summary_review": "136;72;121", "wc_main_review": "574;243;504", "wc_review": "773;375;703", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1995;1278;522", "reply_reviewers": "0;0;0", "reply_authors": "3;2;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.0, 7.874007874011811 ], "wc_summary_review_avg": [ 109.66666666666667, 27.329267990359508 ], "wc_main_review_avg": [ 440.3333333333333, 142.4320500761289 ], "wc_review_avg": [ 617.0, 173.48967308363535 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1265.0, 601.4199863656013 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1664711701505383608&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Carnegie Mellon University;Universit\u00e9 de Montr\u00e9al;\u00c9cole Polytechnique de Montr\u00e9al;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.cmu.edu;https://www.umontreal.ca;https://www.polymtl.ca;https://www.google.com", "aff_unique_abbr": "CMU;UdeM;Polytechnique Montr\u00e9al;Google", 
"aff_campus_unique_index": "1;2", "aff_campus_unique": ";Montr\u00e9al;Mountain View", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;Canada" }, { "id": "D9SuLzhgK9", "title": "Adam is no better than normalized SGD: Dissecting how adaptivity improves GAN performance", "track": "main", "status": "Reject", "tldr": "", "abstract": " Adaptive methods are widely used for training generative adversarial networks (GAN). While there has been some work to pinpoint the marginal value of adaptive methods in minimization problems, it remains unclear why it is still the method of choice for GAN training. This paper formally studies how adaptive methods help performance in GANs. First, we dissect Adam---the most popular adaptive method for GAN training---by comparing with SGDA the direction and the norm of its update vector. We empirically show that SGDA with the same vector norm as Adam reaches similar or even better performance than the latter. This empirical study encourages us to consider normalized stochastic gradient descent ascent (nSGDA) as a simpler alternative to Adam. We then propose a synthetic theoretical framework to understand why nSGDA yields better performance than SGDA for GANs. In that situation, we prove that a GAN trained with nSGDA provably recovers all the modes of the true distribution. In contrast, the same networks trained with SGDA (and any learning rate configuration) suffers from mode collapsing. The critical insight in our analysis is that normalizing the gradients forces the discriminator and generator to update at the same pace. We empirically show the competitive performance of nSGDA on real-world datasets.\n ", "keywords": "Generative adversarial networks;non-convex optimization", "primary_area": "", "supplementary_material": "/attachment/99ad41e6fbf801665415379ca90c305b66cc252e.zip", "author": "Samy Jelassi;Arthur Mensch;Gauthier Gidel;Yuanzhi Li", "authorids": "~Samy_Jelassi1;~Arthur_Mensch1;~Gauthier_Gidel1;~Yuanzhi_Li1", "gender": "M;M;M;M", "homepage": "https://sjelassi.github.io/;https://gauthiergidel.github.io/;;https://amensch.fr", "dblp": "222/3149;188/6326;73/3628;156/2229", "google_scholar": ";https://scholar.google.fr/citations?user=bDrXQPUAAAAJ;;https://scholar.google.fr/citations?user=F8riAN8AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Samy_Jelassi1;~Gauthier_Gidel1;~Yuanzhi_Li1;~Arthur_Mensch2", "aff": "Princeton University;Mila - Quebec Artificial Intelligence Institute;Carnegie Mellon University;Google DeepMind", "aff_domain": "princeton.edu;mila.quebec;andrew.cmu.edu;deepmind.com", "position": "PhD student;Assistant Professor;Assistant Professor;Researcher", "bibtex": "@misc{\njelassi2022adam,\ntitle={Adam is no better than normalized {SGD}: Dissecting how adaptivity improves {GAN} performance},\nauthor={Samy Jelassi and Arthur Mensch and Gauthier Gidel and Yuanzhi Li},\nyear={2022},\nurl={https://openreview.net/forum?id=D9SuLzhgK9}\n}", "github": "", "project": "", "reviewers": "5J1v;5fFW;QjHh", "site": "https://openreview.net/forum?id=D9SuLzhgK9", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "3;4;4", "correctness": "3;4;4", "technical_novelty": "2;4;4", "empirical_novelty": "2;3;4", "wc_summary_paper": "170;59;76", "wc_summary_review": "33;96;15", "wc_main_review": "381;165;141", "wc_review": "584;320;232", "wc_reply_reviewers": "331;0;0", "wc_reply_authors": "1649;794;400", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 
3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 101.66666666666667, 48.8148429157453 ], "wc_summary_review_avg": [ 48.0, 34.72751070837067 ], "wc_main_review_avg": [ 229.0, 107.92590050585633 ], "wc_review_avg": [ 378.6666666666667, 149.57123906538834 ], "wc_reply_reviewers_avg": [ 110.33333333333333, 156.0348963818315 ], "wc_reply_authors_avg": [ 947.6666666666666, 521.3510227177932 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17846257232977074932&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Princeton University;Quebec Artificial Intelligence Institute;Carnegie Mellon University;Google", "aff_unique_dep": ";Artificial Intelligence;;Google DeepMind", "aff_unique_url": "https://www.princeton.edu;https://mila.quebec;https://www.cmu.edu;https://deepmind.com", "aff_unique_abbr": "Princeton;Mila;CMU;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "United States;Canada;United Kingdom" }, { "id": "D9hpqJyXAi", "title": "Open-sampling: Re-balancing Long-tailed Datasets with Out-of-Distribution Data", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks usually perform poorly when the training dataset suffers from extreme class imbalance. To handle this issue, popular re-sampling methods generally require in-distribution data to balance the class priors. However, obtaining suitable in-distribution data with precise labels for selected classes is challenging. In this paper, we theoretically show that out-of-distribution data (i.e., open-set samples) could be leveraged to augment the minority classes from a Bayesian perspective. Based on this motivation, we propose a novel method called Open-sampling, which utilizes open-set noisy labels to re-balance the class priors of the training dataset. For each open-set instance, the label is sampled from our pre-defined distribution that is complementary to the original class priors. Furthermore, class-dependent weights are generated to provide stronger regularization on the minority classes than on the majority classes. \nWe empirically show that Open-sampling not only re-balances the class prior but also encourages the neural network to learn separable representations. 
Extensive experiments on benchmark datasets demonstrate that our proposed method significantly outperforms existing data re-balancing methods and can be easily incorporated into existing state-of-the-art methods to enhance their performance.", "keywords": "long-tailed recognition;Out-of-Distribution;open-set noisy labels;deep learning", "primary_area": "", "supplementary_material": "/attachment/e0177612c77adada8885248ef01d26ef2c509b2c.zip", "author": "Hongxin Wei;Lue Tao;RENCHUNZI XIE;Lei Feng;Bo An", "authorids": "~Hongxin_Wei1;~Lue_Tao1;~RENCHUNZI_XIE1;~Lei_Feng1;~Bo_An2", "gender": "M;M;;M;M", "homepage": "https://hongxin001.github.io/;http://www.lamda.nju.edu.cn/taol/;;https://lfeng1995.github.io/;https://personal.ntu.edu.sg/boan/", "dblp": "150/6350;247/1090;;76/847-6;42/6178-1.html", "google_scholar": "cABH034AAAAJ;9Cc-vdAAAAAJ;;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;PEEpuNwAAAAJ", "orcid": ";;;0000-0003-2839-5799;0000-0002-7064-7438", "linkedin": ";;;;", "or_profile": "~Hongxin_Wei1;~Lue_Tao1;~RENCHUNZI_XIE1;~Lei_Feng1;~Bo_An2", "aff": "Nanyang Technological University;Nanjing University of Aeronautics and Astronautics;;Chongqing University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;nuaa.edu.cn;;cqu.edu.cn;ntu.edu.sg", "position": "PhD student;MS student;;Full Professor;Full Professor", "bibtex": "@misc{\nwei2022opensampling,\ntitle={Open-sampling: Re-balancing Long-tailed Datasets with Out-of-Distribution Data},\nauthor={Hongxin Wei and Lue Tao and RENCHUNZI XIE and Lei Feng and Bo An},\nyear={2022},\nurl={https://openreview.net/forum?id=D9hpqJyXAi}\n}", "github": "", "project": "", "reviewers": "hWTp;jcgX;ZEQn;jWzC", "site": "https://openreview.net/forum?id=D9hpqJyXAi", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;5;3", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "109;68;94;49", "wc_summary_review": "131;22;32;29", "wc_main_review": "460;267;193;167", "wc_review": "700;357;319;245", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.0, 23.140872930812268 ], "wc_summary_review_avg": [ 53.5, 44.89153595055531 ], "wc_main_review_avg": [ 271.75, 114.71132245772428 ], "wc_review_avg": [ 405.25, 174.87477662601887 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3458572319330373, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11455632836328005554&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Nanyang Technological University;Nanjing University of Aeronautics and Astronautics;Chongqing University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;http://www.nuaa.edu.cn;https://www.cqu.edu.cn", "aff_unique_abbr": "NTU;NUAA;CQU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Singapore;China" }, { "id": "DBOibe1ISzB", "title": 
"SiT: Simulation Transformer for Particle-based Physics Simulation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most existing particle-based simulators adopt graph convolutional networks (GCNs) to model the underlying physics of particles. \nHowever, they force particles to interact with all neighbors without selection, and they fall short in capturing material semantics for different particles, leading to unsatisfactory performance, especially in generalization.\nThis paper proposes Simulation Transformer (SiT) to simulate particle dynamics with more careful modeling of particle states, interactions, and their intrinsic properties.\nSpecifically, besides the particle tokens, SiT generates interaction tokens and selectively focuses on essential interactions by allowing both tokens to attend to each other.\nIn addition, SiT learns material-aware representations by learnable abstract tokens, which will participate in the attention mechanism and boost the generalization capability further.\nWe evaluate our model on diverse environments, including fluid, rigid, and deformable objects, which cover systems of different complexity and materials.\nWithout bells and whistles, SiT shows strong abilities to simulate particles of different materials and achieves superior performance and generalization across these environments with fewer parameters than existing methods. Codes and models will be released.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yidi Shao;Chen Change Loy;Bo Dai", "authorids": "~Yidi_Shao1;~Chen_Change_Loy2;~Bo_Dai2", "gender": "M;M;M", "homepage": "https://github.com/ftbabi;https://www.mmlab-ntu.com/person/ccloy/index.html;http://daibo.info/", "dblp": "291/7118;01/5855;64/2903-2", "google_scholar": ";https://scholar.google.co.uk/citations?user=559LF80AAAAJ;https://scholar.google.com.hk/citations?user=KNWTvgEAAAAJ", "orcid": ";0000-0001-5345-1591;0000-0003-0777-9232", "linkedin": ";;", "or_profile": "~Yidi_Shao1;~Chen_Change_Loy2;~Bo_Dai2", "aff": "Nanyang Technological University;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Full Professor;Research Assistant Professor", "bibtex": "@misc{\nshao2022sit,\ntitle={SiT: Simulation Transformer for Particle-based Physics Simulation},\nauthor={Yidi Shao and Chen Change Loy and Bo Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=DBOibe1ISzB}\n}", "github": "", "project": "", "reviewers": "ZsKn;dYcg;DCt1;yAKg", "site": "https://openreview.net/forum?id=DBOibe1ISzB", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;3;3", "correctness": "2;2;3;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "99;60;37;106", "wc_summary_review": "307;254;17;84", "wc_main_review": "2092;536;55;458", "wc_review": "2498;850;109;648", "wc_reply_reviewers": "1425;0;0;0", "wc_reply_authors": "3919;834;207;1334", "reply_reviewers": "5;0;0;0", "reply_authors": "9;2;1;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 75.5, 28.30635970943632 ], "wc_summary_review_avg": [ 165.5, 118.90016820845966 ], "wc_main_review_avg": [ 785.25, 776.2214165429862 ], "wc_review_avg": [ 1026.25, 891.8425799994077 ], "wc_reply_reviewers_avg": [ 356.25, 617.0431001964125 ], "wc_reply_authors_avg": [ 1573.5, 1411.817357167704 
], "reply_reviewers_avg": [ 1.25, 2.165063509461097 ], "reply_authors_avg": [ 3.5, 3.2015621187164243 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11323718626441030782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Environment Predictive Coding for Visual Navigation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6109", "id": "DBiQQYWykyy", "poster": "", "openreview": "https://openreview.net/forum?id=DBiQQYWykyy", "slides": "https://iclr.cc/virtual/2022/poster/6109", "video": "https://iclr.cc/virtual/2022/poster/6109", "author_site": "Santhosh Kumar Ramakrishnan, Tushar Nagarajan, Ziad Al-Halah, Kristen Grauman", "tldr": "", "abstract": "We introduce environment predictive coding, a self-supervised approach to learn environment-level representations for embodied agents. In contrast to prior work on self-supervised learning for individual images, we aim to encode a 3D environment using a series of images observed by an agent moving in it. We learn these representations via a masked-zone prediction task, which segments an agent\u2019s trajectory into zones and then predicts features of randomly masked zones, conditioned on the agent\u2019s camera poses. This explicit spatial conditioning encourages learning representations that capture the geometric and semantic regularities of 3D environments. We learn such representations on a collection of video walkthroughs and demonstrate successful transfer to multiple downstream navigation tasks. 
Our experiments on the real-world scanned 3D environments of Gibson and Matterport3D show that our method obtains 2-6\u00d7 higher sample-efficiency and up to 57% higher performance over standard image-representation learning.", "keywords": "Self-supervised learning;visual navigation;representation learning", "primary_area": "", "supplementary_material": "/attachment/00b1bea34971cfafc0c359201b1bc9930ef55c83.zip", "author": "Santhosh Kumar Ramakrishnan;Tushar Nagarajan;Ziad Al-Halah;Kristen Grauman", "authorids": "~Santhosh_Kumar_Ramakrishnan1;~Tushar_Nagarajan1;~Ziad_Al-Halah2;~Kristen_Grauman1", "gender": "M;;;F", "homepage": "https://srama2512.github.io/;https://tushar-n.github.io/;https://www.cs.utah.edu/~ziad/;http://www.cs.utexas.edu/~grauman/", "dblp": "199/1913;207/8308;147/2698;57/4553", "google_scholar": "zr9B1YgAAAAJ;KAKqSwIAAAAJ;https://scholar.google.com/citations?hl=en;Jp6Mz1sAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Santhosh_Kumar_Ramakrishnan1;~Tushar_Nagarajan1;~Ziad_Al-Halah2;~Kristen_Grauman1", "aff": "University of Texas, Austin;Meta Facebook;University of Texas at Austin;University of Texas, Austin", "aff_domain": "utexas.edu;fb.com;cs.utexas.edu;utexas.edu", "position": "PhD student;Visiting Researcher;Postdoc;Professor", "bibtex": "@inproceedings{\nramakrishnan2022environment,\ntitle={Environment Predictive Coding for Visual Navigation},\nauthor={Santhosh Kumar Ramakrishnan and Tushar Nagarajan and Ziad Al-Halah and Kristen Grauman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DBiQQYWykyy}\n}", "github": "", "project": "", "reviewers": "JRbw;dLzX;q87t;MWw9", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "2;5;5;4", "correctness": "4;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "56;132;194;57", "wc_summary_review": "60;79;130;23", "wc_main_review": "375;585;604;271", "wc_review": "491;796;928;351", "wc_reply_reviewers": "0;263;533;18", "wc_reply_authors": "440;1281;1453;575", "reply_reviewers": "0;1;2;1", "reply_authors": "1;3;3;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 109.75, 57.58634820858152 ], "wc_summary_review_avg": [ 73.0, 38.58108344772085 ], "wc_main_review_avg": [ 458.75, 140.80194423373564 ], "wc_review_avg": [ 641.5, 230.75582332846986 ], "wc_reply_reviewers_avg": [ 203.5, 216.75619944998112 ], "wc_reply_authors_avg": [ 937.25, 436.64766975216986 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": -1.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10884454032788227715&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=DBiQQYWykyy", "email": "utexas.edu;fb.com;cs.utexas.edu;utexas.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Texas at Austin;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.utexas.edu;https://meta.com", "aff_unique_abbr": "UT Austin;Meta", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0",
"aff_country_unique": "United States" }, { "id": "DF4ebNexXta", "title": "Fine-Tuning from Limited Feedbacks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Instead of learning from scratch, fine-tuning a pre-trained model to fit a related target dataset of interest or downstream tasks has been a handy trick to achieve the desired performance. However, according to the study of~\\cite{song2017machine}, standard fine-tuning may expose the information about target data if the pre-trained model is supplied by a malicious provider. Instead of reckoning that data holders are always expert to select reliable models and execute fine-tuning themselves, this paper confronts this problem by exploring a new learning paradigm named Fine-Tuning from limited FeedBacks (FTFB). The appealing trait of FTFB is that the model tuning does not require directly seeing the target data but leveraging the model performances as feedbacks instead. To learn from query-feedback, we propose to fine-tune the pre-trained model on the parameter distribution with the gradient descent scheme. For the deep models whose tuning parameters distribute across multiple layers, a more query-efficient algorithm is further designed which refines the model layer by layer sequentially with importance weight. Extensive experiments on various tasks demonstrate that the proposed algorithms significantly improve the pre-trained model with limited feedbacks only. For downstream tasks which adopt inconsistent evaluation measurement with pre-training, such as fairness or fault-intolerance, we verify our algorithms can also reach good performance.", "keywords": "Fine-tuning;Query-feedback;Transfer learning;Weakly supervised learning", "primary_area": "", "supplementary_material": "", "author": "Jing Li;Yuangang Pan;Yueming Lyu;Yinghua Yao;Ivor Tsang", "authorids": "~Jing_Li17;~Yuangang_Pan2;~Yueming_Lyu1;~Yinghua_Yao1;~Ivor_Tsang1", "gender": ";;M;;", "homepage": ";;https://yueminglyu.github.io/;;", "dblp": ";;;256/0363;", "google_scholar": ";;uQXB6-oAAAAJ;;", "orcid": ";;;0000-0003-3204-0739;", "linkedin": ";;;;", "or_profile": "~Jing_Li17;~Yuangang_Pan2;~Yueming_Lyu1;~Yinghua_Yao1;~Ivor_Tsang1", "aff": ";;University of Technology Sydney;Southern University of Science and Technology;", "aff_domain": ";;uts.edu.au;sustech.edu.cn;", "position": ";;PhD student;PhD student;", "bibtex": "@misc{\nli2022finetuning,\ntitle={Fine-Tuning from Limited Feedbacks},\nauthor={Jing Li and Yuangang Pan and Yueming Lyu and Yinghua Yao and Ivor Tsang},\nyear={2022},\nurl={https://openreview.net/forum?id=DF4ebNexXta}\n}", "github": "", "project": "", "reviewers": "MoZr;WFGZ;ZjU4", "site": "https://openreview.net/forum?id=DF4ebNexXta", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;4", "correctness": "2;1;3", "technical_novelty": "1;1;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "72;93;112", "wc_summary_review": "42;47;18", "wc_main_review": "236;783;303", "wc_review": "350;923;433", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 92.33333333333333, 16.33673433979046 ], "wc_summary_review_avg": [ 35.666666666666664, 12.657891697365017 ], "wc_main_review_avg": [ 440.6666666666667, 
243.60669576639765 ], "wc_review_avg": [ 568.6666666666666, 252.83239947091874 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4atbrZCBTDMJ:scholar.google.com/&scioq=Fine-Tuning+from+Limited+Feedbacks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Technology Sydney;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.uts.edu.au;https://www.sustech.edu.cn", "aff_unique_abbr": "UTS;SUSTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Australia;China" }, { "id": "DFYtZFo_1u", "title": "Federated Inference through Aligning Local Representations and Learning a Consensus Graph", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning is faced with many data challenges when applied in practice. Among them, a notable barrier is that data are distributed and sharing is unrealistic for volume and privacy reasons. Federated learning is a recent formalism to tackle this challenge, so that data owners can develop a common model jointly but use it separately. In this work, we consider a less addressed scenario where a datum consists of multiple parts, each of which belongs to a separate owner. In this scenario, joint efforts are required not only in learning but also in inference. We study \\emph{federated inference}, which allows each data owner to learn its own model that captures local data characteristics and copes with data heterogeneity. On the top is a federation of the local data representations, performing global inference that incorporates all distributed parts collectively. To enhance this local--global framework, we propose aligning the ambiguous data representations caused by arbitrary arrangement of neurons in local neural network models, as well as learning a consensus graph among data owners in the global model to improve performance. 
We demonstrate the effectiveness of the proposed framework on four real-life data sets, including power grid systems and traffic networks.", "keywords": "Federated inference;local latent representation;feature alignment;graph structure learning;Gumbel softmax", "primary_area": "", "supplementary_material": "", "author": "Tengfei Ma;Trong Nghia Hoang;Jie Chen", "authorids": "~Tengfei_Ma1;~Trong_Nghia_Hoang1;~Jie_Chen1", "gender": "M;;M", "homepage": "https://sites.google.com/site/matf0123/;https://jiechenjiechen.github.io;https://htnghia87.github.io/", "dblp": "94/9023-1;92/6289-7;62/540", "google_scholar": "9OvNakkAAAAJ;Z-lkme8AAAAJ;E-kZZeQAAAAJ", "orcid": "0000-0002-1086-529X;;", "linkedin": ";;", "or_profile": "~Tengfei_Ma1;~Jie_Chen1;~Nghia_Hoang2", "aff": "International Business Machines;International Business Machines;Amazon", "aff_domain": "ibm.com;ibm.com;amazon.com", "position": "Researcher;Research Staff Member;Senior Machine Learning Scientist", "bibtex": "@misc{\nma2022federated,\ntitle={Federated Inference through Aligning Local Representations and Learning a Consensus Graph},\nauthor={Tengfei Ma and Trong Nghia Hoang and Jie Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=DFYtZFo_1u}\n}", "github": "", "project": "", "reviewers": "oPhp;hLWJ;ePJ1;ETbu", "site": "https://openreview.net/forum?id=DFYtZFo_1u", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;5;3", "correctness": "2;2;2;3", "technical_novelty": "3;1;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "146;61;146;66", "wc_summary_review": "108;50;46;40", "wc_main_review": "421;221;328;404", "wc_review": "675;332;520;510", "wc_reply_reviewers": "0;15;0;0", "wc_reply_authors": "472;527;626;487", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 104.75, 41.287861412284364 ], "wc_summary_review_avg": [ 61.0, 27.367864366808018 ], "wc_main_review_avg": [ 343.5, 78.91926254090316 ], "wc_review_avg": [ 509.25, 121.45652514377315 ], "wc_reply_reviewers_avg": [ 3.75, 6.49519052838329 ], "wc_reply_authors_avg": [ 528.0, 60.045815840906016 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8752310709894260426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "International Business Machines Corporation;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.ibm.com;https://www.amazon.com", "aff_unique_abbr": "IBM;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "DHLngM1mR3W", "title": "AAVAE: Augmentation-Augmented Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent methods for self-supervised learning can be grouped into two paradigms: contrastive and non-contrastive approaches. Their success can largely be attributed to data augmentation pipelines which generate multiple views of a single input that preserve the underlying semantics.
In this work, we introduce augmentation-augmented variational autoencoders (AAVAE), yet another alternative to self-supervised learning, based on autoencoding. We derive AAVAE starting from the conventional variational autoencoder (VAE), by replacing the KL divergence regularization, which is agnostic to the input domain, with data augmentations that explicitly encourage the internal representations to encode domain-specific invariances and equivariances. We empirically evaluate the proposed AAVAE on image classification, similar to how recent contrastive and non-contrastive learning algorithms have been evaluated. Our experiments confirm the effectiveness of data augmentation as a replacement for KL divergence regularization. The AAVAE outperforms the VAE by 30% on CIFAR-10, 40% on STL-10 and 45% on Imagenet. On CIFAR-10 and STL-10, the results for AAVAE are largely comparable to the state-of-the-art algorithms for self-supervised learning.", "keywords": "Self-Supervised Learning;Autoencoders;Variational Autoencoders;Data Augmentation", "primary_area": "", "supplementary_material": "", "author": "William Alejandro Falcon;Ananya Harsh Jha;Teddy Koker;Kyunghyun Cho", "authorids": "~William_Alejandro_Falcon1;~Ananya_Harsh_Jha2;~Teddy_Koker1;~Kyunghyun_Cho1", "gender": "M;M;M;M", "homepage": "http://williamfalcon.com;;https://teddykoker.com;http://kyunghyuncho.me", "dblp": "207/8553;;283/5878;41/9736", "google_scholar": "0ngL-30AAAAJ;KK_RffoAAAAJ;br990A8AAAAJ;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ", "orcid": ";;;", "linkedin": "wfalcon/;ananyaharshjha/;teddykoker/;", "or_profile": "~William_Alejandro_Falcon1;~Ananya_Harsh_Jha2;~Teddy_Koker1;~Kyunghyun_Cho1", "aff": "New York University;PyTorch Lightning;MIT Lincoln Laboratory, Massachusetts Institute of Technology;New York University", "aff_domain": "nyu.edu;pytorchlightning.ai;ll.mit.edu;nyu.edu", "position": "PhD student;Researcher;Researcher;Associate Professor", "bibtex": "@misc{\nfalcon2022aavae,\ntitle={{AAVAE}: Augmentation-Augmented Variational Autoencoders},\nauthor={William Alejandro Falcon and Ananya Harsh Jha and Teddy Koker and Kyunghyun Cho},\nyear={2022},\nurl={https://openreview.net/forum?id=DHLngM1mR3W}\n}", "github": "", "project": "", "reviewers": "cQeC;Dk62;ebYf;qnqK;61ZD", "site": "https://openreview.net/forum?id=DHLngM1mR3W", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "5;5;4;3;4", "correctness": "4;1;2;2;3", "technical_novelty": "2;1;2;3;3", "empirical_novelty": "2;1;3;2;3", "wc_summary_paper": "83;63;47;40;49", "wc_summary_review": "51;35;11;4;35", "wc_main_review": "331;346;546;110;401", "wc_review": "465;444;604;154;485", "wc_reply_reviewers": "0;0;114;0;0", "wc_reply_authors": "742;779;697;227;745", "reply_reviewers": "0;0;1;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 2.4, 1.019803902718557 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 56.4, 15.252540771950095 ], "wc_summary_review_avg": [ 27.2, 17.255723688098392 ], "wc_main_review_avg": [ 346.8, 140.67181665138187 ], "wc_review_avg": [ 430.4, 148.92763343315437 ], "wc_reply_reviewers_avg": [ 22.8, 45.6 ], "wc_reply_authors_avg": [ 638.0, 207.14632509412277 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 
-0.7637626158259733, "corr_recommendation_correctness": 0.08006407690254354, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "New York University;PyTorch Lightning;Massachusetts Institute of Technology", "aff_unique_dep": ";;Lincoln Laboratory", "aff_unique_url": "https://www.nyu.edu;https://pytorch.org/lightning;https://web.mit.edu", "aff_unique_abbr": "NYU;PyTorch Lightning;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "Controlling Directions Orthogonal to a Classifier", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6557", "id": "DIjCrlsu6Z", "poster": "", "openreview": "https://openreview.net/forum?id=DIjCrlsu6Z", "slides": "https://iclr.cc/virtual/2022/poster/6557", "video": "https://iclr.cc/virtual/2022/poster/6557", "author_site": "Yilun Xu, Hao He, Tianxiao Shen, Tommi Jaakkola", "tldr": "", "abstract": "We propose to identify directions invariant to a given classifier so that these directions can be controlled in tasks such as style transfer. While orthogonal decomposition is directly identifiable when the given classifier is linear, we formally define a notion of orthogonality in the non-linear case. We also provide a surprisingly simple method for constructing the orthogonal classifier (a classifier utilizing directions other than those of the given classifier). Empirically, we present three use cases where controlling orthogonal variation is important: style transfer, domain adaptation, and fairness. The orthogonal classifier enables desired style transfer when domains vary in multiple aspects, improves domain adaptation with label shifts and mitigates the unfairness as a predictor. The code is available at https://github.com/Newbeeer/orthogonal_classifier", "keywords": "orthogonal classifier;invariance", "primary_area": "", "supplementary_material": "/attachment/9dd0aa8269f9ea535a0d6590ab6567747052c9c6.zip", "author": "Yilun Xu;Hao He;Tianxiao Shen;Tommi S. Jaakkola", "authorids": "~Yilun_Xu1;~Hao_He1;~Tianxiao_Shen1;~Tommi_S._Jaakkola1", "gender": "M;M;;", "homepage": "http://yilun-xu.com;http://people.csail.mit.edu/hehaodele;https://shentianxiao.github.io/;", "dblp": ";;185/5533;", "google_scholar": ";https://scholar.google.com/citations?hl=en;aYtllNgAAAAJ;", "orcid": ";;;", "linkedin": ";;shentianxiao/;", "or_profile": "~Yilun_Xu1;~Hao_He1;~Tianxiao_Shen1;~Tommi_S._Jaakkola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;", "aff_domain": "mit.edu;mit.edu;mit.edu;", "position": "PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\nxu2022controlling,\ntitle={Controlling Directions Orthogonal to a Classifier},\nauthor={Yilun Xu and Hao He and Tianxiao Shen and Tommi S. 
Jaakkola},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DIjCrlsu6Z}\n}", "github": "", "project": "", "reviewers": "dNRJ;fQv4;SmEk", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;3", "correctness": "3;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "172;61;50", "wc_summary_review": "46;69;25", "wc_main_review": "160;733;179", "wc_review": "378;863;254", "wc_reply_reviewers": "18;489;14", "wc_reply_authors": "300;876;639", "reply_reviewers": "1;2;1", "reply_authors": "1;3;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 94.33333333333333, 55.10192575783738 ], "wc_summary_review_avg": [ 46.666666666666664, 17.96910929592474 ], "wc_main_review_avg": [ 357.3333333333333, 265.7496733561283 ], "wc_review_avg": [ 498.3333333333333, 262.78043221586256 ], "wc_reply_reviewers_avg": [ 173.66666666666666, 222.98031801534015 ], "wc_reply_authors_avg": [ 605.0, 236.37681781426875 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14918052753119850605&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=DIjCrlsu6Z", "email": "mit.edu;mit.edu;mit.edu;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "DIsWHvtU7lF", "title": "Composing Partial Differential Equations with Physics-Aware Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a compositional physics-aware neural network (FINN) for learning spatiotemporal advection-diffusion processes. FINN implements a new way of combining the learning abilities of artificial neural networks with physical and structural knowledge from numerical simulation by modeling the constituents of partial differential equations (PDEs) in a compositional manner. Results on both one- and two-dimensional PDEs (Burgers', diffusion-sorption, diffusion-reaction) demonstrate FINN's superior process modeling accuracy and excellent out-of-distribution generalization ability beyond initial and boundary conditions. With only one tenth of the number of parameters on average, FINN outperforms pure machine learning and other state-of-the-art physics-aware models in all cases---often even by multiple orders of magnitude.
Moreover, FINN outperforms a calibrated physical model when approximating sparse real-world data in a diffusion-sorption scenario, confirming its generalization abilities and showing explanatory potential by revealing the unknown retardation factor of the observed process.", "keywords": "physics-aware neural networks;partial differential equations;advection-diffusion equations;learning constituents;out-of-distribution generalization", "primary_area": "", "supplementary_material": "/attachment/c8a43bcb42aafb62f26952ba943db0c979e82461.zip", "author": "Matthias Karlbauer;Timothy Praditia;Sebastian Otte;Sergey Oladyshkin;Wolfgang Nowak;Martin V. Butz", "authorids": "~Matthias_Karlbauer1;~Timothy_Praditia1;~Sebastian_Otte1;~Sergey_Oladyshkin1;~Wolfgang_Nowak1;~Martin_V._Butz2", "gender": "M;M;;M;M;M", "homepage": "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/cognitive-modeling/staff/matthias-karlbauer/;https://www.iws.uni-stuttgart.de/en/institute/team/Praditia-00001/;;http://www.iws.uni-stuttgart.de;https://www.iws.uni-stuttgart.de/en/institute/team/Nowak-00003/;https://cm.inf.uni-tuebingen.de", "dblp": ";;;;59/10398;b/MartinVButz.html", "google_scholar": ";kIKsjt8AAAAJ;;;Cb-GaJoAAAAJ;https://scholar.google.de/citations?user=dIcpfzAAAAAJ", "orcid": "0000-0002-4509-7921;0000-0003-3619-9122;;;0000-0003-2583-8865;0000-0002-8120-8537", "linkedin": ";timothypraditia/;;;wolfgang-nowak-67a99a107/;martin-butz-85b971150/", "or_profile": "~Matthias_Karlbauer1;~Timothy_Praditia1;~Sebastian_Otte1;~Sergey_Oladyshkin1;~Wolfgang_Nowak1;~Martin_V._Butz2", "aff": "University of Washington;University of Stuttgart;;University of Stuttgart;Universit\u00e4t Stuttgart;University of Tuebingen", "aff_domain": "u.washington.edu;uni-stuttgart.de;;uni-stuttgart.de;uni-stuttgart.de;uni-tuebingen.de", "position": "Intern;PhD student;;Professor;Full Professor;Full Professor", "bibtex": "@misc{\nkarlbauer2022composing,\ntitle={Composing Partial Differential Equations with Physics-Aware Neural Networks},\nauthor={Matthias Karlbauer and Timothy Praditia and Sebastian Otte and Sergey Oladyshkin and Wolfgang Nowak and Martin V. 
Butz},\nyear={2022},\nurl={https://openreview.net/forum?id=DIsWHvtU7lF}\n}", "github": "", "project": "", "reviewers": "rFkv;TVaa;LWqu;j4js", "site": "https://openreview.net/forum?id=DIsWHvtU7lF", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;2;3;4", "correctness": "4;3;4;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "42;71;62;62", "wc_summary_review": "50;28;39;37", "wc_main_review": "353;183;150;203", "wc_review": "445;282;251;302", "wc_reply_reviewers": "235;0;17;56", "wc_reply_authors": "1686;374;900;1011", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 59.25, 10.615436872781073 ], "wc_summary_review_avg": [ 38.5, 7.826237921249264 ], "wc_main_review_avg": [ 222.25, 77.82472293558133 ], "wc_review_avg": [ 320.0, 74.42109915877352 ], "wc_reply_reviewers_avg": [ 77.0, 93.45319684205565 ], "wc_reply_authors_avg": [ 992.75, 467.0071600093515 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5219761110162787549&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "University of Washington;University of Stuttgart;University of Tuebingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.washington.edu;https://www.uni-stuttgart.de;https://www.uni-tuebingen.de/", "aff_unique_abbr": "UW;USTuttgart;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Germany" }, { "title": "On the Importance of Firth Bias Reduction in Few-Shot Classification", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6221", "id": "DNRADop4ksB", "poster": "", "openreview": "https://openreview.net/forum?id=DNRADop4ksB", "slides": "https://iclr.cc/virtual/2022/poster/6221", "video": "https://iclr.cc/virtual/2022/poster/6221", "author_site": "Saba Ghaffari, Ehsan Saleh, David Forsyth, Yu-Xiong Wang", "tldr": "", "abstract": "Learning accurate classifiers for novel categories from very few examples, known as few-shot image classification, is a challenging task in statistical machine learning and computer vision. The performance in few-shot classification suffers from the bias in the estimation of classifier parameters; however, an effective underlying bias reduction technique that could alleviate this issue in training few-shot classifiers has been overlooked. In this work, we demonstrate the effectiveness of Firth bias reduction in few-shot classification. Theoretically, Firth bias reduction removes the $O(N^{-1})$ first order term from the small-sample bias of the Maximum Likelihood Estimator. Here we show that the general Firth bias reduction technique simplifies to encouraging uniform class assignment probabilities for multinomial logistic classification, and almost has the same effect in cosine classifiers. 
We derive an easy-to-implement optimization objective for Firth penalized multinomial logistic and cosine classifiers, which is equivalent to penalizing the cross-entropy loss with a KL-divergence between the predictions and the uniform label distribution. Then, we empirically evaluate that it is consistently effective across the board for few-shot image classification, regardless of (1) the feature representations from different backbones, (2) the number of samples per class, and (3) the number of classes. Furthermore, we demonstrate the effectiveness of Firth bias reduction on cross-domain and imbalanced data settings. Our implementation is available at https://github.com/ehsansaleh/firth_bias_reduction.", "keywords": "Few-shot Classification;Firth Regularization;MLE Bias", "primary_area": "", "supplementary_material": "", "author": "Saba Ghaffari;Ehsan Saleh;David Forsyth;Yu-Xiong Wang", "authorids": "~Saba_Ghaffari1;~Ehsan_Saleh1;~David_Forsyth1;~Yu-Xiong_Wang1", "gender": "F;;M;", "homepage": ";;https://cs.illinois.edu/directory/profile/daf;https://yxw.cs.illinois.edu/", "dblp": ";;f/DavidAForsyth;35/10700", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com.tw/citations?user=5H0arvkAAAAJ;T_Q-xDkAAAAJ", "orcid": ";;0000-0002-2278-0752;", "linkedin": "saba-ghaffari-171a3356/;;;", "or_profile": "~Saba_Ghaffari1;~Ehsan_Saleh1;~David_Forsyth1;~Yu-Xiong_Wang1", "aff": "University of Illinois, Urbana Champaign;;University of Illinois, Urbana-Champaign;Department of Computer Science, University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu;;uiuc.edu;cs.illinois.edu", "position": "PhD student;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nghaffari2022on,\ntitle={On the Importance of Firth Bias Reduction in Few-Shot Classification},\nauthor={Saba Ghaffari and Ehsan Saleh and David Forsyth and Yu-Xiong Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DNRADop4ksB}\n}", "github": "", "project": "", "reviewers": "s811;4kx4;37fi;gyef", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;3;3", "correctness": "4;3;4;4", "technical_novelty": "3;4;4;3", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "101;106;52;111", "wc_summary_review": "19;50;9;82", "wc_main_review": "185;349;257;194", "wc_review": "305;505;318;387", "wc_reply_reviewers": "0;24;0;21", "wc_reply_authors": "436;901;1051;717", "reply_reviewers": "0;1;0;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.5, 23.648467180770936 ], "wc_summary_review_avg": [ 40.0, 28.574464124459098 ], "wc_main_review_avg": [ 246.25, 65.48807143289532 ], "wc_review_avg": [ 378.75, 79.27286736330407 ], "wc_reply_reviewers_avg": [ 11.25, 11.299889379989523 ], "wc_reply_authors_avg": [ 776.25, 229.30915267385208 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9186667972571213142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=DNRADop4ksB", 
"email": "illinois.edu;;uiuc.edu;cs.illinois.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Illinois", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://illinois.edu", "aff_unique_abbr": "UIUC;UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "DOrrKPEDnBp", "title": "The Impact of Spatiotemporal Augmentations on Self-Supervised Audiovisual Representation Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive learning of auditory and visual perception has been extremely successful when investigated individually. However, there are still major questions on how we could integrate principles learned from both domains to attain effective audiovisual representations. In this paper, we present a contrastive framework to learn audiovisual representations from unlabeled videos. The type and strength of augmentations utilized during self-supervised pre-training play a crucial role for contrastive frameworks to work sufficiently. Hence, we extensively investigate composition of temporal augmentations suitable for learning audiovisual representations; we find lossy spatio-temporal transformations that do not corrupt the temporal coherency of videos are the most effective. Furthermore, we show that the effectiveness of these transformations scales with higher temporal resolution and stronger transformation intensity. Compared to self-supervised models pre-trained on only sampling-based temporal augmentation, self-supervised models pre-trained with our temporal augmentations lead to approximately 6.5% gain on linear classifier performance on AVE dataset. 
Lastly, we show that despite their simplicity, our proposed transformations work well across self-supervised learning frameworks (SimSiam, MoCoV3, etc.) and the benchmark dataset (AVE).", "keywords": "Representation Learning;Audiovisual;Self-Supervised Learning;Spatiotemporal Augmentations", "primary_area": "", "supplementary_material": "", "author": "Haider Al-Tahan;Yalda Mohsenzadeh", "authorids": "~Haider_Al-Tahan2;~Yalda_Mohsenzadeh1", "gender": ";female", "homepage": "https://haideraltahan.com;https://mohsenzadehlab.com/people", "dblp": "276/6441;", "google_scholar": "https://scholar.google.ca/citations?user=kfpoNrEAAAAJ;xZIgSigAAAAJ", "orcid": ";0000-0001-8525-957X", "linkedin": ";", "or_profile": "~Haider_Al-Tahan2;~Yalda_Mohsenzadeh1", "aff": "University of Western Ontario;University of Western Ontario", "aff_domain": "uwo.ca;uwo.ca", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nal-tahan2022the,\ntitle={The Impact of Spatiotemporal Augmentations on Self-Supervised Audiovisual Representation Learning},\nauthor={Haider Al-Tahan and Yalda Mohsenzadeh},\nyear={2022},\nurl={https://openreview.net/forum?id=DOrrKPEDnBp}\n}", "github": "", "project": "", "reviewers": "tD7g;76MK;TeDB;bKjb", "site": "https://openreview.net/forum?id=DOrrKPEDnBp", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;3;4", "correctness": "2;3;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "66;58;49;134", "wc_summary_review": "46;92;94;66", "wc_main_review": "386;477;317;286", "wc_review": "498;627;460;486", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 76.75, 33.5959446957516 ], "wc_summary_review_avg": [ 74.5, 19.817921182606415 ], "wc_main_review_avg": [ 366.5, 73.35018745715651 ], "wc_review_avg": [ 517.75, 64.55375666837678 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RvE_qlcNXV4J:scholar.google.com/&scioq=The+Impact+of+Spatiotemporal+Augmentations+on+Self-Supervised+Audiovisual+Representation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Western Ontario", "aff_unique_dep": "", "aff_unique_url": "https://www.uwo.ca", "aff_unique_abbr": "UWO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "DSCsslei9r", "title": "Multi-modal Self-supervised Pre-training for Regulatory Genome Across Cell Types", "track": "main", "status": "Reject", "tldr": "", "abstract": "In genome biology research, regulatory genome modeling is an important topic for many regulatory downstream tasks, such as promoter classification and transcription factor binding sites prediction. The core problem is to model how regulatory elements interact with each other and how these interactions vary across different cell types.
However, current deep learning methods often focus on modeling genome sequences of a fixed set of cell types and do not account for the interaction between multiple regulatory elements, making them perform well only on the cell types in the training set and lack the generalizability required in biological applications. In this work, we propose a simple yet effective approach for pre-training genome data in a multi-modal and self-supervised manner, which we call $\\textbf{\\texttt{GeneBERT}}$. Specifically, we simultaneously take the 1d sequence of genome data and a 2d matrix of (transcription factors \u00d7 regions) as the input, where three pre-training tasks are proposed to improve the robustness and generalizability of our model. We pre-train our model on the ATAC-seq dataset with 17 million genome sequences. We evaluate our GeneBERT on regulatory downstream tasks across different cell types, including promoter classification, transcription factor binding sites prediction, disease risk estimation, and splicing sites prediction. Extensive experiments demonstrate the effectiveness of multi-modal and self-supervised pre-training for large-scale regulatory genomics data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shentong Mo;Xi Fu;Chenyang Hong;Yizhen Chen;Yuxuan Zheng;Xiangru Tang;Yanyan Lan;Zhiqiang Shen;Eric Xing", "authorids": "~Shentong_Mo1;~Xi_Fu1;~Chenyang_Hong2;~Yizhen_Chen1;~Yuxuan_Zheng1;~Xiangru_Tang2;~Yanyan_Lan2;~Zhiqiang_Shen1;~Eric_Xing1", "gender": ";;;F;;M;;;M", "homepage": ";http://fuxialexander.github.io;;;;https://xiangrutang.github.io/;;;http://www.cs.cmu.edu/~epxing/", "dblp": ";;;;;246/8064;00/6040.html;;36/3855", "google_scholar": ";;;dnZwP6YAAAAJ;;;;;https://scholar.google.com.tw/citations?user=5pKTRxEAAAAJ", "orcid": ";;;;;;;;", "linkedin": ";;chenyang-hong-b54369102/;;;;;;", "or_profile": "~Shentong_Mo1;~Xi_Fu1;~Chenyang_Hong2;~Yizhen_Chen1;~Yuxuan_Zheng1;~Xiangru_Tang2;~Yanyan_Lan2;~Zhiqiang_Shen1;~Eric_Xing1", "aff": ";Columbia University;The Chinese University of Hong Kong;The Chinese University of Hong Kong;;Yale University;Tsinghua University;;School of Computer Science, Carnegie Mellon University", "aff_domain": ";columbia.edu;cuhk.edu.hk;cuhk.edu.hk;;yale.edu;tsinghua.edu.cn;;cs.cmu.edu", "position": ";PhD student;PhD student;PhD student;;MS student;Full Professor;;Full Professor", "bibtex": "@misc{\nmo2022multimodal,\ntitle={Multi-modal Self-supervised Pre-training for Regulatory Genome Across Cell Types},\nauthor={Shentong Mo and Xi Fu and Chenyang Hong and Yizhen Chen and Yuxuan Zheng and Xiangru Tang and Yanyan Lan and Zhiqiang Shen and Eric Xing},\nyear={2022},\nurl={https://openreview.net/forum?id=DSCsslei9r}\n}", "github": "", "project": "", "reviewers": "9xx4;omfC;9J4F;FiEi", "site": "https://openreview.net/forum?id=DSCsslei9r", "pdf_size": 0, "recommendation": "1;3;6;6", "confidence": "5;5;4;4", "correctness": "2;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;1;2;2", "wc_summary_paper": "74;53;117;147", "wc_summary_review": "48;30;144;51", "wc_main_review": "167;523;456;183", "wc_review": "289;606;717;381", "wc_reply_reviewers": "0;43;38;23", "wc_reply_authors": "833;756;288;232", "reply_reviewers": "0;1;1;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.0, 2.1213203435596424 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [
97.75, 36.615399765672365 ], "wc_summary_review_avg": [ 68.25, 44.46557657334491 ], "wc_main_review_avg": [ 332.25, 159.12475451669988 ], "wc_review_avg": [ 498.25, 171.02247659299056 ], "wc_reply_reviewers_avg": [ 26.0, 16.718253497300488 ], "wc_reply_authors_avg": [ 527.25, 269.3616295985752 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.9428090415820635, "corr_recommendation_correctness": 0.5685352436149612, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3196852034012778454&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;2;3;4", "aff_unique_norm": "Columbia University;Chinese University of Hong Kong;Yale University;Tsinghua University;Carnegie Mellon University", "aff_unique_dep": ";;;;School of Computer Science", "aff_unique_url": "https://www.columbia.edu;https://www.cuhk.edu.hk;https://www.yale.edu;https://www.tsinghua.edu.cn;https://www.cmu.edu", "aff_unique_abbr": "Columbia;CUHK;Yale;THU;CMU", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Hong Kong SAR;Pittsburgh", "aff_country_unique_index": "0;1;1;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Online Facility Location with Predictions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7174", "id": "DSQHjibtgKR", "poster": "", "openreview": "https://openreview.net/forum?id=DSQHjibtgKR", "slides": "https://iclr.cc/virtual/2022/poster/7174", "video": "https://iclr.cc/virtual/2022/poster/7174", "author_site": "Shaofeng Jiang, Erzhi Liu, You Lyu, Zhihao Tang, Yubo Zhang", "tldr": "", "abstract": "We provide nearly optimal algorithms for online facility location (OFL) with predictions. In OFL, $n$ demand points arrive in order and the algorithm must irrevocably assign each demand point to an open facility upon its arrival. The objective is to minimize the total connection costs from demand points to assigned facilities plus the facility opening cost. We further assume the algorithm is additionally given for each demand point $x_i$ a natural prediction $f_{x_i}^{\\mathrm{pred}}$ which is supposed to be the facility $f_{x_i}^{\\mathrm{opt}}$ that serves $x_i$ in the offline optimal solution.\n\nOur main result is an $O(\\min\\{\\log {\\frac{n\\eta_\\infty}{\\mathrm{OPT}}}, \\log{n} \\})$-competitive algorithm where $\\eta_\\infty$ is the maximum prediction error (i.e., the distance between $f_{x_i}^{\\mathrm{pred}}$ and $f_{x_i}^{\\mathrm{opt}}$). Our algorithm overcomes the fundamental $\\Omega(\\frac{\\log n}{\\log \\log n})$ lower bound of OFL (without predictions) when $\\eta_\\infty$ is small, and it still maintains $O(\\log n)$ ratio even when $\\eta_\\infty$ is unbounded. Furthermore, our theoretical analysis is supported by empirical evaluations for the tradeoffs between $\\eta_\\infty$ and the competitive ratio on various real datasets of different types.", "keywords": "online algorithms;facility location;prediction;learning-augmented", "primary_area": "", "supplementary_material": "/attachment/348bec02cc49ca65158df822f2ef8909b22eebc9.zip", "author": "Shaofeng H.-C. 
Jiang;Erzhi Liu;You Lyu;Zhihao Gavin Tang;Yubo Zhang", "authorids": "~Shaofeng_H.-C._Jiang1;~Erzhi_Liu1;~You_Lyu1;~Zhihao_Gavin_Tang1;~Yubo_Zhang4", "gender": "M;M;;;M", "homepage": "https://shaofengjiang.cn;https://erzhiliu.xyz;;;http://saigyouji.github.io/", "dblp": "157/6062;;;164/1771;", "google_scholar": ";;;;", "orcid": "0000-0001-7972-827X;;0000-0002-8148-3643;;", "linkedin": ";;;;", "or_profile": "~Shaofeng_H.-C._Jiang1;~Erzhi_Liu1;~You_Lyu1;~Zhihao_Gavin_Tang1;~Yubo_Zhang4", "aff": "Peking University;Shanghai Jiaotong University;Shanghai Jiaotong University;ITCS, Shanghai University of Finance and Economics;Peking University", "aff_domain": "pku.edu.cn;sjtu.edu.cn;sjtu.edu;mail.shufe.edu.cn;pku.edu.cn", "position": "Assistant Professor;Undergrad student;Undergrad student;Associate Professor;Undergrad student", "bibtex": "@inproceedings{\njiang2022online,\ntitle={Online Facility Location with Predictions},\nauthor={Shaofeng H.-C. Jiang and Erzhi Liu and You Lyu and Zhihao Gavin Tang and Yubo Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DSQHjibtgKR}\n}", "github": "", "project": "", "reviewers": "m2EN;3sjS;TTSB;m1RY;tWdT;DWJH", "pdf_size": 0, "recommendation": "6;6;6;6;8;8", "confidence": "4;4;5;4;4;4", "correctness": "3;4;4;4;4;4", "technical_novelty": "3;3;3;3;3;3", "empirical_novelty": "2;4;1;2;2;3", "wc_summary_paper": "550;223;33;305;61;324", "wc_summary_review": "117;157;76;35;50;38", "wc_main_review": "737;518;244;481;266;456", "wc_review": "1404;898;353;821;377;818", "wc_reply_reviewers": "49;0;0;0;0;0", "wc_reply_authors": "397;788;516;772;290;153", "reply_reviewers": "1;0;0;0;0;0", "reply_authors": "1;1;1;1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.166666666666667, 0.372677996249965 ], "correctness_avg": [ 3.8333333333333335, 0.3726779962499649 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 249.33333333333334, 174.18253516227037 ], "wc_summary_review_avg": [ 78.83333333333333, 44.71173099857451 ], "wc_main_review_avg": [ 450.3333333333333, 165.46970182550712 ], "wc_review_avg": [ 778.5, 353.824226605999 ], "wc_reply_reviewers_avg": [ 8.166666666666666, 18.26122181624828 ], "wc_reply_authors_avg": [ 486.0, 234.92055394679 ], "reply_reviewers_avg": [ 0.16666666666666666, 0.372677996249965 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.31622776601683794, "corr_recommendation_correctness": 0.31622776601683805, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15324871102823156211&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=DSQHjibtgKR", "email": "pku.edu.cn;sjtu.edu.cn;sjtu.edu;mail.shufe.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Peking University;Shanghai Jiao Tong University;Shanghai University of Finance and Economics", "aff_unique_dep": ";;ITCS", "aff_unique_url": "http://www.pku.edu.cn;https://www.sjtu.edu.cn;http://www.sufe.edu.cn", "aff_unique_abbr": "Peking U;SJTU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Actor-Critic Policy Optimization in a Large-Scale Imperfect-Information Game", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6627", "id": "DTXZqTNV5nW", "poster": "", "openreview": "https://openreview.net/forum?id=DTXZqTNV5nW", "slides": "https://iclr.cc/virtual/2022/poster/6627", "video": "https://iclr.cc/virtual/2022/poster/6627", "author_site": "Haobo Fu, Weiming Liu, Shuang Wu, Yijia Wang, Tao Yang, Kai Li, Junliang Xing, Bin Li, Bo Ma, QIANG FU, Yang Wei", "tldr": "", "abstract": "The deep policy gradient method has demonstrated promising results in many large-scale games, where the agent learns purely from its own experience. Yet, policy gradient methods with self-play suffer convergence problems to a Nash Equilibrium (NE) in multi-agent situations. Counterfactual regret minimization (CFR) has a convergence guarantee to a NE in 2-player zero-sum games, but it usually needs domain-specific abstractions to deal with large-scale games. Inheriting merits from both methods, in this paper we extend the actor-critic algorithm framework in deep reinforcement learning to tackle a large-scale 2-player zero-sum imperfect-information game, 1-on-1 Mahjong, whose information set size and game length are much larger than poker. The proposed algorithm, named Actor-Critic Hedge (ACH), modifies the policy optimization objective from originally maximizing the discounted returns to minimizing a type of weighted cumulative counterfactual regret. This modification is achieved by approximating the regret via a deep neural network and minimizing the regret via generating self-play policies using Hedge. ACH is theoretically justified as it is derived from a neural-based weighted CFR, for which we prove the convergence to a NE under certain conditions. Experimental results on the proposed 1-on-1 Mahjong benchmark and benchmarks from the literature demonstrate that ACH outperforms related state-of-the-art methods. 
Also, the agent obtained by ACH defeats a human champion in 1-on-1 Mahjong.", "keywords": "Policy Optimization;Nash Equilibrium;Mahjong AI", "primary_area": "", "supplementary_material": "/attachment/9c55b719595babe4f312b0161fc0b181bde70b0e.zip", "author": "Haobo Fu;Weiming Liu;Shuang Wu;Yijia Wang;Tao Yang;Kai Li;Junliang Xing;Bin Li;Bo Ma;QIANG FU;Yang Wei", "authorids": "~Haobo_Fu2;~Weiming_Liu3;~Shuang_Wu3;~Yijia_Wang1;~Tao_Yang10;~Kai_Li2;~Junliang_Xing1;~Bin_Li8;~Bo_Ma4;~QIANG_FU8;~Yang_Wei2", "gender": "M;M;;M;M;M;M;;M;M;M", "homepage": ";;;;;http://people.ucas.ac.cn/~jlxing?language=en;http://staff.ustc.edu.cn/~binli;;;;", "dblp": "85/8571;00/105-4.html;;;181/2853;43/7659.html;89/6764-25;;;03/1094-32.html;85/3231", "google_scholar": "LFdJXNcAAAAJ;fIPGDMMAAAAJ;ea5X3y8AAAAJ;;_cY_PXgAAAAJ;jSwNd3MAAAAJ;;;gANaxT0AAAAJ;;https://scholar.google.com.hk/citations?user=_MtBmxkAAAAJ", "orcid": ";;;;;0000-0001-6801-0510;0000-0002-2332-3959;;;;0000-0003-2772-4511", "linkedin": "haobo-fu-382b0784/;;;tao-yang-0b023616b/;;https://www.linkedin.cn/incareer/in/ACoAAAvlU14B40ZWH1pxg5JJDtQ6LlgMYkp0e5s;;;;;", "or_profile": "~Haobo_Fu2;~Weiming_Liu3;~Yijia_Wang1;~Tao_Yang10;~Kai_Li2;~Junliang_Xing1;~Bin_Li8;~Bo_Ma4;~QIANG_FU8;~Yang_Wei2;~shuang_wu2", "aff": "Tencent AI Lab;University of Science and Technology of China;;;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences; University of Chinese Academy of Sciences;University of Science and Technology of China;;Tencent AI Lab;Tencent AI Lab;Tencent AI Lab", "aff_domain": "tencent.com;ustc.edu.cn;;;ia.ac.cn;ia.ac.cn;ustc.edu.cn;;tencent.com;tencent.com;tencent.com", "position": "Principal Researcher;PhD student;;;Associate Professor;Full Professor;Full Professor;;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nfu2022actorcritic,\ntitle={Actor-Critic Policy Optimization in a Large-Scale Imperfect-Information Game},\nauthor={Haobo Fu and Weiming Liu and Shuang Wu and Yijia Wang and Tao Yang and Kai Li and Junliang Xing and Bin Li and Bo Ma and QIANG FU and Yang Wei},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DTXZqTNV5nW}\n}", "github": "", "project": "", "reviewers": "8wVT;Mmjs;m118;SULh", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;5;5;4", "correctness": "4;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "60;34;53;98", "wc_summary_review": "74;55;83;484", "wc_main_review": "148;474;1506;2808", "wc_review": "282;563;1642;3390", "wc_reply_reviewers": "123;199;647;166", "wc_reply_authors": "1879;901;1373;883", "reply_reviewers": "2;1;2;1", "reply_authors": "6;2;3;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 61.25, 23.25268801665734 ], "wc_summary_review_avg": [ 174.0, 179.26377213480697 ], "wc_main_review_avg": [ 1234.0, 1037.841028289015 ], "wc_review_avg": [ 1469.25, 1219.6285038896065 ], "wc_reply_reviewers_avg": [ 283.75, 211.446654028859 ], "wc_reply_authors_avg": [ 1259.0, 408.3307482911371 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 3.25, 1.6393596310755 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.0, 
"gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15309575399018141241&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=DTXZqTNV5nW", "email": "tencent.com;ustc.edu.cn;;;ia.ac.cn;ia.ac.cn;ustc.edu.cn;;tencent.com;tencent.com;tencent.com", "author_num": 11, "aff_unique_index": "0;1;2;2;3;1;0;0;0", "aff_unique_norm": "Tencent;University of Science and Technology of China;Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_unique_dep": "Tencent AI Lab;;Institute of Automation;", "aff_unique_url": "https://ai.tencent.com;http://www.ustc.edu.cn;http://www.ia.cas.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "Tencent AI Lab;USTC;CAS;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "DTg98fkyoyn", "title": "Unsupervised Contrastive Learning for Signal-Dependent Noise Synthesis", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a simple yet robust noise synthesis framework based on unsupervised contrastive learning. With access to clean images only, the proposed contrastive noise synthesis framework trains a Glow-based generative model to synthesize image noise in a self-supervised fashion. We utilize the signal-dependency of the synthetic noise as a discriminative feature for the instance-wise discrimination pretext task and introduce a noise contrastive loss based on maximum mean discrepancy. The empirical results show that, with only 4312 parameters, the noise synthesized by the proposed framework shows advantages over the noise synthesized by traditional statistical models both qualitatively and quantitatively. The proposed framework fills a methodological gap in learning-based noise synthesis and can be used as an alternative to traditional statistical models.", "keywords": "Noise Synthesis;Self-Supervised Learning;Contrastive Learning", "primary_area": "", "supplementary_material": "", "author": "Nanqing Dong;Jianwen Xie;Ping Li", "authorids": "~Nanqing_Dong1;~Jianwen_Xie1;~Ping_Li3", "gender": ";;M", "homepage": ";;http://www.stat.rutgers.edu/home/pingli/", "dblp": "198/1455;;62/5860-1", "google_scholar": "0DX2YsQAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Nanqing_Dong1;~Jianwen_Xie1;~Ping_Li3", "aff": "University of Oxford;;LinkedIn", "aff_domain": "ox.ac.uk;;linkedin.com", "position": "PhD student;;Engineer", "bibtex": "@misc{\ndong2022unsupervised,\ntitle={Unsupervised Contrastive Learning for Signal-Dependent Noise Synthesis},\nauthor={Nanqing Dong and Jianwen Xie and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=DTg98fkyoyn}\n}", "github": "", "project": "", "reviewers": "a4bS;SqsX;yC7E", "site": "https://openreview.net/forum?id=DTg98fkyoyn", "pdf_size": 0, "recommendation": "1;5;5", "confidence": "4;3;3", "correctness": "1;3;3", "technical_novelty": "1;3;3", "empirical_novelty": "1;2;2", "wc_summary_paper": "57;54;28", "wc_summary_review": "100;50;19", "wc_main_review": "404;305;194", "wc_review": "561;409;241", "wc_reply_reviewers": "705;346;213", "wc_reply_authors": "1433;1277;1017", "reply_reviewers": "1;1;1", "reply_authors": "2;2;2", "recommendation_avg": [ 3.6666666666666665, 1.8856180831641267 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 
1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 46.333333333333336, 13.02134998974974 ], "wc_summary_review_avg": [ 56.333333333333336, 33.369979855486214 ], "wc_main_review_avg": [ 301.0, 85.77878525602937 ], "wc_review_avg": [ 403.6666666666667, 130.69387471832368 ], "wc_reply_reviewers_avg": [ 421.3333333333333, 207.80172173385753 ], "wc_reply_authors_avg": [ 1242.3333333333333, 171.59124556793554 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2HoUOPJpFp4J:scholar.google.com/&scioq=Unsupervised+Contrastive+Learning+for+Signal-Dependent+Noise+Synthesis&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Oxford;LinkedIn Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.linkedin.com", "aff_unique_abbr": "Oxford;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Learning meta-features for AutoML", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6788", "id": "DTkEfj0Ygb8", "poster": "", "openreview": "https://openreview.net/forum?id=DTkEfj0Ygb8", "slides": "https://iclr.cc/virtual/2022/poster/6788", "video": "https://iclr.cc/virtual/2022/poster/6788", "author_site": "Herilalaina Rakotoarison, Louisot Milijaona, Andry RASOANAIVO, Michele Sebag, Marc Schoenauer", "tldr": "", "abstract": "This paper tackles the AutoML problem, aimed at automatically selecting an ML algorithm and its hyper-parameter configuration most appropriate to the dataset at hand. The proposed approach, MetaBu, learns new meta-features via an Optimal Transport procedure, aligning the manually designed meta-features with the space of distributions on the hyper-parameter configurations. MetaBu meta-features, learned once and for all, induce a topology on the set of datasets that is exploited to define a distribution of promising hyper-parameter configurations amenable to AutoML. Experiments on the OpenML CC-18 benchmark demonstrate that using MetaBu meta-features boosts the performance of state-of-the-art AutoML systems, AutoSklearn (Feurer et al. 2015) and Probabilistic Matrix Factorization (Fusi et al. 2018). Furthermore, the inspection of MetaBu meta-features gives some hints about when an ML algorithm does well. Finally, the topology based on MetaBu meta-features makes it possible to estimate the intrinsic dimensionality of the OpenML benchmark w.r.t. a given ML algorithm or pipeline.
The source code is available at https://github.com/luxusg1/metabu.", "keywords": "AutoML;Meta-features;Hyper-parameter Optimization;Optimal Transport", "primary_area": "", "supplementary_material": "", "author": "Herilalaina Rakotoarison;Louisot Milijaona;Andry RASOANAIVO;Michele Sebag;Marc Schoenauer", "authorids": "~Herilalaina_Rakotoarison1;~Louisot_Milijaona1;~Andry_RASOANAIVO1;~Michele_Sebag1;~Marc_Schoenauer1", "gender": "M;M;;F;M", "homepage": "https://scholar.google.fr/citations?user=pyws4AQAAAAJ&hl=en;;http://rasoanaivo.misa-madagascar.com/;http://www.lri.fr/~sebag;http://www.lri.fr/~marc", "dblp": "242/7961;;;s/MicheleSebag;67/5235", "google_scholar": "https://scholar.google.fr/citations?user=pyws4AQAAAAJ;;4v_32vsAAAAJ;https://scholar.google.fr/citations?user=l-ys2jMAAAAJ;GrCk6WoAAAAJ", "orcid": ";;;;0000-0003-1450-6830", "linkedin": ";louisot-yvans-milijaona-15287713a;;;marc-schoenauer-521a6610/", "or_profile": "~Herilalaina_Rakotoarison1;~Louisot_Milijaona1;~Andry_RASOANAIVO1;~Michele_Sebag1;~Marc_Schoenauer1", "aff": "INRIA;MISA;MISA;CNRS, Universit\u00e9 Paris-Saclay;INRIA", "aff_domain": "inria.fr;misa-madagascar.com;misa-madagascar.com;lri.fr;inria.fr", "position": "PhD student;MS student;Lecturer;Senior Researcher CNRS;Principal Researcher", "bibtex": "@inproceedings{\nrakotoarison2022learning,\ntitle={Learning meta-features for Auto{ML}},\nauthor={Herilalaina Rakotoarison and Louisot Milijaona and Andry RASOANAIVO and Michele Sebag and Marc Schoenauer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DTkEfj0Ygb8}\n}", "github": "", "project": "", "reviewers": "cRCu;A4JL;paJ4;H82u;f2By", "pdf_size": 0, "recommendation": "5;6;6;8;8", "confidence": "2;3;3;4;4", "correctness": "3;3;3;4;3", "technical_novelty": "2;3;3;3;4", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "137;53;46;118;178", "wc_summary_review": "46;56;33;104;82", "wc_main_review": "112;544;167;1433;373", "wc_review": "295;653;246;1655;633", "wc_reply_reviewers": "0;0;0;0;160", "wc_reply_authors": "345;959;379;745;481", "reply_reviewers": "0;0;0;0;1", "reply_authors": "1;2;1;1;2", "recommendation_avg": [ 6.6, 1.2 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 106.4, 50.392856636630555 ], "wc_summary_review_avg": [ 64.2, 25.584370228715812 ], "wc_main_review_avg": [ 525.8, 478.88595719649163 ], "wc_review_avg": [ 696.4, 507.7005416581708 ], "wc_reply_reviewers_avg": [ 32.0, 64.0 ], "wc_reply_authors_avg": [ 581.8, 235.06458686922622 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9799578870122229, "corr_recommendation_correctness": 0.5833333333333334, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9378213080876956800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=DTkEfj0Ygb8", "email": "inria.fr;misa-madagascar.com;misa-madagascar.com;lri.fr;inria.fr", "author_num": 5, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "INRIA;MISA;CNRS", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;;https://www.cnrs.fr", "aff_unique_abbr": "INRIA;;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "France;" }, { "id": "DVSN9nJB1_", "title": "E-LANG: Energy-based Joint Inferencing of Super and Swift Language Models", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Building very large and highly capable language models has been a trend in the past several years. Despite their great performance, they incur a high computational cost. A common solution is to apply model compression or choose light-weight architectures, which often need a separate fixed-size model for each desirable computational budget, and may lose performance in case of heavy compression. This paper proposes an effective dynamic inference approach, which distributes the inference between large accurate Super-models and light-weight Swift models. To this end, a decision making module routes the incoming samples to one of the two models based on the energy characteristics of the representations in the latent space. The proposed approach is easily adoptable and architecture agnostic. As such, it can be applied to black-box pre-trained models without a need for architectural manipulations, careful reassembling of modules, or re-training. Unlike existing methods that are for the most part only applicable to encoder-only backbones and classification tasks, our method also works for encoder-decoder structures and sequence-to-sequence tasks such as translation. The performance of the proposed Energy-based joint inferencing of LANGuage models, E-LANG, is verified through an extensive set of experiments with T5 and BERT architectures on GLUE, SuperGLUE, and WMT benchmarks. In particular, we outperform T5-11B with an average computations speed-up of 3.3X on GLUE and 2.9X on SuperGLUE. We also achieve BERT-based SOTA (state-of-the-art) on GLUE with 3.2X less computations. 
Code is available in the supplementary materials.", "keywords": "energy-based models;dynamic inference;joint language models;super model optimization;NLP;BERT;T5", "primary_area": "", "supplementary_material": "/attachment/97795209435d1d717e145adf67e06f9f486632e7.zip", "author": "Mohammad Akbari;Amin Banitalebi-Dehkordi;Yong Zhang", "authorids": "~Mohammad_Akbari3;~Amin_Banitalebi-Dehkordi1;~Yong_Zhang2", "gender": "M;;M", "homepage": ";;https://sites.google.com/site/yongzhangai", "dblp": ";;66/4615-4", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=fSz1PtYAAAAJ;K2zamrwAAAAJ", "orcid": ";;0000-0002-0238-0719", "linkedin": ";;yong-zhang-ai/", "or_profile": "~Mohammad_Akbari3;~Amin_Banitalebi-Dehkordi1;~Yong_Zhang2", "aff": "Huawei Technologies Ltd.;Huawei Technologies Canada Co., Ltd.;Huawei Technologies Ltd., Canada", "aff_domain": "huawei.com;huawei.com;huawei.com", "position": "Researcher;Principal Researcher;Distinguished Researcher", "bibtex": "@misc{\nakbari2022elang,\ntitle={E-{LANG}: Energy-based Joint Inferencing of Super and Swift Language Models},\nauthor={Mohammad Akbari and Amin Banitalebi-Dehkordi and Yong Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=DVSN9nJB1_}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=DVSN9nJB1_", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3848088173937684382&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;Canada" }, { "title": "The Rich Get Richer: Disparate Impact of Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6926", "id": "DXPftn5kjQK", "poster": "", "openreview": "https://openreview.net/forum?id=DXPftn5kjQK", "slides": "https://iclr.cc/virtual/2022/poster/6926", "video": "https://iclr.cc/virtual/2022/poster/6926", "author_site": "Zhaowei Zhu, Tianyi Luo, Yang Liu", "tldr": "", "abstract": "Semi-supervised learning (SSL) has demonstrated its potential to improve the model accuracy for a variety of learning tasks when the high-quality supervised data is severely limited. Although it is often established that the average accuracy for the entire population of data is improved, it is unclear how SSL fares with different sub-populations. 
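A concrete way to surface the per-subgroup behaviour this question asks about is to compare baseline and SSL accuracy within each sub-population. The diagnostic below is a generic sketch (the paper's Benefit Ratio, introduced later in the abstract, normalizes such gains; see the paper for its exact definition):

```python
import numpy as np

def per_group_ssl_gains(y_true, pred_base, pred_ssl, groups):
    """Report baseline vs. SSL accuracy and the raw gain for each
    sub-population (generic diagnostic, not the paper's metric)."""
    report = {}
    for g in np.unique(groups):
        m = groups == g
        acc_base = float(np.mean(pred_base[m] == y_true[m]))
        acc_ssl = float(np.mean(pred_ssl[m] == y_true[m]))
        report[g] = {"base": acc_base, "ssl": acc_ssl, "gain": acc_ssl - acc_base}
    return report
```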
Understanding the above question has substantial fairness implications when different sub-populations are defined by the demographic groups that we aim to treat fairly. In this paper, we reveal the disparate impacts of deploying SSL: the sub-population that has a higher baseline accuracy without using SSL (the \"rich\" one) tends to benefit more from SSL, while the sub-population that suffers from a low baseline accuracy (the \"poor\" one) might even observe a performance drop after adding the SSL module. We theoretically and empirically establish the above observation for a broad family of SSL algorithms, which either explicitly or implicitly use an auxiliary \"pseudo-label\". Experiments on a set of image and text classification tasks confirm our claims. We introduce a new metric, Benefit Ratio, and promote the evaluation of the fairness of SSL (Equalized Benefit Ratio). We further discuss how the disparate impact can be mitigated. We hope our paper will raise awareness of the potential pitfalls of using SSL and encourage a multifaceted evaluation of future SSL algorithms. ", "keywords": "semi-supervised learning;fairness;disparate impact;Matthew effect;consistency regularization", "primary_area": "", "supplementary_material": "/attachment/e12dbf1d09d5d2b1202eb9145d191aceec0b1476.zip", "author": "Zhaowei Zhu;Tianyi Luo;Yang Liu", "authorids": "~Zhaowei_Zhu1;~Tianyi_Luo1;~Yang_Liu3", "gender": "M;M;M", "homepage": "https://www.zzw.ai;;http://www.yliuu.com", "dblp": "202/1712;;51/3710-18", "google_scholar": "YS8pSQoAAAAJ;;jKrIVCIAAAAJ", "orcid": "0000-0003-3894-5862;;0000-0001-8420-6011", "linkedin": ";tianyi-luo-a7ba1096/;", "or_profile": "~Zhaowei_Zhu1;~Tianyi_Luo1;~Yang_Liu3", "aff": "University of California, Santa Cruz;Amazon Science;University of California, Santa Cruz", "aff_domain": "ucsc.edu;amazon.com;ucsc.edu", "position": "PhD student;Applied Scientist;Assistant Professor", "bibtex": "@inproceedings{\nzhu2022the,\ntitle={The Rich Get Richer: Disparate Impact of Semi-Supervised Learning},\nauthor={Zhaowei Zhu and Tianyi Luo and Yang Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DXPftn5kjQK}\n}", "github": "", "project": "", "reviewers": "x3NJ;fRGJ;FRon;fzF9", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "88;4;87;71", "wc_summary_review": "34;25;39;20", "wc_main_review": "559;655;490;123", "wc_review": "681;684;616;214", "wc_reply_reviewers": "286;71;37;0", "wc_reply_authors": "3017;1081;1319;284", "reply_reviewers": "2;1;1;0", "reply_authors": "6;3;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.5, 34.44198019858905 ], "wc_summary_review_avg": [ 29.5, 7.433034373659253 ], "wc_main_review_avg": [ 456.75, 201.40304739501832 ], "wc_review_avg": [ 548.75, 195.16835681021655 ], "wc_reply_reviewers_avg": [ 98.5, 111.12717939370188 ], "wc_reply_authors_avg": [ 1425.25, 995.7304793466955 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.25, 1.7853571071357126 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 41, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=7060479972986139346&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=DXPftn5kjQK", "email": "ucsc.edu;amazon.com;ucsc.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Santa Cruz;Amazon", "aff_unique_dep": ";Amazon Science", "aff_unique_url": "https://www.ucsc.edu;https://www.amazon.science", "aff_unique_abbr": "UCSC;Amazon Science", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Cruz;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "DXRwVRh4i8g", "title": "Reachability Traces for Curriculum Design in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The objective in goal-based reinforcement learning is to learn a policy to reach a particular goal state within the environment. However, the underlying reward function may be too sparse for the agent to efficiently learn useful behaviors. Recent studies have demonstrated that reward sparsity can be overcome by instead learning a curriculum of simpler subtasks. In this work, we design an agent's curriculum by focusing on the aspect of goal reachability, and introduce the idea of a reachability trace, which is used as a basis to determine a sequence of intermediate subgoals to guide the agent towards its primary goal. We discuss several properties of the trace function, and in addition, validate our proposed approach empirically in a range of environments, while comparing its performance against appropriate baselines.", "keywords": "reinforcement learning;curriculum learning;sparse rewards", "primary_area": "", "supplementary_material": "/attachment/e97dd92013927f99e00fb25479947b0c62903bb8.zip", "author": "Thommen Karimpanal George;Majid Abdolshah;Hung Le;Santu Rana;Sunil Gupta;Truyen Tran;Svetha Venkatesh", "authorids": "~Thommen_Karimpanal_George1;~Majid_Abdolshah1;~Hung_Le1;~Santu_Rana1;~Sunil_Gupta2;~Truyen_Tran1;~Svetha_Venkatesh1", "gender": "M;M;M;M;F;M;M", "homepage": "https://www.thommengk.com/;http://majid.website;;http://truyentran.github.io;https://www.deakin.edu.au/about-deakin/people/svetha-venkatesh;https://thaihungle.github.io/;https://personal-sites.deakin.edu.au/~sunilg/", "dblp": "133/3358;190/6649;57/6712;55/2269;81/1984;45/466-2;47/333-1", "google_scholar": "v3-hy24AAAAJ;https://scholar.google.com.au/citations?user=RKC-MCUAAAAJ;S9PwnMYAAAAJ;https://scholar.google.com.au/citations?user=zvspVLwAAAAJ;AEkRUQcAAAAJ;https://scholar.google.com.au/citations?user=q2HbxngAAAAJ;https://scholar.google.com.au/citations?user=bXeL2t8AAAAJ", "orcid": "0000-0001-8918-3314;;0000-0003-2247-850X;0000-0001-6531-8907;;0000-0002-3126-184X;0000-0002-3308-1930", "linkedin": "thommen-george-karimpanal-762451149/;;santur/;truyen-tran;;;", "or_profile": "~Thommen_Karimpanal_George1;~Majid_Abdolshah1;~Santu_Rana1;~Truyen_Tran1;~Svetha_Venkatesh1;~Hung_Thai_Le1;~Sunil_Kumar_Gupta1", "aff": "Deakin University;Amazon;Deakin University;Deakin University, Australia;Deakin University;Deakin University;Deakin University", "aff_domain": "deakin.edu.au;amazon.com;deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au", "position": "Postdoc;Machine Learning Scientist;Associate Professor;Associate Professor;Full Professor;Lecturer;Associate Professor", "bibtex": "@misc{\ngeorge2022reachability,\ntitle={Reachability Traces for Curriculum Design in Reinforcement Learning},\nauthor={Thommen Karimpanal George and Majid Abdolshah 
and Hung Le and Santu Rana and Sunil Gupta and Truyen Tran and Svetha Venkatesh},\nyear={2022},\nurl={https://openreview.net/forum?id=DXRwVRh4i8g}\n}", "github": "", "project": "", "reviewers": "psX9;8Hsg;WVWS;zxez", "site": "https://openreview.net/forum?id=DXRwVRh4i8g", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;5", "correctness": "3;1;2;3", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "57;113;95;114", "wc_summary_review": "26;301;22;51", "wc_main_review": "310;767;570;886", "wc_review": "393;1181;687;1051", "wc_reply_reviewers": "75;0;0;80", "wc_reply_authors": "399;603;712;661", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 94.75, 23.069189409253198 ], "wc_summary_review_avg": [ 100.0, 116.57829986751393 ], "wc_main_review_avg": [ 633.25, 218.09559257353186 ], "wc_review_avg": [ 828.0, 309.6142761566398 ], "wc_reply_reviewers_avg": [ 38.75, 38.79030162295725 ], "wc_reply_authors_avg": [ 593.75, 118.868362064933 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bvTKjOFQpv4J:scholar.google.com/&scioq=Reachability+Traces+for+Curriculum+Design+in+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0;0;0", "aff_unique_norm": "Deakin University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.deakin.edu.au;https://www.amazon.com", "aff_unique_abbr": "Deakin;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "Australia;United States" }, { "id": "DXU0DQUDWLA", "title": "Disentangling One Factor at a Time", "track": "main", "status": "Reject", "tldr": "", "abstract": "With the overabundance of data for machines to process in the current state of machine learning, data discovery, organization, and interpretation of the data become a critical need. Specifically needed are unsupervised methods that do not require laborious labeling by human observers. One promising approach to this endeavour is \\textit{Disentanglement}, which aims at learning the underlying generative latent factors of the data. The factors should also be as human-interpretable as possible for the purposes of data discovery. \\textit{Unsupervised disentanglement} is a particularly difficult open subset of the problem, which asks the network to learn on its own the generative factors without any link to the true labels. This problem area is currently dominated by two approaches: Variational Autoencoder and Generative Adversarial Network approaches. While GANs have good performance, they suffer from difficulty in training and mode collapse, and while VAEs are stable to train, they do not perform as well as GANs in terms of interpretability. In current state-of-the-art versions of these approaches, the networks require the user to specify the number of factors that we expect to find in the data.
This limitation prevents \"true\" disentanglement, in the sense that learning how many factors there are is actually one of the tasks we wish the network to solve. In this work we propose a novel network for unsupervised disentanglement that combines the stable training of the VAE with the interpretability offered by GANs without the training instabilities. We aim to disentangle interpretable latent factors \"one at a time\", or OAT factor learning, making no prior assumptions about the number or distribution of factors, in a completely unsupervised manner. We demonstrate its quantitative and qualitative effectiveness by evaluating the latent representations learned on two benchmark datasets, DSprites and CelebA. ", "keywords": "unsupervised representation learning;disentanglement;Variational Autoencoders;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Vaishnavi S Patil;Matthew S Evanusa;Joseph JaJa", "authorids": "~Vaishnavi_S_Patil1;mevanusa@cs.umd.edu;~Joseph_JaJa1", "gender": "F;;M", "homepage": "https://www.cs.umd.edu/people/vspatil;;http://users.umiacs.umd.edu/~josephj/", "dblp": ";;j/JosephJaJa", "google_scholar": ";;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Vaishnavi_S_Patil1;mevanusa@cs.umd.edu;~Joseph_JaJa1", "aff": "University of Maryland, College Park;;University of Maryland, College Park", "aff_domain": "umd.edu;;umd.edu", "position": "PhD student;;Full Professor", "bibtex": "@misc{\npatil2022disentangling,\ntitle={Disentangling One Factor at a Time},\nauthor={Vaishnavi S Patil and Matthew S Evanusa and Joseph JaJa},\nyear={2022},\nurl={https://openreview.net/forum?id=DXU0DQUDWLA}\n}", "github": "", "project": "", "reviewers": "E94u;EBrM;roR5;Uqhc", "site": "https://openreview.net/forum?id=DXU0DQUDWLA", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;2;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "74;158;54;94", "wc_summary_review": "90;42;74;48", "wc_main_review": "500;388;176;449", "wc_review": "664;588;304;591", "wc_reply_reviewers": "0;40;0;135", "wc_reply_authors": "912;1370;764;807", "reply_reviewers": "0;1;0;2", "reply_authors": "2;2;1;2", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.0, 39.02563260217571 ], "wc_summary_review_avg": [ 63.5, 19.461500456028563 ], "wc_main_review_avg": [ 378.25, 123.31742577592188 ], "wc_review_avg": [ 536.75, 137.78130315830228 ], "wc_reply_reviewers_avg": [ 43.75, 55.15602868227552 ], "wc_reply_authors_avg": [ 963.25, 240.92880172366276 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GsLa_strIzwJ:scholar.google.com/&scioq=Disentangling+One+Factor+at+a+Time&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www.umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park",
"aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "DYaFB19z1ig", "title": "Self-Distribution Distillation: Efficient Uncertainty Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning is increasingly being applied in safety-critical domains. For these scenarios it is important to know the level of uncertainty in a model\u2019s prediction to ensure that appropriate decisions are made by a system. Deep ensembles are the de-facto standard approach to obtaining various measures of uncertainty. However, ensembles normally significantly increase the resources required in both the training and deployment phases. Approaches have been developed that typically address the costs in one of these phases. In this work we propose a novel training approach, self-distribution distillation (S2D), which is able to efficiently, both in time and memory, train a single model that can estimate uncertainties in an integrated training phase. Furthermore it is possible to build ensembles of these models and apply ensemble distillation approaches, hierarchical distribution distillation, in cases where one is less limited by computational resources in the training phase, but still requires efficiency in the deployment phase. Experiments on CIFAR-100 showed that S2D models outperformed standard models and Monte-Carlo dropout. Additional out-of-distribution detection experiments on LSUN, Tiny ImageNet, SVHN showed that even a standard deep ensemble can be outperformed using S2D based ensembles and novel distilled models.\n", "keywords": "distillation;self-distillation;distribution distillation;uncertainty;robustness", "primary_area": "", "supplementary_material": "", "author": "Yassir Fathullah;Mark Gales", "authorids": "~Yassir_Fathullah1;~Mark_Gales1", "gender": "M;M", "homepage": ";http://mi.eng.cam.ac.uk/~mjfg/index.html", "dblp": "254/3044;74/4419.html", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?hl=en", "orcid": ";", "linkedin": "yassir-fathullah/;", "or_profile": "~Yassir_Fathullah1;~Mark_Gales1", "aff": "University of Cambridge;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk", "position": "PhD student;Full Professor", "bibtex": "@misc{\nfathullah2022selfdistribution,\ntitle={Self-Distribution Distillation: Efficient Uncertainty Estimation},\nauthor={Yassir Fathullah and Mark Gales},\nyear={2022},\nurl={https://openreview.net/forum?id=DYaFB19z1ig}\n}", "github": "", "project": "", "reviewers": "Wumr;MF93;EMdF;MHeN", "site": "https://openreview.net/forum?id=DYaFB19z1ig", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;3;5;3", "correctness": "4;2;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "67;51;105;77", "wc_summary_review": "51;58;45;46", "wc_main_review": "94;442;280;272", "wc_review": "212;551;430;395", "wc_reply_reviewers": "0;215;131;0", "wc_reply_authors": "68;629;241;216", "reply_reviewers": "0;3;1;0", "reply_authors": "1;3;2;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 75.0, 19.6468827043885 ], "wc_summary_review_avg": [ 50.0, 5.1478150704935 ], "wc_main_review_avg": [ 272.0, 123.13407326974935 ], "wc_review_avg": [ 397.0, 121.48456692107027 ], "wc_reply_reviewers_avg": [ 86.5, 91.45627370497881 ], 
"wc_reply_authors_avg": [ 288.5, 207.40841352269197 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13134736411002287454&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Inverse Online Learning: Understanding Non-Stationary and Reactionary Policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7211", "id": "DYypjaRdph2", "poster": "", "openreview": "https://openreview.net/forum?id=DYypjaRdph2", "slides": "https://iclr.cc/virtual/2022/poster/7211", "video": "https://iclr.cc/virtual/2022/poster/7211", "author_site": "Alex Chan, Alicia Curth, Mihaela van der Schaar", "tldr": "", "abstract": "Human decision making is well known to be imperfect and the ability to analyse such processes individually is crucial when attempting to aid or improve a decision-maker's ability to perform a task, e.g. to alert them to potential biases or oversights on their part. To do so, it is necessary to develop interpretable representations of how agents make decisions and how this process changes over time as the agent learns online in reaction to the accrued experience. To then understand the decision-making processes underlying a set of observed trajectories, we cast the policy inference problem as the inverse to this online learning problem. By interpreting actions within a potential outcomes framework, we introduce a meaningful mapping based on agents choosing an action they believe to have the greatest treatment effect. We introduce a practical algorithm for retrospectively estimating such perceived effects, alongside the process through which agents update them, using a novel architecture built upon an expressive family of deep state-space models. Through application to the analysis of UNOS organ donation acceptance decisions, we demonstrate that our approach can bring valuable insights into the factors that govern decision processes and how they change over time. 
", "keywords": "Decision Modelling;Imitation Learning;Inverse Online Learning", "primary_area": "", "supplementary_material": "", "author": "Alex Chan;Alicia Curth;Mihaela van der Schaar", "authorids": "~Alex_Chan2;~Alicia_Curth1;~Mihaela_van_der_Schaar2", "gender": "F;F;M", "homepage": ";https://www.vanderschaar-lab.com;https://alexjchan.com", "dblp": "261/8064;;268/6948", "google_scholar": "eWRBqsYAAAAJ;DZ3S--MAAAAJ;yfy_BGIAAAAJ", "orcid": ";;", "linkedin": ";;alex-chan-040081131/", "or_profile": "~Alicia_Curth1;~Mihaela_van_der_Schaar2;~Alex_James_Chan1", "aff": "University of Cambridge;University of California, Los Angeles;University of Cambridge", "aff_domain": "cam.ac.uk;ucla.edu;cam.ac.uk", "position": "PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\nchan2022inverse,\ntitle={Inverse Online Learning: Understanding Non-Stationary and Reactionary Policies},\nauthor={Alex Chan and Alicia Curth and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DYypjaRdph2}\n}", "github": "", "project": "", "reviewers": "S6Ms;vs7Z;F1MZ", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;3", "correctness": "3;2;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "300;124;97", "wc_summary_review": "48;68;13", "wc_main_review": "457;352;280", "wc_review": "805;544;390", "wc_reply_reviewers": "0;0;18", "wc_reply_authors": "586;831;464", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 173.66666666666666, 90.00864156043883 ], "wc_summary_review_avg": [ 43.0, 22.73030282830976 ], "wc_main_review_avg": [ 363.0, 72.6773692424265 ], "wc_review_avg": [ 579.6666666666666, 171.2898777576253 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 627.0, 152.60624714167722 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11445017493131489229&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=DYypjaRdph2", "email": "cam.ac.uk;ucla.edu;cam.ac.uk", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;UCLA", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Cambridge;Los Angeles", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "Da3ZcbjRWy", "title": "Self-Supervised Representation Learning via Latent Graph Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Self-supervised learning (SSL) of graph neural networks is emerging as a promising way of leveraging unlabeled data. 
Currently, most methods are based on contrastive learning adapted from the image domain, which requires view generation and a sufficient number of negative samples. In contrast, existing predictive models do not require negative sampling, but lack theoretical guidance on the design of pretext training tasks. In this work, we propose LaGraph, a theoretically grounded predictive SSL framework based on latent graph prediction. The learning objectives of LaGraph are derived as self-supervised upper bounds to objectives for predicting unobserved latent graphs. In addition to its improved performance, LaGraph provides explanations for recent successes of predictive models that include invariance-based objectives. We provide theoretical analysis comparing LaGraph to related methods in different domains. Our experimental results demonstrate the superiority of LaGraph in performance and its robustness to decreasing training sample sizes on both graph-level and node-level tasks.", "keywords": "Self-supervised learning;representation learning;graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Yaochen Xie;Zhao Xu;Shuiwang Ji", "authorids": "~Yaochen_Xie1;~Zhao_Xu3;~Shuiwang_Ji1", "gender": "M;F;M", "homepage": "https://ycremar.github.io/;;http://people.tamu.edu/~sji", "dblp": "227/7154;96/5046-5.html;84/6405", "google_scholar": "Xw3ZjnMAAAAJ;L5UBY3wAAAAJ;BZGj6sAAAAAJ", "orcid": ";;0000-0002-4205-4563", "linkedin": ";zhao-xu-9170b0b8/;shuiwang-ji-9a040715/", "or_profile": "~Yaochen_Xie1;~Zhao_Xu3;~Shuiwang_Ji1", "aff": "Texas A&M;Texas A&M University - College Station;Texas A&M University", "aff_domain": "tamu.edu;tamu.edu;tamu.edu", "position": "PhD student;PhD student;Professor", "bibtex": "@misc{\nxie2022selfsupervised,\ntitle={Self-Supervised Representation Learning via Latent Graph Prediction},\nauthor={Yaochen Xie and Zhao Xu and Shuiwang Ji},\nyear={2022},\nurl={https://openreview.net/forum?id=Da3ZcbjRWy}\n}", "github": "", "project": "", "reviewers": "e11u;t1HQ;X3u8;Fi7g", "site": "https://openreview.net/forum?id=Da3ZcbjRWy", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;3;3", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "50;74;51;88", "wc_summary_review": "79;102;28;49", "wc_main_review": "256;332;193;1737", "wc_review": "385;508;272;1874", "wc_reply_reviewers": "0;82;74;406", "wc_reply_authors": "950;1633;1103;3774", "reply_reviewers": "0;1;1;2", "reply_authors": "4;4;4;10", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.75, 16.037066439969625 ], "wc_summary_review_avg": [ 64.5, 28.235615806991 ], "wc_main_review_avg": [ 629.5, 641.306673908825 ], "wc_review_avg": [ 759.75, 648.7042373069564 ], "wc_reply_reviewers_avg": [ 140.5, 156.58464164789598 ], "wc_reply_authors_avg": [ 1865.0, 1130.9259480620294 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 5.5, 2.598076211353316 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15436923059083544697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Texas A&M University",
"aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "DaQVj6qY2-s", "title": "Understanding Graph Learning with Local Intrinsic Dimensionality", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many real-world problems can be formulated as graphs and solved by graph learning techniques. Whilst the rise of Graph Neural Networks (GNNs) has greatly advanced graph learning, there is still a lack of understanding of the intrinsic properties of graph data and their impact on graph learning. In this paper, we narrow the gap by studying the intrinsic dimension of graphs with \\emph{Local Intrinsic Dimensionality (LID)}. The LID of a graph measures the expansion rate of the graph as the local neighborhood size of the nodes grows.\nWith LID, we estimate and analyze the intrinsic dimensions of node features, graph structure and representations learned by GNNs. We first show that feature LID (FLID) and structure LID (SLID) are well correlated with the complexity of synthetic graphs. Following this, we conduct a comprehensive analysis of 12 popular graph datasets of diverse categories and show that 1) graphs of lower FLIDs and SLIDs are generally easier to learn; 2) GNNs learn by mapping graphs (feature and structure together) to low-dimensional manifolds that are of much lower representation LIDs (RLIDs), i.e., RLID $\\ll$ FLID/SLID; and 3) when the layers go deep in message-passing based GNNs, the underlying graph will converge to a complete graph of $\\operatorname{SLID}=0.5$, losing structural information and causing the over-smoothing problem.", "keywords": "Local Intrinsic Dimensionality;Graph Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Xiaojun Guo;Xingjun Ma;Yisen Wang", "authorids": "~Xiaojun_Guo1;~Xingjun_Ma1;~Yisen_Wang1", "gender": "F;M;M", "homepage": "https://zero-lab-pku.github.io/personwise/guoxiaojun/;http://xingjunma.com/;https://yisenwang.github.io/", "dblp": ";195/8270;172/1346-1", "google_scholar": ";https://scholar.google.com.au/citations?user=XQViiyYAAAAJ;uMWPDboAAAAJ", "orcid": ";;", "linkedin": ";xingjun-ma-173532129/;", "or_profile": "~Xiaojun_Guo1;~Xingjun_Ma1;~Yisen_Wang1", "aff": "Peking University;Deakin University;Peking University", "aff_domain": "pku.edu.cn;deakin.edu.au;pku.edu.cn", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nguo2022understanding,\ntitle={Understanding Graph Learning with Local Intrinsic Dimensionality},\nauthor={Xiaojun Guo and Xingjun Ma and Yisen Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=DaQVj6qY2-s}\n}", "github": "", "project": "", "reviewers": "XMPe;5d9E;pEhv", "site": "https://openreview.net/forum?id=DaQVj6qY2-s", "pdf_size": 0, "recommendation": "1;5;6", "confidence": "5;3;4", "correctness": "1;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "0;3;4", "wc_summary_paper": "16;74;71", "wc_summary_review": "15;16;92", "wc_main_review": "460;111;271", "wc_review": "491;201;434", "wc_reply_reviewers": "290;0;0", "wc_reply_authors": "1906;391;386", "reply_reviewers": "1;0;0", "reply_authors": "4;1;1", "recommendation_avg": [ 4.0, 2.160246899469287 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 1.4142135623730951 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 
2.3333333333333335, 1.699673171197595 ], "wc_summary_paper_avg": [ 53.666666666666664, 26.662499674428293 ], "wc_summary_review_avg": [ 41.0, 36.0647565729573 ], "wc_main_review_avg": [ 280.6666666666667, 142.64252132126973 ], "wc_review_avg": [ 375.3333333333333, 125.44941432926483 ], "wc_reply_reviewers_avg": [ 96.66666666666667, 136.7073110293992 ], "wc_reply_authors_avg": [ 894.3333333333334, 715.3592725958677 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7559289460184544, "corr_recommendation_correctness": 0.9819805060619657, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FuplKpDIyGEJ:scholar.google.com/&scioq=Understanding+Graph+Learning+with+Local+Intrinsic+Dimensionality&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Peking University;Deakin University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.deakin.edu.au", "aff_unique_abbr": "Peking U;Deakin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Australia" }, { "title": "Transferable Adversarial Attack based on Integrated Gradients", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6584", "id": "DesNW4-5ai9", "poster": "", "openreview": "https://openreview.net/forum?id=DesNW4-5ai9", "slides": "https://iclr.cc/virtual/2022/poster/6584", "video": "https://iclr.cc/virtual/2022/poster/6584", "author_site": "Yi Huang, Adams Kong", "tldr": "", "abstract": "The vulnerability of deep neural networks to adversarial examples has drawn tremendous attention from the community. Three approaches, optimizing standard objective functions, exploiting attention maps, and smoothing decision surfaces, are commonly used to craft adversarial examples. By tightly integrating the three approaches, we propose a new and simple algorithm named Transferable Attack based on Integrated Gradients (TAIG) in this paper, which can find highly transferable adversarial examples for black-box attacks. Unlike previous methods using multiple computational terms or combining with other methods, TAIG integrates the three approaches into one single term. Two versions of TAIG that compute their integrated gradients on a straight-line path and a random piecewise linear path are studied. Both versions offer strong transferability and can seamlessly work together with the previous methods. 
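For reference, the integrated gradients at the core of TAIG are usually approximated by a Riemann sum along the straight-line path from a baseline to the input. The helper below is a generic sketch (the `grad_fn` callable, the midpoint rule, and the step count are assumptions); TAIG's exact attack update and its random piecewise-linear variant are described in the paper:

```python
import numpy as np

def integrated_gradients(grad_fn, x, baseline=None, steps=32):
    """Riemann-sum approximation of integrated gradients along the straight
    line from `baseline` to `x`; grad_fn(x) must return the gradient of the
    attacked model's output with respect to its input."""
    baseline = np.zeros_like(x) if baseline is None else baseline
    alphas = (np.arange(steps) + 0.5) / steps  # midpoints of [0, 1]
    grads = np.stack([grad_fn(baseline + a * (x - baseline)) for a in alphas])
    return (x - baseline) * grads.mean(axis=0)
```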
Experimental results demonstrate that TAIG outperforms the state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Huang;Adams Wai-Kin Kong", "authorids": "~Yi_Huang2;~Adams_Wai-Kin_Kong1", "gender": "F;M", "homepage": "https://github.com/yihuang2016;https://personal.ntu.edu.sg/AdamsKong/", "dblp": "15/6040;16/3792", "google_scholar": ";2GfXvbUAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yi_Huang2;~Adams_Wai-Kin_Kong1", "aff": "National Technological University;Nanyang Technological University", "aff_domain": "ntu.edu;ntu.edu.sg", "position": "Postdoc;Associate Professor", "bibtex": "@inproceedings{\nhuang2022transferable,\ntitle={Transferable Adversarial Attack based on Integrated Gradients},\nauthor={Yi Huang and Adams Wai-Kin Kong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DesNW4-5ai9}\n}", "github": "", "project": "", "reviewers": "xf97;bFo6;N6RH;cPsc", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;0;3;4", "wc_summary_paper": "66;37;72;54", "wc_summary_review": "48;27;27;12", "wc_main_review": "507;198;211;75", "wc_review": "621;262;310;141", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "2813;2255;1797;909", "reply_reviewers": "0;0;0;0", "reply_authors": "5;4;3;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 57.25, 13.36740438529485 ], "wc_summary_review_avg": [ 28.5, 12.816005617976296 ], "wc_main_review_avg": [ 247.75, 158.80707635366883 ], "wc_review_avg": [ 333.5, 177.0430738549238 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1943.5, 697.2651934522474 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12897064558581398673&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=DesNW4-5ai9", "email": "ntu.edu;ntu.edu.sg", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "National Technological University;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu;https://www.ntu.edu.sg", "aff_unique_abbr": "NTU;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Singapore" }, { "title": "Interpretable Unsupervised Diversity Denoising and Artefact Removal", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5977", "id": "DfMqlB0PXjM", "poster": "", "openreview": "https://openreview.net/forum?id=DfMqlB0PXjM", "slides": "https://iclr.cc/virtual/2022/poster/5977", "video": "https://iclr.cc/virtual/2022/poster/5977", "author_site": "Mangal Prakash, Mauricio Delbracio, Peyman Milanfar, Florian Jug", "tldr": "", "abstract": "Image denoising and artefact removal are complex inverse problems admitting multiple valid solutions. 
Unsupervised diversity restoration, that is, obtaining a diverse set of possible restorations given a corrupted image, is important for ambiguity removal in many applications such as microscopy where paired data for supervised training are often unobtainable. In real world applications, imaging noise and artefacts are typically hard to model, leading to unsatisfactory performance of existing unsupervised approaches. This work presents an interpretable approach for unsupervised and diverse image restoration. To this end, we introduce a capable architecture called Hierarchical DivNoising (HDN) based on hierarchical Variational Autoencoder. We show that HDN learns an interpretable multi-scale representation of artefacts and we leverage this interpretability to remove imaging artefacts commonly occurring in microscopy data. Our method achieves state-of-the-art results on twelve benchmark image denoising datasets while providing access to a whole distribution of sensibly restored solutions.\nAdditionally, we demonstrate on three real microscopy datasets that HDN removes artefacts without supervision, being the first method capable of doing so while generating multiple plausible restorations all consistent with the given corrupted image.", "keywords": "Interpretable Unsupervised Image Restoration;Diversity Image Restoration;Unsupervised Image Denoising;Unsupervised Artefact Removal", "primary_area": "", "supplementary_material": "", "author": "Mangal Prakash;Mauricio Delbracio;Peyman Milanfar;Florian Jug", "authorids": "~Mangal_Prakash1;~Mauricio_Delbracio1;~Peyman_Milanfar1;~Florian_Jug1", "gender": "M;M;M;M", "homepage": ";;http://www.milanfar.org;http://www.humantechnopole.it/jug", "dblp": "156/0559;90/10811;48/6882;05/1828", "google_scholar": ";lDDm920AAAAJ;iGzDl8IAAAAJ;https://scholar.google.de/citations?user=vYInKgwAAAAJ", "orcid": ";;;0000-0002-8499-5812", "linkedin": "mangalprakash/;;;https://de.linkedin.com/in/florianjug", "or_profile": "~Mangal_Prakash1;~Mauricio_Delbracio1;~Peyman_Milanfar1;~Florian_Jug1", "aff": "Exscientia;Google;Google;Fondazione Human Technopole", "aff_domain": "exscientia.co.uk;google.com;google.com;fht.org", "position": "Researcher;Research Scientist;Distinguished Scientist;Associate Professor", "bibtex": "@inproceedings{\nprakash2022interpretable,\ntitle={Interpretable Unsupervised Diversity Denoising and Artefact Removal},\nauthor={Mangal Prakash and Mauricio Delbracio and Peyman Milanfar and Florian Jug},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DfMqlB0PXjM}\n}", "github": "", "project": "", "reviewers": "iaBg;5Ptv;3vAh;kqm1", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;4", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "107;110;83;26", "wc_summary_review": "67;79;22;40", "wc_main_review": "368;445;357;164", "wc_review": "542;634;462;230", "wc_reply_reviewers": "67;63;19;7", "wc_reply_authors": "1057;359;358;212", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 33.70830758136635 ], "wc_summary_review_avg": [ 52.0, 22.34949663862701 ], "wc_main_review_avg": [ 333.5, 103.56761076707332 ], "wc_review_avg": [ 467.0, 149.75646897546696 ], 
"wc_reply_reviewers_avg": [ 39.0, 26.38181191654584 ], "wc_reply_authors_avg": [ 496.5, 329.0854752188252 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5658559510120248446&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=DfMqlB0PXjM", "email": "exscientia.co.uk;google.com;google.com;fht.org", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Exscientia;Google;Fondazione Human Technopole", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.exscientia.co.uk;https://www.google.com;https://www.humantechnopole.it", "aff_unique_abbr": ";Google;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "United Kingdom;United States;Italy" }, { "title": "Information Prioritization through Empowerment in Visual Model-based RL", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6014", "id": "DfUjyyRW90", "poster": "", "openreview": "https://openreview.net/forum?id=DfUjyyRW90", "slides": "https://iclr.cc/virtual/2022/poster/6014", "video": "https://iclr.cc/virtual/2022/poster/6014", "author_site": "Homanga Bharadhwaj, Mohammad Babaeizadeh, Dumitru Erhan, Sergey Levine", "tldr": "", "abstract": "Model-based reinforcement learning (RL) algorithms designed for handling complex visual observations typically learn some sort of latent state representation, either explicitly or implicitly. Standard methods of this sort do not distinguish between functionally relevant aspects of the state and irrelevant distractors, instead aiming to represent all available information equally. We propose a modified objective for model-based RL that, in combination with mutual information maximization, allows us to learn representations and dynamics for visual model-based RL without reconstruction in a way that explicitly prioritizes functionally relevant factors. The key principle behind our design is to integrate a term inspired by variational empowerment into a state-space learning model based on mutual information. This term prioritizes information that is correlated with action, thus ensuring that functionally relevant factors are captured first. Furthermore, the same empowerment term also promotes faster exploration during the RL process, especially for sparse-reward tasks where the reward signal is insufficient to drive exploration in the early stages of learning. 
We evaluate the approach on a suite of vision-based robot control tasks with natural video backgrounds, and show that the proposed prioritized information objective outperforms state-of-the-art model-based RL approaches by an average of 20\\% in terms of episodic returns at 1M environment interactions with 30\\% higher sample efficiency at 100k interactions.", "keywords": "model-based reinforcement learning;visual distractors;empowerment", "primary_area": "", "supplementary_material": "", "author": "Homanga Bharadhwaj;Mohammad Babaeizadeh;Dumitru Erhan;Sergey Levine", "authorids": "~Homanga_Bharadhwaj1;~Mohammad_Babaeizadeh1;~Dumitru_Erhan1;~Sergey_Levine1", "gender": "M;M;M;M", "homepage": "https://homangab.github.io/;;http://dumitru.ca;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "223/5842;;http://dblp.uni-trier.de/pers/hd/e/Erhan:Dumitru;80/7594", "google_scholar": "https://scholar.google.ca/citations?user=wwW4HRQAAAAJ;3Y4egcYAAAAJ;wfGiqXEAAAAJ;8R35rCwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Homanga_Bharadhwaj1;~Mohammad_Babaeizadeh1;~Dumitru_Erhan1;~Sergey_Levine1", "aff": "Meta Facebook;Google;Google;Google", "aff_domain": "facebook.com;google.com;google.com;google.com", "position": "Visiting Researcher;Research Engineer;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nbharadhwaj2022information,\ntitle={Information Prioritization through Empowerment in Visual Model-based {RL}},\nauthor={Homanga Bharadhwaj and Mohammad Babaeizadeh and Dumitru Erhan and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DfUjyyRW90}\n}", "github": "", "project": "", "reviewers": "EU46;sJgS;8ezz;Kbjg", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;3;4", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;0;3;4", "wc_summary_paper": "86;42;103;81", "wc_summary_review": "24;88;27;50", "wc_main_review": "711;733;506;319", "wc_review": "821;863;636;450", "wc_reply_reviewers": "143;24;130;16", "wc_reply_authors": "996;560;1498;231", "reply_reviewers": "1;1;1;1", "reply_authors": "3;2;4;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 78.0, 22.327113561766108 ], "wc_summary_review_avg": [ 47.25, 25.586861863073402 ], "wc_main_review_avg": [ 567.25, 168.46123441314325 ], "wc_review_avg": [ 692.5, 163.99771339869346 ], "wc_reply_reviewers_avg": [ 78.25, 58.49946580952684 ], "wc_reply_authors_avg": [ 821.25, 475.7033608247896 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1057468429684152647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=DfUjyyRW90", "email": "facebook.com;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Meta;Google", "aff_unique_dep": "Meta Platforms, Inc.;Google", "aff_unique_url": "https://meta.com;https://www.google.com", "aff_unique_abbr": "Meta;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0",
"aff_country_unique": "United States" }, { "title": "PAC Prediction Sets Under Covariate Shift", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6314", "id": "DhP9L8vIyLc", "poster": "", "openreview": "https://openreview.net/forum?id=DhP9L8vIyLc", "slides": "https://iclr.cc/virtual/2022/poster/6314", "video": "https://iclr.cc/virtual/2022/poster/6314", "author_site": "Sangdon Park, Edgar Dobriban, Insup Lee, Osbert Bastani", "tldr": "", "abstract": "An important challenge facing modern machine learning is how to rigorously quantify the uncertainty of model predictions. Conveying uncertainty is especially important when there are changes to the underlying data distribution that might invalidate the predictive model. Yet, most existing uncertainty quantification algorithms break down in the presence of such shifts. We propose a novel approach that addresses this challenge by constructing \\emph{probably approximately correct (PAC)} prediction sets in the presence of covariate shift. Our approach focuses on the setting where there is a covariate shift from the source distribution (where we have labeled training examples) to the target distribution (for which we want to quantify uncertainty). Our algorithm assumes given importance weights that encode how the probabilities of the training examples change under the covariate shift. In practice, importance weights typically need to be estimated; thus, we extend our algorithm to the setting where we are given confidence intervals for the importance weights. We demonstrate the effectiveness of our approach on covariate shifts based on DomainNet and ImageNet. Our algorithm satisfies the PAC constraint, and gives prediction sets with the smallest average normalized size among approaches that always satisfy the PAC constraint.", "keywords": "probably approximately correct;prediction set;covariate shift;importance weight;calibration;Clopper-Pearson binomial interval;rejection sampling", "primary_area": "", "supplementary_material": "", "author": "Sangdon Park;Edgar Dobriban;Insup Lee;Osbert Bastani", "authorids": "~Sangdon_Park1;~Edgar_Dobriban2;~Insup_Lee1;~Osbert_Bastani1", "gender": "M;;;M", "homepage": "https://sangdon.github.io/;https://statistics.wharton.upenn.edu/profile/dobriban/;https://www.cis.upenn.edu/~lee/;http://obastani.github.io", "dblp": "119/1530-1;99/11269;l/InsupLee.html;21/11275", "google_scholar": "Vi2E2F4AAAAJ;aGvH4yMAAAAJ;qPlUgrgAAAAJ;cxYepGkAAAAJ", "orcid": ";;0000-0003-2672-1132;", "linkedin": ";edgar-dobriban/;;", "or_profile": "~Sangdon_Park1;~Edgar_Dobriban2;~Insup_Lee1;~Osbert_Bastani1", "aff": "Georgia Institute of Technology;The Wharton School, University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "gatech.edu;wharton.upenn.edu;upenn.edu;upenn.edu", "position": "Postdoc;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\npark2022pac,\ntitle={{PAC} Prediction Sets Under Covariate Shift},\nauthor={Sangdon Park and Edgar Dobriban and Insup Lee and Osbert Bastani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DhP9L8vIyLc}\n}", "github": "", "project": "", "reviewers": "ZLVy;vZMq;vXNT;qXST", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;2;3;4", "correctness": "3;4;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "137;301;102;639", "wc_summary_review": "40;205;36;47", "wc_main_review": 
"664;144;201;440", "wc_review": "841;650;339;1126", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1574;332;629;646", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 294.75, 212.4763222102642 ], "wc_summary_review_avg": [ 82.0, 71.12313266441517 ], "wc_main_review_avg": [ 362.25, 206.5990984975491 ], "wc_review_avg": [ 739.0, 286.39745110597613 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 795.25, 466.62799690974396 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15533837197233330118&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=DhP9L8vIyLc", "email": "gatech.edu;wharton.upenn.edu;upenn.edu;upenn.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Georgia Institute of Technology;University of Pennsylvania", "aff_unique_dep": ";The Wharton School", "aff_unique_url": "https://www.gatech.edu;https://www.wharton.upenn.edu", "aff_unique_abbr": "Georgia Tech;UPenn Wharton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "P-Adapters: Robustly Extracting Factual Information from Language Models with Diverse Prompts", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6900", "id": "DhzIU48OcZh", "poster": "", "openreview": "https://openreview.net/forum?id=DhzIU48OcZh", "slides": "https://iclr.cc/virtual/2022/poster/6900", "video": "https://iclr.cc/virtual/2022/poster/6900", "author_site": "Benjamin Newman, Prafulla Kumar Choubey, Nazneen Rajani", "tldr": "", "abstract": "Recent work (e.g. LAMA (Petroni et al., 2019)) has found that the quality of the factual information extracted from Large Language Models (LLMs) depends on the prompts used to query them. This inconsistency is problematic because different users will query LLMs for the same information using different wording, but should receive the same, accurate responses regardless. In this work we aim to address this shortcoming by introducing P-Adapters: lightweight models that sit between the embedding layer and first attention layer of LLMs. They take LLM embeddings as input and output continuous prompts that are used to query the LLM. Additionally, we investigate Mixture of Experts (MoE) models that learn a set of continuous prompts (the \"experts\") and select one to query the LLM. These require a separate classifier trained on human-annotated data to map natural language prompts to the continuous ones. P-Adapters perform comparably to the more complex MoE models in extracting factual information from BERT and RoBERTa while eliminating the need for additional annotations. P-Adapters show between 12-26% absolute improvement in precision and 36-50% absolute improvement in consistency over a baseline of just using natural language queries alone. 
Finally, we investigate what makes P-Adapters successful and conclude that a significant factor is access to the LLM's embeddings of the original natural language prompt, particularly the subject of the entity pair being queried.", "keywords": "NLP;Prompting;Commonsense;information extraction;factual extraction;Large Language Models", "primary_area": "", "supplementary_material": "/attachment/8a56d5b5fdfed44dea41e1ef4de0741e07de905e.zip", "author": "Benjamin Newman;Prafulla Kumar Choubey;Nazneen Rajani", "authorids": "~Benjamin_Newman1;~Prafulla_Kumar_Choubey2;~Nazneen_Rajani1", "gender": ";M;F", "homepage": "http://blnewman.com;;https://www.nazneenrajani.com/", "dblp": "126/5109;203/8260;", "google_scholar": "QehvrDoAAAAJ;k7aMOCsAAAAJ;eIRG81YAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Benjamin_Newman1;~Prafulla_Kumar_Choubey2;~Nazneen_Fatema_Fatema_Rajani1", "aff": "Stanford University;SalesForce.com;SalesForce.com", "aff_domain": "stanford.edu;salesforce.com;salesforce.com", "position": "MS student;Researcher;Researcher", "bibtex": "@inproceedings{\nnewman2022padapters,\ntitle={P-Adapters: Robustly Extracting Factual Information from Language Models with Diverse Prompts},\nauthor={Benjamin Newman and Prafulla Kumar Choubey and Nazneen Rajani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DhzIU48OcZh}\n}", "github": "", "project": "", "reviewers": "jddf;anXM;2hiF;v5xA;AfJQ", "pdf_size": 0, "recommendation": "5;6;6;8;8", "confidence": "4;4;3;3;4", "correctness": "3;3;3;4;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "54;226;167;84;115", "wc_summary_review": "148;255;18;12;126", "wc_main_review": "775;367;98;193;888", "wc_review": "977;848;283;289;1129", "wc_reply_reviewers": "348;16;0;0;0", "wc_reply_authors": "1046;585;194;149;625", "reply_reviewers": "2;1;0;0;0", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 6.6, 1.2 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 129.2, 61.1535771643818 ], "wc_summary_review_avg": [ 111.8, 90.30703184137988 ], "wc_main_review_avg": [ 464.2, 314.10278572467325 ], "wc_review_avg": [ 705.2, 353.6520323708037 ], "wc_reply_reviewers_avg": [ 72.8, 137.73946420688588 ], "wc_reply_authors_avg": [ 519.8, 327.3660947624235 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.2721655269759087, "corr_recommendation_correctness": 0.9525793444156803, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1866558597598479566&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=DhzIU48OcZh", "email": "stanford.edu;salesforce.com;salesforce.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Stanford University;Salesforce", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.salesforce.com", "aff_unique_abbr": "Stanford;Salesforce", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "DkeCkhLIVGZ", "title": "Understanding Metric Learning on Unit Hypersphere and Generating Better Examples for Adversarial 
Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent works have shown that adversarial examples can improve the performance of representation learning tasks. In this paper, we boost the performance of deep metric learning (DML) models with adversarial examples generated by attacking two new objective functions: \\textit{intra-class alignment} and \\textit{hyperspherical uniformity}. These two new objectives come from our theoretical and empirical analysis of the tuple-based metric losses on the hyperspherical embedding space. Our analytical results reveal that a) the metric losses on positive sample pairs are related to intra-class alignment; b) the metric losses on negative sample pairs serve as uniformity regularization on hypersphere. Based on our new understanding on the DML models, we propose Adversarial Deep Metric Learning model with adversarial samples generated by Alignment or Uniformity objective (ADML+A or U). With the same network structure and training settings, ADML+A and ADML+U consistently outperform the state-of-the-art vanilla DML models and a baseline model, adversarial DML model with attacking triplet objective function, on four metric learning benchmarks.", "keywords": "Metric learning;Adversarial learning", "primary_area": "", "supplementary_material": "/attachment/3a99bd59aade3b260ae0769caac1d23bde4011d0.zip", "author": "Yihan Wu;Heng Huang", "authorids": "~Yihan_Wu1;~Heng_Huang1", "gender": "M;M", "homepage": "https://yihwu.github.io/;https://www.cs.umd.edu/~heng/", "dblp": ";03/281", "google_scholar": "cajTg_wAAAAJ;4OqLaDwAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yihan_Wu1;~Heng_Huang1", "aff": "University of Pittsburgh;University of Pittsburgh", "aff_domain": "pitt.edu;pitt.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nwu2022understanding,\ntitle={Understanding Metric Learning on Unit Hypersphere and Generating Better Examples for Adversarial Training},\nauthor={Yihan Wu and Heng Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=DkeCkhLIVGZ}\n}", "github": "", "project": "", "reviewers": "UWyt;xCnR;t4QZ;MuXo", "site": "https://openreview.net/forum?id=DkeCkhLIVGZ", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "48;99;53;75", "wc_summary_review": "8;82;42;119", "wc_main_review": "392;172;505;281", "wc_review": "448;353;600;475", "wc_reply_reviewers": "0;0;0;61", "wc_reply_authors": "828;519;817;547", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 20.20365066021485 ], "wc_summary_review_avg": [ 62.75, 41.72154719087009 ], "wc_main_review_avg": [ 337.5, 124.10580163715152 ], "wc_review_avg": [ 469.0, 88.16745431280184 ], "wc_reply_reviewers_avg": [ 15.25, 26.413774815425377 ], "wc_reply_authors_avg": [ 677.75, 145.1402339118964 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4450896287373382472&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pittsburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.pitt.edu", "aff_unique_abbr": "Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Fine-Grained Analysis on Distribution Shift", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7002", "id": "Dl4LetuLdyK", "poster": "", "openreview": "https://openreview.net/forum?id=Dl4LetuLdyK", "slides": "https://iclr.cc/virtual/2022/poster/7002", "video": "https://iclr.cc/virtual/2022/poster/7002", "author_site": "Olivia Wiles, Sven Gowal, Florian Stimberg, Sylvestre-Alvise Rebuffi, Ira Ktena, Krishnamurthy Dvijotham, Ali Taylan Cemgil", "tldr": "", "abstract": "Robustness to distribution shifts is critical for deploying machine learning models in the real world. Despite this necessity, there has been little work in defining the underlying mechanisms that cause these shifts and evaluating the robustness of algorithms across multiple, different distribution shifts. To this end, we introduce a framework that enables fine-grained analysis of various distribution shifts. We provide a holistic analysis of current state-of-the-art methods by evaluating 19 distinct methods grouped into five categories across both synthetic and real-world datasets. Overall, we train more than 85K models. Our experimental framework can be easily extended to include new methods, shifts, and datasets. We find, unlike previous work (Gulrajani & Lopez-Paz, 2021), that progress has been made over a standard ERM baseline; in particular, pretraining and augmentations (learned or heuristic) offer large gains in many cases. However, the best methods are not consistent over different datasets and shifts. We will open source our experimental framework, allowing future work to evaluate new methods over multiple shifts to obtain a more complete picture of a method's effectiveness. 
\nCode is available at github.com/deepmind/distribution_shift_framework.\n", "keywords": "robustness;distribution shifts", "primary_area": "", "supplementary_material": "/attachment/98bd936d076633fae496d57988805e933efaca79.zip", "author": "Olivia Wiles;Sven Gowal;Florian Stimberg;Sylvestre-Alvise Rebuffi;Ira Ktena;Krishnamurthy Dj Dvijotham;Ali Taylan Cemgil", "authorids": "~Olivia_Wiles1;~Sven_Gowal2;~Florian_Stimberg1;~Sylvestre-Alvise_Rebuffi1;iraktena@google.com;~Krishnamurthy_Dj_Dvijotham1;~Ali_Taylan_Cemgil2", "gender": ";;M;M;;;", "homepage": ";;;;;;", "dblp": "194/3191;;57/11107;190/7811;;;", "google_scholar": "https://scholar.google.co.uk/citations?user=XQzHJSgAAAAJ;;https://scholar.google.com/citations?hl=en;swP3h24AAAAJ;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Olivia_Wiles1;~Sven_Gowal2;~Florian_Stimberg1;~Sylvestre-Alvise_Rebuffi1;iraktena@google.com;~Krishnamurthy_Dj_Dvijotham1;~Ali_Taylan_Cemgil2", "aff": "Google;;Google DeepMind;Google DeepMind;;;", "aff_domain": "google.com;;deepmind.com;deepmind.com;;;", "position": "Researcher;;Researcher;Researcher;;;", "bibtex": "@inproceedings{\nwiles2022a,\ntitle={A Fine-Grained Analysis on Distribution Shift},\nauthor={Olivia Wiles and Sven Gowal and Florian Stimberg and Sylvestre-Alvise Rebuffi and Ira Ktena and Krishnamurthy Dj Dvijotham and Ali Taylan Cemgil},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Dl4LetuLdyK}\n}", "github": "", "project": "", "reviewers": "zAfw;HwsU;U5qL", "pdf_size": 0, "recommendation": "8;8;10", "confidence": "4;4;4", "correctness": "2;3;4", "technical_novelty": "2;4;3", "empirical_novelty": "3;4;3", "wc_summary_paper": "101;99;71", "wc_summary_review": "35;120;391", "wc_main_review": "362;661;1462", "wc_review": "498;880;1924", "wc_reply_reviewers": "63;298;393", "wc_reply_authors": "571;2151;1541", "reply_reviewers": "1;2;2", "reply_authors": "1;4;4", "recommendation_avg": [ 8.666666666666666, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 90.33333333333333, 13.695092389449425 ], "wc_summary_review_avg": [ 182.0, 151.80469909283661 ], "wc_main_review_avg": [ 828.3333333333334, 464.39949277415116 ], "wc_review_avg": [ 1100.6666666666667, 602.7101754206208 ], "wc_reply_reviewers_avg": [ 251.33333333333334, 138.7043218104212 ], "wc_reply_authors_avg": [ 1421.0, 650.5894762956642 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 272, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16772366369879216800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Dl4LetuLdyK", "email": "google.com;;deepmind.com;deepmind.com;;;", "author_num": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "DmKu5T2gEqc", "title": "CDNet: A cascaded decoupling architecture for video 
prediction", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Video prediction is an essential task in the computer vision community, helping to solve many downstream vision tasks by predicting and modeling future motion dynamics and appearance. In the deterministic video prediction task, current methods mainly employ variants of stacked Recurrent Neural Networks (RNN) to capture spatiotemporal coherence, overlooking the conflict between long-term motion dynamics modeling and legible appearance generation. In this work, we propose a Cascaded Decoupling Network (CDNet) to solve the video prediction problem through two modules: motion LSTM to capture the motion trend and variation in the temporal highway without considering the appearance details, and refine LSTM to recover the detailed appearance according to the predicted motion dynamics and historical appearance iteratively. The cascaded structure provides a preliminary solution for the above conflict. We verify the rationality of our model on two real-world challenging video prediction datasets and yield state-of-the-art performance.", "keywords": "Video Prediction;RNNs", "primary_area": "", "supplementary_material": "", "author": "Chuanqi Zang;Mingtao Pei", "authorids": "~Chuanqi_Zang1;~Mingtao_Pei2", "gender": "M;M", "homepage": ";https://peimingtao.github.io", "dblp": ";77/7398.html", "google_scholar": "pYGh1nkAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Chuanqi_Zang1;~Mingtao_Pei2", "aff": "Beijing Institute of Technology;Beijing Institute of Technology", "aff_domain": "bit.edu.cn;bit.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@misc{\nzang2022cdnet,\ntitle={{CDN}et: A cascaded decoupling architecture for video prediction},\nauthor={Chuanqi Zang and Mingtao Pei},\nyear={2022},\nurl={https://openreview.net/forum?id=DmKu5T2gEqc}\n}", "github": "", "project": "", "reviewers": "nJdu;xiuF;RsCF;zKrJ", "site": "https://openreview.net/forum?id=DmKu5T2gEqc", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "2;3;2;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;0;2;3", "wc_summary_paper": "53;58;61;63", "wc_summary_review": "88;24;52;47", "wc_main_review": "412;175;149;340", "wc_review": "553;257;262;450", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 58.75, 3.766629793329841 ], "wc_summary_review_avg": [ 52.75, 22.92787604642 ], "wc_main_review_avg": [ 269.0, 110.36983283488291 ], "wc_review_avg": [ 380.5, 126.37345449104413 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7608859102526822, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GhNPAifGTCoJ:scholar.google.com/&scioq=CDNet:+A+cascaded+decoupling+architecture+for+video+prediction&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Beijing Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.bit.edu.cn/", "aff_unique_abbr": "BIT", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Omni-Dimensional Dynamic Convolution", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6455", "id": "DmpCfq6Mg39", "poster": "", "openreview": "https://openreview.net/forum?id=DmpCfq6Mg39", "slides": "https://iclr.cc/virtual/2022/poster/6455", "video": "https://iclr.cc/virtual/2022/poster/6455", "author_site": "Chao Li, Aojun Zhou, Anbang Yao", "tldr": "", "abstract": "Learning a single static convolutional kernel in each convolutional layer is the common training paradigm of modern Convolutional Neural Networks (CNNs). Instead, recent research in dynamic convolution shows that learning a linear combination of n convolutional kernels weighted with their input-dependent attentions can significantly improve the accuracy of light-weight CNNs, while maintaining efficient inference. However, we observe that existing works endow convolutional kernels with the dynamic property through one dimension (regarding the convolutional kernel number) of the kernel space, but the other three dimensions (regarding the spatial size, the input channel number and the output channel number for each convolutional kernel) are overlooked. Inspired by this, we present Omni-dimensional Dynamic Convolution (ODConv), a more generalized yet elegant dynamic convolution design, to advance this line of research. ODConv leverages a novel multi-dimensional attention mechanism with a parallel strategy to learn complementary attentions for convolutional kernels along all four dimensions of the kernel space at any convolutional layer. As a drop-in replacement of regular convolutions, ODConv can be plugged into many CNN architectures. Extensive experiments on the ImageNet and MS-COCO datasets show that ODConv brings solid accuracy boosts for various prevailing CNN backbones including both light-weight and large ones, e.g., 3.77%~5.71%|1.86%~3.72% absolute top-1 improvements to MobivleNetV2|ResNet family on the ImageNet dataset. Intriguingly, thanks to its improved feature learning ability, ODConv with even one single kernel can compete with or outperform existing dynamic convolution counterparts with multiple kernels, substantially reducing extra parameters. Furthermore, ODConv is also superior to other attention modules for modulating the output features or the convolutional weights. 
Code and models will be available at https://github.com/OSVAI/ODConv.", "keywords": "Convolutional Neural Networks;Dynamic Convolution;Attention;Image Classification", "primary_area": "", "supplementary_material": "", "author": "Chao Li;Aojun Zhou;Anbang Yao", "authorids": "~Chao_Li16;~Aojun_Zhou2;~Anbang_Yao1", "gender": "M;;M", "homepage": "https://github.com/chaoli-ai/chaoli.github.io;https://yaoanbang.github.io/;", "dblp": ";http://dblp.uni-trier.de/pers/hd/y/Yao:Anbang;195/6034", "google_scholar": ";b9hCmPYAAAAJ;cC8lXi8AAAAJ", "orcid": ";0000-0002-3878-8679;", "linkedin": ";anbang-yao-1805b712a/;", "or_profile": "~Chao_Li16;~Anbang_Yao1;~Aojun_Zhou3", "aff": "Intel;Intel;The Chinese University of Hong Kong", "aff_domain": "intel.com;intel.com;cuhk.edu.hk", "position": "Researcher;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nli2022omnidimensional,\ntitle={Omni-Dimensional Dynamic Convolution},\nauthor={Chao Li and Aojun Zhou and Anbang Yao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DmpCfq6Mg39}\n}", "github": "", "project": "", "reviewers": "kYmD;PSFr;CKv4;fTd9", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;3;4", "correctness": "3;4;3;4", "technical_novelty": "2;4;4;3", "empirical_novelty": "3;1;3;0", "wc_summary_paper": "44;52;35;77", "wc_summary_review": "64;86;40;61", "wc_main_review": "280;449;165;382", "wc_review": "388;587;240;520", "wc_reply_reviewers": "65;129;50;0", "wc_reply_authors": "2235;2274;646;1103", "reply_reviewers": "1;2;1;0", "reply_authors": "3;3;1;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 52.0, 15.636495771111889 ], "wc_summary_review_avg": [ 62.75, 16.29992331270304 ], "wc_main_review_avg": [ 319.0, 107.36153873711014 ], "wc_review_avg": [ 433.75, 132.8126029411366 ], "wc_reply_reviewers_avg": [ 61.0, 46.048887065813005 ], "wc_reply_authors_avg": [ 1564.5, 708.7991605525503 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 472, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3010782089276051732&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=DmpCfq6Mg39", "email": "intel.com;intel.com;cuhk.edu.hk", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Intel;Chinese University of Hong Kong", "aff_unique_dep": "Intel Corporation;", "aff_unique_url": "https://www.intel.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "Intel;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;China" }, { "title": "MoReL: Multi-omics Relational Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7199", "id": "DnG75_KyHjX", "poster": "", "openreview": "https://openreview.net/forum?id=DnG75_KyHjX", "slides": "https://iclr.cc/virtual/2022/poster/7199", "video": "https://iclr.cc/virtual/2022/poster/7199", "author_site": "Arman Hasanzadeh, Ehsan Hajiramezanali, Nick Duffield, Xiaoning Qian", "tldr": "", "abstract": "Multi-omics data 
analysis has the potential to discover hidden molecular interactions, revealing potential regulatory and/or signal transduction pathways for cellular processes of interest when studying life and disease systems. One of the critical challenges when dealing with real-world multi-omics data is that they may manifest heterogeneous structures and data quality, as existing data may often be collected from different subjects under different conditions for each type of omics data. We propose a novel deep Bayesian generative model to efficiently infer a multi-partite graph encoding molecular interactions across such heterogeneous views, using a fused Gromov-Wasserstein (FGW) regularization between latent representations of corresponding views for integrative analysis. With such an optimal transport regularization in the deep Bayesian generative model, it not only allows incorporating view-specific side information, either with graph-structured or unstructured data in different views, but also increases the model flexibility with the distribution-based regularization. This allows efficient alignment of heterogeneous latent variable distributions to derive reliable interaction predictions compared to the existing point-based graph embedding methods. Our experiments on several real-world datasets demonstrate enhanced performance of MoReL in inferring meaningful interactions compared to existing baselines.", "keywords": "relational learning;data integration;multi-view learning;Bayesian generative model", "primary_area": "", "supplementary_material": "", "author": "Arman Hasanzadeh;Ehsan Hajiramezanali;Nick Duffield;Xiaoning Qian", "authorids": "~Arman_Hasanzadeh1;~Ehsan_Hajiramezanali1;~Nick_Duffield1;~Xiaoning_Qian2", "gender": ";M;M;", "homepage": ";http://ehsanhajiramezanali.github.io/;https://www.ece.tamu.edu/~xqian;https://tx.ag/duffield", "dblp": "213/7415;225/3486;62/4504;d/NickGDuffield.html", "google_scholar": "jjNcpoEAAAAJ;20I_DMoAAAAJ;dXGlddgAAAAJ;hrQOcZcAAAAJ", "orcid": ";;0000-0002-4347-2476;", "linkedin": ";ehsan-hajiramezanali-978a3b52/;;", "or_profile": "~Arman_Hasanzadeh1;~Ehsan_Hajiramezanali1;~Xiaoning_Qian2;~Nicholas_Duffield1", "aff": "Google;AstraZeneca;Texas A&M;Texas A&M", "aff_domain": "google.com;astrazeneca.com;tamu.edu;tamu.edu", "position": "Researcher;AI Research Scientist;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhasanzadeh2022morel,\ntitle={MoReL: Multi-omics Relational Learning},\nauthor={Arman Hasanzadeh and Ehsan Hajiramezanali and Nick Duffield and Xiaoning Qian},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DnG75_KyHjX}\n}", "github": "", "project": "", "reviewers": "dcdZ;2ULW;cXWm;wNNw", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "2;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "22;42;57;22", "wc_summary_review": "33;40;34;30", "wc_main_review": "211;137;247;340", "wc_review": "266;219;338;392", "wc_reply_reviewers": "167;0;0;0", "wc_reply_authors": "585;317;251;618", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 35.75, 14.737282653189496 ], "wc_summary_review_avg": [ 34.25, 3.6314597615834874 ], "wc_main_review_avg": [ 233.75,
73.04579043312489 ], "wc_review_avg": [ 303.75, 66.27358070905781 ], "wc_reply_reviewers_avg": [ 41.75, 72.31312121600062 ], "wc_reply_authors_avg": [ 442.75, 160.87941913122387 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4714045207910316, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6342258791267813011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=DnG75_KyHjX", "email": "google.com;astrazeneca.com;tamu.edu;tamu.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Google;AstraZeneca;Texas A&M University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.astrazeneca.com;https://www.tamu.edu", "aff_unique_abbr": "Google;AZ;TAMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "DnG8f7gweH4", "title": "Piecing and Chipping: An effective solution for the information-erasing view generation in Self-supervised Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In self-supervised learning frameworks, deep networks are optimized to align different views of an instance that contain similar visual semantic information. The views are generated by applying a series of data augmentations to the anchor samples. Although the data augmentation operations are often designed to be aggressive and extensive to lower the mutual information between views, the family of Information-Erasing data augmentation that masks out regions of images is barely considered. In this work, we propose the Piecing and Chipping enhanced Erasing Augmentation (PCEA) approach to making self-supervised learning algorithms benefit from the effectiveness of Information-Erasing data augmentation. Specifically, we design a pipeline to generate mutually weakly related transformed views using random erasing and build corresponding loss terms to take advantage of these views. Extensive experiments demonstrate the effectiveness of our method. 
Particularly, applying our PCEA to MoCo v2 improves the baseline by 12.84\\%, 3.3\\% in terms of linear classification on ImageNet-100 and ImageNet-1K.", "keywords": "Data Augmentation;Self-supervised;Contrastive Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Jingwei Liu;Yi Gu;Shentong Mo;Zhun Sun;Shumin Han;Jiafeng Guo;Xueqi Cheng", "authorids": "~Jingwei_Liu1;guyi03@baidu.com;~Shentong_Mo2;~Zhun_Sun1;~Shumin_Han1;~Jiafeng_Guo1;~Xueqi_Cheng1", "gender": "M;;;Non-Binary;M;M;M", "homepage": "https://www.zhihu.com/people/elvis-62-43;;https://stonemo.github.io/;https://minogame.github.io/;https://www.researchgate.net/scientific-contributions/Shumin-Han-2149208232;http://www.bigdatalab.ac.cn/gjf/;https://people.ucas.ac.cn/~cxq?language=en", "dblp": ";;;185/6899;https://dblp.uni-trier.de/pid/119/8234;02/146;44/912", "google_scholar": ";;;Y-3iZ9EAAAAJ;eFoLdbQAAAAJ;https://scholar.google.com/citations?view_op=list_works;hY8aLqAAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Jingwei_Liu1;guyi03@baidu.com;~Shentong_Mo2;~Zhun_Sun1;~Shumin_Han1;~Jiafeng_Guo1;~Xueqi_Cheng1", "aff": "University of Chinese Academy of Sciences;;;Baidu;Baidu;Institute of Computing Technolgy, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy", "aff_domain": "ucas.edu;;;baidu.com;baidu.com;ict.ac.cn;ict.ac.cn", "position": "PhD student;;;Researcher;Researcher;Researcher;Full Professor", "bibtex": "@misc{\nliu2022piecing,\ntitle={Piecing and Chipping: An effective solution for the information-erasing view generation in Self-supervised Learning},\nauthor={Jingwei Liu and Yi Gu and Shentong Mo and Zhun Sun and Shumin Han and Jiafeng Guo and Xueqi Cheng},\nyear={2022},\nurl={https://openreview.net/forum?id=DnG8f7gweH4}\n}", "github": "", "project": "", "reviewers": "aEwU;ib9n;YmBT;ZDgn", "site": "https://openreview.net/forum?id=DnG8f7gweH4", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "2;4;4;3", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;4;0;3", "wc_summary_paper": "75;58;43;71", "wc_summary_review": "23;22;50;68", "wc_main_review": "205;97;141;131", "wc_review": "303;177;234;270", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 61.75, 12.517487767120047 ], "wc_summary_review_avg": [ 40.75, 19.330998422223306 ], "wc_main_review_avg": [ 143.5, 39.073648409126065 ], "wc_review_avg": [ 246.0, 46.71723450719231 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4497605718974102158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;2", "aff_unique_norm": "University of Chinese Academy of Sciences;Baidu;Chinese Academy of Sciences", "aff_unique_dep": ";Baidu, Inc.;Institute of Computing Technology", "aff_unique_url": "http://www.ucas.ac.cn;https://www.baidu.com;http://www.ict.ac.cn", "aff_unique_abbr": 
"UCAS;Baidu;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "DqJgzrcA8lH", "title": "Latent Space Smoothing for Individually Fair Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Fair representation learning encodes user data to ensure fairness and utility, regardless of the downstream application. However, learning individually fair representations, i.e., guaranteeing that similar individuals are treated similarly, remains challenging in high-dimensional settings such as computer vision. In this work, we introduce LASSI, the first representation learning method for certifying individual fairness of high-dimensional data. Our key insight is to leverage recent advances in generative modeling to capture the set of similar individuals in the generative latent space. This allows learning an individually fair representation where similar individuals are mapped close together, by using adversarial training to minimize the distance between the representations of similar individuals. Finally, we employ randomized smoothing to provably map similar individuals close together, in turn ensuring that local robustness verification of the downstream application results in end-to-end fairness certification. Our experimental evaluation on challenging real-world image data demonstrates that our method increases certified individual fairness by more than 60%, without significantly affecting task utility.", "keywords": "fairness;fair representation learning;adversarial fairness;trustworthy machine learning;randomized smoothing", "primary_area": "", "supplementary_material": "", "author": "Momchil Peychev;Anian Ruoss;Mislav Balunovic;Maximilian Baader;Martin Vechev", "authorids": "~Momchil_Peychev1;~Anian_Ruoss1;~Mislav_Balunovic1;~Maximilian_Baader1;~Martin_Vechev1", "gender": "M;M;M;;M", "homepage": "https://www.sri.inf.ethz.ch/people/momchil;;https://www.sri.inf.ethz.ch/people/mislav;https://www.sri.inf.ethz.ch/people/max;https://www.sri.inf.ethz.ch/people/martin", "dblp": "210/2351;259/2083;231/7686;249/8060;93/2189.html", "google_scholar": "RuhLJ8oAAAAJ;gFkwD3kAAAAJ;fxkgmGwAAAAJ;LKqCkWoAAAAJ;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": "0000-0003-0927-6356;;;0000-0002-9271-6422;", "linkedin": ";anian-ruoss;;;", "or_profile": "~Momchil_Peychev1;~Anian_Ruoss1;~Mislav_Balunovic1;~Maximilian_Baader1;~Martin_Vechev1", "aff": "ETH Zurich;Google DeepMind;Swiss Federal Institute of Technology;ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;deepmind.com;ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;Researcher;PhD student;PhD student;Full Professor", "bibtex": "@misc{\npeychev2022latent,\ntitle={Latent Space Smoothing for Individually Fair Representations},\nauthor={Momchil Peychev and Anian Ruoss and Mislav Balunovic and Maximilian Baader and Martin Vechev},\nyear={2022},\nurl={https://openreview.net/forum?id=DqJgzrcA8lH}\n}", "github": "", "project": "", "reviewers": "PQfq;YrMd;GWWq;rzfS", "site": "https://openreview.net/forum?id=DqJgzrcA8lH", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "1;1;3;3", "wc_summary_paper": "56;131;89;143", "wc_summary_review": "70;55;51;398", "wc_main_review": "518;379;548;1206", "wc_review": "644;565;688;1747", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", 
"reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.0 ], "wc_summary_paper_avg": [ 104.75, 34.55701810052482 ], "wc_summary_review_avg": [ 143.5, 147.10625411586008 ], "wc_main_review_avg": [ 662.75, 320.0604435102845 ], "wc_review_avg": [ 911.0, 484.67256988610364 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12962197532517960978&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "ETH Zurich;Google;Swiss Federal Institute of Technology", "aff_unique_dep": ";Google DeepMind;", "aff_unique_url": "https://www.ethz.ch;https://deepmind.com;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;DeepMind;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Switzerland;United Kingdom" }, { "id": "DrCsriMQ1o", "title": "Gradient-based Counterfactual Explanations using Tractable Probabilistic Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Counterfactual examples are an appealing class of post-hoc explanations for machine learning models. Given input x of class y, its counterfactual is a contrastive example x' of another class y'. Current approaches primarily solve this task by a complex optimization: define an objective function based on the loss of the counterfactual outcome y' with hard or soft constraints, then optimize this function as a black-box. This \u201cdeep learning\u201d approach, however, is rather slow, sometimes tricky, and may result in unrealistic counterfactual examples. In this work, we propose a novel approach to deal with these problems using only two gradient computations based on tractable probabilistic models. First, we compute an unconstrained counterfactual u of x to induce the counterfactual outcome y'. Then, we adapt u to higher density regions, resulting in x'. 
Empirical evidence demonstrates the dominant advantages of our approach.", "keywords": "Counterfactual example;Sum product networks;tractable probabilistic models;counterfactual explanation.", "primary_area": "", "supplementary_material": "", "author": "Xiaoting Shao;Kristian Kersting", "authorids": "~Xiaoting_Shao1;~Kristian_Kersting1", "gender": "F;M", "homepage": "https://ml-research.github.io/people/xshao/;http://www.ml.informatik.tu-darmstadt.de/", "dblp": "241/7076.html;40/3793", "google_scholar": "nTwh6GAAAAAJ;QY-earAAAAAJ", "orcid": ";0000-0002-2873-9152", "linkedin": ";", "or_profile": "~Xiaoting_Shao1;~Kristian_Kersting1", "aff": ";TU Darmstadt", "aff_domain": ";tu-darmstadt.de", "position": ";Full Professor", "bibtex": "@misc{\nshao2022gradientbased,\ntitle={Gradient-based Counterfactual Explanations using Tractable Probabilistic Models},\nauthor={Xiaoting Shao and Kristian Kersting},\nyear={2022},\nurl={https://openreview.net/forum?id=DrCsriMQ1o}\n}", "github": "", "project": "", "reviewers": "Nam5;Hebe;eBHW;LeuZ", "site": "https://openreview.net/forum?id=DrCsriMQ1o", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "2;3;5;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "112;105;50;69", "wc_summary_review": "62;44;39;39", "wc_main_review": "522;193;539;256", "wc_review": "696;342;628;364", "wc_reply_reviewers": "0;0;437;0", "wc_reply_authors": "764;459;1257;442", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.0, 25.524498036200438 ], "wc_summary_review_avg": [ 46.0, 9.460443964212251 ], "wc_main_review_avg": [ 377.5, 154.72960285607923 ], "wc_review_avg": [ 507.5, 156.552706779538 ], "wc_reply_reviewers_avg": [ 109.25, 189.22655072689983 ], "wc_reply_authors_avg": [ 730.5, 329.8745973851276 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.6009942011789684, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16837635309924643361&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0", "aff_campus_unique": "Darmstadt", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Shuffle Private Stochastic Convex Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6703", "id": "DrZXuTGg2A-", "poster": "", "openreview": "https://openreview.net/forum?id=DrZXuTGg2A-", "slides": "https://iclr.cc/virtual/2022/poster/6703", "video": "https://iclr.cc/virtual/2022/poster/6703", "author_site": "Albert Cheu, Matthew Joseph, Jieming Mao, Binghui Peng", "tldr": "", "abstract": "In shuffle privacy, each user sends a collection of randomized messages to a trusted shuffler, the shuffler randomly permutes these messages, and the resulting shuffled collection of messages must satisfy differential privacy. 
Prior work in this model has largely focused on protocols that use a single round of communication to compute algorithmic primitives like means, histograms, and counts. In this work, we present interactive shuffle protocols for stochastic convex optimization. Our optimization protocols rely on a new noninteractive protocol for summing vectors of bounded $\\ell_2$ norm. By combining this sum subroutine with techniques including mini-batch stochastic gradient descent, accelerated gradient descent, and Nesterov's smoothing method, we obtain loss guarantees for a variety of convex loss functions that significantly improve on those of the local model and sometimes match those of the central model.", "keywords": "shuffle privacy;stochastic convex optimization;differential privacy", "primary_area": "", "supplementary_material": "", "author": "Albert Cheu;Matthew Joseph;Jieming Mao;Binghui Peng", "authorids": "~Albert_Cheu1;~Matthew_Joseph1;~Jieming_Mao1;~Binghui_Peng1", "gender": "M;M;M;M", "homepage": "https://albertcheu.com;https://www.majos.net/;https://sites.google.com/seas.upenn.edu/jiemingmao/;http://www.cs.columbia.edu/~binghuip/", "dblp": "209/9888;180/5618;123/4948;210/2619", "google_scholar": ";uzY-OQ-QMAEC;;twlFI3sAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Albert_Cheu1;~Matthew_Joseph1;~Jieming_Mao1;~Binghui_Peng1", "aff": "Georgetown University;Google;Google;Columbia University", "aff_domain": "georgetown.edu;google.com;google.com;columbia.edu", "position": "Postdoc;Research scientist;Research Scientist;PhD student", "bibtex": "@inproceedings{\ncheu2022shuffle,\ntitle={Shuffle Private Stochastic Convex Optimization},\nauthor={Albert Cheu and Matthew Joseph and Jieming Mao and Binghui Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=DrZXuTGg2A-}\n}", "github": "", "project": "", "reviewers": "d49M;Frn5;6c6H;tikt", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "69;33;284;69", "wc_summary_review": "85;62;123;33", "wc_main_review": "811;818;521;272", "wc_review": "965;913;928;374", "wc_reply_reviewers": "231;1753;0;0", "wc_reply_authors": "1172;2818;14;149", "reply_reviewers": "3;4;0;0", "reply_authors": "3;4;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 113.75, 99.38655593187642 ], "wc_summary_review_avg": [ 75.75, 32.91940916845258 ], "wc_main_review_avg": [ 605.5, 226.79781744981585 ], "wc_review_avg": [ 795.0, 243.80012305165064 ], "wc_reply_reviewers_avg": [ 496.0, 731.830923096312 ], "wc_reply_authors_avg": [ 1038.25, 1120.8537761456666 ], "reply_reviewers_avg": [ 1.75, 1.7853571071357126 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17835472974352842959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=DrZXuTGg2A-", "email": "georgetown.edu;google.com;google.com;columbia.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Georgetown University;Google;Columbia University", 
"aff_unique_dep": ";Google;", "aff_unique_url": "https://www.georgetown.edu;https://www.google.com;https://www.columbia.edu", "aff_unique_abbr": "GU;Google;Columbia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "DrpKmCmPMSC", "title": "Meta-free few-shot learning via representation learning with weight averaging", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent studies on few-shot classification using transfer learning pose challenges to the effectiveness and efficiency of episodic meta-learning algorithms. Transfer learning approaches are a natural alternative, but they are restricted to few-shot classification. Moreover, little attention has been on the development of probabilistic models with well-calibrated uncertainty from few-shot samples, except for some Bayesian episodic learning algorithms. To tackle the aforementioned issues, we propose a new transfer learning method to obtain accurate and reliable models for few-shot regression and classification. The resulting method does not require episodic meta-learning and is called meta-free representation learning (MFRL). MFRL first finds low-rank representation generalizing well on meta-test tasks. Given the learned representation, probabilistic linear models are fine-tuned with few-shot samples to obtain models with well-calibrated uncertainty. The proposed method not only achieves the highest accuracy on a wide range of few-shot learning benchmark datasets but also correctly quantifies the prediction uncertainty. In addition, weight averaging and temperature scaling are effective in improving the accuracy and reliability of few-shot learning in existing meta-learning algorithms with a wide range of learning paradigms and model architectures.", "keywords": "few-shot learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Kuilin Chen;Chi-Guhn Lee", "authorids": "~Kuilin_Chen1;~Chi-Guhn_Lee1", "gender": "M;M", "homepage": ";http://cglee.mie.utoronto.ca", "dblp": ";62/4690", "google_scholar": "Q7eOfgoAAAAJ;https://scholar.google.ca/citations?user=ZpALG2AAAAAJ", "orcid": ";0000-0002-0916-0241", "linkedin": ";", "or_profile": "~Kuilin_Chen1;~Chi-Guhn_Lee1", "aff": "University of Toronto;University of Toronto", "aff_domain": "toronto.ca;mie.utoronto.ca", "position": "PhD student;Full Professor", "bibtex": "@misc{\nchen2022metafree,\ntitle={Meta-free few-shot learning via representation learning with weight averaging},\nauthor={Kuilin Chen and Chi-Guhn Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=DrpKmCmPMSC}\n}", "github": "", "project": "", "reviewers": "useT;jVyb;7iTd", "site": "https://openreview.net/forum?id=DrpKmCmPMSC", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "75;82;189", "wc_summary_review": "49;51;81", "wc_main_review": "577;731;441", "wc_review": "701;864;711", "wc_reply_reviewers": "0;0;84", "wc_reply_authors": "1378;647;1026", "reply_reviewers": "0;0;1", "reply_authors": "3;1;3", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], 
"wc_summary_paper_avg": [ 115.33333333333333, 52.16853031814827 ], "wc_summary_review_avg": [ 60.333333333333336, 14.636332266733433 ], "wc_main_review_avg": [ 583.0, 118.46799849185714 ], "wc_review_avg": [ 758.6666666666666, 74.59371436134697 ], "wc_reply_reviewers_avg": [ 28.0, 39.59797974644666 ], "wc_reply_authors_avg": [ 1017.0, 298.49734783858077 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8875638106185817091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "DtfrnB1fiX", "title": "Squeezing SGD Parallelization Performance in Distributed Training Using Delayed Averaging", "track": "main", "status": "Reject", "tldr": "", "abstract": "State-of-the-art deep learning algorithms rely on distributed training to tackle the increasing model size and training data. Mini-batch Stochastic Gradient Descent (SGD) requires workers to halt forward/backward propagations, to wait for gradients synchronized among all workers before the next batch of tasks. The synchronous execution model exposes the overhead of gradient communication among a large number of workers in a distributed training system.\n\nTo this end, we propose a new SGD algorithm with delayed averaging, namely DaSGD, which can fully parallelize SGD and forward/backward propagations to hide 100\\% of gradient communication. By adjusting the gradient update scheme, this algorithm uses hardware resources more efficiently and reduces the reliance on high-throughput inter-connects. The theoretical analysis and experimental results conducted in this paper both show its convergence rate of $ O (1 / \\sqrt {K} )$ stays the same as Mini-batch SGD. 
An analytical model shows that it enables linear performance scalability with the cluster size.", "keywords": "SGD;distributed training;hide communication cost;convergence", "primary_area": "", "supplementary_material": "", "author": "Pengcheng Li;Yixin Guo;Yawen Zhang;Qinggang Zhou", "authorids": "~Pengcheng_Li2;yixinguo@pku.edu.cn;~Yawen_Zhang2;~Qinggang_Zhou1", "gender": "M;;F;", "homepage": ";;https://www.linkedin.com/in/zhywenwen/;https://www.linkedin.com/in/qinggang-zhou-18a5a61", "dblp": ";;;", "google_scholar": "w_j9E10AAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Pengcheng_Li2;yixinguo@pku.edu.cn;~Yawen_Zhang2;~Qinggang_Zhou1", "aff": "TikTok Inc.;;;Amazon", "aff_domain": "tiktok.com;;;amazon.com", "position": "Researcher;;;Principal Engineer", "bibtex": "@misc{\nli2022squeezing,\ntitle={Squeezing {SGD} Parallelization Performance in Distributed Training Using Delayed Averaging},\nauthor={Pengcheng Li and Yixin Guo and Yawen Zhang and Qinggang Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=DtfrnB1fiX}\n}", "github": "", "project": "", "reviewers": "SC8P;MLkr;NVwg;1qEJ", "site": "https://openreview.net/forum?id=DtfrnB1fiX", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;3;3;5", "correctness": "4;4;3;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "21;38;23;80", "wc_summary_review": "16;32;223;99", "wc_main_review": "127;203;121;505", "wc_review": "164;273;367;684", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 40.5, 23.732888572611635 ], "wc_summary_review_avg": [ 92.5, 81.52453618389987 ], "wc_main_review_avg": [ 239.0, 156.9394787808345 ], "wc_review_avg": [ 372.0, 193.92911075957628 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Eqn_AS2mocwJ:scholar.google.com/&scioq=Squeezing+SGD+Parallelization+Performance+in+Distributed+Training+Using+Delayed+Averaging&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "TikTok Inc.;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.tiktok.com;https://www.amazon.com", "aff_unique_abbr": "TikTok;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Latent Variable Sequential Set Transformers for Joint Multi-Agent Motion Prediction", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6106", "id": "Dup_dDqkZC5", "poster": "", "openreview": "https://openreview.net/forum?id=Dup_dDqkZC5", "slides": "https://iclr.cc/virtual/2022/poster/6106", "video": "https://iclr.cc/virtual/2022/poster/6106", "author_site": "Roger Girgis, Florian Golemo, Felipe Codevilla, Martin Weiss, Jim D'Souza, Samira Ebrahimi Kahou, Felix Heide, Chris J Pal", "tldr": "", "abstract": "Robust multi-agent trajectory prediction is essential for the safe control of robotic systems. 
A major challenge is to efficiently learn a representation that approximates the true joint distribution of contextual, social, and temporal information to enable planning. We propose Latent Variable Sequential Set Transformers which are encoder-decoder architectures that generate scene-consistent multi-agent trajectories. We refer to these architectures as \u201cAutoBots\u201d. The encoder is a stack of interleaved temporal and social multi-head self-attention (MHSA) modules which alternately perform equivariant processing across the temporal and social dimensions. The decoder employs learnable seed parameters in combination with temporal and social MHSA modules allowing it to perform inference over the\nentire future scene in a single forward pass efficiently. AutoBots can produce either the trajectory of one ego-agent or a distribution over the future trajectories for all agents in the scene. For the single-agent prediction case, our model achieves top results on the global nuScenes vehicle motion prediction leaderboard, and produces strong results on the Argoverse vehicle prediction challenge. In the multi-agent setting, we evaluate on the synthetic partition of TrajNet++ dataset to showcase the model\u2019s socially-consistent predictions. We also demonstrate our model on general sequences of sets and provide illustrative experiments modelling the sequential structure of the multiple strokes that make up symbols in the Omniglot data. A distinguishing feature of AutoBots is that all models are trainable on a\nsingle desktop GPU (1080 Ti) in under 48h.", "keywords": "trajectory prediction;motion forecasting;transformers;latent variable models", "primary_area": "", "supplementary_material": "/attachment/fa09bc152c94a14f94ed217063872f9875575325.zip", "author": "Roger Girgis;Florian Golemo;Felipe Codevilla;Martin Weiss;Jim Aldon D'Souza;Samira Ebrahimi Kahou;Felix Heide;Christopher Pal", "authorids": "~Roger_Girgis1;~Florian_Golemo1;~Felipe_Codevilla1;~Martin_Weiss4;~Jim_Aldon_D'Souza1;~Samira_Ebrahimi_Kahou1;~Felix_Heide2;~Christopher_Pal1", "gender": "M;M;M;M;M;F;;", "homepage": ";https://fgolemo.github.io/;http://www.codevilla.info/;https://www.martincsweiss.com/;http://jimaldon.com/about.html;https://saebrahimi.github.io;https://www.cs.princeton.edu/~fheide/;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao", "dblp": ";08/8643;163/9067;12/3210;;20/11069;01/9396;45/1217", "google_scholar": "https://scholar.google.ca/citations?user=mBLay8oAAAAJ;https://scholar.google.de/citations?user=qvRf9xsAAAAJ;NamIygIAAA;t7lQYWwAAAAJ;QZqLtxYAAAAJ;https://scholar.google.ca/citations?user=F99FuaAAAAAJ;gRqzSHsAAAAJ;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ", "orcid": ";0000-0001-9238-7764;;;0000-0002-5230-8029;;;", "linkedin": ";;;martin-clyde-weiss/;jimaldon/;;;", "or_profile": "~Roger_Girgis1;~Florian_Golemo1;~Felipe_Codevilla1;~Martin_Weiss4;~Jim_Aldon_D'Souza1;~Samira_Ebrahimi_Kahou1;~Felix_Heide2;~Christopher_Pal1", "aff": "Mila - Quebec Artificial Intelligence Institute;Mila;Independent Robotics;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Algolux;\u00c9cole de technologie sup\u00e9rieure;Algolux;Polytechnique Montreal", "aff_domain": "mila.quebec;mila.quebec;independentrobotics.com;mila.umontreal.ca;algolux.com;etsmtl.ca;algolux.com;polymtl.ca", "position": "PhD student;Postdoc;Researcher;PhD student;Robotics Team Lead;Associate Professor;CTO;Full Professor", "bibtex": "@inproceedings{\ngirgis2022latent,\ntitle={Latent 
Variable Sequential Set Transformers for Joint Multi-Agent Motion Prediction},\nauthor={Roger Girgis and Florian Golemo and Felipe Codevilla and Martin Weiss and Jim Aldon D'Souza and Samira Ebrahimi Kahou and Felix Heide and Christopher Pal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Dup_dDqkZC5}\n}", "github": "", "project": "", "reviewers": "HbLp;rcqg;DomP;DDxv", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;3;5", "correctness": "3;4;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "116;62;98;55", "wc_summary_review": "62;62;110;16", "wc_main_review": "602;238;158;779", "wc_review": "780;362;366;850", "wc_reply_reviewers": "0;6;48;0", "wc_reply_authors": "1259;431;471;1246", "reply_reviewers": "0;1;1;0", "reply_authors": "3;2;2;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.75, 25.193004981542 ], "wc_summary_review_avg": [ 62.5, 33.237779709240506 ], "wc_main_review_avg": [ 444.25, 255.64660666631192 ], "wc_review_avg": [ 589.5, 226.8584360344574 ], "wc_reply_reviewers_avg": [ 13.5, 20.068632240389476 ], "wc_reply_authors_avg": [ 851.75, 401.0257940581877 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1206042525359273292&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=Dup_dDqkZC5", "email": "mila.quebec;mila.quebec;independentrobotics.com;mila.umontreal.ca;algolux.com;etsmtl.ca;algolux.com;polymtl.ca", "author_num": 8, "aff_unique_index": "0;1;2;3;4;5;4;6", "aff_unique_norm": "Quebec Artificial Intelligence Institute;Mila;Independent Robotics;University of Montreal;Algolux;\u00c9cole de technologie sup\u00e9rieure;Polytechnique Montreal", "aff_unique_dep": "Artificial Intelligence;Quebec Artificial Intelligence Institute;;Montreal Institute for Learning Algorithms;;;", "aff_unique_url": "https://mila.quebec;https://mila.quebec;;https://www.umontreal.ca;https://www.algolux.com;https://www.etsmtl.ca;https://www.polymtl.ca", "aff_unique_abbr": "Mila;Mila;;UM;;ETS;PolyMTL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0;2;0;2;0", "aff_country_unique": "Canada;;Sweden" }, { "id": "DvcMMKmDJ3q", "title": "Generating Symbolic Reasoning Problems with Transformer GANs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Constructing training data for symbolic reasoning domains is challenging: Existing instances are typically hand-crafted and too few to be trained on directly, and synthetically generated instances are often hard to evaluate in terms of their meaningfulness. We study the capabilities of GANs and Wasserstein GANs equipped with Transformer encoders to generate sensible and challenging training data for symbolic reasoning domains. We conduct experiments on two problem domains where Transformers have been successfully applied recently: symbolic mathematics and temporal specifications in verification. 
Even without autoregression, our GAN models produce syntactically correct instances, and we show that these can be used as meaningful substitutes for real training data when training a classifier. Using a GAN setting also allows us to alter the target distribution: We show that by adding a classifier uncertainty part to the generator objective, we obtain a dataset that is even harder for a classifier to solve than our original dataset.", "keywords": "Transformer;GAN;symbolic reasoning;temporal logic", "primary_area": "", "supplementary_material": "/attachment/41e787818a00ffe8cfe96e267a823429bc0bbfe1.zip", "author": "Jens U. Kreber;Christopher Hahn", "authorids": "~Jens_U._Kreber1;~Christopher_Hahn1", "gender": "M;M", "homepage": ";https://www.christopherhahn.io", "dblp": "295/5439;91/9661", "google_scholar": "j_ANgPgAAAAJ;bADdSwYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jens_U._Kreber1;~Christopher_Hahn1", "aff": "Saarland University;Stanford University", "aff_domain": "uni-saarland.de;stanford.edu", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nkreber2022generating,\ntitle={Generating Symbolic Reasoning Problems with Transformer {GAN}s},\nauthor={Jens U. Kreber and Christopher Hahn},\nyear={2022},\nurl={https://openreview.net/forum?id=DvcMMKmDJ3q}\n}", "github": "", "project": "", "reviewers": "qNn9;3zMF;vhbE;JuGn", "site": "https://openreview.net/forum?id=DvcMMKmDJ3q", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;4;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "83;42;64;110", "wc_summary_review": "52;60;56;27", "wc_main_review": "770;816;388;340", "wc_review": "905;918;508;477", "wc_reply_reviewers": "413;90;128;54", "wc_reply_authors": "1248;2130;775;526", "reply_reviewers": "1;1;1;1", "reply_authors": "2;4;2;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 74.75, 24.993749218554626 ], "wc_summary_review_avg": [ 48.75, 12.871965661856 ], "wc_main_review_avg": [ 578.5, 215.78403555406967 ], "wc_review_avg": [ 702.0, 209.8368413792011 ], "wc_reply_reviewers_avg": [ 171.25, 142.00594177709607 ], "wc_reply_authors_avg": [ 1169.75, 612.0548892869004 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": 0.5940885257860046, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4520266378986400123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Saarland University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-saarland.de;https://www.stanford.edu", "aff_unique_abbr": "UdS;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;United States" }, { "id": "Dy8gq-LuckD", "title": "Recognizing and overcoming the greedy nature of learning in multi-modal deep neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We hypothesize that due to the greedy nature of learning in multi-modal deep neural networks (DNNs), these models tend to rely on just one modality 
while under-utilizing the other modalities. We observe empirically that such behavior hurts its overall generalization. We validate our hypothesis by estimating the gain in accuracy when the model has access to an additional modality. We refer to this gain as the conditional utilization rate of the modality. In the experiments, we consistently observe an imbalance in conditional utilization rate between modalities, across multiple tasks and architectures. Since the conditional utilization rate cannot be computed efficiently during training, we introduce an efficient proxy based on the pace at which a DNN learns from each modality, which we refer to as conditional learning speed. We thus propose a training algorithm, balanced multi-modal learning, and demonstrate that it indeed addresses the issue of greedy learning. The proposed algorithm is found to improve the model\u2019s generalization on three datasets: Colored MNIST (Kim et al., 2019), Princeton ModelNet40 (Wu et al., 2015), and NVIDIA Dynamic Hand Gesture Dataset (Molchanov et al., 2016).", "keywords": "multi-modal learning;deep neural networks;multi-view learning", "primary_area": "", "supplementary_material": "", "author": "Nan Wu;Stanislaw Kamil Jastrzebski;Kyunghyun Cho;Krzysztof J. Geras", "authorids": "~Nan_Wu1;~Stanislaw_Kamil_Jastrzebski1;~Kyunghyun_Cho1;~Krzysztof_J._Geras1", "gender": "F;;M;M", "homepage": ";http://sjastrzebski.com;http://kyunghyuncho.me;https://cs.nyu.edu/~kgeras/", "dblp": "58/2484;139/0187;41/9736;124/8920", "google_scholar": ";wbJxGQ8AAAAJ;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0003-4138-1818;;", "linkedin": ";sjastrzebski/;;", "or_profile": "~Nan_Wu1;~Stanislaw_Kamil_Jastrzebski1;~Kyunghyun_Cho1;~Krzysztof_Jerzy_Geras1", "aff": "New York University;Molecule.one;New York University;NYU Grossman School of Medicine", "aff_domain": "nyu.edu;molecule.one;nyu.edu;nyulangone.org", "position": "PhD student;Chief Scientific Officer;Associate Professor;Assistant Professor", "bibtex": "@misc{\nwu2022recognizing,\ntitle={Recognizing and overcoming the greedy nature of learning in multi-modal deep neural networks},\nauthor={Nan Wu and Stanislaw Kamil Jastrzebski and Kyunghyun Cho and Krzysztof J. 
Geras},\nyear={2022},\nurl={https://openreview.net/forum?id=Dy8gq-LuckD}\n}", "github": "", "project": "", "reviewers": "PUm3;ndMY;BHoa;B4j4", "site": "https://openreview.net/forum?id=Dy8gq-LuckD", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "77;157;88;90", "wc_summary_review": "74;87;93;72", "wc_main_review": "275;229;112;315", "wc_review": "426;473;293;477", "wc_reply_reviewers": "91;154;27;0", "wc_reply_authors": "1197;475;292;501", "reply_reviewers": "1;1;1;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 103.0, 31.567388235329194 ], "wc_summary_review_avg": [ 81.5, 8.789197915623474 ], "wc_main_review_avg": [ 232.75, 76.06699349915179 ], "wc_review_avg": [ 417.25, 74.48615643191692 ], "wc_reply_reviewers_avg": [ 68.0, 59.6447818337866 ], "wc_reply_authors_avg": [ 616.25, 344.8342899132857 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8551861104941366, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_ntZMqGzrXEJ:scholar.google.com/&scioq=Recognizing+and+overcoming+the+greedy+nature+of+learning+in+multi-modal+deep+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "New York University;Molecule.one;New York University Grossman School of Medicine", "aff_unique_dep": ";;School of Medicine", "aff_unique_url": "https://www.nyu.edu;https://molecule.one;https://med.nyu.edu", "aff_unique_abbr": "NYU;;NYU Grossman SOM", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "DyPCANHXFRI", "title": "How Curriculum Learning Impacts Model Calibration", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite the significant progress made on deep learning models, concerns still exist when a trained model is deployed to real-world applications. Model calibration is a key consideration that has recently attracted more attention: a learned model should not only achieve high predictive performance but also attain it with a proper level of confidence. A mismatch between predictive performance and confidence creates miscalibration and hence raises concerns about trusting a (miscalibrated) model. Despite the importance of the problem and many recent research efforts, calibration has not been fully understood yet, particularly when it faces the common challenges that deep learning models struggle with: specifically limited training resources and noisy data. In this paper, we study calibration with an emphasis on these scenarios. We particularly investigate the effect of curriculum learning, which, inspired by human curricula, leverages a guided learning regime to improve model generalization and has been found to improve predictive performance in the aforementioned cases. Specifically, we provide an empirical understanding of the impact of curriculum learning on model calibration under a variety of general contexts. 
Our studies suggest the following: First, most of the time curriculum learning has a negligible effect on calibration, but in certain cases, in the context of limited training time and noisy data, curriculum learning can substantially reduce calibration error in a manner that cannot be explained by dynamically sampling the dataset. Second, curriculum and anti-curriculum learning appear to have nearly identical effects on model calibration. Lastly, the choice of pacing function and its parameters in curriculum learning can significantly impact model calibration, indicating that extra care should be taken to minimize the risk of severe model miscalibration. We hope the empirical insights will help us better understand calibration and guide the utilization of curriculum learning in practice.", "keywords": "calibration;deep learning;curriculum learning;image classification", "primary_area": "", "supplementary_material": "", "author": "Stephen Obadinma;Xiaodan Zhu;Hongyu Guo", "authorids": "~Stephen_Obadinma1;~Xiaodan_Zhu1;~Hongyu_Guo1", "gender": "M;M;M", "homepage": ";http://www.xiaodanzhu.com;https://hongyuharryguo.github.io/", "dblp": "271/8187;93/310.html;", "google_scholar": "https://scholar.google.ca/citations?user=bRbQBNsAAAAJ;https://scholar.google.ca/citations?user=a6MYnuUAAAAJ;https://scholar.google.ca/citations?user=bZUqlakAAAAJ", "orcid": ";0000-0003-3856-3696;", "linkedin": ";xiaodan-zhu-066833101/?originalSubdomain=ca;harry-h-y-guo-a582087/", "or_profile": "~Stephen_Obadinma1;~Xiaodan_Zhu1;~Hongyu_Guo1", "aff": "Queens University;Queen's University;National Research Council Canada", "aff_domain": "queensu.ca;queensu.ca;nrc-cnrc.gc.ca", "position": "MS student;Associate Professor;Senior Research Officer", "bibtex": "@misc{\nobadinma2022how,\ntitle={How Curriculum Learning Impacts Model Calibration},\nauthor={Stephen Obadinma and Xiaodan Zhu and Hongyu Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=DyPCANHXFRI}\n}", "github": "", "project": "", "reviewers": "dE9F;ZTWs;f9fC;oWJv", "site": "https://openreview.net/forum?id=DyPCANHXFRI", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "72;70;339;89", "wc_summary_review": "22;35;41;25", "wc_main_review": "555;880;478;137", "wc_review": "649;985;858;251", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 142.5, 113.68926950244689 ], "wc_summary_review_avg": [ 30.75, 7.628073151196179 ], "wc_main_review_avg": [ 512.5, 264.127336714699 ], "wc_review_avg": [ 685.75, 278.19900700757364 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SkRumjXVLWIJ:scholar.google.com/&scioq=How+Curriculum+Learning+Impacts+Model+Calibration&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Queens University;Queen's University;National Research Council Canada", 
"aff_unique_dep": ";;", "aff_unique_url": "https://www.queensu.ca;https://www.queensu.ca;https://www.nrc-cnrc.gc.ca", "aff_unique_abbr": "Queen's U;Queen's;NRC-CNRC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "DzBDB7y8UOy", "title": "CoLLIE: Continual Learning of Language Grounding from Language-Image Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper presents CoLLIE: a simple, yet effective model for continual learning of how language is grounded in vision. Given a pre-trained multimodal embedding model, where language and images are projected in the same semantic space (in this case CLIP by OpenAI), CoLLIE learns a transformation function that adjusts the language embeddings when needed to accommodate new language use. Unlike traditional few-shot learning, the model does not just learn new classes and labels, but can also generalize to similar language use. We verify the model's performance on two different tasks of continual learning and show that it can efficiently learn and generalize from only a few examples, with little interference with the model's original zero-shot performance. ", "keywords": "Continual Learning;Language Grounding;Language-Image Embeddings;Multimodal Distributional Semantics;Reference Resolution", "primary_area": "", "supplementary_material": "/attachment/6ecca011b58fa4873393499ff9d0728e72a0dbc0.zip", "author": "Gabriel Skantze;Bram Willemsen", "authorids": "~Gabriel_Skantze1;bramw@kth.se", "gender": "M;", "homepage": "https://www.kth.se/profile/skantze;", "dblp": "54/3812;", "google_scholar": "https://scholar.google.se/citations?user=iSzzd_MAAAAJ;", "orcid": "0000-0002-8579-1790;", "linkedin": ";", "or_profile": "~Gabriel_Skantze1;bramw@kth.se", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;", "aff_domain": "kth.se;", "position": "Full Professor;", "bibtex": "@misc{\nskantze2022collie,\ntitle={Co{LLIE}: Continual Learning of Language Grounding from Language-Image Embeddings},\nauthor={Gabriel Skantze and Bram Willemsen},\nyear={2022},\nurl={https://openreview.net/forum?id=DzBDB7y8UOy}\n}", "github": "", "project": "", "reviewers": "tBdi;qatX;jzf6;Y4pM", "site": "https://openreview.net/forum?id=DzBDB7y8UOy", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;5;4;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "78;100;262;63", "wc_summary_review": "98;128;110;24", "wc_main_review": "208;497;833;217", "wc_review": "384;725;1205;304", "wc_reply_reviewers": "0;776;515;0", "wc_reply_authors": "235;1084;1488;268", "reply_reviewers": "0;2;2;0", "reply_authors": "1;3;3;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 125.75, 79.75705297965817 ], "wc_summary_review_avg": [ 90.0, 39.57271787481876 ], "wc_main_review_avg": [ 438.75, 255.56053588142282 ], "wc_review_avg": [ 654.5, 354.9792247442095 ], "wc_reply_reviewers_avg": [ 322.75, 335.6824206001857 ], "wc_reply_authors_avg": [ 768.75, 536.7361432771227 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.6488856845230502, "corr_recommendation_correctness": 0.3244428422615251, 
"gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15797909456442478747&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "id": "DzKPXXr-CLK", "title": "Abelian Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In several domains such as natural language processing, it has been empirically reported that simple addition and subtraction in a somehow learned embedding space capture analogical relations. However, there is no guarantee that such relation holds for a new embedding space acquired by some training strategies. To tackle this issue, we propose to explicitly model analogical structure with an Abelian group. We construct an Abelian group network using invertible neural networks and show its universal approximation property. In experiments, our model successfully learns to capture word analogies from word2vec representations and shows better performance than other learning-based strategies. As a byproduct of modeling Abelian group operations, we furthermore obtain its natural extension to permutation invariant models with theoretical size-generalization capability.", "keywords": "algebra;Abelian group;word analogy;invertible neural networks;permutation invariant;size generalization", "primary_area": "", "supplementary_material": "/attachment/730809a7761e16362227d00e71a58f47b8ec3a42.zip", "author": "Kenshin Abe;Takanori Maehara;Issei Sato", "authorids": "~Kenshin_Abe1;~Takanori_Maehara1;~Issei_Sato1", "gender": ";M;M", "homepage": ";https://tmaehara.gitlab.io;", "dblp": "https://dblp.uni-trier.de/pid/241/9512.html;05/8510;13/2665", "google_scholar": ";3ei4ZqoAAAAJ;i4t2aUEAAAAJ", "orcid": ";0000-0002-2101-1484;", "linkedin": ";;", "or_profile": "~Kenshin_Abe1;~Takanori_Maehara1;~Issei_Sato1", "aff": "Preferred Networks, Inc.;Meta (aka. 
Facebook);the University of Tokyo", "aff_domain": "preferred.jp;fb.com;u-tokyo.ac.jp", "position": "Researcher;Software Engineer;Associate Professor", "bibtex": "@misc{\nabe2022abelian,\ntitle={Abelian Neural Networks},\nauthor={Kenshin Abe and Takanori Maehara and Issei Sato},\nyear={2022},\nurl={https://openreview.net/forum?id=DzKPXXr-CLK}\n}", "github": "", "project": "", "reviewers": "aHe9;ezJt;uoK8", "site": "https://openreview.net/forum?id=DzKPXXr-CLK", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;3;4", "correctness": "4;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;0;2", "wc_summary_paper": "53;109;59", "wc_summary_review": "75;47;28", "wc_main_review": "254;840;156", "wc_review": "382;996;243", "wc_reply_reviewers": "263;88;182", "wc_reply_authors": "701;788;295", "reply_reviewers": "1;1;1", "reply_authors": "2;1;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 73.66666666666667, 25.104227178350307 ], "wc_summary_review_avg": [ 50.0, 19.30457631409368 ], "wc_main_review_avg": [ 416.6666666666667, 302.0036791534537 ], "wc_review_avg": [ 540.3333333333334, 327.1639072731316 ], "wc_reply_reviewers_avg": [ 177.66666666666666, 71.50912917631955 ], "wc_reply_authors_avg": [ 594.6666666666666, 214.85240411863109 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": -1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2532600832237937746&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Preferred Networks, Inc.;Meta;University of Tokyo", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.preferred-networks.com;https://meta.com;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "PFN;Meta;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Japan;United States" }, { "title": "A Unified Wasserstein Distributional Robustness Framework for Adversarial Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6448", "id": "Dzpe9C1mpiv", "poster": "", "openreview": "https://openreview.net/forum?id=Dzpe9C1mpiv", "slides": "https://iclr.cc/virtual/2022/poster/6448", "video": "https://iclr.cc/virtual/2022/poster/6448", "author_site": "Anh Bui, Trung Le, Quan Tran, He Zhao, Dinh Phung", "tldr": "", "abstract": "It is well-known that deep neural networks (DNNs) are susceptible to adversarial attacks, exposing a severe fragility of deep learning systems. As a result, the adversarial training (AT) method, by incorporating adversarial examples during training, represents a natural and effective approach to strengthen the robustness of a DNN-based classifier. However, most AT-based methods, notably PGD-AT and TRADES, typically seek a pointwise adversary that generates the worst-case adversarial example by independently perturbing each data sample, as a way to ``probe'' the vulnerability of the classifier. Arguably, there are unexplored benefits in considering such adversarial effects from an entire distribution. 
To this end, this paper presents a unified framework that connects Wasserstein distributional robustness with current state-of-the-art AT methods. We introduce a new Wasserstein cost function and a new series of risk functions, with which we show that standard AT methods are special cases of their counterparts in our framework. This connection leads to an intuitive relaxation and generalization of existing AT methods and facilitates the development of a new family of distributional robustness AT-based algorithms. Extensive experiments show that our distributional robustness AT algorithms further robustify their standard AT counterparts in various settings.", "keywords": "Adversarial Machine Learning;Distributional Robustness", "primary_area": "", "supplementary_material": "", "author": "Anh Tuan Bui;Trung Le;Quan Hung Tran;He Zhao;Dinh Phung", "authorids": "~Anh_Tuan_Bui2;~Trung_Le2;~Quan_Hung_Tran1;~He_Zhao1;~Dinh_Phung2", "gender": "M;M;M;;", "homepage": "https://tuananhbui89.github.io/;;;;", "dblp": "120/0106;;151/8700;;", "google_scholar": "jEjMZ7oAAAAJ;https://scholar.google.com/citations?hl=en;ehs5ImcAAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Anh_Tuan_Bui2;~Trung_Le2;~Quan_Hung_Tran1;~He_Zhao1;~Dinh_Phung2", "aff": "Monash University;Monash University;Adobe Systems;;", "aff_domain": "monash.edu;monash.edu;adobe.com;;", "position": "PhD student;Assistant Professor;Research Scientist;;", "bibtex": "@inproceedings{\nbui2022a,\ntitle={A Unified Wasserstein Distributional Robustness Framework for Adversarial Training},\nauthor={Anh Tuan Bui and Trung Le and Quan Hung Tran and He Zhao and Dinh Phung},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Dzpe9C1mpiv}\n}", "github": "", "project": "", "reviewers": "wEoc;Rag4;1ivF;7bWw;T7rh", "pdf_size": 0, "recommendation": "5;6;6;8;8", "confidence": "3;3;3;2;4", "correctness": "4;3;3;3;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "64;97;91;57;42", "wc_summary_review": "26;38;83;51;31", "wc_main_review": "291;293;990;220;231", "wc_review": "381;428;1164;328;304", "wc_reply_reviewers": "0;0;226;0;0", "wc_reply_authors": "804;725;1169;696;402", "reply_reviewers": "0;0;1;0;0", "reply_authors": "2;1;3;2;1", "recommendation_avg": [ 6.6, 1.2 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 70.2, 20.778835386036434 ], "wc_summary_review_avg": [ 45.8, 20.409801566894274 ], "wc_main_review_avg": [ 405.0, 294.02925024561756 ], "wc_review_avg": [ 521.0, 324.35659389011965 ], "wc_reply_reviewers_avg": [ 45.2, 90.4 ], "wc_reply_authors_avg": [ 759.2, 246.04828794364735 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.6666666666666666, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2935072374086624118&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=Dzpe9C1mpiv", "email": "monash.edu;monash.edu;adobe.com;;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Monash University;Adobe", "aff_unique_dep": ";Adobe Systems Incorporated", "aff_unique_url": 
"https://www.monash.edu;https://www.adobe.com", "aff_unique_abbr": "Monash;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Australia;United States" }, { "id": "E-dq2kN8lt", "title": "FedPAGE: A Fast Local Stochastic Gradient Method for Communication-Efficient Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Averaging (FedAvg, also known as Local-SGD) (McMahan et al., 2017) is a classical federated learning algorithm in which clients run multiple local SGD steps before communicating their update to an orchestrating server. We propose a new federated learning algorithm, FedPAGE, able to further reduce the communication complexity by utilizing the recent optimal PAGE method (Li et al., 2021) instead of plain SGD in FedAvg. We show that FedPAGE uses much fewer communication rounds than previous local methods for both federated convex and nonconvex optimization. Concretely, 1) in the convex setting, the number of communication rounds of FedPAGE is $O(\\frac{N^{3/4}}{S\\epsilon})$, improving the best-known result $O(\\frac{N}{S\\epsilon})$ of SCAFFOLD (Karimireddy et al.,2020) by a factor of $N^{1/4}$, where $N$ is the total number of clients (usually is very large in federated learning), $S$ is the sampled subset of clients in each communication round, and $\\epsilon$ is the target error; 2) in the nonconvex setting, the number of communication rounds of FedPAGE is $O(\\frac{\\sqrt{N}+S}{S\\epsilon^2})$, improving the best-known result $O(\\frac{N^{2/3}}{S^{2/3}\\epsilon^2})$ of SCAFFOLD (Karimireddy et al.,2020) by a factor of $N^{1/6}S^{1/3}$, if the sampled clients $S\\leq \\sqrt{N}$. Note that in both settings, the communication cost for each round is the same for both FedPAGE and SCAFFOLD. 
As a result, FedPAGE achieves new state-of-the-art results in terms of communication complexity for both federated convex and nonconvex optimization.", "keywords": "federated learning;nonconvex optimization;convex optimization;local gradient method", "primary_area": "", "supplementary_material": "", "author": "Haoyu Zhao;Zhize Li;Peter Richt\u00e1rik", "authorids": "~Haoyu_Zhao1;~Zhize_Li1;~Peter_Richt\u00e1rik1", "gender": "M;M;M", "homepage": "http://hyzhao.me;https://zhizeli.github.io/;https://richtarik.org", "dblp": ";178/3238;62/8001", "google_scholar": "1MjanHUAAAAJ;uAFPPigAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-4380-5848", "linkedin": ";;richtarik/", "or_profile": "~Haoyu_Zhao1;~Zhize_Li1;~Peter_Richtarik1", "aff": "Princeton University;King Abdullah University of Science and Technology;King Abdullah University of Science and Technology (KAUST)", "aff_domain": "princeton.edu;kaust.edu.sa;kaust.edu.sa", "position": "PhD student;Research Scientist;Full Professor", "bibtex": "@misc{\nzhao2022fedpage,\ntitle={Fed{PAGE}: A Fast Local Stochastic Gradient Method for Communication-Efficient Federated Learning},\nauthor={Haoyu Zhao and Zhize Li and Peter Richt{\'a}rik},\nyear={2022},\nurl={https://openreview.net/forum?id=E-dq2kN8lt}\n}", "github": "", "project": "", "reviewers": "Xsi6;uLBz;tXTB;oGr1;Gwh9", "site": "https://openreview.net/forum?id=E-dq2kN8lt", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;4;3;3;4", "correctness": "3;4;4;4;3", "technical_novelty": "2;3;3;3;2", "empirical_novelty": "2;2;3;3;2", "wc_summary_paper": "19;78;92;55;56", "wc_summary_review": "19;44;14;16;55", "wc_main_review": "191;759;75;687;312", "wc_review": "229;881;181;758;423", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 60.0, 24.779023386727733 ], "wc_summary_review_avg": [ 29.6, 16.692513291892265 ], "wc_main_review_avg": [ 404.8, 271.36130895910713 ], "wc_review_avg": [ 494.4, 280.24960303272513 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.6123724356957948, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13205229249162210255&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1", "aff_unique_norm": "Princeton University;King Abdullah University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.kaust.edu.sa", "aff_unique_abbr": "Princeton;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Saudi Arabia" }, { "id": "E0zOKxQsZhN", "title": "Recurrent Model-Free RL is a Strong Baseline for Many POMDPs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many problems in RL, such as meta RL, robust RL, and generalization in RL, can be cast as POMDPs. 
In theory, simply augmenting model-free RL with memory, such as recurrent neural networks, provides a general approach to solving all types of POMDPs. However, prior work has found that such recurrent model-free RL methods tend to perform worse than more specialized algorithms that are designed for specific types of POMDPs. This paper revisits this claim. We find that careful architecture and hyperparameter decisions yield a recurrent model-free implementation that performs on par with (and occasionally substantially better than) more sophisticated recent techniques in their respective domains. We also release a simple and efficient implementation of recurrent model-free RL for future work to use as a baseline for POMDPs.", "keywords": "POMDP;RNN;recurrent model-free RL;baseline;meta RL;robust RL;generalization in RL", "primary_area": "", "supplementary_material": "", "author": "Tianwei Ni;Benjamin Eysenbach;Sergey Levine;Ruslan Salakhutdinov", "authorids": "~Tianwei_Ni1;~Benjamin_Eysenbach1;~Sergey_Levine1;~Ruslan_Salakhutdinov1", "gender": "M;M;M;M", "homepage": "https://twni2016.github.io/;https://ben-eysenbach.github.io/;https://people.eecs.berkeley.edu/~svlevine/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "230/8153;192/1863;80/7594;", "google_scholar": "njAD34UAAAAJ;DRnOvU8AAAAJ;8R35rCwAAAAJ;", "orcid": ";0009-0000-7136-6307;;", "linkedin": ";benjamin-eysenbach-a7235775/;;", "or_profile": "~Tianwei_Ni1;~Benjamin_Eysenbach1;~Sergey_Levine1;~Russ_Salakhutdinov1", "aff": "Mila - Quebec Artificial Intelligence Institute;Carnegie Mellon University;Google;School of Computer Science, Carnegie Mellon University", "aff_domain": "mila.quebec;cmu.edu;google.com;cs.cmu.edu", "position": "PhD student;PhD student;Research Scientist;Full Professor", "bibtex": "@misc{\nni2022recurrent,\ntitle={Recurrent Model-Free {RL} is a Strong Baseline for Many {POMDP}s},\nauthor={Tianwei Ni and Benjamin Eysenbach and Sergey Levine and Ruslan Salakhutdinov},\nyear={2022},\nurl={https://openreview.net/forum?id=E0zOKxQsZhN}\n}", "github": "", "project": "", "reviewers": "2WFY;iXRL;3HH1", "site": "https://openreview.net/forum?id=E0zOKxQsZhN", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "3;3;1", "empirical_novelty": "3;3;4", "wc_summary_paper": "100;54;83", "wc_summary_review": "5;12;83", "wc_main_review": "175;255;480", "wc_review": "280;321;646", "wc_reply_reviewers": "215;59;812", "wc_reply_authors": "926;503;1396", "reply_reviewers": "1;1;2", "reply_authors": "3;3;5", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 79.0, 18.991226044325487 ], "wc_summary_review_avg": [ 33.333333333333336, 35.235714205271265 ], "wc_main_review_avg": [ 303.3333333333333, 129.1209596549771 ], "wc_review_avg": [ 415.6666666666667, 163.72809437872564 ], "wc_reply_reviewers_avg": [ 362.0, 324.508859663338 ], "wc_reply_authors_avg": [ 941.6666666666666, 364.7339974404482 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 36, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=6544482618525869854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Quebec Artificial Intelligence Institute;Carnegie Mellon University;Google", "aff_unique_dep": "Artificial Intelligence;;Google", "aff_unique_url": "https://mila.quebec;https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "Mila;CMU;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Pittsburgh", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;United States" }, { "id": "E3gF8L-mmS3", "title": "Use of small auxiliary networks and scarce data to improve the adversarial robustness of deep learning models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep Learning models for image classification are known to be vulnerable to adversarial examples. \nAdversarial training is one of the most effective ways to provide defense against such threats, however it is a cumbersome process which requires many data points and long computation times. \nIn a setting where only small amounts of data are available for this process, adversarial training may negatively impact the classification performance on clean images by overfitting on the small amount of data.\nThis would be undesirable, especially when a large pre-trained model with satisfactory performance on clean data is already available.\nWe propose a new strategy to make a previously-trained model more robust against adversarial attacks, using scarce data and without degrading its performance on clean samples.\nThe proposed strategy consists in freezing the parameters of the originally trained base model and adding small auxiliary networks along the architecture, which process the features to reduce the effect of any adversarial perturbation.\nThis method can be used to defend a model against any arbitrary attack.\nA practical advantage of using auxiliary networks is that no modifications on the originally trained base model is required. \nTherefore, it can serve as a patch or add on to fix large and expensive existing deep learning models with little additional resources.\nExperiments on the CIFAR10 dataset showed that using only $10\\%$ of the full training set, the proposed method was able to adequately defend the model against the AutoPGD attack while maintaining a classification accuracy on clean images outperforming the model with adversarial training by $7\\%$. 
Indeed, the proposed method still performs reasonably well compared to adversarial training using $1\\%$ of the full training set.", "keywords": "adversarial learning;adversarial robustness;deep learning;cnn", "primary_area": "", "supplementary_material": "", "author": "Davide Coppola;Hwee Kuan Lee;Cuntai Guan", "authorids": "~Davide_Coppola1;~Hwee_Kuan_Lee1;~Cuntai_Guan1", "gender": "M;M;M", "homepage": ";https://web.bii.a-star.edu.sg/~leehk/index.html;https://personal.ntu.edu.sg/ctguan/index.html", "dblp": ";;95/7006", "google_scholar": ";;https://scholar.google.com.tw/citations?user=sg4vxPoAAAAJ", "orcid": "0000-0003-0785-8678;;0000-0002-0872-3276", "linkedin": ";;", "or_profile": "~Davide_Coppola1;~Hwee_Kuan_Lee1;~Cuntai_Guan1", "aff": "A*STAR;BII;Nanyang Technological University", "aff_domain": "a-star.edu.sg;astar.edu.sg;ntu.edu.sg", "position": "PhD student;Principal Researcher;Full Professor", "bibtex": "@misc{\ncoppola2022use,\ntitle={Use of small auxiliary networks and scarce data to improve the adversarial robustness of deep learning models},\nauthor={Davide Coppola and Hwee Kuan Lee and Cuntai Guan},\nyear={2022},\nurl={https://openreview.net/forum?id=E3gF8L-mmS3}\n}", "github": "", "project": "", "reviewers": "fuD2;3Dpg;gp6d", "site": "https://openreview.net/forum?id=E3gF8L-mmS3", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;1;2", "wc_summary_paper": "83;66;88", "wc_summary_review": "41;38;27", "wc_main_review": "118;418;848", "wc_review": "242;522;963", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 79.0, 9.41629792788369 ], "wc_summary_review_avg": [ 35.333333333333336, 6.018490028422596 ], "wc_main_review_avg": [ 461.3333333333333, 299.59231558161764 ], "wc_review_avg": [ 575.6666666666666, 296.78312320989926 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3z5zOeJ28gYJ:scholar.google.com/&scioq=Use+of+small+auxiliary+networks+and+scarce+data+to+improve+the+adversarial+robustness+of+deep+learning+models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Agency for Science, Technology and Research;Bioinformatics Institute;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.a-star.edu.sg;https://www.bii.a-star.edu.sg;https://www.ntu.edu.sg", "aff_unique_abbr": "A*STAR;BII;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Diurnal or Nocturnal? 
Federated Learning of Multi-branch Networks from Periodically Shifting Distributions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6742", "id": "E4EE_ohFGz", "poster": "", "openreview": "https://openreview.net/forum?id=E4EE_ohFGz", "slides": "https://iclr.cc/virtual/2022/poster/6742", "video": "https://iclr.cc/virtual/2022/poster/6742", "author_site": "Chen Zhu, Zheng Xu, Mingqing Chen, Jakub Kone\u010dn\u00fd, Andrew Hard, Tom Goldstein", "tldr": "", "abstract": "Federated learning has been deployed to train machine learning models from decentralized client data on mobile devices in practice. The clients available for training are observed to have periodically shifting distributions changing with the time of day, which can cause instability in training and degrade the model performance. In this paper, instead of modeling the distribution shift with a block-cyclic pattern as in previous works, we model it with a mixture of distributions that gradually shifts between daytime and nighttime modes, and find this intuitive model to better match the observations in practical federated learning systems. \nFurthermore, we propose to jointly train a clustering model and a multi-branch network to allocate lightweight specialized branches to clients from different modes. A temporal prior is used to significantly boost the training performance.\nExperiments for image classification on EMNIST and CIFAR datasets, and next word prediction on the Stack Overflow dataset show that the proposed algorithm can counter the effects of the distribution shift and significantly improve the final model performance. ", "keywords": "Federated Learning;Periodic Distribution Shift", "primary_area": "", "supplementary_material": "", "author": "Chen Zhu;Zheng Xu;Mingqing Chen;Jakub Kone\u010dn\u00fd;Andrew Hard;Tom Goldstein", "authorids": "~Chen_Zhu2;~Zheng_Xu2;~Mingqing_Chen1;~Jakub_Kone\u010dn\u00fd1;~Andrew_Hard1;~Tom_Goldstein1", "gender": "M;;M;M;M;M", "homepage": "http://www.cs.umd.edu/~chenzhu/;https://sites.google.com/site/xuzhustc/;;http://jakubkonecny.com/;https://research.google/people/106203/;https://www.cs.umd.edu/~tomg/", "dblp": "59/10522-1.html;83/2535-2;;139/0872;230/3939;25/8184", "google_scholar": "m-om5O8AAAAJ;TfWlMTYAAAAJ;c421fKoAAAAJ;https://scholar.google.sk/citations?user=4vq7eXQAAAAJ;hJBLsu8AAAAJ;KmSuVtgAAAAJ", "orcid": ";0009-0003-6747-3953;;;;", "linkedin": ";zheng-xu-0a125236/;;;andrew-hard-25b690a5;", "or_profile": "~Chen_Zhu2;~Zheng_Xu2;~Mingqing_Chen1;~Jakub_Kone\u010dn\u00fd1;~Andrew_Hard1;~Tom_Goldstein1", "aff": "Department of Computer Science, University of Maryland, College Park;Google;;Google;Google;University of Maryland, College Park", "aff_domain": "cs.umd.edu;google.com;;google.com;google.com;umd.edu", "position": "PhD student;Research Scientist;;Research Scientist;Researcher;Associate Professor", "bibtex": "@inproceedings{\nzhu2022diurnal,\ntitle={Diurnal or Nocturnal? 
Federated Learning of Multi-branch Networks from Periodically Shifting Distributions},\nauthor={Chen Zhu and Zheng Xu and Mingqing Chen and Jakub Kone{\v{c}}n{\'y} and Andrew Hard and Tom Goldstein},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=E4EE_ohFGz}\n}", "github": "", "project": "", "reviewers": "4Moy;QWqv;YzWj;JqZ9", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;5;4;2", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;0;3;4", "wc_summary_paper": "273;49;80;125", "wc_summary_review": "197;36;86;22", "wc_main_review": "320;334;255;216", "wc_review": "790;419;421;363", "wc_reply_reviewers": "0;89;82;0", "wc_reply_authors": "1575;1764;665;293", "reply_reviewers": "0;1;1;0", "reply_authors": "3;3;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 131.75, 85.91092771004163 ], "wc_summary_review_avg": [ 85.25, 68.7654528088051 ], "wc_main_review_avg": [ 281.25, 48.038396101452015 ], "wc_review_avg": [ 498.25, 170.04319304223853 ], "wc_reply_reviewers_avg": [ 42.75, 42.82157750480475 ], "wc_reply_authors_avg": [ 1074.25, 613.2582551421546 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6999132392733556, "corr_recommendation_correctness": 0.8320502943378437, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11759797930276650912&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=E4EE_ohFGz", "email": "cs.umd.edu;google.com;;google.com;google.com;umd.edu", "author_num": 6, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "University of Maryland, College Park;Google;University of Maryland", "aff_unique_dep": "Department of Computer Science;Google;", "aff_unique_url": "https://www.umd.edu;https://www.google.com;https://www.umd.edu", "aff_unique_abbr": "UMD;Google;UMD", "aff_campus_unique_index": "0;1;1;1;0", "aff_campus_unique": "College Park;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "E7rUJ4uRbzt", "title": "Extraneousness-Aware Imitation Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Visual imitation learning is an effective approach for intelligent agents to obtain control policies from visual demonstration sequences. However, standard visual imitation learning assumes expert demonstrations that only contain the task-relevant frames. While previous works propose to learn from \textit{noisy} demonstrations, it still remains challenging when there are locally consistent yet task-irrelevant subsequences in the demonstration. We term this kind of imitation learning ``imitation-learning-with-extraneousness'' and introduce Extraneousness-Aware Imitation Learning (EIL), a self-supervised approach that learns visuomotor policies from third-person demonstrations where extraneous subsequences exist. EIL learns action-conditioned self-supervised frame embeddings and aligns task-relevant frames across videos while excluding the extraneous parts. Our method allows agents to learn from extraneousness-rich demonstrations by intelligently ignoring irrelevant components. 
Experimental results show that EIL significantly outperforms strong baselines and approaches the level of training from the perfect demonstration on various simulated continuous control tasks and a ``learning-from-slides'' task. The project page can be found here: https://sites.google.com/view/iclr2022eil/home.", "keywords": "visual imitation learning;imitation learning from noisy video", "primary_area": "", "supplementary_material": "", "author": "Ray Chen Zheng;Kaizhe Hu;Boyuan Chen;Huazhe Xu", "authorids": "~Ray_Chen_Zheng1;hukz18@mails.tsinghua.edu.cn;boyuanc@mit.edu;huazhexu@stanford.edu", "gender": "M;;;", "homepage": "https://zhengrc19.github.io/;;;", "dblp": "235/8101;;;", "google_scholar": "gwUGHwsAAAAJ;;;", "orcid": ";;;", "linkedin": "ray-zheng-366053132/;;;", "or_profile": "~Ray_Chen_Zheng1;hukz18@mails.tsinghua.edu.cn;boyuanc@mit.edu;huazhexu@stanford.edu", "aff": "Department of Computer Science and Technology, Tsinghua University;;;", "aff_domain": "cs.tsinghua.edu.cn;;;", "position": "Undergrad student;;;", "bibtex": "@misc{\nzheng2022extraneousnessaware,\ntitle={Extraneousness-Aware Imitation Learning},\nauthor={Ray Chen Zheng and Kaizhe Hu and Boyuan Chen and Huazhe Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=E7rUJ4uRbzt}\n}", "github": "", "project": "", "reviewers": "KUFj;uePM;EjrN;CDcV", "site": "https://openreview.net/forum?id=E7rUJ4uRbzt", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;4;5;4", "correctness": "2;2;2;2", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;3;1;2", "wc_summary_paper": "75;100;19;76", "wc_summary_review": "44;64;32;24", "wc_main_review": "449;641;626;359", "wc_review": "568;805;677;459", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "805;811;789;408", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 67.5, 29.736341402398512 ], "wc_summary_review_avg": [ 41.0, 15.066519173319364 ], "wc_main_review_avg": [ 518.75, 119.19810191441809 ], "wc_review_avg": [ 627.25, 128.34402011780682 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 703.25, 170.6522414151071 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13854586391329345385&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "E8tsHT1YG0", "title": "Ridgeless Interpolation with Shallow ReLU Networks in $1D$ is Nearest Neighbor Curvature Extrapolation and Provably Generalizes on Lipschitz Functions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We prove a precise geometric description of all one layer ReLU networks $z(x;\\theta)$ with a single linear unit and input/output dimensions equal to one that interpolate a given dataset $\\mathcal D=\\{(x_i,f(x_i))\\}$ and, among all such interpolants, minimize the $\\ell_2$-norm of the neuron weights. 
Such networks can intuitively be thought of as those that minimize the mean-squared error over $\\mathcal D$ plus an infinitesimal weight decay penalty. We therefore refer to them as ridgeless ReLU interpolants. Our description proves that, to extrapolate values $z(x;\\theta)$ for inputs $x\\in (x_i,x_{i+1})$ lying between two consecutive datapoints, a ridgeless ReLU interpolant simply compares the signs of the discrete estimates for the curvature of $f$ at $x_i$ and $x_{i+1}$ derived from the dataset $\\mathcal D$. If the curvature estimates at $x_i$ and $x_{i+1}$ have different signs, then $z(x;\\theta)$ must be linear on $(x_i,x_{i+1})$. If in contrast the curvature estimates at $x_i$ and $x_{i+1}$ are both positive (resp. negative), then $z(x;\\theta)$ is convex (resp. concave) on $(x_i,x_{i+1})$. Our results show that ridgeless ReLU interpolants achieve the best possible generalization for learning $1d$ Lipschitz functions, up to universal constants. \n", "keywords": "theory;deep learning theory;implicit bias;generalization;interpolation;ridgeless regression", "primary_area": "", "supplementary_material": "/attachment/d7c21c52c70d7a7229dd46f29289eda8fc44e2e8.zip", "author": "Boris Hanin", "authorids": "~Boris_Hanin1", "gender": "", "homepage": "https://hanin.princeton.edu", "dblp": "205/2534", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Boris_Hanin1", "aff": "Princeton University", "aff_domain": "princeton.edu", "position": "Assistant Professor", "bibtex": "@misc{\nhanin2022ridgeless,\ntitle={Ridgeless Interpolation with Shallow Re{LU} Networks in \\$1D\\$ is Nearest Neighbor Curvature Extrapolation and Provably Generalizes on Lipschitz Functions},\nauthor={Boris Hanin},\nyear={2022},\nurl={https://openreview.net/forum?id=E8tsHT1YG0}\n}", "github": "", "project": "", "reviewers": "48JK;2ffv;ndeA;7Rcw", "site": "https://openreview.net/forum?id=E8tsHT1YG0", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;3;4", "correctness": "2;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;1;0", "wc_summary_paper": "121;43;94;158", "wc_summary_review": "27;21;9;138", "wc_main_review": "265;274;281;608", "wc_review": "413;338;384;904", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 104.0, 41.91061917939175 ], "wc_summary_review_avg": [ 48.75, 51.934453881792194 ], "wc_main_review_avg": [ 357.0, 145.02585976300915 ], "wc_review_avg": [ 509.75, 229.18592343335575 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15801000142754527386&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "E9UL8UJvysW", "title": "Structured Federated 
Aggregation for Personalizing On-device Intelligence", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Personalizing on-device intelligence while preserving privacy is an emerging requirement for the Mobile Internet and many other service areas. Recent developments in federated learning embody personalization by tackling statistical heterogeneity across devices. However, these methods ignore the structural information between clients, which can indicate a similar behavior pattern or decision logic among clients that are connected to each other in a graph. For example, the traffic condition in one block is very similar to that in its adjacent blocks. Motivated by this assumption, we propose structured federated learning (SFL) to update each device's personalized model by leveraging its neighbors' local models. We formulate this as a new optimization problem that integrates the prediction loss, federated aggregation, and structured aggregation into a unified framework. Moreover, it can be further enhanced by adding a structure learning component that learns the relation graph within the same optimization framework. The effectiveness of the proposed method is demonstrated experimentally by comparing it with other baselines on public datasets.", "keywords": "Federated Learning;Structure Aggregation;Personalisation;Graph Neural Network", "primary_area": "", "supplementary_material": "", "author": "Fengwen Chen;Guodong Long;Tianyi Zhou;Zonghan Wu;Jing Jiang", "authorids": "~Fengwen_Chen1;~Guodong_Long2;~Tianyi_Zhou1;zonghan.wu-3@student.uts.edu.au;~Jing_Jiang6", "gender": "M;M;M;;F", "homepage": "https://github.com/dawenzi123/DAGCN;https://www.uts.edu.au/staff/guodong.long;https://tianyizhou.github.io/;;https://www.uts.edu.au/staff/jing.jiang", "dblp": ";34/10089;88/8205-1;;68/1974-2", "google_scholar": ";https://scholar.google.com.au/citations?user=Pl8m7hMAAAAJ;OKvgizMAAAAJ;;https://scholar.google.com.au/citations?hl=en", "orcid": ";0000-0003-3740-9515;0000-0001-5348-0632;;", "linkedin": ";;tianyizhou;;", "or_profile": "~Fengwen_Chen1;~Guodong_Long2;~Tianyi_Zhou1;zonghan.wu-3@student.uts.edu.au;~Jing_Jiang6", "aff": "University of Technology Sydney;University of Technology Sydney;University of Washington, Seattle;;University of Technology Sydney", "aff_domain": "uts.edu.au;uts.edu.au;uw.edu;;uts.edu.au", "position": "PhD student;Associate Professor;PhD student;;Lecturer", "bibtex": "@misc{\nchen2022structured,\ntitle={Structured Federated Aggregation for Personalizing On-device Intelligence},\nauthor={Fengwen Chen and Guodong Long and Tianyi Zhou and Zonghan Wu and Jing Jiang},\nyear={2022},\nurl={https://openreview.net/forum?id=E9UL8UJvysW}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=E9UL8UJvysW", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], 
"replies_avg": [ 1, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NnCNQSWbvP0J:scholar.google.com/&scioq=Structured+Federated+Aggregation+for+Personalizing+On-device+Intelligence&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Technology Sydney;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.uts.edu.au;https://www.washington.edu", "aff_unique_abbr": "UTS;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Australia;United States" }, { "id": "E9e18Ms5TeV", "title": "A Large Batch Optimizer Reality Check: Traditional, Generic Optimizers Suffice Across Batch Sizes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently the LARS and LAMB optimizers have been proposed for training neural networks faster using large batch sizes. LARS and LAMB add layer-wise normalization to the update rules of Heavy-ball momentum and Adam, respectively, and have become popular in prominent benchmarks and deep learning libraries. However, without fair comparisons to standard optimizers, it remains an open question whether LARS and LAMB have any benefit over traditional, generic algorithms. In this work we demonstrate that standard optimization algorithms such as Nesterov momentum and Adam can match or exceed the results of LARS and LAMB at large batch sizes. Our results establish new, stronger baselines for future comparisons at these batch sizes and shed light on the difficulties of comparing optimizers for neural network training more generally.", "keywords": "neural networks;deep learning;neural network optimization;hyperparameter tuning;optimizer comparison", "primary_area": "", "supplementary_material": "/attachment/9efbc4dfb52f90b0e576bbe534b18669dc099fb8.zip", "author": "Zachary Nado;Justin Gilmer;Christopher J Shallue;Rohan Anil;George Edward Dahl", "authorids": "~Zachary_Nado1;~Justin_Gilmer1;~Christopher_J_Shallue1;~Rohan_Anil1;~George_Edward_Dahl1", "gender": "M;M;M;M;M", "homepage": "http://zna.do;;;;https://www.cs.toronto.edu/~gdahl", "dblp": "228/7785;;;182/1833;10/7998", "google_scholar": "tazGc34AAAAJ;Ml_vQ8MAAAAJ;pWtqAaIAAAAJ;;ghbWy-0AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Zachary_Nado1;~Justin_Gilmer1;~Christopher_J_Shallue1;~Rohan_Anil1;~George_Edward_Dahl1", "aff": "Google;Google Brain;Harvard University;Google Brain ;Google", "aff_domain": "google.com;google.com;harvard.edu;google.com;google.com", "position": "Research Engineer;Researcher;PhD student;Principal Engineer;Research Scientist", "bibtex": "@misc{\nnado2022a,\ntitle={A Large Batch Optimizer Reality Check: Traditional, Generic Optimizers Suffice Across Batch Sizes},\nauthor={Zachary Nado and Justin Gilmer and Christopher J Shallue and Rohan Anil and George Edward Dahl},\nyear={2022},\nurl={https://openreview.net/forum?id=E9e18Ms5TeV}\n}", "github": "", "project": "", "reviewers": "uA2C;F1kk;9Uka;Fjz9", "site": "https://openreview.net/forum?id=E9e18Ms5TeV", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;1;2;1", "empirical_novelty": "0;2;3;2", "wc_summary_paper": "88;50;233;127", "wc_summary_review": "120;95;53;50", "wc_main_review": "580;324;936;212", "wc_review": "788;469;1222;389", "wc_reply_reviewers": 
"127;0;691;0", "wc_reply_authors": "908;0;756;0", "reply_reviewers": "2;0;1;0", "reply_authors": "2;0;1;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 124.5, 68.30263538107442 ], "wc_summary_review_avg": [ 79.5, 29.381116384507923 ], "wc_main_review_avg": [ 513.0, 278.2714502064486 ], "wc_review_avg": [ 717.0, 327.54923294063747 ], "wc_reply_reviewers_avg": [ 204.5, 285.6260667376141 ], "wc_reply_authors_avg": [ 416.0, 419.4567915769156 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 0.75, 0.82915619758885 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13555844602460749161&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Google;Harvard University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.harvard.edu", "aff_unique_abbr": "Google;Harvard", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "E9z2A1-O7e", "title": "HyperTransformer: Attention-Based CNN Model Generation from Few Samples", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work we propose a HyperTransformer, a transformer based model that generates all weights of a CNN model directly from the support samples. This approach allows to use a high-capacity model for encoding task-dependent variations in the weights of a smaller model. We show for multiple few-shot benchmarks with different architectures and datasets that our method beats or matches that of the traditional learning methods in a few-shot regime. Specifically, we show that for very small target models, our method can generate significantly better performing models than traditional few-shot learning methods. For larger models we discover that applying generation to the last layer only, allows to produce competitive or better results while being end-to-end differentiable. 
Finally, we extend our approach to the semi-supervised regime, utilizing unlabeled samples in the support set to further improve few-shot performance in the presence of unlabeled data.", "keywords": "few-shot learning;transformer model;weight generation;supervised learning;semi-supervised learning", "primary_area": "", "supplementary_material": "/attachment/6683efaacfbeb940c1d56273a50b3a82fad797f0.zip", "author": "Andrey Zhmoginov;Max Vladymyrov;Mark Sandler", "authorids": "~Andrey_Zhmoginov1;~Max_Vladymyrov1;~Mark_Sandler1", "gender": "M;M;M", "homepage": ";https://max-vladymyrov.github.io/;", "dblp": "182/1825;116/3059;s/MarkSandler", "google_scholar": "jj6IfzEAAAAJ;pQZCrqcAAAAJ;IcPc-OUAAAAJ", "orcid": ";;0000-0003-0352-6051", "linkedin": ";max-vladymyrov-5803b711/;", "or_profile": "~Andrey_Zhmoginov1;~Max_Vladymyrov1;~Mark_Sandler1", "aff": "Google DeepMind;Google Research;Google", "aff_domain": "google.com;google.com;google.com", "position": "Researcher;Research Scientist;Research Scientist", "bibtex": "@misc{\nzhmoginov2022hypertransformer,\ntitle={HyperTransformer: Attention-Based {CNN} Model Generation from Few Samples},\nauthor={Andrey Zhmoginov and Max Vladymyrov and Mark Sandler},\nyear={2022},\nurl={https://openreview.net/forum?id=E9z2A1-O7e}\n}", "github": "", "project": "", "reviewers": "As45;1KF1;2iqR", "site": "https://openreview.net/forum?id=E9z2A1-O7e", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;4;4", "correctness": "2;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "46;42;66", "wc_summary_review": "18;68;45", "wc_main_review": "202;143;288", "wc_review": "266;253;399", "wc_reply_reviewers": "144;153;0", "wc_reply_authors": "826;1480;289", "reply_reviewers": "1;1;0", "reply_authors": "2;3;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 51.333333333333336, 10.498677165349081 ], "wc_summary_review_avg": [ 43.666666666666664, 20.434176165325468 ], "wc_main_review_avg": [ 211.0, 59.53710327742412 ], "wc_review_avg": [ 306.0, 65.9747426419131 ], "wc_reply_reviewers_avg": [ 99.0, 70.09992867328754 ], "wc_reply_authors_avg": [ 865.0, 487.00513344317017 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XZFyNEXP7EMJ:scholar.google.com/&scioq=HyperTransformer:+Attention-Based+CNN+Model+Generation+from+Few+Samples&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Increasing the Cost of Model Extraction with Calibrated Proof of Work", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6513", "id": "EAy7C1cgE1L", "poster": "", "openreview": "https://openreview.net/forum?id=EAy7C1cgE1L", "slides": 
"https://iclr.cc/virtual/2022/poster/6513", "video": "https://iclr.cc/virtual/2022/poster/6513", "author_site": "Adam Dziedzic, Muhammad Ahmad Kaleem, Yu Shen Lu, Nicolas Papernot", "tldr": "", "abstract": "In model extraction attacks, adversaries can steal a machine learning model exposed via a public API by repeatedly querying it and adjusting their own model based on obtained predictions. To prevent model stealing, existing defenses focus on detecting malicious queries, truncating, or distorting outputs, thus necessarily introducing a tradeoff between robustness and model utility for legitimate users. Instead, we propose to impede model extraction by requiring users to complete a proof-of-work before they can read the model's predictions. This deters attackers by greatly increasing (even up to 100x) the computational effort needed to leverage query access for model extraction. Since we calibrate the effort required to complete the proof-of-work to each query, this only introduces a slight overhead for regular users (up to 2x). To achieve this, our calibration applies tools from differential privacy to measure the information revealed by a query. Our method requires no modification of the victim model and can be applied by machine learning practitioners to guard their publicly exposed models against being easily stolen.", "keywords": "model extraction;model stealing;model functionality stealing;proof-of-work;adversarial machine learning;trustworthy machine learning;deep learning", "primary_area": "", "supplementary_material": "/attachment/32bc06f0c03c140a3e8513c39087e341066fd727.zip", "author": "Adam Dziedzic;Muhammad Ahmad Kaleem;Yu Shen Lu;Nicolas Papernot", "authorids": "~Adam_Dziedzic1;~Muhammad_Ahmad_Kaleem1;~Yu_Shen_Lu1;~Nicolas_Papernot1", "gender": ";;;M", "homepage": ";;;https://www.papernot.fr", "dblp": ";;;162/1405", "google_scholar": ";;;cGxq0cMAAAAJ", "orcid": ";;;", "linkedin": ";;lucy-lu-b68344135/;nicolaspapernot", "or_profile": "~Adam_Dziedzic1;~Muhammad_Ahmad_Kaleem1;~Yu_Shen_Lu1;~Nicolas_Papernot1", "aff": ";;;Google", "aff_domain": ";;;google.com", "position": ";;;Research Scientist", "bibtex": "@inproceedings{\ndziedzic2022increasing,\ntitle={Increasing the Cost of Model Extraction with Calibrated Proof of Work},\nauthor={Adam Dziedzic and Muhammad Ahmad Kaleem and Yu Shen Lu and Nicolas Papernot},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EAy7C1cgE1L}\n}", "github": "", "project": "", "reviewers": "LLkZ;emnb;hR7i;aHDe", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "80;49;101;124", "wc_summary_review": "201;3;11;21", "wc_main_review": "167;459;251;89", "wc_review": "448;511;363;234", "wc_reply_reviewers": "0;0;74;0", "wc_reply_authors": "0;1203;1313;589", "reply_reviewers": "0;0;2;0", "reply_authors": "0;3;6;2", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 88.5, 27.608875384557045 ], "wc_summary_review_avg": [ 59.0, 82.23138087129512 ], "wc_main_review_avg": [ 241.5, 138.0244543550164 ], "wc_review_avg": [ 389.0, 103.76174632300673 ], "wc_reply_reviewers_avg": [ 18.5, 32.04293994002423 ], "wc_reply_authors_avg": [ 776.25, 526.2705459172116 ], 
"reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.75, 2.165063509461097 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.49374193110101877, "corr_recommendation_correctness": 0.49374193110101877, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12827839340462888738&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=EAy7C1cgE1L", "email": ";;;google.com", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Model-Based Offline Meta-Reinforcement Learning with Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6873", "id": "EBn0uInJZWh", "poster": "", "openreview": "https://openreview.net/forum?id=EBn0uInJZWh", "slides": "https://iclr.cc/virtual/2022/poster/6873", "video": "https://iclr.cc/virtual/2022/poster/6873", "author_site": "Sen Lin, Jialin Wan, Tengyu Xu, Yingbin Liang, Junshan Zhang", "tldr": "", "abstract": "Existing offline reinforcement learning (RL) methods face a few major challenges, particularly the distributional shift between the learned policy and the behavior policy. Offline Meta-RL is emerging as a promising approach to address these challenges, aiming to learn an informative meta-policy from a collection of tasks. Nevertheless, as shown in our empirical studies, offline Meta-RL could be outperformed by offline single-task RL methods on tasks with good quality of datasets, indicating that a right balance has to be delicately calibrated between \"exploring\" the out-of-distribution state-actions by following the meta-policy and \"exploiting\" the offline dataset by staying close to the behavior policy. Motivated by such empirical analysis, we propose model-based offline $\\text{\\bf Me}$ta-RL with $\\text{\\bf r}$egularized $\\text{\\bf P}$olicy $\\text{\\bf O}$ptimization (MerPO), which learns a meta-model for efficient task structure inference and an informative meta-policy for safe exploration of out-of-distribution state-actions. In particular, we devise a new meta-Regularized model-based Actor-Critic (RAC) method for within-task policy optimization, as a key building block of MerPO, using both conservative policy evaluation and regularized policy improvement; and the intrinsic tradeoff therein is achieved via striking the right balance between two regularizers, one based on the behavior policy and the other on the meta-policy. We theoretically show that the learnt policy offers guaranteed improvement over both the behavior policy and the meta-policy, thus ensuring the performance improvement on new tasks via offline Meta-RL. 
Our experiments corroborate the superior performance of MerPO over existing offline Meta-RL methods.", "keywords": "offline reinforcement learning;model-based reinforcement learning;behavior policy;Meta-reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/efdfca464a58c1372bad2071a5e5ace74ee46795.zip", "author": "Sen Lin;Jialin Wan;Tengyu Xu;Yingbin Liang;Junshan Zhang", "authorids": "~Sen_Lin1;jwan20@asu.edu;~Tengyu_Xu1;~Yingbin_Liang1;~Junshan_Zhang1", "gender": ";;;F;M", "homepage": "https://slin70.github.io/;;;https://sites.google.com/view/yingbinliang/home;https://faculty.engineering.ucdavis.edu/jzhang/", "dblp": "70/9499-1.html;;;51/332;59/1232.html", "google_scholar": "94-TbUsAAAAJ;;;lGgLAiIAAAAJ;UtAdFs8AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Sen_Lin1;jwan20@asu.edu;~Tengyu_Xu1;~Yingbin_Liang1;~Junshan_Zhang1", "aff": "Arizona State University;;;The Ohio State University;University of California, Davis", "aff_domain": "asu.edu;;;osu.edu;ucdavis.edu", "position": "Postdoc;;;Professor;Full Professor", "bibtex": "@inproceedings{\nlin2022modelbased,\ntitle={Model-Based Offline Meta-Reinforcement Learning with Regularization},\nauthor={Sen Lin and Jialin Wan and Tengyu Xu and Yingbin Liang and Junshan Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EBn0uInJZWh}\n}", "github": "", "project": "", "reviewers": "xD1s;i86B;jFYQ;zWVS", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;4;4", "correctness": "2;4;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "163;101;140;131", "wc_summary_review": "82;50;28;32", "wc_main_review": "668;432;150;405", "wc_review": "913;583;318;568", "wc_reply_reviewers": "584;0;22;29", "wc_reply_authors": "2538;139;653;933", "reply_reviewers": "2;0;1;1", "reply_authors": "5;1;3;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 133.75, 22.219079638904937 ], "wc_summary_review_avg": [ 48.0, 21.307275752662516 ], "wc_main_review_avg": [ 413.75, 183.4507767767692 ], "wc_review_avg": [ 595.5, 211.3793982392797 ], "wc_reply_reviewers_avg": [ 158.75, 245.75127161420752 ], "wc_reply_authors_avg": [ 1065.75, 896.4333145861995 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11910287316055960041&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=EBn0uInJZWh", "email": "asu.edu;;;osu.edu;ucdavis.edu", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Arizona State University;Ohio State University;University of California, Davis", "aff_unique_dep": ";;", "aff_unique_url": "https://www.asu.edu;https://www.osu.edu;https://www.ucdavis.edu", "aff_unique_abbr": "ASU;OSU;UC Davis", "aff_campus_unique_index": "1", "aff_campus_unique": ";Davis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "ECC7T-torK", "title": "Early Stop And Adversarial Training Yield Better surrogate Model: Very Non-Robust Features Harm 
Adversarial Transferability", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The transferability of adversarial examples (AE), known as adversarial transferability, has attracted significant attention because it can be exploited for Transferable Black-box Attacks (TBA). Most lines of work posit that the existence of non-robust features improves adversarial transferability. As a motivating example, we test adversarial transferability on early stopped surrogate models, which are known from prior works to concentrate more on robust features than on non-robust features. We find that the early stopped models yield better adversarial transferability than the models at the final epoch, which lacks an intuitive interpretation from the perspective of robust and non-robust features (NRFs). In this work, we articulate a novel Very Non-Robust Feature (VNRF) hypothesis, namely that the learned VNRFs can harm adversarial transferability, to explain this phenomenon. This hypothesis is partly verified by zeroing some filters with high $l_1$ norm values. This insight further motivates us to adopt light adversarial training, which mainly removes the VNRFs, to significantly improve transferability.", "keywords": "Adversarial Transferability;Early stop;Adversarial Training;Non-robust Features", "primary_area": "", "supplementary_material": "/attachment/31a57a727fccaa247d39e916a43323d5d2dafa9b.zip", "author": "Chaoning Zhang;Gyusang Cho;Philipp Benz;Kang Zhang;Chenshuang Zhang;Chan-Hyun Youn;In So Kweon", "authorids": "~Chaoning_Zhang1;~Gyusang_Cho1;~Philipp_Benz1;~Kang_Zhang6;~Chenshuang_Zhang2;~Chan-Hyun_Youn1;~In_So_Kweon2", "gender": "M;M;M;M;F;M;M", "homepage": ";http://ncl.kaist.ac.kr;https://phibenz.github.io;;https://chenshuang-zhang.github.io/;http://ncl.kaist.ac.kr;https://ee.kaist.ac.kr/en/professor-s2/2/", "dblp": ";15/4738;220/8644;29/177-8;165/5102.html;31/5293;74/4917.html", "google_scholar": "https://scholar.google.co.kr/citations?user=lvhxhyQAAAAJ;Fjd05KwAAAAJ;hRqVJWMAAAAJ;nj19btQAAAAJ;HbqjLHYAAAAJ;https://scholar.google.co.kr/scholar?q=chan-hyun+youn;XA8EOlEAAAAJ", "orcid": ";;;0000-0003-2761-9383;;0000-0002-3970-7308;", "linkedin": ";;;;;;", "or_profile": "~Chaoning_Zhang1;~Gyusang_Cho1;~Philipp_Benz1;~Kang_Zhang6;~Chenshuang_Zhang2;~Chan-Hyun_Youn1;~In-So_Kweon1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Deeping Source;Korea Advanced Institute of Science & Technology;Kyung Hee University;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;deepingsource.io;kaist.ac.kr;khu.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "Postdoc;PhD student;Researcher;PhD student;Researcher;Full Professor;Emeritus", "bibtex": "@misc{\nzhang2022early,\ntitle={Early Stop And Adversarial Training Yield Better surrogate Model: Very Non-Robust Features Harm Adversarial Transferability},\nauthor={Chaoning Zhang and Gyusang Cho and Philipp Benz and Kang Zhang and Chenshuang Zhang and Chan-Hyun Youn and In So Kweon},\nyear={2022},\nurl={https://openreview.net/forum?id=ECC7T-torK}\n}", "github": "", "project": "", "reviewers": "QwmD;yMtx;kXLc;JXac", "site": "https://openreview.net/forum?id=ECC7T-torK", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;4", "correctness": "3;2;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "37;63;43;38", "wc_summary_review": "20;18;15;33", 
"wc_main_review": "178;277;185;85", "wc_review": "235;358;243;156", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 45.25, 10.497023387608508 ], "wc_summary_review_avg": [ 21.5, 6.87386354243376 ], "wc_main_review_avg": [ 181.25, 67.92781094662186 ], "wc_review_avg": [ 248.0, 72.03818431915118 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18416798534873206291&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;2;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Deeping Source;Kyung Hee University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;;http://www.khu.ac.kr", "aff_unique_abbr": "KAIST;;KHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea;" }, { "title": "Chaos is a Ladder: A New Theoretical Understanding of Contrastive Learning via Augmentation Overlap", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6659", "id": "ECvgmYVyeUz", "poster": "", "openreview": "https://openreview.net/forum?id=ECvgmYVyeUz", "slides": "https://iclr.cc/virtual/2022/poster/6659", "video": "https://iclr.cc/virtual/2022/poster/6659", "author_site": "Yifei Wang, Qi Zhang, Yisen Wang, Jiansheng Yang, Zhouchen Lin", "tldr": "", "abstract": "Recently, contrastive learning has risen to be a promising approach for large-scale self-supervised learning. However, theoretical understanding of how it works is still unclear. In this paper, we propose a new guarantee on the downstream performance without resorting to the conditional independence assumption that is widely adopted in previous work but hardly holds in practice. Our new theory hinges on the insight that the support of different intra-class samples will become more overlapped under aggressive data augmentations, thus simply aligning the positive samples (augmented views of the same sample) could make contrastive learning cluster intra-class samples together. Based on this augmentation overlap perspective, theoretically, we obtain asymptotically closed bounds for downstream performance under weaker assumptions, and empirically, we propose an unsupervised model selection metric ARC that aligns well with downstream accuracy. Our theory suggests an alternative understanding of contrastive learning: the role of aligning positive samples is more like a surrogate task than an ultimate goal, and the overlapped augmented views (i.e., the chaos) create a ladder for contrastive learning to gradually learn class-separated representations. 
The code for computing ARC is available at https://github.com/zhangq327/ARC.", "keywords": "Contrastive Learning;Representation Learning;Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Yifei Wang;Qi Zhang;Yisen Wang;Jiansheng Yang;Zhouchen Lin", "authorids": "~Yifei_Wang1;zhangq327@mail2.sysu.edu.cn;~Yisen_Wang1;yjs@math.pku.edu.cn;~Zhouchen_Lin1", "gender": "M;;M;;M", "homepage": "https://yifeiwang77.com;;https://yisenwang.github.io/;;https://zhouchenlin.github.io", "dblp": "00/555-1;;172/1346-1;;l/ZhouchenLin", "google_scholar": "-CLy6YsAAAAJ;;uMWPDboAAAAJ;;https://scholar.google.com.tw/citations?user=TanjFwoAAAAJ", "orcid": ";;;;0000-0003-1493-7569", "linkedin": ";;;;", "or_profile": "~Yifei_Wang1;zhangq327@mail2.sysu.edu.cn;~Yisen_Wang1;yjs@math.pku.edu.cn;~Zhouchen_Lin1", "aff": "Peking University;;Peking University;;Peking University", "aff_domain": "pku.edu.cn;;pku.edu.cn;;pku.edu.cn", "position": "PhD student;;Assistant Professor;;Professor", "bibtex": "@inproceedings{\nwang2022chaos,\ntitle={Chaos is a Ladder: A New Theoretical Understanding of Contrastive Learning via Augmentation Overlap},\nauthor={Yifei Wang and Qi Zhang and Yisen Wang and Jiansheng Yang and Zhouchen Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ECvgmYVyeUz}\n}", "github": "", "project": "", "reviewers": "bTLa;tWSB;Pr6A;qd5j", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "75;379;123;281", "wc_summary_review": "71;97;73;60", "wc_main_review": "374;1145;212;343", "wc_review": "520;1621;408;684", "wc_reply_reviewers": "0;773;179;0", "wc_reply_authors": "1731;3752;610;792", "reply_reviewers": "0;1;1;0", "reply_authors": "4;8;2;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 214.5, 121.77335504945242 ], "wc_summary_review_avg": [ 75.25, 13.497684986693088 ], "wc_main_review_avg": [ 518.5, 366.78501877803023 ], "wc_review_avg": [ 808.25, 479.3977341414955 ], "wc_reply_reviewers_avg": [ 238.0, 317.4090420892259 ], "wc_reply_authors_avg": [ 1721.25, 1247.2392262513235 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 4.0, 2.449489742783178 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7197581293948710911&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ECvgmYVyeUz", "email": "pku.edu.cn;;pku.edu.cn;;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "ECzghp9oujq", "title": "Look at here : Utilizing supervision to attend subtle key regions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite the success of deep learning in computer vision, recognizing subtle and small objects (or regions) is still challenging. 
For example, recognizing a baseball or a frisbee in a ground scene, or a bone fracture in an X-ray image, can easily result in overfitting unless a huge amount of training data is available. To mitigate this problem, we need a way to force a model to identify subtle regions from limited training data. In this paper, we propose a simple but efficient supervised augmentation method called Cut&Remain. It achieved better performance on multi-class classification tasks (clavicle and pelvic X-ray) and a multi-label classification task of small objects (MS-COCO$_s$) than other supervised augmentation and explicit guidance methods. In addition, using class activation maps, we identified that Cut&Remain drives a model to focus efficiently on relevant subtle and small regions. We also show that performance increases monotonically with the Cut&Remain ratio, indicating that a model can be improved even when Cut&Remain is applied to only a limited amount of data, so that only a low supervision (annotation) cost is needed for improvement.", "keywords": "Medical image diagnosis;Deep learning;Regularization strategy;Data augmentation", "primary_area": "", "supplementary_material": "/attachment/4342892a89d4de11ef882425d34fec6da6a38cea.zip", "author": "Changhwan Lee;Yeesuk Kim;Bong Gun Lee;Doosup Kim;Jongseong Jang", "authorids": "~Changhwan_Lee1;estone96@hanyang.ac.kr;bglee@hanyang.ac.kr;dskim1974@yonsei.ac.kr;~Jongseong_Jang1", "gender": ";;;;M", "homepage": ";;;;https://sites.google.com/view/jongseong-jang", "dblp": ";;;;140/4388", "google_scholar": "PGdLezIAAAAJ;;;;https://scholar.google.co.kr/citations?user=-DJPQqgAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Changhwan_Lee1;estone96@hanyang.ac.kr;bglee@hanyang.ac.kr;dskim1974@yonsei.ac.kr;~Jongseong_Jang1", "aff": ";;;;LG AI Research", "aff_domain": ";;;;lgresearch.ai", "position": ";;;;Researcher", "bibtex": "@misc{\nlee2022look,\ntitle={Look at here : Utilizing supervision to attend subtle key regions},\nauthor={Changhwan Lee and Yeesuk Kim and Bong Gun Lee and Doosup Kim and Jongseong Jang},\nyear={2022},\nurl={https://openreview.net/forum?id=ECzghp9oujq}\n}", "github": "", "project": "", "reviewers": "caSD;virW;xF8L", "site": "https://openreview.net/forum?id=ECzghp9oujq", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;4;4", "correctness": "2;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "56;30;88", "wc_summary_review": "44;70;74", "wc_main_review": "426;171;584", "wc_review": "526;271;746", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 58.0, 23.72059583287626 ], "wc_summary_review_avg": [ 62.666666666666664, 13.299958228840001 ], "wc_main_review_avg": [ 393.6666666666667, 170.14960737212675 ], "wc_review_avg": [ 514.3333333333334, 194.09333379130314 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=344793948487424620&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "LG", "aff_unique_dep": "LG AI Research", "aff_unique_url": "https://www.lgaires.com", "aff_unique_abbr": "LG AI", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "Deconstructing the Inductive Biases of Hamiltonian Neural Networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6095", "id": "EDeVYpT42oS", "poster": "", "openreview": "https://openreview.net/forum?id=EDeVYpT42oS", "slides": "https://iclr.cc/virtual/2022/poster/6095", "video": "https://iclr.cc/virtual/2022/poster/6095", "author_site": "Nate Gruver, Marc A Finzi, Samuel Stanton, Andrew Wilson", "tldr": "", "abstract": "Physics-inspired neural networks (NNs), such as Hamiltonian or Lagrangian NNs, dramatically outperform other learned dynamics models by leveraging strong inductive biases. These models, however, are challenging to apply to many real world systems, such as those that don\u2019t conserve energy or contain contacts, a common setting for robotics and reinforcement learning. In this paper, we examine the inductive biases that make physics-inspired models successful in practice. We show that, contrary to conventional wisdom, the improved generalization of HNNs is the result of modeling acceleration directly and avoiding artificial complexity from the coordinate system, rather than symplectic structure or energy conservation. We show that by relaxing the inductive biases of these models, we can match or exceed performance on energy-conserving systems while dramatically improving performance on practical, non-conservative systems. We extend this approach to constructing transition models for common Mujoco environments, showing that our model can appropriately balance inductive biases with the flexibility required for model-based control. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nate Gruver;Marc Anton Finzi;Samuel Don Stanton;Andrew Gordon Wilson", "authorids": "~Nate_Gruver1;~Marc_Anton_Finzi1;~Samuel_Don_Stanton1;~Andrew_Gordon_Wilson1", "gender": "M;M;M;Not Specified", "homepage": "https://ngruver.github.io/;https://mfinzi.github.io;https://samuelstanton.github.io/;https://cims.nyu.edu/~andrewgw", "dblp": "223/5568;222/3062;264/1895;65/10453", "google_scholar": "R5QNdhcAAAAJ;ysMAhlwAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ", "orcid": ";;;", "linkedin": ";;samuel-stanton-06004997/;", "or_profile": "~Nate_Gruver1;~Marc_Anton_Finzi1;~Samuel_Don_Stanton1;~Andrew_Gordon_Wilson1", "aff": "New York University;New York University;New York University;New York University", "aff_domain": "nyu.edu;nyu.edu;nyu.edu;nyu.edu", "position": "PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\ngruver2022deconstructing,\ntitle={Deconstructing the Inductive Biases of Hamiltonian Neural Networks},\nauthor={Nate Gruver and Marc Anton Finzi and Samuel Don Stanton and Andrew Gordon Wilson},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EDeVYpT42oS}\n}", "github": "", "project": "", "reviewers": "wu5x;nLbj;7KKB;SW9u", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;4;3;3", "empirical_novelty": "3;4;4;3", "wc_summary_paper": "115;103;123;158", "wc_summary_review": "90;16;91;14", "wc_main_review": "438;522;429;159", "wc_review": "643;641;643;331", "wc_reply_reviewers": "24;54;132;5", "wc_reply_authors": "424;527;354;86", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 124.75, 20.474068965401088 ], "wc_summary_review_avg": [ 52.75, 37.75827723824274 ], "wc_main_review_avg": [ 387.0, 136.5412025727033 ], "wc_review_avg": [ 564.5, 134.81376042526222 ], "wc_reply_reviewers_avg": [ 53.75, 48.43745967740257 ], "wc_reply_authors_avg": [ 347.75, 163.1691989929472 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=301233728507989887&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=EDeVYpT42oS", "email": "nyu.edu;nyu.edu;nyu.edu;nyu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "EFSctTwY4xn", "title": "Towards Generalizable Personalized Federated Learning with Adaptive Local Adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Personalized federated learning aims to find a shared global model that can be adapted to meet personal needs on each individual device. 
Starting from such a shared initial model, devices should be able to easily adapt to their local dataset to obtain personalized models. However, we find that existing works cannot generalize well on non-IID scenarios with different degrees of heterogeneity in the underlying data distribution among devices. Thus, it is challenging for these methods to train a suitable global model to effectively induce high-quality personalized models without changing learning objectives. In this paper, we point out that this issue can be addressed by balancing information flow from the initial model and training dataset to the local adaptation. We then prove a theorem referred to as the {\\em adaptive trade-off theorem}, showing that adaptive local adaptation is equivalent to optimizing such information flow, based on information theory. With these theoretical insights, we propose a new framework called {\\em adaptive federated meta-learning} (AFML), designed to achieve generalizable personalized federated learning that maintains solid performance under non-IID data scenarios with different degrees of diversity among devices. We test AFML in an extensive set of these non-IID data scenarios, with both CIFAR-100 and Shakespeare datasets. Experimental results demonstrate that AFML can maintain the highest personalized accuracy compared to alternative leading frameworks, yet with a minimal number of communication rounds and local updates needed.", "keywords": "Personalized federated learning;Meta-learning;Information theory", "primary_area": "", "supplementary_material": "/attachment/3ebe54c01a7dc765b178b8a82fd64bd0a25c3f9c.zip", "author": "Sijia Chen;Baochun Li", "authorids": "~Sijia_Chen2;~Baochun_Li1", "gender": "M;M", "homepage": "https://csjdeveloper.github.io/sjiachen.github.io/;http://iqua.ece.toronto.edu/bli/", "dblp": "241/8721;l/BaochunLi", "google_scholar": "https://scholar.google.ca/citations?user=QWGJWDMAAAAJ;https://scholar.google.com.tw/citations?user=rkb3_FgAAAAJ", "orcid": ";0000-0003-2404-0974", "linkedin": ";https://linkedin.com/in/baochun", "or_profile": "~Sijia_Chen2;~Baochun_Li1", "aff": "Toronto University;University of Toronto", "aff_domain": "utoronto.ca;toronto.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nchen2022towards,\ntitle={Towards Generalizable Personalized Federated Learning with Adaptive Local Adaptation},\nauthor={Sijia Chen and Baochun Li},\nyear={2022},\nurl={https://openreview.net/forum?id=EFSctTwY4xn}\n}", "github": "", "project": "", "reviewers": "qKrd;Mvwu;uetq", "site": "https://openreview.net/forum?id=EFSctTwY4xn", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;3;4", "correctness": "3;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "147;78;137", "wc_summary_review": "14;42;76", "wc_main_review": "455;458;207", "wc_review": "616;578;420", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 120.66666666666667, 30.44484995674784 ], "wc_summary_review_avg": [ 44.0, 25.350871122442058 ], "wc_main_review_avg": [ 373.3333333333333, 117.62180447896932 ], "wc_review_avg": [ 538.0, 84.86852577172922 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.18898223650461363, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dkXrSfAr3qwJ:scholar.google.com/&scioq=Towards+Generalizable+Personalized+Federated+Learning+with+Adaptive+Local+Adaptation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "EFgzhSJYIj6", "title": "RL-DARTS: Differentiable Architecture Search for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, Differentiable Architecture Search (DARTS) has become one of the most popular Neural Architecture Search (NAS) methods successfully applied in supervised learning (SL). However, its applications in other domains, in particular for reinforcement learning (RL), has seldom been studied. This is due in part to RL possessing a significantly different optimization paradigm than SL, especially with regards to the notion of replay data, which is continually generated via inference in RL. In this paper, we introduce RL-DARTS, one of the first applications of end-to-end DARTS in RL to search for convolutional cells, applied to the challenging, infinitely procedurally generated Procgen benchmark. We demonstrate that the benefits of DARTS become amplified when applied to RL, namely search efficiency in terms of time and compute, as well as simplicity in integration with complex preexisting RL code via simply replacing the image encoder with a DARTS supernet, compatible with both off-policy and on-policy RL algorithms. At the same time however, we provide one of the first extensive studies of DARTS outside of the standard fixed dataset setting in SL via RL-DARTS. 
We show that throughout training, the supernet gradually learns better cells, leading to alternative architectures which can be highly competitive against manually designed policies, but also verify previous design choices for RL policies.", "keywords": "darts;differentiable;architecture;search;neural;nas;rl;reinforcement;learning;procgen;supernet;softmax;variable;ppo;rainbow;off-policy;on-policy;convolutional;autorl;automated;one-shot;efficient", "primary_area": "", "supplementary_material": "", "author": "Yingjie Miao;Xingyou Song;Daiyi Peng;Summer Yue;John D Co-Reyes;Eugene Brevdo;Aleksandra Faust", "authorids": "~Yingjie_Miao1;~Xingyou_Song1;~Daiyi_Peng1;~Summer_Yue1;~John_D_Co-Reyes1;~Eugene_Brevdo1;~Aleksandra_Faust1", "gender": ";M;M;F;M;M;F", "homepage": ";https://xingyousong.github.io/;http://www.daiyip.org;;;https://ebrevdo.github.io/;http://www.afaust.info", "dblp": "22/10043;211/7623;;;198/1129;34/8758;135/8420", "google_scholar": "ScqM05wAAAAJ;GnpHmO8AAAAJ;_8Egwg8AAAAJ;;;NvMCACEAAAAJ;RK72t68AAAAJ", "orcid": ";;;;;;0000-0002-3268-8685", "linkedin": "yingjiemiao/;xingyou-song-355629a1/;;yutingyue;;;aleksandrafaust", "or_profile": "~Yingjie_Miao1;~Xingyou_Song1;~Daiyi_Peng1;~Summer_Yue1;~John_D_Co-Reyes1;~Eugene_Brevdo1;~Aleksandra_Faust1", "aff": "Google DeepMind;Google DeepMind;;;University of California, Berkeley;Google;Google Brain", "aff_domain": "google.com;google.com;;;berkeley.edu;google.com;google.com", "position": "Software Engineer;Senior Research Scientist;;;PhD student;Researcher;Principal Researcher", "bibtex": "@misc{\nmiao2022rldarts,\ntitle={{RL}-{DARTS}: Differentiable Architecture Search for Reinforcement Learning},\nauthor={Yingjie Miao and Xingyou Song and Daiyi Peng and Summer Yue and John D Co-Reyes and Eugene Brevdo and Aleksandra Faust},\nyear={2022},\nurl={https://openreview.net/forum?id=EFgzhSJYIj6}\n}", "github": "", "project": "", "reviewers": "KUPC;wzZ5;wMrv;mu9V", "site": "https://openreview.net/forum?id=EFgzhSJYIj6", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;5;3", "correctness": "4;3;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "51;125;133;73", "wc_summary_review": "38;48;22;15", "wc_main_review": "326;685;431;172", "wc_review": "415;858;586;260", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1092;1191;659;330", "reply_reviewers": "0;0;0;0", "reply_authors": "3;3;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 95.5, 34.50724561595724 ], "wc_summary_review_avg": [ 30.75, 12.987975207860538 ], "wc_main_review_avg": [ 403.5, 186.81340958293117 ], "wc_review_avg": [ 529.75, 221.83594726734438 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 818.0, 345.5539031757564 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.9428090415820632, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12637769145437028600&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Google;University of California, Berkeley", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.berkeley.edu", 
"aff_unique_abbr": "DeepMind;UC Berkeley", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Berkeley;Mountain View", "aff_country_unique_index": "0;0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "EG5Pgd7-MY", "title": "Privacy Auditing of Machine Learning using Membership Inference Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Membership inference attacks determine if a given data point is used for training a target model. Thus, this attack could be used as an auditing tool to quantify the private information that a model leaks about the individual data points in its training set. In the last five years, a variety of membership inference attacks against machine learning models are proposed, where each attack exploits a slightly different clue. Also, the attacks are designed under different implicit assumptions about the uncertainties that an attacker has to resolve. Thus attack success rates do not precisely capture the information leakage of models about their data, as they also reflect other uncertainties that the attack algorithm has (for example, about data distribution or characteristics of the target model). In this paper, we present a framework that can explain the implicit assumptions and also the simplifications made in the prior work. We also derive new attack algorithms from our framework that can achieve a high AUC score while also highlighting the different factors that affect their performance. Thus, our algorithms can be used as a tool to perform an accurate and informed estimation of privacy risk in machine learning models. We provide a thorough empirical evaluation of our attack strategies on various machine learning tasks trained on benchmark datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiayuan Ye;Aadyaa Maddi;Sasi Kumar Murakonda;Reza Shokri", "authorids": "~Jiayuan_Ye1;~Aadyaa_Maddi1;~Sasi_Kumar_Murakonda1;~Reza_Shokri1", "gender": ";;;", "homepage": ";;;", "dblp": ";;241/9846.html;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";aadyaa-maddi/;;", "or_profile": "~Jiayuan_Ye1;~Aadyaa_Maddi1;~Sasi_Kumar_Murakonda1;~Reza_Shokri1", "aff": ";National University of Singapore;Privitar;", "aff_domain": ";nus.edu.sg;privitar.com;", "position": ";Researcher;Researcher;", "bibtex": "@misc{\nye2022privacy,\ntitle={Privacy Auditing of Machine Learning using Membership Inference Attacks},\nauthor={Jiayuan Ye and Aadyaa Maddi and Sasi Kumar Murakonda and Reza Shokri},\nyear={2022},\nurl={https://openreview.net/forum?id=EG5Pgd7-MY}\n}", "github": "", "project": "", "reviewers": "Sgo6;BGkr;XT1N", "site": "https://openreview.net/forum?id=EG5Pgd7-MY", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "3;3;2", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "100;78;105", "wc_summary_review": "75;48;99", "wc_main_review": "263;270;619", "wc_review": "438;396;823", "wc_reply_reviewers": "49;0;176", "wc_reply_authors": "1282;514;2082", "reply_reviewers": "1;0;1", "reply_authors": "3;1;4", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 94.33333333333333, 11.728408057172787 ], "wc_summary_review_avg": [ 74.0, 20.83266665599966 ], 
"wc_main_review_avg": [ 384.0, 166.1946649765469 ], "wc_review_avg": [ 552.3333333333334, 192.15676470585734 ], "wc_reply_reviewers_avg": [ 75.0, 74.16647940051264 ], "wc_reply_authors_avg": [ 1292.6666666666667, 640.177753093276 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17035894125887710575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "National University of Singapore;Privitar", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.privitar.com", "aff_unique_abbr": "NUS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Singapore;United Kingdom" }, { "id": "EGtUVDm991w", "title": "Token Pooling in Vision Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the recent success in many applications, the high computational requirements of vision transformers limit their use in resource-constrained settings. While many existing methods improve the quadratic complexity of attention, in most vision transformers, self-attention is not the major computation bottleneck, e.g., more than 80% of the computation is spent on fully-connected layers. To improve the computational complexity of all layers, we propose a novel token downsampling method, called Token Pooling, efficiently exploiting redundancies in the images and intermediate token representations. We show that, under mild assumptions, softmax-attention acts as a high-dimensional low-pass (smoothing) filter. Thus, its output contains redundancy that can be pruned to achieve a better trade-off between the computational cost and accuracy. Our new technique accurately approximates a set of tokens by minimizing the reconstruction error caused by downsampling. We solve this optimization problem via cost-efficient clustering. We rigorously analyze and compare to prior downsampling methods. Our experiments show that Token Pooling significantly improves the cost-accuracy trade-off over the state-of-the-art downsampling. Token Pooling is a simple and effective operator that can benefit many architectures. 
Applied to DeiT, it achieves the same ImageNet top-1 accuracy using 42% fewer computations.", "keywords": "Transformer;Pooling;Downsampling;Efficiency", "primary_area": "", "supplementary_material": "", "author": "Dmitrii Marin;Jen-Hao Rick Chang;Anurag Ranjan;Anish Prabhu;Mohammad Rastegari;Oncel Tuzel", "authorids": "~Dmitrii_Marin2;~Jen-Hao_Rick_Chang1;~Anurag_Ranjan1;anish_prabhu@apple.com;~Mohammad_Rastegari2;~Oncel_Tuzel2", "gender": "M;M;M;;M;M", "homepage": "http://maryin.net;https://rick-chang.github.io;http://anuragranjan.com;;https://mrastegari.github.io/;http://www.onceltuzel.net", "dblp": "164/5695;169/4938;;;31/5228;73/2943.html", "google_scholar": "https://scholar.google.ca/citations?user=rPbo3nQAAAAJ;F5Z9kN4AAAAJ;;;N4-2Z_cAAAAJ;Fe7NTe0AAAAJ", "orcid": "0000-0002-4799-3412;;;;;", "linkedin": ";;;;;", "or_profile": "~Dmitrii_Marin2;~Jen-Hao_Rick_Chang1;~Anurag_Ranjan1;anish_prabhu@apple.com;~Mohammad_Rastegari2;~Oncel_Tuzel2", "aff": "Intrinsic;Apple;Apple;;Department of Computer Science, University of Washington;Apple", "aff_domain": "intrinsic.ai;apple.com;apple.com;;cs.washington.edu;apple.com", "position": "Researcher;Researcher;Researcher;;Assistant Professor;Principal Researcher", "bibtex": "@misc{\nmarin2022token,\ntitle={Token Pooling in Vision Transformers},\nauthor={Dmitrii Marin and Jen-Hao Rick Chang and Anurag Ranjan and Anish Prabhu and Mohammad Rastegari and Oncel Tuzel},\nyear={2022},\nurl={https://openreview.net/forum?id=EGtUVDm991w}\n}", "github": "", "project": "", "reviewers": "E53C;jHZj;RU8U", "site": "https://openreview.net/forum?id=EGtUVDm991w", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;4;4", "correctness": "4;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;2;3", "wc_summary_paper": "38;86;83", "wc_summary_review": "42;45;38", "wc_main_review": "228;247;180", "wc_review": "308;378;301", "wc_reply_reviewers": "0;311;75", "wc_reply_authors": "712;548;132", "reply_reviewers": "0;1;1", "reply_authors": "1;2;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 69.0, 21.95449840010015 ], "wc_summary_review_avg": [ 41.666666666666664, 2.8674417556808756 ], "wc_main_review_avg": [ 218.33333333333334, 28.193773938387338 ], "wc_review_avg": [ 329.0, 34.76588366008646 ], "wc_reply_reviewers_avg": [ 128.66666666666666, 132.51498867004526 ], "wc_reply_authors_avg": [ 464.0, 244.12018897802506 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10951604485053798493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Intrinsic;Apple;University of Washington", "aff_unique_dep": ";Apple Inc.;Department of Computer Science", "aff_unique_url": ";https://www.apple.com;https://www.washington.edu", "aff_unique_abbr": ";Apple;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "1;1;1;1", "aff_country_unique": ";United States" }, { "title": "Policy Gradients Incorporating the Future", "status": "Poster", 
"track": "main", "site": "https://iclr.cc/virtual/2022/poster/6264", "id": "EHaUTlm2eHg", "poster": "", "openreview": "https://openreview.net/forum?id=EHaUTlm2eHg", "slides": "https://iclr.cc/virtual/2022/poster/6264", "video": "https://iclr.cc/virtual/2022/poster/6264", "author_site": "David Venuto, Elaine Lau, Doina Precup, Ofir Nachum", "tldr": "", "abstract": "Reasoning about the future -- understanding how decisions in the present time affect outcomes in the future -- is one of the central challenges for reinforcement learning (RL), especially in highly-stochastic or partially observable environments. While predicting the future directly is hard, in this work we introduce a method that allows an agent to ``look into the future'' without explicitly predicting it. Namely, we propose to allow an agent, during its training on past experience, to observe what \\emph{actually} happened in the future at that time, while enforcing an information bottleneck to avoid the agent overly relying on this privileged information. Coupled with recent advances in variational inference and a latent-variable autoregressive model, this gives our agent the ability to utilize rich and \\emph{useful} information about the future trajectory dynamics in addition to the present. Our method, Policy Gradients Incorporating the Future (PGIF), is easy to implement and versatile, being applicable to virtually any policy gradient algorithm. We apply our proposed method to a number of off-the-shelf RL algorithms and show that PGIF is able to achieve higher reward faster in a variety of online and offline RL domains, as well as sparse-reward and partially observable environments. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f74ebf676ec83a8c504e8f613dfd5ff0ca2feefc.zip", "author": "David Venuto;Elaine Lau;Doina Precup;Ofir Nachum", "authorids": "~David_Venuto1;~Elaine_Lau1;~Doina_Precup1;~Ofir_Nachum1", "gender": "F;F;M;M", "homepage": ";http://cs.mcgill.ca/~dprecup/;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;", "dblp": ";p/DoinaPrecup;;", "google_scholar": "jC63xPkAAAAJ;https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ;C-ZlBWMAAAAJ;https://scholar.google.ca/citations?user=32rbUtYAAAAJ", "orcid": ";;;", "linkedin": "yunglau/;;;", "or_profile": "~Elaine_Lau1;~Doina_Precup1;~Ofir_Nachum1;~David_Anthony_Venuto1", "aff": "McGill University;McGill University;OpenAI;Mila", "aff_domain": "mcgill.ca;mcgill.ca;openai.com;mila.quebec", "position": "Undergrad student;Associate Professor;Researcher;PhD student", "bibtex": "@inproceedings{\nvenuto2022policy,\ntitle={Policy Gradients Incorporating the Future},\nauthor={David Venuto and Elaine Lau and Doina Precup and Ofir Nachum},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EHaUTlm2eHg}\n}", "github": "", "project": "", "reviewers": "GXWR;UbCM;EPSM;9hJX", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;5;3;3", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "282;74;71;151", "wc_summary_review": "442;62;183;65", "wc_main_review": "1015;451;775;410", "wc_review": "1739;587;1029;626", "wc_reply_reviewers": "29;23;362;0", "wc_reply_authors": "2254;1150;2469;362", "reply_reviewers": "1;1;2;0", "reply_authors": "4;2;4;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 
0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 144.5, 85.61687917694735 ], "wc_summary_review_avg": [ 188.0, 154.55258004963878 ], "wc_main_review_avg": [ 662.75, 247.6897000280795 ], "wc_review_avg": [ 995.25, 462.95700394313076 ], "wc_reply_reviewers_avg": [ 103.5, 149.63706091740775 ], "wc_reply_authors_avg": [ 1558.75, 853.1141116521283 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12990856209767480128&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=EHaUTlm2eHg", "email": "mcgill.ca;mcgill.ca;openai.com;mila.quebec", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "McGill University;OpenAI;Mila", "aff_unique_dep": ";;Quebec Artificial Intelligence Institute", "aff_unique_url": "https://www.mcgill.ca;https://openai.com;https://mila.quebec", "aff_unique_abbr": "McGill;OpenAI;Mila", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Canada;United States" }, { "id": "EIm_pvFJx5k", "title": "Meta-Forecasting by combining Global Deep Representations with Local Adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "While classical time series forecasting considers individual time series in isolation, recent advances based on deep learning showed that jointly learning from a large pool of related time series can boost the forecasting accuracy. However, the accuracy of these methods suffers greatly when modeling out-of-sample time series, significantly limiting their applicability compared to classical forecasting methods. To bridge this gap, we adopt a meta-learning view of the time series forecasting problem. We introduce a novel forecasting method, called Meta Global-Local Auto-Regression (Meta-GLAR), that adapts to each time series by learning in closed-form the mapping from the representations produced by a recurrent neural network (RNN) to one-step-ahead forecasts. Crucially, the parameters of the RNN are learned across multiple time series by backpropagating through the closed-form adaptation mechanism. 
In our extensive empirical evaluation we show that our method is competitive with the state-of-the-art in out-of-sample forecasting accuracy reported in earlier work.", "keywords": "time-series;meta-learning;closed-form;solvers", "primary_area": "", "supplementary_material": "/attachment/781da94bca50cb31bb3b811610885b44b7d84d8f.zip", "author": "Riccardo Grazzi;Valentin Flunkert;David Salinas;Tim Januschowski;Matthias Seeger;Cedric Archambeau", "authorids": "~Riccardo_Grazzi2;~Valentin_Flunkert2;~David_Salinas2;~Tim_Januschowski2;~Matthias_Seeger2;~Cedric_Archambeau1", "gender": "M;M;M;M;M;M", "homepage": ";https://mseeger.github.io/;http://www0.cs.ucl.ac.uk/staff/c.archambeau/;https://geoalgo.github.io/;;", "dblp": ";43/5832;59/1878;99/7083.html;222/2069;54/8909", "google_scholar": "https://scholar.google.ca/citations?user=DzlwsFwAAAAJ;V-lc8A8AAAAJ;pPx5WWIAAAAJ;https://scholar.google.fr/citations?user=D0WjJlsAAAAJ;9Tlyx1IAAAAJ;https://scholar.google.de/citations?user=EFdp8UMAAAAJ", "orcid": ";;;;;", "linkedin": ";matthias-seeger-3010b765/?locale=de_DE;carchambeau/;david-salinas-184a7582/;;", "or_profile": "~Valentin_Flunkert2;~Matthias_Seeger2;~Cedric_Archambeau1;~David_Salinas1;~Riccardo_Grazzi1;~Tim_Januschowski1", "aff": "Amazon;Amazon Development Center Germany;Amazon Web Services;Amazon;Istituto Italiano di Tecnologia;", "aff_domain": "amazon.com;amazon.de;amazon.com;amazon.com;iit.it;", "position": "Principal Researcher;Principal Applied Scientist;Principal Researcher;Researcher;PhD student;", "bibtex": "@misc{\ngrazzi2022metaforecasting,\ntitle={Meta-Forecasting by combining Global Deep Representations with Local Adaptation},\nauthor={Riccardo Grazzi and Valentin Flunkert and David Salinas and Tim Januschowski and Matthias Seeger and Cedric Archambeau},\nyear={2022},\nurl={https://openreview.net/forum?id=EIm_pvFJx5k}\n}", "github": "", "project": "", "reviewers": "zvPZ;FZbR;Tvo1", "site": "https://openreview.net/forum?id=EIm_pvFJx5k", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;3;3", "correctness": "2;2;2", "technical_novelty": "2;2;2", "empirical_novelty": "1;2;1", "wc_summary_paper": "75;71;112", "wc_summary_review": "154;51;162", "wc_main_review": "262;454;576", "wc_review": "491;576;850", "wc_reply_reviewers": "0;0;227", "wc_reply_authors": "713;451;601", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 86.0, 18.457157599876172 ], "wc_summary_review_avg": [ 122.33333333333333, 50.545908725522864 ], "wc_main_review_avg": [ 430.6666666666667, 129.24739412803993 ], "wc_review_avg": [ 639.0, 153.1818091898208 ], "wc_reply_reviewers_avg": [ 75.66666666666667, 107.00882621956418 ], "wc_reply_authors_avg": [ 588.3333333333334, 107.33540370674017 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6153429647650009523&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Amazon;Istituto Italiano di Tecnologia", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": 
"https://www.amazon.com;https://www.iit.it", "aff_unique_abbr": "Amazon;IIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;2", "aff_country_unique": "United States;Germany;Italy" }, { "id": "EJKLVMB_9T", "title": "SplitRegex: Faster Regex Synthesis via Neural Example Splitting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Due to the practical importance of regular expressions (regexes, for short), there has been a lot of research to automatically generate regexes from positive and negative string examples. A basic idea of learning a regex is a search-and-repair; search for a correct regex and repair it if incorrect. The problem is known to be PSPACE-complete and the main issue is to obtain a regex quickly within a time limit. \nWhile classical regex learning methods do not perform well, recent approaches using deep neural networks show better performance\nwith respect to the accuracy of the resulting regexes. However, all these approaches including SOTA models are often extremely slow because of the slow searching mechanism, and do not produce desired regexes within a given time limit. \nWe tackle the problem of learning regexes faster from positive and negative strings by relying on a novel approach called `neural example splitting'. Our approach essentially split up example strings into multiple parts using a neural network trained to group similar substrings from positive strings. This helps to learn a regex faster and, thus, more accurately since we now learn from several short-length strings.\nWe propose an effective regex synthesis framework called `SplitRegex' that synthesizes subregexes from `split' positive substrings and produces the final regex by concatenating the synthesized subregexes. For the negative sample, we exploit pre-generated subregexes during the subregex synthesis process and perform the matching against negative strings. Then the final regex becomes consistent with all negative strings. SplitRegex is a divided-and-conquer framework for learning target regexes; split (=divide) positive strings and infer partial regexes for multiple parts, which is much more accurate than the whole string inferring, and concatenate (=conquer) inferred regexes while satisfying negative strings. We empirically demonstrate that the proposed SplitRegex framework substantially improves the previous regex synthesis approaches over four benchmark datasets. 
", "keywords": "regular expression;program synthesis;programming by examples;deep learning;neural network", "primary_area": "", "supplementary_material": "/attachment/a51cde8db4cd762d429a9cf3003b711cddef1b53.zip", "author": "Su-Hyeon Kim;Hyunjoon Cheon;Yo-Sub Han;Sang-Ki Ko", "authorids": "tngus98207@kangwon.ac.kr;hyunjooncheon@yonsei.ac.kr;~Yo-Sub_Han1;~Sang-Ki_Ko1", "gender": ";;;M", "homepage": ";;http://toc.yonsei.ac.kr/~emmous/;https://sites.google.com/site/sangkikotoc/home", "dblp": ";;h/YoSubHan;71/9491.html", "google_scholar": ";;yDOh26sAAAAJ;https://scholar.google.com/scholar?hl=en", "orcid": ";;;", "linkedin": ";;;", "or_profile": "tngus98207@kangwon.ac.kr;hyunjooncheon@yonsei.ac.kr;~Yo-Sub_Han1;~Sang-Ki_Ko1", "aff": ";;Yonsei University;Kangwon National University", "aff_domain": ";;yonsei.ac.kr;kangwon.ac.kr", "position": ";;Full Professor;Assistant Professor", "bibtex": "@misc{\nkim2022splitregex,\ntitle={SplitRegex: Faster Regex Synthesis via Neural Example Splitting},\nauthor={Su-Hyeon Kim and Hyunjoon Cheon and Yo-Sub Han and Sang-Ki Ko},\nyear={2022},\nurl={https://openreview.net/forum?id=EJKLVMB_9T}\n}", "github": "", "project": "", "reviewers": "fKY9;Nx3i;Nyta;qHid", "site": "https://openreview.net/forum?id=EJKLVMB_9T", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "88;48;95;176", "wc_summary_review": "65;46;51;82", "wc_main_review": "414;510;287;435", "wc_review": "567;604;433;693", "wc_reply_reviewers": "457;0;0;0", "wc_reply_authors": "2294;1415;603;660", "reply_reviewers": "2;0;0;0", "reply_authors": "5;2;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 101.75, 46.46705822407956 ], "wc_summary_review_avg": [ 61.0, 13.982131454109563 ], "wc_main_review_avg": [ 411.5, 80.25116821579608 ], "wc_review_avg": [ 574.25, 93.52907302010429 ], "wc_reply_reviewers_avg": [ 114.25, 197.88680476474423 ], "wc_reply_authors_avg": [ 1243.0, 686.2350180513962 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8324920319186546689&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Yonsei University;Kangwon National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.yonsei.ac.kr;http://www.kangwon.ac.kr", "aff_unique_abbr": "Yonsei;KNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "EKjUnoX-7M0", "title": "A new look at fairness in stochastic multi-armed bandit problems", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study an important variant of the stochastic multi-armed bandit (MAB) problem, which takes fairness into consideration. Instead of directly maximizing cumulative expected reward, we need to balance between the total reward and fairness level. 
In this paper, we present a new insight into MAB with fairness and formulate the problem in the penalization framework, where rigorous penalized regret can be well defined and more sophisticated regret analysis is possible. Under such a framework, we propose a hard-threshold UCB-like algorithm, which enjoys many merits, including asymptotic fairness, nearly optimal regret, and a better tradeoff between reward and fairness. Both gap-dependent and gap-independent upper bounds have been established. Lower bounds are also given to illustrate the tightness of our theoretical analysis. Numerous experimental results corroborate the theory and show the superiority of our method over other existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guanhua Fang;Ping Li;Gennady Samorodnitsky", "authorids": "~Guanhua_Fang1;~Ping_Li3;~Gennady_Samorodnitsky1", "gender": ";M;", "homepage": "https://sites.google.com/view/hyperfang2020/home;http://www.stat.rutgers.edu/home/pingli/;https://people.orie.cornell.edu/gennady/", "dblp": "274/1306;62/5860-1;", "google_scholar": "VG7zQQsAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Guanhua_Fang1;~Ping_Li3;~Gennady_Samorodnitsky1", "aff": "Baidu;LinkedIn;Cornell University", "aff_domain": "baidu.com;linkedin.com;cornell.edu", "position": "Postdoc;Engineer;Full Professor", "bibtex": "@misc{\nfang2022a,\ntitle={A new look at fairness in stochastic multi-armed bandit problems},\nauthor={Guanhua Fang and Ping Li and Gennady Samorodnitsky},\nyear={2022},\nurl={https://openreview.net/forum?id=EKjUnoX-7M0}\n}", "github": "", "project": "", "reviewers": "rUcE;NynE;PJx4;rWC1", "site": "https://openreview.net/forum?id=EKjUnoX-7M0", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;1;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "163;57;54;115", "wc_summary_review": "28;20;61;51", "wc_main_review": "277;288;297;614", "wc_review": "468;365;412;780", "wc_reply_reviewers": "0;0;46;0", "wc_reply_authors": "333;501;729;507", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.25, 45.07979037218341 ], "wc_summary_review_avg": [ 40.0, 16.62828914831589 ], "wc_main_review_avg": [ 369.0, 141.6280339480853 ], "wc_review_avg": [ 506.25, 162.20107120484747 ], "wc_reply_reviewers_avg": [ 11.5, 19.91858428704209 ], "wc_reply_authors_avg": [ 517.5, 140.67249198048637 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YGUKLf0spSEJ:scholar.google.com/&scioq=A+new+look+at+fairness+in+stochastic+multi-armed+bandit+problems&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Baidu;LinkedIn Corporation;Cornell University", "aff_unique_dep": "Baidu, Inc.;;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com;https://www.cornell.edu", "aff_unique_abbr": "Baidu;LinkedIn;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United 
States" }, { "id": "EMLJ_mTz_z", "title": "Convolutional Neural Network Dynamics: A Graph Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "The success of neural networks (NNs) in a wide range of applications has led to increased interest in understanding the underlying learning dynamics of these models. In this paper, we go beyond mere descriptions of the learning dynamics by taking a graph perspective and investigating the relationship between the graph structure of NNs and their performance. \nSpecifically, we propose (1) representing the neural network learning process as a time-evolving graph (i.e., a series of static graph snapshots over epochs), (2) capturing the structural changes of the NN during the training phase in a simple temporal summary, and (3) leveraging the structural summary to predict the accuracy of the underlying NN in a classification or regression task. For the dynamic graph representation of NNs, we explore structural representations for fully-connected and convolutional layers, which are key components of powerful NN models. Our analysis shows that a simple summary of graph statistics, such as weighted degree and eigenvector centrality, over just a few epochs, can be used to accurately predict the performance of NNs. For example, a weighted degree-based summary of the time-evolving graph that is constructed based on 5 training epochs of the LeNet architecture achieves classification accuracy of over 93\\%. Our findings are consistent for different NN architectures, including LeNet, VGG, AlexNet, and ResNet.", "keywords": "neural network dynamics;time-evolving graphs;interpretation of neural networks;performance prediction", "primary_area": "", "supplementary_material": "", "author": "Fatemeh Vahedian;Ruiyu Li;Puja Trivedi;Di Jin;Danai Koutra", "authorids": "~Fatemeh_Vahedian1;~Ruiyu_Li3;~Puja_Trivedi1;~Di_Jin3;~Danai_Koutra1", "gender": "F;F;F;M;F", "homepage": "https://fvahedian.github.io/publications/;https://github.com/HoningJade;https://pujacomputes.github.io/;https://derekdijin.github.io/;http://web.eecs.umich.edu/~dkoutra/", "dblp": ";;274/2080;;91/9987", "google_scholar": ";;1y9cR50AAAAJ;WwZ7biAAAAAJ;https://scholar.google.com.tw/citations?user=bDrA1-8AAAAJ", "orcid": ";;0000-0003-1874-8992;0000-0001-8028-0556;0000-0002-3206-8179", "linkedin": "fvahedian/;ruiyu-li-a58726178/;;;", "or_profile": "~Fatemeh_Vahedian1;~Ruiyu_Li3;~Puja_Trivedi1;~Di_Jin3;~Danai_Koutra1", "aff": "University of Michigan;Shanghai Jiaotong University;University of Michigan;Amazon;Amazon", "aff_domain": "umich.edu;sjtu.edu.cn;umich.edu;amazon.com;amazon.com", "position": "Postdoc;Undergrad student;PhD student;Researcher;Scholar", "bibtex": "@misc{\nvahedian2022convolutional,\ntitle={Convolutional Neural Network Dynamics: A Graph Perspective},\nauthor={Fatemeh Vahedian and Ruiyu Li and Puja Trivedi and Di Jin and Danai Koutra},\nyear={2022},\nurl={https://openreview.net/forum?id=EMLJ_mTz_z}\n}", "github": "", "project": "", "reviewers": "DNSD;zP52;Txru;cuEB", "site": "https://openreview.net/forum?id=EMLJ_mTz_z", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;3;3;3", "correctness": "2;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "103;100;70;109", "wc_summary_review": "58;9;35;22", "wc_main_review": "161;215;445;91", "wc_review": "322;324;550;222", "wc_reply_reviewers": "42;0;53;0", "wc_reply_authors": "1616;948;984;49", "reply_reviewers": "1;0;1;0", "reply_authors": "5;2;2;1", "recommendation_avg": [ 
5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 95.5, 15.074813431681335 ], "wc_summary_review_avg": [ 31.0, 18.096961070853858 ], "wc_main_review_avg": [ 228.0, 132.7742444904131 ], "wc_review_avg": [ 354.5, 120.1696717146219 ], "wc_reply_reviewers_avg": [ 23.75, 24.06631463269771 ], "wc_reply_authors_avg": [ 899.25, 558.1699449988328 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8006407690254357, "corr_recommendation_correctness": 0.9198662110077999, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=809565733238051918&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;2", "aff_unique_norm": "University of Michigan;Shanghai Jiao Tong University;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.umich.edu;https://www.sjtu.edu.cn;https://www.amazon.com", "aff_unique_abbr": "UM;SJTU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Hybrid Random Features", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6410", "id": "EMigfE6ZeS", "poster": "", "openreview": "https://openreview.net/forum?id=EMigfE6ZeS", "slides": "https://iclr.cc/virtual/2022/poster/6410", "video": "https://iclr.cc/virtual/2022/poster/6410", "author_site": "Krzysztof Choromanski, Han Lin, Haoxian Chen, Arijit Sehanobish, Yuanzhe Ma, Deepali Jain, Jake Varley, Andy Zeng, Michael Ryoo, Valerii Likhosherstov, Dmitry Kalashnikov, Vikas Sindhwani, Adrian Weller", "tldr": "", "abstract": "We propose a new class of random feature methods for linearizing softmax and Gaussian kernels called hybrid random features (HRFs) that automatically adapt the quality of kernel estimation to provide the most accurate approximation in the defined regions of interest. Special instantiations of HRFs lead to well-known methods such as trigonometric (Rahimi & Recht, 2007) or (recently introduced in the context of linear-attention Transformers) positive random features (Choromanski et al., 2021). By generalizing Bochner\u2019s Theorem for softmax/Gaussian kernels and leveraging random features for compositional kernels, the HRF-mechanism provides strong theoretical guarantees: unbiased approximation and strictly smaller worst-case relative errors than its counterparts. 
We conduct an exhaustive empirical evaluation of HRF, ranging from pointwise kernel estimation experiments, through tests on data admitting clustering structure, to benchmarking implicit-attention Transformers (also for downstream Robotics applications), demonstrating its quality in a wide spectrum of machine learning problems.", "keywords": "random features;softmax kernel;attention mechanism;compositional kernels", "primary_area": "", "supplementary_material": "/attachment/1e565f73725ab0fcc22d7315f40dd525194cf5f2.zip", "author": "Krzysztof Marcin Choromanski;Han Lin;Haoxian Chen;Arijit Sehanobish;Yuanzhe Ma;Deepali Jain;Jake Varley;Andy Zeng;Michael S Ryoo;Valerii Likhosherstov;Dmitry Kalashnikov;Vikas Sindhwani;Adrian Weller", "authorids": "~Krzysztof_Marcin_Choromanski1;~Han_Lin1;~Haoxian_Chen2;~Arijit_Sehanobish1;~Yuanzhe_Ma1;~Deepali_Jain1;~Jake_Varley1;~Andy_Zeng1;~Michael_S_Ryoo1;~Valerii_Likhosherstov2;~Dmitry_Kalashnikov1;~Vikas_Sindhwani1;~Adrian_Weller1", "gender": ";M;;M;M;F;M;M;M;;;M;M", "homepage": ";https://hl-hanlin.github.io/;;https://github.com/arijitthegame/;https://yuanzhe-ma.com/;;http://www.cs.columbia.edu/~jvarley/;http://andyzeng.github.io/;http://michaelryoo.com/;https://valerytyumen.github.io/;;http://vikas.sindhwani.org;http://mlg.eng.cam.ac.uk/adrian/", "dblp": "78/11411;;;249/5322;304/2466;84/8010;;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy;r/MichaelSRyoo;232/4391.html;222/2882;26/4825;73/8324", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;;MEby6-QAAAAJ;4d8UV8sAAAAJ;;UJcm1MoAAAAJ;q7nFtUcAAAAJ;vcw0TJIAAAAJ;iiVVfxUAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=Ek4hM10AAAAJ", "orcid": ";;;0000-0003-2769-2003;;;;;;;;;", "linkedin": ";han-lin-9336981a3/;;arijit-sehanobish-b76627112/;yuanzhema;;;;;;;vikassindhwani;", "or_profile": "~Krzysztof_Marcin_Choromanski1;~Han_Lin1;~Haoxian_Chen2;~Arijit_Sehanobish1;~Yuanzhe_Ma1;~Deepali_Jain1;~Jake_Varley1;~Andy_Zeng1;~Michael_S_Ryoo1;~Valerii_Likhosherstov2;~Dmitry_Kalashnikov1;~Vikas_Sindhwani1;~Adrian_Weller1", "aff": "Google Brain Robotics & Columbia University;Columbia University;;Covera Health;Columbia University;Google;Google;Google;Google DeepMind;;Google;Google;University of Cambridge", "aff_domain": "columbia.edu;columbia.edu;;coverahealth.com;columbia.edu;google.com;google.com;google.com;google.com;;google.com;google.com;cam.ac.uk", "position": "research scientist & adjunct assistant professor;MS student;;AI Scientist;PhD student;Researcher;Engineer;Research Scientist;Research Scientist;;Researcher;Senior Staff Research Scientist;Principal Researcher", "bibtex": "@inproceedings{\nchoromanski2022hybrid,\ntitle={Hybrid Random Features},\nauthor={Krzysztof Marcin Choromanski and Han Lin and Haoxian Chen and Arijit Sehanobish and Yuanzhe Ma and Deepali Jain and Jake Varley and Andy Zeng and Michael S Ryoo and Valerii Likhosherstov and Dmitry Kalashnikov and Vikas Sindhwani and Adrian Weller},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EMigfE6ZeS}\n}", "github": "", "project": "", "reviewers": "YmNP;Ks6B;iboB", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;5;4", "correctness": "3;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;3;2", "wc_summary_paper": "111;92;211", "wc_summary_review": "32;41;86", "wc_main_review": "1128;500;672", "wc_review": "1271;633;969", "wc_reply_reviewers": "123;91;21", "wc_reply_authors": "2250;1479;1598", 
"reply_reviewers": "1;1;1", "reply_authors": "9;8;6", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 138.0, 52.19833969262496 ], "wc_summary_review_avg": [ 53.0, 23.62202362203543 ], "wc_main_review_avg": [ 766.6666666666666, 264.9746319094633 ], "wc_review_avg": [ 957.6666666666666, 260.58566516385525 ], "wc_reply_reviewers_avg": [ 78.33333333333333, 42.59368758656876 ], "wc_reply_authors_avg": [ 1775.6666666666667, 338.9044440874481 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 7.666666666666667, 1.247219128924647 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 13, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2876447462470263027&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=EMigfE6ZeS", "email": "columbia.edu;columbia.edu;;coverahealth.com;columbia.edu;google.com;google.com;google.com;google.com;;google.com;google.com;cam.ac.uk", "author_num": 13, "aff_unique_index": "0;1;2;1;0;0;0;0;0;0;3", "aff_unique_norm": "Google;Columbia University;Covera Health;University of Cambridge", "aff_unique_dep": "Google Brain Robotics;;;", "aff_unique_url": "https://ai.google;https://www.columbia.edu;https://www.coverahealth.com;https://www.cam.ac.uk", "aff_unique_abbr": "Google;Columbia;;Cambridge", "aff_campus_unique_index": "0;0;0;0;0;0;2", "aff_campus_unique": "Mountain View;;Cambridge", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "GRAND++: Graph Neural Diffusion with A Source Term", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7172", "id": "EMxu-dzvJk", "poster": "", "openreview": "https://openreview.net/forum?id=EMxu-dzvJk", "slides": "https://iclr.cc/virtual/2022/poster/7172", "video": "https://iclr.cc/virtual/2022/poster/7172", "author_site": "Matthew Thorpe, Tan M Nguyen, Hedi Xia, Thomas Strohmer, Andrea Bertozzi, Stanley J Osher, Bao Wang", "tldr": "", "abstract": "We propose GRAph Neural Diffusion with a source term (GRAND++) for graph deep learning with a limited number of labeled nodes, i.e., low-labeling rate. GRAND++ is a class of continuous-depth graph deep learning architectures whose theoretical underpinning is the diffusion process on graphs with a source term. The source term guarantees two interesting theoretical properties of GRAND++: (i) the representation of graph nodes, under the dynamics of GRAND++, will not converge to a constant vector over all nodes even as the time goes to infinity, which mitigates the over-smoothing issue of graph neural networks and enables graph learning in very deep architectures. (ii) GRAND++ can provide accurate classification even when the model is trained with a very limited number of labeled training data. 
We experimentally verify the above two advantages on various graph deep learning benchmark tasks, showing a significant improvement over many existing graph neural networks.", "keywords": "graph deep learning;low-labeling rates;diffusion on graphs;random walk", "primary_area": "", "supplementary_material": "/attachment/617e1ec32444e0e31f6a46f16e5a6deca155f535.zip", "author": "Matthew Thorpe;Tan Minh Nguyen;Hedi Xia;Thomas Strohmer;Andrea Bertozzi;Stanley Osher;Bao Wang", "authorids": "matthew.thorpe-2@manchester.ac.uk;~Tan_Minh_Nguyen1;~Hedi_Xia2;~Thomas_Strohmer1;~Andrea_Bertozzi1;~Stanley_Osher1;~Bao_Wang1", "gender": ";M;;;F;M;M", "homepage": ";https://tanmnguyen89.github.io/;https://www.math.ucla.edu/~hedixia/;;http://www.math.ucla.edu/~bertozzi;https://www.math.ucla.edu/~sjo/;https://www.math.utah.edu/~bwang/index.html", "dblp": ";255/4725;;;80/2099.html;;", "google_scholar": ";OizOh88AAAAJ;jpIL6mgAAAAJ;;VJPRn1oAAAAJ;;", "orcid": ";;;;0000-0003-0396-7391;;", "linkedin": ";;hedixia/;;;;", "or_profile": "matthew.thorpe-2@manchester.ac.uk;~Tan_Minh_Nguyen1;~Hedi_Xia2;~Thomas_Strohmer1;~Andrea_Bertozzi1;~Stanley_Osher1;~Bao_Wang1", "aff": ";University of California, Los Angeles;University of California, Los Angeles;;University of California, Los Angeles;University of California, Los Angeles;University of Utah", "aff_domain": ";ucla.edu;ucla.edu;;math.ucla.edu;ucla.edu;utah.edu", "position": ";Postdoc;PhD student;;Full Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nthorpe2022grand,\ntitle={{GRAND}++: Graph Neural Diffusion with A Source Term},\nauthor={Matthew Thorpe and Tan Minh Nguyen and Hedi Xia and Thomas Strohmer and Andrea Bertozzi and Stanley Osher and Bao Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EMxu-dzvJk}\n}", "github": "", "project": "", "reviewers": "qrMB;p1wb;zJ38;y1z1;durM", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "4;4;3;3;3", "correctness": "4;3;3;3;4", "technical_novelty": "3;3;2;2;3", "empirical_novelty": "2;2;2;2;0", "wc_summary_paper": "39;72;104;102;58", "wc_summary_review": "37;76;7;210;38", "wc_main_review": "164;820;252;403;217", "wc_review": "240;968;363;715;313", "wc_reply_reviewers": "26;155;36;0;0", "wc_reply_authors": "372;2394;502;1782;421", "reply_reviewers": "1;1;1;0;0", "reply_authors": "2;4;2;3;1", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.6, 0.8 ], "wc_summary_paper_avg": [ 75.0, 25.155516293648198 ], "wc_summary_review_avg": [ 73.6, 71.6284859535646 ], "wc_main_review_avg": [ 371.2, 238.05495163932213 ], "wc_review_avg": [ 519.8, 277.3542139575312 ], "wc_reply_reviewers_avg": [ 43.4, 57.58333092136995 ], "wc_reply_authors_avg": [ 1094.2, 835.2270110574729 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.4, 1.019803902718557 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.6123724356957947, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1545935141008006690&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=EMxu-dzvJk", "email": ";ucla.edu;ucla.edu;;math.ucla.edu;ucla.edu;utah.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;1", 
"aff_unique_norm": "University of California, Los Angeles;University of Utah", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.utah.edu", "aff_unique_abbr": "UCLA;Utah", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "EO4VJGAllb", "title": "Adversarial Training: A simple and efficient technique to Improving NLP Robustness", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "NLP models are shown to be prone to adversarial attacks which undermines their robustness, i.e. a small perturbation to the input text can fool an NLP model to incorrectly classify text. In this study, we present a new Adversarial Text Generation technique that, given an input text, generates adversarial texts through quickly and efficiently. For example, in order to attack a model for sentiment classification, we can use the product categories as the attribute which should not change the sentiment of the reviews. We conducted experiments on real-world NLP datasets to demonstrate that our technique can generate more meaningful and diverse adversarial texts, compared to many existing adversarial text generation approaches. We further use our generated adversarial examples to improve models through adversarial training, and we demonstrate that our generated attacks are more robust against model re training and different model architectures.", "keywords": "Adversarial training;NLP models;NLP robustness;adversarial attacks.", "primary_area": "", "supplementary_material": "", "author": "marwan omar", "authorids": "~marwan_omar1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "5T5iAZQAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~marwan_omar1", "aff": "Saint Leo University", "aff_domain": "saintleo.edu", "position": "Associate Professor", "bibtex": "@misc{\nomar2022adversarial,\ntitle={Adversarial Training: A simple and efficient technique to Improving {NLP} Robustness},\nauthor={marwan omar},\nyear={2022},\nurl={https://openreview.net/forum?id=EO4VJGAllb}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=EO4VJGAllb", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 2, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0", "aff_unique_norm": "Saint Leo University", "aff_unique_dep": "", "aff_unique_url": "https://www.saintleo.edu", "aff_unique_abbr": "SLU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "EPIeOo3ql96", "title": "Testing-Time Adaptation through Online Normalization Estimation", "track": "main", "status": "Withdraw", "tldr": "", 
"abstract": "We present a simple and effective way to estimate the batch-norm statistics during test time, to fast adapt a source model to target test samples. Known as Test-Time Adaptation, most prior work studying this task follows two assumptions in their evaluation where (1) test samples come together as a large batch, and (2) all from a single test distribution. However, in practice, these two assumptions may not stand, the reasons for which we propose two new evaluation settings where batch sizes are arbitrary and multiple distributions are considered. Unlike the previous methods that require a large batch of single distribution during test time to calculate stable batch-norm statistics, our method avoid any dependency on large online batches and is able to estimate accurate batch-norm statistics with a single sample. The proposed method significantly outperforms the State-Of-The-Art in the newly proposed settings in Test-Time Adaptation Task and also demonstrates improvements in various other settings such as Source-Free Unsupervised Domain Adaptation and Zero-Shot Classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuefeng Hu;Mustafa Uzunbas;Bor-Chun Chen;Rui Wang;Ashish Shah;Ram Nevatia;Ser-Nam Lim", "authorids": "~Xuefeng_Hu1;~Mustafa_Uzunbas1;~Bor-Chun_Chen1;~Rui_Wang23;~Ashish_Shah1;~Ram_Nevatia1;~Ser-Nam_Lim3", "gender": "M;M;;M;M;M;M", "homepage": "https://xuefenghu.me;;;;;http://iris.usc.edu/people/nevatia/;https://sites.google.com/site/sernam", "dblp": ";15/2822;86/10575;;01/2068;n/RamakantNevatia;04/6633", "google_scholar": "https://scholar.google.com/citations?hl=en;_2KhezQAAAAJ;nShMV5oAAAAJ;FWpOydIAAAAJ;KQrLwIAAAAAJ;https://scholar.google.com.tw/citations?user=EUMYhUvzt6IC;HX0BfLYAAAAJ", "orcid": ";;;;;;", "linkedin": "xuefeng-hu-137b9485/;gokhan-uzunbas-9baaa04/;;;ashish217/;;", "or_profile": "~Xuefeng_Hu1;~Mustafa_Uzunbas1;~Bor-Chun_Chen1;~Rui_Wang23;~Ashish_Shah1;~Ram_Nevatia1;~Ser-Nam_Lim1", "aff": "University of Southern California;;Meta Facebook;Meta Facebook;Meta Facebook;University of Southern California;Meta Facebook", "aff_domain": "usc.edu;;fb.com;fb.com;fb.com;usc.edu;facebook.com", "position": "PhD student;;Researcher;Researcher;Researcher;Full Professor;Research Scientist Manager", "bibtex": "@misc{\nhu2022testingtime,\ntitle={Testing-Time Adaptation through Online Normalization Estimation},\nauthor={Xuefeng Hu and Mustafa Uzunbas and Bor-Chun Chen and Rui Wang and Ashish Shah and Ram Nevatia and Ser-Nam Lim},\nyear={2022},\nurl={https://openreview.net/forum?id=EPIeOo3ql96}\n}", "github": "", "project": "", "reviewers": "bnBw;9Rbo;E7UF", "site": "https://openreview.net/forum?id=EPIeOo3ql96", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;5;4", "correctness": "2;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "44;83;76", "wc_summary_review": "29;90;65", "wc_main_review": "349;173;279", "wc_review": "422;346;420", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.66666666666667, 16.97710877099579 ], "wc_summary_review_avg": [ 61.333333333333336, 25.037749277618563 ], 
"wc_main_review_avg": [ 267.0, 72.35099630735341 ], "wc_review_avg": [ 396.0, 35.364765892999586 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sCa9Y_ggc-cJ:scholar.google.com/&scioq=Testing-Time+Adaptation+through+Online+Normalization+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;0;1", "aff_unique_norm": "University of Southern California;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.usc.edu;https://meta.com", "aff_unique_abbr": "USC;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "EQ7A6F7k0r_", "title": "QTN-VQC: An End-to-End Learning Framework for Quantum Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The advent of noisy intermediate-scale quantum (NISQ) computers raises a crucial challenge to design quantum neural networks for fully quantum learning tasks. To bridge the gap, this work proposes an end-to-end learning framework named QTN-VQC, by introducing a trainable quantum tensor network (QTN) for quantum embedding on a variational quantum circuit (VQC). The architecture of QTN is composed of a parametric tensor-train network for feature extraction and a tensor product encoding for quantum encoding. We highlight the QTN for quantum embedding in terms of two perspectives: (1) we theoretically characterize QTN by analyzing its representation power of input features; (2) QTN enables an end-to-end parametric model pipeline, namely QTN-VQC, from the generation of quantum embedding to the output measurement. 
Our experiments on the MNIST dataset demonstrate the advantages of QTN for quantum embedding over other quantum embedding approaches.", "keywords": "quantum neural networks;variational quantum circuits;end-to-end learning framework;tensor-train network", "primary_area": "", "supplementary_material": "/attachment/a1a2a7add091d9a00e97964615fe41f1d26b8b47.zip", "author": "Jun Qi;Chao-Han Huck Yang;Pin-Yu Chen", "authorids": "~Jun_Qi1;~Chao-Han_Huck_Yang1;~Pin-Yu_Chen1", "gender": "M;M;M", "homepage": "https://sites.google.com/site/uwjunqi/home;https://huckiyang.github.io/;http://www.pinyuchen.com", "dblp": "133/4051-2;230/4012;39/8969", "google_scholar": "7oZpnlkAAAAJ;TT3XJW8AAAAJ;jxwlCUUAAAAJ", "orcid": ";0000-0003-2879-8811;0000-0003-1039-8369", "linkedin": ";;pin-yu-chen-940062a2", "or_profile": "~Jun_Qi1;~Chao-Han_Huck_Yang1;~Pin-Yu_Chen1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;International Business Machines", "aff_domain": "gatech.edu;gatech.edu;ibm.com", "position": "PhD student;PhD student;Research Staff Member", "bibtex": "@misc{\nqi2022qtnvqc,\ntitle={{QTN}-{VQC}: An End-to-End Learning Framework for Quantum Neural Networks},\nauthor={Jun Qi and Chao-Han Huck Yang and Pin-Yu Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=EQ7A6F7k0r_}\n}", "github": "", "project": "", "reviewers": "8JcS;sUwU;Awxc;JaBi;nHEv", "site": "https://openreview.net/forum?id=EQ7A6F7k0r_", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "3;4;3;4;4", "correctness": "3;2;3;3;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;1;2;2;2", "wc_summary_paper": "125;80;93;58;65", "wc_summary_review": "48;25;16;43;51", "wc_main_review": "275;203;141;384;208", "wc_review": "448;308;250;485;324", "wc_reply_reviewers": "31;150;0;108;413", "wc_reply_authors": "614;504;336;1068;1151", "reply_reviewers": "1;1;0;1;1", "reply_authors": "1;1;1;2;3", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.8, 0.4000000000000001 ], "wc_summary_paper_avg": [ 84.2, 23.726778120933318 ], "wc_summary_review_avg": [ 36.6, 13.690872872099865 ], "wc_main_review_avg": [ 242.2, 82.62057855038294 ], "wc_review_avg": [ 363.0, 88.7963963232743 ], "wc_reply_reviewers_avg": [ 140.4, 146.37841370912585 ], "wc_reply_authors_avg": [ 734.6, 319.733388935219 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.16666666666666663, "corr_recommendation_correctness": 0.6123724356957947, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3649968645861602362&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0;0;1", "aff_unique_norm": "Georgia Institute of Technology;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.ibm.com", "aff_unique_abbr": "Georgia Tech;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "The Three Stages of Learning Dynamics in High-dimensional Kernel Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7181", "id": "EQmAP4F859", "poster": "", "openreview": "https://openreview.net/forum?id=EQmAP4F859", "slides": 
"https://iclr.cc/virtual/2022/poster/7181", "video": "https://iclr.cc/virtual/2022/poster/7181", "author_site": "Nikhil Ghosh, Song Mei, Bin Yu", "tldr": "", "abstract": "To understand how deep learning works, it is crucial to understand the training dynamics of neural networks. Several interesting hypotheses about these dynamics have been made based on empirically observed phenomena, but there exists a limited theoretical understanding of when and why such phenomena occur. \n\nIn this paper, we consider the training dynamics of gradient flow on kernel least-squares objectives, which is a limiting dynamics of SGD trained neural networks. Using precise high-dimensional asymptotics, we characterize the dynamics of the fitted model in two \u201cworlds\u201d: in the Oracle World the model is trained on the population distribution and in the Empirical World the model is trained on an i.i.d finite dataset. We show that under mild conditions on the kernel and $L^2$ target regression function the training dynamics have three stages that are based on the behaviors of the models in the two worlds. Our theoretical results also mathematically formalize some interesting deep learning phenomena. Specifically, in our setting we show that SGD progressively learns more complex functions and that there is a \"deep bootstrap\" phenomenon: during the second stage, the test error of both worlds remain close despite the empirical training error being much smaller. Finally, we give a concrete example comparing the dynamics of two different kernels which shows that faster training is not necessary for better generalization.", "keywords": "training dynamics;kernels;SGD;deep bootstrap;gradient flow;random features;high-dimensional asymptotics;random matrix theory", "primary_area": "", "supplementary_material": "", "author": "Nikhil Ghosh;Song Mei;Bin Yu", "authorids": "~Nikhil_Ghosh1;~Song_Mei1;~Bin_Yu1", "gender": "M;M;M", "homepage": ";https://www.stat.berkeley.edu/~songmei/;https://binyu.stat.berkeley.edu", "dblp": "251/8779;https://dblp.org/pers/hd/m/Mei:Song;27/116", "google_scholar": "0Fv4bikAAAAJ;https://scholar.google.com.hk/citations?hl=en;https://scholar.google.com.hk/citations?user=z1iJa3UAAAAJ", "orcid": ";;0000-0003-3097-1433", "linkedin": "nikhil-ghosh-03389199/;;bin-yu-b665063/", "or_profile": "~Nikhil_Ghosh1;~Song_Mei1;~Bin_Yu5", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nghosh2022the,\ntitle={The Three Stages of Learning Dynamics in High-dimensional Kernel Methods},\nauthor={Nikhil Ghosh and Song Mei and Bin Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EQmAP4F859}\n}", "github": "", "project": "", "reviewers": "qXRH;5uQz;Ud7D;WyHh", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;4;4;5", "correctness": "3;4;4;4", "technical_novelty": "1;3;3;4", "empirical_novelty": "2;0;3;4", "wc_summary_paper": "79;126;88;347", "wc_summary_review": "42;47;76;64", "wc_main_review": "181;268;374;285", "wc_review": "302;441;538;696", "wc_reply_reviewers": "162;6;0;67", "wc_reply_authors": "655;116;439;220", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], 
"technical_novelty_avg": [ 2.75, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 160.0, 109.39606939922476 ], "wc_summary_review_avg": [ 57.25, 13.5531361684298 ], "wc_main_review_avg": [ 277.0, 68.50182479321262 ], "wc_review_avg": [ 494.25, 143.53810469697584 ], "wc_reply_reviewers_avg": [ 58.75, 65.12056126907999 ], "wc_reply_authors_avg": [ 357.5, 207.59154607064326 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.894736842105263, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4936393763423320645&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=EQmAP4F859", "email": "berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "ET1UAOYeU42", "title": "Edge Partition Modulated Graph Convolutional Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph convolutional networks (GCNs), which propagate the node features through the edges and learn how to transform the aggregated features under label supervision, have achieved great success in supervised feature extraction for both graph-level and node-level classification tasks. However, GCNs typically treat the graph adjacency matrix as given and ignore how the edges could be formed by different latent inter-node relations. In this paper, we introduce a relational graph generative process to model how the observed edges are generated by aggregating the node interactions over multiple overlapping node communities, each of which represents a particular type of relation that contributes to the edges via a logical OR mechanism. Based on this relational generative model, we partition each edge into the summation of multiple relation-specific weighted edges, and use the weighted edges in each community to define a relation-specific GCN. We introduce a variational inference framework to jointly learn how to partition the edges into different communities and combine relation-specific GCNs for the end classification tasks. 
Extensive evaluations on real-world datasets have demonstrated the working mechanisms of the edge partition modulated GCNs and their efficacy in learning both node and graph-level representations.", "keywords": "Latent Variable Models;Bayesian Methods;Variational Inference;Graph Neural Networks", "primary_area": "", "supplementary_material": "/attachment/57b8677c86061f21d72c76e37fc8f48d40130bde.zip", "author": "Yilin He;Chaojie Wang;Hao Zhang;Bo Chen;Mingyuan Zhou", "authorids": "~Yilin_He1;~Chaojie_Wang1;~Hao_Zhang1;~Bo_Chen1;~Mingyuan_Zhou1", "gender": "M;M;M;M;M", "homepage": ";https://chaojiewang94.github.io/;https://haozhangxidian.github.io/;http://web.xidian.edu.cn/bchen/en/index.html;http://mingyuanzhou.github.io", "dblp": "15/938;134/9314-1;55/2270-50;89/5615-1;", "google_scholar": ";https://scholar.google.com/citations?hl=en;Eo8e5icAAAAJ;;LXwCIisAAAAJ", "orcid": ";;;0000-0001-5151-9388;", "linkedin": "yilin-he-a75903212/;;;;", "or_profile": "~Yilin_He1;~Chaojie_Wang1;~Hao_Zhang1;~Bo_Chen1;~Mingyuan_Zhou1", "aff": "University of Texas, Austin;Nanyang Technological University;Cornell University;Xidian University;The University of Texas at Austin", "aff_domain": "utexas.edu;ntu.edu;med.cornell.edu;xidian.edu.cn;utexas.edu", "position": "PhD student;Researcher;Postdoc;Full Professor;Associate Professor", "bibtex": "@misc{\nhe2022edge,\ntitle={Edge Partition Modulated Graph Convolutional Networks},\nauthor={Yilin He and Chaojie Wang and Hao Zhang and Bo Chen and Mingyuan Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=ET1UAOYeU42}\n}", "github": "", "project": "", "reviewers": "1iDH;6jg3;tuFU;LpaE", "site": "https://openreview.net/forum?id=ET1UAOYeU42", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;4;3;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "92;79;72;342", "wc_summary_review": "43;75;24;91", "wc_main_review": "1220;186;200;355", "wc_review": "1355;340;296;788", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1851;418;635;485", "reply_reviewers": "0;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 146.25, 113.24392919710972 ], "wc_summary_review_avg": [ 58.25, 26.261902063635834 ], "wc_main_review_avg": [ 490.25, 426.50930529122104 ], "wc_review_avg": [ 694.75, 427.0464699537979 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 847.25, 584.817225720994 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7735449421179452, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1792685183316840592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Texas at Austin;Nanyang Technological University;Cornell University;Xidian University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.utexas.edu;https://www.ntu.edu.sg;https://www.cornell.edu;http://www.xidian.edu.cn/", "aff_unique_abbr": "UT Austin;NTU;Cornell;Xidian", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;0;2;0", 
"aff_country_unique": "United States;Singapore;China" }, { "id": "ETiaOyNwJW", "title": "Revisiting Virtual Nodes in Graph Neural Networks for Link Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "It is well known that the graph classification performance of graph neural networks often improves by adding an artificial virtual node to the graphs, which is connected to all nodes in the graph. Intuitively, the virtual node provides a shortcut for message passing between nodes along the graph edges. Surprisingly, the impact of virtual nodes with other problems is still an open research question. \n\nIn this paper, we adapt the concept of virtual nodes to the link prediction scenario, where we usually have much larger, often dense, and more heterogeneous graphs. In particular, we use multiple virtual nodes per graph and graph-based clustering to determine the connections to the graph nodes. We also investigate alternative clustering approaches (e.g., random or more advanced) and compare to the original model with a single virtual node. We conducted extensive experiments over different datasets of the Open Graph Benchmark (OGB) and analyze the results in detail. We show that our virtual node extensions yield rather stable performance increases and allow standard graph neural networks to compete with complex state-of-the-art models, as well as with the models leading the OGB leaderboards.", "keywords": "graph neural network;link prediction;virtual node", "primary_area": "", "supplementary_material": "/attachment/7fedd4e8a4ed765b2af07f1b1853424cc1273279.zip", "author": "EunJeong Hwang;Veronika Thost;Shib Sankar Dasgupta;Tengfei Ma", "authorids": "~EunJeong_Hwang1;~Veronika_Thost1;~Shib_Sankar_Dasgupta2;~Tengfei_Ma1", "gender": "F;F;M;M", "homepage": "https://eujhwang.github.io/;https://mitibmwatsonailab.mit.edu/people/veronika-thost/;https://ssdasgupta.github.io/;https://sites.google.com/site/matf0123/", "dblp": ";132/3874;222/9398;94/9023-1", "google_scholar": "Z0TA4NEAAAAJ;TyScgJ0AAAAJ;0KpQR94AAAAJ;9OvNakkAAAAJ", "orcid": ";0000-0003-4984-1532;;0000-0002-1086-529X", "linkedin": ";;shib-sankar-dasgupta-iisc/;", "or_profile": "~EunJeong_Hwang1;~Veronika_Thost1;~Shib_Sankar_Dasgupta2;~Tengfei_Ma1", "aff": "University of British Columbia;IBM Research;University of Massachusetts, Amherst;International Business Machines", "aff_domain": "cs.ubc.ca;ibm.com;umass.edu;ibm.com", "position": "PhD student;Research Scientist;PhD student;Researcher", "bibtex": "@misc{\nhwang2022revisiting,\ntitle={Revisiting Virtual Nodes in Graph Neural Networks for Link Prediction},\nauthor={EunJeong Hwang and Veronika Thost and Shib Sankar Dasgupta and Tengfei Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=ETiaOyNwJW}\n}", "github": "", "project": "", "reviewers": "gjKs;mmjp;CjZq;RCd9", "site": "https://openreview.net/forum?id=ETiaOyNwJW", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;3", "correctness": "4;4;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "71;37;109;34", "wc_summary_review": "26;19;32;9", "wc_main_review": "203;184;263;138", "wc_review": "300;240;404;181", "wc_reply_reviewers": "123;1165;0;64", "wc_reply_authors": "613;1568;695;329", "reply_reviewers": "1;5;0;2", "reply_authors": "4;6;2;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 
0.82915619758885 ], "wc_summary_paper_avg": [ 62.75, 30.400452299266863 ], "wc_summary_review_avg": [ 21.5, 8.558621384311845 ], "wc_main_review_avg": [ 197.0, 44.83859944289072 ], "wc_review_avg": [ 281.25, 82.4177620419288 ], "wc_reply_reviewers_avg": [ 338.0, 479.4460345023202 ], "wc_reply_authors_avg": [ 801.25, 463.04771622371703 ], "reply_reviewers_avg": [ 2.0, 1.8708286933869707 ], "reply_authors_avg": [ 4.0, 1.4142135623730951 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2930135845166638438&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of British Columbia;IBM;University of Massachusetts Amherst;International Business Machines Corporation", "aff_unique_dep": ";IBM Research;;", "aff_unique_url": "https://www.ubc.ca;https://www.ibm.com/research;https://www.umass.edu;https://www.ibm.com", "aff_unique_abbr": "UBC;IBM;UMass Amherst;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Bayesian Modeling and Uncertainty Quantification for Learning to Optimize: What, Why, and How", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6796", "id": "EVVadRFRgL7", "poster": "", "openreview": "https://openreview.net/forum?id=EVVadRFRgL7", "slides": "https://iclr.cc/virtual/2022/poster/6796", "video": "https://iclr.cc/virtual/2022/poster/6796", "author_site": "Yuning You, Yue Cao, Tianlong Chen, Zhangyang Wang, Yang Shen", "tldr": "", "abstract": "Optimizing an objective function with uncertainty awareness is well-known to improve the accuracy and confidence of optimization solutions. Meanwhile, another relevant but very different question remains yet open: how to model and quantify the uncertainty of an optimization algorithm (a.k.a., optimizer) itself? To close such a gap, the prerequisite is to consider the optimizers as sampled from a distribution, rather than a few prefabricated and fixed update rules. We first take the novel angle to consider the algorithmic space of optimizers, and provide definitions for the optimizer prior and likelihood, that intrinsically determine the posterior and therefore uncertainty. We then leverage the recent advance of learning to optimize (L2O) for the space parameterization, with the end-to-end training pipeline built via variational inference, referred to as uncertainty-aware L2O (UA-L2O). Our study represents the first effort to recognize and quantify the uncertainty of the optimization algorithm. The extensive numerical results show that, UA-L2O achieves superior uncertainty calibration with accurate confidence estimation and tight confidence intervals, suggesting the improved posterior estimation thanks to considering optimizer uncertainty. Intriguingly, UA-L2O even improves optimization performances for two out of three test functions, the loss function in data privacy attack, and four of five cases of the energy function in protein docking. 
Our codes are released at https://github.com/Shen-Lab/Bayesian-L2O.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/b9de7afbe02ec0247ab7d1580b00e7df501eed0c.zip", "author": "Yuning You;Yue Cao;Tianlong Chen;Zhangyang Wang;Yang Shen", "authorids": "~Yuning_You1;~Yue_Cao4;~Tianlong_Chen1;~Zhangyang_Wang1;~Yang_Shen4", "gender": "M;M;M;M;", "homepage": "https://yyou1996.github.io/;;https://tianlong-chen.github.io;https://vita-group.github.io;https://shen-lab.github.io/", "dblp": "240/8556;;;119/4026;95/5308-1.html", "google_scholar": "Pv-V2igAAAAJ;Q0f5JRAAAAAJ;LE3ctn0AAAAJ;pxFyKAIAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0001-7774-8197;;0000-0002-1703-7796", "linkedin": ";;tianlong-chen-783862167/;;", "or_profile": "~Yuning_You1;~Yue_Cao4;~Tianlong_Chen1;~Zhangyang_Wang1;~Yang_Shen4", "aff": "Texas A&M University;;University of Texas, Austin;University of Texas, Austin;Texas A&M University - College Station", "aff_domain": "tamu.edu;;utexas.edu;utexas.edu;tamu.edu", "position": "PhD student;;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nyou2022bayesian,\ntitle={Bayesian Modeling and Uncertainty Quantification for Learning to Optimize: What, Why, and How},\nauthor={Yuning You and Yue Cao and Tianlong Chen and Zhangyang Wang and Yang Shen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EVVadRFRgL7}\n}", "github": "", "project": "", "reviewers": "x9M2;vJE3;Q3k1;3nHB", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;4;4", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "34;47;106;133", "wc_summary_review": "25;67;19;13", "wc_main_review": "261;493;319;860", "wc_review": "320;607;444;1006", "wc_reply_reviewers": "35;0;8;267", "wc_reply_authors": "951;821;562;1469", "reply_reviewers": "1;0;1;2", "reply_authors": "3;3;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.0, 40.89621009335706 ], "wc_summary_review_avg": [ 31.0, 21.213203435596427 ], "wc_main_review_avg": [ 483.25, 233.67110968196303 ], "wc_review_avg": [ 594.25, 258.5965728697888 ], "wc_reply_reviewers_avg": [ 77.5, 110.17372645054718 ], "wc_reply_authors_avg": [ 950.75, 330.35766602275174 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8991214620654684563&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=EVVadRFRgL7", "email": "tamu.edu;;utexas.edu;utexas.edu;tamu.edu", "author_num": 5, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Texas A&M University;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.tamu.edu;https://www.utexas.edu", "aff_unique_abbr": "TAMU;UT Austin", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Austin;College Station", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "EVqFdCB5PfV", "title": "Iterative Hierarchical Attention for Answering Complex Questions over Long Documents", "track": 
"main", "status": "Reject", "tldr": "", "abstract": "We propose a new model, DocHopper, that iteratively attends to different parts of long, hierarchically structured documents to answer complex questions. Similar to multi-hop question-answering (QA) systems, at each step, DocHopper uses a query q to attend to information from a document, combines this \u201cretrieved\u201d information with q to produce the next query. However, in contrast to most previous multi-hop QA systems, DocHopper is able to \u201cretrieve\u201d either short passages or long sections of the document, thus emulating a multi-step process of \u201cnavigating\u201d through a long document to answer a question. To enable this novel behavior, DocHopper does not combine document information with q by concatenating text to the text of q, but by combining a compact neural representation of q with a compact neural representation of a hierarchical part of the document -- potentially a large part. We experiment with DocHopper on four different QA tasks that require reading long and complex documents to answer multi-hop questions, and show that DocHopper outperforms all baseline models and achieves state-of-the-art results on all datasets. Additionally, DocHopper is efficient at inference time, being 3 - 10 times faster than the baselines.", "keywords": "Question Answering;Natural Language Processing;Attention Methods", "primary_area": "", "supplementary_material": "", "author": "Haitian Sun;William W. Cohen;Ruslan Salakhutdinov", "authorids": "~Haitian_Sun2;~William_W._Cohen2;~Ruslan_Salakhutdinov1", "gender": "M;M;M", "homepage": ";https://wwcohen.github.io/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "185/6000;c/WWCohen.html;", "google_scholar": "o7-PJu8AAAAJ;8ys-38kAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Haitian_Sun2;~William_W._Cohen2;~Russ_Salakhutdinov1", "aff": "School of Computer Science, Carnegie Mellon University;Google DeepMind;School of Computer Science, Carnegie Mellon University", "aff_domain": "cs.cmu.edu;google.com;cs.cmu.edu", "position": "PhD student;Principle Scientist;Full Professor", "bibtex": "@misc{\nsun2022iterative,\ntitle={Iterative Hierarchical Attention for Answering Complex Questions over Long Documents},\nauthor={Haitian Sun and William W. 
Cohen and Ruslan Salakhutdinov},\nyear={2022},\nurl={https://openreview.net/forum?id=EVqFdCB5PfV}\n}", "github": "", "project": "", "reviewers": "XkRC;qv3L;zXR3;Ld3M", "site": "https://openreview.net/forum?id=EVqFdCB5PfV", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;3", "correctness": "2;2;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "189;135;94;113", "wc_summary_review": "125;6;38;56", "wc_main_review": "271;245;597;404", "wc_review": "585;386;729;573", "wc_reply_reviewers": "0;101;0;0", "wc_reply_authors": "350;261;596;256", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 132.75, 35.56947427219019 ], "wc_summary_review_avg": [ 56.25, 43.54523510098436 ], "wc_main_review_avg": [ 379.25, 139.43524482712397 ], "wc_review_avg": [ 568.25, 121.81825602100861 ], "wc_reply_reviewers_avg": [ 25.25, 43.73428289111415 ], "wc_reply_authors_avg": [ 365.75, 138.094849650521 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8688740771422916548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": "School of Computer Science;Google DeepMind", "aff_unique_url": "https://www.cmu.edu;https://deepmind.com", "aff_unique_abbr": "CMU;DeepMind", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Efficient Token Mixing for Transformers via Adaptive Fourier Neural Operators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6073", "id": "EXHG-A3jlM", "poster": "", "openreview": "https://openreview.net/forum?id=EXHG-A3jlM", "slides": "https://iclr.cc/virtual/2022/poster/6073", "video": "https://iclr.cc/virtual/2022/poster/6073", "author_site": "John Guibas, Morteza Mardani, Zongyi Li, Andrew Tao, Anima Anandkumar, Bryan Catanzaro", "tldr": "", "abstract": "Vision transformers have delivered tremendous success in representation learning. This is primarily due to effective token mixing through self-attention. However, this scales quadratically with the number of pixels, which becomes infeasible for high-resolution inputs. To cope with this challenge, we propose Adaptive Fourier Neural Operator (AFNO) as an efficient token mixer that learns to mix in the Fourier domain. AFNO is based on a principled foundation of operator learning which allows us to frame token mixing as a continuous global convolution without any dependence on the input resolution. This principle was previously used to design FNO, which solves global convolution efficiently in the Fourier domain and has shown promise in learning challenging PDEs. To handle challenges in visual representation learning such as discontinuities in images and high-resolution inputs, we propose principled architectural modifications to FNO which result in memory and computational efficiency. 
This includes imposing a block-diagonal structure on the channel mixing weights, adaptively sharing weights across tokens, and sparsifying the frequency modes via soft-thresholding and shrinkage. The resulting model is highly parallel with a quasi-linear complexity and has linear memory in the sequence size. AFNO outperforms self-attention mechanisms for few-shot segmentation in terms of both efficiency and accuracy. For Cityscapes segmentation with the Segformer-B3 backbone, AFNO can handle a sequence size of 65k and outperforms other efficient self-attention mechanisms.", "keywords": "self attention;linear complexity;high-resolution inputs;operator learning;Fourier transform", "primary_area": "", "supplementary_material": "", "author": "John Guibas;Morteza Mardani;Zongyi Li;Andrew Tao;Anima Anandkumar;Bryan Catanzaro", "authorids": "~John_Guibas1;~Morteza_Mardani1;~Zongyi_Li1;~Andrew_Tao1;~Anima_Anandkumar1;~Bryan_Catanzaro1", "gender": "M;M;M;M;M;F", "homepage": ";http://web.stanford.edu/~morteza/;https://zongyi-li.github.io;;https://ctnzr.io;http://tensorlab.cms.caltech.edu/users/anima/", "dblp": ";74/258;;210/2645;14/4826;", "google_scholar": "4UvZdF8AAAAJ;H7edsyEAAAAJ;;Wel9l1wAAAAJ;UZ6kI2AAAAAJ;bEcLezcAAAAJ", "orcid": ";;;;0000-0003-0034-7728;", "linkedin": ";;;;bryancatanzaro/;anima-anandkumar-35171b1/", "or_profile": "~John_Guibas1;~Morteza_Mardani1;~Zongyi_Li1;~Andrew_Tao1;~Bryan_Catanzaro1;~anima_anandkumar1", "aff": "Stanford University;NVIDIA;California Institute of Technology;NVIDIA Corporation;NVIDIA;California Institute of Technology", "aff_domain": "stanford.edu;nvidia.com;caltech.edu;nvidia.com;nvidia.com;caltech.edu", "position": "Undergrad student;Principal Researcher;PhD student;Vice President of Applied Research;Vice President;Full Professor", "bibtex": "@inproceedings{\nguibas2022efficient,\ntitle={Efficient Token Mixing for Transformers via Adaptive Fourier Neural Operators},\nauthor={John Guibas and Morteza Mardani and Zongyi Li and Andrew Tao and Anima Anandkumar and Bryan Catanzaro},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EXHG-A3jlM}\n}", "github": "", "project": "", "reviewers": "PwZa;UFjk;H7fs", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;3", "correctness": "4;1;3", "technical_novelty": "3;2;3", "empirical_novelty": "4;2;2", "wc_summary_paper": "79;84;80", "wc_summary_review": "13;13;29", "wc_main_review": "189;197;184", "wc_review": "281;294;293", "wc_reply_reviewers": "0;52;0", "wc_reply_authors": "439;517;476", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 81.0, 2.160246899469287 ], "wc_summary_review_avg": [ 18.333333333333332, 7.542472332656507 ], "wc_main_review_avg": [ 190.0, 5.354126134736337 ], "wc_review_avg": [ 289.3333333333333, 5.90668171555645 ], "wc_reply_reviewers_avg": [ 17.333333333333332, 24.513035081133648 ], "wc_reply_authors_avg": [ 477.3333333333333, 31.857320805254304 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9999999999999998, 
"corr_recommendation_correctness": 0.1889822365046136, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17093350371004881402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=EXHG-A3jlM", "email": "stanford.edu;nvidia.com;caltech.edu;nvidia.com;nvidia.com;caltech.edu", "author_num": 6, "aff_unique_index": "0;1;2;1;1;2", "aff_unique_norm": "Stanford University;NVIDIA;California Institute of Technology", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com;https://www.caltech.edu", "aff_unique_abbr": "Stanford;NVIDIA;Caltech", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Stanford;;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "EXe93Md8RqS", "title": "Data Quality Matters For Adversarial Training: An Empirical Study", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multiple intriguing problems are hovering in adversarial training, including robust overfitting, robustness overestimation, and robustness-accuracy trade-off. These problems pose great challenges to both reliable evaluation and practical deployment. Here, we empirically show that these problems share one common cause --- low-quality samples in the dataset. Specifically, we first propose a strategy to measure the data quality based on the learning behaviors of the data during adversarial training and find that low-quality data may not be useful and even detrimental to the adversarial robustness. We then design controlled experiments to investigate the interconnections between data quality and problems in adversarial training. We find that when low-quality data is removed, robust overfitting and robustness overestimation can be largely alleviated; and robustness-accuracy trade-off becomes less significant. These observations not only verify our intuition about data quality but may also open new opportunities to advance adversarial training. 
", "keywords": "Adversarial training;Data quality;Robust overfitting;Robustness overestimation;Robustness-accuracy trade-off", "primary_area": "", "supplementary_material": "/attachment/860d67c85b465b8caa2b5a697d28833e0bda6b2a.zip", "author": "Chengyu Dong;Liyuan Liu;Jingbo Shang", "authorids": "~Chengyu_Dong1;~Liyuan_Liu3;~Jingbo_Shang2", "gender": ";M;M", "homepage": "https://www.chengyu-dong.me/;https://shangjingbo1226.github.io/;https://liyuanlucasliu.github.io/", "dblp": "14/3155;151/3145.html;06/1624", "google_scholar": "Ppfi7j0AAAAJ;0SkFI4MAAAAJ;RmvbkzYAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chengyu_Dong1;~Jingbo_Shang2;~Liyuan_Liu1", "aff": "University of California, San Diego;University of California, San Diego;University of Illinois, Urbana Champaign", "aff_domain": "ucsd.edu;ucsd.edu;illinois.edu", "position": "PhD student;Assistant Professor;PhD student", "bibtex": "@misc{\ndong2022data,\ntitle={Data Quality Matters For Adversarial Training: An Empirical Study},\nauthor={Chengyu Dong and Liyuan Liu and Jingbo Shang},\nyear={2022},\nurl={https://openreview.net/forum?id=EXe93Md8RqS}\n}", "github": "", "project": "", "reviewers": "mNG6;soLM;Rp8o;VxQb", "site": "https://openreview.net/forum?id=EXe93Md8RqS", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;5;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "4;2;3;2", "wc_summary_paper": "84;63;159;90", "wc_summary_review": "50;32;69;110", "wc_main_review": "428;153;310;534", "wc_review": "562;248;538;734", "wc_reply_reviewers": "5;181;0;103", "wc_reply_authors": "289;440;388;1261", "reply_reviewers": "1;1;0;1", "reply_authors": "1;2;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 99.0, 36.062445840513924 ], "wc_summary_review_avg": [ 65.25, 28.960101864461734 ], "wc_main_review_avg": [ 356.25, 141.591622280416 ], "wc_review_avg": [ 520.5, 174.54727153410334 ], "wc_reply_reviewers_avg": [ 72.25, 75.0245793057182 ], "wc_reply_authors_avg": [ 594.5, 388.608093070641 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5812138803761852858&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, San Diego;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://illinois.edu", "aff_unique_abbr": "UCSD;UIUC", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "San Diego;Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "EYCm0AFjaSS", "title": "ZerO Initialization: Initializing Residual Networks with only Zeros and Ones", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks are usually initialized with random weights, with adequately selected initial variance to ensure stable signal propagation during training. However, there is no consensus on how to select the variance, and this becomes challenging especially as the number of layers grows. 
In this work, we replace the widely used random weight initialization with a fully deterministic initialization scheme ZerO, which initializes residual networks with only zeros and ones. By augmenting the standard ResNet architectures with a few extra skip connections and Hadamard transforms, ZerO allows us to start the training from zeros and ones entirely. This has many benefits such as improving reproducibility (by reducing the variance over different experimental runs) and allowing network training without batch normalization. Surprisingly, we find that ZerO achieves state-of-the-art performance over various image classification datasets, including ImageNet, which suggests random weights may be unnecessary for modern network initialization.", "keywords": "weight initialization;deep residual network;deterministic initialization;optimization", "primary_area": "", "supplementary_material": "", "author": "Jiawei Zhao;Florian Tobias Schaefer;Anima Anandkumar", "authorids": "~Jiawei_Zhao2;~Florian_Tobias_Schaefer1;~Anima_Anandkumar1", "gender": "M;M;F", "homepage": "https://jiaweizhao.com/;https://github.com/f-t-s;http://tensorlab.cms.caltech.edu/users/anima/", "dblp": ";39/9203;", "google_scholar": ";8piLNEUAAAAJ;bEcLezcAAAAJ", "orcid": ";0000-0002-4891-0172;", "linkedin": ";;anima-anandkumar-35171b1/", "or_profile": "~Jiawei_Zhao2;~Florian_Tobias_Schaefer1;~anima_anandkumar1", "aff": "California Institute of Technology;Georgia Institute of Technology;California Institute of Technology", "aff_domain": "caltech.edu;gatech.edu;caltech.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nzhao2022zero,\ntitle={ZerO Initialization: Initializing Residual Networks with only Zeros and Ones},\nauthor={Jiawei Zhao and Florian Tobias Schaefer and Anima Anandkumar},\nyear={2022},\nurl={https://openreview.net/forum?id=EYCm0AFjaSS}\n}", "github": "", "project": "", "reviewers": "RE44;8Znp;P5yd;PQDc;9bJj", "site": "https://openreview.net/forum?id=EYCm0AFjaSS", "pdf_size": 0, "recommendation": "5;5;5;5;6", "confidence": "3;3;4;4;3", "correctness": "4;2;3;3;4", "technical_novelty": "2;3;3;1;3", "empirical_novelty": "2;3;2;2;3", "wc_summary_paper": "70;70;114;18;64", "wc_summary_review": "15;23;63;51;22", "wc_main_review": "357;218;454;582;98", "wc_review": "442;311;631;651;184", "wc_reply_reviewers": "55;0;0;204;0", "wc_reply_authors": "768;583;1076;1923;275", "reply_reviewers": "1;0;0;2;0", "reply_authors": "2;1;2;4;1", "recommendation_avg": [ 5.2, 0.39999999999999997 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 67.2, 30.452586097078846 ], "wc_summary_review_avg": [ 34.8, 18.723247581549522 ], "wc_main_review_avg": [ 341.8, 170.45867534390848 ], "wc_review_avg": [ 443.8, 180.61605687202896 ], "wc_reply_reviewers_avg": [ 51.8, 79.02505931664967 ], "wc_reply_authors_avg": [ 925.0, 562.6576223601703 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 2.0, 1.0954451150103321 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.5345224838248488, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1242816793364277064&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "California Institute of Technology;Georgia 
Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;https://www.gatech.edu", "aff_unique_abbr": "Caltech;Georgia Tech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ClimateGAN: Raising Climate Change Awareness by Generating Images of Floods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6224", "id": "EZNOb_uNpJk", "poster": "", "openreview": "https://openreview.net/forum?id=EZNOb_uNpJk", "slides": "https://iclr.cc/virtual/2022/poster/6224", "video": "https://iclr.cc/virtual/2022/poster/6224", "author_site": "Victor Schmidt, Alexandra Luccioni, M\u00e9lisande Teng, Tianyu Zhang, Alexia Reynaud, Sunand Raghupathi, Gautier Cosne, Adrien Juraver, Vahe Vardanyan, Alex Hernandez-Garcia, Yoshua Bengio", "tldr": "", "abstract": "Climate change is a major threat to humanity and the actions required to prevent its catastrophic consequences include changes in both policy-making and individual behaviour. However, taking action requires understanding its seemingly abstract and distant consequences. Projecting the potential impacts of extreme climate events such as flooding in familiar places can help make the impacts of climate change more concrete and encourage action. As part of a larger initiative to build a website (https://thisclimatedoesnotexist.com) that projects extreme climate events onto user-chosen photos, we present our solution to simulate photo-realistic floods on authentic images. To address this complex task in the absence of suitable data, we propose ClimateGAN, a model that leverages both simulated and real data through unsupervised domain adaptation and conditional image generation. 
In this paper, we describe the details of our framework, thoroughly evaluate the main components of our architecture and demonstrate that our model is capable of robustly generating photo-realistic flooding on street images.", "keywords": "GAN;Climate Change;Domain Adaptation;Representation Learning;Computer Vision;Application", "primary_area": "", "supplementary_material": "/attachment/e35ffdfb3f901392fc63cf210a0634fe93414b4b.zip", "author": "Victor Schmidt;Alexandra Luccioni;M\u00e9lisande Teng;Tianyu Zhang;Alexia Reynaud;Sunand Raghupathi;Gautier Cosne;Adrien Juraver;Vahe Vardanyan;Alex Hern\u00e1ndez-Garc\u00eda;Yoshua Bengio", "authorids": "~Victor_Schmidt2;~Alexandra_Luccioni1;~M\u00e9lisande_Teng1;tianyu.zhang@mila.quebec;alexia.reynaud@polymtl.ca;~Sunand_Raghupathi1;cosne.gautier@gmail.com;a.juraver@gmail.com;vardanyan.vahe@gmail.com;~Alex_Hern\u00e1ndez-Garc\u00eda1;~Yoshua_Bengio1", "gender": "M;F;F;;;M;;;;;M", "homepage": "https://vict0rs.ch;http://sashaluccioni.com/;;;;https://mila.quebec/en/person/sunand-raghupathi/;;;;https://alexhernandezgarcia.github.io;http://yoshuabengio.org", "dblp": ";162/5449;;;;;;;;213/8573;56/953", "google_scholar": "https://scholar.google.fr/citations?user=mKLme1kAAAAJ;;eUNoxBMAAAAJ;;;;;;;f8vQCOAAAAAJ;kukA0LcAAAAJ", "orcid": ";0000-0001-6238-7050;;;;http://orcid.org/0000-0002-8061-076X;;;;;", "linkedin": ";alexandraluccioniphd/;;;;sunand-raghupathi?trk=public_profile_browsemap_profile-result-card_result-card_full-click;;;;;yoshuabengio/?originalSubdomain=ca", "or_profile": "~Victor_Schmidt2;~Alexandra_Luccioni1;~M\u00e9lisande_Teng1;tianyu.zhang@mila.quebec;alexia.reynaud@polymtl.ca;~Sunand_Raghupathi1;cosne.gautier@gmail.com;a.juraver@gmail.com;vardanyan.vahe@gmail.com;~Alex_Hern\u00e1ndez-Garc\u00eda1;~Yoshua_Bengio1", "aff": "University of Montreal;Hugging Face;Mila - Quebec Artificial Intelligence Institute;;;;;;;Universit\u00e9 de Montr\u00e9al;University of Montreal", "aff_domain": "umontreal.ca;huggingface.co;mila.quebec;;;;;;;umontreal.ca;umontreal.ca", "position": "PhD student;Researcher;PhD student;;;;;;;Postdoc;Full Professor", "bibtex": "@inproceedings{\nschmidt2022climategan,\ntitle={Climate{GAN}: Raising Climate Change Awareness by Generating Images of Floods},\nauthor={Victor Schmidt and Alexandra Luccioni and M{\\'e}lisande Teng and Tianyu Zhang and Alexia Reynaud and Sunand Raghupathi and Gautier Cosne and Adrien Juraver and Vahe Vardanyan and Alex Hern{\\'a}ndez-Garc{\\'\\i}a and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EZNOb_uNpJk}\n}", "github": "", "project": "", "reviewers": "TBDz;PcPf;6bHT", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "71;145;34", "wc_summary_review": "48;39;20", "wc_main_review": "706;226;71", "wc_review": "825;410;125", "wc_reply_reviewers": "276;205;0", "wc_reply_authors": "3939;1934;349", "reply_reviewers": "2;1;0", "reply_authors": "7;4;2", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 83.33333333333333, 46.14710777021194 ], "wc_summary_review_avg": [ 35.666666666666664, 11.67142760000773 ], "wc_main_review_avg": [ 334.3333333333333, 
270.31874189967334 ], "wc_review_avg": [ 453.3333333333333, 287.41182222649707 ], "wc_reply_reviewers_avg": [ 160.33333333333334, 117.01946656670229 ], "wc_reply_authors_avg": [ 2074.0, 1468.9508727886941 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 4.333333333333333, 2.0548046676563256 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15878188522352807376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=EZNOb_uNpJk", "email": "umontreal.ca;huggingface.co;mila.quebec;;;;;;;umontreal.ca;umontreal.ca", "author_num": 11, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Montreal;Hugging Face;Quebec Artificial Intelligence Institute;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": ";;Artificial Intelligence;", "aff_unique_url": "https://wwwumontreal.ca;https://huggingface.co;https://mila.quebec;https://www.umontreal.ca", "aff_unique_abbr": "UM;Hugging Face;Mila;UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Canada;United States" }, { "title": "Trust Region Policy Optimisation in Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6244", "id": "EcGGFkNTxdJ", "poster": "", "openreview": "https://openreview.net/forum?id=EcGGFkNTxdJ", "slides": "https://iclr.cc/virtual/2022/poster/6244", "video": "https://iclr.cc/virtual/2022/poster/6244", "author_site": "Jakub Grudzien Kuba, Ruiqing Chen, Muning Wen, Ying Wen, Fanglei Sun, Jun Wang, Yaodong Yang", "tldr": "", "abstract": "Trust region methods rigorously enabled reinforcement learning (RL) agents to learn monotonically improving policies, leading to superior performance on a variety of tasks. Unfortunately, when it comes to multi-agent reinforcement learning (MARL), the property of monotonic improvement may not simply apply; this is because agents, even in cooperative games, could have conflicting directions of policy updates. As a result, achieving a guaranteed improvement on the joint policy where each agent acts individually remains an open challenge. In this paper, we extend the theory of trust region learning to MARL. Central to our findings are the multi-agent advantage decomposition lemma and the sequential policy update scheme. Based on these, we develop Heterogeneous-Agent Trust Region Policy Optimisation (HATPRO) and Heterogeneous-Agent Proximal Policy Optimisation (HAPPO) algorithms. Unlike many existing MARL algorithms, HATRPO/HAPPO do not need agents to share parameters, nor do they need any restrictive assumptions on decomposibility of the joint value function. Most importantly, we justify in theory the monotonic improvement property of HATRPO/HAPPO. We evaluate the proposed methods on a series of Multi-Agent MuJoCo and StarCraftII tasks. Results show that HATRPO and HAPPO significantly outperform strong baselines such as IPPO, MAPPO and MADDPG on all tested tasks, thereby establishing a new state of the art. 
", "keywords": "Multi-Agent Reinforcement Learning;trust-region method;policy gradient method", "primary_area": "", "supplementary_material": "", "author": "Jakub Grudzien Kuba;Ruiqing Chen;Muning Wen;Ying Wen;Fanglei Sun;Jun Wang;Yaodong Yang", "authorids": "~Jakub_Grudzien_Kuba1;~Ruiqing_Chen1;~Muning_Wen2;~Ying_Wen1;~Fanglei_Sun1;~Jun_Wang2;~Yaodong_Yang1", "gender": ";;M;M;F;M;M", "homepage": ";;https://github.com/morning9393;https://yingwen.io;;http://www0.cs.ucl.ac.uk/staff/jun.wang/;https://www.yangyaodong.com", "dblp": ";;295/0261;41/4203-1;;w/JunWang12;170/1496-1", "google_scholar": ";;Zt1WFtQAAAAJ;_A1CxG8AAAAJ;VugNoHkAAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ", "orcid": ";;0009-0000-7868-1262;0000-0003-1247-2382;;;0000-0001-8132-5613", "linkedin": ";;;wenying45;;;yaodong-yang", "or_profile": "~Jakub_Grudzien_Kuba1;~Ruiqing_Chen1;~Muning_Wen2;~Ying_Wen1;~Fanglei_Sun1;~Jun_Wang2;~Yaodong_Yang1", "aff": ";;Shanghai Jiaotong University;Shanghai Jiaotong University;ShanghaiTech;University College London;King's College London", "aff_domain": ";;sjtu.edu.cn;sjtu.edu.cn;shanghaitech.edu.cn;ucl.ac.uk;kcl.ac.uk", "position": ";;PhD student;Assistant Professor;Researcher;Professor;Assistant Professor", "bibtex": "@inproceedings{\nkuba2022trust,\ntitle={Trust Region Policy Optimisation in Multi-Agent Reinforcement Learning},\nauthor={Jakub Grudzien Kuba and Ruiqing Chen and Muning Wen and Ying Wen and Fanglei Sun and Jun Wang and Yaodong Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EcGGFkNTxdJ}\n}", "github": "", "project": "", "reviewers": "be2v;ubwY;ZAQm;WgFJ", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "79;84;46;73", "wc_summary_review": "34;40;24;18", "wc_main_review": "529;372;213;631", "wc_review": "642;496;283;722", "wc_reply_reviewers": "0;42;16;67", "wc_reply_authors": "1667;834;813;963", "reply_reviewers": "0;1;1;1", "reply_authors": "3;2;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 70.5, 14.67140075112121 ], "wc_summary_review_avg": [ 29.0, 8.54400374531753 ], "wc_main_review_avg": [ 436.25, 158.507689087943 ], "wc_review_avg": [ 535.75, 166.91371273804918 ], "wc_reply_reviewers_avg": [ 31.25, 25.508576988926684 ], "wc_reply_authors_avg": [ 1069.25, 349.8573816571547 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 329, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6957916250368924359&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=EcGGFkNTxdJ", "email": ";;sjtu.edu.cn;sjtu.edu.cn;shanghaitech.edu.cn;ucl.ac.uk;kcl.ac.uk", "author_num": 7, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "Shanghai Jiao Tong University;ShanghaiTech University;University College London;King's College London", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.sjtu.edu.cn;https://www.shanghaitech.edu.cn;https://www.ucl.ac.uk;https://www.kcl.ac.uk", "aff_unique_abbr": "SJTU;ShanghaiTech;UCL;KCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "China;United Kingdom" }, { "id": "Eceabn-Spyz", "title": "Generalizable Learning to Optimize into Wide Valleys", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning to optimize (L2O) has gained increasing popularity in various optimization tasks, since classical optimizers usually require laborious, problem-specific design and hyperparameter tuning. However, current L2O approaches are designed for fast minimization of the objective function value (i.e., training error), hence often suffering from poor generalization ability such as in training deep neural networks (DNNs), including ($i$) disappointing performance across unseen optimizees $\\textit{(optimizer generalization)}$; ($ii$) unsatisfactory test-set accuracy of trained DNNs ($\\textit{optmizee generalization}$). To overcome the limitations, this paper introduces $\\textit{flatness-aware}$ regularizers into L2O for shaping the local geometry of optimizee's loss landscape. Specifically, it guides optimizee to locate well-generalizable minimas in large flat regions of loss surface, while tending to avoid sharp valleys. Such optimizee generalization abilities of $\\textit{flatness-aware}$ regularizers have been proved theoretically. Extensive experiments consistently validate the effectiveness of our proposals with substantially improved generalization on multiple sophisticated L2O models and diverse optimizees. Our theoretical and empirical results solidify the foundation for L2O's practically usage. All codes and pre-trained models will be shared upon acceptance.", "keywords": "L2O;Generalization;Flatness;Entropy-SGD", "primary_area": "", "supplementary_material": "/attachment/d71cafd91df4133a71836612a37c9da44de43495.zip", "author": "Junjie Yang;Tianlong Chen;Mingkang Zhu;Fengxiang He;Dacheng Tao;Yingbin Liang;Zhangyang Wang", "authorids": "~Junjie_Yang2;~Tianlong_Chen1;~Mingkang_Zhu1;~Fengxiang_He1;~Dacheng_Tao1;~Yingbin_Liang1;~Zhangyang_Wang1", "gender": "M;M;;;;F;M", "homepage": "https://sites.google.com/view/junjieyang;https://tianlong-chen.github.io;;https://fengxianghe.github.io/;;https://sites.google.com/view/yingbinliang/home;https://vita-group.github.io", "dblp": ";;290/8807;225/4682;;51/332;119/4026", "google_scholar": "https://scholar.google.com/citations?hl=en;LE3ctn0AAAAJ;;QSx-Yu0AAAAJ;;lGgLAiIAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0001-7774-8197;;;;;", "linkedin": ";tianlong-chen-783862167/;mingkang-zhu-96705a1b9/;fengxiang-he-35b173122;;;", "or_profile": "~Junjie_Yang2;~Tianlong_Chen1;~Mingkang_Zhu1;~Fengxiang_He1;~Dacheng_Tao1;~Yingbin_Liang1;~Zhangyang_Wang1", "aff": "Ohio State University;University of Texas, Austin;University of Texas, Austin;JD.com, Inc.;;The Ohio State University;University of Texas, Austin", "aff_domain": "osu.edu;utexas.edu;utexas.edu;jd.com;;osu.edu;utexas.edu", "position": "PhD student;PhD student;Undergrad student;Algorithm Scientist;;Professor;Assistant Professor", "bibtex": "@misc{\nyang2022generalizable,\ntitle={Generalizable Learning to Optimize into Wide Valleys},\nauthor={Junjie Yang and Tianlong Chen and Mingkang Zhu and Fengxiang He and Dacheng Tao and Yingbin Liang and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=Eceabn-Spyz}\n}", "github": "", "project": "", "reviewers": 
"qg5Z;wp2u;ZY6B;6eeZ", "site": "https://openreview.net/forum?id=Eceabn-Spyz", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;0;2;2", "wc_summary_paper": "49;98;90;78", "wc_summary_review": "54;13;68;56", "wc_main_review": "442;309;519;326", "wc_review": "545;420;677;460", "wc_reply_reviewers": "46;0;0;0", "wc_reply_authors": "1322;658;924;606", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 78.75, 18.59267328815305 ], "wc_summary_review_avg": [ 47.75, 20.765054779605084 ], "wc_main_review_avg": [ 399.0, 86.13651954891142 ], "wc_review_avg": [ 525.5, 98.42890835521848 ], "wc_reply_reviewers_avg": [ 11.5, 19.91858428704209 ], "wc_reply_authors_avg": [ 877.5, 283.56436659072665 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:anSAFOzx3SwJ:scholar.google.com/&scioq=Generalizable+Learning+to+Optimize+into+Wide+Valleys&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;0;1", "aff_unique_norm": "Ohio State University;University of Texas at Austin;JD.com", "aff_unique_dep": ";;", "aff_unique_url": "https://www.osu.edu;https://www.utexas.edu;https://www.jd.com", "aff_unique_abbr": "OSU;UT Austin;JD.com", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "Ee2ugKwgvyy", "title": "Graph Information Matters: Understanding Graph Filters from Interaction Probability", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have received extensive affirmation for their promising performance in graph learning problems. Despite their various neural architectures, most are intrinsically graph filters that provide theoretical foundations for model explanations. In particular, low-pass filters show superiority in label prediction in many benchmarks. However, recent empirical research suggests that models with only low pass filters do not always perform well. Although increasing attempts to understand graph filters, it is unclear how a particular graph affects the performance of different filters. In this paper, we carry out a comprehensive theoretical analysis of the synergy of graph structure and node features on graph filters\u2019 behaviors in node classification, relying on the introduction of interaction probability and frequency distribution. We show that the homophily degree of graphs significantly affects the prediction error of graph filters. Our theory provides a guideline for graph filter design in a data-driven manner. Since it is hard for a single graph filter to live up to this, we propose a general strategy for exploring a data-specified filter bank. Experimental results show that our model achieves consistent and significant performance improvements across all benchmarks. 
Furthermore, we empirically validate our theoretical analysis and explain the behavior of baselines and our model.", "keywords": "Node classification;graph filters;homophily degree;interaction probability;frequency distribution;filter bank;spectral graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Zhixian Chen;Tengfei Ma;Yang Wang", "authorids": "~Zhixian_Chen1;~Tengfei_Ma1;~Yang_Wang25", "gender": "M;M;F", "homepage": "https://sites.google.com/site/matf0123/;http://www.math.ust.hk/~yangwang;https://github.com/Sherczxk", "dblp": "94/9023-1;;", "google_scholar": "9OvNakkAAAAJ;;", "orcid": "0000-0002-1086-529X;0000-0002-8903-2388;", "linkedin": ";;", "or_profile": "~Tengfei_Ma1;~Yang_Wang25;~Chen_Zhixian1", "aff": "International Business Machines;Hong Kong University of Science and Technology;Hong Kong University of Science and Technology", "aff_domain": "ibm.com;hkust.edu.hk;ust.hk", "position": "Researcher;Full Professor;PhD student", "bibtex": "@misc{\nchen2022graph,\ntitle={Graph Information Matters: Understanding Graph Filters from Interaction Probability},\nauthor={Zhixian Chen and Tengfei Ma and Yang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=Ee2ugKwgvyy}\n}", "github": "", "project": "", "reviewers": "kKsu;RH1y;nP66", "site": "https://openreview.net/forum?id=Ee2ugKwgvyy", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;5", "correctness": "2;2;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;0;1", "wc_summary_paper": "84;79;108", "wc_summary_review": "71;22;440", "wc_main_review": "271;315;369", "wc_review": "426;416;917", "wc_reply_reviewers": "84;0;600", "wc_reply_authors": "891;992;1548", "reply_reviewers": "1;0;4", "reply_authors": "2;2;4", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 90.33333333333333, 12.657891697365017 ], "wc_summary_review_avg": [ 177.66666666666666, 186.5731908811005 ], "wc_main_review_avg": [ 318.3333333333333, 40.07770230717103 ], "wc_review_avg": [ 586.3333333333334, 233.8522800592051 ], "wc_reply_reviewers_avg": [ 228.0, 265.26967410542807 ], "wc_reply_authors_avg": [ 1143.6666666666667, 288.8648280578459 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.699673171197595 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7559289460184544, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KdNK1kbkRC8J:scholar.google.com/&scioq=Graph+Information+Matters:+Understanding+Graph+Filters+from+Interaction+Probability&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "International Business Machines Corporation;Hong Kong University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.ust.hk", "aff_unique_abbr": "IBM;HKUST", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "id": "EgkZwzEwciE", "title": "Adversarial Collaborative Learning on Non-IID Features", "track": "main", "status": "Reject", "tldr": "", "abstract": 
"Federated learning has been a popular approach to enable collaborative learning on multiple parties without exchanging raw data. However, the model performance of federated learning may degrade a lot due to non-IID data. While most existing studies focus on non-IID labels, federated learning on non-IID features has largely been overlooked. Different from typical federated learning approaches, the paper proposes a new learning concept called ADCOL (Adversarial Collaborative Learning) for non-IID features. Instead of adopting the widely used model-averaging scheme, ADCOL conducts training in an adversarial way: the server aims to train a discriminator to distinguish the representations of the parties, while the parties aim to generate a common representation distribution. Our experiments on three real-world datasets show that ADCOL achieves better accuracy and is much more communication-efficient than state-of-the-art federated learning algorithms on non-IID features. More importantly, ADCOL points out a promising research direction for collaborative learning.", "keywords": "Federated Learning;Collaborative Learning", "primary_area": "", "supplementary_material": "", "author": "Qinbin Li;Bingsheng He;Dawn Song", "authorids": "~Qinbin_Li1;~Bingsheng_He1;~Dawn_Song1", "gender": "M;M;F", "homepage": "https://qinbinli.com/;http://www.comp.nus.edu.sg/~hebs/;", "dblp": "225/9769;h/BingshengHe.html;s/DXSong", "google_scholar": "https://scholar.google.com.sg/citations?user=1EMOEqQAAAAJ;https://scholar.google.com.tw/citations?user=RogYLKYAAAAJ;", "orcid": ";0000-0001-8618-4581;", "linkedin": ";bingsheng-he-7734b131;", "or_profile": "~Qinbin_Li1;~Bingsheng_He1;~Dawn_Song1", "aff": "School of Computing, National University of Singapore;National University of Singapore;University of California, Berkeley", "aff_domain": "comp.nus.edu.sg;nus.edu.sg;berkeley.edu", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@misc{\nli2022adversarial,\ntitle={Adversarial Collaborative Learning on Non-{IID} Features},\nauthor={Qinbin Li and Bingsheng He and Dawn Song},\nyear={2022},\nurl={https://openreview.net/forum?id=EgkZwzEwciE}\n}", "github": "", "project": "", "reviewers": "tbKF;VD2k;3uWW;gd59", "site": "https://openreview.net/forum?id=EgkZwzEwciE", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "5;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "24;111;97;27", "wc_summary_review": "18;26;41;58", "wc_main_review": "243;271;202;407", "wc_review": "285;408;340;492", "wc_reply_reviewers": "255;0;0;0", "wc_reply_authors": "1223;706;299;551", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.75, 39.57508686029634 ], "wc_summary_review_avg": [ 35.75, 15.270478054075452 ], "wc_main_review_avg": [ 280.75, 76.91025614311786 ], "wc_review_avg": [ 381.25, 77.37368738789692 ], "wc_reply_reviewers_avg": [ 63.75, 110.41823898251593 ], "wc_reply_authors_avg": [ 694.75, 337.8079150937704 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7276068751089989, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 18, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=6413644133558844298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "National University of Singapore;University of California, Berkeley", "aff_unique_dep": "School of Computing;", "aff_unique_url": "https://www.nus.edu.sg;https://www.berkeley.edu", "aff_unique_abbr": "NUS;UC Berkeley", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Singapore;;Berkeley", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Singapore;United States" }, { "title": "PiCO: Contrastive Label Disambiguation for Partial Label Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6038", "id": "EhYjZy6e1gJ", "poster": "", "openreview": "https://openreview.net/forum?id=EhYjZy6e1gJ", "slides": "https://iclr.cc/virtual/2022/poster/6038", "video": "https://iclr.cc/virtual/2022/poster/6038", "author_site": "Haobo Wang, Ruixuan Xiao, Yixuan Li, Lei Feng, Gang Niu, Gang Chen, Junbo Zhao", "tldr": "", "abstract": "Partial label learning (PLL) is an important problem that allows each training example to be labeled with a coarse candidate set, which well suits many real-world data annotation scenarios with label ambiguity. Despite the promise, the performance of PLL often lags behind the supervised counterpart. In this work, we bridge the gap by addressing two key research challenges in PLL---representation learning and label disambiguation---in one coherent framework. Specifically, our proposed framework PiCO consists of a contrastive learning module along with a novel class prototype-based label disambiguation algorithm. PiCO produces closely aligned representations for examples from the same classes and facilitates label disambiguation. Theoretically, we show that these two components are mutually beneficial, and can be rigorously justified from an expectation-maximization (EM) algorithm perspective. Extensive experiments demonstrate that PiCO significantly outperforms the current state-of-the-art approaches in PLL and even achieves comparable results to fully supervised learning. 
Code and data available: https://github.com/hbzju/PiCO.", "keywords": "Partial Label Learning;Contrastive Learning;Prototype-based Disambiguation", "primary_area": "", "supplementary_material": "/attachment/23c0ecc1a1d5842f6f810a3a17d15f9016553246.zip", "author": "Haobo Wang;Ruixuan Xiao;Yixuan Li;Lei Feng;Gang Niu;Gang Chen;Junbo Zhao", "authorids": "~Haobo_Wang1;~Ruixuan_Xiao1;~Yixuan_Li1;~Lei_Feng1;~Gang_Niu1;~Gang_Chen6;~Junbo_Zhao1", "gender": "M;M;F;M;M;M;M", "homepage": "https://hbzju.github.io/;https://github.com/Justherozen;http://pages.cs.wisc.edu/~sharonli/;https://lfeng1995.github.io/;https://niug1984.github.io;;http://jakezhao.net/", "dblp": ";312/5605;144/6087-1;76/847-6;26/3367-1;67/6383-1;191/6665", "google_scholar": "DnN-rggAAAAJ;OLQeOJgAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;;8ipao8MAAAAJ", "orcid": "0000-0001-8586-3048;;;0000-0003-2839-5799;;0000-0002-7483-0045;", "linkedin": ";;liyixuan;;;;", "or_profile": "~Haobo_Wang1;~Ruixuan_Xiao1;~Yixuan_Li1;~Lei_Feng1;~Gang_Niu1;~Gang_Chen6;~Junbo_Zhao1", "aff": "Zhejiang University;Zhejiang University;Cornell University;Chongqing University;RIKEN;College of Computer Science and Technology, Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;cornell.edu;cqu.edu.cn;riken.jp;cs.zju.edu.cn;zju.edu.cn", "position": "PhD student;Undergrad student;Graduate Student;Full Professor;Research Scientist (tenured);Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2022pico,\ntitle={Pi{CO}: Contrastive Label Disambiguation for Partial Label Learning},\nauthor={Haobo Wang and Ruixuan Xiao and Yixuan Li and Lei Feng and Gang Niu and Gang Chen and Junbo Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EhYjZy6e1gJ}\n}", "github": "", "project": "", "reviewers": "fAwc;k7zr;7qAy", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;3;3", "correctness": "3;4;3", "technical_novelty": "3;4;3", "empirical_novelty": "3;4;4", "wc_summary_paper": "89;176;86", "wc_summary_review": "12;44;24", "wc_main_review": "168;251;163", "wc_review": "269;471;273", "wc_reply_reviewers": "0;58;637", "wc_reply_authors": "294;509;2249", "reply_reviewers": "0;1;3", "reply_authors": "1;1;5", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 117.0, 41.737273509418415 ], "wc_summary_review_avg": [ 26.666666666666668, 13.199326582148887 ], "wc_main_review_avg": [ 194.0, 40.3567425180312 ], "wc_review_avg": [ 337.6666666666667, 94.29504523332896 ], "wc_reply_reviewers_avg": [ 231.66666666666666, 287.5903722696958 ], "wc_reply_authors_avg": [ 1017.3333333333334, 875.3316831667614 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15337885614159171676&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=EhYjZy6e1gJ", "email": 
"zju.edu.cn;zju.edu.cn;cornell.edu;cqu.edu.cn;riken.jp;cs.zju.edu.cn;zju.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;2;3;0;0", "aff_unique_norm": "Zhejiang University;Cornell University;Chongqing University;RIKEN", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.cornell.edu;https://www.cqu.edu.cn;https://www.riken.jp", "aff_unique_abbr": "ZJU;Cornell;CQU;RIKEN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2;0;0", "aff_country_unique": "China;United States;Japan" }, { "id": "EhdacditHf9", "title": "The Number of Steps Needed for Nonconvex Optimization of a Deep Learning Optimizer is a Rational Function of Batch Size", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recently, convergence as well as convergence rate analyses of deep learning optimizers for nonconvex optimization have been widely studied. Meanwhile, numerical evaluations for the optimizers have precisely clarified the relationship between batch size and the number of steps needed for training deep neural networks. The main contribution of this paper is to show theoretically that the number of steps needed for nonconvex optimization of each of the optimizers can be expressed as a rational function of batch size. Having these rational functions leads to two particularly important facts, which were validated numerically in previous studies. The first fact is that there exists an optimal batch size such that the number of steps needed for nonconvex optimization is minimized. This implies that using larger batch sizes than the optimal batch size does not decrease the number of steps needed for nonconvex optimization. The second fact is that the optimal batch size depends on the optimizer. 
In particular, it is shown theoretically that momentum and Adam-type optimizers can exploit larger optimal batch sizes than the stochastic gradient descent optimizer and can thereby further reduce the minimum number of steps needed for nonconvex optimization.", "keywords": "Adam;deep learning optimizer;momentum;nonconvex optimization;optimal batch size;SGD", "primary_area": "", "supplementary_material": "", "author": "Hideaki Iiduka", "authorids": "~Hideaki_Iiduka1", "gender": "M", "homepage": "https://iiduka.net/en/", "dblp": "48/8221", "google_scholar": "https://scholar.google.co.jp/citations?user=jr5sK30AAAAJ", "orcid": "0000-0001-9173-6723", "linkedin": "", "or_profile": "~Hideaki_Iiduka1", "aff": "Meiji University", "aff_domain": "meiji.ac.jp", "position": "Full Professor", "bibtex": "@misc{\niiduka2022the,\ntitle={The Number of Steps Needed for Nonconvex Optimization of a Deep Learning Optimizer is a Rational Function of Batch Size},\nauthor={Hideaki Iiduka},\nyear={2022},\nurl={https://openreview.net/forum?id=EhdacditHf9}\n}", "github": "", "project": "", "reviewers": "jMjg;4PcN;fYVK;DwtJ;TxyV", "site": "https://openreview.net/forum?id=EhdacditHf9", "pdf_size": 0, "recommendation": "1;3;5;6;6", "confidence": "5;3;3;3;3", "correctness": "1;1;3;4;3", "technical_novelty": "4;2;2;3;4", "empirical_novelty": "4;0;2;2;2", "wc_summary_paper": "45;34;78;121;125", "wc_summary_review": "393;10;1;34;109", "wc_main_review": "362;245;395;339;1096", "wc_review": "800;289;474;494;1330", "wc_reply_reviewers": "1461;182;0;0;0", "wc_reply_authors": "4220;2570;260;239;1309", "reply_reviewers": "6;3;0;0;0", "reply_authors": "8;7;1;1;2", "recommendation_avg": [ 4.2, 1.9390719429665317 ], "confidence_avg": [ 3.4, 0.8000000000000002 ], "correctness_avg": [ 2.4, 1.2 ], "technical_novelty_avg": [ 3.0, 0.8944271909999159 ], "empirical_novelty_avg": [ 2.0, 1.2649110640673518 ], "wc_summary_paper_avg": [ 80.6, 37.547836156029014 ], "wc_summary_review_avg": [ 109.4, 146.79591274964028 ], "wc_main_review_avg": [ 487.4, 308.36251393449237 ], "wc_review_avg": [ 677.4, 365.1791888922478 ], "wc_reply_reviewers_avg": [ 328.6, 570.5708019168173 ], "wc_reply_authors_avg": [ 1719.6, 1514.3567083088449 ], "reply_reviewers_avg": [ 1.8, 2.4000000000000004 ], "reply_authors_avg": [ 3.8, 3.059411708155671 ], "replies_avg": [ 43, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.8251369970070346, "corr_recommendation_correctness": 0.9110887675286008, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9479484026798663616&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Meiji University", "aff_unique_dep": "", "aff_unique_url": "https://www.meiji.ac.jp", "aff_unique_abbr": "Meiji", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "id": "Ehhk6jyas6v", "title": "On The Quality Assurance Of Concept-Based Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent work on Explainable AI has focused on concept-based explanations, where deep learning models are explained in terms of high-level units of information, referred to as concepts. In parallel, the field of disentanglement learning has explored the related notion of finding underlying factors of variation in the data that have interpretability properties. Despite their overlapping purpose, the metrics to evaluate the quality of concepts and factors of variation in the two fields are not aligned, hindering a systematic comparison.
In this paper we consider factors of variation as concepts and thus unify the notations in concept and disentanglement learning. Next, we propose metrics for evaluating the quality of concept representations in both approaches, in the presence and in the absence of ground truth concept labels. Via our proposed metrics, we benchmark state-of-the-art methods from both families, and propose a set of guidelines to determine the impact that supervision may have on the quality of learnt concept representations.", "keywords": "Concept learning;Disentanglement learning;Explainability;Interpretability", "primary_area": "", "supplementary_material": "", "author": "Mateo Espinosa Zarlenga;Pietro Barbiero;Zohreh Shams;Dmitry Kazhdan;Umang Bhatt;Mateja Jamnik", "authorids": "~Mateo_Espinosa_Zarlenga1;~Pietro_Barbiero1;~Zohreh_Shams1;~Dmitry_Kazhdan1;~Umang_Bhatt1;~Mateja_Jamnik1", "gender": "M;M;;M;M;F", "homepage": "https://mateoespinosa.github.io/;http://www.pietrobarbiero.eu/;;;https://umangsbhatt.github.io;http://www.cl.cam.ac.uk/~mj201", "dblp": "307/3045.html;238/7860;;;207/7955;41/1392", "google_scholar": "4ikoEiMAAAAJ;https://scholar.google.it/citations?user=4gbToQoAAAAJ;;MSFAgbkAAAAJ;https://scholar.google.com/citations?hl=en;d5QiyJkAAAAJ", "orcid": ";0000-0003-3155-2564;;;;0000-0003-2772-2532", "linkedin": "mateoespinosa/;;;dmitry-kazhdan/;umangsbhatt/;", "or_profile": "~Mateo_Espinosa_Zarlenga1;~Pietro_Barbiero1;~Zohreh_Shams1;~Dmitry_Kazhdan1;~Umang_Bhatt1;~Mateja_Jamnik1", "aff": "University of Cambridge;University of Cambridge;;University of Cambridge;University of Cambridge;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk;;cam.ac.uk;cam.ac.uk;cam.ac.uk", "position": "PhD student;PhD student;;PhD student;PhD student;Professor in Artificial Intelligence", "bibtex": "@misc{\nzarlenga2022on,\ntitle={On The Quality Assurance Of Concept-Based Representations},\nauthor={Mateo Espinosa Zarlenga and Pietro Barbiero and Zohreh Shams and Dmitry Kazhdan and Umang Bhatt and Mateja Jamnik},\nyear={2022},\nurl={https://openreview.net/forum?id=Ehhk6jyas6v}\n}", "github": "", "project": "", "reviewers": "iQK1;yfkD;7LHy", "site": "https://openreview.net/forum?id=Ehhk6jyas6v", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;4", "correctness": "3;4;4", "technical_novelty": "4;2;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "46;109;88", "wc_summary_review": "44;89;45", "wc_main_review": "1079;1453;342", "wc_review": "1169;1651;475", "wc_reply_reviewers": "264;254;0", "wc_reply_authors": "2709;1817;892", "reply_reviewers": "1;1;0", "reply_authors": "5;3;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 81.0, 26.19160170741759 ], "wc_summary_review_avg": [ 59.333333333333336, 20.98147330914162 ], "wc_main_review_avg": [ 958.0, 461.5632856571964 ], "wc_review_avg": [ 1098.3333333333333, 482.69336942710214 ], "wc_reply_reviewers_avg": [ 172.66666666666666, 122.16200536809943 ], "wc_reply_authors_avg": [ 1806.0, 741.8279225444851 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=3939403035056636527&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "EhwEUb2ynIa", "title": "How to Adapt Your Large-Scale Vision-and-Language Model", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-training large-scale vision and language models (e.g. CLIP) has shown promising results in representation and transfer learning. We investigate the question of how to efficiently adapt these models to downstream tasks. For image classification, linear probes have been the standard for ease of use and efficiency, while for language, other approaches like prompt tuning have emerged. We analyze several fine-tuning methods across a diverse set of image classification tasks across two spectra investigating the amount and similarity of downstream data to that of pretraining one. We find that just tuning LayerNorm parameters is a surprisingly effective baseline across the board. We further demonstrate a simple yet effective strategy that combines LayerNorm-tuning with general fine-tuning methods to improve their performance and benchmark them on few-shot adaption and distribution shift tasks. Finally, we provide an empirical analysis and recommend general recipes for efficient transfer learning of vision and language models. Website at https://sites.google.com/view/adapt-large-scale-models", "keywords": "transfer learning;fine-tuning;layernorm;CLIP;prompt-tuning;adaptation;zero-shot;pretraining", "primary_area": "", "supplementary_material": "", "author": "Konwoo Kim;Michael Laskin;Igor Mordatch;Deepak Pathak", "authorids": "~Konwoo_Kim1;~Michael_Laskin1;~Igor_Mordatch4;~Deepak_Pathak1", "gender": ";M;;M", "homepage": ";http://mishalaskin.com;;https://www.cs.cmu.edu/~dpathak/", "dblp": ";;;155/9860", "google_scholar": ";DOGDnwsAAAAJ;;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ", "orcid": ";;;", "linkedin": ";mishalaskin;;pathak22/", "or_profile": "~Konwoo_Kim1;~Michael_Laskin1;~Igor_Mordatch4;~Deepak_Pathak1", "aff": ";Google DeepMind;;Carnegie Mellon University", "aff_domain": ";deepmind.com;;cmu.edu", "position": ";Researcher;;Assistant Professor", "bibtex": "@misc{\nkim2022how,\ntitle={How to Adapt Your Large-Scale Vision-and-Language Model},\nauthor={Konwoo Kim and Michael Laskin and Igor Mordatch and Deepak Pathak},\nyear={2022},\nurl={https://openreview.net/forum?id=EhwEUb2ynIa}\n}", "github": "", "project": "", "reviewers": "GtTP;Dgds;Vfao;aQ8A", "site": "https://openreview.net/forum?id=EhwEUb2ynIa", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "89;108;59;33", "wc_summary_review": "34;71;36;78", "wc_main_review": "178;345;212;373", "wc_review": "301;524;307;484", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "750;762;759;671", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], 
"wc_summary_paper_avg": [ 72.25, 28.612715704735194 ], "wc_summary_review_avg": [ 54.75, 19.917015338649513 ], "wc_main_review_avg": [ 277.0, 83.46556176052492 ], "wc_review_avg": [ 404.0, 101.017325246712 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 735.5, 37.5 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8187358490282038793&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Google;Carnegie Mellon University", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.cmu.edu", "aff_unique_abbr": "DeepMind;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Multi-Stage Episodic Control for Strategic Exploration in Text Games", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6725", "id": "Ek7PSN7Y77z", "poster": "", "openreview": "https://openreview.net/forum?id=Ek7PSN7Y77z", "slides": "https://iclr.cc/virtual/2022/poster/6725", "video": "https://iclr.cc/virtual/2022/poster/6725", "author_site": "Jens Tuyls, Shunyu Yao, Sham M Kakade, Karthik Narasimhan", "tldr": "", "abstract": "Text adventure games present unique challenges to reinforcement learning methods due to their combinatorially large action spaces and sparse rewards. The interplay of these two factors is particularly demanding because large action spaces require extensive exploration, while sparse rewards provide limited feedback. This work proposes to tackle the explore-vs-exploit dilemma using a multi-stage approach that explicitly disentangles these two strategies within each episode. Our algorithm, called eXploit-Then-eXplore (XTX), begins each episode using an exploitation policy that imitates a set of promising trajectories from the past, and then switches over to an exploration policy aimed at discovering novel actions that lead to unseen state spaces. This policy decomposition allows us to combine global decisions about which parts of the game space to return to with curiosity-based local exploration in that space, motivated by how a human may approach these games. Our method significantly outperforms prior approaches by 27% and 11% average normalized score over 12 games from the Jericho benchmark (Hausknecht et al., 2020) in both deterministic and stochastic settings, respectively. On the game of Zork1, in particular, XTX obtains a score of 103, more than a 2x improvement over prior methods, and pushes past several known bottlenecks in the game that have plagued previous state-of-the-art methods.", "keywords": "reinforcement learning;language understanding;text-based games", "primary_area": "", "supplementary_material": "/attachment/bf8204996411ab421764000863c72d3222ad3f14.zip", "author": "Jens Tuyls;Shunyu Yao;Sham M. 
Kakade;Karthik R Narasimhan", "authorids": "~Jens_Tuyls1;~Shunyu_Yao1;~Sham_M._Kakade1;~Karthik_R_Narasimhan1", "gender": "M;M;M;M", "homepage": "https://jens321.github.io;https://ysymyth.github.io;https://shamulent.github.io;http://www.karthiknarasimhan.com", "dblp": ";156/1038;s/SMKakade;147/0322", "google_scholar": "TPnedXMAAAAJ;qJBXk9cAAAAJ;https://scholar.google.com.tw/citations?user=wb-DKCIAAAAJ;euc0GX4AAAAJ", "orcid": ";;;", "linkedin": "jens-tuyls-144852128/;;;", "or_profile": "~Jens_Tuyls1;~Shunyu_Yao1;~Sham_M._Kakade1;~Karthik_R_Narasimhan1", "aff": "Princeton University;Princeton University;Harvard University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;harvard.edu;princeton.edu", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\ntuyls2022multistage,\ntitle={Multi-Stage Episodic Control for Strategic Exploration in Text Games},\nauthor={Jens Tuyls and Shunyu Yao and Sham M. Kakade and Karthik R Narasimhan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ek7PSN7Y77z}\n}", "github": "", "project": "", "reviewers": "Exgo;TAdH;PsKh;186e", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;4;3;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;0;4", "wc_summary_paper": "236;66;134;89", "wc_summary_review": "112;58;75;64", "wc_main_review": "890;265;571;242", "wc_review": "1238;389;780;395", "wc_reply_reviewers": "148;109;0;23", "wc_reply_authors": "1396;1057;329;905", "reply_reviewers": "1;1;0;1", "reply_authors": "4;3;1;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 131.25, 65.23563060168883 ], "wc_summary_review_avg": [ 77.25, 20.96872671384698 ], "wc_main_review_avg": [ 492.0, 263.94791152801344 ], "wc_review_avg": [ 700.5, 348.42108145173995 ], "wc_reply_reviewers_avg": [ 70.0, 60.65063890842371 ], "wc_reply_authors_avg": [ 921.75, 385.62895054702517 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10027236272852708486&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Ek7PSN7Y77z", "email": "princeton.edu;princeton.edu;harvard.edu;princeton.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Princeton University;Harvard University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.harvard.edu", "aff_unique_abbr": "Princeton;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "El9kZ2caYVy", "title": "Noise-Contrastive Variational Information Bottleneck Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "While deep neural networks for classification have shown impressive predictive performance, e.g. in image classification, they generally tend to be overconfident. 
We start from the observation that popular methods for reducing overconfidence by regularizing the distribution of outputs or intermediate variables achieve better calibration by sacrificing the separability of correct and incorrect predictions, another important facet of uncertainty estimation. To circumvent this, we propose a novel method that builds upon the distributional alignment of the variational information bottleneck and encourages assigning lower confidence to samples from the latent prior. Our experiments show that this simultaneously improves prediction accuracy and calibration compared to a multitude of output regularization methods without impacting the uncertainty-based separability in multiple classification settings, including under distributional shift.", "keywords": "uncertainty estimation;variational information bottleneck", "primary_area": "", "supplementary_material": "/attachment/fba8e507b829350c3c0ba19ee74f5eb40b3f1967.zip", "author": "Jannik Schmitt;Stefan Roth", "authorids": "~Jannik_Schmitt1;~Stefan_Roth1", "gender": ";M", "homepage": "https://www.visinf.tu-darmstadt.de/visinf/team_members/jschmitt/jschmitt.en.jsp;https://www.visinf.tu-darmstadt.de/visual_inference/people_vi/stefan_roth.en.jsp", "dblp": ";24/3452", "google_scholar": ";0yDoR0AAAAAJ", "orcid": ";0000-0001-9002-9832", "linkedin": ";stefanroth13", "or_profile": "~Jannik_Schmitt1;~Stefan_Roth1", "aff": "TU Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;Full Professor", "bibtex": "@misc{\nschmitt2022noisecontrastive,\ntitle={Noise-Contrastive Variational Information Bottleneck Networks},\nauthor={Jannik Schmitt and Stefan Roth},\nyear={2022},\nurl={https://openreview.net/forum?id=El9kZ2caYVy}\n}", "github": "", "project": "", "reviewers": "mHTV;DCEH;r8pu;dzx4", "site": "https://openreview.net/forum?id=El9kZ2caYVy", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;3;4", "correctness": "2;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "57;28;104;59", "wc_summary_review": "86;39;26;34", "wc_main_review": "472;383;157;291", "wc_review": "615;450;287;384", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 62.0, 27.175356483402386 ], "wc_summary_review_avg": [ 46.25, 23.41340428045439 ], "wc_main_review_avg": [ 325.75, 116.56623653528494 ], "wc_review_avg": [ 434.0, 119.50523001107524 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ah0PjIcWJq0J:scholar.google.com/&scioq=Noise-Contrastive+Variational+Information+Bottleneck+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0", "aff_campus_unique": 
"Darmstadt;", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "On Evaluation Metrics for Graph Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6661", "id": "EnwCZixjSh", "poster": "", "openreview": "https://openreview.net/forum?id=EnwCZixjSh", "slides": "https://iclr.cc/virtual/2022/poster/6661", "video": "https://iclr.cc/virtual/2022/poster/6661", "author_site": "Rylee Thompson, Boris Knyazev, Elahe Ghalebi, Jungtaek Kim, Graham W Taylor", "tldr": "", "abstract": "In image generation, generative models can be evaluated naturally by visually inspecting model outputs. However, this is not always the case for graph generative models (GGMs), making their evaluation challenging. Currently, the standard process for evaluating GGMs suffers from three critical limitations: i) it does not produce a single score which makes model selection challenging, ii) in many cases it fails to consider underlying edge and node features, and iii) it is prohibitively slow to perform. In this work, we mitigate these issues by searching for \\emph{scalar, domain-agnostic, and scalable metrics} for evaluating and ranking GGMs. To this end, we study existing GGM metrics and neural-network-based metrics emerging from generative models of images that use embeddings extracted from a task-specific network. Motivated by the power of Graph Neural Networks (GNNs) to extract meaningful graph representations \\emph{without any training}, we introduce several metrics based on the features extracted by an untrained random GNN. We design experiments to thoroughly test and objectively score metrics on their ability to measure the diversity and fidelity of generated graphs, as well as their sample and computational efficiency. Depending on the quantity of samples, we recommend one of two metrics from our collection of random-GNN-based metrics. We show these two metrics to be more expressive than pre-existing and alternative random-GNN-based metrics using our objective scoring. While we focus on applying these metrics to GGM evaluation, in practice this enables the ability to easily compute the dissimilarity between any two sets of graphs \\emph{regardless of domain}. Our code is released at: https://github.com/uoguelph-mlrg/GGM-metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rylee Thompson;Boris Knyazev;Elahe Ghalebi;Jungtaek Kim;Graham W. 
Taylor", "authorids": "~Rylee_Thompson1;~Boris_Knyazev1;~Elahe_Ghalebi1;~Jungtaek_Kim1;~Graham_W._Taylor1", "gender": "M;M;F;M;", "homepage": ";https://bknyaz.github.io/;;https://jungtaekkim.github.io;https://www.gwtaylor.ca", "dblp": ";181/5675-1;171/2142;31/3193-1;17/1633", "google_scholar": "https://scholar.google.ca/citations?user=pRy6BiAAAAAJ;https://scholar.google.ca/citations?user=Dp9VFB0AAAAJ;;KXNUYWgAAAAJ;https://scholar.google.ca/citations?user=PUeKU8kAAAAJ", "orcid": ";0000-0002-9484-1534;;0000-0002-1905-1399;", "linkedin": "rylee-thompson/;boris-knyazev-39690948/;;jungtaekkim;", "or_profile": "~Rylee_Thompson1;~Boris_Knyazev1;~Elahe_Ghalebi1;~Jungtaek_Kim1;~Graham_W_Taylor1", "aff": "University of Guelph;University of Guelph;Vector Institute for Artificial Intelligence;POSTECH;University of Guelph", "aff_domain": "uoguelph.ca;uoguelph.ca;vectorinstitute.ai;postech.ac.kr;uoguelph.ca", "position": "MS student;PhD student;Postdoc;PhD student;Full Professor", "bibtex": "@inproceedings{\nthompson2022on,\ntitle={On Evaluation Metrics for Graph Generative Models},\nauthor={Rylee Thompson and Boris Knyazev and Elahe Ghalebi and Jungtaek Kim and Graham W. Taylor},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EnwCZixjSh}\n}", "github": "", "project": "", "reviewers": "XmrB;UXK8;CNrk;Ru33", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "55;109;55;118", "wc_summary_review": "56;76;41;59", "wc_main_review": "228;298;260;444", "wc_review": "339;483;356;621", "wc_reply_reviewers": "21;734;0;279", "wc_reply_authors": "1075;2337;724;1323", "reply_reviewers": "1;3;0;1", "reply_authors": "2;6;1;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.25, 29.422567868899545 ], "wc_summary_review_avg": [ 58.0, 12.429802894656053 ], "wc_main_review_avg": [ 307.5, 82.61204512660366 ], "wc_review_avg": [ 449.75, 113.45345962111513 ], "wc_reply_reviewers_avg": [ 258.5, 295.69790327291804 ], "wc_reply_authors_avg": [ 1364.75, 600.3184050984944 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 3.0, 1.8708286933869707 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7803067086316285274&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=EnwCZixjSh", "email": "uoguelph.ca;uoguelph.ca;vectorinstitute.ai;postech.ac.kr;uoguelph.ca", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Guelph;Vector Institute for Artificial Intelligence;Pohang University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uoguelph.ca;https://vectorinstitute.ai/;https://www.postech.ac.kr", "aff_unique_abbr": "U of G;Vector Institute;POSTECH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Canada;South Korea" }, { "id": "Eot1M5o2Zy", "title": "AestheticNet: Reducing bias in facial data sets under ethical considerations", 
"track": "main", "status": "Reject", "tldr": "", "abstract": "Facial Beauty Prediction (FBP) aims to develop a machine that can automatically evaluate facial attractiveness. Usually, these results were highly correlated with human ratings, and therefore also reflected human bias in annotations. Everyone will have biases that are usually subconscious and not easy to notice. Unconscious bias deserves more attention than explicit discrimination. It affects moral judgement and can evade moral responsibility, and we cannot eliminate it completely. A new challenge for scientists is to provide training data and AI algorithms that can withstand distorted information. Our experiments prove that human aesthetic judgements are usually biased. In this work, we introduce AestheticNet, the most advanced attractiveness prediction network, with a Pearson correlation coefficient of 0.9601, which is significantly better than the competition. This network is then used to enrich the training data with synthetic images in order to overwrite the ground truth values with fair assessments.\nWe propose a new method to generate an unbiased CNN to improve the fairness of machine learning. Prediction and recommender systems based on Artificial Intelligence (AI) technology are widely used in various sectors of industry, such as intelligent recruitment, security, etc. Therefore, their fairness is very important. Our research provides a practical example of how to build a fair and trustable AI.", "keywords": "societal considerations of machine learning;fairness;safety;privacy;responsible AI;discrimination prevention;facial aesthetics;unconscious Bias", "primary_area": "", "supplementary_material": "", "author": "Michael Danner;Muhammad Awais Tanvir Rana;Thomas Weber;Tobias Gerlach;Patrik Huber;Matthias R\u00e4tsch;Josef Kittler", "authorids": "~Michael_Danner1;~Muhammad_Awais_Tanvir_Rana1;thomas.weber@reutlingen-university.de;tobias.gerlach@reutlingen-university.de;patrik.huber@york.ac.uk;matthias.raetsch@reutlingen-university.de;~Josef_Kittler1", "gender": "M;M;;;;;M", "homepage": "https://www.visir.org/;https://www.surrey.ac.uk/people/muhammad-awais;;;;;https://www.surrey.ac.uk/people/josef-kittler", "dblp": ";80/3639-1;;;;;k/JosefKittler.html", "google_scholar": ";X_oK7kQAAAAJ;;;;;https://scholar.google.co.uk/citations?user=pk-yb_kAAAAJ", "orcid": "0000-0002-8652-6905;0000-0002-1122-0709;;;;;0000-0002-8110-9205", "linkedin": ";;;;;;", "or_profile": "~Michael_Danner1;~Muhammad_Awais_Tanvir_Rana1;thomas.weber@reutlingen-university.de;tobias.gerlach@reutlingen-university.de;patrik.huber@york.ac.uk;matthias.raetsch@reutlingen-university.de;~Josef_Kittler1", "aff": "University of Surrey;University of Surrey;;;;;University of Surrey", "aff_domain": "surrey.ac.uk;surrey.ac.uk;;;;;surrey.ac.uk", "position": "PhD student;Associate Professor;;;;;Full Professor", "bibtex": "@misc{\ndanner2022aestheticnet,\ntitle={AestheticNet: Reducing bias in facial data sets under ethical considerations},\nauthor={Michael Danner and Muhammad Awais Tanvir Rana and Thomas Weber and Tobias Gerlach and Patrik Huber and Matthias R{\\\"a}tsch and Josef Kittler},\nyear={2022},\nurl={https://openreview.net/forum?id=Eot1M5o2Zy}\n}", "github": "", "project": "", "reviewers": "8hSi;V5EV;rA4x;txuX", "site": "https://openreview.net/forum?id=Eot1M5o2Zy", "pdf_size": 0, "recommendation": "1;1;1;6", "confidence": "4;5;5;4", "correctness": "3;2;1;3", "technical_novelty": "1;1;1;3", "empirical_novelty": "1;0;1;3", "wc_summary_paper": "57;14;14;46", 
"wc_summary_review": "16;21;63;43", "wc_main_review": "241;27;211;149", "wc_review": "314;62;288;238", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.25, 2.165063509461097 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 32.75, 19.149086140074676 ], "wc_summary_review_avg": [ 35.75, 18.726652130052504 ], "wc_main_review_avg": [ 157.0, 82.06095295571457 ], "wc_review_avg": [ 225.5, 98.2687641114917 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yFfEMml8uzgJ:scholar.google.com/&scioq=AestheticNet:+Reducing+bias+in+facial+data+sets+under+ethical+considerations&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Surrey", "aff_unique_dep": "", "aff_unique_url": "https://www.surrey.ac.uk", "aff_unique_abbr": "Surrey", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "ErX-xMSek2", "title": "A Study on Representation Transfer for Few-Shot Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Few-shot classification aims to learn to classify new object categories well using only a few labeled examples. Transfering feature representations from other models is a popular approach for solving few-shot classification problems.In this work we perform a systematic study of various feature representations for few-shot classification, including representations learned from MAML, supervised classification, and several common self-supervised tasks. We find that learning from more complex tasks tend to give better representations for few-shot classification, and thus we propose the use of representations learned from multiple tasks for few-shot classification. Coupled with new tricks on feature selection and voting to handle the issue of small sample size, our direct transfer learning method offers performance comparable to state-of-art on several benchmark datasets. 
\n", "keywords": "few-shot learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Chun-Nam Yu;Yi Xie", "authorids": "~Chun-Nam_Yu1;~Yi_Xie3", "gender": "M;", "homepage": ";", "dblp": "http://dblp.uni-trier.de/pers/hd/y/Yu:Chun=Nam;", "google_scholar": "urpJRigAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Chun-Nam_Yu1;~Yi_Xie3", "aff": "Department of Computer Science;", "aff_domain": "cs.cornell.edu;", "position": "Researcher;", "bibtex": "@misc{\nyu2022a,\ntitle={A Study on Representation Transfer for Few-Shot Learning},\nauthor={Chun-Nam Yu and Yi Xie},\nyear={2022},\nurl={https://openreview.net/forum?id=ErX-xMSek2}\n}", "github": "", "project": "", "reviewers": "G5x6;EEwV;28ox;Tk8a;63j7", "site": "https://openreview.net/forum?id=ErX-xMSek2", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "5;4;5;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "1;1;3;2;2", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "45;99;123;217;99", "wc_summary_review": "29;30;116;83;68", "wc_main_review": "236;159;606;1118;392", "wc_review": "310;288;845;1418;559", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.8, 0.7483314773547883 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 116.6, 56.33329388558777 ], "wc_summary_review_avg": [ 65.2, 33.02968361943542 ], "wc_main_review_avg": [ 502.2, 343.73734158511206 ], "wc_review_avg": [ 684.0, 418.8063991870229 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.2721655269759087, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2466417482006702094&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Unknown Institution", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "", "aff_unique_abbr": "" }, { "id": "ErsRrojuPzw", "title": "Fast and Efficient Once-For-All Networks for Diverse Hardware Deployment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional neural networks are widely used in practical application in many diverse environments. Each different environment requires a different optimized network to maximize accuracy under its unique hardware constraints and latency requirements. To find models for this varied array of potential deployment targets, once-for-all (OFA) was introduced as a way to simultaneously co-train many models at once, while keeping the total training cost constant. However, the total training cost is very high, requiring up to 1200 GPU-hours. Compound OFA (compOFA) decreased the training cost of OFA by 2$\\times$ by coupling model dimensions to reduce the search space of possible models by orders of magnitude, while also simplifying the training procedure.\n\nIn this work, we continue the effort to reduce the training cost of OFA methods. While both OFA and compOFA use a pre-trained teacher network, we propose an in-place knowledge distillation procedure to train the super-network simultaneously with the sub-networks. 
Within this in-place distillation framework, we develop an upper-attentive sample technique that reduces the training cost per epoch while maintaining accuracy. Through experiments on ImageNet, we demonstrate that we can achieve a $2\\times$ - $3\\times$ ($1.5\\times$ - $1.8\\times$) reduction in training time compared to the state-of-the-art OFA and compOFA, respectively, without loss of optimality.", "keywords": "neural architecture search;computer vision;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Jun Fang;Li Yang;Chengyao Shen;Hamzah Abdel-Aziz;David Thorsley;Joseph Hassoun", "authorids": "~Jun_Fang2;~Li_Yang6;~Chengyao_Shen1;~Hamzah_Abdel-Aziz1;~David_Thorsley1;~Joseph_Hassoun1", "gender": ";M;M;;M;M", "homepage": ";https://lyang-666.github.io/;;;https://www.linkedin.com/in/joseph-hassoun/;", "dblp": "55/2632-4;;87/10460;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;qpUT1I8AAAAJ;;-yDXn_IAAAAJ;https://scholar.google.com/citations?hl=en;En4wZacAAAAJ", "orcid": ";0000-0002-2839-6196;;0000-0002-8991-0451;;", "linkedin": "jun-fang-12946085;li-yang-268710139/;;habdelaziz/;joseph-hassoun/;david-thorsley/", "or_profile": "~Jun_Fang2;~Li_Yang6;~Chengyao_Shen1;~Hamzah_Abdel-Aziz1;~Joseph_Hassoun1;~David_P_Thorsley1", "aff": "Amazon;Arizona State University;;Samsung Semiconductor, Inc.;;Samsung", "aff_domain": "amazon.com;asu.edu;;samsung.com;;samsung.com", "position": "Senior Applied Scientist;PhD student;;Machine Learning Engineer;;Deep Learning Engineer", "bibtex": "@misc{\nfang2022fast,\ntitle={Fast and Efficient Once-For-All Networks for Diverse Hardware Deployment},\nauthor={Jun Fang and Li Yang and Chengyao Shen and Hamzah Abdel-Aziz and David Thorsley and Joseph Hassoun},\nyear={2022},\nurl={https://openreview.net/forum?id=ErsRrojuPzw}\n}", "github": "", "project": "", "reviewers": "vCfG;cquh;vTmv;RKoZ", "site": "https://openreview.net/forum?id=ErsRrojuPzw", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;2", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "50;48;57;106", "wc_summary_review": "121;43;52;111", "wc_main_review": "159;209;143;34", "wc_review": "330;300;252;251", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "350;357;245;256", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.25, 23.763154251908563 ], "wc_summary_review_avg": [ 81.75, 34.57871455100666 ], "wc_main_review_avg": [ 136.25, 63.85677332906823 ], "wc_review_avg": [ 283.25, 33.47667098144617 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 302.0, 51.70589908318006 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WjI-H-SgPxkJ:scholar.google.com/&scioq=Fast+and+Efficient+Once-For-All+Networks+for+Diverse+Hardware+Deployment&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Amazon;Arizona State University;Samsung", "aff_unique_dep": "Amazon.com, Inc.;;Samsung
Semiconductor, Inc.", "aff_unique_url": "https://www.amazon.com;https://www.asu.edu;https://www.samsung.com/us/business semiconductors/", "aff_unique_abbr": "Amazon;ASU;SSI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;South Korea" }, { "id": "Esk2g83ELUt", "title": "UniNet: Unified Architecture Search with Convolution, Transformer, and MLP", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recently, transformer and multi-layer perceptron (MLP) architectures have achieved impressive results on various vision tasks. A few works investigated manually combining those operators to design visual network architectures, and can achieve satisfactory performances to some extent. In this paper, we propose to jointly search the optimal combination of convolution, transformer, and MLP for building a series of all-operator network architectures with high performances on visual tasks. We empirically identify that the widely-used strided convolution or pooling based down-sampling modules become the performance bottlenecks when the operators are combined to form a network. To better tackle the global context captured by the transformer and MLP operators, we propose two novel context-aware down-sampling modules, which can better adapt to the global information encoded by transformer and MLP operators. To this end, we jointly search all operators and down-sampling modules in a unified search space. Notably, Our searched network UniNet (Unified Network) outperforms state-of-the-art pure convolution-based architecture, EfficientNet, and pure transformer-based architecture, Swin-Transformer, on multiple public visual benchmarks, ImageNet classification, COCO object detection, and ADE20K semantic segmentation. 
", "keywords": "Neural architecture search;transformer", "primary_area": "", "supplementary_material": "", "author": "Jihao Liu;Hongsheng Li;Yu Liu;Guanglu Song", "authorids": "~Jihao_Liu3;~Hongsheng_Li3;~Yu_Liu2;~Guanglu_Song2", "gender": "M;M;M;M", "homepage": "http://www.ee.cuhk.edu.hk/~hsli;http://liuyu.us;;https://jihaonew.github.io/", "dblp": "27/7402-1;97/2274-15;207/4745;167/0509", "google_scholar": "BN2Ze-QAAAAJ;;Bd3v08QAAAAJ;PP1HyToAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hongsheng_Li3;~Yu_Liu2;~Guanglu_Song2;~Jihao_Liu4", "aff": "The Chinese University of Hong Kong;SenseTime;Sensetime;The Chinese University of Hong Kong", "aff_domain": "cuhk.edu.hk;sensetime.com;sensetime.com;cuhk.edu.hk", "position": "Assistant Professor;Principal Researcher;Computer Vision Researcher;PhD student", "bibtex": "@misc{\nliu2022uninet,\ntitle={UniNet: Unified Architecture Search with Convolution, Transformer, and {MLP}},\nauthor={Jihao Liu and Hongsheng Li and Yu Liu and Guanglu Song},\nyear={2022},\nurl={https://openreview.net/forum?id=Esk2g83ELUt}\n}", "github": "", "project": "", "reviewers": "K5pj;8dk7;VTjT", "site": "https://openreview.net/forum?id=Esk2g83ELUt", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "5;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "49;65;57", "wc_summary_review": "14;35;20", "wc_main_review": "336;274;204", "wc_review": "399;374;281", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 57.0, 6.531972647421808 ], "wc_summary_review_avg": [ 23.0, 8.831760866327848 ], "wc_main_review_avg": [ 271.3333333333333, 53.92175401037652 ], "wc_review_avg": [ 351.3333333333333, 50.76963221804503 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1584513298045969430&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Chinese University of Hong Kong;SenseTime", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com", "aff_unique_abbr": "CUHK;SenseTime", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Resolving Training Biases via Influence-based Data Relabeling", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6491", "id": "EskfH0bwNVn", "poster": "", "openreview": "https://openreview.net/forum?id=EskfH0bwNVn", "slides": "https://iclr.cc/virtual/2022/poster/6491", "video": "https://iclr.cc/virtual/2022/poster/6491", "author_site": "Shuming Kong, Yanyan Shen, Linpeng Huang", "tldr": "", "abstract": "The performance of supervised learning methods easily suffers from the training bias issue caused by train-test distribution mismatch or label noise. 
The influence function is a technique that estimates the impact of a training sample on the model\u2019s predictions. Recent studies on \\emph{data resampling} have employed influence functions to identify \\emph{harmful} training samples that will degrade the model's test performance. They have shown that discarding or downweighting the identified harmful training samples is an effective way to resolve training biases. In this work, we move one step forward and propose an influence-based relabeling framework named RDIA for reusing harmful training samples toward better model performance. To achieve this, we use influence functions to estimate how relabeling a training sample would affect the model's test performance and further develop a novel relabeling function R. We theoretically prove that applying R to relabel harmful training samples allows the model to achieve lower test loss than simply discarding them for any classification task using cross-entropy loss. Extensive experiments on ten real-world datasets demonstrate that RDIA outperforms the state-of-the-art data resampling methods and improves the model's robustness against label noise. ", "keywords": "Training bias;influence functions;data relabeling", "primary_area": "", "supplementary_material": "/attachment/7ef8b8e51f37c9823d3d6cd707735960abd6a147.zip", "author": "Shuming Kong;Yanyan Shen;Linpeng Huang", "authorids": "~Shuming_Kong1;~Yanyan_Shen1;~Linpeng_Huang1", "gender": ";;M", "homepage": ";;http://www.cs.sjtu.edu.cn/en/PeopleDetail.aspx?id=166", "dblp": ";;", "google_scholar": ";;https://scholar.google.com.tw/citations?user=IC620Q4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Shuming_Kong1;~Yanyan_Shen1;~Linpeng_Huang1", "aff": ";;, Shanghai Jiaotong University", "aff_domain": ";;cs.sjtu.edu.cn", "position": ";;Full Professor", "bibtex": "@inproceedings{\nkong2022resolving,\ntitle={Resolving Training Biases via Influence-based Data Relabeling},\nauthor={Shuming Kong and Yanyan Shen and Linpeng Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EskfH0bwNVn}\n}", "github": "", "project": "", "reviewers": "FfyB;136B;UYUF;XS8o", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;5;4;3", "correctness": "4;4;4;4", "technical_novelty": "2;4;3;2", "empirical_novelty": "3;4;3;2", "wc_summary_paper": "86;36;59;158", "wc_summary_review": "255;51;9;79", "wc_main_review": "1458;323;121;562", "wc_review": "1799;410;189;799", "wc_reply_reviewers": "2214;0;0;93", "wc_reply_authors": "4089;346;315;760", "reply_reviewers": "6;0;0;1", "reply_authors": "9;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 84.75, 45.84416538666616 ], "wc_summary_review_avg": [ 98.5, 93.72699717797428 ], "wc_main_review_avg": [ 616.0, 510.5766347963839 ], "wc_review_avg": [ 799.25, 617.1346591304041 ], "wc_reply_reviewers_avg": [ 576.75, 946.0289041567388 ], "wc_reply_authors_avg": [ 1377.5, 1575.3124293294966 ], "reply_reviewers_avg": [ 1.75, 2.48746859276655 ], "reply_authors_avg": [ 3.0, 3.4641016151377544 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 74, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=17665360430091557526&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=EskfH0bwNVn", "email": ";;cs.sjtu.edu.cn", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "Ew4hVmrrqJE", "title": "Sample and Communication-Efficient Decentralized Actor-Critic Algorithms with Finite-Time Analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Actor-critic (AC) algorithms have been widely adopted in decentralized multi-agent systems to learn the optimal joint control policy. However, existing decentralized AC algorithms either do not preserve the privacy of agents or are not sample and communication-efficient. In this work, we develop two decentralized AC and natural AC (NAC) algorithms that are private, and sample and communication-efficient. In both algorithms, agents share noisy information to preserve privacy and adopt mini-batch updates to improve sample and communication efficiency. Particularly for decentralized NAC, we develop a decentralized Markovian SGD algorithm with an adaptive mini-batch size to efficiently compute the natural policy gradient. Under Markovian sampling and linear function approximation, we prove the proposed decentralized AC and NAC algorithms achieve the state-of-the-art sample complexities $\\mathcal{O}(\\epsilon^{-2}\\ln\\epsilon^{-1})$ and $\\mathcal{O}(\\epsilon^{-3}\\ln\\epsilon^{-1})$, respectively, and the same small communication complexity $\\mathcal{O}(\\epsilon^{-1}\\ln\\epsilon^{-1})$. Numerical experiments demonstrate that the proposed algorithms achieve lower sample and communication complexities than the existing decentralized AC algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/6da75de964d376cffb0380162d63ae22950853a9.zip", "author": "Ziyi Chen;Yi Zhou;Rong-Rong Chen;Shaofeng Zou", "authorids": "~Ziyi_Chen2;~Yi_Zhou2;~Rong-Rong_Chen1;~Shaofeng_Zou1", "gender": "M;M;;", "homepage": ";https://sites.google.com/site/yizhouhomepage/home;;", "dblp": "37/1439-2;;;", "google_scholar": "zjSBVOIAAAAJ;4fK8bYIAAAAJ;G2pEqUQAAAAJ;", "orcid": ";;;", "linkedin": "ziyi-chen-84616184/;;;", "or_profile": "~Ziyi_Chen2;~Yi_Zhou2;~Rong-Rong_Chen1;~Shaofeng_Zou1", "aff": "University of Utah;University of Utah;University of Utah;", "aff_domain": "utah.edu;utah.edu;utah.edu;", "position": "PhD student;Assistant Professor;Associate Professor;", "bibtex": "@misc{\nchen2022sample,\ntitle={Sample and Communication-Efficient Decentralized Actor-Critic Algorithms with Finite-Time Analysis},\nauthor={Ziyi Chen and Yi Zhou and Rong-Rong Chen and Shaofeng Zou},\nyear={2022},\nurl={https://openreview.net/forum?id=Ew4hVmrrqJE}\n}", "github": "", "project": "", "reviewers": "8K9V;hsKp;RWPJ;ZhdW", "site": "https://openreview.net/forum?id=Ew4hVmrrqJE", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;5;3;4", "correctness": "3;2;3;4", "technical_novelty": "3;2;2;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "160;23;42;75", "wc_summary_review": "50;26;61;68", "wc_main_review": "995;403;208;280", "wc_review": "1205;452;311;423", "wc_reply_reviewers": "0;0;0;174", "wc_reply_authors": "2261;915;449;245", "reply_reviewers": "0;0;0;1", "reply_authors": "4;2;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 
4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 75.0, 52.48333068699051 ], "wc_summary_review_avg": [ 51.25, 15.927570436196476 ], "wc_main_review_avg": [ 471.5, 310.18099554937277 ], "wc_review_avg": [ 597.75, 354.5274143137594 ], "wc_reply_reviewers_avg": [ 43.5, 75.34421012924616 ], "wc_reply_authors_avg": [ 967.5, 785.2940532055493 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.28867513459481287, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7961220597958783817&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "Utah", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "How Well Does Self-Supervised Pre-Training Perform with Streaming Data?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5967", "id": "EwqEx5ipbOu", "poster": "", "openreview": "https://openreview.net/forum?id=EwqEx5ipbOu", "slides": "https://iclr.cc/virtual/2022/poster/5967", "video": "https://iclr.cc/virtual/2022/poster/5967", "author_site": "Dapeng Hu, Shipeng Yan, Qizhengqiu Lu, Lanqing HONG, Hailin Hu, Yifan Zhang, Zhenguo Li, Xinchao Wang, Jiashi Feng", "tldr": "", "abstract": "Prior works on self-supervised pre-training focus on the joint training scenario, where massive unlabeled data are assumed to be given as input all at once, and only then is a learner trained. Unfortunately, such a problem setting is often impractical if not infeasible since many real-world tasks rely on sequential learning, e.g., data are decentralized or collected in a streaming fashion. In this paper, we conduct the first thorough and dedicated investigation on self-supervised pre-training with streaming data, aiming to shed light on the model behavior under this overlooked setup. Specifically, we pre-train over 500 models on four categories of pre-training streaming data from ImageNet and DomainNet and evaluate them on three types of downstream tasks and 12 different downstream datasets. Our studies show that, somewhat beyond our expectations, with simple data replay or parameter regularization, sequential self-supervised pre-training turns out to be an efficient alternative to joint pre-training, as the performances of the former are mostly on par with those of the latter. Moreover, catastrophic forgetting, a common issue in sequential supervised learning, is much alleviated in sequential self-supervised learning (SSL), which is well justified through our comprehensive empirical analysis of representations and the sharpness of minima in the loss landscape. Our findings, therefore, suggest that, in practice, for SSL, the cumbersome joint training can be replaced mainly by sequential learning, which in turn enables a much broader spectrum of potential application scenarios.
", "keywords": "Pre-Training;Representation Learning;Continual Learning;Self-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Dapeng Hu;Shipeng Yan;Qizhengqiu Lu;Lanqing HONG;Hailin Hu;Yifan Zhang;Zhenguo Li;Xinchao Wang;Jiashi Feng", "authorids": "~Dapeng_Hu2;~Shipeng_Yan1;~Qizhengqiu_Lu1;~Lanqing_HONG1;~Hailin_Hu1;~Yifan_Zhang1;~Zhenguo_Li1;~Xinchao_Wang1;~Jiashi_Feng1", "gender": "M;M;;F;;M;M;M;M", "homepage": "https://lhxxhb.github.io/;;;https://racheltechie.github.io/;;https://sites.google.com/view/yifan-zhang/%E9%A6%96%E9%A1%B5;http://www.ee.columbia.edu/~zgli/;https://sites.google.com/site/jshfeng/;https://sites.google.com/site/sitexinchaowang/", "dblp": "247/3382;63/9201;291/2916.html;226/4258;209/7188-2;57/4707-4;23/6479;56/8278;", "google_scholar": "wv9HjA0AAAAJ;oYILsyoAAAAJ;;https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ;rvYUgBwAAAAJ;https://scholar.google.com.hk/citations?user=zuYIUJEAAAAJ;XboZC1AAAAAJ;https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ;https://scholar.google.com.tw/citations?user=w69Buq0AAAAJ", "orcid": ";;;;;;;0000-0001-6843-0064;", "linkedin": ";;;;;;;;", "or_profile": "~Dapeng_Hu2;~Shipeng_Yan1;~Qizhengqiu_Lu1;~Lanqing_HONG1;~Hailin_Hu1;~Yifan_Zhang1;~Zhenguo_Li1;~Jiashi_Feng2;~Xinchao_WANG3", "aff": "National University of Singapore;ShanghaiTech University;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Noah's Ark Lab;National University of Singapore;Huawei Noah's Ark Lab;ByteDance;National University of Singapore", "aff_domain": "u.nus.edu;shanghaitech.edu.cn;huawei.com;huawei.com;huawei.com;nus.edu;huawei.com;bytedance.com;nus.edu", "position": "PhD student;PhD student;Researcher;Researcher;Researcher;PhD student;Principal Researcher;Research Lead;Assistant Professor", "bibtex": "@inproceedings{\nhu2022how,\ntitle={How Well Does Self-Supervised Pre-Training Perform with Streaming Data?},\nauthor={Dapeng Hu and Shipeng Yan and Qizhengqiu Lu and Lanqing HONG and Hailin Hu and Yifan Zhang and Zhenguo Li and Xinchao Wang and Jiashi Feng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=EwqEx5ipbOu}\n}", "github": "", "project": "", "reviewers": "QZfY;PFFi;4UNG;RJbi", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "4;3;3;4", "technical_novelty": "2;1;2;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "75;146;440;99", "wc_summary_review": "95;51;113;63", "wc_main_review": "208;791;810;186", "wc_review": "378;988;1363;348", "wc_reply_reviewers": "0;103;490;0", "wc_reply_authors": "1374;2332;2795;1234", "reply_reviewers": "0;1;1;0", "reply_authors": "3;4;5;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 190.0, 146.57933005714006 ], "wc_summary_review_avg": [ 80.5, 24.713356712514795 ], "wc_main_review_avg": [ 498.75, 301.9249699842661 ], "wc_review_avg": [ 769.25, 427.46893161959736 ], "wc_reply_reviewers_avg": [ 148.25, 201.74039630178186 ], "wc_reply_authors_avg": [ 1933.75, 652.5574208450931 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 39, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=527344378780744916&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=EwqEx5ipbOu", "email": "u.nus.edu;shanghaitech.edu.cn;huawei.com;huawei.com;huawei.com;nus.edu;huawei.com;bytedance.com;nus.edu", "author_num": 9, "aff_unique_index": "0;1;2;2;2;0;2;3;0", "aff_unique_norm": "National University of Singapore;ShanghaiTech University;Huawei;ByteDance", "aff_unique_dep": ";;Huawei Technologies;", "aff_unique_url": "https://www.nus.edu.sg;https://www.shanghaitech.edu.cn;https://www.huawei.com;https://www.bytedance.com", "aff_unique_abbr": "NUS;ShanghaiTech;Huawei;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;1;1;0", "aff_country_unique": "Singapore;China" }, { "id": "ExJ4lMbZcqa", "title": "Learning Audio-Visual Dereverberation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reverberation from audio reflecting off surfaces and objects in the environment not only degrades the quality of speech for human perception, but also severely impacts the accuracy of automatic speech recognition. Prior work attempts to remove reverberation based on the audio modality only. Our idea is to learn to dereverberate speech from audio-visual observations. The visual environment surrounding a human speaker reveals important cues about the room geometry, materials, and speaker location, all of which influence the precise reverberation effects in the audio stream. We introduce Visually-Informed Dereverberation of Audio (VIDA), an end-to-end approach that learns to remove reverberation based on both the observed sounds and visual scene. In support of this new task, we develop a large-scale dataset that uses realistic acoustic renderings of speech in real-world 3D scans of homes offering a variety of room acoustics. 
Demonstrating our approach on both simulated and real imagery for speech enhancement, speech recognition, and speaker identification, we show it achieves state-of-the-art performance and substantially improves over traditional audio-only methods.", "keywords": "speech enhancement;audio-visual learning;speech dereverberation;room acoustics", "primary_area": "", "supplementary_material": "/attachment/b4efa1e8f5a4fdb27e915e2c417bb313345a99d9.zip", "author": "Changan Chen;Wei Sun;David Harwath;Kristen Grauman", "authorids": "~Changan_Chen2;~Wei_Sun10;~David_Harwath1;~Kristen_Grauman1", "gender": ";M;M;F", "homepage": ";https://www.cs.utexas.edu/~weisun/;https://www.cs.utexas.edu/~harwath/index.html;http://www.cs.utexas.edu/~grauman/", "dblp": ";09/5042.html;;57/4553", "google_scholar": ";odTy4-YAAAAJ;C0kDOzcAAAAJ;Jp6Mz1sAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Changan_Chen2;~Wei_Sun10;~David_Harwath1;~Kristen_Grauman1", "aff": ";University of Texas, Austin;University of Texas, Austin;University of Texas, Austin", "aff_domain": ";utexas.edu;utexas.edu;utexas.edu", "position": ";PhD student;Assistant Professor;Professor", "bibtex": "@misc{\nchen2022learning,\ntitle={Learning Audio-Visual Dereverberation},\nauthor={Changan Chen and Wei Sun and David Harwath and Kristen Grauman},\nyear={2022},\nurl={https://openreview.net/forum?id=ExJ4lMbZcqa}\n}", "github": "", "project": "", "reviewers": "NGNE;NeeT;Tgq4;j5yj", "site": "https://openreview.net/forum?id=ExJ4lMbZcqa", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;5;5;4", "correctness": "4;3;4;4", "technical_novelty": "2;2;4;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "75;153;122;103", "wc_summary_review": "27;50;117;21", "wc_main_review": "105;335;686;480", "wc_review": "207;538;925;604", "wc_reply_reviewers": "0;49;0;0", "wc_reply_authors": "246;714;812;399", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 1.0 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 113.25, 28.39344114403888 ], "wc_summary_review_avg": [ 53.75, 38.08789177678386 ], "wc_main_review_avg": [ 401.5, 211.80002360717526 ], "wc_review_avg": [ 568.5, 254.93381494027034 ], "wc_reply_reviewers_avg": [ 12.25, 21.21762239271875 ], "wc_reply_authors_avg": [ 542.75, 229.42686743273987 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.14002800840280097, "corr_recommendation_correctness": -0.08084520834544431, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1788932046361186832&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "F0v5uBM-q5K", "title": "Beyond Quantization: Power aware neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Power consumption is a major obstacle in the deployment of deep neural networks (DNNs) on end devices. 
Existing approaches for reducing power consumption rely on quite general principles, including avoidance of multiplication operations and aggressive quantization of weights and activations. However, these methods do not take into account the precise power consumed by each module in the network, and are therefore far from optimal. In this paper, we develop accurate power consumption models for all arithmetic operations in the DNN, under various working conditions. Surprisingly, we reveal several important factors that have been overlooked to date. Based on our analysis, we present PANN (power-aware neural network), a simple approach for approximating any full-precision network by a low-power fixed-precision variant. Our method can be applied to a pre-trained network, and can also be used during training to achieve improved performance. In contrast to previous approaches, our method incurs only a minor degradation in accuracy w.r.t. the full-precision version of the network, even when working at the power-budget of a 2-bit quantized variant. In addition, our scheme makes it possible to seamlessly traverse the power-accuracy tradeoff at deployment time, which is a major advantage over existing quantization methods that are constrained to specific bit widths.", "keywords": "Deep neural networks;weight quantization;model compression;power-accuracy tradeoff;power consumption", "primary_area": "", "supplementary_material": "", "author": "Nurit Spingarn;Elad Hoffer;Ron Banner;Hilla Ben Yaacov;Tomer Michaeli", "authorids": "~Nurit_Spingarn1;~Elad_Hoffer1;~Ron_Banner1;hilla.ben.yaacov@gmail.com;~Tomer_Michaeli1", "gender": "F;M;M;;M", "homepage": "https://www.linkedin.com/in/nurit-spingarn-154958a7/;http://www.deeplearning.co.il;;;https://tomer.net.technion.ac.il/", "dblp": ";156/0135;03/5857;;70/3188.html", "google_scholar": ";https://scholar.google.co.il/citations?user=iEfTH7AAAAAJ;;;n2EbR2cAAAAJ", "orcid": ";;;;", "linkedin": "nurit-spingarn-154958a7/;;https://il.linkedin.com/in/ron-banner-69403a51;;", "or_profile": "~Nurit_Spingarn1;~Elad_Hoffer1;~Ron_Banner1;hilla.ben.yaacov@gmail.com;~Tomer_Michaeli1", "aff": "Technion, Technion;Habana Labs (Intel);Intel;;Technion, Technion", "aff_domain": "technion.ac.il;habana.ai;intel.com;;technion.ac.il", "position": "PhD student;Researcher;Researcher;;Associate Professor", "bibtex": "@misc{\nspingarn2022beyond,\ntitle={Beyond Quantization: Power aware neural networks},\nauthor={Nurit Spingarn and Elad Hoffer and Ron Banner and Hilla Ben Yaacov and Tomer Michaeli},\nyear={2022},\nurl={https://openreview.net/forum?id=F0v5uBM-q5K}\n}", "github": "", "project": "", "reviewers": "GEgt;NEeA;CvJm;yuqG", "site": "https://openreview.net/forum?id=F0v5uBM-q5K", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;4;2", "correctness": "1;2;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "1;0;3;3", "wc_summary_paper": "23;111;180;62", "wc_summary_review": "27;26;57;50", "wc_main_review": "94;288;666;274", "wc_review": "144;425;903;386", "wc_reply_reviewers": "168;270;0;0", "wc_reply_authors": "1330;1505;2123;575", "reply_reviewers": "2;3;0;0", "reply_authors": "4;4;3;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 94.0, 58.63019699779287 ], "wc_summary_review_avg": [ 40.0, 13.729530217745982 ], "wc_main_review_avg": [ 330.5,
208.26125419770236 ], "wc_review_avg": [ 464.5, 275.1022537166862 ], "wc_reply_reviewers_avg": [ 109.5, 115.2855151352502 ], "wc_reply_authors_avg": [ 1383.25, 551.850693122696 ], "reply_reviewers_avg": [ 1.25, 1.299038105676658 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:b7Y58ADRJcAJ:scholar.google.com/&scioq=Beyond+Quantization:+Power+aware+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Technion - Israel Institute of Technology;Habana Labs;Intel", "aff_unique_dep": ";;Intel Corporation", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.habana.ai;https://www.intel.com", "aff_unique_abbr": "Technion;Habana Labs;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Israel;United States" }, { "id": "F1Z3QH-VjZE", "title": "A Fair Generative Model Using Total Variation Distance", "track": "main", "status": "Reject", "tldr": "", "abstract": "We explore a fairness-related challenge that arises in generative models. The challenge is that biased training data with imbalanced representations of demographic groups may yield a high asymmetry in size of generated samples across distinct groups. We focus on practically-relevant scenarios wherein demographic labels are not available and therefore the design of a fair generative model is particularly challenging. In this paper, we propose an optimization framework that regulates such unfairness by employing one prominent statistical notion, total variation distance (TVD). We quantify the degree of unfairness via the TVD between the generated samples and balanced-yet-small reference samples. We take a variational optimization approach to faithfully implement the TVD-based measure. 
Experiments on benchmark real datasets demonstrate that the proposed framework can significantly improve the fairness performance while maintaining realistic sample quality for a wide range of reference set sizes, all the way down to 1% of the training set.", "keywords": "trustworthy AI;fairness;generative model;total variation distance", "primary_area": "", "supplementary_material": "/attachment/60b571df99ad99aba1288f287b4cae520d685e42.zip", "author": "Soobin Um;Changho Suh", "authorids": "~Soobin_Um1;~Changho_Suh1", "gender": ";M", "homepage": "https://sites.google.com/view/soobinum;https://csuh.kaist.ac.kr", "dblp": "339/0076;75/1420", "google_scholar": ";https://scholar.google.com.tw/citations?user=B1guGw8AAAAJ", "orcid": ";0000-0002-3101-4291", "linkedin": ";changho-suh-584aa732/?originalSubdomain=kr", "or_profile": "~Soobin_Um1;~Changho_Suh1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Associate Professor", "bibtex": "@misc{\num2022a,\ntitle={A Fair Generative Model Using Total Variation Distance},\nauthor={Soobin Um and Changho Suh},\nyear={2022},\nurl={https://openreview.net/forum?id=F1Z3QH-VjZE}\n}", "github": "", "project": "", "reviewers": "P5wy;VNCK;E8Px;4j8d", "site": "https://openreview.net/forum?id=F1Z3QH-VjZE", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;5;4;3", "correctness": "4;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "26;88;52;159", "wc_summary_review": "11;45;50;93", "wc_main_review": "333;259;323;782", "wc_review": "370;392;425;1034", "wc_reply_reviewers": "0;0;0;192", "wc_reply_authors": "426;581;353;949", "reply_reviewers": "0;0;0;2", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.25, 49.996874902337645 ], "wc_summary_review_avg": [ 49.75, 29.13224158900238 ], "wc_main_review_avg": [ 424.25, 208.48905846590608 ], "wc_review_avg": [ 555.25, 277.09869631595166 ], "wc_reply_reviewers_avg": [ 48.0, 83.13843876330611 ], "wc_reply_authors_avg": [ 577.25, 229.87863645845823 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4061811972299616, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4082085887521008638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "F2r3wYar3Py", "title": "Learning from One and Only One Shot", "track": "main", "status": "Reject", "tldr": "", "abstract": "Humans can generalize from one or a few examples, and even from very little pre-training on similar tasks. Machine learning (ML) algorithms, however, typically require large amounts of data to either learn or pre-learn to transfer.
Inspired by nativism, we directly model very basic human innate priors in abstract visual tasks like character or doodle recognition. The result is a white-box model that learns transformation-based topological similarity akin to how a human would naturally and unconsciously ``distort'' an object when first seeing it. Using the simple Nearest-Neighbor classifier in this similarity space, our model approaches human-level character recognition using only one to ten examples per class and nothing else (no pre-training). This is in contrast to one-shot and few-shot settings that require significant pre-training. On standard benchmarks including MNIST, EMNIST-letters, and the harder Omniglot challenge, our model outperforms both neural-network-based and classical ML methods in the ``tiny-data'' regime, including few-shot learning models that use an extra background set to perform transfer learning. Moreover, mimicking simple clustering methods like $k$-means but in a non-Euclidean space, our model can adapt to an unsupervised setting and generate human-interpretable archetypes of a class.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haizi Yu;Igor Mineyev;Lav R. Varshney;James Evans", "authorids": "~Haizi_Yu1;~Igor_Mineyev1;~Lav_R._Varshney1;~James_Evans1", "gender": ";;M;M", "homepage": ";;http://www.varshney.csl.illinois.edu/;https://macss.uchicago.edu/directory/James-Evans", "dblp": ";;36/4028;", "google_scholar": ";;https://scholar.google.com.tw/citations?user=JIJGu30AAAAJ;kV4N4zoAAAAJ", "orcid": ";;0000-0003-2798-5308;", "linkedin": ";;;", "or_profile": "~Haizi_Yu1;~Igor_Mineyev1;~Lav_R._Varshney1;~James_Evans1", "aff": ";University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Chicago", "aff_domain": ";;illinois.edu;uchicago.edu", "position": ";;Associate Professor;Full Professor", "bibtex": "@misc{\nyu2022learning,\ntitle={Learning from One and Only One Shot},\nauthor={Haizi Yu and Igor Mineyev and Lav R. 
Varshney and James Evans},\nyear={2022},\nurl={https://openreview.net/forum?id=F2r3wYar3Py}\n}", "github": "", "project": "", "reviewers": "CPtf;7LJy;MBhM;gUCx", "site": "https://openreview.net/forum?id=F2r3wYar3Py", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;4;3;5", "correctness": "4;3;4;3", "technical_novelty": "4;4;3;3", "empirical_novelty": "4;2;1;3", "wc_summary_paper": "75;150;67;72", "wc_summary_review": "78;31;41;51", "wc_main_review": "405;527;257;185", "wc_review": "558;708;365;308", "wc_reply_reviewers": "0;812;0;96", "wc_reply_authors": "734;2823;1238;1118", "reply_reviewers": "0;3;0;1", "reply_authors": "1;5;2;3", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 91.0, 34.18332927027442 ], "wc_summary_review_avg": [ 50.25, 17.512495538900218 ], "wc_main_review_avg": [ 343.5, 132.3433035706756 ], "wc_review_avg": [ 484.75, 158.7346449266826 ], "wc_reply_reviewers_avg": [ 227.0, 340.0161760857857 ], "wc_reply_authors_avg": [ 1478.25, 798.3985142145493 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3189083423114289132&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.uchicago.edu", "aff_unique_abbr": "UIUC;UChicago", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Causal Contextual Bandits with Targeted Interventions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6917", "id": "F5Em8ASCosV", "poster": "", "openreview": "https://openreview.net/forum?id=F5Em8ASCosV", "slides": "https://iclr.cc/virtual/2022/poster/6917", "video": "https://iclr.cc/virtual/2022/poster/6917", "author_site": "Chandrasekar Subramanian, Balaraman Ravindran", "tldr": "", "abstract": "We study a contextual bandit setting where the learning agent has the ability to perform interventions on targeted subsets of the population, apart from possessing qualitative causal side-information. This novel formalism captures intricacies in real-world scenarios such as software product experimentation where targeted experiments can be conducted. However, this fundamentally changes the set of options that the agent has, compared to standard contextual bandit settings, necessitating new techniques. This is also the first work that integrates causal side-information in a contextual bandit setting, where the agent aims to learn a policy that maps contexts to arms (as opposed to just identifying one best arm). We propose a new algorithm, which we show empirically performs better than baselines on experiments that use purely synthetic data and on real world-inspired experiments. 
We also prove a bound on regret that theoretically guards performance.", "keywords": "causality;contextual bandits;causal inference;bandits", "primary_area": "", "supplementary_material": "/attachment/da21059b96b7d6d0110fcc48f3cee6ee8fc9bb80.zip", "author": "Chandrasekar Subramanian;Balaraman Ravindran", "authorids": "~Chandrasekar_Subramanian1;~Balaraman_Ravindran1", "gender": "M;M", "homepage": ";http://www.cse.iitm.ac.in/~ravi", "dblp": "46/10761.html;69/2281", "google_scholar": "https://scholar.google.co.in/citations?user=l3_ndJcAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0004-8572-125X;0000-0002-5364-7639", "linkedin": "chandrasekar-subramanian-50928719;ravindran-balaraman-427a307", "or_profile": "~Chandrasekar_Subramanian1;~Balaraman_Ravindran1", "aff": "Microsoft;Indian Institute of Technology Madras", "aff_domain": "microsoft.com;iitm.ac.in", "position": "Senior Applied Researcher;Full Professor", "bibtex": "@inproceedings{\nsubramanian2022causal,\ntitle={Causal Contextual Bandits with Targeted Interventions},\nauthor={Chandrasekar Subramanian and Balaraman Ravindran},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=F5Em8ASCosV}\n}", "github": "", "project": "", "reviewers": "DesA;swHk;i8oB;cUNV", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "76;61;37;87", "wc_summary_review": "29;60;28;44", "wc_main_review": "323;151;279;364", "wc_review": "428;272;344;495", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.25, 18.73999733191016 ], "wc_summary_review_avg": [ 40.25, 13.045593125649749 ], "wc_main_review_avg": [ 279.25, 79.91362524626197 ], "wc_review_avg": [ 384.75, 84.25964336501787 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16163829726660196686&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=F5Em8ASCosV", "email": "microsoft.com;iitm.ac.in", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;Indian Institute of Technology Madras", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.iitm.ac.in", "aff_unique_abbr": "Microsoft;IIT Madras", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madras", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;India" }, { "id": "F6S_3RSWFI7", "title": "Revisiting the Monotonicity Constraint in Cooperative Multi-Agent Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "QMIX, a popular MARL algorithm based on the monotonicity constraint, has been used as a baseline for the benchmark environments, such as Starcraft Multi-Agent Challenge (SMAC), Predator-Prey (PP). 
\nRecent variants of QMIX target relaxing the monotonicity constraint of QMIX to improve the expressive power of QMIX, allowing for performance improvement in SMAC. \nHowever, we find that such performance improvements of the variants are significantly affected by various implementation tricks. In this paper, we revisit the monotonicity constraint of QMIX,\n(1) we design a novel model RMC to further investigate the monotonicity constraint; the results show that monotonicity constraint can improve sample efficiency in some purely cooperative tasks;\n(2) we then re-evaluate the performance of QMIX and these variants by a grid hyperparameter search for the tricks; the results show QMIX achieves the best performance among them, achieving SOTA performance on SMAC and PP;\n(3) we analyze the monotonic mixing network from a theoretical perspective and show that it can represent any tasks which can be interpreted as purely cooperative. These analyses demonstrate that relaxing the monotonicity constraint of the mixing network will not always improve the performance of QMIX, which breaks our previous impressions of the monotonicity constraints.", "keywords": "multi-agent;reinforcement learning;monotonicity constraint", "primary_area": "", "supplementary_material": "/attachment/a03f1f1340ec7a5db7a300a2786acc7c25eca339.zip", "author": "Jian Hu;Siyang Jiang;Seth Austin Harding;Haibin Wu;Shih-wei Liao", "authorids": "~Jian_Hu5;~Siyang_Jiang1;~Seth_Austin_Harding1;~Haibin_Wu1;~Shih-wei_Liao1", "gender": "M;M;M;M;M", "homepage": "https://hujian.website;http://syjiang.com/;;https://hbwu-ntu.github.io/;https://www.csie.ntu.edu.tw/~liao/", "dblp": ";;;151/8366.html;", "google_scholar": "-xt5vGkAAAAJ;8qJS75YAAAAJ;;-bB-WHEAAAAJ;", "orcid": ";0000-0002-9926-6532;;0000-0001-7166-5534;", "linkedin": ";;sethaustin/;haibin-wu-479a39252/;", "or_profile": "~Jian_Hu5;~Siyang_Jiang1;~Seth_Austin_Harding1;~Haibin_Wu1;~Shih-wei_Liao1", "aff": "Northwest Polytechnical University Xi'an;The Chinese University of Hong Kong;National Taiwan University;National Taiwan University;Department of computer science and informational engineering, National Taiwan University", "aff_domain": "nwpu.edu.cn;ie.cuhk.edu;ntu.edu.tw;ntu.edu.tw;csie.ntu.edu.tw", "position": "Undergrad student;PhD student;Undergrad student;PhD student;Full Professor", "bibtex": "@misc{\nhu2022revisiting,\ntitle={Revisiting the Monotonicity Constraint in Cooperative Multi-Agent Reinforcement Learning},\nauthor={Jian Hu and Siyang Jiang and Seth Austin Harding and Haibin Wu and Shih-wei Liao},\nyear={2022},\nurl={https://openreview.net/forum?id=F6S_3RSWFI7}\n}", "github": "", "project": "", "reviewers": "KDJA;gQyw;zNcZ", "site": "https://openreview.net/forum?id=F6S_3RSWFI7", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;3;3", "correctness": "2;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "90;70;65", "wc_summary_review": "107;13;33", "wc_main_review": "1303;247;146", "wc_review": "1500;330;244", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 75.0, 10.801234497346433 ], "wc_summary_review_avg": [ 51.0, 40.431011200150145 ], "wc_main_review_avg": [ 
565.3333333333334, 523.236296736209 ], "wc_review_avg": [ 691.3333333333334, 572.8905266763935 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17774727140331904963&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Northwest Polytechnical University;Chinese University of Hong Kong;National Taiwan University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.nwpu.edu.cn;https://www.cuhk.edu.hk;https://www.ntu.edu.tw", "aff_unique_abbr": "NWPU;CUHK;NTU", "aff_campus_unique_index": "0;1;2;2;2", "aff_campus_unique": "Xi'an;Hong Kong SAR;Taiwan", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "How Attentive are Graph Attention Networks?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6366", "id": "F72ximsx7C1", "poster": "", "openreview": "https://openreview.net/forum?id=F72ximsx7C1", "slides": "https://iclr.cc/virtual/2022/poster/6366", "video": "https://iclr.cc/virtual/2022/poster/6366", "author_site": "Shaked Brody, Uri Alon, Eran Yahav", "tldr": "", "abstract": "Graph Attention Networks (GATs) are one of the most popular GNN architectures and are considered as the state-of-the-art architecture for representation learning with graphs. In GAT, every node attends to its neighbors given its own representation as the query.\nHowever, in this paper we show that GAT computes a very limited kind of attention: the ranking of the attention scores is unconditioned on the query node. We formally define this restricted kind of attention as static attention and distinguish it from a strictly more expressive dynamic attention.\nBecause GATs use a static attention mechanism, there are simple graph problems that GAT cannot express: in a controlled problem, we show that static attention hinders GAT from even fitting the training data. \nTo remove this limitation, we introduce a simple fix by modifying the order of operations and propose GATv2: a dynamic graph attention variant that is strictly more expressive than GAT. We perform an extensive evaluation and show that GATv2 outperforms GAT across 12 OGB and other benchmarks while we match their parametric costs. \nOur code is available at https://github.com/tech-srl/how_attentive_are_gats . 
GATv2 is available as part of the PyTorch Geometric library, the Deep Graph Library, and the TensorFlow GNN library.", "keywords": "graph attention networks;dynamic attention;GAT;GNN", "primary_area": "", "supplementary_material": "/attachment/19dbb6aeb236683cd556af10c1238bfb14434f2b.zip", "author": "Shaked Brody;Uri Alon;Eran Yahav", "authorids": "~Shaked_Brody1;~Uri_Alon1;~Eran_Yahav1", "gender": "M;M;M", "homepage": "https://shakedbr.cswp.cs.technion.ac.il/;https://urialon.ml/;http://www.cs.technion.ac.il/~yahave/", "dblp": "245/4818;40/2257-2;54/5133", "google_scholar": "https://scholar.google.co.il/citations?user=02jdt98AAAAJ;https://scholar.google.co.il/citations?user=QBn7vq8AAAAJ;https://scholar.google.com.tw/citations?user=grAfX0MAAAAJ", "orcid": ";;", "linkedin": ";https://linkedin.com/in/urialon1/;", "or_profile": "~Shaked_Brody1;~Uri_Alon1;~Eran_Yahav1", "aff": "Technion;Carnegie Mellon University;Technion, Technion", "aff_domain": "technion.ac.il;cmu.edu;technion.ac.il", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nbrody2022how,\ntitle={How Attentive are Graph Attention Networks? },\nauthor={Shaked Brody and Uri Alon and Eran Yahav},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=F72ximsx7C1}\n}", "github": "", "project": "", "reviewers": "BePW;MFSQ;jXKx;QN6d", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "47;37;31;65", "wc_summary_review": "12;20;41;46", "wc_main_review": "107;231;75;304", "wc_review": "166;288;147;415", "wc_reply_reviewers": "243;0;0;69", "wc_reply_authors": "1039;940;424;875", "reply_reviewers": "2;0;0;1", "reply_authors": "3;2;1;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 45.0, 12.884098726725126 ], "wc_summary_review_avg": [ 29.75, 14.148763196831021 ], "wc_main_review_avg": [ 179.25, 92.64009661048503 ], "wc_review_avg": [ 254.0, 107.55231285286244 ], "wc_reply_reviewers_avg": [ 78.0, 99.34032413879069 ], "wc_reply_authors_avg": [ 819.5, 235.69100534386118 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.28867513459481287, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 1723, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5656297883023258429&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=F72ximsx7C1", "email": "technion.ac.il;cmu.edu;technion.ac.il", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Technion - Israel Institute of Technology;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.cmu.edu", "aff_unique_abbr": "Technion;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Israel;United States" }, { "id": "F7_odJIeQ26", "title": "Pretrained Language Models are Symbolic Mathematics Solvers too!", "track": "main", "status": "Reject", "tldr": "", "abstract": "Solving symbolic mathematics has always been in
the arena of human ingenuity that needs compositional reasoning and recurrence. However, recent studies have shown that large-scale language models such as transformers are universal and surprisingly can be trained as a sequence-to-sequence task to solve complex mathematical equations. These large transformer models need humongous amounts of training data to generalize to unseen symbolic mathematics problems. In this paper, we present a sample-efficient way of solving the symbolic tasks by first pretraining the transformer model with language translation and then fine-tuning the pretrained transformer model to solve the downstream task of symbolic mathematics. We achieve comparable accuracy on the integration task with our pretrained model while using around $1.5$ orders of magnitude fewer training samples than the state-of-the-art deep learning method for symbolic mathematics. The test accuracy on differential equation tasks is considerably lower compared with integration, as they need higher-order recursions that are not present in language translations. We pretrain our model with different pairs of language translations. Our results show language bias in solving symbolic mathematics tasks. Finally, we study the robustness of the fine-tuned model on symbolic math tasks against distribution shift, and our approach generalizes better in distribution shift scenarios for function integration.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/b90c427934014a99954b56f7adcfbbce5438fe72.zip", "author": "Kimia Noorbakhsh;Modar Sulaiman;Mahdi Sharifi;KALLOL ROY;Pooyan Jamshidi", "authorids": "~Kimia_Noorbakhsh1;~Modar_Sulaiman1;~Mahdi_Sharifi1;~KALLOL_ROY1;~Pooyan_Jamshidi1", "gender": "F;M;M;M;M", "homepage": ";https://www.linkedin.com/in/modar-sulaiman-baaa5a150/;https://github.com/Sharifi-Mahdi;https://sail.cs.ut.ee/;http://pooyanjamshidi.github.io", "dblp": "304/4444;;;;https://dblp.uni-trier.de/pers/hd/j/Jamshidi:Pooyan", "google_scholar": "https://scholar.google.ca/citations?user=H9K3_hYAAAAJ;;;https://scholar.google.co.kr/citations?user=SnrokbkAAAAJ;Jre2RUQAAAAJ", "orcid": ";;;;0000-0002-9342-0703", "linkedin": "kimia-noorbakhsh/;modar-sulaiman-baaa5a150/;;kallolr/;pooyanjamshidi", "or_profile": "~Kimia_Noorbakhsh1;~Modar_Sulaiman1;~Mahdi_Sharifi1;~KALLOL_ROY1;~Pooyan_Jamshidi1", "aff": "Institute of Science and Technology;University of Tartu;University of South Carolina;institute of computer science, University of Tartu;University of South Carolina", "aff_domain": "ist.ac.at;ut.ee;sc.edu;cs.ut.ee;sc.edu", "position": "Intern;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nnoorbakhsh2022pretrained,\ntitle={Pretrained Language Models are Symbolic Mathematics Solvers too!},\nauthor={Kimia Noorbakhsh and Modar Sulaiman and Mahdi Sharifi and KALLOL ROY and Pooyan Jamshidi},\nyear={2022},\nurl={https://openreview.net/forum?id=F7_odJIeQ26}\n}", "github": "", "project": "", "reviewers": "3m2T;2CuZ;oP2C;4ER6", "site": "https://openreview.net/forum?id=F7_odJIeQ26", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "4;4;4;5", "correctness": "2;2;3;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;3;1;4", "wc_summary_paper": "44;78;35;67", "wc_summary_review": "29;75;22;86", "wc_main_review": "265;384;237;897", "wc_review": "338;537;294;1050", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "613;1053;649;1030", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;3;3", "recommendation_avg": [ 3.25,
1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 56.0, 17.24818831066034 ], "wc_summary_review_avg": [ 53.0, 27.883686987197372 ], "wc_main_review_avg": [ 445.75, 266.31126055801695 ], "wc_review_avg": [ 554.75, 300.2327222339364 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 836.25, 205.804974429677 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8892972917998875, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2803819095976771157&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;1;2", "aff_unique_norm": "Institute of Science and Technology;University of Tartu;University of South Carolina", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.ut.ee;https://www.sc.edu", "aff_unique_abbr": ";UT;USC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;1;2", "aff_country_unique": ";Estonia;United States" }, { "id": "F7nD--1JIC", "title": "Shapley-NAS: Discovering Operation Contribution for Neural Architecture Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we propose a Shapley value based operation contribution evaluation method (Shapley-NAS) for neural architecture search. Differentiable architecture search (DARTS) acquires the expected architectures by optimizing the architecture parameters with gradient descent, which benefits from the high efficiency due to the significantly reduced search cost. However, DARTS leverages the learnable architecture parameters of the supernet to represent the operation importance during the search process, which fails to reveal the actual impacts of operations on the task performance and therefore harms the effectiveness of obtained architectures. On the contrary, we evaluate the direct influence of operations on accuracy via Shapley value for supernet optimization and architecture discretization, so that the optimal architectures are acquired by selecting the operations that contribute significantly to the tasks. Specifically, we iteratively employ Monte-Carlo sampling based algorithm with early truncation to efficiently approximate the Shapley value of operations, and update weights of the supernet whose architecture parameters are assigned with the operation contribution evaluated by Shapley value. At the end of the search process, operations with the largest Shapley value are preserved to form the final architecture. 
Extensive experiments on CIFAR-10 and ImageNet for image classification and on NAS-Bench-201 for optimal architecture search show that our Shapley-NAS outperforms the state-of-the-art methods by a sizable margin with light search cost.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/e084a9769a72a246ede9ff563934640c00071eb0.zip", "author": "Han Xiao;Ziwei Wang;Jiwen Lu;Jie Zhou", "authorids": "~Han_Xiao3;~Ziwei_Wang2;~Jiwen_Lu1;~Jie_Zhou3", "gender": "F;M;M;M", "homepage": "https://www.researchgate.net/profile/Han_Xiao42;https://ziweiwangthu.github.io/;http://ivg.au.tsinghua.edu.cn/Jiwen_Lu/;https://www.tsinghua.edu.cn/publish/auen/1713/2011/20110506105532098625469/20110506105532098625469_.html", "dblp": "98/627-10;136/5574-1;http://dblp.uni-trier.de/pers/hd/l/Lu:Jiwen;00/5012-1", "google_scholar": "N-u2i-QAAAAJ;cMTW09EAAAAJ;TN8uDQoAAAAJ;", "orcid": ";0000-0001-9225-8495;0000-0002-6121-5529;", "linkedin": ";;;", "or_profile": "~Han_Xiao3;~Ziwei_Wang2;~Jiwen_Lu1;~Jie_Zhou3", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nxiao2022shapleynas,\ntitle={Shapley-{NAS}: Discovering Operation Contribution for Neural Architecture Search},\nauthor={Han Xiao and Ziwei Wang and Jiwen Lu and Jie Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=F7nD--1JIC}\n}", "github": "", "project": "", "reviewers": "JRFh;gE7m;oJRu;iEWL", "site": "https://openreview.net/forum?id=F7nD--1JIC", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "2;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "71;47;82;90", "wc_summary_review": "87;28;87;76", "wc_main_review": "470;163;2053;200", "wc_review": "628;238;2222;366", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;196;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;1;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 72.5, 16.194134740701646 ], "wc_summary_review_avg": [ 69.5, 24.37724348649781 ], "wc_main_review_avg": [ 721.5, 777.8221197677526 ], "wc_review_avg": [ 863.5, 796.8279299823771 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 49.0, 84.870489570875 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10417871289325994119&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "F9McnN1dITx", "title": "Evolving Neural Update Rules for Sequence Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the problem of searching, end to end, for effective weight and activation update rules governing online learning of a 
recurrent network on problems of character sequence memorisation and prediction. We experiment with a number of functional forms and find that the performance depends on them significantly. We find update rules that allow us to scale to a much larger number of recurrent units and much longer sequence lengths than has been achieved with this approach previously. We also find that natural evolution strategies significantly outperform meta-gradients on this problem, aligning with previous studies suggesting that such evolutionary strategies are more robust than gradient back-propagation over sequences with thousands of steps.", "keywords": "Neural Update Rules;Evolution", "primary_area": "", "supplementary_material": "", "author": "Karol Gregor;Peter Conway Humphreys", "authorids": "~Karol_Gregor1;~Peter_Conway_Humphreys1", "gender": ";M", "homepage": ";", "dblp": "51/7660;", "google_scholar": ";W_BEUq8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Karol_Gregor1;~Peter_Conway_Humphreys1", "aff": "Google;Google DeepMind", "aff_domain": "google.com;deepmind.com", "position": "Researcher;Research Scientist", "bibtex": "@misc{\ngregor2022evolving,\ntitle={Evolving Neural Update Rules for Sequence Learning},\nauthor={Karol Gregor and Peter Conway Humphreys},\nyear={2022},\nurl={https://openreview.net/forum?id=F9McnN1dITx}\n}", "github": "", "project": "", "reviewers": "92Z4;3DR8;2qrk", "site": "https://openreview.net/forum?id=F9McnN1dITx", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;3;3", "correctness": "2;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "88;57;66", "wc_summary_review": "61;42;26", "wc_main_review": "1017;309;525", "wc_review": "1166;408;617", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 70.33333333333333, 13.021349989749739 ], "wc_summary_review_avg": [ 43.0, 14.30617582258329 ], "wc_main_review_avg": [ 617.0, 296.2701469942593 ], "wc_review_avg": [ 730.3333333333334, 319.66058388373267 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3qzU7QyaPKsJ:scholar.google.com/&scioq=Evolving+Neural+Update+Rules+for+Sequence+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "FASW5Ed837", "title": "Bandwidth-based Step-Sizes for Non-Convex Stochastic Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many popular learning-rate schedules for deep neural networks combine a decaying trend with local perturbations that attempt to escape saddle points and bad local minima.
We derive convergence guarantees for bandwidth-based step-sizes, a general class of learning-rates that are allowed to vary in a banded region. This framework includes many popular cyclic and non-monotonic step-sizes for which no theoretical guarantees were previously known. We provide worst-case guarantees for SGD on smooth non-convex problems under several bandwidth-based step sizes, including stagewise $1/\\sqrt{t}$ and the popular \\emph{step-decay} (``constant and then drop by a constant\u2019\u2019), which is also shown to be optimal. Moreover, we show that its momentum variant converges as fast as SGD with the bandwidth-based step-decay step-size. Finally, we propose novel step-size schemes in the bandwidth-based family and verify their efficiency on several deep neural network training tasks.", "keywords": "Stochastic gradient descent;bandwidth-based step size;non-asymptotic analysis", "primary_area": "", "supplementary_material": "/attachment/6f034b969c7154f6989f61aabb8e2a6ec4186a1d.zip", "author": "Xiaoyu Wang;Mikael Johansson", "authorids": "~Xiaoyu_Wang4;~Mikael_Johansson3", "gender": "F;M", "homepage": "https://xiaoyuwang2821.github.io/xiaoyu_wang.github.io/;https://people.KTH.se/~mikaelj", "dblp": "58/4775-8;53/764-1", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;wQSRT18AAAAJ", "orcid": "0000-0003-4102-4909;", "linkedin": ";", "or_profile": "~Xiaoyu_Wang4;~Mikael_Johansson3", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se;kth.se", "position": "Postdoc;Full Professor", "bibtex": "@misc{\nwang2022bandwidthbased,\ntitle={Bandwidth-based Step-Sizes for Non-Convex Stochastic Optimization},\nauthor={Xiaoyu Wang and Mikael Johansson},\nyear={2022},\nurl={https://openreview.net/forum?id=FASW5Ed837}\n}", "github": "", "project": "", "reviewers": "xLQE;wt4Y;ij2i;p6jS", "site": "https://openreview.net/forum?id=FASW5Ed837", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "127;173;42;88", "wc_summary_review": "78;46;138;41", "wc_main_review": "484;957;219;161", "wc_review": "689;1176;399;290", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.5, 48.32442446630896 ], "wc_summary_review_avg": [ 75.75, 38.64178437908891 ], "wc_main_review_avg": [ 455.25, 314.23587875988954 ], "wc_review_avg": [ 638.5, 342.8808101950297 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4308527161201953826&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0;0", 
"aff_country_unique": "Sweden" }, { "id": "FCxWzalZp9N", "title": "AF$_2$: Adaptive Focus Framework for Aerial Imagery Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "As a specific semantic segmentation task, aerial imagery segmentation has been widely employed in high spatial resolution (HSR) remote sensing images understanding. Besides common issues (e.g. large scale variation) faced by general semantic segmentation tasks, aerial imagery segmentation has some unique challenges, the most critical one among which lies in foreground-background imbalance. There have been some recent efforts that attempt to address this issue by proposing sophisticated neural network architectures, since they can be used to extract informative multi-scale feature representations and increase the discrimination of object boundaries. Nevertheless, many of them merely utilize those multi-scale representations in ad-hoc measures but disregard the fact that the semantic meaning of objects with various sizes could be better identified via receptive fields of diverse ranges. In this paper, we propose Adaptive Focus Framework (AF$_2$), which adopts a hierarchical segmentation procedure and focuses on adaptively utilizing multi-scale representations generated by widely adopted neural network architectures. Particularly, a learnable module, called Adaptive Confidence Mechanism (ACM), is proposed to determine which scale of representation should be used for the segmentation of different objects. Comprehensive experiments show that AF$_2$ has significantly improved the accuracy on three widely used aerial benchmarks, as fast as the mainstream method.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/fb504252909dee35087f31c511291ea792bbe4d2.zip", "author": "Lin Huang;Qiyuan Dong;Jia Zhang;Lijun Wu;Jiang Bian;Tie-Yan Liu", "authorids": "~Lin_Huang2;~Qiyuan_Dong1;~Jia_Zhang1;~Lijun_Wu1;~Jiang_Bian1;~Tie-Yan_Liu1", "gender": "M;M;M;M;M;M", "homepage": "https://www.researchgate.net/profile/Lin-Huang-3;;;https://apeterswu.github.io/;https://sites.google.com/view/jiangbian;http://member.acm.org/~tieyanliu", "dblp": ";227/4452;80/2266;68/1284-3;09/851-2.html;l/TieYanLiu", "google_scholar": ";;https://scholar.google.com.hk/citations?user=ElUqcbEAAAAJ;https://scholar.google.com/citations?hl=en;pZBEnY8AAAAJ;Nh832fgAAAAJ", "orcid": ";;;0000-0002-3530-590X;0000-0002-9472-600X;0000-0002-0476-8020", "linkedin": "linhuang6385/;;;lijun-wu-59340478/;jbian/;", "or_profile": "~Lin_Huang2;~Qiyuan_Dong1;~Jia_Zhang1;~Lijun_Wu1;~Jiang_Bian1;~Tie-Yan_Liu1", "aff": "Microsoft;;Microsoft;Microsoft Research;Microsoft;Microsoft", "aff_domain": "microsoft.com;;microsoft.com;microsoft.com;microsoft.com;microsoft.com", "position": "Researcher;;Associate Researcher;Researcher;Partner Research Manager;Distinguished Scientist", "bibtex": "@misc{\nhuang2022af,\ntitle={{AF}\\$\\_2\\$: Adaptive Focus Framework for Aerial Imagery Segmentation},\nauthor={Lin Huang and Qiyuan Dong and Jia Zhang and Lijun Wu and Jiang Bian and Tie-Yan Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=FCxWzalZp9N}\n}", "github": "", "project": "", "reviewers": "hDsQ;VRMT;464N;wDBh", "site": "https://openreview.net/forum?id=FCxWzalZp9N", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;5", "correctness": "3;4;3;4", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "56;56;101;65", "wc_summary_review": "48;157;19;15", "wc_main_review": "409;62;261;246", "wc_review": 
"513;275;381;326", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.5, 18.553975315279473 ], "wc_summary_review_avg": [ 59.75, 57.57332281534565 ], "wc_main_review_avg": [ 244.5, 123.12696698936428 ], "wc_review_avg": [ 373.75, 88.70562270792084 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10003211169846635513&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Corporation", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "FD8xldQIgdq", "title": "Robust Models Are More Interpretable Because Attributions Look Normal", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent work has found that adversarially-robust deep networks used for image classification are more interpretable: their feature attributions tend to be sharper, and are more concentrated on the objects associated with the image's ground-truth class. We show that smooth decision boundaries play an important role in this enhanced interpretability, as the model's input gradients around data points will more closely align with boundaries' normal vectors when they are smooth. Thus, because robust models have smoother boundaries, the results of gradient-based attribution methods will capture more accurate information about nearby decision boundaries. This understanding of robust interpretability leads to our second contribution: \\emph{boundary attributions}, which aggregate information about the normal vectors of local decision boundaries to explain a classification outcome. 
We show that by leveraging the key factors underpinning robust interpretability, boundary attributions produce sharper, more concentrated visual explanations---even on non-robust models.", "keywords": "Explainability;Decision Boundary;Attribution;Adversarial Robustness", "primary_area": "", "supplementary_material": "", "author": "Zifan Wang;Matt Fredrikson;Anupam Datta", "authorids": "~Zifan_Wang1;~Matt_Fredrikson1;~Anupam_Datta1", "gender": "M;M;M", "homepage": "https://www.zifanw.net;https://cs.cmu.edu/~mfredrik;http://www.andrew.cmu.edu/user/danupam/", "dblp": ";38/2612;d/AnupamDatta", "google_scholar": "HJOP3wMAAAAJ;https://scholar.google.com.tw/citations?user=tMYCvLAAAAAJ;", "orcid": ";;", "linkedin": "zifan-wang-sail/;;", "or_profile": "~Zifan_Wang1;~Matt_Fredrikson1;~Anupam_Datta1", "aff": "Carnegie Mellon University;Carnegie Mellon University;", "aff_domain": "cmu.edu;cmu.edu;", "position": "PhD student;Associate Professor;", "bibtex": "@misc{\nwang2022robust,\ntitle={Robust Models Are More Interpretable Because Attributions Look Normal},\nauthor={Zifan Wang and Matt Fredrikson and Anupam Datta},\nyear={2022},\nurl={https://openreview.net/forum?id=FD8xldQIgdq}\n}", "github": "", "project": "", "reviewers": "oVRP;t25u;qvFb;NMCL", "site": "https://openreview.net/forum?id=FD8xldQIgdq", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;3;4;3", "correctness": "2;3;4;4", "technical_novelty": "4;2;2;3", "empirical_novelty": "0;2;2;0", "wc_summary_paper": "98;46;139;74", "wc_summary_review": "418;14;34;175", "wc_main_review": "519;65;324;983", "wc_review": "1035;125;497;1232", "wc_reply_reviewers": "590;47;37;713", "wc_reply_authors": "1966;346;1102;2791", "reply_reviewers": "2;1;1;2", "reply_authors": "4;1;2;6", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 89.25, 34.11286414243166 ], "wc_summary_review_avg": [ 160.25, 161.2302313463576 ], "wc_main_review_avg": [ 472.75, 335.7382723193768 ], "wc_review_avg": [ 722.25, 437.35076026 ], "wc_reply_reviewers_avg": [ 346.75, 307.85741423587643 ], "wc_reply_authors_avg": [ 1551.25, 916.9856528321476 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 3.25, 1.920286436967152 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14430069598728045155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "TPU-GAN: Learning temporal coherence from dynamic point cloud sequences", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6936", "id": "FEBFJ98FKx", "poster": "", "openreview": "https://openreview.net/forum?id=FEBFJ98FKx", "slides": "https://iclr.cc/virtual/2022/poster/6936", "video": "https://iclr.cc/virtual/2022/poster/6936", "author_site": "Zijie Li, Tianqin Li, Amir Barati Farimani", "tldr": "", "abstract": "A point cloud sequence is an important data representation that provides 
flexible shape and motion information. Prior work demonstrates that incorporating scene flow information into the loss can make a model learn temporally coherent feature spaces. However, it is prohibitively expensive to acquire point correspondence information across frames in real-world environments. In this work, we propose a super-resolution generative adversarial network (GAN) for upsampling dynamic point cloud sequences, which does not require point correspondence annotation. Our model, Temporal Point cloud Upsampling GAN (TPU-GAN), can implicitly learn the underlying temporal coherence from a point cloud sequence, which in turn guides the generator to produce temporally coherent output. In addition, we propose a learnable masking module to adapt the upsampling ratio according to the point distribution. We conduct extensive experiments on point cloud sequences from two different domains: particles in a fluid dynamical system and scanned human action data. The quantitative and qualitative evaluation demonstrates the effectiveness of our method on upsampling tasks as well as learning temporal coherence from irregular point cloud sequences.", "keywords": "Point cloud super resolution;Temporal learning;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Zijie Li;Tianqin Li;Amir Barati Farimani", "authorids": "~Zijie_Li2;~Tianqin_Li2;~Amir_Barati_Farimani2", "gender": "M;M;M", "homepage": "https://github.com/Crazy-Jack;https://sites.google.com/view/barati;https://zijieli-jlee.github.io/", "dblp": "294/5434;;", "google_scholar": "sQjEQEUAAAAJ;aH52nxkAAAAJ;ji7TXTMAAAAJ", "orcid": "0000-0003-2567-8283;0000-0002-2952-8576;0000-0002-8566-7538", "linkedin": "tianqin-li-b16299170/;amir-barati-farimani-a0b74169/;", "or_profile": "~Tianqin_Li2;~Amir_Barati_Farimani2;~zijie_li1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;andrew.cmu.edu;cmu.edu", "position": "PhD student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nli2022tpugan,\ntitle={{TPU}-{GAN}: Learning temporal coherence from dynamic point cloud sequences},\nauthor={Zijie Li and Tianqin Li and Amir Barati Farimani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FEBFJ98FKx}\n}", "github": "", "project": "", "reviewers": "evrR;gFvn;7Wbg;dDgo;waNz", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "4;4;3;3;4", "correctness": "3;4;4;3;3", "technical_novelty": "3;2;2;2;3", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "131;97;97;29;66", "wc_summary_review": "39;25;90;61;23", "wc_main_review": "171;292;604;549;315", "wc_review": "341;414;791;639;404", "wc_reply_reviewers": "31;43;102;156;0", "wc_reply_authors": "306;367;695;1139;718", "reply_reviewers": "1;1;1;1;0", "reply_authors": "1;1;1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 84.0, 34.3394816501356 ], "wc_summary_review_avg": [ 47.6, 25.168233946782994 ], "wc_main_review_avg": [ 386.2, 163.82600526168 ], "wc_review_avg": [ 517.8, 169.88866942795215 ], "wc_reply_reviewers_avg": [ 66.4, 55.68698232082611 ], "wc_reply_authors_avg": [ 645.0, 298.01677805116947 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 17, 0 ], 
"authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2827518776693752617&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=FEBFJ98FKx", "email": "andrew.cmu.edu;andrew.cmu.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "RelaxLoss: Defending Membership Inference Attacks without Losing Utility", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5964", "id": "FEDfGWVZYIn", "poster": "", "openreview": "https://openreview.net/forum?id=FEDfGWVZYIn", "slides": "https://iclr.cc/virtual/2022/poster/5964", "video": "https://iclr.cc/virtual/2022/poster/5964", "author_site": "Dingfan Chen, Ning Yu, Mario Fritz", "tldr": "", "abstract": "As a long-term threat to the privacy of training data, membership inference attacks (MIAs) emerge ubiquitously in machine learning models.\nExisting works evidence strong connection between the distinguishability of the training and testing loss distributions and the model's vulnerability to MIAs. Motivated by existing results, we propose a novel training framework based on a relaxed loss ($\\textbf{RelaxLoss}$) with a more achievable learning target, which leads to narrowed generalization gap and reduced privacy leakage. RelaxLoss is applicable to any classification model with added benefits of easy implementation and negligible overhead. Through extensive evaluations on five datasets with diverse modalities (images, medical data, transaction records), our approach consistently outperforms state-of-the-art defense mechanisms in terms of resilience against MIAs as well as model utility. 
Our defense is the first that can withstand a wide range of attacks while preserving (or even improving) the target model's utility.", "keywords": "membership inference attack;defense", "primary_area": "", "supplementary_material": "", "author": "Dingfan Chen;Ning Yu;Mario Fritz", "authorids": "~Dingfan_Chen1;~Ning_Yu2;~Mario_Fritz1", "gender": "F;;M", "homepage": "https://dingfanchen.github.io/homepage/;;https://cispa.saarland/group/fritz/", "dblp": "248/8198;;", "google_scholar": "iARn00oAAAAJ;;https://scholar.google.de/citations?user=4V1nNm4AAAAJ", "orcid": ";;", "linkedin": "dingfan-chen-44174012b/;;", "or_profile": "~Dingfan_Chen1;~Ning_Yu2;~Mario_Fritz1", "aff": "CISPA, saarland university, saarland informatics campus;;Saarland University", "aff_domain": "cispa.saarland;;uni-saarland.de", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nchen2022relaxloss,\ntitle={RelaxLoss: Defending Membership Inference Attacks without Losing Utility},\nauthor={Dingfan Chen and Ning Yu and Mario Fritz},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FEDfGWVZYIn}\n}", "github": "", "project": "", "reviewers": "sYYW;zx7i;A7eb", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;2", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;0;3", "wc_summary_paper": "97;100;53", "wc_summary_review": "35;48;49", "wc_main_review": "124;380;214", "wc_review": "256;528;316", "wc_reply_reviewers": "0;40;0", "wc_reply_authors": "272;666;368", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 83.33333333333333, 21.483844059096025 ], "wc_summary_review_avg": [ 44.0, 6.377042156569663 ], "wc_main_review_avg": [ 239.33333333333334, 106.03563342365413 ], "wc_review_avg": [ 366.6666666666667, 116.6799992381823 ], "wc_reply_reviewers_avg": [ 13.333333333333334, 18.856180831641264 ], "wc_reply_authors_avg": [ 435.3333333333333, 167.7484889814378 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6734125501477574187&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=FEDfGWVZYIn", "email": "cispa.saarland;;uni-saarland.de", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Saarland University", "aff_unique_dep": "CISPA", "aff_unique_url": "https://www.uni-saarland.de", "aff_unique_abbr": "Saarland U", "aff_campus_unique_index": "0", "aff_campus_unique": "Saarland Informatics Campus;", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "FFGDKzLasUa", "title": "Stochastic Deep Networks with Linear Competing Units for Model-Agnostic Meta-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work addresses meta-learning (ML) by considering deep networks with stochastic local winner-takes-all (LWTA) activations. This type of network unit results in sparse representations from each model layer, as the units are organized into blocks where only one unit generates a non-zero output. 
The main operating principle of the introduced units lies in stochastic arguments, as the network performs posterior sampling over competing units to select the winner. Therefore, the proposed networks are explicitly designed to extract input data representations of sparse stochastic nature, as opposed to the currently standard deterministic representation paradigm. We posit that these modeling arguments, inspired by Bayesian statistics, allow for more robust modeling when uncertainty is high due to the limited availability of task-related training data; this is exactly the case with ML, which is the focus of this work. At training time, we rely on the reparameterization trick for discrete distributions to perform reliable training via Monte-Carlo sampling. At inference time, we rely on Bayesian Model Averaging, which effectively averages over a number of sampled representations. As we experimentally show, our approach produces state-of-the-art predictive accuracy on standard few-shot image classification benchmarks; this is achieved without compromising computational efficiency.", "keywords": "Stochastic Deep Networks;LWTA;Meta-Learning", "primary_area": "", "supplementary_material": "/attachment/28c80372eac8edd2269727fc3c04989cc34cadc7.zip", "author": "Konstantinos \u0399. Kalais;Sotirios Chatzis", "authorids": "~Konstantinos_\u0399._Kalais1;~Sotirios_Chatzis1", "gender": "M;M", "homepage": ";https://www.cut.ac.cy/eecei/staff/sotirios.chatzis/", "dblp": "323/9203;25/6133", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.gr/citations?user=__Y_0hQAAAAJ", "orcid": ";", "linkedin": "konstantinos-kalais-025085151/;", "or_profile": "~Konstantinos_\u0399._Kalais1;~Sotirios_Chatzis1", "aff": "Cyprus University of Technology;Cyprus University of Technology", "aff_domain": "cut.ac.cy;cut.ac.cy", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nkalais2022stochastic,\ntitle={Stochastic Deep Networks with Linear Competing Units for Model-Agnostic Meta-Learning},\nauthor={Konstantinos I. 
Kalais and Sotirios Chatzis},\nyear={2022},\nurl={https://openreview.net/forum?id=FFGDKzLasUa}\n}", "github": "", "project": "", "reviewers": "rRh2;Nj3T;3Mcm;czsy", "site": "https://openreview.net/forum?id=FFGDKzLasUa", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;3;3;4", "correctness": "3;2;2;4", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "77;50;64;84", "wc_summary_review": "53;43;141;68", "wc_main_review": "169;194;1335;220", "wc_review": "299;287;1540;372", "wc_reply_reviewers": "0;0;0;261", "wc_reply_authors": "254;286;1042;475", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 12.987975207860538 ], "wc_summary_review_avg": [ 76.25, 38.42769183804825 ], "wc_main_review_avg": [ 479.5, 494.2522129439584 ], "wc_review_avg": [ 624.5, 529.564207627366 ], "wc_reply_reviewers_avg": [ 65.25, 113.01631519386925 ], "wc_reply_authors_avg": [ 514.25, 316.18378753503475 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12812982432289049616&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Cyprus University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.cut.ac.cy", "aff_unique_abbr": "CUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Cyprus" }, { "id": "FFM_oJeqZx", "title": "Adaptive Pseudo-labeling for Quantum Calculations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning models have recently shown promise in predicting molecular quantum chemical properties. However, the path to real-life adoption requires (1) learning under low-resource constraints and (2) out-of-distribution generalization to unseen, structurally diverse molecules. We observe that these two challenges can be alleviated via abundant labels, which is often not the case in quantum chemistry. We hypothesize that pseudo-labels on a vast array of unlabeled molecules can serve as gold-label proxies to greatly expand the labeled training dataset. The challenge in pseudo-labeling is to prevent the bad pseudo-labels from biasing the model. We develop a simple and effective strategy, Pseudo, that can assign pseudo-labels, detect bad pseudo-labels through evidential uncertainty, and then prevent them from biasing the model using adaptive weighting. Empirically, Pseudo improves quantum calculation accuracy across full-data, low-data, and out-of-distribution settings. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f19616197f0a701f494ea3c9fec146a77b885e35.zip", "author": "Kexin Huang;Vishnu Sresht;Brajesh Rai;Mykola Bordyuh", "authorids": "~Kexin_Huang1;vishnu.sresht@pfizer.com;brajesh.rai@pfizer.com;mykola.bordyuh@pfizer.com", "gender": "M;;;", "homepage": "https://www.kexinhuang.com/;;;", "dblp": ";;;", "google_scholar": "ogEXTOgAAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Kexin_Huang1;vishnu.sresht@pfizer.com;brajesh.rai@pfizer.com;mykola.bordyuh@pfizer.com", "aff": "Stanford University;;;", "aff_domain": "stanford.edu;;;", "position": "PhD student;;;", "bibtex": "@misc{\nhuang2022adaptive,\ntitle={Adaptive Pseudo-labeling for Quantum Calculations},\nauthor={Kexin Huang and Vishnu Sresht and Brajesh Rai and Mykola Bordyuh},\nyear={2022},\nurl={https://openreview.net/forum?id=FFM_oJeqZx}\n}", "github": "", "project": "", "reviewers": "ianT;zDAo;5bAg;iJ3j", "site": "https://openreview.net/forum?id=FFM_oJeqZx", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;4;5", "correctness": "3;4;3;4", "technical_novelty": "1;2;4;3", "empirical_novelty": "2;3;4;4", "wc_summary_paper": "250;76;34;243", "wc_summary_review": "78;18;58;104", "wc_main_review": "920;95;382;1484", "wc_review": "1248;189;474;1831", "wc_reply_reviewers": "0;0;0;327", "wc_reply_authors": "1169;438;517;852", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 150.75, 96.92619614944145 ], "wc_summary_review_avg": [ 64.5, 31.412577098990145 ], "wc_main_review_avg": [ 720.25, 531.1696409811087 ], "wc_review_avg": [ 935.5, 646.1077696483768 ], "wc_reply_reviewers_avg": [ 81.75, 141.59515351875572 ], "wc_reply_authors_avg": [ 744.0, 290.4539550427916 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.7608859102526822, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JJKVeiJqakkJ:scholar.google.com/&scioq=Adaptive+Pseudo-labeling+for+Quantum+Calculations&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "FH_mZOKFX-b", "title": "Takeuchi's Information Criteria as Generalization Measures for DNNs Close to NTK Regime", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generalization measures are intensively studied in the machine learning community for better modeling generalization gaps. However, establishing a reliable generalization measure for statistical singular models such as deep neural networks (DNNs) is challenging due to the complex nature of the singular models. \nWe focus on a classical measure called Takeuchi's Information Criteria (TIC) to investigate allowed conditions in which the criteria can well explain generalization gaps caused by DNNs. 
In fact, theory indicates the applicability of TIC near the neural tangent kernel (NTK) regime.\nExperimentally, we trained more than 5,000 DNN models spanning 12 DNN architectures, including large models (e.g., VGG16), and 4 datasets, and estimated the corresponding TICs in order to comprehensively study the relationship between the generalization gap and the TIC estimates. \nWe examine several approximation methods to estimate TIC with feasible computational load and investigate the accuracy trade-off. Experimental results indicate that the estimated TIC correlates well with generalization gaps under conditions close to the NTK regime. Outside the NTK regime, such correlation disappears, as shown theoretically and empirically. We further demonstrate that TIC can yield better trial-pruning ability for hyperparameter optimization than existing methods.", "keywords": "Generalization;correlation;experiments", "primary_area": "", "supplementary_material": "", "author": "Hiroki Naganuma;Taiji Suzuki;Rio Yokota;Masahiro Nomura;Kohta Ishikawa;Ikuro Sato", "authorids": "~Hiroki_Naganuma1;~Taiji_Suzuki1;~Rio_Yokota1;~Masahiro_Nomura1;~Kohta_Ishikawa1;~Ikuro_Sato1", "gender": "M;M;M;M;M;M", "homepage": "https://hiroki11x.github.io/;http://ibis.t.u-tokyo.ac.jp/suzuki/;https://www.rio.scrc.iir.isct.ac.jp/en/index.html;;;", "dblp": "206/0082;08/312;61/7413;10/370;157/8482;68/10406", "google_scholar": "https://scholar.google.co.jp/citations?user=xx7O2voAAAAJ;x8osrBsAAAAJ;klw9KE0AAAAJ;dml10nwAAAAJ;;https://scholar.google.com/citations?hl=ja", "orcid": "0000-0002-4595-8381;;0000-0001-7573-7873;;;", "linkedin": "hiroki11x/;;rio-yokota-62857235/?originalSubdomain=jp;;;", "or_profile": "~Hiroki_Naganuma1;~Taiji_Suzuki1;~Rio_Yokota1;~Masahiro_Nomura1;~Kohta_Ishikawa1;~Ikuro_Sato1", "aff": "LinkedIn;The University of Tokyo;Tokyo Institute of Technology;CyberAgent, Inc.;Denso IT Laboratory, Inc.;Denso IT Laboratory, Inc.", "aff_domain": "linkedin.com;tokyo.ac.jp;titech.ac.jp;cyberagent.co.jp;d-itlab.co.jp;d-itlab.co.jp", "position": "Intern;Associate Professor;Associate Professor;Research scientist;Researcher;Researcher", "bibtex": "@misc{\nnaganuma2022takeuchis,\ntitle={Takeuchi's Information Criteria as Generalization Measures for {DNN}s Close to {NTK} Regime},\nauthor={Hiroki Naganuma and Taiji Suzuki and Rio Yokota and Masahiro Nomura and Kohta Ishikawa and Ikuro Sato},\nyear={2022},\nurl={https://openreview.net/forum?id=FH_mZOKFX-b}\n}", "github": "", "project": "", "reviewers": "Z8n4;2vba;VPJq;fZpy", "site": "https://openreview.net/forum?id=FH_mZOKFX-b", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;3;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "273;117;37;92", "wc_summary_review": "34;106;47;48", "wc_main_review": "1725;681;580;266", "wc_review": "2032;904;664;406", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1556;567;703;788", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 129.75, 87.62241436984033 ], "wc_summary_review_avg": [ 58.75, 27.83320858255476 ], "wc_main_review_avg": [ 813.0, 548.3306484230113 ], "wc_review_avg": [ 1001.5, 620.4762283923535 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 903.5, 
384.8795266054041 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9107132857161323168&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;4", "aff_unique_norm": "LinkedIn Corporation;University of Tokyo;Tokyo Institute of Technology;CyberAgent;Denso IT Laboratory, Inc.", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.linkedin.com;https://www.u-tokyo.ac.jp;https://www.titech.ac.jp;https://www.cyberagent.co.jp;https://www.denso.com", "aff_unique_abbr": "LinkedIn;UTokyo;Titech;CyberAgent;Denso IT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "United States;Japan" }, { "title": "Wish you were here: Hindsight Goal Selection for long-horizon dexterous manipulation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6489", "id": "FKp8-pIRo3y", "poster": "", "openreview": "https://openreview.net/forum?id=FKp8-pIRo3y", "slides": "https://iclr.cc/virtual/2022/poster/6489", "video": "https://iclr.cc/virtual/2022/poster/6489", "author_site": "Todor Davchev, Oleg Sushkov, Jean-Baptiste Regli, Stefan Schaal, Yusuf Aytar, Markus Wulfmeier, Jon Scholz", "tldr": "", "abstract": "Complex sequential tasks in continuous-control settings often require agents to successfully traverse a set of ``narrow passages'' in their state space. Solving such tasks with a sparse reward in a sample-efficient manner poses a challenge to modern reinforcement learning (RL) due to the associated long-horizon nature of the problem and the lack of sufficient positive signal during learning. \nVarious tools have been applied to address this challenge. When available, large sets of demonstrations can guide agent exploration. Hindsight relabelling, on the other hand, does not require additional sources of information. However, existing strategies explore based on task-agnostic goal distributions, which can render the solution of long-horizon tasks impractical. In this work, we extend hindsight relabelling mechanisms to guide exploration along task-specific distributions implied by a small set of successful demonstrations. We evaluate the approach on four complex single- and dual-arm robotic manipulation tasks against strong, suitable baselines. The method requires far fewer demonstrations to solve all tasks and achieves significantly higher overall performance as task complexity increases. 
Finally, we investigate the robustness of the proposed solution with respect to the quality of input representations and the number of demonstrations.", "keywords": "goal-conditioned reinforcement learning;learning from demonstrations;long-horizon dexterous manipulation;bi-manual manipulation", "primary_area": "", "supplementary_material": "", "author": "Todor Davchev;Oleg Olegovich Sushkov;Jean-Baptiste Regli;Stefan Schaal;Yusuf Aytar;Markus Wulfmeier;Jon Scholz", "authorids": "~Todor_Davchev1;~Oleg_Olegovich_Sushkov1;~Jean-Baptiste_Regli1;~Stefan_Schaal1;~Yusuf_Aytar1;~Markus_Wulfmeier1;~Jon_Scholz1", "gender": ";M;Not Specified;M;M;M;M", "homepage": "https://tdavchev.github.io/;;https://github.com/jbregli;http://www-clmc.net;;;https://sites.google.com/site/jonathanscholz/", "dblp": "241/7187;;;32/3952;41/5577;166/1552;", "google_scholar": "h_q7XhoAAAAJ;;;;0ncQNL8AAAAJ;;bwORIKIAAAAJ", "orcid": "0000-0002-0584-5163;;;;;;", "linkedin": ";oleg-sushkov-55a0038;;;;;jonathan-scholz-689aa34/", "or_profile": "~Todor_Davchev1;~Oleg_Olegovich_Sushkov1;~Jean-Baptiste_Regli1;~Stefan_Schaal1;~Yusuf_Aytar1;~Markus_Wulfmeier1;~Jonathan_Scholz2", "aff": "University of Edinburgh;;Google DeepMind;;Google DeepMind;Google DeepMind;Google DeepMind", "aff_domain": "ed.ac.uk;;deepmind.com;;google.com;deepmind.com;deepmind.com", "position": "PhD student;;Researcher;;Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\ndavchev2022wish,\ntitle={Wish you were here: Hindsight Goal Selection for long-horizon dexterous manipulation},\nauthor={Todor Davchev and Oleg Olegovich Sushkov and Jean-Baptiste Regli and Stefan Schaal and Yusuf Aytar and Markus Wulfmeier and Jon Scholz},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FKp8-pIRo3y}\n}", "github": "", "project": "", "reviewers": "jLXD;rSZQ;BCwv;qVKc", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;4;4", "correctness": "4;2;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "117;99;124;112", "wc_summary_review": "25;98;76;17", "wc_main_review": "174;407;477;165", "wc_review": "316;604;677;294", "wc_reply_reviewers": "32;80;30;87", "wc_reply_authors": "484;1528;1365;340", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 113.0, 9.137833441248533 ], "wc_summary_review_avg": [ 54.0, 34.022051672408 ], "wc_main_review_avg": [ 305.75, 138.51601892921988 ], "wc_review_avg": [ 472.75, 169.9019938081952 ], "wc_reply_reviewers_avg": [ 57.25, 26.37588861062315 ], "wc_reply_authors_avg": [ 929.25, 522.9346876044847 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14449678176978256781&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=FKp8-pIRo3y", "email": "ed.ac.uk;;deepmind.com;;google.com;deepmind.com;deepmind.com", "author_num": 7, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "University of Edinburgh;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": 
"https://www.ed.ac.uk;https://deepmind.com", "aff_unique_abbr": "Edinburgh;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6824", "id": "FLA55mBee6Q", "poster": "", "openreview": "https://openreview.net/forum?id=FLA55mBee6Q", "slides": "https://iclr.cc/virtual/2022/poster/6824", "video": "https://iclr.cc/virtual/2022/poster/6824", "author_site": "Jongmin Lee, Cosmin Paduraru, Daniel Mankowitz, Nicolas Heess, Doina Precup, Kee-Eung Kim, Arthur Guez", "tldr": "", "abstract": "We consider the offline constrained reinforcement learning (RL) problem, in which the agent aims to compute a policy that maximizes expected return while satisfying given cost constraints, learning only from a pre-collected dataset. This problem setting is appealing in many real-world scenarios, where direct interaction with the environment is costly or risky, and where the resulting policy should comply with safety constraints. However, it is challenging to compute a policy that guarantees satisfying the cost constraints in the offline RL setting, since the off-policy evaluation inherently has an estimation error. In this paper, we present an offline constrained RL algorithm that optimizes the policy in the space of the stationary distribution. Our algorithm, COptiDICE, directly estimates the stationary distribution corrections of the optimal policy with respect to returns, while constraining the cost upper bound, with the goal of yielding a cost-conservative policy for actual constraint satisfaction. 
Experimental results show that COptiDICE attains better policies in terms of constraint satisfaction and return-maximization, outperforming baseline algorithms.", "keywords": "Offline Reinforcement Learning;Offline Constrained Reinforcement Learning;Stationary Distribution Correction Estimation", "primary_area": "", "supplementary_material": "", "author": "Jongmin Lee;Cosmin Paduraru;Daniel J Mankowitz;Nicolas Heess;Doina Precup;Kee-Eung Kim;Arthur Guez", "authorids": "~Jongmin_Lee1;~Cosmin_Paduraru1;~Daniel_J_Mankowitz2;~Nicolas_Heess1;~Doina_Precup1;~Kee-Eung_Kim2;~Arthur_Guez1", "gender": "M;M;;F;M;M;M", "homepage": "https://www.jmlee.kr;https://sites.google.com/site/cosminpaduraru/;;http://cs.mcgill.ca/~dprecup/;http://ailab.kaist.ac.kr;https://www.gatsby.ucl.ac.uk/~aguez/;https://danielmankowitz.wixsite.com/danielm", "dblp": "68/222-4.html;http://dblp.uni-trier.de/pers/hd/p/Paduraru:Cosmin;76/9181;p/DoinaPrecup;35/6703;;", "google_scholar": "https://scholar.google.co.kr/citations?user=rFcK8EEAAAAJ;y_wvIywAAAAJ;79k7bGEAAAAJ;https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ;https://scholar.google.com/citations?hl=ko;https://scholar.google.co.uk/citations?user=iyD9aw8AAAAJ;v84tWxsAAAAJ", "orcid": ";;;;;;", "linkedin": "jmlee123/;;;;;;", "or_profile": "~Jongmin_Lee1;~Cosmin_Paduraru1;~Nicolas_Heess1;~Doina_Precup1;~Kee-Eung_Kim2;~Arthur_Guez1;~Daniel_J_Mankowitz1", "aff": "Korea Advanced Institute of Science & Technology;Google DeepMind;Google DeepMind;McGill University;Korea Advanced Institute of Science & Technology;Google DeepMind;Google DeepMind", "aff_domain": "kaist.ac.kr;google.com;google.com;mcgill.ca;kaist.ac.kr;google.com;deepmind.com", "position": "PhD student;Research Engineer;Research Scientist;Associate Professor;Full Professor;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nlee2022coptidice,\ntitle={{CO}pti{DICE}: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation},\nauthor={Jongmin Lee and Cosmin Paduraru and Daniel J Mankowitz and Nicolas Heess and Doina Precup and Kee-Eung Kim and Arthur Guez},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FLA55mBee6Q}\n}", "github": "", "project": "", "reviewers": "qywn;QeFP;jgQ7;NsmH", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;2", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "51;84;86;102", "wc_summary_review": "50;74;54;25", "wc_main_review": "265;381;141;195", "wc_review": "366;539;281;322", "wc_reply_reviewers": "136;0;11;14", "wc_reply_authors": "435;523;268;123", "reply_reviewers": "2;0;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.75, 18.538810641462412 ], "wc_summary_review_avg": [ 50.75, 17.426631917843448 ], "wc_main_review_avg": [ 245.5, 89.73711606687614 ], "wc_review_avg": [ 377.0, 98.24204802425487 ], "wc_reply_reviewers_avg": [ 40.25, 55.526457657588786 ], "wc_reply_authors_avg": [ 337.25, 153.91292180970382 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 
66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10582988785015243548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=FLA55mBee6Q", "email": "kaist.ac.kr;google.com;google.com;mcgill.ca;kaist.ac.kr;google.com;deepmind.com", "author_num": 7, "aff_unique_index": "0;1;1;2;0;1;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Google;McGill University", "aff_unique_dep": ";Google DeepMind;", "aff_unique_url": "https://www.kaist.ac.kr;https://deepmind.com;https://www.mcgill.ca", "aff_unique_abbr": "KAIST;DeepMind;McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2;0;1;1", "aff_country_unique": "South Korea;United Kingdom;Canada" }, { "id": "FLa1RPjpm2L", "title": "ED2: An Environment Dynamics Decomposition Framework for World Model Construction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model-based reinforcement learning methods achieve significant sample efficiency in many tasks, but their performance is often limited by the existence of the model error. To reduce the model error, previous works use a single well-designed network to fit the entire environment dynamics, which treats the environment dynamics as a black box. However, these methods lack to consider the environmental decomposed property that the dynamics may contain multiple sub-dynamics, which can be modeled separately, allowing us to construct the world model more accurately. In this paper, we propose the Environment Dynamics Decomposition (ED2), a novel world model construction framework that models the environment in a decomposing manner. ED2 contains two key components: sub-dynamics discovery (SD2) and dynamics decomposition prediction (D2P). SD2 discovers the sub-dynamics in an environment and then D2P constructs the decomposed world model following the sub-dynamics. 
ED2 can be easily combined with existing MBRL algorithms and empirical results show that ED2 significantly reduces the model error and boosts the performance of the state-of-the-art MBRL algorithms on various continuous control tasks.", "keywords": "model based reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/d27d968b39b8ea5f798252a438f8de15bf8fad80.zip", "author": "Cong Wang;Tianpei Yang;Jianye HAO;YAN ZHENG;Hongyao Tang;Fazl Barez;Jinyi Liu;Jiajie Peng;haiyin piao;Zhixiao Sun", "authorids": "~Cong_Wang7;~Tianpei_Yang1;~Jianye_HAO1;~YAN_ZHENG1;~Hongyao_Tang1;~Fazl_Barez1;~Jinyi_Liu1;~Jiajie_Peng1;~haiyin_piao1;~Zhixiao_Sun1", "gender": ";F;M;M;M;;;M;M;", "homepage": "https://wangdachui627.github.io/;https://tianpeiyang.github.io/;http://www.icdai.org/jianye.html;https://yanzzzzz.github.io;https://bluecontra.github.io/;;;;https://www.researchgate.net/profile/Haiyin-Piao;https://szx-github.github.io/", "dblp": ";184/8221;21/7664.html;10/2381-2;220/4275;;192/6688-2;130/7286;269/4228.html;", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;yIqzRH4AAAAJ;;kaQS7NAAAAAJ;;;", "orcid": ";0000-0002-5497-7146;0000-0002-0422-8235;;;;;;;", "linkedin": ";tianpei-yang/;;;;;\u91d1\u6bc5-\u5218-5b7447118;;;", "or_profile": "~Cong_Wang7;~Tianpei_Yang1;~Jianye_HAO1;~YAN_ZHENG1;~Hongyao_Tang1;~Fazl_Barez1;~Jinyi_Liu1;~Jiajie_Peng1;~haiyin_piao1;~Zhixiao_Sun1", "aff": "Tianjin University;University of Alberta;Tianjin University;Tianjin Unibersity, China;College of Intelligence and Computing, Tianjin University;;Tianjin University;;Northwestern Polytechnical University;Northwestern Polytechnical University", "aff_domain": "tju.edu.cn;ualberta.ca;tju.edu.cn;tju.edu.cn;tju.edu.cn;;tju.edu.cn;;nwpu.edu.cn;nwpu.edu.cn", "position": "MS student;Postdoc;Associate Professor;Associate Professor;PhD student;;MS student;;PhD student;PhD student", "bibtex": "@misc{\nwang2022ed,\ntitle={{ED}2: An Environment Dynamics Decomposition Framework for World Model Construction},\nauthor={Cong Wang and Tianpei Yang and Jianye HAO and YAN ZHENG and Hongyao Tang and Fazl Barez and Jinyi Liu and Jiajie Peng and haiyin piao and Zhixiao Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=FLa1RPjpm2L}\n}", "github": "", "project": "", "reviewers": "SSLz;6o3p;BTNs", "site": "https://openreview.net/forum?id=FLa1RPjpm2L", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;5;3", "correctness": "4;3;2", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "104;45;37", "wc_summary_review": "55;17;99", "wc_main_review": "288;215;417", "wc_review": "447;277;553", "wc_reply_reviewers": "0;58;559", "wc_reply_authors": "1320;470;776", "reply_reviewers": "0;1;3", "reply_authors": "2;2;3", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 62.0, 29.87752778706208 ], "wc_summary_review_avg": [ 57.0, 33.50621832834417 ], "wc_main_review_avg": [ 306.6666666666667, 83.51580023497881 ], "wc_review_avg": [ 425.6666666666667, 113.68181717212106 ], "wc_reply_reviewers_avg": [ 205.66666666666666, 250.96391949618754 ], "wc_reply_authors_avg": [ 855.3333333333334, 351.516081503472 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 
2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17566679594458221991&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0;0;0;0;2;2", "aff_unique_norm": "Tianjin University;University of Alberta;Northwestern Polytechnical University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.tju.edu.cn;https://www.ualberta.ca;https://www.nwpu.edu.cn", "aff_unique_abbr": "TJU;UAlberta;NWPU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0;0", "aff_country_unique": "China;Canada" }, { "id": "FNSR8Okx8a", "title": "Beyond Prioritized Replay: Sampling States in Model-Based Reinforcement Learning via Simulated Priorities", "track": "main", "status": "Reject", "tldr": "", "abstract": "Prioritized Experience Replay (ER) has been empirically shown to improve sample efficiency across many domains and has attracted great attention; however, there is little theoretical understanding of why such prioritized sampling helps and of its limitations. In this work, we take a deep look at prioritized ER. In a supervised learning setting, we show the equivalence between the error-based prioritized sampling method for mean squared error and uniform sampling for cubic power loss. We then provide theoretical insight into why it improves the convergence rate over uniform sampling during early learning. Based on this insight, we further point out two limitations of the prioritized ER method: 1) outdated priorities and 2) insufficient coverage of the sample space. To mitigate the limitations, we propose our model-based stochastic gradient Langevin dynamics sampling method. We show that our method does provide states distributed close to an ideal prioritized sampling distribution estimated by the brute-force method, which does not suffer from the two limitations. We conduct experiments on both discrete and continuous control problems to show our approach's efficacy and examine the practical implications of our method in an autonomous driving application. 
", "keywords": "experience replay;model-based reinforcement learning;sampling distribution;search-control;Dyna;stochastic gradient Langevin dynamics", "primary_area": "", "supplementary_material": "", "author": "Yangchen Pan;Jincheng Mei;Amir-massoud Farahmand;Martha White;Hengshuai Yao;Mohsen Rohani;Jun Luo", "authorids": "~Yangchen_Pan2;~Jincheng_Mei1;~Amir-massoud_Farahmand1;~Martha_White1;~Hengshuai_Yao2;~Mohsen_Rohani1;~Jun_Luo1", "gender": "M;M;M;F;M;;M", "homepage": "https://yannickycpan.github.io/yangchenpan/;https://jinchengmei.github.io;http://academic.sologen.net/;http://marthawhite.ca;;;https://hengshuaiyao.github.io/", "dblp": "183/0925;149/1408;17/671;60/7057;123/7116;42/2501;25/4960", "google_scholar": "4M4pOp4AAAAJ;;https://scholar.google.ca/citations?user=G5SAV7gAAAAJ;t5zdD_IAAAAJ;;;R_wcnUgAAAAJ", "orcid": ";;;0000-0002-5356-2950;;;", "linkedin": ";;amir-massoud-farahmand/;;;;", "or_profile": "~Yangchen_Pan2;~Jincheng_Mei1;~Amir-massoud_Farahmand1;~Martha_White1;~Mohsen_Rohani1;~Jun_Luo1;~hengshuai_yao1", "aff": "Huawei Technologies Ltd.;Google DeepMind;Vector Institute;University of Alberta;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.", "aff_domain": "huawei.com;google.com;vectorinstitute.ai;ualberta.ca;huawei.com;huawei.com;huawei.com", "position": "Researcher;Research Scientist;Faculty Member;Associate Professor;ML Researcher;Researcher;Principal Researcher", "bibtex": "@misc{\npan2022beyond,\ntitle={Beyond Prioritized Replay: Sampling States in Model-Based Reinforcement Learning via Simulated Priorities},\nauthor={Yangchen Pan and Jincheng Mei and Amir-massoud Farahmand and Martha White and Hengshuai Yao and Mohsen Rohani and Jun Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=FNSR8Okx8a}\n}", "github": "", "project": "", "reviewers": "VdDV;DyBg;xiwV;iRvC", "site": "https://openreview.net/forum?id=FNSR8Okx8a", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "70;37;89;111", "wc_summary_review": "120;49;43;84", "wc_main_review": "1639;413;289;556", "wc_review": "1829;499;421;751", "wc_reply_reviewers": "207;220;0;114", "wc_reply_authors": "747;576;660;623", "reply_reviewers": "1;1;0;1", "reply_authors": "1;2;1;2", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.75, 27.151197027018902 ], "wc_summary_review_avg": [ 74.0, 30.83018001893599 ], "wc_main_review_avg": [ 724.25, 536.5153189798032 ], "wc_review_avg": [ 875.0, 564.1329630503787 ], "wc_reply_reviewers_avg": [ 135.25, 88.14015827079051 ], "wc_reply_authors_avg": [ 651.5, 62.659795722616266 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5jhat3qxnZAJ:scholar.google.com/&scioq=Beyond+Prioritized+Replay:+Sampling+States+in+Model-Based+Reinforcement+Learning+via+Simulated+Priorities&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0;0;0", "aff_unique_norm": "Huawei;Google;Vector Institute;University of Alberta", 
"aff_unique_dep": "Huawei Technologies;Google DeepMind;;", "aff_unique_url": "https://www.huawei.com;https://deepmind.com;https://vectorinstitute.ai/;https://www.ualberta.ca", "aff_unique_abbr": "Huawei;DeepMind;Vector Institute;UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;0;0;0", "aff_country_unique": "China;United Kingdom;Canada" }, { "id": "FOfKpDnp2P", "title": "BIGRoC: Boosting Image Generation via a Robust Classifier", "track": "main", "status": "Reject", "tldr": "", "abstract": "The interest of the machine learning community in image synthesis has grown significantly in recent years, with the introduction of a wide range of deep generative models and means for training them. Such machines\u2019 ultimate goal is to match the distributions of the given training images and the synthesized ones. In this work, we propose a general model-agnostic technique for improving the image quality and the distribution fidelity of generated images, obtained by any generative model. Our method, termed BIGRoC (boosting image generation via a robust classifier), is based on a post-processing procedure via the guidance of a given robust classifier and without a need for additional training of the generative model. Given a synthesized image, we propose to update it through projected gradient steps over the robust classifier, in an attempt to refine its recognition. We demonstrate this post-processing algorithm on various image synthesis methods and show a significant improvement of the generated images, both quantitatively and qualitatively.", "keywords": "Image Generation;Adversarial Robustness;Perceptually Aligned Gradients", "primary_area": "", "supplementary_material": "/attachment/2cdab8131c655b4d8a60801ea85f31f3392218ec.zip", "author": "Roy Ganz;Michael Elad", "authorids": "~Roy_Ganz1;~Michael_Elad1", "gender": "M;M", "homepage": "https://royg27.github.io/;https://elad.cs.technion.ac.il/", "dblp": "289/5822;e/MichaelElad", "google_scholar": "2E0FHMoAAAAJ;UpZbV44AAAAJ", "orcid": ";0000-0001-8131-6928", "linkedin": "roy-ganz-270592/;michael-elad-5553852a3/", "or_profile": "~Roy_Ganz1;~Michael_Elad1", "aff": "Technion, Technion;Verily", "aff_domain": "technion.ac.il;verily.com", "position": "MS student;Principal Researcher", "bibtex": "@misc{\nganz2022bigroc,\ntitle={{BIGR}oC: Boosting Image Generation via a Robust Classifier},\nauthor={Roy Ganz and Michael Elad},\nyear={2022},\nurl={https://openreview.net/forum?id=FOfKpDnp2P}\n}", "github": "", "project": "", "reviewers": "Ezr2;jEYy;v7Qv", "site": "https://openreview.net/forum?id=FOfKpDnp2P", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;3;3", "correctness": "3;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "119;80;86", "wc_summary_review": "46;65;27", "wc_main_review": "283;230;225", "wc_review": "448;375;338", "wc_reply_reviewers": "224;91;48", "wc_reply_authors": "1355;495;681", "reply_reviewers": "2;1;1", "reply_authors": "3;2;2", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 95.0, 17.146428199482248 ], "wc_summary_review_avg": [ 46.0, 15.513435037626794 ], "wc_main_review_avg": [ 246.0, 26.242459234352765 ], "wc_review_avg": [ 387.0, 45.70193285482209 ], "wc_reply_reviewers_avg": [ 121.0, 
74.9177326583411 ], "wc_reply_authors_avg": [ 843.6666666666666, 369.4548536545283 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=128128249237651060&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Technion - Israel Institute of Technology;Verily", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.verily.com", "aff_unique_abbr": "Technion;Verily", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Israel;United States" }, { "title": "Domino: Discovering Systematic Errors with Cross-Modal Embeddings", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6148", "id": "FPCMqjI0jXN", "poster": "", "openreview": "https://openreview.net/forum?id=FPCMqjI0jXN", "slides": "https://iclr.cc/virtual/2022/poster/6148", "video": "https://iclr.cc/virtual/2022/poster/6148", "author_site": "Sabri Eyuboglu, Maya Varma, Khaled Saab, Jean-Benoit Delbrouck, Christopher Lee-Messer, Jared Dunnmon, James Y Zou, Christopher Re", "tldr": "", "abstract": "Machine learning models that achieve high overall accuracy often make systematic errors on important subsets (or slices) of data. Identifying underperforming slices is particularly challenging when working with high-dimensional inputs (e.g. images, audio), where important slices are often unlabeled. In order to address this issue, recent studies have proposed automated slice discovery methods (SDMs), which leverage learned model representations to mine input data for slices on which a model performs poorly. To be useful to a practitioner, these methods must identify slices that are both underperforming and coherent (i.e. united by a human-understandable concept). However, no quantitative evaluation framework currently exists for rigorously assessing SDMs with respect to these criteria. Additionally, prior qualitative evaluations have shown that SDMs often identify slices that are incoherent. In this work, we address these challenges by first designing a principled evaluation framework that enables a quantitative comparison of SDMs across 1,235 slice discovery settings in three input domains (natural images, medical images, and time-series data).\nThen, motivated by the recent development of powerful cross-modal representation learning approaches, we present Domino, an SDM that leverages cross-modal embeddings and a novel error-aware mixture model to discover and describe coherent slices. We find that Domino accurately identifies 36% of the 1,235 slices in our framework -- a 12 percentage point improvement over prior methods. Further, Domino is the first SDM that can provide natural language descriptions of identified slices, correctly generating the exact name of the slice in 35% of settings. 
", "keywords": "robustness;subgroup analysis;error analysis;multimodal;slice discovery", "primary_area": "", "supplementary_material": "", "author": "Sabri Eyuboglu;Maya Varma;Khaled Kamal Saab;Jean-Benoit Delbrouck;Christopher Lee-Messer;Jared Dunnmon;James Zou;Christopher Re", "authorids": "~Sabri_Eyuboglu1;~Maya_Varma1;~Khaled_Kamal_Saab1;~Jean-Benoit_Delbrouck1;~Christopher_Lee-Messer1;~Jared_Dunnmon1;~James_Zou1;~Christopher_Re1", "gender": ";;;;M;M;;", "homepage": "http://www.sabrieyuboglu.com/;https://maya-varma.com/;https://web.stanford.edu/~ksaab/;;;;;", "dblp": "298/7563;233/4077;176/4061;;;200/8265;;", "google_scholar": ";;W77CiNUAAAAJ;;zEAw56MAAAAJ;;23ZXZvEAAAAJ;", "orcid": ";;0000-0003-1427-0469;;0000-0002-2938-6184;;;", "linkedin": ";;khaled-saab-181034122/;;;;;", "or_profile": "~Sabri_Eyuboglu1;~Maya_Varma1;~Khaled_Kamal_Saab1;~Jean-Benoit_Delbrouck1;~Christopher_Lee-Messer1;~Jared_Dunnmon1;~James_Zou1;~Christopher_Re1", "aff": "Stanford University;Stanford University;Stanford University;;;;Stanford University;", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;;;;stanford.edu;", "position": "PhD student;PhD student;PhD student;;;;Assistant Professor;", "bibtex": "@inproceedings{\neyuboglu2022domino,\ntitle={Domino: Discovering Systematic Errors with Cross-Modal Embeddings},\nauthor={Sabri Eyuboglu and Maya Varma and Khaled Kamal Saab and Jean-Benoit Delbrouck and Christopher Lee-Messer and Jared Dunnmon and James Zou and Christopher Re},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FPCMqjI0jXN}\n}", "github": "", "project": "", "reviewers": "hQrb;LuSN;si46", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;2;2", "correctness": "2;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "73;28;360", "wc_summary_review": "27;53;17", "wc_main_review": "259;220;110", "wc_review": "359;301;487", "wc_reply_reviewers": "0;0;10", "wc_reply_authors": "1172;1172;445", "reply_reviewers": "0;0;1", "reply_authors": "3;2;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 153.66666666666666, 147.051767150967 ], "wc_summary_review_avg": [ 32.333333333333336, 15.173075568988056 ], "wc_main_review_avg": [ 196.33333333333334, 63.08900238728001 ], "wc_review_avg": [ 382.3333333333333, 77.70599862444483 ], "wc_reply_reviewers_avg": [ 3.3333333333333335, 4.714045207910316 ], "wc_reply_authors_avg": [ 929.6666666666666, 342.71108661508003 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3701715996616106211&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=FPCMqjI0jXN", "email": "stanford.edu;stanford.edu;stanford.edu;;;;stanford.edu;", "author_num": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", 
"aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "FPGs276lUeq", "title": "Palette: Image-to-Image Diffusion Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We introduce Palette, a simple and general framework for image-to-image translation using conditional diffusion models. On four challenging image-to-image translation tasks (colorization, inpainting, uncropping, and JPEG decompression), Palette outperforms strong GAN and regression baselines, and establishes a new state-of-the-art result. This is accomplished without task-specific hyper-parameter tuning, architecture customization, or any auxiliary loss demonstrating a desirable degree of generality and flexibility. We uncover the impact of using L2 vs. L1 loss in the denoising diffusion objective on sample diversity, and demonstrate the importance of self-attention through empirical architecture studies. Importantly, we advocate a unified evaluation protocol based on ImageNet, and report several sample quality scores including FID, Inception Score, Classification Accuracy of a pre-trained ResNet-50, and Perceptual Distance against reference images for various baselines. We expect this standardized evaluation protocol to play a critical role in advancing image-to-image translation research. Finally, we show that a single generalist Palette model trained on 3 tasks (colorization, inpainting, JPEG decompression) performs as well or better than task-specific specialist counterparts.", "keywords": "machine learning;artificial intelligence;computer vision", "primary_area": "", "supplementary_material": "", "author": "Chitwan Saharia;William Chan;Huiwen Chang;Chris A. Lee;Jonathan Ho;Tim Salimans;David J. Fleet;Mohammad Norouzi", "authorids": "~Chitwan_Saharia1;~William_Chan1;~Huiwen_Chang2;~Chris_A._Lee1;~Jonathan_Ho1;~Tim_Salimans1;~David_J._Fleet1;~Mohammad_Norouzi1", "gender": "M;;F;;;M;M;M", "homepage": "https://www.chitwansaharia.github.io;http://williamchan.ca;;;;;http://www.cs.toronto.edu/~fleet/index.html;https://norouzi.github.io/", "dblp": "228/8172;58/2301;131/4389;;80/8677;116/2791;07/2099;https://dblp.org/pers/hd/n/Norouzi_0002:Mohammad", "google_scholar": ";Nla9qfUAAAAJ;eZQNcvcAAAAJ;jbK_A2sAAAAJ;iVLAQysAAAAJ;;https://scholar.google.com.tw/citations?user=njOmQFsAAAAJ;Lncr-VoAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Chitwan_Saharia1;~William_Chan1;~Huiwen_Chang2;~Chris_A._Lee1;~Jonathan_Ho1;~Tim_Salimans1;~David_J._Fleet1;~Mohammad_Norouzi1", "aff": "Google;Google Brain;Research, Google;;Google;Google;Department of Computer Science, University of Toronto;Google Brain", "aff_domain": "google.com;google.com;research.google.com;;google.com;google.com;cs.toronto.edu;google.com", "position": "AI Resident;Research Scientist;Researcher;;Researcher;Research Scientist;Full Professor;Research Scientist", "bibtex": "@misc{\nsaharia2022palette,\ntitle={Palette: Image-to-Image Diffusion Models},\nauthor={Chitwan Saharia and William Chan and Huiwen Chang and Chris A. Lee and Jonathan Ho and Tim Salimans and David J. 
Fleet and Mohammad Norouzi},\nyear={2022},\nurl={https://openreview.net/forum?id=FPGs276lUeq}\n}", "github": "", "project": "", "reviewers": "sFWV;TLGP;1nqv;tr4L", "site": "https://openreview.net/forum?id=FPGs276lUeq", "pdf_size": 0, "recommendation": "3;3;3;10", "confidence": "4;4;3;4", "correctness": "1;3;4;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "60;109;37;58", "wc_summary_review": "44;45;130;11", "wc_main_review": "171;255;162;470", "wc_review": "275;409;329;539", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "87;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "1;0;0;0", "recommendation_avg": [ 4.75, 3.031088913245535 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 66.0, 26.410225292488512 ], "wc_summary_review_avg": [ 57.5, 44.03691633164157 ], "wc_main_review_avg": [ 264.5, 124.06550689051329 ], "wc_review_avg": [ 388.0, 99.36297097007517 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 21.75, 37.67210506462308 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 1667, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7120993259580259431&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Google;University of Toronto", "aff_unique_dep": "Google;Department of Computer Science", "aff_unique_url": "https://www.google.com;https://www.utoronto.ca", "aff_unique_abbr": "Google;U of T", "aff_campus_unique_index": "0;0;0;0;0;1;0", "aff_campus_unique": "Mountain View;Toronto", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Handling Distribution Shifts on Graphs: An Invariance Perspective", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7180", "id": "FQOC5u-1egI", "poster": "", "openreview": "https://openreview.net/forum?id=FQOC5u-1egI", "slides": "https://iclr.cc/virtual/2022/poster/7180", "video": "https://iclr.cc/virtual/2022/poster/7180", "author_site": "Qitian Wu, Hengrui Zhang, Junchi Yan, David Wipf", "tldr": "", "abstract": "There is increasing evidence of neural networks' sensitivity to distribution shifts, which has brought research on out-of-distribution (OOD) generalization into the spotlight. Nonetheless, current endeavors mostly focus on Euclidean data, and its formulation for graph-structured data is not clear and remains under-explored, given two-fold fundamental challenges: 1) the inter-connection among nodes in one graph, which induces non-IID generation of data points even under the same environment, and 2) the structural information in the input graph, which is also informative for prediction. In this paper, we formulate the OOD problem on graphs and develop a new invariant learning approach, Explore-to-Extrapolate Risk Minimization (EERM), that facilitates graph neural networks to leverage invariance principles for prediction. EERM resorts to multiple context explorers (specified as graph structure editors in our case) that are adversarially trained to maximize the variance of risks from multiple virtual environments. 
Such a design enables the model to extrapolate from a single observed environment which is the common case for node-level prediction. We prove the validity of our method by theoretically showing its guarantee of a valid OOD solution and further demonstrate its power on various real-world datasets for handling distribution shifts from artificial spurious features, cross-domain transfers and dynamic graph evolution.", "keywords": "Representation Learning on Graphs;Out-of-Distribution Generalization;Domain Shift;Graph Structure Learning;Invariant Models", "primary_area": "", "supplementary_material": "", "author": "Qitian Wu;Hengrui Zhang;Junchi Yan;David Wipf", "authorids": "~Qitian_Wu1;~Hengrui_Zhang1;~Junchi_Yan2;~David_Wipf1", "gender": ";M;;M", "homepage": ";https://hengruizhang98.github.io;;http://www.davidwipf.com/", "dblp": ";;;81/6421", "google_scholar": ";iwffiD0AAAAJ;;YJx1WSgAAAAJ", "orcid": ";0009-0006-1330-0899;;", "linkedin": ";;;", "or_profile": "~Qitian_Wu1;~Hengrui_Zhang1;~Junchi_Yan2;~David_Wipf1", "aff": ";University of Illinois, Chicago;;Amazon AI Research Lab", "aff_domain": ";uic.edu;;amazon.com", "position": ";PhD student;;Principal Research Scientist", "bibtex": "@inproceedings{\nwu2022handling,\ntitle={Handling Distribution Shifts on Graphs: An Invariance Perspective},\nauthor={Qitian Wu and Hengrui Zhang and Junchi Yan and David Wipf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FQOC5u-1egI}\n}", "github": "", "project": "", "reviewers": "DyjE;SVEc;xk81;Wo9P", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;2;4;3", "correctness": "3;4;4;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "107;163;61;63", "wc_summary_review": "42;7;76;68", "wc_main_review": "276;228;669;199", "wc_review": "425;398;806;330", "wc_reply_reviewers": "33;69;37;45", "wc_reply_authors": "1588;1032;1747;1059", "reply_reviewers": "1;1;1;1", "reply_authors": "3;3;3;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 98.5, 41.53010955921017 ], "wc_summary_review_avg": [ 48.25, 26.929305598176867 ], "wc_main_review_avg": [ 343.0, 190.21435277076228 ], "wc_review_avg": [ 489.75, 185.83914415429274 ], "wc_reply_reviewers_avg": [ 46.0, 13.96424004376894 ], "wc_reply_authors_avg": [ 1356.5, 316.1838863699414 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 245, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15550862662340330123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=FQOC5u-1egI", "email": ";uic.edu;;amazon.com", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Illinois at Chicago;Amazon", "aff_unique_dep": ";Amazon AI Research Lab", "aff_unique_url": "https://www.uic.edu;https://www.amazon.com", "aff_unique_abbr": "UIC;Amazon AI", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "FRct9agbco", "title": "Constrained Mean Shift for Representation Learning", "track": "main", 
"status": "Withdraw", "tldr": "", "abstract": "We are interested in representation learning from labeled or unlabeled data. Inspired by the recent success of self-supervised learning (SSL), we develop a non-contrastive representation learning method that can exploit additional knowledge. This additional knowledge may come from annotated labels in the supervised setting or an SSL model from another modality in the SSL setting. Our main idea is to generalize the mean-shift algorithm by constraining the search space of nearest neighbors, resulting in semantically purer representations.\nOur method simply pulls the embedding of an instance closer to its nearest neighbors in a search space that is constrained using the additional knowledge. By leveraging this non-contrastive loss, we show that the supervised ImageNet-1k pretraining with our method results in better transfer performance as compared to the baselines. Further, we demonstrate that our method is relatively robust to label noise. Finally, we show that it is possible to use the noisy constraint across modalities to train self-supervised video models.", "keywords": "Computer Vision;Self-Supervised Learning;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/48deb11c6b68a2ffa36d05b620422874a450dcd0.zip", "author": "Ajinkya Tejankar;Soroush Abbasi Koohpayegani;Hamed Pirsiavash", "authorids": "~Ajinkya_Tejankar1;~Soroush_Abbasi_Koohpayegani1;~Hamed_Pirsiavash1", "gender": "M;M;M", "homepage": "https://ajtejankar.github.io;http://soroush-abbasi.github.io;https://web.cs.ucdavis.edu/~hpirsiav/", "dblp": "255/5662;277/5486;07/6340", "google_scholar": "zt4D3G4AAAAJ;JS10DM0AAAAJ;https://scholar.google.com.tw/citations?user=c9XXy4MAAAAJ", "orcid": ";;", "linkedin": "ajinkya-tejankar-79854445/;;hpirsiav/", "or_profile": "~Ajinkya_Tejankar1;~Soroush_Abbasi_Koohpayegani1;~Hamed_Pirsiavash1", "aff": "University of Maryland, Baltimore County;University of Maryland, Baltimore County;University of California, Davis", "aff_domain": "umbc.edu;umbc.edu;ucdavis.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\ntejankar2022constrained,\ntitle={Constrained Mean Shift for Representation Learning},\nauthor={Ajinkya Tejankar and Soroush Abbasi Koohpayegani and Hamed Pirsiavash},\nyear={2022},\nurl={https://openreview.net/forum?id=FRct9agbco}\n}", "github": "", "project": "", "reviewers": "UJEx;1Tnt;HVeh;SGaC", "site": "https://openreview.net/forum?id=FRct9agbco", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "215;83;116;89", "wc_summary_review": "34;106;49;80", "wc_main_review": "346;287;169;637", "wc_review": "595;476;334;806", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 125.75, 53.00648545225386 ], "wc_summary_review_avg": [ 67.25, 27.851166941440713 ], "wc_main_review_avg": [ 359.75, 172.2895455330938 ], "wc_review_avg": [ 552.75, 172.961520287028 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3612147122512044460&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Maryland, Baltimore County;University of California, Davis", "aff_unique_dep": ";", "aff_unique_url": "https://www.umbc.edu;https://www.ucdavis.edu", "aff_unique_abbr": "UMBC;UC Davis", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Baltimore County;Davis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Amortized Tree Generation for Bottom-up Synthesis Planning and Synthesizable Molecular Design", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6883", "id": "FRxhHdnxt1", "poster": "", "openreview": "https://openreview.net/forum?id=FRxhHdnxt1", "slides": "https://iclr.cc/virtual/2022/poster/6883", "video": "https://iclr.cc/virtual/2022/poster/6883", "author_site": "Wenhao Gao, Roc\u00edo Mercado, Connor Coley", "tldr": "", "abstract": "Molecular design and synthesis planning are two critical steps in the process of molecular discovery that we propose to formulate as a single shared task of conditional synthetic pathway generation. We report an amortized approach to generate synthetic pathways as a Markov decision process conditioned on a target molecular embedding. This approach allows us to conduct synthesis planning in a bottom-up manner and design synthesizable molecules by decoding from optimized conditional codes, demonstrating the potential to solve both problems of design and synthesis simultaneously. The approach leverages neural networks to probabilistically model the synthetic trees, one reaction step at a time, according to reactivity rules encoded in a discrete action space of reaction templates. We train these networks on hundreds of thousands of artificial pathways generated from a pool of purchasable compounds and a list of expert-curated templates. We validate our method with (a) the recovery of molecules using conditional generation, (b) the identification of synthesizable structural analogs, and (c) the optimization of molecular structures given oracle functions relevant to bioactivity and drug discovery.", "keywords": "molecular design;synthesis planning;tree generation;graph generation", "primary_area": "", "supplementary_material": "/attachment/1eae0d5d5f7f74a3ff02aeb14e519820d38a5072.zip", "author": "Wenhao Gao;Roc\u00edo Mercado;Connor W. Coley", "authorids": "~Wenhao_Gao1;~Roc\u00edo_Mercado1;~Connor_W._Coley1", "gender": "M;F;M", "homepage": "https://wenhao-gao.github.io;https://rociomer.github.io/;https://coley.mit.edu", "dblp": "177/0968;;206/6284", "google_scholar": "s4eywrUAAAAJ;v2P0-IoAAAAJ;l015S80AAAAJ", "orcid": "0000-0002-6506-8044;0000-0002-6170-6088;0000-0002-8271-8723", "linkedin": ";rociomer/;", "or_profile": "~Wenhao_Gao1;~Roc\u00edo_Mercado1;~Connor_Coley1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\ngao2022amortized,\ntitle={Amortized Tree Generation for Bottom-up Synthesis Planning and Synthesizable Molecular Design},\nauthor={Wenhao Gao and Roc{\\'\\i}o Mercado and Connor W. 
Coley},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FRxhHdnxt1}\n}", "github": "", "project": "", "reviewers": "JJJC;1rvZ;GkiB;vmGs", "pdf_size": 0, "recommendation": "3;8;8;8", "confidence": "4;3;4;5", "correctness": "2;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "153;81;103;29", "wc_summary_review": "92;12;98;71", "wc_main_review": "233;166;698;897", "wc_review": "478;259;899;997", "wc_reply_reviewers": "841;0;63;26", "wc_reply_authors": "1643;598;552;663", "reply_reviewers": "5;0;1;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.75, 2.165063509461097 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 91.5, 44.52808102759426 ], "wc_summary_review_avg": [ 68.25, 33.98804937033015 ], "wc_main_review_avg": [ 498.5, 308.0783179647669 ], "wc_review_avg": [ 658.25, 301.9117213690121 ], "wc_reply_reviewers_avg": [ 232.5, 352.03018336500634 ], "wc_reply_authors_avg": [ 864.0, 451.48145033877086 ], "reply_reviewers_avg": [ 1.75, 1.920286436967152 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15831555537133301162&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=FRxhHdnxt1", "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "FS0XKbpkdOu", "title": "Sphere2Vec: Self-Supervised Location Representation Learning on Spherical Surfaces", "track": "main", "status": "Reject", "tldr": "", "abstract": "Location encoding is valuable for a multitude of tasks where both the absolute positions and local contexts (image, text, and other types of metadata) of spatial objects are needed for accurate predictions. However, most existing approaches do not leverage unlabeled data, which is crucial for use cases with limited labels. Furthermore, the availability of large-scale real-world GPS coordinate data demands representation and prediction at global scales. However, existing location encoding models assume that the input coordinates are in Euclidean space, which can lead to modeling errors due to distortions introduced when mapping coordinates from other manifolds (e.g., spherical surfaces) to Euclidean space. We introduce Sphere2Vec, a location encoder, which can directly encode spherical coordinates while preserving spherical distances. Sphere2Vec is trained with a self-supervised learning framework which pre-trains deep location representations from unlabeled geo-tagged images with contrastive losses, and then fine-tunes to perform supervised geographic object classification tasks. Sphere2Vec achieves state-of-the-art results on various image classification tasks, ranging from species and Point of Interest (POI) facade to remote sensing. 
The self-supervised pretraining significantly improves the performance of Sphere2Vec, especially when the labeled data is limited.", "keywords": "Self-Supervised Learning;Location Representation Learning;Double Fourier Sphere", "primary_area": "", "supplementary_material": "/attachment/ea0d4567af19c3e6d30e3e9fc380ee822e5ad78c.zip", "author": "Gengchen Mai;Yao Xuan;Wenyun Zuo;Yutong He;Stefano Ermon;Jiaming Song;Krzysztof Janowicz;Ni Lao", "authorids": "~Gengchen_Mai1;~Yao_Xuan1;wyzuo@stanford.edu;~Yutong_He1;~Stefano_Ermon1;~Jiaming_Song1;~Krzysztof_Janowicz2;~Ni_Lao1", "gender": "M;M;;F;M;M;;M", "homepage": "https://gengchenmai.github.io/;http://linkedin.com/in/yao-xuan-752a5518a;;https://kellyyutonghe.github.io/;http://cs.stanford.edu/~ermon/;http://tsong.me;;http://www.cs.cmu.edu/~nlao", "dblp": "151/5583;244/2235;;;47/8135;173/5104;95/5567;82/283", "google_scholar": "X2Wfl1UAAAAJ;;;uNF3hk0AAAAJ;;;;iUgWR3MAAAAJ", "orcid": "0000-0002-7818-7309;;;;;;;0000-0002-4034-7784", "linkedin": "gengchen-mai-144439121/;;;yutong-he-b7608b12b/;;jiamings/;;ni-lao", "or_profile": "~Gengchen_Mai1;~Yao_Xuan1;wyzuo@stanford.edu;~Yutong_He1;~Stefano_Ermon1;~Jiaming_Song1;~Krzysztof_Janowicz2;~Ni_Lao1", "aff": "Computer Science Department, Stanford University;Meta Platforms, Inc.;;Computer Science Department, Stanford University;Stanford University;Computer Science Department, Stanford University;UC Santa Barbara;Google", "aff_domain": "cs.stanford.edu;meta.com;;cs.stanford.edu;stanford.edu;cs.stanford.edu;ucsb.edu;google.com", "position": "Postdoc;Research scientist;;MS student;Assistant Professor;Postdoc;Full Professor;Researcher", "bibtex": "@misc{\nmai2022spherevec,\ntitle={Sphere2Vec: Self-Supervised Location Representation Learning on Spherical Surfaces},\nauthor={Gengchen Mai and Yao Xuan and Wenyun Zuo and Yutong He and Stefano Ermon and Jiaming Song and Krzysztof Janowicz and Ni Lao},\nyear={2022},\nurl={https://openreview.net/forum?id=FS0XKbpkdOu}\n}", "github": "", "project": "", "reviewers": "6cCW;toM9;6Te4;vjnw", "site": "https://openreview.net/forum?id=FS0XKbpkdOu", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;2;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "154;46;98;89", "wc_summary_review": "70;31;88;156", "wc_main_review": "529;190;151;259", "wc_review": "753;267;337;504", "wc_reply_reviewers": "163;0;0;41", "wc_reply_authors": "1333;375;212;230", "reply_reviewers": "1;0;0;1", "reply_authors": "3;2;2;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.75, 38.45370593323874 ], "wc_summary_review_avg": [ 86.25, 45.234804078275836 ], "wc_main_review_avg": [ 282.25, 147.6166911294248 ], "wc_review_avg": [ 465.25, 187.11811109563928 ], "wc_reply_reviewers_avg": [ 51.0, 66.7944608481871 ], "wc_reply_authors_avg": [ 537.5, 463.60894081111076 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6323938051564683573&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0;2;3", 
"aff_unique_norm": "Stanford University;Meta;University of California, Santa Barbara;Google", "aff_unique_dep": "Computer Science Department;Meta Platforms, Inc.;;Google", "aff_unique_url": "https://www.stanford.edu;https://www.meta.com;https://www.ucsb.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Meta;UCSB;Google", "aff_campus_unique_index": "0;0;0;0;2;3", "aff_campus_unique": "Stanford;;Santa Barbara;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "FVJTyOUJzti", "title": "Adaptive Differential Privacy in Federated Learning: A Priority-Based Approach", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated learning (FL) as one of the novel branches of distributed machine learning (ML), develops global models through a private procedure without direct access to local datasets. However, access to model updates (e.g. gradient updates in deep neural networks) transferred between clients and servers can reveal sensitive information to adversaries. Differential privacy (DP) offers a framework that gives a privacy guarantee by adding certain amounts of noise to parameters. This approach, although being effective in terms of privacy, adversely affects model performance due to noise involvement. Hence, it is always needed to find a balance between noise injection and the sacrificed accuracy. To address this challenge, we propose adaptive noise addition in FL which decides the value of injected noise based on features\u2019 relative importance. Here, we first propose two effective methods for prioritizing features in deep neural network models and then perturb models' weights based on this information. Specifically, we try to figure out whether the idea of adding more noise to less important parameters and less noise to more important parameters can effectively save the model accuracy while preserving privacy. Our experiments confirm this statement under some conditions. The amount of noise injected, the proportion of parameters involved, and the number of global iterations can significantly change the output. 
While a careful choice of parameters by considering the properties of datasets can improve privacy without intense loss of accuracy, a bad choice can make the model performance worse.", "keywords": "Federated Learning;Differential privacy;Feature importance;Deep neural networks", "primary_area": "", "supplementary_material": "", "author": "Mahtab Talaei;Iman Izadi", "authorids": "~Mahtab_Talaei1;~Iman_Izadi1", "gender": ";", "homepage": ";https://izadi.iut.ac.ir", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Mahtab_Talaei1;~Iman_Izadi1", "aff": ";Isfahan University of Technology, University of Tehran", "aff_domain": ";iut.ac.ir", "position": ";Associate Professor", "bibtex": "@misc{\ntalaei2022adaptive,\ntitle={Adaptive Differential Privacy in Federated Learning: A Priority-Based Approach},\nauthor={Mahtab Talaei and Iman Izadi},\nyear={2022},\nurl={https://openreview.net/forum?id=FVJTyOUJzti}\n}", "github": "", "project": "", "reviewers": "Mmzg;6Zhp;Ty7k;LQV7;QbNp", "site": "https://openreview.net/forum?id=FVJTyOUJzti", "pdf_size": 0, "recommendation": "1;1;3;3;3", "confidence": "3;5;4;4;4", "correctness": "2;1;2;2;4", "technical_novelty": "1;1;2;1;3", "empirical_novelty": "2;1;2;1;2", "wc_summary_paper": "62;30;121;26;94", "wc_summary_review": "56;6;79;12;70", "wc_main_review": "186;76;1064;125;253", "wc_review": "304;112;1264;163;417", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 2.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.2, 0.9797958971132712 ], "technical_novelty_avg": [ 1.6, 0.8 ], "empirical_novelty_avg": [ 1.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 66.6, 36.65842331579469 ], "wc_summary_review_avg": [ 44.6, 30.037310132566798 ], "wc_main_review_avg": [ 340.8, 366.4365702273724 ], "wc_review_avg": [ 452.0, 419.9033222064336 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5833333333333334, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12982171505242464018&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Isfahan University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.iut.ac.ir", "aff_unique_abbr": "IUT", "aff_campus_unique_index": "0", "aff_campus_unique": "Isfahan", "aff_country_unique_index": "0", "aff_country_unique": "Iran" }, { "id": "FWiwSGJ_Bpa", "title": "Non-Parametric Neuro-Adaptive Control Subject to Task Specifications", "track": "main", "status": "Reject", "tldr": "", "abstract": "We develop a learning-based algorithm for the control of autonomous systems governed by unknown, nonlinear dynamics to satisfy user-specified spatio-temporal tasks expressed as signal temporal logic specifications. Most existing algorithms either assume certain parametric forms for the unknown dynamic terms or resort to unnecessarily large control inputs in order to provide theoretical guarantees. \nThe proposed algorithm addresses these drawbacks by integrating neural-network-based learning with adaptive control. 
More specifically, the algorithm learns a controller, represented as a neural network, using training data that correspond to a collection of system parameters and tasks. These parameters and tasks are derived by varying the nominal parameters and the spatio-temporal constraints of the user-specified task, respectively. It then incorporates this neural network into an online closed-form adaptive control policy in such a way that the resulting behavior satisfies the user-defined task. The proposed algorithm does not use any a priori information on the unknown dynamic terms or any approximation schemes. We provide formal theoretical guarantees on the satisfaction of the task. Numerical experiments on a robotic manipulator and a unicycle robot demonstrate that the proposed algorithm guarantees the satisfaction of 50 user-defined tasks, and outperforms control policies that do not employ online adaptation or the neural-network controller. Finally, we show that the proposed algorithm achieves greater performance than standard reinforcement-learning algorithms in the pendulum benchmarking environment.", "keywords": "Neural-network-control;nonlinear systems;continuous control;adaptive control;task specification;signal temporal logic", "primary_area": "", "supplementary_material": "/attachment/3b8e695bac7286e38e54d178020ef9481609af93.zip", "author": "Christos Verginis;Zhe Xu;ufuk topcu", "authorids": "~Christos_Verginis1;~Zhe_Xu7;~ufuk_topcu1", "gender": "M;;Unspecified", "homepage": "https://cverginis.github.io/;https://sites.google.com/site/zhexudavid00710;https://autonomy.oden.utexas.edu/", "dblp": ";;12/6659.html", "google_scholar": "IlwjoJ4AAAAJ;j8ilzcsAAAAJ;jeNGFfQAAAAJ", "orcid": ";;0000-0003-0819-9985", "linkedin": ";;", "or_profile": "~Christos_Verginis1;~Zhe_Xu7;~ufuk_topcu1", "aff": "Uppsala University;Arizona State University;University of Texas, Austin", "aff_domain": "uu.se;asu.edu;utexas.edu", "position": "Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@misc{\nverginis2022nonparametric,\ntitle={Non-Parametric Neuro-Adaptive Control Subject to Task Specifications},\nauthor={Christos Verginis and Zhe Xu and ufuk topcu},\nyear={2022},\nurl={https://openreview.net/forum?id=FWiwSGJ_Bpa}\n}", "github": "", "project": "", "reviewers": "HYqY;GDik;YVCm", "site": "https://openreview.net/forum?id=FWiwSGJ_Bpa", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;3", "correctness": "3;4;3", "technical_novelty": "2;1;3", "empirical_novelty": "2;1;3", "wc_summary_paper": "76;76;138", "wc_summary_review": "34;102;59", "wc_main_review": "204;482;498", "wc_review": "314;660;695", "wc_reply_reviewers": "110;46;149", "wc_reply_authors": "1112;620;658", "reply_reviewers": "1;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 96.66666666666667, 29.227080289043965 ], "wc_summary_review_avg": [ 65.0, 28.083209693100727 ], "wc_main_review_avg": [ 394.6666666666667, 134.97983388475316 ], "wc_review_avg": [ 556.3333333333334, 171.95025120333173 ], "wc_reply_reviewers_avg": [ 101.66666666666667, 42.460439103816256 ], "wc_reply_authors_avg": [ 796.6666666666666, 223.5133602767902 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], 
"replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13085106470600669928&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Uppsala University;Arizona State University;University of Texas at Austin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uu.se;https://www.asu.edu;https://www.utexas.edu", "aff_unique_abbr": "UU;ASU;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Sweden;United States" }, { "id": "FYUzzBPh_j", "title": "Communicating via Markov Decision Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the problem of communicating exogenous information by means of Markov decision process trajectories. This setting, which we call a Markov coding game (MCG), generalizes both source coding and a large class of referential games. MCGs also isolate a problem that is important in decentralized control settings in which cheap-talk is not available---namely, they require balancing communication with the associated cost of communicating. We contribute a theoretically grounded approach to MCGs based on maximum entropy reinforcement learning and minimum entropy coupling that we call greedy minimum entropy coupling (GME). We show both that GME is able to outperform a relevant baseline on small MCGs and that GME is able to scale efficiently to extremely large MCGs. To the latter point, we demonstrate that GME is able to losslessly communicate binary images via trajectories of Cartpole and Pong, while simultaneously achieving the maximal or near maximal expected returns, and that it is even capable of performing well in the presence of actuator noise.", "keywords": "coding;communication;maximum entropy reinforcement learning;minimum entropy coupling", "primary_area": "", "supplementary_material": "/attachment/8e7d3f542c0482f81b949bf147982af579bdc654.zip", "author": "Samuel Sokota;Christian Schroeder de Witt;Maximilian Igl;Luisa M Zintgraf;Philip Torr;J Zico Kolter;Shimon Whiteson;Jakob Nicolaus Foerster", "authorids": "~Samuel_Sokota1;~Christian_Schroeder_de_Witt1;~Maximilian_Igl1;~Luisa_M_Zintgraf1;~Philip_Torr1;~J_Zico_Kolter1;~Shimon_Whiteson1;~Jakob_Nicolaus_Foerster1", "gender": "M;M;M;F;;;M;M", "homepage": "https://ssokota.github.io/;https://www.schroederdewitt.com;https://maximilianigl.com;;http://www.robots.ox.ac.uk/~tvg/;;https://www.jakobfoerster.com;http://www.zicokolter.com", "dblp": "243/5881;;207/8245.html;177/9360;;https://dblp.uni-trier.de/pers/w/Whiteson:Shimon.html;176/5095;67/2526", "google_scholar": ";DE60h_0AAAAJ;https://scholar.google.com/citations?hl=en;lEzcLFwAAAAJ;;;6z4lQzMAAAAJ;UXh1I6UAAAAJ", "orcid": ";;;;;;;", "linkedin": "samuel-sokota-87a153149/;;maximilian-igl-21116992/;;;;;", "or_profile": "~Samuel_Sokota1;~Christian_Schroeder_de_Witt1;~Maximilian_Igl1;~Luisa_M_Zintgraf1;~Philip_Torr1;~Shimon_Whiteson1;~Jakob_Nicolaus_Foerster1;~Zico_Kolter1", "aff": "Carnegie Mellon University;University of Oxford;Waymo;University of Oxford;University of Oxford;University of Oxford;University of Oxford, University of Oxford;Carnegie Mellon University", "aff_domain": "cmu.edu;oxford.ac.uk;waymo.com;ox.ac.uk;ox.ac.uk;ox.ac.uk;eng.ox.ac.uk;cmu.edu", "position": "PhD student;Postdoc;Researcher;PhD student;Full 
Professor;Professor;Associate Professor;Full Professor", "bibtex": "@misc{\nsokota2022communicating,\ntitle={Communicating via Markov Decision Processes},\nauthor={Samuel Sokota and Christian Schroeder de Witt and Maximilian Igl and Luisa M Zintgraf and Philip Torr and J Zico Kolter and Shimon Whiteson and Jakob Nicolaus Foerster},\nyear={2022},\nurl={https://openreview.net/forum?id=FYUzzBPh_j}\n}", "github": "", "project": "", "reviewers": "8Cj5;Er8e;o2vx;FcQX", "site": "https://openreview.net/forum?id=FYUzzBPh_j", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;4;3", "correctness": "2;3;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "121;35;78;107", "wc_summary_review": "23;260;10;44", "wc_main_review": "232;17;275;477", "wc_review": "376;312;363;628", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "379;236;239;635", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.25, 32.89661836724255 ], "wc_summary_review_avg": [ 84.25, 102.19191504223805 ], "wc_main_review_avg": [ 250.25, 163.37590856671616 ], "wc_review_avg": [ 419.75, 122.58950811549902 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 372.25, 162.3289484349603 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1909863582927997201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;1;1;1;1;0", "aff_unique_norm": "Carnegie Mellon University;University of Oxford;Waymo", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.ox.ac.uk;https://www.waymo.com", "aff_unique_abbr": "CMU;Oxford;Waymo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;1;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Ancestral protein sequence reconstruction using a tree-structured Ornstein-Uhlenbeck variational autoencoder", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6608", "id": "FZoZ7a31GCW", "poster": "", "openreview": "https://openreview.net/forum?id=FZoZ7a31GCW", "slides": "https://iclr.cc/virtual/2022/poster/6608", "video": "https://iclr.cc/virtual/2022/poster/6608", "author_site": "Lys Sanz Moreta, Ola R\u00f8nning, Ahmad Salim Al-Sibahi, Jotun Hein, Douglas Theobald, Thomas Hamelryck", "tldr": "", "abstract": "We introduce a deep generative model for representation learning of biological sequences that, unlike existing models, explicitly represents the evolutionary process. The model makes use of a tree-structured Ornstein-Uhlenbeck process, obtained from a given phylogenetic tree, as an informative prior for a variational autoencoder. We show the model performs well on the task of ancestral sequence reconstruction of single protein families. Our results and ablation studies indicate that the explicit representation of evolution using a suitable tree-structured prior has the potential to improve representation learning of biological sequences considerably. 
Finally, we briefly discuss extensions of the model to genomic-scale data sets and the case of a latent phylogenetic tree.", "keywords": "biological sequences;variational autoencoders;latent representations;ornstein-uhlenbeck process;evolution", "primary_area": "", "supplementary_material": "", "author": "Lys Sanz Moreta;Ola R\u00f8nning;Ahmad Salim Al-Sibahi;Jotun Hein;Douglas Theobald;Thomas Hamelryck", "authorids": "~Lys_Sanz_Moreta1;~Ola_R\u00f8nning1;~Ahmad_Salim_Al-Sibahi1;hein@stats.ox.ac.uk;dtheobald@brandeis.edu;~Thomas_Hamelryck1", "gender": "F;M;M;;;M", "homepage": ";;https://alsibahi.xyz;;;https://thamelry.github.io", "dblp": ";220/2035;166/7500.html;;;18/2705", "google_scholar": ";V-RLwukAAAAJ;mcVJvU8AAAAJ;;;YoTlzjkAAAAJ", "orcid": "0000-0003-1580-539X;;;;;0000-0003-2917-3602", "linkedin": ";;;;;thomas-hamelryck-41a0a64/", "or_profile": "~Lys_Sanz_Moreta1;~Ola_R\u00f8nning1;~Ahmad_Salim_Al-Sibahi1;hein@stats.ox.ac.uk;dtheobald@brandeis.edu;~Thomas_Hamelryck1", "aff": "University of Copenhagen;University of Copenhagen;University of Copenhagen;;;University of Copenhagen", "aff_domain": "ku.dk;ku.dk;ku.dk;;;bio.ku.dk", "position": "PhD student;PhD student;Assistant Professor;;;Associate Professor", "bibtex": "@inproceedings{\nmoreta2022ancestral,\ntitle={Ancestral protein sequence reconstruction using a tree-structured Ornstein-Uhlenbeck variational autoencoder},\nauthor={Lys Sanz Moreta and Ola R{\\o}nning and Ahmad Salim Al-Sibahi and Jotun Hein and Douglas Theobald and Thomas Hamelryck},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FZoZ7a31GCW}\n}", "github": "", "project": "", "reviewers": "Exob;mia3;1Min", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "4;5;4", "correctness": "3;3;3", "technical_novelty": "3;3;4", "empirical_novelty": "1;3;2", "wc_summary_paper": "37;33;56", "wc_summary_review": "83;33;55", "wc_main_review": "420;250;768", "wc_review": "540;316;879", "wc_reply_reviewers": "0;0;116", "wc_reply_authors": "286;303;408", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 42.0, 10.03327796219494 ], "wc_summary_review_avg": [ 57.0, 20.46134567096374 ], "wc_main_review_avg": [ 479.3333333333333, 215.59426914667486 ], "wc_review_avg": [ 578.3333333333334, 231.43657638517058 ], "wc_reply_reviewers_avg": [ 38.666666666666664, 54.68292441175968 ], "wc_reply_authors_avg": [ 332.3333333333333, 53.95265414128288 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5071693180811788700&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=FZoZ7a31GCW", "email": "ku.dk;ku.dk;ku.dk;;;bio.ku.dk", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Denmark" }, { 
"id": "FZyZiRYbdK8", "title": "Distributionally Robust Learning for Uncertainty Calibration under Domain Shift", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a framework for learning calibrated uncertainties under domain shifts. We consider the case where the source (training) distribution differs significantly from the target (test) distribution. We detect such domain shifts through the use of binary domain classifier and integrate it with the task network and train them jointly end-to-end. The binary domain classifier yields a density ratio that reflects the closeness of a target (test) sample to the source (training) distribution. We employ it to adjust the uncertainty of prediction in the task network. This idea of using the density ratio is based on the distributionally robust learning (DRL) framework, which accounts for the domain shift through adversarial risk minimization. We demonstrate that our method generates calibrated uncertainties that benefit many downstream tasks, such as unsupervised domain adaptation (UDA) and semi-supervised learning (SSL). In these tasks, methods like self-training and FixMatch use uncertainties to select confident pseudo-labels for re-training. Our experiments show that the introduction of DRL leads to significant improvements in cross-domain performance. We also demonstrate that the estimated density ratios show agreement with the human selection frequencies, suggesting a match with a proxy of human perceived uncertainties. ", "keywords": "Domain shift;uncertainty estimation;calibration;distributional robustness;unsupervised domain adaptation;semi-supervised learning", "primary_area": "", "supplementary_material": "/attachment/0c75990350cc89de3ff1a97fb429b61b4557d83d.zip", "author": "Haoxuan Wang;Anqi Liu;Zhiding Yu;Junchi Yan;Yisong Yue;Anima Anandkumar", "authorids": "~Haoxuan_Wang1;~Anqi_Liu2;~Zhiding_Yu1;~Junchi_Yan2;~Yisong_Yue1;~Anima_Anandkumar1", "gender": "M;F;;;M;", "homepage": "https://hatchetproject.github.io/;https://anqiliu-ai.github.io/;;;http://www.yisongyue.com;", "dblp": ";;;;28/1244;", "google_scholar": "vRXYQvYAAAAJ;Q8yp6zQAAAAJ;;;tEk4qo8AAAAJ;", "orcid": "0009-0003-4014-6610;0000-0002-0468-5698;;;0000-0001-9127-1989;", "linkedin": ";;;;yisongyue/;", "or_profile": "~Haoxuan_Wang1;~Anqi_Liu2;~Zhiding_Yu1;~Junchi_Yan2;~Yisong_Yue1;~Anima_Anandkumar1", "aff": "Shanghai Jiaotong University;University of Illinois, Chicago;;;Argo AI;", "aff_domain": "sjtu.edu.cn;uic.edu;;;argo.ai;", "position": "MS student;PhD student;;;Principal Researcher;", "bibtex": "@misc{\nwang2022distributionally,\ntitle={Distributionally Robust Learning for Uncertainty Calibration under Domain Shift},\nauthor={Haoxuan Wang and Anqi Liu and Zhiding Yu and Junchi Yan and Yisong Yue and Anima Anandkumar},\nyear={2022},\nurl={https://openreview.net/forum?id=FZyZiRYbdK8}\n}", "github": "", "project": "", "reviewers": "6Mv8;xXg4;zkq9;St2k", "site": "https://openreview.net/forum?id=FZyZiRYbdK8", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;5;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "76;42;91;26", "wc_summary_review": "39;37;24;30", "wc_main_review": "413;268;348;225", "wc_review": "528;347;463;281", "wc_reply_reviewers": "0;150;76;0", "wc_reply_authors": "868;1058;1074;705", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;3;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 
3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 58.75, 25.936219847926953 ], "wc_summary_review_avg": [ 32.5, 5.937171043518958 ], "wc_main_review_avg": [ 313.5, 72.44480657714533 ], "wc_review_avg": [ 404.75, 96.47894848100285 ], "wc_reply_reviewers_avg": [ 56.5, 62.26355274155178 ], "wc_reply_authors_avg": [ 926.25, 151.27189924106855 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5vhXaPG9dMgJ:scholar.google.com/&scioq=Distributionally+Robust+Learning+for+Uncertainty+Calibration+under+Domain+Shift&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Shanghai Jiao Tong University;University of Illinois at Chicago;Argo AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.uic.edu;https://www.argo.ai", "aff_unique_abbr": "SJTU;UIC;Argo AI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "id": "FeaitX_a5Av", "title": "GSD: Generalized Stochastic Decoding", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Although substantial progress has been made in various text generation tasks, there remains a vast gap between current generations and human languages. One reason is that virtually all decoding methods currently developed are pragmatic to address the text degeneration problem, which exists in both deterministic and stochastic decoding algorithms. So, why text generated from these algorithms are divergent? What is the critical difference between these algorithms? Moreover, is it possible to design a generalized framework where existing decoding algorithms can be naturally connected, uniformly described, and mutually inspired?\nIn this paper, we try to explore answers to these intriguing questions. Correctly, we propose a generalized decoding framework that can be used to describe and connect existing popular decoding algorithms. Based on the framework, we propose a novel implementation with a distinctive core from existing decoding algorithms. As far as we know, this is the first work trying to propose a generalized framework to bridge these decoding algorithms using formal theorems and concrete implementations. By setting up different conditions, our framework provides infinite space to develop new decoding algorithms. Experiments show that text produced by our method is closest to the characteristics of human languages. 
Source code and the generated text can be accessed from https://github.com/ginoailab/gsd.git.", "keywords": "Natural Language Processing;Decoding Algorithms", "primary_area": "", "supplementary_material": "", "author": "Ning Gong;Nianmin Yao", "authorids": "~Ning_Gong1;~Nianmin_Yao1", "gender": ";M", "homepage": "https://www.askgn.com;", "dblp": ";18/1894", "google_scholar": ";https://scholar.google.com.hk/citations?user=ztMJF3gAAAAJ", "orcid": ";0000-0001-9705-6649", "linkedin": ";", "or_profile": "~Ning_Gong1;~Nianmin_Yao1", "aff": "Dalian University of Technology;Dalian University of Technology", "aff_domain": "dlut.edu.cn;dlut.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@misc{\ngong2022gsd,\ntitle={{GSD}: Generalized Stochastic Decoding},\nauthor={Ning Gong and Nianmin Yao},\nyear={2022},\nurl={https://openreview.net/forum?id=FeaitX_a5Av}\n}", "github": "", "project": "", "reviewers": "Gvhk;fbQi;eAC1;z9d8", "site": "https://openreview.net/forum?id=FeaitX_a5Av", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;2;2;1", "wc_summary_paper": "32;116;72;101", "wc_summary_review": "35;16;70;75", "wc_main_review": "267;194;1107;703", "wc_review": "334;326;1249;879", "wc_reply_reviewers": "0;99;41;62", "wc_reply_authors": "491;1423;785;786", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 80.25, 32.03416145304884 ], "wc_summary_review_avg": [ 49.0, 24.50510150968569 ], "wc_main_review_avg": [ 567.75, 367.1589403786867 ], "wc_review_avg": [ 697.0, 389.62738610113126 ], "wc_reply_reviewers_avg": [ 50.5, 35.79455265819088 ], "wc_reply_authors_avg": [ 871.25, 340.48669210411146 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3LSz3xATk5MJ:scholar.google.com/&scioq=GSD:+Generalized+Stochastic+Decoding&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Dalian University of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.dlut.edu.cn/", "aff_unique_abbr": "DUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "Fh_NyEuejsZ", "title": "ZenDet: Revisiting Efficient Object Detection Backbones from Zero-Shot Neural Architecture Search", "track": "main", "status": "Reject", "tldr": "", "abstract": "In object detection models, the detection backbone consumes more than half of the overall inference cost. Recent research attempts to reduce this cost by optimizing the backbone architecture with the help of Neural Architecture Search (NAS). However, existing NAS methods for object detection require hundreds to thousands of GPU hours of searching, making them impractical in fast-paced research and development. In this work, we propose a novel zero-shot NAS method to address this issue. 
The proposed method, named ZenDet, automatically designs efficient detection backbones without training network parameters, reducing the architecture design cost to nearly zero yet delivering state-of-the-art (SOTA) performance. Under the hood, ZenDet maximizes the differential entropy of detection backbones, leading to a better feature extractor for object detection under the same computational budgets. After merely one GPU day of fully automatic design, ZenDet produces SOTA detection backbones on multiple detection benchmark datasets with little human intervention. Compared to the ResNet-50 backbone, ZenDet is $+2.0\\%$ better in mAP when using the same amount of FLOPs/parameters and is $1.54$ times faster on an NVIDIA V100 at the same mAP. Code and pre-trained models will be released after publication.", "keywords": "Object Detection;Detection Backbone;Neural Architecture Search;Zero-Shot NAS", "primary_area": "", "supplementary_material": "", "author": "Zhenhong Sun;Ming Lin;Zhiyu Tan;Xiuyu Sun;Rong Jin", "authorids": "~Zhenhong_Sun1;~Ming_Lin4;~Zhiyu_Tan2;~Xiuyu_Sun1;~Rong_Jin1", "gender": ";M;M;M;", "homepage": ";https://minglin-home.github.io/;https://scholar.google.com/citations?user=XprTQQ8AAAAJ&hl=en;https://sites.google.com/view/sunxiuyu/home;", "dblp": ";;136/4997;40/8845;", "google_scholar": ";https://scholar.google.com/citations?hl=en;XprTQQ8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": ";;;0000-0002-7208-8078;", "linkedin": ";;;;", "or_profile": "~Zhenhong_Sun1;~Ming_Lin4;~Zhiyu_Tan2;~Xiuyu_Sun1;~Rong_Jin1", "aff": ";Alibaba Group;Alibaba DAMO Academy;Alibaba Group;", "aff_domain": ";alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;", "position": ";Algorithm Engineer;Researcher;Staff Algorithm Engineer;", "bibtex": "@misc{\nsun2022zendet,\ntitle={ZenDet: Revisiting Efficient Object Detection Backbones from Zero-Shot Neural Architecture Search},\nauthor={Zhenhong Sun and Ming Lin and Zhiyu Tan and Xiuyu Sun and Rong Jin},\nyear={2022},\nurl={https://openreview.net/forum?id=Fh_NyEuejsZ}\n}", "github": "", "project": "", "reviewers": "YY18;KNoQ;34Uv", "site": "https://openreview.net/forum?id=Fh_NyEuejsZ", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "4;2;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "69;63;33", "wc_summary_review": "82;133;3", "wc_main_review": "230;563;92", "wc_review": "381;759;128", "wc_reply_reviewers": "140;246;0", "wc_reply_authors": "634;1333;234", "reply_reviewers": "2;1;0", "reply_authors": "3;4;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 55.0, 15.748015748023622 ], "wc_summary_review_avg": [ 72.66666666666667, 53.48104544810453 ], "wc_main_review_avg": [ 295.0, 197.7017956418201 ], "wc_review_avg": [ 422.6666666666667, 259.2840570151243 ], "wc_reply_reviewers_avg": [ 128.66666666666666, 100.74831126238406 ], "wc_reply_authors_avg": [ 733.6666666666666, 454.1661467886933 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7447832509723112920&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "Fia60I79-4B", "title": "TS-BERT: A fusion model for Pre-trainning Time Series-Text Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "There are many tasks to use news text information and stock data to predict the crisis. In the existing research, the two usually play one master and one follower in the prediction task.\nUse one of the news text and the stock data as the primary information source for the prediction task and the other as the auxiliary information source.\nThis paper proposes a fusion model for pre-training time series-Text representations, in which news text and stock data have the same status and are treated as two different modes to describe crises. Our model has achieved the best results in the task of predicting financial crises.", "keywords": "Time Series-Text Representations;Pre-training;Mutilmodal", "primary_area": "", "supplementary_material": "", "author": "Jiahao Qin;Lu Zong", "authorids": "~Jiahao_Qin1;~Lu_Zong1", "gender": "M;F", "homepage": ";https://www.xjtlu.edu.cn/en/departments/academic-departments/mathematical-sciences/staff/lu-zong", "dblp": ";", "google_scholar": ";XMHLnkEAAAAJ", "orcid": ";", "linkedin": "jiahao-qin-543416185/;", "or_profile": "~Jiahao_Qin1;~Lu_Zong1", "aff": "University of Liverpool;Xi'an Jiaotong-Liverpool University", "aff_domain": "liverpool.ac.uk;xjtlu.edu.cn", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nqin2022tsbert,\ntitle={{TS}-{BERT}: A fusion model for Pre-trainning Time Series-Text Representations},\nauthor={Jiahao Qin and Lu Zong},\nyear={2022},\nurl={https://openreview.net/forum?id=Fia60I79-4B}\n}", "github": "", "project": "", "reviewers": "oxEh;7P8j;YK4B", "site": "https://openreview.net/forum?id=Fia60I79-4B", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "4;5;4", "correctness": "2;2;3", "technical_novelty": "1;1;2", "empirical_novelty": "2;1;2", "wc_summary_paper": "27;46;52", "wc_summary_review": "34;19;28", "wc_main_review": "176;284;631", "wc_review": "237;349;711", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 41.666666666666664, 10.656244908763853 ], "wc_summary_review_avg": [ 27.0, 6.164414002968976 ], "wc_main_review_avg": [ 363.6666666666667, 194.1070723996309 ], "wc_review_avg": [ 432.3333333333333, 202.2825306896822 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6642710248441876407&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Liverpool;Xi'an 
Jiao Tong-Liverpool University", "aff_unique_dep": ";", "aff_unique_url": "https://www.liverpool.ac.uk;https://www.xjtu.edu.cn/en", "aff_unique_abbr": "Liv Uni;XJTLU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;China" }, { "id": "Fj1Tpym9KxH", "title": "A Closer Look at Smoothness in Domain Adversarial Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Domain adversarial training has been ubiquitous for achieving invariant representations and is used widely for various domain adaptation tasks. In recent times methods converging to smooth optima have shown improved generalization for supervised learning tasks like classification. In this work, we analyze the effect of smoothness enhancing formulations on domain adversarial training, the objective of which is a combination of classification and adversarial terms. In contrast to classification loss, our analysis shows that \\textit{converging to smooth minima w.r.t. adversarial loss leads to sub-optimal generalization on the target domain}. Based on the analysis, we introduce the Smooth Domain Adversarial training (SDAT) procedure, which effectively enhances the performance of existing domain adversarial methods for both classification and object detection tasks. Our smoothness analysis also provides insight into the extensive usage of SGD over Adam in domain adversarial training. ", "keywords": "Domain Adaptation;Optimization", "primary_area": "", "supplementary_material": "", "author": "Harsh Rangwani;Sumukh K Aithal;Arihant Jain;Venkatesh Babu Radhakrishnan", "authorids": "~Harsh_Rangwani1;~Sumukh_K_Aithal1;arihantjain@iisc.ac.in;~Venkatesh_Babu_Radhakrishnan2", "gender": "M;;;M", "homepage": "https://rangwani-harsh.github.io/about/;;;http://cds.iisc.ac.in/faculty/venky", "dblp": "220/0991;;;20/6289", "google_scholar": "OQK0WREAAAAJ;;;cVg7HrEAAAAJ", "orcid": ";;;0000-0002-1926-1804", "linkedin": ";;;venkatesh-babu-radhakrishnan-16568939", "or_profile": "~Harsh_Rangwani1;~Sumukh_K_Aithal1;arihantjain@iisc.ac.in;~Venkatesh_Babu_Radhakrishnan2", "aff": "Indian Institute of Science;;;Indian Institute of Science", "aff_domain": "iisc.ac.in;;;iisc.ac.in", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\nrangwani2022a,\ntitle={A Closer Look at Smoothness in Domain Adversarial Training},\nauthor={Harsh Rangwani and Sumukh K Aithal and Arihant Jain and Venkatesh Babu Radhakrishnan},\nyear={2022},\nurl={https://openreview.net/forum?id=Fj1Tpym9KxH}\n}", "github": "", "project": "", "reviewers": "C1yh;zPwA;rxW2;nwci", "site": "https://openreview.net/forum?id=Fj1Tpym9KxH", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "2;3;4;4", "correctness": "4;3;3;4", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "62;76;185;88", "wc_summary_review": "43;85;111;26", "wc_main_review": "253;169;674;189", "wc_review": "358;330;970;303", "wc_reply_reviewers": "0;0;379;0", "wc_reply_authors": "1048;708;2624;43", "reply_reviewers": "0;0;2;0", "reply_authors": "2;1;5;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 102.75, 48.37031631072925 ], "wc_summary_review_avg": [ 66.25, 33.5959446957516 ], "wc_main_review_avg": [ 321.25, 206.0101635842271 ], "wc_review_avg": [ 490.25, 
277.66560373946214 ], "wc_reply_reviewers_avg": [ 94.75, 164.11181401715112 ], "wc_reply_authors_avg": [ 1105.75, 948.1641142228491 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 156, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11164597139581450427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Indian Institute of Science", "aff_unique_dep": "", "aff_unique_url": "https://www.iisc.ac.in", "aff_unique_abbr": "IISc", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "India" }, { "title": "On Lottery Tickets and Minimal Task Representations in Deep Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7077", "id": "Fl3Mg_MZR-", "poster": "", "openreview": "https://openreview.net/forum?id=Fl3Mg_MZR-", "slides": "https://iclr.cc/virtual/2022/poster/7077", "video": "https://iclr.cc/virtual/2022/poster/7077", "author_site": "Marc Vischer, Robert Lange, Henning Sprekeler", "tldr": "", "abstract": "The lottery ticket hypothesis questions the role of overparameterization in supervised deep learning. But how is the performance of winning lottery tickets affected by the distributional shift inherent to reinforcement learning problems? In this work, we address this question by comparing sparse agents that have to address the non-stationarity of the exploration-exploitation problem with supervised agents trained to imitate an expert. We show that feed-forward networks trained with behavioural cloning can be pruned to higher levels of sparsity than networks trained with reinforcement learning, without performance degradation. This suggests that, in order to handle the RL-specific distributional shift, agents require more degrees of freedom. Using a set of carefully designed baseline conditions, we find that the majority of the lottery ticket effect in both learning paradigms can be attributed to the identified mask rather than the weight initialization. The input layer mask selectively prunes entire input dimensions that turn out to be irrelevant for the task at hand. At a moderate level of sparsity, the mask identified by iterative magnitude pruning yields minimal task-relevant representations, i.e., an interpretable inductive bias. 
Finally, we propose a simple initialization rescaling which promotes the robust identification of sparse task representations in low-dimensional control tasks.", "keywords": "Reinforcement Learning;Sparsity;Pruning;Lottery Ticket Hypothesis", "primary_area": "", "supplementary_material": "", "author": "Marc Vischer;Robert Tjarko Lange;Henning Sprekeler", "authorids": "~Marc_Vischer1;~Robert_Tjarko_Lange1;h.sprekeler@tu-berlin.de", "gender": ";;", "homepage": ";https://roberttlange.github.io/;", "dblp": ";245/9152;", "google_scholar": ";https://scholar.google.es/citations?user=cTrc3x4AAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Marc_Vischer1;~Robert_Tjarko_Lange1;h.sprekeler@tu-berlin.de", "aff": ";TU Berlin;", "aff_domain": ";tu-berlin.de;", "position": ";PhD student;", "bibtex": "@inproceedings{\nvischer2022on,\ntitle={On Lottery Tickets and Minimal Task Representations in Deep Reinforcement Learning},\nauthor={Marc Vischer and Robert Tjarko Lange and Henning Sprekeler},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Fl3Mg_MZR-}\n}", "github": "", "project": "", "reviewers": "SmNw;HrAw;BwRq", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "4;3;4", "correctness": "3;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "81;60;577", "wc_summary_review": "25;47;135", "wc_main_review": "236;809;749", "wc_review": "342;916;1461", "wc_reply_reviewers": "0;31;603", "wc_reply_authors": "375;729;777", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 239.33333333333334, 238.92025633857185 ], "wc_summary_review_avg": [ 69.0, 47.525431788324305 ], "wc_main_review_avg": [ 598.0, 257.1419841255022 ], "wc_review_avg": [ 906.3333333333334, 456.88097161320064 ], "wc_reply_reviewers_avg": [ 211.33333333333334, 277.2391667054919 ], "wc_reply_authors_avg": [ 627.0, 179.26516672237247 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15390218923269251393&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=Fl3Mg_MZR-", "email": ";tu-berlin.de;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Technische Universit\u00e4t Berlin", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-berlin.de", "aff_unique_abbr": "TU Berlin", "aff_campus_unique_index": "0", "aff_campus_unique": "Berlin", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Multi-objective Optimization by Learning Space Partition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5994", "id": "FlwzVjfMryn", "poster": "", "openreview": "https://openreview.net/forum?id=FlwzVjfMryn", "slides": "https://iclr.cc/virtual/2022/poster/5994", "video": "https://iclr.cc/virtual/2022/poster/5994", "author_site": "Yiyang Zhao, Linnan Wang, Kevin Yang, Tianjun Zhang, Tian Guo, Yuandong Tian", "tldr": "", "abstract": "In 
contrast to single-objective optimization (SOO), multi-objective optimization (MOO) requires an optimizer to find the Pareto frontier, a subset of feasible solutions that are not dominated by other feasible solutions. In this paper, we propose LaMOO, a novel multi-objective optimizer that learns a model from observed samples to partition the search space and then focus on promising regions that are likely to contain a subset of the Pareto frontier. The partitioning is based on the dominance number, which measures \"how close\" a data point is to the Pareto frontier among existing samples. To account for possible partition errors due to limited samples and model mismatch, we leverage Monte Carlo Tree Search (MCTS) to exploit promising regions while exploring suboptimal regions that may turn out to contain good solutions later. Theoretically, we prove the efficacy of learning space partitioning via LaMOO under certain assumptions. Empirically, on HyperVolume (HV), a popular MOO metric, LaMOO substantially outperforms strong baselines on multiple real-world MOO tasks, by up to 225% in sample efficiency for neural architecture search on Nasbench201, and up to 10% for molecular design.", "keywords": "Optimization;Machine Learning", "primary_area": "", "supplementary_material": "/attachment/33d50adbebba7e7d1f11bb94dadbff6bcb243266.zip", "author": "Yiyang Zhao;Linnan Wang;Kevin Yang;Tianjun Zhang;Tian Guo;Yuandong Tian", "authorids": "~Yiyang_Zhao1;~Linnan_Wang2;~Kevin_Yang2;~Tianjun_Zhang1;~Tian_Guo3;~Yuandong_Tian1", "gender": "M;M;;F;M;M", "homepage": "https://zhaoyiyang.me/;https://linnanwang.github.io/;https://tianjunz.github.io;http://tianguo.info;http://yuandong-tian.com;https://people.eecs.berkeley.edu/~yangk/", "dblp": "33/3791;;;55/3523-1.html;t/YuandongTian;13/10565", "google_scholar": "pcerjpMeoAAAAJ;k1cGv3MAAAAJ;UE9jz_MAAAAJ;vDzUD84AAAAJ;0mgEF28AAAAJ;sRpY9TIAAAAJ", "orcid": ";;;;0000-0003-4202-4847;", "linkedin": ";;;;yuandongtian;", "or_profile": "~Yiyang_Zhao1;~Linnan_Wang2;~Tianjun_Zhang1;~Tian_Guo3;~Yuandong_Tian1;~Kevin_Yang1", "aff": "Worcester Polytechnic Institute;Brown University;University of California, Berkeley;Worcester Polytechnic Institute;Meta AI (FAIR);University of California, Berkeley", "aff_domain": "wpi.edu;brown.edu;berkeley.edu;wpi.edu;meta.com;berkeley.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Research Scientist;PhD student", "bibtex": "@inproceedings{\nzhao2022multiobjective,\ntitle={Multi-objective Optimization by Learning Space Partition},\nauthor={Yiyang Zhao and Linnan Wang and Kevin Yang and Tianjun Zhang and Tian Guo and Yuandong Tian},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FlwzVjfMryn}\n}", "github": "", "project": "", "reviewers": "Vetj;2MfT;xfNd;vdwH", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "77;81;72;32", "wc_summary_review": "20;60;46;33", "wc_main_review": "261;1218;250;123", "wc_review": "358;1359;368;188", "wc_reply_reviewers": "11;531;9;9", "wc_reply_authors": "483;1957;399;68", "reply_reviewers": "1;2;1;1", "reply_authors": "1;4;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.5, 19.60229578391266 ], 
"wc_summary_review_avg": [ 39.75, 14.872373717735847 ], "wc_main_review_avg": [ 463.0, 439.2601734735349 ], "wc_review_avg": [ 568.25, 462.10949730556285 ], "wc_reply_reviewers_avg": [ 140.0, 225.7454318474684 ], "wc_reply_authors_avg": [ 726.75, 727.0317651244683 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13117986140951713985&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=FlwzVjfMryn", "email": "wpi.edu;brown.edu;berkeley.edu;wpi.edu;meta.com;berkeley.edu", "author_num": 6, "aff_unique_index": "0;1;2;0;3;2", "aff_unique_norm": "Worcester Polytechnic Institute;Brown University;University of California, Berkeley;Meta", "aff_unique_dep": ";;;Facebook AI Research (FAIR)", "aff_unique_url": "https://www.wpi.edu;https://www.brown.edu;https://www.berkeley.edu;https://ai.facebook.com", "aff_unique_abbr": "WPI;Brown;UC Berkeley;Meta AI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Procedural generalization by planning with self-supervised world models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7007", "id": "FmBegXJToY", "poster": "", "openreview": "https://openreview.net/forum?id=FmBegXJToY", "slides": "https://iclr.cc/virtual/2022/poster/7007", "video": "https://iclr.cc/virtual/2022/poster/7007", "author_site": "Ankesh Anand, Jacob C Walker, Yazhe Li, Eszter Vertes, Julian Schrittwieser, Sherjil Ozair, Theophane Weber, Jessica Hamrick", "tldr": "", "abstract": "One of the key promises of model-based reinforcement learning is the ability to generalize using an internal model of the world to make predictions in novel environments and tasks. However, the generalization ability of model-based agents is not well understood because existing work has focused on model-free agents when benchmarking generalization. Here, we explicitly measure the generalization ability of model-based agents in comparison to their model-free counterparts. We focus our analysis on MuZero (Schrittwieser et al., 2020), a powerful model-based agent, and evaluate its performance on both procedural and task generalization. We identify three factors of procedural generalization---planning, self-supervised representation learning, and procedural data diversity---and show that by combining these techniques, we achieve state-of-the art generalization performance and data efficiency on Procgen (Cobbe et al., 2019). However, we find that these factors do not always provide the same benefits for the task generalization benchmarks in Meta-World (Yu et al., 2019), indicating that transfer remains a challenge and may require different approaches than procedural generalization. 
Overall, we suggest that building generalizable agents requires moving beyond the single-task, model-free paradigm and towards self-supervised model-based agents that are trained in rich, procedural, multi-task environments.", "keywords": "Self-Supervised Learning;Model-Based RL;Generalization in RL", "primary_area": "", "supplementary_material": "", "author": "Ankesh Anand;Jacob C Walker;Yazhe Li;Eszter V\u00e9rtes;Julian Schrittwieser;Sherjil Ozair;Theophane Weber;Jessica B Hamrick", "authorids": "~Ankesh_Anand1;~Jacob_C_Walker1;~Yazhe_Li2;~Eszter_V\u00e9rtes1;~Julian_Schrittwieser1;~Sherjil_Ozair1;~Theophane_Weber1;~Jessica_B_Hamrick1", "gender": "M;;;;;M;M;F", "homepage": "https://ankeshanand.com;;;;http://www.furidamu.org;http://sherjil.ozair.io;http://www.thphn.com/;http://www.jesshamrick.com", "dblp": ";135/1696;182/2163;220/5681;;139/0736;;155/1885", "google_scholar": ";0dR_wD0AAAAJ;lpswgyIAAAAJ;RahgPAEAAAAJ;;O7MZStwAAAAJ;LZxqcX4AAAAJ;2ylcZSsAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Ankesh_Anand1;~Jacob_C_Walker1;~Yazhe_Li2;~Eszter_V\u00e9rtes1;~Julian_Schrittwieser1;~Sherjil_Ozair1;~Theophane_Weber1;~Jessica_B_Hamrick1", "aff": "Mila, University of Montreal;Google;Google DeepMind;Google DeepMind;Google DeepMind;Google;;Google DeepMind", "aff_domain": "umontreal.ca;google.com;deepmind.com;google.com;deepmind.com;google.com;;google.com", "position": "PhD student;Research Scientist;Researcher;Research Scientist;Researcher;Intern;;Research Scientist", "bibtex": "@inproceedings{\nanand2022procedural,\ntitle={Procedural generalization by planning with self-supervised world models},\nauthor={Ankesh Anand and Jacob C Walker and Yazhe Li and Eszter V{\\'e}rtes and Julian Schrittwieser and Sherjil Ozair and Theophane Weber and Jessica B Hamrick},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FmBegXJToY}\n}", "github": "", "project": "", "reviewers": "hXT2;KYiU;mZzY;Nzu6", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;4;2", "correctness": "4;2;4;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "3;2;4;4", "wc_summary_paper": "123;118;170;64", "wc_summary_review": "41;70;141;35", "wc_main_review": "118;283;115;70", "wc_review": "282;471;426;169", "wc_reply_reviewers": "0;91;118;0", "wc_reply_authors": "282;646;305;56", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 118.75, 37.559120064240055 ], "wc_summary_review_avg": [ 71.75, 42.115169476092575 ], "wc_main_review_avg": [ 146.5, 81.06941470122996 ], "wc_review_avg": [ 337.0, 119.50523001107524 ], "wc_reply_reviewers_avg": [ 52.25, 53.11485197192966 ], "wc_reply_authors_avg": [ 322.25, 210.72538409028942 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12624091686670964510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=FmBegXJToY", "email": "umontreal.ca;google.com;deepmind.com;google.com;deepmind.com;google.com;;google.com", 
"author_num": 8, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "University of Montreal;Google", "aff_unique_dep": "Mila;Google", "aff_unique_url": "https://www.mila.quebec;https://www.google.com", "aff_unique_abbr": "Mila;Google", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Montreal;Mountain View;", "aff_country_unique_index": "0;1;2;2;2;1;2", "aff_country_unique": "Canada;United States;United Kingdom" }, { "title": "Do deep networks transfer invariances across classes?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6042", "id": "Fn7i_r5rR0q", "poster": "", "openreview": "https://openreview.net/forum?id=Fn7i_r5rR0q", "slides": "https://iclr.cc/virtual/2022/poster/6042", "video": "https://iclr.cc/virtual/2022/poster/6042", "author_site": "Allan Zhou, Fahim Tajwar, Alexander Robey, Tom Knowles, George Pappas, Hamed Hassani, Chelsea Finn", "tldr": "", "abstract": "In order to generalize well, classifiers must learn to be invariant to nuisance transformations that do not alter an input's class. Many problems have \"class-agnostic\" nuisance transformations that apply similarly to all classes, such as lighting and background changes for image classification. Neural networks can learn these invariances given sufficient data, but many real-world datasets are heavily class imbalanced and contain only a few examples for most of the classes. We therefore pose the question: how well do neural networks transfer class-agnostic invariances learned from the large classes to the small ones? Through careful experimentation, we observe that invariance to class-agnostic transformations is still heavily dependent on class size, with the networks being much less invariant on smaller classes. This result holds even when using data balancing techniques, and suggests poor invariance transfer across classes. Our results provide one explanation for why classifiers generalize poorly on unbalanced and long-tailed distributions. Based on this analysis, we show how a generative approach for learning the nuisance transformations can help transfer invariances across classes and improve performance on a set of imbalanced image classification benchmarks.", "keywords": "invariance;augmentation;nuisance transformation;imbalance;long tail", "primary_area": "", "supplementary_material": "", "author": "Allan Zhou;Fahim Tajwar;Alexander Robey;Tom Knowles;George J. 
Pappas;Hamed Hassani;Chelsea Finn", "authorids": "~Allan_Zhou1;~Fahim_Tajwar1;~Alexander_Robey1;tknowles@stanford.edu;~George_J._Pappas1;~Hamed_Hassani2;~Chelsea_Finn1", "gender": ";M;M;;;M;F", "homepage": "http://bland.website;https://tajwarfahim.github.io/;https://arobey1.github.io/;;;https://www.seas.upenn.edu/~hassani/;https://ai.stanford.edu/~cbfinn/", "dblp": "195/6907;292/1504;242/9113;;;73/4984;131/1783", "google_scholar": ";iMlmLO4AAAAJ;V5NWZc8AAAAJ;;;;vfPE6hgAAAAJ", "orcid": ";0000-0001-9257-6282;;;;;", "linkedin": ";fahim-tajwar-8a5377162/;alexrobey/;;;;", "or_profile": "~Allan_Zhou1;~Fahim_Tajwar1;~Alexander_Robey1;tknowles@stanford.edu;~George_J._Pappas1;~Hamed_Hassani2;~Chelsea_Finn1", "aff": "Stanford University;Stanford University;Google;;;University of Pennsylvania;Google", "aff_domain": "stanford.edu;stanford.edu;google.com;;;upenn.edu;google.com", "position": "PhD student;Undergrad student;Intern;;;;Research Scientist", "bibtex": "@inproceedings{\nzhou2022do,\ntitle={Do deep networks transfer invariances across classes?},\nauthor={Allan Zhou and Fahim Tajwar and Alexander Robey and Tom Knowles and George J. Pappas and Hamed Hassani and Chelsea Finn},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Fn7i_r5rR0q}\n}", "github": "", "project": "", "reviewers": "Zq9K;4nAf;k4Fm;PgZ2", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;2;4;4", "correctness": "4;2;2;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "90;95;77;146", "wc_summary_review": "59;29;121;93", "wc_main_review": "618;335;940;368", "wc_review": "767;459;1138;607", "wc_reply_reviewers": "167;141;19;0", "wc_reply_authors": "990;467;1138;330", "reply_reviewers": "1;2;1;0", "reply_authors": "4;3;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.0, 26.239283526803852 ], "wc_summary_review_avg": [ 75.5, 34.68068626771967 ], "wc_main_review_avg": [ 565.25, 242.4575993859545 ], "wc_review_avg": [ 742.75, 252.86001562129192 ], "wc_reply_reviewers_avg": [ 81.75, 73.1415579544215 ], "wc_reply_authors_avg": [ 731.25, 340.30381646405317 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.15789473684210528, "corr_recommendation_correctness": -0.20751433915982243, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8418380015111535138&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Fn7i_r5rR0q", "email": "stanford.edu;stanford.edu;google.com;;;upenn.edu;google.com", "author_num": 7, "aff_unique_index": "0;0;1;2;1", "aff_unique_norm": "Stanford University;Google;University of Pennsylvania", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.stanford.edu;https://www.google.com;https://www.upenn.edu", "aff_unique_abbr": "Stanford;Google;UPenn", "aff_campus_unique_index": "0;0;1;1", "aff_campus_unique": "Stanford;Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learn Locally, Correct Globally: A Distributed Algorithm for Training Graph Neural Networks", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/7152", "id": "FndDxSz3LxQ", "poster": "", "openreview": "https://openreview.net/forum?id=FndDxSz3LxQ", "slides": "https://iclr.cc/virtual/2022/poster/7152", "video": "https://iclr.cc/virtual/2022/poster/7152", "author_site": "Morteza Ramezani, Weilin Cong, Mehrdad Mahdavi, Mahmut Kandemir, Anand Sivasubramaniam", "tldr": "", "abstract": "Despite the recent success of Graph Neural Networks (GNNs), training GNNs on large graphs remains challenging. The limited resource capacities of the existing servers, the dependency between nodes in a graph, and the privacy concern due to the centralized storage and model learning have spurred the need to design an effective distributed algorithm for GNN training. However, existing distributed GNN training methods impose either excessive communication costs or large memory overheads that hinders their scalability. To overcome these issues, we propose a communication-efficient distributed GNN training technique named $\\text{\\textit{Learn Locally, Correct Globally}}$ (LLCG). To reduce the communication and memory overhead, each local machine in LLCG first trains a GNN on its local data by ignoring the dependency between nodes among different machines, then sends the locally trained model to the server for periodic model averaging. However, ignoring node dependency could result in significant performance degradation. To solve the performance degradation, we propose to apply $\\text{\\textit{Global Server Corrections}}$ on the server to refine the locally learned models. We rigorously analyze the convergence of distributed methods with periodic model averaging for training GNNs and show that naively applying periodic model averaging but ignoring the dependency between nodes will suffer from an irreducible residual error. However, this residual error can be eliminated by utilizing the proposed global corrections to entail fast convergence rate. 
Extensive experiments on real-world datasets show that LLCG can significantly improve the efficiency without hurting the performance.", "keywords": "Graph Neural Networks;GNN;GCN;Distributed Training", "primary_area": "", "supplementary_material": "/attachment/a9e70eb4cee8090fdba47e299ac2bbf91f455dd3.zip", "author": "Morteza Ramezani;Weilin Cong;Mehrdad Mahdavi;Mahmut Kandemir;Anand Sivasubramaniam", "authorids": "~Morteza_Ramezani1;~Weilin_Cong1;~Mehrdad_Mahdavi2;~Mahmut_Kandemir1;~Anand_Sivasubramaniam1", "gender": "M;M;M;M;M", "homepage": "http://morteza.me;https://congweilin.github.io/CongWeilin.io/;http://www.cse.psu.edu/~mzm616/;http://www.cse.psu.edu/hpcl/kandemir/;", "dblp": "149/4523;203/8227;88/4321;k/MahmutTKandemir.html;", "google_scholar": ";yYHxZ6MAAAAJ;HzxnwocAAAAJ;j67v24EAAAAJ;https://scholar.google.co.in/citations?user=JWXlepgAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Morteza_Ramezani1;~Weilin_Cong1;~Mehrdad_Mahdavi2;~Mahmut_Kandemir1;~Anand_Sivasubramaniam1", "aff": "Pennsylvania State University;Meta;Toyota Technological Institute at Chicago;Pennsylvania State University;Pennsylvania State University, Pennsylvania State University", "aff_domain": "psu.edu;meta.com;ttic.edu;psu.edu;cse.psu.edu", "position": "PhD student;Intern;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nramezani2022learn,\ntitle={Learn Locally, Correct Globally: A Distributed Algorithm for Training Graph Neural Networks},\nauthor={Morteza Ramezani and Weilin Cong and Mehrdad Mahdavi and Mahmut Kandemir and Anand Sivasubramaniam},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=FndDxSz3LxQ}\n}", "github": "", "project": "", "reviewers": "wvQK;Wu2z;qLfR;5Wg7", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;2;3;3", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "88;87;108;145", "wc_summary_review": "25;43;54;25", "wc_main_review": "330;480;316;229", "wc_review": "443;610;478;399", "wc_reply_reviewers": "0;329;12;17", "wc_reply_authors": "851;1647;325;696", "reply_reviewers": "0;1;1;1", "reply_authors": "2;4;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 107.0, 23.484037131634757 ], "wc_summary_review_avg": [ 36.75, 12.376893794486563 ], "wc_main_review_avg": [ 338.75, 90.26454176474836 ], "wc_review_avg": [ 482.5, 78.75436495839453 ], "wc_reply_reviewers_avg": [ 89.5, 138.41333028288858 ], "wc_reply_authors_avg": [ 879.75, 482.4444916257206 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5671502127283735116&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=FndDxSz3LxQ", "email": "psu.edu;meta.com;ttic.edu;psu.edu;cse.psu.edu", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Pennsylvania State University;Meta;Toyota Technological Institute at Chicago", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": 
"https://www.psu.edu;https://meta.com;https://www.tti-chicago.org", "aff_unique_abbr": "PSU;Meta;TTI Chicago", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "FpKgG31Z_i9", "title": "Learning Rate Grafting: Transferability of Optimizer Tuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the empirical science of training large neural networks, the learning rate schedule is a notoriously challenging-to-tune hyperparameter, which can depend on all other properties (architecture, optimizer, batch size, dataset, regularization, ...) of the problem. In this work, we probe the entanglements between the optimizer and the learning rate schedule. We propose the technique of optimizer grafting, which allows for the transfer of the overall implicit step size schedule from a tuned optimizer to a new optimizer, preserving empirical performance. This provides a robust plug-and-play baseline for optimizer comparisons, leading to reductions to the computational cost of optimizer hyperparameter search. Using grafting, we discover a non-adaptive learning rate correction to SGD which allows it to train a BERT model to state-of-the-art performance. Besides providing a resource-saving tool for practitioners, the invariances discovered via grafting shed light on the successes and failure modes of optimizers in deep learning.", "keywords": "Optimization;Learning Rate Schedules;BERT", "primary_area": "", "supplementary_material": "", "author": "Naman Agarwal;Rohan Anil;Elad Hazan;Tomer Koren;Cyril Zhang", "authorids": "~Naman_Agarwal1;~Rohan_Anil1;~Elad_Hazan1;~Tomer_Koren1;~Cyril_Zhang1", "gender": "M;M;M;M;", "homepage": "https://naman33k.github.io;;https://www.ehazan.com;https://tomerkoren.github.io;https://cyrilzhang.com", "dblp": "72/3910;182/1833;72/739;12/10044;203/4448", "google_scholar": "sEMrGicAAAAJ;;LnhCGNMAAAAJ;wGG1voYAAAAJ;sXtjq8IAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Naman_Agarwal1;~Rohan_Anil1;~Elad_Hazan1;~Tomer_Koren1;~Cyril_Zhang1", "aff": "Google;Google Brain ;Princeton University;Tel Aviv University;Microsoft", "aff_domain": "google.com;google.com;princeton.edu;tau.ac.il;microsoft.com", "position": "Researcher;Principal Engineer;Full Professor;Assistant Professor;Senior Researcher", "bibtex": "@misc{\nagarwal2022learning,\ntitle={Learning Rate Grafting: Transferability of Optimizer Tuning},\nauthor={Naman Agarwal and Rohan Anil and Elad Hazan and Tomer Koren and Cyril Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=FpKgG31Z_i9}\n}", "github": "", "project": "", "reviewers": "4k79;JLsZ;9GnN;B67d", "site": "https://openreview.net/forum?id=FpKgG31Z_i9", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "5;3;5;3", "correctness": "3;2;2;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "85;374;51;76", "wc_summary_review": "12;32;14;44", "wc_main_review": "198;357;219;793", "wc_review": "295;763;284;913", "wc_reply_reviewers": "25;0;0;0", "wc_reply_authors": "299;413;423;254", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 146.5, 131.93653777479534 ], "wc_summary_review_avg": [ 25.5, 13.219304066402286 ], "wc_main_review_avg": [ 391.75, 
239.57814487135508 ], "wc_review_avg": [ 563.75, 279.3576336884317 ], "wc_reply_reviewers_avg": [ 6.25, 10.825317547305483 ], "wc_reply_authors_avg": [ 347.25, 72.60294415517872 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:K2hGvqeqeGgJ:scholar.google.com/&scioq=Learning+Rate+Grafting:+Transferability+of+Optimizer+Tuning&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "Google;Princeton University;Tel Aviv University;Microsoft", "aff_unique_dep": "Google;;;Microsoft Corporation", "aff_unique_url": "https://www.google.com;https://www.princeton.edu;https://www.tau.ac.il;https://www.microsoft.com", "aff_unique_abbr": "Google;Princeton;TAU;Microsoft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Israel" }, { "id": "FpnQMmnsE8Y", "title": "Recurrent Parameter Generators", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a generic method for recurrently using the same parameters for many different convolution layers to build a deep network. Specifically, for a network, we create a recurrent parameter generator (RPG), from which the parameters of each convolution layer are generated. Though using recurrent models to build a deep convolutional neural network (CNN) is not entirely new, our method achieves significant performance gains compared to existing works. We demonstrate how to build a one-layer-size neural network that achieves performance similar to traditional CNN models on various applications and datasets. We use the RPG to build a ResNet18 network with the number of weights equivalent to one convolutional layer of a conventional ResNet and show that this model can achieve $67.2\\%$ ImageNet top-1 accuracy. Additionally, such a method allows us to build an arbitrarily complex neural network with any number of parameters. For example, we build a ResNet34 with model parameters reduced by more than $400$ times, which still achieves $41.6\\%$ ImageNet top-1 accuracy. Furthermore, the RPG can be further pruned and quantized for better run-time performance in addition to the model size reduction. We provide a new perspective for model compression. Rather than shrinking parameters from a large model, RPG sets a certain parameter-size constraint and uses the gradient descent algorithm to automatically find the best model under the constraint. 
Extensive experiment results are provided to demonstrate the power of the proposed recurrent parameter generator.\n", "keywords": "recurrent;parameters;degrees of freedom", "primary_area": "", "supplementary_material": "/attachment/41381dc77027d2e270140c749ff957bd1625d9d4.zip", "author": "Jiayun Wang;Yubei Chen;Stella Yu;Brian Cheung;Yann LeCun", "authorids": "~Jiayun_Wang1;~Yubei_Chen1;~Stella_Yu2;~Brian_Cheung1;~Yann_LeCun1", "gender": "M;M;F;M;M", "homepage": "http://pwang.pw/;https://redwood.berkeley.edu/people/yubei-chen/;http://www.eecs.umich.edu/~stellayu;https://briancheung.github.io/;http://yann.lecun.com", "dblp": "203/8972;30/10064;58/5089;;l/YannLeCun", "google_scholar": "IBn7PdYAAAAJ;WeyLqFUAAAAJ;https://scholar.google.com/citations?hl=en;7N-ethYAAAAJ;WLN3QrAAAAAJ", "orcid": ";;;;", "linkedin": "peterjwang;yubei-chen-05998a39/;;;", "or_profile": "~Jiayun_Wang1;~Yubei_Chen1;~Stella_Yu2;~Brian_Cheung1;~Yann_LeCun1", "aff": "Amazon;Facebook AI Research;University of California, Berkeley;Massachusetts Institute of Technology;New York University", "aff_domain": "amazon.com;facebook.com;berkeley.edu;mit.edu;nyu.edu", "position": "Intern;Postdoc Researcher;Director, ICSI Vision Group;Research Fellow;Full Professor", "bibtex": "@misc{\nwang2022recurrent,\ntitle={Recurrent Parameter Generators},\nauthor={Jiayun Wang and Yubei Chen and Stella Yu and Brian Cheung and Yann LeCun},\nyear={2022},\nurl={https://openreview.net/forum?id=FpnQMmnsE8Y}\n}", "github": "", "project": "", "reviewers": "cgCS;W9rN;HpYd;xUeP", "site": "https://openreview.net/forum?id=FpnQMmnsE8Y", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;5;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "50;44;117;94", "wc_summary_review": "38;34;53;111", "wc_main_review": "158;521;612;551", "wc_review": "246;599;782;756", "wc_reply_reviewers": "34;280;0;159", "wc_reply_authors": "810;1643;930;1252", "reply_reviewers": "1;2;0;1", "reply_authors": "2;3;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 76.25, 30.433328769623607 ], "wc_summary_review_avg": [ 59.0, 30.84639363037436 ], "wc_main_review_avg": [ 460.5, 177.69988745072408 ], "wc_review_avg": [ 595.75, 213.7198809189262 ], "wc_reply_reviewers_avg": [ 118.25, 110.57209186770412 ], "wc_reply_authors_avg": [ 1158.75, 322.93449413154985 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13826277910632091923&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Amazon;Meta;University of California, Berkeley;Massachusetts Institute of Technology;New York University", "aff_unique_dep": "Amazon.com, Inc.;Facebook AI Research;;;", "aff_unique_url": "https://www.amazon.com;https://research.facebook.com;https://www.berkeley.edu;https://web.mit.edu;https://www.nyu.edu", "aff_unique_abbr": "Amazon;FAIR;UC Berkeley;MIT;NYU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "FqKolXKrQGA", "title": 
"Learning to Infer the Structure of Network Games", "track": "main", "status": "Reject", "tldr": "", "abstract": "Strategic interactions between a group of individuals or organisations can be modelled as games played on networks, where a player's payoff depends not only on their actions but also on those of their neighbors. \nInferring the network structure from observed game outcomes (equilibrium actions) is an important problem with numerous potential applications in economics and social sciences. \nCurrently available methods require the knowledge of the utility function associated with the game, which is often unrealistic to obtain in real-world scenarios. To address this limitation, we propose a novel transformer-like architecture which correctly accounts for the symmetries of the problem and learns a mapping from the equilibrium actions to the network structure of the game without explicit knowledge of the utility function. We test our method on three different types of network games using both synthetic and real-world data, and demonstrate its effectiveness in network structure inference and superior performance over existing methods.", "keywords": "graphs;networks;game theory;graph neural networks", "primary_area": "", "supplementary_material": "/attachment/73fdca73ae7f06811c14fc5d7029a60817976cca.zip", "author": "Emanuele Rossi;Federico Monti;Yan Leng;Michael M. Bronstein;Xiaowen Dong", "authorids": "~Emanuele_Rossi1;~Federico_Monti2;~Yan_Leng1;~Michael_M._Bronstein1;~Xiaowen_Dong1", "gender": "M;M;;M;", "homepage": "https://www.emanuelerossi.co.uk/;https://www.ics.usi.ch/index.php/people-detail-page/268-federico-monti;http://web.mit.edu/yleng/www/;http://www.inf.usi.ch/bronstein/;https://web.media.mit.edu/~xdong/", "dblp": ";170/0002;;07/2668;91/9827-1", "google_scholar": "DHlkBOYAAAAJ;NUdNFucAAAAJ;WfU3qjQAAAAJ;UU3N6-UAAAAJ;_8tUq8kAAAAJ", "orcid": ";;;;", "linkedin": ";federico-monti;;mbronstein/;", "or_profile": "~Emanuele_Rossi1;~Federico_Monti2;~Yan_Leng1;~Michael_M._Bronstein1;~Xiaowen_Dong1", "aff": "Twitter;Twitter;University of Texas, Austin;Twitter;Massachusetts Institute of Technology", "aff_domain": "twitter.com;twitter.com;utexas.edu;twitter.com;mit.edu", "position": "Machine Learning Researcher;Researcher;Assistant Professor;Head of Graph ML;Research Affiliate", "bibtex": "@misc{\nrossi2022learning,\ntitle={Learning to Infer the Structure of Network Games},\nauthor={Emanuele Rossi and Federico Monti and Yan Leng and Michael M. 
Bronstein and Xiaowen Dong},\nyear={2022},\nurl={https://openreview.net/forum?id=FqKolXKrQGA}\n}", "github": "", "project": "", "reviewers": "awAV;bS2U;BhGi;XNyt;rnrt", "site": "https://openreview.net/forum?id=FqKolXKrQGA", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "3;3;3;3;2", "correctness": "4;2;4;4;4", "technical_novelty": "2;1;1;3;2", "empirical_novelty": "3;1;2;0;3", "wc_summary_paper": "84;55;75;120;182", "wc_summary_review": "87;17;52;37;16", "wc_main_review": "403;244;579;142;174", "wc_review": "574;316;706;299;372", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 2.8, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.8000000000000002 ], "technical_novelty_avg": [ 1.8, 0.7483314773547883 ], "empirical_novelty_avg": [ 1.8, 1.1661903789690602 ], "wc_summary_paper_avg": [ 103.2, 44.67392975774573 ], "wc_summary_review_avg": [ 41.8, 26.270896444544864 ], "wc_main_review_avg": [ 308.4, 162.52581333437467 ], "wc_review_avg": [ 453.4, 159.8093864577422 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5160468465421401, "corr_recommendation_correctness": 0.5897678246195885, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16150914369122606457&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Twitter, Inc.;University of Texas at Austin;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://twitter.com;https://www.utexas.edu;https://web.mit.edu", "aff_unique_abbr": "Twitter;UT Austin;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "FqMXxvHquTA", "title": "SegTime: Precise Time Series Segmentation without Sliding Window", "track": "main", "status": "Reject", "tldr": "", "abstract": "Time series are common in a wide range of domains and tasks such as stock market partitioning, sleep stage labelling, and human activity recognition, where segmentation, i.e. splitting time series into segments that correspond to given categories, is often required. A common approach to segmentation is to sub-sample the time series using a sliding window with a certain length and overlapping stride, to create sub-sequences of fixed length, and then classify these sub-sequences into the given categories. This reduces time series segmentation to classification. However, this approach guarantees to find only approximate breakpoints: the precise breakpoints can appear in sub-sequences, and thus the accuracy of segmentation degrades when labels change fast. Also, it ignores possible long-term dependencies between sub-sequences. We propose a neural networks approach SegTime that finds precise breakpoints, obviates sliding windows, handles long-term dependencies, and it is insensitive to the label changing frequency. SegTime does so, thanks to its bi-pass architecture with several structures that can process information in a multi-scale fashion. We extensively evaluated the effectiveness of SegTime with very promising results. 
", "keywords": "time series;time series segmentation;lstm;rnn;architecture;cnn;pyramid pooling;multi-scale pooling;sequence;encoder;decoder;resnet;step-wise", "primary_area": "", "supplementary_material": "/attachment/c391bcb86faee3d2776bc24fc6353c9662828197.zip", "author": "Li Zeng;Baifan Zhou;Mohammad Al-Rifai;Evgeny Kharlamov", "authorids": "li.zeng@volkswagen.de;~Baifan_Zhou1;mohammad.al-rifai@volkswagen.de;~Evgeny_Kharlamov1", "gender": ";;;M", "homepage": ";https://www.mn.uio.no/ifi/english/people/aca/baifanz/;;https://www.mn.uio.no/ifi/english/people/aca/evgenykh/", "dblp": ";276/5109;;20/4833", "google_scholar": ";zaxtQDEAAAAJ;;https://scholar.google.de/citations?user=-slpMF8AAAAJ", "orcid": ";0000-0003-3698-0541;;0000-0003-3247-4166", "linkedin": ";;;", "or_profile": "li.zeng@volkswagen.de;~Baifan_Zhou1;mohammad.al-rifai@volkswagen.de;~Evgeny_Kharlamov1", "aff": ";University of Oslo, Norway;;Robert Bosch GmbH, Bosch", "aff_domain": ";uio.no;;de.bosch.com", "position": ";Postdoc;;Bosch Center for Artificial Intelligence", "bibtex": "@misc{\nzeng2022segtime,\ntitle={SegTime: Precise Time Series Segmentation without Sliding Window},\nauthor={Li Zeng and Baifan Zhou and Mohammad Al-Rifai and Evgeny Kharlamov},\nyear={2022},\nurl={https://openreview.net/forum?id=FqMXxvHquTA}\n}", "github": "", "project": "", "reviewers": "SjGH;gLxG;DKC1;1Qsp", "site": "https://openreview.net/forum?id=FqMXxvHquTA", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;4;3;3", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "122;128;125;100", "wc_summary_review": "58;42;75;70", "wc_main_review": "473;292;377;418", "wc_review": "653;462;577;588", "wc_reply_reviewers": "495;26;0;48", "wc_reply_authors": "1220;336;231;241", "reply_reviewers": "1;1;0;1", "reply_authors": "3;2;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 118.75, 11.031205736455105 ], "wc_summary_review_avg": [ 61.25, 12.71563997602952 ], "wc_main_review_avg": [ 390.0, 66.04165352260647 ], "wc_review_avg": [ 570.0, 68.78589971789276 ], "wc_reply_reviewers_avg": [ 142.25, 204.36777510165345 ], "wc_reply_authors_avg": [ 507.0, 413.6852668394174 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O0VbpsQu6CgJ:scholar.google.com/&scioq=SegTime:+Precise+Time+Series+Segmentation+without+Sliding+Window&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Oslo;Robert Bosch GmbH", "aff_unique_dep": ";", "aff_unique_url": "https://www.uio.no;https://www.bosch.com", "aff_unique_abbr": "UiO;Bosch", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Norway;Germany" }, { "id": "FqRHeQTDU5N", "title": "Learning to Give Checkable Answers with Prover-Verifier Games", "track": "main", "status": "Reject", "tldr": "", "abstract": "Our ability to know when to trust the decisions made by machine learning systems has not kept up with the staggering improvements in their 
performance, limiting their applicability in high-stakes applications. We propose Prover-Verifier Games (PVGs), a game-theoretic framework to encourage neural networks to solve decision problems in a verifiable manner. The PVG consists of two learners with competing objectives: a trusted verifier network tries to choose the correct answer, and a more powerful but untrusted prover network attempts to persuade the verifier of a particular answer, regardless of its correctness. The goal is for a reliable justification protocol to emerge from this game. We analyze several variants of the basic framework, including both simultaneous and sequential games, and narrow the space down to a subset of games which provably have the desired equilibria. We then develop practical instantiations of the PVG for several algorithmic tasks, and show that in practice, the verifier is able to receive useful and reliable information from an untrusted prover. Importantly, the protocol still works even when the verifier is frozen and the prover's message is directly optimized to convince the verifier.", "keywords": "AI Safety;verifiable learning;robustness;adversarial learning;proof systems", "primary_area": "", "supplementary_material": "/attachment/104c6683ea32366baae43ec5bd79e22223c1fa10.zip", "author": "Cem Anil;Guodong Zhang;Yuhuai Wu;Roger Baker Grosse", "authorids": "~Cem_Anil1;~Guodong_Zhang1;~Yuhuai_Wu1;~Roger_Baker_Grosse1", "gender": "M;M;M;M", "homepage": "https://www.cs.toronto.edu/~anilcem/;http://www.cs.toronto.edu/~gdzhang/;http://www.cs.toronto.edu/~ywu/;http://www.cs.toronto.edu/~rgrosse/", "dblp": "218/6350;28/4937;;26/7058", "google_scholar": "1VDV6ZEAAAAJ;B_TZBtwAAAAJ;https://scholar.google.ca/citations?user=bOQGfFIAAAAJ;xgQd1qgAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Cem_Anil1;~Guodong_Zhang1;~Yuhuai_Wu1;~Roger_Baker_Grosse1", "aff": "Toronto University;Department of Computer Science, University of Toronto;Stanford University;Department of Computer Science, University of Toronto", "aff_domain": "utoronto.ca;cs.toronto.edu;stanford.edu;cs.toronto.edu", "position": "PhD student;PhD student;Postdoc;Assistant Professor", "bibtex": "@misc{\nanil2022learning,\ntitle={Learning to Give Checkable Answers with Prover-Verifier Games},\nauthor={Cem Anil and Guodong Zhang and Yuhuai Wu and Roger Baker Grosse},\nyear={2022},\nurl={https://openreview.net/forum?id=FqRHeQTDU5N}\n}", "github": "", "project": "", "reviewers": "wJaU;cqwN;uyg3;RcRe", "site": "https://openreview.net/forum?id=FqRHeQTDU5N", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;3;1;3", "correctness": "4;4;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "132;104;83;113", "wc_summary_review": "47;41;60;45", "wc_main_review": "224;469;198;185", "wc_review": "403;614;341;343", "wc_reply_reviewers": "0;240;0;0", "wc_reply_authors": "559;1229;278;317", "reply_reviewers": "0;1;0;0", "reply_authors": "2;3;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 2.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.0, 17.621010186706094 ], "wc_summary_review_avg": [ 48.25, 7.119515432949071 ], "wc_main_review_avg": [ 269.0, 116.3206774395679 ], "wc_review_avg": [ 425.25, 111.78634755639885 ], "wc_reply_reviewers_avg": [ 60.0, 103.92304845413264 ], "wc_reply_authors_avg": [ 595.75, 381.1242415538534 ], 
"reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4584645678923057630&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Toronto;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://www.stanford.edu", "aff_unique_abbr": "U of T;Stanford", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Toronto;Stanford", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Canada;United States" }, { "id": "FrJFF4YxWm", "title": "Learning Rational Skills for Planning from Demonstrations and Instructions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a framework for learning compositional, rational skill models (RatSkills) that support efficient planning and inverse planning for achieving novel goals and recognizing activities. In contrast to directly learning a set of policies that maps states to actions, in RatSkills, we represent each skill as a subgoal and can be executed based on a planning subroutine. RatSkills can be learned by observing expert demonstrations and reading abstract language descriptions of thecorresponding task (e.g.,collect wood then craft a boat then go across the river).The learned subgoal-based representation enables inference of another agent\u2019s intended task from their actions via Bayesian inverse planning. It also supports planning for novel objectives given in the form of either temporal task descriptions or black-box goal tests. We demonstrate through experiments in both discrete and continuous domains that our learning algorithms recover a set of RatSkills by observing and explaining other agents\u2019 movements, and plan efficiently for novel goals by composing learned skills.", "keywords": "Learning for Planning;Compositional Generalization", "primary_area": "", "supplementary_material": "", "author": "Zhezheng Luo;Jiayuan Mao;Jiajun Wu;Tomas Perez;Joshua B. 
Tenenbaum;Leslie Pack Kaelbling", "authorids": "~Zhezheng_Luo1;~Jiayuan_Mao1;~Jiajun_Wu1;~Tomas_Perez1;~Joshua_B._Tenenbaum1;~Leslie_Pack_Kaelbling1", "gender": "M;F;M;;F;M", "homepage": "https://www.csail.mit.edu/person/zhezheng-luo;http://jiayuanm.com;https://jiajunwu.com;;http://people.csail.mit.edu/lpk/;http://people.csail.mit.edu/tlp/", "dblp": "https://dblp.uni-trier.de/pid/258/3621.html;200/8283;117/4768;t/JoshuaBTenenbaum;k/LesliePackKaelbling;90/752", "google_scholar": ";-xaOIZIAAAAJ;2efgcS0AAAAJ;;IcasIiwAAAAJ;gQOKAggAAAAJ", "orcid": ";0000-0003-4798-3748;0000-0002-4176-343X;;0000-0001-6054-7145;", "linkedin": ";;jiajunwu/;;;", "or_profile": "~Zhezheng_Luo1;~Jiayuan_Mao1;~Jiajun_Wu1;~Joshua_B._Tenenbaum1;~Leslie_Pack_Kaelbling1;~Tom\u00e1s_Lozano-P\u00e9rez1", "aff": ";Massachusetts Institute of Technology;Stanford University;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": ";mit.edu;stanford.edu;mit.edu;mit.edu;mit.edu", "position": ";PhD student;Assistant Professor;Professor;Full Professor;Full Professor", "bibtex": "@misc{\nluo2022learning,\ntitle={Learning Rational Skills for Planning from Demonstrations and Instructions},\nauthor={Zhezheng Luo and Jiayuan Mao and Jiajun Wu and Tomas Perez and Joshua B. Tenenbaum and Leslie Pack Kaelbling},\nyear={2022},\nurl={https://openreview.net/forum?id=FrJFF4YxWm}\n}", "github": "", "project": "", "reviewers": "jHeZ;sPaL;B9ia;Ugk6", "site": "https://openreview.net/forum?id=FrJFF4YxWm", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;2;2", "correctness": "3;2;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "102;205;107;66", "wc_summary_review": "74;90;72;29", "wc_main_review": "1269;1455;1145;400", "wc_review": "1445;1750;1324;495", "wc_reply_reviewers": "0;466;567;0", "wc_reply_authors": "2136;2449;1397;266", "reply_reviewers": "0;1;1;0", "reply_authors": "3;4;3;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 120.0, 51.56064390598706 ], "wc_summary_review_avg": [ 66.25, 22.609455986378798 ], "wc_main_review_avg": [ 1067.25, 400.7245781082064 ], "wc_review_avg": [ 1253.5, 464.6173156480503 ], "wc_reply_reviewers_avg": [ 258.25, 260.7070913880173 ], "wc_reply_authors_avg": [ 1562.0, 840.1020771311067 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5WCRg85qzy8J:scholar.google.com/&scioq=Learning+Rational+Skills+for+Planning+from+Demonstrations+and+Instructions&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.stanford.edu", "aff_unique_abbr": "MIT;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "FuLL40HLCRn", "title": "ST-DDPM: Explore Class Clustering for Conditional Diffusion Probabilistic Models", "track": "main", "status": "Reject", "tldr": "", 
"abstract": "Score-based generative models involve sequentially corrupting the data distribution with noise and then learns to recover the data distribution based on score matching. In this paper, for the diffusion probabilistic models, we first delve into the changes of data distribution during the forward process of the Markov chain and explore the class clustering phenomenon. Inspired by the class clustering phenomenon, we devise a novel conditional diffusion probabilistic model by explicitly modeling the class center in the forward and reverse process, and make an elegant modification to the original formulation, which enables controllable generation and gets interpretability. We also provide another direction for faster sampling and more analysis of our method. To verify the effectiveness of the formulated framework, we conduct extensive experiments on multiple tasks, and achieve competitive results compared with the state-of-the-art methods(conditional image generation on CIFAR-10 with an inception score of 9.58 and FID score of 3.05).", "keywords": "conditional generation;diffusion models;decoupling;interpretability", "primary_area": "", "supplementary_material": "", "author": "Zhijie Lin;Zijian Zhang;Zhou Zhao", "authorids": "~Zhijie_Lin1;~Zijian_Zhang3;~Zhou_Zhao2", "gender": "M;M;M", "homepage": ";https://ckczzj.com;https://dblp.uni-trier.de/pid/75/7785.html?", "dblp": ";43/6524-2;75/7785", "google_scholar": "xXMj6_EAAAAJ;TZ0nnhgAAAAJ;https://scholar.google.com.hk/citations?user=IIoFY90AAAAJ", "orcid": "0000-0003-3461-8952;0000-0001-8308-768X;0000-0001-6121-0384", "linkedin": ";;", "or_profile": "~Zhijie_Lin1;~Zijian_Zhang3;~Zhou_Zhao2", "aff": "Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "MS student;PhD student;Associate Professor", "bibtex": "@misc{\nlin2022stddpm,\ntitle={{ST}-{DDPM}: Explore Class Clustering for Conditional Diffusion Probabilistic Models},\nauthor={Zhijie Lin and Zijian Zhang and Zhou Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=FuLL40HLCRn}\n}", "github": "", "project": "", "reviewers": "HJtB;FjT6;Xxqm;49RM", "site": "https://openreview.net/forum?id=FuLL40HLCRn", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;3;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "40;166;93;70", "wc_summary_review": "14;30;7;26", "wc_main_review": "363;286;209;319", "wc_review": "417;482;309;415", "wc_reply_reviewers": "389;71;89;0", "wc_reply_authors": "1560;875;399;313", "reply_reviewers": "4;1;2;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.25, 46.54231945230061 ], "wc_summary_review_avg": [ 19.25, 9.202581159652981 ], "wc_main_review_avg": [ 294.25, 56.29109609876148 ], "wc_review_avg": [ 405.75, 62.02166960022924 ], "wc_reply_reviewers_avg": [ 137.25, 149.10797262386743 ], "wc_reply_authors_avg": [ 786.75, 495.0991693590285 ], "reply_reviewers_avg": [ 1.75, 1.479019945774904 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:zD_D80ddwOwJ:scholar.google.com/&scioq=ST-DDPM:+Explore+Class+Clustering+for+Conditional+Diffusion+Probabilistic+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "FvfV64rovnY", "title": "Explaining Scaling Laws of Neural Network Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "The test loss of well-trained neural networks often follows precise power-law scaling relations with either the size of the training dataset or the number of parameters in the network. We propose a theory that explains and connects these scaling laws. We identify variance-limited and resolution-limited scaling behavior for both dataset and model size, for a total of four scaling regimes. The variance-limited scaling follows simply from the existence of a well-behaved infinite data or infinite width limit, while the resolution-limited regime can be explained by positing that models are effectively resolving a smooth data manifold. In the large width limit, this can be equivalently obtained from the spectrum of certain kernels, and we present evidence that large width and large dataset resolution-limited scaling exponents are related by a duality. We exhibit all four scaling regimes in the controlled setting of large random feature and pretrained models and test the predictions empirically on a range of standard architectures and datasets. We also observe several empirical relationships between datasets and scaling exponents: super-classing image tasks does not change exponents, while changing input distribution (via changing datasets or adding noise) has a strong effect. 
We further explore the effect of architecture aspect ratio on scaling exponents.", "keywords": "scaling laws;neural networks;generalization;overparameterized models;underparameterized models", "primary_area": "", "supplementary_material": "", "author": "Yasaman Bahri;Ethan Dyer;Jared Kaplan;Jaehoon Lee;Utkarsh Sharma", "authorids": "~Yasaman_Bahri1;~Ethan_Dyer1;~Jared_Kaplan1;~Jaehoon_Lee2;usharma7@jhu.edu", "gender": "F;M;;;", "homepage": "https://yasamanb.github.io/;;https://sites.krieger.jhu.edu/jared-kaplan/;https://jaehlee.github.io;", "dblp": ";;;95/386-1.html;", "google_scholar": "p2_vHmAAAAAJ;;KNr3vb4AAAAJ;d3YhiooAAAAJ;", "orcid": ";;;;", "linkedin": "yasamanbahri;;;eejaehoon/;", "or_profile": "~Yasaman_Bahri1;~Ethan_Dyer1;~Jared_Kaplan1;~Jaehoon_Lee2;usharma7@jhu.edu", "aff": "Google Brain;Google;Johns Hopkins University;Google;", "aff_domain": "google.com;google.com;jhu.edu;google.com;", "position": "Research Scientist;Staff;Associate Professor;Research Scientist;", "bibtex": "@misc{\nbahri2022explaining,\ntitle={Explaining Scaling Laws of Neural Network Generalization},\nauthor={Yasaman Bahri and Ethan Dyer and Jared Kaplan and Jaehoon Lee and Utkarsh Sharma},\nyear={2022},\nurl={https://openreview.net/forum?id=FvfV64rovnY}\n}", "github": "", "project": "", "reviewers": "TzFb;f3X7;78Lv", "site": "https://openreview.net/forum?id=FvfV64rovnY", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;3;2", "correctness": "2;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "64;66;121", "wc_summary_review": "104;39;58", "wc_main_review": "597;574;307", "wc_review": "765;679;486", "wc_reply_reviewers": "287;418;0", "wc_reply_authors": "1160;1855;649", "reply_reviewers": "1;1;0", "reply_authors": "2;3;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 83.66666666666667, 26.411277052720408 ], "wc_summary_review_avg": [ 67.0, 27.28858125052797 ], "wc_main_review_avg": [ 492.6666666666667, 131.62151124425756 ], "wc_review_avg": [ 643.3333333333334, 116.65999980951293 ], "wc_reply_reviewers_avg": [ 235.0, 174.56421931961506 ], "wc_reply_authors_avg": [ 1221.3333333333333, 494.2538708891301 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 0.5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16500022929134675704&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Google;Johns Hopkins University", "aff_unique_dep": "Google Brain;", "aff_unique_url": "https://brain.google.com;https://www.jhu.edu", "aff_unique_abbr": "Google Brain;JHU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "FxBdFwFjXX", "title": "Multi-Task Distribution Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-Task Learning describes training on multiple tasks simultaneously to leverage the shared information between tasks. 
Tasks are typically defined as alternative ways to label data. Given an image of a face, a model could either classify the presence of sunglasses, or the presence of facial hair. This example highlights how the same input image can be posed as two separate binary classification problems. We present Multi-Task Distribution Learning, highlighting the similarities between Multi-Task Learning and preparing for Distribution Shift. Even with rapid advances in large-scale models, a Multi-Task Learner that is trained with object detection will outperform zero-shot inference on object detection. Similarly, we show how training with a data distribution aids with performance on that data distribution. We begin our experiments with a pairing of distribution tasks. We then show that this scales to optimizing 10 distribution tasks simultaneously. We further perform a task grouping analysis to see which augmentations train well together and which do not. Multi-Task Distribution Learning highlights the similarities between Distribution Shift and Zero-Shot task inference. These experiments will continue to improve with advances in generative modeling that enables simulating more interesting distribution shifts outside of standard augmentations. In addition, we discuss how the WILDS benchmark of Domain Generalizations and Subpopulation Shifts will aid in future work. Utilizing the prior knowledge of data augmentation and understanding multi-task interference is a promising direction to understand the phenomenon of Distribution Shift. To facilitate reproduction, we are open-sourcing code, leaderboards, and experimental data upon publication.", "keywords": "Data Augmentation;Distribution Shift;Multi-Task Learning", "primary_area": "", "supplementary_material": "", "author": "Connor Shorten", "authorids": "~Connor_Shorten1", "gender": "", "homepage": "https://www.youtube.com/channel/UCHB9VepY6kYvZjj0Bgxnpbw", "dblp": "", "google_scholar": "YKuiy20AAAAJ", "orcid": "", "linkedin": "connor-shorten-34923a178/", "or_profile": "~Connor_Shorten1", "aff": "Florida Atlantic University", "aff_domain": "fau.edu", "position": "PhD student", "bibtex": "@misc{\nshorten2022multitask,\ntitle={Multi-Task Distribution Learning},\nauthor={Connor Shorten},\nyear={2022},\nurl={https://openreview.net/forum?id=FxBdFwFjXX}\n}", "github": "", "project": "", "reviewers": "w1qQ;F3K8;PK1M", "site": "https://openreview.net/forum?id=FxBdFwFjXX", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "4;4;4", "correctness": "2;3;2", "technical_novelty": "1;1;3", "empirical_novelty": "0;1;3", "wc_summary_paper": "22;16;64", "wc_summary_review": "10;9;54", "wc_main_review": "79;18;775", "wc_review": "111;43;893", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "54;55;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 34.0, 21.354156504062622 ], "wc_summary_review_avg": [ 24.333333333333332, 20.98147330914162 ], "wc_main_review_avg": [ 290.6666666666667, 343.37960853195045 ], "wc_review_avg": [ 349.0, 385.6665226159339 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 36.333333333333336, 25.69478978746902 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 
0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0", "aff_unique_norm": "Florida Atlantic University", "aff_unique_dep": "", "aff_unique_url": "https://www.fau.edu", "aff_unique_abbr": "FAU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "The Evolution of Uncertainty of Learning in Games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6694", "id": "Fza94Y8VS4a", "poster": "", "openreview": "https://openreview.net/forum?id=Fza94Y8VS4a", "slides": "https://iclr.cc/virtual/2022/poster/6694", "video": "https://iclr.cc/virtual/2022/poster/6694", "author_site": "Yun Kuen Cheung, Georgios Piliouras, Yixin Tao", "tldr": "", "abstract": "Learning in games has become an object of intense interest for ML due to its connections to numerous AI architectures. We study standard online learning in games but from a non-standard perspective. Instead of studying the behavior of a single initial condition and whether it converges to equilibrium or not, we study the behavior of a probability distribution/measure over a set of initial conditions. This initial uncertainty is well-motivated both from a standard game-theoretic perspective (e.g. a modeler's uncertainty about the agents' initial beliefs) and from an ML one (e.g. noisy measurements, system initialization from a dataset distribution). Despite this, little is formally known about whether and under what conditions uncertainty is amplified or reduced in these systems. We use the popular measure of differential entropy to quantify the evolution of uncertainty. We find that such analysis shares an intimate relationship with volume analysis, a technique which was recently used to demonstrate the occurrence of Lyapunov chaos when using Multiplicative Weights Update (MWU) or Follow-the-Regularized-Leader (FTRL) algorithms in zero-sum games. This allows us to show that the differential entropy of these learning-in-game systems increases linearly with time, formalizing their increased unpredictability over time. 
We showcase the power of the framework by applying it in the study of multiple related systems, including different standard online optimization algorithms in numerous games and dynamics of evolutionary game theory.", "keywords": "learning in games;differential entropy", "primary_area": "", "supplementary_material": "", "author": "Yun Kuen Cheung;Georgios Piliouras;Yixin Tao", "authorids": "~Yun_Kuen_Cheung1;~Georgios_Piliouras1;~Yixin_Tao1", "gender": "M;;M", "homepage": "http://comp-math-econ.academy/;;https://tomtao26.github.io/", "dblp": "https://dblp.org/pers/hd/c/Cheung:Yun_Kuen;62/1236;133/3849", "google_scholar": "7rlVH7gAAAAJ;;YQQ_K8YAAAAJ", "orcid": "0000-0002-9280-0149;;", "linkedin": ";;", "or_profile": "~Yun_Kuen_Cheung1;~Georgios_Piliouras1;~Yixin_Tao1", "aff": "Royal Holloway, University of London;Singapore University of Technology and Design;London School of Economics", "aff_domain": "rhul.ac.uk;sutd.edu.sg;lse.ac.uk", "position": "Assistant Professor;Associate Professor;Postdoc", "bibtex": "@inproceedings{\ncheung2022the,\ntitle={The Evolution of Uncertainty of Learning in Games},\nauthor={Yun Kuen Cheung and Georgios Piliouras and Yixin Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Fza94Y8VS4a}\n}", "github": "", "project": "", "reviewers": "8gHb;2Gu6;Xwzj;8PmF", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;3;2;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;4;2", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "47;91;70;91", "wc_summary_review": "50;38;81;94", "wc_main_review": "82;129;151;296", "wc_review": "179;258;302;481", "wc_reply_reviewers": "0;0;0;23", "wc_reply_authors": "632;280;308;810", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 74.75, 18.171062159378575 ], "wc_summary_review_avg": [ 65.75, 22.63155982251334 ], "wc_main_review_avg": [ 164.5, 79.90775932285925 ], "wc_review_avg": [ 305.0, 110.75874683292511 ], "wc_reply_reviewers_avg": [ 5.75, 9.959292143521045 ], "wc_reply_authors_avg": [ 507.5, 222.8020421809459 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15379188003762856337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=Fza94Y8VS4a", "email": "rhul.ac.uk;sutd.edu.sg;lse.ac.uk", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of London;Singapore University of Technology and Design;London School of Economics", "aff_unique_dep": ";;", "aff_unique_url": "https://www.royalholloway.ac.uk;https://www.sutd.edu.sg;https://www.lse.ac.uk", "aff_unique_abbr": "RHUL;SUTD;LSE", "aff_campus_unique_index": "0", "aff_campus_unique": "Royal Holloway;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;Singapore" }, { "id": "G-7GlfTneYg", "title": "VoiceFixer: Toward General Speech Restoration with Neural Vocoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "Speech restoration aims to remove distortions in speech 
signals. Prior methods mainly focus on single-task speech restoration (SSR), such as speech denoising or speech declipping. However, SSR systems only focus on one task and do not address the general speech restoration problem. In addition, previous SSR systems show limited performance in some speech restoration tasks such as speech super-resolution. To overcome those limitations, we propose a general speech restoration (GSR) task that attempts to remove multiple distortions simultaneously. Furthermore, we propose VoiceFixer, a generative framework to address the GSR task. VoiceFixer consists of an analysis stage and a synthesis stage to mimic the speech analysis and comprehension of the human auditory system. We employ a ResUNet to model the analysis stage and a neural vocoder to model the synthesis stage. We evaluate VoiceFixer with additive noise, room reverberation, low-resolution, and clipping distortions. Our baseline GSR model achieves a 0.499 higher mean opinion score (MOS) than the speech denoising SSR model. VoiceFixer further surpasses the GSR baseline model on the MOS score by 0.256. Moreover, we observe that VoiceFixer generalizes well to severely degraded real speech recordings, indicating its potential in restoring old movies and historical speeches.", "keywords": "Speech Restoration;Neural Vocoder;Speech Denoising;Speech Declipping;Speech Dereverberation;Speech Super-resolution", "primary_area": "", "supplementary_material": "/attachment/81deb27891403246f1e8c165d4ab8c33041db84e.zip", "author": "Haohe Liu;Qiuqiang Kong;Qiao Tian;Yan Zhao;DeLiang Wang;Chuanzeng Huang;Yuxuan Wang", "authorids": "~Haohe_Liu1;~Qiuqiang_Kong1;~Qiao_Tian1;~Yan_Zhao6;~DeLiang_Wang1;~Chuanzeng_Huang1;~Yuxuan_Wang1", "gender": "M;M;M;M;;M;M", "homepage": "https://qiuqiangkong.github.io/;https://scholar.google.com/citations?user=PMH1tnEAAAAJ&hl=en;https://cliffzhao.github.io/;http://web.cse.ohio-state.edu/~dwang/;;;https://haoheliu.github.io/", "dblp": ";206/9465-1.html;;31/6085;;;272/5570", "google_scholar": ";PMH1tnEAAAAJ;cHzLMDEAAAAJ;https://scholar.google.com.tw/citations?user=yO59sggAAAAJ;;3RaOfJkAAAAJ;g3O4lJMAAAAJ", "orcid": ";;0000-0001-8595-3297;;;;0000-0003-1036-7888", "linkedin": ";;yan-zhao-8785a356/;;https://www.linkedin.com/mwlite/in/\u4f20\u589e-chuanzeng-\u9ec4-huang-84947842;;haohe-liu-4483a71a4/", "or_profile": "~Qiuqiang_Kong1;~Qiao_Tian1;~Yan_Zhao6;~DeLiang_Wang1;~Chuanzeng_Huang1;~Yuxuan_Wang1;~Haohe_Liu2", "aff": "ByteDance;ByteDance;ByteDance;Ohio State University;;ByteDance;Microsoft Research", "aff_domain": "bytedance.com;bytedance.com;bytedance.com;;;bytedance.com;research.microsoft.com", "position": "Researcher;Researcher;Research scientist;Full Professor;;Researcher;Intern", "bibtex": "@misc{\nliu2022voicefixer,\ntitle={VoiceFixer: Toward General Speech Restoration with Neural Vocoder},\nauthor={Haohe Liu and Qiuqiang Kong and Qiao Tian and Yan Zhao and DeLiang Wang and Chuanzeng Huang and Yuxuan Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=G-7GlfTneYg}\n}", "github": "", "project": "", "reviewers": "AG4n;Hsxt;Q5gX;UVJ3", "site": "https://openreview.net/forum?id=G-7GlfTneYg", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;4;4", "correctness": "2;3;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "91;84;113;143", "wc_summary_review": "86;64;145;86", "wc_main_review": "1153;666;993;217", "wc_review": "1330;814;1251;446", "wc_reply_reviewers": "16;40;348;0", "wc_reply_authors": "329;684;415;48", 
"reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.75, 22.993205518152532 ], "wc_summary_review_avg": [ 95.25, 30.094642380330754 ], "wc_main_review_avg": [ 757.25, 357.9080712976448 ], "wc_review_avg": [ 960.25, 356.05222580402443 ], "wc_reply_reviewers_avg": [ 101.0, 143.3143398268296 ], "wc_reply_authors_avg": [ 369.0, 226.9261994570041 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.19245008972987526, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=808547138237064037&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0;2", "aff_unique_norm": "ByteDance;Ohio State University;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.bytedance.com;https://www.osu.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "ByteDance;OSU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "G0CuTynjgQa", "title": "Generalization of GANs and overparameterized models under Lipschitz continuity", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative adversarial networks (GANs) are really complex, and little has been known about their generalization. The existing learning theories lack efficient tools to analyze generalization of GANs. To fill this gap, we introduce a novel tool to analyze generalization: Lipschitz continuity. We demonstrate its simplicity by showing generalization and consistency of overparameterized neural networks. We then use this tool to derive Lipschitz-based generalization bounds for GANs. In particular, our bounds show that penalizing the zero- and first-order informations of the GAN loss will improve generalization. Therefore, this work provides a unified theory for answering the long mystery of why imposing a Lipschitz constraint can help GANs to generalize well in practice. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Khoat Than;Nghia Vu", "authorids": "~Khoat_Than1;vutrungnghiahust99@gmail.com", "gender": "M;", "homepage": "https://users.soict.hust.edu.vn/khoattq/;", "dblp": "118/4726;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Khoat_Than1;vutrungnghiahust99@gmail.com", "aff": "VinAI Research;", "aff_domain": "vinai.io;", "position": "Scientist;", "bibtex": "@misc{\nthan2022generalization,\ntitle={Generalization of {GAN}s and overparameterized models under Lipschitz continuity},\nauthor={Khoat Than and Nghia Vu},\nyear={2022},\nurl={https://openreview.net/forum?id=G0CuTynjgQa}\n}", "github": "", "project": "", "reviewers": "wKt9;wjLb;yK3Y;NvVe", "site": "https://openreview.net/forum?id=G0CuTynjgQa", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "75;89;57;76", "wc_summary_review": "90;185;9;16", "wc_main_review": "337;1134;434;215", "wc_review": "502;1408;500;307", "wc_reply_reviewers": "317;183;0;0", "wc_reply_authors": "1331;2687;917;251", "reply_reviewers": "1;1;0;0", "reply_authors": "3;5;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 74.25, 11.388041973930374 ], "wc_summary_review_avg": [ 75.0, 70.9964787859229 ], "wc_main_review_avg": [ 530.0, 357.24851294302124 ], "wc_review_avg": [ 679.25, 428.1339597602601 ], "wc_reply_reviewers_avg": [ 125.0, 133.67684915496775 ], "wc_reply_authors_avg": [ 1296.5, 890.4733291907175 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8320502943378437, "corr_recommendation_correctness": 0.8320502943378437, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4672511382713782112&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "VinAI Research", "aff_unique_dep": "", "aff_unique_url": "https://www.vinai.io/", "aff_unique_abbr": "VinAI", "aff_country_unique_index": "0", "aff_country_unique": "Vietnam" }, { "id": "G1J5OYjoiWb", "title": "An Attempt to Model Human Trust with Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing works to compute trust as a numerical value mainly rely on ranking, rating or assessments of agents by other agents. However, the concept of trust is manifold, and should not be limited to reputation. Recent research in neuroscience converges with Berg's hypothesis in economics that trust is an encoded function in the human brain. Based on this new assumption, we propose an approach where a trust level is learned by an overlay of any model-free off-policy reinforcement learning algorithm. The main issues were i) to use recent findings on dopaminergic system and reward circuit to simulate trust, ii) to assess our model with reliable and unbiased real life models. In this work, we address these problems by extending Q-Learning to trust evaluation, and comparing our results to a social science case study. Our main contributions are threefold. (1) We model the trust-decision making process with a reinforcement learning algorithm. 
(2) We propose a dynamic reinforcement of the trust reward inspired by recent findings in neuroscience. (3) We propose a method to explore and exploit the trust space. The experiments reveal that it is possible to find a set of hyperparameters of our algorithm to reproduce recent findings on the overconfidence effect in social psychology research.", "keywords": "Trust;Confidence;Q-learning;Reward Circuit", "primary_area": "", "supplementary_material": "", "author": "Vincent Frey;Simon B\u00e9cot", "authorids": "~Vincent_Frey1;~Simon_B\u00e9cot1", "gender": "M;M", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "vincent-frey-a696231/;simon-b%C3%A9cot/", "or_profile": "~Vincent_Frey1;~Simon_B\u00e9cot1", "aff": "Orange-labs;Orange-labs", "aff_domain": "orange.com;orange.com", "position": "Researcher;Researcher", "bibtex": "@misc{\nfrey2022an,\ntitle={An Attempt to Model Human Trust with Reinforcement Learning},\nauthor={Vincent Frey and Simon B{\\'e}cot},\nyear={2022},\nurl={https://openreview.net/forum?id=G1J5OYjoiWb}\n}", "github": "", "project": "", "reviewers": "owda;fiz1;GxNj;dEMd", "site": "https://openreview.net/forum?id=G1J5OYjoiWb", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;3;3", "correctness": "1;2;2;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "1;2;1;3", "wc_summary_paper": "85;111;31;56", "wc_summary_review": "63;65;111;33", "wc_main_review": "493;568;472;379", "wc_review": "641;744;614;468", "wc_reply_reviewers": "0;81;0;0", "wc_reply_authors": "1991;915;1360;1343", "reply_reviewers": "0;1;0;0", "reply_authors": "3;2;2;3", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 70.75, 30.086334107032716 ], "wc_summary_review_avg": [ 68.0, 27.874719729532707 ], "wc_main_review_avg": [ 478.0, 67.3832323356486 ], "wc_review_avg": [ 616.75, 98.63410921177318 ], "wc_reply_reviewers_avg": [ 20.25, 35.074028853269766 ], "wc_reply_authors_avg": [ 1402.25, 383.84070589243134 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:61ynQs13mCYJ:scholar.google.com/&scioq=An+Attempt+to+Model+Human+Trust+with+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Orange Labs", "aff_unique_dep": "", "aff_unique_url": "https://www.orange.com/en/innovation/orange-labs", "aff_unique_abbr": "Orange Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "G33_uTwQiL", "title": "Equivariant Vector Field Network for Many-body System Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modeling many-body systems has been a long-standing challenge in science, from classical and quantum physics to computational biology. Equivariance is a critical physical symmetry for many-body dynamic systems, which enables robust and accurate prediction under arbitrary reference transformations. 
In light of this, great effort has been put into encoding this symmetry into deep neural networks, which significantly boosts the prediction performance on downstream tasks. Some general equivariant models that are computationally efficient have been proposed; however, these models have no guarantee on their approximation power and may suffer information loss. In this paper, we leverage insights from the scalarization technique in differential geometry to model many-body systems by learning the gradient vector fields, which are SE(3) and permutation equivariant. Specifically, we propose the Equivariant Vector Field Network (EVFN), which is built on a novel tuple of equivariant basis and the associated scalarization and vectorization layers. Since our tuple equivariant basis forms a complete basis, learning the dynamics with our EVFN has no information loss. We evaluate our method on predicting trajectories of simulated Newton mechanics systems with both full and partially observed data, as well as the equilibrium state of small molecules (molecular conformation) evolving as a statistical mechanics system. Experimental results across multiple tasks demonstrate that our model achieves the best or competitive performance compared with baseline models across various types of datasets.", "keywords": "equivariant neural network;gradient fields;many-body system;molecular conformation generation", "primary_area": "", "supplementary_material": "", "author": "weitao Du;He Zhang;Yuanqi Du;Qi Meng;Wei Chen;Bin Shao;Tie-Yan Liu", "authorids": "~weitao_Du1;~He_Zhang1;~Yuanqi_Du1;~Qi_Meng1;~Wei_Chen1;~Bin_Shao1;~Tie-Yan_Liu1", "gender": "M;M;M;F;F;;M", "homepage": ";;https://yuanqidu.github.io/;;https://weichen-cas.github.io/;https://www.binshao.info/;http://member.acm.org/~tieyanliu", "dblp": "17/10015;24/2058;266/2837;;;;l/TieYanLiu", "google_scholar": ";https://scholar.google.com/citations?hl=en;fAc_zZMAAAAJ;t-z3K34AAAAJ;https://scholar.google.com/citations?hl=en;h9L4CgIAAAAJ;Nh832fgAAAAJ", "orcid": ";0000-0003-4294-5697;;;;;0000-0002-0476-8020", "linkedin": ";%E8%B4%BA-%E5%BC%A0-8a592a16b/;;;;;", "or_profile": "~weitao_Du1;~He_Zhang1;~Yuanqi_Du1;~Qi_Meng1;~Wei_Chen1;~Bin_Shao1;~Tie-Yan_Liu1", "aff": "Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Xi'an Jiaotong University;University of Amsterdam;Microsoft; Chinese Academy of Sciences;Microsoft;Microsoft", "aff_domain": "amss.ac.cn;xjtu.edu;uva.nl;microsoft.com;ict.ac.cn;microsoft.com;microsoft.com", "position": "Postdoc;PhD student;Researcher;associate researcher;Full Professor;Principal Research Manager;Distinguished Scientist", "bibtex": "@misc{\ndu2022equivariant,\ntitle={Equivariant Vector Field Network for Many-body System Modeling},\nauthor={weitao Du and He Zhang and Yuanqi Du and Qi Meng and Wei Chen and Bin Shao and Tie-Yan Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=G33_uTwQiL}\n}", "github": "", "project": "", "reviewers": "gmFP;YiG5;BR6H;V2nd", "site": "https://openreview.net/forum?id=G33_uTwQiL", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;3;4", "correctness": "3;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "35;47;60;67", "wc_summary_review": "37;14;30;44", "wc_main_review": "381;431;345;316", "wc_review": "453;492;435;427", "wc_reply_reviewers": "70;192;33;131", "wc_reply_authors": "1559;1776;996;862", "reply_reviewers": "1;1;1;1", "reply_authors": "3;4;3;2", "recommendation_avg": [ 3.5, 0.8660254037844386 ], 
"confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 52.25, 12.275483697190918 ], "wc_summary_review_avg": [ 31.25, 11.121488209767612 ], "wc_main_review_avg": [ 368.25, 42.92653608200876 ], "wc_review_avg": [ 451.75, 25.073641538476217 ], "wc_reply_reviewers_avg": [ 106.5, 60.508263898413084 ], "wc_reply_authors_avg": [ 1298.25, 380.1002334911148 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:e435IbVd0c0J:scholar.google.com/&scioq=Equivariant+Vector+Field+Network+for+Many-body+System+Modeling&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0;3;3", "aff_unique_norm": "Chinese Academy of Sciences;Xi'an Jiao Tong University;University of Amsterdam;Microsoft", "aff_unique_dep": "Academy of Mathematics and Systems Science;;;Microsoft Corporation", "aff_unique_url": "http://www.cas.cn;https://www.xjtu.edu.cn;https://www.uva.nl;https://www.microsoft.com", "aff_unique_abbr": "CAS;XJTU;UvA;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0;2;2", "aff_country_unique": "China;Netherlands;United States" }, { "id": "G7PfyLimZBp", "title": "Understanding the Generalization of Adam in Learning Neural Networks with Proper Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adaptive gradient methods such as Adam have gained increasing popularity in deep learning optimization. However, it has been observed in many deep learning applications such as image classification, Adam can converge to a different solution with a worse test error compared to (stochastic) gradient descent, even with a fine-tuned regularization. In this paper, we provide a theoretical explanation for this phenomenon: we show that in the nonconvex setting of learning over-parameterized two-layer convolutional neural networks starting from the same random initialization, for a class of data distributions (inspired from image data), Adam and gradient descent (GD) can converge to different global solutions of the training objective with provably different generalization errors, even with weight decay regularization. In contrast, we show that if the training objective is convex, and the weight decay regularization is employed, any optimization algorithms including Adam and GD will converge to the same solution if the training is successful. This suggests that the generalization gap between Adam and SGD is fundamentally tied to the nonconvex landscape of deep learning optimization, which cannot be covered by the recent neural tangent kernel (NTK) based analysis. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Difan Zou;Yuan Cao;Yuanzhi Li;Quanquan Gu", "authorids": "~Difan_Zou1;~Yuan_Cao1;~Yuanzhi_Li1;~Quanquan_Gu1", "gender": "M;M;M;M", "homepage": "https://difanzou.github.io/;https://yuancaohku.github.io/;;http://web.cs.ucla.edu/~qgu/", "dblp": "161/8923;;73/3628;50/4597", "google_scholar": "Cp4fcTQAAAAJ;-VGnHI4AAAAJ;;GU9HgNAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Difan_Zou1;~Yuan_Cao1;~Yuanzhi_Li1;~Quanquan_Gu1", "aff": "University of California, Los Angeles;University of Hong Kong;Carnegie Mellon University;University of California, Los Angeles", "aff_domain": "ucla.edu;hku.hk;andrew.cmu.edu;cs.ucla.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nzou2022understanding,\ntitle={Understanding the Generalization of Adam in Learning Neural Networks with Proper Regularization},\nauthor={Difan Zou and Yuan Cao and Yuanzhi Li and Quanquan Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=G7PfyLimZBp}\n}", "github": "", "project": "", "reviewers": "ahKf;yf6t;9CZz;6gZd", "site": "https://openreview.net/forum?id=G7PfyLimZBp", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;3", "correctness": "2;4;3;4", "technical_novelty": "3;4;3;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "86;319;71;19", "wc_summary_review": "31;76;77;21", "wc_main_review": "231;440;386;162", "wc_review": "348;835;534;202", "wc_reply_reviewers": "148;174;128;0", "wc_reply_authors": "1294;1681;525;328", "reply_reviewers": "1;2;1;0", "reply_authors": "2;4;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 123.75, 115.43694166080458 ], "wc_summary_review_avg": [ 51.25, 25.498774480354932 ], "wc_main_review_avg": [ 304.75, 112.5952374658893 ], "wc_review_avg": [ 479.75, 236.45758076238536 ], "wc_reply_reviewers_avg": [ 112.5, 66.96827607158482 ], "wc_reply_authors_avg": [ 957.0, 552.270314248376 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.7385489458759963, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3249364327866748267&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, Los Angeles;University of Hong Kong;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucla.edu;https://www.hku.hk;https://www.cmu.edu", "aff_unique_abbr": "UCLA;HKU;CMU", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Los Angeles;Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Data Efficient Language-Supervised Zero-Shot Recognition with Optimal Transport Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6582", "id": "G89-1yZLFHk", "poster": "", "openreview": "https://openreview.net/forum?id=G89-1yZLFHk", "slides": "https://iclr.cc/virtual/2022/poster/6582", "video": "https://iclr.cc/virtual/2022/poster/6582", "author_site": "Bichen Wu, Ruizhe Cheng, Peizhao Zhang, Tianren Gao, Joseph E 
Gonzalez, Peter Vajda", "tldr": "", "abstract": "Traditional computer vision models are trained to predict a fixed set of predefined categories. Recently, natural language has been shown to be a broader and richer source of supervision that provides finer descriptions to visual concepts than supervised \"gold\" labels. Previous works, such as CLIP, use InfoNCE loss to train a model to predict the pairing between images and text captions. CLIP, however, is data hungry and requires more than 400M image-text pairs for training. The inefficiency can be \\textit{partially} attributed to the fact that the image-text pairs are noisy. To address this, we propose OTTER (Optimal TransporT distillation for Efficient zero-shot Recognition), which uses online entropic optimal transport to find a soft image-text match as labels for contrastive learning. Based on pretrained image and text encoders, models trained with OTTER achieve strong performance with only 3M image text pairs. Compared with InfoNCE loss, label smoothing, and knowledge distillation, OTTER consistently outperforms these baselines in zero-shot evaluation on Google Open Images (19,958 classes) and multi-labeled ImageNet 10K (10032 classes) from Tencent ML-Images. Over 42 evaluations on 7 different dataset/architecture settings x 6 metrics, OTTER outperforms (32) or ties (2) all baselines in 34 of them. Our source code is open sourced at https://github.com/facebookresearch/OTTER.", "keywords": "Zero shot learning;contrastive learning;optimal transport;vision and language", "primary_area": "", "supplementary_material": "/attachment/02362d4e7f2d04c71ffc5fbdde9810b855a072fe.zip", "author": "Bichen Wu;Ruizhe Cheng;Peizhao Zhang;Tianren Gao;Joseph E. Gonzalez;Peter Vajda", "authorids": "~Bichen_Wu1;~Ruizhe_Cheng1;~Peizhao_Zhang1;~Tianren_Gao1;~Joseph_E._Gonzalez1;~Peter_Vajda1", "gender": "M;M;M;M;M;", "homepage": ";;;https://github.com/tianrengao;http://eecs.berkeley.edu/~jegonzal;https://sites.google.com/site/vajdap", "dblp": "130/1371;;23/8011.html;;61/8262;44/5953", "google_scholar": "K3QJPdMAAAAJ;;eqQQkM4AAAAJ;QQSBW8gAAAAJ;https://scholar.google.com.tw/citations?user=gM2WW9UAAAAJ;k8QB5VUAAAAJ", "orcid": ";;;;0000-0003-2921-956X;", "linkedin": "bichenwu/;ryan-cheng-a6210aa2/;;;;p%C3%A9ter-vajda-9a03aaa/", "or_profile": "~Bichen_Wu1;~Ruizhe_Cheng1;~Peizhao_Zhang1;~Tianren_Gao1;~Joseph_E._Gonzalez1;~Peter_Vajda1", "aff": "Meta Facebook;;Meta;University of California, Berkeley;University of California, Berkeley;Meta", "aff_domain": "fb.com;;meta.com;berkeley.edu;berkeley.edu;meta.com", "position": "Research Scientist;;Research Scientist;MS student;Associate Professor;Researcher", "bibtex": "@inproceedings{\nwu2022data,\ntitle={Data Efficient Language-Supervised Zero-Shot Recognition with Optimal Transport Distillation},\nauthor={Bichen Wu and Ruizhe Cheng and Peizhao Zhang and Tianren Gao and Joseph E. 
Gonzalez and Peter Vajda},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=G89-1yZLFHk}\n}", "github": "", "project": "", "reviewers": "ayPf;pk8X;DJW4;qBzH;Qrbe", "pdf_size": 0, "recommendation": "5;6;6;6;6", "confidence": "4;4;5;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "3;3;3;4;3", "empirical_novelty": "2;4;2;2;3", "wc_summary_paper": "95;107;165;104;77", "wc_summary_review": "14;141;51;64;46", "wc_main_review": "323;500;604;389;102", "wc_review": "432;748;820;557;225", "wc_reply_reviewers": "83;172;93;26;124", "wc_reply_authors": "369;291;1270;1148;667", "reply_reviewers": "1;1;2;1;1", "reply_authors": "2;1;4;2;2", "recommendation_avg": [ 5.8, 0.39999999999999997 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 109.6, 29.608106997915282 ], "wc_summary_review_avg": [ 63.2, 42.23458298598436 ], "wc_main_review_avg": [ 383.6, 170.36736776742194 ], "wc_review_avg": [ 556.4, 215.17304663921084 ], "wc_reply_reviewers_avg": [ 99.6, 48.11070566932063 ], "wc_reply_authors_avg": [ 749.0, 397.8768653742009 ], "reply_reviewers_avg": [ 1.2, 0.4 ], "reply_authors_avg": [ 2.2, 0.9797958971132712 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.25000000000000006, "corr_recommendation_correctness": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16240113248211357205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=G89-1yZLFHk", "email": "fb.com;;meta.com;berkeley.edu;berkeley.edu;meta.com", "author_num": 6, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Meta;University of California, Berkeley", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.berkeley.edu", "aff_unique_abbr": "Meta;UC Berkeley", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "G9JXCpShpni", "title": "The guide and the explorer: smart agents for resource-limited iterated batch reinforcement learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Iterated batch reinforcement learning (RL) is a growing subfield fueled by the demand from systems engineers for intelligent control solutions that they can apply within their technical and organizational constraints. Model-based RL (MBRL) suits this scenario well for its sample efficiency and modularity. Recent MBRL techniques combine efficient neural system models with classical planning (like model predictive control; MPC). In this paper we add two components to this classical setup. The first is a Dyna-style policy learned on the system model using model-free techniques. We call it the guide since it guides the planner. The second component is the explorer, a strategy to expand the limited knowledge of the guide during planning. Through a rigorous ablation study we show that exploration is crucial for optimal performance. 
We apply this approach with a DQN guide and a heating explorer to improve the state of the art of the resource-limited Acrobot benchmark system by about 10%.", "keywords": "Model-based reinforcement learning;Dyna;exploration;planning;DQN", "primary_area": "", "supplementary_material": "/attachment/be79d7f1fe33dc3bfa581e464ed61b837b6ed7ba.zip", "author": "Albert Thomas;Bal\u00e1zs K\u00e9gl;Othman Gaizi;Gabriel Hurtado", "authorids": "~Albert_Thomas1;~Bal\u00e1zs_K\u00e9gl2;~Othman_Gaizi1;~Gabriel_Hurtado1", "gender": ";;M;M", "homepage": "https://albertcthomas.github.io/;;;https://scholar.google.com/citations?user=s0njcGgAAAAJ&hl=en&oi=ao", "dblp": "172/7718-1;;;k/BalazsKegl.html", "google_scholar": "GzXiITUAAAAJ;;;s0njcGgAAAAJ", "orcid": ";;;", "linkedin": ";othman-gaizi-801343177/;hurtadogabriel;balazskegl", "or_profile": "~Albert_Thomas1;~Othman_Gaizi1;~Gabriel_Hurtado1;~Balazs_Kegl1", "aff": "Huawei Technologies Ltd.;;Huawei Technologies Ltd.;CNRS (on leave)", "aff_domain": "huawei.com;;huawei.com;in2p3.fr", "position": "Researcher;;Research engineer;Principal Researcher", "bibtex": "@misc{\nthomas2022the,\ntitle={The guide and the explorer: smart agents for resource-limited iterated batch reinforcement learning},\nauthor={Albert Thomas and Bal{\\'a}zs K{\\'e}gl and Othman Gaizi and Gabriel Hurtado},\nyear={2022},\nurl={https://openreview.net/forum?id=G9JXCpShpni}\n}", "github": "", "project": "", "reviewers": "EmNA;H38y;jVBs", "site": "https://openreview.net/forum?id=G9JXCpShpni", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;2;4", "correctness": "1;3;3", "technical_novelty": "2;1;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "41;22;165", "wc_summary_review": "50;36;110", "wc_main_review": "483;320;817", "wc_review": "574;378;1092", "wc_reply_reviewers": "0;240;163", "wc_reply_authors": "367;58;740", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 76.0, 63.40872705445731 ], "wc_summary_review_avg": [ 65.33333333333333, 32.097074979228594 ], "wc_main_review_avg": [ 540.0, 206.8638843942235 ], "wc_review_avg": [ 681.3333333333334, 301.2079385112919 ], "wc_reply_reviewers_avg": [ 134.33333333333334, 100.0544296315205 ], "wc_reply_authors_avg": [ 388.3333333333333, 278.83368200815016 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5000000000000001, "corr_recommendation_correctness": 0.5000000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zvKq7MzuFpkJ:scholar.google.com/&scioq=The+guide+and+the+explorer:+smart+agents+for+resource-limited+iterated+batch+reinforcement+learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Huawei;CNRS", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;https://www.cnrs.fr", "aff_unique_abbr": "Huawei;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;France" }, { "id": "G9M4FU8Ggo", "title": "Neural Architecture Search via Ensemble-based Knowledge Distillation", "track": "main", 
"status": "Reject", "tldr": "", "abstract": "Neural Architecture Search (NAS) automatically searches for well-performed network architectures from a given search space. The One-shot NAS method improves the training efficiency by sharing weights among the possible architectures in the search space, but unfortunately suffers from insufficient parameterization of each architecture due to interferences from other architectures. Recent works attempt to alleviate the insufficient parameterization problem by knowledge distillation, which let the learning of all architectures (students) be guided by the knowledge (i.e., parameters) from a better-parameterized network (teacher), which can be either a pre-trained one (e.g., ResNet50) or some searched out networks with good accuracy performance up to now. \n\nHowever, all these methods fall short in providing a sufficiently outstanding teacher, as they either depend on a pre-trained network that does not fit the NAS task the best, or the selected fitting teachers are still undertrained and inaccurate. In this paper, we take the first step to propose an ensemble-based knowledge distillation method for NAS, called EnNAS, which assembles an outstanding teacher by aggregating a set of architectures currently searched out with the most diversity (high diversity brings highly accurate ensembles); by doing so, EnNAS can deliver a high-quality knowledge distillation with outstanding teacher network (i.e., the ensemble network) all the time. Eventually, compared with existing works, on the real-world dataset ImageNet, EnNAS improved the top-1 accuracy of architectures searched out by 1.2% on average and 3.3% at most.", "keywords": "NAS;Knowledge Distillation;Imagenet", "primary_area": "", "supplementary_material": "", "author": "Fanxin Li;Shixiong Zhao;Haowen Pi;Yuhao QING;Yichao Fu;Sen Wang;Heming Cui", "authorids": "~Fanxin_Li1;~Shixiong_Zhao1;~Haowen_Pi1;~Yuhao_QING1;fuyc@zju.edu.cn;~Sen_Wang6;heming@cs.hku.hk", "gender": "M;;M;;;;", "homepage": ";https://i.cs.hku.hk/~sxzhao/;;;;;", "dblp": "271/6800.html;141/1483.html;;;;;", "google_scholar": ";https://scholar.google.com.hk/citations?user=tf9h_ngAAAAJ;;https://scholar.google.com/citations?view_op=list_works;;;", "orcid": ";0000-0002-1643-2583;;;;0000-0001-9633-2840;", "linkedin": ";;%E7%9A%93%E6%96%87-%E7%9A%AE-637775188/;;;;", "or_profile": "~Fanxin_Li1;~Shixiong_Zhao1;~Haowen_Pi1;~Yuhao_QING1;fuyc@zju.edu.cn;~Sen_Wang6;heming@cs.hku.hk", "aff": "The University of Hong Kong;The University of Hong Kong;The University of Hong Kong;The University of Hong Kong;;Chongqing University;", "aff_domain": "hku.hk;hku.hk;hku.hk;hku.hk;;cqu.edu.cn;", "position": "PhD student;PhD student;PhD student;PhD student;;Associate Professor;", "bibtex": "@misc{\nli2022neural,\ntitle={Neural Architecture Search via Ensemble-based Knowledge Distillation},\nauthor={Fanxin Li and Shixiong Zhao and Haowen Pi and Yuhao QING and Yichao Fu and Sen Wang and Heming Cui},\nyear={2022},\nurl={https://openreview.net/forum?id=G9M4FU8Ggo}\n}", "github": "", "project": "", "reviewers": "4GyZ;72m4;JuuW", "site": "https://openreview.net/forum?id=G9M4FU8Ggo", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;5", "correctness": "3;2;2", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "65;79;52", "wc_summary_review": "38;91;14", "wc_main_review": "245;571;223", "wc_review": "348;741;289", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", 
"recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 65.33333333333333, 11.025223605694151 ], "wc_summary_review_avg": [ 47.666666666666664, 32.1696889771861 ], "wc_main_review_avg": [ 346.3333333333333, 159.11700796024988 ], "wc_review_avg": [ 459.3333333333333, 200.6195958081419 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5g53iY4omkwJ:scholar.google.com/&scioq=Neural+Architecture+Search+via+Ensemble-based+Knowledge+Distillation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "University of Hong Kong;Chongqing University", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.cqu.edu.cn", "aff_unique_abbr": "HKU;CQU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "GBszJ1XlKDj", "title": "Quasi-Newton policy gradient algorithms", "track": "main", "status": "Reject", "tldr": "", "abstract": "Policy gradient algorithms have been widely applied to reinforcement learning (RL) problems in recent years. Regularization with various entropy functions is often used to encourage exploration and improve stability. In this paper, we propose a quasi-Newton method for the policy gradient algorithm with entropy regularization. In the case of Shannon entropy, the resulting algorithm reproduces the natural policy gradient (NPG) algorithm. For other entropy functions, this method results in brand new policy gradient algorithms. We provide a simple proof that all these algorithms enjoy the Newton-type quadratic convergence near the optimal policy. Using synthetic and industrial-scale examples, we demonstrate that the proposed quasi-Newton method typically converges in single-digit iterations, often orders of magnitude faster than other state-of-the-art algorithms. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoya Li;Samarth Gupta;Hsiang-Fu Yu;Lexing Ying;Inderjit S Dhillon", "authorids": "~Haoya_Li1;~Samarth_Gupta1;~Hsiang-Fu_Yu2;~Lexing_Ying1;~Inderjit_S_Dhillon1", "gender": ";M;;;M", "homepage": "https://mathematics.stanford.edu/people/haoya-li;http://www.andrew.cmu.edu/user/samarthg/;https://www.cs.utexas.edu/~rofuyu/;http://web.stanford.edu/~lexing;http://www.cs.utexas.edu/users/inderjit/", "dblp": ";;97/1729;68/3945;d/InderjitSDhillon", "google_scholar": ";4sOngq0AAAAJ;hfvjmbUAAAAJ;OwA3zyMAAAAJ;xBv5ZfkAAAAJ", "orcid": ";0000-0001-6225-0230;;;", "linkedin": ";;;;inderjit-dhillon-a20888b0/", "or_profile": "~Haoya_Li1;~Samarth_Gupta1;~Hsiang-Fu_Yu2;~Lexing_Ying1;~Inderjit_S_Dhillon1", "aff": "Stanford University;Carnegie Mellon University;;Stanford University;University of Texas, Austin", "aff_domain": "stanford.edu;cmu.edu;;stanford.edu;utexas.edu", "position": "PhD student;PhD student;;Professor;Full Professor", "bibtex": "@misc{\nli2022quasinewton,\ntitle={Quasi-Newton policy gradient algorithms},\nauthor={Haoya Li and Samarth Gupta and Hsiang-Fu Yu and Lexing Ying and Inderjit S Dhillon},\nyear={2022},\nurl={https://openreview.net/forum?id=GBszJ1XlKDj}\n}", "github": "", "project": "", "reviewers": "RnjS;QCBv;eYzw;Wec3", "site": "https://openreview.net/forum?id=GBszJ1XlKDj", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;3;4;3", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;0;2;3", "wc_summary_paper": "86;81;45;44", "wc_summary_review": "19;46;3;38", "wc_main_review": "594;207;251;412", "wc_review": "699;334;299;494", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 64.0, 19.58315602756614 ], "wc_summary_review_avg": [ 26.5, 16.740669042783207 ], "wc_main_review_avg": [ 366.0, 152.15616977303287 ], "wc_review_avg": [ 456.5, 158.13364600868468 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12348411722344503308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Stanford University;Carnegie Mellon University;University of Texas at Austin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.cmu.edu;https://www.utexas.edu", "aff_unique_abbr": "Stanford;CMU;UT Austin", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Stanford;;Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "GDUfz1phf06", "title": "AutoNF: Automated Architecture Optimization of Normalizing Flows Using a Mixture Distribution Formulation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although various flow models based on different transformations have been proposed, there still lacks a quantitative analysis of performance-cost trade-offs between different flows as well as a systematic way 
of constructing the best flow architecture. To tackle this challenge, we present an automated normalizing flow (NF) architecture search method. Our method aims to find the optimal sequence of transformation layers from a given set of unique transformations, via three key components. First, a mixed distribution is formulated to enable efficient architecture optimization over the originally discrete space without violating the invertibility of the resulting NF architecture. Second, the mixture NF is optimized with an approximate upper bound which has a more favorable global minimum. Third, a block-wise alternating optimization algorithm is proposed to ensure efficient architecture optimization of deep flow models. ", "keywords": "normalizing flow;architecture optimization", "primary_area": "", "supplementary_material": "/attachment/aa5c807c59755db2e81ce6a7a6112b099b3e15b1.zip", "author": "Yu Wang;Jan Drgona;Jiaxin Zhang;Karthik Somayaji NS;Frank Y Liu;Malachi Schram;Peng Li", "authorids": "~Yu_Wang29;~Jan_Drgona1;~Jiaxin_Zhang2;~Karthik_Somayaji_NS1;~Frank_Y_Liu1;~Malachi_Schram1;~Peng_Li8", "gender": "M;;M;M;;;M", "homepage": ";https://drgona.github.io/;https://jxzhangjhu.github.io/;;;;https://www.ece.ucsb.edu/~lip/", "dblp": ";;32/7698-5.html;305/9153;18/2008.html;;83/6353-1.html", "google_scholar": "https://scholar.google.com/citations?authuser=1;A-EA2KsAAAAJ;LiDm8jEAAAAJ;XCVHBNwAAAAJ;v69y--0AAAAJ;HC_cywMAAAAJ;QYQUS7gAAAAJ", "orcid": ";0000-0003-1223-208X;;;0000-0001-6615-0739;;0000-0003-3548-4589", "linkedin": "yu-wang-b526a4220/;drgona/;jiaxin-zhang-1425289b/;karthik-somayaji-453166137/;;;peng-li-ucsb/", "or_profile": "~Yu_Wang29;~Jan_Drgona1;~Jiaxin_Zhang2;~Karthik_Somayaji_NS1;~Frank_Y_Liu1;~Malachi_Schram1;~Peng_Li8", "aff": "UC Santa Barbara;Pacific Northwest National Laboratory;Oak Ridge National Laboratory;UC Santa Barbara;Oak Ridge National Laboratory;Thomas Jefferson National Laboratory;UC Santa Barbara", "aff_domain": "ucsb.edu;pnnl.gov;ornl.gov;ucsb.edu;ornl.gov;jlab.org;ucsb.edu", "position": "PhD student;Researcher;Researcher;PhD student;Principal Researcher;Researcher;Professor", "bibtex": "@misc{\nwang2022autonf,\ntitle={Auto{NF}: Automated Architecture Optimization of Normalizing Flows Using a Mixture Distribution Formulation},\nauthor={Yu Wang and Jan Drgona and Jiaxin Zhang and Karthik Somayaji NS and Frank Y Liu and Malachi Schram and Peng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=GDUfz1phf06}\n}", "github": "", "project": "", "reviewers": "bHa6;ZCmD;ucQj;gQWy", "site": "https://openreview.net/forum?id=GDUfz1phf06", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;2;4;3", "correctness": "3;3;2;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;1;3;4", "wc_summary_paper": "80;66;148;159", "wc_summary_review": "12;45;65;30", "wc_main_review": "294;269;202;305", "wc_review": "386;380;415;494", "wc_reply_reviewers": "0;724;0;0", "wc_reply_authors": "476;1263;352;299", "reply_reviewers": "0;2;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 113.25, 40.739262389002576 ], "wc_summary_review_avg": [ 38.0, 19.480759738778158 ], "wc_main_review_avg": [ 267.5, 40.00312487793922 ], "wc_review_avg": [ 418.75, 45.41681957160805 ], "wc_reply_reviewers_avg": [ 181.0, 313.5011961699668 ], 
"wc_reply_authors_avg": [ 597.5, 389.55904558872714 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.37998029782867415, "corr_recommendation_correctness": 0.08084520834544431, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:r9cTGpCsXIYJ:scholar.google.com/&scioq=AutoNF:+Automated+Architecture+Optimization+of+Normalizing+Flows+Using+a+Mixture+Distribution+Formulation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;2;3;0", "aff_unique_norm": "University of California, Santa Barbara;Pacific Northwest National Laboratory;Oak Ridge National Laboratory;Thomas Jefferson National Accelerator Facility", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ucsb.edu;https://www.pnnl.gov;https://www.ornl.gov;https://www.jlab.org", "aff_unique_abbr": "UCSB;PNNL;ORNL;Jefferson Lab", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "GE0w59n2mqe", "title": "Learning to Estimate Epistemic Uncertainty in Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Epistemic uncertainty quantification provides useful insight into a deep neural network's understanding of the relationship between its training distribution and unseen instances. A Bayesian-based approaches have been shown to quantify this relationship better than softmax probabilities. Unfortunately, however, those approaches to uncertainty quantification require multiple Monte-Carlo samples of a neural network, augmenting the neural network to learn distributions for its weights, or utilizing an ensemble of neural networks. Such extra calculations are problematic in time-critical, resource-limited scenarios such as trauma triage. In this work, we propose a technique that allows epistemic uncertainty to be estimated using learned regression algorithms. 
We find that this technique, once trained, allows epistemic uncertainty to be effectively and efficiently predicted.", "keywords": "uncertainty quantification;regression;machine learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Katherine Elizabeth Brown;Doug Talbert", "authorids": "~Katherine_Elizabeth_Brown1;~Doug_Talbert1", "gender": "F;", "homepage": ";https://www.tntech.edu/directory/engineering/faculty/doug-talbert.php", "dblp": ";", "google_scholar": "awuKWAEAAAAJ;p-PV344AAAAJ", "orcid": "0000-0003-4443-8541;", "linkedin": ";", "or_profile": "~Katherine_Elizabeth_Brown1;~Doug_Talbert1", "aff": "Tennessee Technological University;Tennessee Technological University", "aff_domain": "tntech.edu;tntech.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nbrown2022learning,\ntitle={Learning to Estimate Epistemic Uncertainty in Neural Networks},\nauthor={Katherine Elizabeth Brown and Doug Talbert},\nyear={2022},\nurl={https://openreview.net/forum?id=GE0w59n2mqe}\n}", "github": "", "project": "", "reviewers": "kDkL;CZgj;beJH;c85n", "site": "https://openreview.net/forum?id=GE0w59n2mqe", "pdf_size": 0, "recommendation": "1;1;1;3", "confidence": "4;5;4;5", "correctness": "2;3;1;2", "technical_novelty": "1;1;1;2", "empirical_novelty": "1;1;1;2", "wc_summary_paper": "38;53;47;57", "wc_summary_review": "30;58;24;85", "wc_main_review": "218;96;375;183", "wc_review": "286;207;446;325", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 1.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 48.75, 7.1545440106270926 ], "wc_summary_review_avg": [ 49.25, 24.304063446263466 ], "wc_main_review_avg": [ 218.0, 100.94305325281181 ], "wc_review_avg": [ 316.0, 86.25833293079573 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-ExAVueotoAJ:scholar.google.com/&scioq=Learning+to+Estimate+Epistemic+Uncertainty+in+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Tennessee Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ttu.edu", "aff_unique_abbr": "TTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "GFRq2JxiI7d", "title": "How much pre-training is enough to discover a good subnetwork?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural network pruning is useful for discovering efficient, high-performing subnetworks within pre-trained, dense network architectures. However, more often than not, it involves a three-step process\u2014pre-training, pruning, and re-training\u2014that is computationally expensive, as the dense model must be fully pre-trained. Luckily, several works have empirically shown that high-performing subnetworks can be discovered via pruning without fully pre-training the dense network. 
Aiming to theoretically analyze the amount of dense network pre-training needed for a pruned network to perform well, we derive a theoretical bound on the number of SGD pre-training iterations on a two-layer, fully-connected network, beyond which pruning via greedy forward selection (Ye et al., 2020) yields a subnetwork that achieves good training error. This threshold is shown to be logarithmically dependent upon the size of the dataset, meaning that experiments with larger datasets require more pre-training for subnetworks obtained via pruning to perform well. We empirically demonstrate the validity of our theoretical results across a variety of architectures and datasets, including fully-connected networks trained on MNIST and several deep convolutional neural network (CNN) architectures trained on CIFAR10 and ImageNet.", "keywords": "lottery ticket hypothesis;pruning;greedy selection", "primary_area": "", "supplementary_material": "/attachment/17980cac4f4ea13a1f1108e548d42ca45420328d.zip", "author": "Cameron R. Wolfe;Qihan Wang;Junhyung Lyle Kim;Anastasios Kyrillidis", "authorids": "~Cameron_R._Wolfe1;~Qihan_Wang1;~Junhyung_Lyle_Kim1;~Anastasios_Kyrillidis2", "gender": "M;;M;M", "homepage": "https://wolfecameron.github.io;http://wangqihan.com/;http://jlylekim.github.io;http://akyrillidis.github.io", "dblp": "238/0394;;290/2228;53/9879", "google_scholar": "jXLvrUwAAAAJ;;Ku197mP8hmUC;TEGzkZMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Cameron_R._Wolfe1;~Qihan_Wang1;~Junhyung_Lyle_Kim1;~Anastasios_Kyrillidis2", "aff": "Rice University;;Meta (Fundamental AI Research);Rice University", "aff_domain": "rice.edu;;facebook.com;rice.edu", "position": "PhD student;;Research intern;Assistant Professor", "bibtex": "@misc{\nwolfe2022how,\ntitle={How much pre-training is enough to discover a good subnetwork?},\nauthor={Cameron R. 
Wolfe and Qihan Wang and Junhyung Lyle Kim and Anastasios Kyrillidis},\nyear={2022},\nurl={https://openreview.net/forum?id=GFRq2JxiI7d}\n}", "github": "", "project": "", "reviewers": "5KAU;888m;Sdn8;Fxr2", "site": "https://openreview.net/forum?id=GFRq2JxiI7d", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;2;2", "correctness": "3;2;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "38;43;42;66", "wc_summary_review": "46;23;16;62", "wc_main_review": "186;100;254;649", "wc_review": "270;166;312;777", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "460;415;353;757", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 47.25, 10.985786271359915 ], "wc_summary_review_avg": [ 36.75, 18.32177666057525 ], "wc_main_review_avg": [ 297.25, 210.28715486210754 ], "wc_review_avg": [ 381.25, 234.5862048373689 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 496.25, 155.26328445579142 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14515485100256601285&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Rice University;Meta", "aff_unique_dep": ";Fundamental AI Research", "aff_unique_url": "https://www.rice.edu;https://meta.com", "aff_unique_abbr": "Rice;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "GIBm-_kax6", "title": "Expected Improvement-based Contextual Bandits", "track": "main", "status": "Reject", "tldr": "", "abstract": "The expected improvement (EI) is a popular technique to handle the tradeoff between exploration and exploitation under uncertainty. However, compared to other techniques such as Upper Confidence Bound (UCB) and Thompson Sampling (TS), the theoretical properties of EI have not been well studied even for non-contextual settings such as standard bandit and Bayesian optimization. In this paper, we introduce and study the EI technique as a new tool for the contextual bandit problem which is a generalization of the standard bandit. We propose two novel EI-based algorithms for this problem, one when the reward function is assumed to be linear and the other when no assumption is made about the reward function other than it being bounded. With a linear reward function, we demonstrate that our algorithm achieves a near-optimal regret. In particular, our regret bound improves on the popular OFUL algorithm \\citep{Abbasi11}, which uses the UCB approach, by a factor of $\\sqrt{\\log(T)}$, and on another popular algorithm \\citep{agrawal13}, which uses the TS approach, by a factor of $\\sqrt{d\\log(T)}$. Here $T$ is the horizon and $d$ is the feature vector dimension. Further, when no assumptions are made about the form of the reward, we use deep neural networks to model the reward function. We prove that this algorithm also achieves a near-optimal regret. 
Finally, we provide an empirical evaluation of the algorithms on both synthetic functions and various benchmark datasets. Our experiments show that our algorithms work well and consistently outperform existing approaches.", "keywords": "Linear Bandits;Contextual Bandits;Expected Improvement;Neural Tangent Kernel", "primary_area": "", "supplementary_material": "/attachment/f150d571292b88d971f126f63d76f43d94b572b0.zip", "author": "Hung Tran-The;Sunil Gupta;Santu Rana;Long Tran-Thanh;Svetha Venkatesh", "authorids": "~Hung_Tran-The1;~Sunil_Gupta2;~Santu_Rana1;~Long_Tran-Thanh1;~Svetha_Venkatesh1", "gender": "M;M;;F;M", "homepage": ";;https://warwick.ac.uk/fac/sci/dcs/people/long_tran-thanh/;https://www.deakin.edu.au/about-deakin/people/svetha-venkatesh;https://personal-sites.deakin.edu.au/~sunilg/", "dblp": "76/9697;57/6712;46/8333;81/1984;47/333-1", "google_scholar": "https://scholar.google.com.au/citations?user=um-FS-gAAAAJ;S9PwnMYAAAAJ;https://scholar.google.co.uk/citations?user=YBQai3gAAAAJ;AEkRUQcAAAAJ;https://scholar.google.com.au/citations?user=bXeL2t8AAAAJ", "orcid": ";0000-0003-2247-850X;;;0000-0002-3308-1930", "linkedin": ";santur/;;;", "or_profile": "~Hung_Tran-The1;~Santu_Rana1;~Long_Tran-Thanh1;~Svetha_Venkatesh1;~Sunil_Kumar_Gupta1", "aff": "Deakin University;Deakin University;;Deakin University;Deakin University", "aff_domain": "deakin.edu.au;deakin.edu.au;;deakin.edu.au;deakin.edu.au", "position": "Researcher;Associate Professor;;Full Professor;Associate Professor", "bibtex": "@misc{\ntran-the2022expected,\ntitle={Expected Improvement-based Contextual Bandits},\nauthor={Hung Tran-The and Sunil Gupta and Santu Rana and Long Tran-Thanh and Svetha Venkatesh},\nyear={2022},\nurl={https://openreview.net/forum?id=GIBm-_kax6}\n}", "github": "", "project": "", "reviewers": "vUVo;UDHJ;co1L;ccLd;TJpE", "site": "https://openreview.net/forum?id=GIBm-_kax6", "pdf_size": 0, "recommendation": "3;5;6;6;6", "confidence": "4;4;4;3;4", "correctness": "2;1;4;4;3", "technical_novelty": "2;3;3;4;3", "empirical_novelty": "3;2;2;3;2", "wc_summary_paper": "55;56;34;80;61", "wc_summary_review": "63;57;54;28;27", "wc_main_review": "394;411;239;140;171", "wc_review": "512;524;327;248;259", "wc_reply_reviewers": "0;101;0;0;0", "wc_reply_authors": "701;319;501;298;205", "reply_reviewers": "0;1;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 5.2, 1.16619037896906 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.8, 1.16619037896906 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 57.2, 14.688771221582833 ], "wc_summary_review_avg": [ 45.8, 15.223665787188052 ], "wc_main_review_avg": [ 271.0, 112.17308054965773 ], "wc_review_avg": [ 374.0, 120.70956880048905 ], "wc_reply_reviewers_avg": [ 20.2, 40.4 ], "wc_reply_authors_avg": [ 404.8, 176.4521464873692 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.34299717028501764, "corr_recommendation_correctness": 0.6176470588235294, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XcMvckNm9b4J:scholar.google.com/&scioq=Expected+Improvement-based+Contextual+Bandits&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Deakin University", "aff_unique_dep": "", "aff_unique_url": "https://www.deakin.edu.au", "aff_unique_abbr": "Deakin", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "id": "GIEPR9OomyX", "title": "Langevin Autoencoders for Learning Deep Latent Variable Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Markov chain Monte Carlo (MCMC), such as Langevin dynamics, is valid for approximating intractable distributions. However, its usage is limited in the context of deep latent variable models since it is not scalable to data size owing to its datapoint-wise iterations and slow convergence. This paper proposes the amortized Langevin dynamics (ALD), wherein datapoint-wise MCMC iterations are entirely replaced with updates of an inference model that maps observations into latent variables. Since it no longer depends on datapoint-wise iterations, ALD enables scalable inference from large-scale datasets. Despite its efficiency, it retains the excellent property of MCMC; we prove that ALD has the target posterior as a stationary distribution with a mild assumption. Furthermore, ALD can be extended to sampling from an unconditional distribution such as an energy-based model, enabling more flexible generative modeling by applying it to the prior distribution of the latent variable. Based on ALD, we construct a new deep latent variable model named the Langevin autoencoder (LAE). LAE uses ALD for autoencoder-like posterior inference and sampling from the latent space EBM. Using toy datasets, we empirically validate that ALD can properly obtain samples from target distributions in both conditional and unconditional cases, and ALD converges significantly faster than traditional LD. We also evaluate LAE on the image generation task using three datasets (SVHN, CIFAR-10, and CelebA-HQ). 
Not only can LAE be trained faster than non-amortized MCMC methods, but LAE can also generate better samples in terms of the Fr\u00e9chet Inception Distance (FID) compared to AVI-based methods, such as the variational autoencoder.", "keywords": "Langevin dynamics;amortized inference;latent variable model;deep generative model", "primary_area": "", "supplementary_material": "", "author": "Shohei Taniguchi;Yusuke Iwasawa;Wataru Kumagai;Yutaka Matsuo", "authorids": "~Shohei_Taniguchi1;~Yusuke_Iwasawa1;~Wataru_Kumagai2;~Yutaka_Matsuo1", "gender": "M;M;M;M", "homepage": ";;https://sites.google.com/site/watarukumagaiswebpage/;http://ymatsuo.com", "dblp": ";117/7377;;m/YMatsuo.html", "google_scholar": "MOcH0c0AAAAJ;https://scholar.google.co.jp/citations?user=pvvZgj0AAAAJ;https://scholar.google.co.jp/citations?user=rd5MEO8AAAAJ;Dy8iau4AAAAJ", "orcid": ";0000-0002-1321-2622;;", "linkedin": ";;;", "or_profile": "~Shohei_Taniguchi1;~Yusuke_Iwasawa1;~Wataru_Kumagai2;~Yutaka_Matsuo1", "aff": ";The University of Tokyo, The University of Tokyo;Omron Sinic X;The University of Tokyo", "aff_domain": ";weblab.t.u-tokyo.ac.jp;sinicx.com;u-tokyo.ac.jp", "position": ";Lecturer;Researcher;Associate Professor", "bibtex": "@misc{\ntaniguchi2022langevin,\ntitle={Langevin Autoencoders for Learning Deep Latent Variable Models},\nauthor={Shohei Taniguchi and Yusuke Iwasawa and Wataru Kumagai and Yutaka Matsuo},\nyear={2022},\nurl={https://openreview.net/forum?id=GIEPR9OomyX}\n}", "github": "", "project": "", "reviewers": "vRp6;Lr3z;siJZ;ZHba", "site": "https://openreview.net/forum?id=GIEPR9OomyX", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;0;2", "wc_summary_paper": "297;58;58;43", "wc_summary_review": "106;31;20;36", "wc_main_review": "828;212;84;308", "wc_review": "1231;301;162;387", "wc_reply_reviewers": "1791;0;0;0", "wc_reply_authors": "2874;485;260;398", "reply_reviewers": "5;0;0;0", "reply_authors": "8;2;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 114.0, 105.83241469417581 ], "wc_summary_review_avg": [ 48.25, 33.840619084171614 ], "wc_main_review_avg": [ 358.0, 282.7507736505773 ], "wc_review_avg": [ 520.25, 418.131184558148 ], "wc_reply_reviewers_avg": [ 447.75, 775.5257490889649 ], "wc_reply_authors_avg": [ 1004.25, 1082.4778000033073 ], "reply_reviewers_avg": [ 1.25, 2.165063509461097 ], "reply_authors_avg": [ 3.5, 2.598076211353316 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4451193403290607763&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Tokyo;OMRON Corporation", "aff_unique_dep": ";Sinic X Division", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.omron.com", "aff_unique_abbr": "UTokyo;Omron", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "GJyRarXzT7Q", "title": "Your Fairness May Vary: Pretrained Language Model Fairness in Toxic Text Classification", "track": "main", "status": "Withdraw", "tldr": "", 
"abstract": "Warning: This paper contains samples of offensive text.\nThe popularity of pretrained language models in natural language processing systems calls for a careful evaluation of such models in down-stream tasks, which have a higher potential for societal impact. The evaluation of such systems usually focuses on accuracy measures. Our findings in this paper call for fairness measures to also be considered. Through the analysis of more than a dozen pretrained language models of varying sizes on two toxic text classification tasks, we demonstrate that focusing on accuracy measures alone can lead to models with wide variation in fairness characteristics. Specifically, we observe that fairness can vary even more than accuracy with increasing training data size and different random initializations. At the same time, we find that little of the fairness variation is explained by model size/compression, despite claims in the literature. To improve model fairness without retraining, we show that two post-processing methods developed for structured, tabular data can be successfully applied to a range of pretrained language models.", "keywords": "group fairness;language models;toxic text classification", "primary_area": "", "supplementary_material": "", "author": "Ioana Baldini;Dennis Wei;Karthikeyan Natesan Ramamurthy;Mikhail Yurochkin;Moninder Singh", "authorids": "~Ioana_Baldini1;~Dennis_Wei1;~Karthikeyan_Natesan_Ramamurthy1;~Mikhail_Yurochkin1;~Moninder_Singh2", "gender": "F;M;;M;", "homepage": ";https://sites.google.com/site/dennislwei/;https://nrkarthikeyan.github.io/;https://moonfolk.github.io/;https://researcher.watson.ibm.com/researcher/view.php?person=us-moninder", "dblp": "140/2229;59/8761;58/7800;191/6719;11/1286.html", "google_scholar": "6Om-j_4AAAAJ;r4ldy4AAAAAJ;mG8HuhEAAAAJ;QjBF9sUAAAAJ;", "orcid": ";;0000-0002-6021-5930;;", "linkedin": "ioanabaldini/;dennis-wei-4886036b/;;mikhail-yurochkin-a45659114/;", "or_profile": "~Ioana_Baldini1;~Dennis_Wei1;~Karthikeyan_Natesan_Ramamurthy1;~Mikhail_Yurochkin1;~Moninder_Singh2", "aff": "IBM, International Business Machines, IBM Research;International Business Machines;International Business Machines;IBM Research;International Business Machines", "aff_domain": "us.ibm.com;ibm.com;ibm.com;ibm.com;ibm.com", "position": "Researcher;Research Staff Member;Research Staff Member;Researcher;Research Staff Member", "bibtex": "@misc{\nbaldini2022your,\ntitle={Your Fairness May Vary: Pretrained Language Model Fairness in Toxic Text Classification},\nauthor={Ioana Baldini and Dennis Wei and Karthikeyan Natesan Ramamurthy and Mikhail Yurochkin and Moninder Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=GJyRarXzT7Q}\n}", "github": "", "project": "", "reviewers": "VVPt;ytNg;JRih;7qH5;cUAG", "site": "https://openreview.net/forum?id=GJyRarXzT7Q", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;5;3;4;4", "correctness": "4;2;3;3;4", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;3;2;4;3", "wc_summary_paper": "177;106;34;66;112", "wc_summary_review": "416;70;70;6;141", "wc_main_review": "195;527;334;557;478", "wc_review": "788;703;438;629;731", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 
], "wc_summary_paper_avg": [ 99.0, 48.199585060454616 ], "wc_summary_review_avg": [ 140.6, 144.17433890952995 ], "wc_main_review_avg": [ 418.2, 135.34164178108674 ], "wc_review_avg": [ 657.8, 121.23926756624687 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.42257712736425823, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5796653994665162733&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "International Business Machines;International Business Machines Corporation;IBM", "aff_unique_dep": "IBM Research;;IBM Research", "aff_unique_url": "https://www.ibm.com;https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "IBM;IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "GMYWzWztDx5", "title": "NormFormer: Improved Transformer Pretraining with Extra Normalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "During pretraining, the Pre-LayerNorm transformer suffers from a gradient magnitude mismatch: gradients at early layers are much larger than at later layers, while the optimal weighting of residuals is larger at earlier than at later layers. These issues can be alleviated by the addition of two normalization and two new scaling operations inside each layer. \nThe extra operations incur negligible compute cost (+0.5\\% parameter increase), but improve pretraining perplexity and downstream task performance for both causal and masked language models of multiple sizes. \nAdding NormFormer on top of the GPT3-Medium architecture can reach the SOTA perplexity 22\\% faster, or converge 0.33 perplexity better in the same compute budget. 
This results in significantly stronger zero shot performance.\nFor masked language modeling, NormFormer improves fine-tuned GLUE performance by 1.9\\% on average.", "keywords": "Language Modeling;NLP;Transformer;Zero Shot Learning", "primary_area": "", "supplementary_material": "", "author": "Sam Shleifer;Myle Ott", "authorids": "~Sam_Shleifer1;~Myle_Ott1", "gender": "M;", "homepage": ";http://myleott.com", "dblp": ";92/9767", "google_scholar": "_d5MbfoAAAAJ;", "orcid": ";", "linkedin": "https://www.linkedin.com/feed/;", "or_profile": "~Sam_Shleifer1;~Myle_Ott1", "aff": "FAIR;Meta Facebook", "aff_domain": "fb.com;fb.com", "position": "Research Engineer;Research Engineer", "bibtex": "@misc{\nshleifer2022normformer,\ntitle={NormFormer: Improved Transformer Pretraining with Extra Normalization},\nauthor={Sam Shleifer and Myle Ott},\nyear={2022},\nurl={https://openreview.net/forum?id=GMYWzWztDx5}\n}", "github": "", "project": "", "reviewers": "uRwh;ust2;jDo1;o2vf", "site": "https://openreview.net/forum?id=GMYWzWztDx5", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;4;5;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "83;17;107;32", "wc_summary_review": "61;54;49;37", "wc_main_review": "153;339;657;196", "wc_review": "297;410;813;265", "wc_reply_reviewers": "342;127;196;0", "wc_reply_authors": "381;571;791;172", "reply_reviewers": "1;1;1;0", "reply_authors": "1;3;2;2", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 59.75, 36.64270050091832 ], "wc_summary_review_avg": [ 50.25, 8.757139944068497 ], "wc_main_review_avg": [ 336.25, 197.57198055392368 ], "wc_review_avg": [ 446.25, 218.487270796264 ], "wc_reply_reviewers_avg": [ 166.25, 123.44305367253355 ], "wc_reply_authors_avg": [ 478.75, 228.9436338927117 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4003753090949121461&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "GOr80bgf52v", "title": "Factored World Models for Zero-Shot Generalization in Robotic Manipulation", "track": "main", "status": "Reject", "tldr": "", "abstract": "World models for environments with many objects face a combinatorial explosion of states: as the number of objects increases, the number of possible arrangements grows exponentially. In this paper, we learn to generalize over robotic pick-and-place tasks using object-factored world models, which combat the combinatorial explosion by ensuring that predictions are equivariant to permutations of objects. We build on one such model, C-SWM, which we extend to overcome the assumption that each action is associated with one object. To do so, we introduce an action attention module to determine which objects are likely to be affected by an action. 
The attention module is used in conjunction with a residual graph neural network block that receives action information at multiple levels. Based on RGB images and parameterized motion primitives, our model can accurately predict the dynamics of a robot building structures from blocks of various shapes. Our model generalizes over training structures built in different positions. Moreover, crucially, the learned model can make predictions about tasks not represented in training data. That is, we demonstrate successful zero-shot generalization to novel tasks. For example, we measure only a 2.4% absolute decrease in our action ranking metric in the case of a block assembly task.", "keywords": "reinforcement learning;world models;robotic manipulation;zero-shot transfer", "primary_area": "", "supplementary_material": "", "author": "Ondrej Biza;Thomas Kipf;David Klee;Robert Platt;Jan-Willem van de Meent;Lawson L.S. Wong", "authorids": "~Ondrej_Biza1;~Thomas_Kipf2;~David_Klee1;~Robert_Platt1;~Jan-Willem_van_de_Meent1;~Lawson_L.S._Wong2", "gender": "M;M;;M;M;M", "homepage": "https://sites.google.com/view/obiza;;http://www.ccs.neu.edu/home/rplatt/;https://jwvdm.github.io/;https://www.ccs.neu.edu/home/lsw/;http://tkipf.github.io/", "dblp": "230/8616.html;313/9930;39/5434;137/3263;35/2573;186/8206", "google_scholar": "Gi9Xq8YAAAAJ;TJEEkJoAAAAJ;Z4Y5S2oAAAAJ;CX9Lu38AAAAJ;https://scholar.google.com/citations?hl=en;83HL5FwAAAAJ", "orcid": "0000-0003-3390-8050;;;0000-0001-9465-5398;;", "linkedin": "ond%C5%99ej-b%C3%AD%C5%BEa-a9405353/;;;;;thomas-kipf-6b260410a", "or_profile": "~Ondrej_Biza1;~David_Klee1;~Robert_Platt1;~Jan-Willem_van_de_Meent1;~Lawson_L._S._Wong1;~Thomas_N._Kipf1", "aff": "Google Brain;Northeastern University;Northeastern University;Northeastern University;Northeastern University;Google", "aff_domain": "google.com;northeastern.edu;neu.edu;northeastern.edu;northeastern.edu;google.com", "position": "Intern;PhD student;Associate Professor;Assistant Professor;Assistant Professor;Research Scientist", "bibtex": "@misc{\nbiza2022factored,\ntitle={Factored World Models for Zero-Shot Generalization in Robotic Manipulation},\nauthor={Ondrej Biza and Thomas Kipf and David Klee and Robert Platt and Jan-Willem van de Meent and Lawson L.S. 
Wong},\nyear={2022},\nurl={https://openreview.net/forum?id=GOr80bgf52v}\n}", "github": "", "project": "", "reviewers": "iwuh;4oxK;toZ5;WeZs", "site": "https://openreview.net/forum?id=GOr80bgf52v", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;4", "correctness": "1;3;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "85;337;60;129", "wc_summary_review": "83;132;61;82", "wc_main_review": "659;259;145;251", "wc_review": "827;728;266;462", "wc_reply_reviewers": "0;11;0;95", "wc_reply_authors": "755;550;253;478", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 152.75, 109.20708539284436 ], "wc_summary_review_avg": [ 89.5, 26.062425059844298 ], "wc_main_review_avg": [ 328.5, 196.04782579768641 ], "wc_review_avg": [ 570.75, 220.84539275248648 ], "wc_reply_reviewers_avg": [ 26.5, 39.80263810352274 ], "wc_reply_authors_avg": [ 509.0, 179.36972988773775 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3757923965082189889&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Google;Northeastern University", "aff_unique_dep": "Google Brain;", "aff_unique_url": "https://brain.google.com;https://www.northeastern.edu", "aff_unique_abbr": "Google Brain;NEU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "GQcB1D2bxSC", "title": "SHAQ: Incorporating Shapley Value Theory into Multi-Agent Q-Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Value factorisation proves to be a useful technique in multi-agent reinforcement learning (MARL), but the underlying mechanism is not yet fully understood. This paper explores a theoretical framework for value factorisation with interpretability. We generalise Shapley value in coalitional game theory to Markov convex game (MCG) and use it as a value factorisation method for MARL. We show that the generalised Shapley value possesses several features such as (1) efficiency: the sum of optimal local values is equal to the optimal global value, (2) fairness in factorisation of the global value, and (3) sensitiveness to dummy agents. Moreover, we show that MCG with the grand coalition and the generalised Shapley value is within $\\epsilon$-core, which means no agents would deviate from the grand coalition. Since MCG with the grand coalition is equivalent to global reward game, it is the first time that Shapley value is rigorously proved to be rationally applied as a value factorisation method for global reward game. Moreover, extending from the Bellman operator we propose Shapley-Q operator that is proved to converge to the optimal generalised Shapley value. With stochastic approximation, a new MARL algorithm called Shapley Q-learning (SHAQ) is yielded. We show the performance of SHAQ on Predator-Prey for modelling relative overgeneralisation and StarCraft Multi-Agent Challenge (SMAC). 
In experiments, we also demonstrate the interpretability of SHAQ that is lacking in other state-of-the-art baselines.", "keywords": "multi-agent reinforcement learning;Shapley value;value factorisation;Q-learning", "primary_area": "", "supplementary_material": "/attachment/e6f44d44dac4507a853b0be6b117ab970cacec89.zip", "author": "Jianhong Wang;Jinxin Wang;Yuan Zhang;Yunjie Gu;Tae-Kyun Kim", "authorids": "~Jianhong_Wang1;~Jinxin_Wang1;~Yuan_Zhang8;yunjie.gu@imperial.ac.uk;~Tae-Kyun_Kim2", "gender": "M;M;;;", "homepage": "https://hsvgbkhgbv.github.io/;https://www.cnblogs.com/wangjinxinmachinelearning/;;;", "dblp": ";;;;", "google_scholar": "K1FKF3IAAAAJ;;gMzGCV0AAAAJ;;", "orcid": ";;;;", "linkedin": "jianhong-wang-45995b100/;;;;", "or_profile": "~Jianhong_Wang1;~Jinxin_Wang1;~Yuan_Zhang8;yunjie.gu@imperial.ac.uk;~Tae-Kyun_Kim2", "aff": "Imperial College London;College of Computer Science and Artificial Intelligence;University of Freiburg;;", "aff_domain": "ic.ac.uk;wzu.edu.cn;uni-freiburg.de;;", "position": "PhD student;MS student;PhD student;;", "bibtex": "@misc{\nwang2022shaq,\ntitle={{SHAQ}: Incorporating Shapley Value Theory into Multi-Agent Q-Learning},\nauthor={Jianhong Wang and Jinxin Wang and Yuan Zhang and Yunjie Gu and Tae-Kyun Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=GQcB1D2bxSC}\n}", "github": "", "project": "", "reviewers": "t47P;mYrf;VzRQ;iyEm", "site": "https://openreview.net/forum?id=GQcB1D2bxSC", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "2;3;2;2", "correctness": "2;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "64;95;21;85", "wc_summary_review": "29;49;26;44", "wc_main_review": "513;591;317;239", "wc_review": "606;735;364;368", "wc_reply_reviewers": "383;0;0;0", "wc_reply_authors": "2033;1750;917;1063", "reply_reviewers": "1;0;0;0", "reply_authors": "3;4;2;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 2.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 66.25, 28.41984341969533 ], "wc_summary_review_avg": [ 37.0, 9.72111104761179 ], "wc_main_review_avg": [ 415.0, 142.44297104455524 ], "wc_review_avg": [ 518.25, 158.940830185324 ], "wc_reply_reviewers_avg": [ 95.75, 165.84386482472001 ], "wc_reply_authors_avg": [ 1440.75, 464.59787720134926 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17989755640794467045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1;2", "aff_unique_norm": "Imperial College London;College of Computer Science and Artificial Intelligence;University of Freiburg", "aff_unique_dep": ";Computer Science and Artificial Intelligence;", "aff_unique_url": "https://www.imperial.ac.uk;;https://www.uni-freiburg.de", "aff_unique_abbr": "ICL;;UoF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2", "aff_country_unique": "United Kingdom;;Germany" }, { "title": "Meta Learning Low Rank Covariance Factors for Energy Based Deterministic Uncertainty", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6127", "id": "GQd7mXSPua", 
"poster": "", "openreview": "https://openreview.net/forum?id=GQd7mXSPua", "slides": "https://iclr.cc/virtual/2022/poster/6127", "video": "https://iclr.cc/virtual/2022/poster/6127", "author_site": "Jeff Willette, Hae Beom Lee, Juho Lee, Sung Ju Hwang", "tldr": "", "abstract": "Numerous recent works utilize bi-Lipschitz regularization of neural network layers to preserve relative distances between data instances in the feature spaces of each layer. This distance sensitivity with respect to the data aids in tasks such as uncertainty calibration and out-of-distribution (OOD) detection. In previous works, features extracted with a distance sensitive model are used to construct feature covariance matrices which are used in deterministic uncertainty estimation or OOD detection. However, in cases where there is a distribution over tasks, these methods result in covariances which are sub-optimal, as they may not leverage all of the meta information which can be shared among tasks. With the use of an attentive set encoder, we propose to meta learn either diagonal or diagonal plus low-rank factors to efficiently construct task specific covariance matrices. Additionally, we propose an inference procedure which utilizes scaled energy to achieve a final predictive distribution which is well calibrated under a distributional dataset shift. ", "keywords": "calibration;meta-learning", "primary_area": "", "supplementary_material": "/attachment/e13025311124d129bb380faf1997dadc5b06f9c2.zip", "author": "Jeffrey Ryan Willette;Hae Beom Lee;Juho Lee;Sung Ju Hwang", "authorids": "~Jeffrey_Ryan_Willette1;~Hae_Beom_Lee1;~Juho_Lee2;~Sung_Ju_Hwang1", "gender": "M;M;M;", "homepage": "https://jeffwillette.github.io;https://haebeom-lee.github.io;https://juho.lee.github.io;", "dblp": "286/0937;326/7260;55/3410-1;", "google_scholar": "https://scholar.google.com/citations?hl=en;;Py4URJUAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jeffrey_Ryan_Willette1;~Hae_Beom_Lee1;~Juho_Lee2;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;", "position": "Student;PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nwillette2022meta,\ntitle={Meta Learning Low Rank Covariance Factors for Energy Based Deterministic Uncertainty},\nauthor={Jeffrey Ryan Willette and Hae Beom Lee and Juho Lee and Sung Ju Hwang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=GQd7mXSPua}\n}", "github": "", "project": "", "reviewers": "QrCN;eieK;Zz5v", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "2;3;4", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "114;115;81", "wc_summary_review": "52;30;45", "wc_main_review": "276;156;209", "wc_review": "442;301;335", "wc_reply_reviewers": "519;0;0", "wc_reply_authors": "2101;584;725", "reply_reviewers": "2;0;0", "reply_authors": "6;1;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 103.33333333333333, 15.797327481430381 ], "wc_summary_review_avg": [ 42.333333333333336, 9.177266598624136 ], 
"wc_main_review_avg": [ 213.66666666666666, 49.100803342602134 ], "wc_review_avg": [ 359.3333333333333, 60.07957685899223 ], "wc_reply_reviewers_avg": [ 173.0, 244.65894629054543 ], "wc_reply_authors_avg": [ 1136.6666666666667, 684.3119821316071 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 2.160246899469287 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12208252233572804159&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=GQd7mXSPua", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Independent SE(3)-Equivariant Models for End-to-End Rigid Protein Docking", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7055", "id": "GQjaI9mLet", "poster": "", "openreview": "https://openreview.net/forum?id=GQjaI9mLet", "slides": "https://iclr.cc/virtual/2022/poster/7055", "video": "https://iclr.cc/virtual/2022/poster/7055", "author_site": "Octavian Ganea, xinyuan huang, Charlotte Bunne, Yatao Bian, Regina Barzilay, Tommi Jaakkola, Andreas Krause", "tldr": "", "abstract": "Protein complex formation is a central problem in biology, being involved in most of the cell's processes, and essential for applications, e.g. drug design or protein engineering. We tackle rigid body protein-protein docking, i.e., computationally predicting the 3D structure of a protein-protein complex from the individual unbound structures, assuming no conformational change within the proteins happens during binding. We design a novel pairwise-independent SE(3)-equivariant graph matching network to predict the rotation and translation to place one of the proteins at the right docked position relative to the second protein. We mathematically guarantee a basic principle: the predicted complex is always identical regardless of the initial locations and orientations of the two structures. Our model, named EquiDock, approximates the binding pockets and predicts the docking poses using keypoint matching and alignment, achieved through optimal transport and a differentiable Kabsch algorithm. Empirically, we achieve significant running time improvements and often outperform existing docking software despite not relying on heavy candidate sampling, structure refinement, or templates.", "keywords": "protein complexes;protein structure;rigid body docking;SE(3) equivariance;graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Octavian-Eugen Ganea;Xinyuan Huang;Charlotte Bunne;Yatao Bian;Regina Barzilay;Tommi S. 
Jaakkola;Andreas Krause", "authorids": "~Octavian-Eugen_Ganea1;~Xinyuan_Huang1;~Charlotte_Bunne1;~Yatao_Bian1;~Regina_Barzilay1;~Tommi_S._Jaakkola1;~Andreas_Krause1", "gender": ";;F;;female;;M", "homepage": ";;https://aimm.epfl.ch;;https://www.regina.csail.mit.edu/;;https://las.inf.ethz.ch/krausea", "dblp": ";;217/2348;;b/ReginaBarzilay;;87/1831-1.html", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;;;https://scholar.google.ch/citations?user=eDHv58AAAAAJ", "orcid": ";;0000-0003-1431-103X;;;;0000-0001-7260-9673", "linkedin": ";xinyuan-huang-a869a6176/;bunnech/;;;;krausea/", "or_profile": "~Octavian-Eugen_Ganea1;~Xinyuan_Huang1;~Charlotte_Bunne1;~Yatao_Bian1;~Regina_Barzilay1;~Tommi_S._Jaakkola1;~Andreas_Krause1", "aff": ";;ETHZ - ETH Zurich;;Massachusetts Institute of Technology;;ETH Zurich", "aff_domain": ";;ethz.ch;;mit.edu;;ethz.ch", "position": ";;PhD student;;Professor;;Full Professor", "bibtex": "@inproceedings{\nganea2022independent,\ntitle={Independent {SE}(3)-Equivariant Models for End-to-End Rigid Protein Docking},\nauthor={Octavian-Eugen Ganea and Xinyuan Huang and Charlotte Bunne and Yatao Bian and Regina Barzilay and Tommi S. Jaakkola and Andreas Krause},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=GQjaI9mLet}\n}", "github": "", "project": "", "reviewers": "ydqb;VRJP;ArSS", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;5;5", "correctness": "4;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "116;180;60", "wc_summary_review": "57;27;19", "wc_main_review": "335;356;262", "wc_review": "508;563;341", "wc_reply_reviewers": "105;84;0", "wc_reply_authors": "453;736;431", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 118.66666666666667, 49.026070162267295 ], "wc_summary_review_avg": [ 34.333333333333336, 16.35712552851373 ], "wc_main_review_avg": [ 317.6666666666667, 40.28509512076258 ], "wc_review_avg": [ 470.6666666666667, 94.39750467511782 ], "wc_reply_reviewers_avg": [ 63.0, 45.36518488885502 ], "wc_reply_authors_avg": [ 540.0, 138.8836443454256 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 188, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4354925472865069663&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=GQjaI9mLet", "email": ";;ethz.ch;;mit.edu;;ethz.ch", "author_num": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "ETH Zurich;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://web.mit.edu", "aff_unique_abbr": "ETHZ;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;United States" }, { "id": "GU11Lbci5J", "title": "Understanding AdamW through Proximal Methods and Scale-Freeness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adam has been widely adopted for training deep neural networks due to less hyperparameter tuning 
and remarkable performance. To improve generalization, Adam is typically used in tandem with a squared $\\ell_2$ regularizer (referred to as Adam-$\\ell_2$). However, even better performance can be obtained with AdamW, which decouples the gradient of the regularizer from the update rule of Adam-$\\ell_2$. Yet, we are still lacking a complete explanation of the advantages of AdamW. In this paper, we tackle this question from both an optimization and an empirical point of view. First, we show how to re-interpret AdamW as an approximation of a proximal gradient method, which takes advantage of the closed-form proximal mapping of the regularizer instead of only utilizing its gradient information as in Adam-$\\ell_2$. Next, we consider the property of ``scale-freeness'' enjoyed by AdamW and by its proximal counterpart: their updates are invariant to component-wise rescaling of the gradients. We provide empirical evidence across a wide range of deep learning experiments showing a correlation between the problems in which AdamW exhibits an advantage over Adam-$\\ell_2$ and the degree to which we expect the gradients of the network to exhibit multiple scales, thus motivating the hypothesis that the advantage of AdamW could be due to the scale-free updates.", "keywords": "Optimization of Deep Neural Networks;Scale-free;AdamW", "primary_area": "", "supplementary_material": "/attachment/db594673f9b2b6663825dc57769fa256ef80636a.zip", "author": "Zhenxun Zhuang;Mingrui Liu;Ashok Cutkosky;Francesco Orabona", "authorids": "~Zhenxun_Zhuang1;~Mingrui_Liu2;~Ashok_Cutkosky1;~Francesco_Orabona1", "gender": "M;;;M", "homepage": "http://cs-people.bu.edu/zxzhuang;https://mingrliu.github.io;http://www.cs.stanford.edu/~ashokc;https://francesco.orabona.com/", "dblp": "234/8537;;191/6725;80/3790.html", "google_scholar": ";KFoEnFQAAAAJ;h4AbGp0AAAAJ;g1ha-iYAAAAJ", "orcid": ";;;", "linkedin": "zhenxunzhuang/;mingrui-liu-447a2aab/;;", "or_profile": "~Zhenxun_Zhuang1;~Mingrui_Liu2;~Ashok_Cutkosky1;~Francesco_Orabona1", "aff": "Boston University;George Mason University;Boston University;Boston University", "aff_domain": "bu.edu;gmu.edu;bu.edu;bu.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@misc{\nzhuang2022understanding,\ntitle={Understanding AdamW through Proximal Methods and Scale-Freeness},\nauthor={Zhenxun Zhuang and Mingrui Liu and Ashok Cutkosky and Francesco Orabona},\nyear={2022},\nurl={https://openreview.net/forum?id=GU11Lbci5J}\n}", "github": "", "project": "", "reviewers": "Vxer;rZ6j;tVBU;xwd7", "site": "https://openreview.net/forum?id=GU11Lbci5J", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;4;4", "correctness": "3;4;2;3", "technical_novelty": "1;4;2;3", "empirical_novelty": "1;3;2;0", "wc_summary_paper": "62;75;47;14", "wc_summary_review": "30;41;6;7", "wc_main_review": "240;167;206;198", "wc_review": "332;283;259;219", "wc_reply_reviewers": "280;0;207;0", "wc_reply_authors": "1536;35;1445;745", "reply_reviewers": "1;0;2;0", "reply_authors": "3;1;3;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 49.5, 22.76510487566442 ], "wc_summary_review_avg": [ 21.0, 15.016657417681207 ], "wc_main_review_avg": [ 202.75, 25.974747351995553 ], "wc_review_avg": [ 273.25, 40.90461465409496 ], "wc_reply_reviewers_avg": [ 121.75, 
124.45556435933268 ], "wc_reply_authors_avg": [ 940.25, 605.6588870808386 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1130122642364561914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Boston University;George Mason University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://www.gmu.edu", "aff_unique_abbr": "BU;GMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "SimVLM: Simple Visual Language Model Pretraining with Weak Supervision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6262", "id": "GUrhfTuf_3", "poster": "", "openreview": "https://openreview.net/forum?id=GUrhfTuf_3", "slides": "https://iclr.cc/virtual/2022/poster/6262", "video": "https://iclr.cc/virtual/2022/poster/6262", "author_site": "Zirui Wang, Jiahui Yu, Wei Yu, Zihang Dai, Yulia Tsvetkov, Yuan Cao", "tldr": "", "abstract": "With recent progress in joint modeling of visual and textual representations, Vision-Language Pretraining (VLP) has achieved impressive performance on many multimodal downstream tasks. However, the requirement for expensive annotations including clean image captions and regional labels limits the scalability of existing approaches, and complicates the pretraining procedure with the introduction of multiple dataset-specific objectives. In this work, we relax these constraints and present a minimalist pretraining framework, named Simple Visual Language Model (SimVLM). Unlike prior work, SimVLM reduces the training complexity by exploiting large-scale weak supervision, and is trained end-to-end with a single prefix language modeling objective. Without utilizing extra data or task-specific customization, the resulting model significantly outperforms previous pretraining methods and achieves new state-of-the-art results on a wide range of discriminative and generative vision-language benchmarks, including VQA (+3.74% vqa-score), NLVR2 (+1.17% accuracy), SNLI-VE (+1.37% accuracy) and image captioning tasks (+10.1% average CIDEr score). 
Furthermore, we demonstrate that SimVLM acquires strong generalization and transfer ability, enabling zero-shot behavior including open-ended visual question answering and cross-modality transfer.", "keywords": "Vision-Language Pretraining;Multimodal Language Model;Weak Supervision", "primary_area": "", "supplementary_material": "", "author": "Zirui Wang;Jiahui Yu;Adams Wei Yu;Zihang Dai;Yulia Tsvetkov;Yuan Cao", "authorids": "~Zirui_Wang1;~Jiahui_Yu1;~Adams_Wei_Yu1;~Zihang_Dai1;~Yulia_Tsvetkov1;~Yuan_Cao2", "gender": "M;M;M;M;F;M", "homepage": ";http://jiahuiyu.com/;https://adamsyu.github.io/;http://zihangdai.github.io/;https://homes.cs.washington.edu/~yuliats/;", "dblp": ";185/1060;65/10635;;75/8157;52/4472-7.html", "google_scholar": "GgD-B68AAAAJ;-CLCMk4AAAAJ;-hW6cvgAAAAJ;uZqsVXkAAAAJ;SEDPkrsAAAAJ;Q82vvqcAAAAJ", "orcid": ";;;;0000-0002-4634-7128;0000-0002-1267-8930", "linkedin": ";jiahuiyuu/;;;;", "or_profile": "~Zirui_Wang1;~Jiahui_Yu1;~Adams_Wei_Yu1;~Zihang_Dai1;~Yulia_Tsvetkov1;~Yuan_Cao2", "aff": "Google Brain;Google Brain;Google Brain;Google;Department of Computer Science, University of Washington;Google DeepMind", "aff_domain": "google.com;google.com;google.com;google.com;cs.washington.edu;google.com", "position": "Research Scientist;Research Scientist;Research Scientist;Research Scientist;Assistant Professor;Research scientist", "bibtex": "@inproceedings{\nwang2022simvlm,\ntitle={Sim{VLM}: Simple Visual Language Model Pretraining with Weak Supervision},\nauthor={Zirui Wang and Jiahui Yu and Adams Wei Yu and Zihang Dai and Yulia Tsvetkov and Yuan Cao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=GUrhfTuf_3}\n}", "github": "", "project": "", "reviewers": "PHG8;wzgn;Z7A6", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "5;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "82;48;65", "wc_summary_review": "113;34;22", "wc_main_review": "686;343;526", "wc_review": "881;425;613", "wc_reply_reviewers": "0;0;60", "wc_reply_authors": "712;467;516", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 65.0, 13.880441875771343 ], "wc_summary_review_avg": [ 56.333333333333336, 40.36775390773625 ], "wc_main_review_avg": [ 518.3333333333334, 140.13406279543727 ], "wc_review_avg": [ 639.6666666666666, 187.1137503112894 ], "wc_reply_reviewers_avg": [ 20.0, 28.284271247461902 ], "wc_reply_authors_avg": [ 565.0, 105.85209807399505 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 918, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9618435703828650575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=GUrhfTuf_3", "email": "google.com;google.com;google.com;google.com;cs.washington.edu;google.com", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Google;University of Washington", "aff_unique_dep": "Google Brain;Department of Computer Science", 
"aff_unique_url": "https://brain.google.com;https://www.washington.edu", "aff_unique_abbr": "Google Brain;UW", "aff_campus_unique_index": "0;0;0;0;1", "aff_campus_unique": "Mountain View;Seattle;", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "GVDwiINkMR", "title": "Picking Daisies in Private: Federated Learning from Small Datasets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning allows multiple parties to collaboratively train a joint model without sharing local data. This enables applications of machine learning in settings of inherently distributed, undisclosable data such as in the medical domain. In practice, joint training is usually achieved by aggregating local models, for which local training objectives have to be in expectation similar to the joint (global) objective. Often, however, local datasets are so small that local objectives differ greatly from the global objective, resulting in federated learning to fail. We propose a novel approach that intertwines model aggregations with permutations of local models. The permutations expose each local model to a daisy chain of local datasets resulting in more efficient training in data-sparse domains. This enables training on extremely small local datasets, such as patient data across hospitals, while retaining the training efficiency and privacy benefits of federated learning.", "keywords": "federated learning;distributed;sparse data;daisy chain;small datasets", "primary_area": "", "supplementary_material": "/attachment/76c837824bfe3e0527248d518f2b93094717f3a1.zip", "author": "Michael Kamp;Jonas Fischer;Jilles Vreeken", "authorids": "~Michael_Kamp1;~Jonas_Fischer1;~Jilles_Vreeken2", "gender": "M;;M", "homepage": "http://michaelkamp.org;;https://vreeken.eu", "dblp": "133/7744;;94/6462", "google_scholar": "https://scholar.google.de/citations?user=8R5jbvQAAAAJ;;p5HEQfIAAAAJ", "orcid": "0000-0001-6231-0694;;0000-0002-2310-2806", "linkedin": "michael-kamp-29096a95/;;jilles-vreeken-b3b05b58/", "or_profile": "~Michael_Kamp1;~Jonas_Fischer1;~Jilles_Vreeken2", "aff": "Institute for AI in Medicine IKIM;;Max-Planck Institute for Informatics", "aff_domain": "uk-essen.de;;mpi-inf.mpg.de", "position": "Research Group Leader;;Senior Researcher", "bibtex": "@misc{\nkamp2022picking,\ntitle={Picking Daisies in Private: Federated Learning from Small Datasets},\nauthor={Michael Kamp and Jonas Fischer and Jilles Vreeken},\nyear={2022},\nurl={https://openreview.net/forum?id=GVDwiINkMR}\n}", "github": "", "project": "", "reviewers": "pbKL;U9P6;oYPW;hTem", "site": "https://openreview.net/forum?id=GVDwiINkMR", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "59;174;109;89", "wc_summary_review": "57;47;50;58", "wc_main_review": "212;457;571;198", "wc_review": "328;678;730;345", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.75, 42.18634257671551 ], "wc_summary_review_avg": [ 53.0, 4.636809247747852 ], "wc_main_review_avg": [ 359.5, 159.74745694376483 ], "wc_review_avg": [ 520.25, 184.76522264755346 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZEEsP0GEzKcJ:scholar.google.com/&scioq=Picking+Daisies+in+Private:+Federated+Learning+from+Small+Datasets&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Institute for AI in Medicine;Max-Planck Institute for Informatics", "aff_unique_dep": "AI in Medicine;", "aff_unique_url": ";https://mpi-inf.mpg.de", "aff_unique_abbr": "IKIM;MPII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Unknown;Germany" }, { "title": "DictFormer: Tiny Transformer with Shared Dictionary", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6092", "id": "GWQWAeE9EpB", "poster": "", "openreview": "https://openreview.net/forum?id=GWQWAeE9EpB", "slides": "https://iclr.cc/virtual/2022/poster/6092", "video": "https://iclr.cc/virtual/2022/poster/6092", "author_site": "Qian Lou, Ting Hua, Yen-Chang Hsu, Yilin Shen, Hongxia Jin", "tldr": "", "abstract": "We introduce DictFormer with the efficient shared dictionary to provide a compact, fast, and accurate transformer model. DictFormer significantly reduces the redundancy in the transformer's parameters by replacing the prior transformer's parameters with a compact, shared dictionary, few unshared coefficients, and indices. Also, DictFormer enables faster computations since expensive weights multiplications are converted into cheap shared look-ups on dictionary and few linear projections. Training dictionary and coefficients are not trivial since indices used for looking up dictionary are not differentiable. We adopt a sparse-constraint training with $l_1\\,\\,norm$ relaxation to learn coefficients and indices in DictFormer. DictFormer is flexible to support different model sizes by dynamically changing dictionary size. Compared to existing lightweight Transformers, DictFormer consistently reduces model size over Transformer on multiple tasks, e.g., machine translation, abstractive summarization, and language modeling. Extensive experiments show that DictFormer reduces $3.6\\times$ to $8.9\\times$ model size with similar accuracy over multiple tasks, compared to Transformer. 
", "keywords": "Transformer;Parameters Sharing;Tiny;On-device Transformer;Machine Translation;Attention;Dictionary Sharing", "primary_area": "", "supplementary_material": "", "author": "Qian Lou;Ting Hua;Yen-Chang Hsu;Yilin Shen;Hongxia Jin", "authorids": "~Qian_Lou1;ting.hua@samsung.com;~Yen-Chang_Hsu1;~Yilin_Shen1;~Hongxia_Jin1", "gender": "M;;M;M;", "homepage": "https://qlou.org;;;;", "dblp": "207/3962.html;;172/1140;30/383;", "google_scholar": "SBYgXLoAAAAJ;;7QWAiigAAAAJ;9PSFMzAAAAAJ;", "orcid": ";;;;", "linkedin": ";;yenchanghsu/;;", "or_profile": "~Qian_Lou1;ting.hua@samsung.com;~Yen-Chang_Hsu1;~Yilin_Shen1;~Hongxia_Jin1", "aff": "Samsung;;Samsung Research America;Samsung Research America;", "aff_domain": "samsung.com;;samsung.com;gmail.com;", "position": "Research Scientist;;Research Scientist;Principal Researcher;", "bibtex": "@inproceedings{\nlou2022dictformer,\ntitle={DictFormer: Tiny Transformer with Shared Dictionary},\nauthor={Qian Lou and Ting Hua and Yen-Chang Hsu and Yilin Shen and Hongxia Jin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=GWQWAeE9EpB}\n}", "github": "", "project": "", "reviewers": "uPqt;jV7A;EmuN;aKni", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;2;5", "correctness": "4;3;3;3", "technical_novelty": "3;4;3;2", "empirical_novelty": "3;4;3;2", "wc_summary_paper": "62;79;47;101", "wc_summary_review": "54;101;14;38", "wc_main_review": "170;525;237;544", "wc_review": "286;705;298;683", "wc_reply_reviewers": "13;283;92;1102", "wc_reply_authors": "323;1355;754;2339", "reply_reviewers": "1;2;1;4", "reply_authors": "3;4;3;8", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 72.25, 20.09197601033806 ], "wc_summary_review_avg": [ 51.75, 31.799174517587716 ], "wc_main_review_avg": [ 369.0, 167.32154672964268 ], "wc_review_avg": [ 493.0, 201.19517886867965 ], "wc_reply_reviewers_avg": [ 372.5, 432.46416036476364 ], "wc_reply_authors_avg": [ 1192.75, 756.5019415044485 ], "reply_reviewers_avg": [ 2.0, 1.224744871391589 ], "reply_authors_avg": [ 4.5, 2.0615528128088303 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9598071575760572921&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=GWQWAeE9EpB", "email": "samsung.com;;samsung.com;gmail.com;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung", "aff_unique_url": "https://www.samsung.com", "aff_unique_abbr": "Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "South Korea;United States" }, { "id": "GdPZJxjk46V", "title": "Dataset transformations trade-offs to adapt machine learning methods across domains", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning-based methods have been proved to be quite successful in different domains. However, applying the same techniques across disciplines is not a trivial task with benefits and drawbacks. 
In the literature, the most common approach is to convert a dataset into the same format as the original domain to employ the same architecture that was successful in the original domain. Although this approach is fast and convenient, we argue it is suboptimal due to the lack of tailoring to the specific problem at hand. To prove our point, we examine dataset transformations used in the literature to adapt machine learning-based methods across domains and show that these dataset transformations are not always beneficial in terms of performance. In addition, we show that these data transformations open the door to unforeseen vulnerabilities in the new target domain. To quantify how different the original dataset is with respect to the transformed one, we compute the dataset distances via Optimal Transport. Also, we present simulations with the original and transformed data to show that the data conversion is not always needed and can expose the new domain to unforeseen threats.", "keywords": "Datasets;multiple domains;cyber-attacks;optimal transport", "primary_area": "", "supplementary_material": "/attachment/f348055bf16d0efb5dd41dda0178bd6b62abffd5.zip", "author": "Napoleon Costilla-Enriquez;Yang Weng", "authorids": "~Napoleon_Costilla-Enriquez1;~Yang_Weng1", "gender": "M;", "homepage": "http://www.public.asu.edu/~yweng2/group.html;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Napoleon_Costilla-Enriquez1;~Yang_Weng1", "aff": "Arizona State University;", "aff_domain": "asu.edu;", "position": "PhD student;", "bibtex": "@misc{\ncostilla-enriquez2022dataset,\ntitle={Dataset transformations trade-offs to adapt machine learning methods across domains},\nauthor={Napoleon Costilla-Enriquez and Yang Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=GdPZJxjk46V}\n}", "github": "", "project": "", "reviewers": "GQQh;JxT2;57HV", "site": "https://openreview.net/forum?id=GdPZJxjk46V", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "4;5;4", "correctness": "2;3;2", "technical_novelty": "1;1;1", "empirical_novelty": "1;2;1", "wc_summary_paper": "63;61;54", "wc_summary_review": "15;71;37", "wc_main_review": "100;431;197", "wc_review": "178;563;288", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 59.333333333333336, 3.8586123009300755 ], "wc_summary_review_avg": [ 41.0, 23.03620339089466 ], "wc_main_review_avg": [ 242.66666666666666, 138.93483684407192 ], "wc_review_avg": [ 343.0, 161.9156158826772 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NI_DGjzqaEUJ:scholar.google.com/&scioq=Dataset+transformations+trade-offs+to+adapt+machine+learning+methods+across+domains&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", 
"aff_unique_abbr": "ASU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "GesLOTU_r23", "title": "Gradient Explosion and Representation Shrinkage in Infinite Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study deep fully-connected neural networks using the mean field formalism,\nand carry out a non-perturbative analysis of signal propagation. As a result, we\ndemonstrate that increasing the depth leads to gradient explosion or to another\nundesirable phenomenon we call representation shrinkage. The appearance of at\nleast one of these problems is not restricted to a specific initialization scheme or\na choice of activation function, but rather is an inherent property of the fully-\nconnected architecture itself. Additionally, we show that many popular normal-\nization techniques fail to mitigate these problems. Our method can also be applied\nto residual networks to guide the choice of initialization variances.", "keywords": "deep learning theory;mean-field approximation", "primary_area": "", "supplementary_material": "", "author": "Adam Klukowski", "authorids": "~Adam_Klukowski1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "adam-klukowski-361958193/", "or_profile": "~Adam_Klukowski1", "aff": "Huawei Technologies Ltd.", "aff_domain": "huawei.com", "position": "Researcher", "bibtex": "@misc{\nklukowski2022gradient,\ntitle={Gradient Explosion and Representation Shrinkage in Infinite Networks},\nauthor={Adam Klukowski},\nyear={2022},\nurl={https://openreview.net/forum?id=GesLOTU_r23}\n}", "github": "", "project": "", "reviewers": "W3CL;PiMb;vYPS;ZVUC;Vkqm", "site": "https://openreview.net/forum?id=GesLOTU_r23", "pdf_size": 0, "recommendation": "3;5;5;5;8", "confidence": "3;3;4;4;3", "correctness": "2;3;4;2;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;3;0;3", "wc_summary_paper": "82;105;62;55;45", "wc_summary_review": "145;80;38;37;38", "wc_main_review": "445;228;535;633;198", "wc_review": "672;413;635;725;281", "wc_reply_reviewers": "0;0;0;214;21", "wc_reply_authors": "303;749;454;1012;250", "reply_reviewers": "0;0;0;3;1", "reply_authors": "1;1;1;4;1", "recommendation_avg": [ 5.2, 1.6 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 69.8, 21.36726468221892 ], "wc_summary_review_avg": [ 67.6, 42.03141682123029 ], "wc_main_review_avg": [ 407.8, 170.07221995375963 ], "wc_review_avg": [ 545.2, 169.55754185526516 ], "wc_reply_reviewers_avg": [ 47.0, 83.89517268591798 ], "wc_reply_authors_avg": [ 553.6, 287.47354660907496 ], "reply_reviewers_avg": [ 0.8, 1.1661903789690602 ], "reply_authors_avg": [ 1.6, 1.2 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.10206207261596574, "corr_recommendation_correctness": 0.36748420762958356, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J6De6jTEUzkJ:scholar.google.com/&scioq=Gradient+Explosion+and+Representation+Shrinkage+in+Infinite+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "GgIq3pALeHW", "title": "UAE-PUPET: An 
Uncertainty-Autoencoder-Based Privacy and Utility Preserving End-to-End Transformation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a new framework that deals with the privacy-utility tradeoff problem under two centralized settings: a dynamic setting and a constant setting. The dynamic setting corresponds to the min-max two-player game whereas the constant setting corresponds to a generator which tries to outperform an adversary already trained using ground truth data. In both settings, we use the same architecture consisting of a generator and a discriminator, where the generator consists of an encoder-decoder pair, and the discriminator consists of an adversary and a utility provider. Unlike previous research considering this kind of architecture, which leverage variational autoencoders (VAEs) based on learning a latent representation which is forced into a Gaussian assumption, our proposed technique removes the Gaussian assumption restriction on the latent variables, and only focuses on the end-to-end stochastic mapping of the input to privatized data. We also show that testing the privacy mechanism against a single adversary is usually not sufficient to capture the leakage of private information, as better adversaries can always be created by training under different conditions. Therefore, we test our proposed mechanism under five different types of adversary models. To compare privacy mechanisms under a fair framework, we propose a new metric called the Utility-Privacy Tradeoff (UPT) curve, obtained by using the upper convex hull of the utility-privacy tradeoff operation points achievable under the most powerful of the five adversary models. Finally, we test our framework on four different datasets: MNIST, Fashion MNIST, UCI Adult and US Census Demographic Data, providing a wide range of possible private and utility attributes. Through comparative analysis, our results show better privacy and utility guarantees, under our more rigorous adversary model, than the existing works, even when the latter are considered under their original restrictive single-adversary models. 
", "keywords": "privacy;privacy utility tradeoff;autoencoders;deep learning;machine learning;adversary;privacy utility metric;gan;game theory;min-max;variational autoencoder", "primary_area": "", "supplementary_material": "", "author": "Bishwas Mandal;George Amariucai;Shuangqing Wei", "authorids": "~Bishwas_Mandal1;~George_Amariucai1;~Shuangqing_Wei1", "gender": ";Not Specified;M", "homepage": ";http://people.cs.ksu.edu/~amariucai/;https://www.ece.lsu.edu/swei/", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Bishwas_Mandal1;~George_Amariucai1;~Shuangqing_Wei1", "aff": ";Kansas State University;Louisiana State University", "aff_domain": ";k-state.edu;lsu.edu", "position": ";Associate Professor;Full Professor", "bibtex": "@misc{\nmandal2022uaepupet,\ntitle={{UAE}-{PUPET}: An Uncertainty-Autoencoder-Based Privacy and Utility Preserving End-to-End Transformation},\nauthor={Bishwas Mandal and George Amariucai and Shuangqing Wei},\nyear={2022},\nurl={https://openreview.net/forum?id=GgIq3pALeHW}\n}", "github": "", "project": "", "reviewers": "EHuJ;iPzM;mRLK", "site": "https://openreview.net/forum?id=GgIq3pALeHW", "pdf_size": 0, "recommendation": "1;3;6", "confidence": "5;3;3", "correctness": "2;4;4", "technical_novelty": "1;2;3", "empirical_novelty": "1;3;3", "wc_summary_paper": "95;159;43", "wc_summary_review": "36;63;44", "wc_main_review": "401;231;151", "wc_review": "532;453;238", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 99.0, 47.44119166575252 ], "wc_summary_review_avg": [ 47.666666666666664, 11.323525167642018 ], "wc_main_review_avg": [ 261.0, 104.24330514074593 ], "wc_review_avg": [ 407.6666666666667, 124.23186207875803 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.802955068546966, "corr_recommendation_correctness": 0.802955068546966, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VEDebW7XUQ4J:scholar.google.com/&scioq=UAE-PUPET:+An+Uncertainty-Autoencoder-Based+Privacy+and+Utility+Preserving+End-to-End+Transformation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Kansas State University;Louisiana State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.k-state.edu;https://www.lsu.edu", "aff_unique_abbr": "K-State;LSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "GgOEm9twFO_", "title": "PhaseFool: Phase-oriented Audio Adversarial Examples via Energy Dissipation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Audio adversarial attacks design perturbations onto inputs that lead an automatic speech recognition (ASR) model to predict incorrect outputs. Current audio adversarial attacks optimize perturbations with different constraints (e.g. lp-norm for waveform or the principle of auditory masking for magnitude spectrogram) to achieve their imperceptibility. 
Since phase is not relevant for speech recognition, the existing audio adversarial attacks neglect the influence of the phase spectrogram. In this work, we propose a novel phase-oriented algorithm named PhaseFool that can efficiently construct imperceptible audio adversarial examples with energy dissipation. Specifically, we leverage the spectrogram consistency of the short-time Fourier transform (STFT) to adversarially transfer phase perturbations to the adjacent frames of the magnitude spectrogram and dissipate the energy that is crucial for ASR systems. Moreover, we propose a weighted loss function to improve the imperceptibility of PhaseFool. Experimental results demonstrate that PhaseFool can inherently generate full-sentence imperceptible audio adversarial examples with a 100% targeted success rate within 500 steps on average (a 9.24x speed-up over current state-of-the-art imperceptible counterparts), which is verified through a human study. Most importantly, our PhaseFool is the first to exploit phase-oriented energy dissipation in audio adversarial examples rather than adding perturbations to the audio waveform as most previous works do.", "keywords": "Audio adversarial examples;audio adversarial attacks;automatic speech recognition", "primary_area": "", "supplementary_material": "/attachment/4f061680b5a55a5d43720b8020bf5ea3e185ab2a.zip", "author": "Ziyue Jiang;Yi Ren;Zhou Zhao", "authorids": "~Ziyue_Jiang1;~Yi_Ren2;~Zhou_Zhao2", "gender": "M;M;M", "homepage": ";https://rayeren.github.io/;https://dblp.uni-trier.de/pid/75/7785.html?", "dblp": "258/6865;75/6568-6;75/7785", "google_scholar": "wDgSBssAAAAJ;4FA6C0AAAAAJ;https://scholar.google.com.hk/citations?user=IIoFY90AAAAJ", "orcid": ";;0000-0001-6121-0384", "linkedin": ";;", "or_profile": "~Ziyue_Jiang1;~Yi_Ren2;~Zhou_Zhao2", "aff": "Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "PhD student;MS student;Associate Professor", "bibtex": "@misc{\njiang2022phasefool,\ntitle={PhaseFool: Phase-oriented Audio Adversarial Examples via Energy Dissipation},\nauthor={Ziyue Jiang and Yi Ren and Zhou Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=GgOEm9twFO_}\n}", "github": "", "project": "", "reviewers": "rkdE;sPgr;BtbA;vUjU", "site": "https://openreview.net/forum?id=GgOEm9twFO_", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "3;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "66;57;56;109", "wc_summary_review": "67;53;41;24", "wc_main_review": "973;95;494;86", "wc_review": "1106;205;591;219", "wc_reply_reviewers": "590;0;184;36", "wc_reply_authors": "2662;837;1541;266", "reply_reviewers": "1;0;2;1", "reply_authors": "5;3;4;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 72.0, 21.714050750608465 ], "wc_summary_review_avg": [ 46.25, 15.801503093060482 ], "wc_main_review_avg": [ 412.0, 363.3902860561906 ], "wc_review_avg": [ 530.25, 366.6888156189114 ], "wc_reply_reviewers_avg": [ 202.5, 234.10841505593086 ], "wc_reply_authors_avg": [ 1326.5, 893.5660300168086 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.25, 1.479019945774904 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 
0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LN09GNCre3wJ:scholar.google.com/&scioq=PhaseFool:+Phase-oriented+Audio+Adversarial+Examples+via+Energy+Dissipation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Effect of scale on catastrophic forgetting in neural networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6452", "id": "GhVS8_yPeEa", "poster": "", "openreview": "https://openreview.net/forum?id=GhVS8_yPeEa", "slides": "https://iclr.cc/virtual/2022/poster/6452", "video": "https://iclr.cc/virtual/2022/poster/6452", "author_site": "Vinay Ramasesh, Aitor Lewkowycz, Ethan Dyer", "tldr": "", "abstract": "Catastrophic forgetting presents a challenge in developing deep learning models capable of continual learning, i.e. learning tasks sequentially. Recently, both computer vision and natural-language processing have witnessed great progress through the use of large-scale pretrained models. In this work, we present an empirical study of catastrophic forgetting in this pretraining paradigm.\nOur experiments indicate that large, pretrained ResNets and Transformers are significantly more resistant to forgetting than randomly-initialized, trained-from-scratch models; this robustness systematically improves with scale of both model and pretraining dataset size.\nWe take initial steps towards characterizing what aspect of model representations allows them to perform continual learning so well, finding that in the pretrained models, distinct class representations grow more orthogonal with scale. Our results suggest that, when possible, scale and a diverse pretraining dataset can be useful ingredients in mitigating catastrophic forgetting. 
", "keywords": "Catastrophic forgetting;continual learning;scaling;language modeling;image classification", "primary_area": "", "supplementary_material": "", "author": "Vinay Venkatesh Ramasesh;Aitor Lewkowycz;Ethan Dyer", "authorids": "~Vinay_Venkatesh_Ramasesh1;~Aitor_Lewkowycz2;~Ethan_Dyer1", "gender": "M;M;M", "homepage": "http://ramasesh.github.io;https://scholar.google.com/citations?user=Yum1ah0AAAAJ&hl=en&authuser=1;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Vinay_Venkatesh_Ramasesh1;~Aitor_Lewkowycz2;~Ethan_Dyer1", "aff": ";Google;Google", "aff_domain": ";google.com;google.com", "position": ";Postdoc;Staff", "bibtex": "@inproceedings{\nramasesh2022effect,\ntitle={Effect of scale on catastrophic forgetting in neural networks},\nauthor={Vinay Venkatesh Ramasesh and Aitor Lewkowycz and Ethan Dyer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=GhVS8_yPeEa}\n}", "github": "", "project": "", "reviewers": "ZtJq;Tmvz;Dwjt;r2yB", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;4;4;5", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "61;129;156;30", "wc_summary_review": "63;67;78;24", "wc_main_review": "624;215;1105;468", "wc_review": "748;411;1339;522", "wc_reply_reviewers": "0;0;182;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 94.0, 50.631018160807315 ], "wc_summary_review_avg": [ 58.0, 20.38381711063951 ], "wc_main_review_avg": [ 603.0, 324.50500766552125 ], "wc_review_avg": [ 755.0, 358.3748038018298 ], "wc_reply_reviewers_avg": [ 45.5, 78.80831174438391 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 204, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8846119472466252006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=GhVS8_yPeEa", "email": ";google.com;google.com", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "GiddFXGDmqp", "title": "Spatially Invariant Unsupervised 3D Object-Centric Learning and Scene Decomposition", "track": "main", "status": "Reject", "tldr": "", "abstract": "We tackle the problem of deep object-centric learning from a point cloud which is crucial for high-level relational reasoning and scalable machine intelligence. \nIn particular, we introduce a framework, SPAIR3D, to factorize a 3D point cloud into a spatial mixture model where each component corresponds to one object. \nTo model the spatial mixture model on point clouds, we derive the Chamfer Mixture Loss, which fits naturally into our variational training pipeline. 
Moreover, we adopt an object-specification scheme that describes each object's location relative to its local voxel grid cell. \nSuch a scheme allows SPAIR3D to model scenes with an arbitrary number of objects. \nWe evaluate our method on the task of unsupervised scene decomposition.\nExperimental results demonstrate that SPAIR3D has strong scalability and is capable of detecting and segmenting an unknown number of objects from a point cloud in an unsupervised manner.", "keywords": "generative model;variational autoencoder;mixture model;unsupervised object centric learning", "primary_area": "", "supplementary_material": "/attachment/d0ba17c133e44841d0ef1234ef9e394db6c25474.zip", "author": "Tianyu Wang;miaomiao Liu;Kee Siong Ng", "authorids": "~Tianyu_Wang5;~miaomiao_Liu2;~Kee_Siong_Ng2", "gender": "M;F;", "homepage": "https://cecs.anu.edu.au/people/tianyu-wang;http://users.cecs.anu.edu.au/~mliu/;", "dblp": ";66/8063-1.html;", "google_scholar": ";https://scholar.google.com.au/citations?user=ptAR7tUAAAAJ;https://scholar.google.com.au/citations?user=4bL3ThUAAAAJ", "orcid": "0000-0001-9032-8488;;", "linkedin": ";;", "or_profile": "~Tianyu_Wang5;~miaomiao_Liu2;~Kee_Siong_Ng2", "aff": "Australian National University;Australian National University;Australian National University", "aff_domain": "anu.edu.au;anu.edu.au;anu.edu.au", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@misc{\nwang2022spatially,\ntitle={Spatially Invariant Unsupervised 3D Object-Centric Learning and Scene Decomposition},\nauthor={Tianyu Wang and miaomiao Liu and Kee Siong Ng},\nyear={2022},\nurl={https://openreview.net/forum?id=GiddFXGDmqp}\n}", "github": "", "project": "", "reviewers": "oa19;tBSM;PtZ1;orM7;yyA6", "site": "https://openreview.net/forum?id=GiddFXGDmqp", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "4;4;2;4;5", "correctness": "3;3;3;4;3", "technical_novelty": "2;3;2;3;4", "empirical_novelty": "2;3;2;3;3", "wc_summary_paper": "48;65;108;60;99", "wc_summary_review": "74;46;30;65;26", "wc_main_review": "120;219;247;382;746", "wc_review": "242;330;385;507;871", "wc_reply_reviewers": "0;203;179;417;275", "wc_reply_authors": "1101;645;261;1145;1510", "reply_reviewers": "0;1;1;3;1", "reply_authors": "2;2;1;3;3", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 3.8, 0.9797958971132712 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 76.0, 23.29806858947754 ], "wc_summary_review_avg": [ 48.2, 18.850994668717085 ], "wc_main_review_avg": [ 342.8, 218.28916601608975 ], "wc_review_avg": [ 467.0, 219.51491976628833 ], "wc_reply_reviewers_avg": [ 214.8, 135.69731021652566 ], "wc_reply_authors_avg": [ 932.4, 433.7333743211375 ], "reply_reviewers_avg": [ 1.2, 0.9797958971132712 ], "reply_authors_avg": [ 2.2, 0.7483314773547882 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5833333333333333, "corr_recommendation_correctness": 0.6123724356957945, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6696966827415481399&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "Australian National University", "aff_unique_dep": "", "aff_unique_url": "https://www.anu.edu.au", "aff_unique_abbr": "ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "Australia" }, { "id": "GlN8MUkciwi", "title": "Learning Context-Adapted Video-Text Retrieval by Attending to User Comments", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning strong representations for multi-modal retrieval is an important problem for many applications, such as recommendation and search. Current benchmarks and even datasets are often manually constructed and consist of mostly clean samples where all modalities are well-correlated with the content. Thus, current video-text retrieval literature largely focuses on video titles or audio transcripts, while ignoring user comments, since users often tend to discuss topics only vaguely related to the video.\nIn this paper we present a novel method that learns meaningful representations from videos, titles and comments, which are abundant on the internet. Due to the nature of user comments, we introduce an attention-based mechanism that allows the model to disregard text with irrelevant content. \nIn our experiments, we demonstrate that, by using comments, our method is able to learn better, more contextualised, representations, while also achieving competitive results on standard video-text retrieval benchmarks.\n", "keywords": "Multimodal Representation Learning;Video;Text;Retrieval;User Comments", "primary_area": "", "supplementary_material": "/attachment/e3089fce59e4ee822d3f965672c4a10abe64d2ff.zip", "author": "Laura Hanu;Yuki M Asano;James Thewlis;Christian Rupprecht", "authorids": "~Laura_Hanu1;~Yuki_M_Asano1;~James_Thewlis1;~Christian_Rupprecht1", "gender": "F;;M;M", "homepage": "https://laurahanu.github.io/;http://jamesthewlis.com;http://chrirupp.github.io;https://yukimasano.github.io/", "dblp": ";https://dblp.uni-trier.de/pers/hd/t/Thewlis:James;https://dblp.uni-trier.de/pid/76/744-1;239/8823", "google_scholar": "07hm3DYAAAAJ;https://scholar.google.co.uk/citations?user=UQHlWF4AAAAJ;https://scholar.google.de/citations?user=IrYlproAAAAJ;CdpLhlgAAAAJ", "orcid": ";0000-0001-8410-2570;;", "linkedin": "laura-hanu-a0941691/;jamesthewlis/;;", "or_profile": "~Laura_Hanu1;~James_Thewlis1;~Christian_Rupprecht1;~Yuki_Asano1", "aff": ";Unitary;University of Oxford;University of Amsterdam", "aff_domain": ";unitary.ai;ox.ac.uk;uva.nl", "position": ";CTO;Lecturer;Assistant Professor", "bibtex": "@misc{\nhanu2022learning,\ntitle={Learning Context-Adapted Video-Text Retrieval by Attending to User Comments},\nauthor={Laura Hanu and Yuki M Asano and James Thewlis and Christian Rupprecht},\nyear={2022},\nurl={https://openreview.net/forum?id=GlN8MUkciwi}\n}", "github": "", "project": "", "reviewers": "599p;PRNC;SsvR;2J7x", "site": "https://openreview.net/forum?id=GlN8MUkciwi", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;3", "correctness": "2;3;3;2", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "76;62;54;80", "wc_summary_review": "50;9;29;22", "wc_main_review": "561;747;204;312", "wc_review": "687;818;287;414", "wc_reply_reviewers": "0;0;6;0", "wc_reply_authors": "948;1822;666;718", "reply_reviewers": "0;0;1;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.0, 10.488088481701515 ], "wc_summary_review_avg": [ 27.5, 14.84082207965583 ], "wc_main_review_avg": [ 456.0, 212.10021216396743 ], "wc_review_avg": [ 551.5, 211.09772618386964 ], 
"wc_reply_reviewers_avg": [ 1.5, 2.598076211353316 ], "wc_reply_authors_avg": [ 1038.5, 464.63399574288576 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:42H1qPOX0UYJ:scholar.google.com/&scioq=Learning+Context-Adapted+Video-Text+Retrieval+by+Attending+to+User+Comments&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Unitary;University of Oxford;University of Amsterdam", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.ox.ac.uk;https://www.uva.nl", "aff_unique_abbr": ";Oxford;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2", "aff_country_unique": ";United Kingdom;Netherlands" }, { "id": "Gnh9rFw6ff0", "title": "What Makes for Good Representations for Contrastive Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive learning between different views of the data achieves outstanding success in the field of self-supervised representation learning and the learned representations are useful in various downstream tasks. Since all supervision information for one view comes from the other view, contrastive learning tends to obtain the minimal sufficient representation which contains the shared information and eliminates the non-shared information between views. Considering the diversity of the downstream tasks, it can not be guaranteed that all task-relevant information is shared between views. Therefore, we assume the task-relevant information that is not shared between views can not be ignored and theoretically prove that the minimal sufficient representation in contrastive learning is not sufficient for the downstream tasks, which causes performance degradation. This reveals a new problem that the contrastive learning models have the risk of over-fitting to the shared information between views. To alleviate this problem, we propose to increase the mutual information between the representation and input as regularization to approximately introduce more task-relevant information since we can not utilize any downstream task information during training. Extensive experiments verify the rationality of our analysis and the effectiveness of our method. 
It significantly improves the performance of several classic contrastive learning models in downstream tasks.", "keywords": "Unsupervised learning;Self-supervised learning;Contrastive learning;Minimal sufficient representation", "primary_area": "", "supplementary_material": "/attachment/88ba9cacd132a69548541eb7a6abc5a0d39aaac3.zip", "author": "Haoqing Wang;Xun Guo;Zhi-Hong Deng;Yan Lu", "authorids": "~Haoqing_Wang1;~Xun_Guo1;~Zhi-Hong_Deng1;~Yan_Lu7", "gender": "M;M;M;M", "homepage": ";;http://www.cis.pku.edu.cn/jzyg/szdw/dzh.htm;https://www.microsoft.com/en-us/research/people/yanlu/", "dblp": "251/8849;32/5851;161/4814-1;15/4830-1", "google_scholar": "A2kCYnUAAAAJ;Ow4R8-EAAAAJ;https://scholar.google.com.tw/citations?user=tRoAxlsAAAAJ;djk5l-4AAAAJ", "orcid": ";;0000-0002-0263-8142;0000-0001-5383-6424", "linkedin": ";;;", "or_profile": "~Haoqing_Wang1;~Xun_Guo1;~Zhi-Hong_Deng1;~Yan_Lu7", "aff": "Peking University;Microsoft Research Asia;Peking University;Microsoft Research Asia", "aff_domain": "pku.edu.cn;microsoft.com;pku.edu.cn;microsoft.com", "position": "PhD student;Principal Researcher;Full Professor;Partner Research Manager", "bibtex": "@misc{\nwang2022what,\ntitle={What Makes for Good Representations for Contrastive Learning},\nauthor={Haoqing Wang and Xun Guo and Zhi-Hong Deng and Yan Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=Gnh9rFw6ff0}\n}", "github": "", "project": "", "reviewers": "ajmM;dvdQ;Qxmr;BnTY", "site": "https://openreview.net/forum?id=Gnh9rFw6ff0", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "2;3;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "45;100;57;33", "wc_summary_review": "11;112;55;26", "wc_main_review": "407;246;623;179", "wc_review": "463;458;735;238", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;24;30", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;1;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 58.75, 25.282157740192982 ], "wc_summary_review_avg": [ 51.0, 38.60699418499192 ], "wc_main_review_avg": [ 363.75, 171.08386101558497 ], "wc_review_avg": [ 473.5, 176.2051361339958 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 13.5, 13.665650368716449 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.75, 0.82915619758885 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2021712767747382392&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Peking University;Microsoft", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Peking U;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "GoCNFW6Emb", "title": "VORTEX: Physics-Driven Data Augmentations for Consistency Training for Robust Accelerated MRI Reconstruction", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks have enabled improved image quality and fast inference times for various inverse problems, including accelerated 
magnetic resonance imaging (MRI) reconstruction. However, such models require large amounts of fully-sampled ground truth data, which are difficult to curate and are sensitive to distribution drifts. In this work, we propose applying physics-driven data augmentations for consistency training that leverage our domain knowledge of the forward MRI data acquisition process and MRI physics for improved data efficiency and robustness to clinically-relevant distribution drifts. Our approach, termed VORTEX, (1) demonstrates strong improvements over supervised baselines with and without augmentation in robustness to signal-to-noise ratio change and motion corruption in data-limited regimes; (2) considerably outperforms state-of-the-art data augmentation techniques that are purely image-based on both in-distribution and out-of-distribution data; and (3) enables composing heterogeneous image-based and physics-driven augmentations.", "keywords": "medical imaging;MRI;radiology;image reconstruction;inverse problems;distribution shift;robustness", "primary_area": "", "supplementary_material": "/attachment/e2bc9f5bd87f1304ee01c956561d69df1743231f.zip", "author": "Arjun D Desai;Beliz Gunel;Batu Ozturkler;Harris Beg;Shreyas Vasanawala;Brian Hargreaves;Christopher Re;John M. Pauly;Akshay Chaudhari", "authorids": "~Arjun_D_Desai1;~Beliz_Gunel1;~Batu_Ozturkler1;~Harris_Beg1;~Shreyas_Vasanawala1;~Brian_Hargreaves1;~Christopher_Re1;~John_M._Pauly1;~Akshay_Chaudhari1", "gender": ";F;;M;M;M;;M;", "homepage": ";http://web.stanford.edu/~bgunel/;https://batuozt.github.io;https://www.its.caltech.edu/~hbeg/;https://profiles.stanford.edu/shreyas-vasanawala;https://profiles.stanford.edu/brian-hargreaves;;http://www.stanford.edu/~pauly;", "dblp": ";206/6726;281/6970;;54/9775;;;95/6728;225/4729", "google_scholar": ";FckK6vUAAAAJ;O_tiFfoAAAAJ;;n9mOA2IAAAAJ;https://scholar.google.com/citations?authuser=1;;Fc6GIIQAAAAJ;08Y4NhMAAAAJ", "orcid": ";;;;;;;;", "linkedin": ";beliz-gunel-7252bb78/;;~harris/;;;;john-pauly-69805911/;", "or_profile": "~Arjun_D_Desai1;~Beliz_Gunel1;~Batu_Ozturkler1;~Harris_Beg1;~Shreyas_Vasanawala1;~Brian_Hargreaves1;~Christopher_Re1;~John_M._Pauly1;~Akshay_Chaudhari1", "aff": ";Stanford University;Microsoft;California Institute of Technology;Stanford University;Stanford University;;;Subtle Medical", "aff_domain": ";stanford.edu;microsoft.com;caltech.edu;stanford.edu;stanford.edu;;;subtlemedical.com", "position": ";PhD student;Intern;Undergrad student;Assistant Professor;Full Professor;;;Consultant", "bibtex": "@misc{\ndesai2022vortex,\ntitle={{VORTEX}: Physics-Driven Data Augmentations for Consistency Training for Robust Accelerated {MRI} Reconstruction},\nauthor={Arjun D Desai and Beliz Gunel and Batu Ozturkler and Harris Beg and Shreyas Vasanawala and Brian Hargreaves and Christopher Re and John M. 
Pauly and Akshay Chaudhari},\nyear={2022},\nurl={https://openreview.net/forum?id=GoCNFW6Emb}\n}", "github": "", "project": "", "reviewers": "Pyg2;BiXF;DjEt;ZVy8", "site": "https://openreview.net/forum?id=GoCNFW6Emb", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;5", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "184;92;90;75", "wc_summary_review": "37;59;140;54", "wc_main_review": "337;210;449;506", "wc_review": "558;361;679;635", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "880;1693;911;2038", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;2;4", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 110.25, 43.083494519363214 ], "wc_summary_review_avg": [ 72.5, 39.81519810323691 ], "wc_main_review_avg": [ 375.5, 113.25303527941315 ], "wc_review_avg": [ 558.25, 121.83877666818557 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1380.5, 500.2232001816789 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.7608859102526822, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8547825810086934268&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "Stanford University;Microsoft;California Institute of Technology;Subtle Medical", "aff_unique_dep": ";Microsoft Corporation;;", "aff_unique_url": "https://www.stanford.edu;https://www.microsoft.com;https://www.caltech.edu;https://www.subtlemedical.com", "aff_unique_abbr": "Stanford;Microsoft;Caltech;", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Stanford;;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Gpp1dfvZYYH", "title": "ProgFed: Effective, Communication, and Computation Efficient Federated Learning by Progressive Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning is a powerful distributed learning scheme that allows numerous edge devices to collaboratively train a model without sharing their data. However, training is resource-intensive for edge devices, and limited network bandwidth is often the main bottleneck. Prior work often overcomes the constraints by condensing the models or messages into compact formats, e.g., by gradient compression or distillation. In contrast, we propose ProgFed, the first progressive training framework for efficient and effective federated learning. It inherently reduces computation and two-way communication costs while maintaining the strong performance of the final models. We theoretically prove that ProgFed converges at the same asymptotic rate as standard training on full models. Extensive results on a broad range of architectures, including CNNs (VGG, ResNet, ConvNets) and U-nets, and diverse tasks from simple classification to medical image segmentation show that our highly effective training approach saves up to $20\\%$ computation and up to $63\\%$ communication costs for converged models. 
As our approach is also complementary to prior work on compression, we can achieve a wide range of trade-offs, showing reduced communication of up to $50\\times$ at only $0.1\\%$ loss in utility. ", "keywords": "federated learning;progressive learning", "primary_area": "", "supplementary_material": "/attachment/a46befdffaafe5ec0667499a13b8f3c35912bd2e.zip", "author": "Hui-Po Wang;Sebastian U Stich;Yang He;Mario Fritz", "authorids": "~Hui-Po_Wang1;~Sebastian_U_Stich1;yang.he@cispa.saarland;~Mario_Fritz1", "gender": "M;M;;M", "homepage": "https://hui-po-wang.github.io/;https://www.sstich.ch;;https://cispa.saarland/group/fritz/", "dblp": "237/0049;04/10549;;", "google_scholar": "UAnfs8UAAAAJ;https://scholar.google.ch/citations?user=8l-mDfQAAAAJ;;https://scholar.google.de/citations?user=4V1nNm4AAAAJ", "orcid": ";;;", "linkedin": "hui-po-wang-7a0158137/;;;", "or_profile": "~Hui-Po_Wang1;~Sebastian_U_Stich1;yang.he@cispa.saarland;~Mario_Fritz1", "aff": "CISPA Helmholtz Center for Information Security;CISPA Helmholtz Center for Information Security;;Saarland University", "aff_domain": "cispa.de;cispa.de;;uni-saarland.de", "position": "PhD student;Tenure Track Faculty;;Full Professor", "bibtex": "@misc{\nwang2022progfed,\ntitle={ProgFed: Effective, Communication, and Computation Efficient Federated Learning by Progressive Training},\nauthor={Hui-Po Wang and Sebastian U Stich and Yang He and Mario Fritz},\nyear={2022},\nurl={https://openreview.net/forum?id=Gpp1dfvZYYH}\n}", "github": "", "project": "", "reviewers": "3zVc;1N8v;3Huv;5a4S", "site": "https://openreview.net/forum?id=Gpp1dfvZYYH", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;5;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "173;34;138;218", "wc_summary_review": "80;4;42;55", "wc_main_review": "511;167;596;483", "wc_review": "764;205;776;756", "wc_reply_reviewers": "305;145;0;97", "wc_reply_authors": "842;1146;552;760", "reply_reviewers": "1;1;0;1", "reply_authors": "2;3;2;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 140.75, 67.8431094511447 ], "wc_summary_review_avg": [ 45.25, 27.453369556395078 ], "wc_main_review_avg": [ 439.25, 162.5982395353652 ], "wc_review_avg": [ 625.25, 242.7358389278353 ], "wc_reply_reviewers_avg": [ 136.75, 110.29137545610718 ], "wc_reply_authors_avg": [ 825.0, 213.3565091577944 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14093452975120098193&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0;0;1", "aff_unique_norm": "CISPA Helmholtz Center for Information Security;Saarland University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cispa.de/;https://www.uni-saarland.de", "aff_unique_abbr": "CISPA;UdS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "GrFix2vWsh4", "title": "The hidden label-marginal biases of segmentation losses", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most segmentation losses are 
arguably variants of the Cross-Entropy (CE) or Dice losses. In the abundant segmentation literature, there is no clear consensus as to which of these losses is a better choice, with varying performances for each across different benchmarks and applications. In this work, we develop a theoretical analysis that links these two types of losses, exposing their advantages and weaknesses. First, we provide a constrained-optimization perspective showing that CE and Dice share a much deeper connection than previously thought: They both decompose into label-marginal penalties and closely related ground-truth matching penalties. Then, we provide bound relationships and an information-theoretic analysis, which uncover hidden label-marginal biases: Dice has an intrinsic bias towards specific extremely imbalanced solutions, whereas CE implicitly encourages the ground-truth region proportions. Our theoretical results explain the wide experimental evidence in the medical-imaging literature, whereby Dice losses bring improvements for imbalanced segmentation. They also explain why CE dominates natural-image problems with diverse class proportions, in which case Dice might have difficulty adapting to different label-marginal distributions. Based on our theoretical analysis, we propose a principled and simple solution, which enables explicit control of the label-marginal bias. Our loss integrates CE with explicit ${\\cal L}_1$ regularization, which encourages label marginals to match target class proportions, thereby mitigating class imbalance but without losing generality. Comprehensive experiments and ablation studies over different losses and applications validate our theoretical analysis, as well as the effectiveness of our explicit label-marginal regularizers.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9248077e14dd6d170efac7cd44aa684c29273979.zip", "author": "Bingyuan Liu;Jose Dolz;Adrian Galdran;Riadh Kobbi;Ismail Ben Ayed", "authorids": "~Bingyuan_Liu3;~Jose_Dolz1;~Adrian_Galdran1;rkobbi@diagnos.ca;~Ismail_Ben_Ayed1", "gender": "M;;;;M", "homepage": "https://by-liu.github.io/;https://josedolz.github.io;https://agaldran.github.io/;;https://profs.etsmtl.ca/ibenayed/", "dblp": "136/5447;165/8035;160/2676.html;;68/4478", "google_scholar": "jrWPhioAAAAJ;https://scholar.google.ca/citations?user=yHQIFFMAAAAJ;https://scholar.google.es/citations?user=VKx-rswAAAAJ;;https://scholar.google.ca/citations?user=29vyUccAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Bingyuan_Liu3;~Jose_Dolz1;~Adrian_Galdran1;rkobbi@diagnos.ca;~Ismail_Ben_Ayed1", "aff": "\u00c9cole de technologie sup\u00e9rieure;\u00c9cole de technologie sup\u00e9rieure;Universitat Pompeu Fabra;;\u00c9cole de technologie sup\u00e9rieure, Universit\u00e9 du Qu\u00e9bec", "aff_domain": "etsmtl.ca;etsmtl.ca;upf.edu;;etsmtl.ca", "position": "Research Associate;Associate Professor;Postdoc;;Full Professor", "bibtex": "@misc{\nliu2022the,\ntitle={The hidden label-marginal biases of segmentation losses},\nauthor={Bingyuan Liu and Jose Dolz and Adrian Galdran and Riadh Kobbi and Ismail Ben Ayed},\nyear={2022},\nurl={https://openreview.net/forum?id=GrFix2vWsh4}\n}", "github": "", "project": "", "reviewers": "3tvn;njoP;QsrX;P7HM", "site": "https://openreview.net/forum?id=GrFix2vWsh4", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;3;4;3", "correctness": "4;3;3;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "48;61;124;85", "wc_summary_review": "3;128;23;49", 
"wc_main_review": "84;430;383;156", "wc_review": "135;619;530;290", "wc_reply_reviewers": "0;1169;113;0", "wc_reply_authors": "736;2160;912;694", "reply_reviewers": "0;3;1;0", "reply_authors": "1;4;3;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 79.5, 28.91798748184251 ], "wc_summary_review_avg": [ 50.75, 47.488814472462884 ], "wc_main_review_avg": [ 263.25, 146.44004745970278 ], "wc_review_avg": [ 393.5, 191.71397966762882 ], "wc_reply_reviewers_avg": [ 320.5, 492.04903210960595 ], "wc_reply_authors_avg": [ 1125.5, 602.8422264573045 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7385489458759963, "corr_recommendation_correctness": -0.9428090415820632, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5402591632448925561&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "\u00c9cole de technologie sup\u00e9rieure;Universitat Pompeu Fabra;Universit\u00e9 du Qu\u00e9bec", "aff_unique_dep": ";;", "aff_unique_url": "https://www.etsmtl.ca;https://www.upf.edu/;https://www.etsmtl.ca", "aff_unique_abbr": "ETS;UPF;ETS", "aff_campus_unique_index": "1", "aff_campus_unique": ";\u00c9cole de technologie sup\u00e9rieure", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Canada;Spain" }, { "id": "GrJDb8KXPA3", "title": "FEATURE-AUGMENTED HYPERGRAPH NEURAL NETWORKS", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph neural networks (GNNs) and their variants have demonstrated superior performance in learning graph representations by aggregating features based on graph or hypergraph structures. However, it is becoming evident that most exist- ing graph-based GNNs are susceptible to over-smoothing and are non-robust to perturbations. For representation learning tasks, hypergraphs usually have more expressive power than graphs through their ability to encode higher-order data correlations. In this paper, we propose Feature-Augmented Hypergraph Neural Networks (FAHGNN) focusing on hypergraph structures. In FAHGNN, we explore the influence of node features for the expressive power of GNNs and augment features by introducing common features and personal features to model information. Specifically, for a node, the common features contain the shared information with other nodes in hyperedges, while the personal features represent its special information. In this way, the feature types each possess different distinguishing powers. Considering the different properties of these two kinds of features, we design different propagation strategies for information aggregation on hypergraphs. Furthermore, during the propagation process, we further augment features by randomly dropping node features. We leverage consistency regularization across different data augmentations of the two feature types to optimize the prediction consistency for the model. Extensive experiments on several benchmarks show that FAHGNN significantly outperforms other state-of-the-art methods for node classification tasks. 
Our theoretical study and experimental results further support the effectiveness of FAHGNN for mitigating issues of over-smoothing and enhancing the robustness of the model.", "keywords": "graph representation learning;hypergraph learning", "primary_area": "", "supplementary_material": "", "author": "Xueqi Ma;Pan Li;Qiong Cao;James Bailey;Yue Gao", "authorids": "~Xueqi_Ma1;~Pan_Li2;mathqiong2012@gmail.com;~James_Bailey1;~Yue_Gao4", "gender": "F;;;;M", "homepage": ";;;;http://www.gaoyue.org", "dblp": "194/4773;https://dblp.org/pers/hd/l/Li_0005:Pan;;;33/3099-2", "google_scholar": "https://scholar.google.com/citations?hl=en;IroP0EwAAAAJ;;;UTDfWocAAAAJ", "orcid": ";;;;", "linkedin": ";pan-li-b951105a/;;;", "or_profile": "~Xueqi_Ma1;~Pan_Li2;mathqiong2012@gmail.com;~James_Bailey1;~Yue_Gao4", "aff": "University of Melbourne;Purdue University;;;Tsinghua University", "aff_domain": "unimelb.edu;purdue.edu;;;tsinghua.edu.cn", "position": "PhD student;Assistant Professor;;;Associate Professor", "bibtex": "@misc{\nma2022featureaugmented,\ntitle={{FEATURE}-{AUGMENTED} {HYPERGRAPH} {NEURAL} {NETWORKS}},\nauthor={Xueqi Ma and Pan Li and Qiong Cao and James Bailey and Yue Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=GrJDb8KXPA3}\n}", "github": "", "project": "", "reviewers": "agWc;pySa;Kwpa;Xk1z", "site": "https://openreview.net/forum?id=GrJDb8KXPA3", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "5;2;3;4", "correctness": "3;2;2;4", "technical_novelty": "2;1;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "110;43;91;12", "wc_summary_review": "43;81;36;6", "wc_main_review": "362;798;287;158", "wc_review": "515;922;414;176", "wc_reply_reviewers": "1148;0;0;0", "wc_reply_authors": "1837;749;200;253", "reply_reviewers": "6;0;0;0", "reply_authors": "9;2;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.0, 38.6975451417787 ], "wc_summary_review_avg": [ 41.5, 26.706740722147284 ], "wc_main_review_avg": [ 401.25, 240.40317697567974 ], "wc_review_avg": [ 506.75, 269.4896797652927 ], "wc_reply_reviewers_avg": [ 287.0, 497.0985817722678 ], "wc_reply_authors_avg": [ 759.75, 657.7801209370803 ], "reply_reviewers_avg": [ 1.5, 2.598076211353316 ], "reply_authors_avg": [ 3.25, 3.344772040064913 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.2581988897471611, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:t_fCJBj59j0J:scholar.google.com/&scioq=FEATURE-AUGMENTED+HYPERGRAPH+NEURAL+NETWORKS&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Melbourne;Purdue University;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.unimelb.edu.au;https://www.purdue.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UniMelb;Purdue;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Australia;United States;China" }, { "id": "GrvigKxc13E", "title": "Gradient play in stochastic games: stationary points, convergence, and sample complexity", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the performance of the gradient play algorithm for stochastic games (SGs), 
where each agent tries to maximize its own total discounted reward by making decisions independently based on current state information which is shared between agents. Policies are directly parameterized by the probability of choosing a certain action at a given state. We show that Nash equilibria (NEs) and first-order stationary policies are equivalent in this setting, and give a local convergence rate around strict NEs. Further, for a subclass of SGs called Markov potential games (which includes the cooperative setting with identical rewards among agents as an important special case), we design a sample-based reinforcement learning algorithm and give a non-asymptotic global convergence rate analysis for both exact gradient play and our sample-based learning algorithm. Our result shows that the number of iterations to reach an $\\epsilon$-NE scales linearly, instead of exponentially, with the number of agents. Local geometry and local stability are also considered, where we prove that strict NEs are local maxima of the total potential function and fully-mixed NEs are saddle points.", "keywords": "multiagent reinforcement learning;stochastic game;policy gradient;Nash equilibrium;sample complexity", "primary_area": "", "supplementary_material": "", "author": "Runyu Zhang;Zhaolin Ren;Na Li", "authorids": "~Runyu_Zhang1;~Zhaolin_Ren1;~Na_Li3", "gender": ";M;F", "homepage": "https://dianyu420376.github.io/runyu-cathy-zhang.github.io/;;https://nali.seas.harvard.edu/", "dblp": ";;", "google_scholar": "h3SuftsAAAAJ;;qdGelXoAAAAJ", "orcid": ";;", "linkedin": ";zhaolin-ren-1b1b94108;", "or_profile": "~Runyu_Zhang1;~Zhaolin_Ren1;~Na_Li3", "aff": "Harvard University;Harvard University;Harvard University", "aff_domain": "harvard.edu;harvard.edu;harvard.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nzhang2022gradient,\ntitle={Gradient play in stochastic games: stationary points, convergence, and sample complexity},\nauthor={Runyu Zhang and Zhaolin Ren and Na Li},\nyear={2022},\nurl={https://openreview.net/forum?id=GrvigKxc13E}\n}", "github": "", "project": "", "reviewers": "NPmA;oPGh;ByWn", "site": "https://openreview.net/forum?id=GrvigKxc13E", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "5;4;5", "correctness": "4;4;4", "technical_novelty": "4;3;3", "empirical_novelty": "3;0;0", "wc_summary_paper": "49;91;156", "wc_summary_review": "90;81;31", "wc_main_review": "181;429;93", "wc_review": "320;601;280", "wc_reply_reviewers": "118;51;117", "wc_reply_authors": "1653;1028;1045", "reply_reviewers": "2;1;2", "reply_authors": "6;3;5", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 98.66666666666667, 44.01767321832852 ], "wc_summary_review_avg": [ 67.33333333333333, 25.952948879762307 ], "wc_main_review_avg": [ 234.33333333333334, 142.26110579572415 ], "wc_review_avg": [ 400.3333333333333, 142.82934650211848 ], "wc_reply_reviewers_avg": [ 95.33333333333333, 31.351058816073326 ], "wc_reply_authors_avg": [ 1242.0, 290.7037438126084 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 4.666666666666667, 1.247219128924647 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.11470786693528084, "corr_recommendation_correctness": 0.0, 
"gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=271677938676017510&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Data-Driven Offline Optimization for Architecting Hardware Accelerators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6616", "id": "GsH-K1VIyy", "poster": "", "openreview": "https://openreview.net/forum?id=GsH-K1VIyy", "slides": "https://iclr.cc/virtual/2022/poster/6616", "video": "https://iclr.cc/virtual/2022/poster/6616", "author_site": "Aviral Kumar, Amir Yazdanbakhsh, Milad Hashemi, Kevin Swersky, Sergey Levine", "tldr": "", "abstract": "To attain higher efficiency, the industry has gradually reformed towards application-specific hardware accelerators. While such a paradigm shift is already starting to show promising results, designers need to spend considerable manual effort and perform large number of time-consuming simulations to find accelerators that can accelerate multiple target applications while obeying design constraints. Moreover, such a simulation-driven approach must be re-run from scratch every time the set of target applications or design constraints change. An alternative paradigm is to use a data-driven, offline approach that utilizes logged simulation data, to architect hardware accelerators, without needing any form of simulations. Such an approach not only alleviates the need to run time-consuming simulation, but also enables data reuse and applies even when set of target applications changes. In this paper, we develop such a data-driven offline optimization method for designing hardware accelerators, dubbed PRIME, that enjoys all of these properties. Our approach learns a conservative, robust estimate of the desired cost function, utilizes infeasible points and optimizes the design against this estimate without any additional simulator queries during optimization. PRIME architects accelerators---tailored towards both single- and multi-applications---improving performance upon stat-of-the-art simulation-driven methods by about 1.54x and 1.20x, while considerably reducing the required total simulation time by 93% and 99%, respectively. 
In addition, PRIME also architects effective accelerators for unseen applications in a zero-shot setting, outperforming simulation-based methods by 1.26x.", "keywords": "computer architecture and systems;machine learning;data-driven optimization", "primary_area": "", "supplementary_material": "", "author": "Aviral Kumar;Amir Yazdanbakhsh;Milad Hashemi;Kevin Swersky;Sergey Levine", "authorids": "~Aviral_Kumar2;~Amir_Yazdanbakhsh1;~Milad_Hashemi1;~Kevin_Swersky1;~Sergey_Levine1", "gender": "M;M;;M;M", "homepage": "https://aviralkumar2907.github.io/;https://www.ayazdan.com/;;http://www.cs.toronto.edu/~kswersky;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "202/7961;44/8745;127/9046;35/9381;80/7594", "google_scholar": ";Vdu_sqwAAAAJ;;https://scholar.google.ca/citations?user=IrixA8MAAAAJ;8R35rCwAAAAJ", "orcid": ";0000-0001-8199-7671;;;", "linkedin": ";ayazdanb/;;;", "or_profile": "~Aviral_Kumar2;~Amir_Yazdanbakhsh1;~Milad_Hashemi1;~Kevin_Swersky1;~Sergey_Levine1", "aff": "University of California, Berkeley;Google Brain;Google;Google Deepmind;Google", "aff_domain": "berkeley.edu;google.com;google.com;google.com;google.com", "position": "PhD student;Researcher;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nkumar2022datadriven,\ntitle={Data-Driven Offline Optimization for Architecting Hardware Accelerators},\nauthor={Aviral Kumar and Amir Yazdanbakhsh and Milad Hashemi and Kevin Swersky and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=GsH-K1VIyy}\n}", "github": "", "project": "", "reviewers": "7gpC;TEzV;aghq;cuvZ", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;3;3;0", "wc_summary_paper": "113;174;45;118", "wc_summary_review": "50;31;45;139", "wc_main_review": "281;155;484;952", "wc_review": "444;360;574;1209", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "944;196;227;1262", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 112.5, 45.741119356657634 ], "wc_summary_review_avg": [ 66.25, 42.57566793369189 ], "wc_main_review_avg": [ 468.0, 303.08827097068604 ], "wc_review_avg": [ 646.75, 333.44817813267474 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 657.25, 459.8409371728446 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3626400440105766805&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=GsH-K1VIyy", "email": "berkeley.edu;google.com;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "University of California, Berkeley;Google;DeepMind", "aff_unique_dep": ";Google Brain;DeepMind", "aff_unique_url": "https://www.berkeley.edu;https://brain.google.com;https://deepmind.com", "aff_unique_abbr": "UC Berkeley;Google Brain;DeepMind", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Berkeley;Mountain View;", "aff_country_unique_index": "0;0;0;1;0", 
"aff_country_unique": "United States;United Kingdom" }, { "id": "GthNKCqdDg", "title": "Selective Token Generation for Few-shot Language Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Natural language modeling with limited training data is challenging problem, and many algorithms make use of large-scale pretrained language models (PLMs) for this due to its great generalization ability. Among these transfer learning algorithms from PLMs, additive learning that incorporates a task-specific adapter on top of the fixed PLM has been popularly used to alleviate the severe overfitting problem in the few-shot setting. However, this added task-specific adapter is generally trained by maximum likelihood estimation that can easily suffer from the so-called exposure bias problem, especially in sequential text generation. Therefore, in this work, we develop a novel additive learning algorithm based on reinforcement learning (RL) for few-shot natural language generation (NLG) tasks. In particular, we propose to use a selective token generation between the transformer-based PLM and the task-specific adapter during both training and inference. This output token selection between the two generators allows the adapter to take into account only on the task-relevant parts in sequence generation, and therefore makes it more robust to overfitting as well as more stable in RL training. In addition, in order to obtain the complementary adapter from the PLM for each few-shot task, we exploit a separate selecting module that is also simultaneously trained using RL. Experimental results on various few-shot NLG tasks including data-to-text generation and text summarization demonstrate that the proposed selective token generation significantly outperforms the previous additive learning algorithms based on the PLMs.", "keywords": "Natural Language Generation;Reinforcement Learning;Few-shot Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Daejin Jo;Taehwan Kwon;Sungwoong Kim;Eun-Sol Kim", "authorids": "~Daejin_Jo1;~Taehwan_Kwon1;~Sungwoong_Kim2;~Eun-Sol_Kim1", "gender": "M;;M;F", "homepage": ";https://github.com/TaehwanKwon;;", "dblp": "264/5824;;74/8063;52/10086", "google_scholar": ";;https://scholar.google.co.kr/citations?user=3DSA90AAAAAJ;JhZBnfYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Daejin_Jo1;~Taehwan_Kwon1;~Sungwoong_Kim2;~Eun-Sol_Kim1", "aff": "Kakao Brain;;Kakao Brain;Hanyang University", "aff_domain": "kakaobrain.com;;kakaobrain.com;hanyang.ac.kr", "position": "Researcher;;Research Scientist;Assistant Professor", "bibtex": "@misc{\njo2022selective,\ntitle={Selective Token Generation for Few-shot Language Modeling},\nauthor={Daejin Jo and Taehwan Kwon and Sungwoong Kim and Eun-Sol Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=GthNKCqdDg}\n}", "github": "", "project": "", "reviewers": "mErH;K8FA;PchS;3Ee3", "site": "https://openreview.net/forum?id=GthNKCqdDg", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;3", "correctness": "3;2;2;4", "technical_novelty": "2;1;3;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "117;59;116;48", "wc_summary_review": "43;20;52;32", "wc_main_review": "619;175;301;343", "wc_review": "779;254;469;423", "wc_reply_reviewers": "0;121;30;0", "wc_reply_authors": "398;723;279;470", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], 
"technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.0, 31.741140496207755 ], "wc_summary_review_avg": [ 36.75, 11.986972094736853 ], "wc_main_review_avg": [ 359.5, 162.07637088730732 ], "wc_review_avg": [ 481.25, 189.63171543810913 ], "wc_reply_reviewers_avg": [ 37.75, 49.60027721696724 ], "wc_reply_authors_avg": [ 467.5, 162.5184604898779 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:msgW2TYTPO4J:scholar.google.com/&scioq=Selective+Token+Generation+for+Few-shot+Language+Modeling&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Kakao Brain;Hanyang University", "aff_unique_dep": ";", "aff_unique_url": "https://brain.kakao.com;https://www.hanyang.ac.kr", "aff_unique_abbr": "Kakao Brain;HYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "GuEEPa5tqW", "title": "Enhancing Transformer Efficiency for Multivariate Time Series Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The majority of current multivariate time series (MTS) classification algorithms aim to improve the predictive accuracy. However, when it comes to large-scale (either high-dimensional or long-sequential) time series (TS) datasets, it is crucial to design an efficient network architecture to reduce computational costs. In this work, we propose a mixing framework based on Transformer and Fourier transform. By pruning each module of the network separately and sequentially, we investigate the impact of each module on the predictive accuracy. We conduct comprehensive experiments on 18 benchmark MTS datasets. Ablation studies are used to evaluate the impact of each module. Through module-by-module pruning, our results demonstrate the trade-offs between efficiency and effectiveness, as well as efficiency and complexity of the network. 
Finally, we evaluate, via Pareto analysis, the trade-off between network efficiency and performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuqing Wang;Yun Zhao;Linda Petzold", "authorids": "~Yuqing_Wang5;~Yun_Zhao1;~Linda_Petzold1", "gender": "F;M;", "homepage": "https://yuqingwangcs.github.io/;https://yunzhaocs.github.io/;", "dblp": ";42/2862-1;", "google_scholar": "DHImZjIAAAAJ;30s9RtsAAAAJ;", "orcid": ";0000-0002-5544-8983;", "linkedin": "yuqingwang98/;yun-zhao/;", "or_profile": "~Yuqing_Wang5;~Yun_Zhao1;~Linda_Petzold1", "aff": "UC Santa Barbara;Meta Platforms, Inc;", "aff_domain": "ucsb.edu;meta.com;", "position": "PhD student;Researcher;", "bibtex": "@misc{\nwang2022enhancing,\ntitle={Enhancing Transformer Efficiency for Multivariate Time Series Classification},\nauthor={Yuqing Wang and Yun Zhao and Linda Petzold},\nyear={2022},\nurl={https://openreview.net/forum?id=GuEEPa5tqW}\n}", "github": "", "project": "", "reviewers": "tkNA;Wddd;2rjk;GKXP", "site": "https://openreview.net/forum?id=GuEEPa5tqW", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;3", "correctness": "4;3;2;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "24;123;47;92", "wc_summary_review": "12;80;118;88", "wc_main_review": "130;232;236;130", "wc_review": "166;435;401;310", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;330;415;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;1;1;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 71.5, 38.5 ], "wc_summary_review_avg": [ 74.5, 38.76531955240405 ], "wc_main_review_avg": [ 182.0, 52.01922721455981 ], "wc_review_avg": [ 328.0, 104.09851103642166 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 186.25, 188.658918421579 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7304711279510151118&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Santa Barbara;Meta", "aff_unique_dep": ";Meta Platforms, Inc", "aff_unique_url": "https://www.ucsb.edu;https://www.meta.com", "aff_unique_abbr": "UCSB;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Permutation Compressors for Provably Faster Distributed Nonconvex Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7208", "id": "GugZ5DzzAu", "poster": "", "openreview": "https://openreview.net/forum?id=GugZ5DzzAu", "slides": "https://iclr.cc/virtual/2022/poster/7208", "video": "https://iclr.cc/virtual/2022/poster/7208", "author_site": "Rafa\u0142 Szlendak, Alexander Tyurin, Peter Richtarik", "tldr": "", "abstract": "In this work we study the MARINA method of Gorbunov et al (ICML, 2021) -- the current state-of-the-art distributed non-convex optimization method in terms of theoretical communication complexity. 
The theoretical superiority of this method can be largely attributed to two sources: a carefully engineered biased stochastic gradient estimator, which leads to a reduction in the number of communication rounds, and the reliance on\n {\\em independent} stochastic communication compression, which leads to a reduction in the number of transmitted bits within each communication round. In this paper we i) extend the theory of MARINA to support a much wider class of potentially {\\em correlated} compressors, extending the reach of the method beyond the classical independent compressors setting, ii) show that a new quantity, for which we coin the name {\\em Hessian variance}, allows us to significantly refine the original analysis of MARINA without any additional assumptions, and iii) identify a special class of correlated compressors based on the idea of {\\em random permutations}, for which we coin the term Perm$K$, the use of which leads to up to $O(\\sqrt{n})$ (resp. $O(1 + d/\\sqrt{n})$) improvement in the theoretical communication complexity of MARINA in the low Hessian variance regime when $d\\geq n$ (resp. $d \\leq n$), where $n$ is the number of workers and $d$ is the number of parameters describing the model we are learning. We corroborate our theoretical results with carefully engineered synthetic experiments on minimizing the average of nonconvex quadratics, and on autoencoder training with the MNIST dataset.", "keywords": "MARINA;distributed training;permutation compressor;correlated compressor;Hessian variance;communication complexity;nonconvex optimization", "primary_area": "", "supplementary_material": "", "author": "Rafa\u0142 Szlendak;Alexander Tyurin;Peter Richt\u00e1rik", "authorids": "~Rafa\u0142_Szlendak1;~Alexander_Tyurin1;~Peter_Richt\u00e1rik1", "gender": "M;M;M", "homepage": ";https://k3nfalt.github.io/;https://richtarik.org", "dblp": ";203/8919;62/8001", "google_scholar": ";;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-4380-5848", "linkedin": "rafa%C5%82-szlendak-552936220/;;richtarik/", "or_profile": "~Rafa\u0142_Szlendak1;~Alexander_Tyurin1;~Peter_Richtarik1", "aff": "The University of Warwick;KAUST;King Abdullah University of Science and Technology (KAUST)", "aff_domain": "warwick.ac.uk;kaust.edu.sa;kaust.edu.sa", "position": "Undergrad student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nszlendak2022permutation,\ntitle={Permutation Compressors for Provably Faster Distributed Nonconvex Optimization},\nauthor={Rafa{\\l} Szlendak and Alexander Tyurin and Peter Richt{\\'a}rik},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=GugZ5DzzAu}\n}", "github": "", "project": "", "reviewers": "YEbP;JiFh;wJjy;mZxC", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;2;3;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "54;74;64;46", "wc_summary_review": "18;2;18;30", "wc_main_review": "338;251;348;159", "wc_review": "410;327;430;235", "wc_reply_reviewers": "0;97;863;159", "wc_reply_authors": "3135;2249;4934;2276", "reply_reviewers": "0;1;2;2", "reply_authors": "7;4;11;5", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 59.5, 10.523782589924593 ], "wc_summary_review_avg": [ 17.0, 9.9498743710662 
], "wc_main_review_avg": [ 274.0, 76.36425865547311 ], "wc_review_avg": [ 350.5, 77.06004152607238 ], "wc_reply_reviewers_avg": [ 279.75, 341.4742852690375 ], "wc_reply_authors_avg": [ 3148.5, 1090.7049326009303 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 6.75, 2.680951323690902 ], "replies_avg": [ 40, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13182132814240205416&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=GugZ5DzzAu", "email": "warwick.ac.uk;kaust.edu.sa;kaust.edu.sa", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Warwick;King Abdullah University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://warwick.ac.uk;https://www.kaust.edu.sa", "aff_unique_abbr": "Warwick;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;Saudi Arabia" }, { "id": "Gw9vA80c8_n", "title": "HyperCube: Implicit Field Representations of Voxelized 3D Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recently introduced implicit field representations offer an effective way of generating 3D object shapes. They leverage implicit decoder trained to take a 3D point coordinate concatenated with a shape encoding and to output a value which indicates whether the point is outside the shape or not. Although this approach enables efficient rendering of visually plausible objects, it has two significant limitations. First, it is based on a single neural network dedicated for all objects from a training set which results in a cumbersome training procedure and its application in real life. More importantly, the implicit decoder takes only points sampled within voxels (and not the entire voxels) which yields problems at the classification boundaries and results in empty spaces within the rendered mesh.\n\nTo solve the above limitations, we introduce a new HyperCube architecture based on interval arithmetic network, that enables direct processing of 3D voxels, trained using a hypernetwork paradigm to enforce model convergence. \nInstead of processing individual 3D samples from within a voxel, our approach allows to input the entire voxel (3D cube) represented with its convex hull coordinates, while the target network constructed by a hypernet assigns it to an inside or outside category. \nAs a result our HyperCube model outperforms the competing approaches both in terms of training and inference efficiency, as well as the final mesh quality. 
", "keywords": "voxel;implicit field;3D objects", "primary_area": "", "supplementary_material": "/attachment/1a97efe686e7070daf502f77e2d19473ae17d756.zip", "author": "Magdalena Proszewska;Marcin Mazur;Tomasz Trzcinski;Przemys\u0142aw Spurek", "authorids": "~Magdalena_Proszewska1;~Marcin_Mazur1;~Tomasz_Trzcinski2;~Przemys\u0142aw_Spurek1", "gender": ";;M;M", "homepage": ";;https://cvlab.ii.pw.edu.pl/ttrzcins/;http://ww2.ii.uj.edu.pl/~spurek/", "dblp": "302/3555;;05/11408;77/10260", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;https://scholar.google.pl/citations?user=bJMRBFoAAAAJ;0kp0MbgAAAAJ", "orcid": ";;;0000-0003-0097-5521", "linkedin": ";;;spurek/", "or_profile": "~Magdalena_Proszewska1;~Marcin_Mazur1;~Tomasz_Trzcinski2;~Przemys\u0142aw_Spurek1", "aff": "Jagiellonian University;;;Jagiellonian University", "aff_domain": "uj.edu.pl;;;uj.edu.pl", "position": "MS student;;;Assistant Professor", "bibtex": "@misc{\nproszewska2022hypercube,\ntitle={HyperCube: Implicit Field Representations of Voxelized 3D Models},\nauthor={Magdalena Proszewska and Marcin Mazur and Tomasz Trzcinski and Przemys{\\l}aw Spurek},\nyear={2022},\nurl={https://openreview.net/forum?id=Gw9vA80c8_n}\n}", "github": "", "project": "", "reviewers": "18up;s2iC;zEyR;T28C", "site": "https://openreview.net/forum?id=Gw9vA80c8_n", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;2;3", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "66;111;23;111", "wc_summary_review": "46;30;13;42", "wc_main_review": "471;619;101;182", "wc_review": "583;760;137;335", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 77.75, 36.56073713698891 ], "wc_summary_review_avg": [ 32.75, 12.833062767710599 ], "wc_main_review_avg": [ 343.25, 210.38342971821712 ], "wc_review_avg": [ 453.75, 237.13221523023816 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6219260580113938799&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Jagiellonian University", "aff_unique_dep": "", "aff_unique_url": "https://www.uj.edu.pl", "aff_unique_abbr": "UJ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Poland" }, { "id": "GwA--zyF4w", "title": "Learning Canonical Embedding for Non-rigid Shape Matching", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper provides a novel framework that learns canonical embeddings for non-rigid shape matching. In contrast to prior work in this direction, our framework is trained end-to-end and thus avoids instabilities and constraints associated with the commonly-used Laplace-Beltrami basis or sequential optimization schemes. 
On multiple datasets, we demonstrate that learning self symmetry maps with a deep functional map projects 3D shapes into a low dimensional canonical embedding that facilitates non-rigid shape correspondence via a simple nearest neighbor search. Our framework outperforms multiple recent learning based methods on FAUST and SHREC benchmarks while being computationally cheaper, data-efficient, and robust.", "keywords": "Deep Functional Maps;Symmetry group;Point Clouds;Linear Transformation;Canonical 3D shape Embedding", "primary_area": "", "supplementary_material": "", "author": "Abhishek Sharma;Maks Ovsjanikov", "authorids": "~Abhishek_Sharma1;~Maks_Ovsjanikov1", "gender": ";M", "homepage": ";http://www.lix.polytechnique.fr/~maks/", "dblp": ";94/5668", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-5867-4046", "linkedin": ";", "or_profile": "~Abhishek_Sharma1;~Maks_Ovsjanikov1", "aff": ";\u00c9cole Polytechnique", "aff_domain": ";polytechnique.edu", "position": ";Full Professor", "bibtex": "@misc{\nsharma2022learning,\ntitle={Learning Canonical Embedding for Non-rigid Shape Matching},\nauthor={Abhishek Sharma and Maks Ovsjanikov},\nyear={2022},\nurl={https://openreview.net/forum?id=GwA--zyF4w}\n}", "github": "", "project": "", "reviewers": "gaMP;NTUY;giDp;PGwj", "site": "https://openreview.net/forum?id=GwA--zyF4w", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "2;2;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "49;116;80;113", "wc_summary_review": "45;20;68;55", "wc_main_review": "695;634;756;280", "wc_review": "789;770;904;448", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "311;205;122;44", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.5, 27.31757675929547 ], "wc_summary_review_avg": [ 47.0, 17.592612085759182 ], "wc_main_review_avg": [ 591.25, 184.80445746788686 ], "wc_review_avg": [ 727.75, 169.45556202143382 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 170.5, 99.10221995495358 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12087852276961828206&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Ecole Polytechnique", "aff_unique_dep": "", "aff_unique_url": "https://www.polytechnique.edu", "aff_unique_abbr": "X", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "id": "Gx6Tvlm-hWW", "title": "Trading Coverage for Precision: Conformal Prediction with Limited False Discoveries", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we develop a new approach to conformal prediction in which we aim to output a precise set of promising prediction candidates that is guaranteed to contain a limited number of incorrect answers. 
Standard conformal prediction provides the ability to adapt to model uncertainty by constructing a calibrated candidate set in place of a single prediction, with guarantees that the set contains the correct answer with high probability. In order to obey this coverage property, however, conformal sets can often become inundated with noisy candidates---which can render them unhelpful in practice. This is particularly relevant to large-scale settings where the cost (monetary or otherwise) of false positives is substantial, such as for in-silico screening for drug discovery, where any positively identified molecular compound is then manufactured and tested. We propose to trade coverage for precision by enforcing that the presence of incorrect candidates in the predicted conformal sets (i.e., the total number of false discoveries) is bounded according to a user-specified tolerance. Subject to this constraint, our algorithm then optimizes for a generalized notion of set coverage (i.e., the true discovery rate) that allows for any number of true answers for a given query (including zero). We demonstrate the effectiveness of this approach across a number of classification tasks in natural language processing, computer vision, and computational chemistry.\n", "keywords": "conformal prediction;confidence;uncertainty estimation;false discovery;natural language processing;computer vision;chemistry", "primary_area": "", "supplementary_material": "/attachment/b8ca37f91936da341e375e25ff83b47b62bb75b3.zip", "author": "Adam Fisch;Tal Schuster;Tommi S. Jaakkola;Regina Barzilay", "authorids": "~Adam_Fisch2;~Tal_Schuster1;~Tommi_S._Jaakkola1;~Regina_Barzilay1", "gender": ";Not Specified;;female", "homepage": ";https://people.csail.mit.edu/tals/;;https://www.regina.csail.mit.edu/", "dblp": ";190/7491;;b/ReginaBarzilay", "google_scholar": ";oo8QRmIAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Adam_Fisch2;~Tal_Schuster1;~Tommi_S._Jaakkola1;~Regina_Barzilay1", "aff": ";Google;;Massachusetts Institute of Technology", "aff_domain": ";google.com;;mit.edu", "position": ";Researcher;;Professor", "bibtex": "@misc{\nfisch2022trading,\ntitle={Trading Coverage for Precision: Conformal Prediction with Limited False Discoveries},\nauthor={Adam Fisch and Tal Schuster and Tommi S. 
Jaakkola and Regina Barzilay},\nyear={2022},\nurl={https://openreview.net/forum?id=Gx6Tvlm-hWW}\n}", "github": "", "project": "", "reviewers": "yMiK;dWtw;KRhW;2ZTR", "site": "https://openreview.net/forum?id=Gx6Tvlm-hWW", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;4;4", "correctness": "3;3;4;1", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;3;0;1", "wc_summary_paper": "181;191;47;94", "wc_summary_review": "84;119;50;36", "wc_main_review": "371;208;814;496", "wc_review": "636;518;911;626", "wc_reply_reviewers": "96;0;94;115", "wc_reply_authors": "943;385;558;1682", "reply_reviewers": "1;0;1;1", "reply_authors": "3;2;3;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 128.25, 60.197072187939504 ], "wc_summary_review_avg": [ 72.25, 32.14323412477344 ], "wc_main_review_avg": [ 472.25, 222.16927667884235 ], "wc_review_avg": [ 672.75, 145.1264534810935 ], "wc_reply_reviewers_avg": [ 76.25, 44.7793200037696 ], "wc_reply_authors_avg": [ 892.0, 498.8251196561777 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oT8zHyw1RaEJ:scholar.google.com/&scioq=Trading+Coverage+for+Precision:+Conformal+Prediction+with+Limited+False+Discoveries&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Google;Massachusetts Institute of Technology", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://web.mit.edu", "aff_unique_abbr": "Google;MIT", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Switch to Generalize: Domain-Switch Learning for Cross-Domain Few-Shot Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5957", "id": "H-iABMvzIc", "poster": "", "openreview": "https://openreview.net/forum?id=H-iABMvzIc", "slides": "https://iclr.cc/virtual/2022/poster/5957", "video": "https://iclr.cc/virtual/2022/poster/5957", "author_site": "zhengdong Hu, Yifan Sun, Yi Yang", "tldr": "", "abstract": "This paper considers few-shot learning under the cross-domain scenario. The cross-domain setting imposes a critical challenge, i.e., using very few (support) samples to generalize the already-learned model to a novel domain. We hold a hypothesis, i.e., if a deep model is capable to fast generalize itself to different domains (using very few samples) during training, it will maintain such domain generalization capacity for testing. It motivates us to propose a novel Domain-Switch Learning (DSL) framework. DSL embeds the cross-domain scenario into the training stage in a ``fast switching'' manner. Specifically, DSL uses a single domain for a training iteration and switches into another domain for the following iteration. During the switching, DSL enforces two constraints: 1) the deep model should not over-fit the domain in the current iteration and 2) the deep model should not forget the already-learned knowledge of other domains. These two constraints jointly promote fast generalization across different domains. 
Experimental results confirm that the cross-domain generalization capacity can be inherited from the training stage to the testing stage, validating our key hypothesis. Consequently, DSL significantly improves cross-domain few-shot classification and sets a new state of the art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengdong Hu;Yifan Sun;Yi Yang", "authorids": "~Zhengdong_Hu1;~Yifan_Sun2;~Yi_Yang22", "gender": "M;M;M", "homepage": ";https://yifansun-reid.github.io;https://person.zju.edu.cn/yiyang", "dblp": "323/9595;99/10261-3.html;33/4854-1.html", "google_scholar": "Udl0uiMAAAAJ;uUZEL7UAAAAJ;RMSuNFwAAAAJ", "orcid": ";0000-0003-3532-6521;", "linkedin": ";;", "or_profile": "~Zhengdong_Hu1;~Yifan_Sun2;~Yi_Yang22", "aff": "Zhejiang University;Baidu;Zhejiang University", "aff_domain": "zju.edu.cn;baidu.com;zju.edu.cn", "position": "MS student;Senior Expert;Full Professor", "bibtex": "@inproceedings{\nhu2022switch,\ntitle={Switch to Generalize: Domain-Switch Learning for Cross-Domain Few-Shot Classification},\nauthor={Zhengdong Hu and Yifan Sun and Yi Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=H-iABMvzIc}\n}", "github": "", "project": "", "reviewers": "gTju;JgKH;xu2A;KRoA", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "151;45;76;58", "wc_summary_review": "36;30;43;76", "wc_main_review": "414;159;269;652", "wc_review": "601;234;388;786", "wc_reply_reviewers": "274;0;0;0", "wc_reply_authors": "1999;605;751;1113", "reply_reviewers": "1;0;0;0", "reply_authors": "4;1;1;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 82.5, 41.051796550212025 ], "wc_summary_review_avg": [ 46.25, 17.781661902083282 ], "wc_main_review_avg": [ 373.5, 184.4810288349455 ], "wc_review_avg": [ 502.25, 209.33033105596522 ], "wc_reply_reviewers_avg": [ 68.5, 118.64548031846809 ], "wc_reply_authors_avg": [ 1117.0, 541.7656319849018 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4405876905960045965&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=H-iABMvzIc", "email": "zju.edu.cn;baidu.com;zju.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Zhejiang University;Baidu", "aff_unique_dep": ";Baidu, Inc.", "aff_unique_url": "https://www.zju.edu.cn;https://www.baidu.com", "aff_unique_abbr": "ZJU;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "H-sddFpZAp4", "title": "ModeRNN: Harnessing Spatiotemporal Mode Collapse in Unsupervised Predictive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning predictive models for unlabeled spatiotemporal data is challenging in part because visual dynamics can be highly entangled in real scenes, making existing approaches prone to overfitting partial modes of physical processes while
neglecting to reason about others. We name this phenomenon \\textit{spatiotemporal mode collapse} and explore it for the first time in predictive learning. The key is to provide the model with a strong inductive bias to discover the compositional structures of latent modes. To this end, we propose ModeRNN, which introduces a novel method to learn structured hidden representations between recurrent states. The core idea of this framework is to first extract various components of visual dynamics using a set of \\textit{spatiotemporal slots} with independent parameters. Considering that multiple space-time patterns may co-exist in a sequence, we leverage learnable importance weights to adaptively aggregate slot features into a unified hidden representation, which is then used to update the recurrent states. Across the entire dataset, different modes result in different responses on the mixtures of slots, which enhances the ability of ModeRNN to build structured representations and thus prevents the so-called mode collapse. Unlike existing models, ModeRNN is shown to prevent spatiotemporal mode collapse and further benefit from learning mixed visual dynamics. ", "keywords": "Predictive Learning;Video Prediction", "primary_area": "", "supplementary_material": "", "author": "Zhiyu Yao;Yunbo Wang;Haixu Wu;Jianmin Wang;Mingsheng Long", "authorids": "~Zhiyu_Yao2;~Yunbo_Wang2;~Haixu_Wu1;~Jianmin_Wang1;~Mingsheng_Long5", "gender": "M;M;M;M;M", "homepage": ";;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;http://ise.thss.tsinghua.edu.cn/~mlong;https://wyb15.github.io/", "dblp": "230/4609;286/8115;06/3456-1.html;74/9023;84/3894", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;oLL_x0wAAAAJ;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;_MjXpXkAAAAJ;C8bGfr0AAAAJ", "orcid": "0000-0002-0887-8809;;0000-0001-6841-7943;0000-0002-5412-9120;", "linkedin": ";;;;", "or_profile": "~Zhiyu_Yao2;~Haixu_Wu1;~Jianmin_Wang1;~Mingsheng_Long2;~Yunbo_Wang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Shanghai Jiaotong University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;sjtu.edu.cn", "position": "PhD student;PhD student;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@misc{\nyao2022modernn,\ntitle={Mode{RNN}: Harnessing Spatiotemporal Mode Collapse in Unsupervised Predictive Learning},\nauthor={Zhiyu Yao and Yunbo Wang and Haixu Wu and Jianmin Wang and Mingsheng Long},\nyear={2022},\nurl={https://openreview.net/forum?id=H-sddFpZAp4}\n}", "github": "", "project": "", "reviewers": "VMMf;Wt6k;W8mj", "site": "https://openreview.net/forum?id=H-sddFpZAp4", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "3;4;3", "correctness": "2;3;4", "technical_novelty": "2;2;4", "empirical_novelty": "1;1;4", "wc_summary_paper": "70;53;49", "wc_summary_review": "51;82;17", "wc_main_review": "159;301;435", "wc_review": "280;436;501", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1031;1525;807", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 57.333333333333336, 9.104333522498441 ], "wc_summary_review_avg": [ 50.0, 26.54555832275273 ], "wc_main_review_avg": [ 298.3333333333333, 
112.69230477523989 ], "wc_review_avg": [ 405.6666666666667, 92.73738308914169 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1121.0, 299.95110712692275 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6021399692802488036&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Tsinghua University;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "THU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Hybrid Local SGD for Federated Learning with Heterogeneous Communications", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6077", "id": "H0oaWl6THa", "poster": "", "openreview": "https://openreview.net/forum?id=H0oaWl6THa", "slides": "https://iclr.cc/virtual/2022/poster/6077", "video": "https://iclr.cc/virtual/2022/poster/6077", "author_site": "Yuanxiong Guo, Ying Sun, Rui Hu, Yanmin Gong", "tldr": "", "abstract": "Communication is a key bottleneck in federated learning where a large number of edge devices collaboratively learn a model under the orchestration of a central server without sharing their own training data. While local SGD has been proposed to reduce the number of FL rounds and has become the algorithm of choice for FL, its total communication cost is still prohibitive when each device needs to communicate with the remote server many times over bandwidth-limited networks. In light of both device-to-device (D2D) and device-to-server (D2S) cooperation opportunities in modern communication networks, this paper proposes a new federated optimization algorithm dubbed hybrid local SGD (HL-SGD) in FL settings where devices are grouped into a set of disjoint clusters with high D2D communication bandwidth. HL-SGD subsumes previously proposed algorithms such as local SGD and gossip SGD and enables us to strike the best balance between model accuracy and runtime. We analyze the convergence of HL-SGD in the presence of heterogeneous data for general nonconvex settings. We also perform extensive experiments and show that the use of hybrid model aggregation via D2D and D2S communications in HL-SGD can largely speed up the training time of federated learning.
", "keywords": "Federated Learning;Communication Efficiency;Heterogeneity;Local SGD", "primary_area": "", "supplementary_material": "", "author": "Yuanxiong Guo;Ying Sun;Rui Hu;Yanmin Gong", "authorids": "~Yuanxiong_Guo1;~Ying_Sun5;ruihu2017@gmail.com;~Yanmin_Gong1", "gender": ";F;;F", "homepage": ";https://ysunac.github.io;;https://yanmingong.github.io/", "dblp": "93/10800;;;145/6503-1", "google_scholar": "FK_8hMkAAAAJ;M9uQsUQAAAAJ;;o3BGIEMAAAAJ", "orcid": "0000-0003-2241-125X;;;", "linkedin": "yuanxiong-guo-7666749b/;;;", "or_profile": "~Yuanxiong_Guo1;~Ying_Sun5;ruihu2017@gmail.com;~Yanmin_Gong1", "aff": "University of Texas at San Antonio;Pennsylvania State University;;University of Texas, San Antonio", "aff_domain": "utsa.edu;psu.edu;;utsa.edu", "position": "Assistant Professor;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nguo2022hybrid,\ntitle={Hybrid Local {SGD} for Federated Learning with Heterogeneous Communications},\nauthor={Yuanxiong Guo and Ying Sun and Rui Hu and Yanmin Gong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=H0oaWl6THa}\n}", "github": "", "project": "", "reviewers": "4Rc7;y58n;87Gc;6z2f", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;3", "correctness": "3;4;4;3", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "46;118;92;104", "wc_summary_review": "27;12;30;100", "wc_main_review": "235;344;86;269", "wc_review": "308;474;208;473", "wc_reply_reviewers": "0;58;0;27", "wc_reply_authors": "0;1208;222;976", "reply_reviewers": "0;1;0;1", "reply_authors": "0;3;1;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 90.0, 27.018512172212592 ], "wc_summary_review_avg": [ 42.25, 34.03215391361528 ], "wc_main_review_avg": [ 233.5, 93.84694987052056 ], "wc_review_avg": [ 365.75, 113.402766721099 ], "wc_reply_reviewers_avg": [ 21.25, 23.909987452945266 ], "wc_reply_authors_avg": [ 601.5, 503.46673177082914 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1255835222260002045&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=H0oaWl6THa", "email": "utsa.edu;psu.edu;;utsa.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Texas at San Antonio;Pennsylvania State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utsa.edu;https://www.psu.edu", "aff_unique_abbr": "UTSA;PSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Antonio;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "H2bV7F_lEjX", "title": "Directional Domain Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Domain generalization aims to learn a predictive model from multiple different but related source tasks that can generalize well to a target task without the need of accessing any target data. 
Existing domain generalization methods ignore the relation between tasks, implicitly assuming that all the tasks are sampled from a stationary environment. Therefore, they can fail when deployed in an evolving environment. To this end, we formulate and study the \\emph{directional domain generalization} (DDG) scenario, which exploits not only the source data but also their evolving pattern to generate a model for the unseen task. Our theoretical result reveals the benefits of modeling the relation between two consecutive tasks by learning a globally consistent directional mapping function. In practice, our analysis also suggests solving the DDG problem in a meta-learning manner, which leads to the \\emph{directional prototypical network}, the first method for the DDG problem. Empirical evaluation on both synthetic and real-world data sets validates the effectiveness of our approach.", "keywords": "Domain Generalization;Domain Adaptation", "primary_area": "", "supplementary_material": "/attachment/6965324d824422d0591748f8df34eea53b029fd9.zip", "author": "Wei Wang;Jiaqi Li;Ruizhi Pu;Gezheng Xu;Fan Zhou;Changjian Shui;Charles Ling;Boyu Wang", "authorids": "~Wei_Wang59;~Jiaqi_Li2;~Ruizhi_Pu1;~Gezheng_Xu2;~Fan_Zhou8;~Changjian_Shui2;~Charles_Ling1;~Boyu_Wang3", "gender": "M;;M;F;M;;M;M", "homepage": "https://waybaba.com;;https://github.com/AllenPu;https://github.com/xugezheng;https://fzhou.cc/;;http://cling.csd.uwo.ca/;https://sites.google.com/site/borriewang/", "dblp": ";;301/9203.html;293/7645;63/3122-6;;;41/6565-4.html", "google_scholar": "https://scholar.google.ca/citations?user=brsAcesAAAAJ;;;;https://scholar.google.com/citations?hl=en;;https://scholar.google.co.uk/citations?hl=en;qAZM5KcAAAAJ", "orcid": ";;;0000-0001-5983-5756;0000-0003-1736-2641;;;0000-0002-7413-4162", "linkedin": ";;;;;;;", "or_profile": "~Wei_Wang59;~Jiaqi_Li2;~Ruizhi_Pu1;~Gezheng_Xu2;~Fan_Zhou8;~Changjian_Shui2;~Charles_Ling1;~Boyu_Wang3", "aff": "Western University;;University of Western Ontario;University of Western Ontario;Universit\u00e9 Laval;;Western University;University of Western Ontario", "aff_domain": "uwo.ca;;uwo.ca;uwo.ca;ulaval.ca;;uwo.ca;uwo.ca", "position": "MS student;;PhD student;PhD student;Postdoc;;Professor;Assistant Professor", "bibtex": "@misc{\nwang2022directional,\ntitle={Directional Domain Generalization},\nauthor={Wei Wang and Jiaqi Li and Ruizhi Pu and Gezheng Xu and Fan Zhou and Changjian Shui and Charles Ling and Boyu Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=H2bV7F_lEjX}\n}", "github": "", "project": "", "reviewers": "TUL5;VdAQ;Hcay;ZjwE", "site": "https://openreview.net/forum?id=H2bV7F_lEjX", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "4;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "59;100;183;93", "wc_summary_review": "22;67;28;73", "wc_main_review": "637;133;411;627", "wc_review": "718;300;622;793", "wc_reply_reviewers": "262;174;107;138", "wc_reply_authors": "2417;1367;798;1201", "reply_reviewers": "1;2;1;1", "reply_authors": "5;4;2;3", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.75, 45.58714182749342 ], "wc_summary_review_avg": [ 47.5, 22.699118925632334 ], "wc_main_review_avg": [ 452.0, 205.11703976023054 ], "wc_review_avg": [ 608.25,
188.00581772913304 ], "wc_reply_reviewers_avg": [ 170.25, 58.03608791088524 ], "wc_reply_authors_avg": [ 1445.75, 597.7061882731348 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.2721655269759087, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qmy1uTj3SqwJ:scholar.google.com/&scioq=Directional+Domain+Generalization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;0;1", "aff_unique_norm": "Western University;University of Western Ontario;Universit\u00e9 Laval", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uwo.ca;https://www.uwo.ca;https://www.ulaval.ca", "aff_unique_abbr": "Western;UWO;ULaval", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Canada" }, { "id": "H3zl1mDHDTn", "title": "Lagrangian Method for Episodic Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper considers the problem of learning optimal value functions for finite-time decision tasks via saddle-point optimization of a nonlinear Lagrangian function that is derived from the $Q$-form Bellman optimality equation. Despite a long history of research on this topic in the literature, previous works on this general approach have focused almost exclusively on a linear special case known as the linear programming approach to RL/MDP. Our paper brings new perspectives to this general approach in the following aspects: 1) Inspired by the commonly used linear $V$-form Lagrangian, we proposed a nonlinear $Q$-form Lagrangian function and proved that it enjoys the strong duality property in spite of its nonlinearity. The Lagrangian duality property immediately leads to a new imitation learning algorithm, which we applied to Machine Translation and obtained favorable performance on a standard MT benchmark. 2) We pointed out a fundamental limit of existing works, which seek to find minimax-type saddle points of the Lagrangian function. We proved that another class of saddle points, the maximin-type ones, turns out to have better optimality properties.
3) In contrast to most previous works, our theory and algorithm are oriented to the undiscounted episode-wise reward, which is practically more relevant than the usually considered discounted-MDP setting, thus filling a gap between theory and practice on this topic.", "keywords": "Reinforcement Learning;Imitation Learning;Lagrangian Duality;Machine Translation", "primary_area": "", "supplementary_material": "/attachment/906849c057114f5bb39f7c027648a73ca959a573.zip", "author": "Huang Bojun", "authorids": "~Huang_Bojun1", "gender": "M", "homepage": "", "dblp": "54/9376", "google_scholar": "ljKDN0QAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Huang_Bojun1", "aff": "Rakuten Institute of Technology", "aff_domain": "rakuten.com", "position": "Principal Researcher", "bibtex": "@misc{\nbojun2022lagrangian,\ntitle={Lagrangian Method for Episodic Learning},\nauthor={Huang Bojun},\nyear={2022},\nurl={https://openreview.net/forum?id=H3zl1mDHDTn}\n}", "github": "", "project": "", "reviewers": "DWfZ;1YUz;fdL5", "site": "https://openreview.net/forum?id=H3zl1mDHDTn", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;3;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "68;60;81", "wc_summary_review": "35;35;43", "wc_main_review": "170;275;256", "wc_review": "273;370;380", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1964;3275;1523", "reply_reviewers": "0;0;0", "reply_authors": "3;5;3", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 69.66666666666667, 8.65383665716478 ], "wc_summary_review_avg": [ 37.666666666666664, 3.7712361663282534 ], "wc_main_review_avg": [ 233.66666666666666, 45.68247901426639 ], "wc_review_avg": [ 341.0, 48.25626038833372 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 2254.0, 744.0658573002796 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Hg5R2S6VrqoJ:scholar.google.com/&scioq=Lagrangian+Method+for+Episodic+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Rakuten Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://rit.rakuten.com", "aff_unique_abbr": "RIT", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "id": "H4EXaI6HR2", "title": "Representing value functions in power systems using parametric network series", "track": "main", "status": "Reject", "tldr": "", "abstract": "We describe a novel architecture for modeling the cost-to-go function in approximate dynamic programming problems involving country-scale, real-life electrical power generation systems. Our particular scenario features a heterogeneous power grid including dozens of renewable energy plants as well as traditional ones; the corresponding state space is in the order of thousands of variables of different types and ranges.
While Artificial Neural Networks are a natural choice for modeling such complex cost functions, their effective use hinges on exploiting the particular structure of the problem which, in this case, involves seasonal patterns at many different levels (day, week, year). Our proposed model consists of a series of neural networks whose parameters are themselves parametric functions of a time variable. The parameters of such functions are learned during training along with the network parameters themselves. The new method is shown to outperform the standard backward dynamic programming program currently in use, both in terms of the objective function (total cost of operation over a period) and computational cost. Last, but not least, the resulting model is readily interpretable in terms of the parameters of the learned functions, which capture general trends of the problem, providing useful insight for future improvements.", "keywords": "approximate dynamic programming;cost function approximation;artificial neural networks;parametric network series", "primary_area": "", "supplementary_material": "", "author": "Ruben Chaer;Ximena Caporale;Vanina Camacho;Ignacio Ram\u00edrez", "authorids": "~Ruben_Chaer1;~Ximena_Caporale1;~Vanina_Camacho1;~Ignacio_Ram\u00edrez1", "gender": "M;F;F;M", "homepage": "https://simsee.org;;;http://iie.fing.edu.uy/personal/nacho", "dblp": ";;;15/3667", "google_scholar": ";https://scholar.google.com/citations?hl=es;;https://scholar.google.com.uy/citations?user=JDHxMu4AAAAJ", "orcid": ";;;0000-0003-2954-9040", "linkedin": ";ximena-caporale-a98a70112/;Https://www.linkedin.com/in/vcamachocurbelo;irp/", "or_profile": "~Ruben_Chaer1;~Ximena_Caporale1;~Vanina_Camacho1;~Ignacio_Ram\u00edrez1", "aff": "Institute of Electrical Engeniering;Facultad de Ingenier\u00eda;Facultad de Ingenier\u00eda;Facultad de Ingenier\u00eda", "aff_domain": "iie.fing.edu.uy;fing.edu.uy;fing.edu.uy;fing.edu.uy", "position": "PhD student;MS student;MS student;Associate Professor", "bibtex": "@misc{\nchaer2022representing,\ntitle={Representing value functions in power systems using parametric network series},\nauthor={Ruben Chaer and Ximena Caporale and Vanina Camacho and Ignacio Ram{\\'\\i}rez},\nyear={2022},\nurl={https://openreview.net/forum?id=H4EXaI6HR2}\n}", "github": "", "project": "", "reviewers": "QHRg;wkjh;dQUM", "site": "https://openreview.net/forum?id=H4EXaI6HR2", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "4;4;3", "correctness": "2;3;3", "technical_novelty": "1;1;1", "empirical_novelty": "1;2;4", "wc_summary_paper": "40;83;94", "wc_summary_review": "36;57;54", "wc_main_review": "159;302;483", "wc_review": "235;442;631", "wc_reply_reviewers": "0;418;1054", "wc_reply_authors": "524;1084;604", "reply_reviewers": "0;2;2", "reply_authors": "1;3;1", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_summary_paper_avg": [ 72.33333333333333, 23.299976156401723 ], "wc_summary_review_avg": [ 49.0, 9.273618495495704 ], "wc_main_review_avg": [ 314.6666666666667, 132.5753454539049 ], "wc_review_avg": [ 436.0, 161.72198366332265 ], "wc_reply_reviewers_avg": [ 490.6666666666667, 433.35076888000197 ], "wc_reply_authors_avg": [ 737.3333333333334, 247.29649321321875 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], 
"reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:j1cY6uCFkKQJ:scholar.google.com/&scioq=Representing+value+functions+in+power+systems+using+parametric+network+series&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Institute of Electrical Engineering;Facultad de Ingenier\u00eda", "aff_unique_dep": ";Ingenier\u00eda", "aff_unique_url": ";", "aff_unique_abbr": ";", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "id": "H4J8FGHOhx_", "title": "A Principled Permutation Invariant Approach to Mean-Field Multi-Agent Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-agent reinforcement learning (MARL) becomes more challenging in the presence of more agents, as the capacity of the joint state and action spaces grows exponentially in the number of agents. To address such a challenge of scale, we identify a class of cooperative MARL problems with permutation invariance, and formulate it as mean-field Markov decision processes (MDP). To exploit the permutation invariance therein, we propose the mean-field proximal policy optimization (MF-PPO) algorithm, at the core of which is a permutation- invariant actor-critic neural architecture. We prove that MF-PPO attains the globally optimal policy at a sublinear rate of convergence. Moreover, its sample complexity is independent of the number of agents. We validate the theoretical advantages of MF-PPO with numerical experiments in the multi-agent particle environment (MPE). 
In particular, we show that the inductive bias introduced by the permutation-invariant neural architecture enables MF-PPO to outperform existing competitors with a smaller number of model parameters, which is the key to its generalization performance.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c1afb3aa146c25ea30cd73272caeff085acd494f.zip", "author": "Yan Li;Lingxiao Wang;Jiachen Yang;Ethan Wang;Zhaoran Wang;Tuo Zhao;Hongyuan Zha", "authorids": "~Yan_Li9;~Lingxiao_Wang6;~Jiachen_Yang1;~Ethan_Wang1;~Zhaoran_Wang1;~Tuo_Zhao1;~Hongyuan_Zha1", "gender": "M;M;;M;Not Specified;M;", "homepage": "https://gzliyan113.github.io/;;;https://github.com/wange011;https://zhaoranwang.github.io/;http://www2.isye.gatech.edu/~tzhao80;", "dblp": ";140/1229;;;117/2756;;z/HongyuanZha", "google_scholar": "wLfoeakAAAAJ;;;;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ;EJXN6tYAAAAJ;n1DQMIsAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Yan_Li9;~Lingxiao_Wang6;~Jiachen_Yang1;~Ethan_Wang1;~Zhaoran_Wang1;~Tuo_Zhao1;~Hongyuan_Zha1", "aff": "Georgia Institute of Technology;Northwestern University;;Georgia Institute of Technology;;Georgia Institute of Technology;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "gatech.edu;northwestern.edu;;gatech.edu;;gatech.edu;cuhk.edu.cn", "position": "PhD student;PhD student;;Undergrad student;;Associate Professor;Full Professor", "bibtex": "@misc{\nli2022a,\ntitle={A Principled Permutation Invariant Approach to Mean-Field Multi-Agent Reinforcement Learning},\nauthor={Yan Li and Lingxiao Wang and Jiachen Yang and Ethan Wang and Zhaoran Wang and Tuo Zhao and Hongyuan Zha},\nyear={2022},\nurl={https://openreview.net/forum?id=H4J8FGHOhx_}\n}", "github": "", "project": "", "reviewers": "fcc8;6Xxz;979E", "site": "https://openreview.net/forum?id=H4J8FGHOhx_", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;2;5", "correctness": "2;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "128;13;41", "wc_summary_review": "49;36;14", "wc_main_review": "412;195;363", "wc_review": "589;244;418", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "720;979;297", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 60.666666666666664, 48.96483999315791 ], "wc_summary_review_avg": [ 33.0, 14.445299120013633 ], "wc_main_review_avg": [ 323.3333333333333, 92.92410284145276 ], "wc_review_avg": [ 417.0, 140.84743519141554 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 665.3333333333334, 281.09587134799557 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.43355498476206006, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:B2SS6WzziEcJ:scholar.google.com/&scioq=A+Principled+Permutation+Invariant+Approach+to+Mean-Field+Multi-Agent+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Georgia Institute of Technology;Northwestern University;Chinese University 
of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.gatech.edu;https://www.northwestern.edu;https://www.cuhk.edu.cn", "aff_unique_abbr": "Georgia Tech;NU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Towards Empirical Sandwich Bounds on the Rate-Distortion Function", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6563", "id": "H4PmOqSZDY", "poster": "", "openreview": "https://openreview.net/forum?id=H4PmOqSZDY", "slides": "https://iclr.cc/virtual/2022/poster/6563", "video": "https://iclr.cc/virtual/2022/poster/6563", "author_site": "Yibo Yang, Stephan Mandt", "tldr": "", "abstract": "The rate-distortion (R-D) function, a key quantity in information theory, characterizes the fundamental limit of how much a data source can be compressed subject to a fidelity criterion, by any compression algorithm. As researchers push for ever-improving compression performance, establishing the R-D function of a given data source is not only of scientific interest, but also reveals the possible room for improvement in existing compression algorithms. Previous work on this problem relied on distributional assumptions on the data source (Gibson, 2017) or only applied to discrete data (Blahut, 1972; Arimoto, 1972). By contrast, this paper makes the first attempt at an algorithm for sandwiching the R-D function of a general (not necessarily discrete) source requiring only i.i.d. data samples. We estimate R-D sandwich bounds for a variety of artificial and real-world data sources, in settings far beyond the feasibility of any known method, and shed light on the optimality of neural data compression (Ball\u00e9 et al., 2021; Yang et al., 2022). Our R-D upper bound on natural images indicates theoretical room for improving state-of-the-art image compression methods by at least one dB in PSNR at various bitrates.
Our data and code can be found at https://github.com/mandt-lab/RD-sandwich.", "keywords": "information theory;deep generative modeling;lossy data compression", "primary_area": "", "supplementary_material": "", "author": "Yibo Yang;Stephan Mandt", "authorids": "~Yibo_Yang1;~Stephan_Mandt1", "gender": "Unspecified;M", "homepage": ";https://www.stephanmandt.com", "dblp": ";147/5018", "google_scholar": "N0VVxNUAAAAJ;HOrGe7wAAAAJ", "orcid": ";", "linkedin": ";stephan-mandt-8702795a/", "or_profile": "~Yibo_Yang1;~Stephan_M_Mandt1", "aff": "University of California, Irvine;University of California, Irvine", "aff_domain": "uci.edu;uci.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nyang2022towards,\ntitle={Towards Empirical Sandwich Bounds on the Rate-Distortion Function},\nauthor={Yibo Yang and Stephan Mandt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=H4PmOqSZDY}\n}", "github": "", "project": "", "reviewers": "9Pwc;T3TM;tH6q;zsSF", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;3;3", "correctness": "2;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;2;0;3", "wc_summary_paper": "18;92;54;176", "wc_summary_review": "27;20;92;13", "wc_main_review": "588;287;355;379", "wc_review": "633;399;501;568", "wc_reply_reviewers": "40;0;0;0", "wc_reply_authors": "1259;433;660;346", "reply_reviewers": "2;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 85.0, 58.69412236331676 ], "wc_summary_review_avg": [ 38.0, 31.567388235329194 ], "wc_main_review_avg": [ 402.25, 112.42636479047074 ], "wc_review_avg": [ 525.25, 86.55164643148044 ], "wc_reply_reviewers_avg": [ 10.0, 17.320508075688775 ], "wc_reply_authors_avg": [ 674.5, 356.40040684600797 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8892972917998875, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3922055311859946203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=H4PmOqSZDY", "email": "uci.edu;uci.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Irvine", "aff_unique_dep": "", "aff_unique_url": "https://www.uci.edu", "aff_unique_abbr": "UCI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "H6mR1eaBP1l", "title": "Training sequence labeling models using prior knowledge", "track": "main", "status": "Reject", "tldr": "", "abstract": "The sequence labeling task (part-of-speech tagging, named entity recognition) is one of the most common in NLP. At different times, the following architectures were used to solve it: CRF, BiLSTM, BERT (in chronological order). However, the combined model BiLSTM / BERT + CRF, where the CRF is the topmost layer, performs better than BiLSTM / BERT alone.\n\nIt is common that only a small amount of labeled data is available for the task.
Hence it is difficult to train a model with good generalizing capability, so one has to resort to semi-supervised learning approaches. One of them is called pseudo-labeling, the gist of which is increasing the training samples with unlabeled data, but it cannot be used alongside the CRF layer, as this layer models the probability distribution of the entire sequence, not of individual tokens.\n\nIn this paper, we propose an alternative to the CRF layer \u2014 the Prior Knowledge Layer (PKL), which allows one to obtain probability distributions of each token and also takes into account prior knowledge concerning the structure of label sequences.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dani El-Ayyass", "authorids": "~Dani_El-Ayyass1", "gender": "M", "homepage": "https://github.com/dayyass", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "dayyass/", "or_profile": "~Dani_El-Ayyass1", "aff": "Lomonosov Moscow State University", "aff_domain": "msu.ru", "position": "MS student", "bibtex": "@misc{\nel-ayyass2022training,\ntitle={Training sequence labeling models using prior knowledge},\nauthor={Dani El-Ayyass},\nyear={2022},\nurl={https://openreview.net/forum?id=H6mR1eaBP1l}\n}", "github": "", "project": "", "reviewers": "UZd9;n5Vw;7Kg2;E7aG", "site": "https://openreview.net/forum?id=H6mR1eaBP1l", "pdf_size": 0, "recommendation": "1;1;1;1", "confidence": "5;5;4;5", "correctness": "4;1;3;2", "technical_novelty": "1;2;1;1", "empirical_novelty": "0;1;1;1", "wc_summary_paper": "49;54;29;24", "wc_summary_review": "19;20;12;24", "wc_main_review": "105;169;63;41", "wc_review": "173;243;104;89", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 1.0, 0.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 39.0, 12.747548783981962 ], "wc_summary_review_avg": [ 18.75, 4.322904116447646 ], "wc_main_review_avg": [ 94.5, 48.77243073704652 ], "wc_review_avg": [ 152.25, 61.22652611409535 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9j4KiABZkaIJ:scholar.google.com/&scioq=Training+sequence+labeling+models+using+prior+knowledge&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Lomonosov Moscow State University", "aff_unique_dep": "", "aff_unique_url": "https://www.msu.ru", "aff_unique_abbr": "MSU", "aff_campus_unique_index": "0", "aff_campus_unique": "Moscow", "aff_country_unique_index": "0", "aff_country_unique": "Russian Federation" }, { "id": "H78NdTUTls8", "title": "A precortical module for robust CNNs to light variations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a simple mathematical model for the mammalian low visual pathway, taking into account its key elements: retina, lateral\ngeniculate nucleus (LGN), primary visual cortex (V1).
The analogies between the cortical level of the visual system and the structure of popular CNNs, used in image classification tasks, suggest the introduction of an additional preliminary convolutional module inspired by precortical neuronal circuits to improve robustness with respect to global light intensity and contrast variations in the input images. We validate our hypothesis on the popular databases MNIST, FashionMNIST and SVHN, obtaining significantly more robust CNNs with respect to these variations, once such an extra module is added. ", "keywords": "Neurogeometry;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/a15c3fd9287afd78f4ed13a74977d44737c1aaff.zip", "author": "Rita Fioresi;Janko Petkovic", "authorids": "~Rita_Fioresi1;~Janko_Petkovic1", "gender": "F;", "homepage": "https://www.unibo.it/sitoweb/rita.fioresi;", "dblp": ";", "google_scholar": "DwTxLXAAAAAJ;", "orcid": ";", "linkedin": ";janko-petkovic-9a788994/", "or_profile": "~Rita_Fioresi1;~Janko_Petkovic1", "aff": ";Bologna University", "aff_domain": ";unibo.it", "position": ";Researcher", "bibtex": "@misc{\nfioresi2022a,\ntitle={A precortical module for robust {CNN}s to light variations},\nauthor={Rita Fioresi and Janko Petkovic},\nyear={2022},\nurl={https://openreview.net/forum?id=H78NdTUTls8}\n}", "github": "", "project": "", "reviewers": "xv7B;GNfy;2X18;cAGh", "site": "https://openreview.net/forum?id=H78NdTUTls8", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "5;5;5;4", "correctness": "1;1;3;2", "technical_novelty": "1;1;1;2", "empirical_novelty": "1;0;2;2", "wc_summary_paper": "44;67;55;73", "wc_summary_review": "35;66;75;107", "wc_main_review": "360;772;418;434", "wc_review": "439;905;548;614", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 1.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 59.75, 11.166355717063647 ], "wc_summary_review_avg": [ 70.75, 25.655165171949292 ], "wc_main_review_avg": [ 496.0, 161.7096162879623 ], "wc_review_avg": [ 626.5, 172.5086954330129 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10199485362639695969&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Bologna", "aff_unique_dep": "", "aff_unique_url": "https://www.unibo.it", "aff_unique_abbr": "UNIBO", "aff_country_unique_index": "0", "aff_country_unique": "Italy" }, { "id": "H7Edu1_IZgR", "title": "Transformers are Meta-Reinforcement Learners", "track": "main", "status": "Reject", "tldr": "", "abstract": "The transformer architecture and its variants have shown remarkable success across many machine learning tasks in recent years. This success is intrinsically related to the capability of handling long sequences and the presence of context-dependent weights from the attention mechanism. We argue that these capabilities suit the central role of a Meta-Reinforcement Learning algorithm.
Indeed, a meta-RL agent needs to infer the task from a sequence of trajectories. Furthermore, it requires a fast adaptation strategy to adapt its policy for a new task - which can be achieved using the self-attention mechanism. In this work, we present TrMRL (Transformers for Meta-Reinforcement Learning), a meta-RL agent that mimics the memory reinstatement mechanism using the transformer architecture. It associates the recent past of working memories to build an episodic memory recursively through the transformer layers. This memory works as a proxy to the current task, and we condition a policy head on it. We conducted experiments in high-dimensional continuous control environments for locomotion and dexterous manipulation. Results show that TrMRL achieves or surpasses state-of-the-art performance, sample efficiency, and out-of-distribution generalization in these environments.", "keywords": "Reinforcement Learning;Meta-Reinforcement Learning;Transformers", "primary_area": "", "supplementary_material": "/attachment/8e40fd419a3c09b87fbb108685c88307c461205b.zip", "author": "Luckeciano Carvalho Melo", "authorids": "~Luckeciano_Carvalho_Melo1", "gender": "M", "homepage": "https://luckeciano.github.io", "dblp": "234/6096", "google_scholar": "b2aBi8UAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Luckeciano_Carvalho_Melo1", "aff": "Microsoft", "aff_domain": "microsoft.com", "position": "Researcher", "bibtex": "@misc{\nmelo2022transformers,\ntitle={Transformers are Meta-Reinforcement Learners},\nauthor={Luckeciano Carvalho Melo},\nyear={2022},\nurl={https://openreview.net/forum?id=H7Edu1_IZgR}\n}", "github": "", "project": "", "reviewers": "ouZ3;maih;irGj;VA5K", "site": "https://openreview.net/forum?id=H7Edu1_IZgR", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;3;4;4", "correctness": "2;2;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "102;32;87;79", "wc_summary_review": "98;62;62;139", "wc_main_review": "292;503;108;238", "wc_review": "492;597;257;456", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "750;287;751;715", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 75.0, 26.16295090390226 ], "wc_summary_review_avg": [ 90.25, 31.751968442917047 ], "wc_main_review_avg": [ 285.25, 142.39974543516573 ], "wc_review_avg": [ 450.5, 123.14320931338439 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 625.75, 196.1139655914387 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4334650228414799916&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Corporation", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Multi-Agent MDP Homomorphic Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6821", "id": "H7HDG--DJF0", "poster": "", "openreview": "https://openreview.net/forum?id=H7HDG--DJF0", 
"slides": "https://iclr.cc/virtual/2022/poster/6821", "video": "https://iclr.cc/virtual/2022/poster/6821", "author_site": "Elise van der Pol, Herke van Hoof, Frans Oliehoek, Max Welling", "tldr": "", "abstract": "This paper introduces Multi-Agent MDP Homomorphic Networks, a class of networks that allows distributed execution using only local information, yet is able to share experience between global symmetries in the joint state-action space of cooperative multi-agent systems. In cooperative multi-agent systems, complex symmetries arise between different configurations of the agents and their local observations. For example, consider a group of agents navigating: rotating the state globally results in a permutation of the optimal joint policy. Existing work on symmetries in single agent reinforcement learning can only be generalized to the fully centralized setting, because such approaches rely on the global symmetry in the full state-action spaces, and these can result in correspondences across agents. To encode such symmetries while still allowing distributed execution we propose a factorization that decomposes global symmetries into local transformations. Our proposed factorization allows for distributing the computation that enforces global symmetries over local agents and local interactions. We introduce a multi-agent equivariant policy network based on this factorization. We show empirically on symmetric multi-agent problems that globally symmetric distributable policies improve data efficiency compared to non-equivariant baselines.", "keywords": "multiagent systems;reinforcement learning;equivariance;symmetry", "primary_area": "", "supplementary_material": "", "author": "Elise van der Pol;Herke van Hoof;Frans A Oliehoek;Max Welling", "authorids": "~Elise_van_der_Pol1;~Herke_van_Hoof4;~Frans_A_Oliehoek1;~Max_Welling1", "gender": "F;M;M;M", "homepage": "http://elisevanderpol.nl;https://staff.fnwi.uva.nl/h.c.vanhoof/;https://staff.fnwi.uva.nl/m.welling/;http://www.fransoliehoek.net/", "dblp": "186/8470.html;123/6759;16/2286;82/2420", "google_scholar": "https://scholar.google.nl/citations?user=564o-vIAAAAJ;https://scholar.google.ca/citations?user=9owUkLYAAAAJ;https://scholar.google.nl/citations?user=8200InoAAAAJ;https://scholar.google.nl/", "orcid": ";;0000-0003-1484-2121;0000-0003-4372-5055", "linkedin": ";;;frans-oliehoek-0310078/", "or_profile": "~Elise_van_der_Pol1;~Herke_van_Hoof4;~Max_Welling1;~Frans_Oliehoek1", "aff": "University of Amsterdam;University of Amsterdam;University of Amsterdam;Delft University of Technology", "aff_domain": "uva.nl;uva.nl;uva.nl;tudelft.nl", "position": "PhD student;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\npol2022multiagent,\ntitle={Multi-Agent {MDP} Homomorphic Networks},\nauthor={Elise van der Pol and Herke van Hoof and Frans A Oliehoek and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=H7HDG--DJF0}\n}", "github": "", "project": "", "reviewers": "xEvs;Z7zL;Jvcy;XJk1", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;3;2", "correctness": "3;3;4;4", "technical_novelty": "4;2;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "50;68;109;42", "wc_summary_review": "44;11;94;34", "wc_main_review": "150;168;806;139", "wc_review": "244;247;1009;215", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "167;238;578;72", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 
1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.25, 25.878321042911576 ], "wc_summary_review_avg": [ 45.75, 30.318105151872537 ], "wc_main_review_avg": [ 315.75, 283.2352158542437 ], "wc_review_avg": [ 428.75, 335.24049203519553 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 263.75, 190.75163826295176 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9733285267845754, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7742088366120766374&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=H7HDG--DJF0", "email": "uva.nl;uva.nl;uva.nl;tudelft.nl", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Amsterdam;Delft University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.uva.nl;https://www.tudelft.nl", "aff_unique_abbr": "UvA;TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Auto-scaling Vision Transformers without Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5995", "id": "H94a1_Pyr-6", "poster": "", "openreview": "https://openreview.net/forum?id=H94a1_Pyr-6", "slides": "https://iclr.cc/virtual/2022/poster/5995", "video": "https://iclr.cc/virtual/2022/poster/5995", "author_site": "Wuyang Chen, Wei Huang, Xianzhi Du, Xiaodan Song, Zhangyang Wang, Dengyong Zhou", "tldr": "", "abstract": "This work targets automated designing and scaling of Vision Transformers (ViTs). The motivation comes from two pain spots: 1) the lack of efficient and principled methods for designing and scaling ViTs; 2) the tremendous computational cost of training ViT that is much heavier than its convolution counterpart. To tackle these issues, we propose As-ViT, an auto-scaling framework for ViTs without training, which automatically discovers and scales up ViTs in an efficient and principled manner. Specifically, we first design a \"seed\" ViT topology by leveraging a training-free search process. This extremely fast search is fulfilled by a comprehensive study of ViT's network complexity, yielding a strong Kendall-tau correlation with ground-truth accuracies. Second, starting from the \"seed\" topology, we automate the scaling rule for ViTs by growing widths/depths to different ViT layers. This results in a series of architectures with different numbers of parameters in a single run. Finally, based on the observation that ViTs can tolerate coarse tokenization in early training stages, we propose a progressive tokenization strategy to train ViTs faster and cheaper. As a unified framework, As-ViT achieves strong performance on classification (83.5% top1 on ImageNet-1k) and detection (52.7% mAP on COCO) without any manual crafting nor scaling of ViT architectures: the end-to-end model design and scaling process costs only 12 hours on one V100 GPU. 
Our code is available at https://github.com/VITA-Group/AsViT.", "keywords": "vision transformer;neural architecture search;training-free search;efficient training", "primary_area": "", "supplementary_material": "/attachment/dba5196ff5e54a10474c6fda22c90ef3376e36f6.zip", "author": "Wuyang Chen;Wei Huang;Xianzhi Du;Xiaodan Song;Zhangyang Wang;Denny Zhou", "authorids": "~Wuyang_Chen1;~Wei_Huang6;~Xianzhi_Du4;~Xiaodan_Song1;~Zhangyang_Wang1;~Denny_Zhou1", "gender": ";M;M;;M;", "homepage": ";https://weihuang05.github.io/;;;https://vita-group.github.io;", "dblp": ";81/6685-34;;93/3688;119/4026;", "google_scholar": ";RZfDh4MAAAAJ;l1hP40AAAAAJ;;pxFyKAIAAAAJ;", "orcid": ";0000-0001-5674-7021;;;;", "linkedin": ";;xianzhi-du-1b128934/;;;", "or_profile": "~Wuyang_Chen1;~Wei_Huang6;~Xianzhi_Du4;~Xiaodan_Song1;~Zhangyang_Wang1;~Denny_Zhou1", "aff": ";RIKEN AIP;Google;;University of Texas, Austin;", "aff_domain": ";riken.jp;google.com;;utexas.edu;", "position": ";Postdoc;Research Engineer;;Assistant Professor;", "bibtex": "@inproceedings{\nchen2022autoscaling,\ntitle={Auto-scaling Vision Transformers without Training},\nauthor={Wuyang Chen and Wei Huang and Xianzhi Du and Xiaodan Song and Zhangyang Wang and Denny Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=H94a1_Pyr-6}\n}", "github": "", "project": "", "reviewers": "Bp2F;eYy9;rFuj", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "79;66;41", "wc_summary_review": "99;60;56", "wc_main_review": "115;169;689", "wc_review": "293;295;786", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "809;601;264", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 62.0, 15.769168230019828 ], "wc_summary_review_avg": [ 71.66666666666667, 19.39644870130154 ], "wc_main_review_avg": [ 324.3333333333333, 258.79893525455026 ], "wc_review_avg": [ 458.0, 231.93246143363947 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 558.0, 224.56327987154683 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10616211011095299898&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=H94a1_Pyr-6", "email": ";riken.jp;google.com;;utexas.edu;", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "RIKEN;Google;University of Texas at Austin", "aff_unique_dep": "Advanced Institute for Computational Science;Google;", "aff_unique_url": "https://www.aip.riken.jp;https://www.google.com;https://www.utexas.edu", "aff_unique_abbr": "RIKEN AIP;Google;UT Austin", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Austin", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Japan;United States" }, { "title": "Neural Relational Inference with Node-Specific Information", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5937", "id": 
"HBsJNesj2S", "poster": "", "openreview": "https://openreview.net/forum?id=HBsJNesj2S", "slides": "https://iclr.cc/virtual/2022/poster/5937", "video": "https://iclr.cc/virtual/2022/poster/5937", "tldr": "", "abstract": "Inferring interactions among entities is an important problem in studying dynamical systems, which greatly impacts the performance of downstream tasks, such as prediction. In this paper, we tackle the relational inference problem in a setting where each entity can potentially have a set of individualized information that other entities cannot have access to. Specifically, we represent the system using a graph in which the individualized information become node-specific information (NSI). We build our model in the framework of Neural Relation Inference (NRI), where the interaction among entities are uncovered using variational inference. We adopt NRI model to incorporate the individualized information by introducing private nodes in the graph that represent NSI. Such representation enables us to uncover more accurate relations among the agents and therefore leads to better performance on the downstream tasks. Our experiment results over real-world datasets validate the merit of our proposed algorithm. ", "keywords": "Graph Neural Networks;Variational Inference;Trajectory Prediction", "primary_area": "", "supplementary_material": "", "author": "Ershad Banijamali", "authorids": "~Ershad_Banijamali2", "gender": "M", "homepage": "https://cs.uwaterloo.ca/~sbanijam/", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Ershad_Banijamali2", "aff": "Amazon", "aff_domain": "amazon.com", "position": "Researcher", "bibtex": "@inproceedings{\nbanijamali2022neural,\ntitle={Neural Relational Inference with Node-Specific Information },\nauthor={Ershad Banijamali},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HBsJNesj2S}\n}", "github": "", "project": "", "reviewers": "hKEW;NM6N;SFbP", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "2;3;2", "correctness": "4;3;3", "technical_novelty": "2;3;4", "empirical_novelty": "2;3;3", "wc_summary_paper": "91;22;155", "wc_summary_review": "27;59;145", "wc_main_review": "121;59;498", "wc_review": "239;140;798", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "431;57;761", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 89.33333333333333, 54.309810859139944 ], "wc_summary_review_avg": [ 77.0, 49.82636517614612 ], "wc_main_review_avg": [ 226.0, 193.9914087444768 ], "wc_review_avg": [ 392.3333333333333, 289.6829684710895 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 416.3333333333333, 287.5938494165378 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": -1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3748122333149997441&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=HBsJNesj2S", "email": "amazon.com", "author_num": 1, "aff_unique_index": "0", 
"aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon.com, Inc.", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "LFPT5: A Unified Framework for Lifelong Few-shot Language Learning Based on Prompt Tuning of T5", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6632", "id": "HCRVf71PMF", "poster": "", "openreview": "https://openreview.net/forum?id=HCRVf71PMF", "slides": "https://iclr.cc/virtual/2022/poster/6632", "video": "https://iclr.cc/virtual/2022/poster/6632", "author_site": "Chengwei Qin, Shafiq Joty", "tldr": "", "abstract": "Existing approaches to lifelong language learning rely on plenty of labeled data for learning a new task, which is hard to obtain in most real scenarios. Considering that humans can continually learn new tasks from a handful of examples, we expect the models also to be able to generalize well on new few-shot tasks without forgetting the previous ones. In this work, we define this more challenging yet practical problem as Lifelong Few-shot Language Learning (LFLL) and propose a unified framework for it based on prompt tuning of T5. Our framework called LFPT5 takes full advantage of PT's strong few-shot learning ability, and simultaneously trains the model as a task solver and a data generator. Before learning a new domain of the same task type, LFPT5 generates pseudo (labeled) samples of previously learned domains, and later gets trained on those samples to alleviate forgetting of previous knowledge as it learns the new domain. In addition, a KL divergence loss is minimized to achieve label consistency between the previous and the current model. While adapting to a new task type, LFPT5 includes and tunes additional prompt embeddings for the new task. 
With extensive experiments, we demonstrate that LFPT5 can be applied to various different types of tasks and significantly outperform previous methods in different LFLL settings.", "keywords": "lifelong few-shot language Learning;prompt tuning;pseudo samples;knowledge distillation", "primary_area": "", "supplementary_material": "/attachment/6a30a20920e8e38269c21b347da89d31a77d241f.zip", "author": "Chengwei Qin;Shafiq Joty", "authorids": "~Chengwei_Qin1;~Shafiq_Joty1", "gender": "M;M", "homepage": ";https://raihanjoty.github.io/", "dblp": "195/2732;62/2078", "google_scholar": ";hR249csAAAAJ", "orcid": ";", "linkedin": "chengwei-qin-3401a1107/;", "or_profile": "~Chengwei_Qin1;~Shafiq_Joty1", "aff": "Nanyang Technological University;SalesForce.com", "aff_domain": "ntu.edu.sg;salesforce.com", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\nqin2022lfpt,\ntitle={{LFPT}5: A Unified Framework for Lifelong Few-shot Language Learning Based on Prompt Tuning of T5},\nauthor={Chengwei Qin and Shafiq Joty},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HCRVf71PMF}\n}", "github": "", "project": "", "reviewers": "Ap2b;9Wdo;jNL2;PawA", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "2;5;4;3", "correctness": "2;3;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "33;157;111;90", "wc_summary_review": "45;101;40;95", "wc_main_review": "428;753;458;187", "wc_review": "506;1011;609;372", "wc_reply_reviewers": "555;440;208;45", "wc_reply_authors": "1754;3578;952;985", "reply_reviewers": "3;1;1;1", "reply_authors": "5;6;3;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 97.75, 44.54983164951356 ], "wc_summary_review_avg": [ 70.25, 27.887048965424793 ], "wc_main_review_avg": [ 456.5, 200.84633429565002 ], "wc_review_avg": [ 624.5, 238.44338950786621 ], "wc_reply_reviewers_avg": [ 312.0, 198.45528463611143 ], "wc_reply_authors_avg": [ 1817.25, 1066.0134555905006 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 4.5, 1.118033988749895 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7716940912154178619&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=HCRVf71PMF", "email": "ntu.edu.sg;salesforce.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Nanyang Technological University;Salesforce", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.salesforce.com", "aff_unique_abbr": "NTU;Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Singapore;United States" }, { "title": "Doubly Adaptive Scaled Algorithm for Machine Learning Using Second-Order Information", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6864", "id": "HCelXXcSEuH", "poster": "", "openreview": "https://openreview.net/forum?id=HCelXXcSEuH", "slides": "https://iclr.cc/virtual/2022/poster/6864", "video": "https://iclr.cc/virtual/2022/poster/6864", "author_site": "Majid Jahani, Sergey Rusakov, Zheng Shi, 
Peter Richtarik, Michael W Mahoney, Martin Takac", "tldr": "", "abstract": "We present a novel adaptive optimization algorithm for large-scale machine learning problems. Equipped with a low-cost estimate of local curvature and Lipschitz smoothness, our method dynamically adapts the search direction and step-size. The search direction contains gradient information preconditioned by a well-scaled diagonal preconditioning matrix that captures the local curvature information. Our methodology does not require the tedious task of learning rate tuning, as the learning rate is updated automatically without adding an extra hyper-parameter. We provide convergence guarantees on a comprehensive collection of optimization problems, including convex, strongly convex, and nonconvex problems, in both deterministic and stochastic regimes. We also conduct an extensive empirical evaluation on standard machine learning problems, justifying our algorithm's versatility and demonstrating its strong performance compared to other state-of-the-art first-order and second-order methods.", "keywords": "Convex Optimization;Non-Convex Optimization;Stochastic Optimization;Second-Order Optimization;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/59336c2f6b3763abd103701c8cc06d06e10e6bc2.zip", "author": "Majid Jahani;Sergey Rusakov;Zheng Shi;Peter Richt\u00e1rik;Michael W. Mahoney;Martin Takac", "authorids": "~Majid_Jahani2;~Sergey_Rusakov1;~Zheng_Shi2;~Peter_Richt\u00e1rik1;~Michael_W._Mahoney1;~Martin_Takac3", "gender": "M;M;M;;;M", "homepage": "https://coral.ise.lehigh.edu/maj316/;;https://www.zhengqxhs.com/;;;http://mtakac.com", "dblp": "207/8296;130/5837-1;;;;42/3759-1.html", "google_scholar": "T7kjWR0AAAAJ;https://scholar.google.ru/citations?user=tdyw7IAAAAAJ;YurlyCoAAAAJ;;;qKQD-2cAAAAJ", "orcid": ";;0000-0002-7202-0217;;;0000-0001-7455-2025", "linkedin": "majid-jahani-b1714150/;;zhengmartinshi/;;;martintakac/", "or_profile": "~Majid_Jahani2;~Sergey_Rusakov1;~Zheng_Shi2;~Peter_Richt\u00e1rik1;~Michael_W._Mahoney1;~Martin_Takac3", "aff": "Target AI;;Lehigh University;;;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "target.com;;lehigh.edu;;;mbzuai.ac.ae", "position": "Senior AI - Optimization Scientist;;PhD Candidate;;;Associate Professor", "bibtex": "@inproceedings{\njahani2022doubly,\ntitle={Doubly Adaptive Scaled Algorithm for Machine Learning Using Second-Order Information},\nauthor={Majid Jahani and Sergey Rusakov and Zheng Shi and Peter Richt{\\'a}rik and Michael W.
Mahoney and Martin Takac},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HCelXXcSEuH}\n}", "github": "", "project": "", "reviewers": "524y;QUCj;doAo", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "3;4;3", "technical_novelty": "2;4;2", "empirical_novelty": "2;0;3", "wc_summary_paper": "120;84;119", "wc_summary_review": "60;108;173", "wc_main_review": "436;265;569", "wc_review": "616;457;861", "wc_reply_reviewers": "82;158;0", "wc_reply_authors": "1612;982;1302", "reply_reviewers": "1;1;0", "reply_authors": "11;9;5", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 107.66666666666667, 16.73983937265296 ], "wc_summary_review_avg": [ 113.66666666666667, 46.305747183500046 ], "wc_main_review_avg": [ 423.3333333333333, 124.43025712779384 ], "wc_review_avg": [ 644.6666666666666, 166.1732696782354 ], "wc_reply_reviewers_avg": [ 80.0, 64.51873113032111 ], "wc_reply_authors_avg": [ 1298.6666666666667, 257.2072229848057 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 8.333333333333334, 2.494438257849294 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": -0.18898223650461363, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=884057319207402792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=HCelXXcSEuH", "email": "target.com;;lehigh.edu;;;mbzuai.ac.ae", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Target AI;Lehigh University;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.target.com;https://www.lehigh.edu;https://mbzuai.ac.ae", "aff_unique_abbr": "Target AI;Lehigh;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;United Arab Emirates" }, { "id": "HFE5P8nhmmL", "title": "SVMnet: Non-parametric image classification based on convolutional SVM ensembles for small training sets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep convolutional neural networks (DCNNs) have demonstrated superior power in their ability to classify image data. However, one of the downsides of DCNNs for supervised learning of image data is that their training normally requires large sets of labeled \"ground truth\" images. Since in many real-world problems large sets of pre-labeled images are not always available, DCNNs might not perform in an optimal manner in all real-world cases. Here we propose SVMnet -- a method based on a layered structure of Support Vector Machine (SVM) ensembles for non-parametric image classification. By utilizing the quick learning of SVMs compared to neural networks, the proposed method can reach higher accuracy than DCNNs when the training set is small. 
Experimental results show that while \"conventional\" DCNN architectures such as ResNet-50 outperform SVMnet when the size of the training set is large, SVMnet provides a much higher accuracy when the number of \"ground truth\" training samples is small.", "keywords": "machine learning;support vector machine;convolutional neural network", "primary_area": "", "supplementary_material": "", "author": "Hunter Goddard;Lior Shamir", "authorids": "~Hunter_Goddard1;~Lior_Shamir1", "gender": ";Not Specified", "homepage": ";https://people.cs.ksu.edu/~lshamir/", "dblp": ";", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": "huntergoddard/;", "or_profile": "~Hunter_Goddard1;~Lior_Shamir1", "aff": "Kansas State University;Kansas State University", "aff_domain": "ksu.edu;ksu.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\ngoddard2022svmnet,\ntitle={{SVM}net: Non-parametric image classification based on convolutional {SVM} ensembles for small training sets},\nauthor={Hunter Goddard and Lior Shamir},\nyear={2022},\nurl={https://openreview.net/forum?id=HFE5P8nhmmL}\n}", "github": "", "project": "", "reviewers": "SXM2;8kHE;ycpg;ZeYm", "site": "https://openreview.net/forum?id=HFE5P8nhmmL", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "5;4;4;4", "correctness": "2;1;3;3", "technical_novelty": "1;1;2;3", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "93;374;67;66", "wc_summary_review": "50;86;48;120", "wc_main_review": "209;915;236;188", "wc_review": "352;1375;351;374", "wc_reply_reviewers": "60;81;0;0", "wc_reply_authors": "208;315;210;0", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;0", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 150.0, 129.77865772152214 ], "wc_summary_review_avg": [ 76.0, 29.563490998188964 ], "wc_main_review_avg": [ 387.0, 305.31541068213375 ], "wc_review_avg": [ 613.0, 440.0369302683583 ], "wc_reply_reviewers_avg": [ 35.25, 36.02342987556848 ], "wc_reply_authors_avg": [ 183.25, 114.30961245669587 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7276068751089989, "corr_recommendation_correctness": 0.46442036401282394, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0wWw-Abqm_wJ:scholar.google.com/&scioq=SVMnet:+Non-parametric+image+classification+based+on+convolutional+SVM+ensembles+for+small+training+sets&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Kansas State University", "aff_unique_dep": "", "aff_unique_url": "https://www.k-state.edu", "aff_unique_abbr": "K-State", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Measuring the Interpretability of Unsupervised Representations via Quantized Reversed Probing", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6146", "id": "HFPTzdwN39", "poster": "", "openreview": "https://openreview.net/forum?id=HFPTzdwN39", "slides": "https://iclr.cc/virtual/2022/poster/6146", "video": "https://iclr.cc/virtual/2022/poster/6146", "author_site": "Iro Laina, Yuki Asano, Andrea Vedaldi", "tldr": "", 
"abstract": "Self-supervised visual representation learning has recently attracted significant research interest. While a common way to evaluate self-supervised representations is through transfer to various downstream tasks, we instead investigate the problem of measuring their interpretability, i.e. understanding the semantics encoded in raw representations. We formulate the latter as estimating the mutual information between the representation and a space of manually labelled concepts. To quantify this we introduce a decoding bottleneck: information must be captured by simple predictors, mapping concepts to clusters in representation space. This approach, which we call reverse linear probing, provides a single number sensitive to the semanticity of the representation. This measure is also able to detect when the representation contains combinations of concepts (e.g., \"red apple'') instead of just individual attributes (\"red'' and \"apple'' independently). Finally, we propose to use supervised classifiers to automatically label large datasets in order to enrich the space of concepts used for probing. We use our method to evaluate a large number of self-supervised representations, ranking them by interpretability, highlight the differences that emerge compared to the standard evaluation with linear probes and discuss several qualitative insights. Code at: https://github.com/iro-cp/ssl-qrp.", "keywords": "Representation learning;Computer vision;Interpretability", "primary_area": "", "supplementary_material": "", "author": "Iro Laina;Yuki M Asano;Andrea Vedaldi", "authorids": "~Iro_Laina1;~Yuki_M_Asano1;~Andrea_Vedaldi1", "gender": "M;;M", "homepage": "https://www.robots.ox.ac.uk/~vedaldi/;;https://yukimasano.github.io/", "dblp": "99/2825;182/2070;239/8823", "google_scholar": "bRT7t28AAAAJ;n9nXAPcAAAAJ;CdpLhlgAAAAJ", "orcid": "0000-0003-1374-2858;0000-0001-8857-7709;", "linkedin": ";;", "or_profile": "~Andrea_Vedaldi1;~Iro_Laina2;~Yuki_Asano1", "aff": "Meta;University of Oxford;University of Amsterdam", "aff_domain": "meta.com;ox.ac.uk;uva.nl", "position": "Researcher;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nlaina2022measuring,\ntitle={Measuring the Interpretability of Unsupervised Representations via Quantized Reversed Probing},\nauthor={Iro Laina and Yuki M Asano and Andrea Vedaldi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HFPTzdwN39}\n}", "github": "", "project": "", "reviewers": "UJ5M;phRe;AZhK;RSYr", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;4;4", "correctness": "2;4;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;2;3;4", "wc_summary_paper": "112;78;97;241", "wc_summary_review": "41;86;55;45", "wc_main_review": "508;603;687;168", "wc_review": "661;767;839;454", "wc_reply_reviewers": "0;0;53;0", "wc_reply_authors": "1446;1302;1490;676", "reply_reviewers": "0;0;1;0", "reply_authors": "3;3;3;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 132.0, 64.07417576528005 ], "wc_summary_review_avg": [ 56.75, 17.640507362318125 ], "wc_main_review_avg": [ 491.5, 197.21625186581355 ], "wc_review_avg": [ 680.25, 145.16090210521565 ], "wc_reply_reviewers_avg": [ 13.25, 22.949673200287624 ], "wc_reply_authors_avg": [ 1228.5, 326.47626253680374 
], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.16012815380508713, "corr_recommendation_correctness": 0.7526178090063818, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14413356555893743428&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=HFPTzdwN39", "email": "meta.com;ox.ac.uk;uva.nl", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Meta;University of Oxford;University of Amsterdam", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.ox.ac.uk;https://www.uva.nl", "aff_unique_abbr": "Meta;Oxford;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;United Kingdom;Netherlands" }, { "title": "Learning the Dynamics of Physical Systems from Sparse Observations with Finite Element Networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6609", "id": "HFmAukZ-k-2", "poster": "", "openreview": "https://openreview.net/forum?id=HFmAukZ-k-2", "slides": "https://iclr.cc/virtual/2022/poster/6609", "video": "https://iclr.cc/virtual/2022/poster/6609", "author_site": "Marten Lienen, Stephan G\u00fcnnemann", "tldr": "", "abstract": "We propose a new method for spatio-temporal forecasting on arbitrarily distributed points. Assuming that the observed system follows an unknown partial differential equation, we derive a continuous-time model for the dynamics of the data via the finite element method. The resulting graph neural network estimates the instantaneous effects of the unknown dynamics on each cell in a meshing of the spatial domain. Our model can incorporate prior knowledge via assumptions on the form of the unknown PDE, which induce a structural bias towards learning specific processes. Through this mechanism, we derive a transport variant of our model from the convection equation and show that it improves the transfer performance to higher-resolution meshes on sea surface temperature and gas flow forecasting against baseline models representing a selection of spatio-temporal forecasting methods. 
A qualitative analysis shows that our model disentangles the data dynamics into their constituent parts, which makes it uniquely interpretable.", "keywords": "spatio-temporal;finite;elements;forecasting;continuous;partial;differential;equation;PDE;graph;gnn;time-series", "primary_area": "", "supplementary_material": "", "author": "Marten Lienen;Stephan G\u00fcnnemann", "authorids": "~Marten_Lienen1;~Stephan_G\u00fcnnemann1", "gender": ";M", "homepage": "https://martenlienen.com;http://www.daml.in.tum.de", "dblp": "192/3468;43/3011", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Marten_Lienen1;~Stephan_G\u00fcnnemann1", "aff": "Technical University Munich;Technical University Munich", "aff_domain": "tum.de;tum.de", "position": "PhD student;Professor", "bibtex": "@inproceedings{\nlienen2022learning,\ntitle={Learning the Dynamics of Physical Systems from Sparse Observations with Finite Element Networks},\nauthor={Marten Lienen and Stephan G{\\\"u}nnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HFmAukZ-k-2}\n}", "github": "", "project": "", "reviewers": "4K4v;CGtm;EPhP;Zk77", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;3", "correctness": "3;4;3;4", "technical_novelty": "4;3;3;4", "empirical_novelty": "2;4;0;4", "wc_summary_paper": "23;87;78;39", "wc_summary_review": "50;27;70;23", "wc_main_review": "668;464;733;468", "wc_review": "741;578;881;530", "wc_reply_reviewers": "0;43;61;48", "wc_reply_authors": "1393;521;1445;841", "reply_reviewers": "0;1;1;1", "reply_authors": "3;1;3;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.6583123951777 ], "wc_summary_paper_avg": [ 56.75, 26.55536668924005 ], "wc_summary_review_avg": [ 42.5, 18.9274932307477 ], "wc_main_review_avg": [ 583.25, 119.48927776164687 ], "wc_review_avg": [ 682.5, 138.74527018965367 ], "wc_reply_reviewers_avg": [ 38.0, 22.901964981197573 ], "wc_reply_authors_avg": [ 1050.0, 386.3922877077129 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10753878238660840723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=HFmAukZ-k-2", "email": "tum.de;tum.de", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "HG7vlodGGm", "title": "TempoRL: Temporal Priors for Exploration in Off-Policy Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Effective exploration is a crucial challenge in deep reinforcement learning. Behavioral priors have been shown to tackle this problem successfully, at the expense of reduced generality and restricted transferability. We thus propose temporal priors as a non-Markovian generalization of behavioral priors for guiding exploration in reinforcement learning. 
Critically, we focus on state-independent temporal priors, which exploit the idea of temporal consistency and are generally applicable and capable of transferring across a wide range of tasks. We show how dynamically sampling actions from a probabilistic mixture of policy and temporal prior can accelerate off-policy reinforcement learning in unseen downstream tasks. We provide empirical evidence that our approach improves upon strong baselines in long-horizon continuous control tasks under sparse reward settings.", "keywords": "deep reinforcement learning;exploration;prior", "primary_area": "", "supplementary_material": "", "author": "Marco Bagatella;Sammy Joe Christen;Otmar Hilliges", "authorids": "~Marco_Bagatella1;~Sammy_Joe_Christen1;~Otmar_Hilliges1", "gender": ";Unspecified;M", "homepage": ";;https://ait.ethz.ch/people/hilliges/", "dblp": ";243/6983;82/2289", "google_scholar": ";r1L_2qkAAAAJ;-epU9OsAAAAJ", "orcid": ";;0000-0002-5068-3474", "linkedin": "marco-bagatella-9b8017197/;;", "or_profile": "~Marco_Bagatella1;~Sammy_Joe_Christen1;~Otmar_Hilliges1", "aff": "ETHZ - ETH Zurich;NVIDIA;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;nvidia.com;ethz.ch", "position": "Intern;Intern;Associate Professor", "bibtex": "@misc{\nbagatella2022temporl,\ntitle={Tempo{RL}: Temporal Priors for Exploration in Off-Policy Reinforcement Learning},\nauthor={Marco Bagatella and Sammy Joe Christen and Otmar Hilliges},\nyear={2022},\nurl={https://openreview.net/forum?id=HG7vlodGGm}\n}", "github": "", "project": "", "reviewers": "66v7;wMYF;eq2m;AE8g;txKn;xn1S;j8Xm", "site": "https://openreview.net/forum?id=HG7vlodGGm", "pdf_size": 0, "recommendation": "3;3;3;3;6;8;8", "confidence": "4;4;3;3;3;5;4", "correctness": "3;2;3;2;4;2;4", "technical_novelty": "2;2;3;2;2;4;4", "empirical_novelty": "3;2;2;2;3;4;3", "wc_summary_paper": "95;29;192;64;59;149;134", "wc_summary_review": "56;40;32;49;37;100;44", "wc_main_review": "576;167;1083;306;777;227;691", "wc_review": "727;236;1307;419;873;476;869", "wc_reply_reviewers": "21;84;190;0;27;108;46", "wc_reply_authors": "635;771;1690;616;623;627;1063", "reply_reviewers": "1;1;1;0;1;1;2", "reply_authors": "3;2;3;1;2;2;3", "recommendation_avg": [ 4.857142857142857, 2.2314999074019015 ], "confidence_avg": [ 3.7142857142857144, 0.6998542122237652 ], "correctness_avg": [ 2.857142857142857, 0.8329931278350429 ], "technical_novelty_avg": [ 2.7142857142857144, 0.880630571852711 ], "empirical_novelty_avg": [ 2.7142857142857144, 0.6998542122237652 ], "wc_summary_paper_avg": [ 103.14285714285714, 53.445375508693985 ], "wc_summary_review_avg": [ 51.142857142857146, 21.2362800847214 ], "wc_main_review_avg": [ 546.7142857142857, 308.6036358853095 ], "wc_review_avg": [ 701.0, 332.11314767281175 ], "wc_reply_reviewers_avg": [ 68.0, 60.63709378637092 ], "wc_reply_authors_avg": [ 860.7142857142857, 370.0666790441629 ], "reply_reviewers_avg": [ 1.0, 0.5345224838248488 ], "reply_authors_avg": [ 2.2857142857142856, 0.6998542122237652 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5227083734893168, "corr_recommendation_correctness": 0.37328844382740006, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m8QbZ022zQsJ:scholar.google.com/&scioq=TempoRL:+Temporal+Priors+for+Exploration+in+Off-Policy+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "ETH Zurich;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": 
"https://www.ethz.ch;https://www.nvidia.com", "aff_unique_abbr": "ETHZ;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;United States" }, { "id": "HHUSDJb_4KJ", "title": "Unifying Distribution Alignment as a Loss for Imbalanced Semi-supervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "While remarkable progress in imbalanced supervised learning has been made recently, less attention has been given to the setting of imbalanced semi-supervised learning (SSL) where not only is a few labeled data provided, but the underlying data distribution can be severely imbalanced. Recent works require both complicated sampling-based strategies of pseudo-labeled data and distribution alignment of the pseudo-label distribution to accommodate this imbalance. We present a novel approach that relies only on a form of a distribution alignment but no sampling strategy where rather than aligning the pseudo-labels during inference, we move the distribution alignment component into the respective cross entropy loss computations for both the supervised and unsupervised losses. This alignment compensates for both imbalance in the data as well as the eventual distributional shift present during evaluation. Altogether, this provides a single, unified strategy that offers both significantly reduced training requirements and improved performance across both low and richly labeled regimes and over varying degrees of imbalance. In experiments, we validate the efficacy of our method on SSL variants of CIFAR10-LT, CIFAR100-LT, and ImageNet-127. On ImageNet-127, our method shows 1.6% accuracy improvement over the previous best method with 80% training time reduction.", "keywords": "semi-supervised learning;imbalanced learning", "primary_area": "", "supplementary_material": "", "author": "Justin Lazarow;Kihyuk Sohn;Chun-Liang Li;Zizhao Zhang;Chen-Yu Lee;Tomas Pfister", "authorids": "~Justin_Lazarow1;~Kihyuk_Sohn1;~Chun-Liang_Li1;~Zizhao_Zhang3;~Chen-Yu_Lee2;~Tomas_Pfister1", "gender": "M;M;M;M;;M", "homepage": ";https://sites.google.com/site/kihyuksml/;http://chunliangli.github.io;https://sites.google.com/corp/view/zizhaozhang;https://chl260.github.io/;http://tomas.pfister.fi", "dblp": "127/3611;53/10771;;;04/656;14/8360", "google_scholar": "PASh6VEAAAAJ;VxpypngAAAAJ;https://scholar.google.com.tw/citations?user=vqHIt_sAAAAJ;https://scholar.google.dk/citations?hl=en;uWPUSEgAAAAJ;ahSpJOAAAAAJ", "orcid": ";;;;;0009-0004-4088-8718", "linkedin": ";;;;chenyulee260/;", "or_profile": "~Justin_Lazarow1;~Kihyuk_Sohn1;~Chun-Liang_Li1;~Zizhao_Zhang3;~Chen-Yu_Lee2;~Tomas_Pfister1", "aff": "University of California, San Diego;Google;Google;Google;Google;Google", "aff_domain": "ucsd.edu;google.com;google.com;google.com;google.com;google.com", "position": "PhD student;Research Scientist;Researcher;Researcher;Research Scientist;Head of Research @ Cloud AI", "bibtex": "@misc{\nlazarow2022unifying,\ntitle={Unifying Distribution Alignment as a Loss for Imbalanced Semi-supervised Learning},\nauthor={Justin Lazarow and Kihyuk Sohn and Chun-Liang Li and Zizhao Zhang and Chen-Yu Lee and Tomas Pfister},\nyear={2022},\nurl={https://openreview.net/forum?id=HHUSDJb_4KJ}\n}", "github": "", "project": "", "reviewers": "inVm;qKfL;aN2N;WjMc", "site": "https://openreview.net/forum?id=HHUSDJb_4KJ", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;2;4;4", "correctness": "3;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": 
"2;2;3;3", "wc_summary_paper": "53;29;108;86", "wc_summary_review": "21;32;123;41", "wc_main_review": "280;93;404;250", "wc_review": "354;154;635;377", "wc_reply_reviewers": "0;0;39;56", "wc_reply_authors": "510;766;309;563", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.0, 30.273751006441206 ], "wc_summary_review_avg": [ 54.25, 40.31981522775123 ], "wc_main_review_avg": [ 256.75, 110.77313528107797 ], "wc_review_avg": [ 380.0, 170.86983349907027 ], "wc_reply_reviewers_avg": [ 23.75, 24.498724456591614 ], "wc_reply_authors_avg": [ 537.0, 162.65761586842467 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2228272232515543026&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "University of California, San Diego;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.ucsd.edu;https://www.google.com", "aff_unique_abbr": "UCSD;Google", "aff_campus_unique_index": "0;1;1;1;1;1", "aff_campus_unique": "San Diego;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "HHpWuWayMo", "title": "Evaluating Robustness of Cooperative MARL", "track": "main", "status": "Reject", "tldr": "", "abstract": "In recent years, a proliferation of methods were developed for multi-agent reinforcement learning (MARL). In this paper, we focus on evaluating the robustness of MARL agents in continuous control tasks. In particular, we propose the first model-based approach to perform adversarial attacks for cooperative MARL. We design effective attacks to degrade the MARL agent's performance by adversarially perturbing the states of agent(s) and solving an optimization problem. In addition, we also developed several strategies to select the most vulnerable agents that help to further decrease the team reward of MARL. Extensive numerical experiments on multi-agent Mujoco tasks verify the effectiveness of our proposed approach.", "keywords": "cooperative multi-agent reinforcement learning;adversarial attack;continuous action", "primary_area": "", "supplementary_material": "", "author": "Nhan Pham;Lam M. 
Nguyen;Jie Chen;Thanh Lam Hoang;Subhro Das;Tsui-Wei Weng", "authorids": "~Nhan_Pham1;~Lam_M._Nguyen1;~Jie_Chen1;~Thanh_Lam_Hoang1;~Subhro_Das1;~Tsui-Wei_Weng1", "gender": "M;;;M;;F", "homepage": "https://nhanph.github.io/;;https://jiechenjiechen.github.io;http://researcher.watson.ibm.com/researcher/view.php?person=ie-t.l.hoang;;https://lilywenglab.github.io", "dblp": "195/1055;;92/6289-7;66/3866;;177/9197", "google_scholar": "nB_GbvYAAAAJ;;Z-lkme8AAAAJ;VTUBBWAAAAAJ;;v8GM4xoAAAAJ", "orcid": "0000-0002-4490-8649;;;;;", "linkedin": "nhanph/;;;;;", "or_profile": "~Nhan_Pham1;~Lam_M._Nguyen1;~Jie_Chen1;~Thanh_Lam_Hoang1;~Subhro_Das1;~Tsui-Wei_Weng1", "aff": "International Business Machines;;International Business Machines;International Business Machines;;University of California, San Diego", "aff_domain": "ibm.com;;ibm.com;ibm.com;;ucsd.edu", "position": "Researcher;;Research Staff Member;Researcher;;Assistant Professor", "bibtex": "@misc{\npham2022evaluating,\ntitle={Evaluating Robustness of Cooperative {MARL}},\nauthor={Nhan Pham and Lam M. Nguyen and Jie Chen and Thanh Lam Hoang and Subhro Das and Tsui-Wei Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=HHpWuWayMo}\n}", "github": "", "project": "", "reviewers": "jFMh;1ANY;j4AK;55ss", "site": "https://openreview.net/forum?id=HHpWuWayMo", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "40;95;157;89", "wc_summary_review": "39;75;44;30", "wc_main_review": "249;399;521;372", "wc_review": "328;569;722;491", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "734;1184;1217;1168", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 95.25, 41.54741267515945 ], "wc_summary_review_avg": [ 47.0, 16.926310879810757 ], "wc_main_review_avg": [ 385.25, 96.63947174938406 ], "wc_review_avg": [ 527.5, 142.02552587475253 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1075.75, 198.09893361651396 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jjN0RhxrQrwJ:scholar.google.com/&scioq=Evaluating+Robustness+of+Cooperative+MARL&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "International Business Machines Corporation;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.ucsd.edu", "aff_unique_abbr": "IBM;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "HI99z0aLsl", "title": "Benign Overfitting in Adversarially Robust Linear Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "``Benign overfitting'', where classifiers memorize noisy training data yet still achieve a good generalization performance, has drawn great attention in the machine learning community. 
To explain this surprising phenomenon, a series of works have provided theoretical justification in over-parameterized linear regression, classification, and kernel methods. However, it is not clear if benign overfitting still occurs in the presence of adversarial examples, i.e., examples with tiny and intentional perturbations to fool the classifiers. In this paper, we show that benign overfitting indeed occurs in adversarial training, a principled approach to defend against adversarial examples. In detail, we prove the risk bounds of the adversarially trained linear classifier on the mixture of sub-Gaussian data under $\\ell_p$ adversarial perturbations. Our result suggests that under moderate perturbations, adversarially trained linear classifiers can achieve the near-optimal standard and adversarial risks, despite overfitting the noisy training data. Numerical experiments validate our theoretical findings. ", "keywords": "Benign Overfitting;Robust Linear Classification", "primary_area": "", "supplementary_material": "", "author": "Jinghui Chen;Yuan Cao;Quanquan Gu", "authorids": "~Jinghui_Chen1;~Yuan_Cao1;~Quanquan_Gu1", "gender": "M;M;M", "homepage": "https://jinghuichen.github.io/;https://yuancaohku.github.io/;http://web.cs.ucla.edu/~qgu/", "dblp": "67/5633;;50/4597", "google_scholar": "mKia7Y4AAAAJ;-VGnHI4AAAAJ;GU9HgNAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jinghui_Chen1;~Yuan_Cao1;~Quanquan_Gu1", "aff": "Pennsylvania State University;University of Hong Kong;University of California, Los Angeles", "aff_domain": "psu.edu;hku.hk;cs.ucla.edu", "position": "Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nchen2022benign,\ntitle={Benign Overfitting in Adversarially Robust Linear Classification},\nauthor={Jinghui Chen and Yuan Cao and Quanquan Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=HI99z0aLsl}\n}", "github": "", "project": "", "reviewers": "V6MB;7uYN;kp1N;AeLK", "site": "https://openreview.net/forum?id=HI99z0aLsl", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;2;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;1;1;2", "wc_summary_paper": "81;51;30;111", "wc_summary_review": "56;29;25;59", "wc_main_review": "151;141;578;289", "wc_review": "288;221;633;459", "wc_reply_reviewers": "0;87;0;0", "wc_reply_authors": "470;886;540;318", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 68.25, 30.621683493890405 ], "wc_summary_review_avg": [ 42.25, 15.35211711784404 ], "wc_main_review_avg": [ 289.75, 176.39922760601874 ], "wc_review_avg": [ 400.25, 159.96464453122132 ], "wc_reply_reviewers_avg": [ 21.75, 37.67210506462308 ], "wc_reply_authors_avg": [ 553.5, 208.06909909931363 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11535208795950269789&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;2", "aff_unique_norm": "Pennsylvania State University;University of Hong 
Kong;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.psu.edu;https://www.hku.hk;https://www.ucla.edu", "aff_unique_abbr": "PSU;HKU;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Los Angeles", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "HLTLhiBtUcW", "title": "Enhanced neural network regularization with macro-block dropout", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "This paper proposes a new regularization algorithm referred to as macro-block dropout. The overfitting issue has been a difficult problem in training large network models. The dropout technique has proven to be simple yet very effective for regularization by preventing complex co-adaptations on training data. In this work, we observe that in the hidden outputs, the correlations between geometrically close elements are usually stronger than those between distant elements. Motivated by this observation, we define a macro-block that contains multiple elements of the hidden output layer in order to reduce co-adaptations more effectively. Rather than applying dropout to each element, we apply random dropout to each macro-block. In our experiments with image classification tasks on the MNIST and the ImageNet datasets as well as a speech recognition task on the LibriSpeech set, this simple algorithm has shown quite a significant improvement over the conventional dropout approach.", "keywords": "macro block dropout;regularization", "primary_area": "", "supplementary_material": "", "author": "Chanwoo Kim", "authorids": "~Chanwoo_Kim2", "gender": "M", "homepage": "https://www.facebook.com/chanwcom", "dblp": "", "google_scholar": "pJoZXxYAAAAJ", "orcid": "", "linkedin": "chanwoo-kim-2628a622/?originalSubdomain=kr", "or_profile": "~Chanwoo_Kim2", "aff": "Samsung Research", "aff_domain": "samsung.com", "position": "Corporate Vice President", "bibtex": "@misc{\nkim2022enhanced,\ntitle={Enhanced neural network regularization with macro-block dropout},\nauthor={Chanwoo Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=HLTLhiBtUcW}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HLTLhiBtUcW", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Fqey1W5Ayl8J:scholar.google.com/&scioq=Enhanced+neural+network+regularization+with+macro-block+dropout&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung Research", "aff_unique_url": "https://research.samsung.com", "aff_unique_abbr": "Samsung",
"aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "HL_qE4fz-JZ", "title": "Input Dependent Sparse Gaussian Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Gaussian Processes (GPs) are Bayesian models that provide uncertainty estimates associated to the predictions made. They are also very flexible due to their non-parametric nature. Nevertheless, GPs suffer from poor scalability as the number of training instances $N$ increases. More precisely, they have a cubic cost with respect to $N$. To overcome this problem, sparse GP approximations are often used, where a set of $M \\ll N$ inducing points is introduced during training. The location of the inducing points is learned by considering them as parameters of an approximate posterior distribution $q$. Sparse GPs, combined with variational inference for inferring $q$, reduce the training cost of GPs to $\\mathcal{O}(M^3)$. Critically, the inducing points determine the flexibility of the model and they are often located in regions of the input space where the latent function changes. A limitation is, however, that for some learning tasks a large number of inducing points may be required to obtain a good prediction performance. To address this limitation, we propose here to amortize the computation of the inducing points locations, as well as the parameters of the variational posterior approximation $q$. For this, we use a neural network that receives the observed data as an input and outputs the inducing points locations and the parameters of $q$. We evaluate our method in several experiments, showing that it performs similar or better than other state-of-the-art sparse variational GP approaches. However, with our method the number of inducing points is reduced drastically due to their dependency on the input data. This makes our method scale to larger datasets and have faster training and prediction times. 
", "keywords": "gaussian processes;variational inference;neural networks;sparse approximations", "primary_area": "", "supplementary_material": "/attachment/9375cd415561d4f2f8443c365927ffee95c7df3b.zip", "author": "Bahram Jafrasteh;Carlos Villacampa-Calvo;Daniel Hern\u00e1ndez-Lobato", "authorids": "~Bahram_Jafrasteh1;~Carlos_Villacampa-Calvo1;~Daniel_Hern\u00e1ndez-Lobato1", "gender": "M;M;M", "homepage": ";;http://dhnzl.org", "dblp": ";203/4475;95/166", "google_scholar": "ShNTw54AAAAJ;https://scholar.google.es/citations?user=IYLA7vUAAAAJ;https://scholar.google.es/citations?user=rL6cvTUAAAAJ", "orcid": ";0000-0002-5732-9101;", "linkedin": "bahram-jafrasteh-866b79131/;carlos-villacampa-calvo/;", "or_profile": "~Bahram_Jafrasteh1;~Carlos_Villacampa-Calvo1;~Daniel_Hern\u00e1ndez-Lobato1", "aff": "Instituto de investigaci\u00f3n e innovaci\u00f3n biom\u00e9dica de C\u00e1diz;Universidad Aut\u00f3noma de Madrid;Universidad Aut\u00f3noma de Madrid", "aff_domain": "inibica.es;uam.es;uam.es", "position": "Postdoc;PhD student;Associate Professor", "bibtex": "@misc{\njafrasteh2022input,\ntitle={Input Dependent Sparse Gaussian Processes },\nauthor={Bahram Jafrasteh and Carlos Villacampa-Calvo and Daniel Hern{\\'a}ndez-Lobato},\nyear={2022},\nurl={https://openreview.net/forum?id=HL_qE4fz-JZ}\n}", "github": "", "project": "", "reviewers": "rpPD;spvN;m21i;QKXW;oyvg", "site": "https://openreview.net/forum?id=HL_qE4fz-JZ", "pdf_size": 0, "recommendation": "3;3;5;6;8", "confidence": "3;4;4;3;3", "correctness": "3;3;3;3;3", "technical_novelty": "2;3;2;2;4", "empirical_novelty": "2;2;2;2;4", "wc_summary_paper": "212;72;78;101;69", "wc_summary_review": "189;27;146;92;30", "wc_main_review": "746;256;511;445;409", "wc_review": "1147;355;735;638;508", "wc_reply_reviewers": "21;131;113;17;41", "wc_reply_authors": "828;1440;1021;636;333", "reply_reviewers": "1;1;1;1;1", "reply_authors": "1;2;2;1;1", "recommendation_avg": [ 5.0, 1.8973665961010275 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 106.4, 53.979996294923914 ], "wc_summary_review_avg": [ 96.8, 63.684849061609626 ], "wc_main_review_avg": [ 473.4, 159.9632457785225 ], "wc_review_avg": [ 676.6, 267.59267553503776 ], "wc_reply_reviewers_avg": [ 64.6, 47.90657574905557 ], "wc_reply_authors_avg": [ 851.6, 371.7249520815088 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4303314829119353, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5773381144477961556&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1", "aff_unique_norm": "Instituto de Investigaci\u00f3n e Innovaci\u00f3n Biom\u00e9dica de C\u00e1diz;Universidad Aut\u00f3noma de Madrid", "aff_unique_dep": ";", "aff_unique_url": ";https://www.uam.es", "aff_unique_abbr": ";UAM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Spain" }, { "title": "Online Target Q-learning with Reverse Experience Replay: Efficiently finding the Optimal Policy for Linear MDPs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7207", "id": "HMJdXzbWKH", "poster": "", "openreview": "https://openreview.net/forum?id=HMJdXzbWKH", "slides": 
"https://iclr.cc/virtual/2022/poster/7207", "video": "https://iclr.cc/virtual/2022/poster/7207", "author_site": "Naman Agarwal, Syomantak Chaudhuri, Prateek Jain, Dheeraj Nagaraj, Praneeth Netrapalli", "tldr": "", "abstract": "Q-learning is a popular Reinforcement Learning (RL) algorithm which is widely used in practice with function approximation (Mnih et al., 2015). In contrast, existing theoretical results are pessimistic about Q-learning. For example, (Baird, 1995) shows that Q-learning does not converge even with linear function approximation for linear MDPs. Furthermore, even for tabular MDPs with synchronous updates, Q-learning was shown to have sub-optimal sample complexity (Li et al., 2021, Azar et al., 2013). The goal of this work is to bridge the gap between practical success of Q-learning and the relatively pessimistic theoretical results. The starting point of our work is the observation that in practice, Q-learning is used with two important modifications: (i) training with two networks, called online network and target network simultaneously (online target learning, or OTL) , and (ii) experience replay (ER) (Mnih et al., 2015). While they have been observed to play a significant role in the practical success of Q-learning, a thorough theoretical understanding of how these two modifications improve the convergence behavior of Q-learning has been missing in literature. By carefully combining the Q-learning with OTL and reverse experience replay (RER) (a form of experience replay), we present novel methods Q-Rex and Q-RexDaRe (Q-Rex+data reuse). We show that Q-Rex efficiently finds the optimal policy for linear MDPs and provide non-asymptotic bounds on sample complexity -- the first such result for a Q-learning method with linear MDPs. Furthermore, we demonstrate that Q-RexDaRe in fact achieves near optimal sample complexity in the tabular setting, improving upon the existing results for vanilla Q-learning. 
\n", "keywords": "Q Learning;RL with Function Approximation;Experience Replay;Online Target Learning", "primary_area": "", "supplementary_material": "", "author": "Naman Agarwal;Syomantak Chaudhuri;Prateek Jain;Dheeraj Mysore Nagaraj;Praneeth Netrapalli", "authorids": "~Naman_Agarwal1;~Syomantak_Chaudhuri1;~Prateek_Jain1;~Dheeraj_Mysore_Nagaraj1;~Praneeth_Netrapalli1", "gender": "M;;M;M;M", "homepage": "https://naman33k.github.io;;http://prateekjain.org;https://dheerajmn.mit.edu;http://praneethnetrapalli.org/", "dblp": "72/3910;;https://dblp.uni-trier.de/pers/j/Jain_0002:Prateek.html;215/5097;http://dblp.uni-trier.de/pers/hd/n/Netrapalli:Praneeth", "google_scholar": "sEMrGicAAAAJ;;qYhRbJoAAAAJ;0g80b7sAAAAJ;https://scholar.google.co.in/citations?user=mim8FQkAAAAJ", "orcid": ";;;;", "linkedin": ";;;dheeraj-m-nagaraj-01739792/;", "or_profile": "~Naman_Agarwal1;~Syomantak_Chaudhuri1;~Prateek_Jain1;~Dheeraj_Mysore_Nagaraj1;~Praneeth_Netrapalli1", "aff": "Google;;Google;Google;Google", "aff_domain": "google.com;;google.com;google.com;google.com", "position": "Researcher;;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nagarwal2022online,\ntitle={Online Target Q-learning with Reverse Experience Replay: Efficiently finding the Optimal Policy for Linear {MDP}s},\nauthor={Naman Agarwal and Syomantak Chaudhuri and Prateek Jain and Dheeraj Mysore Nagaraj and Praneeth Netrapalli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HMJdXzbWKH}\n}", "github": "", "project": "", "reviewers": "Da1Z;EMzv;JvSF;UL8L", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;3;3;3", "correctness": "4;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "1;0;3;2", "wc_summary_paper": "65;87;33;61", "wc_summary_review": "39;62;58;49", "wc_main_review": "197;132;419;228", "wc_review": "301;281;510;338", "wc_reply_reviewers": "149;103;224;11", "wc_reply_authors": "558;1008;918;279", "reply_reviewers": "1;1;2;1", "reply_authors": "2;2;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 61.5, 19.20286436967152 ], "wc_summary_review_avg": [ 52.0, 8.860022573334675 ], "wc_main_review_avg": [ 244.0, 106.81058000029772 ], "wc_review_avg": [ 357.5, 90.3894352233711 ], "wc_reply_reviewers_avg": [ 121.75, 77.16014191277775 ], "wc_reply_authors_avg": [ 690.75, 291.31201056599093 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18284937111633047201&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=HMJdXzbWKH", "email": "google.com;;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "HMR-7-4-Zr", "title": "Contractive error 
feedback for gradient compression", "track": "main", "status": "Reject", "tldr": "", "abstract": "\nOn-device memory concerns in distributed deep learning are becoming more severe due to i) the growth of model size in multi-GPU training, and ii) the adoption of neural networks for federated learning on IoT devices with limited storage. In such settings, this work deals with memory issues emerging with communication efficient methods. To tackle associated challenges, key advances are that i) instead of EFSGD that inefficiently manages memory, the sweet spot of convergence and memory usage can be attained via what is here termed contractive error feedback (ConEF); and, ii) communication efficiency in ConEF should be achieved by biased and allreducable gradient compression. ConEF is validated on various learning tasks that include image classification, language modeling, and machine translation. ConEF saves 80% \u2013 90% of the extra memory in EFSGD with almost no loss on test performance, while also achieving 1.3x \u2013 5x speedup of SGD.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/0fe7238f55b504f4699d5731e56878f520e34da9.zip", "author": "Bingcong Li;Shuai Zheng;Parameswaran Raman;Anshumali Shrivastava;Georgios B. Giannakis", "authorids": "~Bingcong_Li1;~Shuai_Zheng1;~Parameswaran_Raman1;~Anshumali_Shrivastava1;~Georgios_B._Giannakis1", "gender": ";;M;M;M", "homepage": ";http://www.cse.ust.hk/~szhengac/;https://paramsraman.github.io/;https://www.cs.rice.edu/~as143/;http://spincom.umn.edu/", "dblp": ";13/8659-4;142/2573;63/9828;33/4080", "google_scholar": ";82FZpFYAAAAJ;amJUMFEAAAAJ;https://scholar.google.com.tw/citations?user=SGT23RAAAAAJ;Nu_6R8sAAAAJ", "orcid": ";;;;", "linkedin": ";;;;georgios-b-giannakis-54023b18/", "or_profile": "~Bingcong_Li1;~Shuai_Zheng1;~Parameswaran_Raman1;~Anshumali_Shrivastava1;~Georgios_B._Giannakis1", "aff": ";Amazon Web Services;Amazon;ThirdAI Corp.;", "aff_domain": ";amazon.com;amazon.com;thirdai.com;", "position": ";Senior Applied Scientist;Applied Scientist;CEO;", "bibtex": "@misc{\nli2022contractive,\ntitle={Contractive error feedback for gradient compression},\nauthor={Bingcong Li and Shuai Zheng and Parameswaran Raman and Anshumali Shrivastava and Georgios B. 
Giannakis},\nyear={2022},\nurl={https://openreview.net/forum?id=HMR-7-4-Zr}\n}", "github": "", "project": "", "reviewers": "HC53;VhWp;2Hku", "site": "https://openreview.net/forum?id=HMR-7-4-Zr", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;3;4", "correctness": "3;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "129;46;73", "wc_summary_review": "117;40;50", "wc_main_review": "558;444;358", "wc_review": "804;530;481", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 82.66666666666667, 34.56716489515576 ], "wc_summary_review_avg": [ 69.0, 34.18576701884378 ], "wc_main_review_avg": [ 453.3333333333333, 81.91594608024876 ], "wc_review_avg": [ 605.0, 142.1290493413175 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vy84blznoFEJ:scholar.google.com/&scioq=Contractive+error+feedback+for+gradient+compression&hl=en&as_sdt=0,33", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Amazon;ThirdAI Corp.", "aff_unique_dep": "Amazon Web Services;", "aff_unique_url": "https://aws.amazon.com;", "aff_unique_abbr": "AWS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "HO_LL-oqBzW", "title": "FCause: Flow-based Causal Discovery", "track": "main", "status": "Reject", "tldr": "", "abstract": "Current causal discovery methods either fail to scale, model only limited forms of functional relationships, or cannot handle missing values. This limits their reliability and applicability. We propose FCause, a new flow-based causal discovery method that addresses these drawbacks. Our method scales to both high-dimensional data and large data volumes, is able to model complex nonlinear relationships between variables, and can perform causal discovery under partially observed data. Furthermore, our formulation generalizes existing continuous optimization based causal discovery methods, providing a unified view of such models.
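The abstract does not spell out FCause's objective; as background, the "continuous optimization based" family it claims to generalize typically scores a weighted adjacency matrix W with a NOTEARS-style acyclicity function, zero exactly on acyclic graphs. A minimal numpy illustration of that building block (this names NOTEARS explicitly and is not FCause's own method):

```python
# FCause's objective is not spelled out in the abstract; as background,
# the "continuous optimization based" discovery methods it claims to
# generalize typically score a weighted adjacency matrix W with the
# NOTEARS-style acyclicity function h(W) = tr(exp(W o W)) - d, which is
# zero exactly when the implied graph is acyclic.  Minimal illustration:
import numpy as np
from scipy.linalg import expm

def acyclicity(W):
    d = W.shape[0]
    return np.trace(expm(W * W)) - d   # elementwise square keeps it >= 0

W_dag = np.array([[0.0, 1.5], [0.0, 0.0]])   # edge 0 -> 1 only: acyclic
W_cyc = np.array([[0.0, 1.5], [0.7, 0.0]])   # a 2-cycle
print(acyclicity(W_dag))                     # ~0.0
print(acyclicity(W_cyc))                     # > 0, penalized during optimization
```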
We perform an extensive empirical evaluation, and show that FCause achieves state of the art results in several causal discovery benchmarks under different conditions reflecting real-world application needs.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/20e4576d5555d93404029ce0d5c9a070c5d2b85b.zip", "author": "Tomas Geffner;Emre Kiciman;Angus Lamb;Martin Kukla;Miltiadis Allamanis;Cheng Zhang", "authorids": "~Tomas_Geffner1;~Emre_Kiciman1;~Angus_Lamb1;~Martin_Kukla1;~Miltiadis_Allamanis1;~Cheng_Zhang1", "gender": "M;M;M;;;F", "homepage": "https://people.umass.edu/tgeffner/;http://kiciman.org;;;;http://cheng-zhang.org", "dblp": "201/5406;166/5285;;;;82/6384-5", "google_scholar": "KIIe2K8AAAAJ;QZCU3NkAAAAJ;;DksFc5QAAAAJ;;r40iAwIAAAAJ", "orcid": ";0000-0001-5429-468X;;;;", "linkedin": "tomasgeffner/;;angusjlamb/;;;", "or_profile": "~Tomas_Geffner1;~Emre_Kiciman1;~Angus_Lamb1;~Martin_Kukla1;~Miltiadis_Allamanis1;~Cheng_Zhang1", "aff": "Department of Computer Science, University of Massachusetts, Amherst;Microsoft;;Microsoft Research;;Microsoft", "aff_domain": "cs.umass.edu;microsoft.com;;research.microsoft.com;;microsoft.com", "position": "PhD student;Researcher;;Researcher;;Principal Researcher", "bibtex": "@misc{\ngeffner2022fcause,\ntitle={{FC}ause: Flow-based Causal Discovery},\nauthor={Tomas Geffner and Emre Kiciman and Angus Lamb and Martin Kukla and Miltiadis Allamanis and Cheng Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=HO_LL-oqBzW}\n}", "github": "", "project": "", "reviewers": "q8GH;2kV5;Du7p;m3am;DG4z", "site": "https://openreview.net/forum?id=HO_LL-oqBzW", "pdf_size": 0, "recommendation": "3;3;5;6;8", "confidence": "3;4;3;4;3", "correctness": "2;2;3;3;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;2;0;3;0", "wc_summary_paper": "29;24;37;59;53", "wc_summary_review": "35;34;24;47;61", "wc_main_review": "397;125;246;433;113", "wc_review": "461;183;307;539;227", "wc_reply_reviewers": "0;0;0;203;75", "wc_reply_authors": "698;22;341;765;251", "reply_reviewers": "0;0;0;2;1", "reply_authors": "1;1;1;2;1", "recommendation_avg": [ 5.0, 1.8973665961010275 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.4, 1.2 ], "wc_summary_paper_avg": [ 40.4, 13.52922762023021 ], "wc_summary_review_avg": [ 40.2, 12.702755606560334 ], "wc_main_review_avg": [ 262.8, 133.18318212146758 ], "wc_review_avg": [ 343.4, 136.15520555601242 ], "wc_reply_reviewers_avg": [ 55.6, 79.21767479546469 ], "wc_reply_authors_avg": [ 415.4, 279.07318036672746 ], "reply_reviewers_avg": [ 0.6, 0.8 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.21516574145596765, "corr_recommendation_correctness": 0.9860132971832695, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uNg1qnUTmYkJ:scholar.google.com/&scioq=FCause:+Flow-based+Causal+Discovery&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Massachusetts Amherst;Microsoft", "aff_unique_dep": "Department of Computer Science;Microsoft Corporation", "aff_unique_url": "https://www.umass.edu;https://www.microsoft.com", "aff_unique_abbr": "UMass Amherst;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "GradSign: 
Model Performance Inference with Theoretical Insights", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6993", "id": "HObMhrCeAAF", "poster": "", "openreview": "https://openreview.net/forum?id=HObMhrCeAAF", "slides": "https://iclr.cc/virtual/2022/poster/6993", "video": "https://iclr.cc/virtual/2022/poster/6993", "author_site": "Zhihao Zhang, Zhihao Jia", "tldr": "", "abstract": "A key challenge in neural architecture search (NAS) is quickly inferring the predictive performance of a broad spectrum of networks to discover statistically accurate and computationally efficient ones. We refer to this task as model performance inference (MPI). The current practice for efficient MPI is gradient-based methods that leverage the gradients of a network at initialization to infer its performance. However, existing gradient-based methods rely only on heuristic metrics and lack the necessary theoretical foundations to consolidate their designs. We propose GradSign, an accurate, simple, and flexible metric for model performance inference with theoretical insights. The key idea behind GradSign is a quantity \u03a8 to analyze the sample-wise optimization landscape of different networks. Theoretically, we show that \u03a8 is an upper bound for both the training and true population losses of a neural network under reasonable assumptions. However, it is computationally prohibitive to directly calculate \u03a8 for modern neural networks. To address this challenge, we design GradSign, an accurate and simple approximation of \u03a8 using the gradients of a network evaluated at a random initialization state. Evaluation on seven NAS benchmarks across three training datasets shows that GradSign generalizes well to real-world networks and consistently outperforms state-of-the-art gradient-based methods for MPI evaluated by Spearman\u2019s \u03c1 and Kendall\u2019s Tau. Additionally, we integrate GradSign into four existing NAS algorithms and show that the GradSign-assisted NAS algorithms outperform their vanilla counterparts by improving the accuracies of best-discovered networks by up to 0.3%, 1.1%, and 1.0% on three real-world tasks.
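Since the abstract gives \u03a8 only by description, the following numpy sketch shows the general recipe rather than GradSign's exact formula: take per-sample gradients at a random initialization and score how consistently their signs agree across samples. The logistic probe model and the aggregation used here are assumptions.

```python
# The abstract gives Psi only by description, so this numpy sketch shows
# the general recipe rather than GradSign's exact formula: take
# per-sample gradients at a random initialization and score how
# consistently their signs agree across samples.  The logistic probe
# model and the aggregation sum_j |sum_i sign(g_ij)| are assumptions.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(64, 10))             # small probe batch
y = rng.integers(0, 2, size=64)
w = rng.normal(scale=0.1, size=10)        # random init of a logistic model

def per_sample_grads(w, X, y):
    p = 1.0 / (1.0 + np.exp(-X @ w))      # sigmoid predictions
    return (p - y)[:, None] * X           # d(logloss_i)/dw, one row per sample

G = per_sample_grads(w, X, y)             # (n_samples, n_params)
score = np.abs(np.sign(G).sum(axis=0)).sum()
print(score)   # higher score = more sign agreement across sample-wise landscapes
```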
Code is available at https://github.com/JackFram/GradSign", "keywords": "Model Performance Inference;Optimization Landscape;NAS", "primary_area": "", "supplementary_material": "/attachment/4a7b931e285817911856fbf0506272c72ed3d847.zip", "author": "Zhihao Zhang;Zhihao Jia", "authorids": "~Zhihao_Zhang2;~Zhihao_Jia1", "gender": ";M", "homepage": ";https://www.cs.cmu.edu/~zhihaoj2/", "dblp": "91/5464;", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": ";", "or_profile": "~Zhihao_Zhang2;~Zhihao_Jia1", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu", "position": "MS student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2022gradsign,\ntitle={GradSign: Model Performance Inference with Theoretical Insights},\nauthor={Zhihao Zhang and Zhihao Jia},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HObMhrCeAAF}\n}", "github": "", "project": "", "reviewers": "YHPS;H9Ev;CHXp", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;2;4", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "67;70;48", "wc_summary_review": "37;26;29", "wc_main_review": "72;114;498", "wc_review": "176;210;575", "wc_reply_reviewers": "0;0;99", "wc_reply_authors": "357;690;844", "reply_reviewers": "0;0;1", "reply_authors": "1;2;3", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 61.666666666666664, 9.741092797468305 ], "wc_summary_review_avg": [ 30.666666666666668, 4.642796092394707 ], "wc_main_review_avg": [ 228.0, 191.68724527208377 ], "wc_review_avg": [ 320.3333333333333, 180.61069243603 ], "wc_reply_reviewers_avg": [ 33.0, 46.66904755831214 ], "wc_reply_authors_avg": [ 630.3333333333334, 203.24424277755625 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3694655977867314060&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=HObMhrCeAAF", "email": "andrew.cmu.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "CROP: Certifying Robust Policies for Reinforcement Learning through Functional Smoothing", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6676", "id": "HOjLHrlZhmx", "poster": "", "openreview": "https://openreview.net/forum?id=HOjLHrlZhmx", "slides": "https://iclr.cc/virtual/2022/poster/6676", "video": "https://iclr.cc/virtual/2022/poster/6676", "author_site": "Fan Wu, Linyi Li, Zijian Huang, Yevgeniy Vorobeychik, DING ZHAO, Bo Li", "tldr": "", "abstract": "As reinforcement learning (RL) has achieved great success and even been adopted in safety-critical domains such as
autonomous vehicles, a range of empirical studies have been conducted to improve its robustness against adversarial attacks. However, how to certify its robustness with theoretical guarantees still remains challenging. In this paper, we present the first unified framework CROP (Certifying Robust Policies for RL) to provide robustness certification on both action and reward levels. In particular, we propose two robustness certification criteria: robustness of per-state actions and lower bound of cumulative rewards. We then develop a local smoothing algorithm for policies derived from Q-functions to guarantee the robustness of actions taken along the trajectory; we also develop a global smoothing algorithm for certifying the lower bound of a finite-horizon cumulative reward, as well as a novel local smoothing algorithm to perform adaptive search in order to obtain tighter reward certification. Empirically, we apply CROP to evaluate several existing empirically robust RL algorithms, including adversarial training and different robust regularization, in four environments (two representative Atari games, Highway, and CartPole). Furthermore, by evaluating these algorithms against adversarial attacks, we demonstrate that our certifications are often tight. All experiment results are available at the website https://crop-leaderboard.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/526a90f67b614ec1870f2265a9abb52a644f13e2.zip", "author": "Fan Wu;Linyi Li;Zijian Huang;Yevgeniy Vorobeychik;Ding Zhao;Bo Li", "authorids": "~Fan_Wu6;~Linyi_Li1;~Zijian_Huang2;~Yevgeniy_Vorobeychik1;~Ding_Zhao1;~Bo_Li19", "gender": "F;M;M;;F;M", "homepage": ";http://linyil.com;http://vorobeychik.com;https://safeai-lab.github.io;http://boli.cs.illinois.edu/;https://zijianh4.github.io", "dblp": "07/6378-11;99/4340-1.html;70/2217;;50/3402-26;205/5823-2", "google_scholar": "qd8WzBMAAAAJ;-b0sk-YAAAAJ;https://scholar.google.com.tw/citations?user=ptI-HHkAAAAJ;z7tPc9IAAAAJ;K8vJkTcAAAAJ;9dlrr8MAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;zijian-huang-0b8b8b178/", "or_profile": "~Fan_Wu6;~Linyi_Li1;~Yevgeniy_Vorobeychik1;~Ding_Zhao1;~Bo_Li19;~Zijian_HUANG1", "aff": "University of Illinois, Urbana Champaign;Microsoft Research;Washington University, St.
Louis;Carnegie Mellon University;University of Illinois, Urbana Champaign;University of Illinois, Urbana-Champaign", "aff_domain": "illinois.edu;microsoft.com;wustl.edu;cmu.edu;illinois.edu;uiuc.edu", "position": "PhD student;Research Intern;Full Professor;Associate Professor;Assistant Professor;MS student", "bibtex": "@inproceedings{\nwu2022crop,\ntitle={{CROP}: Certifying Robust Policies for Reinforcement Learning through Functional Smoothing},\nauthor={Fan Wu and Linyi Li and Zijian Huang and Yevgeniy Vorobeychik and Ding Zhao and Bo Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HOjLHrlZhmx}\n}", "github": "", "project": "", "reviewers": "ZBtu;zGtv;PEk6;tzpV", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;2;4", "correctness": "3;2;3;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "43;122;50;80", "wc_summary_review": "44;29;29;166", "wc_main_review": "789;640;216;544", "wc_review": "876;791;295;790", "wc_reply_reviewers": "0;743;176;59", "wc_reply_authors": "1915;3634;587;1854", "reply_reviewers": "0;3;1;1", "reply_authors": "4;7;1;4", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 73.75, 31.13177637077589 ], "wc_summary_review_avg": [ 67.0, 57.4847805945191 ], "wc_main_review_avg": [ 547.25, 210.2277039307617 ], "wc_review_avg": [ 688.0, 229.56807269304676 ], "wc_reply_reviewers_avg": [ 244.5, 294.69687816466603 ], "wc_reply_authors_avg": [ 1997.5, 1083.4021644800237 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 4.0, 2.1213203435596424 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.28867513459481287, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15014236512905424649&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=HOjLHrlZhmx", "email": "illinois.edu;microsoft.com;wustl.edu;cmu.edu;illinois.edu;uiuc.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;0;4", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft;Washington University in St. Louis;Carnegie Mellon University;University of Illinois", "aff_unique_dep": ";Microsoft Research;;;", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com/en-us/research;https://wustl.edu;https://www.cmu.edu;https://illinois.edu", "aff_unique_abbr": "UIUC;MSR;WUSTL;CMU;UIUC", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Urbana-Champaign;;St. Louis", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "HRF6T1SsyDn", "title": "On the Expressiveness and Learning of Relational Neural Networks on Hypergraphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper presents a framework for analyzing the expressiveness and learning of relational models applied to hypergraph reasoning tasks. We start with a general framework that unifies several relational neural network architectures: graph neural networks, neural logical machines, and transformers. 
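For concreteness, a minimal message-passing layer, the simplest member of the relational family just listed (a GNN); the mean aggregator and all shapes are illustrative assumptions, not the paper's construction.

```python
# For concreteness, a minimal message-passing layer, the simplest member
# of the relational family just listed (a GNN).  The mean aggregator and
# all shapes are illustrative assumptions, not the paper's construction.
import torch
import torch.nn as nn

class MessagePassingLayer(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.msg = nn.Linear(2 * dim, dim)   # message from (sender, receiver)
        self.upd = nn.Linear(2 * dim, dim)   # update from (state, aggregate)

    def forward(self, h, adj):               # h: (n, dim), adj: (n, n) in {0,1}
        n = h.shape[0]
        pairs = torch.cat([h[None, :, :].expand(n, n, -1),
                           h[:, None, :].expand(n, n, -1)], dim=-1)
        msgs = torch.relu(self.msg(pairs)) * adj[:, :, None]
        agg = msgs.sum(dim=1) / adj.sum(dim=1, keepdim=True).clamp(min=1)
        return torch.relu(self.upd(torch.cat([h, agg], dim=-1)))

layer = MessagePassingLayer(dim=8)
h = torch.randn(4, 8)
adj = torch.tensor([[0, 1, 0, 0], [1, 0, 1, 0],
                    [0, 1, 0, 1], [0, 0, 1, 0]], dtype=torch.float)
print(layer(h, adj).shape)                   # (4, 8)
```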
Our first contribution is a fine-grained analysis of the expressiveness of these neural networks, that is, the set of functions that they can realize and the set of problems that they can solve. Our result is a hierarchy of problems they can solve, defined in terms of various hyperparameters such as depth and width. Next, we analyze the learning properties of these neural networks, especially focusing on how they can be trained on small graphs and generalize to larger graphs. Our theoretical results are further supported by the empirical results illustrating the optimization and generalization of these models based on gradient-descent training.", "keywords": "graph neural networks;deep learning theory", "primary_area": "", "supplementary_material": "", "author": "Zhezheng Luo;Jiayuan Mao;Joshua B. Tenenbaum;Leslie Pack Kaelbling", "authorids": "~Zhezheng_Luo1;~Jiayuan_Mao1;~Joshua_B._Tenenbaum1;~Leslie_Pack_Kaelbling1", "gender": "M;F;;F", "homepage": "https://www.csail.mit.edu/person/zhezheng-luo;http://jiayuanm.com;;http://people.csail.mit.edu/lpk/", "dblp": "https://dblp.uni-trier.de/pid/258/3621.html;200/8283;t/JoshuaBTenenbaum;k/LesliePackKaelbling", "google_scholar": ";-xaOIZIAAAAJ;;IcasIiwAAAAJ", "orcid": ";0000-0003-4798-3748;;0000-0001-6054-7145", "linkedin": ";;;", "or_profile": "~Zhezheng_Luo1;~Jiayuan_Mao1;~Joshua_B._Tenenbaum1;~Leslie_Pack_Kaelbling1", "aff": ";Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": ";mit.edu;mit.edu;mit.edu", "position": ";PhD student;Professor;Full Professor", "bibtex": "@misc{\nluo2022on,\ntitle={On the Expressiveness and Learning of Relational Neural Networks on Hypergraphs},\nauthor={Zhezheng Luo and Jiayuan Mao and Joshua B.
Tenenbaum and Leslie Pack Kaelbling},\nyear={2022},\nurl={https://openreview.net/forum?id=HRF6T1SsyDn}\n}", "github": "", "project": "", "reviewers": "d5Yc;6v52;m5XE;NLAa;4AVA", "site": "https://openreview.net/forum?id=HRF6T1SsyDn", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "5;3;4;2;4", "correctness": "3;2;2;4;3", "technical_novelty": "1;3;3;2;2", "empirical_novelty": "1;2;1;2;3", "wc_summary_paper": "189;91;90;31;42", "wc_summary_review": "30;49;42;11;40", "wc_main_review": "325;411;232;271;167", "wc_review": "544;551;364;313;249", "wc_reply_reviewers": "95;0;0;0;0", "wc_reply_authors": "631;425;742;175;251", "reply_reviewers": "1;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 1.8, 0.7483314773547883 ], "wc_summary_paper_avg": [ 88.6, 55.81612670187712 ], "wc_summary_review_avg": [ 34.4, 13.184839779079608 ], "wc_main_review_avg": [ 281.2, 82.8647090141515 ], "wc_review_avg": [ 404.2, 122.5681851052711 ], "wc_reply_reviewers_avg": [ 19.0, 38.0 ], "wc_reply_authors_avg": [ 444.8, 216.21322808745998 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.32025630761017426, "corr_recommendation_correctness": 0.3273268353539886, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11977776218298069843&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "HRL6el2SBQ", "title": "Intra-class Mixup for Out-of-Distribution Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks have found widespread adoption in solving image recognition and natural language processing tasks. However, they make confident mispredictions when presented with data that does not belong to the training distribution, i.e. out-of-distribution (OoD) samples. Inter-class mixup has been shown to improve model calibration aiding OoD detection. However, we show that both empirical risk minimization and inter-class mixup create large angular spread in latent representation. This reduces the separability of in-distribution data from OoD data. In this paper we propose intra-class mixup supplemented with angular margin to improve OoD detection. Angular margin is the angle between the decision boundary normal and sample representation. We show that intra-class mixup forces the network to learn representations with low angular spread in the latent space. This improves the separability of OoD from in-distribution examples. Our approach when applied to various existing OoD detection techniques shows an improvement of 4.68% and 6.38% in AUROC performance over empirical risk minimization and inter-class mixup, respectively. 
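A hypothetical numpy sketch of the intra-class mixup operation described above: each sample is mixed only with another sample of the same class, so the hard label is kept and the class representation is pulled tighter. The Beta(alpha, alpha) coefficient follows standard mixup convention; everything else is an illustrative assumption.

```python
# Hypothetical numpy sketch of intra-class mixup as described above:
# each sample is mixed only with another sample of the SAME class, so
# the hard label is unchanged and the class representation is pulled
# tighter.  The Beta(alpha, alpha) coefficient follows standard mixup
# convention; everything else is an illustrative assumption.
import numpy as np

rng = np.random.default_rng(0)

def intra_class_mixup(X, y, alpha=1.0):
    lam = rng.beta(alpha, alpha)
    partner = np.empty(len(y), dtype=int)
    for c in np.unique(y):                    # shuffle indices within each class
        idx = np.where(y == c)[0]
        partner[idx] = rng.permutation(idx)
    return lam * X + (1 - lam) * X[partner], y   # labels stay hard, no mixing

X = rng.normal(size=(8, 4))
y = np.array([0, 0, 0, 1, 1, 1, 1, 0])
X_mix, y_mix = intra_class_mixup(X, y)
print(X_mix.shape, bool((y_mix == y).all()))     # (8, 4) True
```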
Further, our approach aided with angular margin improves AUROC performance by 7.36% and 9.10% over empirical risk minimization and inter-class mixup, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/25b90a01f26c2da1f383915ec237f4619d0c7f6c.zip", "author": "Deepak Ravikumar;Sangamesh Kodge;Isha Garg;Kaushik Roy", "authorids": "~Deepak_Ravikumar1;~Sangamesh_Kodge1;~Isha_Garg1;~Kaushik_Roy1", "gender": ";M;F;M", "homepage": ";;;https://engineering.purdue.edu/NRL/Group", "dblp": ";203/5657.html;;r/KaushikRoy", "google_scholar": ";;;to4P8KgAAAAJ", "orcid": ";0000-0001-9713-5400;;", "linkedin": ";sangameshkodge;;", "or_profile": "~Deepak_Ravikumar1;~Sangamesh_Kodge1;~Isha_Garg1;~Kaushik_Roy1", "aff": ";Purdue University;Purdue University;Purdue University", "aff_domain": ";purdue.edu;purdue.edu;purdue.edu", "position": ";PhD student;PhD student;Full Professor", "bibtex": "@misc{\nravikumar2022intraclass,\ntitle={Intra-class Mixup for Out-of-Distribution Detection},\nauthor={Deepak Ravikumar and Sangamesh Kodge and Isha Garg and Kaushik Roy},\nyear={2022},\nurl={https://openreview.net/forum?id=HRL6el2SBQ}\n}", "github": "", "project": "", "reviewers": "DAih;WTWp;Fzqo;xKKx", "site": "https://openreview.net/forum?id=HRL6el2SBQ", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;5;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "56;133;171;76", "wc_summary_review": "33;62;26;54", "wc_main_review": "705;498;95;134", "wc_review": "794;693;292;264", "wc_reply_reviewers": "377;55;39;0", "wc_reply_authors": "1101;795;219;186", "reply_reviewers": "1;1;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 109.0, 45.601535061881414 ], "wc_summary_review_avg": [ 43.75, 14.737282653189496 ], "wc_main_review_avg": [ 358.0, 254.63405114006258 ], "wc_review_avg": [ 510.75, 235.68132615886225 ], "wc_reply_reviewers_avg": [ 117.75, 151.00889874441174 ], "wc_reply_authors_avg": [ 575.25, 388.30810898048475 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.19611613513818402, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16347118176299359721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Delaunay Component Analysis for Evaluation of Data Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6421", "id": "HTVch9AMPa", "poster": "", "openreview": "https://openreview.net/forum?id=HTVch9AMPa", "slides": "https://iclr.cc/virtual/2022/poster/6421", "video": "https://iclr.cc/virtual/2022/poster/6421", "author_site": "Petra Poklukar, Vladislav Polianskii, Anastasiia Varava, Florian T. 
Pokorny, Danica Kragic", "tldr": "", "abstract": "Advanced representation learning techniques require reliable and general evaluation methods. Recently, several algorithms based on the common idea of geometric and topological analysis of a manifold approximated from the learned data representations have been proposed. In this work, we introduce Delaunay Component Analysis (DCA) -- an evaluation algorithm which approximates the data manifold using a more suitable neighbourhood graph called Delaunay graph. This provides a reliable manifold estimation even for challenging geometric arrangements of representations such as clusters with varying shape and density as well as outliers, which is where existing methods often fail. Furthermore, we exploit the nature of Delaunay graphs and introduce a framework for assessing the quality of individual novel data representations. We experimentally validate the proposed DCA method on representations obtained from neural networks trained with contrastive objective, supervised and generative models, and demonstrate various use cases of our extended single point evaluation framework.", "keywords": "Interpretation and Evaluation of Learned Representations;Generative Models;Contrastive Learning", "primary_area": "", "supplementary_material": "/attachment/af8f5937c9ea36621e5cde44a4273bc94bf1ea9d.zip", "author": "Petra Poklukar;Vladislav Polianskii;Anastasiia Varava;Florian T. Pokorny;Danica Kragic Jensfelt", "authorids": "~Petra_Poklukar1;~Vladislav_Polianskii1;~Anastasiia_Varava1;~Florian_T._Pokorny1;~Danica_Kragic_Jensfelt1", "gender": "F;M;;;", "homepage": "https://people.kth.se/~poklukar/;http://vpol.me/;https://www.kth.se/profile/varava;;", "dblp": "250/5642;242/3841;;;", "google_scholar": "HeIVDaQAAAAJ;dFoG9rkAAAAJ;;;", "orcid": "0000-0001-6920-5109;0000-0001-9805-0388;;;", "linkedin": "petra-poklukar/;;;;", "or_profile": "~Petra_Poklukar1;~Vladislav_Polianskii1;~Anastasiia_Varava1;~Florian_T._Pokorny1;~Danica_Kragic_Jensfelt1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology, Stockholm, Sweden;;", "aff_domain": "kth.se;kth.se;kth.se;;", "position": "PhD student;PhD student;Postdoc;;", "bibtex": "@inproceedings{\npoklukar2022delaunay,\ntitle={Delaunay Component Analysis for Evaluation of Data Representations},\nauthor={Petra Poklukar and Vladislav Polianskii and Anastasiia Varava and Florian T. 
Pokorny and Danica Kragic Jensfelt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HTVch9AMPa}\n}", "github": "", "project": "", "reviewers": "JKBs;vso9;vfbH", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;3", "correctness": "3;4;4", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "185;81;268", "wc_summary_review": "53;102;71", "wc_main_review": "522;588;105", "wc_review": "760;771;444", "wc_reply_reviewers": "86;28;0", "wc_reply_authors": "1423;1087;478", "reply_reviewers": "1;1;0", "reply_authors": "7;7;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 178.0, 76.5027232630752 ], "wc_summary_review_avg": [ 75.33333333333333, 20.237478982214054 ], "wc_main_review_avg": [ 405.0, 213.83638605251446 ], "wc_review_avg": [ 658.3333333333334, 151.62307065732298 ], "wc_reply_reviewers_avg": [ 38.0, 35.81433604950212 ], "wc_reply_authors_avg": [ 996.0, 391.12402125157183 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 5.333333333333333, 2.357022603955158 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10833565106730763520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=HTVch9AMPa", "email": "kth.se;kth.se;kth.se;;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Sweden" }, { "id": "HTfUrAxjPkR", "title": "Translatotron 2: Robust direct speech-to-speech translation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present Translatotron 2, a neural direct speech-to-speech translation model that can be trained end-to-end. Translatotron 2 consists of a speech encoder, a phoneme decoder, a mel-spectrogram synthesizer, and an attention module that connects all the previous three components. Experimental results suggest that Translatotron 2 outperforms the original Translatotron by a large margin in terms of translation quality and predicted speech naturalness, and drastically improves the robustness of the predicted speech by mitigating over-generation, such as babbling or long pause. We also propose a new method for retaining the source speaker's voice in the translated speech. The trained model is restricted to retain the source speaker's voice, but unlike the original Translatotron, it is not able to generate speech in a different speaker's voice, making the model more robust for production deployment, by mitigating potential misuse for creating spoofing audio artifacts. 
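A shape-level skeleton of the four components named above (speech encoder, phoneme decoder, connecting attention, mel-spectrogram synthesizer), written as an assumed PyTorch module; dimensions and layer choices are placeholders, not the published architecture.

```python
# Shape-level skeleton of the four components named above (speech
# encoder, phoneme decoder, connecting attention, mel-spectrogram
# synthesizer), written as an assumed PyTorch module.  Dimensions and
# layer choices are placeholders, not the published architecture.
import torch
import torch.nn as nn

class Translatotron2Skeleton(nn.Module):
    def __init__(self, n_mels=80, hid=256, n_phonemes=100):
        super().__init__()
        self.encoder = nn.LSTM(n_mels, hid, batch_first=True)
        self.phoneme_decoder = nn.Linear(hid, n_phonemes)
        self.attention = nn.MultiheadAttention(hid, 4, batch_first=True)
        self.synthesizer = nn.LSTM(hid, n_mels, batch_first=True)

    def forward(self, src_mel):                 # (batch, frames, n_mels)
        enc, _ = self.encoder(src_mel)
        phonemes = self.phoneme_decoder(enc)    # per-frame phoneme logits
        ctx, _ = self.attention(enc, enc, enc)  # attention ties the parts together
        out_mel, _ = self.synthesizer(ctx)
        return phonemes, out_mel

model = Translatotron2Skeleton()
ph, mel = model(torch.randn(2, 50, 80))
print(ph.shape, mel.shape)                      # (2, 50, 100), (2, 50, 80)
```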
When the new method is used together with a simple concatenation-based data augmentation, the trained Translatotron 2 model is able to retain each speaker's voice for input with speaker turns.", "keywords": "Speech-to-speech translation;voice transferring;end-to-end", "primary_area": "", "supplementary_material": "/attachment/c2be5d2d9ba10f77c0b346384b085d3eb5b46e24.zip", "author": "Ye Jia;Michelle Tadmor Ramanovich;Tal Remez;Roi Pomerantz", "authorids": "~Ye_Jia1;tadmor@google.com;talremez@google.com;roi.pomerantz@gmail.com", "gender": "M;;;", "homepage": ";;;", "dblp": "217/2520;;;", "google_scholar": "kaO4R1kAAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ye_Jia1;tadmor@google.com;talremez@google.com;roi.pomerantz@gmail.com", "aff": "Google;;;", "aff_domain": "google.com;;;", "position": "Researcher;;;", "bibtex": "@misc{\njia2022translatotron,\ntitle={Translatotron 2: Robust direct speech-to-speech translation},\nauthor={Ye Jia and Michelle Tadmor Ramanovich and Tal Remez and Roi Pomerantz},\nyear={2022},\nurl={https://openreview.net/forum?id=HTfUrAxjPkR}\n}", "github": "", "project": "", "reviewers": "rKi2;YR5V;gzba;aQuX;UaLy", "site": "https://openreview.net/forum?id=HTfUrAxjPkR", "pdf_size": 0, "recommendation": "5;5;6;6;6", "confidence": "3;5;4;2;4", "correctness": "4;4;3;3;4", "technical_novelty": "3;2;2;3;2", "empirical_novelty": "2;3;2;3;2", "wc_summary_paper": "293;167;75;177;168", "wc_summary_review": "72;8;66;33;104", "wc_main_review": "236;363;522;223;446", "wc_review": "601;538;663;433;718", "wc_reply_reviewers": "0;0;32;0;431", "wc_reply_authors": "426;390;780;225;895", "reply_reviewers": "0;0;1;0;1", "reply_authors": "1;1;1;1;2", "recommendation_avg": [ 5.6, 0.48989794855663565 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 176.0, 69.33397435601107 ], "wc_summary_review_avg": [ 56.6, 33.139704283532765 ], "wc_main_review_avg": [ 358.0, 116.42508320804413 ], "wc_review_avg": [ 590.6, 99.18185317889558 ], "wc_reply_reviewers_avg": [ 92.6, 169.65329351356547 ], "wc_reply_authors_avg": [ 543.2, 252.30727298276602 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.32025630761017415, "corr_recommendation_correctness": -0.6666666666666665, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8178820033001167059&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Hot-Refresh Model Upgrades with Regression-Free Compatible Training in Image Retrieval", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5920", "id": "HTp-6yLGGX", "poster": "", "openreview": "https://openreview.net/forum?id=HTp-6yLGGX", "slides": "https://iclr.cc/virtual/2022/poster/5920", "video": "https://iclr.cc/virtual/2022/poster/5920", "author_site": "Binjie Zhang, Yixiao Ge, Yantao Shen, Yu Li, Chun Yuan, XUYUAN XU, Yexin Wang, Ying Shan", "tldr": "", "abstract": "The task of hot-refresh model upgrades of image retrieval 
systems plays an essential role in the industry but has never been investigated in academia before. Conventional cold-refresh model upgrades can only deploy new models after the gallery is overall backfilled, taking weeks or even months for massive data. In contrast, hot-refresh model upgrades deploy the new model immediately and then gradually improve the retrieval accuracy by backfilling the gallery on-the-fly. Compatible training has made it possible; however, the problem of model regression with negative flips poses a great challenge to the stable improvement of user experience. We argue that it is mainly due to the fact that new-to-old positive query-gallery pairs may show less similarity than new-to-new negative pairs. To solve the problem, we introduce a Regression-Alleviating Compatible Training (RACT) method to properly constrain the feature compatibility while reducing negative flips. The core is to encourage the new-to-old positive pairs to be more similar than both the new-to-old negative pairs and the new-to-new negative pairs. An efficient uncertainty-based backfilling strategy is further introduced to speed up accuracy improvements. Extensive experiments on large-scale retrieval benchmarks (e.g., Google Landmark) demonstrate that our RACT effectively alleviates the model regression for one more step towards seamless model upgrades.", "keywords": "Compatible Representation Learning;Image Retrieval;Model Regression", "primary_area": "", "supplementary_material": "/attachment/8074b69255ff3099ae4fd679a3163eadb0581df7.zip", "author": "Binjie Zhang;Yixiao Ge;Yantao Shen;Yu Li;Chun Yuan;XUYUAN XU;Yexin Wang;Ying Shan", "authorids": "~Binjie_Zhang1;~Yixiao_Ge2;~Yantao_Shen2;~Yu_Li4;~Chun_Yuan1;~XUYUAN_XU1;~Yexin_Wang3;~Ying_Shan2", "gender": "M;F;M;M;M;M;M;M", "homepage": "https://binjiezhang.github.io/;https://geyixiao.com/;https://scholar.google.com.hk/citations?user=bEctTN0AAAAJ&hl=zh-CN;https://yu-li.github.io/;https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml;;;", "dblp": "275/3673;228/6649;86/3372;34/2997-3;;;51/2047.html;68/5910", "google_scholar": "https://scholar.google.com/citations?hl=en;TtU74NAAAAAJ;https://scholar.google.com.hk/citations?user=bEctTN0AAAAJ;j9lwU7kAAAAJ;https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;;4oXBp9UAAAAJ", "orcid": ";;;;;;;0000-0001-7673-8325", "linkedin": ";;;;;;yexin-wang-73304332/;YingShanProfile/", "or_profile": "~Binjie_Zhang1;~Yixiao_Ge2;~Yantao_Shen2;~Yu_Li4;~Chun_Yuan1;~XUYUAN_XU1;~Yexin_Wang3;~Ying_Shan2", "aff": "Tsinghua University;Tencent;Amazon;International Digital Economy Academy;Tsinghua University;PCG AI Technology Center;Tencent;Tencent PCG ARC Lab", "aff_domain": "tsinghua.edu.cn;tencent.com;amazon.com;idea.edu.cn;tsinghua.edu.cn;tencent.com;tencent.com;arc.tencent.com", "position": "MS student;Researcher;Researcher;Principal Researcher;Full Professor;expert engineer;Researcher;Director", "bibtex": "@inproceedings{\nzhang2022hotrefresh,\ntitle={Hot-Refresh Model Upgrades with Regression-Free Compatible Training in Image Retrieval},\nauthor={Binjie Zhang and Yixiao Ge and Yantao Shen and Yu Li and Chun Yuan and XUYUAN XU and Yexin Wang and Ying Shan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HTp-6yLGGX}\n}", "github": "", "project": "", "reviewers": "QLFh;Fu6z;6wRf;JYkL", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;4;5", "correctness": "3;3;3;4", "technical_novelty":
"3;2;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "103;68;47;135", "wc_summary_review": "45;120;32;38", "wc_main_review": "300;510;362;371", "wc_review": "448;698;441;544", "wc_reply_reviewers": "73;187;0;0", "wc_reply_authors": "887;1472;1855;1289", "reply_reviewers": "2;1;0;0", "reply_authors": "2;4;4;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.25, 33.5959446957516 ], "wc_summary_review_avg": [ 58.75, 35.66072769868837 ], "wc_main_review_avg": [ 385.75, 76.76709907245422 ], "wc_review_avg": [ 532.75, 103.7240931510129 ], "wc_reply_reviewers_avg": [ 65.0, 76.48202403179455 ], "wc_reply_authors_avg": [ 1375.75, 348.33416068482285 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.5, 0.8660254037844386 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4969289680922162705&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=HTp-6yLGGX", "email": "tsinghua.edu.cn;tencent.com;amazon.com;idea.edu.cn;tsinghua.edu.cn;tencent.com;tencent.com;arc.tencent.com", "author_num": 8, "aff_unique_index": "0;1;2;3;0;4;1;1", "aff_unique_norm": "Tsinghua University;Tencent;Amazon;International Digital Economy Academy;PCG AI Technology Center", "aff_unique_dep": ";Tencent Holdings Limited;Amazon.com, Inc.;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.tencent.com;https://www.amazon.com;;", "aff_unique_abbr": "THU;Tencent;Amazon;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;United States;" }, { "title": "Half-Inverse Gradients for Physical Deep Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6768", "id": "HTx7vrlLBEj", "poster": "", "openreview": "https://openreview.net/forum?id=HTx7vrlLBEj", "slides": "https://iclr.cc/virtual/2022/poster/6768", "video": "https://iclr.cc/virtual/2022/poster/6768", "author_site": "Patrick Schnell, Philipp Holl, Nils Thuerey", "tldr": "", "abstract": "Recent works in deep learning have shown that integrating differentiable physics simulators into the training process can greatly improve the quality of results. Although this combination represents a more complex optimization task than usual neural network training, the same gradient-based optimizers are used to minimize the loss function. However, the integrated physics solvers have a profound effect on the gradient flow as manipulating scales in magnitude and direction is an inherent property of many physical processes. Consequently, the gradient flow is often highly unbalanced and creates an environment in which existing gradient-based optimizers perform poorly. In this work, we analyze the characteristics of both physical and neural network optimizations separately to derive a new method based on a half-inversion of the Jacobian. Our approach combines principles of both classical network and physics optimizers to solve the combined optimization task. 
Compared to state-of-the-art neural network optimizers, our method converges more quickly and to better solutions, which we demonstrate on three complex learning problems involving nonlinear oscillators, the Schroedinger equation and the Poisson problem.", "keywords": "physical simulation;partial differential equations;physical loss functions;optimization", "primary_area": "", "supplementary_material": "", "author": "Patrick Schnell;Philipp Holl;Nils Thuerey", "authorids": "~Patrick_Schnell1;~Philipp_Holl1;~Nils_Thuerey1", "gender": ";M;M", "homepage": "https://ge.in.tum.de/about/patrick-schnell/;;https://ge.in.tum.de", "dblp": "293/4156;256/9374;42/478", "google_scholar": ";LilimmEAAAAJ;https://scholar.google.com.tw/citations?user=GEehwv8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Patrick_Schnell1;~Philipp_Holl1;~Nils_Thuerey1", "aff": "Technical University of Munich;Technical University Munich;Technical University Munich", "aff_domain": "tum.de;tum.de;tum.de", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nschnell2022halfinverse,\ntitle={Half-Inverse Gradients for Physical Deep Learning},\nauthor={Patrick Schnell and Philipp Holl and Nils Thuerey},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HTx7vrlLBEj}\n}", "github": "", "project": "", "reviewers": "itKt;6S7S;Khvk", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;3", "correctness": "3;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;3;3", "wc_summary_paper": "40;110;92", "wc_summary_review": "29;92;29", "wc_main_review": "106;469;303", "wc_review": "175;671;424", "wc_reply_reviewers": "0;0;65", "wc_reply_authors": "785;1160;781", "reply_reviewers": "0;0;1", "reply_authors": "1;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 80.66666666666667, 29.67977238606942 ], "wc_summary_review_avg": [ 50.0, 29.698484809834994 ], "wc_main_review_avg": [ 292.6666666666667, 148.37415168717524 ], "wc_review_avg": [ 423.3333333333333, 202.4917007901531 ], "wc_reply_reviewers_avg": [ 21.666666666666668, 30.641293851417057 ], "wc_reply_authors_avg": [ 908.6666666666666, 177.72700663908367 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1729142096110683757&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=HTx7vrlLBEj", "email": "tum.de;tum.de;tum.de", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "HUeyM2qVey2", "title": "Universal Joint Approximation of Manifolds and Densities by Simple Injective Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "We analyze neural networks composed of bijective flows and injective 
expansive elements. We find that such networks universally approximate a large class of manifolds simultaneously with densities supported on them. Among others, our results apply to the well-known coupling and autoregressive flows. We build on the work of Teshima et al. 2020 on bijective flows and study injective architectures proposed in Brehmer et al. 2020 and Kothari et al. 2021. Our results leverage a new theoretical device called the \\emph{embedding gap}, which measures how far one continuous manifold is from embedding another. We relate the embedding gap to a relaxation of universality we call the \\emph{manifold embedding property}, capturing the geometric part of universality. Our proof also establishes that optimality of a network can be established ``in reverse,'' resolving a conjecture made in Brehmer et al. 2020 and opening the door for simple layer-wise training schemes. Finally, we show that the studied networks admit an exact layer-wise projection result, Bayesian uncertainty quantification, and black-box recovery of network weights.", "keywords": "Universality;Flow Networks;Manifold Learning;Density Estimation", "primary_area": "", "supplementary_material": "", "author": "Michael Anthony Puthawala;Matti Lassas;Ivan Dokmani\u0107;Maarten V. de Hoop", "authorids": "~Michael_Anthony_Puthawala1;~Matti_Lassas1;~Ivan_Dokmani\u01071;~Maarten_V._de_Hoop2", "gender": "M;M;M;", "homepage": "https://scholar.google.com/citations?user=ntwCDpoAAAAJ&hl=en;https://www.mv.helsinki.fi/home/lassas/index.html;http://dokmanic.ece.illinois.edu;http://maartendehoop.rice.edu/", "dblp": ";;52/8859;60/4525", "google_scholar": "ntwCDpoAAAAJ;;0SQnwL4AAAAJ;", "orcid": ";0000-0003-2043-3156;;", "linkedin": ";;;", "or_profile": "~Michael_Anthony_Puthawala1;~Matti_Lassas1;~Ivan_Dokmanic1;~Maarten_v._de_Hoop1", "aff": "Rice University;University of Helsinki;University of Basel;Rice University", "aff_domain": "rice.edu;helsinki.fi;unibas.ch;rice.edu", "position": "Simons Postdoctoral Fellow;Full Professor;Associate Professor;Full Professor", "bibtex": "@misc{\nputhawala2022universal,\ntitle={Universal Joint Approximation of Manifolds and Densities by Simple Injective Flows},\nauthor={Michael Anthony Puthawala and Matti Lassas and Ivan Dokmani{\\'c} and Maarten V.
de Hoop},\nyear={2022},\nurl={https://openreview.net/forum?id=HUeyM2qVey2}\n}", "github": "", "project": "", "reviewers": "eR1p;4sjW;2dK9;WKWr", "site": "https://openreview.net/forum?id=HUeyM2qVey2", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;2;4", "correctness": "2;4;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "0;0;2;4", "wc_summary_paper": "127;33;158;47", "wc_summary_review": "63;79;8;30", "wc_main_review": "1074;591;153;65", "wc_review": "1264;703;319;142", "wc_reply_reviewers": "0;509;0;0", "wc_reply_authors": "409;3367;559;194", "reply_reviewers": "0;3;0;0", "reply_authors": "1;9;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.6583123951777 ], "wc_summary_paper_avg": [ 91.25, 52.642069678157604 ], "wc_summary_review_avg": [ 45.0, 27.721832551258224 ], "wc_main_review_avg": [ 470.75, 401.23831758694234 ], "wc_review_avg": [ 607.0, 430.12614428792864 ], "wc_reply_reviewers_avg": [ 127.25, 220.40346526313962 ], "wc_reply_authors_avg": [ 1132.25, 1296.7388663489655 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 3.25, 3.344772040064913 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.48420012470625223, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9340911345063794275&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Rice University;University of Helsinki;University of Basel", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rice.edu;https://www.helsinki.fi;https://www.unibas.ch", "aff_unique_abbr": "Rice;UH;UniBas", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "United States;Finland;Switzerland" }, { "id": "HUjgF0G9FxN", "title": "SemiFL: Communication Efficient Semi-Supervised Federated Learning with Unlabeled Clients", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated Learning allows training machine learning models by using the computation and private data resources of many distributed clients such as smartphones and IoT devices. Most existing works on Federated Learning (FL) assume the clients have ground-truth labels. However, in many practical scenarios, clients may be unable to label task-specific data, e.g., due to a lack of expertise. This work considers a server that hosts a labeled dataset and wishes to leverage clients with unlabeled data for supervised learning. We propose a new Federated Learning framework referred to as SemiFL to address Semi-Supervised Federated Learning (SSFL). In SemiFL, clients have completely unlabeled data, while the server has a small amount of labeled data. SemiFL is communication efficient since it separates the training of server-side supervised data and client-side unsupervised data. We demonstrate several strategies of SemiFL that enhance efficiency and prediction and develop intuitions of why they work. 
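One plausible round structure for the setting described above, sketched with a linear softmax model: clients train on their unlabeled data using confidence-filtered pseudo-labels, the server averages the returned weights (FedAvg-style) and then fine-tunes on its small labeled set. All helpers and constants here are assumptions, and the strong-augmentation step is omitted for brevity.

```python
# One plausible round structure for the setting above, sketched with a
# linear softmax model: clients train on unlabeled data using
# confidence-filtered pseudo-labels, the server averages the returned
# weights (FedAvg-style) and then fine-tunes on its small labeled set.
# All helpers and constants are assumptions; strong augmentation is
# omitted for brevity.
import numpy as np

rng = np.random.default_rng(0)
n_feat, n_cls = 4, 3

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def sgd_step(W, X, Y_onehot, lr=0.1):
    G = X.T @ (softmax(X @ W) - Y_onehot) / len(X)   # softmax-regression gradient
    return W - lr * G

def client_update(W, X_u, threshold=0.5):
    P = softmax(X_u @ W)
    keep = P.max(axis=1) > threshold                 # keep confident samples only
    pseudo = np.eye(n_cls)[P.argmax(axis=1)]         # fixed hard pseudo-labels
    return sgd_step(W, X_u[keep], pseudo[keep]) if keep.any() else W

W = rng.normal(size=(n_feat, n_cls))                 # current global model
clients = [rng.normal(size=(32, n_feat)) for _ in range(5)]   # unlabeled shards
X_lab = rng.normal(size=(16, n_feat))                # server's labeled set
Y_lab = np.eye(n_cls)[rng.integers(0, n_cls, 16)]

W = np.mean([client_update(W, X_u) for X_u in clients], axis=0)
W = sgd_step(W, X_lab, Y_lab)                        # server supervised step
print(W.shape)                                       # (4, 3)
```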
In particular, we provide a theoretical understanding of the use of strong data augmentation for Semi-Supervised Learning (SSL), which can be interesting in its own right.\nExtensive empirical evaluations demonstrate that our communication efficient method can significantly improve the performance of a labeled server with unlabeled clients. Moreover, we demonstrate that SemiFL can outperform many existing FL results trained with fully supervised data, and perform competitively with the state-of-the-art centralized SSL methods. For instance, in standard communication efficient scenarios, our method can achieve $93\\%$ accuracy on the CIFAR10 dataset with only $4000$ labeled samples at the server. Such accuracy is only $2\\%$ away from the result trained with $50000$ fully labeled samples, and it improves by about $30\\%$ upon existing SSFL methods in the communication efficient setting.", "keywords": "Federated Learning;Semi-Supervised Learning;Strong data augmentation;Unlabeled data", "primary_area": "", "supplementary_material": "", "author": "Enmao Diao;Jie Ding;Vahid Tarokh", "authorids": "~Enmao_Diao1;~Jie_Ding2;~Vahid_Tarokh1", "gender": "M;M;", "homepage": "https://diaoenmao.com/;http://jding.org;", "dblp": "226/5549;94/1825-2;", "google_scholar": "jhVVjF4AAAAJ;ZyqvoqcAAAAJ;", "orcid": "0000-0002-9151-7990;;", "linkedin": "enmaodiao/;;", "or_profile": "~Enmao_Diao1;~Jie_Ding2;~Vahid_Tarokh1", "aff": "Duke University;University of Minnesota, Minneapolis;", "aff_domain": "duke.edu;umn.edu;", "position": "PhD student;Assistant Professor;", "bibtex": "@misc{\ndiao2022semifl,\ntitle={Semi{FL}: Communication Efficient Semi-Supervised Federated Learning with Unlabeled Clients},\nauthor={Enmao Diao and Jie Ding and Vahid Tarokh},\nyear={2022},\nurl={https://openreview.net/forum?id=HUjgF0G9FxN}\n}", "github": "", "project": "", "reviewers": "Yyfe;v9ki;9q2b", "site": "https://openreview.net/forum?id=HUjgF0G9FxN", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;3", "correctness": "3;3;3", "technical_novelty": "3;1;3", "empirical_novelty": "2;1;3", "wc_summary_paper": "124;61;83", "wc_summary_review": "104;23;49", "wc_main_review": "548;311;160", "wc_review": "776;395;292", "wc_reply_reviewers": "17;0;0", "wc_reply_authors": "626;848;216", "reply_reviewers": "1;0;0", "reply_authors": "1;3;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 89.33333333333333, 26.10661899893503 ], "wc_summary_review_avg": [ 58.666666666666664, 33.76717669901086 ], "wc_main_review_avg": [ 339.6666666666667, 159.69206478581066 ], "wc_review_avg": [ 487.6666666666667, 208.1735387176339 ], "wc_reply_reviewers_avg": [ 5.666666666666667, 8.013876853447538 ], "wc_reply_authors_avg": [ 563.3333333333334, 261.7904165973656 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 0.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11344906684401104391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Duke University;University of Minnesota", "aff_unique_dep": ";", 
"aff_unique_url": "https://www.duke.edu;https://www.minnesota.edu", "aff_unique_abbr": "Duke;UMN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "HY6i9FYBeFG", "title": "S3: Supervised Self-supervised Learning under Label Noise", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the large progress in supervised learning with Neural Networks, there are significant challenges in obtaining high-quality, large-scale and accurately labeled datasets. In this context, in this paper we address the problem of classification in the presence of noisy labels and more specifically, both close-set and open-set label noise, that is when the true label of a sample may, or may not belong to the set of the given labels. In the heart of our method is a sample selection mechanism that relies on the consistency between the annotated label of a sample and the distribution of the labels in its neighborhood in the feature space, a relabeling mechanism that relies on the confidence of the classifier across subsequent iterations and a training strategy that trains the encoder both with a self-consistency loss and the classifier-encoder with cross-entropy loss on the selected samples alone. Without bells and whistles, such as co-training so as to reduce the self-confirmation bias, our method significantly surpasses previous methods on both CIFAR10/CIFAR100 with artificial noise and real-world noisy datasets such as WebVision and ANIMAL-10N.", "keywords": "Learning under label noise;Supervised learning;Self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Chen Feng;Georgios Tzimiropoulos;Ioannis Patras", "authorids": "~Chen_Feng3;~Georgios_Tzimiropoulos1;~Ioannis_Patras2", "gender": "M;M;M", "homepage": "https://mrchenfeng.github.io/;https://ytzimiro.github.io/;http://www.eecs.qmul.ac.uk/~ioannisp/", "dblp": ";03/3273;18/1556", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=D4JkWxf-8fwC;https://scholar.google.com.tw/citations?user=OBYLxRkAAAAJ", "orcid": "0000-0001-9199-559X;;0000-0003-3913-4738", "linkedin": "drchenfeng/;;ioannis-patras-1053767/", "or_profile": "~Chen_Feng3;~Georgios_Tzimiropoulos1;~Ioannis_Patras2", "aff": "Queen Mary University London;Queen Mary University London;Queen Mary, University of London", "aff_domain": "qmul.ac.uk;qmul.ac.uk;qmul.ac.uk", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nfeng2022s,\ntitle={S3: Supervised Self-supervised Learning under Label Noise},\nauthor={Chen Feng and Georgios Tzimiropoulos and Ioannis Patras},\nyear={2022},\nurl={https://openreview.net/forum?id=HY6i9FYBeFG}\n}", "github": "", "project": "", "reviewers": "pgYs;cqfk;bqh4", "site": "https://openreview.net/forum?id=HY6i9FYBeFG", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "38;73;121", "wc_summary_review": "140;257;81", "wc_main_review": "87;292;546", "wc_review": "265;622;748", "wc_reply_reviewers": "0;820;105", "wc_reply_authors": "856;3606;1738", "reply_reviewers": "0;3;3", "reply_authors": "2;8;4", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], 
"wc_summary_paper_avg": [ 77.33333333333333, 34.022868126534476 ], "wc_summary_review_avg": [ 159.33333333333334, 73.14065596886286 ], "wc_main_review_avg": [ 308.3333333333333, 187.74154811572447 ], "wc_review_avg": [ 545.0, 204.5629487468344 ], "wc_reply_reviewers_avg": [ 308.3333333333333, 364.33348581881535 ], "wc_reply_authors_avg": [ 2066.6666666666665, 1146.4848693094714 ], "reply_reviewers_avg": [ 2.0, 1.4142135623730951 ], "reply_authors_avg": [ 4.666666666666667, 2.494438257849294 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14826700535911012490&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Queen Mary University of London;Queen Mary, University of London", "aff_unique_dep": ";", "aff_unique_url": "https://www.qmul.ac.uk;https://www.qmul.ac.uk", "aff_unique_abbr": "QMUL;QMUL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "HZ83Rymg-tf", "title": "L2E: Learning to Exploit Your Opponent", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Opponent modeling is essential to exploit sub-optimal opponents in strategic interactions. Most previous works focus on building explicit models to directly predict the opponents' styles or strategies, which require a large amount of data to train the model and lack adaptability to unknown opponents. In this work, we propose a novel Learning to Exploit (L2E) framework for implicit opponent modeling. L2E acquires the ability to exploit opponents by a few interactions with different opponents during training, thus can adapt to new opponents with unknown styles during testing quickly. We propose a novel opponent strategy generation algorithm that produces effective opponents for training automatically. We evaluate L2E on two poker games, a grid-world soccer environment, and a high-dimensional simulated robot environment, which are very challenging benchmarks for opponent modeling. 
Comprehensive experimental results indicate that L2E quickly adapts to diverse styles of unknown opponents.", "keywords": "Opponent Modeling;Learning to Learn", "primary_area": "", "supplementary_material": "", "author": "Zhe Wu;Kai Li;Enmin Zhao;Hang Xu;Haobo Fu;QIANG FU;Bo An;Junliang Xing", "authorids": "~Zhe_Wu6;~Kai_Li2;~Enmin_Zhao1;~Hang_Xu5;~Haobo_Fu2;~QIANG_FU8;~Bo_An2;~Junliang_Xing1", "gender": ";M;M;M;M;M;M;M", "homepage": "https://github.com/GoooKuuu;;https://github.com/ZhaoEnMin;;;https://personal.ntu.edu.sg/boan/;http://people.ucas.ac.cn/~jlxing?language=en;https://github.com/rpSebastian", "dblp": ";181/2853;;85/8571;;42/6178-1.html;43/7659.html;", "google_scholar": ";_cY_PXgAAAAJ;;LFdJXNcAAAAJ;gANaxT0AAAAJ;PEEpuNwAAAAJ;jSwNd3MAAAAJ;", "orcid": ";;;;;0000-0002-7064-7438;0000-0001-6801-0510;", "linkedin": ";;;haobo-fu-382b0784/;;;https://www.linkedin.cn/incareer/in/ACoAAAvlU14B40ZWH1pxg5JJDtQ6LlgMYkp0e5s;", "or_profile": "~Zhe_Wu6;~Kai_Li2;~Enmin_Zhao1;~Haobo_Fu2;~QIANG_FU8;~Bo_An2;~Junliang_Xing1;~Xu_Hang1", "aff": "Institute of automation, Chinese academy of science;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Tencent AI Lab;Tencent AI Lab;Nanyang Technological University;Institute of Automation, Chinese Academy of Sciences; University of Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;ia.ac.cn;tencent.com;tencent.com;ntu.edu.sg;ia.ac.cn;ia.ac.cn", "position": "MS student;Associate Professor;PhD student;Principal Researcher;Principal Researcher;Full Professor;Full Professor;PhD student", "bibtex": "@misc{\nwu2022le,\ntitle={L2E: Learning to Exploit Your Opponent},\nauthor={Zhe Wu and Kai Li and Enmin Zhao and Hang Xu and Haobo Fu and QIANG FU and Bo An and Junliang Xing},\nyear={2022},\nurl={https://openreview.net/forum?id=HZ83Rymg-tf}\n}", "github": "", "project": "", "reviewers": "TKhY;jQFg;KhRh", "site": "https://openreview.net/forum?id=HZ83Rymg-tf", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;4;4", "correctness": "2;2;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "91;34;74", "wc_summary_review": "68;75;44", "wc_main_review": "5181;346;261", "wc_review": "5340;455;379", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "18;0;0", "reply_reviewers": "0;0;0", "reply_authors": "1;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 66.33333333333333, 23.893281249943232 ], "wc_summary_review_avg": [ 62.333333333333336, 13.27487183449325 ], "wc_main_review_avg": [ 1929.3333333333333, 2299.5373930906676 ], "wc_review_avg": [ 2058.0, 2320.931853085451 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 6.0, 8.48528137423857 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 1.0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14581788608741994918&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;1;2;0;3;0", "aff_unique_norm": 
"Chinese Academy of Sciences;Tencent;Nanyang Technological University;University of Chinese Academy of Sciences", "aff_unique_dep": "Institute of Automation;Tencent AI Lab;;", "aff_unique_url": "http://www.ia.cas.cn;https://ai.tencent.com;https://www.ntu.edu.sg;http://www.ucas.ac.cn", "aff_unique_abbr": "CAS;Tencent AI Lab;NTU;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0;0;0", "aff_country_unique": "China;Singapore" }, { "id": "H_qwVb8DQb-", "title": "Balancing Average and Worst-case Accuracy in Multitask Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "When training and evaluating machine learning models on a large number of tasks, it is important to not only look at average task accuracy---which may be biased by easy or redundant tasks---but also worst-case accuracy (i.e. the performance on the task with the lowest accuracy). In this work, we show how to use techniques from the distributionally robust optimization (DRO) literature to improve worst-case performance in multitask learning. We highlight several failure cases of DRO when applied off-the-shelf and present an improved method, Lookahead-DRO (L-DRO), which mitigates these issues. The core idea of L-DRO is to anticipate the interaction between tasks during training in order to choose a dynamic re-weighting of the various task losses, which will (i) lead to minimal worst-case loss and (ii) train on as many tasks as possible. After demonstrating the efficacy of L-DRO on a small controlled synthetic setting, we evaluate it on two realistic benchmarks: a multitask version of the CIFAR-100 image classification dataset and a large-scale multilingual language modeling experiment. Our empirical results show that L-DRO achieves a better trade-off between average and worst-case accuracy with little computational overhead compared to several strong baselines.", "keywords": "multitask learning;distributionally robust optimization", "primary_area": "", "supplementary_material": "", "author": "Paul Michel;Sebastian Ruder;Dani Yogatama", "authorids": "~Paul_Michel1;~Sebastian_Ruder2;~Dani_Yogatama2", "gender": "M;;M", "homepage": "https://pmichel31415.github.io/;;http://sebastianruder.com/", "dblp": "185/1024;08/8178;186/7066", "google_scholar": "oyyIf0YAAAAJ;;https://scholar.google.de/citations?user=8ONXPV8AAAAJ", "orcid": ";;", "linkedin": "paul-michel-4954b799/;;sebastianruder", "or_profile": "~Paul_Michel1;~Dani_Yogatama1;~Sebastian_Ruder1", "aff": "Ecole Normale Sup\u00e9rieure de Paris;Google DeepMind;Google", "aff_domain": "ens.fr;google.com;google.com", "position": "Postdoc;Research Scientist;Research scientist", "bibtex": "@misc{\nmichel2022balancing,\ntitle={Balancing Average and Worst-case Accuracy in Multitask Learning},\nauthor={Paul Michel and Sebastian Ruder and Dani Yogatama},\nyear={2022},\nurl={https://openreview.net/forum?id=H_qwVb8DQb-}\n}", "github": "", "project": "", "reviewers": "Bw6i;NzDd;j3ej;TtaP", "site": "https://openreview.net/forum?id=H_qwVb8DQb-", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;4", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "67;184;58;170", "wc_summary_review": "39;36;7;89", "wc_main_review": "213;487;177;592", "wc_review": "319;707;242;851", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "550;642;262;588", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 
0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 119.75, 57.55160727555747 ], "wc_summary_review_avg": [ 42.75, 29.4819860253681 ], "wc_main_review_avg": [ 367.25, 176.66405265361712 ], "wc_review_avg": [ 529.75, 255.84895446337083 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 510.5, 147.14873427929987 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3459274576965650036&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure de Paris;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.ens.fr;https://deepmind.com", "aff_unique_abbr": "ENS Paris;DeepMind", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Paris;;Mountain View", "aff_country_unique_index": "0;1;2", "aff_country_unique": "France;United Kingdom;United States" }, { "title": "Generative Pseudo-Inverse Memory", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6447", "id": "Harn4_EZBw", "poster": "", "openreview": "https://openreview.net/forum?id=Harn4_EZBw", "slides": "https://iclr.cc/virtual/2022/poster/6447", "video": "https://iclr.cc/virtual/2022/poster/6447", "author_site": "Kha Pham, Hung Le, Man Ngo, Truyen Tran, Bao Ho, Svetha Venkatesh", "tldr": "", "abstract": "We propose Generative Pseudo-Inverse Memory (GPM), a class of deep generative memory models that are fast to write in and read out. Memory operations are recast as seeking robust solutions of linear systems, which naturally leads to the use of matrix pseudo-inverses. The pseudo-inverses are iteratively approximated, with practical computation complexity of almost $O(1)$. We prove theoretically and verify empirically that our model can retrieve exactly what has been written to the memory under mild conditions. A key capability of GPM is iterative reading, during which the attractor dynamics towards fixed points are enabled, allowing the model to iteratively improve sample quality in denoising and generating. More impressively, GPM can store a large amount of data while maintaining its key abilities of accurately retrieving stored patterns, denoising corrupted data, and generating novel samples. 
Empirically we demonstrate the efficiency and versatility of GPM on a comprehensive suite of experiments involving binarized MNIST, binarized Omniglot, FashionMNIST, CIFAR10 & CIFAR100 and CelebA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kha Pham;Hung Le;Man Ngo;Truyen Tran;Bao Ho;Svetha Venkatesh", "authorids": "~Kha_Pham2;~Hung_Le1;nmman@hcmus.edu.vn;~Truyen_Tran1;bao.ho@jvn.edu.vn;~Svetha_Venkatesh1", "gender": ";;;M;;F", "homepage": ";;;http://truyentran.github.io;;https://www.deakin.edu.au/about-deakin/people/svetha-venkatesh", "dblp": ";;;55/2269;;81/1984", "google_scholar": ";;;https://scholar.google.com.au/citations?user=zvspVLwAAAAJ;;AEkRUQcAAAAJ", "orcid": ";;;0000-0001-6531-8907;;", "linkedin": ";;;truyen-tran;;", "or_profile": "~Kha_Pham2;~Hung_Le1;nmman@hcmus.edu.vn;~Truyen_Tran1;bao.ho@jvn.edu.vn;~Svetha_Venkatesh1", "aff": ";;;Deakin University, Australia;;Deakin University", "aff_domain": ";;;deakin.edu.au;;deakin.edu.au", "position": ";;;Associate Professor;;Full Professor", "bibtex": "@inproceedings{\npham2022generative,\ntitle={Generative Pseudo-Inverse Memory},\nauthor={Kha Pham and Hung Le and Man Ngo and Truyen Tran and Bao Ho and Svetha Venkatesh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Harn4_EZBw}\n}", "github": "", "project": "", "reviewers": "DcwF;NH7Y;m6LR", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "2;4;4", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "158;86;116", "wc_summary_review": "40;47;58", "wc_main_review": "435;366;169", "wc_review": "633;499;343", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "563;741;257", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 120.0, 29.5296461204668 ], "wc_summary_review_avg": [ 48.333333333333336, 7.408703590297623 ], "wc_main_review_avg": [ 323.3333333333333, 112.70709333883511 ], "wc_review_avg": [ 491.6666666666667, 118.50550854519615 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 520.3333333333334, 199.88218752277274 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5000000000000001, "corr_recommendation_correctness": 1.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3746926696600690232&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Harn4_EZBw", "email": ";;;deakin.edu.au;;deakin.edu.au", "author_num": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Deakin University", "aff_unique_dep": "", "aff_unique_url": "https://www.deakin.edu.au", "aff_unique_abbr": "Deakin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "HavXnq6KyT3", "title": "Optimizing Class Distribution in Memory for Multi-Label Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Continual learning, which tries to learn from a data stream with non-stationary distribution, is an important yet 
challenging problem. One of the most effective ways to solve this problem is to use replay-based methods, in which a replay buffer called memory is maintained to keep a small part of past samples, and the model rehearses these samples to maintain its performance on the old distribution when learning on the new distribution. Most existing replay-based methods focus on single-label problems in which each sample in the data stream has only one label. But many real applications are multi-label problems in which each sample may have more than one label. To the best of our knowledge, there exists only one method, called partition reservoir sampling (PRS), for multi-label continual learning problems. PRS suffers from low speed due to its complicated process. In this paper, we propose a novel method, called optimizing class distribution in memory (OCDM), for multi-label continual learning. OCDM formulates the memory update mechanism as an optimization problem and updates the memory by solving this problem. Experiments on two widely used multi-label datasets show that OCDM outperforms other state-of-the-art methods including PRS in terms of accuracy, and it is also much faster than PRS.", "keywords": "online continual learning", "primary_area": "", "supplementary_material": "/attachment/3de7ebf81dc621b470cae03cb6a7153a556bf9ff.zip", "author": "Yan-Shuo Liang;Wu-Jun Li", "authorids": "~Yan-Shuo_Liang1;~Wu-Jun_Li1", "gender": "M;M", "homepage": "https://liangyanshuo.github.io/;https://cs.nju.edu.cn/lwj/", "dblp": "329/6195;26/988.html", "google_scholar": ";NCCdqdcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yan-Shuo_Liang1;~Wu-Jun_Li1", "aff": "Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@misc{\nliang2022optimizing,\ntitle={Optimizing Class Distribution in Memory for Multi-Label Continual Learning},\nauthor={Yan-Shuo Liang and Wu-Jun Li},\nyear={2022},\nurl={https://openreview.net/forum?id=HavXnq6KyT3}\n}", "github": "", "project": "", "reviewers": "8uEn;k2FE;doDn;YjYY", "site": "https://openreview.net/forum?id=HavXnq6KyT3", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;5;3;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "43;33;68;128", "wc_summary_review": "30;35;65;206", "wc_main_review": "342;211;303;711", "wc_review": "415;279;436;1045", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.0, 36.9120576505835 ], "wc_summary_review_avg": [ 84.0, 71.69728028314603 ], "wc_main_review_avg": [ 391.75, 190.35673746941558 ], "wc_review_avg": [ 543.75, 295.6056283293672 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16428506084841439098&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": 
"https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Churn Reduction via Distillation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6348", "id": "HbtFCX2PLq0", "poster": "", "openreview": "https://openreview.net/forum?id=HbtFCX2PLq0", "slides": "https://iclr.cc/virtual/2022/poster/6348", "video": "https://iclr.cc/virtual/2022/poster/6348", "author_site": "Heinrich Jiang, Harikrishna Narasimhan, Dara Bahri, Andrew Cotter, Afshin Rostamizadeh", "tldr": "", "abstract": "In real-world systems, models are frequently updated as more data becomes available, and in addition to achieving high accuracy, the goal is to also maintain a low difference in predictions compared to the base model (i.e. predictive churn). If model retraining results in vastly different behavior, then it could cause negative effects in downstream systems, especially if this churn can be avoided with limited impact on model accuracy. In this paper, we show an equivalence between training with distillation using the base model as the teacher and training with an explicit constraint on the predictive churn. We then show that distillation performs strongly for low churn training against a number of recent baselines on a wide range of datasets and model architectures, including fully-connected networks, convolutional networks, and transformers.", "keywords": "distillation;churn;constraints", "primary_area": "", "supplementary_material": "", "author": "Heinrich Jiang;Harikrishna Narasimhan;Dara Bahri;Andrew Cotter;Afshin Rostamizadeh", "authorids": "~Heinrich_Jiang1;~Harikrishna_Narasimhan1;~Dara_Bahri1;~Andrew_Cotter1;~Afshin_Rostamizadeh1", "gender": "M;M;M;M;", "homepage": ";https://hari-research.github.io/;http://www.dara.run;;", "dblp": "182/2472;56/7573;231/7656;https://dblp.org/pers/c/Cotter:Andrew.html;97/4479", "google_scholar": ";7X_oT4YAAAAJ;j5PpTOwAAAAJ;gh3ut4MAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Heinrich_Jiang1;~Harikrishna_Narasimhan1;~Dara_Bahri1;~Andrew_Cotter1;~Afshin_Rostamizadeh1", "aff": "Google;Google;Google Research;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com", "position": "Research scientist;Research Scientist;Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\njiang2022churn,\ntitle={Churn Reduction via Distillation},\nauthor={Heinrich Jiang and Harikrishna Narasimhan and Dara Bahri and Andrew Cotter and Afshin Rostamizadeh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HbtFCX2PLq0}\n}", "github": "", "project": "", "reviewers": "nqfu;pZBb;TJ4g", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "4;2;4", "correctness": "3;4;3", "technical_novelty": "2;4;3", "empirical_novelty": "2;4;4", "wc_summary_paper": "25;103;136", "wc_summary_review": "27;22;86", "wc_main_review": "160;62;393", "wc_review": "212;187;615", "wc_reply_reviewers": "0;19;0", "wc_reply_authors": "519;11;214", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 88.0, 
46.54030511288038 ], "wc_summary_review_avg": [ 45.0, 29.06314963431642 ], "wc_main_review_avg": [ 205.0, 138.82603022008036 ], "wc_review_avg": [ 338.0, 196.13430772474933 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 8.956685895029603 ], "wc_reply_authors_avg": [ 248.0, 208.77899000298538 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.5, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8111264285737066191&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=HbtFCX2PLq0", "email": "google.com;google.com;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "HdnUQk9jbUO", "title": "Linear Convergence of SGD on Overparametrized Shallow Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the non-convex landscape, first-order methods can be shown to reach global minima when training overparameterized neural networks, where the number of parameters far exceeds the number of training samples. In this work, we prove linear convergence of stochastic gradient descent when training a two-layer neural network with smooth activations. While the existing theory either requires a high degree of overparameterization or non-standard initialization and training strategies, e.g., training only a single layer, we show that a subquadratic scaling on the width is sufficient under standard initialization when training both layers simultaneously, provided the minibatch size is sufficiently large and also grows with the number of training examples. 
Via the batch size, our results interpolate between the state-of-the-art subquadratic results for gradient descent and the quadratic results in the worst case.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paul Rolland;Ali Ramezani-Kebrya;ChaeHwan Song;Fabian Latorre;Volkan Cevher", "authorids": "~Paul_Rolland1;~Ali_Ramezani-Kebrya1;~ChaeHwan_Song1;~Fabian_Latorre1;~Volkan_Cevher1", "gender": "M;;F;M;M", "homepage": ";https://alirk.github.io/;;https://fabianlatorre.com;http://lions.epfl.ch", "dblp": "215/4294;129/4841;;244/9638;70/5301", "google_scholar": ";qZ8KukkAAAAJ;;B46S5NwAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;;;", "linkedin": ";;https://ch.linkedin.com/in/chaehwan-song-177047170;;", "or_profile": "~Paul_Rolland1;~Ali_Ramezani-Kebrya1;~ChaeHwan_Song1;~Fabian_Latorre1;~Volkan_Cevher1", "aff": "Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology Lausanne;SalesForce.com;Swiss Institute of Technology", "aff_domain": "epfl.ch;epfl.ch;epfl.ch;salesforce.com;epfl.ch", "position": "PhD student;Postdoc;PhD student;Intern;Associate Professor", "bibtex": "@misc{\nrolland2022linear,\ntitle={Linear Convergence of {SGD} on Overparametrized Shallow Neural Networks},\nauthor={Paul Rolland and Ali Ramezani-Kebrya and ChaeHwan Song and Fabian Latorre and Volkan Cevher},\nyear={2022},\nurl={https://openreview.net/forum?id=HdnUQk9jbUO}\n}", "github": "", "project": "", "reviewers": "SWMd;t48E;Vg9p;2P9T", "site": "https://openreview.net/forum?id=HdnUQk9jbUO", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;3;4;4", "correctness": "2;4;3;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "0;2;0;1", "wc_summary_paper": "52;103;73;110", "wc_summary_review": "6;41;21;119", "wc_main_review": "385;712;388;228", "wc_review": "443;856;482;457", "wc_reply_reviewers": "398;0;0;0", "wc_reply_authors": "1158;534;718;231", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.5, 23.350588857671234 ], "wc_summary_review_avg": [ 46.75, 43.52226441719227 ], "wc_main_review_avg": [ 428.25, 176.14252042025515 ], "wc_review_avg": [ 559.5, 171.75345702488787 ], "wc_reply_reviewers_avg": [ 99.5, 172.33905535310328 ], "wc_reply_authors_avg": [ 660.25, 335.88865342550645 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.8528028654224418, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2410539395427686474&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Salesforce;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.salesforce.com;https://www.ethz.ch", "aff_unique_abbr": "EPFL;Salesforce;ETH Zurich", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Switzerland;United States" }, { "title": "Selective Ensembles for Consistent 
Predictions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6729", "id": "HfUyCRBeQc", "poster": "", "openreview": "https://openreview.net/forum?id=HfUyCRBeQc", "slides": "https://iclr.cc/virtual/2022/poster/6729", "video": "https://iclr.cc/virtual/2022/poster/6729", "author_site": "Emily Black, Klas Leino, Matt Fredrikson", "tldr": "", "abstract": "Recent work has shown that models trained to the same objective, and which achieve similar measures of accuracy on consistent test data, may nonetheless behave very differently on individual predictions. This inconsistency is undesirable in high-stakes contexts, such as medical diagnosis and finance. We show that this duplicitous behavior extends beyond predictions to feature attributions, which may likewise have negative implications for the intelligibility of a model, and one's ability to find recourse for subjects. We then introduce selective ensembles to mitigate such inconsistencies by applying hypothesis testing to the predictions of a set of models trained using randomly-selected starting conditions; importantly, selective ensembles can abstain in cases where a consistent outcome cannot be achieved up to a specified confidence level. We prove that that prediction disagreement between selective ensembles is bounded, and empirically demonstrate that selective ensembles achieve consistent predictions and feature attributions while maintaining low abstention rates. On several benchmark datasets, selective ensembles reach zero inconsistently predicted points, with abstention rates as low as 1.5%.", "keywords": "consistency;prediction consistency;model duplicity;inconsistent predictions;deep models;deep networks;explanations;saliency maps;gradient-based explanations;fairness;interpretability", "primary_area": "", "supplementary_material": "/attachment/fa0e74655ac3aaa30612686210e7874379e58615.zip", "author": "Emily Black;Klas Leino;Matt Fredrikson", "authorids": "~Emily_Black1;~Klas_Leino1;~Matt_Fredrikson1", "gender": "F;M;M", "homepage": "https://emblack.github.io/;https://klas.leino.tech;https://cs.cmu.edu/~mfredrik", "dblp": "197/2977;;38/2612", "google_scholar": "dBkGY6gAAAAJ;;https://scholar.google.com.tw/citations?user=tMYCvLAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Emily_Black1;~Klas_Leino1;~Matt_Fredrikson1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cs.cmu.edu;cmu.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nblack2022selective,\ntitle={Selective Ensembles for Consistent Predictions},\nauthor={Emily Black and Klas Leino and Matt Fredrikson},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HfUyCRBeQc}\n}", "github": "", "project": "", "reviewers": "dvdh;AzPU;5ZWm;hSPx", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;2;2;4", "correctness": "4;2;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "39;69;156;75", "wc_summary_review": "48;31;68;59", "wc_main_review": "215;245;155;469", "wc_review": "302;345;379;603", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "642;1663;19;1122", "reply_reviewers": "0;0;0;0", "reply_authors": "1;3;1;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], 
"wc_summary_paper_avg": [ 84.75, 43.338060639581 ], "wc_summary_review_avg": [ 51.5, 13.793114224133722 ], "wc_main_review_avg": [ 271.0, 118.81919036923287 ], "wc_review_avg": [ 407.25, 116.26344008328671 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 861.5, 605.8566249534621 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11565273844338967232&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=HfUyCRBeQc", "email": "cmu.edu;cs.cmu.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Hfw5Q2Zn1w", "title": "Modeling and Eliminating Adversarial Examples using Function Theory of Several Complex Variables", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The reliability of a learning model is key to the successful deployment of machine learning in various industries. Training a robust model, unaffected by adversarial attacks, requires a comprehensive understanding of the adversarial examples phenomenon. This paper presents a model and a solution for the existence and transfer of adversarial examples in analytic hypotheses. Grounded in the function theory of several complex variables, we propose the class of complex-valued holomorphic hypotheses as a natural way to represent the submanifold of the samples and the decision boundary simultaneously. To describe the mechanism in which the adversarial examples occur and transfer, we specialize the definitions of the optimal Bayes and the maximum margin classifiers to this class of hypotheses. The approach is validated initially on both synthetic and real-world classification problems using polynomials. 
Backed by theoretical and experimental results, we believe the analysis to apply to other classes of analytic hypotheses such as neural networks.", "keywords": "adversarial examples;learning theory;robust training;complex analysis", "primary_area": "", "supplementary_material": "/attachment/0310d6b59cd86e3b6fba394f4569faf0a338887c.zip", "author": "Ramin Barati;Reza Safabakhsh;Mohammad Rahmati", "authorids": "~Ramin_Barati1;~Reza_Safabakhsh1;~Mohammad_Rahmati1", "gender": "M;M;M", "homepage": ";https://scholar.google.com/citations?user=zFsdqo8AAAAJ&hl=nl&oi=ao;http://www.aut.ac.ir/rahmati", "dblp": "204/0693;44/839;90/6740", "google_scholar": "MMTey0gAAAAJ;zFsdqo8AAAAJ;", "orcid": ";;", "linkedin": "ramin-barati-b1678289/;;", "or_profile": "~Ramin_Barati1;~Reza_Safabakhsh1;~Mohammad_Rahmati1", "aff": "Amirkabir University of Technology;Amirkabir University of Technology;", "aff_domain": "aut.ac.ir;aut.ac.ir;", "position": "PhD student;Full Professor;", "bibtex": "@misc{\nbarati2022modeling,\ntitle={Modeling and Eliminating Adversarial Examples using Function Theory of Several Complex Variables},\nauthor={Ramin Barati and Reza Safabakhsh and Mohammad Rahmati},\nyear={2022},\nurl={https://openreview.net/forum?id=Hfw5Q2Zn1w}\n}", "github": "", "project": "", "reviewers": "yA5N;uYxa;QiEP;bNXx", "site": "https://openreview.net/forum?id=Hfw5Q2Zn1w", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "2;4;2;2", "correctness": "2;2;3;2", "technical_novelty": "2;4;3;4", "empirical_novelty": "1;0;0;2", "wc_summary_paper": "54;95;24;18", "wc_summary_review": "19;30;34;54", "wc_main_review": "140;335;217;235", "wc_review": "213;460;275;307", "wc_reply_reviewers": "162;0;0;0", "wc_reply_authors": "680;38;209;509", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 2.5, 0.8660254037844386 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 47.75, 30.498975392625898 ], "wc_summary_review_avg": [ 34.25, 12.65652005884714 ], "wc_main_review_avg": [ 231.75, 69.47436577616236 ], "wc_review_avg": [ 313.75, 90.94881802420524 ], "wc_reply_reviewers_avg": [ 40.5, 70.14805770653953 ], "wc_reply_authors_avg": [ 359.0, 250.54041590130723 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sH56XYSVIR0J:scholar.google.com/&scioq=Modeling+and+Eliminating+Adversarial+Examples+using+Function+Theory+of+Several+Complex+Variables&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Amirkabir University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.aut.ac.ir", "aff_unique_abbr": "AUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Iran" }, { "id": "Hg7xLoENqHW", "title": "Robust Imitation via Mirror Descent Inverse Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial imitation learning techniques are based on modeling statistical divergences using agent and expert demonstration data. 
However, unbiased minimization of these divergences is not usually guaranteed due to the geometry of the underlying space. Furthermore, when the number of demonstrations is not sufficient, estimated reward functions from the discriminative signals become uncertain and fail to give informative feedback. Instead of formulating a global cost at once, we consider reward functions as an iterative sequence in a proximal method. In this paper, we show that rewards derived by mirror descent ensure minimization of a Bregman divergence in terms of a rigorous regret bound of $\\mathcal{O}(1/T)$ for a particular condition of step sizes $\\{\\eta_t\\}_{t=1}^T$. The resulting mirror descent adversarial inverse reinforcement learning (MD-AIRL) algorithm gradually advances a parameterized reward function in an associated reward space, and the sequence of such functions provides optimization targets for the policy space. We empirically validate our method in discrete and continuous benchmarks and show that MD-AIRL outperforms previous methods in various settings.", "keywords": "inverse reinforcement learning;reward learning;regularized markov decision processes;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Dong-Sig Han;Hyunseo Kim;Hyundo Lee;JeHwan Ryu;Byoung-Tak Zhang", "authorids": "~Dong-Sig_Han2;~Hyunseo_Kim1;~Hyundo_Lee1;~JeHwan_Ryu2;~Byoung-Tak_Zhang1", "gender": ";M;M;M;M", "homepage": "https://hskalena.github.io/;;http://bi.snu.ac.kr/~jhryu;https://bi.snu.ac.kr/~btzhang/;https://dshan4585.github.io", "dblp": "264/5421;242/8192;202/6803;09/5682;218/7109", "google_scholar": "5R0JMRwAAAAJ;https://scholar.google.com/citations?view_op=list_works;;sYTUOu8AAAAJ;h1hMIKcAAAAJ", "orcid": ";;;;", "linkedin": "hyunseo-kim-50a1b7160;;;;", "or_profile": "~Hyunseo_Kim1;~Hyundo_Lee1;~JeHwan_Ryu2;~Byoung-Tak_Zhang1;~Dong-Sig_Han_Han1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;PhD student;Full Professor;PhD student", "bibtex": "@misc{\nhan2022robust,\ntitle={Robust Imitation via Mirror Descent Inverse Reinforcement Learning},\nauthor={Dong-Sig Han and Hyunseo Kim and Hyundo Lee and JeHwan Ryu and Byoung-Tak Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=Hg7xLoENqHW}\n}", "github": "", "project": "", "reviewers": "c6kk;fhXG;4kT4", "site": "https://openreview.net/forum?id=Hg7xLoENqHW", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;3;3", "correctness": "3;3;4", "technical_novelty": "3;3;2", "empirical_novelty": "3;2;1", "wc_summary_paper": "63;45;55", "wc_summary_review": "65;99;24", "wc_main_review": "381;169;331", "wc_review": "509;313;410", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "648;233;866", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 54.333333333333336, 7.363574011458174 ], "wc_summary_review_avg": [ 62.666666666666664, 30.663043264200347 ], "wc_main_review_avg": [ 293.6666666666667, 90.48511234206187 ], "wc_review_avg": [ 410.6666666666667, 80.018053518496 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 
582.3333333333334, 262.55962285841963 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8704766597257533843&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "HiHWMiLP035", "title": "E$^2$CM: Early Exit via Class Means for Efficient Supervised and Unsupervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "State-of-the-art neural networks with early exit mechanisms often need a considerable amount of training and fine-tuning to achieve good performance with low computational cost. We propose a novel early exit technique, E$^2$CM, based on the class means of samples. Unlike most existing schemes, E$^2$CM does not require gradient-based training of internal classifiers. This makes it particularly useful for neural network training on low-power devices, as in wireless edge networks. In particular, given a fixed training time budget, E$^2$CM achieves higher accuracy as compared to existing early exit mechanisms. Moreover, if there are no limitations on the training time budget, E$^2$CM can be combined with an existing early exit scheme to boost the latter's performance, achieving a better trade-off between computational cost and network accuracy. We also show that E$^2$CM can be used to decrease the computational cost in unsupervised learning tasks.", "keywords": "class means;early exit;efficient neural networks", "primary_area": "", "supplementary_material": "/attachment/f401c4ea8bb090cd0fe00ccc637f3c08f4688d2e.zip", "author": "Alperen Gormez;Erdem Koyuncu", "authorids": "~Alperen_Gormez1;~Erdem_Koyuncu1", "gender": ";", "homepage": "https://alperengormez.github.io/;https://sites.google.com/uic.edu/erdem", "dblp": "286/7913;88/1847", "google_scholar": "https://scholar.google.com.tr/citations?user=ahpMJhYAAAAJ;VxwcmkwAAAAJ", "orcid": "0009-0006-8657-6853;0000-0002-6238-0470", "linkedin": "alperengormez/;", "or_profile": "~Alperen_Gormez1;~Erdem_Koyuncu1", "aff": "University of Illinois, Chicago;", "aff_domain": "uic.edu;", "position": "PhD student;", "bibtex": "@misc{\ngormez2022ecm,\ntitle={E\\${\\textasciicircum}2\\${CM}: Early Exit via Class Means for Efficient Supervised and Unsupervised Learning},\nauthor={Alperen Gormez and Erdem Koyuncu},\nyear={2022},\nurl={https://openreview.net/forum?id=HiHWMiLP035}\n}", "github": "", "project": "", "reviewers": "Kvv8;7dU6;ZDik;jksT", "site": "https://openreview.net/forum?id=HiHWMiLP035", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "127;73;80;69", "wc_summary_review": "40;83;5;110", "wc_main_review": "799;597;309;706", "wc_review": "966;753;394;885", "wc_reply_reviewers": "443;808;0;0", "wc_reply_authors": "498;767;180;455", "reply_reviewers": "1;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], 
"technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 87.25, 23.284920012746447 ], "wc_summary_review_avg": [ 59.5, 40.16528351698765 ], "wc_main_review_avg": [ 602.75, 184.0494159186603 ], "wc_review_avg": [ 749.5, 218.87496430610787 ], "wc_reply_reviewers_avg": [ 312.75, 338.32778115312965 ], "wc_reply_authors_avg": [ 475.0, 208.09733299588441 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9590433137259392381&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Illinois at Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.uic.edu", "aff_unique_abbr": "UIC", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "HmFBdvBkUUY", "title": "SpecTRA: Spectral Transformer for Graph Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transformers have recently been applied in the more generic domain of graphs. For the same, researchers proposed various positional and structural encoding schemes to overcome the limitation of transformers in modeling the positional invariance in graphs and graph topology. Some of these encoding techniques use the spectrum of the graph. In addition to graph topology, graph signals could be multi-channeled and contain heterogeneous information. We argue that transformers cannot model multichannel signals inherently spread over the graph spectrum. To this end, we propose SpecTRA, a novel approach that induces a spectral module into the transformer architecture to enable decomposition of graph spectrum and selectively learn useful information akin to filtering in the frequency domain. Results on standard benchmark datasets show that the proposed method performs comparably or better than existing transformer and GNN based architectures.", "keywords": "Graph Representation Learning;Transformer;GNNs", "primary_area": "", "supplementary_material": "", "author": "Anson Bastos;Abhishek Nadgeri;Kuldeep Singh;Hiroki Kanezashi;Toyotaro Suzumura;Isaiah Onando Mulang'", "authorids": "~Anson_Bastos1;~Abhishek_Nadgeri1;~Kuldeep_Singh1;~Hiroki_Kanezashi1;~Toyotaro_Suzumura1;~Isaiah_Onando_Mulang'1", "gender": ";M;Not Specified;M;M;M", "homepage": ";;;https://hkanezashi.github.io/;;https://www.mulangonando.com/", "dblp": "220/4367;249/6479;81/4530;;99/844;210/4225.html", "google_scholar": "is7rRuAAAAAJ;b0FAYMkAAAAJ;23EKFE4AAAAJ;MP6ZscYAAAAJ;tY3BWm0AAAAJ;_mKqVYEAAAAJ", "orcid": ";;;0000-0002-8329-6235;0000-0001-6412-8386;0000-0002-0554-0511", "linkedin": ";abhishek-nadgeri-a7546456/?originalSubdomain=de;;;;mulang-onando-ph-d-31a16ab1/", "or_profile": "~Anson_Bastos1;~Abhishek_Nadgeri1;~Kuldeep_Singh1;~Hiroki_Kanezashi1;~Toyotaro_Suzumura1;~Isaiah_Onando_Mulang'1", "aff": "Indian Institute of Technology Hyderabad;;Cerence GmbH;AIST, National Institute of Advanced Industrial Science and Technology;The University of Tokyo;Jomo Kenyatta University of Agriculture and Technology", "aff_domain": "iith.ac.in;;cerence.com;aist.go.jp;u-tokyo.ac.jp;jkuat.ac.ke", "position": "PhD student;;Sr. 
Product Manager;Postdoc;Professor;Instructor", "bibtex": "@misc{\nbastos2022spectra,\ntitle={Spec{TRA}: Spectral Transformer for Graph Representation Learning},\nauthor={Anson Bastos and Abhishek Nadgeri and Kuldeep Singh and Hiroki Kanezashi and Toyotaro Suzumura and Isaiah Onando Mulang'},\nyear={2022},\nurl={https://openreview.net/forum?id=HmFBdvBkUUY}\n}", "github": "", "project": "", "reviewers": "7T9R;VYTd;5mRd;iKKm", "site": "https://openreview.net/forum?id=HmFBdvBkUUY", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;4;2;4", "correctness": "2;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "54;67;50;48", "wc_summary_review": "12;62;62;33", "wc_main_review": "387;293;209;300", "wc_review": "453;422;321;381", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "850;995;821;779", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;2;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 54.75, 7.39509972887452 ], "wc_summary_review_avg": [ 42.25, 21.09946681790798 ], "wc_main_review_avg": [ 297.25, 62.98561343672061 ], "wc_review_avg": [ 394.25, 49.403314666123364 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 861.25, 81.24153802089175 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7ulJfDah5_MJ:scholar.google.com/&scioq=SpecTRA:+Spectral+Transformer+for+Graph+Representation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Indian Institute of Technology Hyderabad;Cerence;National Institute of Advanced Industrial Science and Technology;University of Tokyo;Jomo Kenyatta University of Agriculture and Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.iith.ac.in;https://www.cerence.com;https://www.aist.go.jp;https://www.u-tokyo.ac.jp;https://www.jkuat.ac.ke", "aff_unique_abbr": "IIT Hyderabad;Cerence;AIST;UTokyo;JKUAT", "aff_campus_unique_index": "0", "aff_campus_unique": "Hyderabad;", "aff_country_unique_index": "0;1;2;2;3", "aff_country_unique": "India;Germany;Japan;Kenya" }, { "title": "Learning to Downsample for Segmentation of Ultra-High Resolution Images", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6243", "id": "HndgQudNb91", "poster": "", "openreview": "https://openreview.net/forum?id=HndgQudNb91", "slides": "https://iclr.cc/virtual/2022/poster/6243", "video": "https://iclr.cc/virtual/2022/poster/6243", "author_site": "Chen Jin, Ryutaro Tanno, Thomy Mertzanidou, Eleftheria Panagiotaki, Daniel Alexander", "tldr": "", "abstract": "Many computer vision systems require low-cost segmentation algorithms based on deep learning, either because of the enormous size of input images or limited computational budget. Common solutions uniformly downsample the input images to meet memory constraints, assuming all pixels are equally informative. 
In this work, we demonstrate that this assumption can harm the segmentation performance\nbecause the segmentation difficulty varies spatially (see Figure 1 \u201cUniform\u201d). We combat this problem by introducing a learnable downsampling module, which can be optimised together with the given segmentation model in an end-to-end fashion. We formulate the problem of training such a downsampling module as optimisation of sampling density distributions over the input images given their low-resolution views. To defend against degenerate solutions (e.g. over-sampling trivial regions like the backgrounds), we propose a regularisation term that encourages the sampling locations to concentrate around the object boundaries. We find the downsampling\nmodule learns to sample more densely at difficult locations, thereby improving the segmentation performance (see Figure 1 \"Ours\"). Our experiments on benchmarks of high-resolution street view, aerial and medical images demonstrate substantial improvements in terms of the efficiency-and-accuracy trade-off compared to both uniform downsampling and two recent advanced downsampling techniques.", "keywords": "ultra-high resolution image segmentation;non-uniform downsampling;efficient segmentation;large volume image segmentation;medical image segmentation", "primary_area": "", "supplementary_material": "", "author": "Chen Jin;Ryutaro Tanno;Thomy Mertzanidou;Eleftheria Panagiotaki;Daniel C. Alexander", "authorids": "~Chen_Jin3;~Ryutaro_Tanno1;~Thomy_Mertzanidou1;~Eleftheria_Panagiotaki1;~Daniel_C._Alexander1", "gender": ";M;;;M", "homepage": "https://lxasqjc.github.io;https://rt416.github.io/;http://www0.cs.ucl.ac.uk/staff/T.Mertzanidou/;;http://www.cs.ucl.ac.uk/staff/d.alexander", "dblp": ";187/6071;;;37/6152", "google_scholar": "https://scholar.google.co.uk/citations?user=4on9TiAAAAAJ;https://scholar.google.co.uk/citations?user=NiEvNoEAAAAJ;https://scholar.google.co.uk/citations?user=xbXr0OoAAAAJ;;https://scholar.google.co.uk/citations?user=mH-ZOQEAAAAJ", "orcid": "0000-0002-2179-6445;;;;0000-0003-2439-350X", "linkedin": "chen-jin-33287593/;;;;daniel-alexander-2b096737", "or_profile": "~Chen_Jin3;~Ryutaro_Tanno1;~Thomy_Mertzanidou1;~Eleftheria_Panagiotaki1;~Daniel_C._Alexander1", "aff": "University College London;Microsoft Research Cambridge;University College London;;University College London", "aff_domain": "ucl.ac.uk;microsoft.com;ucl.ac.uk;;ucl.ac.uk", "position": "Researcher;Researcher;Postdoc;;Full Professor", "bibtex": "@inproceedings{\njin2022learning,\ntitle={Learning to Downsample for Segmentation of Ultra-High Resolution Images},\nauthor={Chen Jin and Ryutaro Tanno and Thomy Mertzanidou and Eleftheria Panagiotaki and Daniel C. 
Alexander},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HndgQudNb91}\n}", "github": "", "project": "", "reviewers": "eqeg;EMME;wmVs;wnXt", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;2;3;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "91;66;70;151", "wc_summary_review": "20;56;67;56", "wc_main_review": "99;153;414;111", "wc_review": "210;275;551;318", "wc_reply_reviewers": "0;0;45;0", "wc_reply_authors": "126;560;1368;572", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 94.5, 33.974254958718376 ], "wc_summary_review_avg": [ 49.75, 17.75352077758099 ], "wc_main_review_avg": [ 194.25, 128.44721678572876 ], "wc_review_avg": [ 338.5, 128.57001983355218 ], "wc_reply_reviewers_avg": [ 11.25, 19.48557158514987 ], "wc_reply_authors_avg": [ 656.5, 448.36229770131206 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11044772985924964414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=HndgQudNb91", "email": "ucl.ac.uk;microsoft.com;ucl.ac.uk;;ucl.ac.uk", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University College London;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.ucl.ac.uk;https://www.microsoft.com/en-us/research/group/microsoft-research-cambridge", "aff_unique_abbr": "UCL;MSR Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "HpLOYOBbnt", "title": "Generalized Maximum Entropy Reinforcement Learning via Reward Shaping", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Entropy regularization is a commonly used technique in reinforcement learning to improve exploration and cultivate a better pre-trained policy for later adaptation. Recent studies further show that the use of entropy regularization can smooth the optimization landscape and simplify the policy optimization process, which indicates the value of integrating entropy into reinforcement learning. However, existing studies only consider the policy\u2019s entropy at the current state as an extra regularization term in the policy gradient or in the objective function, while the topic of integrating the entropy into the reward function has not been investigated. In this paper, we propose a shaped reward that incorporates the agent\u2019s policy entropy into the reward function. In particular, the agent\u2019s entropy at the next state is added to the immediate reward associated with the current state. The addition of the agent\u2019s policy entropy at the next state, instead of the policy entropy at the current state as used in the existing maximum entropy reinforcement learning framework, considers both state and action uncertainties. 
This distinguishes our work from the existing maximum entropy reinforcement learning framework by providing better action exploration and better control policies. We also show that the addition of the agent\u2019s policy entropy at the next state yields a new soft Q function and state value function that are concise and modular. Hence, the new reinforcement learning framework can be easily applied to existing standard reinforcement learning algorithms while inheriting the benefits of employing entropy regularization. We further present a soft stochastic policy gradient theorem based on the shaped reward and propose a new practical reinforcement learning algorithm. Finally, a few experimental studies are conducted in the MuJoCo environment to demonstrate that our method can outperform existing state-of-the-art reinforcement learning approaches.", "keywords": "Reinforcement Learning;Reward Shaping;Soft Policy Gradient", "primary_area": "", "supplementary_material": "/attachment/446d37a0e403d46d1fd61e822494fe39331023c6.zip", "author": "Feng Tao;Yongcan Cao", "authorids": "~Feng_Tao2;~Yongcan_Cao1", "gender": ";M", "homepage": "https://utsausl.wixsite.com/utsausl;", "dblp": ";", "google_scholar": ";BLZoldYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yongcan_Cao1;~Feng_Tao3", "aff": ";Volvo Cars", "aff_domain": ";volvocars.com", "position": ";Researcher", "bibtex": "@misc{\ntao2022generalized,\ntitle={Generalized Maximum Entropy Reinforcement Learning via Reward Shaping},\nauthor={Feng Tao and Yongcan Cao},\nyear={2022},\nurl={https://openreview.net/forum?id=HpLOYOBbnt}\n}", "github": "", "project": "", "reviewers": "39DD;3vUj;e716;hUnE", "site": "https://openreview.net/forum?id=HpLOYOBbnt", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;2", "correctness": "1;3;4;3", "technical_novelty": "1;4;2;3", "empirical_novelty": "0;4;3;0", "wc_summary_paper": "93;100;137;42", "wc_summary_review": "27;31;57;35", "wc_main_review": "399;175;185;164", "wc_review": "519;306;379;241", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 1.75, 1.7853571071357126 ], "wc_summary_paper_avg": [ 93.0, 33.860005906674026 ], "wc_summary_review_avg": [ 37.5, 11.6081867662439 ], "wc_main_review_avg": [ 230.75, 97.42272578818559 ], "wc_review_avg": [ 361.25, 103.33531584119729 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10494141279322062837&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Volvo Cars", "aff_unique_dep": "", "aff_unique_url": "https://www.volvocars.com", "aff_unique_abbr": "Volvo", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "title": "Efficient and Differentiable Conformal Prediction with General Function Classes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6067", "id": "Ht85_jyihxp", "poster": "", "openreview": 
"https://openreview.net/forum?id=Ht85_jyihxp", "slides": "https://iclr.cc/virtual/2022/poster/6067", "video": "https://iclr.cc/virtual/2022/poster/6067", "author_site": "Yu Bai, Song Mei, Huan Wang, Yingbo Zhou, Caiming Xiong", "tldr": "", "abstract": " Quantifying the data uncertainty in learning tasks is often done by learning a prediction interval or prediction set of the label given the input. Two commonly desired properties for learned prediction sets are \\emph{valid coverage} and \\emph{good efficiency} (such as low length or low cardinality). Conformal prediction is a powerful technique for learning prediction sets with valid coverage, yet by default its conformalization step only learns a single parameter, and does not optimize the efficiency over more expressive function classes.\n In this paper, we propose a generalization of conformal prediction to multiple learnable parameters, by considering the constrained empirical risk minimization (ERM) problem of finding the most efficient prediction set subject to valid empirical coverage. This meta-algorithm generalizes existing conformal prediction algorithms, and we show that it achieves approximate valid population coverage and near-optimal efficiency within class, whenever the function class in the conformalization step is low-capacity in a certain sense. Next, this ERM problem is challenging to optimize as it involves a non-differentiable coverage constraint. We develop a gradient-based algorithm for it by approximating the original constrained ERM using differentiable surrogate losses and Lagrangians. Experiments show that our algorithm is able to learn valid prediction sets and improve the efficiency significantly over existing approaches in several applications such as prediction intervals with improved length, minimum-volume prediction sets for multi-output regression, and label prediction sets for image classification.", "keywords": "uncertainty quantification;conformal prediction;prediction sets", "primary_area": "", "supplementary_material": "/attachment/b1d5ff1b5876e5ac511edd9ef98d2abf2c9bd41e.zip", "author": "Yu Bai;Song Mei;Huan Wang;Yingbo Zhou;Caiming Xiong", "authorids": "~Yu_Bai1;~Song_Mei1;~Huan_Wang1;~Yingbo_Zhou1;~Caiming_Xiong1", "gender": ";M;M;;M", "homepage": "https://yubai.org;https://www.stat.berkeley.edu/~songmei/;http://www.cs.yale.edu/homes/wang-huan/;;http://cmxiong.com/", "dblp": "03/6325-17.html;https://dblp.org/pers/hd/m/Mei:Song;70/6155-16.html;72/8614;80/7282", "google_scholar": "owqhKD8AAAAJ;https://scholar.google.com.hk/citations?hl=en;7NpTttkAAAAJ;H_6RQ7oAAAAJ;vaSdahkAAAAJ", "orcid": ";;;;", "linkedin": ";;huanwangyale/;yingbozhou/;caiming-xiong-150a1417", "or_profile": "~Yu_Bai1;~Song_Mei1;~Huan_Wang1;~Yingbo_Zhou1;~Caiming_Xiong1", "aff": "Salesforce Research;University of California, Berkeley;Salesforce.com;Salesforce Research;Salesforce Research", "aff_domain": "salesforce.com;berkeley.edu;salesforce.com;salesforce.com;salesforce.com", "position": "Research Scientist;Assistant Professor;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nbai2022efficient,\ntitle={Efficient and Differentiable Conformal Prediction with General Function Classes},\nauthor={Yu Bai and Song Mei and Huan Wang and Yingbo Zhou and Caiming Xiong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ht85_jyihxp}\n}", "github": "", "project": "", "reviewers": "bDLk;G6jH;Pu7a;3LHz", "pdf_size": 0, "recommendation": "6;6;6;8", 
"confidence": "4;4;4;4", "correctness": "4;4;4;3", "technical_novelty": "3;2;2;4", "empirical_novelty": "3;2;0;3", "wc_summary_paper": "133;64;204;100", "wc_summary_review": "62;137;31;10", "wc_main_review": "653;551;932;333", "wc_review": "848;752;1167;443", "wc_reply_reviewers": "65;0;371;0", "wc_reply_authors": "598;1135;857;525", "reply_reviewers": "1;0;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 125.25, 51.601235450326186 ], "wc_summary_review_avg": [ 60.0, 48.15080477001397 ], "wc_main_review_avg": [ 617.25, 215.3675637137589 ], "wc_review_avg": [ 802.5, 258.22519241932997 ], "wc_reply_reviewers_avg": [ 109.0, 153.57571422591528 ], "wc_reply_authors_avg": [ 778.75, 239.84200528681376 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=54755366591296300&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Ht85_jyihxp", "email": "salesforce.com;berkeley.edu;salesforce.com;salesforce.com;salesforce.com", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Salesforce;University of California, Berkeley", "aff_unique_dep": "Salesforce Research;", "aff_unique_url": "https://research.salesforce.com;https://www.berkeley.edu", "aff_unique_abbr": "Salesforce;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "FedBABU: Toward Enhanced Representation for Federated Image Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6069", "id": "HuaYQfggn5u", "poster": "", "openreview": "https://openreview.net/forum?id=HuaYQfggn5u", "slides": "https://iclr.cc/virtual/2022/poster/6069", "video": "https://iclr.cc/virtual/2022/poster/6069", "author_site": "Jaehoon Oh, SangMook Kim, Se-Young Yun", "tldr": "", "abstract": "Federated learning has evolved to improve a single global model under data heterogeneity (as a curse) or to develop multiple personalized models using data heterogeneity (as a blessing). However, little research has considered both directions simultaneously. In this paper, we first investigate the relationship between them by analyzing Federated Averaging at the client level and determine that a better federated global model performance does not constantly improve personalization. To elucidate the cause of this personalization performance degradation problem, we decompose the entire network into the body (extractor), which is related to universality, and the head (classifier), which is related to personalization. We then point out that this problem stems from training the head. Based on this observation, we propose a novel federated learning algorithm, coined FedBABU, which only updates the body of the model during federated training (i.e., the head is randomly initialized and never updated), and the head is fine-tuned for personalization during the evaluation process. Extensive experiments show consistent performance improvements and an efficient personalization of FedBABU. 
The code is available at https://github.com/jhoon-oh/FedBABU.", "keywords": "Federated Learning;Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Jaehoon Oh;SangMook Kim;Se-Young Yun", "authorids": "~Jaehoon_Oh1;~SangMook_Kim1;~Se-Young_Yun1", "gender": "M;M;M", "homepage": ";;https://fbsqkd.github.io", "dblp": "180/9951;;23/8862", "google_scholar": "_9XVeDF8AAAAJ;YjpFRuIAAAAJ;X_IAjb8AAAAJ", "orcid": ";;", "linkedin": "jaehoon-oh-841584184/;;seyoung-yun-395130ab/", "or_profile": "~Jaehoon_Oh1;~SangMook_Kim1;~Se-Young_Yun1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\noh2022fedbabu,\ntitle={Fed{BABU}: Toward Enhanced Representation for Federated Image Classification},\nauthor={Jaehoon Oh and SangMook Kim and Se-Young Yun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=HuaYQfggn5u}\n}", "github": "", "project": "", "reviewers": "UZKM;yAVA;TnVE;4NJT", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "79;75;91;129", "wc_summary_review": "1088;59;110;62", "wc_main_review": "107;423;201;143", "wc_review": "1274;557;402;334", "wc_reply_reviewers": "0;96;0;0", "wc_reply_authors": "1842;594;707;117", "reply_reviewers": "0;1;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 93.5, 21.324868112136123 ], "wc_summary_review_avg": [ 329.75, 438.2432971535332 ], "wc_main_review_avg": [ 218.5, 122.73854325353548 ], "wc_review_avg": [ 641.75, 373.8692117572668 ], "wc_reply_reviewers_avg": [ 24.0, 41.569219381653056 ], "wc_reply_authors_avg": [ 815.0, 632.9372006763389 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 306, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16885136047959281855&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=HuaYQfggn5u", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "I-nQMZfQz7F", "title": "Learning Neural Implicit Functions as Object Representations for Robotic Manipulation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Robotic manipulation planning is the problem of finding a sequence of robot configurations that involves interactions with objects in the scene, e.g., grasp, placement, tool-use, etc. 
To achieve such interactions, traditional approaches require hand-designed features and object representations, and it remains an open question how to describe such interactions with arbitrary objects in a flexible and efficient way. Inspired by neural implicit representations in 3D modeling, e.g. NeRF, we propose a method to represent objects as neural implicit functions upon which we can define and jointly train interaction features. The proposed pixel-aligned representation is directly inferred from camera images with known camera geometry, naturally acting as a perception component in the whole manipulation pipeline, while at the same time enabling sequential robot manipulation planning.", "keywords": "Representation Learning;neural implicit representation;robotic manipulation;task and motion planning", "primary_area": "", "supplementary_material": "", "author": "Jung-Su Ha;Danny Driess;Marc Toussaint", "authorids": "~Jung-Su_Ha1;~Danny_Driess1;~Marc_Toussaint3", "gender": "M;;M", "homepage": "https://sites.google.com/view/jung-su-ha;https://dannydriess.github.io/;https://www.user.tu-berlin.de/mtoussai/", "dblp": ";;t/MarcToussaint", "google_scholar": "cabvCW8AAAAJ;https://scholar.google.de/citations?user=wxnzyjwAAAAJ;t2X4Mg8AAAAJ", "orcid": ";;0000-0002-5487-6767", "linkedin": ";;marctoussaint/", "or_profile": "~Jung-Su_Ha1;~Danny_Driess1;~Marc_Toussaint3", "aff": ";Technische Universit\u00e4t Berlin;TU Berlin", "aff_domain": ";tu-berlin.de;tu-berlin.de", "position": ";PhD student;Full Professor", "bibtex": "@misc{\nha2022learning,\ntitle={Learning Neural Implicit Functions as Object Representations for Robotic Manipulation},\nauthor={Jung-Su Ha and Danny Driess and Marc Toussaint},\nyear={2022},\nurl={https://openreview.net/forum?id=I-nQMZfQz7F}\n}", "github": "", "project": "", "reviewers": "gsUt;seqp;23pA;zYJc", "site": "https://openreview.net/forum?id=I-nQMZfQz7F", "pdf_size": 0, "recommendation": "1;5;6;6", "confidence": "4;3;4;3", "correctness": "2;3;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "106;56;68;49", "wc_summary_review": "42;59;65;18", "wc_main_review": "282;427;575;239", "wc_review": "430;542;708;306", "wc_reply_reviewers": "0;74;426;61", "wc_reply_authors": "1033;1624;2023;885", "reply_reviewers": "0;1;2;1", "reply_authors": "2;3;4;2", "recommendation_avg": [ 4.5, 2.0615528128088303 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.75, 22.00426095100674 ], "wc_summary_review_avg": [ 46.0, 18.23458252881047 ], "wc_main_review_avg": [ 380.75, 132.01964815890096 ], "wc_review_avg": [ 496.5, 147.91467134804444 ], "wc_reply_reviewers_avg": [ 140.25, 167.32658933953084 ], "wc_reply_authors_avg": [ 1391.25, 457.6878712616274 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.48507125007266594, "corr_recommendation_correctness": 0.8574929257125441, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2984980030527443459&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Technische Universit\u00e4t Berlin", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-berlin.de", "aff_unique_abbr": "TU Berlin", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Berlin", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "I13PP8-cdvz", "title": "SSR-GNNs: Stroke-based Sketch Representation with Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Existing end-to-end visual recognition models do not possess innate spatial invariance and are thus vulnerable to out-of-training attacks. This suggests the need for a better representation design. This paper follows existing cognitive studies to investigate a sketch representation that specifies stroke information on vertices and inter-stroke information on edges. The resultant representation, combined with a graph neural network, achieves both high classification accuracy and high robustness against translation, rotation, and stroke-wise parametric and topological attacks thanks to the use of spatially invariant stroke features and the GNN architecture. While prior studies demonstrated similar sketch representations for classification and generation, these attempts heavily relied on run-time statistical inference rather than more efficient bottom-up computation via GNN. The presented sketch representation possesses good structured expression capability, as it enables the generation of sketches semantically different from the training dataset. Lastly, we show SSR-GNNs are able to accomplish all tasks (classification, robust feature learning, and novel pattern generation), which shows that the representation is task-agnostic. ", "keywords": "Stroke-based representation;Spatial robustness;Robust feature learning;Novel pattern generation", "primary_area": "", "supplementary_material": "", "author": "Sheng Cheng;Yi Ren;Yezhou Yang", "authorids": "~Sheng_Cheng1;~Yi_Ren3;~Yezhou_Yang1", "gender": ";M;M", "homepage": "https://shengcheng.github.io/;http://designinformaticslab.github.io/;https://yezhouyang.engineering.asu.edu", "dblp": ";;78/7455", "google_scholar": "TWAwdYsAAAAJ;https://scholar.google.com/citations?hl=en;k2suuZgAAAAJ", "orcid": "0000-0001-7244-5998;;", "linkedin": "sheng-cheng-661826118/;;", "or_profile": "~Sheng_Cheng1;~Yi_Ren3;~Yezhou_Yang1", "aff": "Arizona State University;Arizona State University;Arizona State University", "aff_domain": "asu.edu;asu.edu;asu.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\ncheng2022ssrgnns,\ntitle={{SSR}-{GNN}s: Stroke-based Sketch Representation with Graph Neural Networks},\nauthor={Sheng Cheng and Yi Ren and Yezhou Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=I13PP8-cdvz}\n}", "github": "", "project": "", "reviewers": "GGYd;pmSP;dfQ2;GAdT", "site": "https://openreview.net/forum?id=I13PP8-cdvz", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;2;5;5", "correctness": "2;4;2;2", "technical_novelty": "2;2;4;2", "empirical_novelty": "2;0;3;2", "wc_summary_paper": "38;283;117;96", "wc_summary_review": "38;216;80;18", "wc_main_review": "527;153;847;284", "wc_review": "603;652;1044;398", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.25, 1.299038105676658 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 133.5, 91.0343341822194 ], "wc_summary_review_avg": [ 88.0, 77.21398836998384 ], "wc_main_review_avg": [ 452.75, 264.2313143819256 ], 
"wc_review_avg": [ 674.25, 233.77379558025746 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4131904476824407075&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "I1dg7let3Q", "title": "Semi-supervised learning objectives as log-likelihoods in a generative model of data curation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We currently do not have an understanding of semi-supervised learning (SSL) objectives such as pseudo-labelling and entropy minimization as log-likelihoods, which precludes the development of e.g. Bayesian SSL. Here, we note that benchmark image datasets such as CIFAR-10 are carefully curated, and we formulate SSL objectives as a log-likelihood in a generative model of data curation that was initially developed to explain the cold-posterior effect (Aitchison 2020). SSL methods, from entropy minimization and pseudo-labelling, to state-of-the-art techniques similar to FixMatch can be understood as lower-bounds on our principled log-likelihood. We are thus able to give a proof-of-principle for Bayesian SSL on toy data. Finally, our theory suggests that SSL is effective in part due to the statistical patterns induced by data curation. This provides an explanation of past results which show SSL performs better on clean datasets without any ``out of distribution'' examples. 
Confirming these results, we find that SSL gave much larger performance improvements on curated than on uncurated data, using matched curated and uncurated datasets based on Galaxy Zoo 2.", "keywords": "Bayesian neural network;semi-supervised learning;Bayesian inference", "primary_area": "", "supplementary_material": "", "author": "Stoil Krasimirov Ganev;Laurence Aitchison", "authorids": "~Stoil_Krasimirov_Ganev1;~Laurence_Aitchison1", "gender": "M;", "homepage": "http://www.bristol.ac.uk/cdt/interactive-ai/current-students/2019-cohort/ganev/;http://www.gatsby.ucl.ac.uk/~laurence/", "dblp": ";155/1918.html", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Stoil_Krasimirov_Ganev1;~Laurence_Aitchison1", "aff": "University of Bristol;University of Bristol", "aff_domain": "bristol.ac.uk;bristol.ac.uk", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nganev2022semisupervised,\ntitle={Semi-supervised learning objectives as log-likelihoods in a generative model of data curation},\nauthor={Stoil Krasimirov Ganev and Laurence Aitchison},\nyear={2022},\nurl={https://openreview.net/forum?id=I1dg7let3Q}\n}", "github": "", "project": "", "reviewers": "vb5u;FjiX;B51g;1qyP", "site": "https://openreview.net/forum?id=I1dg7let3Q", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "4;4;2;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "1;2;3;4", "wc_summary_paper": "110;86;94;347", "wc_summary_review": "18;112;31;140", "wc_main_review": "177;499;46;794", "wc_review": "305;697;171;1281", "wc_reply_reviewers": "222;549;0;152", "wc_reply_authors": "1233;1206;16;668", "reply_reviewers": "2;2;0;1", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 159.25, 108.74137896863364 ], "wc_summary_review_avg": [ 75.25, 51.91037950159871 ], "wc_main_review_avg": [ 379.0, 290.82554908398265 ], "wc_review_avg": [ 613.5, 431.1342598309719 ], "wc_reply_reviewers_avg": [ 230.75, 200.50358475598387 ], "wc_reply_authors_avg": [ 780.75, 495.712303155772 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.2721655269759087, "corr_recommendation_correctness": 0.9428090415820635, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17835368608917619204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Bristol", "aff_unique_dep": "", "aff_unique_url": "https://www.bristol.ac.uk", "aff_unique_abbr": "Bristol", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "On Bridging Generic and Personalized Federated Learning for Image Classification", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6129", "id": "I1hQbx10Kxn", "poster": "", "openreview": "https://openreview.net/forum?id=I1hQbx10Kxn", "slides": "https://iclr.cc/virtual/2022/poster/6129", "video": "https://iclr.cc/virtual/2022/poster/6129", "author_site": "Hong-You Chen, Wei-Lun Chao", "tldr": "", "abstract": "Federated learning is promising for its capability to collaboratively train models with multiple 
clients without accessing their data, but is vulnerable when clients' data distributions diverge from each other. This divergence further leads to a dilemma: "Should we prioritize the learned model's generic performance (for future use at the server) or its personalized performance (for each client)?" These two seemingly competing goals have divided the community into focusing on one or the other, yet in this paper we show that it is possible to approach both at the same time. Concretely, we propose a novel federated learning framework that explicitly decouples a model's dual duties with two prediction tasks. On the one hand, we introduce a family of losses that are robust to non-identical class distributions, enabling clients to train a generic predictor with a consistent objective across them. On the other hand, we formulate the personalized predictor as a lightweight adaptive module that is learned to minimize each client's empirical risk on top of the generic predictor. With this two-loss, two-predictor framework, which we name Federated Robust Decoupling (Fed-RoD), the learned model can simultaneously achieve state-of-the-art generic and personalized performance, essentially bridging the two tasks. ", "keywords": "federated learning;personalization;image classification", "primary_area": "", "supplementary_material": "", "author": "Hong-You Chen;Wei-Lun Chao", "authorids": "~Hong-You_Chen1;~Wei-Lun_Chao1", "gender": ";M", "homepage": "https://sites.google.com/view/hongyouc/%E9%A6%96%E9%A0%81;https://sites.google.com/view/wei-lun-harry-chao", "dblp": "228/5569;64/8842", "google_scholar": "uxlU7J8AAAAJ;PGKakWwAAAAJ", "orcid": ";0000-0003-1269-7231", "linkedin": ";", "or_profile": "~Hong-You_Chen1;~Wei-Lun_Chao1", "aff": ";Ohio State University", "aff_domain": ";osu.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nchen2022on,\ntitle={On Bridging Generic and Personalized Federated Learning for Image Classification},\nauthor={Hong-You Chen and Wei-Lun Chao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=I1hQbx10Kxn}\n}", "github": "", "project": "", "reviewers": "KbQt;RbND;ZY3s", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "2;4;3", "correctness": "3;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "70;60;112", "wc_summary_review": "41;26;50", "wc_main_review": "134;530;146", "wc_review": "245;616;308", "wc_reply_reviewers": "0;0;31", "wc_reply_authors": "147;1048;497", "reply_reviewers": "0;0;1", "reply_authors": "1;2;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 80.66666666666667, 22.5289936649544 ], "wc_summary_review_avg": [ 39.0, 9.899494936611665 ], "wc_main_review_avg": [ 270.0, 183.9130229211624 ], "wc_review_avg": [ 389.6666666666667, 162.09530804094513 ], "wc_reply_reviewers_avg": [ 10.333333333333334, 14.613540144521982 ], "wc_reply_authors_avg": [ 564.0, 370.8701479853382 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.8660254037844387, "corr_recommendation_correctness": 0.0, "gs_citation": 344, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=3469194395993782827&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=I1hQbx10Kxn", "email": ";osu.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Ohio State University", "aff_unique_dep": "", "aff_unique_url": "https://www.osu.edu", "aff_unique_abbr": "OSU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Improving Non-Autoregressive Translation Models Without Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7197", "id": "I2Hw58KHp8O", "poster": "", "openreview": "https://openreview.net/forum?id=I2Hw58KHp8O", "slides": "https://iclr.cc/virtual/2022/poster/7197", "video": "https://iclr.cc/virtual/2022/poster/7197", "author_site": "Xiao Shi (Gary) Huang, Felipe Perez, Maksims Volkovs", "tldr": "", "abstract": "Transformer-based autoregressive (AR) machine translation models have achieved significant performance improvements, nearing human-level accuracy on some languages. The AR framework translates one token at a time which can be time consuming, especially for long sequences. To accelerate inference, recent work has been exploring non-autoregressive (NAR) approaches that translate blocks of tokens in parallel. Despite significant progress, leading NAR models still lag behind their AR counterparts, and only become competitive when trained with distillation. In this paper we investigate possible reasons behind this performance gap, namely, the indistinguishability of tokens, and mismatch between training and inference. We then propose the Conditional Masked Language Model with Correction (CMLMC) that addresses these problems. Empirically, we show that CMLMC achieves state-of-the-art NAR performance when trained on raw data without distillation and approaches AR performance on multiple datasets. 
Full code for this work will be released at the time of publication.", "keywords": "Natural Language Processing;Deep Learning;Non-autoregressive Machine Translation;Transformer;Distillation", "primary_area": "", "supplementary_material": "", "author": "Xiao Shi Huang;Felipe Perez;Maksims Volkovs", "authorids": "~Xiao_Shi_Huang1;~Felipe_Perez1;~Maksims_Volkovs3", "gender": "M;;M", "homepage": ";;https://www.cs.toronto.edu/~mvolkovs", "dblp": "280/1580;;22/1815", "google_scholar": "dPnxxNEAAAAJ;mWHOTrNIYTUC;https://scholar.google.ca/citations?user=m9I8jgcAAAAJ", "orcid": ";;", "linkedin": "xiaoshihuang/;;", "or_profile": "~Xiao_Shi_Huang1;~Felipe_Perez1;~Maksims_Volkovs1", "aff": "Layer 6 AI;Layer6;Layer6 AI", "aff_domain": "layer6.ai;layer6.ai;layer6.ai", "position": "Senior Machine Learning Scientist;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nhuang2022improving,\ntitle={Improving Non-Autoregressive Translation Models Without Distillation},\nauthor={Xiao Shi Huang and Felipe Perez and Maksims Volkovs},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=I2Hw58KHp8O}\n}", "github": "", "project": "", "reviewers": "6uRn;BqHZ;dfXP;ssed", "pdf_size": 0, "recommendation": "3;8;8;8", "confidence": "4;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;4;3;2", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "124;136;91;63", "wc_summary_review": "35;67;60;27", "wc_main_review": "313;162;306;251", "wc_review": "472;365;457;341", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "901;171;489;293", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.75, 2.165063509461097 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 103.5, 28.605069480775605 ], "wc_summary_review_avg": [ 47.25, 16.67895380412093 ], "wc_main_review_avg": [ 258.0, 60.40281450396165 ], "wc_review_avg": [ 408.75, 56.64086422363275 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 463.5, 276.89483563259176 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=753824306564945246&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=I2Hw58KHp8O", "email": "layer6.ai;layer6.ai;layer6.ai", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Layer 6 AI;Layer6 AI", "aff_unique_dep": ";", "aff_unique_url": "https://layer6.ai;https://layer6.ai", "aff_unique_abbr": "Layer 6 AI;Layer6", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "I2KAe7x67JU", "title": "Benchmarking Graph Neural Networks on Dynamic Link Prediction", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph neural networks (GNNs) are rapidly becoming the dominant way to learn on graph-structured data. Link prediction is a near-universal benchmark for new GNN models. Many advanced models such as dynamic graph neural networks (DGNNs) specifically target dynamic link prediction. 
However, these models, particularly DGNNs, are rarely compared to each other or to existing heuristics. Different works evaluate their models in different ways; thus, one cannot compare evaluation metrics directly. Motivated by this, we perform a comprehensive comparison study. We compare link prediction heuristics, GNNs, discrete DGNNs, and continuous DGNNs on dynamic link prediction. We find that simple link prediction heuristics often perform better than GNNs and DGNNs, that different sliding window sizes greatly affect performance, and that, of all examined graph neural networks, DGNNs consistently outperform static GNNs.", "keywords": "Graph neural network;dynamic graph neural network;link prediction;dynamic link prediction;temporal graph", "primary_area": "", "supplementary_material": "", "author": "Joakim Skarding;Matthew Hellmich;Bogdan Gabrys;Katarzyna Musial-Gabrys", "authorids": "~Joakim_Skarding1;matthew.hellmich@student.uts.edu.au;~Bogdan_Gabrys1;~Katarzyna_Musial-Gabrys1", "gender": ";;;F", "homepage": ";;;http://katarzyna-musial.com", "dblp": ";;;", "google_scholar": "https://scholar.google.com.au/citations?user=ge4q1N0AAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Joakim_Skarding1;matthew.hellmich@student.uts.edu.au;~Bogdan_Gabrys1;~Katarzyna_Musial-Gabrys1", "aff": "University of Technology Sydney;;;University of Technology Sydney", "aff_domain": "uts.edu.au;;;uts.edu.au", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\nskarding2022benchmarking,\ntitle={Benchmarking Graph Neural Networks on Dynamic Link Prediction},\nauthor={Joakim Skarding and Matthew Hellmich and Bogdan Gabrys and Katarzyna Musial-Gabrys},\nyear={2022},\nurl={https://openreview.net/forum?id=I2KAe7x67JU}\n}", "github": "", "project": "", "reviewers": "e8bH;zw2c;JVhe;q6Fs", "site": "https://openreview.net/forum?id=I2KAe7x67JU", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;5;4;4", "correctness": "4;4;3;3", "technical_novelty": "1;1;1;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "51;37;41;79", "wc_summary_review": "15;6;34;53", "wc_main_review": "155;81;156;477", "wc_review": "221;124;231;609", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 52.0, 16.401219466856727 ], "wc_summary_review_avg": [ 27.0, 18.096961070853858 ], "wc_main_review_avg": [ 217.25, 153.02021925222823 ], "wc_review_avg": [ 296.25, 185.3393846434157 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.7071067811865476, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BTyTJRv-Y-wJ:scholar.google.com/&scioq=Benchmarking+Graph+Neural+Networks+on+Dynamic+Link+Prediction&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Technology Sydney", "aff_unique_dep": "", "aff_unique_url": "https://www.uts.edu.au", "aff_unique_abbr": "UTS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "I7Tuih6s7Dj", 
"title": "Towards Axiomatic, Hierarchical, and Symbolic Explanation for Deep Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper proposes a hierarchical and symbolic And-Or graph (AOG) to objectively explain the internal logic encoded by a well-trained deep model for inference. We first define the objectiveness of an explainer model in game theory, and we develop a rigorous representation of the And-Or logic encoded by the deep model. The objectiveness and trustworthiness of the AOG explainer are both theoretically guaranteed and experimentally verified. Furthermore, we propose several techniques to boost the conciseness of the explanation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jie Ren;Mingjie Li;Qirui Chen;Huiqi Deng;Quanshi Zhang", "authorids": "~Jie_Ren1;~Mingjie_Li3;~Qirui_Chen1;~Huiqi_Deng1;~Quanshi_Zhang1", "gender": "F;M;;F;M", "homepage": "https://jie-ren.github.io/;http://lmjjjjjj.github.io;;;http://qszhang.com", "dblp": "r/JieRen-18;48/10103;;229/1317;http://dblp.uni-trier.de/pers/hd/z/Zhang:Quanshi", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;7dXDygoAAAAJ;;QEjqzXgAAAAJ;iFFhHK0AAAAJ", "orcid": "0000-0001-9918-3000;;;;", "linkedin": ";;;;", "or_profile": "~Jie_Ren1;~Mingjie_Li3;~Qirui_Chen1;~Huiqi_Deng1;~Quanshi_Zhang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;;Shanghai jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;;edu.cn;sjtu.edu.cn", "position": "PhD student;MS student;;Postdoc;Associate Professor", "bibtex": "@misc{\nren2022towards,\ntitle={Towards Axiomatic, Hierarchical, and Symbolic Explanation for Deep Models},\nauthor={Jie Ren and Mingjie Li and Qirui Chen and Huiqi Deng and Quanshi Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=I7Tuih6s7Dj}\n}", "github": "", "project": "", "reviewers": "Ayt8;GkxR;AMFg;tR84", "site": "https://openreview.net/forum?id=I7Tuih6s7Dj", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;5;3;4", "correctness": "2;2;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "117;18;78;49", "wc_summary_review": "77;49;80;36", "wc_main_review": "822;470;223;252", "wc_review": "1016;537;381;337", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 65.5, 36.52738698565776 ], "wc_summary_review_avg": [ 60.5, 18.607794065928395 ], "wc_main_review_avg": [ 441.75, 239.39755115706592 ], "wc_review_avg": [ 567.75, 269.25580309438084 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.986440050415621, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8708732615927782402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Transformer-based Transform Coding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7095", "id": "IDwN6xjHnK8", "poster": "", "openreview": "https://openreview.net/forum?id=IDwN6xjHnK8", "slides": "https://iclr.cc/virtual/2022/poster/7095", "video": "https://iclr.cc/virtual/2022/poster/7095", "author_site": "Yinhao Zhu, Yang Yang, Taco Cohen", "tldr": "", "abstract": "Neural data compression based on nonlinear transform coding has made great progress over the last few years, mainly due to improvements in prior models, quantization methods and nonlinear transforms. A general trend in many recent works pushing the limit of rate-distortion performance is to use ever more expensive prior models that can lead to prohibitively slow decoding. Instead, we focus on more expressive transforms that result in a better rate-distortion-computation trade-off. Specifically, we show that nonlinear transforms built on Swin-transformers can achieve better compression efficiency than transforms built on convolutional neural networks (ConvNets), while requiring fewer parameters and shorter decoding time. Paired with a compute-efficient Channel-wise Auto-Regressive Model prior, our SwinT-ChARM model outperforms VTM-12.1 by $3.68\\%$ in BD-rate on Kodak with comparable decoding speed. In P-frame video compression setting, we are able to outperform the popular ConvNet-based scale-space-flow model by $12.35\\%$ in BD-rate on UVG. We provide model scaling studies to verify the computational efficiency of the proposed solutions and conduct several analyses to reveal the source of coding gain of transformers over ConvNets, including better spatial decorrelation, flexible effective receptive field, and more localized response of latent pixels during progressive decoding.\n", "keywords": "transformer;transform coding;image compression;video compression", "primary_area": "", "supplementary_material": "", "author": "Yinhao Zhu;Yang Yang;Taco Cohen", "authorids": "~Yinhao_Zhu1;~Yang_Yang15;~Taco_Cohen1", "gender": "M;M;M", "homepage": "https://yinhaoz.github.io/;;http://www.ta.co.nl", "dblp": "202/3667;;142/2903", "google_scholar": "89uRjBkAAAAJ;;a3q4YxEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yinhao_Zhu1;~Yang_Yang15;~Taco_Cohen1", "aff": "Qualcomm AI Research;;Qualcomm Inc, QualComm", "aff_domain": "qti.qualcomm.com;;qti.qualcomm.com", "position": "Researcher;;Principal Researcher", "bibtex": "@inproceedings{\nzhu2022transformerbased,\ntitle={Transformer-based Transform Coding},\nauthor={Yinhao Zhu and Yang Yang and Taco Cohen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IDwN6xjHnK8}\n}", "github": "", "project": "", "reviewers": "aEdf;7pae;U2rZ;EyC9;xjay", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "4;5;5;4;3", "correctness": "4;4;4;3;4", "technical_novelty": "3;3;4;2;4", "empirical_novelty": "3;4;4;4;3", "wc_summary_paper": "42;75;44;138;33", "wc_summary_review": "40;20;66;100;40", "wc_main_review": "71;217;519;348;90", "wc_review": "153;312;629;586;163", "wc_reply_reviewers": "34;0;0;0;0", "wc_reply_authors": "405;485;1077;607;10", "reply_reviewers": "1;0;0;0;0", "reply_authors": "1;1;2;1;1", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 3.2, 0.7483314773547882 ], 
"empirical_novelty_avg": [ 3.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 66.4, 38.50506460194555 ], "wc_summary_review_avg": [ 53.2, 27.58550343930667 ], "wc_main_review_avg": [ 249.0, 167.74385234636767 ], "wc_review_avg": [ 368.6, 203.4842500047608 ], "wc_reply_reviewers_avg": [ 6.8, 13.6 ], "wc_reply_authors_avg": [ 516.8, 344.185066497662 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3273268353539886, "corr_recommendation_correctness": -0.408248290463863, "gs_citation": 187, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=864579765943436164&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=IDwN6xjHnK8", "email": "qti.qualcomm.com;;qti.qualcomm.com", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Qualcomm;Qualcomm Incorporated", "aff_unique_dep": "Qualcomm AI Research;", "aff_unique_url": "https://www.qualcomm.com/research;https://www.qualcomm.com", "aff_unique_abbr": "QAI;Qualcomm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "IEKL-OihqX0", "title": "Gradient-Guided Importance Sampling for Learning Discrete Energy-Based Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning energy-based models (EBMs) is known to be difficult especially on discrete data where gradient-based learning strategies cannot be applied directly. Although ratio matching is a sound method to learn discrete EBMs, it suffers from expensive computation and excessive memory requirement, thereby resulting in difficulties for learning EBMs on high-dimensional data. In this study, we propose ratio matching with gradient-guided importance sampling (RMwGGIS) to alleviate the above limitations. Particularly, we leverage the gradient of the energy function w.r.t. the discrete data space to approximately construct the provable optimal proposal distribution, which is subsequently used by importance sampling to efficiently estimate the original ratio matching objective. We perform experiments on density modeling over synthetic discrete data and graph generation to evaluate our proposed method. 
The experimental results demonstrate that our method can significantly alleviate the limitations of ratio matching and perform more effectively in practice.", "keywords": "Discrete energy-based models;ratio matching;importance sampling;gradient", "primary_area": "", "supplementary_material": "", "author": "Meng Liu;Haoran Liu;Shuiwang Ji", "authorids": "~Meng_Liu3;~Haoran_Liu1;~Shuiwang_Ji1", "gender": "M;;M", "homepage": "https://mengliu1998.github.io;;http://people.tamu.edu/~sji", "dblp": "41/7841-15;;84/6405", "google_scholar": "https://scholar.google.com/citations?hl=en;;BZGj6sAAAAAJ", "orcid": ";;0000-0002-4205-4563", "linkedin": "meng-liu-4a1813197/;;shuiwang-ji-9a040715/", "or_profile": "~Meng_Liu3;~Haoran_Liu1;~Shuiwang_Ji1", "aff": "Texas A&M University - College Station;;Texas A&M University", "aff_domain": "tamu.edu;;tamu.edu", "position": "PhD student;;Professor", "bibtex": "@misc{\nliu2022gradientguided,\ntitle={Gradient-Guided Importance Sampling for Learning Discrete Energy-Based Models},\nauthor={Meng Liu and Haoran Liu and Shuiwang Ji},\nyear={2022},\nurl={https://openreview.net/forum?id=IEKL-OihqX0}\n}", "github": "", "project": "", "reviewers": "Btjs;DiXS;oca8;KK3T", "site": "https://openreview.net/forum?id=IEKL-OihqX0", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;3;5;5", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "47;17;90;106", "wc_summary_review": "125;18;6;70", "wc_main_review": "387;69;326;414", "wc_review": "559;104;422;590", "wc_reply_reviewers": "757;0;111;36", "wc_reply_authors": "3526;580;1314;803", "reply_reviewers": "5;0;1;1", "reply_authors": "10;2;4;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.0, 35.12121865767189 ], "wc_summary_review_avg": [ 54.75, 47.15599113580373 ], "wc_main_review_avg": [ 299.0, 136.563172195142 ], "wc_review_avg": [ 418.75, 192.40241032793742 ], "wc_reply_reviewers_avg": [ 226.0, 309.177133695233 ], "wc_reply_authors_avg": [ 1555.75, 1168.230365766958 ], "reply_reviewers_avg": [ 1.75, 1.920286436967152 ], "reply_authors_avg": [ 4.5, 3.278719262151 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3299235158330000192&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0", "aff_campus_unique": "College Station;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "IEsx-jwFk3g", "title": "Deep Representations for Time-varying Brain Datasets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Finding an appropriate representation of dynamic activities in the brain is crucial for many downstream applications. Due to its highly dynamic nature, temporally averaged fMRI (functional magnetic resonance imaging) cannot capture the whole picture of underlying brain activities, and previous works lack the ability to learn and interpret the latent dynamics in brain architectures. 
In this paper, we build an efficient graph neural network model that incorporates both region-mapped fMRI sequences and structural connectivities obtained from DWI (diffusion-weighted imaging) as inputs. Through novel sample-level adaptive adjacency matrix learning and multi-resolution inner cluster smoothing, we find good representations of the latent brain dynamics. We also attribute inputs with integrated gradients, which enables us to infer (1) highly involved brain connections and subnetworks for each task, (2) keyframes of imaging sequences along the temporal axis, and (3) subnetworks that discriminate between individual subjects. This ability to identify critical subnetworks that characterize brain states across heterogeneous tasks and individuals is of great importance to neuroscience research. Extensive experiments and ablation studies demonstrate our proposed method's superiority and efficiency in spatial-temporal graph signal modeling with insightful interpretations of brain dynamics.", "keywords": "fMRI;graph neural networks;feature attribution", "primary_area": "", "supplementary_material": "", "author": "Sikun Lin;Shuyun Tang;Ambuj Singh", "authorids": "~Sikun_Lin1;~Shuyun_Tang1;~Ambuj_Singh1", "gender": "F;M;", "homepage": ";https://dynamo.cs.ucsb.edu/people/tang-0;", "dblp": ";47/6841.html;", "google_scholar": "Vd3jzs4AAAAJ;Ni_RjZYAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sikun_Lin1;~Shuyun_Tang1;~Ambuj_Singh1", "aff": "UC Santa Barbara;University of California, Berkeley;", "aff_domain": "ucsb.edu;berkeley.edu;", "position": "PhD student;MS student;", "bibtex": "@misc{\nlin2022deep,\ntitle={Deep Representations for Time-varying Brain Datasets},\nauthor={Sikun Lin and Shuyun Tang and Ambuj Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=IEsx-jwFk3g}\n}", "github": "", "project": "", "reviewers": "Whpw;Cuxq;wRj8;zFSB", "site": "https://openreview.net/forum?id=IEsx-jwFk3g", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "83;117;142;44", "wc_summary_review": "79;68;31;69", "wc_main_review": "252;657;760;254", "wc_review": "414;842;933;367", "wc_reply_reviewers": "0;0;157;173", "wc_reply_authors": "464;956;1573;1144", "reply_reviewers": "0;0;2;2", "reply_authors": "1;2;3;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.5, 36.840874039577294 ], "wc_summary_review_avg": [ 61.75, 18.267115262131565 ], "wc_main_review_avg": [ 480.75, 230.64407102719983 ], "wc_review_avg": [ 639.0, 251.12447112935848 ], "wc_reply_reviewers_avg": [ 82.5, 82.69371197376498 ], "wc_reply_authors_avg": [ 1034.25, 397.996466692859 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8471938612649593204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Santa Barbara;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsb.edu;https://www.berkeley.edu", "aff_unique_abbr": "UCSB;UC Berkeley", 
"aff_campus_unique_index": "0;1", "aff_campus_unique": "Santa Barbara;Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "IHLQyVXKbx", "title": "Unsupervised Domain Adaptation Via Pseudo-labels And Objectness Constraints", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pseudo label self-training has emerged as a dominant approach to unsupervised domain adaptation (UDA) for semantic segmentation. Despite recent advances, this approach is susceptible to erroneous pseudo labels arising from confirmation bias that ultimately leads to sub-optimal segmentation. To mitigate the effect of noisy pseudo-labels, we propose regularising conventional self-training objectives with constraints that are derived from structure-preserving modalities, such as depth. Towards this end, we introduce a contrastive image-level objectness constraint that pulls the pixel representations of the same object instance closer while pushing those from different object categories apart. To identify pixels within an object, we subscribe to a notion of objectness derived from depth maps, that are robust to photometric variations, as well as superpixels, that are obtained via unsupervised clustering over the raw image space. Crucially, the objectness constraint is agnostic to the ground-truth semantic segmentation labels and, therefore, remains appropriate for unsupervised adaptation settings. In this paper, we show that our approach of leveraging multi-modal constraint improves top performing self-training methods in various UDA benchmarks for semantic segmentation. We make our code and data-splits available in the supplementary material.", "keywords": "Unsupervised Domain Adaptation;Self-Training;Semantic Segmentation;Multimodal Learning", "primary_area": "", "supplementary_material": "", "author": "Rajshekhar Das;Jonathan Francis;Sanket Vaibhav Mehta;Jean Oh;Emma Strubell;Jose Moura", "authorids": "~Rajshekhar_Das1;~Jonathan_Francis1;~Sanket_Vaibhav_Mehta2;~Jean_Oh2;~Emma_Strubell1;~Jose_Moura1", "gender": "M;;M;F;Non-Binary;", "homepage": ";;https://sanketvmehta.github.io;http://www.cs.cmu.edu/~jeanoh/;http://strubell.github.io;", "dblp": "202/5470;;225/7804;62/4860;153/2253;", "google_scholar": "https://scholar.google.com/citations?hl=en;;H4pn-ogAAAAJ;;UCDMtM0AAAAJ;", "orcid": ";;0000-0003-1809-4685;;;", "linkedin": "rajshekhar-das-90b48a5a/;;sanketvmehta/;;;", "or_profile": "~Rajshekhar_Das1;~Jonathan_Francis1;~Sanket_Vaibhav_Mehta2;~Jean_Oh2;~Emma_Strubell1;~Jose_Moura1", "aff": "Carnegie Mellon University;;Carnegie Mellon University;Carnegie Mellon University;Google;", "aff_domain": "cmu.edu;;cmu.edu;cmu.edu;google.com;", "position": "PhD student;;PhD student;Associate Professor;Research Scientist;", "bibtex": "@misc{\ndas2022unsupervised,\ntitle={Unsupervised Domain Adaptation Via Pseudo-labels And Objectness Constraints},\nauthor={Rajshekhar Das and Jonathan Francis and Sanket Vaibhav Mehta and Jean Oh and Emma Strubell and Jose Moura},\nyear={2022},\nurl={https://openreview.net/forum?id=IHLQyVXKbx}\n}", "github": "", "project": "", "reviewers": "xPma;4wHM;a16W;VXih", "site": "https://openreview.net/forum?id=IHLQyVXKbx", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;5", "correctness": "2;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "50;166;106;71", "wc_summary_review": "69;33;32;32", "wc_main_review": "147;194;316;419", "wc_review": "266;393;454;522", "wc_reply_reviewers": "0;0;0;0", 
"wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 98.25, 43.93389921233944 ], "wc_summary_review_avg": [ 41.5, 15.88238017426859 ], "wc_main_review_avg": [ 269.0, 106.32262224004823 ], "wc_review_avg": [ 408.75, 94.2055598146946 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QSeKgQ3URg8J:scholar.google.com/&scioq=Unsupervised+Domain+Adaptation+Via+Pseudo-labels+And+Objectness+Constraints&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "IJ-88dRfkdz", "title": "SoftHebb: Bayesian inference in unsupervised Hebbian soft winner-take-all networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "State-of-the-art artificial neural networks (ANNs) require labelled data or feedback between layers, are often biologically implausible, and are vulnerable to adversarial attacks that humans are not susceptible to. On the other hand, Hebbian learning in winner-take-all (WTA) networks, is unsupervised, feed-forward, and biologically plausible. However, a modern objective optimization theory for WTA networks has been missing, except under very limiting assumptions. Here we derive formally such a theory, based on biologically plausible but generic ANN elements. Through Hebbian learning, network parameters maintain a Bayesian generative model of the data. There is no supervisory loss function, but the network does minimize cross-entropy between its activations and the input distribution. The key is a \"soft\" WTA where there is no absolute \"hard\" winner neuron, and a specific type of Hebbian-like plasticity of weights and biases. We confirm our theory in practice, where, in handwritten digit (MNIST) recognition, our Hebbian algorithm, SoftHebb, minimizes cross-entropy without having access to it, and outperforms the more frequently used, hard-WTA-based method. Strikingly, it even outperforms supervised end-to-end backpropagation, under certain conditions. Specifically, in a two-layered network, SoftHebb outperforms backpropagation when the training dataset is only presented once, when the testing data is noisy, and under gradient-based adversarial attacks. Notably, adversarial attacks that confuse SoftHebb are also confusing to the human eye. Finally, the model can generate interpolations of objects from its input distribution. 
All in all, SoftHebb extends Hebbian WTA theory with modern machine learning tools, thus making these networks relevant to pertinent issues in deep learning.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/589bc9defe6175faa281d3bc0bed14067ea1a61a.zip", "author": "Timoleon Moraitis;Dmitry Toichkin;Yansong Chua;Qinghai Guo", "authorids": "~Timoleon_Moraitis1;~Dmitry_Toichkin1;~Yansong_Chua1;~Qinghai_Guo1", "gender": "M;M;;M", "homepage": "https://www.tmoraitis.com;https://www.github.com/Dmitry00;;https://www.semanticscholar.org/author/Qinghai-Guo/47747957", "dblp": ";;180/0351;12/8502", "google_scholar": "https://scholar.google.ch/citations?user=w3KiO1MAAAAJ;;;", "orcid": "0000-0002-6521-0717;;;0000-0003-4697-9464", "linkedin": "timoleon-moraitis-56a81217/;dmitry-toichkin-b7708b103/;;", "or_profile": "~Timoleon_Moraitis1;~Dmitry_Toichkin1;~Yansong_Chua1;~Qinghai_Guo1", "aff": "Huawei Technologies Ltd.;;China Nanhu Academy of Electronics and Information Technology;Huawei Technologies Ltd.", "aff_domain": "huawei.com;;cnaeit.com;huawei.com", "position": "Researcher;;Principal Researcher;Researcher", "bibtex": "@misc{\nmoraitis2022softhebb,\ntitle={SoftHebb: Bayesian inference in unsupervised Hebbian soft winner-take-all networks},\nauthor={Timoleon Moraitis and Dmitry Toichkin and Yansong Chua and Qinghai Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=IJ-88dRfkdz}\n}", "github": "", "project": "", "reviewers": "4CvS;vMDj;F6Fz;ereX", "site": "https://openreview.net/forum?id=IJ-88dRfkdz", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;4;4", "correctness": "2;4;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "45;44;93;133", "wc_summary_review": "26;51;78;30", "wc_main_review": "151;107;787;129", "wc_review": "222;202;958;292", "wc_reply_reviewers": "0;0;52;0", "wc_reply_authors": "1554;283;1197;1460", "reply_reviewers": "0;0;1;0", "reply_authors": "5;2;2;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 78.75, 37.05654463114444 ], "wc_summary_review_avg": [ 46.25, 20.64430914319973 ], "wc_main_review_avg": [ 293.5, 285.346718922787 ], "wc_review_avg": [ 418.5, 313.26785663390365 ], "wc_reply_reviewers_avg": [ 13.0, 22.516660498395403 ], "wc_reply_authors_avg": [ 1123.5, 502.5945184738886 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 1.299038105676658 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16550771160647969508&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Huawei;China Nanhu Academy of Electronics and Information Technology", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;", "aff_unique_abbr": "Huawei;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Interacting Contour Stochastic Gradient Langevin Dynamics", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7052", "id": "IK9ap6nxXr2", "poster": "", "openreview": "https://openreview.net/forum?id=IK9ap6nxXr2", 
"slides": "https://iclr.cc/virtual/2022/poster/7052", "video": "https://iclr.cc/virtual/2022/poster/7052", "author_site": "Wei Deng, Siqi Liang, Botao Hao, Guang Lin, Faming Liang", "tldr": "", "abstract": "We propose an interacting contour stochastic gradient Langevin dynamics (ICSGLD) sampler, an embarrassingly parallel multiple-chain contour stochastic gradient Langevin dynamics (CSGLD) sampler with efficient interactions. We show that ICSGLD can be theoretically more efficient than a single-chain CSGLD with an equivalent computational budget. We also present a novel random-field function, which facilitates the estimation of self-adapting parameters in big data and obtains free mode explorations. Empirically, we compare the proposed algorithm with popular benchmark methods for posterior sampling. The numerical results show a great potential of ICSGLD for large-scale uncertainty estimation tasks.", "keywords": "stochastic gradient Langevin dynamics;MCMC;importance sampling;Wang-Landau algorithm;Parallel MCMC Methods;stochastic approximation", "primary_area": "", "supplementary_material": "/attachment/322c413292673974c1abaa360c081e05e58a9c8a.zip", "author": "Wei Deng;Siqi Liang;Botao Hao;Guang Lin;Faming Liang", "authorids": "~Wei_Deng1;~Siqi_Liang1;~Botao_Hao1;~Guang_Lin1;~Faming_Liang1", "gender": "M;F;;M;M", "homepage": "https://waynedw.github.io/;;https://haobotao000.github.io/;http://www.math.purdue.edu/~lin491/;https://www.stat.purdue.edu/~fmliang/", "dblp": "69/508-2;214/9483-5;222/2211;;29/1122", "google_scholar": "IYiyxssAAAAJ;mbfN-gwAAAAJ;;https://scholar.google.com/citations?hl=en;TboqoPIAAAAJ", "orcid": ";0000-0002-9600-3569;;0000-0002-0976-1987;", "linkedin": ";siqi-liang-593ba1158/;;;", "or_profile": "~Wei_Deng1;~Siqi_Liang1;~Botao_Hao1;~Guang_Lin1;~Faming_Liang1", "aff": "Morgan Stanley;Amazon;Google Deepmind;Purdue University;Purdue University", "aff_domain": "morganstanley.com;amazon.com;google.com;purdue.edu;purdue.edu", "position": "Researcher;Researcher;Research Scientist;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ndeng2022interacting,\ntitle={Interacting Contour Stochastic Gradient Langevin Dynamics},\nauthor={Wei Deng and Siqi Liang and Botao Hao and Guang Lin and Faming Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IK9ap6nxXr2}\n}", "github": "", "project": "", "reviewers": "9d9U;aTJ6;mVcL;CBb1", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;3;3", "correctness": "4;2;4;4", "technical_novelty": "3;4;4;3", "empirical_novelty": "3;4;4;3", "wc_summary_paper": "61;89;166;240", "wc_summary_review": "198;47;30;82", "wc_main_review": "112;261;188;315", "wc_review": "371;397;384;637", "wc_reply_reviewers": "11;881;28;59", "wc_reply_authors": "553;1353;467;819", "reply_reviewers": "1;5;1;1", "reply_authors": "1;3;1;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 139.0, 69.8462597423799 ], "wc_summary_review_avg": [ 89.25, 65.52623520392423 ], "wc_main_review_avg": [ 219.0, 76.46894794620886 ], "wc_review_avg": [ 447.25, 109.937197981393 ], "wc_reply_reviewers_avg": [ 244.75, 367.74201214982224 ], "wc_reply_authors_avg": [ 798.0, 345.70652293527814 ], "reply_reviewers_avg": [ 2.0, 1.7320508075688772 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 21, 
0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=811536455190019406&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=IK9ap6nxXr2", "email": "morganstanley.com;amazon.com;google.com;purdue.edu;purdue.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "Morgan Stanley;Amazon;DeepMind;Purdue University", "aff_unique_dep": ";Amazon.com, Inc.;DeepMind;", "aff_unique_url": "https://www.morganstanley.com;https://www.amazon.com;https://deepmind.com;https://www.purdue.edu", "aff_unique_abbr": "Morgan Stanley;Amazon;DeepMind;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "ILYX-vQnwe_", "title": "Breaking Down Questions for Outside-Knowledge VQA", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While general Visual Question Answering (VQA) focuses on querying visual content within an image, there is a recent trend towards Knowledge-Based VQA (KB-VQA) where a system needs to link some aspects of the question to different types of knowledge beyond the image, such as commonsense concepts and factual information. To address this issue, we propose a novel approach that passes knowledge from various sources between different pieces of semantic content in the question. Questions are first segmented into several chunks, and each segment is used as a key to retrieve knowledge from ConceptNet and Wikipedia. Then, a graph neural network, taking advantage of the question's syntactic structure, integrates the knowledge for different segments to jointly predict the answer. Our experiments on the OK-VQA dataset show that our approach achieves new state-of-the-art results. 
", "keywords": "knowledge-based VQA", "primary_area": "", "supplementary_material": "", "author": "Jialin Wu;Ray Mooney", "authorids": "~Jialin_Wu1;~Ray_Mooney1", "gender": "M;M", "homepage": "https://jialinwu.netlify.app/;https://www.cs.utexas.edu/~mooney/", "dblp": "149/5889;m/RaymondJMooney.html", "google_scholar": "M7EpKqsAAAAJ;p9RsPG4AAAAJ", "orcid": ";0000-0002-4504-0490", "linkedin": "jialin-wu-a50135175/;", "or_profile": "~Jialin_Wu1;~Ray_Mooney1", "aff": "University of Texas, Austin;University of Texas at Austin", "aff_domain": "cs.utexas.edu;cs.utexas.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nwu2022breaking,\ntitle={Breaking Down Questions for Outside-Knowledge {VQA}},\nauthor={Jialin Wu and Ray Mooney},\nyear={2022},\nurl={https://openreview.net/forum?id=ILYX-vQnwe_}\n}", "github": "", "project": "", "reviewers": "9mxD;nHQB;ScvM;oAjx", "site": "https://openreview.net/forum?id=ILYX-vQnwe_", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;5", "correctness": "4;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "120;39;71;72", "wc_summary_review": "74;5;81;62", "wc_main_review": "432;189;214;203", "wc_review": "626;233;366;337", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.5, 28.91798748184251 ], "wc_summary_review_avg": [ 55.5, 29.937434759845406 ], "wc_main_review_avg": [ 259.5, 99.9862490545575 ], "wc_review_avg": [ 390.5, 144.67981891058614 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=537373591281188647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "INO8hGXD2M", "title": "Adversarial Distributions Against Out-of-Distribution Detectors", "track": "main", "status": "Reject", "tldr": "", "abstract": "Out-of-distribution (OOD) detection is the task of determining whether an input lies outside the training data distribution. As an outlier may deviate from the training distribution in unexpected ways, an ideal OOD detector should be able to detect all types of outliers. However, current evaluation protocols test a detector over OOD datasets that cover only a small fraction of all possible outliers, leading to overly optimistic views of OOD detector performance. In this paper, we propose a novel evaluation framework for OOD detection that tests a detector over a larger, unexplored space of outliers. In our framework, a detector is evaluated with samples from its adversarial distribution, which generates diverse outlier samples that are likely to be misclassified as in-distribution by the detector. 
Using adversarial distributions, we investigate OOD detectors with reported near-perfect performance on standard benchmarks like CIFAR-10 vs SVHN. Our methods discover a wide range of samples that are obvious outliers but are recognized as in-distribution by the detectors, indicating that current state-of-the-art detectors are not as perfect as they seem on existing benchmarks.", "keywords": "out-of-distribution detection;outlier detection;adversarial attack;model evaluation;markov chain monte carlo", "primary_area": "", "supplementary_material": "", "author": "Sangwoong Yoon;Jinwon Choi;Yonghyeon Lee;Yung-Kyun Noh;Frank C. Park", "authorids": "~Sangwoong_Yoon1;~Jinwon_Choi1;~Yonghyeon_Lee2;~Yung-Kyun_Noh1;~Frank_C._Park1", "gender": "M;M;M;M;M", "homepage": "https://swyoon.github.io/;;https://www.gabe-yhlee.com;http://aais.hanyang.ac.kr;http://robotics.snu.ac.kr", "dblp": "237/1318;;182/6796;54/6443;p/FrankChongwooPark", "google_scholar": "https://scholar.google.co.kr/citations?user=cH2rjfIAAAAJ;;;https://scholar.google.com/citations?hl=en;u-h3PJIAAAAJ", "orcid": "0000-0002-7251-3230;;;;0000-0002-0293-6975", "linkedin": ";choi-jinwon-73033b1ab/;;;", "or_profile": "~Sangwoong_Yoon1;~Jinwon_Choi1;~Yonghyeon_Lee2;~Yung-Kyun_Noh1;~Frank_C._Park1", "aff": "Seoul National University;;Seoul National University;Korea Institute for Advanced Study;Seoul National University", "aff_domain": "snu.ac.kr;;snu.ac.kr;kias.re.kr;snu.ac.kr", "position": "PhD student;;PhD student;Affiliate Professor;Full Professor", "bibtex": "@misc{\nyoon2022adversarial,\ntitle={Adversarial Distributions Against Out-of-Distribution Detectors},\nauthor={Sangwoong Yoon and Jinwon Choi and Yonghyeon Lee and Yung-Kyun Noh and Frank C. Park},\nyear={2022},\nurl={https://openreview.net/forum?id=INO8hGXD2M}\n}", "github": "", "project": "", "reviewers": "XMWK;HyPw;GYAe;hrej", "site": "https://openreview.net/forum?id=INO8hGXD2M", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;3;3;4", "correctness": "2;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "26;18;91;46", "wc_summary_review": "30;21;78;32", "wc_main_review": "335;357;346;294", "wc_review": "391;396;515;372", "wc_reply_reviewers": "0;0;267;0", "wc_reply_authors": "690;516;561;332", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 45.25, 28.314086600135983 ], "wc_summary_review_avg": [ 40.25, 22.18529918662356 ], "wc_main_review_avg": [ 333.0, 23.822258499143192 ], "wc_review_avg": [ 418.5, 56.4291591289468 ], "wc_reply_reviewers_avg": [ 66.75, 115.61439140522256 ], "wc_reply_authors_avg": [ 524.75, 128.30700487502622 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nVKle1oxVA4J:scholar.google.com/&scioq=Adversarial+Distributions+Against+Out-of-Distribution+Detectors&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Seoul National University;Korea Institute for Advanced Study", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;http://www.kaist.edu", "aff_unique_abbr": "SNU;KIAS", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "IOA9fJUUa0", "title": "How does BERT address polysemy of Korean adverbial postpositions -ey, -eyse, and -(u)lo?", "track": "main", "status": "Reject", "tldr": "", "abstract": "The present study reports computational accounts of resolving word-level polysemy in a lesser-studied language\u2014Korean. Postpositions, which are characterized as multiple form-function mapping and thus polysemous, pose a challenge to automatic analysis and model performance in identifying their functions. In this study, we devised a classification model by employing BERT and introduces a computational simulation that interactively demonstrates how a BERT model simulates human interpretation of word-level polysemy involving Korean adverbial postpositions -ey, -eyse, and -(u)lo. Results reveal that (i) there is an inverse relationship between the classification accuracy and the number of functions that each postposition manifests, (ii) the model performance is affected by the corpus size of each function, and (iii) the performance gradually improves as the epoch proceeds.", "keywords": "polysemy;natural language processing;classification;language model;BERT;data visualization;Korean", "primary_area": "", "supplementary_material": "/attachment/e1d764d3d7ee681ff5805a93de3f315cc7d4f3d6.zip", "author": "Seongmin Mun;Guillaume Desagulier;Gyu-Ho Shin", "authorids": "~Seongmin_Mun1;guillaume.desagulier@univ-paris8.fr;gyuho.shin@upol.cz", "gender": "M;;", "homepage": "http://seongminmun.com/;;", "dblp": ";;", "google_scholar": "https://scholar.google.co.kr/citations?user=YiHNz_QAAAAJ;;", "orcid": "0000-0002-7289-5320;;", "linkedin": "seongmin-mun-163868140/;;", "or_profile": "~Seongmin_Mun1;guillaume.desagulier@univ-paris8.fr;gyuho.shin@upol.cz", "aff": "Chosun University;;", "aff_domain": "chosun.ac.kr;;", "position": "Postdoc;;", "bibtex": "@misc{\nmun2022how,\ntitle={How does {BERT} address polysemy of Korean adverbial postpositions -ey, -eyse, and -(u)lo?},\nauthor={Seongmin Mun and Guillaume Desagulier and Gyu-Ho Shin},\nyear={2022},\nurl={https://openreview.net/forum?id=IOA9fJUUa0}\n}", "github": "", "project": "", "reviewers": "3sR2;MV2W;eqSx;djh3", "site": "https://openreview.net/forum?id=IOA9fJUUa0", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "3;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "1;2;1;1", "empirical_novelty": "2;1;2;1", "wc_summary_paper": "137;26;91;52", "wc_summary_review": "21;26;60;61", "wc_main_review": "401;368;652;93", "wc_review": "559;420;803;206", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 76.5, 41.89570383702844 ], "wc_summary_review_avg": [ 42.0, 18.587630295441105 ], "wc_main_review_avg": [ 378.5, 198.07132553703983 ], "wc_review_avg": [ 497.0, 216.84672005820147 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:2szH6LRM5vsJ:scholar.google.com/&scioq=How+does+BERT+address+polysemy+of+Korean+adverbial+postpositions+-ey,+-eyse,+and+-(u)lo%3F&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Chosun University", "aff_unique_dep": "", "aff_unique_url": "http://www.chosun.ac.kr", "aff_unique_abbr": "Chosun", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "IPwwNwMvHFW", "title": "Multi-Agent Decentralized Belief Propagation on Graphs", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "We consider the problem of interactive partially observable Markov decision processes (I-POMDPs),where the agents are located at the nodes of a communication network. Specifically, we assume a certain message type for all messages. Moreover, each agent makes individual decisions based on the interactive belief states, the information observed locally and the messages received from its neighbors over the network.Within this setting, the collective goal of the agents is to maximize the globally averaged return over the network through exchanging information with their neighbors. We propose a decentralized belief propagation algorithm for the problem, and prove the convergence of our algorithm.Finally we show multiple applications of our framework. Our work appears to be the first study of decentralized belief propagation algorithm for networked multi-agent I-POMDPs.", "keywords": "I-pomdps;Belief propagation;Multi-agent control", "primary_area": "", "supplementary_material": "", "author": "Yitao Chen;Deepanshu Vasal", "authorids": "~Yitao_Chen1;~Deepanshu_Vasal1", "gender": "M;", "homepage": "https://yitaochen.github.io/;https://sites.google.com/view/dvasal/home", "dblp": ";", "google_scholar": "T7teB94AAAAJ;", "orcid": ";", "linkedin": "yitao-chen-aa543b75/;", "or_profile": "~Yitao_Chen1;~Deepanshu_Vasal1", "aff": "QualComm;Northwestern University", "aff_domain": "qualcomm.com;northwestern.edu", "position": "Qualcomm;Researcher", "bibtex": "@misc{\nchen2022multiagent,\ntitle={Multi-Agent Decentralized Belief Propagation on Graphs},\nauthor={Yitao Chen and Deepanshu Vasal},\nyear={2022},\nurl={https://openreview.net/forum?id=IPwwNwMvHFW}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=IPwwNwMvHFW", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13871769199120233120&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Qualcomm Incorporated;Northwestern University", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.qualcomm.com;https://www.northwestern.edu", "aff_unique_abbr": "Qualcomm;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "IPy3URgH47U", "title": "ACTIVE REFINEMENT OF WEAKLY SUPERVISED MODELS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Supervised machine learning (ML) has fueled major advances in several domains such as health, education and governance. However, most modern ML methods rely on vast quantities of point-by-point hand-labeled training data. In domains such as clinical research, where data collection and its careful characterization is particularly expensive and tedious, this reliance on pointillisticaly labeled data is one of the biggest roadblocks to the adoption of modern data-hungry ML algorithms. Data programming, a framework for learning from weak supervision, attempts to overcome this bottleneck by generating probabilistic training labels from simple yet imperfect heuristics obtained a priori from domain experts. We present WARM, Active Refinement of Weakly Supervised Models, a principled approach to iterative and interactive improvement of weakly supervised models via active learning. WARM directs domain experts' attention on a few selected data points that, when annotated, would improve the label model's probabilistic output in terms of accuracy the most. Gradient backpropagation is then used to iteratively update decision parameters of the heuristics of the label model. Experiments on multiple real-world medical classification datasets reveal that WARM can substantially improve the accuracy of probabilistic labels, a direct measure of training data quality, with as few as 30 queries to clinicians. Additional experiments with domain shift and artificial noise in the LFs, demonstrate WARM's ability to adapt heuristics and the end model to changing population characteristics as well as its robustness to mis-specification of domain-expert-acquired LFs. 
These capabilities make WARM a potentially useful tool for deploying, maintaining, and auditing weakly supervised systems in practice.", "keywords": "Weak Supervision;Active Learning;Fuzzy logic;AI in Healthcare", "primary_area": "", "supplementary_material": "", "author": "Mononito Goswami;Chufan Gao;Benedikt Boecking;Saswati Ray;Artur Dubrawski", "authorids": "~Mononito_Goswami1;~Chufan_Gao1;~Benedikt_Boecking1;sray@cs.cmu.edu;~Artur_Dubrawski2", "gender": "M;;M;;M", "homepage": "https://mononito.com;https://chufangao.github.io;http://www.cs.cmu.edu/~boecking/;;https://www.autonlab.org", "dblp": "243/3771;239/1854.html;146/0168;;76/48", "google_scholar": "https://scholar.google.co.in/citations?hl=en;rBlZICgAAAAJ;wNtfa1wAAAAJ;;O3gezzcAAAAJ", "orcid": "0000-0002-4117-5558;0000-0003-2807-0337;;;0000-0002-2372-0831", "linkedin": "https://linkedin.com/in/mononitogoswami/;chufangao/;;;artur-dubrawski-33a2a87/", "or_profile": "~Mononito_Goswami1;~Chufan_Gao1;~Benedikt_Boecking1;sray@cs.cmu.edu;~Artur_Dubrawski2", "aff": "Amazon;Carnegie Mellon University;Carnegie Mellon University;;Carnegie Mellon University", "aff_domain": "amazon.com;cmu.edu;cmu.edu;;cmu.edu", "position": "Intern;MS student;PhD student;;Research Professor", "bibtex": "@misc{\ngoswami2022active,\ntitle={{ACTIVE} {REFINEMENT} {OF} {WEAKLY} {SUPERVISED} {MODELS}},\nauthor={Mononito Goswami and Chufan Gao and Benedikt Boecking and Saswati Ray and Artur Dubrawski},\nyear={2022},\nurl={https://openreview.net/forum?id=IPy3URgH47U}\n}", "github": "", "project": "", "reviewers": "aMJi;jkgK;JFqf;379M;bVTy", "site": "https://openreview.net/forum?id=IPy3URgH47U", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "3;5;2;4;3", "correctness": "4;3;3;4;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "84;98;66;173;66", "wc_summary_review": "65;37;32;52;61", "wc_main_review": "163;466;129;212;438", "wc_review": "312;601;227;437;565", "wc_reply_reviewers": "122;0;0;0;0", "wc_reply_authors": "983;697;257;185;550", "reply_reviewers": "1;0;0;0;0", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 3.4, 1.019803902718557 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 97.4, 39.666610644218146 ], "wc_summary_review_avg": [ 49.4, 12.970736293672768 ], "wc_main_review_avg": [ 281.6, 141.88812494356247 ], "wc_review_avg": [ 428.4, 143.27260729113573 ], "wc_reply_reviewers_avg": [ 24.4, 48.8 ], "wc_reply_authors_avg": [ 534.4, 292.2174532775207 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0800640769025436, "corr_recommendation_correctness": 0.16666666666666669, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lUITGA1znu0J:scholar.google.com/&scioq=ACTIVE+REFINEMENT+OF+WEAKLY+SUPERVISED+MODELS&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Amazon;Carnegie Mellon University", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.cmu.edu", "aff_unique_abbr": "Amazon;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "IR-V6-aP-mv", "title": "Batch 
size-invariance for policy optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We say an algorithm is batch size-invariant if changes to the batch size can largely be compensated for by changes to other hyperparameters. Stochastic gradient descent is well-known to have this property at small batch sizes, via the learning rate. However, some policy optimization algorithms (such as PPO) do not have this property, because of how they control the size of policy updates. In this work we show how to make these algorithms batch size-invariant. Our key insight is to decouple the proximal policy (used for controlling policy updates) from the behavior policy (used for off-policy corrections). Our experiments help explain why these algorithms work, and additionally show how they can make more efficient use of stale data.", "keywords": "reinforcement learning;policy gradient;learning rate", "primary_area": "", "supplementary_material": "/attachment/8e55ab4035bd24b449c4d93e3a4a75b44d0adefe.zip", "author": "Jacob Hilton;Karl Cobbe;John Schulman", "authorids": "~Jacob_Hilton1;~Karl_Cobbe1;~John_Schulman1", "gender": "M;M;", "homepage": "https://www.jacobh.co.uk/;;", "dblp": "182/7972;232/1982;", "google_scholar": "WyKvz7EAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jacob_Hilton1;~Karl_Cobbe1;~John_Schulman1", "aff": "OpenAI;;OpenAI", "aff_domain": "openai.com;;openai.com", "position": "Researcher;;Researcher", "bibtex": "@misc{\nhilton2022batch,\ntitle={Batch size-invariance for policy optimization},\nauthor={Jacob Hilton and Karl Cobbe and John Schulman},\nyear={2022},\nurl={https://openreview.net/forum?id=IR-V6-aP-mv}\n}", "github": "", "project": "", "reviewers": "VVcj;pfgK;tiiD;kR2g", "site": "https://openreview.net/forum?id=IR-V6-aP-mv", "pdf_size": 0, "recommendation": "1;5;5;8", "confidence": "5;4;4;5", "correctness": "3;2;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "14;37;115;168", "wc_summary_review": "38;40;28;68", "wc_main_review": "149;163;1252;436", "wc_review": "201;240;1395;672", "wc_reply_reviewers": "0;205;0;235", "wc_reply_authors": "340;532;708;524", "reply_reviewers": "0;2;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.75, 2.48746859276655 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.5, 61.491869381244214 ], "wc_summary_review_avg": [ 43.5, 14.857657958103626 ], "wc_main_review_avg": [ 500.0, 448.990534421384 ], "wc_review_avg": [ 627.0, 480.38890495097826 ], "wc_reply_reviewers_avg": [ 110.0, 110.51018052650171 ], "wc_reply_authors_avg": [ 526.0, 130.15375522819156 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.10050378152592121, "corr_recommendation_correctness": -0.10050378152592121, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2353473566858975296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "OpenAI", "aff_unique_dep": "", "aff_unique_url": "https://openai.com", "aff_unique_abbr": "OpenAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "IRLKq_V1lt9", "title": "Dict-BERT: Enhancing Language Model 
Pre-training with Dictionary", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pre-trained language models (PLMs) aim to learn universal language representations by conducting self-supervised training tasks on large-scale corpora. Since PLMs capture word semantics in different contexts, the quality of word representations highly depends on word frequency, which usually follows a heavy-tailed distributions in the pre-training corpus. Therefore, the embeddings of rare words on the tail are usually poorly optimized. In this work, we focus on enhancing language model pre-training by leveraging definitions of the rare words in dictionaries (e.g., Wiktionary). To incorporate a rare word definition as a part of input, we fetch its definition from the dictionary and append it to the end of the input text sequence. In addition to training with the masked language modeling objective, we propose two novel self-supervised pre-training tasks on word and sentence-level alignment between input text sequence and rare word definitions to enhance language modeling representation with dictionary. We evaluate the proposed Dict-BERT model on the language understanding benchmark GLUE and eight specialized domain benchmark datasets. Extensive experiments demonstrate that Dict-BERT can significantly improve the understanding of rare words and boost model performance on various NLP downstream tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenhao Yu;Chenguang Zhu;Yuwei Fang;Donghan Yu;Shuohang Wang;Yichong Xu;Michael Zeng;Meng Jiang", "authorids": "~Wenhao_Yu2;~Chenguang_Zhu1;~Yuwei_Fang1;~Donghan_Yu2;~Shuohang_Wang1;~Yichong_Xu1;~Michael_Zeng1;~Meng_Jiang3", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://wyu97.github.io/;;https://yuwfan.github.io/;;;http://xycking.wixsite.com/yichongxu;https://www.microsoft.com/en-us/research/people/nzeng/;http://www.meng-jiang.com/", "dblp": "159/8117-2.html;48/7536-1.html;227/2871.html;204/0106;173/5469.html;154/6421;232/1866-1.html;69/339-1", "google_scholar": "z4qSdX8AAAAJ;1b2kKWoAAAAJ;Om_-hHsAAAAJ;KlwvYcEAAAAJ;mN-IO6wAAAAJ;sYza2XwAAAAJ;;LZIPfCkAAAAJ", "orcid": "0000-0002-4075-5980;;;;;;;0000-0002-3009-519X", "linkedin": ";;yuwei-fang-79220192/;;;;michaelnanshanzeng/;meng-jiang-94b10916/", "or_profile": "~Wenhao_Yu2;~Chenguang_Zhu1;~Yuwei_Fang1;~Donghan_Yu2;~Shuohang_Wang1;~Yichong_Xu1;~Michael_Zeng1;~Meng_Jiang3", "aff": "University of Notre Dame;Zoom;Microsoft;Carnegie Mellon University;Microsoft;Microsoft;Microsoft;University of Notre Dame", "aff_domain": "nd.edu;zoom.us;microsoft.com;cmu.edu;microsoft.com;microsoft.com;microsoft.com;nd.edu", "position": "PhD student;Principal Researcher;Senior Applied Scientist;PhD student;Researcher;Senior Researcher;Partner Research Manager;Assistant Professor", "bibtex": "@misc{\nyu2022dictbert,\ntitle={Dict-{BERT}: Enhancing Language Model Pre-training with Dictionary},\nauthor={Wenhao Yu and Chenguang Zhu and Yuwei Fang and Donghan Yu and Shuohang Wang and Yichong Xu and Michael Zeng and Meng Jiang},\nyear={2022},\nurl={https://openreview.net/forum?id=IRLKq_V1lt9}\n}", "github": "", "project": "", "reviewers": "xgBx;RUyA;jwBN;8bn3;BxT3", "site": "https://openreview.net/forum?id=IRLKq_V1lt9", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;5;4;4;3", "correctness": "2;2;3;3;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;0;2;3;3", "wc_summary_paper": "132;115;58;252;77", "wc_summary_review": "39;74;56;186;16", "wc_main_review": "722;400;424;311;231", 
"wc_review": "893;589;538;749;324", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 126.8, 67.90404995285628 ], "wc_summary_review_avg": [ 74.2, 59.08773138308831 ], "wc_main_review_avg": [ 417.6, 166.84915342907794 ], "wc_review_avg": [ 618.6, 193.1119882348064 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.7905694150420948, "corr_recommendation_correctness": 0.9525793444156803, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7955035590205722186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3;2;2;2;0", "aff_unique_norm": "University of Notre Dame;Zoom Video Communications Inc.;Microsoft;Carnegie Mellon University", "aff_unique_dep": ";;Microsoft Corporation;", "aff_unique_url": "https://www.nd.edu;https://zoom.us;https://www.microsoft.com;https://www.cmu.edu", "aff_unique_abbr": "Notre Dame;Zoom;Microsoft;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "IXrQxlxr0iB", "title": "ERNIE-SPARSE: Robust Efficient Transformer Through Hierarchically Unifying Isolated Information", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Sparse Transformer has recently attracted a lot of attention since the ability for reducing the quadratic dependency on the sequence length. In this paper, we argue that two factors could affect the robustness and causing performance degradation of the Sparse Transformer. The first factor is information bottleneck sensitivity, which is caused by the key feature of Sparse Transformer \u2014 only a small number of global tokens can attend to all other tokens. The second factor is sparse pattern sensitivity, which is caused by different token connections in different sparse patterns. To address these issues, we propose a well-designed model, named ERNIE-SPARSE. It consists of two distinctive parts: (i) a Hierarchical Sparse Transformer (HST) mechanism, which introduces special tokens to sequentially model local and global information. This method is not affected by bottleneck size and improves model robustness and performance. (ii) Sparse-Attention-Oriented Regularization (SAOR) method, the first robust training method designed for Sparse Transformer, which increases model robustness by forcing the output distributions of transformers with different sparse patterns to be consistent with each other. To evaluate the effectiveness of ERNIE-SPARSE, we perform extensive evaluations. Firstly, we perform experiments on a multi-modal long sequence modeling task benchmark, Long Range Arena (LRA). Experimental results demonstrate that ERNIE-SPARSE significantly outperforms a variety of strong baseline methods including the dense attention and other efficient sparse attention methods and achieves improvements by 2.7% (55.01% vs. 57.78%). 
Secondly, to further show the effectiveness of our method, we pretrain ERNIE-SPARSE and verify it on 3 text classification and 2 QA downstream tasks, achieving improvements on the classification benchmark by 0.83% (91.63% vs. 92.46%) and on the QA benchmark by 3.27% (74.7% vs. 71.43%). Experimental results continue to demonstrate its superior performance.", "keywords": "sparse transformer;robustness;language model;dropout;regularization", "primary_area": "", "supplementary_material": "/attachment/cdf4f710a96c61dffced59dfa889bbc806e394a1.zip", "author": "Yang Liu;Jiaxiang Liu;Yuxiang Lu;shikun feng;Yu Sun;Zhida Feng;Li Chen;Hao Tian;hua wu;Haifeng Wang", "authorids": "liuyang148@baidu.com;~Jiaxiang_Liu1;luyuxiang@baidu.com;~shikun_feng1;sunyu02@baidu.com;fengzhida110@gmail.com;chenli@wust.edu.cn;tianhao@baidu.com;~hua_wu1;~Haifeng_Wang3", "gender": ";M;;M;;;;;F;M", "homepage": ";https://github.com/OleNet/;;;;;;;https://wuhuanlp.github.io/;https://haifengwang.net/", "dblp": ";121/1088-4;;26/7906;;;;;27/6045-3;10/5209-1.html", "google_scholar": ";nvW9CqkAAAAJ;;u9CYmnAAAAAJ;;;;;9X2ThuAAAAAJ;jgy4jCAAAAAJ", "orcid": ";;;;;;;;0000-0001-8254-1561;0000-0002-0672-7468", "linkedin": ";;;;;;;;;", "or_profile": "liuyang148@baidu.com;~Jiaxiang_Liu1;luyuxiang@baidu.com;~shikun_feng1;sunyu02@baidu.com;fengzhida110@gmail.com;chenli@wust.edu.cn;tianhao@baidu.com;~hua_wu1;~Haifeng_Wang3", "aff": ";Baidu;;Baidu;;;;;Baidu;Baidu", "aff_domain": ";baidu.com;;baidu.com;;;;;baidu.com;baidu.com", "position": ";Researcher;;Principal Architect;;;;;Principal Researcher;CTO", "bibtex": "@misc{\nliu2022erniesparse,\ntitle={{ERNIE}-{SPARSE}: Robust Efficient Transformer Through Hierarchically Unifying Isolated Information},\nauthor={Yang Liu and Jiaxiang Liu and Yuxiang Lu and shikun feng and Yu Sun and Zhida Feng and Li Chen and Hao Tian and hua wu and Haifeng Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=IXrQxlxr0iB}\n}", "github": "", "project": "", "reviewers": "kdrW;gdqU;C3PP", "site": "https://openreview.net/forum?id=IXrQxlxr0iB", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;5;4", "correctness": "2;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "111;78;47", "wc_summary_review": "40;51;29", "wc_main_review": "1091;621;155", "wc_review": "1242;750;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.66666666666667, 26.132142830026183 ], "wc_summary_review_avg": [ 40.0, 8.981462390204987 ], "wc_main_review_avg": [ 622.3333333333334, 382.12156297294825 ], "wc_review_avg": [ 741.0, 412.78808122328337 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.7559289460184544, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nsZBG14VfyYJ:scholar.google.com/&scioq=ERNIE-SPARSE:+Robust+Efficient+Transformer+Through+Hierarchically+Unifying+Isolated+Information&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": 
"0;0;0;0", "aff_unique_norm": "Baidu", "aff_unique_dep": "Baidu, Inc.", "aff_unique_url": "https://www.baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "IY4IsjvUhZ", "title": "Characterising the Area Under the Curve Loss Function Landscape", "track": "main", "status": "Reject", "tldr": "", "abstract": " One of the most common metrics to evaluate neural network classifiers is the\narea under the receiver operating characteristic curve (AUC). However, \noptimisation of the AUC as the loss function during network\ntraining is not a standard procedure. Here we compare minimising the cross-entropy (CE) loss\nand optimising the AUC directly. In particular, we analyse the loss function\nlandscape (LFL) of approximate AUC (appAUC) loss functions to discover\nthe organisation of this solution space. We discuss various surrogates for AUC approximation and show their differences.\nWe find that the characteristics of the appAUC landscape are significantly\ndifferent from the CE landscape. The approximate AUC loss function improves\ntesting AUC, and the appAUC landscape has substantially more minima, but\nthese minima are less robust, with larger average Hessian eigenvalues. We provide a theoretical foundation to explain these results.\nTo generalise our results, we lastly provide an overview of how the\nLFL can help to guide loss function analysis and selection. ", "keywords": "loss function landscape;loss function;AUC;area under the curve;alternative loss functions;loss function visualisation", "primary_area": "", "supplementary_material": "", "author": "Maximilian Paul Niroomand;Conor T Cafolla;John William Roger Morgan;David John Wales", "authorids": "~Maximilian_Paul_Niroomand1;~Conor_T_Cafolla1;~John_William_Roger_Morgan1;~David_John_Wales1", "gender": ";M;M;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;https://scholar.google.co.uk/citations?user=B8ps65sAAAAJ;", "orcid": ";0000-0003-2021-974X;0000-0002-9157-9278;", "linkedin": ";;;", "or_profile": "~Maximilian_Paul_Niroomand1;~Conor_T_Cafolla1;~John_William_Roger_Morgan1;~David_John_Wales1", "aff": ";University of Cambridge;;", "aff_domain": ";cam.ac.uk;;", "position": ";PhD student;;", "bibtex": "@misc{\nniroomand2022characterising,\ntitle={Characterising the Area Under the Curve Loss Function Landscape},\nauthor={Maximilian Paul Niroomand and Conor T Cafolla and John William Roger Morgan and David John Wales},\nyear={2022},\nurl={https://openreview.net/forum?id=IY4IsjvUhZ}\n}", "github": "", "project": "", "reviewers": "Q2LC;hg7P;qWyh;yYns;ifdg", "site": "https://openreview.net/forum?id=IY4IsjvUhZ", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "3;4;4;2;3", "correctness": "2;2;3;3;3", "technical_novelty": "2;1;1;3;3", "empirical_novelty": "2;1;2;2;3", "wc_summary_paper": "26;40;90;30;35", "wc_summary_review": "37;2;53;40;13", "wc_main_review": "576;92;246;117;158", "wc_review": "639;134;389;187;206", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.8944271909999159 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 44.2, 23.37862271392393 ], "wc_summary_review_avg": [ 29.0, 18.686893802876924 ], 
"wc_main_review_avg": [ 237.8, 177.0111860872075 ], "wc_review_avg": [ 311.0, 185.14750875990742 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6338656910463873, "corr_recommendation_correctness": 0.6454972243679028, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9830173222246489545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "IY6Zt3Qu0cT", "title": "Fragment-Based Sequential Translation for Molecular Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Searching for novel molecular compounds with desired properties is an important problem in drug discovery. Many existing frameworks generate molecules one atom at a time. We instead propose a flexible editing paradigm that generates molecules using learned molecular fragments---meaningful substructures of molecules. To do so, we train a variational autoencoder (VAE) to encode molecular fragments in a coherent latent space, which we then utilize as a vocabulary for editing molecules to explore the complex chemical property space. Equipped with the learned fragment vocabulary, we propose Fragment-based Sequential Translation (FaST), which learns a reinforcement learning (RL) policy to iteratively translate model-discovered molecules into increasingly novel molecules while satisfying desired properties. Empirical evaluation shows that FaST significantly improves over state-of-the-art methods on benchmark single/multi-objective molecular optimization tasks. ", "keywords": "molecular optimization;molecular generation;drug discovery;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Benson Chen;Xiang Fu;Tommi S. Jaakkola;Regina Barzilay", "authorids": "~Benson_Chen1;~Xiang_Fu4;~Tommi_S._Jaakkola1;~Regina_Barzilay1", "gender": "M;M;;female", "homepage": ";https://xiangfu.co/;;https://www.regina.csail.mit.edu/", "dblp": ";97/374-5.html;;b/ReginaBarzilay", "google_scholar": "EZQHjx4AAAAJ;https://scholar.google.com/citations?view_op=list_works;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Benson_Chen1;~Xiang_Fu4;~Tommi_S._Jaakkola1;~Regina_Barzilay1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;;mit.edu", "position": "PhD student;PhD student;;Professor", "bibtex": "@misc{\nchen2022fragmentbased,\ntitle={Fragment-Based Sequential Translation for Molecular Optimization},\nauthor={Benson Chen and Xiang Fu and Tommi S. 
Jaakkola and Regina Barzilay},\nyear={2022},\nurl={https://openreview.net/forum?id=IY6Zt3Qu0cT}\n}", "github": "", "project": "", "reviewers": "MhuZ;eP4U;UFDm;KpDT", "site": "https://openreview.net/forum?id=IY6Zt3Qu0cT", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;0;4;3", "wc_summary_paper": "51;122;90;80", "wc_summary_review": "28;45;100;26", "wc_main_review": "375;386;342;161", "wc_review": "454;553;532;267", "wc_reply_reviewers": "0;101;129;0", "wc_reply_authors": "1475;1053;871;195", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 85.75, 25.36114153582208 ], "wc_summary_review_avg": [ 49.75, 29.93639089803579 ], "wc_main_review_avg": [ 316.0, 90.9422893927792 ], "wc_review_avg": [ 451.5, 112.72643877990646 ], "wc_reply_reviewers_avg": [ 57.5, 58.345951016330176 ], "wc_reply_authors_avg": [ 898.5, 461.4897073608468 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1758368000232322443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Representing Mixtures of Word Embeddings with Mixtures of Topic Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6840", "id": "IYMuTbGzjFU", "poster": "", "openreview": "https://openreview.net/forum?id=IYMuTbGzjFU", "slides": "https://iclr.cc/virtual/2022/poster/6840", "video": "https://iclr.cc/virtual/2022/poster/6840", "author_site": "dongsheng wang, Dandan Guo, He Zhao, Huangjie Zheng, Korawat Tanwisuth, Bo Chen, Mingyuan Zhou", "tldr": "", "abstract": "A topic model is often formulated as a generative model that explains how each word of a document is generated given a set of topics and document-specific topic proportions. It is focused on capturing the word co-occurrences in a document and hence often suffers from poor performance in analyzing short documents. In addition, its parameter estimation often relies on approximate posterior inference that is either not scalable or suffers from large approximation error. This paper introduces a new topic-modeling framework where each document is viewed as a set of word embedding vectors and each topic is modeled as an embedding vector in the same embedding space. Embedding the words and topics in the same vector space, we define a method to measure the semantic difference between the embedding vectors of the words of a document and those of the topics, and optimize the topic embeddings to minimize the expected difference over all documents. 
Experiments on text analysis demonstrate that the proposed method, which is amenable to mini-batch stochastic gradient descent based optimization and hence scalable to big corpora, provides competitive performance in discovering more coherent and diverse topics and extracting better document representations. ", "keywords": "topic model;text mining;distribution matching", "primary_area": "", "supplementary_material": "", "author": "dongsheng wang;Dan dan Guo;He Zhao;Huangjie Zheng;Korawat Tanwisuth;Bo Chen;Mingyuan Zhou", "authorids": "~dongsheng_wang3;~Dan_dan_Guo1;~He_Zhao1;~Huangjie_Zheng1;~Korawat_Tanwisuth1;~Bo_Chen1;~Mingyuan_Zhou1", "gender": "M;F;;M;M;M;M", "homepage": "https://wds2014.github.io/;https://github.com/Dan123dan;;;;http://web.xidian.edu.cn/bchen/en/index.html;http://mingyuanzhou.github.io", "dblp": "21/841-3;121/1618;;192/2170;;89/5615-1;", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=QLOY4JkAAAAJ;;Vl5wCXsAAAAJ;;;LXwCIisAAAAJ", "orcid": "0000-0002-3380-5337;;;0000-0003-0508-5034;0009-0003-5875-5414;0000-0001-5151-9388;", "linkedin": ";;;;korawat-tanwisuth-238401a7/;;", "or_profile": "~dongsheng_wang3;~Dan_dan_Guo1;~He_Zhao1;~Huangjie_Zheng1;~Korawat_Tanwisuth1;~Bo_Chen1;~Mingyuan_Zhou1", "aff": "Xidian University;The Chinese University of Hong Kong(ShenZhen);;University of Texas, Austin;University of Texas, Austin;Xidian University;The University of Texas at Austin", "aff_domain": "xidian.edu.cn;cuhk.edu.hk;;utexas.edu;utexas.edu;xidian.edu.cn;utexas.edu", "position": "PhD student;Postdoc;;PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2022representing,\ntitle={Representing Mixtures of Word Embeddings with Mixtures of Topic Embeddings},\nauthor={dongsheng wang and Dan dan Guo and He Zhao and Huangjie Zheng and Korawat Tanwisuth and Bo Chen and Mingyuan Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IYMuTbGzjFU}\n}", "github": "", "project": "", "reviewers": "Bvas;2jw3;eWQp;W5Cw", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "99;196;82;71", "wc_summary_review": "63;48;61;25", "wc_main_review": "214;314;605;107", "wc_review": "376;558;748;203", "wc_reply_reviewers": "0;0;23;0", "wc_reply_authors": "754;1036;1194;323", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;3;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 112.0, 49.512624652708524 ], "wc_summary_review_avg": [ 49.25, 15.138939857202683 ], "wc_main_review_avg": [ 310.0, 185.38203796484706 ], "wc_review_avg": [ 471.25, 203.19125842417532 ], "wc_reply_reviewers_avg": [ 5.75, 9.959292143521045 ], "wc_reply_authors_avg": [ 826.75, 330.8000717956391 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3518295104208201525&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": 
"https://openreview.net/pdf?id=IYMuTbGzjFU", "email": "xidian.edu.cn;cuhk.edu.hk;;utexas.edu;utexas.edu;xidian.edu.cn;utexas.edu", "author_num": 7, "aff_unique_index": "0;1;2;2;0;2", "aff_unique_norm": "Xidian University;Chinese University of Hong Kong;University of Texas at Austin", "aff_unique_dep": ";;", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.cuhk.edu.cn;https://www.utexas.edu", "aff_unique_abbr": "Xidian;CUHK;UT Austin", "aff_campus_unique_index": "1;2;2;2", "aff_campus_unique": ";Shenzhen;Austin", "aff_country_unique_index": "0;0;1;1;0;1", "aff_country_unique": "China;United States" }, { "id": "I_RLPhVUfw8", "title": "Dense Gaussian Processes for Few-Shot Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Few-shot segmentation is a challenging dense prediction task, which entails segmenting a novel query image given only a small annotated support set. The key problem is thus to design a method that aggregates detailed information from the support set, while being robust to large variations in appearance and context. To this end, we propose a few-shot segmentation method based on dense Gaussian process (GP) regression. Given the support set, our dense GP learns the mapping from local deep image features to mask values, capable of capturing complex appearance distributions. Furthermore, it provides a principled means of capturing uncertainty, which serves as another powerful cue for the final segmentation, obtained by a CNN decoder. Instead of a one-dimensional mask output, we further exploit the end-to-end learning capabilities of our approach to learn a high-dimensional output space for the GP. Our approach sets a new state-of-the-art for both 1-shot and 5-shot FSS on the PASCAL-5$^i$ and COCO-20$^i$ benchmarks, achieving an absolute gain of $+14.9$ mIoU in the COCO-20$^i$ 5-shot setting. Furthermore, the segmentation quality of our approach scales gracefully when increasing the support set size, while achieving robust cross-dataset transfer. 
", "keywords": "few-shot learning;few-shot segmentation;segmentation;gaussian processes", "primary_area": "", "supplementary_material": "/attachment/9c5f423a53cb6289932df9af221fb7a345208909.zip", "author": "Joakim Johnander;Johan Edstedt;Michael Felsberg;Fahad Khan;Martin Danelljan", "authorids": "~Joakim_Johnander1;~Johan_Edstedt1;~Michael_Felsberg2;~Fahad_Khan1;~Martin_Danelljan4", "gender": "M;M;;M;M", "homepage": ";;https://liu.se/en/employee/micfe03;https://sites.google.com/view/fahadkhans/home;https://martin-danelljan.github.io/", "dblp": "202/2479;289/1724;00/78;05/8618;151/8848", "google_scholar": "5sUDSxQAAAAJ;Ul-vMR0AAAAJ;https://scholar.google.se/citations?hl=en;zvaeYnUAAAAJ;NCSSpMkAAAAJ", "orcid": "0000-0003-2553-3367;0000-0002-1019-8634;0000-0002-6096-3648;;", "linkedin": ";;https://linkedin.com/in/michael-felsberg-668a202;;", "or_profile": "~Joakim_Johnander1;~Johan_Edstedt1;~Michael_Felsberg2;~Fahad_Khan1;~Martin_Danelljan4", "aff": "Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University;ETH Zurich", "aff_domain": "liu.se;liu.se;liu.se;liu.se;vision.ee.ethz.ch", "position": "PhD student;PhD student;Full Professor;Associate Professor;Principal Researcher", "bibtex": "@misc{\njohnander2022dense,\ntitle={Dense Gaussian Processes for Few-Shot Segmentation},\nauthor={Joakim Johnander and Johan Edstedt and Michael Felsberg and Fahad Khan and Martin Danelljan},\nyear={2022},\nurl={https://openreview.net/forum?id=I_RLPhVUfw8}\n}", "github": "", "project": "", "reviewers": "nhTR;Kjwm;uRb5;b84d", "site": "https://openreview.net/forum?id=I_RLPhVUfw8", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "5;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "82;55;61;23", "wc_summary_review": "38;42;70;4", "wc_main_review": "236;143;125;204", "wc_review": "356;240;256;231", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "872;755;415;591", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.25, 21.146808269807526 ], "wc_summary_review_avg": [ 38.5, 23.425413550244958 ], "wc_main_review_avg": [ 177.0, 44.91658936295141 ], "wc_review_avg": [ 270.75, 50.026867781223324 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 658.25, 172.29534961803236 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9696467800979236699&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Link\u00f6ping University;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.liu.se;https://www.ethz.ch", "aff_unique_abbr": "LiU;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "Sweden;Switzerland" }, { "id": "IbyMcLKUCqT", "title": "Theoretical Analysis of Consistency Regularization with Limited Augmented Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation is popular in the training of large neural networks; currently, 
however, there is no clear theoretical comparison between different algorithmic choices on how to use augmented data. In this paper, we take a small step in this direction; we present a simple new statistical framework to analyze data augmentation - specifically, one that captures what it means for one input sample to be an augmentation of another, and also the richness of the augmented set. We use this to interpret consistency regularization as a way to reduce function class complexity, and characterize its generalization performance. Specializing this analysis for linear regression shows that consistency regularization has strictly better sample efficiency as compared to empirical risk minimization on the augmented set. In addition, we also provide generalization bounds under consistency regularization for logistic regression and two-layer neural networks. We perform experiments that make a clean and apples-to-apples comparison (i.e. with no extra modeling or data tweaks) between ERM and consistency regularization using CIFAR-100 and WideResNet; these demonstrate the superior efficacy of consistency regularization.", "keywords": "data augmentation;consistency regularization;generalization bound", "primary_area": "", "supplementary_material": "/attachment/dd2f33a7c6070ab491ebe857a706ab48c40408e5.zip", "author": "Shuo Yang;Yijun Dong;Rachel Ward;Inderjit S Dhillon;sujay sanghavi;Qi Lei", "authorids": "~Shuo_Yang6;~Yijun_Dong1;~Rachel_Ward1;~Inderjit_S_Dhillon1;~sujay_sanghavi1;~Qi_Lei1", "gender": "M;F;;M;M;F", "homepage": "https://yangshuo-ut.github.io/;https://dyjdongyijun.github.io/;;http://www.cs.utexas.edu/users/inderjit/;https://sites.utexas.edu/sanghavi;https://cecilialeiqi.github.io/", "dblp": ";200/1432;80/7132;d/InderjitSDhillon;69/4911.html;", "google_scholar": ";l3bmbCkAAAAJ;;xBv5ZfkAAAAJ;O-DazBUAAAAJ;kGOgaowAAAAJ", "orcid": ";;;;;", "linkedin": ";yijun-dong-82638513b/;;inderjit-dhillon-a20888b0/;;", "or_profile": "~Shuo_Yang6;~Yijun_Dong1;~Rachel_Ward1;~Inderjit_S_Dhillon1;~sujay_sanghavi1;~Qi_Lei1", "aff": "University of Texas, Austin;University of Texas, Austin;University of Texas at Austin;University of Texas, Austin;University of Texas, Austin;Princeton University", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;utexas.edu;utexas.edu;princeton.edu", "position": "PhD student;PhD student;Full Professor;Full Professor;Associate Professor;Postdoc", "bibtex": "@misc{\nyang2022theoretical,\ntitle={Theoretical Analysis of Consistency Regularization with Limited Augmented Data},\nauthor={Shuo Yang and Yijun Dong and Rachel Ward and Inderjit S Dhillon and sujay sanghavi and Qi Lei},\nyear={2022},\nurl={https://openreview.net/forum?id=IbyMcLKUCqT}\n}", "github": "", "project": "", "reviewers": "QuHm;TPuR;fxf3;s3Ue", "site": "https://openreview.net/forum?id=IbyMcLKUCqT", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;2;4", "wc_summary_paper": "58;40;111;70", "wc_summary_review": "24;103;586;162", "wc_main_review": "501;620;1202;668", "wc_review": "583;763;1899;900", "wc_reply_reviewers": "0;0;743;0", "wc_reply_authors": "721;985;2609;578", "reply_reviewers": "0;0;3;0", "reply_authors": "1;2;5;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 69.75, 26.099568961957974 ], 
"wc_summary_review_avg": [ 218.75, 217.61132208596132 ], "wc_main_review_avg": [ 747.75, 269.2158752748433 ], "wc_review_avg": [ 1036.25, 510.6375304460102 ], "wc_reply_reviewers_avg": [ 185.75, 321.728437505919 ], "wc_reply_authors_avg": [ 1223.25, 813.2755913587964 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6kFHLf1GeQoJ:scholar.google.com/&scioq=Theoretical+Analysis+of+Consistency+Regularization+with+Limited+Augmented+Data&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "University of Texas at Austin;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.princeton.edu", "aff_unique_abbr": "UT Austin;Princeton", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Monotonic Differentiable Sorting Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5925", "id": "IcUWShptD7d", "poster": "", "openreview": "https://openreview.net/forum?id=IcUWShptD7d", "slides": "https://iclr.cc/virtual/2022/poster/5925", "video": "https://iclr.cc/virtual/2022/poster/5925", "author_site": "Felix Petersen, Christian Borgelt, Hilde Kuehne, Oliver Deussen", "tldr": "", "abstract": "Differentiable sorting algorithms allow training with sorting and ranking supervision, where only the ordering or ranking of samples is known. Various methods have been proposed to address this challenge, ranging from optimal transport-based differentiable Sinkhorn sorting algorithms to making classic sorting networks differentiable. One problem of current differentiable sorting methods is that they are non-monotonic. To address this issue, we propose a novel relaxation of conditional swap operations that guarantees monotonicity in differentiable sorting networks. We introduce a family of sigmoid functions and prove that they produce differentiable sorting networks that are monotonic. Monotonicity ensures that the gradients always have the correct sign, which is an advantage in gradient-based optimization. 
We demonstrate that monotonic differentiable sorting networks improve upon previous differentiable sorting methods.", "keywords": "differentiable sorting;monotonic;sorting;ranking;sorting networks", "primary_area": "", "supplementary_material": "", "author": "Felix Petersen;Christian Borgelt;Hilde Kuehne;Oliver Deussen", "authorids": "~Felix_Petersen1;~Christian_Borgelt1;~Hilde_Kuehne5;~Oliver_Deussen1", "gender": "Not Specified;M;F;M", "homepage": "http://www.petersen.ai/;https://www.borgelt.net/;https://hildekuehne.github.io;https://graphics.uni-konstanz.de", "dblp": "230/3983;b/ChristianBorgelt.html;45/4963;48/2158", "google_scholar": "v8Kat6YAAAAJ;https://scholar.google.de/citations?user=T50Bxb8AAAAJ;pxhCcH0AAAAJ;https://scholar.google.de/scholar?hl=en", "orcid": ";;0000-0003-1079-4441;0000-0001-5803-2185", "linkedin": ";christian-borgelt-a2429071/;hilde-kuehne-8b9aa661;", "or_profile": "~Felix_Petersen1;~Christian_Borgelt1;~Hilde_Kuehne5;~Oliver_Deussen1", "aff": "University of Konstanz;Paris-Lodron-University of Salzburg;Goethe University Frankfurt;University of Konstanz", "aff_domain": "uni-konstanz.de;sbg.ac.at;uni-frankfurt.de;uni-konstanz.de", "position": "PhD student;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\npetersen2022monotonic,\ntitle={Monotonic Differentiable Sorting Networks},\nauthor={Felix Petersen and Christian Borgelt and Hilde Kuehne and Oliver Deussen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IcUWShptD7d}\n}", "github": "", "project": "", "reviewers": "28Ue;vsFK;QCdx;iDKW", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;4;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;2;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "87;87;40;136", "wc_summary_review": "28;63;5;89", "wc_main_review": "267;263;304;373", "wc_review": "382;413;349;598", "wc_reply_reviewers": "0;25;8;124", "wc_reply_authors": "774;229;702;901", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 87.5, 33.944808144987356 ], "wc_summary_review_avg": [ 46.25, 32.18209906143476 ], "wc_main_review_avg": [ 301.75, 44.13261265776138 ], "wc_review_avg": [ 435.5, 96.51036213795905 ], "wc_reply_reviewers_avg": [ 39.25, 49.75628101054178 ], "wc_reply_authors_avg": [ 651.5, 254.1225098254777 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3244428422615251, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11509121699002053809&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=IcUWShptD7d", "email": "uni-konstanz.de;sbg.ac.at;uni-frankfurt.de;uni-konstanz.de", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Konstanz;Paris-Lodron-University of Salzburg;Goethe University Frankfurt", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-konstanz.de;https://www.uni-salzburg.at;https://www.uni-frankfurt.de", "aff_unique_abbr": "Uni Konstanz;PLUS;GU Frankfurt", "aff_campus_unique_index": "1", "aff_campus_unique": 
";Frankfurt", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Germany;Austria" }, { "id": "IeYEepOLsFT", "title": "Bayesian Imbalanced Regression Debiasing", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Imbalanced regression, where the training data has an uneven distribution on its range, is widely encountered in the real world, e.g., age estimation (uni-dimensional regression) and pose estimation (multi-dimensional regression). Compared to imbalanced and long-tailed classification, imbalanced regression has its unique challenges as the regression label space can be continuous, boundless, and high-dimensional. In this work, we present a principled framework, Bayesian Posterior Debiasing (Bayesian-PD), for re-balancing the regression among frequent and rare observations. Our key insight is that a balanced posterior can be obtained by debiasing the conditional probability with a regression label space prior. Importantly, through a normalization reparameterization technique, we derive a general debiasing function between the empirical posterior and the balanced posterior without relying on task-specific assumptions. We show that the Bayesian-PD framework has multiple instantiations in both training and testing time, with either closed-form or numerical implementations. We further uncover that several existing methods in imbalanced classification/regression serve as special cases of our Bayesian-PD framework. Extensive experiments on both uni- and multi-dimensional regression benchmarks demonstrate the effectiveness of the Bayesian-PD framework on various real-world tasks. Notably, Bayesian-PD exhibits strong robustness to different skewness of the training distributions.", "keywords": "Imbalanced Regression;Bayesian Debiasing", "primary_area": "", "supplementary_material": "", "author": "Jiawei Ren;Mingyuan Zhang;Cunjun Yu;Ziwei Liu", "authorids": "~Jiawei_Ren1;~Mingyuan_Zhang1;~Cunjun_Yu1;~Ziwei_Liu1", "gender": "Unspecified;M;Unspecified;M", "homepage": "https://jiawei-ren.github.io/;https://mingyuan-zhang.github.io/;;https://liuziwei7.github.io/", "dblp": "122/3626-1;;232/3014;05/6300-2", "google_scholar": "https://scholar.google.com.sg/citations?user=YUKPVCoAAAAJ;2QLD4fAAAAAJ;4xwyGM8AAAAJ;https://scholar.google.com.hk/citations?user=lc45xlcAAAAJ", "orcid": "0000-0003-1950-5976;;;", "linkedin": ";;;", "or_profile": "~Jiawei_Ren1;~Mingyuan_Zhang1;~Cunjun_Yu1;~Ziwei_Liu1", "aff": "Nanyang Technological University;Nanyang Technological University;National University of Singapore;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;u.nus.edu;ntu.edu.sg", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nren2022bayesian,\ntitle={Bayesian Imbalanced Regression Debiasing},\nauthor={Jiawei Ren and Mingyuan Zhang and Cunjun Yu and Ziwei Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=IeYEepOLsFT}\n}", "github": "", "project": "", "reviewers": "6qYQ;FbhK;Cmso", "site": "https://openreview.net/forum?id=IeYEepOLsFT", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;2;4", "correctness": "2;2;3", "technical_novelty": "2;1;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "31;45;47", "wc_summary_review": "26;25;35", "wc_main_review": "536;346;344", "wc_review": "593;416;426", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 
0.9428090415820634 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 41.0, 7.118052168020874 ], "wc_summary_review_avg": [ 28.666666666666668, 4.496912521077347 ], "wc_main_review_avg": [ 408.6666666666667, 90.04196552472384 ], "wc_review_avg": [ 478.3333333333333, 81.18428966794555 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.1889822365046137, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gf2rpETICRoJ:scholar.google.com/&scioq=Bayesian+Imbalanced+Regression+Debiasing&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Nanyang Technological University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.nus.edu.sg", "aff_unique_abbr": "NTU;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Generalized Kernel Thinning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6234", "id": "IfNu7Dr-3fQ", "poster": "", "openreview": "https://openreview.net/forum?id=IfNu7Dr-3fQ", "slides": "https://iclr.cc/virtual/2022/poster/6234", "video": "https://iclr.cc/virtual/2022/poster/6234", "author_site": "Raaz Dwivedi, Lester Mackey", "tldr": "", "abstract": "The kernel thinning (KT) algorithm of Dwivedi and Mackey (2021) compresses a probability distribution more effectively than independent sampling by targeting a reproducing kernel Hilbert space (RKHS) and leveraging a less smooth square-root kernel. Here we provide four improvements. First, we show that KT applied directly to the target RKHS yields tighter, dimension-free guarantees for any kernel, any distribution, and any fixed function in the RKHS. Second, we show that, for analytic kernels like Gaussian, inverse multiquadric, and sinc, target KT admits maximum mean discrepancy (MMD) guarantees comparable to or better than those of square-root KT without making explicit use of a square-root kernel. Third, we prove that KT with a fractional power kernel yields better-than-Monte-Carlo MMD guarantees for non-smooth kernels, like Laplace and Matern, that do not have square-roots. Fourth, we establish that KT applied to a sum of the target and power kernels (a procedure we call KT+) simultaneously inherits the improved MMD guarantees of power KT and the tighter individual function guarantees of target KT. 
In our experiments with target KT and KT+, we witness significant improvements in integration error even in 100 dimensions and when compressing challenging differential equation posteriors.", "keywords": "coresets;maximum mean discrepancy;Markov chain Monte Carlo;reproducing kernel Hilbert space;thinning;compression", "primary_area": "", "supplementary_material": "/attachment/48b7fb8508b4a8f05d32b6b88dd9b2f62a6855fc.zip", "author": "Raaz Dwivedi;Lester Mackey", "authorids": "~Raaz_Dwivedi1;~Lester_Mackey1", "gender": "M;M", "homepage": "https://raazdwivedi.github.io/;https://stanford.edu/~lmackey", "dblp": "180/9006;05/2961", "google_scholar": "9ehX_58AAAAJ;erv7TP0AAAAJ", "orcid": ";0000-0002-1102-0387", "linkedin": "raaz-dwivedi;lester-mackey-5902909", "or_profile": "~Raaz_Dwivedi1;~Lester_Mackey1", "aff": "Massachusetts Institute of Technology;Microsoft Research New England", "aff_domain": "mit.edu;microsoft.com", "position": "Postdoc;Principal Researcher", "bibtex": "@inproceedings{\ndwivedi2022generalized,\ntitle={Generalized Kernel Thinning},\nauthor={Raaz Dwivedi and Lester Mackey},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IfNu7Dr-3fQ}\n}", "github": "", "project": "", "reviewers": "8dTF;MW7b;5PwR;LtTV", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;2;3;2", "correctness": "4;4;3;4", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "60;40;44;62", "wc_summary_review": "42;24;66;41", "wc_main_review": "104;146;273;113", "wc_review": "206;210;383;216", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "484;748;1086;40", "reply_reviewers": "0;0;0;0", "reply_authors": "4;4;5;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 51.5, 9.630680142129112 ], "wc_summary_review_avg": [ 43.25, 14.956186011146023 ], "wc_main_review_avg": [ 159.0, 67.64983370267808 ], "wc_review_avg": [ 253.75, 74.70734568969775 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 589.5, 382.33329700668236 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.5, 1.5 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11005160819787759649&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=IfNu7Dr-3fQ", "email": "mit.edu;microsoft.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-new-england", "aff_unique_abbr": "MIT;MSR NE", "aff_campus_unique_index": "1", "aff_campus_unique": ";New England", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Ih0iJBSy4eq", "title": "Can Reinforcement Learning Efficiently Find Stackelberg-Nash Equilibria in General-Sum Markov Games?", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study multi-player general-sum Markov games with one of the players designated as the leader and the rest regarded as the followers. 
In particular, we focus on the class of games where the state transitions are only determined by the leader's action while the actions of all the players determine their immediate rewards. For such a game, our goal is to find the Stackelberg-Nash equilibrium (SNE), which is a policy pair $(\\pi^*, \\nu^*)$ such that (i) $\\pi^*$ is the optimal policy for the leader when the followers always play their best response, and (ii) $\\nu^*$ is the best response policy of the followers, which is a Nash equilibrium of the followers' game induced by $\\pi^*$. We develop sample-efficient reinforcement learning (RL) algorithms for solving SNE for both the online and offline settings. Our algorithms are, respectively, optimistic and pessimistic variants of least-squares value iteration, and are readily able to incorporate function approximation for handling large state spaces. Furthermore, for the case with linear function approximation, we prove that our algorithms achieve sublinear regret and suboptimality under online and offline setups, respectively. To the best of our knowledge, we establish the first provably efficient RL algorithms for solving SNE in general-sum Markov games with leader-controlled state transitions. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/28b5bc506fe224ad906e469dcdeb827792b90e2b.zip", "author": "Han Zhong;Zhuoran Yang;Zhaoran Wang;Michael Jordan", "authorids": "~Han_Zhong1;~Zhuoran_Yang1;~Zhaoran_Wang1;~Michael_Jordan1", "gender": ";M;Not Specified;M", "homepage": "https://hanzhong-ml.github.io/;https://zhuoranyang.github.io/;https://zhaoranwang.github.io/;http://www.cs.berkeley.edu/~jordan/", "dblp": "137/8096.html;;117/2756;j/MichaelIJordan", "google_scholar": "Bk5q_pAAAAAJ;;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ;https://scholar.google.com.tw/citations?user=yxUduqMAAAAJ", "orcid": ";;;0000-0001-8935-817X", "linkedin": ";;;", "or_profile": "~Han_Zhong1;~Zhuoran_Yang1;~Zhaoran_Wang1;~Michael_Jordan1", "aff": "Peking University;University of California, Berkeley;;University of California, Berkeley", "aff_domain": "stu.pku.edu.cn;berkeley.edu;;berkeley.edu", "position": "PhD student;Postdoc;;Full Professor", "bibtex": "@misc{\nzhong2022can,\ntitle={Can Reinforcement Learning Efficiently Find Stackelberg-Nash Equilibria in General-Sum Markov Games?},\nauthor={Han Zhong and Zhuoran Yang and Zhaoran Wang and Michael Jordan},\nyear={2022},\nurl={https://openreview.net/forum?id=Ih0iJBSy4eq}\n}", "github": "", "project": "", "reviewers": "cJRF;9LtG;JEih;MweZ", "site": "https://openreview.net/forum?id=Ih0iJBSy4eq", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;3;3;2", "correctness": "4;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;0;2", "wc_summary_paper": "48;54;32;96", "wc_summary_review": "11;15;36;24", "wc_main_review": "224;152;349;315", "wc_review": "283;221;417;435", "wc_reply_reviewers": "296;0;0;0", "wc_reply_authors": "1183;399;1046;323", "reply_reviewers": "2;0;0;0", "reply_authors": "3;1;2;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 57.5, 23.637893307145628 ], "wc_summary_review_avg": [ 21.5, 9.604686356149273 ], "wc_main_review_avg": [ 260.0, 77.3078262532326 ], "wc_review_avg": [ 339.0, 89.94442728707543 ], "wc_reply_reviewers_avg": [ 74.0, 128.17175976009693 ],
"wc_reply_authors_avg": [ 737.75, 380.80006236869235 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XxVhWVDj61cJ:scholar.google.com/&scioq=Can+Reinforcement+Learning+Efficiently+Find+Stackelberg-Nash+Equilibria+in+General-Sum+Markov+Games%3F&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Peking University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.berkeley.edu", "aff_unique_abbr": "Peking U;UC Berkeley", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "id": "Ih7LAeOYIb0", "title": "Iterative Memory Network for Long Sequential User Behavior Modeling in Recommender Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Sequential user behavior modeling is a key feature in modern recommender systems, seeking to capture users' interest based on their past activities. There are two usual approaches to sequential modeling : Recurrent Neural Networks (RNNs) and the attention mechanism. As the user behavior sequence gets longer, the usual approaches encounter problems. RNN-based methods incur the problem of fast forgetting, making it difficult to model the user's interests long time ago. The self-attention mechanism and its variations such as the transformer structure have the unfortunate property of a quadratic cost with respect to the input length, which makes it difficult to deal with long inputs. The target attention mechanism, despite having only $O(L)$ memory and time complexity, cannot model intra-sequence dependencies. In this paper, we propose Iterative Memory Network (IMN), an end-to-end differentiable framework for long sequential user behavior modeling. In our model, the target item acts as a memory trigger, continuously eliciting relevant information from the long sequence to represent the user's memory on the particular target item. In the Iterative Memory Update module, the model walks over the long sequence multiple iterations and keeps a memory vector to memorize the content walked over. Within each iteration, the sequence interacts with both the target item and the current memory for both target-sequence relation modeling and intra-sequence relation modeling. The memory is updated after each iteration. The framework incurs only $O(L)$ memory and time complexity while reduces the maximum length of network signal travelling paths to $O(1)$, which is achieved by the self-attention mechanism with $O(L^2)$ complexity. Various designs of efficient self-attention mechanisms are at best $O(LlogL)$. Extensive empirical studies show that our method outperforms various state-of-the-art sequential modeling methods on both public and industrial datasets for long sequential user behavior modeling. 
", "keywords": "recommender systems;sequential behavior modeling", "primary_area": "", "supplementary_material": "/attachment/98e7eda67ff87ab92c5d27c2993a9588e9b26a97.zip", "author": "Qianying Lin;Wen-Ji Zhou;Yanshi Wang;Qing Da;Qing-Guo Chen;Bing Wang", "authorids": "~Qianying_Lin2;~Wen-Ji_Zhou1;~Yanshi_Wang1;~Qing_Da1;~Qing-Guo_Chen1;~Bing_Wang11", "gender": ";M;;M;M;M", "homepage": ";;;;;", "dblp": "https://dblp.uni-trier.de/pid/289/0027.html;279/3635.html;138/2474;63/10301;06/1909;204/2961.html", "google_scholar": ";;;GlqRHLcAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=PBfkOe4AAAAJ", "orcid": ";;;;0000-0001-7976-8642;0000-0002-5116-5679", "linkedin": ";;;;;", "or_profile": "~Qianying_Lin2;~Yanshi_Wang1;~Qing_Da1;~Qing-Guo_Chen1;~Bing_Wang11;~Wen-Ji_Zhou2", "aff": "Stanford University;;;Alibaba Group;Alibaba Group;Alibaba Group", "aff_domain": "stanford.edu;;;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "MS student;;;Researcher;Researcher;Researcher", "bibtex": "@misc{\nlin2022iterative,\ntitle={Iterative Memory Network for Long Sequential User Behavior Modeling in Recommender Systems},\nauthor={Qianying Lin and Wen-Ji Zhou and Yanshi Wang and Qing Da and Qing-Guo Chen and Bing Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=Ih7LAeOYIb0}\n}", "github": "", "project": "", "reviewers": "Q1ux;p4tQ;UHbh;NSbP", "site": "https://openreview.net/forum?id=Ih7LAeOYIb0", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "73;60;99;28", "wc_summary_review": "40;54;11;49", "wc_main_review": "357;173;185;155", "wc_review": "470;287;295;232", "wc_reply_reviewers": "0;167;1024;0", "wc_reply_authors": "803;1559;3062;462", "reply_reviewers": "0;1;3;0", "reply_authors": "2;3;6;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 65.0, 25.563646062328434 ], "wc_summary_review_avg": [ 38.5, 16.650825805346713 ], "wc_main_review_avg": [ 217.5, 81.24499984614438 ], "wc_review_avg": [ 321.0, 89.37840902589394 ], "wc_reply_reviewers_avg": [ 297.75, 424.8072356963803 ], "wc_reply_authors_avg": [ 1471.5, 1000.4160384560016 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 3.25, 1.6393596310755 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EwfljqnF_RUJ:scholar.google.com/&scioq=Iterative+Memory+Network+for+Long+Sequential+User+Behavior+Modeling+in+Recommender+Systems&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Stanford University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.alibaba.com", "aff_unique_abbr": "Stanford;Alibaba", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;China" }, { "id": "IhkSFe9YqMy", "title": "Experience Replay More When It's a Key Transition in Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a experience replay mechanism in Deep 
Reinforcement Learning based on Add Noise to Noise (AN2N), which requires the agent to replay more experiences containing key states, abbreviated as Experience Replay More (ERM). In the AN2N algorithm, we refer to the states where more exploration is needed as the key states. We found that how the transitions containing key states participate in updating the policy and Q networks has a significant impact on the performance improvement of the deep reinforcement learning agent, and that the problem of catastrophic forgetting in neural networks is further magnified in the AN2N algorithm. Therefore, we change the previous strategy of uniform sampling of experience transitions. We sample the transitions used for experience replay according to whether they contain key states and whether they are the most recently generated, which is the core idea of the ERM algorithm. The experimental results show that this algorithm can significantly improve the performance of the agent. We combine the ERM algorithm with Deep Deterministic Policy Gradient (DDPG), Twin Delayed Deep Deterministic Policy Gradient (TD3), and Soft Actor-Critic (SAC), and evaluate the algorithms on the suite of OpenAI Gym tasks: SAC with ERM achieves a new state of the art, and DDPG with ERM can even exceed the average performance of SAC under certain random seeds.", "keywords": "Experience Replay More;Key Transitions;Sampling;Add Noise to Noise;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Youtian Guo;Qi Gao", "authorids": "~Youtian_Guo1;~Qi_Gao2", "gender": "M;M", "homepage": ";https://ac.bit.edu.cn/szdw/jsdw/mssbyznxtyjs/20150206115558187442/", "dblp": ";", "google_scholar": ";", "orcid": "0000-0002-0979-9344;", "linkedin": ";", "or_profile": "~Youtian_Guo1;~Qi_Gao2", "aff": "Beijing Institute of Technology;Beijing Institute of Technology", "aff_domain": "bit.edu.cn;bit.edu.cn", "position": "MS student;Associate Professor", "bibtex": "@misc{\nguo2022experience,\ntitle={Experience Replay More When It's a Key Transition in Deep Reinforcement Learning},\nauthor={Youtian Guo and Qi Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=IhkSFe9YqMy}\n}", "github": "", "project": "", "reviewers": "Hemq;cBYQ;Jn1b;VS76", "site": "https://openreview.net/forum?id=IhkSFe9YqMy", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "4;4;4;5", "correctness": "1;2;1;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "31;95;63;38", "wc_summary_review": "39;34;30;17", "wc_main_review": "788;321;309;300", "wc_review": "858;450;402;355", "wc_reply_reviewers": "418;417;0;400", "wc_reply_authors": "698;552;550;545", "reply_reviewers": "1;2;0;1", "reply_authors": "2;3;1;2", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 1.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 56.75, 25.083610186733488 ], "wc_summary_review_avg": [ 30.0, 8.154753215150045 ], "wc_main_review_avg": [ 429.5, 207.11409898893893 ], "wc_review_avg": [ 516.25, 200.14791405358187 ], "wc_reply_reviewers_avg": [ 308.75, 178.40035734269145 ], "wc_reply_authors_avg": [ 586.25, 64.56924577536894 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 
0.30151134457776363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O4UyROCNMUMJ:scholar.google.com/&scioq=Experience+Replay+More+When+It%27s+a+Key+Transition+in+Deep+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Beijing Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.bit.edu.cn/", "aff_unique_abbr": "BIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "Ihxw4h-JnC", "title": "Stochastic Induction of Decision Trees with Application to Learning Haar Tree", "track": "main", "status": "Reject", "tldr": "", "abstract": "Decision trees are a convenient and established approach for any supervised learning task, used in a broad range of applications from medical imaging to computer vision. Decision trees are trained by greedily splitting the leaf nodes into a split node and two leaf nodes until a certain stopping criterion is reached. The procedure of splitting a node consists of finding the best feature and threshold that minimizes a criterion. The criterion minimization problem is solved through an exhaustive search algorithm. However, this exhaustive search algorithm is very expensive, especially when the number of samples and features is high. In this paper, we propose a novel stochastic approach for the criterion minimization. Asymptotically, the proposed algorithm is faster than conventional exhaustive search by several orders of magnitude. It is further shown that the proposed approach minimizes an upper bound for the criterion. Experimentally, the algorithm is compared with several other related state-of-the-art decision tree learning methods, including the baseline non-stochastic approach. The proposed algorithm outperforms every other decision tree learning approach (including online and fast ones) and performs as well as the baseline algorithm in terms of accuracy and computational cost, despite being non-deterministic. For empirical evaluation, we apply the proposed algorithm to learn a Haar tree over the MNIST dataset, which consists of over $200,000$ features and $60,000$ samples. This tree achieved a test accuracy of $94\\%$ on MNIST, which is $4\\%$ higher than any other known axis-aligned tree. 
This result is comparable to the performance of oblique trees, while providing a significant speed-up at both inference and training times.", "keywords": "Decision Tree;Stochastic Optimization;Haar Filters;Haar Cascade", "primary_area": "", "supplementary_material": "/attachment/3e89071b821bfc3616194a86eaf7c57d353fe00f.zip", "author": "Azar Alizadeh;Pooya Tavallali;Vahid Behzadan;Mukesh Singhal", "authorids": "~Azar_Alizadeh1;~Pooya_Tavallali1;~Vahid_Behzadan2;~Mukesh_Singhal1", "gender": "F;M;M;M", "homepage": ";;http://www.sail-lab.org;", "dblp": ";231/7674;172/2715;s/MukeshSinghal", "google_scholar": ";T2Pa1vQAAAAJ;MYMANOYAAAAJ;", "orcid": ";;;", "linkedin": "azar-alizadeh-57634b155;pooya-tavallali-7b8949105/;vahid-behzadan/;", "or_profile": "~Azar_Alizadeh1;~Pooya_Tavallali1;~Vahid_Behzadan2;~Mukesh_Singhal1", "aff": "University of California at Merced;University of California at Merced;University of New Haven;University of California at Merced", "aff_domain": "ucmerced.edu;ucmerced.edu;newhaven.edu;ucmerced.edu", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nalizadeh2022stochastic,\ntitle={Stochastic Induction of Decision Trees with Application to Learning Haar Tree},\nauthor={Azar Alizadeh and Pooya Tavallali and Vahid Behzadan and Mukesh Singhal},\nyear={2022},\nurl={https://openreview.net/forum?id=Ihxw4h-JnC}\n}", "github": "", "project": "", "reviewers": "yzxY;rLa6;a3QF;rpvo", "site": "https://openreview.net/forum?id=Ihxw4h-JnC", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;3;4;3", "correctness": "2;2;2;2", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;2;0;3", "wc_summary_paper": "128;221;68;168", "wc_summary_review": "74;78;34;42", "wc_main_review": "1002;369;1411;394", "wc_review": "1204;668;1513;604", "wc_reply_reviewers": "0;29;0;0", "wc_reply_authors": "476;613;1747;828", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 146.25, 55.93914103738097 ], "wc_summary_review_avg": [ 57.0, 19.261360284258224 ], "wc_main_review_avg": [ 794.0, 437.2007548026421 ], "wc_review_avg": [ 997.25, 378.08555579392345 ], "wc_reply_reviewers_avg": [ 7.25, 12.55736835487436 ], "wc_reply_authors_avg": [ 916.0, 495.91178651046397 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5690178695723863363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of California, Merced;University of New Haven", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucmerced.edu;https://www.newhaven.edu", "aff_unique_abbr": "UC Merced;UNH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Merced;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Better Supervisory Signals by Observing Learning Paths", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6630", "id": "Iog0djAdbHj", "poster": "", "openreview": "https://openreview.net/forum?id=Iog0djAdbHj", "slides": 
"https://iclr.cc/virtual/2022/poster/6630", "video": "https://iclr.cc/virtual/2022/poster/6630", "author_site": "YI REN, Shangmin Guo, Danica J Sutherland", "tldr": "", "abstract": "Better-supervised models might have better performance. In this paper, we first clarify what makes for good supervision for a classification problem, and then explain two existing label refining methods, label smoothing and knowledge distillation, in terms of our proposed criterion. To further answer why and how better supervision emerges, we observe the learning path, i.e., the trajectory of the model's predictions during training, for each training sample. We find that the model can spontaneously refine \"bad\" labels through a \"zig-zag\" learning path, which occurs on both toy and real datasets. Observing the learning path not only provides a new perspective for understanding knowledge distillation, overfitting, and learning dynamics, but also reveals that the supervisory signal of a teacher network can be very unstable near the best points in training on real tasks. Inspired by this, we propose a new knowledge distillation scheme, Filter-KD, which improves downstream classification performance in various settings.", "keywords": "Classification;Supervision;Knowledge Distillation", "primary_area": "", "supplementary_material": "", "author": "Yi Ren;Shangmin Guo;Danica J. Sutherland", "authorids": "~Yi_Ren6;~Shangmin_Guo1;~Danica_J._Sutherland1", "gender": "M;M;F", "homepage": "https://joshua-ren.github.io/;;http://www.djsutherland.ml", "dblp": ";183/0949;92/10966", "google_scholar": "5QNce38AAAAJ;cpOrbSoAAAAJ;https://scholar.google.co.uk/citations?user=uO_NqicAAAAJ", "orcid": ";0000-0003-1716-0994;0000-0002-1525-3532", "linkedin": ";;", "or_profile": "~Yi_Ren6;~Shangmin_Guo1;~Danica_J._Sutherland2", "aff": "University of British Columbia;University of Edinburgh;University of British Columbia", "aff_domain": "ubc.ca;ed.ac.uk;cs.ubc.ca", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nren2022better,\ntitle={Better Supervisory Signals by Observing Learning Paths},\nauthor={Yi Ren and Shangmin Guo and Danica J. 
Sutherland},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Iog0djAdbHj}\n}", "github": "", "project": "", "reviewers": "JaQp;QVYW;bjwd;8bUF", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "25;150;48;75", "wc_summary_review": "61;62;76;130", "wc_main_review": "31;816;874;403", "wc_review": "117;1028;998;608", "wc_reply_reviewers": "0;565;991;52", "wc_reply_authors": "119;1751;2376;326", "reply_reviewers": "0;2;2;1", "reply_authors": "1;4;5;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 74.5, 47.04519104010526 ], "wc_summary_review_avg": [ 82.25, 28.199069133572475 ], "wc_main_review_avg": [ 531.0, 341.04911669728745 ], "wc_review_avg": [ 687.75, 368.82948295926667 ], "wc_reply_reviewers_avg": [ 402.0, 405.4608489114578 ], "wc_reply_authors_avg": [ 1143.0, 949.4759080671821 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7777777777777777, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4997668798655366002&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Iog0djAdbHj", "email": "ubc.ca;ed.ac.uk;cs.ubc.ca", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of British Columbia;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.ubc.ca;https://www.ed.ac.uk", "aff_unique_abbr": "UBC;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;United Kingdom" }, { "title": "Information-theoretic Online Memory Selection for Continual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5961", "id": "IpctgL7khPp", "poster": "", "openreview": "https://openreview.net/forum?id=IpctgL7khPp", "slides": "https://iclr.cc/virtual/2022/poster/5961", "video": "https://iclr.cc/virtual/2022/poster/5961", "author_site": "Shengyang Sun, Daniele Calandriello, Huiyi Hu, Ang Li, Michalis Titsias", "tldr": "", "abstract": "A challenging problem in task-free continual learning is the online selection of a representative replay memory from data streams. In this work, we investigate the online memory selection problem from an information-theoretic perspective. To gather the most information, we propose the \\textit{surprise} and the \\textit{learnability} criteria to pick informative points and to avoid outliers. We present a Bayesian model to compute the criteria efficiently by exploiting rank-one matrix structures. We demonstrate that these criteria encourage selecting informative points in a greedy algorithm for online memory selection. Furthermore, by identifying the importance of \\textit{the timing to update the memory}, we introduce a stochastic information-theoretic reservoir sampler (InfoRS), which conducts sampling among selective points with high information. 
Compared to reservoir sampling, InfoRS demonstrates improved robustness against data imbalance. Finally, empirical performance on continual learning benchmarks demonstrates its efficiency and efficacy.", "keywords": "Task-free continual learning;replay memory;information theoretic;reservoir sampling", "primary_area": "", "supplementary_material": "", "author": "Shengyang Sun;Daniele Calandriello;Huiyi Hu;Ang Li;Michalis Titsias", "authorids": "~Shengyang_Sun4;~Daniele_Calandriello1;~Huiyi_Hu1;~Ang_Li1;~Michalis_Titsias1", "gender": "M;M;;M;M", "homepage": "http://www.cs.toronto.edu/~ssy/;;;https://angli.ai;https://mtitsias.github.io/", "dblp": "173/5093;129/1542;118/4146;33/2805-1;19/5385", "google_scholar": "https://scholar.google.ca/citations?user=NktP1NQAAAAJ;;2BQ4cKAAAAAJ;6bRXWXEAAAAJ;https://scholar.google.gr/citations?user=B-SbkAwAAAAJ", "orcid": ";;;;", "linkedin": ";;;angli-ai;", "or_profile": "~Shengyang_Sun4;~Daniele_Calandriello1;~Huiyi_Hu1;~Ang_Li1;~Michalis_Titsias1", "aff": "Department of Computer Science, University of Toronto;Google DeepMind;;Baidu Apollo;Google DeepMind", "aff_domain": "cs.toronto.edu;deepmind.com;;baidu.com;google.com", "position": "PhD student;Researcher;;Principal Researcher;Research Scientist", "bibtex": "@inproceedings{\nsun2022informationtheoretic,\ntitle={Information-theoretic Online Memory Selection for Continual Learning},\nauthor={Shengyang Sun and Daniele Calandriello and Huiyi Hu and Ang Li and Michalis Titsias},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IpctgL7khPp}\n}", "github": "", "project": "", "reviewers": "TW8M;6ZWf;PFQ2", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "3;4;4", "correctness": "4;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "65;86;143", "wc_summary_review": "31;105;45", "wc_main_review": "141;482;640", "wc_review": "237;673;828", "wc_reply_reviewers": "73;16;52", "wc_reply_authors": "335;705;665", "reply_reviewers": "1;1;1", "reply_authors": "2;1;2", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 98.0, 32.95451410656816 ], "wc_summary_review_avg": [ 60.333333333333336, 32.097074979228594 ], "wc_main_review_avg": [ 421.0, 208.2322421400362 ], "wc_review_avg": [ 579.3333333333334, 250.20036415285696 ], "wc_reply_reviewers_avg": [ 47.0, 23.53720459187964 ], "wc_reply_authors_avg": [ 568.3333333333334, 165.79773487261184 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7559289460184545, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=464639194254891846&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=IpctgL7khPp", "email": "cs.toronto.edu;deepmind.com;;baidu.com;google.com", "author_num": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Toronto;Google;Baidu", "aff_unique_dep": "Department of Computer Science;Google DeepMind;Apollo", "aff_unique_url": 
"https://www.utoronto.ca;https://deepmind.com;https://apollo.auto", "aff_unique_abbr": "U of T;DeepMind;Baidu Apollo", "aff_campus_unique_index": "0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "Canada;United Kingdom;China" }, { "id": "IptBMO1AR5g", "title": "Regularizing Deep Neural Networks with Stochastic Estimators of Hessian Trace", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we develop a novel regularization method for deep neural networks by penalizing the trace of Hessian. This regularizer is motivated by a recent guarantee bound of the generalization error. Hutchinson method is a classical unbiased estimator for the trace of a matrix, but it is very time-consuming on deep learning models. Hence a dropout scheme is proposed to efficiently implements the Hutchinson method. Then we discuss a connection to linear stability of a nonlinear dynamical system. Experiments demonstrate that our method outperforms existing regularizers such as Jacobian, confidence penalty, and label smoothing. Our regularization method is also orthogonal to data augmentation methods, achieving the best performance when our method is combined with data augmentation. ", "keywords": "Regularization;Hessian Trace;Stochastic Estimator;Nonlinear Dynamical System;Generalization Error", "primary_area": "", "supplementary_material": "", "author": "Yucong Liu;Tong Lin", "authorids": "~Yucong_Liu1;~Tong_Lin1", "gender": "M;M", "homepage": "https://stat.uchicago.edu/people/profile/yucong-liu/;https://sai.pku.edu.cn/szdw/zzjs/lt.htm", "dblp": ";74/5719-2", "google_scholar": ";", "orcid": ";0000-0002-0000-834X", "linkedin": ";", "or_profile": "~Yucong_Liu1;~Tong_Lin1", "aff": "University of Chicago;Peking University", "aff_domain": "uchicago.edu;pku.edu.cn", "position": "MS student;Associate Professor", "bibtex": "@misc{\nliu2022regularizing,\ntitle={Regularizing Deep Neural Networks with Stochastic Estimators of Hessian Trace},\nauthor={Yucong Liu and Tong Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=IptBMO1AR5g}\n}", "github": "", "project": "", "reviewers": "YAnV;CMzT;GDik;imd7", "site": "https://openreview.net/forum?id=IptBMO1AR5g", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;4;3", "correctness": "2;2;4;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "74;351;92;130", "wc_summary_review": "21;45;82;103", "wc_main_review": "325;813;772;280", "wc_review": "420;1209;946;513", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "770;1404;1395;416", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 161.75, 111.1178990982101 ], "wc_summary_review_avg": [ 62.75, 31.814894310684107 ], "wc_main_review_avg": [ 547.5, 245.94359109356762 ], "wc_review_avg": [ 772.0, 321.0256999057864 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 996.25, 422.23830652843424 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.37998029782867415, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=8175935902327330712&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Chicago;Peking University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uchicago.edu;http://www.pku.edu.cn", "aff_unique_abbr": "UChicago;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "id": "Is5Hpwg2R-h", "title": "Targeted Environment Design from Offline Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "In reinforcement learning (RL) the use of simulators is ubiquitous, allowing cheaper and safer agent training than training directly in the real target environment. However, this approach relies on the simulator being a sufficiently accurate reflection of the target environment, which is difficult to achieve in practice, resulting in the need to bridge sim2real gap. Accordingly, recent methods have proposed an alternative paradigm, utilizing offline datasets from the target environment to train an agent, avoiding online access to either the target or any simulated environment but leading to poor generalization outside the support of the offline data. We propose to combine the two paradigms: offline datasets and synthetic simulators, to reduce the sim2real gap by using limited offline data to train realistic simulators. We formalize our approach as offline targeted environment design(OTED), which automatically learns a distribution over simulator parameters to match a provided offline dataset, and then uses the learned simulator to train an RL agent in standard online fashion. We derive an objective for learning the simulator parameters which corresponds to minimizing a divergence between the target offline dataset and the state-action distribution induced by the simulator. 
We evaluate our method on standard offlineRL benchmarks and show that it learns using as few as 5 demonstrations, and yields up to 17 times higher score compared to strong existing offline RL, behavior cloning (BC), and domain randomization baseline, thus successfully leveraging both offline datasets and simulators for better RL", "keywords": "targeted environment design;offline reinforcement learning;deep learning;adversarial learning", "primary_area": "", "supplementary_material": "/attachment/dd0c179bb75dbaab3e12837ba1575ce9748c9ec4.zip", "author": "Izzeddin Gur;Ofir Nachum;Aleksandra Faust", "authorids": "~Izzeddin_Gur1;~Ofir_Nachum1;~Aleksandra_Faust1", "gender": ";M;F", "homepage": ";https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;http://www.afaust.info", "dblp": "188/9027;;135/8420", "google_scholar": "qS_ugJAAAAAJ;C-ZlBWMAAAAJ;RK72t68AAAAJ", "orcid": ";;0000-0002-3268-8685", "linkedin": ";;aleksandrafaust", "or_profile": "~Izzeddin_Gur1;~Ofir_Nachum1;~Aleksandra_Faust1", "aff": "Google;OpenAI;Google Brain", "aff_domain": "google.com;openai.com;google.com", "position": "Research Scientist;Researcher;Principal Researcher", "bibtex": "@misc{\ngur2022targeted,\ntitle={Targeted Environment Design from Offline Data},\nauthor={Izzeddin Gur and Ofir Nachum and Aleksandra Faust},\nyear={2022},\nurl={https://openreview.net/forum?id=Is5Hpwg2R-h}\n}", "github": "", "project": "", "reviewers": "2itJ;mcpN;huR6;rXE7", "site": "https://openreview.net/forum?id=Is5Hpwg2R-h", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "61;68;136;86", "wc_summary_review": "52;95;87;60", "wc_main_review": "415;255;454;598", "wc_review": "528;418;677;744", "wc_reply_reviewers": "593;162;0;182", "wc_reply_authors": "1104;442;286;714", "reply_reviewers": "1;1;0;1", "reply_authors": "3;2;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 87.75, 29.31190031369512 ], "wc_summary_review_avg": [ 73.5, 17.95132307101624 ], "wc_main_review_avg": [ 430.5, 122.11572380328424 ], "wc_review_avg": [ 591.75, 127.18171055619594 ], "wc_reply_reviewers_avg": [ 234.25, 218.81770380844418 ], "wc_reply_authors_avg": [ 636.5, 310.33973319573505 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8006407690254357, "corr_recommendation_correctness": -0.8006407690254357, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1970831282039386373&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;OpenAI", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://openai.com", "aff_unique_abbr": "Google;OpenAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "IsHQmuOqRAG", "title": "Learning to perceive objects by prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "The representation of objects is the building block of higher-level concepts. Infants develop the notion of objects without supervision. 
The prediction error of future sensory input is likely the major teaching signal for infants. Inspired by this, we propose a new framework to extract object-centric representation from single 2D images by learning to predict future scenes in the presence of moving objects. We treat objects as latent causes whose function to an agent is to facilitate efficient prediction of the coherent motion of their parts in visual input. Distinct from previous object-centric models, our model learns to explicitly infer objects' locations in the 3D environment in addition to segmenting objects. Further, the network learns a latent code space where objects with the same geometric shape and texture/color frequently group together. The model requires no supervision or pre-training of any part of the network. We provide a new synthetic dataset with more complex textures on objects and background, and find that several previous models not based on predictive learning overly rely on clustering colors and lose specificity in object segmentation. Our work demonstrates a new approach for learning symbolic representation grounded in sensation and action.", "keywords": "predictive learning;object-centric representation;3D perception;sensory grounding", "primary_area": "", "supplementary_material": "", "author": "Tushar Arora;Li Erran Li;Ming Bo Cai", "authorids": "~Tushar_Arora1;erranli@gmail.com;~Ming_Bo_Cai1", "gender": "M;;", "homepage": "https://arora-tushar.github.io/;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tushar_Arora1;erranli@gmail.com;~Ming_Bo_Cai1", "aff": "State University of New York at Stony Brook;;", "aff_domain": "stonybrook.edu;;", "position": "MS student;;", "bibtex": "@misc{\narora2022learning,\ntitle={Learning to perceive objects by prediction},\nauthor={Tushar Arora and Li Erran Li and Ming Bo Cai},\nyear={2022},\nurl={https://openreview.net/forum?id=IsHQmuOqRAG}\n}", "github": "", "project": "", "reviewers": "Z7dM;8qC3;iu2Y;uocA", "site": "https://openreview.net/forum?id=IsHQmuOqRAG", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "3;4;4;3", "correctness": "4;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "142;300;182;119", "wc_summary_review": "68;37;79;24", "wc_main_review": "778;380;876;466", "wc_review": "988;717;1137;609", "wc_reply_reviewers": "158;96;242;94", "wc_reply_authors": "806;875;1088;476", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 185.75, 69.7078725826574 ], "wc_summary_review_avg": [ 52.0, 22.327113561766108 ], "wc_main_review_avg": [ 625.0, 207.19314660480447 ], "wc_review_avg": [ 862.75, 210.07900299649177 ], "wc_reply_reviewers_avg": [ 147.5, 60.32205235235287 ], "wc_reply_authors_avg": [ 811.25, 219.70135980462206 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.14002800840280097, "corr_recommendation_correctness": -0.7276068751089989, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15471103747894074254&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "State University of New York at Stony Brook", "aff_unique_dep": "", "aff_unique_url": 
"https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Convergent Graph Solvers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6360", "id": "ItkxLQU01lD", "poster": "", "openreview": "https://openreview.net/forum?id=ItkxLQU01lD", "slides": "https://iclr.cc/virtual/2022/poster/6360", "video": "https://iclr.cc/virtual/2022/poster/6360", "author_site": "Junyoung Park, Jinhyun Choo, Jinkyoo Park", "tldr": "", "abstract": "We propose the convergent graph solver (CGS), a deep learning method that learns iterative mappings to predict the properties of a graph system at its stationary state (fixed point) with guaranteed convergence. The forward propagation of CGS proceeds in three steps: (1) constructing the input-dependent linear contracting iterative maps, (2) computing the fixed points of the iterative maps, and (3) decoding the fixed points to estimate the properties. The contractivity of the constructed linear maps guarantees the existence and uniqueness of the fixed points following the Banach fixed point theorem. To train CGS efficiently, we also derive a tractable analytical expression for its gradient by leveraging the implicit function theorem. We evaluate the performance of CGS by applying it to various network-analytic and graph benchmark problems. The results indicate that CGS has competitive capabilities for predicting the stationary properties of graph systems, irrespective of whether the target systems are linear or non-linear. CGS also shows high performance for graph classification problems where the existence or the meaning of a fixed point is hard to be clearly defined, which highlights the potential of CGS as a general graph neural network architecture.", "keywords": "Graph;Graph Neural Network;Fixed point;Implicit model;Implicit function theorem;Convergent", "primary_area": "", "supplementary_material": "", "author": "Junyoung Park;Jinhyun Choo;Jinkyoo Park", "authorids": "~Junyoung_Park1;~Jinhyun_Choo1;~Jinkyoo_Park1", "gender": ";;M", "homepage": ";https://www.choogroup.org;http://silab.kaist.ac.kr/", "dblp": ";;156/7535", "google_scholar": ";;sH2a0nkAAAAJ", "orcid": ";0000-0002-5861-3796;0000-0003-2620-1479", "linkedin": ";jinhyun-choo;", "or_profile": "~Junyoung_Park1;~Jinhyun_Choo1;~Jinkyoo_Park1", "aff": ";The University of Hong Kong;Korea Advanced Institute of Science & Technology", "aff_domain": ";hku.hk;kaist.ac.kr", "position": ";Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\npark2022convergent,\ntitle={Convergent Graph Solvers},\nauthor={Junyoung Park and Jinhyun Choo and Jinkyoo Park},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ItkxLQU01lD}\n}", "github": "", "project": "", "reviewers": "A3tH;R2aV;G7SD;i9Je", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;4;5", "correctness": "4;4;3;4", "technical_novelty": "3;4;3;3", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "62;41;49;120", "wc_summary_review": "21;52;45;13", "wc_main_review": "157;319;199;248", "wc_review": "240;412;293;381", "wc_reply_reviewers": "13;21;17;14", "wc_reply_authors": "33;673;380;496", "reply_reviewers": "1;1;1;2", "reply_authors": "1;1;1;3", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], 
"technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 68.0, 30.943496893531602 ], "wc_summary_review_avg": [ 32.75, 16.192204914711276 ], "wc_main_review_avg": [ 230.75, 60.275928031014175 ], "wc_review_avg": [ 331.5, 68.52919086053767 ], "wc_reply_reviewers_avg": [ 16.25, 3.112474899497183 ], "wc_reply_authors_avg": [ 395.5, 233.8551902353249 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16292715563047713132&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=ItkxLQU01lD", "email": ";hku.hk;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Hong Kong;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.kaist.ac.kr", "aff_unique_abbr": "HKU;KAIST", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1", "aff_country_unique": "China;South Korea" }, { "title": "Sample Efficient Stochastic Policy Extragradient Algorithm for Zero-Sum Markov Game", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6982", "id": "IvepFxYRDG", "poster": "", "openreview": "https://openreview.net/forum?id=IvepFxYRDG", "slides": "https://iclr.cc/virtual/2022/poster/6982", "video": "https://iclr.cc/virtual/2022/poster/6982", "author_site": "Ziyi Chen, Shaocong Ma, Yi Zhou", "tldr": "", "abstract": "Two-player zero-sum Markov game is a fundamental problem in reinforcement learning and game theory. Although many algorithms have been proposed for solving zero-sum Markov games in the existing literature, many of them either require a full knowledge of the environment or are not sample-efficient. In this paper, we develop a fully decentralized and sample-efficient stochastic policy extragradient algorithm for solving tabular zero-sum Markov games. In particular, our algorithm utilizes multiple stochastic estimators to accurately estimate the value functions involved in the stochastic updates, and leverages entropy regularization to accelerate the convergence. Specifically, with a proper entropy-regularization parameter, we prove that the stochastic policy extragradient algorithm has a sample complexity of the order $\\widetilde{\\mathcal{O}}(\\frac{A_{\\max}}{\\mu_{\\text{min}}\\epsilon^{5.5}(1-\\gamma)^{13.5}})$ for finding a solution that achieves $\\epsilon$-Nash equilibrium duality gap, where $A_{\\max}$ is the maximum number of actions between the players, $\\mu_{\\min}$ is the lower bound of state stationary distribution, and $\\gamma$ is the discount factor. Such a sample complexity result substantially improves the state-of-the-art complexity result. 
", "keywords": "Two-player Zero-sum Markov game;Entropy regularization;Policy extragradient;Nash equilibrium;Sample complexity", "primary_area": "", "supplementary_material": "", "author": "Ziyi Chen;Shaocong Ma;Yi Zhou", "authorids": "~Ziyi_Chen2;~Shaocong_Ma1;~Yi_Zhou2", "gender": "M;M;M", "homepage": ";https://mshaocong.github.io/;https://sites.google.com/site/yizhouhomepage/home", "dblp": "37/1439-2;270/3742;", "google_scholar": "zjSBVOIAAAAJ;;4fK8bYIAAAAJ", "orcid": ";;", "linkedin": "ziyi-chen-84616184/;;", "or_profile": "~Ziyi_Chen2;~Shaocong_Ma1;~Yi_Zhou2", "aff": "University of Utah;Lawrence Livermore National Labs;University of Utah", "aff_domain": "utah.edu;llnl.gov;utah.edu", "position": "PhD student;Intern;Assistant Professor", "bibtex": "@inproceedings{\nchen2022sample,\ntitle={Sample Efficient Stochastic Policy Extragradient Algorithm for Zero-Sum Markov Game},\nauthor={Ziyi Chen and Shaocong Ma and Yi Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IvepFxYRDG}\n}", "github": "", "project": "", "reviewers": "sRxd;RwGu;5TU3;8RN2;Hsr5", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "4;4;3;3;3", "correctness": "4;4;4;3;4", "technical_novelty": "2;3;3;2;3", "empirical_novelty": "0;0;0;0;0", "wc_summary_paper": "51;45;50;156;40", "wc_summary_review": "50;40;84;58;61", "wc_main_review": "615;406;200;234;344", "wc_review": "716;491;334;448;445", "wc_reply_reviewers": "0;0;0;23;96", "wc_reply_authors": "502;729;339;277;766", "reply_reviewers": "0;0;0;1;1", "reply_authors": "1;1;1;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 68.4, 43.97544769527651 ], "wc_summary_review_avg": [ 58.6, 14.636939570825588 ], "wc_main_review_avg": [ 359.8, 147.5620547430809 ], "wc_review_avg": [ 486.8, 125.81796374127185 ], "wc_reply_reviewers_avg": [ 23.8, 37.18279171875076 ], "wc_reply_authors_avg": [ 522.6, 198.13995054001603 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3911354254280155614&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=IvepFxYRDG", "email": "utah.edu;llnl.gov;utah.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Utah;Lawrence Livermore National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.utah.edu;https://www.llnl.gov", "aff_unique_abbr": "Utah;LLNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Ivku4TZgEly", "title": "Exploring unfairness in Integrated Gradients based attribution methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "Numerous methods have attempted to explain and interpret predictions made by machine learning models in terms of their inputs. Known as \u201cattribution methods\u201d they notably include the Integrated Gradients method and its variants.These are based upon the theory of Shapley Values, a rigorous method of fair allocation according to mathematical axioms. 
Integrated Gradients has axioms derived from this heritage with the implication of a similar rigorous, intuitive notion of fairness. We explore the difference between Integrated Gradients and more direct expressions of Shapley Values in deep learning and find Integrated Gradients\u2019 guarantees of fairness weaker; in certain conditions it can give wholly unrepresentative results. Integrated Gradients requires a choice of \u201cbaseline\u201d, a hyperparameter that represents the \u2018zero attribution\u2019 case. Research has shown that baseline choice critically affects attribution quality, and increasingly effective baselines have been developed. Using purpose-designed scenarios, we identify sources of inaccuracy both from specific baselines and inherent to the method itself, sensitive to input distribution and loss landscape. Failure modes are identified for baselines including Zero, Mean, Additive Gaussian Noise, and the state-of-the-art Expected Gradients. We develop a new method, Integrated Certainty Gradients, that we show avoids the failures in these challenging scenarios. By augmenting the input space with \u201ccertainty\u201d information, and training with random degradation of input features, the model learns to predict with varying amounts of incomplete information, supporting a zero-information case which becomes a natural baseline. We identify the axiomatic origin of unfairness in Integrated Gradients, which has been overlooked in past research.", "keywords": "Integrated Gradients;Expected Gradients;Explainable AI;Integrated Certainty Gradients;Attribution", "primary_area": "", "supplementary_material": "", "author": "David Drakard;Rosanne Liu;Jason Yosinski", "authorids": "~David_Drakard1;~Rosanne_Liu1;~Jason_Yosinski2", "gender": ";F;Unspecified", "homepage": "https://github.com/ddrakard/;https://rosanneliu.com/;http://yosinski.com", "dblp": ";218/6453;31/9882", "google_scholar": ";_GzrRGwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;jasonyosinski", "or_profile": "~David_Drakard1;~Rosanne_Liu1;~Jason_Yosinski1", "aff": ";ML Collective;Geometric Intelligence", "aff_domain": ";mlcollective.org;geometric.ai", "position": ";Researcher;VP of ML", "bibtex": "@misc{\ndrakard2022exploring,\ntitle={Exploring unfairness in Integrated Gradients based attribution methods},\nauthor={David Drakard and Rosanne Liu and Jason Yosinski},\nyear={2022},\nurl={https://openreview.net/forum?id=Ivku4TZgEly}\n}", "github": "", "project": "", "reviewers": "W9En;Ejjc;FgJc", "site": "https://openreview.net/forum?id=Ivku4TZgEly", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "3;2;4", "correctness": "3;2;4", "technical_novelty": "3;3;2", "empirical_novelty": "2;2;1", "wc_summary_paper": "42;139;52", "wc_summary_review": "17;47;13", "wc_main_review": "196;430;265", "wc_review": "255;616;330", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 77.66666666666667, 43.560940710177 ], "wc_summary_review_avg": [ 25.666666666666668, 15.173075568988056 ], "wc_main_review_avg": [ 297.0, 98.17331613019904 ], "wc_review_avg": [ 400.3333333333333, 155.5427772529331 ], "wc_reply_reviewers_avg": [ 0, 0 ], 
"wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8629206249157679265&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "ML Collective;Geometric Intelligence", "aff_unique_dep": ";", "aff_unique_url": ";https://geometricintelligence.com", "aff_unique_abbr": ";", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";United Kingdom" }, { "title": "Compositional Attention: Disentangling Search and Retrieval", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6496", "id": "IwJPj2MBcIa", "poster": "", "openreview": "https://openreview.net/forum?id=IwJPj2MBcIa", "slides": "https://iclr.cc/virtual/2022/poster/6496", "video": "https://iclr.cc/virtual/2022/poster/6496", "author_site": "Sarthak Mittal, Sharath Chandra Raparthy, Irina Rish, Yoshua Bengio, Guillaume Lajoie", "tldr": "", "abstract": "Multi-head, key-value attention is the backbone of transformer-like model architectures which have proven to be widely successful in recent years. This attention mechanism uses multiple parallel key-value attention blocks (called heads), each performing two fundamental computations: (1) search - selection of a relevant entity from a set via query-key interaction, and (2) retrieval - extraction of relevant features from the selected entity via a value matrix. Standard attention heads learn a rigid mapping between search and retrieval. In this work, we first highlight how this static nature of the pairing can potentially: (a) lead to learning of redundant parameters in certain tasks, and (b) hinder generalization. To alleviate this problem, we propose a novel attention mechanism, called Compositional Attention, that replaces the standard head structure. The proposed mechanism disentangles search and retrieval and composes them in a dynamic, flexible and context-dependent manner. Through a series of numerical experiments, we show that it outperforms standard multi-head attention on a variety of tasks, including some out-of-distribution settings. Through our qualitative analysis, we demonstrate that Compositional Attention leads to dynamic specialization based on the type of retrieval needed. 
Our proposed mechanism generalizes multi-head attention, allows independent scaling of search and retrieval and is easy to implement in a variety of established network architectures.", "keywords": "compositional attention;flexible search and retrieval;better generalization", "primary_area": "", "supplementary_material": "/attachment/75e1b9d639848540262157374fb6695aaecc0d57.zip", "author": "Sarthak Mittal;Sharath Chandra Raparthy;Irina Rish;Yoshua Bengio;Guillaume Lajoie", "authorids": "~Sarthak_Mittal1;~Sharath_Chandra_Raparthy1;~Irina_Rish1;~Yoshua_Bengio1;~Guillaume_Lajoie1", "gender": "M;M;F;M;M", "homepage": "https://sarthmit.github.io/;https://sharathraparthy.github.io/;http://irina-rish.com;http://yoshuabengio.org;https://dms.umontreal.ca/~lajoie/", "dblp": "228/8275;302/4190;;56/953;31/10384", "google_scholar": "FGGgTrcAAAAJ;https://scholar.google.ca/citations?user=S1R0_UMAAAAJ;Avse5gIAAAAJ;kukA0LcAAAAJ;", "orcid": ";;;;", "linkedin": ";;irina-rish-8b2162;yoshuabengio/?originalSubdomain=ca;", "or_profile": "~Sarthak_Mittal1;~Sharath_Chandra_Raparthy1;~Irina_Rish1;~Yoshua_Bengio1;~Guillaume_Lajoie1", "aff": "Universit\u00e9 de Montr\u00e9al;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;University of Montreal;University of Montreal;Mila - Quebec Artificial Intelligence Institute", "aff_domain": "umontreal.ca;mila.umontreal.ca;mila.quebec;umontreal.ca;mila.quebec", "position": "MS student;MS student;Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nmittal2022compositional,\ntitle={Compositional Attention: Disentangling Search and Retrieval},\nauthor={Sarthak Mittal and Sharath Chandra Raparthy and Irina Rish and Yoshua Bengio and Guillaume Lajoie},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=IwJPj2MBcIa}\n}", "github": "", "project": "", "reviewers": "RbM7;f5jQ;TCch;kYB6", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "97;38;32;60", "wc_summary_review": "30;75;110;44", "wc_main_review": "97;294;282;205", "wc_review": "224;407;424;309", "wc_reply_reviewers": "22;0;0;6", "wc_reply_authors": "648;819;1517;681", "reply_reviewers": "1;0;0;1", "reply_authors": "2;2;3;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 56.75, 25.469344318219107 ], "wc_summary_review_avg": [ 64.75, 30.784533454317607 ], "wc_main_review_avg": [ 219.5, 78.53820726245284 ], "wc_review_avg": [ 341.0, 80.5574329283152 ], "wc_reply_reviewers_avg": [ 7.0, 9.0 ], "wc_reply_authors_avg": [ 916.25, 352.7246624493388 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1630545213475914915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=IwJPj2MBcIa", "email": "umontreal.ca;mila.umontreal.ca;mila.quebec;umontreal.ca;mila.quebec", "author_num": 5, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;University of Montreal;Quebec Artificial 
Intelligence Institute", "aff_unique_dep": ";Montreal Institute for Learning Algorithms;Artificial Intelligence", "aff_unique_url": "https://www.umontreal.ca;https://www.umontreal.ca;https://mila.quebec", "aff_unique_abbr": "UdeM;UM;Mila", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "id": "IxCAF8IMatf", "title": "A Unified Knowledge Distillation Framework for Deep Directed Graphical Models", "track": "main", "status": "Reject", "tldr": "", "abstract": " Knowledge distillation (KD) is a technique that transfers the knowledge from a large teacher network to a small student network. It has been widely applied to many different tasks, such as model compression and federated learning. However, the existing KD methods fail to generalize to general \\textit{deep directed graphical models (DGMs)} with arbitrary layers of random variables. By \\textit{deep} DGMs, we refer to DGMs whose conditional distributions are parameterized by deep neural networks. In this work, we propose a novel unified knowledge distillation framework for deep DGMs on various applications. Specifically, we leverage the reparameterization trick to hide the intermediate latent variables, resulting in a compact DGM. Then we develop a surrogate distillation loss to reduce error accumulation through multiple layers of random variables. Moreover, we present the connections between our method and some existing knowledge distillation approaches. The proposed framework is evaluated on three applications: deep generative models compression, discriminative deep DGMs compression, and VAE continual learning. The results show that our distillation method outperforms the baselines in data-free compression of deep generative models, including variational autoencoder (VAE), variational recurrent neural networks (VRNN), and Helmholtz Machine (HM). Moreover, our method achieves good performance for discriminative deep DGMs compression. 
Finally, we also demonstrate that it significantly improves the continual learning performance of VAE.", "keywords": "Deep Directed Graphical Models;Knowledge Distillation;Reparameterization trick;Model compression", "primary_area": "", "supplementary_material": "", "author": "Yizhuo Chen;Kaizhao Liang;Zhe Zeng;Yifei Yang;Shuochao Yao;Huajie Shao", "authorids": "~Yizhuo_Chen2;~Kaizhao_Liang1;~Zhe_Zeng1;yfyang2018@zju.edu.cn;~Shuochao_Yao1;~Huajie_Shao1", "gender": ";M;F;;;M", "homepage": "https://yizhuochen99.github.io/;https://kaizhaoliang.github.io/Portfolio/;https://zzeng.me/;;https://yscacaca.github.io/;https://huajieshao.github.io/", "dblp": ";239/5146;27/10464;;148/1920;179/4173", "google_scholar": ";qKLmNfoAAAAJ;PyK6cB0AAAAJ;;https://scholar.google.com/citations?hl=en;5-D7ZLsAAAAJ", "orcid": ";;;;;0000-0001-7627-5615", "linkedin": ";kaizhao-liang-427a42132/;;;;huajie-shao-508465113/", "or_profile": "~Yizhuo_Chen2;~Kaizhao_Liang1;~Zhe_Zeng1;yfyang2018@zju.edu.cn;~Shuochao_Yao1;~Huajie_Shao1", "aff": "University of Illinois;SambaNova Systems, Inc;University of California, Los Angeles;;George Mason University;College of William and Mary", "aff_domain": "cs.illinois.edu;sambanovasystems.com;cs.ucla.edu;;gmu.edu;wm.edu", "position": "PhD student;Principal Engineer;PhD student;;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nchen2022a,\ntitle={A Unified Knowledge Distillation Framework for Deep Directed Graphical Models},\nauthor={Yizhuo Chen and Kaizhao Liang and Zhe Zeng and Yifei Yang and Shuochao Yao and Huajie Shao},\nyear={2022},\nurl={https://openreview.net/forum?id=IxCAF8IMatf}\n}", "github": "", "project": "", "reviewers": "twdH;XcHW;sRSN;1kU6", "site": "https://openreview.net/forum?id=IxCAF8IMatf", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "54;55;54;80", "wc_summary_review": "23;45;81;35", "wc_main_review": "217;475;253;235", "wc_review": "294;575;388;350", "wc_reply_reviewers": "0;0;0;115", "wc_reply_authors": "909;2019;1049;1265", "reply_reviewers": "0;0;0;2", "reply_authors": "2;4;3;5", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 60.75, 11.121488209767612 ], "wc_summary_review_avg": [ 46.0, 21.656407827707714 ], "wc_main_review_avg": [ 295.0, 104.69957019969088 ], "wc_review_avg": [ 401.75, 105.46652312463894 ], "wc_reply_reviewers_avg": [ 28.75, 49.79646071760522 ], "wc_reply_authors_avg": [ 1310.5, 428.2601429038196 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14805109879989958396&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of Illinois;SambaNova Systems;University of California, Los Angeles;George Mason University;College of William and Mary", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.illinois.edu;https://www.sambanova.com;https://www.ucla.edu;https://www.gmu.edu;https://www.wm.edu", "aff_unique_abbr": "UIUC;SambaNova;UCLA;GMU;WM", 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "PSA-GAN: Progressive Self Attention GANs for Synthetic Time Series", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6757", "id": "Ix_mh42xq5w", "poster": "", "openreview": "https://openreview.net/forum?id=Ix_mh42xq5w", "slides": "https://iclr.cc/virtual/2022/poster/6757", "video": "https://iclr.cc/virtual/2022/poster/6757", "author_site": "Paul Jeha, Michael Bohlke-Schneider, Pedro Mercado, Shubham Kapoor, Rajbir Nirwan, Valentin Flunkert, Jan Gasthaus, Tim Januschowski", "tldr": "", "abstract": "Realistic synthetic time series data of sufficient length enables practical applications in time series modeling tasks, such as forecasting, but remains a challenge. In this paper we present PSA-GAN, a generative adversarial network (GAN) that generates long time series samples of high quality using progressive growing of GANs and self-attention. We show that PSA-GAN can be used to reduce the error in several downstream forecasting tasks over baselines that only use real data. We also introduce a Frechet-Inception Distance-like score for time series, Context-FID, assessing the quality of synthetic time series samples. We find that Context-FID is indicative for downstream performance. Therefore, Context-FID could be a useful tool to develop time series GAN models. ", "keywords": "Synthetic Time Series;GAN;Generative Modeling;Time Series;Forecasting", "primary_area": "", "supplementary_material": "", "author": "Paul Jeha;Michael Bohlke-Schneider;Pedro Mercado;Shubham Kapoor;Rajbir Singh Nirwan;Valentin Flunkert;Jan Gasthaus;Tim Januschowski", "authorids": "~Paul_Jeha1;~Michael_Bohlke-Schneider1;~Pedro_Mercado2;~Shubham_Kapoor1;~Rajbir_Singh_Nirwan1;~Valentin_Flunkert2;~Jan_Gasthaus2;~Tim_Januschowski2", "gender": "M;M;;M;M;M;M;M", "homepage": ";;;https://github.com/RSNirwan;;http://www.gatsby.ucl.ac.uk/~ucabjga/;https://melopeo.github.io/;", "dblp": "298/7731;242/8809;193/4889;;;11/5155;https://dblp.uni-trier.de/pid/14/9077-1;54/8909", "google_scholar": ";https://scholar.google.de/citations?user=19k2WQEAAAAJ;4LLHaikAAAAJ;D3yw4goAAAAJ;https://scholar.google.ca/citations?user=DzlwsFwAAAAJ;sSAJdVwAAAAJ;https://scholar.google.de/citations?user=FzY9Jo0AAAAJ;https://scholar.google.de/citations?user=EFdp8UMAAAAJ", "orcid": ";0000-0002-4969-2218;;;;;;", "linkedin": "pauljeha/;michael-bohlke-schneider-16a4ab93/;shubhamkapoor007;;;jan-gasthaus/;pedro-mercado-6ba37746/;", "or_profile": "~Paul_Jeha1;~Michael_Bohlke-Schneider1;~Shubham_Kapoor1;~Rajbir_Singh_Nirwan1;~Valentin_Flunkert2;~Jan_Gasthaus2;~Pedro_Eduardo_Mercado_Lopez1;~Tim_Januschowski1", "aff": "Technical University of Denmark;Amazon Development Center Germany;Amazon;Amazon Development Center Germany;Amazon;Amazon Development Center Germany;Amazon;", "aff_domain": "dtu.dk;amazon.de;amazon.com;amazon.de;amazon.com;amazon.de;amazon.com;", "position": "PhD student;Researcher;Senior Applied Scientist ;Applied Scientist;Principal Researcher;Researcher;Applied Scientist;", "bibtex": "@inproceedings{\njeha2022psagan,\ntitle={{PSA}-{GAN}: Progressive Self Attention {GAN}s for Synthetic Time Series},\nauthor={Paul Jeha and Michael Bohlke-Schneider and Pedro Mercado and Shubham Kapoor and Rajbir Singh Nirwan and Valentin Flunkert and Jan Gasthaus and Tim Januschowski},\nbooktitle={International Conference on Learning 
Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ix_mh42xq5w}\n}", "github": "", "project": "", "reviewers": "4v5L;RLDM;pbaT;DRP7", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;3;5", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "23;98;67;45", "wc_summary_review": "74;46;39;15", "wc_main_review": "385;261;430;120", "wc_review": "482;405;536;180", "wc_reply_reviewers": "0;24;50;46", "wc_reply_authors": "2498;1041;964;700", "reply_reviewers": "0;1;1;1", "reply_authors": "5;2;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 58.25, 27.725214156071004 ], "wc_summary_review_avg": [ 43.5, 21.02974084481309 ], "wc_main_review_avg": [ 299.0, 120.45953677480252 ], "wc_review_avg": [ 400.75, 135.68598859130591 ], "wc_reply_reviewers_avg": [ 30.0, 19.949937343260004 ], "wc_reply_authors_avg": [ 1300.75, 702.7052636063003 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13132868584712043956&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=Ix_mh42xq5w", "email": "dtu.dk;amazon.de;amazon.com;amazon.de;amazon.com;amazon.de;amazon.com;", "author_num": 8, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "Technical University of Denmark;Amazon", "aff_unique_dep": ";Development Center", "aff_unique_url": "https://www.dtu.dk;https://www.amazon.de", "aff_unique_abbr": "DTU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;2;1;2", "aff_country_unique": "Denmark;Germany;United States" }, { "title": "Learning Representation from Neural Fisher Kernel with Low-rank Approximation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6511", "id": "J1rhANsCY9", "poster": "", "openreview": "https://openreview.net/forum?id=J1rhANsCY9", "slides": "https://iclr.cc/virtual/2022/poster/6511", "video": "https://iclr.cc/virtual/2022/poster/6511", "author_site": "Ruixiang Zhang, Shuangfei Zhai, Etai Littwin, Joshua Susskind", "tldr": "", "abstract": "In this paper, we study the representation of neural networks from the perspective of kernels. We first define the Neural Fisher Kernel (NFK), which is the Fisher Kernel applied to neural networks. We show that NFK can be computed for both supervised and unsupervised learning models, which can serve as a unified tool for representation extraction. Furthermore, we show that practical NFKs exhibit low-rank structures. We then propose an efficient algorithm that computes a low-rank approximation of NFK, which scales to large datasets and networks. We show that the low-rank approximation of NFKs derived from unsupervised generative models and supervised learning models gives rise to high-quality compact representations of data, achieving competitive results on a variety of machine learning tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruixiang ZHANG;Shuangfei Zhai;Etai Littwin;Joshua M. 
Susskind", "authorids": "~Ruixiang_ZHANG1;~Shuangfei_Zhai3;~Etai_Littwin2;~Joshua_M._Susskind1", "gender": "M;M;M;M", "homepage": "http://ruixiangz.me/;http://cs.binghamton.edu/~szhai2;http://www.apple.com;", "dblp": "20/9860;;132/7797;", "google_scholar": "https://scholar.google.ca/citations?user=VQYdApgAAAAJ;G6vdBYsAAAAJ;Sv2TGqsAAAAJ;NOVS7vwAAAAJ", "orcid": ";;;", "linkedin": ";;joshua-susskind-8ab2ab5/;", "or_profile": "~Ruixiang_ZHANG1;~Shuangfei_Zhai3;~Joshua_M._Susskind1;~Etai_Littwin1", "aff": "Mila, UdeM;Apple;Apple;Apple", "aff_domain": "mila.qubec;apple.com;apple.com;apple.com", "position": "PhD student;Research Scientist;Researcher;Researcher", "bibtex": "@inproceedings{\nzhang2022learning,\ntitle={Learning Representation from Neural Fisher Kernel with Low-rank Approximation},\nauthor={Ruixiang ZHANG and Shuangfei Zhai and Etai Littwin and Joshua M. Susskind},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=J1rhANsCY9}\n}", "github": "", "project": "", "reviewers": "z7Tb;9i1k;945A", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;4", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "45;55;52", "wc_summary_review": "65;18;18", "wc_main_review": "747;312;154", "wc_review": "857;385;224", "wc_reply_reviewers": "0;0;15", "wc_reply_authors": "2536;1436;378", "reply_reviewers": "0;0;1", "reply_authors": "5;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 50.666666666666664, 4.189935029992179 ], "wc_summary_review_avg": [ 33.666666666666664, 22.15601247717849 ], "wc_main_review_avg": [ 404.3333333333333, 250.74068056238679 ], "wc_review_avg": [ 488.6666666666667, 268.6166206490002 ], "wc_reply_reviewers_avg": [ 5.0, 7.0710678118654755 ], "wc_reply_authors_avg": [ 1450.0, 881.0554276926433 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18403283623622571101&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=J1rhANsCY9", "email": "mila.qubec;apple.com;apple.com;apple.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Apple", "aff_unique_dep": "Mila;Apple Inc.", "aff_unique_url": "https://www.udemontreal.ca;https://www.apple.com", "aff_unique_abbr": "UdeM;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;United States" }, { "id": "J1uOGgf-bP", "title": "Test Time Robustification of Deep Models via Adaptation and Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "While deep neural networks can attain good accuracy on in-distribution test points, many applications require robustness even in the face of unexpected perturbations in the input, changes in the domain, or other sources of distribution shift. We study the problem of test time robustification, i.e., using the test input to improve model robustness. 
Recent prior works have proposed methods for test time adaptation; however, they each introduce additional assumptions, such as access to multiple test points, that prevent widespread adoption. In this work, we aim to study and devise methods that make no assumptions about the model training process and are broadly applicable at test time. We propose a simple approach that can be used in any test setting where the model is probabilistic and adaptable: when presented with a test example, perform different data augmentations on the data point, and then adapt (all of) the model parameters by minimizing the entropy of the model's average, or marginal, output distribution across the augmentations. Intuitively, this objective encourages the model to make the same prediction across different augmentations, thus enforcing the invariances encoded in these augmentations, while also maintaining confidence in its predictions. In our experiments, we demonstrate that this approach consistently improves robust ResNet and vision transformer models, achieving accuracy gains of 1-8% over standard model evaluation and also generally outperforming prior augmentation and adaptation strategies. We achieve state-of-the-art results for test shifts caused by image corruptions (ImageNet-C), renditions of common objects (ImageNet-R), and, among ResNet-50 models, adversarially chosen natural examples (ImageNet-A).", "keywords": "distribution shift;test time adaptation;data augmentation", "primary_area": "", "supplementary_material": "/attachment/ce162b1980dab50b8596319c3c5f1ccb5759efe4.zip", "author": "Marvin Mengxin Zhang;Sergey Levine;Chelsea Finn", "authorids": "~Marvin_Mengxin_Zhang2;~Sergey_Levine1;~Chelsea_Finn1", "gender": "M;F;M", "homepage": "https://people.eecs.berkeley.edu/~svlevine/;https://ai.stanford.edu/~cbfinn/;http://marvinzhang.com/", "dblp": "80/7594;131/1783;166/1402", "google_scholar": "8R35rCwAAAAJ;vfPE6hgAAAAJ;7ShMBcwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sergey_Levine1;~Chelsea_Finn1;~Marvin_Mengxin_Zhang1", "aff": "Google;Google;University of California, Berkeley", "aff_domain": "google.com;google.com;berkeley.edu", "position": "Research Scientist;Research Scientist;PhD student", "bibtex": "@misc{\nzhang2022test,\ntitle={Test Time Robustification of Deep Models via Adaptation and Augmentation},\nauthor={Marvin Mengxin Zhang and Sergey Levine and Chelsea Finn},\nyear={2022},\nurl={https://openreview.net/forum?id=J1uOGgf-bP}\n}", "github": "", "project": "", "reviewers": "8PMD;6Xgd;fKFu;ZgMY", "site": "https://openreview.net/forum?id=J1uOGgf-bP", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;5;4;5", "correctness": "3;4;1;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "80;76;87;41", "wc_summary_review": "109;49;106;424", "wc_main_review": "382;237;267;1593", "wc_review": "571;362;460;2058", "wc_reply_reviewers": "410;173;88;1440", "wc_reply_authors": "1222;908;602;3246", "reply_reviewers": "2;1;1;3", "reply_authors": "2;2;1;5", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 71.0, 17.76231966833161 ], "wc_summary_review_avg": [ 172.0, 147.44320940619815 ], "wc_main_review_avg": [ 619.75, 564.5065876497811 ], "wc_review_avg": [ 862.75, 694.0278722789164 ], "wc_reply_reviewers_avg": [ 527.75, 539.7436312732184 ], 
"wc_reply_authors_avg": [ 1494.5, 1034.7157822320098 ], "reply_reviewers_avg": [ 1.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": -0.18731716231633877, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2012248582375679857&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Google;University of California, Berkeley", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.berkeley.edu", "aff_unique_abbr": "Google;UC Berkeley", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Mountain View;Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Representation Learning for Online and Offline RL in Low-rank MDPs", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7191", "id": "J4iSIR9fhY0", "poster": "", "openreview": "https://openreview.net/forum?id=J4iSIR9fhY0", "slides": "https://iclr.cc/virtual/2022/poster/7191", "video": "https://iclr.cc/virtual/2022/poster/7191", "author_site": "Masatoshi Uehara, Xuezhou Zhang, Wen Sun", "tldr": "", "abstract": "This work studies the question of Representation Learning in RL: how can we learn a compact low-dimensional representation such that on top of the representation we can perform RL procedures such as exploration and exploitation, in a sample efficient manner. We focus on the low-rank Markov Decision Processes (MDPs) where the transition dynamics correspond to a low-rank transition matrix. Unlike prior works that assume the representation is known (e.g., linear MDPs), here we need to learn the representation for the low-rank MDP. We study both the online RL and offline RL settings. For the online setting, operating with the same computational oracles used in FLAMBE (Agarwal et.al), the state-of-art algorithm for learning representations in low-rank MDPs, we propose an algorithm REP-UCB Upper Confidence Bound driven Representation learning for RL), which significantly improves the sample complexity from $\\widetilde{O}( A^9 d^7 / (\\epsilon^{10} (1-\\gamma)^{22}))$ for FLAMBE to $\\widetilde{O}( A^4 d^4 / (\\epsilon^2 (1-\\gamma)^{2}) )$ with $d$ being the rank of the transition matrix (or dimension of the ground truth representation), $A$ being the number of actions, and $\\gamma$ being the discounted factor. Notably, REP-UCB is simpler than FLAMBE, as it directly balances the interplay between representation learning, exploration, and exploitation, while FLAMBE is an explore-then-commit style approach and has to perform reward-free exploration step-by-step forward in time. 
For the offline RL setting, we develop an algorithm that leverages pessimism to learn under a partial coverage condition: our algorithm is able to compete against any policy as long as it is covered by the offline distribution.", "keywords": "Provably sample efficient Reinforcement Learning;PAC bounds;Representation learning;Low-rank MDP", "primary_area": "", "supplementary_material": "/attachment/973cc093bf6b3fb1c46d3433387ee95d4d7f2561.zip", "author": "Masatoshi Uehara;Xuezhou Zhang;Wen Sun", "authorids": "~Masatoshi_Uehara1;~Xuezhou_Zhang2;~Wen_Sun1", "gender": "M;;M", "homepage": "https://www.masatoshiuehara.com/;https://wensun.github.io;https://zhangxz1123.github.io/", "dblp": "225/6517;;213/7993", "google_scholar": "https://scholar.google.co.jp/citations?user=xuLKJboAAAAJ;iOLC30YAAAAJ;tR-p-r8AAAAJ", "orcid": "0000-0001-9017-3105;;", "linkedin": ";;", "or_profile": "~Masatoshi_Uehara1;~Wen_Sun1;~Xuezhou_Zhang1", "aff": "Amazon;Cornell University;Princeton University", "aff_domain": "amazon.com;cornell.edu;princeton.edu", "position": "Intern;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nuehara2022representation,\ntitle={Representation Learning for Online and Offline {RL} in Low-rank {MDP}s},\nauthor={Masatoshi Uehara and Xuezhou Zhang and Wen Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=J4iSIR9fhY0}\n}", "github": "", "project": "", "reviewers": "99V8;MSqC;SjqZ;cvhG", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;0;0;0", "wc_summary_paper": "106;29;42;61", "wc_summary_review": "24;98;77;49", "wc_main_review": "210;274;644;383", "wc_review": "340;401;763;493", "wc_reply_reviewers": "0;30;356;32", "wc_reply_authors": "686;766;1129;512", "reply_reviewers": "0;1;2;1", "reply_authors": "1;1;3;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 59.5, 29.159046623646667 ], "wc_summary_review_avg": [ 62.0, 27.99107000455681 ], "wc_main_review_avg": [ 377.75, 165.69606965767173 ], "wc_review_avg": [ 499.25, 161.7225633608372 ], "wc_reply_reviewers_avg": [ 104.5, 145.75578890733638 ], "wc_reply_authors_avg": [ 773.25, 224.98597178490928 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.19245008972987526, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9888134444797666346&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=J4iSIR9fhY0", "email": "amazon.com;cornell.edu;princeton.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Amazon;Cornell University;Princeton University", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.cornell.edu;https://www.princeton.edu", "aff_unique_abbr": "Amazon;Cornell;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "J7FaSJw-xCM", "title": "Mutual Information Estimation as a Difference of Entropies for Unsupervised Representation Learning", "track": 
"main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive loss has been successfully exploited in the latest visual unsupervised representation learning methods. Contrastive loss is based on a lower-bound estimation of mutual information where its known limitations include batch size dependency expressed as $O(log (n))$. It is also commonly known as negative sampling size problem. To cope with the limitation, non-contrastive methods have been proposed and they have been shown to achieve outstanding performance. The non-contrastive methods, however, are limited in that they are not based on principled designs and their learning dynamics can be unstable. In this work, we derive a principled non-contrastive method where mutual information is estimated as a difference of entropies and thus no need for negative sampling. With our best knowledge, this is the first successful implementation of difference of entropies for visual unsupervised representation learning. Our method performs on par with or better than the state-of-the-art contrastive and non-contrastive methods. The main idea of our approach is to extend Shannon entropy $H(\\rmZ)$ to von Neumann entropy $S(\\rmZ)$. The von Neumann entropy can be shown to be a lower bound of Shannon entropy and it can be stably estimated with a small sample size. Additionally, we prove that the conditional entropy term $H(\\rmZ_1|\\rmZ_2)$ is upper bounded by the negative cosine similarity for the case of weak Gaussian noise augmentation. Even though the derivation is limited to a special case of augmentation, it provides a justification of cosine similarity as the measure between positive samples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaeill Kim;Wonjong Rhee", "authorids": "~Jaeill_Kim1;~Wonjong_Rhee1", "gender": "M;", "homepage": "https://sites.google.com/view/jaeillkim;http://drl.snu.ac.kr", "dblp": "311/1999;37/711", "google_scholar": "kVJRl3wAAAAJ;https://scholar.google.co.kr/citations?user=htFuYWsAAAAJ", "orcid": ";0000-0002-2590-8774", "linkedin": ";wonjong/", "or_profile": "~Jaeill_Kim1;~Wonjong_Rhee1", "aff": "Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nkim2022mutual,\ntitle={Mutual Information Estimation as a Difference of Entropies for Unsupervised Representation Learning},\nauthor={Jaeill Kim and Wonjong Rhee},\nyear={2022},\nurl={https://openreview.net/forum?id=J7FaSJw-xCM}\n}", "github": "", "project": "", "reviewers": "6s5e;heVY;g1dt;7tZk", "site": "https://openreview.net/forum?id=J7FaSJw-xCM", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;3", "correctness": "3;2;3;2", "technical_novelty": "3;2;4;3", "empirical_novelty": "3;2;0;2", "wc_summary_paper": "201;80;56;94", "wc_summary_review": "111;225;293;333", "wc_main_review": "514;496;619;721", "wc_review": "826;801;968;1148", "wc_reply_reviewers": "0;3401;0;0", "wc_reply_authors": "169;1792;155;270", "reply_reviewers": "0;7;0;0", "reply_authors": "1;5;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 107.75, 55.526457657588786 ], "wc_summary_review_avg": [ 240.5, 84.14719246653449 ], "wc_main_review_avg": [ 587.5, 90.2621182999823 ], "wc_review_avg": [ 935.75, 138.10571132288484 ], "wc_reply_reviewers_avg": [ 850.25, 
1472.6761991354379 ], "wc_reply_authors_avg": [ 596.5, 691.6467667820041 ], "reply_reviewers_avg": [ 1.75, 3.031088913245535 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tLMoIbHFIdwJ:scholar.google.com/&scioq=Mutual+Information+Estimation+as+a+Difference+of+Entropies+for+Unsupervised+Representation+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "J7V_4aauV6B", "title": "Understanding and Scheduling Weight Decay", "track": "main", "status": "Reject", "tldr": "", "abstract": "Weight decay is a popular and even necessary regularization technique for training deep neural networks that generalize well. Previous work usually interpreted weight decay as a Gaussian prior from the Bayesian perspective. However, weight decay sometimes shows mysterious behaviors beyond the conventional understanding. For example, the optimal weight decay value tends to be zero given long enough training time. Moreover, existing work typically failed to recognize the importance of scheduling weight decay during training. Our work aims at theoretically understanding novel behaviors of weight decay and designing schedulers for weight decay in deep learning. This paper makes three main contributions. First, we propose a novel theoretical interpretation of weight decay from the perspective of learning dynamics. Second, we propose a novel weight-decay linear scaling rule for large-batch training that proportionally increases weight decay rather than the learning rate as the batch size increases. Third, we provide an effective learning-rate-aware scheduler for weight decay, called the Stable Weight Decay (SWD) method, which, to the best of our knowledge, is the first practical design for weight decay scheduling.
In our various experiments, the SWD method often makes improvements over $L_{2}$ Regularization and Decoupled Weight Decay.", "keywords": "Weight Decay;Regularization;Optimization;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/c8830803e4d1587249a0dbf36bd56e38637db83e.zip", "author": "Zeke Xie;Issei Sato;Masashi Sugiyama", "authorids": "~Zeke_Xie1;~Issei_Sato1;~Masashi_Sugiyama1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/zeke-xie;;http://www.ms.k.u-tokyo.ac.jp/sugi/", "dblp": "210/1039;13/2665;35/1228", "google_scholar": "https://scholar.google.co.jp/citations?user=ysXmZCMAAAAJ;i4t2aUEAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ", "orcid": ";;0000-0001-6658-6743", "linkedin": ";;", "or_profile": "~Zeke_Xie1;~Issei_Sato1;~Masashi_Sugiyama1", "aff": "Baidu;the University of Tokyo;The University of Tokyo", "aff_domain": "baidu.com;u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "Researcher;Associate Professor;Full Professor", "bibtex": "@misc{\nxie2022understanding,\ntitle={Understanding and Scheduling Weight Decay},\nauthor={Zeke Xie and Issei Sato and Masashi Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=J7V_4aauV6B}\n}", "github": "", "project": "", "reviewers": "saHk;AwnS;1qbp;6kHA", "site": "https://openreview.net/forum?id=J7V_4aauV6B", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "4;4;2;4", "correctness": "3;2;1;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "109;57;125;51", "wc_summary_review": "68;66;42;12", "wc_main_review": "604;854;148;460", "wc_review": "781;977;315;523", "wc_reply_reviewers": "264;0;48;0", "wc_reply_authors": "1492;2465;871;409", "reply_reviewers": "1;0;1;0", "reply_authors": "3;4;2;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.5, 32.07413287993925 ], "wc_summary_review_avg": [ 47.0, 22.64950330581225 ], "wc_main_review_avg": [ 516.5, 255.21902358562537 ], "wc_review_avg": [ 649.0, 251.21703763877161 ], "wc_reply_reviewers_avg": [ 78.0, 109.1604323919615 ], "wc_reply_authors_avg": [ 1309.25, 770.0111606334028 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.2721655269759087, "corr_recommendation_correctness": 0.316227766016838, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6462754549009733096&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Baidu;University of Tokyo", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Baidu;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;Japan" }, { "title": "How to deal with missing data in supervised deep learning?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5973", "id": "J7b4BCtDm4", "poster": "", "openreview": "https://openreview.net/forum?id=J7b4BCtDm4", "slides": "https://iclr.cc/virtual/2022/poster/5973", "video": "https://iclr.cc/virtual/2022/poster/5973", "author_site": "Niels Ipsen, Pierre-Alexandre Mattei, Jes Frellsen", "tldr": "", "abstract": "The issue of 
missing data in supervised learning has been largely overlooked, especially in the deep learning community. We investigate strategies to adapt neural architectures for handling missing values. Here, we focus on regression and classification problems where the features are assumed to be missing at random. Of particular interest are schemes that allow reusing as-is a neural discriminative architecture. To address supervised deep learning with missing values, we propose to marginalize over missing values in a joint model of covariates and outcomes. Thereby, we leverage both the flexibility of deep generative models to describe the distribution of the covariates and the power of purely discriminative models to make predictions. More precisely, a deep latent variable model can be learned jointly with the discriminative model, using importance-weighted variational inference, essentially using importance sampling to mimic averaging over multiple imputations. In low-capacity regimes, or when the discriminative model has a strong inductive bias, we find that our hybrid generative/discriminative approach generally outperforms single imputation methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Niels Bruun Ipsen;Pierre-Alexandre Mattei;Jes Frellsen", "authorids": "~Niels_Bruun_Ipsen1;~Pierre-Alexandre_Mattei3;~Jes_Frellsen1", "gender": ";M;M", "homepage": ";http://pamattei.github.io;https://frellsen.org", "dblp": "241/7122;177/7275;83/8247", "google_scholar": ";https://scholar.google.fr/citations?user=Tqa_-D0AAAAJ;Yj2sBWkAAAAJ", "orcid": ";;0000-0001-9224-1271", "linkedin": ";;frellsen/", "or_profile": "~Niels_Bruun_Ipsen1;~Pierre-Alexandre_Mattei3;~Jes_Frellsen1", "aff": ";INRIA;Technical University of Denmark", "aff_domain": ";inria.fr;dtu.dk", "position": ";Research scientist;Associate Professor", "bibtex": "@inproceedings{\nipsen2022how,\ntitle={How to deal with missing data in supervised deep learning?},\nauthor={Niels Bruun Ipsen and Pierre-Alexandre Mattei and Jes Frellsen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=J7b4BCtDm4}\n}", "github": "", "project": "", "reviewers": "FaAu;U6yy;w981;zqzK", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;3;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "91;51;127;51", "wc_summary_review": "24;44;50;26", "wc_main_review": "181;209;163;167", "wc_review": "296;304;340;244", "wc_reply_reviewers": "57;0;0;0", "wc_reply_authors": "466;526;286;30", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.0, 31.670175244226233 ], "wc_summary_review_avg": [ 36.0, 11.224972160321824 ], "wc_main_review_avg": [ 180.0, 18.027756377319946 ], "wc_review_avg": [ 296.0, 34.292856398964496 ], "wc_reply_reviewers_avg": [ 14.25, 24.681724007856502 ], "wc_reply_authors_avg": [ 327.0, 192.88079220077876 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15579609963375586989&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8,
"pdf": "https://openreview.net/pdf?id=J7b4BCtDm4", "email": ";inria.fr;dtu.dk", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "INRIA;Technical University of Denmark", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.tek.dk", "aff_unique_abbr": "INRIA;DTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "France;Denmark" }, { "id": "J8P7g_mDpno", "title": "Search Spaces for Neural Model Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "While larger neural models are pushing the boundaries of what deep learning can do, often more weights are needed to train models rather than to run inference for tasks. This paper seeks to understand this behavior using search spaces -- adding weights creates extra degrees of freedom that form new paths for optimization (or wider search spaces) rendering neural model training more effective. We then show how we can augment search spaces to train sparse models attaining competitive scores across dozens of deep learning workloads. They are also are tolerant of structures targeting current hardware, opening avenues for training and inference acceleration. Our work encourages research to explore beyond massive neural models being used today.", "keywords": "search space;sparsity;neural models;deep learning", "primary_area": "", "supplementary_material": "", "author": "Darko Stosic;Dusan Stosic", "authorids": "~Darko_Stosic1;~Dusan_Stosic1", "gender": "M;M", "homepage": ";", "dblp": ";173/2835", "google_scholar": "-rf3RzkAAAAJ;", "orcid": ";", "linkedin": "darkostosic;", "or_profile": "~Darko_Stosic1;~Dusan_Stosic1", "aff": "NVIDIA;NVIDIA", "aff_domain": "nvidia.com;nvidia.com", "position": "Architect;Researcher", "bibtex": "@misc{\nstosic2022search,\ntitle={Search Spaces for Neural Model Training},\nauthor={Darko Stosic and Dusan Stosic},\nyear={2022},\nurl={https://openreview.net/forum?id=J8P7g_mDpno}\n}", "github": "", "project": "", "reviewers": "LwfS;kChj;yWXp;t5MH", "site": "https://openreview.net/forum?id=J8P7g_mDpno", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;4;2", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "172;127;132;65", "wc_summary_review": "124;241;12;116", "wc_main_review": "106;529;210;960", "wc_review": "402;897;354;1141", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "420;1554;285;1527", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 124.0, 38.268786236304905 ], "wc_summary_review_avg": [ 123.25, 81.07828007549247 ], "wc_main_review_avg": [ 451.25, 332.51719880331 ], "wc_review_avg": [ 698.5, 332.3405632780928 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 946.5, 595.9909814753911 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8102394863971921074&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA 
Corporation", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "J9_7t9m8xRj", "title": "Diverse and Consistent Multi-view Networks for Semi-supervised Regression", "track": "main", "status": "Reject", "tldr": "", "abstract": "Label collection is costly in many applications, which poses the need for label-efficient learning. In this work, we present Diverse and Consistent Multi-view Networks (DiCoM) \u2014 a novel semi-supervised regression technique based on a multi-view learning framework. DiCoM combines diversity with consistency \u2014 two seemingly opposing yet complementary principles of multi-view learning - based on underlying probabilistic graphical assumptions. Given multiple deep views of the same input, DiCoM encourages a negative correlation among the views' predictions on labeled data, while simultaneously enforces their agreement on unlabeled data. DiCoM can utilize either multi-network or multi-branch architectures to make a trade-off between computational cost and modeling performance. Under realistic evaluation setups, DiCoM outperforms competing methods on tabular and image data. Our ablation studies confirm the importance of having both consistency and diversity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cuong Manh Nguyen;Le Zhang;Arun Raja;Xun Xu;Balagopal Unnikrishnan;Kangkang Lu;Chuan-Sheng Foo", "authorids": "~Cuong_Manh_Nguyen1;~Le_Zhang3;~Arun_Raja1;~Xun_Xu1;~Balagopal_Unnikrishnan1;~Kangkang_Lu1;~Chuan-Sheng_Foo1", "gender": "M;M;;Not Specified;M;M;M", "homepage": ";http://zhangleuestc.cn/;;https://alex-xun-xu.github.io/;https://balagopal.me/;;http://ai.stanford.edu/~csfoo", "dblp": ";03/4043-1;;47/3944-2;232/3366;;73/1823", "google_scholar": ";https://scholar.google.com.sg/citations?user=61LOyWUAAAAJ;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com.sg/citations?user=pi0SGQUAAAAJ;https://scholar.google.com/citations?hl=en;QYkJHCYAAAAJ;AgbeqGkAAAAJ", "orcid": "0000-0002-6342-1393;;;;0000-0001-5845-8928;;0000-0002-4748-5792", "linkedin": "alfred-nguyen-2905/;;;;balagopalu/;;", "or_profile": "~Cuong_Manh_Nguyen1;~Le_Zhang3;~Arun_Raja1;~Xun_Xu1;~Balagopal_Unnikrishnan1;~Kangkang_Lu1;~Chuan-Sheng_Foo1", "aff": "Institute for Infocomm Research, A*STAR;University of Electronic Science and Technology of China;Institute for Infocomm Research, A*STAR;A*STAR;University of Toronto;A*STAR;Institute for Infocomm Research, A*STAR", "aff_domain": "i2r.a-star.edu.sg;uestc.edu.cn;i2r.a-star.edu.sg;i2r.a-star.edu.sg;utoronto.ca;a-star.edu.sg;i2r.a-star.edu.sg", "position": "Researcher;Professor;Researcher;Scientist;PhD student;Senior Research Engineer;Scientist", "bibtex": "@misc{\nnguyen2022diverse,\ntitle={Diverse and Consistent Multi-view Networks for Semi-supervised Regression},\nauthor={Cuong Manh Nguyen and Le Zhang and Arun Raja and Xun Xu and Balagopal Unnikrishnan and Kangkang Lu and Chuan-Sheng Foo},\nyear={2022},\nurl={https://openreview.net/forum?id=J9_7t9m8xRj}\n}", "github": "", "project": "", "reviewers": "6car;UrUc;BTsB;C8H3", "site": "https://openreview.net/forum?id=J9_7t9m8xRj", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;3", "correctness": "2;3;4;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "56;39;17;60", "wc_summary_review": "35;52;22;49", "wc_main_review": 
"393;187;68;470", "wc_review": "484;278;107;579", "wc_reply_reviewers": "0;0;0;135", "wc_reply_authors": "1455;1123;623;1040", "reply_reviewers": "0;0;0;1", "reply_authors": "3;3;1;3", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 43.0, 16.95582495781317 ], "wc_summary_review_avg": [ 39.5, 11.968709203585824 ], "wc_main_review_avg": [ 279.5, 160.04764915486888 ], "wc_review_avg": [ 362.0, 183.06692765215678 ], "wc_reply_reviewers_avg": [ 33.75, 58.45671475544961 ], "wc_reply_authors_avg": [ 1060.25, 296.3792966791034 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kdQKbI9ksTAJ:scholar.google.com/&scioq=Diverse+and+Consistent+Multi-view+Networks+for+Semi-supervised+Regression&hl=en&as_sdt=0,33", "gs_version_total": 6, "aff_unique_index": "0;1;0;2;3;2;0", "aff_unique_norm": "Institute for Infocomm Research;University of Electronic Science and Technology of China;Agency for Science, Technology and Research;University of Toronto", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.i2r.a-star.edu.sg;https://www.uestc.edu.cn;https://www.a-star.edu.sg;https://www.utoronto.ca", "aff_unique_abbr": "I2R;UESTC;A*STAR;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;2;0;0", "aff_country_unique": "Singapore;China;Canada" }, { "id": "JAJozcf0Kb", "title": "Memory-Driven Text-to-Image Generation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We introduce a memory-driven semi-parametric approach to text-to-image generation, which is based on both parametric and non-parametric techniques. The non-parametric component is a memory bank of image features constructed from a training set of images. The parametric component is a generative adversarial network. Given a new text description at inference time, the memory bank is used to selectively retrieve image features that are provided as basic information of target images, which enables the generator to produce realistic synthetic results. We also incorporate the content information into the discriminator, together with semantic features, allowing the discriminator to make a more reliable prediction. 
Experimental results demonstrate that the proposed memory-driven semi-parametric approach produces more realistic images than purely parametric approaches, in terms of both visual fidelity and text-image semantic consistency.\n", "keywords": "Generative Adversarial Networks;Text-to-Image Generation;Memory Bank;Semi-parametric", "primary_area": "", "supplementary_material": "", "author": "Bowen Li;Philip Torr;Thomas Lukasiewicz", "authorids": "~Bowen_Li2;~Philip_Torr1;~Thomas_Lukasiewicz2", "gender": "M;;", "homepage": "https://mrlibw.github.io;http://www.robots.ox.ac.uk/~tvg/;https://www.cs.ox.ac.uk/people/thomas.lukasiewicz/", "dblp": "75/10470-1;;l/ThomasLukasiewicz", "google_scholar": "https://scholar.google.co.uk/citations?hl=en;;arjucpEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Bowen_Li2;~Philip_Torr1;~Thomas_Lukasiewicz2", "aff": "Department of Computer Science, University of Oxford;University of Oxford;Department of Computer Science, University of Oxford", "aff_domain": "cs.ox.ac.uk;ox.ac.uk;cs.ox.ac.uk", "position": "Research Associate;Full Professor;Full Professor", "bibtex": "@misc{\nli2022memorydriven,\ntitle={Memory-Driven Text-to-Image Generation},\nauthor={Bowen Li and Philip Torr and Thomas Lukasiewicz},\nyear={2022},\nurl={https://openreview.net/forum?id=JAJozcf0Kb}\n}", "github": "", "project": "", "reviewers": "EYRb;vAH2;UWGU;mwi9", "site": "https://openreview.net/forum?id=JAJozcf0Kb", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;3;4", "correctness": "2;3;4;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "81;66;100;137", "wc_summary_review": "37;22;56;44", "wc_main_review": "560;137;471;656", "wc_review": "678;225;627;837", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.0, 26.56124997058685 ], "wc_summary_review_avg": [ 39.75, 12.295832627357937 ], "wc_main_review_avg": [ 456.0, 195.44948196401035 ], "wc_review_avg": [ 591.75, 225.46327306237706 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3809151683795781337&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "A First-Occupancy Representation for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6717", "id": "JBAZe2yN6Ub", "poster": "", "openreview": "https://openreview.net/forum?id=JBAZe2yN6Ub", "slides": "https://iclr.cc/virtual/2022/poster/6717", "video": "https://iclr.cc/virtual/2022/poster/6717", "author_site": "Ted Moskovitz, Spencer Wilson, Maneesh Sahani", "tldr": "", "abstract": "Both 
animals and artificial agents benefit from state representations that support rapid transfer of learning across tasks and which enable them to efficiently traverse their environments to reach rewarding states. The successor representation (SR), which measures the expected cumulative, discounted state occupancy under a fixed policy, enables efficient transfer to different reward structures in an otherwise constant Markovian environment and has been hypothesized to underlie aspects of biological behavior and neural activity. However, in the real world, rewards may only be available for consumption once, may shift location, or agents may simply aim to reach goal states as rapidly as possible without the constraint of artificially imposed task horizons. In such cases, the most behaviorally relevant representation would carry information about when the agent was likely to first reach states of interest, rather than how often it should expect to visit them over a potentially infinite time span. To reflect such demands, we introduce the first-occupancy representation (FR), which measures the expected temporal discount to the first time a state is accessed. We demonstrate that the FR facilitates exploration and the selection of efficient paths to desired states, allows the agent, under certain conditions, to plan provably optimal trajectories defined by a sequence of subgoals, and induces behavior similar to that of animals avoiding threatening stimuli.", "keywords": "successor representation;successor features;generalized policy improvement;GPI", "primary_area": "", "supplementary_material": "/attachment/378bd6631f271e490141e223fce76b4742da14df.zip", "author": "Ted Moskovitz;Spencer R Wilson;Maneesh Sahani", "authorids": "~Ted_Moskovitz1;~Spencer_R_Wilson1;~Maneesh_Sahani1", "gender": "M;M;", "homepage": "https://tedmoskovitz.github.io/;;http://www.gatsby.ucl.ac.uk/~maneesh", "dblp": ";;44/3197", "google_scholar": "pPVXrTYAAAAJ;MWI_bxoAAAAJ;https://scholar.google.co.uk/citations?user=rwxX83UAAAAJ", "orcid": ";;0000-0001-5560-3341", "linkedin": ";;", "or_profile": "~Ted_Moskovitz1;~Spencer_R_Wilson1;~Maneesh_Sahani1", "aff": "Google DeepMind;University College London;University College London", "aff_domain": "google.com;ucl.ac.uk;ucl.ac.uk", "position": "Intern;PhD student;Full Professor", "bibtex": "@inproceedings{\nmoskovitz2022a,\ntitle={A First-Occupancy Representation for Reinforcement Learning},\nauthor={Ted Moskovitz and Spencer R Wilson and Maneesh Sahani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JBAZe2yN6Ub}\n}", "github": "", "project": "", "reviewers": "aieJ;Ugir;epv4;SXgZ", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;4;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "39;56;103;148", "wc_summary_review": "92;75;48;97", "wc_main_review": "371;186;578;230", "wc_review": "502;317;729;475", "wc_reply_reviewers": "289;0;17;23", "wc_reply_authors": "1655;270;313;250", "reply_reviewers": "1;0;1;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 1.0 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 86.5, 42.547032799009614 ], "wc_summary_review_avg": [ 78.0, 19.144189719076646 ], "wc_main_review_avg": [ 341.25, 152.81913329161372 ], "wc_review_avg": [ 505.75, 146.99213414329353 ],
"wc_reply_reviewers_avg": [ 82.25, 119.66489669071711 ], "wc_reply_authors_avg": [ 622.0, 596.8370799472834 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7350458936111734330&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=JBAZe2yN6Ub", "email": "google.com;ucl.ac.uk;ucl.ac.uk", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;University College London", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.ucl.ac.uk", "aff_unique_abbr": "DeepMind;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "JDOpWxBqMw", "title": "Variational Perturbations for Visual Feature Attribution", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Explaining a complex black-box system in a post-hoc manner is important to understand its predictions. In this work we focus on two objectives, namely on how well the estimated explanation describes the classifier's behavior (faithfulness), and how sensitive the explanation is to input variations or model configurations (robustness). To achieve both faithfulness and robustness, we propose an uncertainty-aware explanation model, Variational Perturbations (VP), that learns a distribution of feature attribution for each image input and the corresponding classifier outputs. This differs from existing methods, which learn one deterministic estimate of feature attribution. 
We validate that according to several robustness and faithfulness metrics our VP method provides more reliable explanations compared to state-of-the-art methods on MNIST, CUB, and ImageNet datasets while also being more efficient.", "keywords": "Explainable AI;Faithfulness;Robustness;Variational inference", "primary_area": "", "supplementary_material": "", "author": "Jae Myung Kim;Eunji Kim;Sungroh Yoon;Jungwoo Lee;Cordelia Schmid;Zeynep Akata", "authorids": "~Jae_Myung_Kim1;~Eunji_Kim2;~Sungroh_Yoon1;~Jungwoo_Lee1;~Cordelia_Schmid1;~Zeynep_Akata1", "gender": "M;F;;M;F;F", "homepage": "https://jaemyung-kim.github.io;https://sites.google.com/snu.ac.kr/eunjikim;http://ailab.snu.ac.kr;https://cml.snu.ac.kr;https://cordeliaschmid.github.io/;https://eml-unitue.de/people/zeynep-akata", "dblp": "51/1888;;99/1474;34/516-1;s/CordeliaSchmid;117/4838", "google_scholar": "eP6FHFAAAAAJ;nShf6cgAAAAJ;Bphl_fIAAAAJ;j98IWfoAAAAJ;IvqCXP4AAAAJ;jQl9RtkAAAAJ", "orcid": ";0000-0001-5484-803X;0000-0002-2367-197X;0000-0002-6804-980X;;0000-0002-1432-7747", "linkedin": ";eunji-kim-049222243/;;;cordelia-schmid-47985a9;zeynep-akata-36182045/?ppe=1", "or_profile": "~Jae_Myung_Kim1;~Eunji_Kim2;~Sungroh_Yoon1;~Jungwoo_Lee1;~Cordelia_Schmid1;~Zeynep_Akata1", "aff": "University of Tuebingen;NAVER;Seoul National University;Seoul National University;Inria;University of T\u00fcbingen", "aff_domain": "uni-tuebingen.de;navercorp.com;snu.ac.kr;snu.ac.kr;inria.fr;uni-tuebingen.de", "position": "PhD student;Intern;Full Professor;Full Professor;Researcher;Full Professor", "bibtex": "@misc{\nkim2022variational,\ntitle={Variational Perturbations for Visual Feature Attribution},\nauthor={Jae Myung Kim and Eunji Kim and Sungroh Yoon and Jungwoo Lee and Cordelia Schmid and Zeynep Akata},\nyear={2022},\nurl={https://openreview.net/forum?id=JDOpWxBqMw}\n}", "github": "", "project": "", "reviewers": "4QNm;osnS;zBLa;nBT4", "site": "https://openreview.net/forum?id=JDOpWxBqMw", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "112;86;107;133", "wc_summary_review": "53;17;23;105", "wc_main_review": "410;772;295;315", "wc_review": "575;875;425;553", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 109.5, 16.710774967068403 ], "wc_summary_review_avg": [ 49.5, 34.8245602987317 ], "wc_main_review_avg": [ 448.0, 192.04036034125744 ], "wc_review_avg": [ 607.0, 164.99090884045702 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iy0cW7RHsZ8J:scholar.google.com/&scioq=Variational+Perturbations+for+Visual+Feature+Attribution&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;3;4", "aff_unique_norm": "University of Tuebingen;NAVER Corporation;Seoul National University;INRIA;University of T\u00fcbingen", "aff_unique_dep": ";;;;", "aff_unique_url": 
"https://www.uni-tuebingen.de/;https://www.naver.com;https://www.snu.ac.kr;https://www.inria.fr;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;NAVER;SNU;Inria;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;2;0", "aff_country_unique": "Germany;South Korea;France" }, { "id": "JEoDctbwCmP", "title": "Enforcing physics-based algebraic constraints for inference of PDE models on unstructured grids", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data-driven neural network models have recently shown great success in modelling and learning complex PDE systems. Several works have proposed approaches to include specific physics-based constraints to avoid unrealistic modelling outcomes. While previous works focused on specific constraints and uniform spatial grids, we propose a novel approach for enforcing general pointwise, differential and integral constraints on unstructured spatial grids. The method is based on representing a black-box PDE model's output in terms of a function approximation and enforcing constraints directly on that function. We demonstrate applicability of our approach in learning PDE-driven systems and generating spatial fields with GANs, both on free-form spatial and temporal domains, and show how both kinds of models benefit from incorporation of physics-based constraints.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Valerii Iakovlev;Markus Heinonen;Harri L\u00e4hdesm\u00e4ki", "authorids": "~Valerii_Iakovlev1;~Markus_Heinonen1;~Harri_L\u00e4hdesm\u00e4ki1", "gender": ";M;M", "homepage": ";https://users.aalto.fi/~heinom10/;https://research.cs.aalto.fi/csb/", "dblp": ";22/7709;85/4466", "google_scholar": ";hFtfHZoAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "valerii-iakovlev-a12596190/;;", "or_profile": "~Valerii_Iakovlev1;~Markus_Heinonen1;~Harri_L\u00e4hdesm\u00e4ki1", "aff": "Aalto University;Aalto University;Aalto University", "aff_domain": "aalto.fi;aalto.fi;aalto.fi", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@misc{\niakovlev2022enforcing,\ntitle={Enforcing physics-based algebraic constraints for inference of {PDE} models on unstructured grids},\nauthor={Valerii Iakovlev and Markus Heinonen and Harri L{\\\"a}hdesm{\\\"a}ki},\nyear={2022},\nurl={https://openreview.net/forum?id=JEoDctbwCmP}\n}", "github": "", "project": "", "reviewers": "HR2b;3bQB;A9JW;vTXr", "site": "https://openreview.net/forum?id=JEoDctbwCmP", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "62;49;21;134", "wc_summary_review": "94;71;146;82", "wc_main_review": "223;539;492;429", "wc_review": "379;659;659;645", "wc_reply_reviewers": "64;82;184;527", "wc_reply_authors": "277;785;1377;1275", "reply_reviewers": "1;1;1;2", "reply_authors": "1;2;2;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.5, 41.692325432865935 ], "wc_summary_review_avg": [ 98.25, 28.74347752099596 ], "wc_main_review_avg": [ 420.75, 120.65731432449506 ], "wc_review_avg": [ 585.5, 119.35975033485953 ], "wc_reply_reviewers_avg": [ 214.25, 186.27449503353915 ], "wc_reply_authors_avg": [ 928.5, 437.68110537239323 ], 
"reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ai49b5Kb-uEJ:scholar.google.com/&scioq=Enforcing+physics-based+algebraic+constraints+for+inference+of+PDE+models+on+unstructured+grids&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Aalto University", "aff_unique_dep": "", "aff_unique_url": "https://www.aalto.fi", "aff_unique_abbr": "Aalto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Finland" }, { "title": "Universal Approximation Under Constraints is Possible with Transformers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6477", "id": "JGO8CvG5S9", "poster": "", "openreview": "https://openreview.net/forum?id=JGO8CvG5S9", "slides": "https://iclr.cc/virtual/2022/poster/6477", "video": "https://iclr.cc/virtual/2022/poster/6477", "author_site": "Anastasis Kratsios, Behnoosh Zamanlooy, Tianlin Liu, Ivan Dokmanic", "tldr": "", "abstract": "Many practical problems need the output of a machine learning model to satisfy a set of constraints, $K$. Nevertheless, there is no known guarantee that classical neural network architectures can exactly encode constraints while simultaneously achieving universality. We provide a quantitative constrained universal approximation theorem which guarantees that for any non-convex compact set $K$ and any continuous function $f:\\mathbb{R}^n\\rightarrow K$, there is a probabilistic transformer $\\hat{F}$ whose randomized outputs all lie in $K$ and whose expected output uniformly approximates $f$. Our second main result is a ``deep neural version'' of Berge's Maximum Theorem (1963). The result guarantees that given an objective function $L$, a constraint set $K$, and a family of soft constraint sets, there is a probabilistic transformer $\\hat{F}$ that approximately minimizes $L$ and whose outputs belong to $K$; moreover, $\\hat{F}$ approximately satisfies the soft constraints. Our results imply the first universal approximation theorem for classical transformers with exact convex constraint satisfaction. 
They also yield a chart-free universal approximation theorem for Riemannian manifold-valued functions subject to suitable geodesically convex constraints.", "keywords": "Constrained Universal Approximation;Probabilistic Attention;Transformer Networks;Geometric Deep Learning;Measurable Maximum Theorem;Non-Affine Random Projections;Optimal Transport.", "primary_area": "", "supplementary_material": "/attachment/e5c9d9dd1be5aa9fa528dee3a89e63f79de1c645.zip", "author": "Anastasis Kratsios;Behnoosh Zamanlooy;Tianlin Liu;Ivan Dokmani\u0107", "authorids": "~Anastasis_Kratsios1;~Behnoosh_Zamanlooy2;~Tianlin_Liu2;~Ivan_Dokmani\u01071", "gender": "Non-Binary;F;M;M", "homepage": "https://anastasiskratsios.github.io/;;http://www.tianlinliu.com;http://dokmanic.ece.illinois.edu", "dblp": ";;20/7667;52/8859", "google_scholar": "https://scholar.google.ca/citations?user=9D-bHFgAAAAJ;bns0iwUAAAAJ;;0SQnwL4AAAAJ", "orcid": "0000-0001-6791-3371;;;", "linkedin": "anastasiskratsios/;;;", "or_profile": "~Anastasis_Kratsios1;~Behnoosh_Zamanlooy2;~Tianlin_Liu2;~Ivan_Dokmanic1", "aff": "Universit\u00e4t Basel;McMaster University;University of Basel;University of Basel", "aff_domain": "unibas.ch;mcmaster.ca;unibas.ch;unibas.ch", "position": "Postdoc;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nkratsios2022universal,\ntitle={Universal Approximation Under Constraints is Possible with Transformers},\nauthor={Anastasis Kratsios and Behnoosh Zamanlooy and Tianlin Liu and Ivan Dokmani{\\'c}},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JGO8CvG5S9}\n}", "github": "", "project": "", "reviewers": "w3Aa;VnD6;tJ9y", "pdf_size": 0, "recommendation": "6;8;10", "confidence": "4;2;4", "correctness": "3;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "1;3;0", "wc_summary_paper": "30;43;41", "wc_summary_review": "529;25;79", "wc_main_review": "141;138;433", "wc_review": "700;206;553", "wc_reply_reviewers": "127;0;304", "wc_reply_authors": "1654;306;470", "reply_reviewers": "2;0;1", "reply_authors": "5;1;2", "recommendation_avg": [ 8.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 38.0, 5.715476066494082 ], "wc_summary_review_avg": [ 211.0, 225.93804460515275 ], "wc_main_review_avg": [ 237.33333333333334, 138.36264749643317 ], "wc_review_avg": [ 486.3333333333333, 207.11081306607073 ], "wc_reply_reviewers_avg": [ 143.66666666666666, 124.66577539788359 ], "wc_reply_authors_avg": [ 810.0, 600.5419774392683 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7575011029485054711&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=JGO8CvG5S9", "email": "unibas.ch;mcmaster.ca;unibas.ch;unibas.ch", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Basel;McMaster University", "aff_unique_dep": ";", "aff_unique_url": "https://www.unibas.ch;https://www.mcmaster.ca", "aff_unique_abbr": "UniBas;McMaster",
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Switzerland;Canada" }, { "id": "JHXjK94yH-y", "title": "Explore and Control with Adversarial Surprise", "track": "main", "status": "Reject", "tldr": "", "abstract": "Unsupervised reinforcement learning (RL) studies how to leverage environment statistics to learn useful behaviors without the cost of reward engineering. However, a central challenge in unsupervised RL is to extract behaviors that meaningfully affect the world and cover the range of possible outcomes, without getting distracted by inherently unpredictable, uncontrollable, and stochastic elements in the environment. To this end, we propose an unsupervised RL method designed for high-dimensional, stochastic environments based on an adversarial game between two policies (which we call Explore and Control) controlling a single body and competing over the amount of observation entropy the agent experiences. The Explore agent seeks out states that maximally surprise the Control agent, which in turn aims to minimize surprise, and thereby manipulate the environment to return to familiar and predictable states. The competition between these two policies drives them to seek out increasingly surprising parts of the environment while learning to gain mastery over them. We show formally that the resulting algorithm maximizes coverage of the underlying state in block MDPs with stochastic observations, providing theoretical backing to our hypothesis that this procedure avoids uncontrollable and stochastic distractions. Our experiments further demonstrate that Adversarial Surprise leads to the emergence of complex and meaningful skills, and outperforms state-of-the-art unsupervised reinforcement learning methods in terms of both exploration and zero-shot transfer to downstream tasks.", "keywords": "reinforcement learning;intrinsic motivation;exploration;multi-agent", "primary_area": "", "supplementary_material": "", "author": "Arnaud Fickinger;Natasha Jaques;Samyak Parajuli;Michael Chang;Nicholas Rhinehart;Glen Berseth;Stuart Russell;Sergey Levine", "authorids": "~Arnaud_Fickinger1;~Natasha_Jaques1;~Samyak_Parajuli1;~Michael_Chang1;~Nicholas_Rhinehart1;~Glen_Berseth1;~Stuart_Russell1;~Sergey_Levine1", "gender": ";F;M;M;M;M;M;M", "homepage": "https://www.linkedin.com/in/arnaudfickinger/;https://natashajaques.ai/;;http://mbchang.github.io/;https://leaf.utias.utoronto.ca/;http://fracturedplane.com/;https://people.eecs.berkeley.edu/~russell/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "236/4896;145/7732;https://dblp.uni-trier.de/pers/hd/p/Parajuli:Samyak;192/1567;153/2193;147/5478;;80/7594", "google_scholar": ";8iCb2TwAAAAJ;NACSmGwAAAAJ;vgfGtykAAAAJ;xUGZX_MAAAAJ;https://scholar.google.ca/citations?user=-WZcuuwAAAAJ;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;8R35rCwAAAAJ", "orcid": ";;;;;0000-0001-7351-8028;;", "linkedin": ";natashajaques;samyakparajuli/;mbchang;;glen-berseth-0523278b?trk=hp-identity-name;;", "or_profile": "~Arnaud_Fickinger1;~Natasha_Jaques1;~Samyak_Parajuli1;~Michael_Chang1;~Nicholas_Rhinehart1;~Glen_Berseth1;~Stuart_Russell1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Scale AI;University of California, Berkeley;University of California, Berkeley;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;University of California, Berkeley;Google", "aff_domain": 
"berkeley.edu;berkeley.edu;scale.com;berkeley.edu;berkeley.edu;mila.umontreal.ca;berkeley.edu;google.com", "position": "PhD student;Postdoc;Researcher;PhD student;Postdoc;Assistant Professor;Full Professor;Research Scientist", "bibtex": "@misc{\nfickinger2022explore,\ntitle={Explore and Control with Adversarial Surprise},\nauthor={Arnaud Fickinger and Natasha Jaques and Samyak Parajuli and Michael Chang and Nicholas Rhinehart and Glen Berseth and Stuart Russell and Sergey Levine},\nyear={2022},\nurl={https://openreview.net/forum?id=JHXjK94yH-y}\n}", "github": "", "project": "", "reviewers": "w5NP;gVM6;MoPi", "site": "https://openreview.net/forum?id=JHXjK94yH-y", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "3;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "110;64;88", "wc_summary_review": "25;145;130", "wc_main_review": "1096;785;697", "wc_review": "1231;994;915", "wc_reply_reviewers": "744;401;201", "wc_reply_authors": "604;306;683", "reply_reviewers": "2;1;1", "reply_authors": "2;1;3", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 87.33333333333333, 18.785337071473826 ], "wc_summary_review_avg": [ 100.0, 53.38539126015655 ], "wc_main_review_avg": [ 859.3333333333334, 171.1613923238013 ], "wc_review_avg": [ 1046.6666666666667, 134.27418052461002 ], "wc_reply_reviewers_avg": [ 448.6666666666667, 224.22657192125013 ], "wc_reply_authors_avg": [ 531.0, 162.33504448105674 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4561668000042614110&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0;0;2;0;3", "aff_unique_norm": "University of California, Berkeley;Scale AI;University of Montreal;Google", "aff_unique_dep": ";;Montreal Institute for Learning Algorithms;Google", "aff_unique_url": "https://www.berkeley.edu;https://scale.ai;https://www.mila.quebec;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Scale AI;MILA;Google", "aff_campus_unique_index": "0;0;0;0;2;0;3", "aff_campus_unique": "Berkeley;;Montreal;Mountain View", "aff_country_unique_index": "0;0;0;0;0;1;0;0", "aff_country_unique": "United States;Canada" }, { "title": "Learning Discrete Structured Variational Auto-Encoder using Natural Evolution Strategies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5972", "id": "JJCjv4dAbyL", "poster": "", "openreview": "https://openreview.net/forum?id=JJCjv4dAbyL", "slides": "https://iclr.cc/virtual/2022/poster/5972", "video": "https://iclr.cc/virtual/2022/poster/5972", "author_site": "Alon Berliner, Guy Rotman, Yossi Adi, Roi Reichart, Tamir Hazan", "tldr": "", "abstract": "Discrete variational auto-encoders (VAEs) are able to represent semantic latent spaces in generative learning. In many real-life settings, the discrete latent space consists of high-dimensional structures, and propagating gradients through the relevant structures often requires enumerating over an exponentially large latent space. 
Recently, various approaches were devised to propagate approximate gradients without enumerating over the space of possible structures. In this work, we use Natural Evolution Strategies (NES), a class of gradient-free black-box optimization algorithms, to learn discrete structured VAEs. The NES algorithms are computationally appealing as they estimate gradients with forward pass evaluations only, and thus they do not require propagating gradients through their discrete structures. We demonstrate empirically that optimizing discrete structured VAEs using NES is as effective as gradient-based approximations. Lastly, we prove that NES converges for non-Lipschitz functions such as those that appear in discrete structured VAEs.", "keywords": "structured prediction;derivative-free optimization;variational autoencoder", "primary_area": "", "supplementary_material": "/attachment/bced5440302053278159398e732d35a70a01ed77.zip", "author": "Alon Berliner;Guy Rotman;Yossi Adi;Roi Reichart;Tamir Hazan", "authorids": "~Alon_Berliner1;~Guy_Rotman1;~Yossi_Adi1;~Roi_Reichart1;~Tamir_Hazan1", "gender": "M;M;M;M;", "homepage": ";;http://adiyoss.github.io/;https://roireichart.com/;https://ie.technion.ac.il/~tamir.hazan/tamir.html", "dblp": ";222/9410;171/0957.html;96/5429;36/5041", "google_scholar": ";xT_SBbYAAAAJ;https://scholar.google.co.il/citations?user=4W-HuYYAAAAJ;https://scholar.google.co.il/citations?user=xXJIsh4AAAAJ;fqi186AAAAAJ", "orcid": ";;0000-0003-2237-3898;;", "linkedin": "alon-berliner-113365150/;;yossi-adi-31a32858?trk=nav_responsive_tab_profile_pic;roi-reichart-ba2a8a7/;", "or_profile": "~Alon_Berliner1;~Guy_Rotman1;~Yossi_Adi1;~Roi_Reichart1;~Tamir_Hazan1", "aff": "Technion, Technion;Technion, Technion;Meta;Technion, Israel Institute of Technology;Technion", "aff_domain": "technion.ac.il;technion.ac.il;meta.com;technion.ac.il;technion.ac.il", "position": "MS student;PhD student;Research Scientist;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nberliner2022learning,\ntitle={Learning Discrete Structured Variational Auto-Encoder using Natural Evolution Strategies},\nauthor={Alon Berliner and Guy Rotman and Yossi Adi and Roi Reichart and Tamir Hazan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JJCjv4dAbyL}\n}", "github": "", "project": "", "reviewers": "Fqt8;qpkv;ANuT;12Bq", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;4;3", "correctness": "3;4;4;3", "technical_novelty": "3;4;3;3", "empirical_novelty": "3;4;2;3", "wc_summary_paper": "45;92;54;42", "wc_summary_review": "21;14;89;32", "wc_main_review": "194;131;161;605", "wc_review": "260;237;304;679", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "464;66;618;32", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 58.25, 19.97967717456916 ], "wc_summary_review_avg": [ 39.0, 29.57194616524249 ], "wc_main_review_avg": [ 272.75, 193.11444145894424 ], "wc_review_avg": [ 370.0, 180.01805465008226 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 295.0, 252.23996511258878 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness":
0.5773502691896257, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14166545846763510753&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=JJCjv4dAbyL", "email": "technion.ac.il;technion.ac.il;meta.com;technion.ac.il;technion.ac.il", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Technion - Israel Institute of Technology;Meta;Israel Institute of Technology", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.technion.ac.il/en/;https://meta.com;https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion;Meta;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Israel;United States" }, { "title": "Blaschke Product Neural Networks (BPNN): A Physics-Infused Neural Network for Phase Retrieval of Meromorphic Functions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7089", "id": "JJxiD-kg-oK", "poster": "", "openreview": "https://openreview.net/forum?id=JJxiD-kg-oK", "slides": "https://iclr.cc/virtual/2022/poster/7089", "video": "https://iclr.cc/virtual/2022/poster/7089", "author_site": "Juncheng Dong, Simiao Ren, Yang Deng, Omar Khatib, Jordan Malof, Mohammadreza Soltani, Willie Padilla, VAHID TAROKH", "tldr": "", "abstract": "Numerous physical systems are described by ordinary or partial differential equations whose solutions are given by holomorphic or meromorphic functions in the complex domain. In many cases, only the magnitude of these functions are observed on various points on the purely imaginary $j\\omega$-axis since coherent measurement of their phases is often expensive. However, it is desirable to retrieve the lost phases from the magnitudes when possible. To this end, we propose a physics-infused deep neural network based on the Blaschke products for phase retrieval. Inspired by the Helson and Sarason Theorem, we recover coefficients of a rational function of Blaschke products using a Blaschke Product Neural Network (BPNN), based upon the magnitude observations as input. The resulting rational function is then used for phase retrieval. We compare the BPNN to conventional deep neural networks (NNs) on several phase retrieval problems, comprising both synthetic and contemporary real-world problems (e.g., metamaterials for which data collection requires substantial expertise and is time consuming). On each phase retrieval problem, we compare against a population of conventional NNs of varying size and hyperparameter settings. Even without any hyper-parameter search, we find that BPNNs consistently outperform the population of optimized NNs in scarce data scenarios, and do so despite being much smaller models. 
The results can in turn be applied to calculate the refractive index of metamaterials, which is an important problem in emerging areas of material science.", "keywords": "Blaschke Product;Neural Network;Phase Retrieval;Metamaterial;Meromorphic Functions", "primary_area": "", "supplementary_material": "/attachment/b011a4845144730cb44930f023082051046c0eaf.zip", "author": "Juncheng Dong;Simiao Ren;Yang Deng;Omar Khatib;Jordan Malof;Mohammadreza Soltani;Willie Padilla;Vahid Tarokh", "authorids": "~Juncheng_Dong1;~Simiao_Ren2;~Yang_Deng3;~Omar_Khatib1;~Jordan_Malof1;~Mohammadreza_Soltani1;~Willie_Padilla1;~Vahid_Tarokh1", "gender": ";M;M;;M;M;;", "homepage": ";http://amll.pratt.duke.edu/people/current-members;;https://padillalab.pratt.duke.edu/;https://jmalof.com/;https://mrezasoltani.github.io/;https://padillalab.pratt.duke.edu/;", "dblp": ";https://dblp.uni-trier.de/pid/275/3437;;;77/8580;150/5633;275/3841;", "google_scholar": ";n4B3bH4AAAAJ;dnh1U0sAAAAJ;;4hX2HZ8AAAAJ;;5mFPlG0AAAAJ;", "orcid": ";0000-0002-7113-9208;0000-0003-2249-6556;;0000-0002-7851-4920;;0000-0001-7734-8847;", "linkedin": ";simiao-ren-ab8043103/;;;;mohammadreza-soltani-99bb1ba0/;williepadilla/;", "or_profile": "~Juncheng_Dong1;~Simiao_Ren2;~Yang_Deng3;~Omar_Khatib1;~Jordan_Malof1;~Mohammadreza_Soltani1;~Willie_Padilla1;~Vahid_Tarokh1", "aff": ";Duke University;Duke University;Duke University;Duke University;3M;Duke University;", "aff_domain": ";duke.edu;duke.edu;duke.edu;duke.edu;mmm.com;ece.duke.edu;", "position": ";PhD student;PhD student;Postdoc;Assistant Research Professor;Researcher;Full Professor;", "bibtex": "@inproceedings{\ndong2022blaschke,\ntitle={Blaschke Product Neural Networks ({BPNN}): A Physics-Infused Neural Network for Phase Retrieval of Meromorphic Functions},\nauthor={Juncheng Dong and Simiao Ren and Yang Deng and Omar Khatib and Jordan Malof and Mohammadreza Soltani and Willie Padilla and Vahid Tarokh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JJxiD-kg-oK}\n}", "github": "", "project": "", "reviewers": "kQWk;64Gx;cH7K;9QvZ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "3;4;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "93;110;123;15", "wc_summary_review": "46;89;50;67", "wc_main_review": "148;251;364;247", "wc_review": "287;450;537;329", "wc_reply_reviewers": "32;34;0;25", "wc_reply_authors": "777;772;549;180", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.25, 41.930746475587576 ], "wc_summary_review_avg": [ 63.0, 16.95582495781317 ], "wc_main_review_avg": [ 252.5, 76.46077425713135 ], "wc_review_avg": [ 400.75, 98.83920021934617 ], "wc_reply_reviewers_avg": [ 22.75, 13.5531361684298 ], "wc_reply_authors_avg": [ 569.5, 242.99845678522323 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:xJIQ0nvd1lMJ:scholar.google.com/&scioq=Blaschke+Product+Neural+Networks+(BPNN):+A+Physics-Infused+Neural+Network+for+Phase+Retrieval+of+Meromorphic+Functions&hl=en&as_sdt=0,5", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=JJxiD-kg-oK", "email": ";duke.edu;duke.edu;duke.edu;duke.edu;mmm.com;ece.duke.edu;", "author_num": 8, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Duke University;3M Company", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://www.3m.com", "aff_unique_abbr": "Duke;3M", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "JKRVarUs3A1", "title": "Distributed Optimal Margin Distribution Machine", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimal margin Distribution Machine (ODM), a newly proposed statistical learning framework rooting in the novel margin theory, demonstrates better generalization performance than the traditional large margin based counterparts. Nonetheless, the same with other kernel methods, it suffers from the ubiquitous scalability problem in terms of both computation time and memory. In this paper, we propose a Distributed solver for ODM (DiODM), which leads to nearly ten times speedup for training kernel ODM. It exploits a novel data partition method to make the local ODM trained on each partition has a solution close to the global one. When linear kernel used, we extend a communication efficient distributed SVRG method to further accelerate the training. Extensive empirical studies validate the superiority of our proposed method compared to other off-the-shelf distributed quadratic programming solvers for kernel methods.", "keywords": "Distributed machine learning;margin distribution;classification;kernel learning", "primary_area": "", "supplementary_material": "", "author": "Yilin Wang;nan cao;Teng Zhang;Hai Jin", "authorids": "~Yilin_Wang6;~nan_cao3;~Teng_Zhang3;~Hai_Jin1", "gender": "M;M;M;M", "homepage": ";;;http://www.linkedin.com/in/jinhust", "dblp": ";;38/5156-1;98/4156", "google_scholar": ";EiaDNawAAAAJ;https://scholar.google.com.hk/citations?user=gysI2pYAAAAJ;", "orcid": ";;;0000-0002-3934-7605", "linkedin": "%E5%BF%86%E9%BA%9F-%E7%8E%8B-68640616a/;;;jinhust", "or_profile": "~Yilin_Wang6;~nan_cao3;~Teng_Zhang3;~Hai_Jin1", "aff": "Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology", "aff_domain": "hust.edu.cn;hust.edu.cn;hust.edu.cn;hust.edu.cn", "position": "MS student;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nwang2022distributed,\ntitle={Distributed Optimal Margin Distribution Machine},\nauthor={Yilin Wang and nan cao and Teng Zhang and Hai Jin},\nyear={2022},\nurl={https://openreview.net/forum?id=JKRVarUs3A1}\n}", "github": "", "project": "", "reviewers": "AGhv;cre7;NozY;jk73", "site": "https://openreview.net/forum?id=JKRVarUs3A1", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "4;3;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "103;37;82;102", "wc_summary_review": "29;21;77;41", "wc_main_review": "389;75;102;255", "wc_review": "521;133;261;398", "wc_reply_reviewers": "0;0;0;88", "wc_reply_authors": "650;384;96;301", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.5, 
2.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 81.0, 26.74883175019051 ], "wc_summary_review_avg": [ 42.0, 21.42428528562855 ], "wc_main_review_avg": [ 205.25, 126.35738007730296 ], "wc_review_avg": [ 328.25, 145.48432046100362 ], "wc_reply_reviewers_avg": [ 22.0, 38.1051177665153 ], "wc_reply_authors_avg": [ 357.75, 198.64085053180779 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vRZR1kXa0lkJ:scholar.google.com/&scioq=Distributed+Optimal+Margin+Distribution+Machine&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Huazhong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.hust.edu.cn", "aff_unique_abbr": "HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "JLbXkHkLCG6", "title": "Imitation Learning from Pixel Observations for Continuous Control", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study imitation learning using only visual observations for controlling dynamical systems with continuous states and actions. This setting is attractive due to the large amount of video data available from which agents could learn from. However, it is challenging due to $i)$ not observing the actions and $ii)$ the high-dimensional visual space. In this setting, we explore recipes for imitation learning based on adversarial learning and optimal transport. A key feature of our methods is to use representations from the RL encoder to compute imitation rewards. These recipes enable us to scale these methods to attain expert-level performance on visual continuous control tasks in the DeepMind control suite. We investigate the tradeoffs of these approaches and present a comprehensive evaluation of the key design choices. 
To encourage reproducible research in this area, we provide an easy-to-use implementation for benchmarking visual imitation learning, including our methods.", "keywords": "imitation learning;optimal transport;GAIL;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "Samuel Cohen;Brandon Amos;Marc Peter Deisenroth;Mikael Henaff;Eugene Vinitsky;Denis Yarats", "authorids": "~Samuel_Cohen1;~Brandon_Amos1;~Marc_Peter_Deisenroth1;~Mikael_Henaff1;~Eugene_Vinitsky1;~Denis_Yarats1", "gender": "M;;M;M;M;M", "homepage": ";http://bamos.github.io;http://www.mikaelhenaff.com;https://eugenevinitsky.github.io;http://denis-yarats.info/;https://deisenroth.cc", "dblp": ";133/4801.html;86/10571;207/7772;200/8142;76/5043", "google_scholar": "CmdjfTsAAAAJ;d8gdZR4AAAAJ;bX__wkYAAAAJ;6dr5fLEAAAAJ;7kaXqgMAAAAJ;https://scholar.google.co.uk/citations?user=GDabimYAAAAJ", "orcid": ";;;;;", "linkedin": ";bdamos;;;;", "or_profile": "~Samuel_Cohen1;~Brandon_Amos1;~Mikael_Henaff1;~Eugene_Vinitsky1;~Denis_Yarats1;~Marc_Deisenroth1", "aff": "University College London;Meta;Meta;New York University;New York University;University College London", "aff_domain": "ucl.ac.uk;meta.com;meta.com;nyu.edu;cs.nyu.edu;ucl.ac.uk", "position": "PhD student;Research Scientist;Researcher;Assistant Professor;PhD student;Full Professor", "bibtex": "@misc{\ncohen2022imitation,\ntitle={Imitation Learning from Pixel Observations for Continuous Control},\nauthor={Samuel Cohen and Brandon Amos and Marc Peter Deisenroth and Mikael Henaff and Eugene Vinitsky and Denis Yarats},\nyear={2022},\nurl={https://openreview.net/forum?id=JLbXkHkLCG6}\n}", "github": "", "project": "", "reviewers": "omV5;JN38;wukE;R6AN", "site": "https://openreview.net/forum?id=JLbXkHkLCG6", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;3", "correctness": "2;3;4;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "67;70;105;77", "wc_summary_review": "73;29;44;89", "wc_main_review": "475;157;222;89", "wc_review": "615;256;371;255", "wc_reply_reviewers": "364;45;96;0", "wc_reply_authors": "483;440;662;360", "reply_reviewers": "1;1;2;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 79.75, 15.022899187573616 ], "wc_summary_review_avg": [ 58.75, 23.562417108607512 ], "wc_main_review_avg": [ 235.75, 145.91671425851118 ], "wc_review_avg": [ 374.25, 146.7776805239816 ], "wc_reply_reviewers_avg": [ 126.25, 141.40434045672006 ], "wc_reply_authors_avg": [ 486.25, 110.6534567919141 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11624850855740701356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;1;2;2;0", "aff_unique_norm": "University College London;Meta;New York University", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.ucl.ac.uk;https://meta.com;https://www.nyu.edu", "aff_unique_abbr": "UCL;Meta;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0", 
"aff_country_unique": "United Kingdom;United States" }, { "id": "JLxrtqUaVe", "title": "Integrating Attention Feedback into the Recurrent Neural Network", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, an improved long short-term memory (LSTM) structure, called hidden attention long short-term memory (HA-LSTM), is proposed to reduce the long-term memory loss when updating the LSTM neural network at each time step. The HA-LSTM structure is different from the standard LSTM structure because a scaled dot-product attention-based sliding controller is introduced to the LSTM structure. The design of the sliding attention controller provides the traditional attention mechanism with a time-varying property, which makes the attention mechanism more suitable for time-series analysis tasks. Traditionally, the inputs to the attention mechanism are the hidden state vectors at all time steps. In contrast, the length of the inputs to the sliding attention controller is range-limited, which means it uses less memory than the traditional attention mechanism. In addition, the HA-LSTM structure integrates the attention mechanism into the standard LSTM structure, which provides the structure with advantages over both the standard LSTM structure and the attention mechanism. Different from most works, which perform unilateral computations on the attention mechanism after collecting the hidden state vectors at all time steps, the information of the gate vectors and the cell state vector of the HA-LSTM structure are based on feedback from the attention mechanism. In other words, the HA-LSTM structure's feedback property helps the cell state vector retain valuable information. To evaluate the performance of the HA-LSTM structure, four text benchmark datasets are used in experiments for text classification tasks. The model presented here is compared with two classic models and with a state-of-the-art model presented in recent years. 
Most of the results indicate that the HA-LSTM structure is superior to the other structures.", "keywords": "recurrent neural network", "primary_area": "", "supplementary_material": "", "author": "Heng Li", "authorids": "~Heng_Li5", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "https://www.linkedin.com/public-profile/settings?trk=d_flagship3_profile_self_view_public_profile&lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base%3Bk4R47AJBT5C28ZXC03kNaQ%3D%3D", "or_profile": "~HENG_LI4", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nli2022integrating,\ntitle={Integrating Attention Feedback into the Recurrent Neural Network},\nauthor={Heng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=JLxrtqUaVe}\n}", "github": "", "project": "", "reviewers": "ho5V;Qk25;coU8;DVAC", "site": "https://openreview.net/forum?id=JLxrtqUaVe", "pdf_size": 0, "recommendation": "1;1;3;5", "confidence": "4;4;3;4", "correctness": "3;2;3;2", "technical_novelty": "2;1;2;3", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "12;96;72;78", "wc_summary_review": "27;5;34;59", "wc_main_review": "104;5;66;704", "wc_review": "143;106;172;841", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 64.5, 31.571347769773784 ], "wc_summary_review_avg": [ 31.25, 19.266226926930972 ], "wc_main_review_avg": [ 219.75, 281.8034554436833 ], "wc_review_avg": [ 315.5, 304.29796253014905 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vZbeoItQ8CQJ:scholar.google.com/&scioq=Integrating+Attention+Feedback+into+the+Recurrent+Neural+Network&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Who Is the Strongest Enemy? Towards Optimal and Efficient Evasion Attacks in Deep RL", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7130", "id": "JM2kFbJvvI", "poster": "", "openreview": "https://openreview.net/forum?id=JM2kFbJvvI", "slides": "https://iclr.cc/virtual/2022/poster/7130", "video": "https://iclr.cc/virtual/2022/poster/7130", "author_site": "Yanchao Sun, Ruijie Zheng, Yongyuan Liang, Furong Huang", "tldr": "", "abstract": "Evaluating the worst-case performance of a reinforcement learning (RL) agent under the strongest/optimal adversarial perturbations on state observations (within some constraints) is crucial for understanding the robustness of RL agents. However, finding the optimal adversary is challenging, in terms of both whether we can find the optimal attack and how efficiently we can find it. Existing works on adversarial RL either use heuristics-based methods that may not find the strongest adversary, or directly train an RL-based adversary by treating the agent as a part of the environment, which can find the optimal adversary but may become intractable in a large state space. 
\nThis paper introduces a novel attack method that finds optimal attacks through collaboration between a designed function named \"actor\" and an RL-based learner named \"director\". The actor crafts state perturbations for a given policy perturbation direction, and the director learns to propose the best policy perturbation directions. Our proposed algorithm, PA-AD, is theoretically optimal and significantly more efficient than prior RL-based works in environments with large state spaces. Empirical results show that our proposed PA-AD universally outperforms state-of-the-art attack methods in various Atari and MuJoCo environments. By applying PA-AD to adversarial training, we achieve state-of-the-art empirical robustness in multiple tasks under strong adversaries.", "keywords": "adversarial RL;robustness of RL;evasion attack;optimal attack;observation perturbation", "primary_area": "", "supplementary_material": "/attachment/63f167f8a582b919bb82474e697ab07a0d31207c.zip", "author": "Yanchao Sun;Ruijie Zheng;Yongyuan Liang;Furong Huang", "authorids": "~Yanchao_Sun1;~Ruijie_Zheng1;~Yongyuan_Liang1;~Furong_Huang1", "gender": "F;;F;F", "homepage": "https://ycsun2017.github.io/home/index.html;http://www.ruijiezheng.com;https://cheryyunl.github.io/;https://furong-huang.com", "dblp": "132/6840;294/8474;238/4104;72/8513", "google_scholar": "bloBY_QAAAAJ;;GQToORIAAAAJ;13yyuCcAAAAJ", "orcid": "0000-0002-1137-9939;;;", "linkedin": ";;https://linkedin.com/in/yongyuan-l-31462a17a;", "or_profile": "~Yanchao_Sun1;~Ruijie_Zheng1;~Yongyuan_Liang1;~Furong_Huang1", "aff": "University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;;University of Maryland", "aff_domain": "umd.edu;cs.umd.edu;;cs.umd.edu", "position": "PhD student;Undergrad student;;Assistant Professor", "bibtex": "@inproceedings{\nsun2022who,\ntitle={Who Is the Strongest Enemy? 
Towards Optimal and Efficient Evasion Attacks in Deep {RL}},\nauthor={Yanchao Sun and Ruijie Zheng and Yongyuan Liang and Furong Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JM2kFbJvvI}\n}", "github": "", "project": "", "reviewers": "RTbW;mGyi;73ny;pYuF", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "2;3;4;3", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "48;50;207;105", "wc_summary_review": "24;21;53;28", "wc_main_review": "288;180;499;342", "wc_review": "360;251;759;475", "wc_reply_reviewers": "0;0;0;15", "wc_reply_authors": "2042;755;1381;784", "reply_reviewers": "0;0;0;1", "reply_authors": "4;1;2;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.5, 64.52325162296147 ], "wc_summary_review_avg": [ 31.5, 12.658988901172163 ], "wc_main_review_avg": [ 327.25, 115.04211185474648 ], "wc_review_avg": [ 461.25, 189.2754276180614 ], "wc_reply_reviewers_avg": [ 3.75, 6.49519052838329 ], "wc_reply_authors_avg": [ 1240.5, 525.8909107410014 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8638684255813602, "corr_recommendation_correctness": 0.5183210553488161, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16507433832957753266&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=JM2kFbJvvI", "email": "umd.edu;cs.umd.edu;;cs.umd.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Maryland;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu", "aff_unique_abbr": "UMD;UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "JMri406Cb-", "title": "Decoupling Strategy and Surface Realization for Task-oriented Dialogues", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Task-oriented dialogue systems assist users in completing various tasks by generating appropriate responses. The key lies in effective strategy learning and surface realization, which are largely mixed together in cutting-edge methods. They thus face two problems: a) the learning of high-level strategy could easily be misled by the detailed word sequence optimization, and b) directly emphasizing the agent's goal through reinforcement learning (RL) also leads to corrupted solutions like ungrammatical or repetitive responses. In this work, we propose to decouple the strategy learning and surface realization in a general framework, called DSSR. The core is to construct a latent content space for strategy optimization and disentangle the surface style from it. Specifically, we optimize the latent content distribution for strategy towards task completion, and assume that such distribution is shared across different surface style realizations. 
By further constructing an encoder-decoder scheme for the surface part, it not only facilitates decoupled optimization via RL for both strategy and surface asynchronously, but also supports controllable surface style transfer of responses. We test DSSR on the multi-domain dialogue datasets MultiWoz 2.0 and MultiWoz 2.1 in comparison with methods that mix strategy and surface realization at different levels, showing improvements in performance across various evaluation metrics. Finally, we demonstrate the semantic meanings of latent content distributions to show the disentangling effect of DSSR, and show that it can do effective surface style transfer as a by-product.", "keywords": "Strategy Optimization;Surface Realization;Task-oriented Dialogue", "primary_area": "", "supplementary_material": "/attachment/179ce77c240122fdaeaa7032720d7f6525e40cba.zip", "author": "Chenchen Ye;Lizi Liao;Fuli Feng;Wei Ji;Tat-Seng Chua", "authorids": "e0261968@u.nus.edu;~Lizi_Liao1;~Fuli_Feng1;jiwei@nus.edu.sg;~Tat-Seng_Chua2", "gender": ";F;M;;", "homepage": ";https://liziliao.github.io/;https://fulifeng.github.io/;;", "dblp": ";149/1249;183/9198;;", "google_scholar": ";https://scholar.google.com.sg/citations?user=W2b08EUAAAAJ;https://scholar.google.com.sg/citations?user=QePM4u8AAAAJ;;", "orcid": ";;0000-0002-5828-9842;;", "linkedin": ";;;;", "or_profile": "e0261968@u.nus.edu;~Lizi_Liao1;~Fuli_Feng1;jiwei@nus.edu.sg;~Tat-Seng_Chua2", "aff": ";Singapore Management University;University of Science and Technology of China;;", "aff_domain": ";smu.edu.sg;ustc.edu.cn;;", "position": ";Assistant Professor;Full Professor;;", "bibtex": "@misc{\nye2022decoupling,\ntitle={Decoupling Strategy and Surface Realization for Task-oriented Dialogues},\nauthor={Chenchen Ye and Lizi Liao and Fuli Feng and Wei Ji and Tat-Seng Chua},\nyear={2022},\nurl={https://openreview.net/forum?id=JMri406Cb-}\n}", "github": "", "project": "", "reviewers": "KarD;eqmi;pvn8;SvPG;9zrd;gxp5", "site": "https://openreview.net/forum?id=JMri406Cb-", "pdf_size": 0, "recommendation": "3;3;5;5;5;5", "confidence": "3;3;4;4;4;3", "correctness": "2;3;3;3;2;3", "technical_novelty": "3;2;2;3;2;2", "empirical_novelty": "3;2;2;3;2;2", "wc_summary_paper": "41;49;59;222;152;64", "wc_summary_review": "40;29;82;71;300;15", "wc_main_review": "468;315;209;670;63;95", "wc_review": "549;393;350;963;515;174", "wc_reply_reviewers": "0;0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0;0", "reply_reviewers": "0;0;0;0;0;0", "reply_authors": "0;0;0;0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 97.83333333333333, 66.60684816309973 ], "wc_summary_review_avg": [ 89.5, 96.9273783131818 ], "wc_main_review_avg": [ 303.3333333333333, 212.82126042500755 ], "wc_review_avg": [ 490.6666666666667, 243.82279539771412 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7071067811865476, "corr_recommendation_correctness": 0.2500000000000001, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:ccAG1QtogGAJ:scholar.google.com/&scioq=Decoupling+Strategy+and+Surface+Realization+for+Task-oriented+Dialogues&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Singapore Management University;University of Science and Technology of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.smu.edu.sg;http://www.ustc.edu.cn", "aff_unique_abbr": "SMU;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Singapore;China" }, { "title": "Neural Processes with Stochastic Attention: Paying more attention to the context dataset", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6815", "id": "JPkQwEdYn8", "poster": "", "openreview": "https://openreview.net/forum?id=JPkQwEdYn8", "slides": "https://iclr.cc/virtual/2022/poster/6815", "video": "https://iclr.cc/virtual/2022/poster/6815", "author_site": "Mingyu Kim, Kyeong Ryeol Go, Se-Young Yun", "tldr": "", "abstract": "Neural processes (NPs) aim to stochastically complete unseen data points based on a given context dataset. NPs essentially leverage a given dataset as a context representation to derive a suitable identifier for a novel task. To improve the prediction accuracy, many variants of NPs have investigated context embedding approaches that generally design novel network architectures and aggregation functions satisfying permutation invariant. In this work, we propose a stochastic attention mechanism for NPs to capture appropriate context information. From the perspective of information theory, we demonstrate that the proposed method encourages context embedding to be differentiated from a target dataset, allowing NPs to consider features in a target dataset and context embedding independently. We observe that the proposed method can appropriately capture context embedding even under noisy data sets and restricted task distributions, where typical NPs suffer from a lack of context embeddings. We empirically show that our approach substantially outperforms conventional NPs in various domains through 1D regression, predator-prey model, and image completion. 
Moreover, the proposed method is also validated by MovieLens-10k dataset, a real-world problem.", "keywords": "neural processes;stochastic attention;variational inference;information theory", "primary_area": "", "supplementary_material": "", "author": "Mingyu Kim;Kyeong Ryeol Go;Se-Young Yun", "authorids": "~Mingyu_Kim2;~Kyeong_Ryeol_Go1;~Se-Young_Yun1", "gender": ";;M", "homepage": ";;https://fbsqkd.github.io", "dblp": ";;23/8862", "google_scholar": ";;X_IAjb8AAAAJ", "orcid": ";;", "linkedin": ";;seyoung-yun-395130ab/", "or_profile": "~Mingyu_Kim2;~Kyeong_Ryeol_Go1;~Se-Young_Yun1", "aff": ";;KAIST", "aff_domain": ";;kaist.ac.kr", "position": ";;Assistant Professor", "bibtex": "@inproceedings{\nkim2022neural,\ntitle={Neural Processes with Stochastic Attention: Paying more attention to the context dataset},\nauthor={Mingyu Kim and Kyeong Ryeol Go and Se-Young Yun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JPkQwEdYn8}\n}", "github": "", "project": "", "reviewers": "TBTA;DEA4;uH8j;fqvy", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;2;2;4", "correctness": "3;4;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "45;120;46;231", "wc_summary_review": "220;57;24;61", "wc_main_review": "215;176;104;892", "wc_review": "480;353;174;1184", "wc_reply_reviewers": "0;77;0;0", "wc_reply_authors": "3250;832;1377;4146", "reply_reviewers": "0;1;0;0", "reply_authors": "5;2;2;7", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 110.5, 75.92924337829267 ], "wc_summary_review_avg": [ 90.5, 76.13310712167211 ], "wc_main_review_avg": [ 346.75, 317.3085052437139 ], "wc_review_avg": [ 547.75, 383.0863969132812 ], "wc_reply_reviewers_avg": [ 19.25, 33.34197804570089 ], "wc_reply_authors_avg": [ 2401.25, 1348.7181645918467 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.0, 2.1213203435596424 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.2294157338705618, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4366830755369002835&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=JPkQwEdYn8", "email": ";;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "JQ1RLAEn-BO", "title": "Kernel Density Decision Trees", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose kernel density decision trees (KDDTs), a novel fuzzy decision tree (FDT) formalism based on kernel density estimation that achieves state-of-the-art prediction performance often matching or exceeding that of conventional tree ensembles. Ensembles of KDDTs achieve even better generalization. FDTs address the sensitivity and tendency to overfitting of decision trees by representing uncertainty through fuzzy partitions. 
However, compared to conventional, crisp decision trees, FDTs are generally complex to apply, sensitive to design choices, slow to fit and make predictions, and difficult to interpret. Moreover, finding the optimal threshold for a given fuzzy split is challenging, resulting in methods that discretize data, settle for near-optimal thresholds, or fuzzify crisp trees. Our KDDTs address these shortcomings using a fast algorithm for finding optimal partitions for FDTs with piecewise-linear splitting functions or KDDTs with piecewise-constant fitting kernels. Prediction can take place with or without fuzziness; without it, KDDTs are identical to standard decision trees, but with a more robust fitting algorithm. Using KDDTs simplifies the process of fitting a model, grounds design choices in the well-studied theory of density estimation, supports optional incorporation of expert knowledge about uncertainty in the data, and enables interpretation in the context of kernels. We demonstrate prediction performance against conventional decision trees and tree ensembles on 12 publicly available datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jack Henry Good;Kyle Miller;Artur Dubrawski", "authorids": "~Jack_Henry_Good1;~Kyle_Miller1;~Artur_Dubrawski2", "gender": "M;;M", "homepage": "https://www.ri.cmu.edu/ri-people/jack-henry-good/;;https://www.autonlab.org", "dblp": "221/2759.html;92/11514;76/48", "google_scholar": ";;O3gezzcAAAAJ", "orcid": "0000-0003-1886-9217;;0000-0002-2372-0831", "linkedin": ";;artur-dubrawski-33a2a87/", "or_profile": "~Jack_Henry_Good1;~Kyle_Miller1;~Artur_Dubrawski2", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;andrew;cmu.edu", "position": "PhD student;Project scientist;Research Professor", "bibtex": "@misc{\ngood2022kernel,\ntitle={Kernel Density Decision Trees},\nauthor={Jack Henry Good and Kyle Miller and Artur Dubrawski},\nyear={2022},\nurl={https://openreview.net/forum?id=JQ1RLAEn-BO}\n}", "github": "", "project": "", "reviewers": "aDpD;jz39;a1om;L6mQ", "site": "https://openreview.net/forum?id=JQ1RLAEn-BO", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "2;2;4;2", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "125;83;56;21", "wc_summary_review": "69;59;23;21", "wc_main_review": "561;619;90;116", "wc_review": "755;761;169;158", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "271;677;140;241", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 71.25, 38.028772002261654 ], "wc_summary_review_avg": [ 43.0, 21.307275752662516 ], "wc_main_review_avg": [ 346.5, 244.53476235496663 ], "wc_review_avg": [ 460.75, 297.2830091007557 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 332.25, 204.8723688055566 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17069646896618366034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", 
"aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "JRrjhY3sJy_", "title": "Molecular Graph Generation via Geometric Scattering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph neural networks (GNNs) have been used extensively for addressing problems in drug design and discovery. Both ligand and target molecules are represented as graphs with node and edge features encoding information about atomic elements and bonds respectively. Although existing deep learning models perform remarkably well at predicting physicochemical properties and binding affinities, the generation of new molecules with optimized properties remains challenging. Inherently, most GNNs perform poorly in whole-graph representation due to the limitations of the message-passing paradigm. Furthermore, step-by-step graph generation frameworks that use reinforcement learning or other sequential processing can be slow and result in a high proportion of invalid molecules with substantial post-processing needed in order to satisfy the principles of stoichiometry. To address these issues, we propose a representation-first approach to molecular graph generation. We guide the latent representation of an autoencoder by capturing graph structure information with the geometric scattering transform and apply penalties that structure the representation also by molecular properties. We show that this highly structured latent space can be directly used for molecular graph generation by the use of a GAN. We demonstrate that our architecture learns meaningful representations of drug datasets and provides a platform for goal-directed drug synthesis.", "keywords": "geometric scattering;drug discovery;drug design;graph generation;autoencoder", "primary_area": "", "supplementary_material": "", "author": "Dhananjay Bhaskar;Jackson David Grady;Michael Perlmutter;Smita Krishnaswamy", "authorids": "~Dhananjay_Bhaskar1;jackson.grady@yale.edu;~Michael_Perlmutter1;~Smita_Krishnaswamy1", "gender": "M;;;F", "homepage": "http://www.dhananjaybhaskar.com/;;https://sites.google.com/view/perlmutma/home;http://www.krishnaswamylab.org", "dblp": "215/8000;;143/2555.html;74/2457", "google_scholar": "9vji8eoAAAAJ;;3eoqaDsAAAAJ;l2Pr9m8AAAAJ", "orcid": "0000-0001-8068-3101;;;", "linkedin": "dhananjay-bhaskar-53391362;;;", "or_profile": "~Dhananjay_Bhaskar1;jackson.grady@yale.edu;~Michael_Perlmutter1;~Smita_Krishnaswamy1", "aff": "Yale University;;University of California, Los Angeles;Yale University", "aff_domain": "yale.edu;;ucla.edu;yale.edu", "position": "Postdoc;;Postdoc;Associate Professor", "bibtex": "@misc{\nbhaskar2022molecular,\ntitle={Molecular Graph Generation via Geometric Scattering},\nauthor={Dhananjay Bhaskar and Jackson David Grady and Michael Perlmutter and Smita Krishnaswamy},\nyear={2022},\nurl={https://openreview.net/forum?id=JRrjhY3sJy_}\n}", "github": "", "project": "", "reviewers": "y4ek;wrxP;LtPp;sYuj", "site": "https://openreview.net/forum?id=JRrjhY3sJy_", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "1;2;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "1;2;3;4", "wc_summary_paper": "49;93;35;51", "wc_summary_review": "25;51;51;7", "wc_main_review": "453;237;353;170", "wc_review": "527;381;439;228", "wc_reply_reviewers": "0;0;0;0", 
"wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 57.0, 21.6794833886788 ], "wc_summary_review_avg": [ 33.5, 18.621224449536072 ], "wc_main_review_avg": [ 303.25, 108.4490087552671 ], "wc_review_avg": [ 393.75, 108.90219235626067 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9467292624062574, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7531054775262875976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "Yale University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.ucla.edu", "aff_unique_abbr": "Yale;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Path Auxiliary Proposal for MCMC in Discrete Space", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7060", "id": "JSR-YDImK95", "poster": "", "openreview": "https://openreview.net/forum?id=JSR-YDImK95", "slides": "https://iclr.cc/virtual/2022/poster/7060", "video": "https://iclr.cc/virtual/2022/poster/7060", "author_site": "Haoran Sun, Hanjun Dai, Wei Xia, Arun Ramamurthy", "tldr": "", "abstract": "Energy-based Model (EBM) offers a powerful approach for modeling discrete structure, but both inference and learning of EBM are hard as it involves sampling from discrete distributions. Recent work shows Markov Chain Monte Carlo (MCMC) with the informed proposal is a powerful tool for such sampling. However, an informed proposal only allows local updates as it requires evaluating all energy changes in the neighborhood.\nIn this work, we present a path auxiliary algorithm that uses a composition of local moves to efficiently explore large neighborhoods. We also give a fast version of our algorithm that only queries the evaluation of energy function twice for each proposal via linearization of the energy function. Empirically, we show that our path auxiliary algorithms considerably outperform other generic samplers on various discrete models for sampling, inference, and learning. 
Our method can also be used to train deep EBMs for high-dimensional discrete data.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/a826a656d5887cf7c3db599b1c2f2b6192443522.zip", "author": "Haoran Sun;Hanjun Dai;Wei Xia;Arun Ramamurthy", "authorids": "~Haoran_Sun2;~Hanjun_Dai1;~Wei_Xia8;~Arun_Ramamurthy1", "gender": "M;M;M;M", "homepage": ";https://hanjun-dai.github.io;;", "dblp": ";144/7311;;", "google_scholar": "p7of_yoAAAAJ;obpl7GQAAAAJ;;", "orcid": ";;;", "linkedin": ";hanjun-dai;wei-xia-1118bb5a/;", "or_profile": "~Haoran_Sun2;~Hanjun_Dai1;~Wei_Xia8;~Arun_Ramamurthy1", "aff": "Georgia Institute of Technology;Google Research;Siemens Technology;", "aff_domain": "gatech.edu;google.com;siemens.com;", "position": "PhD student;Researcher;Research Scientist;", "bibtex": "@inproceedings{\nsun2022path,\ntitle={Path Auxiliary Proposal for {MCMC} in Discrete Space},\nauthor={Haoran Sun and Hanjun Dai and Wei Xia and Arun Ramamurthy},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JSR-YDImK95}\n}", "github": "", "project": "", "reviewers": "CSos;EvqS;Eoj5;y5iJ", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "5;5;3;5", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "306;137;65;127", "wc_summary_review": "84;75;75;31", "wc_main_review": "743;328;744;346", "wc_review": "1133;540;884;504", "wc_reply_reviewers": "0;290;169;725", "wc_reply_authors": "452;732;902;1380", "reply_reviewers": "0;2;1;5", "reply_authors": "1;2;3;5", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 158.75, 89.3766608237296 ], "wc_summary_review_avg": [ 66.25, 20.680606857633556 ], "wc_main_review_avg": [ 540.25, 203.349913941462 ], "wc_review_avg": [ 765.25, 259.00325770152006 ], "wc_reply_reviewers_avg": [ 296.0, 268.2452236294246 ], "wc_reply_authors_avg": [ 866.5, 337.2102459890565 ], "reply_reviewers_avg": [ 2.0, 1.8708286933869707 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14146195605258123964&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=JSR-YDImK95", "email": "gatech.edu;google.com;siemens.com;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Georgia Institute of Technology;Google;Siemens AG", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.gatech.edu;https://research.google;https://www.siemens.com", "aff_unique_abbr": "Georgia Tech;Google Research;Siemens", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Germany" }, { "id": "JSsjw8YuG1P", "title": "PERSONALIZED LAB TEST RESPONSE PREDICTION WITH KNOWLEDGE AUGMENTATION", "track": "main", "status": "Reject", "tldr": "", "abstract": "Personalized medical systems are rapidly gaining traction as opposed to \u201cone size fits all\u201d systems. 
The ability to predict patients\u2019 lab test responses and provide justification for the predictions would serve as an important decision support tool and help clinicians tailor treatment regimes for patients. This requires one to model the complex interactions among different medications, diseases, and lab tests. We also need to learn a strong patient representation, capturing both the sequential information accumulated over the visits and information from other similar patients. Further, we model the drug-lab interactions and diagnosis-lab interactions in the form of graphs and design a knowledge-augmented approach to predict patients\u2019 response to a target lab result. We also take into consideration patients\u2019 past lab responses to personalize the prediction. Experiments on the benchmark MIMIC-III and a real-world outpatient dataset demonstrate the effectiveness of the proposed solution in reducing prediction errors by a significant margin. Case studies show that the identified top factors for influencing the predicted lab results are consistent with the clinicians\u2019 understanding.", "keywords": "Lab test responses;Patient Representation;Electronic Health Records", "primary_area": "", "supplementary_material": "", "author": "Suman Bhoi;Mong-Li Lee;Wynne Hsu;Hao Sen Andrew Fang;Ngiap Chuan Tan", "authorids": "~Suman_Bhoi1;~Mong-Li_Lee1;~Wynne_Hsu1;andrew.fang.h.s@singhealth.com.sg;tan.ngiap.chuan@singhealth.com.sg", "gender": "F;F;F;;", "homepage": ";https://www.comp.nus.edu.sg/~leeml/;http://www.comp.nus.edu.sg/~whsu/;;", "dblp": "202/5448;l/MongLiLee;h/WynneHsu;;", "google_scholar": ";https://scholar.google.com.tw/citations?user=_xFTK8wAAAAJ;https://scholar.google.com.tw/citations?user=ljyBjv8AAAAJ;;", "orcid": "0000-0003-0460-9182;0000-0002-9636-388X;0000-0002-4142-8893;;", "linkedin": ";;;;", "or_profile": "~Suman_Bhoi1;~Mong-Li_Lee1;~Wynne_Hsu1;andrew.fang.h.s@singhealth.com.sg;tan.ngiap.chuan@singhealth.com.sg", "aff": "National University of Singapore;National University of Singapore;National University of Singapore;;", "aff_domain": "nus.edu;nus.edu.sg;nus.edu.sg;;", "position": "PhD student;Full Professor;Full Professor;;", "bibtex": "@misc{\nbhoi2022personalized,\ntitle={{PERSONALIZED} {LAB} {TEST} {RESPONSE} {PREDICTION} {WITH} {KNOWLEDGE} {AUGMENTATION}},\nauthor={Suman Bhoi and Mong-Li Lee and Wynne Hsu and Hao Sen Andrew Fang and Ngiap Chuan Tan},\nyear={2022},\nurl={https://openreview.net/forum?id=JSsjw8YuG1P}\n}", "github": "", "project": "", "reviewers": "MoEw;JWHn;3iB5;QtXC", "site": "https://openreview.net/forum?id=JSsjw8YuG1P", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;5;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "73;256;94;113", "wc_summary_review": "10;182;59;101", "wc_main_review": "322;114;548;984", "wc_review": "405;552;701;1198", "wc_reply_reviewers": "0;0;0;50", "wc_reply_authors": "549;717;710;418", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 134.0, 71.84358008896828 ], "wc_summary_review_avg": [ 88.0, 63.107051903887886 ], "wc_main_review_avg": [ 492.0, 322.8714914637091 ], "wc_review_avg": [ 714.0, 298.3915213272656 ], "wc_reply_reviewers_avg": [ 12.5, 21.650635094610966 ], 
"wc_reply_authors_avg": [ 598.5, 124.0010080604186 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jogGkQX_Dr8J:scholar.google.com/&scioq=PERSONALIZED+LAB+TEST+RESPONSE+PREDICTION+WITH+KNOWLEDGE+AUGMENTATION&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "JTbUTe0B0J1", "title": "Network Pruning Spaces", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Network pruning techniques, including weight pruning and filter pruning, reveal that most state-of-the-art neural networks can be accelerated without a significant performance drop. This work focuses on filter pruning which enables accelerated inference with any off-the-shelf deep learning library and hardware. We propose the concept of network pruning spaces that parametrize populations of subnetwork architectures. Based on this concept, we explore the structure aspect of subnetworks that result in minimal loss of accuracy in different pruning regimes and arrive at a series of observations by comparing subnetwork distributions. We conjecture through empirical studies that there exists an optimal FLOPs-to-parameter-bucket ratio related to the design of original network in a pruning regime. Statistically, the structure of a winning subnetwork guarantees an approximately optimal ratio in this regime. Upon our conjectures, we further refine the initial pruning space to reduce the cost of searching a good subnetwork architecture. 
Our experimental results on ImageNet show that the subnetwork we found is superior to those from the state-of-the-art pruning methods under comparable FLOPs.", "keywords": "network pruning;convolutional neural network;deep learning", "primary_area": "", "supplementary_material": "", "author": "Xuanyu He;Yu-I Yang;Ran Song;Jiachen Pu;Conggang Hu;Feijun Jiang;Wei Zhang;Huanghao Ding", "authorids": "~Xuanyu_He1;yui.yi@alibaba-inc.com;~Ran_Song2;jiachen.pjc@alibaba-inc.com;conggang.hcg@alibaba-inc.com;~Feijun_Jiang1;~Wei_Zhang7;huanghao.dhh@alibaba-inc.com", "gender": "M;;M;;;M;M;", "homepage": ";;https://faculty.sdu.edu.cn/songran/en/index.htm;;;;https://www.vsislab.com;", "dblp": ";;10/8738;;;76/10697;;", "google_scholar": "kVae_BMAAAAJ;;;;;EbXcZLQAAAAJ;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Xuanyu_He1;yui.yi@alibaba-inc.com;~Ran_Song2;jiachen.pjc@alibaba-inc.com;conggang.hcg@alibaba-inc.com;~Feijun_Jiang1;~Wei_Zhang7;huanghao.dhh@alibaba-inc.com", "aff": "Alibaba Group;;Shandong University;;;;Shandong University;", "aff_domain": "alibaba-inc.com;;sdu.edu.cn;;;;sdu.edu.cn;", "position": "Engineer;;Full Professor;;;;Full Professor;", "bibtex": "@misc{\nhe2022network,\ntitle={Network Pruning Spaces},\nauthor={Xuanyu He and Yu-I Yang and Ran Song and Jiachen Pu and Conggang Hu and Feijun Jiang and Wei Zhang and Huanghao Ding},\nyear={2022},\nurl={https://openreview.net/forum?id=JTbUTe0B0J1}\n}", "github": "", "project": "", "reviewers": "GTjo;JQFv;4feH", "site": "https://openreview.net/forum?id=JTbUTe0B0J1", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "2;2;2", "technical_novelty": "2;2;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "46;97;52", "wc_summary_review": "16;46;74", "wc_main_review": "309;306;535", "wc_review": "371;449;661", "wc_reply_reviewers": "35;44;71", "wc_reply_authors": "261;106;226", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 65.0, 22.759613353482084 ], "wc_summary_review_avg": [ 45.333333333333336, 23.683092891108814 ], "wc_main_review_avg": [ 383.3333333333333, 107.25152161572763 ], "wc_review_avg": [ 493.6666666666667, 122.53253536192835 ], "wc_reply_reviewers_avg": [ 50.0, 15.297058540778355 ], "wc_reply_authors_avg": [ 197.66666666666666, 66.37435917246626 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:L3nUPeIhFJQJ:scholar.google.com/&scioq=Network+Pruning+Spaces&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Alibaba Group;Shandong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;http://www.sdu.edu.cn", "aff_unique_abbr": "Alibaba;SDU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "JV4tkMi4xg", "title": "Constrained Discrete Black-Box Optimization using Mixed-Integer Programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discrete black-box optimization problems are challenging for model-based optimization (MBO) 
algorithms, such as Bayesian optimization, due to the size of the search space and the need to satisfy combinatorial constraints. In particular, these methods require repeatedly solving a complex discrete global optimization problem in the inner loop, where popular heuristic inner-loop solvers introduce approximations and are difficult to adapt to combinatorial constraints. In response, we propose NN+MILP, a general discrete MBO framework using piecewise-linear neural networks as surrogate models and mixed-integer linear programming (MILP) to optimize the acquisition function. MILP provides optimality guarantees and a versatile declarative language for domain-specific constraints. We test our approach on a range of unconstrained and constrained problems, including DNA binding and the NAS-Bench-101 neural architecture search benchmark. NN+MILP surpasses or matches the performance of algorithms tailored to the domain at hand, with global optimization of the acquisition problem running in a few minutes using only standard software packages and hardware.", "keywords": "discrete blackbox optimization;mixed integer programming", "primary_area": "", "supplementary_material": "", "author": "Theodore Papalexopoulos;Christian Tjandraatmadja;Ross Anderson;Juan Pablo Vielma;David Benjamin Belanger", "authorids": "~Theodore_Papalexopoulos1;~Christian_Tjandraatmadja1;rander@google.com;~Juan_Pablo_Vielma1;~David_Benjamin_Belanger1", "gender": "M;;;;M", "homepage": ";;;;http://people.cs.umass.edu/~belanger/", "dblp": ";95/5357;;;17/6391", "google_scholar": ";;;;AGnp8NAAAAAJ", "orcid": "0000-0002-4338-3625;;;;", "linkedin": ";;;;david-belanger-7ba86869", "or_profile": "~Theodore_Papalexopoulos1;~Christian_Tjandraatmadja1;rander@google.com;~Juan_Pablo_Vielma1;~David_Benjamin_Belanger1", "aff": "Massachusetts Institute of Technology;Google;;;Google Brain", "aff_domain": "mit.edu;google.com;;;google.com", "position": "PhD student;Researcher;;;research scientist", "bibtex": "@misc{\npapalexopoulos2022constrained,\ntitle={Constrained Discrete Black-Box Optimization using Mixed-Integer Programming},\nauthor={Theodore Papalexopoulos and Christian Tjandraatmadja and Ross Anderson and Juan Pablo Vielma and David Benjamin Belanger},\nyear={2022},\nurl={https://openreview.net/forum?id=JV4tkMi4xg}\n}", "github": "", "project": "", "reviewers": "qsFh;5edm;vgXk;j85k", "site": "https://openreview.net/forum?id=JV4tkMi4xg", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "92;74;280;69", "wc_summary_review": "37;42;75;53", "wc_main_review": "221;300;602;414", "wc_review": "350;416;957;536", "wc_reply_reviewers": "233;34;206;50", "wc_reply_authors": "654;651;571;665", "reply_reviewers": "2;1;1;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 128.75, 87.74216489237088 ], "wc_summary_review_avg": [ 51.75, 14.618053906043718 ], "wc_main_review_avg": [ 384.25, 143.2207649050933 ], "wc_review_avg": [ 564.75, 236.0777149584433 ], "wc_reply_reviewers_avg": [ 130.75, 89.44097215482398 ], "wc_reply_authors_avg": [ 635.25, 37.459144410944575 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], 
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4611976570746888116&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://web.mit.edu;https://www.google.com", "aff_unique_abbr": "MIT;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "JVR4JswsEM", "title": "A Dot Product Attention Free Transformer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We introduce Dot Product Attention Free Transformer (DAFT), an efficient variant of Transformers \\citep{transformer} that eliminates the query-key dot product in self attention. The core idea is to construct a decomposable attention map for each dimension of the query, key and value. This compositionality enables an implementation where the attention tensor does not to be computed or stored explicitly. A DAFT layer has a memory complexity linear w.r.t. both the context size and the dimension of features, making it compatible with both large input and model sizes. We also introduce DAFT-conv, a model variant that takes advantage of locality and spatial weight sharing while maintaining global connectivity. We conduct experiments on ImageNet-1K classification, as well as CIFAR10 and Enwik8, two autoregressive modeling tasks. We show that DAFT demonstrates competitive performance on all the benchmarks, while providing excellent efficiency at the same time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuangfei Zhai;Walter Talbott;Nitish Srivastava;Chen Huang;Hanlin Goh;Ruixiang ZHANG;Joshua M. Susskind", "authorids": "~Shuangfei_Zhai3;~Walter_Talbott1;~Nitish_Srivastava1;~Chen_Huang6;~Hanlin_Goh2;~Ruixiang_ZHANG1;~Joshua_M._Susskind1", "gender": "M;;M;M;M;M;M", "homepage": "http://cs.binghamton.edu/~szhai2;;http://www.cs.toronto.edu/~nitish;;;http://ruixiangz.me/;http://www.apple.com", "dblp": ";241/6096;00/11304.html;05/8125-1;96/4057;20/9860;132/7797", "google_scholar": "G6vdBYsAAAAJ;;https://scholar.google.ca/citations?user=s1PgoeUAAAAJ;QZ-JKOUAAAAJ;;https://scholar.google.ca/citations?user=VQYdApgAAAAJ;Sv2TGqsAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;joshua-susskind-8ab2ab5/", "or_profile": "~Shuangfei_Zhai3;~Walter_Talbott1;~Nitish_Srivastava1;~Chen_Huang6;~Hanlin_Goh2;~Ruixiang_ZHANG1;~Joshua_M._Susskind1", "aff": "Apple;Apple;Apple Inc;Apple;Apple;Mila, UdeM;Apple", "aff_domain": "apple.com;apple.com;apple.com;apple.com;apple.com;mila.qubec;apple.com", "position": "Research Scientist;Research Scientist;Researcher;Research Scientist;Research Scientist;PhD student;Researcher", "bibtex": "@misc{\nzhai2022a,\ntitle={A Dot Product Attention Free Transformer},\nauthor={Shuangfei Zhai and Walter Talbott and Nitish Srivastava and Chen Huang and Hanlin Goh and Ruixiang ZHANG and Joshua M. 
Susskind},\nyear={2022},\nurl={https://openreview.net/forum?id=JVR4JswsEM}\n}", "github": "", "project": "", "reviewers": "T4rC;kT1q;doDF;pHng", "site": "https://openreview.net/forum?id=JVR4JswsEM", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;4", "correctness": "4;4;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "47;43;54;97", "wc_summary_review": "49;110;41;33", "wc_main_review": "152;320;369;266", "wc_review": "248;473;464;396", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 60.25, 21.579793789561567 ], "wc_summary_review_avg": [ 58.25, 30.408674749156695 ], "wc_main_review_avg": [ 276.75, 80.71361409328664 ], "wc_review_avg": [ 395.25, 90.0760095696962 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xE8W56eLyO4J:scholar.google.com/&scioq=A+Dot+Product+Attention+Free+Transformer&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Apple;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": "Apple Inc.;Mila", "aff_unique_url": "https://www.apple.com;https://www.udemontreal.ca", "aff_unique_abbr": "Apple;UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "id": "JVWB8QRUOi-", "title": "Learning Homophilic Incentives in Sequential Social Dilemmas", "track": "main", "status": "Reject", "tldr": "", "abstract": "Promoting cooperation among self-interested agents is a long-standing and interdisciplinary problem, but receives less attention in multi-agent reinforcement learning (MARL). Game-theoretical studies reveal that altruistic incentives are critical to the emergence of cooperation but their analyses are limited to non-sequential social dilemmas. Recent works using deep MARL also show that learning to incentivize other agents has the potential to promote cooperation in more realistic sequential social dilemmas (SSDs). However, we find that, with these incentivizing mechanisms, the team cooperation level does not converge and regularly oscillates between cooperation and defection during learning. We show that a second-order social dilemma resulting from the incentive mechanisms is the main reason for such fragile cooperation. We analyze the dynamics of second-order social dilemmas and find that a typical tendency of humans, called homophily, provides a promising solution. 
We propose a novel learning framework to encourage homophilic incentives and show that it achieves stable cooperation in both SSDs of public goods and tragedy of the commons.", "keywords": "Multi-Agent Reinforcement Learning;Sequential Social Dilemma;Cooperation Emergence", "primary_area": "", "supplementary_material": "/attachment/adfe3219a74e175df1b84f8de65e176ca3463cd8.zip", "author": "Heng Dong;Tonghan Wang;Jiayuan Liu;Chi Han;Chongjie Zhang", "authorids": "~Heng_Dong1;~Tonghan_Wang1;~Jiayuan_Liu1;~Chi_Han1;~Chongjie_Zhang1", "gender": "M;M;M;M;", "homepage": "https://drdh.cc;https://tonghanwang.github.io/;https://liu-jiayuan.github.io/;https://glaciohound.github.io;", "dblp": "387/8933.html;175/6039-1.html;;255/6993;29/6693", "google_scholar": "K26AU1EAAAAJ;-AR1yc4AAAAJ;MOFyr2MAAAAJ;https://scholar.google.com.sg/citations?user=DcSvbuAAAAAJ;LjxqXycAAAAJ", "orcid": "0000-0001-7548-3455;;;0000-0001-6235-5841;", "linkedin": ";;jiayuan-liu-50a55a222/;chi-han-b01a93141/;", "or_profile": "~Heng_Dong1;~Tonghan_Wang1;~Jiayuan_Liu1;~Chi_Han1;~Chongjie_Zhang1", "aff": "Tsinghua University;Tsinghua University;School of Engineering and Applied Sciences, Harvard University;University of Illinois, Urbana Champaign;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;seas.harvard.edu;illinois.edu;tsinghua.edu.cn", "position": "PhD student;MS student;Intern;PhD student;Assistant Professor", "bibtex": "@misc{\ndong2022learning,\ntitle={Learning Homophilic Incentives in Sequential Social Dilemmas},\nauthor={Heng Dong and Tonghan Wang and Jiayuan Liu and Chi Han and Chongjie Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=JVWB8QRUOi-}\n}", "github": "", "project": "", "reviewers": "neVn;bDVR;TQpe", "site": "https://openreview.net/forum?id=JVWB8QRUOi-", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;3", "correctness": "1;3;3", "technical_novelty": "4;2;2", "empirical_novelty": "4;2;3", "wc_summary_paper": "107;86;76", "wc_summary_review": "41;16;43", "wc_main_review": "257;349;561", "wc_review": "405;451;680", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "244;720;625", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 89.66666666666667, 12.918548250050733 ], "wc_summary_review_avg": [ 33.333333333333336, 12.283683848458853 ], "wc_main_review_avg": [ 389.0, 127.28969583853466 ], "wc_review_avg": [ 512.0, 120.26914262048544 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 529.6666666666666, 205.68638479220954 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 0.5000000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W6f-iAPbdsQJ:scholar.google.com/&scioq=Learning+Homophilic+Incentives+in+Sequential+Social+Dilemmas&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Tsinghua University;Harvard University;University of Illinois Urbana-Champaign", "aff_unique_dep": ";School of Engineering and Applied Sciences;", "aff_unique_url": 
"https://www.tsinghua.edu.cn;https://www.harvard.edu;https://illinois.edu", "aff_unique_abbr": "THU;Harvard;UIUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Cambridge;Urbana-Champaign", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "China;United States" }, { "id": "JVsvIuMDE0Z", "title": "Adaptive Behavior Cloning Regularization for Stable Offline-to-Online Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning, by learning from a fixed dataset, makes it possible to learn agent behaviors without interacting with the environment. However, depending on the quality of the offline dataset, such pre-trained agents may have limited performance and would further need to be fine-tuned online by interacting with the environment. During online fine-tuning, the performance of the pre-trained agent may collapse quickly due to the sudden distribution shift from offline to online data. While constraints enforced by offline RL methods such as a behaviour cloning loss prevent this to an extent, these constraints also significantly slow down online fine-tuning by forcing the agent to stay close to the behavior policy. We propose to adaptively weigh the behavior cloning loss during online fine-tuning based on the agent's performance and training stability. Moreover, we use a randomized ensemble of Q functions to further increase the sample efficiency of online fine-tuning by performing a large number of learning updates. Experiments show that the proposed method yields state-of-the-art offline-to-online reinforcement learning performance on the popular D4RL benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Zhao;Rinu Boney;Alexander Ilin;Juho Kannala;Joni Pajarinen", "authorids": "~Yi_Zhao6;~Rinu_Boney1;~Alexander_Ilin1;~Juho_Kannala1;~Joni_Pajarinen2", "gender": "M;M;M;;M", "homepage": "https://zhaoyi11.github.io/;;https://users.aalto.fi/~alexilin/;;https://users.aalto.fi/~kannalj1/", "dblp": "51/4138-1;203/8118;85/5835;23/8355;47/4656.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;i2gcTBQAAAAJ;https://scholar.google.fi/citations?user=-2fJStwAAAAJ;c4mWQPQAAAAJ", "orcid": "0009-0002-9979-595X;;;0000-0003-4469-8191;0000-0001-5088-4041", "linkedin": ";;alexanderilin/;;", "or_profile": "~Yi_Zhao6;~Rinu_Boney1;~Alexander_Ilin1;~Joni_Pajarinen2;~Juho_Kannala5", "aff": "Aalto University;Aalto University;Aalto University;Technische Universit\u00e4t Darmstadt;Aalto University", "aff_domain": "aalto.fi;aalto.fi;aalto.fi;tu-darmstadt.de;aalto.fi", "position": "PhD student;PhD student;Assistant Professor;Researcher;Assistant Professor", "bibtex": "@misc{\nzhao2022adaptive,\ntitle={Adaptive Behavior Cloning Regularization for Stable Offline-to-Online Reinforcement Learning},\nauthor={Yi Zhao and Rinu Boney and Alexander Ilin and Juho Kannala and Joni Pajarinen},\nyear={2022},\nurl={https://openreview.net/forum?id=JVsvIuMDE0Z}\n}", "github": "", "project": "", "reviewers": "aXQe;bnwi;q4CJ;6n3w", "site": "https://openreview.net/forum?id=JVsvIuMDE0Z", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;3;5;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "77;33;129;28", "wc_summary_review": "73;27;57;39", "wc_main_review": "591;233;345;153", "wc_review": "741;293;531;220", "wc_reply_reviewers": "324;0;276;0", "wc_reply_authors": "691;183;642;0", "reply_reviewers": "1;0;1;0", "reply_authors": 
"2;1;2;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.75, 40.68399562481542 ], "wc_summary_review_avg": [ 49.0, 17.4928556845359 ], "wc_main_review_avg": [ 330.5, 165.13857816997213 ], "wc_review_avg": [ 446.25, 205.38667800030265 ], "wc_reply_reviewers_avg": [ 150.0, 150.9569475049095 ], "wc_reply_authors_avg": [ 379.0, 295.1990853644367 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6985242959602302250&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Aalto University;Technische Universit\u00e4t Darmstadt", "aff_unique_dep": ";", "aff_unique_url": "https://www.aalto.fi;https://www.tu-darmstadt.de", "aff_unique_abbr": "Aalto;TUD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Finland;Germany" }, { "id": "JXSZuWSPH85", "title": "Deep Inverse Reinforcement Learning via Adversarial One-Class Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Traditional inverse reinforcement learning (IRL) methods require a loop to find the optimal policy for each reward update (called an inner loop), resulting in very time-consuming reward estimation. In contrast, classification-based IRL methods, which have been studied recently, do not require an inner loop and estimate rewards quickly, although it is difficult to prepare an appropriate baseline corresponding to the expert trajectory. In this study, we introduced adversarial one-class classification into the classification-based IRL framework, and consequently developed a novel IRL method that requires only expert trajectories. 
We experimentally verified that the developed method can achieve the same performance as existing methods.", "keywords": "inverse reinforcement learning;one-class classification", "primary_area": "", "supplementary_material": "/attachment/353ab5ea79b7254f75f6a21723b393b9a4bbb559.zip", "author": "Daiko Kishikawa;Sachiyo Arai", "authorids": "~Daiko_Kishikawa1;sachiyo@faculty.chiba-u.jp", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "p37_-ooAAAAJ;", "orcid": "0000-0002-0832-2894;", "linkedin": ";", "or_profile": "~Daiko_Kishikawa1;sachiyo@faculty.chiba-u.jp", "aff": "Chiba University;", "aff_domain": "tu.chiba-u.ac.jp;", "position": "PhD student;", "bibtex": "@misc{\nkishikawa2022deep,\ntitle={Deep Inverse Reinforcement Learning via Adversarial One-Class Classification},\nauthor={Daiko Kishikawa and Sachiyo Arai},\nyear={2022},\nurl={https://openreview.net/forum?id=JXSZuWSPH85}\n}", "github": "", "project": "", "reviewers": "7Yke;uTpc;phAF", "site": "https://openreview.net/forum?id=JXSZuWSPH85", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "3;4;2", "correctness": "3;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "94;84;102", "wc_summary_review": "11;19;87", "wc_main_review": "135;428;369", "wc_review": "240;531;558", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "112;745;502", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 93.33333333333333, 7.363574011458175 ], "wc_summary_review_avg": [ 39.0, 34.09789827345179 ], "wc_main_review_avg": [ 310.6666666666667, 126.5288724187312 ], "wc_review_avg": [ 443.0, 143.96527359054335 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 453.0, 260.73358049932887 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3273268353539886, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fTEu5WSXnU4J:scholar.google.com/&scioq=Deep+Inverse+Reinforcement+Learning+via+Adversarial+One-Class+Classification&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Chiba University", "aff_unique_dep": "", "aff_unique_url": "https://www.chiba-u.ac.jp", "aff_unique_abbr": "Chiba U", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "SQuant: On-the-Fly Data-Free Quantization via Diagonal Hessian Approximation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7037", "id": "JXhROKNZzOc", "poster": "", "openreview": "https://openreview.net/forum?id=JXhROKNZzOc", "slides": "https://iclr.cc/virtual/2022/poster/7037", "video": "https://iclr.cc/virtual/2022/poster/7037", "author_site": "Cong Guo, Yuxian Qiu, Jingwen Leng, Xiaotian Gao, Chen Zhang, Yunxin Liu, Fan Yang, Yuhao Zhu, Minyi Guo", "tldr": "", "abstract": "Quantization of deep neural networks (DNN) has been proven effective for compressing and accelerating DNN models. Data-free quantization (DFQ) is a promising approach for privacy-sensitive and confidential scenarios, where the original datasets are unavailable. 
However, current DFQ solutions degrade accuracy, need synthetic data to calibrate networks, and are time-consuming and costly. This paper proposes an on-the-fly DFQ framework with sub-second quantization time, called SQuant, which can quantize networks on inference-only devices with low computation and memory requirements. With the theoretical analysis of the second-order information of the DNN task loss, we decompose and approximate the Hessian-based optimization objective into three diagonal sub-items, which have different areas corresponding to the three dimensions of the weight tensor: element-wise, kernel-wise, and output channel-wise. Then, we progressively compose the sub-items and propose a novel data-free optimization objective in the discrete domain, minimizing Constrained Absolute Sum of Error (CASE for short), which surprisingly does not need any dataset and is not even aware of the network architecture. We also design an efficient algorithm without back-propagation to further reduce the computation complexity of the objective solver. Finally, without fine-tuning or synthetic datasets, SQuant accelerates the data-free quantization process to a sub-second level with a >30% accuracy improvement over existing data-free post-training quantization works on the evaluated models under 4-bit quantization. We have open-sourced the SQuant framework at https://github.com/clevercool/SQuant.", "keywords": "Data-Free Quantization;Hessian Matrix;Approximation", "primary_area": "", "supplementary_material": "/attachment/9246edccc6a08a3022f2eb56d883f8b3641dc453.zip", "author": "Cong Guo;Yuxian Qiu;Jingwen Leng;Xiaotian Gao;Chen Zhang;Yunxin Liu;Fan Yang;Yuhao Zhu;Minyi Guo", "authorids": "~Cong_Guo1;~Yuxian_Qiu1;~Jingwen_Leng1;~Xiaotian_Gao1;~Chen_Zhang11;~Yunxin_Liu2;~Fan_Yang28;~Yuhao_Zhu1;~Minyi_Guo1", "gender": "M;M;M;M;M;;M;;M", "homepage": "http://guocong.me/;;http://cs.sjtu.edu.cn/~leng-jw/;;;;https://fanyangcs.github.io/;;http://www.cs.sjtu.edu.cn/~guo-my/", "dblp": "117/1754-3.html;;131/5131;;94/4084-1.html;;29/3081-24.html;;", "google_scholar": "sp5VwJoAAAAJ;;L1y8y2MAAAAJ;AZVvpH4AAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.tw/citations?user=8R8FO9IAAAAJ", "orcid": ";;;;0000-0003-2762-2726;;0000-0002-0378-060X;;", "linkedin": ";;;;;;;;", "or_profile": "~Cong_Guo1;~Yuxian_Qiu1;~Jingwen_Leng1;~Xiaotian_Gao1;~Chen_Zhang11;~Yunxin_Liu2;~Fan_Yang28;~Yuhao_Zhu1;~Minyi_Guo1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Microsoft Research;Alibaba Group;;Microsoft Research;;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;microsoft.com;alibaba-inc.com;;research.microsoft.com;;sjtu.edu.cn", "position": "PhD student;PhD student;Associate Professor;Researcher;Researcher;;Senior Principal Researcher;;Full Professor", "bibtex": "@inproceedings{\nguo2022squant,\ntitle={{SQ}uant: On-the-Fly Data-Free Quantization via Diagonal Hessian Approximation},\nauthor={Cong Guo and Yuxian Qiu and Jingwen Leng and Xiaotian Gao and Chen Zhang and Yunxin Liu and Fan Yang and Yuhao Zhu and Minyi Guo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JXhROKNZzOc}\n}", "github": "", "project": "", "reviewers": "xprS;hK2m;cncj;uKzc", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;3", 
"wc_summary_paper": "83;96;65;63", "wc_summary_review": "60;35;47;11", "wc_main_review": "193;292;233;109", "wc_review": "336;423;345;183", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "385;731;608;315", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 76.75, 13.571569548139964 ], "wc_summary_review_avg": [ 38.25, 18.046814123273947 ], "wc_main_review_avg": [ 206.75, 66.52208279962376 ], "wc_review_avg": [ 321.75, 86.95796398260484 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 509.75, 167.40277028771058 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=748228209807839980&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=JXhROKNZzOc", "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;microsoft.com;alibaba-inc.com;;research.microsoft.com;;sjtu.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;1;2;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft;Alibaba Group", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com/en-us/research;https://www.alibaba.com", "aff_unique_abbr": "SJTU;MSR;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;1;0", "aff_country_unique": "China;United States" }, { "id": "JYQYysrNT3M", "title": "Reinforcement Learning with Ex-Post Max-Min Fairness", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider reinforcement learning with vectorial rewards, where the agent receives a vector of $K\\geq 2$ different types of rewards at each time step. The agent aims to maximize the minimum total reward among the $K$ reward types. Different from existing works that focus on maximizing the minimum expected total reward, i.e. \\emph{ex-ante max-min fairness}, we maximize the expected minimum total reward, i.e. \\emph{ex-post max-min fairness}. Through an example and numerical experiments, we show that the optimal policy for the former objective generally does not converge to optimality under the latter, even as the number of time steps $T$ grows. Our main contribution is a novel algorithm, Online-ReOpt, that achieves near-optimality under our objective, assuming an optimization oracle that returns a near-optimal policy given any scalar reward. The expected objective value under Online-ReOpt is shown to converge to the asymptotic optimum as $T$ increases. 
Finally, we propose offline variants to ease the burden of online computation in Online-ReOpt, and we propose generalizations from the max-min objective to concave utility maximization.", "keywords": "Reinforcement learning;fairness;regret minimization;multi-objective optimization;constrained Markov decision processes", "primary_area": "", "supplementary_material": "/attachment/d5c9c595e91b5fe27138d955ab949a5221edf93f.zip", "author": "Wang Chi Cheung;Zi Yi Ewe", "authorids": "~Wang_Chi_Cheung1;ewe.ziyi@u.nus.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ncheung2022reinforcement,\ntitle={Reinforcement Learning with Ex-Post Max-Min Fairness},\nauthor={Wang Chi Cheung and Zi Yi Ewe},\nyear={2022},\nurl={https://openreview.net/forum?id=JYQYysrNT3M}\n}", "github": "", "project": "", "reviewers": "gzxW;SQsM;oTCf;bwVS", "site": "https://openreview.net/forum?id=JYQYysrNT3M", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;3;3", "correctness": "3;4;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "23;61;84;96", "wc_summary_review": "18;20;25;62", "wc_main_review": "168;214;387;342", "wc_review": "209;295;496;500", "wc_reply_reviewers": "0;263;0;0", "wc_reply_authors": "721;1156;733;651", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.0, 27.829840100151493 ], "wc_summary_review_avg": [ 31.25, 17.93564885918544 ], "wc_main_review_avg": [ 277.75, 89.68381961089749 ], "wc_review_avg": [ 375.0, 126.71029950244771 ], "wc_reply_reviewers_avg": [ 65.75, 113.88234059765368 ], "wc_reply_authors_avg": [ 815.25, 199.2089041684633 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JFnvCvoI6U8J:scholar.google.com/&scioq=Reinforcement+Learning+with+Ex-Post+Max-Min+Fairness&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "The Effects of Reward Misspecification: Mapping and Mitigating Misaligned Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6579", "id": "JYtwGwIL7ye", "poster": "", "openreview": "https://openreview.net/forum?id=JYtwGwIL7ye", "slides": "https://iclr.cc/virtual/2022/poster/6579", "video": "https://iclr.cc/virtual/2022/poster/6579", "author_site": "Alexander Pan, Kush Bhatia, Jacob Steinhardt", "tldr": "", "abstract": "Reward hacking---where RL agents exploit gaps in misspecified proxy rewards---has been widely observed, but not yet systematically studied. To understand reward hacking, we construct four RL environments with different misspecified rewards. We investigate reward hacking as a function of agent capabilities: model capacity, action space resolution, and observation space noise. Typically, more capable agents are able to better exploit reward misspecifications, causing them to attain higher proxy reward and lower true reward. 
Moreover, we find instances of \\emph{phase transitions}: capability thresholds at which the agent's behavior qualitatively shifts, leading to a sharp decrease in the true reward. Such phase transitions pose challenges to monitoring the safety of ML systems. To address this and encourage further research on reward misspecification, we propose an anomaly detection task for aberrant policies and offer several baseline detectors.", "keywords": "reward misspecification;reinforcement learning;reward hacking;alignment;ml safety", "primary_area": "", "supplementary_material": "/attachment/495a17da018002a191c5ceb4ad2cc2d563d40c64.zip", "author": "Alexander Pan;Kush Bhatia;Jacob Steinhardt", "authorids": "~Alexander_Pan1;~Kush_Bhatia3;~Jacob_Steinhardt1", "gender": "M;;M", "homepage": "https://aypan17.github.io;;http://people.eecs.berkeley.edu/~kush/", "dblp": "304/3394;35/10625;164/5807", "google_scholar": "PaltSA0AAAAJ;;X-Sd3-8AAAAJ", "orcid": ";;", "linkedin": "alexander-pan-0567a2102/;;", "or_profile": "~Alexander_Pan1;~Jacob_Steinhardt1;~kush_Bhatia2", "aff": "California Institute of Technology;University of California, Berkeley;University of California, Berkeley", "aff_domain": "caltech.edu;berkeley.edu;berkeley.edu", "position": "Undergrad student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\npan2022the,\ntitle={The Effects of Reward Misspecification: Mapping and Mitigating Misaligned Models},\nauthor={Alexander Pan and Kush Bhatia and Jacob Steinhardt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JYtwGwIL7ye}\n}", "github": "", "project": "", "reviewers": "uYeb;bfGN;GVMn;16uL", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;4;4", "wc_summary_paper": "92;112;95;121", "wc_summary_review": "63;86;63;38", "wc_main_review": "759;996;556;329", "wc_review": "914;1194;714;488", "wc_reply_reviewers": "46;201;261;31", "wc_reply_authors": "829;474;644;446", "reply_reviewers": "1;1;2;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 105.0, 11.979148550710939 ], "wc_summary_review_avg": [ 62.5, 16.977926846349646 ], "wc_main_review_avg": [ 660.0, 246.51267715880252 ], "wc_review_avg": [ 827.5, 259.78212024694847 ], "wc_reply_reviewers_avg": [ 134.75, 98.7025202312484 ], "wc_reply_authors_avg": [ 598.25, 153.2618266235921 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 196, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13629255034936383162&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=JYtwGwIL7ye", "email": "caltech.edu;berkeley.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "California Institute of Technology;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;https://www.berkeley.edu", "aff_unique_abbr": "Caltech;UC Berkeley", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Pasadena;Berkeley", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "JZrETJlgyq", "title": "Exploring Non-Contrastive Representation Learning for Deep Clustering", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing deep clustering methods rely on contrastive learning for representation learning, which require negative examples to form an embedding space where all instances are well-separated. However, the negative examples inevitably give rise to the class collision issue, compromising the representation learning for clustering. In this paper, we explore the non-contrastive representation learning for deep clustering, termed NCC, which is based on BYOL, a representative method without negative examples. First, we propose a positive sampling strategy to align one augmented view of instance with the neighbors of another view so that we can avoid the class collision issue caused by the negative examples and hence improve the within-cluster compactness. Second, we propose a novel prototypical contrastive loss, ProtoCL, which can encourage prototypical alignment between two augmented views and prototypical uniformity, hence maximizing the inter-cluster distance. Moreover, we formulate NCC in an Expectation-Maximization (EM) framework, in which E-step utilizes spherical k-means to estimate the pseudo-labels of instances and distribution of prototypes from the target network and M-step leverages the proposed losses to optimize the online network. As a result, NCC is able to form an embedding space where all clusters are well-separated and within-cluster examples are compact. Experimental results on several clustering benchmark datasets as well as ImageNet-1K demonstrate that the proposed NCC outperforms the state-of-the-art methods by a significant margin.", "keywords": "Image Clustering;Representation Learning;Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Zhizhong Huang;Jie Chen;Junping Zhang;Hongming Shan", "authorids": "~Zhizhong_Huang1;~Jie_Chen19;~Junping_Zhang2;~Hongming_Shan1", "gender": "M;M;M;M", "homepage": "https://hzzone.github.io/;https://www.researchgate.net/profile/Jie_Chen364;http://www.pami.fudan.edu.cn;http://hmshan.io/", "dblp": ";;02/5388.html;184/8229", "google_scholar": "6Itl2tMAAAAJ;vuRdFLsAAAAJ;Aib_NTYAAAAJ;https://scholar.google.co.uk/citations?user=RYfSzKwAAAAJ", "orcid": ";;;0000-0002-0604-3197", "linkedin": ";;;", "or_profile": "~Zhizhong_Huang1;~Jie_Chen19;~Junping_Zhang2;~Hongming_Shan1", "aff": "Fudan University;Fudan University;Fudan University;Fudan University", "aff_domain": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "position": "PhD student;PhD student;Professor;Associate Professor", "bibtex": "@misc{\nhuang2022exploring,\ntitle={Exploring Non-Contrastive Representation Learning for Deep Clustering},\nauthor={Zhizhong Huang and Jie Chen and Junping Zhang and Hongming Shan},\nyear={2022},\nurl={https://openreview.net/forum?id=JZrETJlgyq}\n}", "github": "", "project": "", "reviewers": "Na7g;dith;bq8Q;nh13", "site": "https://openreview.net/forum?id=JZrETJlgyq", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;4;4;3", "correctness": "4;3;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "56;85;27;84", "wc_summary_review": "26;65;23;61", "wc_main_review": "252;216;121;582", "wc_review": "334;366;171;727", "wc_reply_reviewers": "397;79;0;130", "wc_reply_authors": "3429;1054;882;1090", "reply_reviewers": "1;1;0;2", 
"reply_authors": "8;3;2;3", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 23.822258499143192 ], "wc_summary_review_avg": [ 43.75, 19.330998422223306 ], "wc_main_review_avg": [ 292.75, 173.720141319307 ], "wc_review_avg": [ 399.5, 203.0277074687098 ], "wc_reply_reviewers_avg": [ 151.5, 149.1148885926553 ], "wc_reply_authors_avg": [ 1613.75, 1050.9786808018514 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 4.0, 2.345207879911715 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9901475429766743, "corr_recommendation_correctness": -0.7001400420140049, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2183192056246056479&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Optimizing Neural Networks with Gradient Lexicase Selection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5996", "id": "J_2xNmVcY4", "poster": "", "openreview": "https://openreview.net/forum?id=J_2xNmVcY4", "slides": "https://iclr.cc/virtual/2022/poster/5996", "video": "https://iclr.cc/virtual/2022/poster/5996", "author_site": "Li Ding, Lee Spector", "tldr": "", "abstract": "One potential drawback of using aggregated performance measurement in machine learning is that models may learn to accept higher errors on some training cases as compromises for lower errors on others, with the lower errors actually being instances of overfitting. This can lead both to stagnation at local optima and to poor generalization. Lexicase selection is an uncompromising method developed in evolutionary computation, which selects models on the basis of sequences of individual training case errors instead of using aggregated metrics such as loss and accuracy. In this paper, we investigate how the general idea of lexicase selection can fit into the context of deep learning to improve generalization. We propose Gradient Lexicase Selection, an optimization framework that combines gradient descent and lexicase selection in an evolutionary fashion. Experimental results show that the proposed method improves the generalization performance of various popular deep neural network architectures on three image classification benchmarks. 
Qualitative analysis also indicates that our method helps the networks learn more diverse representations.", "keywords": "deep learning;lexicase selection;optimization;evolutionary algorithms", "primary_area": "", "supplementary_material": "/attachment/0b767483a08145204f50ab702ea0425e41fe3d0f.zip", "author": "Li Ding;Lee Spector", "authorids": "~Li_Ding3;~Lee_Spector1", "gender": "M;", "homepage": "https://liding.info;", "dblp": "58/4543-10;", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": "0000-0002-1315-1196;", "linkedin": "liding256;", "or_profile": "~Li_Ding3;~Lee_Spector1", "aff": "University of Massachusetts, Amherst;Hampshire College", "aff_domain": "umass.edu;", "position": "PhD student;", "bibtex": "@inproceedings{\nding2022optimizing,\ntitle={Optimizing Neural Networks with Gradient Lexicase Selection},\nauthor={Li Ding and Lee Spector},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=J_2xNmVcY4}\n}", "github": "", "project": "", "reviewers": "U4uc;Fzye;ezPJ;bGuc", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;5;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;2;2;4", "wc_summary_paper": "70;124;78;63", "wc_summary_review": "81;24;23;63", "wc_main_review": "348;478;337;257", "wc_review": "499;626;438;383", "wc_reply_reviewers": "65;328;75;0", "wc_reply_authors": "794;1270;708;584", "reply_reviewers": "1;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 83.75, 23.836683913665507 ], "wc_summary_review_avg": [ 47.75, 25.073641538476217 ], "wc_main_review_avg": [ 355.0, 79.22436493907666 ], "wc_review_avg": [ 486.5, 90.3894352233711 ], "wc_reply_reviewers_avg": [ 117.0, 125.17787344415146 ], "wc_reply_authors_avg": [ 839.0, 259.7941492797711 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 1.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11319655096813743934&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=J_2xNmVcY4", "email": "umass.edu;", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Massachusetts Amherst;Hampshire College", "aff_unique_dep": ";", "aff_unique_url": "https://www.umass.edu;https://www.hampshire.edu", "aff_unique_abbr": "UMass Amherst;Hampshire", "aff_campus_unique_index": "0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "DKM: Differentiable k-Means Clustering Layer for Neural Network Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6260", "id": "J_F_qqCE3Z5", "poster": "", "openreview": "https://openreview.net/forum?id=J_F_qqCE3Z5", "slides": "https://iclr.cc/virtual/2022/poster/6260", "video": "https://iclr.cc/virtual/2022/poster/6260", "author_site": "Minsik Cho, Keivan Alizadeh-Vahid, Saurabh Adya, Mohammad Rastegari", "tldr": "", "abstract": "Deep neural network (DNN) model compression for efficient on-device inference is becoming 
increasingly important to reduce memory requirements and keep user data on-device. To this end, we propose a novel differentiable k-means clustering layer (DKM) and its application to train-time weight clustering-based DNN model compression. DKM casts k-means clustering as an attention problem and enables joint optimization of the DNN parameters and clustering centroids. Unlike prior works that rely on additional regularizers and parameters, DKM-based compression keeps the original loss function and model architecture fixed. We evaluated DKM-based compression on various DNN models for computer vision and natural language processing (NLP) tasks. Our results demonstrate that DKM delivers a superior compression and accuracy trade-off on the ImageNet1k and GLUE benchmarks. For example, DKM-based compression can offer 74.5% top-1 ImageNet1k accuracy on the ResNet50 DNN model with a 3.3MB model size (29.4x model compression factor). For MobileNet-v1, which is a challenging DNN to compress, DKM delivers 63.9% top-1 ImageNet1k accuracy with a 0.72 MB model size (22.4x model compression factor). This result is 6.8% higher top-1 accuracy and a 33% smaller model size than the current state-of-the-art DNN compression algorithms. Additionally, DKM enables compression of the DistilBERT model by 11.8x with minimal (1.1%) accuracy loss on GLUE NLP benchmarks.", "keywords": "Deep learning;neural network;compression", "primary_area": "", "supplementary_material": "", "author": "Minsik Cho;Keivan Alizadeh-Vahid;Saurabh Adya;Mohammad Rastegari", "authorids": "~Minsik_Cho1;~Keivan_Alizadeh-Vahid1;sadya@apple.com;~Mohammad_Rastegari2", "gender": "M;;;M", "homepage": ";;;https://mrastegari.github.io/", "dblp": ";;;31/5228", "google_scholar": "_AZys7EAAAAJ;;;N4-2Z_cAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Minsik_Cho1;~Keivan_Alizadeh-Vahid1;sadya@apple.com;~Mohammad_Rastegari2", "aff": ";;;Department of Computer Science, University of Washington", "aff_domain": ";;;cs.washington.edu", "position": ";;;Assistant Professor", "bibtex": "@inproceedings{\ncho2022dkm,\ntitle={{DKM}: Differentiable k-Means Clustering Layer for Neural Network Compression},\nauthor={Minsik Cho and Keivan Alizadeh-Vahid and Saurabh Adya and Mohammad Rastegari},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=J_F_qqCE3Z5}\n}", "github": "", "project": "", "reviewers": "r68F;5RxN;JTKT;yNNv", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "33;43;71;45", "wc_summary_review": "34;208;18;41", "wc_main_review": "308;208;442;345", "wc_review": "375;459;531;431", "wc_reply_reviewers": "357;0;117;68", "wc_reply_authors": "1936;600;1905;833", "reply_reviewers": "1;0;1;2", "reply_authors": "3;1;4;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 48.0, 14.035668847618199 ], "wc_summary_review_avg": [ 75.25, 77.09531438420885 ], "wc_main_review_avg": [ 325.75, 83.76268560642023 ], "wc_review_avg": [ 449.0, 56.178287620752556 ], "wc_reply_reviewers_avg": [ 135.5, 134.4628201399926 ], "wc_reply_authors_avg": [ 1318.5, 607.7090175404674 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 20, 0 
], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2217153050461470082&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=J_F_qqCE3Z5", "email": ";;;cs.washington.edu", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Improving the Accuracy of Learning Example Weights for Imbalance Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6446", "id": "J_PHjw4gvXJ", "poster": "", "openreview": "https://openreview.net/forum?id=J_PHjw4gvXJ", "slides": "https://iclr.cc/virtual/2022/poster/6446", "video": "https://iclr.cc/virtual/2022/poster/6446", "author_site": "Yuqi Liu, Bin Cao, JING FAN", "tldr": "", "abstract": "To solve the imbalance classification, methods of weighting examples have been proposed. Recent work has studied to assign adaptive weights to training examples through learning mechanisms, that is, the weights, similar to classification models, are regarded as parameters that need to be learned. However, the algorithms in recent work use local information to approximately optimize the weights, which may lead to inaccurate learning of the weights. In this work, we first propose a novel mechanism of learning with a constraint, which can accurately train the weights and model. Then, we propose a combined method of our learning mechanism and the work by Hu et al., which can promote each other to perform better. Our proposed method can be applied to any type of deep network model. 
Experiments show that compared with the state-of-the-art algorithms, our method achieves significant improvements in a variety of settings, including text and image classification over different imbalance ratios and both binary and multi-class classification.", "keywords": "Imbalance classification;Meta learning;Data weighting.", "primary_area": "", "supplementary_material": "", "author": "Yuqi Liu;Bin Cao;Jing Fan", "authorids": "~Yuqi_Liu1;bincao@zjut.edu.cn;fanjing@zjut.edu.cn", "gender": "M;;", "homepage": ";;", "dblp": "35/9071;;", "google_scholar": ";;", "orcid": "0000-0003-0092-7001;;", "linkedin": ";;", "or_profile": "~Yuqi_Liu1;bincao@zjut.edu.cn;fanjing@zjut.edu.cn", "aff": "Zhejiang University of Technology;;", "aff_domain": "zjut.edu.cn;;", "position": "PhD student;;", "bibtex": "@inproceedings{\nliu2022improving,\ntitle={Improving the Accuracy of Learning Example Weights for Imbalance Classification},\nauthor={Yuqi Liu and Bin Cao and Jing Fan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=J_PHjw4gvXJ}\n}", "github": "", "project": "", "reviewers": "auNr;yKjB;XCGn;fpgG", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "86;43;86;103", "wc_summary_review": "94;42;125;33", "wc_main_review": "235;256;122;170", "wc_review": "415;341;333;306", "wc_reply_reviewers": "26;0;0;0", "wc_reply_authors": "202;835;355;412", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.5, 22.18670773233379 ], "wc_summary_review_avg": [ 73.5, 37.765725201563384 ], "wc_main_review_avg": [ 195.75, 53.08660377157311 ], "wc_review_avg": [ 348.75, 40.38796231552169 ], "wc_reply_reviewers_avg": [ 6.5, 11.258330249197702 ], "wc_reply_authors_avg": [ 451.0, 234.62416755313166 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16478119529238857180&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=J_PHjw4gvXJ", "email": "zjut.edu.cn;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Zhejiang University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.zjut.edu.cn", "aff_unique_abbr": "ZJUT", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "JbYk9VrZDS", "title": "Federated Learning with Data-Agnostic Distribution Fusion", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated learning has emerged as a promising distributed machine learning paradigm to preserve data privacy. One of the fundamental challenges of federated learning is that data samples across clients are usually not independent and identically distributed (non-IID), leading to slow convergence and a severe performance drop of the aggregated global model. 
In this paper, we propose a novel data-agnostic distribution fusion based model aggregation method called \\texttt{FedDAF} to optimize federated learning with non-IID local datasets, based on which the heterogeneous clients' data distributions can be represented by the fusion of several virtual components with different parameters and weights. We develop a variational autoencoder (VAE) method to derive the optimal parameters for the fusion distribution using the limited statistical information extracted from local models, which optimizes model aggregation for federated learning by solving a probabilistic maximization problem. Extensive experiments based on various federated learning scenarios with real-world datasets show that \\texttt{FedDAF} achieves significant performance improvement compared to the state-of-the-art.\n", "keywords": "Federated Learning;variational inference", "primary_area": "", "supplementary_material": "", "author": "Jian-hui Duan;Wenzhong Li;Sanglu Lu", "authorids": "~Jian-hui_Duan1;~Wenzhong_Li1;~Sanglu_Lu1", "gender": "M;M;F", "homepage": "https://enzoduan.github.io/;https://cs.nju.edu.cn/lwz/;https://cs.nju.edu.cn/58/1e/c2639a153630/page.htm", "dblp": ";98/3150;24/3318", "google_scholar": ";;", "orcid": ";;0000-0003-1467-4519", "linkedin": ";;", "or_profile": "~Jian-hui_Duan1;~Wenzhong_Li1;~Sanglu_Lu1", "aff": "Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "MS student;Full Professor;Full Professor", "bibtex": "@misc{\nduan2022federated,\ntitle={Federated Learning with Data-Agnostic Distribution Fusion},\nauthor={Jian-hui Duan and Wenzhong Li and Sanglu Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=JbYk9VrZDS}\n}", "github": "", "project": "", "reviewers": "SecD;tJH3;HoCx;4LUc", "site": "https://openreview.net/forum?id=JbYk9VrZDS", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;5;4", "correctness": "3;2;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "47;121;78;127", "wc_summary_review": "32;35;34;28", "wc_main_review": "342;421;406;120", "wc_review": "421;577;518;275", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 93.25, 32.713720363174836 ], "wc_summary_review_avg": [ 32.25, 2.680951323690902 ], "wc_main_review_avg": [ 322.25, 120.47899194465398 ], "wc_review_avg": [ 447.75, 114.23522880442793 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12591182261595898711&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "JeSIUeUSUuR", "title": "Variability of Neural 
Networks and Han-Layer: A Variability-Inspired Model", "track": "main", "status": "Reject", "tldr": "", "abstract": "What makes an artificial neural network easier to train or to generalize better than its peers? We introduce a notion of variability to view such issues under the setting of a fixed number of parameters, which is, in general, a dominant cost factor. Experiments verify that variability correlates positively with the number of activations and negatively with a phenomenon called Collapse to Constants, which is related but not identical to the vanishing gradient problem. Further experiments on stylized problems show that variability is indeed a key performance indicator for fully-connected neural networks. Guided by variability considerations, we propose a new architecture called Householder-absolute neural layers, or Han-layers for short, to build high-variability networks with a guaranteed immunity to gradient vanishing or exploding.\nOn small stylized models, Han-layer networks exhibit a far superior generalization ability over fully-connected networks. Extensive empirical results demonstrate that, by judiciously replacing fully-connected layers in large-scale networks such as MLP-Mixers, Han-layers can greatly reduce the number of model parameters while maintaining or improving generalization performance. We will also briefly discuss current limitations of the proposed Han-layer architecture.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yueyao Yu;Yin Zhang", "authorids": "~Yueyao_Yu1;~Yin_Zhang4", "gender": ";M", "homepage": ";", "dblp": ";91/3045", "google_scholar": "https://scholar.google.com.hk/citations?user=I7Is5TMAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Yueyao_Yu1;~Yin_Zhang4", "aff": "The Chinese University of HongKong, shenzhen;Rice University", "aff_domain": "cuhk.edu.cn;rice.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nyu2022variability,\ntitle={Variability of Neural Networks and Han-Layer: A Variability-Inspired Model},\nauthor={Yueyao Yu and Yin Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=JeSIUeUSUuR}\n}", "github": "", "project": "", "reviewers": "7Vdi;8jvQ;xnGa;QXHA", "site": "https://openreview.net/forum?id=JeSIUeUSUuR", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "2;2;2;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "73;130;113;62", "wc_summary_review": "35;90;48;43", "wc_main_review": "287;675;496;453", "wc_review": "395;895;657;558", "wc_reply_reviewers": "243;0;223;45", "wc_reply_authors": "295;437;502;297", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 94.5, 27.932955446926844 ], "wc_summary_review_avg": [ 54.0, 21.295539439046856 ], "wc_main_review_avg": [ 477.75, 138.05682706769701 ], "wc_review_avg": [ 626.25, 181.18136631563414 ], "wc_reply_reviewers_avg": [ 127.75, 106.6803051176739 ], "wc_reply_authors_avg": [ 382.75, 89.7451252158021 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, 
"gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HbhXCkgEil0J:scholar.google.com/&scioq=Variability+of+Neural+Networks+and+Han-Layer:+A+Variability-Inspired+Model&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Chinese University of Hong Kong;Rice University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.rice.edu", "aff_unique_abbr": "CUHK;Rice", "aff_campus_unique_index": "0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "JedTK_aOaRa", "title": "Private Multi-Winner Voting For Machine Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Private multi-winner voting is the task of revealing k-hot binary vectors that satisfy a bounded differential privacy guarantee. This task has been understudied in the machine learning literature despite its prevalence in many domains such as healthcare. We propose three new privacy-preserving multi-label mechanisms: Binary, $\\tau$, and Powerset voting. Binary voting operates independently per label through composition. $\\tau$ voting bounds votes optimally in their $\\ell_2$ norm. Powerset voting operates over the entire binary vector by viewing the possible outcomes as a power set. We theoretically analyze tradeoffs showing that Powerset voting requires strong correlations between labels to outperform Binary voting. We use these mechanisms to enable privacy-preserving multi-label learning by extending the canonical single-label technique: PATE. We empirically compare our techniques with DPSGD on large real-world healthcare data and standard multi-label benchmarks. We find that our techniques outperform all others in the centralized setting. We enable multi-label CaPC and show that our mechanisms can be used to collaboratively improve models in a multi-site (distributed) setting.", "keywords": "multi-label;privacy;voting;confidentiality;differential privacy;disributed collaboration;collaboration", "primary_area": "", "supplementary_material": "/attachment/8bd3ac427daf991abb6066f5fc1ec015047453b9.zip", "author": "Adam Dziedzic;Christopher A. 
Choquette-Choo;Natalie Dullerud;Vinith Menon Suriyakumar;Ali Shahin Shamsabadi;Muhammad Ahmad Kaleem;Somesh Jha;Nicolas Papernot;Xiao Wang", "authorids": "~Adam_Dziedzic1;~Christopher_A._Choquette-Choo1;~Natalie_Dullerud1;~Vinith_Menon_Suriyakumar1;~Ali_Shahin_Shamsabadi1;~Muhammad_Ahmad_Kaleem1;~Somesh_Jha1;~Nicolas_Papernot1;~Xiao_Wang11", "gender": ";M;;M;M;;M;M;M", "homepage": ";https://www.christopherchoquette.com;;;https://alishahin.github.io;;;https://www.papernot.fr;https://wangxiao1254.github.io/", "dblp": ";250/9674;;;198/1244;;j/SomeshJha;162/1405;150/9413", "google_scholar": ";oDE4I64AAAAJ;;https://scholar.google.com/citations?hl=en;1kVnWYwAAAAJ;;BaI7l8QAAAAJ;cGxq0cMAAAAJ;QbWLR8QAAAAJ", "orcid": ";;;;;;;;", "linkedin": ";christopher-choquette-choo/;natalie-dullerud-777ba5178/;vsuriyakumar;ali-shahin-shamsabadi-492544259/;;;nicolaspapernot;", "or_profile": "~Adam_Dziedzic1;~Christopher_A._Choquette-Choo1;~Natalie_Dullerud1;~Vinith_Menon_Suriyakumar1;~Ali_Shahin_Shamsabadi1;~Muhammad_Ahmad_Kaleem1;~Somesh_Jha1;~Nicolas_Papernot1;~Xiao_Wang11", "aff": ";Google Research;Toronto University;Massachusetts Institute of Technology;Vector;;Department of Computer Science, University of Wisconsin, Madison;Google;Northwestern University", "aff_domain": ";google.com;utoronto.ca;mit.edu;vectorinstitute.ai;;cs.wisc.edu;google.com;northwestern.edu", "position": ";AI Resident;MS student;PhD student;Postdoc;;Full Professor;Research Scientist;Assistant Professor", "bibtex": "@misc{\ndziedzic2022private,\ntitle={Private Multi-Winner Voting For Machine Learning},\nauthor={Adam Dziedzic and Christopher A. Choquette-Choo and Natalie Dullerud and Vinith Menon Suriyakumar and Ali Shahin Shamsabadi and Muhammad Ahmad Kaleem and Somesh Jha and Nicolas Papernot and Xiao Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=JedTK_aOaRa}\n}", "github": "", "project": "", "reviewers": "3vmh;Fb19;jFrr;6iWo;47xz", "site": "https://openreview.net/forum?id=JedTK_aOaRa", "pdf_size": 0, "recommendation": "3;5;5;5;8", "confidence": "4;2;2;3;3", "correctness": "3;3;4;4;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "3;2;2;2;3", "wc_summary_paper": "106;124;124;41;168", "wc_summary_review": "14;7;40;15;68", "wc_main_review": "1687;49;268;359;228", "wc_review": "1807;180;432;415;464", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "5209;743;889;1223;1778", "reply_reviewers": "0;0;0;0;0", "reply_authors": "14;2;3;2;5", "recommendation_avg": [ 5.2, 1.6 ], "confidence_avg": [ 2.8, 0.7483314773547882 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 112.6, 41.229115925520404 ], "wc_summary_review_avg": [ 28.8, 22.569005294872877 ], "wc_main_review_avg": [ 518.2, 593.0266098582762 ], "wc_review_avg": [ 659.6, 582.4831671387595 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1968.4, 1658.9196002217827 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 5.2, 4.534313619501854 ], "replies_avg": [ 36, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.30066889715147743, "corr_recommendation_correctness": -0.10206207261596574, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SPSdueyHAqwJ:scholar.google.com/&scioq=Private+Multi-Winner+Voting+For+Machine+Learning&hl=en&as_sdt=0,5", "gs_version_total": 6, "aff_unique_index": "0;1;2;3;4;0;5", "aff_unique_norm": "Google;University of 
Toronto;Massachusetts Institute of Technology;Vector Institute;University of Wisconsin-Madison;Northwestern University", "aff_unique_dep": "Google Research;;;;Department of Computer Science;", "aff_unique_url": "https://research.google;https://www.utoronto.ca;https://web.mit.edu;https://vectorinstitute.ai/;https://www.wisc.edu;https://www.northwestern.edu", "aff_unique_abbr": "Google Research;U of T;MIT;Vector;UW-Madison;NU", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Mountain View;;Madison", "aff_country_unique_index": "0;1;0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "Jep2ykGUdS", "title": "DEUP: Direct Epistemic Uncertainty Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Epistemic uncertainty is the part of out-of-sample prediction error due to the lack of knowledge of the learner. Whereas previous work focused on model variance, we propose a principled approach for directly estimating epistemic uncertainty by learning to predict generalization error and subtracting an estimate of aleatoric uncertainty, i.e., intrinsic unpredictability. This estimator of epistemic uncertainty includes the effect of model bias (or misspecification) and is useful in interactive learning environments arising in active learning or reinforcement learning. In addition to discussing these properties of Direct Epistemic Uncertainty Prediction (DEUP), we illustrate its advantage over existing methods for uncertainty estimation on downstream tasks including sequential model optimization and reinforcement learning. We also evaluate the quality of uncertainty estimates from DEUP for probabilistic classification of images and for estimating uncertainty about synergistic drug combinations. ", "keywords": "deep learning;uncertainty estimation", "primary_area": "", "supplementary_material": "/attachment/af466f8843bdfb77a8b6ddf892670ac368279ed1.zip", "author": "Moksh Jain;Salem Lahlou;Hadi Nekoei;Victor I Butoi;Paul Bertin;Jarrid Rector-Brooks;Maksym Korablyov;Yoshua Bengio", "authorids": "~Moksh_Jain1;~Salem_Lahlou1;~Hadi_Nekoei1;~Victor_I_Butoi1;~Paul_Bertin1;~Jarrid_Rector-Brooks2;~Maksym_Korablyov1;~Yoshua_Bengio1", "gender": "M;M;M;M;;M;M;M", "homepage": "https://mj10.github.io;https://hnekoeiq.github.io/;;;;http://yoshuabengio.org;https://la7.lu;https://victorbutoi.github.io/", "dblp": "249/9368;;;230/4010;;56/953;228/8314;", "google_scholar": "TD07G_wAAAAJ;https://scholar.google.ca/citations?user=7THNjzQAAAAJ;;gxRPZh4AAAAJ;TpuvCSwAAAAJ;kukA0LcAAAAJ;xLSkCrIAAAAJ;7aSLkMUAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;yoshuabengio/?originalSubdomain=ca;;victorbutoi/", "or_profile": "~Moksh_Jain1;~Hadi_Nekoei1;~Paul_Bertin1;~Jarrid_Rector-Brooks2;~Maksym_Korablyov1;~Yoshua_Bengio1;~Salem_Lahlou3;~Victor_Ion_Butoi1", "aff": "Universit\u00e9 de Montr\u00e9al;Mila - Quebec AI Institute;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;;University of Montreal;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Cornell University", "aff_domain": "umontreal.ca;mila.umontreal.ca;mila.umontreal.ca;mila.umontreal.ca;;umontreal.ca;mila.umontreal.ca;cornell.edu", "position": "MS student;PhD student;PhD student;PhD student;;Full Professor;PhD student;Undergrad student", "bibtex": "@misc{\njain2022deup,\ntitle={{DEUP}: Direct Epistemic Uncertainty Prediction},\nauthor={Moksh Jain 
and Salem Lahlou and Hadi Nekoei and Victor I Butoi and Paul Bertin and Jarrid Rector-Brooks and Maksym Korablyov and Yoshua Bengio},\nyear={2022},\nurl={https://openreview.net/forum?id=Jep2ykGUdS}\n}", "github": "", "project": "", "reviewers": "Scim;zaec;4Lh6;iSGN", "site": "https://openreview.net/forum?id=Jep2ykGUdS", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;4", "correctness": "2;4;2;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "65;192;161;119", "wc_summary_review": "20;56;124;51", "wc_main_review": "317;214;1072;543", "wc_review": "402;462;1357;713", "wc_reply_reviewers": "187;0;0;0", "wc_reply_authors": "1479;517;2454;1307", "reply_reviewers": "1;0;0;0", "reply_authors": "4;3;4;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 134.25, 47.64123738947174 ], "wc_summary_review_avg": [ 62.75, 37.95638944894522 ], "wc_main_review_avg": [ 536.5, 331.2812249434006 ], "wc_review_avg": [ 733.5, 378.4101610686478 ], "wc_reply_reviewers_avg": [ 46.75, 80.97337525384502 ], "wc_reply_authors_avg": [ 1439.25, 689.0814084707264 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 0.4330127018922193 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1550764792476576858&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;2;2;2;3", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Quebec AI Institute;University of Montreal;Cornell University", "aff_unique_dep": ";AI Institute;Montreal Institute for Learning Algorithms;", "aff_unique_url": "https://www.umontreal.ca;https://mila.quebec;https://www.umontreal.ca;https://www.cornell.edu", "aff_unique_abbr": "UdeM;Mila;UM;Cornell", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "Canada;United States" }, { "title": "Anisotropic Random Feature Regression in High Dimensions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6646", "id": "JfaWawZ8BmX", "poster": "", "openreview": "https://openreview.net/forum?id=JfaWawZ8BmX", "slides": "https://iclr.cc/virtual/2022/poster/6646", "video": "https://iclr.cc/virtual/2022/poster/6646", "author_site": "Gabriel Mel, Jeffrey Pennington", "tldr": "", "abstract": "In contrast to standard statistical wisdom, modern learning algorithms typically find their best performance in the overparameterized regime in which the model has many more parameters than needed to fit the training data. A growing number of recent works have shown that random feature models can offer a detailed theoretical explanation for this unexpected behavior, but typically these analyses have utilized isotropic distributional assumptions on the underlying data generation process, thereby failing to provide a realistic characterization of real-world models that are designed to identify and harness the structure in natural data. In this work, we examine the high-dimensional asymptotics of random feature regression in the presence of structured data, allowing for arbitrary input correlations and arbitrary alignment between the data and the weights of the target function. 
We define a partial order on the space of weight-data alignments and prove that generalization performance improves in response to stronger alignment. We also clarify several previous observations in the literature by distinguishing the behavior of the sample-wise and parameter-wise learning curves, finding that sample-wise multiple descent can occur at scales dictated by the eigenstructure of the data covariance, but that parameter-wise multiple descent is limited to double descent, although strong anisotropy can induce additional signatures such as wide plateaus and steep cliffs. Finally, these signatures are related to phase transitions in the spectrum of the feature kernel matrix, and unlike the double descent peak, persist even under optimal regularization.", "keywords": "random feature models;high dimensional asymptotics;generalization;learning curves;double descent;multiple descent;alignment", "primary_area": "", "supplementary_material": "/attachment/3b2a48f01870eb81d540e9c5ce4f7b87bd016e94.zip", "author": "Gabriel Mel;Jeffrey Pennington", "authorids": "~Gabriel_Mel1;~Jeffrey_Pennington1", "gender": "M;M", "homepage": "https://ganguli-gang.stanford.edu/;", "dblp": ";https://dblp.org/pers/p/Pennington:Jeffrey.html", "google_scholar": ";cn_FoswAAAAJ", "orcid": ";", "linkedin": ";jpennin", "or_profile": "~Gabriel_Mel1;~Jeffrey_Pennington1", "aff": "Stanford University;Google", "aff_domain": "stanford.edu;google.com", "position": "PhD student;Research Scientist", "bibtex": "@inproceedings{\nmel2022anisotropic,\ntitle={Anisotropic Random Feature Regression in High Dimensions},\nauthor={Gabriel Mel and Jeffrey Pennington},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=JfaWawZ8BmX}\n}", "github": "", "project": "", "reviewers": "3RfG;ot31;6qzE;bozT", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "204;123;101;169", "wc_summary_review": "255;334;59;169", "wc_main_review": "529;1195;326;98", "wc_review": "988;1652;486;436", "wc_reply_reviewers": "149;331;0;0", "wc_reply_authors": "1769;2530;1049;263", "reply_reviewers": "1;2;0;0", "reply_authors": "4;4;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 149.25, 40.01484099681017 ], "wc_summary_review_avg": [ 204.25, 102.16500134586208 ], "wc_main_review_avg": [ 537.0, 409.3500946622585 ], "wc_review_avg": [ 890.5, 489.7905164455514 ], "wc_reply_reviewers_avg": [ 120.0, 136.16350465524894 ], "wc_reply_authors_avg": [ 1402.75, 840.9816808349633 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7151670483503779960&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=JfaWawZ8BmX", "email": "stanford.edu;google.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": 
"https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "JgmY4TUgznC", "title": "Few-Shot Multi-task Learning via Implicit regularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Modern machine learning is highly data-intensive. Few-shot learning (FSL) aims to resolve this sample efficiency problem by learning from multiple tasks and quickly adapt to new tasks containing only a few samples. However, FSL problems proves to be significantly more challenging and require more compute expensive process to optimize. In this work, we consider multi-task linear regression (MTLR) as a canonical problem for few-shot learning, and investigate the source of challenge of FSL. We find that the MTLR exhibits local minimum problems that are not present in single-task problem, and thus making the learning much more challenging. We also show that the problem can be resolved by overparameterizing the model by increasing both the width and depth of the linear network and initializing the weights with small values, exploiting the implicit regularization bias of gradient descent-based learning. ", "keywords": "Few Shot Learning;Learning Instability", "primary_area": "", "supplementary_material": "", "author": "Dongsung Huh", "authorids": "~Dongsung_Huh1", "gender": "", "homepage": "", "dblp": "147/6326", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Dongsung_Huh1", "aff": "International Business Machines", "aff_domain": "ibm.com", "position": "Principal Researcher", "bibtex": "@misc{\nhuh2022fewshot,\ntitle={Few-Shot Multi-task Learning via Implicit regularization},\nauthor={Dongsung Huh},\nyear={2022},\nurl={https://openreview.net/forum?id=JgmY4TUgznC}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=JgmY4TUgznC", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h6AOTBBU9DYJ:scholar.google.com/&scioq=Few-Shot+Multi-task+Learning+via+Implicit+regularization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "International Business Machines Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com", "aff_unique_abbr": "IBM", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "Jh9VxCkrEZn", "title": "Spatiotemporal Representation Learning on Time Series with Dynamic Graph ODEs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Spatiotemporal representation 
learning on multivariate time series has received tremendous attention in forecasting traffic and energy data. Recent works either rely on complicated discrete neural architectures or graph priors, hindering their effectiveness and applications in the real world. In this paper, inspired by neural ordinary differential equations and graph structure learning, we propose a fully continuous model named Dynamic Graph ODE (DyG-ODE) to capture both long-range spatial and temporal dependencies to learn expressive representations on arbitrary multivariate time series data without being restricted by rigid preconditions (e.g., graph priors). For modeling the continuous dynamics of spatiotemporal clues, we design a simple yet powerful dynamic graph ODE by coupling the proposed spatial and temporal ODEs, which not only allows the model to obtain infinite spatial and temporal receptive fields but also reduces numerical errors and model complexity significantly. Our empirical evaluations demonstrate the superior effectiveness and efficiency of DyG-ODE on a number of benchmark datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ming Jin;Yuan-Fang Li;Yu Zheng;Bin Yang;Shirui Pan", "authorids": "~Ming_Jin3;~Yuan-Fang_Li1;~Yu_Zheng5;~Bin_Yang4;~Shirui_Pan1", "gender": "M;M;F;M;", "homepage": "https://mingjin.dev/;https://users.monash.edu.au/~yli/;;https://faculty.ecnu.edu.cn/_s37/yb2/main.psp;", "dblp": "34/3870-5;20/2537;;77/377-2;91/8171", "google_scholar": "I2xvKaIAAAAJ;https://scholar.google.com.tw/citations?user=wufXO1kAAAAJ;https://scholar.google.com.au/citations?user=j4pGvBgAAAAJ;qjBQhoUAAAAJ;https://scholar.google.com.au/citations?user=frWRJN4AAAAJ", "orcid": "0000-0002-6833-4811;;;0000-0002-1658-1079;0000-0003-0794-527X", "linkedin": ";;;;", "or_profile": "~Ming_Jin3;~Yuan-Fang_Li1;~Yu_Zheng5;~Bin_Yang4;~Shirui_Pan1", "aff": "Monash University;Monash University;Latrobe University;Aalborg University;Monash University", "aff_domain": "monash.edu;monash.edu;latrobe.edu.au;aau.dk;monash.edu", "position": "PhD student;Associate Professor;PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\njin2022spatiotemporal,\ntitle={Spatiotemporal Representation Learning on Time Series with Dynamic Graph {ODE}s},\nauthor={Ming Jin and Yuan-Fang Li and Yu Zheng and Bin Yang and Shirui Pan},\nyear={2022},\nurl={https://openreview.net/forum?id=Jh9VxCkrEZn}\n}", "github": "", "project": "", "reviewers": "JDxk;1brA;jWqH;ZzCu;yFFT", "site": "https://openreview.net/forum?id=Jh9VxCkrEZn", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;5;5;4;3", "correctness": "3;1;2;3;3", "technical_novelty": "2;3;2;3;2", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "60;137;47;105;80", "wc_summary_review": "286;64;17;84;36", "wc_main_review": "222;477;415;261;446", "wc_review": "568;678;479;450;562", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 2.4, 0.8 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.8, 32.232902444551904 ], "wc_summary_review_avg": [ 97.4, 97.05586020431738 ], "wc_main_review_avg": [ 364.2, 102.82684474396751 ], "wc_review_avg": [ 547.4, 79.82380597290509 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], 
"reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8451542547285165, "corr_recommendation_correctness": 0.5929270612815711, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11373871794083557209&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Monash University;La Trobe University;Aalborg University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.monash.edu;https://www.latrobe.edu.au;https://www.aau.dk", "aff_unique_abbr": "Monash;LaTrobe;AAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Australia;Denmark" }, { "title": "Rethinking Supervised Pre-Training for Better Downstream Transferring", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6590", "id": "Jjcv9MTqhcq", "poster": "", "openreview": "https://openreview.net/forum?id=Jjcv9MTqhcq", "slides": "https://iclr.cc/virtual/2022/poster/6590", "video": "https://iclr.cc/virtual/2022/poster/6590", "author_site": "Yutong Feng, Jianwen Jiang, Mingqian Tang, Rong Jin, Yue Gao", "tldr": "", "abstract": "The pretrain-finetune paradigm has shown outstanding performance on many applications of deep learning, where a model is pre-trained on an upstream large dataset (e.g. ImageNet), and is then fine-tuned to different downstream tasks. Though for most cases, the pre-training stage is conducted based on supervised methods, recent works on self-supervised pre-training have shown powerful transferability and even outperform supervised pre-training on multiple downstream tasks. It thus remains an open question how to better generalize supervised pre- training model to downstream tasks. In this paper, we argue that the worse transferability of existing supervised pre-training methods arise from the negligence of valuable intra-class semantic difference. This is because these methods tend to push images from the same class close to each other despite of the large diversity in their visual contents, a problem to which referred as \u201coverfit of upstream tasks\u201d. To alleviate this problem, we propose a new supervised pre-training method based on Leave-One-Out K-Nearest-Neighbor, or LOOK for short. It relieves the problem of overfitting upstream tasks by only requiring each image to share its class label with most of its k nearest neighbors, thus allowing each class to exhibit a multi-mode distribution and consequentially preserving part of intra-class difference for better transferring to downstream tasks. We developed efficient implementation of the proposed method that scales well to large datasets. 
Experimental studies on multiple downstream tasks show that LOOK outperforms other state-of-the-art methods for supervised and self-supervised pre-training.", "keywords": "Pre-Training;Contrastive Learning;Representation Learning;Downstream Transferring", "primary_area": "", "supplementary_material": "", "author": "Yutong Feng;Jianwen Jiang;Mingqian Tang;Rong Jin;Yue Gao", "authorids": "~Yutong_Feng2;~Jianwen_Jiang2;~Mingqian_Tang1;~Rong_Jin1;~Yue_Gao4", "gender": "M;;F;;M", "homepage": ";;;;http://www.gaoyue.org", "dblp": ";;;;33/3099-2", "google_scholar": "https://scholar.google.com.hk/citations?user=mZwJLeUAAAAJ;;;;UTDfWocAAAAJ", "orcid": ";;0000-0002-7117-6666;;", "linkedin": ";;;;", "or_profile": "~Yutong_Feng2;~Jianwen_Jiang2;~Mingqian_Tang1;~Rong_Jin1;~Yue_Gao4", "aff": "Tsinghua University;;Alibaba Group;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;alibaba-inc.com;;tsinghua.edu.cn", "position": "MS student;;Staff Algorithm Engineer;;Associate Professor", "bibtex": "@inproceedings{\nfeng2022rethinking,\ntitle={Rethinking Supervised Pre-Training for Better Downstream Transferring},\nauthor={Yutong Feng and Jianwen Jiang and Mingqian Tang and Rong Jin and Yue Gao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Jjcv9MTqhcq}\n}", "github": "", "project": "", "reviewers": "9e8a;mRy7;wg1q;PfhP", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;4", "correctness": "3;2;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "184;63;50;94", "wc_summary_review": "24;87;34;87", "wc_main_review": "404;342;124;414", "wc_review": "612;492;208;595", "wc_reply_reviewers": "137;148;101;0", "wc_reply_authors": "1257;2712;581;1180", "reply_reviewers": "2;1;2;0", "reply_authors": "4;5;3;3", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 97.75, 52.299020067301456 ], "wc_summary_review_avg": [ 58.0, 29.214722315983085 ], "wc_main_review_avg": [ 321.0, 117.0341830406826 ], "wc_review_avg": [ 476.75, 161.81374323585743 ], "wc_reply_reviewers_avg": [ 96.5, 58.36308764964376 ], "wc_reply_authors_avg": [ 1432.5, 783.6978052795605 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5859998130665760293&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Jjcv9MTqhcq", "email": "tsinghua.edu.cn;;alibaba-inc.com;;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tsinghua University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "THU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "JkVSM0X_4w_", "title": "Switch Spaces: Learning Product Spaces with Sparse Gating", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Aligning the geometric inductive bias well with the underlying structure of data is critical for representation learning. 
To achieve this goal, we propose \\textit{switch spaces}, a data-driven representation learning approach. Switch space is a generalization of product space which is composed of multiple euclidean and non-euclidean (e.g., hyperbolic, spherical) spaces. Given $N$ spaces, our model utilizes a sparse gating mechanism to let each input data point choose $K (K 2$), we aim to efficiently estimate the density ratios between all pairs of distributions. Such a generalization leads to important new applications such as estimating statistical discrepancy among multiple random variables like multi-distribution $f$-divergence and bias correction via multiple importance sampling. We then develop a general framework from the perspective of Bregman divergence minimization, where each strictly convex multivariate function induces a proper loss for multi-distribution DRE. Moreover, we formally relate multi-distribution density ratio estimation and class probability estimation, theoretically justifying the use of any strictly proper scoring rule composite with a link function for multi-distribution DRE. We show that our framework leads to methods that strictly generalize their counterparts in binary DRE, as well as new methods that show comparable or superior performance on various downstream tasks.", "keywords": "multi-distribution density ratio estimation;Bregman divergence;proper scoring rules", "primary_area": "", "supplementary_material": "", "author": "Lantao Yu;Yujia Jin;Stefano Ermon", "authorids": "~Lantao_Yu2;~Yujia_Jin1;~Stefano_Ermon1", "gender": "M;F;M", "homepage": "http://lantaoyu.com/;https://web.stanford.edu/~yujiajin/;http://cs.stanford.edu/~ermon/", "dblp": "186/7892;https://dblp.uni-trier.de/pers/hd/j/Jin:Yujia;47/8135", "google_scholar": "Ixg9n-EAAAAJ;XTncVoQAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lantao_Yu2;~Yujia_Jin1;~Stefano_Ermon1", "aff": "Computer Science Department, Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nyu2022a,\ntitle={A Unified Framework for Multi-distribution Density Ratio Estimation},\nauthor={Lantao Yu and Yujia Jin and Stefano Ermon},\nyear={2022},\nurl={https://openreview.net/forum?id=Lkx3Ta9rOSq}\n}", "github": "", "project": "", "reviewers": "Cnaf;PR1q;tUZo;mV2o", "site": "https://openreview.net/forum?id=Lkx3Ta9rOSq", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;3;3", "correctness": "3;4;4;2", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "100;55;135;195", "wc_summary_review": "46;83;100;62", "wc_main_review": "524;911;329;357", "wc_review": "670;1049;564;614", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 121.25, 51.15845482420281 ], "wc_summary_review_avg": [ 72.75, 20.48627589387588 ], "wc_main_review_avg": [ 530.25, 232.12429321378664 ], "wc_review_avg": [ 724.25, 191.20718474994604 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, 
"corr_recommendation_correctness": -0.3015113445777637, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4550840615437174101&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Autoregressive Diffusion Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6228", "id": "Lm8T39vLDTE", "poster": "", "openreview": "https://openreview.net/forum?id=Lm8T39vLDTE", "slides": "https://iclr.cc/virtual/2022/poster/6228", "video": "https://iclr.cc/virtual/2022/poster/6228", "author_site": "Emiel Hoogeboom, Alexey Gritsenko, Jasmijn Bastings, Ben Poole, Rianne van den Berg, Tim Salimans", "tldr": "", "abstract": "We introduce Autoregressive Diffusion Models (ARDMs), a model class encompassing and generalizing order-agnostic autoregressive models (Uria et al., 2014) and absorbing discrete diffusion (Austin et al., 2021), which we show are special cases of ARDMs under mild assumptions. ARDMs are simple to implement and easy to train. Unlike standard ARMs, they do not require causal masking of model representations, and can be trained using an efficient objective similar to modern probabilistic diffusion models that scales favourably to highly-dimensional data. At test time, ARDMs support parallel generation which can be adapted to fit any given generation budget. We find that ARDMs require significantly fewer steps than discrete diffusion models to attain the same performance. Finally, we apply ARDMs to lossless compression, and show that they are uniquely suited to this task. Contrary to existing approaches based on bits-back coding, ARDMs obtain compelling results not only on complete datasets, but also on compressing single data points. Moreover, this can be done using a modest number of network calls for (de)compression due to the model's adaptable parallel generation.", "keywords": "diffusion;autoregressive models;lossless compression", "primary_area": "", "supplementary_material": "", "author": "Emiel Hoogeboom;Alexey A. 
Gritsenko;Jasmijn Bastings;Ben Poole;Rianne van den Berg;Tim Salimans", "authorids": "~Emiel_Hoogeboom1;~Alexey_A._Gritsenko1;~Jasmijn_Bastings1;~Ben_Poole1;~Rianne_van_den_Berg1;~Tim_Salimans1", "gender": ";M;F;M;Not Specified;F", "homepage": ";https://cs.stanford.edu/~poole;https://research.google/people/RiannevandenBerg/;;;https://bastings.github.io", "dblp": "217/1488;16/10397;198/1077;116/2791;30/11478;146/3824", "google_scholar": "https://scholar.google.nl/citations?user=nkTd_BIAAAAJ;i5FMLA4AAAAJ;KARgiboAAAAJ;;https://scholar.google.nl/citations?user=zTy9cUwAAAAJ;VG_wuYkAAAAJ", "orcid": ";;0000-0001-5076-2802;;;0000-0002-5445-4417", "linkedin": ";;;;agritsenko/;jasmijn-bastings/", "or_profile": "~Emiel_Hoogeboom1;~Ben_Poole1;~Rianne_van_den_Berg1;~Tim_Salimans1;~Alexey_Alexeevich_Gritsenko1;~Jasmijn_Bastings2", "aff": "University of Amsterdam;Google;Microsoft;Google;Google;Google DeepMind", "aff_domain": "uva.nl;google.com;microsoft.com;google.com;google.com;google.com", "position": "PhD student;Research Scientist;Researcher;Research Scientist;Researcher;Researcher", "bibtex": "@inproceedings{\nhoogeboom2022autoregressive,\ntitle={Autoregressive Diffusion Models},\nauthor={Emiel Hoogeboom and Alexey A. Gritsenko and Jasmijn Bastings and Ben Poole and Rianne van den Berg and Tim Salimans},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Lm8T39vLDTE}\n}", "github": "", "project": "", "reviewers": "Y5ay;Pog3;y93W;g6Qq", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;3;4", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;2;4;4", "wc_summary_paper": "322;65;93;131", "wc_summary_review": "113;47;54;46", "wc_main_review": "844;56;99;288", "wc_review": "1279;168;246;465", "wc_reply_reviewers": "0;18;15;130", "wc_reply_authors": "471;257;189;266", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 152.75, 100.48476252646468 ], "wc_summary_review_avg": [ 65.0, 27.883686987197372 ], "wc_main_review_avg": [ 321.75, 313.89677841609017 ], "wc_review_avg": [ 539.5, 440.61462753748884 ], "wc_reply_reviewers_avg": [ 40.75, 51.977759666996036 ], "wc_reply_authors_avg": [ 295.75, 105.46889351842087 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.0, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8372819410014734365&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Lm8T39vLDTE", "email": "uva.nl;google.com;microsoft.com;google.com;google.com;google.com", "author_num": 6, "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "University of Amsterdam;Google;Microsoft", "aff_unique_dep": ";Google;Microsoft Corporation", "aff_unique_url": "https://www.uva.nl;https://www.google.com;https://www.microsoft.com", "aff_unique_abbr": "UvA;Google;Microsoft", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;1;2", "aff_country_unique": "Netherlands;United States;United Kingdom" }, { "id": "Ln5BeHxhVA3", "title": "FLBoost: 
On-the-Fly Fine-tuning Boosts Federated Learning via Data-free Distillation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated Learning (FL) is an emerging distributed learning paradigm for protecting privacy. Data heterogeneity is one of the main challenges in FL, which causes slow convergence and degraded performance. Most existing approaches tackle the heterogeneity challenge by restricting the local model update on the client, ignoring the performance drop caused by direct global model aggregation. In contrast, we propose a new solution, dubbed FLBoost: fine-tuning the global model on the server on the fly via data-free distillation to boost its performance and relieve the issue of direct model aggregation. Specifically, FLBoost adopts an adversarial distillation scheme to continually transfer the knowledge from local models to fine-tune the global model. In addition, focused distillation and attention-based ensemble techniques are developed to balance the extracted pseudo-knowledge to adapt to the data heterogeneity scenario, which implicitly mitigates the distribution discrepancy across clients. Extensive experiments show that our FLBoost can achieve superior performance compared to state-of-the-art FL algorithms and can serve as a strong plugin for enhancing FedAvg, FedProx, FedDyn, and SCAFFOLD.", "keywords": "federated learning;adversarial learning;on-the-fly fine-tuning", "primary_area": "", "supplementary_material": "", "author": "Lin Zhang;Li Shen;Liang Ding;Dacheng Tao;LINGYU DUAN", "authorids": "~Lin_Zhang8;~Li_Shen1;~Liang_Ding3;~Dacheng_Tao1;~LINGYU_DUAN1", "gender": ";M;M;;M", "homepage": ";https://sites.google.com/site/mathshenli/home;http://liamding.cc/;;http://eecs.pku.edu.cn/EN/People/Faculty/Detail/?ID=6096", "dblp": "d/LingyuDuan;91/3680-8;88/3340-6.html;;d/LingyuDuan", "google_scholar": "9jjtLOQAAAAJ;yVhgENIAAAAJ;lFCLvOAAAAAJ;;", "orcid": "0000-0003-1049-3104;;;;", "linkedin": ";;;;", "or_profile": "~Lin_Zhang8;~Li_Shen1;~Liang_Ding3;~Dacheng_Tao1;~LINGYU_DUAN1", "aff": "JD Explore Academy;JD Explore Academy;JD Explore Academy, JD.com Inc.;;Peking University", "aff_domain": "jd.com;jd.com;jd.com;;pku.edu.cn", "position": "Intern;Researcher;Research Scientist;;Full Professor", "bibtex": "@misc{\nzhang2022flboost,\ntitle={{FLB}oost: On-the-Fly Fine-tuning Boosts Federated Learning via Data-free Distillation},\nauthor={Lin Zhang and Li Shen and Liang Ding and Dacheng Tao and LINGYU DUAN},\nyear={2022},\nurl={https://openreview.net/forum?id=Ln5BeHxhVA3}\n}", "github": "", "project": "", "reviewers": "QdK9;Aw8p;8BsR", "site": "https://openreview.net/forum?id=Ln5BeHxhVA3", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "108;97;44", "wc_summary_review": "15;30;16", "wc_main_review": "416;409;149", "wc_review": "539;536;209", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "832;750;421", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 83.0, 27.94041278626117 ], "wc_summary_review_avg": [ 20.333333333333332, 6.847546194724712 ], "wc_main_review_avg": [ 324.6666666666667, 124.24795996536746 ], "wc_review_avg": [ 
428.0, 154.8612282012512 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 667.6666666666666, 177.6031781497417 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SGXUUFF2nJgJ:scholar.google.com/&scioq=FLBoost:+On-the-Fly+Fine-tuning+Boosts+Federated+Learning+via+Data-free+Distillation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "JD;JD.com Inc.;Peking University", "aff_unique_dep": "JD Explore Academy;JD Explore Academy;", "aff_unique_url": ";https://www.jd.com;http://www.pku.edu.cn", "aff_unique_abbr": ";JD.com;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";China" }, { "id": "Lr1iAFrtMcZ", "title": "Tuning Confidence Bound for Stochastic Bandits with Bandit Distance", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a novel modification of the standard upper confidence bound (UCB) method for the stochastic multi-armed bandit (MAB) problem which tunes the confidence bound of a given bandit based on its distance to others. Our UCB distance tuning (UCB-DT) formulation enables improved performance as measured by expected regret by preventing the MAB algorithm from focusing on non-optimal bandits which is a well-known deficiency of standard UCB. \"Distance tuning\" of the standard UCB is done using a proposed distance measure, which we call bandit distance, that is parameterizable and which therefore can be optimized to control the transition rate from exploration to exploitation based on problem requirements. We empirically demonstrate increased performance of UCB-DT versus many existing state-of-the-art methods which use the UCB formulation for the MAB problem. Our contribution also includes the development of a conceptual tool called the Exploration Bargain Point which gives insights into the tradeoffs between exploration and exploitation. 
We argue that the Exploration Bargain Point provides an intuitive perspective that is useful for comparatively analyzing the performance of UCB-based methods.", "keywords": "Multi-Armed Bandits;Stochastic Bandits;Upper Confidence Bound;Exploration-Exploitation Trade-off", "primary_area": "", "supplementary_material": "", "author": "Xinyu Zhang;Srinjoy Das;Ken Kreutz-Delgado\u202c", "authorids": "~Xinyu_Zhang7;srinjoy.das@mail.wvu.edu;kreutz@eng.ucsd.edu", "gender": "M;;", "homepage": "https://mlzxy.github.io/;;", "dblp": ";;", "google_scholar": "M7hnG9oAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xinyu_Zhang7;srinjoy.das@mail.wvu.edu;kreutz@eng.ucsd.edu", "aff": "Rutgers University;;", "aff_domain": "rutgers.edu;;", "position": "PhD student;;", "bibtex": "@misc{\nzhang2022tuning,\ntitle={Tuning Confidence Bound for Stochastic Bandits with Bandit Distance},\nauthor={Xinyu Zhang and Srinjoy Das and Ken Kreutz-Delgado\u202c},\nyear={2022},\nurl={https://openreview.net/forum?id=Lr1iAFrtMcZ}\n}", "github": "", "project": "", "reviewers": "aj3u;43yC;3MuM;bFUP", "site": "https://openreview.net/forum?id=Lr1iAFrtMcZ", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;3;4;5", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "67;54;45;233", "wc_summary_review": "2;52;26;65", "wc_main_review": "600;310;229;236", "wc_review": "669;416;300;534", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 99.75, 77.32843914110772 ], "wc_summary_review_avg": [ 36.25, 24.252577182641847 ], "wc_main_review_avg": [ 343.75, 151.311557721147 ], "wc_review_avg": [ 479.75, 137.05176941579413 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.0909090909090909, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9167622178638431626&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "LsLW5JE7qtV", "title": "Learning to Learn across Diverse Data Biases in Deep Face Recognition", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Convolutional Neural Networks have achieved remarkable success in face recognition, in part due to the abundant availability of data. However, the data used for training CNNs is often imbalanced. Prior works largely focus on the long-tailed nature of face datasets with respect to the number of instances per identity. In this paper, we show that besides the imbalanced class volume distribution, other variations such as ethnicity, head pose, occlusion and blur can also significantly affect accuracy. 
To address the problem, we propose a sample-level weighting approach called Multi-variation Cosine Margin (MvCoM), which orthogonally enhances the conventional cosine loss function to incorporate the importance of training samples. Further, we leverage a learning-to-learn approach, guided by a held-out meta-learning set, and use additive modeling to predict the MvCoM. Extensive experiments on challenging face recognition benchmarks demonstrate the advantages of our method in jointly handling imbalances due to multiple variations.", "keywords": "Imbalanced data classification;long-tailed classification;face recognition", "primary_area": "", "supplementary_material": "", "author": "Chang Liu;Xiang Yu;Yi-Hsuan Tsai;Ramin Moslemi;Masoud Faraki;Manmohan Chandraker;Yun Fu", "authorids": "~Chang_Liu13;~Xiang_Yu1;~Yi-Hsuan_Tsai1;rmoslemi@nec-labs.com;~Masoud_Faraki1;~Manmohan_Chandraker3;~Yun_Fu1", "gender": "M;M;M;;;;M", "homepage": "https://sites.google.com/view/cliu5/home;https://sites.google.com/site/xiangyurutgers/;https://sites.google.com/site/yihsuantsai/home;;https://www.nicta.com.au/category/research/computer-vision/people/mfaraki/;;http://www1.ece.neu.edu/~yunfu/", "dblp": "52/5716-22;19/2453-2.html;142/2924;;143/9779;;00/5815-1", "google_scholar": "unD8gu4AAAAJ;QJbtEKMAAAAJ;https://scholar.google.it/citations?user=zjI51wEAAAAJ;;zEVWJu0AAAAJ;;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": ";;;;;;0000-0002-5098-2853", "linkedin": ";;;;;;furaymond/", "or_profile": "~Chang_Liu13;~Xiang_Yu1;~Yi-Hsuan_Tsai1;rmoslemi@nec-labs.com;~Masoud_Faraki1;~Manmohan_Chandraker3;~Yun_Fu1", "aff": "Northeastern University;NEC;Phiar Technologies;;NEC-Labs;;Northeastern University", "aff_domain": "northeastern.edu;nec.com;phiar.net;;nec-labs.com;;northeastern.edu", "position": "PhD student;Researcher;Researcher;;Researcher;;Full Professor", "bibtex": "@misc{\nliu2022learning,\ntitle={Learning to Learn across Diverse Data Biases in Deep Face Recognition},\nauthor={Chang Liu and Xiang Yu and Yi-Hsuan Tsai and Ramin Moslemi and Masoud Faraki and Manmohan Chandraker and Yun Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=LsLW5JE7qtV}\n}", "github": "", "project": "", "reviewers": "UXyx;SWjx;MGEP;hU49;WiDP", "site": "https://openreview.net/forum?id=LsLW5JE7qtV", "pdf_size": 0, "recommendation": "3;5;5;5;8", "confidence": "4;3;4;4;4", "correctness": "3;3;3;3;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "113;86;308;68;75", "wc_summary_review": "79;29;175;18;62", "wc_main_review": "384;248;530;304;195", "wc_review": "576;363;1013;390;332", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.2, 1.6 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 130.0, 90.30836063178204 ], "wc_summary_review_avg": [ 72.6, 55.715706941579775 ], "wc_main_review_avg": [ 332.2, 117.06135143590303 ], "wc_review_avg": [ 534.8, 253.764772968984 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0625, "corr_recommendation_correctness": 0.875, "gs_citation": 26, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=15117528686003509918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Northeastern University;NEC Corporation;Phiar Technologies;NEC Laboratories", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.northeastern.edu;https://www.nec.com;;https://www.nec-labs.com", "aff_unique_abbr": "NEU;NEC;;NEC-Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;Japan" }, { "id": "LtI14EpWKH", "title": "Tessellated 2D Convolution Networks: A Robust Defence against Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data-driven (deep) learning approaches for image classification are prone to adversarial attacks. This means that an adversarial crafted image which is sufficiently close (visually indistinguishable) to its representative class can often be misclassified to be a member of a different class. A reason why deep neural approaches exhibits such vulnerability towards adversarial threats is mainly because the abstract representations learned in a data-driven manner often do not correlate well with human perceived features. To mitigate this problem, we propose the tessellated 2d convolution network, a novel divide-and-conquer based approach, which first independently learns the abstract representations of non-overlapping regions within an image, and then learns how to combine these representations to infer its class. It turns out that a non-uniform tiling of an image which ensures that the difference between the maximum and the minimum region sizes is not too large is the most robust way to construct such a tessellated 2d convolution network. This criterion can be achieved, among other schemes, by using a Mondrian tessellation of the input image. 
Our experiments demonstrate that our tessellated networks provide a more robust defence mechanism against gradient-based adversarial attacks in comparison to conventional deep neural models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Swarnava Das;Pabitra Mitra;Debasis Ganguly", "authorids": "swarnava.das27@kgpian.iitkgp.ac.in;~Pabitra_Mitra1;~Debasis_Ganguly2", "gender": ";M;M", "homepage": ";http://cse.iitkgp.ac.in/~pabitra/;https://gdebasis.github.io/", "dblp": ";m/PabitraMitra;41/7272", "google_scholar": ";https://scholar.google.com.tw/citations?user=5bXSZPYAAAAJ;FhQENQgAAAAJ", "orcid": ";0000-0002-1908-9813;0000-0003-0050-7138", "linkedin": ";pabitra-mitra-8028235/;deb4it/", "or_profile": "swarnava.das27@kgpian.iitkgp.ac.in;~Pabitra_Mitra1;~Debasis_Ganguly2", "aff": ";Indian Institute of Technology Kharagpur;University of Glasgow", "aff_domain": ";iitkgp.ac.in;glasgow.ac.uk", "position": ";Full Professor;Assistant Professor", "bibtex": "@misc{\ndas2022tessellated,\ntitle={Tessellated 2D Convolution Networks: A Robust Defence against Adversarial Attacks},\nauthor={Swarnava Das and Pabitra Mitra and Debasis Ganguly},\nyear={2022},\nurl={https://openreview.net/forum?id=LtI14EpWKH}\n}", "github": "", "project": "", "reviewers": "Znf4;m2PC;CSai;HrGf", "site": "https://openreview.net/forum?id=LtI14EpWKH", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;3", "correctness": "2;2;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "49;61;41;43", "wc_summary_review": "25;37;86;62", "wc_main_review": "389;197;249;227", "wc_review": "463;295;376;332", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 48.5, 7.794228634059948 ], "wc_summary_review_avg": [ 52.5, 23.5 ], "wc_main_review_avg": [ 265.5, 73.6529021831455 ], "wc_review_avg": [ 366.5, 62.659795722616266 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HMUJyA1u8BAJ:scholar.google.com/&scioq=Tessellated+2D+Convolution+Networks:+A+Robust+Defence+against+Adversarial+Attacks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Indian Institute of Technology Kharagpur;University of Glasgow", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitkgp.ac.in;https://www.gla.ac.uk", "aff_unique_abbr": "IIT Kharagpur;Glasgow", "aff_campus_unique_index": "0", "aff_campus_unique": "Kharagpur;", "aff_country_unique_index": "0;1", "aff_country_unique": "India;United Kingdom" }, { "title": "When Vision Transformers Outperform ResNets without Pre-training or Strong Data Augmentations", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6357", "id": "LtKcMgGOeLt", "poster": "", "openreview": "https://openreview.net/forum?id=LtKcMgGOeLt", "slides": "https://iclr.cc/virtual/2022/poster/6357", "video": "https://iclr.cc/virtual/2022/poster/6357", 
"author_site": "Xiangning Chen, Cho-Jui Hsieh, Boqing Gong", "tldr": "", "abstract": "Vision Transformers (ViTs) and MLPs signal further efforts on replacing hand-wired features or inductive biases with general-purpose neural architectures. Existing works empower the models by massive data, such as large-scale pre-training and/or repeated strong data augmentations, and still report optimization-related problems (e.g., sensitivity to initialization and learning rates). Hence, this paper investigates ViTs and MLP-Mixers from the lens of loss geometry, intending to improve the models' data efficiency at training and generalization at inference. Visualization and Hessian reveal extremely sharp local minima of converged models. By promoting smoothness with a recently proposed sharpness-aware optimizer, we substantially improve the accuracy and robustness of ViTs and MLP-Mixers on various tasks spanning supervised, adversarial, contrastive, and transfer learning (e.g., +5.3\\% and +11.0\\% top-1 accuracy on ImageNet for ViT-B/16 and Mixer-B/16, respectively, with the simple Inception-style preprocessing). We show that the improved smoothness attributes to sparser active neurons in the first few layers. The resultant ViTs outperform ResNets of similar size and throughput when trained from scratch on ImageNet without large-scale pre-training or strong data augmentations. Model checkpoints are available at \\url{https://github.com/google-research/vision_transformer}.", "keywords": "Vision Transformers;Optimization", "primary_area": "", "supplementary_material": "", "author": "Xiangning Chen;Cho-Jui Hsieh;Boqing Gong", "authorids": "~Xiangning_Chen1;~Cho-Jui_Hsieh1;~Boqing_Gong1", "gender": "M;M;M", "homepage": ";http://web.cs.ucla.edu/~chohsieh/index.html;http://boqinggong.info", "dblp": "56/7393;14/2770;29/7457", "google_scholar": "vNcBx1sAAAAJ;Wy89g4IAAAAJ;lv9ZeVUAAAAJ", "orcid": ";;", "linkedin": ";;boqing-gong-46aa5821/", "or_profile": "~Xiangning_Chen1;~Cho-Jui_Hsieh1;~Boqing_Gong1", "aff": "University of California, Los Angeles;University of California, Los Angeles;Google", "aff_domain": "cs.ucla.edu;ucla.edu;google.com", "position": "PhD student;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nchen2022when,\ntitle={When Vision Transformers Outperform ResNets without Pre-training or Strong Data Augmentations},\nauthor={Xiangning Chen and Cho-Jui Hsieh and Boqing Gong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=LtKcMgGOeLt}\n}", "github": "", "project": "", "reviewers": "y3Uf;hGsf;CmdC;1uek;4nyX", "pdf_size": 0, "recommendation": "5;6;8;8;8", "confidence": "4;4;5;4;4", "correctness": "4;3;3;3;3", "technical_novelty": "2;3;3;2;4", "empirical_novelty": "3;3;3;4;4", "wc_summary_paper": "66;58;141;98;55", "wc_summary_review": "93;38;47;79;31", "wc_main_review": "67;122;272;1481;287", "wc_review": "226;218;460;1658;373", "wc_reply_reviewers": "0;0;0;166;0", "wc_reply_authors": "217;289;289;1771;12", "reply_reviewers": "0;0;0;1;0", "reply_authors": "2;1;1;4;1", "recommendation_avg": [ 7.0, 1.2649110640673518 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 3.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 83.6, 32.512151574449824 ], "wc_summary_review_avg": [ 57.6, 24.146221236458512 ], "wc_main_review_avg": [ 445.8, 524.4804667478094 ], "wc_review_avg": [ 587.0, 
543.2215017835726 ], "wc_reply_reviewers_avg": [ 33.2, 66.4 ], "wc_reply_authors_avg": [ 515.6, 635.8457674625192 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.8, 1.1661903789690602 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3952847075210474, "corr_recommendation_correctness": -0.7905694150420949, "gs_citation": 410, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4049796223449388186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=LtKcMgGOeLt", "email": "cs.ucla.edu;ucla.edu;google.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Los Angeles;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.ucla.edu;https://www.google.com", "aff_unique_abbr": "UCLA;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Los Angeles;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "LtXNu_mJdJI", "title": "Mutual Information Continuity-constrained Estimator", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The estimation of mutual information (MI) is vital to a variety of applications in machine learning. Recent developments in neural approaches have shown encouraging potential in estimating the MI between high-dimensional variables based on their latent representations. However, these estimators are prone to high variance owing to inevitable outlier events. Recent approaches mitigate the outlier issue by smoothing the partition function using clipping or averaging strategies; however, these estimators either break the lower bound condition or sacrifice the level of accuracy. Accordingly, we propose the Mutual Information Continuity-constrained Estimator (MICE). MICE instead smooths the partition function by constraining the Lipschitz constant of the log-density ratio estimator, thus alleviating the induced variance without clipping or averaging. Our proposed estimator outperforms most of the existing estimators in terms of bias and variance in the standard benchmark. In addition, we propose an experiment extension based on the standard benchmark, where variables are drawn from a multivariate normal distribution with correlations between each sample in a batch. The experimental results imply that when the i.i.d. assumption is not fulfilled, our proposed estimator can be more accurate than the existing approaches, in which the MI tends to be underestimated. 
Finally, we demonstrate that MICE mitigates mode collapse in the kernel density estimation task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tsun-An Hsieh;Cheng Yu;Ying Hung;Chung-Ching Lin;Yu Tsao", "authorids": "~Tsun-An_Hsieh1;~Cheng_Yu3;~Ying_Hung1;~Chung-Ching_Lin2;~Yu_Tsao1", "gender": "M;M;F;M;", "homepage": "https://github.com/aleXiehta;https://williamyu1993.github.io;https://sites.google.com/view/ying-hung;https://www.citi.sinica.edu.tw/pages/yu.tsao/index_en.html;", "dblp": ";;;66/7146;37/8616", "google_scholar": "https://scholar.google.com.tw/citations?user=FWqLt5QAAAAJ;https://scholar.google.com.tw/citations?user=RaVT25oAAAAJ;Iy9yRb0AAAAJ;https://scholar.google.com.tw/citations?user=ZO5e5I4AAAAJ;legkbM0AAAAJ", "orcid": "0000-0002-3484-539X;0000-0001-5544-7449;;0000-0001-6956-0418;", "linkedin": ";williamchengyu/;;;", "or_profile": "~Tsun-An_Hsieh1;~Cheng_Yu3;~Ying_Hung1;~Yu_Tsao1;~Chung-ching_Lin1", "aff": "University of Illinois, Urbana Champaign;Academia Sinica;Rutgers University;Academia Sinica;Microsoft", "aff_domain": "illinois.edu;sinica.edu.tw;rutgers.edu;sinica.edu.tw;microsoft.com", "position": "PhD student;Researcher;Professor;Full Professor;Principal\u00a0Researcher", "bibtex": "@misc{\nhsieh2022mutual,\ntitle={Mutual Information Continuity-constrained Estimator},\nauthor={Tsun-An Hsieh and Cheng Yu and Ying Hung and Chung-Ching Lin and Yu Tsao},\nyear={2022},\nurl={https://openreview.net/forum?id=LtXNu_mJdJI}\n}", "github": "", "project": "", "reviewers": "KAMj;o3p1;rjDa;kvQy", "site": "https://openreview.net/forum?id=LtXNu_mJdJI", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "5;2;4;3", "correctness": "1;3;2;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;4", "wc_summary_paper": "32;140;52;96", "wc_summary_review": "63;27;96;52", "wc_main_review": "1183;187;519;243", "wc_review": "1278;354;667;391", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 80.0, 41.66533331199932 ], "wc_summary_review_avg": [ 59.5, 24.78406746278746 ], "wc_main_review_avg": [ 533.0, 395.7625550756413 ], "wc_review_avg": [ 672.5, 369.9138413198403 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.40755575681770734, "corr_recommendation_correctness": 0.7568892626614565, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gnmChjwL0lkJ:scholar.google.com/&scioq=Mutual+Information+Continuity-constrained+Estimator&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "University of Illinois Urbana-Champaign;Academia Sinica;Rutgers University;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://illinois.edu;https://www.sinica.edu.tw;https://www.rutgers.edu;https://www.microsoft.com", "aff_unique_abbr": "UIUC;Academia Sinica;Rutgers;Microsoft", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Urbana-Champaign;Taiwan;", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "United States;China" }, { 
"id": "Ltkwl64I91", "title": "Invariance-Guided Feature Evolution for Few-Shot Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Few-shot learning (FSL) aims to characterize the inherent visual relationship between support and query samples which can be well generalized to unseen classes so that we can accurately infer the labels of query samples from very few support samples. We observe that, in a successfully learned FSL model, this visual relationship and the learned features of the query samples should remain largely invariant across different configurations of the support set. Driven by this observation, we propose to construct a feature evolution network with an ensemble of few-shot learners evolving along different configuration dimensions. We choose to study two major parameters that control the support set configuration: the number of labeled samples per class (called shots) and the percentage of training samples (called partition) in the support set. Based on this network, we characterize and track the evolution behavior of learned query features across different shots-partition configurations, which will be minimized by a set of invariance loss functions during the training stage. Our extensive experimental results demonstrate that the proposed invariance-guided feature evolution (IGFE) method significantly improves the performance and generalization capability of few-shot learning and outperforms the state-of-the-art methods by large margins, especially in cross-domain classification tasks for generalization capability test. For example, in the cross-domain test on the fine-grained CUB image classification task, our method has improved the classification accuracy by more than 5%.", "keywords": "Few-shot learning;Invariance loss", "primary_area": "", "supplementary_material": "/attachment/946454715e4e60335db546c7615ebee50ff84d07.zip", "author": "Wenming Cao;Zhineng Zhao;Qifan Liu;Zhihai He", "authorids": "~Wenming_Cao2;~Zhineng_Zhao1;~Qifan_Liu1;~Zhihai_He3", "gender": ";M;;M", "homepage": "http://ceie.szu.edu.cn/info/1017/1069.htm;;;https://faculty.sustech.edu.cn/hezh/", "dblp": ";;243/3163;23/4027", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?hl=zh-CN;vbGNpy8AAAAJ;wtr6OgkAAAAJ", "orcid": "0000-0002-8174-6167;;;", "linkedin": ";;;", "or_profile": "~Wenming_Cao2;~Zhineng_Zhao1;~Qifan_Liu1;~Zhihai_He3", "aff": "Shenzhen University;College of Electronics and Information Engineering, Shenzhen University;College of Electronics and Information Engineering, Shenzhen University;Southern University of Science and Technology", "aff_domain": "szu.edu.cn;szu.edu;szu.edu;sustech.edu.cn", "position": "Full Professor;MS student;PhD student;Chair Professor", "bibtex": "@misc{\ncao2022invarianceguided,\ntitle={Invariance-Guided Feature Evolution for Few-Shot Learning},\nauthor={Wenming Cao and Zhineng Zhao and Qifan Liu and Zhihai He},\nyear={2022},\nurl={https://openreview.net/forum?id=Ltkwl64I91}\n}", "github": "", "project": "", "reviewers": "wbRK;dXjh;ZKYs;yhon", "site": "https://openreview.net/forum?id=Ltkwl64I91", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;5", "correctness": "3;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "93;97;77;32", "wc_summary_review": "51;10;30;25", "wc_main_review": "237;360;256;233", "wc_review": "381;467;363;290", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;237", "reply_reviewers": "0;0;0;0", 
"reply_authors": "0;0;0;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 74.75, 25.791229129298976 ], "wc_summary_review_avg": [ 29.0, 14.679918255903198 ], "wc_main_review_avg": [ 271.5, 51.829045910570265 ], "wc_review_avg": [ 375.25, 62.98561343672061 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 59.25, 102.62401034845598 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.3244428422615251, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pWfCHAhzPLgJ:scholar.google.com/&scioq=Invariance-Guided+Feature+Evolution+for+Few-Shot+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Shenzhen University;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.szu.edu.cn;https://www.sustech.edu.cn", "aff_unique_abbr": "SZU;SUSTech", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "Lv-G9XqLRRy", "title": "Restricted Category Removal from Model Representations using Limited Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning models are trained on multiple categories jointly to solve several real-world problems. However, there can be cases where some of the classes may become restricted in the future and need to be excluded after the model has already been trained on them (Class-level Privacy). It can be due to privacy, ethical or legal concerns. A naive solution is to simply train the model from scratch on the complete training data while leaving out the training samples from the restricted classes (FDR - full data retraining). But this can be a very time-consuming process. Further, this approach will not work well if we no longer have access to the complete training data and instead only have access to very few training data. The objective of this work is to remove the information about the restricted classes from the network representations of all layers using limited data without affecting the prediction power of the model for the remaining classes. Simply fine-tuning the model on the limited available training data for the remaining classes will not be able to sufficiently remove the restricted class information, and aggressive fine-tuning on the limited data may also lead to overfitting. We propose a novel solution to achieve this objective that is significantly faster ($\\sim200\\times$ on ImageNet) than the naive solution. Specifically, we propose a novel technique for identifying the model parameters that are mainly relevant to the restricted classes. We also propose a novel technique that uses the limited training data of the restricted classes to remove the restricted class information from these parameters and uses the limited training data of the remaining classes to reuse these parameters for the remaining classes. The model obtained through our approach behaves as if it was never trained on the restricted classes and performs similar to FDR (which needs the complete training data). 
We also propose several baseline approaches and compare our approach with them in order to demonstrate its efficacy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pratik Mazumder;Pravendra Singh;Mohammed Asad Karim", "authorids": "~Pratik_Mazumder2;~Pravendra_Singh1;~Mohammed_Asad_Karim1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/pravendra/;;https://sites.google.com/view/pratikmazumder", "dblp": "160/8743;;237/9769", "google_scholar": "YwDTxJMAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-1001-2219;;0000-0003-1103-1884", "linkedin": ";asad-karim-0938/;pratik-mazumder-553a8567/", "or_profile": "~Pravendra_Singh1;~Mohammed_Asad_Karim1;~PRATIK_MAZUMDER1", "aff": "Indian Institute of Technology, Roorkee;Carnegie Mellon University;IIT Kanpur", "aff_domain": "iitr.ac.in;andrew.cmu.edu;iitk.ac.in", "position": "Assistant Professor;MS student;PhD student", "bibtex": "@misc{\nmazumder2022restricted,\ntitle={Restricted Category Removal from Model Representations using Limited Data},\nauthor={Pratik Mazumder and Pravendra Singh and Mohammed Asad Karim},\nyear={2022},\nurl={https://openreview.net/forum?id=Lv-G9XqLRRy}\n}", "github": "", "project": "", "reviewers": "3V1Q;yBwg;Y25C;pyXW", "site": "https://openreview.net/forum?id=Lv-G9XqLRRy", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;4;2", "wc_summary_paper": "70;132;66;189", "wc_summary_review": "26;29;29;69", "wc_main_review": "455;244;359;274", "wc_review": "551;405;454;532", "wc_reply_reviewers": "248;45;48;18", "wc_reply_authors": "2888;819;1131;1042", "reply_reviewers": "2;1;1;1", "reply_authors": "6;3;4;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 114.25, 50.4696691092779 ], "wc_summary_review_avg": [ 38.25, 17.795715776557007 ], "wc_main_review_avg": [ 333.0, 82.10054811022884 ], "wc_review_avg": [ 485.5, 59.00211860602973 ], "wc_reply_reviewers_avg": [ 89.75, 92.10964933165255 ], "wc_reply_authors_avg": [ 1470.0, 826.533423401619 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.25, 1.0897247358851685 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1028847864144409612&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Indian Institute of Technology;Carnegie Mellon University;Indian Institute of Technology Kanpur", "aff_unique_dep": ";;", "aff_unique_url": "https://www.iitr.ac.in;https://www.cmu.edu;https://www.iitk.ac.in", "aff_unique_abbr": "IIT Roorkee;CMU;IITK", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Roorkee;;Kanpur", "aff_country_unique_index": "0;1;0", "aff_country_unique": "India;United States" }, { "id": "Lwclw6u3Pcw", "title": "Characterizing and Measuring the Similarity of Neural Networks with Persistent Homology", "track": "main", "status": "Reject", "tldr": "", "abstract": "Characterizing the structural properties of neural networks is crucial yet poorly understood, and there are no well-established similarity measures between networks. 
In this work, we observe that neural networks can be represented as abstract simplicial complexes and analyzed using their topological 'fingerprints' via Persistent Homology (PH). We then describe a PH-based representation for characterizing and measuring the similarity of neural networks. We empirically show the effectiveness of this representation as a descriptor of different architectures on several datasets. This approach based on Topological Data Analysis is a step towards better understanding neural networks and provides a useful similarity measure.", "keywords": "Neural Networks;Topological Data Analysis;similarity;Persistent Homology", "primary_area": "", "supplementary_material": "/attachment/b69f4cb59e53d407658feebe26f9cd6440cdea43.zip", "author": "David P\u00e9rez Fern\u00e1ndez;Asier Guti\u00e9rrez-Fandi\u00f1o;Jordi Armengol-Estap\u00e9;Marta Villegas", "authorids": "~David_P\u00e9rez_Fern\u00e1ndez1;~Asier_Guti\u00e9rrez-Fandi\u00f1o1;~Jordi_Armengol-Estap\u00e91;~Marta_Villegas2", "gender": "M;M;M;", "homepage": ";;https://jordiae.com/;", "dblp": ";281/8078;263/2700.html;", "google_scholar": ";https://scholar.google.es/citations?user=sYKGw0wAAAAJ;https://scholar.google.es/citations?user=CiHoJfcAAAAJ;", "orcid": "0000-0002-2214-0245;0000-0002-7368-6950;0000-0001-8893-6185;", "linkedin": ";asier-gutierrez-fandino/;jordiae;", "or_profile": "~David_P\u00e9rez_Fern\u00e1ndez1;~Asier_Guti\u00e9rrez-Fandi\u00f1o1;~Jordi_Armengol-Estap\u00e91;~Marta_Villegas2", "aff": ";Barcelona Supercomputing Center;Barcelona Supercomputing Center;", "aff_domain": ";bsc.es;bsc.es;", "position": ";Researcher;Researcher;", "bibtex": "@misc{\nfern{\\'a}ndez2022characterizing,\ntitle={Characterizing and Measuring the Similarity of Neural Networks with Persistent Homology },\nauthor={David P{\\'e}rez Fern{\\'a}ndez and Asier Guti{\\'e}rrez-Fandi{\\~n}o and Jordi Armengol-Estap{\\'e} and Marta Villegas},\nyear={2022},\nurl={https://openreview.net/forum?id=Lwclw6u3Pcw}\n}", "github": "", "project": "", "reviewers": "rtBj;WhHV;HwgX;ZGgm", "site": "https://openreview.net/forum?id=Lwclw6u3Pcw", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;5", "correctness": "2;3;3;1", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "123;108;76;52", "wc_summary_review": "38;65;68;32", "wc_main_review": "362;215;264;146", "wc_review": "523;388;408;230", "wc_reply_reviewers": "178;0;212;35", "wc_reply_authors": "890;398;1722;427", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;3;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 89.75, 27.625848403261752 ], "wc_summary_review_avg": [ 50.75, 15.927570436196476 ], "wc_main_review_avg": [ 246.75, 78.64278415722576 ], "wc_review_avg": [ 387.25, 104.38719988580975 ], "wc_reply_reviewers_avg": [ 106.25, 90.41121335321189 ], "wc_reply_authors_avg": [ 859.25, 534.9941004347618 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18253218687104770823&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", 
"aff_unique_norm": "Barcelona Supercomputing Center", "aff_unique_dep": "", "aff_unique_url": "https://www.bsc.es", "aff_unique_abbr": "BSC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Spain" }, { "title": "A Biologically Interpretable Graph Convolutional Network to Link Genetic Risk Pathways and Imaging Phenotypes of Disease", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6978", "id": "Lwr8We4MIxn", "poster": "", "openreview": "https://openreview.net/forum?id=Lwr8We4MIxn", "slides": "https://iclr.cc/virtual/2022/poster/6978", "video": "https://iclr.cc/virtual/2022/poster/6978", "author_site": "Sayan Ghosal, Qiang Chen, Giulio Pergola, Aaron Goldman, William Ulrich, Daniel Weinberger, Archana Venkataraman", "tldr": "", "abstract": "We propose a novel end-to-end framework for whole-brain and whole-genome imaging-genetics. Our genetics network uses hierarchical graph convolution and pooling operations to embed subject-level data onto a low-dimensional latent space. The hierarchical network implicitly tracks the convergence of genetic risk across well-established biological pathways, while an attention mechanism automatically identifies the salient edges of this network at the subject level. In parallel, our imaging network projects multimodal data onto a set of latent embeddings. For interpretability, we implement a Bayesian feature selection strategy to extract the discriminative imaging biomarkers; these feature weights are optimized alongside the other model parameters. We couple the imaging and genetic embeddings with a predictor network, to ensure that the learned representations are linked to phenotype. We evaluate our framework on a schizophrenia dataset that includes two functional MRI paradigms and gene scores derived from Single Nucleotide Polymorphism data. Using repeated 10-fold cross-validation, we show that our imaging-genetics fusion achieves the better classification performance than state-of-the-art baselines. In an exploratory analysis, we further show that the biomarkers identified by our model are reproducible and closely associated with deficits in schizophrenia. 
", "keywords": "Imaging-genetics;Hierarchical Graph Convolution;Gene Ontology;Bayesian Feature Selection;Schizophrenia", "primary_area": "", "supplementary_material": "/attachment/b4a7a1f16cabb63013ae6ad8bc7b899c840eba27.zip", "author": "Sayan Ghosal;Qiang Chen;Giulio Pergola;Aaron L Goldman;William Ulrich;Daniel R Weinberger;Archana Venkataraman", "authorids": "~Sayan_Ghosal1;~Qiang_Chen6;~Giulio_Pergola1;~Aaron_L_Goldman1;~William_Ulrich1;~Daniel_R_Weinberger1;~Archana_Venkataraman1", "gender": "M;;M;;Ma;M;F", "homepage": "https://sayangsep.github.io/;;https://persone.ict.uniba.it/rubrica/giulio.pergola;https://libd.org;https://www.libd.org;https://www.libd.org/;https://engineering.jhu.edu/nsa/", "dblp": ";;;;;;79/7823", "google_scholar": "uUsqt_4AAAAJ;s22hIQ0AAAAJ;https://scholar.google.it/citations?user=hqAZSycAAAAJ;;;;dDtlmCAAAAAJ", "orcid": ";;0000-0002-9193-1841;;;;", "linkedin": ";;giuliopergola/;;;;", "or_profile": "~Sayan_Ghosal1;~Qiang_Chen6;~Giulio_Pergola1;~Aaron_L_Goldman1;~William_Ulrich1;~Daniel_R_Weinberger1;~Archana_Venkataraman1", "aff": "Johns Hopkins University;Lieber Institute for Brain Development;University of Bari;Lieber Institute for Brain Development;;;Johns Hopkins University", "aff_domain": "jhu.edu;libd.org;uniba.it;libd.org;;;jhu.edu", "position": "PhD student;Principal Researcher;Assistant Professor;Research Associate;;;John C. Malone Assistant Professor", "bibtex": "@inproceedings{\nghosal2022a,\ntitle={A Biologically Interpretable Graph Convolutional Network to Link Genetic Risk Pathways and Imaging Phenotypes of Disease },\nauthor={Sayan Ghosal and Qiang Chen and Giulio Pergola and Aaron L Goldman and William Ulrich and Daniel R Weinberger and Archana Venkataraman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Lwr8We4MIxn}\n}", "github": "", "project": "", "reviewers": "1JN7;b8wX;WH5x;NHuX", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;4;2;4", "correctness": "2;2;4;3", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "55;61;122;74", "wc_summary_review": "27;24;21;75", "wc_main_review": "120;215;68;307", "wc_review": "202;300;211;456", "wc_reply_reviewers": "0;0;0;73", "wc_reply_authors": "532;1223;63;1840", "reply_reviewers": "0;0;0;1", "reply_authors": "1;3;1;4", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.0, 26.315394733881533 ], "wc_summary_review_avg": [ 36.75, 22.18529918662356 ], "wc_main_review_avg": [ 177.5, 91.4781394651203 ], "wc_review_avg": [ 292.25, 102.00582091233814 ], "wc_reply_reviewers_avg": [ 18.25, 31.60992723813201 ], "wc_reply_authors_avg": [ 914.5, 675.1075840190214 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.15789473684210528, "corr_recommendation_correctness": 0.3458572319330373, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4232651915105167205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Lwr8We4MIxn", "email": "jhu.edu;libd.org;uniba.it;libd.org;;;jhu.edu", "author_num": 7, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Johns Hopkins 
University;Lieber Institute for Brain Development;University of Bari", "aff_unique_dep": ";;", "aff_unique_url": "https://www.jhu.edu;https://www.lieberinstitute.org;https://www.uniba.it", "aff_unique_abbr": "JHU;;UNIBA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Italy" }, { "id": "Ly6_LGwoi_V", "title": "Target Layer Regularization for Continual Learning Using Cramer-Wold Generator", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose an effective regularization strategy (CW-TaLaR) for solving continual learning problems. It uses a penalizing term expressed by the Cramer-Wold distance between two probability distributions defined on a target layer of an underlying neural network that is shared by all tasks, and the simple architecture of the Cramer-Wold generator for modeling output data representation. Our strategy preserves the target layer distribution while learning a new task but does not require remembering previous tasks\u2019 datasets. We perform experiments involving several common supervised frameworks, which prove the competitiveness of the CW-TaLaR method in comparison to a few existing state-of-the-art continual learning models.", "keywords": "Continual learning;Cramer-Wold distance;regularization", "primary_area": "", "supplementary_material": "/attachment/f9d4fbfff5a708a4de6f88d83300ee20845075b4.zip", "author": "Marcin Mazur;\u0141ukasz Pustelnik;Szymon Knop;Patryk Pagacz;Przemys\u0142aw Spurek", "authorids": "~Marcin_Mazur1;~\u0141ukasz_Pustelnik1;~Szymon_Knop1;patryk.pagacz@uj.edu.pl;~Przemys\u0142aw_Spurek1", "gender": ";M;M;;M", "homepage": ";https://github.com/lukinio;;;http://ww2.ii.uj.edu.pl/~spurek/", "dblp": ";;;;77/10260", "google_scholar": ";;;;0kp0MbgAAAAJ", "orcid": ";;;;0000-0003-0097-5521", "linkedin": ";;;;spurek/", "or_profile": "~Marcin_Mazur1;~\u0141ukasz_Pustelnik1;~Szymon_Knop1;patryk.pagacz@uj.edu.pl;~Przemys\u0142aw_Spurek1", "aff": ";Jagiellonian University;Jagiellonian University;;Jagiellonian University", "aff_domain": ";uj.edu.pl;uj.edu.pl;;uj.edu.pl", "position": ";MS student;PhD student;;Assistant Professor", "bibtex": "@misc{\nmazur2022target,\ntitle={Target Layer Regularization for Continual Learning Using Cramer-Wold Generator},\nauthor={Marcin Mazur and {\\L}ukasz Pustelnik and Szymon Knop and Patryk Pagacz and Przemys{\\l}aw Spurek},\nyear={2022},\nurl={https://openreview.net/forum?id=Ly6_LGwoi_V}\n}", "github": "", "project": "", "reviewers": "wnE5;9j3v;mig4;tUnP", "site": "https://openreview.net/forum?id=Ly6_LGwoi_V", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "3;5;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;1;1;2", "wc_summary_paper": "96;87;83;73", "wc_summary_review": "36;59;48;29", "wc_main_review": "358;970;207;344", "wc_review": "490;1116;338;446", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.75, 8.257572282456872 ], "wc_summary_review_avg": [ 43.0, 11.467344941179714 ], "wc_main_review_avg": [ 469.75, 294.7832890446811 ], "wc_review_avg": [ 597.5, 304.42199329220614 ], "wc_reply_reviewers_avg": [ 0, 0 ], 
"wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1097668649830472125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Jagiellonian University", "aff_unique_dep": "", "aff_unique_url": "https://www.uj.edu.pl", "aff_unique_abbr": "UJ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Poland" }, { "id": "LzBBxCg-xpa", "title": "NViT: Vision Transformer Compression and Parameter Redistribution", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformers yield state-of-the-art results across many tasks. However, they still impose huge computational costs during inference. We apply global, structural pruning with latency-aware regularization on all parameters of the Vision Transformer (ViT) model for latency reduction. Furthermore, we analyze the pruned architectures and find interesting regularities in the final weight structure. Our discovered insights lead to a new architecture called NViT (Novel ViT), with a redistribution of where parameters are used. This architecture utilizes parameters more efficiently and enables control of the latency-accuracy trade-off. On ImageNet-1K, we prune the DEIT-Base (Touvron et al., 2021) model to a 2.6$\\times$ FLOPs reduction, 5.1$\\times$ parameter reduction, and 1.9$\\times$ run-time speedup with only 0.07% loss in accuracy. We achieve more than 1% accuracy gain when compressing the base model to the throughput of the Small/Tiny variants. 
NViT gains 0.1-1.1% accuracy over the hand-designed DEIT family when trained from scratch, while being faster.", "keywords": "Vision transformer;structural pruning;latency-aware;novel architecture", "primary_area": "", "supplementary_material": "", "author": "Huanrui Yang;Hongxu Yin;Pavlo Molchanov;Hai Li;Jan Kautz", "authorids": "~Huanrui_Yang1;~Hongxu_Yin2;~Pavlo_Molchanov1;~Hai_Li1;~Jan_Kautz1", "gender": "M;M;F;;M", "homepage": "https://sites.google.com/view/huanrui-yang;;https://ece.duke.edu/faculty/hai-helen-li;http://jankautz.com;https://hongxu-yin.github.io/", "dblp": "221/2845;165/8169.html;30/5330-1;48/6214;166/3425", "google_scholar": "bjNCUt8AAAAJ;J9PoyoIAAAAJ;E6Tpfq8AAAAJ;P9FclNEAAAAJ;4gdSoOYAAAAJ", "orcid": ";;0000-0003-3228-6544;;", "linkedin": ";;;;", "or_profile": "~Huanrui_Yang1;~Pavlo_Molchanov1;~Hai_Li1;~Jan_Kautz1;~Hongxu_Yin1", "aff": "Duke University;NVIDIA Research;Duke University;NVIDIA;NVIDIA", "aff_domain": "duke.edu;nvidia.com;duke.edu;nvidia.com;nvidia.com", "position": "PhD student;Research Scientist;Professor;VP Research;Research Scientist", "bibtex": "@misc{\nyang2022nvit,\ntitle={{NV}iT: Vision Transformer Compression and Parameter Redistribution},\nauthor={Huanrui Yang and Hongxu Yin and Pavlo Molchanov and Hai Li and Jan Kautz},\nyear={2022},\nurl={https://openreview.net/forum?id=LzBBxCg-xpa}\n}", "github": "", "project": "", "reviewers": "ZWjq;awr6;jSkV;6mQe", "site": "https://openreview.net/forum?id=LzBBxCg-xpa", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;3;3", "correctness": "2;4;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "56;60;31;73", "wc_summary_review": "45;5;45;34", "wc_main_review": "676;139;202;164", "wc_review": "777;204;278;271", "wc_reply_reviewers": "176;0;604;0", "wc_reply_authors": "1098;276;2931;1112", "reply_reviewers": "1;0;3;0", "reply_authors": "3;1;8;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 55.0, 15.215124054702938 ], "wc_summary_review_avg": [ 32.25, 16.361158271956175 ], "wc_main_review_avg": [ 295.25, 220.96761640566248 ], "wc_review_avg": [ 382.5, 229.5893072423017 ], "wc_reply_reviewers_avg": [ 195.0, 246.82584953768517 ], "wc_reply_authors_avg": [ 1354.25, 971.2250962058177 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 3.5, 2.692582403567252 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.3244428422615251, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12090878931068945950&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0;1;1", "aff_unique_norm": "Duke University;NVIDIA", "aff_unique_dep": ";NVIDIA Research", "aff_unique_url": "https://www.duke.edu;https://www.nvidia.com/research", "aff_unique_abbr": "Duke;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Anomaly Transformer: Time Series Anomaly Detection with Association Discrepancy", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7023", "id": "LzQQ89U1qm_", "poster": "", "openreview": "https://openreview.net/forum?id=LzQQ89U1qm_", "slides": 
"https://iclr.cc/virtual/2022/poster/7023", "video": "https://iclr.cc/virtual/2022/poster/7023", "author_site": "Jiehui Xu, haixu wu, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Unsupervised detection of anomaly points in time series is a challenging problem, which requires the model to derive a distinguishable criterion. Previous methods tackle the problem mainly through learning pointwise representation or pairwise association, however, neither is sufficient to reason about the intricate dynamics. Recently, Transformers have shown great power in unified modeling of pointwise representation and pairwise association, and we find that the self-attention weight distribution of each time point can embody rich association with the whole series. Our key observation is that due to the rarity of anomalies, it is extremely difficult to build nontrivial associations from abnormal points to the whole series, thereby, the anomalies' associations shall mainly concentrate on their adjacent time points. This adjacent-concentration bias implies an association-based criterion inherently distinguishable between normal and abnormal points, which we highlight through the Association Discrepancy. Technically, we propose the Anomaly Transformer with a new Anomaly-Attention mechanism to compute the association discrepancy. A minimax strategy is devised to amplify the normal-abnormal distinguishability of the association discrepancy. The Anomaly Transformer achieves state-of-the-art results on six unsupervised time series anomaly detection benchmarks of three applications: service monitoring, space & earth exploration, and water treatment.", "keywords": "Time series anomaly detection;Transformers;Anomaly attention;Association discrepancy", "primary_area": "", "supplementary_material": "", "author": "Jiehui Xu;Haixu Wu;Jianmin Wang;Mingsheng Long", "authorids": "~Jiehui_Xu1;~Haixu_Wu1;~Jianmin_Wang1;~Mingsheng_Long5", "gender": "M;M;M;M", "homepage": ";https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;https://scholar.google.com/citations?user=oV6zeAwAAAAJ&hl=zh-TW;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": "286/8115;06/3456-1.html;;74/9023", "google_scholar": "oLL_x0wAAAAJ;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;oV6zeAwAAAAJ;_MjXpXkAAAAJ", "orcid": ";0000-0001-6841-7943;;0000-0002-5412-9120", "linkedin": ";;;", "or_profile": "~Haixu_Wu1;~Jianmin_Wang1;~X_Jh1;~Mingsheng_Long2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Full Professor;MS student;Associate Professor", "bibtex": "@inproceedings{\nxu2022anomaly,\ntitle={Anomaly Transformer: Time Series Anomaly Detection with Association Discrepancy},\nauthor={Jiehui Xu and Haixu Wu and Jianmin Wang and Mingsheng Long},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=LzQQ89U1qm_}\n}", "github": "", "project": "", "reviewers": "nXE3;BKSk;88ie;bfbm", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;5", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "115;115;79;49", "wc_summary_review": "67;69;12;23", "wc_main_review": "200;344;266;582", "wc_review": "382;528;357;654", "wc_reply_reviewers": "0;413;107;34", "wc_reply_authors": "421;2060;947;1020", "reply_reviewers": "0;2;1;1", "reply_authors": "2;5;4;3", "recommendation_avg": [ 
7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 89.5, 27.617928959282953 ], "wc_summary_review_avg": [ 42.75, 25.557533136044253 ], "wc_main_review_avg": [ 348.0, 144.39529078193652 ], "wc_review_avg": [ 480.25, 119.70040726747759 ], "wc_reply_reviewers_avg": [ 138.5, 163.1295497449803 ], "wc_reply_authors_avg": [ 1112.0, 594.1115215176357 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 768, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12471325118803603403&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=LzQQ89U1qm_", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "M-9bPO0M2K5", "title": "MetaBalance: High-Performance Neural Networks for Class-Imbalanced Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Class-imbalanced data, in which some classes contain far more samples than others, is ubiquitous in real-world applications. Standard techniques for handling class-imbalance usually work by training on a re-weighted loss or on re-balanced data. Unfortunately, training overparameterized neural networks on such objectives causes rapid memorization of minority class data. To avoid this trap, we harness meta-learning, which uses both an \"outer-loop\" and an \"inner-loop\" loss, each of which may be balanced using different strategies. We evaluate our method, MetaBalance, on image classification, credit-card fraud detection, loan default prediction, and facial recognition tasks with severely imbalanced data. We find that MetaBalance outperforms a wide array of popular strategies designed to handle class-imbalance, especially in scenarios with very few samples in minority classes.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/6fb0d3fc66bc9bd5673196a525765519760b21b6.zip", "author": "Arpit Bansal;Micah Goldblum;Valeriia Cherepanova;Avi Schwarzschild;C. 
Bayan Bruss;Tom Goldstein", "authorids": "~Arpit_Bansal1;~Micah_Goldblum1;~Valeriia_Cherepanova1;~Avi_Schwarzschild1;~C._Bayan_Bruss1;~Tom_Goldstein1", "gender": ";F;M;M;M;M", "homepage": ";https://www.vcherepanova.com/;https://cs.umd.edu/~avi1;https://www.cbbruss.com;https://www.cs.umd.edu/~tomg/;https://arpitbansal297.github.io/", "dblp": "241/7231;;249/9334.html;;25/8184;190/9114", "google_scholar": "pGDKzuUAAAAJ;PySUqqUAAAAJ;WNvQ7AcAAAAJ;ClqvGRQAAAAJ;KmSuVtgAAAAJ;Pchxm4IAAAAJ", "orcid": ";;;;;", "linkedin": ";;;bayan-bruss/;;arpit-bansal-970865b1/", "or_profile": "~Micah_Goldblum1;~Valeriia_Cherepanova1;~Avi_Schwarzschild1;~C._Bayan_Bruss1;~Tom_Goldstein1;~Arpit_Amit_Bansal1", "aff": "New York University;Amazon;University of Maryland, College Park;Capital One;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "nyu.edu;amazon.com;umd.edu;capitalone.com;umd.edu;umd.edu", "position": "Postdoc;Intern;PhD student;Director of Applied Research;Associate Professor;PhD student", "bibtex": "@misc{\nbansal2022metabalance,\ntitle={MetaBalance: High-Performance Neural Networks for Class-Imbalanced Data},\nauthor={Arpit Bansal and Micah Goldblum and Valeriia Cherepanova and Avi Schwarzschild and C. Bayan Bruss and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=M-9bPO0M2K5}\n}", "github": "", "project": "", "reviewers": "tbZ3;16HT;5Nup;bejS", "site": "https://openreview.net/forum?id=M-9bPO0M2K5", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;3;3", "wc_summary_paper": "43;67;42;198", "wc_summary_review": "90;86;21;116", "wc_main_review": "436;476;366;305", "wc_review": "569;629;429;619", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 87.5, 64.57747285238096 ], "wc_summary_review_avg": [ 78.25, 35.00267846894006 ], "wc_main_review_avg": [ 395.75, 65.5376800016601 ], "wc_review_avg": [ 561.5, 79.80444849756184 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5552710310646441043&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;2;2", "aff_unique_norm": "New York University;Amazon;University of Maryland;Capital One", "aff_unique_dep": ";Amazon.com, Inc.;;", "aff_unique_url": "https://www.nyu.edu;https://www.amazon.com;https://www.umd.edu;https://www.capitalone.com", "aff_unique_abbr": "NYU;Amazon;UMD;Capital One", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "M2sNIiCC6C", "title": "Self-supervised regression learning using domain knowledge: Applications to improving self-supervised image denoising", "track": "main", "status": "Reject", "tldr": "", "abstract": "Regression that predicts continuous quantities is a central part of 
applications using computational imaging and computer vision technologies. Yet, studying and understanding self-supervised learning for regression tasks -- except for a particular regression task, image denoising -- has lagged behind. This paper proposes a general self-supervised regression learning (SSRL) framework that enables learning regression neural networks with only input data (but without ground-truth target data), by using a designable operator that encapsulates domain knowledge of a specific application. The paper underlines the importance of domain knowledge by showing that under some mild conditions, the better the designable operator is, the closer the proposed SSRL loss becomes to the ordinary supervised learning loss. Numerical experiments for natural image denoising and low-dose computed tomography denoising demonstrate that the proposed SSRL significantly improves the denoising quality over several existing self-supervised denoising methods.", "keywords": "Self-supervised learning;Regression;Image denoising;Deep learning", "primary_area": "", "supplementary_material": "/attachment/8af2f9c67bdda1d70c896a60ad3a771f704b97fd.zip", "author": "Il Yong Chun;Dongwon Park;Xuehang Zheng;Se Young Chun;Yong Long", "authorids": "~Il_Yong_Chun1;~Dongwon_Park1;~Xuehang_Zheng1;~Se_Young_Chun2;~Yong_Long1", "gender": ";M;M;;F", "homepage": ";;;;https://www.ji.sjtu.edu.cn/about/faculty-staff/faculty-directory/faculty-detail/102/", "dblp": ";55/11454;;;", "google_scholar": ";https://scholar.google.co.kr/citations?user=7kEM5QYAAAAJ;2Ukz_u8AAAAJ;;", "orcid": ";;0000-0002-4063-0344;;", "linkedin": ";;;;", "or_profile": "~Il_Yong_Chun1;~Dongwon_Park1;~Xuehang_Zheng1;~Se_Young_Chun2;~Yong_Long1", "aff": ";Ulsan National Institute of Science and Technology;;;", "aff_domain": ";unist.ac.kr;;;", "position": ";PhD student;;;", "bibtex": "@misc{\nchun2022selfsupervised,\ntitle={Self-supervised regression learning using domain knowledge: Applications to improving self-supervised image denoising},\nauthor={Il Yong Chun and Dongwon Park and Xuehang Zheng and Se Young Chun and Yong Long},\nyear={2022},\nurl={https://openreview.net/forum?id=M2sNIiCC6C}\n}", "github": "", "project": "", "reviewers": "iEcr;1ABd;vDqQ", "site": "https://openreview.net/forum?id=M2sNIiCC6C", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "5;2;3", "correctness": "3;3;4", "technical_novelty": "1;3;3", "empirical_novelty": "1;3;3", "wc_summary_paper": "58;77;46", "wc_summary_review": "180;23;24", "wc_main_review": "397;128;244", "wc_review": "635;228;314", "wc_reply_reviewers": "318;0;0", "wc_reply_authors": "2549;460;889", "reply_reviewers": "1;0;0", "reply_authors": "6;2;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 60.333333333333336, 12.762793146051099 ], "wc_summary_review_avg": [ 75.66666666666667, 73.77593705869747 ], "wc_main_review_avg": [ 256.3333333333333, 110.16452342847138 ], "wc_review_avg": [ 392.3333333333333, 175.1462880629282 ], "wc_reply_reviewers_avg": [ 106.0, 149.90663761154806 ], "wc_reply_authors_avg": [ 1299.3333333333333, 900.8367715013019 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 15, 0 ], 
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uBP4qe3erqMJ:scholar.google.com/&scioq=Self-supervised+regression+learning+using+domain+knowledge:+Applications+to+improving+self-supervised+image+denoising&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Ulsan National Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.unist.ac.kr", "aff_unique_abbr": "UNIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "M34fCMVKxn", "title": "Unsupervised Image Decomposition with Phase-Correlation Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The ability to decompose scenes into their object components is a desired property for autonomous agents, allowing them to reason and act in their surroundings.\nRecently, different methods have been proposed to learn object-centric representations from data in an unsupervised manner. These methods often rely on latent\nrepresentations learned by deep neural networks, hence requiring high computational costs and large amounts of curated data. Such models are also difficult to\ninterpret. To address these challenges, we propose the Phase-Correlation Decomposition Network (PCDNet), a novel model that decomposes a scene into its object\ncomponents, which are represented as transformed versions of a set of learned object prototypes. The core building block in PCDNet is the Phase-Correlation Cell\n(PC Cell), which exploits the frequency-domain representation of the images in order to estimate the transformation between an object prototype and its \ntransformed version in the image. 
In our experiments, we show how PCDNet outperforms state-of-the-art methods for unsupervised object discovery and \nsegmentation on simple benchmark datasets and on more challenging data, while using a small number of learnable parameters and being fully interpretable.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Angel Villar-Corrales;Sven Behnke", "authorids": "~Angel_Villar-Corrales1;~Sven_Behnke1", "gender": "M;M", "homepage": "http://angelvillarcorrales.com;http://ais.uni-bonn.de/behnke", "dblp": "279/4028;16/6112", "google_scholar": "NCUoKLMAAAAJ;https://scholar.google.de/citations?user=1xx3X_0AAAAJ", "orcid": ";0000-0002-5040-7525", "linkedin": ";sven-behnke-a566492", "or_profile": "~Angel_Villar-Corrales1;~Sven_Behnke1", "aff": "University of Bonn;University of Bonn", "aff_domain": "uni-bonn.de;uni-bonn.de", "position": "PhD student;Full Professor", "bibtex": "@misc{\nvillar-corrales2022unsupervised,\ntitle={Unsupervised Image Decomposition with Phase-Correlation Networks},\nauthor={Angel Villar-Corrales and Sven Behnke},\nyear={2022},\nurl={https://openreview.net/forum?id=M34fCMVKxn}\n}", "github": "", "project": "", "reviewers": "3mzp;7cEo;VXuC;xPnA", "site": "https://openreview.net/forum?id=M34fCMVKxn", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;5;2", "correctness": "3;4;3;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;1;0;2", "wc_summary_paper": "113;99;59;76", "wc_summary_review": "104;73;23;19", "wc_main_review": "589;270;241;225", "wc_review": "806;442;323;320", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 86.75, 20.765054779605084 ], "wc_summary_review_avg": [ 54.75, 35.51320177060919 ], "wc_main_review_avg": [ 331.25, 149.68362468887503 ], "wc_review_avg": [ 472.75, 198.59427861849395 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14265776595550603387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0", "aff_unique_norm": "University of Bonn", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-bonn.de/", "aff_unique_abbr": "UBonn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "M5hiCgL7qt", "title": "The NTK Adversary: An Approach to Adversarial Attacks without any Model Access", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial attacks carefully perturb natural inputs, so that a machine learning algorithm produces erroneous decisions on them. Most successful attacks on neural networks exploit gradient information of the model (either directly or by estimating it through querying the model). Harnessing recent advances in Deep Learning theory, we propose a radically different attack that eliminates that need. 
In particular, in the regime where the Neural Tangent Kernel theory holds, we derive a simple but powerful strategy for attacking models, which, in contrast to prior work, does not require any access to the model under attack, or any trained replica of it for that matter. Instead, we leverage the explicit description afforded by the NTK to maximally perturb the output of the model, using solely information about the model structure and the training data. We experimentally verify the efficacy of our approach, first on models that lie close to the theoretical assumptions (large width, proper initialization, etc.) and, further, on more practical scenarios, with those assumptions relaxed. In addition, we show that our perturbations exhibit strong transferability between models.", "keywords": "Adversarial Attack;Neural Tangent Kernel;Adversarial Examples", "primary_area": "", "supplementary_material": "", "author": "Nikolaos Tsilivis;Julia Kempe", "authorids": "~Nikolaos_Tsilivis1;~Julia_Kempe1", "gender": ";", "homepage": "https://tsili42.github.io;", "dblp": "312/6719;", "google_scholar": "uQ83NcQAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Nikolaos_Tsilivis1;~Julia_Kempe1", "aff": "New York University;", "aff_domain": "nyu.edu;", "position": "PhD student;", "bibtex": "@misc{\ntsilivis2022the,\ntitle={The {NTK} Adversary: An Approach to Adversarial Attacks without any Model Access},\nauthor={Nikolaos Tsilivis and Julia Kempe},\nyear={2022},\nurl={https://openreview.net/forum?id=M5hiCgL7qt}\n}", "github": "", "project": "", "reviewers": "ttzq;CStK;YPLb;jJ86;BKjF", "site": "https://openreview.net/forum?id=M5hiCgL7qt", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "4;3;4;5;4", "correctness": "1;3;3;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "47;44;37;249;167", "wc_summary_review": "41;32;24;39;309", "wc_main_review": "233;251;349;254;872", "wc_review": "321;327;410;542;1348", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "529;980;472;720;1277", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;2;1;1;2", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.6, 0.8000000000000002 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 108.8, 85.10793147527437 ], "wc_summary_review_avg": [ 89.0, 110.16169933329823 ], "wc_main_review_avg": [ 391.8, 243.49735111495565 ], "wc_review_avg": [ 589.6, 387.50411610717117 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 795.6, 299.08433593219155 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.6454972243679028, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EYQmxITZNq4J:scholar.google.com/&scioq=The+NTK+Adversary:+An+Approach+to+Adversarial+Attacks+without+any+Model+Access&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "PEARL: Data Synthesis via Private Embeddings and Adversarial Reconstruction Learning", "status": "Poster", "track": "main", "site":
"https://iclr.cc/virtual/2022/poster/6335", "id": "M6M8BEmd6dq", "poster": "", "openreview": "https://openreview.net/forum?id=M6M8BEmd6dq", "slides": "https://iclr.cc/virtual/2022/poster/6335", "video": "https://iclr.cc/virtual/2022/poster/6335", "author_site": "Seng Pei Liew, Tsubasa Takahashi, Michihiko Ueno", "tldr": "", "abstract": "We propose a new framework of synthesizing data using deep generative models in a differentially private manner.\nWithin our framework, sensitive data are sanitized with rigorous privacy guarantees in a one-shot fashion, such that training deep generative models is possible without re-using the original data.\nHence, no extra privacy costs or model constraints are incurred, in contrast to popular gradient sanitization approaches, which, among other issues, cause degradation in privacy guarantees as the training iteration increases.\nWe demonstrate a realization of our framework by making use of the characteristic function and an adversarial re-weighting objective, which are of independent interest as well.\nOur proposal has theoretical guarantees of performance, and empirical evaluations on multiple datasets show that our approach outperforms other methods at reasonable levels of privacy.", "keywords": "Differential Privacy;Generative Model", "primary_area": "", "supplementary_material": "", "author": "Seng Pei Liew;Tsubasa Takahashi;Michihiko Ueno", "authorids": "~Seng_Pei_Liew1;~Tsubasa_Takahashi1;~Michihiko_Ueno1", "gender": "Not Specified;M;M", "homepage": "https://spliew.github.io/;https://sites.google.com/view/tsubasa-takahashi/;", "dblp": "259/7221;85/5862-1;", "google_scholar": "KQL8tB8AAAAJ;s-jrZ94AAAAJ;", "orcid": "0000-0003-2419-2505;0000-0002-0646-0222;", "linkedin": ";;michihiko-ueno-1b740596/", "or_profile": "~Seng_Pei_Liew1;~Tsubasa_Takahashi1;~Michihiko_Ueno1", "aff": "LINE Corporation;LINE Corporation;LINE Corporation", "aff_domain": "linecorp.com;linecorp.com;linecorp.com", "position": "Researcher;Senior Researcher;Researcher", "bibtex": "@inproceedings{\nliew2022pearl,\ntitle={{PEARL}: Data Synthesis via Private Embeddings and Adversarial Reconstruction Learning},\nauthor={Seng Pei Liew and Tsubasa Takahashi and Michihiko Ueno},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=M6M8BEmd6dq}\n}", "github": "", "project": "", "reviewers": "Xui7;kXZv;qSyM", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;4", "correctness": "3;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "109;10;95", "wc_summary_review": "104;22;37", "wc_main_review": "366;124;130", "wc_review": "579;156;262", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "716;477;193", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 71.33333333333333, 43.74420596553966 ], "wc_summary_review_avg": [ 54.333333333333336, 35.64952859280034 ], "wc_main_review_avg": [ 206.66666666666666, 112.69230477523989 ], "wc_review_avg": [ 332.3333333333333, 179.7077875020693 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 462.0, 213.77714252619867 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=743073351071645886&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=M6M8BEmd6dq", "email": "linecorp.com;linecorp.com;linecorp.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "LINE Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.linecorp.com", "aff_unique_abbr": "LINE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "M6jm8fRG5eq", "title": "Decentralized Cooperative Multi-Agent Reinforcement Learning with Exploration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many real-world applications of multi-agent reinforcement learning (RL), such as multi-robot navigation and decentralized control of cyber-physical systems, involve the cooperation of agents as a team with aligned objectives. We study multi-agent RL in the most basic cooperative setting --- Markov teams --- a class of Markov games where the cooperating agents share a common reward. We propose an algorithm in which each agent independently runs stage-based V-learning (a Q-learning style algorithm) to efficiently explore the unknown environment, while using a stochastic gradient descent (SGD) subroutine for policy updates. We show that the agents can learn an $\\epsilon$-approximate Nash equilibrium policy in at most $\\propto\\widetilde{O}(1/\\epsilon^4)$ episodes. Our results advocate the use of a novel \\emph{stage-based} V-learning approach to create a stage-wise stationary environment. We also show that under certain smoothness assumptions of the team, our algorithm can achieve a nearly \\emph{team-optimal} Nash equilibrium. Simulation results corroborate our theoretical findings. One key feature of our algorithm is being \\emph{decentralized}, in the sense that each agent has access to only the state and its local actions, and is even \\emph{oblivious} to the presence of the other agents. Neither communication among teammates nor coordination by a central controller is required during learning. Hence, our algorithm can readily generalize to an arbitrary number of agents, without suffering from the exponential dependence on the number of agents. 
", "keywords": "decentralized control;decentralized learning;game theory;reinforcement learning;reinforcement learning theory", "primary_area": "", "supplementary_material": "/attachment/67d5471d88b5773b74a67da9d7e5d83fb15b0cfb.zip", "author": "Weichao Mao;Tamer Basar;Lin Yang;Kaiqing Zhang", "authorids": "~Weichao_Mao1;~Tamer_Basar1;~Lin_Yang12;~Kaiqing_Zhang3", "gender": ";M;;", "homepage": ";http://tamerbasar.csl.illinois.edu/;;", "dblp": ";b/TamerBasar;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Weichao_Mao1;~Tamer_Basar1;~Lin_Yang12;~Kaiqing_Zhang3", "aff": ";University of Illinois, Urbana Champaign;;", "aff_domain": ";illinois.edu;;", "position": ";Emeritus;;", "bibtex": "@misc{\nmao2022decentralized,\ntitle={Decentralized Cooperative Multi-Agent Reinforcement Learning with Exploration},\nauthor={Weichao Mao and Tamer Basar and Lin Yang and Kaiqing Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=M6jm8fRG5eq}\n}", "github": "", "project": "", "reviewers": "8WCV;iszg;hhmV;t9pe", "site": "https://openreview.net/forum?id=M6jm8fRG5eq", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;2;4;2", "correctness": "2;3;3;1", "technical_novelty": "3;3;3;4", "empirical_novelty": "1;3;3;4", "wc_summary_paper": "45;65;37;142", "wc_summary_review": "5;33;30;36", "wc_main_review": "248;254;258;234", "wc_review": "298;352;325;412", "wc_reply_reviewers": "582;37;0;0", "wc_reply_authors": "957;728;382;314", "reply_reviewers": "3;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 72.25, 41.541395017500314 ], "wc_summary_review_avg": [ 26.0, 12.30853362509117 ], "wc_main_review_avg": [ 248.5, 9.096702699330127 ], "wc_review_avg": [ 346.75, 42.23372467590326 ], "wc_reply_reviewers_avg": [ 154.75, 247.13495806947265 ], "wc_reply_authors_avg": [ 595.25, 261.27703209428876 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": -0.2955402316445243, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6243973783797236817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Learning Strides in Convolutional Neural Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7068", "id": "M752z9FKJP", "poster": "", "openreview": "https://openreview.net/forum?id=M752z9FKJP", "slides": "https://iclr.cc/virtual/2022/poster/7068", "video": "https://iclr.cc/virtual/2022/poster/7068", "author_site": "Rachid Riad, Olivier Teboul, David Grangier, Neil Zeghidour", "tldr": "", "abstract": "Convolutional neural networks typically contain several downsampling operators, such as strided convolutions or pooling layers, that progressively reduce the resolution of intermediate representations. 
This provides some shift-invariance while reducing the computational complexity of the whole architecture. A critical hyperparameter of such layers is their stride: the integer factor of downsampling. As strides are not differentiable, finding the best configuration either requires cross-validation or discrete optimization (e.g. architecture search), which rapidly becomes prohibitive as the search space grows exponentially with the number of downsampling layers. Hence, exploring this search space by gradient descent would allow finding better configurations at a lower computational cost. This work introduces DiffStride, the first downsampling layer with learnable strides. Our layer learns the size of a cropping mask in the Fourier domain that effectively performs resizing in a differentiable way. Experiments on audio and image classification show the generality and effectiveness of our solution: we use DiffStride as a drop-in replacement to standard downsampling layers and outperform them. In particular, we show that introducing our layer into a ResNet-18 architecture allows keeping consistent high performance on CIFAR10, CIFAR100 and ImageNet even when training starts from poor random stride configurations. Moreover, formulating strides as learnable variables allows us to introduce a regularization term that controls the computational complexity of the architecture. We show how this regularization allows trading off accuracy for efficiency on ImageNet.", "keywords": "Strides;Convolutional neural networks;Downsampling;Spectral representations;Fourier", "primary_area": "", "supplementary_material": "", "author": "Rachid Riad;Olivier Teboul;David Grangier;Neil Zeghidour", "authorids": "~Rachid_Riad1;~Olivier_Teboul2;~David_Grangier1;~Neil_Zeghidour1", "gender": "M;M;M;M", "homepage": "https://rachine.github.io/;;http://david.grangier.info/;", "dblp": ";77/1749;57/1192;180/2570", "google_scholar": "https://scholar.google.fr/citations?user=eIWHQnoAAAAJ;ep0OfyAAAAAJ;CIQEGCYAAAAJ;", "orcid": ";;0000-0002-8847-9532;", "linkedin": ";;davidgrangier/;", "or_profile": "~Rachid_Riad1;~Olivier_Teboul2;~David_Grangier1;~Neil_Zeghidour1", "aff": "Ecole Normale Superieure;Google;Google;Google", "aff_domain": "ens.fr;google.com;google.com;google.com", "position": "PhD student;Software Engineer;Researcher;Research Scientist", "bibtex": "@inproceedings{\nriad2022learning,\ntitle={Learning Strides in Convolutional Neural Networks},\nauthor={Rachid Riad and Olivier Teboul and David Grangier and Neil Zeghidour},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=M752z9FKJP}\n}", "github": "", "project": "", "reviewers": "ubnz;34rN;e1mG;BTMH", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;3;4", "correctness": "3;4;4;4", "technical_novelty": "3;4;3;3", "empirical_novelty": "3;4;4;3", "wc_summary_paper": "60;126;116;57", "wc_summary_review": "48;17;54;63", "wc_main_review": "174;216;698;120", "wc_review": "282;359;868;240", "wc_reply_reviewers": "22;0;116;0", "wc_reply_authors": "629;514;1435;744", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 89.75, 31.467244874631145 ], "wc_summary_review_avg": [ 45.5, 17.298843892006193 ], "wc_main_review_avg": [ 302.0,
231.14930239998563 ], "wc_review_avg": [ 437.25, 252.32853088780905 ], "wc_reply_reviewers_avg": [ 34.5, 47.90354892907205 ], "wc_reply_authors_avg": [ 830.5, 358.3563170923599 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1272651603956213806&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=M752z9FKJP", "email": "ens.fr;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Ecole Normale Superieure;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.ens.fr;https://www.google.com", "aff_unique_abbr": "ENS;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "France;United States" }, { "id": "MACKPM_haAu", "title": "Adversarial Attack by Limited Point Cloud Surface Modifications", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent research has revealed that the security of deep neural networks that directly process 3D point clouds to classify objects can be threatened by adversarial samples. Although existing adversarial attack methods achieve high success rates, they do not restrict the point modifications enough to preserve the point cloud appearance. To overcome this shortcoming, two constraints are proposed. These include applying hard boundary constraints on the number of modified points and on the point perturbation norms. Due to the restrictive nature of the problem, the search space contains many local maxima. The proposed method addresses this issue by using a high step-size at the beginning of the algorithm to search the main surface of the point cloud quickly and effectively. Then, in order to converge to the desired output, the step-size is gradually decreased. To evaluate the performance of the proposed method, it is run on the ModelNet40 and ScanObjectNN datasets by employing the state-of-the-art point cloud classification models, including PointNet, PointNet++, and DGCNN. The obtained results show that it can perform successful attacks and achieve state-of-the-art results with only a limited number of point modifications while preserving the appearance of the point cloud. Moreover, due to the effective search algorithm, it can perform successful attacks in just a few steps. Additionally, the proposed step-size scheduling algorithm shows an improvement of up to $14.5\\%$ when adopted by other methods as well.
The proposed method also performs effectively against popular defense methods.", "keywords": "Adversarial Attack;3D Point Cloud;Step-Size Scheduling", "primary_area": "", "supplementary_material": "/attachment/ca3ea504b59fc510a1af44e71e302d70ceef5da3.zip", "author": "Atrin Arya;Hanieh Naderi;Shohreh Kasaei", "authorids": "~Atrin_Arya1;~Hanieh_Naderi2;~Shohreh_Kasaei2", "gender": "M;F;F", "homepage": ";;https://sharif.edu/~kasaei/", "dblp": ";;", "google_scholar": "4cuAcN0AAAAJ;j6DVuoQAAAAJ;mvx4PvgAAAAJ", "orcid": ";;0000-0002-3831-0878", "linkedin": "atrin-a-4182a4123/;hanieh-naderi-74975673/;shohreh-kasaei-b34558102/", "or_profile": "~Atrin_Arya1;~Hanieh_Naderi2;~Shohreh_Kasaei2", "aff": "Sharif University of Technology;Sharif University of Technology;Sharif University of Technology", "aff_domain": "sharif.ir;sharif.edu;sharif.edu", "position": "Undergrad student;PhD student;Full Professor", "bibtex": "@misc{\narya2022adversarial,\ntitle={Adversarial Attack by Limited Point Cloud Surface Modifications},\nauthor={Atrin Arya and Hanieh Naderi and Shohreh Kasaei},\nyear={2022},\nurl={https://openreview.net/forum?id=MACKPM_haAu}\n}", "github": "", "project": "", "reviewers": "jpG6;i9SR;WCF9;PzdS", "site": "https://openreview.net/forum?id=MACKPM_haAu", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;4", "correctness": "2;2;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "75;28;61;90", "wc_summary_review": "14;11;33;19", "wc_main_review": "221;106;289;307", "wc_review": "310;145;383;416", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "263;195;426;232", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.5, 22.91833327273168 ], "wc_summary_review_avg": [ 19.25, 8.437268515343103 ], "wc_main_review_avg": [ 230.75, 78.8428024616071 ], "wc_review_avg": [ 313.5, 104.57174570599842 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 279.0, 88.21847878987712 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3862472917912502482&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Sharif University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.sharif.edu", "aff_unique_abbr": "SUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Iran" }, { "id": "MAYipnUpHHD", "title": "Reinforcement Learning for Adaptive Mesh Refinement", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large-scale finite element simulations of complex physical systems governed by partial differential equations (PDE) crucially depend on adaptive mesh refinement (AMR) to allocate computational budget to regions where higher resolution is required. Existing scalable AMR methods make heuristic refinement decisions based on instantaneous error estimation and thus do not aim for long-term optimality over an entire simulation. 
We propose a novel formulation of AMR as a Markov decision process and apply deep reinforcement learning (RL) to train refinement {\\it policies} directly from simulation. AMR poses a new problem for RL as both the state dimension and available action set change at every step, which we solve by proposing new policy architectures with differing generality and inductive bias. The model sizes of these policy architectures are independent of the mesh size and hence can be deployed on larger simulations than those used at train time. We demonstrate in comprehensive experiments on static function estimation and time-dependent equations that RL policies can be trained on problems without using ground truth solutions, are competitive with a widely-used error estimator, and generalize to larger, more complex, and unseen test problems.", "keywords": "reinforcement learning;adaptive mesh refinement;finite element method", "primary_area": "", "supplementary_material": "", "author": "Jiachen Yang;Tarik Dzanic;Brenden K. Petersen;Jun Kudo;Ketan Mittal;Jean-Sylvain Camier;Vladimir Tomov;Tuo Zhao;Hongyuan Zha;Tzanio Kolev;Robert Anderson;Daniel faissol", "authorids": "~Jiachen_Yang1;tdzanic96@gmail.com;~Brenden_K._Petersen1;kudo4@llnl.gov;~Ketan_Mittal1;camier1@llnl.gov;tomov2@llnl.gov;~Tuo_Zhao1;~Hongyuan_Zha1;kolev1@llnl.gov;anderson110@llnl.gov;~Daniel_faissol1", "gender": ";M;;;M;;;M;;;;M", "homepage": ";;;;https://people.llnl.gov/mittal3;;;http://www2.isye.gatech.edu/~tzhao80;;;;http://www.llnl.gov", "dblp": ";;;;230/4556;;;;z/HongyuanZha;;;", "google_scholar": ";;;;3E1WydIAAAAJ;;;EJXN6tYAAAAJ;n1DQMIsAAAAJ;;;", "orcid": ";;;;my-orcid?orcid=0000-0002-2062-852X;;;;;;;0000-0001-6965-983X", "linkedin": ";;;;;;;;;;;", "or_profile": "~Jiachen_Yang1;tdzanic96@gmail.com;~Brenden_K._Petersen1;kudo4@llnl.gov;~Ketan_Mittal1;camier1@llnl.gov;tomov2@llnl.gov;~Tuo_Zhao1;~Hongyuan_Zha1;kolev1@llnl.gov;anderson110@llnl.gov;~Daniel_faissol1", "aff": ";;;;Lawrence Livermore National Labs;;;Georgia Institute of Technology;The Chinese University of Hong Kong, Shenzhen;;;Lawrence Livermore National Labs", "aff_domain": ";;;;llnl.gov;;;gatech.edu;cuhk.edu.cn;;;llnl.gov", "position": ";;;;Computational Mathematician;;;Associate Professor;Full Professor;;;Staff Scientist", "bibtex": "@misc{\nyang2022reinforcement,\ntitle={Reinforcement Learning for Adaptive Mesh Refinement},\nauthor={Jiachen Yang and Tarik Dzanic and Brenden K.
Petersen and Jun Kudo and Ketan Mittal and Jean-Sylvain Camier and Vladimir Tomov and Tuo Zhao and Hongyuan Zha and Tzanio Kolev and Robert Anderson and Daniel faissol},\nyear={2022},\nurl={https://openreview.net/forum?id=MAYipnUpHHD}\n}", "github": "", "project": "", "reviewers": "LhZp;aatv;rRgy;Gif3;sdD7", "site": "https://openreview.net/forum?id=MAYipnUpHHD", "pdf_size": 0, "recommendation": "5;5;5;5;6", "confidence": "3;3;3;4;3", "correctness": "4;3;3;1;3", "technical_novelty": "2;2;4;3;3", "empirical_novelty": "2;3;0;2;3", "wc_summary_paper": "37;37;16;300;68", "wc_summary_review": "74;70;28;203;40", "wc_main_review": "239;542;248;1036;318", "wc_review": "350;649;292;1539;426", "wc_reply_reviewers": "132;810;0;453;0", "wc_reply_authors": "1301;2060;660;1532;723", "reply_reviewers": "1;3;0;1;0", "reply_authors": "3;5;1;3;1", "recommendation_avg": [ 5.2, 0.39999999999999997 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.9797958971132712 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 91.6, 105.51322192028827 ], "wc_summary_review_avg": [ 83.0, 62.48839892332016 ], "wc_main_review_avg": [ 476.6, 300.3568544248658 ], "wc_review_avg": [ 651.2, 460.1423258080048 ], "wc_reply_reviewers_avg": [ 279.0, 312.8987056540823 ], "wc_reply_authors_avg": [ 1255.2, 522.2870475131467 ], "reply_reviewers_avg": [ 1.0, 1.0954451150103321 ], "reply_authors_avg": [ 2.6, 1.4966629547095767 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 12, 0 ], "corr_recommendation_confidence": -0.25000000000000006, "corr_recommendation_correctness": 0.10206207261596575, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9213710282156170047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Lawrence Livermore National Laboratory;Georgia Institute of Technology;Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.llnl.gov;https://www.gatech.edu;https://www.cuhk.edu.cn", "aff_unique_abbr": "LLNL;Georgia Tech;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "MDT30TEtaVY", "title": "Set Norm and Equivariant Skip Connections: Putting the Deep in Deep Sets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Permutation invariant neural networks are a promising tool for predictive modeling of set data. We show, however, that existing architectures struggle to perform well when they are deep. In this work, we address this issue for the two most widely used permutation invariant networks, Deep Sets and its transformer analogue Set Transformer. We take inspiration from previous efforts to scale neural network architectures by incorporating normalization layers and skip connections that work for sets. First, we motivate and develop set norm, a normalization tailored for sets. Then, we employ equivariant residual connections and introduce the ``clean path principle'' for their placement. With these changes, our many-layer Deep Sets++ and Set Transformer++ models reach comparable or better performance than their original counterparts on a diverse suite of tasks, from point cloud classification to regression on sets of images. We additionally introduce Flow-RBC, a new single-cell dataset and real-world application of permutation invariant prediction. 
On this task, our new models outperform existing methods as well as a clinical baseline. We open-source our data and code here: link-omitted-for-anonymity.", "keywords": "deep learning;permutation invariance;normalization;residual connections", "primary_area": "", "supplementary_material": "/attachment/76949ef63aa0c01b665899a220d2b4e71b145bea.zip", "author": "Lily H Zhang;Veronica Tozzo;John M. Higgins;Rajesh Ranganath", "authorids": "~Lily_H_Zhang1;~Veronica_Tozzo2;~John_M._Higgins1;~Rajesh_Ranganath2", "gender": "F;F;;", "homepage": "https://lhz1029.github.io/;;;", "dblp": "267/6682;;;97/7057", "google_scholar": "fmCi9ZQAAAAJ;mTZf_NEAAAAJ;;", "orcid": ";;0000-0002-9182-0076;", "linkedin": ";;;", "or_profile": "~Lily_H_Zhang1;~Veronica_Tozzo2;~John_M._Higgins1;~Rajesh_Ranganath2", "aff": "New York University;Massachusetts General Hospital, Harvard University;Massachusetts General Hospital, Harvard University;New York University", "aff_domain": "nyu.edu;mgh.harvard.edu;mgh.harvard.edu;nyu.edu", "position": "PhD student;Postdoc;Associate Professor;Assistant Professor", "bibtex": "@misc{\nzhang2022set,\ntitle={Set Norm and Equivariant Skip Connections: Putting the Deep in Deep Sets},\nauthor={Lily H Zhang and Veronica Tozzo and John M. Higgins and Rajesh Ranganath},\nyear={2022},\nurl={https://openreview.net/forum?id=MDT30TEtaVY}\n}", "github": "", "project": "", "reviewers": "eZF1;Dj6B;ifHM;Qf1Y", "site": "https://openreview.net/forum?id=MDT30TEtaVY", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "2;2;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "86;65;48;121", "wc_summary_review": "31;60;48;92", "wc_main_review": "347;642;256;299", "wc_review": "464;767;352;512", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "375;1013;110;185", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.0, 27.230497608380205 ], "wc_summary_review_avg": [ 57.75, 22.29770167528483 ], "wc_main_review_avg": [ 386.0, 151.26632143342417 ], "wc_review_avg": [ 523.75, 151.9677186115525 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 420.75, 355.31561674094763 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8359318767015654610&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "New York University;Harvard University", "aff_unique_dep": ";Massachusetts General Hospital", "aff_unique_url": "https://www.nyu.edu;https://www.harvard.edu", "aff_unique_abbr": "NYU;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Meta Discovery: Learning to Discover Novel Classes given Very Limited Data", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6442", "id": "MEpKGLsY8f", "poster": "", "openreview": "https://openreview.net/forum?id=MEpKGLsY8f", "slides": "https://iclr.cc/virtual/2022/poster/6442", "video": 
"https://iclr.cc/virtual/2022/poster/6442", "author_site": "Haoang Chi, Feng Liu, Wenjing Yang, Long Lan, Tongliang Liu, Bo Han, Gang Niu, Mingyuan Zhou, Masashi Sugiyama", "tldr": "", "abstract": "In novel class discovery (NCD), we are given labeled data from seen classes and unlabeled data from unseen classes, and we train clustering models for the unseen classes. However, the implicit assumptions behind NCD are still unclear. In this paper, we demystify assumptions behind NCD and find that high-level semantic features should be shared among the seen and unseen classes. Based on this finding, NCD is theoretically solvable under certain assumptions and can be naturally linked to meta-learning that has exactly the same assumption as NCD. Thus, we can empirically solve the NCD problem by meta-learning algorithms after slight modifications. This meta-learning-based methodology significantly reduces the amount of unlabeled data needed for training and makes it more practical, as demonstrated in experiments. The use of very limited data is also justified by the application scenario of NCD: since it is unnatural to label only seen-class data, NCD is sampling instead of labeling in causality. Therefore, unseen-class data should be collected on the way of collecting seen-class data, which is why they are novel and first need to be clustered.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoang Chi;Feng Liu;Wenjing Yang;Long Lan;Tongliang Liu;Bo Han;Gang Niu;Mingyuan Zhou;Masashi Sugiyama", "authorids": "~Haoang_Chi1;~Feng_Liu2;~Wenjing_Yang1;~Long_Lan2;~Tongliang_Liu1;~Bo_Han1;~Gang_Niu1;~Mingyuan_Zhou1;~Masashi_Sugiyama1", "gender": "M;M;F;M;M;M;M;M;M", "homepage": ";https://fengliu90.github.io/index.html;https://www.researchgate.net/scientific-contributions/Wen-Jing-Yang-2056467943;https://lan-long.github.io/;https://tongliang-liu.github.io/;https://niug1984.github.io;http://mingyuanzhou.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/;https://bhanml.github.io/", "dblp": "284/9320;77/1318-3;48/3396-2;124/2136.html;150/6667;26/3367-1;;35/1228;241/0472-3", "google_scholar": ";https://scholar.google.com/citations?hl=en;;https://scholar.google.com.au/citations?user=huVW6Y8AAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;LXwCIisAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;nTNjqHwAAAAJ", "orcid": ";0000-0002-5005-9129;;;;;;0000-0001-6658-6743;", "linkedin": ";alexfengliu;;;;;;;", "or_profile": "~Haoang_Chi1;~Feng_Liu2;~Wenjing_Yang1;~Long_Lan2;~Tongliang_Liu1;~Gang_Niu1;~Mingyuan_Zhou1;~Masashi_Sugiyama1;~bo_han2", "aff": "Intelligent Game and Decision Laboratory, Beijing;University of Technology Sydney;National University of Defense Technology;National University of Defense Technology,;University of Sydney;RIKEN;The University of Texas at Austin;The University of Tokyo;Microsoft Research", "aff_domain": "nudt.edu.cn;uts.edu.au;nudt.edu.cn;nudt.edu.cn;sydney.edu.au;riken.jp;utexas.edu;u-tokyo.ac.jp;microsoft.com", "position": "PhD student;Assistant Professor;Associate Professor;Assistant Professor;Lecturer;Research Scientist (tenured);Associate Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nchi2022meta,\ntitle={Meta Discovery: Learning to Discover Novel Classes given Very Limited Data},\nauthor={Haoang Chi and Feng Liu and Wenjing Yang and Long Lan and Tongliang Liu and Bo Han and Gang Niu and Mingyuan Zhou and Masashi Sugiyama},\nbooktitle={International 
Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MEpKGLsY8f}\n}", "github": "", "project": "", "reviewers": "ueqR;iZHD;V9Yq;o2Tq", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;5;3;4", "correctness": "4;4;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;4;4;3", "wc_summary_paper": "81;88;82;151", "wc_summary_review": "23;69;27;37", "wc_main_review": "138;449;145;394", "wc_review": "242;606;254;582", "wc_reply_reviewers": "29;30;0;43", "wc_reply_authors": "264;665;328;885", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 100.5, 29.278831943914703 ], "wc_summary_review_avg": [ 39.0, 18.05547008526779 ], "wc_main_review_avg": [ 281.5, 141.36566061105503 ], "wc_review_avg": [ 421.0, 173.2599203508994 ], "wc_reply_reviewers_avg": [ 25.5, 15.724185193516387 ], "wc_reply_authors_avg": [ 535.5, 252.82849918472402 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11348139324456569930&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=MEpKGLsY8f", "email": "nudt.edu.cn;uts.edu.au;nudt.edu.cn;nudt.edu.cn;sydney.edu.au;riken.jp;utexas.edu;u-tokyo.ac.jp;microsoft.com", "author_num": 9, "aff_unique_index": "0;1;2;2;3;4;5;6;7", "aff_unique_norm": "Intelligent Game and Decision Laboratory;University of Technology Sydney;National University of Defense Technology;University of Sydney;RIKEN;University of Texas at Austin;University of Tokyo;Microsoft", "aff_unique_dep": "Intelligent Game and Decision Laboratory;;;;;;;Microsoft Research", "aff_unique_url": ";https://www.uts.edu.au;http://www.nudt.edu.cn/;https://www.sydney.edu.au;https://www.riken.jp;https://www.utexas.edu;https://www.u-tokyo.ac.jp;https://www.microsoft.com/en-us/research", "aff_unique_abbr": ";UTS;NUDT;USYD;RIKEN;UT Austin;UTokyo;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;0;1;2;3;2;3", "aff_country_unique": "China;Australia;Japan;United States" }, { "id": "MGIg_Q4QtW2", "title": "RAR: Region-Aware Point Cloud Registration", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper concerns the research problem of point cloud registration to find the rigid transformation to optimally align the source point set with the target one. Learning robust point cloud registration models with deep neural networks has emerged as a powerful paradigm, offering promising performance in predicting the global geometric transformation for a pair of point sets. Existing methods first leverage an encoder to regress a latent shape embedding, which is then decoded into a shape-conditioned transformation via concatenation-based conditioning. However, different regions of a 3D shape vary in their geometric structures, which makes it more sensible to use a region-conditioned transformation instead of a shape-conditioned one.
With this observation, in this paper we present a \\underline{R}egion-\\underline{A}ware point cloud \\underline{R}egistration method, denoted as RAR, to predict the transformation for pairwise point sets in a self-supervised learning fashion. More specifically, we develop a novel region-aware decoder (RAD) module that is formed with an implicit neural region representation parameterized by neural networks. The implicit neural region representation is learned with a self-supervised 3D shape reconstruction loss without the need for region labels. Consequently, the region-aware decoder (RAD) module guides the training of the region-aware transformation (RAT) module and region-aware weight (RAW) module, which predict the transforms and weights for different regions, respectively. The global geometric transformation from the source point set to the target one is then formed by the weighted fusion of region-aware transforms. Compared to the state-of-the-art approaches, our experiments show that our RAR achieves superior registration performance over various benchmark datasets (e.g. ModelNet40).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Hao;Yi Fang", "authorids": "~Yu_Hao1;~Yi_Fang2", "gender": "M;M", "homepage": ";http://mmvc.engineering.nyu.edu/", "dblp": "33/32703;96/361-6", "google_scholar": ";j-cyhzwAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yu_Hao1;~Yi_Fang2", "aff": "New York University;New York University", "aff_domain": "nyu.edu;nyu.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nhao2022rar,\ntitle={{RAR}: Region-Aware Point Cloud Registration},\nauthor={Yu Hao and Yi Fang},\nyear={2022},\nurl={https://openreview.net/forum?id=MGIg_Q4QtW2}\n}", "github": "", "project": "", "reviewers": "mwXA;EHWV;7prb;bd2E;tsa1", "site": "https://openreview.net/forum?id=MGIg_Q4QtW2", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "2;4;4;4;3", "correctness": "3;3;2;2;2", "technical_novelty": "2;3;2;3;2", "empirical_novelty": "2;1;3;2;2", "wc_summary_paper": "60;70;105;64;34", "wc_summary_review": "67;59;54;67;68", "wc_main_review": "406;542;279;497;225", "wc_review": "533;671;438;628;327", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 66.6, 22.799999999999997 ], "wc_summary_review_avg": [ 63.0, 5.549774770204643 ], "wc_main_review_avg": [ 389.8, 121.94654566653375 ], "wc_review_avg": [ 519.4, 125.38357149164321 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.25000000000000006, "corr_recommendation_correctness": -0.4082482904638631, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12076427954501865304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "NeuPL: Neural Population Learning", "status":
"Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6950", "id": "MIX3fJkl_1", "poster": "", "openreview": "https://openreview.net/forum?id=MIX3fJkl_1", "slides": "https://iclr.cc/virtual/2022/poster/6950", "video": "https://iclr.cc/virtual/2022/poster/6950", "author_site": "SIQI LIU, Luke Marris, Daniel Hennes, Josh Merel, Nicolas Heess, Thore Graepel", "tldr": "", "abstract": "Learning in strategy games (e.g. StarCraft, poker) requires the discovery of diverse policies. This is often achieved by iteratively training new policies against existing ones, growing a policy population that is robust to exploit. This iterative approach suffers from two issues in real-world games: a) under finite budget, approximate best-response operators at each iteration needs truncating, resulting in under-trained good-responses populating the population; b) repeated learning of basic skills at each iteration is wasteful and becomes intractable in the presence of increasingly strong opponents. In this work, we propose Neural Population Learning (NeuPL) as a solution to both issues. NeuPL offers convergence guarantees to a population of best-responses under mild assumptions. By representing a population of policies within a single conditional model, NeuPL enables transfer learning across policies. Empirically, we show the generality, improved performance and efficiency of NeuPL across several test domains. Most interestingly, we show that novel strategies become more accessible, not less, as the neural population expands.", "keywords": "Multi-Agent Learning;Game Theory;Population Learning", "primary_area": "", "supplementary_material": "", "author": "Siqi Liu;Luke Marris;Daniel Hennes;Josh Merel;Nicolas Heess;Thore Graepel", "authorids": "~Siqi_Liu1;~Luke_Marris2;~Daniel_Hennes1;~Josh_Merel1;~Nicolas_Heess1;~Thore_Graepel1", "gender": "M;;;;;", "homepage": "http://siqi.fr/;https://www.lukemarris.info/;;;;", "dblp": "60/9360-2.html;223/4422;;139/1361;76/9181;g/ThoreGraepel", "google_scholar": "7U_OA0oAAAAJ;dvTeSX4AAAAJ;;https://scholar.google.co.uk/citations?user=K4OcFXUAAAAJ;79k7bGEAAAAJ;", "orcid": "0000-0001-6381-4552;;;;;", "linkedin": ";;;;;", "or_profile": "~Siqi_Liu1;~Luke_Marris2;~Daniel_Hennes1;~Josh_Merel1;~Nicolas_Heess1;~Thore_Graepel1", "aff": "Google;University College London;;Meta Reality Labs;Google DeepMind;", "aff_domain": "google.com;ucl.ac.uk;;fb.com;google.com;", "position": "Research Engineer;PhD student;;Research Scientist;Research Scientist;", "bibtex": "@inproceedings{\nliu2022neupl,\ntitle={Neu{PL}: Neural Population Learning},\nauthor={Siqi Liu and Luke Marris and Daniel Hennes and Josh Merel and Nicolas Heess and Thore Graepel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MIX3fJkl_1}\n}", "github": "", "project": "", "reviewers": "TfQw;kTPT;xb7h;bxQq", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "116;93;56;35", "wc_summary_review": "89;63;32;47", "wc_main_review": "868;397;159;217", "wc_review": "1073;553;247;299", "wc_reply_reviewers": "372;0;9;14", "wc_reply_authors": "1699;682;632;721", "reply_reviewers": "2;0;1;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], 
"wc_summary_paper_avg": [ 75.0, 31.488092987667578 ], "wc_summary_review_avg": [ 57.75, 21.111312133545844 ], "wc_main_review_avg": [ 410.25, 278.4702632239213 ], "wc_review_avg": [ 543.0, 327.1666242146347 ], "wc_reply_reviewers_avg": [ 98.75, 157.8407029254495 ], "wc_reply_authors_avg": [ 933.5, 443.086052590239 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18317810267099730826&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=MIX3fJkl_1", "email": "google.com;ucl.ac.uk;;fb.com;google.com;", "author_num": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Google;University College London;Meta", "aff_unique_dep": "Google;;Meta Reality Labs", "aff_unique_url": "https://www.google.com;https://www.ucl.ac.uk;https://www.meta.com", "aff_unique_abbr": "Google;UCL;MRL", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "$\\pi$BO: Augmenting Acquisition Functions with User Beliefs for Bayesian Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6843", "id": "MMAeCXIa89", "poster": "", "openreview": "https://openreview.net/forum?id=MMAeCXIa89", "slides": "https://iclr.cc/virtual/2022/poster/6843", "video": "https://iclr.cc/virtual/2022/poster/6843", "author_site": "Carl Hvarfner, Danny Stoll, Artur Souza, Marius Lindauer, Frank Hutter, Luigi Nardi", "tldr": "", "abstract": "Bayesian optimization (BO) has become an established framework and popular tool for hyperparameter optimization (HPO) of machine learning (ML) algorithms. While known for its sample-efficiency, vanilla BO can not utilize readily available prior beliefs the practitioner has on the potential location of the optimum. Thus, BO disregards a valuable source of information, reducing its appeal to ML practitioners. To address this issue, we propose $\\pi$BO, an acquisition function generalization which incorporates prior beliefs about the location of the optimum in the form of a probability distribution, provided by the user. In contrast to previous approaches, $\\pi$BO is conceptually simple and can easily be integrated with existing libraries and many acquisition functions. We provide regret bounds when $\\pi$BO is applied to the common Expected Improvement acquisition function and prove convergence at regular rates independently of the prior. Further, our experiments show that $\\pi$BO outperforms competing approaches across a wide suite of benchmarks and prior characteristics. 
We also demonstrate that $\\pi$BO improves on the state-of-the-art performance for a popular deep learning task, with a $12.5\\times$ time-to-accuracy speedup over prominent BO approaches.", "keywords": "Bayesian Optimization;Hyperparameter Optimization;Meta-Learning", "primary_area": "", "supplementary_material": "/attachment/2203a3a289d64bdf42903c2d94f031b27e7d5824.zip", "author": "Carl Hvarfner;Danny Stoll;Artur Souza;Marius Lindauer;Frank Hutter;Luigi Nardi", "authorids": "~Carl_Hvarfner1;~Danny_Stoll1;~Artur_Souza1;~Marius_Lindauer1;~Frank_Hutter1;~Luigi_Nardi1", "gender": "M;M;M;M;M;M", "homepage": "https://portal.research.lu.se/portal/sv/persons/carl-hvarfner(cd140b82-9fed-4e88-868e-1cf569dcbeb7).html;https://ml.informatik.uni-freiburg.de/profile/stoll/;http://buscatextual.cnpq.br/buscatextual/visualizacv.do;jsessionid=D9822F24D364299D11D8868EF46ADB21.buscatextual_0;https://www.ai.uni-hannover.de/de/institut/team/lindauer;http://ml.informatik.uni-freiburg.de/~hutter/;", "dblp": "319/3033;232/3297;;28/9142;89/5383;60/7206", "google_scholar": "https://scholar.google.se/citations?hl=en;;;https://scholar.google.de/citations?user=0Sxx7DUAAAAJ;https://scholar.google.de/citations?user=YUrxwrkAAAAJ;https://scholar.google.it/citations?user=Kgs3zQoAAAAJ", "orcid": ";;;;0000-0002-2037-3694;0000-0002-4601-2264", "linkedin": "carl-hvarfner-a97421153/;Danny-Stoll-AI/;;;frank-hutter-9190b24b/;nardiluigi/", "or_profile": "~Carl_Hvarfner1;~Danny_Stoll1;~Artur_Souza1;~Marius_Lindauer1;~Frank_Hutter1;~Luigi_Nardi1", "aff": "Lund University;University of Freiburg;Universidade Federal de Minas Gerais;Leibniz Universit\u00e4t Hannover;Albert-Ludwigs-Universit\u00e4t Freiburg;Stanford University", "aff_domain": "lu.se;uni-freiburg.de;ufmg.br;uni-hannover.de;uni-freiburg.de;stanford.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nhvarfner2022pibo,\ntitle={\\${\\textbackslash}pi\\${BO}: Augmenting Acquisition Functions with User Beliefs for Bayesian Optimization},\nauthor={Carl Hvarfner and Danny Stoll and Artur Souza and Luigi Nardi and Marius Lindauer and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MMAeCXIa89}\n}", "github": "", "project": "", "reviewers": "VqSx;vMEc;eisp;gZca", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;4;5", "correctness": "2;4;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "39;75;139;103", "wc_summary_review": "52;27;51;133", "wc_main_review": "294;154;235;242", "wc_review": "385;256;425;478", "wc_reply_reviewers": "86;93;40;103", "wc_reply_authors": "1042;849;817;747", "reply_reviewers": "1;2;1;1", "reply_authors": "4;2;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.0, 36.71511950137164 ], "wc_summary_review_avg": [ 65.75, 40.095978601351035 ], "wc_main_review_avg": [ 231.25, 50.08679965819338 ], "wc_review_avg": [ 386.0, 81.98475468036725 ], "wc_reply_reviewers_avg": [ 80.5, 24.150569351466643 ], "wc_reply_authors_avg": [ 863.75, 109.32377371825397 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], 
"corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12404450882639557328&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=MMAeCXIa89", "email": "lu.se;uni-freiburg.de;ufmg.br;uni-hannover.de;uni-freiburg.de;stanford.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "Lund University;University of Freiburg;Universidade Federal de Minas Gerais;Leibniz Universit\u00e4t Hannover;Albert-Ludwigs-Universit\u00e4t Freiburg;Stanford University", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.lunduniversity.lu.se;https://www.uni-freiburg.de;https://ufmg.br;https://www.leibniz.uni-hannover.de/;https://www.uni-freiburg.de;https://www.stanford.edu", "aff_unique_abbr": "LU;UoF;UFMG;LUH;Albert-Ludwigs-Universit\u00e4t;Stanford", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Freiburg;Stanford", "aff_country_unique_index": "0;1;2;1;1;3", "aff_country_unique": "Sweden;Germany;Brazil;United States" }, { "id": "MOm8xik_TmO", "title": "Isotropic Contextual Representations through Variational Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contextual language representations achieve state-of-the-art performance across various natural language processing tasks. However, these representations have been shown to suffer from the degeneration problem, i.e. they occupy a narrow cone in the latent space. This problem can be addressed by enforcing isotropy in the latent space. In analogy to variational autoencoders, we suggest applying a token-level variational loss to a Transformer architecture and introduce the prior distribution's standard deviation as model parameter to optimize isotropy. The encoder-decoder architecture allows for learning interpretable embeddings that can be decoded into text again. 
Features extracted at the sentence level achieve competitive results on benchmark classification tasks.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/06d29e569cade278019930cfef1c5ac90fab9292.zip", "author": "Cornelia Ferner;Stefan Wegenkittl", "authorids": "~Cornelia_Ferner1;~Stefan_Wegenkittl1", "gender": "F;M", "homepage": ";https://its.fh-salzburg.ac.at/ueber-uns/lehrende/detail/wegenkittl/", "dblp": ";87/4665.html", "google_scholar": ";", "orcid": "0000-0003-1721-0453;0000-0002-3297-7997", "linkedin": ";https://at.linkedin.com/in/stefan-wegenkittl-4b474280", "or_profile": "~Cornelia_Ferner1;~Stefan_Wegenkittl1", "aff": "University of Salzburg;Salzburg University of Applied Sciences", "aff_domain": "sbg.ac.at;fh-salzburg.ac.at", "position": "PhD student;Lecturer", "bibtex": "@misc{\nferner2022isotropic,\ntitle={Isotropic Contextual Representations through Variational Regularization},\nauthor={Cornelia Ferner and Stefan Wegenkittl},\nyear={2022},\nurl={https://openreview.net/forum?id=MOm8xik_TmO}\n}", "github": "", "project": "", "reviewers": "Fv5U;pcez;ZoL1;j72L", "site": "https://openreview.net/forum?id=MOm8xik_TmO", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;3", "correctness": "2;2;2;3", "technical_novelty": "3;2;2;4", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "73;120;63;62", "wc_summary_review": "26;44;30;19", "wc_main_review": "503;441;324;321", "wc_review": "602;605;417;402", "wc_reply_reviewers": "137;146;184;8", "wc_reply_authors": "255;240;466;107", "reply_reviewers": "1;1;2;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 79.5, 23.77498685593748 ], "wc_summary_review_avg": [ 29.75, 9.12071817347735 ], "wc_main_review_avg": [ 397.25, 77.90499021243761 ], "wc_review_avg": [ 506.5, 97.15065619953373 ], "wc_reply_reviewers_avg": [ 118.75, 66.33014020790247 ], "wc_reply_authors_avg": [ 267.0, 128.5243167653499 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8015788427793914609&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Salzburg;Salzburg University of Applied Sciences", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-salzburg.at;https://www.fh-salzburg.ac.at", "aff_unique_abbr": "USAL;FH Salzburg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Austria" }, { "title": "Provably convergent quasistatic dynamics for mean-field two-player zero-sum games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6723", "id": "MP904TiHqJ-", "poster": "", "openreview": "https://openreview.net/forum?id=MP904TiHqJ-", "slides": "https://iclr.cc/virtual/2022/poster/6723", "video": "https://iclr.cc/virtual/2022/poster/6723", "author_site": "Chao Ma, Lexing Ying", "tldr": "", "abstract": "In this paper, we study the problem of finding a mixed Nash equilibrium for mean-field two-player zero-sum games.
Solving this problem requires optimizing over two probability distributions. We consider a quasistatic Wasserstein gradient flow dynamics in which one probability distribution follows the Wasserstein gradient flow, while the other one is always at the equilibrium. Theoretical analysis is conducted on this dynamics, showing its convergence to the mixed Nash equilibrium under mild conditions. Inspired by the continuous dynamics of probability distributions, we derive a quasistatic Langevin gradient descent method with inner-outer iterations, and test the method on different problems, including training mixtures of GANs. ", "keywords": "quasistatic;minimax optimization;mixed Nash equilibrium;mean-field formulation", "primary_area": "", "supplementary_material": "", "author": "Chao Ma;Lexing Ying", "authorids": "~Chao_Ma8;~Lexing_Ying1", "gender": "M;", "homepage": ";http://web.stanford.edu/~lexing", "dblp": ";68/3945", "google_scholar": "n2BTRgUAAAAJ;OwA3zyMAAAAJ", "orcid": ";", "linkedin": "chao-ma-9b593a129/;", "or_profile": "~Chao_Ma8;~Lexing_Ying1", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "Postdoc;Professor", "bibtex": "@inproceedings{\nma2022provably,\ntitle={Provably convergent quasistatic dynamics for mean-field two-player zero-sum games},\nauthor={Chao Ma and Lexing Ying},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MP904TiHqJ-}\n}", "github": "", "project": "", "reviewers": "TF7Q;6QW7;izUn;4gr6", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;5;4;3", "correctness": "4;4;4;2", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "44;102;51;90", "wc_summary_review": "4;38;36;57", "wc_main_review": "25;146;297;256", "wc_review": "73;286;384;403", "wc_reply_reviewers": "0;116;0;0", "wc_reply_authors": "92;640;477;778", "reply_reviewers": "0;2;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 71.75, 24.742423082632794 ], "wc_summary_review_avg": [ 33.75, 19.031224343168255 ], "wc_main_review_avg": [ 181.0, 105.64326765109077 ], "wc_review_avg": [ 286.5, 131.0162203698458 ], "wc_reply_reviewers_avg": [ 29.0, 50.22947341949744 ], "wc_reply_authors_avg": [ 496.75, 256.8242346430726 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3343894024584098754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=MP904TiHqJ-", "email": "stanford.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "MPoQtFC588n", "title": "RMNet: Equivalently Removing Residual Connection from Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although the residual connection enables training very deep neural
networks, it is not friendly for online inference due to its multi-branch topology. This encourages many researchers to work on designing DNNs without residual connections at inference. For example, RepVGG re-parameterizes multi-branch topology to a VGG-like (single-branch) model when deploying, showing great performance when the network is relatively shallow. However, RepVGG cannot transform ResNet to VGG equivalently, because re-parameterizing methods can only be applied to linear blocks, and the non-linear layers (ReLU) have to be put outside of the residual connection, which results in limited representation ability, especially for deeper networks. In this paper, we aim to remedy this problem and propose to remove the residual connection in a vanilla ResNet equivalently by a reserving and merging (RM) operation on ResBlock. Specifically, the RM operation allows input feature maps to pass through the block while reserving their information and merges all the information at the end of each block, which can remove the residual connection without changing the original output. RMNet has two main advantages: 1) it achieves a better accuracy-speed trade-off compared with ResNet and RepVGG; 2) its implementation makes it naturally friendly for high-ratio network pruning. Extensive experiments are performed to verify the effectiveness of RMNet. We believe the idea behind RMNet can inspire many insights on model design for the community in the future.", "keywords": "Efficient Network;Residual Connection", "primary_area": "", "supplementary_material": "/attachment/0cc91919ce53a9be45e36f0a9693c0978d317a5d.zip", "author": "Fanxu Meng;Hao Cheng;Jia-Xin Zhuang;Ke Li;Xing Sun", "authorids": "~Fanxu_Meng1;~Hao_Cheng5;~Jia-Xin_Zhuang1;~Ke_Li4;~Xing_Sun1", "gender": "M;M;;M;M", "homepage": "https://fxmeng.github.io/;https://haochenglouis.github.io;;http://keli.info;https://www.sunxing.org", "dblp": ";;;;", "google_scholar": "xvfuhRUAAAAJ;ftlVqVIAAAAJ;;mfWsFM0AAAAJ;IUtix9IAAAAJ", "orcid": ";0000-0001-8864-7818;;0000-0001-7998-0731;0000-0001-8132-9083", "linkedin": ";;;;sunxings/", "or_profile": "~Fanxu_Meng1;~Hao_Cheng5;~Jia-Xin_Zhuang1;~Ke_Li4;~Xing_Sun1", "aff": "Tencent Youtu Lab;Tencent Youtu Lab;;Tencent;Tencent YouTu Lab", "aff_domain": "tencent.com;tencent.com;;tencent.com;tencent.com", "position": "Researcher;Researcher;;Principal Researcher;Principal Researcher", "bibtex": "@misc{\nmeng2022rmnet,\ntitle={{RMN}et: Equivalently Removing Residual Connection from Networks},\nauthor={Fanxu Meng and Hao Cheng and Jia-Xin Zhuang and Ke Li and Xing Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=MPoQtFC588n}\n}", "github": "", "project": "", "reviewers": "BQPh;ybcv;Dt6G;oEx8", "site": "https://openreview.net/forum?id=MPoQtFC588n", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;5;2;5", "correctness": "2;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "67;78;48;116", "wc_summary_review": "107;22;31;42", "wc_main_review": "189;453;86;287", "wc_review": "363;553;165;445", "wc_reply_reviewers": "420;0;0;78", "wc_reply_authors": "2214;558;388;83", "reply_reviewers": "1;0;0;1", "reply_authors": "5;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.25, 1.299038105676658 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 77.25, 24.81305100143874 ], "wc_summary_review_avg": [ 50.5, 33.380383460949034 ],
"wc_main_review_avg": [ 253.75, 135.22088411188562 ], "wc_review_avg": [ 381.5, 142.00264082051433 ], "wc_reply_reviewers_avg": [ 124.5, 173.55330593221208 ], "wc_reply_authors_avg": [ 810.75, 827.8482273339721 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.08084520834544433, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13690696229943227740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tencent", "aff_unique_dep": "Youtu Lab", "aff_unique_url": "https://www.tencent.com", "aff_unique_abbr": "Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "MQ12ln81Jje", "title": "RankedDrop: Enhancing Deep Graph Convolutional Networks Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph Neural Networks (GNNs) are playing a more and more important role for analyzing unstructured data from the complex real world. Introducing random edge dropping from the input graph at training epochs could reduce over-fitting and over-smoothing phenomenon and increase the depth of GNNs. However, such method relies strongly on the chosen randomness. It makes the accuracy depend on the initialization of the randomness, which let the selection of hyper-parameters be even more difficult. We propose in this paper RankedDrop a novel method with a spatial-aware dropping-edge selection. The selection takes account of graph global information using PageRank, and graph local neighborhood information with node degree. RankedDrop provides a more stable training results comparing to the state-of-the-art solution, by maintaining the advantages of random edge dropping. 
Furthermore, RankedDrop is a general method that can be deployed on a deep learning framework for enhancing the performance of GNNs.", "keywords": "graph neural network;graph convolutional network;PageRank selection;spatial-aware selection;deep learning", "primary_area": "", "supplementary_material": "", "author": "Quentin Petit;Chong Li;Kelun Chai;Serge G Petiton", "authorids": "~Quentin_Petit1;ch.l@huawei.com;chaikelun@gmail.com;serge.petiton@mailfence.com", "gender": "M;;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": "quentinpetit/;;;", "or_profile": "~Quentin_Petit1;ch.l@huawei.com;chaikelun@gmail.com;serge.petiton@mailfence.com", "aff": "LI-PARAD - UVSQ;;;", "aff_domain": "uvsq.fr;;;", "position": "PhD student;;;", "bibtex": "@misc{\npetit2022rankeddrop,\ntitle={RankedDrop: Enhancing Deep Graph Convolutional Networks Training},\nauthor={Quentin Petit and Chong Li and Kelun Chai and Serge G Petiton},\nyear={2022},\nurl={https://openreview.net/forum?id=MQ12ln81Jje}\n}", "github": "", "project": "", "reviewers": "6ToL;XjxK;86vL;eYKC", "site": "https://openreview.net/forum?id=MQ12ln81Jje", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;5;5;4", "correctness": "3;4;3;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "20;29;43;102", "wc_summary_review": "20;18;14;58", "wc_main_review": "158;74;157;745", "wc_review": "198;121;214;905", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 48.5, 31.957002362549588 ], "wc_summary_review_avg": [ 27.5, 17.741194999210173 ], "wc_main_review_avg": [ 283.5, 268.61915419418625 ], "wc_review_avg": [ 359.5, 316.9010097806569 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hUfYh1EFwE8J:scholar.google.com/&scioq=RankedDrop:+Enhancing+Deep+Graph+Convolutional+Networks+Training&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Universit\u00e9 de Versailles Saint-Quentin-en-Yvelines", "aff_unique_dep": "LI-PARAD", "aff_unique_url": "https://www.uvsq.fr", "aff_unique_abbr": "UVSQ", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "title": "R4D: Utilizing Reference Objects for Long-Range Distance Estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6625", "id": "MQ2sAGunyBP", "poster": "", "openreview": "https://openreview.net/forum?id=MQ2sAGunyBP", "slides": "https://iclr.cc/virtual/2022/poster/6625", "video": "https://iclr.cc/virtual/2022/poster/6625", "author_site": "Yingwei Li, Tiffany Chen, Maya Kabkab, Ruichi Yu, Longlong Jing, Yurong You, Hang Zhao", "tldr": "", "abstract": "Estimating the distance of objects is a safety-critical task for autonomous driving. By focusing on short-range objects, existing methods and datasets neglect the equally important long-range objects.
In this paper, we introduce a challenging and under-explored task, which we refer to as Long-Range Distance Estimation, as well as two datasets to validate new methods developed for this task. We then propose R4D, the first framework to accurately estimate the distance of long-range objects by using references with known distances in the scene. Drawing inspiration from human perception, R4D builds a graph by connecting a target object to all references. An edge in the graph encodes the relative distance information between a pair of target and reference objects. An attention module is then used to weigh the importance of reference objects and combine them into one target object distance prediction. Experiments on the two proposed datasets demonstrate the effectiveness and robustness of R4D by showing significant improvements compared to existing baselines. We\u2019re looking to make the proposed dataset, Waymo OpenDataset - Long-Range Labels, available publicly at waymo.com/open/download.", "keywords": "Self-driving;distance estimation;long-range objects", "primary_area": "", "supplementary_material": "/attachment/0d9bd59a6464cc2bb95916bf42279df004dea28a.zip", "author": "Yingwei Li;Tiffany Chen;Maya Kabkab;Ruichi Yu;Longlong Jing;Yurong You;Hang Zhao", "authorids": "~Yingwei_Li4;yuhanc@waymo.com;~Maya_Kabkab1;~Ruichi_Yu2;~Longlong_Jing1;~Yurong_You1;~Hang_Zhao1", "gender": "M;;F;M;M;M;M", "homepage": "http://yingwei.li/;;;http://www.cs.umd.edu/~richyu/;https://longlong-jing.github.io/;http://yurongyou.com;http://www.mit.edu/~hangzhao/", "dblp": ";;;https://dblp.org/pers/hd/y/Yu:Ruichi;214/9050;199/1968;", "google_scholar": "phWmJeIAAAAJ;;;EfP9RrMAAAAJ;lhdhi5wAAAAJ;rdwkreIAAAAJ;DmahiOYAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;yurong-you/;", "or_profile": "~Yingwei_Li4;yuhanc@waymo.com;~Maya_Kabkab1;~Ruichi_Yu2;~Longlong_Jing1;~Yurong_You1;~Hang_Zhao1", "aff": "Johns Hopkins University;;University of Maryland, College Park;University of Maryland;Waymo LLC;Cornell University;Tsinghua University", "aff_domain": "jhu.edu;;umd.edu;;waymo.com;cornell.edu;tsinghua.edu.cn", "position": "PhD student;;PhD student;PhD;Researcher;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nli2022rd,\ntitle={R4D: Utilizing Reference Objects for Long-Range Distance Estimation},\nauthor={Yingwei Li and Tiffany Chen and Maya Kabkab and Ruichi Yu and Longlong Jing and Yurong You and Hang Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MQ2sAGunyBP}\n}", "github": "", "project": "", "reviewers": "JYTu;4ncP;gPz7;EZAP", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;5;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "32;83;58;64", "wc_summary_review": "31;72;55;45", "wc_main_review": "153;382;264;222", "wc_review": "216;537;377;331", "wc_reply_reviewers": "156;90;0;0", "wc_reply_authors": "1387;1104;689;550", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 59.25, 18.239723133863627 ], "wc_summary_review_avg": [ 50.75, 14.939461168328663 ], "wc_main_review_avg": [ 255.25, 83.2207155701031 ], "wc_review_avg": [ 365.25, 115.2006401891934 ], "wc_reply_reviewers_avg": [ 61.5,
65.77803584784209 ], "wc_reply_authors_avg": [ 932.5, 332.25780652980905 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1603858470338210445&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=MQ2sAGunyBP", "email": "jhu.edu;;umd.edu;;waymo.com;cornell.edu;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;1;1;2;3;4", "aff_unique_norm": "Johns Hopkins University;University of Maryland;Waymo;Cornell University;Tsinghua University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.jhu.edu;https://www.umd.edu;https://www.waymo.com;https://www.cornell.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "JHU;UMD;Waymo;Cornell;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United States;China" }, { "id": "MQRDLiWCSh", "title": "Towards Scaling Robustness Verification of Semantic Features via Proof Velocity", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Robustness analysis is important for understanding the reliability of neural networks. Despite the significant progress in the verification techniques for both $L_p$- and semantic-feature neighborhoods, existing approaches struggle to scale to deep networks and large datasets. For example, we are unaware of any analyzer that scales to AlexNet trained for ImageNet (consisting of 224x224x3 images).\nIn this work, we take a step towards scaling robustness analysis. We focus on robustness to perturbations of semantic features and introduce the concept of proof guided by velocity to scale the analysis. The key idea is to phrase the verification task as a dynamic system and adaptively identify how to split it into subproblems, each with maximal proof velocity. We propose a policy to determine the next subproblem based on the past and by leveraging input splitting, input refinement, and bound tightening.
We evaluate our approach on CIFAR-10 and ImageNet and show that it can analyze neighborhoods of various features: hue, saturation, lightness, brightness, and PCA.", "keywords": "neural network robustness;local robustness;semantic features;verification of neural networks", "primary_area": "", "supplementary_material": "", "author": "Anan Kabaha;Dana Drachsler Cohen", "authorids": "~Anan_Kabaha3;~Dana_Drachsler_Cohen1", "gender": ";F", "homepage": "https://ddana.net.technion.ac.il;https://ddana.net.technion.ac.il/", "dblp": ";155/1628", "google_scholar": ";https://scholar.google.ch/citations?user=XOiO5xgAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Anan_Kabaha3;~Dana_Drachsler_Cohen1", "aff": "Technion, Technion;Technion, Technion", "aff_domain": "technion.ac.il;technion.ac.il", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nkabaha2022towards,\ntitle={Towards Scaling Robustness Verification of Semantic Features via Proof Velocity},\nauthor={Anan Kabaha and Dana Drachsler Cohen},\nyear={2022},\nurl={https://openreview.net/forum?id=MQRDLiWCSh}\n}", "github": "", "project": "", "reviewers": "Gssv;V2CT;WVyT;MmF2", "site": "https://openreview.net/forum?id=MQRDLiWCSh", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;5;4;4", "correctness": "3;2;2;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "3;1;2;2", "wc_summary_paper": "116;79;58;58", "wc_summary_review": "93;59;43;60", "wc_main_review": "788;691;370;649", "wc_review": "997;829;471;767", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 77.75, 23.689396362085716 ], "wc_summary_review_avg": [ 63.75, 18.18481509391833 ], "wc_main_review_avg": [ 624.5, 155.34236382905985 ], "wc_review_avg": [ 766.0, 189.971050426111 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:z_d70Q_y2tQJ:scholar.google.com/&scioq=Towards+Scaling+Robustness+Verification+of+Semantic+Features+via+Proof+Velocity&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "MQuxKr2F1Xw", "title": "Multi-Trigger-Key: Towards Multi-Task Privacy-Preserving In Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning-based Multi-Task Classification (MTC) is widely used in applications like facial attribute and healthcare that warrant strong privacy guarantees. In this work, we aim to protect sensitive information in the inference phase of MTC and propose a novel Multi-Trigger-Key (MTK) framework to achieve the privacy-preserving objective. MTK associates each secured task in the multi-task dataset with a specifically designed trigger-key. 
The true information can be revealed by adding the trigger-key if the user is authorized. We obtain such an MTK model by training it with a newly generated training set. To address the information leakage malaise resulting from correlations among different tasks, we generalize the training process by incorporating an MTK decoupling process with a controllable trade-off between the protective efficacy and the model performance. Theoretical guarantees and experimental results demonstrate the effectiveness of the privacy protection without appreciably hindering the model performance.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c128d2a07d7cdfe44135fe2174462353194a8685.zip", "author": "Ren Wang;Zhe Xu;Alfred Hero", "authorids": "~Ren_Wang1;~Zhe_Xu7;~Alfred_Hero1", "gender": "M;;M", "homepage": "https://wangren09.github.io/;https://sites.google.com/site/zhexudavid00710;http://web.eecs.umich.edu/~hero/", "dblp": "29/50-8;;h/AlfredOHeroIII", "google_scholar": "TY_3K48AAAAJ;j8ilzcsAAAAJ;DSiNzkIAAAAJ", "orcid": "0000-0002-6366-8898;;0000-0002-2531-9670", "linkedin": "ren-wang-715525106/;;", "or_profile": "~Ren_Wang1;~Zhe_Xu7;~Alfred_Hero1", "aff": "University of Michigan;Arizona State University;University of Michigan", "aff_domain": "umich.edu;asu.edu;umich.edu", "position": "Postdoc;Assistant Professor;Researcher", "bibtex": "@misc{\nwang2022multitriggerkey,\ntitle={Multi-Trigger-Key: Towards Multi-Task Privacy-Preserving In Deep Learning},\nauthor={Ren Wang and Zhe Xu and Alfred Hero},\nyear={2022},\nurl={https://openreview.net/forum?id=MQuxKr2F1Xw}\n}", "github": "", "project": "", "reviewers": "NAzP;HvZa;trbx;CQsf", "site": "https://openreview.net/forum?id=MQuxKr2F1Xw", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "2;3;3;4", "correctness": "3;3;3;1", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;1", "wc_summary_paper": "337;241;80;14", "wc_summary_review": "5;55;84;14", "wc_main_review": "95;219;121;168", "wc_review": "437;515;285;196", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 168.0, 127.81823031164217 ], "wc_summary_review_avg": [ 39.5, 31.862987932709636 ], "wc_main_review_avg": [ 150.75, 47.2989164780759 ], "wc_review_avg": [ 358.25, 124.9627444480954 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uOozwGNwPz4J:scholar.google.com/&scioq=Multi-Trigger-Key:+Towards+Multi-Task+Privacy-Preserving+In+Deep+Learning&hl=en&as_sdt=0,33", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Michigan;Arizona State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.umich.edu;https://www.asu.edu", "aff_unique_abbr": "UM;ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarial Retriever-Ranker for Dense Text Retrieval", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6182", "id": "MR7XubKUFB", "poster": "", "openreview": "https://openreview.net/forum?id=MR7XubKUFB", "slides": "https://iclr.cc/virtual/2022/poster/6182", "video": "https://iclr.cc/virtual/2022/poster/6182", "author_site": "Hang Zhang, Yeyun Gong, Yelong Shen, Jiancheng Lv, Nan Duan, Weizhu Chen", "tldr": "", "abstract": "Current dense text retrieval models face two typical challenges. First, it adopts a siamese dual-encoder architecture to encode query and document independently for fast indexing and searching, whereas neglecting the finer-grained term-wise interactions. This results in a sub-optimal recall performance. Second, it highly relies on a negative sampling technique to build up the negative documents in its contrastive loss. To address these challenges, we present Adversarial Retriever-Ranker (AR2), which consists of a dual-encoder retriever plus a cross-encoder ranker. The two models are jointly optimized according to a minimax adversarial objective: the retriever learns to retrieve negative documents to cheat the ranker, while the ranker learns to rank a collection of candidates including both the ground-truth and the retrieved ones, as well as providing progressive direct feedback to the dual-encoder retriever. Through this adversarial game, the retriever gradually produces harder negative documents to train a better ranker, whereas the cross-encoder ranker provides progressive feedback to improve retriever. We evaluate AR2 on three benchmarks. Experimental results show that AR2 consistently and significantly outperforms existing dense retriever methods and achieves new state-of-the-art results on all of them. This includes the improvements on Natural Questions R@5 to 77.9%(+2.1%), TriviaQA R@5 to 78.2%(+1.4), and MS-MARCO MRR@10 to 39.5%(+1.3%). We will make our code, models, and data publicly available. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hang Zhang;Yeyun Gong;Yelong Shen;Jiancheng Lv;Nan Duan;Weizhu Chen", "authorids": "~Hang_Zhang6;~Yeyun_Gong2;~Yelong_Shen2;~Jiancheng_Lv2;~Nan_Duan1;~Weizhu_Chen1", "gender": "M;M;;M;M;M", "homepage": ";;;https://cs.scu.edu.cn/info/1303/13767.htm;https://nanduan.github.io/;https://www.microsoft.com/en-us/research/people/wzchen/", "dblp": "49/6156-29;06/10400.html;37/9376;;;79/2536", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;piUkwMYAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;Qaa6OxIAAAAJ;LG_E-4EAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Hang_Zhang6;~Yeyun_Gong2;~Yelong_Shen2;~Jiancheng_Lv2;~Nan_Duan1;~Weizhu_Chen1", "aff": "Sichuan University;Microsoft;;Sichuan University;Microsoft Research Asia;Microsoft GenAI", "aff_domain": "scu.edu.cn;microsoft.com;;scu.edu.cn;microsoft.com;microsoft.com", "position": "PhD student;Researcher;;Full Professor;Principal Researcher;Vice President", "bibtex": "@inproceedings{\nzhang2022adversarial,\ntitle={Adversarial Retriever-Ranker for Dense Text Retrieval},\nauthor={Hang Zhang and Yeyun Gong and Yelong Shen and Jiancheng Lv and Nan Duan and Weizhu Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MR7XubKUFB}\n}", "github": "", "project": "", "reviewers": "hfGT;dE9M;idsf;ySzP", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;5", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "69;29;163;139", "wc_summary_review": "159;49;144;45", "wc_main_review": "460;320;492;354", "wc_review": "688;398;799;538", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "457;207;959;144", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 100.0, 53.600373133029585 ], "wc_summary_review_avg": [ 99.25, 52.53748661670065 ], "wc_main_review_avg": [ 406.5, 71.43353554178877 ], "wc_review_avg": [ 605.75, 151.54269200459652 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 441.75, 320.7579889885831 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9069461514425266804&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=MR7XubKUFB", "email": "scu.edu.cn;microsoft.com;;scu.edu.cn;microsoft.com;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;0;1;1", "aff_unique_norm": "Sichuan University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.scu.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "SCU;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "China;United States" }, { "id": "MRGFutr0p5e", "title": "Graph Barlow Twins: A self-supervised representation learning framework for graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "The 
self-supervised learning (SSL) paradigm is an essential exploration area, which tries to eliminate the need for expensive data labeling. Despite the great success of SSL methods in computer vision and natural language processing, most of them employ contrastive learning objectives that require negative samples, which are hard to define. This becomes even more challenging in the case of graphs and is a bottleneck for achieving robust representations. To overcome such limitations, we propose a framework for self-supervised graph representation learning - Graph Barlow Twins, which utilizes a cross-correlation-based loss function instead of negative samples. Moreover, it does not rely on non-symmetric neural network architectures - in contrast to the state-of-the-art self-supervised graph representation learning method BGRL. We show that our method achieves results as competitive as the best self-supervised methods and fully supervised ones, while requiring fewer hyperparameters and substantially shorter computation time (ca. 30 times faster than BGRL).", "keywords": "graph representation learning;self-supervised learning", "primary_area": "", "supplementary_material": "/attachment/668cec93bd7f7a19c8a2d478ef2cf5628642a701.zip", "author": "Piotr Bielak;Tomasz Jan Kajdanowicz;Nitesh Chawla", "authorids": "~Piotr_Bielak1;~Tomasz_Jan_Kajdanowicz1;~Nitesh_Chawla1", "gender": ";M;M", "homepage": "https://piotrbielak.com/;http://www.kajdanowicz.com;http://niteshchawla.nd.edu", "dblp": ";74/608;c/NiteshVChawla.html", "google_scholar": "https://scholar.google.ca/citations?user=Z0lkjn0AAAAJ;GOoaHHEAAAAJ;hDLBEhkAAAAJ", "orcid": ";0000-0002-8417-1012;", "linkedin": ";kajdanowicz;", "or_profile": "~Piotr_Bielak1;~Tomasz_Jan_Kajdanowicz1;~Nitesh_Chawla1", "aff": "Wroclaw University of Science and Technology;Wroclaw University of Science and Technology;University of Notre Dame", "aff_domain": "pwr.edu.pl;pwr.edu.pl;nd.edu", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nbielak2022graph,\ntitle={Graph Barlow Twins: A self-supervised representation learning framework for graphs},\nauthor={Piotr Bielak and Tomasz Jan Kajdanowicz and Nitesh Chawla},\nyear={2022},\nurl={https://openreview.net/forum?id=MRGFutr0p5e}\n}", "github": "", "project": "", "reviewers": "Bwpq;bHed;H4MK", "site": "https://openreview.net/forum?id=MRGFutr0p5e", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;5;5", "correctness": "4;4;3", "technical_novelty": "1;2;3", "empirical_novelty": "1;1;3", "wc_summary_paper": "48;106;49", "wc_summary_review": "68;33;91", "wc_main_review": "78;229;175", "wc_review": "194;368;315", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "974;1815;115", "reply_reviewers": "0;0;0", "reply_authors": "2;3;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "wc_summary_paper_avg": [ 67.66666666666667, 27.108834148463284 ], "wc_summary_review_avg": [ 64.0, 23.84673283002656 ], "wc_main_review_avg": [ 160.66666666666666, 62.473105324522564 ], "wc_review_avg": [ 292.3333333333333, 72.82093716019376 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 968.0, 694.0350615542897 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.7559289460184545, "gs_citation": 167, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12806651281777500023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1", "aff_unique_norm": "Wroclaw University of Science and Technology;University of Notre Dame", "aff_unique_dep": ";", "aff_unique_url": "https://www.pwr.edu.pl;https://www.nd.edu", "aff_unique_abbr": "WUST;Notre Dame", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Poland;United States" }, { "title": "Post-Training Detection of Backdoor Attacks for Two-Class and Multi-Attack Scenarios", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7149", "id": "MSgB8D4Hy51", "poster": "", "openreview": "https://openreview.net/forum?id=MSgB8D4Hy51", "slides": "https://iclr.cc/virtual/2022/poster/7149", "video": "https://iclr.cc/virtual/2022/poster/7149", "author_site": "Zhen Xiang, David Miller, George Kesidis", "tldr": "", "abstract": "Backdoor attacks (BAs) are an emerging threat to deep neural network classifiers. A victim classifier will predict to an attacker-desired target class whenever a test sample is embedded with the same backdoor pattern (BP) that was used to poison the classifier's training set. Detecting whether a classifier is backdoor attacked is not easy in practice, especially when the defender is, e.g., a downstream user without access to the classifier's training set. This challenge is addressed here by a reverse-engineering defense (RED), which has been shown to yield state-of-the-art performance in several domains. However, existing REDs are not applicable when there are only two classes or when multiple attacks are present. These scenarios are first studied in the current paper, under the practical constraints that the defender neither has access to the classifier's training set nor to supervision from clean reference classifiers trained for the same domain. We propose a detection framework based on BP reverse-engineering and a novel expected transferability (ET) statistic. We show that our ET statistic is effective using the same detection threshold, irrespective of the classification domain, the attack configuration, and the BP reverse-engineering algorithm that is used. The excellent performance of our method is demonstrated on six benchmark datasets. Notably, our detection framework is also applicable to multi-class scenarios with multiple attacks. 
Code is available at https://github.com/zhenxianglance/2ClassBADetection.", "keywords": "backdoor;Trojan;adversarial learning;deep neural network", "primary_area": "", "supplementary_material": "", "author": "Zhen Xiang;David Miller;George Kesidis", "authorids": "~Zhen_Xiang1;~David_Miller8;~George_Kesidis1", "gender": "M;;M", "homepage": "https://zhenxianglance.github.io/;;http://www.cse.psu.edu/~gik2", "dblp": "20/2799.html;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhen_Xiang1;~David_Miller8;~George_Kesidis1", "aff": "Pennsylvania State University;;Pennsylvania State University", "aff_domain": "psu.edu;;psu.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nxiang2022posttraining,\ntitle={Post-Training Detection of Backdoor Attacks for Two-Class and Multi-Attack Scenarios},\nauthor={Zhen Xiang and David Miller and George Kesidis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MSgB8D4Hy51}\n}", "github": "", "project": "", "reviewers": "BBKY;xYbV;FJ6Z;Xbfh", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;2;3;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "77;83;65;60", "wc_summary_review": "32;46;27;49", "wc_main_review": "546;436;141;201", "wc_review": "655;565;233;310", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "2304;2467;429;775", "reply_reviewers": "0;0;0;0", "reply_authors": "4;4;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 71.25, 9.175374651751284 ], "wc_summary_review_avg": [ 38.5, 9.233092656309694 ], "wc_main_review_avg": [ 331.0, 166.01957715883992 ], "wc_review_avg": [ 440.75, 174.35362772251113 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1493.75, 901.9443926872653 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.2721655269759087, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12429921260786315326&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=MSgB8D4Hy51", "email": "psu.edu;;psu.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Pennsylvania State University", "aff_unique_dep": "", "aff_unique_url": "https://www.psu.edu", "aff_unique_abbr": "PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Weakly-supervised Contrastive Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6060", "id": "MSwEFaztwkE", "poster": "", "openreview": "https://openreview.net/forum?id=MSwEFaztwkE", "slides": "https://iclr.cc/virtual/2022/poster/6060", "video": "https://iclr.cc/virtual/2022/poster/6060", "author_site": "Yao-Hung Hubert Tsai, Tianqin Li, Weixin Liu, Peiyuan Liao, Ruslan Salakhutdinov, Louis-Philippe Morency", "tldr": "", "abstract": "We argue that a form of the valuable information provided by the auxiliary information is its implied data clustering 
information. For instance, considering hashtags as auxiliary information, we can hypothesize that Instagram images with the same hashtags will be semantically more similar. With this intuition, we present a two-stage weakly-supervised contrastive learning approach. The first stage is to cluster data according to its auxiliary information. The second stage is to learn similar representations within the same cluster and dissimilar representations for data from different clusters. Our empirical experiments suggest the following three contributions. First, compared to conventional self-supervised representations, the auxiliary-information-infused representations bring the performance closer to the supervised representations, which use direct downstream labels as supervision signals. Second, our approach performs best in most cases when compared with other baseline representation learning methods that also leverage auxiliary data information. Third, we show that our approach also works well with unsupervised constructed clusters (e.g., no auxiliary information), resulting in a strong unsupervised representation learning approach. ", "keywords": "Self-supervised Learning;Weakly Supervised Learning;Learning with Auxiliary Information;Clustering-based Representation Learning", "primary_area": "", "supplementary_material": "/attachment/6026402aee60f4b2b0b18faa1782b47207b13efb.zip", "author": "Yao-Hung Hubert Tsai;Tianqin Li;Weixin Liu;Peiyuan Liao;Ruslan Salakhutdinov;Louis-Philippe Morency", "authorids": "~Yao-Hung_Hubert_Tsai1;~Tianqin_Li2;~Weixin_Liu2;~Peiyuan_Liao1;~Ruslan_Salakhutdinov1;~Louis-Philippe_Morency1", "gender": "M;M;M;;M;M", "homepage": ";https://github.com/Crazy-Jack;;https://www.liaopeiyuan.com;https://www.cs.cmu.edu/~morency/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "154/3702;294/5434;;;31/739;", "google_scholar": ";sQjEQEUAAAAJ;;aP5VahUAAAAJ;https://scholar.google.com.tw/citations?user=APgaFK0AAAAJ;", "orcid": ";0000-0003-2567-8283;;;0000-0001-6376-7696;", "linkedin": ";tianqin-li-b16299170/;weixin-liu/;;morency/;", "or_profile": "~Yao-Hung_Hubert_Tsai1;~Tianqin_Li2;~Weixin_Liu2;~Peiyuan_Liao1;~Louis-Philippe_Morency1;~Russ_Salakhutdinov1", "aff": "Apple;Carnegie Mellon University;;Praxis Pioneering;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University", "aff_domain": "apple.com;andrew.cmu.edu;;praxispioneering.com;cmu.edu;cs.cmu.edu", "position": "Principal Researcher;PhD student;;Principal Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ntsai2022learning,\ntitle={Learning Weakly-supervised Contrastive Representations},\nauthor={Yao-Hung Hubert Tsai and Tianqin Li and Weixin Liu and Peiyuan Liao and Ruslan Salakhutdinov and Louis-Philippe Morency},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MSwEFaztwkE}\n}", "github": "", "project": "", "reviewers": "1ziy;a8kw;h3zd;RhYi", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", 
"empirical_novelty": "2;2;3;3", "wc_summary_paper": "111;93;43;105", "wc_summary_review": "49;47;43;17", "wc_main_review": "481;280;145;157", "wc_review": "641;420;231;279", "wc_reply_reviewers": "520;114;0;0", "wc_reply_authors": "1012;874;130;307", "reply_reviewers": "2;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 88.0, 26.77685567799177 ], "wc_summary_review_avg": [ 39.0, 12.884098726725126 ], "wc_main_review_avg": [ 265.75, 135.0395775319221 ], "wc_review_avg": [ 392.75, 159.27393854614132 ], "wc_reply_reviewers_avg": [ 158.5, 213.83813972254808 ], "wc_reply_authors_avg": [ 580.75, 370.83916662078724 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16658448865785997630&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=MSwEFaztwkE", "email": "apple.com;andrew.cmu.edu;;praxispioneering.com;cmu.edu;cs.cmu.edu", "author_num": 6, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Apple;Carnegie Mellon University;Praxis Pioneering", "aff_unique_dep": "Apple Inc.;;", "aff_unique_url": "https://www.apple.com;https://www.cmu.edu;", "aff_unique_abbr": "Apple;CMU;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "title": "MetaShift: A Dataset of Datasets for Evaluating Contextual Distribution Shifts and Training Conflicts", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6433", "id": "MTex8qKavoS", "poster": "", "openreview": "https://openreview.net/forum?id=MTex8qKavoS", "slides": "https://iclr.cc/virtual/2022/poster/6433", "video": "https://iclr.cc/virtual/2022/poster/6433", "author_site": "Victor Weixin Liang, James Y Zou", "tldr": "", "abstract": "Understanding the performance of machine learning models across diverse data distributions is critically important for reliable applications. Motivated by this, there is a growing focus on curating benchmark datasets that capture distribution shifts. While valuable, the existing benchmarks are limited in that many of them only contain a small number of shifts and they lack systematic annotation about what is different across different shifts. We present MetaShift\u2014a collection of 12,868 sets of natural images across 410 classes\u2014to address this challenge. We leverage the natural heterogeneity of Visual Genome and its annotations to construct MetaShift. The key construction idea is to cluster images using its metadata, which provides context for each image (e.g. \u201ccats with cars\u201d or \u201ccats in bathroom\u201d) that represent distinct data distributions. MetaShift has two important benefits: first, it contains orders of magnitude more natural data shifts than previously available. Second, it provides explicit explanations of what is unique about each of its data sets and a distance score that measures the amount of distribution shift between any two of its data sets. 
We demonstrate the utility of MetaShift in benchmarking several recent proposals for training models to be robust to data shifts. We find that simple empirical risk minimization performs best when shifts are moderate, and that no method has a systematic advantage for large shifts. We also show how MetaShift can help to visualize conflicts between data subsets during model training. ", "keywords": "benchmark dataset;distribution shift;out-of-domain generalization", "primary_area": "", "supplementary_material": "/attachment/bbc1b2ddeb555f510a7c9e5cb29c636a5cf93c73.zip", "author": "Weixin Liang;James Zou", "authorids": "~Weixin_Liang1;~James_Zou1", "gender": ";", "homepage": "https://ai.stanford.edu/~wxliang/;", "dblp": "231/1803;", "google_scholar": "7z9P1jYAAAAJ;23ZXZvEAAAAJ", "orcid": ";", "linkedin": "weixin-liang-2562aa154/;", "or_profile": "~Weixin_Liang1;~James_Zou1", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliang2022metashift,\ntitle={MetaShift: A Dataset of Datasets for Evaluating Contextual Distribution Shifts and Training Conflicts},\nauthor={Weixin Liang and James Zou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MTex8qKavoS}\n}", "github": "", "project": "", "reviewers": "T6b1;ameR;HKis", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;4", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "90;53;263", "wc_summary_review": "46;52;81", "wc_main_review": "296;252;211", "wc_review": "432;357;555", "wc_reply_reviewers": "0;90;0", "wc_reply_authors": "465;644;521", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 135.33333333333334, 91.52898751518862 ], "wc_summary_review_avg": [ 59.666666666666664, 15.2825245151302 ], "wc_main_review_avg": [ 253.0, 34.708308323320324 ], "wc_review_avg": [ 448.0, 81.62107570964744 ], "wc_reply_reviewers_avg": [ 30.0, 42.42640687119285 ], "wc_reply_authors_avg": [ 543.3333333333334, 74.76333028668236 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11769188169482891384&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=MTex8qKavoS", "email": "stanford.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "MTsBazXmX00", "title": "Target Propagation via Regularized Inversion", "track": "main", "status": "Reject", "tldr": "", "abstract": "Target Propagation (TP) algorithms compute targets instead of gradients along neural networks and propagate them backward in a way that is similar to yet different
from gradient back-propagation (BP). The idea was first presented as a perturbative alternative to back-propagation that may achieve greater accuracy in gradient evaluation when training multi-layer neural networks (LeCun et al., 1989). However, TP has remained more of a template algorithm with many variations than a well-identified algorithm. Revisiting insights of LeCun et al. (1989) and more recently of Lee et al. (2015), we present a simple version of target propagation based on regularized inversion of network layers, easily implementable in a differentiable programming framework. We compare its computational complexity to that of BP and delineate the regimes in which TP can be attractive compared to BP. We show how our TP can be used to train recurrent neural networks with long sequences on various sequence modeling problems. The experimental results underscore the importance of regularization in TP in practice.", "keywords": "Target propagation;differentiable programming;recurrent neural networks", "primary_area": "", "supplementary_material": "/attachment/138eb01303f46ed4e11dc1320afc2264accf23bb.zip", "author": "Vincent Roulet;Zaid Harchaoui", "authorids": "~Vincent_Roulet1;~Zaid_Harchaoui1", "gender": "M;", "homepage": "https://vroulet.github.io/;", "dblp": "164/6165;", "google_scholar": "https://scholar.google.fr/citations?user=vwoZrVMAAAAJ;", "orcid": ";", "linkedin": "vincentroulet/;", "or_profile": "~Vincent_Roulet1;~Zaid_Harchaoui1", "aff": "University of Washington;", "aff_domain": "uw.edu;", "position": "Acting Assistant Professor;", "bibtex": "@misc{\nroulet2022target,\ntitle={Target Propagation via Regularized Inversion},\nauthor={Vincent Roulet and Zaid Harchaoui},\nyear={2022},\nurl={https://openreview.net/forum?id=MTsBazXmX00}\n}", "github": "", "project": "", "reviewers": "NQCB;239T;Dczs;Q1ay", "site": "https://openreview.net/forum?id=MTsBazXmX00", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "64;52;49;44", "wc_summary_review": "27;25;36;30", "wc_main_review": "553;456;250;184", "wc_review": "644;533;335;258", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.25, 7.361215932167728 ], "wc_summary_review_avg": [ 29.5, 4.153311931459037 ], "wc_main_review_avg": [ 360.75, 149.61513125349322 ], "wc_review_avg": [ 442.5, 153.6139642089872 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17936057561033934792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "MUpxS9vDbZr", "title": "Why Should I Trust You, Bellman? 
Evaluating the Bellman Objective with Off-Policy Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we analyze the effectiveness of the Bellman equation as a proxy objective for value prediction accuracy in off-policy evaluation. While the Bellman equation is uniquely solved by the true value function over all state-action pairs, we show that in the finite data regime, the Bellman equation can be satisfied exactly by infinitely many suboptimal solutions. This eliminates any guarantees relating Bellman error to the accuracy of the value function. We find this observation extends to practical settings; when computed over an off-policy dataset, the Bellman error bears little relationship to the accuracy of the value function. Consequently, we show that the Bellman error is a poor metric for comparing value functions, and therefore, an ineffective objective for off-policy evaluation. Finally, we discuss differences between Bellman error and the non-stationary objective used by iterative methods and deep reinforcement learning, and highlight how the effectiveness of this objective relies on generalization during training. ", "keywords": "reinforcement learning;off-policy reinforcement learning;off-policy evaluation;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Scott Fujimoto;David Meger;Doina Precup;Ofir Nachum;Shixiang Shane Gu", "authorids": "~Scott_Fujimoto1;~David_Meger2;~Doina_Precup1;~Ofir_Nachum1;~Shixiang_Shane_Gu1", "gender": ";M;F;M;M", "homepage": ";http://www.cim.mcgill.ca/~dmeger/;http://cs.mcgill.ca/~dprecup/;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;https://sites.google.com/view/gugurus/home", "dblp": "215/5503;51/3415.html;p/DoinaPrecup;;121/0550", "google_scholar": "1Nk3WZoAAAAJ;https://scholar.google.com.tw/citations?user=gFwEytkAAAAJ;https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ;C-ZlBWMAAAAJ;B8wslVsAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Scott_Fujimoto1;~David_Meger2;~Doina_Precup1;~Ofir_Nachum1;~Shixiang_Gu1", "aff": "Google;McGill University;McGill University;OpenAI;Google", "aff_domain": "google.com;mcgill.ca;mcgill.ca;openai.com;google.com", "position": "Intern;Associate Professor;Associate Professor;Researcher;Senior Research Scientist", "bibtex": "@misc{\nfujimoto2022why,\ntitle={Why Should I Trust You, Bellman? 
Evaluating the Bellman Objective with Off-Policy Data},\nauthor={Scott Fujimoto and David Meger and Doina Precup and Ofir Nachum and Shixiang Shane Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=MUpxS9vDbZr}\n}", "github": "", "project": "", "reviewers": "ZRun;fSvY;9cny;oTrW", "site": "https://openreview.net/forum?id=MUpxS9vDbZr", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;4;3;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "99;41;144;159", "wc_summary_review": "56;29;38;101", "wc_main_review": "436;134;420;229", "wc_review": "591;204;602;489", "wc_reply_reviewers": "0;0;0;411", "wc_reply_authors": "496;129;603;489", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 110.75, 45.92589139036933 ], "wc_summary_review_avg": [ 56.0, 27.739863013360395 ], "wc_main_review_avg": [ 304.75, 127.86980683492096 ], "wc_review_avg": [ 471.5, 160.60277083537505 ], "wc_reply_reviewers_avg": [ 102.75, 177.96822047770215 ], "wc_reply_authors_avg": [ 429.25, 179.1401336942674 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gmQNzSkRJXEJ:scholar.google.com/&scioq=Why+Should+I+Trust+You,+Bellman%3F+Evaluating+the+Bellman+Objective+with+Off-Policy+Data&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Google;McGill University;OpenAI", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.mcgill.ca;https://openai.com", "aff_unique_abbr": "Google;McGill;OpenAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;0;0", "aff_country_unique": "United States;Canada" }, { "id": "MWQCPYSJRN", "title": "Generative Negative Replay for Continual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning continually is a key aspect of intelligence and a necessary ability to solve many real-world problems. One of the most effective strategies to control catastrophic forgetting, the Achilles\u2019 heel of continual learning, is storing part of the old data and replaying it interleaved with new experiences (also known as the replay approach). Generative replay, that is, using generative models to provide replay patterns on demand, is particularly intriguing; however, it has been shown to be effective mainly under simplified assumptions, such as simple scenarios and low-dimensional benchmarks. \nIn this paper, we show that, while the generated data are usually not able to improve the classification accuracy for the old classes, they can be effective as negative examples (or antagonists) to learn the new classes, especially when the learning experiences are small and contain examples of just one or a few classes. 
The proposed approach is validated on complex class-incremental and data-incremental continual learning scenarios (CORe50 and ImageNet-1000) composed of high-dimensional data and a large number of training experiences: a setup where existing generative replay approaches usually fail. ", "keywords": "Continual Learning;Generative replay;Lifelong learning", "primary_area": "", "supplementary_material": "/attachment/aedeeb83dfbdad422e2331715370edad08b50478.zip", "author": "Gabriele Graffieti;Davide Maltoni;Lorenzo Pellegrini;Vincenzo Lomonaco", "authorids": "~Gabriele_Graffieti1;davide.maltoni@unibo.it;~Lorenzo_Pellegrini1;~Vincenzo_Lomonaco1", "gender": "M;;M;M", "homepage": "https://www.unibo.it/sitoweb/gabriele.graffieti/en;;;https://vincenzolomonaco.com", "dblp": ";;;157/5127", "google_scholar": ";;X3jGASoAAAAJ;https://scholar.google.it/citations?user=rQLINtQAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Gabriele_Graffieti1;davide.maltoni@unibo.it;~Lorenzo_Pellegrini1;~Vincenzo_Lomonaco1", "aff": "University of Bologna;;;University of Pisa", "aff_domain": "unibo.it;;;unipi.it", "position": "PhD student;;;Assistant Professor", "bibtex": "@misc{\ngraffieti2022generative,\ntitle={Generative Negative Replay for Continual Learning},\nauthor={Gabriele Graffieti and Davide Maltoni and Lorenzo Pellegrini and Vincenzo Lomonaco},\nyear={2022},\nurl={https://openreview.net/forum?id=MWQCPYSJRN}\n}", "github": "", "project": "", "reviewers": "mn5p;9MUi;onET;SNYr", "site": "https://openreview.net/forum?id=MWQCPYSJRN", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "1;2;3;0", "wc_summary_paper": "61;45;149;108", "wc_summary_review": "58;17;91;45", "wc_main_review": "410;224;441;483", "wc_review": "529;286;681;636", "wc_reply_reviewers": "101;0;125;0", "wc_reply_authors": "1397;624;943;1122", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;2;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 90.75, 40.831207427652686 ], "wc_summary_review_avg": [ 52.75, 26.592997198510737 ], "wc_main_review_avg": [ 389.5, 99.00126261821109 ], "wc_review_avg": [ 533.0, 152.91991368033138 ], "wc_reply_reviewers_avg": [ 56.5, 57.133615324080445 ], "wc_reply_authors_avg": [ 1021.5, 280.7441005613475 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14486484751414443501&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1", "aff_unique_norm": "University of Bologna;University of Pisa", "aff_unique_dep": ";", "aff_unique_url": "https://www.unibo.it;https://www.unipi.it", "aff_unique_abbr": "Unibo;UNIP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "title": "GraphENS: Neighbor-Aware Ego Network Synthesis for Class-Imbalanced Node Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5932", "id": "MXEl7i-iru", "poster": "", "openreview": 
"https://openreview.net/forum?id=MXEl7i-iru", "slides": "https://iclr.cc/virtual/2022/poster/5932", "video": "https://iclr.cc/virtual/2022/poster/5932", "author_site": "Joonhyung Park, Jaeyun Song, Eunho Yang", "tldr": "", "abstract": "In many real-world node classification scenarios, nodes are highly class-imbalanced, where graph neural networks (GNNs) can be readily biased to major class instances. Albeit existing class imbalance approaches in other domains can alleviate this issue to some extent, they do not consider the impact of message passing between nodes. In this paper, we hypothesize that overfitting to the neighbor sets of minor class due to message passing is a major challenge for class-imbalanced node classification. To tackle this issue, we propose GraphENS, a novel augmentation method that synthesizes the whole ego network for minor class (minor node and its one-hop neighbors) by combining two different ego networks based on their similarity. Additionally, we introduce a saliency-based node mixing method to exploit the abundant class-generic attributes of other nodes while blocking the injection of class-specific features. Our approach consistently outperforms the baselines over multiple node classification benchmark datasets and architectures.", "keywords": "Deep learning;Node classification;Class imbalance;Data Augmentation", "primary_area": "", "supplementary_material": "", "author": "Joonhyung Park;Jaeyun Song;Eunho Yang", "authorids": "~Joonhyung_Park1;~Jaeyun_Song2;~Eunho_Yang1", "gender": "M;M;M", "homepage": ";;https://sites.google.com/site/hleehome2/", "dblp": "306/1374;289/2048;96/2621", "google_scholar": "https://scholar.google.com/citations?hl=ko;;", "orcid": ";;", "linkedin": "joonhyung-park-495527145/;jaeyun-song-9a4111213/;", "or_profile": "~Joonhyung_Park1;~Jaeyun_Song2;~Eunho_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\npark2022graphens,\ntitle={Graph{ENS}: Neighbor-Aware Ego Network Synthesis for Class-Imbalanced Node Classification},\nauthor={Joonhyung Park and Jaeyun Song and Eunho Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MXEl7i-iru}\n}", "github": "", "project": "", "reviewers": "DE1Z;6ykZ;KQwg;ACEE", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "68;99;17;78", "wc_summary_review": "25;33;15;11", "wc_main_review": "540;421;285;107", "wc_review": "633;553;317;196", "wc_reply_reviewers": "174;150;193;18", "wc_reply_authors": "2608;1750;1271;506", "reply_reviewers": "1;2;2;1", "reply_authors": "4;4;3;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 65.5, 30.153772566629204 ], "wc_summary_review_avg": [ 21.0, 8.602325267042627 ], "wc_main_review_avg": [ 338.25, 161.13872129317645 ], "wc_review_avg": [ 424.75, 175.8925453224212 ], "wc_reply_reviewers_avg": [ 133.75, 68.54332571447055 ], "wc_reply_authors_avg": [ 1533.75, 762.5753651279327 ], "reply_reviewers_avg": [ 1.5, 0.5 ], 
"reply_authors_avg": [ 3.25, 0.82915619758885 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12978316005226409032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=MXEl7i-iru", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Differentiable Expectation-Maximization for Set Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5927", "id": "MXdFBmHT4C", "poster": "", "openreview": "https://openreview.net/forum?id=MXdFBmHT4C", "slides": "https://iclr.cc/virtual/2022/poster/5927", "video": "https://iclr.cc/virtual/2022/poster/5927", "tldr": "", "abstract": "We tackle the set2vec problem, the task of extracting a vector representation from an input set comprised of a variable number of feature vectors. Although recent approaches based on self attention such as (Set)Transformers were very successful due to the capability of capturing complex interaction between set elements, the computational overhead is the well-known downside. The inducing-point attention and the latest optimal transport kernel embedding (OTKE) are promising remedies that attain comparable or better performance with reduced computational cost, by incorporating a fixed number of learnable queries in attention. In this paper we approach the set2vec problem from a completely different perspective. The elements of an input set are considered as i.i.d.~samples from a mixture distribution, and we define our set embedding feed-forward network as the maximum-a-posterior (MAP) estimate of the mixture which is approximately attained by a few Expectation-Maximization (EM) steps. The whole MAP-EM steps are differentiable operations with a fixed number of mixture parameters, allowing efficient auto-diff back-propagation for any given downstream task. Furthermore, the proposed mixture set data fitting framework allows unsupervised set representation learning naturally via marginal likelihood maximization aka the empirical Bayes. Interestingly, we also find that OTKE can be seen as a special case of our framework, specifically a single-step EM with extra balanced assignment constraints on the E-step. Compared to OTKE, our approach provides more flexible set embedding as well as prior-induced model regularization. 
We evaluate our approach on various tasks, demonstrating improved performance over the state of the art.", "keywords": "Representation learning;Bayesian models;Mixture estimation;Optimal transport;Attention", "primary_area": "", "supplementary_material": "/attachment/52d44551cdcc3e5908885e2fc289003de778c2cd.zip", "author": "Minyoung Kim", "authorids": "~Minyoung_Kim2", "gender": "M", "homepage": "https://sites.google.com/site/mikim21/", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Minyoung_Kim2", "aff": "Samsung AI Center, Cambridge, UK", "aff_domain": "samsung.com", "position": "Senior Researcher", "bibtex": "@inproceedings{\nkim2022differentiable,\ntitle={Differentiable Expectation-Maximization for Set Representation Learning},\nauthor={Minyoung Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MXdFBmHT4C}\n}", "github": "", "project": "", "reviewers": "xBWk;emDK;7HuD;Y94U", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "2;4;2;3", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "91;151;78;70", "wc_summary_review": "36;20;3;31", "wc_main_review": "81;427;122;353", "wc_review": "208;598;203;454", "wc_reply_reviewers": "0;0;0;38", "wc_reply_authors": "79;620;136;759", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.5, 31.784430150625635 ], "wc_summary_review_avg": [ 22.5, 12.658988901172163 ], "wc_main_review_avg": [ 245.75, 147.3183203135306 ], "wc_review_avg": [ 365.75, 168.15227473929693 ], "wc_reply_reviewers_avg": [ 9.5, 16.454482671904334 ], "wc_reply_authors_avg": [ 398.5, 295.8077923246783 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5010738266469152581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=MXdFBmHT4C", "email": "samsung.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Samsung", "aff_unique_dep": "AI Center", "aff_unique_url": "https://www.samsung.com/global/research-innovation/ai-research/", "aff_unique_abbr": "SAC", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "MXrIVw-F_a4", "title": "FLOAT: FAST LEARNABLE ONCE-FOR-ALL ADVERSARIAL TRAINING FOR TUNABLE TRADE-OFF BETWEEN ACCURACY AND ROBUSTNESS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training a model that can be robust against adversarially-perturbed images without compromising accuracy on clean images has proven to be challenging. Recent research has tried to resolve this issue by incorporating an additional layer after each batch-normalization layer in a network that implements feature-wise linear modulation (FiLM). 
These extra layers enable in-situ calibration of a trained model, allowing the user to configure the desired priority between robustness and clean-image performance after deployment. However, these extra layers significantly increase training time and parameter count, and add latency, which can prove costly for time- or memory-constrained applications. In this paper, we present Fast Learnable Once-for-all Adversarial Training (FLOAT), which transforms the weight tensors without using extra layers, thereby incurring no significant increase in parameter count, training time, or network latency compared to standard adversarial training. In particular, we add configurable scaled noise to the weight tensors, which enables a \u2018continuous\u2019 trade-off between clean and adversarial performance. Additionally, we extend FLOAT to slimmable neural networks to enable a three-way in-situ trade-off between robustness, accuracy, and complexity. Extensive experiments show that FLOAT can yield state-of-the-art performance, improving both clean and perturbed image classification by up to \u223c6.5% and \u223c14.5%, respectively, while requiring up to 1.47x fewer parameters with similar hyperparameter settings compared to FiLM-based alternatives.", "keywords": "Once-for-all adversarial training;in-situ robustness-accuracy trade-off;parameter-efficient in-situ calibration", "primary_area": "", "supplementary_material": "", "author": "Souvik Kundu;Peter Anthony Beerel;Sairam Sundaresan", "authorids": "~Souvik_Kundu2;~Peter_Anthony_Beerel2;~Sairam_Sundaresan1", "gender": "M;M;M", "homepage": "https://ksouvik52.github.io;;http://sites.usc.edu/eessc.html", "dblp": "126/2210;239/6061;29/6330", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;JSdH7PsAAAAJ", "orcid": "0000-0002-3533-9405;;", "linkedin": "souvik-kundu-64922b50/;sairam-sundaresan;peter-beerel-b9902a1/", "or_profile": "~Souvik_Kundu2;~Sairam_Sundaresan1;~Peter_Anthony_Beerel1", "aff": "University of Southern California;Intel Labs;University of Southern California", "aff_domain": "usc.edu;intel.com;usc.edu", "position": "PhD student;Researcher;Full Professor", "bibtex": "@misc{\nkundu2022float,\ntitle={{FLOAT}: {FAST} {LEARNABLE} {ONCE}-{FOR}-{ALL} {ADVERSARIAL} {TRAINING} {FOR} {TUNABLE} {TRADE}-{OFF} {BETWEEN} {ACCURACY} {AND} {ROBUSTNESS}},\nauthor={Souvik Kundu and Peter Anthony Beerel and Sairam Sundaresan},\nyear={2022},\nurl={https://openreview.net/forum?id=MXrIVw-F_a4}\n}", "github": "", "project": "", "reviewers": "DN3s;BuXN;dQEN;Xsuw", "site": "https://openreview.net/forum?id=MXrIVw-F_a4", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "3;4;4;5", "correctness": "2;3;3;4", "technical_novelty": "1;1;2;3", "empirical_novelty": "2;1;4;3", "wc_summary_paper": "57;68;70;70", "wc_summary_review": "16;27;23;26", "wc_main_review": "143;163;169;260", "wc_review": "216;258;262;356", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1499;1509;1156;233", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 66.25, 5.402545696243577 ], "wc_summary_review_avg": [ 23.0, 4.301162633521313 ], "wc_main_review_avg": [ 183.75, 45.06315013400639 ], "wc_review_avg": [ 273.0, 51.19570294468082 ], "wc_reply_reviewers_avg": 
[ 0, 0 ], "wc_reply_authors_avg": [ 1099.25, 519.9290215981408 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9805806756909202, "corr_recommendation_correctness": 0.9805806756909202, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18411706526001271424&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Southern California;Intel", "aff_unique_dep": ";Intel Labs", "aff_unique_url": "https://www.usc.edu;https://www.intel.com", "aff_unique_abbr": "USC;Intel", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "M_o5E088xO5", "title": "PROMISSING: Pruning Missing Values in Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "While data are the primary fuel for machine learning models, they often suffer from missing values, especially when collected in real-world scenarios. However, many off-the-shelf machine learning models, including artificial neural network models, are unable to handle these missing values directly. Therefore, extra data preprocessing and curation steps, such as data imputation, are inevitable before learning and prediction processes. In this study, we propose a simple and intuitive yet effective method for pruning missing values (PROMISSING) during learning and inference steps in neural networks. In this method, there is no need to remove or impute the missing values; instead, the missing values are treated as a new source of information (representing what we do not know). Our experiments on simulated data, several classification and regression benchmarks, and a multi-modal clinical dataset show that PROMISSING results in similar classification performance compared to various imputation techniques. In addition, our experiments show models trained using PROMISSING techniques are becoming less decisive in their predictions when facing incomplete samples with many unknowns. This finding hopefully advances machine learning models from being pure predicting machines to more realistic thinkers that can also say \"I do not know\" when facing incomplete sources of information.", "keywords": "Neural Networks;Missing Values;Data Imputation", "primary_area": "", "supplementary_material": "/attachment/4e8670a439646c72e68eea2b87a17e9cf96f0875.zip", "author": "Seyed Mostafa Kia;Nastaran Mohammadian Rad;Daniel van Opstal;Bart van Schie;Wiepke Cahn;Andre Marquand;Josien P.W. Pluim;Hugo G. 
Schnack", "authorids": "~Seyed_Mostafa_Kia1;~Nastaran_Mohammadian_Rad1;d.p.j.vanopstal@umcutrecht.nl;bart.vanschie@gmail.com;w.cahn@umcutrecht.nl;~Andre_Marquand1;~Josien_P.W._Pluim1;h.schnack@umcutrecht.nl", "gender": "M;;;;;;;", "homepage": "https://scholar.google.com/citations?user=T2HOkiIAAAAJ&hl=en;;;;;;https://www.tue.nl/en/research/researchers/josien-pluim;", "dblp": "134/1827;;;;;;;", "google_scholar": "T2HOkiIAAAAJ;;;;;;;", "orcid": "0000-0002-7128-814X;0000-0003-3068-4127;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Seyed_Mostafa_Kia1;~Nastaran_Mohammadian_Rad1;d.p.j.vanopstal@umcutrecht.nl;bart.vanschie@gmail.com;w.cahn@umcutrecht.nl;~Andre_Marquand1;~Josien_P.W._Pluim1;h.schnack@umcutrecht.nl", "aff": "Utrecht University;Eindhoven University of Technology;;;;;Eindhoven University of Technology;", "aff_domain": "uu.nl;tue.nl;;;;;tue.nl;", "position": "Postdoc;Postdoc;;;;;Full Professor;", "bibtex": "@misc{\nkia2022promissing,\ntitle={{PROMISSING}: Pruning Missing Values in Neural Networks},\nauthor={Seyed Mostafa Kia and Nastaran Mohammadian Rad and Daniel van Opstal and Bart van Schie and Wiepke Cahn and Andre Marquand and Josien P.W. Pluim and Hugo G. Schnack},\nyear={2022},\nurl={https://openreview.net/forum?id=M_o5E088xO5}\n}", "github": "", "project": "", "reviewers": "eatM;vNRV;Bfyk;FmEA;DWUQ", "site": "https://openreview.net/forum?id=M_o5E088xO5", "pdf_size": 0, "recommendation": "3;3;6;6;6", "confidence": "5;5;5;4;4", "correctness": "2;2;3;3;3", "technical_novelty": "1;3;4;4;4", "empirical_novelty": "1;2;2;3;3", "wc_summary_paper": "48;45;54;61;115", "wc_summary_review": "51;26;177;16;138", "wc_main_review": "856;194;927;448;961", "wc_review": "955;265;1158;525;1214", "wc_reply_reviewers": "0;0;206;0;0", "wc_reply_authors": "1756;1653;2001;1567;656", "reply_reviewers": "0;0;1;0;0", "reply_authors": "3;3;3;2;1", "recommendation_avg": [ 4.8, 1.469693845669907 ], "confidence_avg": [ 4.6, 0.48989794855663565 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 3.2, 1.16619037896906 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 64.6, 25.78836947152728 ], "wc_summary_review_avg": [ 81.6, 64.20778768965647 ], "wc_main_review_avg": [ 677.2, 303.6204209205962 ], "wc_review_avg": [ 823.4, 369.5124355146928 ], "wc_reply_reviewers_avg": [ 41.2, 82.4 ], "wc_reply_authors_avg": [ 1526.6, 458.9625692798924 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 2.4, 0.8 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.6666666666666665, "corr_recommendation_correctness": 1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=127153268290447122&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;1", "aff_unique_norm": "Utrecht University;Eindhoven University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.uu.nl;https://www.tue.nl", "aff_unique_abbr": "UU;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "id": "MbmwYwhD0Vy", "title": "A Novel Convergence Analysis for the Stochastic Proximal Point Algorithm", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we study the stochastic proximal point algorithm (SPPA) for general empirical risk minimization (ERM) problems as well as deep learning problems. 
We present an efficient implementation of SPPA with minor modifications for different problem definitions, and we observe that efficiently implemented SPPA has faster and more stable convergence than the celebrated stochastic gradient descent (SGD) algorithm, and its many variations, for both convex and non-convex problems. Because the per-iteration update of SPPA is defined abstractly and has long been considered expensive, its convergence proof was not well studied until recently. In this paper, we close the theoretical gap by proving its convergence for convex problems. Our proof technique is different from some of the recent attempts. As a result, we present the surprising result that SPPA for convex problems may converge \\emph{arbitrarily fast}, depending on how the step sizes are chosen. As a second contribution, we also show that for some of the canonical ERM problems and deep learning problems, each iteration of SPPA can be efficiently calculated either in closed form or close to closed form via bisection---the resulting complexity is exactly the same as that of SGD. Real data experiments showcase its effectiveness in terms of convergence compared to SGD and its variants.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aysegul Bumin;Kejun Huang", "authorids": "~Aysegul_Bumin1;~Kejun_Huang1", "gender": "F;M", "homepage": ";https://www.cise.ufl.edu/~kejun/", "dblp": "308/4098;140/8874", "google_scholar": ";-RIDViAAAAAJ", "orcid": ";", "linkedin": "aysegulbumin;", "or_profile": "~Aysegul_Bumin1;~Kejun_Huang1", "aff": "University of Florida;University of Florida", "aff_domain": "ufl.edu;ufl.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nbumin2022a,\ntitle={A Novel Convergence Analysis for the Stochastic Proximal Point Algorithm},\nauthor={Aysegul Bumin and Kejun Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=MbmwYwhD0Vy}\n}", "github": "", "project": "", "reviewers": "QFm6;EvJ6;uk58;xJB8", "site": "https://openreview.net/forum?id=MbmwYwhD0Vy", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;3", "correctness": "1;1;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;3;1", "wc_summary_paper": "51;198;34;41", "wc_summary_review": "26;167;25;14", "wc_main_review": "340;905;87;351", "wc_review": "417;1270;146;406", "wc_reply_reviewers": "0;339;0;0", "wc_reply_authors": "0;214;0;0", "reply_reviewers": "0;2;0;0", "reply_authors": "0;2;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 1.299038105676658 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 81.0, 67.819613682179 ], "wc_summary_review_avg": [ 58.0, 63.107051903887886 ], "wc_main_review_avg": [ 420.75, 298.86148547445856 ], "wc_review_avg": [ 559.75, 424.1641044454375 ], "wc_reply_reviewers_avg": [ 84.75, 146.79130594146235 ], "wc_reply_authors_avg": [ 53.5, 92.66471820493493 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 0.5, 0.8660254037844386 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9622504486493763, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uzoXsWCr99YJ:scholar.google.com/&scioq=A+Novel+Convergence+Analysis+for+the+Stochastic+Proximal+Point+Algorithm&hl=en&as_sdt=0,33", "gs_version_total": 0, 
"aff_unique_index": "0;0", "aff_unique_norm": "University of Florida", "aff_unique_dep": "", "aff_unique_url": "https://www.ufl.edu", "aff_unique_abbr": "UF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Mdn3eM7VHFn", "title": "Grounding Language Representation with Visual Object Information via Cross Modal Pretraining", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Previous studies of visual grounded language learning use a convolutional neural network (CNN) to extract features from the whole image for grounding with the sentence description. However, this approach has two main drawbacks: (i) the whole image usually contains more objects and backgrounds than the sentence itself; thus, matching them together will confuse the grounded model; (ii) CNN only extracts the features of the image but not the relationship between objects inside that, limiting the grounded model to learn complicated contexts. To overcome such shortcomings, we propose a novel object-level grounded language learning framework that empowers the language representation with visual object-grounded information. The framework is comprised of three main components: (i) ObjectGroundedBERT captures the visual-object relations and literary portrayals by cross-modal pretraining via a Text-grounding mechanism, (ii) Visual encoder represents a visual relation between objects and (iii) Cross-modal Transformer helps the Visual encoder and ObjectGroundedBERT learn the alignment and representation of image-text context. Experimental results show that our proposed framework consistently outperforms the baseline language models on various language tasks of GLUE and SQuAD datasets.", "keywords": "Grounded Language Learning;Language Model", "primary_area": "", "supplementary_material": "", "author": "Cong-Duy T Nguyen;Anh Tuan Luu;Tho Quan", "authorids": "~Cong-Duy_T_Nguyen1;~Anh_Tuan_Luu2;~Tho_Quan1", "gender": "M;M;M", "homepage": "https://duyngtr16061999.github.io/;https://tuanluu.github.io/;http://www.cse.hcmut.edu.vn/qttho/doku.php", "dblp": ";81/8329.html;26/8327.html", "google_scholar": "vIdT3F8AAAAJ;https://scholar.google.com.sg/citations?hl=en;IlW-MrAAAAAJ", "orcid": ";;0000-0003-0467-6254", "linkedin": ";;", "or_profile": "~Cong-Duy_T_Nguyen1;~Anh_Tuan_Luu2;~Tho_Quan1", "aff": "VinAI Research;Nanyang Technological University;", "aff_domain": "vinai.io;ntu.edu.sg;", "position": "Resident;Assistant Professor;", "bibtex": "@misc{\nnguyen2022grounding,\ntitle={Grounding Language Representation with Visual Object Information via Cross Modal Pretraining},\nauthor={Cong-Duy T Nguyen and Anh Tuan Luu and Tho Quan},\nyear={2022},\nurl={https://openreview.net/forum?id=Mdn3eM7VHFn}\n}", "github": "", "project": "", "reviewers": "EAX5;oQRJ;3RcU", "site": "https://openreview.net/forum?id=Mdn3eM7VHFn", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;4;4", "correctness": "3;2;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;3;2", "wc_summary_paper": "40;54;67", "wc_summary_review": "14;40;65", "wc_main_review": "135;305;304", "wc_review": "189;399;436", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 
0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 53.666666666666664, 11.025223605694151 ], "wc_summary_review_avg": [ 39.666666666666664, 20.82199691565522 ], "wc_main_review_avg": [ 248.0, 79.9041091976293 ], "wc_review_avg": [ 341.3333333333333, 108.7698896243298 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5pLSy587tJYJ:scholar.google.com/&scioq=Grounding+Language+Representation+with+Visual+Object+Information+via+Cross+Modal+Pretraining&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "VinAI Research;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "https://www.vinai.io/;https://www.ntu.edu.sg", "aff_unique_abbr": "VinAI;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Vietnam;Singapore" }, { "id": "MeMMmuWRXsy", "title": "Robust Robotic Control from Pixels using Contrastive Recurrent State-Space Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modeling the world can benefit robot learning by providing a rich training signal for shaping an agent's latent state space. However, learning world models in unconstrained environments over high-dimensional observation spaces such as images is challenging. One source of difficulty is the presence of irrelevant but hard-to-model background distractions, and unimportant visual details of task-relevant entities. We address this issue by learning a recurrent latent dynamics model which contrastively predicts the next observation. This simple model leads to surprisingly robust robotic control even with simultaneous camera, background, and color distractions. We outperform alternatives such as bisimulation methods which impose state-similarity measures derived from divergence in future reward or future optimal actions. We obtain state-of-the-art results on the Distracting Control Suite, a challenging benchmark for pixel-based robotic control.", "keywords": "contrastive learning;model-based RL;distractions;predictive coding", "primary_area": "", "supplementary_material": "/attachment/dd0ee8e1f0a2ed6941b9122e21c580ae90fdf73d.zip", "author": "Nitish Srivastava;Walter Talbott;Martin Bertran Lopez;Shuangfei Zhai;Joshua M. 
Susskind", "authorids": "~Nitish_Srivastava1;~Walter_Talbott1;mbertran@apple.com;~Shuangfei_Zhai3;~Joshua_M._Susskind1", "gender": "M;;;M;M", "homepage": "http://www.cs.toronto.edu/~nitish;;;http://cs.binghamton.edu/~szhai2;http://www.apple.com", "dblp": "00/11304.html;241/6096;;;132/7797", "google_scholar": "https://scholar.google.ca/citations?user=s1PgoeUAAAAJ;;;G6vdBYsAAAAJ;Sv2TGqsAAAAJ", "orcid": ";;;;", "linkedin": ";;;;joshua-susskind-8ab2ab5/", "or_profile": "~Nitish_Srivastava1;~Walter_Talbott1;mbertran@apple.com;~Shuangfei_Zhai3;~Joshua_M._Susskind1", "aff": "Apple Inc;Apple;;Apple;Apple", "aff_domain": "apple.com;apple.com;;apple.com;apple.com", "position": "Researcher;Research Scientist;;Research Scientist;Researcher", "bibtex": "@misc{\nsrivastava2022robust,\ntitle={Robust Robotic Control from Pixels using Contrastive Recurrent State-Space Models},\nauthor={Nitish Srivastava and Walter Talbott and Martin Bertran Lopez and Shuangfei Zhai and Joshua M. Susskind},\nyear={2022},\nurl={https://openreview.net/forum?id=MeMMmuWRXsy}\n}", "github": "", "project": "", "reviewers": "cJX3;skrV;FBRd", "site": "https://openreview.net/forum?id=MeMMmuWRXsy", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;4;3", "correctness": "3;3;4", "technical_novelty": "1;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "78;100;76", "wc_summary_review": "30;29;87", "wc_main_review": "208;729;333", "wc_review": "316;858;496", "wc_reply_reviewers": "230;653;0", "wc_reply_authors": "568;868;189", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 84.66666666666667, 10.873004286866728 ], "wc_summary_review_avg": [ 48.666666666666664, 27.10883414846328 ], "wc_main_review_avg": [ 423.3333333333333, 222.08156659710014 ], "wc_review_avg": [ 556.6666666666666, 225.39052528642122 ], "wc_reply_reviewers_avg": [ 294.3333333333333, 270.4395598938061 ], "wc_reply_authors_avg": [ 541.6666666666666, 277.82528482643346 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16187212411347019539&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Apple", "aff_unique_dep": "Apple Inc", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarial Unlearning of Backdoors via Implicit Hypergradient", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6180", "id": "MeeQkFYVbzW", "poster": "", "openreview": "https://openreview.net/forum?id=MeeQkFYVbzW", "slides": "https://iclr.cc/virtual/2022/poster/6180", "video": "https://iclr.cc/virtual/2022/poster/6180", "author_site": "Yi Zeng, Si Chen, Won Park, Zhuoqing Mao, Ming Jin, Ruoxi Jia", "tldr": "", "abstract": "We propose a minimax formulation for removing backdoors from a given 
poisoned model based on a small set of clean data. This formulation encompasses much of prior work on backdoor removal. We propose the Implicit Backdoor Adversarial Unlearning (I-BAU) algorithm to solve the minimax problem. Unlike previous work, which breaks down the minimax into separate inner and outer problems, our algorithm utilizes the implicit hypergradient to account for the interdependence between inner and outer optimization. We theoretically analyze its convergence and the generalizability of the robustness gained by solving the minimax problem on clean data to unseen test data. In our evaluation, we compare I-BAU with six state-of-the-art backdoor defenses on eleven backdoor attacks over two datasets and various attack settings, including the common setting where the attacker targets one class as well as important but underexplored settings where multiple classes are targeted. I-BAU's performance is comparable to and most often significantly better than the best baseline. In particular, its performance is more robust to variations in triggers, attack settings, poison ratio, and clean data size. Moreover, I-BAU requires less computation to take effect; in particular, it is more than $13\\times$ faster than the most efficient baseline in the single-target attack setting. Furthermore, it can remain effective in the extreme case where the defender can only access 100 clean samples---a setting where all the baselines fail to produce acceptable results.", "keywords": "backdoor defense;backdoor removal;backdoor;minimax;implicit hypergradient", "primary_area": "", "supplementary_material": "/attachment/a03a5fae63542d6c13f448faa64a8adfc618ac91.zip", "author": "Yi Zeng;Si Chen;Won Park;Zhuoqing Mao;Ming Jin;Ruoxi Jia", "authorids": "~Yi_Zeng3;~Si_Chen5;~Won_Park1;~Zhuoqing_Mao1;~Ming_Jin2;~Ruoxi_Jia1", "gender": "M;;M;F;M;", "homepage": "https://yizeng623.github.io/;;https://wonpark.io/;https://web.eecs.umich.edu/~zmao/;http://www.jinming.tech/;https://ruoxijia.info/", "dblp": "75/148;;;;;147/5355-1", "google_scholar": "slUNmHQAAAAJ;;;Ba_Ci9UAAAAJ;YdxdTtkAAAAJ;JCrug-YAAAAJ", "orcid": "0000-0002-6901-9194;;;;;", "linkedin": "chnyizeng/;;;;;", "or_profile": "~Yi_Zeng3;~Si_Chen5;~Won_Park1;~Zhuoqing_Mao1;~Ming_Jin2;~Ruoxi_Jia1", "aff": "Virginia Tech;;University of Michigan;University of Michigan;Virginia Tech;Virginia Tech", "aff_domain": "vt.edu;;umich.edu;umich.edu;vt.edu;vt.edu", "position": "PhD student;;PhD student;Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzeng2022adversarial,\ntitle={Adversarial Unlearning of Backdoors via Implicit Hypergradient},\nauthor={Yi Zeng and Si Chen and Won Park and Zhuoqing Mao and Ming Jin and Ruoxi Jia},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MeeQkFYVbzW}\n}", "github": "", "project": "", "reviewers": "mJLa;u6du;EmBg;GDHf", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;3;4", "correctness": "3;3;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "45;141;98;99", "wc_summary_review": "40;63;46;46", "wc_main_review": "238;158;676;304", "wc_review": "323;362;820;449", "wc_reply_reviewers": "18;75;182;50", "wc_reply_authors": "1373;1006;2308;1522", "reply_reviewers": "1;2;3;1", "reply_authors": "7;4;8;6", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], 
"empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.75, 34.05418476487141 ], "wc_summary_review_avg": [ 48.75, 8.584142356694699 ], "wc_main_review_avg": [ 344.0, 198.5295947711575 ], "wc_review_avg": [ 488.5, 196.75174713328468 ], "wc_reply_reviewers_avg": [ 81.25, 61.57667983904296 ], "wc_reply_authors_avg": [ 1552.25, 475.0244072676687 ], "reply_reviewers_avg": [ 1.75, 0.82915619758885 ], "reply_authors_avg": [ 6.25, 1.479019945774904 ], "replies_avg": [ 41, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 233, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4522682349845084821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=MeeQkFYVbzW", "email": "vt.edu;;umich.edu;umich.edu;vt.edu;vt.edu", "author_num": 6, "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "Virginia Tech;University of Michigan", "aff_unique_dep": ";", "aff_unique_url": "https://www.vt.edu;https://www.umich.edu", "aff_unique_abbr": "VT;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Mh40mAxxAUz", "title": "Bounding Membership Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Differential Privacy (DP) is the de facto standard for reasoning about the privacy guarantees of a training algorithm. Despite the empirical observation that DP reduces the vulnerability of models to existing membership inference (MI) attacks, a theoretical underpinning as to why this is the case is largely missing in the literature. In practice, this means that models need to be trained with differential privacy guarantees that greatly decrease their accuracy. In this paper, we provide a tighter bound on the accuracy of any membership inference adversary when a training algorithm provides $\\epsilon$-DP. Our bound informs the design of a novel privacy amplification scheme, where an effective training set is sub-sampled from a larger set prior to the beginning of training, to greatly reduce the bound on MI accuracy. As a result, our scheme enables $\\epsilon$-DP users to employ looser differential privacy guarantees when training their model to limit the success of any MI adversary; this, in turn, ensures that the model's accuracy is less impacted by the privacy guarantee. 
Finally, we discuss the implications of our MI bound on machine unlearning.", "keywords": "differential privacy;membership inference", "primary_area": "", "supplementary_material": "", "author": "Anvith Thudi;I Shumailov;Franziska Boenisch;Nicolas Papernot", "authorids": "~Anvith_Thudi1;~I_Shumailov1;~Franziska_Boenisch1;~Nicolas_Papernot1", "gender": "M;Unspecified;;M", "homepage": "https://www.anvith.com;https://www.cl.cam.ac.uk/~is410/;;https://www.papernot.fr", "dblp": ";213/8587;;162/1405", "google_scholar": "https://scholar.google.ca/citations?hl=en;https://scholar.google.co.uk/citations?hl=en;;cGxq0cMAAAAJ", "orcid": ";;;", "linkedin": "anvith-thudi-54b5621bb/?originalSubdomain=ca;ilia-shumailov/;;nicolaspapernot", "or_profile": "~Anvith_Thudi1;~I_Shumailov1;~Franziska_Boenisch1;~Nicolas_Papernot1", "aff": "University of Toronto;Vector Institute;;Google", "aff_domain": "utoronto.ca;vectorinstitute.ai;;google.com", "position": "Undergrad student;Fellowship;;Research Scientist", "bibtex": "@misc{\nthudi2022bounding,\ntitle={Bounding Membership Inference},\nauthor={Anvith Thudi and I Shumailov and Franziska Boenisch and Nicolas Papernot},\nyear={2022},\nurl={https://openreview.net/forum?id=Mh40mAxxAUz}\n}", "github": "", "project": "", "reviewers": "gi1v;rm9k;rQvh;JCNw", "site": "https://openreview.net/forum?id=Mh40mAxxAUz", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;2;3", "correctness": "4;4;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "43;39;64;51", "wc_summary_review": "32;38;58;35", "wc_main_review": "246;231;332;174", "wc_review": "321;308;454;260", "wc_reply_reviewers": "74;0;0;0", "wc_reply_authors": "765;454;873;291", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 49.25, 9.54921462739214 ], "wc_summary_review_avg": [ 40.75, 10.18270592720815 ], "wc_main_review_avg": [ 245.75, 56.57903763762689 ], "wc_review_avg": [ 335.75, 71.95267541933379 ], "wc_reply_reviewers_avg": [ 18.5, 32.04293994002423 ], "wc_reply_authors_avg": [ 595.75, 233.70855247508595 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": -0.08084520834544431, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9331765561502181958&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Toronto;Vector Institute;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.google.com", "aff_unique_abbr": "U of T;Vector Institute;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "id": "Mi9xQBeZxY5", "title": "Towards Feature Overcorrelation in Deeper Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph neural networks (GNNs) have achieved great success in graph representation learning, which has tremendously facilitated various real-world applications. 
Nevertheless, the performance of GNNs significantly deteriorates when the depth increases. Recent research has attributed this phenomenon to the oversmoothing issue, which indicates that the learned node representations are highly indistinguishable. In this paper, we observe a new issue in deeper GNNs, i.e., feature overcorrelation, and perform a thorough study to deepen our understanding of this issue. In particular, we demonstrate the existence of feature overcorrelation in deeper GNNs, reveal potential reasons leading to this issue, and validate that overcorrelation and oversmoothing present different patterns though they are related. Since feature overcorrelation indicates that GNNs encode less information, which can harm downstream tasks, it is of great significance to mitigate this issue. Therefore, we propose DeCorr, a general framework to effectively reduce feature correlation for deeper GNNs. Experimental results on various datasets demonstrate that DeCorr can help train deeper GNNs effectively and is complementary to methods tackling oversmoothing.", "keywords": "Graph Neural Networks;Feature Overcorrelation", "primary_area": "", "supplementary_material": "", "author": "Wei Jin;Xiaorui Liu;Yao Ma;Charu Aggarwal;Jiliang Tang", "authorids": "~Wei_Jin4;~Xiaorui_Liu1;~Yao_Ma3;charu@us.ibm.com;~Jiliang_Tang1", "gender": ";M;M;;M", "homepage": "http://www.cs.emory.edu/~wjin30/;https://sites.google.com/ncsu.edu/xiaorui/;https://yaoma24.github.io/;;https://www.cse.msu.edu/~tangjili/", "dblp": "66/2173-9;172/0995;212/7871.html;;64/10812", "google_scholar": "eWow24EAAAAJ;NhvN1KoAAAAJ;wf9TTOIAAAAJ;;WtzKMWAAAAAJ", "orcid": ";0000-0001-8217-5688;;;0000-0001-7125-3898", "linkedin": ";;;;", "or_profile": "~Wei_Jin4;~Xiaorui_Liu1;~Yao_Ma3;charu@us.ibm.com;~Jiliang_Tang1", "aff": "Michigan State University;Michigan State University;New Jersey Institute of Technology;;Michigan State University", "aff_domain": "msu.edu;msu.edu;njit.edu;;msu.edu", "position": "PhD student;PhD student;Assistant Professor;;Associate Professor", "bibtex": "@misc{\njin2022towards,\ntitle={Towards Feature Overcorrelation in Deeper Graph Neural Networks},\nauthor={Wei Jin and Xiaorui Liu and Yao Ma and Charu Aggarwal and Jiliang Tang},\nyear={2022},\nurl={https://openreview.net/forum?id=Mi9xQBeZxY5}\n}", "github": "", "project": "", "reviewers": "aWgY;EqPJ;9oHA;qKvG;uqX3", "site": "https://openreview.net/forum?id=Mi9xQBeZxY5", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;4;5;4;5", "correctness": "3;3;3;3;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "2;0;2;2;3", "wc_summary_paper": "48;63;48;83;47", "wc_summary_review": "26;42;4;54;35", "wc_main_review": "533;262;358;366;461", "wc_review": "607;367;410;503;543", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 4.4, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.8, 0.9797958971132713 ], "wc_summary_paper_avg": [ 57.8, 13.934130758680285 ], "wc_summary_review_avg": [ 32.2, 16.809521111560557 ], "wc_main_review_avg": [ 396.0, 93.0526732555277 ], "wc_review_avg": [ 486.0, 87.31093860450706 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], 
"corr_recommendation_confidence": 0.4082482904638632, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2073751683189706113&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Michigan State University;New Jersey Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.msu.edu;https://www.njit.edu", "aff_unique_abbr": "MSU;NJIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "MjbdO3_ihp", "title": "Iterative Bilinear Temporal-Spectral Fusion for Unsupervised Representation Learning in Time Series", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Unsupervised representation learning for multivariate time series has practical significances, but it is also a challenging problem because of its complex dynamics and sparse annotations. Existing works mainly adopt the framework of contrastive learning and involve the data augmentation techniques to sample positives and negatives for contrastive training. However, their designs of representation learning framework have two drawbacks. First, we revisit the augmentation methods for time series of existing works and note that they mostly use segment-level augmentation derived from time slicing, which may bring about sampling bias and incorrect optimization with false negatives due to the loss of global context. Second, they all pay no attention to incorporate the spectral information and temporal-spectral relations in feature representation. To address these problems, we propose a novel framework, namely Bilinear Temporal-Spectral Fusion (BTSF).\nIn contrast to segment-level augmentation, we utilize the instance-level augmentation by simply applying dropout on the entire time series for better preserving global context and capturing long-term dependencies. Also, an iterative bilinear temporal-spectral fusion module is devised to explicitly encode the affinities of abundant time-frequency pairs and iteratively refine representations of time series through cross-domain interactions with Spectrum-to-Time (S2T) and Time-to-Spectrum (T2S) Aggregation modules. Finally, we make sufficient assessments including alignment and uniformity to prove the effectiveness of our bilinear feature representations produced by BTSF. Extensive experiments are conducted on three major practical tasks for time series such as classification, forecasting and anomaly detection, which is the first to evaluate on all three tasks. Results shows that our BTSF achieves the superiority over the state-of-the-art methods and surpasses them by a large margin across downstream tasks. 
Code will be released.", "keywords": "Time Series;Unsupervised Learning;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/e24f9599311cfd5b6d79a5c33ab833aa4043d2be.zip", "author": "Ling Yang;Shenda Hong;Luxia Zhang", "authorids": "~Ling_Yang1;~Shenda_Hong1;~Luxia_Zhang1", "gender": "M;;F", "homepage": "https://yangling0818.github.io/;;http://dshm.bjmu.edu.cn/info/1124/1244.htm", "dblp": "01/24-6.html;;", "google_scholar": "https://scholar.google.com.hk/citations?user=sIKujqAAAAAJ;;", "orcid": "0000-0003-1905-8053;;", "linkedin": ";;", "or_profile": "~Ling_Yang1;~Shenda_Hong1;~Luxia_Zhang1", "aff": "Peking University;;National Institute of Health Data Science, Peking University", "aff_domain": "pku.edu.cn;;pku.edu.cn", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nyang2022iterative,\ntitle={Iterative Bilinear Temporal-Spectral Fusion for Unsupervised Representation Learning in Time Series},\nauthor={Ling Yang and Shenda Hong and Luxia Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=MjbdO3_ihp}\n}", "github": "", "project": "", "reviewers": "jWh3;WLjY;UjT2;XfG5;PxLL", "site": "https://openreview.net/forum?id=MjbdO3_ihp", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "4;4;4;4;5", "correctness": "4;3;3;3;3", "technical_novelty": "3;3;2;2;3", "empirical_novelty": "3;0;2;3;3", "wc_summary_paper": "73;208;53;130;110", "wc_summary_review": "46;229;75;63;21", "wc_main_review": "244;690;347;362;158", "wc_review": "363;1127;475;555;289", "wc_reply_reviewers": "0;0;95;60;0", "wc_reply_authors": "252;402;433;674;182", "reply_reviewers": "0;0;3;1;0", "reply_authors": "6;4;6;7;4", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 1.16619037896906 ], "wc_summary_paper_avg": [ 114.8, 53.86427387424804 ], "wc_summary_review_avg": [ 86.8, 73.37683558181014 ], "wc_main_review_avg": [ 360.2, 180.73892773832648 ], "wc_review_avg": [ 561.8, 296.97568924071885 ], "wc_reply_reviewers_avg": [ 31.0, 39.54743986657038 ], "wc_reply_authors_avg": [ 388.6, 170.26285560861476 ], "reply_reviewers_avg": [ 0.8, 1.1661903789690602 ], "reply_authors_avg": [ 5.4, 1.2 ], "replies_avg": [ 42, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.2500000000000001, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15991584952318745877&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "On the Optimal Memorization Power of ReLU Neural Networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7214", "id": "MkTPtnjeYTV", "poster": "", "openreview": "https://openreview.net/forum?id=MkTPtnjeYTV", "slides": "https://iclr.cc/virtual/2022/poster/7214", "video": "https://iclr.cc/virtual/2022/poster/7214", "author_site": "Gal Vardi, Gilad Yehudai, Ohad Shamir", "tldr": "", "abstract": "We study the memorization power of feedforward ReLU neural networks. 
We show that such networks can memorize any $N$ points that satisfy a mild separability assumption using $\\tilde{O}\\left(\\sqrt{N}\\right)$ parameters. Known VC-dimension upper bounds imply that memorizing $N$ samples requires $\\Omega(\\sqrt{N})$ parameters, and hence our construction is optimal up to logarithmic factors. We also give a generalized construction for networks with depth bounded by $1 \\leq L \\leq \\sqrt{N}$, for memorizing $N$ samples using $\\tilde{O}(N/L)$ parameters. This bound is also optimal up to logarithmic factors. Our construction uses weights with large bit complexity. We prove that having such a large bit complexity is both necessary and sufficient for memorization with a sub-linear number of parameters.", "keywords": "Expressivness;Memorization;Theory;VC-dimension;Deep learning theory", "primary_area": "", "supplementary_material": "", "author": "Gal Vardi;Gilad Yehudai;Ohad Shamir", "authorids": "~Gal_Vardi1;~Gilad_Yehudai2;~Ohad_Shamir1", "gender": "M;M;", "homepage": "https://sites.google.com/view/galvardi/home;;http://www.wisdom.weizmann.ac.il/~shamiro/", "dblp": "https://dblp.uni-trier.de/pid/167/9638.html;239/4344;12/5897", "google_scholar": "https://scholar.google.co.il/citations?hl=en;opVT1qkAAAAJ;all0DHsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Gal_Vardi1;~Gilad_Yehudai2;~Ohad_Shamir1", "aff": "Weizmann Institute;Weizmann Institute of Science;Weizmann Institute", "aff_domain": "weizmann.ac.il;weizmann.ac.il;weizmann.ac.il", "position": "Postdoc;PhD student;Associate Professor", "bibtex": "@inproceedings{\nvardi2022on,\ntitle={On the Optimal Memorization Power of Re{LU} Neural Networks},\nauthor={Gal Vardi and Gilad Yehudai and Ohad Shamir},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MkTPtnjeYTV}\n}", "github": "", "project": "", "reviewers": "FKzU;P138;sa2e", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;5;5", "correctness": "4;4;3", "technical_novelty": "4;3;4", "empirical_novelty": "3;0;0", "wc_summary_paper": "99;37;123", "wc_summary_review": "24;20;77", "wc_main_review": "470;364;916", "wc_review": "593;421;1116", "wc_reply_reviewers": "41;8;97", "wc_reply_authors": "180;207;372", "reply_reviewers": "1;1;1", "reply_authors": "1;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 86.33333333333333, 36.23380864453651 ], "wc_summary_review_avg": [ 40.333333333333336, 25.978623691198287 ], "wc_main_review_avg": [ 583.3333333333334, 239.17822271733314 ], "wc_review_avg": [ 710.0, 295.5480784350774 ], "wc_reply_reviewers_avg": [ 48.666666666666664, 36.736297521056144 ], "wc_reply_authors_avg": [ 253.0, 84.86459803710851 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3133149822912558797&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=MkTPtnjeYTV", "email": "weizmann.ac.il;weizmann.ac.il;weizmann.ac.il", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Weizmann 
Institute of Science", "aff_unique_dep": "", "aff_unique_url": "https://www.weizmann.org.il", "aff_unique_abbr": "Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Know Your Action Set: Learning Action Relations for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6594", "id": "MljXVdp4A3N", "poster": "", "openreview": "https://openreview.net/forum?id=MljXVdp4A3N", "slides": "https://iclr.cc/virtual/2022/poster/6594", "video": "https://iclr.cc/virtual/2022/poster/6594", "author_site": "Ayush Jain, Norio Kosaka, Kyung-Min Kim, Joseph Lim", "tldr": "", "abstract": "Intelligent agents can solve tasks in various ways depending on their available set of actions. However, conventional reinforcement learning (RL) assumes a fixed action set. This work asserts that tasks with varying action sets require reasoning of the relations between the available actions. For instance, taking a nail-action in a repair task is meaningful only if a hammer-action is also available. To learn and utilize such action relations, we propose a novel policy architecture consisting of a graph attention network over the available actions. We show that our model makes informed action decisions by correctly attending to other related actions in both value-based and policy-based RL. Consequently, it outperforms non-relational architectures on applications where the action space often varies, such as recommender systems and physical reasoning with tools and skills. Results and code at https://sites.google.com/view/varyingaction .", "keywords": "reinforcement learning;varying action space;relational reasoning", "primary_area": "", "supplementary_material": "/attachment/bb5d263ecab9babc9df0f10e53a8fcc609a8f305.zip", "author": "Ayush Jain;Norio Kosaka;Kyung-Min Kim;Joseph J Lim", "authorids": "~Ayush_Jain2;~Norio_Kosaka1;~Kyung-Min_Kim1;~Joseph_J_Lim1", "gender": ";M;M;M", "homepage": "https://ayushj240.github.io/;https://rowing0914.github.io/;;http://people.csail.mit.edu/lim/", "dblp": "131/6283-3.html;;85/8572;08/3086", "google_scholar": "-zEc_sAAAAAJ;dIpkfPAAAAAJ;https://scholar.google.com/citations?hl=en;jTnQTBoAAAAJ", "orcid": ";;0000-0003-2426-2198;", "linkedin": ";norio-kosaka-b73701117/;;", "or_profile": "~Ayush_Jain2;~Norio_Kosaka1;~Kyung-Min_Kim1;~Joseph_J_Lim1", "aff": "University of Southern California;NAVER;NAVER;Korea Advanced Institute of Science & Technology", "aff_domain": "usc.edu;navercorp.com;navercorp.com;kaist.ac.kr", "position": "PhD student;Researcher;Leader;Associate Professor", "bibtex": "@inproceedings{\njain2022know,\ntitle={Know Your Action Set: Learning Action Relations for Reinforcement Learning},\nauthor={Ayush Jain and Norio Kosaka and Kyung-Min Kim and Joseph J Lim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MljXVdp4A3N}\n}", "github": "", "project": "", "reviewers": "3F2U;mRLy;FMiS;RWVF", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;5;4;3", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "48;58;103;102", "wc_summary_review": "21;62;98;65", "wc_main_review": "455;147;472;676", "wc_review": "524;267;673;843", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "499;742;748;757", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], 
"confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 77.75, 25.00374971879218 ], "wc_summary_review_avg": [ 61.5, 27.31757675929547 ], "wc_main_review_avg": [ 437.5, 188.92392648894423 ], "wc_review_avg": [ 576.75, 211.47148152883406 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 686.5, 108.38473139700075 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5935789690770841759&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=MljXVdp4A3N", "email": "usc.edu;navercorp.com;navercorp.com;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "University of Southern California;NAVER Corporation;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usc.edu;https://www.naver.com;https://www.kaist.ac.kr", "aff_unique_abbr": "USC;NAVER;KAIST", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;South Korea" }, { "id": "Mlwe37htstv", "title": "Efficient Wasserstein and Sinkhorn Policy Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Trust-region methods based on Kullback-Leibler divergence are pervasively used to stabilize policy optimization in reinforcement learning. In this paper, we examine two natural extensions of policy optimziation with Wasserstein and Sinkhorn trust regions, namely Wasserstein policy optimization (WPO) and Sinkhorn policy optimization (SPO). Instead of restricting the policy to a parametric distribution class, we directly optimize the policy distribution and derive their close-form policy updates based on the Lagrangian duality. Theoretically, we show that WPO guarantees a monotonic performance improvement, and SPO provably converges to WPO as the entropic regularizer diminishes. 
Experiments across tabular domains and robotic locomotion tasks further demonstrate the performance improvement of both approaches, the greater robustness of WPO to sample insufficiency, and the faster convergence of SPO, over state-of-the-art policy gradient methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun Song;Chaoyue Zhao;Niao He", "authorids": "~Jun_Song1;~Chaoyue_Zhao2;~Niao_He3", "gender": ";F;", "homepage": ";https://ise.washington.edu/facultyfinder/chaoyue-zhao;http://people.inf.ethz.ch/niaohe", "dblp": ";;https://dblp.uni-trier.de/pers/h/He:Niao.html", "google_scholar": ";B1Li7HoAAAAJ;iNcA81MAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jun_Song1;~Chaoyue_Zhao2;~Niao_He1", "aff": "University of Illinois, Urbana Champaign;University of Washington, Seattle;Swiss Federal Institute of Technology", "aff_domain": ";uw.edu;ethz.ch", "position": ";Assistant Professor;Assistant Professor", "bibtex": "@misc{\nsong2022efficient,\ntitle={Efficient Wasserstein and Sinkhorn Policy Optimization},\nauthor={Jun Song and Chaoyue Zhao and Niao He},\nyear={2022},\nurl={https://openreview.net/forum?id=Mlwe37htstv}\n}", "github": "", "project": "", "reviewers": "i87P;2JvQ;PvnA;A8WK", "site": "https://openreview.net/forum?id=Mlwe37htstv", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;3;3", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "59;42;96;52", "wc_summary_review": "41;13;48;48", "wc_main_review": "321;220;251;269", "wc_review": "421;275;395;369", "wc_reply_reviewers": "2418;7;0;16", "wc_reply_authors": "3080;190;504;308", "reply_reviewers": "7;1;0;1", "reply_authors": "8;3;2;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.25, 20.40067400847335 ], "wc_summary_review_avg": [ 37.5, 14.430869689661812 ], "wc_main_review_avg": [ 265.25, 36.649522507121425 ], "wc_review_avg": [ 365.0, 55.11805511808268 ], "wc_reply_reviewers_avg": [ 610.25, 1043.7203588605523 ], "wc_reply_authors_avg": [ 1020.5, 1194.3302516473407 ], "reply_reviewers_avg": [ 2.25, 2.7726341266023544 ], "reply_authors_avg": [ 3.75, 2.48746859276655 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=705787033401690004&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Washington;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.washington.edu;https://www.ethz.ch", "aff_unique_abbr": "UIUC;UW;ETH Zurich", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Urbana-Champaign;Seattle;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Switzerland" }, { "id": "MmC5WTB-z7", "title": "A HYPOTHESIS FOR THE COGNITIVE DIFFICULTY OF IMAGES", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper proposes a hypothesis to analyze the underlying reason for the cognitive difficulty of an image from two perspectives, i.e. 
a cognitive image usually makes a DNN strongly activated by cognitive concepts; discarding massive non-cognitive concepts may also help the DNN focus on cognitive concepts. Based on this hypothesis, we use multi-variate interactions to represent cognitive concepts and non-cognitive concepts contained in an image, and further design a set of image revision operations to decrease the cognitive difficulty of the image. In experiments, we found that the revised image was usually more cognitive than the original one. Besides, we also discovered that strengthening cognitive concepts and weakening non-cognitive concepts could improve the aesthetic level of an image.", "keywords": "cognitive difficulty;multi-variate interaction", "primary_area": "", "supplementary_material": "", "author": "Xu Cheng;Xin Wang;Haotian Xue;Zhengyang Liang;Xin Jin;Quanshi Zhang", "authorids": "~Xu_Cheng1;~Xin_Wang25;~Haotian_Xue1;~Zhengyang_Liang2;~Xin_Jin1;~Quanshi_Zhang1", "gender": "F;M;M;M;M;M", "homepage": "https://cx1208.github.io/ChengXuSJTU.github.io/;https://github.com/MaxLeung99;http://jinxin.me;http://qszhang.com;https://xavihart.github.io;https://xinwang98.github.io//", "dblp": "30/828-5;;68/3340-15;http://dblp.uni-trier.de/pers/hd/z/Zhang:Quanshi;;10/5630-108.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;UGPgvcUAAAA;iFFhHK0AAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0009-0001-5086-5673;;0000-0003-3873-1653;;;", "linkedin": ";;;;haotian-xue-gatech/;", "or_profile": "~Xu_Cheng1;~Zhengyang_Liang2;~Xin_Jin1;~Quanshi_Zhang1;~Xue_Haotian1;~Wang_Xin1", "aff": "Shanghai Jiaotong University;Tongji University;Beijing Electronic Science and Technology Institute;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;tongji.edu.cn;besti.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Undergrad student;Associate Professor;Associate Professor;Undergrad student;PhD student", "bibtex": "@misc{\ncheng2022a,\ntitle={A {HYPOTHESIS} {FOR} {THE} {COGNITIVE} {DIFFICULTY} {OF} {IMAGES}},\nauthor={Xu Cheng and Xin Wang and Haotian Xue and Zhengyang Liang and Xin Jin and Quanshi Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=MmC5WTB-z7}\n}", "github": "", "project": "", "reviewers": "WPWP;xMGY;DBgF", "site": "https://openreview.net/forum?id=MmC5WTB-z7", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "4;5;4", "correctness": "1;1;3", "technical_novelty": "2;2;2", "empirical_novelty": "1;1;2", "wc_summary_paper": "61;66;81", "wc_summary_review": "27;32;13", "wc_main_review": "507;361;252", "wc_review": "595;459;346", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 1.6666666666666667, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 69.33333333333333, 8.498365855987975 ], "wc_summary_review_avg": [ 24.0, 8.04155872120988 ], "wc_main_review_avg": [ 373.3333333333333, 104.46796425486407 ], "wc_review_avg": [ 466.6666666666667, 101.79827547109474 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 
-0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ejorMbEaUfcJ:scholar.google.com/&scioq=A+HYPOTHESIS+FOR+THE+COGNITIVE+DIFFICULTY+OF+IMAGES&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Tongji University;Beijing Electronic Science and Technology Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.tongji.edu.cn;", "aff_unique_abbr": "SJTU;Tongji;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "MmXeLCOXL4R", "title": "Convolutional Networks on Enhanced Message-Passing Graph Improve Semi-Supervised Classification with Few Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Efficient message propagation is critical to node classification in sparse graphs with few labels, a problem that remains largely unaddressed until now. Recently popularized Graph Convolutional Networks (GCNs) lack the ability to propagate messages to distant nodes because of over-smoothing. Besides, GCNs with numerous parameters suffer from overfitting when labeled nodes are scarce. We attack this problem by building GCNs on an Enhanced Message-Passing Graph (EMPG). The key idea is that node classification can benefit from various variants of the original graph that are more efficient for message propagation, based upon the assumption that each variant is a potential structure as more nodes are properly labeled. Specifically, we first map nodes to a latent space through graph embedding that captures structure information. Considering node attributes together, we construct EMPG by adding connections between nodes in close proximity in the latent space. With the help of added connections, EMPG allows a node to propagate messages to the right nodes at a distance, so that GCNs on EMPG need not stack multiple layers and therefore avoid over-smoothing. However, adding connections may cause message propagation saturation or lead to overfitting. Seeing EMPG as an accumulation of the potential variants of the original graph, we apply dropout to EMPG and train GCNs on various dropout graphs. The features learned from different dropout EMPGs are aggregated to compute the final prediction. 
Experiments demonstrate a significant improvement in node classification on sparse graphs with few labels.", "keywords": "Shallow Graph Networks;Multi-Channel Message Aggregation;Data Augmentation;Over-Smoothing;Overfitting", "primary_area": "", "supplementary_material": "", "author": "Yu Song;Shan Lu;Dehong Qiu", "authorids": "m201876059@hust.edu.cn;m201976407@hust.edu.cn;~Dehong_Qiu1", "gender": ";;M", "homepage": ";;http://sse.hust.edu.cn/info/1037/4583.htm", "dblp": ";;", "google_scholar": ";;", "orcid": ";;my-orcid?orcid=0000-0003-2921-0303", "linkedin": ";;", "or_profile": "m201876059@hust.edu.cn;m201976407@hust.edu.cn;~Dehong_Qiu1", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsong2022convolutional,\ntitle={Convolutional Networks on Enhanced Message-Passing Graph Improve Semi-Supervised Classification with Few Labels},\nauthor={Yu Song and Shan Lu and Dehong Qiu},\nyear={2022},\nurl={https://openreview.net/forum?id=MmXeLCOXL4R}\n}", "github": "", "project": "", "reviewers": "Q636;cRi9;dqos", "site": "https://openreview.net/forum?id=MmXeLCOXL4R", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;4;3", "correctness": "2;2;3", "technical_novelty": "1;1;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "66;63;36", "wc_summary_review": "3;43;6", "wc_main_review": "265;220;139", "wc_review": "334;326;181", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 55.0, 13.490737563232042 ], "wc_summary_review_avg": [ 17.333333333333332, 18.190351532856337 ], "wc_main_review_avg": [ 208.0, 52.1344415909483 ], "wc_review_avg": [ 280.3333333333333, 70.31516352979791 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:soUMs_ZIH7EJ:scholar.google.com/&scioq=Convolutional+Networks+on+Enhanced+Message-Passing+Graph+Improve+Semi-Supervised+Classification+with+Few+Labels&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "MmujBClawFo", "title": "Attention: Self-Expression Is All You Need", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transformer models have achieved significant improvements in performance for various learning tasks in natural language processing and computer vision. Much of their success is attributed to the use of attention layers that capture long-range interactions among data tokens (such as words and image patches) via attention coefficients that are global and adapted to the input data at test time. In this paper we study the principles behind attention and its connections with prior art. Specifically, we show that attention builds upon a long history of prior work on manifold learning and image processing, including methods such as kernel-based regression, non-local means, locally linear embedding, subspace clustering and sparse coding. 
Notably, we show that self-attention is closely related to the notion of self-expressiveness in subspace clustering, wherein data points to be clustered are expressed as linear combinations of other points with global coefficients that are adapted to the data and capture long-range interactions among data points. We also show that heuristics in sparse self-attention can be studied in a more principled manner using prior literature on sparse coding and sparse subspace clustering. We thus conclude that the key innovations of attention mechanisms relative to prior art are the use of many learnable parameters, and multiple heads and layers.", "keywords": "Self-attention;sparse representation;subspace clustering", "primary_area": "", "supplementary_material": "", "author": "Rene Vidal", "authorids": "~Rene_Vidal1", "gender": "", "homepage": "http://www.vision.jhu.edu", "dblp": "v/ReneVidal", "google_scholar": "https://scholar.google.com/citations?hl=en", "orcid": "", "linkedin": "rene-vidal-74844928/", "or_profile": "~Rene_Vidal1", "aff": "Johns Hopkins University", "aff_domain": "jhu.edu", "position": "Professor", "bibtex": "@misc{\nvidal2022attention,\ntitle={Attention: Self-Expression Is All You Need},\nauthor={Rene Vidal},\nyear={2022},\nurl={https://openreview.net/forum?id=MmujBClawFo}\n}", "github": "", "project": "", "reviewers": "vLjt;K4hc;Upbz", "site": "https://openreview.net/forum?id=MmujBClawFo", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;3;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;0;3", "wc_summary_paper": "30;124;115", "wc_summary_review": "36;64;62", "wc_main_review": "313;361;661", "wc_review": "379;549;838", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 89.66666666666667, 42.35039026450117 ], "wc_summary_review_avg": [ 54.0, 12.754084313139327 ], "wc_main_review_avg": [ 445.0, 153.98701243936125 ], "wc_review_avg": [ 588.6666666666666, 189.4735396360722 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16329855485694074505&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "BadPre: Task-agnostic Backdoor Attacks to Pre-trained NLP Foundation Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6220", "id": "Mng8CQ9eBW", "poster": "", "openreview": "https://openreview.net/forum?id=Mng8CQ9eBW", "slides": "https://iclr.cc/virtual/2022/poster/6220", "video": "https://iclr.cc/virtual/2022/poster/6220", "author_site": "Kangjie Chen, Yuxian Meng, Xiaofei Sun, Shangwei Guo, Tianwei Zhang, Jiwei Li, Chun Fan", "tldr": "", "abstract": "Pre-trained Natural Language Processing (NLP) models, which can be 
adapted to a variety of downstream language tasks via fine-tuning, greatly accelerate the learning progress of NLP models. However, NLP models have been shown to be vulnerable to backdoor attacks. Previous NLP backdoor attacks mainly focus on one specific task. This limitation makes existing solutions less applicable to different NLP models which have been widely used in various tasks.\nIn this work, we propose BadPre, the first backdoor attack against various downstream models built upon pre-trained NLP models. BadPre can launch trojan attacks against different language tasks with the same trigger.\nThe key insight of our approach is that downstream models can inherit the security characteristics from the pre-trained models. Specifically, we apply data poisoning to the pre-trained NLP models and then run inference on the downstream models with trigger-embedded sentences. Furthermore, to fool backdoor detectors, we design a novel adversarial attack method to generate a more robust trigger.\nExperimental results indicate that our approach can effectively attack a wide range of downstream NLP tasks and exhibit significant robustness against backdoor detectors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kangjie Chen;Yuxian Meng;Xiaofei Sun;Shangwei Guo;Tianwei Zhang;Jiwei Li;Chun Fan", "authorids": "~Kangjie_Chen1;~Yuxian_Meng1;~Xiaofei_Sun1;~Shangwei_Guo1;~Tianwei_Zhang1;~Jiwei_Li1;~Chun_Fan1", "gender": "M;M;M;M;M;M;M", "homepage": "https://kangjie.me;https://yuxianmeng.github.io/;;http://www.cs.cqu.edu.cn/info/1332/5290.htm;https://personal.ntu.edu.sg/tianwei.zhang/index.html;https://nlp.stanford.edu/~bdlijiwei/;", "dblp": "204/3003;234/8585;;176/6479;77/7902-4;73/5746-1;", "google_scholar": "vEPnP6oAAAAJ;https://scholar.google.com/citations?hl=zh-CN;hIokU_IAAAAJ;wQrVkBYAAAAJ;9vpiYDIAAAAJ;PwU16JEAAAAJ;", "orcid": "0000-0001-5099-7054;;;;;;", "linkedin": ";;;;;;chunfan/", "or_profile": "~Kangjie_Chen1;~Yuxian_Meng1;~Xiaofei_Sun1;~Shangwei_Guo1;~Tianwei_Zhang1;~Jiwei_Li1;~Chun_Fan1", "aff": "Nanyang Technological University;Shannon.AI;;Chongqing University;Nanyang Technological University;Zhejiang University;Peking University", "aff_domain": "ntu.edu.sg;shannon.ai;;cqu.edu.cn;ntu.edu.sg;zju.edu.cn;pku.edu.cn", "position": "PhD student;Researcher;;Associate Professor;Assistant Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nchen2022badpre,\ntitle={BadPre: Task-agnostic Backdoor Attacks to Pre-trained {NLP} Foundation Models},\nauthor={Kangjie Chen and Yuxian Meng and Xiaofei Sun and Shangwei Guo and Tianwei Zhang and Jiwei Li and Chun Fan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Mng8CQ9eBW}\n}", "github": "", "project": "", "reviewers": "Fftf;HcrX;3Pjx;NXbj", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "5;3;4;4", "correctness": "2;3;4;3", "technical_novelty": "1;3;1;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "76;112;66;153", "wc_summary_review": "70;62;49;14", "wc_main_review": "535;351;551;205", "wc_review": "681;525;666;372", "wc_reply_reviewers": "300;133;25;11", "wc_reply_authors": "1255;1565;591;287", "reply_reviewers": "1;1;1;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 1.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 101.75, 
34.17875802307626 ], "wc_summary_review_avg": [ 48.75, 21.41699091842736 ], "wc_main_review_avg": [ 410.5, 142.31215689462374 ], "wc_review_avg": [ 561.0, 124.9419865377528 ], "wc_reply_reviewers_avg": [ 117.25, 115.59060299176572 ], "wc_reply_authors_avg": [ 924.5, 509.19028859553083 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8333333333333334, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14719408199909855554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=Mng8CQ9eBW", "email": "ntu.edu.sg;shannon.ai;;cqu.edu.cn;ntu.edu.sg;zju.edu.cn;pku.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;0;3;4", "aff_unique_norm": "Nanyang Technological University;Shannon.AI;Chongqing University;Zhejiang University;Peking University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.shannon.ai;https://www.cqu.edu.cn;https://www.zju.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "NTU;Shannon.AI;CQU;ZJU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;2;2", "aff_country_unique": "Singapore;United States;China" }, { "id": "Mo9R9oqzPo", "title": "New Definitions and Evaluations for Saliency Methods: Staying Intrinsic and Sound", "track": "main", "status": "Reject", "tldr": "", "abstract": " Saliency methods seek to provide human-interpretable explanations for the output of a machine learning model on a given input. A plethora of saliency methods exist, as well as an extensive literature on their justifications/criticisms/evaluations. This paper focuses on heat-map-based saliency methods that often provide explanations that look best to humans. It tries to introduce methods and evaluations for mask-based saliency methods that are {\em intrinsic} --- use just the training dataset and the trained net, and do not use separately trained nets, distractor distributions, human evaluations or annotations. Since a mask can be seen as a \"certificate\" justifying the net's answer, we introduce notions of {\em completeness} and {\em soundness} (the latter being the new contribution) motivated by logical proof systems. These notions allow a new evaluation of saliency methods that experimentally provides a novel and stronger justification for several heuristic tricks in the field (T.V. regularization, upscaling). 
", "keywords": "saliency;masking based methods", "primary_area": "", "supplementary_material": "/attachment/2d8d96d5a72f8f785282975a54a56f83ac587928.zip", "author": "Arushi Gupta;Nikunj Saunshi;Dingli Yu;Kaifeng Lyu;Sanjeev Arora", "authorids": "~Arushi_Gupta1;~Nikunj_Saunshi1;~Dingli_Yu1;~Kaifeng_Lyu2;~Sanjeev_Arora1", "gender": ";;;M;", "homepage": ";https://www.nikunjsaunshi.com/;https://dingliyu.net/;https://kaifeng.ac/;http://www.cs.princeton.edu/~arora/", "dblp": ";199/2236;39/578;220/3283;a/SArora", "google_scholar": ";F24vXggAAAAJ;KJLJstYAAAAJ;843JJtgAAAAJ;RUP4S68AAAAJ", "orcid": ";;0000-0002-8824-8611;;", "linkedin": ";;;;", "or_profile": "~Arushi_Gupta1;~Nikunj_Saunshi1;~Dingli_Yu1;~Kaifeng_Lyu2;~Sanjeev_Arora1", "aff": "Department of Computer Science, Princeton University;Princeton University;Princeton University;Princeton University;Princeton University", "aff_domain": "cs.princeton.edu;princeton.edu;princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\ngupta2022new,\ntitle={New Definitions and Evaluations for Saliency Methods: Staying Intrinsic and Sound},\nauthor={Arushi Gupta and Nikunj Saunshi and Dingli Yu and Kaifeng Lyu and Sanjeev Arora},\nyear={2022},\nurl={https://openreview.net/forum?id=Mo9R9oqzPo}\n}", "github": "", "project": "", "reviewers": "nxmx;4yoz;LG7F;Ge5Z", "site": "https://openreview.net/forum?id=Mo9R9oqzPo", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;2;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "101;53;50;90", "wc_summary_review": "57;29;74;40", "wc_main_review": "389;184;655;194", "wc_review": "547;266;779;324", "wc_reply_reviewers": "127;56;453;24", "wc_reply_authors": "762;421;585;265", "reply_reviewers": "1;1;2;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 73.5, 22.36626924634504 ], "wc_summary_review_avg": [ 50.0, 17.073371078963874 ], "wc_main_review_avg": [ 355.5, 191.25702601473233 ], "wc_review_avg": [ 479.0, 202.49567896624362 ], "wc_reply_reviewers_avg": [ 165.0, 170.40393187951972 ], "wc_reply_authors_avg": [ 508.25, 185.1099335530106 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kftm8bPZyO0J:scholar.google.com/&scioq=New+Definitions+and+Evaluations+for+Saliency+Methods:+Staying+Intrinsic+and+Sound&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "MpJjrfSJ-Xs", "title": "Cross-Domain Cross-Set Few-Shot Learning via Learning Compact and Aligned Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Few-shot learning (FSL) aims to recognize novel query examples with a small support set 
through leveraging prior knowledge learned from a large-scale training set. In this paper, we extend this task to a more practical setting where domain shift exists between the support set and query examples and additional unlabeled data in the target domain can be adopted in the meta-training stage. This new setting, termed cross-domain cross-set FSL (CDCS-FSL), requires the learning system not only to adapt to new classes with few examples but also to be consistent between different domains. To address this paradigm, we propose a novel approach, namely \textit{stab}PA, to learn prototypical compact and cross-domain aligned representations, so that domain shift and few-shot adaptation can be addressed simultaneously. We evaluate our approach on two new CDCS-FSL benchmarks adapted from the DomainNet and Office-Home datasets, respectively. Remarkably, our approach outperforms multiple elaborated baselines by a large margin and improves 5-shot accuracy by up to 4.7 points.", "keywords": "few-shot learning;representation learning;domain alignment", "primary_area": "", "supplementary_material": "/attachment/d8f066db2279163a1fcf15f8c1ee5addc3d55d07.zip", "author": "Wentao Chen;Zhang Zhang;Wei Wang;Liang Wang;Zilei Wang;Tieniu Tan", "authorids": "~Wentao_Chen1;~Zhang_Zhang1;~Wei_Wang4;~Liang_Wang3;~Zilei_Wang1;~Tieniu_Tan1", "gender": "M;;M;M;M;", "homepage": ";https://zhangzhang80.github.io/;http://cognn.com/;;;", "dblp": ";94/2468-1;;56/4499-1;49/1878;", "google_scholar": "Ho3o8eQAAAAJ;rnRNwEMAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.hk/citations?hl=zh-CN;", "orcid": ";0000-0001-9425-3065;;;;", "linkedin": ";;;;;", "or_profile": "~Wentao_Chen1;~Zhang_Zhang1;~Wei_Wang4;~Liang_Wang3;~Zilei_Wang1;~Tieniu_Tan1", "aff": "University of Science and Technology of China;Institute of Automation, Chinese Academy of Sciences;Beijing Institute for General Artificial Intelligence;Institute of Automation\uff0c CAS\uff0cChina;University of Science and Technology of China;", "aff_domain": "ustc.edu.cn;ia.ac.cn;bigai.ai;ia.ac.cn;ustc.edu.cn;", "position": "PhD student;Associate Professor;Research Scientist;Full Professor;Associate Professor;", "bibtex": "@misc{\nchen2022crossdomain,\ntitle={Cross-Domain Cross-Set Few-Shot Learning via Learning Compact and Aligned Representations},\nauthor={Wentao Chen and Zhang Zhang and Wei Wang and Liang Wang and Zilei Wang and Tieniu Tan},\nyear={2022},\nurl={https://openreview.net/forum?id=MpJjrfSJ-Xs}\n}", "github": "", "project": "", "reviewers": "aCBV;nd5a;9ksV", "site": "https://openreview.net/forum?id=MpJjrfSJ-Xs", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;4", "correctness": "3;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;0;2", "wc_summary_paper": "144;57;147", "wc_summary_review": "95;91;49", "wc_main_review": "519;291;323", "wc_review": "758;439;519", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1078;563;1193", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 116.0, 41.737273509418415 ], "wc_summary_review_avg": [ 78.33333333333333, 20.805982045769646 ], "wc_main_review_avg": [ 377.6666666666667, 100.7880063411427 ], "wc_review_avg": [ 572.0, 135.5162966829697 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 944.6666666666666, 273.93227062339497 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.5, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12324415757438647098&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "University of Science and Technology of China;Chinese Academy of Sciences;Beijing Institute for General Artificial Intelligence", "aff_unique_dep": ";Institute of Automation;", "aff_unique_url": "http://www.ustc.edu.cn;http://www.ia.cas.cn;http://www.bigaiai.org/", "aff_unique_abbr": "USTC;CAS;BIGAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "MqEcDNQwOSA", "title": "Reconstructing Word Embeddings via Scattered $k$-Sub-Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "The performance of modern neural language models relies heavily on the diversity of the vocabularies. Unfortunately, the language models tend to cover more vocabularies, the embedding parameters in the language models such as multilingual models used to occupy more than a half of their entire learning parameters. To solve this problem, we aim to devise a novel embedding structure to lighten the network without considerably performance degradation. To reconstruct $N$ embedding vectors, we initialize $k$ bundles of $M (\\ll N)$ $k$-sub-embeddings to apply Cartesian product. Furthermore, we assign $k$-sub-embedding using the contextual relationship between tokens from pretrained language models. We adjust our $k$-sub-embedding structure to masked language models to evaluate proposed structure on downstream tasks. 
Our experimental results show that sub-embeddings for the language models, compressed by over 99.9$\\%$, performed comparably with the original embedding structure on the GLUE and XNLI benchmarks.", "keywords": "word embedding;natural language understanding;weight sharing;contextual embedding", "primary_area": "", "supplementary_material": "/attachment/dfd514b003e29e12dd836fe51900bbe314ed9c6b.zip", "author": "Soonyong Hwang;Byung-Ro Moon", "authorids": "~Soonyong_Hwang1;~Byung-Ro_Moon1", "gender": "M;M", "homepage": "https://soar.snu.ac.kr;", "dblp": ";", "google_scholar": ";https://scholar.google.com/citations?hl=ko", "orcid": ";", "linkedin": ";", "or_profile": "~Soonyong_Hwang1;~Byung-Ro_Moon1", "aff": "Seoul National University;Seoul National University", "aff_domain": "cse.snu.ac.kr;snu.ac.kr", "position": "MS student;Full Professor", "bibtex": "@misc{\nhwang2022reconstructing,\ntitle={Reconstructing Word Embeddings via Scattered \\$k\\$-Sub-Embedding},\nauthor={Soonyong Hwang and Byung-Ro Moon},\nyear={2022},\nurl={https://openreview.net/forum?id=MqEcDNQwOSA}\n}", "github": "", "project": "", "reviewers": "stnW;SoR3;hAWX;6Swi", "site": "https://openreview.net/forum?id=MqEcDNQwOSA", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;5;4", "correctness": "2;2;2;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;1;2;1", "wc_summary_paper": "125;108;91;165", "wc_summary_review": "98;44;36;32", "wc_main_review": "481;417;411;350", "wc_review": "704;569;538;547", "wc_reply_reviewers": "0;104;0;0", "wc_reply_authors": "243;377;164;210", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 122.25, 27.453369556395078 ], "wc_summary_review_avg": [ 52.5, 26.622359023948274 ], "wc_main_review_avg": [ 414.75, 46.370114297896656 ], "wc_review_avg": [ 589.5, 67.06153890271233 ], "wc_reply_reviewers_avg": [ 26.0, 45.033320996790806 ], "wc_reply_authors_avg": [ 248.5, 79.31740036082877 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-ytgY5W-o54J:scholar.google.com/&scioq=Reconstructing+Word+Embeddings+via+Scattered+%24k%24-Sub-Embedding&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "iFlood: A Stable and Effective Regularizer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6432", "id": "MsHnJPaBUZE", "poster": "", "openreview": "https://openreview.net/forum?id=MsHnJPaBUZE", "slides": "https://iclr.cc/virtual/2022/poster/6432", "video": "https://iclr.cc/virtual/2022/poster/6432", "author_site": "Yuexiang Xie, Zhen WANG, Yaliang Li, Ce Zhang, Jingren Zhou, Bolin Ding", "tldr": "", "abstract": "Various regularization methods have been designed to prevent overfitting of machine learning models. 
Among them, a surprisingly simple yet effective one, called Flooding, was proposed recently, which directly constrains the training loss on average to stay at a given level. However, our further studies uncover that the design of the loss function of Flooding can lead to a discrepancy between its objective and implementation, and cause an instability issue. To resolve these issues, in this paper, we propose a new regularizer, called individual Flood (denoted as iFlood). With instance-level constraints on training loss, iFlood encourages the trained models to better fit the under-fitted instances while suppressing the confidence on over-fitted ones. We theoretically show that the design of iFlood can be intrinsically connected with removing the noise or bias in training data, which makes it suitable for a variety of applications to improve the generalization performances of learned models. We also theoretically link iFlood to some other regularizers by comparing the inductive biases they introduce. Our experimental results on both image classification and language understanding tasks confirm that models learned with iFlood can stably converge to solutions with better generalization ability, and behave consistently at the instance level.", "keywords": "overfitting;regularizer", "primary_area": "", "supplementary_material": "", "author": "Yuexiang Xie;Zhen WANG;Yaliang Li;Ce Zhang;Jingren Zhou;Bolin Ding", "authorids": "~Yuexiang_Xie1;~Zhen_WANG2;~Yaliang_Li1;~Ce_Zhang1;~Jingren_Zhou1;~Bolin_Ding3", "gender": "M;M;M;;M;M", "homepage": "https://xieyxclack.github.io/;https://joneswong.github.io/;https://sites.google.com/site/yaliangli/;;;https://bolinding.github.io/", "dblp": "232/2045;78/6727-36;https://dblp.org/pers/hd/l/Li:Yaliang;97/919;84/2644;46/3522.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;e5CqTBMAAAAJ;CCPBcdYAAAAJ;;;AjYkTi8AAAAJ", "orcid": "0009-0005-6545-7882;0000-0002-8140-8782;0000-0002-4204-6096;;;", "linkedin": ";;;;;bolin-ding-50a0119/", "or_profile": "~Yuexiang_Xie1;~Zhen_WANG2;~Yaliang_Li1;~Ce_Zhang1;~Jingren_Zhou1;~Bolin_Ding3", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;University of Chicago;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;uchicago.edu;alibaba-inc.com;alibaba-inc.com", "position": "Staff;Researcher;Staff Engineer;Associate Professor;Researcher;Senior Director", "bibtex": "@inproceedings{\nxie2022iflood,\ntitle={iFlood: A Stable and Effective Regularizer},\nauthor={Yuexiang Xie and Zhen WANG and Yaliang Li and Ce Zhang and Jingren Zhou and Bolin Ding},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MsHnJPaBUZE}\n}", "github": "", "project": "", "reviewers": "bQVY;asHY;WRr1;fRos", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;4;4;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "57;55;83;43", "wc_summary_review": "37;36;59;23", "wc_main_review": "116;219;249;492", "wc_review": "210;310;391;558", "wc_reply_reviewers": "59;22;0;20", "wc_reply_authors": "754;613;520;611", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 59.5, 14.585952145814822 ], "wc_summary_review_avg": [ 38.75, 12.93010054098575 ], 
"wc_main_review_avg": [ 269.0, 137.87494333634376 ], "wc_review_avg": [ 367.25, 127.43110883924695 ], "wc_reply_reviewers_avg": [ 25.25, 21.299941314473145 ], "wc_reply_authors_avg": [ 624.5, 83.67347249875554 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3143223974306371618&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=MsHnJPaBUZE", "email": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;uchicago.edu;alibaba-inc.com;alibaba-inc.com", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Alibaba Group;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;https://www.uchicago.edu", "aff_unique_abbr": "Alibaba;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "From Stars to Subgraphs: Uplifting Any GNN with Local Structure Awareness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6178", "id": "Mspk_WYKoEH", "poster": "", "openreview": "https://openreview.net/forum?id=Mspk_WYKoEH", "slides": "https://iclr.cc/virtual/2022/poster/6178", "video": "https://iclr.cc/virtual/2022/poster/6178", "author_site": "Lingxiao Zhao, Wei Jin, Leman Akoglu, Neil Shah", "tldr": "", "abstract": "Message Passing Neural Networks (MPNNs) are a common type of Graph Neural Network (GNN), in which each node\u2019s representation is computed recursively by aggregating representations (\u201cmessages\u201d) from its immediate neighbors akin to a star-shaped pattern. MPNNs are appealing for being efficient and scalable, however their expressiveness is upper-bounded by the 1st-order Weisfeiler-Lehman isomorphism test (1-WL). In response, prior works propose highly expressive models at the cost of scalability and sometimes generalization performance. Our work stands between these two regimes: we introduce a general framework to uplift any MPNN to be more expressive, with limited scalability overhead and greatly improved practical performance. We achieve this by extending local aggregation in MPNNs from star patterns to general subgraph patterns (e.g., k-egonets): in our framework, each node representation is computed as the encoding of a surrounding induced subgraph rather than encoding of immediate neighbors only (i.e. a star). We choose the subgraph encoder to be a GNN (mainly MPNNs, considering scalability) to design a general framework that serves as a wrapper to uplift any GNN. We call our proposed method GNN-AK (GNN As Kernel), as the framework resembles a convolutional neural network by replacing the kernel with\nGNNs. Theoretically, we show that our framework is strictly more powerful than 1&2-WL, and is not less powerful than 3-WL. We also design subgraph sampling strategies which greatly reduce memory footprint and improve speed while maintaining performance. 
Our method sets new state-of-the-art performance by large margins for several well-known graph ML tasks; specifically, 0.08 MAE on ZINC,\n74.79% and 86.887% accuracy on CIFAR10 and PATTERN respectively.", "keywords": "Graph Neural Networks;Expressiveness;Message Passing Neural Network;Graph Classification", "primary_area": "", "supplementary_material": "", "author": "Lingxiao Zhao;Wei Jin;Leman Akoglu;Neil Shah", "authorids": "~Lingxiao_Zhao1;~Wei_Jin4;~Leman_Akoglu3;~Neil_Shah2", "gender": "M;;F;M", "homepage": "http://lingxiaozhao.com/;http://www.cs.emory.edu/~wjin30/;http://www.andrew.cmu.edu/user/lakoglu/;http://nshah.net", "dblp": ";66/2173-9;02/6979.html;71/7771", "google_scholar": "QKslW6EAAAAJ;eWow24EAAAAJ;4ITkr_kAAAAJ;Qut69OgAAAAJ", "orcid": ";;;0000-0003-3261-8430", "linkedin": ";;;", "or_profile": "~Lingxiao_Zhao1;~Wei_Jin4;~Leman_Akoglu3;~Neil_Shah2", "aff": "Carnegie Mellon University;Michigan State University;Carnegie Mellon University;Snap Inc.", "aff_domain": "andrew.cmu.edu;msu.edu;cmu.edu;snap.com", "position": "PhD student;PhD student;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nzhao2022from,\ntitle={From Stars to Subgraphs: Uplifting Any {GNN} with Local Structure Awareness},\nauthor={Lingxiao Zhao and Wei Jin and Leman Akoglu and Neil Shah},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Mspk_WYKoEH}\n}", "github": "", "project": "", "reviewers": "KytV;LzXw;MkR2;oHnh", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;5;5;4", "correctness": "1;3;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "127;75;143;54", "wc_summary_review": "45;34;116;52", "wc_main_review": "270;200;1016;775", "wc_review": "442;309;1275;881", "wc_reply_reviewers": "36;23;793;524", "wc_reply_authors": "654;510;3153;1825", "reply_reviewers": "1;1;2;2", "reply_authors": "2;2;7;5", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 99.75, 36.46488036453705 ], "wc_summary_review_avg": [ 61.75, 31.971667144520318 ], "wc_main_review_avg": [ 565.25, 341.96152926901004 ], "wc_review_avg": [ 726.75, 380.77839683994677 ], "wc_reply_reviewers_avg": [ 344.0, 328.5977784465379 ], "wc_reply_authors_avg": [ 1535.5, 1064.0499283398312 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 4.0, 2.1213203435596424 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 210, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4598272290624376922&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Mspk_WYKoEH", "email": "andrew.cmu.edu;msu.edu;cmu.edu;snap.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Carnegie Mellon University;Michigan State University;Snap Inc.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.msu.edu;https://www.snapinc.com", "aff_unique_abbr": "CMU;MSU;Snap", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Muwg-ncP_ec", "title": "Exact Stochastic Newton Method for Deep Learning: the feedforward networks case.", "track": 
"main", "status": "Reject", "tldr": "", "abstract": "The inclusion of second-order information into Deep Learning optimization has drawn consistent interest as a way forward to improve upon gradient descent methods. Estimating the second-order update is often convoluted and computationally expensive, which drastically limits its usage scope and forces the use of various truncations and approximations.\nThis work demonstrates that it is possible to solve the Newton direction in the stochastic case exactly. We consider feedforward networks as a base model, build a second-order Lagrangian which we call Sifrian, and provide a closed-form formula for the exact stochastic Newton direction under some monotonicity and regularization conditions. We propose a convexity correction to escape saddle points, and we reconsider the intrinsic stochasticity of the online learning process to improve upon the formulas. We finally compare the performance of the developed solution with well-established training methods and show its viability as a training method for Deep Learning.", "keywords": "Deep Learning;Second-order Optimization;Newton Method;Sifrian;Hessian;Exact Stochastic Newton;Saddle-Free Newton;Non-Convex Optimization", "primary_area": "", "supplementary_material": "/attachment/2b919f6ab6703e656f25ab6c91e77b531edd3a42.zip", "author": "Fares B. Mehouachi;Chaouki Kasmi", "authorids": "~Fares_B._Mehouachi1;~Chaouki_Kasmi1", "gender": ";M", "homepage": ";", "dblp": ";", "google_scholar": ";1nuI3DcAAAAJ", "orcid": ";", "linkedin": ";https://ae.linkedin.com/in/kasmichaouki", "or_profile": "~Fares_B._Mehouachi1;~Chaouki_Kasmi1", "aff": ";Directed Energy Research Centre, Technology Innovation Institute, Abu Dhabi, UAE", "aff_domain": ";tii.ae", "position": ";Chief Researcher", "bibtex": "@misc{\nmehouachi2022exact,\ntitle={Exact Stochastic Newton Method for Deep Learning: the feedforward networks case.},\nauthor={Fares B. 
Mehouachi and Chaouki Kasmi},\nyear={2022},\nurl={https://openreview.net/forum?id=Muwg-ncP_ec}\n}", "github": "", "project": "", "reviewers": "CD8M;wxhq;Qnsb;3eEq", "site": "https://openreview.net/forum?id=Muwg-ncP_ec", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;4;2", "correctness": "2;2;3;4", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "44;87;129;60", "wc_summary_review": "21;105;11;81", "wc_main_review": "470;762;427;81", "wc_review": "535;954;567;222", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "773;817;759;179", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.0, 32.1947200640105 ], "wc_summary_review_avg": [ 54.5, 39.58219296602956 ], "wc_main_review_avg": [ 435.0, 241.62677831730488 ], "wc_review_avg": [ 569.5, 259.7080091179323 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 632.0, 262.4137953690697 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2127765329509576607&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Technology Innovation Institute", "aff_unique_dep": "Directed Energy Research Centre", "aff_unique_url": "https://www.tii.ae", "aff_unique_abbr": "TII", "aff_campus_unique_index": "0", "aff_campus_unique": "Abu Dhabi", "aff_country_unique_index": "0", "aff_country_unique": "United Arab Emirates" }, { "title": "Wisdom of Committees: An Overlooked Approach To Faster and More Accurate Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6397", "id": "MvO2t0vbs4-", "poster": "", "openreview": "https://openreview.net/forum?id=MvO2t0vbs4-", "slides": "https://iclr.cc/virtual/2022/poster/6397", "video": "https://iclr.cc/virtual/2022/poster/6397", "author_site": "Xiaofang Wang, Dan Kondratyuk, Eric Christiansen, Kris Kitani, Yair Movshovitz-Attias, Elad Eban", "tldr": "", "abstract": "Committee-based models (ensembles or cascades) construct models by combining existing pre-trained ones. While ensembles and cascades are well-known techniques that were proposed before deep learning, they are not considered a core building block of deep model architectures and are rarely compared to in recent literature on developing efficient models. In this work, we go back to basics and conduct a comprehensive analysis of the efficiency of committee-based models. We find that even the most simplistic method for building committees from existing, independently pre-trained models can match or exceed the accuracy of state-of-the-art models while being drastically more efficient. These simple committee-based models also outperform sophisticated neural architecture search methods (e.g., BigNAS). These findings hold true for several tasks, including image classification, video classification, and semantic segmentation, and various architecture families, such as ViT, EfficientNet, ResNet, MobileNetV2, and X3D. 
Our results show that an EfficientNet cascade can achieve a 5.4x speedup over B7 and a ViT cascade can achieve a 2.3x speedup over ViT-L-384 while being equally accurate.", "keywords": "Ensemble;Cascade;Efficiency", "primary_area": "", "supplementary_material": "", "author": "Xiaofang Wang;Dan Kondratyuk;Eric Christiansen;Kris M. Kitani;Yair Movshovitz-Attias;Elad Eban", "authorids": "~Xiaofang_Wang1;dankondratyuk@google.com;~Eric_Christiansen1;~Kris_M._Kitani1;~Yair_Movshovitz-Attias1;~Elad_Eban1", "gender": "M;;;M;M;M", "homepage": "http://www.cs.cmu.edu/~xiaofan2/;;;http://www.cs.cmu.edu/~kkitani/;https://research.google/people/YairMovshovitzAttias/;http://www.eladeban.name/", "dblp": ";;135/6105;42/163;http://dblp.uni-trier.de/pers/hd/m/Movshovitz=Attias:Yair;36/8530", "google_scholar": "YQomDVsAAAAJ;;;yv3sH74AAAAJ;https://scholar.google.co.il/citations?hl=en;jW80JWEAAAAJ", "orcid": ";;;0000-0002-9389-4060;0000-0002-7736-6444;", "linkedin": ";;;;;elade/", "or_profile": "~Xiaofang_Wang1;dankondratyuk@google.com;~Eric_Christiansen1;~Kris_M._Kitani1;~Yair_Movshovitz-Attias1;~Elad_Eban1", "aff": "Carnegie Mellon University;;;Carnegie Mellon University;Google Research;Google", "aff_domain": "cmu.edu;;;cmu.edu;google.com;google.com", "position": "PhD student;;;Associate Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nwang2022wisdom,\ntitle={Wisdom of Committees: An Overlooked Approach To Faster and More Accurate Models},\nauthor={Xiaofang Wang and Dan Kondratyuk and Eric Christiansen and Kris M. Kitani and Yair Movshovitz-Attias and Elad Eban},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=MvO2t0vbs4-}\n}", "github": "", "project": "", "reviewers": "MpvP;KTja;62w1", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "3;3;2", "wc_summary_paper": "71;89;19", "wc_summary_review": "48;65;25", "wc_main_review": "686;528;517", "wc_review": "805;682;561", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1719;1009;1784", "reply_reviewers": "0;0;0", "reply_authors": "5;2;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 59.666666666666664, 29.67977238606942 ], "wc_summary_review_avg": [ 46.0, 16.391054470858997 ], "wc_main_review_avg": [ 577.0, 77.20535387307454 ], "wc_review_avg": [ 682.6666666666666, 99.61369829942511 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1504.0, 351.0223164795462 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.6666666666666665, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16247236605974061666&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=MvO2t0vbs4-", "email": "cmu.edu;;;cmu.edu;google.com;google.com", "author_num": 6, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.cmu.edu;https://research.google", "aff_unique_abbr": "CMU;Google Research", "aff_campus_unique_index": "1;1", 
"aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Mvf5zr2qs6", "title": "Bias Decay Matters : Improving Large Batch Optimization with Connectivity Sharpness", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As deep learning becomes computationally intensive, the data parallelism is an essential option for the efficient training of high-performance models. Accordingly, the recent studies deal with the methods for increasing batch size in training the model. Many recent studies focused on learning rate, which determines the noise scale of parameter updates~\\citep{goyal2017accurate, you2017large, You2020Large} and found that a high learning rate is essential for maintaining generalization performance and flatness of the local minimizers~\\citep{Jastrzebski2020The, cohen2021gradient, lewkowycz2020large}. But to fill the performance gap that still exists in the large batch optimization, we study a method to directly control the flatness of local minima. Toward this, we define yet another sharpness measure called \\textit{Connectivity sharpness}, a reparameterization invariant, structurally separable sharpness measure. Armed with this measure, we experimentally found the standard \\textit{no bias decay heuristic}~\\citep{goyal2017accurate, he2019bag}, which recommends the bias parameters and $\\gamma$ and $\\beta$ in BN layers are left unregularized in training, is a crucial reason for performance degradation in large batch optimization. To mitigate this issue, we propose simple bias decay methods including a novel adaptive one and found that this simple remedy can fill a large portion of the performance gaps that occur in large batch optimization. ", "keywords": "large batch optimization;sharpness/flatness", "primary_area": "", "supplementary_material": "", "author": "SungYub Kim;Sihwan Park;Yong-Deok Kim;Eunho Yang", "authorids": "~SungYub_Kim1;~Sihwan_Park1;yd.mlg.kim@samsung.com;~Eunho_Yang1", "gender": "M;;;M", "homepage": "https://sungyubkim.github.io;https://siihwanpark.github.io;;https://sites.google.com/site/hleehome2/", "dblp": "236/4532;330/2380-1;;96/2621", "google_scholar": "m2rhgrkAAAAJ;https://scholar.google.com/citations?;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~SungYub_Kim1;~Sihwan_Park1;yd.mlg.kim@samsung.com;~Eunho_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;;kaist.ac.kr", "position": "PhD student;MS student;;Associate Professor", "bibtex": "@misc{\nkim2022bias,\ntitle={Bias Decay Matters : Improving Large Batch Optimization with Connectivity Sharpness},\nauthor={SungYub Kim and Sihwan Park and Yong-Deok Kim and Eunho Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=Mvf5zr2qs6}\n}", "github": "", "project": "", "reviewers": "VpB6;9x8U;6qpr;2CkH", "site": "https://openreview.net/forum?id=Mvf5zr2qs6", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;4;3", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "137;82;29;36", "wc_summary_review": "115;71;66;40", "wc_main_review": "681;396;227;75", "wc_review": "933;549;322;151", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "354;154;251;67", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], 
"correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.0, 43.2030091544559 ], "wc_summary_review_avg": [ 73.0, 26.953663943887108 ], "wc_main_review_avg": [ 344.75, 224.9003946194848 ], "wc_review_avg": [ 488.75, 292.77497758517546 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 206.5, 107.18325428909127 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MDc3iYFUCAkJ:scholar.google.com/&scioq=Bias+Decay+Matters+:+Improving+Large+Batch+Optimization+with+Connectivity+Sharpness&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "MvtLspSX324", "title": "Go with the Flow: the distribution of information processing in multi-path networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The architectures of convolution neural networks (CNN) have a great impact on the predictive performance and efficiency of the model.\nYet, the development of these architectures is still driven by trial and error, making the design of novel models a costly endeavor. \nTo move towards a more guided process, the impact of design decisions on information processing must be understood better. \nThis work contributes by analyzing the processing of the information in neural architectures with parallel pathways.\nUsing logistic regression probes and similarity indices, we characterize the role of different pathways in the network during the inference process.\nIn detail, we find that similar sized pathways advance the solution quality at a similar pace, with high redundancy.\nOn the other hand, shorter pathways dominate longer ones by majorly transporting (and improving) the main signal, while longer pathways do not advance the solution quality directly. 
\nAdditionally, we explore the situation in which networks start to ``skip'' layers and how the skipping of layers is expressed.", "keywords": "Deep Learning;Multi-path Networks;Similarity of Representations;Regression Probe;Centered Kernel Alignment", "primary_area": "", "supplementary_material": "", "author": "Mats Leon Richter;Krupal Shah;Anna Wiedenroth;Saketh Bachu;Ulf Krumnack", "authorids": "~Mats_Leon_Richter1;~Krupal_Shah3;~Anna_Wiedenroth1;~Saketh_Bachu1;~Ulf_Krumnack1", "gender": "M;M;;M;", "homepage": ";;;https://sakethbachu.github.io/;", "dblp": "245/2691.html;;;;15/778", "google_scholar": "xtlV5SAAAAAJ;;;HfhRAl0AAAAJ;", "orcid": ";;;;0000-0003-1976-8186", "linkedin": "https://de.linkedin.com/in/mats-richter-879609154;krupal-shah-09;anna-richter-25847a1b6;saketh-bachu-7133ab171/;", "or_profile": "~Mats_Leon_Richter1;~Krupal_Shah3;~Anna_Wiedenroth1;~Saketh_Bachu1;~Ulf_Krumnack1", "aff": "Universit\u00e4t Osnabr\u00fcck;University of Osnabr\u00fcck;Universit\u00e4t Osnabr\u00fcck;Visvesvaraya National Institute of Technology;Institute of Cognitive Science, Osnabr\u00fcck University, Universit\u00e4t Osnabr\u00fcck", "aff_domain": "uni-osnabrueck.de;uos.de;uni-osnabrueck.de;vnit.ac.in;ikw.uni-osnabrueck.de", "position": "PhD student;MS student;MS student;Undergrad student;Postdoc", "bibtex": "@misc{\nrichter2022go,\ntitle={Go with the Flow: the distribution of information processing in multi-path networks},\nauthor={Mats Leon Richter and Krupal Shah and Anna Wiedenroth and Saketh Bachu and Ulf Krumnack},\nyear={2022},\nurl={https://openreview.net/forum?id=MvtLspSX324}\n}", "github": "", "project": "", "reviewers": "ac9X;Hchc;PRea", "site": "https://openreview.net/forum?id=MvtLspSX324", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;3;3", "correctness": "3;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "60;110;142", "wc_summary_review": "92;83;54", "wc_main_review": "697;373;290", "wc_review": "849;566;486", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 104.0, 33.7441352929167 ], "wc_summary_review_avg": [ 76.33333333333333, 16.21384867602041 ], "wc_main_review_avg": [ 453.3333333333333, 175.59865856232753 ], "wc_review_avg": [ 633.6666666666666, 155.72696904803897 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:abzv3i6msWEJ:scholar.google.com/&scioq=Go+with+the+Flow:+the+distribution+of+information+processing+in+multi-path+networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Osnabr\u00fcck;Visvesvaraya National Institute of Technology;Osnabr\u00fcck University", "aff_unique_dep": ";;Institute of Cognitive Science", "aff_unique_url": "https://www.uni-osnabrueck.de;https://vnit.ac.in;https://www.uni-osnabrueck.de", "aff_unique_abbr": "UOS;VNIT;Uni Osnabr\u00fcck", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Osnabr\u00fcck", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Germany;India" }, { "title": "Unsupervised Vision-Language Grammar Induction with Shared Structure Modeling", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6110", "id": "N0n_QyQ5lBF", "poster": "", "openreview": "https://openreview.net/forum?id=N0n_QyQ5lBF", "slides": "https://iclr.cc/virtual/2022/poster/6110", "video": "https://iclr.cc/virtual/2022/poster/6110", "author_site": "Bo Wan, Wenjuan Han, Zilong Zheng, Tinne Tuytelaars", "tldr": "", "abstract": "We introduce a new task, unsupervised vision-language (VL) grammar induction. Given an image-caption pair, the goal is to extract a shared hierarchical structure for both image and language simultaneously. We argue that such structured output, grounded in both modalities, is a clear step towards the high-level understanding of multimodal information. Besides challenges existing in conventional visually grounded grammar induction tasks, VL grammar induction requires a model to capture contextual semantics and perform a fine-grained alignment. To address these challenges, we propose a novel method, CLIORA, which constructs a shared vision-language constituency tree structure with context-dependent semantics for all possible phrases in different levels of the tree. It computes a matching score between each constituent and image region, trained via contrastive learning. It integrates two levels of fusion, namely at feature-level and at score-level, so as to allow fine-grained alignment. We introduce a new evaluation metric for VL grammar induction, CCRA, and show a 3.3% improvement over a strong baseline on Flickr30k Entities. We also evaluate our model via two derived tasks, i.e., language grammar induction and phrase grounding, and improve over the state-of-the-art for both.", "keywords": "Grammar Induction;Vision-Language Matching;Unsupervised Learning", "primary_area": "", "supplementary_material": "/attachment/d6b7ddd3bf6e5c1f4edc51b23b8cbe88b0db163d.zip", "author": "Bo Wan;Wenjuan Han;Zilong Zheng;Tinne Tuytelaars", "authorids": "~Bo_Wan1;~Wenjuan_Han1;~Zilong_Zheng1;~Tinne_Tuytelaars1", "gender": "M;F;M;", "homepage": "https://bobwan.w3spaces.com/;https://scholar.google.com/citations?user=rfVLLfAAAAAJ;http://zilongzheng.github.io;", "dblp": "86/4321.html;188/9071;218/5234;", "google_scholar": "_7KkpE4AAAAJ;rfVLLfAAAAAJ;9sDx70IAAAAJ;", "orcid": ";0000-0002-2327-0842;;", "linkedin": ";;;", "or_profile": "~Bo_Wan1;~Wenjuan_Han1;~Zilong_Zheng1;~Tinne_Tuytelaars1", "aff": "Google;Beijing Jiaotong University;Beijing Institute for General Artificial Intelligence;", "aff_domain": "google.com;bjtu.edu.cn;bigai.ai;", "position": "Intern;Associate Professor;Researcher;", "bibtex": "@inproceedings{\nwan2022unsupervised,\ntitle={Unsupervised Vision-Language Grammar Induction with Shared Structure Modeling},\nauthor={Bo Wan and Wenjuan Han and Zilong Zheng and Tinne Tuytelaars},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=N0n_QyQ5lBF}\n}", "github": "", "project": "", "reviewers": "cvGw;3QG1;hzHn", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;3;5", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "71;75;26", "wc_summary_review": "20;26;62", "wc_main_review": "157;191;783", "wc_review": "248;292;871", "wc_reply_reviewers": "23;0;464", "wc_reply_authors": "220;138;1294", 
"reply_reviewers": "1;0;2", "reply_authors": "1;1;3", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 57.333333333333336, 22.21611027060218 ], "wc_summary_review_avg": [ 36.0, 18.547236990991408 ], "wc_main_review_avg": [ 377.0, 287.42071370495665 ], "wc_review_avg": [ 470.3333333333333, 283.8829962423878 ], "wc_reply_reviewers_avg": [ 162.33333333333334, 213.51710834393467 ], "wc_reply_authors_avg": [ 550.6666666666666, 526.6810124628464 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12709301496756816366&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=N0n_QyQ5lBF", "email": "google.com;bjtu.edu.cn;bigai.ai;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Google;Beijing Jiao Tong University;Beijing Institute for General Artificial Intelligence", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;http://www.njtu.edu.cn/en;http://www.bigaiai.org/", "aff_unique_abbr": "Google;BJTU;BIGAI", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "title": "Bag of Instances Aggregation Boosts Self-supervised Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5909", "id": "N0uJGWDw21d", "poster": "", "openreview": "https://openreview.net/forum?id=N0uJGWDw21d", "slides": "https://iclr.cc/virtual/2022/poster/5909", "video": "https://iclr.cc/virtual/2022/poster/5909", "author_site": "Haohang Xu, Jiemin Fang, XIAOPENG ZHANG, Lingxi Xie, Xinggang Wang, Wenrui Dai, Hongkai Xiong, Qi Tian", "tldr": "", "abstract": "Recent advances in self-supervised learning have experienced remarkable progress, especially for contrastive learning based methods, which regard each image as well as its augmentations as an individual class and try to distinguish them from all other images. However, due to the large quantity of exemplars, this kind of pretext task intrinsically suffers from slow convergence and is hard for optimization. This is especially true for small-scale models, in which we find the performance drops dramatically comparing with its supervised counterpart. In this paper, we propose a simple but effective distillation strategy for unsupervised learning. The highlight is that the relationship among similar samples counts and can be seamlessly transferred to the student to boost the performance. Our method, termed as BINGO, which is short for Bag of InstaNces aGgregatiOn, targets at transferring the relationship learned by the teacher to the student. Here bag of instances indicates a set of similar samples constructed by the teacher and are grouped within a bag, and the goal of distillation is to aggregate compact representations over the student with respect to instances in a bag. 
Notably, BINGO achieves new state-of-the-art performance on small-scale models, i.e., 65.5% and 68.9% top-1 accuracies with linear evaluation on ImageNet, using ResNet-18 and ResNet-34 as the backbones respectively, surpassing baselines (52.5% and 57.4% top-1 accuracies) by a significant margin. The code is available at https://github.com/haohang96/bingo.", "keywords": "Self-supervised learning;knowledge distillation;instance bagging", "primary_area": "", "supplementary_material": "", "author": "Haohang Xu;Jiemin Fang;XIAOPENG ZHANG;Lingxi Xie;Xinggang Wang;Wenrui Dai;Hongkai Xiong;Qi Tian", "authorids": "~Haohang_Xu1;~Jiemin_Fang1;~XIAOPENG_ZHANG7;~Lingxi_Xie1;~Xinggang_Wang1;~Wenrui_Dai1;~Hongkai_Xiong1;~Qi_Tian3", "gender": "M;M;M;M;M;;M;M", "homepage": ";https://jaminfong.cn;https://sites.google.com/site/zxphistory/;http://lingxixie.com/;https://xwcv.github.io/index.htm;;http://min.sjtu.edu.cn;https://www.qitian1987.com/index.html", "dblp": "254/0948;233/1239;;123/2869;95/3056;16/5135.html;21/3569;78/1467-1.html", "google_scholar": ";-JcFoOoAAAAJ;Ud6aBAcAAAAJ;EEMm7hwAAAAJ;qNCTLV0AAAAJ;Xg8MhyAAAAAJ;bB16iN4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-4715-1338;;;;0000-0001-6732-7823;;0000-0003-4552-0029;0000-0002-7252-5047", "linkedin": ";;;;;;;", "or_profile": "~Haohang_Xu1;~Jiemin_Fang1;~XIAOPENG_ZHANG7;~Lingxi_Xie1;~Xinggang_Wang1;~Wenrui_Dai1;~Hongkai_Xiong1;~Qi_Tian3", "aff": "Shanghai Jiaotong University;Huazhong University of Science and Technology;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huazhong University of Science and Technology;Shanghai Jiaotong University;Shanghai Jiaotong University;Huawei Technologies Ltd.", "aff_domain": "sjtu.edu.cn;hust.edu.cn;huawei.com;huawei.com;hust.edu.cn;sjtu.edu.cn;sjtu.edu.cn;huawei.com", "position": "PhD student;PhD student;Principal Researcher;Researcher;Full Professor;Associate Professor;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nxu2022bag,\ntitle={Bag of Instances Aggregation Boosts Self-supervised Distillation},\nauthor={Haohang Xu and Jiemin Fang and XIAOPENG ZHANG and Lingxi Xie and Xinggang Wang and Wenrui Dai and Hongkai Xiong and Qi Tian},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=N0uJGWDw21d}\n}", "github": "", "project": "", "reviewers": "HSTU;Xmuw;dQqV;5UEk", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;4;5;3", "correctness": "3;4;4;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "56;95;116;137", "wc_summary_review": "53;71;90;27", "wc_main_review": "245;467;477;307", "wc_review": "354;633;683;471", "wc_reply_reviewers": "15;44;236;47", "wc_reply_authors": "1097;999;1648;932", "reply_reviewers": "1;1;1;1", "reply_authors": "5;3;5;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 101.0, 29.924906014890006 ], "wc_summary_review_avg": [ 60.25, 23.23117517475171 ], "wc_main_review_avg": [ 374.0, 100.48382954485761 ], "wc_review_avg": [ 535.25, 130.73326852794585 ], "wc_reply_reviewers_avg": [ 85.5, 87.78524933039719 ], "wc_reply_authors_avg": [ 1169.0, 282.7074459578311 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 4.0, 1.0 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 8, 0 ], 
"corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3290933725411237169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=N0uJGWDw21d", "email": "sjtu.edu.cn;hust.edu.cn;huawei.com;huawei.com;hust.edu.cn;sjtu.edu.cn;sjtu.edu.cn;huawei.com", "author_num": 8, "aff_unique_index": "0;1;2;2;1;0;0;2", "aff_unique_norm": "Shanghai Jiao Tong University;Huazhong University of Science and Technology;Huawei", "aff_unique_dep": ";;Huawei Technologies", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.hust.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SJTU;HUST;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Parallel Training of GRU Networks with a Multi-Grid Solver for Long Sequences", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6016", "id": "N1WI0vJLER", "poster": "", "openreview": "https://openreview.net/forum?id=N1WI0vJLER", "slides": "https://iclr.cc/virtual/2022/poster/6016", "video": "https://iclr.cc/virtual/2022/poster/6016", "author_site": "Gordon Euhyun Moon, Eric Cyr", "tldr": "", "abstract": "Parallelizing Gated Recurrent Unit (GRU) is a challenging task, as the training procedure of GRU is inherently sequential. Prior efforts to parallelize GRU have largely focused on conventional parallelization strategies such as data-parallel and model-parallel training algorithms. However, when the given sequences are very long, existing approaches are still inevitably performance limited in terms of both training time and model accuracy. In this paper, we present a novel parallel training scheme (called parallel-in-time) for GRU based on a multigrid reduction in time (MGRIT) solver. MGRIT partitions a sequence into multiple shorter sub-sequences and trains the sub-sequences on different processors in parallel. The key to achieving speedup is a hierarchical correction of the hidden state to accelerate end-to-end communication in both the forward and backward propagation phases of gradient descent. Experimental results on the HMDB51 dataset, where each video is an image sequence, demonstrate that a new parallel training scheme of GRU achieves up to $6.5 \\times$ speedup over a serial approach. As efficiency of our new parallelization strategy is associated with the sequence length, our parallel GRU algorithm achieves significant performance improvement as the length of sequence increases. 
Further, the proposed approach can be applied simultaneously with batch and other forms of model parallelism.", "keywords": "GRU;MGRIT;parallel-in-time;distributed machine learning", "primary_area": "", "supplementary_material": "/attachment/99ce66901d3d33245bc46fa1b7b162007f5e8cfc.zip", "author": "Euhyun Moon;Eric C Cyr", "authorids": "~Euhyun_Moon1;~Eric_C_Cyr1", "gender": "M;M", "homepage": "https://gordonmoon.github.io;", "dblp": "220/9907.html;13/5873", "google_scholar": "SmvStlcAAAAJ;ndPqO6kAAAAJ", "orcid": ";0000-0003-3833-9598", "linkedin": ";eric-c-cyr-2801738/", "or_profile": "~Euhyun_Moon1;~Eric_C_Cyr1", "aff": "Korea Aerospace University;Sandia National Laboratories", "aff_domain": "kau.ac.kr;sandia.gov", "position": "Assistant Professor;Researcher", "bibtex": "@inproceedings{\nmoon2022parallel,\ntitle={Parallel Training of {GRU} Networks with a Multi-Grid Solver for Long Sequences},\nauthor={Euhyun Moon and Eric C Cyr},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=N1WI0vJLER}\n}", "github": "", "project": "", "reviewers": "Y8yE;ix54;2GTo;bzSV", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;2", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "73;97;95;60", "wc_summary_review": "90;20;1;29", "wc_main_review": "416;243;354;547", "wc_review": "579;360;450;636", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1492;355;304;787", "reply_reviewers": "0;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 81.25, 15.465687828221544 ], "wc_summary_review_avg": [ 35.0, 33.324165405903265 ], "wc_main_review_avg": [ 390.0, 109.80664825045886 ], "wc_review_avg": [ 506.25, 108.02864203534172 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 734.5, 475.8973103517186 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=546461240895153656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=N1WI0vJLER", "email": "kau.ac.kr;sandia.gov", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Korea Aerospace University;Sandia National Laboratories", "aff_unique_dep": ";", "aff_unique_url": "http://www.kau.ac.kr;https://www.sandia.gov", "aff_unique_abbr": "KAU;SNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "South Korea;United States" }, { "id": "N2nJzgb_ldR", "title": "FastRPB: a Scalable Relative Positional Encoding for Long Sequence Tasks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transformers achieve remarkable performance in various domains, including NLP, CV, audio processing, and graph analysis. However, they do not scale well on long sequence tasks due to their quadratic complexity w.r.t. the input\u2019s length. Linear Transformers were proposed to address this limitation. 
However, these models have shown weaker performance on long sequence tasks compared to the original one. In this paper, we explore Linear Transformer models, rethinking their two core components. Firstly, we improve the Linear Transformer with $\\textbf{S}$hift-$\\textbf{I}$nvariant $\\textbf{K}$ernel $\\textbf{F}$unction $\\textbf{SIKF}$, which achieves higher accuracy without loss of speed. Secondly, we introduce $\\textbf{FastRPB}$ which stands for $\\textbf{Fast}$ $\\textbf{R}$elative $\\textbf{P}$ositional $\\textbf{B}$ias, which efficiently adds positional information to self-attention using Fast Fourier Transformation. FastRPB is independent of the self-attention mechanism and can be combined with the original self-attention and all its efficient variants. FastRPB has $\\mathcal{O}(N\\log{N})$ computational complexity, requiring $\\mathcal{O}(N)$ memory w.r.t. input sequence length $N$. \n\nWe compared the introduced modifications with recent Linear Transformers in different settings: text classification, document retrieval, and image classification. Extensive experiments with FastRPB and SIKF demonstrate that our model significantly outperforms another efficient positional encoding method in accuracy, while achieving up to 1.5x higher speed and requiring up to 10x less memory than the original Transformer. ", "keywords": "transformer;linear transformer;long sequences;fast Fourier transform;positional encoding;long range arena", "primary_area": "", "supplementary_material": "/attachment/41aa875c664ac1fb974b670d4ed91dfb8819f6d8.zip", "author": "Maksim Zubkov;Daniil Gavrilov", "authorids": "~Maksim_Zubkov1;~Daniil_Gavrilov1", "gender": ";M", "homepage": "https://kefirski.me;https://github.com/maximzubkov", "dblp": "234/8563;", "google_scholar": "https://scholar.google.ru/citations?user=PAZUwukAAAAJ;", "orcid": ";", "linkedin": ";zubkovmd/", "or_profile": "~Daniil_Gavrilov1;~Maksim_Denisovich_Zubkov1", "aff": "T-Bank;Swiss Federal Institute of Technology Lausanne", "aff_domain": "tbank.ru;epfl.ch", "position": "Principal Researcher;MS student", "bibtex": "@misc{\nzubkov2022fastrpb,\ntitle={Fast{RPB}: a Scalable Relative Positional Encoding for Long Sequence Tasks},\nauthor={Maksim Zubkov and Daniil Gavrilov},\nyear={2022},\nurl={https://openreview.net/forum?id=N2nJzgb_ldR}\n}", "github": "", "project": "", "reviewers": "VGvT;7TLj;8WRr;L9JY", "site": "https://openreview.net/forum?id=N2nJzgb_ldR", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;2;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "60;64;68;142", "wc_summary_review": "24;18;44;80", "wc_main_review": "223;163;146;578", "wc_review": "307;245;258;800", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "26;26;26;26", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 83.5, 33.89321466016465 ], "wc_summary_review_avg": [ 41.5, 24.22292302757865 ], "wc_main_review_avg": [ 277.5, 175.83586096129537 ], "wc_review_avg": [ 402.5, 230.65829705432233 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 26.0, 0.0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 
-0.5222329678670935, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HzCbok8X77MJ:scholar.google.com/&scioq=FastRPB:+a+Scalable+Relative+Positional+Encoding+for+Long+Sequence+Tasks&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "T-Bank;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.tbank.com.cn;https://www.epfl.ch", "aff_unique_abbr": ";EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Switzerland" }, { "id": "N3KYKkSvciP", "title": "Understanding Square Loss in Training Overparametrized Neural Network Classifiers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning has achieved many breakthroughs in modern classification tasks. Numerous architectures have been proposed for different data structures but when it comes to the loss function, the cross-entropy loss is the predominant choice. Recently, several alternative losses have seen revived interests for deep classifiers. In particular, empirical evidence seems to promote square loss but a theoretical justification is still lacking. In this work, we contribute to the theoretical understanding of square loss in classification by systematically investigating how it performs for overparametrized neural networks in the neural tangent kernel (NTK) regime. Interesting properties regarding the generalization error, robustness, and calibration error are revealed. We consider two cases, according to whether classes are separable or not. In the general non-separable case, fast convergence rate is established for both misclassification rate and calibration error. When classes are separable, the misclassification rate improves to be exponentially fast. Further, the resulting margin is proven to be lower bounded away from zero, providing theoretical guarantees for robustness. We expect our findings to hold beyond the NTK regime and translate to practical settings. To this end, we conduct extensive empirical studies on practical neural networks, demonstrating the effectiveness of square loss in both synthetic low-dimensional data and real image data. 
Compared to cross-entropy, square loss has comparable generalization error but noticeable advantages in robustness and model calibration.", "keywords": "classification;square loss;neural tangent kernel;convergence rate", "primary_area": "", "supplementary_material": "/attachment/ceb1b30e4bf5e2293030533ce22573a9c2549779.zip", "author": "Tianyang Hu;Jun Wang;Wenjia Wang;Zhenguo Li", "authorids": "~Tianyang_Hu1;~Jun_Wang23;~Wenjia_Wang2;~Zhenguo_Li1", "gender": "M;M;M;M", "homepage": "https://hu-tianyang.github.io/;https://www.wenjia-w.com/;http://www.ee.columbia.edu/~zgli/;", "dblp": "170/2551;;23/6479;", "google_scholar": "mlA_3r0AAAAJ;EKS1sO0AAAAJ;XboZC1AAAAAJ;https://scholar.google.com.hk/citations?user=mX8s9ZgAAAAJ", "orcid": ";;;0000-0001-5332-9879", "linkedin": ";;;", "or_profile": "~Tianyang_Hu1;~Wenjia_Wang2;~Zhenguo_Li1;~Jun_WANG22", "aff": "Huawei Noah's Ark Lab;Hong Kong University of Science and Technology;Huawei Noah's Ark Lab;Hong Kong University of Science and Technology", "aff_domain": "huawei.com;ust.hk;huawei.com;ust.hk", "position": "Researcher;Assistant Professor;Principal Researcher;PhD student", "bibtex": "@misc{\nhu2022understanding,\ntitle={Understanding Square Loss in Training Overparametrized Neural Network Classifiers},\nauthor={Tianyang Hu and Jun Wang and Wenjia Wang and Zhenguo Li},\nyear={2022},\nurl={https://openreview.net/forum?id=N3KYKkSvciP}\n}", "github": "", "project": "", "reviewers": "Yckn;MNxz;RzXw;Eo56;2eZX", "site": "https://openreview.net/forum?id=N3KYKkSvciP", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "3;2;4;4;3", "correctness": "2;3;3;4;3", "technical_novelty": "1;3;3;3;3", "empirical_novelty": "0;2;3;3;3", "wc_summary_paper": "54;52;39;30;55", "wc_summary_review": "23;57;49;49;2", "wc_main_review": "375;842;498;221;410", "wc_review": "452;951;586;300;467", "wc_reply_reviewers": "0;93;187;30;0", "wc_reply_authors": "544;369;401;215;937", "reply_reviewers": "0;1;1;1;0", "reply_authors": "1;1;1;1;2", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.6, 0.8000000000000002 ], "empirical_novelty_avg": [ 2.2, 1.16619037896906 ], "wc_summary_paper_avg": [ 46.0, 9.85900603509299 ], "wc_summary_review_avg": [ 36.0, 20.513410247932935 ], "wc_main_review_avg": [ 469.2, 206.78046329380342 ], "wc_review_avg": [ 551.2, 219.58269512873733 ], "wc_reply_reviewers_avg": [ 62.0, 71.13086531176181 ], "wc_reply_authors_avg": [ 493.2, 245.30014268238818 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.24397501823713333, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12240136831806413505&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Huawei;Hong Kong University of Science and Technology", "aff_unique_dep": "Noah's Ark Lab;", "aff_unique_url": "https://www.huawei.com;https://www.ust.hk", "aff_unique_abbr": "Huawei;HKUST", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "N3fJsZ7ghc", "title": "Deep Encryption: Protecting Pre-Trained Neural Networks with Confusion Neurons", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Having consumed 
huge amounts of training data and computational resources, large-scale pre-trained models are often considered key assets of AI service providers. This raises an important problem: how to prevent these models from being maliciously copied when they are running on customers' computing devices? We answer this question by adding a set of confusion neurons into the pre-trained model, where the position of these neurons is encoded into a few integers that are easy to encrypt. We find that, most often, a small portion of confusion neurons is able to effectively contaminate the pre-trained model. Thereafter, we extend our study to a bigger picture in which the customers may develop algorithms to eliminate the effect of confusion neurons and recover the original network, and we show that our simple approach is somewhat capable of defending itself against the fine-tuning attack.", "keywords": "Encryption;Confusion Neurons", "primary_area": "", "supplementary_material": "", "author": "Mengbiao Zhao;Shixiong Xu;Jianlong Chang;Lingxi Xie;Jie Chen;Qi Tian", "authorids": "~Mengbiao_Zhao2;~Shixiong_Xu2;~Jianlong_Chang2;~Lingxi_Xie1;~Jie_Chen15;~Qi_Tian3", "gender": "M;M;M;M;M;", "homepage": "http://www.nlpr.ia.ac.cn/pal/People/ZhaoMengBiao.html;https://jianlongchange.github.io/;http://lingxixie.com/;https://aimia-pku.github.io/;https://www.qitian1987.com/index.html;https://xsx1001.github.io/", "dblp": ";92/2332;123/2869;92/6289-1;78/1467-1.html;", "google_scholar": ";RDwnNsQAAAAJ;EEMm7hwAAAAJ;https://scholar.google.fi/citations?user=ZAZFfwwAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;0000-0002-7252-5047;", "linkedin": ";;;;;", "or_profile": "~Mengbiao_Zhao2;~Jianlong_Chang2;~Lingxi_Xie1;~Jie_Chen15;~Qi_Tian3;~SHIXIONG_XU1", "aff": "Institute of Automation, Chinese Academy of Sciences;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Peking University;Huawei Technologies Ltd.;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;huawei.com;huawei.com;pku.edu.cn;huawei.com;ia.ac.cn", "position": "PhD student;Principal Researcher;Researcher;Associate Professor;Principal Researcher;MS student", "bibtex": "@misc{\nzhao2022deep,\ntitle={Deep Encryption: Protecting Pre-Trained Neural Networks with Confusion Neurons},\nauthor={Mengbiao Zhao and Shixiong Xu and Jianlong Chang and Lingxi Xie and Jie Chen and Qi Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=N3fJsZ7ghc}\n}", "github": "", "project": "", "reviewers": "t2SG;jMGr;YnnX;W8ex", "site": "https://openreview.net/forum?id=N3fJsZ7ghc", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;1;3;2", "wc_summary_paper": "77;63;84;131", "wc_summary_review": "22;31;60;32", "wc_main_review": "413;366;124;282", "wc_review": "512;460;268;445", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 88.75, 25.537961939042827 ], "wc_summary_review_avg": [ 36.25, 14.254385290148432 ], "wc_main_review_avg": [ 296.25, 109.96448290243536 ], "wc_review_avg": [ 421.25, 91.90586216341154 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], 
"replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:50xqlRkcqMsJ:scholar.google.com/&scioq=Deep+Encryption:+Protecting+Pre-Trained+Neural+Networks+with+Confusion+Neurons&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;1;0", "aff_unique_norm": "Chinese Academy of Sciences;Huawei;Peking University", "aff_unique_dep": "Institute of Automation;Huawei Technologies;", "aff_unique_url": "http://www.ia.cas.cn;https://www.huawei.com;http://www.pku.edu.cn", "aff_unique_abbr": "CAS;Huawei;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "N4KRX61-_1d", "title": "A Hierarchical Bayesian Approach to Inverse Reinforcement Learning with Symbolic Reward Machines", "track": "main", "status": "Reject", "tldr": "", "abstract": "A misspecified reward can degrade sample efficiency and induce undesired behaviors in reinforcement learning (RL) problems. We propose symbolic reward machines for incorporating high-level task knowledge when specifying the reward signals. Symbolic reward machines augment existing reward machine formalism by allowing transitions to carry predicates and symbolic reward outputs. This formalism lends itself well to inverse reinforcement learning, whereby the key challenge is determining appropriate assignments to the symbolic values from a few expert demonstrations. We propose a hierarchical Bayesian approach for inferring the most likely assignments such that the concretized reward machine can discriminate expert demonstrated trajectories from other trajectories with high accuracy. 
Experimental results show that learned reward machines can significantly improve training efficiency for complex RL tasks and generalize well across different task environment configurations.", "keywords": "Reward Machines;Finite State Automata;Finite State Transducers;Inverse Reinforcement Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/5823dba1a7839ba183c33ea4a82971634e203cf8.zip", "author": "Weichao Zhou;Wenchao Li", "authorids": "~Weichao_Zhou1;~Wenchao_Li1", "gender": "M;", "homepage": "https://sites.google.com/view/zwc662/;http://sites.bu.edu/depend/", "dblp": "207/8077;23/5721-1", "google_scholar": "JdiJIF0AAAAJ;zwA5eokAAAAJ", "orcid": "0009-0002-0369-2113;", "linkedin": ";", "or_profile": "~Weichao_Zhou1;~Wenchao_Li1", "aff": "Boston University;Boston University", "aff_domain": "bu.edu;bu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nzhou2022a,\ntitle={A Hierarchical Bayesian Approach to Inverse Reinforcement Learning with Symbolic Reward Machines},\nauthor={Weichao Zhou and Wenchao Li},\nyear={2022},\nurl={https://openreview.net/forum?id=N4KRX61-_1d}\n}", "github": "", "project": "", "reviewers": "MnwS;KdBH;Mz1h;ZwPY", "site": "https://openreview.net/forum?id=N4KRX61-_1d", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;2;4;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "98;116;145;50", "wc_summary_review": "41;97;41;3", "wc_main_review": "685;474;1442;114", "wc_review": "824;687;1628;167", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1008;738;1564;236", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 102.25, 34.513584282134474 ], "wc_summary_review_avg": [ 45.5, 33.53729267546801 ], "wc_main_review_avg": [ 678.75, 485.6579943746422 ], "wc_review_avg": [ 826.5, 523.6432468771081 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 886.5, 479.3148756297889 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13941613076032869632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "N7WQ5SLlPrJ", "title": "Measure Twice, Cut Once: Quantifying Bias and Fairness in Deep Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Algorithmic bias is of increasing concern, both to the research community, and society at large. Bias in AI is more abstract and unintuitive than traditional forms of discrimination and can be more difficult to detect and mitigate. A clear gap exists in the current literature on evaluating the relative bias in the performance of multi-class classifiers. 
In this work, we propose two simple yet effective metrics, Combined Error Variance (CEV) and Symmetric Distance Error (SDE), to quantitatively evaluate the class-wise bias of two models in comparison to one another. By evaluating the performance of these new metrics and by demonstrating their practical application, we show that they can be used to measure fairness as well as bias. These demonstrations show that our metrics can address specific needs for measuring bias in multi-class classification. ", "keywords": "Interpretable AI;AI Fairness;Pruning;Compression;Knowledge Distillation;Image Recognition", "primary_area": "", "supplementary_material": "", "author": "Cody Blakeney;Gentry Atkinson;Nathaniel Huish;Yan Yan;Vangelis Metsis;Ziliang Zong", "authorids": "~Cody_Blakeney1;gma23@txstate.edu;~Nathaniel_Huish1;~Yan_Yan6;~Vangelis_Metsis1;~Ziliang_Zong1", "gender": "M;;M;M;M;", "homepage": "https://userweb.cs.txstate.edu/~cjb92/;;;;https://userweb.cs.txstate.edu/~v_m137/;https://userweb.cs.txstate.edu/~zz11/", "dblp": ";;;13/3953-2;;", "google_scholar": "Hx6daJ4AAAAJ;;;;7hDSfoQAAAAJ;", "orcid": ";;;;;", "linkedin": "cody-blakeney-796557105/;;nathaniel-huish/;;;", "or_profile": "~Cody_Blakeney1;gma23@txstate.edu;~Nathaniel_Huish1;~Yan_Yan6;~Vangelis_Metsis1;~Ziliang_Zong1", "aff": "Meta Facebook;;Texas State University;;Texas State University;Texas State University", "aff_domain": "fb.com;;txstate.edu;;txstate.edu;txstate.edu", "position": "Researcher;;Undergrad student;;Associate Professor;Associate Professor", "bibtex": "@misc{\nblakeney2022measure,\ntitle={Measure Twice, Cut Once: Quantifying Bias and Fairness in Deep Neural Networks},\nauthor={Cody Blakeney and Gentry Atkinson and Nathaniel Huish and Yan Yan and Vangelis Metsis and Ziliang Zong},\nyear={2022},\nurl={https://openreview.net/forum?id=N7WQ5SLlPrJ}\n}", "github": "", "project": "", "reviewers": "gq67;baMW;KivU;trVt", "site": "https://openreview.net/forum?id=N7WQ5SLlPrJ", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "87;92;102;48", "wc_summary_review": "34;73;40;42", "wc_main_review": "408;583;326;243", "wc_review": "529;748;468;333", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 82.25, 20.498475553074673 ], "wc_summary_review_avg": [ 47.25, 15.155444566227676 ], "wc_main_review_avg": [ 390.0, 125.7755938169246 ], "wc_review_avg": [ 519.5, 149.78067298553574 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14970004965538150965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Meta;Texas State University", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.txstate.edu", "aff_unique_abbr": "Meta;TXST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0", "aff_country_unique": "United States" }, { "title": "New Insights on Reducing Abrupt Representation Change in Online Continual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7188", "id": "N8MaByOzUfb", "poster": "", "openreview": "https://openreview.net/forum?id=N8MaByOzUfb", "slides": "https://iclr.cc/virtual/2022/poster/7188", "video": "https://iclr.cc/virtual/2022/poster/7188", "author_site": "Lucas Caccia, Rahaf Aljundi, Nader Asadi, Tinne Tuytelaars, Joelle Pineau, Eugene Belilovsky", "tldr": "", "abstract": "In the online continual learning paradigm, agents must learn from a changing distribution while respecting memory and compute constraints. Experience Replay (ER), where a small subset of past data is stored and replayed alongside new data, has emerged as a simple and effective learning strategy. In this work, we focus on the change in representations of observed data that arises when previously unobserved classes appear in the incoming data stream, and new classes must be distinguished from previous ones. We shed new light on this question by showing that applying ER causes the newly added classes\u2019 representations to overlap significantly with the previous classes, leading to highly disruptive parameter updates. Based on this empirical analysis, we propose a new method which mitigates this issue by shielding the learned representations from drastic adaptation to accommodate new classes. We show that using an asymmetric update rule pushes new classes to adapt to the older ones (rather than the reverse), which is more effective especially at task boundaries, where much of the forgetting typically occurs. Empirical results show significant gains over strong baselines on standard continual learning benchmarks.", "keywords": "continual learning", "primary_area": "", "supplementary_material": "", "author": "Lucas Caccia;Rahaf Aljundi;Nader Asadi;Tinne Tuytelaars;Joelle Pineau;Eugene Belilovsky", "authorids": "~Lucas_Caccia1;~Rahaf_Aljundi1;~Nader_Asadi1;~Tinne_Tuytelaars1;~Joelle_Pineau1;~Eugene_Belilovsky1", "gender": "M;F;M;;F;M", "homepage": "https://www.cs.mcgill.ca/~lpagec/;https://rahafaljundi.com/;https://naderasadi.github.io;;http://www.cs.mcgill.ca/~jpineau;http://eugenium.github.io", "dblp": ";169/4970;244/2603;;p/JoellePineau;42/11445", "google_scholar": "fuvIITUAAAAJ;https://scholar.google.be/citations?user=YLh7yrwAAAAJ;jaLI2noAAAAJ;;https://scholar.google.ca/citations?user=CEt6_mMAAAAJ;https://scholar.google.fr/citations?user=CffJDoEAAAAJ", "orcid": ";;;;;", "linkedin": ";;https://linkedin.com/in/naderasadi/;;;", "or_profile": "~Lucas_Caccia1;~Rahaf_Aljundi1;~Nader_Asadi1;~Tinne_Tuytelaars1;~Joelle_Pineau1;~Eugene_Belilovsky1", "aff": "McGill University;Toyota Motor Europe;Concordia University, Montreal;;Meta Facebook;Concordia University, Montreal", "aff_domain": "mcgill.ca;toyota-europe.com;concordia.ca;;fb.com;concordia.ca", "position": "PhD student;Researcher;MS student;;Researcher Manager;Assistant Professor", "bibtex": "@inproceedings{\ncaccia2022new,\ntitle={New Insights on Reducing Abrupt Representation Change in Online Continual Learning},\nauthor={Lucas Caccia and Rahaf Aljundi and Nader Asadi and Tinne Tuytelaars and Joelle Pineau and Eugene Belilovsky},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=N8MaByOzUfb}\n}", "github": "", "project": "", "reviewers": "mCT3;aUDf;MwZR;qLmu", "pdf_size": 0, "recommendation": "3;5;6;8", 
"confidence": "5;2;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "88;83;89;24", "wc_summary_review": "29;36;59;31", "wc_main_review": "340;207;238;142", "wc_review": "457;326;386;197", "wc_reply_reviewers": "77;0;0;47", "wc_reply_authors": "1821;287;507;443", "reply_reviewers": "1;0;0;1", "reply_authors": "6;1;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 71.0, 27.230497608380205 ], "wc_summary_review_avg": [ 38.75, 11.96609794377432 ], "wc_main_review_avg": [ 231.75, 71.45759231880123 ], "wc_review_avg": [ 341.5, 95.4476296195982 ], "wc_reply_reviewers_avg": [ 31.0, 32.76430985081175 ], "wc_reply_authors_avg": [ 764.5, 615.1965133191183 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 2.165063509461097 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.19088542889273336, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 251, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4308693083356542713&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=N8MaByOzUfb", "email": "mcgill.ca;toyota-europe.com;concordia.ca;;fb.com;concordia.ca", "author_num": 6, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "McGill University;Toyota Motor Corporation;Concordia University;Meta", "aff_unique_dep": ";;;Meta Platforms, Inc.", "aff_unique_url": "https://www.mcgill.ca;https://www.toyota-europe.com;https://www.concordia.ca;https://meta.com", "aff_unique_abbr": "McGill;TME;Concordia;Meta", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;0;2;0", "aff_country_unique": "Canada;Unknown;United States" }, { "title": "Steerable Partial Differential Operators for Equivariant Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6313", "id": "N9W24a4zU", "poster": "", "openreview": "https://openreview.net/forum?id=N9W24a4zU", "slides": "https://iclr.cc/virtual/2022/poster/6313", "video": "https://iclr.cc/virtual/2022/poster/6313", "author_site": "Erik Jenner, Maurice Weiler", "tldr": "", "abstract": "Recent work in equivariant deep learning bears strong similarities to physics. Fields over a base space are fundamental entities in both subjects, as are equivariant maps between these fields. In deep learning, however, these maps are usually defined by convolutions with a kernel, whereas they are partial differential operators (PDOs) in physics. Developing the theory of equivariant PDOs in the context of deep learning could bring these subjects even closer together and lead to a stronger flow of ideas. In this work, we derive a $G$-steerability constraint that completely characterizes when a PDO between feature vector fields is equivariant, for arbitrary symmetry groups $G$. We then fully solve this constraint for several important groups. We use our solutions as equivariant drop-in replacements for convolutional layers and benchmark them in that role. 
Finally, we develop a framework for equivariant maps based on Schwartz distributions that unifies classical convolutions and differential operators and gives insight about the relation between the two.\n", "keywords": "partial differential operators;equivariance;deep learning;steerability", "primary_area": "", "supplementary_material": "/attachment/a29e25c4cd8981fdf8b7e773d00d5bd6c7ab7f6a.zip", "author": "Erik Jenner;Maurice Weiler", "authorids": "~Erik_Jenner1;~Maurice_Weiler1", "gender": "M;", "homepage": "https://ejenner.com;https://maurice-weiler.gitlab.io/", "dblp": "295/8670;210/0855", "google_scholar": "https://scholar.google.com/citations?hl=en;uQePx6EAAAAJ", "orcid": "0000-0002-6037-5715;", "linkedin": "erik-jenner/;maurice-weiler-78b6931a6/", "or_profile": "~Erik_Jenner1;~Maurice_Weiler1", "aff": "University of Amsterdam;University of Amsterdam", "aff_domain": "uva.nl;uva.nl", "position": "MS student;PhD student", "bibtex": "@inproceedings{\njenner2022steerable,\ntitle={Steerable Partial Differential Operators for Equivariant Neural Networks},\nauthor={Erik Jenner and Maurice Weiler},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=N9W24a4zU}\n}", "github": "", "project": "", "reviewers": "p5Vs;BPgr;mpGT", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;3", "correctness": "4;4;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;2;3", "wc_summary_paper": "63;59;80", "wc_summary_review": "30;82;38", "wc_main_review": "180;242;355", "wc_review": "273;383;473", "wc_reply_reviewers": "24;73;143", "wc_reply_authors": "255;1186;1284", "reply_reviewers": "1;3;2", "reply_authors": "1;4;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.33333333333333, 9.104333522498443 ], "wc_summary_review_avg": [ 50.0, 22.861904265976328 ], "wc_main_review_avg": [ 259.0, 72.44768227256596 ], "wc_review_avg": [ 376.3333333333333, 81.78562764256866 ], "wc_reply_reviewers_avg": [ 80.0, 48.83304891839815 ], "wc_reply_authors_avg": [ 908.3333333333334, 463.7056058415607 ], "reply_reviewers_avg": [ 2.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18342593402456805321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=N9W24a4zU", "email": "uva.nl;uva.nl", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "id": "NB0czpQ3-m", "title": "RoMA: a Method for Neural Network Robustness Measurement and Assessment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network models have become the leading solution for various tasks, such\nas classification, language processing, protein folding, and others. 
However, their\nreliability is heavily plagued by adversarial inputs: small input perturbations that\ncause the model to produce erroneous output, thus impairing the model\u2019s robustness.\nAdversarial inputs can occur naturally when the system\u2019s environment behaves\nrandomly, even in the absence of a malicious adversary, and are thus a severe\ncause for concern when attempting to deploy neural networks within critical\nsystems. In this paper, we present a new statistical method, called Robustness\nMeasurement and Assessment (RoMA), which can accurately measure the robustness\nof a neural network model. Specifically, RoMA determines the probability\nthat a random input perturbation might cause misclassification. The method allows\nus to provide formal guarantees regarding the expected number of errors a\ntrained model will have after deployment. Our approach can be implemented on\nlarge-scale, black-box neural networks, which is a significant advantage compared\nto recently proposed verification methods. We apply our approach in two ways:\ncomparing the robustness of different models, and measuring how a model\u2019s robustness\nis affected by the scale of adversarial perturbation. One interesting insight\nobtained through this work is that, in a classification network, different output\nlabels can exhibit very different robustness levels. We term this phenomenon\nCategorial Robustness. Our ability to perform risk and robustness assessments\non a categorial basis opens the door to risk mitigation, which may prove to be a\nsignificant step towards neural network certification in safety-critical applications.", "keywords": "Neural Network;Robustness;Safety Critical Software;Categorial Robustness", "primary_area": "", "supplementary_material": "", "author": "Natan Levy;Guy Katz", "authorids": "~Natan_Levy2;~Guy_Katz1", "gender": "M;M", "homepage": "https://www.natan-levy.com/;http://www.katz-lab.com", "dblp": ";23/10321", "google_scholar": ";https://scholar.google.com.tw/citations?user=3nYG5BMAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Natan_Levy2;~Guy_Katz1", "aff": "Hebrew University of Jerusalem, Technion;Hebrew University of Jerusalem", "aff_domain": "huji.ac.il;huji.ac.il", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nlevy2022roma,\ntitle={Ro{MA}: a Method for Neural Network Robustness Measurement and Assessment },\nauthor={Natan Levy and Guy Katz},\nyear={2022},\nurl={https://openreview.net/forum?id=NB0czpQ3-m}\n}", "github": "", "project": "", "reviewers": "Y2p2;uGd4;JNVz;FezE", "site": "https://openreview.net/forum?id=NB0czpQ3-m", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;2;4", "correctness": "2;2;3;3", "technical_novelty": "2;1;3;2", "empirical_novelty": "0;1;3;2", "wc_summary_paper": "75;71;72;93", "wc_summary_review": "77;29;20;31", "wc_main_review": "652;202;319;394", "wc_review": "804;302;411;518", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 77.75, 8.926785535678562 ], "wc_summary_review_avg": [ 39.25, 22.18529918662356 ], "wc_main_review_avg": [ 391.75, 165.10053755212306 ], "wc_review_avg": [ 508.75, 186.78781411002163 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [
0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=626163282379519961&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "", "aff_unique_url": "https://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Jerusalem", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "NCwIM2Q8ah6", "title": "MDFL: A UNIFIED FRAMEWORK WITH META-DROPOUT FOR FEW-SHOT LEARNING", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conventional training of deep neural networks usually requires a substantial amount of data with expensive human annotations. In this paper, we utilize the idea of meta-learning to integrate two very different streams of few-shot learning, i.e., the episodic meta-learning-based and pre-train finetune-based few-shot learning, and form a unified meta-learning framework. In order to improve the generalization power of our framework, we propose a simple yet effective strategy named meta-dropout, which is applied to the transferable knowledge generalized from base categories to novel categories. The proposed strategy can effectively prevent neural units from co-adapting excessively in the meta-training stage. Extensive experiments on the few-shot object detection and few-shot image classification datasets, i.e., Pascal VOC, MS COCO, CUB, and mini-ImageNet, validate the effectiveness of our method.\n", "keywords": "Few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Shaobo Lin;Xingyu Zeng;Rui Zhao", "authorids": "~Shaobo_Lin1;zengxingyu@sensetime.com;~Rui_Zhao6", "gender": ";;M", "homepage": ";;http://zhaorui.xyz/", "dblp": ";;26/2578-1", "google_scholar": ";;1c9oQNMAAAAJ", "orcid": ";;", "linkedin": "http://www.linkedin.com/in/shaobo-lin;;", "or_profile": "~Shaobo_Lin1;zengxingyu@sensetime.com;~Rui_Zhao6", "aff": "Sensetime Group Limited;;SenseTime Research", "aff_domain": "sensetime.com;;sensetime.com", "position": "Researcher;;Researcher", "bibtex": "@misc{\nlin2022mdfl,\ntitle={{MDFL}: A {UNIFIED} {FRAMEWORK} {WITH} {META}-{DROPOUT} {FOR} {FEW}-{SHOT} {LEARNING}},\nauthor={Shaobo Lin and Xingyu Zeng and Rui Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=NCwIM2Q8ah6}\n}", "github": "", "project": "", "reviewers": "Tnoc;9Uzi;hZYn;TnYh", "site": "https://openreview.net/forum?id=NCwIM2Q8ah6", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;4;3", "correctness": "1;2;3;3", "technical_novelty": "1;1;2;3", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "60;47;38;49", "wc_summary_review": "13;27;19;94", "wc_main_review": "171;415;298;201", "wc_review": "244;489;355;344", "wc_reply_reviewers": "40;0;29;0", "wc_reply_authors": "220;567;483;324", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 48.5, 7.826237921249264 ], "wc_summary_review_avg": [ 38.25, 32.5681976781031 ], "wc_main_review_avg": [ 271.25, 95.34771890297114 ], "wc_review_avg": [ 358.0, 
87.12347559642005 ], "wc_reply_reviewers_avg": [ 17.25, 17.68297203526602 ], "wc_reply_authors_avg": [ 398.5, 135.04166023860932 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ERgjXPdZIrIJ:scholar.google.com/&scioq=MDFL:+A+UNIFIED+FRAMEWORK+WITH+META-DROPOUT+FOR+FEW-SHOT+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "SenseTime Group;SenseTime", "aff_unique_dep": ";SenseTime Research", "aff_unique_url": "https://www.sensetime.com;https://www.sensetime.com", "aff_unique_abbr": "SenseTime;SenseTime", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "NE8B5RQkau", "title": "Self-Distilled Pruning Of Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pruning aims to reduce the number of parameters while maintaining performance close to the original network. This work proposes a novel \\emph{self-distillation} based pruning strategy, whereby the representational similarity between the pruned and unpruned versions of the same network is maximized. Unlike previous approaches that treat distillation and pruning separately, we use distillation to inform the pruning criteria, without requiring a separate student network as in knowledge distillation. We show that the proposed {\\em cross-correlation objective for self-distilled pruning} implicitly encourages sparse solutions, naturally complementing magnitude-based pruning criteria. Experiments on the GLUE and XGLUE benchmarks show that self-distilled pruning increases mono- and cross-lingual language model performance. Self-distilled pruned models also outperform smaller Transformers with an equal number of parameters and are competitive against (6 times) larger distilled networks. We also observe that self-distillation (1) maximizes class separability, (2) increases the signal-to-noise ratio, and (3) converges faster after pruning steps, providing further insights into why self-distilled pruning improves generalization. 
", "keywords": "pruning;knowledge distillation;compression;transformers;neural networks;language models", "primary_area": "", "supplementary_material": "", "author": "James O' Neill;Sourav Dutta;Haytham Assem", "authorids": "~James_O'_Neill1;~Sourav_Dutta1;haytham.assem@huawei.com", "gender": ";M;", "homepage": "https://jamesoneill12.github.io/;;", "dblp": "156/0622;62/8171;", "google_scholar": "K69XZhcAAAAJ;9y1l5IoAAAAJ;", "orcid": ";0000-0002-8934-9166;", "linkedin": ";;", "or_profile": "~James_O'_Neill1;~Sourav_Dutta1;haytham.assem@huawei.com", "aff": "University of Liverpool;Huawei Research Center;", "aff_domain": "liverpool.ac.uk;huawei.com;", "position": "PhD student;Principal Scientist;", "bibtex": "@misc{\nneill2022selfdistilled,\ntitle={Self-Distilled Pruning Of Neural Networks},\nauthor={James O' Neill and Sourav Dutta and Haytham Assem},\nyear={2022},\nurl={https://openreview.net/forum?id=NE8B5RQkau}\n}", "github": "", "project": "", "reviewers": "U7vZ;HsUo;cz66;bdqP;GWkD", "site": "https://openreview.net/forum?id=NE8B5RQkau", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;4;3;4;3", "correctness": "2;3;3;3;1", "technical_novelty": "2;2;3;2;2", "empirical_novelty": "2;2;3;3;2", "wc_summary_paper": "22;35;82;40;85", "wc_summary_review": "10;49;33;44;107", "wc_main_review": "152;631;250;160;194", "wc_review": "184;715;365;244;386", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "161;114;0;0;192", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;0;0;1", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.4, 0.8 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 52.8, 25.763540129415446 ], "wc_summary_review_avg": [ 48.6, 32.140939625343876 ], "wc_main_review_avg": [ 277.4, 180.13728098314354 ], "wc_review_avg": [ 378.8, 184.0602075409022 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 93.4, 80.2037405611484 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6, 0.48989794855663565 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.372677996249965, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YjUsFCKpdlwJ:scholar.google.com/&scioq=Self-Distilled+Pruning+Of+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Liverpool;Huawei", "aff_unique_dep": ";Research Center", "aff_unique_url": "https://www.liverpool.ac.uk;https://www.huawei.com/en/", "aff_unique_abbr": "Liv Uni;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;China" }, { "title": "Who Is Your Right Mixup Partner in Positive and Unlabeled Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5904", "id": "NH29920YEmj", "poster": "", "openreview": "https://openreview.net/forum?id=NH29920YEmj", "slides": "https://iclr.cc/virtual/2022/poster/5904", "video": "https://iclr.cc/virtual/2022/poster/5904", "author_site": "Changchun Li, Ximing Li, Lei Feng, Jihong Ouyang", "tldr": "", "abstract": "Positive and Unlabeled (PU) learning targets inducing a binary classifier from weak training datasets of positive and unlabeled instances, which arise in many real-world applications. 
In this paper, we propose a novel PU learning method, namely Positive and unlabeled learning with Partially Positive Mixup (P3Mix), which simultaneously benefits from data augmentation and supervision correction with a heuristic mixup technique. To be specific, we take inspiration from the directional boundary deviation phenomenon observed in our preliminary experiments, where the learned PU boundary tends to deviate from the fully supervised boundary towards the positive side. For the unlabeled instances with ambiguous predictive results, we select their mixup partners from the positive instances around the learned PU boundary, so as to transform them into augmented instances near to the boundary yet with more precise supervision. Accordingly, those augmented instances may push the learned PU boundary towards the fully supervised boundary, thereby improving the classification performance. Comprehensive experimental results demonstrate the effectiveness of the heuristic mixup technique in PU learning and show that P3Mix can consistently outperform the state-of-the-art PU learning methods.", "keywords": "Positive and Unlabeled Learning;Mixup;Heuristic", "primary_area": "", "supplementary_material": "/attachment/cd51230305ce5cc0e120580e58977ba7f4a7b903.zip", "author": "Changchun Li;Ximing Li;Lei Feng;Jihong Ouyang", "authorids": "~Changchun_Li1;~Ximing_Li1;~Lei_Feng1;~Jihong_Ouyang2", "gender": "M;M;M;F", "homepage": ";https://ccst.jlu.edu.cn/info/1367/19282.htm;https://lfeng1995.github.io/;http://ccst.jlu.edu.cn/info/1186/2081.htm", "dblp": "73/7819;130/1013-2;76/847-6;46/3783", "google_scholar": "https://scholar.google.com.hk/citations?user=tO6IqzAAAAAJ;2WQ--c4AAAAJ;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;", "orcid": ";0000-0001-8190-5087;0000-0003-2839-5799;", "linkedin": ";;;", "or_profile": "~Changchun_Li1;~Ximing_Li1;~Lei_Feng1;~Jihong_Ouyang2", "aff": "Jilin University;Jilin University;Chongqing University;Jilin University", "aff_domain": "jlu.edu;jlu.edu;cqu.edu.cn;jlu.edu.cn", "position": "PhD student;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2022who,\ntitle={Who Is Your Right Mixup Partner in Positive and Unlabeled Learning},\nauthor={Changchun Li and Ximing Li and Lei Feng and Jihong Ouyang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NH29920YEmj}\n}", "github": "", "project": "", "reviewers": "XfpN;fDKn;eqge;x3p8", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "54;51;107;70", "wc_summary_review": "53;45;16;39", "wc_main_review": "349;215;131;169", "wc_review": "456;311;254;278", "wc_reply_reviewers": "116;31;0;30", "wc_reply_authors": "741;424;555;942", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 70.5, 22.276669409945463 ], "wc_summary_review_avg": [ 38.25, 13.77270852083932 ], "wc_main_review_avg": [ 216.0, 82.34682750416071 ], "wc_review_avg": [ 324.75, 78.4326940758763 ], "wc_reply_reviewers_avg": [ 44.25, 43.25722483007896 ], "wc_reply_authors_avg": [ 665.5, 195.3746401148317 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 
0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8343152560266768032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=NH29920YEmj", "email": "jlu.edu;jlu.edu;cqu.edu.cn;jlu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Jilin University;Chongqing University", "aff_unique_dep": ";", "aff_unique_url": "http://www.jlu.edu.cn;https://www.cqu.edu.cn", "aff_unique_abbr": "JLU;CQU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "NHHM1jjrH1", "title": "An Optimization Perspective on Realizing Backdoor Injection Attacks on Deep Neural Networks in Hardware", "track": "main", "status": "Reject", "tldr": "", "abstract": "State-of-the-art deep neural networks (DNNs) have been proven to be vulnerable to adversarial manipulation and backdoor attacks. Backdoored models deviate from expected behavior on inputs with predefined triggers while retaining performance on clean data. Recent works focus on software simulation of backdoor injection during the inference phase by modifying network weights, which we find often unrealistic in practice due to the hardware restriction such as bit allocation in memory. In contrast, in this work, we investigate the viability of backdoor injection attacks in real-life deployments of DNNs on hardware and address such practical issues in hardware implementation from a novel optimization perspective. \nWe are motivated by the fact that the vulnerable memory locations are very rare, device-specific, and sparsely distributed. Consequently, we propose a novel network training algorithm based on constrained optimization for realistic backdoor injection attack in hardware. By modifying parameters uniformly across the convolutional and fully-connected layers as well as optimizing the trigger pattern together, we achieve the state-of-the-art attack performance with fewer bit flips. For instance, our method on a hardware-deployed ResNet-20 model trained on CIFAR-10 can achieve over 91\\% test accuracy and 94\\% attack success rate by flipping only 10 bits out of 2.2 million bits. ", "keywords": "targeted attack;bit-flip;weight attack;backdoor;trojan", "primary_area": "", "supplementary_material": "/attachment/a41d67be35112b44d4d7450472e3cf09711ef5a4.zip", "author": "M. Caner Tol;Saad Islam;Berk Sunar;Ziming Zhang", "authorids": "~M._Caner_Tol1;~Saad_Islam1;~Berk_Sunar1;~Ziming_Zhang4", "gender": "M;;M;M", "homepage": "https://www.wpi.edu/people/doctoral-student/sislam;http://sunar.org;https://canertol.github.io/;https://zimingzhang.wordpress.com/", "dblp": ";;;", "google_scholar": "nK_n1HUAAAAJ;BH2Z9v0AAAAJ;8RW20C8AAAAJ;2yqx3oIAAAAJ", "orcid": ";;0000-0002-4512-9145;", "linkedin": "saad-islam-14994515;;canertol/;", "or_profile": "~Saad_Islam1;~Berk_Sunar1;~Caner_Tol1;~Ziming_Zhang1", "aff": "Worcester Polytechnic Institute;Worcester Polytechnic Institute;Worcester Polytechnic Institute;Worcester Polytechnic Institute", "aff_domain": "wpi.edu;wpi.edu;wpi.edu;wpi.edu", "position": "PhD student;Full Professor;PhD student;Assistant Professor", "bibtex": "@misc{\ntol2022an,\ntitle={An Optimization Perspective on Realizing Backdoor Injection Attacks on Deep Neural Networks in Hardware},\nauthor={M. 
Caner Tol and Saad Islam and Berk Sunar and Ziming Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=NHHM1jjrH1}\n}", "github": "", "project": "", "reviewers": "Yjkz;gfSk;erCz;Yjj7", "site": "https://openreview.net/forum?id=NHHM1jjrH1", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;4;5;3", "correctness": "4;2;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "150;16;54;51", "wc_summary_review": "40;34;5;33", "wc_main_review": "373;214;390;335", "wc_review": "563;264;449;419", "wc_reply_reviewers": "17;27;0;0", "wc_reply_authors": "1769;1721;1689;2062", "reply_reviewers": "1;1;0;0", "reply_authors": "3;3;3;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 67.75, 49.781397127842844 ], "wc_summary_review_avg": [ 28.0, 13.546217184144066 ], "wc_main_review_avg": [ 328.0, 68.76408946535975 ], "wc_review_avg": [ 423.75, 106.73653310839734 ], "wc_reply_reviewers_avg": [ 11.0, 11.554220008291344 ], "wc_reply_authors_avg": [ 1810.25, 148.1103895748033 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 0.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5089932179531432454&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Worcester Polytechnic Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.wpi.edu", "aff_unique_abbr": "WPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "NJTRDt9TPb", "title": "Diverse Imitation Learning via Self-Organizing Generative Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Imitation learning is the problem of teaching an agent to replicate expert policy from demonstrations when the underlying reward function is unavailable. This task becomes particularly challenging when the expert demonstrates a mixture of behaviors, often modeled by a discrete or continuous latent variable.\nPrior work has addressed imitation learning in such mixture scenarios by recovering the underlying latent variables, in the context of both supervised learning (behavior cloning), and generative adversarial imitation learning (GAIL). In several robotic locomotion tasks, simulated in the MuJoCo platform, we observe that existing models fail in distinguishing and imitating different modes of behavior in both cases of discrete and continuous latent variables. To address this problem, we introduce a novel generative model for behavior cloning, in a mode-separating manner. We also integrate our model with GAIL, to achieve robustness to the problem of compounding error caused by unseen states.
We show that our models outperform the state-of-the-art in the aforementioned experiments.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/e865cae8ee98d7b2113ccf8566eb22e160cf69ef.zip", "author": "Arash Vahabpour;QIUJING LU;Tianyi Wang;Omead Pooladzandi;Vwani Roychowdhury", "authorids": "~Arash_Vahabpour1;~QIUJING_LU1;~Tianyi_Wang3;~Omead_Pooladzandi1;vwani@g.ucla.edu", "gender": "M;;;M;", "homepage": ";;;;", "dblp": ";;;319/9453;", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;", "orcid": ";;0000-0002-7263-5852;;", "linkedin": "arash-vahabpour-93141a51/;;;omead-pooladzandi-8a5051109;", "or_profile": "~Arash_Vahabpour1;~QIUJING_LU1;~Tianyi_Wang3;~Omead_Pooladzandi1;vwani@g.ucla.edu", "aff": ";;University of California, Los Angeles;;", "aff_domain": ";;ucla.edu;;", "position": ";;PhD student;;", "bibtex": "@misc{\nvahabpour2022diverse,\ntitle={Diverse Imitation Learning via Self-Organizing Generative Models},\nauthor={Arash Vahabpour and QIUJING LU and Tianyi Wang and Omead Pooladzandi and Vwani Roychowdhury},\nyear={2022},\nurl={https://openreview.net/forum?id=NJTRDt9TPb}\n}", "github": "", "project": "", "reviewers": "baXn;PiJG;SX6m", "site": "https://openreview.net/forum?id=NJTRDt9TPb", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;3;3", "correctness": "2;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "3;2;4", "wc_summary_paper": "41;102;193", "wc_summary_review": "66;52;17", "wc_main_review": "189;275;341", "wc_review": "296;429;551", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 112.0, 62.455317361027525 ], "wc_summary_review_avg": [ 45.0, 20.607442021431645 ], "wc_main_review_avg": [ 268.3333333333333, 62.232538827279804 ], "wc_review_avg": [ 425.3333333333333, 104.1355953659565 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10174569965146701400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "NK5hHymegzo", "title": "On the One-sided Convergence of Adam-type Algorithms in Non-convex Non-concave Min-max Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adam-type methods, the extension of adaptive gradient methods, have shown great performance in the training of both supervised and unsupervised machine learning models. In particular, Adam-type optimizers have been widely used empirically as the default tool for training generative adversarial networks (GANs).
On the theory side, however, despite the existence of theoretical results showing the efficiency of Adam-type methods in minimization problems, the reason for their wonderful performance in GAN training remains unexplained. In existing works, the fast convergence has long been considered as one of the most important reasons and multiple works have been proposed to give a theoretical guarantee of the convergence to a critical point of min-max optimization algorithms under certain assumptions. In this paper, we first argue empirically that in GAN training, Adam does not converge to a critical point even upon successful training: only the generator is converging while the discriminator's gradient norm remains high throughout the training. We name this one-sided convergence. Then we bridge the gap between experiments and theory by showing that Adam-type algorithms provably converge to one-sided first-order stationary points in min-max optimization problems under the one-sided MVI condition. We also empirically verify that such a one-sided MVI condition is satisfied for standard GANs after being trained on standard datasets. To the best of our knowledge, this is the very first result that provides an empirical observation and a strict theoretical guarantee on the one-sided convergence of Adam-type algorithms in min-max optimization. ", "keywords": "Optimization;GAN;Convergence", "primary_area": "", "supplementary_material": "", "author": "Zehao Dou;Yuanzhi Li", "authorids": "~Zehao_Dou2;~Yuanzhi_Li1", "gender": "M;M", "homepage": "https://zehaodou-official.github.io;", "dblp": "224/5549.html;73/3628", "google_scholar": "CypbdCkAAAAJ;", "orcid": ";", "linkedin": "zehao-dou-870b4b133/;", "or_profile": "~Zehao_Dou2;~Yuanzhi_Li1", "aff": "Yale University;Carnegie Mellon University", "aff_domain": "yale.edu;andrew.cmu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\ndou2022on,\ntitle={On the One-sided Convergence of Adam-type Algorithms in Non-convex Non-concave Min-max Optimization},\nauthor={Zehao Dou and Yuanzhi Li},\nyear={2022},\nurl={https://openreview.net/forum?id=NK5hHymegzo}\n}", "github": "", "project": "", "reviewers": "wnya;M4vM;jDRv;nfde", "site": "https://openreview.net/forum?id=NK5hHymegzo", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "41;48;33;48", "wc_summary_review": "48;30;25;21", "wc_main_review": "253;225;279;393", "wc_review": "342;303;337;462", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 42.5, 6.18465843842649 ], "wc_summary_review_avg": [ 31.0, 10.319883720275147 ], "wc_main_review_avg": [ 287.5, 63.83376849285964 ], "wc_review_avg": [ 361.0, 60.212125024782175 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9733285267845754, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 13, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=2844495666344293158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Yale University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.cmu.edu", "aff_unique_abbr": "Yale;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "CycleMLP: A MLP-like Architecture for Dense Prediction", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6273", "id": "NMEceG4v69Y", "poster": "", "openreview": "https://openreview.net/forum?id=NMEceG4v69Y", "slides": "https://iclr.cc/virtual/2022/poster/6273", "video": "https://iclr.cc/virtual/2022/poster/6273", "author_site": "Shoufa Chen, Enze Xie, Chongjian GE, Runjian Chen, Ding Liang, Ping Luo", "tldr": "", "abstract": "This paper presents a simple MLP-like architecture, CycleMLP, which is a versatile backbone for visual recognition and dense predictions. As compared to modern MLP architectures, e.g. , MLP-Mixer, ResMLP, and gMLP, whose architectures are correlated to image size and thus are infeasible in object detection and segmentation, CycleMLP has two advantages compared to modern approaches. (1) It can cope\nwith various image sizes. (2) It achieves linear computational complexity to image size by using local windows. In contrast, previous MLPs have $O(N^2)$ computations due to fully spatial connections. We build a family of models which surpass existing MLPs and even state-of-the-art Transformer-based models, e.g. Swin Transformer, while using fewer parameters and FLOPs. We expand the MLP-like models\u2019 applicability, making them a versatile backbone for dense prediction tasks. CycleMLP achieves competitive results on object detection, instance segmentation, and semantic segmentation. In particular, CycleMLP-Tiny outperforms Swin-Tiny by 1.3% mIoU on ADE20K dataset with fewer FLOPs. 
Moreover, CycleMLP also shows excellent zero-shot robustness on ImageNet-C dataset.", "keywords": "MLP;Dense Prediction", "primary_area": "", "supplementary_material": "/attachment/fd8f09073416a32a523ca794fda894d24f943625.zip", "author": "Shoufa Chen;Enze Xie;Chongjian GE;Runjian Chen;Ding Liang;Ping Luo", "authorids": "~Shoufa_Chen1;~Enze_Xie1;~Chongjian_GE1;~Runjian_Chen1;~Ding_Liang1;~Ping_Luo2", "gender": "M;M;M;M;;", "homepage": "https://www.shoufachen.com;https://xieenze.github.io/;https://chongjiange.github.io;https://runjian-chen.github.io;;", "dblp": "187/4654;218/5441;287/4197;257/4647;;", "google_scholar": "ogoCvHEAAAAJ;42MVVPgAAAAJ;https://scholar.google.com.hk/citations?user=7DA_vcUAAAAJ;_USUMdAAAAAJ;;", "orcid": "0000-0002-6126-2595;;;0000-0003-0519-496X;;", "linkedin": ";;chongjian-ge-%EF%BC%88%E8%91%9B%E5%B4%87%E5%89%91%EF%BC%89-3b393310b/;;;", "or_profile": "~Shoufa_Chen1;~Enze_Xie1;~Chongjian_GE1;~Runjian_Chen1;~Ding_Liang1;~Ping_Luo2", "aff": "The University of Hong Kong;The University of Hong Kong;The University of Hong Kong;University of Hong Kong;;", "aff_domain": "hku.hk;hku.hk;hku.hk;hku.hk;;", "position": "PhD student;PhD student;PhD student;PhD student;;", "bibtex": "@inproceedings{\nchen2022cyclemlp,\ntitle={Cycle{MLP}: A {MLP}-like Architecture for Dense Prediction},\nauthor={Shoufa Chen and Enze Xie and Chongjian GE and Runjian Chen and Ding Liang and Ping Luo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NMEceG4v69Y}\n}", "github": "", "project": "", "reviewers": "4cNH;34rq;dMw3;QA1s", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;3;4", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "53;55;138;65", "wc_summary_review": "38;61;69;40", "wc_main_review": "185;98;278;173", "wc_review": "276;214;485;278", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 77.75, 35.08115591026043 ], "wc_summary_review_avg": [ 52.0, 13.322912594474229 ], "wc_main_review_avg": [ 183.5, 63.93942445784135 ], "wc_review_avg": [ 313.25, 102.44358203421042 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=NMEceG4v69Y", "email": "hku.hk;hku.hk;hku.hk;hku.hk;;", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "NMSugaVzIT", "title": "Inductive Bias of Multi-Channel Linear Convolutional Networks with Bounded Weight Norm", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "We provide a function space characterization of the inductive bias resulting from minimizing the $\\ell_2$ 
norm of the weights in multi-channel linear convolutional networks. We define an \\textit{induced regularizer} in the function space as the minimum $\\ell_2$ norm of weights of a network required to realize a function. For two-layer linear convolutional networks with $C$ output channels and kernel size $K$, we show the following: (a) If the inputs to the network have a single channel, the induced regularizer for any $K$ is \\textit{independent} of the number of output channels $C$. Furthermore, we derive that the regularizer is a norm given by a semidefinite program (SDP). (b) In contrast, for networks with multi-channel inputs, multiple output channels can be necessary to merely realize all matrix-valued linear functions and thus the inductive bias \\emph{does} depend on $C$. However, for sufficiently large $C$, the induced regularizer is again given by an SDP that is independent of $C$. In particular, the induced regularizers for $K=1$ and $K=D$ are given in closed form as the nuclear norm and the $\\ell_{2,1}$ group-sparse norm, respectively, of the Fourier coefficients.\nWe investigate the applicability of our theoretical results to a broader scope of ReLU convolutional networks through experiments on MNIST and CIFAR-10 datasets.", "keywords": "minimizing parameter l2 norm;representation cost;implicit bias", "primary_area": "", "supplementary_material": "/attachment/2bc044af677c6cc9fb78fe0f921dd3166c12ef8e.zip", "author": "Meena Jagadeesan;Ilya Razenshteyn;Suriya Gunasekar", "authorids": "~Meena_Jagadeesan1;~Ilya_Razenshteyn1;~Suriya_Gunasekar1", "gender": "F;M;", "homepage": "https://mjagadeesan.github.io;https://ilyaraz.org;http://sgunasekar.github.io", "dblp": "205/2407;10/7611;", "google_scholar": "XW62DrcAAAAJ;;EkREu_QAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Meena_Jagadeesan1;~Ilya_Razenshteyn1;~Suriya_Gunasekar1", "aff": "University of California, Berkeley;;Microsoft", "aff_domain": "berkeley.edu;;microsoft.com", "position": "PhD student;;Senior Researcher", "bibtex": "@misc{\njagadeesan2022inductive,\ntitle={Inductive Bias of Multi-Channel Linear Convolutional Networks with Bounded Weight Norm},\nauthor={Meena Jagadeesan and Ilya Razenshteyn and Suriya Gunasekar},\nyear={2022},\nurl={https://openreview.net/forum?id=NMSugaVzIT}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=NMSugaVzIT", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2276931106708519907&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Berkeley;Microsoft", "aff_unique_dep": ";Microsoft Corporation",
"aff_unique_url": "https://www.berkeley.edu;https://www.microsoft.com", "aff_unique_abbr": "UC Berkeley;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "NOApNZTiTNU", "title": "Aggressive Q-Learning with Ensembles: Achieving Both High Sample Efficiency and High Asymptotic Performance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, Truncated Quantile Critics (TQC), using distributional representation of critics, was shown to provide state-of-the-art asymptotic training performance on all environments from the MuJoCo continuous control benchmark suite. Also recently, Randomized Ensemble Double Q-Learning (REDQ), using a high update-to-data ratio and target randomization, was shown to achieve high sample efficiency that is competitive with state-of-the-art model-based methods. In this paper, we propose a novel model-free algorithm, Aggressive Q-Learning with Ensembles (AQE), which improves the sample-efficiency performance of REDQ and the asymptotic performance of TQC, thereby providing overall state-of-the-art performance during all stages of training. Moreover, AQE is very simple, requiring neither distributional representation of critics nor target randomization. ", "keywords": "deep reinforcement learning;off-policy;model-free;sample efficiency;ensembles", "primary_area": "", "supplementary_material": "/attachment/5134b581519a43620a07b8088b55e9b59d88edb8.zip", "author": "Yanqiu Wu;Xinyue Chen;Che Wang;Yiming Zhang;Zijian Zhou;Keith W. Ross", "authorids": "~Yanqiu_Wu1;~Xinyue_Chen1;~Che_Wang1;~Yiming_Zhang1;~Zijian_Zhou1;~Keith_W._Ross1", "gender": "F;F;M;M;M;M", "homepage": ";;https://watchernyu.github.io/me/;;;http://www.nyu.edu/projects/keithwross/", "dblp": ";124/5261;130/6621;76/5416-10;;r/KWRoss", "google_scholar": ";83MbL0IAAAAJ;cx_Kg8MAAAAJ;A7-xkvcAAAAJ;KjC2xroAAAAJ;https://scholar.google.com.tw/citations?user=RhUcYmQAAAAJ", "orcid": ";;;;;", "linkedin": "yanqiu-wu-0993b0193/;;;;;", "or_profile": "~Yanqiu_Wu1;~Xinyue_Chen1;~Che_Wang1;~Yiming_Zhang1;~Zijian_Zhou1;~Keith_W._Ross1", "aff": "New York University;New York University;New York University;Woven by Toyota;Carnegie Mellon University;New York University", "aff_domain": "nyu.edu;nyu.edu;nyu.edu;woven.toyota;andrew.cmu.edu;nyu.edu", "position": "PhD student;Undergrad student;PhD student;Researcher;MS student;Full Professor", "bibtex": "@misc{\nwu2022aggressive,\ntitle={Aggressive Q-Learning with Ensembles: Achieving Both High Sample Efficiency and High Asymptotic Performance},\nauthor={Yanqiu Wu and Xinyue Chen and Che Wang and Yiming Zhang and Zijian Zhou and Keith W. 
Ross},\nyear={2022},\nurl={https://openreview.net/forum?id=NOApNZTiTNU}\n}", "github": "", "project": "", "reviewers": "fDzQ;q4e8;DEA7;H9au;iNq8", "site": "https://openreview.net/forum?id=NOApNZTiTNU", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;4;4;4;4", "correctness": "3;3;4;3;3", "technical_novelty": "3;2;2;3;2", "empirical_novelty": "2;2;3;2;2", "wc_summary_paper": "44;38;132;125;31", "wc_summary_review": "13;23;58;38;52", "wc_main_review": "239;336;274;387;500", "wc_review": "296;397;464;550;583", "wc_reply_reviewers": "0;0;0;0;172", "wc_reply_authors": "175;358;307;285;162", "reply_reviewers": "0;0;0;0;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 74.0, 44.74371464239419 ], "wc_summary_review_avg": [ 36.8, 16.96349020691202 ], "wc_main_review_avg": [ 347.2, 91.78540188940724 ], "wc_review_avg": [ 458.0, 104.0096149401583 ], "wc_reply_reviewers_avg": [ 34.4, 68.8 ], "wc_reply_authors_avg": [ 257.4, 76.46332454190049 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.25000000000000006, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14947985579849006821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "New York University;Toyota;Carnegie Mellon University", "aff_unique_dep": ";Woven;", "aff_unique_url": "https://www.nyu.edu;https://www.toyota-global.com;https://www.cmu.edu", "aff_unique_abbr": "NYU;Toyota;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;Japan" }, { "id": "NP9T_pViXU", "title": "VIMPAC: Video Pre-Training via Masked Token Prediction and Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Video understanding relies on perceiving the overall global content and modeling its internal connections (e.g., causality, movement, and spatio-temporal correspondence). To learn these interactions, we apply a mask-then-predict pre-training task on the discretized video tokens generated via VQ-VAE. Unlike language, where the text tokens are more independent, neighboring video tokens typically have strong correlations (e.g., consecutive video frames usually look very similar), and hence uniformly masking individual tokens will make the task too trivial to learn useful representations. To deal with this issue, we propose a block-wise masking strategy where we mask neighboring video tokens in both spatial and temporal domains. We also add an augmentation-free contrastive learning method to further capture the global content by predicting whether the video clips are sampled from the same video. We pre-train our model on uncurated videos and show that our pre-trained model can reach state-of-the-art results on several video understanding datasets (e.g., SSV2, Diving48). Lastly, we provide detailed analyses of the model scalability and pre-training method design. 
", "keywords": "video;self-supervised learning;representation learning;pre-training;action recognition", "primary_area": "", "supplementary_material": "/attachment/92c5b4269e732fdb7fe2ce17a68fa9f54c7486d3.zip", "author": "Hao Tan;Jie Lei;Thomas Wolf;Mohit Bansal", "authorids": "~Hao_Tan1;~Jie_Lei3;~Thomas_Wolf1;~Mohit_Bansal2", "gender": "M;M;M;M", "homepage": "http://www.cs.unc.edu/~airsplay/;https://jayleicn.github.io/;https://thomwolf.io;https://www.cs.unc.edu/~mbansal/", "dblp": "94/877-2;;;32/5243.html", "google_scholar": "OV1Y3FUAAAAJ;SZN9FLIAAAAJ;D2H5EFEAAAAJ;DN8QtscAAAAJ", "orcid": ";;;", "linkedin": "hao-tan-23677180/;jie-lei-3b46a7126/;;", "or_profile": "~Hao_Tan1;~Jie_Lei3;~Thomas_Wolf1;~Mohit_Bansal2", "aff": "Adobe Systems;Department of Computer Science, University of North Carolina, Chapel Hill;Hugging Face;University of North Carolina at Chapel Hill", "aff_domain": "adobe.com;cs.unc.edu;huggingface.co;unc.edu", "position": "Research Scientist;PhD student;Researcher;Full Professor", "bibtex": "@misc{\ntan2022vimpac,\ntitle={{VIMPAC}: Video Pre-Training via Masked Token Prediction and Contrastive Learning},\nauthor={Hao Tan and Jie Lei and Thomas Wolf and Mohit Bansal},\nyear={2022},\nurl={https://openreview.net/forum?id=NP9T_pViXU}\n}", "github": "", "project": "", "reviewers": "E8qx;PC2U;hrwa", "site": "https://openreview.net/forum?id=NP9T_pViXU", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;5", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "61;39;130", "wc_summary_review": "23;25;62", "wc_main_review": "279;191;406", "wc_review": "363;255;598", "wc_reply_reviewers": "252;144;193", "wc_reply_authors": "876;721;1304", "reply_reviewers": "2;1;1", "reply_authors": "2;2;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 76.66666666666667, 38.76711091748892 ], "wc_summary_review_avg": [ 36.666666666666664, 17.93197020841702 ], "wc_main_review_avg": [ 292.0, 88.25342297422048 ], "wc_review_avg": [ 405.3333333333333, 143.19295451320997 ], "wc_reply_reviewers_avg": [ 196.33333333333334, 44.15377170248942 ], "wc_reply_authors_avg": [ 967.0, 246.55357767971378 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8537924174149509417&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Adobe;University of North Carolina;Hugging Face", "aff_unique_dep": "Adobe Systems Incorporated;Department of Computer Science;", "aff_unique_url": "https://www.adobe.com;https://www.unc.edu;https://huggingface.co", "aff_unique_abbr": "Adobe;UNC;Hugging Face", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "NPJ5zWk_IQj", "title": "Translating Robot Skills: Learning Unsupervised Skill Correspondences Across Robots", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we explore how we can endow robots with the ability to learn correspondences 
between their own skills, and those of morphologically different robots in different domains, in an entirely unsupervised manner. Our key insight is that morphologically different robots use similar task strategies to solve similar tasks. Based on this insight, we frame learning skill correspondences as a problem of matching distributions of sequences of skills across robots. We then present an unsupervised objective that encourages a learnt skill translation model to match these distributions across domains, inspired by recent advances in unsupervised machine translation. Our approach is able to learn semantically meaningful correspondences between skills across 3 robot domain pairs despite being completely unsupervised. Further, the learnt correspondences enable the transfer of task strategies across robots and domains. \nWe present dynamic visualizations of our results at https://sites.google.com/view/translatingrobotskills/home. ", "keywords": "Robot Skills;Unsupervised Correspondences;Unsupervised Learning;Alignment;Density Matching;Skill Learning;Robot Learning;Transfer Learning;Skill Transfer", "primary_area": "", "supplementary_material": "", "author": "Tanmay Shankar;Yixin Lin;Aravind Rajeswaran;Vikash Kumar;Stuart Anderson;Jean Oh", "authorids": "~Tanmay_Shankar1;~Yixin_Lin1;~Aravind_Rajeswaran1;~Vikash_Kumar2;~Stuart_Anderson1;~Jean_Oh2", "gender": "M;M;M;M;M;F", "homepage": "http://tanmayshankar.weebly.com/;https://yixinlin.net;http://aravindr93.github.io/;http://vikashplus.github.io/;;http://www.cs.cmu.edu/~jeanoh/", "dblp": ";236/9891;164/5778;82/7475;;62/4860", "google_scholar": "0k1qcvgAAAAJ;;_EJrRVAAAAAJ;nu3W--sAAAAJ;8orqBsYAAAAJ;", "orcid": ";;;;;", "linkedin": ";;;;stuartoanderson/;", "or_profile": "~Tanmay_Shankar1;~Yixin_Lin1;~Aravind_Rajeswaran1;~Vikash_Kumar2;~Stuart_Anderson1;~Jean_Oh2", "aff": "Carnegie Mellon University;Facebook AI Research;Meta Facebook;Meta Facebook;Meta;Carnegie Mellon University", "aff_domain": "cmu.edu;facebook.com;meta.com;facebook.com;meta.com;cmu.edu", "position": "PhD student;Research engineer;Research Scientist;Researcher;Researcher;Associate Professor", "bibtex": "@misc{\nshankar2022translating,\ntitle={Translating Robot Skills: Learning Unsupervised Skill Correspondences Across Robots},\nauthor={Tanmay Shankar and Yixin Lin and Aravind Rajeswaran and Vikash Kumar and Stuart Anderson and Jean Oh},\nyear={2022},\nurl={https://openreview.net/forum?id=NPJ5zWk_IQj}\n}", "github": "", "project": "", "reviewers": "dwTB;rjam;rKBg;KQQm", "site": "https://openreview.net/forum?id=NPJ5zWk_IQj", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "44;74;37;124", "wc_summary_review": "40;22;41;41", "wc_main_review": "369;299;138;422", "wc_review": "453;395;216;587", "wc_reply_reviewers": "0;207;146;0", "wc_reply_authors": "2756;1981;2092;1363", "reply_reviewers": "0;1;1;0", "reply_authors": "4;3;3;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.75, 34.266419421935524 ], "wc_summary_review_avg": [ 36.0, 8.093207028119323 ], "wc_main_review_avg": [ 307.0, 106.88077469779118 ], "wc_review_avg": [ 412.75, 133.23733523303443 ], "wc_reply_reviewers_avg": [ 88.25, 90.84705553841577 ], "wc_reply_authors_avg": [ 2048.0,
494.1947996488834 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17491515858263634565&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Meta", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.cmu.edu;https://research.facebook.com", "aff_unique_abbr": "CMU;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "NQrx8EYMboO", "title": "Task-Agnostic Graph Neural Explanations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have emerged as powerful tools to encode graph structured data. Due to their broad applications, there is an increasing need to develop tools to explain how GNNs make decisions given graph structured data. Existing learning-based GNN explanation approaches are task-specific in training and hence suffer from crucial drawbacks. Specifically, they are incapable of producing explanations for a multitask prediction model with a single explainer. They are also unable to provide explanations in cases where the GNN is trained in a self-supervised manner, and the resulting representations are used in future down-stream tasks. To address these limitations, we propose a Task-Agnostic Graph Neural Explainer (TAGE) trained under self-supervision without knowledge about downstream tasks. TAGE enables the explanation of GNN embedding models without downstream tasks and allows efficient explanation of multitask models. 
Our extensive experiments show that TAGE can significantly improve explanation efficiency while achieving explanation quality as good as or even better than current state-of-the-art GNN explanation approaches.", "keywords": "Explainability;interpretability;graph neural networks;self-supervised learning", "primary_area": "", "supplementary_material": "/attachment/6e71213820477084613f11340b51e312a2743513.zip", "author": "Yaochen Xie;Sumeet Katariya;Xianfeng Tang;Edward W Huang;Nikhil Rao;Karthik Subbian;Shuiwang Ji", "authorids": "~Yaochen_Xie1;~Sumeet_Katariya1;~Xianfeng_Tang1;~Edward_W_Huang1;~Nikhil_Rao1;~Karthik_Subbian1;~Shuiwang_Ji1", "gender": "M;;M;M;M;M;M", "homepage": "https://ycremar.github.io/;;https://xta.ng/;;;http://mailtosuka.googlepages.com;http://people.tamu.edu/~sji", "dblp": "227/7154;72/9639;33/7694;192/2417.html;57/9513.html;32/5843;84/6405", "google_scholar": "Xw3ZjnMAAAAJ;;u1PEv-QAAAAJ;EqvdkCAAAAAJ;GhqD_rwAAAAJ;;BZGj6sAAAAAJ", "orcid": ";;;0000-0002-4461-8545;;;0000-0002-4205-4563", "linkedin": ";;xianfengtang/;ewhuang/;nikhil-rao-012068a1/;;shuiwang-ji-9a040715/", "or_profile": "~Yaochen_Xie1;~Sumeet_Katariya1;~Xianfeng_Tang1;~Edward_W_Huang1;~Nikhil_Rao1;~Karthik_Subbian1;~Shuiwang_Ji1", "aff": "Texas A&M;Amazon;Amazon;Amazon;Amazon;Amazon;Texas A&M University", "aff_domain": "tamu.edu;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;tamu.edu", "position": "PhD student;Applied Scientist;Researcher;Applied Scientist;Scientist;Researcher;Professor", "bibtex": "@misc{\nxie2022taskagnostic,\ntitle={Task-Agnostic Graph Neural Explanations},\nauthor={Yaochen Xie and Sumeet Katariya and Xianfeng Tang and Edward W Huang and Nikhil Rao and Karthik Subbian and Shuiwang Ji},\nyear={2022},\nurl={https://openreview.net/forum?id=NQrx8EYMboO}\n}", "github": "", "project": "", "reviewers": "CLGN;y6T6;QC3Y;ovjT", "site": "https://openreview.net/forum?id=NQrx8EYMboO", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;5;3;3", "correctness": "4;3;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "127;87;42;83", "wc_summary_review": "30;34;18;13", "wc_main_review": "172;642;237;219", "wc_review": "329;763;297;315", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "655;1933;776;628", "reply_reviewers": "0;0;0;0", "reply_authors": "2;5;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 84.75, 30.086334107032716 ], "wc_summary_review_avg": [ 23.75, 8.554969316134336 ], "wc_main_review_avg": [ 317.5, 188.8471604234493 ], "wc_review_avg": [ 426.0, 194.89740891043164 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 998.0, 542.6919015426708 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:E6-SLDV5yIkJ:scholar.google.com/&scioq=Task-Agnostic+Graph+Neural+Explanations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1;1;0", "aff_unique_norm": "Texas A&M University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.tamu.edu;https://www.amazon.com", "aff_unique_abbr": "TAMU;Amazon",
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "NRAZXJ9q3z", "title": "CDPS: Constrained DTW-Preserving Shapelets", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The analysis of time series for clustering and classification is becoming ever more popular because of the increasingly ubiquitous nature of IoT, satellite constellations, and handheld and smart-wearable devices, etc. Euclidean distance is unsuitable because of potential phase shift, differences in sample duration, and compression and dilation of characteristic signals. As such, several similarity measures specific to time-series have been proposed, Dynamic Time Warping (DTW) being the most popular. Nevertheless, DTW does not respect the axioms of a metric and therefore DTW-preserving shapelets have been developed to regain these properties. This unsupervised approach to representation learning models DTW properties through the shapelet transform. This article proposes constrained DTW-preserving shapelets (CDPS), in which a limited amount of user knowledge is available in the form of must link and cannot link constraints, to guide the representation such that it better captures the user\u2019s interpretation of the data rather than the algorithm\u2019s bias. Subsequently, any unconstrained algorithm can be applied, e.g. K-means clustering, k-NN classification, etc, to obtain a result that fulfills the constraints (without explicit knowledge of them). Furthermore, this representation is generalisable to out-of-sample data, overcoming the limitations of standard transductive constrained-clustering algorithms. The proposed algorithm is studied on multiple time-series datasets, and its advantages over classical constrained clustering algorithms and unsupervised DTW-preserving shapelets are demonstrated. 
An open-source implementation based on PyTorch is available to take full advantage of GPU acceleration", "keywords": "Shapelets;Representational Learning;Clustering;Constrained Clustering;Constrained Learning", "primary_area": "", "supplementary_material": "", "author": "Hussein El Amouri;Thomas Lampert;Pierre Gan\u00e7arski;Clement Mallet", "authorids": "~Hussein_El_Amouri1;lampert@unistra.fr;gancarski@unistra.fr;~Clement_Mallet1", "gender": "M;;;M", "homepage": "https://sdc.icube.unistra.fr/en/index.php/Hussein_El_Amouri;;;https://www.umr-lastig.fr/clement-mallet/", "dblp": ";;;26/8109", "google_scholar": "https://scholar.google.com/citations?view_op=search_authors;;;https://scholar.google.fr/citations?user=1XoK-YMAAAAJ", "orcid": ";;;0000-0002-2675-165X", "linkedin": ";;;clementmallet/", "or_profile": "~Hussein_El_Amouri1;lampert@unistra.fr;gancarski@unistra.fr;~Clement_Mallet1", "aff": "university of strasbourg;;;IGN", "aff_domain": "unistra.fr;;;ign.fr", "position": "PhD student;;;Researcher", "bibtex": "@misc{\namouri2022cdps,\ntitle={{CDPS}: Constrained {DTW}-Preserving Shapelets},\nauthor={Hussein El Amouri and Thomas Lampert and Pierre Gan{\\c{c}}arski and Clement Mallet},\nyear={2022},\nurl={https://openreview.net/forum?id=NRAZXJ9q3z}\n}", "github": "", "project": "", "reviewers": "CqVd;H16k;h3Yy;eppg", "site": "https://openreview.net/forum?id=NRAZXJ9q3z", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;5;4", "correctness": "2;2;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "20;54;65;68", "wc_summary_review": "71;26;57;27", "wc_main_review": "379;163;453;254", "wc_review": "470;243;575;349", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "89;87;77;74", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 51.75, 19.057478846898924 ], "wc_summary_review_avg": [ 45.25, 19.395553614166314 ], "wc_main_review_avg": [ 312.25, 111.73042334118313 ], "wc_review_avg": [ 409.25, 124.9327319000109 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 81.75, 6.378675411086537 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10229706527860352137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "University of Strasbourg;IGN Entertainment", "aff_unique_dep": ";", "aff_unique_url": "https://www.unistra.fr;https://www.ign.com", "aff_unique_abbr": "Unistra;IGN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "France;United States" }, { "title": "Memory Augmented Optimizers for Deep Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6310", "id": "NRX9QZ6yqt", "poster": "", "openreview": "https://openreview.net/forum?id=NRX9QZ6yqt", "slides": "https://iclr.cc/virtual/2022/poster/6310", "video": "https://iclr.cc/virtual/2022/poster/6310", "author_site": "Paul-Aymeric McRae, Prasanna Parthasarathi, Mido Assran, Sarath Chandar", "tldr": "", "abstract": "Popular approaches for minimizing loss in data-driven learning often involve an 
abstraction or an explicit retention of the history of gradients for efficient parameter updates. \nThe aggregated history of gradients nudges the parameter updates in the right direction even when the gradients at any given step are not informative. \nAlthough the history of gradients summarized in meta-parameters or explicitly stored in memory has been shown effective in theory and practice, the question of whether $all$ or only a subset of the gradients in the history are sufficient in deciding the parameter updates remains unanswered. \nIn this paper, we propose a framework of memory-augmented gradient descent optimizers that retain a limited view of their gradient history in their internal memory. \nSuch optimizers scale well to large real-life datasets, and our experiments show that the memory augmented extensions of standard optimizers enjoy accelerated convergence and improved performance on a majority of computer vision and language tasks that we considered.\nAdditionally, we prove that the proposed class of optimizers with fixed-size memory converge under assumptions of strong convexity, regardless of which gradients are selected or how they are linearly combined to form the update step.", "keywords": "Optimization for Deep learning;Memory augmented Optimizers", "primary_area": "", "supplementary_material": "/attachment/9f4edb820d39b8c6dfcf0a140b6cb7cce18cf7b8.zip", "author": "Paul-Aymeric Martin McRae;Prasanna Parthasarathi;Mido Assran;Sarath Chandar", "authorids": "~Paul-Aymeric_Martin_McRae1;~Prasanna_Parthasarathi2;~Mido_Assran1;~Sarath_Chandar1", "gender": "M;M;M;M", "homepage": "https://aymeric-mcrae.github.io/;https://www.cs.mcgill.ca/~pparth2/;http://sarathchandar.in/;http://www.midoassran.ca/", "dblp": ";211/7503;45/8542;216/2717", "google_scholar": ";https://scholar.google.co.in/citations?hl=en;https://scholar.google.co.in/citations?user=yxWtZLAAAAAJ;gcQTTvkAAAAJ", "orcid": "0000-0002-7678-2147;;;0000-0001-9159-8447", "linkedin": ";prasanna-parthasarathi/;;", "or_profile": "~Paul-Aymeric_Martin_McRae1;~Prasanna_Parthasarathi2;~Sarath_Chandar1;~Mahmoud_Assran1", "aff": "Michigan State University;McGill University;\u00c9cole Polytechnique de Montr\u00e9al;Meta Facebook", "aff_domain": "msu.edu;mcgill.ca;polymtl.ca;fb.com", "position": "PhD student;PhD student;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nmcrae2022memory,\ntitle={Memory Augmented Optimizers for Deep Learning},\nauthor={Paul-Aymeric Martin McRae and Prasanna Parthasarathi and Mido Assran and Sarath Chandar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NRX9QZ6yqt}\n}", "github": "", "project": "", "reviewers": "KE2X;tj5C;XUqb;v5GJ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "102;34;70;83", "wc_summary_review": "63;7;44;73", "wc_main_review": "362;103;187;144", "wc_review": "527;144;301;300", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "421;323;325;437", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.25, 24.843258642939738 ], "wc_summary_review_avg": [ 46.75, 25.202926417382564 ], 
"wc_main_review_avg": [ 199.0, 98.68383859579035 ], "wc_review_avg": [ 318.0, 136.53754062528006 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 376.5, 52.80861672113747 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11073351928197752868&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=NRX9QZ6yqt", "email": "msu.edu;mcgill.ca;polymtl.ca;fb.com", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Michigan State University;McGill University;\u00c9cole Polytechnique de Montr\u00e9al;Meta", "aff_unique_dep": ";;;Meta Platforms, Inc.", "aff_unique_url": "https://www.msu.edu;https://www.mcgill.ca;https://www.polymtl.ca;https://meta.com", "aff_unique_abbr": "MSU;McGill;Polytechnique Montr\u00e9al;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montr\u00e9al", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;Canada" }, { "id": "NUzrPpDjWp", "title": "Large-Scale Adversarial Attacks on Graph Neural Networks via Graph Coarsening", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph Neural Networks (GNNs) are fragile to adversarial attacks. However, existing state-of-the-art adversarial attack methods against GNNs are typically constrained by the graph's scale, failing to attack large graphs effectively. In this paper, we propose a novel attack method that attacks the graph in a divide-and-conquer manner to tackle large-scale adversarial attacks on GNNs. Specifically, the nodes are clustered based on node embeddings, coarsened graphs are constructed using the node clusters, and attacks are conducted on the coarsened graphs. Perturbations are selected starting with smaller coarsened graphs and progressing to larger detailed graphs while most of the irrelative nodes remain clustered, significantly reducing the complexity of generating adversarial graphs. 
Extensive empirical results show that the proposed method can greatly save the computational resources required to attack GNNs on large graphs while maintaining comparable performance on small graphs.", "keywords": "Graph Neural Network;Adversarial Attacks;Graph Coarsening", "primary_area": "", "supplementary_material": "/attachment/fc55b184e1b028411503dceaf72cde6076e810c9.zip", "author": "Jianfu Zhang;Yan Hong;Liqing Zhang;Qibin Zhao", "authorids": "~Jianfu_Zhang2;~Yan_Hong1;~Liqing_Zhang2;~Qibin_Zhao1", "gender": "M;F;M;M", "homepage": "https://matt-sjtu.github.io/;https://github.com/hy-zpg;http://bcmi.sjtu.edu.cn/~zhangliqing/;https://qibinzhao.github.io", "dblp": "78/3993-3;68/974-2.html;20/4627-1.html;13/1193", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=ztq5-xcAAAAJ;1smFmxAAAAAJ;https://scholar.google.co.jp/citations?hl=en", "orcid": "0000-0002-2673-5860;0000-0001-6401-0812;;0000-0002-4442-3182", "linkedin": ";;;", "or_profile": "~Jianfu_Zhang2;~Yan_Hong1;~Liqing_Zhang2;~Qibin_Zhao1", "aff": "RIKEN;Shanghai Jiaotong University;Shanghai Jiaotong University;RIKEN", "aff_domain": "riken.jp;sjtu.edu;sjtu.edu.cn;riken.jp", "position": "Postdoc;PhD student;Full Professor;Team Leader", "bibtex": "@misc{\nzhang2022largescale,\ntitle={Large-Scale Adversarial Attacks on Graph Neural Networks via Graph Coarsening},\nauthor={Jianfu Zhang and Yan Hong and Liqing Zhang and Qibin Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=NUzrPpDjWp}\n}", "github": "", "project": "", "reviewers": "EDtB;4SzH;h13J;J6he", "site": "https://openreview.net/forum?id=NUzrPpDjWp", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "2;2;3;3", "technical_novelty": "3;1;2;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "84;89;46;74", "wc_summary_review": "103;56;19;37", "wc_main_review": "411;891;445;202", "wc_review": "598;1036;510;313", "wc_reply_reviewers": "0;0;0;12", "wc_reply_authors": "0;0;0;52", "reply_reviewers": "0;0;0;1", "reply_authors": "0;0;0;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.25, 16.63392617513977 ], "wc_summary_review_avg": [ 53.75, 31.299960063872287 ], "wc_main_review_avg": [ 487.25, 250.98842104766507 ], "wc_review_avg": [ 614.25, 264.4601813128018 ], "wc_reply_reviewers_avg": [ 3.0, 5.196152422706632 ], "wc_reply_authors_avg": [ 13.0, 22.516660498395403 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:A1kDxQiMxp0J:scholar.google.com/&scioq=Large-Scale+Adversarial+Attacks+on+Graph+Neural+Networks+via+Graph+Coarsening&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "RIKEN;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.riken.jp;https://www.sjtu.edu.cn", "aff_unique_abbr": "RIKEN;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Japan;China" }, { "id": "NX0nX7TE4lc", "title": "DIVERSIFY to Generalize: Learning Generalized 
Representations for Time Series Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Time series classification is an important problem in the real world. Due to its nonstationary nature, whereby the distribution changes over time, it remains challenging to build models for generalization to unseen distributions. In this paper, we propose to view the time series classification problem from the distribution perspective. We argue that the temporal complexity is attributable to the unknown latent distributions within. To this end, we propose DIVERSIFY to learn generalized representations for time series classification. DIVERSIFY uses an iterative process: it first obtains the worst-case distribution scenario via adversarial training, then matches the distributions between all segments. We also present some theoretical insights. Extensive experiments on gesture recognition, speech commands recognition, and sensor-based human activity recognition demonstrate that DIVERSIFY significantly outperforms other baselines while effectively characterizing the latent distributions by qualitative and quantitative analysis.", "keywords": "Time series classification;domain generalization", "primary_area": "", "supplementary_material": "", "author": "Wang Lu;Jindong Wang;Yiqiang Chen;Xinwei Sun", "authorids": "~Wang_Lu1;~Jindong_Wang1;~Yiqiang_Chen1;~Xinwei_Sun1", "gender": "M;M;M;M", "homepage": "http://www.ict.cas.cn/sourcedb_2018_ict_cas/cn/jssrck/200909/t20090917_2496596.html;https://sunxinwei0625.github.io/sunxw.github.io/;https://jd92.wang/;", "dblp": ";145/6592-1;19/2969-1;", "google_scholar": "LC3SwhEAAAAJ;;hBZ_tKsAAAAJ;7OzlbvIAAAAJ", "orcid": ";;0000-0002-4833-0880;", "linkedin": ";;jindong-wang/;", "or_profile": "~Yiqiang_Chen1;~Xinwei_Sun1;~Jindong_Wang4;~wanglu2", "aff": "Chinese Academy of Sciences;Fudan University;Microsoft Research;, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;fudan.edu.cn;microsoft.com;ict.ac.cn", "position": "Full Professor;Assistant Professor;Researcher;PhD student", "bibtex": "@misc{\nlu2022diversify,\ntitle={{DIVERSIFY} to Generalize: Learning Generalized Representations for Time Series Classification},\nauthor={Wang Lu and Jindong Wang and Yiqiang Chen and Xinwei Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=NX0nX7TE4lc}\n}", "github": "", "project": "", "reviewers": "8pYF;7NdQ;kiW8", "site": "https://openreview.net/forum?id=NX0nX7TE4lc", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;4", "correctness": "2;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;3;0", "wc_summary_paper": "139;93;51", "wc_summary_review": "30;63;53", "wc_main_review": "709;338;438", "wc_review": "878;494;542", "wc_reply_reviewers": "721;197;22", "wc_reply_authors": "2043;1667;397", "reply_reviewers": "2;1;1", "reply_authors": "4;4;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 94.33333333333333, 35.93821859184948 ], "wc_summary_review_avg": [ 48.666666666666664, 13.816254517375139 ], "wc_main_review_avg": [ 495.0, 156.73119238577453 ], "wc_review_avg": [ 638.0, 170.83325203250098 ], "wc_reply_reviewers_avg": [ 313.3333333333333, 296.9852222286863 ], "wc_reply_authors_avg": [ 1369.0, 704.2404892269875 ], "reply_reviewers_avg": [ 
1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16332707287557126046&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Chinese Academy of Sciences;Fudan University;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.cas.cn;https://www.fudan.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CAS;Fudan;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Safe Neurosymbolic Learning with Differentiable Symbolic Execution", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6860", "id": "NYBmJN4MyZ", "poster": "", "openreview": "https://openreview.net/forum?id=NYBmJN4MyZ", "slides": "https://iclr.cc/virtual/2022/poster/6860", "video": "https://iclr.cc/virtual/2022/poster/6860", "author_site": "Chenxi Yang, Swarat Chaudhuri", "tldr": "", "abstract": "We study the problem of learning verifiably safe parameters for programs that use neural networks as well as symbolic, human-written code. Such neurosymbolic programs arise in many safety-critical domains. However, because they need not be differentiable, it is hard to learn their parameters using existing gradient-based approaches to safe learning. Our method, Differentiable Symbolic Execution (DSE), samples control flow paths in a program, symbolically constructs worst-case \"safety loss\" along these paths, and backpropagates the gradients of these losses through program operations using a generalization of the REINFORCE estimator. We evaluate the method on a mix of synthetic tasks and real-world benchmarks. Our experiments show that DSE significantly outperforms the state-of-the-art DiffAI method on these tasks. 
", "keywords": "Verified Learning;Neurosymbolic Programs;Safe Learning;Symbolic Execution", "primary_area": "", "supplementary_material": "", "author": "Chenxi Yang;Swarat Chaudhuri", "authorids": "~Chenxi_Yang1;~Swarat_Chaudhuri1", "gender": "F;M", "homepage": "https://chenxi-yang.github.io/;http://www.cs.utexas.edu/~swarat", "dblp": ";37/6100", "google_scholar": ";9j6RBYQAAAAJ", "orcid": ";0000-0002-6859-1391", "linkedin": ";swarat-chaudhuri-609b3092/", "or_profile": "~Chenxi_Yang1;~Swarat_Chaudhuri1", "aff": "University of Texas, Austin;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nyang2022safe,\ntitle={Safe Neurosymbolic Learning with Differentiable Symbolic Execution},\nauthor={Chenxi Yang and Swarat Chaudhuri},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NYBmJN4MyZ}\n}", "github": "", "project": "", "reviewers": "yj7y;X2YM;ohyh", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "4;3;4", "empirical_novelty": "3;2;0", "wc_summary_paper": "85;64;124", "wc_summary_review": "56;46;122", "wc_main_review": "279;319;1506", "wc_review": "420;429;1752", "wc_reply_reviewers": "167;305;0", "wc_reply_authors": "1281;2902;4744", "reply_reviewers": "2;4;0", "reply_authors": "3;7;8", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 91.0, 24.859605789312106 ], "wc_summary_review_avg": [ 74.66666666666667, 33.717782977071444 ], "wc_main_review_avg": [ 701.3333333333334, 569.2195436638564 ], "wc_review_avg": [ 867.0, 625.8002876317652 ], "wc_reply_reviewers_avg": [ 157.33333333333334, 124.70320320219882 ], "wc_reply_authors_avg": [ 2975.6666666666665, 1414.7231374214375 ], "reply_reviewers_avg": [ 2.0, 1.632993161855452 ], "reply_authors_avg": [ 6.0, 2.160246899469287 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17654841530878333183&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=NYBmJN4MyZ", "email": "utexas.edu;utexas.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "NZQ8aTScT1-", "title": "Eigenspace Restructuring: a Principle of Space and Frequency in Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding the fundamental principles behind the massive success of neural networks is one of the most important open questions in deep learning. However, due to the highly complex nature of the problem, progress has been relatively slow.In this note, through the lens of infinite-width networks, a.k.a. neural kernels, we present one such principle resulting from hierarchical locality. 
It is well-known that the eigenstructure of infinite-width multilayer perceptrons (MLPs) depends solely on the concept frequency, which measures the order of interactions. We show that the topologies from convolutional networks (CNNs) restructure the associated eigenspaces into finer subspaces. In addition to frequency, the new structure also depends on the concept space\u2014 the distance among interaction terms, defined via the length of a minimum spanning tree containing them. The resulting fine-grained eigenstructure dramatically improves the network\u2019s learnability, empowering them to simultaneously model a much richer class of interactions, including long-range-low-frequency interactions, short-range-high-frequency interactions, and various interpolations and extrapolations in-between. Finally, we show that increasing the depth of a CNN can improve the inter/extrapolation resolution and, therefore, the network\u2019s learnability.", "keywords": "neural network gaussian process;neural tangent kernels;eigenstructure;space and frequency;convolutional networks;spherical harmonics;hierarchical locality;over-parameterized networks", "primary_area": "", "supplementary_material": "", "author": "Lechao Xiao", "authorids": "~Lechao_Xiao2", "gender": "M", "homepage": "https://sites.google.com/site/lechaoxiao/", "dblp": "222/3238", "google_scholar": "fvwzUnIAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Lechao_Xiao2", "aff": "Google Research, Brain Team", "aff_domain": "google.com", "position": "Research Scientist", "bibtex": "@misc{\nxiao2022eigenspace,\ntitle={Eigenspace Restructuring: a Principle of Space and Frequency in Neural Networks},\nauthor={Lechao Xiao},\nyear={2022},\nurl={https://openreview.net/forum?id=NZQ8aTScT1-}\n}", "github": "", "project": "", "reviewers": "nazU;JW33;V5xU;aE2B", "site": "https://openreview.net/forum?id=NZQ8aTScT1-", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;3;2;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "32;21;97;17", "wc_summary_review": "106;13;75;12", "wc_main_review": "220;202;347;484", "wc_review": "358;236;519;513", "wc_reply_reviewers": "0;0;34;33", "wc_reply_authors": "1151;356;1388;1445", "reply_reviewers": "0;0;1;1", "reply_authors": "2;1;3;3", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 41.75, 32.36800117399899 ], "wc_summary_review_avg": [ 51.5, 40.512343797909296 ], "wc_main_review_avg": [ 313.25, 113.3211696903981 ], "wc_review_avg": [ 406.5, 117.70832595870183 ], "wc_reply_reviewers_avg": [ 16.75, 16.753730927766508 ], "wc_reply_authors_avg": [ 1085.0, 435.0879221490755 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18057674622423157017&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", 
"aff_country_unique": "United States" }, { "id": "NblYkw2U2Yg", "title": "A Generalised Inverse Reinforcement Learning Framework", "track": "main", "status": "Reject", "tldr": "", "abstract": "The global objective of inverse Reinforcement Learning (IRL) is to estimate the unknown cost function of some MDP based on observed trajectories generated by (approximate) optimal policies. The classical approach consists in tuning this cost function so that associated optimal trajectories (that minimise the cumulative discounted cost, i.e. the classical RL loss) are \u201csimilar\u201d to the observed ones. Prior contributions focused on penalising degenerate solutions and improving algorithmic scalability. Quite orthogonally to them, we question the pertinence of characterising optimality with respect to the cumulative discounted cost as it induces an implicit bias against policies with longer mixing times. State of the art value based RL algorithms circumvent this issue by solving for the fixed point of the Bellman optimality operator, a stronger criterion that is not well defined for the inverse problem.\nTo alleviate this bias in IRL, we introduce an alternative training loss that puts more weights on future states which yields a reformulation of the (maximum entropy) IRL problem. The algorithms we devised exhibit enhanced performances (and similar tractability) than off-the-shelf ones in multiple OpenAI gym environments.", "keywords": "IRL", "primary_area": "", "supplementary_material": "/attachment/67bc9ff82df8aa1d749bcf9020cfc81aceefecbe.zip", "author": "Firas Jarboui;Vianney Perchet", "authorids": "~Firas_Jarboui1;~Vianney_Perchet3", "gender": "M;M", "homepage": ";", "dblp": ";83/7398", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Firas_Jarboui1;~Vianney_Perchet1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\njarboui2022a,\ntitle={A Generalised Inverse Reinforcement Learning Framework},\nauthor={Firas Jarboui and Vianney Perchet},\nyear={2022},\nurl={https://openreview.net/forum?id=NblYkw2U2Yg}\n}", "github": "", "project": "", "reviewers": "Msxm;hoAL;j2NT", "site": "https://openreview.net/forum?id=NblYkw2U2Yg", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "232;139;59", "wc_summary_review": "26;53;28", "wc_main_review": "501;334;171", "wc_review": "759;526;258", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 143.33333333333334, 70.69339117311856 ], "wc_summary_review_avg": [ 35.666666666666664, 12.283683848458853 ], "wc_main_review_avg": [ 335.3333333333333, 134.72523478876883 ], "wc_review_avg": [ 514.3333333333334, 204.6986945624118 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5236645994869262084&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4 }, { "id": "Nct9j3BVswZ", "title": "Self-Supervise, Refine, Repeat: Improving Unsupervised Anomaly Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Anomaly detection (AD) - separating anomalies from normal data - has many applications across domains, from manufacturing to healthcare. While most previous works have been shown to be effective for cases with fully or partially labeled data, that setting is in practice less common due to labeling being particularly tedious for this task. In this paper, we focus on fully unsupervised AD, in which the entire training dataset, containing both normal and anomalous samples, is unlabeled. To tackle this problem effectively, we propose to improve the robustness of one-class classification trained on self-supervised representations using a data refinement process. Our proposed data refinement approach is based on an ensemble of one-class classifiers (OCCs), each of which is trained on a disjoint subset of training data. Representations learned by self-supervised learning on the refined data are iteratively updated as the refinement improves. We demonstrate our method on various unsupervised AD tasks with image and tabular data. With a 10% anomaly ratio on CIFAR-10 image data / 2.5% anomaly ratio on Thyroid tabular data, the proposed method outperforms the state-of-the-art one-class classification method by 6.3 AUC and 12.5 average precision / 22.9 F1-score.", "keywords": "Anomaly detection;Data refinement;Iterative training", "primary_area": "", "supplementary_material": "", "author": "Jinsung Yoon;Kihyuk Sohn;Chun-Liang Li;Sercan O Arik;Chen-Yu Lee;Tomas Pfister", "authorids": "~Jinsung_Yoon1;~Kihyuk_Sohn1;~Chun-Liang_Li1;~Sercan_O_Arik1;~Chen-Yu_Lee2;~Tomas_Pfister1", "gender": "M;M;M;M;;M", "homepage": "https://sites.google.com/corp/view/jinsungyoon;https://sites.google.com/site/kihyuksml/;http://chunliangli.github.io;https://www.sercanarik.com/;https://chl260.github.io/;http://tomas.pfister.fi", "dblp": "173/5409.html;53/10771;;;04/656;14/8360", "google_scholar": "kiFd6A8AAAAJ;VxpypngAAAAJ;https://scholar.google.com.tw/citations?user=vqHIt_sAAAAJ;;uWPUSEgAAAAJ;ahSpJOAAAAAJ", "orcid": ";;;0000-0001-6333-1729;;0009-0004-4088-8718", "linkedin": "jinsung-yoon-bb7751b8;;;;chenyulee260/;", "or_profile": "~Jinsung_Yoon1;~Kihyuk_Sohn1;~Chun-Liang_Li1;~Sercan_O_Arik1;~Chen-Yu_Lee2;~Tomas_Pfister1", "aff": "Google;Google;Google;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com;google.com", "position": "Research Scientist;Research Scientist;Researcher;Research Scientist;Research Scientist;Head of Research @ Cloud AI", "bibtex": "@misc{\nyoon2022selfsupervise,\ntitle={Self-Supervise, Refine, Repeat: Improving Unsupervised Anomaly Detection},\nauthor={Jinsung Yoon and Kihyuk Sohn and Chun-Liang Li and Sercan O Arik and Chen-Yu Lee and Tomas Pfister},\nyear={2022},\nurl={https://openreview.net/forum?id=Nct9j3BVswZ}\n}", "github": "", "project": "", "reviewers": "MspQ;s1MH;gVix;7NNr", "site": "https://openreview.net/forum?id=Nct9j3BVswZ", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;5;5;5", "correctness": "3;4;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "67;38;111;120", "wc_summary_review": "44;6;117;65", "wc_main_review": "218;62;453;259", "wc_review": "329;106;681;444", "wc_reply_reviewers": "0;0;81;0", "wc_reply_authors": "703;17;997;778", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", 
"recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 84.0, 33.279122584587476 ], "wc_summary_review_avg": [ 58.0, 40.09364039345891 ], "wc_main_review_avg": [ 248.0, 139.32157047636235 ], "wc_review_avg": [ 390.0, 207.34874004922239 ], "wc_reply_reviewers_avg": [ 20.25, 35.074028853269766 ], "wc_reply_authors_avg": [ 623.75, 366.58380692551054 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11138427271668234834&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "BAM: Bayes with Adaptive Memory", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6508", "id": "NdOoQnYPj_", "poster": "", "openreview": "https://openreview.net/forum?id=NdOoQnYPj_", "slides": "https://iclr.cc/virtual/2022/poster/6508", "video": "https://iclr.cc/virtual/2022/poster/6508", "author_site": "Josue Nassar, Jennifer Brennan, Ben Evans, Kendall Lowrey", "tldr": "", "abstract": "Online learning via Bayes' theorem allows new data to be continuously integrated into an agent's current beliefs. However, a naive application of Bayesian methods in non-stationary environments leads to slow adaptation and results in state estimates that may converge confidently to the wrong parameter value. A common solution when learning in changing environments is to discard/downweight past data; however, this simple mechanism of \"forgetting\" fails to account for the fact that many real-world environments involve revisiting similar states. We propose a new framework, Bayes with Adaptive Memory (BAM), that takes advantage of past experience by allowing the agent to choose which past observations to remember and which to forget. We demonstrate that BAM generalizes many popular Bayesian update rules for non-stationary environments. 
Through a variety of experiments, we demonstrate the ability of BAM to continuously adapt in an ever-changing world.", "keywords": "Bayesian learning;online learning", "primary_area": "", "supplementary_material": "", "author": "Josue Nassar;Jennifer Rogers Brennan;Ben Evans;Kendall Lowrey", "authorids": "~Josue_Nassar1;~Jennifer_Rogers_Brennan1;~Ben_Evans1;~Kendall_Lowrey1", "gender": "M;F;M;M", "homepage": ";https://homes.cs.washington.edu/~jrb/;;https://bennevans.github.io/", "dblp": "230/8314;259/3055;153/7403;87/9175", "google_scholar": "a5RNqTYAAAAJ;;;JPQom2sAAAAJ", "orcid": ";;;", "linkedin": ";;;bnevans/", "or_profile": "~Josue_Nassar1;~Jennifer_Rogers_Brennan1;~Kendall_Lowrey1;~Benjamin_Evans1", "aff": "State University of New York, Stony Brook;University of Washington;University of Washington, Seattle;Microsoft", "aff_domain": "stonybrook.edu;washington.edu;uw.edu;microsoft.com", "position": "PhD student;PhD student;Postdoc;Intern", "bibtex": "@inproceedings{\nnassar2022bam,\ntitle={{BAM}: Bayes with Adaptive Memory},\nauthor={Josue Nassar and Jennifer Rogers Brennan and Ben Evans and Kendall Lowrey},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NdOoQnYPj_}\n}", "github": "", "project": "", "reviewers": "EUxk;o4TK;1Svc;4Gvr", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;2;4;4", "empirical_novelty": "2;2;4;0", "wc_summary_paper": "89;162;48;69", "wc_summary_review": "36;50;173;27", "wc_main_review": "371;618;173;149", "wc_review": "496;830;394;245", "wc_reply_reviewers": "130;590;0;0", "wc_reply_authors": "867;1083;128;171", "reply_reviewers": "1;2;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 92.0, 42.935998882057 ], "wc_summary_review_avg": [ 71.5, 59.17136131609615 ], "wc_main_review_avg": [ 327.75, 188.42422216901946 ], "wc_review_avg": [ 491.25, 214.9829935134405 ], "wc_reply_reviewers_avg": [ 180.0, 242.59018941416406 ], "wc_reply_authors_avg": [ 562.25, 420.0305792439403 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18388335473442065657&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=NdOoQnYPj_", "email": "stonybrook.edu;washington.edu;uw.edu;microsoft.com", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "State University of New York;University of Washington;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.stonybrook.edu;https://www.washington.edu;https://www.microsoft.com", "aff_unique_abbr": "SUNY Stony Brook;UW;Microsoft", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stony Brook;;Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Ndffz5uo6H", "title": "Updater-Extractor Architecture for Inductive World State Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Developing sequential models 
traditionally involves two stages - training and application. Retention of information acquired after training (at application time) is architecturally limited by the size of the model's context window (in the case of transformers), or by the practical difficulties associated with long sequences (in the case of RNNs). In this paper, we propose a novel transformer-based Updater-Extractor architecture that can work with sequences of arbitrary length and refine its long-term knowledge about the world based on inputs at application time. We explicitly train the model to incorporate incoming information into its world state representation, obtaining strong inductive generalization and the ability to handle extremely long-range dependencies. We propose a novel one-step training procedure that makes such training feasible, and prove a lemma that provides theoretical justification for this training procedure. Empirically, we investigate the model's performance on a variety of different tasks: we use two new simulated tasks to study the model's ability to handle extremely long-range dependencies, and we demonstrate competitive performance on the challenging Pathfinder problem using vanilla attention.", "keywords": "transformers;long-term-memory;sequential processing;lifelong learning", "primary_area": "", "supplementary_material": "/attachment/99209303d5a6b8e420de49aafcf7366f373f5da2.zip", "author": "Arsenii Kirillovich Moskvichev;James A Liu", "authorids": "~Arsenii_Kirillovich_Moskvichev1;~James_A_Liu1", "gender": "M;", "homepage": "http://r-seny.com;", "dblp": "249/7049;", "google_scholar": "OuglBUgAAAAJ;", "orcid": ";", "linkedin": "arseny-moskvichev-601a809a;jamesaliu/", "or_profile": "~Arsenii_Kirillovich_Moskvichev1;~James_A_Liu1", "aff": "University of California, Irvine;", "aff_domain": "uci.edu;", "position": "PhD student;", "bibtex": "@misc{\nmoskvichev2022updaterextractor,\ntitle={Updater-Extractor Architecture for Inductive World State Representations},\nauthor={Arsenii Kirillovich Moskvichev and James A Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=Ndffz5uo6H}\n}", "github": "", "project": "", "reviewers": "YoMT;GHMM;FtX8", "site": "https://openreview.net/forum?id=Ndffz5uo6H", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "4;4;3", "correctness": "4;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "63;215;110", "wc_summary_review": "24;163;31", "wc_main_review": "134;1884;328", "wc_review": "221;2262;469", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;124;0", "reply_reviewers": "0;0;0", "reply_authors": "0;1;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 129.33333333333334, 63.54176229501 ], "wc_summary_review_avg": [ 72.66666666666667, 63.939207237986786 ], "wc_main_review_avg": [ 782.0, 783.2462362926915 ], "wc_review_avg": [ 984.0, 909.3363880691604 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 41.333333333333336, 58.45416057808793 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": -1.0, "gs_citation": 5, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=17162689115394923178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "University of California, Irvine", "aff_unique_dep": "", "aff_unique_url": "https://www.uci.edu", "aff_unique_abbr": "UCI", "aff_campus_unique_index": "0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "NeRrtif_hfa", "title": "Better state exploration using action sequence equivalence", "track": "main", "status": "Reject", "tldr": "", "abstract": "Incorporating prior knowledge in reinforcement learning algorithms is mainly an open question. Even when insights about the environment dynamics are available, reinforcement learning is traditionally used in a \\emph{tabula rasa} setting and must explore and learn everything from scratch. In this paper, we consider the problem of exploiting priors about action sequence equivalence: that is, when different sequences of actions produce the same effect. We propose a new local exploration strategy calibrated to minimize collisions and maximize new state visitations. We show that this strategy can be computed at little cost, by solving a convex optimization problem. By replacing the usual $\\epsilon$-greedy strategy in a DQN, we demonstrate its potential in several environments with various dynamic structures.", "keywords": "Reinforcement learning;priors;structure;exploration", "primary_area": "", "supplementary_material": "/attachment/037fa687dd494d15672cb7b603d78694b6ed537d.zip", "author": "Nathan Grinsztajn;Toby Johnstone;Johan Ferret;Philippe Preux", "authorids": "~Nathan_Grinsztajn1;~Toby_Johnstone1;~Johan_Ferret1;~Philippe_Preux1", "gender": "M;;M;M", "homepage": "https://nathangrinsztajn.github.io/;;https://ferretj.github.io;https://philippe-preux.codeberg.page", "dblp": ";;;16/4835", "google_scholar": "yVHIYEYAAAAJ;;uyUnqjMAAAAJ;JTXxmeAAAAAJ", "orcid": "0000-0001-6817-5972;;;0000-0002-2067-2838", "linkedin": "nathan-grinsztajn-960379139/?locale=en_US;toby-johnstone-683b41143/;;", "or_profile": "~Nathan_Grinsztajn1;~Toby_Johnstone1;~Johan_Ferret1;~Philippe_Preux1", "aff": "INRIA;;Google;Universit\u00e9 de Lille", "aff_domain": "inria.fr;;google.com;univ-lille.fr", "position": "PhD student;;PhD student;Full Professor", "bibtex": "@misc{\ngrinsztajn2022better,\ntitle={Better state exploration using action sequence equivalence},\nauthor={Nathan Grinsztajn and Toby Johnstone and Johan Ferret and Philippe Preux},\nyear={2022},\nurl={https://openreview.net/forum?id=NeRrtif_hfa}\n}", "github": "", "project": "", "reviewers": "5h9r;Ln9a;FrAt", "site": "https://openreview.net/forum?id=NeRrtif_hfa", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;4;4", "correctness": "4;4;4", "technical_novelty": "2;3;4", "empirical_novelty": "2;2;2", "wc_summary_paper": "115;53;117", "wc_summary_review": "30;41;73", "wc_main_review": "284;180;374", "wc_review": "429;274;564", "wc_reply_reviewers": "68;96;0", "wc_reply_authors": "540;232;264", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 95.0, 29.709706606876257 ], "wc_summary_review_avg": [ 48.0, 18.239152027072603 ], "wc_main_review_avg": [ 279.3333333333333, 79.26888138201241 ], "wc_review_avg": [ 422.3333333333333, 
118.48581724784148 ], "wc_reply_reviewers_avg": [ 54.666666666666664, 40.30991055421593 ], "wc_reply_authors_avg": [ 345.3333333333333, 138.26865958062305 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10411329161568864789&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "INRIA;Google;Universit\u00e9 de Lille", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.inria.fr;https://www.google.com;https://www.univ-lille.fr", "aff_unique_abbr": "INRIA;Google;UdeL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0", "aff_country_unique": "France;United States" }, { "title": "Pixelated Butterfly: Simple and Efficient Sparse training for Neural Network Models", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6571", "id": "Nfl-iXa-y7R", "poster": "", "openreview": "https://openreview.net/forum?id=Nfl-iXa-y7R", "slides": "https://iclr.cc/virtual/2022/poster/6571", "video": "https://iclr.cc/virtual/2022/poster/6571", "author_site": "Beidi Chen, Tri Dao, Kaizhao Liang, Jiaming Yang, Zhao Song, Atri Rudra, Christopher Re", "tldr": "", "abstract": "Overparameterized neural networks generalize well but are expensive to train. Ideally one would like to reduce their computational cost while retaining their generalization benefits. Sparse model training is a simple and promising approach to achieve this, but there remain challenges as existing methods struggle with accuracy loss, slow training runtime, or difficulty in sparsifying all model components. The core problem is that searching for a sparsity mask over a discrete set of sparse matrices is difficult and expensive. To address this, our main insight is to optimize over a continuous superset of sparse matrices with a fixed structure known as products of butterfly matrices. As butterfly matrices are not hardware efficient, we propose simple variants of butterfly (block and flat) to take advantage of modern hardware. Our method (Pixelated Butterfly) uses a simple fixed sparsity pattern based on flat block butterfly and low-rank matrices to sparsify most network layers (e.g., attention, MLP). We empirically validate that Pixelated Butterfly is $3\\times$ faster than Butterfly and speeds up training to achieve favorable accuracy--efficiency tradeoffs. 
On the ImageNet classification and WikiText-103 language modeling tasks, our sparse models train up to 2.3$\\times$ faster than the dense MLP-Mixer, Vision Transformer, and GPT-2 small with no drop in accuracy.", "keywords": "Sparse training;butterfly;low-rank;Lottery Tickets;Block sparsity;Hashing;Transformer;ViT;MLP-Mixer", "primary_area": "", "supplementary_material": "/attachment/275ea7734810800046956d024c7f07459c71cc48.zip", "author": "Beidi Chen;Tri Dao;Kaizhao Liang;Jiaming Yang;Zhao Song;Atri Rudra;Christopher Re", "authorids": "~Beidi_Chen1;~Tri_Dao1;~Kaizhao_Liang1;~Jiaming_Yang1;~Zhao_Song6;~Atri_Rudra1;~Christopher_Re1", "gender": "F;;M;M;;M;", "homepage": "https://www.andrew.cmu.edu/user/beidic/;https://tridao.me/;https://kaizhaoliang.github.io/Portfolio/;;;http://www.cse.buffalo.edu/faculty/atri/;", "dblp": "192/1339;206/7018;239/5146;;;04/4980;", "google_scholar": ";NQRw0bQAAAAJ;qKLmNfoAAAAJ;QBsx7kAAAAAJ;;https://scholar.google.com.tw/citations?user=_e5H8IoAAAAJ;", "orcid": ";;;0009-0005-2150-3453;;;", "linkedin": ";;kaizhao-liang-427a42132/;%E4%BD%B3%E6%98%8E-%E6%9D%A8-737140212/;;;", "or_profile": "~Beidi_Chen1;~Tri_Dao1;~Kaizhao_Liang1;~Jiaming_Yang1;~Zhao_Song6;~Atri_Rudra1;~Christopher_Re1", "aff": "Stanford University;Stanford University;SambaNova Systems, Inc;Peking University;;State University of New York, Buffalo;", "aff_domain": "stanford.edu;stanford.edu;sambanovasystems.com;pku.edu.cn;;buffalo.edu;", "position": "Postdoc;PhD student;Principal Engineer;Undergrad student;;Professor;", "bibtex": "@inproceedings{\nchen2022pixelated,\ntitle={Pixelated Butterfly: Simple and Efficient Sparse training for Neural Network Models},\nauthor={Beidi Chen and Tri Dao and Kaizhao Liang and Jiaming Yang and Zhao Song and Atri Rudra and Christopher Re},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Nfl-iXa-y7R}\n}", "github": "", "project": "", "reviewers": "Ukv2;Q8VQ;GfcL;JNUH", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;0;3", "wc_summary_paper": "98;92;122;160", "wc_summary_review": "20;58;35;138", "wc_main_review": "367;236;279;264", "wc_review": "485;386;436;562", "wc_reply_reviewers": "0;17;0;0", "wc_reply_authors": "794;681;475;534", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 118.0, 26.720778431774775 ], "wc_summary_review_avg": [ 62.75, 45.50480743833557 ], "wc_main_review_avg": [ 286.5, 48.971930735881756 ], "wc_review_avg": [ 467.25, 64.94372563997233 ], "wc_reply_reviewers_avg": [ 4.25, 7.361215932167728 ], "wc_reply_authors_avg": [ 621.0, 124.91397039562868 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9226195126053708660&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=Nfl-iXa-y7R", "email": "stanford.edu;stanford.edu;sambanovasystems.com;pku.edu.cn;;buffalo.edu;", "author_num": 7, "aff_unique_index": 
"0;0;1;2;3", "aff_unique_norm": "Stanford University;SambaNova Systems;Peking University;State University of New York at Buffalo", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://www.sambanova.com;http://www.pku.edu.cn;https://www.buffalo.edu", "aff_unique_abbr": "Stanford;SambaNova;Peking U;SUNY Buffalo", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Stanford;;Buffalo", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "Ng8wWGXXIXh", "title": "On Invariance Penalties for Risk Minimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Invariant Risk Minimization (IRM) principle was first proposed by Arjovsky et al. (2019) to address the domain generalization problem by leveraging data heterogeneity from differing experimental conditions. Specifically, IRM seeks to find a data representation under which an optimal classifier remains invariant across all domains. Despite the conceptual appeal of IRM, the effectiveness of the originally proposed invariance penalty has recently been brought into question through stylized experiments and counterexamples. In this work, we investigate the relationship between the data representation, invariance penalty, and risk. In doing so, we propose a novel invariance penalty, and utilize it to design an adaptive rule for tuning the coefficient of the penalty proposed by Arjovsky et al. (2019). More- over, we provide practical insights on how to avoid the potential failure of IRM considered in the nascent counterexamples. Finally, we conduct numerical experiments on both synthetic and real-world data sets with the objective of building invariant predictors. In our non-synthetic experiments, we sought to build a predictor of human health status using a collection of data sets from various studies which investigate the relationship between human gut microbiome and a particular disease. 
We substantiate the effectiveness of our proposed approach on these data sets and thus further facilitate the adoption of the IRM principle in other real-world applications.", "keywords": "domain generalization;out-of-distribution generalization;invariant risk minimization;invariant representation learning", "primary_area": "", "supplementary_material": "/attachment/06f4ce4da53fc259a3304dd2af3eba10b6d48fb7.zip", "author": "Kia Khezeli;Arno Blaas;Frank Soboczenski;Nicholas Chia;John Kalantari", "authorids": "~Kia_Khezeli1;~Arno_Blaas1;~Frank_Soboczenski1;~Nicholas_Chia1;~John_Kalantari1", "gender": ";;M;M;M", "homepage": ";https://github.com/arblox/;https://h21k.github.io/;;", "dblp": ";;133/1342;;", "google_scholar": ";;Nl3EwroAAAAJ;srHs6aoAAAAJ;", "orcid": ";;0000-0001-8185-6094;;", "linkedin": "kia-khezeli;;frank-soboczenski-5abb292b/;;jkalantari", "or_profile": "~Kia_Khezeli1;~Arno_Blaas1;~Frank_Soboczenski1;~Nicholas_Chia1;~John_Kalantari1", "aff": "Mayo Clinic;Apple;King's College London;Mayo Clinic;Mayo Clinic", "aff_domain": "mayo.edu;apple.com;kcl.ac.uk;mayo.edu;mayo.edu", "position": "ML Research Scientist;Researcher;Postdoc;Associate Professor;Assistant Professor", "bibtex": "@misc{\nkhezeli2022on,\ntitle={On Invariance Penalties for Risk Minimization},\nauthor={Kia Khezeli and Arno Blaas and Frank Soboczenski and Nicholas Chia and John Kalantari},\nyear={2022},\nurl={https://openreview.net/forum?id=Ng8wWGXXIXh}\n}", "github": "", "project": "", "reviewers": "AZJB;256p;2SD7;PkSM", "site": "https://openreview.net/forum?id=Ng8wWGXXIXh", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;3;4", "correctness": "4;3;4;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "49;26;111;235", "wc_summary_review": "46;80;197;212", "wc_main_review": "190;883;80;501", "wc_review": "285;989;388;948", "wc_reply_reviewers": "0;1012;0;0", "wc_reply_authors": "132;1383;242;312", "reply_reviewers": "0;2;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 105.25, 81.10602628658366 ], "wc_summary_review_avg": [ 133.75, 71.95962409573858 ], "wc_main_review_avg": [ 413.5, 311.9539228796458 ], "wc_review_avg": [ 652.5, 318.42149738985904 ], "wc_reply_reviewers_avg": [ 253.0, 438.20885431492593 ], "wc_reply_authors_avg": [ 517.25, 503.9421469772101 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5443310539518174, "corr_recommendation_correctness": -0.19245008972987526, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=661018617022437464&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Mayo Clinic;Apple;King's College London", "aff_unique_dep": ";Apple Inc.;", "aff_unique_url": "https://www.mayoclinic.org;https://www.apple.com;https://www.kcl.ac.uk", "aff_unique_abbr": "Mayo Clinic;Apple;KCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "NgmcJ66xQz_", "title": "Divide and Explore: Multi-Agent Separate Exploration with Shared Intrinsic Motivations", "track": "main", 
"status": "Reject", "tldr": "", "abstract": "One of the greatest challenges of reinforcement learning is efficient exploration, especially when training signals are sparse or deceptive. The main difficulty of exploration lies in the size and complexity of the state space, which makes simple approaches such as exhaustive search infeasible. Our work is based on two important observations. On one hand, modern computing platforms are extremely scalable in terms of number of computing nodes and cores, which can complete asynchronous and well load-balanced computational tasks very fast. On the other hand, Divide-and-Conquer is a commonly used technique in computer science to solve similar problems (such as SAT) of doing efficient search in extremely large state space. In this paper, we apply the idea of divide-and-conquer in the context of intelligent exploration. The resulting exploration scheme can be combined with various specific intrinsic rewards designed for the given task. In our exploration scheme, the learning algorithm can automatically divide the state space into regions, and each agent is assigned to explore one of these regions. All the agents run asynchronously and they can be deployed onto modern distributed computing platforms. Our experiments show that the proposed method is highly efficient and is able to achieve state-of-the-art results in many RL tasks such as MiniGrid and Vizdoom. ", "keywords": "Deep Reinforcement Learning;Exploration;Intrinsic Motivation;Distributed Learning", "primary_area": "", "supplementary_material": "", "author": "Xiao Jing;Zhenwei Zhu;Hongliang Li;Xin Pei;Yoshua Bengio;Tong Che;Hongyong Song", "authorids": "~Xiao_Jing1;~Zhenwei_Zhu1;lihongliang.leon@bytedance.com;peixin@tsinghua.edu.cn;~Yoshua_Bengio1;~Tong_Che1;~Hongyong_Song1", "gender": "M;;;;M;M;", "homepage": ";;;;http://yoshuabengio.org;;", "dblp": ";;;;56/953;125/0738;", "google_scholar": ";;;;kukA0LcAAAAJ;7b5tlJkAAAAJ;s11zFYQAAAAJ", "orcid": "0000-0002-1615-6523;;;;;;", "linkedin": ";;;;yoshuabengio/?originalSubdomain=ca;;", "or_profile": "~Xiao_Jing1;~Zhenwei_Zhu1;lihongliang.leon@bytedance.com;peixin@tsinghua.edu.cn;~Yoshua_Bengio1;~Tong_Che1;~Hongyong_Song1", "aff": ";;;;University of Montreal;NVIDIA;ByteDance", "aff_domain": ";;;;umontreal.ca;nvidia.com;bytedance.com", "position": ";;;;Full Professor;Researcher;Researcher", "bibtex": "@misc{\njing2022divide,\ntitle={Divide and Explore: Multi-Agent Separate Exploration with Shared Intrinsic Motivations},\nauthor={Xiao Jing and Zhenwei Zhu and Hongliang Li and Xin Pei and Yoshua Bengio and Tong Che and Hongyong Song},\nyear={2022},\nurl={https://openreview.net/forum?id=NgmcJ66xQz_}\n}", "github": "", "project": "", "reviewers": "BfC2;VtVT;xEFb;cJEK", "site": "https://openreview.net/forum?id=NgmcJ66xQz_", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;5;4;4", "correctness": "2;4;3;3", "technical_novelty": "1;2;3;1", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "71;43;85;116", "wc_summary_review": "56;46;70;43", "wc_main_review": "458;302;307;547", "wc_review": "585;391;462;706", "wc_reply_reviewers": "292;0;135;965", "wc_reply_authors": "560;440;340;371", "reply_reviewers": "1;0;1;2", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.75, 26.290445032368698 ], 
"wc_summary_review_avg": [ 53.75, 10.54454835448157 ], "wc_main_review_avg": [ 403.5, 103.89538007053056 ], "wc_review_avg": [ 536.0, 120.2102325095497 ], "wc_reply_reviewers_avg": [ 348.0, 370.9103665307833 ], "wc_reply_authors_avg": [ 427.75, 84.49963017670551 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WZE-CLnllV8J:scholar.google.com/&scioq=Divide+and+Explore:+Multi-Agent+Separate+Exploration+with+Shared+Intrinsic+Motivations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Montreal;NVIDIA;ByteDance", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://wwwumontreal.ca;https://www.nvidia.com;https://www.bytedance.com", "aff_unique_abbr": "UM;NVIDIA;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Canada;United States;China" }, { "title": "Normalization of Language Embeddings for Cross-Lingual Alignment", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6548", "id": "Nh7CtbyoqV5", "poster": "", "openreview": "https://openreview.net/forum?id=Nh7CtbyoqV5", "slides": "https://iclr.cc/virtual/2022/poster/6548", "video": "https://iclr.cc/virtual/2022/poster/6548", "author_site": "Prince Aboagye, Yan Zheng, Chin-Chia Michael Yeh, Junpeng Wang, Wei Zhang, Liang Wang, Hao Yang, Jeff Phillips", "tldr": "", "abstract": "Learning a good transfer function to map the word vectors from two languages into a shared cross-lingual word vector space plays a crucial role in cross-lingual NLP. It is useful in translation tasks and important in allowing complex models built on a high-resource language like English to be directly applied on an aligned low resource language. While Procrustes and other techniques can align language models with some success, it has recently been identified that structural differences (for instance, due to differing word frequency) create different profiles for various monolingual embedding. When these profiles differ across languages, it correlates with how well languages can align and their performance on cross-lingual downstream tasks. In this work, we develop a very general language embedding normalization procedure, building and subsuming various previous approaches, which removes these structural profiles across languages without destroying their intrinsic meaning. We demonstrate that meaning is retained and alignment is improved on similarity, translation, and cross-language classification tasks. Our proposed normalization clearly outperforms all prior approaches like centering and vector normalization on each task and with each alignment approach. 
", "keywords": "cross-lingual word embeddings;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Prince Osei Aboagye;Yan Zheng;Chin-Chia Michael Yeh;Junpeng Wang;Wei Zhang;Liang Wang;Hao Yang;Jeff Phillips", "authorids": "~Prince_Osei_Aboagye1;~Yan_Zheng2;~Chin-Chia_Michael_Yeh1;~Junpeng_Wang1;~Wei_Zhang52;~Liang_Wang11;~Hao_Yang8;~Jeff_Phillips1", "gender": "M;F;Unspecified;M;M;M;;M", "homepage": "https://poaboagye.github.io/;https://usa.visa.com/about-visa/visa-research/yan-zheng.html;https://mcyeh.github.io/;https://junpengw.github.io/;;;;http://www.cs.utah.edu/~jeffp/", "dblp": "326/7261;10/2381-1;117/5435;172/6642-1;10/4661-189.html;;;17/3933", "google_scholar": "T2ZJ6xYAAAAJ;fCDg0VQAAAAJ;F4d7Sv4AAAAJ;6_6MH5wAAAAJ;;SNToU-gAAAAJ;https://scholar.google.com/citations?hl=en;aFDuhV8AAAAJ", "orcid": ";;0000-0002-9807-2963;0000-0002-1130-9914;;;;", "linkedin": "prince-osei-aboagye-669514b6;;;;zhangwei0119/;liang-wang-7472712/;;", "or_profile": "~Prince_Osei_Aboagye1;~Yan_Zheng2;~Chin-Chia_Michael_Yeh1;~Junpeng_Wang1;~Wei_Zhang52;~Liang_Wang11;~Hao_Yang8;~Jeff_Phillips1", "aff": "University of Utah;VISA;VISA;VISA;VISA;VISA;Visa Research;University of Utah", "aff_domain": "utah.edu;visa.com;visa.com;visa.com;visa.com;visa.com;visa.com;utah.edu", "position": "PhD student;Principal Researcher;Research Scientist;Researcher;Principal Researcher;Principal Scientist;Vice President;Associate Professor", "bibtex": "@inproceedings{\naboagye2022normalization,\ntitle={Normalization of Language Embeddings for Cross-Lingual Alignment},\nauthor={Prince Osei Aboagye and Jeff Phillips and Yan Zheng and Junpeng Wang and Chin-Chia Michael Yeh and Wei Zhang and Liang Wang and Hao Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Nh7CtbyoqV5}\n}", "github": "", "project": "", "reviewers": "GCj8;zhG1;vFnV;Qc8y;ve8X", "pdf_size": 0, "recommendation": "3;5;6;8;8", "confidence": "4;4;4;4;4", "correctness": "3;2;4;4;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;3;2;3;3", "wc_summary_paper": "64;75;105;67;36", "wc_summary_review": "44;131;34;40;44", "wc_main_review": "197;1215;157;138;508", "wc_review": "305;1421;296;245;588", "wc_reply_reviewers": "0;197;0;11;71", "wc_reply_authors": "1527;2900;889;1015;2255", "reply_reviewers": "0;1;0;1;1", "reply_authors": "3;6;2;2;4", "recommendation_avg": [ 6.0, 1.8973665961010275 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.4, 0.8 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 69.4, 22.132329294495868 ], "wc_summary_review_avg": [ 58.6, 36.384612132053846 ], "wc_main_review_avg": [ 443.0, 408.78992159787896 ], "wc_review_avg": [ 571.0, 441.68903088032425 ], "wc_reply_reviewers_avg": [ 55.8, 75.36948984834646 ], "wc_reply_authors_avg": [ 1717.2, 762.0263512504013 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 3.4, 1.4966629547095764 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6588078458684125, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10286218373304313543&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Nh7CtbyoqV5", "email": "utah.edu;visa.com;visa.com;visa.com;visa.com;visa.com;visa.com;utah.edu", "author_num": 8, "aff_unique_index": 
"0;1;1;1;1;1;2;0", "aff_unique_norm": "University of Utah;VISA;Visa Inc.", "aff_unique_dep": ";;Research", "aff_unique_url": "https://www.utah.edu;https://www.visa.com;https://www.visa.com/", "aff_unique_abbr": "Utah;VISA;Visa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sound Adversarial Audio-Visual Navigation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6057", "id": "NkZq4OEYN-", "poster": "", "openreview": "https://openreview.net/forum?id=NkZq4OEYN-", "slides": "https://iclr.cc/virtual/2022/poster/6057", "video": "https://iclr.cc/virtual/2022/poster/6057", "author_site": "Yinfeng Yu, Wenbing Huang, Fuchun Sun, Changan Chen, Yikai Wang, Xiaohong Liu", "tldr": "", "abstract": "Audio-visual navigation task requires an agent to find a sound source in a realistic, unmapped 3D environment by utilizing egocentric audio-visual observations. Existing audio-visual navigation works assume a clean environment that solely contains the target sound, which, however, would not be suitable in most real-world applications due to the unexpected sound noise or intentional interference. In this work, we design an acoustically complex environment in which, besides the target sound, there exists a sound attacker playing a zero-sum game with the agent. More specifically, the attacker can move and change the volume and category of the sound to make the agent suffer from finding the sounding object while the agent tries to dodge the attack and navigate to the goal under the intervention. Under certain constraints to the attacker, we can improve the robustness of the agent towards unexpected sound attacks in audio-visual navigation. For better convergence, we develop a joint training mechanism by employing the property of a centralized critic with decentralized actors. Experiments on two real-world 3D scan datasets, Replica, and Matterport3D, verify the effectiveness and the robustness of the agent trained under our designed environment when transferred to the clean environment or the one containing sound attackers with random policy. 
Project: https://yyf17.github.io/SAAVN .", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/7645c11196a90a3cf589f48b53a0b82fb5a56a1c.zip", "author": "Yinfeng Yu;Wenbing Huang;Fuchun Sun;Changan Chen;Yikai Wang;Xiaohong Liu", "authorids": "~Yinfeng_Yu1;~Wenbing_Huang1;~Fuchun_Sun2;~Changan_Chen2;~Yikai_Wang2;~Xiaohong_Liu3", "gender": ";M;;;M;", "homepage": "https://yyf17.github.io/;https://gsai.ruc.edu.cn/english/wenbing_huang;;;https://yikaiw.github.io/;", "dblp": "237/3612;155/3181-1.html;;;85/9555-1;", "google_scholar": "https://scholar.google.com/citations?hl=en;0yNkmO4AAAAJ;;;MnW5aegAAAAJ;", "orcid": "0000-0003-3089-4140;;;;;", "linkedin": ";;;;;", "or_profile": "~Yinfeng_Yu1;~Wenbing_Huang1;~Fuchun_Sun2;~Changan_Chen2;~Yikai_Wang2;~Xiaohong_Liu3", "aff": "Xinjiang University;Tsinghua University;;;Tsinghua University;", "aff_domain": "xju.edu.cn;tsinghua.edu.cn;;;tsinghua.edu.cn;", "position": "Lecturer;Researcher;;;PhD student;", "bibtex": "@inproceedings{\nyu2022sound,\ntitle={Sound Adversarial Audio-Visual Navigation},\nauthor={Yinfeng Yu and Wenbing Huang and Fuchun Sun and Changan Chen and Yikai Wang and Xiaohong Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NkZq4OEYN-}\n}", "github": "", "project": "", "reviewers": "i5Vv;TLMn;yUmi", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;4", "correctness": "4;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;4;3", "wc_summary_paper": "69;58;101", "wc_summary_review": "24;44;140", "wc_main_review": "109;145;213", "wc_review": "202;247;454", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;25;46", "reply_reviewers": "0;0;0", "reply_authors": "0;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 76.0, 18.239152027072603 ], "wc_summary_review_avg": [ 69.33333333333333, 50.63156678946007 ], "wc_main_review_avg": [ 155.66666666666666, 43.12256434345661 ], "wc_review_avg": [ 301.0, 109.73604694903129 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 23.666666666666668, 18.80307303489394 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14696002671492155830&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=NkZq4OEYN-", "email": "xju.edu.cn;tsinghua.edu.cn;;;tsinghua.edu.cn;", "author_num": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Xinjiang University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "http://www.xju.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "XJU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Practical Integration via Separable Bijective Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5958", "id": "NlObxR0rosG", "poster": "", "openreview": "https://openreview.net/forum?id=NlObxR0rosG", "slides": 
"https://iclr.cc/virtual/2022/poster/5958", "video": "https://iclr.cc/virtual/2022/poster/5958", "author_site": "Christopher Bender, Patrick Emmanuel, Michael Reiter, Junier Oliva", "tldr": "", "abstract": "Neural networks have enabled learning over examples that contain thousands of dimensions.\nHowever, most of these models are limited to training and evaluating on a finite collection of \\textit{points} and do not consider the hypervolume in which the data resides.\nAny analysis of the model's local or global behavior is therefore limited to very expensive or imprecise estimators.\nWe propose to formulate neural networks as a composition of a bijective (flow) network followed by a learnable, separable network.\nThis construction allows for learning (or assessing) over full hypervolumes with precise estimators at tractable computational cost via integration over the \\textit{input space}.\nWe develop the necessary machinery, propose several practical integrals to use during training, and demonstrate their utility.", "keywords": "integration;flow;likelihood;classification;regression;out of distribution;regularization", "primary_area": "", "supplementary_material": "", "author": "Christopher M Bender;Patrick Emmanuel;Michael K. Reiter;Junier Oliva", "authorids": "~Christopher_M_Bender1;~Patrick_Emmanuel1;~Michael_K._Reiter1;~Junier_Oliva1", "gender": "M;M;M;M", "homepage": ";;http://lupalab.com;https://reitermk.github.io/", "dblp": ";;137/8390;r/MichaelKReiter", "google_scholar": ";;;GaRxukMAAAAJ", "orcid": ";;;0000-0001-7007-8274", "linkedin": ";patrick-emmanuel-849b7350/;;mikereiter/", "or_profile": "~Christopher_M_Bender1;~Patrick_Emmanuel1;~Junier_Oliva1;~Michael_Reiter1", "aff": "Department of Computer Science, University of North Carolina, Chapel Hill;Johns Hopkins University - APL;;Duke University", "aff_domain": "cs.unc.edu;jhuapl.edu;;duke.edu", "position": "PhD student;Researcher;;Full Professor", "bibtex": "@inproceedings{\nbender2022practical,\ntitle={Practical Integration via Separable Bijective Networks},\nauthor={Christopher M Bender and Patrick Emmanuel and Michael K. 
Reiter and Junier Oliva},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NlObxR0rosG}\n}", "github": "", "project": "", "reviewers": "mYvf;PTfm;6qXG;MsPk", "pdf_size": 0, "recommendation": "1;6;6;8", "confidence": "3;4;4;2", "correctness": "3;3;4;3", "technical_novelty": "1;3;3;3", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "77;31;69;45", "wc_summary_review": "91;91;96;16", "wc_main_review": "217;221;283;78", "wc_review": "385;343;448;139", "wc_reply_reviewers": "562;82;25;0", "wc_reply_authors": "1169;77;132;47", "reply_reviewers": "1;1;1;0", "reply_authors": "3;1;2;1", "recommendation_avg": [ 5.25, 2.5860201081971503 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 55.5, 18.405162319305962 ], "wc_summary_review_avg": [ 73.5, 33.26033673912518 ], "wc_main_review_avg": [ 199.75, 75.00458319329559 ], "wc_review_avg": [ 328.75, 115.75053995554406 ], "wc_reply_reviewers_avg": [ 167.25, 229.83839431217754 ], "wc_reply_authors_avg": [ 356.25, 470.23046211405745 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.14574100933227221, "corr_recommendation_correctness": 0.16744367165578428, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16471930435813257809&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=NlObxR0rosG", "email": "cs.unc.edu;jhuapl.edu;;duke.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of North Carolina;Johns Hopkins University;Duke University", "aff_unique_dep": "Department of Computer Science;APL;", "aff_unique_url": "https://www.unc.edu;https://www.jhuapl.edu;https://www.duke.edu", "aff_unique_abbr": "UNC;JHU-APL;Duke", "aff_campus_unique_index": "0", "aff_campus_unique": "Chapel Hill;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Nn4BjABPRPN", "title": "Encoding Event-Based Gesture Data With a Hybrid SNN Guided Variational Auto-encoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "Commercial mid-air gesture recognition systems have existed for at least a decade, but they have not become a widespread method of interacting with machines. These systems require rigid, dramatic gestures to be performed for accurate recognition that can be fatiguing and unnatural. To address this limitation, we propose a neuromorphic gesture analysis system which encodes event-based gesture data at high temporal resolution. Our novel approach consists of an event-based guided Variational Autoencoder (VAE) which encodes event-based data sensed by a Dynamic Vision Sensor (DVS) into a latent space representation suitable to compute the similarity of mid-air gesture data. We show that the Hybrid Guided-VAE achieves 87% classification accuracy on the DVSGesture dataset and it can encode the sparse, noisy inputs into an interpretable latent space representation, visualized through T-SNE plots. 
We also implement the encoder component of the model on neuromorphic hardware and discuss the potential for our algorithm to enable real-time, self-supervised learning of natural mid-air gestures.", "keywords": "Neuromorphic Computing;Variational Auto-encoders;Representation Learning;Spiking Neural Networks;Self-supervised Learning", "primary_area": "", "supplementary_material": "/attachment/b3c8bf64591ee8e9c32a9280cf69163e0cb2f4d3.zip", "author": "Kenneth Michael Stewart;Andreea Danielescu;Timothy Shea;Emre Neftci", "authorids": "~Kenneth_Michael_Stewart1;~Andreea_Danielescu1;timothy.m.shea@accenture.com;~Emre_Neftci1", "gender": "M;F;;M", "homepage": ";;;https://nmi-lab.org/", "dblp": ";86/7510;;62/5283", "google_scholar": "8zJjZBkAAAAJ;RH4n-HoAAAAJ;;yYT6jtkAAAAJ", "orcid": "0000-0002-7719-5796;;;", "linkedin": "kenneth-stewart/;adanielescu/;;", "or_profile": "~Kenneth_Michael_Stewart1;~Andreea_Danielescu1;timothy.m.shea@accenture.com;~Emre_Neftci1", "aff": "University of California, Irvine;Accenture Labs;;Forschungszentrum Juelich and RWTH Aachen", "aff_domain": "uci.edu;accenture.com;;fz-juelich.de", "position": "PhD student;Principal Researcher;;Full Professor", "bibtex": "@misc{\nstewart2022encoding,\ntitle={Encoding Event-Based Gesture Data With a Hybrid {SNN} Guided Variational Auto-encoder},\nauthor={Kenneth Michael Stewart and Andreea Danielescu and Timothy Shea and Emre Neftci},\nyear={2022},\nurl={https://openreview.net/forum?id=Nn4BjABPRPN}\n}", "github": "", "project": "", "reviewers": "K3XW;t5op;LZrg;6Az2", "site": "https://openreview.net/forum?id=Nn4BjABPRPN", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "4;3;3;4", "correctness": "1;2;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "134;79;75;112", "wc_summary_review": "58;111;98;80", "wc_main_review": "199;280;364;289", "wc_review": "391;470;537;481", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 100.0, 24.320773014030618 ], "wc_summary_review_avg": [ 86.75, 19.917015338649513 ], "wc_main_review_avg": [ 283.0, 58.44227921633447 ], "wc_review_avg": [ 469.75, 52.083466666496 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.14002800840280097, "corr_recommendation_correctness": 0.8021806287494232, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nknzJ3meKVsJ:scholar.google.com/&scioq=Encoding+Event-Based+Gesture+Data+With+a+Hybrid+SNN+Guided+Variational+Auto-encoder&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Irvine;Accenture;Forschungszentrum J\u00fclich", "aff_unique_dep": ";Accenture Labs;", "aff_unique_url": "https://www.uci.edu;https://www.accenture.com/us-en/labs;https://www.fz-juelich.de", "aff_unique_abbr": "UCI;Accenture Labs;FZJ", "aff_campus_unique_index": "0", "aff_campus_unique": "Irvine;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Germany" }, { "title": "PI3NN: Out-of-distribution-aware Prediction Intervals from
Three Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6974", "id": "NoB8YgRuoFU", "poster": "", "openreview": "https://openreview.net/forum?id=NoB8YgRuoFU", "slides": "https://iclr.cc/virtual/2022/poster/6974", "video": "https://iclr.cc/virtual/2022/poster/6974", "author_site": "Siyan Liu, Pei Zhang, Dan Lu, Guannan Zhang", "tldr": "", "abstract": "We propose a novel prediction interval (PI) method for uncertainty quantification, which addresses three major issues with the state-of-the-art PI methods. First, existing PI methods require retraining of neural networks (NNs) for every given confidence level and suffer from the crossing issue in calculating multiple PIs. Second, they usually rely on customized loss functions with extra sensitive hyperparameters for which fine tuning is required to achieve a well-calibrated PI. Third, they usually underestimate uncertainties of out-of-distribution (OOD) samples leading to over-confident PIs. Our PI3NN method calculates PIs from linear combinations of three NNs, each of which is independently trained using the standard mean squared error loss. The coefficients of the linear combinations are computed using root-finding algorithms to ensure tight PIs for a given confidence level. We theoretically prove that PI3NN can calculate PIs for a series of confidence levels without retraining NNs and it completely avoids the crossing issue. Additionally, PI3NN does not introduce any unusual hyperparameters resulting in a stable performance. Furthermore, we address OOD identification challenge by introducing an initialization scheme which provides reasonably larger PIs of the OOD samples than those of the in-distribution samples. Benchmark and real-world experiments show that our method outperforms several state-of-the-art approaches with respect to predictive uncertainty quality, robustness, and OOD samples identification.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9f1a45eff2b80564d6dd33651f9efd61ac745e40.zip", "author": "Siyan Liu;Pei Zhang;Dan Lu;Guannan Zhang", "authorids": "~Siyan_Liu1;~Pei_Zhang6;~Dan_Lu1;~Guannan_Zhang1", "gender": "M;;F;M", "homepage": ";;https://www.ornl.gov/staff-profile/dan-lu;https://sites.google.com/view/guannan-zhang/home", "dblp": ";;;", "google_scholar": "V8GOEvgAAAAJ;;;FAGohRkAAAAJ", "orcid": "0000-0003-2017-3251;0000-0002-8351-0529;;", "linkedin": ";;;", "or_profile": "~Siyan_Liu1;~Pei_Zhang6;~Dan_Lu1;~Guannan_Zhang1", "aff": "Oak Ridge National Laboratory;Oak Ridge National Laboratory;Oak Ridge National Laboratory;Oak Ridge National Laboratory", "aff_domain": "ornl.gov;ornl.gov;ornl.gov;ornl.gov", "position": "Postdoc;Researcher;Researcher;Senior Research Staff", "bibtex": "@inproceedings{\nliu2022pinn,\ntitle={{PI}3{NN}: Out-of-distribution-aware Prediction Intervals from Three Neural Networks},\nauthor={Siyan Liu and Pei Zhang and Dan Lu and Guannan Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NoB8YgRuoFU}\n}", "github": "", "project": "", "reviewers": "QDYK;TzHb;AYgp;kjPV", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;4;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "59;73;78;62", "wc_summary_review": "76;25;39;18", "wc_main_review": "128;273;233;240", "wc_review": "263;371;350;320", "wc_reply_reviewers": "0;154;0;98", "wc_reply_authors": "320;1132;313;534", 
"reply_reviewers": "0;1;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.0, 7.7781745930520225 ], "wc_summary_review_avg": [ 39.5, 22.38861317723811 ], "wc_main_review_avg": [ 218.5, 54.389796837274545 ], "wc_review_avg": [ 326.0, 40.6386515524322 ], "wc_reply_reviewers_avg": [ 63.0, 66.03786792439622 ], "wc_reply_authors_avg": [ 574.75, 333.7659172234337 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9729426911336956537&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=NoB8YgRuoFU", "email": "ornl.gov;ornl.gov;ornl.gov;ornl.gov", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Oak Ridge National Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.ornl.gov", "aff_unique_abbr": "ORNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "NoE4RfaOOa", "title": "Where can quantum kernel methods make a big difference?", "track": "main", "status": "Reject", "tldr": "", "abstract": "The classification problem is a core problem of supervised learning, which is widely present in our life. As a class of algorithms for pattern analysis, Kernel methods have been widely and effectively applied to classification problems. However, when very complex patterns are encountered, the existing kernel methods are powerless. Recent studies have shown that quantum kernel methods can effectively handle some classification problems of complex patterns that classical kernel methods cannot handle. However, this does not mean that quantum kernel methods are better than classical kernel methods in all cases. It is still unclear under what circumstances quantum kernel methods can realize their great potential. In this paper, by exploring and summarizing the essential differences between quantum kernel functions and classical kernel functions, we propose a criterion based on inter-class and intra-class distance and geometric properties to determine under what circumstances quantum kernel methods will be superior. We validate our method with toy examples and multiple real datasets from Qiskit and Kaggle. 
The experiments show that our method can be used as a valid determination method.", "keywords": "Kernel;Quantum;Classification", "primary_area": "", "supplementary_material": "/attachment/768651dca0ec644f0777e156a35f95efe197b8c8.zip", "author": "Muhao Guo;Yang Weng", "authorids": "~Muhao_Guo1;~Yang_Weng1", "gender": "M;", "homepage": ";", "dblp": "345/6430;", "google_scholar": "wIOmifAAAAAJ;", "orcid": "0000-0002-9890-8214;", "linkedin": "muhaoguo/;", "or_profile": "~Muhao_Guo1;~Yang_Weng1", "aff": "Arizona State University;", "aff_domain": "asu.edu;", "position": "PhD student;", "bibtex": "@misc{\nguo2022where,\ntitle={Where can quantum kernel methods make a big difference?},\nauthor={Muhao Guo and Yang Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=NoE4RfaOOa}\n}", "github": "", "project": "", "reviewers": "cR4C;kXVf;DX2D;3XVL", "site": "https://openreview.net/forum?id=NoE4RfaOOa", "pdf_size": 0, "recommendation": "1;1;3;5", "confidence": "4;5;2;4", "correctness": "1;1;2;2", "technical_novelty": "1;1;2;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "42;16;38;54", "wc_summary_review": "103;30;82;87", "wc_main_review": "208;779;42;212", "wc_review": "353;825;162;353", "wc_reply_reviewers": "0;43;0;0", "wc_reply_authors": "811;884;638;577", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 2.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 1.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 37.5, 13.738631664034086 ], "wc_summary_review_avg": [ 75.5, 27.390691849604675 ], "wc_main_review_avg": [ 310.25, 279.1920262113515 ], "wc_review_avg": [ 423.25, 244.70632909673586 ], "wc_reply_reviewers_avg": [ 10.75, 18.619546181365433 ], "wc_reply_authors_avg": [ 727.5, 124.62443580614517 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3458572319330373, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3208306765676111337&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "NoxVNArZTeW", "title": "Adversarial Fairness Network", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Fairness is becoming a rising concern in machine learning. Recent research has discovered that state-of-the-art models are amplifying social bias by making biased predictions towards some population groups (characterized by sensitive features like race or gender). Such unfair prediction among groups renders trust issues and ethical concerns in machine learning, especially for sensitive fields such as employment, criminal justice, and trust score assessment. In this paper, we introduce a new framework to improve machine learning fairness. The goal of our model is to minimize the influence of sensitive feature from the perspectives of both data input and predictive model. To achieve this goal, we reformulate the data input by eliminating the sensitive information and strengthen model fairness by minimizing the marginal contribution of the sensitive feature. 
We propose to learn the sensitive-irrelevant input via sampling among features and design an adversarial network to minimize the dependence between the reformulated input and the sensitive information. Empirical results validate that our model achieves comparable or better results than related state-of-the-art methods w.r.t. both fairness metrics and prediction performance.", "keywords": "Fairness;Machine Learning", "primary_area": "", "supplementary_material": "/attachment/6c88b31fcf3b0aee56d29ce20f6a6d4535dbeada.zip", "author": "Taeuk Jang;Xiaoqian Wang;Heng Huang", "authorids": "~Taeuk_Jang1;~Xiaoqian_Wang1;~Heng_Huang1", "gender": "M;F;M", "homepage": ";https://engineering.purdue.edu/~joywang/;https://www.cs.umd.edu/~heng/", "dblp": "61/6076;151/3215-1;03/281", "google_scholar": "https://scholar.google.co.kr/citations?user=AWJhF1UAAAAJ;I3tc214AAAAJ;4OqLaDwAAAAJ", "orcid": ";;", "linkedin": "taeuk-jang-a52674178/;;", "or_profile": "~Taeuk_Jang1;~Xiaoqian_Wang1;~Heng_Huang1", "aff": "Purdue University;Purdue University;University of Pittsburgh", "aff_domain": "purdue.edu;purdue.edu;pitt.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\njang2022adversarial,\ntitle={Adversarial Fairness Network},\nauthor={Taeuk Jang and Xiaoqian Wang and Heng Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=NoxVNArZTeW}\n}", "github": "", "project": "", "reviewers": "Agpp;X3xf;hrZD;gwEA", "site": "https://openreview.net/forum?id=NoxVNArZTeW", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;5", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "60;111;91;101", "wc_summary_review": "40;45;52;27", "wc_main_review": "219;569;120;253", "wc_review": "319;725;263;381", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.75, 19.109879643786353 ], "wc_summary_review_avg": [ 41.0, 9.137833441248533 ], "wc_main_review_avg": [ 290.25, 168.18943932363888 ], "wc_review_avg": [ 422.0, 179.84715733088473 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17783260791639668338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Purdue University;University of Pittsburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www.pitt.edu", "aff_unique_abbr": "Purdue;Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "NqDLrS73nG", "title": "Transliteration: A Simple Technique For Improving Multilingual Language Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "While impressive performance in natural language processing tasks has been achieved for many languages by transfer learning from large pretrained multilingual language models, it is limited by the 
unavailability of large corpora for most languages and the barrier of different scripts. Script difference forces the tokens of two languages to be separated at the input. Thus we hypothesize that transliterating all the languages to the same script can improve the performance of language models. Languages of South Asia and Southeast Asia present a unique opportunity of testing this hypothesis as almost all of the major languages in this region have their own script. Nevertheless, it is possible to transliterate them to a single representation easily. We validate our hypothesis empirically by pretraining ALBERT models on the Indo-Aryan languages available on the OSCAR corpus and measuring the model's performance on the Indo-Aryan subset of the IndicGLUE benchmark. Compared to the non-transliteration-based model, the transliteration-based model (termed XLM-Indic) shows significant improvement on almost all tasks of IndicGLUE. For example, XLM-Indic performed better on News Classification (0.41%), Multiple Choice QA (4.62%), NER (6.66%), and Cloze-Style QA (3.32%). In addition, XLM-Indic establishes new SOTA results for most tasks on the IndicGLUE benchmark while being competitive at the rest. Across the tasks of IndicGLUE, the most underrepresented languages seem to gain the most improvement. For instance, for the NER, XLM-Indic achieves 10%, 35%, and 58.5% better F1-scores on Gujarati, Panjabi, and Oriya languages compared to the current SOTA. ", "keywords": "Multilingual Language Model;Natural Language Processing;Transliteration;Underrepresented Language Modeling", "primary_area": "", "supplementary_material": "/attachment/c5a8e29d2e937314067ec0f7f306e25687022cec.zip", "author": "Ibraheem Muhammad Moosa;Mahmud Elahi Akhter;Ashfia Binte Habib", "authorids": "~Ibraheem_Muhammad_Moosa1;~Mahmud_Elahi_Akhter1;~Ashfia_Binte_Habib1", "gender": "M;;", "homepage": "https://ibraheem-moosa.github.io/;;http://ece.northsouth.edu/people/ms-ashfia-binte-habib/", "dblp": "242/0165;;", "google_scholar": "G7mYYS4AAAAJ;;3Yf3BPUAAAAJ", "orcid": ";;0000-0003-3397-949X", "linkedin": ";;", "or_profile": "~Ibraheem_Muhammad_Moosa1;~Mahmud_Elahi_Akhter1;~Ashfia_Binte_Habib1", "aff": "Pennsylvania State University;;North South University", "aff_domain": "psu.edu;;northsouth.edu", "position": "PhD student;;Lecturer", "bibtex": "@misc{\nmoosa2022transliteration,\ntitle={Transliteration: A Simple Technique For Improving Multilingual Language Modeling },\nauthor={Ibraheem Muhammad Moosa and Mahmud Elahi Akhter and Ashfia Binte Habib},\nyear={2022},\nurl={https://openreview.net/forum?id=NqDLrS73nG}\n}", "github": "", "project": "", "reviewers": "vmSB;M3CH;rME7;XxGN;yNTm", "site": "https://openreview.net/forum?id=NqDLrS73nG", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "5;3;4;4;4", "correctness": "3;2;3;3;3", "technical_novelty": "2;1;1;2;2", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "85;117;68;142;92", "wc_summary_review": "25;48;243;245;75", "wc_main_review": "347;236;230;632;271", "wc_review": "457;401;541;1019;438", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "983;890;1217;1631;787", "reply_reviewers": "0;0;0;0;0", "reply_authors": "2;2;3;4;2", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 1.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 100.8, 25.933761778808723 ], "wc_summary_review_avg": [ 127.2,
96.67347102488873 ], "wc_main_review_avg": [ 343.2, 150.29224863578293 ], "wc_review_avg": [ 571.2, 228.5558137523524 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1101.6, 300.4647067460669 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6, 0.8 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5833333333333335, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bWz6T4obXn0J:scholar.google.com/&scioq=Transliteration:+A+Simple+Technique+For+Improving+Multilingual+Language+Modeling&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;North South University", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.northsouth.edu/", "aff_unique_abbr": "PSU;NSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Bangladesh" }, { "id": "NrB52z3eOTY", "title": "Effective Uncertainty Estimation with Evidential Models for Open-World Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reliable uncertainty estimation is crucial when deploying a classifier in the wild. In this paper, we tackle the challenge of jointly quantifying in-distribution and out-of-distribution (OOD) uncertainties. To this end, we leverage the second-order uncertainty representation provided by evidential models and we introduce KLoS, a Kullback\u2013Leibler divergence criterion defined on the class-probability simplex. By keeping the full distributional information, KLoS captures class confusion and lack of evidence in a single score. A crucial property of KLoS is to be a class-wise divergence measure built from in-distribution samples and to not require OOD training data, in contrast to current second-order uncertainty measures. We further design an auxiliary neural network, KLoSNet, to learn a refined criterion directly aligned with the evidential training objective. In the realistic context where no OOD data is available during training, our experiments show that KLoSNet outperforms first-order and second-order uncertainty measures to simultaneously detect misclassifications and OOD samples. 
When training with OOD samples, we also observe that existing measures are brittle to the choice of the OOD dataset, whereas KLoS remains more robust.", "keywords": "deep learning;uncertainty estimation;evidential models;misclassification detection;out-of-distribution detection;confidence learning", "primary_area": "", "supplementary_material": "", "author": "Charles Corbi\u00e8re;Marc Lafon;Nicolas THOME;Matthieu Cord;Patrick Perez", "authorids": "~Charles_Corbi\u00e8re1;~Marc_Lafon1;~Nicolas_THOME2;~Matthieu_Cord1;~Patrick_Perez1", "gender": "M;;;M;", "homepage": "https://chcorbi.github.io;https://cedric.cnam.fr/lab/author/lafonm/;;https://cord.isir.upmc.fr/;", "dblp": "https://dblp.uni-trier.de/pers/c/Corbi=egrave=re:Charles.html;;;68/3117;", "google_scholar": "https://scholar.google.fr/citations?user=UcnFUZ8AAAAJ;;;SpAotDcAAAAJ;", "orcid": "0000-0001-8024-7553;;;;", "linkedin": "https://linkedin.com/in/charles-corbi\u00e8re-6167015b;;;;", "or_profile": "~Charles_Corbi\u00e8re1;~Marc_Lafon1;~Nicolas_THOME2;~Matthieu_Cord1;~Patrick_Perez1", "aff": "Conservatoire National des Arts et Metiers;Conservatoire National des Arts et M\u00e9tiers;;Sorbonne Universit\u00e9;", "aff_domain": "cnam.fr;cnam.fr;;isir.upmc.fr;", "position": "PhD student;PhD student;;Full Professor;", "bibtex": "@misc{\ncorbi{\\`e}re2022effective,\ntitle={Effective Uncertainty Estimation with Evidential Models for Open-World Recognition},\nauthor={Charles Corbi{\\`e}re and Marc Lafon and Nicolas THOME and Matthieu Cord and Patrick Perez},\nyear={2022},\nurl={https://openreview.net/forum?id=NrB52z3eOTY}\n}", "github": "", "project": "", "reviewers": "rode;oef3;ucNF;FYgq", "site": "https://openreview.net/forum?id=NrB52z3eOTY", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;5;3", "correctness": "1;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "115;129;79;88", "wc_summary_review": "49;56;32;63", "wc_main_review": "603;259;419;317", "wc_review": "767;444;530;468", "wc_reply_reviewers": "154;0;118;34", "wc_reply_authors": "1203;629;763;415", "reply_reviewers": "1;0;1;1", "reply_authors": "3;1;2;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 102.75, 20.129269733400662 ], "wc_summary_review_avg": [ 50.0, 11.510864433221338 ], "wc_main_review_avg": [ 399.5, 130.70864546769658 ], "wc_review_avg": [ 552.25, 127.89522078639217 ], "wc_reply_reviewers_avg": [ 76.5, 62.02217345433809 ], "wc_reply_authors_avg": [ 752.5, 288.19221016536864 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.20751433915982243, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Lz5pKKQAQPQJ:scholar.google.com/&scioq=Effective+Uncertainty+Estimation+with+Evidential+Models+for+Open-World+Recognition&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Conservatoire National des Arts et Metiers;Conservatoire National des Arts et M\u00e9tiers;Sorbonne Universit\u00e9", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cnam.fr;https://www.cnam.fr;https://www.sorbonne-universite.fr", "aff_unique_abbr": 
"CNAM;CNAM;Sorbonne U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "id": "NrkAAcMpRoT", "title": "C-MinHash: Improving Minwise Hashing with Circulant Permutation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Minwise hashing (MinHash) is an important and practical algorithm for generating random hashes to approximate the Jaccard (resemblance) similarity in massive binary (0/1) data. The basic theory of MinHash requires applying hundreds or even thousands of independent random permutations to each data vector in the dataset, in order to obtain reliable results for (e.g.,) building large-scale learning models or approximate near neighbor search in massive data. In this paper, we propose {\\bf Circulant MinHash (C-MinHash)} and provide the surprising theoretical results that using only \\textbf{two} independent random permutations in a circulant manner leads to uniformly smaller Jaccard estimation variance than that of the classical MinHash with $K$ independent permutations. Experiments are conducted to show the effectiveness of the proposed method. We also analyze a more convenient C-MinHash variant which reduces two permutations to just one, with extensive numerical results to validate that it achieves essentially the same estimation accuracy as using two permutations with rigorous theory.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoyun Li;Ping Li", "authorids": "~Xiaoyun_Li1;~Ping_Li3", "gender": "M;M", "homepage": "https://lixiaoyun0239.github.io/cv/;http://www.stat.rutgers.edu/home/pingli/", "dblp": ";62/5860-1", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Xiaoyun_Li1;~Ping_Li3", "aff": "Baidu;LinkedIn", "aff_domain": "baidu.com;linkedin.com", "position": "Researcher;Engineer", "bibtex": "@misc{\nli2022cminhash,\ntitle={C-MinHash: Improving Minwise Hashing with Circulant Permutation},\nauthor={Xiaoyun Li and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=NrkAAcMpRoT}\n}", "github": "", "project": "", "reviewers": "5hPp;89pE;yZFL;7874", "site": "https://openreview.net/forum?id=NrkAAcMpRoT", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "80;43;70;170", "wc_summary_review": "15;26;41;77", "wc_main_review": "163;154;310;129", "wc_review": "258;223;421;376", "wc_reply_reviewers": "0;0;226;34", "wc_reply_authors": "301;251;413;552", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.75, 47.714646598293065 ], "wc_summary_review_avg": [ 39.75, 23.40272420040026 ], "wc_main_review_avg": [ 189.0, 70.96125703508922 ], "wc_review_avg": [ 319.5, 81.53066907612128 ], "wc_reply_reviewers_avg": [ 65.0, 93.98404119849285 ], "wc_reply_authors_avg": [ 379.25, 115.70733554965302 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=7553255132887924050&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "NsyO8nGpaGG", "title": "Comparing Human and Machine Bias in Face Recognition", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Much recent research has uncovered and discussed serious concerns of bias in facial analysis technologies, finding performance disparities between groups of people based on perceived gender, skin type, lighting condition, etc. These audits are immensely important and successful at measuring algorithmic bias but have two major challenges: the audits (1) use facial recognition datasets which lack quality metadata, like LFW and CelebA, and (2) do not compare their observed algorithmic bias to the biases of their human alternatives. In this paper, we release improvements to the LFW and CelebA datasets which will enable future researchers to obtain measurements of algorithmic bias that are not tainted by major flaws in the dataset (e.g. identical images appearing in both the gallery and test set). We also use these new data to develop a series of challenging facial identification and verification questions that we administered to various algorithms and a large, balanced sample of human reviewers. We find that both computer models and human survey participants perform significantly better at the verification task, generally obtain lower accuracy rates on dark-skinned or female subjects for both tasks, and obtain higher accuracy rates when their demographics match that of the question. 
Computer models are observed to achieve a higher level of accuracy than the survey participants on both tasks and exhibit bias to similar degrees as the human survey participants.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/7eaac8da1bf7fa1441fd263d6b0dd575fd85a059.zip", "author": "Samuel Dooley;Ryan Downing;George Zhihong Wei;Nathan Shankar;Bradon Michael Thymes;Gudrun Lilja Thorkelsdottir;Tiye Kurtz-Miott;Rachel Mattson;Olufemi Obiwumi;Valeriia Cherepanova;Micah Goldblum;John P Dickerson;Tom Goldstein", "authorids": "~Samuel_Dooley1;~Ryan_Downing1;~George_Zhihong_Wei1;~Nathan_Shankar1;~Bradon_Michael_Thymes1;~Gudrun_Lilja_Thorkelsdottir1;tkurtzmiott@ucsd.edu;rachelmattson1008@gmail.com;oobiwumi@haverford.edu;~Valeriia_Cherepanova1;~Micah_Goldblum1;~John_P_Dickerson1;~Tom_Goldstein1", "gender": ";M;M;M;;F;;;;F;;M;M", "homepage": ";https://ryansdowning.com/;https://gzhihongwei.github.io;https://njshankar.github.io/personal/;;;;;;https://www.vcherepanova.com/;;https://jpdickerson.com/;https://www.cs.umd.edu/~tomg/", "dblp": ";;;;;;;;;;241/7231;75/8479;25/8184", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;;;;;;PySUqqUAAAAJ;pGDKzuUAAAAJ;https://scholar.google.com.tw/citations?user=QgDpfCQAAAAJ;KmSuVtgAAAAJ", "orcid": ";;0000-0002-7773-1472;;;0000-0001-8627-3218;;;;;;0000-0003-2231-680X;", "linkedin": ";ryan-downing-074354170/;gzhihongwei;;;;;;;;;john-dickerson-83a74a7/;", "or_profile": "~Samuel_Dooley1;~Ryan_Downing1;~George_Zhihong_Wei1;~Nathan_Shankar1;~Bradon_Michael_Thymes1;~Gudrun_Lilja_Thorkelsdottir1;tkurtzmiott@ucsd.edu;rachelmattson1008@gmail.com;oobiwumi@haverford.edu;~Valeriia_Cherepanova1;~Micah_Goldblum1;~John_P_Dickerson1;~Tom_Goldstein1", "aff": ";University of Maryland, College Park;University of Massachusetts, Amherst;Pomona College;;University of Maryland, College Park;;;;Amazon;New York University;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": ";umd.edu;umass.edu;pomona.edu;;umd.edu;;;;amazon.com;nyu.edu;umd.edu;umd.edu", "position": ";Undergrad student;Undergrad student;Undergrad student;;Undergrad student;;;;Intern;Postdoc;Assistant Professor;Associate Professor", "bibtex": "@misc{\ndooley2022comparing,\ntitle={Comparing Human and Machine Bias in Face Recognition},\nauthor={Samuel Dooley and Ryan Downing and George Zhihong Wei and Nathan Shankar and Bradon Michael Thymes and Gudrun Lilja Thorkelsdottir and Tiye Kurtz-Miott and Rachel Mattson and Olufemi Obiwumi and Valeriia Cherepanova and Micah Goldblum and John P Dickerson and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=NsyO8nGpaGG}\n}", "github": "", "project": "", "reviewers": "EdBJ;P8Sr;zEo3", "site": "https://openreview.net/forum?id=NsyO8nGpaGG", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;4;4", "correctness": "4;4;4", "technical_novelty": "1;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "59;201;69", "wc_summary_review": "41;242;68", "wc_main_review": "91;203;204", "wc_review": "191;646;341", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 109.66666666666667, 64.71132478597201 ], "wc_summary_review_avg": [ 117.0, 
89.07300376657341 ], "wc_main_review_avg": [ 166.0, 53.034579914115156 ], "wc_review_avg": [ 392.6666666666667, 189.31161847305856 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 13, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17578125608394673471&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;0;3;4;0;0", "aff_unique_norm": "University of Maryland;University of Massachusetts Amherst;Pomona College;Amazon;New York University", "aff_unique_dep": ";;;Amazon.com, Inc.;", "aff_unique_url": "https://www.umd.edu;https://www.umass.edu;https://www.pomona.edu;https://www.amazon.com;https://www.nyu.edu", "aff_unique_abbr": "UMD;UMass Amherst;Pomona College;Amazon;NYU", "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "College Park;Amherst;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Natural Language Descriptions of Deep Visual Features", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5988", "id": "NudBMY-tzDr", "poster": "", "openreview": "https://openreview.net/forum?id=NudBMY-tzDr", "slides": "https://iclr.cc/virtual/2022/poster/5988", "video": "https://iclr.cc/virtual/2022/poster/5988", "author_site": "Evan Hernandez, Sarah Schwettmann, David Bau, Teona Bagashvili, Antonio Torralba, Jacob Andreas", "tldr": "", "abstract": "Some neurons in deep networks specialize in recognizing highly specific perceptual, structural, or semantic features of inputs. In computer vision, techniques exist for identifying neurons that respond to individual concept categories like colors, textures, and object classes. But these techniques are limited in scope, labeling only a small subset of neurons and behaviors in any network. Is a richer characterization of neuron-level computation possible? We introduce a procedure (called MILAN, for mutual information-guided linguistic annotation of neurons) that automatically labels neurons with open-ended, compositional, natural language descriptions. Given a neuron, MILAN generates a description by searching for a natural language string that maximizes pointwise mutual information with the image regions in which the neuron is active. MILAN produces fine-grained descriptions that capture categorical, relational, and logical structure in learned features. These descriptions obtain high agreement with human-generated feature descriptions across a diverse set of model architectures and tasks, and can aid in understanding and controlling learned models. We highlight three applications of natural language neuron descriptions. First, we use MILAN for analysis, characterizing the distribution and importance of neurons selective for attribute, category, and relational information in vision models. Second, we use MILAN for auditing, surfacing neurons sensitive to human faces in datasets designed to obscure them. 
Finally, we use MILAN for editing, improving robustness in an image classifier by deleting neurons sensitive to text features spuriously correlated with class labels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Evan Hernandez;Sarah Schwettmann;David Bau;Teona Bagashvili;Antonio Torralba;Jacob Andreas", "authorids": "~Evan_Hernandez1;~Sarah_Schwettmann2;~David_Bau1;~Teona_Bagashvili1;~Antonio_Torralba1;~Jacob_Andreas1", "gender": "M;F;M;F;M;M", "homepage": "https://evandez.com;;https://baulab.info/;;http://web.mit.edu/torralba/www//;http://web.mit.edu/jda/www", "dblp": ";;47/3614;;t/AntonioBTorralba;97/8154", "google_scholar": ";;CYI6cKgAAAAJ;;https://scholar.google.com.tw/citations?user=8cxDHS4AAAAJ;dnZ8udEAAAAJ", "orcid": "0000-0002-8876-1781;0000-0001-6385-1396;0000-0003-1744-6765;;;", "linkedin": "evandez/;;david-bau-4b8130/;https://www.linkedin.com/mwlite/in/teona-bagashvili;;", "or_profile": "~Evan_Hernandez1;~Sarah_Schwettmann2;~David_Bau1;~Teona_Bagashvili1;~Antonio_Torralba1;~Jacob_Andreas1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Harvard University;Allegheny College;Massachusetts Institute of Technology;Microsoft", "aff_domain": "mit.edu;mit.edu;harvard.edu;allegheny.edu;mit.edu;microsoft.com", "position": "PhD student;Postdoc;Postdoc;Undergrad student;Full Professor;Researcher", "bibtex": "@inproceedings{\nhernandez2022natural,\ntitle={Natural Language Descriptions of Deep Features},\nauthor={Evan Hernandez and Sarah Schwettmann and David Bau and Teona Bagashvili and Antonio Torralba and Jacob Andreas},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NudBMY-tzDr}\n}", "github": "", "project": "", "reviewers": "YviU;eaMc;CDo5", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;3", "correctness": "3;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;4;3", "wc_summary_paper": "116;195;164", "wc_summary_review": "25;39;54", "wc_main_review": "543;115;427", "wc_review": "684;349;645", "wc_reply_reviewers": "0;6;0", "wc_reply_authors": "951;58;465", "reply_reviewers": "0;1;0", "reply_authors": "2;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 158.33333333333334, 32.49957264676294 ], "wc_summary_review_avg": [ 39.333333333333336, 11.841546445554407 ], "wc_main_review_avg": [ 361.6666666666667, 180.73430468200797 ], "wc_review_avg": [ 559.3333333333334, 149.57792469330343 ], "wc_reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "wc_reply_authors_avg": [ 491.3333333333333, 365.04094138724895 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14718294843289648598&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=NudBMY-tzDr", "email": "mit.edu;mit.edu;harvard.edu;allegheny.edu;mit.edu;microsoft.com", "author_num": 6, "aff_unique_index": "0;0;1;2;0;3", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University;Allegheny 
College;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://web.mit.edu;https://www.harvard.edu;https://www.allegheny.edu;https://www.microsoft.com", "aff_unique_abbr": "MIT;Harvard;Allegheny;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Nus6fOfh1HW", "title": "On the Relationship between Heterophily and Robustness of Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Empirical studies on the robustness of graph neural networks (GNNs) have suggested a relation between the vulnerabilities of GNNs to adversarial attacks and the increased presence of heterophily in perturbed graphs (where edges tend to connect nodes with dissimilar features and labels). In this work, we formalize the relation between heterophily and robustness, bridging two topics previously investigated by separate lines of research. We theoretically and empirically show that for graphs exhibiting homophily (low heterophily), impactful structural attacks always lead to increased levels of heterophily, while for graphs with heterophily the change in the homophily level depends on the node degrees. By leveraging these insights, we deduce that a design principle identified to significantly improve predictive performance under heterophily\u2014separate aggregators for ego- and neighbor-embeddings\u2014can also inherently offer increased robustness to GNNs. Our extensive empirical analysis shows that GNNs adopting this design alone can achieve significantly improved empirical and certifiable robustness compared to the best-performing unvaccinated model. Furthermore, models with this design can be readily combined with explicit defense mechanisms to yield improved robustness with up to 18.33% increase in performance under attacks compared to the best-performing vaccinated model.", "keywords": "graph neural networks;adversarial attacks;heterophily;structural perturbation;robustness;relation", "primary_area": "", "supplementary_material": "/attachment/d5d38657aee07450e63c44eb144fb9a8d5c7d543.zip", "author": "Jiong Zhu;Junchen Jin;Donald Loveland;Michael T Schaub;Danai Koutra", "authorids": "~Jiong_Zhu1;~Junchen_Jin1;~Donald_Loveland2;~Michael_T_Schaub1;~Danai_Koutra1", "gender": "M;M;M;;F", "homepage": "https://www.jiongzhu.net;https://mark-jin.com;https://www.donaldloveland.com;https://michaelschaub.github.io/;http://web.eecs.umich.edu/~dkoutra/", "dblp": "51/8525;;;72/10263;91/9987", "google_scholar": "KjGFQ0QAAAAJ;y8YVvSAAAAAJ;mycopgEAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=bDrA1-8AAAAJ", "orcid": "0000-0002-6145-3295;;0009-0004-3257-0128;0000-0003-2426-6404;0000-0002-3206-8179", "linkedin": ";junchen-jin/;;;", "or_profile": "~Jiong_Zhu1;~Junchen_Jin1;~Donald_Loveland2;~Michael_T_Schaub1;~Danai_Koutra1", "aff": "University of Michigan;Northwestern University;University of Michigan;RWTH Aachen University;Amazon", "aff_domain": "umich.edu;northwestern.edu;umich.edu;rwth-aachen.de;amazon.com", "position": "PhD student;MS student;PhD student;Assistant Professor;Scholar", "bibtex": "@misc{\nzhu2022on,\ntitle={On the Relationship between Heterophily and Robustness of Graph Neural Networks},\nauthor={Jiong Zhu and Junchen Jin and Donald Loveland and Michael T Schaub and Danai Koutra},\nyear={2022},\nurl={https://openreview.net/forum?id=Nus6fOfh1HW}\n}", "github": "", "project": "", "reviewers": 
"kkG8;jMaA;aMbV;Vyu4", "site": "https://openreview.net/forum?id=Nus6fOfh1HW", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;4;4", "correctness": "4;2;3;4", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "70;31;65;118", "wc_summary_review": "31;26;26;69", "wc_main_review": "149;212;610;442", "wc_review": "250;269;701;629", "wc_reply_reviewers": "60;0;56;54", "wc_reply_authors": "839;579;2110;1574", "reply_reviewers": "1;0;1;1", "reply_authors": "3;2;5;4", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 71.0, 31.008063467427306 ], "wc_summary_review_avg": [ 38.0, 18.01388353465182 ], "wc_main_review_avg": [ 353.25, 184.02903982795758 ], "wc_review_avg": [ 462.25, 204.4521643319043 ], "wc_reply_reviewers_avg": [ 42.5, 24.632295873507204 ], "wc_reply_authors_avg": [ 1275.5, 604.3874998707369 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.4714045207910316, "corr_recommendation_correctness": 0.4923659639173309, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2111760058415735217&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "University of Michigan;Northwestern University;RWTH Aachen University;Amazon", "aff_unique_dep": ";;;Amazon.com, Inc.", "aff_unique_url": "https://www.umich.edu;https://www.northwestern.edu;https://www.rwth-aachen.de;https://www.amazon.com", "aff_unique_abbr": "UM;NU;RWTH;Amazon", "aff_campus_unique_index": "1", "aff_campus_unique": ";Aachen", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Germany" }, { "id": "NuzF7PHTKRw", "title": "EAT-C: Environment-Adversarial sub-Task Curriculum for Efficient Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement learning (RL)'s efficiency can drastically degrade on long-horizon tasks due to sparse rewards and the RL policy can be fragile to small changes in deployed environments. To improve RL's efficiency and generalization to varying environments, we study how to automatically generate a curriculum of tasks with coupled environments for RL. To this end, we train two curriculum policies together with RL: (1) a co-operative planning policy recursively decomposing a hard task into coarse-to-fine sub-task sequences as a tree; and (2) an adversarial policy modifying the environment (e.g., position/size of obstacles) in each sub-task. They are complementary in acquiring more informative feedback for RL: the planning policy provides dense reward of finishing easier sub-tasks while the environment policy modifies these sub-tasks to be adequately challenging and diverse so the RL agent can quickly adapt to different tasks/environments. On the other hand, they are trained using the RL agent's dense feedback on sub-tasks so the sub-task curriculum keeps adaptive to the agent's progress via this ``iterative mutual-boosting'' scheme. 
Moreover, the sub-task tree naturally enables an easy-to-hard curriculum for every policy: its top-down construction gradually increases sub-tasks the planning policy needs to generate, while the adversarial training between the environment policy and the RL policy follows a bottom-up traversal that starts from a dense sequence of easier sub-tasks allowing more frequent modifications to the environment. Therefore, jointly training the three policies leads to efficient RL guided by a curriculum progressively improving the sparse reward and generalization. We compare our method with popular RL/planning approaches targeting similar problems and the ones with environment generators or adversarial agents. Thorough experiments on diverse benchmark tasks demonstrate significant advantages of our method on improving RL's efficiency and generalization. ", "keywords": "reinforcement learning;curriculum learning;sub-tasks;adversarial environment;path planning", "primary_area": "", "supplementary_material": "", "author": "Shuang Ao;Tianyi Zhou;Jing Jiang;Guodong Long;Xuan Song;Chengqi Zhang", "authorids": "~Shuang_Ao3;~Tianyi_Zhou1;~Jing_Jiang6;~Guodong_Long2;~Xuan_Song5;~Chengqi_Zhang1", "gender": "M;M;F;M;;M", "homepage": "https://github.com/Shuang-AO;https://tianyizhou.github.io/;https://www.uts.edu.au/staff/jing.jiang;https://www.uts.edu.au/staff/guodong.long;;https://research.polyu.edu.hk/en/persons/chengqi-zhang", "dblp": ";88/8205-1;68/1974-2;34/10089;;71/964", "google_scholar": ";OKvgizMAAAAJ;https://scholar.google.com.au/citations?hl=en;https://scholar.google.com.au/citations?user=Pl8m7hMAAAAJ;;https://scholar.google.com.au/citations?user=B6lBmqEAAAAJ", "orcid": ";0000-0001-5348-0632;;0000-0003-3740-9515;;0000-0001-5715-7154", "linkedin": ";tianyizhou;;;;chengqi-zhang-55aa8910/", "or_profile": "~Shuang_Ao3;~Tianyi_Zhou1;~Jing_Jiang6;~Guodong_Long2;~Xuan_Song5;~Chengqi_Zhang1", "aff": "University of Technology Sydney;University of Washington, Seattle;University of Technology Sydney;University of Technology Sydney;;University of Technology Sydney", "aff_domain": "uts.edu.au;uw.edu;uts.edu.au;uts.edu.au;;uts.edu.au", "position": "PhD student;PhD student;Lecturer;Associate Professor;;Full Professor", "bibtex": "@misc{\nao2022eatc,\ntitle={{EAT}-C: Environment-Adversarial sub-Task Curriculum for Efficient Reinforcement Learning},\nauthor={Shuang Ao and Tianyi Zhou and Jing Jiang and Guodong Long and Xuan Song and Chengqi Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=NuzF7PHTKRw}\n}", "github": "", "project": "", "reviewers": "o38w;Zhvq;3c96;7T33", "site": "https://openreview.net/forum?id=NuzF7PHTKRw", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;3", "correctness": "3;4;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "77;63;100;79", "wc_summary_review": "68;45;87;36", "wc_main_review": "844;631;296;212", "wc_review": "989;739;483;327", "wc_reply_reviewers": "1883;959;0;0", "wc_reply_authors": "3926;2516;467;348", "reply_reviewers": "6;3;0;0", "reply_authors": "8;5;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 79.75, 13.216939887886303 ], "wc_summary_review_avg": [ 59.0, 19.937402037376884 ], "wc_main_review_avg": [ 495.75, 254.94349864234624 ], "wc_review_avg": [ 634.5, 252.04116727233273 ], "wc_reply_reviewers_avg": [ 710.5, 782.0052749182706 ], 
"wc_reply_authors_avg": [ 1814.25, 1493.060008003697 ], "reply_reviewers_avg": [ 2.25, 2.48746859276655 ], "reply_authors_avg": [ 3.75, 2.947456530637899 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10853735307068062859&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Technology Sydney;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.uts.edu.au;https://www.washington.edu", "aff_unique_abbr": "UTS;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Australia;United States" }, { "title": "Neural Program Synthesis with Query", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7064", "id": "NyJ2KIN8P17", "poster": "", "openreview": "https://openreview.net/forum?id=NyJ2KIN8P17", "slides": "https://iclr.cc/virtual/2022/poster/7064", "video": "https://iclr.cc/virtual/2022/poster/7064", "author_site": "Di Huang, Rui Zhang, Xing Hu, Xishan Zhang, Pengwei Jin, Nan Li, Zidong Du, Qi Guo, Yunji Chen", "tldr": "", "abstract": "Aiming to find a program satisfying the user intent given input-output examples, program synthesis has attracted increasing interest in the area of machine learning. Despite the promising performance of existing methods, most of their success comes from the privileged information of well-designed input-output examples. However, providing such input-output examples is unrealistic because it requires the users to have the ability to describe the underlying program with a few input-output examples under the training distribution. In this work, we propose a query-based framework that trains a query neural network to generate informative input-output examples automatically and interactively from a large query space. The quality of the query depends on the amount of the mutual information between the query and the corresponding program, which can guide the optimization of the query framework. To estimate the mutual information more accurately, we introduce the functional space (F-space) which models the relevance between the input-output examples and the programs in a differentiable way. We evaluate the effectiveness and generalization of the proposed query-based framework on the Karel task and the list processing task. 
Experimental results show that the query-based framework can generate informative input-output examples which achieve and even outperform well-designed input-output examples.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/6723960ae0943718442d71de8c64a67c25fa4a89.zip", "author": "Di Huang;Rui Zhang;Xing Hu;Xishan Zhang;Pengwei Jin;Nan Li;Zidong Du;Qi Guo;Yunji Chen", "authorids": "~Di_Huang5;~Rui_Zhang1;~Xing_Hu3;~Xishan_Zhang1;~Pengwei_Jin1;~Nan_Li3;~Zidong_Du1;~Qi_Guo4;~Yunji_Chen1", "gender": "M;F;F;;M;M;;M;M", "homepage": ";;;;https://github.com/Pengwei-Jin;http://home.ustc.edu.cn/~ln2016/;https://zidongdu.github.io/;http://novel.ict.ac.cn/qguo;", "dblp": ";60/2536-40;49/10052-1;133/6391;304/2505;;44/11216;67/398-1;48/474", "google_scholar": ";dse6jAsAAAAJ;Hc3iRxUAAAAJ;;;;https://scholar.google.com.sg/citations?user=8N9ym9YAAAAJ;;", "orcid": "0000-0002-2370-0072;;;;0000-0002-8267-9824;;0000-0002-7603-4210;;", "linkedin": ";;;;;;;;", "or_profile": "~Di_Huang5;~Rui_Zhang1;~Xing_Hu3;~Xishan_Zhang1;~Pengwei_Jin1;~Nan_Li3;~Zidong_Du1;~Qi_Guo4;~Yunji_Chen1", "aff": "Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, CAS;Institute of Computing Technology, Chinese Academy of Sciences;, Cambricon Technologies;Chinese Academy of Sciences;University of Science and Technology of China;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;ict.ac.cn;ict.ac.cn;cambricon.com;ac.cn;ustc.edu.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn", "position": "PhD student;Assistant Professor;Associate Professor;Researcher;PhD student;MS student;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2022neural,\ntitle={Neural Program Synthesis with Query},\nauthor={Di Huang and Rui Zhang and Xing Hu and Xishan Zhang and Pengwei Jin and Nan Li and Zidong Du and Qi Guo and Yunji Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=NyJ2KIN8P17}\n}", "github": "", "project": "", "reviewers": "Ytee;KXs9;WAY8", "pdf_size": 0, "recommendation": "3;3;8", "confidence": "4;5;4", "correctness": "3;2;4", "technical_novelty": "3;2;3", "empirical_novelty": "0;1;3", "wc_summary_paper": "197;172;223", "wc_summary_review": "29;20;210", "wc_main_review": "554;336;1523", "wc_review": "780;528;1956", "wc_reply_reviewers": "0;0;789", "wc_reply_authors": "1535;1926;2006", "reply_reviewers": "0;0;2", "reply_authors": "3;3;4", "recommendation_avg": [ 4.666666666666667, 2.357022603955158 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 197.33333333333334, 20.82199691565522 ], "wc_summary_review_avg": [ 86.33333333333333, 87.52269546935939 ], "wc_main_review_avg": [ 804.3333333333334, 515.90847594855 ], "wc_review_avg": [ 1088.0, 622.3311015850003 ], "wc_reply_reviewers_avg": [ 263.0, 371.938166904124 ], "wc_reply_authors_avg": [ 1822.3333333333333, 205.78359722992715 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.5, 
"corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2797759486685386369&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=NyJ2KIN8P17", "email": "ict.ac.cn;ict.ac.cn;ict.ac.cn;cambricon.com;ac.cn;ustc.edu.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn", "author_num": 9, "aff_unique_index": "0;0;0;1;0;2;0;0;0", "aff_unique_norm": "Chinese Academy of Sciences;Cambricon Technologies;University of Science and Technology of China", "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.ict.ac.cn;https://www.cambricon.com;http://www.ustc.edu.cn", "aff_unique_abbr": "CAS;Cambricon;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "O-l4bthTxno", "title": "Not All Regions are Worthy to be Distilled: Region-aware Knowledge Distillation Towards Efficient Image-to-Image Translation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent progress in image-to-image translation has witnessed the success of generative adversarial networks (GANs). However, GANs usually contain a huge number of parameters, which lead to intolerant memory and computation consumption and limit their deployment on edge devices. To address this issue, knowledge distillation is proposed to transfer the knowledge learned by a cumbersome teacher model to an efficient student model. However, previous knowledge distillation methods directly train the student to learn teacher knowledge in all the spatial regions of the images but ignore the fact that in image-to-image translation a large number of regions (e.g. background regions) should not be translated and teacher features in these regions are not worthy to be distilled. To tackle this challenge, in this paper, we propose Region-aware Knowledge Distillation which first localizes the crucial regions in the images with attention mechanism. Then, teacher features in these crucial regions are distilled to students with a region-wise contrastive learning framework. Besides distilling teacher knowledge in features, we further introduce perceptual distillation to distill teacher knowledge in the generated images. Experiments with four comparison methods demonstrate the substantial effectiveness of our method on both paired and unpaired image-to-image translation. For instance, our 7.08X compressed and 6.80X accelerated CycleGAN student outperforms its teacher by 1.36 and 1.16 FID scores on Horse to Zebra and Zebra to Horse, respectively. 
Codes have been released in the supplementary material and will be released on GitHub soon.", "keywords": "Knowledge Distillation;Image-to-Image Translation;GAN;CycleGAN;Pix2Pix", "primary_area": "", "supplementary_material": "/attachment/4b2a97144f7250fe28ea3a8d71fefe244f3bdbf4.zip", "author": "Linfeng Zhang;Kaisheng Ma", "authorids": "~Linfeng_Zhang2;~Kaisheng_Ma1", "gender": "M;M", "homepage": "http://www.zhanglinfeng.tech/;http://group.iiis.tsinghua.edu.cn/~maks/index.html", "dblp": "93/488-1;133/4053.html", "google_scholar": "AK9VF30AAAAJ;VtDpVoEAAAAJ", "orcid": "0000-0002-3341-183X;0000-0001-9226-3366", "linkedin": ";", "or_profile": "~Linfeng_Zhang2;~Kaisheng_Ma1", "aff": "Tsinghua University;", "aff_domain": "tsinghua.edu.cn;", "position": "PhD student;", "bibtex": "@misc{\nzhang2022not,\ntitle={Not All Regions are Worthy to be Distilled: Region-aware Knowledge Distillation Towards Efficient Image-to-Image Translation},\nauthor={Linfeng Zhang and Kaisheng Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=O-l4bthTxno}\n}", "github": "", "project": "", "reviewers": "7RB2;BSRf;rd2a", "site": "https://openreview.net/forum?id=O-l4bthTxno", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;5;4", "correctness": "4;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "106;93;131", "wc_summary_review": "22;127;73", "wc_main_review": "320;268;469", "wc_review": "448;488;673", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 110.0, 15.769168230019828 ], "wc_summary_review_avg": [ 74.0, 42.871902220452036 ], "wc_main_review_avg": [ 352.3333333333333, 85.18346214038392 ], "wc_review_avg": [ 536.3333333333334, 98.00793618659438 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7559289460184544, "corr_recommendation_correctness": -0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IOfUnxP87noJ:scholar.google.com/&scioq=Not+All+Regions+are+Worthy+to+be+Distilled:+Region-aware+Knowledge+Distillation+Towards+Efficient+Image-to-Image+Translation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Open-World Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7118", "id": "O-r8LOR-CCA", "poster": "", "openreview": "https://openreview.net/forum?id=O-r8LOR-CCA", "slides": "https://iclr.cc/virtual/2022/poster/7118", "video": "https://iclr.cc/virtual/2022/poster/7118", "author_site": "Kaidi Cao, Maria Brbic, Jure Leskovec", "tldr": "", "abstract": "A fundamental limitation of applying semi-supervised learning in real-world settings is the assumption that unlabeled test data contains only classes previously encountered in the labeled 
training data. However, this assumption rarely holds for data in-the-wild, where instances belonging to novel classes may appear at testing time. Here, we introduce a novel open-world semi-supervised learning setting that formalizes the notion that novel classes may appear in the unlabeled test data. In this novel setting, the goal is to solve the class distribution mismatch problem between labeled and unlabeled data, where at the test time every input instance either needs to be classified into one of the existing classes or a new unseen class needs to be initialized and the instance assigned to it. To tackle this challenging problem, we propose ORCA, an end-to-end approach that assigns instances to previously seen classes or forms novel classes by grouping similar instances without assuming any prior knowledge. The key idea in ORCA is to utilize uncertainty adaptive margin to circumvent the bias towards seen classes caused by learning seen classes faster than the novel classes. In this way, ORCA gradually increases the discriminability of the model during the training and reduces the gap between intra-class variance of seen with respect to novel classes. Extensive experiments on image classification datasets and a single-cell dataset demonstrate that ORCA consistently outperforms alternative baselines, achieving 25% improvement on seen and 96% improvement on novel classes of the ImageNet dataset. ", "keywords": "deep learning;semi-supervised learning;novel class discovery;clustering", "primary_area": "", "supplementary_material": "", "author": "Kaidi Cao;Maria Brbic;Jure Leskovec", "authorids": "~Kaidi_Cao1;~Maria_Brbic1;~Jure_Leskovec1", "gender": "M;F;", "homepage": "https://ai.stanford.edu/~kaidicao/;https://brbiclab.epfl.ch/;http://cs.stanford.edu/~jure/", "dblp": "203/8207;130/3233;l/JureLeskovec", "google_scholar": "https://scholar.google.com.hk/citations?user=4Zw1PJ8AAAAJ;ltxmeroAAAAJ;Q_kKkIUAAAAJ", "orcid": ";0000-0002-1120-1778;0000-0002-5411-923X", "linkedin": ";;leskovec/", "or_profile": "~Kaidi_Cao1;~Maria_Brbic1;~Jure_Leskovec1", "aff": "Stanford University;;Kumo.AI", "aff_domain": "stanford.edu;;kumo.ai", "position": "PhD student;;Chief Scientist", "bibtex": "@inproceedings{\ncao2022openworld,\ntitle={Open-World Semi-Supervised Learning},\nauthor={Kaidi Cao and Maria Brbic and Jure Leskovec},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=O-r8LOR-CCA}\n}", "github": "", "project": "", "reviewers": "NjL4;VV71;YZto;zKw3;fpiF", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "4;5;3;5;4", "correctness": "4;3;4;3;3", "technical_novelty": "2;4;2;3;2", "empirical_novelty": "3;3;0;3;3", "wc_summary_paper": "88;72;48;76;53", "wc_summary_review": "72;38;26;66;32", "wc_main_review": "518;254;133;685;209", "wc_review": "678;364;207;827;294", "wc_reply_reviewers": "0;0;0;0;34", "wc_reply_authors": "1189;681;421;1534;991", "reply_reviewers": "0;0;0;0;1", "reply_authors": "2;1;1;3;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 2.4, 1.2000000000000002 ], "wc_summary_paper_avg": [ 67.4, 14.853955702101715 ], "wc_summary_review_avg": [ 46.8, 18.616122045152153 ], "wc_main_review_avg": [ 359.8, 207.92056175376212 ], "wc_review_avg": [ 474.0, 237.49273673104196 ], "wc_reply_reviewers_avg": [ 6.8, 13.6 ], "wc_reply_authors_avg": [ 963.2, 387.60515992437456 ], 
"reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 2.0, 0.8944271909999159 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 251, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13685131570461746231&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=O-r8LOR-CCA", "email": "stanford.edu;;kumo.ai", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;Kumo.AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.kumo.ai", "aff_unique_abbr": "Stanford;Kumo.AI", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "O0g6uPDLW7", "title": "On the Adversarial Robustness of Vision Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Following the success in advancing natural language processing and understanding, transformers are expected to bring revolutionary changes to computer vision. This work provides the first and comprehensive study on the robustness of vision transformers (ViTs) against adversarial perturbations. Tested on various white-box and transfer attack settings, we find that ViTs possess better adversarial robustness when compared with convolutional neural networks (CNNs). This observation also holds for certified robustness. We summarize the following main observations contributing to the improved robustness of ViTs:\n 1) Features learned by ViTs contain less low-level information and are more generalizable, which contributes to superior robustness against adversarial perturbations. \n 2) Introducing convolutional or tokens-to-token blocks for learning low-level features in ViTs can improve classification accuracy but at the cost of adversarial robustness.\n 3) Increasing the proportion of transformers in the model structure (when the model consists of both transformer and CNN blocks) leads to better robustness. But for a pure transformer model, simply increasing the size or adding layers cannot guarantee a similar effect.\n 4) Pre-training on larger datasets does not significantly improve adversarial robustness though it is critical for training ViTs. \n 5) Adversarial training is also applicable to ViT for training robust models.\nFurthermore, feature visualization and frequency analysis are conducted for explanation. 
The results show that ViTs are less sensitive to high-frequency perturbations than CNNs and there is a high correlation between how well the model learns low-level features and its robustness against different frequency-based perturbations.", "keywords": "vision transformer (ViT);adversarial robustness", "primary_area": "", "supplementary_material": "/attachment/794f6e10ae4daacc98faf85a4602ea96a3a8f0eb.zip", "author": "Rulin Shao;Zhouxing Shi;Jinfeng Yi;Pin-Yu Chen;Cho-Jui Hsieh", "authorids": "~Rulin_Shao1;~Zhouxing_Shi1;~Jinfeng_Yi1;~Pin-Yu_Chen1;~Cho-Jui_Hsieh1", "gender": ";;M;M;M", "homepage": "https://rulinshao.github.io/;https://shizhouxing.github.io;http://jinfengyi.net/;http://www.pinyuchen.com;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": ";232/2169;117/4898;39/8969;14/2770", "google_scholar": "Vdwh6bcAAAAJ;YFIr4PwAAAAJ;lZxRZ84AAAAJ;jxwlCUUAAAAJ;Wy89g4IAAAAJ", "orcid": ";;;0000-0003-1039-8369;", "linkedin": ";;https://www.linkedin.com/nhome/?trk=;pin-yu-chen-940062a2;", "or_profile": "~Rulin_Shao1;~Zhouxing_Shi1;~Jinfeng_Yi1;~Pin-Yu_Chen1;~Cho-Jui_Hsieh1", "aff": ";University of California, Los Angeles;JD AI Research;International Business Machines;University of California, Los Angeles", "aff_domain": ";ucla.edu;jd.com;ibm.com;ucla.edu", "position": ";PhD student;Senior Director;Research Staff Member;Assistant Professor", "bibtex": "@misc{\nshao2022on,\ntitle={On the Adversarial Robustness of Vision Transformers},\nauthor={Rulin Shao and Zhouxing Shi and Jinfeng Yi and Pin-Yu Chen and Cho-Jui Hsieh},\nyear={2022},\nurl={https://openreview.net/forum?id=O0g6uPDLW7}\n}", "github": "", "project": "", "reviewers": "WHoT;x14o;7ATA;FTwY", "site": "https://openreview.net/forum?id=O0g6uPDLW7", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;3;5;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "48;87;40;23", "wc_summary_review": "68;27;22;29", "wc_main_review": "337;220;1112;236", "wc_review": "453;334;1174;288", "wc_reply_reviewers": "93;0;245;239", "wc_reply_authors": "933;1021;971;977", "reply_reviewers": "1;0;1;1", "reply_authors": "3;3;3;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.5, 23.4574082114798 ], "wc_summary_review_avg": [ 36.5, 18.364367672206956 ], "wc_main_review_avg": [ 476.25, 369.78126980689547 ], "wc_review_avg": [ 562.25, 358.28925116447465 ], "wc_reply_reviewers_avg": [ 144.25, 103.15370812530202 ], "wc_reply_authors_avg": [ 975.5, 31.22098653149833 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 242, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15056564293298660220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, Los Angeles;JD;International Business Machines Corporation", "aff_unique_dep": ";JD AI Research;", "aff_unique_url": "https://www.ucla.edu;https://www.jd.com;https://www.ibm.com", "aff_unique_abbr": "UCLA;JD AI;IBM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United 
States;China" }, { "id": "O17RRqiZc5x", "title": "Adjoined Networks: A Training Paradigm with Applications to Network Compression", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Compressing deep neural networks while maintaining accuracy is important when we want to deploy large, powerful models in production and/or edge devices. One common technique used to achieve this goal is knowledge distillation. Typically, the output of a static pre-defined teacher (a large base network) is used as soft labels to train and transfer information to a student (or smaller) network. In this paper, we introduce Adjoined Networks, or AN, a learning paradigm that trains both the original base network and the smaller compressed network together. In our training approach, the parameters of the smaller network are shared across both the base and the compressed networks. Using our training paradigm, we can simultaneously compress (the student network) and regularize (the teacher network) any architecture. In this paper, we focus on popular CNN-based architectures used for computer vision tasks. We conduct an extensive experimental evaluation of our training paradigm on various large-scale datasets. Using ResNet-50 as the base network, AN achieves 71.8% top-1 accuracy with only 1.8M parameters and 1.6 GFLOPs on the ImageNet data-set. We further propose Differentiable Adjoined Networks (DAN), a training paradigm that augments AN by using neural architecture search to jointly learn both the width and the weights for each layer of the smaller network. DAN achieves ResNet-50 level accuracy on ImageNet with $3.8\\times$ fewer parameters and $2.2\\times$ fewer FLOPs.", "keywords": "Deep Neural Network;Compression;Knowledge Distillation", "primary_area": "", "supplementary_material": "/attachment/6ceae8d6ce66d4b1331dd44f7a5772b3696d114c.zip", "author": "Utkarsh Nath;Shrinu Kushagra;Yingzhen Yang", "authorids": "~Utkarsh_Nath1;~Shrinu_Kushagra1;~Yingzhen_Yang1", "gender": "M;M;M", "homepage": ";https://cs.uwaterloo.ca/~skushagr/;http://yingzhenyang.com", "dblp": "267/2281.html;129/9107;66/3838.html", "google_scholar": "8Gz0AuoAAAAJ;https://scholar.google.ca/citations?user=8RYloKYAAAAJ;", "orcid": ";;", "linkedin": "utkarsh-nath-39793398/;;yingzhen-yang-9b869122", "or_profile": "~Utkarsh_Nath1;~Shrinu_Kushagra1;~Yingzhen_Yang1", "aff": "Arizona State University;Google;Arizona State University", "aff_domain": "asu.edu;google.com;asu.edu", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@misc{\nnath2022adjoined,\ntitle={Adjoined Networks: A Training Paradigm with Applications to Network Compression},\nauthor={Utkarsh Nath and Shrinu Kushagra and Yingzhen Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=O17RRqiZc5x}\n}", "github": "", "project": "", "reviewers": "b2Bx;UxeW;jxrL;Q4KA", "site": "https://openreview.net/forum?id=O17RRqiZc5x", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "5;4;4;4", "correctness": "2;3;2;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;0;3;3", "wc_summary_paper": "34;68;129;92", "wc_summary_review": "20;42;198;40", "wc_main_review": "217;426;1249;156", "wc_review": "271;536;1576;288", "wc_reply_reviewers": "0;0;283;39", "wc_reply_authors": "375;520;994;299", "reply_reviewers": "0;0;2;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 
1.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 80.75, 34.65093793824346 ], "wc_summary_review_avg": [ 75.0, 71.53320907103218 ], "wc_main_review_avg": [ 512.0, 437.12869958400125 ], "wc_review_avg": [ 667.75, 534.7655444211042 ], "wc_reply_reviewers_avg": [ 80.5, 117.99258451275657 ], "wc_reply_authors_avg": [ 547.0, 270.0120367687337 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.3015113445777637, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7299251190472816244&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Arizona State University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.asu.edu;https://www.google.com", "aff_unique_abbr": "ASU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Pruning-Friendly Networks via Frank-Wolfe: One-Shot, Any-Sparsity, And No Retraining", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6158", "id": "O1DEtITim__", "poster": "", "openreview": "https://openreview.net/forum?id=O1DEtITim__", "slides": "https://iclr.cc/virtual/2022/poster/6158", "video": "https://iclr.cc/virtual/2022/poster/6158", "author_site": "Miao Lu, Xiaolong Luo, Tianlong Chen, Wuyang Chen, Dong Liu, Zhangyang Wang", "tldr": "", "abstract": "We present a novel framework to train a large deep neural network (DNN) only $\textit{once}$, which can then be pruned to $\textit{any sparsity ratio}$ to preserve competitive accuracy $\textit{without any re-training}$. Conventional methods often require (iterative) pruning followed by re-training, which not only incurs large overhead beyond the original DNN training but also can be sensitive to retraining hyperparameters. Our core idea is to re-cast the DNN training as an explicit $\textit{pruning-aware}$ process, formulated with an auxiliary $K$-sparse polytope constraint to encourage network weights to lie in a convex hull spanned by $K$-sparse vectors, potentially resulting in sparser weight matrices. We then leverage a stochastic Frank-Wolfe (SFW) algorithm to solve this new constrained optimization, which naturally leads to sparse weight updates each time. We further note an overlooked fact that existing DNN initializations were derived to enhance SGD training (e.g., avoid gradient explosion or collapse), but were unaligned with the challenges of training with SFW. We hence also present the first learning-based initialization scheme specifically for boosting SFW-based DNN training. Experiments on CIFAR-10 and Tiny-ImageNet datasets demonstrate that our new framework named $\textbf{SFW-pruning}$ consistently achieves the state-of-the-art performance on various benchmark DNNs over a wide range of pruning ratios. Moreover, SFW-pruning only needs to train once on the same model and dataset, for obtaining arbitrary ratios, while requiring neither iterative pruning nor retraining. All codes will be released to the public. 
", "keywords": "Pruning;Frank-Wolfe", "primary_area": "", "supplementary_material": "", "author": "Miao Lu;Xiaolong Luo;Tianlong Chen;Wuyang Chen;Dong Liu;Zhangyang Wang", "authorids": "~Miao_Lu3;~Xiaolong_Luo3;~Tianlong_Chen1;~Wuyang_Chen1;~Dong_Liu6;~Zhangyang_Wang1", "gender": ";M;M;;M;M", "homepage": "https://miaolu3.github.io;http://home.ustc.edu.cn/~lxl213/index.html;https://tianlong-chen.github.io;;http://faculty.ustc.edu.cn/dongeliu/;https://vita-group.github.io", "dblp": "09/1168;;;;98/1737-2;119/4026", "google_scholar": "3jS17zQAAAAJ;;LE3ctn0AAAAJ;;lOWByxoAAAAJ;pxFyKAIAAAAJ", "orcid": ";;0000-0001-7774-8197;;0000-0001-9100-2906;", "linkedin": "miao-lu-5bb9a31aa/;\u9704\u9f99-\u9a86-0bb88a194/;tianlong-chen-783862167/;;;", "or_profile": "~Miao_Lu3;~Xiaolong_Luo3;~Tianlong_Chen1;~Wuyang_Chen1;~Dong_Liu6;~Zhangyang_Wang1", "aff": "University of Science and Technology of China;;University of Texas, Austin;;University of Science and Technology of China;University of Texas, Austin", "aff_domain": "ustc.edu.cn;;utexas.edu;;ustc.edu.cn;utexas.edu", "position": "Undergrad student;;PhD student;;Professor;Assistant Professor", "bibtex": "@inproceedings{\nmiao2022learning,\ntitle={Learning Pruning-Friendly Networks via Frank-Wolfe: One-Shot, Any-Sparsity, And No Retraining},\nauthor={Miao Lu and Xiaolong Luo and Tianlong Chen and Wuyang Chen and Dong Liu and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=O1DEtITim__}\n}", "github": "", "project": "", "reviewers": "9FPN;urBu;22uh;BXpo", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "39;24;39;44", "wc_summary_review": "70;40;39;28", "wc_main_review": "413;480;87;196", "wc_review": "522;544;165;268", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "93;690;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;0;0", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 36.5, 7.5 ], "wc_summary_review_avg": [ 44.25, 15.594470173750693 ], "wc_main_review_avg": [ 294.0, 159.0675956944091 ], "wc_review_avg": [ 374.75, 162.57209938977843 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 195.75, 287.87008788687996 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4199834330155399253&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=O1DEtITim__", "email": "ustc.edu.cn;;utexas.edu;;ustc.edu.cn;utexas.edu", "author_num": 6, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Science and Technology of China;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.utexas.edu", "aff_unique_abbr": "USTC;UT Austin", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "O2s9k4h0x7L", "title": "A Deep Latent Space Model for Directed Graph Representation 
Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph representation learning is a fundamental problem for modeling relational data and benefits a number of downstream applications. Traditional Bayesian-based random graph models and recent deep learning based methods are complementary to each other in interpretability and scalability. To take the advantages of both models, some combined methods have been proposed. However, existing models are mainly designed for \\textit{undirected graphs}, while a large portion of real-world graphs are directed. The focus of this paper is on \\textit{directed graphs}. We propose a Deep Latent Space Model (DLSM) for directed graphs to incorporate the traditional latent space random graph model into deep learning frameworks via a hierarchical variational auto-encoder architecture. To adapt to directed graphs, our model generates multiple highly interpretable latent variables as node representations, and the interpretability of representing node influences is theoretically proved. Moreover, our model achieves good scalability for large graphs via the fast stochastic gradient variational Bayes inference algorithm. The experimental results on real-world graphs demonstrate that our proposed model achieves the state-of-the-art performances on link prediction and community detection tasks while generating interpretable node representations.", "keywords": "graph representation learning;directed graph;latent space model;variational autoencoder", "primary_area": "", "supplementary_material": "/attachment/a148c89e991f9b3dc5dfcf480b477cee969cafac.zip", "author": "Hanxuan Yang;Qingchao Kong;Wenji Mao", "authorids": "~Hanxuan_Yang1;~Qingchao_Kong1;~Wenji_Mao1", "gender": "M;M;F", "homepage": ";https://people.ucas.ac.cn/~0044647;", "dblp": ";119/3777;16/2159.html", "google_scholar": "HJz2cw8AAAAJ;https://scholar.google.com.hk/citations?user=pu4V6MkAAAAJ;h6m4X_AAAAAJ", "orcid": "0000-0002-4473-2356;0000-0002-1929-8404;", "linkedin": "%E7%80%9A%E8%BD%A9-%E6%9D%A8-b0b010182/;;", "or_profile": "~Hanxuan_Yang1;~Qingchao_Kong1;~Wenji_Mao1", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;ia.ac.cn", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nyang2022a,\ntitle={A Deep Latent Space Model for Directed Graph Representation Learning},\nauthor={Hanxuan Yang and Qingchao Kong and Wenji Mao},\nyear={2022},\nurl={https://openreview.net/forum?id=O2s9k4h0x7L}\n}", "github": "", "project": "", "reviewers": "s9T6;D8S8;NCEy;bcyE", "site": "https://openreview.net/forum?id=O2s9k4h0x7L", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;3;4;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "106;82;68;102", "wc_summary_review": "81;98;45;71", "wc_main_review": "207;301;172;538", "wc_review": "394;481;285;711", "wc_reply_reviewers": "93;624;0;0", "wc_reply_authors": "795;2454;456;1155", "reply_reviewers": "1;2;0;0", "reply_authors": "2;5;1;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.5, 15.386682553429118 ], "wc_summary_review_avg": [ 73.75, 19.201236939322424 ], "wc_main_review_avg": [ 304.5, 142.82594302156733 ], 
"wc_review_avg": [ 467.75, 156.67063381501973 ], "wc_reply_reviewers_avg": [ 179.25, 259.5682713661283 ], "wc_reply_authors_avg": [ 1215.0, 756.8358474596721 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bLm8GnbI-nIJ:scholar.google.com/&scioq=A+Deep+Latent+Space+Model+for+Directed+Graph+Representation+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Institute of Automation", "aff_unique_url": "http://www.ia.cas.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Anti-Oversmoothing in Deep Vision Transformers via the Fourier Domain Analysis: From Theory to Practice", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6656", "id": "O476oWmiNNp", "poster": "", "openreview": "https://openreview.net/forum?id=O476oWmiNNp", "slides": "https://iclr.cc/virtual/2022/poster/6656", "video": "https://iclr.cc/virtual/2022/poster/6656", "author_site": "Peihao Wang, Wenqing Zheng, Tianlong Chen, Zhangyang Wang", "tldr": "", "abstract": "Vision Transformer (ViT) has recently demonstrated promise in computer vision problems. However, unlike Convolutional Neural Networks (CNN), it is known that the performance of ViT saturates quickly with depth increasing, due to the observed attention collapse or patch uniformity. Despite a couple of empirical solutions, a rigorous framework studying on this scalability issue remains elusive. In this paper, we first establish a rigorous theory framework to analyze ViT features from the Fourier spectrum domain. We show that the self-attention mechanism inherently amounts to a low-pass filter, which indicates when ViT scales up its depth, excessive low-pass filtering will cause feature maps to only preserve their Direct-Current (DC) component. We then propose two straightforward yet effective techniques to mitigate the undesirable low-pass limitation. The first technique, termed AttnScale, decomposes a self-attention block into low-pass and high-pass components, then rescales and combines these two filters to produce an all-pass self-attention matrix. The second technique, termed FeatScale, re-weights feature maps on separate frequency bands to amplify the high-frequency signals. Both techniques are efficient and hyperparameter-free, while effectively overcoming relevant ViT training artifacts such as attention collapse and patch uniformity. By seamlessly plugging in our techniques to multiple ViT variants, we demonstrate that they consistently help ViTs benefit from deeper architectures, bringing up to 1.1% performance gains \"for free\" (e.g., with little parameter overhead). 
We publicly release our code and pre-trained models at https://github.com/VITA-Group/ViT-Anti-Oversmoothing.", "keywords": "Deep ViT;Spectral Analysis;Attention Collapse;Patch Diversity", "primary_area": "", "supplementary_material": "/attachment/d22bd8355ab7038de47ea1de25d54752449585ad.zip", "author": "Peihao Wang;Wenqing Zheng;Tianlong Chen;Zhangyang Wang", "authorids": "~Peihao_Wang1;~Wenqing_Zheng1;~Tianlong_Chen1;~Zhangyang_Wang1", "gender": "M;M;M;M", "homepage": "https://peihaowang.github.io/;https://wenqing-zheng.github.io;https://tianlong-chen.github.io;https://vita-group.github.io", "dblp": "239/4075;;;119/4026", "google_scholar": "fqf2tBsAAAAJ;https://scholar.google.com/citations?hl=zh-CN;LE3ctn0AAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0002-8283-7511;0000-0001-7774-8197;", "linkedin": "peihao-wang-25a411162/;;tianlong-chen-783862167/;", "or_profile": "~Peihao_Wang1;~Wenqing_Zheng1;~Tianlong_Chen1;~Zhangyang_Wang1", "aff": "University of Texas, Austin;University of Texas, Austin;University of Texas, Austin;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;utexas.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2022antioversmoothing,\ntitle={Anti-Oversmoothing in Deep Vision Transformers via the Fourier Domain Analysis: From Theory to Practice},\nauthor={Peihao Wang and Wenqing Zheng and Tianlong Chen and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=O476oWmiNNp}\n}", "github": "", "project": "", "reviewers": "kYZM;DHWs;D5xX", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;5;2", "correctness": "3;2;3", "technical_novelty": "3;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "53;70;89", "wc_summary_review": "50;37;19", "wc_main_review": "139;121;84", "wc_review": "242;228;192", "wc_reply_reviewers": "27;21;0", "wc_reply_authors": "505;998;536", "reply_reviewers": "1;1;0", "reply_authors": "3;5;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 70.66666666666667, 14.70449666674185 ], "wc_summary_review_avg": [ 35.333333333333336, 12.710450643291745 ], "wc_main_review_avg": [ 114.66666666666667, 22.895899681432528 ], "wc_review_avg": [ 220.66666666666666, 21.060758665241753 ], "wc_reply_reviewers_avg": [ 16.0, 11.575836902790225 ], "wc_reply_authors_avg": [ 679.6666666666666, 225.45115263005914 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1886992923455463917&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=O476oWmiNNp", "email": "utexas.edu;utexas.edu;utexas.edu;utexas.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States"
}, { "id": "O4dxuEsIo9S", "title": "Spending Your Winning Lottery Better After Drawing It", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Lottery Ticket Hypothesis (LTH) (Frankle & Carbin, 2019) suggest suggests that a dense neural network contains a sparse sub-network that can match the performance of the original dense network when trained in isolation from scratch. Most works retrain the sparse sub-network with the same training protocols as its dense network, such as initialization, architecture blocks, and training recipes. However, till now it is unclear that whether these training protocols are optimal for sparse networks. \nIn this paper, we demonstrate that it is unnecessary for spare retraining to strictly inherit those properties from the dense network. Instead, by plugging in purposeful \"tweaks\" of the sparse subnetwork architecture or its training recipe, its retraining can be significantly improved than the default, especially at high sparsity levels. Combining all our proposed \"tweaks\" can yield the new state-of-the-art performance of LTH, and these modifications can be easily adapted to other sparse training algorithms in general. Specifically, we have achieved a significant and consistent performance gain of 1.05% - 4.93% for ResNet18 on CIFAR-100 over vanilla-LTH. Moreover, our methods are shown to generalize across datasets (CIFAR10, CIFAR100, TinyImageNet) and architectures (Vgg16, ResNet-18/ResNet-34, MobileNet). All codes will be publicly available.", "keywords": "Lottery Ticket Hypothesis;Sparse Network Optimization;Sparse Training", "primary_area": "", "supplementary_material": "", "author": "AJAY KUMAR JAISWAL;Haoyu Ma;Tianlong Chen;Ying Ding;Zhangyang Wang", "authorids": "~AJAY_KUMAR_JAISWAL1;~Haoyu_Ma1;~Tianlong_Chen1;~Ying_Ding4;~Zhangyang_Wang1", "gender": "M;M;M;F;M", "homepage": "https://ajay1994.github.io/;https://www.ics.uci.edu/~haoyum3/;https://tianlong-chen.github.io;https://yingding.ischool.utexas.edu/;https://vita-group.github.io", "dblp": "30/9707;144/1634;;38/6013-1.html;119/4026", "google_scholar": "I783HxYAAAAJ;8jugwosAAAAJ;LE3ctn0AAAAJ;riuIGwIAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0001-6646-2644;0000-0001-7774-8197;;", "linkedin": ";haoyu-ma-53517915a/;tianlong-chen-783862167/;ying-ding-6a63bb/;", "or_profile": "~AJAY_KUMAR_JAISWAL1;~Haoyu_Ma1;~Tianlong_Chen1;~Ying_Ding4;~Zhangyang_Wang1", "aff": "Amazon;Meta Platforms, Inc;University of Texas, Austin;University of Texas, Austin;University of Texas, Austin", "aff_domain": "amazon.com;fb.com;utexas.edu;utexas.edu;utexas.edu", "position": "Researcher;Intern;PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\njaiswal2022spending,\ntitle={Spending Your Winning Lottery Better After Drawing It},\nauthor={AJAY KUMAR JAISWAL and Haoyu Ma and Tianlong Chen and Ying Ding and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=O4dxuEsIo9S}\n}", "github": "", "project": "", "reviewers": "vJA7;puN9;NPsy;eSrF", "site": "https://openreview.net/forum?id=O4dxuEsIo9S", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "3;5;3;4", "correctness": "2;2;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;0;4", "wc_summary_paper": "64;164;93;46", "wc_summary_review": "50;110;69;50", "wc_main_review": "272;696;842;296", "wc_review": "386;970;1004;392", "wc_reply_reviewers": "0;982;754;0", "wc_reply_authors": "1087;2943;2666;215", "reply_reviewers": "0;2;5;0", "reply_authors": "3;6;5;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], 
"confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.479019945774904 ], "wc_summary_paper_avg": [ 91.75, 44.95761893161158 ], "wc_summary_review_avg": [ 69.75, 24.498724456591614 ], "wc_main_review_avg": [ 526.5, 248.0781126983999 ], "wc_review_avg": [ 688.0, 299.24906014890007 ], "wc_reply_reviewers_avg": [ 434.0, 441.4226999147189 ], "wc_reply_authors_avg": [ 1727.75, 1124.29074865001 ], "reply_reviewers_avg": [ 1.75, 2.0463381929681126 ], "reply_authors_avg": [ 3.75, 1.920286436967152 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.1421338109037403, "corr_recommendation_correctness": 0.994936676326182, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13875180653259023898&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Amazon;Meta;University of Texas at Austin", "aff_unique_dep": "Amazon.com, Inc.;Meta Platforms, Inc;", "aff_unique_url": "https://www.amazon.com;https://www.meta.com;https://www.utexas.edu", "aff_unique_abbr": "Amazon;Meta;UT Austin", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "TAPEX: Table Pre-training via Learning a Neural SQL Executor", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6280", "id": "O50443AsCP", "poster": "", "openreview": "https://openreview.net/forum?id=O50443AsCP", "slides": "https://iclr.cc/virtual/2022/poster/6280", "video": "https://iclr.cc/virtual/2022/poster/6280", "author_site": "Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou", "tldr": "", "abstract": "Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes the improvements on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs and to achieve new state-of-the-art results on various downstream tasks. 
Our code can be found at https://github.com/microsoft/Table-Pretraining.", "keywords": "table pre-training;synthetic pre-training;SQL execution;table-based question answering;table-based fact verification", "primary_area": "", "supplementary_material": "", "author": "Qian Liu;Bei Chen;Jiaqi Guo;Morteza Ziyadi;Zeqi Lin;Weizhu Chen;Jian-Guang Lou", "authorids": "~Qian_Liu2;~Bei_Chen3;~Jiaqi_Guo1;~Morteza_Ziyadi1;~Zeqi_Lin1;~Weizhu_Chen1;~Jian-Guang_Lou1", "gender": "M;F;M;M;M;M;M", "homepage": "http://siviltaram.github.io/;http://ml.cs.tsinghua.edu.cn/~beichen/;;;https://www.microsoft.com/en-us/research/people/zelin/;https://www.microsoft.com/en-us/research/people/wzchen/;https://www.microsoft.com/en-us/research/people/jlou/", "dblp": ";;173/0121;;https://dblp.uni-trier.de/pid/155/4370.html;79/2536;37/1917", "google_scholar": "bcbeUo0AAAAJ;Po65v_MAAAAJ;OCDyes4AAAAJ;;;LG_E-4EAAAAJ;alDxINIAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;morteza-ziyadi-a3818ba9/;;;", "or_profile": "~Qian_Liu2;~Bei_Chen3;~Jiaqi_Guo1;~Morteza_Ziyadi1;~Zeqi_Lin1;~Weizhu_Chen1;~Jian-Guang_Lou1", "aff": "Beihang University;Microsoft;Xi'an Jiaotong University;;Microsoft Research;Microsoft GenAI;Microsoft Research Asia", "aff_domain": "buaa.edu.cn;microsoft.com;xjtu.edu;;microsoft.com;microsoft.com;microsoft.com", "position": "PhD student;Researcher;PhD student;;Researcher;Vice President;Principal Researcher", "bibtex": "@inproceedings{\nliu2022tapex,\ntitle={{TAPEX}: Table Pre-training via Learning a Neural {SQL} Executor},\nauthor={Qian Liu and Bei Chen and Jiaqi Guo and Morteza Ziyadi and Zeqi Lin and Weizhu Chen and Jian-Guang Lou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=O50443AsCP}\n}", "github": "", "project": "", "reviewers": "YHKE;ukoF;Subg;xsFm", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;5;4", "correctness": "1;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "93;258;230;80", "wc_summary_review": "24;34;70;71", "wc_main_review": "109;230;339;293", "wc_review": "226;522;639;444", "wc_reply_reviewers": "127;0;22;34", "wc_reply_authors": "1398;168;356;392", "reply_reviewers": "2;0;1;1", "reply_authors": "5;2;2;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 1.299038105676658 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 165.25, 79.5027515247114 ], "wc_summary_review_avg": [ 49.75, 21.05201890555868 ], "wc_main_review_avg": [ 242.75, 86.37237695004116 ], "wc_review_avg": [ 457.75, 150.72885423833088 ], "wc_reply_reviewers_avg": [ 45.75, 48.468417552051356 ], "wc_reply_authors_avg": [ 578.5, 480.72315317654505 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 1.0, "gs_citation": 260, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1887020545839431374&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=O50443AsCP", "email": "buaa.edu.cn;microsoft.com;xjtu.edu;;microsoft.com;microsoft.com;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "Beihang University;Microsoft;Xi'an Jiao Tong University", "aff_unique_dep": ";Microsoft 
Corporation;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.microsoft.com;https://www.xjtu.edu.cn", "aff_unique_abbr": "BUAA;Microsoft;XJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;1;1;0", "aff_country_unique": "China;United States" }, { "id": "O5Wr-xX0U2y", "title": "Deep Reinforcement Learning for Equal Risk Option Pricing and Hedging under Dynamic Expectile Risk Measures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently equal risk pricing, a framework for fair derivative pricing, was extended to consider coherent risk measures. However, all current implementations either employ a static risk measure or are based on traditional dynamic programming solution schemes that are impracticable in realistic settings: when the number of underlying assets is large or only historical trajectories are available. This paper extends for the first time the deep deterministic policy gradient algorithm to the problem of solving a risk averse Markov decision process that models risk using a time consistent dynamic expectile risk measure. Our numerical experiments, which involve both a simple vanilla option and a more exotic basket option, confirm that the new ACRL algorithm can produce high quality hedging strategies that produce accurate prices in simple settings, and outperform the strategies produced using static risk measures when the risk is evaluated at later points of time.", "keywords": "Deep reinforcement learning;risk averse Markov decision processes;expectile risk measures;derivative pricing", "primary_area": "", "supplementary_material": "/attachment/307f69fbf2d3c1a1528e486ab33f5a1aa9a833ff.zip", "author": "Saeed Marzban;Erick Delage;Jonathan Li", "authorids": "~Saeed_Marzban1;~Erick_Delage2;~Jonathan_Li1", "gender": "M;M;", "homepage": ";http://web.hec.ca/pages/erick.delage/;", "dblp": ";26/1546;85/6906", "google_scholar": ";https://scholar.google.ca/citations?user=ciH2ROgAAAAJ;", "orcid": ";0000-0002-6740-3600;", "linkedin": "saeed-marzban-07891a56/;erick-delage-2105361/?originalSubdomain=ca;", "or_profile": "~Saeed_Marzban1;~Erick_Delage2;~Jonathan_Li1", "aff": "HEC Montreal;Computer Science Department;University of Ottawa", "aff_domain": "hec.ca;cs.stanford.edu;uottawa.ca", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@misc{\nmarzban2022deep,\ntitle={Deep Reinforcement Learning for Equal Risk Option Pricing and Hedging under Dynamic Expectile Risk Measures},\nauthor={Saeed Marzban and Erick Delage and Jonathan Li},\nyear={2022},\nurl={https://openreview.net/forum?id=O5Wr-xX0U2y}\n}", "github": "", "project": "", "reviewers": "2QwB;VoGN;wndf", "site": "https://openreview.net/forum?id=O5Wr-xX0U2y", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;2;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "38;76;41", "wc_summary_review": "12;49;10", "wc_main_review": "305;266;216", "wc_review": "355;391;267", "wc_reply_reviewers": "113;23;0", "wc_reply_authors": "1111;470;376", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 51.666666666666664, 17.249798710580816 ], "wc_summary_review_avg": [ 
23.666666666666668, 17.93197020841702 ], "wc_main_review_avg": [ 262.3333333333333, 36.42648609032841 ], "wc_review_avg": [ 337.6666666666667, 52.085399958998956 ], "wc_reply_reviewers_avg": [ 45.333333333333336, 48.76018412142796 ], "wc_reply_authors_avg": [ 652.3333333333334, 326.58876622171533 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9_Hu6hL3XVoJ:scholar.google.com/&scioq=Deep+Reinforcement+Learning+for+Equal+Risk+Option+Pricing+and+Hedging+under+Dynamic+Expectile+Risk+Measures&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "HEC Montreal;Computer Science Department;University of Ottawa", "aff_unique_dep": ";Computer Science;", "aff_unique_url": "https://www.hec.ca;;https://www.uottawa.ca", "aff_unique_abbr": "HEC;;U Ottawa", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada;" }, { "id": "O9DAoNnYVlM", "title": "Federated Learning via Plurality Vote", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning allows collaborative workers to solve a machine learning problem while preserving data privacy. Recent studies have tackled various challenges in federated learning, but the joint optimization of communication overhead, learning reliability, and deployment efficiency is still an open problem. To this end, we propose a new scheme named federated learning via plurality vote (FedVote). In each communication round of FedVote, workers transmit binary or ternary weights to the server with low communication overhead. The model parameters are aggregated via weighted voting to enhance the resilience against Byzantine attacks. When deployed for inference, the model with binary or ternary weights is resource-friendly to edge devices. 
We show that our proposed method can reduce quantization error and converge faster than methods that directly quantize the model updates.", "keywords": "Federated Learning via Plurality Vote", "primary_area": "", "supplementary_material": "", "author": "Kai Yue;Richeng Jin;Chau-Wai Wong;Huaiyu Dai", "authorids": "~Kai_Yue1;rjin2@ncsu.edu;~Chau-Wai_Wong1;~Huaiyu_Dai1", "gender": "M;;M;", "homepage": "https://kaiyue.netlify.app/;;https://ncsu-wong.org/;", "dblp": ";;24/10474;", "google_scholar": "5uWEGF8AAAAJ;;ggreZvcAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Kai_Yue1;rjin2@ncsu.edu;~Chau-Wai_Wong1;~Huaiyu_Dai1", "aff": "North Carolina State University;;North Carolina State University;", "aff_domain": "ncsu.edu;;ncsu.edu;", "position": "PhD student;;Assistant Professor;", "bibtex": "@misc{\nyue2022federated,\ntitle={Federated Learning via Plurality Vote},\nauthor={Kai Yue and Richeng Jin and Chau-Wai Wong and Huaiyu Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=O9DAoNnYVlM}\n}", "github": "", "project": "", "reviewers": "B8sD;mhMk;Cpmm;LKNM", "site": "https://openreview.net/forum?id=O9DAoNnYVlM", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "32;205;35;157", "wc_summary_review": "38;23;19;83", "wc_main_review": "247;200;304;245", "wc_review": "317;428;358;485", "wc_reply_reviewers": "0;59;181;21", "wc_reply_authors": "673;952;439;623", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.25, 75.68479041392663 ], "wc_summary_review_avg": [ 40.75, 25.40054133281415 ], "wc_main_review_avg": [ 249.0, 36.89850945499018 ], "wc_review_avg": [ 397.0, 64.4709236788182 ], "wc_reply_reviewers_avg": [ 65.25, 70.09413313537732 ], "wc_reply_authors_avg": [ 671.75, 183.7734678891379 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.19245008972987526, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10865358454002224843&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "North Carolina State University", "aff_unique_dep": "", "aff_unique_url": "https://www.ncsu.edu", "aff_unique_abbr": "NCSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "OBwsUF4nFye", "title": "Private Multi-Task Learning: Formulation and Applications to Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many problems in machine learning rely on multi-task learning (MTL), in which the goal is to solve multiple related machine learning tasks simultaneously. MTL is particularly relevant for privacy-sensitive applications in areas such as healthcare, finance, and IoT computing, where sensitive data from multiple, varied sources are shared for the purpose of learning. 
In this work, we formalize notions of task-level privacy for MTL via joint differential privacy (JDP), a relaxation of differential privacy for mechanism design and distributed optimization. We then propose an algorithm for mean-regularized MTL, an objective commonly used for applications in personalized federated learning, subject to JDP. We analyze our objective and solver, providing certifiable guarantees on both privacy and utility. Empirically, we find that our method allows for improved privacy/utility trade-offs relative to global baselines across common federated learning benchmarks.", "keywords": "privacy preserving machine learning;large scale machine learning;multi-task learning", "primary_area": "", "supplementary_material": "/attachment/008deb6f070d163b6bc71cbdc8815f9e506a7f4d.zip", "author": "Shengyuan Hu;Steven Wu;Virginia Smith", "authorids": "~Shengyuan_Hu2;~Steven_Wu1;~Virginia_Smith1", "gender": ";F;M", "homepage": ";;https://zstevenwu.com/", "dblp": "226/6584-1;120/0921;137/8350", "google_scholar": "m_ZHHToAAAAJ;;MbF6rTEAAAAJ", "orcid": ";;", "linkedin": ";;zstevenwu/", "or_profile": "~Shengyuan_Hu2;~Virginia_Smith1;~Zhiwei_Steven_Wu1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nhu2022private,\ntitle={Private Multi-Task Learning: Formulation and Applications to Federated Learning},\nauthor={Shengyuan Hu and Steven Wu and Virginia Smith},\nyear={2022},\nurl={https://openreview.net/forum?id=OBwsUF4nFye}\n}", "github": "", "project": "", "reviewers": "T1p3;fBjV;C2cr", "site": "https://openreview.net/forum?id=OBwsUF4nFye", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "3;4;2", "correctness": "3;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;0", "wc_summary_paper": "61;110;88", "wc_summary_review": "59;63;17", "wc_main_review": "467;289;45", "wc_review": "587;462;150", "wc_reply_reviewers": "268;266;0", "wc_reply_authors": "1706;1880;29", "reply_reviewers": "1;2;0", "reply_authors": "4;5;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 86.33333333333333, 20.038851153585515 ], "wc_summary_review_avg": [ 46.333333333333336, 20.805982045769646 ], "wc_main_review_avg": [ 267.0, 172.9816945999393 ], "wc_review_avg": [ 399.6666666666667, 183.76857426907597 ], "wc_reply_reviewers_avg": [ 178.0, 125.86765536334848 ], "wc_reply_authors_avg": [ 1205.0, 834.586124974529 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.3333333333333335, 1.699673171197595 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": -0.5, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7480179952469202673&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "OCgCYv7KGZe", "title": 
"Auto-Encoding Inverse Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement learning (RL) provides a powerful framework for decision-making, but its application in practice often requires a carefully designed reward function. Inverse Reinforcement Learning (IRL) has shed light on automatic reward acquisition, but it is still difficult to apply IRL to solve real-world tasks. In this work, we propose Auto-Encoding Inverse Reinforcement Learning (AEIRL), a robust and scalable IRL framework, which belongs to the adversarial imitation learning class. To recover reward functions from expert demonstrations, AEIRL utilizes the reconstruction error of an auto-encoder as the learning signal, which provides more information for optimizing policies, compared to the binary logistic loss. Subsequently, we use the derived objective functions to train the reward function and the RL agent. Experiments show that AEIRL performs superior in comparison with state-of-the-art methods in the MuJoCo environments. More importantly, in more realistic settings, AEIRL shows much better robustness when the expert demonstrations are noisy. Specifically, our method achieves $16\\%$ relative improvement compared to the best baseline FAIRL on clean expert data and $38\\%$ relative improvement compared to the best baseline PWIL on noisy expert data both with the metric overall averaged scaled rewards. ", "keywords": "Adversarial Imitation Learning;Inverse Reinforcement Learning;Auto-Encoding", "primary_area": "", "supplementary_material": "/attachment/0eb083796b1ca90cd49cf3f67d95ad3031b2257f.zip", "author": "Kaifeng Zhang;Rui Zhao;Ziming Zhang;Yang Gao", "authorids": "~Kaifeng_Zhang1;~Rui_Zhao1;~Ziming_Zhang4;~Yang_Gao1", "gender": "M;M;M;M", "homepage": "https://alexkfzhang.github.io;https://ruizhaogit.github.io;http://yang-gao.weebly.com;https://zimingzhang.wordpress.com/", "dblp": ";26/2578-11;89/4402-29;", "google_scholar": ";N1yNDnQAAAAJ;https://scholar.google.com/citations?hl=en;2yqx3oIAAAAJ", "orcid": ";;;", "linkedin": ";rui-zhao-profile/;yang-gao-45245348/;", "or_profile": "~Kaifeng_Zhang1;~Rui_Zhao1;~Yang_Gao1;~Ziming_Zhang1", "aff": "Shanghai Qi Zhi Institute;Tencent AI Lab;Tsinghua University;Worcester Polytechnic Institute", "aff_domain": "sqz.ac.cn;tencent.com;tsinghua.edu.cn;wpi.edu", "position": "Researcher;Researcher;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nzhang2022autoencoding,\ntitle={Auto-Encoding Inverse Reinforcement Learning},\nauthor={Kaifeng Zhang and Rui Zhao and Ziming Zhang and Yang Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=OCgCYv7KGZe}\n}", "github": "", "project": "", "reviewers": "Ui5q;4gyw;jqo8;vWWw", "site": "https://openreview.net/forum?id=OCgCYv7KGZe", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "3;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "39;28;75;65", "wc_summary_review": "14;48;149;25", "wc_main_review": "170;150;368;319", "wc_review": "223;226;592;409", "wc_reply_reviewers": "0;0;0;126", "wc_reply_authors": "651;212;619;649", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 51.75, 18.9917745353087 ], "wc_summary_review_avg": [ 59.0, 53.39007398384086 ], 
"wc_main_review_avg": [ 251.75, 93.63860048078463 ], "wc_review_avg": [ 362.5, 152.4180107467618 ], "wc_reply_reviewers_avg": [ 31.5, 54.559600438419636 ], "wc_reply_authors_avg": [ 532.75, 185.61839213827923 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8320502943378437, "corr_recommendation_correctness": 0.8320502943378437, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XqrO5j7nIGcJ:scholar.google.com/&scioq=Auto-Encoding+Inverse+Reinforcement+Learning&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Shanghai Qi Zhi Institute;Tencent;Tsinghua University;Worcester Polytechnic Institute", "aff_unique_dep": ";Tencent AI Lab;;", "aff_unique_url": "https://www.qz.io;https://ai.tencent.com;https://www.tsinghua.edu.cn;https://www.wpi.edu", "aff_unique_abbr": ";Tencent AI Lab;THU;WPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "OD_dnx57ksK", "title": "Momentum Conserving Lagrangian Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Realistic models of physical world rely on differentiable symmetries that, in turn, correspond to conservation laws. Recent works on Lagrangian and Hamiltonian neural networks show that the underlying symmetries of a system can be easily learned by a neural network when provided with an appropriate inductive bias. However, these models still suffer from issues such as inability to generalize to arbitrary system sizes, poor interpretability, and most importantly, inability to learn translational and rotational symmetries, which lead to the conservation laws of linear and angular momentum, respectively. Here, we present a momentum conserving Lagrangian neural network (MCLNN) that learns the Lagrangian of a system, while also preserving the translational and rotational symmetries. We test our approach on linear and non-linear spring systems, and a gravitational system, demonstrating the energy and momentum conservation. We also show that the model developed can generalize to systems of any arbitrary size. 
Finally, we discuss the interpretability of the MCLNN, which directly provides physical insights into the interactions of multi-particle systems.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f31f933ea1a4e56375ef940e16083bb3beda3d86.zip", "author": "Ravinder Bhattoo;Sayan Ranu;N M Anoop Krishnan", "authorids": "~Ravinder_Bhattoo1;~Sayan_Ranu2;~N_M_Anoop_Krishnan1", "gender": "M;M;M", "homepage": "https://ravinderbhattoo.github.io;https://www.cse.iitd.ac.in/~sayan/index.html;", "dblp": ";38/768;", "google_scholar": "lPTdGRMAAAAJ;K4w5qYUAAAAJ;https://scholar.google.co.in/citations?user=fGnjHcEAAAAJ", "orcid": "0000-0003-0323-9108;0000-0003-4147-9372;0000-0003-1500-4947", "linkedin": ";;", "or_profile": "~Ravinder_Bhattoo1;~Sayan_Ranu2;~N_M_Anoop_Krishnan1", "aff": ";Indian Institute of Technology Delhi;Indian Institute of Technology Delhi", "aff_domain": ";iitd.ac.in;iitd.ac.in", "position": ";Associate Professor;Assistant Professor", "bibtex": "@misc{\nbhattoo2022momentum,\ntitle={Momentum Conserving Lagrangian Neural Networks},\nauthor={Ravinder Bhattoo and Sayan Ranu and N M Anoop Krishnan},\nyear={2022},\nurl={https://openreview.net/forum?id=OD_dnx57ksK}\n}", "github": "", "project": "", "reviewers": "Qsuh;1UZJ;Vgjt;N3Bq", "site": "https://openreview.net/forum?id=OD_dnx57ksK", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;5;3", "correctness": "4;4;4;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "107;95;43;60", "wc_summary_review": "34;121;20;8", "wc_main_review": "496;531;93;295", "wc_review": "637;747;156;363", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 76.25, 25.82029240732955 ], "wc_summary_review_avg": [ 45.75, 44.40931771599289 ], "wc_main_review_avg": [ 353.75, 175.42430703867694 ], "wc_review_avg": [ 475.75, 231.58732154416398 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OojG4cgXunMJ:scholar.google.com/&scioq=Momentum+Conserving+Lagrangian+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Indian Institute of Technology Delhi", "aff_unique_dep": "", "aff_unique_url": "https://www.iitd.ac.in", "aff_unique_abbr": "IIT Delhi", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Delhi", "aff_country_unique_index": "0;0", "aff_country_unique": "India" }, { "id": "ODdaICh-7dK", "title": "Neural Latent Traversal with Semantic Constraints", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Whilst Generative Adversarial Networks (GANs) generate visually appealing high-resolution images, the latent representations (or codes) of these models do not allow controllable changes to the semantic attributes of the generated images. Recent approaches propose learning linear models that relate the latent codes to the attributes, enabling attribute adjustment. 
However, as the latent spaces of GANs are learnt in an unsupervised manner and are semantically entangled, the linear models are not always effective. In this study, we learn multi-stage neural transformations of latent spaces of pre-trained GANs that enable more accurate modeling of the relation between the latent codes and the semantic attributes. To ensure identity preservation of images, we propose a sparsity constraint on the latent space transformations that is guided by the mutual information between the latent and the semantic space. We demonstrate our method on two face datasets (FFHQ and CelebA-HQ) and show that it outperforms current state-of-the-art baselines based on FID score and other numerical metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Majid Abdolshah;Hung Le;Thommen Karimpanal George;Vuong Le;Sunil Gupta;Santu Rana;Svetha Venkatesh", "authorids": "~Majid_Abdolshah1;~Hung_Le1;~Thommen_Karimpanal_George1;~Vuong_Le2;~Sunil_Gupta2;~Santu_Rana1;~Svetha_Venkatesh1", "gender": "M;M;M;F;M;M;M", "homepage": "http://majid.website;https://www.thommengk.com/;;https://www.deakin.edu.au/about-deakin/people/svetha-venkatesh;https://thaihungle.github.io/;https://personal-sites.deakin.edu.au/~sunilg/;https://vuongle2.github.io/", "dblp": "190/6649;133/3358;57/6712;81/1984;45/466-2;47/333-1;", "google_scholar": "https://scholar.google.com.au/citations?user=RKC-MCUAAAAJ;v3-hy24AAAAJ;S9PwnMYAAAAJ;AEkRUQcAAAAJ;https://scholar.google.com.au/citations?user=q2HbxngAAAAJ;https://scholar.google.com.au/citations?user=bXeL2t8AAAAJ;WjMGgVIAAAAJ", "orcid": ";0000-0001-8918-3314;0000-0003-2247-850X;;0000-0002-3126-184X;0000-0002-3308-1930;", "linkedin": ";thommen-george-karimpanal-762451149/;santur/;;;;", "or_profile": "~Majid_Abdolshah1;~Thommen_Karimpanal_George1;~Santu_Rana1;~Svetha_Venkatesh1;~Hung_Thai_Le1;~Sunil_Kumar_Gupta1;~Vuong_Le1", "aff": "Amazon;Deakin University;Deakin University;Deakin University;Deakin University;Deakin University;Deakin University", "aff_domain": "amazon.com;deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au", "position": "Machine Learning Scientist;Postdoc;Associate Professor;Full Professor;Lecturer;Associate Professor;Postdoc", "bibtex": "@misc{\nabdolshah2022neural,\ntitle={Neural Latent Traversal with Semantic Constraints},\nauthor={Majid Abdolshah and Hung Le and Thommen Karimpanal George and Vuong Le and Sunil Gupta and Santu Rana and Svetha Venkatesh},\nyear={2022},\nurl={https://openreview.net/forum?id=ODdaICh-7dK}\n}", "github": "", "project": "", "reviewers": "mBUK;bZp5;UYQf;wqfV", "site": "https://openreview.net/forum?id=ODdaICh-7dK", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;5;4;3", "correctness": "2;4;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "83;66;115;95", "wc_summary_review": "66;46;50;47", "wc_main_review": "197;216;318;172", "wc_review": "346;328;483;314", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 89.75, 17.851820635442202 ], "wc_summary_review_avg": [ 52.25, 8.073877630977572 ], "wc_main_review_avg": [ 225.75, 55.49943693408069 ], "wc_review_avg": [ 367.75, 
67.49953703544936 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3244428422615251, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8488022369505425795&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "Amazon;Deakin University", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.deakin.edu.au", "aff_unique_abbr": "Amazon;Deakin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1;1", "aff_country_unique": "United States;Australia" }, { "id": "ODnCiZujily", "title": "DeepSplit: Scalable Verification of Deep Neural Networks via Operator Splitting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Analyzing the worst-case performance of deep neural networks against input perturbations amounts to solving a large-scale non-convex optimization problem, for which several past works have proposed convex relaxations as a promising alternative. However, even for reasonably-sized neural networks, these relaxations are not tractable, and so must be replaced by even weaker relaxations in practice. In this work, we propose a novel operator splitting method that can directly solve a convex relaxation of the problem to high accuracy, by splitting it into smaller sub-problems that often have analytical solutions. The method is modular, scales to very large problem instances, and comprises operations that are amenable to fast parallelization with GPU acceleration. We demonstrate our method in obtaining tighter bounds on the worst-case performance of large convolutional networks in image classification and reinforcement learning settings. 
", "keywords": "neural network verification;operator splitting;ADMM", "primary_area": "", "supplementary_material": "/attachment/a6ac8811904dbe180bd19b97e631879b2afcc2fc.zip", "author": "Shaoru Chen;Eric Wong;J Zico Kolter;Mahyar Fazlyab", "authorids": "~Shaoru_Chen1;~Eric_Wong1;~J_Zico_Kolter1;~Mahyar_Fazlyab1", "gender": "M;M;M;M", "homepage": "https://www.shaoru.site/;http://riceric22.github.io/;https://www.ece.jhu.edu/mahyarfazlyab/;http://www.zicokolter.com", "dblp": "254/9531;64/1811-1.html;147/4846;67/2526", "google_scholar": "PUIfJYcAAAAJ;pWnTMRkAAAAJ;Y3bmjJwAAAAJ;UXh1I6UAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Shaoru_Chen1;~Eric_Wong1;~Mahyar_Fazlyab1;~Zico_Kolter1", "aff": "School of Engineering and Applied Science, University of Pennsylvania;Massachusetts Institute of Technology;Johns Hopkins University;Carnegie Mellon University", "aff_domain": "seas.upenn.edu;mit.edu;jhu.edu;cmu.edu", "position": "PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@misc{\nchen2022deepsplit,\ntitle={DeepSplit: Scalable Verification of Deep Neural Networks via Operator Splitting},\nauthor={Shaoru Chen and Eric Wong and J Zico Kolter and Mahyar Fazlyab},\nyear={2022},\nurl={https://openreview.net/forum?id=ODnCiZujily}\n}", "github": "", "project": "", "reviewers": "aJXo;LjMA;Km1h;ZCXL;JVyx", "site": "https://openreview.net/forum?id=ODnCiZujily", "pdf_size": 0, "recommendation": "3;3;5;5;8", "confidence": "5;4;4;3;5", "correctness": "2;3;3;3;4", "technical_novelty": "2;2;2;2;4", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "75;58;18;62;93", "wc_summary_review": "145;37;30;53;21", "wc_main_review": "847;573;256;187;325", "wc_review": "1067;668;304;302;439", "wc_reply_reviewers": "276;317;0;0;171", "wc_reply_authors": "688;867;317;335;173", "reply_reviewers": "1;1;0;0;1", "reply_authors": "1;2;1;1;1", "recommendation_avg": [ 4.8, 1.8330302779823362 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 61.2, 24.8145118831703 ], "wc_summary_review_avg": [ 57.2, 45.132693250015556 ], "wc_main_review_avg": [ 437.6, 242.65168451918896 ], "wc_review_avg": [ 556.0, 288.2269938780891 ], "wc_reply_reviewers_avg": [ 152.8, 133.54160400414546 ], "wc_reply_authors_avg": [ 476.0, 258.76475803323757 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.1749635530559413, "corr_recommendation_correctness": 0.8625819491779426, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=869144177225734410&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Pennsylvania;Massachusetts Institute of Technology;Johns Hopkins University;Carnegie Mellon University", "aff_unique_dep": "School of Engineering and Applied Science;;;", "aff_unique_url": "https://www.upenn.edu;https://web.mit.edu;https://www.jhu.edu;https://www.cmu.edu", "aff_unique_abbr": "UPenn;MIT;JHU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "OGbbY4qmir5", "title": "Neurally boosted supervised spectral clustering", "track": "main", "status": "Reject", "tldr": "", "abstract": " Network embedding methods compute geometric 
representations of graphs that render various prediction problems amenable to machine learning techniques. Spectral network embeddings are based on the computation of eigenvectors of a normalized graph Laplacian. When coupled with standard classifiers, spectral embeddings yield strong baseline performance in node classification tasks. Remarkably, it has been recently shown that these `base' classifications followed by a simple `Correction and Smooth' procedure reach state-of-the-art performance on widely used benchmarks. All these recent works employ classifiers that are agnostic to the nature of the underlying embedding. We present simple neural models that leverage fundamental geometric properties of spectral embeddings and obtain significantly improved classification accuracy over commonly used standard classifiers. Our results are based on a specific variant of spectral clustering that is not well known, but it is presently the only variant known to have analyzable theoretical properties. We provide a \texttt{PyTorch} implementation of our classifier along with code for the fast computation of spectral embeddings. ", "keywords": "Supervised Node Classification;Spectral Embedding;Social Graphs;Neural Models", "primary_area": "", "supplementary_material": "", "author": "Ali Parviz;Ioannis Koutis", "authorids": "~Ali_Parviz1;~Ioannis_Koutis1", "gender": ";", "homepage": ";https://web.njit.edu/~ikoutis/", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Ali_Parviz1;~Ioannis_Koutis1", "aff": ";New Jersey Institute of Technology", "aff_domain": ";njit.edu", "position": ";Associate Professor", "bibtex": "@misc{\nparviz2022neurally,\ntitle={Neurally boosted supervised spectral clustering},\nauthor={Ali Parviz and Ioannis Koutis},\nyear={2022},\nurl={https://openreview.net/forum?id=OGbbY4qmir5}\n}", "github": "", "project": "", "reviewers": "YsLB;7yDC;GXsL;os5L;L1xu", "site": "https://openreview.net/forum?id=OGbbY4qmir5", "pdf_size": 0, "recommendation": "3;3;5;5;8", "confidence": "4;4;4;3;4", "correctness": "4;3;3;4;4", "technical_novelty": "1;2;2;3;4", "empirical_novelty": "0;2;2;3;4", "wc_summary_paper": "77;61;41;57;34", "wc_summary_review": "17;61;47;36;19", "wc_main_review": "283;625;368;455;162", "wc_review": "377;747;456;548;215", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "169;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;0;0;0;0", "recommendation_avg": [ 4.8, 1.8330302779823362 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 1.019803902718557 ], "empirical_novelty_avg": [ 2.2, 1.32664991614216 ], "wc_summary_paper_avg": [ 54.0, 15.20526224699857 ], "wc_summary_review_avg": [ 36.0, 16.70927885936434 ], "wc_main_review_avg": [ 378.6, 156.66346096011029 ], "wc_review_avg": [ 468.6, 177.01593148640606 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 33.8, 67.6 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.2, 0.4000000000000001 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.05455447255899811, "corr_recommendation_correctness": 0.3563483225498992, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Qhf2F3nj3cgJ:scholar.google.com/&scioq=Neurally+boosted+supervised+spectral+clustering&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "New Jersey Institute of Technology", "aff_unique_dep": "", 
"aff_unique_url": "https://www.njit.edu", "aff_unique_abbr": "NJIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "VAE Approximation Error: ELBO and Exponential Families", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7070", "id": "OIs3SxU5Ynl", "poster": "", "openreview": "https://openreview.net/forum?id=OIs3SxU5Ynl", "slides": "https://iclr.cc/virtual/2022/poster/7070", "video": "https://iclr.cc/virtual/2022/poster/7070", "author_site": "Alexander (Oleksandr) Shekhovtsov, Dmitrij Schlesinger, Boris Flach", "tldr": "", "abstract": "The importance of Variational Autoencoders reaches far beyond standalone generative models -- the approach is also used for learning latent representations and can be generalized to semi-supervised learning. This requires a thorough analysis of their commonly known shortcomings: posterior collapse and approximation errors. This paper analyzes VAE approximation errors caused by the combination of the ELBO objective and encoder models from conditional exponential families, including, but not limited to, commonly used conditionally independent discrete and continuous models.\nWe characterize subclasses of generative models consistent with these encoder families. We show that the ELBO optimizer is pulled away from the likelihood optimizer towards the consistent subset and study this effect experimentally. Importantly, this subset can not be enlarged, and the respective error cannot be decreased, by considering deeper encoder/decoder networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Shekhovtsov;Dmitrij Schlesinger;Boris Flach", "authorids": "~Alexander_Shekhovtsov1;~Dmitrij_Schlesinger1;~Boris_Flach1", "gender": "M;M;M", "homepage": "http://cmp.felk.cvut.cz/~shekhovt/;;http://cmp.felk.cvut.cz/~flachbor/", "dblp": "61/5386;36/3736;04/6820", "google_scholar": "https://scholar.google.cz/citations?hl=en;https://scholar.google.de/citations?hl=de;https://scholar.google.de/citations?user=14xctsUAAAAJ", "orcid": ";;0000-0002-4988-7541", "linkedin": ";;", "or_profile": "~Alexander_Shekhovtsov1;~Dmitrij_Schlesinger1;~Boris_Flach1", "aff": "Czech Technical University in Prague;TU Dresden;Czech Technical University in Prague", "aff_domain": "cvut.cz;tu-dresden.de;cvut.cz", "position": "Assistant Professor;Researcher;Associate Professor", "bibtex": "@inproceedings{\nshekhovtsov2022vae,\ntitle={{VAE} Approximation Error: {ELBO} and Exponential Families},\nauthor={Alexander Shekhovtsov and Dmitrij Schlesinger and Boris Flach},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OIs3SxU5Ynl}\n}", "github": "", "project": "", "reviewers": "8hFF;vibV;GwqU;3pHE", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;3;4", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "80;104;139;21", "wc_summary_review": "18;94;36;36", "wc_main_review": "108;530;563;405", "wc_review": "206;728;738;462", "wc_reply_reviewers": "0;0;80;0", "wc_reply_authors": "0;0;578;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;1;0", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 86.0, 42.99418565341132 ], "wc_summary_review_avg": 
[ 46.0, 28.6705423736629 ], "wc_main_review_avg": [ 401.5, 179.40805444572436 ], "wc_review_avg": [ 533.5, 219.0998630761781 ], "wc_reply_reviewers_avg": [ 20.0, 34.64101615137755 ], "wc_reply_authors_avg": [ 144.5, 250.28134169370276 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7718789458985379761&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=OIs3SxU5Ynl", "email": "cvut.cz;tu-dresden.de;cvut.cz", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Czech Technical University;Technische Universit\u00e4t Dresden", "aff_unique_dep": ";", "aff_unique_url": "https://www.ctu.cz;https://www.tu-dresden.de", "aff_unique_abbr": "CTU;TUD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Czech Republic;Germany" }, { "title": "Convergent and Efficient Deep Q Learning Algorithm", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7160", "id": "OJm3HZuj4r7", "poster": "", "openreview": "https://openreview.net/forum?id=OJm3HZuj4r7", "slides": "https://iclr.cc/virtual/2022/poster/7160", "video": "https://iclr.cc/virtual/2022/poster/7160", "author_site": "Zhikang T Wang, Masahito Ueda", "tldr": "", "abstract": "Despite the empirical success of the deep Q network (DQN) reinforcement learning algorithm and its variants, DQN is still not well understood and it does not guarantee convergence. In this work, we show that DQN can indeed diverge and cease to operate in realistic settings. Although there exist gradient-based convergent methods, we show that they actually have inherent problems in learning dynamics which cause them to fail even for simple tasks. To overcome these problems, we propose a convergent DQN algorithm (C-DQN) that is guaranteed to converge and can work with large discount factors (0.9998). It learns robustly in difficult settings and can learn several difficult games in the Atari 2600 benchmark that DQN fails to solve.", "keywords": "DQN;reinforcement learning;convergence", "primary_area": "", "supplementary_material": "/attachment/7414afa6296afdedebef45a3e500076a47622090.zip", "author": "Zhikang T. Wang;Masahito Ueda", "authorids": "~Zhikang_T._Wang1;~Masahito_Ueda1", "gender": "M;M", "homepage": ";http://cat.phys.s.u-tokyo.ac.jp/index-e.html", "dblp": ";", "google_scholar": "3JdpXLAAAAAJ;https://scholar.google.co.jp/citations?user=Xpjx9CwAAAAJ", "orcid": ";0000-0002-5367-1436", "linkedin": ";", "or_profile": "~Zhikang_T._Wang1;~Masahito_Ueda1", "aff": "The University of Tokyo;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nwang2022convergent,\ntitle={Convergent and Efficient Deep Q Learning Algorithm},\nauthor={Zhikang T. 
Wang and Masahito Ueda},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OJm3HZuj4r7}\n}", "github": "", "project": "", "reviewers": "x6bM;SCA5;bKTr", "pdf_size": 0, "recommendation": "6;6;10", "confidence": "3;4;4", "correctness": "4;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "59;59;48", "wc_summary_review": "10;35;28", "wc_main_review": "501;193;512", "wc_review": "570;287;588", "wc_reply_reviewers": "123;57;170", "wc_reply_authors": "1647;706;629", "reply_reviewers": "1;1;2", "reply_authors": "3;1;2", "recommendation_avg": [ 7.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 55.333333333333336, 5.185449728701348 ], "wc_summary_review_avg": [ 24.333333333333332, 10.530379332620875 ], "wc_main_review_avg": [ 402.0, 147.8535311268103 ], "wc_review_avg": [ 481.6666666666667, 137.8461299984717 ], "wc_reply_reviewers_avg": [ 116.66666666666667, 46.34891824220089 ], "wc_reply_authors_avg": [ 994.0, 462.8095360584812 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11755948594447432202&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=OJm3HZuj4r7", "email": "u-tokyo.ac.jp;u-tokyo.ac.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "OKhFyMVz6t7", "title": "Deconfounding to Explanation Evaluation in Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Explainability of graph neural networks (GNNs) aims to answer ``Why did the GNN make a certain prediction?'', which is crucial for interpreting the model prediction. The feature attribution framework distributes a GNN's prediction to its input features (e.g., edges), identifying an influential subgraph as the explanation. When evaluating the explanation (i.e., subgraph importance), a standard way is to audit the model prediction based solely on the subgraph. However, we argue that a distribution shift exists between the full graph and the subgraph, causing an out-of-distribution (OOD) problem. Furthermore, with an in-depth causal analysis, we find that the OOD effect acts as a confounder, which brings spurious associations between subgraph importance and model prediction, making the evaluation less reliable. In this work, we propose Deconfounded Subgraph Evaluation (DSE), which assesses the causal effect of an explanatory subgraph on the model prediction. While the distribution shift is generally intractable, we employ the front-door adjustment and introduce a surrogate variable of the subgraphs. Specifically, we devise a generative model to generate plausible surrogates that conform to the data distribution, thus approaching an unbiased estimate of subgraph importance. 
Empirical results demonstrate the effectiveness of DSE in terms of explanation fidelity.", "keywords": "Graph Neural Networks;Explanation Evaluation;Out-of-distribution;front-door Adjustment", "primary_area": "", "supplementary_material": "", "author": "Yingxin Wu;Xiang Wang;An Zhang;Xia Hu;Fuli Feng;Xiangnan He;Tat-Seng Chua", "authorids": "~Yingxin_Wu1;~Xiang_Wang6;~An_Zhang2;~Xia_Hu4;~Fuli_Feng1;~Xiangnan_He1;~Tat-Seng_Chua2", "gender": "F;M;M;M;F;M;M", "homepage": "https://cs.stanford.edu/~shirwu;https://github.com/xiangwang1223;https://fulifeng.github.io/;http://staff.ustc.edu.cn/~hexn;https://github.com/anzhang314;http://www.comp.nus.edu.sg/~chuats/;https://cs.rice.edu/~xh37/index.html", "dblp": "79/4173-2;31/2864-10;183/9198;59/1007;78/5581-3;;256/9406.html", "google_scholar": "r2cVEucAAAAJ;https://scholar.google.com.sg/citations?user=HdhaQB0AAAAJ;https://scholar.google.com.sg/citations?user=QePM4u8AAAAJ;https://scholar.google.com.sg/citations?user=X45Go24AAAAJ;https://scholar.google.com.sg/citations?user=BcX7GJcAAAAJ;https://scholar.google.com.tw/citations?user=Z9DWCBEAAAAJ;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": ";0000-0002-6148-6329;0000-0002-5828-9842;0000-0001-8472-7992;;0000-0001-6097-7807;", "linkedin": ";;;;;;", "or_profile": "~Yingxin_Wu1;~Xiang_Wang6;~Fuli_Feng1;~Xiangnan_He1;~AN_ZHANG1;~Tat-seng_Chua1;~Xia_Hu2", "aff": "University of Science and Technology of China;National University of Singapore;University of Science and Technology of China;University of Science and Technology of China;National University of Singapore;National University of Singapore;Rice University", "aff_domain": "ustc.edu.cn;nus.edu.sg;ustc.edu.cn;ustc.edu.cn;nus.edu.sg;nus.edu.sg;rice.edu", "position": "Undergrad student;Postdoc;Full Professor;Professor;Postdoc;Full Professor;Associate Professor", "bibtex": "@misc{\nwu2022deconfounding,\ntitle={Deconfounding to Explanation Evaluation in Graph Neural Networks},\nauthor={Yingxin Wu and Xiang Wang and An Zhang and Xia Hu and Fuli Feng and Xiangnan He and Tat-Seng Chua},\nyear={2022},\nurl={https://openreview.net/forum?id=OKhFyMVz6t7}\n}", "github": "", "project": "", "reviewers": "pL1y;Tnp6;eQDt;eWjy", "site": "https://openreview.net/forum?id=OKhFyMVz6t7", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;4;3", "correctness": "4;3;3;4", "technical_novelty": "4;3;4;4", "empirical_novelty": "3;3;4;4", "wc_summary_paper": "142;85;116;61", "wc_summary_review": "24;23;75;15", "wc_main_review": "477;336;589;82", "wc_review": "643;444;780;158", "wc_reply_reviewers": "0;0;21;0", "wc_reply_authors": "927;655;524;15", "reply_reviewers": "0;0;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 101.0, 30.667572450391308 ], "wc_summary_review_avg": [ 34.25, 23.7841859225831 ], "wc_main_review_avg": [ 371.0, 189.4109289349482 ], "wc_review_avg": [ 506.25, 233.87643639323736 ], "wc_reply_reviewers_avg": [ 5.25, 9.093266739736606 ], "wc_reply_authors_avg": [ 530.25, 331.09468056735676 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 20, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=8793720918000429576&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0;1;1;2", "aff_unique_norm": "University of Science and Technology of China;National University of Singapore;Rice University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.nus.edu.sg;https://www.rice.edu", "aff_unique_abbr": "USTC;NUS;Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;1;2", "aff_country_unique": "China;Singapore;United States" }, { "title": "AEVA: Black-box Backdoor Detection Using Adversarial Extreme Value Analysis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6929", "id": "OM_lYiHXiCL", "poster": "", "openreview": "https://openreview.net/forum?id=OM_lYiHXiCL", "slides": "https://iclr.cc/virtual/2022/poster/6929", "video": "https://iclr.cc/virtual/2022/poster/6929", "author_site": "junfeng guo, Ang Li, Cong Liu", "tldr": "", "abstract": "Deep neural networks (DNNs) are proved to be vulnerable against backdoor attacks. A backdoor could be embedded in the target DNNs through injecting a backdoor trigger into the training examples, which can cause the target DNNs misclassify an input attached with the backdoor trigger. Recent backdoor detection methods often require the access to the original poisoned training data, the parameters of the target DNNs, or the predictive confidence for each given input, which are impractical in many real-world applications, e.g., on-device de-ployed DNNs. We address the black-box hard-label backdoor detection problem where the DNN is a fully black-box and only its final output label is accessible. We approach this problem from the optimization perspective and show that the objective of backdoor detection is bounded by an adversarial objective. Further theoretical and empirical studies reveal that this adversarial objective leads to a solution with highly skewed distribution; a singularity is often observed in the adversarial map of a backdoor-infected example, which we call the adversarial singularity phenomenon. Based on this observation, we propose the adversarial extreme value analysis(AEVA) algorithm to detect backdoors in black-box neural networks. The AEVA algorithm is based on an extreme value analysis on the adversarial map, computed from the monte-carlo gradient estimation due to the black-box hard-label constraint. 
Evidenced by extensive experiments across three popular tasks and backdoor attacks, our approach is shown to be effective in detecting backdoor attacks under black-box hard-label scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junfeng Guo;Ang Li;Cong Liu", "authorids": "~Junfeng_Guo2;~Ang_Li1;~Cong_Liu2", "gender": "M;M;", "homepage": "https://junfenggo.github.io/;https://angli.ai;https://intra.ece.ucr.edu/~cong/", "dblp": ";33/2805-1;https://dblp.uni-trier.de/pers/l/Liu_0005:Cong.html", "google_scholar": "TqblqYcAAAAJ;6bRXWXEAAAAJ;vpc4bggAAAAJ", "orcid": ";;", "linkedin": ";angli-ai;", "or_profile": "~Junfeng_Guo2;~Ang_Li1;~Cong_Liu2", "aff": "University of Texas, Dallas;Baidu Apollo;University of Texas, Dallas", "aff_domain": "utdallas.edu;baidu.com;utdallas.edu", "position": "PhD student;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nguo2022aeva,\ntitle={{AEVA}: Black-box Backdoor Detection Using Adversarial Extreme Value Analysis},\nauthor={Junfeng Guo and Ang Li and Cong Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OM_lYiHXiCL}\n}", "github": "", "project": "", "reviewers": "Zz4A;TYjG;hfE9;9qnh", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "62;124;101;238", "wc_summary_review": "92;26;9;70", "wc_main_review": "95;172;365;685", "wc_review": "249;322;475;993", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 131.25, 65.49570596611659 ], "wc_summary_review_avg": [ 49.25, 33.23683950077083 ], "wc_main_review_avg": [ 329.25, 227.72612388568862 ], "wc_review_avg": [ 509.75, 290.67969915355286 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1218468715415331882&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=OM_lYiHXiCL", "email": "utdallas.edu;baidu.com;utdallas.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Texas at Dallas;Baidu", "aff_unique_dep": ";Apollo", "aff_unique_url": "https://www.utdallas.edu;https://apollo.auto", "aff_unique_abbr": "UT Dallas;Baidu Apollo", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Dallas;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "OMxLn4t03FG", "title": "Training Multi-Layer Over-Parametrized Neural Network in Subquadratic Time", "track": "main", "status": "Reject", "tldr": "", "abstract": "In recent years of development in theoretical machine learning, over-parametrization has been shown to be a powerful tool for resolving many fundamental problems, such as the convergence analysis of deep neural networks. 
While many works have focused on designing various algorithms for over-parametrized networks with one hidden layer, the multiple-hidden-layer framework has received much less attention due to the complexity of the analysis, and even fewer algorithms have been proposed. In this work, we initiate the study of the performance of second-order algorithms on over-parametrized neural networks with multiple hidden layers. We propose a novel algorithm to train such networks in time subquadratic in the width of the neural network. Our algorithm combines the Gram-Gauss-Newton method, tensor-based sketching techniques, and preconditioning.", "keywords": "Deep learning;optimization;over-parametrization", "primary_area": "", "supplementary_material": "", "author": "Zhao Song;Lichen Zhang;Ruizhe Zhang", "authorids": "zsong@adobe.com;~Lichen_Zhang2;~Ruizhe_Zhang2", "gender": ";M;M", "homepage": ";https://lczh.github.io/;", "dblp": ";00/6357-3;133/6407-1", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;", "orcid": ";;", "linkedin": ";;", "or_profile": "zsong@adobe.com;~Lichen_Zhang2;~Ruizhe_Zhang2", "aff": ";Carnegie Mellon University;The University of Texas at Austin", "aff_domain": ";cmu.edu;utexas.edu", "position": ";MS student;PhD student", "bibtex": "@misc{\nsong2022training,\ntitle={Training Multi-Layer Over-Parametrized Neural Network in Subquadratic Time},\nauthor={Zhao Song and Lichen Zhang and Ruizhe Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=OMxLn4t03FG}\n}", "github": "", "project": "", "reviewers": "aph9;djYB;xVuq;91V6", "site": "https://openreview.net/forum?id=OMxLn4t03FG", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;4", "correctness": "2;4;4;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "0;0;1;0", "wc_summary_paper": "40;39;102;52", "wc_summary_review": "35;71;77;14", "wc_main_review": "469;428;277;354", "wc_review": "544;538;456;420", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "679;595;667;555", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 58.25, 25.77183540223707 ], "wc_summary_review_avg": [ 49.25, 25.926579026165406 ], "wc_main_review_avg": [ 382.0, 73.30416086416923 ], "wc_review_avg": [ 489.5, 53.09190145398826 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 624.0, 51.176166327695945 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2524316222224087777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.utexas.edu", "aff_unique_abbr": "CMU;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "ONTz_GFWkFR", "title": "A Sampling-Free Approximation of Gaussian Variational Auto-Encoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a sampling-free 
approximate formulation of Gaussian variational auto-encoders. Instead of computing the loss via stochastic sampling, we propagate the Gaussian distributions from the latent space into the output space. As computing the exact likelihood probability is intractable, we propose to locally approximate the decoder network by its Taylor series. We demonstrate that this approximation allows us to approximate the Gaussian variational auto-encoder training objective in closed form. We evaluate the proposed method on the CelebA, the 3D Chairs, and the MNIST data sets. We find that our sampling-free approximation performs better than its sampling counterpart on the Fr\u00e9chet inception distance and on par with it on the estimated marginal likelihood.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Felix Petersen;Christian Borgelt;Hilde Kuehne;Oliver Deussen", "authorids": "~Felix_Petersen1;~Christian_Borgelt1;~Hilde_Kuehne5;~Oliver_Deussen1", "gender": "Not Specified;M;F;M", "homepage": "http://www.petersen.ai/;https://www.borgelt.net/;https://hildekuehne.github.io;https://graphics.uni-konstanz.de", "dblp": "230/3983;b/ChristianBorgelt.html;45/4963;48/2158", "google_scholar": "v8Kat6YAAAAJ;https://scholar.google.de/citations?user=T50Bxb8AAAAJ;pxhCcH0AAAAJ;https://scholar.google.de/scholar?hl=en", "orcid": ";;0000-0003-1079-4441;0000-0001-5803-2185", "linkedin": ";christian-borgelt-a2429071/;hilde-kuehne-8b9aa661;", "or_profile": "~Felix_Petersen1;~Christian_Borgelt1;~Hilde_Kuehne5;~Oliver_Deussen1", "aff": "University of Konstanz;Paris-Lodron-University of Salzburg;Goethe University Frankfurt;University of Konstanz", "aff_domain": "uni-konstanz.de;sbg.ac.at;uni-frankfurt.de;uni-konstanz.de", "position": "PhD student;Full Professor;Assistant Professor;Full Professor", "bibtex": "@misc{\npetersen2022a,\ntitle={A Sampling-Free Approximation of Gaussian Variational Auto-Encoders},\nauthor={Felix Petersen and Christian Borgelt and Hilde Kuehne and Oliver Deussen},\nyear={2022},\nurl={https://openreview.net/forum?id=ONTz_GFWkFR}\n}", "github": "", "project": "", "reviewers": "smp3;Zwr6;AwHy;SKjV", "site": "https://openreview.net/forum?id=ONTz_GFWkFR", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "5;5;4;3", "correctness": "4;3;4;2", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "50;54;35;31", "wc_summary_review": "40;41;35;74", "wc_main_review": "513;390;115;521", "wc_review": "603;485;185;626", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1478;724;283;659", "reply_reviewers": "0;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 42.5, 9.7082439194738 ], "wc_summary_review_avg": [ 47.5, 15.46770829825802 ], "wc_main_review_avg": [ 384.75, 164.16816835184585 ], "wc_review_avg": [ 474.75, 175.6308273054591 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 786.0, 433.5452686859816 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": -0.8703882797784892, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:v7BNMr7k4BMJ:scholar.google.com/&scioq=A+Sampling-Free+Approximation+of+Gaussian+Variational+Auto-Encoders&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Konstanz;Paris-Lodron-University of Salzburg;Goethe University Frankfurt", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-konstanz.de;https://www.uni-salzburg.at;https://www.uni-frankfurt.de", "aff_unique_abbr": "Uni Konstanz;PLUS;GU Frankfurt", "aff_campus_unique_index": "1", "aff_campus_unique": ";Frankfurt", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Germany;Austria" }, { "id": "OOaY4GZIJ7", "title": "Efficient Semi-Discrete Optimal Transport Using the Maximum Relative Error between Distributions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Semi-Discrete Optimal Transport (SDOT) transforms a continuous distribution to a discrete distribution. However, existing SDOT algorithms for high dimensional distributions have two limitations. 1) It is difficult to evaluate the quality of the transport maps produced by SDOT algorithms, because computing a high-dimensional Wasserstein distance for SDOT is intractable and 2) The transport map cannot guarantee that all target points have the corresponding source points that are mapped to them. To address these limitations, we introduce the Maximum Relative Error (\\texttt{MRE}) between the target distribution and the transported distribution computed by an SDOT map. If the \\texttt{MRE} is smaller than 1, then every target point is guaranteed to have an area in the source distribution that is mapped to it. We propose a statistical method to compute the lower and upper bounds of the \\texttt{MRE} given a confidence threshold and a precision. The gap between the lower bound and the upper bound approaches 0 as the number of samples goes to infinity. We present an efficient Epoch Gradient Descent algorithm for SDOT (SDOT-EGD) that computes the learning rate, number of iterations, and number of epochs in order to guarantee an arbitrarily small \\texttt{MRE} in expectation. Experiments on both low and high-dimensional data show that SDOT-EGD is much faster and converges much better than state-of-the-art SDOT algorithms. 
We also show our method's potential to improve GAN training by avoiding the oscillation caused by randomly changing the association between noise and the real images.", "keywords": "optimal transport", "primary_area": "", "supplementary_material": "", "author": "Huidong Liu;Ke Ma;Lei Zhou;Dimitris Samaras", "authorids": "~Huidong_Liu1;~Ke_Ma3;~Lei_Zhou9;~Dimitris_Samaras3", "gender": "M;M;M;M", "homepage": "https://harryliew.github.io/;;;https://www.cs.stonybrook.edu/~samaras/", "dblp": "174/9885;;;s/DimitrisSamaras", "google_scholar": "https://scholar.google.com/citations?hl=en;ovZamhQAAAAJ;AdsoCBgAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-1373-0294", "linkedin": ";;lei-zhou-800b41143/;", "or_profile": "~Huidong_Liu1;~Ke_Ma3;~Lei_Zhou9;~Dimitris_Samaras3", "aff": "State University of New York, Stony Brook;Snap Inc.;State University of New York, Stony Brook;Stony Brook University", "aff_domain": "stonybrook.edu;snapchat.com;stonybrook.edu;cs.stonybrook.edu", "position": "PhD student;Researcher;PhD student;Full Professor", "bibtex": "@misc{\nliu2022efficient,\ntitle={Efficient Semi-Discrete Optimal Transport Using the Maximum Relative Error between Distributions},\nauthor={Huidong Liu and Ke Ma and Lei Zhou and Dimitris Samaras},\nyear={2022},\nurl={https://openreview.net/forum?id=OOaY4GZIJ7}\n}", "github": "", "project": "", "reviewers": "PuAH;EiDa;Vbry;VHr5", "site": "https://openreview.net/forum?id=OOaY4GZIJ7", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "35;55;85;94", "wc_summary_review": "17;53;73;51", "wc_main_review": "596;596;431;197", "wc_review": "648;704;589;342", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.25, 23.562417108607512 ], "wc_summary_review_avg": [ 48.5, 20.11839953873071 ], "wc_main_review_avg": [ 455.0, 163.479356494941 ], "wc_review_avg": [ 570.75, 138.1871466526464 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:k47Dajx2C2sJ:scholar.google.com/&scioq=Efficient+Semi-Discrete+Optimal+Transport+Using+the+Maximum+Relative+Error+between+Distributions&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "State University of New York;Snap Inc.;Stony Brook University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stonybrook.edu;https://www.snapinc.com;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;Snap;SBU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "OQL_tkK1vqO", "title": "ZARTS: On Zero-order Optimization for Neural Architecture Search", "track": "main", "status": "Reject", "tldr": "", "abstract": "Differentiable architecture search (DARTS) has been a popular one-shot paradigm for NAS due to its high efficiency. 
It introduces trainable architecture parameters to represent the importance of candidate operations and proposes first/second-order approximations to estimate their gradients, making it possible to solve NAS by a gradient descent algorithm. However, our in-depth empirical results show that the approximation will often distort the loss landscape, leading to a biased optimization objective and, in turn, inaccurate gradient estimation for architecture parameters. This work turns to zero-order optimization and proposes a novel NAS scheme, called ZARTS, to search without enforcing the above approximation. Specifically, three representative zero-order optimization methods are introduced: RS, MGS, and GLD, among which MGS performs best by balancing accuracy and speed. Moreover, we explore the connections between RS/MGS and the gradient descent algorithm and show that our ZARTS can be seen as a robust gradient-free counterpart to DARTS. Extensive experiments on multiple datasets and search spaces show the remarkable performance of our method. In particular, results on 12 benchmarks verify the outstanding robustness of ZARTS, where the performance of DARTS collapses due to its known instability issue. Also, we search on the search space of DARTS to compare with peer methods, and our discovered architecture achieves 97.54% accuracy on CIFAR-10 and 75.7% top-1 accuracy on ImageNet, which is state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoxing Wang;Wenxuan Guo;Junchi Yan;Xiaokang Yang;Jianlin Su", "authorids": "~Xiaoxing_Wang1;~Wenxuan_Guo1;~Junchi_Yan2;~Xiaokang_Yang1;~Jianlin_Su1", "gender": "F;M;M;M;M", "homepage": "http://aryaguo.github.io;https://icne.sjtu.edu.cn/info/1064/1078.htm;http://jianlin.su;http://thinklab.sjtu.edu.cn/;https://scholar.google.com/citations?user=n2ewxUIAAAAJ&hl=zh-CN", "dblp": ";06/3071-1.html;223/4243;60/7949.html;78/885", "google_scholar": ";yDEavdMAAAAJ;cdbdaksAAAAJ;ga230VoAAAAJ;n2ewxUIAAAAJ", "orcid": "0000-0001-6336-3819;0000-0003-4029-3322;;0000-0001-9639-7679;0000-0002-7830-9521", "linkedin": ";;;;", "or_profile": "~Wenxuan_Guo1;~Xiaokang_Yang1;~Jianlin_Su1;~Junchi_Yan1;~Victor_Wang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shenzhen Zhuiyi Technology Co., Ltd.;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;wezhuiyi.com;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Full Professor;Engineer;Associate Professor;PhD student", "bibtex": "@misc{\nwang2022zarts,\ntitle={{ZARTS}: On Zero-order Optimization for Neural Architecture Search},\nauthor={Xiaoxing Wang and Wenxuan Guo and Junchi Yan and Xiaokang Yang and Jianlin Su},\nyear={2022},\nurl={https://openreview.net/forum?id=OQL_tkK1vqO}\n}", "github": "", "project": "", "reviewers": "5Kkb;JSTM;rwzN", "site": "https://openreview.net/forum?id=OQL_tkK1vqO", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;4;5", "correctness": "4;3;3", "technical_novelty": "2;4;4", "empirical_novelty": "2;3;0", "wc_summary_paper": "88;54;53", "wc_summary_review": "58;33;23", "wc_main_review": "163;219;362", "wc_review": "309;306;438", "wc_reply_reviewers": "52;0;0", "wc_reply_authors": "311;217;783", "reply_reviewers": "1;0;0", "reply_authors": "1;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], 
"empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 65.0, 16.268579122549905 ], "wc_summary_review_avg": [ 38.0, 14.719601443879744 ], "wc_main_review_avg": [ 248.0, 83.7894185841307 ], "wc_review_avg": [ 351.0, 61.53048025166064 ], "wc_reply_reviewers_avg": [ 17.333333333333332, 24.513035081133648 ], "wc_reply_authors_avg": [ 437.0, 247.6502910692145 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9929324085037187810&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Shenzhen Zhuiyi Technology Co., Ltd.", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;", "aff_unique_abbr": "SJTU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "OQo6Tuyo0ih", "title": "Interpretable Multi-hop Reasoning for Forecasting Future Links on Temporal Knowledge Graphs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Temporal knowledge graphs (KGs) have recently attracted growing attention. The temporal KG forecasting task, which plays a crucial role in applications such as event prediction, is predicting future links based on historical facts. The interpretability of the current temporal KG forecasting models is manifested in providing the reasoning paths. However, the comparison of reasoning paths is operated under the black box. Inspired by the observation that reasoning based on multi-hop paths is equivalent to answering questions step by step, this paper designs an Interpretable Multi-hop Reasoning (IMR) model for temporal KG forecasting. IMR transforms reasoning based on path searching into step-by-step question answering. Moreover, IMR designs three indicators according to the characteristics of temporal KGs and reasoning paths: question matching degree, answer completing level and path confidence. Unlike other models that can only utilize paths with a specified hop, IMR can effectively integrate paths of different hops; IMR can provide the reasoning paths like other interpretable models and further explain the basis for path comparison. 
While being more explainable, IMR has achieved state-of-the-art performance on four benchmark datasets.", "keywords": "Temporal knowledge graphs;Forecasting;Question matching degree;Answer completing level;Path confidence", "primary_area": "", "supplementary_material": "", "author": "Liang Zongwei;Junan Yang;Keju Huang;Hui Liu;Lin Cui;Lingzhi Qu;Xiang Li", "authorids": "~Liang_Zongwei1;yangjunan@ustc.edu.cn;~Keju_Huang1;christ592604@163.com;cuilin17@nudt.edu.cn;qulingzhi@nudt.edu.cn;lix20@nudt.edu.cn", "gender": "M;;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Liang_Zongwei1;yangjunan@ustc.edu.cn;~Keju_Huang1;christ592604@163.com;cuilin17@nudt.edu.cn;qulingzhi@nudt.edu.cn;lix20@nudt.edu.cn", "aff": "National University of Defense Technology;;;;;;", "aff_domain": "nudt.edu.cn;;;;;;", "position": "PhD student;;;;;;", "bibtex": "@misc{\nzongwei2022interpretable,\ntitle={Interpretable Multi-hop Reasoning for Forecasting Future Links on Temporal Knowledge Graphs},\nauthor={Liang Zongwei and Junan Yang and Keju Huang and Hui Liu and Lin Cui and Lingzhi Qu and Xiang Li},\nyear={2022},\nurl={https://openreview.net/forum?id=OQo6Tuyo0ih}\n}", "github": "", "project": "", "reviewers": "fqts;XtAH;zUEs;5a2j;hi7e", "site": "https://openreview.net/forum?id=OQo6Tuyo0ih", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;4;5;4;2", "correctness": "3;2;3;3;2", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "41;183;41;151;140", "wc_summary_review": "7;42;68;155;46", "wc_main_review": "271;230;287;428;160", "wc_review": "319;455;396;734;346", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "762;784;477;795;352", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.8, 0.9797958971132712 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 111.2, 59.03354978315297 ], "wc_summary_review_avg": [ 63.6, 49.70553289121846 ], "wc_main_review_avg": [ 275.2, 88.11220119824496 ], "wc_review_avg": [ 450.0, 149.38139107666657 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 634.0, 183.83579629658638 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.44226898133585163, "corr_recommendation_correctness": -0.06804138174397723, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lspAszTI7xQJ:scholar.google.com/&scioq=Interpretable+Multi-hop+Reasoning+for+Forecasting+Future+Links+on+Temporal+Knowledge+Graphs&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "National University of Defense Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.nudt.edu.cn/", "aff_unique_abbr": "NUDT", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "IFR-Explore: Learning Inter-object Functional Relationships in 3D Indoor Scenes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6870", "id": "OT3mLgR8Wg8", "poster": "", "openreview": "https://openreview.net/forum?id=OT3mLgR8Wg8", "slides": "https://iclr.cc/virtual/2022/poster/6870", "video": "https://iclr.cc/virtual/2022/poster/6870", "author_site": "QI LI, Kaichun Mo, Yanchao Yang, Hang Zhao, Leonidas 
Guibas", "tldr": "", "abstract": "Building embodied intelligent agents that can interact with 3D indoor environments has received increasing research attention in recent years. While most works focus on single-object or agent-object visual functionality and affordances, our work proposes to study a novel, underexplored, kind of visual relations that is also important to perceive and model -- inter-object functional relationships (e.g., a switch on the wall turns on or off the light, a remote control operates the TV). Humans often spend no effort or only a little to infer these relationships, even when entering a new room, by using our strong prior knowledge (e.g., we know that buttons control electrical devices) or using only a few exploratory interactions in cases of uncertainty (e.g., multiple switches and lights in the same room). In this paper, we take the first step in building AI system learning inter-object functional relationships in 3D indoor environments with key technical contributions of modeling prior knowledge by training over large-scale scenes and designing interactive policies for effectively exploring the training scenes and quickly adapting to novel test scenes. We create a new dataset based on the AI2Thor and PartNet datasets and perform extensive experiments that prove the effectiveness of our proposed method.", "keywords": "Inter-object Functional Relationship;Learning Interactive Policy for Exploration;Interactive Perception;3D Scene Understanding", "primary_area": "", "supplementary_material": "", "author": "QI LI;Kaichun Mo;Yanchao Yang;Hang Zhao;Leonidas Guibas", "authorids": "~QI_LI9;~Kaichun_Mo1;~Yanchao_Yang1;~Hang_Zhao1;~Leonidas_Guibas1", "gender": "M;M;M;M;M", "homepage": ";https://cs.stanford.edu/~kaichun/;https://yanchaoyang.github.io/;http://www.mit.edu/~hangzhao/;http://geometry.stanford.edu/", "dblp": ";172/1283;84/8637-1;;g/LeonidasJGuibas", "google_scholar": ";pL7JsOsAAAAJ;r2tKnV4AAAAJ;DmahiOYAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ", "orcid": ";;;;", "linkedin": "li-qi-9b946a197/;;;;", "or_profile": "~QI_LI9;~Kaichun_Mo1;~Yanchao_Yang1;~Hang_Zhao1;~Leonidas_Guibas1", "aff": "Tsinghua University;Stanford University;Stanford University;Tsinghua University;Stanford University", "aff_domain": "tsinghua.edu.cn;stanford.edu;stanford.edu;tsinghua.edu.cn;stanford.edu", "position": "Undergrad student;PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nli2022ifrexplore,\ntitle={{IFR}-Explore: Learning Inter-object Functional Relationships in 3D Indoor Scenes},\nauthor={QI LI and Kaichun Mo and Yanchao Yang and Hang Zhao and Leonidas Guibas},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OT3mLgR8Wg8}\n}", "github": "", "project": "", "reviewers": "sxhc;RFiX;fQs8;TQTu", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;3;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "522;64;81;163", "wc_summary_review": "83;49;52;69", "wc_main_review": "268;204;248;329", "wc_review": "873;317;381;561", "wc_reply_reviewers": "0;32;46;38", "wc_reply_authors": "832;887;1724;1099", "reply_reviewers": "0;1;1;1", "reply_authors": "2;3;4;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 207.5, 
185.39484890363053 ], "wc_summary_review_avg": [ 63.25, 13.718144918318949 ], "wc_main_review_avg": [ 262.25, 44.95761893161158 ], "wc_review_avg": [ 533.0, 215.72204337990127 ], "wc_reply_reviewers_avg": [ 29.0, 17.46424919657298 ], "wc_reply_authors_avg": [ 1135.5, 354.09356108237836 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16187374474741949088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=OT3mLgR8Wg8", "email": "tsinghua.edu.cn;stanford.edu;stanford.edu;tsinghua.edu.cn;stanford.edu", "author_num": 5, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Tsinghua University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.stanford.edu", "aff_unique_abbr": "THU;Stanford", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "A Zest of LIME: Towards Architecture-Independent Model Distances", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6094", "id": "OUz_9TiTv9j", "poster": "", "openreview": "https://openreview.net/forum?id=OUz_9TiTv9j", "slides": "https://iclr.cc/virtual/2022/poster/6094", "video": "https://iclr.cc/virtual/2022/poster/6094", "author_site": "Hengrui Jia, Hongyu Chen, Jonas Guan, Ali Shahin Shamsabadi, Nicolas Papernot", "tldr": "", "abstract": "Definitions of the distance between two machine learning models either characterize the similarity of the models' predictions or of their weights. While similarity of weights is attractive because it implies similarity of predictions in the limit, it suffers from being inapplicable to comparing models with different architectures. On the other hand, the similarity of predictions is broadly applicable but depends heavily on the choice of model inputs during comparison. In this paper, we instead propose to compute the distance between black-box models by comparing their Local Interpretable Model-Agnostic Explanations (LIME). To compare two models, we take a reference dataset and locally approximate the models on each reference point with linear models trained by LIME. We then compute the cosine distance between the concatenated weights of the linear models. This yields an approach that is architecture-independent and possesses the benefits of comparing models in weight space. 
We empirically show that our method, which we call Zest, can be applied to two problems that require measurements of model similarity: detecting model stealing and machine unlearning.", "keywords": "model distance;model stealing;machine unlearning;fairwashing", "primary_area": "", "supplementary_material": "", "author": "Hengrui Jia;Hongyu Chen;Jonas Guan;Ali Shahin Shamsabadi;Nicolas Papernot", "authorids": "~Hengrui_Jia1;hy.chen@mail.utoronto.ca;jonas@cs.toronto.edu;~Ali_Shahin_Shamsabadi1;~Nicolas_Papernot1", "gender": "M;;;M;M", "homepage": "https://nick-jia.github.io/;;;https://alishahin.github.io;https://www.papernot.fr", "dblp": "255/4934;;;198/1244;162/1405", "google_scholar": "g2vBgnoAAAAJ;;;1kVnWYwAAAAJ;cGxq0cMAAAAJ", "orcid": ";;;;", "linkedin": ";;;ali-shahin-shamsabadi-492544259/;nicolaspapernot", "or_profile": "~Hengrui_Jia1;hy.chen@mail.utoronto.ca;jonas@cs.toronto.edu;~Ali_Shahin_Shamsabadi1;~Nicolas_Papernot1", "aff": "University of Toronto;;;Vector;Google", "aff_domain": "utoronto.ca;;;vectorinstitute.ai;google.com", "position": "PhD student;;;Postdoc;Research Scientist", "bibtex": "@inproceedings{\njia2022a,\ntitle={A Zest of {LIME}: Towards Architecture-Independent Model Distances},\nauthor={Hengrui Jia and Hongyu Chen and Jonas Guan and Ali Shahin Shamsabadi and Nicolas Papernot},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OUz_9TiTv9j}\n}", "github": "", "project": "", "reviewers": "MKjr;GFcZ;vdBq;9qy5", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;4;3;3", "correctness": "3;4;3;3", "technical_novelty": "1;2;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "77;49;55;166", "wc_summary_review": "26;54;46;69", "wc_main_review": "217;284;112;492", "wc_review": "320;387;213;727", "wc_reply_reviewers": "0;8;0;0", "wc_reply_authors": "1492;1011;774;1256", "reply_reviewers": "0;1;0;0", "reply_authors": "4;2;1;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 86.75, 46.92747063288197 ], "wc_summary_review_avg": [ 48.75, 15.514106484100203 ], "wc_main_review_avg": [ 276.25, 138.8315075910364 ], "wc_review_avg": [ 411.75, 192.29843343095646 ], "wc_reply_reviewers_avg": [ 2.0, 3.4641016151377544 ], "wc_reply_authors_avg": [ 1133.25, 268.22320462629625 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.08084520834544431, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7636196651006629410&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=OUz_9TiTv9j", "email": "utoronto.ca;;;vectorinstitute.ai;google.com", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Toronto;Vector Institute;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.google.com", "aff_unique_abbr": "U of T;Vector;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "id": "OVShHe8Ce0", "title": "SAU: Smooth activation 
function using convolution with approximate identities", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Well-known activation functions like ReLU or Leaky ReLU are non-differentiable at the origin. Over the years, many smooth approximations of ReLU have been proposed using various smoothing techniques. We propose new smooth approximations of a non-differentiable activation function by convolving it with approximate identities. In particular, we present smooth approximations of Leaky ReLU and show that they outperform several well-known activation functions across various datasets and models. We call this function Smooth Activation Unit (SAU). Replacing ReLU by SAU, we get a 5.12% improvement with the ShuffleNet V2 (2.0x) model on the CIFAR100 dataset.", "keywords": "Deep Learning;Neural Networks;Parametric activation function.", "primary_area": "", "supplementary_material": "", "author": "Koushik Biswas;Sandeep Kumar;Shilpak Banerjee;Ashish Kumar Pandey", "authorids": "~Koushik_Biswas2;~Sandeep_Kumar7;~Shilpak_Banerjee1;~Ashish_Kumar_Pandey1", "gender": "M;M;M;M", "homepage": ";https://sites.google.com/view/shilpakbanerjee/home;;", "dblp": "06/5441-2;274/2014;234/3896.html;274/2151.html", "google_scholar": "nENVPxUAAAAJ;S3NP6lYAAAAJ;XJqGr1UAAAAJ;", "orcid": ";0000-0003-1036-9576;;0000-0002-9818-8966", "linkedin": ";shilpakbanerjee/;;", "or_profile": "~Sandeep_Kumar7;~Shilpak_Banerjee1;~Ashish_Kumar_Pandey1;~Koushik_Biswas3", "aff": "University of Delhi;Indian Institute of Technology Tirupati;Indraprastha Institute of Information Technology, Delhi;Indraprastha Institute of Information Technology, Delhi", "aff_domain": "du.ac.in;iittp.ac.in;iiitd.ac.in;iiitd.ac.in", "position": "Assistant Professor;Assistant Professor;Assistant Professor;PhD student", "bibtex": "@misc{\nbiswas2022sau,\ntitle={{SAU}: Smooth activation function using convolution with approximate identities},\nauthor={Koushik Biswas and Sandeep Kumar and Shilpak Banerjee and Ashish Kumar Pandey},\nyear={2022},\nurl={https://openreview.net/forum?id=OVShHe8Ce0}\n}", "github": "", "project": "", "reviewers": "gQcZ;UwNK;EZXb;VAt9", "site": "https://openreview.net/forum?id=OVShHe8Ce0", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "3;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "75;97;22;102", "wc_summary_review": "17;165;27;31", "wc_main_review": "251;483;73;415", "wc_review": "343;745;122;548", "wc_reply_reviewers": "52;0;0;145", "wc_reply_authors": "285;405;217;376", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 74.0, 31.693847983480957 ], "wc_summary_review_avg": [ 60.0, 60.83584469702052 ], "wc_main_review_avg": [ 305.5, 158.52681161242094 ], "wc_review_avg": [ 439.5, 231.95958699739057 ], "wc_reply_reviewers_avg": [ 49.25, 59.217290549298184 ], "wc_reply_authors_avg": [ 320.75, 74.48615643191692 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=344711880762679300&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, 
"aff_unique_index": "0;1;2;2", "aff_unique_norm": "University of Delhi;Indian Institute of Technology;Indraprastha Institute of Information Technology", "aff_unique_dep": ";;", "aff_unique_url": "http://www.du.ac.in;https://iit Tirupati.ac.in;http://www.iiitd.ac.in", "aff_unique_abbr": "DU;IIT Tirupati;IIIT-D", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Tirupati;Delhi", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "India" }, { "id": "OVV_wIPf1e", "title": "Causal-TGAN: Causally-Aware Synthetic Tabular Data Generative Adversarial Network", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Synthetic tabular data generation has recently gained immense attention due to applications in medicine, finance, and other fields. Generative adversarial networks (GANs) designed initially for image generation have been demonstrated to be promising for generating certain types of tabular data. Tabular data may contain mixed data types such as continuous, ordered, binary, and categorical values. However, the causal relationships between the variables in tabular data have been largely ignored by the prior art. Causality encodes real-world relationships occurring naturally between variables measuring a phenomenon. \n\nIn this work, we propose Causal-TGAN, a data generation architecture that incorporates causal relationships at its core. The flexibility of this architecture is its capability to support different types of expert knowledge (e.g., complete or partial) about the causal nature of the underlying phenomenon. Extensive experimental results on both simulated and real-world datasets demonstrate that Causal-TGAN and its hybrid avatars consistently outperform other baseline GAN models. We also argue that the architecture's flexibility is promising for many practical applications. 
", "keywords": "Tabular Data Generation;Generative Adversarial Networks;Causality", "primary_area": "", "supplementary_material": "", "author": "Bingyang Wen;Yupeng Cao;Fan Yang;Koduvayur Subbalakshmi;Rajarathnam Chandramouli", "authorids": "~Bingyang_Wen1;ycao33@stevens.edu;fyang14@stevens.edu;~Koduvayur_Subbalakshmi1;~Rajarathnam_Chandramouli1", "gender": "M;;;;", "homepage": ";;;https://sites.google.com/stevens.edu/infinitylab;", "dblp": ";;;s/KPSubbalakshmi.html;", "google_scholar": "4wcaCi4AAAAJ;;;SSAaI4AAAAAJ;", "orcid": ";;;0000-0002-1670-9378;", "linkedin": ";;;https://www.linkedin.com/kpsuba;", "or_profile": "~Bingyang_Wen1;ycao33@stevens.edu;fyang14@stevens.edu;~Koduvayur_Subbalakshmi1;~Rajarathnam_Chandramouli1", "aff": "Stevens Institute of Technology;;;Stevens Institute of Technology;Stevens Institute of Technology", "aff_domain": "stevens.edu;;;stevens.edu;stevens.edu", "position": "PhD student;;;Full Professor;", "bibtex": "@misc{\nwen2022causaltgan,\ntitle={Causal-{TGAN}: Causally-Aware Synthetic Tabular Data Generative Adversarial Network},\nauthor={Bingyang Wen and Yupeng Cao and Fan Yang and Koduvayur Subbalakshmi and Rajarathnam Chandramouli},\nyear={2022},\nurl={https://openreview.net/forum?id=OVV_wIPf1e}\n}", "github": "", "project": "", "reviewers": "yNaK;xZG5;uPCM;ALqi", "site": "https://openreview.net/forum?id=OVV_wIPf1e", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;4;4", "correctness": "4;4;3;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "23;48;86;72", "wc_summary_review": "10;37;129;68", "wc_main_review": "408;114;344;248", "wc_review": "441;199;559;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 57.25, 23.99348870006194 ], "wc_summary_review_avg": [ 61.0, 44.30011286667337 ], "wc_main_review_avg": [ 278.5, 110.73730175509967 ], "wc_review_avg": [ 396.75, 129.86988680983748 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7068160632756419492&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stevens Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.stevens.edu", "aff_unique_abbr": "SIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Reward Uncertainty for Exploration in Preference-based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6956", "id": "OWZVD-l-ZrC", "poster": "", "openreview": "https://openreview.net/forum?id=OWZVD-l-ZrC", "slides": "https://iclr.cc/virtual/2022/poster/6956", "video": "https://iclr.cc/virtual/2022/poster/6956", "author_site": "Xinran Liang, Katherine Shu, Kimin Lee, Pieter Abbeel", "tldr": "", "abstract": "Conveying complex objectives to reinforcement learning (RL) agents often requires meticulous reward 
engineering. Preference-based RL methods are able to learn a more flexible reward model based on human preferences by actively incorporating human feedback, i.e., a teacher's preferences between two clips of behavior. However, poor feedback-efficiency still remains a problem in current preference-based RL algorithms, as tailored human feedback is very expensive. To handle this issue, previous methods have mainly focused on improving query selection and policy initialization. At the same time, recent exploration methods have proven to be a recipe for improving sample-efficiency in RL. We present an exploration method specifically for preference-based RL algorithms. Our main idea is to design an intrinsic reward by measuring the novelty based on the learned reward. Specifically, we utilize disagreement across an ensemble of learned reward models. Our intuition is that disagreement in the learned reward models reflects uncertainty in tailored human feedback and could be useful for exploration. Our experiments show that reward uncertainty exploration improves both feedback- and sample-efficiency of preference-based RL algorithms on complex robot manipulation tasks from Meta-World benchmarks, compared with other existing exploration methods that measure the novelty of state visitation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinran Liang;Katherine Shu;Kimin Lee;Pieter Abbeel", "authorids": "~Xinran_Liang1;~Katherine_Shu1;~Kimin_Lee1;~Pieter_Abbeel2", "gender": "F;F;M;M", "homepage": "https://xinranliang.github.io/xinranliang/;;https://sites.google.com/view/kiminlee;https://people.eecs.berkeley.edu/~pabbeel/", "dblp": ";;183/6849;", "google_scholar": "jAbh0hcAAAAJ;;92M8xv4AAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ", "orcid": ";;;", "linkedin": ";katherine-shu/;;", "or_profile": "~Xinran_Liang1;~Katherine_Shu1;~Kimin_Lee1;~Pieter_Abbeel2", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Covariant", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;covariant.ai", "position": "Undergrad student;Undergrad student;Postdoc;Founder", "bibtex": "@inproceedings{\nliang2022reward,\ntitle={Reward Uncertainty for Exploration in Preference-based Reinforcement Learning},\nauthor={Xinran Liang and Katherine Shu and Kimin Lee and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OWZVD-l-ZrC}\n}", "github": "", "project": "", "reviewers": "x6bk;v2Tv;WzMo;VUcZ", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;4;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "135;31;54;45", "wc_summary_review": "30;55;30;69", "wc_main_review": "112;471;441;297", "wc_review": "277;557;525;411", "wc_reply_reviewers": "47;68;156;165", "wc_reply_authors": "822;935;1179;1031", "reply_reviewers": "1;1;1;1", "reply_authors": "3;3;4;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 66.25, 40.53008142108772 ], "wc_summary_review_avg": [ 46.0, 16.748134224444225 ], "wc_main_review_avg": [ 330.25, 142.13967602326946 ], "wc_review_avg": [ 442.5, 109.88516733390362 ], "wc_reply_reviewers_avg": [ 109.0, 52.12964607591346 ], 
"wc_reply_authors_avg": [ 991.75, 130.9949903622272 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.5, 0.5 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9905998768496237199&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=OWZVD-l-ZrC", "email": "berkeley.edu;berkeley.edu;berkeley.edu;covariant.ai", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Covariant", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "Topological Experience Replay", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6567", "id": "OXRZeMmOI7a", "poster": "", "openreview": "https://openreview.net/forum?id=OXRZeMmOI7a", "slides": "https://iclr.cc/virtual/2022/poster/6567", "video": "https://iclr.cc/virtual/2022/poster/6567", "author_site": "Zhang-Wei Hong, Tao Chen, Yen-Chen Lin, Joni Pajarinen, Pulkit Agrawal", "tldr": "", "abstract": "State-of-the-art deep Q-learning methods update Q-values using state transition tuples sampled from the experience replay buffer. This strategy often randomly samples or prioritizes data sampling based on measures such as the temporal difference (TD) error. Such sampling strategies can be inefficient at learning Q-function since a state's correct Q-value preconditions on the accurate successor states' Q-value. Disregarding such a successor's value dependency leads to useless updates and even learning wrong values.\nTo expedite Q-learning, we maintain states' dependency by organizing the agent's experience into a graph. Each edge in the graph represents a transition between two connected states. We perform value backups via a breadth-first search that expands vertices in the graph starting from the set of terminal states successively moving backward. We empirically show that our method is substantially more data-efficient than several baselines on a diverse range of goal-reaching tasks. Notably, the proposed method also outperforms baselines that consume more batches of training experience. 
", "keywords": "Deep reinforcement learning;experience replay", "primary_area": "", "supplementary_material": "/attachment/2b0f7b6044a4284d62d45f86e1df224be5056232.zip", "author": "Zhang-Wei Hong;Tao Chen;Yen-Chen Lin;Joni Pajarinen;Pulkit Agrawal", "authorids": "~Zhang-Wei_Hong1;~Tao_Chen1;~Yen-Chen_Lin1;~Joni_Pajarinen2;~Pulkit_Agrawal1", "gender": "M;M;M;;M", "homepage": ";https://taochenshh.github.io;http://yenchenlin.me/;;https://people.eecs.berkeley.edu/~pulkitag/", "dblp": "198/0600;;180/0954;23/8355;149/2672", "google_scholar": "GZkyN4cAAAAJ;gdUv1PIAAAAJ;RbCKRPcAAAAJ;https://scholar.google.fi/citations?user=-2fJStwAAAAJ;UpZmJI0AAAAJ", "orcid": ";;;0000-0003-4469-8191;", "linkedin": ";;;;", "or_profile": "~Zhang-Wei_Hong1;~Tao_Chen1;~Yen-Chen_Lin1;~Joni_Pajarinen2;~Pulkit_Agrawal1", "aff": "Microsoft Research;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Technische Universit\u00e4t Darmstadt;Massachusetts Institute of Technology", "aff_domain": "research.microsoft.com;mit.edu;mit.edu;tu-darmstadt.de;mit.edu", "position": "Internship;PhD student;PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nhong2022topological,\ntitle={Topological Experience Replay},\nauthor={Zhang-Wei Hong and Tao Chen and Yen-Chen Lin and Joni Pajarinen and Pulkit Agrawal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OXRZeMmOI7a}\n}", "github": "", "project": "", "reviewers": "JmAq;t7F3;ANXD;mMYf", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;5;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;4;2", "wc_summary_paper": "72;44;121;115", "wc_summary_review": "140;16;123;52", "wc_main_review": "708;168;231;580", "wc_review": "920;228;475;747", "wc_reply_reviewers": "1189;129;0;0", "wc_reply_authors": "3251;855;524;1026", "reply_reviewers": "4;2;0;0", "reply_authors": "8;2;1;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 88.0, 31.662280397975127 ], "wc_summary_review_avg": [ 82.75, 50.74137857804023 ], "wc_main_review_avg": [ 421.75, 227.9017057856303 ], "wc_review_avg": [ 592.5, 263.5303587824371 ], "wc_reply_reviewers_avg": [ 329.5, 499.01928820437394 ], "wc_reply_authors_avg": [ 1414.0, 1075.8361864150136 ], "reply_reviewers_avg": [ 1.5, 1.6583123951777 ], "reply_authors_avg": [ 3.25, 2.7726341266023544 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4282470605992998057&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=OXRZeMmOI7a", "email": "research.microsoft.com;mit.edu;mit.edu;tu-darmstadt.de;mit.edu", "author_num": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Microsoft;Massachusetts Institute of Technology;Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "Microsoft Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://web.mit.edu;https://www.tu-darmstadt.de", "aff_unique_abbr": "MSR;MIT;TUD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United 
States;Germany" }, { "title": "Mention Memory: incorporating textual knowledge into Transformers through entity mention attention", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6941", "id": "OY1A8ejQgEX", "poster": "", "openreview": "https://openreview.net/forum?id=OY1A8ejQgEX", "slides": "https://iclr.cc/virtual/2022/poster/6941", "video": "https://iclr.cc/virtual/2022/poster/6941", "author_site": "Michiel de Jong, Yury Zemlyanskiy, Nicholas FitzGerald, Fei Sha, William Cohen", "tldr": "", "abstract": "Natural language understanding tasks such as open-domain question answering often require retrieving and assimilating factual information from multiple sources. We propose to address this problem by integrating a semi-parametric representation of a large text corpus into a Transformer model as a source of factual knowledge. \nSpecifically, our method represents knowledge with ``mention memory'', a table of dense vector representations of every entity mention in a corpus. The proposed model - TOME - is a Transformer that accesses the information through internal memory layers in which each entity mention in the input passage attends to the mention memory. This approach enables synthesis of and reasoning over many disparate sources of information within a single Transformer model. \nIn experiments using a memory of 150 million Wikipedia mentions, TOME achieves strong performance on several open-domain knowledge-intensive tasks, including the claim verification benchmarks HoVer and FEVER and several entity-based QA benchmarks. We also show that the model learns to attend to informative mentions without any direct supervision. Finally we demonstrate that the model can generalize to new unseen entities by updating the memory without retraining.", "keywords": "NLP;Entities and Relations;Memory", "primary_area": "", "supplementary_material": "/attachment/bf8f041e36cd9db3d801c0cfd721f1524ad3aec6.zip", "author": "Michiel de Jong;Yury Zemlyanskiy;Nicholas FitzGerald;Fei Sha;William W. Cohen", "authorids": "~Michiel_de_Jong1;~Yury_Zemlyanskiy1;~Nicholas_FitzGerald1;~Fei_Sha3;~William_W._Cohen2", "gender": "M;M;M;M;M", "homepage": ";https://urikz.github.io/;;https://wwcohen.github.io/;http://feisha.org", "dblp": "223/0153;225/5302;85/9686.html;c/WWCohen.html;13/3601", "google_scholar": "R7wXId8AAAAJ;fkkxyJUAAAAJ;fLcACE8AAAAJ;8ys-38kAAAAJ;HDHOS0QAAAAJ", "orcid": ";;;;", "linkedin": ";yury-zemlyanskiy/;;;", "or_profile": "~Michiel_de_Jong1;~Yury_Zemlyanskiy1;~Nicholas_FitzGerald1;~William_W._Cohen2;~Fei_Sha2", "aff": "University of Southern California;University of Southern California;Google;Google DeepMind;Google", "aff_domain": "usc.edu;usc.edu;google.com;google.com;google.com", "position": "PhD student;PhD student;Research Scientist;Principle Scientist;research scientist", "bibtex": "@inproceedings{\njong2022mention,\ntitle={Mention Memory: incorporating textual knowledge into Transformers through entity mention attention},\nauthor={Michiel de Jong and Yury Zemlyanskiy and Nicholas FitzGerald and Fei Sha and William W. 
Cohen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OY1A8ejQgEX}\n}", "github": "", "project": "", "reviewers": "giXh;ShxU;nJu8;djR9", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "113;90;306;87", "wc_summary_review": "55;21;45;33", "wc_main_review": "91;256;379;167", "wc_review": "259;367;730;287", "wc_reply_reviewers": "0;181;128;52", "wc_reply_authors": "281;878;648;47", "reply_reviewers": "0;2;1;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 149.0, 91.20032894677519 ], "wc_summary_review_avg": [ 38.5, 12.757350822173073 ], "wc_main_review_avg": [ 223.25, 107.22027560121266 ], "wc_review_avg": [ 410.75, 188.5316617971634 ], "wc_reply_reviewers_avg": [ 90.25, 69.40596155950871 ], "wc_reply_authors_avg": [ 463.5, 321.1810237233825 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=162154523875231297&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=OY1A8ejQgEX", "email": "usc.edu;usc.edu;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;1;1;1", "aff_unique_norm": "University of Southern California;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.usc.edu;https://www.google.com", "aff_unique_abbr": "USC;Google", "aff_campus_unique_index": "0;0;1;1", "aff_campus_unique": "Los Angeles;Mountain View;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "OZ_2rF2D4Nw", "title": "Kokoyi: Executable LaTeX for End-to-end Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite substantial efforts from the deep learning system community to relieve researchers and practitioners from the burden of implementing models with ever-growing complexity, a considerable lingual gap remains between developing models in the language of mathematics and implementing them in the languages of computers. The mission of Kokoyi is to close this gap by enabling automatic translation of mathematics into efficient implementations, thereby making math-in-code and math-in-model consistent. This paper presents our first step towards this goal: kokoyi-lang, a programming language with the syntax of LaTeX and the semantics of deep learning mathematics, and a prototype kokoyi-lang compiler and runtime supporting advanced optimizations such as auto-batching. 
Kokoyi is integrated with Jupyter Notebook and will be released as open source.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/98b3ee2415c0d0d459510548096831baa5395996.zip", "author": "Minjie Wang;Haoming Lu;Yu Gai;Lesheng Jin;Zihao Ye;Zheng Zhang", "authorids": "~Minjie_Wang2;~Haoming_Lu1;~Yu_Gai1;~Lesheng_Jin1;~Zihao_Ye1;zhaz@amazon.com", "gender": ";;;M;M;", "homepage": ";;https://www.linkedin.com/in/yu-g-0061641b7/;;https://homes.cs.washington.edu/~zhye/;", "dblp": "58/10312;;209/9750;268/1143;126/0507-1;", "google_scholar": "OJja8NgAAAAJ;2-93gvIAAAAJ;;4TpD01UAAAAJ;L5JfNCoAAAAJ;", "orcid": "0009-0009-8156-1179;;0009-0002-8921-6731;;;", "linkedin": ";;yu-g-0061641b7/;lesheng-jin-9618b0201/?originalSubdomain=cn;;", "or_profile": "~Minjie_Wang2;~Haoming_Lu1;~Yu_Gai1;~Lesheng_Jin1;~Zihao_Ye1;zhaz@amazon.com", "aff": "Amazon;Picsart AI Research;University of California, Berkeley;University of California, San Diego;University of Washington;", "aff_domain": "amazon.com;picsart.com;berkeley.edu;ucsd.edu;cs.washington.edu;", "position": "Senior Applied Scientist;Applied Scientist;PhD student;MS student;PhD student;", "bibtex": "@misc{\nwang2022kokoyi,\ntitle={Kokoyi: Executable LaTeX for End-to-end Deep Learning},\nauthor={Minjie Wang and Haoming Lu and Yu Gai and Lesheng Jin and Zihao Ye and Zheng Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=OZ_2rF2D4Nw}\n}", "github": "", "project": "", "reviewers": "8Mk2;aMmp;cySy", "site": "https://openreview.net/forum?id=OZ_2rF2D4Nw", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;4", "correctness": "4;3;4", "technical_novelty": "3;2;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "67;70;36", "wc_summary_review": "33;30;13", "wc_main_review": "285;341;156", "wc_review": "385;441;205", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "255;678;69", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 57.666666666666664, 15.369522511198006 ], "wc_summary_review_avg": [ 25.333333333333332, 8.806563209081938 ], "wc_main_review_avg": [ 260.6666666666667, 77.46110134914312 ], "wc_review_avg": [ 343.6666666666667, 100.68211801948193 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 334.0, 254.82150615676065 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GojWTIBaeeIJ:scholar.google.com/&scioq=Kokoyi:+Executable+LaTeX+for+End-to-end+Deep+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Amazon;Picsart;University of California, Berkeley;University of California, San Diego;University of Washington", "aff_unique_dep": "Amazon.com, Inc.;AI Research;;;", "aff_unique_url": "https://www.amazon.com;https://research.picsart.com;https://www.berkeley.edu;https://www.ucsd.edu;https://www.washington.edu", "aff_unique_abbr": "Amazon;Picsart AI;UC Berkeley;UCSD;UW", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;San Diego", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "O_OJoU4_yj", "title": "Stabilized Self-training with Negative Sampling on Few-labeled Graph Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) are designed for semi-supervised node classification on graphs where only a small subset of nodes have class labels. However, under extreme cases when very few labels are available (e.g., 1 labeled node per class), GNNs suffer from severe result quality degradation. \nSpecifically, we observe that existing GNNs suffer from unstable training process on few-labeled graph data, resulting to inferior performance on node classification. Therefore, we propose an effective framework, Stabilized self-training with Negative sampling (SN), which is applicable to existing GNNs to stabilize the training process and enhance the training data, and consequently, boost classification accuracy on graphs with few labeled data. In experiments, we apply our SN framework to two existing GNN base models (GCN and DAGNN) to get SNGCN and SNDAGNN, and evaluate the two methods against 13 existing solutions over 4 benchmarking datasets. Extensive experiments show that the proposed SN framework is highly effective compared with existing solutions, especially under settings with very few labeled data. In particular, on a benchmark dataset Cora with only 1 labeled node per class, while GCN only has 44.6% accuracy, SNGCN achieves 62.5% accuracy, improving GCN by 17.9%; SNDAGNN has accuracy 66.4%, improving that of the base model DAGNN (59.8%) by 6.6%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziang Zhou;Jieming Shi;Shengzhong Zhang;Zengfeng Huang;Qing Li", "authorids": "~Ziang_Zhou2;~Jieming_Shi1;~Shengzhong_Zhang1;~Zengfeng_Huang1;~Qing_Li5", "gender": "M;Not Specified;M;M;M", "homepage": "https://scottjiao.github.io/;https://www4.comp.polyu.edu.hk/~jiemshi/;https://szzhang17.github.io/;https://zengfenghuang.github.io/;https://www4.comp.polyu.edu.hk/~csqli/", "dblp": "250/2528;147/1237-1.html;255/8703;97/9726;(2024-11-14-1812689)", "google_scholar": "Xgzk5qEAAAAJ;;bWD48lgAAAAJ;https://scholar.google.com.hk/citations?user=FwNBuXUAAAAJ;https://scholar.google.co.in/citations?user=D1LEg-YAAAAJ", "orcid": ";0000-0002-0465-1551;0000-0003-1783-6835;0000-0003-2671-7483;0000-0003-3370-471X", "linkedin": ";;;;", "or_profile": "~Ziang_Zhou2;~Jieming_Shi1;~Shengzhong_Zhang1;~Zengfeng_Huang1;~Qing_Li5", "aff": "Hong Kong Polytechnic University;The Hong Kong Polytechnic University;Fudan University;Fudan University;Hong Kong Polytechnic University", "aff_domain": "connect.polyu.hk;polyu.edu.hk;fudan.edu.cn;fudan.edu.cn;polyu.edu.hk", "position": "PhD student;Assistant Professor;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nzhou2022stabilized,\ntitle={Stabilized Self-training with Negative Sampling on Few-labeled Graph Data},\nauthor={Ziang Zhou and Jieming Shi and Shengzhong Zhang and Zengfeng Huang and Qing Li},\nyear={2022},\nurl={https://openreview.net/forum?id=O_OJoU4_yj}\n}", "github": "", "project": "", "reviewers": "s9Vo;DW1F;XgQF;wfd3;S1e4", "site": "https://openreview.net/forum?id=O_OJoU4_yj", "pdf_size": 0, "recommendation": "1;3;3;5;5", "confidence": "4;4;5;4;4", "correctness": "2;2;2;3;3", "technical_novelty": "1;2;1;3;2", "empirical_novelty": "0;0;1;3;0", "wc_summary_paper": "54;48;47;38;72", "wc_summary_review": "59;41;28;52;39", "wc_main_review": "175;195;184;384;392", "wc_review": 
"288;284;259;474;503", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.4, 1.4966629547095767 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 1.8, 0.7483314773547883 ], "empirical_novelty_avg": [ 0.8, 1.1661903789690602 ], "wc_summary_paper_avg": [ 51.8, 11.320777358467925 ], "wc_summary_review_avg": [ 43.8, 10.759182125050211 ], "wc_main_review_avg": [ 266.0, 99.84588123703452 ], "wc_review_avg": [ 361.6, 104.49229636676571 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.13363062095621217, "corr_recommendation_correctness": 0.8728715609439696, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:N0ziIn-TkX8J:scholar.google.com/&scioq=Stabilized+Self-training+with+Negative+Sampling+on+Few-labeled+Graph+Data&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Hong Kong Polytechnic University;Fudan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.polyu.edu.hk;https://www.fudan.edu.cn", "aff_unique_abbr": "PolyU;Fudan", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Loss Curvature Perspective on Training Instabilities of Deep Learning Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6462", "id": "OcKMT-36vUs", "poster": "", "openreview": "https://openreview.net/forum?id=OcKMT-36vUs", "slides": "https://iclr.cc/virtual/2022/poster/6462", "video": "https://iclr.cc/virtual/2022/poster/6462", "author_site": "Justin Gilmer, Behrooz Ghorbani, Ankush Garg, Sneha Kudugunta, Behnam Neyshabur, David Cardoze, George Dahl, Zachary Nado, Orhan Firat", "tldr": "", "abstract": "In this work, we study the evolution of the loss Hessian across many classification tasks in order to understand the effect the curvature of the loss has on the training dynamics. Whereas prior work has focused on how different learning rates affect the loss Hessian observed during training, we also analyze the effects of model initialization, architectural choices, and common training heuristics such as gradient clipping and learning rate warmup. Our results demonstrate that successful model and hyperparameter choices allow the early optimization trajectory to either avoid---or navigate out of---regions of high curvature and into flatter regions that tolerate a higher learning rate. Our results suggest a unifying perspective on how disparate mitigation strategies for training instability ultimately address the same underlying failure mode of neural network optimization, namely poor conditioning. 
Inspired by the conditioning perspective, we show that learning rate warmup can improve training stability just as much as batch normalization, layer normalization, MetaInit, GradInit, and Fixup initialization.", "keywords": "Optimization;Deep Learning;Training Instability;Curvature;Loss Landscape;Hessian", "primary_area": "", "supplementary_material": "", "author": "Justin Gilmer;Behrooz Ghorbani;Ankush Garg;Sneha Kudugunta;Behnam Neyshabur;David Cardoze;George Edward Dahl;Zachary Nado;Orhan Firat", "authorids": "~Justin_Gilmer1;~Behrooz_Ghorbani1;~Ankush_Garg1;~Sneha_Kudugunta1;~Behnam_Neyshabur1;dcardoze@google.com;~George_Edward_Dahl1;~Zachary_Nado1;~Orhan_Firat1", "gender": "M;;M;F;M;;M;M;M", "homepage": ";;;;https://www.neyshabur.net;;https://www.cs.toronto.edu/~gdahl;http://zna.do;", "dblp": ";162/0166;86/7221;;131/9898;;10/7998;228/7785;120/2225", "google_scholar": "Ml_vQ8MAAAAJ;;https://scholar.google.com/citations?hl=en;LeEwxtgAAAAJ;e1ucbCYAAAAJ;;ghbWy-0AAAAJ;tazGc34AAAAJ;https://scholar.google.com.tr/citations?user=dLaR9lgAAAAJ", "orcid": ";;;;;;;;", "linkedin": ";;agbgarg/;;;;;;", "or_profile": "~Justin_Gilmer1;~Behrooz_Ghorbani1;~Ankush_Garg1;~Sneha_Kudugunta1;~Behnam_Neyshabur1;dcardoze@google.com;~George_Edward_Dahl1;~Zachary_Nado1;~Orhan_Firat1", "aff": "Google Brain;Google;Google;Google DeepMind;Google;;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com;;google.com;google.com;google.com", "position": "Researcher;Researcher;research engineer;Researcher;Research Scientist;;Research Scientist;Research Engineer;Research Scientist", "bibtex": "@inproceedings{\ngilmer2022a,\ntitle={A Loss Curvature Perspective on Training Instabilities of Deep Learning Models},\nauthor={Justin Gilmer and Behrooz Ghorbani and Ankush Garg and Sneha Kudugunta and Behnam Neyshabur and David Cardoze and George Edward Dahl and Zachary Nado and Orhan Firat},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OcKMT-36vUs}\n}", "github": "", "project": "", "reviewers": "oiwU;P6at;vdgP;RGH4", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;3;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;2;1", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "167;23;117;177", "wc_summary_review": "28;44;23;26", "wc_main_review": "403;262;152;232", "wc_review": "598;329;292;435", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1130;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "2;0;0;0", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 121.0, 60.975404877704584 ], "wc_summary_review_avg": [ 30.25, 8.13557004763649 ], "wc_main_review_avg": [ 262.25, 90.66524968255479 ], "wc_review_avg": [ 413.5, 118.74868420323654 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 282.5, 489.30435313820783 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.8660254037844386 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=20901287272284784&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=OcKMT-36vUs", 
"email": "google.com;google.com;google.com;google.com;google.com;;google.com;google.com;google.com", "author_num": 9, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "OcvjQ3yqgTG", "title": "ImpressLearn: Continual Learning via Combined Task Impressions", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work proposes a new method to sequentially train a deep neural network on multiple tasks without suffering catastrophic forgetting, while endowing it with the capability to quickly adapt to unknown tasks. Starting from existing work on network masking (Wortsman et al., 2020), we show that a simple to learn linear combination of a small number of task-specific masks (\u201dimpressions\u201d) ona randomly initialized backbone network is sufficient to both retain accuracy on previously learned tasks, as well as achieve high accuracy on new tasks.\n\nIn contrast to previous methods, we do not require to generate dedicated masks or contexts for each new task, instead leveraging transfer learning to keep per-task parameter overhead negligible. Our work illustrates the power of linearly combining individual impressions, each of which fares poorly in isolation, to achieve performance comparable to a dedicated mask. Moreover, even repeated impressions from the same task (homogeneous masks), when combined can approach the performance of heterogeneous combinations if sufficiently many impressions are used.\n\nOur approach scales more efficiently than existing methods, requiring orders of magnitude fewer parameters and can function without modification even when task identity is missing. In addition, in the setting where task labels are not given at inference, our algorithm gives an often favorable alternative to the entropy based task-inference methods proposed in (Wortsman et al., 2020). 
We evaluate our method on a number of well-known image classification datasets and architectures.", "keywords": "Catastrophic forgetting;Continual learning;Neural networks;Masking", "primary_area": "", "supplementary_material": "/attachment/6284a6811f7d52e2ac635d9d415434389ee596a0.zip", "author": "Dhrupad Bhardwaj;Julia Kempe;Artem M Vysogorets;Angela Teng;Evaristus Ezekwem", "authorids": "~Dhrupad_Bhardwaj1;~Julia_Kempe1;~Artem_M_Vysogorets1;~Angela_Teng1;~Evaristus_Ezekwem1", "gender": "M;;M;F;M", "homepage": ";;https://artem.vysogorets.org;;", "dblp": "330/4474.html;;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": "dhrupadb/;;avysogorets/;angelavteng/;https://linkedin.com/in/evaezekwem", "or_profile": "~Dhrupad_Bhardwaj1;~Julia_Kempe1;~Artem_M_Vysogorets1;~Angela_Teng1;~Evaristus_Ezekwem1", "aff": "New York University;;Bloomberg;;", "aff_domain": "nyu.edu;;bloomberg.com;;", "position": "MS student;;Intern;;", "bibtex": "@misc{\nbhardwaj2022impresslearn,\ntitle={ImpressLearn: Continual Learning via Combined Task Impressions},\nauthor={Dhrupad Bhardwaj and Julia Kempe and Artem M Vysogorets and Angela Teng and Evaristus Ezekwem},\nyear={2022},\nurl={https://openreview.net/forum?id=OcvjQ3yqgTG}\n}", "github": "", "project": "", "reviewers": "7LBU;Tj8S;Xchu;MxHs", "site": "https://openreview.net/forum?id=OcvjQ3yqgTG", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "82;103;57;79", "wc_summary_review": "254;19;65;26", "wc_main_review": "87;362;276;254", "wc_review": "423;484;398;359", "wc_reply_reviewers": "0;57;73;0", "wc_reply_authors": "199;167;150;108", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 80.25, 16.29992331270304 ], "wc_summary_review_avg": [ 91.0, 95.72617196984324 ], "wc_main_review_avg": [ 244.75, 99.61770675939093 ], "wc_review_avg": [ 416.0, 45.403744338985966 ], "wc_reply_reviewers_avg": [ 32.5, 32.98863440641337 ], "wc_reply_authors_avg": [ 156.0, 32.8252951243397 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cG3_XrbXgI4J:scholar.google.com/&scioq=ImpressLearn:+Continual+Learning+via+Combined+Task+Impressions&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "New York University;Bloomberg", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://www.bloomberg.com", "aff_unique_abbr": "NYU;Bloomberg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "OdTx-22f6H", "title": "Utilizing Attention, Linked Blocks, And Pyramid Pooling To Propel Brain Tumor Segmentation In 3D", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present an approach to detect and segment tumorous regions of the brain by establishing three varied segmentation architectures for multiclass semantic segmentation along with data-specific customizations like residual blocks, soft attention 
mechanism, pyramid pooling, linked architecture, and 3D compatibility to work with 3D brain MRI images. The proposed segmentation architectures, namely Attention Residual UNET 3D (also referred to as AR-UNET 3D), LinkNet 3D, and PSPNet 3D, segment the MRI images and succeed in isolating three classes of tumors. By assigning pixel probabilities, each of these models differentiates between pixels belonging to tumorous and non-tumorous regions of the brain. By experimenting with each of the three architectures on the BraTS2020 dataset and observing their performance using metrics like Dice loss and Dice score, we successfully establish quality results.", "keywords": "Computer Vision;Deep Learning;3D Semantic Segmentation;Medical Imaging", "primary_area": "", "supplementary_material": "/attachment/b9f42371a7f64091277032d082120bd0455197d0.zip", "author": "Pooja Ravi;Srijarko Roy;Indira Dutta", "authorids": "~Pooja_Ravi1;~Srijarko_Roy1;~Indira_Dutta1", "gender": "F;M;F", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "pooja-ravi-9b88861b2/;srijarko-roy-9193751b0/;indira-dutta-775445197", "or_profile": "~Pooja_Ravi1;~Srijarko_Roy1;~Indira_Dutta1", "aff": "SRM Institute of Science and Technology;SRM Institute of Science and Technology;SRM Institute of Science and Technology", "aff_domain": "srmist.edu.in;srmist.edu.in;srmist.edu.in", "position": "Undergrad student;Undergrad student;Undergrad student", "bibtex": "@misc{\nravi2022utilizing,\ntitle={Utilizing Attention, Linked Blocks, And Pyramid Pooling To Propel Brain Tumor Segmentation In 3D},\nauthor={Pooja Ravi and Srijarko Roy and Indira Dutta},\nyear={2022},\nurl={https://openreview.net/forum?id=OdTx-22f6H}\n}", "github": "", "project": "", "reviewers": "5vzk;iJki;bHSX;oQWR;PhdB", "site": "https://openreview.net/forum?id=OdTx-22f6H", "pdf_size": 0, "recommendation": "1;1;1;3;3", "confidence": "5;5;5;3;5", "correctness": "3;1;1;3;1", "technical_novelty": "1;1;1;1;1", "empirical_novelty": "1;1;1;1;1", "wc_summary_paper": "22;42;92;34;59", "wc_summary_review": "14;46;20;21;22", "wc_main_review": "43;568;546;239;141", "wc_review": "79;656;658;294;222", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 1.8, 0.9797958971132713 ], "confidence_avg": [ 4.6, 0.7999999999999999 ], "correctness_avg": [ 1.8, 0.9797958971132713 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.0, 0.0 ], "wc_summary_paper_avg": [ 49.8, 24.284974778656863 ], "wc_summary_review_avg": [ 24.6, 11.056219968868202 ], "wc_main_review_avg": [ 307.4, 213.12775511415683 ], "wc_review_avg": [ 381.8, 235.1190336829411 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6123724356957945, "corr_recommendation_correctness": 0.16666666666666669, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BaFKNCVGFX8J:scholar.google.com/&scioq=Utilizing+Attention,+Linked+Blocks,+And+Pyramid+Pooling+To+Propel+Brain+Tumor+Segmentation+In+3D&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "SRM Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.srmist.edu.in", "aff_unique_abbr": "SRMIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0", "aff_country_unique": "India" }, { "id": "OdnNBNIdFul", "title": "A Closer Look at Loss Weighting in Multi-Task Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multi-Task Learning (MTL) has achieved great success in various fields, however, how to balance different tasks to avoid negative effects is still a key problem. To achieve the task balancing, there exist many works to balance task losses or gradients. In this paper, we unify eight representative task balancing methods from the perspective of loss weighting and provide a consistent experimental comparison. Moreover, we surprisingly find that training a MTL model with random weights sampled from a distribution can achieve comparable performance over state-of-the-art baselines. Based on this finding, we propose a simple yet effective weighting strategy called Random Loss Weighting (RLW), which can be implemented in only one additional line of code over existing works. Theoretically, we analyze the convergence of RLW and reveal that RLW has a higher probability to escape local minima than existing models with fixed task weights, resulting in a better generalization ability. Empirically, we extensively evaluate the proposed RLW method on six image datasets and four multilingual tasks from the XTREME benchmark to show the effectiveness of the proposed RLW strategy when compared with state-of-the-art strategies.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/7bde3daaf10a008a6ea962acf2f81b925ab52175.zip", "author": "Baijiong Lin;Feiyang Ye;Yu Zhang", "authorids": "~Baijiong_Lin1;~Feiyang_Ye4;~Yu_Zhang3", "gender": "M;M;M", "homepage": "https://baijiong-lin.github.io/;https://feiyang-ye.github.io/;http://cse.sustech.edu.cn/faculty/~zhangy/", "dblp": "279/2950;285/4704;50/671-6", "google_scholar": "KVdbYTYAAAAJ;3EX25cAAAAAJ;https://scholar.google.com.hk/citations?user=jaRS5w4AAAAJ", "orcid": "0000-0002-4257-0226;;", "linkedin": ";;", "or_profile": "~Baijiong_Lin1;~Feiyang_Ye4;~Yu_Zhang3", "aff": "Southern University of Science and Technology;A*STAR;Southern University of Science and Technology", "aff_domain": "mail.sustech.edu.cn;cfar.a-star.edu.sg;sustc.edu.cn", "position": "Research Assistant;Intern;Associate Professor", "bibtex": "@misc{\nlin2022a,\ntitle={A Closer Look at Loss Weighting in Multi-Task Learning},\nauthor={Baijiong Lin and Feiyang Ye and Yu Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=OdnNBNIdFul}\n}", "github": "", "project": "", "reviewers": "TUYA;wMzV;ony7;Dpi7", "site": "https://openreview.net/forum?id=OdnNBNIdFul", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;3;3;3", "correctness": "3;3;4;2", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "72;132;54;56", "wc_summary_review": "61;106;37;53", "wc_main_review": "362;397;150;198", "wc_review": "495;635;241;307", "wc_reply_reviewers": "418;0;0;80", "wc_reply_authors": "671;732;416;455", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.5, 31.666228067138025 ], "wc_summary_review_avg": [ 64.25, 25.606395685453272 ], "wc_main_review_avg": [ 276.75, 104.8746275321157 ], "wc_review_avg": [ 419.5, 155.45015278217002 ], "wc_reply_reviewers_avg": [ 124.5, 
172.5709998812083 ], "wc_reply_authors_avg": [ 568.5, 135.44094654128787 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5668178311210785407&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Southern University of Science and Technology;Agency for Science, Technology and Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.sustech.edu.cn;https://www.a-star.edu.sg", "aff_unique_abbr": "SUSTech;A*STAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Singapore" }, { "id": "Odu6pOBshzQ", "title": "Sublinear Least-Squares Value Iteration via Locality Sensitive Hashing", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present the first provable Least-Squares Value Iteration (LSVI) algorithm that achieves runtime complexity sublinear in the number of actions. We formulate the value function estimation procedure in value iteration as an approximate maximum inner product search problem and propose a locality sensitive hashing (LSH) type data structure to solve this problem with sublinear time complexity. Moreover, we build the connections between the theory of approximate maximum inner product search and the regret analysis of reinforcement learning. We prove that, with our choice of approximation factor, our Sublinear LSVI algorithms maintain the same regret as the original LSVI algorithms while reducing the runtime complexity to sublinear in the number of actions. To the best of our knowledge, this is the first work that combines LSH with reinforcement learning that resulting in provable improvements. 
We hope that our novel way of combining data structures and iterative algorithms will open the door for further study into cost reduction in optimization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhaozhuo Xu;Zhao Song;Anshumali Shrivastava", "authorids": "~Zhaozhuo_Xu1;zsong@adobe.com;~Anshumali_Shrivastava1", "gender": "M;;M", "homepage": "https://ottovonxu.github.io/;;https://www.cs.rice.edu/~as143/", "dblp": "195/4352;;63/9828", "google_scholar": "7tDlVAsAAAAJ;;https://scholar.google.com.tw/citations?user=SGT23RAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhaozhuo_Xu1;zsong@adobe.com;~Anshumali_Shrivastava1", "aff": "Rice University;;ThirdAI Corp.", "aff_domain": "rice.edu;;thirdai.com", "position": "PhD student;;CEO", "bibtex": "@misc{\nxu2022sublinear,\ntitle={Sublinear Least-Squares Value Iteration via Locality Sensitive Hashing},\nauthor={Zhaozhuo Xu and Zhao Song and Anshumali Shrivastava},\nyear={2022},\nurl={https://openreview.net/forum?id=Odu6pOBshzQ}\n}", "github": "", "project": "", "reviewers": "96D8;nc5a;RUh2;tbJU", "site": "https://openreview.net/forum?id=Odu6pOBshzQ", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;2;3", "correctness": "4;3;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "0;0;2;3", "wc_summary_paper": "224;57;70;149", "wc_summary_review": "49;107;48;20", "wc_main_review": "283;807;140;245", "wc_review": "556;971;258;414", "wc_reply_reviewers": "54;361;0;0", "wc_reply_authors": "255;221;112;12", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 125.0, 67.1304699819687 ], "wc_summary_review_avg": [ 56.0, 31.662280397975127 ], "wc_main_review_avg": [ 368.75, 258.38766901692503 ], "wc_review_avg": [ 549.75, 265.0644968682151 ], "wc_reply_reviewers_avg": [ 103.75, 150.15054944954414 ], "wc_reply_authors_avg": [ 150.0, 95.59550198623364 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.48420012470625223, "corr_recommendation_correctness": -0.9271726499455306, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8908136420360263312&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Rice University;ThirdAI Corp.", "aff_unique_dep": ";", "aff_unique_url": "https://www.rice.edu;", "aff_unique_abbr": "Rice;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "OgCcfc1m0TO", "title": "Learning to Prompt for Vision-Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Vision-language pre-training has recently emerged as a promising alternative for representation learning. It shifts from the tradition of using images and discrete labels for learning a fixed set of weights, seen as visual concepts, to aligning images and raw text for two separate encoders. Such a paradigm benefits from a broader source of supervision and allows zero-shot transfer to downstream tasks since visual concepts can be directly generated from natural language, known as a prompt. 
In this paper, we identify that a major challenge of deploying such models in practice is prompt engineering. This is because designing a proper prompt, especially for context words surrounding a class name, requires domain expertise and typically takes a significant amount of time for word tuning, since a slight change in wording could have a huge impact on performance. Moreover, different downstream tasks require specific designs, further hampering the efficiency of deployment. To overcome this challenge, we propose a novel approach named \\emph{context optimization (CoOp)}. The main idea is to model context in prompts using continuous representations and perform end-to-end learning from data while keeping the pre-trained parameters fixed. In this way, the design of task-relevant prompts can be fully automated. Experiments on 11 datasets show that CoOp effectively turns pre-trained vision-language models into data-efficient visual learners, requiring as few as one or two shots to beat hand-crafted prompts by a decent margin and gaining significant improvements when using more shots (e.g., at 16 shots the average gain is around 17\\% with the highest reaching over 50\\%). CoOp also exhibits strong robustness to distribution shift.", "keywords": "vision-language models;prompt learning;computer vision;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Kaiyang Zhou;Jingkang Yang;Chen Change Loy;Ziwei Liu", "authorids": "~Kaiyang_Zhou1;~Jingkang_Yang1;~Chen_Change_Loy2;~Ziwei_Liu1", "gender": "M;M;M;M", "homepage": "https://kaiyangzhou.github.io/;https://jingkang50.github.io/;https://www.mmlab-ntu.com/person/ccloy/index.html;https://liuziwei7.github.io/", "dblp": "203/3155;175/5365.html;01/5855;05/6300-2", "google_scholar": "https://scholar.google.co.uk/citations?user=gRIejugAAAAJ;S-YjbUYAAAAJ;https://scholar.google.co.uk/citations?user=559LF80AAAAJ;https://scholar.google.com.hk/citations?user=lc45xlcAAAAJ", "orcid": ";;0000-0001-5345-1591;", "linkedin": ";;;", "or_profile": "~Kaiyang_Zhou1;~Jingkang_Yang1;~Chen_Change_Loy2;~Ziwei_Liu1", "aff": ";Nanyang Technological University;Nanyang Technological University;Nanyang Technological University", "aff_domain": ";ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": ";PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nzhou2022learning,\ntitle={Learning to Prompt for Vision-Language Models},\nauthor={Kaiyang Zhou and Jingkang Yang and Chen Change Loy and Ziwei Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=OgCcfc1m0TO}\n}", "github": "", "project": "", "reviewers": "KMdr;AYDN;1WJt;pGwp", "site": "https://openreview.net/forum?id=OgCcfc1m0TO", "pdf_size": 0, "recommendation": "1;5;5;6", "confidence": "5;3;4;5", "correctness": "3;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "24;97;63;83", "wc_summary_review": "34;94;19;49", "wc_main_review": "239;341;219;73", "wc_review": "297;532;301;205", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "884;410;430;228", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.25, 1.920286436967152 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 66.75, 27.48067502809929 ], "wc_summary_review_avg": [ 49.0, 28.062430400804562 ], "wc_main_review_avg": [ 218.0, 95.65040512198576 ], "wc_review_avg": [ 333.75, 
120.72981197699266 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 488.0, 241.7974358838406 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3532809023904868, "corr_recommendation_correctness": 0.22549380840084865, "gs_citation": 2991, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16117828918544644907&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Contextualized Scene Imagination for Generative Commonsense Reasoning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6866", "id": "Oh1r2wApbPv", "poster": "", "openreview": "https://openreview.net/forum?id=Oh1r2wApbPv", "slides": "https://iclr.cc/virtual/2022/poster/6866", "video": "https://iclr.cc/virtual/2022/poster/6866", "author_site": "PeiFeng Wang, Jonathan Zamora, Junfeng Liu, Filip Ilievski, Muhao Chen, Xiang Ren", "tldr": "", "abstract": "Humans use natural language to compose common concepts from their environment into plausible, day-to-day scene descriptions. However, such generative commonsense reasoning (GCSR) skills are lacking in state-of-the-art text generation methods. Descriptive sentences about arbitrary concepts generated by neural text generation models (e.g., pre-trained text-to-text Transformers) are often grammatically fluent but may not correspond to human common sense, largely due to their lack of mechanisms to capture concept relations, to identify implicit concepts, and to perform generalizable reasoning about unseen concept compositions. In this paper, we propose an Imagine-and-Verbalize (I\\&V) method, which learns to imagine a relational scene knowledge graph (SKG) with relations between the input concepts, and leverage the SKG as a constraint when generating a plausible scene description. We collect and harmonize a set of knowledge resources from different domains and modalities, providing a rich auxiliary supervision signal for I\\&V. 
The experiments demonstrate the effectiveness of I\\&V in improving language models on both concept-to-sentence and concept-to-story generation tasks, while enabling the model to learn well from fewer task examples and generate SKGs that make common sense to human annotators.", "keywords": "Commonsense reasoning;constrained text generation;knowledge representation", "primary_area": "", "supplementary_material": "", "author": "PeiFeng Wang;Jonathan Zamora;Junfeng Liu;Filip Ilievski;Muhao Chen;Xiang Ren", "authorids": "~PeiFeng_Wang1;jzamoraa@ucsd.edu;liujunfe@usc.edu;~Filip_Ilievski1;~Muhao_Chen1;~Xiang_Ren1", "gender": "M;;;M;M;M", "homepage": ";;;http://www.ilievski.info;https://muhaochen.github.io/;https://shanzhenren.github.io/", "dblp": "264/4849;;;167/4770;173/2608;36/360-1", "google_scholar": "3jfQnM4AAAAJ;;;4ZScBc0AAAAJ;k79yEZkAAAAJ;_moJlrIAAAAJ", "orcid": ";;;;0000-0003-0118-3147;", "linkedin": ";;;;;xren7", "or_profile": "~PeiFeng_Wang1;jzamoraa@ucsd.edu;liujunfe@usc.edu;~Filip_Ilievski1;~Muhao_Chen1;~Xiang_Ren1", "aff": "University of Southern California;;;USC/ISI;University of Southern California;University of Southern California", "aff_domain": "usc.edu;;;isi.edu;usc.edu;usc.edu", "position": "PhD student;;;Computer Scientist;Assistant Research Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2022contextualized,\ntitle={Contextualized Scene Imagination for Generative Commonsense Reasoning},\nauthor={PeiFeng Wang and Jonathan Zamora and Junfeng Liu and Filip Ilievski and Muhao Chen and Xiang Ren},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Oh1r2wApbPv}\n}", "github": "", "project": "", "reviewers": "9SQy;6Z6c;GjaV;y2yo", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "4;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "94;62;143;177", "wc_summary_review": "64;57;46;96", "wc_main_review": "237;481;488;358", "wc_review": "395;600;677;631", "wc_reply_reviewers": "0;0;43;227", "wc_reply_authors": "993;2792;996;1195", "reply_reviewers": "0;0;1;1", "reply_authors": "3;6;2;4", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 119.0, 44.19841626121914 ], "wc_summary_review_avg": [ 65.75, 18.606114586339622 ], "wc_main_review_avg": [ 391.0, 102.85183518051586 ], "wc_review_avg": [ 575.75, 107.89201777703484 ], "wc_reply_reviewers_avg": [ 67.5, 93.74566656651389 ], "wc_reply_authors_avg": [ 1494.0, 753.8584084561238 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.75, 1.479019945774904 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6593478295513742090&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Oh1r2wApbPv", "email": "usc.edu;;;isi.edu;usc.edu;usc.edu", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Los Angeles;ISI", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" 
}, { "id": "OhmG-MzmC2v", "title": "User-Entity Differential Privacy in Learning Natural Language Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we introduce a novel concept of user-entity differential privacy (UeDP) to provide formal privacy protection simultaneously to both sensitive entities in textual data and data owners in learning natural language models. To preserve UeDP, we developed a novel algorithm, called UeDP-Alg, optimizing the trade-off between privacy loss and model utility with a tight sensitivity bound derived from seamlessly combining sensitive and non-sensitive textual data together. An extensive theoretical analysis and evaluation show that our UeDP-Alg outperforms baseline approaches in terms of model utility under the same privacy budget consumption on several NLM tasks, using benchmark datasets.", "keywords": "differential privacy;natural language models", "primary_area": "", "supplementary_material": "/attachment/5761499306b74fb5c02d6f1c43b6f2034f99f77c.zip", "author": "Phung Lai;Tong Sun;Rajiv Jain;Franck Dernoncourt;Jiuxiang Gu;Nikolaos Barmpalios;Han Hu;Hai Phan", "authorids": "~Phung_Lai1;~Tong_Sun1;~Rajiv_Jain1;~Franck_Dernoncourt1;~Jiuxiang_Gu2;~Nikolaos_Barmpalios1;hh255@njit.edu;~Hai_Phan1", "gender": "F;F;M;;M;M;;Not Specified", "homepage": "https://www.linkedin.com/in/phunglai/;https://research.adobe.com/person/tong-sun/;;http://francky.me;http://gujiuxiang.com;;;https://sites.google.com/site/ihaiphan/", "dblp": ";;;132/4043;173/4935.html;;;153/5204", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com/;kz2aIc8AAAAJ;https://scholar.google.com.sg/citations?user=zPxKV9EAAAAJ;Yp4dul4AAAAJ;;nsEbWjAAAAAJ", "orcid": ";;;0000-0002-1119-1346;;;;", "linkedin": ";tong-sun/?trk=hb_tab_pro_top;;franckdernoncourt;;;;", "or_profile": "~Phung_Lai1;~Tong_Sun1;~Rajiv_Jain1;~Franck_Dernoncourt1;~Jiuxiang_Gu2;~Nikolaos_Barmpalios1;hh255@njit.edu;~Hai_Phan1", "aff": "New Jersey Institute of Technology;Adobe Systems;Adobe Systems;Adobe Systems;Adobe Systems;Adobe Systems;;New Jersey Institute of Technology", "aff_domain": "njit.edu;adobe.com;adobe.com;adobe.com;adobe.com;adobe.com;;njit.edu", "position": "PhD student;Director, Document Intelligence Lab;Senior Research Scientist;Researcher;Researcher;Senior Machine Learning Scientist;;Assistant Professor", "bibtex": "@misc{\nlai2022userentity,\ntitle={User-Entity Differential Privacy in Learning Natural Language Models},\nauthor={Phung Lai and Tong Sun and Rajiv Jain and Franck Dernoncourt and Jiuxiang Gu and Nikolaos Barmpalios and Han Hu and Hai Phan},\nyear={2022},\nurl={https://openreview.net/forum?id=OhmG-MzmC2v}\n}", "github": "", "project": "", "reviewers": "DXWM;WJP4;o9aw", "site": "https://openreview.net/forum?id=OhmG-MzmC2v", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;4;3", "correctness": "3;4;2", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "31;39;57", "wc_summary_review": "28;281;44", "wc_main_review": "444;65;244", "wc_review": "503;385;345", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 
42.333333333333336, 10.873004286866726 ], "wc_summary_review_avg": [ 117.66666666666667, 115.67867372837378 ], "wc_main_review_avg": [ 251.0, 154.80525400213867 ], "wc_review_avg": [ 411.0, 67.07210050883054 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17972045274765292198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;1;1;1;1;0", "aff_unique_norm": "New Jersey Institute of Technology;Adobe", "aff_unique_dep": ";Adobe Systems Incorporated", "aff_unique_url": "https://www.njit.edu;https://www.adobe.com", "aff_unique_abbr": "NJIT;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "OhytAdNSzO-", "title": "An Investigation on Hardware-Aware Vision Transformer Scaling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Vision Transformer (ViT) has demonstrated promising performance in various computer vision tasks, and has recently attracted a lot of research attention. Many recent works have focused on proposing new architectures to improve ViT and deploying it into real-world applications. However, little effort has been made to analyze and understand ViT\u2019s architecture design space and the implications of its hardware cost on different devices. In this work, by simply scaling ViT's depth, width, input size, and other basic configurations, we show that a scaled vanilla ViT model without bells and whistles can achieve an accuracy-efficiency trade-off comparable or superior to most of the latest ViT variants. Specifically, compared to DeiT-Tiny, our scaled model achieves a $\\uparrow1.9\\%$ higher ImageNet top-1 accuracy under the same FLOPs and a $\\uparrow3.7\\%$ better ImageNet top-1 accuracy under the same latency on an NVIDIA Edge GPU TX2. Motivated by this, we further investigate the extracted scaling strategies from the following two aspects: (1) "can these scaling strategies be transferred across different real hardware devices?"; and (2) "can these scaling strategies be transferred to different ViT variants and tasks?". For (1), our exploration, based on various devices with different resource budgets, indicates that the transferability effectiveness depends on the underlying device together with its corresponding deployment tool; for (2), we validate the effective transferability of the aforementioned scaling strategies obtained from a vanilla ViT model on top of an image classification task to the PiT model, a strong ViT variant targeting efficiency, as well as object detection and video classification tasks. In particular, when transferred to PiT, our scaling strategies lead to a boosted ImageNet top-1 accuracy from $74.6\\%$ to $76.7\\%$ ($\\uparrow2.1\\%$) under the same 0.7G FLOPs; and when transferred to the COCO object detection task, the average precision is boosted by $\\uparrow0.7\\%$ under a similar throughput on a V100 GPU. 
", "keywords": "Model Scaling;Vision Transformer", "primary_area": "", "supplementary_material": "", "author": "Chaojian Li;Kyungmin Kim;Bichen Wu;Peizhao Zhang;Hang Zhang;Xiaoliang Dai;Peter Vajda;Yingyan Lin", "authorids": "~Chaojian_Li1;~Kyungmin_Kim1;~Bichen_Wu1;~Peizhao_Zhang1;~Hang_Zhang3;~Xiaoliang_Dai1;~Peter_Vajda1;~Yingyan_Lin1", "gender": ";;M;M;M;M;;F", "homepage": "https://licj15.github.io/;;;;https://hangzhang.org/;;https://sites.google.com/site/vajdap;https://eiclab.scs.gatech.edu/", "dblp": "249/5403;;130/1371;23/8011.html;49/6156-5;192/3904;44/5953;120/6981", "google_scholar": "HvEBdf4AAAAJ;;K3QJPdMAAAAJ;eqQQkM4AAAAJ;gCoWdkUAAAAJ;u4olrOcAAAAJ;k8QB5VUAAAAJ;dio8IesAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;bichenwu/;;;;p%C3%A9ter-vajda-9a03aaa/;yingyan-celine-lin-a281211a/", "or_profile": "~Chaojian_Li1;~Kyungmin_Kim1;~Bichen_Wu1;~Peizhao_Zhang1;~Hang_Zhang3;~Xiaoliang_Dai1;~Peter_Vajda1;~Yingyan_Lin1", "aff": "Rice University;;Meta Facebook;Meta;Meta Facebook;Meta Facebook;Meta;Rice University", "aff_domain": "rice.edu;;fb.com;meta.com;fb.com;fb.com;meta.com;rice.edu", "position": "PhD student;;Research Scientist;Research Scientist;Research Scientist;Research Scientist;Researcher;Assistant Professor", "bibtex": "@misc{\nli2022an,\ntitle={An Investigation on Hardware-Aware Vision Transformer Scaling},\nauthor={Chaojian Li and Kyungmin Kim and Bichen Wu and Peizhao Zhang and Hang Zhang and Xiaoliang Dai and Peter Vajda and Yingyan Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=OhytAdNSzO-}\n}", "github": "", "project": "", "reviewers": "d8q8;zFw4;Lm1r;WJMX", "site": "https://openreview.net/forum?id=OhytAdNSzO-", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;3;3;5", "correctness": "4;3;3;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "57;117;36;51", "wc_summary_review": "94;66;15;62", "wc_main_review": "239;328;100;185", "wc_review": "390;511;151;298", "wc_reply_reviewers": "166;0;0;0", "wc_reply_authors": "1809;695;351;866", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.25, 30.84132779242813 ], "wc_summary_review_avg": [ 59.25, 28.367014294775544 ], "wc_main_review_avg": [ 213.0, 82.84624312544292 ], "wc_review_avg": [ 337.5, 131.53041473362728 ], "wc_reply_reviewers_avg": [ 41.5, 71.88010851410841 ], "wc_reply_authors_avg": [ 930.25, 540.1857897982878 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.7777777777777777, "corr_recommendation_correctness": -0.5555555555555555, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13191011031901224254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;1;1;1;0", "aff_unique_norm": "Rice University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.rice.edu;https://meta.com", "aff_unique_abbr": "Rice;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "OjFh4rBdrAP", "title": "A Two-Stage Framework to Generate Video Chapter", "track": "main", 
"status": "Withdraw", "tldr": "", "abstract": "We aim to address the problem of video chapter generation. \nCompared to traditional video activity analysis, this task is significantly different.\nThe videos in chapter generation are much longer and contain many complex temporal structures.\nMoreover, the association between video frames and narrations plays a crucial role in expressing underlying information.\nTo facilitate the research along this direction, we introduce a large-scale dataset called ChapterGen, which consists of approximately $10k$ user-generated videos with annotated chapter descriptions. \nOur data collection procedure is fast, scalable, and does not require any additional manual annotation.\nOn top of this dataset, we propose a two-stage framework to perform chapter localization and chapter title generation. \nThis framework captures two aspects of a video, including visual dynamics and narration text.\nTo parse the whole video efficiently, we build the framework based on a flexible clip sliding window.\nOur experiments demonstrate that the proposed framework achieves superior results over existing methods on both accuracy and efficiency.", "keywords": "Video Chapter Generation;High-level Video Understanding;MultiModal Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Canyu Le;Zhiyuan Tang;Ke Li;Jiandong Yang", "authorids": "~Canyu_Le1;~Zhiyuan_Tang2;~Ke_Li6;~Jiandong_Yang1", "gender": ";M;M;", "homepage": ";https://silenceender.com;;http://about.me/jdyang", "dblp": "https://dblp.uni-trier.de/pers/hd/l/Le:Canyu;;;", "google_scholar": ";;nx3Alr4AAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Canyu_Le1;~Zhiyuan_Tang2;~Ke_Li6;~Jiandong_Yang1", "aff": "Bytedance Inc.;Shanghai Jiaotong University;Bytedance Inc.;Peking University, Tsinghua University", "aff_domain": "bytedance.com;sjtu.edu.cn;bytedance.com;pku.edu.cn", "position": "Researcher;MS student;Researcher;Lecturer", "bibtex": "@misc{\nle2022a,\ntitle={A Two-Stage Framework to Generate Video Chapter},\nauthor={Canyu Le and Zhiyuan Tang and Ke Li and Jiandong Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=OjFh4rBdrAP}\n}", "github": "", "project": "", "reviewers": "TzQD;GLbL;hxuF;4bCh", "site": "https://openreview.net/forum?id=OjFh4rBdrAP", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;3;5", "correctness": "3;2;4;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;1;4", "wc_summary_paper": "59;65;100;112", "wc_summary_review": "45;29;71;150", "wc_main_review": "590;717;353;850", "wc_review": "694;811;524;1112", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 84.0, 22.5055548698538 ], "wc_summary_review_avg": [ 73.75, 46.504704063137524 ], "wc_main_review_avg": [ 627.5, 183.21640210417843 ], "wc_review_avg": [ 785.25, 214.48003986385308 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:cM1BopE-LQcJ:scholar.google.com/&scioq=A+Two-Stage+Framework+to+Generate+Video+Chapter&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Bytedance Inc.;Shanghai Jiao Tong University;Peking University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.bytedance.com;https://www.sjtu.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "Bytedance;SJTU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Enhancing Cross-lingual Transfer by Manifold Mixup", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6029", "id": "OjPmfr9GkVv", "poster": "", "openreview": "https://openreview.net/forum?id=OjPmfr9GkVv", "slides": "https://iclr.cc/virtual/2022/poster/6029", "video": "https://iclr.cc/virtual/2022/poster/6029", "author_site": "Huiyun Yang, Huadong Chen, Hao Zhou, Lei Li", "tldr": "", "abstract": "Based on large-scale pre-trained multilingual representations, recent cross-lingual transfer methods have achieved impressive transfer performances. However, the performance of target languages still lags far behind the source language. In this paper, our analyses indicate such a performance gap is strongly associated with the cross-lingual representation discrepancy. To achieve better cross-lingual transfer performance, we propose the cross-lingual manifold mixup (X-Mixup) method, which adaptively calibrates the representation discrepancy and gives a compromised representation for target languages. Experiments on the XTREME benchmark show X-Mixup achieves 1.8% performance gains on multiple text understanding tasks, compared with strong baselines, and significantly reduces the cross-lingual representation discrepancy.", "keywords": "cross-lingual transfer;cross-lingual understanding;manifold mixup", "primary_area": "", "supplementary_material": "", "author": "Huiyun Yang;Huadong Chen;Hao Zhou;Lei Li", "authorids": "~Huiyun_Yang1;~Huadong_Chen1;~Hao_Zhou5;~Lei_Li11", "gender": "F;M;M;M", "homepage": ";;https://zhouh.github.io/;https://www.cs.cmu.edu/~leili", "dblp": ";42/5877;63/778-12;13/7007-5.html", "google_scholar": "C9YuAMIAAAAJ;https://scholar.google.com.hk/citations?user=_DwLcxIAAAAJ;https://scholar.google.com/citations?hl=zh-CN;BYXqAlwAAAAJ", "orcid": ";;;0000-0003-3095-9776", "linkedin": ";;;", "or_profile": "~Huiyun_Yang1;~Huadong_Chen1;~Hao_Zhou5;~Lei_Li11", "aff": "ByteDance;Bytedance;Bytedance;Computer Science Department, UC Santa Barbara", "aff_domain": "bytedance.com;bytedance.com;bytedance.com;cs.ucsb.edu", "position": "Researcher;Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nyang2022enhancing,\ntitle={Enhancing Cross-lingual Transfer by Manifold Mixup},\nauthor={Huiyun Yang and Huadong Chen and Hao Zhou and Lei Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OjPmfr9GkVv}\n}", "github": "", "project": "", "reviewers": "9Shh;DF3R;pZJi;VD9r", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;4;5;4", "correctness": "3;4;4;4", "technical_novelty": "3;4;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "86;91;97;71", "wc_summary_review": "47;27;37;57", "wc_main_review": "237;177;172;299", "wc_review": "370;295;306;427", "wc_reply_reviewers": "0;35;26;16", "wc_reply_authors": "453;628;436;621", "reply_reviewers": "0;1;1;1", "reply_authors": "2;2;2;2", 
"recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.25, 9.627434756984853 ], "wc_summary_review_avg": [ 42.0, 11.180339887498949 ], "wc_main_review_avg": [ 221.25, 51.664180047688745 ], "wc_review_avg": [ 349.5, 53.12485294097293 ], "wc_reply_reviewers_avg": [ 19.25, 12.987975207860538 ], "wc_reply_authors_avg": [ 534.5, 90.23441693721969 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13560869660966503554&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=OjPmfr9GkVv", "email": "bytedance.com;bytedance.com;bytedance.com;cs.ucsb.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "ByteDance;University of California, Santa Barbara", "aff_unique_dep": ";Computer Science Department", "aff_unique_url": "https://www.bytedance.com;https://www.ucsb.edu", "aff_unique_abbr": "ByteDance;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "OkB0tlodmH", "title": "Q-learning for real time control of heterogeneous microagent collectives", "track": "main", "status": "Reject", "tldr": "", "abstract": "The effective control of microscopic collectives has many promising applications, from environmental remediation to targeted drug delivery. A key challenge is understanding how to control these agents given their limited programmability, and in many cases heterogeneous dynamics. The ability to learn control strategies in real time could allow for the application of robotics solutions to drive collective behaviours towards desired outcomes. Here, we demonstrate Q-learning on the closed-loop Dynamic Optical Micro-Environment (DOME) platform to control the motion of light-responsive Volvox agents. 
The results show that Q-learning is efficient in autonomously learning how to reduce the speed of agents on an individual basis.", "keywords": "q-learning;reinforcement learning;microsystems;closed-loop control;optical control", "primary_area": "", "supplementary_material": "", "author": "Ana Rubio Denniss;Laia Freixas Mateu;Thomas Gorochowski;Sabine Hauert", "authorids": "~Ana_Rubio_Denniss1;zu20532@bristol.ac.uk;thomas.gorochowski@bristol.ac.uk;sabine.hauert@bristol.ac.uk", "gender": "F;;;", "homepage": "https://research-information.bris.ac.uk/en/persons/ana-m-rubio-denniss;;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ana_Rubio_Denniss1;zu20532@bristol.ac.uk;thomas.gorochowski@bristol.ac.uk;sabine.hauert@bristol.ac.uk", "aff": "University of Bristol;;;", "aff_domain": "bristol.ac.uk;;;", "position": "Researcher;;;", "bibtex": "@misc{\ndenniss2022qlearning,\ntitle={Q-learning for real time control of heterogeneous microagent collectives},\nauthor={Ana Rubio Denniss and Laia Freixas Mateu and Thomas Gorochowski and Sabine Hauert},\nyear={2022},\nurl={https://openreview.net/forum?id=OkB0tlodmH}\n}", "github": "", "project": "", "reviewers": "GPp7;QBsR;BtTc", "site": "https://openreview.net/forum?id=OkB0tlodmH", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "2;4;2", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "64;51;78", "wc_summary_review": "73;52;40", "wc_main_review": "317;386;272", "wc_review": "454;489;390", "wc_reply_reviewers": "15;0;0", "wc_reply_authors": "115;216;95", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 64.33333333333333, 11.025223605694151 ], "wc_summary_review_avg": [ 55.0, 13.638181696985855 ], "wc_main_review_avg": [ 325.0, 46.88283267892417 ], "wc_review_avg": [ 444.3333333333333, 40.99051380773633 ], "wc_reply_reviewers_avg": [ 5.0, 7.0710678118654755 ], "wc_reply_authors_avg": [ 142.0, 52.959103718498355 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:he3zEz2B1VsJ:scholar.google.com/&scioq=Q-learning+for+real+time+control+of+heterogeneous+microagent+collectives&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Bristol", "aff_unique_dep": "", "aff_unique_url": "https://www.bristol.ac.uk", "aff_unique_abbr": "Bristol", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Surrogate NAS Benchmarks: Going Beyond the Limited Search Spaces of Tabular NAS Benchmarks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6063", "id": "OnpFa95RVqs", "poster": "", "openreview": "https://openreview.net/forum?id=OnpFa95RVqs", "slides": "https://iclr.cc/virtual/2022/poster/6063", "video": "https://iclr.cc/virtual/2022/poster/6063", "author_site": "Arber Zela, Julien Niklas Siems, Lucas Zimmer, Jovita Lukasik, Margret Keuper, Frank Hutter", "tldr": "", 
"abstract": "The most significant barrier to the advancement of Neural Architecture Search (NAS) is its demand for large computational resources, which hinders scientifically sound empirical evaluations of NAS methods. Tabular NAS benchmarks have alleviated this problem substantially, making it possible to properly evaluate NAS methods in seconds on commodity machines. However, an unintended consequence of tabular NAS benchmarks has been a focus on extremely small architectural search spaces since their construction relies on exhaustive evaluations of the space. This leads to unrealistic results that do not transfer to larger spaces. To overcome this fundamental limitation, we propose a methodology to create cheap NAS surrogate benchmarks for arbitrary search spaces. We exemplify this approach by creating surrogate NAS benchmarks on the existing tabular NAS-Bench-101 and on two widely used NAS search spaces with up to $10^{21}$ architectures ($10^{13}$ times larger than any previous tabular NAS benchmark). We show that surrogate NAS benchmarks can model the true performance of architectures better than tabular benchmarks (at a small fraction of the cost), that they lead to faithful estimates of how well different NAS methods work on the original non-surrogate benchmark, and that they can generate new scientific insight. We open-source all our code and believe that surrogate NAS benchmarks are an indispensable tool to extend scientifically sound work on NAS to large and exciting search spaces.", "keywords": "neural architecture search;AutoML;benchmarking;surrogate model", "primary_area": "", "supplementary_material": "", "author": "Arber Zela;Julien Niklas Siems;Lucas Zimmer;Jovita Lukasik;Margret Keuper;Frank Hutter", "authorids": "~Arber_Zela1;~Julien_Niklas_Siems1;~Lucas_Zimmer1;~Jovita_Lukasik1;~Margret_Keuper1;~Frank_Hutter1", "gender": "M;M;M;F;F;M", "homepage": "https://ml.informatik.uni-freiburg.de/people/zela/index.html;https://juliensiems.github.io;;https://www.uni-mannheim.de/dws/people/researchers/phd-students/jovita-lukasik/;https://www.vc.informatik.uni-siegen.de/en/keuper-margret;http://ml.informatik.uni-freiburg.de/~hutter/", "dblp": ";257/3075;;255/4833;95/7589;89/5383", "google_scholar": "hD_6YioAAAAJ;https://scholar.google.de/citations?user=rKgTTh8AAAAJ;;https://scholar.google.de/citations?user=TpsZenwAAAAJ;https://scholar.google.de/citations?user=KMqMQAcAAAAJ;https://scholar.google.de/citations?user=YUrxwrkAAAAJ", "orcid": ";;0000-0002-5167-2929;;0000-0002-8437-7993;0000-0002-2037-3694", "linkedin": "https://de.linkedin.com/in/arber-zela-ba85a2145;julien-niklas-siems/;lucas-z-5369ba170/;;;frank-hutter-9190b24b/", "or_profile": "~Arber_Zela1;~Julien_Niklas_Siems1;~Lucas_Zimmer1;~Jovita_Lukasik1;~Margret_Keuper1;~Frank_Hutter1", "aff": "University of Freiburg;;;University of Mannheim;Universit\u00e4t Siegen;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_domain": "uni-freiburg.de;;;uni-mannheim.de;uni-siegen.de;uni-freiburg.de", "position": "PhD student;;;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nzela2022surrogate,\ntitle={Surrogate {NAS} Benchmarks: Going Beyond the Limited Search Spaces of Tabular {NAS} Benchmarks},\nauthor={Arber Zela and Julien Niklas Siems and Lucas Zimmer and Jovita Lukasik and Margret Keuper and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OnpFa95RVqs}\n}", "github": "", "project": "", "reviewers": "yTPb;t8ZB;1eb8;pRnV", "pdf_size": 
0, "recommendation": "3;5;8;8", "confidence": "5;4;4;5", "correctness": "2;3;4;4", "technical_novelty": "2;2;4;4", "empirical_novelty": "2;2;4;4", "wc_summary_paper": "33;57;35;71", "wc_summary_review": "27;59;59;34", "wc_main_review": "363;214;362;289", "wc_review": "423;330;456;394", "wc_reply_reviewers": "803;0;84;47", "wc_reply_authors": "4514;878;643;438", "reply_reviewers": "2;0;1;1", "reply_authors": "8;3;2;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 1.0 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 49.0, 15.811388300841896 ], "wc_summary_review_avg": [ 44.75, 14.463315664120728 ], "wc_main_review_avg": [ 307.0, 61.51016176210236 ], "wc_review_avg": [ 400.75, 46.364722580858825 ], "wc_reply_reviewers_avg": [ 233.5, 330.14580112429115 ], "wc_reply_authors_avg": [ 1618.25, 1679.0950501683935 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.5, 2.692582403567252 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.23570226039551587, "corr_recommendation_correctness": 0.994936676326182, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7687520135740328822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=OnpFa95RVqs", "email": "uni-freiburg.de;;;uni-mannheim.de;uni-siegen.de;uni-freiburg.de", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Freiburg;University of Mannheim;University of Siegen;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uni-freiburg.de;https://www.uni-mannheim.de;https://www.uni-siegen.de;https://www.uni-freiburg.de", "aff_unique_abbr": "UoF;UM;Uni Siegen;Albert-Ludwigs-Universit\u00e4t", "aff_campus_unique_index": "1", "aff_campus_unique": ";Freiburg", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "MetaMorph: Learning Universal Controllers with Transformers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6939", "id": "Opmqtk_GvYL", "poster": "", "openreview": "https://openreview.net/forum?id=Opmqtk_GvYL", "slides": "https://iclr.cc/virtual/2022/poster/6939", "video": "https://iclr.cc/virtual/2022/poster/6939", "author_site": "Agrim Gupta, Jim Fan, Surya Ganguli, Li Fei-Fei", "tldr": "", "abstract": "Multiple domains like vision, natural language, and audio are witnessing tremendous progress by leveraging Transformers for large scale pre-training followed by task specific fine tuning. In contrast, in robotics we primarily train a single robot for a single task. However, modular robot systems now allow for the flexible combination of general-purpose building blocks into task optimized morphologies. However, given the exponentially large number of possible robot morphologies, training a controller for each new design is impractical. In this work, we propose MetaMorph, a Transformer based approach to learn a universal controller over a modular robot design space. MetaMorph is based on the insight that robot morphology is just another modality on which we can condition the output of a Transformer. 
Through extensive experiments we demonstrate that large scale pre-training on a variety of robot morphologies results in policies with combinatorial generalization capabilities, including zero shot generalization to unseen robot morphologies. We further demonstrate that our pre-trained policy can be used for sample-efficient transfer to completely new robot morphologies and tasks.", "keywords": "RL;Modular Robots;Transformers", "primary_area": "", "supplementary_material": "", "author": "Agrim Gupta;Linxi Fan;Surya Ganguli;Li Fei-Fei", "authorids": "~Agrim_Gupta1;~Linxi_Fan2;~Surya_Ganguli1;~Li_Fei-Fei1", "gender": ";;M;F", "homepage": ";;http://ganguli-gang.stanford.edu/surya.html;https://profiles.stanford.edu/fei-fei-li", "dblp": "200/8282;154/6778;56/10453;79/2528", "google_scholar": "AxzVaI8AAAAJ;sljtWIUAAAAJ;;rDfyQnIAAAAJ", "orcid": ";;;", "linkedin": ";;;fei-fei-li-4541247/", "or_profile": "~Agrim_Gupta1;~Linxi_Fan2;~Surya_Ganguli1;~Li_Fei-Fei1", "aff": "Google DeepMind;NVIDIA;Stanford University;Stanford University", "aff_domain": "deepmind.com;nvidia.com;@stanford.edu;stanford.edu", "position": "Intern;Researcher;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ngupta2022metamorph,\ntitle={MetaMorph: Learning Universal Controllers with Transformers},\nauthor={Agrim Gupta and Linxi Fan and Surya Ganguli and Li Fei-Fei},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Opmqtk_GvYL}\n}", "github": "", "project": "", "reviewers": "PnzQ;Hxo2;XpGx;rhAB;m7dm", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "5;5;4;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "3;3;3;2;3", "wc_summary_paper": "27;41;89;51;143", "wc_summary_review": "135;81;41;65;35", "wc_main_review": "575;499;588;79;135", "wc_review": "737;621;718;195;313", "wc_reply_reviewers": "225;23;18;0;17", "wc_reply_authors": "770;1305;288;72;228", "reply_reviewers": "4;1;1;0;1", "reply_authors": "4;2;1;1;1", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 70.2, 41.811003336442425 ], "wc_summary_review_avg": [ 71.4, 35.85303334447449 ], "wc_main_review_avg": [ 375.2, 221.79305669925736 ], "wc_review_avg": [ 516.8, 221.3218470915151 ], "wc_reply_reviewers_avg": [ 56.6, 84.55672652131231 ], "wc_reply_authors_avg": [ 532.6, 451.1592180151039 ], "reply_reviewers_avg": [ 1.4, 1.3564659966250538 ], "reply_authors_avg": [ 1.8, 1.1661903789690604 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5095019871599200934&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Opmqtk_GvYL", "email": "deepmind.com;nvidia.com;@stanford.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Google;NVIDIA;Stanford University", "aff_unique_dep": "Google DeepMind;NVIDIA Corporation;", "aff_unique_url": "https://deepmind.com;https://www.nvidia.com;https://www.stanford.edu", "aff_unique_abbr": "DeepMind;NVIDIA;Stanford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United 
Kingdom;United States" }, { "id": "OqHtVOo-zy", "title": "Estimating Instance-dependent Label-noise Transition Matrix using DNNs", "track": "main", "status": "Reject", "tldr": "", "abstract": "In label-noise learning, estimating the transition matrix is a hot topic as the matrix plays an important role in building statistically consistent classifiers. Traditionally, the transition from clean labels to noisy labels (i.e., clean label transition matrix) has been widely exploited to learn a clean label classifier by employing the noisy data. Motivated by that classifiers mostly output Bayes optimal labels for prediction, in this paper, we study to directly model the transition from Bayes optimal labels to noisy labels (i.e., Bayes label transition matrix) and learn a classifier to predict Bayes optimal labels. Note that given only noisy data, it is ill-posed to estimate either the clean label transition matrix or the Bayes label transition matrix. But favorably, Bayes optimal labels have less uncertainty compared with the clean labels, i.e., the class posteriors of Bayes optimal labels are one-hot vectors while those of clean labels are not. This enables two advantages to estimate the Bayes label transition matrix, i.e., (a) we could theoretically recover a set of noisy data with Bayes optimal labels under mild conditions; (b) the feasible solution space is much smaller. By exploiting the advantages, we estimate the Bayes label transition matrix by employing a deep neural network in a parameterized way, leading to better generalization and superior classification performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuo Yang;Erkun Yang;Bo Han;Yang Liu;Min Xu;Gang Niu;Tongliang Liu", "authorids": "~Shuo_Yang5;~Erkun_Yang2;~Bo_Han1;~Yang_Liu3;~Min_Xu5;~Gang_Niu1;~Tongliang_Liu1", "gender": "M;M;M;F;M;M;M", "homepage": "https://faculty.hitsz.edu.cn/yangshuo;;http://www.yliuu.com;https://www.uts.edu.au/staff/min.xu;https://niug1984.github.io;https://tongliang-liu.github.io/;https://bhanml.github.io/", "dblp": "78/1102-6;184/3481;51/3710-18;09/0-1.html;26/3367-1;150/6667;241/0472-3", "google_scholar": "mVtxxCkAAAAJ;jo8L49AAAAAJ;jKrIVCIAAAAJ;https://scholar.google.com.au/citations?user=Ac6VCMkAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ", "orcid": ";;0000-0001-8420-6011;0000-0001-9581-8849;;;", "linkedin": ";;;;;;", "or_profile": "~Shuo_Yang5;~Erkun_Yang2;~Yang_Liu3;~Min_Xu5;~Gang_Niu1;~Tongliang_Liu1;~bo_han2", "aff": "University of Technology Sydney, Australia;Xidian University;University of California, Santa Cruz;University of Technology Sydney;RIKEN;University of Sydney;Microsoft Research", "aff_domain": "student.uts.edu.au;xidian.edu;ucsc.edu;uts.edu.au;riken.jp;sydney.edu.au;microsoft.com", "position": "PhD student;Associate Professor;Assistant Professor;Associate Professor;Research Scientist (tenured);Lecturer;Researcher", "bibtex": "@misc{\nyang2022estimating,\ntitle={Estimating Instance-dependent Label-noise Transition Matrix using {DNN}s},\nauthor={Shuo Yang and Erkun Yang and Bo Han and Yang Liu and Min Xu and Gang Niu and Tongliang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=OqHtVOo-zy}\n}", "github": "", "project": "", "reviewers": "M4SW;dRUQ;ieRv;6uGG", "site": "https://openreview.net/forum?id=OqHtVOo-zy", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;4;4;4", "correctness": "2;2;3;4", "technical_novelty": "1;2;3;4", 
"empirical_novelty": "2;4;3;3", "wc_summary_paper": "32;70;65;100", "wc_summary_review": "23;49;16;34", "wc_main_review": "694;498;107;162", "wc_review": "749;617;188;296", "wc_reply_reviewers": "299;286;0;0", "wc_reply_authors": "346;149;153;190", "reply_reviewers": "1;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 66.75, 24.118198523106987 ], "wc_summary_review_avg": [ 30.5, 12.459935794377111 ], "wc_main_review_avg": [ 365.25, 241.71406144450927 ], "wc_review_avg": [ 462.5, 228.59625981192255 ], "wc_reply_reviewers_avg": [ 146.25, 146.32220439837556 ], "wc_reply_authors_avg": [ 209.5, 80.41299646201477 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.49374193110101877, "corr_recommendation_correctness": 0.9945577827230725, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11514980593239384464&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;3;4;5", "aff_unique_norm": "University of Technology Sydney;Xidian University;University of California, Santa Cruz;RIKEN;University of Sydney;Microsoft", "aff_unique_dep": ";;;;;Microsoft Research", "aff_unique_url": "https://www.uts.edu.au;http://www.xidian.edu.cn/;https://www.ucsc.edu;https://www.riken.jp;https://www.sydney.edu.au;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UTS;Xidian;UCSC;RIKEN;USYD;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;1;2;0;3;0;2", "aff_country_unique": "Australia;China;United States;Japan" }, { "title": "Pareto Policy Pool for Model-based Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6152", "id": "OqcZu8JIIzS", "poster": "", "openreview": "https://openreview.net/forum?id=OqcZu8JIIzS", "slides": "https://iclr.cc/virtual/2022/poster/6152", "video": "https://iclr.cc/virtual/2022/poster/6152", "author_site": "Yijun Yang, Jing Jiang, Tianyi Zhou, Jie Ma, Yuhui Shi", "tldr": "", "abstract": "Online reinforcement learning (RL) can suffer from poor exploration, sparse reward, insufficient data, and overhead caused by inefficient interactions between an immature policy and a complicated environment. Model-based offline RL instead trains an environment model using a dataset of pre-collected experiences so online RL methods can learn in an offline manner by solely interacting with the model. However, the uncertainty and accuracy of the environment model can drastically vary across different state-action pairs so the RL agent may achieve high model return but perform poorly in the true environment. Unlike previous works that need to carefully tune the trade-off between the model return and uncertainty in a single objective, we study a bi-objective formulation for model-based offline RL that aims at producing a pool of diverse policies on the Pareto front performing different levels of trade-offs, which provides the flexibility to select the best policy for each realistic environment from the pool. 
Our method, ''Pareto policy pool (P3)'', does not need to tune the trade-off weight but can produce policies allocated at different regions of the Pareto front. For this purpose, we develop an efficient algorithm that solves multiple bi-objective optimization problems with distinct constraints defined by reference vectors targeting diverse regions of the Pareto front. We theoretically prove that our algorithm can converge to the targeted regions. In order to obtain more Pareto optimal policies without linearly increasing the cost, we leverage the achieved policies as initialization to find more Pareto optimal policies in their neighborhoods. On the D4RL benchmark for offline RL, P3 substantially outperforms several recent baseline methods over multiple tasks, especially when the quality of pre-collected experiences is low.", "keywords": "model-based offline RL;Pareto front;multi-objective optimization;policy pool;model return-uncertainty trade-off", "primary_area": "", "supplementary_material": "", "author": "Yijun Yang;Jing Jiang;Tianyi Zhou;Jie Ma;Yuhui Shi", "authorids": "~Yijun_Yang3;~Jing_Jiang6;~Tianyi_Zhou1;~Jie_Ma4;shiyh@sustech.edu.cn", "gender": "M;F;M;M;", "homepage": "https://stevenyangyj.github.io/;https://www.uts.edu.au/staff/jing.jiang;https://tianyizhou.github.io/;https://scholar.google.com.au/citations?user=tSmDoz0AAAAJ&hl=en;", "dblp": ";68/1974-2;88/8205-1;62/5110;", "google_scholar": "X0quXnsAAAAJ;https://scholar.google.com.au/citations?hl=en;OKvgizMAAAAJ;https://scholar.google.com.au/citations?user=tSmDoz0AAAAJ;", "orcid": ";;0000-0001-5348-0632;;", "linkedin": ";;tianyizhou;;", "or_profile": "~Yijun_Yang3;~Jing_Jiang6;~Tianyi_Zhou1;~Jie_Ma4;shiyh@sustech.edu.cn", "aff": "University of Technology Sydney;University of Technology Sydney;University of Washington, Seattle;University of Technology Sydney;", "aff_domain": "uts.edu.au;uts.edu.au;uw.edu;uts.edu.au;", "position": "PhD student;Lecturer;PhD student;PhD student;", "bibtex": "@inproceedings{\nyang2022pareto,\ntitle={Pareto Policy Pool for Model-based Offline Reinforcement Learning},\nauthor={Yijun Yang and Jing Jiang and Tianyi Zhou and Jie Ma and Yuhui Shi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OqcZu8JIIzS}\n}", "github": "", "project": "", "reviewers": "EL3w;ngyt;jeW5;xWzd", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;0;3;3", "wc_summary_paper": "108;151;316;253", "wc_summary_review": "39;82;96;47", "wc_main_review": "549;530;361;327", "wc_review": "696;763;773;627", "wc_reply_reviewers": "314;200;125;0", "wc_reply_authors": "1789;1212;1227;80", "reply_reviewers": "1;1;1;0", "reply_authors": "3;3;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 207.0, 82.05790638323647 ], "wc_summary_review_avg": [ 66.0, 23.695991222145572 ], "wc_main_review_avg": [ 441.75, 98.7151837358367 ], "wc_review_avg": [ 714.75, 58.67868011467197 ], "wc_reply_reviewers_avg": [ 159.75, 114.1717456291179 ], "wc_reply_authors_avg": [ 1077.0, 620.8216329993664 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], 
"corr_recommendation_confidence": 0.9622504486493761, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11257363858302429300&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=OqcZu8JIIzS", "email": "uts.edu.au;uts.edu.au;uw.edu;uts.edu.au;", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Technology Sydney;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.uts.edu.au;https://www.washington.edu", "aff_unique_abbr": "UTS;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Australia;United States" }, { "id": "OqlohL9sVO", "title": "Deep Fusion of Multi-attentive Local and Global Features with Higher Efficiency for Image Retrieval", "track": "main", "status": "Reject", "tldr": "", "abstract": "Image retrieval is to search images similar to the given query image by extracting features. Previously, methods that firstly search by global features then re-rank images using local feature matching were proposed, which has an excellent performance on many datasets. However, their drawbacks are also obvious. For example, the local feature matching consumes time and space greatly, the re-ranking process weakens the influence of global features, and the local feature learning is not accurate enough and semantic enough because of the trivial design. In this work, we proposed a Unifying Global and Attention-based Local Features Retrieval method (referred to as UGALR), which is an end-to-end and single-stage pipeline. Particularly, UGALR benefits from two aspects: 1) it accelerates extraction speed and reduces memory consumption by removing the re-ranking process and learning local feature matching with convolutional neural networks instead of RANSAC algorithm; 2) it learns more accurate and semantic local information through combining spatial and channel attention with the aid of intermediate supervision. Experiments on Revisited Oxford and Paris datasets validate the effectiveness of our approach, and we achieved state-of-the-art performance compared to other popular methods. 
The code will be available soon.", "keywords": "Image retrieval;Homography learning;Attention;Intermediate supervision", "primary_area": "", "supplementary_material": "", "author": "Baorong Shi", "authorids": "~Baorong_Shi1", "gender": "F", "homepage": "https://dblp.org/pid/245/7615.html", "dblp": "245/7615.html", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Baorong_Shi1", "aff": "Chinese Academy of Sciences", "aff_domain": "ict.ac.cn", "position": "MS student", "bibtex": "@misc{\nshi2022deep,\ntitle={Deep Fusion of Multi-attentive Local and Global Features with Higher Efficiency for Image Retrieval},\nauthor={Baorong Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=OqlohL9sVO}\n}", "github": "", "project": "", "reviewers": "wJ79;N1bK;MPBM;UMH7", "site": "https://openreview.net/forum?id=OqlohL9sVO", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "5;5;5;3", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "71;65;141;127", "wc_summary_review": "42;15;102;150", "wc_main_review": "342;18;694;302", "wc_review": "455;98;937;579", "wc_reply_reviewers": "275;29;0;30", "wc_reply_authors": "1355;1030;792;376", "reply_reviewers": "2;1;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 4.5, 0.8660254037844386 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 101.0, 33.436506994600975 ], "wc_summary_review_avg": [ 77.25, 52.49464258379135 ], "wc_main_review_avg": [ 339.0, 240.02291557265943 ], "wc_review_avg": [ 517.25, 299.8536101166701 ], "wc_reply_reviewers_avg": [ 83.5, 111.21712997555727 ], "wc_reply_authors_avg": [ 888.25, 356.93583106771445 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.676481425202546, "corr_recommendation_correctness": 0.911322376865767, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gqc0tB3JK_0J:scholar.google.com/&scioq=Deep+Fusion+of+Multi-attentive+Local+and+Global+Features+with+Higher+Efficiency+for+Image+Retrieval&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "", "aff_unique_url": "https://www.cas.cn", "aff_unique_abbr": "CAS", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "Osoo_n9cMZ3", "title": "CRAFTING BETTER CONTRASTIVE VIEWS FOR SIAMESE REPRESENTATION LEARNING", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent self-supervised contrastive learning methods greatly benefit from the Siamese structure that aims at minimizing distances between positive pairs. For high-performance Siamese representation learning, one of the keys is to design good contrastive pairs. Most previous works simply apply random sampling to make different crops of the same image, which overlooks semantic information and may degrade the quality of views. In this work, we propose ContrastiveCrop, which can effectively generate better crops for Siamese representation learning. Firstly, a semantic-aware object localization strategy is proposed within the training process in a fully unsupervised manner. This guides us to generate contrastive views that avoid most false positives (i.e., object vs. background). 
Moreover, we empirically find that views with similar appearances are trivial for Siamese model training. Thus, center-suppressed sampling is further designed to enlarge the variance of crops. Remarkably, our method gives careful consideration to positive pairs for contrastive learning with negligible extra training overhead. As a plug-and-play and framework-agnostic module, ContrastiveCrop consistently improves SimCLR, MoCo, BYOL, SimSiam by 0.4% \u223c 2.0% classification accuracy on CIFAR-10, CIFAR-100, Tiny ImageNet and STL-10. Superior results are also achieved on downstream detection and segmentation tasks when pre-trained on ImageNet-1K.\n", "keywords": "contrastive learning;data augmentation;Siamese representation learning", "primary_area": "", "supplementary_material": "", "author": "Xiangyu Peng;Kai Wang;Zheng Zhu;Yang You", "authorids": "~Xiangyu_Peng2;~Kai_Wang8;~Zheng_Zhu1;~Yang_You1", "gender": "M;M;M;M", "homepage": "https://github.com/xyupeng;https://kaiwang960112.github.io/;http://www.zhengzhu.net/;https://www.comp.nus.edu.sg/~youy/", "dblp": "120/1463;78/2022-36;29/4319.html/;33/8167-1.html", "google_scholar": "https://scholar.google.co.za/citations?user=KRUTk7sAAAAJ;i2II0XIAAAAJ;https://scholar.google.com.hk/citations?user=NmwjI0AAAAAJ;jF4dPZwAAAAJ", "orcid": ";0000-0002-1154-5175;;", "linkedin": "xiangyu-peng-aa10b11a5/;;;yang-you-0b92914b/", "or_profile": "~Xiangyu_Peng2;~Kai_Wang8;~Zheng_Zhu1;~Yang_You1", "aff": "National University of Singapore;National University of Singapore;PhiGent Robotics;National University of Singapore", "aff_domain": "nus.edu;u.nus.edu;phigent.ai;nus.edu.sg", "position": "PhD student;PhD student;Researcher;Professor", "bibtex": "@misc{\npeng2022crafting,\ntitle={{CRAFTING} {BETTER} {CONTRASTIVE} {VIEWS} {FOR} {SIAMESE} {REPRESENTATION} {LEARNING}},\nauthor={Xiangyu Peng and Kai Wang and Zheng Zhu and Yang You},\nyear={2022},\nurl={https://openreview.net/forum?id=Osoo_n9cMZ3}\n}", "github": "", "project": "", "reviewers": "2GWe;JGcY;umvA;UCJe", "site": "https://openreview.net/forum?id=Osoo_n9cMZ3", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;4;5;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "26;142;51;123", "wc_summary_review": "39;54;21;57", "wc_main_review": "264;419;168;197", "wc_review": "329;615;240;377", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.5, 48.293374286748694 ], "wc_summary_review_avg": [ 42.75, 14.289419162443238 ], "wc_main_review_avg": [ 262.0, 97.10046343864688 ], "wc_review_avg": [ 390.25, 138.757657446355 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4923659639173309, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 146, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17284983713036766691&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "National University of Singapore;PhiGent Robotics", "aff_unique_dep": ";", 
"aff_unique_url": "https://www.nus.edu.sg;", "aff_unique_abbr": "NUS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore;" }, { "title": "Using Graph Representation Learning with Schema Encoders to Measure the Severity of Depressive Symptoms", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5921", "id": "OtEDS2NWhqa", "poster": "", "openreview": "https://openreview.net/forum?id=OtEDS2NWhqa", "slides": "https://iclr.cc/virtual/2022/poster/5921", "video": "https://iclr.cc/virtual/2022/poster/5921", "author_site": "Simin Hong, Anthony Cohn, David Hogg", "tldr": "", "abstract": "Graph neural networks (GNNs) are widely used in regression and classification problems applied to text, in areas such as sentiment analysis and medical decision-making processes. We propose a novel form for node attributes within a GNN based model that captures node-specific embeddings for every word in the vocabulary. This provides a global representation at each node, coupled with node-level updates according to associations among words in a transcript. We demonstrate the efficacy of the approach by augmenting the accuracy of measuring major depressive disorder (MDD). Prior research has sought to make a diagnostic prediction of depression levels from patient data using several modalities, including audio, video, and text. On the DAIC-WOZ benchmark, our method outperforms state-of-art methods by a substantial margin, including those using multiple modalities. Moreover, we also evaluate the performance of our novel model on a Twitter sentiment dataset. We show that our model outperforms a general GNN model by leveraging our novel 2-D node attributes. These results demonstrate the generality of the proposed method.", "keywords": "Graph neural networks sentiment analysis node-embedding algorithm diagnostic prediction task", "primary_area": "", "supplementary_material": "", "author": "Simin Hong;Anthony Cohn;David Crossland Hogg", "authorids": "~Simin_Hong1;~Anthony_Cohn1;~David_Crossland_Hogg1", "gender": "F;M;M", "homepage": ";https://eps.leeds.ac.uk/computing/staff/76/professor-anthony-tony-g-cohn-freng-flsw-ceng-citp;https://eps.leeds.ac.uk/computing/staff/84/professor-david-hogg", "dblp": "326/7271;c/AnthonyGCohn.html;h/DHogg", "google_scholar": "https://scholar.google.com/citations?hl=en;tal4mMkAAAAJ;5VJ4YPQAAAAJ", "orcid": ";0000-0002-7652-8907;0000-0002-6125-9564", "linkedin": ";tonycohn/?originalSubdomain=uk;", "or_profile": "~Simin_Hong1;~Anthony_Cohn1;~David_Crossland_Hogg1", "aff": "University of Leeds;University of Leeds;University of Leeds", "aff_domain": "leeds.ac.uk;leeds.ac.uk;leeds.ac.uk", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhong2022using,\ntitle={Using Graph Representation Learning with Schema Encoders to Measure the Severity of Depressive Symptoms},\nauthor={Simin Hong and Anthony Cohn and David Crossland Hogg},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OtEDS2NWhqa}\n}", "github": "", "project": "", "reviewers": "ehqq;Bk2m;jCNH", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "4;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "26;62;71", "wc_summary_review": "20;35;40", "wc_main_review": "134;406;88", "wc_review": "180;503;199", "wc_reply_reviewers": "91;34;0", "wc_reply_authors": "1012;577;457", 
"reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 53.0, 19.44222209522358 ], "wc_summary_review_avg": [ 31.666666666666668, 8.498365855987975 ], "wc_main_review_avg": [ 209.33333333333334, 140.32660316403144 ], "wc_review_avg": [ 294.0, 147.9887383102737 ], "wc_reply_reviewers_avg": [ 41.666666666666664, 37.54404820415022 ], "wc_reply_authors_avg": [ 682.0, 238.43238035132728 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18226966908018577857&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=OtEDS2NWhqa", "email": "leeds.ac.uk;leeds.ac.uk;leeds.ac.uk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Leeds", "aff_unique_dep": "", "aff_unique_url": "https://www.leeds.ac.uk", "aff_unique_abbr": "Leeds", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Vitruvion: A Generative Model of Parametric CAD Sketches", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6237", "id": "Ow1C7s3UcY", "poster": "", "openreview": "https://openreview.net/forum?id=Ow1C7s3UcY", "slides": "https://iclr.cc/virtual/2022/poster/6237", "video": "https://iclr.cc/virtual/2022/poster/6237", "author_site": "Ari Seff, Wenda Zhou, Nick Richardson, Ryan P Adams", "tldr": "", "abstract": "Parametric computer-aided design (CAD) tools are the predominant way that engineers specify physical structures, from bicycle pedals to airplanes to printed circuit boards. The key characteristic of parametric CAD is that design intent is encoded not only via geometric primitives, but also by parameterized constraints between the elements. This relational specification can be viewed as the construction of a constraint program, allowing edits to coherently propagate to other parts of the design. Machine learning offers the intriguing possibility of accelerating the design process via generative modeling of these structures, enabling new tools such as autocompletion, constraint inference, and conditional synthesis. In this work, we present such an approach to generative modeling of parametric CAD sketches, which constitute the basic computational building blocks of modern mechanical design. Our model, trained on real-world designs from the SketchGraphs dataset, autoregressively synthesizes sketches as sequences of primitives, with initial coordinates, and constraints that reference back to the sampled primitives. As samples from the model match the constraint graph representation used in standard CAD software, they may be directly imported, solved, and edited according to downstream design tasks. In addition, we condition the model on various contexts, including partial sketches (primers) and images of hand-drawn sketches. 
Evaluation of the proposed approach demonstrates its ability to synthesize realistic CAD sketches and its potential to aid the mechanical design workflow.", "keywords": "generative modeling;CAD;transformers;design;geometric constraints", "primary_area": "", "supplementary_material": "", "author": "Ari Seff;Wenda Zhou;Nick Richardson;Ryan P Adams", "authorids": "~Ari_Seff1;~Wenda_Zhou1;~Nick_Richardson1;~Ryan_P_Adams1", "gender": ";M;M;M", "homepage": "https://www.ariseff.com;https://wendazhou.com;https://www.cs.princeton.edu/~nr14/;http://www.cs.princeton.edu/~rpa/", "dblp": "147/5247;218/6092;;32/909", "google_scholar": "https://scholar.google.com/citations?hl=en;CdrwG8AAAAAJ;TwnmjhgAAAAJ;grQ_GBgAAAAJ", "orcid": ";;;", "linkedin": ";;nick-richardson-854271214/;", "or_profile": "~Ari_Seff1;~Wenda_Zhou1;~Nick_Richardson1;~Ryan_P_Adams1", "aff": "Waymo;Flatiron Institute;Princeton University;Princeton University", "aff_domain": "waymo.com;flatironinstitute.org;princeton.edu;princeton.edu", "position": "Research Scientist;Postdoc;PhD student;Professor", "bibtex": "@inproceedings{\nseff2022vitruvion,\ntitle={Vitruvion: A Generative Model of Parametric {CAD} Sketches},\nauthor={Ari Seff and Wenda Zhou and Nick Richardson and Ryan P Adams},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ow1C7s3UcY}\n}", "github": "", "project": "", "reviewers": "DfnC;9htv;rqPX;ZmEf", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "3;0;3;0", "wc_summary_paper": "80;197;74;79", "wc_summary_review": "38;347;20;77", "wc_main_review": "415;286;567;519", "wc_review": "533;830;661;675", "wc_reply_reviewers": "46;113;61;79", "wc_reply_authors": "455;394;264;660", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 1.5 ], "wc_summary_paper_avg": [ 107.5, 51.7228189487 ], "wc_summary_review_avg": [ 120.5, 132.38296718233806 ], "wc_main_review_avg": [ 446.75, 107.85261934695883 ], "wc_review_avg": [ 674.75, 105.33844265034489 ], "wc_reply_reviewers_avg": [ 74.75, 24.983744715314394 ], "wc_reply_authors_avg": [ 443.25, 142.89397293098125 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10041386464218470519&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=Ow1C7s3UcY", "email": "waymo.com;flatironinstitute.org;princeton.edu;princeton.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Waymo;Flatiron Institute;Princeton University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.waymo.com;https://flatironinstitute.org;https://www.princeton.edu", "aff_unique_abbr": "Waymo;Flatiron;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Oxdln9khkxv", "title": "Learning the Representation of Behavior Styles with Imitation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Imitation learning is one of the methods 
for adaptively reproducing expert demonstrations by learning a mapping between observations and actions. However, behavior styles such as motion trajectories and driving habits depend largely on the dataset of human maneuvers, and most imitation learning algorithms collapse them into an average behavior style. In this study, we propose a method named style behavior cloning (Style BC), which can not only infer the latent representation of behavior styles automatically, but also imitate different style policies from expert demonstrations. Our method is inspired by the word2vec algorithm and we construct a behavior-style to action mapping which is similar to the word-embedding to context mapping in word2vec. Empirical results on popular benchmark environments show that Style BC significantly outperforms standard behavior cloning in prediction accuracy and expected reward. Furthermore, compared with various baselines, our policy, influenced by its assigned style embedding, can better reproduce the expert behavior styles, especially in complex environments or when the number of behavior styles is large.", "keywords": "Imitation Learning;Behavior Style", "primary_area": "", "supplementary_material": "/attachment/582850300de44a642e4f99b360d25289df0dbccf.zip", "author": "Xiao Liu;Meng Wang;Zhaorong Wang;Yingfeng Chen;Yujing Hu;Changjie Fan;Chongjie Zhang", "authorids": "~Xiao_Liu21;~Meng_Wang13;wangzhaorong@corp.netease.com;~Yingfeng_Chen2;~Yujing_Hu2;~Changjie_Fan1;~Chongjie_Zhang1", "gender": "M;M;;;;M;", "homepage": "https://scholar.google.com.tw/citations?hl=zh-CN&user=eYoMTwEAAAAJ&view_op=list_works&gmla=AJsN-F6yOGOTDUU-ZCrCBVSzT0cfpj-PSaWbIkSeaRwOobZhCvx8IYGEkdCrYen2ZwdhxtBuJrxMXmL-2LOvTZCgImRhm_OChf3gGo-yyxu4yxH0UZRoTsmuIXsZ1_2zdyM1cdYvExWX;;;;;;", "dblp": ";93/6765-16.html;;;https://dblp.uni-trier.de/pid/160/1923.html;71/882;29/6693", "google_scholar": ";vwhBzc4AAAAJ;;;IR5WY-wAAAAJ;;LjxqXycAAAAJ", "orcid": ";;;;;0000-0001-5420-0516;", "linkedin": ";;;;;;", "or_profile": "~Xiao_Liu21;~Meng_Wang13;wangzhaorong@corp.netease.com;~Yingfeng_Chen2;~Yujing_Hu2;~Changjie_Fan1;~Chongjie_Zhang1", "aff": "Zhejiang University;;;;NetEase, Inc.;Netease, Fuxi AI Lab;Tsinghua University", "aff_domain": "zju.edu.cn;;;;corp.netease.com;corp.netease.com;tsinghua.edu.cn", "position": "PhD student;;;;Researcher;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nliu2022learning,\ntitle={Learning the Representation of Behavior Styles with Imitation Learning},\nauthor={Xiao Liu and Meng Wang and Zhaorong Wang and Yingfeng Chen and Yujing Hu and Changjie Fan and Chongjie Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=Oxdln9khkxv}\n}", "github": "", "project": "", "reviewers": "eS3s;5JcU;5Mw5;9MaF", "site": "https://openreview.net/forum?id=Oxdln9khkxv", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;3;2", "wc_summary_paper": "118;75;93;83", "wc_summary_review": "36;49;26;25", "wc_main_review": "252;355;222;288", "wc_review": "406;479;341;396", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 92.25, 16.17675801883678 ], "wc_summary_review_avg": [ 34.0, 
9.669539802906858 ], "wc_main_review_avg": [ 279.25, 49.58515402819679 ], "wc_review_avg": [ 405.5, 49.1248409666637 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SgPH98sHZgcJ:scholar.google.com/&scioq=Learning+the+Representation+of+Behavior+Styles+with+Imitation+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Zhejiang University;NetEase, Inc.;Netease;Tsinghua University", "aff_unique_dep": ";;Fuxi AI Lab;", "aff_unique_url": "https://www.zju.edu.cn;https://www.163.com;https://www.netease.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "ZJU;NetEase;Netease;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Gaussian Mixture Convolution Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5970", "id": "Oxeka7Z7Hor", "poster": "", "openreview": "https://openreview.net/forum?id=Oxeka7Z7Hor", "slides": "https://iclr.cc/virtual/2022/poster/5970", "video": "https://iclr.cc/virtual/2022/poster/5970", "author_site": "Adam Celarek, Pedro Hermosilla Casajus, Bernhard Kerbl, Timo Ropinski, Michael Wimmer", "tldr": "", "abstract": "This paper proposes a novel method for deep learning based on the analytical convolution of multidimensional Gaussian mixtures.\nIn contrast to tensors, these do not suffer from the curse of dimensionality and allow for a compact representation, as data is only stored where details exist.\nConvolution kernels and data are Gaussian mixtures with unconstrained weights, positions, and covariance matrices.\nSimilar to discrete convolutional networks, each convolution step produces several feature channels, represented by independent Gaussian mixtures.\nSince traditional transfer functions like ReLUs do not produce Gaussian mixtures, we propose using a fitting of these functions instead.\nThis fitting step also acts as a pooling layer if the number of Gaussian components is reduced appropriately.\nWe demonstrate that networks based on this architecture reach competitive accuracy on Gaussian mixtures fitted to the MNIST and ModelNet data sets.", "keywords": "deep learning architecture;gaussian convolution;gaussian mixture;3d", "primary_area": "", "supplementary_material": "/attachment/c1cc3c8dfeda01da7b0daf0413f5b96d77686e51.zip", "author": "Adam Celarek;Pedro Hermosilla;Bernhard Kerbl;Timo Ropinski;Michael Wimmer", "authorids": "~Adam_Celarek1;~Pedro_Hermosilla1;~Bernhard_Kerbl1;~Timo_Ropinski2;~Michael_Wimmer1", "gender": ";M;M;M;M", "homepage": "https://www.cg.tuwien.ac.at/staff/AdamCelarek;https://phermosilla.github.io/;https://www.cg.tuwien.ac.at/staff/BernhardKerbl;https://viscom.uni-ulm.de/members/timo-ropinski/;https://www.cg.tuwien.ac.at/staff/MichaelWimmer.html", "dblp": ";170/7065;;92/5590;00/4132-1.html", "google_scholar": ";C7F4B6MAAAAJ;;FuY-lbcAAAAJ;https://scholar.google.at/citations?user=pPLc_DoAAAAJ", "orcid": ";;;0000-0002-7857-5512;0000-0002-9370-2663", "linkedin": ";;;;michael-wimmer-2b69b336/", "or_profile": "~Adam_Celarek1;~Pedro_Hermosilla1;~Bernhard_Kerbl1;~Timo_Ropinski2;~Michael_Wimmer1", "aff": "TU Wien Vienna University of 
Technology;Ulm University;INRIA;Ulm University;TU Wien Vienna University of Technology", "aff_domain": "tuwien.ac.at;uni-ulm.de;inria.fr;uni-ulm.de;tuwien.ac.at", "position": "PhD student;Postdoc;Postdoc;Full Professor;Professor", "bibtex": "@inproceedings{\ncelarek2022gaussian,\ntitle={Gaussian Mixture Convolution Networks},\nauthor={Adam Celarek and Pedro Hermosilla and Bernhard Kerbl and Timo Ropinski and Michael Wimmer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Oxeka7Z7Hor}\n}", "github": "", "project": "", "reviewers": "Ao7K;RmKw;gTB4;eQt8", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;3;0;2", "wc_summary_paper": "62;75;104;91", "wc_summary_review": "22;339;140;14", "wc_main_review": "220;118;350;261", "wc_review": "304;532;594;366", "wc_reply_reviewers": "0;43;226;0", "wc_reply_authors": "593;807;1183;324", "reply_reviewers": "0;1;3;0", "reply_authors": "1;1;4;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 83.0, 15.890248582070704 ], "wc_summary_review_avg": [ 128.75, 131.23904716203938 ], "wc_main_review_avg": [ 237.25, 83.35878777909382 ], "wc_review_avg": [ 449.0, 118.1397477566293 ], "wc_reply_reviewers_avg": [ 67.25, 93.3203487991767 ], "wc_reply_authors_avg": [ 726.75, 314.12606943709716 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3285204199081775267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Oxeka7Z7Hor", "email": "tuwien.ac.at;uni-ulm.de;inria.fr;uni-ulm.de;tuwien.ac.at", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Vienna University of Technology;Ulm University;INRIA", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tuwien.ac.at;https://www.uni-ulm.de/;https://www.inria.fr", "aff_unique_abbr": "TU Wien;U Ulm;INRIA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Vienna;", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "Austria;Germany;France" }, { "id": "OxgLa0VEyg-", "title": "Loss Function Learning for Domain Generalization by Implicit Gradient", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generalising robustly to distribution shift is a major challenge that is pervasive across most real-world applications of machine learning. A recent study highlighted that many advanced algorithms proposed to tackle such domain generalisation (DG) fail to outperform a properly tuned empirical risk minimisation (ERM) baseline. We take a different approach, and explore the impact of the ERM loss function on out-of-domain generalisation. In particular, we introduce a novel meta-learning approach to loss function search based on implicit gradient. This enables us to discover a general purpose parametric loss function that provides a drop-in replacement for cross-entropy. 
Our loss can be used in standard training pipelines to efficiently train robust models using any neural architecture on new datasets. The results show that it clearly surpasses cross-entropy, enables simple ERM to outperform significantly more complicated prior DG methods, and provides state-of-the-art performance across a variety of DG benchmarks. Furthermore, unlike most existing DG approaches, our setup applies to the most practical setting of single-source domain generalisation, on which we show significant improvement.", "keywords": "meta-learning;loss function learning;Domain Generalisation", "primary_area": "", "supplementary_material": "/attachment/ae94d5b5c65c67b114443b27324dc1267e6d9d9b.zip", "author": "Boyan Gao;Henry Gouk;Yongxin Yang;Timothy Hospedales", "authorids": "~Boyan_Gao1;~Henry_Gouk1;~Yongxin_Yang1;~Timothy_Hospedales1", "gender": ";M;M;M", "homepage": "https://www.researchgate.net/profile/Boyan_Gao2;https://www.henrygouk.com;http://homepages.inf.ed.ac.uk/thospeda/;", "dblp": "251/3330;172/0943;32/3545;150/4258", "google_scholar": "WIuM3SIAAAAJ;https://scholar.google.co.nz/citations?user=i1bzlyAAAAAJ;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ;https://scholar.google.co.uk/citations?user=F7PtrL8AAAAJ", "orcid": ";;0000-0003-4867-7486;", "linkedin": ";;timothyhospedales/;", "or_profile": "~Boyan_Gao1;~Henry_Gouk1;~Timothy_Hospedales1;~Yongxin_Yang3", "aff": ";University of Edinburgh;Samsung AI Research Centre;Queen Mary University of London", "aff_domain": ";ed.ac.uk;samsung.com;qmul.ac.uk", "position": ";Postdoc;Principal Researcher;Assistant Professor", "bibtex": "@misc{\ngao2022loss,\ntitle={Loss Function Learning for Domain Generalization by Implicit Gradient},\nauthor={Boyan Gao and Henry Gouk and Yongxin Yang and Timothy Hospedales},\nyear={2022},\nurl={https://openreview.net/forum?id=OxgLa0VEyg-}\n}", "github": "", "project": "", "reviewers": "eZ71;57HS;DuTo;Hx46", "site": "https://openreview.net/forum?id=OxgLa0VEyg-", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;4;3", "correctness": "2;3;3;4", "technical_novelty": "3;4;2;3", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "509;66;29;104", "wc_summary_review": "58;441;25;42", "wc_main_review": "508;599;189;497", "wc_review": "1075;1106;243;643", "wc_reply_reviewers": "547;1319;35;0", "wc_reply_authors": "2090;1220;480;726", "reply_reviewers": "1;6;1;0", "reply_authors": "3;4;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 177.0, 193.50581386614718 ], "wc_summary_review_avg": [ 141.5, 173.30969390083175 ], "wc_main_review_avg": [ 448.25, 154.8246992569338 ], "wc_review_avg": [ 766.75, 353.4603054092496 ], "wc_reply_reviewers_avg": [ 475.25, 533.0911624666085 ], "wc_reply_authors_avg": [ 1129.0, 615.5103573458371 ], "reply_reviewers_avg": [ 2.0, 2.345207879911715 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2351875210347766586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Edinburgh;Samsung;Queen Mary University of London", "aff_unique_dep": ";AI 
Research;", "aff_unique_url": "https://www.ed.ac.uk;https://www.samsung.com/global/researchers/samsung-ai-research-centre/;https://www.qmul.ac.uk", "aff_unique_abbr": "Edinburgh;SARC;QMUL", "aff_campus_unique_index": "1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;South Korea" }, { "title": "A Statistical Framework for Efficient Out of Distribution Detection in Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6384", "id": "Oy9WeuZD51", "poster": "", "openreview": "https://openreview.net/forum?id=Oy9WeuZD51", "slides": "https://iclr.cc/virtual/2022/poster/6384", "video": "https://iclr.cc/virtual/2022/poster/6384", "author_site": "Matan Haroush, Tzviel Frostig, Ruth Heller, Daniel Soudry", "tldr": "", "abstract": "Background.\nCommonly, Deep Neural Networks (DNNs) generalize well on samples drawn from a distribution similar to that of the training set. However, DNNs' predictions are brittle and unreliable when the test samples are drawn from a dissimilar distribution.\nThis is a major concern for deployment in real-world applications, where such behavior may come at a considerable cost, such as industrial production lines, autonomous vehicles, or healthcare applications.\n\nContributions.\nWe frame Out Of Distribution (OOD) detection in DNNs as a statistical hypothesis testing problem. Tests generated within our proposed framework combine evidence from the entire network.\nUnlike previous OOD detection heuristics, this framework returns a $p$-value for each test sample. It is guaranteed to maintain the Type I Error (T1E - incorrectly predicting OOD for an actual in-distribution sample) for test data. Moreover, this allows to combine several detectors while maintaining the T1E.\n\nBuilding on this framework, we suggest a novel OOD procedure based on low-order statistics. 
Our method achieves comparable or better results than state-of-the-art methods on well-accepted OOD benchmarks, without retraining the network parameters or assuming prior knowledge on the test distribution --- and at a fraction of the computational cost.", "keywords": "out of distribution;DNNs;p-value;hypothesis testing;inductive conformal predictor", "primary_area": "", "supplementary_material": "", "author": "Matan Haroush;Tzviel Frostig;Ruth Heller;Daniel Soudry", "authorids": "~Matan_Haroush1;~Tzviel_Frostig1;ruheller@gmail.com;~Daniel_Soudry1", "gender": ";M;;M", "homepage": ";;;https://soudry.github.io/", "dblp": "227/3440;;;126/1779", "google_scholar": "2bPlSXQAAAAJ;https://scholar.google.com/scholar?hl=en;;https://scholar.google.co.il/citations?user=AEBWEm8AAAAJ", "orcid": ";;;0000-0001-9368-6352", "linkedin": ";;;daniel-soudry-2aa3a88/", "or_profile": "~Matan_Haroush1;~Tzviel_Frostig1;ruheller@gmail.com;~Daniel_Soudry1", "aff": "Technion, Technion;Tel Aviv University, Tel Aviv;;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;tau.ac.il;;technion.ac.il", "position": "PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nharoush2022a,\ntitle={A Statistical Framework for Efficient Out of Distribution Detection in Deep Neural Networks},\nauthor={Matan Haroush and Tzviel Frostig and Ruth Heller and Daniel Soudry},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Oy9WeuZD51}\n}", "github": "", "project": "", "reviewers": "a8KZ;Ni17;XHJz;zW6V", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "5;5;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "46;74;40;91", "wc_summary_review": "16;93;60;110", "wc_main_review": "188;335;406;519", "wc_review": "250;502;506;720", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "513;832;605;686", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 62.75, 20.753011829611623 ], "wc_summary_review_avg": [ 69.75, 35.86345633092271 ], "wc_main_review_avg": [ 362.0, 119.98958288118182 ], "wc_review_avg": [ 494.5, 166.44743915122274 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 659.0, 117.14307491268957 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9428090415820635, "corr_recommendation_correctness": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8472510550990962179&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Oy9WeuZD51", "email": "technion.ac.il;tau.ac.il;;technion.ac.il", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Technion - Israel Institute of Technology;Tel Aviv University", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.tau.ac.il", "aff_unique_abbr": "Technion;TAU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tel Aviv", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "id": "OzXAw20k_H", "title": "Deep Learning of Intrinsically Motivated Options in the Arcade Learning 
Environment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although Intrinsic Motivation allows a Reinforcement Learning agent to generate directed behaviors in an environment, even with sparse or noisy rewards, combining intrinsic and extrinsic rewards is non trivial. As an alternative to the widespread method of a weighted sum of rewards, Explore Options let the agent call an intrinsically motivated agent in order to observe and learn from interesting behaviors in the environment. Such options have only been established for simple tabular cases, and are unfit to high dimensional spaces. In this paper, we propose Deep Explore Options, revising Explore Options within the Deep Reinforcement Learning paradigm to tackle complex visual problems. Deep Explore Options can naturally learn from several unrelated intrinsic rewards, ignore harmful intrinsic rewards, learn to balance exploration, but also isolate exploitative or exploratory behaviors. In order to achieve this, we first introduce J-PER, a new transition-selection algorithm based on the interest of multiple agents. Next, we propose to consider intrinsic reward learning as an auxiliary task, with a resulting architecture achieving $50\\%$ faster wall-clock speed and building a stronger, shared representation. We test Deep Explore Options on hard and easy exploration games of the Atari Suite, following a benchmarking study to ensure fairness. Our results show that not only can they learn from multiple intrinsic rewards, they are a very strong alternative to a weighted sum of rewards, convincingly beating the baselines in 4 of the 6 tested environments, and with comparable performances in the other 2.", "keywords": "reinforcement learning;intrinsic motivation;auxiliary task learning;options;atari", "primary_area": "", "supplementary_material": "", "author": "Louis Bagot;Kevin Mets;Tom De Schepper;Peter Hellinckx;Steven Latre", "authorids": "~Louis_Bagot1;~Kevin_Mets1;tom.deschepper@uantwerpen.be;peter.hellinckx@uantwerpen.be;~Steven_Latre1", "gender": "M;M;;;M", "homepage": ";;;;https://www.uantwerpen.be/en/staff/steven-latre/", "dblp": ";;;;", "google_scholar": ";avinyLUAAAAJ;;;", "orcid": ";0000-0002-4812-4841;;;", "linkedin": "louis-bagot-32ba48152/;;;;", "or_profile": "~Louis_Bagot1;~Kevin_Mets1;tom.deschepper@uantwerpen.be;peter.hellinckx@uantwerpen.be;~Steven_Latre1", "aff": "University of Antwerp;University of Antwerp, IDLab, imec;;;University of Antwerp", "aff_domain": "uantwerpen.be;uantwerpen.be;;;uantwerpen.be", "position": "PhD student;Postdoc;;;Full Professor", "bibtex": "@misc{\nbagot2022deep,\ntitle={Deep Learning of Intrinsically Motivated Options in the Arcade Learning Environment},\nauthor={Louis Bagot and Kevin Mets and Tom De Schepper and Peter Hellinckx and Steven Latre},\nyear={2022},\nurl={https://openreview.net/forum?id=OzXAw20k_H}\n}", "github": "", "project": "", "reviewers": "jgim;yuhD;svsx;zfem", "site": "https://openreview.net/forum?id=OzXAw20k_H", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;2;3;4", "correctness": "2;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "228;85;79;80", "wc_summary_review": "117;76;47;88", "wc_main_review": "649;241;374;525", "wc_review": "994;402;500;693", "wc_reply_reviewers": "25;0;0;35", "wc_reply_authors": "363;313;389;354", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 
2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 118.0, 63.549193543270086 ], "wc_summary_review_avg": [ 82.0, 25.10975905897944 ], "wc_main_review_avg": [ 447.25, 153.82843527774702 ], "wc_review_avg": [ 647.25, 225.91964832656765 ], "wc_reply_reviewers_avg": [ 15.0, 15.411035007422441 ], "wc_reply_authors_avg": [ 354.75, 27.316432783216772 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16395956927318658105&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Antwerp", "aff_unique_dep": "", "aff_unique_url": "https://www.uantwerp.be", "aff_unique_abbr": "UA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Belgium" }, { "title": "Task-Induced Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6090", "id": "OzyXtIZAzFv", "poster": "", "openreview": "https://openreview.net/forum?id=OzyXtIZAzFv", "slides": "https://iclr.cc/virtual/2022/poster/6090", "video": "https://iclr.cc/virtual/2022/poster/6090", "author_site": "Jun Yamada, Karl Pertsch, Anisha Gunjal, Joseph Lim", "tldr": "", "abstract": "In this work, we evaluate the effectiveness of representation learning approaches for decision making in visually complex environments. Representation learning is essential for effective reinforcement learning (RL) from high-dimensional inputs. Unsupervised representation learning approaches based on reconstruction, prediction or contrastive learning have shown substantial learning efficiency gains. Yet, they have mostly been evaluated in clean laboratory or simulated settings. In contrast, real environments are visually complex and contain substantial amounts of clutter and distractors. Unsupervised representations will learn to model such distractors, potentially impairing the agent\u2019s learning efficiency. In contrast, an alternative class of approaches, which we call task-induced representation learning, leverages task information such as rewards or demonstrations from prior tasks to focus on task-relevant parts of the scene and ignore distractors. We investigate the effectiveness of unsupervised and task-induced representation learning approaches on four visually complex environments, from Distracting DMControl to the CARLA driving simulator. 
For both RL and imitation learning, we find that representation learning generally improves sample efficiency on unseen tasks even in visually complex scenes and that task-induced representations can double learning efficiency compared to unsupervised alternatives.", "keywords": "representation learning;reinforcement learning;transfer learning;visually complex observations", "primary_area": "", "supplementary_material": "/attachment/1d15687656fe3dec165b650bf7d614c58433b286.zip", "author": "Jun Yamada;Karl Pertsch;Anisha Gunjal;Joseph J Lim", "authorids": "~Jun_Yamada1;~Karl_Pertsch1;~Anisha_Gunjal1;~Joseph_J_Lim1", "gender": "M;;M;F", "homepage": "http://junjungoal.github.io;https://kpertsch.github.io/;http://people.csail.mit.edu/lim/;https://anisha2102.github.io/", "dblp": ";211/7137;08/3086;", "google_scholar": "ESeyBEEAAAAJ;https://scholar.google.com/citations?view_op=list_works;jTnQTBoAAAAJ;KNsZ1u4AAAAJ", "orcid": ";;;", "linkedin": ";;;anisha-gunjal/", "or_profile": "~Jun_Yamada1;~Karl_Pertsch1;~Joseph_J_Lim1;~Anisha_Gunjal3", "aff": "University of Oxford;University of Southern California;Korea Advanced Institute of Science & Technology;University of Southern California", "aff_domain": "ox.ac.uk;usc.edu;kaist.ac.kr;usc.edu", "position": "PhD student;PhD student;Associate Professor;Researcher", "bibtex": "@inproceedings{\nyamada2022taskinduced,\ntitle={Task-Induced Representation Learning},\nauthor={Jun Yamada and Karl Pertsch and Anisha Gunjal and Joseph J Lim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=OzyXtIZAzFv}\n}", "github": "", "project": "", "reviewers": "k7xA;TaF6;uzb1;s9b6", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;5", "correctness": "3;3;3;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "250;141;72;115", "wc_summary_review": "95;139;8;31", "wc_main_review": "288;909;162;283", "wc_review": "633;1189;242;429", "wc_reply_reviewers": "134;487;193;0", "wc_reply_authors": "834;1925;723;371", "reply_reviewers": "2;2;1;0", "reply_authors": "2;4;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 144.5, 65.7057836114904 ], "wc_summary_review_avg": [ 68.25, 51.81397012389612 ], "wc_main_review_avg": [ 410.5, 292.1972792481135 ], "wc_review_avg": [ 623.25, 354.70154707866726 ], "wc_reply_reviewers_avg": [ 203.5, 177.99227511327564 ], "wc_reply_authors_avg": [ 963.25, 580.9795069535586 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3635245913413414214&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=OzyXtIZAzFv", "email": "ox.ac.uk;usc.edu;kaist.ac.kr;usc.edu", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Oxford;University of Southern California;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://www.usc.edu;https://www.kaist.ac.kr", "aff_unique_abbr": 
"Oxford;USC;KAIST", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "United Kingdom;United States;South Korea" }, { "id": "P-gDXxGYCib", "title": "Feature Selection in the Contrastive Analysis Setting", "track": "main", "status": "Reject", "tldr": "", "abstract": "The goal of unsupervised feature selection is to select a small number of informative features for use in unknown downstream tasks. Here the definition of ``informative'' is subjective and dependent on the specifics of a given problem domain. In the contrastive analysis (CA) setting, machine learning practitioners are specifically interested in discovering patterns that are enriched in a target dataset as compared to a background dataset generated from sources of variation irrelevant to the task at hand. For example, a biomedical data analyst may wish to find a small set of genes to use as a proxy for variations in genomic data only present among patients with a given disease as opposed to healthy control subjects. However, as of yet the problem of unsupervised feature selection in the CA setting has received little attention from the machine learning community. In this work we present CFS (Contrastive Feature Selection), a method for performing feature selection in the CA setting. We experiment with multiple variations of our method on a semi-synthetic dataset and four real-world biomedical datasets, and we find that it consistently outperforms previous state-of-the-art methods designed for standard unsupervised feature selection scenarios. ", "keywords": "Feature selection;contrastive analysis;computational biology", "primary_area": "", "supplementary_material": "/attachment/44c3f38264d511431920234829703493b81fd4cb.zip", "author": "Ethan Weinberger;Ian Connick Covert;Su-In Lee", "authorids": "~Ethan_Weinberger2;~Ian_Connick_Covert1;~Su-In_Lee2", "gender": "M;M;F", "homepage": "https://homes.cs.washington.edu/~ewein/;https://iancovert.com;http://suinlee.cs.washington.edu/", "dblp": "217/3451;262/3443;17/1784", "google_scholar": "Jg40o3gAAAAJ;Np8Ek3cAAAAJ;", "orcid": ";;", "linkedin": ";ian-covert/;", "or_profile": "~Ethan_Weinberger2;~Ian_Connick_Covert1;~Su-In_Lee2", "aff": "Genentech Inc.;University of Washington;University of Washington", "aff_domain": "gene.com;uw.edu;uw.edu", "position": "Intern;PhD student;Assistant Professor", "bibtex": "@misc{\nweinberger2022feature,\ntitle={Feature Selection in the Contrastive Analysis Setting},\nauthor={Ethan Weinberger and Ian Connick Covert and Su-In Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=P-gDXxGYCib}\n}", "github": "", "project": "", "reviewers": "Mgox;ft7b;UFq8;vsNM", "site": "https://openreview.net/forum?id=P-gDXxGYCib", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;2;4;4", "correctness": "2;2;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "171;46;85;73", "wc_summary_review": "20;32;57;33", "wc_main_review": "372;165;153;267", "wc_review": "563;243;295;373", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "638;742;301;336", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 93.75, 46.78341052125208 ], "wc_summary_review_avg": [ 35.5, 
13.425721582097552 ], "wc_main_review_avg": [ 239.25, 88.52224296751636 ], "wc_review_avg": [ 368.5, 121.45266567679772 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 504.25, 189.75823434043647 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": 0.8866206949335731, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12145438801910128468&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Genentech;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.gene.com;https://www.washington.edu", "aff_unique_abbr": "Genentech;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "HTLM: Hyper-Text Pre-Training and Prompting of Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6281", "id": "P-pPW1nxf1r", "poster": "", "openreview": "https://openreview.net/forum?id=P-pPW1nxf1r", "slides": "https://iclr.cc/virtual/2022/poster/6281", "video": "https://iclr.cc/virtual/2022/poster/6281", "author_site": "Armen Aghajanyan, Dmytro Okhonko, Mike Lewis, Mandar Joshi, Hu Xu, Gargi Ghosh, Luke Zettlemoyer", "tldr": "", "abstract": "We introduce HTLM, a hyper-text language model trained on a large-scale web crawl. Modeling hyper-text has a number of advantages: (1) it is easily gathered at scale, (2) it provides rich document-level and end-task-adjacent supervision (e.g. 'class' and 'id' attributes often encode document category information), and (3) it allows for new structured prompting that follows the established semantics of HTML (e.g. to do zero-shot summarization by infilling '<title>' tags for a webpage that contains the input text). We show that pretraining with a BART-style denoising loss directly on simplified HTML provides highly effective transfer for a wide range of end tasks and supervision levels. HTLM matches or exceeds the performance of comparably sized text-only LMs for zero-shot prompting and fine-tuning for classification benchmarks, while also setting new state-of-the-art performance levels for zero-shot summarization. We also find that hyper-text prompts provide more value to HTLM, in terms of data efficiency, than plain text prompts do for existing LMs, and that HTLM is highly effective at auto-prompting itself, by simply generating the most likely hyper-text formatting for any available training data. We will release all code and models to support future HTLM research. 
", "keywords": "prompting;nlp;representational learning;priming", "primary_area": "", "supplementary_material": "", "author": "Armen Aghajanyan;Dmytro Okhonko;Mike Lewis;Mandar Joshi;Hu Xu;Gargi Ghosh;Luke Zettlemoyer", "authorids": "~Armen_Aghajanyan1;oxo@fb.com;~Mike_Lewis1;~Mandar_Joshi1;huxu@fb.com;gghosh@fb.com;~Luke_Zettlemoyer1", "gender": ";;M;;;;M", "homepage": ";;;https://homes.cs.washington.edu/~mandar90;;;https://www.cs.washington.edu/people/faculty/lsz/", "dblp": ";;19/6214;85/1261;;;21/6793", "google_scholar": ";;SnQnQicAAAAJ;;;;https://scholar.google.com.tw/citations?user=UjpbO6IAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;luke-zettlemoyer-a0109b226/", "or_profile": "~Armen_Aghajanyan1;oxo@fb.com;~Mike_Lewis1;~Mandar_Joshi1;huxu@fb.com;gghosh@fb.com;~Luke_Zettlemoyer1", "aff": ";;Facebook AI Research;Department of Computer Science, University of Washington;;;Meta", "aff_domain": ";;fb.com;cs.washington.edu;;;meta.com", "position": ";;Research Scientist;PhD student;;;Researcher", "bibtex": "@inproceedings{\naghajanyan2022htlm,\ntitle={{HTLM}: Hyper-Text Pre-Training and Prompting of Language Models},\nauthor={Armen Aghajanyan and Dmytro Okhonko and Mike Lewis and Mandar Joshi and Hu Xu and Gargi Ghosh and Luke Zettlemoyer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=P-pPW1nxf1r}\n}", "github": "", "project": "", "reviewers": "36N9;2QhH;srNC;Pgkk", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "59;137;170;78", "wc_summary_review": "72;79;97;37", "wc_main_review": "591;519;387;122", "wc_review": "722;735;654;237", "wc_reply_reviewers": "0;511;0;0", "wc_reply_authors": "282;502;188;52", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 111.0, 44.58138625031752 ], "wc_summary_review_avg": [ 71.25, 21.775846711436962 ], "wc_main_review_avg": [ 404.75, 178.88875733259482 ], "wc_review_avg": [ 587.0, 204.40034246546654 ], "wc_reply_reviewers_avg": [ 127.75, 221.26949066692407 ], "wc_reply_authors_avg": [ 256.0, 163.88410539158457 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11398563745498016990&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=P-pPW1nxf1r", "email": ";;fb.com;cs.washington.edu;;;meta.com", "author_num": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "Meta;University of Washington", "aff_unique_dep": "Facebook AI Research;Department of Computer Science", "aff_unique_url": "https://research.facebook.com;https://www.washington.edu", "aff_unique_abbr": "FAIR;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Explaining Point Processes by Learning Interpretable Temporal Logic Rules", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6151", "id": "P07dq7iSAGr", "poster": "", "openreview": "https://openreview.net/forum?id=P07dq7iSAGr", "slides": "https://iclr.cc/virtual/2022/poster/6151", "video": "https://iclr.cc/virtual/2022/poster/6151", "author_site": "Shuang Li, Mingquan Feng, Lu Wang, Abdelmajid Essofi, Yufeng Cao, Junchi Yan, Le Song", "tldr": "", "abstract": "We propose a principled method to learn a set of human-readable logic rules to explain temporal point processes. \nWe assume that the generative mechanisms underlying the temporal point processes are governed by a set of first-order temporal logic rules, as a compact representation of domain knowledge. Our method formulates the rule discovery process from noisy event data as a maximum likelihood problem, and designs an efficient and tractable branch-and-price algorithm to progressively search for new rules and expand existing rules. The proposed algorithm alternates between the rule generation stage and the rule evaluation stage, and uncovers the most important collection of logic rules within a fixed time limit for both synthetic and real event data. In a real healthcare application, we also had human experts (i.e., doctors) verify the learned temporal logic rules and provide further improvements. These expert-revised interpretable rules lead to a point process model which outperforms previous state-of-the-arts for symptom prediction, both in their occurrence times and types. ", "keywords": "Temporal Point Process;Temporal Logic Rules;Explainable Models", "primary_area": "", "supplementary_material": "/attachment/790bfaeabe127324a6bbb70801920d20370de312.zip", "author": "Shuang Li;Mingquan Feng;Lu Wang;Abdelmajid Essofi;Yufeng Cao;Junchi Yan;Le Song", "authorids": "~Shuang_Li3;~Mingquan_Feng1;~Lu_Wang11;~Abdelmajid_Essofi1;~Yufeng_Cao1;~Junchi_Yan2;~Le_Song1", "gender": "F;;F;M;M;;M", "homepage": "https://shuangli01.github.io;;https://scholar.google.com/citations?user=hqlU92YAAAAJ&hl=en;https://github.com/MjidHub;https://acem.sjtu.edu.cn/en/faculty/caoyufeng.html;;http://www.cc.gatech.edu/~lsong", "dblp": "43/6294-2;;49/3800-3.html;;;;94/3481", "google_scholar": "https://scholar.google.com/citations?hl=en;;hqlU92YAAAAJ;;;;Xl4E0CsAAAAJ", "orcid": ";;0009-0003-6788-2204;;;;", "linkedin": ";;;;;;", "or_profile": "~Shuang_Li3;~Mingquan_Feng1;~Lu_Wang11;~Abdelmajid_Essofi1;~Yufeng_Cao1;~Junchi_Yan2;~Le_Song1", "aff": "The Chinese University of Hong Kong (Shenzhen);;Microsoft;Mohamed Bin Zayed University of Artificial Intelligence;Shanghai Jiaotong University;;College of Computing, Georgia Institute of Technology", "aff_domain": "cuhk.edu.cn;;microsoft.com;mbzuai.ac.ae;sjtu.edu.cn;;cc.gatech.edu", "position": "Assistant Professor;;Researcher;MS student;Assistant Professor;;Associate Professor", "bibtex": "@inproceedings{\nli2022explaining,\ntitle={Explaining Point Processes by Learning Interpretable Temporal Logic Rules},\nauthor={Shuang Li and Mingqaun Feng and Lu Wang and Abdelmajid Essofi and Yufeng Cao and Junchi Yan and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=P07dq7iSAGr}\n}", "github": "", "project": "", "reviewers": "AiEu;GzHz;sdkz;BWwC", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;2;2;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "73;56;48;113", "wc_summary_review": "90;57;7;24", "wc_main_review": "1229;143;54;269", "wc_review": "1392;256;109;406", 
"wc_reply_reviewers": "80;18;0;79", "wc_reply_authors": "1524;426;15;482", "reply_reviewers": "1;1;0;1", "reply_authors": "4;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.5, 25.064915718988566 ], "wc_summary_review_avg": [ 44.5, 31.83158808479401 ], "wc_main_review_avg": [ 423.75, 471.14508115865965 ], "wc_review_avg": [ 540.75, 502.5621230256017 ], "wc_reply_reviewers_avg": [ 44.25, 35.82160660830276 ], "wc_reply_authors_avg": [ 611.75, 556.6975727448432 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10807537001313527816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=P07dq7iSAGr", "email": "cuhk.edu.cn;;microsoft.com;mbzuai.ac.ae;sjtu.edu.cn;;cc.gatech.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Chinese University of Hong Kong;Microsoft;Mohamed bin Zayed University of Artificial Intelligence;Shanghai Jiao Tong University;Georgia Institute of Technology", "aff_unique_dep": ";Microsoft Corporation;;;College of Computing", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.microsoft.com;https://www.mbzuai.ac.ae;https://www.sjtu.edu.cn;https://www.gatech.edu", "aff_unique_abbr": "CUHK;Microsoft;MBZUAI;SJTU;Georgia Tech", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Shenzhen;;Atlanta", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "China;United States;United Arab Emirates" }, { "id": "P0EholD6_G", "title": "On Hard Episodes in Meta-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing meta-learners primarily focus on improving the average task accuracy across multiple episodes. Different episodes, however, may vary in hardness and quality leading to a wide gap in the meta-learner's performance across episodes. Understanding this issue is particularly critical in industrial few-shot settings, where there is limited control over test episodes as they are typically uploaded by end-users. In this paper, we empirically analyse the behaviour of meta-learners on episodes of varying hardness across three standard benchmark datasets: CIFAR-FS, mini-ImageNet, and tiered-ImageNet. Surprisingly, we observe a wide gap in accuracy of around $50\\%$ between the hardest and easiest episodes across all the standard benchmarks and meta-learners. We additionally investigate various properties of hard episodes and highlight their connection to catastrophic forgetting during meta-training. To address the issue of sub-par performance on hard episodes, we investigate and benchmark different meta-training strategies based on adversarial training and curriculum learning. 
We find that adversarial training strategies are much more powerful than curriculum learning in improving the prediction performance on hard episodes.", "keywords": "meta-learning;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Samyadeep Basu;Amr Sharaf;Nicolo Fusi;Soheil Feizi", "authorids": "~Samyadeep_Basu1;~Amr_Sharaf1;~Nicolo_Fusi1;~Soheil_Feizi2", "gender": "M;M;M;M", "homepage": "https://samyadeepbasu.github.io/;http://cs.umd.edu/~amr;;https://www.cs.umd.edu/~sfeizi/", "dblp": "250/9138;159/1156;86/10995;57/2132", "google_scholar": "6aRwDecAAAAJ;It3Gm1EAAAAJ;GldD-lwAAAAJ;lptAmrMAAAAJ", "orcid": ";;;", "linkedin": ";amrsharaf/;;", "or_profile": "~Samyadeep_Basu1;~Amr_Sharaf1;~Nicolo_Fusi1;~Soheil_Feizi2", "aff": "Microsoft;Microsoft;Microsoft;University of Maryland, College Park", "aff_domain": "microsoft.com;microsoft.com;microsoft.com;umd.edu", "position": "Applied Scientist;Researcher;Researcher;Assistant Professor", "bibtex": "@misc{\nbasu2022on,\ntitle={On Hard Episodes in Meta-Learning},\nauthor={Samyadeep Basu and Amr Sharaf and Nicolo Fusi and Soheil Feizi},\nyear={2022},\nurl={https://openreview.net/forum?id=P0EholD6_G}\n}", "github": "", "project": "", "reviewers": "uYEr;NN1B;tHng;ceM1", "site": "https://openreview.net/forum?id=P0EholD6_G", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "70;101;169;71", "wc_summary_review": "26;342;87;40", "wc_main_review": "743;1013;529;386", "wc_review": "839;1456;785;497", "wc_reply_reviewers": "205;420;62;56", "wc_reply_authors": "334;1126;1082;825", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.75, 40.226701331329664 ], "wc_summary_review_avg": [ 123.75, 128.0163563768318 ], "wc_main_review_avg": [ 667.75, 236.37615679251576 ], "wc_review_avg": [ 894.25, 349.4133476271334 ], "wc_reply_reviewers_avg": [ 185.75, 147.81132399109345 ], "wc_reply_authors_avg": [ 841.75, 314.8844033927371 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5779327474061171269&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Microsoft;University of Maryland", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www/umd.edu", "aff_unique_abbr": "Microsoft;UMD", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Topologically Regularized Data Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6844", "id": "P1QUVhOtEFP", "poster": "", "openreview": "https://openreview.net/forum?id=P1QUVhOtEFP", "slides": "https://iclr.cc/virtual/2022/poster/6844", "video": "https://iclr.cc/virtual/2022/poster/6844", "author_site": "Robin Vandaele, Bo Kang, Jefrey Lijffijt, Tijl De Bie, Yvan Saeys", "tldr": "", "abstract": "Unsupervised feature 
learning often finds low-dimensional embeddings that capture the structure of complex data. For tasks for which prior expert topological knowledge is available, incorporating this into the learned representation may lead to higher quality embeddings. For example, this may help one to embed the data into a given number of clusters, or to accommodate noise that prevents one from deriving the distribution of the data over the model directly, which can then be learned more effectively. However, a general tool for integrating different prior topological knowledge into embeddings is lacking. Although differentiable topology layers have been recently developed that can (re)shape embeddings into prespecified topological models, they have two important limitations for representation learning, which we address in this paper. First, the currently suggested topological losses fail to represent simple models such as clusters and flares in a natural manner. Second, these losses neglect all original structural (such as neighborhood) information in the data that is useful for learning. We overcome these limitations by introducing a new set of topological losses, and proposing their usage as a way to topologically regularize data embeddings so that they naturally represent a prespecified model. We include thorough experiments on synthetic and real data that highlight the usefulness and versatility of this approach, with applications ranging from modeling high-dimensional single-cell data, to graph embedding.", "keywords": "Embedding;Dimensionality Reduction;Topological Data Analysis;Persistent Homology;Optimization;Regularization", "primary_area": "", "supplementary_material": "/attachment/a25b264664ba40dc2024eb6ce17f9cfd303ceeca.zip", "author": "Robin Vandaele;Bo Kang;Jefrey Lijffijt;Tijl De Bie;Yvan Saeys", "authorids": "~Robin_Vandaele1;~Bo_Kang1;~Jefrey_Lijffijt1;~Tijl_De_Bie1;~Yvan_Saeys1", "gender": "M;M;M;M;M", "homepage": "https://users.ugent.be/~rvdaele/;http://users.ugent.be/~bkang/;http://users.ugent.be/~jlijffij/;http://www.tijldebie.net;", "dblp": ";26/4533;62/8320;49/2018;s/YvanSaeys", "google_scholar": ";_DxP-KUAAAAJ;https://scholar.google.be/citations?user=cBSEeSMAAAAJ;https://scholar.google.be/citations?user=eH_c4R4AAAAJ;1GNeT0cAAAAJ", "orcid": ";0000-0002-9895-9927;0000-0002-2930-5057;0000-0002-2692-7504;0000-0002-0415-1506", "linkedin": "rvdaele/;kangbo/;jefrey/;tijldebie/;", "or_profile": "~Robin_Vandaele1;~Bo_Kang1;~Jefrey_Lijffijt1;~Tijl_De_Bie1;~Yvan_Saeys1", "aff": "Ghent University;Ghent University;Ghent University;Ghent University;Ghent University", "aff_domain": "ugent.be;ugent.be;ugent.be;ugent.be;ugent.be", "position": "Postdoc;Postdoc;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nvandaele2022topologically,\ntitle={Topologically Regularized Data Embeddings},\nauthor={Robin Vandaele and Bo Kang and Jefrey Lijffijt and Tijl De Bie and Yvan Saeys},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=P1QUVhOtEFP}\n}", "github": "", "project": "", "reviewers": "c3ai;fHUg;8muq;ey6b", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;2;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "111;30;65;158", "wc_summary_review": "50;56;33;94", "wc_main_review": "574;324;187;791", "wc_review": "735;410;285;1043", "wc_reply_reviewers": "55;0;34;177", "wc_reply_authors": "1058;1129;444;2215", "reply_reviewers": 
"1;0;1;1", "reply_authors": "3;3;2;4", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 91.0, 48.181946826586405 ], "wc_summary_review_avg": [ 58.25, 22.29770167528483 ], "wc_main_review_avg": [ 469.0, 231.9795249585618 ], "wc_review_avg": [ 618.25, 295.1553616318023 ], "wc_reply_reviewers_avg": [ 66.5, 66.7476591349839 ], "wc_reply_authors_avg": [ 1211.5, 637.659195809172 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4782087340017155266&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=P1QUVhOtEFP", "email": "ugent.be;ugent.be;ugent.be;ugent.be;ugent.be", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Ghent University", "aff_unique_dep": "", "aff_unique_url": "https://www.ugent.be/en", "aff_unique_abbr": "UGent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Belgium" }, { "id": "P1zfguZHowl", "title": "Robust Losses for Learning Value Functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most value function learning algorithms in reinforcement learning are based on the mean squared (projected) Bellman error. However, squared errors are known to be sensitive to outliers, both skewing the solution of the objective and resulting in high-magnitude and high-variance gradients. Typical strategies to control these high-magnitude updates in RL involve clipping gradients, clipping rewards, rescaling rewards, and clipping errors. Clipping errors is related to using robust losses, like the Huber loss, but as yet no work explicitly formalizes and derives value learning algorithms with robust losses. In this work, we build on recent insights reformulating squared Bellman errors as a saddlepoint optimization problem, and propose a saddlepoint reformulation for a Huber Bellman error and Absolute Bellman error. We show that the resulting solutions have significantly lower error for certain problems and are otherwise comparable, in terms of both absolute and squared value error. 
We show that the resulting gradient-based algorithms are more robust, for both prediction and control, with less stepsize sensitivity.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Andrew Patterson;Victor Liao;Martha White", "authorids": "~Andrew_Patterson1;~Victor_Liao1;~Martha_White1", "gender": "M;;F", "homepage": "https://andnp.github.io;https://victorliao.com;http://marthawhite.ca", "dblp": "41/467;;60/7057", "google_scholar": "jd2nCqYAAAAJ;;t5zdD_IAAAAJ", "orcid": ";;0000-0002-5356-2950", "linkedin": ";;", "or_profile": "~Andrew_Patterson1;~Victor_Liao1;~Martha_White1", "aff": "University of Alberta;University of Waterloo;University of Alberta", "aff_domain": "ualberta.ca;uwaterloo.ca;ualberta.ca", "position": "PhD student;Undergrad student;Associate Professor", "bibtex": "@misc{\npatterson2022robust,\ntitle={Robust Losses for Learning Value Functions},\nauthor={Andrew Patterson and Victor Liao and Martha White},\nyear={2022},\nurl={https://openreview.net/forum?id=P1zfguZHowl}\n}", "github": "", "project": "", "reviewers": "78BK;9FeJ;xg5t;VsHs", "site": "https://openreview.net/forum?id=P1zfguZHowl", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;2;2;4", "correctness": "4;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "20;33;51;261", "wc_summary_review": "74;38;2;100", "wc_main_review": "26;66;134;447", "wc_review": "120;137;187;808", "wc_reply_reviewers": "52;0;559;25", "wc_reply_authors": "618;397;1686;518", "reply_reviewers": "1;0;2;1", "reply_authors": "2;1;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 91.25, 98.6214352967954 ], "wc_summary_review_avg": [ 53.5, 36.99662146737185 ], "wc_main_review_avg": [ 168.25, 165.50283230204855 ], "wc_review_avg": [ 313.0, 286.84752047037114 ], "wc_reply_reviewers_avg": [ 159.0, 231.67110307502747 ], "wc_reply_authors_avg": [ 804.75, 514.7724618702908 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16793519689967745448&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Alberta;University of Waterloo", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;https://uwaterloo.ca", "aff_unique_abbr": "UAlberta;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "X-model: Improving Data Efficiency in Deep Learning with A Minimax Model", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6998", "id": "P3Bh01hBYTH", "poster": "", "openreview": "https://openreview.net/forum?id=P3Bh01hBYTH", "slides": "https://iclr.cc/virtual/2022/poster/6998", "video": "https://iclr.cc/virtual/2022/poster/6998", "author_site": "Ximei Wang, Xinyang Chen, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "To mitigate the burden of data labeling, we aim at improving data efficiency for both classification and regression setups 
in deep learning. However, the current focus is on classification problems, while little attention has been paid to deep regression, which usually requires more human effort for labeling. Further, due to the intrinsic difference between categorical and continuous label space, the common intuitions for classification, \\textit{e.g.} cluster assumptions or pseudo labeling strategies, cannot be naturally adapted to deep regression. To this end, we first delved into the existing data-efficient methods in deep learning and found that they either encourage invariance to \\textit{data stochasticity} (\\textit{e.g.}, consistency regularization under different augmentations) or \\textit{model stochasticity} (\\textit{e.g.}, difference penalty for predictions of models with different dropout). To combine the power of both worlds, we propose a novel \\Chi-model by simultaneously encouraging the invariance to {data stochasticity} and {model stochasticity}. Further, the \\Chi-model plays a minimax game between the feature extractor and task-specific heads to further enhance the invariance to model stochasticity. Extensive experiments verify the superiority of the \\Chi-model across various tasks, from a single-value prediction task of age estimation to a dense-value prediction task of keypoint localization, a 2D synthetic and a 3D realistic dataset, as well as a multi-category object recognition task.", "keywords": "Data Efficiency;Deep Learning;Minimax Model", "primary_area": "", "supplementary_material": "/attachment/36a9fce1ebd67825186b697a60cc53377f6f7df3.zip", "author": "Ximei Wang;Xinyang Chen;Jianmin Wang;Mingsheng Long", "authorids": "~Ximei_Wang1;~Xinyang_Chen1;~Jianmin_Wang1;~Mingsheng_Long5", "gender": "M;Not Specified;M;M", "homepage": "https://wxm17.github.io/;https://chenxinyang123.github.io/;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": "89/8876;242/3871-1;06/3456-1.html;74/9023", "google_scholar": "WmOCCVgAAAAJ;qVxhGWUAAAAJ;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;_MjXpXkAAAAJ", "orcid": ";0000-0001-6743-838X;0000-0001-6841-7943;0000-0002-5412-9120", "linkedin": ";;;", "or_profile": "~Ximei_Wang1;~Xinyang_Chen1;~Jianmin_Wang1;~Mingsheng_Long2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2022xmodel,\ntitle={X-model: Improving Data Efficiency in Deep Learning with A Minimax Model},\nauthor={Ximei Wang and Xinyang Chen and Jianmin Wang and Mingsheng Long},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=P3Bh01hBYTH}\n}", "github": "", "project": "", "reviewers": "DfEj;6Dgh;3Tfv", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;4", "correctness": "3;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "29;67;48", "wc_summary_review": "42;39;45", "wc_main_review": "317;367;141", "wc_review": "388;473;234", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1072;676;356", "reply_reviewers": "0;0;0", "reply_authors": "3;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": 
[ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 48.0, 15.513435037626794 ], "wc_summary_review_avg": [ 42.0, 2.449489742783178 ], "wc_main_review_avg": [ 275.0, 96.92608867929556 ], "wc_review_avg": [ 365.0, 98.91747402085572 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 701.3333333333334, 292.8541540691468 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12000859868342305467&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=P3Bh01hBYTH", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "P3SQi2EWeR", "title": "Integrating Large Circular Kernels into CNNs through Neural Architecture Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The square kernel is a standard unit for contemporary Convolutional Neural Networks (CNNs), as it fits well on the tensor computation for the convolution operation. However, the retinal ganglion cells in the biological visual system have approximately concentric receptive fields. Motivated by this observation, we propose using the circular kernel with a concentric and isotropic receptive field as an option for convolution operation. We first substitute the $3 \\times 3$ square kernels with the corresponding circular kernels or our proposed integrated kernels in the typical ResNet architecture, and the modified models after training yield similar or even competitive performance. We then show the advantages of large circular kernels over the corresponding square kernels in that the difference and the improvement are more distinct. Hence, we speculate that large circular kernels would help find advanced neural network models by the Neural Architecture Search (NAS). To validate our hypothesis, we expand the operation space in several typical NAS methods with convolutions of large circular kernels. \nExperimental results show that the searched new neural network models contain large circular kernels and significantly outperform the original searched models. The additional empirical analysis also reveals that the large circular kernel help the model to be more robust to rotated or sheared images due to its rotation invariance. ", "keywords": "circular kernel;Convolutional Neural Network;Neural Architecture Search;operation space;large kernel", "primary_area": "", "supplementary_material": "", "author": "Kun He;Chao Li;Yixiao Yang;Gao Huang;John E. 
Hopcroft", "authorids": "~Kun_He1;~Chao_Li14;~Yixiao_Yang2;~Gao_Huang1;~John_E._Hopcroft1", "gender": "F;M;M;M;", "homepage": "http://faculty.hust.edu.cn/hekun/zh_CN/more/1411001/jsjjgd/index.htm;https://github.com/lichaoaaron;http://www.gaohuang.net;http://www.cs.cornell.edu/jeh/;", "dblp": "59/1028-1;;;h/JohnEHopcroft;", "google_scholar": "YTQnGJsAAAAJ;Wik8bkIAAAAJ;-P9LwcgAAAAJ;4Z6vo5QAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0001-7627-4604;0000-0001-9066-1440;;0000-0001-8681-6075;", "linkedin": ";;;;", "or_profile": "~Kun_He1;~Chao_Li14;~Gao_Huang1;~John_E._Hopcroft1;~yixiao_yang1", "aff": "Huazhong University of Sceince and Technology;Huazhong University of Science and Technology;Tsinghua University;Department of Computer Science, Cornell University;Huazhong University of Science and Technology, Tsinghua University", "aff_domain": "hust.edu.cn;hust.edu.cn;tsinghua.edu.cn;cs.cornell.edu;hust.edu.cn", "position": "Full Professor;PhD student;Associate Professor;Full Professor;MS student", "bibtex": "@misc{\nhe2022integrating,\ntitle={Integrating Large Circular Kernels into {CNN}s through Neural Architecture Search},\nauthor={Kun He and Chao Li and Yixiao Yang and Gao Huang and John E. Hopcroft},\nyear={2022},\nurl={https://openreview.net/forum?id=P3SQi2EWeR}\n}", "github": "", "project": "", "reviewers": "24as;DTEL;GBeZ;njtf", "site": "https://openreview.net/forum?id=P3SQi2EWeR", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "4;4;3;3", "correctness": "2;2;3;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "45;55;98;68", "wc_summary_review": "29;44;34;23", "wc_main_review": "262;100;192;241", "wc_review": "336;199;324;332", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 66.5, 19.93113142799475 ], "wc_summary_review_avg": [ 32.5, 7.697402159170326 ], "wc_main_review_avg": [ 198.75, 62.41544280064029 ], "wc_review_avg": [ 297.75, 57.17680910998794 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.911322376865767, "corr_recommendation_correctness": 0.911322376865767, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14792686816695857035&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Huazhong University of Science and Technology;Tsinghua University;Cornell University", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "http://www.hust.edu.cn;https://www.tsinghua.edu.cn;https://www.cornell.edu", "aff_unique_abbr": "HUST;THU;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "P6OUJ2XziC", "title": "NeuRL: Closed-form Inverse Reinforcement Learning for Neural Decoding", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Current neural decoding methods typically aim at explaining behavior based on neural activity via supervised learning. 
However, since there is generally a strong connection between subjects' learning and their expectations of long-term rewards, we propose NeuRL, an inverse reinforcement learning approach that (1) extracts an intrinsic reward function from collected trajectories of a subject in closed form, (2) maps neural signals to this intrinsic reward to account for long-term dependencies in the behavior, and (3) predicts the simulated behavior for unseen neural signals by extracting Q-values and the corresponding Boltzmann policy based on the intrinsic reward values for these signals. We show that NeuRL leads to better generalization and improved decoding performance compared to supervised approaches. We study the behavior of rats in a response-preparation task and evaluate the performance of NeuRL on simulated inhibition and per-trial behavior prediction. By assigning clear functional roles to defined neuronal populations, our approach offers a new interpretation tool for complex neuronal data with testable predictions. Furthermore, in per-trial behavior prediction, our approach improves accuracy by up to 15% compared to traditional methods.", "keywords": "Inverse Reinforcement Learning;Neural Decoding;Computational Biology", "primary_area": "", "supplementary_material": "/attachment/6d4f3b39ba5878e9f531a06380731d91a6e7ffea.zip", "author": "Gabriel Kalweit;Maria Kalweit;Mansour Alyahyay;Zoe Jaeckel;Florian Steenbergen;Stefanie Hardung;Thomas Brox;Ilka Diester;Joschka Boedecker", "authorids": "~Gabriel_Kalweit1;kalweitm@cs.uni-freiburg.de;mansour.alyahyay@biologie.uni-freiburg.de;zoe.jaeckel@biologie.uni-freiburg.de;florian.steenbergen@googlemail.com;stefanie.hardung@biologie.uni-freiburg.de;~Thomas_Brox1;ilka.diester@biologie.uni-freiburg.de;~Joschka_Boedecker1", "gender": ";;;;;;M;;", "homepage": "https://nr.informatik.uni-freiburg.de/people/gabriel-kalweit;;;;;;https://lmb.informatik.uni-freiburg.de/people/brox/index.en.html;;", "dblp": "208/0991;;;;;;97/4586;;", "google_scholar": "xLNPWK8AAAAJ;;;;;;https://scholar.google.com/citations?hl=de;;", "orcid": ";;;;;;0000-0002-6282-8861;;", "linkedin": ";;;;;;;;", "or_profile": "~Gabriel_Kalweit1;kalweitm@cs.uni-freiburg.de;mansour.alyahyay@biologie.uni-freiburg.de;zoe.jaeckel@biologie.uni-freiburg.de;florian.steenbergen@googlemail.com;stefanie.hardung@biologie.uni-freiburg.de;~Thomas_Brox1;ilka.diester@biologie.uni-freiburg.de;~Joschka_Boedecker1", "aff": "Universit\u00e4t Freiburg;;;;;;University of Freiburg;;", "aff_domain": "uni-freiburg.de;;;;;;uni-freiburg.de;;", "position": "PhD student;;;;;;Full Professor;;", "bibtex": "@misc{\nkalweit2022neurl,\ntitle={Neu{RL}: Closed-form Inverse Reinforcement Learning for Neural Decoding},\nauthor={Gabriel Kalweit and Maria Kalweit and Mansour Alyahyay and Zoe Jaeckel and Florian Steenbergen and Stefanie Hardung and Thomas Brox and Ilka Diester and Joschka Boedecker},\nyear={2022},\nurl={https://openreview.net/forum?id=P6OUJ2XziC}\n}", "github": "", "project": "", "reviewers": "h3mF;wD3d;pXEp", "site": "https://openreview.net/forum?id=P6OUJ2XziC", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;3", "correctness": "2;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "171;68;69", "wc_summary_review": "61;55;47", "wc_main_review": "691;158;658", "wc_review": "923;281;774", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ],
"confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 102.66666666666667, 48.32068800098865 ], "wc_summary_review_avg": [ 54.333333333333336, 5.734883511361751 ], "wc_main_review_avg": [ 502.3333333333333, 243.85287003072617 ], "wc_review_avg": [ 659.3333333333334, 274.35054624006534 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10006518147652252542&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "University of Freiburg", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-freiburg.de", "aff_unique_abbr": "Uni Freiburg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "The Spectral Bias of Polynomial Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6710", "id": "P7FLfMLTSEX", "poster": "", "openreview": "https://openreview.net/forum?id=P7FLfMLTSEX", "slides": "https://iclr.cc/virtual/2022/poster/6710", "video": "https://iclr.cc/virtual/2022/poster/6710", "author_site": "Moulik Choraria, Leello Dadi, Grigorios Chrysos, Julien Mairal, Volkan Cevher", "tldr": "", "abstract": "Polynomial neural networks (PNNs) have been recently shown to be particularly effective at image generation and face recognition, where high-frequency information is critical. Previous studies have revealed that neural networks demonstrate a $\\text{\\it{spectral bias}}$ towards low-frequency functions, which yields faster learning of low-frequency components during training. Inspired by such studies, we conduct a spectral analysis of the Neural Tangent Kernel (NTK) of PNNs. We find that the $\\Pi$-Net family, i.e., a recently proposed parametrization of PNNs, speeds up the learning of the higher frequencies. \nWe verify the theoretical bias through extensive experiments. We expect our analysis to provide novel insights into designing architectures and learning frameworks by incorporating multiplicative interactions via polynomials. 
\n", "keywords": "Deep Neural Networks;Polynomials;Spectral Bias;Neural Tangent Kernel;Deep Image Prior;Infinite Width;Mercer Decomposition", "primary_area": "", "supplementary_material": "", "author": "Moulik Choraria;Leello Tadesse Dadi;Grigorios Chrysos;Julien Mairal;Volkan Cevher", "authorids": "~Moulik_Choraria1;~Leello_Tadesse_Dadi1;~Grigorios_Chrysos1;~Julien_Mairal1;~Volkan_Cevher1", "gender": ";M;M;;M", "homepage": ";;https://grigorisg9gr.github.io/;http://julien.mairal.org;http://lions.epfl.ch", "dblp": "258/0834.html;314/6241;75/6117-2;49/6555;70/5301", "google_scholar": "Gz7PN8oAAAAJ;bhAxvCIAAAAJ;1bU041kAAAAJ;https://scholar.google.fr/citations?user=Bx9WGD6lBFEC;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": "0000-0001-7609-3563;0000-0003-2580-4913;;;", "linkedin": ";;;;", "or_profile": "~Moulik_Choraria1;~Leello_Tadesse_Dadi1;~Grigorios_Chrysos1;~Julien_Mairal1;~Volkan_Cevher1", "aff": "University of Illinois, Urbana-Champaign;EPFL;Swiss Federal Institute of Technology Lausanne;Inria;Swiss Institute of Technology", "aff_domain": "uiuc.edu;epfl.ch;epfl.ch;inria.fr;epfl.ch", "position": "PhD student;PhD student;Postdoc;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nchoraria2022the,\ntitle={The Spectral Bias of Polynomial Neural Networks},\nauthor={Moulik Choraria and Leello Tadesse Dadi and Grigorios Chrysos and Julien Mairal and Volkan Cevher},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=P7FLfMLTSEX}\n}", "github": "", "project": "", "reviewers": "FuRi;cEcf;nFMY;TQnp;KnZp", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "4;3;2;3;3", "correctness": "3;4;3;3;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "3;3;2;2;2", "wc_summary_paper": "51;34;119;50;61", "wc_summary_review": "53;38;31;73;65", "wc_main_review": "305;109;81;303;414", "wc_review": "409;181;231;426;540", "wc_reply_reviewers": "79;0;0;0;165", "wc_reply_authors": "1029;481;340;592;488", "reply_reviewers": "1;0;0;0;1", "reply_authors": "3;2;2;2;2", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 63.0, 29.305289624912426 ], "wc_summary_review_avg": [ 52.0, 15.798734126505199 ], "wc_main_review_avg": [ 242.4, 127.18742076164608 ], "wc_review_avg": [ 357.4, 132.5180742389505 ], "wc_reply_reviewers_avg": [ 48.8, 65.66399317738757 ], "wc_reply_authors_avg": [ 586.0, 235.55466456854552 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 2.2, 0.39999999999999997 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3227486121839514, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2780414467030512954&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=P7FLfMLTSEX", "email": "uiuc.edu;epfl.ch;epfl.ch;inria.fr;epfl.ch", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of Illinois;EPFL;Swiss Federal Institute of Technology Lausanne;INRIA;Swiss Federal Institute of Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://illinois.edu;https://www.epfl.ch;https://www.epfl.ch;https://www.inria.fr;https://www.ethz.ch", "aff_unique_abbr": 
"UIUC;EPFL;EPFL;Inria;ETH Zurich", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Urbana-Champaign;;Lausanne", "aff_country_unique_index": "0;1;1;2;1", "aff_country_unique": "United States;Switzerland;France" }, { "title": "Hindsight Foresight Relabeling for Meta-Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6247", "id": "P7OVkHEoHOZ", "poster": "", "openreview": "https://openreview.net/forum?id=P7OVkHEoHOZ", "slides": "https://iclr.cc/virtual/2022/poster/6247", "video": "https://iclr.cc/virtual/2022/poster/6247", "author_site": "Michael Wan, Jian Peng, Tanmay Gangwani", "tldr": "", "abstract": "Meta-reinforcement learning (meta-RL) algorithms allow for agents to learn new behaviors from small amounts of experience, mitigating the sample inefficiency problem in RL. However, while meta-RL agents can adapt quickly to new tasks at test time after experiencing only a few trajectories, the meta-training process is still sample-inefficient. Prior works have found that in the multi-task RL setting, relabeling past transitions and thus sharing experience among tasks can improve sample efficiency and asymptotic performance. We apply this idea to the meta-RL setting and devise a new relabeling method called Hindsight Foresight Relabeling (HFR). We construct a relabeling distribution using the combination of \"hindsight\", which is used to relabel trajectories using reward functions from the training task distribution, and \"foresight\", which takes the relabeled trajectories and computes the utility of each trajectory for each task. HFR is easy to implement and readily compatible with existing meta-RL algorithms. We find that HFR improves performance when compared to other relabeling methods on a variety of meta-RL tasks.", "keywords": "Reinforcement Learning;Meta-Learning", "primary_area": "", "supplementary_material": "/attachment/959674b7baa884f77a1d7494758be003ae5d79ec.zip", "author": "Michael Wan;Jian Peng;Tanmay Gangwani", "authorids": "~Michael_Wan1;~Jian_Peng1;~Tanmay_Gangwani1", "gender": "M;M;M", "homepage": ";http://jianpeng.web.engr.illinois.edu/;https://tgangwani.github.io/", "dblp": ";29/4181-1;177/8611", "google_scholar": ";https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ;IUY5oVkAAAAJ", "orcid": ";;", "linkedin": "michael-wan-149658171/;;", "or_profile": "~Michael_Wan1;~Jian_Peng1;~Tanmay_Gangwani1", "aff": ";University of Illinois, Urbana Champaign;Amazon", "aff_domain": ";illinois.edu;amazon.com", "position": ";Assistant Professor;Researcher", "bibtex": "@inproceedings{\nwan2022hindsight,\ntitle={Hindsight Foresight Relabeling for Meta-Reinforcement Learning},\nauthor={Michael Wan and Jian Peng and Tanmay Gangwani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=P7OVkHEoHOZ}\n}", "github": "", "project": "", "reviewers": "vofF;HrwG;P7KG;YEZ4", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;5;4;3", "correctness": "4;4;4;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "106;165;74;84", "wc_summary_review": "45;62;46;58", "wc_main_review": "576;555;375;577", "wc_review": "727;782;495;719", "wc_reply_reviewers": "216;63;0;0", "wc_reply_authors": "921;686;593;780", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], 
"technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 107.25, 35.29429840639986 ], "wc_summary_review_avg": [ 52.75, 7.39509972887452 ], "wc_main_review_avg": [ 520.75, 84.60607247709824 ], "wc_review_avg": [ 680.75, 109.95084128827756 ], "wc_reply_reviewers_avg": [ 69.75, 88.26770360669865 ], "wc_reply_authors_avg": [ 745.0, 121.22912191383719 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.20751433915982243, "corr_recommendation_correctness": -0.9271726499455306, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4008449180583505870&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=P7OVkHEoHOZ", "email": ";illinois.edu;amazon.com", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://illinois.edu;https://www.amazon.com", "aff_unique_abbr": "UIUC;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "P9TDsg-AoEK", "title": "Zero-Shot Dense Retrieval with Momentum Adversarial Domain Invariant Representation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Dense retrieval (DR) methods conduct text retrieval by first encoding texts in the embedding space and then matching them by nearest neighbor search. This requires strong locality properties from the representation space, i.e, the close allocations of each small group of relevant texts, which is hard to generalize to domains without sufficient training data. In this paper, we aim to improve the generalization ability of DR models from source training domains with rich supervision signals to target domains without any relevant labels, in the zero-shot setting. To achieve that, we propose Momentum adversarial Domain Invariant Representation learning (MoDIR), which introduces a momentum method in the DR training process to train a domain classifier on the source versus target, and then adversarially updates the DR encoder to learn domain invariant representations. Our experiments show that MoDIR robustly outperforms its baselines on 10+ ranking datasets from the BEIR benchmark in the zero-shot setup, with more than 10% relative gains on datasets where the evaluation of DR models is sensitive enough. Source code of this paper will be released.", "keywords": "dense retrieval;zero-shot;unsupervised domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Ji Xin;Chenyan Xiong;Ashwin Srinivasan;Ankita Sharma;Damien Jose;Paul N. 
Bennett", "authorids": "~Ji_Xin1;~Chenyan_Xiong1;~Ashwin_Srinivasan2;~Ankita_Sharma1;~Damien_Jose1;~Paul_N._Bennett1", "gender": ";M;;F;M;", "homepage": "https://ji-xin.gitlab.io;https://www.cs.cmu.edu/~cx/;;https://www.linkedin.com/in/sharma-ankita/;;https://www.microsoft.com/en-us/research/people/pauben/publications/", "dblp": "218/7227;18/10886;;;;33/6188", "google_scholar": ";E9BaEBYAAAAJ;;;VAV7J4gAAAAJ;AIncPrIAAAAJ", "orcid": ";;;;;0009-0006-7852-9651", "linkedin": ";;ashwin-srinivasan19/;sharma-ankita/;damienjose/;paulnbennett/", "or_profile": "~Ji_Xin1;~Chenyan_Xiong1;~Ashwin_Srinivasan2;~Ankita_Sharma1;~Damien_Jose1;~Paul_N._Bennett1", "aff": "University of Waterloo;Microsoft Research;Microsoft;Microsoft;Microsoft;Microsoft", "aff_domain": "uwaterloo.ca;research.microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com", "position": "PhD student;Principal Researcher;Data & Applied Scientist;Microsoft;Researcher;Researcher", "bibtex": "@misc{\nxin2022zeroshot,\ntitle={Zero-Shot Dense Retrieval with Momentum Adversarial Domain Invariant Representation},\nauthor={Ji Xin and Chenyan Xiong and Ashwin Srinivasan and Ankita Sharma and Damien Jose and Paul N. Bennett},\nyear={2022},\nurl={https://openreview.net/forum?id=P9TDsg-AoEK}\n}", "github": "", "project": "", "reviewers": "6hdy;A2sH;F8xh;miCp", "site": "https://openreview.net/forum?id=P9TDsg-AoEK", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;5;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "101;161;108;39", "wc_summary_review": "26;63;30;23", "wc_main_review": "509;425;261;145", "wc_review": "636;649;399;207", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.25, 43.26300382543958 ], "wc_summary_review_avg": [ 35.5, 16.070158679988197 ], "wc_main_review_avg": [ 335.0, 141.37892346456738 ], "wc_review_avg": [ 472.75, 182.87752048844058 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3337665841069892456&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "University of Waterloo;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://uwaterloo.ca;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UW;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Canada;United States" }, { "id": "PC8u74o7xc2", "title": "Embedding models through the lens of Stable Coloring", "track": "main", "status": "Reject", "tldr": "", "abstract": "Embedding-based approaches find the semantic meaning of tokens in structured data such as natural language, graphs, and even images. To a great degree, these approaches have developed independently in different domains. 
However, we find a common principle underlying these formulations, and it is rooted in solutions to the stable coloring problem in graphs (Weisfeiler-Lehman isomorphism test). For instance, we find links between stable coloring, distribution hypothesis in natural language processing, and non-local-means denoising algorithm in image signal processing. We even find that stable coloring has strong connections to a broad class of unsupervised embedding models which is surprising at first since stable coloring is generally applied for combinatorial problems. To establish this connection concretely we define a mathematical framework that defines continuous stable coloring on graphs and develops optimization problems to search for them. Grounded on this framework, we show that many algorithms ranging across different domains are, in fact, searching for continuous stable coloring solutions of an underlying graph corresponding to the domain. We show that popular and widely used embedding models such as Word2Vec, AWE, BERT, Node2Vec, and Vis-Transformer can be understood as instantiations of our general algorithm that solves the problem of continuous stable coloring. These instantiations offer useful insights into the workings of state-of-the-art models like BERT stimulating new research directions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aditya Desai;Shashank Sonkar;Anshumali Shrivastava;Richard Baraniuk", "authorids": "~Aditya_Desai1;~Shashank_Sonkar1;~Anshumali_Shrivastava1;~Richard_Baraniuk1", "gender": "M;M;;M", "homepage": "https://sites.google.com/view/shashanksonkar;https://www.cs.rice.edu/~as143/;http://richb.rice.edu/;https://apd10.github.io/", "dblp": "266/1460;63/9828;32/2804;18/8339", "google_scholar": "4Rv56n4AAAAJ;https://scholar.google.com.tw/citations?user=SGT23RAAAAAJ;https://scholar.google.com.tw/citations?user=N-BBA20AAAAJ;ymdbDZwAAAAJ", "orcid": ";;;0009-0002-9111-9391", "linkedin": ";;richard-baraniuk;aditya-desai-ai/", "or_profile": "~Shashank_Sonkar1;~Anshumali_Shrivastava1;~Richard_Baraniuk1;~Adity_Desai1", "aff": "Rice University;ThirdAI Corp.;William Marsh Rice University;Rice University", "aff_domain": "rice.edu;thirdai.com;rice.edu;rice.edu", "position": "PhD student;CEO;C. 
Sidney Burrus Professor;PhD student", "bibtex": "@misc{\ndesai2022embedding,\ntitle={Embedding models through the lens of Stable Coloring},\nauthor={Aditya Desai and Shashank Sonkar and Anshumali Shrivastava and Richard Baraniuk},\nyear={2022},\nurl={https://openreview.net/forum?id=PC8u74o7xc2}\n}", "github": "", "project": "", "reviewers": "K2fA;mH7f;EdGh;JX7R", "site": "https://openreview.net/forum?id=PC8u74o7xc2", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;3;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;0;0;0", "wc_summary_paper": "89;83;45;59", "wc_summary_review": "105;90;15;9", "wc_main_review": "719;629;307;554", "wc_review": "913;802;367;622", "wc_reply_reviewers": "26;0;0;0", "wc_reply_authors": "501;592;509;538", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.0, 17.832554500127006 ], "wc_summary_review_avg": [ 54.75, 43.129891026989625 ], "wc_main_review_avg": [ 552.25, 153.17208459768378 ], "wc_review_avg": [ 676.0, 206.42310917142973 ], "wc_reply_reviewers_avg": [ 6.5, 11.258330249197702 ], "wc_reply_authors_avg": [ 535.0, 35.67211796347394 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7yT9T9MqTXgJ:scholar.google.com/&scioq=Embedding+models+through+the+lens+of+Stable+Coloring&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Rice University;ThirdAI Corp.", "aff_unique_dep": ";", "aff_unique_url": "https://www.rice.edu;", "aff_unique_abbr": "Rice;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Omni-Scale CNNs: a simple and effective kernel size configuration for time series classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7148", "id": "PDYs7Z2XFGv", "poster": "", "openreview": "https://openreview.net/forum?id=PDYs7Z2XFGv", "slides": "https://iclr.cc/virtual/2022/poster/7148", "video": "https://iclr.cc/virtual/2022/poster/7148", "author_site": "Wensi Tang, Guodong Long, Lu Liu, Tianyi Zhou, Michael Blumenstein, Jing Jiang", "tldr": "", "abstract": "The size of the receptive field has been one of the most important factors for One Dimensional Convolutional Neural Networks (1D-CNNs) on time series classification tasks. Large efforts have been taken to choose the appropriate receptive field size, for it has a huge influence on the performance and differs significantly for each dataset. In this paper, we propose an Omni-Scale block (OS-block) for 1D-CNNs, where the kernel sizes are set by a simple and universal rule. OS-block can efficiently cover the best size of the receptive field across different datasets. This set of kernel sizes consists of multiple prime numbers according to the length of the time series. 
We experimentally show 1D-CNNs built from OS-block can consistently achieve the state-of-the-art accuracy with a smaller model size on five time series benchmarks, including both univariate and multivariate data from multiple domains. Comprehensive analysis and ablation studies shed light on how our rule finds the best receptive field size and demonstrate the consistency of our OS-block for multiple 1D-CNN structures.", "keywords": "Time series classification", "primary_area": "", "supplementary_material": "/attachment/d21bb2a7402abaf120f148afdfd646f344167ba5.zip", "author": "Wensi Tang;Guodong Long;Lu Liu;Tianyi Zhou;Michael Blumenstein;Jing Jiang", "authorids": "~Wensi_Tang1;~Guodong_Long2;~Lu_Liu7;~Tianyi_Zhou1;michael.blumenstein@uts.edu.au;~Jing_Jiang6", "gender": "M;M;;M;;F", "homepage": ";https://www.uts.edu.au/staff/guodong.long;;https://tianyizhou.github.io/;;https://www.uts.edu.au/staff/jing.jiang", "dblp": ";34/10089;;88/8205-1;;68/1974-2", "google_scholar": ";https://scholar.google.com.au/citations?user=Pl8m7hMAAAAJ;;OKvgizMAAAAJ;;https://scholar.google.com.au/citations?hl=en", "orcid": ";0000-0003-3740-9515;;0000-0001-5348-0632;;", "linkedin": ";;;tianyizhou;;", "or_profile": "~Wensi_Tang1;~Guodong_Long2;~Lu_Liu7;~Tianyi_Zhou1;michael.blumenstein@uts.edu.au;~Jing_Jiang6", "aff": "University of Technology Sydney;University of Technology Sydney;;University of Washington, Seattle;;University of Technology Sydney", "aff_domain": "uts.edu.au;uts.edu.au;;uw.edu;;uts.edu.au", "position": "PhD student;Associate Professor;;PhD student;;Lecturer", "bibtex": "@inproceedings{\ntang2022omniscale,\ntitle={Omni-Scale {CNN}s: a simple and effective kernel size configuration for time series classification},\nauthor={Wensi Tang and Guodong Long and Lu Liu and Tianyi Zhou and Michael Blumenstein and Jing Jiang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PDYs7Z2XFGv}\n}", "github": "", "project": "", "reviewers": "aFAf;1qMR;rAQQ", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;4;4", "correctness": "3;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "80;27;20", "wc_summary_review": "24;82;16", "wc_main_review": "534;239;168", "wc_review": "638;348;204", "wc_reply_reviewers": "18;129;0", "wc_reply_authors": "802;826;668", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 42.333333333333336, 26.78722747048592 ], "wc_summary_review_avg": [ 40.666666666666664, 29.4089933334837 ], "wc_main_review_avg": [ 313.6666666666667, 158.47257035700392 ], "wc_review_avg": [ 396.6666666666667, 180.49068920276437 ], "wc_reply_reviewers_avg": [ 49.0, 57.043842787806646 ], "wc_reply_authors_avg": [ 765.3333333333334, 69.51898221988645 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 168, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=2762110290029984845&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=PDYs7Z2XFGv", "email": "uts.edu.au;uts.edu.au;;uw.edu;;uts.edu.au", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Technology Sydney;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.uts.edu.au;https://www.washington.edu", "aff_unique_abbr": "UTS;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Australia;United States" }, { "id": "PGGjnBiQ84G", "title": "Learning Surface Parameterization for Document Image Unwarping", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we present a novel approach to learn texture mapping for a 3D surface and apply it to document image unwarping. We propose an efficient method to learn surface parameterization by learning a continuous bijective mapping between 3D surface positions and 2D texture-space coordinates. Our surface parameterization network can be conveniently plugged into a differentiable rendering pipeline and trained using multi-view images and rendering loss. Recent work on differentiable rendering techniques for implicit surfaces has shown high-quality 3D scene reconstruction and view synthesis results. However, these methods typically learn the appearance color as a function of the surface points and lack explicit surface parameterization. Thus they do not allow texture map extraction or texture editing. By introducing explicit surface parameterization and learning with a recent differentiable renderer for implicit surfaces, we demonstrate state-of-the-art document-unwarping via texture extraction. We show that our approach can reconstruct high-frequency textures for arbitrary document shapes in both synthetic and real scenarios. 
We also demonstrate the usefulness of our system by applying it to document texture editing.", "keywords": "implicit functions;texture mapping;surface parameterization", "primary_area": "", "supplementary_material": "/attachment/4db7fbee8d9c11294ef400902959ed8ad52369e5.zip", "author": "Sagnik Das;Ke Ma;Zhixin Shu;Dimitris Samaras", "authorids": "~Sagnik_Das1;~Ke_Ma3;~Zhixin_Shu1;~Dimitris_Samaras3", "gender": "M;M;M;M", "homepage": "https://www3.cs.stonybrook.edu/~sadas/;;https://zhixinshu.github.io/;https://www.cs.stonybrook.edu/~samaras/", "dblp": "193/7173;;129/3987;s/DimitrisSamaras", "google_scholar": ";ovZamhQAAAAJ;gp6HUP0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-1373-0294", "linkedin": ";;;", "or_profile": "~Sagnik_Das1;~Ke_Ma3;~Zhixin_Shu1;~Dimitris_Samaras3", "aff": "State University of New York, Stony Brook;Snap Inc.;Adobe Systems;Stony Brook University", "aff_domain": "stonybrook.edu;snapchat.com;adobe.com;cs.stonybrook.edu", "position": "PhD student;Researcher;Researcher;Full Professor", "bibtex": "@misc{\ndas2022learning,\ntitle={Learning Surface Parameterization for Document Image Unwarping},\nauthor={Sagnik Das and Ke Ma and Zhixin Shu and Dimitris Samaras},\nyear={2022},\nurl={https://openreview.net/forum?id=PGGjnBiQ84G}\n}", "github": "", "project": "", "reviewers": "Cc2n;QH85;hfPz;K4gj", "site": "https://openreview.net/forum?id=PGGjnBiQ84G", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "44;240;61;157", "wc_summary_review": "31;232;85;151", "wc_main_review": "255;651;100;533", "wc_review": "330;1123;246;841", "wc_reply_reviewers": "171;0;0;374", "wc_reply_authors": "445;409;434;1207", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 125.5, 78.90659034580065 ], "wc_summary_review_avg": [ 124.75, 75.10118174835866 ], "wc_main_review_avg": [ 384.75, 218.39456838483872 ], "wc_review_avg": [ 635.0, 362.25888532926285 ], "wc_reply_reviewers_avg": [ 136.25, 153.99736199039256 ], "wc_reply_authors_avg": [ 623.75, 336.992117860344 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xjWVdXE-NUAJ:scholar.google.com/&scioq=Learning+Surface+Parameterization+for+Document+Image+Unwarping&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "State University of New York;Snap Inc.;Adobe;Stony Brook University", "aff_unique_dep": ";;Adobe Systems Incorporated;", "aff_unique_url": "https://www.stonybrook.edu;https://www.snapinc.com;https://www.adobe.com;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;Snap;Adobe;SBU", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "PHugX0j2xcE", "title": "Predictive Maintenance for Optical Networks in Robust Collaborative Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning (ML) has recently emerged as a powerful tool to enhance the proactive optical 
network maintenance and thereby improve network reliability and operational efficiency, and reduce unplanned downtime and maintenance costs. However, it is challenging to develop accurate and reliable ML-based prognostic models, due mainly to the unavailability of a sufficient amount of training data, since device failures do not occur often in optical networks. Federated learning (FL) is a promising candidate to tackle the aforementioned challenge by enabling the development of a global ML model using datasets owned by many vendors without revealing their business-confidential data. While FL greatly enhances data privacy, a global model can be strongly affected by a malicious local model. We propose a robust collaborative learning framework for cross-vendor predictive maintenance in a dishonest setting. Our experiments confirm that a global ML model can be accurately built with sensitive datasets in federated learning even when a subset of vendors behave dishonestly.\n", "keywords": "predictive maintenance;federated learning;machine learning;anomaly detection;multi-party computation;autoencoder", "primary_area": "", "supplementary_material": "", "author": "Khouloud Abdelli;JOO YEON CHO", "authorids": "kabdelli@adva.com;~JOO_YEON_CHO1", "gender": ";M", "homepage": ";", "dblp": ";84/1152.html", "google_scholar": ";q5mu8LAAAAAJ", "orcid": ";0000-0003-0351-0885", "linkedin": ";joo-yeon-cho-94ab8939/", "or_profile": "kabdelli@adva.com;~JOO_YEON_CHO1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nabdelli2022predictive,\ntitle={Predictive Maintenance for Optical Networks in Robust Collaborative Learning },\nauthor={Khouloud Abdelli and JOO YEON CHO},\nyear={2022},\nurl={https://openreview.net/forum?id=PHugX0j2xcE}\n}", "github": "", "project": "", "reviewers": "qcjG;BxDo;JagA;p63q", "site": "https://openreview.net/forum?id=PHugX0j2xcE", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "2;2;3;2", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "45;50;72;85", "wc_summary_review": "132;37;9;79", "wc_main_review": "419;665;114;430", "wc_review": "596;752;195;594", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 16.263455967290593 ], "wc_summary_review_avg": [ 64.25, 46.375505388081756 ], "wc_main_review_avg": [ 407.0, 195.63103025849452 ], "wc_review_avg": [ 534.25, 206.08781502068481 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FobEolCeoj0J:scholar.google.com/&scioq=Predictive+Maintenance+for+Optical+Networks+in+Robust+Collaborative+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "PIExE5KjaVL", "title": "Safety-aware Policy Optimisation for Autonomous Racing", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "To be viable for safety-critical applications, such as autonomous driving and
assistive robotics, autonomous agents should adhere to safety constraints throughout their interactions with their environments. Instead of learning about safety by collecting samples, including unsafe ones, methods such as Hamilton-Jacobi (HJ) reachability compute safe sets with theoretical guarantees using models of the system dynamics. However, HJ reachability is not scalable to high-dimensional systems, and the guarantees hinge on the quality of the model. In this work, we inject HJ reachability theory into the constrained Markov decision process (CMDP) framework as a control-theoretical approach for safety analysis via model-free updates on state-action pairs. Furthermore, we demonstrate that the HJ safety value can be learned directly on vision context, the highest-dimensional problem studied via the method to date. We evaluate our method on several benchmark tasks, including Safety Gym and Learn-to-Race (L2R), a recently released high-fidelity autonomous racing environment. Our approach has significantly fewer constraint violations in comparison to other constrained RL baselines, and achieves new state-of-the-art results on the L2R benchmark task.", "keywords": "Safe Reinforcement Learning;Hamilton-Jacobi Reachability;Autonomous Driving", "primary_area": "", "supplementary_material": "/attachment/85d684da1d43fdb69efc2092fee09984e34d03d8.zip", "author": "Bingqing Chen;Jonathan Francis;James Herman;Jean Oh;Eric Nyberg;Sylvia Lee Herbert", "authorids": "~Bingqing_Chen1;~Jonathan_Francis1;~James_Herman1;~Jean_Oh2;~Eric_Nyberg1;~Sylvia_Lee_Herbert1", "gender": ";;M;F;;F", "homepage": ";;http://jimmyherman.me;http://www.cs.cmu.edu/~jeanoh/;https://www.cs.cmu.edu/~ehn;https://sylviaherbert.com", "dblp": ";;;62/4860;05/595;192/3242", "google_scholar": ";;;;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;;0000-0002-3863-8945", "linkedin": ";;;;eric-nyberg-08620/;", "or_profile": "~Bingqing_Chen1;~Jonathan_Francis1;~James_Herman1;~Jean_Oh2;~Eric_Nyberg1;~Sylvia_Lee_Herbert1", "aff": ";;;Carnegie Mellon University;Carnegie Mellon University;University of California, San Diego", "aff_domain": ";;;cmu.edu;cmu.edu;ucsd.edu", "position": ";;;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nchen2022safetyaware,\ntitle={Safety-aware Policy Optimisation for Autonomous Racing},\nauthor={Bingqing Chen and Jonathan Francis and James Herman and Jean Oh and Eric Nyberg and Sylvia Lee Herbert},\nyear={2022},\nurl={https://openreview.net/forum?id=PIExE5KjaVL}\n}", "github": "", "project": "", "reviewers": "exfK;dXQY;JpVG;1oje", "site": "https://openreview.net/forum?id=PIExE5KjaVL", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;2;2", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "69;149;32;141", "wc_summary_review": "82;94;7;10", "wc_main_review": "624;466;110;101", "wc_review": "775;709;149;252", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.75, 49.10893503223217 ], "wc_summary_review_avg": [ 48.25, 39.98984246030484 ], "wc_main_review_avg": [ 325.25, 226.76130071068124 ], "wc_review_avg": [ 471.25, 274.18276295201343 ], "wc_reply_reviewers_avg": [
0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8551861104941366, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7911205721063099138&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Carnegie Mellon University;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.ucsd.edu", "aff_unique_abbr": "CMU;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "PKdNRKjwL4", "title": "DAIR: Data Augmented Invariant Regularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While deep learning through empirical risk minimization (ERM) has succeeded at achieving human-level performance at a variety of complex tasks, ERM generalizes poorly to distribution shift. This is partly explained by overfitting to spurious features such as background in images or named entities in natural language. Synthetic data augmentation followed by empirical risk minimization (DA-ERM) is a simple yet powerful solution to remedy this problem. In this paper, we propose data augmented invariant regularization (DAIR). The idea of DAIR is based on the observation that the model performance (loss) is desired to be consistent on the augmented sample and the original one. DAIR introduces a regularizer on DA-ERM to penalize such loss inconsistency. Both theoretically and through empirical experiments, we show that a particular form of the DAIR regularizer consistently performs well in a variety of settings. We prove convergence guarantees for DAIR. We apply it to multiple real-world unsupervised and supervised learning problems involving domain shift. Our experiments show that DAIR consistently outperforms ERM and DA-ERM with little marginal cost. 
Furthermore, DAIR is competitive with state-of-the-art methods specifically designed for these problems.", "keywords": "data augmentation;domain shift;adversarial training", "primary_area": "", "supplementary_material": "/attachment/375e75918815707f728885d3031561f3daf0a4dc.zip", "author": "Tianjian Huang;Shaunak Ashish Halbe;Chinnadhurai Sankar;Pooyan Amini;Satwik Kottur;Alborz Geramifard;Meisam Razaviyayn;Ahmad Beirami", "authorids": "~Tianjian_Huang2;~Shaunak_Ashish_Halbe1;~Chinnadhurai_Sankar2;pamini@fb.com;~Satwik_Kottur1;alborzg@fb.com;~Meisam_Razaviyayn1;~Ahmad_Beirami1", "gender": "Unspecified;M;M;;M;;M;M", "homepage": "https://tianjian-huang.net/;https://shaunak27.github.io;https://chinnadhurai.github.io/;;https://satwikkottur.github.io/;;https://sites.usc.edu/razaviyayn/;https://beirami.github.io/", "dblp": "245/2447;349/7625;155/0592;;172/1012;;43/8577;41/9367", "google_scholar": "TNdMwp0AAAAJ;7-VApYcAAAAJ;KynAS2gAAAAJ;;iQxXG8kAAAAJ;;https://scholar.google.com/citations?hl=en;VuKWbMMAAAAJ", "orcid": ";0000-0001-7388-6963;;;;;;", "linkedin": "tianjian-huang-7801a0119;shaunak-halbe-565a0716b;chinnadhuraisankar/;;;;;ahmad-beirami-97001962", "or_profile": "~Tianjian_Huang2;~Shaunak_Ashish_Halbe1;~Chinnadhurai_Sankar2;pamini@fb.com;~Satwik_Kottur1;alborzg@fb.com;~Meisam_Razaviyayn1;~Ahmad_Beirami1", "aff": "University of Southern California;College of Engineering Pune;Meta AI;;Meta Facebook;;Google;Facebook AI", "aff_domain": "usc.edu;coep.ac.in;fb.com;;facebook.com;;google.com;fb.com", "position": "PhD student;Undergrad student;Research Scientist;;Research Scientist;;Researcher;Research Scientist", "bibtex": "@misc{\nhuang2022dair,\ntitle={{DAIR}: Data Augmented Invariant Regularization},\nauthor={Tianjian Huang and Shaunak Ashish Halbe and Chinnadhurai Sankar and Pooyan Amini and Satwik Kottur and Alborz Geramifard and Meisam Razaviyayn and Ahmad Beirami},\nyear={2022},\nurl={https://openreview.net/forum?id=PKdNRKjwL4}\n}", "github": "", "project": "", "reviewers": "5h6n;zaca;hj8W;VMQe", "site": "https://openreview.net/forum?id=PKdNRKjwL4", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;3;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "64;96;79;65", "wc_summary_review": "47;39;7;46", "wc_main_review": "284;327;246;553", "wc_review": "395;462;332;664", "wc_reply_reviewers": "25;136;0;29", "wc_reply_authors": "773;1574;827;1600", "reply_reviewers": "1;1;0;1", "reply_authors": "1;3;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 76.0, 12.98075498574717 ], "wc_summary_review_avg": [ 34.75, 16.315253599009733 ], "wc_main_review_avg": [ 352.5, 119.25288256474138 ], "wc_review_avg": [ 463.25, 124.68635651104735 ], "wc_reply_reviewers_avg": [ 47.5, 52.29005641611032 ], "wc_reply_authors_avg": [ 1193.5, 394.07010797572553 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12242108692396396316&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;2;3;2", 
"aff_unique_norm": "University of Southern California;College of Engineering, Pune;Meta;Google", "aff_unique_dep": ";;Meta AI;Google", "aff_unique_url": "https://www.usc.edu;http://www.coep.ac.in;https://meta.com;https://www.google.com", "aff_unique_abbr": "USC;COEP;Meta;Google", "aff_campus_unique_index": "0;1;3", "aff_campus_unique": "Los Angeles;Pune;;Mountain View", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;India" }, { "title": "Reinforcement Learning under a Multi-agent Predictive State Representation Model: Method and Theory", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6620", "id": "PLDOnFoVm4", "poster": "", "openreview": "https://openreview.net/forum?id=PLDOnFoVm4", "slides": "https://iclr.cc/virtual/2022/poster/6620", "video": "https://iclr.cc/virtual/2022/poster/6620", "author_site": "Zhi Zhang, Zhuoran Yang, Han Liu, Pratap Tokekar, Furong Huang", "tldr": "", "abstract": "We study reinforcement learning for partially observable multi-agent systems where each agent only has access to its own observation and reward and aims to maximize its cumulative rewards. To handle partial observations, we propose graph-assisted predictive state representations (GAPSR), a scalable multi-agent representation learning framework that leverages the agent connectivity graphs to aggregate local representations computed by each agent. In addition, our representations are readily able to incorporate dynamic interaction graphs and kernel space embeddings of the predictive states, and thus have strong flexibility and representation power. \nBased on GAPSR, we propose an end-to-end MARL algorithm that simultaneously infers the predictive representations and uses the representations as the input of a policy optimization algorithm. 
Empirically, we demonstrate the efficacy of the proposed algorithm on both a MAMuJoCo robotic learning experiment and a multi-agent particle learning environment.", "keywords": "Multi-agent Reinforcement Learning;Predictive State Representation;Dynamic Interaction Graph", "primary_area": "", "supplementary_material": "/attachment/92296e44ac86e038a7149ca9414df5ef0270ef18.zip", "author": "Zhi Zhang;Zhuoran Yang;Han Liu;Pratap Tokekar;Furong Huang", "authorids": "~Zhi_Zhang1;~Zhuoran_Yang1;~Han_Liu4;~Pratap_Tokekar1;~Furong_Huang1", "gender": ";M;;M;F", "homepage": ";https://zhuoranyang.github.io/;;https://tokekar.com/;https://furong-huang.com", "dblp": ";;;;72/8513", "google_scholar": "O__axAoAAAAJ;;;FKAovywAAAAJ;13yyuCcAAAAJ", "orcid": ";;;;", "linkedin": ";;;prataptokekar/;", "or_profile": "~Zhi_Zhang1;~Zhuoran_Yang1;~Han_Liu4;~Pratap_Tokekar1;~Furong_Huang1", "aff": "University of California, Los Angeles;University of California, Berkeley;Northwestern University;University of Maryland, College Park;University of Maryland", "aff_domain": "ucla.edu;berkeley.edu;u.northwestern.edu;umd.edu;cs.umd.edu", "position": "PhD student;Postdoc;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2022reinforcement,\ntitle={Reinforcement Learning under a Multi-agent Predictive State Representation Model: Method and Theory},\nauthor={Zhi Zhang and Zhuoran Yang and Han Liu and Pratap Tokekar and Furong Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PLDOnFoVm4}\n}", "github": "", "project": "", "reviewers": "6qEL;Jba3;zgHd", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;3;3", "correctness": "3;4;4", "technical_novelty": "4;4;3", "empirical_novelty": "3;0;4", "wc_summary_paper": "66;223;15", "wc_summary_review": "15;45;15", "wc_main_review": "77;496;92", "wc_review": "158;764;122", "wc_reply_reviewers": "0;43;0", "wc_reply_authors": "676;1363;800", "reply_reviewers": "0;1;0", "reply_authors": "1;3;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.699673171197595 ], "wc_summary_paper_avg": [ 101.33333333333333, 88.5149077211793 ], "wc_summary_review_avg": [ 25.0, 14.142135623730951 ], "wc_main_review_avg": [ 221.66666666666666, 194.07959421043958 ], "wc_review_avg": [ 348.0, 294.5233437267749 ], "wc_reply_reviewers_avg": [ 14.333333333333334, 20.27039439401436 ], "wc_reply_authors_avg": [ 946.3333333333334, 298.9451826375903 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10847733959591055190&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=PLDOnFoVm4", "email": "ucla.edu;berkeley.edu;u.northwestern.edu;umd.edu;cs.umd.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "University of California, Los Angeles;University of California, Berkeley;Northwestern University;University of Maryland", "aff_unique_dep": ";;;", "aff_unique_url":
"https://www.ucla.edu;https://www.berkeley.edu;https://www.northwestern.edu;https://www/umd.edu", "aff_unique_abbr": "UCLA;UC Berkeley;NU;UMD", "aff_campus_unique_index": "0;1;3", "aff_campus_unique": "Los Angeles;Berkeley;;College Park", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "PO-32ODWng", "title": "Improving the Post-hoc Calibration of Modern Neural Networks with Probe Scaling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present \"probe scaling\": a post-hoc recipe for calibrating the predictions of modern neural networks. Our recipe is inspired by several lines of work, which demonstrate that early layers in the neural network learn general rules whereas later layers specialize. We show how such observations can be utilized in a post-hoc manner to calibrate the predictions of trained neural networks by injecting linear probes on the network's intermediate representations. Similar to temperature scaling, probe scaling neither retrains the architecture nor requires significantly more parameters. Unlike temperature scaling, however, it utilizes intermediate layers in the neural network. We demonstrate that probe scaling improves performance over temperature scaling on benchmark datasets across all five metrics: expected calibration error (ECE), negative log-likelihood, Brier score, classification accuracy, and the area under the ROC curve.", "keywords": "Calibration;Probes;Generalization;Temperature Scaling;Post-processing", "primary_area": "", "supplementary_material": "", "author": "Amr Khalifa;Ibrahim Alabdulmohsin", "authorids": "amrkhalifa@google.com;~Ibrahim_Alabdulmohsin1", "gender": ";M", "homepage": ";http://ibomohsin.com", "dblp": ";153/5393", "google_scholar": ";8WNMsPYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "amrkhalifa@google.com;~Ibrahim_Alabdulmohsin1", "aff": ";Google", "aff_domain": ";google.com", "position": ";Research Scientist", "bibtex": "@misc{\nkhalifa2022improving,\ntitle={Improving the Post-hoc Calibration of Modern Neural Networks with Probe Scaling},\nauthor={Amr Khalifa and Ibrahim Alabdulmohsin},\nyear={2022},\nurl={https://openreview.net/forum?id=PO-32ODWng}\n}", "github": "", "project": "", "reviewers": "pwLn;xDKP;QYVy;5HcW", "site": "https://openreview.net/forum?id=PO-32ODWng", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "2;3;2;2", "technical_novelty": "2;2;2;1", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "46;64;81;38", "wc_summary_review": "22;46;76;19", "wc_main_review": "82;397;798;352", "wc_review": "150;507;955;409", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 57.25, 16.63392617513977 ], "wc_summary_review_avg": [ 40.75, 22.884219453588535 ], "wc_main_review_avg": [ 407.25, 255.74926686111928 ], "wc_review_avg": [ 505.25, 290.57905550813535 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:WjQCZ-MgKC0J:scholar.google.com/&scioq=Improving+the+Post-hoc+Calibration+of+Modern+Neural+Networks+with+Probe+Scaling&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Discovering Latent Concepts Learned in BERT", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6838", "id": "POTMtpYI1xH", "poster": "", "openreview": "https://openreview.net/forum?id=POTMtpYI1xH", "slides": "https://iclr.cc/virtual/2022/poster/6838", "video": "https://iclr.cc/virtual/2022/poster/6838", "author_site": "Fahim Dalvi, Abdul Khan, Firoj Alam, Nadir Durrani, Jia Xu, Hassan Sajjad", "tldr": "", "abstract": "A large number of studies that analyze deep neural network models and their ability to encode various linguistic and non-linguistic concepts provide an interpretation of the inner mechanics of these models. The scope of the analyses is limited to pre-defined concepts that reinforce the traditional linguistic knowledge and do not reflect on how novel concepts are learned by the model. We address this limitation by discovering and analyzing latent concepts learned in neural network models in an unsupervised fashion and provide interpretations from the model's perspective. In this work, we study: i) what latent concepts exist in the pre-trained BERT model, ii) how the discovered latent concepts align or diverge from classical linguistic hierarchy and iii) how the latent concepts evolve across layers. \nOur findings show: i) a model learns novel concepts (e.g. animal categories and demographic groups), which do not strictly adhere to any pre-defined categorization (e.g. POS, semantic tags), ii) several latent concepts are based on multiple properties which may include semantics, syntax, and morphology, iii) the lower layers in the model dominate in learning shallow lexical concepts while the higher layers learn semantic relations and iv) the discovered latent concepts highlight potential biases learned in the model. 
We also release a novel BERT ConceptNet dataset consisting of 174 concept labels and 1M annotated instances.", "keywords": "interpretation;BERT;NLP", "primary_area": "", "supplementary_material": "/attachment/a7fb817b68305b71d585b86e4d4fc7ab8460d008.zip", "author": "Fahim Dalvi;Abdul Rafae Khan;Firoj Alam;Nadir Durrani;Jia Xu;Hassan Sajjad", "authorids": "~Fahim_Dalvi1;~Abdul_Rafae_Khan1;~Firoj_Alam1;~Nadir_Durrani1;jxu70@stevens.edu;~Hassan_Sajjad1", "gender": "M;M;M;M;;M", "homepage": "https://fdalvi.github.io;http://arafae.com/?i=1;https://www.firojalam.one/;https://nadirdurrani.github.io/;;https://hsajjad.github.io/", "dblp": "194/2537;;126/2083;54/9012;;73/5938", "google_scholar": "uQGCv10AAAAJ;_zsfzMgAAAAJ;https://scholar.google.it/citations?user=j-RtwDQAAAAJ;https://scholar.google.co.uk/citations?user=K6uisFAAAAAJ;;https://scholar.google.de/citations?user=t3BH6NkAAAAJ", "orcid": ";;0000-0001-7172-1997;0000-0002-9378-4128;;", "linkedin": ";;firoj-alam-0a96206/;nadir-durrani-04048744/;;hassan-sajjad-154b043a/", "or_profile": "~Fahim_Dalvi1;~Abdul_Rafae_Khan1;~Firoj_Alam1;~Nadir_Durrani1;jxu70@stevens.edu;~Hassan_Sajjad1", "aff": "Hamad Bin Khalifa University;Stevens Institute of Technology;Qatar Computing Research Institute;Qatar Computing Research Institute;;Qatar Computing Research Institute", "aff_domain": "hbku.edu.qa;stevens.edu;hbku.edu.qa;hbku.edu.qa;;hbku.edu.qa", "position": "Researcher;Postdoctoral Fellow;Scientist;Scientist;;Researcher", "bibtex": "@inproceedings{\ndalvi2022discovering,\ntitle={Discovering Latent Concepts Learned in {BERT}},\nauthor={Fahim Dalvi and Abdul Rafae Khan and Firoj Alam and Nadir Durrani and Jia Xu and Hassan Sajjad},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=POTMtpYI1xH}\n}", "github": "", "project": "", "reviewers": "yqwt;3Uym;We8g;kVr9", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;2;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "46;77;117;110", "wc_summary_review": "46;40;27;52", "wc_main_review": "630;237;484;367", "wc_review": "722;354;628;529", "wc_reply_reviewers": "176;253;49;5", "wc_reply_authors": "917;562;746;394", "reply_reviewers": "1;1;1;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 87.5, 28.324018076537094 ], "wc_summary_review_avg": [ 41.25, 9.256754290786809 ], "wc_main_review_avg": [ 429.5, 145.02844548570462 ], "wc_review_avg": [ 558.25, 136.2467889529878 ], "wc_reply_reviewers_avg": [ 120.75, 98.85437521930933 ], "wc_reply_authors_avg": [ 654.75, 196.01960998838865 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.0, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15818317356870035431&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=POTMtpYI1xH", "email": "hbku.edu.qa;stevens.edu;hbku.edu.qa;hbku.edu.qa;;hbku.edu.qa", "author_num": 6, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Hamad Bin Khalifa University;Stevens Institute of Technology;Qatar Computing Research Institute", "aff_unique_dep": ";;", 
"aff_unique_url": "https://www.hbku.edu.qa;https://www.stevens.edu;https://www.qcri.org", "aff_unique_abbr": "HBKU;SIT;QCRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Qatar;United States" }, { "title": "DR3: Value-Based Deep Reinforcement Learning Requires Explicit Regularization", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6927", "id": "POvMvLi91f", "poster": "", "openreview": "https://openreview.net/forum?id=POvMvLi91f", "slides": "https://iclr.cc/virtual/2022/poster/6927", "video": "https://iclr.cc/virtual/2022/poster/6927", "author_site": "Aviral Kumar, Rishabh Agarwal, Tengyu Ma, Aaron Courville, George Tucker, Sergey Levine", "tldr": "", "abstract": "Despite overparameterization, deep networks trained via supervised learning are surprisingly easy to optimize and exhibit excellent generalization. One hypothesis to explain this is that overparameterized deep networks enjoy the benefits of implicit regularization induced by stochastic gradient descent, which favors parsimonious solutions that generalize well on test inputs. It is reasonable to surmise that deep reinforcement learning (RL) methods could also benefit from this effect. In this paper, we discuss how the implicit regularization effect of SGD seen in supervised learning could in fact be harmful in the offline deep RL setting, leading to poor generalization and degenerate feature representations. Our theoretical analysis shows that when existing models of implicit regularization are applied to temporal difference learning, the resulting derived regularizer favors degenerate solutions with excessive aliasing, in stark contrast to the supervised learning case. We back up these findings empirically, showing that feature representations learned by a deep network value function trained via bootstrapping can indeed become degenerate, aliasing the representations for state-action pairs that appear on either side of the Bellman backup. To address this issue, we derive the form of this implicit regularizer and, inspired by this derivation, propose a simple and effective explicit regularizer, called DR3, that counteracts the undesirable effects of this implicit regularizer. 
When combined with existing offline RL methods, DR3 substantially improves performance and stability, alleviating unlearning in Atari 2600 games, D4RL domains and robotic manipulation from images.", "keywords": "Q-learning;offline RL;regularization", "primary_area": "", "supplementary_material": "", "author": "Aviral Kumar;Rishabh Agarwal;Tengyu Ma;Aaron Courville;George Tucker;Sergey Levine", "authorids": "~Aviral_Kumar2;~Rishabh_Agarwal2;~Tengyu_Ma1;~Aaron_Courville3;~George_Tucker1;~Sergey_Levine1", "gender": "M;M;M;;M;M", "homepage": "https://aviralkumar2907.github.io/;https://agarwl.github.io;http://ai.stanford.edu/~tengyuma/;;https://sites.google.com/view/gjt;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "202/7961;;54/9061;56/1688;135/5748;80/7594", "google_scholar": ";https://scholar.google.ca/citations?user=aH8AJu4AAAAJ;i38QlUwAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;-gJkPHIAAAAJ;8R35rCwAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Aviral_Kumar2;~Rishabh_Agarwal2;~Tengyu_Ma1;~Aaron_Courville3;~George_Tucker1;~Sergey_Levine1", "aff": "University of California, Berkeley;Google DeepMind;Facebook AI Research;Universit\u00e9 de Montr\u00e9al;Google Brain;Google", "aff_domain": "berkeley.edu;google.com;fb.com; ;google.com;google.com", "position": "PhD student;Research Scientist;Visiting Scientist;Assistant Professor;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nkumar2022dr,\ntitle={{DR}3: Value-Based Deep Reinforcement Learning Requires Explicit Regularization},\nauthor={Aviral Kumar and Rishabh Agarwal and Tengyu Ma and Aaron Courville and George Tucker and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=POvMvLi91f}\n}", "github": "", "project": "", "reviewers": "5pgu;7Brr;oKwL;T3ZN", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;3;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "60;83;92;77", "wc_summary_review": "20;46;34;104", "wc_main_review": "894;202;109;471", "wc_review": "974;331;235;652", "wc_reply_reviewers": "340;42;0;160", "wc_reply_authors": "2655;556;107;1248", "reply_reviewers": "3;1;0;1", "reply_authors": "5;1;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.0, 11.683321445547923 ], "wc_summary_review_avg": [ 51.0, 31.953090617340916 ], "wc_main_review_avg": [ 419.0, 304.76138206800414 ], "wc_review_avg": [ 548.0, 290.4091940693338 ], "wc_reply_reviewers_avg": [ 135.5, 131.83607245363464 ], "wc_reply_authors_avg": [ 1141.5, 963.7200060183455 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14830398725329787464&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=POvMvLi91f", "email": "berkeley.edu;google.com;fb.com; ;google.com;google.com", "author_num": 6, "aff_unique_index": "0;1;2;3;1;1", "aff_unique_norm": "University of California, Berkeley;Google;Meta;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": 
";Google DeepMind;Facebook AI Research;", "aff_unique_url": "https://www.berkeley.edu;https://deepmind.com;https://research.facebook.com;https://www.umontreal.ca", "aff_unique_abbr": "UC Berkeley;DeepMind;FAIR;UdeM", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;1;0;2;0;0", "aff_country_unique": "United States;United Kingdom;Canada" }, { "title": "You Mostly Walk Alone: Analyzing Feature Attribution in Trajectory Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6404", "id": "POxF-LEqnF", "poster": "", "openreview": "https://openreview.net/forum?id=POxF-LEqnF", "slides": "https://iclr.cc/virtual/2022/poster/6404", "video": "https://iclr.cc/virtual/2022/poster/6404", "author_site": "Osama Makansi, Julius von Kuegelgen, Francesco Locatello, Peter Gehler, Dominik Janzing, Thomas Brox, Bernhard Schoelkopf", "tldr": "", "abstract": "Predicting the future trajectory of a moving agent can be easy when the past trajectory continues smoothly but is challenging when complex interactions with other agents are involved. Recent deep learning approaches for trajectory prediction show promising performance and partially attribute this to successful reasoning about agent-agent interactions. However, it remains unclear which features such black-box models actually learn to use for making predictions. This paper proposes a procedure that quantifies the contributions of different cues to model performance based on a variant of Shapley values. Applying this procedure to state-of-the-art trajectory prediction methods on standard benchmark datasets shows that they are, in fact, unable to reason about interactions. Instead, the past trajectory of the target is the only feature used for predicting its future. For a task with richer social interaction patterns, on the other hand, the tested models do pick up such interactions to a certain extent, as quantified by our feature attribution method. 
We discuss the limits of the proposed method and its links to causality.", "keywords": "Feature Attribution;Shapley values;Trajectory Prediction;Causality", "primary_area": "", "supplementary_material": "", "author": "Osama Makansi;Julius Von K\u00fcgelgen;Francesco Locatello;Peter Vincent Gehler;Dominik Janzing;Thomas Brox;Bernhard Sch\u00f6lkopf", "authorids": "~Osama_Makansi1;~Julius_Von_K\u00fcgelgen1;~Francesco_Locatello1;~Peter_Vincent_Gehler1;~Dominik_Janzing1;~Thomas_Brox1;~Bernhard_Sch\u00f6lkopf1", "gender": "M;M;M;;M;M;", "homepage": "https://lmb.informatik.uni-freiburg.de/people/makansio/publications.html;https://sites.google.com/view/julius-von-kuegelgen/home;https://twitter.com/FrancescoLocat8;;https://janzing.github.io;https://lmb.informatik.uni-freiburg.de/people/brox/index.en.html;", "dblp": "203/8443;223/5666;195/6074;;17/6280;97/4586;", "google_scholar": "https://scholar.google.de/citations?user=xnd0Jz8AAAAJ;6EOl3hAAAAAJ;;;https://scholar.google.fr/citations?user=O-3bc_EAAAAJ;https://scholar.google.com/citations?hl=de;", "orcid": ";0000-0001-6469-4118;;;;0000-0002-6282-8861;", "linkedin": ";julius-von-k%C3%BCgelgen/;;;;;", "or_profile": "~Osama_Makansi1;~Julius_Von_K\u00fcgelgen1;~Francesco_Locatello1;~Peter_Vincent_Gehler1;~Dominik_Janzing1;~Thomas_Brox1;~Bernhard_Sch\u00f6lkopf1", "aff": "Universit\u00e4t Freiburg;, Max Planck Institute for Intelligent Systems;Amazon;;Amazon Development Center Germany;University of Freiburg;", "aff_domain": "uni-freiburg.de;is.tuebingen.mpg.de;amazon.com;;amazon.de;uni-freiburg.de;", "position": "PhD student;PhD student;Senior Applied Scientist;;Researcher;Full Professor;", "bibtex": "@inproceedings{\nmakansi2022you,\ntitle={You Mostly Walk Alone: Analyzing Feature Attribution in Trajectory Prediction},\nauthor={Osama Makansi and Julius Von K{\\\"u}gelgen and Francesco Locatello and Peter Vincent Gehler and Dominik Janzing and Thomas Brox and Bernhard Sch{\\\"o}lkopf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=POxF-LEqnF}\n}", "github": "", "project": "", "reviewers": "hYNN;dgLJ;jvES;ef1N;9atf", "pdf_size": 0, "recommendation": "5;6;8;8;10", "confidence": "4;4;4;4;5", "correctness": "3;4;4;4;4", "technical_novelty": "3;3;3;2;4", "empirical_novelty": "3;3;3;4;4", "wc_summary_paper": "72;126;139;45;94", "wc_summary_review": "67;59;147;37;46", "wc_main_review": "598;216;461;203;343", "wc_review": "737;401;747;285;483", "wc_reply_reviewers": "0;25;0;17;35", "wc_reply_authors": "1016;395;587;464;277", "reply_reviewers": "0;1;0;1;1", "reply_authors": "2;2;2;1;2", "recommendation_avg": [ 7.4, 1.7435595774162693 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 3.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 95.2, 34.42905749508691 ], "wc_summary_review_avg": [ 71.2, 39.2856207791095 ], "wc_main_review_avg": [ 364.2, 149.95385957020244 ], "wc_review_avg": [ 530.6, 183.74504074940364 ], "wc_reply_reviewers_avg": [ 15.4, 13.807244475274565 ], "wc_reply_authors_avg": [ 547.8, 254.73547063571655 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 0.4 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.7456011350793259, "corr_recommendation_correctness": 0.6882472016116856, "gs_citation": 41, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4404187381169872534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=POxF-LEqnF", "email": "uni-freiburg.de;is.tuebingen.mpg.de;amazon.com;;amazon.de;uni-freiburg.de;", "author_num": 7, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Freiburg;Max Planck Institute for Intelligent Systems;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.uni-freiburg.de;https://www.mpi-is.mpg.de;https://www.amazon.com", "aff_unique_abbr": "Uni Freiburg;MPI-IS;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Germany;United States" }, { "title": "Particle Stochastic Dual Coordinate Ascent: Exponential convergent algorithm for mean field neural network optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6575", "id": "PQQp7AJwz3", "poster": "", "openreview": "https://openreview.net/forum?id=PQQp7AJwz3", "slides": "https://iclr.cc/virtual/2022/poster/6575", "video": "https://iclr.cc/virtual/2022/poster/6575", "author_site": "Kazusato Oko, Taiji Suzuki, Atsushi Nitanda, Denny Wu", "tldr": "", "abstract": "We introduce Particle-SDCA, a gradient-based optimization algorithm for two-layer neural networks in the mean field regime that achieves exponential convergence rate in regularized empirical risk minimization. The proposed algorithm can be regarded as an infinite dimensional extension of Stochastic Dual Coordinate Ascent (SDCA) in the probability space: we exploit the convexity of the dual problem, for which the coordinate-wise proximal gradient method can be applied. Our proposed method inherits advantages of the original SDCA, including (i) exponential convergence (with respect to the outer iteration steps), and (ii) better dependency on the sample size and condition number than the full-batch gradient method. One technical challenge in implementing the SDCA update is the intractable integral over the entire parameter space at every step. To overcome this limitation, we propose a tractable \\textit{particle method} that approximately solves the dual problem, and an importance re-weighted technique to reduce the computational cost. 
The convergence rate of our method is verified by numerical experiments.", "keywords": "Neural Network Optimization;Mean field Regime;Overparameterization", "primary_area": "", "supplementary_material": "/attachment/f26eeda2e85718aa3a423b67a8ce2f90a1e99ae4.zip", "author": "Kazusato Oko;Taiji Suzuki;Atsushi Nitanda;Denny Wu", "authorids": "~Kazusato_Oko1;~Taiji_Suzuki1;~Atsushi_Nitanda1;~Denny_Wu2", "gender": "M;M;M;M", "homepage": ";http://ibis.t.u-tokyo.ac.jp/suzuki/;https://sites.google.com/site/atsushinitanda;https://dennywu1.github.io/", "dblp": ";08/312;155/1884;", "google_scholar": ";x8osrBsAAAAJ;https://scholar.google.co.jp/citations?user=LyVvaf8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": "kazusatooko/;;;", "or_profile": "~Kazusato_Oko1;~Taiji_Suzuki1;~Atsushi_Nitanda1;~Denny_Wu2", "aff": "The University of Tokyo;The University of Tokyo;Kyushu Institute of Technology;University of Toronto", "aff_domain": "u-tokyo.ac.jp;tokyo.ac.jp;kyutech.ac.jp;toronto.edu", "position": "Undergrad student;Associate Professor;Associate Professor;PhD student", "bibtex": "@inproceedings{\noko2022particle,\ntitle={Particle Stochastic Dual Coordinate Ascent: Exponential convergent algorithm for mean field neural network optimization},\nauthor={Kazusato Oko and Taiji Suzuki and Atsushi Nitanda and Denny Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PQQp7AJwz3}\n}", "github": "", "project": "", "reviewers": "pSnQ;CmPk;M8tQ;ny66", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;2;3", "correctness": "4;4;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "0;3;0;0", "wc_summary_paper": "63;46;82;98", "wc_summary_review": "47;34;42;82", "wc_main_review": "179;251;239;456", "wc_review": "289;331;363;636", "wc_reply_reviewers": "0;0;0;94", "wc_reply_authors": "475;321;467;568", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 72.25, 19.57517560585345 ], "wc_summary_review_avg": [ 51.25, 18.34904629674251 ], "wc_main_review_avg": [ 281.25, 104.5140540788654 ], "wc_review_avg": [ 404.75, 136.0668493792665 ], "wc_reply_reviewers_avg": [ 23.5, 40.703193977868615 ], "wc_reply_authors_avg": [ 457.75, 88.37243631359271 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14351740232233268638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=PQQp7AJwz3", "email": "u-tokyo.ac.jp;tokyo.ac.jp;kyutech.ac.jp;toronto.edu", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Tokyo;Kyushu Institute of Technology;University of Toronto", "aff_unique_dep": ";;", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.kyutech.ac.jp;https://www.utoronto.ca", "aff_unique_abbr": "UTokyo;Kyutech;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Japan;Canada" }, { "title": "On feature learning 
in neural networks with global convergence guarantees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6698", "id": "PQTW3iG4sC-", "poster": "", "openreview": "https://openreview.net/forum?id=PQTW3iG4sC-", "slides": "https://iclr.cc/virtual/2022/poster/6698", "video": "https://iclr.cc/virtual/2022/poster/6698", "author_site": "Zhengdao Chen, Eric Vanden-Eijnden, Joan Bruna", "tldr": "", "abstract": "We study the gradient flow optimization of over-parameterized neural networks (NNs) in a setup that allows feature learning while admitting non-asymptotic global convergence guarantees. First, we prove that for wide shallow NNs under the mean-field (MF) scaling and with a general class of activation functions, when the input dimension is at least the size of the training set, the training loss converges to zero at a linear rate under gradient flow. Building upon this analysis, we study a model of wide multi-layer NNs with random and untrained weights in earlier layers, and also prove a linear-rate convergence of the training loss to zero, regardless of the input dimension. We also show empirically that, unlike in the Neural Tangent Kernel (NTK) regime, our multi-layer model exhibits feature learning and can achieve better generalization performance than its NTK counterpart.", "keywords": "neural networks;feature learning;gradient descent;global convergence", "primary_area": "", "supplementary_material": "", "author": "Zhengdao Chen;Eric Vanden-Eijnden;Joan Bruna", "authorids": "~Zhengdao_Chen1;~Eric_Vanden-Eijnden1;~Joan_Bruna1", "gender": ";M;M", "homepage": ";https://wp.nyu.edu/courantinstituteofmathematicalsciences-eve2/;http://cims.nyu.edu/~bruna", "dblp": ";88/7927;44/8776", "google_scholar": ";A5Gx65gAAAAJ;L4bNmsMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhengdao_Chen1;~Eric_Vanden-Eijnden1;~Joan_Bruna1", "aff": ";New York University;New York University", "aff_domain": ";nyu.edu;nyu.edu", "position": ";Full Professor;Associate Professor", "bibtex": "@inproceedings{\nchen2022on,\ntitle={On feature learning in neural networks with global convergence guarantees},\nauthor={Zhengdao Chen and Eric Vanden-Eijnden and Joan Bruna},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PQTW3iG4sC-}\n}", "github": "", "project": "", "reviewers": "48yw;zizE;XJCF;jNHj", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;3;2;4", "correctness": "1;4;4;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;2;1;0", "wc_summary_paper": "90;82;156;54", "wc_summary_review": "25;18;21;32", "wc_main_review": "181;218;511;463", "wc_review": "296;318;688;549", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 95.5, 37.399866309921485 ], "wc_summary_review_avg": [ 24.0, 5.244044240850758 ], "wc_main_review_avg": [ 343.25, 145.33818321418497 ], "wc_review_avg": [ 462.75, 163.50439596536847 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.47886115464444223, 
"corr_recommendation_correctness": 0.7980074688861063, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8950111879472338132&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=PQTW3iG4sC-", "email": ";nyu.edu;nyu.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "PQTkBlcrRs", "title": "AutoML to generate ensembles of deep neural networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Automated Machine Learning with ensembling seeks to automatically build ensembles of Deep Neural Networks (DNNs) to achieve qualitative predictions. \nAutoML and Ensemble of Deep Neural Network produce qualitative results but they are computing intensive methods in both building and inference run time. Therefore, an ideal method would produce at one AutoML run time different ensembles regarding accuracy and inference speed regarding the desired trade-off. \nDespite multiple initiative for non-deep machine learning have been proposed there still no consensus on how to automatically construct efficient ensembles of deep neural networks.\nFirst, we propose a new multi-objective ensemble selection method to generate efficient ensembles by controlling their computing cost named SMOBF. Second, we propose an AutoML workflow using Hyperband to generate DNNs, SMOBF to combine DNNs and the simple averaging as combination rule. Finally we compare this AutoML workflow to several baselines and its inherent characteristics are discussed.\nIt shows robust results leveraging multiple GPUs on two datasets but can be applied beyond.", "keywords": "Deep Learning;Ensemble;AutoML", "primary_area": "", "supplementary_material": "", "author": "Pierrick Pochelu;Serge G. Petiton;Bruno Conche", "authorids": "~Pierrick_Pochelu1;serge.petiton@univ-lille.fr;bruno.conche@total.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npochelu2022automl,\ntitle={Auto{ML} to generate ensembles of deep neural networks},\nauthor={Pierrick Pochelu and Serge G. 
Petiton and Bruno Conche},\nyear={2022},\nurl={https://openreview.net/forum?id=PQTkBlcrRs}\n}", "github": "", "project": "", "reviewers": "mMZN;KXYo;axYE;ERfA", "site": "https://openreview.net/forum?id=PQTkBlcrRs", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;5;3", "correctness": "2;3;2;3", "technical_novelty": "1;2;2;1", "empirical_novelty": "1;2;1;1", "wc_summary_paper": "62;84;109;132", "wc_summary_review": "28;60;64;12", "wc_main_review": "438;344;708;48", "wc_review": "528;488;881;192", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.75, 26.280934153869037 ], "wc_summary_review_avg": [ 41.0, 21.79449471770337 ], "wc_main_review_avg": [ 384.5, 235.7896308152672 ], "wc_review_avg": [ 522.25, 244.42419581538977 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HjW3v8AlP3kJ:scholar.google.com/&scioq=AutoML+to+generate+ensembles+of+deep+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Constrained Policy Optimization via Bayesian World Models", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6499", "id": "PRZoSmCinhf", "poster": "", "openreview": "https://openreview.net/forum?id=PRZoSmCinhf", "slides": "https://iclr.cc/virtual/2022/poster/6499", "video": "https://iclr.cc/virtual/2022/poster/6499", "author_site": "Yarden As, Ilnura Usmanova, Sebastian Curi, Andreas Krause", "tldr": "", "abstract": "Improving sample-efficiency and safety are crucial challenges when deploying reinforcement learning in high-stakes real world applications. We propose LAMBDA, a novel model-based approach for policy optimization in safety critical tasks modeled via constrained Markov decision processes. Our approach utilizes Bayesian world models, and harnesses the resulting uncertainty to maximize optimistic upper bounds on the task objective, as well as pessimistic upper bounds on the safety constraints. 
We demonstrate LAMBDA's state of the art performance on the Safety-Gym benchmark suite in terms of sample efficiency and constraint violation.", "keywords": "Reinforcement learning;Constrained Markov decision processes;Constrained policy optimization;Bayesian model-based RL", "primary_area": "", "supplementary_material": "", "author": "Yarden As;Ilnura Usmanova;Sebastian Curi;Andreas Krause", "authorids": "~Yarden_As1;~Ilnura_Usmanova1;~Sebastian_Curi1;~Andreas_Krause1", "gender": "M;F;M;M", "homepage": "https://github.com/yardenas;https://sites.google.com/view/ilnurausmanova/main;;https://las.inf.ethz.ch/krausea", "dblp": "312/4578;;213/3055;87/1831-1.html", "google_scholar": ";;;https://scholar.google.ch/citations?user=eDHv58AAAAAJ", "orcid": ";;;0000-0001-7260-9673", "linkedin": "yardenas/;;;krausea/", "or_profile": "~Yarden_As1;~Ilnura_Usmanova1;~Sebastian_Curi1;~Andreas_Krause1", "aff": "Department of Computer Science, ETHZ - ETH Zurich;ETHZ - ETH Zurich;Swiss Federal Institute of Technology;ETH Zurich", "aff_domain": "inf.ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nas2022constrained,\ntitle={Constrained Policy Optimization via Bayesian World Models},\nauthor={Yarden As and Ilnura Usmanova and Sebastian Curi and Andreas Krause},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PRZoSmCinhf}\n}", "github": "", "project": "", "reviewers": "E48X;HHZf;Z22c;11DZ", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;3;4", "correctness": "3;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "66;54;117;191", "wc_summary_review": "20;49;30;71", "wc_main_review": "427;293;315;441", "wc_review": "513;396;462;703", "wc_reply_reviewers": "0;0;13;0", "wc_reply_authors": "748;267;250;517", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.0, 53.95831724581485 ], "wc_summary_review_avg": [ 42.5, 19.474342094150447 ], "wc_main_review_avg": [ 369.0, 65.6505902486794 ], "wc_review_avg": [ 518.5, 114.31207285322054 ], "wc_reply_reviewers_avg": [ 3.25, 5.629165124598851 ], "wc_reply_authors_avg": [ 445.5, 204.1451689362254 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15728158487087331451&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=PRZoSmCinhf", "email": "inf.ethz.ch;ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Inductive Relation Prediction Using Analogy Subgraph Embeddings", "status": "Poster", 
"track": "main", "site": "https://iclr.cc/virtual/2022/poster/5914", "id": "PTRo58zPt3P", "poster": "", "openreview": "https://openreview.net/forum?id=PTRo58zPt3P", "slides": "https://iclr.cc/virtual/2022/poster/5914", "video": "https://iclr.cc/virtual/2022/poster/5914", "author_site": "Jiarui Jin, Yangkun Wang, Kounianhua Du, Weinan Zhang, Zheng Zhang, David Wipf, Yong Yu, Quan Gan", "tldr": "", "abstract": "Prevailing methods for relation prediction in heterogeneous graphs aim at learning latent representations (i.e., embeddings) of observed nodes and relations, and thus are limited to the transductive setting where the relation types must be known during training. Here, we propose ANalogy SubGraphEmbeddingLearning (GraphANGEL), a novel relation prediction framework that predicts relations5between each node pair based on the subgraphs containing the pair, as well as other (analogy) subgraphs with the same graph patterns. Each graph pattern explicitly represents a specific logical rule, which contributes to an inductive bias that facilitates generalization to unseen relations and leads to more explainable predictive models. Moreover, our method also removes the limited neighborhood constraint of graph neural networks. Our model consistently outperforms existing models on heterogeneous graph based recommendation as well as knowledge graph completion. We also empirically demonstrate our model\u2019s capability in generalizing to new relations while producing explainable heat maps of attention scores across the discovered logic.", "keywords": "Link Prediction;Relation Modelling;Heterogeneous Graphs;Knowledge Graphs", "primary_area": "", "supplementary_material": "", "author": "Jiarui Jin;Yangkun Wang;Kounianhua Du;Weinan Zhang;Zheng Zhang;David Wipf;Yong Yu;Quan Gan", "authorids": "~Jiarui_Jin1;~Yangkun_Wang1;~Kounianhua_Du1;~Weinan_Zhang1;~Zheng_Zhang1;~David_Wipf1;~Yong_Yu1;~Quan_Gan1", "gender": "M;;F;M;M;M;;M", "homepage": "https://jinjiarui.github.io/;;;http://wnzhang.net;https://shanghai.nyu.edu/academics/faculty/directory/zheng-zhang;http://www.davidwipf.com/;https://apex.sjtu.edu.cn/members/yyu;", "dblp": "241/9563;;268/8094;28/10261-1;;81/6421;43/5685.html;72/3872", "google_scholar": "unCPHQEAAAAJ;;https://scholar.google.com/citations?view_op=list_works;Qzss0GEAAAAJ;https://scholar.google.com.hk/citations?user=k0KiE4wAAAAJ;YJx1WSgAAAAJ;;", "orcid": "0000-0001-6458-1586;;;0000-0002-0127-2425;;;0000-0003-4457-2820;0009-0002-0986-457X", "linkedin": "jiarui-jerry-jin-ba4a84176/;;;;;;;quan-gan-231992136/", "or_profile": "~Jiarui_Jin1;~Yangkun_Wang1;~Kounianhua_Du1;~Weinan_Zhang1;~Zheng_Zhang1;~David_Wipf1;~Yong_Yu1;~Quan_Gan1", "aff": "Shanghai Jiaotong University;;Shanghai Jiaotong University;Shanghai Jiaotong University;Amazon;Amazon AI Research Lab;Shanghai Jiaotong University;Amazon", "aff_domain": "sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;amazon.com;amazon.com;sjtu.edu.cn;amazon.com", "position": "PhD student;;MS student;Associate Professor;Senior Principal Scientist;Principal Research Scientist;Full Professor;Researcher", "bibtex": "@inproceedings{\njin2022inductive,\ntitle={Inductive Relation Prediction Using Analogy Subgraph Embeddings},\nauthor={Jiarui Jin and Yangkun Wang and Kounianhua Du and Weinan Zhang and Zheng Zhang and David Wipf and Yong Yu and Quan Gan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PTRo58zPt3P}\n}", "github": "", "project": "", "reviewers": "y1cR;fant;jqL2;1Ki2;3Zrj", "pdf_size": 0, 
"recommendation": "8;8;8;8;8", "confidence": "4;5;4;5;3", "correctness": "4;4;4;4;3", "technical_novelty": "2;3;3;4;3", "empirical_novelty": "2;0;2;4;3", "wc_summary_paper": "204;102;192;166;76", "wc_summary_review": "139;56;281;56;47", "wc_main_review": "686;322;371;215;250", "wc_review": "1029;480;844;437;373", "wc_reply_reviewers": "0;95;35;0;31", "wc_reply_authors": "902;1065;460;302;797", "reply_reviewers": "0;1;1;0;1", "reply_authors": "2;2;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.2, 1.32664991614216 ], "wc_summary_paper_avg": [ 148.0, 50.39047529047528 ], "wc_summary_review_avg": [ 115.8, 89.12328539725182 ], "wc_main_review_avg": [ 368.8, 167.67754769199124 ], "wc_review_avg": [ 632.6, 257.2007776038012 ], "wc_reply_reviewers_avg": [ 32.2, 34.71829488900629 ], "wc_reply_authors_avg": [ 705.2, 282.5975229898521 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17169053957586581937&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=PTRo58zPt3P", "email": "sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;amazon.com;amazon.com;sjtu.edu.cn;amazon.com", "author_num": 8, "aff_unique_index": "0;0;0;1;1;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.amazon.com", "aff_unique_abbr": "SJTU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0;1", "aff_country_unique": "China;United States" }, { "id": "PU3VGS93gxD", "title": "Sample Complexity of Deep Active Learning", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Many machine learning algorithms require large numbers of labeled training data to deliver state-of-the-art results. However, in many domains of AI, there are abundant unlabeled data but it is costly to get data labeled by experts, such as medical diagnosis and fraud detection. In these domains, active learning, where an algorithm maximizes model accuracy while requiring the least number of labeled data, is appealing.\nActive learning uses both labeled and unlabeled data to train models, and the learning algorithm decides which subset of data should acquire labels.\nDue to the costly label acquisition, it is interesting to know whether it is possible from a theoretical perspective to understand how many labeled data are actually needed to train a machine learning model. This question is known as the sample complexity problem, and it has been extensively explored for training linear machine learning models (e.g., linear regression). Today, deep learning has become the de facto method for machine learning, but the sample complexity problem for deep active learning remains unsolved. This problem is challenging due to the non-linear nature of neural networks.\nIn this paper, we present the first deep active learning algorithm which has a provable sample complexity. Using this algorithm, we have derived the first upper bound on the number of required labeled data for training neural networks. 
\nOur upper bound shows that the minimum number of labeled data a neural net needs does not depend on the data distribution or the width of the neural network but is determined by the smoothness of non-linear activation and the dimension of the input data.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1eb7aa263abafb1bca61e9ed3a7f8ee59a647185.zip", "author": "Zhao Song;Baocheng Sun;Danyang Zhuo", "authorids": "~Zhao_Song6;~Baocheng_Sun1;~Danyang_Zhuo1", "gender": ";M;M", "homepage": ";https://danyangzhuo.com/;https://www.youtube.com/@zhaosong2031", "dblp": "52/11245-2;151/7537;76/4051-2", "google_scholar": ";E3yOuvEAAAAJ;yDZct7UAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Baocheng_Sun1;~Danyang_Zhuo1;~Zhao_Song3", "aff": "School of Computer Science;Duke University;Adobe", "aff_domain": "buaa.edu;duke.edu;adobe.com", "position": "Undergrad student;Assistant Professor;Researcher", "bibtex": "@misc{\nsong2022sample,\ntitle={Sample Complexity of Deep Active Learning},\nauthor={Zhao Song and Baocheng Sun and Danyang Zhuo},\nyear={2022},\nurl={https://openreview.net/forum?id=PU3VGS93gxD}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=PU3VGS93gxD", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KFNzr8WxoUoJ:scholar.google.com/&scioq=Sample+Complexity+of+Deep+Active+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "School of Computer Science;Duke University;Adobe", "aff_unique_dep": "Computer Science;;Adobe Inc.", "aff_unique_url": ";https://www.duke.edu;https://www.adobe.com", "aff_unique_abbr": ";Duke;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";United States" }, { "id": "PUrOJvOuSM1", "title": "A2B-GAN: Utilizing Unannotated Anomalous Images for Anomaly Detection in Medical Image Analysis", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Automated anomaly detection in medical images can significantly reduce human effort in disease diagnosis. Owing to the complexity in modeling anomalies and the high cost of manual annotation by domain experts, a typical technique in the current literature is to employ only data from healthy subjects to derive the model for normal images and then to detect anomalies as outliers to this model. In many real applications, mixed datasets with both normal and potential abnormal images (e.g., images of patients with confirmed diseases) are abundant. 
This paper poses the research question of how to improve anomaly detection by using an unannotated set of mixed images of both normal and anomalous samples (in addition to a set of normal images from healthy subjects). We propose a novel one-directional image-to-image translation method named A2B-GAN, which learns to translate any images to only normal images (hence \u201cone-directional\u201d). This alleviates the requirement of direct cycle consistency of existing unpaired image-to-image translation methods, which is unattainable with unannotated data. Once the translation is learned, we generate a difference map for any given image by subtracting its translated output. Regions of significant responses in the difference map correspond to potential anomalies (if any). In terms of average AUC, our A2B-GAN outperforms the state-of-the-art methods by 0.1 points (approximately 16.25%) on two medical imaging datasets (COVID-19 detection and Cardiomegaly detection) by utilizing an unannotated set mixed with anomalies. Our code is available for public release upon the paper decision.", "keywords": "Anomaly detection;novelty detection;image-to-image translation;generative adversarial network;medical image analysis", "primary_area": "", "supplementary_material": "", "author": "Md Mahfuzur Rahman Siddiquee;Teresa Wu;Baoxin Li", "authorids": "~Md_Mahfuzur_Rahman_Siddiquee1;~Teresa_Wu1;~Baoxin_Li1", "gender": "M;F;", "homepage": ";http://faculty.engineering.asu.edu/twu/;", "dblp": "165/9403;;", "google_scholar": "Wjo6pwIAAAAJ;XJvLrVAAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Md_Mahfuzur_Rahman_Siddiquee1;~Teresa_Wu1;~Baoxin_Li1", "aff": "Arizona State University;Arizona State University;", "aff_domain": "asu.edu;asu.edu;", "position": "PhD student;Full Professor;", "bibtex": "@misc{\nsiddiquee2022abgan,\ntitle={A2B-{GAN}: Utilizing Unannotated Anomalous Images for Anomaly Detection in Medical Image Analysis},\nauthor={Md Mahfuzur Rahman Siddiquee and Teresa Wu and Baoxin Li},\nyear={2022},\nurl={https://openreview.net/forum?id=PUrOJvOuSM1}\n}", "github": "", "project": "", "reviewers": "v3am;VxiX;xjzs;8ykh", "site": "https://openreview.net/forum?id=PUrOJvOuSM1", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;5;4", "correctness": "2;2;2;2", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "179;112;90;76", "wc_summary_review": "57;122;21;42", "wc_main_review": "689;409;585;614", "wc_review": "925;643;696;732", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 114.25, 39.52451770736741 ], "wc_summary_review_avg": [ 60.5, 37.73923687622737 ], "wc_main_review_avg": [ 574.25, 102.67759005742198 ], "wc_review_avg": [ 749.0, 106.43072864544337 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3938041904766938531&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": 
"Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "PVB_t0HCMVC", "title": "Towards Defending Multiple $\\ell_p$-Norm Bounded Adversarial Perturbations via Gated Batch Normalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "There has been extensive evidence demonstrating that deep neural networks are vulnerable to adversarial examples, which motivates the development of defenses against adversarial attacks. Existing adversarial defenses typically improve model robustness against individual-specific perturbation types. However, adversaries are likely to generate multiple perturbations in practice. Some recent methods improve model robustness against adversarial attacks in multiple $\\ell_p$ balls, but their performance against each perturbation type is still far from satisfactory. We observe that different $\\ell_p$ bounded adversarial perturbations induce different statistical properties that can be separated and characterized by the statistics of Batch Normalization (BN). We thus propose Gated BN (GBN) to adversarially train a perturbation-invariant predictor for defending multiple $\\ell_p$ bounded adversarial perturbations. GBN consists of a multi-branch BN layer and a gated sub-network. Each BN branch in GBN is in charge of one perturbation type to ensure that the normalized output is aligned towards learning perturbation-invariant representation. Meanwhile, the gated sub-network is designed to separate inputs added with different perturbation types. We perform an extensive evaluation of our approach on MNIST, CIFAR-10, and Tiny-ImageNet, and demonstrate that GBN outperforms previous defense proposals against multiple perturbation types (\\ie, $\\ell_1$, $\\ell_2$, and $\\ell_{\\infty}$ perturbations) by large margins of 10-20\\%.", "keywords": "adversarial examples;multiple $\\ell_p$-norm bounded adversarial peturbations;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Aishan Liu;Shiyu Tang;Xianglong Liu;Xinyun Chen;Lei Huang;Haotong Qin;Dawn Song;Dacheng Tao", "authorids": "~Aishan_Liu1;~Shiyu_Tang1;~Xianglong_Liu2;~Xinyun_Chen1;~Lei_Huang1;~Haotong_Qin1;~Dawn_Song1;~Dacheng_Tao1", "gender": "M;M;M;;M;M;F;", "homepage": "https://liuaishan.github.io/;https://www.sytang.com;http://www.nlsde.buaa.edu.cn/~xlliu;;https://huangleibuaa.github.io/;https://htqin.github.io/;;", "dblp": "177/5658;;55/7901;;18/1763-15;262/3626.html;s/DXSong;", "google_scholar": "88tzr_sAAAAJ;;https://scholar.google.com.hk/citations?user=8VY7ZDcAAAAJ;;https://scholar.google.com.hk/citations?user=yTshbKkAAAAJ;mK6n-KgAAAAJ;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Aishan_Liu1;~Shiyu_Tang1;~Xianglong_Liu2;~Xinyun_Chen1;~Lei_Huang1;~Haotong_Qin1;~Dawn_Song1;~Dacheng_Tao1", "aff": "Beihang University;;Beihang University;;Beihang University;Beihang University;University of California, Berkeley;", "aff_domain": "buaa.edu.cn;;buaa.edu.cn;;buaa.edu.cn;buaa.edu.cn;berkeley.edu;", "position": "Assistant Professor;;Associate Professor;;Associate Professor;PhD student;Full Professor;", "bibtex": "@misc{\nliu2022towards,\ntitle={Towards Defending Multiple \\${\\textbackslash}ell\\_p\\$-Norm Bounded Adversarial Perturbations via Gated Batch Normalization},\nauthor={Aishan Liu and Shiyu Tang and Xianglong Liu and Xinyun Chen and Lei Huang and 
Haotong Qin and Dawn Song and Dacheng Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=PVB_t0HCMVC}\n}", "github": "", "project": "", "reviewers": "84MR;w2td;nbzi;hDFj", "site": "https://openreview.net/forum?id=PVB_t0HCMVC", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;5", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "55;112;45;219", "wc_summary_review": "33;14;10;58", "wc_main_review": "513;217;254;986", "wc_review": "601;343;309;1263", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.75, 69.12805146971814 ], "wc_summary_review_avg": [ 28.75, 18.9917745353087 ], "wc_main_review_avg": [ 492.5, 306.8977842865601 ], "wc_review_avg": [ 629.0, 383.05874223152773 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13588260549652361610&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Beihang University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.berkeley.edu", "aff_unique_abbr": "BUAA;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "CoMPS: Continual Meta Policy Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6693", "id": "PVJ6j87gOHz", "poster": "", "openreview": "https://openreview.net/forum?id=PVJ6j87gOHz", "slides": "https://iclr.cc/virtual/2022/poster/6693", "video": "https://iclr.cc/virtual/2022/poster/6693", "author_site": "Glen Berseth, Zhiwei Zhang, Grace Zhang, Chelsea Finn, Sergey Levine", "tldr": "", "abstract": "We develop a new continual meta-learning method to address challenges in sequential multi-task learning. In this setting, the agent's goal is to achieve high reward over any sequence of tasks quickly. Prior meta-reinforcement learning algorithms have demonstrated promising results in accelerating the acquisition of new tasks. However, they require access to all tasks during training. Beyond simply transferring past experience to new tasks, our goal is to devise continual reinforcement learning algorithms that learn to learn, using their experience on previous tasks to learn new tasks more quickly. We introduce a new method, continual meta-policy search (CoMPS), that removes this limitation by meta-training in an incremental fashion, over each task in a sequence, without revisiting prior tasks. CoMPS continuously repeats two subroutines: learning a new task using RL and using the experience from RL to perform completely offline meta-learning to prepare for subsequent task learning. 
We find that CoMPS outperforms prior continual learning and off-policy meta-reinforcement methods on several sequences of challenging continuous control tasks.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Glen Berseth;Zhiwei Zhang;Grace Zhang;Chelsea Finn;Sergey Levine", "authorids": "~Glen_Berseth1;~Zhiwei_Zhang5;grace.zhang@berkeley.edu;~Chelsea_Finn1;~Sergey_Levine1", "gender": "M;;;F;M", "homepage": "http://fracturedplane.com/;;;https://ai.stanford.edu/~cbfinn/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "147/5478;;;131/1783;80/7594", "google_scholar": "https://scholar.google.ca/citations?user=-WZcuuwAAAAJ;;;vfPE6hgAAAAJ;8R35rCwAAAAJ", "orcid": "0000-0001-7351-8028;;;;", "linkedin": "glen-berseth-0523278b?trk=hp-identity-name;zhiwei-z-940645b4/;;;", "or_profile": "~Glen_Berseth1;~Zhiwei_Zhang5;grace.zhang@berkeley.edu;~Chelsea_Finn1;~Sergey_Levine1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;University of California, Berkeley;;Google;Google", "aff_domain": "mila.umontreal.ca;berkeley.edu;;google.com;google.com", "position": "Assistant Professor;Undergrad student;;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nberseth2022comps,\ntitle={Co{MPS}: Continual Meta Policy Search},\nauthor={Glen Berseth and Zhiwei Zhang and Grace Zhang and Chelsea Finn and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PVJ6j87gOHz}\n}", "github": "", "project": "", "reviewers": "1xJQ;cHnK;FZ5X;S7vL;8AHL", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;2;3;4;3", "correctness": "3;3;3;4;3", "technical_novelty": "2;2;1;4;2", "empirical_novelty": "2;2;3;4;3", "wc_summary_paper": "112;119;81;110;79", "wc_summary_review": "25;56;76;40;70", "wc_main_review": "276;249;462;751;327", "wc_review": "413;424;619;901;476", "wc_reply_reviewers": "54;177;55;229;188", "wc_reply_authors": "874;1433;1182;1471;981", "reply_reviewers": "1;1;1;3;2", "reply_authors": "4;5;4;4;5", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.9797958971132712 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 100.2, 16.773789077009404 ], "wc_summary_review_avg": [ 53.4, 18.863721796082555 ], "wc_main_review_avg": [ 413.0, 184.23137626365386 ], "wc_review_avg": [ 566.6, 182.59529019117662 ], "wc_reply_reviewers_avg": [ 140.6, 72.40607709301754 ], "wc_reply_authors_avg": [ 1188.2, 237.31616042739273 ], "reply_reviewers_avg": [ 1.6, 0.8 ], "reply_authors_avg": [ 4.4, 0.4898979485566356 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.24397501823713333, "corr_recommendation_correctness": 0.4564354645876385, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4365648142789974013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=PVJ6j87gOHz", "email": "mila.umontreal.ca;berkeley.edu;;google.com;google.com", "author_num": 5, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "University of Montreal;University of California, Berkeley;Google", "aff_unique_dep": "Montreal Institute for Learning Algorithms;;Google", "aff_unique_url": "https://www.mila.quebec;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": 
"MILA;UC Berkeley;Google", "aff_campus_unique_index": "0;1;2;2", "aff_campus_unique": "Montreal;Berkeley;Mountain View", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;United States" }, { "id": "PZoy8i_Dp6", "title": "Attention-based Feature Aggregation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Capturing object instances in different scales is a long-standing problem in the tasks of visual recognition, e.g., object detection and instance segmentation. The conventional way is to learn scale-invariant features, e.g., by summing up the feature maps output by different layers in the backbone. In this paper, we propose a novel and adaptive feature aggregation module based on attention where the attention parameters can be learned to handle different situations, e.g., adding shallow layers is learned to be conservative to mitigate the effect of noisy pixels, while for deep layers, it tends to be audacious to incorporate high-level semantics. To implement this module, we define two variants of attention: self-attention on the summed-up feature map, and cross-attention between two feature maps before summed up. The former uses the aggregated pixel values to capture global attention (to improve the feature for the next layer of aggregation), while the latter allows attention-based interactions between two features before aggregation. In addition, we apply multi-scale pooling in our attention module to reduce computational costs, and thus call the two variants Multi-Scale Self-Attention (MSSA) and Multi-Scale Cross-Attention (MSCA), respectively. We incorporate each variant into multiple baselines, e.g., the state-of-the-art object recognizer Cascade Mask-RCNN, and evaluate them on MSCOCO and LVIS datasets. Results show our significant improvements over baselines, e.g., boosting Cascade Mask-RCNN by 2.2% for AP^box and 2.7% for AP^mask on the MSCOCO dataset.", "keywords": "Deep Learning;Object Detection;Instance Segmentation;Attention", "primary_area": "", "supplementary_material": "", "author": "Xiongwei Wu;Ee-Peng Lim;Steven HOI;Qianru Sun", "authorids": "~Xiongwei_Wu1;~Ee-Peng_Lim1;~Steven_HOI1;~Qianru_Sun2", "gender": "M;M;F;M", "homepage": "https://sis.smu.edu.sg/faculty/profile/9626;https://www.smu.edu.sg/faculty/profile/110831/Steven-HOI;https://qianrusun.com/;http://xiongweiwu.github.io/", "dblp": "l/EePengLim.html;h/StevenCHHoi;127/6132.html;172/1093", "google_scholar": "https://scholar.google.com.tw/citations?user=r0wOAikAAAAJ;https://scholar.google.com.tw/citations?user=JoLjflYAAAAJ;https://scholar.google.de/citations?user=fNfrGMIAAAAJ;https://scholar.google.com.hk/citations?user=24SViVMAAAAJ", "orcid": "0000-0003-0065-8665;;0000-0003-2689-317X;", "linkedin": ";;;", "or_profile": "~Ee-Peng_Lim1;~Steven_HOI1;~Qianru_Sun2;~Wu_Xiongwei1", "aff": "Singapore Management University;Singapore Management University;Singapore Management University;Singapore Management University", "aff_domain": "smu.edu.sg;;smu.edu.sg;smu.edu.sg", "position": "Full Professor;Associate Professor;Assistant Professor;Postdoc", "bibtex": "@misc{\nwu2022attentionbased,\ntitle={Attention-based Feature Aggregation},\nauthor={Xiongwei Wu and Ee-Peng Lim and Steven HOI and Qianru Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=PZoy8i_Dp6}\n}", "github": "", "project": "", "reviewers": "NNxX;LuYp;cz4p;R41h", "site": "https://openreview.net/forum?id=PZoy8i_Dp6", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;4;3;3", "correctness": "3;4;4;3", 
"technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "43;86;53;91", "wc_summary_review": "54;12;25;44", "wc_main_review": "429;235;98;168", "wc_review": "526;333;176;303", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 68.25, 20.632195714465293 ], "wc_summary_review_avg": [ 33.75, 16.315253599009733 ], "wc_main_review_avg": [ 232.5, 123.35821821021898 ], "wc_review_avg": [ 334.5, 125.28866668617717 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=370258956044876253&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Singapore Management University", "aff_unique_dep": "", "aff_unique_url": "https://www.smu.edu.sg", "aff_unique_abbr": "SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "id": "PaQhL90tLmX", "title": "Robust Deep Neural Networks for Heterogeneous Tabular Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although deep neural networks (DNNs) constitute the state-of-the-art in many tasks based on image, audio, or text data, their performance on heterogeneous, tabular data is typically inferior to that of decision tree ensembles. To bridge the gap between the difficulty of DNNs to handle tabular data and leverage the flexibility of deep learning under input heterogeneity, we propose DeepTLF, a framework for deep tabular learning. The core idea of our method is to transform the heterogeneous input data into homogeneous data to boost the performance of DNNs considerably. For the transformation step, we develop a novel knowledge distillations approach, TreeDrivenEncoder, which exploits the structure of decision trees trained on the available heterogeneous data to map the original input vectors onto homogeneous vectors that a DNN can use to improve the predictive performance. Through extensive and challenging experiments on various real-world datasets, we demonstrate that the DeepTLF pipeline leads to higher predictive performance. On average, our framework shows 19.6\\% performance improvement in comparison to DNNs. 
The DeepTLF code is publicly available.", "keywords": "Deep Learning;Deep Neural Networks;Tabular Data;Gradient Boosted Decision Trees", "primary_area": "", "supplementary_material": "/attachment/343cd8801679038a3aae65ff3dbcce5b031b1fc2.zip", "author": "Vadim Borisov;Klaus Broelemann;Enkelejda Kasneci;Gjergji Kasneci", "authorids": "~Vadim_Borisov1;~Klaus_Broelemann1;~Enkelejda_Kasneci1;~Gjergji_Kasneci2", "gender": ";;F;M", "homepage": "https://unnir.github.io;;https://www.edu.sot.tum.de/hctl/prof-dr-enkelejda-kasneci/;https://www.gov.sot.tum.de/rds/prof-dr-gjergji-kasneci/", "dblp": "137/9304;00/7271.html;08/1610;69/3216", "google_scholar": "gbwdmJEAAAAJ;;https://scholar.google.de/citations?user=bZVkVvoAAAAJ;Zbc8GK4AAAAJ", "orcid": ";;0000-0003-3146-4484;0000-0002-3123-7268", "linkedin": ";;;", "or_profile": "~Vadim_Borisov1;~Klaus_Broelemann1;~Enkelejda_Kasneci1;~Gjergji_Kasneci2", "aff": "University of Tuebingen;SCHUFA;University of Tuebingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;schufa.de;uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;Principal Researcher;Full Professor;Professor", "bibtex": "@misc{\nborisov2022robust,\ntitle={Robust Deep Neural Networks for Heterogeneous Tabular Data },\nauthor={Vadim Borisov and Klaus Broelemann and Enkelejda Kasneci and Gjergji Kasneci},\nyear={2022},\nurl={https://openreview.net/forum?id=PaQhL90tLmX}\n}", "github": "", "project": "", "reviewers": "Q6we;gWeP;vaip", "site": "https://openreview.net/forum?id=PaQhL90tLmX", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;5;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "95;32;123", "wc_summary_review": "51;26;71", "wc_main_review": "322;130;522", "wc_review": "468;188;716", "wc_reply_reviewers": "90;39;36", "wc_reply_authors": "999;626;822", "reply_reviewers": "1;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 83.33333333333333, 38.05551500403354 ], "wc_summary_review_avg": [ 49.333333333333336, 18.408935028645434 ], "wc_main_review_avg": [ 324.6666666666667, 160.044438273319 ], "wc_review_avg": [ 457.3333333333333, 215.68701604150607 ], "wc_reply_reviewers_avg": [ 55.0, 24.779023386727733 ], "wc_reply_authors_avg": [ 815.6666666666666, 152.34245049303303 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.18898223650461357, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18268165071479217778&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Tuebingen;SCHUFA Holding AG", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.schufa.de", "aff_unique_abbr": "Uni T\u00fcbingen;SCHUFA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "PeG-8G5ua3W", "title": "Normalized Attention Without Probability Cage", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the popularity of attention based 
architectures like Transformers, the geometrical implications of softmax-attention remain largely unexplored. In this work we highlight the limitations of constraining attention weights to the probability simplex and the resulting convex hull of value vectors. We show that Transformers are biased towards local information at initialization and sensitive to hyperparameters, contrast attention to max- and sum-pooling and show the performance implications of different architectures with respect to biases in the data. Finally, we propose to replace the softmax in self-attention with normalization, resulting in a generally applicable architecture that is robust to hyperparameters and biases in the data. We support our insights with empirical results from more than 30,000 trained models. Implementations are in the supplementary material.", "keywords": "Attention;Transformers;Neural Architecture;Aggregators", "primary_area": "", "supplementary_material": "/attachment/9e509b1e3104a0d5739b3f582203754022fce988.zip", "author": "Oliver Paul Richter;Roger Wattenhofer", "authorids": "~Oliver_Paul_Richter1;~Roger_Wattenhofer1", "gender": "M;Not Specified", "homepage": "https://disco.ethz.ch/members/richtero;https://disco.ethz.ch/members/wroger", "dblp": ";w/RogerWattenhofer", "google_scholar": ";https://scholar.google.ch/citations?user=EG3VPm4AAAAJ", "orcid": ";", "linkedin": ";roger-wattenhofer-4466731/", "or_profile": "~Oliver_Paul_Richter1;~Roger_Wattenhofer1", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch", "position": "PhD student;Full Professor", "bibtex": "@misc{\nrichter2022normalized,\ntitle={Normalized Attention Without Probability Cage},\nauthor={Oliver Paul Richter and Roger Wattenhofer},\nyear={2022},\nurl={https://openreview.net/forum?id=PeG-8G5ua3W}\n}", "github": "", "project": "", "reviewers": "xpRQ;rjM7;fpkN;BH5E", "site": "https://openreview.net/forum?id=PeG-8G5ua3W", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;3;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;4;3;4", "wc_summary_paper": "59;78;103;71", "wc_summary_review": "35;49;251;38", "wc_main_review": "232;454;72;446", "wc_review": "326;581;426;555", "wc_reply_reviewers": "0;13;0;0", "wc_reply_authors": "586;478;314;675", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 77.75, 16.08376510646683 ], "wc_summary_review_avg": [ 93.25, 91.22602424747008 ], "wc_main_review_avg": [ 301.0, 159.40200751558933 ], "wc_review_avg": [ 472.0, 102.71562685395051 ], "wc_reply_reviewers_avg": [ 3.25, 5.629165124598851 ], "wc_reply_authors_avg": [ 513.25, 134.5350790686206 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6746827621677936440&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Swiss Federal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "Pfj3SXBCbVQ", "title": "On the Effectiveness of Quasi Character-Level Models for Machine Translation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural Machine Translation (NMT) models often use subword-level vocabularies to deal with rare or unknown words. Although some studies have shown the effectiveness of purely character-based models, these approaches have resulted in highly expensive models in computational terms. In this work, we explore the advantages of quasi character-level Transformers for low-resource NMT, as well as their ability to mitigate the catastrophic forgetting problem. We first present an empirical study on the effectiveness of these models as a function of the size of the training set. As a result, we found that for data-poor environments, quasi character-level Transformers present a competitive advantage over their large subword-level versions. Similarly, we study the generalization of this phenomenon in different languages, domains, and neural architectures. Finally, we conclude this work by studying the ability of these models to mitigate the effects of catastrophic forgetting in machine translation. Our work suggests that quasi character-level Transformers have a competitive advantage in data-poor environments and, although they do not mitigate the catastrophic forgetting problem, they greatly help to achieve greater consistency between domains.", "keywords": "Deep learning;Neural Machine Translation;Subword-level vocabulary", "primary_area": "", "supplementary_material": "", "author": "Salvador Carri\u00f3n Ponz;Francisco Casacuberta Nolla", "authorids": "~Salvador_Carri\u00f3n_Ponz1;fcn@prhlt.upv", "gender": "M;", "homepage": "https://github.com/salvacarrion;", "dblp": ";", "google_scholar": "qXjy-qkAAAAJ;", "orcid": ";", "linkedin": "salva-carrion/;", "or_profile": "~Salvador_Carri\u00f3n_Ponz1;fcn@prhlt.upv", "aff": "Universitat Polit\u00e8cnica de Val\u00e8ncia;", "aff_domain": "upv.es;", "position": "PhD student;", "bibtex": "@misc{\nponz2022on,\ntitle={On the Effectiveness of Quasi Character-Level Models for Machine Translation},\nauthor={Salvador Carri{\\'o}n Ponz and Francisco Casacuberta Nolla},\nyear={2022},\nurl={https://openreview.net/forum?id=Pfj3SXBCbVQ}\n}", "github": "", "project": "", "reviewers": "5PDf;FBrF;7Xbu;K79J", "site": "https://openreview.net/forum?id=Pfj3SXBCbVQ", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;5;5;4", "correctness": "2;3;1;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "82;135;84;21", "wc_summary_review": "101;49;1;71", "wc_main_review": "251;394;219;175", "wc_review": "434;578;304;267", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.5, 40.38873605350878 ], "wc_summary_review_avg": [ 55.5, 36.479446267727255 ], "wc_main_review_avg": [ 259.75, 82.0712343028908 ], "wc_review_avg": [ 395.75, 122.1400323399335 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], 
"authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5879054806758807242&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Universitat Polit\u00e8cnica de Val\u00e8ncia", "aff_unique_dep": "", "aff_unique_url": "https://www.upv.es", "aff_unique_abbr": "UPV", "aff_country_unique_index": "0", "aff_country_unique": "Spain" }, { "title": "ComPhy: Compositional Physical Reasoning of Objects and Events from Videos", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6457", "id": "PgNEYaIc81Q", "poster": "", "openreview": "https://openreview.net/forum?id=PgNEYaIc81Q", "slides": "https://iclr.cc/virtual/2022/poster/6457", "video": "https://iclr.cc/virtual/2022/poster/6457", "author_site": "Zhenfang Chen, Kexin Yi, Yunzhu Li, Mingyu Ding, Antonio Torralba, Joshua B Tenenbaum, Chuang Gan", "tldr": "", "abstract": "Objects' motions in nature are governed by complex interactions and their properties. While some properties, such as shape and material, can be identified via the object's visual appearances, others like mass and electric charge are not directly visible. The compositionality between the visible and hidden properties poses unique challenges for AI models to reason from the physical world, whereas humans can effortlessly infer them with limited observations. Existing studies on video reasoning mainly focus on visually observable elements such as object appearance, movement, and contact interaction. In this paper, we take an initial step to highlight the importance of inferring the hidden physical properties not directly observable from visual appearances, by introducing the Compositional Physical Reasoning (ComPhy) dataset. For a given set of objects, ComPhy includes few videos of them moving and interacting under different initial conditions. The model is evaluated based on its capability to unravel the compositional hidden properties, such as mass and charge, and use this knowledge to answer a set of questions posted on one of the videos. Evaluation results of several state-of-the-art video reasoning models on ComPhy show unsatisfactory performance as they fail to capture these hidden properties. We further propose an oracle neural-symbolic framework named Compositional Physics Learner (CPL), combining visual perception, physical property learning, dynamic prediction, and symbolic execution into a unified framework. CPL can effectively identify objects' physical properties from their interactions and predict their dynamics to answer questions. ", "keywords": "Compositional;Intutive Physics;Video Reasoning;Neural-Symbolic", "primary_area": "", "supplementary_material": "", "author": "Zhenfang Chen;Kexin Yi;Yunzhu Li;Mingyu Ding;Antonio Torralba;Joshua B. 
Tenenbaum;Chuang Gan", "authorids": "~Zhenfang_Chen1;~Kexin_Yi1;~Yunzhu_Li1;~Mingyu_Ding1;~Antonio_Torralba1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "gender": "M;M;M;M;M;;M", "homepage": "https://zfchenunique.github.io;https://scholar.google.com/citations?user=SwxS_JkAAAAJ&hl=en;https://yunzhuli.github.io/;https://dingmyu.github.io/;http://web.mit.edu/torralba/www//;;http://people.csail.mit.edu/ganchuang/", "dblp": "207/5321;;182/1831;188/5243;t/AntonioBTorralba;t/JoshuaBTenenbaum;139/6993", "google_scholar": "QSRdIzAAAAAJ;;WlA92lcAAAAJ;w4yTWwoAAAAJ;https://scholar.google.com.tw/citations?user=8cxDHS4AAAAJ;;PTeSCbIAAAAJ", "orcid": ";;;0000-0001-6556-8359;;;", "linkedin": "\u632f\u65b9-\u9648-512011bb/;;;dingmyu/;;;", "or_profile": "~Zhenfang_Chen1;~Kexin_Yi1;~Yunzhu_Li1;~Mingyu_Ding1;~Antonio_Torralba1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "aff": "MIT-IBM Watson AI lab;;Massachusetts Institute of Technology;University of Hong Kong;Massachusetts Institute of Technology;Massachusetts Institute of Technology;MIT-IBM Watson AI Lab", "aff_domain": "ibm.com;;mit.edu;hku.hk;mit.edu;mit.edu;ibm.com", "position": "Researcher;;PhD student;PhD student;Full Professor;Professor;PhD student", "bibtex": "@inproceedings{\nchen2022comphy,\ntitle={ComPhy: Compositional Physical Reasoning of Objects and Events from Videos},\nauthor={Zhenfang Chen and Kexin Yi and Yunzhu Li and Mingyu Ding and Antonio Torralba and Joshua B. Tenenbaum and Chuang Gan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PgNEYaIc81Q}\n}", "github": "", "project": "", "reviewers": "3cQE;DJEq;8BUA;VByS", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "50;92;140;39", "wc_summary_review": "38;51;170;21", "wc_main_review": "517;414;484;151", "wc_review": "605;557;794;211", "wc_reply_reviewers": "0;209;290;0", "wc_reply_authors": "1123;1073;1671;534", "reply_reviewers": "0;1;1;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 80.25, 39.76414842543469 ], "wc_summary_review_avg": [ 70.0, 58.706899083497845 ], "wc_main_review_avg": [ 391.5, 143.74717388526287 ], "wc_review_avg": [ 541.75, 210.51054011616617 ], "wc_reply_reviewers_avg": [ 124.75, 127.99487294419258 ], "wc_reply_authors_avg": [ 1100.25, 402.3849990991215 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14248628071094436569&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=PgNEYaIc81Q", "email": "ibm.com;;mit.edu;hku.hk;mit.edu;mit.edu;ibm.com", "author_num": 7, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of Hong Kong", "aff_unique_dep": "IBM Watson AI lab;", "aff_unique_url": "https://www.mitibmwatsonailab.org;https://www.hku.hk", "aff_unique_abbr": "MIT-IBM AI Lab;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;China" }, { "id": 
"PiDkqc9saaL", "title": "Lower Bounds on the Robustness of Fixed Feature Extractors to Test-time Adversaries", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding the robustness of machine learning models to adversarial examples generated by test-time adversaries is a problem of great interest. Recent theoretical work has derived lower bounds on how robust \\emph{any model} can be, when a data distribution and attacker constraints are specified. However, these bounds only apply to arbitrary classification functions and do not account for specific architectures and models used in practice, such as neural networks. In this paper, we develop a methodology to analyze the robustness of fixed feature extractors, which in turn provide bounds on the robustness of any classifier trained on top of it. In other words, this indicates how robust the representation obtained from that extractor is with respect to a given adversary. Our bounds hold for arbitrary feature extractors. The tightness of these bounds relies on the effectiveness of the method used to find collisions between pairs of perturbed examples at deeper layers. For linear feature extractors, we provide closed-form expressions for collision finding while for arbitrary feature extractors, we propose a bespoke algorithm based on the iterative solution of a convex program that provably finds collisions. We utilize our bounds to identify the layers of robustly trained models that contribute the most to a lack of robustness, as well as compare the same layer across different training methods to provide a quantitative comparison of their relative robustness. Our experiments establish that each of the following lead to a measurable drop in robustness: i) layers that linearly reduce dimension, ii) sparsity induced by ReLU activations and, iii) mismatches in the attacker constraints at train and test time. 
These findings point towards future design considerations for robust models that arise from our methodology.", "keywords": "robustness;lower bounds", "primary_area": "", "supplementary_material": "/attachment/c888473bfbc1de37d1904d963f996a1739598da0.zip", "author": "Arjun Nitin Bhagoji;Daniel Cullina;Ben Zhao", "authorids": "~Arjun_Nitin_Bhagoji1;~Daniel_Cullina1;~Ben_Zhao1", "gender": ";;M", "homepage": ";;https://people.cs.uchicago.edu/~ravenben/", "dblp": ";04/7480;z/BenYZhao", "google_scholar": ";tmYOiO0AAAAJ;cYReSuEAAAAJ", "orcid": ";;", "linkedin": ";;ravenben/", "or_profile": "~Arjun_Nitin_Bhagoji1;~Daniel_Cullina1;~Ben_Zhao1", "aff": ";Pennsylvania State University;University of Chicago", "aff_domain": ";psu.edu;uchicago.edu", "position": ";Assistant Professor;Full Professor", "bibtex": "@misc{\nbhagoji2022lower,\ntitle={Lower Bounds on the Robustness of Fixed Feature Extractors to Test-time Adversaries},\nauthor={Arjun Nitin Bhagoji and Daniel Cullina and Ben Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=PiDkqc9saaL}\n}", "github": "", "project": "", "reviewers": "btdg;hh5w;nQgV;r9vz;8XE9", "site": "https://openreview.net/forum?id=PiDkqc9saaL", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "4;3;3;3;3", "correctness": "3;3;3;3;4", "technical_novelty": "3;3;3;2;3", "empirical_novelty": "3;3;2;2;3", "wc_summary_paper": "78;52;48;75;110", "wc_summary_review": "34;120;39;23;61", "wc_main_review": "551;271;322;545;350", "wc_review": "663;443;409;643;521", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 72.6, 22.195495038408133 ], "wc_summary_review_avg": [ 55.4, 34.58670264711569 ], "wc_main_review_avg": [ 407.8, 117.25766499466036 ], "wc_review_avg": [ 535.8, 102.54833006928976 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6123724356957947, "corr_recommendation_correctness": 0.9185586535436918, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:60ziLyyK5ucJ:scholar.google.com/&scioq=Lower+Bounds+on+the+Robustness+of+Fixed+Feature+Extractors+to+Test-time+Adversaries&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.uchicago.edu", "aff_unique_abbr": "PSU;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "CoST: Contrastive Learning of Disentangled Seasonal-Trend Representations for Time Series Forecasting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6829", "id": "PilZY3omXV2", "poster": "", "openreview": "https://openreview.net/forum?id=PilZY3omXV2", "slides": "https://iclr.cc/virtual/2022/poster/6829", "video": "https://iclr.cc/virtual/2022/poster/6829", "author_site": "Gerald Woo, Chenghao Liu, Doyen Sahoo, Akshat Kumar, Steven Hoi", "tldr": "", "abstract": "Deep learning has been actively 
studied for time series forecasting, and the mainstream paradigm is based on the end-to-end training of neural network architectures, ranging from classical LSTM/RNNs to more recent TCNs and Transformers. Motivated by the recent success of representation learning in computer vision and natural language processing, we argue that a more promising paradigm for time series forecasting is to first learn disentangled feature representations, followed by a simple regression fine-tuning step -- we justify such a paradigm from a causal perspective. Following this principle, we propose a new time series representation learning framework for long sequence time series forecasting named CoST, which applies contrastive learning methods to learn disentangled seasonal-trend representations. CoST comprises both time domain and frequency domain contrastive losses to learn discriminative trend and seasonal representations, respectively. Extensive experiments on real-world datasets show that CoST consistently outperforms the state-of-the-art methods by a considerable margin, achieving a 21.3% improvement in MSE on multivariate benchmarks. It is also robust to various choices of backbone encoders, as well as downstream regressors. Code is available at https://github.com/salesforce/CoST.", "keywords": "Time Series;Representation Learning;Forecasting;Self-Supervised Learning", "primary_area": "", "supplementary_material": "/attachment/5a493c5b5ff0437526b3b89dc4e96665a2dc7e1c.zip", "author": "Gerald Woo;Chenghao Liu;Doyen Sahoo;Akshat Kumar;Steven Hoi", "authorids": "~Gerald_Woo1;chenghao.liu@salesforce.com;~Doyen_Sahoo1;~Akshat_Kumar2;~Steven_Hoi2", "gender": "M;;M;M;M", "homepage": ";;https://www.linkedin.com/in/doyensahoo/?originalSubdomain=sg;http://www.smu.edu.sg/faculty/profile/102291/Akshat-KUMAR;http://stevenhoi.com", "dblp": "246/5297;;151/3155;73/193;", "google_scholar": ";;https://scholar.google.com.sg/citations?hl=en;https://scholar.google.com.tw/citations?user=zsYC3R0AAAAJ;JoLjflYAAAAJ", "orcid": ";;;;", "linkedin": "gerald-woo/;;doyensahoo/?originalSubdomain=sg;;", "or_profile": "~Gerald_Woo1;chenghao.liu@salesforce.com;~Doyen_Sahoo1;~Akshat_Kumar2;~Steven_Hoi2", "aff": "Singapore Management University;;SalesForce.com;Singapore Management University;Singapore Management University", "aff_domain": "smu.edu.sg;;salesforce.com;smu.edu.sg;smu.edu.sg", "position": "PhD student;;Researcher;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nwoo2022cost,\ntitle={Co{ST}: Contrastive Learning of Disentangled Seasonal-Trend Representations for Time Series Forecasting},\nauthor={Gerald Woo and Chenghao Liu and Doyen Sahoo and Akshat Kumar and Steven Hoi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PilZY3omXV2}\n}", "github": "", "project": "", "reviewers": "bLmw;KrXv;S8vv;97uN", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "51;117;72;128", "wc_summary_review": "49;17;24;37", "wc_main_review": "585;159;219;219", "wc_review": "685;293;315;384", "wc_reply_reviewers": "0;121;41;26", "wc_reply_authors": "3099;890;832;905", "reply_reviewers": "0;1;1;1", "reply_authors": "5;2;3;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 
3.0, 0.0 ], "wc_summary_paper_avg": [ 92.0, 31.63068130786942 ], "wc_summary_review_avg": [ 31.75, 12.275483697190918 ], "wc_main_review_avg": [ 295.5, 168.92823920233113 ], "wc_review_avg": [ 419.25, 157.0610948007176 ], "wc_reply_reviewers_avg": [ 47.0, 45.17189391646093 ], "wc_reply_authors_avg": [ 1431.5, 963.1174642794097 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 1.0897247358851685 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 354, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10071706504793887642&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=PilZY3omXV2", "email": "smu.edu.sg;;salesforce.com;smu.edu.sg;smu.edu.sg", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Singapore Management University;Salesforce", "aff_unique_dep": ";", "aff_unique_url": "https://www.smu.edu.sg;https://www.salesforce.com", "aff_unique_abbr": "SMU;Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Singapore;United States" }, { "id": "PlFtf_pnkZu", "title": "Examining Scaling and Transfer of Language Model Architectures for Machine Translation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Natural language understanding and generation models follow one of the two dominant architectural paradigms: language models (LMs) that process concatenated sequences in a single stack of layers, and encoder-decoder models (EncDec) that utilize separate layer stacks for input and output processing. In machine translation, EncDec has long been the favoured approach, but few studies have investigated the performance of LMs. In this work, we thoroughly examine the role of several architectural design choices on the performance of LMs on bilingual, (massively) multilingual and zero-shot translation tasks, under systematic variations of data conditions and model sizes. 
Our results show that: (i) Different LMs have different scaling properties, where architectural differences often have a significant impact on model performance at small scales, but the performance gap narrows as the number of parameters increases, (ii) Several design choices, including causal masking and language-modeling objectives for the source sequence, have detrimental effects on translation quality, and (iii) When paired with full-visible masking for source sequences, LMs could perform on par with EncDec on supervised bilingual and multilingual translation tasks, but improve greatly on zero-shot directions by facilitating the reduction of off-target translations.", "keywords": "language modeling;machine translation;prefixlm;causallm;model scaling;zero-shot transfer", "primary_area": "", "supplementary_material": "", "author": "Biao Zhang;Behrooz Ghorbani;Ankur Bapna;Yong Cheng;Xavier Garcia;Jonathan Shen;Orhan Firat", "authorids": "~Biao_Zhang2;~Behrooz_Ghorbani1;~Ankur_Bapna1;~Yong_Cheng3;~Xavier_Garcia1;~Jonathan_Shen1;~Orhan_Firat1", "gender": "M;;M;M;M;M;M", "homepage": ";;;;;;", "dblp": "https://dblp.uni-trier.de/pers/hd/z/Zhang_0002:Biao;162/0166;200/8008;34/6276.html;192/1539;120/2225;", "google_scholar": "gqPKjaIAAAAJ;;6hK9IZoAAAAJ;rZ0mlMYAAAAJ;yDonAm4AAAAJ;https://scholar.google.com.tr/citations?user=dLaR9lgAAAAJ;Y2Hio6MAAAAJ", "orcid": ";;;;;;", "linkedin": ";;ankur-bapna-053b1269;;jonathanasdf/;;", "or_profile": "~Biao_Zhang2;~Behrooz_Ghorbani1;~Ankur_Bapna1;~Yong_Cheng3;~Jonathan_Shen1;~Orhan_Firat1;~Xavier_Garcia-rojas1", "aff": "University of Edinburgh;Google;Google;Google;Google;Google;", "aff_domain": "ed.ac.uk;google.com;google.com;google.com;google.com;google.com;", "position": "PhD student;Researcher;Software Engineer;Researcher;Employee;Research Scientist;", "bibtex": "@misc{\nzhang2022examining,\ntitle={Examining Scaling and Transfer of Language Model Architectures for Machine Translation},\nauthor={Biao Zhang and Behrooz Ghorbani and Ankur Bapna and Yong Cheng and Xavier Garcia and Jonathan Shen and Orhan Firat},\nyear={2022},\nurl={https://openreview.net/forum?id=PlFtf_pnkZu}\n}", "github": "", "project": "", "reviewers": "tsgN;AF5b;vGrd;aP6J", "site": "https://openreview.net/forum?id=PlFtf_pnkZu", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "53;68;138;142", "wc_summary_review": "34;79;32;96", "wc_main_review": "184;207;195;459", "wc_review": "271;354;365;697", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "912;790;276;614", "reply_reviewers": "0;0;0;0", "reply_authors": "3;3;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.25, 40.12714168739159 ], "wc_summary_review_avg": [ 60.25, 27.913930214142187 ], "wc_main_review_avg": [ 261.25, 114.46041892287482 ], "wc_review_avg": [ 421.75, 163.0174453854556 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 648.0, 239.47860029656096 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": -0.9428090415820632, "gs_citation": 21, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=13104352547978388815&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "University of Edinburgh;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.ed.ac.uk;https://www.google.com", "aff_unique_abbr": "Edinburgh;Google", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Pseudo Numerical Methods for Diffusion Models on Manifolds", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6066", "id": "PlKWVd2yBkY", "poster": "", "openreview": "https://openreview.net/forum?id=PlKWVd2yBkY", "slides": "https://iclr.cc/virtual/2022/poster/6066", "video": "https://iclr.cc/virtual/2022/poster/6066", "author_site": "Luping Liu, Yi Ren, Zhijie Lin, Zhou Zhao", "tldr": "", "abstract": "Denoising Diffusion Probabilistic Models (DDPMs) can generate high-quality samples such as image and audio samples. However, DDPMs require hundreds to thousands of iterations to produce a sample. Several prior works have successfully accelerated DDPMs through adjusting the variance schedule (e.g., Improved Denoising Diffusion Probabilistic Models) or the denoising equation (e.g., Denoising Diffusion Implicit Models (DDIMs)). However, these acceleration methods cannot maintain the quality of samples and even introduce new noise at high speedup rate, which limit their practicability. To accelerate the inference process while keeping the sample quality, we provide a new perspective that DDPMs should be treated as solving differential equations on manifolds. Under such a perspective, we propose pseudo numerical methods for diffusion models (PNDMs). Specifically, we figure out how to solve differential equations on manifolds and show that DDIMs are simple cases of pseudo numerical methods. We change several classical numerical methods to corresponding pseudo numerical methods and find that pseudo linear multi-step method is the best method in most situations. 
According to our experiments, by directly using pre-trained models on Cifar10, CelebA and LSUN, PNDMs can generate higher quality synthetic images with only 50 steps compared with 1000-step DDIMs (20x speedup), significantly outperform DDIMs with 250 steps (by around 0.4 in FID) and have good generalization on different variance schedules.", "keywords": "diffusion model;generative model;numerical method;manifold", "primary_area": "", "supplementary_material": "/attachment/99d3a44be1ad5b9430c9d90c9802de6867a40279.zip", "author": "Luping Liu;Yi Ren;Zhijie Lin;Zhou Zhao", "authorids": "~Luping_Liu2;~Yi_Ren2;~Zhijie_Lin1;~Zhou_Zhao2", "gender": ";M;M;M", "homepage": ";https://rayeren.github.io/;;https://dblp.uni-trier.de/pid/75/7785.html?", "dblp": ";75/6568-6;;75/7785", "google_scholar": ";4FA6C0AAAAAJ;xXMj6_EAAAAJ;https://scholar.google.com.hk/citations?user=IIoFY90AAAAJ", "orcid": ";;0000-0003-3461-8952;0000-0001-6121-0384", "linkedin": ";;;", "or_profile": "~Luping_Liu2;~Yi_Ren2;~Zhijie_Lin1;~Zhou_Zhao2", "aff": ";Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": ";zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": ";MS student;MS student;Associate Professor", "bibtex": "@inproceedings{\nliu2022pseudo,\ntitle={Pseudo Numerical Methods for Diffusion Models on Manifolds},\nauthor={Luping Liu and Yi Ren and Zhijie Lin and Zhou Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PlKWVd2yBkY}\n}", "github": "", "project": "", "reviewers": "c9bY;jhKu;HRpL;urhS", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;2;2;3", "correctness": "3;3;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "4;3;3;4", "wc_summary_paper": "121;48;28;97", "wc_summary_review": "26;26;29;61", "wc_main_review": "696;204;134;356", "wc_review": "843;278;191;514", "wc_reply_reviewers": "258;0;0;0", "wc_reply_authors": "1438;596;372;360", "reply_reviewers": "1;0;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 73.5, 37.178622890042604 ], "wc_summary_review_avg": [ 35.5, 14.773286702694158 ], "wc_main_review_avg": [ 347.5, 216.62121318098096 ], "wc_review_avg": [ 456.5, 252.50792066784757 ], "wc_reply_reviewers_avg": [ 64.5, 111.71727708819259 ], "wc_reply_authors_avg": [ 691.5, 441.1221486164575 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 656, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13911281549093893446&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=PlKWVd2yBkY", "email": ";zju.edu.cn;zju.edu.cn;zju.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "PnraKzlFvp", "title": "Automated hypothesis generation via Evolutionary Abduction", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Abduction is a 
powerful form of causal inference employed in many artificial intelligence tasks, such as medical diagnosis, criminology, root cause analysis, intent recognition. Given an effect, abductive reasoning allows one to advance a plausible set of explanatory hypotheses for its causes. This paper presents a new evolutionary strategy - called Evolutionary Abduction (EVA) - for automated abductive inference, aiming at effectively generating sets of hypotheses for explaining an effect that has occurred and/or predicting an effect that could occur in the future. EVA defines a set of abductive operators to repeatedly construct hypothetical cause-effect instances, and then automatically assesses their plausibility as well as their novelty with respect to already known instances - a mechanism mimicking the human reasoning employed whenever we need to select the best candidates from a set of hypotheses. Experiments with four datasets confirm that, given background knowledge, EVA can construct new and realistic multiple-cause hypotheses for a given effect. EVA outperforms alternative strategies based on causal structure discovery, generating closer-to-real instances in most settings and datasets.", "keywords": "Causal inference;abduction;evolutionary computation;learning", "primary_area": "", "supplementary_material": "/attachment/88a0eda9a8907328b2e0b4ab4575a201ca1f1e6a.zip", "author": "Roberto Pietrantuono", "authorids": "~Roberto_Pietrantuono1", "gender": "M", "homepage": "http://wpage.unina.it/roberto.pietrantuono/", "dblp": "", "google_scholar": "Htert2EAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Roberto_Pietrantuono1", "aff": "University of Naples Federico II", "aff_domain": "unina.it", "position": "Assistant Professor", "bibtex": "@misc{\npietrantuono2022automated,\ntitle={Automated hypothesis generation via Evolutionary Abduction},\nauthor={Roberto Pietrantuono},\nyear={2022},\nurl={https://openreview.net/forum?id=PnraKzlFvp}\n}", "github": "", "project": "", "reviewers": "EnPo;Zhd8;GANF;4TpQ", "site": "https://openreview.net/forum?id=PnraKzlFvp", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;4;4;4", "correctness": "3;3;2;2", "technical_novelty": "2;2;3;1", "empirical_novelty": "3;2;2;1", "wc_summary_paper": "49;29;70;45", "wc_summary_review": "48;31;26;48", "wc_main_review": "673;341;817;327", "wc_review": "770;401;913;420", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 48.25, 14.618053906043718 ], "wc_summary_review_avg": [ 38.25, 9.908960591303208 ], "wc_main_review_avg": [ 539.5, 211.77051258378725 ], "wc_review_avg": [ 626.0, 221.45315531732666 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oZ0P-C7ncqMJ:scholar.google.com/&scioq=Automated+hypothesis+generation+via+Evolutionary+Abduction&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Naples Federico II", "aff_unique_dep": "", "aff_unique_url": 
"https://www.unina.it", "aff_unique_abbr": "UNINA", "aff_country_unique_index": "0", "aff_country_unique": "Italy" }, { "id": "Pobz_8y2Q2_", "title": "BANANA: a Benchmark for the Assessment of Neural Architectures for Nucleic Acids", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning has always played an important role in bioinformatics and recent applications of deep learning have allowed solving a new spectrum of biologically relevant tasks.\nHowever, there is still a gap between the ``mainstream'' AI and the bioinformatics communities. This is partially due to the format of bioinformatics data, which are typically difficult to process and adapt to machine learning tasks without deep domain knowledge.\nMoreover, the lack of standardized evaluation methods makes it difficult to rigorously compare different models and assess their true performance.\nTo help to bridge this gap, and inspired by work such as SuperGLUE and TAPE, we present BANANA, a benchmark consisting of six supervised classification tasks designed to assess language model performance in the DNA and RNA domains. The tasks are defined over three genomics and one transcriptomics languages (human DNA, bacterial 16S gene, nematoda ITS2 gene, human mRNA) and measure a model's ability to perform whole-sequence classification in a variety of setups.\nEach task was built from readily available data and is presented in a ready-to-use format, with defined labels, splits, and evaluation metrics.\nWe use BANANA to test state-of-the-art NLP architectures, such as Transformer-based models, observing that, in general, self-supervised pretraining without external corpora is beneficial in every task.", "keywords": "bioinformatics;language modeling;natural language processing;dataset;benchmark;dna;rna", "primary_area": "", "supplementary_material": "", "author": "Luca Salvatore Lorello;Andrea Galassi;Paolo Torroni", "authorids": "~Luca_Salvatore_Lorello1;~Andrea_Galassi1;~Paolo_Torroni1", "gender": "M;M;M", "homepage": "https://github.com/HashakGik;http://ai.unibo.it/people/A.Galassi;https://www.unibo.it/sitoweb/p.torroni/en", "dblp": ";208/4245.html;t/PaoloTorroni", "google_scholar": "https://scholar.google.com/citations?hl=it;https://scholar.google.it/citations?user=OnzdCscAAAAJ;uOZZjwsAAAAJ", "orcid": ";0000-0001-9711-7042;0000-0002-9253-8638", "linkedin": ";a-galassi/;paolotorroni/", "or_profile": "~Luca_Salvatore_Lorello1;~Andrea_Galassi1;~Paolo_Torroni1", "aff": "University of Pisa;University of Bologna;University of Bologna", "aff_domain": "unipi.it;unibo.it;unibo.it", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@misc{\nlorello2022banana,\ntitle={{BANANA}: a Benchmark for the Assessment of Neural Architectures for Nucleic Acids},\nauthor={Luca Salvatore Lorello and Andrea Galassi and Paolo Torroni},\nyear={2022},\nurl={https://openreview.net/forum?id=Pobz_8y2Q2_}\n}", "github": "", "project": "", "reviewers": "QrMg;dxXz;xYqo;Bhcu;wY3Y", "site": "https://openreview.net/forum?id=Pobz_8y2Q2_", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;4;3;4;4", "correctness": "2;2;3;2;3", "technical_novelty": "2;1;2;1;1", "empirical_novelty": "2;1;3;3;3", "wc_summary_paper": "23;187;78;74;65", "wc_summary_review": "32;168;96;66;50", "wc_main_review": "361;704;581;750;389", "wc_review": "416;1059;755;890;504", "wc_reply_reviewers": "0;0;0;438;125", "wc_reply_authors": "442;351;608;421;788", "reply_reviewers": "0;0;0;1;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.2, 
0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 1.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 85.4, 54.437487083810176 ], "wc_summary_review_avg": [ 82.4, 47.68899244060415 ], "wc_main_review_avg": [ 557.0, 158.79168743986568 ], "wc_review_avg": [ 724.8, 238.32868060726557 ], "wc_reply_reviewers_avg": [ 112.6, 169.74993372605482 ], "wc_reply_authors_avg": [ 522.0, 157.47634743033635 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12082205500336818247&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Pisa;University of Bologna", "aff_unique_dep": ";", "aff_unique_url": "https://www.unipi.it;https://www.unibo.it", "aff_unique_abbr": "UNIP;Unibo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "id": "Ps_m_Uwcu-E", "title": "Layer-wise Adaptive Model Aggregation for Scalable Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In Federated Learning, a common approach for aggregating local models across clients is periodic averaging of the full model parameters. It is, however, known that different layers of neural networks can have a different degree of model discrepancy across the clients. The conventional full aggregation scheme does not consider such a difference and synchronizes all model parameters at once, resulting in inefficient network bandwidth consumption. Aggregating the parameters that are similar across the clients does not make meaningful training progress but increases the communication cost. We propose FedLAMA, a layer-wise model aggregation scheme for scalable Federated Learning. FedLAMA adaptively adjusts the aggregation interval in a layer-wise manner, jointly considering the model discrepancy and the communication cost. The layer-wise aggregation method enables fine-grained control of the aggregation interval, relaxing the aggregation frequency without a significant impact on the model accuracy. Our empirical study shows that FedLAMA reduces the communication cost by up to $60\\%$ for IID data and $70\\%$ for non-IID data while achieving a comparable accuracy to FedAvg. 
", "keywords": "federated learning;model aggregation;neural network", "primary_area": "", "supplementary_material": "/attachment/544a47b79c20e71845aafcf9a03c2a0d606c373b.zip", "author": "Sunwoo Lee;Tuo Zhang;Chaoyang He;Salman Avestimehr", "authorids": "~Sunwoo_Lee1;~Tuo_Zhang2;~Chaoyang_He1;~Salman_Avestimehr1", "gender": "M;M;M;", "homepage": "https://sites.google.com/view/sunwoolee;;http://chaoyanghe.com;", "dblp": "56/7811-1;;222/6721-1.html;", "google_scholar": "WA9KNNcAAAAJ;Rki45F4AAAAJ;2z2camUAAAAJ;", "orcid": "0000-0001-6334-3068;;;", "linkedin": "sunwoo-lee-90a7308a;tuo-zhang-ultraz/;;", "or_profile": "~Sunwoo_Lee1;~Tuo_Zhang2;~Chaoyang_He1;~Salman_Avestimehr1", "aff": "University of Southern California;University of Southern California;University of Southern California;", "aff_domain": "usc.edu;usc.edu;usc.edu;", "position": "Postdoc;PhD student;PhD student;", "bibtex": "@misc{\nlee2022layerwise,\ntitle={Layer-wise Adaptive Model Aggregation for Scalable Federated Learning},\nauthor={Sunwoo Lee and Tuo Zhang and Chaoyang He and Salman Avestimehr},\nyear={2022},\nurl={https://openreview.net/forum?id=Ps_m_Uwcu-E}\n}", "github": "", "project": "", "reviewers": "9DWJ;YGZf;7ks6;graH", "site": "https://openreview.net/forum?id=Ps_m_Uwcu-E", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "5;5;4;3", "correctness": "3;1;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "82;38;31;45", "wc_summary_review": "34;35;17;54", "wc_main_review": "179;312;100;179", "wc_review": "295;385;148;278", "wc_reply_reviewers": "0;107;17;0", "wc_reply_authors": "402;691;517;169", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.0, 19.685019685029527 ], "wc_summary_review_avg": [ 35.0, 13.095800853708795 ], "wc_main_review_avg": [ 192.5, 76.15937237136346 ], "wc_review_avg": [ 276.5, 84.60053191322145 ], "wc_reply_reviewers_avg": [ 31.0, 44.42409256248236 ], "wc_reply_authors_avg": [ 444.75, 189.55523601314738 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6899854277302127585&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Maximum Entropy RL (Provably) Solves Some Robust RL Problems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6782", "id": "PtSAD3caaA2", "poster": "", "openreview": "https://openreview.net/forum?id=PtSAD3caaA2", "slides": "https://iclr.cc/virtual/2022/poster/6782", "video": "https://iclr.cc/virtual/2022/poster/6782", "author_site": "Benjamin Eysenbach, Sergey Levine", "tldr": "", "abstract": "Many potential applications of reinforcement learning (RL) require guarantees that the agent will perform well 
in the face of disturbances to the dynamics or reward function. In this paper, we prove theoretically that maximum entropy (MaxEnt) RL maximizes a lower bound on a robust RL objective, and thus can be used to learn policies that are robust to some disturbances in the dynamics and the reward function. While this capability of MaxEnt RL has been observed empirically in prior work, to the best of our knowledge our work provides the first rigorous proof and theoretical characterization of the MaxEnt RL robust set. While a number of prior robust RL algorithms have been designed to handle similar disturbances to the reward function or dynamics, these methods typically require additional moving parts and hyperparameters on top of a base RL algorithm. In contrast, our results suggest that MaxEnt RL by itself is robust to certain disturbances, without requiring any additional modifications. While this does not imply that MaxEnt RL is the best available robust RL method, MaxEnt RL is a simple robust RL method with appealing formal guarantees.", "keywords": "reinforcement learning;robustness;maximum entropy", "primary_area": "", "supplementary_material": "", "author": "Benjamin Eysenbach;Sergey Levine", "authorids": "~Benjamin_Eysenbach1;~Sergey_Levine1", "gender": "M;M", "homepage": "https://ben-eysenbach.github.io/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "192/1863;80/7594", "google_scholar": "DRnOvU8AAAAJ;8R35rCwAAAAJ", "orcid": "0009-0000-7136-6307;", "linkedin": "benjamin-eysenbach-a7235775/;", "or_profile": "~Benjamin_Eysenbach1;~Sergey_Levine1", "aff": "Carnegie Mellon University;Google", "aff_domain": "cmu.edu;google.com", "position": "PhD student;Research Scientist", "bibtex": "@inproceedings{\neysenbach2022maximum,\ntitle={Maximum Entropy {RL} (Provably) Solves Some Robust {RL} Problems},\nauthor={Benjamin Eysenbach and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PtSAD3caaA2}\n}", "github": "", "project": "", "reviewers": "bxqc;r4e7;qrWz;VSYD", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;5;3;3", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "81;45;23;28", "wc_summary_review": "83;7;25;16", "wc_main_review": "436;222;231;334", "wc_review": "600;274;279;378", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 44.25, 22.730761095924613 ], "wc_summary_review_avg": [ 32.75, 29.701641368786337 ], "wc_main_review_avg": [ 305.75, 87.12742105674883 ], "wc_review_avg": [ 382.75, 132.10862008211274 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.3244428422615251, "gs_citation": 228, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6063415524085955764&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=PtSAD3caaA2", "email": "cmu.edu;google.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": 
"Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "PtuQ8bk9xF5", "title": "Learning to Act with Affordance-Aware Multimodal Neural SLAM", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent years have witnessed an emerging paradigm shift toward embodied artificial intelligence, in which an agent must learn to solve challenging tasks by interacting with its environment. There are several challenges in solving embodied multimodal tasks, including long-horizon planning, vision-and-language grounding, and efficient exploration. We focus on a critical bottleneck, namely the performance of planning and navigation. To tackle this challenge, we propose a Neural SLAM approach that, for the first time, utilizes several modalities for exploration, predicts an affordance-aware semantic map, and plans over it at the same time. This significantly improves exploration efficiency, leads to robust long-horizon planning, and enables effective vision-and-language grounding. With the proposed Affordance-aware Multimodal Neural SLAM (AMSLAM) approach, we obtain more than 40% improvement over prior published work on the ALFRED benchmark and set a new state-of-the-art generalization performance at a success rate of 23.48% on the test unseen scenes.", "keywords": "Language-Guided Task Completion;Multimodal learning;Neural SLAM.", "primary_area": "", "supplementary_material": "", "author": "Zhiwei Jia;Kaixiang Lin;Yizhou Zhao;Qiaozi Gao;Govind Thattai;Gaurav S. Sukhatme", "authorids": "~Zhiwei_Jia1;~Kaixiang_Lin1;~Yizhou_Zhao1;~Qiaozi_Gao1;~Govind_Thattai1;~Gaurav_S._Sukhatme1", "gender": "M;;M;M;M;M", "homepage": "https://www.zjia.xyz/;http://kaixianglin.github.io;;;;http://www-robotics.usc.edu/~gaurav/", "dblp": ";;253/7894;173/1986;279/2880;s/GauravSSukhatme", "google_scholar": "nQhMGqAAAAAJ;egq785sAAAAJ;l1h5kY8AAAAJ;Ub3LlsgAAAAJ;ZiagaFYAAAAJ;https://scholar.google.com.tw/citations?user=lRUi-A8AAAAJ", "orcid": ";;;;;0000-0003-2408-474X", "linkedin": "zhiweijia;;;;govind-thattai-aaa5326/;gaurav-sukhatme-9b6420b/", "or_profile": "~Zhiwei_Jia1;~Kaixiang_Lin1;~Yizhou_Zhao1;~Qiaozi_Gao1;~Govind_Thattai1;~Gaurav_S._Sukhatme1", "aff": "University of California, San Diego;Amazon;University of California, Los Angeles;Amazon;Amazon;University of Southern California", "aff_domain": "ucsd.edu;amazon.com;ucla.edu;amazon.com;amazon.com;usc.edu", "position": "PhD student;Applied Scientist;PhD student;Scientist;Principal Scientist;Full Professor", "bibtex": "@misc{\njia2022learning,\ntitle={Learning to Act with Affordance-Aware Multimodal Neural {SLAM}},\nauthor={Zhiwei Jia and Kaixiang Lin and Yizhou Zhao and Qiaozi Gao and Govind Thattai and Gaurav S. 
Sukhatme},\nyear={2022},\nurl={https://openreview.net/forum?id=PtuQ8bk9xF5}\n}", "github": "", "project": "", "reviewers": "2ghY;kMKr;pomV", "site": "https://openreview.net/forum?id=PtuQ8bk9xF5", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;3", "correctness": "2;4;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "50;63;90", "wc_summary_review": "60;62;77", "wc_main_review": "586;238;480", "wc_review": "696;363;647", "wc_reply_reviewers": "0;63;147", "wc_reply_authors": "1370;351;498", "reply_reviewers": "0;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.66666666666667, 16.659998666133067 ], "wc_summary_review_avg": [ 66.33333333333333, 7.586537784494029 ], "wc_main_review_avg": [ 434.6666666666667, 145.64187432038753 ], "wc_review_avg": [ 568.6666666666666, 146.7976687674418 ], "wc_reply_reviewers_avg": [ 70.0, 60.21627686929839 ], "wc_reply_authors_avg": [ 739.6666666666666, 449.73498369101287 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13358599234793030099&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;1;1;3", "aff_unique_norm": "University of California, San Diego;Amazon;University of California, Los Angeles;University of Southern California", "aff_unique_dep": ";Amazon.com, Inc.;;", "aff_unique_url": "https://www.ucsd.edu;https://www.amazon.com;https://www.ucla.edu;https://www.usc.edu", "aff_unique_abbr": "UCSD;Amazon;UCLA;USC", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "San Diego;;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Py8WbvKH_wv", "title": "DRIBO: Robust Deep Reinforcement Learning via Multi-View Information Bottleneck", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep reinforcement learning (DRL) agents are often sensitive to visual changes that were unseen in their training environments. To address this problem, we leverage the sequential nature of RL to learn robust representations that encode only task-relevant information from observations based on the unsupervised multi-view setting. Specifically, we introduce a novel contrastive version of Multi-View Information Bottleneck (MIB) objective for temporal data. We train RL agents from pixels with this auxiliary objective to learn robust representations that can compress away task-irrelevant information and are predictive of task-relevant dynamics. This approach enables us to train high-performance policies that are robust to visual distractions and can generalize well to unseen environments. We demonstrate that our approach can achieve SOTA performance on diverse visual control tasks on the DeepMind Control Suite when the background is replaced with natural videos. 
In addition, we show that our approach outperforms well-established baselines for generalization to unseen environments on the Procgen benchmark.", "keywords": "Representation Learning;Deep Reinforcement Learning;Information Bottleneck", "primary_area": "", "supplementary_material": "/attachment/71e8061e64e204e6765772dfe57fe1f2b0216c79.zip", "author": "Jiameng Fan;Wenchao Li", "authorids": "~Jiameng_Fan1;~Wenchao_Li1", "gender": "M;", "homepage": "https://www.jiamengf.com;http://sites.bu.edu/depend/", "dblp": "196/7836.html;23/5721-1", "google_scholar": "https://scholar.google.com/citations?hl=en;zwA5eokAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jiameng_Fan1;~Wenchao_Li1", "aff": "Boston University;Boston University", "aff_domain": "bu.edu;bu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nfan2022dribo,\ntitle={{DRIBO}: Robust Deep Reinforcement Learning via Multi-View Information Bottleneck},\nauthor={Jiameng Fan and Wenchao Li},\nyear={2022},\nurl={https://openreview.net/forum?id=Py8WbvKH_wv}\n}", "github": "", "project": "", "reviewers": "kUtq;LDpb;uNzG;m1E2", "site": "https://openreview.net/forum?id=Py8WbvKH_wv", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "86;54;59;72", "wc_summary_review": "197;26;79;67", "wc_main_review": "439;171;256;259", "wc_review": "722;251;394;398", "wc_reply_reviewers": "71;0;60;180", "wc_reply_authors": "833;251;611;546", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 67.75, 12.417225938187643 ], "wc_summary_review_avg": [ 92.25, 63.58999528227691 ], "wc_main_review_avg": [ 281.25, 97.68923942789195 ], "wc_review_avg": [ 441.25, 172.5679214106724 ], "wc_reply_reviewers_avg": [ 77.75, 64.92447535406043 ], "wc_reply_authors_avg": [ 560.25, 207.8501563627028 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17795910493641193453&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "PyBp6nFfzuj", "title": "UNCERTAINTY QUANTIFICATION USING VARIATIONAL INFERENCE FOR BIOMEDICAL IMAGE SEGMENTATION", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning motivated by convolutional neural networks has been highly successful in a range of medical imaging problems like image classification, image segmentation, image synthesis etc. However, for validation and interpretability, not only do we need the predictions made by the model but also how confident it is while making those predictions. This is important in safety critical applications for the people to accept it. 
In this work, we used an encoder-decoder architecture based on variational inference techniques for segmenting brain tumour images. We evaluate our work on the publicly available BRATS dataset using Dice Similarity Coefficient (DSC) and Intersection Over Union (IOU) as the evaluation metrics. Our model is able to segment brain tumours while taking into account both aleatoric uncertainty and epistemic uncertainty in a principled Bayesian manner.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhinav Sagar", "authorids": "~Abhinav_Sagar1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "5ntkLcgAAAAJ", "orcid": "", "linkedin": "https://linkedin.com/in/abhinavsagar4", "or_profile": "~Abhinav_Sagar1", "aff": "University of Maryland, College Park", "aff_domain": "umd.edu", "position": "MS student", "bibtex": "@misc{\nsagar2022uncertainty,\ntitle={{UNCERTAINTY} {QUANTIFICATION} {USING} {VARIATIONAL} {INFERENCE} {FOR} {BIOMEDICAL} {IMAGE} {SEGMENTATION}},\nauthor={Abhinav Sagar},\nyear={2022},\nurl={https://openreview.net/forum?id=PyBp6nFfzuj}\n}", "github": "", "project": "", "reviewers": "MfyG;5veL;UXjb;YKhZ", "site": "https://openreview.net/forum?id=PyBp6nFfzuj", "pdf_size": 0, "recommendation": "1;1;1;1", "confidence": "5;4;5;3", "correctness": "4;2;1;3", "technical_novelty": "1;1;1;1", "empirical_novelty": "1;1;1;2", "wc_summary_paper": "16;53;42;51", "wc_summary_review": "8;16;53;33", "wc_main_review": "145;165;53;481", "wc_review": "169;234;148;565", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 1.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 40.5, 14.739402972983676 ], "wc_summary_review_avg": [ 27.5, 17.269916039170543 ], "wc_main_review_avg": [ 211.0, 161.50541786577935 ], "wc_review_avg": [ 279.0, 168.13833590231587 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2812179187437874546&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www.umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "GeoDiff: A Geometric Diffusion Model for Molecular Conformation Generation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7028", "id": "PzcvxEMzvQC", "poster": "", "openreview": "https://openreview.net/forum?id=PzcvxEMzvQC", "slides": "https://iclr.cc/virtual/2022/poster/7028", "video": "https://iclr.cc/virtual/2022/poster/7028", "author_site": "Minkai Xu, Lantao Yu, Yang Song, Chence Shi, Stefano Ermon, Jian Tang", "tldr": "", "abstract": "Predicting molecular conformations from molecular graphs is a fundamental problem in cheminformatics and drug discovery. Recently, significant progress has been achieved with machine learning approaches, especially with deep generative models. 
Inspired by the diffusion process in classical non-equilibrium thermodynamics where heated particles will diffuse from original states to a noise distribution, in this paper, we propose a novel generative model named GeoDiff for molecular conformation prediction. GeoDiff treats each atom as a particle and learns to directly reverse the diffusion process (i.e., transforming from a noise distribution to stable conformations) as a Markov chain. Modeling such a generation process is however very challenging as the likelihood of conformations should be roto-translational invariant. We theoretically show that Markov chains evolving with equivariant Markov kernels can induce an invariant distribution by design, and further propose building blocks for the Markov kernels to preserve the desirable equivariance property. The whole framework can be efficiently trained in an end-to-end fashion by optimizing a weighted variational lower bound to the (conditional) likelihood. Experiments on multiple benchmarks show that GeoDiff is superior or comparable to existing state-of-the-art approaches, especially on large molecules. ", "keywords": "molecular conformation generation;deep generative models;diffusion probabilistic models", "primary_area": "", "supplementary_material": "/attachment/26791ee1444d8190c207fe60b1cb08e3eded683d.zip", "author": "Minkai Xu;Lantao Yu;Yang Song;Chence Shi;Stefano Ermon;Jian Tang", "authorids": "~Minkai_Xu1;~Lantao_Yu2;~Yang_Song1;~Chence_Shi1;~Stefano_Ermon1;~Jian_Tang1", "gender": "M;M;M;M;M;", "homepage": "https://minkaixu.com;http://lantaoyu.com/;https://yang-song.net;https://chenceshi.com/;http://cs.stanford.edu/~ermon/;http://www.jian-tang.com", "dblp": "257/3355;186/7892;;228/8293;47/8135;181/2667-5", "google_scholar": "https://scholar.google.com/citations?hl=en;Ixg9n-EAAAAJ;o_J2CroAAAAJ;0Um1Kz0AAAAJ;;https://scholar.google.ca/citations?user=1ir6WUEAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Minkai_Xu1;~Lantao_Yu2;~Yang_Song1;~Chence_Shi1;~Stefano_Ermon1;~Jian_Tang1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Computer Science Department, Stanford University;Stanford University;University of Montreal;Stanford University;Mila, HEC Montreal", "aff_domain": "mila.umontreal.ca;stanford.edu;stanford.edu;umontreal.ca;stanford.edu;hec.ca", "position": "MS student;PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2022geodiff,\ntitle={GeoDiff: A Geometric Diffusion Model for Molecular Conformation Generation},\nauthor={Minkai Xu and Lantao Yu and Yang Song and Chence Shi and Stefano Ermon and Jian Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=PzcvxEMzvQC}\n}", "github": "", "project": "", "reviewers": "s7Z3;PLjF;sVuD", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;3;5", "correctness": "3;4;3", "technical_novelty": "3;4;4", "empirical_novelty": "3;4;0", "wc_summary_paper": "100;287;137", "wc_summary_review": "36;22;50", "wc_main_review": "289;223;278", "wc_review": "425;532;465", "wc_reply_reviewers": "0;0;27", "wc_reply_authors": "754;673;523", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 
], "empirical_novelty_avg": [ 2.3333333333333335, 1.699673171197595 ], "wc_summary_paper_avg": [ 174.66666666666666, 80.85515169459245 ], "wc_summary_review_avg": [ 36.0, 11.430952132988164 ], "wc_main_review_avg": [ 263.3333333333333, 28.871362204709975 ], "wc_review_avg": [ 474.0, 44.14370472294625 ], "wc_reply_reviewers_avg": [ 9.0, 12.727922061357855 ], "wc_reply_authors_avg": [ 650.0, 95.69743988216194 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 645, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4830391195637525286&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=PzcvxEMzvQC", "email": "mila.umontreal.ca;stanford.edu;stanford.edu;umontreal.ca;stanford.edu;hec.ca", "author_num": 6, "aff_unique_index": "0;1;1;0;1;2", "aff_unique_norm": "University of Montreal;Stanford University;HEC Montreal", "aff_unique_dep": "Montreal Institute for Learning Algorithms;Computer Science Department;HEC Business School", "aff_unique_url": "https://www.mila.quebec;https://www.stanford.edu;https://www.hec.ca", "aff_unique_abbr": "MILA;Stanford;HEC", "aff_campus_unique_index": "0;1;1;1;0", "aff_campus_unique": "Montreal;Stanford;", "aff_country_unique_index": "0;1;1;0;1;0", "aff_country_unique": "Canada;United States" }, { "id": "Q0n61rV89bi", "title": "LEARNING PHONEME-LEVEL DISCRETE SPEECH REPRESENTATION WITH WORD-LEVEL SUPERVISION", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Phonemes are defined by their relationship to words: changing a phoneme changes the word. Learning a phoneme inventory with little supervision has been a long-standing challenge with important applications to under-resourced speech technology. In this paper, we bridge the gap between the linguistic and statistical definition of phonemes and propose a novel neural discrete representation learning model for self-supervised learning of phoneme inventory with raw speech and word labels. Under mild assumptions, we prove that the phoneme inventory learned by our approach converges to the true one with an exponentially low error rate. Moreover, in experiments on TIMIT and Mboshi benchmarks, our approach consistently learns better phoneme-level representation than previous state-of-the-art self-supervised representation learning algorithms and remains effective even in a low-resource scenario.", "keywords": "discrete speech representation;self-supervised learning;mutual information", "primary_area": "", "supplementary_material": "", "author": "Liming Wang;Siyuan Feng;Mark A. Hasegawa-Johnson;Chang D. 
Yoo", "authorids": "~Liming_Wang3;s.feng@tudelft.nl;~Mark_A._Hasegawa-Johnson1;~Chang_D._Yoo1", "gender": "M;;M;M", "homepage": "https://lwang114.github.io;;http://speechtechnology.web.illinois.edu;https://sanctusfactory.com/family.php", "dblp": ";;70/3186;31/7819", "google_scholar": "https://scholar.google.com/citations?hl=en;;18O0OAwAAAAJ;gFWgUQEAAAAJ", "orcid": ";;0000-0002-5631-2893;0000-0002-0756-7179", "linkedin": ";;mark-hasegawa-johnson-21a86825/;", "or_profile": "~Liming_Wang3;s.feng@tudelft.nl;~Mark_A._Hasegawa-Johnson1;~Chang_D._Yoo1", "aff": "University of Illinois, Urbana Champaign;;University of Illinois, Urbana Champaign;Korea Advanced Institute of Science & Technology", "aff_domain": "illinois.edu;;illinois.edu;kaist.ac.kr", "position": "PhD student;;Full Professor;Full Professor", "bibtex": "@misc{\nwang2022learning,\ntitle={{LEARNING} {PHONEME}-{LEVEL} {DISCRETE} {SPEECH} {REPRESENTATION} {WITH} {WORD}-{LEVEL} {SUPERVISION}},\nauthor={Liming Wang and Siyuan Feng and Mark A. Hasegawa-Johnson and Chang D. Yoo},\nyear={2022},\nurl={https://openreview.net/forum?id=Q0n61rV89bi}\n}", "github": "", "project": "", "reviewers": "mRVq;R7c7;aX2r;qXAr", "site": "https://openreview.net/forum?id=Q0n61rV89bi", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;4;3;3", "correctness": "4;3;3;3", "technical_novelty": "4;2;2;2", "empirical_novelty": "4;2;2;3", "wc_summary_paper": "70;72;132;102", "wc_summary_review": "67;41;22;95", "wc_main_review": "328;803;302;348", "wc_review": "465;916;456;545", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 94.0, 25.337718918639855 ], "wc_summary_review_avg": [ 56.25, 27.48977082479954 ], "wc_main_review_avg": [ 445.25, 207.18997924610156 ], "wc_review_avg": [ 595.5, 188.25581000330374 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Jhhr38YpSC4J:scholar.google.com/&scioq=LEARNING+PHONEME-LEVEL+DISCRETE+SPEECH+REPRESENTATION+WITH+WORD-LEVEL+SUPERVISION&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.kaist.ac.kr", "aff_unique_abbr": "UIUC;KAIST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;South Korea" }, { "id": "Q1XWSM8ftl", "title": "Learning rate optimization through step sampling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Modern machine learning models require selecting hyper-parameters prior to training; important variables that define the way in which the model can learn, but which cannot be learned by the model itself and instead need to be assigned in advance. 
Of the hyper-parameters that must be selected when configuring a model, arguably the most important is the \u201clearning rate\u201d of the model, the step size that the model uses when learning its parameters with gradient descent. Here we propose a method to deliberately select a learning rate by training a model for a small number of steps over a variety of learning rates and resetting both the model parameters and dataset between each trial. A curve of the log of those rates vs the losses achieved for each is used to select a viable range for an optimal learning rate, and we compare several methods of selecting an optimal point within that range. The performance of the selections from these methods is then evaluated using a full grid search, and in our experiments, they reliably select learning rates that achieve a good accuracy for any given model.", "keywords": "Learning Rate Optimization;Hyper-parameter tuning;LR Search;Training Efficiency", "primary_area": "", "supplementary_material": "", "author": "Brendon Eby;Suhel Jaber", "authorids": "brendon.eby@gmail.com;~Suhel_Jaber1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";suhel-jaber-0a7323114", "or_profile": "brendon.eby@gmail.com;~Suhel_Jaber1", "aff": ";Samsung", "aff_domain": ";samsung.com", "position": ";Researcher", "bibtex": "@misc{\neby2022learning,\ntitle={Learning rate optimization through step sampling},\nauthor={Brendon Eby and Suhel Jaber},\nyear={2022},\nurl={https://openreview.net/forum?id=Q1XWSM8ftl}\n}", "github": "", "project": "", "reviewers": "6xHa;ZAxS;2oX9;KwAu", "site": "https://openreview.net/forum?id=Q1XWSM8ftl", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;4;4;4", "correctness": "2;2;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "61;46;89;54", "wc_summary_review": "47;27;41;41", "wc_main_review": "233;385;470;421", "wc_review": "341;458;600;516", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "93;93;87;123", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 62.5, 16.194134740701646 ], "wc_summary_review_avg": [ 39.0, 7.3484692283495345 ], "wc_main_review_avg": [ 377.25, 88.57870793819473 ], "wc_review_avg": [ 478.75, 94.20025212280486 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 99.0, 14.071247279470288 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jNN2nNQlQ3kJ:scholar.google.com/&scioq=Learning+rate+optimization+through+step+sampling&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung", "aff_unique_url": "https://www.samsung.com", "aff_unique_abbr": "Samsung", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "Q1foAP0IL4x", "title": "Noisy Adversarial Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In image classification, data augmentation and the usage of additional data have been shown to increase the efficiency of clean training and the accuracy of the resulting model. 
However, this does not prevent models from being fooled by adversarial manipulations. To increase the robustness, Adversarial Training (AT) is an easy, yet effective and widely used method to harden neural networks against adversarial inputs. Still, AT is computationally expensive and inefficient in that only one adversarial input is created per sample of the current batch. We propose Noisy Adversarial Training (N-AT), which, for the first time, combines data augmentation in the decision space and adversarial training. By adding random noise to the original adversarial output vector, we create multiple pseudo adversarial instances, thus increasing the data pool for adversarial training. We show that this general idea is applicable to two different learning paradigms, i.e., supervised and self-supervised learning. Using N-AT instead of AT, we can increase the robustness by a relative 1.06\% for small seen attacks. For larger seen attacks, the relative gain in robustness increases up to 89.26\%. When combining a larger corpus of input data with our proposed method, we report an increase in the clean accuracy as well as in the accuracy for all observed attacks, compared to AT. In self-supervised training, we observe a similar increase in robust accuracy for seen attacks and large unseen attacks, when it comes to the downstream task of image classification. In addition, when the pretrained model is finetuned, we also report a relative gain in clean accuracy between 0.5\% and 1.11\%.", "keywords": "Adversarial Training;Adversarial;Adversarial Defence;Decision Space", "primary_area": "", "supplementary_material": "/attachment/7606f7da0fc05af7b4927ad9801e44b6f9081351.zip", "author": "Nils Worzyk", "authorids": "~Nils_Worzyk1", "gender": "M", "homepage": "", "dblp": "172/9071", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Nils_Worzyk1", "aff": "International Computer Science Institute Berkeley", "aff_domain": "icsi.berkeley.edu", "position": "Postdoc", "bibtex": "@misc{\nworzyk2022noisy,\ntitle={Noisy Adversarial Training},\nauthor={Nils Worzyk},\nyear={2022},\nurl={https://openreview.net/forum?id=Q1foAP0IL4x}\n}", "github": "", "project": "", "reviewers": "asXc;vEQM;yyfb;Lfxt", "site": "https://openreview.net/forum?id=Q1foAP0IL4x", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "85;53;37;26", "wc_summary_review": "37;44;48;35", "wc_main_review": "641;550;274;180", "wc_review": "763;647;359;241", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 50.25, 22.24157143728833 ], "wc_summary_review_avg": [ 41.0, 5.244044240850758 ], "wc_main_review_avg": [ 411.25, 189.9675959209886 ], "wc_review_avg": [ 502.5, 210.78128474795858 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0", "aff_unique_norm": 
"International Computer Science Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.icsi.berkeley.edu/", "aff_unique_abbr": "ICSI", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "Q1gackXQrSV", "title": "Language Modulated Detection and Detection Modulated Language Grounding in 2D and 3D Scenes", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "To localize an object referent, humans attend to different locations in the scene and visual cues depending on the utterance. Existing language and vision systems often model such task-driven attention using object proposal bottlenecks: a pre-trained detector proposes objects in the scene, and the model is trained to selectively process those proposals and then predict the answer without attending to the original image. Object detectors are typically trained on a fixed vocabulary of objects and attributes that is often too restrictive for open-domain language grounding, where the language utterance may refer to visual entities in various levels of abstraction, such as a cat, the leg of a cat, or the stain on the front leg of the chair. This paper proposes a model that reconciles language grounding and object detection with two main contributions: i) Architectures that exhibit iterative attention across the language stream, the pixel stream, and object detection proposals.In this way, the model learns to condition on easy-to-detect objects (e.g., \u201ctable\u201d) and language hints (e.g. \u201con the table\u201d) to detect harder objects (e.g., \u201cmugs\u201d)mentioned in the utterance. ii) Optimization objectives that treat object detection as language grounding of a large predefined set of object categories. In this way,cheap object annotations are used to supervise our model, which results in performance improvements over models that are not co-trained across both referential grounding and object detection. 
Our model has a much lighter computational footprint, achieves faster convergence, and shows on-par or higher performance compared to both detection-bottlenecked and non-detection-bottlenecked language-vision models on both 2D and 3D language grounding benchmarks.", "keywords": "Language Grounding;Modulated Object Detection;Attention;Vision and Language", "primary_area": "", "supplementary_material": "/attachment/bfd1d72ea7d235717c770b5080a37fdd7e94d59d.zip", "author": "Ayush Jain;Nikolaos Gkanatsios;Ishita Mediratta;Katerina Fragkiadaki", "authorids": "~Ayush_Jain5;~Nikolaos_Gkanatsios1;~Ishita_Mediratta1;~Katerina_Fragkiadaki1", "gender": "M;M;F;F", "homepage": "https://ayushjain1144.github.io/;https://nickgkan.github.io/;https://ishita.io;https://www.cs.cmu.edu/~katef/", "dblp": ";225/5677;289/7108;21/8780", "google_scholar": "cV-Nm_0AAAAJ;https://scholar.google.gr/citations?user=jk7GqOEAAAAJ;hgaAO6QAAAAJ;FWp7728AAAAJ", "orcid": ";;;", "linkedin": "ayush-jain-010236150/?originalSubdomain=in;;;", "or_profile": "~Ayush_Jain5;~Nikolaos_Gkanatsios1;~Ishita_Mediratta1;~Katerina_Fragkiadaki1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Meta AI;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu;meta.com;cmu.edu", "position": "MS student;Graduate student;Researcher;Assistant Professor", "bibtex": "@misc{\njain2022language,\ntitle={Language Modulated Detection and Detection Modulated Language Grounding in 2D and 3D Scenes},\nauthor={Ayush Jain and Nikolaos Gkanatsios and Ishita Mediratta and Katerina Fragkiadaki},\nyear={2022},\nurl={https://openreview.net/forum?id=Q1gackXQrSV}\n}", "github": "", "project": "", "reviewers": "MPkx;S2gH;axXF;HPdC", "site": "https://openreview.net/forum?id=Q1gackXQrSV", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "55;67;74;133", "wc_summary_review": "65;64;42;84", "wc_main_review": "383;175;158;147", "wc_review": "503;306;274;364", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.25, 30.07802353878991 ], "wc_summary_review_avg": [ 63.75, 14.872373717735847 ], "wc_main_review_avg": [ 215.75, 97.07567924047711 ], "wc_review_avg": [ 361.75, 87.69941561948974 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:otiMQuCA9VkJ:scholar.google.com/&scioq=Language+Modulated+Detection+and+Detection+Modulated+Language+Grounding+in+2D+and+3D+Scenes&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Carnegie Mellon University;Meta", "aff_unique_dep": ";Meta AI", "aff_unique_url": "https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "CMU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Q42O1Qaho5N", 
"title": "$G^3$: Representation Learning and Generation for Geometric Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "A geometric graph is a graph equipped with geometric information (i.e., node coordinates). A notable example is molecular graphs, where the combinatorial bonding is supplement with atomic coordinates that determine the three-dimensional structure. This work proposes a generative model for geometric graphs, capitalizing on the complementary information of structure and geometry to learn the underlying distribution. The proposed model, Geometric Graph Generator (G$^3$), orchestrates graph neural networks and point cloud models in a nontrivial manner under an autoencoding framework. Additionally, we augment this framework with a normalizing flow so that one can effectively sample from the otherwise intractable latent space. G$^3$ can be used in computer-aided drug discovery, where seeking novel and optimal molecular structures is critical. As a representation learning approach, the interaction of the graph structure and the geometric point cloud also improve significantly the performance of downstream tasks, such as molecular property prediction. We conduct a comprehensive set of experiments to demonstrate that G$^3$ learns more accurately the distribution of given molecules and helps identify novel molecules with better properties of interest.", "keywords": "Deep Learning;Generative Models;Graph Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Han Huang;Stefan C Schonsheck;Rongjie Lai;Jie Chen", "authorids": "~Han_Huang1;~Stefan_C_Schonsheck1;~Rongjie_Lai2;~Jie_Chen1", "gender": "M;M;;M", "homepage": ";https://sites.google.com/view/stefancschonsheck/home;https://jiechenjiechen.github.io;https://www.rongjielai.com", "dblp": ";;92/6289-7;", "google_scholar": "ks73Pm8AAAAJ;;Z-lkme8AAAAJ;Wp3DnKUAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Han_Huang1;~Stefan_C_Schonsheck1;~Jie_Chen1;~Rongjie_Lai4", "aff": "Rensselaer Polytechnic Institute;University of California, Davis;International Business Machines;Rensselaer Polytechnic Institute", "aff_domain": "rpi.edu;ucdavis.edu;ibm.com;rpi.edu", "position": "PhD student;Assistant Professor;Research Staff Member;Associate Professor", "bibtex": "@misc{\nhuang2022g,\ntitle={\\$G{\\textasciicircum}3\\$: Representation Learning and Generation for Geometric Graphs},\nauthor={Han Huang and Stefan C Schonsheck and Rongjie Lai and Jie Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=Q42O1Qaho5N}\n}", "github": "", "project": "", "reviewers": "qSkw;rFAq;PjEF;LcmV", "site": "https://openreview.net/forum?id=Q42O1Qaho5N", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "3;2;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "44;109;32;70", "wc_summary_review": "50;91;37;30", "wc_main_review": "246;121;347;76", "wc_review": "340;321;416;176", "wc_reply_reviewers": "0;0;43;12", "wc_reply_authors": "802;669;839;231", "reply_reviewers": "0;0;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.75, 29.51588555337617 ], "wc_summary_review_avg": [ 52.0, 23.632604596192948 ], "wc_main_review_avg": [ 197.5, 106.4389496378088 ], "wc_review_avg": [ 313.25, 
86.8486470821509 ], "wc_reply_reviewers_avg": [ 13.75, 17.583728273605686 ], "wc_reply_authors_avg": [ 635.25, 241.80402705496863 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.7106690545187014, "corr_recommendation_correctness": 0.8164965809277261, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fDD0t4izgAQJ:scholar.google.com/&scioq=%24G%5E3%24:+Representation+Learning+and+Generation+for+Geometric+Graphs&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Rensselaer Polytechnic Institute;University of California, Davis;International Business Machines Corporation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rpi.edu;https://www.ucdavis.edu;https://www.ibm.com", "aff_unique_abbr": "RPI;UC Davis;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Davis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Differentially Private Fine-tuning of Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7161", "id": "Q42f0dfjECO", "poster": "", "openreview": "https://openreview.net/forum?id=Q42f0dfjECO", "slides": "https://iclr.cc/virtual/2022/poster/7161", "video": "https://iclr.cc/virtual/2022/poster/7161", "author_site": "Da Yu, Saurabh Naik, Arturs Backurs, Sivakanth Gopi, Huseyin Inan, Gautam Kamath, Janardhan Kulkarni, Yin Tat Lee, Andre Manoel, Lukas Wutschitz, Sergey Yekhanin, Huishuai Zhang", "tldr": "", "abstract": "We give simpler, sparser, and faster algorithms for differentially private fine-tuning of large-scale pre-trained language models, which achieve the state-of-the-art privacy versus utility tradeoffs on many standard NLP tasks. We propose a meta-framework for this problem, inspired by the recent success of highly parameter-efficient methods for fine-tuning. Our experiments show that differentially private adaptations of these approaches outperform previous private algorithms in three important dimensions: utility, privacy, and the computational and memory cost of private training. On many commonly studied datasets, the utility of private models approaches that of non-private models. For example, on the MNLI dataset we achieve an accuracy of $87.8\\%$ using RoBERTa-Large and $83.5\\%$ using RoBERTa-Base with a privacy budget of $\\epsilon = 6.7$. In comparison, absent privacy constraints, RoBERTa-Large achieves an accuracy of $90.2\\%$. Our findings are similar for natural language generation when privately fine-tuning GPT-2. 
Our experiments also show that larger models are better suited for private fine-tuning: while they are well known to achieve superior accuracy non-privately, we find that they also better maintain their accuracy when privacy is introduced.", "keywords": "differential privacy;large language models;fine-tuning", "primary_area": "", "supplementary_material": "", "author": "Da Yu;Saurabh Naik;Arturs Backurs;Sivakanth Gopi;Huseyin A Inan;Gautam Kamath;Janardhan Kulkarni;Yin Tat Lee;Andre Manoel;Lukas Wutschitz;Sergey Yekhanin;Huishuai Zhang", "authorids": "~Da_Yu1;~Saurabh_Naik1;~Arturs_Backurs1;~Sivakanth_Gopi1;~Huseyin_A_Inan1;~Gautam_Kamath1;~Janardhan_Kulkarni2;~Yin_Tat_Lee1;~Andre_Manoel1;~Lukas_Wutschitz1;~Sergey_Yekhanin1;~Huishuai_Zhang3", "gender": "M;M;;M;;M;M;;M;M;M;", "homepage": ";;http://www.mit.edu/~backurs/;https://aka.ms/sigopi;;http://www.gautamkamath.com/;;;;;https://www.microsoft.com/en-us/research/people/yekhanin/;", "dblp": "48/8545;;74/10669;123/7803.html;;73/11140;54/1978;;122/2913;263/8844;29/1329;", "google_scholar": "FcRGdiwAAAAJ;;UNHdIKoAAAAJ;bYhGFrwAAAAJ;;MK6zHkYAAAAJ;_fxnybwAAAAJ;;LpWGWAwAAAAJ;;4WEQ8h0AAAAJ;", "orcid": ";;;;;;;;0000-0002-5455-0230;0000-0003-4321-6509;;", "linkedin": ";saurabh-naik-9750b0b/;;sivakanthgopi/;;;;;andre-manoel/;;sergey-yekhanin-5242ba192/;", "or_profile": "~Da_Yu1;~Saurabh_Naik1;~Arturs_Backurs1;~Sivakanth_Gopi1;~Huseyin_A_Inan1;~Gautam_Kamath1;~Janardhan_Kulkarni2;~Yin_Tat_Lee1;~Andre_Manoel1;~Lukas_Wutschitz1;~Sergey_Yekhanin1;~Huishuai_Zhang3", "aff": "Microsoft;;Microsoft;Microsoft Research;;University of Waterloo;Microsoft Research, Redmond;;Microsoft;Microsoft;Microsoft;", "aff_domain": "microsoft.com;;microsoft.com;microsoft.com;;uwaterloo.ca;microsoft.com;;microsoft.com;microsoft.com;microsoft.com;", "position": "Research intern;;Researcher;Senior Researcher;;Assistant Professor;Researcher;;Research Engineer;Researcher;Principal Researcher;", "bibtex": "@inproceedings{\nyu2022differentially,\ntitle={Differentially Private Fine-tuning of Language Models},\nauthor={Da Yu and Saurabh Naik and Arturs Backurs and Sivakanth Gopi and Huseyin A Inan and Gautam Kamath and Janardhan Kulkarni and Yin Tat Lee and Andre Manoel and Lukas Wutschitz and Sergey Yekhanin and Huishuai Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Q42f0dfjECO}\n}", "github": "", "project": "", "reviewers": "oFqs;eqgr;hxyL;BZJU", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;5;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "95;72;104;21", "wc_summary_review": "34;49;37;18", "wc_main_review": "281;219;315;88", "wc_review": "410;340;456;127", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "463;297;526;76", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 73.0, 32.21024681681281 ], "wc_summary_review_avg": [ 34.5, 11.05667219374799 ], "wc_main_review_avg": [ 225.75, 86.65845313643672 ], "wc_review_avg": [ 333.25, 126.03843659772998 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 340.5, 174.11849413545937 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 
], "authors#_avg": [ 12, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 403, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12904184554730666861&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Q42f0dfjECO", "email": "microsoft.com;;microsoft.com;microsoft.com;;uwaterloo.ca;microsoft.com;;microsoft.com;microsoft.com;microsoft.com;", "author_num": 12, "aff_unique_index": "0;0;0;1;0;0;0;0", "aff_unique_norm": "Microsoft;University of Waterloo", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://uwaterloo.ca", "aff_unique_abbr": "Microsoft;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Redmond", "aff_country_unique_index": "0;0;0;1;0;0;0;0", "aff_country_unique": "United States;Canada" }, { "title": "AdaMatch: A Unified Approach to Semi-Supervised Learning and Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6340", "id": "Q5uh1Nvv5dm", "poster": "", "openreview": "https://openreview.net/forum?id=Q5uh1Nvv5dm", "slides": "https://iclr.cc/virtual/2022/poster/6340", "video": "https://iclr.cc/virtual/2022/poster/6340", "author_site": "David Berthelot, Rebecca Roelofs, Kihyuk Sohn, Nicholas Carlini, Alexey Kurakin", "tldr": "", "abstract": "We extend semi-supervised learning to the problem of domain adaptation to learn significantly higher-accuracy models that train on one data distribution and test on a different one. With the goal of generality, we introduce AdaMatch, a unified solution for unsupervised domain adaptation (UDA), semi-supervised learning (SSL), and semi-supervised domain adaptation (SSDA). In an extensive experimental study, we compare its behavior with respective state-of-the-art techniques from SSL, SSDA, and UDA and find that AdaMatch either matches or significantly exceeds the state-of-the-art in each case using the same hyper-parameters regardless of the dataset or task. For example, AdaMatch nearly doubles the accuracy compared to that of the prior state-of-the-art on the UDA task for DomainNet and even exceeds the accuracy of the prior state-of-the-art obtained with pre-training by 6.4% when AdaMatch is trained completely from scratch. 
Furthermore, by providing AdaMatch with just one labeled example per class from the target domain (i.e., the SSDA setting), we increase the target accuracy by an additional 6.1%, and with 5 labeled examples, by 13.6%.", "keywords": "unsupervised domain adaptation;semi-supervised learning;semi-supervised domain adaptation", "primary_area": "", "supplementary_material": "", "author": "David Berthelot;Rebecca Roelofs;Kihyuk Sohn;Nicholas Carlini;Alexey Kurakin", "authorids": "~David_Berthelot1;~Rebecca_Roelofs1;~Kihyuk_Sohn1;~Nicholas_Carlini1;~Alexey_Kurakin1", "gender": ";F;M;;M", "homepage": ";;https://sites.google.com/site/kihyuksml/;http://nicholas.carlini.com;http://kurakin.me", "dblp": "96/6489;145/2224;53/10771;145/1806;56/9834", "google_scholar": "46--eogAAAAJ;;VxpypngAAAAJ;;nCh4qyMAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~David_Berthelot1;~Rebecca_Roelofs1;~Kihyuk_Sohn1;~Nicholas_Carlini1;~Alexey_Kurakin1", "aff": "Apple;Google;Google;Google;Research, Google", "aff_domain": "apple.com;google.com;google.com;google.com;research.google.com", "position": "Researcher;Research scientist;Research Scientist;Researcher;Research Software Engineer", "bibtex": "@inproceedings{\nberthelot2022adamatch,\ntitle={AdaMatch: A Unified Approach to Semi-Supervised Learning and Domain Adaptation},\nauthor={David Berthelot and Rebecca Roelofs and Kihyuk Sohn and Nicholas Carlini and Alexey Kurakin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Q5uh1Nvv5dm}\n}", "github": "", "project": "", "reviewers": "wiKA;Jgga;aiq7;rNsu", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;5", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;4;0", "wc_summary_paper": "135;51;70;72", "wc_summary_review": "109;28;38;50", "wc_main_review": "525;145;429;196", "wc_review": "769;224;537;318", "wc_reply_reviewers": "69;9;0;0", "wc_reply_authors": "933;137;288;281", "reply_reviewers": "1;1;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 82.0, 31.678068122914315 ], "wc_summary_review_avg": [ 56.25, 31.43544973433655 ], "wc_main_review_avg": [ 323.75, 157.99584646439286 ], "wc_review_avg": [ 462.0, 210.50771957341613 ], "wc_reply_reviewers_avg": [ 19.5, 28.81405906844782 ], "wc_reply_authors_avg": [ 409.75, 308.0514364517718 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 203, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9221339163655588943&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Q5uh1Nvv5dm", "email": "apple.com;google.com;google.com;google.com;research.google.com", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Apple;Google", "aff_unique_dep": "Apple Inc.;Google", "aff_unique_url": "https://www.apple.com;https://www.google.com", "aff_unique_abbr": "Apple;Google", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Boosting the 
Certified Robustness of L-infinity Distance Nets", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6543", "id": "Q76Y7wkiji", "poster": "", "openreview": "https://openreview.net/forum?id=Q76Y7wkiji", "slides": "https://iclr.cc/virtual/2022/poster/6543", "video": "https://iclr.cc/virtual/2022/poster/6543", "author_site": "Bohang Zhang, Du Jiang, Di He, Liwei Wang", "tldr": "", "abstract": "Recently, Zhang et al. (2021) developed a new neural network architecture based on $\\ell_\\infty$-distance functions, which naturally possesses certified $\\ell_\\infty$ robustness by its construction. Despite the novel design and theoretical foundation, so far the model has only achieved performance comparable to that of conventional networks. In this paper, we make the following two contributions: $\\mathrm{(i)}$ We demonstrate that $\\ell_\\infty$-distance nets enjoy a fundamental advantage in certified robustness over conventional networks (under typical certification approaches); $\\mathrm{(ii)}$ With an improved training process we are able to significantly boost the certified accuracy of $\\ell_\\infty$-distance nets. Our training approach largely alleviates the optimization problem that arose in the previous training scheme, in particular, the unexpectedly large Lipschitz constant due to the use of a crucial trick called \\textit{$\\ell_p$-relaxation}. The core of our training approach is a novel objective function that combines scaled cross-entropy loss and clipped hinge loss with a decaying mixing coefficient. Experiments show that using the proposed training strategy, the certified accuracy of $\\ell_\\infty$-distance net can be dramatically improved from 33.30% to 40.06% on CIFAR-10 ($\\epsilon=8/255$), while outperforming other approaches in this area by a large margin. Our results clearly demonstrate the effectiveness and potential of $\\ell_\\infty$-distance net for certified robustness. 
Code is available at https://github.com/zbh2047/L_inf-dist-net-v2.", "keywords": "Adversarial Robustness;Certified Defense;Lipschitz Network", "primary_area": "", "supplementary_material": "/attachment/a9cf8b11614a3670161a19ff40037d41d9549ba2.zip", "author": "Bohang Zhang;Du Jiang;Di He;Liwei Wang", "authorids": "~Bohang_Zhang1;tlzmybm@gmail.com;~Di_He1;~Liwei_Wang1", "gender": "M;;M;M", "homepage": "https://zbh2047.github.io;;https://dihe-pku.github.io/;http://www.liweiwang-pku.com/", "dblp": "276/0156.html;;74/184;", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;;https://scholar.google.co.jp/citations?user=orVoz4IAAAAJ;VZHxoh8AAAAJ", "orcid": ";;;", "linkedin": "zhangbohang;;;", "or_profile": "~Bohang_Zhang1;tlzmybm@gmail.com;~Di_He1;~Liwei_Wang1", "aff": "Peking University;;Microsoft;Peking University", "aff_domain": "pku.edu.cn;;microsoft.com;pku.edu.cn", "position": "PhD student;;Senior Researcher;Full Professor", "bibtex": "@inproceedings{\nzhang2022boosting,\ntitle={Boosting the Certified Robustness of L-infinity Distance Nets},\nauthor={Bohang Zhang and Du Jiang and Di He and Liwei Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Q76Y7wkiji}\n}", "github": "", "project": "", "reviewers": "WMpC;THnD;d51P;TteL", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;4;4;3", "wc_summary_paper": "45;25;50;114", "wc_summary_review": "49;22;34;57", "wc_main_review": "169;136;163;179", "wc_review": "263;183;247;350", "wc_reply_reviewers": "0;99;36;0", "wc_reply_authors": "698;1389;829;246", "reply_reviewers": "0;2;1;0", "reply_authors": "2;4;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 58.5, 33.380383460949034 ], "wc_summary_review_avg": [ 40.5, 13.5 ], "wc_main_review_avg": [ 161.75, 15.927570436196476 ], "wc_review_avg": [ 260.75, 59.59184088447008 ], "wc_reply_reviewers_avg": [ 33.75, 40.43745170012571 ], "wc_reply_authors_avg": [ 790.5, 407.65211884644975 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7903222136558927992&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Q76Y7wkiji", "email": "pku.edu.cn;;microsoft.com;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Peking University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "Peking U;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "Bandit Learning with Joint Effect of Incentivized Sampling, Delayed Sampling Feedback, and Self-Reinforcing User Preferences", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7025", "id": "Q83vFlie_Pr", "poster": "", "openreview": 
"https://openreview.net/forum?id=Q83vFlie_Pr", "slides": "https://iclr.cc/virtual/2022/poster/7025", "video": "https://iclr.cc/virtual/2022/poster/7025", "author_site": "Tianchen Zhou, Jia Liu, Chaosheng Dong, Yi Sun", "tldr": "", "abstract": "In this paper, we consider a new multi-armed bandit (MAB) framework motivated by three common complications in online recommender systems in practice: (i) the platform (learning agent) cannot sample an intended product directly and has to incentivize customers to select this product (e.g., promotions and coupons); (ii) customer feedbacks are often received later than their selection times; and (iii) customer preferences among products are influenced and reinforced by historical feedbacks. From the platform's perspective, the goal of the MAB framework is to maximize total reward without incurring excessive incentive costs. A major challenge of this MAB framework is that the loss of information caused by feedback delay complicates both user preference evolution and arm incentivizing decisions, both of which are already highly non-trivial even by themselves. Toward this end, we first propose a policy called ``UCB-Filtering-with-Delayed-Feedback'' (UCB-FDF) policy for this new MAB framework. In our analysis, we consider delayed feedbacks that can have either arm-independent or arm-dependent distributions. In both cases, we allow unbounded support for the random delays, i.e., the random delay can be infinite. We show that the delay impacts in both cases can still be upper bounded by an additive penalty on both the regret and total incentive costs. This further implies that logarithmic regret and incentive cost growth rates are achievable under this new MAB framework. Experimental results corroborate our theoretical analysis on both regret and incentive costs.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/269adcd3db2846a2315e5c6daa7a63dbc05dfa62.zip", "author": "Tianchen Zhou;Jia Liu;Chaosheng Dong;Yi Sun", "authorids": "~Tianchen_Zhou1;~Jia_Liu1;~Chaosheng_Dong1;yisun@amazon.com", "gender": "F;M;M;", "homepage": ";https://kevinliu-osu.github.io/index.html;https://chaoshengdong.github.io/;", "dblp": "293/7470;;225/6556;", "google_scholar": "oGYO-a0AAAAJ;Ofx3dScAAAAJ;nPratvEAAAAJ;", "orcid": ";;0000-0003-4491-0594;", "linkedin": "tianchen-zhou-6582b510b/;;chaosheng-dong/;", "or_profile": "~Tianchen_Zhou1;~Jia_Liu1;~Chaosheng_Dong1;yisun@amazon.com", "aff": "The Ohio State University;The Ohio State University;Amazon;", "aff_domain": "osu.edu;osu.edu;amazon.com;", "position": "PhD student;Assistant Professor;Researcher;", "bibtex": "@inproceedings{\nzhou2022bandit,\ntitle={Bandit Learning with Joint Effect of Incentivized Sampling, Delayed Sampling Feedback, and Self-Reinforcing User Preferences},\nauthor={Tianchen Zhou and Jia Liu and Chaosheng Dong and Yi Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Q83vFlie_Pr}\n}", "github": "", "project": "", "reviewers": "mJjk;8BXC;Rqug;wZfK", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;3;0", "wc_summary_paper": "42;92;66;71", "wc_summary_review": "35;130;37;108", "wc_main_review": "273;424;184;666", "wc_review": "350;646;287;845", "wc_reply_reviewers": "12;73;0;1002", "wc_reply_authors": "912;1294;468;2289", "reply_reviewers": "1;1;0;3", "reply_authors": "2;2;1;4", "recommendation_avg": [ 5.75, 
0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 67.75, 17.781661902083282 ], "wc_summary_review_avg": [ 77.5, 42.228544848242166 ], "wc_main_review_avg": [ 386.75, 182.62991950937283 ], "wc_review_avg": [ 532.0, 225.89488706033165 ], "wc_reply_reviewers_avg": [ 271.75, 422.5176771449924 ], "wc_reply_authors_avg": [ 1240.75, 672.1016943141864 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=Q83vFlie_Pr", "email": "osu.edu;osu.edu;amazon.com;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Ohio State University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.osu.edu;https://www.amazon.com", "aff_unique_abbr": "OSU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Q8OjAGkxwP5", "title": "Limitations of Active Learning With Deep Transformer Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Active Learning (AL) has the potential to reduce labeling cost when training natural language processing models, but its effectiveness with the large pretrained transformer language models that power today's NLP is uncertain. We present experiments showing that when applied to modern pretrained models, active learning offers inconsistent and often poor performance. As in prior work, we find that AL sometimes selects harmful \"unlearnable\" collective outliers, but we discover that some failures have a different explanation: the examples AL selects are informative but also increase training instability, reducing average performance. Our findings suggest that for some datasets this instability can be mitigated by training multiple models and selecting the best on a validation set, which we show impacts relative AL performance comparably to the outlier-pruning technique from prior work while also increasing absolute performance. 
Our experiments span three pretrained models, ten datasets, and four active learning approaches.", "keywords": "Active Learning;Machine Learning;Natural Language Processing", "primary_area": "", "supplementary_material": "/attachment/78e774c2e7f527cc493563b643621b8d02c9728b.zip", "author": "Mike D'Arcy;Doug Downey", "authorids": "~Mike_D'Arcy1;~Doug_Downey1", "gender": "M;M", "homepage": "https://mdarcy220.gitlab.io/;https://www.cs.northwestern.edu/~ddowney/", "dblp": ";57/5363", "google_scholar": "q5WUx2AAAAAJ;E8evkcQAAAAJ", "orcid": "0000-0003-0355-7157;", "linkedin": ";", "or_profile": "~Mike_D'Arcy1;~Doug_Downey1", "aff": "Northwestern University;Northwestern University", "aff_domain": "northwestern.edu;northwestern.edu", "position": "PhD student;Professor", "bibtex": "@misc{\nd'arcy2022limitations,\ntitle={Limitations of Active Learning With Deep Transformer Language Models},\nauthor={Mike D'Arcy and Doug Downey},\nyear={2022},\nurl={https://openreview.net/forum?id=Q8OjAGkxwP5}\n}", "github": "", "project": "", "reviewers": "Seyb;e7J6;DPi3;dz3i;igJc", "site": "https://openreview.net/forum?id=Q8OjAGkxwP5", "pdf_size": 0, "recommendation": "5;5;6;6;6", "confidence": "3;4;3;4;3", "correctness": "3;2;3;3;4", "technical_novelty": "3;1;2;2;3", "empirical_novelty": "2;2;3;2;2", "wc_summary_paper": "56;82;60;78;18", "wc_summary_review": "46;83;50;28;22", "wc_main_review": "333;754;365;191;83", "wc_review": "435;919;475;297;123", "wc_reply_reviewers": "5;450;61;59;0", "wc_reply_authors": "294;604;223;216;77", "reply_reviewers": "1;1;1;1;0", "reply_authors": "1;2;1;1;1", "recommendation_avg": [ 5.6, 0.48989794855663565 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 58.8, 22.719154913860685 ], "wc_summary_review_avg": [ 45.8, 21.37662274541982 ], "wc_main_review_avg": [ 345.2, 228.1161107857137 ], "wc_review_avg": [ 449.8, 265.00898097989057 ], "wc_reply_reviewers_avg": [ 115.0, 169.47094146195093 ], "wc_reply_authors_avg": [ 282.8, 175.32073465508864 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.16666666666666666, "corr_recommendation_correctness": 0.6454972243679027, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5936224528564491397&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "QCeFEThVn3", "title": "GraphEBM: Towards Permutation Invariant and Multi-Objective Molecular Graph Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although significant progress has been made in molecular graph generation recently, permutation invariance and multi-objective generation remain to be important but challenging goals to achieve. In this work, we propose GraphEBM, a molecular graph generation method via energy-based models (EBMs), as an exploratory work to perform permutation invariant and multi-objective molecule generation. 
In particular, thanks to the flexibility of EBMs and our parameterized permutation-invariant energy function, our GraphEBM can define a permutation invariant distribution over molecular graphs. We learn the energy function by contrastive divergence and generate samples by Langevin dynamics. In addition, to generate molecules with a specific desirable property, we propose a simple yet effective learning strategy, which pushes down energies with flexible degrees according to the properties of corresponding molecules. Further, we explore using our GraphEBM to generate molecules towards multiple objectives via compositional generation, which is practically desirable in drug discovery. We conduct comprehensive experiments on random, single-objective, and multi-objective molecule generation tasks. The results demonstrate that our method is effective.", "keywords": "molecular graph generation;energy-based models;permutation invariance;multi-objective", "primary_area": "", "supplementary_material": "", "author": "Meng Liu;Keqiang Yan;Bora Oztekin;Shuiwang Ji", "authorids": "~Meng_Liu3;~Keqiang_Yan2;~Bora_Oztekin1;~Shuiwang_Ji1", "gender": "M;M;;M", "homepage": "https://mengliu1998.github.io;;https://boraoztekin.com;http://people.tamu.edu/~sji", "dblp": "41/7841-15;272/6760;276/1721.html;84/6405", "google_scholar": "https://scholar.google.com/citations?hl=en;cv52C8oAAAAJ;k7tlPR0AAAAJ;BZGj6sAAAAAJ", "orcid": ";;0000-0003-4766-4106;0000-0002-4205-4563", "linkedin": "meng-liu-4a1813197/;;http://linkedin.com/in/boraoztekin;shuiwang-ji-9a040715/", "or_profile": "~Meng_Liu3;~Keqiang_Yan2;~Bora_Oztekin1;~Shuiwang_Ji1", "aff": "Texas A&M University - College Station;Texas A&M University;Texas A&M;Texas A&M University", "aff_domain": "tamu.edu;tamu.edu;tamu.edu;tamu.edu", "position": "PhD student;PhD student;Undergrad student;Professor", "bibtex": "@misc{\nliu2022graphebm,\ntitle={Graph{EBM}: Towards Permutation Invariant and Multi-Objective Molecular Graph Generation},\nauthor={Meng Liu and Keqiang Yan and Bora Oztekin and Shuiwang Ji},\nyear={2022},\nurl={https://openreview.net/forum?id=QCeFEThVn3}\n}", "github": "", "project": "", "reviewers": "JCco;tnpH;Fo2L;EHKx", "site": "https://openreview.net/forum?id=QCeFEThVn3", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "86;84;65;81", "wc_summary_review": "31;39;49;13", "wc_main_review": "140;426;378;113", "wc_review": "257;549;492;207", "wc_reply_reviewers": "0;29;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;1;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 79.0, 8.276472678623424 ], "wc_summary_review_avg": [ 33.0, 13.19090595827292 ], "wc_main_review_avg": [ 264.25, 139.11932827612415 ], "wc_review_avg": [ 376.25, 146.71975838311621 ], "wc_reply_reviewers_avg": [ 7.25, 12.55736835487436 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.19245008972987526, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:EUF5-G068gUJ:scholar.google.com/&scioq=GraphEBM:+Towards+Permutation+Invariant+and+Multi-Objective+Molecular+Graph+Generation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0", "aff_campus_unique": "College Station;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "QDDVxweQJy0", "title": "Proving Theorems using Incremental Learning and Hindsight Experience Replay", "track": "main", "status": "Reject", "tldr": "", "abstract": "Traditional automated theorem provers for first-order logic depend on speed-optimized search and many handcrafted heuristics that are designed to work best over a wide range of domains. Machine learning approaches in literature either depend on these traditional provers to bootstrap themselves or fall short on reaching comparable performance. In this paper, we propose a general incremental learning algorithm for training domain-specific provers for first-order logic without equality, based only on a basic given-clause algorithm, but using a learned clause-scoring function. Clauses are represented as graphs and presented to transformer networks with spectral features. To address the sparsity and the initial lack of training data as well as the lack of a natural curriculum, we adapt hindsight experience replay to theorem proving, so as to be able to learn even when no proof can be found. We show that provers trained this way can match and sometimes surpass state-of-the-art traditional provers on the TPTP dataset in terms of both quantity and quality of the proofs.", "keywords": "theorem proving;incremental learning;hindsight experience replay;transformers", "primary_area": "", "supplementary_material": "/attachment/764fa2406f3be9846389906b407dc26b1e8b7f58.zip", "author": "Eser Ayg\u00fcn;Laurent Orseau;Ankit Anand;Xavier Glorot;Vlad Firoiu;Lei M Zhang;Doina Precup;Shibl Mourad", "authorids": "~Eser_Ayg\u00fcn1;~Laurent_Orseau1;~Ankit_Anand4;~Xavier_Glorot1;~Vlad_Firoiu1;~Lei_M_Zhang1;~Doina_Precup1;~Shibl_Mourad1", "gender": ";M;M;M;F;M;M;", "homepage": ";;;https://leiz86.github.io;http://cs.mcgill.ca/~dprecup/;;https://sites.google.com/corp/view/ankitsanand/home;https://eseraygun.com/", "dblp": "79/1040;;195/6056;;p/DoinaPrecup;;;", "google_scholar": ";_WnkXlkAAAAJ;;-kdBDxYAAAAJ;https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ;;;mogd5nkAAAAJ", "orcid": ";0000-0002-6729-2189;;;;;;", "linkedin": ";xavier-glorot-33692956/;;;;;;", "or_profile": "~Laurent_Orseau1;~Xavier_Glorot1;~Vlad_Firoiu1;~Lei_M_Zhang1;~Doina_Precup1;~Shibl_Mourad1;~Ankit_Anand1;~Eser_Aygun1", "aff": ";Google DeepMind;;Google DeepMind;McGill University;;Google DeepMind;Google DeepMind", "aff_domain": ";google.com;;deepmind.com;mcgill.ca;;deepmind.com;deepmind.com", "position": ";Researcher;;Research Scientist;Associate Professor;;Research Scientist;Researcher", "bibtex": "@misc{\nayg{\\\"u}n2022proving,\ntitle={Proving Theorems using Incremental Learning and Hindsight Experience Replay},\nauthor={Eser Ayg{\\\"u}n and Laurent Orseau and Ankit Anand and Xavier Glorot and Vlad Firoiu and Lei M Zhang and Doina Precup and Shibl Mourad},\nyear={2022},\nurl={https://openreview.net/forum?id=QDDVxweQJy0}\n}", "github": "", "project": "", "reviewers": "89SU;MLCG;iBNz;8LVa;ywZc", "site": "https://openreview.net/forum?id=QDDVxweQJy0", 
"pdf_size": 0, "recommendation": "3;5;5;6;8", "confidence": "4;5;3;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "101;256;106;47;58", "wc_summary_review": "53;123;57;47;69", "wc_main_review": "543;424;224;290;344", "wc_review": "697;803;387;384;471", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.4, 1.624807680927192 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 113.6, 74.86147206674472 ], "wc_summary_review_avg": [ 69.8, 27.556487439439735 ], "wc_main_review_avg": [ 365.0, 110.55496370584181 ], "wc_review_avg": [ 548.4, 170.86322014991993 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6014838924852806153&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Google;McGill University", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.mcgill.ca", "aff_unique_abbr": "DeepMind;McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "THOMAS: Trajectory Heatmap Output with learned Multi-Agent Sampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6793", "id": "QDdJhACYrlX", "poster": "", "openreview": "https://openreview.net/forum?id=QDdJhACYrlX", "slides": "https://iclr.cc/virtual/2022/poster/6793", "video": "https://iclr.cc/virtual/2022/poster/6793", "author_site": "Thomas Gilles, Stefano Sabatini, Dzmitry Tsishkou, Bogdan Stanciulescu, Fabien Moutarde", "tldr": "", "abstract": "In this paper, we propose THOMAS, a joint multi-agent trajectory prediction framework allowing for an efficient and consistent prediction of multi-agent multi-modal trajectories. We present a unified model architecture for simultaneous agent future heatmap estimation, in which we leverage hierarchical and sparse image generation for fast and memory-efficient inference. We propose a learnable trajectory recombination model that takes as input a set of predicted trajectories for each agent and outputs its consistent reordered recombination. This recombination module is able to realign the initially independent modalities so that they do no collide and are coherent with each other. 
We report our results on the Interaction multi-agent prediction challenge and rank $1^{st}$ on the online test leaderboard.", "keywords": "Trajectory prediction;Multi-agent;Motion forecasting;Motion estimation;Autonomous driving", "primary_area": "", "supplementary_material": "", "author": "Thomas Gilles;Stefano Sabatini;Dzmitry Tsishkou;Bogdan Stanciulescu;Fabien Moutarde", "authorids": "~Thomas_Gilles1;~Stefano_Sabatini1;~Dzmitry_Tsishkou1;~Bogdan_Stanciulescu1;~Fabien_Moutarde1", "gender": "M;;;M;M", "homepage": ";https://scholar.google.com/citations?user=7wHdhM_d-zYC&hl=en;;;http://people.minesparis.psl.eu/fabien.moutarde", "dblp": "250/9314;;;;69/3569", "google_scholar": "IrPWvM0AAAAJ;7wHdhM_d-zYC;;https://scholar.google.fr/citations?user=42aGR78AAAAJ;https://scholar.google.fr/citations?user=8IMaM0QAAAAJ", "orcid": "0000-0002-5840-1216;;;;0000-0003-4799-7285", "linkedin": "thomas-gilles/;;;bogdan-stanciulescu-5239012/;fabien-moutarde-b9990bb/", "or_profile": "~Thomas_Gilles1;~Stefano_Sabatini1;~Dzmitry_Tsishkou1;~Bogdan_Stanciulescu1;~Fabien_Moutarde1", "aff": "Mines ParisTech;Huawei Technologies Ltd.;;Mines ParisTech;MinesParis PSL", "aff_domain": "mines-paristech.fr;huawei.com;;mines-paristech.fr;minesparis.psl.eu", "position": "PhD student;Senior Engineer;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ngilles2022thomas,\ntitle={{THOMAS}: Trajectory Heatmap Output with learned Multi-Agent Sampling},\nauthor={Thomas Gilles and Stefano Sabatini and Dzmitry Tsishkou and Bogdan Stanciulescu and Fabien Moutarde},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=QDdJhACYrlX}\n}", "github": "", "project": "", "reviewers": "ZJY1;PgCY;kDAs;4LEw", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "5;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "89;87;114;58", "wc_summary_review": "186;159;70;270", "wc_main_review": "1058;727;380;357", "wc_review": "1333;973;564;685", "wc_reply_reviewers": "0;11;0;88", "wc_reply_authors": "1744;1209;867;671", "reply_reviewers": "0;1;0;1", "reply_authors": "3;2;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 87.0, 19.836834424877374 ], "wc_summary_review_avg": [ 171.25, 71.3630681795563 ], "wc_main_review_avg": [ 630.5, 287.06314636330455 ], "wc_review_avg": [ 888.75, 296.4088181886632 ], "wc_reply_reviewers_avg": [ 24.75, 36.79249244071404 ], "wc_reply_authors_avg": [ 1122.75, 407.0862162982186 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1445985947077165932&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 18, "pdf": "https://openreview.net/pdf?id=QDdJhACYrlX", "email": "mines-paristech.fr;huawei.com;;mines-paristech.fr;minesparis.psl.eu", "author_num": 5, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "MINES ParisTech;Huawei;MinesParis PSL", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.mines-paristech.fr;https://www.huawei.com;https://www.minesparis.psl.eu", "aff_unique_abbr": "Mines 
ParisTech;Huawei;MinesParis PSL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "France;China" }, { "id": "QEBHPRodWYE", "title": "InstaHide\u2019s Sample Complexity When Mixing Two Private Images", "track": "main", "status": "Reject", "tldr": "", "abstract": "Inspired by the InstaHide challenge [Huang, Song, Li and Arora'20], [Chen, Song and Zhuo'20] recently provided a mathematical formulation of the InstaHide attack problem under a Gaussian image distribution. They show that it suffices to use $O(n_{\\mathsf{priv}}^{k_{\\mathsf{priv}} - 2/(k_{\\mathsf{priv}} + 1)})$ samples to recover one private image in $n_{\\mathsf{priv}}^{O(k_{\\mathsf{priv}})} + \\mathrm{poly}(n_{\\mathsf{pub}})$ time for any integer $k_{\\mathsf{priv}}$, where $n_{\\mathsf{priv}}$ and $n_{\\mathsf{pub}}$ denote the number of images used in the private and the public dataset to generate a mixed image sample. Under the current setup for the InstaHide challenge of mixing two private images ($k_{\\mathsf{priv}} = 2$), this means $n_{\\mathsf{priv}}^{4/3}$ samples are sufficient to recover a private image. In this work, we show that $n_{\\mathsf{priv}} \\log ( n_{\\mathsf{priv}} )$ samples are sufficient (information-theoretically) for recovering all the private images. \n\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/6554e6a07fbd7fbcad0a729ed9da4d9d85b57e34.zip", "author": "Baihe Huang;Zhao Song;Runzhou Tao;Ruizhe Zhang;Danyang Zhuo", "authorids": "~Baihe_Huang1;~Zhao_Song6;~Runzhou_Tao1;~Ruizhe_Zhang2;~Danyang_Zhuo1", "gender": ";M;M;M;M", "homepage": ";https://runzhoutao.github.io/;;https://danyangzhuo.com/;https://www.youtube.com/@zhaosong2031", "dblp": "279/4131;228/8097;133/6407-1;151/7537;76/4051-2", "google_scholar": "chICXXMAAAAJ;;;E3yOuvEAAAAJ;yDZct7UAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Baihe_Huang1;~Runzhou_Tao1;~Ruizhe_Zhang2;~Danyang_Zhuo1;~Zhao_Song3", "aff": "Peking University;Columbia University;The University of Texas at Austin;Duke University;Adobe", "aff_domain": "pku.edu.cn;columbia.edu;utexas.edu;duke.edu;adobe.com", "position": "Undergrad student;PhD student;PhD student;Assistant Professor;Researcher", "bibtex": "@misc{\nhuang2022instahides,\ntitle={InstaHide{\\textquoteright}s Sample Complexity When Mixing Two Private Images },\nauthor={Baihe Huang and Zhao Song and Runzhou Tao and Ruizhe Zhang and Danyang Zhuo},\nyear={2022},\nurl={https://openreview.net/forum?id=QEBHPRodWYE}\n}", "github": "", "project": "", "reviewers": "3gkW;d4iK;x8b2", "site": "https://openreview.net/forum?id=QEBHPRodWYE", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;4", "correctness": "4;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "0;0;3", "wc_summary_paper": "46;127;27", "wc_summary_review": "48;122;57", "wc_main_review": "172;90;206", "wc_review": "266;339;290", "wc_reply_reviewers": "0;6;0", "wc_reply_authors": "420;245;237", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 66.66666666666667, 43.36152928832448 ], "wc_summary_review_avg": [ 75.66666666666667, 32.96799795087486 ], "wc_main_review_avg": [ 156.0, 48.68949236402724 ], "wc_review_avg": [ 
298.3333333333333, 30.379086373505196 ], "wc_reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "wc_reply_authors_avg": [ 300.6666666666667, 84.44459064314829 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17958904921652803072&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Peking University;Columbia University;University of Texas at Austin;Duke University;Adobe", "aff_unique_dep": ";;;;Adobe Inc.", "aff_unique_url": "http://www.pku.edu.cn;https://www.columbia.edu;https://www.utexas.edu;https://www.duke.edu;https://www.adobe.com", "aff_unique_abbr": "Peking U;Columbia;UT Austin;Duke;Adobe", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "QFNIpIrkANz", "title": "Learning Invariant Reward Functions through Trajectory Interventions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Inverse reinforcement learning methods aim to retrieve the reward function of a Markov decision process based on a dataset of expert demonstrations. The commonplace scarcity of such demonstrations potentially leads to the absorption of spurious correlations in the data by the learning model, which, as a result, exhibits behavioural overfitting to the expert dataset when trained on the obtained reward function. We study the generalization properties of the maximum entropy method for solving the inverse reinforcement learning problem for both exact and approximate formulations and demonstrate that, by applying an instantiation of the invariant risk minimization principle, we can recover reward functions which induce better-performing policies across domains in the transfer setting.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/a75ed0621e1fb03aac6345f9da5af98f80b47c7e.zip", "author": "Ivan Ovinnikov;Eugene Bykovets;Joachim M. Buhmann", "authorids": "~Ivan_Ovinnikov1;~Eugene_Bykovets1;~Joachim_M._Buhmann1", "gender": "M;M;M", "homepage": ";https://ise.ethz.ch;", "dblp": ";b/JMBuhmann;", "google_scholar": "https://scholar.google.ch/citations?user=m8UKFekAAAAJ;https://scholar.google.ch/citations?user=zQWbCzYAAAAJ;https://scholar.google.com/citations?hl=ru", "orcid": ";;", "linkedin": "ivan-ovinnikov-0b227593/;;eugenebykovets", "or_profile": "~Ivan_Ovinnikov1;~Joachim_M._Buhmann1;~Evgenii_V_Bykovetc1", "aff": "Swiss Federal Institute of Technology;Department of Computer Science, ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;inf.ethz.ch;ethz.ch", "position": "PhD student;Professor;PhD student", "bibtex": "@misc{\novinnikov2022learning,\ntitle={Learning Invariant Reward Functions through Trajectory Interventions},\nauthor={Ivan Ovinnikov and Eugene Bykovets and Joachim M. 
Buhmann},\nyear={2022},\nurl={https://openreview.net/forum?id=QFNIpIrkANz}\n}", "github": "", "project": "", "reviewers": "33fy;nEw9;bhYo;YvDw", "site": "https://openreview.net/forum?id=QFNIpIrkANz", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;4;3", "correctness": "2;4;3;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "56;58;103;65", "wc_summary_review": "14;25;46;35", "wc_main_review": "209;496;667;186", "wc_review": "279;579;816;286", "wc_reply_reviewers": "0;0;284;55", "wc_reply_authors": "207;335;727;94", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 70.5, 19.059118552545918 ], "wc_summary_review_avg": [ 30.0, 11.853269591129697 ], "wc_main_review_avg": [ 389.5, 201.45781196071798 ], "wc_review_avg": [ 490.0, 223.7934315389976 ], "wc_reply_reviewers_avg": [ 84.75, 117.20788156092576 ], "wc_reply_authors_avg": [ 340.75, 238.74502612620017 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9169493006161777, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iW8sftYvmSsJ:scholar.google.com/&scioq=Learning+Invariant+Reward+Functions+through+Trajectory+Interventions&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;ETHZ", "aff_campus_unique_index": "1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Ada-NETS: Face Clustering via Adaptive Neighbour Discovery in the Structure Space", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6907", "id": "QJWVP4CTmW4", "poster": "", "openreview": "https://openreview.net/forum?id=QJWVP4CTmW4", "slides": "https://iclr.cc/virtual/2022/poster/6907", "video": "https://iclr.cc/virtual/2022/poster/6907", "author_site": "Yaohua Wang, Yaobin Zhang, Fangyi Zhang, Senzhang Wang, Ming Lin, Yuqi Zhang, Xiuyu Sun", "tldr": "", "abstract": "Face clustering has attracted rising research interest recently to take advantage of massive amounts of face images on the web. State-of-the-art performance has been achieved by Graph Convolutional Networks (GCN) due to their powerful representation capacity. However, existing GCN-based methods build face graphs mainly according to $k$NN relations in the feature space, which may lead to a lot of noise edges connecting two faces of different classes. The face features will be polluted when messages pass along these noise edges, thus degrading the performance of GCNs. In this paper, a novel algorithm named Ada-NETS is proposed to cluster faces by constructing clean graphs for GCNs. In Ada-NETS, each face is transformed to a new structure space, obtaining robust features by considering face features of the neighbour images. Then, an adaptive neighbour discovery strategy is proposed to determine a proper number of edges connecting to each face image. 
It significantly reduces the noise edges while maintaining the good ones to build a graph with clean yet rich edges for GCNs to cluster faces. Experiments on multiple public clustering datasets show that Ada-NETS significantly outperforms current state-of-the-art methods, proving its superiority and generalization. Code is available at https://github.com/damo-cv/Ada-NETS.", "keywords": "Face Clustering;Graph Convolutional Networks (GCN);Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Yaohua Wang;Yaobin Zhang;Fangyi Zhang;Senzhang Wang;Ming Lin;YuQi Zhang;Xiuyu Sun", "authorids": "~Yaohua_Wang2;~Yaobin_Zhang1;~Fangyi_Zhang4;~Senzhang_Wang2;~Ming_Lin4;~YuQi_Zhang5;~Xiuyu_Sun1", "gender": "M;M;M;M;M;M;M", "homepage": "https://thomas-wyh.github.io/;https://buptzyb.github.io;http://www.fangyizhang.com;https://senzhangwangcsu.github.io/index.html;https://minglin-home.github.io/;;https://sites.google.com/view/sunxiuyu/home", "dblp": ";;;118/5055;;;40/8845", "google_scholar": "TRAwmsgAAAAJ;5h6H9_kAAAAJ;5jFI06UAAAAJ;zdWyGRMAAAAJ;https://scholar.google.com/citations?hl=en;Ba0iNJoAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0009-0005-6211-6388;;0000-0003-3938-5377;0000-0002-3615-4859;;;0000-0002-7208-8078", "linkedin": ";;fangyi-zhang-a6108088;;;;", "or_profile": "~Yaohua_Wang2;~Yaobin_Zhang1;~Fangyi_Zhang4;~Senzhang_Wang2;~Ming_Lin4;~YuQi_Zhang5;~Xiuyu_Sun1", "aff": "Alibaba Group;Alibaba Group;Queensland University of Technology;Central South University;Alibaba Group;;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;qut.edu.au;csu.edu.cn;alibaba-inc.com;;alibaba-inc.com", "position": "Researcher;Researcher;Postdoc;Full Professor;Algorithm Engineer;;Staff Algorithm Engineer", "bibtex": "@inproceedings{\nwang2022adanets,\ntitle={Ada-{NETS}: Face Clustering via Adaptive Neighbour Discovery in the Structure Space},\nauthor={Yaohua Wang and Yaobin Zhang and Fangyi Zhang and Senzhang Wang and Ming Lin and YuQi Zhang and Xiuyu Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=QJWVP4CTmW4}\n}", "github": "", "project": "", "reviewers": "MCAC;FhxA;DwFA;Q8yu", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;4;5;4", "correctness": "4;3;2;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "96;105;84;201", "wc_summary_review": "28;5;9;4", "wc_main_review": "172;35;121;326", "wc_review": "296;145;214;531", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "698;361;491;718", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 121.5, 46.5 ], "wc_summary_review_avg": [ 11.5, 9.7082439194738 ], "wc_main_review_avg": [ 163.5, 105.82650896632659 ], "wc_review_avg": [ 296.5, 145.55840752082995 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 567.0, 148.47053579751102 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": -0.1266600992762247, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7829793345163154347&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=QJWVP4CTmW4", "email": "alibaba-inc.com;alibaba-inc.com;qut.edu.au;csu.edu.cn;alibaba-inc.com;;alibaba-inc.com", "author_num": 7, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Alibaba Group;Queensland University of Technology;Central South University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.alibaba.com;https://www.qut.edu.au;https://www.csu.edu.cn", "aff_unique_abbr": "Alibaba;QUT;CSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;Australia" }, { "id": "QJb1-8NH2Ux", "title": "Detecting Adversarial Examples Is (Nearly) As Hard As Classifying Them", "track": "main", "status": "Reject", "tldr": "", "abstract": "Making classifiers robust to adversarial examples is challenging. \nThus, many defenses tackle the seemingly easier task of \\emph{detecting} perturbed inputs.\n\nWe show a barrier towards this goal. We prove a general \\emph{hardness reduction} between detection and classification of adversarial examples: given a robust detector for attacks at distance $\\epsilon$ (in some metric), we show how to build a similarly robust (but inefficient) \\emph{classifier} for attacks at distance $\\epsilon/2$---and vice-versa.\n\nOur reduction is computationally inefficient, and thus cannot be used to build practical classifiers. Instead, it is a useful sanity check to test whether empirical detection results imply something much stronger than the authors presumably anticipated.\n\nTo illustrate, we revisit $14$ empirical detector defenses published over the past years. For $12/14$ defenses, we show that the claimed detection results imply an inefficient classifier with robustness far beyond the state-of-the-art--- thus casting some doubts on the results' validity.\n\nFinally, we show that our reduction applies in both directions: a robust classifier for attacks at distance $\\epsilon/2$ implies an inefficient robust detector at distance $\\epsilon$. 
Thus, we argue that robust classification and robust detection should be regarded as (near)-equivalent problems.", "keywords": "Adversarial Examples;Detection;Hardness Reductions", "primary_area": "", "supplementary_material": "", "author": "Florian Tramer", "authorids": "~Florian_Tramer1", "gender": "M", "homepage": "http://floriantramer.com", "dblp": "158/7224", "google_scholar": "https://scholar.google.ch/citations?user=ijH0-a8AAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Florian_Tramer1", "aff": "Google", "aff_domain": "google.com", "position": "Visiting Researcher", "bibtex": "@misc{\ntramer2022detecting,\ntitle={Detecting Adversarial Examples Is (Nearly) As Hard As Classifying Them},\nauthor={Florian Tramer},\nyear={2022},\nurl={https://openreview.net/forum?id=QJb1-8NH2Ux}\n}", "github": "", "project": "", "reviewers": "RAPn;2ebX;hr6H;Jyt1", "site": "https://openreview.net/forum?id=QJb1-8NH2Ux", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "39;37;89;284", "wc_summary_review": "107;67;62;76", "wc_main_review": "364;269;393;461", "wc_review": "510;373;544;821", "wc_reply_reviewers": "127;188;346;424", "wc_reply_authors": "1410;871;866;625", "reply_reviewers": "1;1;1;3", "reply_authors": "3;3;3;4", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 112.25, 101.32466382870462 ], "wc_summary_review_avg": [ 78.0, 17.478558292948534 ], "wc_main_review_avg": [ 371.75, 68.98324071830781 ], "wc_review_avg": [ 562.0, 162.65761586842467 ], "wc_reply_reviewers_avg": [ 271.25, 119.01549268897726 ], "wc_reply_authors_avg": [ 943.0, 287.3699705953982 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.25, 0.4330127018922193 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2383487011870063609&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "QJeN_cqtxvC", "title": "DistProp: A Scalable Approach to Lagrangian Training via Distributional Approximation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We develop a multiple shooting method for learning in deep neural networks based on the Lagrangian perspective on automatic differentiation. Our method leverages ideas from saddle-point optimization to derive stable first-order updates to solve a specific constrained optimization problem. Most importantly, we propose a novel solution allowing us to run our algorithm over mini-batches in a stochastic gradient fashion and to decouple the number of auxiliary variables from the size of the dataset. 
We show empirically that our method reliably achieves higher accuracy than other comparable local (biologically plausible) learning methods on MNIST, CIFAR10 and ImageNet.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Manuel Del Verme;Pierre-Luc Bacon", "authorids": "~Manuel_Del_Verme1;~Pierre-Luc_Bacon1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": "JcOwyS0AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Manuel_Del_Verme1;~Pierre-Luc_Bacon1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;", "aff_domain": "mila.umontreal.ca;", "position": "PhD student;", "bibtex": "@misc{\nverme2022distprop,\ntitle={DistProp: A Scalable Approach to Lagrangian Training via Distributional Approximation},\nauthor={Manuel Del Verme and Pierre-Luc Bacon},\nyear={2022},\nurl={https://openreview.net/forum?id=QJeN_cqtxvC}\n}", "github": "", "project": "", "reviewers": "FFvQ;jeMc;XPve;RniD", "site": "https://openreview.net/forum?id=QJeN_cqtxvC", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;3;3;3", "correctness": "2;2;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "104;63;59;90", "wc_summary_review": "39;15;2;31", "wc_main_review": "201;406;526;163", "wc_review": "344;484;587;284", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.0, 18.721645226849056 ], "wc_summary_review_avg": [ 21.75, 14.306903927824496 ], "wc_main_review_avg": [ 324.0, 148.8102819028309 ], "wc_review_avg": [ 424.75, 118.4976265585096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2iq95usS3nAJ:scholar.google.com/&scioq=DistProp:+A+Scalable+Approach+to+Lagrangian+Training+via+Distributional+Approximation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Montreal", "aff_unique_dep": "Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "QKEkEFpKBBv", "title": "DNBP: Differentiable Nonparametric Belief Propagation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a differentiable approach to learn the probabilistic factors used for inference by a nonparametric belief propagation algorithm. Existing nonparametric belief propagation methods rely on domain-specific features encoded in the probabilistic factors of a graphical model. In this work, we replace each crafted factor with a differentiable neural network enabling the factors to be learned using an efficient optimization routine from labeled data. By combining differentiable neural networks with an efficient belief propagation algorithm, our method learns to maintain a set of marginal posterior samples using end-to-end training. 
We evaluate our differentiable nonparametric belief propagation (DNBP) method on a set of articulated pose tracking tasks and compare performance with learned baselines. Results from these experiments demonstrate the effectiveness of using learned factors for tracking and suggest the practical advantage over hand-crafted approaches. The project webpage is available at: https://sites.google.com/view/diff-nbp", "keywords": "Belief Propagation;Bayesian Inference;Nonparametric Inference", "primary_area": "", "supplementary_material": "/attachment/1f3350f63bf3123c660e2f987c453057ee561bde.zip", "author": "Anthony Opipari;Jana Pavlasek;Chao Chen;Shoutian Wang;Karthik Desingh;Odest Jenkins", "authorids": "~Anthony_Opipari1;~Jana_Pavlasek1;joecc@umich.edu;shoutian@umich.edu;~Karthik_Desingh1;~Odest_Jenkins1", "gender": ";F;;;M;M", "homepage": "https://topipari.com;http://janapavlasek.com;;;;http://ocj.me/", "dblp": ";;;;124/2740;99/4449.html", "google_scholar": ";https://scholar.google.ca/citations?user=yJS-u7IAAAAJ;;;zgezSpQAAAAJ;dp5LnVAAAAAJ", "orcid": ";0000-0001-6332-2646;;;;", "linkedin": ";;;;;", "or_profile": "~Anthony_Opipari1;~Jana_Pavlasek1;joecc@umich.edu;shoutian@umich.edu;~Karthik_Desingh1;~Odest_Jenkins1", "aff": "University of Michigan;University of Michigan;;;University of Washington;University of Michigan - Ann Arbor", "aff_domain": "umich.edu;umich.edu;;;washington.edu;umich.edu", "position": "PhD student;PhD student;;;Postdoc;Full Professor", "bibtex": "@misc{\nopipari2022dnbp,\ntitle={{DNBP}: Differentiable Nonparametric Belief Propagation},\nauthor={Anthony Opipari and Jana Pavlasek and Chao Chen and Shoutian Wang and Karthik Desingh and Odest Jenkins},\nyear={2022},\nurl={https://openreview.net/forum?id=QKEkEFpKBBv}\n}", "github": "", "project": "", "reviewers": "sUCc;Yvfb;Rbn6;yWQ2", "site": "https://openreview.net/forum?id=QKEkEFpKBBv", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;3;4;2", "correctness": "4;2;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "94;97;114;193", "wc_summary_review": "28;90;42;167", "wc_main_review": "223;378;405;300", "wc_review": "345;565;561;660", "wc_reply_reviewers": "6;0;102;8", "wc_reply_authors": "421;763;823;503", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 124.5, 40.277164746292655 ], "wc_summary_review_avg": [ 81.75, 54.32483317967944 ], "wc_main_review_avg": [ 326.5, 71.11434454454319 ], "wc_review_avg": [ 532.75, 115.41311667223964 ], "wc_reply_reviewers_avg": [ 29.0, 42.24926034855522 ], "wc_reply_authors_avg": [ 627.5, 169.3539193523433 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6196641887946288805&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Michigan;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.umich.edu;https://www.washington.edu", "aff_unique_abbr": "UM;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": 
"0;0;0;0", "aff_country_unique": "United States" }, { "id": "QNW1OrjynpT", "title": "Short-term memory in neural language models", "track": "main", "status": "Reject", "tldr": "", "abstract": "When a language model is trained to predict natural language sequences, its prediction at each moment depends on a representation of prior context. Thus, language models require mechanisms to maintain and access memory. Although we design the architectural features of these models, we do not know how their memory systems are functionally organized via learning: what kind of information about the prior context can they retrieve? We reasoned that access to arbitrary individual tokens from the past could be computationally powerful, akin to the working memory which is important for flexible cognition in humans, and we therefore tested whether language models could ``retrieve'' the exact words that occurred previously in a text. In particular, we tested how the ability to retrieve prior words depended on (i) the number of words being retrieved, (ii) their semantic coherence, and (iii) the length and quality of the intervening text. We evaluated two particular architectures of neural language models: the attention-based transformer and the long short-term memory network (LSTM). In our paradigm, language models processed English text in which a list of nouns occurred twice. We operationalized retrieval as the reduction in surprisal from the first presentation of the list to its second presentation. We found that the transformer models retrieved both the identity and ordering of nouns from the first list. The transformer was successful even when the noun lists were semantically incoherent, and this effect was largely robust to the type or length of the intervening text. Further, the transformer\u2019s retrieval was markedly enhanced when it was trained on a larger corpus and with greater model depth. Lastly, its ability to index prior tokens was dependent on learned attention patterns. In contrast, the LSTM models exhibited less precise retrieval (smaller reductions in surprisal). The LSTM\u2019s retrieval was limited to list-initial tokens, and occurred only across short intervening texts. Moreover, the LSTM's retrieval was not sensitive to the order of nouns and this non-specific retrieval improved when the list was semantically coherent. In sum, the transformer, when trained to predict linguistic tokens, implements something akin to a working memory system, as it could flexibly retrieve individual token representations across arbitrary delays. Conversely, the LSTM maintained a coarser and more rapidly-decaying semantic gist of prior tokens, weighted heavily toward the earliest items. 
Thus, although the transformer and LSTM architectures were both trained to predict language sequences, only the transformer learned to flexibly index prior tokens.", "keywords": "short-term memory;language models;transformer;lstm;GPT-2", "primary_area": "", "supplementary_material": "", "author": "Kristijan Armeni;Christopher Honey;Tal Linzen", "authorids": "~Kristijan_Armeni1;~Christopher_Honey1;~Tal_Linzen1", "gender": "M;;M", "homepage": "https://www.kristijanarmeni.net;https://www.honeylab.org;http://tallinzen.net", "dblp": ";60/11540;169/3438", "google_scholar": "C9rSddwAAAAJ;https://scholar.google.com/citations?hl=en;5mJDXjoAAAAJ", "orcid": "0000-0001-8391-8965;0000-0002-0745-5089;", "linkedin": "kristijanarmeni/;;", "or_profile": "~Kristijan_Armeni1;~Christopher_Honey1;~Tal_Linzen1", "aff": "Johns Hopkins University;Johns Hopkins University;New York University", "aff_domain": "jhu.edu;jhu.edu;nyu.edu", "position": "Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@misc{\narmeni2022shortterm,\ntitle={Short-term memory in neural language models},\nauthor={Kristijan Armeni and Christopher Honey and Tal Linzen},\nyear={2022},\nurl={https://openreview.net/forum?id=QNW1OrjynpT}\n}", "github": "", "project": "", "reviewers": "Tpb6;ax86;r2TC;WHFY;M2A9", "site": "https://openreview.net/forum?id=QNW1OrjynpT", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;4;4;3;4", "correctness": "2;3;3;3;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "200;41;228;132;98", "wc_summary_review": "34;96;43;30;78", "wc_main_review": "440;471;489;412;145", "wc_review": "674;608;760;574;321", "wc_reply_reviewers": "307;186;26;46;0", "wc_reply_authors": "892;686;826;415;250", "reply_reviewers": "1;2;1;1;0", "reply_authors": "2;2;2;1;1", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 139.8, 67.78318375526484 ], "wc_summary_review_avg": [ 56.2, 26.12584926849269 ], "wc_main_review_avg": [ 391.4, 125.98507848154082 ], "wc_review_avg": [ 587.4, 147.555548862115 ], "wc_reply_reviewers_avg": [ 113.0, 116.44054276754295 ], "wc_reply_authors_avg": [ 613.8, 244.74836056652146 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 1.6, 0.4898979485566356 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4564354645876385, "corr_recommendation_correctness": 0.9128709291752771, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9b9r0WfWMoMJ:scholar.google.com/&scioq=Short-term+memory+in+neural+language+models&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Johns Hopkins University;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.jhu.edu;https://www.nyu.edu", "aff_unique_abbr": "JHU;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Multimeasurement Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7044", "id": "QRX0nCX_gk", "poster": "", "openreview": "https://openreview.net/forum?id=QRX0nCX_gk", "slides": "https://iclr.cc/virtual/2022/poster/7044", "video": "https://iclr.cc/virtual/2022/poster/7044", 
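As a concrete illustration of the surprisal-based retrieval measure used in the short-term memory study above, the following minimal Python sketch computes per-token surprisal under GPT-2 via the HuggingFace transformers API; the example text and the final subtraction are illustrative assumptions, not the authors' exact pipeline.

import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

def token_surprisals(model, tok, text):
    # Surprisal -log2 p(token | preceding context) for every token after the first.
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(ids).logits
    logp = torch.log_softmax(logits[0, :-1], dim=-1)
    lp = logp[torch.arange(ids.size(1) - 1), ids[0, 1:]]
    return (-lp / torch.tensor(2.0).log()).tolist()  # in bits

model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
tok = GPT2TokenizerFast.from_pretrained("gpt2")
s = token_surprisals(model, tok, "pen, clock, apple ... filler text ... pen, clock, apple")
# retrieval = mean surprisal over the first list minus the mean over its repetition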
"author_site": "Saeed Saremi, Rupesh K Srivastava", "tldr": "", "abstract": "We formally map the problem of sampling from an unknown distribution with a density in $\\mathbb{R}^d$ to the problem of learning and sampling a smoother density in $\\mathbb{R}^{Md}$ obtained by convolution with a fixed factorial kernel: the new density is referred to as M-density and the kernel as multimeasurement noise model (MNM). The M-density in $\\mathbb{R}^{Md}$ is smoother than the original density in $\\mathbb{R}^d$, easier to learn and sample from, yet for large $M$ the two problems are mathematically equivalent since clean data can be estimated exactly given a multimeasurement noisy observation using the Bayes estimator. To formulate the problem, we derive the Bayes estimator for Poisson and Gaussian MNMs in closed form in terms of the unnormalized M-density. This leads to a simple least-squares objective for learning parametric energy and score functions. We present various parametrization schemes of interest including one in which studying Gaussian M-densities directly leads to multidenoising autoencoders\u2014this is the first theoretical connection made between denoising autoencoders and empirical Bayes in the literature. Samples in $\\mathbb{R}^d$ are obtained by walk-jump sampling (Saremi & Hyvarinen, 2019) via underdamped Langevin MCMC (walk) to sample from M-density and the multimeasurement Bayes estimation (jump). We study permutation invariant Gaussian M-densities on MNIST, CIFAR-10, and FFHQ-256 datasets, and demonstrate the effectiveness of this framework for realizing fast-mixing stable Markov chains in high dimensions.", "keywords": "energy based models;Langevin MCMC;score matching;denoising autoencoders;empirical Bayes", "primary_area": "", "supplementary_material": "/attachment/c03ed89627764a0a1ea5a42d2e6a7afb2603e42f.zip", "author": "Saeed Saremi;Rupesh Kumar Srivastava", "authorids": "~Saeed_Saremi1;~Rupesh_Kumar_Srivastava1", "gender": "M;M", "homepage": "https://saeedsaremi.github.io/;https://rupeshks.cc/", "dblp": "128/2619;69/8778", "google_scholar": ";vTWuk1gAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Saeed_Saremi1;~Rupesh_K_Srivastava1", "aff": "Genentech;NNAISENSE", "aff_domain": "gene.com;nnaisense.com", "position": "Senior Principal Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nsaremi2022multimeasurement,\ntitle={Multimeasurement Generative Models},\nauthor={Saeed Saremi and Rupesh Kumar Srivastava},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=QRX0nCX_gk}\n}", "github": "", "project": "", "reviewers": "mPcN;dnHy;6P94", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;4;4", "correctness": "3;4;1", "technical_novelty": "3;3;1", "empirical_novelty": "3;2;3", "wc_summary_paper": "52;125;229", "wc_summary_review": "22;86;83", "wc_main_review": "175;421;535", "wc_review": "249;632;847", "wc_reply_reviewers": "19;186;0", "wc_reply_authors": "175;749;643", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 135.33333333333334, 72.62843030720745 ], "wc_summary_review_avg": [ 63.666666666666664, 29.48822740612863 ], 
"wc_main_review_avg": [ 377.0, 150.22649566571138 ], "wc_review_avg": [ 576.0, 247.32300068264306 ], "wc_reply_reviewers_avg": [ 68.33333333333333, 83.56368163795135 ], "wc_reply_authors_avg": [ 522.3333333333334, 249.38502138037256 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": -0.9449111825230679, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5398070140675307056&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=QRX0nCX_gk", "email": "gene.com;nnaisense.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Genentech;NNAISENSE", "aff_unique_dep": ";", "aff_unique_url": "https://www.genentech.com;https://www.nnaiseNSE.com", "aff_unique_abbr": "Genentech;NNAISENSE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "id": "QWc35QxXPzZ", "title": "The Power of Exploiter: Provable Multi-Agent RL in Large State Spaces", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Modern reinforcement learning (RL) commonly engages practical problems with large state spaces, where function approximation must be deployed to approximate either the value function or the policy. While recent progresses in RL theory address a rich set of RL problems with general function approximation, such successes are mostly restricted to the single-agent setting. It remains elusive how to extend these results to multi-agent RL, especially due to the new challenges arising from its game-theoretical nature. This paper considers two-player zero-sum Markov Games (MGs). We propose a new algorithm that can provably find the Nash equilibrium policy using a polynomial number of samples, for any MG with low multi-agent Bellman-Eluder dimension -- a new complexity measure adapted from its single-agent version (Jin et al., 2021). A key component of our new algorithm is the exploiter, which facilitates the learning of the main player by deliberately exploiting her weakness. 
Our theoretical framework is generic and applies to a wide range of models, including but not limited to tabular MGs, MGs with linear or kernel function approximation, and MGs with rich observations.\n", "keywords": "theoretical reinforcement learning;Markov games with general function approximation", "primary_area": "", "supplementary_material": "", "author": "Chi Jin;Qinghua Liu;Tiancheng Yu", "authorids": "~Chi_Jin1;~Qinghua_Liu1;~Tiancheng_Yu1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/cjin/home;http://qinghual2020.github.io/;https://yutc.me", "dblp": "126/1802-1;;215/4910", "google_scholar": "GINhGvwAAAAJ;CotFJJsAAAAJ;mVkGg80AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chi_Jin1;~Qinghua_Liu1;~Tiancheng_Yu1", "aff": "Princeton University;Princeton University;Massachusetts Institute of Technology", "aff_domain": "princeton.edu;princeton.edu;mit.edu", "position": "Assistant Professor;PhD student;PhD student", "bibtex": "@misc{\njin2022the,\ntitle={The Power of Exploiter: Provable Multi-Agent {RL} in Large State Spaces},\nauthor={Chi Jin and Qinghua Liu and Tiancheng Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=QWc35QxXPzZ}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=QWc35QxXPzZ", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7859714395115586271&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1", "aff_unique_norm": "Princeton University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://web.mit.edu", "aff_unique_abbr": "Princeton;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "QXLWz6AguS", "title": "Modular Lagrangian Neural Networks: Designing Structures of Networks with Physical Inductive Biases", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep learning struggles at extrapolation in many cases. This issue arises for untrained data domains or different input dimensions, and it is especially common in physical problems. Leveraging physical inductive biases can help relieve this issue and generalise the laws of physics. Based on this idea, we propose a structured neural network called Modular Lagrangian Neural Networks (ModLaNNs). This model can learn from the dynamics of simpler systems such as three-body systems and extrapolate to more complex ones like multi-body systems, which is not feasible using other relevant physics-informed neural networks. 
We tested our model on double-pendulum and three-body systems, achieving the best results among the compared methods. We then directly applied the trained models to predict the motion of multi-pendulum and multi-body systems, demonstrating the intriguing extrapolation performance of our method.", "keywords": "Representation Learning;Physical Inductive Bias;Lagrangian Mechanics;Dynamics;Extrapolation", "primary_area": "", "supplementary_material": "", "author": "Yupu Lu;Shijie Lin;Jia Pan", "authorids": "~Yupu_Lu1;~Shijie_Lin1;~Jia_Pan1", "gender": "M;M;M", "homepage": ";;https://www.cs.hku.hk/people/academic-staff/jpan", "dblp": "182/7279;;97/896", "google_scholar": ";sQINQ-YAAAAJ;YYT8-7kAAAAJ", "orcid": ";;", "linkedin": "yupu-lu-51960b221/;;", "or_profile": "~Yupu_Lu1;~Shijie_Lin1;~Jia_Pan1", "aff": "The University of Hong Kong;The University of Hong Kong;University of Hong Kong", "aff_domain": "hku.hk;hku.hk;hku.hk", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nlu2022modular,\ntitle={Modular Lagrangian Neural Networks: Designing Structures of Networks with Physical Inductive Biases},\nauthor={Yupu Lu and Shijie Lin and Jia Pan},\nyear={2022},\nurl={https://openreview.net/forum?id=QXLWz6AguS}\n}", "github": "", "project": "", "reviewers": "MDa8;EPpn;tehX;spSB", "site": "https://openreview.net/forum?id=QXLWz6AguS", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;3;3;4", "correctness": "2;2;3;2", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "74;138;91;60", "wc_summary_review": "56;84;55;83", "wc_main_review": "543;913;427;1039", "wc_review": "673;1135;573;1182", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.75, 29.40556920040828 ], "wc_summary_review_avg": [ 69.5, 14.0089257261219 ], "wc_main_review_avg": [ 730.5, 252.85717312348487 ], "wc_review_avg": [ 890.75, 270.58489887648943 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:V30HQKeI3SwJ:scholar.google.com/&scioq=Modular+Lagrangian+Neural+Networks:+Designing+Structures+of+Networks+with+Physical+Inductive+Biases&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "QZTymB-n-Wz", "title": "Effective Certification of Monotone Deep Equilibrium Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Monotone Operator Equilibrium Models (monDEQs) represent a class of models that combine the powerful deep equilibrium paradigm with convergence guarantees. 
As monDEQs are inherently robust to adversarial perturbations, investigating new methods to certify their robustness is a promising research direction. Unfortunately, existing certification approaches are either imprecise or severely limited in their scalability. In this work, we propose the first scalable \\emph{and} precise monDEQ verifier, based on two key ideas: (i) a novel convex relaxation which enables efficient inclusion checks, and (ii) non-trivial mathematical insights characterizing the fixpoint operations at the heart of monDEQs on sets rather than concrete inputs. An extensive evaluation of our verifier demonstrates that on the challenging $\\ell_\\infty$ perturbations it exceeds state-of-the-art performance in terms of speed (two orders of magnitude) and scalability (an order of magnitude) while yielding 25\\% higher certified accuracies on the same networks.", "keywords": "Deep Equilibrium Models;Certified Robustness;Convex Relaxations", "primary_area": "", "supplementary_material": "", "author": "Mark Niklas Mueller;Robin Staab;Marc Fischer;Martin Vechev", "authorids": "~Mark_Niklas_Mueller2;~Robin_Staab1;~Marc_Fischer1;~Martin_Vechev1", "gender": "M;M;M;M", "homepage": "https://www.sri.inf.ethz.ch/people/mark;;;https://www.sri.inf.ethz.ch/people/martin", "dblp": "287/4254;304/3512;37/9373-2;93/2189.html", "google_scholar": "RBpmcCAAAAAJ;;;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": "0000-0002-2496-6542;;;", "linkedin": "mark-m%C3%BCller-8bb4b1140/;robin-staab-b778a51a6/;;", "or_profile": "~Mark_Niklas_Mueller2;~Robin_Staab1;~Marc_Fischer1;~Martin_Vechev1", "aff": "Swiss Federal Institute of Technology;ETHZ - ETH Zurich;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;MS student;PhD student;Full Professor", "bibtex": "@misc{\nmueller2022effective,\ntitle={Effective Certification of Monotone Deep Equilibrium Models},\nauthor={Mark Niklas Mueller and Robin Staab and Marc Fischer and Martin Vechev},\nyear={2022},\nurl={https://openreview.net/forum?id=QZTymB-n-Wz}\n}", "github": "", "project": "", "reviewers": "W8Gd;hkzR;91s6;gsTH", "site": "https://openreview.net/forum?id=QZTymB-n-Wz", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "91;43;43;207", "wc_summary_review": "235;58;21;104", "wc_main_review": "468;737;190;652", "wc_review": "794;838;254;963", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "379;545;25;425", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 96.0, 67.01492371106603 ], "wc_summary_review_avg": [ 104.5, 80.87799453497843 ], "wc_main_review_avg": [ 511.75, 209.66923355609424 ], "wc_review_avg": [ 712.25, 271.73735021156 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 343.5, 193.6149529349425 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=11352094483866042684&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "NASViT: Neural Architecture Search for Efficient Vision Transformers with Gradient Conflict aware Supernet Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6085", "id": "Qaw16njk6L", "poster": "", "openreview": "https://openreview.net/forum?id=Qaw16njk6L", "slides": "https://iclr.cc/virtual/2022/poster/6085", "video": "https://iclr.cc/virtual/2022/poster/6085", "author_site": "Chengyue Gong, Dilin Wang, Meng Li, Xinlei Chen, Zhicheng Yan, Yuandong Tian, Qiang Liu, Vikas Chandra", "tldr": "", "abstract": "Designing accurate and efficient vision transformers (ViTs) is a highly important but challenging task. Supernet-based one-shot neural architecture search (NAS) enables fast architecture optimization and has achieved state-of-the-art (SOTA) results on convolutional neural networks (CNNs). However, directly applying the supernet-based NAS to optimize ViTs leads to poor performance - even worse compared to training single ViTs. In this work, we observe that the poor performance is due to a gradient conflict issue: the gradients of different sub-networks conflict with that of the supernet more severely in ViTs than CNNs, which leads to early saturation in training and inferior convergence. To alleviate this issue, we propose a series of techniques, including a gradient projection algorithm, a switchable layer scaling design, and a simplified data augmentation and regularization training recipe. The proposed techniques significantly improve the convergence and the performance of all sub-networks. Our discovered hybrid ViT model family, dubbed NASViT, achieves top-1 accuracy from 78.2% to 81.8% on ImageNet from 200M to 800M FLOPs, and outperforms all the prior art CNNs and ViTs, including AlphaNet and LeViT, etc. When transferred to semantic segmentation tasks, NASViTs also outperform previous backbones on both Cityscape and ADE20K datasets, achieving 73.2% and 37.9% mIoU with only 5G FLOPs, respectively. 
Code is available at\nhttps://github.com/facebookresearch/NASViT.\n", "keywords": "vision transformer;gradient conflict;neural architecture search", "primary_area": "", "supplementary_material": "", "author": "Chengyue Gong;Dilin Wang;Meng Li;Xinlei Chen;Zhicheng Yan;Yuandong Tian;qiang liu;Vikas Chandra", "authorids": "~Chengyue_Gong1;~Dilin_Wang1;~Meng_Li1;~Xinlei_Chen1;~Zhicheng_Yan2;~Yuandong_Tian1;~qiang_liu4;~Vikas_Chandra2", "gender": "M;M;M;M;M;M;M;M", "homepage": ";https://mengli.me;http://xinleic.xyz;https://sites.google.com/view/zhicheng-yan;http://yuandong-tian.com;https://v-chandra.github.io/;;https://www.cs.utexas.edu/~lqiang/", "dblp": "209/4862;70/1726-4;;;t/YuandongTian;57/5163;142/7035;61/3234-1", "google_scholar": "AscakBgAAAAJ;lvdRkEkAAAAJ;bSU7LYoAAAAJ;JFEHAwIAAAAJ;0mgEF28AAAAJ;p-h_BvcAAAAJ;dmTy9EIAAAAJ;https://scholar.google.com.tw/citations?user=2qDh4WUAAAAJ", "orcid": ";;;;0000-0003-4202-4847;;;", "linkedin": ";;;;yuandongtian;vchandra/;;", "or_profile": "~Chengyue_Gong1;~Meng_Li1;~Xinlei_Chen1;~Zhicheng_Yan2;~Yuandong_Tian1;~Vikas_Chandra2;~Dilin_Wang2;~Qiang_Liu1", "aff": "University of Texas at Austin;Meta Facebook;Meta;Meta Facebook;Meta AI (FAIR);Meta;Meta;University of Texas, Austin", "aff_domain": "cs.utexas.edu;fb.com;meta.com;meta.com;meta.com;meta.com;meta.com;utexas.edu", "position": "grad student;Researcher;Researcher;Researcher;Research Scientist;Director, AI;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\ngong2022nasvit,\ntitle={{NASV}iT: Neural Architecture Search for Efficient Vision Transformers with Gradient Conflict aware Supernet Training},\nauthor={Chengyue Gong and Dilin Wang and Meng Li and Xinlei Chen and Zhicheng Yan and Yuandong Tian and qiang liu and Vikas Chandra},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Qaw16njk6L}\n}", "github": "", "project": "", "reviewers": "Y6a2;oqFz;XjJo;GnU6;G3NT", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "5;5;3;4;4", "correctness": "3;4;3;3;3", "technical_novelty": "3;4;3;4;3", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "74;70;45;51;45", "wc_summary_review": "96;16;30;23;28", "wc_main_review": "393;157;67;128;132", "wc_review": "563;243;142;202;205", "wc_reply_reviewers": "116;0;0;33;53", "wc_reply_authors": "1520;727;421;440;591", "reply_reviewers": "1;0;0;1;1", "reply_authors": "3;1;1;2;1", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 3.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 57.0, 12.505998560690786 ], "wc_summary_review_avg": [ 38.6, 29.103951621729994 ], "wc_main_review_avg": [ 175.4, 112.75566504615189 ], "wc_review_avg": [ 271.0, 149.53661758913768 ], "wc_reply_reviewers_avg": [ 40.4, 42.87936566694988 ], "wc_reply_authors_avg": [ 739.8, 405.59358969293385 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.3273268353539886, "corr_recommendation_correctness": -0.10206207261596578, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10871296489236109997&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=Qaw16njk6L", "email": "cs.utexas.edu;fb.com;meta.com;meta.com;meta.com;meta.com;meta.com;utexas.edu", 
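The gradient-projection remedy mentioned in the NASViT abstract above can be sketched in a few lines, in the spirit of PCGrad-style conflict removal; this Python sketch is an illustration under that assumption, not the authors' exact algorithm.

import torch

def project_conflicting(g_sub, g_super):
    # If the flattened sub-network gradient conflicts with the supernet
    # gradient (negative inner product), remove the conflicting component.
    dot = torch.dot(g_sub, g_super)
    if dot < 0:
        g_sub = g_sub - dot / (g_super.norm() ** 2 + 1e-12) * g_super
    return g_sub

In supernet training, each sampled sub-network's gradient would be projected this way before being accumulated into the shared weights.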
"author_num": 8, "aff_unique_index": "0;1;1;1;1;1;1;0", "aff_unique_norm": "University of Texas at Austin;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.utexas.edu;https://meta.com", "aff_unique_abbr": "UT Austin;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Qb07sqX7dVl", "title": "Label Augmentation with Reinforced Labeling for Weak Supervision", "track": "main", "status": "Reject", "tldr": "", "abstract": "Weak supervision (WS) is an alternative to the traditional supervised learning to address the need for ground truth. Data programming is a practical WS approach that allows programmatic labeling data samples using labeling functions (LFs) instead of hand-labeling each data point. However, the existing approach fails to fully exploit the domain knowledge encoded into LFs, especially when the LFs' coverage is low. This is due to the common data programming pipeline that neglects to utilize data features during the generative process.\nThis paper proposes a new approach called reinforced labeling (RL). Given an unlabeled dataset and a set of LFs, RL augments the LFs' outputs to cases not covered by LFs based on similarities among samples. Thus, RL can lead to higher labeling coverage for training an end classifier. The experiments on several domains (classification of YouTube comments, wine quality, and weather prediction) result in considerable gains. The new approach produces significant performance improvement, leading up to +21 points in accuracy and +61 points in F1 scores compared to the state-of-the-art data programming approach.\n", "keywords": "weak supervision;data programming", "primary_area": "", "supplementary_material": "", "author": "G\u00fcrkan Solmaz;Flavio Cirillo;Fabio Maresca;Anagha GodeAnilKumar", "authorids": "~G\u00fcrkan_Solmaz2;~Flavio_Cirillo1;~Fabio_Maresca1;~Anagha_GodeAnilKumar1", "gender": ";M;M;F", "homepage": "https://www.eecs.ucf.edu/~gsolmaz/index.html;;;", "dblp": "122/5231;153/2837;;", "google_scholar": "Y-XTnQEAAAAJ;https://scholar.google.de/citations?user=GK7PQ7UAAAAJ;;", "orcid": ";0000-0001-5273-9366;;", "linkedin": ";flaviocirillo/;fabio-maresca-3aa8a0177;http://www.linkedin.com/in/anagha-gode-anil-kumar-48805a148", "or_profile": "~G\u00fcrkan_Solmaz2;~Flavio_Cirillo1;~Fabio_Maresca1;~Anagha_GodeAnilKumar1", "aff": "NEC Laboratories Europe;NEC;NEC;", "aff_domain": "neclab.eu;neclab.eu;neclab.eu;", "position": "Researcher;Researcher;Early Stage Researchers;", "bibtex": "@misc{\nsolmaz2022label,\ntitle={Label Augmentation with Reinforced Labeling for Weak Supervision},\nauthor={G{\\\"u}rkan Solmaz and Flavio Cirillo and Fabio Maresca and Anagha GodeAnilKumar},\nyear={2022},\nurl={https://openreview.net/forum?id=Qb07sqX7dVl}\n}", "github": "", "project": "", "reviewers": "eeud;HUob;s9uq;5fTq", "site": "https://openreview.net/forum?id=Qb07sqX7dVl", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;5;4", "correctness": "3;2;2;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "55;70;71;67", "wc_summary_review": "25;23;30;41", "wc_main_review": "245;355;485;173", "wc_review": "325;448;586;281", "wc_reply_reviewers": "0;150;0;0", "wc_reply_authors": "224;313;425;148", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], 
"technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 65.75, 6.378675411086537 ], "wc_summary_review_avg": [ 29.75, 6.977642868476432 ], "wc_main_review_avg": [ 314.5, 117.85902595898203 ], "wc_review_avg": [ 410.0, 118.62335351860527 ], "wc_reply_reviewers_avg": [ 37.5, 64.9519052838329 ], "wc_reply_authors_avg": [ 277.5, 103.25817158946792 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8465801956051205304&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "NEC Laboratories Europe;NEC Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.nec-labs.eu;https://www.nec.com", "aff_unique_abbr": "NEC LE;NEC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Unknown;Japan" }, { "id": "QbFfqWAEmMr", "title": "LASSO: Latent Sub-spaces Orientation for Domain Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "To achieve a satisfactory generalization performance on prediction tasks in an unseen domain, existing domain generalization (DG) approaches often rely on the strict assumption of fixed domain-invariant features and common hypotheses learned from a set of training domains. While it is a natural and important premise to ground generalization capacity on the target domain, we argue that this assumption could be overly strict and sub-optimal. It is particularly evident when source domains share little information or the target domains leverages information from selective source domains in a compositional way instead of relying on a unique invariant hypothesis across all source domains. Unlike most existing approaches, instead of constructing a single hypothesis shared among domains, we propose a LAtent Sub-Space Orientation (LASSO) method that explores diverse latent sub-spaces and learning individual hypotheses on those sub-spaces. Moreover, in LASSO, since the latent sub-spaces are formed by the label-informative features captured in source domains, they allow us to project target examples onto appropriate sub-spaces, while preserving crucial label-informative features for the label prediction. 
Finally, we empirically evaluate our method on several well-known DG benchmarks, where it achieves state-of-the-art results.", "keywords": "Domain generalization;Image Classification;machine learning;deep learning", "primary_area": "", "supplementary_material": "/attachment/3dfdb24518b3f5e5012060f4dc361b1e58d57715.zip", "author": "Long Tung Vuong;Trung Quoc Phung;Toan Tran;Anh Tuan Tran;Dinh Phung;Trung Le", "authorids": "~Long_Tung_Vuong1;~Trung_Quoc_Phung1;~Toan_Tran1;~Anh_Tuan_Tran2;~Dinh_Phung2;~Trung_Le2", "gender": "M;M;M;M;M;M", "homepage": ";;;https://sites.google.com/site/anhttranusc/;;https://research.monash.edu/en/persons/dinh-phung", "dblp": "329/6838;307/5223;207/8479-3;150/5269-1;;71/5859", "google_scholar": "DCC657sAAAAJ;l6RZElkAAAAJ;https://scholar.google.com.au/citations?user=PnwSuNMAAAAJ;FYZ5ODQAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.au/citations?user=OtA9SwIAAAAJ", "orcid": ";0000-0002-4916-5055;0000-0001-7182-7548;0000-0002-3120-4036;;0000-0002-9977-8247", "linkedin": "long-vuong-783477131/;qtrungphung/;;https://linkedin.com/in/anh-tran-97814b19;;https://linkedin.com/in/dinh-phung-6b537a6", "or_profile": "~Long_Tung_Vuong1;~Trung_Quoc_Phung1;~Toan_Tran1;~Anh_Tuan_Tran2;~Trung_Le2;~Dinh_Phung1", "aff": ";VinAI Research, Vietnam;Hanoi University of Science and Technology;VinAI Research;Monash University;Monash University", "aff_domain": ";vinai.io;hust.edu.vn;vinai.io;monash.edu;monash.edu", "position": ";Resident;Lecturer;Research Scientist;Assistant Professor;Full Professor", "bibtex": "@misc{\nvuong2022lasso,\ntitle={{LASSO}: Latent Sub-spaces Orientation for Domain Generalization},\nauthor={Long Tung Vuong and Trung Quoc Phung and Toan Tran and Anh Tuan Tran and Dinh Phung and Trung Le},\nyear={2022},\nurl={https://openreview.net/forum?id=QbFfqWAEmMr}\n}", "github": "", "project": "", "reviewers": "H9Wv;aksw;gyy1;crRo;43Db", "site": "https://openreview.net/forum?id=QbFfqWAEmMr", "pdf_size": 0, "recommendation": "5;5;6;6;6", "confidence": "3;4;5;4;4", "correctness": "2;3;3;4;3", "technical_novelty": "3;3;2;3;3", "empirical_novelty": "2;0;2;3;3", "wc_summary_paper": "122;105;38;57;182", "wc_summary_review": "93;31;39;22;28", "wc_main_review": "386;233;300;229;202", "wc_review": "601;369;377;308;412", "wc_reply_reviewers": "261;189;28;168;156", "wc_reply_authors": "1426;1532;1281;1407;1367", "reply_reviewers": "1;2;1;1;1", "reply_authors": "4;5;4;3;3", "recommendation_avg": [ 5.6, 0.48989794855663565 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 100.8, 50.83856803648191 ], "wc_summary_review_avg": [ 42.6, 25.788369471527275 ], "wc_main_review_avg": [ 270.0, 66.37770710110436 ], "wc_review_avg": [ 413.4, 99.60040160561604 ], "wc_reply_reviewers_avg": [ 160.4, 75.55818949657277 ], "wc_reply_authors_avg": [ 1402.6, 81.68867730597674 ], "reply_reviewers_avg": [ 1.2, 0.4000000000000001 ], "reply_authors_avg": [ 3.8, 0.7483314773547882 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.6454972243679027, "corr_recommendation_correctness": 0.6454972243679027, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NDzyLDY3yEAJ:scholar.google.com/&scioq=LASSO:+Latent+Sub-spaces+Orientation+for+Domain+Generalization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;2", 
"aff_unique_norm": "VinAI Research;Hanoi University of Science and Technology;Monash University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.vin.ai;https://www.hust.edu.vn;https://www.monash.edu", "aff_unique_abbr": "VinAI;HUST;Monash", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hanoi", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "Vietnam;Australia" }, { "id": "QdcbUq0-tYM", "title": "Universal Controllers with Differentiable Physics for Online System Identification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Creating robots that can handle changing or unknown environments is a critical step towards real-world robot applications. Existing methods tackle this problem by training controllers robust to large ranges of environment parameters (Domain Randomization), or by combining ``Universal'' Controllers (UC) conditioned on environment parameters with learned identification modules that (implicitly or explicitly) identify the environment parameters from sensory inputs (Domain Adaptation). However, these methods can lead to over-conservative behaviors or poor generalization outside the training distribution. In this work, we present a domain adaptation approach that improves generalization of the identification module by leveraging prior knowledge in physics. Our proposed algorithm, UC-DiffOSI, combines a UC trained on a wide range of environments with an Online System Identification module based on a differentiable physics engine (DiffOSI). We evaluate UC-DiffOSI on articulated rigid body control tasks, including a wiping task that requires contact-rich environment interaction.\nCompared to previous works, UC-DiffOSI outperforms domain randomization baselines and is more robust than domain adaptation methods that rely on learned identification models. In addition, we perform two studies showing that UC-DiffOSI operates well in environments with changing or unknown dynamics. 
These studies test sudden changes in the robot's mass and inertia, and they evaluate in an environment (PyBullet) whose dynamics differs from training (NimblePhysics).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michelle Guo;Wenhao Yu;Daniel Ho;Jiajun Wu;Yunfei Bai;Karen Liu;Wenlong Lu", "authorids": "~Michelle_Guo1;~Wenhao_Yu1;~Daniel_Ho1;~Jiajun_Wu1;~Yunfei_Bai1;~Karen_Liu1;wenlongl@google.com", "gender": "F;M;M;M;M;;", "homepage": "https://shellguo.com;https://wenhaoyu.weebly.com/;https://itsdanielho.com/;https://jiajunwu.com;https://www.yunfei-bai.com/;https://cs.stanford.edu/~karenliu;", "dblp": "185/0671;;55/10982;117/4768;;;", "google_scholar": "lyjjpNMAAAAJ;1bF2s2kAAAAJ;i05Kw5cAAAAJ;2efgcS0AAAAJ;lgvyqMQAAAAJ;i28fU0MAAAAJ;", "orcid": "0000-0002-6574-6669;;;0000-0002-4176-343X;;0000-0001-5926-0905;", "linkedin": ";;;jiajunwu/;yunfei-bai-58a1451a;;", "or_profile": "~Michelle_Guo1;~Wenhao_Yu1;~Daniel_Ho1;~Jiajun_Wu1;~Yunfei_Bai1;~Karen_Liu1;wenlongl@google.com", "aff": "Computer Science Department, Stanford University;Google;Google;Stanford University;Google;;", "aff_domain": "cs.stanford.edu;google.com;google.com;stanford.edu;google.com;;", "position": "PhD student;Software Engineer;Software Engineer;Assistant Professor;Researcher;;", "bibtex": "@misc{\nguo2022universal,\ntitle={Universal Controllers with Differentiable Physics for Online System Identification},\nauthor={Michelle Guo and Wenhao Yu and Daniel Ho and Jiajun Wu and Yunfei Bai and Karen Liu and Wenlong Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=QdcbUq0-tYM}\n}", "github": "", "project": "", "reviewers": "8qJx;3zvY;S8au;hY3g", "site": "https://openreview.net/forum?id=QdcbUq0-tYM", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;2;4", "correctness": "2;4;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "88;48;135;34", "wc_summary_review": "27;50;28;42", "wc_main_review": "450;172;145;219", "wc_review": "565;270;308;295", "wc_reply_reviewers": "0;26;0;0", "wc_reply_authors": "235;103;156;108", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.25, 39.283425257988895 ], "wc_summary_review_avg": [ 36.75, 9.67923034130297 ], "wc_main_review_avg": [ 246.5, 120.43774325351667 ], "wc_review_avg": [ 359.5, 119.42884911109208 ], "wc_reply_reviewers_avg": [ 6.5, 11.258330249197702 ], "wc_reply_authors_avg": [ 150.5, 52.99292405595298 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8184482206964421009&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": "Computer Science Department;Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1;1;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { 
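The online system identification module described in the UC-DiffOSI abstract above amounts to gradient-based parameter fitting through a differentiable simulator. The sketch below assumes a hypothetical differentiable rollout diff_sim(theta, s0, actions) and illustrative hyperparameters; it is not the authors' implementation.

import torch

def identify_params(diff_sim, traj_obs, theta_init, lr=1e-2, iters=200):
    # Fit physical parameters theta by matching simulated and observed
    # trajectories, with gradients flowing through the physics engine.
    theta = theta_init.clone().requires_grad_(True)
    opt = torch.optim.Adam([theta], lr=lr)
    s0, actions, states_obs = traj_obs
    for _ in range(iters):
        opt.zero_grad()
        states_sim = diff_sim(theta, s0, actions)       # differentiable rollout
        loss = ((states_sim - states_obs) ** 2).mean()  # trajectory-matching loss
        loss.backward()
        opt.step()
    return theta.detach()  # identified params condition the universal controller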
"id": "QevkqHTK3DJ", "title": "Compressing Transformer-Based Sequence to Sequence Models With Pre-trained Autoencoders for Text Summarization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We proposed a technique to reduce the decoder\u2019s number of parameters in a sequence to sequence (seq2seq) architecture for automatic text summarization. This approach uses a pre-trained AutoEncoder (AE) trained on top of a pre-trained encoder to reduce the encoder\u2019s output dimension and allow to significantly reduce the size of the decoder. The ROUGE score is used to measure the effectiveness of this method by comparing four different latent space dimensionality reductions: 96%, 66%, 50%, 44%. A few well-known frozen pre-trained encoders (BART, BERT, and DistilBERT) have been tested, paired with the respective frozen pre-trained AEs to test the reduced dimension latent space\u2019s ability to train a 3-layer transformer decoder. We also repeated the same experiments on a small transformer model that has been trained for text summarization. This study shows an increase of the R-1 score by 5% while reducing the model size by 44% using the DistilBERT encoder, and competitive scores for all the other models associated to important size reduction.", "keywords": "Transformer;Automatic Text Summarization;sequence-to-sequence;Compression", "primary_area": "", "supplementary_material": "", "author": "Ala Alam Falaki;Robin Gras", "authorids": "~Ala_Alam_Falaki1;rgras@uwindsor.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "alafalaki/;", "or_profile": "~Ala_Alam_Falaki1;rgras@uwindsor.ca", "aff": "University of Windsor;", "aff_domain": "uwindsor.ca;", "position": "PhD student;", "bibtex": "@misc{\nfalaki2022compressing,\ntitle={Compressing Transformer-Based Sequence to Sequence Models With Pre-trained Autoencoders for Text Summarization},\nauthor={Ala Alam Falaki and Robin Gras},\nyear={2022},\nurl={https://openreview.net/forum?id=QevkqHTK3DJ}\n}", "github": "", "project": "", "reviewers": "1LU5;wmem;Lsjg;vaDv", "site": "https://openreview.net/forum?id=QevkqHTK3DJ", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;3", "correctness": "3;2;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "57;37;29;39", "wc_summary_review": "55;18;36;34", "wc_main_review": "293;134;120;172", "wc_review": "405;189;185;245", "wc_reply_reviewers": "37;0;0;0", "wc_reply_authors": "643;190;499;323", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 40.5, 10.23474474522936 ], "wc_summary_review_avg": [ 35.75, 13.12202347201071 ], "wc_main_review_avg": [ 179.75, 68.09689787354488 ], "wc_review_avg": [ 256.0, 89.23564310296643 ], "wc_reply_reviewers_avg": [ 9.25, 16.021469970012117 ], "wc_reply_authors_avg": [ 413.75, 171.84495191887365 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:aeSCnY7l89kJ:scholar.google.com/&scioq=Compressing+Transformer-Based+Sequence+to+Sequence+Models+With+Pre-trained+Autoencoders+for+Text+Summarization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Windsor", "aff_unique_dep": "", "aff_unique_url": "https://www.uwindsor.ca", "aff_unique_abbr": "UWindsor", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "title": "StyleAlign: Analysis and Applications of Aligned StyleGAN Models", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6948", "id": "Qg2vi4ZbHM9", "poster": "", "openreview": "https://openreview.net/forum?id=Qg2vi4ZbHM9", "slides": "https://iclr.cc/virtual/2022/poster/6948", "video": "https://iclr.cc/virtual/2022/poster/6948", "author_site": "Zongze Wu, Yotam Nitzan, Eli Shechtman, Dani Lischinski", "tldr": "", "abstract": "In this paper, we perform an in-depth study of the properties and applications of aligned generative models.\nWe refer to two models as aligned if they share the same architecture, and one of them (the child) is obtained from the other (the parent) via fine-tuning to another domain, a common practice in transfer learning. Several works already utilize some basic properties of aligned StyleGAN models to perform image-to-image translation. Here, we perform the first detailed exploration of model alignment, also focusing on StyleGAN. First, we empirically analyze aligned models and provide answers to important questions regarding their nature. In particular, we find that the child model's latent spaces are semantically aligned with those of the parent, inheriting incredibly rich semantics, even for distant data domains such as human faces and churches. Second, equipped with this better understanding, we leverage aligned models to solve a diverse set of tasks. In addition to image translation, we demonstrate fully automatic cross-domain image morphing. We further show that zero-shot vision tasks may be performed in the child domain, while relying exclusively on supervision in the parent domain. We demonstrate qualitatively and quantitatively that our approach yields state-of-the-art results, while requiring only simple fine-tuning and inversion. 
", "keywords": "StyleGAN;transfer learning;fine tuning;model alignment;image-to-image translation;image morphing", "primary_area": "", "supplementary_material": "/attachment/feb237822a8f823627b38f1d6c8b71432d2daf0e.zip", "author": "Zongze Wu;Yotam Nitzan;Eli Shechtman;Dani Lischinski", "authorids": "~Zongze_Wu2;~Yotam_Nitzan1;~Eli_Shechtman3;~Dani_Lischinski2", "gender": "M;M;M;M", "homepage": "https://www.cs.huji.ac.il/w~wuzongze/;https://yotamnitzan.github.io/;https://www.cs.huji.ac.il/~danix/;https://research.adobe.com/person/eli-shechtman/", "dblp": "125/6476-2;265/5979;29/19;50/1918.html", "google_scholar": "V8FwQGkAAAAJ;pTUX5wEAAAAJ;haahCZ4AAAAJ;B_FTboQAAAAJ", "orcid": "0000-0001-9190-1717;;0000-0002-6191-0361;0000-0002-6783-1795", "linkedin": "zongze-wu-ba49419b/;;;elishechtman/", "or_profile": "~Zongze_Wu2;~Yotam_Nitzan1;~Dani_Lischinski2;~Eli_Shechtman1", "aff": "Hebrew University of Jerusalem;Tel Aviv University;The Hebrew University of Jerusalem, Israel;Adobe", "aff_domain": "huji.ac.il;tau.ac.il;cs.huji.ac.il;adobe.com", "position": "PhD student;PhD student;Full Professor;Research Scientist", "bibtex": "@inproceedings{\nwu2022stylealign,\ntitle={StyleAlign: Analysis and Applications of Aligned Style{GAN} Models},\nauthor={Zongze Wu and Yotam Nitzan and Eli Shechtman and Dani Lischinski},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Qg2vi4ZbHM9}\n}", "github": "", "project": "", "reviewers": "x6dT;nPWN;fnkZ;2pb8", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "5;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "3;4;3;4", "wc_summary_paper": "69;56;62;52", "wc_summary_review": "58;24;50;60", "wc_main_review": "424;204;389;288", "wc_review": "551;284;501;400", "wc_reply_reviewers": "41;16;770;0", "wc_reply_authors": "727;420;1779;574", "reply_reviewers": "1;1;3;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 59.75, 6.417748826496718 ], "wc_summary_review_avg": [ 48.0, 14.352700094407323 ], "wc_main_review_avg": [ 326.25, 86.45916666265065 ], "wc_review_avg": [ 434.0, 102.26680790950698 ], "wc_reply_reviewers_avg": [ 206.75, 325.52064066660967 ], "wc_reply_authors_avg": [ 875.0, 533.0914555683668 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11079296793136133967&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Qg2vi4ZbHM9", "email": "huji.ac.il;tau.ac.il;cs.huji.ac.il;adobe.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Hebrew University of Jerusalem;Tel Aviv University;Adobe", "aff_unique_dep": ";;Adobe Inc.", "aff_unique_url": "https://www.huji.ac.il;https://www.tau.ac.il;https://www.adobe.com", "aff_unique_abbr": "HUJI;TAU;Adobe", "aff_campus_unique_index": "0", "aff_campus_unique": "Jerusalem;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Israel;United States" }, { "id": "QguFu30t0d", "title": "FedGEMS: Federated Learning of Larger Server Models via 
Selective Knowledge Fusion", "track": "main", "status": "Reject", "tldr": "", "abstract": "Today, data is often scattered among billions of resource-constrained edge devices with security and privacy constraints. Federated Learning (FL) has emerged as a viable solution for learning a global model while keeping data private, but model capacity in FL is limited by the computational resources of edge nodes. In this work, we investigate a novel paradigm that takes advantage of a powerful server model to break through the model-capacity limit in FL. By selectively learning from multiple teacher clients and itself, the server model develops in-depth knowledge and, in return, transfers this knowledge back to the clients to boost their respective performance. Our proposed framework achieves superior performance on both server and client models and provides several advantages in a unified framework, including flexibility for heterogeneous client architectures, robustness to poisoning attacks, and communication efficiency between clients and server. By effectively bridging FL with larger server model training, our proposed paradigm paves the way for robust and continual knowledge accumulation from distributed and private data.", "keywords": "Federated Learning;Knowledge Distillation", "primary_area": "", "supplementary_material": "", "author": "Sijie Cheng;Jingwen Wu;Yanghua Xiao;Yang Liu;Yang Liu", "authorids": "~Sijie_Cheng1;~Jingwen_Wu1;~Yanghua_Xiao1;~Yang_Liu59;~Yang_Liu19", "gender": "F;;;F;M", "homepage": "https://adacheng.github.io/;;;;http://nlp.csai.tsinghua.edu.cn/~ly/", "dblp": "160/7320;;96/999;;51/3710-5", "google_scholar": "pruwctkAAAAJ;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com/citations?hl=zh-CN;JEieoFsAAAAJ;https://scholar.google.com.hk/citations?user=lVhoKNcAAAAJ", "orcid": ";;0000-0001-8403-9591;;0000-0002-3087-242X", "linkedin": ";;;;", "or_profile": "~Sijie_Cheng1;~Jingwen_Wu1;~Yanghua_Xiao1;~Yang_Liu59;~Yang_Liu19", "aff": "Tsinghua University;Beijing University of Post and Telecommunication, Tsinghua University;Fudan University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;bupt.edu.cn;fudan.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Research Intern;Undergrad student;Full Professor;Associate Professor;Professor", "bibtex": "@misc{\ncheng2022fedgems,\ntitle={Fed{GEMS}: Federated Learning of Larger Server Models via Selective Knowledge Fusion},\nauthor={Sijie Cheng and Jingwen Wu and Yanghua Xiao and Yang Liu and Yang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=QguFu30t0d}\n}", "github": "", "project": "", "reviewers": "tN4o;Q1kU;Ed7U;Cj8x", "site": "https://openreview.net/forum?id=QguFu30t0d", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "80;66;42;40", "wc_summary_review": "12;112;35;23", "wc_main_review": "131;829;397;457", "wc_review": "223;1007;474;520", "wc_reply_reviewers": "31;140;75;431", "wc_reply_authors": "274;2610;2069;2979", "reply_reviewers": "1;2;2;4", "reply_authors": "2;5;5;8", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 57.0, 16.76305461424021 ], "wc_summary_review_avg": [ 45.5, 39.24601890638081 ], "wc_main_review_avg": [ 453.5, 249.10389398803062 ], 
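The selective knowledge fusion described in the FedGEMS abstract above can be pictured as per-sample teacher selection followed by distillation; the confidence-based selection rule and the loss below are illustrative assumptions, not the exact FedGEMS objective.

import torch
import torch.nn.functional as F

def selective_distill_loss(server_logits, client_logits, labels, T=2.0):
    # server_logits: (n, C); client_logits: list of (n, C), one per client.
    losses = []
    for i in range(labels.size(0)):
        y = labels[i]
        probs = [F.softmax(cl[i] / T, dim=-1) for cl in client_logits]
        correct = [p for p in probs if p.argmax() == y]  # teachers right on sample i
        if correct:
            teacher = max(correct, key=lambda p: p[y].item())  # most confident one
            log_q = F.log_softmax(server_logits[i] / T, dim=-1)
            losses.append(F.kl_div(log_q, teacher, reduction="sum") * T * T)
        else:
            # no reliable teacher: fall back to the hard label
            losses.append(F.cross_entropy(server_logits[i].unsqueeze(0),
                                          y.unsqueeze(0)))
    return torch.stack(losses).mean()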
"wc_review_avg": [ 556.0, 283.86176213079494 ], "wc_reply_reviewers_avg": [ 169.25, 156.01662571662035 ], "wc_reply_authors_avg": [ 1983.0, 1038.414897813008 ], "reply_reviewers_avg": [ 2.25, 1.0897247358851685 ], "reply_authors_avg": [ 5.0, 2.1213203435596424 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11002466621687944523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Tsinghua University;Beijing University of Post and Telecommunication;Fudan University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.bupt.edu.cn/;https://www.fudan.edu.cn", "aff_unique_abbr": "THU;BUPT;Fudan", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "QhHMf5J5Jom", "title": "A Scaling Law for Syn-to-Real Transfer: How Much Is Your Pre-training Effective?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Synthetic-to-real transfer learning is a framework in which a synthetically generated dataset is used to pre-train a model to improve its performance on real vision tasks. The most significant advantage of using synthetic images is that the ground-truth labels are automatically available, enabling unlimited data size expansion without human cost. However, synthetic data may have a huge domain gap, in which case increasing the data size does not improve the performance. How can we know that? In this study, we derive a simple scaling law that predicts the performance from the amount of pre-training data. By estimating the parameters of the law, we can judge whether we should increase the data or change the setting of image synthesis. Further, we analyze the theory of transfer learning by considering learning dynamics and confirm that the derived generalization bound is compatible with our empirical findings. 
We empirically validated our scaling law on various experimental settings of benchmark tasks, model sizes, and complexities of synthetic images.", "keywords": "Transfer learning;Computer vision;Scaling law;Pre-training;Synthetic-to-real", "primary_area": "", "supplementary_material": "/attachment/44bc93c8bab69e8e42d96125b830d0230dbb460c.zip", "author": "Hiroaki Mikami;Kenji Fukumizu;Shogo Murai;Shuji Suzuki;Yuta Kikuchi;Taiji Suzuki;Shin-ichi Maeda;Kohei Hayashi", "authorids": "~Hiroaki_Mikami1;~Kenji_Fukumizu1;~Shogo_Murai1;~Shuji_Suzuki2;~Yuta_Kikuchi1;~Taiji_Suzuki1;~Shin-ichi_Maeda2;~Kohei_Hayashi1", "gender": "M;M;;M;M;M;M;M", "homepage": "http://hiroakimikami.github.io;http://www.ism.ac.jp/~fukumizu/;;;;http://ibis.t.u-tokyo.ac.jp/suzuki/;https://maeyon.github.io/publication/index.html;https://sites.google.com/site/koheihayashi84", "dblp": ";96/464;211/2849;;144/6937;08/312;90/4637;84/1101.html", "google_scholar": ";;;5Ica8zUAAAAJ;UwuggM4AAAAJ;x8osrBsAAAAJ;https://scholar.google.ca/citations?user=Fv-ifUQAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-3488-2625;;;;;0000-0002-3254-9722;", "linkedin": "hiroaki-mikami-69084b142/;;;;;;;koheih/", "or_profile": "~Hiroaki_Mikami1;~Kenji_Fukumizu1;~Shogo_Murai1;~Shuji_Suzuki2;~Yuta_Kikuchi1;~Taiji_Suzuki1;~Shin-ichi_Maeda2;~Kohei_Hayashi1", "aff": "Preferred Networks, Inc.;The Institute of Statistical Mathematics, Japan, Tokyo Institute of Technology;Preferred Networks, Inc.;Preferred Networks, Inc.;Preferred Networks, Inc.;The University of Tokyo;Preferred Networks, Inc.;Preferred Networks, Inc.", "aff_domain": "preferred.jp;ism.ac.jp;preferred.jp;preferred.jp;preferred.jp;tokyo.ac.jp;preferred.jp;preferred.jp", "position": "Researcher;Full Professor;Software engineer;Researcher;Researcher;Associate Professor;Senior Researcher;Researcher", "bibtex": "@misc{\nmikami2022a,\ntitle={A Scaling Law for Syn-to-Real Transfer: How Much Is Your Pre-training Effective?},\nauthor={Hiroaki Mikami and Kenji Fukumizu and Shogo Murai and Shuji Suzuki and Yuta Kikuchi and Taiji Suzuki and Shin-ichi Maeda and Kohei Hayashi},\nyear={2022},\nurl={https://openreview.net/forum?id=QhHMf5J5Jom}\n}", "github": "", "project": "", "reviewers": "9onL;EiPP;Ltke", "site": "https://openreview.net/forum?id=QhHMf5J5Jom", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "3;3;3", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "50;66;98", "wc_summary_review": "47;82;68", "wc_main_review": "131;811;557", "wc_review": "228;959;723", "wc_reply_reviewers": "170;112;266", "wc_reply_authors": "653;991;862", "reply_reviewers": "1;2;1", "reply_authors": "2;3;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 71.33333333333333, 19.955506062794353 ], "wc_summary_review_avg": [ 65.66666666666667, 14.383632673594278 ], "wc_main_review_avg": [ 499.6666666666667, 280.55342133401655 ], "wc_review_avg": [ 636.6666666666666, 304.6094038090237 ], "wc_reply_reviewers_avg": [ 182.66666666666666, 63.50503042191925 ], "wc_reply_authors_avg": [ 835.3333333333334, 139.27032067968474 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 
8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Do_Sv32WeJMJ:scholar.google.com/&scioq=A+Scaling+Law+for+Syn-to-Real+Transfer:+How+Much+Is+Your+Pre-training+Effective%3F&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0;2;0;0", "aff_unique_norm": "Preferred Networks, Inc.;Institute of Statistical Mathematics;University of Tokyo", "aff_unique_dep": ";;", "aff_unique_url": "https://www.preferred-networks.com;https://www.ism.ac.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "PFN;ISM;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "Japan" }, { "id": "QiM-fYm3gb7", "title": "McXai: Local model-agnostic explanation as two games", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "To this day, a variety of approaches for providing local interpretability of black-box machine learning models have been introduced. Unfortunately, all of these methods suffer from one or more of the following deficiencies: They are either difficult to understand themselves, they work on a per-feature basis and ignore the dependencies between features and/or they only focus on those features asserting the decision made by the model. To address these points, this work introduces a reinforcement learning-based approach called Monte Carlo tree search for eXplainable Artificial Intelligent (McXai) to explain the decisions of any black-box classification model (classifier). Our method leverages Monte Carlo tree search and models the process of generating explanations as two games. In one game, the reward is maximized by finding feature sets that support the decision of the classifier, while in the second game, finding feature sets leading to alternative decisions maximizes the reward. The result is a human friendly representation as a tree structure, in which each node represents a set of features to be studied with smaller explanations at the top of the tree. Our experiments show, that the features found by our method are more informative with respect to classifications than those found by classical approaches like LIME and SHAP. Furthermore, by also identifying misleading features, our approach is able to guide towards improved robustness of the black-box model in many situations. 
", "keywords": "explainable AI;reinforcement learning;Monte Carlo tree search", "primary_area": "", "supplementary_material": "", "author": "Yiran Huang;Nicole Schaal;Michael Hefenbrock;Yexu Zhou;Till Riedel;Likun Fang;Michael Beigl", "authorids": "~Yiran_Huang1;schaalnicole@aol.com;michael.hefenbrock@kit.edu;zhou@teco.edu;riedel@teco.edu;fang@teco.edu;michael.beigl@kit.edu", "gender": "M;;;;;;", "homepage": "http://www.teco.edu/people/huang/;;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Yiran_Huang1;schaalnicole@aol.com;michael.hefenbrock@kit.edu;zhou@teco.edu;riedel@teco.edu;fang@teco.edu;michael.beigl@kit.edu", "aff": "Karlsruhe Institute of Technology;;;;;;", "aff_domain": "kit.edu;;;;;;", "position": "PhD student;;;;;;", "bibtex": "@misc{\nhuang2022mcxai,\ntitle={McXai: Local model-agnostic explanation as two games},\nauthor={Yiran Huang and Nicole Schaal and Michael Hefenbrock and Yexu Zhou and Till Riedel and Likun Fang and Michael Beigl},\nyear={2022},\nurl={https://openreview.net/forum?id=QiM-fYm3gb7}\n}", "github": "", "project": "", "reviewers": "L9a7;Auyc;qkH1;fbHd", "site": "https://openreview.net/forum?id=QiM-fYm3gb7", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;3;4", "correctness": "3;2;2;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "133;16;112;84", "wc_summary_review": "121;41;19;28", "wc_main_review": "409;611;579;492", "wc_review": "663;668;710;604", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 86.25, 44.12694754908841 ], "wc_summary_review_avg": [ 52.25, 40.45599461142934 ], "wc_main_review_avg": [ 522.75, 78.79839782635177 ], "wc_review_avg": [ 661.25, 37.75827723824274 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5886501124619009924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Karlsruhe Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kit.edu", "aff_unique_abbr": "KIT", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Distilling GANs with Style-Mixed Triplets for X2I Translation with Limited Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6708", "id": "QjOQkpzKbNk", "poster": "", "openreview": "https://openreview.net/forum?id=QjOQkpzKbNk", "slides": "https://iclr.cc/virtual/2022/poster/6708", "video": "https://iclr.cc/virtual/2022/poster/6708", "author_site": "Yaxing Wang, Joost van de Weijer, Lu Yu, SHANGLING JUI", "tldr": "", "abstract": "Conditional image synthesis is an integral part of many X2I translation systems, including image-to-image, text-to-image and audio-to-image translation systems. Training these large systems generally requires huge amounts of training data. 
\nTherefore, we investigate knowledge distillation to transfer knowledge from a high-quality unconditioned generative model (e.g., StyleGAN) to conditioned synthetic image generation modules in a variety of systems. To initialize the conditional and reference branch (from an unconditional GAN), we exploit the style mixing characteristics of high-quality GANs to generate an infinite supply of style-mixed triplets to perform the knowledge distillation. Extensive experimental results in a number of image generation tasks (i.e., image-to-image, semantic segmentation-to-image, text-to-image and audio-to-image) demonstrate qualitatively and quantitatively that our method successfully transfers knowledge to the synthetic image generation modules, resulting in more realistic images than previous methods, as confirmed by a significant drop in the FID. ", "keywords": "Transfer learning;image synthesis;limited data.", "primary_area": "", "supplementary_material": "/attachment/551707bfa58546c14c9e6c0d13a79cfbb6c6cff6.zip", "author": "Yaxing Wang;Joost van de weijer;Lu Yu;SHANGLING JUI", "authorids": "~Yaxing_Wang2;~Joost_van_de_weijer3;~Lu_Yu3;~SHANGLING_JUI1", "gender": "F;M;M;M", "homepage": ";;http://lamp.cvc.uab.es/;https://yaxingwang.netlify.app/author/yaxing-wang/", "dblp": "04/1781-4;;67/3379;", "google_scholar": ";;https://scholar.google.es/citations?user=Gsw2iUEAAAAJ;https://scholar.google.es/citations?user=6CsB8k0AAAAJ", "orcid": ";0000-0002-1047-4264;0000-0002-9656-9706;", "linkedin": ";;;", "or_profile": "~Lu_Yu3;~SHANGLING_JUI1;~Joost_van_de_Weijer1;~Yaxing_Wang3", "aff": "Tianjin University of Technology;Huawei Technologies Ltd.;Computer Vision Center, Universitat Aut\u00f3noma de Barcelona;Nankai University", "aff_domain": "tjut.edu.cn;huawei.com;cvc.uab.es;nku.nankai.edu.cn", "position": "Associate Professor;Principal Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nwang2022distilling,\ntitle={Distilling {GAN}s with Style-Mixed Triplets for X2I Translation with Limited Data},\nauthor={Yaxing Wang and Joost van de weijer and Lu Yu and SHANGLING JUI},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=QjOQkpzKbNk}\n}", "github": "", "project": "", "reviewers": "BBix;76so;EUGu", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;3;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "88;87;77", "wc_summary_review": "66;41;57", "wc_main_review": "230;483;237", "wc_review": "384;611;371", "wc_reply_reviewers": "0;59;104", "wc_reply_authors": "1150;1005;418", "reply_reviewers": "0;1;1", "reply_authors": "2;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 84.0, 4.96655480858378 ], "wc_summary_review_avg": [ 54.666666666666664, 10.338708279513881 ], "wc_main_review_avg": [ 316.6666666666667, 117.65014048251517 ], "wc_review_avg": [ 455.3333333333333, 110.2008267764912 ], "wc_reply_reviewers_avg": [ 54.333333333333336, 42.58586098173378 ], "wc_reply_authors_avg": [ 857.6666666666666, 316.4767851763468 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], 
"corr_recommendation_confidence": 0.18898223650461363, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9756467289641566532&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=QjOQkpzKbNk", "email": "tjut.edu.cn;huawei.com;cvc.uab.es;nku.nankai.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Tianjin University of Technology;Huawei;Universitat Aut\u00f3noma de Barcelona;Nankai University", "aff_unique_dep": ";Huawei Technologies;Computer Vision Center;", "aff_unique_url": "http://www.tjut.edu.cn;https://www.huawei.com;https://www.uab.cat;http://www.nankai.edu.cn", "aff_unique_abbr": "TUT;Huawei;UAB;NKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;Spain" }, { "title": "Beyond ImageNet Attack: Towards Crafting Adversarial Examples for Black-box Domains", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6469", "id": "QkRV50TZyP", "poster": "", "openreview": "https://openreview.net/forum?id=QkRV50TZyP", "slides": "https://iclr.cc/virtual/2022/poster/6469", "video": "https://iclr.cc/virtual/2022/poster/6469", "author_site": "Qilong Zhang, Xiaodan Li, YueFeng Chen, Jingkuan Song, Lianli Gao, Yuan He, Hui Xue'", "tldr": "", "abstract": "Adversarial examples have posed a severe threat to deep neural networks due to their transferable nature. Currently, various works have devoted great effort to enhancing cross-model transferability, mostly assuming that the substitute model is trained in the same domain as the target model.\nHowever, in reality, the relevant information of the deployed model is unlikely to leak.\nHence, it is vital to build a more practical black-box threat model to overcome this limitation and evaluate the vulnerability of deployed models.\nIn this paper, with only the knowledge of the ImageNet domain, we propose a Beyond ImageNet Attack (BIA) to investigate the transferability towards black-box domains (unknown classification tasks). Specifically, we leverage a generative model to learn the adversarial function for disrupting low-level features of input images. \nBased on this framework, we further propose two variants to narrow the gap between the source and target domains from the data and model perspectives, respectively. Extensive experiments on coarse-grained and fine-grained domains demonstrate the effectiveness of our proposed methods. Notably,\nour methods outperform state-of-the-art approaches by up to 7.71\\% (towards coarse-grained domains) and 25.91\\% (towards fine-grained domains) on average. 
Our code is available at \\url{https://github.com/Alibaba-AAIG/Beyond-ImageNet-Attack}.", "keywords": "practice black-box attack;cross-domain transferability", "primary_area": "", "supplementary_material": "/attachment/3cbae3878dc7337b05d588f87e92aceacf18622a.zip", "author": "Qilong Zhang;Xiaodan Li;YueFeng Chen;Jingkuan Song;Lianli Gao;Yuan He;Hui Xue'", "authorids": "~Qilong_Zhang2;~Xiaodan_Li1;~YueFeng_Chen1;~Jingkuan_Song3;~Lianli_Gao1;~Yuan_He2;~Hui_Xue'1", "gender": "M;F;M;M;F;M;M", "homepage": ";;;https://cfm.uestc.edu.cn/~songjingkuan/;https://lianligao.github.io/;http://www.alibaba.com;http://www.alibaba.com", "dblp": "22/3730;126/7789;52/8180;70/10575;123/9849.html;11/1735-1.html;", "google_scholar": "IgPyQWYAAAAJ;YximuHAAAAAJ;Kf-IpFsAAAAJ;F5Zy9V4AAAAJ;https://scholar.google.com.au/citations?user=zsm2dpYAAAAJ;cWbXLzgAAAAJ;", "orcid": "0009-0005-2591-5762;;;;;0000-0002-6885-1341;", "linkedin": ";;;;;;", "or_profile": "~Qilong_Zhang2;~Xiaodan_Li1;~YueFeng_Chen1;~Jingkuan_Song3;~Lianli_Gao1;~Yuan_He2;~Hui_Xue'1", "aff": "University of Electronic Science and Technology of China;Alibaba Group;Alibaba Group;University of Electronic Science and Technology of China,;University of Electronic Science and Technology of China;Alibaba Group;Alibaba Group", "aff_domain": "uestc.edu;alibaba-inc.com;alibaba-inc.com;uestc.edu.cn;uestc.edu.cn;alibaba-inc.com;alibaba-inc.com", "position": "MS student;Researcher;Staff Algorithm Engineer;Full Professor;Full Professor;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nzhang2022beyond,\ntitle={Beyond ImageNet Attack: Towards Crafting Adversarial Examples for Black-box Domains},\nauthor={Qilong Zhang and Xiaodan Li and YueFeng Chen and Jingkuan Song and Lianli Gao and Yuan He and Hui Xue'},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=QkRV50TZyP}\n}", "github": "", "project": "", "reviewers": "9vb9;bvNx;YKsk", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "100;88;109", "wc_summary_review": "52;46;56", "wc_main_review": "173;339;599", "wc_review": "325;473;764", "wc_reply_reviewers": "120;464;333", "wc_reply_authors": "1060;634;780", "reply_reviewers": "1;1;1", "reply_authors": "3;2;3", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 99.0, 8.602325267042627 ], "wc_summary_review_avg": [ 51.333333333333336, 4.109609335312651 ], "wc_main_review_avg": [ 370.3333333333333, 175.3193910806471 ], "wc_review_avg": [ 520.6666666666666, 182.36288608766375 ], "wc_reply_reviewers_avg": [ 305.6666666666667, 141.76114496653244 ], "wc_reply_authors_avg": [ 824.6666666666666, 176.7584667153332 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8113530331285961655&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=QkRV50TZyP", "email": "uestc.edu;alibaba-inc.com;alibaba-inc.com;uestc.edu.cn;uestc.edu.cn;alibaba-inc.com;alibaba-inc.com", "author_num": 7, 
"aff_unique_index": "0;1;1;0;0;1;1", "aff_unique_norm": "University of Electronic Science and Technology of China;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.uestc.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "UESTC;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "QkfMWTl520U", "title": "When do Convolutional Neural Networks Stop Learning?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional Neural Networks (CNNs) is one of the most essential architectures that has shown impressive performance in computer vision tasks such as image classification, detection, and segmentation. In training phase of CNN, an arbitrary number of epochs is used to train the neural networks. In a single epoch, the entire training data---divided by batch size---are fed to the network. However, the optimal number of epochs required to train a neural network is not well established. In practice, validation data is used to identify the generalization gap. To avoid overfitting, it is recommended to stop training when the generalization gap increases. However, this is a trial and error based approach. This raises a critical question: Is it possible to estimate when neural networks stop learning based on only the training data? In this research work, we introduce the stability property of data in layers and based on this property, we predict the near optimal epoch number of a CNN. We do not use any validation data to predict the near optimal epoch number. We experiment our hypothesis on six different CNN models and on three different datasets (CIFIR 10, CIFIR 100, SVHN). We save on average 58.49\\% computational time to train a CNN model. 
Our code is available at https://github.com/PaperUnderReviewDeepLearning/Optimization.", "keywords": "Deep Learning;Convolutional Neural Network;CNN;Epoch;Training;Data", "primary_area": "", "supplementary_material": "", "author": "SAHAN AHMAD;Aminul Islam", "authorids": "~SAHAN_AHMAD1;aminul@louisiana.edu", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "KuTTZ68AAAAJ;", "orcid": ";", "linkedin": "sahanahmad/;", "or_profile": "~SAHAN_AHMAD1;aminul@louisiana.edu", "aff": "University of Louisiana, Lafayette;", "aff_domain": "lousiana.edu;", "position": "PhD student;", "bibtex": "@misc{\nahmad2022when,\ntitle={When do Convolutional Neural Networks Stop Learning?},\nauthor={SAHAN AHMAD and Aminul Islam},\nyear={2022},\nurl={https://openreview.net/forum?id=QkfMWTl520U}\n}", "github": "", "project": "", "reviewers": "QVY8;P5xn;mbyZ;sbHo", "site": "https://openreview.net/forum?id=QkfMWTl520U", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;4;4;5", "correctness": "1;2;3;2", "technical_novelty": "1;3;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "62;69;143;90", "wc_summary_review": "30;71;35;83", "wc_main_review": "180;284;630;357", "wc_review": "272;424;808;530", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 91.0, 31.741140496207755 ], "wc_summary_review_avg": [ 54.75, 22.71976012197312 ], "wc_main_review_avg": [ 362.75, 166.6243904715033 ], "wc_review_avg": [ 508.5, 195.72621183684112 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 0.8528028654224417, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PwLnl-yT23UJ:scholar.google.com/&scioq=When+do+Convolutional+Neural+Networks+Stop+Learning%3F&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Louisiana at Lafayette", "aff_unique_dep": "", "aff_unique_url": "https://www.louisiana.edu", "aff_unique_abbr": "UL Lafayette", "aff_campus_unique_index": "0", "aff_campus_unique": "Lafayette", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "QmKblFEgQJ", "title": "DIGRAC: Digraph Clustering Based on Flow Imbalance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Node clustering is a powerful tool in the analysis of networks. We introduce a graph neural network framework to obtain node embeddings for directed networks in a self-supervised manner, including a novel probabilistic imbalance loss, which can be used for network clustering. Here, we propose directed flow imbalance measures, which are tightly related to directionality, to reveal clusters in the network even when there is no density difference between clusters. In contrast to standard approaches in the literature, in this paper, directionality is not treated as a nuisance, but rather contains the main signal. 
DIGRAC optimizes directed flow imbalance for clustering without requiring label supervision, unlike existing GNN methods, and can naturally incorporate node features, unlike existing spectral methods. Experimental results on synthetic data, in the form of directed stochastic block models, and real-world data at different scales, demonstrate that our method, based on flow imbalance, attains state-of-the-art results on directed graph clustering, for a wide range of noise and sparsity levels and graph structures and topologies.", "keywords": "flow imbalance;directed networks;graph neural networks;clustering;directed stochastic block models", "primary_area": "", "supplementary_material": "/attachment/805a268bbb45b457585b94f6a3d6e91e19285f2c.zip", "author": "Yixuan He;Gesine Reinert;Mihai Cucuringu", "authorids": "~Yixuan_He2;~Gesine_Reinert1;~Mihai_Cucuringu1", "gender": "F;F;M", "homepage": "https://sherylhyx.github.io/;http://www.stats.ox.ac.uk/~reinert/;https://www.math.ucla.edu/~mihai/", "dblp": "226/6494;86/1736;58/6857", "google_scholar": "SWme_nYAAAAJ;2gvyN5oAAAAJ;GFvVRzwAAAAJ", "orcid": "0000-0002-5990-0658;;", "linkedin": "yixuan-he-sheryl/;gesine-reinert-77b64913/?originalSubdomain=uk;mihai-cucuringu-9a866634/", "or_profile": "~Yixuan_He2;~Gesine_Reinert1;~Mihai_Cucuringu1", "aff": "University of Oxford;University of Oxford;The Alan Turing Institute", "aff_domain": "ox.ac.uk;ox.ac.uk;turing.ac.uk", "position": "PhD student;Professor;Fellow", "bibtex": "@misc{\nhe2022,\ntitle={ {DIGRAC}: Digraph Clustering Based on Flow Imbalance},\nauthor={Yixuan He and Gesine Reinert and Mihai Cucuringu},\nyear={2022},\nurl={https://openreview.net/forum?id=QmKblFEgQJ}\n}", "github": "", "project": "", "reviewers": "mMXH;h5sc;SM6X", "site": "https://openreview.net/forum?id=QmKblFEgQJ", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "90;95;56", "wc_summary_review": "26;36;33", "wc_main_review": "195;401;205", "wc_review": "311;532;294", "wc_reply_reviewers": "475;0;0", "wc_reply_authors": "1825;1090;679", "reply_reviewers": "2;0;0", "reply_authors": "5;2;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 80.33333333333333, 17.326921891156037 ], "wc_summary_review_avg": [ 31.666666666666668, 4.189935029992178 ], "wc_main_review_avg": [ 267.0, 94.84021650474374 ], "wc_review_avg": [ 379.0, 108.40971666168429 ], "wc_reply_reviewers_avg": [ 158.33333333333334, 223.91714737574003 ], "wc_reply_authors_avg": [ 1198.0, 474.0443017271698 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13525583612493259183&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Oxford;Alan Turing Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.turing.ac.uk", "aff_unique_abbr": 
"Oxford;ATI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Pareto Set Learning for Neural Multi-Objective Combinatorial Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7076", "id": "QuObT9BTWo", "poster": "", "openreview": "https://openreview.net/forum?id=QuObT9BTWo", "slides": "https://iclr.cc/virtual/2022/poster/7076", "video": "https://iclr.cc/virtual/2022/poster/7076", "author_site": "Xi Lin, Zhiyuan Yang, Qingfu Zhang", "tldr": "", "abstract": "Multiobjective combinatorial optimization (MOCO) problems can be found in many real-world applications. However, exactly solving these problems would be very challenging, particularly when they are NP-hard. Many handcrafted heuristic methods have been proposed to tackle different MOCO problems over the past decades. In this work, we generalize the idea of neural combinatorial optimization, and develop a learning-based approach to approximate the whole Pareto set for a given MOCO problem without further search procedure. We propose a single preference-conditioned model to directly generate approximate Pareto solutions for any trade-off preference, and design an efficient multiobjective reinforcement learning algorithm to train this model. Our proposed method can be treated as a learning-based extension for the widely-used decomposition-based multiobjective evolutionary algorithm (MOEA/D). It uses a single model to accommodate all the possible preferences, whereas other methods use a finite number of solutions to approximate the Pareto set. Experimental results show that our proposed method significantly outperforms some other methods on the multiobjective traveling salesman problem, multiobjective vehicle routing problem, and multiobjective knapsack problem in terms of solution quality, speed, and model efficiency.", "keywords": "Multiobjective Combinatorial Optimization;Combinatorial Optimization;Neural Combinatorial Optimization;Multiobjective Optimization", "primary_area": "", "supplementary_material": "", "author": "Xi Lin;Zhiyuan Yang;Qingfu Zhang", "authorids": "~Xi_Lin2;~Zhiyuan_Yang2;~Qingfu_Zhang1", "gender": "M;;M", "homepage": "https://xi-l.github.io/;;https://www.cs.cityu.edu.hk/~qzhan7/index.html", "dblp": "43/489-1;;98/1240.html", "google_scholar": "QB_MUboAAAAJ;;https://scholar.google.co.uk/citations?user=nhL9PHwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xi_Lin2;~Zhiyuan_Yang2;~Qingfu_Zhang1", "aff": "City University of Hong Kong;;City University of Hong Kong", "aff_domain": "cityu.edu.hk;;cityu.edu.hk", "position": "Postdoc;;Full Professor", "bibtex": "@inproceedings{\nlin2022pareto,\ntitle={Pareto Set Learning for Neural Multi-Objective Combinatorial Optimization},\nauthor={Xi Lin and Zhiyuan Yang and Qingfu Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=QuObT9BTWo}\n}", "github": "", "project": "", "reviewers": "CKHU;7hop;tRFG;mTSr", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;2;3;3", "correctness": "3;4;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "68;54;85;82", "wc_summary_review": "48;127;87;25", "wc_main_review": "139;127;284;150", "wc_review": "255;308;456;257", "wc_reply_reviewers": "53;0;0;0", "wc_reply_authors": "575;150;1146;678", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 6.5, 
0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.25, 12.336429791475327 ], "wc_summary_review_avg": [ 71.75, 38.84182668207045 ], "wc_main_review_avg": [ 175.0, 63.45470825714984 ], "wc_review_avg": [ 319.0, 81.89932844657518 ], "wc_reply_reviewers_avg": [ 13.25, 22.949673200287624 ], "wc_reply_authors_avg": [ 637.25, 354.1803036590262 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10853796196468498279&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=QuObT9BTWo", "email": "cityu.edu.hk;;cityu.edu.hk", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "City University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cityu.edu.hk", "aff_unique_abbr": "CityU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "Qu_XudmGajz", "title": "Structured Uncertainty in the Observation Space of Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Variational autoencoders (VAEs) are a popular class of deep generative models with many variants and a wide range of applications. Improvements upon the standard VAE mostly focus on the modelling of the posterior distribution over the latent space and the properties of the neural network decoder. In contrast, improving the model for the observational distribution is rarely considered and typically defaults to a pixel-wise independent categorical or normal distribution. In image synthesis, sampling from such distributions produces spatially-incoherent results with uncorrelated pixel noise, resulting in only the sample mean being somewhat useful as an output prediction. In this paper, we aim to stay true to VAE theory by improving the samples from the observational distribution. We propose an alternative model for the observation space, encoding spatial dependencies via a low-rank parameterization. We demonstrate that this new observational distribution has the ability to capture relevant covariance between pixels, resulting in spatially-coherent samples. 
In contrast to pixel-wise independent distributions, our samples seem to contain semantically meaningful variations from the mean allowing the prediction of multiple plausible outputs with a single forward pass.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "James Langley;Miguel Monteiro;Charles Jones;Nick Pawlowski;Ben Glocker", "authorids": "~James_Langley1;~Miguel_Monteiro1;~Charles_Jones4;~Nick_Pawlowski2;~Ben_Glocker1", "gender": "M;M;M;M;M", "homepage": ";https://charl-ai.github.io/;http://nickpawlowski.de;https://www.doc.ic.ac.uk/~bglocker;", "dblp": ";;198/1040;86/2890;", "google_scholar": ";;https://scholar.google.de/citations?user=a5u9fVYAAAAJ;https://scholar.google.co.uk/citations?user=g_HtjLIAAAAJ;", "orcid": ";;0000-0002-2748-7977;0000-0002-4897-9356;", "linkedin": ";charles-jones1917/;nickpawlowski;;jwblangley/", "or_profile": "~Miguel_Monteiro1;~Charles_Jones4;~Nick_Pawlowski2;~Ben_Glocker1;~James_William_Brandon_Langley1", "aff": "Imperial College London;Imperial College London;Broad Institute;Imperial College London;", "aff_domain": "imperial.ac.uk;imperial.ac.uk;broadinstitute.org;imperial.ac.uk;", "position": "PhD student;PhD student;Visiting Researcher;Associate Professor;", "bibtex": "@misc{\nlangley2022structured,\ntitle={Structured Uncertainty in the Observation Space of Variational Autoencoders},\nauthor={James Langley and Miguel Monteiro and Charles Jones and Nick Pawlowski and Ben Glocker},\nyear={2022},\nurl={https://openreview.net/forum?id=Qu_XudmGajz}\n}", "github": "", "project": "", "reviewers": "DqfM;cypB;uSoj;2FaS", "site": "https://openreview.net/forum?id=Qu_XudmGajz", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "2;4;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "97;140;38;51", "wc_summary_review": "57;36;54;41", "wc_main_review": "223;305;614;175", "wc_review": "377;481;706;267", "wc_reply_reviewers": "0;427;56;0", "wc_reply_authors": "543;776;426;361", "reply_reviewers": "0;1;1;0", "reply_authors": "3;4;4;3", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 81.5, 40.26474885057648 ], "wc_summary_review_avg": [ 47.0, 8.74642784226795 ], "wc_main_review_avg": [ 329.25, 170.84550769628098 ], "wc_review_avg": [ 457.75, 162.07617807685372 ], "wc_reply_reviewers_avg": [ 120.75, 178.28541022753376 ], "wc_reply_authors_avg": [ 526.5, 158.12416007682066 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.5, 0.5 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7385489458759963, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10008939485344362449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Imperial College London;Broad Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://www.broadinstitute.org", "aff_unique_abbr": "ICL;Broad", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "QvTH9nN2Io", "title": "Relative Entropy Gradient Sampler for Unnormalized Distributions", "track": "main", "status": "Reject", "tldr": "", "abstract": "We 
propose a relative entropy gradient sampler (REGS) for sampling from unnormalized distributions. REGS is a particle method that seeks a sequence of simple nonlinear transforms iteratively pushing the initial samples from a reference distribution into the samples from an unnormalized target distribution. To determine the nonlinear transforms at each iteration, we consider the Wasserstein gradient flow of relative entropy. This gradient flow determines a path of probability distributions that interpolates the reference distribution and the target distribution. It is characterized by an ODE system with velocity fields depending on the density ratios of the density of evolving particles and the unnormalized target density. To sample with REGS, we need to estimate the density ratios and simulate the ODE system with particle evolution. We propose a novel nonparametric approach to estimating the logarithmic density ratio using neural networks. Extensive simulation studies on challenging multimodal 1D and 2D distributions and Bayesian logistic regression on real datasets demonstrate that the REGS outperforms the state-of-the-art sampling methods included in the comparison.", "keywords": "Density ratio estimation;gradient flow;neural networks;particles;velocity fields", "primary_area": "", "supplementary_material": "/attachment/83c689c6aa3653760b5a06e3b0cc06c2fd1e3468.zip", "author": "Xingdong Feng;Yuan Gao;Jian Huang;Yuling Jiao;Xu Liu", "authorids": "~Xingdong_Feng1;~Yuan_Gao6;~Jian_Huang5;~Yuling_Jiao1;~Xu_Liu2", "gender": "M;;M;M;M", "homepage": "https://bb9.sufe.edu.cn/bbcswebdav/users/2011000070/index.htm;;https://www.polyu.edu.hk/ama/people/academic-staff/prof-huang-jian/;https://jszy.whu.edu.cn/jiaoyuling/en/index.htm;", "dblp": ";;;136/7658;93/3167", "google_scholar": "nQyBQOsAAAAJ;;https://scholar.google.com/citations?hl=en;yFDDsVgAAAAJ;", "orcid": ";;0000-0002-5218-9269;;", "linkedin": ";;;;", "or_profile": "~Xingdong_Feng1;~Yuan_Gao6;~Jian_Huang5;~Yuling_Jiao1;~Xu_Liu2", "aff": "Shanghai University of Finance and Economics;;Hong Kong Polytechnic University;Wuhan University;", "aff_domain": "sufe.edu.cn;;polyu.edu.hk;whu.edu.cn;", "position": "Full Professor;;Full Professor;Associate Professor;", "bibtex": "@misc{\nfeng2022relative,\ntitle={Relative Entropy Gradient Sampler for Unnormalized Distributions},\nauthor={Xingdong Feng and Yuan Gao and Jian Huang and Yuling Jiao and Xu Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=QvTH9nN2Io}\n}", "github": "", "project": "", "reviewers": "g4HT;r2Jt;opqD;dwfT", "site": "https://openreview.net/forum?id=QvTH9nN2Io", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;4;3;4", "correctness": "2;3;1;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;4;2", "wc_summary_paper": "41;76;107;102", "wc_summary_review": "25;85;19;36", "wc_main_review": "525;519;181;231", "wc_review": "591;680;307;369", "wc_reply_reviewers": "420;85;0;38", "wc_reply_authors": "2299;809;156;243", "reply_reviewers": "1;1;0;1", "reply_authors": "4;1;1;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 81.5, 26.177280225416848 ], "wc_summary_review_avg": [ 41.25, 25.984370302164336 ], "wc_main_review_avg": [ 364.0, 159.0 ], "wc_review_avg": [ 486.75, 153.61376077682624 ], "wc_reply_reviewers_avg": [ 135.75, 
166.85079412457108 ], "wc_reply_authors_avg": [ 876.75, 858.5605322282174 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8528028654224417, "corr_recommendation_correctness": -0.0909090909090909, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9689494878349514944&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Shanghai University of Finance and Economics;Hong Kong Polytechnic University;Wuhan University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.sufe.edu.cn;https://www.polyu.edu.hk;http://www.whu.edu.cn/", "aff_unique_abbr": "SUFE;PolyU;WHU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "Qx0EswNY_bW", "title": "Modeling Variable Space with Residual Tensor Networks for Multivariate Time Series", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multivariate time series underpin a range of valuable real-world applications, the basic premise of which is that multiple variables are interdependent. However, the relationship between variables in the latent space is dynamic and complex, and as the time window increases, the size of the space also increases exponentially. To fully exploit the dependencies in the variable space, we propose Modeling Variable Space with Residual Tensor Networks (MVSRTN) for multivariate time series. In this framework, we derive the mathematical representation of the variable space, and then use a tensor network based on the idea of low-rank approximation to model the variable space. The tensor components are shared to ensure the translation invariance of the network. In order to improve the ability to model long-term sequences, we propose an N-order residual connection approach and couple it to the space-approximated tensor network. Moreover, a series-variable encoder is designed to improve the quality of the variable space, and a skip-connection layer is used to disseminate information such as scale. 
Experimental results verify the effectiveness of our proposed method on four multivariate time series forecasting benchmark datasets.", "keywords": "Multivariate Time Series;Variable Space;Tensor Network;N-Order Residual Connection", "primary_area": "", "supplementary_material": "/attachment/cd9a7f19d9bc9d8b4a60fae4c807db127027f88f.zip", "author": "Jing Zhang;Peng Zhang;Yupeng He;Siwei Rao;Jun Wang;Guangjian Tian", "authorids": "~Jing_Zhang21;~Peng_Zhang17;~Yupeng_He1;~Siwei_Rao2;~Jun_Wang2;~Guangjian_Tian1", "gender": ";M;M;M;M;M", "homepage": "https://github.com/Anonymous-UserID;http://cic.tju.edu.cn/faculty/zhangpeng/index.html;https://hyp-yupenghe.github.io/;https://rouseway.github.io;http://www0.cs.ucl.ac.uk/staff/jun.wang/;", "dblp": ";21/1048-2%20;;;w/JunWang12;52/7695.html", "google_scholar": ";tvDb5_cAAAAJ;;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;", "orcid": ";0000-0003-0228-9330;;;;", "linkedin": ";;;;;", "or_profile": "~Jing_Zhang21;~Peng_Zhang17;~Yupeng_He1;~Siwei_Rao2;~Jun_Wang2;~Guangjian_Tian1", "aff": "Tianjin University;Tianjin University;Tianjin University;;University College London;Huawei Technologies Ltd.", "aff_domain": "tju.edu.cn;tju.edu.cn;tju.edu.cn;;ucl.ac.uk;huawei.com", "position": "PhD student;Full Professor;Undergrad student;;Professor;Researcher", "bibtex": "@misc{\nzhang2022modeling,\ntitle={Modeling Variable Space with Residual Tensor Networks for Multivariate Time Series},\nauthor={Jing Zhang and Peng Zhang and Yupeng He and Siwei Rao and Jun Wang and Guangjian Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=Qx0EswNY_bW}\n}", "github": "", "project": "", "reviewers": "WVW1;ZgXU;QquH;yugv", "site": "https://openreview.net/forum?id=Qx0EswNY_bW", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;3;4;3", "correctness": "3;2;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "84;171;87;129", "wc_summary_review": "81;132;26;27", "wc_main_review": "749;473;379;321", "wc_review": "914;776;492;477", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 117.75, 35.520240708643854 ], "wc_summary_review_avg": [ 66.5, 43.877670858877636 ], "wc_main_review_avg": [ 480.5, 164.23382720986564 ], "wc_review_avg": [ 664.75, 186.81190406395413 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZEnSQ5Vcl9oJ:scholar.google.com/&scioq=Modeling+Variable+Space+with+Residual+Tensor+Networks+for+Multivariate+Time+Series&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Tianjin University;University College London;Huawei", "aff_unique_dep": ";;Huawei Technologies", "aff_unique_url": "http://www.tju.edu.cn;https://www.ucl.ac.uk;https://www.huawei.com", "aff_unique_abbr": "TJU;UCL;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United 
Kingdom" }, { "id": "QyX0pa4CDRM", "title": "Continual Learning via Low-Rank Network Updates", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Continual learning seeks to train a single network for multiple tasks (one after another), where training data for each task is only available during the training of that task. Neural networks tend to forget older tasks when they are trained for the newer tasks; this property is often known as catastrophic forgetting. To address this issue, continual learning methods use episodic memory, parameter regularization, masking and pruning, or extensible network structures. In this paper, we propose a new continual learning framework based on low-rank factorization. In particular, we represent the network weights for each layer as a linear combination of several low-rank (or rank-1) matrices. To update the network for a new task, we learn a low-rank (or rank-1) matrix and add that to the weights of every layer. We also introduce an additional selector vector that assigns different weights to the low-rank matrices learned for the previous tasks. We show that our approach performs better than the current state-of-the-art methods in terms of accuracy and forgetting. Our method also offers better memory efficiency compared to episodic memory-based approaches. ", "keywords": "continual learning;low-rank networks;rank-one update;multitask learning;task-incremental learning", "primary_area": "", "supplementary_material": "", "author": "Rakib Hyder;Ken Shao;Boyu Hou;Panos Markopoulos;Ashley Prater-Bennette;Salman Asif", "authorids": "~Rakib_Hyder1;~Ken_Shao1;bhou015@ucr.edu;panos@mail.rit.edu;~Ashley_Prater-Bennette1;~Salman_Asif1", "gender": "M;;;;F;M", "homepage": ";;;;;https://www.ece.ucr.edu/~sasif", "dblp": "212/6501.html;324/8663.html;;;158/9018;21/1910", "google_scholar": "1q1a4wsAAAAJ;YMl_sScAAAAJ;;;f1WPBE8AAAAJ;Dl0puDcAAAAJ", "orcid": "0000-0003-4191-301X;0000-0001-8249-1111;;;;0000-0001-5993-3903", "linkedin": "rakib-hyder-b03123a2;;;;;", "or_profile": "~Rakib_Hyder1;~Ken_Shao1;bhou015@ucr.edu;panos@mail.rit.edu;~Ashley_Prater-Bennette1;~Salman_Asif1", "aff": "University of California, Riverside;University of Michigan;;;Air Force Research Laboratory;University of California Riverside", "aff_domain": "ucr.edu;umich.edu;;;us.af.mil;ucr.edu", "position": "PhD student;MS student;;;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nhyder2022continual,\ntitle={Continual Learning via Low-Rank Network Updates},\nauthor={Rakib Hyder and Ken Shao and Boyu Hou and Panos Markopoulos and Ashley Prater-Bennette and Salman Asif},\nyear={2022},\nurl={https://openreview.net/forum?id=QyX0pa4CDRM}\n}", "github": "", "project": "", "reviewers": "KB5A;VzLR;hHze;Pxuo", "site": "https://openreview.net/forum?id=QyX0pa4CDRM", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;5;4", "correctness": "4;2;3;3", "technical_novelty": "3;1;2;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "114;78;66;91", "wc_summary_review": "54;15;79;39", "wc_main_review": "755;187;615;352", "wc_review": "923;280;760;482", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;23;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;1;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 87.25, 17.795715776557007 ], 
"wc_summary_review_avg": [ 46.75, 23.24193408475293 ], "wc_main_review_avg": [ 477.25, 221.38922173403114 ], "wc_review_avg": [ 611.25, 247.86425216234792 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 5.75, 9.959292143521045 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9ENkc5CUC_kJ:scholar.google.com/&scioq=Continual+Learning+via+Low-Rank+Network+Updates&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, Riverside;University of Michigan;Air Force Research Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucr.edu;https://www.umich.edu;https://www.afrl.af.mil/", "aff_unique_abbr": "UCR;UM;AFRL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Riverside;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding the Variance Collapse of SVGD in High Dimensions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6835", "id": "Qycd9j5Qp9J", "poster": "", "openreview": "https://openreview.net/forum?id=Qycd9j5Qp9J", "slides": "https://iclr.cc/virtual/2022/poster/6835", "video": "https://iclr.cc/virtual/2022/poster/6835", "author_site": "Jimmy Ba, Murat A Erdogdu, Marzyeh Ghassemi, Shengyang Sun, Taiji Suzuki, Denny Wu, Tianzong Zhang", "tldr": "", "abstract": "Stein variational gradient descent (SVGD) is a deterministic inference algorithm that evolves a set of particles to fit a target distribution. Despite its computational efficiency, SVGD often underestimates the variance of the target distribution in high dimensions. In this work we attempt to explain the variance collapse in SVGD. On the qualitative side, we compare the SVGD update with gradient descent on the maximum mean discrepancy (MMD) objective; we observe that the variance collapse phenomenon relates to the bias from deterministic updates present in the \"driving force\" of SVGD, and empirically verify that removal of such bias leads to more accurate variance estimation. On the quantitative side, we demonstrate that the variance collapse of SVGD can be accurately predicted in the proportional asymptotic limit, i.e., when the number of particles $n$ and dimensions $d$ diverge at the same rate. 
In particular, for learning high-dimensional isotropic Gaussians, we derive the exact equilibrium variance for both SVGD and MMD-descent under a certain near-orthogonality assumption on the converged particles, and confirm that SVGD suffers from the \"curse of dimensionality\".", "keywords": "Stein Variational Gradient Descent;Approximate Inference;Particle-based Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Jimmy Ba;Murat A Erdogdu;Marzyeh Ghassemi;Shengyang Sun;Taiji Suzuki;Denny Wu;Tianzong Zhang", "authorids": "~Jimmy_Ba1;~Murat_A_Erdogdu1;~Marzyeh_Ghassemi2;~Shengyang_Sun4;~Taiji_Suzuki1;~Denny_Wu2;~Tianzong_Zhang1", "gender": "M;M;F;M;M;M;M", "homepage": "http://jimmylba.github.io;http://www.cs.toronto.edu/~erdogdu/;https://www.healthyml.org/;http://www.cs.toronto.edu/~ssy/;http://ibis.t.u-tokyo.ac.jp/suzuki/;https://dennywu1.github.io/;https://people.epfl.ch/tianzong.zhang", "dblp": "https://dblp.org/pers/b/Ba:Jimmy.html;139/1292;145/6563;173/5093;08/312;;", "google_scholar": "https://scholar.google.ca/citations?user=ymzxRhAAAAAJ;Lqc4cdAAAAAJ;;https://scholar.google.ca/citations?user=NktP1NQAAAAJ;x8osrBsAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Jimmy_Ba1;~Murat_A_Erdogdu1;~Marzyeh_Ghassemi2;~Shengyang_Sun4;~Taiji_Suzuki1;~Denny_Wu2;~Tianzong_Zhang1", "aff": "Department of Computer Science, University of Toronto;Vector Institute;Massachusetts Institute of Technology;Department of Computer Science, University of Toronto;The University of Tokyo;University of Toronto;EPFL - EPF Lausanne", "aff_domain": "cs.toronto.edu;vectorinstitute.ai;mit.edu;cs.toronto.edu;tokyo.ac.jp;toronto.edu;epfl.ch", "position": "Assistant Professor;Faculty;Assistant Professor;PhD student;Associate Professor;PhD student;PhD student", "bibtex": "@inproceedings{\nba2022understanding,\ntitle={Understanding the Variance Collapse of {SVGD} in High Dimensions},\nauthor={Jimmy Ba and Murat A Erdogdu and Marzyeh Ghassemi and Shengyang Sun and Taiji Suzuki and Denny Wu and Tianzong Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Qycd9j5Qp9J}\n}", "github": "", "project": "", "reviewers": "PPJd;du4P;r5Ry;SH8F", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "105;102;69;129", "wc_summary_review": "44;123;28;29", "wc_main_review": "364;198;232;115", "wc_review": "513;423;329;273", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "545;492;436;337", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 101.25, 21.358546298847212 ], "wc_summary_review_avg": [ 56.0, 39.198214245039274 ], "wc_main_review_avg": [ 227.25, 89.69218193354425 ], "wc_review_avg": [ 384.5, 91.5245868605808 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 452.5, 77.0211010048545 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 31, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=5966177805550378405&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=Qycd9j5Qp9J", "email": "cs.toronto.edu;vectorinstitute.ai;mit.edu;cs.toronto.edu;tokyo.ac.jp;toronto.edu;epfl.ch", "author_num": 7, "aff_unique_index": "0;1;2;0;3;0;4", "aff_unique_norm": "University of Toronto;Vector Institute;Massachusetts Institute of Technology;University of Tokyo;EPFL", "aff_unique_dep": "Department of Computer Science;;;;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://web.mit.edu;https://www.u-tokyo.ac.jp;https://www.epfl.ch", "aff_unique_abbr": "U of T;Vector Institute;MIT;UTokyo;EPFL", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Toronto;;Lausanne", "aff_country_unique_index": "0;0;1;0;2;0;3", "aff_country_unique": "Canada;United States;Japan;Switzerland" }, { "id": "QymmlaKpp_8", "title": "Inductive-Biases for Contrastive Learning of Disentangled Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning disentangled representations is a core machine learning task. It has been shown that this task requires inductive biases. Recent work on class-content disentanglement has shown excellent performance, but required generative modeling of the entire dataset, which can be very demanding. Current discriminative approaches are typically based on adversarial training and do not reach comparable accuracy. In this paper, we investigate how to transfer the inductive biases implicit in generative approaches to contrastive methods. Based on our findings, we propose a new, non-adversarial and non-generative method named ABCD: Augmentation Based Contrastive Disentanglement. ABCD uses contrastive representation learning relying only on content-invariant augmentations to achieve domain-disentangled representations. The discriminative approach makes ABCD much faster to train than generative approaches. 
We evaluate ABCD on image translation and retrieval tasks, and obtain state-of-the-art results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonathan Kahana;Yedid Hoshen", "authorids": "~Jonathan_Kahana1;~Yedid_Hoshen3", "gender": "M;M", "homepage": ";https://www.cs.huji.ac.il/~ydidh/", "dblp": "317/0994;136/0280", "google_scholar": ";https://scholar.google.co.il/citations?user=6y1-qS4AAAAJ", "orcid": ";", "linkedin": "jonathan-kahana-a92b96221/;", "or_profile": "~Jonathan_Kahana1;~Yedid_Hoshen3", "aff": "Hebrew University of Jerusalem;Hebrew University of Jerusalem", "aff_domain": "huji.ac.il;huji.ac.il", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nkahana2022inductivebiases,\ntitle={Inductive-Biases for Contrastive Learning of Disentangled Representations},\nauthor={Jonathan Kahana and Yedid Hoshen},\nyear={2022},\nurl={https://openreview.net/forum?id=QymmlaKpp_8}\n}", "github": "", "project": "", "reviewers": "hCMz;kjyy;a1vy", "site": "https://openreview.net/forum?id=QymmlaKpp_8", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;5;3", "correctness": "3;3;3", "technical_novelty": "2;2;4", "empirical_novelty": "4;1;2", "wc_summary_paper": "71;77;94", "wc_summary_review": "24;122;52", "wc_main_review": "335;1289;400", "wc_review": "430;1488;546", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "169;268;197", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_summary_paper_avg": [ 80.66666666666667, 9.741092797468305 ], "wc_summary_review_avg": [ 66.0, 41.21488404286329 ], "wc_main_review_avg": [ 674.6666666666666, 435.20901747193716 ], "wc_review_avg": [ 821.3333333333334, 473.7772566184334 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 211.33333333333334, 41.667999978667346 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:B-U2m0aUehAJ:scholar.google.com/&scioq=Inductive-Biases+for+Contrastive+Learning+of+Disentangled+Representations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "", "aff_unique_url": "https://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "R-I5CUDOAp7", "title": "STORM: Sketch Toward Online Risk Minimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Empirical risk minimization is perhaps the most influential idea in statistical learning, with applications to nearly all scientific and technical domains in the form of regression and classification models.\nThe growing concerns about the high energy cost of training and the increased prevalence of massive streaming datasets have led many ML practitioners to look for approximate ERM models that can achieve low cost on memory and latency for training.\nTo this end, we propose STORM, an online sketching-based method for empirical risk 
minimization. STORM compresses a data stream into a tiny array of integer counters. This sketch is sufficient to estimate a variety of surrogate losses over the original dataset. We provide a rigorous theoretical analysis and show that STORM can estimate a carefully chosen surrogate loss for regularized least-squares regression and a margin loss for classification. \nWe perform an exhaustive experimental comparison for regression and classification training on real-world datasets, achieving an approximate solution whose size is even smaller than a single data sample.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/dc9c9b2a587b23fead8829ccb2b7f50505f19c95.zip", "author": "Gaurav Gupta;Benjamin Coleman;John Chen;Anshumali Shrivastava", "authorids": "~Gaurav_Gupta5;~Benjamin_Coleman1;~John_Chen3;~Anshumali_Shrivastava1", "gender": "M;M;;M", "homepage": "https://gaurav16gupta.github.io/;https://randorithms.com/research;https://johnchenresearch.github.io/;https://www.cs.rice.edu/~as143/", "dblp": ";217/2220;71/1897;63/9828", "google_scholar": ";fInuVkEAAAAJ;NbcgY4oAAAAJ;https://scholar.google.com.tw/citations?user=SGT23RAAAAAJ", "orcid": ";;;", "linkedin": ";;john-c/;", "or_profile": "~Gaurav_Gupta5;~Benjamin_Coleman1;~John_Chen3;~Anshumali_Shrivastava1", "aff": "Rice University;Rice University;Rice University;ThirdAI Corp.", "aff_domain": "rice.edu;rice.edu;rice.edu;thirdai.com", "position": "PhD student;PhD student;PhD student;CEO", "bibtex": "@misc{\ngupta2022storm,\ntitle={{STORM}: Sketch Toward Online Risk Minimization},\nauthor={Gaurav Gupta and Benjamin Coleman and John Chen and Anshumali Shrivastava},\nyear={2022},\nurl={https://openreview.net/forum?id=R-I5CUDOAp7}\n}", "github": "", "project": "", "reviewers": "7LQM;yRcK;MTPW;8LGD", "site": "https://openreview.net/forum?id=R-I5CUDOAp7", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;4", "correctness": "4;3;2;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "1;0;2;3", "wc_summary_paper": "78;41;88;78", "wc_summary_review": "86;57;22;34", "wc_main_review": "945;377;277;316", "wc_review": "1109;475;387;428", "wc_reply_reviewers": "135;217;0;0", "wc_reply_authors": "331;200;336;0", "reply_reviewers": "2;1;0;0", "reply_authors": "1;1;1;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 71.25, 17.93564885918544 ], "wc_summary_review_avg": [ 49.75, 24.416951079117148 ], "wc_main_review_avg": [ 478.75, 271.53855619414344 ], "wc_review_avg": [ 599.75, 295.65974954328834 ], "wc_reply_reviewers_avg": [ 88.0, 92.65257686648548 ], "wc_reply_authors_avg": [ 216.75, 136.5052654662083 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": -0.6488856845230502, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h0zNjIghEoYJ:scholar.google.com/&scioq=STORM:+Sketch+Toward+Online+Risk+Minimization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Rice University;ThirdAI Corp.", "aff_unique_dep": ";", "aff_unique_url": "https://www.rice.edu;", "aff_unique_abbr": "Rice;", "aff_campus_unique_index": "", "aff_campus_unique": "",
"aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "R-piejobttn", "title": "Mixture Representation Learning with Coupled Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Latent representations help unravel complex phenomena. While continuous latent variables can be efficiently inferred, fitting mixed discrete-continuous models remains challenging despite recent progress, especially when the discrete factor dimensionality is large. A pressing application for such mixture representations is the analysis of single-cell omic datasets to understand neuronal diversity and its molecular underpinnings. Here, we propose an unsupervised variational framework using multiple interacting networks called cpl-mixVAE that significantly outperforms state-of-the-art in high-dimensional discrete settings. cpl-mixVAE introduces a consensus constraint on discrete factors of variability across the networks, which regularizes the mixture representations at the time of training. We justify the use of this framework with theoretical results and validate it with experiments on benchmark datasets. We demonstrate that our approach discovers interpretable discrete and continuous variables describing neuronal identity in two single-cell RNA sequencing datasets, each profiling over a hundred cortical neuron types.", "keywords": "Mixture representation;high-dimensional categorical variable;unsupervised learning;constrained variational framework;neuronal diversity;single-cell RNA sequencing;cell types", "primary_area": "", "supplementary_material": "/attachment/7ce5848fed09646cad287f088b031a8f145df4ac.zip", "author": "Yeganeh Marghi;Rohan Gala;Uygar S\u00fcmb\u00fcl", "authorids": "~Yeganeh_Marghi1;~Rohan_Gala1;~Uygar_S\u00fcmb\u00fcl2", "gender": "F;;M", "homepage": ";https://rhngla.github.io;", "dblp": ";;30/8374", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;dhiRjJIAAAAJ", "orcid": "0000-0002-5802-7439;;", "linkedin": "yeganehmarghi/;;", "or_profile": "~Yeganeh_Marghi1;~Rohan_Gala1;~Uygar_Sumbul1", "aff": "Allen Institute;Allen Institute;Allen Institute", "aff_domain": "alleninstitute.org;alleninstitute.org;alleninstitute.org", "position": "Researcher;Scientist II;Assistant Investigator", "bibtex": "@misc{\nmarghi2022mixture,\ntitle={Mixture Representation Learning with Coupled Autoencoders},\nauthor={Yeganeh Marghi and Rohan Gala and Uygar S{\\\"u}mb{\\\"u}l},\nyear={2022},\nurl={https://openreview.net/forum?id=R-piejobttn}\n}", "github": "", "project": "", "reviewers": "2RLY;mBXz;J8W6;uWQ6;giw4", "site": "https://openreview.net/forum?id=R-piejobttn", "pdf_size": 0, "recommendation": "5;5;5;5;8", "confidence": "3;4;5;4;4", "correctness": "4;2;2;3;4", "technical_novelty": "2;3;3;2;3", "empirical_novelty": "2;3;2;2;3", "wc_summary_paper": "60;116;39;42;140", "wc_summary_review": "21;59;70;41;10", "wc_main_review": "186;1137;531;293;70", "wc_review": "267;1312;640;376;220", "wc_reply_reviewers": "0;0;574;0;0", "wc_reply_authors": "178;940;2731;737;10", "reply_reviewers": "0;0;2;0;0", "reply_authors": "2;3;6;2;1", "recommendation_avg": [ 5.6, 1.2 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 79.4, 41.03461953034291 ], "wc_summary_review_avg": [ 40.2, 22.4624130493587 ], "wc_main_review_avg": [ 443.4, 
378.6970292991483 ], "wc_review_avg": [ 563.0, 401.78700825188463 ], "wc_reply_reviewers_avg": [ 114.8, 229.6 ], "wc_reply_authors_avg": [ 919.2, 968.738437350351 ], "reply_reviewers_avg": [ 0.4, 0.8000000000000002 ], "reply_authors_avg": [ 2.8, 1.7204650534085253 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5590169943749475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rumW2REULOsJ:scholar.google.com/&scioq=Mixture+Representation+Learning+with+Coupled+Autoencoders&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Allen Institute for Artificial Intelligence", "aff_unique_dep": "", "aff_unique_url": "https://allenai.org", "aff_unique_abbr": "AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "R0AzpCND-M_", "title": "Model-Agnostic Meta-Attack: Towards Reliable Evaluation of Adversarial Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "The vulnerability of deep neural networks to adversarial examples has motivated an increasing number of defense strategies for promoting model robustness. However, the progress is usually hampered by insufficient robustness evaluations. As the de facto standard to evaluate adversarial robustness, adversarial attacks typically solve an optimization problem of crafting adversarial examples with an iterative process. In this work, we propose a Model-Agnostic Meta-Attack (MAMA) approach to discover stronger attack algorithms automatically. Our method learns the optimizer in adversarial attacks parameterized by a recurrent neural network, which is trained over a class of data samples and defenses to produce effective update directions during adversarial example generation. Furthermore, we develop a model-agnostic training algorithm to improve the generalization ability of the learned optimizer when attacking unseen defenses. Our approach can be flexibly incorporated with various attacks and consistently improves the performance with little extra computational cost. Extensive experiments demonstrate the effectiveness of the learned attacks by MAMA compared to the state-of-the-art attacks on different defenses, leading to a more reliable evaluation of adversarial robustness. 
", "keywords": "Adversarial attacks;robust evaluation", "primary_area": "", "supplementary_material": "/attachment/77e8775c191c3bee2fc825362ca4e583a149d4dd.zip", "author": "Xiao Yang;Yinpeng Dong;Wenzhao Xiang;Tianyu Pang;Hang Su;Jun Zhu", "authorids": "~Xiao_Yang4;~Yinpeng_Dong2;~Wenzhao_Xiang1;~Tianyu_Pang1;~Hang_Su3;~Jun_Zhu2", "gender": "M;M;M;M;M;M", "homepage": "https://ml.cs.tsinghua.edu.cn/~xiaoyang/;https://dongyp13.github.io;https://wenzhao-xiang.github.io/Blog/;https://p2333.github.io/;http://ml.cs.tsinghua.edu.cn/~jun;", "dblp": "57/33851;183/0980;;202/2550;50/2644-1;26/5371-6", "google_scholar": "bwkwp0MAAAAJ;6_4ad84AAAAJ;;wYDbtFsAAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ", "orcid": "0000-0001-9502-9962;;;0000-0003-0639-6176;;", "linkedin": ";;;%E5%A4%A9%E5%AE%87-%E5%BA%9E-b3999017a/;;", "or_profile": "~Xiao_Yang4;~Yinpeng_Dong2;~Wenzhao_Xiang1;~Tianyu_Pang1;~Jun_Zhu2;~Hang_Su2", "aff": "Tsinghua University;Tsinghua University;Shanghai Jiaotong University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mail.tsinghua.edu.cn;tsinghua.edu.cn;sjtu.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;MS student;PhD student;Professor;Associate Professor", "bibtex": "@misc{\nyang2022modelagnostic,\ntitle={Model-Agnostic Meta-Attack: Towards Reliable Evaluation of Adversarial Robustness},\nauthor={Xiao Yang and Yinpeng Dong and Wenzhao Xiang and Tianyu Pang and Hang Su and Jun Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=R0AzpCND-M_}\n}", "github": "", "project": "", "reviewers": "8AXD;jTw1;Xe9v", "site": "https://openreview.net/forum?id=R0AzpCND-M_", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "3;2;4", "correctness": "3;4;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "102;32;45", "wc_summary_review": "95;92;41", "wc_main_review": "405;263;129", "wc_review": "602;387;215", "wc_reply_reviewers": "418;0;0", "wc_reply_authors": "1398;181;297", "reply_reviewers": "3;0;0", "reply_authors": "5;1;1", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 59.666666666666664, 30.40102337458761 ], "wc_summary_review_avg": [ 76.0, 24.779023386727733 ], "wc_main_review_avg": [ 265.6666666666667, 112.69230477523989 ], "wc_review_avg": [ 401.3333333333333, 158.31684124635072 ], "wc_reply_reviewers_avg": [ 139.33333333333334, 197.04708969065123 ], "wc_reply_authors_avg": [ 625.3333333333334, 548.4063781134895 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18375669268194129285&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Tsinghua University;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "THU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "R0xRE2MU2uA", "title": "Graph Piece: Efficiently Generating High-Quality Molecular Graphs with 
Substructures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Molecular graph generation is a fundamental but challenging task in various applications such as drug discovery and material science, which requires generating valid molecules with desired properties. Auto-regressive models, which usually construct graphs following sequential actions of adding nodes and edges at the atom-level, have made rapid progress in recent years. However, these atom-level models ignore high-frequency subgraphs that not only capture the regularities of atomic combination in molecules but also are often related to desired chemical properties. In this paper, we propose a method to automatically discover such common substructures, which we call graph pieces, from given molecular graphs. Based on graph pieces, we leverage a variational autoencoder to generate molecules in two phases: piece-level graph generation followed by bond completion. Experiments show that our graph piece variational autoencoder achieves better performance over state-of-the-art baselines on property optimization and constrained property optimization tasks with higher computational efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c7552d96641ceb4b5c7ad064f6aa134dce488df8.zip", "author": "Xiangzhe Kong;Zhixing Tan;Yang Liu", "authorids": "~Xiangzhe_Kong1;~Zhixing_Tan1;~Yang_Liu19", "gender": "M;;M", "homepage": "https://kxz18.github.io/;;http://nlp.csai.tsinghua.edu.cn/~ly/", "dblp": "293/7526;;51/3710-5", "google_scholar": "0oSFYmkAAAAJ;;https://scholar.google.com.hk/citations?user=lVhoKNcAAAAJ", "orcid": ";;0000-0002-3087-242X", "linkedin": ";;", "or_profile": "~Xiangzhe_Kong1;~Zhixing_Tan1;~Yang_Liu19", "aff": "Tsinghua University;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;tsinghua.edu.cn", "position": "Undergrad student;;Professor", "bibtex": "@misc{\nkong2022graph,\ntitle={Graph Piece: Efficiently Generating High-Quality Molecular Graphs with Substructures},\nauthor={Xiangzhe Kong and Zhixing Tan and Yang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=R0xRE2MU2uA}\n}", "github": "", "project": "", "reviewers": "TrJs;sMpt;LbDK;ogN1", "site": "https://openreview.net/forum?id=R0xRE2MU2uA", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;3", "correctness": "3;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "27;18;56;14", "wc_summary_review": "118;15;16;16", "wc_main_review": "224;196;232;295", "wc_review": "369;229;304;325", "wc_reply_reviewers": "0;38;159;16", "wc_reply_authors": "725;986;896;717", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 28.75, 16.422164899914993 ], "wc_summary_review_avg": [ 41.25, 44.313513740167345 ], "wc_main_review_avg": [ 236.75, 36.18960486106473 ], "wc_review_avg": [ 306.75, 50.64768010481823 ], "wc_reply_reviewers_avg": [ 53.25, 62.527493952660535 ], "wc_reply_authors_avg": [ 831.0, 114.54475108009096 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 7, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4643040882937409174&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "R11xJsRjA-W", "title": "The Connection between Out-of-Distribution Generalization and Privacy of ML Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "With the goal of generalizing to out-of-distribution (OOD) data, recent domain generalization methods aim to learn ``stable'' feature representations whose effect on the output remains invariant across domains. Given the theoretical connection between generalization and privacy, we ask whether better OOD generalization leads to better privacy for machine learning models, where privacy is measured through robustness to membership inference (MI) attacks. In general, we find that the relationship does not hold. Through extensive evaluation on a synthetic dataset and image datasets like MNIST, Fashion-MNIST, and Chest X-rays, we show that a lower OOD generalization gap does not imply better robustness to MI attacks. Instead, privacy benefits are based on the extent to which a model captures the stable features. A model that captures stable features is more robust to MI attacks than models that exhibit better OOD generalization but do not learn stable features. Further, for the same provable differential privacy guarantees, a model that learns stable features provides higher utility as compared to others. Our results offer the first extensive empirical study connecting stable features and privacy, and also have a takeaway for the domain generalization community; MI attack can be used as a complementary metric to measure model quality.", "keywords": "membership inference attacks;privacy attacks;model privacy;out-of-distribution generalization;domain generalization", "primary_area": "", "supplementary_material": "/attachment/5e968e334f12efbf7721916a7a7338a673b0a949.zip", "author": "Divyat Mahajan;Shruti Tople;Amit Sharma", "authorids": "~Divyat_Mahajan1;~Shruti_Tople2;~Amit_Sharma3", "gender": "M;;M", "homepage": "http://divyat09.github.io/;;http://amitsharma.in/", "dblp": "242/8911.html;;72/2540-7", "google_scholar": "https://scholar.google.co.in/citations?user=z5bDMO4AAAAJ;;https://scholar.google.co.in/citations?user=CXgQufgAAAAJ", "orcid": ";;0000-0002-2086-3191", "linkedin": "divyat-mahajan-6221a0a6/;;", "or_profile": "~Divyat_Mahajan1;~Shruti_Tople2;~Amit_Sharma3", "aff": "Montreal Institute of Learning Algorithms;;Microsoft Research", "aff_domain": "mila.quebec;;microsoft.com", "position": "MS student;;Principal Researcher", "bibtex": "@misc{\nmahajan2022the,\ntitle={The Connection between Out-of-Distribution Generalization and Privacy of {ML} Models},\nauthor={Divyat Mahajan and Shruti Tople and Amit Sharma},\nyear={2022},\nurl={https://openreview.net/forum?id=R11xJsRjA-W}\n}", "github": "", "project": "", "reviewers": "GSgZ;JCc9;upDo;eEMp", "site": "https://openreview.net/forum?id=R11xJsRjA-W", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;4;3;3", "correctness": "2;2;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "125;90;119;43", "wc_summary_review": "18;29;49;76", "wc_main_review": "232;361;162;147", "wc_review": "375;480;330;266", "wc_reply_reviewers": 
"0;0;0;0", "wc_reply_authors": "262;398;96;147", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 94.25, 32.41431011143072 ], "wc_summary_review_avg": [ 43.0, 22.056745000112777 ], "wc_main_review_avg": [ 225.5, 84.55323766716447 ], "wc_review_avg": [ 362.75, 77.99158608465403 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 225.75, 116.21182168781282 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2749150841027280767&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Montreal Institute of Learning Algorithms;Microsoft", "aff_unique_dep": "Learning Algorithms;Microsoft Research", "aff_unique_url": "https://mila.quebec;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MILA;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Canada;United States" }, { "id": "R2AN-rz4j_X", "title": "Continual Learning in Deep Networks: an Analysis of the Last Layer", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study how different output layers in a deep neural network learn and forget in continual learning settings. The following three factors can affect catastrophic forgetting in the output layer: (1) weights modifications, (2) interference, and (3) projection drift. In this paper, our goal is to provide more insights into how changing the output layers may address (1) and (2). Some potential solutions to those issues are proposed and evaluated here in several continual learning scenarios. We show that the best-performing type of the output layer depends on the data distribution drifts and/or the amount of data available. In particular, in some cases where a standard linear layer would fail, it turns out that changing parameterization is sufficient in order to achieve a significantly better performance, whithout introducing a continual-learning algorithm and instead using the standard SGD to train a model. 
Our analysis and results shed light on the dynamics of the output layer in continual learning scenarios, and suggest a way of selecting the best type of output layer for a given scenario.", "keywords": "Continual Learning;Linear Models", "primary_area": "", "supplementary_material": "", "author": "Timothee LESORT;Thomas George;Irina Rish", "authorids": "~Timothee_LESORT1;~Thomas_George2;~Irina_Rish1", "gender": "M;;F", "homepage": ";http://tfjgeorge.github.io/;http://irina-rish.com", "dblp": ";;", "google_scholar": "5NttkuoAAAAJ;pc3_ujYAAAAJ;Avse5gIAAAAJ", "orcid": ";;", "linkedin": "https://fr.linkedin.com/in/timoth\u00e9e-lesort-128039aa;;irina-rish-8b2162", "or_profile": "~Timothee_LESORT1;~Thomas_George2;~Irina_Rish1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Mila - Universit\u00e9 de Montr\u00e9al;University of Montreal", "aff_domain": "mila.umontreal.ca;umontreal.ca;mila.quebec", "position": "Postdoc;PhD student;Professor", "bibtex": "@misc{\nlesort2022continual,\ntitle={Continual Learning in Deep Networks: an Analysis of the Last Layer},\nauthor={Timothee LESORT and Thomas George and Irina Rish},\nyear={2022},\nurl={https://openreview.net/forum?id=R2AN-rz4j_X}\n}", "github": "", "project": "", "reviewers": "e1ZF;Kc24;uN9P;BTLN", "site": "https://openreview.net/forum?id=R2AN-rz4j_X", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;1", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "81;38;60;111", "wc_summary_review": "39;15;97;21", "wc_main_review": "256;88;246;126", "wc_review": "376;141;403;258", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "360;220;484;164", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 72.5, 26.93046601899046 ], "wc_summary_review_avg": [ 43.0, 32.4037034920393 ], "wc_main_review_avg": [ 179.0, 73.32803011127464 ], "wc_review_avg": [ 294.5, 104.05407248157086 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 307.0, 124.65552534885889 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14690084838717849567&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Montreal;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": "Montreal Institute for Learning Algorithms;Mila", "aff_unique_url": "https://www.umontreal.ca;https://www.umontreal.ca", "aff_unique_abbr": "UM;UdeM", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Montreal;Montr\u00e9al;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "R2aCiGQ9Qc", "title": "Two Sides of the Same Coin: Heterophily and Oversmoothing in Graph Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In node classification tasks, heterophily and oversmoothing are two problems that can hurt the performance of graph convolutional neural networks (GCNs). 
The heterophily problem refers to the model's inability to handle heterophilous graphs where neighboring nodes belong to different classes; the oversmoothing problem refers to the model's degraded performance as the number of layers increases. These two seemingly unrelated problems have been studied mostly independently, but there is recent empirical evidence that solving one problem may benefit the other. \n\nIn this work, beyond empirical observations, we aim to: (1) analyze the heterophily and oversmoothing problems from a unified theoretical perspective, (2) identify the common causes of the two problems based on our theories, and (3) propose simple yet effective strategies to address the common causes. In our theoretical analysis, we show that the common causes of the heterophily and oversmoothing problems---namely, the relative degree of a node (compared to its neighbors) and its heterophily level---trigger the node representations in consecutive layers to \"move\" closer to the original decision boundary, which increases the misclassification rate of node labels under certain constraints. We theoretically show that: (1) Nodes with high heterophily have a higher misclassification rate. (2) Even with low heterophily, degree disparity in a node's neighborhood can influence the movements of node representations and result in a \"pseudo-heterophily\" situation, which helps to explain oversmoothing. (3) Allowing not only positive, but also negative messages during message passing can help counteract the common causes of the two problems. Based on our theoretical insights, we propose simple modifications to the GCN architecture (i.e., learned degree corrections and signed messages), and we show through extensive experiments on nine real networks that they alleviate the heterophily and oversmoothing problems.
Compared to other approaches, which tend to work well in either heterophily or oversmoothing, our modified GCN model performs well in both problems.", "keywords": "graph convolutional neural networks;node classification;heterophily;oversmoothing", "primary_area": "", "supplementary_material": "/attachment/c3375425e67a10a298e01422c837365ca5e4f6be.zip", "author": "Yujun Yan;Milad Hashemi;Kevin Swersky;Yaoqing Yang;Danai Koutra", "authorids": "~Yujun_Yan1;~Milad_Hashemi1;~Kevin_Swersky1;~Yaoqing_Yang1;~Danai_Koutra1", "gender": "F;;M;M;F", "homepage": "https://sites.google.com/umich.edu/yujunyan/home;;http://www.cs.toronto.edu/~kswersky;https://sites.google.com/site/yangyaoqingcmu/;http://web.eecs.umich.edu/~dkoutra/", "dblp": "219/1736;127/9046;35/9381;04/4176;91/9987", "google_scholar": "5TQUP58AAAAJ;;https://scholar.google.ca/citations?user=IrixA8MAAAAJ;LYvugWgAAAAJ;https://scholar.google.com.tw/citations?user=bDrA1-8AAAAJ", "orcid": "0000-0003-3776-4293;;;0000-0001-9908-5531;0000-0002-3206-8179", "linkedin": ";;;;", "or_profile": "~Yujun_Yan1;~Milad_Hashemi1;~Kevin_Swersky1;~Yaoqing_Yang1;~Danai_Koutra1", "aff": "University of Michigan;Google;Google Deepmind;University of California, Berkeley;Amazon", "aff_domain": "umich.edu;google.com;google.com;berkeley.edu;amazon.com", "position": "PhD student;Research Scientist;Research Scientist;Postdoc;Scholar", "bibtex": "@misc{\nyan2022two,\ntitle={Two Sides of the Same Coin: Heterophily and Oversmoothing in Graph Convolutional Neural Networks},\nauthor={Yujun Yan and Milad Hashemi and Kevin Swersky and Yaoqing Yang and Danai Koutra},\nyear={2022},\nurl={https://openreview.net/forum?id=R2aCiGQ9Qc}\n}", "github": "", "project": "", "reviewers": "osW7;1EPP;ehjS;nAQJ", "site": "https://openreview.net/forum?id=R2aCiGQ9Qc", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;5;3;3", "correctness": "2;2;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;0;2", "wc_summary_paper": "75;106;28;45", "wc_summary_review": "63;68;135;30", "wc_main_review": "587;302;211;334", "wc_review": "725;476;374;409", "wc_reply_reviewers": "0;0;59;53", "wc_reply_authors": "1211;1006;448;733", "reply_reviewers": "0;0;1;1", "reply_authors": "3;2;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 63.5, 29.75315109362368 ], "wc_summary_review_avg": [ 74.0, 38.1247950814165 ], "wc_main_review_avg": [ 358.5, 139.42829698450743 ], "wc_review_avg": [ 496.0, 137.1987609273495 ], "wc_reply_reviewers_avg": [ 28.0, 28.080242164197944 ], "wc_reply_authors_avg": [ 849.5, 287.20593656817056 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 368, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15368881309061940541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of Michigan;Google;DeepMind;University of California, Berkeley;Amazon", "aff_unique_dep": ";Google;DeepMind;;Amazon.com, Inc.", "aff_unique_url": "https://www.umich.edu;https://www.google.com;https://deepmind.com;https://www.berkeley.edu;https://www.amazon.com", "aff_unique_abbr": "UM;Google;DeepMind;UC 
Berkeley;Amazon", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Berkeley", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "A global convergence theory for deep ReLU implicit networks via over-parameterization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6773", "id": "R332S76RjxS", "poster": "", "openreview": "https://openreview.net/forum?id=R332S76RjxS", "slides": "https://iclr.cc/virtual/2022/poster/6773", "video": "https://iclr.cc/virtual/2022/poster/6773", "author_site": "Tianxiang Gao, Hailiang Liu, Jia Liu, Hridesh Rajan, Hongyang Gao", "tldr": "", "abstract": "Implicit deep learning has received increasing attention recently due to the fact that it generalizes the recursive prediction rule of many commonly used neural network architectures. Its prediction rule is provided implicitly based on the solution of an equilibrium equation. Although a line of recent empirical studies has demonstrated its superior performances, the theoretical understanding of implicit neural networks is limited. In general, the equilibrium equation may not be well-posed during the training. As a result, there is no guarantee that a vanilla (stochastic) gradient descent (SGD) training nonlinear implicit neural networks can converge. This paper fills the gap by analyzing the gradient flow of Rectified Linear Unit (ReLU) activated implicit neural networks. For an $m$ width implicit neural network with ReLU activation and $n$ training samples, we show that a randomly initialized gradient descent converges to a global minimum at a linear rate for the square loss function if the implicit neural network is over-parameterized. It is worth noting that, unlike existing works on the convergence of (S)GD on finite-layer over-parameterized neural networks, our convergence results hold for implicit neural networks, where the number of layers is infinite.", "keywords": "Deep learning;Deep implicit learning;deep equilibrium model;gradient descent;stochastic gradient descent;over-parameterization", "primary_area": "", "supplementary_material": "/attachment/eb41990fc8066d9e5f6972cabb3753f9b1c54050.zip", "author": "Tianxiang Gao;Hailiang Liu;Jia Liu;Hridesh Rajan;Hongyang Gao", "authorids": "~Tianxiang_Gao2;~Hailiang_Liu1;~Jia_Liu1;~Hridesh_Rajan1;~Hongyang_Gao1", "gender": "M;M;M;M;M", "homepage": "https://gaotx-cs.github.io/;https://faculty.sites.iastate.edu/hliu/;https://kevinliu-osu.github.io/index.html;https://www.cs.iastate.edu/hridesh;https://faculty.sites.iastate.edu/hygao/", "dblp": "118/3814;;;;200/7985", "google_scholar": "iNLlIbQAAAAJ;Wq7IGEIAAAAJ;Ofx3dScAAAAJ;https://scholar.google.com.tw/citations?user=aiFvpucAAAAJ;jGmq0aEAAAAJ", "orcid": ";;;;0000-0002-9020-9080", "linkedin": ";;;;hongyang-gao-74924690/", "or_profile": "~Tianxiang_Gao2;~Hailiang_Liu1;~Jia_Liu1;~Hridesh_Rajan1;~Hongyang_Gao1", "aff": "Iowa State University;Iowa State University;The Ohio State University;Iowa State University;Iowa State University", "aff_domain": "iastate.edu;iastate.edu;osu.edu;iastate.edu;iastate.edu", "position": "PhD student;Full Professor;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\ngao2022a,\ntitle={A global convergence theory for deep Re{LU} implicit networks via over-parameterization},\nauthor={Tianxiang Gao and Hailiang Liu and Jia Liu and Hridesh Rajan and Hongyang Gao},\nbooktitle={International Conference on Learning 
Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=R332S76RjxS}\n}", "github": "", "project": "", "reviewers": "sNJZ;XwkY;pFXH;tTX4", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;2;3;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;0;0;3", "wc_summary_paper": "10;56;78;90", "wc_summary_review": "15;36;31;55", "wc_main_review": "441;216;311;149", "wc_review": "466;308;420;294", "wc_reply_reviewers": "259;0;0;0", "wc_reply_authors": "3283;475;864;172", "reply_reviewers": "2;0;0;0", "reply_authors": "5;1;2;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 58.5, 30.540956108150905 ], "wc_summary_review_avg": [ 34.25, 14.271912976192084 ], "wc_main_review_avg": [ 279.25, 109.70044439290116 ], "wc_review_avg": [ 372.0, 73.00684899377592 ], "wc_reply_reviewers_avg": [ 64.75, 112.1502897900848 ], "wc_reply_authors_avg": [ 1198.5, 1228.2289078180827 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5183210553488161, "corr_recommendation_correctness": 0.9169493006161777, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9326391698720015074&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=R332S76RjxS", "email": "iastate.edu;iastate.edu;osu.edu;iastate.edu;iastate.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Iowa State University;Ohio State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.iastate.edu;https://www.osu.edu", "aff_unique_abbr": "ISU;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "R3Y9yq49seb", "title": "Wavelet Feature Maps Compression for Low Bandwidth Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Quantization is one of the most effective techniques for compressing Convolutional Neural Networks (CNNs), which are known for requiring extensive computational resources. However, aggressive quantization may cause severe degradation in the prediction accuracy of such networks, especially in image-to-image tasks such as semantic segmentation and depth prediction. In this paper, we propose Wavelet Compressed Convolution (WCC)---a novel approach for activation maps compression for $1\\times1$ convolutions (the workhorse of modern CNNs). WCC achieves compression ratios and computational savings that are equivalent to low bit quantization rates at a relatively minimal loss of accuracy. To this end, we use a hardware-friendly Haar-wavelet transform, known for its effectiveness in image compression, and define the convolution on the compressed activation map. WCC can be utilized with any $1\\times1$ convolution in an existing network architecture. 
By combining WCC with light quantization, we show that we achieve compression rates equal to 2-bit and 1-bit with minimal degradation in image-to-image tasks.", "keywords": "Convolutional Neural Networks;Quantization;Wavelet Transform", "primary_area": "", "supplementary_material": "/attachment/56bee2ca5a13b6b1071306f49751e2f9cfa00328.zip", "author": "Yair Zohav;Shahaf E Finder;Maor Ashkenazi;Eran Treister", "authorids": "yairzo@post.bgu.ac.il;finders@post.bgu.ac.il;maorashkenazi@gmail.com;~Eran_Treister1", "gender": ";;;M", "homepage": ";;;https://www.cs.bgu.ac.il/~erant/", "dblp": ";;;22/10384", "google_scholar": ";;;https://scholar.google.co.il/citations?user=5nNoFlEAAAAJ", "orcid": ";;;0000-0002-5351-0966", "linkedin": ";;;", "or_profile": "yairzo@post.bgu.ac.il;finders@post.bgu.ac.il;maorashkenazi@gmail.com;~Eran_Treister1", "aff": ";;;Ben Gurion University of the Negev", "aff_domain": ";;;bgu.ac.il", "position": ";;;Assistant Professor", "bibtex": "@misc{\nzohav2022wavelet,\ntitle={Wavelet Feature Maps Compression for Low Bandwidth Convolutional Neural Networks},\nauthor={Yair Zohav and Shahaf E Finder and Maor Ashkenazi and Eran Treister},\nyear={2022},\nurl={https://openreview.net/forum?id=R3Y9yq49seb}\n}", "github": "", "project": "", "reviewers": "pDi1;C23G;mk6D;QwGk", "site": "https://openreview.net/forum?id=R3Y9yq49seb", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;4", "correctness": "3;3;2;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "50;68;129;82", "wc_summary_review": "33;34;47;55", "wc_main_review": "117;498;831;617", "wc_review": "200;600;1007;754", "wc_reply_reviewers": "0;0;156;77", "wc_reply_authors": "302;1002;1292;826", "reply_reviewers": "0;0;1;1", "reply_authors": "1;2;3;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 82.25, 29.277764600460877 ], "wc_summary_review_avg": [ 42.25, 9.202581159652981 ], "wc_main_review_avg": [ 515.75, 259.30230909114556 ], "wc_review_avg": [ 640.25, 292.7818086903625 ], "wc_reply_reviewers_avg": [ 58.25, 64.60021284794657 ], "wc_reply_authors_avg": [ 855.5, 360.2870383458167 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QlOfePSHBkgJ:scholar.google.com/&scioq=Wavelet+Feature+Maps+Compression+for+Low+Bandwidth+Convolutional+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Ben Gurion University of the Negev", "aff_unique_dep": "", "aff_unique_url": "https://www.bgu.ac.il", "aff_unique_abbr": "BGU", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "id": "R3zqNwzAVsC", "title": "Learning an Ethical Module for Bias Mitigation of pre-trained Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "In spite of the high performance and reliability of deep learning algorithms in broad range everyday applications, many investigations tend to show that a lot of models exhibit biases, discriminating against some subgroups of the population. 
This urges the practitioner to develop fair systems whose performance is uniform across individuals. In this work, we propose a post-processing method designed to mitigate the bias of state-of-the-art models. It consists of learning a shallow neural network, called the Ethical Module, which transforms the deep embeddings of a pre-trained model in order to give more representation power to the disadvantaged subgroups. Its training is supervised by the von Mises-Fisher loss, whose hyperparameters allow one to control the space allocated to each subgroup in the latent space. Besides being very simple, the resulting methodology is more stable and faster than most current bias mitigation methods. In order to illustrate our idea in a concrete use case, we focus here on gender bias in facial recognition and conduct extensive numerical experiments on standard datasets.", "keywords": "Deep Learning;Bias;Fairness;Facial Recognition.", "primary_area": "", "supplementary_material": "/attachment/5bc8bb11c35b836eb4ba551f1249677c0ea461a2.zip", "author": "Jean-R\u00e9my Conti;Nathan Noiry;Stephan CLEMENCON;Vincent Despiegel;St\u00e9phane Gentric", "authorids": "~Jean-R\u00e9my_Conti1;~Nathan_Noiry1;~Stephan_CLEMENCON1;vincent.despiegel@idemia.com;stephane.gentric@idemia.com", "gender": "M;M;M;;", "homepage": ";https://noiry.perso.math.cnrs.fr/;https://perso.telecom-paristech.fr/clemenco/;;", "dblp": ";;85/6714;;", "google_scholar": "KdMJugsAAAAJ;;;;", "orcid": ";;;;", "linkedin": "jean-remy-conti-901354104;;;;", "or_profile": "~Jean-R\u00e9my_Conti1;~Nathan_Noiry1;~Stephan_CLEMENCON1;vincent.despiegel@idemia.com;stephane.gentric@idemia.com", "aff": "T\u00e9l\u00e9com ParisTech;Telecom Paris;T\u00e9l\u00e9com ParisTech;;", "aff_domain": "telecom-paristech.fr;telecom-paristech.fr;telecom-paristech.fr;;", "position": "PhD student;Researcher;Full Professor;;", "bibtex": "@misc{\nconti2022learning,\ntitle={Learning an Ethical Module for Bias Mitigation of pre-trained Models},\nauthor={Jean-R{\'e}my Conti and Nathan Noiry and Stephan CLEMENCON and Vincent Despiegel and St{\'e}phane Gentric},\nyear={2022},\nurl={https://openreview.net/forum?id=R3zqNwzAVsC}\n}", "github": "", "project": "", "reviewers": "u2Cg;k7gc;7eUe;vzpf", "site": "https://openreview.net/forum?id=R3zqNwzAVsC", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;3;3", "correctness": "4;3;3;3", "technical_novelty": "4;2;3;4", "empirical_novelty": "4;2;3;4", "wc_summary_paper": "85;96;173;60", "wc_summary_review": "34;36;28;82", "wc_main_review": "209;579;175;399", "wc_review": "328;711;376;541", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 103.5, 42.19300889957956 ], "wc_summary_review_avg": [ 45.0, 21.563858652847824 ], "wc_main_review_avg": [ 340.5, 162.0084874319861 ], "wc_review_avg": [ 489.0, 150.56393990594162 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9854883002138201486&as_sdt=2005&sciodt=0,5&hl=en",
"gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech;Telecom Paris", "aff_unique_dep": ";", "aff_unique_url": "https://www.telecom-paristech.fr;https://www.telecom-paris.fr", "aff_unique_abbr": "TP;Telecom Paris", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "id": "R5sVzzXhW8n", "title": "Demystifying How Self-Supervised Features Improve Training from Noisy Labels", "track": "main", "status": "Reject", "tldr": "", "abstract": "The advancement of self-supervised learning (SSL) motivates researchers to apply SSL on other tasks such as learning with noisy labels. Recent literature indicates that methods built on SSL features can substantially improve the performance of learning with noisy labels. Nonetheless, the deeper reasons why (and how) SSL features benefit the training from noisy labels are less understood. In this paper, we study why and how self-supervised features help networks resist label noise using both theoretical analyses and numerical experiments. Our result shows that, given a quality encoder pre-trained from SSL, a simple linear layer trained by the cross-entropy loss is theoretically robust to symmetric label noise. Further, we provide insights for how knowledge distilled from SSL features can alleviate the over-fitting problem. We hope our work provides a better understanding for learning with noisy labels from the perspective of self-supervised learning and can potentially serve as a guideline for further research.", "keywords": "Learning with noisy labels;Self-Supervised Learning", "primary_area": "", "supplementary_material": "/attachment/81c35dd0d9a9be1ae56f9ff97007e4504713cadd.zip", "author": "Hao Cheng;Zhaowei Zhu;Xing Sun;Yang Liu", "authorids": "~Hao_Cheng5;~Zhaowei_Zhu1;~Xing_Sun1;~Yang_Liu3", "gender": "M;M;M;M", "homepage": "https://haochenglouis.github.io;https://www.zzw.ai;https://www.sunxing.org;http://www.yliuu.com", "dblp": ";202/1712;;51/3710-18", "google_scholar": "ftlVqVIAAAAJ;YS8pSQoAAAAJ;IUtix9IAAAAJ;jKrIVCIAAAAJ", "orcid": "0000-0001-8864-7818;0000-0003-3894-5862;0000-0001-8132-9083;0000-0001-8420-6011", "linkedin": ";;sunxings/;", "or_profile": "~Hao_Cheng5;~Zhaowei_Zhu1;~Xing_Sun1;~Yang_Liu3", "aff": "Tencent Youtu Lab;University of California, Santa Cruz;Tencent YouTu Lab;University of California, Santa Cruz", "aff_domain": "tencent.com;ucsc.edu;tencent.com;ucsc.edu", "position": "Researcher;PhD student;Principal Researcher;Assistant Professor", "bibtex": "@misc{\ncheng2022demystifying,\ntitle={Demystifying How Self-Supervised Features Improve Training from Noisy Labels},\nauthor={Hao Cheng and Zhaowei Zhu and Xing Sun and Yang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=R5sVzzXhW8n}\n}", "github": "", "project": "", "reviewers": "WCJg;PASS;GRMb;GhTK", "site": "https://openreview.net/forum?id=R5sVzzXhW8n", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "50;53;75;21", "wc_summary_review": "35;33;133;34", "wc_main_review": "372;279;70;21", "wc_review": "457;365;278;76", "wc_reply_reviewers": "860;229;0;0", "wc_reply_authors": "3450;2888;368;360", "reply_reviewers": "5;3;0;0", "reply_authors": "7;6;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 
2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.75, 19.201236939322424 ], "wc_summary_review_avg": [ 58.75, 42.87408891160254 ], "wc_main_review_avg": [ 185.5, 144.84905936870973 ], "wc_review_avg": [ 294.0, 140.88115558867338 ], "wc_reply_reviewers_avg": [ 272.25, 351.9803794247628 ], "wc_reply_authors_avg": [ 1766.5, 1416.5079420885716 ], "reply_reviewers_avg": [ 2.0, 2.1213203435596424 ], "reply_authors_avg": [ 3.75, 2.7726341266023544 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12935441617090127675&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Tencent;University of California, Santa Cruz", "aff_unique_dep": "Youtu Lab;", "aff_unique_url": "https://www.tencent.com;https://www.ucsc.edu", "aff_unique_abbr": "Tencent;UCSC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "R612wi_C-7w", "title": "Stable cognitive maps for Path Integration emerge from fusing visual and proprioceptive sensors", "track": "main", "status": "Reject", "tldr": "", "abstract": "Spatial navigation in biological agents relies on the interplay between external (visual, olfactory, auditory, $\\dots$) and proprioceptive (motor commands, linear and angular velocity, $\\dots$) signals. How to combine and exploit these two streams of information, which vastly differ in terms of availability and reliability, is a crucial issue. In the context of a new two-dimensional continuous environment we developed, we propose a direct-inverse model of environment dynamics to fuse image- and action-related signals, allowing reconstruction of the action relating two successive images, as well as prediction of the new image from its current value and the action. The definition of those models naturally leads to the proposal of a minimalistic recurrent architecture, called Resetting Path Integrator (RPI), that can easily and reliably be trained to keep track of its position relative to its starting point during a sequence of movements. RPI updates its internal state using the (possibly noisy) proprioceptive signal, and occasionally resets it when the image signal is present. Notably, the internal state of this minimal model exhibits strong correlation with position in the environment due to the direct-inverse models, is stable across long trajectories through resetting, and allows for disambiguation of visually confusing positions in the environment through integration of past movement, making it a prime candidate for a \\textbf{cognitive map}.
Our architecture is compared to state-of-the-art LSTM networks on identical tasks, and consistently shows better performance while also offering more interpretable internal dynamics and higher-quality representations.", "keywords": "RNNs", "primary_area": "", "supplementary_material": "/attachment/98ae5bfdb0f0189deb5782af937aeecd389dd08b.zip", "author": "Arnaud Fanthomme;R\u00e9mi Monasson", "authorids": "~Arnaud_Fanthomme1;remi.monasson@phys.ens.fr", "gender": ";", "homepage": "https://github.com/AFanthomme;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Arnaud_Fanthomme1;remi.monasson@phys.ens.fr", "aff": "Ecole Normale Sup\u00e9rieure;", "aff_domain": "ens.fr;", "position": "PhD student;", "bibtex": "@misc{\nfanthomme2022stable,\ntitle={Stable cognitive maps for Path Integration emerge from fusing visual and proprioceptive sensors},\nauthor={Arnaud Fanthomme and R{\\'e}mi Monasson},\nyear={2022},\nurl={https://openreview.net/forum?id=R612wi_C-7w}\n}", "github": "", "project": "", "reviewers": "7DcK;CUYu;hpxs;PuPV;AjJT", "site": "https://openreview.net/forum?id=R612wi_C-7w", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "3;4;4;2;3", "correctness": "2;2;1;2;2", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;1;3;1;3", "wc_summary_paper": "151;61;108;93;144", "wc_summary_review": "73;159;95;57;82", "wc_main_review": "727;504;635;337;1376", "wc_review": "951;724;838;487;1602", "wc_reply_reviewers": "124;151;25;70;141", "wc_reply_authors": "1869;1566;1534;617;2441", "reply_reviewers": "1;1;1;1;1", "reply_authors": "4;3;2;1;4", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 1.8, 0.4 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.8944271909999159 ], "wc_summary_paper_avg": [ 111.4, 33.230106831004925 ], "wc_summary_review_avg": [ 93.2, 35.147688401941885 ], "wc_main_review_avg": [ 715.8, 355.2088962849889 ], "wc_review_avg": [ 920.4, 373.82166871384004 ], "wc_reply_reviewers_avg": [ 102.2, 47.662983540689105 ], "wc_reply_authors_avg": [ 1605.4, 591.8272045115872 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.8, 1.16619037896906 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3273268353539886, "corr_recommendation_correctness": -0.4082482904638631, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VrK22Y9kLzoJ:scholar.google.com/&scioq=Stable+cognitive+maps+for+Path+Integration+emerge+from+fusing+visual+and+proprioceptive+sensors&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure", "aff_unique_dep": "", "aff_unique_url": "https://www.ens.fr", "aff_unique_abbr": "ENS", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "id": "R6hvtDTQmb", "title": "Adapting Stepsizes by Momentumized Gradients Improves Optimization and Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adaptive gradient methods, such as Adam, have achieved tremendous success in machine learning. Scaling gradients by square roots of the running averages of squared past gradients, such methods are able to attain rapid training of modern deep neural networks. Nevertheless, they are observed to generalize worse than stochastic gradient descent (SGD) and tend to be trapped in local minima at an early stage during training. 
Intriguingly, we discover that substituting the gradient in the second moment estimation term with its momentumized version in Adam effectively resolves these issues. The intuition is that the gradient with momentum contains more accurate directional information, and therefore its second moment estimation is a better choice for scaling than that of the raw gradient. We therefore propose AdaMomentum, a new optimizer that reaches the goal of training fast while generalizing better. We further develop a theory to back up the improvement in optimization and generalization and provide convergence guarantees under both convex and nonconvex settings. Extensive experiments on a wide range of tasks and models demonstrate that AdaMomentum consistently exhibits state-of-the-art performance. The source code is available at https://anonymous.4open.science/r/AdaMomentum_experiments-6D9B.", "keywords": "Deep Learning Optimizer;Neural Network Optimization;Neural Network Generalization", "primary_area": "", "supplementary_material": "/attachment/d0da94cf099cf0149b34925683a6e52b6750475a.zip", "author": "Yizhou Wang;Yue Kang;Can Qin;Huan Wang;Yi Xu;Yulun Zhang;Yun Fu", "authorids": "~Yizhou_Wang3;~Yue_Kang1;~Can_Qin1;~Huan_Wang3;~Yi_Xu9;~Yulun_Zhang1;~Yun_Fu1", "gender": "M;M;M;M;M;M;M", "homepage": "https://wyzjack.github.io/;;http://canqin.tech;https://huanwang.tech/;https://sites.google.com/view/homepage-of-yi-xu;http://yulunzhang.com/;http://www1.ece.neu.edu/~yunfu/", "dblp": "71/3387-6;135/9726-2;214/2488;70/6155-14;14/5580-5;166/2763-1.html;00/5815-1", "google_scholar": "H4kqV1MAAAAJ;;QCik-YcAAAAJ;0-On0y4AAAAJ;https://scholar.google.com.hk/citations?user=12bRAdsAAAAJ;ORmLjWoAAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": "0000-0003-1601-9649;;;0000-0001-6951-901X;0000-0001-5857-4179;0000-0002-2288-5079;0000-0002-5098-2853", "linkedin": "yizhou-wang-786603155/;yue-kang-b52063158/;;huanwang-zju/;yi-xu-884755185/;yulun-zhang-1116b5b9/;furaymond/", "or_profile": "~Yizhou_Wang3;~Yue_Kang1;~Can_Qin1;~Huan_Wang3;~Yi_Xu9;~Yulun_Zhang1;~Yun_Fu1", "aff": "Mitsubishi Electric Research Labs;University of California, Davis;Northeastern University;Northeastern University;Honda Research Institute;Swiss Federal Institute of Technology;Northeastern University", "aff_domain": "merl.com;ucdavis.edu;neu.edu;neu.edu;honda-ri.de;ethz.ch;northeastern.edu", "position": "Intern;PhD student;PhD student;PhD student;Intern;Postdoc;Full Professor", "bibtex": "@misc{\nwang2022adapting,\ntitle={Adapting Stepsizes by Momentumized Gradients Improves Optimization and Generalization},\nauthor={Yizhou Wang and Yue Kang and Can Qin and Huan Wang and Yi Xu and Yulun Zhang and Yun Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=R6hvtDTQmb}\n}", "github": "", "project": "", "reviewers": "hk95;mUyQ;5ZsD;a87S", "site": "https://openreview.net/forum?id=R6hvtDTQmb", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;3;2", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "42;57;51;137", "wc_summary_review": "89;65;26;89", "wc_main_review": "195;863;195;137", "wc_review": "326;985;272;363", "wc_reply_reviewers": "138;1580;118;0", "wc_reply_authors": "832;4777;469;384", "reply_reviewers": "1;3;1;0", "reply_authors": "3;9;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 1.299038105676658 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ],
"wc_summary_paper_avg": [ 71.75, 38.04848880047669 ], "wc_summary_review_avg": [ 67.25, 25.752427070084092 ], "wc_main_review_avg": [ 347.5, 298.5644821474919 ], "wc_review_avg": [ 486.5, 289.62259925634254 ], "wc_reply_reviewers_avg": [ 459.0, 649.3542946650927 ], "wc_reply_authors_avg": [ 1615.5, 1833.0308917200496 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 3.75, 3.112474899497183 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7901436267269197921&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;3;4;2", "aff_unique_norm": "Mitsubishi Electric Research Laboratories;University of California, Davis;Northeastern University;Honda Research Institute;Swiss Federal Institute of Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.merl.com;https://www.ucdavis.edu;https://www.northeastern.edu;https://www.honda-ri.com;https://www.ethz.ch", "aff_unique_abbr": "MERL;UC Davis;NEU;HRI;ETH Zurich", "aff_campus_unique_index": "1", "aff_campus_unique": ";Davis", "aff_country_unique_index": "0;0;0;0;1;2;0", "aff_country_unique": "United States;Japan;Switzerland" }, { "title": "Toward Faithful Case-based Reasoning through Learning Prototypes in a Nearest Neighbor-friendly Space.", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6132", "id": "R79ZGjHhv6p", "poster": "", "openreview": "https://openreview.net/forum?id=R79ZGjHhv6p", "slides": "https://iclr.cc/virtual/2022/poster/6132", "video": "https://iclr.cc/virtual/2022/poster/6132", "author_site": "Seyed Omid Davoudi, Majid Komeili", "tldr": "", "abstract": "Recent advances in machine learning have brought opportunities for the ever-increasing use of AI in the real world. This has created concerns about the black-box nature of many of the most recent machine learning approaches. In this work, we propose an interpretable neural network that leverages metric and prototype learning for classification tasks. It encodes its own explanations and provides an improved case-based reasoning through learning prototypes in an embedding space learned by a probabilistic nearest neighbor rule. 
Through experiments, we demonstrated the effectiveness of the proposed method in both performance and the accuracy of the explanations provided.", "keywords": "case-based reasoning;interpretable machine learning;explainable artificial intelligence;xai;prototype learning", "primary_area": "", "supplementary_material": "", "author": "Seyed Omid Davoudi;Majid Komeili", "authorids": "~Seyed_Omid_Davoudi1;~Majid_Komeili1", "gender": "M;", "homepage": ";http://people.scs.carleton.ca/~majidkomeili/", "dblp": ";10/10408", "google_scholar": ";https://scholar.google.ca/citations?user=AIHFesoAAAAJ", "orcid": ";0000-0002-4695-3072", "linkedin": "https://linkedin.com/in/omid-davoodi-bbbb68113;majid-komeili-b61309b7", "or_profile": "~Seyed_Omid_Davoudi1;~Majid_Komeili1", "aff": "Carleton University;Carleton University", "aff_domain": "carleton.ca;carleton.ca", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\ndavoudi2022toward,\ntitle={Toward Faithful Case-based Reasoning through Learning Prototypes in a Nearest Neighbor-friendly Space.},\nauthor={Seyed Omid Davoudi and Majid Komeili},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=R79ZGjHhv6p}\n}", "github": "", "project": "", "reviewers": "s8S9;NPUz;MCMJ", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;4", "correctness": "4;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "3;1;3", "wc_summary_paper": "118;52;134", "wc_summary_review": "82;41;16", "wc_main_review": "502;259;353", "wc_review": "702;352;503", "wc_reply_reviewers": "99;18;35", "wc_reply_authors": "804;379;518", "reply_reviewers": "2;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 101.33333333333333, 35.490217744549774 ], "wc_summary_review_avg": [ 46.333333333333336, 27.207025236549146 ], "wc_main_review_avg": [ 371.3333333333333, 100.04776636964743 ], "wc_review_avg": [ 519.0, 143.33410852503556 ], "wc_reply_reviewers_avg": [ 50.666666666666664, 34.874377732401506 ], "wc_reply_authors_avg": [ 567.0, 176.93124841775878 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6002618091074316744&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=R79ZGjHhv6p", "email": "carleton.ca;carleton.ca", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carleton University", "aff_unique_dep": "", "aff_unique_url": "https://carleton.ca", "aff_unique_abbr": "Carleton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "R7APxKhg8dt", "title": "CoSe-Co: Text Conditioned Generative CommonSense Contextualizer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pre-trained Language Models (PTLMs) have been shown to perform well on natural language tasks. 
Many prior works have attempted to leverage structured commonsense present in the form of entities linked through labeled relations in Knowledge Graphs (KGs) to assist PTLMs. Retrieval approaches use the KG as a separate static module, which limits coverage since KGs contain finite knowledge. Generative methods train PTLMs on KG triples to scale the knowledge. However, training on symbolic KG entities limits their application in tasks involving natural language text, where they ignore overall context. To mitigate this, we propose a task-agnostic CommonSense Contextualizer (CoSe-Co) conditioned on sentences as input to make it generically usable in NLP tasks for generating contextually relevant knowledge in the form of KG paths. We propose a novel dataset comprising sentence and commonsense path pairs to train CoSe-Co. The knowledge paths inferred by CoSe-Co are diverse, relevant, and contain novel entities not present in the underlying KG. Additionally, we show CoSe-Co can be used for KG completion. We augment the generated knowledge in Multi-Choice QA and Open-ended CommonSense Reasoning tasks, leading to improvements over current best methods (up to ~3% and ~7% respectively) on the CSQA, ARC, QASC and OBQA datasets. Further, improved performance is seen in low training data regimes, which shows that CoSe-Co knowledge helps in generalising better.", "keywords": "Language Model;Commonsense;Knowledge Graph;Task Agnostic;Novel Sentence-to-Path Dataset", "primary_area": "", "supplementary_material": "/attachment/01dcfda05e4bed0567eb8f761d906c836e12a3a5.zip", "author": "Rachit Bansal;Milan Aggarwal;Sumit Bhatia;Jivat Neet Kaur;Balaji Krishnamurthy", "authorids": "~Rachit_Bansal1;~Milan_Aggarwal1;~Sumit_Bhatia1;~Jivat_Neet_Kaur1;~Balaji_Krishnamurthy1", "gender": "M;M;;F;M", "homepage": "https://rachitbansal.github.io;;http://sumitbhatia.net/;https://jivatneet.github.io/;", "dblp": "228/6038;206/6244.html;52/7536;291/2562;79/1076", "google_scholar": "https://scholar.google.co.in/citations?user=7-x28WYAAAAJ;YiMNG_QAAAAJ;8HVTWNkAAAAJ;O43_7KUAAAAJ;n8iUBg8AAAAJ", "orcid": ";;;;0000-0002-0366-2427", "linkedin": ";milan-aggarwal-31a954b5/;;;balaji-krishnamurthy-4241695/", "or_profile": "~Rachit_Bansal1;~Milan_Aggarwal1;~Sumit_Bhatia1;~Jivat_Neet_Kaur1;~Balaji_Krishnamurthy1", "aff": "Delhi Technological University;Adobe Systems;Adobe Systems;Microsoft Research;Adobe Systems", "aff_domain": "dtu.ac.in;adobe.com;adobe.com;microsoft.com;adobe.com", "position": "Undergrad student;Researcher;Senior ML Scientist;Research Fellow;Principal Scientist", "bibtex": "@misc{\nbansal2022coseco,\ntitle={CoSe-Co: Text Conditioned Generative CommonSense Contextualizer},\nauthor={Rachit Bansal and Milan Aggarwal and Sumit Bhatia and Jivat Neet Kaur and Balaji Krishnamurthy},\nyear={2022},\nurl={https://openreview.net/forum?id=R7APxKhg8dt}\n}", "github": "", "project": "", "reviewers": "p1tx;97zu;TrPL;MgN3", "site": "https://openreview.net/forum?id=R7APxKhg8dt", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;1;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "35;67;124;89", "wc_summary_review": "26;58;24;48", "wc_main_review": "226;178;208;433", "wc_review": "287;303;356;570", "wc_reply_reviewers": "179;51;121;182", "wc_reply_authors": "1766;713;1595;2045", "reply_reviewers": "2;1;1;1", "reply_authors": "4;2;3;4", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [
2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.75, 32.42202183701689 ], "wc_summary_review_avg": [ 39.0, 14.45683229480096 ], "wc_main_review_avg": [ 261.25, 100.63144389304965 ], "wc_review_avg": [ 379.0, 113.19231422671771 ], "wc_reply_reviewers_avg": [ 133.25, 53.3496719764986 ], "wc_reply_authors_avg": [ 1529.75, 498.1552845248156 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 0.82915619758885 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11333373138520749443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Delhi Technological University;Adobe;Microsoft", "aff_unique_dep": ";Adobe Systems Incorporated;Microsoft Research", "aff_unique_url": "https://www.dtu.ac.in;https://www.adobe.com;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "DTU;Adobe;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "India;United States" }, { "id": "R7vPG65hcs", "title": "Ambiguity Adaptive Inference and Single-shot based Channel Pruning for Satellite Processing Environments", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In restricted computing environments such as satellite on-board systems, running DL models is limited in high-speed processing because the available power is restricted relative to the models' high computational complexity. In particular, the latest GPU resources offer high computing performance but also consume relatively high power, whereas in restricted environments such as satellite systems, reconfigurable resources like FPGAs or low-power embedded GPUs are generally adopted for their low power consumption relative to their computing capability. In such a constrained computing environment, to overcome the problems of models too large to fit in reconfigurable resources and of limited high-speed processing, we propose a reconfigurable DL accelerating system in which the computational complexity and size of the DL model are compressed by pruning so that the model can be adapted to FPGA or low-power GPU resources. In this paper, we therefore mainly address an ambiguity adaptive inference model that can directly enhance overall accuracy at the inference step for mission-critical tasks, and a new single-shot based channel pruning method that can accelerate DL model inference by compressing the model as much as possible while maintaining accuracy under constrained accelerator resources.
In the experimental evaluation, using a satellite image analysis model as an example application, our method achieves up to 8.53x compression while maintaining accuracy, and we verified that it can deploy and accelerate a DL model with high computational complexity on FPGA/GPU resources.", "keywords": "ambiguity;inference;single-shot channel pruning;satellite on-board processing", "primary_area": "", "supplementary_material": "/attachment/23aee5c8193cb669fda47b9af0424f21901d1762.zip", "author": "Minsu Jeon;Kyungno Joo;Changha Lee;Taewoo Kim;SeongHwan Kim;Chan-Hyun Youn", "authorids": "~Minsu_Jeon1;~Kyungno_Joo1;~Changha_Lee1;~Taewoo_Kim1;~SeongHwan_Kim1;~Chan-Hyun_Youn1", "gender": ";;;M;M;M", "homepage": ";;;;http://ncl.kaist.ac.kr;http://ncl.kaist.ac.kr", "dblp": "147/5998;;260/4493;16/2599;;31/5293", "google_scholar": "https://scholar.google.co.kr/citations?user=UnD9JKAAAAAJ;;;https://scholar.google.co.kr/citations?user=ei4FECsAAAAJ;;https://scholar.google.co.kr/scholar?q=chan-hyun+youn", "orcid": ";0000-0001-7339-1645;0000-0003-3687-2989;;;0000-0002-3970-7308", "linkedin": ";;;;;", "or_profile": "~Minsu_Jeon1;~Kyungno_Joo1;~Changha_Lee1;~Taewoo_Kim1;~SeongHwan_Kim1;~Chan-Hyun_Youn1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@misc{\njeon2022ambiguity,\ntitle={Ambiguity Adaptive Inference and Single-shot based Channel Pruning for Satellite Processing Environments},\nauthor={Minsu Jeon and Kyungno Joo and Changha Lee and Taewoo Kim and SeongHwan Kim and Chan-Hyun Youn},\nyear={2022},\nurl={https://openreview.net/forum?id=R7vPG65hcs}\n}", "github": "", "project": "", "reviewers": "kAcL;HQor;NsRi", "site": "https://openreview.net/forum?id=R7vPG65hcs", "pdf_size": 0, "recommendation": "1;1;6", "confidence": "4;4;3", "correctness": "1;1;3", "technical_novelty": "3;2;3", "empirical_novelty": "1;1;3", "wc_summary_paper": "147;40;53", "wc_summary_review": "12;29;53", "wc_main_review": "481;476;144", "wc_review": "640;545;250", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.6666666666666665, 2.3570226039551585 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 1.6666666666666667, 0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "wc_summary_paper_avg": [ 80.0, 47.672493816315786 ], "wc_summary_review_avg": [ 31.333333333333332, 16.81930108205715 ], "wc_main_review_avg": [ 367.0, 157.69802366125793 ], "wc_review_avg": [ 478.3333333333333, 166.04885492595912 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:6riZqqrwrjUJ:scholar.google.com/&scioq=Ambiguity+Adaptive+Inference+and+Single-shot+based+Channel+Pruning+for+Satellite+Processing+Environments&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6261", "id": "R8sQPpGCv0", "poster": "", "openreview": "https://openreview.net/forum?id=R8sQPpGCv0", "slides": "https://iclr.cc/virtual/2022/poster/6261", "video": "https://iclr.cc/virtual/2022/poster/6261", "author_site": "Ofir Press, Noah Smith, Mike Lewis", "tldr": "", "abstract": "Since the introduction of the transformer model by Vaswani et al. (2017), a fundamental question has yet to be answered: how does a model achieve extrapolation at inference time for sequences that are longer than it saw during training? We first show that extrapolation can be enabled by simply changing the position representation method, though we find that current methods do not allow for efficient extrapolation. We therefore introduce a simpler and more efficient position method, Attention with Linear Biases (ALiBi). ALiBi does not add positional embeddings to word embeddings; instead, it biases query-key attention scores with a penalty that is proportional to their distance. We show that this method trains a 1.3 billion parameter model on input sequences of length 1024 that extrapolates to input sequences of length 2048, achieving the same perplexity as a sinusoidal position embedding model trained on inputs of length 2048 but training 11% faster and using 11% less memory. 
ALiBi's inductive bias towards recency also leads it to outperform multiple strong position methods on the WikiText-103 benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/b12fdf10bb64068446f8e39dc39fb169ebbc4ca5.zip", "author": "Ofir Press;Noah Smith;Mike Lewis", "authorids": "~Ofir_Press1;~Noah_Smith1;~Mike_Lewis1", "gender": "M;M;M", "homepage": "https://ofir.io/about;https://homes.cs.washington.edu/~nasmith/;", "dblp": "185/0577;90/5204.html;19/6214", "google_scholar": "LeHa8psAAAAJ;https://scholar.google.com/citations?hl=en;SnQnQicAAAAJ", "orcid": ";0000-0002-2310-6380;", "linkedin": ";;", "or_profile": "~Ofir_Press1;~Noah_Smith1;~Mike_Lewis1", "aff": "Meta Facebook;Allen Institute for Artificial Intelligence;Facebook AI Research", "aff_domain": "facebook.com;allenai.org;fb.com", "position": "Visiting Researcher;Senior Director of NLP Research;Research Scientist", "bibtex": "@inproceedings{\npress2022train,\ntitle={Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},\nauthor={Ofir Press and Noah Smith and Mike Lewis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=R8sQPpGCv0}\n}", "github": "", "project": "", "reviewers": "ZAja;xt6W;6wKQ;dz7S", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;5;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "81;76;77;90", "wc_summary_review": "38;28;55;47", "wc_main_review": "307;114;356;140", "wc_review": "426;218;488;277", "wc_reply_reviewers": "13;0;10;56", "wc_reply_authors": "441;461;661;303", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 81.0, 5.522680508593631 ], "wc_summary_review_avg": [ 42.0, 10.074720839804943 ], "wc_main_review_avg": [ 229.25, 104.11381992799996 ], "wc_review_avg": [ 352.25, 109.03296519860405 ], "wc_reply_reviewers_avg": [ 19.75, 21.47527648250425 ], "wc_reply_authors_avg": [ 466.5, 127.71354665813647 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 757, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3347460907170213441&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=R8sQPpGCv0", "email": "facebook.com;allenai.org;fb.com", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Meta;Allen Institute for Artificial Intelligence", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://allenai.org", "aff_unique_abbr": "Meta;AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "R9Ht8RZK3qY", "title": "FED-$\\chi^2$: Secure Federated Correlation Test", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose the first secure federated $\\chi^2$-test protocol, FED-$\\chi^2$. 
We recast $\\chi^2$-test as a problem of the second moment estimation and use stable projection to encode the local information in a short vector. Due to the fact that such encodings can be aggregated with summation, secure aggregation can smoothly be applied to conceal the individual updates. We formally establish the security guarantee of FED-$\\chi^2$ by demonstrating that the joint distribution is hidden in a subspace containing exponentially possible distributions. Our evaluation results show that FED-$\\chi^2$ achieves good accuracy with small client-side computation overhead. FED-$\\chi^2$ performs comparably to the centralized $\\chi^2$-test in several real-world case studies. The code for evaluation is in the supplementary material.", "keywords": "Federated Analytics;Hypothesis Test", "primary_area": "", "supplementary_material": "/attachment/de4f8932c1ad0efab8da7270f9ccdfb8fb22fa3f.zip", "author": "Lun Wang;Qi Pang;Shuai Wang;Dawn Song", "authorids": "~Lun_Wang1;~Qi_Pang1;~Shuai_Wang7;~Dawn_Song2", "gender": ";;M;F", "homepage": "https://wanglun1996.github.io/;;https://home.cse.ust.hk/~shuaiw/;http://people.eecs.berkeley.edu/~dawnsong/", "dblp": ";;42/1503-11;", "google_scholar": ";;;84WzBlYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Lun_Wang1;~Qi_Pang1;~Shuai_Wang7;~Dawn_Song2", "aff": "University of California, Berkeley;;;University of California, Berkeley", "aff_domain": "berkeley.edu;;;berkeley.edu", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\nwang2022fedchi,\ntitle={{FED}-\\${\\textbackslash}chi{\\textasciicircum}2\\$: Secure Federated Correlation Test},\nauthor={Lun Wang and Qi Pang and Shuai Wang and Dawn Song},\nyear={2022},\nurl={https://openreview.net/forum?id=R9Ht8RZK3qY}\n}", "github": "", "project": "", "reviewers": "GEWR;DQ1G;sPzq;UTuW", "site": "https://openreview.net/forum?id=R9Ht8RZK3qY", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "2;2;2;2", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;0;3", "wc_summary_paper": "54;96;44;14", "wc_summary_review": "113;36;28;10", "wc_main_review": "108;209;182;103", "wc_review": "275;341;254;127", "wc_reply_reviewers": "0;84;0;13", "wc_reply_authors": "589;819;274;363", "reply_reviewers": "0;1;0;1", "reply_authors": "2;3;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 2.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 52.0, 29.359836511806396 ], "wc_summary_review_avg": [ 46.75, 39.39146481155531 ], "wc_main_review_avg": [ 150.5, 46.03531253288067 ], "wc_review_avg": [ 249.25, 77.53829698929427 ], "wc_reply_reviewers_avg": [ 24.25, 34.90254288730264 ], "wc_reply_authors_avg": [ 511.25, 211.5542188187227 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HCQiIyFZqs4J:scholar.google.com/&scioq=FED-%24%5Cchi%5E2%24:+Secure+Federated+Correlation+Test&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": 
"Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "RA-zVvZLYIy", "title": "MLP-based architecture with variable length input for automatic speech recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose multi-layer perceptron (MLP)-based architectures suitable for variable length input.\nRecently, several such architectures that do not rely on self-attention have been proposed for image classification. \nThey achieve performance competitive with that of transformer-based architectures, albeit with a simpler structure and low computational cost. \nThey split an image into patches and mix information by applying MLPs within and across patches alternately. \nDue to the use of MLPs, one such model can only be used for inputs of a fixed, pre-defined size.\nHowever, many types of data are naturally variable in length, for example acoustic signals. \nWe propose three approaches to extend MLP-based architectures for use with sequences of arbitrary length. \nIn all of them, we start by splitting the signal into contiguous tokens of fixed size (equivalent to patches in images). \nNaturally, the number of tokens is variable. \nThe two first approaches use a gating mechanism that mixes local information across tokens in a shift-invariant and length-agnostic way.\nOne uses a depthwise convolution to derive the gate values, while the other rely on shifting tokens.\nThe final approach explores non-gated mixing using a circular convolution applied in the Fourier domain.\nWe evaluate the proposed architectures on an automatic speech recognition task with the Librispeech and Tedlium2 corpora. Compared to Transformer, our proposed architecture reduces the WER by \\SI{1.9 / 3.4}{\\percent} on Librispeech test-clean/test-other set, and 1.8 / 1.6 % on Tedlium2 dev/test set, using only 75.3 % of the parameters.", "keywords": "MLP;automatic speech recognition", "primary_area": "", "supplementary_material": "/attachment/8a70b35ebc8b5e00a7daaf94b0ce8d7c08b15a53.zip", "author": "Jin Sakuma;Tatsuya Komatsu;Robin Scheibler", "authorids": "~Jin_Sakuma1;~Tatsuya_Komatsu1;~Robin_Scheibler1", "gender": "M;M;M", "homepage": "https://github.com/JinSakuma;;http://www.robinscheibler.org", "dblp": ";;", "google_scholar": ";https://scholar.google.co.jp/citations?user=o2lMlxMAAAAJ;-Oed5gEAAAAJ", "orcid": ";;0000-0002-5205-8365", "linkedin": ";;", "or_profile": "~Jin_Sakuma1;~Tatsuya_Komatsu1;~Robin_Scheibler1", "aff": "Waseda University;LINE;LINE Corporation", "aff_domain": "pcl.cs.waseda.ac.jp;linecorp.com;linecorp.com", "position": "MS student;Researcher;Researcher", "bibtex": "@misc{\nsakuma2022mlpbased,\ntitle={{MLP}-based architecture with variable length input for automatic speech recognition},\nauthor={Jin Sakuma and Tatsuya Komatsu and Robin Scheibler},\nyear={2022},\nurl={https://openreview.net/forum?id=RA-zVvZLYIy}\n}", "github": "", "project": "", "reviewers": "h5Ds;4cme;uaQ8;vrtS", "site": "https://openreview.net/forum?id=RA-zVvZLYIy", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;2;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "151;80;31;76", "wc_summary_review": "296;37;37;63", "wc_main_review": "95;133;76;380", "wc_review": "542;250;144;519", "wc_reply_reviewers": "0;0;21;145", "wc_reply_authors": "614;468;329;544", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 
0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.5, 42.94473192371795 ], "wc_summary_review_avg": [ 108.25, 108.91596531271253 ], "wc_main_review_avg": [ 171.0, 122.3989379038887 ], "wc_review_avg": [ 363.75, 171.10285649281252 ], "wc_reply_reviewers_avg": [ 41.5, 60.36762377301263 ], "wc_reply_authors_avg": [ 488.75, 105.70093424374261 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8660254037844386, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4332447671478582036&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;2", "aff_unique_norm": "Waseda University;;LINE Corporation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.waseda.jp/top;;https://www.linecorp.com", "aff_unique_abbr": "Waseda;;LINE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan;" }, { "title": "Zero-CL: Instance and Feature decorrelation for negative-free symmetric contrastive learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5905", "id": "RAW9tCdVxLj", "poster": "", "openreview": "https://openreview.net/forum?id=RAW9tCdVxLj", "slides": "https://iclr.cc/virtual/2022/poster/5905", "video": "https://iclr.cc/virtual/2022/poster/5905", "author_site": "Shaofeng Zhang, Feng Zhu, Junchi Yan, Rui Zhao, Xiaokang Yang", "tldr": "", "abstract": "For self-supervised contrastive learning, models can easily collapse and generate trivial constant solutions. The issue has been mitigated by recent improvements in objective design, which however often require quadratic complexity either in the number of instances ($\mathcal{O}(N^{2})$) or in the feature dimension ($\mathcal{O}(d^{2})$). To prevent such collapse, we develop two novel methods that decorrelate along different dimensions of the instance embedding stacking matrix, i.e., \textbf{I}nstance-wise (ICL) and \textbf{F}eature-wise (FCL) \textbf{C}ontrastive \textbf{L}earning. The two proposed methods (FCL, ICL) can be combined, yielding Zero-CL, where ``Zero'' means that negative samples have \textbf{zero} relevance, which allows Zero-CL to completely discard negative pairs, i.e., to use \textbf{zero} negative samples. Compared with previous methods, Zero-CL mainly enjoys three advantages: 1) It is negative-free in a symmetric architecture. 2) Through a whitening transformation, the correlation between different features is zero, alleviating information redundancy. 3) Zero-CL retains the original information to a great extent after transformation, which improves accuracy compared with other whitening transformation techniques.
Extensive experimental results on CIFAR-10/100 and ImageNet show that Zero-CL outperforms or is on par with state-of-the-art symmetric contrastive learning methods.", "keywords": "Self supervised learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Shaofeng Zhang;Feng Zhu;Junchi Yan;Rui Zhao;Xiaokang Yang", "authorids": "~Shaofeng_Zhang1;~Feng_Zhu1;~Junchi_Yan2;~Rui_Zhao6;~Xiaokang_Yang1", "gender": "M;M;M;M;M", "homepage": "https://sherrylone.github.io;http://home.ustc.edu.cn/~zhufengx/;http://zhaorui.xyz/;https://icne.sjtu.edu.cn/info/1064/1078.htm;http://thinklab.sjtu.edu.cn/", "dblp": "132/2540;71/2791-6;26/2578-1;06/3071-1.html;60/7949.html", "google_scholar": "VoVVJIgAAAAJ;oO53gjEAAAAJ;1c9oQNMAAAAJ;yDEavdMAAAAJ;ga230VoAAAAJ", "orcid": ";;;0000-0003-4029-3322;0000-0001-9639-7679", "linkedin": ";;;;", "or_profile": "~Shaofeng_Zhang1;~Feng_Zhu1;~Rui_Zhao6;~Xiaokang_Yang1;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;SenseTime Group LTD;SenseTime Research;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sensetime.com;sensetime.com;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Researcher;Researcher;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2022zerocl,\ntitle={Zero-{CL}: Instance and Feature decorrelation for negative-free symmetric contrastive learning},\nauthor={Shaofeng Zhang and Feng Zhu and Junchi Yan and Rui Zhao and Xiaokang Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RAW9tCdVxLj}\n}", "github": "", "project": "", "reviewers": "1y87;kq6q;AaoY;GfmZ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;2;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "53;31;65;87", "wc_summary_review": "72;4;28;63", "wc_main_review": "264;92;130;165", "wc_review": "389;127;223;315", "wc_reply_reviewers": "0;30;0;0", "wc_reply_authors": "713;419;331;277", "reply_reviewers": "0;2;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 59.0, 20.248456731316587 ], "wc_summary_review_avg": [ 41.75, 27.2981226460722 ], "wc_main_review_avg": [ 162.75, 63.90373619750257 ], "wc_review_avg": [ 263.5, 98.32980219648567 ], "wc_reply_reviewers_avg": [ 7.5, 12.99038105676658 ], "wc_reply_authors_avg": [ 435.0, 168.31518053936787 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3458572319330373, "corr_recommendation_correctness": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4345559874982229274&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=RAW9tCdVxLj", "email": "sjtu.edu.cn;sensetime.com;sensetime.com;sjtu.edu.cn;sjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;SenseTime Group;SenseTime", "aff_unique_dep": ";;SenseTime Research", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.sensetime.com;https://www.sensetime.com", "aff_unique_abbr": "SJTU;SenseTime;SenseTime", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "RAoBtzlwtCC", "title": "Provable Federated Adversarial Learning via Min-max Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) is a trending training paradigm to utilize decentralized training data. FL allows clients to update model parameters locally for several epochs, then share them to a global model for aggregation. This training paradigm with multi-local step updating before aggregation exposes unique vulnerabilities to adversarial attacks. Adversarial training is a trending method to improve the robustness of neural networks against adversarial perturbations. First, we formulate a \\textit{general} form of federated adversarial learning (FAL) that is adapted from adversarial learning in the centralized setting. On the client side of FL training, FAL has an inner loop to optimize an adversarial to generate adversarial samples for adversarial training and an outer loop to update local model parameters. On the server side, FAL aggregates local model updates and broadcast the aggregated model. We design a global training loss to formulate FAL training as a min-max optimization problem. Unlike the convergence analysis in centralized training that relies on the gradient direction, it is significantly harder to analyze the convergence in FAL for two reasons: 1) the complexity of min-max optimization, and 2) model not updating in the gradient direction due to the multi-local updates on the client-side before aggregation. Further, we address the challenges using appropriate gradient approximation and coupling techniques and present the convergence analysis in the over-parameterized regime. Our main result theoretically shows that the minimal value of loss function under this algorithm can converge to $\\epsilon$ small with chosen learning rate and communication rounds. 
Notably, our analysis is feasible for non-IID clients.", "keywords": "Federated Learning;Adversarial Training;Optimization;Non-Convex", "primary_area": "", "supplementary_material": "", "author": "Xiaoxiao Li;Zhao Song;Jiaming Yang", "authorids": "~Xiaoxiao_Li1;~Zhao_Song6;~Jiaming_Yang1", "gender": "Unspecified;M;M", "homepage": "https://xxlya.github.io/;;https://www.youtube.com/@zhaosong2031", "dblp": "71/8042;;76/4051-2", "google_scholar": "sdENOQ4AAAAJ;QBsx7kAAAAAJ;yDZct7UAAAAJ", "orcid": ";0009-0005-2150-3453;", "linkedin": ";%E4%BD%B3%E6%98%8E-%E6%9D%A8-737140212/;", "or_profile": "~Xiaoxiao_Li1;~Jiaming_Yang1;~Zhao_Song3", "aff": "University of British Columbia;Peking University;Adobe", "aff_domain": "ece.ubc.ca;pku.edu.cn;adobe.com", "position": "Assistant Professor;Undergrad student;Researcher", "bibtex": "@misc{\nli2022provable,\ntitle={Provable Federated Adversarial Learning via Min-max Optimization},\nauthor={Xiaoxiao Li and Zhao Song and Jiaming Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=RAoBtzlwtCC}\n}", "github": "", "project": "", "reviewers": "vGrK;Pwgr;e6rQ;Gk6a;KhJu", "site": "https://openreview.net/forum?id=RAoBtzlwtCC", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "3;3;3;3;3", "correctness": "3;3;4;3;4", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "0;2;0;0;0", "wc_summary_paper": "114;28;38;61;149", "wc_summary_review": "53;18;40;39;56", "wc_main_review": "311;135;306;212;278", "wc_review": "478;181;384;312;483", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "128;185;258;240;415", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 0.4, 0.8000000000000002 ], "wc_summary_paper_avg": [ 78.0, 46.31630382489518 ], "wc_summary_review_avg": [ 41.2, 13.43726162579266 ], "wc_main_review_avg": [ 248.4, 66.77903862740163 ], "wc_review_avg": [ 367.6, 112.85849547109866 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 245.2, 96.31490019721768 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.16666666666666663, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-HZerV0ZeJAJ:scholar.google.com/&scioq=Provable+Federated+Adversarial+Learning+via+Min-max+Optimization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of British Columbia;Peking University;Adobe", "aff_unique_dep": ";;Adobe Inc.", "aff_unique_url": "https://www.ubc.ca;http://www.pku.edu.cn;https://www.adobe.com", "aff_unique_abbr": "UBC;Peking U;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Canada;China;United States" }, { "id": "RB_2cor6d-w", "title": "Towards Physical, Imperceptible Adversarial Attacks via Adversarial Programs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial examples were originally defined as imperceptible perturbations which cause a deep neural network to misclassify. However, the majority of imperceptible perturbation attacks require perturbing a large number of pixels across the image and are thus hard to execute in the physical world.
Existing physical attacks rely on physical objects, such as patches/stickers or 3D-printed objects. Producing adversarial patches is arguably easier than 3D-printing, but these attacks incur highly visible perturbations. This raises the question: is it possible to generate adversarial examples with imperceptible patches? In this work, we consider adversarial multi-patch attacks, where the goal is to compute a targeted attack consisting of up to K patches with minimal L2 distortion. Each patch is associated with dimensions, position, and perturbation parameters. We leverage ideas from program synthesis and numerical optimization to search in this large, discrete space and obtain attacks that are competitive with the C&W attack but have at least 3x and up to 10x fewer perturbed pixels. We evaluate our approach on MNIST, Fashion-MNIST, CIFAR-10, and ImageNet and obtain a success rate of at least 92% and up to 100% with at most ten patches. For MNIST, Fashion-MNIST, and CIFAR-10, the average L2 distortion is greater than the average L2 distortion of the C&W attack by up to 1.2.", "keywords": "Adversarial Attacks;Adversarial Patches;Adversarial Programs;Program Synthesis", "primary_area": "", "supplementary_material": "", "author": "Itai Mesery;Dana Drachsler Cohen", "authorids": "~Itai_Mesery1;~Dana_Drachsler_Cohen1", "gender": "M;F", "homepage": ";https://ddana.net.technion.ac.il/", "dblp": ";155/1628", "google_scholar": ";https://scholar.google.ch/citations?user=XOiO5xgAAAAJ", "orcid": ";", "linkedin": "itai-mesery-7769928a/;", "or_profile": "~Itai_Mesery1;~Dana_Drachsler_Cohen1", "aff": ";Technion, Technion", "aff_domain": ";technion.ac.il", "position": ";Assistant Professor", "bibtex": "@misc{\nmesery2022towards,\ntitle={Towards Physical, Imperceptible Adversarial Attacks via Adversarial Programs},\nauthor={Itai Mesery and Dana Drachsler Cohen},\nyear={2022},\nurl={https://openreview.net/forum?id=RB_2cor6d-w}\n}", "github": "", "project": "", "reviewers": "tecw;3vSm;Uj1D;xXLy;H5Ak", "site": "https://openreview.net/forum?id=RB_2cor6d-w", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "5;4;4;5;4", "correctness": "3;3;3;4;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "1;1;2;2;3", "wc_summary_paper": "101;49;80;43;73", "wc_summary_review": "161;15;53;76;34", "wc_main_review": "916;353;289;92;271", "wc_review": "1178;417;422;211;378", "wc_reply_reviewers": "515;0;37;0;43", "wc_reply_authors": "1706;610;532;147;346", "reply_reviewers": "1;0;1;0;1", "reply_authors": "3;1;2;1;2", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 1.8, 0.7483314773547883 ], "wc_summary_paper_avg": [ 69.2, 21.15088650624366 ], "wc_summary_review_avg": [ 67.8, 50.799212592322725 ], "wc_main_review_avg": [ 384.2, 279.6579339121277 ], "wc_review_avg": [ 521.2, 337.28765171586105 ], "wc_reply_reviewers_avg": [ 119.0, 198.81549235409196 ], "wc_reply_authors_avg": [ 668.2, 543.0338479321524 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.06019292654288455, "corr_recommendation_correctness": 0.5160468465421401, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:RAe4ECsK2u4J:scholar.google.com/&scioq=Towards+Physical,+Imperceptible+Adversarial+Attacks+via+Adversarial+Programs&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "title": "Offline Reinforcement Learning with Value-based Episodic Memory", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6918", "id": "RCZqv9NXlZ", "poster": "", "openreview": "https://openreview.net/forum?id=RCZqv9NXlZ", "slides": "https://iclr.cc/virtual/2022/poster/6918", "video": "https://iclr.cc/virtual/2022/poster/6918", "author_site": "Xiaoteng Ma, Yiqin Yang, Hao Hu, Jun Yang, Chongjie Zhang, Qianchuan Zhao, Bin Liang, Qihan Liu", "tldr": "", "abstract": "Offline reinforcement learning (RL) shows promise of applying RL to real-world problems by effectively utilizing previously collected data. Most existing offline RL algorithms use regularization or constraints to suppress extrapolation error for actions outside the dataset. In this paper, we adopt a different framework, which learns the V-function instead of the Q-function to naturally keep the learning procedure within the support of an offline dataset. To enable effective generalization while maintaining proper conservatism in offline learning, we propose Expectile V-Learning (EVL), which smoothly interpolates between the optimal value learning and behavior cloning. Further, we introduce implicit planning along offline trajectories to enhance learned V-values and accelerate convergence. Together, we present a new offline method called Value-based Episodic Memory (VEM). 
We provide theoretical analysis for the convergence properties of our proposed VEM method, and empirical results in the D4RL benchmark show that our method achieves superior performance in most tasks, particularly in sparse-reward tasks.", "keywords": "Reinforcement Learning;Offline Learning;Episodic Memory Control", "primary_area": "", "supplementary_material": "/attachment/1ff950cdd5225dd590f7f6d96f3e626049fc9cbb.zip", "author": "Xiaoteng Ma;Yiqin Yang;Hao Hu;Jun Yang;Chongjie Zhang;Qianchuan Zhao;Bin Liang;Qihan Liu", "authorids": "~Xiaoteng_Ma1;~Yiqin_Yang1;~Hao_Hu3;~Jun_Yang6;~Chongjie_Zhang1;~Qianchuan_Zhao1;~Bin_Liang5;~Qihan_Liu1", "gender": "M;M;M;M;;M;M;M", "homepage": "https://xtma.github.io/;https://www.researchgate.net/profile/Yiqin-Yang-2;https://mousehu.github.io;;;;;https://github.com/liuqh16", "dblp": "238/3249;180/7725;67/6924-6;;29/6693;82/3427;;", "google_scholar": "CeDFnNMAAAAJ;aHTi5IEAAAAJ;https://scholar.google.com/citations?hl=en;ZrgN9ssAAAAJ;LjxqXycAAAAJ;;;a3J4_OQAAAAJ", "orcid": "0000-0002-7250-6268;;;;;0000-0002-7952-5621;0000-0002-7163-345X;0000-0001-6637-8346", "linkedin": ";;hao-hu-tsinghua;;;;;", "or_profile": "~Xiaoteng_Ma1;~Yiqin_Yang1;~Hao_Hu3;~Jun_Yang6;~Chongjie_Zhang1;~Qianchuan_Zhao1;~Bin_Liang5;~Qihan_Liu1", "aff": "Department of Automation, Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor;Full Professor;;PhD student", "bibtex": "@inproceedings{\nma2022offline,\ntitle={Offline Reinforcement Learning with Value-based Episodic Memory},\nauthor={Xiaoteng Ma and Yiqin Yang and Hao Hu and Jun Yang and Chongjie Zhang and Qianchuan Zhao and Bin Liang and Qihan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RCZqv9NXlZ}\n}", "github": "", "project": "", "reviewers": "aRTY;y86L;M22A;3Tyw;yPrQ;FzBf", "pdf_size": 0, "recommendation": "5;6;6;8;8;8", "confidence": "3;5;3;4;4;4", "correctness": "4;3;3;4;4;3", "technical_novelty": "3;3;2;3;3;2", "empirical_novelty": "3;3;3;2;3;0", "wc_summary_paper": "52;93;98;108;55;83", "wc_summary_review": "2;56;53;16;47;45", "wc_main_review": "338;358;255;134;117;676", "wc_review": "392;507;406;258;219;804", "wc_reply_reviewers": "0;0;25;21;0;52", "wc_reply_authors": "641;1529;1055;904;865;1326", "reply_reviewers": "0;0;1;1;0;1", "reply_authors": "3;6;5;4;4;5", "recommendation_avg": [ 6.833333333333333, 1.2133516482134197 ], "confidence_avg": [ 3.8333333333333335, 0.6871842709362768 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.1055415967851334 ], "wc_summary_paper_avg": [ 81.5, 21.140403654298247 ], "wc_summary_review_avg": [ 36.5, 20.1886932051912 ], "wc_main_review_avg": [ 313.0, 186.29725351348222 ], "wc_review_avg": [ 431.0, 192.32091236611097 ], "wc_reply_reviewers_avg": [ 16.333333333333332, 19.014614262602215 ], "wc_reply_authors_avg": [ 1053.3333333333333, 296.75953602575646 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 4.5, 0.9574271077563381 ], "replies_avg": [ 38, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.3664631325590232, "corr_recommendation_correctness": 0.13736056394868904, 
"gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5999497439861071363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=RCZqv9NXlZ", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Automation", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "RCyHECZIUFb", "title": "Explaining Knowledge Graph Embedding via Latent Rule Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Knowledge Graph Embeddings (KGEs) embed entities and relations into continuous vector space following certain assumption, and are a powerful tools for representation learning of knowledge graphs. However, following vector space assumptions makes KGE a one step reasoner that directly predict final results without reasonable multi-hop reasoning steps. Thus KGEs are black-box models and explaining predictions made by KGEs remains unsolved. In this paper, we propose KGExplainer, the first general approach of providing explanations for predictions from KGE models. KGExplainer is a multi-hop reasoner learning latent rules for link prediction and is encouraged to behave similarly to KGEs during prediction through knowledge distillation. For explanation, KGExplainer outputs a ranked list of rules for each relation. Experiments on benchmark datasets with two target KGEs show that our approach is faithfulness to replicate KGEs behaviors for link prediction and is good at outputting quality rules for effective explanations. 
", "keywords": "knowledge graph embedding;explainable AI;rule learning", "primary_area": "", "supplementary_material": "/attachment/47904194644f60079e3cccfe24403d058a765850.zip", "author": "Wen Zhang;Mingyang Chen;Zezhong Xu;Yushan Zhu;Huajun Chen", "authorids": "~Wen_Zhang4;~Mingyang_Chen3;xuzezhong@zju.edu.cn;yushanzhu@zju.edu.cn;~Huajun_Chen1", "gender": ";M;;;M", "homepage": "https://person.zju.edu.cn/en/wenzhang;;;;", "dblp": "43/2368-15;;;;94/5089", "google_scholar": "Ig9ho4kAAAAJ;ofNVwaQAAAAJ;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Wen_Zhang4;~Mingyang_Chen3;xuzezhong@zju.edu.cn;yushanzhu@zju.edu.cn;~Huajun_Chen1", "aff": "Zhejiang University;Zhejiang University;;;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;;;zju.edu.cn", "position": "Assistant Professor;PhD student;;;Full Professor", "bibtex": "@misc{\nzhang2022explaining,\ntitle={Explaining Knowledge Graph Embedding via Latent Rule Learning},\nauthor={Wen Zhang and Mingyang Chen and Zezhong Xu and Yushan Zhu and Huajun Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=RCyHECZIUFb}\n}", "github": "", "project": "", "reviewers": "My2W;19wV;B2GE;3my2", "site": "https://openreview.net/forum?id=RCyHECZIUFb", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;5;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "295;115;37;112", "wc_summary_review": "118;84;66;48", "wc_main_review": "125;440;289;810", "wc_review": "538;639;392;970", "wc_reply_reviewers": "0;0;81;0", "wc_reply_authors": "104;417;237;610", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 139.75, 94.92464116339866 ], "wc_summary_review_avg": [ 79.0, 25.865034312755125 ], "wc_main_review_avg": [ 416.0, 253.28936021870322 ], "wc_review_avg": [ 634.75, 212.54337792554253 ], "wc_reply_reviewers_avg": [ 20.25, 35.074028853269766 ], "wc_reply_authors_avg": [ 342.0, 190.47178268709516 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5431308019003235075&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Learning Temporally Causal Latent Processes from General Temporal Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6675", "id": "RDlLMjLJXdq", "poster": "", "openreview": "https://openreview.net/forum?id=RDlLMjLJXdq", "slides": "https://iclr.cc/virtual/2022/poster/6675", "video": "https://iclr.cc/virtual/2022/poster/6675", "author_site": "Weiran Yao, Yuewen Sun, Alex Ho, Changyin Sun, Kun Zhang", "tldr": "", "abstract": "Our goal is to recover time-delayed latent causal variables and identify their relations from measured temporal data. 
Estimating causally-related latent variables from observations is particularly challenging as the latent variables are not uniquely recoverable in the most general case. In this work, we consider both a nonparametric, nonstationary setting and a parametric setting for the latent processes and propose two provable conditions under which temporally causal latent processes can be identified from their nonlinear mixtures. We propose LEAP, a theoretically-grounded framework that extends Variational AutoEncoders (VAEs) by enforcing our conditions through proper constraints in causal process prior. Experimental results on various datasets demonstrate that temporally causal latent processes are reliably identified from observed variables under different dependency structures and that our approach considerably outperforms baselines that do not properly leverage history or nonstationarity information. This demonstrates that using temporal information to learn latent processes from their invertible nonlinear mixtures in an unsupervised manner, for which we believe our work is one of the first, seems promising even without sparsity or minimality assumptions. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/791e0b61918749998721915927bec7be836034c7.zip", "author": "Weiran Yao;Yuewen Sun;Alex Ho;Changyin Sun;Kun Zhang", "authorids": "~Weiran_Yao1;~Yuewen_Sun1;~Alex_Ho1;~Changyin_Sun1;~Kun_Zhang1", "gender": "M;F;M;M;M", "homepage": ";https://yuewen-sun.github.io/;;;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "192/3295;219/9893;;64/221;96/3115-1", "google_scholar": "rr_leUAAAAAJ;https://scholar.google.com/citations?hl=en;Qyap3iQAAAAJ;;RGoypN4AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Weiran_Yao1;~Yuewen_Sun1;~Alex_Ho1;~Changyin_Sun1;~Kun_Zhang1", "aff": "Carnegie Mellon University;Southeast University;Rice University;Southeast University;Carnegie Mellon University", "aff_domain": "cmu.edu;seu.edu.cn;rice.edu;seu.edu.cn;cmu.edu", "position": "PhD student;PhD student;Undergrad student;Professor;Associate Professor", "bibtex": "@inproceedings{\nyao2022learning,\ntitle={Learning Temporally Causal Latent Processes from General Temporal Data},\nauthor={Weiran Yao and Yuewen Sun and Alex Ho and Changyin Sun and Kun Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RDlLMjLJXdq}\n}", "github": "", "project": "", "reviewers": "x73T;hJy9;Qhdx;7eEa", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "2;3;3;3", "correctness": "3;4;4;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "102;58;35;190", "wc_summary_review": "63;72;64;71", "wc_main_review": "148;424;223;411", "wc_review": "313;554;322;672", "wc_reply_reviewers": "13;25;0;22", "wc_reply_authors": "429;1831;2482;1746", "reply_reviewers": "1;1;0;1", "reply_authors": "3;4;5;4", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.25, 59.23839548806163 ], "wc_summary_review_avg": [ 67.5, 4.031128874149275 ], "wc_main_review_avg": [ 301.5, 119.08085488440197 ], "wc_review_avg": [ 465.25, 153.5600452591754 ], "wc_reply_reviewers_avg": [ 15.0, 9.72111104761179 ], "wc_reply_authors_avg": [ 1622.0, 745.3029585343131 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], 
"reply_authors_avg": [ 4.0, 0.7071067811865476 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14364754714073733596&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=RDlLMjLJXdq", "email": "cmu.edu;seu.edu.cn;rice.edu;seu.edu.cn;cmu.edu", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Carnegie Mellon University;Southeast University;Rice University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.seu.edu.cn/;https://www.rice.edu", "aff_unique_abbr": "CMU;SEU;Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "United States;China" }, { "id": "RFGkzxMFqby", "title": "Adversarially Trained Models with Test-Time Covariate Shift Adaptation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Existing defense models against adversarial examples typically provide either empirical or certified robustness. Adversarially trained models empirically demonstrate state-of-the-art defense while providing no robustness guarantees for large classifiers or higher-dimensional inputs.\nIn contrast, a randomized smoothing framework provides state-of-the-art certification while significantly degrades the empirical performance against adversarial attacks. \nIn this work, we propose a novel \\textit{certification through adaptation} technique that transforms an adversarially trained model into a randomized smoothing classifier during inference to provide certified robustness for $\\ell_2$ norm without affecting their empirical robustness against adversarial attacks. One advantage of our proposed technique is that it allows us to separately choose the appropriate noise level for certifying each test example during inference. It also leads to outperform the existing randomized smoothing models for $\\ell_2$ certification on CIFAR-10. 
Therefore, our work is a step towards bridging the gap between the empirical and certified robustness against adversarial examples by achieving both using the same classifier for the first time.", "keywords": "Adversarial Training;Certified Robustness", "primary_area": "", "supplementary_material": "/attachment/158eb07b7ebd83fe406d20cc3305dc9f188fc64b.zip", "author": "Jay Nandy;Sudipan Saha;Wynne Hsu;Mong-Li Lee;Xiao Xiang Zhu", "authorids": "~Jay_Nandy1;~Sudipan_Saha1;~Wynne_Hsu1;~Mong-Li_Lee1;~Xiao_Xiang_Zhu1", "gender": "M;M;F;F;F", "homepage": ";https://web.iitd.ac.in/~sahasudipan/;http://www.comp.nus.edu.sg/~whsu/;https://www.comp.nus.edu.sg/~leeml/;https://www.sipeo.bgu.tum.de/", "dblp": "193/4096;124/2800;h/WynneHsu;l/MongLiLee;35/8954", "google_scholar": "https://scholar.google.co.in/citations?user=8N_wxz8AAAAJ;https://scholar.google.co.in/citations?user=C1_Ukv4AAAAJ;https://scholar.google.com.tw/citations?user=ljyBjv8AAAAJ;https://scholar.google.com.tw/citations?user=_xFTK8wAAAAJ;https://scholar.google.de/citations?user=CNakdIgAAAAJ", "orcid": ";0000-0002-9440-0720;0000-0002-4142-8893;0000-0002-9636-388X;0000-0001-5530-3613", "linkedin": "jay-nandy-36654b34/;sudipans/;;;xiaoxiang-zhu-90b473228/", "or_profile": "~Jay_Nandy1;~Sudipan_Saha1;~Wynne_Hsu1;~Mong-Li_Lee1;~Xiaoxiang_Zhu1", "aff": "Google;Technical University of Munich;National University of Singapore;National University of Singapore;German Aerospace Center", "aff_domain": "google.com;tum.de;nus.edu.sg;nus.edu.sg;dlr.de", "position": "Postdoc;Researcher;Full Professor;Full Professor;Professor", "bibtex": "@misc{\nnandy2022adversarially,\ntitle={Adversarially Trained Models with Test-Time Covariate Shift Adaptation},\nauthor={Jay Nandy and Sudipan Saha and Wynne Hsu and Mong-Li Lee and Xiao Xiang Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=RFGkzxMFqby}\n}", "github": "", "project": "", "reviewers": "AjZV;RVk8;TBNd", "site": "https://openreview.net/forum?id=RFGkzxMFqby", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "4;3;3", "correctness": "2;4;4", "technical_novelty": "2;3;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "67;100;75", "wc_summary_review": "17;83;5", "wc_main_review": "178;409;67", "wc_review": "262;592;147", "wc_reply_reviewers": "0;0;23", "wc_reply_authors": "843;698;18", "reply_reviewers": "0;0;1", "reply_authors": "2;1;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 80.66666666666667, 14.055445761538678 ], "wc_summary_review_avg": [ 35.0, 34.292856398964496 ], "wc_main_review_avg": [ 218.0, 142.45701105947717 ], "wc_review_avg": [ 333.6666666666667, 188.60599731244557 ], "wc_reply_reviewers_avg": [ 7.666666666666667, 10.842303978193728 ], "wc_reply_authors_avg": [ 519.6666666666666, 359.6371628306631 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9176629354822472, "corr_recommendation_correctness": 0.9176629354822472, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1948341173092311324&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;3", 
"aff_unique_norm": "Google;Technical University of Munich;National University of Singapore;German Aerospace Center", "aff_unique_dep": "Google;;;", "aff_unique_url": "https://www.google.com;https://www.tum.de;https://www.nus.edu.sg;https://www.dlr.de", "aff_unique_abbr": "Google;TUM;NUS;DLR", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;2;2;1", "aff_country_unique": "United States;Germany;Singapore" }, { "id": "RGrj2uWTLWY", "title": "PI-GNN: Towards Robust Semi-Supervised Node Classification against Noisy Labels", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semi-supervised node classification on graphs is a fundamental problem in graph mining that uses a small set of labeled nodes and many unlabeled nodes for training, so that its performance is quite sensitive to the quality of the node labels. However, it is expensive to maintain the label quality for real-world graph datasets, which presents huge challenges for the learning algorithm to keep a good generalization ability. In this paper, we propose a novel robust learning objective dubbed pairwise interactions (PI) for the model, such as Graph Neural Network (GNN) to combat against noisy labels. Unlike classic robust training approaches that operate on the pointwise interactions between node and class label pairs, PI explicitly forces the embeddings for node pairs that hold a positive PI label to be close to each other, which can be applied to both labeled and unlabeled nodes. We design several instantiations for the PI labels based on the graph structure as well as node class labels, and further propose a new uncertainty-aware training technique to mitigate the negative effect of the sub-optimal PI labels. Extensive experiments on different datasets and GNN architectures demonstrate the effectiveness of PI, which also brings a promising improvement over the state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuefeng Du;Tian Bian;Yu Rong;Bo Han;Tongliang Liu;Tingyang Xu;Wenbing Huang;Junzhou Huang", "authorids": "~Xuefeng_Du1;~Tian_Bian1;~Yu_Rong1;~Bo_Han1;~Tongliang_Liu1;~Tingyang_Xu1;~Wenbing_Huang1;~Junzhou_Huang1", "gender": "M;;M;;M;M;M;", "homepage": "https://d12306.github.io/;;https://royrong.me/;;https://tongliang-liu.github.io/;;https://gsai.ruc.edu.cn/english/wenbing_huang;", "dblp": "34/3557;;24/10036-1;;150/6667;157/0940;155/3181-1.html;22/1170", "google_scholar": "GE_aEh4AAAAJ;;https://scholar.google.com.hk/citations?user=itezhEMAAAAJ;;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;6gIs5YMAAAAJ;0yNkmO4AAAAJ;", "orcid": ";;0000-0001-7387-302X;;;0009-0002-0106-8376;;", "linkedin": "xuefeng-du-094723192/;;;;;;;", "or_profile": "~Xuefeng_Du1;~Tian_Bian1;~Yu_Rong1;~Bo_Han1;~Tongliang_Liu1;~Tingyang_Xu1;~Wenbing_Huang1;~Junzhou_Huang1", "aff": "University of Wisconsin, Madison;;Tencent AI Lab;;University of Sydney;Tencent AI Lab;Tsinghua University;", "aff_domain": "wisc.edu;;tencent.com;;sydney.edu.au;tencent.com;tsinghua.edu.cn;", "position": "PhD student;;Senior Researcher;;Lecturer;Researcher;Researcher;", "bibtex": "@misc{\ndu2022pignn,\ntitle={{PI}-{GNN}: Towards Robust Semi-Supervised Node Classification against Noisy Labels},\nauthor={Xuefeng Du and Tian Bian and Yu Rong and Bo Han and Tongliang Liu and Tingyang Xu and Wenbing Huang and Junzhou Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=RGrj2uWTLWY}\n}", "github": "", "project": "", "reviewers": 
"8yHP;Lvq5;2qdw;bNFD", "site": "https://openreview.net/forum?id=RGrj2uWTLWY", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "116;124;37;40", "wc_summary_review": "71;38;41;34", "wc_main_review": "612;168;139;714", "wc_review": "799;330;217;788", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.25, 40.861809798392436 ], "wc_summary_review_avg": [ 46.0, 14.64581851587681 ], "wc_main_review_avg": [ 408.25, 257.49405332939244 ], "wc_review_avg": [ 533.5, 263.0803109318521 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DTtPMLThsVEJ:scholar.google.com/&scioq=PI-GNN:+Towards+Robust+Semi-Supervised+Node+Classification+against+Noisy+Labels&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "University of Wisconsin;Tencent;University of Sydney;Tsinghua University", "aff_unique_dep": ";Tencent AI Lab;;", "aff_unique_url": "https://www.wisc.edu;https://ai.tencent.com;https://www.sydney.edu.au;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UW;Tencent AI Lab;USYD;THU", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;1;2;1;1", "aff_country_unique": "United States;China;Australia" }, { "title": "Vision-Based Manipulators Need to Also See from Their Hands", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6102", "id": "RJkAHKp7kNZ", "poster": "", "openreview": "https://openreview.net/forum?id=RJkAHKp7kNZ", "slides": "https://iclr.cc/virtual/2022/poster/6102", "video": "https://iclr.cc/virtual/2022/poster/6102", "author_site": "Kyle Hsu, Moo Kim, Rafael Rafailov, Jiajun Wu, Chelsea Finn", "tldr": "", "abstract": "We study how the choice of visual perspective affects learning and generalization in the context of physical manipulation from raw sensor observations. Compared with the more commonly used global third-person perspective, a hand-centric (eye-in-hand) perspective affords reduced observability, but we find that it consistently improves training efficiency and out-of-distribution generalization. These benefits hold across a variety of learning algorithms, experimental settings, and distribution shifts, and for both simulated and real robot apparatuses. However, this is only the case when hand-centric observability is sufficient; otherwise, including a third-person perspective is necessary for learning, but also harms out-of-distribution generalization. To mitigate this, we propose to regularize the third-person information stream via a variational information bottleneck. 
On six representative manipulation tasks with varying hand-centric observability adapted from the Meta-World benchmark, this results in a state-of-the-art reinforcement learning agent that operates from both perspectives and improves its out-of-distribution generalization on every task. While some practitioners have long put cameras in the hands of robots, our work systematically analyzes the benefits of doing so and provides simple and broadly applicable insights for improving end-to-end learned vision-based robotic manipulation.", "keywords": "reinforcement learning;observation space;out-of-distribution generalization;visuomotor control;robotics;manipulation", "primary_area": "", "supplementary_material": "", "author": "Kyle Hsu;Moo Jin Kim;Rafael Rafailov;Jiajun Wu;Chelsea Finn", "authorids": "~Kyle_Hsu1;~Moo_Jin_Kim1;~Rafael_Rafailov1;~Jiajun_Wu1;~Chelsea_Finn1", "gender": "M;M;M;M;F", "homepage": "https://www.kylehsu.org;https://moojink.com;https://rmrafailov.github.io/;https://jiajunwu.com;https://ai.stanford.edu/~cbfinn/", "dblp": "217/3841;;272/5358;117/4768;131/1783", "google_scholar": "KCdL5B0AAAAJ;ZKRs0oEAAAAJ;TwABcRgAAAAJ;2efgcS0AAAAJ;vfPE6hgAAAAJ", "orcid": ";;;0000-0002-4176-343X;", "linkedin": ";moojink/;;jiajunwu/;", "or_profile": "~Kyle_Hsu1;~Moo_Jin_Kim1;~Rafael_Rafailov1;~Jiajun_Wu1;~Chelsea_Finn1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University;Google", "aff_domain": "cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu;google.com", "position": "PhD student;MS student;MS student;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nhsu2022visionbased,\ntitle={Vision-Based Manipulators Need to Also See from Their Hands},\nauthor={Kyle Hsu and Moo Jin Kim and Rafael Rafailov and Jiajun Wu and Chelsea Finn},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RJkAHKp7kNZ}\n}", "github": "", "project": "", "reviewers": "z6Z4;u4WJ;RmQW", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;3", "correctness": "4;4;4", "technical_novelty": "3;2;4", "empirical_novelty": "3;2;4", "wc_summary_paper": "64;132;105", "wc_summary_review": "32;30;56", "wc_main_review": "169;357;164", "wc_review": "265;519;325", "wc_reply_reviewers": "6;74;0", "wc_reply_authors": "339;808;535", "reply_reviewers": "1;2;0", "reply_authors": "1;3;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 100.33333333333333, 27.956315128349008 ], "wc_summary_review_avg": [ 39.333333333333336, 11.8133634311129 ], "wc_main_review_avg": [ 230.0, 89.82575725629407 ], "wc_review_avg": [ 369.6666666666667, 108.39844197322313 ], "wc_reply_reviewers_avg": [ 26.666666666666668, 33.559234529741914 ], "wc_reply_authors_avg": [ 560.6666666666666, 192.32668965995217 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1186901764733740263&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=RJkAHKp7kNZ", "email": "cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu;google.com", 
"author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;0;0;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Deep Ensembling with No Overhead for either Training or Testing: The All-Round Blessings of Dynamic Sparsity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6299", "id": "RLtqs6pzj1-", "poster": "", "openreview": "https://openreview.net/forum?id=RLtqs6pzj1-", "slides": "https://iclr.cc/virtual/2022/poster/6299", "video": "https://iclr.cc/virtual/2022/poster/6299", "author_site": "Shiwei Liu, Tianlong Chen, Zahra Atashgahi, Xiaohan Chen, Ghada Sokar, Elena Mocanu, Mykola Pechenizkiy, Zhangyang Wang, Decebal Mocanu", "tldr": "", "abstract": "The success of deep ensembles on improving predictive performance, uncertainty estimation, and out-of-distribution robustness has been extensively studied in the machine learning literature. Albeit the promising results, naively training multiple deep neural networks and combining their predictions at inference leads to prohibitive computational costs and memory requirements. Recently proposed efficient ensemble approaches reach the performance of the traditional deep ensembles with significantly lower costs. However, the training resources required by these approaches are still at least the same as training a single dense model. In this work, we draw a unique connection between sparse neural network training and deep ensembles, yielding a novel efficient ensemble learning framework called $FreeTickets$. Instead of training multiple dense networks and averaging them, we directly train sparse subnetworks from scratch and extract diverse yet accurate subnetworks during this efficient, sparse-to-sparse training. Our framework, $FreeTickets$, is defined as the ensemble of these relatively cheap sparse subnetworks. Despite being an ensemble method, $FreeTickets$ has even fewer parameters and training FLOPs than a single dense model. This seemingly counter-intuitive outcome is due to the ultra training/inference efficiency of dynamic sparse training. $FreeTickets$ surpasses the dense baseline in all the following criteria: prediction accuracy, uncertainty estimation, out-of-distribution (OoD) robustness, as well as efficiency for both training and inference. Impressively, $FreeTickets$ outperforms the naive deep ensemble with ResNet50 on ImageNet using around only $1/5$ of the training FLOPs required by the latter. 
We have released our source code at https://github.com/VITA-Group/FreeTickets.", "keywords": "efficient ensemble;FreeTickets;dynamic sparse training;deep ensemble;dynamic sparsity", "primary_area": "", "supplementary_material": "", "author": "Shiwei Liu;Tianlong Chen;Zahra Atashgahi;Xiaohan Chen;Ghada Sokar;Elena Mocanu;Mykola Pechenizkiy;Zhangyang Wang;Decebal Constantin Mocanu", "authorids": "~Shiwei_Liu2;~Tianlong_Chen1;~Zahra_Atashgahi1;~Xiaohan_Chen1;~Ghada_Sokar1;~Elena_Mocanu1;~Mykola_Pechenizkiy1;~Zhangyang_Wang1;~Decebal_Constantin_Mocanu1", "gender": "M;M;F;M;;F;M;M;M", "homepage": "https://shiweiliuiiiiiii.github.io/;https://tianlong-chen.github.io;https://people.utwente.nl/z.atashgahi;http://xiaohanchen.com;https://research.tue.nl/en/persons/ghada-sokar;https://people.utwente.nl/e.mocanu;http://www.win.tue.nl/~mpechen/;https://vita-group.github.io;https://wwwen.uni.lu/recherche/fstm/dcs/members/decebal_constantin_mocanu", "dblp": "234/8697-3.html;;268/5733.html;94/3802;244/7833;08/1121;37/4649;119/4026;133/7764", "google_scholar": "73IbXtsAAAAJ;LE3ctn0AAAAJ;_nmvlmkAAAAJ;https://scholar.google.com/citations?authuser=1;https://scholar.google.nl/citations?user=0e6fdZsAAAAJ;https://scholar.google.nl/citations?user=JLD5uy0AAAAJ;https://scholar.google.com.tw/citations?user=F0uFT_kAAAAJ;pxFyKAIAAAAJ;RlQgUwEAAAAJ", "orcid": ";0000-0001-7774-8197;0000-0001-8183-5541;0000-0002-0360-0402;;;0000-0003-4955-0743;;0000-0002-5636-7683", "linkedin": ";tianlong-chen-783862167/;atashgahi/;xiaohan-chen-400b00147/;;;mpechen/;;", "or_profile": "~Shiwei_Liu2;~Tianlong_Chen1;~Zahra_Atashgahi1;~Xiaohan_Chen1;~Ghada_Sokar1;~Elena_Mocanu1;~Mykola_Pechenizkiy1;~Zhangyang_Wang1;~Decebal_Constantin_Mocanu1", "aff": "Eindhoven University of Technology;University of Texas, Austin;University of Twente;University of Texas, Austin;Eindhoven University of Technology;University of Twente;Eindhoven University of Technology;University of Texas, Austin;University of Twente", "aff_domain": "tue.nl;utexas.edu;utwente.nl;utexas.edu;tue.nl;utwente.nl;tue.nl;utexas.edu;utwente.nl", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;Assistant Professor;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2022deep,\ntitle={Deep Ensembling with No Overhead for either Training or Testing: The All-Round Blessings of Dynamic Sparsity},\nauthor={Shiwei Liu and Tianlong Chen and Zahra Atashgahi and Xiaohan Chen and Ghada Sokar and Elena Mocanu and Mykola Pechenizkiy and Zhangyang Wang and Decebal Constantin Mocanu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RLtqs6pzj1-}\n}", "github": "", "project": "", "reviewers": "wRp3;hJ1v;HPxD;4rju", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;2;2;4", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "61;103;355;89", "wc_summary_review": "33;122;124;32", "wc_main_review": "485;1165;725;661", "wc_review": "579;1390;1204;782", "wc_reply_reviewers": "0;0;31;0", "wc_reply_authors": "1200;2088;1408;2379", "reply_reviewers": "0;0;1;0", "reply_authors": "4;4;2;6", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 152.0, 118.17360111293893 ], "wc_summary_review_avg": [ 77.75, 45.256905550424015 
], "wc_main_review_avg": [ 759.0, 250.3357745109556 ], "wc_review_avg": [ 988.75, 323.254833683891 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 1768.75, 481.6489255671604 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.0, 1.4142135623730951 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2308297396186938309&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=RLtqs6pzj1-", "email": "tue.nl;utexas.edu;utwente.nl;utexas.edu;tue.nl;utwente.nl;tue.nl;utexas.edu;utwente.nl", "author_num": 9, "aff_unique_index": "0;1;2;1;0;2;0;1;2", "aff_unique_norm": "Eindhoven University of Technology;University of Texas at Austin;University of Twente", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tue.nl;https://www.utexas.edu;https://www.utwente.nl", "aff_unique_abbr": "TU/e;UT Austin;UT", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;1;0;0;0;1;0", "aff_country_unique": "Netherlands;United States" }, { "id": "RMv-5wMMrE3", "title": "Cell2State: Learning Cell State Representations From Barcoded Single-Cell Gene-Expression Transitions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Genetic barcoding coupled with single-cell sequencing technology enables direct measurement of cell-to-cell transitions and gene-expression evolution over a long timespan. This new type of data reveals explicit state transitions of cell dynamics. Motivated by dimension reduction methods for dynamical systems, we develop a *cell-to-state* (cell2state) learning method that, through learning from such multi-modal data, maps single-cell gene expression profiles to low-dimensional state vectors that are predictive of cell dynamics. We evaluate the cell2state method using barcoded stem cell dataset (Biddy et al. (2018)) and simulation studies, compared with baseline approaches using features that are not dynamic-aware. We demonstrate the merits of cell2state in challenging downstream tasks including cell state prediction and finding dynamically stable clusters. Further, our method reveals potentiallatent meta-states of the underlying evolution process. 
For each of the meta-states, we identify a set of marker genes and development pathways that are biologically meaningful and potentially expand existing knowledge.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/46f85a47454bc92b1880589a699bbd8f0f32a1b9.zip", "author": "Yu Wu;Joseph Chahn Kim;Chengzhuo Ni;Le Cong;Mengdi Wang", "authorids": "~Yu_Wu6;~Joseph_Chahn_Kim1;~Chengzhuo_Ni1;~Le_Cong2;~Mengdi_Wang1", "gender": "F;M;M;F;Not Specified", "homepage": "https://ece.princeton.edu/people/yu-wu;https://www.hadtomakeaurl.com/;;http://mwang.princeton.edu;http://www.conglab.com", "dblp": ";;241/5404;;", "google_scholar": ";;;;sfJIWdcAAAAJ", "orcid": ";;;;0000-0003-4725-8714", "linkedin": ";;;;", "or_profile": "~Yu_Wu6;~Joseph_Chahn_Kim1;~Chengzhuo_Ni1;~Mengdi_Wang1;~LE_Cong1", "aff": "Princeton University;Princeton University;Princeton University;Princeton University;Stanford University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu;princeton.edu;stanford.edu", "position": "PhD student;PhD student;Graduate student;Full Professor;Assistant Professor", "bibtex": "@misc{\nwu2022cellstate,\ntitle={Cell2State: Learning Cell State Representations From Barcoded Single-Cell Gene-Expression Transitions },\nauthor={Yu Wu and Joseph Chahn Kim and Chengzhuo Ni and Le Cong and Mengdi Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=RMv-5wMMrE3}\n}", "github": "", "project": "", "reviewers": "Nc9n;o4Q6;1peb;nQmp", "site": "https://openreview.net/forum?id=RMv-5wMMrE3", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;5", "correctness": "3;1;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;0;2;4", "wc_summary_paper": "13;84;72;94", "wc_summary_review": "23;244;75;63", "wc_main_review": "190;57;349;324", "wc_review": "226;385;496;481", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 65.75, 31.43544973433655 ], "wc_summary_review_avg": [ 101.25, 84.63561602540624 ], "wc_main_review_avg": [ 230.0, 116.75401492025874 ], "wc_review_avg": [ 397.0, 107.51976562474455 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Bu2ieUKdaS0J:scholar.google.com/&scioq=Cell2State:+Learning+Cell+State+Representations+From+Barcoded+Single-Cell+Gene-Expression+Transitions&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Princeton University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.stanford.edu", "aff_unique_abbr": "Princeton;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "RNf9AgtRtL", "title": "Continuous Control With Ensemble Deep Deterministic Policy Gradients", "track": "main", "status": "Reject", "tldr": "", "abstract": "The growth of deep reinforcement 
learning (RL) has brought multiple exciting tools and methods to the field. This rapid expansion makes it important to understand the interplay between individual elements of the RL toolbox. We approach this task from an empirical perspective by conducting a study in the continuous control setting. We present multiple insights of a fundamental nature, including: the commonly used additive action noise is not required for effective exploration and can even hinder training; the performance of policies trained using existing methods varies significantly across training runs, epochs of training, and evaluation runs; the critics' initialization plays the major role in ensemble-based actor-critic exploration, while the training is mostly invariant to the actors' initialization; a strategy based on posterior sampling explores better than the approximated UCB combined with the weighted Bellman backup; the weighted Bellman backup alone cannot replace the clipped double Q-Learning. In conclusion, we show how existing tools can be brought together in a novel way, giving rise to the Ensemble Deep Deterministic Policy Gradients (ED2) method, to yield state-of-the-art results on continuous control tasks from \mbox{OpenAI Gym MuJoCo}. From the practical side, ED2 is conceptually straightforward, easy to code, and does not require knowledge outside of the existing RL toolbox.", "keywords": "deep learning;reinforcement learning;ensemble learning;MuJoCo;continuous control;stable performance;deterministic policy gradient", "primary_area": "", "supplementary_material": "", "author": "Piotr Januszewski;Mateusz Olko;Micha\u0142 Kr\u00f3likowski;Jakub Swiatkowski;Marcin Andrychowicz;\u0141ukasz Kuci\u0144ski;Piotr Mi\u0142o\u015b", "authorids": "~Piotr_Januszewski1;~Mateusz_Olko1;~Micha\u0142_Kr\u00f3likowski1;~Jakub_Swiatkowski1;~Marcin_Andrychowicz1;~\u0141ukasz_Kuci\u0144ski1;~Piotr_Mi\u0142o\u015b1", "gender": "M;;M;;M;M;", "homepage": "https://piojanu.github.io/;;;;https://scholar.google.com/citations?user=n9K1v-cAAAAJ&hl=en&oi=ao;https://sites.google.com/view/lukaszkucinski;", "dblp": ";;;215/4459;;250/9699;208/0989.html", "google_scholar": "https://scholar.google.co.uk/citations?user=NfsQfqgAAAAJ;;;WAspBdUAAAAJ;;l6dK-VUAAAAJ;Se68XecAAAAJ", "orcid": "0000-0003-3817-3479;;;;;0000-0002-5617-8129;", "linkedin": "piojanu/;;m-krolikowski/;jakubswiatkowski/;;https://linkedin.com/in/lukasz-kucinski;piotr-milos-4b02151/", "or_profile": "~Piotr_Januszewski1;~Mateusz_Olko1;~Micha\u0142_Kr\u00f3likowski1;~Jakub_Swiatkowski1;~Marcin_Andrychowicz1;~\u0141ukasz_Kuci\u0144ski1;~Piotr_Mi\u0142o\u015b1", "aff": "Gda\u0144sk University of Technology;;University of Warsaw;University of Warsaw;Google;Institute of Mathematics Polish Academy of Sciences;IDEAS NCBR", "aff_domain": "pg.edu.pl;;uw.edu.pl;uw.edu.pl;google.com;impan.pl;ideas-ncbr.pl", "position": "PhD student;;MS student;PhD student;Senior Research Scientist;Assistant Professor;Researcher", "bibtex": "@misc{\njanuszewski2022continuous,\ntitle={Continuous Control With Ensemble Deep Deterministic Policy Gradients},\nauthor={Piotr Januszewski and Mateusz Olko and Micha{\\l} Kr{\\'o}likowski and Jakub Swiatkowski and Marcin Andrychowicz and {\\L}ukasz Kuci{\\'n}ski and Piotr Mi{\\l}o{\\'s}},\nyear={2022},\nurl={https://openreview.net/forum?id=RNf9AgtRtL}\n}", "github": "", "project": "", "reviewers": "2e3U;ZQ4d;thDH;hF9t;M5Fw", "site": "https://openreview.net/forum?id=RNf9AgtRtL", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "2;4;3;4;5", "correctness": 
"2;3;4;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "3;2;2;3;3", "wc_summary_paper": "41;54;68;86;176", "wc_summary_review": "25;56;85;55;48", "wc_main_review": "272;819;267;375;653", "wc_review": "338;929;420;516;877", "wc_reply_reviewers": "0;109;0;141;0", "wc_reply_authors": "995;571;540;769;1079", "reply_reviewers": "0;1;0;1;0", "reply_authors": "2;1;1;1;2", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 85.0, 47.89154413881432 ], "wc_summary_review_avg": [ 53.8, 19.197916553626335 ], "wc_main_review_avg": [ 477.2, 221.11933429711658 ], "wc_review_avg": [ 616.0, 241.57400522407207 ], "wc_reply_reviewers_avg": [ 50.0, 62.06770496804276 ], "wc_reply_authors_avg": [ 790.8, 217.46024924109693 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.8951435925492911, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6941813867220688074&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2;3;4", "aff_unique_norm": "Gda\u0144sk University of Technology;University of Warsaw;Google;Polish Academy of Sciences;Institute for Development, Economic Analysis, and Simulation (IDEAS)", "aff_unique_dep": ";;Google;Institute of Mathematics;", "aff_unique_url": "https://www.gut.edu.pl;https://www.uw.edu.pl;https://www.google.com;https://www.impan.pl/;https://www.ideas-ncbr.gov.pl", "aff_unique_abbr": "GUT;UW;Google;PAS;IDEAS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "Poland;United States" }, { "id": "RNnKhz25N1O", "title": "Low-Cost Algorithmic Recourse for Users With Uncertain Cost Functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "The problem of identifying algorithmic recourse for people affected by machine learning model decisions has received much attention recently. Existing approaches for recourse generation obtain solutions using properties like diversity, proximity, sparsity, and validity. Yet, these objectives are only heuristics for what we truly care about, which is whether a user is satisfied with the recourses offered to them. Some recent works try to model user-incurred cost, which is more directly linked to user satisfaction. But they assume a single global cost function that is shared across all users. This is an unrealistic assumption when users have dissimilar preferences about their willingness to act upon a feature and different costs associated with changing that feature. In this work, we formalize the notion of user-specific cost functions and introduce a new method for identifying actionable recourses for users. By default, we assume that users' cost functions are hidden from the recourse method, though our framework allows users to partially or completely specify their preferences or cost function. 
We propose an objective function, Expected Minimum Cost (EMC), based on two key ideas: (1) when presenting a set of options to a user, it is vital that there is at least one low-cost solution the user could adopt; (2) when we do not know the user's true cost function, we can approximately optimize for user satisfaction by first sampling plausible cost functions, then finding a set that achieves a good cost for the user in expectation. We optimize EMC with a novel discrete optimization algorithm, Cost-Optimized Local Search (COLS), which is guaranteed to improve the recourse set quality over iterations. Experimental evaluation on popular real-world datasets with simulated user costs demonstrates that our method satisfies up to 25.89 percentage points more users compared to strong baseline methods. Using standard fairness metrics, we also show that our method can provide more fair solutions across demographic groups than comparable methods, and we verify that our method is robust to misspecification of the cost function distribution. ", "keywords": "Explainability;Interpretability;Counterfactuals;Algorithmic Recourse;Black-box Models;Machine Learning;Accountability;Consumer Protection;Adverse Action Notices", "primary_area": "", "supplementary_material": "/attachment/b16be232e1b30a9e7917db0ce6761aa1d7ed6851.zip", "author": "Prateek Yadav;Peter Hase;Mohit Bansal", "authorids": "~Prateek_Yadav1;~Peter_Hase1;~Mohit_Bansal2", "gender": "M;;M", "homepage": "http://prateek-yadav.github.io;;https://www.cs.unc.edu/~mbansal/", "dblp": "220/5741;;32/5243.html", "google_scholar": "1lXhc0kAAAAJ;;DN8QtscAAAAJ", "orcid": ";;", "linkedin": "prateek-yadav-40bb34a8;;", "or_profile": "~Prateek_Yadav1;~Peter_Hase1;~Mohit_Bansal2", "aff": "Department of Computer Science, University of North Carolina, Chapel Hill;;University of North Carolina at Chapel Hill", "aff_domain": "cs.unc.edu;;unc.edu", "position": "Graduate Student;;Full Professor", "bibtex": "@misc{\nyadav2022lowcost,\ntitle={Low-Cost Algorithmic Recourse for Users With Uncertain Cost Functions},\nauthor={Prateek Yadav and Peter Hase and Mohit Bansal},\nyear={2022},\nurl={https://openreview.net/forum?id=RNnKhz25N1O}\n}", "github": "", "project": "", "reviewers": "B3sM;1gBz;Fy3U;cyuv", "site": "https://openreview.net/forum?id=RNnKhz25N1O", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;3;3;2", "correctness": "2;2;3;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;2;4", "wc_summary_paper": "35;20;49;197", "wc_summary_review": "34;55;35;59", "wc_main_review": "331;156;319;430", "wc_review": "400;231;403;686", "wc_reply_reviewers": "0;0;16;77", "wc_reply_authors": "548;607;1290;1062", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 75.25, 71.03652229663274 ], "wc_summary_review_avg": [ 45.75, 11.344051304538427 ], "wc_main_review_avg": [ 309.0, 98.27766785999758 ], "wc_review_avg": [ 430.0, 163.3753347356938 ], "wc_reply_reviewers_avg": [ 23.25, 31.712576369636068 ], "wc_reply_authors_avg": [ 876.75, 310.61823433275777 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.9622504486493761, 
"gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9779943928558421595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of North Carolina", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.unc.edu", "aff_unique_abbr": "UNC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "ROpoUxw23oP", "title": "Differentiable Hyper-parameter Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hyper-parameters are widely present in machine learning.\nConcretely, large amount of hyper-parameters exist in network layers, such as kernel size, channel size and the hidden layer size, which directly affect performance of the model.\nThus, hyper-parameter optimization is crucial for machine learning. Current hyper-parameter optimization always requires multiple training sessions, resulting in a large time consuming.\nTo solve this problem, we propose a method to fine-tune neural network's hyper-parameters efficiently in this paper, where optimization completes in only one training session.\nWe apply our method for the optimization of various neural network layers' hyper-parameters and compare it with multiple benchmark hyper-parameter optimization models.\nExperimental results show that our method is commonly 10 times faster than traditional and mainstream methods such as random search, Bayesian optimization and many other state-of-art models. It also achieves higher quality hyper-parameters with better accuracy and stronger stability.", "keywords": "Hyper-parameter Optimization", "primary_area": "", "supplementary_material": "", "author": "Bozhou Chen;Hongzhi Wang;Chenmin Ba", "authorids": "~Bozhou_Chen1;~Hongzhi_Wang2;~Chenmin_Ba1", "gender": "M;M;M", "homepage": ";http://homepage.hit.edu.cn/wang;", "dblp": "259/9940;81/940;https://dblp.uni-trier.de/pid/259/9983.html", "google_scholar": "avQkdTsAAAAJ;;", "orcid": ";0000-0002-7521-2871;", "linkedin": ";;", "or_profile": "~Bozhou_Chen1;~Hongzhi_Wang2;~Chenmin_Ba1", "aff": "Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology", "aff_domain": "hit.edu.cn;hit.edu.cn;hit.edu.cn", "position": "MS student;Full Professor;MS student", "bibtex": "@misc{\nchen2022differentiable,\ntitle={Differentiable Hyper-parameter Optimization},\nauthor={Bozhou Chen and Hongzhi Wang and Chenmin Ba},\nyear={2022},\nurl={https://openreview.net/forum?id=ROpoUxw23oP}\n}", "github": "", "project": "", "reviewers": "iUkA;Qr8L;CR1f;EFJq", "site": "https://openreview.net/forum?id=ROpoUxw23oP", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;4;5;3", "correctness": "3;3;1;2", "technical_novelty": "1;2;1;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "67;80;37;65", "wc_summary_review": "16;22;60;56", "wc_main_review": "198;123;437;359", "wc_review": "281;225;534;480", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.25, 15.674421839417235 ], "wc_summary_review_avg": [ 38.5, 19.665960439297137 ], "wc_main_review_avg": [ 279.25, 124.76052059846496 ], 
"wc_review_avg": [ 380.0, 129.94421880176125 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8528028654224418, "corr_recommendation_correctness": -0.4264014327112209, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ohFAkDh6zYQJ:scholar.google.com/&scioq=Differentiable+Hyper-parameter+Optimization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harbin Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.hit.edu.cn/", "aff_unique_abbr": "HIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Harbin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "ROteIE-4A6W", "title": "MA-CLIP: Towards Modality-Agnostic Contrastive Language-Image Pre-training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Large-scale multimodal contrastive pretraining has demonstrated great utility to support high performance in a range of downstream tasks by mapping multiple modalities into a shared embedding space. Typically, this has employed separate encoders for each modality. However, recent work suggest that transformers can support learning across multiple modalities and allow knowledge sharing. Inspired by this, we investigate how to build a modality-shared Contrastive Language-Image Pre-training framework (MS-CLIP). More specifically, we question how many parameters of a transformer model can be shared across modalities during contrastive pre-training, and rigorously study architectural design choices that position the proportion of parameters shared along a spectrum. We observe that a mostly unified encoder for vision and language signals outperforms all other variations that separate more parameters. Additionally, we find that light-weight modality-specific parallel adapter modules further improve performance. Experimental results show that the proposed MS-CLIP outperforms OpenAI CLIP by 13\\% relatively in zero-shot ImageNet classification (pre-trained on YFCC100M), while simultaneously supporting a reduction of parameters. In addition, our approach outperforms OpenAI CLIP by 1.6 points on a collection of 19 downstream vision tasks. 
Furthermore, we discover that sharing parameters leads to semantic concepts from different modalities being encoded more closely in the embedding space, facilitating the learning of common semantic structures (e.g., attention patterns) across modalities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoxuan You;Luowei Zhou;Bin Xiao;Noel C Codella;Yu Cheng;Ruochen Xu;Shih-Fu Chang;Lu Yuan", "authorids": "~Haoxuan_You1;~Luowei_Zhou1;~Bin_Xiao2;~Noel_C_Codella1;~Yu_Cheng1;~Ruochen_Xu2;~Shih-Fu_Chang3;~Lu_Yuan1", "gender": "M;;M;M;M;M;M;M", "homepage": "https://hxyou.github.io/;https://luoweizhou.github.io;;http://www.noelcodella.com/;https://ych133.github.io;https://xrc10.github.io/;http://www.ee.columbia.edu/~sfchang/;https://www.microsoft.com/en-us/research/people/luyuan/", "dblp": "210/2628;122/7357;43/5134-1;;96/3060-1.html;188/3515;c/ShihFuChang;", "google_scholar": "BhysChMAAAAJ;M-3cIR0AAAAJ;https://scholar.google.com/citations?authuser=1;8BnjC-4AAAAJ;https://scholar.google.com/citations?hl=en;HTp5S00AAAAJ;OMVTRscAAAAJ;k9TsUVsAAAAJ", "orcid": ";;0000-0001-6477-5911;;;;;", "linkedin": ";;;noel-c-f-codella-ph-d-1b1b1723/;chengyu05/;ruochenx/;;", "or_profile": "~Haoxuan_You1;~Luowei_Zhou1;~Bin_Xiao2;~Noel_C_Codella1;~Yu_Cheng1;~Ruochen_Xu2;~Shih-Fu_Chang3;~Lu_Yuan1", "aff": "Columbia University;Microsoft;Microsoft;Microsoft;Microsoft Research;Microsoft Research;Amazon;Microsoft", "aff_domain": "columbia.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;research.microsoft.com;amazon.com;microsoft.com", "position": "PhD student;Researcher;Principal Researcher;Principal Researcher;Principal Researcher;Researcher;Scholar;Principal Research Manager", "bibtex": "@misc{\nyou2022maclip,\ntitle={{MA}-{CLIP}: Towards Modality-Agnostic Contrastive Language-Image Pre-training},\nauthor={Haoxuan You and Luowei Zhou and Bin Xiao and Noel C Codella and Yu Cheng and Ruochen Xu and Shih-Fu Chang and Lu Yuan},\nyear={2022},\nurl={https://openreview.net/forum?id=ROteIE-4A6W}\n}", "github": "", "project": "", "reviewers": "hDBb;fePL;mhWr", "site": "https://openreview.net/forum?id=ROteIE-4A6W", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;4;4", "correctness": "3;4;3", "technical_novelty": "2;2;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "68;93;84", "wc_summary_review": "54;105;97", "wc_main_review": "357;301;422", "wc_review": "479;499;603", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "810;734;228", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 81.66666666666667, 10.338708279513883 ], "wc_summary_review_avg": [ 85.33333333333333, 22.395436042987768 ], "wc_main_review_avg": [ 360.0, 49.44357052910587 ], "wc_review_avg": [ 527.0, 54.35684562837202 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 590.6666666666666, 258.31419283930614 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.1147078669352809, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14668456141735806462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, 
"aff_unique_index": "0;1;1;1;1;1;2;1", "aff_unique_norm": "Columbia University;Microsoft;Amazon", "aff_unique_dep": ";Microsoft Corporation;Amazon.com, Inc.", "aff_unique_url": "https://www.columbia.edu;https://www.microsoft.com;https://www.amazon.com", "aff_unique_abbr": "Columbia;Microsoft;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "RQ3xUXjZWMO", "title": "Implicit Jacobian regularization weighted with impurity of probability output", "track": "main", "status": "Reject", "tldr": "", "abstract": "Gradient descent (GD) plays a crucial role in the success of deep learning, but it is still not fully understood how GD finds minima that generalize well. In many studies, GD has been understood as a gradient flow in the limit of vanishing learning rate. However, this approach has a fundamental limitation in explaining the oscillatory behavior with iterative catapult in a practical finite learning rate regime. To address this limitation, we rather start with strong empirical evidence of the plateau of the sharpness (the top eigenvalue of the Hessian) of the loss function landscape. With this observation, we investigate the Hessian through simple and much lower-dimensional matrices. In particular, to analyze the sharpness, we instead explore the eigenvalue problem for the low-dimensional matrix which is a rank-one modification of a diagonal matrix. The eigendecomposition provides a simple relation between the eigenvalues of the low-dimensional matrix and the impurity of the probability output. We exploit this connection to derive sharpness-impurity-Jacobian relation and to explain how the sharpness influences the learning dynamics and the generalization performance. 
In particular, we show that GD has implicit regularization effects on the Jacobian norm weighted with the impurity of the probability output.", "keywords": "deep learning;gradient descent;implicit bias;implicit regularization;Hessian;sharpness", "primary_area": "", "supplementary_material": "/attachment/029d3c3a5dbad00d5d69c7c55dce48c5e0fee66f.zip", "author": "Sungyoon Lee;Jinseong Park;Jaewook Lee", "authorids": "~Sungyoon_Lee1;~Jinseong_Park1;~Jaewook_Lee1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/sungyoon-lee/home;https://github.com/JinseongP;http://slcf.snu.ac.kr", "dblp": ";178/8948-1;39/4985-1", "google_scholar": "https://scholar.google.co.kr/citations?user=PAoFkGEAAAAJ;o4-E5z0AAAAJ;teMdzbwAAAAJ", "orcid": ";0000-0003-1931-8441;", "linkedin": ";jinseong-park-a84740226/;", "or_profile": "~Sungyoon_Lee1;~Jinseong_Park1;~Jaewook_Lee1", "aff": "Korea Institute for Advanced Study;Seoul National University;Seoul National University", "aff_domain": "kias.re.kr;snu.ac.kr;snu.ac.kr", "position": "Researcher;MS student;Full Professor", "bibtex": "@misc{\nlee2022implicit,\ntitle={Implicit Jacobian regularization weighted with impurity of probability output},\nauthor={Sungyoon Lee and Jinseong Park and Jaewook Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=RQ3xUXjZWMO}\n}", "github": "", "project": "", "reviewers": "6UXZ;Z9ex;utPV;4LVi", "site": "https://openreview.net/forum?id=RQ3xUXjZWMO", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;3", "correctness": "2;3;4;2", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "49;64;80;71", "wc_summary_review": "96;78;52;59", "wc_main_review": "366;1104;65;375", "wc_review": "511;1246;197;505", "wc_reply_reviewers": "0;0;0;153", "wc_reply_authors": "1437;1237;426;2054", "reply_reviewers": "0;0;0;1", "reply_authors": "3;3;2;4", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.0, 11.335784048754634 ], "wc_summary_review_avg": [ 71.25, 17.166464400102893 ], "wc_main_review_avg": [ 477.5, 382.6215493147243 ], "wc_review_avg": [ 614.75, 385.94065282113 ], "wc_reply_reviewers_avg": [ 38.25, 66.25094338950956 ], "wc_reply_authors_avg": [ 1288.5, 581.9366374443183 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10708825964505483175&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Korea Institute for Advanced Study;Seoul National University", "aff_unique_dep": ";", "aff_unique_url": "http://www.kias.re.kr;https://www.snu.ac.kr", "aff_unique_abbr": "KIAS;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "A Deep Variational Approach to Clustering Survival Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6333", "id": "RQ428ZptQfU", "poster": "", "openreview": "https://openreview.net/forum?id=RQ428ZptQfU", "slides": "https://iclr.cc/virtual/2022/poster/6333", "video": 
"https://iclr.cc/virtual/2022/poster/6333", "author_site": "Laura Manduchi, Ri\u010dards Marcinkevi\u010ds, Michela Massi, Thomas Weikert, Alexander Sauter, Verena Gotta, Timothy M\u00fcller, Flavio Vasella, Marian Neidert, Marc Pfister, Bram Stieltjes, Julia E Vogt", "tldr": "", "abstract": "In this work, we study the problem of clustering survival data \u2014 a challenging and so far under-explored task. We introduce a novel semi-supervised probabilistic approach to cluster survival data by leveraging recent advances in stochastic gradient variational inference. In contrast to previous work, our proposed method employs a deep generative model to uncover the underlying distribution of both the explanatory variables and censored survival times. We compare our model to the related work on clustering and mixture models for survival data in comprehensive experiments on a wide range of synthetic, semi-synthetic, and real-world datasets, including medical imaging data. Our method performs better at identifying clusters and is competitive at predicting survival times. Relying on novel generative assumptions, the proposed model offers a holistic perspective on clustering survival data and holds a promise of discovering subpopulations whose survival is regulated by different generative mechanisms.", "keywords": "survival analysis;clustering;healthcare;variational autoencoders;deep generative models", "primary_area": "", "supplementary_material": "/attachment/5a33c3e98cf2441e07016f9fdd6734b26e2a5539.zip", "author": "Laura Manduchi;Ri\u010dards Marcinkevi\u010ds;Michela C. Massi;Thomas Weikert;Alexander Sauter;Verena Gotta;Timothy M\u00fcller;Flavio Vasella;Marian C. Neidert;Marc Pfister;Bram Stieltjes;Julia E Vogt", "authorids": "~Laura_Manduchi2;~Ri\u010dards_Marcinkevi\u010ds1;michelacarlotta.massi@polimi.it;thomas.weikert@usb.ch;alexander.sauter@usb.ch;verena.gotta@ukbb.ch;timothy.mueller@uzh.ch;flavio.vasella@uzh.ch;marian.neidert@kssg.ch;marc.pfister@ukbb.ch;~Bram_Stieltjes2;~Julia_E_Vogt1", "gender": "F;;;;;;;;;;;F", "homepage": "https://mds.inf.ethz.ch/team/detail/laura-manduchi/;;;;;;;;;;;http://mds.inf.ethz.ch", "dblp": "249/9257;;;;;;;;;;;13/8412", "google_scholar": ";;;;;;;;;;;UoeV-8kAAAAJ", "orcid": ";;;;;;;;;;;", "linkedin": ";;;;;;;;;;;julia-vogt-50b53895", "or_profile": "~Laura_Manduchi2;~Ri\u010dards_Marcinkevi\u010ds1;michelacarlotta.massi@polimi.it;thomas.weikert@usb.ch;alexander.sauter@usb.ch;verena.gotta@ukbb.ch;timothy.mueller@uzh.ch;flavio.vasella@uzh.ch;marian.neidert@kssg.ch;marc.pfister@ukbb.ch;~Bram_Stieltjes2;~Julia_E_Vogt1", "aff": "Microsoft Research;;;;;;;;;;;Swiss Federal Institute of Technology", "aff_domain": "research.microsoft.com;;;;;;;;;;;ethz.ch", "position": "Intern;;;;;;;;;;;Assistant Professor", "bibtex": "@inproceedings{\nmanduchi2022a,\ntitle={A Deep Variational Approach to Clustering Survival Data},\nauthor={Laura Manduchi and Ri{\\v{c}}ards Marcinkevi{\\v{c}}s and Michela C. Massi and Thomas Weikert and Alexander Sauter and Verena Gotta and Timothy M{\\\"u}ller and Flavio Vasella and Marian C. 
Neidert and Marc Pfister and Bram Stieltjes and Julia E Vogt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RQ428ZptQfU}\n}", "github": "", "project": "", "reviewers": "VEke;Z7tL;6hmP;k5xJ", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;4;2", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;4;4;3", "wc_summary_paper": "94;51;41;88", "wc_summary_review": "19;42;17;67", "wc_main_review": "234;413;317;165", "wc_review": "347;506;375;320", "wc_reply_reviewers": "0;0;0;30", "wc_reply_authors": "750;1261;242;64", "reply_reviewers": "0;0;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 68.5, 22.874658467395747 ], "wc_summary_review_avg": [ 36.25, 20.29008378494283 ], "wc_main_review_avg": [ 282.25, 92.70753744976726 ], "wc_review_avg": [ 387.0, 71.40378141247143 ], "wc_reply_reviewers_avg": [ 7.5, 12.99038105676658 ], "wc_reply_authors_avg": [ 579.25, 467.2148194353428 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 12, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 1.0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12300997661063839145&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "pdf": "https://openreview.net/pdf?id=RQ428ZptQfU", "email": "research.microsoft.com;;;;;;;;;;;ethz.ch", "author_num": 12, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;Swiss Federal Institute of Technology", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ethz.ch", "aff_unique_abbr": "MSR;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Switzerland" }, { "id": "RQIvNJDHwy", "title": "Improving Neural Network Generalization via Promoting Within-Layer Diversity", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks are composed of multiple layers arranged in a hierarchical structure jointly trained with a gradient-based optimization, where the errors are back-propagated from the last layer back to the first one. At each optimization step, neurons at a given layer receive feedback from neurons belonging to higher layers of the hierarchy. In this paper, we propose to complement this traditional 'between-layer' feedback with additional 'within-layer' feedback to encourage the diversity of the activations within the same layer. To this end, we measure the pairwise similarity between the outputs of the neurons and use it to model the layer's overall diversity. By penalizing similarities and promoting diversity, we encourage each unit within the layer to learn a distinctive representation and, thus, to enrich the data representation learned and to increase the total capacity of the model. We theoretically study how the within-layer activation diversity affects the generalization performance of a neural network and prove that increasing the diversity of hidden activations reduces the estimation error. 
In addition to the theoretical guarantees, we present an extensive empirical study confirming that the proposed approach enhances the performance of state-of-the-art neural network models and decreases the generalization gap in multiple tasks. ", "keywords": "deep learning;regularization;overfitting;learning theory;neural network", "primary_area": "", "supplementary_material": "", "author": "Firas Laakom;Jenni Raitoharju;Alexandros Iosifidis;Moncef Gabbouj", "authorids": "~Firas_Laakom1;~Jenni_Raitoharju1;~Alexandros_Iosifidis2;~Moncef_Gabbouj1", "gender": "M;;M;M", "homepage": ";;https://www.tuni.fi/en/people/alexandros-iosifidis;https://www.tuni.fi/en/moncef-gabbouj", "dblp": "242/8179;;01/9539;08/6597", "google_scholar": "VPWIyx8AAAAJ;;KjsL0KEAAAAJ;cHukfSUAAAAJ", "orcid": "0000-0001-7436-5692;;0000-0003-4807-1345;0000-0002-9788-2323", "linkedin": ";;;moncef-gabbouj-2186282/?originalSubdomain=fi", "or_profile": "~Firas_Laakom1;~Jenni_Raitoharju1;~Alexandros_Iosifidis2;~Moncef_Gabbouj1", "aff": "Tampere University;;Aarhus University;Tampere University", "aff_domain": "tuni.fi;;au.dk;tuni.fi", "position": "PhD student;;Full Professor;Full Professor", "bibtex": "@misc{\nlaakom2022improving,\ntitle={Improving Neural Network Generalization via Promoting Within-Layer Diversity},\nauthor={Firas Laakom and Jenni Raitoharju and Alexandros Iosifidis and Moncef Gabbouj},\nyear={2022},\nurl={https://openreview.net/forum?id=RQIvNJDHwy}\n}", "github": "", "project": "", "reviewers": "LXoK;dHr7;WnGv;vc2s;LPWQ", "site": "https://openreview.net/forum?id=RQIvNJDHwy", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;3;3;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;2;2;3;2", "wc_summary_paper": "66;93;76;49;123", "wc_summary_review": "56;41;79;9;108", "wc_main_review": "498;323;239;301;689", "wc_review": "620;457;394;359;920", "wc_reply_reviewers": "0;0;0;50;126", "wc_reply_authors": "669;338;209;467;942", "reply_reviewers": "0;0;0;1;1", "reply_authors": "1;1;1;1;2", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 81.4, 25.223798286538845 ], "wc_summary_review_avg": [ 58.6, 33.5654584357193 ], "wc_main_review_avg": [ 410.0, 163.87556254670798 ], "wc_review_avg": [ 550.0, 205.55583183164617 ], "wc_reply_reviewers_avg": [ 35.2, 49.35747157219462 ], "wc_reply_authors_avg": [ 525.0, 257.9976744081233 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.16666666666666663, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:B-YVDqH3TvcJ:scholar.google.com/&scioq=Improving+Neural+Network+Generalization+via+Promoting+Within-Layer+Diversity&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tampere University;Aarhus University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tuni.fi;https://au.dk", "aff_unique_abbr": "Tuni;AU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Finland;Denmark" }, { "title": "Provably Filtering Exogenous Distractors using Multistep Inverse Dynamics", "status": "Oral", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6733", "id": "RQLLzMCefQu", "poster": "", "openreview": "https://openreview.net/forum?id=RQLLzMCefQu", "slides": "https://iclr.cc/virtual/2022/poster/6733", "video": "https://iclr.cc/virtual/2022/poster/6733", "author_site": "Yonathan Efroni, Dipendra Kumar Misra, Akshay Krishnamurthy, Alekh Agarwal, John Langford", "tldr": "", "abstract": "Many real-world applications of reinforcement learning (RL) require the agent to deal with high-dimensional observations such as those generated from a megapixel camera. Prior work has addressed such problems with representation learning, through which the agent can provably extract endogenous, latent state information from raw observations and subsequently plan efficiently. However, such approaches can fail in the presence of temporally correlated noise in the observations, a phenomenon that is common in practice. We initiate the formal study of latent state discovery in the presence of such exogenous noise sources by proposing a new model, the Exogenous Block MDP (EX-BMDP), for rich observation RL. We start by establishing several negative results, by highlighting failure cases of prior representation learning based approaches. Then, we introduce the Predictive Path Elimination (PPE) algorithm, that learns a generalization of inverse dynamics and is provably sample and computationally efficient in EX-BMDPs when the endogenous state dynamics are near deterministic. The sample complexity of PPE depends polynomially on the size of the latent endogenous state space while not directly depending on the size of the observation space, nor the exogenous state space. We provide experiments on challenging exploration problems which show that our approach works empirically. ", "keywords": "Reinforcement Learning Theory;Invariant Representation;Rich Observation Reinforcement Learning;Exogenous Noise;Inverse Dynamics", "primary_area": "", "supplementary_material": "/attachment/f967ff5acd44c135ebd15ad1bb92a5d01c70c077.zip", "author": "Yonathan Efroni;Dipendra Misra;Akshay Krishnamurthy;Alekh Agarwal;John Langford", "authorids": "~Yonathan_Efroni2;~Dipendra_Misra1;~Akshay_Krishnamurthy1;~Alekh_Agarwal2;~John_Langford1", "gender": "M;M;M;M;M", "homepage": "https://sites.google.com/view/yonathan-efroni/;https://dipendramisra.com/;https://www.cics.umass.edu/~akshay/;https://alekhagarwal.net;http://hunch.net/~jl", "dblp": "215/3475;218/6569;85/8024;;77/4488", "google_scholar": "pfTInEgAAAAJ;rIoPIFsAAAAJ;https://scholar.google.com.tw/citations?user=K0kaNvkAAAAJ;9nnDvooAAAAJ;LFiqVpwAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yonathan_Efroni2;~Dipendra_Misra1;~Akshay_Krishnamurthy1;~Alekh_Agarwal2;~John_Langford1", "aff": "Microsoft;Microsoft Research;Microsoft Research;Google;Microsoft", "aff_domain": "microsoft.com;microsoft.com;research.microsoft.com;google.com;microsoft.com", "position": "Postdoc;Researcher;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nefroni2022provably,\ntitle={Provably Filtering Exogenous Distractors using Multistep Inverse Dynamics},\nauthor={Yonathan Efroni and Dipendra Misra and Akshay Krishnamurthy and Alekh Agarwal and John Langford},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RQLLzMCefQu}\n}", "github": "", "project": "", "reviewers": "fNfe;yfQT;B1Cs;Zamv", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "5;3;3;2", "correctness": "4;4;3;4", "technical_novelty": "3;3;2;3", 
"empirical_novelty": "3;3;3;2", "wc_summary_paper": "95;177;87;169", "wc_summary_review": "69;61;70;62", "wc_main_review": "443;331;273;278", "wc_review": "607;569;430;509", "wc_reply_reviewers": "200;22;78;0", "wc_reply_authors": "999;533;664;254", "reply_reviewers": "2;1;1;0", "reply_authors": "3;2;2;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 132.0, 41.19465984809196 ], "wc_summary_review_avg": [ 65.5, 4.031128874149275 ], "wc_main_review_avg": [ 331.25, 68.40458683450986 ], "wc_review_avg": [ 528.75, 66.86693876647861 ], "wc_reply_reviewers_avg": [ 75.0, 77.56932383358772 ], "wc_reply_authors_avg": [ 612.5, 267.80449958878586 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14573645488421993294&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=RQLLzMCefQu", "email": "microsoft.com;microsoft.com;research.microsoft.com;google.com;microsoft.com", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Microsoft;Google", "aff_unique_dep": "Microsoft Corporation;Google", "aff_unique_url": "https://www.microsoft.com;https://www.google.com", "aff_unique_abbr": "Microsoft;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sparse DETR: Efficient End-to-End Object Detection with Learnable Sparsity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7017", "id": "RRGVCN8kjim", "poster": "", "openreview": "https://openreview.net/forum?id=RRGVCN8kjim", "slides": "https://iclr.cc/virtual/2022/poster/7017", "video": "https://iclr.cc/virtual/2022/poster/7017", "author_site": "Byungseok Roh, JaeWoong Shin, Wuhyun Shin, Saehoon Kim", "tldr": "", "abstract": "DETR is the first end-to-end object detector using a transformer encoder-decoder architecture and demonstrates competitive performance but low computational efficiency. The subsequent work, Deformable DETR, enhances the efficiency of DETR by replacing dense attention with deformable attention, which achieves 10x faster convergence and improved performance. Using the multiscale feature to ameliorate performance, however, the number of encoder queries increases by 20x compared to DETR, and the computation cost of the encoder attention remains a bottleneck. We observe that the encoder queries referenced by the decoder account for only 45% of the total, and find out the detection accuracy does not deteriorate significantly even if only the referenced queries are polished in the encoder block. Inspired by this observation, we propose Sparse DETR that selectively updates only the queries expected to be referenced by the decoder, thus help the model effectively detect objects. In addition, we show that applying an auxiliary detection loss on the selected queries in the encoder improves the performance while minimizing computational overhead. 
We validate that Sparse DETR achieves better performance than Deformable DETR even with only 10% encoder queries on the COCO dataset. Albeit only the encoder queries are sparsified, the total computation cost decreases by 38% and the frames per second (FPS) increases by 42% compared to Deformable DETR. Code will be released.\n", "keywords": "Transformer Query Sparsification Mechanism;Efficient End-to-End Object Detection", "primary_area": "", "supplementary_material": "", "author": "Byungseok Roh;JaeWoong Shin;Wuhyun Shin;Saehoon Kim", "authorids": "~Byungseok_Roh1;~JaeWoong_Shin1;~Wuhyun_Shin2;~Saehoon_Kim1", "gender": ";M;M;M", "homepage": ";;;https://saehoonkim.github.io/", "dblp": "258/1192;267/5672;;43/10813", "google_scholar": "H4VWYHwAAAAJ;i_o_95kAAAAJ;bGwfkakAAAAJ;https://scholar.google.com.sg/citations?user=_ZfueMIAAAAJ", "orcid": ";;;", "linkedin": ";%EC%9E%AC%EC%9B%85-%EC%8B%A0-88662220a/;;saehoonkim/", "or_profile": "~Byungseok_Roh1;~JaeWoong_Shin1;~Wuhyun_Shin2;~Saehoon_Kim1", "aff": "Kakao Brain;Lunit Inc.;Kakao Brain;Kakao Brain", "aff_domain": "kakaobrain.com;lunit.io;kakaobrain.com;kakaobrain.com", "position": "Research Scientist;Researcher;Research engineer;Researcher", "bibtex": "@inproceedings{\nroh2022sparse,\ntitle={Sparse {DETR}: Efficient End-to-End Object Detection with Learnable Sparsity},\nauthor={Byungseok Roh and JaeWoong Shin and Wuhyun Shin and Saehoon Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RRGVCN8kjim}\n}", "github": "", "project": "", "reviewers": "sTnV;jT45;YsVT;u6Zb", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;5;5;5", "correctness": "3;3;3;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "94;150;100;88", "wc_summary_review": "28;18;37;35", "wc_main_review": "333;151;389;181", "wc_review": "455;319;526;304", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1145;142;491;169", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 108.0, 24.61706725018234 ], "wc_summary_review_avg": [ 29.5, 7.433034373659253 ], "wc_main_review_avg": [ 263.5, 100.05373556244665 ], "wc_review_avg": [ 401.0, 93.10477968396681 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 486.75, 404.08190691987187 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18202446654995980467&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=RRGVCN8kjim", "email": "kakaobrain.com;lunit.io;kakaobrain.com;kakaobrain.com", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Kakao Brain;Lunit Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://brain.kakao.com;https://www.lunit.io", "aff_unique_abbr": "Kakao Brain;Lunit", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "RRj7DcsPjT", "title": "Revisiting Layer-wise Sampling in Fast Training for Graph Convolutional 
Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "To accelerate the training of graph convolutional networks (GCN), many sampling-based methods have been developed for approximating the embedding aggregation. Among them, a layer-wise approach recursively performs importance sampling to select neighbors jointly for existing nodes in each layer. This paper revisits the approach from a matrix approximation perspective. We identify two issues in the existing layer-wise sampling methods: sub-optimal sampling probabilities and the approximation bias induced by sampling without replacement. We thus propose remedies to address these issues. The improvements are demonstrated by extensive analyses and experiments on common benchmarks.", "keywords": "GCN;efficient GCN;sampling", "primary_area": "", "supplementary_material": "/attachment/b38b8762be7195d1746f332b555eca744eea25ca.zip", "author": "Yifan Chen;Tianning Xu;Dilek Hakkani-Tur;Di Jin;Yun Yang;Ruoqing Zhu", "authorids": "~Yifan_Chen3;~Tianning_Xu1;~Dilek_Hakkani-Tur1;~Di_Jin1;~Yun_Yang4;~Ruoqing_Zhu1", "gender": ";M;;M;M;M", "homepage": ";;;https://jind11.github.io/;https://www-math.umd.edu/people/faculty/item/1811-yy84.html;https://sites.google.com/site/teazrq/", "dblp": ";;;;;", "google_scholar": ";UuINjGEAAAAJ;;x5QTK9YAAAAJ;FY_UnPAAAAAJ;uyzMyb8AAAAJ", "orcid": ";;;;;", "linkedin": ";tianning-xu/;;;;", "or_profile": "~Yifan_Chen3;~Tianning_Xu1;~Dilek_Hakkani-Tur1;~Di_Jin1;~Yun_Yang4;~Ruoqing_Zhu1", "aff": ";University of Illinois, Urbana Champaign;;Amazon;University of Illinois, Urbana Champaign;University of Illinois, Urbana-Champaign", "aff_domain": ";illinois.edu;;amazon.com;illinois.edu;uiuc.edu", "position": ";PhD student;;Researcher;Associate Professor;Assistant Professor", "bibtex": "@misc{\nchen2022revisiting,\ntitle={Revisiting Layer-wise Sampling in Fast Training for Graph Convolutional Networks},\nauthor={Yifan Chen and Tianning Xu and Dilek Hakkani-Tur and Di Jin and Yun Yang and Ruoqing Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=RRj7DcsPjT}\n}", "github": "", "project": "", "reviewers": "xS1Y;YZaF;Rhqb;cNek", "site": "https://openreview.net/forum?id=RRj7DcsPjT", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "22;25;183;57", "wc_summary_review": "19;24;32;14", "wc_main_review": "496;88;239;92", "wc_review": "537;137;454;163", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "957;200;696;238", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 71.75, 65.67866853096217 ], "wc_summary_review_avg": [ 22.25, 6.6473679001541655 ], "wc_main_review_avg": [ 228.75, 165.86044585735323 ], "wc_review_avg": [ 322.75, 175.46563053772098 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 522.75, 317.7415419802705 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9233805168766388, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15350842160213525512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": 
"0;1;0;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;Amazon;University of Illinois", "aff_unique_dep": ";Amazon.com, Inc.;", "aff_unique_url": "https://illinois.edu;https://www.amazon.com;https://illinois.edu", "aff_unique_abbr": "UIUC;Amazon;UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "RSd79AULOu", "title": "Fairness-aware Federated Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated Learning is a machine learning technique where a network of clients collaborates with a server to learn a centralized model while keeping data localized. In such a setting, naively minimizing an aggregate loss may introduce bias and disadvantage model performance on certain clients. To address this issue, we propose a new federated learning framework called FAFL in which the goal is to minimize the worst-case weighted client losses over an uncertainty set. By deriving a variational representation, we show that this framework is a fairness-aware objective and can be easily optimized by solving a joint minimization problem over the model parameters and a dual variable. We then propose an optimization algorithm to solve FAFL which can be efficiently implemented in a federated setting and provide convergence guarantees. We further prove generalization bounds for learning with this objective. Experiments on real-world datasets demonstrate the effectiveness of our framework in achieving both accuracy and fairness.", "keywords": "Learning Theory", "primary_area": "", "supplementary_material": "", "author": "Zhuozhuo Tu;zhiqiang xu;Tairan Huang;Dacheng Tao;Ping Li", "authorids": "~Zhuozhuo_Tu1;~zhiqiang_xu1;~Tairan_Huang1;~Dacheng_Tao1;~Ping_Li3", "gender": "M;M;M;;M", "homepage": ";https://scholar.google.com/citations?user=0R20iBMAAAAJ&hl=en;;;http://www.stat.rutgers.edu/home/pingli/", "dblp": "230/4649;72/51-3.html;229/6227;;62/5860-1", "google_scholar": ";;;;", "orcid": ";0000-0002-5693-8933;;;", "linkedin": ";;\u6cf0\u7136-\u9ec4-9b8b791a2/;;", "or_profile": "~Zhuozhuo_Tu1;~zhiqiang_xu1;~Tairan_Huang1;~Dacheng_Tao1;~Ping_Li3", "aff": "The University of Sydney;Mohamed bin Zayed University of Artificial Intelligence;Beihang University;;LinkedIn", "aff_domain": "sydney.edu.au;mbzuai.ac.ae;buaa.edu.cn;;linkedin.com", "position": "Researcher;Assistant Professor;MS student;;Engineer", "bibtex": "@misc{\ntu2022fairnessaware,\ntitle={Fairness-aware Federated Learning},\nauthor={Zhuozhuo Tu and zhiqiang xu and Tairan Huang and Dacheng Tao and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=RSd79AULOu}\n}", "github": "", "project": "", "reviewers": "1ggR;smE9;r8PQ;KuJa", "site": "https://openreview.net/forum?id=RSd79AULOu", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;2", "correctness": "2;3;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "41;121;48;72", "wc_summary_review": "27;154;16;25", "wc_main_review": "481;884;327;162", "wc_review": "549;1159;391;259", "wc_reply_reviewers": "227;57;0;0", "wc_reply_authors": "900;1135;469;422", "reply_reviewers": "1;2;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 70.5, 31.34086788842964 ], 
"wc_summary_review_avg": [ 55.5, 57.019733426244635 ], "wc_main_review_avg": [ 463.5, 267.7036607893138 ], "wc_review_avg": [ 589.5, 344.4571816641366 ], "wc_reply_reviewers_avg": [ 71.0, 93.02419040228192 ], "wc_reply_authors_avg": [ 731.5, 298.2871938250115 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Sydney;Mohamed bin Zayed University of Artificial Intelligence;Beihang University;LinkedIn Corporation", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sydney.edu.au;https://mbzuai.ac.ae;http://www.buaa.edu.cn/;https://www.linkedin.com", "aff_unique_abbr": "USYD;MBZUAI;BUAA;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Australia;United Arab Emirates;China;United States" }, { "title": "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6131", "id": "RShaMexjc-x", "poster": "", "openreview": "https://openreview.net/forum?id=RShaMexjc-x", "slides": "https://iclr.cc/virtual/2022/poster/6131", "video": "https://iclr.cc/virtual/2022/poster/6131", "author_site": "C\u00e9dric Vincent-Cuaz, R\u00e9mi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty", "tldr": "", "abstract": "Comparing structured objects such as graphs is a fundamental operation\ninvolved in many learning tasks. To this end, the Gromov-Wasserstein (GW)\ndistance, based on Optimal Transport (OT), has proven to be successful in\nhandling the specific nature of the associated objects. More specifically,\nthrough the nodes connectivity relations, GW operates on graphs, seen as\nprobability measures over specific spaces. At the core of OT is the idea of\nconservation of mass, which imposes a coupling between all the nodes from\nthe two considered graphs. 
We argue in this paper that this property can be detrimental for tasks such as graph dictionary or partition learning, and we relax it by proposing a new semi-relaxed Gromov-Wasserstein divergence. Aside from immediate computational benefits, we discuss its properties, and show that it can lead to an efficient graph dictionary learning algorithm. We empirically demonstrate its relevance for complex tasks on graphs such as partitioning, clustering and completion.", "keywords": "Optimal Transport;Graph Learning", "primary_area": "", "supplementary_material": "/attachment/25b39f1f8a2bbad10abaaad9d7af0f6d72bf4248.zip", "author": "C\u00e9dric Vincent-Cuaz;R\u00e9mi Flamary;Marco Corneli;Titouan Vayer;Nicolas Courty", "authorids": "~C\u00e9dric_Vincent-Cuaz1;~R\u00e9mi_Flamary1;~Marco_Corneli1;~Titouan_Vayer1;~Nicolas_Courty1", "gender": "M;;M;M;M", "homepage": "https://twitter.com/CedricCuaz;https://remi.flamary.com/;https://math.unice.fr/~mcorneli/;https://tvayer.github.io/;http://people.irisa.fr/Nicolas.Courty/", "dblp": "285/5156;00/8318;;220/5519;74/4219", "google_scholar": "p0spNmMAAAAJ;https://scholar.google.fr/citations?user=zDnwxFQAAAAJ;;https://scholar.google.fr/citations?user=PJEv3JgAAAAJ;https://scholar.google.fr/citations?user=ibEREjcAAAAJ", "orcid": ";0000-0002-4212-6627;;;0000-0003-1353-0126", "linkedin": ";;;;", "or_profile": "~C\u00e9dric_Vincent-Cuaz1;~R\u00e9mi_Flamary1;~Marco_Corneli1;~Titouan_Vayer1;~Nicolas_Courty1", "aff": "INRIA;Ecole polytechnique;Universit\u00e9 Cote d'Azur;ENS, Lyon;IRISA", "aff_domain": "inria.fr;polytechnique.edu;univ-cotedazur.fr;ens-lyon.fr;irisa.fr", "position": "PhD student;Assistant Professor;Junior professor;Postdoctoral researcher;Full Professor", "bibtex": "@inproceedings{\nvincent-cuaz2022semirelaxed,\ntitle={Semi-relaxed Gromov-Wasserstein divergence and applications on graphs},\nauthor={C{\\'e}dric Vincent-Cuaz and R{\\'e}mi Flamary and Marco Corneli and Titouan Vayer and Nicolas Courty},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RShaMexjc-x}\n}", "github": "", "project": "", "reviewers": "vbhF;jEHW;4kuX;ioNk", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;2;4;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "95;63;85;104", "wc_summary_review": "40;32;37;132", "wc_main_review": "485;293;253;530", "wc_review": "620;388;375;766", "wc_reply_reviewers": "35;0;0;0", "wc_reply_authors": "1644;506;806;673", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 86.75, 15.270478054075452 ], "wc_summary_review_avg": [ 60.25, 41.52333681196635 ], "wc_main_review_avg": [ 390.25, 119.16663752913396 ], "wc_review_avg": [ 537.25, 164.14532433182495 ], "wc_reply_reviewers_avg": [ 8.75, 15.155444566227676 ], "wc_reply_authors_avg": [ 907.25, 438.44234227546957 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.48420012470625223, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 52, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=5559583697568820733&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=RShaMexjc-x", "email": "inria.fr;polytechnique.edu;univ-cotedazur.fr;ens-lyon.fr;irisa.fr", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "INRIA;Ecole Polytechnique;Universit\u00e9 C\u00f4te d'Azur;Ecole Normale Superieure;Institut de Recherche en Informatique et Automatique", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.inria.fr;https://www.polytechnique.edu;https://www.univ-cotedazur.fr;https://www.ens-lyon.fr;https://www.irisa.fr", "aff_unique_abbr": "INRIA;X;UCA;ENS;IRISA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lyon", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "France" }, { "id": "RVdN1-eDZ1b", "title": "Plug-In Inversion: Model-Agnostic Inversion for Vision with Data Augmentations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing techniques for model inversion typically rely on hard-to-tune regularizers, such as total variation or feature regularization, which must be individually calibrated for each network in order to produce adequate images. In this work, we introduce Plug-In Inversion, which relies on a simple set of augmentations and does not require excessive hyper-parameter tuning. Under our proposed augmentation-based scheme, the same set of augmentation hyper-parameters can be used for inverting a wide range of image classification models, regardless of input dimensions or the architecture. We illustrate the practicality of our approach by inverting Vision Transformers (ViTs) and Multi-Layer Perceptrons (MLPs) trained on the ImageNet dataset, tasks which to the best of our knowledge have not been successfully accomplished by any previous works. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1b60b29e296867f386fdbcdc279277c3f97f2885.zip", "author": "Amin Ghiasi;Hamid Kazemi;Steven Reich;Chen Zhu;Micah Goldblum;Tom Goldstein", "authorids": "~Amin_Ghiasi1;~Hamid_Kazemi1;~Steven_Reich1;~Chen_Zhu2;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "M;M;M;M;;M", "homepage": "http://cs.umd.edu/~amin;;https://www.cs.umd.edu/people/sreich47;http://www.cs.umd.edu/~chenzhu/;;https://www.cs.umd.edu/~tomg/", "dblp": "239/8313;;;59/10522-1.html;241/7231;25/8184", "google_scholar": "tNQWOxUAAAAJ;7hNdaGQAAAAJ;https://scholar.google.com/citations?view_op=list_works;m-om5O8AAAAJ;pGDKzuUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;;", "linkedin": ";hamid-kazemi-608a8085/;;;;", "or_profile": "~Amin_Ghiasi1;~Hamid_Kazemi1;~Steven_Reich1;~Chen_Zhu2;~Micah_Goldblum1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;University of Maryland, College Park;;Department of Computer Science, University of Maryland, College Park;New York University;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;;cs.umd.edu;nyu.edu;umd.edu", "position": "PhD student;PhD student;;PhD student;Postdoc;Associate Professor", "bibtex": "@misc{\nghiasi2022plugin,\ntitle={Plug-In Inversion: Model-Agnostic Inversion for Vision with Data Augmentations},\nauthor={Amin Ghiasi and Hamid Kazemi and Steven Reich and Chen Zhu and Micah Goldblum and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=RVdN1-eDZ1b}\n}", "github": "", "project": "", "reviewers": "dsJC;SYwW;vGig;MNJX", "site": "https://openreview.net/forum?id=RVdN1-eDZ1b", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "128;83;176;56", "wc_summary_review": "31;38;118;85", "wc_main_review": "379;338;287;675", "wc_review": "538;459;581;816", "wc_reply_reviewers": "0;128;0;157", "wc_reply_authors": "295;650;229;459", "reply_reviewers": "0;1;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 110.75, 45.61455359860491 ], "wc_summary_review_avg": [ 68.0, 35.55980877338909 ], "wc_main_review_avg": [ 419.75, 150.92941230919837 ], "wc_review_avg": [ 598.5, 132.97838170168865 ], "wc_reply_reviewers_avg": [ 71.25, 71.9839391809034 ], "wc_reply_authors_avg": [ 408.25, 162.7688161166014 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3783911125052785325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Maryland;University of Maryland, College Park;New York University", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu;https://www.nyu.edu", "aff_unique_abbr": "UMD;UMD;NYU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "RW_GTtTfHJ6", "title": "Causal Reinforcement Learning using 
Observational and Interventional Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning efficiently a causal model of the environment is a key challenge of model-based RL agents operating in POMDPs. We consider here a scenario where the learning agent has the ability to collect online experiences through direct interactions with the environment (interventional data), but also has access to a large collection of offline experiences, obtained by observing another agent interacting with the environment (observational data). A key ingredient, which makes this situation non-trivial, is that we allow the observed agent to act based on privileged information, hidden from the learning agent. We then ask the following questions: can the online and offline experiences be safely combined for learning a causal transition model ? And can we expect the offline experiences to improve the agent's performances ? To answer these, first we bridge the fields of reinforcement learning and causality, by importing ideas from the well-established causal framework of do-calculus, and expressing model-based reinforcement learning as a causal inference problem. Second, we propose a general yet simple methodology for safely leveraging offline data during learning. In a nutshell, our method relies on learning a latent-based causal transition model that explains both the interventional and observational regimes, and then inferring the standard POMDP transition model via deconfounding using the recovered latent variable. We prove our method is correct and efficient in the sense that it attains better generalization guarantees due to the offline data (in the asymptotic case), and we assess its effectiveness empirically on a series of synthetic toy problems.", "keywords": "reinforcement learning;causality;confounding", "primary_area": "", "supplementary_material": "/attachment/cb349d4a62fdec43db8e73cd118b1f53ea2c8864.zip", "author": "Maxime Gasse;Damien GRASSET;Guillaume Gaudron;Pierre-Yves Oudeyer", "authorids": "~Maxime_Gasse2;~Damien_GRASSET1;~Guillaume_Gaudron1;~Pierre-Yves_Oudeyer1", "gender": "M;M;;M", "homepage": "http://www.maximegasse.com/;;;http://www.pyoudeyer.com", "dblp": "118/4730;;;33/5513", "google_scholar": "https://scholar.google.fr/citations?user=s7m9rikAAAAJ;;;https://scholar.google.fr/citations?user=gCqGj4sAAAAJ", "orcid": "0000-0001-6982-062X;;;", "linkedin": "maxime-gasse-100a4a62/;damien-grasset-2a6971115/;guillaume-gaudron-4bbb75a0/;pierreyvesoudeyer/", "or_profile": "~Maxime_Gasse2;~Damien_GRASSET1;~Guillaume_Gaudron1;~Pierre-Yves_Oudeyer1", "aff": "\u00c9cole Polytechnique de Montr\u00e9al, Universit\u00e9 de Montr\u00e9al;;Ubisoft;Microsoft", "aff_domain": "polymtl.ca;;laforge.ubisoft.com;microsoft.com", "position": "Researcher;;Director La Forge France;Visiting researcher", "bibtex": "@misc{\ngasse2022causal,\ntitle={Causal Reinforcement Learning using Observational and Interventional Data},\nauthor={Maxime Gasse and Damien GRASSET and Guillaume Gaudron and Pierre-Yves Oudeyer},\nyear={2022},\nurl={https://openreview.net/forum?id=RW_GTtTfHJ6}\n}", "github": "", "project": "", "reviewers": "NLhN;cYgx;Xuy7;8zjw", "site": "https://openreview.net/forum?id=RW_GTtTfHJ6", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;3;2;1", "wc_summary_paper": "100;118;59;104", "wc_summary_review": "105;62;19;131", "wc_main_review": "231;447;133;116", "wc_review": "436;627;211;351", 
"wc_reply_reviewers": "255;0;0;122", "wc_reply_authors": "1910;1604;802;847", "reply_reviewers": "2;0;0;1", "reply_authors": "4;3;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 95.25, 21.970150204311302 ], "wc_summary_review_avg": [ 79.25, 42.628482262449836 ], "wc_main_review_avg": [ 231.75, 131.79790400457816 ], "wc_review_avg": [ 406.25, 150.65751723694373 ], "wc_reply_reviewers_avg": [ 94.25, 105.32894901213056 ], "wc_reply_authors_avg": [ 1290.75, 478.9015425951351 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15852012637723132045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "aff_unique_index": "0;1;2", "aff_unique_norm": "\u00c9cole Polytechnique de Montr\u00e9al;Ubisoft;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.polymtl.ca;https://www.ubisoft.com;https://www.microsoft.com", "aff_unique_abbr": "Polytechnique Montr\u00e9al;Ubisoft;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Montr\u00e9al;", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Canada;France;United States" }, { "title": "Anti-Concentrated Confidence Bonuses For Scalable Exploration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6790", "id": "RXQ-FPbQYVn", "poster": "", "openreview": "https://openreview.net/forum?id=RXQ-FPbQYVn", "slides": "https://iclr.cc/virtual/2022/poster/6790", "video": "https://iclr.cc/virtual/2022/poster/6790", "author_site": "Jordan Ash, Cyril Zhang, Surbhi Goel, Akshay Krishnamurthy, Sham M Kakade", "tldr": "", "abstract": "Intrinsic rewards play a central role in handling the exploration-exploitation tradeoff when designing sequential decision-making algorithms, in both foundational theory and state-of-the-art deep reinforcement learning. The LinUCB algorithm, a centerpiece of the stochastic linear bandits literature, prescribes an elliptical bonus which addresses the challenge of leveraging shared information in large action spaces. This bonus scheme cannot be directly transferred to high-dimensional exploration problems, however, due to the computational cost of maintaining the inverse covariance matrix of action features. We introduce anti-concentrated confidence bounds for efficiently approximating the elliptical bonus, using an ensemble of regressors trained to predict random noise from policy network-derived features. Using this approximation, we obtain stochastic linear bandit algorithms which obtain $\\tilde O(d \\sqrt{T})$ regret bounds for $\\mathsf{poly}(d)$ fixed actions. We develop a practical variant that is competitive with contemporary intrinsic reward heuristics on Atari benchmarks.", "keywords": "deep reinforcement learning;reinforcement learning;bandits;exploration", "primary_area": "", "supplementary_material": "/attachment/cc029b6b133f3bddce88b90804802c6b76f479f2.zip", "author": "Jordan T. Ash;Cyril Zhang;Surbhi Goel;Akshay Krishnamurthy;Sham M. 
Kakade", "authorids": "~Jordan_T._Ash1;~Cyril_Zhang1;~Surbhi_Goel1;~Akshay_Krishnamurthy1;~Sham_M._Kakade1", "gender": ";;F;M;M", "homepage": "http://www.jordantash.com;https://cyrilzhang.com;https://www.surbhigoel.com;https://www.cics.umass.edu/~akshay/;https://shamulent.github.io", "dblp": "176/5225;203/4448;190/7815;85/8024;s/SMKakade", "google_scholar": "bmRNH-UAAAAJ;sXtjq8IAAAAJ;https://scholar.google.co.in/citations?user=Zqz4CQoAAAAJ;https://scholar.google.com.tw/citations?user=K0kaNvkAAAAJ;https://scholar.google.com.tw/citations?user=wb-DKCIAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Jordan_T._Ash1;~Cyril_Zhang1;~Surbhi_Goel1;~Akshay_Krishnamurthy1;~Sham_M._Kakade1", "aff": "Microsoft Research;Microsoft;Microsoft Research;Microsoft Research;Harvard University", "aff_domain": "research.microsoft.com;microsoft.com;microsoft.com;research.microsoft.com;harvard.edu", "position": "Postdoc;Senior Researcher;Postdoc;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nash2022anticoncentrated,\ntitle={Anti-Concentrated Confidence Bonuses For Scalable Exploration},\nauthor={Jordan T. Ash and Cyril Zhang and Surbhi Goel and Akshay Krishnamurthy and Sham M. Kakade},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RXQ-FPbQYVn}\n}", "github": "", "project": "", "reviewers": "AKDp;Ezvk;1PxN", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "3;4;4", "correctness": "2;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "127;104;184", "wc_summary_review": "52;36;45", "wc_main_review": "419;189;301", "wc_review": "598;329;530", "wc_reply_reviewers": "116;0;0", "wc_reply_authors": "521;222;163", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 138.33333333333334, 33.62869145371091 ], "wc_summary_review_avg": [ 44.333333333333336, 6.548960901462833 ], "wc_main_review_avg": [ 303.0, 93.90775615819317 ], "wc_review_avg": [ 485.6666666666667, 114.20546785898163 ], "wc_reply_reviewers_avg": [ 38.666666666666664, 54.68292441175968 ], "wc_reply_authors_avg": [ 302.0, 156.7184311645145 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7559289460184545, "corr_recommendation_correctness": 0.7559289460184546, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16870554554862231128&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=RXQ-FPbQYVn", "email": "research.microsoft.com;microsoft.com;microsoft.com;research.microsoft.com;harvard.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Microsoft;Harvard University", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.harvard.edu", "aff_unique_abbr": "MSR;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "RYTBAtyXqJ", "title": "Cartoon Explanations of Image Classifiers", "track": "main", 
"status": "Withdraw", "tldr": "", "abstract": "We present CartoonX (Cartoon Explanation), a novel model-agnostic explanation method tailored towards image classifiers and based on the rate-distortion explanation (RDE) framework. Natural images are roughly piece-wise smooth signals---also called cartoon images---and tend to be sparse in the wavelet domain. CartoonX is the first explanation method to exploit this by requiring its explanations to be sparse in the wavelet domain, thus extracting the relevant piece-wise smooth part of an image instead of relevant pixel-sparse regions. We demonstrate experimentally that CartoonX is not only highly interpretable due to its piece-wise smooth nature but also particularly apt at explaining misclassifications.", "keywords": "Explainable AI;Image Classification;Wavelets", "primary_area": "", "supplementary_material": "", "author": "Stefan Kolek;Duc Anh Nguyen;Ron Levie;Joan Bruna;Gitta Kutyniok", "authorids": "~Stefan_Kolek1;~Duc_Anh_Nguyen2;~Ron_Levie1;~Joan_Bruna1;~Gitta_Kutyniok2", "gender": "M;;;M;F", "homepage": "https://skmda37.github.io/;;;http://cims.nyu.edu/~bruna;https://www.ai.math.lmu.de/kutyniok", "dblp": "304/2478;;;44/8776;13/2736", "google_scholar": "7umQNF8AAAAJ;;;L4bNmsMAAAAJ;https://scholar.google.de/citations?user=JHs9LssAAAAJ", "orcid": ";;;;0000-0001-9738-2487", "linkedin": ";;;;gitta-kutyniok-2606b215/?originalSubdomain=de", "or_profile": "~Stefan_Kolek1;~Duc_Anh_Nguyen2;~Ron_Levie1;~Joan_Bruna1;~Gitta_Kutyniok2", "aff": "Institut f\u00fcr Mathematik;;;New York University;LMU Munich", "aff_domain": "lmu.de;;;nyu.edu;uni-muenchen.de", "position": "PhD student;;;Associate Professor;Full Professor", "bibtex": "@misc{\nkolek2022cartoon,\ntitle={Cartoon Explanations of Image Classifiers},\nauthor={Stefan Kolek and Duc Anh Nguyen and Ron Levie and Joan Bruna and Gitta Kutyniok},\nyear={2022},\nurl={https://openreview.net/forum?id=RYTBAtyXqJ}\n}", "github": "", "project": "", "reviewers": "ZKXx;bd56;52tE;YHdw", "site": "https://openreview.net/forum?id=RYTBAtyXqJ", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;4", "correctness": "2;2;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "66;75;54;75", "wc_summary_review": "143;48;62;27", "wc_main_review": "386;512;572;731", "wc_review": "595;635;688;833", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "455;723;280;733", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 67.5, 8.616843969807043 ], "wc_summary_review_avg": [ 70.0, 43.94883388669147 ], "wc_main_review_avg": [ 550.25, 124.08137450882788 ], "wc_review_avg": [ 687.75, 90.11485726560299 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 547.75, 190.60610562099 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": 0.7543365091413573, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6270070436316559917&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2", "aff_unique_norm": "Institut f\u00fcr Mathematik;New York University;Ludwig Maximilian University of 
Munich", "aff_unique_dep": "Mathematics Department;;", "aff_unique_url": ";https://www.nyu.edu;https://www.lmu.de", "aff_unique_abbr": ";NYU;LMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Munich", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United States" }, { "id": "RbVp8ieInU7", "title": "Low-rank Matrix Recovery with Unknown Correspondence", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study a matrix recovery problem with unknown correspondence: given the observation matrix $M_o=[A,\\tilde P B]$, where $\\tilde P$ is an unknown permutation matrix, we aim to recover the underlying matrix $M=[A,B]$. Such problem commonly arises in many applications where heterogeneous data are utilized and the correspondence among them are unknown, e.g., due to privacy concerns. We show that it is possible to recover $M$ via solving a nuclear norm minimization problem under a proper low-rank condition on $M$, with provable non-asymptotic error bound for the recovery of $M$. We propose an algorithm, $\\text{M}^3\\text{O}$ (Matrix recovery via Min-Max Optimization) which recasts this combinatorial problem as a continuous minimax optimization problem and solves it by proximal gradient with a Max-Oracle. $\\text{M}^3\\text{O}$ can also be applied to a more general scenario where we have missing entries in $M_o$ and multiple groups of data with distinct unknown correspondence. Experiments on simulated data, the MovieLens 100K dataset and Yale B database show that $\\text{M}^3\\text{O}$ achieves state-of-the-art performance over several baselines and can recover the ground-truth correspondence with high accuracy.", "keywords": "low-rank matrix recovery;optimal transport;min-max optimization;permutation matrix", "primary_area": "", "supplementary_material": "/attachment/0559e3e1e5f6e8ae76838d31038a00fe7821a407.zip", "author": "Zhiwei Tang;Tsung-Hui Chang;Xiaojing Ye;Hongyuan Zha", "authorids": "~Zhiwei_Tang1;~Tsung-Hui_Chang1;~Xiaojing_Ye1;~Hongyuan_Zha1", "gender": "M;;Unspecified;", "homepage": "https://zhiweitang-ml.bio;;;", "dblp": ";;01/2390;z/HongyuanZha", "google_scholar": "GN-N9c8AAAAJ;;;n1DQMIsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Zhiwei_Tang1;~Tsung-Hui_Chang1;~Xiaojing_Ye1;~Hongyuan_Zha1", "aff": "Chinese University of HongKong, Shenzhen;;Georgia State University;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "link.cuhk.edu.cn;;gsu.edu;cuhk.edu.cn", "position": "PhD student;;Associate Professor;Full Professor", "bibtex": "@misc{\ntang2022lowrank,\ntitle={Low-rank Matrix Recovery with Unknown Correspondence},\nauthor={Zhiwei Tang and Tsung-Hui Chang and Xiaojing Ye and Hongyuan Zha},\nyear={2022},\nurl={https://openreview.net/forum?id=RbVp8ieInU7}\n}", "github": "", "project": "", "reviewers": "Wx9p;2Tsx;CuTB;YsMp", "site": "https://openreview.net/forum?id=RbVp8ieInU7", "pdf_size": 0, "recommendation": "5;6;8;10", "confidence": "3;5;3;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "162;98;385;66", "wc_summary_review": "49;85;19;59", "wc_main_review": "458;1038;35;42", "wc_review": "669;1221;439;167", "wc_reply_reviewers": "0;1643;0;0", "wc_reply_authors": "653;3028;75;140", "reply_reviewers": "0;4;0;0", "reply_authors": "2;6;1;1", "recommendation_avg": [ 7.25, 1.920286436967152 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 
3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 177.75, 124.5479325400466 ], "wc_summary_review_avg": [ 53.0, 23.62202362203543 ], "wc_main_review_avg": [ 393.25, 409.7605245750254 ], "wc_review_avg": [ 624.0, 387.7847340987007 ], "wc_reply_reviewers_avg": [ 410.75, 711.4398692089163 ], "wc_reply_authors_avg": [ 974.0, 1206.8257952165259 ], "reply_reviewers_avg": [ 1.0, 1.7320508075688772 ], "reply_authors_avg": [ 2.5, 2.0615528128088303 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3758230140014144, "corr_recommendation_correctness": 0.911322376865767, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3403683036323793941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese University of Hong Kong;Georgia State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.gsu.edu", "aff_unique_abbr": "CUHK;GSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "An Explanation of In-context Learning as Implicit Bayesian Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6893", "id": "RdJVFCHjUMI", "poster": "", "openreview": "https://openreview.net/forum?id=RdJVFCHjUMI", "slides": "https://iclr.cc/virtual/2022/poster/6893", "video": "https://iclr.cc/virtual/2022/poster/6893", "author_site": "Sang Michael Xie, Aditi Raghunathan, Percy Liang, Tengyu Ma", "tldr": "", "abstract": "Large language models (LMs) such as GPT-3 have the surprising ability to do in-context learning, where the model learns to do a downstream task simply by conditioning on a prompt consisting of input-output examples. The LM learns from these examples without being explicitly pretrained to learn. Thus, it is unclear what enables in-context learning. In this paper, we study how in-context learning can emerge when pretraining documents have long-range coherence. Here, the LM must infer a latent document-level concept to generate coherent next tokens during pretraining. At test time, in-context learning occurs when the LM also infers a shared latent concept between examples in a prompt. We prove when this occurs despite a distribution mismatch between prompts and pretraining data in a setting where the pretraining distribution is a mixture of HMMs. In contrast to messy large-scale datasets used to train LMs capable of in-context learning, we generate a small-scale synthetic dataset (GINC) where Transformers and LSTMs both exhibit in-context learning. 
Beyond the theory, experiments on GINC exhibit large-scale real-world phenomena including improved in-context performance with model scaling (despite the same pretraining loss), sensitivity to example order, and instances where zero-shot is better than few-shot in-context learning.", "keywords": "in-context learning;language modeling;pre-training;GPT-3", "primary_area": "", "supplementary_material": "/attachment/ffed395625f523367ea3562d91fc323074cf8321.zip", "author": "Sang Michael Xie;Aditi Raghunathan;Percy Liang;Tengyu Ma", "authorids": "~Sang_Michael_Xie1;~Aditi_Raghunathan1;~Percy_Liang1;~Tengyu_Ma1", "gender": ";F;;M", "homepage": "https://cs.stanford.edu/~eix/;https://www.cs.cmu.edu/~aditirag/;https://cs.stanford.edu/~pliang/;http://ai.stanford.edu/~tengyuma/", "dblp": "220/3987;166/1409;04/1701;54/9061", "google_scholar": "EBNa5IEAAAAJ;Ch9iRwQAAAAJ;pouyVyUAAAAJ;i38QlUwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Sang_Michael_Xie1;~Aditi_Raghunathan1;~Percy_Liang1;~Tengyu_Ma1", "aff": "Stanford University;Carnegie Mellon University;Stanford University;Facebook AI Research", "aff_domain": "stanford.edu;cmu.edu;stanford.edu;fb.com", "position": "PhD student;Assistant Professor;Associate Professor;Visiting Scientist", "bibtex": "@inproceedings{\nxie2022an,\ntitle={An Explanation of In-context Learning as Implicit Bayesian Inference},\nauthor={Sang Michael Xie and Aditi Raghunathan and Percy Liang and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RdJVFCHjUMI}\n}", "github": "", "project": "", "reviewers": "BHn6;VX98;qavU;j2L6", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;3;4", "correctness": "4;4;3;3", "technical_novelty": "3;3;4;4", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "165;78;97;57", "wc_summary_review": "135;104;43;351", "wc_main_review": "753;570;305;956", "wc_review": "1053;752;445;1364", "wc_reply_reviewers": "139;249;0;16", "wc_reply_authors": "1749;1218;811;1956", "reply_reviewers": "2;1;0;1", "reply_authors": "3;2;2;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 99.25, 40.511572420729365 ], "wc_summary_review_avg": [ 158.25, 116.10205639867021 ], "wc_main_review_avg": [ 646.0, 239.5861014332843 ], "wc_review_avg": [ 903.5, 341.900935944902 ], "wc_reply_reviewers_avg": [ 101.0, 100.96286446015684 ], "wc_reply_authors_avg": [ 1433.5, 449.0247765992429 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 795, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15144987797628396832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=RdJVFCHjUMI", "email": "stanford.edu;cmu.edu;stanford.edu;fb.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Stanford University;Carnegie Mellon University;Meta", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.stanford.edu;https://www.cmu.edu;https://research.facebook.com", "aff_unique_abbr": "Stanford;CMU;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" 
}, { "title": "Optimal Representations for Covariate Shift", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6901", "id": "Rf58LPCwJj0", "poster": "", "openreview": "https://openreview.net/forum?id=Rf58LPCwJj0", "slides": "https://iclr.cc/virtual/2022/poster/6901", "video": "https://iclr.cc/virtual/2022/poster/6901", "author_site": "Yangjun Ruan, Yann Dubois, Chris Maddison", "tldr": "", "abstract": "Machine learning systems often experience a distribution shift between training and testing. In this paper, we introduce a simple variational objective whose optima are exactly the set of all representations on which risk minimizers are guaranteed to be robust to any distribution shift that preserves the Bayes predictor, e.g., covariate shifts. Our objective has two components. First, a representation must remain discriminative for the task, i.e., some predictor must be able to simultaneously minimize the source and target risk. Second, the representation's marginal support needs to be the same across source and target. We make this practical by designing self-supervised objectives that only use unlabelled data and augmentations to train robust representations. \nOur objectives give insights into the robustness of CLIP, and further improve CLIP's representations to achieve SOTA results on DomainBed.", "keywords": "distribution shift;domain generalization;representation learning;self-supervised learning;invariance;robustness", "primary_area": "", "supplementary_material": "", "author": "Yangjun Ruan;Yann Dubois;Chris J. Maddison", "authorids": "~Yangjun_Ruan1;~Yann_Dubois1;~Chris_J._Maddison1", "gender": "M;M;M", "homepage": "http://www.cs.toronto.edu/~yjruan/;http://yanndubs.github.io/;http://www.cs.toronto.edu/~cmaddis/", "dblp": "237/3892;198/7527;139/1388", "google_scholar": "https://scholar.google.com.hk/citations?user=9AdCSywAAAAJ;bfM1kzAAAAAJ;https://scholar.google.ca/citations?user=WjCG3owAAAAJ", "orcid": ";;", "linkedin": ";duboisyann/;", "or_profile": "~Yangjun_Ruan1;~Yann_Dubois1;~Chris_J_Maddison1", "aff": "University of Toronto;Stanford University;Google", "aff_domain": "toronto.edu;stanford.edu;google.com", "position": "PhD student;PhD student;Researcher", "bibtex": "@inproceedings{\nruan2022optimal,\ntitle={Optimal Representations for Covariate Shift},\nauthor={Yangjun Ruan and Yann Dubois and Chris J. 
Maddison},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Rf58LPCwJj0}\n}", "github": "", "project": "", "reviewers": "b5Jo;hrZK;1s9j", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "3;3;3", "correctness": "3;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "78;94;269", "wc_summary_review": "31;44;39", "wc_main_review": "660;516;218", "wc_review": "769;654;526", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1786;1126;310", "reply_reviewers": "0;0;0", "reply_authors": "5;5;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 147.0, 86.51396804370187 ], "wc_summary_review_avg": [ 38.0, 5.354126134736337 ], "wc_main_review_avg": [ 464.6666666666667, 184.06037656764212 ], "wc_review_avg": [ 649.6666666666666, 99.25164426289818 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1074.0, 603.6952873760073 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 4.333333333333333, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2022985710361753356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=Rf58LPCwJj0", "email": "toronto.edu;stanford.edu;google.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Toronto;Stanford University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.utoronto.ca;https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "U of T;Stanford;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Stanford;Mountain View", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Exploring extreme parameter compression for pre-trained language models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6449", "id": "RftryyYyjiG", "poster": "", "openreview": "https://openreview.net/forum?id=RftryyYyjiG", "slides": "https://iclr.cc/virtual/2022/poster/6449", "video": "https://iclr.cc/virtual/2022/poster/6449", "author_site": "Wang Benyou, Yuxin Ren, Lifeng Shang, Xin Jiang, Qun Liu", "tldr": "", "abstract": "Recent work explored the potential of large-scale Transformer-based pre-trained models, especially Pre-trained Language Models (PLMs) in natural language processing. This raises many concerns from various perspectives, e.g., financial costs and carbon emissions. \nCompressing PLMs like BERT with negligible performance loss for faster inference and cheaper deployment has attracted much attention. In this work, we aim to explore larger compression ratios for PLMs, among which tensor decomposition is a potential but under-investigated one. By comparing existing decomposition methods, Tucker decomposition is found to be parameter-efficient for compression. 
Two decomposition and reconstruction protocols are further proposed to improve the effectiveness and efficiency of Tucker decomposition in parameter compression.\nOur compressed BERT with ${1}/{7}$ parameters in Transformer layers performs on par with, and sometimes slightly better than, the original BERT on the GLUE benchmark. A tiny version achieves 96.7\\% of the performance of BERT-base with $ {1}/{48} $ encoder parameters (i.e., less than 2M parameters excluding the embedding layer) and is \\textbf{$2.7 \\times$} faster at inference. To show that the proposed method is orthogonal to existing compression methods like knowledge distillation, we also explore the benefit of the proposed method on a distilled BERT. ", "keywords": "pre-trained language models;tensor decomposition;compression;BERT", "primary_area": "", "supplementary_material": "", "author": "Benyou Wang;Yuxin Ren;Lifeng Shang;Xin Jiang;Qun Liu", "authorids": "~Benyou_Wang2;~Yuxin_Ren1;~Lifeng_Shang1;~Xin_Jiang1;~Qun_Liu1", "gender": "M;M;M;M;M", "homepage": "https://wabyking.github.io/old.html;https://github.com/twinkle0331;;;http://liuquncn.github.io/", "dblp": "169/1793;;70/4288;42/4142-2;75/4402-1", "google_scholar": "Jk4vJU8AAAAJ;;https://scholar.google.com.hk/citations?user=jMQIjYoAAAAJ;DUfcez0AAAAJ;2HhiGzcAAAAJ", "orcid": "0000-0002-1501-9914;;;0000-0002-9117-8247;0000-0002-7000-1792", "linkedin": ";;;xin-jiang-9577b76/;qunliu/", "or_profile": "~Benyou_Wang2;~Yuxin_Ren1;~Lifeng_Shang1;~Xin_Jiang1;~Qun_Liu1", "aff": "Universita' degli studi di Padova;Tsinghua University;Huawei Technologies Ltd.;Noah\u2019s Ark Lab, Huawei Technologies;Huawei Noah's Ark Lab", "aff_domain": "unipd.it;tsinghua.edu.cn;huawei.com;huawei.com;huawei.com", "position": "PhD student;MS student;Researcher;Principal Researcher;Chief Scientist of Speech and Language Computing", "bibtex": "@inproceedings{\nwang2022exploring,\ntitle={Exploring extreme parameter compression for pre-trained language models},\nauthor={Benyou Wang and Yuxin Ren and Lifeng Shang and Xin Jiang and Qun Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RftryyYyjiG}\n}", "github": "", "project": "", "reviewers": "MtV7;7TPf;rMZS;fa7a", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;5;4", "correctness": "2;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "38;93;222;41", "wc_summary_review": "28;49;96;51", "wc_main_review": "254;270;355;297", "wc_review": "320;412;673;389", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1434;1238;1598;758", "reply_reviewers": "0;0;0;0", "reply_authors": "4;2;3;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 98.5, 74.58049342824168 ], "wc_summary_review_avg": [ 56.0, 24.78911051248108 ], "wc_main_review_avg": [ 294.0, 38.4252521136817 ], "wc_review_avg": [ 448.5, 133.96361446303246 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1257.0, 315.02857013293254 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 26, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=10120048061999340751&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=RftryyYyjiG", "email": "unipd.it;tsinghua.edu.cn;huawei.com;huawei.com;huawei.com", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "University of Padova;Tsinghua University;Huawei", "aff_unique_dep": ";;Huawei Technologies", "aff_unique_url": "https://www.unipd.it;https://www.tsinghua.edu.cn;https://www.huawei.com", "aff_unique_abbr": "Unipd;THU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Italy;China" }, { "id": "Rh3khfuQUYk", "title": "Iterative Decoding for Compositional Generalization in Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning models do well at generalizing to in-distribution data but struggle to generalize compositionally, i.e., to combine a set of learned primitives to solve more complex tasks. In particular, in sequence-to-sequence (seq2seq) learning, transformers are often unable to predict even marginally longer examples than those seen during training. This paper introduces iterative decoding, an alternative to seq2seq learning that (i) improves transformer compositional generalization and (ii) evidences that, in general, seq2seq transformers do not learn iterations that are not unrolled. Inspired by the idea of compositionality---that complex tasks can be solved by composing basic primitives---training examples are broken down into a sequence of intermediate steps that the transformer then learns iteratively. At inference time, the intermediate outputs are fed back to the transformer as intermediate inputs until an end-of-iteration token is predicted. Through numerical experiments, we show that transfomers trained via iterative decoding outperform their seq2seq counterparts on the PCFG dataset, and solve the problem of calculating Cartesian products between vectors longer than those seen during training with 100% accuracy, a task at which seq2seq models have been shown to fail. 
We also illustrate a limitation of iterative decoding, specifically, that it can make sorting harder to learn on the CFQ dataset.", "keywords": "compositional generalization;transformer;compositionality;deep learning;NLP", "primary_area": "", "supplementary_material": "", "author": "Luana Ruiz;Joshua Ainslie;Santiago Ontanon", "authorids": "~Luana_Ruiz1;~Joshua_Ainslie1;~Santiago_Ontanon1", "gender": "F;;", "homepage": "https://sites.google.com/view/luana-ruiz/home;;https://sites.google.com/site/santiagoontanonvillar/", "dblp": ";263/3363;https://dblp.org/pers/o/Onta=ntilde==oacute=n:Santiago.html", "google_scholar": "J-rZew8AAAAJ;;aS-DrOwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Luana_Ruiz1;~Joshua_Ainslie1;~Santiago_Ontanon1", "aff": "School of Engineering and Applied Science, University of Pennsylvania;Google;Drexel University", "aff_domain": "seas.upenn.edu;google.com;drexel.edu", "position": "PhD student;Software Engineer;Associate Professor", "bibtex": "@misc{\nruiz2022iterative,\ntitle={Iterative Decoding for Compositional Generalization in Transformers},\nauthor={Luana Ruiz and Joshua Ainslie and Santiago Ontanon},\nyear={2022},\nurl={https://openreview.net/forum?id=Rh3khfuQUYk}\n}", "github": "", "project": "", "reviewers": "YC1Z;LPPa;BbzM;BXC7", "site": "https://openreview.net/forum?id=Rh3khfuQUYk", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;5", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;0;2;2", "wc_summary_paper": "99;82;106;114", "wc_summary_review": "26;23;61;42", "wc_main_review": "221;289;228;733", "wc_review": "346;394;395;889", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 100.25, 11.797775213996918 ], "wc_summary_review_avg": [ 38.0, 15.116216457830975 ], "wc_main_review_avg": [ 367.75, 212.52926269104685 ], "wc_review_avg": [ 506.0, 222.01013490379216 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8589648667212273978&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Pennsylvania;Google;Drexel University", "aff_unique_dep": "School of Engineering and Applied Science;Google;", "aff_unique_url": "https://www.upenn.edu;https://www.google.com;https://www.drexel.edu", "aff_unique_abbr": "UPenn;Google;Drexel", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Sample and Computation Redistribution for Efficient Face Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6369", "id": "RhB1AdoFfGE", "poster": "", "openreview": "https://openreview.net/forum?id=RhB1AdoFfGE", "slides": "https://iclr.cc/virtual/2022/poster/6369", "video": "https://iclr.cc/virtual/2022/poster/6369", "author_site": "Jia Guo, Jiankang Deng, Alexandros 
Lattas, Stefanos Zafeiriou", "tldr": "", "abstract": "Although tremendous strides have been made in uncontrolled face detection, accurate face detection with a low computation cost remains an open challenge. In this paper, we point out that computation distribution and scale augmentation are the keys to detecting small faces from low-resolution images. Motivated by these observations, we introduce two simple but effective methods: (1) Computation Redistribution (CR), which reallocates the computation between the backbone, neck and head of the model; and (2) Sample Redistribution (SR), which augments training samples for the most needed stages. The proposed Sample and Computation Redistribution for Face Detection (SCRFD) is implemented by a random search in a meticulously designed search space. Extensive experiments conducted on WIDER FACE demonstrate the state-of-the-art accuracy-efficiency trade-off for the proposed SCRFD family across a wide range of compute regimes. In particular, SCRFD-34GF outperforms the best competitor, TinaFace, by $4.78\\%$ (AP at hard set) while being more than 3$\\times$ faster on GPUs with VGA-resolution images. Code is available at: https://github.com/deepinsight/insightface/tree/master/detection/scrfd.", "keywords": "efficient face detection;computation redistribution;sample redistribution", "primary_area": "", "supplementary_material": "/attachment/f9c1bf1166045e338636898678f68d9057ac2060.zip", "author": "Jia Guo;Jiankang Deng;Alexandros Lattas;Stefanos Zafeiriou", "authorids": "~Jia_Guo1;~Jiankang_Deng1;~Alexandros_Lattas1;~Stefanos_Zafeiriou1", "gender": ";M;M;M", "homepage": "https://insightface.ai;https://jiankangdeng.github.io/;;http://www.imperial.ac.uk/people/s.zafeiriou/", "dblp": ";156/7808;221/0633;25/1885.html", "google_scholar": "H_-hMLUAAAAJ;Z_UoQFsAAAAJ;0wJRUlsAAAAJ;QKOH5iYAAAAJ", "orcid": ";0000-0002-3709-6216;0000-0002-9964-6105;", "linkedin": ";jiankang-deng-b45b21b4/?originalSubdomain=uk;;", "or_profile": "~Jia_Guo1;~Jiankang_Deng1;~Alexandros_Lattas1;~Stefanos_Zafeiriou1", "aff": "InsightFace.AI;;Imperial College London;Imperial College London", "aff_domain": "insightface.ai;;imperial.ac.uk;ic.ac.uk", "position": "Researcher;;PhD student;Full Professor", "bibtex": "@inproceedings{\nguo2022sample,\ntitle={Sample and Computation Redistribution for Efficient Face Detection},\nauthor={Jia Guo and Jiankang Deng and Alexandros Lattas and Stefanos Zafeiriou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RhB1AdoFfGE}\n}", "github": "", "project": "", "reviewers": "zu8F;prPQ;Vs5g;oWUC", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;4;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "79;72;49;87", "wc_summary_review": "49;47;51;31", "wc_main_review": "237;137;102;75", "wc_review": "365;256;202;193", "wc_reply_reviewers": "6;6;0;8", "wc_reply_authors": "498;470;638;118", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.75, 14.16642156650719 ], "wc_summary_review_avg": [ 44.5, 7.92148975887743 ], "wc_main_review_avg": [ 137.75, 61.3733451915406 ], "wc_review_avg": [ 254.0, 68.46531968814577 ], "wc_reply_reviewers_avg": [ 5.0, 3.0 ], "wc_reply_authors_avg": 
[ 431.0, 191.59070958686905 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 169, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=249972322094479786&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=RhB1AdoFfGE", "email": "insightface.ai;;imperial.ac.uk;ic.ac.uk", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "InsightFace.AI;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.insightface.ai;https://www.imperial.ac.uk", "aff_unique_abbr": "InsightFace.AI;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United Kingdom" }, { "id": "Rivn22SJjg9", "title": "Contrastive Embeddings for Neural Architectures", "track": "main", "status": "Reject", "tldr": "", "abstract": "The performance of algorithms for neural architecture search strongly depends on the parametrization of the search space. We use contrastive learning to identify networks across different initializations based on their data Jacobians and their number of parameters, and automatically produce the first architecture embeddings independent from the parametrization of the search space. Using our contrastive embeddings, we show that traditional black-box optimization algorithms, without modification, can reach state-of-the-art performance in Neural Architecture Search. As our method provides a unified embedding space, we successfully perform transfer learning between search spaces. Finally, we show the evolution of embeddings during training, motivating future studies into using embeddings at different training stages to gain a deeper understanding of the networks in a search space.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/df760fed9930b5ea9bdcfef9fbc1a00a679ef9d2.zip", "author": "Daniel Hesslow;Iacopo Poli", "authorids": "~Daniel_Hesslow1;~Iacopo_Poli1", "gender": ";M", "homepage": ";", "dblp": ";", "google_scholar": "xslrgtIAAAAJ;vRsrdUgAAAAJ", "orcid": ";0000-0002-0964-0624", "linkedin": ";", "or_profile": "~Daniel_Hesslow1;~Iacopo_Poli1", "aff": "Lighton;LightOn", "aff_domain": "lighton.ai;lighton.ai", "position": "PhD student;Lead Research Scientist", "bibtex": "@misc{\nhesslow2022contrastive,\ntitle={Contrastive Embeddings for Neural Architectures},\nauthor={Daniel Hesslow and Iacopo Poli},\nyear={2022},\nurl={https://openreview.net/forum?id=Rivn22SJjg9}\n}", "github": "", "project": "", "reviewers": "oXnB;pbA7;KncE", "site": "https://openreview.net/forum?id=Rivn22SJjg9", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;4;4", "correctness": "3;4;2", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "100;107;37", "wc_summary_review": "53;34;74", "wc_main_review": "366;304;354", "wc_review": "519;445;465", "wc_reply_reviewers": "173;50;98", "wc_reply_authors": "787;462;591", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 81.33333333333333, 31.47838764754143 ], "wc_summary_review_avg": [ 
53.666666666666664, 16.33673433979046 ], "wc_main_review_avg": [ 341.3333333333333, 26.849374087469688 ], "wc_review_avg": [ 476.3333333333333, 31.255221785949445 ], "wc_reply_reviewers_avg": [ 107.0, 50.61620293937506 ], "wc_reply_authors_avg": [ 613.3333333333334, 133.61719782855633 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1575420103809467732&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "LightOn", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "id": "Rj-x5_ej6B", "title": "Partial Information as Full: Reward Imputation with Sketching in Bandits", "track": "main", "status": "Reject", "tldr": "", "abstract": "We focus on the setting of contextual batched bandit (CBB), where a batch of rewards is observed from the environment in each episode. But these rewards are partial-information feedbacks where the rewards of the non-executed actions are unobserved. Existing approaches for CBB usually ignore the potential rewards of the non-executed actions, resulting in feedback information being underutilized. In this paper, we propose an efficient reward imputation approach using sketching in CBB, which completes the unobserved rewards with the imputed rewards approximating the full-information feedbacks. Specifically, we formulate the reward imputation as a problem of imputation regularized ridge regression, which captures the feedback mechanisms of both the non-executed and executed actions. To reduce the time complexity of reward imputation on a large batch of data, we use randomized sketching for solving the regression problem of imputation. We prove that the proposed reward imputation approach obtains a relative-error bound for sketching approximation, achieves an instantaneous regret with an exponentially-decaying bias and a smaller variance than that without reward imputation, and enjoys a sublinear regret bound against the optimal policy. Moreover, we present two extensions of our approach, including the rate-scheduled version and the version for nonlinear rewards, which makes our approach more feasible. 
Experimental results demonstrated that our approach can outperform the state-of-the-art baselines on a synthetic dataset, the Criteo dataset, and a dataset from a commercial app.", "keywords": "reward imputation;bandit;sketching;regret analysis", "primary_area": "", "supplementary_material": "/attachment/e5f6d1d5a8c29bb5881a3b05a3fc441387daa7a9.zip", "author": "Xiao Zhang;Ninglu Shao;Zihua Si;Jun Xu;Wenhan Wang;hanjing su;Ji-Rong Wen", "authorids": "~Xiao_Zhang7;~Ninglu_Shao1;~Zihua_Si1;~Jun_Xu1;~Wenhan_Wang3;~hanjing_su1;~Ji-Rong_Wen1", "gender": "M;M;M;M;M;M;M", "homepage": "https://pinkfloyd1989.github.io/Xiao_Zhang/;https://rainym00d.github.io/ninglushao.github.io/;;https://gsai.ruc.edu.cn/~junxu;;http://www.52cs.com;https://gsai.ruc.edu.cn/english/jrwen", "dblp": "49/4478-34;;;90/514-1;;;w/JRWen", "google_scholar": "https://scholar.google.com.hk/citations?user=5FZ6wbAAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;su14mcEAAAAJ;;;tbxCHJgAAAAJ", "orcid": "0000-0001-7397-5632;;;;;;0000-0002-9777-9676", "linkedin": ";;;;wenhan-wang-aa0bb378/;;", "or_profile": "~Xiao_Zhang7;~Ninglu_Shao1;~Zihua_Si1;~Jun_Xu1;~Wenhan_Wang3;~hanjing_su1;~Ji-Rong_Wen1", "aff": "Renmin University of China;Renmin University of China;Renmin University of China;Renmin University of China;Tencent Group;;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;ruc.edu.cn;ruc.edu.cn;tencent.com;;ruc.edu.cn", "position": "Postdoc;Undergrad student;Undergrad student;Full Professor;Researcher;;Full Professor", "bibtex": "@misc{\nzhang2022partial,\ntitle={Partial Information as Full: Reward Imputation with Sketching in Bandits},\nauthor={Xiao Zhang and Ninglu Shao and Zihua Si and Jun Xu and Wenhan Wang and hanjing su and Ji-Rong Wen},\nyear={2022},\nurl={https://openreview.net/forum?id=Rj-x5_ej6B}\n}", "github": "", "project": "", "reviewers": "FYMj;cFbQ;yEx1", "site": "https://openreview.net/forum?id=Rj-x5_ej6B", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "2;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "181;98;95", "wc_summary_review": "146;95;65", "wc_main_review": "1153;333;201", "wc_review": "1480;526;361", "wc_reply_reviewers": "567;0;0", "wc_reply_authors": "1197;546;290", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 124.66666666666667, 39.852505846210256 ], "wc_summary_review_avg": [ 102.0, 33.436506994600975 ], "wc_main_review_avg": [ 562.3333333333334, 421.12653152651825 ], "wc_review_avg": [ 789.0, 493.2321968403928 ], "wc_reply_reviewers_avg": [ 189.0, 267.28636328851496 ], "wc_reply_authors_avg": [ 677.6666666666666, 381.8065245935724 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Bvs6NeGoM0oJ:scholar.google.com/&scioq=Partial+Information+as+Full:+Reward+Imputation+with+Sketching+in+Bandits&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Renmin University of China;Tencent", 
"aff_unique_dep": ";Tencent", "aff_unique_url": "http://www.ruc.edu.cn;https://www.tencent.com", "aff_unique_abbr": "RUC;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "Rj2qQDm_rxe", "title": "KIMERA: Injecting Domain Knowledge into Vacant Transformer Heads", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Training transformer language models requires vast amounts of text and computational resources. This drastically limits the usage of these models in niche domains for which they are not optimized, or where domain-specific training data is scarce. We focus here on the clinical domain because of its limited access to training data in common tasks, while structured ontological data is often readily available. Recent observations in model compression of transformer models show optimization potential in improving the representation capacity of attention heads. We propose KIMERA (Knowledge Injection via Mask Enforced Retraining of Attention) for detecting, retraining and instilling attention heads with complementary structured domain knowledge. \nOur novel multi-task training scheme effectively identifies and targets individual attention heads that are least useful for a given downstream task and optimizes their representation with information from structured data. \nDue to its multi-task nature KIMERA generalizes well, thereby building the basis for an efficient fine-tuning.\nKIMERA achieves significant performance boosts on seven datasets in the medical domain in Information Retrieval and Clinical Outcome Prediction settings. We apply KIMERA to BERT-base to evaluate the extent of the domain transfer and also improve on the already strong results of BioBERT in the clinical domain.", "keywords": "Transformer;Domain Adaption;Medical;Clinical;Attention", "primary_area": "", "supplementary_material": "", "author": "Benjamin Winter;Alexei Figueroa;Alexander L\u00f6ser;Felix Alexander Gers;Amy Siu", "authorids": "~Benjamin_Winter1;afigueroa@beuth-hochschule.de;~Alexander_L\u00f6ser1;~Felix_Alexander_Gers1;~Amy_Siu1", "gender": "M;;;;F", "homepage": ";;;;https://prof.bht-berlin.de/siu/", "dblp": ";;36/979;;127/1469", "google_scholar": ";;;;MotmYWgAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Benjamin_Winter1;afigueroa@beuth-hochschule.de;~Alexander_L\u00f6ser1;~Felix_Alexander_Gers1;~Amy_Siu1", "aff": "Beuth University of Applied Sciences;;;;Beuth Hochschule f\u00fcr Technik Berlin", "aff_domain": "beuth-hochschule.de;;;;beuth-hochschule.de", "position": "PhD student;;;;Professor", "bibtex": "@misc{\nwinter2022kimera,\ntitle={{KIMERA}: Injecting Domain Knowledge into Vacant Transformer Heads},\nauthor={Benjamin Winter and Alexei Figueroa and Alexander L{\\\"o}ser and Felix Alexander Gers and Amy Siu},\nyear={2022},\nurl={https://openreview.net/forum?id=Rj2qQDm_rxe}\n}", "github": "", "project": "", "reviewers": "HMsC;BBNz;ji6r;x5Jr", "site": "https://openreview.net/forum?id=Rj2qQDm_rxe", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;5;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "89;136;342;97", "wc_summary_review": "76;137;65;21", "wc_main_review": "905;454;526;158", "wc_review": "1070;727;933;276", "wc_reply_reviewers": "285;0;66;0", "wc_reply_authors": "567;363;321;197", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": 
[ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 166.0, 103.15764634771385 ], "wc_summary_review_avg": [ 74.75, 41.41482222586498 ], "wc_main_review_avg": [ 510.75, 266.13847429486776 ], "wc_review_avg": [ 751.5, 300.4517432134485 ], "wc_reply_reviewers_avg": [ 87.75, 117.02643932035188 ], "wc_reply_authors_avg": [ 362.0, 133.16531079827058 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=786545856797911363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Beuth University of Applied Sciences;Beuth Hochschule f\u00fcr Technik Berlin", "aff_unique_dep": ";", "aff_unique_url": "https://www.beuth-hochschule.de/;https://www.beuth-hochschule.de/", "aff_unique_abbr": "Beuth;BHT Berlin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berlin", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "RjMtFbmETG", "title": "Resmax: An Alternative Soft-Greedy Operator for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Soft-greedy operators, namely $\\varepsilon$-greedy and softmax, remain a common choice to induce a basic level of exploration for action-value methods in reinforcement learning. These operators, however, have a few critical limitations. In this work, we investigate a simple soft-greedy operator, which we call resmax, that takes actions proportionally to their suboptimality gap: the residual to the estimated maximal value. It is simple to use and ensures coverage of the state-space like $\\varepsilon$-greedy, but focuses exploration more on potentially promising actions like softmax. Further, it does not concentrate probability as quickly as softmax, and so better avoids overemphasizing sub-optimal actions that appear high-valued during learning. Additionally, we prove it is a non-expansion for any fixed exploration hyperparameter, unlike the softmax policy which requires a state-action specific temperature to obtain a non-expansion (called mellowmax). 
We empirically validate that resmax is comparable to or outperforms $\\varepsilon$-greedy and softmax across a variety of environments in tabular and deep RL.", "keywords": "exploration;reinforcement learning;action-value methods;soft-greedy operator;softmax;mellowmax;epsilon-greedy;suboptimality gap", "primary_area": "", "supplementary_material": "/attachment/193d0cf8ee54feec9e182c74cd1c0bc6773df94f.zip", "author": "Erfan Miahi;Revan MacQueen;Alex Ayoub;Abbas Masoumzadeh;Martha White", "authorids": "~Erfan_Miahi1;~Revan_MacQueen1;~Alex_Ayoub1;~Abbas_Masoumzadeh1;~Martha_White1", "gender": "M;M;M;;F", "homepage": "https://erfanmhi.github.io/;https://www.revanmacqueen.com/;;https://abbasmz.github.io/;http://marthawhite.ca", "dblp": "249/2764;280/0873;266/8071;256/1094;60/7057", "google_scholar": "7aP0Sp4AAAAJ;https://scholar.google.ca/citations?user=ZXXfcCMAAAAJ;eh0TSgYAAAAJ;https://scholar.google.co.id/citations?user=-nW1974AAAAJ;t5zdD_IAAAAJ", "orcid": "0000-0001-7510-083X;;;;0000-0002-5356-2950", "linkedin": "erfan-miahi-8637a1130/;;;https://linkedin.com/in/amasoumzadeh;", "or_profile": "~Erfan_Miahi1;~Revan_MacQueen1;~Alex_Ayoub1;~Abbas_Masoumzadeh1;~Martha_White1", "aff": "University of Alberta;University of Alberta;Spotify ;University of Alberta;University of Alberta", "aff_domain": "ualberta.ca;ualberta.ca;spotify.com;ualberta.ca;ualberta.ca", "position": "MS student;MS student;Intern;PhD student;Associate Professor", "bibtex": "@misc{\nmiahi2022resmax,\ntitle={Resmax: An Alternative Soft-Greedy Operator for Reinforcement Learning},\nauthor={Erfan Miahi and Revan MacQueen and Alex Ayoub and Abbas Masoumzadeh and Martha White},\nyear={2022},\nurl={https://openreview.net/forum?id=RjMtFbmETG}\n}", "github": "", "project": "", "reviewers": "sPif;Y4xX;ypnG;NLrw", "site": "https://openreview.net/forum?id=RjMtFbmETG", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "65;62;232;41", "wc_summary_review": "224;110;72;44", "wc_main_review": "584;298;333;706", "wc_review": "873;470;637;791", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "294;673;450;1850", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;3", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.0, 76.76913442263108 ], "wc_summary_review_avg": [ 112.5, 68.5036495378166 ], "wc_main_review_avg": [ 480.25, 170.7518301512461 ], "wc_review_avg": [ 692.75, 154.00385547121863 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 816.75, 611.5641319600096 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.28867513459481287, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LcB2-3o2eOkJ:scholar.google.com/&scioq=Resmax:+An+Alternative+Soft-Greedy+Operator+for+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Alberta;Spotify", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;https://www.spotify.com", "aff_unique_abbr": "UAlberta;Spotify", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Canada;Sweden" }, { "id": "RmzNH3A1cWc", "title": "Hardware-Aware Network Transformation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we tackle the problem of network acceleration by proposing hardware-aware network transformation (HANT), an approach that builds on neural architecture search techniques and teacher-student distillation. HANT consists of two phases: in the first phase, it trains many alternative operations for every layer of the teacher network using layer-wise feature map distillation. In the second phase, it solves the combinatorial selection of efficient operations using a novel constrained integer linear optimization approach. In extensive experiments, we show that HANT can successfully accelerate three different families of network architectures (EfficientNetsV1, EfficientNetsV2 and ResNests), over two different target hardware platforms with minimal loss of accuracy. For example, HANT accelerates EfficientNetsV1-B6 by 3.6 with <0.4% drop in top-1 accuracy on ImageNet. When comparing the same latency level, HANT can accelerate EfficientNetV1-B4 to the same latency as EfficientNetV1-B1 while achieving 3% higher accuracy. We also show that applying HANT to EfficientNetV1 results in the automated discovery of the same (qualitative) architecture modifications later incorporated in EfficientNetV2. Finally, HANT\u2019s efficient search allows us to examine a large pool of 197 operations per layer, resulting in new insights into the accuracy-latency tradeoffs for different operations.", "keywords": "Model Compression;NAS;Neural Network Acceleration", "primary_area": "", "supplementary_material": "", "author": "Pavlo Molchanov;Jimmy Hall;Hongxu Yin;Jan Kautz;Nicolo Fusi;Arash Vahdat", "authorids": "~Pavlo_Molchanov1;james.hall@microsoft.com;~Hongxu_Yin2;~Jan_Kautz1;~Nicolo_Fusi1;~Arash_Vahdat3", "gender": "M;;;;M;M", "homepage": ";;;http://jankautz.com;;http://latentspace.cc/", "dblp": "165/8169.html;;;48/6214;86/10995;92/8108", "google_scholar": "J9PoyoIAAAAJ;;;P9FclNEAAAAJ;GldD-lwAAAAJ;https://scholar.google.ca/citations?user=p9-nlRIAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Pavlo_Molchanov1;james.hall@microsoft.com;~Hongxu_Yin2;~Jan_Kautz1;~Nicolo_Fusi1;~Arash_Vahdat3", "aff": "NVIDIA Research;;;NVIDIA;Microsoft;NVIDIA", "aff_domain": "nvidia.com;;;nvidia.com;microsoft.com;nvidia.com", "position": "Research Scientist;;;VP Research;Researcher;Research Scientist", "bibtex": "@misc{\nmolchanov2022hardwareaware,\ntitle={Hardware-Aware Network Transformation},\nauthor={Pavlo Molchanov and Jimmy Hall and Hongxu Yin and Jan Kautz and Nicolo Fusi and Arash Vahdat},\nyear={2022},\nurl={https://openreview.net/forum?id=RmzNH3A1cWc}\n}", "github": "", "project": "", "reviewers": "Ub6D;45NN;tSQs;1itH", "site": "https://openreview.net/forum?id=RmzNH3A1cWc", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;4", "correctness": "1;3;4;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "57;82;84;83", "wc_summary_review": "40;94;34;32", "wc_main_review": "209;459;103;218", "wc_review": "306;635;221;333", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "538;1369;553;207", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], 
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 76.5, 11.280514172678478 ], "wc_summary_review_avg": [ 50.0, 25.573423705088842 ], "wc_main_review_avg": [ 247.25, 130.35024932849188 ], "wc_review_avg": [ 373.75, 156.3927348056808 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 666.75, 428.380890680245 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.7894736842105263, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DkzLvD17SHYJ:scholar.google.com/&scioq=Hardware-Aware+Network+Transformation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "NVIDIA;Microsoft", "aff_unique_dep": "NVIDIA Research;Microsoft Corporation", "aff_unique_url": "https://www.nvidia.com/research;https://www.microsoft.com", "aff_unique_abbr": "NVIDIA;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Rnk6NRGudTa", "title": "Parameterizing Activation Functions for Adversarial Robustness", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks are known to be vulnerable to adversarially perturbed inputs. A commonly used defense is adversarial training, whose performance is influenced by model capacity. While previous works have studied the impact of varying model width and depth on robustness, the impact of increasing capacity by using learnable parametric activation functions (PAFs) has not been studied. We study how using learnable PAFs can improve robustness in conjunction with adversarial training. We first ask the question: how should we incorporate parameters into activation functions to improve robustness? To address this, we analyze the direct impact of activation shape on robustness through PAFs and observe that activation shapes with positive outputs on negative inputs and with high finite curvature can increase robustness. We combine these properties to create a new PAF, which we call Parametric Shifted Sigmoidal Linear Unit (PSSiLU). We then combine PAFs (including PReLU, PSoftplus and PSSiLU) with adversarial training and analyze robust performance. We find that PAFs optimize towards activation shape properties found to directly affect robustness. Additionally, we find that while introducing only 1-2 learnable parameters into the network, smooth PAFs can significantly increase robustness over ReLU. For instance, when trained on CIFAR-10 with additional synthetic data, PSSiLU improves robust accuracy by 4.54% over ReLU on ResNet-18 and 2.69% over ReLU on WRN-28-10 in the $\\ell_{\\infty}$ threat model while adding only 2 additional parameters into the network architecture. The PSSiLU WRN-28-10 model achieves 61.96% AutoAttack accuracy, improving over the state-of-the-art robust accuracy on RobustBench (Croce et. al, 2020). 
Overall, our work puts into context the importance of activation functions in adversarially trained models.", "keywords": "activation functions;adversarial training", "primary_area": "", "supplementary_material": "/attachment/217431fb075949481948adc38a5aec2a1e5da8ea.zip", "author": "Sihui Dai;Saeed Mahloujifar;Prateek Mittal", "authorids": "~Sihui_Dai1;~Saeed_Mahloujifar1;~Prateek_Mittal1", "gender": "F;M;", "homepage": ";https://www.cs.virginia.edu/~sm5fd/;http://www.princeton.edu/~pmittal/", "dblp": "244/9642;208/0825;", "google_scholar": ";kW-hl3YAAAAJ;https://scholar.google.com.tw/citations?user=xTKD8J4AAAAJ", "orcid": ";;0000-0002-4057-0118", "linkedin": ";;", "or_profile": "~Sihui_Dai1;~Saeed_Mahloujifar1;~Prateek_Mittal1", "aff": "Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@misc{\ndai2022parameterizing,\ntitle={Parameterizing Activation Functions for Adversarial Robustness},\nauthor={Sihui Dai and Saeed Mahloujifar and Prateek Mittal},\nyear={2022},\nurl={https://openreview.net/forum?id=Rnk6NRGudTa}\n}", "github": "", "project": "", "reviewers": "WN9k;qjye;d2rp;kVxU", "site": "https://openreview.net/forum?id=Rnk6NRGudTa", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;5;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;3;3;2", "wc_summary_paper": "97;62;150;62", "wc_summary_review": "73;85;69;67", "wc_main_review": "343;387;756;165", "wc_review": "513;534;975;294", "wc_reply_reviewers": "0;563;19;0", "wc_reply_authors": "560;561;574;343", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 92.75, 36.00954734511391 ], "wc_summary_review_avg": [ 73.5, 6.98212002188447 ], "wc_main_review_avg": [ 412.75, 214.90041298238586 ], "wc_review_avg": [ 579.0, 247.19526694498015 ], "wc_reply_reviewers_avg": [ 145.5, 241.16850955296798 ], "wc_reply_authors_avg": [ 509.5, 96.28733042306241 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5719082611221849729&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Tracking the risk of a deployed model and detecting harmful distribution shifts", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6896", "id": "Ro_zAjZppv", "poster": "", "openreview": "https://openreview.net/forum?id=Ro_zAjZppv", "slides": "https://iclr.cc/virtual/2022/poster/6896", "video": "https://iclr.cc/virtual/2022/poster/6896", "author_site": "Aleksandr Podkopaev, Aaditya Ramdas", "tldr": "", "abstract": "When deployed in the real world, machine learning models inevitably encounter changes in the data 
distribution, and certain---but not all---distribution shifts could result in significant performance degradation. In practice, it may make sense to ignore benign shifts, under which the performance of a deployed model does not degrade substantially, making interventions by a human expert (or model retraining) unnecessary. While several works have developed tests for distribution shifts, these typically either use non-sequential methods, or detect arbitrary shifts (benign or harmful), or both. We argue that a sensible method for firing off a warning has to both (a) detect harmful shifts while ignoring benign ones, and (b) allow continuous monitoring of model performance without increasing the false alarm rate. In this work, we design simple sequential tools for testing if the difference between source (training) and target (test) distributions leads to a significant increase in a risk function of interest, like accuracy or calibration. Recent advances in constructing time-uniform confidence sequences allow efficient aggregation of statistical evidence accumulated during the tracking process. The designed framework is applicable in settings where (some) true labels are revealed after the prediction is performed, or when batches of labels become available in a delayed fashion. We demonstrate the efficacy of the proposed framework through an extensive empirical study on a collection of simulated and real datasets.", "keywords": "Distribution shift;sequential testing", "primary_area": "", "supplementary_material": "/attachment/526060ad4ef0325751f4b35f06aea4f0213c9925.zip", "author": "Aleksandr Podkopaev;Aaditya Ramdas", "authorids": "~Aleksandr_Podkopaev1;~Aaditya_Ramdas2", "gender": "M;M", "homepage": "https://sashapodkopaev.com;http://stat.cmu.edu/~aramdas", "dblp": "268/0747;117/3518", "google_scholar": "58-8sF8AAAAJ;ZvFaPxUAAAAJ", "orcid": ";0000-0003-0497-311X", "linkedin": "sasha-podkopaev/;", "or_profile": "~Aleksandr_Podkopaev1;~Aaditya_Ramdas2", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\npodkopaev2022tracking,\ntitle={Tracking the risk of a deployed model and detecting harmful distribution shifts},\nauthor={Aleksandr Podkopaev and Aaditya Ramdas},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ro_zAjZppv}\n}", "github": "", "project": "", "reviewers": "a6Wo;oNqc;mEfm;9YNK;P77d", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "2;3;3;3;4", "correctness": "4;4;3;4;4", "technical_novelty": "3;2;2;4;2", "empirical_novelty": "3;2;2;3;4", "wc_summary_paper": "81;38;77;79;101", "wc_summary_review": "67;43;82;68;146", "wc_main_review": "230;249;215;211;703", "wc_review": "378;330;374;358;950", "wc_reply_reviewers": "0;0;0;93;0", "wc_reply_authors": "714;373;1010;906;1581", "reply_reviewers": "0;0;0;1;0", "reply_authors": "1;1;2;2;3", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 75.2, 20.49780476051033 ], "wc_summary_review_avg": [ 81.2, 34.74132985364838 ], "wc_main_review_avg": [ 321.6, 191.16652426614866 ], "wc_review_avg": [ 478.0, 236.6026204419554 ], "wc_reply_reviewers_avg": [ 18.6, 37.2 ], "wc_reply_authors_avg": [ 916.8, 396.68899657036116 ], 
"reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.6454972243679028, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8801690414550502710&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Ro_zAjZppv", "email": "andrew.cmu.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Language-driven Semantic Segmentation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6809", "id": "RriDjddCLN", "poster": "", "openreview": "https://openreview.net/forum?id=RriDjddCLN", "slides": "https://iclr.cc/virtual/2022/poster/6809", "video": "https://iclr.cc/virtual/2022/poster/6809", "author_site": "Boyi Li, Kilian Weinberger, Serge Belongie, Vladlen Koltun, Rene Ranftl", "tldr": "", "abstract": "We present LSeg, a novel model for language-driven semantic image segmentation. LSeg uses a text encoder to compute embeddings of descriptive input labels (e.g., ``grass'' or ``building'') together with a transformer-based image encoder that computes dense per-pixel embeddings of the input image. The image encoder is trained with a contrastive objective to align pixel embeddings to the text embedding of the corresponding semantic class. The text embeddings provide a flexible label representation in which semantically similar labels map to similar regions in the embedding space (e.g., ``cat'' and ``furry''). This allows LSeg to generalize to previously unseen categories at test time, without retraining or even requiring a single additional training sample. We demonstrate that our approach achieves highly competitive zero-shot performance compared to existing zero- and few-shot semantic segmentation methods, and even matches the accuracy of traditional segmentation algorithms when a fixed label set is provided. 
Code and demo are available at https://github.com/isl-org/lang-seg.", "keywords": "language-driven;semantic segmentation;zero-shot;transformer", "primary_area": "", "supplementary_material": "", "author": "Boyi Li;Kilian Q Weinberger;Serge Belongie;Vladlen Koltun;Rene Ranftl", "authorids": "~Boyi_Li1;~Kilian_Q_Weinberger1;~Serge_Belongie1;~Vladlen_Koltun1;~Rene_Ranftl1", "gender": "F;M;M;M;M", "homepage": "https://sites.google.com/site/boyilics/home;http://www.cs.cornell.edu/~kilian/;https://di.ku.dk/english/staff/?pure=en%2Fpersons%2Fserge-belongie(0ce65383-3761-4b17-948a-83b461e371e2)%2Fpublications.html;http://vladlen.info/;", "dblp": ";88/4801;http://dblp.uni-trier.de/pers/hd/b/Belongie:Serge_J=;66/5458.html;https://dblp.org/pers/hd/r/Ranftl:Ren=eacute=", "google_scholar": ";jsxk8vsAAAAJ;ORr4XJYAAAAJ;kg4bCpgAAAAJ;cwKg158AAAAJ", "orcid": ";0009-0008-9313-7239;0000-0002-0388-5217;0000-0003-0858-0970;", "linkedin": ";;sergebelongie;vladlenkoltun/;", "or_profile": "~Boyi_Li1;~Kilian_Q_Weinberger1;~Serge_Belongie1;~Vladlen_Koltun1;~Rene_Ranftl2", "aff": "Cornell University;ASAPP Inc.;University of Copenhagen;Apple;Epic Games", "aff_domain": "cornell.edu;asapp.com;ku.dk;apple.com;epicgames.com", "position": "PhD;Principal Researcher;Full Professor;Distinguished Scientist;Researcher", "bibtex": "@inproceedings{\nli2022languagedriven,\ntitle={Language-driven Semantic Segmentation},\nauthor={Boyi Li and Kilian Q Weinberger and Serge Belongie and Vladlen Koltun and Rene Ranftl},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RriDjddCLN}\n}", "github": "", "project": "", "reviewers": "u2M8;Cogy;3LsU", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;3;3", "correctness": "2;4;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "18;60;169", "wc_summary_review": "134;55;123", "wc_main_review": "200;164;91", "wc_review": "352;279;383", "wc_reply_reviewers": "0;92;0", "wc_reply_authors": "1088;989;284", "reply_reviewers": "0;1;0", "reply_authors": "5;4;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 82.33333333333333, 63.636118325645505 ], "wc_summary_review_avg": [ 104.0, 34.93804039534368 ], "wc_main_review_avg": [ 151.66666666666666, 45.345586579315764 ], "wc_review_avg": [ 338.0, 43.59663595584718 ], "wc_reply_reviewers_avg": [ 30.666666666666668, 43.36921591277491 ], "wc_reply_authors_avg": [ 787.0, 357.9636853089989 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.699673171197595 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.7559289460184546, "gs_citation": 769, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17851834070670501779&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=RriDjddCLN", "email": "cornell.edu;asapp.com;ku.dk;apple.com;epicgames.com", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Cornell University;ASAPP Inc.;University of Copenhagen;Apple;Epic Games", "aff_unique_dep": ";;;Apple Inc.;", "aff_unique_url": 
"https://www.cornell.edu;https://www.asapp.com;https://www.ku.dk;https://www.apple.com;https://www.epicgames.com", "aff_unique_abbr": "Cornell;ASAPP;UCPH;Apple;Epic Games", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Denmark" }, { "title": "Transformer Embeddings of Irregularly Spaced Events and Their Participants", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6600", "id": "Rty5g9imm7H", "poster": "", "openreview": "https://openreview.net/forum?id=Rty5g9imm7H", "slides": "https://iclr.cc/virtual/2022/poster/6600", "video": "https://iclr.cc/virtual/2022/poster/6600", "author_site": "Hongyuan Mei, Chenghao Yang, Jason Eisner", "tldr": "", "abstract": "The neural Hawkes process (Mei & Eisner, 2017) is a generative model of irregularly spaced sequences of discrete events. To handle complex domains with many event types, Mei et al. (2020a) further consider a setting in which each event in the sequence updates a deductive database of facts (via domain-specific pattern-matching rules); future events are then conditioned on the database contents. They show how to convert such a symbolic system into a neuro-symbolic continuous-time generative model, in which each database fact and possible event has a time-varying embedding that is derived from its symbolic provenance. \n\nIn this paper, we modify both models, replacing their recurrent LSTM-based architectures with flatter attention-based architectures (Vaswani et al., 2017), which are simpler and more parallelizable. This does not appear to hurt our accuracy, which is comparable to or better than that of the original models as well as (where applicable) previous attention-based methods (Zuo et al., 2020; Zhang et al., 2020a).", "keywords": "irregular time series;generative Transformers;neuro-symbolic architectures;logic programming", "primary_area": "", "supplementary_material": "", "author": "Hongyuan Mei;Chenghao Yang;Jason Eisner", "authorids": "~Hongyuan_Mei1;~Chenghao_Yang1;~Jason_Eisner1", "gender": "M;M;M", "homepage": "http://www.cs.jhu.edu/~hmei/;https://yangalan123.github.io/;http://cs.jhu.edu/~jason", "dblp": "164/5576;229/4179;37/3263", "google_scholar": "g_zaiVIAAAAJ;B28fiOAAAAAJ;tjb2UccAAAAJ", "orcid": ";;0000-0002-8861-0772", "linkedin": "hongyuan-mei-57687858?trk=nav_responsive_tab_profile_pic;chenghao-yang-857b51178/;", "or_profile": "~Hongyuan_Mei1;~Chenghao_Yang1;~Jason_Eisner1", "aff": "Toyota Technological Institute at Chicago;Amazon.com, Inc.;Microsoft", "aff_domain": "ttic.edu;amazon.com;microsoft.com", "position": "Research Assistant Professor;Applied Scientist;Director of Research ", "bibtex": "@inproceedings{\nmei2022transformer,\ntitle={Transformer Embeddings of Irregularly Spaced Events and Their Participants},\nauthor={Hongyuan Mei and Chenghao Yang and Jason Eisner},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Rty5g9imm7H}\n}", "github": "", "project": "", "reviewers": "9eGx;2v8P;5rCL;vfGf", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;2;3", "correctness": "4;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "40;103;85;85", "wc_summary_review": "45;44;54;23", "wc_main_review": "243;652;350;94", "wc_review": "328;799;489;202", "wc_reply_reviewers": "139;1013;0;0", "wc_reply_authors": "1185;1744;1331;183", "reply_reviewers": "1;5;0;0", "reply_authors": "3;7;3;1", 
"recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.25, 23.27418097377435 ], "wc_summary_review_avg": [ 41.5, 11.368817000902073 ], "wc_main_review_avg": [ 334.75, 204.48639930322994 ], "wc_review_avg": [ 454.5, 223.3993061761831 ], "wc_reply_reviewers_avg": [ 288.0, 422.40797814435274 ], "wc_reply_authors_avg": [ 1110.75, 573.5304590865248 ], "reply_reviewers_avg": [ 1.5, 2.0615528128088303 ], "reply_authors_avg": [ 3.5, 2.179449471770337 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.6488856845230502, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=901539587982376122&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Rty5g9imm7H", "email": "ttic.edu;amazon.com;microsoft.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Toyota Technological Institute at Chicago;Amazon;Microsoft", "aff_unique_dep": ";Amazon.com, Inc.;Microsoft Corporation", "aff_unique_url": "https://www.tti-chicago.org;https://www.amazon.com;https://www.microsoft.com", "aff_unique_abbr": "TTI Chicago;Amazon;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "RuC5ilX2m6O", "title": "Local Patch AutoAugment with Multi-Agent Collaboration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation (DA) plays a critical role in improving the generalization of deep learning models. Recent works on automatically searching for DA policies from data have achieved great success. However, existing automated DA methods generally perform the search at the image level, which limits the exploration of diversity in local regions. In this paper, we propose a more fine-grained automated DA approach, dubbed Patch AutoAugment, to divide an image into a grid of patches and search for the joint optimal augmentation policies for the patches. We formulate it as a multi-agent reinforcement learning (MARL) problem, where each agent learns an augmentation policy for each patch based on its content together with the semantics of the whole image. The agents cooperate with each other to achieve the optimal augmentation effect of the entire image by sharing a team reward. We show the effectiveness of our method on multiple benchmark datasets of image classification and fine-grained image recognition (e.g., CIFAR-10, CIFAR-100, ImageNet, CUB-200-2011, Stanford Cars and FGVC-Aircraft). 
Extensive experiments demonstrate that our method outperforms the state-of-the-art DA methods while requiring fewer computational resources.", "keywords": "Automatic Data Augmentation;Multi-Agent Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Shiqi Lin;Tao Yu;Ruoyu Feng;Xin Li;Xin Jin;Zhibo Chen", "authorids": "~Shiqi_Lin1;~Tao_Yu4;~Ruoyu_Feng1;~Xin_Li28;~Xin_Jin8;~Zhibo_Chen1", "gender": "M;M;M;M;M;", "homepage": "https://geekyutao.github.io/;;https://lixinustc.github.io;http://home.ustc.edu.cn/~jinxustc/;https://faculty.ustc.edu.cn/chenzhibo;https://ieeexplore.ieee.org/author/37086941833", "dblp": "67/1014-12;251/9999;09/1365-82;68/3340-14;54/6561.html;", "google_scholar": "c76x7k8AAAAJ;Gt4QSSEAAAAJ;sbiY97gAAAAJ;byaSC-kAAAAJ;1ayDJfsAAAAJ;", "orcid": ";;;0000-0002-1820-8358;;", "linkedin": ";;;;;", "or_profile": "~Tao_Yu4;~Ruoyu_Feng1;~Xin_Li28;~Xin_Jin8;~Zhibo_Chen1;~Lin_Shi_Qi5", "aff": "University of Science and Technology of China;University of Science and Technology of China;Microsoft;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;microsoft.com;ustc.edu.cn;ustc.edu.cn;ustc.edu", "position": "PhD student;PhD student;Intern;PhD student;Full Professor;MS student", "bibtex": "@misc{\nlin2022local,\ntitle={Local Patch AutoAugment with Multi-Agent Collaboration},\nauthor={Shiqi Lin and Tao Yu and Ruoyu Feng and Xin Li and Xin Jin and Zhibo Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=RuC5ilX2m6O}\n}", "github": "", "project": "", "reviewers": "CpsG;cJ8q;XdSw;ci48", "site": "https://openreview.net/forum?id=RuC5ilX2m6O", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "118;48;181;122", "wc_summary_review": "22;24;172;44", "wc_main_review": "206;235;779;370", "wc_review": "346;307;1132;536", "wc_reply_reviewers": "0;14;0;0", "wc_reply_authors": "738;672;598;1153", "reply_reviewers": "0;1;0;0", "reply_authors": "3;2;2;3", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 117.25, 47.12417108024289 ], "wc_summary_review_avg": [ 65.5, 62.08663302193154 ], "wc_main_review_avg": [ 397.5, 228.78865793565905 ], "wc_review_avg": [ 580.25, 330.1229884452157 ], "wc_reply_reviewers_avg": [ 3.5, 6.06217782649107 ], "wc_reply_authors_avg": [ 790.25, 215.20963616901545 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15152061449530469305&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "University of Science and Technology of China;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "USTC;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;United States" }, { "id": "RunqFdkPuS", "title": "Self-Supervised 
Modality-Invariant and Modality-Specific Feature Learning for 3D Objects", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While most existing self-supervised 3D feature learning methods mainly focus on point cloud data, this paper explores the inherent multimodal attributes of 3D objects. We propose to jointly learn effective features from different modalities including image, point cloud, and mesh with heterogeneous networks from unlabeled 3D data. Our proposed novel self-supervised model learns two types of distinct features: modality-invariant features and modality-specific features. The modality-invariant features capture high-level semantic information across different modalities with minimum modality discrepancy, while the modality-specific features capture specific characteristics preserved in each modality. These two types of features provide a more comprehensive representation of 3D data. The quality of the learned features is evaluated on different downstream tasks including 3D object recognition, 3D within-modal retrieval, and 3D cross-modal retrieval tasks with three data modalities including image, point cloud, and mesh. Our proposed method significantly outperforms the state-of-the-art self-supervised methods for all three tasks and even achieves comparable performance with the state-of-the-art supervised methods on the ModelNet10 and ModelNet40 datasets.", "keywords": "3D Representation Learning;3D Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Longlong Jing;Zhimin Chen;Bing Li;Yingli Tian", "authorids": "~Longlong_Jing1;~Zhimin_Chen1;bli4@clemson.edu;~Yingli_Tian1", "gender": "M;M;;F", "homepage": "https://longlong-jing.github.io/;https://zhiminc.website;;https://www.ccny.cuny.edu/profiles/yingli-tian", "dblp": "214/9050;;;54/8250", "google_scholar": "lhdhi5wAAAAJ;OIYNwLkAAAAJ;;https://scholar.google.com.tw/citations?user=aAWeB4wAAAAJ", "orcid": ";;;", "linkedin": ";;;yingli-tian-43a86413/", "or_profile": "~Longlong_Jing1;~Zhimin_Chen1;bli4@clemson.edu;~Yingli_Tian1", "aff": "Waymo LLC;Clemson University;;CUNY Graduate Center", "aff_domain": "waymo.com;clemson.edu;;ccny.cuny.edu", "position": "Researcher;PhD student;;Full Professor", "bibtex": "@misc{\njing2022selfsupervised,\ntitle={Self-Supervised Modality-Invariant and Modality-Specific Feature Learning for 3D Objects},\nauthor={Longlong Jing and Zhimin Chen and Bing Li and Yingli Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=RunqFdkPuS}\n}", "github": "", "project": "", "reviewers": "RY4y;6XzF;wWUD", "site": "https://openreview.net/forum?id=RunqFdkPuS", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "62;71;92", "wc_summary_review": "65;83;17", "wc_main_review": "704;169;186", "wc_review": "831;323;295", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "685;569;374", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 75.0, 12.569805089976535 ], "wc_summary_review_avg": [ 55.0, 27.85677655436824 ], "wc_main_review_avg": [ 353.0, 248.29149535710374 ], "wc_review_avg": [ 483.0, 246.33852046861583 ], "wc_reply_reviewers_avg": [ 0, 
0 ], "wc_reply_authors_avg": [ 542.6666666666666, 128.3233762371022 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9819805060619659, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0wgh8vUub-sJ:scholar.google.com/&scioq=Self-Supervised+Modality-Invariant+and+Modality-Specific+Feature+Learning+for+3D+Objects&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Waymo;Clemson University;City University of New York", "aff_unique_dep": ";;", "aff_unique_url": "https://www.waymo.com;https://www.clemson.edu;https://www.gc.cuny.edu", "aff_unique_abbr": "Waymo;Clemson;CUNY GC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Graduate Center", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Rupm2vTg1pe", "title": "The Infinite Contextual Graph Markov Model", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Contextual Graph Markov Model is a deep, unsupervised, and probabilistic model for graphs that is trained incrementally on a layer-by-layer basis. As with most Deep Graph Networks, an inherent limitation is the lack of an automatic mechanism to choose the size of each layer's latent representation. In this paper, we circumvent the problem by extending the Contextual Graph Markov Model with Hierarchical Dirichlet Processes. The resulting model for graphs can automatically adjust the complexity of each layer without the need to perform extensive model selection. To improve the scalability of the method, we introduce a novel approximate inference procedure that better deals with larger graph topologies. The quality of the learned unsupervised representations is then evaluated across a set of eight graph classification tasks, showing competitive performance against end-to-end supervised methods. The analysis is complemented by studies on the importance of depth, hyper-parameters, and compression of the graph embeddings. 
We believe this to be an important step towards the theoretically grounded and automatic construction of deep probabilistic architectures for graphs.", "keywords": "graph neural networks;graph classification;probabilistic models", "primary_area": "", "supplementary_material": "/attachment/c99971f9f1afed349a04fc964ef4049c8da23db5.zip", "author": "Daniele Castellana;Federico Errica;Davide Bacciu;Alessio Micheli", "authorids": "~Daniele_Castellana1;~Federico_Errica1;~Davide_Bacciu1;~Alessio_Micheli2", "gender": ";M;M;M", "homepage": "https://danielecastellana22.github.io/;http://pages.di.unipi.it/errica/;http://pages.di.unipi.it/bacciu/;", "dblp": "220/5487;203/9424;07/6626;34/4759.html", "google_scholar": "Dzv0TCAAAAAJ;https://scholar.google.co.uk/citations?user=VJ0n2gQAAAAJ;https://scholar.google.it/citations?user=1d5n2WkAAAAJ;rnaNixYAAAAJ", "orcid": "0000-0002-7159-2272;0000-0001-5181-2904;0000-0001-5213-2468;0000-0001-5764-5238", "linkedin": "daniele-castellana/;https://it.linkedin.com/in/federicoerrica;bacciu/;", "or_profile": "~Daniele_Castellana1;~Federico_Errica1;~Davide_Bacciu1;~Alessio_Micheli2", "aff": "University of Pisa;University of Pisa;University of Pisa;University of Pisa", "aff_domain": "unipi.it;unipi.it;unipi.it;unipi.it", "position": "Postdoc;PhD student;Full Professor;Full Professor", "bibtex": "@misc{\ncastellana2022the,\ntitle={The Infinite Contextual Graph Markov Model},\nauthor={Daniele Castellana and Federico Errica and Davide Bacciu and Alessio Micheli},\nyear={2022},\nurl={https://openreview.net/forum?id=Rupm2vTg1pe}\n}", "github": "", "project": "", "reviewers": "P2Q1;aqe6;mmkq;ynws", "site": "https://openreview.net/forum?id=Rupm2vTg1pe", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "2;3;4;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;2;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "23;50;58;35", "wc_summary_review": "8;78;44;65", "wc_main_review": "32;238;330;110", "wc_review": "63;366;432;210", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 41.5, 13.5 ], "wc_summary_review_avg": [ 48.75, 26.47050245084139 ], "wc_main_review_avg": [ 177.5, 114.72031206373177 ], "wc_review_avg": [ 267.75, 143.08105220468573 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8605870071068450875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Pisa", "aff_unique_dep": "", "aff_unique_url": "https://www.unipi.it", "aff_unique_abbr": "UNIP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "id": "Rx9luEzcSoy", "title": "Lottery Image Prior", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Neural Networks (DNNs), either pre-trained (e.g., GAN generator) or untrained (e.g., deep image prior), could act as overparameterized image priors that help 
solve various image inverse problems. Since traditional image priors have far fewer parameters, those DNN-based priors naturally invite the curious question: do they really have to be heavily parameterized? Drawing inspiration from the recently prosperous research on the lottery ticket hypothesis (LTH), we conjecture and study a novel \u201clottery image prior\u201d (LIP), stated as: given an (untrained or trained) DNN-based image prior, it will have a sparse subnetwork that can be trained in isolation to match the original DNN\u2019s performance when applied as a prior to various image inverse problems. We conduct extensive experiments in two representative settings: (i) image restoration with the deep image prior, using an untrained DNN; and (ii) compressive sensing image reconstruction, using a pre-trained GAN generator. Our results validate the widespread existence of LIP and show that it can be found by iterative magnitude pruning (IMP) with surrogate tasks. Specifically, we can successfully locate LIP subnetworks in the sparsity range of 20%-86.58% in setting (i), and in the sparsity range of 5%-36% in setting (ii). Those LIP subnetworks also possess high transferability. To the best of our knowledge, this is the first time that LTH has been demonstrated to be relevant in the context of inverse problems or image priors, and such compact DNN-based priors may potentially contribute to practical efficiency. Code will be publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiming Wu;Xiaohan Chen;Yifan Jiang;Pan Zhou;Zhangyang Wang", "authorids": "~Qiming_Wu1;~Xiaohan_Chen1;~Yifan_Jiang2;~Pan_Zhou5;~Zhangyang_Wang1", "gender": ";M;M;M;M", "homepage": ";http://xiaohanchen.com;https://yifanjiang19.github.io/;http://faculty.hust.edu.cn/pzhou/zh_CN/index.htm;https://vita-group.github.io", "dblp": ";94/3802;81/7246-1;84/6614-1;119/4026", "google_scholar": ";https://scholar.google.com/citations?authuser=1;PMeFEOIAAAAJ;cTpFPJgAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0002-0360-0402;;;", "linkedin": ";xiaohan-chen-400b00147/;;;", "or_profile": "~Qiming_Wu1;~Xiaohan_Chen1;~Yifan_Jiang2;~Pan_Zhou5;~Zhangyang_Wang1", "aff": ";University of Texas, Austin;University of Texas, Austin;Huazhong University of Science and Technology;University of Texas, Austin", "aff_domain": ";utexas.edu;utexas.edu;hust.edu.cn;utexas.edu", "position": ";PhD student;PhD student;Professor;Assistant Professor", "bibtex": "@misc{\nwu2022lottery,\ntitle={Lottery Image Prior},\nauthor={Qiming Wu and Xiaohan Chen and Yifan Jiang and Pan Zhou and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=Rx9luEzcSoy}\n}", "github": "", "project": "", "reviewers": "vf8k;jFko;wMsj;Kjf3", "site": "https://openreview.net/forum?id=Rx9luEzcSoy", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;5", "correctness": "3;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "43;111;155;42", "wc_summary_review": "39;55;35;19", "wc_main_review": "243;352;323;237", "wc_review": "325;518;513;298", "wc_reply_reviewers": "0;169;26;0", "wc_reply_authors": "763;1201;776;386", "reply_reviewers": "0;1;1;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 87.75, 47.8506792010312 ], "wc_summary_review_avg": [ 37.0, 
12.806248474865697 ], "wc_main_review_avg": [ 288.75, 49.861683685972736 ], "wc_review_avg": [ 413.5, 102.46096817813113 ], "wc_reply_reviewers_avg": [ 48.75, 70.2330940511665 ], "wc_reply_authors_avg": [ 781.5, 288.43240109252633 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tQuNXM1OgV0J:scholar.google.com/&scioq=Lottery+Image+Prior&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Texas at Austin;Huazhong University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;http://www.hust.edu.cn", "aff_unique_abbr": "UT Austin;HUST", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "Rx_nbGdtRQD", "title": "Coherent and Consistent Relational Transfer Learning with Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Human-defined concepts are inherently transferable, but it is not clear under what conditions they can be modelled effectively by non-symbolic artificial learners.\nThis paper argues that for a transferable concept to be learned, the system of relations that defines it must be coherent across domains.\nThis is to say that the learned concept-specific relations ought to be consistent with respect to a theory that constrains their semantics, and that such consistency must extend beyond the representations encountered in the source domain.\nTo demonstrate this, we first present formal definitions for consistency and coherence, and propose a Dynamic Comparator relation-decoder model designed around these principles. 
\nWe then perform the proposed Partial Relation Transfer learning task on a novel data set, using a neural-symbolic autoencoder architecture that combines sub-symbolic representations with modular relation-decoders.\nBy comparing against several existing relation-decoder models, we show experimentally that relation-decoders which maintain consistency over unobserved regions of representational space retain coherence across domains, whilst achieving better transfer learning performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harald Stromfelt;Luke Dickens;Artur Garcez;Alessandra Russo", "authorids": "~Harald_Stromfelt1;~Luke_Dickens1;~Artur_Garcez1;~Alessandra_Russo1", "gender": "M;M;;F", "homepage": ";http://www.ucl.ac.uk/dis/people/dickens;http://www.staff.city.ac.uk/~aag/;http://www.imperial.ac.uk/people/a.russo/", "dblp": ";30/6365;https://dblp.uni-trier.de/pers/g/Garcez:Artur_S=_d=Avila.html;79/683", "google_scholar": ";;https://scholar.google.co.uk/citations?user=BCpY0gsAAAAJ;https://scholar.google.com.tw/citations?user=_6zceo4AAAAJ", "orcid": ";0000-0003-0896-1407;;0000-0002-3318-8711", "linkedin": "harry-stromfelt-4b2b5a127/;;;alessandra-russo-422b6219/?originalSubdomain=uk", "or_profile": "~Harald_Stromfelt1;~Luke_Dickens1;~Artur_Garcez1;~Alessandra_Russo1", "aff": ";University College London, University of London;City, University of London;Imperial College London", "aff_domain": ";ucl.ac.uk;city.ac.uk;imperial.ac.uk", "position": ";Lecturer;Professor of Computer Science;Full Professor", "bibtex": "@misc{\nstromfelt2022coherent,\ntitle={Coherent and Consistent Relational Transfer Learning with Autoencoders},\nauthor={Harald Stromfelt and Luke Dickens and Artur Garcez and Alessandra Russo},\nyear={2022},\nurl={https://openreview.net/forum?id=Rx_nbGdtRQD}\n}", "github": "", "project": "", "reviewers": "8JLA;TRpG;LAq9;v3U9", "site": "https://openreview.net/forum?id=Rx_nbGdtRQD", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "3;4;4;4", "correctness": "2;2;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "43;39;99;350", "wc_summary_review": "25;28;51;81", "wc_main_review": "280;284;108;660", "wc_review": "348;351;258;1091", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1353;1463;391;1150", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 132.75, 127.6526047521162 ], "wc_summary_review_avg": [ 46.25, 22.442983313276336 ], "wc_main_review_avg": [ 333.0, 201.7201031131999 ], "wc_review_avg": [ 512.0, 336.36810193595943 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1089.25, 418.47841939579155 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8006407690254357, "corr_recommendation_correctness": 0.8320502943378437, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17908645449512726621&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2", "aff_unique_norm": "University College London;City, University of London;Imperial College London", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.ucl.ac.uk;https://www.city.ac.uk;https://www.imperial.ac.uk", "aff_unique_abbr": "UCL;City, University of London;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Looking Back on Learned Experiences For Class/task Incremental Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6430", "id": "RxplU3vmBx", "poster": "", "openreview": "https://openreview.net/forum?id=RxplU3vmBx", "slides": "https://iclr.cc/virtual/2022/poster/6430", "video": "https://iclr.cc/virtual/2022/poster/6430", "author_site": "Mozhgan Pourkeshavarz, Guoying Zhao, Mohammad Sabokrou", "tldr": "", "abstract": "Classical deep neural networks are limited in their ability to learn from emerging streams of training data. When trained sequentially on new or evolving tasks, their performance degrades sharply, making them inappropriate in real-world use cases. Existing methods tackle this by either storing old data samples or updating only a subset of the deep neural network's parameters, which, however, demands a large memory budget or spoils the flexibility of models to learn the incremented task distribution. In this paper, we shed light on an on-call transfer set to provide past experiences whenever a new task arises in the data stream. In particular, we propose Cost-Free Incremental Learning (CF-IL), which not only replays past experiences the model has learned but also does so in a cost-free manner. Towards this end, we introduce a memory recovery paradigm in which we query the network to synthesize past exemplars whenever a new task emerges. Thus, our method needs no extra memory for data buffering or network growing; instead, it calls the proposed memory recovery paradigm to provide past exemplars, named a transfer set, in order to mitigate catastrophic forgetting of the former tasks in the Incremental Learning (IL) setup. Moreover, in contrast with recently proposed methods, the suggested paradigm does not require a parallel architecture since it relies only on the learner network. 
Compared to state-of-the-art techniques that do not buffer past data samples, CF-IL demonstrates significantly better performance on well-known datasets, whether a task oracle is available at test time (Task-IL) or not (Class-IL).", "keywords": "Deep Learning;Class Incremental learning;Continual learning;Experiences", "primary_area": "", "supplementary_material": "", "author": "Mozhgan PourKeshavarzi;Guoying Zhao;Mohammad Sabokrou", "authorids": "m.pourkeshavarz@gmail.com;~Guoying_Zhao3;~Mohammad_Sabokrou1", "gender": ";;M", "homepage": ";https://gyzhao-nm.github.io/Guoying/;https://sabokrou.github.io/", "dblp": ";35/814;163/2030", "google_scholar": ";hzywrFMAAAAJ;jqHXvT0AAAAJ", "orcid": ";0000-0003-3694-206X;", "linkedin": ";;", "or_profile": "m.pourkeshavarz@gmail.com;~Guoying_Zhao3;~Mohammad_Sabokrou1", "aff": ";University of Oulu;Institute for Research in Fundamental Sciences (IPM)", "aff_domain": ";oulu.fi;ipm.ir", "position": ";Full Professor;Assistant Professor", "bibtex": "@inproceedings{\npourkeshavarzi2022looking,\ntitle={Looking Back on Learned Experiences For Class/task Incremental Learning},\nauthor={Mozhgan PourKeshavarzi and Guoying Zhao and Mohammad Sabokrou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=RxplU3vmBx}\n}", "github": "", "project": "", "reviewers": "ruLe;Q41Z;WhxT", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "25;52;44", "wc_summary_review": "19;44;108", "wc_main_review": "447;313;284", "wc_review": "491;409;436", "wc_reply_reviewers": "252;0;419", "wc_reply_authors": "853;1210;1889", "reply_reviewers": "1;0;6", "reply_authors": "2;3;6", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 40.333333333333336, 11.323525167642018 ], "wc_summary_review_avg": [ 57.0, 37.47888294315436 ], "wc_main_review_avg": [ 348.0, 70.99765254335291 ], "wc_review_avg": [ 445.3333333333333, 34.120700787384514 ], "wc_reply_reviewers_avg": [ 223.66666666666666, 172.22530463191396 ], "wc_reply_authors_avg": [ 1317.3333333333333, 429.7009295881135 ], "reply_reviewers_avg": [ 2.3333333333333335, 2.6246692913372702 ], "reply_authors_avg": [ 3.6666666666666665, 1.699673171197595 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13706410822869612262&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=RxplU3vmBx", "email": ";oulu.fi;ipm.ir", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Oulu;Institute for Research in Fundamental Sciences", "aff_unique_dep": ";", "aff_unique_url": "https://www.oulu.fi;http://ipm.ir", "aff_unique_abbr": "UOulu;IPM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Finland;Iran" }, { "id": "Rz9QJ75IPoi", "title": "Scale-Invariant Teaching for Semi-Supervised Object Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent Semi-Supervised Object 
Detection methods are mainly based on self-training, i.e., using a teacher model to generate hard pseudo-labels on unlabeled data as supervisory signals. Although they have achieved some success, the massive number of False Negative samples and the inferior localization precision have received little consideration. Furthermore, the limited annotations in semi-supervised learning amplify the challenges: large variance of object sizes and class imbalance (i.e., the extreme ratio between background and object), hindering the performance of prior art. We address these challenges by introducing a novel approach, Scale-Invariant Teaching (SIT), which is a simple yet effective end-to-end knowledge distillation framework robust to large object size variance and class imbalance. SIT has several appealing benefits compared to previous works.\n(1) SIT imposes a consistency regularization to reduce the prediction discrepancy between objects with different sizes. \n(2) Soft pseudo-labels alleviate the noise problem arising from False Negative samples and inferior localization precision.\n(3) A re-weighting strategy can implicitly screen the potential foreground regions from unlabeled data to reduce the effect of class imbalance. \nExtensive experiments show that SIT consistently outperforms recent state-of-the-art methods and the baseline on different datasets by significant margins. For example, it surpasses the supervised counterpart by more than 10 mAP when using 5% and 10% labeled data on MS-COCO.", "keywords": "Semi-Supervised Learning;Object Detection", "primary_area": "", "supplementary_material": "", "author": "Qiushan Guo;Yizhou Yu;Ping Luo", "authorids": "~Qiushan_Guo1;~Yizhou_Yu3;~Ping_Luo2", "gender": "M;M;", "homepage": "https://guoqiushan.github.io/;https://i.cs.hku.hk/~yzyu/;http://luoping.me/", "dblp": "231/1814;;54/4989-2.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?hl=en", "orcid": ";;0000-0002-6685-7950", "linkedin": ";;", "or_profile": "~Qiushan_Guo1;~Yizhou_Yu3;~Luo_Ping2", "aff": "The University of Hong Kong;;The University of Hong Kong", "aff_domain": "hku.hk;;hku.hk", "position": "PhD student;;Assistant Professor", "bibtex": "@misc{\nguo2022scaleinvariant,\ntitle={Scale-Invariant Teaching for Semi-Supervised Object Detection},\nauthor={Qiushan Guo and Yizhou Yu and Ping Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=Rz9QJ75IPoi}\n}", "github": "", "project": "", "reviewers": "2dFM;Juqd;TME4;d1ou", "site": "https://openreview.net/forum?id=Rz9QJ75IPoi", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;5;4", "correctness": "3;4;3;4", "technical_novelty": "3;1;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "34;90;66;249", "wc_summary_review": "22;55;36;51", "wc_main_review": "207;317;204;291", "wc_review": "263;462;306;591", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "536;760;1052;525", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 109.75, 82.81417451137214 ], "wc_summary_review_avg": [ 41.0, 13.057564857200596 ], "wc_main_review_avg": [ 254.75, 50.11175011910879 ], "wc_review_avg": [ 405.5, 130.20080644911536 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 718.25, 214.29696101438302 ], "reply_reviewers_avg": [ 0, 0 ], 
"reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RB2MFvTvaL0J:scholar.google.com/&scioq=Scale-Invariant+Teaching+for+Semi-Supervised+Object+Detection&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "RzXb6a3H3rs", "title": "Learning to Prompt for Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The mainstream learning paradigm behind continual learning has been to adapt the model parameters to non-stationary data distributions, where catastrophic forgetting is the central challenge. This work explores a new paradigm for continual learning -- learning to dynamically prompt the model to learn tasks sequentially under different task transitions. Specifically, our method, Learning to Prompt for Continual Learning (L2P), prepends a subset of learnable parameters (called Prompts) from a larger set (called Prompt Pool) to the input embeddings. The training objective is designed to dynamically select and update prompts from the prompt pool to learn tasks sequentially given a pretrained backbone model. Under our new framework, instead of mitigating catastrophic forgetting via adapting large model parameters as in the previous continual learning paradigm, we tackle the problem of learning better small prompt parameters. In this framework, the prompt pool explicitly manages task-invariant and task-specific knowledge while maintaining model plasticity. The proposed L2P outperforms previous work in terms of forgetting on all datasets, including rehearsal-based methods on certain benchmarks, with privacy benefits from not requiring access to the data of previous tasks. Moreover, when L2P is additionally equipped with a rehearsal buffer, it matches the performance of training all tasks together, which is often regarded as an upper bound in continual learning. 
Source code will be released.", "keywords": "Continual learning;Prompt-based learning", "primary_area": "", "supplementary_material": "", "author": "Zifeng Wang;Zizhao Zhang;Chen-Yu Lee;Han Zhang;Ruoxi Sun;Xiaoqi Ren;Guolong Su;Vincent Perot;Jennifer Dy;Tomas Pfister", "authorids": "~Zifeng_Wang1;~Zizhao_Zhang3;~Chen-Yu_Lee2;~Han_Zhang1;~Ruoxi_Sun2;xiaoqiren@google.com;~Guolong_Su1;~Vincent_Perot1;~Jennifer_Dy1;~Tomas_Pfister1", "gender": "M;M;;M;F;;M;M;;M", "homepage": "https://kingspencer.github.io/;https://sites.google.com/corp/view/zizhaozhang;https://chl260.github.io/;https://sites.google.com/corp/view/hanzhang;;;https://scholar.google.com/citations?user=XWLOtwQAAAAJ&hl=en;;https://mllabneu.github.io/;http://tomas.pfister.fi", "dblp": "43/7716-2;;04/656;;72/7683;;;227/2509;24/6000;14/8360", "google_scholar": "https://scholar.google.co.il/citations?user=N1uBekcAAAAJ;https://scholar.google.dk/citations?hl=en;uWPUSEgAAAAJ;cxEoVL4AAAAJ;ut1-7LAAAAAJ;;XWLOtwQAAAAJ;RrANep4AAAAJ;6h7b0fAAAAAJ;ahSpJOAAAAAJ", "orcid": ";;;;;;;;;0009-0004-4088-8718", "linkedin": "zifeng-wang-21b069b4/;;chenyulee260/;;;;;vincentperot/;;", "or_profile": "~Zifeng_Wang1;~Zizhao_Zhang3;~Chen-Yu_Lee2;~Han_Zhang1;~Ruoxi_Sun2;xiaoqiren@google.com;~Guolong_Su1;~Vincent_Perot1;~Jennifer_Dy1;~Tomas_Pfister1", "aff": "Northeastern University;Google;Google;Google;Google;;Google;Google;Northeastern University;Google", "aff_domain": "northeastern.edu;google.com;google.com;google.com;google.com;;google.com;google.com;northeastern.edu;google.com", "position": "PhD student;Researcher;Research Scientist;Researcher;Google;;Google;Software Engineer;Full Professor;Head of Research @ Cloud AI", "bibtex": "@misc{\nwang2022learning,\ntitle={Learning to Prompt for Continual Learning},\nauthor={Zifeng Wang and Zizhao Zhang and Chen-Yu Lee and Han Zhang and Ruoxi Sun and Xiaoqi Ren and Guolong Su and Vincent Perot and Jennifer Dy and Tomas Pfister},\nyear={2022},\nurl={https://openreview.net/forum?id=RzXb6a3H3rs}\n}", "github": "", "project": "", "reviewers": "QXhJ;y82g;CrKX", "site": "https://openreview.net/forum?id=RzXb6a3H3rs", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "52;45;71", "wc_summary_review": "19;18;66", "wc_main_review": "111;107;99", "wc_review": "182;170;236", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "414;391;443", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 56.0, 10.98483803552272 ], "wc_summary_review_avg": [ 34.333333333333336, 22.395436042987765 ], "wc_main_review_avg": [ 105.66666666666667, 4.988876515698588 ], "wc_review_avg": [ 196.0, 28.705400188814647 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 416.0, 21.275964529643932 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 1012, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11127330701624169778&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1;1;1;1;0;1", 
"aff_unique_norm": "Northeastern University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.northeastern.edu;https://www.google.com", "aff_unique_abbr": "NEU;Google", "aff_campus_unique_index": "1;1;1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "S-oyLlQ1i-7", "title": "Geon3D: Exploiting 3D Shape Bias towards Building Robust Machine Vision", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Robustness research in machine vision faces a challenge. Many variants of ImageNet-scale robustness benchmarks have been proposed, only to reveal that current vision systems fail under distributional shifts. Although aiming for higher robustness accuracy on these benchmarks is important, we also observe that simply using larger models and larger training datasets may not lead to true robustness, demanding further innovation. To tackle the problem from a new perspective, we encourage closer collaboration between the robustness and 3D vision communities. This proposal is inspired by human vision, which is surprisingly robust to environmental variation, including both naturally occurring disturbances (e.g., fog, snow, occlusion) and artificial corruptions (e.g., adversarial examples). We hypothesize that such robustness, at least in part, arises from our ability to infer 3D geometry from 2D retinal projections---the ability to go from images to their underlying causes, including the 3D scene. In this work, we take a first step toward testing this hypothesis by viewing 3D reconstruction as a pretraining method for building more robust vision systems. We introduce a novel dataset called Geon3D, which is derived from objects that emphasize variation across shape features to which the human visual system is thought to be particularly sensitive. This dataset enables, for the first time, a controlled setting where we can isolate the effect of \u201c3D shape bias\u201d in robustifying neural networks, and informs new approaches for increasing robustness by exploiting 3D vision tasks. Using Geon3D, we find that CNNs pretrained on 3D reconstruction are more resilient to viewpoint change, rotation, and shift than regular CNNs. Further, when combined with adversarial training, models pretrained on 3D reconstruction improve adversarial and common corruption robustness over vanilla adversarially-trained models. 
We hope that our findings and dataset will encourage synergies between robustness researchers, the 3D computer vision community, and computational perception researchers in cognitive science, paving the way for achieving human-like robustness under complex, real-world stimulus conditions.", "keywords": "robustness;common corruptions;adversarial examples;robust vision;vision science", "primary_area": "", "supplementary_material": "/attachment/79178c0f06beed038b966a3f725535814646b26a.zip", "author": "Yutaro Yamada;Yuval Kluger;Sahand Negahban;Ilker Yildirim", "authorids": "~Yutaro_Yamada1;~Yuval_Kluger1;~Sahand_Negahban1;~Ilker_Yildirim2", "gender": ";;;M", "homepage": ";https://medicine.yale.edu/profile/yuval-kluger/;https://sahandnegahban.com/;http://cncl.yale.edu/", "dblp": "172/1440;79/5739;85/3290;", "google_scholar": "0ktnXXMAAAAJ;RDfdfr0AAAAJ;;", "orcid": ";0000-0002-3035-071X;;", "linkedin": ";;;", "or_profile": "~Yutaro_Yamada1;~Yuval_Kluger1;~Sahand_Negahban1;~Ilker_Yildirim2", "aff": "Yale University;Yale University;Yale University;Yale University", "aff_domain": "yale.edu;yale.edu;yale.edu;yale.edu", "position": "PhD student;Professor;Associate Professor;Assistant Professor", "bibtex": "@misc{\nyamada2022geond,\ntitle={Geon3D: Exploiting 3D Shape Bias towards Building Robust Machine Vision},\nauthor={Yutaro Yamada and Yuval Kluger and Sahand Negahban and Ilker Yildirim},\nyear={2022},\nurl={https://openreview.net/forum?id=S-oyLlQ1i-7}\n}", "github": "", "project": "", "reviewers": "5SP9;je8L;f2MT;RQhR", "site": "https://openreview.net/forum?id=S-oyLlQ1i-7", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;3", "correctness": "2;3;2;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "50;101;73;84", "wc_summary_review": "157;94;44;10", "wc_main_review": "364;630;435;151", "wc_review": "571;825;552;245", "wc_reply_reviewers": "0;0;48;0", "wc_reply_authors": "271;461;170;230", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.0, 18.506755523321747 ], "wc_summary_review_avg": [ 76.25, 55.37316588384666 ], "wc_main_review_avg": [ 395.0, 171.26149596450452 ], "wc_review_avg": [ 548.25, 205.59836453629683 ], "wc_reply_reviewers_avg": [ 12.0, 20.784609690826528 ], "wc_reply_authors_avg": [ 283.0, 108.86459479555324 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:S0TlM9ebFZ8J:scholar.google.com/&scioq=Geon3D:+Exploiting+3D+Shape+Bias+towards+Building+Robust+Machine+Vision&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Yale University", "aff_unique_dep": "", "aff_unique_url": "https://www.yale.edu", "aff_unique_abbr": "Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "S-sYYe0P0Hd", "title": "SynCLR: A Synthesis Framework for Contrastive Learning of out-of-domain Speech Representations", "track": "main", "status": "Withdraw", "tldr": "", 
"abstract": "Learning generalizable speech representations for unseen samples in different domains has been a challenge of ever-increasing importance. Although contrastive learning has been a prominent class of representation learning approaches, state-of-the-art (SOTA) contrastive learning methods have been found to have limited ability to learn unseen out-of-domain speech representations. This paper presents SynCLR, a synthesis framework for contrastive learning of speech representations that can be generalized over unseen domains. Specifically, instead of using a data augmentation approach, SynCLR employs data synthesis for multi-view generation. To ensure a highly-varied conditional speech distribution in view generation, we design a novel diffusion-based speech synthesizer. A new contrastive loss is also proposed to construct multiple embedding spaces, each of which preserves view-sensitive information to reduce domain reliance for a better disentanglement. Our experiments showed that SynCLR outperformed the SOTA contrastive learning methods with a 17.2\\% relative reduction in EER for speaker verification tested on an unseen speech corpus, and a considerable 50.8\\% relative reduction in FID on a challenging speech-to-image translation task given out-of-domain test speech.", "keywords": "Contrastive learning;Domain generalization;Speech Synthesis;Diffusion Probabilistic Models", "primary_area": "", "supplementary_material": "", "author": "Rongjie Huang;Max W. Y. Lam;Jun Wang;Dan Su;Dong Yu;Zhou Zhao;Yi Ren", "authorids": "~Rongjie_Huang1;~Max_W._Y._Lam1;~Jun_Wang21;~Dan_Su3;~Dong_Yu2;~Zhou_Zhao2;~Yi_Ren2", "gender": "M;M;;M;M;M;M", "homepage": ";;;;https://sites.google.com/view/dongyu888/;https://dblp.uni-trier.de/pid/75/7785.html?;https://rayeren.github.io/", "dblp": "212/8936-1;200/9096;;;71/4598-1;75/7785;75/6568-6", "google_scholar": "iRHBUsgAAAAJ;R0E0bKkAAAAJ;;yE6WZy4AAAAJ;tMY31_gAAAAJ;https://scholar.google.com.hk/citations?user=IIoFY90AAAAJ;4FA6C0AAAAAJ", "orcid": ";;;;0000-0003-0520-6844;0000-0001-6121-0384;", "linkedin": ";maxingaussian/;;dan-su-4948621a/;dongyu/;;", "or_profile": "~Rongjie_Huang1;~Max_W._Y._Lam1;~Jun_Wang21;~Dan_Su3;~Dong_Yu2;~Zhou_Zhao2;~Yi_Ren2", "aff": "Zhejiang University;Tencent AI Lab;;Tencent AI Lab;Tencent AI Lab;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;tencent.com;;tencent.com;tencent.com;zju.edu.cn;zju.edu.cn", "position": "MS student;Researcher;;Researcher;Distinguished Scientist;Associate Professor;MS student", "bibtex": "@misc{\nhuang2022synclr,\ntitle={Syn{CLR}: A Synthesis Framework for Contrastive Learning of out-of-domain Speech Representations},\nauthor={Rongjie Huang and Max W. Y. 
Lam and Jun Wang and Dan Su and Dong Yu and Zhou Zhao and Yi Ren},\nyear={2022},\nurl={https://openreview.net/forum?id=S-sYYe0P0Hd}\n}", "github": "", "project": "", "reviewers": "v6Q3;Y5JE;298k;4r6u", "site": "https://openreview.net/forum?id=S-sYYe0P0Hd", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "3;1;3;3", "technical_novelty": "3;1;3;3", "empirical_novelty": "3;1;2;2", "wc_summary_paper": "37;72;66;109", "wc_summary_review": "33;46;37;37", "wc_main_review": "295;603;671;624", "wc_review": "365;721;774;770", "wc_reply_reviewers": "0;84;0;0", "wc_reply_authors": "464;504;414;301", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 71.0, 25.622255950637914 ], "wc_summary_review_avg": [ 38.25, 4.763139720814412 ], "wc_main_review_avg": [ 548.25, 148.27234232991668 ], "wc_review_avg": [ 657.5, 170.15948401426235 ], "wc_reply_reviewers_avg": [ 21.0, 36.373066958946424 ], "wc_reply_authors_avg": [ 420.75, 76.13598032467961 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:I5WTgbBg7aUJ:scholar.google.com/&scioq=SynCLR:+A+Synthesis+Framework+for+Contrastive+Learning+of+out-of-domain+Speech+Representations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;0;0", "aff_unique_norm": "Zhejiang University;Tencent", "aff_unique_dep": ";Tencent AI Lab", "aff_unique_url": "https://www.zju.edu.cn;https://ai.tencent.com", "aff_unique_abbr": "ZJU;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "S0NsaRIxvQ", "title": "Adversarial Style Transfer for Robust Policy Optimization in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes an algorithm that aims to improve generalization for reinforcement learning agents by removing overfitting to confounding features. Our approach consists of a max-min game-theoretic objective. A generator transfers the style of observations during reinforcement learning. An additional goal of the generator is to perturb the observations so as to maximize the agent's probability of taking a different action. In contrast, a policy network updates its parameters to minimize the effect of such perturbations, thus staying robust while maximizing the expected future reward. Based on this setup, we propose a practical deep reinforcement learning algorithm, Adversarial Robust Policy Optimization (ARPO), to find an optimal policy that generalizes to unseen environments. We evaluate our approach on visually enriched and diverse Procgen benchmarks. 
Empirically, we observe that our agent ARPO outperforms several state-of-the-art algorithms in generalization and sample efficiency.", "keywords": "Deep Reinforcement Learning;Generalization in Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Md Masudur Rahman;Yexiang Xue", "authorids": "~Md_Masudur_Rahman2;~Yexiang_Xue1", "gender": "M;M", "homepage": "https://mmasudurrah.github.io/;https://www.cs.purdue.edu/people/faculty/yexiang/", "dblp": "08/2425-1;117/4903", "google_scholar": "0nUv7b0AAAAJ;", "orcid": "0000-0002-3633-0621;", "linkedin": "masud99r/;", "or_profile": "~Md_Masudur_Rahman2;~Yexiang_Xue1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nrahman2022adversarial,\ntitle={Adversarial Style Transfer for Robust Policy Optimization in Reinforcement Learning},\nauthor={Md Masudur Rahman and Yexiang Xue},\nyear={2022},\nurl={https://openreview.net/forum?id=S0NsaRIxvQ}\n}", "github": "", "project": "", "reviewers": "FV5x;f6jT;eK4b;WXXr", "site": "https://openreview.net/forum?id=S0NsaRIxvQ", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;3;3;5", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "134;80;48;42", "wc_summary_review": "175;63;48;73", "wc_main_review": "691;195;250;550", "wc_review": "1000;338;346;665", "wc_reply_reviewers": "911;70;0;126", "wc_reply_authors": "1161;699;406;910", "reply_reviewers": "2;1;0;1", "reply_authors": "3;2;1;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.0, 36.46916505762094 ], "wc_summary_review_avg": [ 89.75, 50.016872153304426 ], "wc_main_review_avg": [ 421.5, 206.0685565533956 ], "wc_review_avg": [ 587.25, 272.36682525594046 ], "wc_reply_reviewers_avg": [ 276.75, 368.8952256400183 ], "wc_reply_authors_avg": [ 794.0, 277.3598745312667 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6mu3eby3FIMJ:scholar.google.com/&scioq=Adversarial+Style+Transfer+for+Robust+Policy+Optimization+in+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "S2-p6QiTIxZ", "title": "Active Learning: Sampling in the Least Probable Disagreement Region", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Querying the samples closest to the decision boundary can be an effective active learning strategy for sampling the most uncertain, and thus most informative, samples. This strategy is valid only when the sample's \"closeness\" to the decision boundary can be estimated. 
As a measure of a given sample's closeness to a given decision boundary, this paper considers the least probable disagreement region (LPDR), which measures the smallest perturbation of the decision boundary that alters the prediction for the sample. Experimental results show that the proposed LPDR-based active learning algorithm consistently outperforms other high-performing active learning algorithms and leads to state-of-the-art performance on various datasets and deep networks.", "keywords": "machine learning;active learning;uncertainty;hypothesis perturbation;disagreement region", "primary_area": "", "supplementary_material": "", "author": "Seong Jin Cho;Gwangsu Kim;Chang D. Yoo", "authorids": "~Seong_Jin_Cho1;~Gwangsu_Kim1;~Chang_D._Yoo1", "gender": "M;M;M", "homepage": ";;https://sanctusfactory.com/family.php", "dblp": ";218/3948;31/7819", "google_scholar": ";https://scholar.google.co.kr/citations?user=uD1Osx8AAAAJ;gFWgUQEAAAAJ", "orcid": "0000-0003-4640-7407;;0000-0002-0756-7179", "linkedin": ";;", "or_profile": "~Seong_Jin_Cho1;~Gwangsu_Kim1;~Chang_D._Yoo1", "aff": "Korea Institute of Oriental Medicine;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kiom.re.kr;kaist.ac.kr;kaist.ac.kr", "position": "Researcher;Research Associate Professor;Full Professor", "bibtex": "@misc{\ncho2022active,\ntitle={Active Learning: Sampling in the Least Probable Disagreement Region},\nauthor={Seong Jin Cho and Gwangsu Kim and Chang D. Yoo},\nyear={2022},\nurl={https://openreview.net/forum?id=S2-p6QiTIxZ}\n}", "github": "", "project": "", "reviewers": "Urni;VT7X;nHiD;Dsk8", "site": "https://openreview.net/forum?id=S2-p6QiTIxZ", "pdf_size": 0, "recommendation": "1;5;5;5", "confidence": "4;3;3;4", "correctness": "1;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "264;113;117;115", "wc_summary_review": "131;43;76;70", "wc_main_review": "859;312;503;241", "wc_review": "1254;468;696;426", "wc_reply_reviewers": "0;94;0;0", "wc_reply_authors": "685;315;680;621", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.7320508075688772 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 152.25, 64.53439005677515 ], "wc_summary_review_avg": [ 80.0, 31.960913628993776 ], "wc_main_review_avg": [ 478.75, 239.53535751533636 ], "wc_review_avg": [ 711.0, 329.9045316451412 ], "wc_reply_reviewers_avg": [ 23.5, 40.703193977868615 ], "wc_reply_authors_avg": [ 575.25, 152.3489005539587 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zVl5YkSekEIJ:scholar.google.com/&scioq=Active+Learning:+Sampling+in+the+Least+Probable+Disagreement+Region&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Korea Institute of Oriental Medicine;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.kiom.re.kr;https://www.kaist.ac.kr", "aff_unique_abbr": "KIOM;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "South Korea" }, { "id": "S2pNPZM-w-f", "title": "Input Convex Graph Neural Networks: An Application to Optimal Control and Design Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite the success of modeling networked systems via graph neural networks (GNNs), applying GNNs to model-based control is problematic, since the non-convexity of GNN models hinders solving model-based control problems. In this regard, we propose input convex graph neural networks (ICGNN), whose inputs and outputs are related via convex functions. When ICGNN is used to model the target objective function, the decision-making problem becomes a convex optimization problem due to the convexity of ICGNN, and the corresponding solution can be obtained efficiently. We assess the prediction and control performance of ICGNN on several benchmarks and physical heat diffusion problems, respectively. On the physical heat diffusion problem, we further apply ICGNN to solve a design optimization problem, which seeks the optimal heater allocation while accounting for the optimal operation of the heaters, using a gradient-based method. We cast the design optimization problem as a bi-level optimization problem. There, the input convexity of ICGNN allows us to compute the gradient of the lower-level problem (i.e., the control problem with a given heater allocation) without bias. We confirm that ICGNN significantly outperforms non-input-convex GNNs in solving the design optimization problem.", "keywords": "Graph;Graph Neural Network;Convex;Input-convex;Implicit function theorem.", "primary_area": "", "supplementary_material": "", "author": "Junyoung Park;Chihyeon Song;Jinkyoo Park", "authorids": "~Junyoung_Park1;~Chihyeon_Song1;~Jinkyoo_Park1", "gender": ";;M", "homepage": ";https://github.com/song970407;http://silab.kaist.ac.kr/", "dblp": ";;156/7535", "google_scholar": ";;sH2a0nkAAAAJ", "orcid": ";;0000-0003-2620-1479", "linkedin": ";;", "or_profile": "~Junyoung_Park1;~Chihyeon_Song1;~Jinkyoo_Park1", "aff": ";Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": ";kaist.ac.kr;kaist.ac.kr", "position": ";PhD student;Associate Professor", "bibtex": "@misc{\npark2022input,\ntitle={Input Convex Graph Neural Networks: An Application to Optimal Control and Design Optimization},\nauthor={Junyoung Park and Chihyeon Song and Jinkyoo Park},\nyear={2022},\nurl={https://openreview.net/forum?id=S2pNPZM-w-f}\n}", "github": "", "project": "", "reviewers": "bpPN;5SXY;Mh4M", "site": "https://openreview.net/forum?id=S2pNPZM-w-f", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "2;3;3", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "38;85;13", "wc_summary_review": "38;71;11", "wc_main_review": "265;159;242", "wc_review": "341;315;266", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 45.333333333333336, 29.847761874031512 ], "wc_summary_review_avg": [ 40.0, 24.535688292770594 ], "wc_main_review_avg": [ 222.0, 45.526549030940906 ], "wc_review_avg": [ 307.3333333333333, 31.09483701338357 ], "wc_reply_reviewers_avg": [ 0, 0
], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:T7YNFiFlkacJ:scholar.google.com/&scioq=Input+Convex+Graph+Neural+Networks:+An+Application+to+Optimal+Control+and+Design+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "S352vriz3G", "title": "Icy: A benchmark for measuring compositional inductive bias of emergent communication models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a benchmark \\textsc{Icy} for measuring the compositional inductive bias of models in the context of emergent communications. We devise corrupted compositional grammars that probe for limitations in the compositional inductive bias of frequently used models. We use these corrupted compositional grammars to compare and contrast a wide range of models. We propose a hierarchical model, HU-RNN, which might show an inductive bias towards relocatable atomic groups of tokens, thus potentially encouraging the emergence of words. We experiment with probing for the compositional inductive bias of sender networks in isolation, and also placed end-to-end, with a receiver, as an auto-encoder. We propose a metric of compositionality, Compositional Entropy, that is fast to calculate, and broadly applicable.", "keywords": "emergent communication;compositionality;metrics;language model", "primary_area": "", "supplementary_material": "/attachment/89b74bcfa86ed9e232b613d09a569fb7bb182ed3.zip", "author": "Hugh Perkins", "authorids": "~Hugh_Perkins2", "gender": "", "homepage": "https://github.com/hughperkins", "dblp": "136/5700", "google_scholar": "https://scholar.google.co.uk/citations?user=eT5s06MAAAAJ", "orcid": "", "linkedin": "hughperkins/", "or_profile": "~Hugh_Nicholas_Perkins1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nperkins2022icy,\ntitle={Icy: A benchmark for measuring compositional inductive bias of emergent communication models},\nauthor={Hugh Perkins},\nyear={2022},\nurl={https://openreview.net/forum?id=S352vriz3G}\n}", "github": "", "project": "", "reviewers": "ZwK6;iPy9;c4bC;Qy6J", "site": "https://openreview.net/forum?id=S352vriz3G", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;3;4", "correctness": "2;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "309;120;51;187", "wc_summary_review": "123;39;98;70", "wc_main_review": "504;791;440;907", "wc_review": "936;950;589;1164", "wc_reply_reviewers": "133;0;0;91", "wc_reply_authors": "18;0;0;10", "reply_reviewers": "1;0;0;1", "reply_authors": "1;0;0;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 166.75, 95.16925711594054 ], "wc_summary_review_avg": [ 82.5, 31.34086788842964 ], "wc_main_review_avg": [ 660.5, 194.23246381591312 ], "wc_review_avg": [ 909.75, 206.05384611795043 ], 
"wc_reply_reviewers_avg": [ 56.0, 57.93530875036397 ], "wc_reply_authors_avg": [ 7.0, 7.54983443527075 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R9UQa1JnhpQJ:scholar.google.com/&scioq=Icy:+A+benchmark+for+measuring+compositional+inductive+bias+of+emergent+communication+models&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S3qhbZwzq3H", "title": "Value-aware transformers for 1.5d data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Sparse sequential highly-multivariate data of the form characteristic of hospital in-patient investigation and treatment poses a considerable challenge for representation learning. Such data is neither faithfully reducible to 1d nor dense enough to constitute multivariate series. Conventional models compromise their data by requiring these forms at the point of input. Building on contemporary sequence-modelling architectures we design a value-aware transformer, prompting a reconceptualisation of our data as 1.5-dimensional: a token-value form both respecting its sequential nature and augmenting it with a quantifier. Experiments focused on sequential in-patient laboratory data up to 48hrs after hospital admission show that the value-aware transformer performs favourably versus competitive baselines on in-hospital mortality and length-of-stay prediction within the MIMIC-III dataset.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/63352ecb90debe53f45f227dd83144d7bdc653c3.zip", "author": "James F Cann;Timothy J Roberts;Amy R Tso;Amy Nelson;Parashkev Nachev", "authorids": "~James_F_Cann1;timothy.roberts@ucl.ac.uk;a.tso@ucl.ac.uk;amy.nelson@ucl.ac.uk;~Parashkev_Nachev1", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;Cf10kCIAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~James_F_Cann1;timothy.roberts@ucl.ac.uk;a.tso@ucl.ac.uk;amy.nelson@ucl.ac.uk;~Parashkev_Nachev1", "aff": ";;;;University College London", "aff_domain": ";;;;ucl.ac.uk", "position": ";;;;Full Professor", "bibtex": "@misc{\ncann2022valueaware,\ntitle={Value-aware transformers for 1.5d data},\nauthor={James F Cann and Timothy J Roberts and Amy R Tso and Amy Nelson and Parashkev Nachev},\nyear={2022},\nurl={https://openreview.net/forum?id=S3qhbZwzq3H}\n}", "github": "", "project": "", "reviewers": "qFRi;ph4X;KQ5r", "site": "https://openreview.net/forum?id=S3qhbZwzq3H", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "81;173;66", "wc_summary_review": "41;72;31", "wc_main_review": "383;412;151", "wc_review": "505;657;248", "wc_reply_reviewers": "296;48;54", "wc_reply_authors": "2205;1100;492", "reply_reviewers": "1;1;1", "reply_authors": "4;3;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 106.66666666666667, 47.302807057885644 ], "wc_summary_review_avg": [ 48.0, 17.45470328211473 ], "wc_main_review_avg": [ 315.3333333333333, 116.80277774474753 ], "wc_review_avg": [ 470.0, 168.7977093051522 ], 
"wc_reply_reviewers_avg": [ 132.66666666666666, 115.52008002459525 ], "wc_reply_authors_avg": [ 1265.6666666666667, 709.0727904963088 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BpR5B24mIBIJ:scholar.google.com/&scioq=Value-aware+transformers+for+1.5d+data&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "S5qdnMhf7R", "title": "Lightweight Convolutional Neural Networks By Hypercomplex Parameterization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hypercomplex neural networks have proved to reduce the overall number of parameters while ensuring valuable performances by leveraging the properties of Clifford algebras. Recently, hypercomplex linear layers have been further improved by involving efficient parameterized Kronecker products. In this paper, we define the parameterization of hypercomplex convolutional layers to develop lightweight and efficient large-scale convolutional models. Our method grasps the convolution rules and the filters organization directly from data without requiring a rigidly predefined domain structure to follow. The proposed approach is flexible to operate in any user-defined or tuned domain, from 1D to $n$D regardless of whether the algebra rules are preset. Such a malleability allows processing multidimensional inputs in their natural domain without annexing further dimensions, as done, instead, in quaternion neural networks for 3D inputs like color images. As a result, the proposed method operates with $1/n$ free parameters as regards its analog in the real domain. We demonstrate the versatility of this approach to multiple domains of application by performing experiments on various image as well as audio datasets in which our method outperforms real and quaternion-valued counterparts. 
", "keywords": "Hypercomplex Neural Networks;Lightweight Neural Networks;Quaternion Neural Networks;Parameterized Hypercomplex Convolutions;Hypercomplex Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Eleonora Grassucci;Aston Zhang;Danilo Comminiello", "authorids": "~Eleonora_Grassucci1;~Aston_Zhang2;~Danilo_Comminiello1", "gender": "F;;M", "homepage": "https://sites.google.com/uniroma1.it/eleonoragrassucci/home-page;;https://danilocomminiello.site.uniroma1.it/", "dblp": "275/6348;;33/9433", "google_scholar": "https://scholar.google.it/citations?user=Jcv0TgQAAAAJ;;https://scholar.google.it/citations?user=H3Y52cMAAAAJ", "orcid": "0000-0003-4626-4506;;0000-0003-4067-4504", "linkedin": ";;danilocomminiello/", "or_profile": "~Eleonora_Grassucci1;~Aston_Zhang2;~Danilo_Comminiello1", "aff": "Sapienza University of Rome;;Sapienza University of Rome", "aff_domain": "uniroma1.it;;uniroma1.it", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\ngrassucci2022lightweight,\ntitle={Lightweight Convolutional Neural Networks By Hypercomplex Parameterization},\nauthor={Eleonora Grassucci and Aston Zhang and Danilo Comminiello},\nyear={2022},\nurl={https://openreview.net/forum?id=S5qdnMhf7R}\n}", "github": "", "project": "", "reviewers": "JhvG;6bRZ;MVqX;KMob", "site": "https://openreview.net/forum?id=S5qdnMhf7R", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;2;4;5", "correctness": "3;2;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "56;57;174;63", "wc_summary_review": "51;3;24;35", "wc_main_review": "239;131;119;427", "wc_review": "346;191;317;525", "wc_reply_reviewers": "69;134;0;25", "wc_reply_authors": "1467;988;866;654", "reply_reviewers": "1;2;0;1", "reply_authors": "4;3;3;5", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 87.5, 50.0124984378905 ], "wc_summary_review_avg": [ 28.25, 17.455300054711177 ], "wc_main_review_avg": [ 229.0, 123.49898785010345 ], "wc_review_avg": [ 344.75, 119.26939045706573 ], "wc_reply_reviewers_avg": [ 57.0, 50.85764445980565 ], "wc_reply_authors_avg": [ 993.75, 298.2233852332845 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7492686492653551, "corr_recommendation_correctness": 0.28867513459481287, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4169569508070216823&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Sapienza University of Rome", "aff_unique_dep": "", "aff_unique_url": "https://www.uniroma1.it", "aff_unique_abbr": "Sapienza", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Rome", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "id": "S6eHczgYpnu", "title": "Fast and Sample-Efficient Domain Adaptation for Autoencoder-Based End-to-End Communication", "track": "main", "status": "Reject", "tldr": "", "abstract": "The problem of domain adaptation conventionally considers the setting where a source domain has plenty of labeled data, and a target domain (with a different data distribution) has plenty of unlabeled data but none or very limited labeled data. 
In this paper, we address the setting where the target domain has only limited labeled data from a distribution that is expected to change frequently. We first propose a fast and light-weight method for adapting a Gaussian mixture density network (MDN) using only a small set of target domain samples. This method is well-suited for the setting where the distribution of target data changes rapidly (e.g., a wireless channel), making it challenging to collect a large number of samples and retrain. We then apply the proposed MDN adaptation method to the problem of end-to-end learning of a communication autoencoder, which jointly learns the encoder, decoder, and channel networks to minimize the decoding error rate. However, the error rate of an autoencoder trained on a particular (source) channel distribution can degrade as the channel distribution changes frequently, not allowing enough time for data collection and retraining of the autoencoder to the target channel distribution. We propose a method for adapting the autoencoder without modifying the encoder and decoder neural networks, and adapting only the MDN model of the channel. The method utilizes feature transformations at the decoder to compensate for changes in the channel distribution, and effectively presents to the decoder samples close to the source distribution. Experimental evaluation on simulated datasets and real mmWave wireless channels demonstrates that the proposed methods can adapt the MDN model using a very limited number of samples, and improve or maintain the error rate of the autoencoder under changing channel conditions.", "keywords": "domain adaptation;small target dataset;communication autoencoders;mixture density networks;wireless channel", "primary_area": "", "supplementary_material": "", "author": "Jayaram Raghuram;Yijing Zeng;Dolores Garcia;Somesh Jha;Suman Banerjee;Joerg Widmer;Rafael Ruiz", "authorids": "~Jayaram_Raghuram1;~Yijing_Zeng1;~Dolores_Garcia1;~Somesh_Jha1;~Suman_Banerjee3;~Joerg_Widmer1;~Rafael_Ruiz1", "gender": "M;M;;M;M;M;", "homepage": ";;;;http://pages.cs.wisc.edu/~suman;https://www.joergwidmer.org/;", "dblp": "117/7273;;;j/SomeshJha;;;", "google_scholar": "xvjzWWEAAAAJ;GmNBWSkAAAAJ;https://scholar.google.com/citations?hl=en;BaI7l8QAAAAJ;cLb-v7gAAAAJ;LT1_KV4AAAAJ;", "orcid": "0000-0002-9473-3357;;;;;0000-0001-6667-8779;0000-0002-9421-3415", "linkedin": "jayaram-raghuram-32b66410/;;;;;;", "or_profile": "~Jayaram_Raghuram1;~Yijing_Zeng1;~Dolores_Garcia1;~Somesh_Jha1;~Suman_Banerjee3;~Joerg_Widmer1;~Rafael_Ruiz1", "aff": "University of Wisconsin - Madison;University of Wisconsin, Madison;CERN;Department of Computer Science, University of Wisconsin, Madison;UW-Madison;IMDEA Networks;", "aff_domain": "cs.wisc.edu;wisc.edu;cern.ch;cs.wisc.edu;cs.wisc.edu;imdea.org;", "position": "Researcher;PhD student;Postdoc;Full Professor;Full Professor;Principal Researcher;", "bibtex": "@misc{\nraghuram2022fast,\ntitle={Fast and Sample-Efficient Domain Adaptation for Autoencoder-Based End-to-End Communication},\nauthor={Jayaram Raghuram and Yijing Zeng and Dolores Garcia and Somesh Jha and Suman Banerjee and Joerg Widmer and Rafael Ruiz},\nyear={2022},\nurl={https://openreview.net/forum?id=S6eHczgYpnu}\n}", "github": "", "project": "", "reviewers": "wact;8ana;S81w;brRc;9VpG", "site": "https://openreview.net/forum?id=S6eHczgYpnu", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "3;4;4;3;4", "correctness": "3;2;3;2;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper":
"181;45;88;139;55", "wc_summary_review": "24;14;115;39;40", "wc_main_review": "199;294;571;237;276", "wc_review": "404;353;774;415;371", "wc_reply_reviewers": "0;0;0;0;26", "wc_reply_authors": "948;1202;1904;529;939", "reply_reviewers": "0;0;0;0;1", "reply_authors": "2;2;3;1;2", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 101.6, 51.50378626858418 ], "wc_summary_review_avg": [ 46.4, 35.64603764796306 ], "wc_main_review_avg": [ 315.4, 131.9372578159786 ], "wc_review_avg": [ 463.4, 156.88798551833088 ], "wc_reply_reviewers_avg": [ 5.2, 10.4 ], "wc_reply_authors_avg": [ 1104.4, 454.2662655315712 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 2.0, 0.6324555320336759 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.2721655269759087, "corr_recommendation_correctness": 0.2721655269759087, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5837827934665995797&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin;European Organization for Nuclear Research;IMDEA Networks Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu;https://home.cern;https://www.imdea.org/", "aff_unique_abbr": "UW-Madison;UW;CERN;IMDEA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;1;0;0;2", "aff_country_unique": "United States;Switzerland;Spain" }, { "id": "S7vWxSkqv_M", "title": "Evaluating Predictive Distributions: Does Bayesian Deep Learning Work?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Posterior predictive distributions quantify uncertainties ignored by point estimates.\nThis paper introduces \\textit{The Neural Testbed}, which provides tools for the systematic evaluation of agents that generate such predictions.\nCrucially, these tools assess not only the quality of marginal predictions per input, but also joint predictions given many inputs.\nJoint distributions are often critical for useful uncertainty quantification, but they have been largely overlooked by the Bayesian deep learning community.\nWe benchmark several approaches to uncertainty estimation using a neural-network-based data generating process.\nOur results reveal the importance of evaluation beyond marginal predictions.\nFurther, they reconcile sources of confusion in the field, such as why Bayesian deep learning approaches that generate accurate marginal predictions perform poorly in sequential decision tasks, how incorporating priors can be helpful, and what roles epistemic versus aleatoric uncertainty play when evaluating performance.\nWe also present experiments on real-world challenge datasets, which show a high correlation with testbed results, and that the importance of evaluating joint predictive distributions carries over to real data.\nAs part of this effort, we opensource The Neural Testbed, including all implementations from this paper.", "keywords": "Deep Learning;Bayesian;Uncertainty;Testbed;Opensource Code", "primary_area": "", "supplementary_material": "", "author": "Ian Osband;Zheng Wen;Seyed Mohammad Asghari;Xiuyuan Lu;Morteza Ibrahimi;Vikranth Dwaracherla;Dieterich Lawson;Brendan O'Donoghue;Botao Hao;Benjamin Van 
Roy", "authorids": "~Ian_Osband1;~Zheng_Wen1;~Seyed_Mohammad_Asghari1;~Xiuyuan_Lu1;~Morteza_Ibrahimi2;~Vikranth_Dwaracherla1;~Dieterich_Lawson1;~Brendan_O'Donoghue1;~Botao_Hao1;~Benjamin_Van_Roy3", "gender": "M;M;;F;;M;M;;;M", "homepage": "http://iosband.github.io/;http://zheng-wen.com/;;;;https://vikranth.people.stanford.edu/;;https://haobotao000.github.io/;https://web.stanford.edu/~bvr;http://bodono.github.io/", "dblp": ";;;200/9014;;182/7585;;222/2211;41/4314.html;116/3587", "google_scholar": "https://scholar.google.co.uk/citations?user=QA4o6eYAAAAJ;kK3qvd8AAAAJ;;SPL_2lIAAAAJ;pgcjVaYAAAAJ;ir7j5AkAAAAJ;8xSYX9IAAAAJ;;05sMX8MAAAAJ;0Pzjj-cAAAAJ", "orcid": ";;;;;;;;;", "linkedin": "iosband;;seyed-mohammad-asghari;lxy-lucy/;;;;;;", "or_profile": "~Ian_Osband1;~Zheng_Wen1;~Seyed_Mohammad_Asghari1;~Xiuyuan_Lu1;~Morteza_Ibrahimi2;~Vikranth_Dwaracherla1;~Dieterich_Lawson1;~Botao_Hao1;~Benjamin_Van_Roy3;~Brendan_ODonoghue1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google Deepmind;Google DeepMind;Google DeepMind;;Google Deepmind;Google;Google DeepMind", "aff_domain": "google.com;google.com;deepmind.com;google.com;deepmind.com;deepmind.com;;google.com;google.com;deepmind.com", "position": "Researcher;Research Scientist;Research Engineer;Research Scientist;Researcher;Researcher;;Research Scientist;research scientist;Researcher", "bibtex": "@misc{\nosband2022evaluating,\ntitle={Evaluating Predictive Distributions: Does Bayesian Deep Learning Work?},\nauthor={Ian Osband and Zheng Wen and Seyed Mohammad Asghari and Xiuyuan Lu and Morteza Ibrahimi and Vikranth Dwaracherla and Dieterich Lawson and Brendan O'Donoghue and Botao Hao and Benjamin Van Roy},\nyear={2022},\nurl={https://openreview.net/forum?id=S7vWxSkqv_M}\n}", "github": "", "project": "", "reviewers": "QqqV;4oj5;E6jF;Etjg", "site": "https://openreview.net/forum?id=S7vWxSkqv_M", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;4", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "56;95;66;126", "wc_summary_review": "28;62;30;37", "wc_main_review": "887;472;964;247", "wc_review": "971;629;1060;410", "wc_reply_reviewers": "319;245;932;69", "wc_reply_authors": "1017;713;1302;309", "reply_reviewers": "1;1;2;1", "reply_authors": "3;4;4;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 85.75, 27.2981226460722 ], "wc_summary_review_avg": [ 39.25, 13.5531361684298 ], "wc_main_review_avg": [ 642.5, 295.2257610710827 ], "wc_review_avg": [ 767.5, 261.7045089409046 ], "wc_reply_reviewers_avg": [ 391.25, 325.1402581963667 ], "wc_reply_authors_avg": [ 835.25, 368.3655623154803 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 0.82915619758885 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12110319192689986773&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0;0;1;0;0", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google DeepMind;DeepMind", "aff_unique_url": "https://deepmind.com;https://deepmind.com", "aff_unique_abbr": "DeepMind;DeepMind", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "RvS: What is Essential for Offline RL via Supervised Learning?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6088", "id": "S874XAIpkR-", "poster": "", "openreview": "https://openreview.net/forum?id=S874XAIpkR-", "slides": "https://iclr.cc/virtual/2022/poster/6088", "video": "https://iclr.cc/virtual/2022/poster/6088", "author_site": "Scott Emmons, Benjamin Eysenbach, Ilya Kostrikov, Sergey Levine", "tldr": "", "abstract": "Recent work has shown that supervised learning alone, without temporal difference (TD) learning, can be remarkably effective for offline RL. When does this hold true, and which algorithmic components are necessary? Through extensive experiments, we boil supervised learning for offline RL down to its essential elements. In every environment suite we consider, simply maximizing likelihood with a two-layer feedforward MLP is competitive with state-of-the-art results of substantially more complex methods based on TD learning or sequence modeling with Transformers. Carefully choosing model capacity (e.g., via regularization or architecture) and choosing which information to condition on (e.g., goals or rewards) are critical for performance. These insights serve as a field guide for practitioners doing Reinforcement Learning via Supervised Learning (which we coin RvS learning). They also probe the limits of existing RvS methods, which are comparatively weak on random data, and suggest a number of open problems.", "keywords": "reinforcement learning;deep reinforcement learning;offline reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/28f2fde4e20d2f3da4a7d76aea11928ba93365ef.zip", "author": "Scott Emmons;Benjamin Eysenbach;Ilya Kostrikov;Sergey Levine", "authorids": "~Scott_Emmons1;~Benjamin_Eysenbach1;~Ilya_Kostrikov1;~Sergey_Levine1", "gender": "M;M;M;M", "homepage": "http://scottemmons.com/;https://ben-eysenbach.github.io/;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "180/5699;192/1863;https://dblp.org/pers/k/Kostrikov:Ilya.html;80/7594", "google_scholar": "LoT0z6oAAAAJ;DRnOvU8AAAAJ;PTS2AOgAAAAJ;8R35rCwAAAAJ", "orcid": "0000-0002-7946-7046;0009-0000-7136-6307;;", "linkedin": "scott-emmons-5258005b/;benjamin-eysenbach-a7235775/;;", "or_profile": "~Scott_Emmons1;~Benjamin_Eysenbach1;~Ilya_Kostrikov1;~Sergey_Levine1", "aff": "University of California, Berkeley;Carnegie Mellon University;University of California, Berkeley;Google", "aff_domain": "berkeley.edu;cmu.edu;berkeley.edu;google.com", "position": "PhD student;PhD student;Postdoc;Research Scientist", "bibtex": "@inproceedings{\nemmons2022rvs,\ntitle={RvS: What is Essential for Offline {RL} via Supervised Learning?},\nauthor={Scott Emmons and Benjamin Eysenbach and Ilya Kostrikov and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=S874XAIpkR-}\n}", "github": "", "project": "", "reviewers": "hoyq;dVtg;ehJE;oQXj", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;2;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;1", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "95;79;70;45", "wc_summary_review": "94;40;20;46", "wc_main_review": "243;154;256;281", "wc_review": "432;273;346;372", "wc_reply_reviewers": "252;40;119;61", "wc_reply_authors": "690;374;791;592", "reply_reviewers": 
"1;1;1;1", "reply_authors": "3;2;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 72.25, 18.102140757380052 ], "wc_summary_review_avg": [ 50.0, 27.16615541441225 ], "wc_main_review_avg": [ 233.5, 47.88788990966297 ], "wc_review_avg": [ 355.75, 57.05425049897685 ], "wc_reply_reviewers_avg": [ 118.0, 82.59842613512681 ], "wc_reply_authors_avg": [ 611.75, 154.24716366922277 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6488856845230502, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12909820441441824737&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=S874XAIpkR-", "email": "berkeley.edu;cmu.edu;berkeley.edu;google.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;CMU;Google", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "SC6JbEviuD0", "title": "White Paper Assistance: A Step Forward Beyond the Shortcut Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The promising performances of CNNs often overshadow the need to examine whether they are doing in the way we are actually interested. We show through experiments that even over-parameterized models would still solve a dataset by recklessly leveraging spurious correlations, or so-called ``shortcuts\u2019\u2019. To combat with this unintended propensity, we borrow the idea of printer test page and propose a novel approach called White Paper Assistance. Our proposed method is two-fold; (a) we intentionally involves the white paper to detect the extent to which the model has preference for certain characterized patterns and (b) we debias the model by enforcing it to make a random guess on the white paper. We show the consistent accuracy improvements that are manifest in various architectures, datasets and combinations with other techniques. 
Experiments have also demonstrated the versatility of our approach on imbalanced classification and robustness to corruptions.", "keywords": "Shortcut Learning;Bias;Classification;Imbalanced Classification;Robustness", "primary_area": "", "supplementary_material": "/attachment/46d93ea1e28756f8b1bb7326339baf1653b449bd.zip", "author": "Xuan Cheng;Tianshu Xie;XiaoMin Wang;MingHui Liu;Jiali Deng;Ming Liu", "authorids": "~Xuan_Cheng2;~Tianshu_Xie1;~XiaoMin_Wang2;~MingHui_Liu1;~Jiali_Deng1;~Ming_Liu12", "gender": "M;M;;M;;M", "homepage": ";;;https://ydri.uestc.edu.cn/info/1158/1821.htm;;http://www.scse.uestc.edu.cn/info/1084/7342.htm", "dblp": ";287/4771;https://dblp2.uni-trier.de/pid/37/3003.html;https://dblp.uni-trier.de/pid/62/4909;;20/2039-2", "google_scholar": ";;;;;", "orcid": "0000-0002-4234-6179;0000-0001-7021-8855;0000-0002-7292-120X;;;", "linkedin": ";;;;;", "or_profile": "~Xuan_Cheng2;~Tianshu_Xie1;~XiaoMin_Wang2;~MingHui_Liu1;~Jiali_Deng1;~Ming_Liu12", "aff": "University of Electronic Science and Technology of China;;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;;University of Electronic Science and Technology of China", "aff_domain": "uestc.edu.cn;;uestc.edu.cn;uestc.edu.cn;;uestc.edu.cn", "position": "PhD student;;Associate Professor;PhD student;;Full Professor", "bibtex": "@misc{\ncheng2022white,\ntitle={White Paper Assistance: A Step Forward Beyond the Shortcut Learning},\nauthor={Xuan Cheng and Tianshu Xie and XiaoMin Wang and MingHui Liu and Jiali Deng and Ming Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=SC6JbEviuD0}\n}", "github": "", "project": "", "reviewers": "y5Su;C42n;w6iS;u8Nt", "site": "https://openreview.net/forum?id=SC6JbEviuD0", "pdf_size": 0, "recommendation": "1;3;5;8", "confidence": "5;3;3;4", "correctness": "1;3;3;3", "technical_novelty": "1;2;1;4", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "57;58;34;81", "wc_summary_review": "36;211;51;71", "wc_main_review": "184;339;426;536", "wc_review": "277;608;511;688", "wc_reply_reviewers": "173;269;0;82", "wc_reply_authors": "1223;1460;861;715", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.25, 2.5860201081971503 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 1.224744871391589 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 57.5, 16.62077013859466 ], "wc_summary_review_avg": [ 92.25, 69.6755875468589 ], "wc_main_review_avg": [ 371.25, 128.68833474717124 ], "wc_review_avg": [ 521.0, 154.18657529110632 ], "wc_reply_reviewers_avg": [ 131.0, 100.4614353869185 ], "wc_reply_authors_avg": [ 1064.75, 293.7280842888538 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.32063022053099893, "corr_recommendation_correctness": 0.7255892438417318, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:x3YBDJ8jhI0J:scholar.google.com/&scioq=White+Paper+Assistance:+A+Step+Forward+Beyond+the+Shortcut+Learning&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Electronic Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "https://www.uestc.edu.cn", "aff_unique_abbr": "UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0", "aff_country_unique": "China" }, { "id": "SCSonHu4p0W", "title": "Knowledge Based Multilingual Language Model", "track": "main", "status": "Reject", "tldr": "", "abstract": "Knowledge enriched language representation learning has shown promising performance across various knowledge-intensive NLP tasks. However, existing knowledge based language models are all trained with monolingual knowledge graph data, which limits their application to more languages. In this work, we present a novel framework to pretrain knowledge based multilingual language models (KMLMs). We first generate a large amount of code-switched synthetic sentences and reasoning-based multilingual training data using the Wikidata knowledge graphs. Then based on the intra- and inter-sentence structures of the generated data, we design pretraining tasks to facilitate knowledge learning, which allows the language models to not only memorize the factual knowledge but also learn useful logical patterns. Our pretrained KMLMs demonstrate significant performance improvements on a wide range of knowledge-intensive cross-lingual NLP tasks, including named entity recognition, factual knowledge retrieval, relation classification, and a new task designed by us, namely, logic reasoning. Our code and pretrained language models will be made publicly available.", "keywords": "Language Model;Knowledge;Multilingual", "primary_area": "", "supplementary_material": "/attachment/e9cbdc93e6c4081373d1c97d931235400ea12642.zip", "author": "Linlin Liu;Xin Li;Ruidan He;Lidong Bing;Shafiq Joty;Luo Si", "authorids": "~Linlin_Liu2;~Xin_Li40;~Ruidan_He1;~Lidong_Bing2;~Shafiq_Joty1;~Luo_Si3", "gender": "M;M;F;M;M;", "homepage": ";https://lixin4ever.github.io/;https://sites.google.com/view/ruidan/ruidan-he;https://raihanjoty.github.io/;;https://lidongbing.github.io", "dblp": ";09/1365-56.html;203/9668;62/2078;;53/6625", "google_scholar": "bNsRATcAAAAJ;https://scholar.google.com.hk/citations?user=syD9lxQAAAAJ;https://scholar.google.com.sg/citations?user=FsoIVjAAAAAJ;hR249csAAAAJ;xqEfATIAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Linlin_Liu2;~Xin_Li40;~Ruidan_He1;~Shafiq_Joty1;~Luo_Si3;~Lidong_Bing3", "aff": "Nanyang Technological University;Alibaba Group;;SalesForce.com;Alibaba Group;Alibaba Group", "aff_domain": "ntu.edu.sg;alibaba-inc.com;;salesforce.com;alibaba-inc.com;alibaba-inc.com", "position": "PhD student;Researcher;;Principal Researcher;Alibaba Group Inc;Scientist", "bibtex": "@misc{\nliu2022knowledge,\ntitle={Knowledge Based Multilingual Language Model},\nauthor={Linlin Liu and Xin Li and Ruidan He and Lidong Bing and Shafiq Joty and Luo Si},\nyear={2022},\nurl={https://openreview.net/forum?id=SCSonHu4p0W}\n}", "github": "", "project": "", "reviewers": "Rt8N;C6ht;RyT7;pMnq;cZgu", "site": "https://openreview.net/forum?id=SCSonHu4p0W", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;5;3;3;4", "correctness": "3;2;3;4;3", "technical_novelty": "2;3;3;2;2", "empirical_novelty": "3;1;3;3;2", "wc_summary_paper": "56;55;138;85;134", "wc_summary_review": "51;19;246;40;134", "wc_main_review": "179;245;184;184;888", "wc_review": "286;319;568;309;1156", "wc_reply_reviewers": "0;30;34;24;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;1;1;1;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.4, 
0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 93.6, 36.28002205070995 ], "wc_summary_review_avg": [ 98.0, 83.68273418095276 ], "wc_main_review_avg": [ 336.0, 277.0711100060777 ], "wc_review_avg": [ 527.6, 330.5114824026542 ], "wc_reply_reviewers_avg": [ 17.6, 14.71869559437928 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7637626158259733, "corr_recommendation_correctness": 0.6454972243679028, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12117497305849933871&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Nanyang Technological University;Alibaba Group;Salesforce", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.alibaba.com;https://www.salesforce.com", "aff_unique_abbr": "NTU;Alibaba;Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;1", "aff_country_unique": "Singapore;China;United States" }, { "id": "SCn0mgEIwh", "title": "Learnability and Expressiveness in Self-Supervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we argue that representations induced by self-supervised learning (SSL) methods should be both expressive and learnable. To measure expressiveness, we propose to use the Intrinsic Dimension (ID) of the dataset in representation space. Inspired by the human study of Laina et al. (2020), we introduce Cluster Learnability (CL), defined in terms of the learning speed of a KNN classifier trained to predict K-means cluster labels for held-out representations. By collecting 30 state-of-the-art checkpoints, both supervised and self-supervised, using different architectures, we show that ID and CL can be combined to predict downstream classification performance better than the existing techniques based on contrastive losses or pretext tasks, while having no requirements on data augmentation, model architecture, or human labels. To further demonstrate the utility of our framework, we propose modifying DeepCluster (Caron et al., 2018) to improve the learnability of the representations. Using our modification, we are able to outperform DeepCluster on both STL10 and ImageNet benchmarks.
The performance of the intermediate checkpoints can also be well predicted under our framework, suggesting the possibility of developing new SSL algorithms without labels.", "keywords": "Self-supervised Learning;Learnability;Intrinsic Dimension;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/ec0d3b1e881f5b7e2c275fa63074eec4ce5a0a4a.zip", "author": "Yuchen Lu;Zhen Liu;Alessandro Sordoni;Aristide Baratin;Romain Laroche;Aaron Courville", "authorids": "~Yuchen_Lu1;~Zhen_Liu6;~Alessandro_Sordoni2;~Aristide_Baratin1;~Romain_Laroche1;~Aaron_Courville3", "gender": "M;M;;;M;", "homepage": "http://jackhaha363.github.io/;;;;https://www.researchgate.net/profile/Romain_Laroche;", "dblp": "223/4762;77/35-19;;;65/9019;56/1688", "google_scholar": "https://scholar.google.ca/citations?hl=en;I1IiJCAAAAAJ;;;RiIOKJMAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;romain-laroche-6282397/?originalSubdomain=ca;", "or_profile": "~Yuchen_Lu1;~Zhen_Liu6;~Alessandro_Sordoni2;~Aristide_Baratin1;~Romain_Laroche1;~Aaron_Courville3", "aff": "University of Montreal;University of Montreal;;;Microsoft;Universit\u00e9 de Montr\u00e9al", "aff_domain": "umontreal.ca;umontreal.ca;;;microsoft.com; ", "position": "PhD student;PhD student;;;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nlu2022learnability,\ntitle={Learnability and Expressiveness in Self-Supervised Learning},\nauthor={Yuchen Lu and Zhen Liu and Alessandro Sordoni and Aristide Baratin and Romain Laroche and Aaron Courville},\nyear={2022},\nurl={https://openreview.net/forum?id=SCn0mgEIwh}\n}", "github": "", "project": "", "reviewers": "Y1ph;xT8d;Fbk6;Brvt", "site": "https://openreview.net/forum?id=SCn0mgEIwh", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;3;4;3", "correctness": "3;3;3;2", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;0;2", "wc_summary_paper": "67;117;109;93", "wc_summary_review": "42;46;99;25", "wc_main_review": "318;337;180;277", "wc_review": "427;500;388;395", "wc_reply_reviewers": "0;115;0;425", "wc_reply_authors": "359;743;523;974", "reply_reviewers": "0;1;0;2", "reply_authors": "1;1;1;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 96.5, 19.09842925478428 ], "wc_summary_review_avg": [ 53.0, 27.703790354390136 ], "wc_main_review_avg": [ 278.0, 60.59290387495882 ], "wc_review_avg": [ 427.5, 44.364963653766246 ], "wc_reply_reviewers_avg": [ 135.0, 173.889332622792 ], "wc_reply_authors_avg": [ 649.75, 231.53549943799115 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yBJ4eKenVvMJ:scholar.google.com/&scioq=Learnability+and+Expressiveness+in+Self-Supervised+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Montreal;Microsoft;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://wwwumontreal.ca;https://www.microsoft.com;https://www.umontreal.ca", "aff_unique_abbr": "UM;Microsoft;UdeM", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Canada;United States" }, { "id": "SDkZ6jDCNpB", "title": "Latent Feature Disentanglement For Visual Domain Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite remarkable success in a variety of computer vision applications, it is well-known that deep learning can fail catastrophically when presented with out-of-distribution data, where there are usually style differences between the training and test images. Toward addressing this challenge, we consider the domain generalization problem, wherein predictors are trained using data drawn from a family of related training (source) domains and then evaluated on a distinct and unseen test domain. Naively training a model on the aggregate set of data (pooled from all source domains) has been shown to perform suboptimally, since the information learned by that model might be domain-specific and generalize imperfectly to test domains. Data augmentation has been shown to be an effective approach to overcome this problem. However, its application has been limited to enforcing invariance to simple transformations like rotation, brightness change, etc. Such perturbations do not necessarily cover plausible real-world variations that preserve the semantics of the input (such as a change in the image style). In this paper, taking the advantage of multiple source domains, we propose a novel approach to express and formalize robustness to these kinds of real-world perturbations of the images. The three key ideas underlying our formulation are (1) leveraging disentangled representations of the images to define different factors of variations, (2) generating perturbed images by changing such factors composing the representations of the images. (3) enforcing the learner (classifier) to be invariant to such change in the images. We use image to image translation models to demonstrate the efficacy of this framework. Based on this, we propose a domain-invariant regularization (DIR) loss function, that enforces invariant prediction of targets (class labels) across domains which yields improved generalization performance. We demonstrate the effectiveness of our approach on several widely used datasets for the domain generalization problem, on all of which we achieve competitive results with state-of-the-art models.", "keywords": "Domain Generalization;latent disentanglement;Image classification;Image to Image translation", "primary_area": "", "supplementary_material": "", "author": "Behnam Gholami;Mostafa El-Khamy;Kee-Bong Song", "authorids": "~Behnam_Gholami2;~Mostafa_El-Khamy1;keebong.s@samsung.com", "gender": "M;M;", "homepage": ";https://sites.google.com/site/mostafaelkhamy/;", "dblp": ";00/4303;", "google_scholar": "IwE0wGQAAAAJ;qxPC268AAAAJ;", "orcid": ";0000-0001-9421-6037;", "linkedin": ";mostafa-el-khamy-9606a43;", "or_profile": "~Behnam_Gholami2;~Mostafa_El-Khamy1;keebong.s@samsung.com", "aff": "Samsung;Samsung Semiconductor, INC. 
;", "aff_domain": "samsung.com;samsung.com;", "position": "senior research scientist;Sr Principal Eng;", "bibtex": "@misc{\ngholami2022latent,\ntitle={Latent Feature Disentanglement For Visual Domain Generalization},\nauthor={Behnam Gholami and Mostafa El-Khamy and Kee-Bong Song},\nyear={2022},\nurl={https://openreview.net/forum?id=SDkZ6jDCNpB}\n}", "github": "", "project": "", "reviewers": "j4qE;1qCL;CcHU", "site": "https://openreview.net/forum?id=SDkZ6jDCNpB", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;3", "correctness": "3;4;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "136;75;64", "wc_summary_review": "73;28;68", "wc_main_review": "610;239;240", "wc_review": "819;342;372", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 91.66666666666667, 31.668421004036325 ], "wc_summary_review_avg": [ 56.333333333333336, 20.138409955990955 ], "wc_main_review_avg": [ 363.0, 174.6558520825073 ], "wc_review_avg": [ 511.0, 218.1329869597902 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10892504109745950115&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung", "aff_unique_url": "https://www.samsung.com", "aff_unique_abbr": "Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "SF9o3-yP1WR", "title": "Robust and Personalized Federated Learning with Spurious Features: an Adversarial Approach", "track": "main", "status": "Reject", "tldr": "", "abstract": "A common approach for personalized federated learning is fine-tuning the global machine learning model to each local client. While this addresses some issues of statistical heterogeneity, we find that such personalization methods are often vulnerable to spurious features, leading to bias and diminished generalization performance. However, debiasing the personalized models under spurious features is difficult. To this end, we propose a strategy to mitigate the effect of spurious features based on our observation that the global model in the federated learning step has a low accuracy disparity due to statistical heterogeneity. Then, we estimate and mitigate the accuracy disparity of personalized models using the global model and adversarial transferability in the personalization step. 
Empirical results on MNIST, CelebA, and Coil20 datasets show that our method reduces the accuracy disparity of the personalized model on the bias-conflicting data samples from 15.12% to 2.15%, compared to existing personalization approaches, while preserving the benefit of enhanced average accuracy from fine-tuning.", "keywords": "federated learning;personalization;spurious features", "primary_area": "", "supplementary_material": "", "author": "Xiaoyang Wang;Han Zhao;Klara Nahrstedt;Oluwasanmi O Koyejo", "authorids": "~Xiaoyang_Wang6;~Han_Zhao1;~Klara_Nahrstedt1;~Oluwasanmi_O_Koyejo1", "gender": "M;M;F;M", "homepage": "https://xiaoyang-wang.github.io/;https://hanzhaoml.github.io/;https://cs.illinois.edu/about/people/faculty/klara;https://cs.stanford.edu/~sanmi/", "dblp": ";03/3520-2;;14/8885", "google_scholar": ";x942ipYAAAAJ;TW0t25AAAAAJ;EaaOeJwAAAAJ", "orcid": ";0000-0002-8579-1600;0000-0001-6813-3043;0000-0002-4023-419X", "linkedin": ";;;sanmi-koyejo-984754/", "or_profile": "~Xiaoyang_Wang6;~Han_Zhao1;~Klara_Nahrstedt1;~Oluwasanmi_O_Koyejo1", "aff": "Microsoft;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "microsoft.com;illinois.edu;cs.illinois.edu;illinois.edu", "position": "Intern;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@misc{\nwang2022robust,\ntitle={Robust and Personalized Federated Learning with Spurious Features: an Adversarial Approach},\nauthor={Xiaoyang Wang and Han Zhao and Klara Nahrstedt and Oluwasanmi O Koyejo},\nyear={2022},\nurl={https://openreview.net/forum?id=SF9o3-yP1WR}\n}", "github": "", "project": "", "reviewers": "CHgD;jRKj;eYjN", "site": "https://openreview.net/forum?id=SF9o3-yP1WR", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;3", "correctness": "3;2;4", "technical_novelty": "2;2;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "54;92;54", "wc_summary_review": "19;43;51", "wc_main_review": "261;616;118", "wc_review": "334;751;223", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "670;951;464", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 66.66666666666667, 17.9133717900592 ], "wc_summary_review_avg": [ 37.666666666666664, 13.59738536958076 ], "wc_main_review_avg": [ 331.6666666666667, 209.3582787684521 ], "wc_review_avg": [ 436.0, 227.3015618072168 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 695.0, 199.60126920104156 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xWl2kUulV50J:scholar.google.com/&scioq=Robust+and+Personalized+Federated+Learning+with+Spurious+Features:+an+Adversarial+Approach&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Microsoft;University of Illinois Urbana-Champaign", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://illinois.edu", "aff_unique_abbr": "Microsoft;UIUC", 
"aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "SFgkP_PZvL", "title": "PNODE: A memory-efficient neural ODE framework based on high-level adjoint differentiation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a memory-efficient neural ODE framework PNODE based on high-level adjoint algorithmic differentiation. It is implemented using PyTorch and PETSc, one of the most commonly used portable, scalable scientific computing libraries. By leveraging discrete adjoint time integrators and advanced checkpointing strategies tailored for these integrators, PNODE can provide a balance between memory and computational costs while computing the gradients consistently and accurately.\nWe demonstrate the performance through numerical experiments on image classification, continuous normalizing flow, and time series regression. We show that PNODE achieves the highest memory efficiency when compared with other reverse-accurate methods. We also show PNODE enables the application of implicit time integration methods that are desired for stiff dynamical systems.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/790f7b87e14d3bdba10a1c1cba33289e5ce79667.zip", "author": "Hong Zhang;Wenjun Zhao", "authorids": "~Hong_Zhang7;~Wenjun_Zhao1", "gender": "M;F", "homepage": "https://www.mcs.anl.gov/~hongzh/;https://cims.nyu.edu/~wenjun", "dblp": ";", "google_scholar": "lo_niigAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Hong_Zhang7;~Wenjun_Zhao1", "aff": "Argonne National Laboratory;", "aff_domain": "anl.gov;", "position": "Assistant Computational Mathematician;", "bibtex": "@misc{\nzhang2022pnode,\ntitle={{PNODE}: A memory-efficient neural {ODE} framework based on high-level adjoint differentiation},\nauthor={Hong Zhang and Wenjun Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=SFgkP_PZvL}\n}", "github": "", "project": "", "reviewers": "eRL8;iRe1;agEy", "site": "https://openreview.net/forum?id=SFgkP_PZvL", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "44;63;31", "wc_summary_review": "42;51;43", "wc_main_review": "561;655;349", "wc_review": "647;769;423", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 46.0, 13.140268896284683 ], "wc_summary_review_avg": [ 45.333333333333336, 4.027681991198191 ], "wc_main_review_avg": [ 521.6666666666666, 127.98263771135348 ], "wc_review_avg": [ 613.0, 143.28526325713565 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18016379475039650726&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Argonne National Laboratory", "aff_unique_dep": "", "aff_unique_url": 
"https://www.anl.gov", "aff_unique_abbr": "ANL", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "SGOma2sAF7Q", "title": "LCS: Learning Compressible Subspaces for Adaptive Network Compression at Inference Time", "track": "main", "status": "Reject", "tldr": "", "abstract": "When deploying deep learning models to a device, it is traditionally assumed that available computational resources (compute, memory, and power) remain static. However, real-world computing systems do not always provide stable resource guarantees. Computational resources need to be conserved when load from other processes is high or battery power is low. Inspired by recent works on neural network subspaces, we propose a method for training a \"compressible subspace\" of neural networks that contains a fine-grained spectrum of models that range from highly efficient to highly accurate. Our models require no retraining, thus our subspace of models can be deployed entirely on-device to allow adaptive network compression at inference time. We present results for achieving arbitrarily fine-grained accuracy-efficiency trade-offs at inference time for structured and unstructured sparsity. We achieve accuracies on-par with standard models when testing our uncompressed models, and maintain high accuracy for sparsity rates above 90% when testing our compressed models. We also demonstrate that our algorithm extends to quantization at variable bit widths, achieving accuracy on par with individually trained networks.", "keywords": "network subspace;compression;post-training;pruning;quantization;efficient", "primary_area": "", "supplementary_material": "", "author": "Maxwell Horton;Elvis Nunez;Anish Prabhu;Anurag Ranjan;Ali Farhadi;Mohammad Rastegari", "authorids": "~Maxwell_Horton1;~Elvis_Nunez1;~Anish_Prabhu1;~Anurag_Ranjan1;~Ali_Farhadi4;~Mohammad_Rastegari2", "gender": "M;M;M;M;;M", "homepage": "https://homes.cs.washington.edu/~mchorton/;;;http://anuragranjan.com;;https://mrastegari.github.io/", "dblp": ";271/7161;;;;31/5228", "google_scholar": "zP3Rp-MAAAAJ;VkjQ270AAAAJ;1zqcBjcAAAAJ;;;N4-2Z_cAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Maxwell_Horton1;~Elvis_Nunez1;~Anish_Prabhu1;~Anurag_Ranjan1;~Ali_Farhadi4;~Mohammad_Rastegari2", "aff": "University of Washington;Apple;Apple;Apple;University of Washington;Department of Computer Science, University of Washington", "aff_domain": "washington.edu;apple.com;apple.com;apple.com;u.washington.edu;cs.washington.edu", "position": "PhD student;Intern;Researcher;Researcher;;Assistant Professor", "bibtex": "@misc{\nhorton2022lcs,\ntitle={{LCS}: Learning Compressible Subspaces for Adaptive Network Compression at Inference Time},\nauthor={Maxwell Horton and Elvis Nunez and Anish Prabhu and Anurag Ranjan and Ali Farhadi and Mohammad Rastegari},\nyear={2022},\nurl={https://openreview.net/forum?id=SGOma2sAF7Q}\n}", "github": "", "project": "", "reviewers": "jBcH;FDfN;Dpge;EG95", "site": "https://openreview.net/forum?id=SGOma2sAF7Q", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;1;3;3", "wc_summary_paper": "28;65;75;53", "wc_summary_review": "34;82;131;26", "wc_main_review": "185;180;459;100", "wc_review": "247;327;665;179", "wc_reply_reviewers": "0;0;143;0", "wc_reply_authors": "904;867;1609;162", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 4.0, 0.0 ], 
"correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 55.25, 17.55526986406076 ], "wc_summary_review_avg": [ 68.25, 42.08547849318099 ], "wc_main_review_avg": [ 231.0, 135.88782138219744 ], "wc_review_avg": [ 354.5, 186.76388837245813 ], "wc_reply_reviewers_avg": [ 35.75, 61.92081637058736 ], "wc_reply_authors_avg": [ 885.5, 511.75897647232335 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5945661049476048459&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;0;0", "aff_unique_norm": "University of Washington;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "https://www.washington.edu;https://www.apple.com", "aff_unique_abbr": "UW;Apple", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Equivariant Graph Mechanics Networks with Constraints", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6795", "id": "SHbhHHfePhP", "poster": "", "openreview": "https://openreview.net/forum?id=SHbhHHfePhP", "slides": "https://iclr.cc/virtual/2022/poster/6795", "video": "https://iclr.cc/virtual/2022/poster/6795", "author_site": "Wenbing Huang, Jiaqi Han, Yu Rong, Tingyang Xu, Fuchun Sun, Junzhou Huang", "tldr": "", "abstract": "Learning to reason about relations and dynamics over multiple interacting objects is a challenging topic in machine learning. The challenges mainly stem from that the interacting systems are exponentially-compositional, symmetrical, and commonly geometrically-constrained.\nCurrent methods, particularly the ones based on equivariant Graph Neural Networks (GNNs), have targeted on the first two challenges but remain immature for constrained systems. \nIn this paper, we propose Graph Mechanics Network (GMN) which is combinatorially efficient, equivariant and constraint-aware. The core of GMN is that it represents, by generalized coordinates, the forward kinematics information (positions and velocities) of a structural object. In this manner, the geometrical constraints are implicitly and naturally encoded in the forward kinematics. Moreover, to allow equivariant message passing in GMN, we have developed a general form of orthogonality-equivariant functions, given that the dynamics of constrained systems are more complicated than the unconstrained counterparts. Theoretically, the proposed equivariant formulation is proved to be universally expressive under certain conditions. 
Extensive experiments support the advantages of GMN compared to the state-of-the-art GNNs in terms of prediction accuracy, constraint satisfaction and data efficiency on the simulated systems consisting of particles, sticks and hinges, as well as two real-world datasets for molecular dynamics prediction and human motion capture.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f5b19aca48ed36558730af2f074518bc232d9a9e.zip", "author": "Wenbing Huang;Jiaqi Han;Yu Rong;Tingyang Xu;Fuchun Sun;Junzhou Huang", "authorids": "~Wenbing_Huang1;~Jiaqi_Han2;~Yu_Rong1;~Tingyang_Xu1;~Fuchun_Sun1;~Junzhou_Huang2", "gender": "M;M;M;M;M;M", "homepage": "https://gsai.ruc.edu.cn/english/wenbing_huang;https://hanjq17.github.io;https://royrong.me/;;https://www.cs.tsinghua.edu.cn/info/1121/3555.htm;http://ranger.uta.edu/~huang/", "dblp": "155/3181-1.html;235/0412;24/10036-1;157/0940;;22/1170.html", "google_scholar": "0yNkmO4AAAAJ;AKppgMAAAAAJ;https://scholar.google.com.hk/citations?user=itezhEMAAAAJ;6gIs5YMAAAAJ;;https://scholar.google.com.tw/citations?user=X7KrguAAAAAJ", "orcid": ";;0000-0001-7387-302X;0009-0002-0106-8376;;0000-0002-9548-1227", "linkedin": ";;;;;", "or_profile": "~Wenbing_Huang1;~Jiaqi_Han2;~Yu_Rong1;~Tingyang_Xu1;~Fuchun_Sun1;~Junzhou_Huang2", "aff": "Tsinghua University;;Tencent AI Lab;Tencent AI Lab;Tsinghua University;University of Texas, Arlington", "aff_domain": "tsinghua.edu.cn;;tencent.com;tencent.com;cs.tsinghua.edu.cn;uta.edu", "position": "Researcher;;Senior Researcher;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2022equivariant,\ntitle={Equivariant Graph Mechanics Networks with Constraints},\nauthor={Wenbing Huang and Jiaqi Han and Yu Rong and Tingyang Xu and Fuchun Sun and Junzhou Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SHbhHHfePhP}\n}", "github": "", "project": "", "reviewers": "F8Jp;vbzj;Dcng;4PY9", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "2;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "1;3;3;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "69;155;100;42", "wc_summary_review": "90;108;53;57", "wc_main_review": "660;1076;366;531", "wc_review": "819;1339;519;630", "wc_reply_reviewers": "287;644;334;247", "wc_reply_authors": "1271;2865;1916;2154", "reply_reviewers": "1;1;2;2", "reply_authors": "4;5;6;7", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.5, 42.01487831709144 ], "wc_summary_review_avg": [ 77.0, 22.94558781116753 ], "wc_main_review_avg": [ 658.25, 262.73596537208226 ], "wc_review_avg": [ 826.75, 314.5952757115084 ], "wc_reply_reviewers_avg": [ 378.0, 156.63173369403788 ], "wc_reply_authors_avg": [ 2051.5, 570.0502170861791 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 5.5, 1.118033988749895 ], "replies_avg": [ 36, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.0, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3158185965758098235&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=SHbhHHfePhP", "email": "tsinghua.edu.cn;;tencent.com;tencent.com;cs.tsinghua.edu.cn;uta.edu", "author_num": 6, "aff_unique_index": 
"0;1;1;0;2", "aff_unique_norm": "Tsinghua University;Tencent;University of Texas at Arlington", "aff_unique_dep": ";Tencent AI Lab;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://ai.tencent.com;https://www.uta.edu", "aff_unique_abbr": "THU;Tencent AI Lab;UTA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Arlington", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "SHnXjI3vTJ", "title": "Self-Supervised Prime-Dual Networks for Few-Shot Image Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "We construct a prime-dual network structure for few-shot learning which establishes a commutative relationship between the support set and the query set, as well as a new self- supervision constraint for highly effective few-shot learning. Specifically, the prime network performs the forward label prediction of the query set from the support set, while the dual network performs the reverse label prediction of the support set from the query set. This forward and reserve prediction process with commutated support and query sets forms a label prediction loop and establishes a self-supervision constraint between the ground-truth labels and their predicted values. This unique constraint can be used to significantly improve the training performance of few-shot learning through coupled prime and dual network training. It can be also used as an objective function for optimization during the testing stage to refine the query label prediction results. Our extensive experimental results demonstrate that the proposed self-supervised commutative learning and optimization outperforms existing state-of the-art few-shot learning methods by large margins on various benchmark datasets.", "keywords": "few-shot learning;prime-dual network;self-supervision", "primary_area": "", "supplementary_material": "/attachment/874ba3ab6ef572361b7596e0a959a0de4f22788b.zip", "author": "Wenming Cao;Qifan Liu;Guang Liu;Zhihai He", "authorids": "~Wenming_Cao2;~Qifan_Liu1;~Guang_Liu1;~Zhihai_He3", "gender": ";;M;M", "homepage": "http://ceie.szu.edu.cn/info/1017/1069.htm;;;https://faculty.sustech.edu.cn/hezh/", "dblp": ";243/3163;;23/4027", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;vbGNpy8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;wtr6OgkAAAAJ", "orcid": "0000-0002-8174-6167;;;", "linkedin": ";;;", "or_profile": "~Wenming_Cao2;~Qifan_Liu1;~Guang_Liu1;~Zhihai_He3", "aff": "Shenzhen University;College of Electronics and Information Engineering, Shenzhen University;College of of Electronic and information Engineering, Shenzhen University;Southern University of Science and Technology", "aff_domain": "szu.edu.cn;szu.edu;szu.edu;sustech.edu.cn", "position": "Full Professor;PhD student;MS student;Chair Professor", "bibtex": "@misc{\ncao2022selfsupervised,\ntitle={Self-Supervised Prime-Dual Networks for Few-Shot Image Classification},\nauthor={Wenming Cao and Qifan Liu and Guang Liu and Zhihai He},\nyear={2022},\nurl={https://openreview.net/forum?id=SHnXjI3vTJ}\n}", "github": "", "project": "", "reviewers": "wTnW;qJDP;zUdv;Gmcb;oj74", "site": "https://openreview.net/forum?id=SHnXjI3vTJ", "pdf_size": 0, "recommendation": "5;6;6;6;6", "confidence": "3;3;3;3;4", "correctness": "2;4;3;3;2", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "76;64;78;44;51", "wc_summary_review": "28;26;50;45;13", "wc_main_review": "117;402;321;224;173", "wc_review": "221;492;449;313;237", 
"wc_reply_reviewers": "131;474;0;122;21", "wc_reply_authors": "635;1409;336;321;281", "reply_reviewers": "2;3;0;1;1", "reply_authors": "2;4;1;1;2", "recommendation_avg": [ 5.8, 0.39999999999999997 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 62.6, 13.410443691392167 ], "wc_summary_review_avg": [ 32.4, 13.45511055324333 ], "wc_main_review_avg": [ 247.4, 102.37695053086901 ], "wc_review_avg": [ 342.4, 109.95926518488562 ], "wc_reply_reviewers_avg": [ 149.6, 170.44717656799128 ], "wc_reply_authors_avg": [ 596.4, 425.4266564285788 ], "reply_reviewers_avg": [ 1.4, 1.019803902718557 ], "reply_authors_avg": [ 2.0, 1.0954451150103321 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.2500000000000001, "corr_recommendation_correctness": 0.5345224838248488, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RqyM2phgNXsJ:scholar.google.com/&scioq=Self-Supervised+Prime-Dual+Networks+for+Few-Shot+Image+Classification&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Shenzhen University;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.szu.edu.cn;https://www.sustech.edu.cn", "aff_unique_abbr": "SZU;SUSTech", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Auto-Transfer: Learning to Route Transferable Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6166", "id": "SIKV0_MrZlr", "poster": "", "openreview": "https://openreview.net/forum?id=SIKV0_MrZlr", "slides": "https://iclr.cc/virtual/2022/poster/6166", "video": "https://iclr.cc/virtual/2022/poster/6166", "author_site": "Keerthiram Murugesan, Vijay Sadashivaiah, Ronny Luss, Karthikeyan Shanmugam, Pin-Yu Chen, Amit Dhurandhar", "tldr": "", "abstract": "Knowledge transfer between heterogeneous source and target networks and tasks has received a lot of attention in recent times as large amounts of quality labeled data can be difficult to obtain in many applications. Existing approaches typically constrain the target deep neural network (DNN) feature representations to be close to the source DNNs feature representations, which can be limiting. We, in this paper, propose a novel adversarial multi-armed bandit approach that automatically learns to route source representations to appropriate target representations following which they are combined in meaningful ways to produce accurate target models. We see upwards of 5\\% accuracy improvements compared with the state-of-the-art knowledge transfer methods on four benchmark (target) image datasets CUB200, Stanford Dogs, MIT67, and Stanford40 where the source dataset is ImageNet. We qualitatively analyze the goodness of our transfer scheme by showing individual examples of the important features focused on by our target network at different layers compared with the (closest) competitors. 
We also observe that our improvement over other methods is higher for smaller target datasets making it an effective tool for small data applications that may benefit from transfer learning.", "keywords": "Feature routing;Transferable Representations", "primary_area": "", "supplementary_material": "/attachment/a94a644fa94ecbb54c00fec4bb1bbc1fcd8c1a49.zip", "author": "Keerthiram Murugesan;Vijay Sadashivaiah;Ronny Luss;Karthikeyan Shanmugam;Pin-Yu Chen;Amit Dhurandhar", "authorids": "~Keerthiram_Murugesan1;~Vijay_Sadashivaiah1;~Ronny_Luss1;~Karthikeyan_Shanmugam1;~Pin-Yu_Chen1;~Amit_Dhurandhar1", "gender": "M;M;;M;M;M", "homepage": "https://keerthi166.github.io;https://vjysd.github.io;;https://sites.google.com/corp/view/karthikeyan-shanmugam/;http://www.pinyuchen.com;https://researcher.watson.ibm.com/researcher/view.php?person=us-adhuran", "dblp": "178/2877;188/2601;80/75;;39/8969;66/3289", "google_scholar": "-698GEMAAAAJ;lr78bS0AAAAJ;lBPWZdAAAAAJ;https://scholar.google.ca/citations?user=m4DyPcUAAAAJ;jxwlCUUAAAAJ;km9vIPEAAAAJ", "orcid": "0000-0001-6847-522X;0000-0003-3375-3810;;0009-0008-2879-5868;0000-0003-1039-8369;", "linkedin": "https://linkedin.com/in/keerthiram;vijaysadashivaiah;;;pin-yu-chen-940062a2;", "or_profile": "~Keerthiram_Murugesan1;~Vijay_Sadashivaiah1;~Ronny_Luss1;~Karthikeyan_Shanmugam1;~Pin-Yu_Chen1;~Amit_Dhurandhar1", "aff": "International Business Machines;Rensselaer Polytechnic Institute;IBM;International Business Machines;International Business Machines;International Business Machines", "aff_domain": "ibm.com;rpi.edu;us.ibm.com;ibm.com;ibm.com;ibm.com", "position": "Researcher;MS student;Research Scientist;Research Staff Member;Research Staff Member;Principal Researcher", "bibtex": "@inproceedings{\nmurugesan2022autotransfer,\ntitle={Auto-Transfer: Learning to Route Transferable Representations},\nauthor={Keerthiram Murugesan and Vijay Sadashivaiah and Ronny Luss and Karthikeyan Shanmugam and Pin-Yu Chen and Amit Dhurandhar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SIKV0_MrZlr}\n}", "github": "", "project": "", "reviewers": "8d9w;u325;7soo;3TiT", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "5;5;3;4", "correctness": "2;3;2;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "66;61;133;97", "wc_summary_review": "167;3;41;61", "wc_main_review": "358;158;247;927", "wc_review": "591;222;421;1085", "wc_reply_reviewers": "64;24;0;32", "wc_reply_authors": "831;307;946;974", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.25, 28.77824699317177 ], "wc_summary_review_avg": [ 68.0, 60.83584469702052 ], "wc_main_review_avg": [ 422.5, 299.7669928461104 ], "wc_review_avg": [ 579.75, 319.6055185693764 ], "wc_reply_reviewers_avg": [ 30.0, 22.891046284519195 ], "wc_reply_authors_avg": [ 764.5, 269.51855223713267 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18425529718248157459&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": 
"https://openreview.net/pdf?id=SIKV0_MrZlr", "email": "ibm.com;rpi.edu;us.ibm.com;ibm.com;ibm.com;ibm.com", "author_num": 6, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "International Business Machines Corporation;Rensselaer Polytechnic Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.rpi.edu", "aff_unique_abbr": "IBM;RPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "SK1nec-Ehd", "title": "PulseImpute: A Novel Benchmark Task and Architecture for Imputation of Physiological Signals", "track": "main", "status": "Withdraw", "tldr": "", "abstract": " Providing care for patients with chronic diseases is one of the biggest drivers of the nation\u2019s rising healthcare costs, but many of these diseases are linked to mutable health behaviors. Mobile health (mHealth) biophysical sensors that continuously measure our current conditions provide the framework for a personalized guidance system for the maintenance of healthy behaviors. However, this physiological sensor data is plagued with missingness due to insecure attachments, wireless dropout, battery, and adherence issues. These issues cripple their rich diagnostic utility as well as their ability to enable temporally-precise interventions. While there is a sizable amount of research focusing on imputation methods, surprisingly, no works have addressed the patterns of missingness, quasi-periodic signal structure, and the between subject heterogeneity that characterizes physiological signals in mHealth applications. We present the PulseImpute Challenge, the first challenge dataset for physiological signal imputation which includes a large set of baselines' performances on realistic missingness models and data. Next, we demonstrate the potential to address this quasi-periodic structure and heterogeneity with our Dilated Convolution Bottleneck (DCB) Transformer, a transformer architecture with a self-attention mechanism that is able to attend to corresponding waveform features in quasi-periodic signals. By utilizing stacked dilated convolutions with bottleneck layers for query and key transformations, we visually demonstrate that the kernel similarity in the attention model gives high similarity to similar temporal features across quasi-periodic periods. We hope the release of our challenge task definitions and baseline implementations will spur the community to address this challenging and important problem. 
\n ", "keywords": "missingness;imputation;mHealth;sensors;transformer;self-attention", "primary_area": "", "supplementary_material": "", "author": "Maxwell Xu;Alexander Moreno;James Matthew Rehg", "authorids": "~Maxwell_Xu1;~Alexander_Moreno1;~James_Matthew_Rehg1", "gender": ";M;M", "homepage": "https://maxxu05.github.io/;;http://rehg.org/", "dblp": ";161/6588;r/JMRehg", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;zoqP2-IAAAAJ;https://scholar.google.com.tw/citations?user=8kA3eDwAAAAJ", "orcid": ";;0000-0003-1793-5462", "linkedin": ";;", "or_profile": "~Maxwell_Xu1;~Alexander_Moreno1;~James_Rehg1", "aff": "University of Illinois, Urbana Champaign;Luminous Computing;Georgia Institute of Technology", "aff_domain": "illinois.edu;lmns.com;gatech.edu", "position": "PhD student;Researcher;Full Professor", "bibtex": "@misc{\nxu2022pulseimpute,\ntitle={PulseImpute: A Novel Benchmark Task and Architecture for Imputation of Physiological Signals},\nauthor={Maxwell Xu and Alexander Moreno and James Matthew Rehg},\nyear={2022},\nurl={https://openreview.net/forum?id=SK1nec-Ehd}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SK1nec-Ehd", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GUTHG0Mzm6EJ:scholar.google.com/&scioq=PulseImpute:+A+Novel+Benchmark+Task+and+Architecture+for+Imputation+of+Physiological+Signals&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;Luminous Computing;Georgia Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;;https://www.gatech.edu", "aff_unique_abbr": "UIUC;;Georgia Tech", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States;" }, { "title": "Evaluating Disentanglement of Structured Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6441", "id": "SLz5sZjacp", "poster": "", "openreview": "https://openreview.net/forum?id=SLz5sZjacp", "slides": "https://iclr.cc/virtual/2022/poster/6441", "video": "https://iclr.cc/virtual/2022/poster/6441", "tldr": "", "abstract": "We introduce the first metric for evaluating disentanglement at individual hierarchy levels of a structured latent representation. Applied to object-centric generative models, this offers a systematic, unified approach to evaluating (i) object separation between latent slots (ii) disentanglement of object properties inside individual slots (iii) disentanglement of intrinsic and extrinsic object properties. 
We theoretically show that our framework gives stronger guarantees of selecting a good model than previous disentanglement metrics. Experimentally, we demonstrate that viewing object compositionality as a disentanglement problem addresses several issues with prior visual metrics of object separation. As a core technical component, we present the first representation probing algorithm handling slot permutation invariance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rapha\u00ebl Dang-Nhu", "authorids": "~Rapha\u00ebl_Dang-Nhu2", "gender": "M", "homepage": "", "dblp": "222/3844.html", "google_scholar": "https://scholar.google.fr/citations?user=aVXUn8UAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Rapha\u00ebl_Dang-Nhu2", "aff": "Apple", "aff_domain": "apple.com", "position": "Researcher", "bibtex": "@inproceedings{\ndang-nhu2022evaluating,\ntitle={Evaluating Disentanglement of Structured Representations},\nauthor={Rapha{\\\"e}l Dang-Nhu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SLz5sZjacp}\n}", "github": "", "project": "", "reviewers": "ZLzf;uWyU;ERyE", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "4;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "107;116;82", "wc_summary_review": "46;38;32", "wc_main_review": "728;430;339", "wc_review": "881;584;453", "wc_reply_reviewers": "818;264;0", "wc_reply_authors": "1644;350;380", "reply_reviewers": "4;2;0", "reply_authors": "5;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 101.66666666666667, 14.383632673594278 ], "wc_summary_review_avg": [ 38.666666666666664, 5.734883511361751 ], "wc_main_review_avg": [ 499.0, 166.13448367713028 ], "wc_review_avg": [ 639.3333333333334, 179.05740854696728 ], "wc_reply_reviewers_avg": [ 360.6666666666667, 340.87078033895614 ], "wc_reply_authors_avg": [ 791.3333333333334, 603.0507625583623 ], "reply_reviewers_avg": [ 2.0, 1.632993161855452 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11374263922839869464&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=SLz5sZjacp", "email": "apple.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Apple", "aff_unique_dep": "Apple Inc.", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "SN2bkl9f69", "title": "Multi-Tailed, Multi-Headed, Spatial Dynamic Memory refined Text-to-Image Synthesis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Synthesizing high-quality, realistic images from text descriptions is a challenging task, and current methods synthesize images from text in a multi-stage manner, typically by first generating a rough initial image and then refining image details at subsequent stages. However, existing methods that follow this paradigm suffer from three important limitations. 
Firstly, they synthesize initial images without attempting to separate image attributes at the word level. As a result, object attributes of initial images (that provide a basis for subsequent refinement) are inherently entangled and ambiguous in nature. Secondly, by using common text representations for all regions, current methods prevent us from interpreting text in fundamentally different ways at different parts of an image. Different image regions are therefore only allowed to assimilate the same type of information from text at each refinement stage. Finally, current methods generate refinement features only once at each refinement stage and attempt to address all image aspects in a single shot. This single-shot refinement limits the precision with which each refinement stage can learn to improve the prior image. Our proposed method introduces three novel components to address these shortcomings: (1) An initial generation stage that explicitly generates separate sets of image features for each word n-gram. (2) A spatial dynamic memory module for refinement of images. (3) An iterative multi-headed mechanism to make it easier to improve upon multiple image aspects. Experimental results demonstrate that our Multi-Headed Spatial Dynamic Memory image refinement with our Multi-Tailed Word-level Initial Generation (MSMT-GAN) performs favourably against the previous state of the art on the CUB and COCO datasets. ", "keywords": "Text-to-Image Generation;Computer Vision", "primary_area": "", "supplementary_material": "/attachment/e30a7d3948e8af22fcbbad9ac70cc9cd30dda253.zip", "author": "Amrit Diggavi Seshadri;Balaraman Ravindran", "authorids": "~Amrit_Diggavi_Seshadri1;~Balaraman_Ravindran1", "gender": ";M", "homepage": ";http://www.cse.iitm.ac.in/~ravi", "dblp": ";69/2281", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-5364-7639", "linkedin": ";ravindran-balaraman-427a307", "or_profile": "~Amrit_Diggavi_Seshadri1;~Balaraman_Ravindran1", "aff": ";Indian Institute of Technology Madras", "aff_domain": ";iitm.ac.in", "position": ";Full Professor", "bibtex": "@misc{\nseshadri2022multitailed,\ntitle={Multi-Tailed, Multi-Headed, Spatial Dynamic Memory refined Text-to-Image Synthesis},\nauthor={Amrit Diggavi Seshadri and Balaraman Ravindran},\nyear={2022},\nurl={https://openreview.net/forum?id=SN2bkl9f69}\n}", "github": "", "project": "", "reviewers": "6fuX;4wnG;WhWC", "site": "https://openreview.net/forum?id=SN2bkl9f69", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "61;47;76", "wc_summary_review": "15;30;62", "wc_main_review": "328;218;182", "wc_review": "404;295;320", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 61.333333333333336, 11.841546445554407 ], "wc_summary_review_avg": [ 35.666666666666664, 19.601587237318874 ], "wc_main_review_avg": [ 242.66666666666666, 62.10385566845982 ], "wc_review_avg": [ 339.6666666666667, 46.62140662351958 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ],
"replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6325909044904068206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Indian Institute of Technology Madras", "aff_unique_dep": "", "aff_unique_url": "https://www.iitm.ac.in", "aff_unique_abbr": "IIT Madras", "aff_campus_unique_index": "0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0", "aff_country_unique": "India" }, { "title": "Evaluating Model-Based Planning and Planner Amortization for Continuous Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6195", "id": "SS8F6tFX3-", "poster": "", "openreview": "https://openreview.net/forum?id=SS8F6tFX3-", "slides": "https://iclr.cc/virtual/2022/poster/6195", "video": "https://iclr.cc/virtual/2022/poster/6195", "author_site": "Arunkumar Byravan, Leonard Hasenclever, Piotr Trochim, Mehdi Mirza, Alessandro Ialongo, Yuval Tassa, Jost Tobias Springenberg, Abbas Abdolmaleki, Nicolas Heess, Josh Merel, Martin Riedmiller", "tldr": "", "abstract": "There is a widespread intuition that model-based control methods should be able to surpass the data efficiency of model-free approaches. In this paper we attempt to evaluate this intuition on various challenging locomotion tasks. We take a hybrid approach, combining model predictive control (MPC) with a learned model and model-free policy learning; the learned policy serves as a proposal for MPC. We show that MPC with learned proposals and models (trained on the fly or transferred from related tasks) can significantly improve performance and data efficiency with respect to model-free methods. However, we find that well-tuned model-free agents are strong baselines even for high DoF control problems. 
Finally, we show that it is possible to distil a model-based planner into a policy that amortizes the planning computation without any loss of performance.", "keywords": "Model-based Reinforcement Learning;Planning;Robotics;Model Predictive Control;Learning", "primary_area": "", "supplementary_material": "/attachment/fcdd96007624b2d68dbadfdc2ea68e6d0b39c857.zip", "author": "Arunkumar Byravan;Leonard Hasenclever;Piotr Trochim;Mehdi Mirza;Alessandro Davide Ialongo;Yuval Tassa;Jost Tobias Springenberg;Abbas Abdolmaleki;Nicolas Heess;Josh Merel;Martin Riedmiller", "authorids": "~Arunkumar_Byravan1;~Leonard_Hasenclever1;~Piotr_Trochim1;~Mehdi_Mirza1;~Alessandro_Davide_Ialongo1;~Yuval_Tassa2;~Jost_Tobias_Springenberg1;~Abbas_Abdolmaleki3;~Nicolas_Heess1;~Josh_Merel1;~Martin_Riedmiller1", "gender": "M;M;;;M;M;;;;M;M", "homepage": "https://homes.cs.washington.edu/~barun/;;http://deepmind.com;;;http://www.springenberg-tobias.de;;;;https://www.riedmiller.me/;", "dblp": "151/9400;150/1667;;119/1493;https://dblp.uni-trier.de/pers/hd/i/Ialongo:Alessandro_Davide;;;76/9181;139/1361;;20/4415", "google_scholar": "obYwWiMAAAAJ;https://scholar.google.co.uk/citations?user=dD-3S4QAAAAJ;;https://scholar.google.ca/citations?user=c646VbAAAAAJ;Z2tqKq4AAAAJ;;;79k7bGEAAAAJ;https://scholar.google.co.uk/citations?user=K4OcFXUAAAAJ;1gVfqpcAAAAJ;https://scholar.google.co.uk/citations?user=CjOTm_4AAAAJ", "orcid": ";;;;;;;;;;", "linkedin": ";;;;alessandro-ialongo/;;;;;;", "or_profile": "~Arunkumar_Byravan1;~Leonard_Hasenclever1;~Piotr_Trochim1;~Mehdi_Mirza1;~Alessandro_Davide_Ialongo1;~Jost_Tobias_Springenberg1;~Abbas_Abdolmaleki3;~Nicolas_Heess1;~Josh_Merel1;~Martin_Riedmiller1;~yuval_tassa1", "aff": "Google;Google DeepMind;;Google DeepMind;Max Planck Institute for Intelligent Systems, Max-Planck Institute;Google DeepMind;Google;Google DeepMind;Meta Reality Labs;;Google", "aff_domain": "google.com;google.com;;google.com;tuebingen.mpg.de;google.com;google.com;google.com;fb.com;;google.com", "position": "Research Scientist;Research Scientist;;Research Scientist;PhD student;Researcher;research scientist;Research Scientist;Research Scientist;;Research Scientist", "bibtex": "@inproceedings{\nbyravan2022evaluating,\ntitle={Evaluating Model-Based Planning and Planner Amortization for Continuous Control},\nauthor={Arunkumar Byravan and Leonard Hasenclever and Piotr Trochim and Mehdi Mirza and Alessandro Davide Ialongo and Yuval Tassa and Jost Tobias Springenberg and Abbas Abdolmaleki and Nicolas Heess and Josh Merel and Martin Riedmiller},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SS8F6tFX3-}\n}", "github": "", "project": "", "reviewers": "i15j;54KF;CeZd;Bjo9", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "33;100;102;117", "wc_summary_review": "32;105;111;86", "wc_main_review": "186;459;776;554", "wc_review": "251;664;989;757", "wc_reply_reviewers": "0;4;19;34", "wc_reply_authors": "443;1479;1123;948", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 88.0, 32.42684073418192 ], "wc_summary_review_avg": [ 83.5, 31.132780152116194 ], 
"wc_main_review_avg": [ 493.75, 211.66763451222295 ], "wc_review_avg": [ 665.25, 266.85049653317117 ], "wc_reply_reviewers_avg": [ 14.25, 13.423393758658799 ], "wc_reply_authors_avg": [ 998.25, 373.33321242557565 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3810941505296299136&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=SS8F6tFX3-", "email": "google.com;google.com;;google.com;tuebingen.mpg.de;google.com;google.com;google.com;fb.com;;google.com", "author_num": 11, "aff_unique_index": "0;0;0;1;0;0;0;2;0", "aff_unique_norm": "Google;Max Planck Institute for Intelligent Systems;Meta", "aff_unique_dep": "Google;Intelligent Systems;Meta Reality Labs", "aff_unique_url": "https://www.google.com;https://www.mpi-is.mpg.de;https://www.meta.com", "aff_unique_abbr": "Google;MPI-IS;MRL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;2;1;0;1;0;0", "aff_country_unique": "United States;United Kingdom;Germany" }, { "id": "STFJBXDTSlT", "title": "Identity-Disentangled Adversarial Augmentation for Self-supervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation is critical to contrastive self-supervised learning, whose goal is to distinguish a sample's augmentations (positives) from other samples (negatives). However, strong augmentations may change the sample-identity of the positives, while weak augmentation produces easy positives/negatives leading to nearly-zero loss and ineffective learning. In this paper, we study a simple adversarial augmentation method that can modify training data to be hard positives/negatives without distorting the key information about their original identities. In particular, we decompose a sample $x$ to be its variational auto-encoder (VAE) reconstruction $G(x)$ plus the residual $R(x)=x-G(x)$, where $R(x)$ retains most identity-distinctive information due to an information-theoretic interpretation of the VAE objective. We then adversarially perturb $G(x)$ in the VAE's bottleneck space and adds it back to the original $R(x)$ as an augmentation, which is therefore sufficiently challenging for contrastive learning and meanwhile preserves the sample identity intact. We apply this ``identity-disentangled adversarial augmentation (IDAA)'' to different self-supervised learning methods. On multiple benchmark datasets, IDAA consistently improves both their efficiency and generalization performance. We further show that IDAA learned on a dataset can be transferred to other datasets. 
", "keywords": "identity disentanglement;contrastive learning;data augmentation;self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Kaiwen Yang;Tianyi Zhou;Xinmei Tian;Dacheng Tao", "authorids": "~Kaiwen_Yang1;~Tianyi_Zhou1;~Xinmei_Tian1;~Dacheng_Tao1", "gender": "M;M;F;", "homepage": ";https://tianyizhou.github.io/;https://faculty.ustc.edu.cn/tianxinmei1/zh_CN/index.htm;", "dblp": "227/0101;88/8205-1;03/5204-1;", "google_scholar": "WQzn8u0AAAAJ;OKvgizMAAAAJ;https://scholar.google.com.au/citations?hl=zh-CN;", "orcid": ";0000-0001-5348-0632;0000-0002-5952-8753;", "linkedin": ";tianyizhou;;", "or_profile": "~Kaiwen_Yang1;~Tianyi_Zhou1;~Xinmei_Tian1;~Dacheng_Tao1", "aff": "University of Science and Technology of China;University of Washington, Seattle;University of Science and Technology of China;", "aff_domain": "ustc.edu.cn;uw.edu;ustc.edu.cn;", "position": "PhD student;PhD student;Associate Professor;", "bibtex": "@misc{\nyang2022identitydisentangled,\ntitle={Identity-Disentangled Adversarial Augmentation for Self-supervised Learning},\nauthor={Kaiwen Yang and Tianyi Zhou and Xinmei Tian and Dacheng Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=STFJBXDTSlT}\n}", "github": "", "project": "", "reviewers": "dMSm;Augv;rByQ;uy6S;kfHb", "site": "https://openreview.net/forum?id=STFJBXDTSlT", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "4;3;4;4;2", "correctness": "3;2;3;4;4", "technical_novelty": "3;3;3;4;3", "empirical_novelty": "2;3;3;4;3", "wc_summary_paper": "48;83;52;62;85", "wc_summary_review": "92;298;84;48;38", "wc_main_review": "532;286;429;238;359", "wc_review": "672;667;565;348;482", "wc_reply_reviewers": "457;0;1028;80;0", "wc_reply_authors": "3463;1536;3674;559;460", "reply_reviewers": "1;0;4;1;0", "reply_authors": "7;4;9;2;1", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 3.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 66.0, 15.401298646542765 ], "wc_summary_review_avg": [ 112.0, 95.23864761744572 ], "wc_main_review_avg": [ 368.8, 104.22936246566992 ], "wc_review_avg": [ 546.8, 121.79885056928904 ], "wc_reply_reviewers_avg": [ 313.0, 395.5219336522312 ], "wc_reply_authors_avg": [ 1938.4, 1384.7056871407729 ], "reply_reviewers_avg": [ 1.2, 1.4696938456699067 ], "reply_authors_avg": [ 4.6, 3.006659275674582 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.8728715609439693, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12382448474762580849&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Science and Technology of China;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.washington.edu", "aff_unique_abbr": "USTC;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "id": "SUIK1esNljC", "title": "AutoDrop: Training Deep Learning Models with Automatic Learning Rate Drop", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Modern deep learning (DL) architectures are trained using variants of the SGD algorithm that is run with a $\\textit{manually}$ defined learning rate schedule, i.e., the 
learning rate is dropped at pre-defined epochs, typically when the training loss is expected to saturate. In this paper, we develop an algorithm that realizes the learning rate drop $\\textit{automatically}$. The proposed method, which we refer to as AutoDrop, is motivated by the observation that the angular velocity of the model parameters, i.e., the velocity of the changes of the convergence direction, for a fixed learning rate initially increases rapidly and then progresses towards soft saturation. At saturation the optimizer slows down, so the angular velocity saturation is a good indicator for dropping the learning rate. After the drop, the angular velocity ``resets'' and follows the previously described pattern - it increases again until saturation. We show that our method improves over SOTA training approaches: it accelerates the training of DL models and leads to better generalization. We also show that our method does not require any extra hyperparameter tuning. AutoDrop is furthermore extremely simple to implement and computationally cheap. Finally, we develop a theoretical framework for analyzing our algorithm and provide convergence guarantees. ", "keywords": "deep learning optimization;automatic learning rate drop;schedules of the hyperparameters", "primary_area": "", "supplementary_material": "/attachment/f42464cda1306e51a877df537986ca5c3a8f4872.zip", "author": "Yunfei Teng;Jing Wang;Anna Ewa Choromanska", "authorids": "~Yunfei_Teng1;~Jing_Wang24;~Anna_Ewa_Choromanska1", "gender": "Unspecified;F;female", "homepage": ";;https://engineering.nyu.edu/faculty/anna-choromanska", "dblp": "215/5192;02/736;55/11352", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.pl/citations?user=l-mlF7YAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yunfei_Teng1;~Jing_Wang24;~Anna_Ewa_Choromanska1", "aff": "New York University;New York University;New York University", "aff_domain": "nyu.edu;nyu.edu;nyu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nteng2022autodrop,\ntitle={AutoDrop: Training Deep Learning Models with Automatic Learning Rate Drop},\nauthor={Yunfei Teng and Jing Wang and Anna Ewa Choromanska},\nyear={2022},\nurl={https://openreview.net/forum?id=SUIK1esNljC}\n}", "github": "", "project": "", "reviewers": "dXCM;96Jg;qVox;56zk", "site": "https://openreview.net/forum?id=SUIK1esNljC", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "3;3;1;2", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "67;78;82;81", "wc_summary_review": "57;76;67;41", "wc_main_review": "516;633;558;704", "wc_review": "640;787;707;826", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.0, 5.958187643906492 ], "wc_summary_review_avg": [ 60.25, 12.987975207860538 ], "wc_main_review_avg": [ 602.75, 71.9283497655827 ], "wc_review_avg": [ 740.0, 71.92704637339142 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2,
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=14268892586664275860&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "SVcEx6SC_NL", "title": "Adversarial Robustness as a Prior for Learned Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "An common goal in deep learning is to learn versatile, high-level feature representations of input data. However, standard networks' representations seem to possess shortcomings that, as we illustrate, prevent them from fully realizing this goal. In this work, we show that robust optimization can be re-cast as a tool for enforcing priors on the features learned by deep neural networks. It turns out that representations learned by robust models address the aforementioned shortcomings and make significant progress towards learning a high-level encoding of inputs. In particular, these representations are approximately invertible, while allowing for direct visualization and manipulation of salient input features. More broadly, our results indicate adversarial robustness as a promising avenue for improving learned representations. ", "keywords": "adversarial robustness;representation learning", "primary_area": "", "supplementary_material": "", "author": "Logan Engstrom;Andrew Ilyas;Shibani Santurkar;Dimitris Tsipras;Brandon Tran;Aleksander Madry", "authorids": "~Logan_Engstrom1;~Andrew_Ilyas1;~Shibani_Santurkar1;~Dimitris_Tsipras1;~Brandon_Tran1;~Aleksander_Madry1", "gender": "M;M;;M;M;M", "homepage": ";http://andrewilyas.com;https://shibanisanturkar.com/;https://dtsipras.com;;https://people.csail.mit.edu/madry/", "dblp": "207/7298;156/5465;153/2146;168/4752;160/1121;67/2454", "google_scholar": ";Dtw3YBoAAAAJ;QMkbFp8AAAAJ;26eh1jAAAAAJ;;SupjsEUAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Logan_Engstrom1;~Andrew_Ilyas1;~Shibani_Santurkar1;~Dimitris_Tsipras1;~Brandon_Tran1;~Aleksander_Madry1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Stanford University;Stanford University;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;stanford.edu;stanford.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;Postdoc;Postdoc;PhD student;Professor", "bibtex": "@misc{\nengstrom2022adversarial,\ntitle={Adversarial Robustness as a Prior for Learned Representations},\nauthor={Logan Engstrom and Andrew Ilyas and Shibani Santurkar and Dimitris Tsipras and Brandon Tran and Aleksander Madry},\nyear={2022},\nurl={https://openreview.net/forum?id=SVcEx6SC_NL}\n}", "github": "", "project": "", "reviewers": "h5Za;o96F;DfB9;Ukbn", "site": "https://openreview.net/forum?id=SVcEx6SC_NL", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;2", "correctness": "4;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "113;58;33;63", "wc_summary_review": "38;23;39;65", "wc_main_review": "223;503;306;51", "wc_review": "374;584;378;179", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], 
"technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.75, 29.02046691560975 ], "wc_summary_review_avg": [ 41.25, 15.105876340020794 ], "wc_main_review_avg": [ 270.75, 162.5982395353652 ], "wc_review_avg": [ 378.75, 143.22251045139518 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 225, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1831998416682722967&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.stanford.edu", "aff_unique_abbr": "MIT;Stanford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "SVey0ddzC4", "title": "Connecting Graph Convolution and Graph PCA", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph convolution operator of the GCN model is originally motivated from a localized first-order approximation of spectral graph convolutions. This work stands on a different view; establishing a mathematical connection between graph convolution and graph-regularized PCA (GPCA). Based on this connection, the GCN architecture, shaped by stacking graph convolution layers, shares a close relationship with stacking GPCA. We empirically demonstrate that the unsupervised embeddings by GPCA paired with a 1- or 2-layer MLP achieves similar or even better performance than many sophisticated baselines on semi-supervised node classification tasks across five datasets including Open Graph Benchmark. This suggests that the prowess of graph convolution is driven by graph based regularization. In addition, we extend GPCA to the (semi-)supervised setting and show that it is equivalent to GPCA on a graph extended with \u201cghost\u201d edges between nodes of the same label. 
Finally, we capitalize on the discovered relationship to design an effective initialization strategy based on stacking GPCA, enabling GCN to converge faster and achieve robust performance at a large number of layers.", "keywords": "Graph Convolutional Network;graph regularization;GNN initialization;graph-based PCA", "primary_area": "", "supplementary_material": "/attachment/ab39c84be7105c6bffcc31b068b0ccc0b3d03f0d.zip", "author": "Lingxiao Zhao;Leman Akoglu", "authorids": "~Lingxiao_Zhao1;~Leman_Akoglu3", "gender": "M;F", "homepage": "http://lingxiaozhao.com/;http://www.andrew.cmu.edu/user/lakoglu/", "dblp": ";02/6979.html", "google_scholar": "QKslW6EAAAAJ;4ITkr_kAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Lingxiao_Zhao1;~Leman_Akoglu3", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nzhao2022connecting,\ntitle={Connecting Graph Convolution and Graph {PCA}},\nauthor={Lingxiao Zhao and Leman Akoglu},\nyear={2022},\nurl={https://openreview.net/forum?id=SVey0ddzC4}\n}", "github": "", "project": "", "reviewers": "eCjq;YcNy;uURT;Awat", "site": "https://openreview.net/forum?id=SVey0ddzC4", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;5;4;3", "correctness": "4;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;4;2", "wc_summary_paper": "93;27;38;39", "wc_summary_review": "32;44;20;46", "wc_main_review": "182;229;196;111", "wc_review": "307;300;254;196", "wc_reply_reviewers": "78;96;0;0", "wc_reply_authors": "524;1136;530;677", "reply_reviewers": "1;1;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 49.25, 25.694114111990707 ], "wc_summary_review_avg": [ 35.5, 10.428326807307105 ], "wc_main_review_avg": [ 179.5, 43.072613108563544 ], "wc_review_avg": [ 264.25, 44.35298749802543 ], "wc_reply_reviewers_avg": [ 43.5, 43.96305266925854 ], "wc_reply_authors_avg": [ 716.75, 249.6891817840733 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:u_uY3d1IcdkJ:scholar.google.com/&scioq=Connecting+Graph+Convolution+and+Graph+PCA&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "SVwbKmEg7M", "title": "Unsupervised Neural Machine Translation with Generative Language Models Only", "track": "main", "status": "Reject", "tldr": "", "abstract": "We show how to derive state-of-the-art unsupervised neural machine translation systems from generatively pre-trained language models. Our method consists of three steps: \\emph{few-shot amplification}, \\emph{distillation}, and \\emph{backtranslation}. 
We first use the zero-shot translation ability of large pretrained language models to generate translations for a small set of unlabeled sentences. We then amplify these zero-shot translations by using them as few-shot demonstrations for sampling a larger synthetic dataset. This dataset is then distilled by discarding the few-shot demonstrations and then fine-tuning. During backtranslation, we repeatedly generate translations for a set of inputs and then fine-tune a single language model on both directions of the translation task at once, ensuring cycle-consistency by swapping the roles of gold monotext and generated translations when fine-tuning. By using our method to leverage GPT-3's zero-shot translation capability, we achieve a new state-of-the-art in unsupervised translation on the WMT14 English-French benchmark, attaining a BLEU score of 42.1. ", "keywords": "unsupervised;machine translation;language modeling;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Jesse Michael Han;Igor Babuschkin;Harrison Edwards;Arvind Neelakantan;Tao Xu;Stanislas Polu;Alex Ray;Pranav Shyam;Aditya Ramesh;Alec Radford;Ilya Sutskever", "authorids": "~Jesse_Michael_Han1;~Igor_Babuschkin1;~Harrison_Edwards1;~Arvind_Neelakantan1;tao@openai.com;~Stanislas_Polu1;aray@openai.com;pranav@openai.com;aramesh@openai.com;~Alec_Radford1;~Ilya_Sutskever1", "gender": "M;M;M;M;;M;;;;;", "homepage": "https://jesse-michael-han.github.io;https://www.babushk.in;;https://people.cs.umass.edu/~arvind/;;;;;;;", "dblp": ";198/1445;;142/8636;;;;;;172/1154;60/5276", "google_scholar": ";_N2COeAAAAAJ;0o470HsAAAAJ;ygTCc6cAAAAJ;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": "~Jesse_Michael_Han1;~Igor_Babuschkin1;~Harrison_Edwards1;~Arvind_Neelakantan1;tao@openai.com;~Stanislas_Polu1;aray@openai.com;pranav@openai.com;aramesh@openai.com;~Alec_Radford1;~Ilya_Sutskever1", "aff": "University of Pittsburgh;Google DeepMind;;University of Massachusetts Amherst;;OpenAI;;;;;", "aff_domain": "pitt.edu;deepmind.com;; ;;openai.com;;;;;", "position": "PhD student;Senior Research Engineer;;Graduate Student;;Research Engineer;;;;;", "bibtex": "@misc{\nhan2022unsupervised,\ntitle={Unsupervised Neural Machine Translation with Generative Language Models Only},\nauthor={Jesse Michael Han and Igor Babuschkin and Harrison Edwards and Arvind Neelakantan and Tao Xu and Stanislas Polu and Alex Ray and Pranav Shyam and Aditya Ramesh and Alec Radford and Ilya Sutskever},\nyear={2022},\nurl={https://openreview.net/forum?id=SVwbKmEg7M}\n}", "github": "", "project": "", "reviewers": "LmFj;bUnG;LnSu;K8G8", "site": "https://openreview.net/forum?id=SVwbKmEg7M", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;3;4", "correctness": "3;3;1;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "0;3;2;0", "wc_summary_paper": "96;147;83;83", "wc_summary_review": "145;86;102;47", "wc_main_review": "1077;180;265;378", "wc_review": "1318;413;450;508", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "753;222;322;250", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 102.25, 26.37588861062315 ], "wc_summary_review_avg": [ 95.0, 35.12121865767189 ], "wc_main_review_avg": [ 475.0, 354.59060901270357 ], "wc_review_avg": [ 
672.25, 374.3583677440642 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 386.75, 214.57792873452757 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": -0.6225430174794673, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17302434144763438462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Pittsburgh;Google;University of Massachusetts Amherst;OpenAI", "aff_unique_dep": ";Google DeepMind;;", "aff_unique_url": "https://www.pitt.edu;https://deepmind.com;https://www.umass.edu;https://openai.com", "aff_unique_abbr": "Pitt;DeepMind;UMass Amherst;OpenAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "On the Existence of Universal Lottery Tickets", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6834", "id": "SYB4WrJql1n", "poster": "", "openreview": "https://openreview.net/forum?id=SYB4WrJql1n", "slides": "https://iclr.cc/virtual/2022/poster/6834", "video": "https://iclr.cc/virtual/2022/poster/6834", "author_site": "Rebekka Burkholz, Nilanjana Laha, Rajarshi Mukherjee, Alkis Gotovos", "tldr": "", "abstract": "The lottery ticket hypothesis conjectures the existence of sparse subnetworks of large randomly initialized deep neural networks that can be successfully trained in isolation. Recent work has experimentally observed that some of these tickets can be practically reused across a variety of tasks, hinting at some form of universality. We formalize this concept and theoretically prove that not only do such universal tickets exist but they also do not require further training. Our proofs introduce a couple of technical innovations related to pruning for strong lottery tickets, including extensions of subset sum results and a strategy to leverage higher amounts of depth. Our explicit sparse constructions of universal function families might be of independent interest, as they highlight representational benefits induced by univariate convolutional architectures. 
", "keywords": "theory;deep learning;lottery tickets;universality", "primary_area": "", "supplementary_material": "/attachment/f53431f73176b4cd4cf6e903254ff0957167d77d.zip", "author": "Rebekka Burkholz;Nilanjana Laha;Rajarshi Mukherjee;Alkis Gotovos", "authorids": "~Rebekka_Burkholz1;~Nilanjana_Laha1;~Rajarshi_Mukherjee1;~Alkis_Gotovos1", "gender": "F;F;M;M", "homepage": "https://sites.google.com/view/rebekkaburkholz/startseite;;https://scholar.harvard.edu/rajarshi/home;http://people.csail.mit.edu/alkisg/", "dblp": "194/3172;;;81/10871", "google_scholar": "https://scholar.google.ch/citations?user=vkWBb2wAAAAJ;neGaG4QAAAAJ;;UJ9-UuIAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Rebekka_Burkholz1;~Nilanjana_Laha1;~Rajarshi_Mukherjee1;~Alkis_Gotovos1", "aff": "Helmholtz Center CISPA for Information Security;Harvard university;Harvard University;MPI-SWS", "aff_domain": "cispa.saarland;hsph.harvard.edu;harvard.edu;mpi-sws.org", "position": "Associate Professor;Postdoc;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nburkholz2022on,\ntitle={On the Existence of Universal Lottery Tickets},\nauthor={Rebekka Burkholz and Nilanjana Laha and Rajarshi Mukherjee and Alkis Gotovos},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SYB4WrJql1n}\n}", "github": "", "project": "", "reviewers": "y2DU;TK28;rD3v;6yJz", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;2;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "0;3;0;2", "wc_summary_paper": "42;46;35;133", "wc_summary_review": "55;26;21;55", "wc_main_review": "167;276;303;287", "wc_review": "264;348;359;475", "wc_reply_reviewers": "55;69;26;25", "wc_reply_authors": "276;571;560;349", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 64.0, 40.03123780249619 ], "wc_summary_review_avg": [ 39.25, 15.848895860595462 ], "wc_main_review_avg": [ 258.25, 53.55079364491249 ], "wc_review_avg": [ 361.5, 75.12822372450982 ], "wc_reply_reviewers_avg": [ 43.75, 18.91262805640718 ], "wc_reply_authors_avg": [ 439.0, 129.1646236397567 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4071511330404748656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=SYB4WrJql1n", "email": "cispa.saarland;hsph.harvard.edu;harvard.edu;mpi-sws.org", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Helmholtz Center CISPA;Harvard University;Max Planck Institute for Software Systems", "aff_unique_dep": "Information Security;;", "aff_unique_url": "https://www.cispa.de/;https://www.harvard.edu;https://www.mpi-sws.org", "aff_unique_abbr": "CISPA;Harvard;MPI-SWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Sparsity Winning Twice: Better Robust Generalization from More Efficient Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6147", 
"id": "SYuJXrXq8tw", "poster": "", "openreview": "https://openreview.net/forum?id=SYuJXrXq8tw", "slides": "https://iclr.cc/virtual/2022/poster/6147", "video": "https://iclr.cc/virtual/2022/poster/6147", "author_site": "Tianlong Chen, Zhenyu Zhang, pengjun wang, Santosh Balachandra, Haoyu Ma, Zehao Wang, Zhangyang Wang", "tldr": "", "abstract": "Recent studies demonstrate the deep networks, even robustified by the state-of-the-art adversarial training (AT), still suffer from large robust generalization gaps, in addition to the much more expensive training costs than standard training. In this paper, we investigate this intriguing problem from a new perspective, i.e., $\\textit{injecting appropriate forms of sparsity}$ during adversarial training. We introduce two alternatives for sparse adversarial training: (i) $\\textit{static sparsity}$, by leveraging recent results from the lottery ticket hypothesis to identify critical sparse subnetworks arising from the early training; (ii) $\\textit{dynamic sparsity}$, by allowing the sparse subnetwork to adaptively adjust its connectivity pattern (while sticking to the same sparsity ratio) throughout training. We find both static and dynamic sparse methods to yield win-win: substantially shrinking the robust generalization gap and alleviating the robust overfitting, meanwhile significantly saving training and inference FLOPs. Extensive experiments validate our proposals with multiple network architectures on diverse datasets, including CIFAR-10/100 and Tiny-ImageNet. For example, our methods reduce robust generalization gap and overfitting by $34.44\\%$ and $4.02\\%$, with comparable robust/standard accuracy boosts and $87.83\\%$/$87.82\\%$ training/inference FLOPs savings on CIFAR-100 with ResNet-18. Besides, our approaches can be organically combined with existing regularizers, establishing new state-of-the-art results in AT. 
All codes are included.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/d712be4fcd8609c9d5ca643780eeed0ca3bbd18f.zip", "author": "Tianlong Chen;Zhenyu Zhang;pengjun wang;Santosh Balachandra;Haoyu Ma;Zehao Wang;Zhangyang Wang", "authorids": "~Tianlong_Chen1;~Zhenyu_Zhang4;~pengjun_wang1;~Santosh_Balachandra1;~Haoyu_Ma1;~Zehao_Wang4;~Zhangyang_Wang1", "gender": "M;M;;;M;M;M", "homepage": "https://tianlong-chen.github.io;https://zhenyu.gallery;https://github.com/WLucky;;https://www.ics.uci.edu/~haoyum3/;;https://vita-group.github.io", "dblp": ";01/1844-15;;;144/1634;;119/4026", "google_scholar": "LE3ctn0AAAAJ;ZLyJRxoAAAAJ;;;8jugwosAAAAJ;7ksTPyAAAAAJ;pxFyKAIAAAAJ", "orcid": "0000-0001-7774-8197;;;;0000-0001-6646-2644;;", "linkedin": "tianlong-chen-783862167/;zhenyu-allen-zhang-a9b1391a3/;;https://linkedin.com/toshb;haoyu-ma-53517915a/;%E6%B3%BD%E6%98%8A-%E7%8E%8B-b12bb1201/;", "or_profile": "~Tianlong_Chen1;~Zhenyu_Zhang4;~pengjun_wang1;~Santosh_Balachandra1;~Haoyu_Ma1;~Zehao_Wang4;~Zhangyang_Wang1", "aff": "University of Texas, Austin;University of Science and Technology of China;;University of Texas, Austin;Meta Platforms, Inc;University of Science and Technology of China;University of Texas, Austin", "aff_domain": "utexas.edu;ustc.edu;;utexas.edu;fb.com;mail.ustc.edu.cn;utexas.edu", "position": "PhD student;MS student;;Undergrad student;Intern;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nchen2022sparsity,\ntitle={Sparsity Winning Twice: Better Robust Generalization from More Efficient Training},\nauthor={Tianlong Chen and Zhenyu Zhang and pengjun wang and Santosh Balachandra and Haoyu Ma and Zehao Wang and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SYuJXrXq8tw}\n}", "github": "", "project": "", "reviewers": "EaMs;dw7x;75p4;a6jd", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;4;4;4", "wc_summary_paper": "153;51;34;58", "wc_summary_review": "85;43;16;21", "wc_main_review": "286;317;377;366", "wc_review": "524;411;427;445", "wc_reply_reviewers": "0;0;26;0", "wc_reply_authors": "1929;1496;1531;349", "reply_reviewers": "0;0;1;0", "reply_authors": "8;2;5;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 74.0, 46.438130884005226 ], "wc_summary_review_avg": [ 41.25, 27.224758952100935 ], "wc_main_review_avg": [ 336.5, 36.881567211820055 ], "wc_review_avg": [ 451.75, 43.41298768801797 ], "wc_reply_reviewers_avg": [ 6.5, 11.258330249197702 ], "wc_reply_authors_avg": [ 1326.25, 589.2925313458503 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.0, 2.7386127875258306 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.7777777777777777, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8533592747621163539&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=SYuJXrXq8tw", "email": "utexas.edu;ustc.edu;;utexas.edu;fb.com;mail.ustc.edu.cn;utexas.edu", "author_num": 7, "aff_unique_index": "0;1;0;2;1;0", "aff_unique_norm": "University of 
Texas at Austin;University of Science and Technology of China;Meta", "aff_unique_dep": ";;Meta Platforms, Inc", "aff_unique_url": "https://www.utexas.edu;http://www.ustc.edu.cn;https://www.meta.com", "aff_unique_abbr": "UT Austin;USTC;Meta", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "SZRqWWB4AAh", "title": "SABAL: Sparse Approximation-based Batch Active Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a novel and general framework (i.e., SABAL) that formulates batch active learning as a sparse approximation problem. SABAL aims to find a weighted subset from the unlabeled data pool such that the corresponding training loss function approximates its full data pool counterpart. We realize the general framework as a sparsity-constrained discontinuous optimization problem that explicitly balances uncertainty and representation for large-scale applications, for which we propose both greedy and iterative hard thresholding schemes. The proposed method can adapt to various settings, including both Bayesian and non-Bayesian neural networks. Numerical experiments show that SABAL achieves state-of-the-art performance across different settings with lower computational complexity.", "keywords": "active learning;Bayesian active learning;batch active learning", "primary_area": "", "supplementary_material": "/attachment/6e516f75b1e7db5657dd3bc143053e727d5346f8.zip", "author": "Maohao Shen;Bowen Jiang;Jacky Y. Zhang;Oluwasanmi O Koyejo", "authorids": "~Maohao_Shen1;~Bowen_Jiang2;~Jacky_Y._Zhang1;~Oluwasanmi_O_Koyejo1", "gender": "M;F;M;", "homepage": "https://maohaos2.github.io/Maohao/;https://sites.google.com/seas.upenn.edu/bowenjiang;https://cs.stanford.edu/~sanmi/;https://yiboz.me/", "dblp": "272/5397;142/2975-1;14/8885;251/9129.html", "google_scholar": ";_6AHV9QAAAAJ;EaaOeJwAAAAJ;", "orcid": ";0009-0005-0414-0435;0000-0002-4023-419X;", "linkedin": ";bowen-jiang-6946b2187/;sanmi-koyejo-984754/;", "or_profile": "~Maohao_Shen1;~Bowen_Jiang2;~Oluwasanmi_O_Koyejo1;~Jacky_Yibo_Zhang1", "aff": "Massachusetts Institute of Technology;University of Pennsylvania;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "mit.edu;upenn.edu;illinois.edu;illinois.edu", "position": "PhD student;PhD student;Associate Professor;PhD student", "bibtex": "@misc{\nshen2022sabal,\ntitle={{SABAL}: Sparse Approximation-based Batch Active Learning},\nauthor={Maohao Shen and Bowen Jiang and Jacky Y. 
Zhang and Oluwasanmi O Koyejo},\nyear={2022},\nurl={https://openreview.net/forum?id=SZRqWWB4AAh}\n}", "github": "", "project": "", "reviewers": "ECiF;dVWY;3Nq2;DQaH", "site": "https://openreview.net/forum?id=SZRqWWB4AAh", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "86;33;41;49", "wc_summary_review": "80;16;27;19", "wc_main_review": "313;144;237;202", "wc_review": "479;193;305;270", "wc_reply_reviewers": "54;0;0;114", "wc_reply_authors": "331;293;319;625", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.25, 20.29008378494283 ], "wc_summary_review_avg": [ 35.5, 26.004807247891687 ], "wc_main_review_avg": [ 224.0, 61.18414827387891 ], "wc_review_avg": [ 311.75, 104.71717862891455 ], "wc_reply_reviewers_avg": [ 42.0, 47.05316142407437 ], "wc_reply_authors_avg": [ 392.0, 135.22203962372407 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2ZIvZz1QKpIJ:scholar.google.com/&scioq=SABAL:+Sparse+Approximation-based+Batch+Active+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Massachusetts Institute of Technology;University of Pennsylvania;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.upenn.edu;https://illinois.edu", "aff_unique_abbr": "MIT;UPenn;UIUC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Unsupervised Semantic Segmentation by Distilling Feature Correspondences", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6068", "id": "SaKO6z6Hl0c", "poster": "", "openreview": "https://openreview.net/forum?id=SaKO6z6Hl0c", "slides": "https://iclr.cc/virtual/2022/poster/6068", "video": "https://iclr.cc/virtual/2022/poster/6068", "author_site": "Mark Hamilton, Zhoutong Zhang, Bharath Hariharan, Noah Snavely, William Freeman", "tldr": "", "abstract": "Unsupervised semantic segmentation aims to discover and localize semantically meaningful categories within image corpora without any form of annotation. To solve this task, algorithms must produce features for every pixel that are both semantically meaningful and compact enough to form distinct clusters. Unlike previous works which achieve this with a single end-to-end framework, we propose to separate feature learning from cluster compactification. Empirically, we show that current unsupervised feature learning frameworks already generate dense features whose correlations are semantically consistent. This observation motivates us to design STEGO ($\\textbf{S}$elf-supervised $\\textbf{T}$ransformer with $\\textbf{E}$nergy-based $\\textbf{G}$raph $\\textbf{O}$ptimization), a novel framework that distills unsupervised features into high-quality discrete semantic labels. 
At the core of STEGO is a novel contrastive loss function that encourages features to form compact clusters while preserving their association pattern. STEGO yields a significant improvement over the prior state of the art, on both the CocoStuff ($\\textbf{+14 mIoU}$) and Cityscapes ($\\textbf{+9 mIoU}$) semantic segmentation challenges. ", "keywords": "Unsupervised Semantic Segmentation;Unsupervised Learning;Deep Features;Contrastive Learning;Visual Transformers;Cocostuff;Cityscapes;Semantic Segmentation", "primary_area": "", "supplementary_material": "", "author": "Mark Hamilton;Zhoutong Zhang;Bharath Hariharan;Noah Snavely;William T. Freeman", "authorids": "~Mark_Hamilton1;~Zhoutong_Zhang1;~Bharath_Hariharan3;~Noah_Snavely1;~William_T._Freeman1", "gender": "M;M;M;M;M", "homepage": "https://mhamilton.net;https://ztzhang.info;http://www.cs.cornell.edu/~snavely/;https://billf.mit.edu/;http://home.bharathh.info", "dblp": "91/631;169/4798;33/4636;86/6650;05/8412", "google_scholar": "kgZtMGsAAAAJ;;Db4BCX8AAAAJ;https://scholar.google.com.tw/citations?user=0zZnyMEAAAAJ;TpglobcAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Mark_Hamilton1;~Zhoutong_Zhang1;~Noah_Snavely1;~William_T._Freeman1;~Bharath_Hariharan2", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Cornell University;Massachusetts Institute of Technology;Cornell University", "aff_domain": "mit.edu;mit.edu;cornell.edu;mit.edu;cornell.edu", "position": "PhD student;PhD student;Associate Professor;Professor;Assistant Professor", "bibtex": "@inproceedings{\nhamilton2022unsupervised,\ntitle={Unsupervised Semantic Segmentation by Distilling Feature Correspondences},\nauthor={Mark Hamilton and Zhoutong Zhang and Bharath Hariharan and Noah Snavely and William T. 
Freeman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SaKO6z6Hl0c}\n}", "github": "", "project": "", "reviewers": "GKTw;2hfj;cF6T;XoJq", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "2;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "30;141;98;146", "wc_summary_review": "19;256;46;46", "wc_main_review": "187;542;485;177", "wc_review": "236;939;629;369", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "359;319;517;227", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 103.75, 46.48857386498321 ], "wc_summary_review_avg": [ 91.75, 95.46825388578132 ], "wc_main_review_avg": [ 347.75, 167.00804621334865 ], "wc_review_avg": [ 543.25, 268.6711512239451 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 355.5, 104.8081580794167 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 328, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8638628527714032897&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=SaKO6z6Hl0c", "email": "mit.edu;mit.edu;cornell.edu;mit.edu;cornell.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.cornell.edu", "aff_unique_abbr": "MIT;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "SawkGZ3oR2J", "title": "Accelerating Federated Split Learning via Local-Loss-Based Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated learning (FL) operates based on model exchanges between the server and the clients, and suffers from significant communication as well as client-side computation burden. Emerging split learning (SL) solutions can reduce the client-side computation burden by splitting the model architecture between the server and the clients. However, SL-based ideas still incur significant time delay, since each participating client must wait for the backpropagated gradients from the server in order to update its model. Also, the communication burden can still be substantial, depending on various factors like local dataset size and shape of cut layer activations/gradients. In this paper, we propose a new direction to FL/SL based on updating the client/server-side models in parallel, via local-loss-based training specifically geared to split learning. The parallel training of split models substantially shortens latency while obviating server-to-clients communication. We provide latency analysis that leads to optimal model cut as well as general guidelines for splitting the model. We also provide a theoretical analysis for guaranteeing convergence and understanding interplay among different hyperparameters and system constraints. 
Extensive experimental results indicate that our scheme has significant communication and latency advantages over existing FL and SL ideas.", "keywords": "Federated Learning;Split Learning", "primary_area": "", "supplementary_material": "", "author": "Dong-Jun Han;Hasnain Irshad Bhatti;Jungmoon Lee;Jaekyun Moon", "authorids": "~Dong-Jun_Han1;~Hasnain_Irshad_Bhatti1;~Jungmoon_Lee1;~Jaekyun_Moon2", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/djhan930/home?authuser=0;https://hasnainirshad.github.io/;http://comstolab.kaist.ac.kr/;http://comstolab.kaist.ac.kr/people.html", "dblp": "201/0078;326/1184;;78/2744", "google_scholar": "https://scholar.google.co.kr/citations?user=-YR-GxUAAAAJ;https://scholar.google.co.kr/citations?user=aP3vFRcAAAAJ;;", "orcid": ";;;", "linkedin": ";hasnainirshad/;;", "or_profile": "~Dong-Jun_Han1;~Hasnain_Irshad_Bhatti1;~Jungmoon_Lee1;~Jaekyun_Moon2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.edu;kaist.edu", "position": "PhD student;MS student;PhD student;Full Professor", "bibtex": "@misc{\nhan2022accelerating,\ntitle={Accelerating Federated Split Learning via Local-Loss-Based Training},\nauthor={Dong-Jun Han and Hasnain Irshad Bhatti and Jungmoon Lee and Jaekyun Moon},\nyear={2022},\nurl={https://openreview.net/forum?id=SawkGZ3oR2J}\n}", "github": "", "project": "", "reviewers": "ysxN;7kMm;H7B8;FyYW", "site": "https://openreview.net/forum?id=SawkGZ3oR2J", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "43;177;71;111", "wc_summary_review": "44;59;38;31", "wc_main_review": "151;468;568;313", "wc_review": "238;704;677;455", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 100.5, 50.34630075785112 ], "wc_summary_review_avg": [ 43.0, 10.319883720275147 ], "wc_main_review_avg": [ 375.0, 158.04904302146218 ], "wc_review_avg": [ 518.5, 188.5769073879408 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:t3zT8-vo7GYJ:scholar.google.com/&scioq=Accelerating+Federated+Split+Learning+via+Local-Loss-Based+Training&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "Sb4hTI15hUZ", "title": "Data-oriented Scene Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most deep learning backbones are evaluated on ImageNet. 
Using scenery images as an example, we conducted extensive experiments to demonstrate that the widely accepted principles in network design may result in dramatic performance differences when the data is altered. Exploratory experiments are conducted to explain the underlying cause of the differences. Based on our observation, this paper presents a novel network design methodology: data-oriented network design. In other words, instead of designing universal backbones, the design of the networks should treat the characteristics of data as a crucial component. We further propose a Deep-Narrow Network and Lossless Pooling module, which improve scene recognition performance using less than half of the computational resources compared to the benchmark ResNet architectures. ", "keywords": "data-oriented design;computer vision;scene recognition;image recognition", "primary_area": "", "supplementary_material": "", "author": "Zhinan Qiao;Xiaohui Yuan;Chaoning Zhang;Jianfang Shi;Jian Xia", "authorids": "~Zhinan_Qiao1;~Xiaohui_Yuan1;~Chaoning_Zhang1;shijianfang66@163.com;tyljx@163.com", "gender": "F;;M;;", "homepage": ";;;;", "dblp": "270/0572;;;;", "google_scholar": "LyQTsQcAAAAJ;4F2la7sAAAAJ;https://scholar.google.co.kr/citations?user=lvhxhyQAAAAJ;;", "orcid": ";;;;", "linkedin": "zn-q-3a2946159/;;;;", "or_profile": "~Zhinan_Qiao1;~Xiaohui_Yuan1;~Chaoning_Zhang1;shijianfang66@163.com;tyljx@163.com", "aff": "Experian DataLabs;University of North Texas, Denton;Korea Advanced Institute of Science & Technology;;", "aff_domain": "experian.com;unt.edu;kaist.ac.kr;;", "position": "Researcher;professor;Postdoc;;", "bibtex": "@misc{\nqiao2022dataoriented,\ntitle={Data-oriented Scene Recognition},\nauthor={Zhinan Qiao and Xiaohui Yuan and Chaoning Zhang and Jianfang Shi and Jian Xia},\nyear={2022},\nurl={https://openreview.net/forum?id=Sb4hTI15hUZ}\n}", "github": "", "project": "", "reviewers": "aP7k;eQUx;VeaE;rDrK", "site": "https://openreview.net/forum?id=Sb4hTI15hUZ", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "2;1;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "29;46;79;143", "wc_summary_review": "28;75;59;64", "wc_main_review": "278;299;366;580", "wc_review": "335;420;504;787", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.25, 43.573931426943794 ], "wc_summary_review_avg": [ 56.5, 17.44276354251241 ], "wc_main_review_avg": [ 380.75, 119.53948092575942 ], "wc_review_avg": [ 511.5, 169.91247747001984 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ft0BWW104bAJ:scholar.google.com/&scioq=Data-oriented+Scene+Recognition&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Experian;University of North Texas;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "DataLabs;;", "aff_unique_url": 
"https://www.experian.com;https://www.unt.edu;https://www.kaist.ac.kr", "aff_unique_abbr": "Experian;UNT;KAIST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Denton", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United Kingdom;United States;South Korea" }, { "id": "SbV8J9JHb6", "title": "Soteria: In search of efficient neural networks for private inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the context of ML as a service, our objective is to protect the confidentiality of the users\u2019 queries and the server's model parameters, with modest computation and communication overhead. Prior solutions primarily propose fine-tuning cryptographic methods to make them efficient for known fixed model architectures. The drawback with this line of approach is that the model itself is never designed to efficiently operate with existing cryptographic computations. We observe that the network architecture, internal functions, and parameters of a model, which are all chosen during training, significantly influence the computation and communication overhead of a cryptographic method, during inference.Thus, we propose SOTERIA \u2014 a training method to construct model architectures that are by-design efficient for private inference. We use neural architecture search algorithms with the dual objective of optimizing the accuracy of the model and the overhead of using cryptographic primitives for secure inference. Given the flexibility of modifying a model during training, we find accurate models that are also efficient for private computation. We select garbled circuits as our underlying cryptographic primitive, due to their expressiveness and efficiency. We empirically evaluate SOTERIA on MNIST and CIFAR10 datasets, to compare with the prior work on secure inference. 
Our results confirm that SOTERIA is indeed effective in balancing performance and accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anshul Aggarwal;Trevor E Carlson;Reza Shokri;Shruti Tople", "authorids": "anshul@comp.nus.edu.sg;~Trevor_E_Carlson1;~Reza_Shokri1;~Shruti_Tople2", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "anshul@comp.nus.edu.sg;~Trevor_E_Carlson1;~Reza_Shokri1;~Shruti_Tople2", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\naggarwal2022soteria,\ntitle={Soteria: In search of efficient neural networks for private inference},\nauthor={Anshul Aggarwal and Trevor E Carlson and Reza Shokri and Shruti Tople},\nyear={2022},\nurl={https://openreview.net/forum?id=SbV8J9JHb6}\n}", "github": "", "project": "", "reviewers": "zXdE;rwM8;TcBF", "site": "https://openreview.net/forum?id=SbV8J9JHb6", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "2;3;4", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "101;34;239", "wc_summary_review": "40;1;57", "wc_main_review": "309;227;158", "wc_review": "450;262;454", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 124.66666666666667, 85.3476550481747 ], "wc_summary_review_avg": [ 32.666666666666664, 23.442601296689656 ], "wc_main_review_avg": [ 231.33333333333334, 61.721597156550935 ], "wc_review_avg": [ 388.6666666666667, 89.58174417195107 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14957687747686818732&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SgEhFeRyzEZ", "title": "Convergence Analysis and Implicit Regularization of Feedback Alignment for Deep Linear Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We theoretically analyze the Feedback Alignment (FA) algorithm, an efficient alternative to backpropagation for training neural networks. We provide convergence guarantees with rates for deep linear networks for both continuous and discrete dynamics. Additionally, we study incremental learning phenomena for shallow linear networks. Interestingly, certain specific initializations imply that negligible components are learned {before} the principal ones, thus potentially negatively affecting the effectiveness of such a learning algorithm, a phenomenon we classify as implicit anti-regularization. 
We also provide initialization schemes where the components of the problem are approximately learned in decreasing order of importance, thus providing a form of implicit regularization.", "keywords": "feedback alignment;optimization;convergence guarantees;implicit regularization", "primary_area": "", "supplementary_material": "/attachment/bdc9175ced37c7cb4245eab4fac217bd6fe696d4.zip", "author": "Manuela Girotti;Ioannis Mitliagkas;Gauthier Gidel", "authorids": "~Manuela_Girotti1;~Ioannis_Mitliagkas1;~Gauthier_Gidel1", "gender": "F;M;M", "homepage": "https://mathemanu.github.io/;http://mitliagkas.github.io/;https://gauthiergidel.github.io/", "dblp": ";83/8757;188/6326", "google_scholar": "P69Py8IAAAAJ;K757SxgAAAAJ;https://scholar.google.fr/citations?user=bDrXQPUAAAAJ", "orcid": "0000-0003-2261-1251;;", "linkedin": "mathemanu/;;", "or_profile": "~Manuela_Girotti1;~Ioannis_Mitliagkas1;~Gauthier_Gidel1", "aff": "Concordia University, Montreal;University of Montreal;Mila - Quebec Artificial Intelligence Institute", "aff_domain": "concordia.ca;umontreal.ca;mila.quebec", "position": "Affiliate Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\ngirotti2022convergence,\ntitle={Convergence Analysis and Implicit Regularization of Feedback Alignment for Deep Linear Networks},\nauthor={Manuela Girotti and Ioannis Mitliagkas and Gauthier Gidel},\nyear={2022},\nurl={https://openreview.net/forum?id=SgEhFeRyzEZ}\n}", "github": "", "project": "", "reviewers": "Q9E4;xi3S;eeMk;6Die", "site": "https://openreview.net/forum?id=SgEhFeRyzEZ", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;3;3;5", "correctness": "4;4;4;4", "technical_novelty": "3;2;2;4", "empirical_novelty": "2;0;2;0", "wc_summary_paper": "92;47;125;234", "wc_summary_review": "51;27;32;127", "wc_main_review": "443;208;202;264", "wc_review": "586;282;359;625", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "502;374;467;448", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 124.5, 69.01630242196404 ], "wc_summary_review_avg": [ 59.25, 40.12714168739159 ], "wc_main_review_avg": [ 279.25, 97.58425846416009 ], "wc_review_avg": [ 463.0, 145.7309164178967 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 447.75, 46.778066441442405 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8703882797784892, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12788635326023747952&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Concordia University;University of Montreal;Quebec Artificial Intelligence Institute", "aff_unique_dep": ";;Artificial Intelligence", "aff_unique_url": "https://www.concordia.ca;https://wwwumontreal.ca;https://mila.quebec", "aff_unique_abbr": "Concordia;UM;Mila", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "ShtJLsF7cbb", "title": "Time-aware Relational Graph Attention Network for Temporal Knowledge Graph Embeddings", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Embedding-based 
representation learning approaches for knowledge graphs (KGs) have been mostly designed for static data. However, many KGs involve temporal data, which creates the need for new representation learning approaches that can characterize and reason over time. In this work, we propose a Time-aware Relational Graph ATtention Network (TR-GAT) for temporal knowledge graph (TKG) embeddings, in which the initial feature of each entity is represented by fusing its embedding and the embeddings of its connected relations and timestamps as well as its neighboring entities. Different from the existing temporal GNN models which discretize temporal graphs into multiple snapshots, we treat timestamps as properties of links between entities. To further incorporate relation and time information into the graph structures, we utilize a self-attention mechanism which assigns different weights to different nodes according to the corresponding link features, i.e., embeddings of the relevant relations and timestamps within one neighborhood. Experimental results show that our approach achieves state-of-the-art performance on TKG completion and entity alignment tasks on several well-established TKG datasets due to the effective and efficient integration of time information.", "keywords": "Temporal Attention;Temporal Knowledge Graph Reasoning;Knowledge Graph Completion;Entity Alignment", "primary_area": "", "supplementary_material": "", "author": "Chengjin Xu;Fenglong Su;Jens Lehmann", "authorids": "~Chengjin_Xu1;~Fenglong_Su1;~Jens_Lehmann3", "gender": "M;M;M", "homepage": "https://soledad921.github.io/chengjin_xu/;;http://jens-lehmann.org", "dblp": "247/6268.html;205/0212.html;71/4882.html", "google_scholar": "https://scholar.google.de/citations?user=sIts5VgAAAAJ;;https://scholar.google.de/citations?user=sEaQ5rgAAAAJ", "orcid": ";;0000-0001-9108-4278", "linkedin": ";;jenslehmann82/", "or_profile": "~Chengjin_Xu1;~Fenglong_Su1;~Jens_Lehmann3", "aff": "University of Bonn;National University of Defense Technology;Fraunhofer IAIS", "aff_domain": "uni-bonn.de;nudt.edu.cn;iais.fraunhofer.de", "position": "PhD student;PhD student;Lead Scientist", "bibtex": "@misc{\nxu2022timeaware,\ntitle={Time-aware Relational Graph Attention Network for Temporal Knowledge Graph Embeddings},\nauthor={Chengjin Xu and Fenglong Su and Jens Lehmann},\nyear={2022},\nurl={https://openreview.net/forum?id=ShtJLsF7cbb}\n}", "github": "", "project": "", "reviewers": "rkMC;6RVN;S2tF;CnWe", "site": "https://openreview.net/forum?id=ShtJLsF7cbb", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;1;1;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "55;57;60;74", "wc_summary_review": "31;33;25;43", "wc_main_review": "313;480;253;488", "wc_review": "399;570;338;605", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.5, 7.433034373659253 ], "wc_summary_review_avg": [ 33.0, 6.48074069840786 ], "wc_main_review_avg": [ 383.5, 102.75334544432118 ], "wc_review_avg": [ 478.0, 112.28757722918418 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], 
"authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16685000599266718120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Bonn;National University of Defense Technology;Fraunhofer Institute for Applied Information Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-bonn.de/;http://www.nudt.edu.cn/;https://www.iais.fraunhofer.de/", "aff_unique_abbr": "UBonn;NUDT;Fraunhofer IAIS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;China" }, { "title": "Near-Optimal Reward-Free Exploration for Linear Mixture MDPs with Plug-in Solver", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6961", "id": "SidzxAb9k30", "poster": "", "openreview": "https://openreview.net/forum?id=SidzxAb9k30", "slides": "https://iclr.cc/virtual/2022/poster/6961", "video": "https://iclr.cc/virtual/2022/poster/6961", "author_site": "Xiaoyu Chen, Jiachen Hu, Lin Yang, Liwei Wang", "tldr": "", "abstract": "Although model-based reinforcement learning (RL) approaches are considered more sample efficient, existing algorithms are usually relying on sophisticated planning algorithm to couple tightly with the model-learning procedure. Hence the learned models may lack the ability of being re-used with more specialized planners. In this paper we address this issue and provide approaches to learn an RL model efficiently without the guidance of a reward signal. In particular, we take a plug-in solver approach, where we focus on learning a model in the exploration phase and demand that \\emph{any planning algorithm} on the learned model can give a near-optimal policy. Specicially, we focus on the linear mixture MDP setting, where the probability transition matrix is a (unknown) convex combination of a set of existing models. We show that, by establishing a novel exploration algorithm, the plug-in approach learns a model by taking $\\tilde{O}(d^2H^3/\\epsilon^2)$ interactions with the environment and \\emph{any} $\\epsilon$-optimal planner on the model gives an $O(\\epsilon)$-optimal policy on the original model. This sample complexity matches lower bounds for non-plug-in approaches and is \\emph{statistically optimal}. 
We achieve this result by leveraging a careful maximum total-variance bound using the Bernstein inequality and properties specific to linear mixture MDPs.", "keywords": "reward-free exploration;model-based reinforcement learning;learning theory", "primary_area": "", "supplementary_material": "", "author": "Xiaoyu Chen;Jiachen Hu;Lin Yang;Liwei Wang", "authorids": "~Xiaoyu_Chen2;~Jiachen_Hu1;~Lin_Yang12;~Liwei_Wang1", "gender": "M;M;M;M", "homepage": ";https://nickhclos.github.io/;http://www.liweiwang-pku.com/;http://www.drlinyang.net", "dblp": "30/4497;239/5040;;166/6264", "google_scholar": "sioumZAAAAAJ;5GavKiQAAAAJ;VZHxoh8AAAAJ;umivlPQAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xiaoyu_Chen2;~Jiachen_Hu1;~Liwei_Wang1;~lin_Yang1", "aff": "Peking University;Peking University;Peking University;University of California, Los Angeles", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;ucla.edu", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2022nearoptimal,\ntitle={Near-Optimal Reward-Free Exploration for Linear Mixture {MDP}s with Plug-in Solver},\nauthor={Xiaoyu Chen and Jiachen Hu and Lin Yang and Liwei Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SidzxAb9k30}\n}", "github": "", "project": "", "reviewers": "ZcuD;kLwC;VReK", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;3;2", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;1;0", "wc_summary_paper": "110;82;73", "wc_summary_review": "37;178;39", "wc_main_review": "875;454;389", "wc_review": "1022;714;501", "wc_reply_reviewers": "585;83;23", "wc_reply_authors": "1089;583;197", "reply_reviewers": "2;1;1", "reply_authors": "2;2;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 88.33333333333333, 15.755069730795297 ], "wc_summary_review_avg": [ 84.66666666666667, 66.001683480213 ], "wc_main_review_avg": [ 572.6666666666666, 215.42258212380818 ], "wc_review_avg": [ 745.6666666666666, 213.87275552429665 ], "wc_reply_reviewers_avg": [ 230.33333333333334, 251.98059890043564 ], "wc_reply_authors_avg": [ 623.0, 365.25424934785724 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16191747451702843669&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=SidzxAb9k30", "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn;ucla.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Peking University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.ucla.edu", "aff_unique_abbr": "Peking U;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "SjGRJ4vSZlP", "title": "Near-Optimal Algorithms for Autonomous Exploration and Multi-Goal Stochastic Shortest 
Path", "track": "main", "status": "Reject", "tldr": "", "abstract": "We revisit the incremental autonomous exploration problem proposed by Lim and Auer (2012). In this setting, the agent aims to learn a set of near-optimal goal-conditioned policies to reach the $L$-controllable states: states that are incrementally reachable from an initial state $s_0$ within $L$ steps in expectation. We introduce three new algorithms with stronger sample complexity bounds than existing ones. Furthermore, we also prove the first lower bound for the autonomous exploration problem. In particular, the lower bound implies that one of our proposed algorithms, Value-Aware Autonomous Exploration, is nearly minimax-optimal when the number of $L$-controllable states grows polynomially with respect to $L$. Key in our algorithm design is a connection between autonomous exploration and multi-goal stochastic shortest path, a new problem that naturally generalizes the classical stochastic shortest path problem. This new problem and its connection to autonomous exploration can be of independent interest.", "keywords": "Reinforcement learning theory;autonomous exploration", "primary_area": "", "supplementary_material": "", "author": "Haoyuan Cai;Tengyu Ma;Simon Shaolei Du", "authorids": "~Haoyuan_Cai1;~Tengyu_Ma1;~Simon_Shaolei_Du1", "gender": "M;M;M", "homepage": ";http://ai.stanford.edu/~tengyuma/;http://simonshaoleidu.com", "dblp": ";54/9061;176/5602", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;i38QlUwAAAAJ;OttawxUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Haoyuan_Cai1;~Tengyu_Ma1;~Simon_Shaolei_Du1", "aff": ";Facebook AI Research;Meta Facebook", "aff_domain": ";fb.com;fb.com", "position": ";Visiting Scientist;Visiting Professor", "bibtex": "@misc{\ncai2022nearoptimal,\ntitle={Near-Optimal Algorithms for Autonomous Exploration and Multi-Goal Stochastic Shortest Path},\nauthor={Haoyuan Cai and Tengyu Ma and Simon Shaolei Du},\nyear={2022},\nurl={https://openreview.net/forum?id=SjGRJ4vSZlP}\n}", "github": "", "project": "", "reviewers": "hPxX;J6YN;CNUB;nNZN", "site": "https://openreview.net/forum?id=SjGRJ4vSZlP", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;3;3;3", "correctness": "3;2;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;0;3;0", "wc_summary_paper": "25;245;52;75", "wc_summary_review": "25;4;93;26", "wc_main_review": "577;609;457;244", "wc_review": "627;858;602;345", "wc_reply_reviewers": "0;59;48;44", "wc_reply_authors": "1391;1633;617;702", "reply_reviewers": "0;1;1;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 99.25, 85.98946156361255 ], "wc_summary_review_avg": [ 37.0, 33.50373113550191 ], "wc_main_review_avg": [ 471.75, 143.18061146677647 ], "wc_review_avg": [ 608.0, 181.7044303257353 ], "wc_reply_reviewers_avg": [ 37.75, 22.47637648732553 ], "wc_reply_authors_avg": [ 1085.75, 435.78972853889064 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=890609703872839864&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": 
"0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Label-Efficient Semantic Segmentation with Diffusion Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6569", "id": "SlxSY2UZQT", "poster": "", "openreview": "https://openreview.net/forum?id=SlxSY2UZQT", "slides": "https://iclr.cc/virtual/2022/poster/6569", "video": "https://iclr.cc/virtual/2022/poster/6569", "author_site": "Dmitry Baranchuk, Andrey Voynov, Ivan Rubachev, Valentin Khrulkov, Artem Babenko", "tldr": "", "abstract": "Denoising diffusion probabilistic models have recently received much research attention since they outperform alternative approaches, such as GANs, and currently provide state-of-the-art generative performance. The superior performance of diffusion models has made them an appealing tool in several applications, including inpainting, super-resolution, and semantic editing. In this paper, we demonstrate that diffusion models can also serve as an instrument for semantic segmentation, especially in the setup when labeled data is scarce. In particular, for several pretrained diffusion models, we investigate the intermediate activations from the networks that perform the Markov step of the reverse diffusion process. We show that these activations effectively capture the semantic information from an input image and appear to be excellent pixel-level representations for the segmentation problem. Based on these observations, we describe a simple segmentation method, which can work even if only a few training images are provided. Our approach significantly outperforms the existing alternatives on several datasets for the same amount of human supervision. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dmitry Baranchuk;Andrey Voynov;Ivan Rubachev;Valentin Khrulkov;Artem Babenko", "authorids": "~Dmitry_Baranchuk2;~Andrey_Voynov1;~Ivan_Rubachev1;~Valentin_Khrulkov1;~Artem_Babenko1", "gender": "M;M;M;M;M", "homepage": ";https://anvoynov.github.io/anvoynov/;https://github.com/puhsu;;", "dblp": "215/3712;255/6107;295/9535;;117/4834", "google_scholar": "NiPmk8oAAAAJ;imBjSgUAAAAJ;;https://scholar.google.ru/citations?user=GS5HTlkAAAAJ;q885d1wAAAAJ", "orcid": "0000-0001-7660-3666;;;0009-0000-6694-5398;0000-0002-1830-8252", "linkedin": ";;;;", "or_profile": "~Dmitry_Baranchuk2;~Andrey_Voynov1;~Ivan_Rubachev1;~Valentin_Khrulkov1;~Artem_Babenko1", "aff": "Meta;Yandex;Higher School of Economics;Skolkovo Institute of Science and Technology;Yandex", "aff_domain": "meta.com;yandex-team.ru;hse.ru;skolkovotech.ru;yandex-team.ru", "position": "Intern;Researcher;MS student;PhD student;Researcher", "bibtex": "@inproceedings{\nbaranchuk2022labelefficient,\ntitle={Label-Efficient Semantic Segmentation with Diffusion Models},\nauthor={Dmitry Baranchuk and Andrey Voynov and Ivan Rubachev and Valentin Khrulkov and Artem Babenko},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SlxSY2UZQT}\n}", "github": "", "project": "", "reviewers": "LFKA;Ck6k;s8q7", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;3;4", "correctness": "4;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "149;82;87", "wc_summary_review": "152;108;34", "wc_main_review": "500;261;334", "wc_review": "801;451;455", "wc_reply_reviewers": "0;234;39", "wc_reply_authors": "840;548;558", "reply_reviewers": "0;2;1", "reply_authors": "2;3;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 106.0, 30.474032661705056 ], "wc_summary_review_avg": [ 98.0, 48.68949236402724 ], "wc_main_review_avg": [ 365.0, 100.00333327777963 ], "wc_review_avg": [ 569.0, 164.0569006981013 ], "wc_reply_reviewers_avg": [ 91.0, 102.36210236215355 ], "wc_reply_authors_avg": [ 648.6666666666666, 135.3546781197048 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": -0.9999999999999998, "gs_citation": 644, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15536080386381166237&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=SlxSY2UZQT", "email": "meta.com;yandex-team.ru;hse.ru;skolkovotech.ru;yandex-team.ru", "author_num": 5, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Meta;Yandex;Higher School of Economics;Skolkovo Institute of Science and Technology", "aff_unique_dep": "Meta Platforms, Inc.;;;", "aff_unique_url": "https://meta.com;https://yandex.com;https://www.hse.ru;https://www.skoltech.ru", "aff_unique_abbr": "Meta;Yandex;HSE;Skoltech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Russian Federation" }, { "id": "Snqhqz4LdK", "title": "Generating Realistic 3D Molecules with an Equivariant 
Conditional Likelihood Model", "track": "main", "status": "Reject", "tldr": "", "abstract": "The number of drug-like molecules that could potentially exist is thought to be above $10^{33}$, precluding exhaustive computational or experimental screens for molecules with desirable pharmaceutical properties. Machine learning models that can propose novel molecules with specific characteristics are powerful new tools to break through the intractability of searching chemical space. Most of these models generate molecular graphs\u2014representations that describe the topology of covalently bonded atoms in a molecule\u2014because the bonding information in the graphs is required for many downstream applications, such as virtual screening and molecular dynamics simulation. These models, however, do not themselves generate 3D coordinates for the atoms within a molecule (which are also required for these applications), and thus they cannot easily incorporate information about 3D geometry when optimizing molecular properties. In this paper, we present GEN3D, a model that concurrently generates molecular graphs and 3D geometries, and is equivariant to rotations, translations, and atom permutations. The model extends a partially generated molecule by computing a conditional distribution over atom types, bonds, and spatial locations, and then sampling from that distribution to update the molecular graph and geometries, one atom at a time. We found that GEN3D proposes molecules that have much higher rates of chemical validity, and much better atom-distance distributions, than those generated with previous models. In addition, we validated our model\u2019s geometric accuracy by forcing it to predict geometries for benchmark molecular graph inputs, and found that it also advances the state of the art on this test. We believe that the advantages that GEN3D provides over other models will enable it to contribute substantially to structure-based drug discovery efforts.", "keywords": "Generative Models;Molecular Graphs;3D Molecules;Drug Discovery;Equivariance", "primary_area": "", "supplementary_material": "", "author": "James P. Roney;Paul Maragakis;Peter Skopp;David E. Shaw", "authorids": "james.roney@deshawresearch.com;~Paul_Maragakis1;peter.skopp@deshawresearch.com;~David_E._Shaw2", "gender": ";;;", "homepage": ";https://www.deshawresearch.com/people_c-b_maragakis.html;;https://www.deshawresearch.com/", "dblp": ";70/375;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";paul-maragakis-53a9048;;", "or_profile": "james.roney@deshawresearch.com;~Paul_Maragakis1;peter.skopp@deshawresearch.com;~David_E._Shaw2", "aff": ";D. E. Shaw Research;;D. E. Shaw Research", "aff_domain": ";deshawresearch.com;;deshawresearch.com", "position": ";Principal Researcher;;Chief Scientist", "bibtex": "@misc{\nroney2022generating,\ntitle={Generating Realistic 3D Molecules with an Equivariant Conditional Likelihood Model},\nauthor={James P. Roney and Paul Maragakis and Peter Skopp and David E. 
Shaw},\nyear={2022},\nurl={https://openreview.net/forum?id=Snqhqz4LdK}\n}", "github": "", "project": "", "reviewers": "biHr;Qzyh;7FsH;uzvM", "site": "https://openreview.net/forum?id=Snqhqz4LdK", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "39;66;92;33", "wc_summary_review": "51;17;42;37", "wc_main_review": "155;292;456;200", "wc_review": "245;375;590;270", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "560;377;1486;672", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 57.5, 23.47871376374779 ], "wc_summary_review_avg": [ 36.75, 12.457427503300993 ], "wc_main_review_avg": [ 275.75, 115.18761869228828 ], "wc_review_avg": [ 370.0, 136.06064824187777 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 773.75, 424.48579187058783 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18146028249062652665&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "D. E. Shaw Research", "aff_unique_dep": "", "aff_unique_url": "https://www.deshawresearch.com", "aff_unique_abbr": "DESRes", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "EigenGame Unloaded: When playing games is better than optimizing", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7159", "id": "So6YAqnqgMj", "poster": "", "openreview": "https://openreview.net/forum?id=So6YAqnqgMj", "slides": "https://iclr.cc/virtual/2022/poster/7159", "video": "https://iclr.cc/virtual/2022/poster/7159", "author_site": "Ian Gemp, Brian McWilliams, Claire Vernade, Thore Graepel", "tldr": "", "abstract": "We build on the recently proposed EigenGame that views eigendecomposition as a competitive game. EigenGame's updates are biased if computed using minibatches of data, which hinders convergence and more sophisticated parallelism in the stochastic setting. In this work, we propose an unbiased stochastic update that is asymptotically equivalent to EigenGame, enjoys greater parallelism allowing computation on datasets of larger sample sizes, and outperforms EigenGame in experiments. We present applications to finding the principal components of massive datasets and performing spectral clustering of graphs. 
We analyze and discuss our proposed update in the context of EigenGame and the shift in perspective from optimization to games.", "keywords": "pca;principal components analysis;nash;games;eigendecomposition;svd;singular value decomposition", "primary_area": "", "supplementary_material": "/attachment/1bb9f80ed132a3de78ef901e29fcfe86fe5533aa.zip", "author": "Ian Gemp;Brian McWilliams;Claire Vernade;Thore Graepel", "authorids": "~Ian_Gemp1;~Brian_McWilliams2;~Claire_Vernade1;~Thore_Graepel1", "gender": "M;M;F;", "homepage": "https://imgemp.github.io/;https://sites.google.com/view/mcbrian/;https://www.cvernade.com;", "dblp": "66/10996;;168/8721;g/ThoreGraepel", "google_scholar": "5vo3MeEAAAAJ;https://scholar.google.ch/citations?user=IS4VSXAAAAAJ;tE2hCaYAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ian_Gemp1;~Brian_McWilliams2;~Claire_Vernade1;~Thore_Graepel1", "aff": "Google DeepMind;Deepmind;Google;", "aff_domain": "google.com;google.com;google.com;", "position": "Research Scientist;Research Scientist;Research scientist;", "bibtex": "@inproceedings{\ngemp2022eigengame,\ntitle={EigenGame Unloaded: When playing games is better than optimizing},\nauthor={Ian Gemp and Brian McWilliams and Claire Vernade and Thore Graepel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=So6YAqnqgMj}\n}", "github": "", "project": "", "reviewers": "T9Hp;ub8e;JMvM;Tfdg", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "3;3;4;3", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "42;31;79;39", "wc_summary_review": "29;26;69;24", "wc_main_review": "162;187;252;309", "wc_review": "233;244;400;372", "wc_reply_reviewers": "0;0;0;41", "wc_reply_authors": "243;171;655;453", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 47.75, 18.48479104561369 ], "wc_summary_review_avg": [ 37.0, 18.560711193270585 ], "wc_main_review_avg": [ 227.5, 57.38684518249805 ], "wc_review_avg": [ 312.25, 74.51300222108891 ], "wc_reply_reviewers_avg": [ 10.25, 17.75352077758099 ], "wc_reply_authors_avg": [ 380.5, 189.34294283125527 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12953810890361190150&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=So6YAqnqgMj", "email": "google.com;google.com;google.com;", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://deepmind.com", "aff_unique_abbr": "DeepMind;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "SoiF5R9z6zQ", "title": "Sparse Fuse Dense: Towards High Quality 3D Detection With Depth Completion", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Current LiDAR-only 3D detection methods inevitably suffer from the sparsity of point 
clouds. Sparse point clouds can confuse detectors as they lack sufficient geometric and semantic information. Many multi-modal methods have been proposed to alleviate this issue, but the different representations of images and point clouds make them difficult to fuse, resulting in suboptimal performance. In this paper, we present a new multi-modal framework named SFD (Sparse Fuse Dense) to tackle these issues. Specifically, we propose to enhance sparse point clouds generated from LiDAR with dense pseudo point clouds generated from depth completion. To make full use of information from different types of point clouds, we design a new RoI feature fusion method 3D-GAF (3D Grid-wise Attentive Fusion), which fuses 3D RoI features from the two types of point clouds in a grid-wise attentive way. In addition, we devise a CPFE (Color Point Feature Extractor) to extract both 3D geometric and 2D semantic features in pseudo point clouds. Moreover, we introduce a multi-modal data augmentation method named SynAugment to utilize all data augmentation approaches tailored to LiDAR-only methods. Our method holds the highest entry on the KITTI 3D object detection leaderboard\u2217, demonstrating the effectiveness of SFD. Code will be made public.", "keywords": "computer vision;3d detection;multi-modal;point clouds", "primary_area": "", "supplementary_material": "", "author": "Xiaopei Wu;Liang Peng;Honghui Yang;Chenxi Huang;Chengqi Deng;Deng Cai;Haifeng Liu;Xiaofei He", "authorids": "~Xiaopei_Wu1;~Liang_Peng3;~Honghui_Yang1;~Chenxi_Huang2;~Chengqi_Deng1;~Deng_Cai4;~Haifeng_Liu2;~Xiaofei_He2", "gender": ";M;;F;M;M;;M", "homepage": ";https://spengliang.github.io/;;http://mrsempress.top;https://github.com/KinglittleQ;http://www.cad.zju.edu.cn/home/dengcai/;;https://person.zju.edu.cn/0007101", "dblp": ";57/3505-1;;88/1185-4;;c/DCai;;h/XiaofeiHe.html", "google_scholar": ";_sJpS34AAAAJ;;e14HvOcAAAAJ;;vzxDyJoAAAAJ;;QLLFowsAAAAJ", "orcid": ";;;;;;;0009-0001-9107-2354", "linkedin": ";;;;;;;", "or_profile": "~Xiaopei_Wu1;~Liang_Peng3;~Honghui_Yang1;~Chenxi_Huang2;~Chengqi_Deng1;~Deng_Cai4;~Haifeng_Liu2;~Xiaofei_He2", "aff": ";Zhejiang University;;Zhejiang University;Zhejiang University;Zhejiang University;;Zhejiang University", "aff_domain": ";zju.edu.cn;;zju.edu.cn;zju.edu.cn;zju.edu.cn;;zju.edu.cn", "position": ";PhD student;;PhD student;MS student;Professor;;Professor", "bibtex": "@misc{\nwu2022sparse,\ntitle={Sparse Fuse Dense: Towards High Quality 3D Detection With Depth Completion},\nauthor={Xiaopei Wu and Liang Peng and Honghui Yang and Chenxi Huang and Chengqi Deng and Deng Cai and Haifeng Liu and Xiaofei He},\nyear={2022},\nurl={https://openreview.net/forum?id=SoiF5R9z6zQ}\n}", "github": "", "project": "", "reviewers": "KDtE;KnGD;Y7uD;YJte;4E4d", "site": "https://openreview.net/forum?id=SoiF5R9z6zQ", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "4;3;3;4;4", "correctness": "3;3;2;4;3", "technical_novelty": "3;3;2;2;3", "empirical_novelty": "4;2;3;2;3", "wc_summary_paper": "119;47;85;76;54", "wc_summary_review": "33;60;71;39;54", "wc_main_review": "578;205;824;226;95", "wc_review": "730;312;980;341;203", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [
76.2, 25.51391777050322 ], "wc_summary_review_avg": [ 51.4, 13.836184445142381 ], "wc_main_review_avg": [ 385.6, 272.67167069572884 ], "wc_review_avg": [ 513.2, 293.68786151286537 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.6666666666666665, "corr_recommendation_correctness": 0.6454972243679027, "gs_citation": 259, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4337309210814836516&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Local Feature Swapping for Generalization in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5956", "id": "Sq0-tgDyHe4", "poster": "", "openreview": "https://openreview.net/forum?id=Sq0-tgDyHe4", "slides": "https://iclr.cc/virtual/2022/poster/5956", "video": "https://iclr.cc/virtual/2022/poster/5956", "author_site": "David Bertoin, Emmanuel Rachelson", "tldr": "", "abstract": "Over the past few years, the acceleration of computing resources and research in Deep Learning has led to significant practical successes in a range of tasks, including in particular in computer vision. Building on these advances, reinforcement learning has also seen a leap forward with the emergence of agents capable of making decisions directly from visual observations. Despite these successes, the over-parametrization of neural architectures leads to memorization of the data used during training and thus to a lack of generalization.\nReinforcement learning agents based on visual inputs also suffer from this phenomenon by erroneously correlating rewards with unrelated visual features such as background elements. To alleviate this problem, we introduce a new regularization layer consisting of channel-consistent local permutations (CLOP) of the feature maps. The proposed permutations induce robustness to spatial correlations and help prevent overfitting behaviors in RL. 
We demonstrate, on the OpenAI Procgen Benchmark, that RL agents trained with the CLOP layer exhibit robustness to visual changes and better generalization properties than agents trained using other state-of-the-art regularization techniques.", "keywords": "Reinforcement learning;Generalization;Regularization", "primary_area": "", "supplementary_material": "/attachment/51ef5aad80459ecd0f060db33b8336fea6db6442.zip", "author": "David Bertoin;Emmanuel Rachelson", "authorids": "~David_Bertoin1;~Emmanuel_Rachelson1", "gender": "M;M", "homepage": "https://davidbert.github.io/;https://personnel.isae-supaero.fr/emmanuel-rachelson", "dblp": ";52/6241", "google_scholar": "oAZZ-o4AAAAJ;https://scholar.google.fr/citations?user=KtG9BSgAAAAJ", "orcid": ";0000-0002-8559-1617", "linkedin": ";emmanuelrachelson/", "or_profile": "~David_Bertoin1;~Emmanuel_Rachelson1", "aff": "Institut Sup\u00e9rieur de l'A\u00e9ronautique et de l'Espace;Institut Sup\u00e9rieur de l'A\u00e9ronautique et de l'Espace", "aff_domain": "isae-supaero.fr;isae-supaero.fr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nbertoin2022local,\ntitle={Local Feature Swapping for Generalization in Reinforcement Learning},\nauthor={David Bertoin and Emmanuel Rachelson},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Sq0-tgDyHe4}\n}", "github": "", "project": "", "reviewers": "8Mbh;VitZ;JpKh;2gnP;rG3k", "pdf_size": 0, "recommendation": "6;8;8;8;8", "confidence": "4;4;4;4;3", "correctness": "3;4;3;3;4", "technical_novelty": "2;3;4;2;3", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "30;71;66;53;42", "wc_summary_review": "50;155;55;134;105", "wc_main_review": "230;475;292;865;338", "wc_review": "310;701;413;1052;485", "wc_reply_reviewers": "513;77;169;371;0", "wc_reply_authors": "1670;1042;879;972;908", "reply_reviewers": "2;1;2;2;0", "reply_authors": "3;2;2;2;2", "recommendation_avg": [ 7.6, 0.7999999999999999 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 52.4, 15.107613974417005 ], "wc_summary_review_avg": [ 99.8, 41.787079342782505 ], "wc_main_review_avg": [ 440.0, 227.26988361857363 ], "wc_review_avg": [ 592.2, 263.24695629769394 ], "wc_reply_reviewers_avg": [ 226.0, 189.69449122207 ], "wc_reply_authors_avg": [ 1094.2, 293.32057548013915 ], "reply_reviewers_avg": [ 1.4, 0.8 ], "reply_authors_avg": [ 2.2, 0.39999999999999997 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.2500000000000001, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17415532318747556108&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=Sq0-tgDyHe4", "email": "isae-supaero.fr;isae-supaero.fr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Institut Sup\u00e9rieur de l'A\u00e9ronautique et de l'Espace", "aff_unique_dep": "", "aff_unique_url": "https://www.isae-supaero.fr", "aff_unique_abbr": "ISAE-SUPAERO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "Sqv6rs_TRV", "title": "WHAT TO DO IF SPARSE REPRESENTATION LEARNING FAILS UNEXPECTEDLY?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning physical equations from 
data is essential for scientific discovery and engineering modeling. However, most of the existing methods rely on two rules: (1) learn a sparse representation to fit data and (2) check if the loss objective function satisfies error thresholds. This paper illustrates that such conditions are far from sufficient. Specifically, we show that sparse non-physical approximations exist with excellent fitting accuracy, but fail to adequately model the situation. To fundamentally resolve the data-fitting problem, we propose a physical neural network (PNN) utilizing \u201cRange, Inertia, Symmetry, and Extrapolation\u201d (RISE) constraints. RISE is based on a complete analysis of the generalizability of data properties for physical systems. The first three techniques focus on the definition of physics in space and time. The last technique, extrapolation, is novel: it is based on active learning without an inquiry, using cross-model validation. We validate the proposed PNN-RISE method on a synthetic dataset, a power system dataset, and a mass-damper system dataset. Numerical results show the universal capability of the PNN-RISE approach to quickly identify the hidden physical models without local optima, opening the door for the fast and highly accurate discovery of physical laws or systems with external loads.", "keywords": "Physical neural network;extrapolation", "primary_area": "", "supplementary_material": "", "author": "Jingyi Yuan;Haoran Li;Erik Blasch;Yang Weng", "authorids": "~Jingyi_Yuan1;~Haoran_Li6;~Erik_Blasch1;~Yang_Weng1", "gender": "F;M;M;", "homepage": ";;https://sites.google.com/site/erikblasch/;", "dblp": "248/7785;;01/4960;", "google_scholar": "1k_2PUwAAAAJ;https://scholar.google.com/citations?hl=en;Po7s1TsAAAAJ;", "orcid": "0000-0002-2850-1582;;0000-0001-6894-6108;", "linkedin": "jingyi-yuan-7a1757171/;;erik-blasch-76a0429/;", "or_profile": "~Jingyi_Yuan1;~Haoran_Li6;~Erik_Blasch1;~Yang_Weng1", "aff": "Arizona State University;Arizona State University;Air Force Research Laboratory;", "aff_domain": "asu.edu;asu.edu;us.af.mil;", "position": "PhD student;PhD student;Principal Researcher;", "bibtex": "@misc{\nyuan2022what,\ntitle={{WHAT} {TO} {DO} {IF} {SPARSE} {REPRESENTATION} {LEARNING} {FAILS} {UNEXPECTEDLY}?},\nauthor={Jingyi Yuan and Haoran Li and Erik Blasch and Yang Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=Sqv6rs_TRV}\n}", "github": "", "project": "", "reviewers": "7VKA;VZaq;XyFH;8Azy;Lobz", "site": "https://openreview.net/forum?id=Sqv6rs_TRV", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;3;3;3;4", "correctness": "2;2;2;2;4", "technical_novelty": "2;3;3;2;3", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "13;197;213;59;74", "wc_summary_review": "26;44;82;22;60", "wc_main_review": "551;1344;924;225;116", "wc_review": "590;1585;1219;306;250", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 2.4, 0.8 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 111.2, 79.3433046954814 ], "wc_summary_review_avg": [ 46.8, 22.22071105972984 ], "wc_main_review_avg": [ 632.0, 454.02070437371026 ], "wc_review_avg": [ 790.0, 525.722740615241 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [
6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3227486121839514, "corr_recommendation_correctness": 0.7905694150420948, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:enHg1i510uEJ:scholar.google.com/&scioq=WHAT+TO+DO+IF+SPARSE+REPRESENTATION+LEARNING+FAILS+UNEXPECTEDLY%3F&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Arizona State University;Air Force Research Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.afrl.af.mil/", "aff_unique_abbr": "ASU;AFRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Srb756cmzyw", "title": "Learning Better Visual Representations for Weakly-Supervised Object Detection Using Natural Language Supervision", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a framework to better leverage natural language supervision for a specific downstream task, namely weakly-supervised object detection (WSOD). Our framework employs a multimodal pre-training step, during which region-level groundings are learned in a weakly-supervised manner and later maintained for the downstream task. Further, to appropriately use the noisy supervision that captions contain for object detection, we use coherence analysis and other cross-modal alignment metrics to weight image-caption pairs during WSOD training. Results indicate that WSOD can better leverage representation learning by (1) learning a region-based alignment between image regions and caption tokens, (2) enforcing that the visual backbone does not forget this alignment during the downstream WSOD task, and (3) suppressing instances that have weak image-caption correspondence during the WSOD training stage.", "keywords": "weakly-supervised object detection;vision and language;representation learning", "primary_area": "", "supplementary_material": "", "author": "Mesut Erhan Unal;Adriana Kovashka", "authorids": "~Mesut_Erhan_Unal1;~Adriana_Kovashka1", "gender": "M;F", "homepage": "https://erhan.in;http://people.cs.pitt.edu/~kovashka/", "dblp": ";51/8652.html", "google_scholar": "kMVE1esAAAAJ;Dl949GoAAAAJ", "orcid": ";", "linkedin": "mesuterhanunal;", "or_profile": "~Mesut_Erhan_Unal1;~Adriana_Kovashka1", "aff": "University of Pittsburgh;University of Pittsburgh", "aff_domain": "pitt.edu;pitt.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nunal2022learning,\ntitle={Learning Better Visual Representations for Weakly-Supervised Object Detection Using Natural Language Supervision},\nauthor={Mesut Erhan Unal and Adriana Kovashka},\nyear={2022},\nurl={https://openreview.net/forum?id=Srb756cmzyw}\n}", "github": "", "project": "", "reviewers": "4wGW;5iWG;n1hZ;YRuX", "site": "https://openreview.net/forum?id=Srb756cmzyw", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "3;4;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "126;48;93;97", "wc_summary_review": "47;94;35;18", "wc_main_review": "389;16;134;236", "wc_review": "562;158;262;351", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75,
0.4330127018922193 ], "wc_summary_paper_avg": [ 91.0, 27.90161285660741 ], "wc_summary_review_avg": [ 48.5, 28.217902119044926 ], "wc_main_review_avg": [ 193.75, 136.99703463944027 ], "wc_review_avg": [ 333.25, 148.68654108560062 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zEw-UgeJOq8J:scholar.google.com/&scioq=Learning+Better+Visual+Representations+for+Weakly-Supervised+Object+Detection+Using+Natural+Language+Supervision&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pittsburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.pitt.edu", "aff_unique_abbr": "Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neural graphical modelling in continuous-time: consistency guarantees and algorithms", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6517", "id": "SsHBkfeRF9L", "poster": "", "openreview": "https://openreview.net/forum?id=SsHBkfeRF9L", "slides": "https://iclr.cc/virtual/2022/poster/6517", "video": "https://iclr.cc/virtual/2022/poster/6517", "author_site": "Alexis Bellot, Kim Branson, Mihaela van der Schaar", "tldr": "", "abstract": "The discovery of structure from time series data is a key problem in fields of study working with complex systems. Most identifiability results and learning algorithms assume the underlying dynamics to be discrete in time. Comparatively few, in contrast, explicitly define dependencies in infinitesimal intervals of time, independently of the scale of observation and of the regularity of sampling. In this paper, we consider score-based structure learning for the study of dynamical systems. We prove that for vector fields parameterized in a large class of neural networks, least squares optimization with adaptive regularization schemes consistently recovers directed graphs of local independencies in systems of stochastic differential equations. 
Using this insight, we propose a score-based learning algorithm based on penalized Neural Ordinary Differential Equations (modelling the mean process) that we show to be applicable to the general setting of irregularly-sampled multivariate time series and to outperform the state of the art across a range of dynamical systems.", "keywords": "Dynamical systems;graphical modelling;structure learning", "primary_area": "", "supplementary_material": "", "author": "Alexis Bellot;Kim Branson;Mihaela van der Schaar", "authorids": "~Alexis_Bellot1;~Kim_Branson1;~Mihaela_van_der_Schaar2", "gender": "M;M;F", "homepage": ";https://gsk.ai;https://www.vanderschaar-lab.com", "dblp": "217/4339;45/2850;", "google_scholar": ";;DZ3S--MAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Alexis_Bellot1;~Kim_Branson1;~Mihaela_van_der_Schaar2", "aff": "Columbia University;GSK plc;University of California, Los Angeles", "aff_domain": "columbia.edu;gsk.com;ucla.edu", "position": "Postdoc;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nbellot2022neural,\ntitle={Neural graphical modelling in continuous-time: consistency guarantees and algorithms},\nauthor={Alexis Bellot and Kim Branson and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SsHBkfeRF9L}\n}", "github": "", "project": "", "reviewers": "4Lyp;TPnG;xAV7", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;4;3", "correctness": "4;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "16;65;53", "wc_summary_review": "18;22;21", "wc_main_review": "52;308;146", "wc_review": "86;395;220", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 44.666666666666664, 20.853989759489405 ], "wc_summary_review_avg": [ 20.333333333333332, 1.699673171197595 ], "wc_main_review_avg": [ 168.66666666666666, 105.73341740223644 ], "wc_review_avg": [ 233.66666666666666, 126.518334200577 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3383946799962251947&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=SsHBkfeRF9L", "email": "columbia.edu;gsk.com;ucla.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Columbia University;GlaxoSmithKline plc;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.columbia.edu;https://www.gsk.com;https://www.ucla.edu", "aff_unique_abbr": "Columbia;GSK;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "On the Uncomputability of Partition Functions in Energy-Based Sequence Models", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7146", "id": "SsPCtEY6yCl", "poster": "", 
"openreview": "https://openreview.net/forum?id=SsPCtEY6yCl", "slides": "https://iclr.cc/virtual/2022/poster/7146", "video": "https://iclr.cc/virtual/2022/poster/7146", "author_site": "Chu-Cheng Lin, Arya McCarthy", "tldr": "", "abstract": "In this paper, we argue that energy-based sequence models backed by expressive parametric families can result in uncomputable and inapproximable partition functions. Among other things, this makes model selection--and therefore learning model parameters--not only difficult, but generally _undecidable_. The reason is that there are no good deterministic or randomized estimates of partition functions. Specifically, we exhibit a pathological example where under common assumptions, _no_ useful importance sampling estimates of the partition function can guarantee to have variance bounded below a rational number. As alternatives, we consider sequence model families whose partition functions are computable (if they exist), but at the cost of reduced expressiveness. Our theoretical results suggest that statistical procedures with asymptotic guarantees and sheer (but finite) amounts of compute are not the only things that make sequence modeling work; computability concerns must not be neglected as we consider more expressive model parametrizations.", "keywords": "energy-based models;turing completeness;model capacity;sequence models;autoregressive models;partition function;parameter estimation;model selection", "primary_area": "", "supplementary_material": "", "author": "Chu-Cheng Lin;Arya D. McCarthy", "authorids": "~Chu-Cheng_Lin1;~Arya_D._McCarthy1", "gender": ";M", "homepage": ";https://cs.jhu.edu/~arya", "dblp": "64/7292;219/5712", "google_scholar": ";erysFsoAAAAJ", "orcid": ";0000-0001-9440-8792", "linkedin": ";", "or_profile": "~Chu-Cheng_Lin1;~Arya_D._McCarthy1", "aff": "Department of Computer Science, Whiting School of Engineering;Johns Hopkins University", "aff_domain": "cs.jhu.edu;jhu.edu", "position": "PhD student;PhD student", "bibtex": "@inproceedings{\nlin2022on,\ntitle={On the Uncomputability of Partition Functions in Energy-Based Sequence Models},\nauthor={Chu-Cheng Lin and Arya D. 
McCarthy},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SsPCtEY6yCl}\n}", "github": "", "project": "", "reviewers": "K6DS;EwJC;EUof;NAcm", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;3;4", "correctness": "4;4;4;3", "technical_novelty": "2;3;4;3", "empirical_novelty": "0;0;1;2", "wc_summary_paper": "357;140;57;91", "wc_summary_review": "66;91;46;35", "wc_main_review": "215;108;333;644", "wc_review": "638;339;436;770", "wc_reply_reviewers": "38;8;52;15", "wc_reply_authors": "96;323;527;1443", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 161.25, 116.80405600834246 ], "wc_summary_review_avg": [ 59.5, 21.313141485947114 ], "wc_main_review_avg": [ 325.0, 200.63274907153118 ], "wc_review_avg": [ 545.75, 168.51465069838883 ], "wc_reply_reviewers_avg": [ 28.25, 17.640507362318125 ], "wc_reply_authors_avg": [ 597.25, 511.5400155413064 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15107243672587606516&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=SsPCtEY6yCl", "email": "cs.jhu.edu;jhu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "0", "aff_campus_unique": "Baltimore;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Deep AutoAugment", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6657", "id": "St-53J9ZARf", "poster": "", "openreview": "https://openreview.net/forum?id=St-53J9ZARf", "slides": "https://iclr.cc/virtual/2022/poster/6657", "video": "https://iclr.cc/virtual/2022/poster/6657", "author_site": "Yu Zheng, Zhi Zhang, Shen Yan, Mi Zhang", "tldr": "", "abstract": "While recent automated data augmentation methods lead to state-of-the-art results, their design spaces and the derived data augmentation strategies still incorporate strong human priors. In this work, instead of fixing a set of hand-picked default augmentations alongside the searched data augmentations, we propose a fully automated approach for data augmentation search named Deep AutoAugment (DeepAA). DeepAA progressively builds a multi-layer data augmentation pipeline from scratch by stacking augmentation layers one at a time until reaching convergence. For each augmentation layer, the policy is optimized to maximize the cosine similarity between the gradients of the original and augmented data along the direction with low variance. Our experiments show that even without default augmentations, we can learn an augmentation policy that achieves strong performance comparable with that of previous works. Extensive ablation studies show that the regularized gradient matching is an effective search method for data augmentation policies.
Our code is available at: https://github.com/MSU-MLSys-Lab/DeepAA .", "keywords": "automated machine learning;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Yu Zheng;Zhi Zhang;Shen Yan;Mi Zhang", "authorids": "~Yu_Zheng3;~Zhi_Zhang4;~Shen_Yan2;~Mi_Zhang1", "gender": "M;M;M;M", "homepage": ";https://zhreshold.github.io;https://shenyann.github.io/;https://mi-zhang.github.io/", "dblp": ";;https://dblp.uni-trier.de/pers/hd/y/Yan:Shen;84/2519-2.html", "google_scholar": ";nZr0oXQAAAAJ;-shYRd8AAAAJ;https://scholar.google.com.tw/citations?user=r3A90uAAAAAJ", "orcid": "0000-0001-9659-0233;0000-0003-0249-1678;;", "linkedin": ";;shawnyanyuv/;mizhang/", "or_profile": "~Yu_Zheng3;~Zhi_Zhang4;~Shen_Yan2;~Mi_Zhang1", "aff": "Michigan State University;ByteDance Inc.;Michigan State University;Michigan State University", "aff_domain": "msu.edu;bytedance.com;msu.edu;msu.edu", "position": "PhD student;Researcher;PhD student;Associate Professor", "bibtex": "@inproceedings{\nzheng2022deep,\ntitle={Deep AutoAugment},\nauthor={Yu Zheng and Zhi Zhang and Shen Yan and Mi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=St-53J9ZARf}\n}", "github": "", "project": "", "reviewers": "BGUU;3j37;9qhP;VFdc", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;5;4;3", "correctness": "2;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "71;64;76;62", "wc_summary_review": "81;117;45;81", "wc_main_review": "320;524;216;156", "wc_review": "472;705;337;299", "wc_reply_reviewers": "153;41;34;31", "wc_reply_authors": "1381;1075;902;477", "reply_reviewers": "3;1;1;1", "reply_authors": "9;5;3;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.25, 5.584576975922169 ], "wc_summary_review_avg": [ 81.0, 25.45584412271571 ], "wc_main_review_avg": [ 304.0, 139.91425945914162 ], "wc_review_avg": [ 453.25, 158.93139243082217 ], "wc_reply_reviewers_avg": [ 64.75, 51.080206538345166 ], "wc_reply_authors_avg": [ 958.75, 326.769318480178 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 4.5, 2.958039891549808 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=St-53J9ZARf", "email": "msu.edu;bytedance.com;msu.edu;msu.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Michigan State University;ByteDance", "aff_unique_dep": ";", "aff_unique_url": "https://www.msu.edu;https://www.bytedance.com", "aff_unique_abbr": "MSU;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Consistent Counterfactuals for Deep Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6992", "id": "St6eyiTEHnG", "poster": "", "openreview": "https://openreview.net/forum?id=St6eyiTEHnG", "slides": "https://iclr.cc/virtual/2022/poster/6992", "video": "https://iclr.cc/virtual/2022/poster/6992", "author_site": "Emily Black, Zifan Wang, Matt Fredrikson", "tldr": "", "abstract": 
"Counterfactual examples are one of the most commonly-cited methods for explaining the predictions of machine learning models in key areas such as finance and medical diagnosis. Counterfactuals are often discussed under the assumption that the model on which they will be used is static, but in deployment models may be periodically retrained or fine-tuned. This paper studies the consistency of model prediction on counterfactual examples in deep networks under small changes to initial training conditions, such as weight initialization and leave-one-out variations in data, as often occurs during model deployment. We demonstrate experimentally that counterfactual examples for deep models are often inconsistent across such small changes, and that increasing the cost of the counterfactual, a stability-enhancing mitigation suggested by prior work in the context of simpler models, is not a reliable heuristic in deep networks. Rather, our analysis shows that a model's Lipschitz continuity around the counterfactual, along with confidence of its prediction, is key to its consistency across related models. To this end, we propose Stable Neighbor Search as a way to generate more consistent counterfactual explanations, and illustrate the effectiveness of this approach on several benchmark datasets.", "keywords": "deep models;deep networks;explainability;counterfactual explanations;consistency;consistent predictions;model duplicity;random initialization", "primary_area": "", "supplementary_material": "/attachment/1745d806bebfb29d2737f9a9150a976fafab2f5b.zip", "author": "Emily Black;Zifan Wang;Matt Fredrikson", "authorids": "~Emily_Black1;~Zifan_Wang1;~Matt_Fredrikson1", "gender": "F;M;M", "homepage": "https://emblack.github.io/;https://www.zifanw.net;https://cs.cmu.edu/~mfredrik", "dblp": "197/2977;;38/2612", "google_scholar": "dBkGY6gAAAAJ;HJOP3wMAAAAJ;https://scholar.google.com.tw/citations?user=tMYCvLAAAAAJ", "orcid": ";;", "linkedin": ";zifan-wang-sail/;", "or_profile": "~Emily_Black1;~Zifan_Wang1;~Matt_Fredrikson1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nblack2022consistent,\ntitle={Consistent Counterfactuals for Deep Models},\nauthor={Emily Black and Zifan Wang and Matt Fredrikson},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=St6eyiTEHnG}\n}", "github": "", "project": "", "reviewers": "fWXE;76Ar;gXiV;ZP6K", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;4;4;3", "correctness": "2;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "31;89;51;84", "wc_summary_review": "50;63;18;65", "wc_main_review": "221;550;115;543", "wc_review": "302;702;184;692", "wc_reply_reviewers": "336;117;225;97", "wc_reply_authors": "1605;789;502;1325", "reply_reviewers": "1;2;1;1", "reply_authors": "4;2;2;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.75, 23.889066536807167 ], "wc_summary_review_avg": [ 49.0, 18.801595676963167 ], "wc_main_review_avg": [ 357.25, 192.9408912076442 ], "wc_review_avg": [ 470.0, 230.82894099310857 ], "wc_reply_reviewers_avg": [ 193.75, 95.47610957721308 ], 
"wc_reply_authors_avg": [ 1055.25, 433.5795053966458 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16891585689799007011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=St6eyiTEHnG", "email": "cmu.edu;cmu.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "SuKTLF9stD", "title": "Data-Efficient Augmentation for Training Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation is essential to achieve state-of-the-art performance in many deep learning applications. However, modern data augmentation techniques become computationally prohibitive for large datasets. To address this, we propose a rigorous technique to select subsets of data points that when augmented, closely capture the training dynamics of full data augmentation. We first show that data augmentation, modeled as additive perturbations, speeds up learning by enlarging the smaller singular values of the network Jacobian. Then, we propose a framework to iteratively extract small subsets of training data that when augmented, closely capture the alignment of the fully augmented Jacobian with label/residual vector. We prove that stochastic gradient descent applied to augmented subsets found by our approach have similar training dynamics to that of fully augmented data. 
Our experiments demonstrate that our method outperforms the state-of-the-art max-loss strategy by 7.7% on CIFAR10 while achieving 6.3x speedup, and by 4.7% on SVHN while achieving 2.2x speedup, using 10% and 30% subsets, respectively.", "keywords": "Data Augmentation;Neural Network;Coresets", "primary_area": "", "supplementary_material": "/attachment/63f5b86fdd62b4d5233b5bd26a26ec10dda1bfc4.zip", "author": "Tian Yu Liu;Baharan Mirzasoleiman", "authorids": "~Tian_Yu_Liu2;~Baharan_Mirzasoleiman4", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Tian_Yu_Liu2;~Baharan_Mirzasoleiman4", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nliu2022dataefficient,\ntitle={Data-Efficient Augmentation for Training Neural Networks},\nauthor={Tian Yu Liu and Baharan Mirzasoleiman},\nyear={2022},\nurl={https://openreview.net/forum?id=SuKTLF9stD}\n}", "github": "", "project": "", "reviewers": "Q8Ag;fQLH;xyXQ;1cJe", "site": "https://openreview.net/forum?id=SuKTLF9stD", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "46;82;94;86", "wc_summary_review": "41;55;85;108", "wc_main_review": "142;902;261;290", "wc_review": "229;1039;440;484", "wc_reply_reviewers": "93;297;278;88", "wc_reply_authors": "424;665;734;303", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 77.0, 18.411952639521967 ], "wc_summary_review_avg": [ 72.25, 26.05163142684158 ], "wc_main_review_avg": [ 398.75, 295.79669960971506 ], "wc_review_avg": [ 548.0, 299.4169333888783 ], "wc_reply_reviewers_avg": [ 189.0, 98.74462010661644 ], "wc_reply_authors_avg": [ 531.5, 175.0692720039699 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16120463592327015292&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10 }, { "id": "SvFQBlffMB", "title": "Pseudo Knowledge Distillation: Towards Learning Optimal Instance-specific Label Smoothing Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Knowledge Distillation (KD) is an algorithm that transfers the knowledge of a trained, typically larger, neural network into another model under training. Although a complete understanding of KD is elusive, a growing body of work has shown that the success of both KD and label smoothing comes from a similar regularization effect of soft targets. In this work, we propose an instance-specific label smoothing technique, Pseudo-KD, which is efficiently learnt from the data. We devise a two-stage optimization problem that leads to a deterministic and interpretable solution for the optimal label smoothing. We show that Pseudo-KD can be equivalent to an efficient variant of self-distillation techniques, without the need to store the parameters or the output of a trained model.
Finally, we conduct experiments on multiple image classification (CIFAR-10 and CIFAR-100) and natural language understanding datasets (the GLUE benchmark) across various neural network architectures and demonstrate that our method is competitive against strong baselines.", "keywords": "Knowledge Distillation;Label Smoothing;Supervised Learning;Image Classification;Natural Language Understanding", "primary_area": "", "supplementary_material": "/attachment/245f99a22c6fc3e2291dc19ac21d806ef0076647.zip", "author": "Peng Lu;Ahmad Rashid;Ivan Kobyzev;Mehdi Rezagholizadeh;Philippe Langlais", "authorids": "~Peng_Lu6;~Ahmad_Rashid1;~Ivan_Kobyzev1;~Mehdi_Rezagholizadeh1;felipe@iro.umontreal.ca", "gender": "M;;;M;", "homepage": ";https://ahmadrash.github.io/;;;", "dblp": ";239/5145;;;", "google_scholar": "c4xAa8gAAAAJ;https://scholar.google.ca/citations?user=YPbaQkQAAAAJ;;MvXlF6kAAAAJ;", "orcid": ";;;;", "linkedin": "peng-lu-211b7617a/;;;;", "or_profile": "~Peng_Lu6;~Ahmad_Rashid1;~Ivan_Kobyzev1;~Mehdi_Rezagholizadeh1;felipe@iro.umontreal.ca", "aff": "University of Montreal;University of Waterloo;;Huawei Technologies Ltd.;", "aff_domain": "umontreal.ca;uwaterloo.ca;;huawei.com;", "position": "PhD student;PhD student;;Principal Researcher;", "bibtex": "@misc{\nlu2022pseudo,\ntitle={Pseudo Knowledge Distillation: Towards Learning Optimal Instance-specific Label Smoothing Regularization},\nauthor={Peng Lu and Ahmad Rashid and Ivan Kobyzev and Mehdi Rezagholizadeh and Philippe Langlais},\nyear={2022},\nurl={https://openreview.net/forum?id=SvFQBlffMB}\n}", "github": "", "project": "", "reviewers": "JBRd;bwuZ;CUiW;piYX", "site": "https://openreview.net/forum?id=SvFQBlffMB", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;4;2;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "55;90;44;146", "wc_summary_review": "21;24;24;104", "wc_main_review": "259;199;163;156", "wc_review": "335;313;231;406", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "705;535;366;46", "reply_reviewers": "0;0;0;0", "reply_authors": "3;3;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.75, 39.75157229594824 ], "wc_summary_review_avg": [ 43.25, 35.0954056822257 ], "wc_main_review_avg": [ 194.25, 40.78832553562355 ], "wc_review_avg": [ 321.25, 62.419448091119804 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 413.0, 243.43685012750225 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2aQ4Fg6zX1UJ:scholar.google.com/&scioq=Pseudo+Knowledge+Distillation:+Towards+Learning+Optimal+Instance-specific+Label+Smoothing+Regularization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Montreal;University of Waterloo;Huawei", "aff_unique_dep": ";;Huawei Technologies", "aff_unique_url": "https://www.umontreal.ca;https://uwaterloo.ca;https://www.huawei.com", "aff_unique_abbr": "UM;UW;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1",
"aff_country_unique": "Canada;China" }, { "title": "On the Role of Neural Collapse in Transfer Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6574", "id": "SwIp410B6aQ", "poster": "", "openreview": "https://openreview.net/forum?id=SwIp410B6aQ", "slides": "https://iclr.cc/virtual/2022/poster/6574", "video": "https://iclr.cc/virtual/2022/poster/6574", "author_site": "Tomer Galanti, Andras Gyorgy, Marcus Hutter", "tldr": "", "abstract": "We study the ability of foundation models to learn representations for classification that are transferable to new, unseen classes. Recent results in the literature show that representations learned by a single classifier over many classes are competitive on few-shot learning problems with representations learned by special-purpose algorithms designed for such problems. In this paper, we provide an explanation for this behavior based on the recently observed phenomenon that the features learned by overparameterized classification networks show an interesting clustering property, called neural collapse. We demonstrate both theoretically and empirically that neural collapse generalizes to new samples from the training classes, and -- more importantly -- to new classes as well, allowing foundation models to provide feature maps that work well in transfer learning and, specifically, in the few-shot setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tomer Galanti;Andr\u00e1s Gy\u00f6rgy;Marcus Hutter", "authorids": "~Tomer_Galanti1;~Andr\u00e1s_Gy\u00f6rgy2;~Marcus_Hutter1", "gender": "M;;", "homepage": "https://tomergalanti.github.io;http://www.hutter1.net/;http://www.cs.bme.hu/~gya", "dblp": "198/1490;h/MarcusHutter;72/251-1", "google_scholar": ";https://scholar.google.com.tw/citations?user=7hmCntEAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-3263-4097;0000-0003-0586-4337", "linkedin": "tomer-galanti-5880b1104/;hutter1/;", "or_profile": "~Tomer_Galanti1;~Marcus_Hutter1;~Andras_Gyorgy1", "aff": "Massachusetts Institute of Technology;Australian National University;Google DeepMind", "aff_domain": "mit.edu;anu.edu.au;deepmind.com", "position": "Postdoc;Full Professor;Research Scientist", "bibtex": "@inproceedings{\ngalanti2022on,\ntitle={On the Role of Neural Collapse in Transfer Learning},\nauthor={Tomer Galanti and Andr{\\'a}s Gy{\\\"o}rgy and Marcus Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=SwIp410B6aQ}\n}", "github": "", "project": "", "reviewers": "QSqm;dW2d;XpXZ;BPMK;e2QG", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "4;3;3;3;3", "correctness": "4;3;3;4;3", "technical_novelty": "2;1;3;3;3", "empirical_novelty": "3;3;2;3;3", "wc_summary_paper": "164;64;13;88;318", "wc_summary_review": "82;52;23;36;42", "wc_main_review": "320;227;203;305;409", "wc_review": "566;343;239;429;769", "wc_reply_reviewers": "0;0;355;0;79", "wc_reply_authors": "751;623;2070;711;372", "reply_reviewers": "0;0;1;0;1", "reply_authors": "1;1;3;1;1", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 129.4, 106.11993215225874 ], "wc_summary_review_avg": [ 47.0, 19.859506539690255 ], "wc_main_review_avg": [ 292.8, 73.191256308387 ], "wc_review_avg": [ 469.2, 
184.27414360132025 ], "wc_reply_reviewers_avg": [ 86.8, 137.54621041671777 ], "wc_reply_authors_avg": [ 905.4, 597.0241536152453 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.8 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.2500000000000001, "corr_recommendation_correctness": -0.408248290463863, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12543395241785293470&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=SwIp410B6aQ", "email": "mit.edu;anu.edu.au;deepmind.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Australian National University;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://web.mit.edu;https://www.anu.edu.au;https://deepmind.com", "aff_unique_abbr": "MIT;ANU;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;Australia;United Kingdom" }, { "id": "T-uEidE-Xpv", "title": "Contrastive Mutual Information Maximization for Binary Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network binarization accelerates deep models by quantizing their weights and activations to 1-bit. However, there is still a huge performance gap between Binary Neural Networks (BNNs) and their full-precision counterparts. As the quantization error caused by weight binarization has been reduced in earlier works, activation binarization becomes the major obstacle to further improving accuracy. Although studies of full-precision networks have highlighted the distributions of activations, few works study the distribution of binary activations in BNNs. In this paper, we introduce mutual information as the metric to measure the information shared by the binary and the latent full-precision activations. We then maximize the mutual information by establishing a contrastive learning framework while training BNNs. Specifically, the representation ability of BNNs is greatly strengthened by pulling together positive pairs of binary and full-precision activations from the same input samples, and pushing apart negative pairs from different samples (the number of negative pairs can be exponentially large). This benefits downstream tasks, not only classification but also segmentation, depth estimation, etc. 
The experimental results show that our method can be implemented as a pile-up module on top of existing state-of-the-art binarization methods and remarkably improves their performance on CIFAR-10/100 and ImageNet, while also generalizing well on NYUD-v2.", "keywords": "network compression;network binarization;contrastive learning", "primary_area": "", "supplementary_material": "", "author": "Yuzhang Shang;Dan Xu;Ziliang Zong;Liqiang Nie;Yan Yan", "authorids": "~Yuzhang_Shang1;~Dan_Xu2;~Ziliang_Zong1;~Liqiang_Nie2;~Yan_Yan6", "gender": "M;;M;M;M", "homepage": "https://42shawn.github.io/;https://userweb.cs.txstate.edu/~zz11/;https://liqiangnie.github.io/index.html;;https://www.danxurgb.net", "dblp": "300/8483;;92/8277;13/3953-2;16/3823-2.html", "google_scholar": "6ZPL5E0AAAAJ;;yywVMhUAAAAJ;;OuSPv-AAAAAJ", "orcid": ";;0000-0003-1476-0273;;0000-0003-0136-9603", "linkedin": ";;;;", "or_profile": "~Yuzhang_Shang1;~Ziliang_Zong1;~Liqiang_Nie2;~Yan_Yan6;~Dan_Xu4", "aff": "Illinois Institute of Technology;Texas State University;Shandong University;;VGG, University of Oxford", "aff_domain": "iit.edu;txstate.edu;sdu.edu.cn;;ox.ac.uk", "position": "PhD student;Associate Professor;Full Professor;;Postdoc", "bibtex": "@misc{\nshang2022contrastive,\ntitle={Contrastive Mutual Information Maximization for Binary Neural Networks},\nauthor={Yuzhang Shang and Dan Xu and Ziliang Zong and Liqiang Nie and Yan Yan},\nyear={2022},\nurl={https://openreview.net/forum?id=T-uEidE-Xpv}\n}", "github": "", "project": "", "reviewers": "vKaa;LrbF;9r7Z;iEui", "site": "https://openreview.net/forum?id=T-uEidE-Xpv", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "65;63;67;75", "wc_summary_review": "26;21;33;28", "wc_main_review": "331;630;118;161", "wc_review": "422;714;218;264", "wc_reply_reviewers": "0;0;61;0", "wc_reply_authors": "83;227;41;225", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.5, 4.55521678957215 ], "wc_summary_review_avg": [ 27.0, 4.301162633521313 ], "wc_main_review_avg": [ 310.0, 201.18772328350454 ], "wc_review_avg": [ 404.5, 194.0483187250021 ], "wc_reply_reviewers_avg": [ 15.25, 26.413774815425377 ], "wc_reply_authors_avg": [ 144.0, 83.33666660000267 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eLJ2AWC5pkkJ:scholar.google.com/&scioq=Contrastive+Mutual+Information+Maximization+for+Binary+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Illinois Institute of Technology;Texas State University;Shandong University;University of Oxford", "aff_unique_dep": ";;;VGG", "aff_unique_url": "https://www.iit.edu;https://www.txstate.edu;http://www.sdu.edu.cn;https://www.ox.ac.uk", "aff_unique_abbr": "IIT;TXST;SDU;Oxford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "United States;China;United 
Kingdom" }, { "title": "Improving Mutual Information Estimation with Annealed and Energy-Based Bounds", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6176", "id": "T0B9AoM_bFg", "poster": "", "openreview": "https://openreview.net/forum?id=T0B9AoM_bFg", "slides": "https://iclr.cc/virtual/2022/poster/6176", "video": "https://iclr.cc/virtual/2022/poster/6176", "author_site": "Rob Brekelmans, Sicong(Sheldon) Huang, Marzyeh Ghassemi, Greg Ver Steeg, Roger Grosse, Alireza Makhzani", "tldr": "", "abstract": "Mutual information (MI) is a fundamental quantity in information theory and machine learning. However, direct estimation of MI is intractable, even if the true joint probability density for the variables of interest is known, as it involves estimating a potentially high-dimensional log partition function. In this work, we present a unifying view of existing MI bounds from the perspective of importance sampling, and propose three novel bounds based on this approach. Since a tight MI bound without density information requires a sample size exponential in the true MI, we assume either a single marginal or the full joint density information is known. In settings where the full joint density is available, we propose Multi-Sample Annealed Importance Sampling (AIS) bounds on MI, which we demonstrate can tightly estimate large values of MI in our experiments. In settings where only a single marginal distribution is known, we propose Generalized IWAE (GIWAE) and MINE-AIS bounds. Our GIWAE bound unifies variational and contrastive bounds in a single framework that generalizes InfoNCE, IWAE, and Barber-Agakov bounds. Our MINE-AIS method improves upon existing energy-based methods such as MINE-DV and MINE-F by directly optimizing a tighter lower bound on MI. MINE-AIS uses MCMC sampling to estimate gradients for training and Multi-Sample AIS for evaluating the bound. Our methods are particularly suitable for evaluating MI in deep generative models, since explicit forms of the marginal or joint densities are often available. 
We evaluate our bounds on estimating the MI of VAEs and GANs trained on the MNIST and CIFAR datasets, and showcase significant gains over existing bounds in these challenging settings with high ground truth MI.", "keywords": "mutual information estimation;annealed importance sampling;energy-based models", "primary_area": "", "supplementary_material": "", "author": "Rob Brekelmans;Sicong Huang;Marzyeh Ghassemi;Greg Ver Steeg;Roger Baker Grosse;Alireza Makhzani", "authorids": "~Rob_Brekelmans1;~Sicong_Huang1;~Marzyeh_Ghassemi2;~Greg_Ver_Steeg1;~Roger_Baker_Grosse1;~Alireza_Makhzani1", "gender": "M;M;F;M;M;", "homepage": "https://brekelma.github.io;http://www.cs.toronto.edu/~huang/;https://www.healthyml.org/;https://profiles.ucr.edu/app/home/profile/gregoryv;http://www.cs.toronto.edu/~rgrosse/;http://www.alireza.ai/", "dblp": "207/7856.html;213/8048.html;145/6563;82/9058;26/7058;122/5126.html", "google_scholar": "M6ADg_UAAAAJ;https://scholar.google.ca/citations?hl=en;;goLucoIAAAAJ;xgQd1qgAAAAJ;B0KVWJEAAAAJ", "orcid": ";0009-0006-8791-0243;;0000-0002-0793-141X;;", "linkedin": ";sicong-sheldon-huang-7a4292106/;;;;", "or_profile": "~Rob_Brekelmans1;~Sicong_Huang1;~Marzyeh_Ghassemi2;~Greg_Ver_Steeg1;~Roger_Baker_Grosse1;~Alireza_Makhzani1", "aff": "University of Southern California;University of Toronto;Massachusetts Institute of Technology;USC/ISI;Department of Computer Science, University of Toronto;Vector Institute", "aff_domain": "usc.edu;cs.toronto.edu;mit.edu;isi.edu;cs.toronto.edu;vectorinstitute.ai", "position": "PhD student;PhD student;Assistant Professor;Associate Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nbrekelmans2022improving,\ntitle={Improving Mutual Information Estimation with Annealed and Energy-Based Bounds},\nauthor={Rob Brekelmans and Sicong Huang and Marzyeh Ghassemi and Greg Ver Steeg and Roger Baker Grosse and Alireza Makhzani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=T0B9AoM_bFg}\n}", "github": "", "project": "", "reviewers": "78Uf;eNs9;YniD", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;5;4", "correctness": "4;4;4", "technical_novelty": "4;3;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "55;75;120", "wc_summary_review": "37;25;54", "wc_main_review": "271;783;231", "wc_review": "363;883;405", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "437;802;294", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 83.33333333333333, 27.182510717166817 ], "wc_summary_review_avg": [ 38.666666666666664, 11.897712198383164 ], "wc_main_review_avg": [ 428.3333333333333, 251.31830193777947 ], "wc_review_avg": [ 550.3333333333334, 235.85494600613225 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 511.0, 213.8893795088168 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9396725787575517954&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": 
"https://openreview.net/pdf?id=T0B9AoM_bFg", "email": "usc.edu;cs.toronto.edu;mit.edu;isi.edu;cs.toronto.edu;vectorinstitute.ai", "author_num": 6, "aff_unique_index": "0;1;2;0;1;3", "aff_unique_norm": "University of Southern California;University of Toronto;Massachusetts Institute of Technology;Vector Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.usc.edu;https://www.utoronto.ca;https://web.mit.edu;https://vectorinstitute.ai/", "aff_unique_abbr": "USC;U of T;MIT;Vector Institute", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "Los Angeles;;ISI;Toronto", "aff_country_unique_index": "0;1;0;0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Step-unrolled Denoising Autoencoders for Text Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6942", "id": "T0GpzBQ1Fg6", "poster": "", "openreview": "https://openreview.net/forum?id=T0GpzBQ1Fg6", "slides": "https://iclr.cc/virtual/2022/poster/6942", "video": "https://iclr.cc/virtual/2022/poster/6942", "author_site": "Nikolay Savinov, Junyoung Chung, Mikolaj Binkowski, Erich Elsen, Aaron v den", "tldr": "", "abstract": "In this paper we propose a new generative model of text, Step-unrolled Denoising Autoencoder (SUNDAE), that does not rely on autoregressive models. Similarly to denoising diffusion techniques, SUNDAE is repeatedly applied on a sequence of tokens, starting from random inputs and improving them each time until convergence. We present a simple new improvement operator that converges in fewer iterations than diffusion methods, while qualitatively producing better samples on natural language datasets. SUNDAE achieves state-of-the-art results (among non-autoregressive methods) on the WMT'14 English-to-German translation task and good qualitative results on unconditional language modeling on the Colossal Cleaned Common Crawl dataset and a dataset of Python code from GitHub. 
The non-autoregressive nature of SUNDAE opens up possibilities beyond left-to-right prompted generation, by filling in arbitrary blank patterns in a template.", "keywords": "generative models;text generation;denoising autoencoders", "primary_area": "", "supplementary_material": "", "author": "Nikolay Savinov;Junyoung Chung;Mikolaj Binkowski;Erich Elsen;Aaron van den Oord", "authorids": "~Nikolay_Savinov1;~Junyoung_Chung1;~Mikolaj_Binkowski1;~Erich_Elsen1;~Aaron_van_den_Oord2", "gender": "M;M;M;M;", "homepage": "https://www.nsavinov.com;https://sites.google.com/corp/view/junyoung-ai/;;;", "dblp": "151/8855;;198/0887;;", "google_scholar": "https://scholar.google.ch/citations?user=qUIOyQYAAAAJ;2HE7cTEAAAAJ;https://scholar.google.co.uk/citations?user=wVZXAk0AAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Nikolay_Savinov1;~Junyoung_Chung1;~Mikolaj_Binkowski1;~Erich_Elsen1;~Aaron_van_den_Oord1", "aff": "Google;Google DeepMind;;Baidu;Google", "aff_domain": "google.com;google.com;;baidu.com;google.com", "position": "Research Scientist;Research Scientist;;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nsavinov2022stepunrolled,\ntitle={Step-unrolled Denoising Autoencoders for Text Generation},\nauthor={Nikolay Savinov and Junyoung Chung and Mikolaj Binkowski and Erich Elsen and Aaron van den Oord},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=T0GpzBQ1Fg6}\n}", "github": "", "project": "", "reviewers": "7uVK;D7fL;D77g;Rdkb", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;3", "correctness": "4;4;3;3", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "65;115;79;166", "wc_summary_review": "28;31;35;44", "wc_main_review": "302;332;248;155", "wc_review": "395;478;362;365", "wc_reply_reviewers": "0;216;0;107", "wc_reply_authors": "369;521;436;270", "reply_reviewers": "0;1;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 106.25, 39.02162861798569 ], "wc_summary_review_avg": [ 34.5, 6.020797289396148 ], "wc_main_review_avg": [ 259.25, 67.29552362527541 ], "wc_review_avg": [ 400.0, 46.84549071148684 ], "wc_reply_reviewers_avg": [ 80.75, 89.4745075426515 ], "wc_reply_authors_avg": [ 399.0, 91.91572226773829 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": -0.6882472016116854, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14133404943607412304&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=T0GpzBQ1Fg6", "email": "google.com;google.com;;baidu.com;google.com", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Google;Baidu", "aff_unique_dep": "Google;Baidu, Inc.", "aff_unique_url": "https://www.google.com;https://www.baidu.com", "aff_unique_abbr": "Google;Baidu", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "United States;United Kingdom;China" }, { "id": "T1A11E__Az", "title": "Few-Shot Classification with Task-Adaptive Semantic Feature Learning", "track": 
"main", "status": "Reject", "tldr": "", "abstract": "Few-shot classification aims to learn a classifier that categorizes objects of unseen classes with limited samples. One general approach is to mine as much information as possible from limited samples. This can be achieved by incorporating data aspects from multiple modals. However, existing multi-modality methods only use additional modality in support samples while adhering to a single modal in query samples. Such approach could lead to information imbalance between support and query samples, which confounds model generalization from support to query samples. Towards this problem, we propose a task-adaptive semantic feature learning mechanism to incorporates semantic features for both support and query samples. The semantic feature learner is trained episodic-wisely by regressing from the feature vectors of support samples. Then the query samples can obtain the semantic features with this module. Such method maintains a consistent training scheme between support and query samples and enables direct model transfer from support to query datasets, which significantly improves model generalization. We develop two modality combination implementations: feature concatenation and feature fusion, based on the semantic feature learner. Extensive experiments conducted on four benchmarks demonstrate that our method outperforms state-of-the-arts, proving the effectiveness of our method.", "keywords": "few-shot learning;task-adaptive semantic feature learning;feature concatenation;feature fusion.", "primary_area": "", "supplementary_material": "/attachment/c45ffdbe10280c4e29f9d3a5d85ce20445fee872.zip", "author": "Meihong Pan;Chunqiu Xia;Hongyi Xin;Yang Yang;Xiaoyong Pan;Hong-Bin Shen", "authorids": "~Meihong_Pan1;~Chunqiu_Xia1;~Hongyi_Xin1;~Yang_Yang24;~Xiaoyong_Pan2;~Hong-Bin_Shen1", "gender": "M;M;F;M;M;F", "homepage": "http://www.csbio.sjtu.edu.cn/;http://gift.sjtu.edu.cn/novellab/;https://compbio.sjtu.edu.cn/;https://xypan1232.github.io/;https://www.sjtu.edu.cn;http://www.csbio.sjtu.edu.cn", "dblp": ";;;;;254/2289", "google_scholar": ";U7vpUGkAAAAJ;-PN_6coAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;1JtG2aAAAAAJ", "orcid": ";0000-0003-2864-7386;0000-0001-5720-773X;;;", "linkedin": ";;;;;", "or_profile": "~Chunqiu_Xia1;~Hongyi_Xin1;~Yang_Yang24;~Xiaoyong_Pan2;~Hong-Bin_Shen1;~Meihong_Pan2", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Associate Professor;Associate Professor;Assistant Professor;Full Professor;PhD student", "bibtex": "@misc{\npan2022fewshot,\ntitle={Few-Shot Classification with Task-Adaptive Semantic Feature Learning},\nauthor={Meihong Pan and Chunqiu Xia and Hongyi Xin and Yang Yang and Xiaoyong Pan and Hong-Bin Shen},\nyear={2022},\nurl={https://openreview.net/forum?id=T1A11E__Az}\n}", "github": "", "project": "", "reviewers": "QNok;7a88;LGLG;wrVH", "site": "https://openreview.net/forum?id=T1A11E__Az", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;4;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "69;53;109;95", "wc_summary_review": "57;93;37;38", "wc_main_review": "233;285;85;111", "wc_review": "359;431;231;244", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "786;784;717;791", 
"reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 21.834605560898048 ], "wc_summary_review_avg": [ 56.25, 22.664675157610354 ], "wc_main_review_avg": [ 178.5, 83.08279003500063 ], "wc_review_avg": [ 316.25, 82.88961032602337 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 769.5, 30.41792234851026 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1125608824667805475&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "T2F5aBbSEUQ", "title": "Dataset Condensation with Distribution Matching", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Computational cost to train state-of-the-art deep models in many learning problems is rapidly increasing due to more sophisticated models and larger datasets. A recent promising direction to reduce training time is dataset condensation that aims to replace the original large training set with a significantly smaller learned synthetic set while preserving its information. While training deep models on the small set of condensed images can be extremely fast, their synthesis remains computationally expensive due to the complex bi-level optimization and second-order derivative computation. In this work, we propose a simple yet effective dataset condensation technique that requires significantly lower training cost with comparable performance by matching feature distributions of the synthetic and original training images in sampled embedding spaces. Thanks to its efficiency, we apply our method to more realistic and larger datasets with sophisticated neural architectures and achieve a significant performance boost while using larger synthetic training set. 
We also show various practical benefits of our method in continual learning and neural architecture search.", "keywords": "Dataset Condensation;Data-efficient Learning;Distribution Matching;Continual Learning;Neural Architecture Search", "primary_area": "", "supplementary_material": "", "author": "Bo Zhao;Hakan Bilen", "authorids": "~Bo_Zhao4;~Hakan_Bilen1", "gender": "M;M", "homepage": ";http://homepages.inf.ed.ac.uk/hbilen/", "dblp": ";97/2993", "google_scholar": "R3_AR5EAAAAJ;PtBtfawAAAAJ", "orcid": ";0000-0002-6947-6918", "linkedin": ";", "or_profile": "~Bo_Zhao4;~Hakan_Bilen1", "aff": "University of Edinburgh;University of Edinburgh", "aff_domain": "ed.ac.uk;ed.ac.uk", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nzhao2022dataset,\ntitle={Dataset Condensation with Distribution Matching},\nauthor={Bo Zhao and Hakan Bilen},\nyear={2022},\nurl={https://openreview.net/forum?id=T2F5aBbSEUQ}\n}", "github": "", "project": "", "reviewers": "BbYE;HwTK;NrKj", "site": "https://openreview.net/forum?id=T2F5aBbSEUQ", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;3;4", "correctness": "2;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "103;55;75", "wc_summary_review": "57;141;16", "wc_main_review": "609;277;86", "wc_review": "769;473;177", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 77.66666666666667, 19.686430746977866 ], "wc_summary_review_avg": [ 71.33333333333333, 52.027770362460174 ], "wc_main_review_avg": [ 324.0, 216.08485987376966 ], "wc_review_avg": [ 473.0, 241.6829879546069 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.1147078669352809, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 392, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9152753227090456107&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "T3_cV3-zbg", "title": "ENHANCE THE DYNAMIC REGRET VIA OPTIMISM", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we study how to enhance the dynamic regret in online convex optimization. Existing works have shown that adaptive learning for dynamic environments (Ader) enjoys an $O\\big(\\sqrt{\\left(1+P_T\\right)T}\\,\\big)$ dynamic regret upper bound, where $T$ is the number of rounds and $P_T$ is the path length of the reference strategy sequence. The basic idea of Ader is to maintain a group of experts, where each expert obtains the best dynamic regret for a specific path length by running Mirror Descent (MD) with a specific parameter, and then to track the best expert via Normalized Exponentiated Subgradient (NES). However, Ader is not environmentally adaptive. 
By introducing the estimated linear loss function $\\widehat{x}_{t}^*$, the dynamic regret bound for Optimistic Mirror Descent (OMD) is tighter than that of MD if the environment is not completely adversarial and $\\widehat{x}_{t}^*$ is well-estimated. Based on the fact that optimism can enhance dynamic regret, we develop an algorithm that replaces MD and NES in Ader with OMD and Optimistic Normalized Exponentiated Subgradient (ONES) respectively, and utilizes the adaptive trick to achieve an $O\\big(\\sqrt{\\left(1+P_T\\right)M_T}\\,\\big)$ dynamic regret upper bound, where $M_T\\leqslant O\\left(T\\right)$ is a measure of estimation accuracy. In particular, if $\\widehat{x}_t^*\\in\\partial\\widehat{\\varphi}_t$, where $\\widehat{\\varphi}_t$ represents the estimated convex loss function and $\\partial\\widehat{\\varphi}_t$ is Lipschitz continuous, then the dynamic regret upper bound of OMD has a subgradient variation type. Based on this fact, we develop a variant algorithm whose upper bound has a subgradient variation type. All our algorithms are environmentally adaptive.", "keywords": "online convex optimization;dynamic regret upper bound;normalized exponentiated gradient;adaptive trick", "primary_area": "", "supplementary_material": "", "author": "Qing-xin Meng;Jian-wei Liu", "authorids": "qingxin6174@gmail.com;~Jian-wei_Liu1", "gender": ";M", "homepage": ";https://www.cup.edu.cn/cise/szdw/fjs1/170307.htm", "dblp": ";43/3771-6.html", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "qingxin6174@gmail.com;~Jian-wei_Liu1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmeng2022enhance,\ntitle={{ENHANCE} {THE} {DYNAMIC} {REGRET} {VIA} {OPTIMISM}},\nauthor={Qing-xin Meng and Jian-wei Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=T3_cV3-zbg}\n}", "github": "", "project": "", "reviewers": "pom1;dXyQ;dvvi;WqRi;ggfA", "site": "https://openreview.net/forum?id=T3_cV3-zbg", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "4;4;4;5;4", "correctness": "3;4;4;3;4", "technical_novelty": "1;1;2;2;2", "empirical_novelty": "0;0;0;0;0", "wc_summary_paper": "66;79;133;41;83", "wc_summary_review": "45;41;156;2;74", "wc_main_review": "174;239;1354;295;275", "wc_review": "285;359;1643;338;432", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 1.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 80.4, 30.117104774529704 ], "wc_summary_review_avg": [ 63.6, 51.56975858000501 ], "wc_main_review_avg": [ 467.4, 445.20763695156893 ], "wc_review_avg": [ 611.4, 517.9523530210091 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.25, "corr_recommendation_correctness": 0.4082482904638631, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YjkoQHhX5McJ:scholar.google.com/&scioq=ENHANCE+THE+DYNAMIC+REGRET+VIA+OPTIMISM&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Deep Attentive Variational Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5923", "id": "T4-65DNlDij", "poster": "", "openreview": "https://openreview.net/forum?id=T4-65DNlDij", "slides": 
"https://iclr.cc/virtual/2022/poster/5923", "video": "https://iclr.cc/virtual/2022/poster/5923", "author_site": "Ifigeneia Apostolopoulou, Ian Char, Elan Rosenfeld, Artur Dubrawski", "tldr": "", "abstract": "Stochastic Variational Inference is a powerful framework for learning large-scale probabilistic latent variable models. However, typical assumptions on the factorization or independence of the latent variables can substantially restrict its capacity for inference and generative modeling. A major line of active research aims at building more expressive variational models by designing deep hierarchies of interdependent latent variables. Although these models exhibit superior performance and enable richer latent representations, we show that they incur diminishing returns: adding more stochastic layers to an already very deep model yields small predictive improvement while substantially increasing the inference and training time. Moreover, the architecture for this class of models favors local interactions among the latent variables between neighboring layers when designing the conditioning factors of the involved distributions. This is the first work that proposes attention mechanisms to build more expressive variational distributions in deep probabilistic models by explicitly modeling both local and global interactions in the latent space. Specifically, we propose deep attentive variational autoencoder and test it on a variety of established datasets. We show it achieves state-of-the-art log-likelihoods while using fewer latent layers and requiring less training time than existing models. The proposed non-local inference reduces computational footprint by alleviating the need for deep hierarchies. Project code:\nhttps://github.com/ifiaposto/Deep_Attentive_VI", "keywords": "variational inference;approximate inference;deep probabilistic models;deep probabilistic learning;variational autoencoder;probabilistic methods for deep learning;attention", "primary_area": "", "supplementary_material": "/attachment/bfd63b63242e785e2bd4ad29b7e58dcd5d6e6ef1.zip", "author": "Ifigeneia Apostolopoulou;Ian Char;Elan Rosenfeld;Artur Dubrawski", "authorids": "~Ifigeneia_Apostolopoulou1;~Ian_Char1;~Elan_Rosenfeld1;~Artur_Dubrawski2", "gender": ";M;M;M", "homepage": ";http://ianchar.com;;https://www.autonlab.org", "dblp": "145/9415.html;157/7519;236/4508;76/48", "google_scholar": "xiJGHuwAAAAJ;3SDKldkAAAAJ;f0j0K8QAAAAJ;O3gezzcAAAAJ", "orcid": ";;;0000-0002-2372-0831", "linkedin": ";;;artur-dubrawski-33a2a87/", "or_profile": "~Ifigeneia_Apostolopoulou1;~Ian_Char1;~Elan_Rosenfeld1;~Artur_Dubrawski2", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;andrew.cmu.edu;cmu.edu", "position": "PhD student;PhD student;PhD student;Research Professor", "bibtex": "@inproceedings{\napostolopoulou2022deep,\ntitle={Deep Attentive Variational Inference},\nauthor={Ifigeneia Apostolopoulou and Ian Char and Elan Rosenfeld and Artur Dubrawski},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=T4-65DNlDij}\n}", "github": "", "project": "", "reviewers": "VA9m;NQd9;DYsA;5Yd5", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "16;117;32;88", "wc_summary_review": "14;119;54;16", "wc_main_review": "169;459;399;126", "wc_review": 
"199;695;485;230", "wc_reply_reviewers": "0;176;39;34", "wc_reply_authors": "533;1245;1010;549", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.25, 40.959583738119214 ], "wc_summary_review_avg": [ 50.75, 42.50514674718816 ], "wc_main_review_avg": [ 288.25, 143.14917918032222 ], "wc_review_avg": [ 402.25, 202.1946772296442 ], "wc_reply_reviewers_avg": [ 62.25, 67.36607083688345 ], "wc_reply_authors_avg": [ 834.25, 304.8453501367538 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12145002496737805734&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=T4-65DNlDij", "email": "cmu.edu;cmu.edu;andrew.cmu.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "T6lAFguUbw", "title": "Modeling Bounded Rationality in Multi-Agent Simulations Using Rationally Inattentive Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-agent reinforcement learning (MARL) is a powerful framework for studying emergent behavior in complex agent-based simulations. However, RL agents are often assumed to be rational and behave optimally, which does not fully reflect human behavior. Here, we study more human-like RL agents which incorporate an established model of human-irrationality, the Rational Inattention (RI) model. RI models the cost of cognitive information processing using mutual information. Our RIRL framework generalizes and is more flexible than prior work by allowing for multi-timestep dynamics and information channels with heterogeneous processing costs. We evaluate RIRL in Principal-Agent (specifically manager-employee relations) problem settings of varying complexity where RI models information asymmetry (e.g. it may be costly for the manager to observe certain information about the employees). We show that using RIRL yields a rich spectrum of new equilibrium behaviors that differ from those found under rational assumptions. For instance, some forms of a Principal's inattention can increase Agent welfare due to increased compensation, while other forms of inattention can decrease Agent welfare by encouraging extra work effort. Additionally, new strategies emerge compared to those under rationality assumptions, e.g., Agents are incentivized to misrepresent their ability. These results suggest RIRL is a powerful tool towards building AI agents that can mimic real human behavior. 
", "keywords": "Reinforcement Learning;Multi-Agent Reinforcement Learning;Bounded Rationality;Rational Inattention;Simulations", "primary_area": "", "supplementary_material": "", "author": "Tong Mu;Stephan Zheng;Alexander R Trott", "authorids": "~Tong_Mu1;~Stephan_Zheng1;~Alexander_R_Trott1", "gender": "F;M;M", "homepage": ";http://www.stephanzheng.com;", "dblp": ";https://dblp.org/pers/hd/z/Zheng:Stephan;", "google_scholar": ";7mnKGGEAAAAJ;rB4bvV0AAAAJ", "orcid": ";;", "linkedin": "tong-mu-9b42b2a7/;stephanzheng;", "or_profile": "~Tong_Mu1;~Stephan_Zheng1;~Alexander_R_Trott1", "aff": "Stanford University;SalesForce.com;Salesforce Research", "aff_domain": "stanford.edu;salesforce.com;salesforce.com", "position": "PhD student;Lead Research Scientist;Research Scientist", "bibtex": "@misc{\nmu2022modeling,\ntitle={Modeling Bounded Rationality in Multi-Agent Simulations Using Rationally Inattentive Reinforcement Learning},\nauthor={Tong Mu and Stephan Zheng and Alexander R Trott},\nyear={2022},\nurl={https://openreview.net/forum?id=T6lAFguUbw}\n}", "github": "", "project": "", "reviewers": "FsBe;6aq5;HuB7;tabk", "site": "https://openreview.net/forum?id=T6lAFguUbw", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;2;2;3", "correctness": "3;4;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;0;4", "wc_summary_paper": "123;45;45;105", "wc_summary_review": "22;32;16;94", "wc_main_review": "232;261;151;212", "wc_review": "377;338;212;411", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "900;331;310;228", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 79.5, 35.082046690579496 ], "wc_summary_review_avg": [ 41.0, 31.12876483254676 ], "wc_main_review_avg": [ 214.0, 40.329889660151565 ], "wc_review_avg": [ 334.5, 75.29442210416387 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 442.25, 267.0696304337129 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6225430174794673, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6411480953175110505&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;1", "aff_unique_norm": "Stanford University;Salesforce", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.salesforce.com", "aff_unique_abbr": "Stanford;Salesforce", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "T73sfhfzk07", "title": "GRODIN: Improved Large-Scale Out-of-Domain detection via Back-propagation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Uncertainty estimation and out-of-doman (OOD) input detection are critical for improving the safety and robustness of machine learning. Unfortunately, most methods for detecting OOD examples have been evaluated on small tasks while typical methods are computationally expensive. In this paper we propose a new gradient-based method called GRODIN for OOD detection. 
The proposed method is conceptually simple, computationally cheaper than ensemble methods, and can be directly applied to any existing and deployed model without re-training. We evaluate GRODIN on models trained on the CIFAR-10 and ImageNet datasets, and show its strong performance on various OOD ImageNet datasets such as ImageNet-O, ImageNet-A, ImageNet-R, ImageNet-C.", "keywords": "Out of distribution;deep learning;gradient;backpropagation", "primary_area": "", "supplementary_material": "/attachment/e58f7434a9e7383efa3e18dc5549410393fd9254.zip", "author": "Gleb Yengalych;Igor E. Kuralenok;Vasily A Ershov", "authorids": "~Gleb_Yengalych1;~Igor_E._Kuralenok1;~Vasily_A_Ershov1", "gender": "M;M;M", "homepage": "https://vk.com/herr_bilbo;;", "dblp": ";72/1834;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;vasily-ershov-04768199/", "or_profile": "~Gleb_Yengalych1;~Igor_E._Kuralenok1;~Vasily_A_Ershov1", "aff": "Higher School of Economics;Yandex;Yandex", "aff_domain": "hse.ru;yandex-team.ru;yandex-team.ru", "position": "PhD student;Head of ML @Yandex.Cloud;Team Lead", "bibtex": "@misc{\nyengalych2022grodin,\ntitle={{GRODIN}: Improved Large-Scale Out-of-Domain detection via Back-propagation},\nauthor={Gleb Yengalych and Igor E. Kuralenok and Vasily A Ershov},\nyear={2022},\nurl={https://openreview.net/forum?id=T73sfhfzk07}\n}", "github": "", "project": "", "reviewers": "gLsy;CsE6;cmXa;zSjc", "site": "https://openreview.net/forum?id=T73sfhfzk07", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;4", "correctness": "2;3;3;2", "technical_novelty": "2;1;1;2", "empirical_novelty": "2;1;1;2", "wc_summary_paper": "40;49;98;65", "wc_summary_review": "14;23;28;34", "wc_main_review": "630;290;107;264", "wc_review": "684;362;233;363", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 63.0, 22.102036105300343 ], "wc_summary_review_avg": [ 24.75, 7.327175444876422 ], "wc_main_review_avg": [ 322.75, 190.70576158050392 ], "wc_review_avg": [ 410.5, 166.5210196942116 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MKTnVpjnoa0J:scholar.google.com/&scioq=GRODIN:+Improved+Large-Scale+Out-of-Domain+detection+via+Back-propagation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Higher School of Economics;Yandex", "aff_unique_dep": ";", "aff_unique_url": "https://www.hse.ru;https://yandex.com", "aff_unique_abbr": "HSE;Yandex", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Russian Federation" }, { "id": "T8BnDXDTcFZ", "title": "Accelerating Training of Deep Spiking Neural Networks with Parameter Initialization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although spiking neural networks (SNNs) show strong advantages in information encoding, power consumption, and computational capability, the underdevelopment of supervised learning 
algorithms is still a hindrance to training SNNs. Our view is that proper weight initialization is a pivotal issue for efficient SNN training. It greatly influences gradient generation with back-propagation through time at the initial training stage. Focusing on the properties of spiking neurons, we first derive the asymptotic formula of their response curve, which approximates the actual neuron response distribution. Then, we propose an initialization method obtained from the slant asymptote to overcome gradient vanishing. Finally, experiments with different coding schemes on classification tasks show that our method can effectively improve training speed and the final model accuracy compared with traditional deep learning initialization methods and existing SNN initialization methods. Further validation on different neuron types and training hyper-parameters shows good versatility and superiority over the other methods. Based on these analyses, we give some suggestions for SNN training.", "keywords": "spiking neural networks;back-propagation through time;parameter initialization", "primary_area": "", "supplementary_material": "/attachment/cda5076660a64b5a0c340e816b69e920c4894b41.zip", "author": "Jianhao Ding;Jiyuan Zhang;Zhaofei Yu;Tiejun Huang", "authorids": "~Jianhao_Ding1;~Jiyuan_Zhang3;~Zhaofei_Yu1;~Tiejun_Huang1", "gender": "M;M;M;M", "homepage": "https://dingjianhao.github.io/;;https://yuzhaofei.github.io;https://idm.pku.edu.cn/~tjhuang/", "dblp": "128/2534;;166/0573;h/TiejunHuang", "google_scholar": "4rDfCSsAAAAJ;ukHrw0IAAAAJ;qaUgD50AAAAJ;https://scholar.google.com.tw/citations?user=knvEK4AAAAAJ", "orcid": ";;;0000-0002-4234-6099", "linkedin": ";jiyuanzhang-leo;;", "or_profile": "~Jianhao_Ding1;~Jiyuan_Zhang3;~Zhaofei_Yu1;~Tiejun_Huang1", "aff": "Institute of Automation, Chinese Academy of Sciences;Peking University;Peking University;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;pku.edu.cn;pku.edu.cn;ict.ac.cn", "position": "Intern;PhD student;Assistant Professor;Postdoc", "bibtex": "@misc{\nding2022accelerating,\ntitle={Accelerating Training of Deep Spiking Neural Networks with Parameter Initialization},\nauthor={Jianhao Ding and Jiyuan Zhang and Zhaofei Yu and Tiejun Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=T8BnDXDTcFZ}\n}", "github": "", "project": "", "reviewers": "xJ5p;ZLvg;LCr2;gWVF", "site": "https://openreview.net/forum?id=T8BnDXDTcFZ", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;5;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "97;50;147;141", "wc_summary_review": "61;40;55;21", "wc_main_review": "405;110;209;168", "wc_review": "563;200;411;330", "wc_reply_reviewers": "0;0;106;0", "wc_reply_authors": "1003;318;414;394", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 108.75, 39.0280347955159 ], "wc_summary_review_avg": [ 44.25, 15.449514555480375 ], "wc_main_review_avg": [ 223.0, 110.80839318391004 ], "wc_review_avg": [ 376.0, 131.61116973874215 ], "wc_reply_reviewers_avg": [ 26.5, 45.89934640057525 ], "wc_reply_authors_avg": [ 532.25, 274.13716913253484 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 
], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7374532010194088347&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Chinese Academy of Sciences;Peking University", "aff_unique_dep": "Institute of Automation;", "aff_unique_url": "http://www.ia.cas.cn;http://www.pku.edu.cn", "aff_unique_abbr": "CAS;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Understanding Domain Randomization for Sim-to-real Transfer", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6876", "id": "T8vZHIRTrY", "poster": "", "openreview": "https://openreview.net/forum?id=T8vZHIRTrY", "slides": "https://iclr.cc/virtual/2022/poster/6876", "video": "https://iclr.cc/virtual/2022/poster/6876", "author_site": "Xiaoyu Chen, Jiachen Hu, Chi Jin, Lihong Li, Liwei Wang", "tldr": "", "abstract": "Reinforcement learning encounters many challenges when applied directly in the real world. Sim-to-real transfer is widely used to transfer the knowledge learned from simulation to the real world. Domain randomization---one of the most popular algorithms for sim-to-real transfer---has been demonstrated to be effective in various tasks in robotics and autonomous driving. Despite its empirical successes, theoretical understanding on why this simple algorithm works is largely missing. In this paper, we propose a theoretical framework for sim-to-real transfers, in which the simulator is modeled as a set of MDPs with tunable parameters (corresponding to unknown physical parameters such as friction). We provide sharp bounds on the sim-to-real gap---the difference between the value of policy returned by domain randomization and the value of an optimal policy for the real world. We prove that sim-to-real transfer can succeed under mild conditions without any real-world training samples. Our theory also highlights the importance of using memory (i.e., history-dependent policies) in domain randomization. 
Our proof is based on novel techniques that reduce the problem of bounding the sim-to-real gap to the problem of designing efficient learning algorithms for infinite-horizon MDPs, which we believe are of independent interest.", "keywords": "domain randomization;sim-to-real transfer;learning theory", "primary_area": "", "supplementary_material": "", "author": "Xiaoyu Chen;Jiachen Hu;Chi Jin;Lihong Li;Liwei Wang", "authorids": "~Xiaoyu_Chen2;~Jiachen_Hu1;~Chi_Jin1;~Lihong_Li1;~Liwei_Wang1", "gender": "M;M;M;;M", "homepage": ";https://nickhclos.github.io/;https://sites.google.com/view/cjin/home;https://lihongli.github.io;http://www.liweiwang-pku.com/", "dblp": "30/4497;239/5040;126/1802-1;l/LihongLi.html;", "google_scholar": "sioumZAAAAAJ;5GavKiQAAAAJ;GINhGvwAAAAJ;Rqy5KDEAAAAJ;VZHxoh8AAAAJ", "orcid": ";;;;", "linkedin": ";;;lihong-li-9620164;", "or_profile": "~Xiaoyu_Chen2;~Jiachen_Hu1;~Chi_Jin1;~Lihong_Li1;~Liwei_Wang1", "aff": "Peking University;Peking University;Princeton University;Amazon;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;princeton.edu;amazon.com;pku.edu.cn", "position": "PhD student;PhD student;Assistant Professor;Senior Principal Scientist;Full Professor", "bibtex": "@inproceedings{\nchen2022understanding,\ntitle={Understanding Domain Randomization for Sim-to-real Transfer},\nauthor={Xiaoyu Chen and Jiachen Hu and Chi Jin and Lihong Li and Liwei Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=T8vZHIRTrY}\n}", "github": "", "project": "", "reviewers": "SVo4;y2ZB;WmdZ;M8oh", "pdf_size": 0, "recommendation": "5;8;8;10", "confidence": "2;2;2;3", "correctness": "4;4;1;3", "technical_novelty": "2;4;1;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "77;108;240;107", "wc_summary_review": "46;65;100;119", "wc_main_review": "531;341;299;623", "wc_review": "654;514;639;849", "wc_reply_reviewers": "306;79;0;174", "wc_reply_authors": "787;193;225;709", "reply_reviewers": "2;1;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.75, 1.7853571071357126 ], "confidence_avg": [ 2.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 133.0, 63.019838146412276 ], "wc_summary_review_avg": [ 82.5, 28.622543562723422 ], "wc_main_review_avg": [ 448.5, 133.3819702958387 ], "wc_review_avg": [ 664.0, 119.84364814206884 ], "wc_reply_reviewers_avg": [ 139.75, 114.05344142111626 ], "wc_reply_authors_avg": [ 478.5, 271.1434122378783 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7276068751089989, "corr_recommendation_correctness": -0.34299717028501764, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17957837046196778100&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=T8vZHIRTrY", "email": "pku.edu.cn;pku.edu.cn;princeton.edu;amazon.com;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Peking University;Princeton University;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "http://www.pku.edu.cn;https://www.princeton.edu;https://www.amazon.com", "aff_unique_abbr": "Peking U;Princeton;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", 
"aff_country_unique": "China;United States" }, { "title": "RotoGrad: Gradient Homogenization in Multitask Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6379", "id": "T8wHz4rnuGL", "poster": "", "openreview": "https://openreview.net/forum?id=T8wHz4rnuGL", "slides": "https://iclr.cc/virtual/2022/poster/6379", "video": "https://iclr.cc/virtual/2022/poster/6379", "author_site": "Adri\u00e1n Javaloy, Isabel Valera", "tldr": "", "abstract": "Multitask learning is being increasingly adopted in applications domains like computer vision and reinforcement learning. However, optimally exploiting its advantages remains a major challenge due to the effect of negative transfer. Previous works have tracked down this issue to the disparities in gradient magnitudes and directions across tasks, when optimizing the shared network parameters. While recent work has acknowledged that negative transfer is a two-fold problem, existing approaches fall short as they only focus on either homogenizing the gradient magnitude across tasks; or greedily change the gradient directions, overlooking future conflicts. In this work, we introduce RotoGrad, an algorithm that tackles negative transfer as a whole: it jointly homogenizes gradient magnitudes and directions, while ensuring training convergence. We show that RotoGrad outperforms competing methods in complex problems, including multi-label classification in CelebA and computer vision tasks in the NYUv2 dataset. A Pytorch implementation can be found in https://github.com/adrianjav/rotograd.", "keywords": "multitask learning;conflicting gradients;negative transfer", "primary_area": "", "supplementary_material": "/attachment/c7dadc4d23625f83e84924c0cb66a1ebb3e7760c.zip", "author": "Adri\u00e1n Javaloy;Isabel Valera", "authorids": "~Adri\u00e1n_Javaloy1;~Isabel_Valera1", "gender": "M;F", "homepage": "https://adrianjav.github.io;https://ivaleram.github.io/", "dblp": "259/2011;126/1768.html", "google_scholar": "ne3evXwAAAAJ;https://scholar.google.es/citations?user=cpdQqpsAAAAJ", "orcid": "0000-0002-5184-4460;", "linkedin": "adrian-javaloy;", "or_profile": "~Adri\u00e1n_Javaloy1;~Isabel_Valera1", "aff": "Saarland University, Saarland University;Universit\u00e4t des Saarlandes", "aff_domain": "cs.uni-saarland.de;uni-saarland.de", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\njavaloy2022rotograd,\ntitle={RotoGrad: Gradient Homogenization in Multitask Learning},\nauthor={Adri{\\'a}n Javaloy and Isabel Valera},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=T8wHz4rnuGL}\n}", "github": "", "project": "", "reviewers": "2soJ;gsqy;cExt;uDos", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;3;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "94;135;53;111", "wc_summary_review": "42;55;35;38", "wc_main_review": "622;154;288;145", "wc_review": "758;344;376;294", "wc_reply_reviewers": "0;25;101;0", "wc_reply_authors": "367;378;468;132", "reply_reviewers": "0;1;2;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 98.25, 29.911327285829362 ], "wc_summary_review_avg": [ 42.5, 7.632168761236874 ], "wc_main_review_avg": [ 302.25, 
193.09890600415116 ], "wc_review_avg": [ 443.0, 184.1982627496796 ], "wc_reply_reviewers_avg": [ 31.5, 41.403502267320334 ], "wc_reply_authors_avg": [ 336.25, 124.26257481639433 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17548850565658345849&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=T8wHz4rnuGL", "email": "cs.uni-saarland.de;uni-saarland.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Saarland University;Universit\u00e4t des Saarlandes", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-saarland.de;https://www.uni-saarland.de", "aff_unique_abbr": "UdS;UDS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Learning with Noisy Labels Revisited: A Study Using Real-World Human Annotations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7085", "id": "TBWA6PLJZQm", "poster": "", "openreview": "https://openreview.net/forum?id=TBWA6PLJZQm", "slides": "https://iclr.cc/virtual/2022/poster/7085", "video": "https://iclr.cc/virtual/2022/poster/7085", "author_site": "Jiaheng Wei, Zhaowei Zhu, Hao Cheng, Tongliang Liu, Gang Niu, Yang Liu", "tldr": "", "abstract": "Existing research on learning with noisy labels mainly focuses on synthetic label noise. Synthetic noise, though it has clean structures that greatly enable statistical analyses, often fails to model real-world noise patterns. The recent literature has observed several efforts to offer real-world noisy datasets, e.g., Food-101N, WebVision, and Clothing1M. Yet the existing efforts suffer from two caveats: first, the lack of ground-truth verification makes it hard to theoretically study the properties and treatment of real-world label noise. Second, these efforts are often large in scale, which may result in unfair comparisons of robust methods within reasonable and accessible computation power. To better understand real-world label noise, it is important to establish controllable, easy-to-use, and moderate-sized real-world noisy datasets with both ground-truth and noisy labels. This work presents two new benchmark datasets, which we name CIFAR-10N and CIFAR-100N (jointly, CIFAR-N), equipping the training datasets of CIFAR-10 and CIFAR-100 with human-annotated real-world noisy labels that we collected from Amazon Mechanical Turk. We quantitatively and qualitatively show that real-world noisy labels follow an instance-dependent pattern rather than the classically assumed and adopted ones (e.g., class-dependent label noise). We then initiate an effort to benchmark a subset of the existing solutions using CIFAR-10N and CIFAR-100N. We further proceed to study the memorization of correct and wrong predictions, which further illustrates the difference between human noise and class-dependent synthetic noise. We show that real-world noise patterns indeed impose new and outstanding challenges compared to synthetic label noise. These observations require us to rethink the treatment of noisy labels, and we hope the availability of these two datasets will facilitate the development and evaluation of future learning with noisy label solutions.
The corresponding datasets and the leaderboard are available at http://noisylabels.com. ", "keywords": "Learning with noisy labels;benchmark;real-world label noise;human annotations", "primary_area": "", "supplementary_material": "", "author": "Jiaheng Wei;Zhaowei Zhu;Hao Cheng;Tongliang Liu;Gang Niu;Yang Liu", "authorids": "~Jiaheng_Wei1;~Zhaowei_Zhu1;~Hao_Cheng5;~Tongliang_Liu1;~Gang_Niu1;~Yang_Liu3", "gender": "M;M;M;M;M;M", "homepage": "https://sites.google.com/ucsc.edu/jiahengwei;https://www.zzw.ai;https://haochenglouis.github.io;https://tongliang-liu.github.io/;https://niug1984.github.io;http://www.yliuu.com", "dblp": "270/8936;202/1712;;150/6667;26/3367-1;51/3710-18", "google_scholar": "https://scholar.google.com/citations?hl=en;YS8pSQoAAAAJ;ftlVqVIAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;jKrIVCIAAAAJ", "orcid": ";0000-0003-3894-5862;0000-0001-8864-7818;;;0000-0001-8420-6011", "linkedin": "jiahengwei/;;;;;", "or_profile": "~Jiaheng_Wei1;~Zhaowei_Zhu1;~Hao_Cheng5;~Tongliang_Liu1;~Gang_Niu1;~Yang_Liu3", "aff": "Google;University of California, Santa Cruz;Tencent Youtu Lab;University of Sydney;RIKEN;University of California, Santa Cruz", "aff_domain": "google.com;ucsc.edu;tencent.com;sydney.edu.au;riken.jp;ucsc.edu", "position": "Intern;PhD student;Researcher;Lecturer;Research Scientist (tenured);Assistant Professor", "bibtex": "@inproceedings{\nwei2022learning,\ntitle={Learning with Noisy Labels Revisited: A Study Using Real-World Human Annotations},\nauthor={Jiaheng Wei and Zhaowei Zhu and Hao Cheng and Tongliang Liu and Gang Niu and Yang Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TBWA6PLJZQm}\n}", "github": "", "project": "", "reviewers": "EZud;q5d1;PLPg;7fRY", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;5;4;5", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "98;33;20;70", "wc_summary_review": "55;51;33;76", "wc_main_review": "431;206;159;226", "wc_review": "584;290;212;372", "wc_reply_reviewers": "200;22;347;0", "wc_reply_authors": "1234;731;1351;546", "reply_reviewers": "1;1;3;0", "reply_authors": "3;2;4;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 55.25, 30.75203245315665 ], "wc_summary_review_avg": [ 53.75, 15.286840746210448 ], "wc_main_review_avg": [ 255.5, 104.20292702222908 ], "wc_review_avg": [ 364.5, 138.78310415897175 ], "wc_reply_reviewers_avg": [ 142.25, 141.37958657458296 ], "wc_reply_authors_avg": [ 965.5, 336.03310848783934 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 313, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=765841518981894990&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=TBWA6PLJZQm", "email": "google.com;ucsc.edu;tencent.com;sydney.edu.au;riken.jp;ucsc.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;4;1", "aff_unique_norm": "Google;University of California, Santa Cruz;Tencent;University of Sydney;RIKEN", "aff_unique_dep": 
"Google;;Youtu Lab;;", "aff_unique_url": "https://www.google.com;https://www.ucsc.edu;https://www.tencent.com;https://www.sydney.edu.au;https://www.riken.jp", "aff_unique_abbr": "Google;UCSC;Tencent;USYD;RIKEN", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Mountain View;Santa Cruz;", "aff_country_unique_index": "0;0;1;2;3;0", "aff_country_unique": "United States;China;Australia;Japan" }, { "title": "SPIRAL: Self-supervised Perturbation-Invariant Representation Learning for Speech Pre-Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6593", "id": "TBpg4PnXhYH", "poster": "", "openreview": "https://openreview.net/forum?id=TBpg4PnXhYH", "slides": "https://iclr.cc/virtual/2022/poster/6593", "video": "https://iclr.cc/virtual/2022/poster/6593", "author_site": "Wenyong Huang, Zhenhe Zhang, Yu Ting Yeung, Xin Jiang, Qun Liu", "tldr": "", "abstract": "We introduce a new approach for speech pre-training named SPIRAL which works by learning denoising representation of perturbed data in a teacher-student framework. \nSpecifically, given a speech utterance, we first feed the utterance to a teacher network to obtain corresponding representation. Then the same utterance is perturbed and fed to a student network. The student network is trained to output representation resembling that of the teacher. At the same time, the teacher network is updated as moving average of student's weights over training steps. In order to prevent representation collapse, we apply an in-utterance contrastive loss as pre-training objective and impose position randomization on the input to the teacher. SPIRAL achieves competitive or better results compared to state-of-the-art speech pre-training method wav2vec 2.0, with significant reduction of training cost (80% for BASE model, 65% for LARGE model). \nFurthermore, we address the problem of noise-robustness that is critical to real-world speech applications. We propose multi-condition pre-training by perturbing the student's input with various types of additive noise. We demonstrate that multi-condition pre-trained SPIRAL models are more robust to noisy speech (9.0% - 13.3% relative word error rate reduction on real noisy test data), compared to applying multi-condition training solely in the fine-tuning stage. 
Source code is available at https://github.com/huawei-noah/Speech-Backbones/tree/main/SPIRAL.", "keywords": "Speech Representation Learning;Speech Pre-training;Speech Recognition;Self-supervised Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Wenyong Huang;Zhenhe Zhang;Yu Ting Yeung;Xin Jiang;Qun Liu", "authorids": "~Wenyong_Huang1;zhangzhenhe1@huawei.com;yeung.yu.ting@huawei.com;~Xin_Jiang1;~Qun_Liu1", "gender": "M;;;M;M", "homepage": ";;;;http://liuquncn.github.io/", "dblp": ";;;42/4142-2;75/4402-1", "google_scholar": "z8UoSOwAAAAJ;;;DUfcez0AAAAJ;2HhiGzcAAAAJ", "orcid": ";;;0000-0002-9117-8247;0000-0002-7000-1792", "linkedin": ";;;xin-jiang-9577b76/;qunliu/", "or_profile": "~Wenyong_Huang1;zhangzhenhe1@huawei.com;yeung.yu.ting@huawei.com;~Xin_Jiang1;~Qun_Liu1", "aff": "Huawei Technologies Ltd.;;;Noah\u2019s Ark Lab, Huawei Technologies;Huawei Noah's Ark Lab", "aff_domain": "huawei.com;;;huawei.com;huawei.com", "position": "Researcher;;;Principal Researcher;Chief Scientist of Speech and Language Computing", "bibtex": "@inproceedings{\nhuang2022spiral,\ntitle={{SPIRAL}: Self-supervised Perturbation-Invariant Representation Learning for Speech Pre-Training},\nauthor={Wenyong Huang and Zhenhe Zhang and Yu Ting Yeung and Xin Jiang and Qun Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TBpg4PnXhYH}\n}", "github": "", "project": "", "reviewers": "J87m;H4es;GA9R;g6zH;fPpa", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "4;3;3;4;5", "correctness": "3;3;4;4;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "4;3;3;3;3", "wc_summary_paper": "156;27;52;104;50", "wc_summary_review": "113;100;60;71;45", "wc_main_review": "649;325;176;195;361", "wc_review": "918;452;288;370;456", "wc_reply_reviewers": "195;0;0;0;0", "wc_reply_authors": "855;489;172;43;565", "reply_reviewers": "4;0;0;0;0", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 3.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 77.8, 46.52053310098671 ], "wc_summary_review_avg": [ 77.8, 25.182533629482162 ], "wc_main_review_avg": [ 341.2, 169.74145044743787 ], "wc_review_avg": [ 496.8, 219.43418147590407 ], "wc_reply_reviewers_avg": [ 39.0, 78.0 ], "wc_reply_authors_avg": [ 424.8, 289.31947739479966 ], "reply_reviewers_avg": [ 0.8, 1.6000000000000003 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3273268353539886, "corr_recommendation_correctness": 1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7704368190007822312&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=TBpg4PnXhYH", "email": "huawei.com;;;huawei.com;huawei.com", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "TCl7CbQ29hH", "title": "CPT: Colorful Prompt Tuning for Pre-trained Vision-Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-Trained Vision-Language Models 
(VL-PTMs) have shown promising capabilities in grounding natural language in image data, facilitating a broad variety of cross-modal tasks. However, we note that there exists a significant gap between the objective forms of model pre-training and fine-tuning, resulting in a need for large amounts of labeled data to stimulate the visual grounding capability of VL-PTMs for downstream tasks. To address the challenge, we present Cross-modal Prompt Tuning (CPT, alternatively, Colorful Prompt Tuning), a novel paradigm for tuning VL-PTMs, which reformulates visual grounding into a fill-in-the-blank problem with color-based co-referential markers in image and text, maximally mitigating the gap. In this way, CPT enables strong few-shot and even zero-shot visual grounding capabilities of VL-PTMs. Comprehensive experimental results show that the prompt-tuned VL-PTMs outperform their fine-tuned counterparts by a large margin (e.g., 17.3% absolute accuracy improvement, and 73.8% relative standard deviation reduction on average with one shot in RefCOCO evaluation). All the data and codes will be available to facilitate future research.", "keywords": "Pretrained Vision-language Models;Prompt Tuning;Visual Grounding", "primary_area": "", "supplementary_material": "", "author": "Yuan Yao;Ao Zhang;Zhengyan Zhang;Zhiyuan Liu;Tat-Seng Chua;Maosong Sun", "authorids": "~Yuan_Yao12;aozhang@u.nus.edu;~Zhengyan_Zhang1;~Zhiyuan_Liu1;~Tat-Seng_Chua2;~Maosong_Sun1", "gender": "M;;M;M;;M", "homepage": "https://yaoyuanthu.github.io/;;;http://nlp.csai.tsinghua.edu.cn/~lzy;;https://www.cs.tsinghua.edu.cn/csen/info/1312/4394.htm", "dblp": ";;;53/3245-1;;95/3291-1", "google_scholar": "https://scholar.google.com.hk/citations?user=3NWfi3YAAAAJ;;;dT0v5u0AAAAJ;;https://scholar.google.com.tw/citations?user=zIgT0HMAAAAJ", "orcid": ";;;0000-0002-7709-2543;;", "linkedin": ";;;;;", "or_profile": "~Yuan_Yao12;aozhang@u.nus.edu;~Zhengyan_Zhang1;~Zhiyuan_Liu1;~Tat-Seng_Chua2;~Maosong_Sun1", "aff": "Tsinghua University;;Tsinghua University;Tsinghua University;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn", "position": "PhD student;;PhD student;Associate Professor;;Full Professor", "bibtex": "@misc{\nyao2022cpt,\ntitle={{CPT}: Colorful Prompt Tuning for Pre-trained Vision-Language Models},\nauthor={Yuan Yao and Ao Zhang and Zhengyan Zhang and Zhiyuan Liu and Tat-Seng Chua and Maosong Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=TCl7CbQ29hH}\n}", "github": "", "project": "", "reviewers": "KcLt;iZcJ;vtFS;WQNe", "site": "https://openreview.net/forum?id=TCl7CbQ29hH", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "34;75;57;77", "wc_summary_review": "89;43;45;7", "wc_main_review": "160;359;156;161", "wc_review": "283;477;258;245", "wc_reply_reviewers": "0;132;0;103", "wc_reply_authors": "404;869;339;508", "reply_reviewers": "0;2;0;1", "reply_authors": "1;3;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 60.75, 17.297037318569906 ], "wc_summary_review_avg": [ 46.0, 29.068883707497267 ], "wc_main_review_avg": [ 209.0, 86.62274528090182 ], "wc_review_avg": [ 315.75, 94.09403541139045 ], "wc_reply_reviewers_avg": [ 58.75, 59.63797028739325 ], 
"wc_reply_authors_avg": [ 530.0, 204.79379873423903 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 299, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14534520698560174065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "TD-5kgf13mH", "title": "Sparse MoEs meet Efficient Ensembles", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning models based on the aggregated outputs of submodels, either at the activation or prediction levels, lead to strong performance. We study the interplay of two popular classes of such models: ensembles of neural networks and sparse mixture of experts (sparse MoEs). First, we show that these two approaches have complementary features whose combination is beneficial. Then, we present partitioned batch ensembles, an efficient ensemble of sparse MoEs that takes the best of both classes of models. Extensive experiments on fine-tuned vision transformers demonstrate the accuracy, log-likelihood, few-shot learning, robustness, and uncertainty calibration improvements of our approach over several challenging baselines. Partitioned batch ensembles not only scale to models with up to 2.7B parameters, but also provide larger performance gains for larger models. ", "keywords": "Ensembles;Sparse MoEs;Robustness;Uncertainty Calibration;OOD detection;Efficient Ensembles;Large scale;Computer vision", "primary_area": "", "supplementary_material": "", "author": "James Urquhart Allingham;Florian Wenzel;Zelda E Mariet;Basil Mustafa;Joan Puigcerver;Neil Houlsby;Ghassen Jerfel;Vincent Fortuin;Balaji Lakshminarayanan;Jasper Snoek;Dustin Tran;Carlos Riquelme Ruiz;Rodolphe Jenatton", "authorids": "~James_Urquhart_Allingham1;~Florian_Wenzel1;~Zelda_E_Mariet1;~Basil_Mustafa1;~Joan_Puigcerver1;~Neil_Houlsby1;~Ghassen_Jerfel1;~Vincent_Fortuin1;~Balaji_Lakshminarayanan1;~Jasper_Snoek1;~Dustin_Tran1;~Carlos_Riquelme_Ruiz1;~Rodolphe_Jenatton3", "gender": "M;M;F;M;M;M;M;M;M;M;;M;M", "homepage": "https://jamesallingham.com;;https://zelda.lids.mit.edu/;https://www.basilmustafa.com/;http://www.jpuigcerver.net;https://neilhoulsby.github.io/;http://jerfel.com/;https://fortuin.github.io/;http://www.gatsby.ucl.ac.uk/~balaji/;;http://dustintran.com;https://rikel.github.io/;http://rodolphejenatton.com/", "dblp": ";04/9709;164/7319;;155/3271;91/10669;;218/7489;71/8324;95/6097;;https://dblp.uni-trier.de/pers/hd/r/Riquelme:Carlos;68/8398", "google_scholar": "CIp9adkAAAAJ;;twuEPEEAAAAJ;https://scholar.google.co.uk/citations?user=LuxZAJwAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;aDKi5l8AAAAJ;https://scholar.google.ch/citations?user=XBlrYTIAAAAJ;QYn8RbgAAAAJ;FM2DTXwAAAAJ;wVazIm8AAAAJ;Es2BBeYAAAAJ;QIR6rygAAAAJ", "orcid": ";;;;;;;0000-0002-0640-2671;;;;;", "linkedin": ";;;basil-mustafa/;;;;vincent-fortuin-42426b134/;;;;;", "or_profile": 
"~James_Urquhart_Allingham1;~Florian_Wenzel1;~Zelda_E_Mariet1;~Basil_Mustafa1;~Joan_Puigcerver1;~Neil_Houlsby1;~Ghassen_Jerfel1;~Vincent_Fortuin1;~Balaji_Lakshminarayanan1;~Jasper_Snoek1;~Dustin_Tran1;~Carlos_Riquelme_Ruiz1;~Rodolphe_Jenatton3", "aff": "University of Amsterdam;Amazon;Google;Google;Google;Google;;University of Cambridge;Google Brain;Google;Google;Google;Google", "aff_domain": "uva.nl;amazon.com;google.com;google.com;google.com;google.com;;cam.ac.uk;google.com;google.com;google.com;google.com;google.com", "position": "Researcher;Researcher;Research Scientist;Research Software Engineer;Software Engineer in Research;Researcher;;Researcher;Research Scientist;Research Scientist;Research Scientist;Researcher;Senior research scientist", "bibtex": "@misc{\nallingham2022sparse,\ntitle={Sparse MoEs meet Efficient Ensembles},\nauthor={James Urquhart Allingham and Florian Wenzel and Zelda E Mariet and Basil Mustafa and Joan Puigcerver and Neil Houlsby and Ghassen Jerfel and Vincent Fortuin and Balaji Lakshminarayanan and Jasper Snoek and Dustin Tran and Carlos Riquelme Ruiz and Rodolphe Jenatton},\nyear={2022},\nurl={https://openreview.net/forum?id=TD-5kgf13mH}\n}", "github": "", "project": "", "reviewers": "cawf;5srE;KpkQ", "site": "https://openreview.net/forum?id=TD-5kgf13mH", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;5;3", "correctness": "3;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;1;3", "wc_summary_paper": "191;50;72", "wc_summary_review": "67;16;38", "wc_main_review": "490;254;183", "wc_review": "748;320;293", "wc_reply_reviewers": "0;93;0", "wc_reply_authors": "2115;1162;815", "reply_reviewers": "0;1;0", "reply_authors": "4;2;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 104.33333333333333, 61.93724422528195 ], "wc_summary_review_avg": [ 40.333333333333336, 20.885933597094056 ], "wc_main_review_avg": [ 309.0, 131.22753776043604 ], "wc_review_avg": [ 453.6666666666667, 208.4167832866527 ], "wc_reply_reviewers_avg": [ 31.0, 43.840620433565945 ], "wc_reply_authors_avg": [ 1364.0, 549.6077389071834 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 13, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3533791708788284855&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;2;2;2;3;2;2;2;2;2", "aff_unique_norm": "University of Amsterdam;Amazon;Google;University of Cambridge", "aff_unique_dep": ";Amazon.com, Inc.;Google;", "aff_unique_url": "https://www.uva.nl;https://www.amazon.com;https://www.google.com;https://www.cam.ac.uk", "aff_unique_abbr": "UvA;Amazon;Google;Cambridge", "aff_campus_unique_index": "1;1;1;1;2;1;1;1;1;1", "aff_campus_unique": ";Mountain View;Cambridge", "aff_country_unique_index": "0;1;1;1;1;1;2;1;1;1;1;1", "aff_country_unique": "Netherlands;United States;United Kingdom" }, { "id": "TEKnz3B1jGF", "title": "Visio-Linguistic Brain Encoding", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Enabling effective brain-computer 
interfaces requires understanding how the human brain encodes stimuli across modalities such as visual, language (or text), etc. Brain encoding aims at constructing fMRI brain activity given a stimulus. There exists a plethora of neural encoding models which study brain encoding for single-mode stimuli: visual (pretrained CNNs) or text (pretrained language models). A few recent papers have also obtained separate visual and text representation models and performed late-fusion using simple heuristics. However, previous work has failed to explore: (a) the effectiveness of image Transformer models for encoding visual stimuli, and (b) co-attentive multi-modal modeling for visual and text reasoning. Further, as pretrained image Transformers and multi-modal Transformers have continued to evolve, it is important to understand whether they are becoming more brain-like and hence lead to improved brain encoding. In this paper, we systematically explore the efficacy of image Transformers (ViT, DEiT, and BEiT) and multi-modal Transformers (VisualBERT, LXMERT, ViLBERT, and CLIP) for brain encoding. Extensive experiments on two popular datasets, BOLD5000 and Pereira, provide the following insights. (1) To the best of our knowledge, we are the first to investigate the effectiveness of image and multi-modal Transformers for brain encoding. (2) Surprisingly, we observe a better encoding correlation between Transformer model layers and the levels of visual processing in the human brain when compared to CNN architectures. (3) We find that multi-modal Transformers significantly outperform previously proposed single-mode CNNs and image Transformers, as well as other previously proposed multi-modal models, thereby establishing a new state of the art. The supremacy of visio-linguistic models raises the question of whether the responses elicited in the visual regions are affected implicitly by linguistic processing even when passively viewing images. Future fMRI tasks can verify this computational insight in an appropriate experimental setting.
We make our code publicly available.", "keywords": "fMRI encoding;Vision Transformers;Multi-Modal Transformers", "primary_area": "", "supplementary_material": "/attachment/c6d9139b7ce2837f039b8c707d8edbd170e1f492.zip", "author": "SUBBA REDDY OOTA;Jashn Arora;Vijay Rowtula;Manish Gupta;Bapi Raju Surampudi", "authorids": "~SUBBA_REDDY_OOTA1;jashn.arora@research.iiit.ac.in;~Vijay_Rowtula1;manish.gupta@iiit.ac.in;~Bapi_Raju_Surampudi1", "gender": "M;;M;;", "homepage": "https://sites.google.com/view/subbareddyoota300/home?authuser=0;;;;", "dblp": "190/1709;;230/2052;;", "google_scholar": "https://scholar.google.co.in/citations?user=4Uz0LngAAAAJ;;;;", "orcid": "0000-0002-5975-622X;;;;", "linkedin": "subba-reddy-oota-11a91254/;;;;", "or_profile": "~SUBBA_REDDY_OOTA1;jashn.arora@research.iiit.ac.in;~Vijay_Rowtula1;manish.gupta@iiit.ac.in;~Bapi_Raju_Surampudi1", "aff": "MPI-SWS;;International Institute of Information Technology Hyderabad, Dhirubhai Ambani Institute Of Information and Communication Technology;;", "aff_domain": "mpi-sws.org;;iiit.ac.in;;", "position": "Visiting Scholar;;MS student;;", "bibtex": "@misc{\noota2022visiolinguistic,\ntitle={Visio-Linguistic Brain Encoding},\nauthor={SUBBA REDDY OOTA and Jashn Arora and Vijay Rowtula and Manish Gupta and Bapi Raju Surampudi},\nyear={2022},\nurl={https://openreview.net/forum?id=TEKnz3B1jGF}\n}", "github": "", "project": "", "reviewers": "aB2s;JeNH;Jx7S;aiyT", "site": "https://openreview.net/forum?id=TEKnz3B1jGF", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;5;3", "correctness": "2;2;2;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;3;4;2", "wc_summary_paper": "115;185;60;84", "wc_summary_review": "82;125;157;153", "wc_main_review": "657;667;723;373", "wc_review": "854;977;940;610", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 111.0, 46.96275119709236 ], "wc_summary_review_avg": [ 129.25, 29.93639089803579 ], "wc_main_review_avg": [ 605.0, 136.28646301082145 ], "wc_review_avg": [ 845.25, 142.96393776054157 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17473532278228081138&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1", "aff_unique_norm": "Max Planck Institute for Software Systems;International Institute of Information Technology Hyderabad", "aff_unique_dep": ";", "aff_unique_url": "https://www.mpi-sws.org;https://www.iiit.ac.in", "aff_unique_abbr": "MPI-SWS;IIIT Hyderabad", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hyderabad", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;India" }, { "id": "TEt7PsVZux6", "title": "I-PGD-AT: Efficient Adversarial Training via Imitating Iterative PGD Attack", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial training has been widely used in various machine learning paradigms to improve robustness; however, it increases
the training cost due to the perturbation optimization process. To improve efficiency, recent studies leverage the Fast Gradient Sign Method with Random Start (FGSM-RS) for adversarial training. However, such methods lead to relatively low robustness and catastrophic overfitting, meaning that the robustness against iterative attacks (e.g., Projected Gradient Descent (PGD)) suddenly drops to 0%. Different approaches have been proposed to address this problem, but later studies show that catastrophic overfitting still remains. In this paper, motivated by the fact that expensive iterative adversarial training methods achieve high robustness without catastrophic overfitting, we aim to ask: Can we perform iterative adversarial training in an efficient way? To this end, we first analyze the difference between the perturbations generated by FGSM-RS and PGD and find that PGD tends to craft diverse discrete values instead of the $\\pm 1$ values of FGSM-RS. Based on this observation, we propose an efficient single-step adversarial training method, I-PGD-AT, which adopts the I-PGD attack for training, where I-PGD virtually imitates PGD. Unlike FGSM, which crafts the perturbation directly using the sign of the gradient, I-PGD imitates the perturbation of PGD based on the magnitude of the gradient. Extensive empirical evaluations on CIFAR-10 and Tiny ImageNet demonstrate that our I-PGD-AT can improve robustness compared with the baselines and significantly delay catastrophic overfitting. Moreover, we explore and discuss the factors that affect catastrophic overfitting. Finally, to demonstrate the generality of I-PGD-AT, we integrate it into PGD adversarial training and show that it can further improve robustness.", "keywords": "Single-step Adversarial Training;Catastrophic Overfitting;Adversarial Robustness;Adversarial Example", "primary_area": "", "supplementary_material": "/attachment/02abd47a06c8ac512d2f14b673e5a3b411e07fb5.zip", "author": "Xiaosen Wang;Bhavya Kailkhura;Krishnaram Kenthapadi;Bo Li", "authorids": "~Xiaosen_Wang1;~Bhavya_Kailkhura1;~Krishnaram_Kenthapadi1;~Bo_Li19", "gender": "M;M;M;F", "homepage": "https://xiaosen-wang.github.io/;https://people.llnl.gov/kailkhura1;https://cs.stanford.edu/people/kngk/;http://boli.cs.illinois.edu/", "dblp": "241/6284;132/8938;29/4781;50/3402-26", "google_scholar": "sVeDOcsAAAAJ;SQpJmOgAAAAJ;av5rGaEAAAAJ;K8vJkTcAAAAJ", "orcid": ";;0000-0003-1237-087X;", "linkedin": ";;krishnaramkenthapadi/;", "or_profile": "~Xiaosen_Wang1;~Bhavya_Kailkhura1;~Krishnaram_Kenthapadi1;~Bo_Li19", "aff": "Huazhong University of Science and Technology;Lawrence Livermore National Laboratory;Fiddler AI;University of Illinois, Urbana Champaign", "aff_domain": "hust.edu.cn;llnl.gov;fiddler.ai;illinois.edu", "position": "MS student;Research Staff;Chief Scientist;Assistant Professor", "bibtex": "@misc{\nwang2022ipgdat,\ntitle={I-{PGD}-{AT}: Efficient Adversarial Training via Imitating Iterative {PGD} Attack },\nauthor={Xiaosen Wang and Bhavya Kailkhura and Krishnaram Kenthapadi and Bo Li},\nyear={2022},\nurl={https://openreview.net/forum?id=TEt7PsVZux6}\n}", "github": "", "project": "", "reviewers": "bLbt;Dz2K;hVJW;AAHj", "site": "https://openreview.net/forum?id=TEt7PsVZux6", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;5;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;4;4", "empirical_novelty": "2;2;4;0", "wc_summary_paper": "66;68;212;77", "wc_summary_review": "26;71;100;139", "wc_main_review": "355;293;540;355", "wc_review": "447;432;852;571", "wc_reply_reviewers":
"255;0;0;0", "wc_reply_authors": "1402;535;1302;651", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 1.0 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 105.75, 61.48322942071277 ], "wc_summary_review_avg": [ 84.0, 41.27347816697788 ], "wc_main_review_avg": [ 385.75, 92.5834083408037 ], "wc_review_avg": [ 575.5, 168.50593461359156 ], "wc_reply_reviewers_avg": [ 63.75, 110.41823898251593 ], "wc_reply_authors_avg": [ 972.5, 383.3435143575537 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3gbikplAxQUJ:scholar.google.com/&scioq=I-PGD-AT:+Efficient+Adversarial+Training+via+Imitating+Iterative+PGD+Attack&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Huazhong University of Science and Technology;Lawrence Livermore National Laboratory;Fiddler AI;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.hust.edu.cn;https://www.llnl.gov;https://www.fiddler.ai;https://illinois.edu", "aff_unique_abbr": "HUST;LLNL;Fiddler AI;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "id": "TFzHbrMveuZ", "title": "Knowledge Graph Completion as Tensor Decomposition: A Genreal Form and Tensor N-rank Regularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Knowledge graph completion (KGC) is a 3rd-order binary tensor completion task. Tensor decomposition based (TDB) models have shown great performance in KGC. In this paper, we summarize existing TDB models and derive a general form for them. Based on the general form, we show the principles of model design to satisfy logical rules. However, these models suffer from the overfitting problem severely. Therefore, we propose a regularization term based on the tensor $n$-rank which enforces the low-rankness of the tensor. First, we relax the tensor $n$-rank to the sum of the nuclear norms of the unfolding matrix along each mode of the tensor. In order to be computationally efficient, we further give an upper bound of the sum of the nuclear norms. Finally, we use the upper bound as the regularization term to achieve low-rank matrix decomposition of each unfolding matrix. 
Experiments show that our model achieves state-of-the-art performance on benchmark datasets.", "keywords": "Knowledge Graph;Tensor Decomposition;Low-rank Tensor Completion", "primary_area": "", "supplementary_material": "/attachment/be22fc822767c16a1f2226db309730a671986c50.zip", "author": "Changyi Xiao;Xiangnan He;Yixin Cao", "authorids": "~Changyi_Xiao1;~Xiangnan_He1;~Yixin_Cao2", "gender": "M;M;M", "homepage": ";http://staff.ustc.edu.cn/~hexn;https://sites.google.com/view/yixin-homepage", "dblp": "270/8871;59/1007;20/8038-2", "google_scholar": "0_fwA4QAAAAJ;https://scholar.google.com.sg/citations?user=X45Go24AAAAJ;https://scholar.google.co.uk/citations?user=CnhTvdoAAAAJ", "orcid": ";0000-0001-8472-7992;", "linkedin": ";;", "or_profile": "~Changyi_Xiao1;~Xiangnan_He1;~Yixin_Cao2", "aff": ";University of Science and Technology of China;Nanyang Technological University", "aff_domain": ";ustc.edu.cn;ntu.edu.sg", "position": ";Professor;Assistant Professor", "bibtex": "@misc{\nxiao2022knowledge,\ntitle={Knowledge Graph Completion as Tensor Decomposition: A Genreal Form and Tensor N-rank Regularization},\nauthor={Changyi Xiao and Xiangnan He and Yixin Cao},\nyear={2022},\nurl={https://openreview.net/forum?id=TFzHbrMveuZ}\n}", "github": "", "project": "", "reviewers": "dW7S;Tq93;skay", "site": "https://openreview.net/forum?id=TFzHbrMveuZ", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "3;4;3", "correctness": "4;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "120;32;86", "wc_summary_review": "51;35;10", "wc_main_review": "365;183;121", "wc_review": "536;250;217", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 79.33333333333333, 36.23380864453651 ], "wc_summary_review_avg": [ 32.0, 16.87206764645835 ], "wc_main_review_avg": [ 223.0, 103.55030983375504 ], "wc_review_avg": [ 334.3333333333333, 143.23484988724715 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gUZMFiuPJaQJ:scholar.google.com/&scioq=Knowledge+Graph+Completion+as+Tensor+Decomposition:+A+Genreal+Form+and+Tensor+N-rank+Regularization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Science and Technology of China;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ntu.edu.sg", "aff_unique_abbr": "USTC;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Singapore" }, { "id": "TGfj2P_410X", "title": "On the Effect of Input Perturbations for Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The expressive power of a message passing graph neural network (MPGNN) depends on its architecture and the input node attributes. 
In this work, we study how this interplay is affected by input perturbations. First, perturbations of node attributes may act as noise and hinder predictive power. However, perturbations can also aid expressiveness by making nodes more identifiable. Recent works show that unique node IDs are necessary to represent certain functions with MPGNNs. Our results relate properties of the noise, the smoothness of the model, and the geometry of the input graphs and task. In particular, we take the perspective of lower bounding smoothness for achieving discrimination: how much output variation is needed for exploiting random node IDs, or for retaining discriminability? Our theoretical results imply constraints on the model for exploiting random node IDs, and, conversely, insights into the tolerance of a given model class for retaining discrimination with perturbations of node attributes.", "keywords": "graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Behrooz Tahmasebi;Stefanie Jegelka", "authorids": "~Behrooz_Tahmasebi1;~Stefanie_Jegelka3", "gender": "M;F", "homepage": "https://people.csail.mit.edu/bzt/;http://people.csail.mit.edu/stefje/", "dblp": "223/0884;38/7003", "google_scholar": "ZXCO3DMAAAAJ;gTWUZlsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Behrooz_Tahmasebi1;~Stefanie_Jegelka3", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\ntahmasebi2022on,\ntitle={On the Effect of Input Perturbations for Graph Neural Networks},\nauthor={Behrooz Tahmasebi and Stefanie Jegelka},\nyear={2022},\nurl={https://openreview.net/forum?id=TGfj2P_410X}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=TGfj2P_410X", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Gnn7i5FgyAoJ:scholar.google.com/&scioq=On+the+Effect+of+Input+Perturbations+for+Graph+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "TH7crDRRND", "title": "Revisiting Locality-Sensitive Binary Codes from Random Fourier Features", "track": "main", "status": "Reject", "tldr": "", "abstract": "The method of Random Fourier Features (RFF), which generates non-linear random features of the data, has been popular for large-scale learning.
It has also been used to construct binary codes via stochastic quantization for efficient information retrieval. In this paper, we revisit binary hashing from RFF, and propose SignRFF, a new and simple strategy to extract RFF-based binary codes. We show the locality-sensitivity of SignRFF, and propose a new measure, called ranking efficiency, to theoretically compare different Locality-Sensitive Hashing (LSH) methods with practical implications. Experiments are conducted to show that the proposed SignRFF is consistently better than the previous RFF-based method, and also outperforms other data-dependent and deep-learning-based hashing methods with a sufficient number of hash bits. Moreover, we also validate that the proposed ranking efficiency aligns well with the empirical search performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoyun Li;Ping Li", "authorids": "~Xiaoyun_Li1;~Ping_Li3", "gender": "M;M", "homepage": "https://lixiaoyun0239.github.io/cv/;http://www.stat.rutgers.edu/home/pingli/", "dblp": ";62/5860-1", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Xiaoyun_Li1;~Ping_Li3", "aff": "Baidu;LinkedIn", "aff_domain": "baidu.com;linkedin.com", "position": "Researcher;Engineer", "bibtex": "@misc{\nli2022revisiting,\ntitle={Revisiting Locality-Sensitive Binary Codes from Random Fourier Features},\nauthor={Xiaoyun Li and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=TH7crDRRND}\n}", "github": "", "project": "", "reviewers": "3RfW;TrWv;tti8;Mduj", "site": "https://openreview.net/forum?id=TH7crDRRND", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;3;4;2", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;3;3", "wc_summary_paper": "10;54;73;61", "wc_summary_review": "12;53;35;13", "wc_main_review": "131;310;427;36", "wc_review": "153;417;535;110", "wc_reply_reviewers": "0;139;168;0", "wc_reply_authors": "412;1018;1214;34", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 49.5, 23.79600806858159 ], "wc_summary_review_avg": [ 28.25, 16.990806337546196 ], "wc_main_review_avg": [ 226.0, 152.13645191077646 ], "wc_review_avg": [ 303.75, 177.88110495496704 ], "wc_reply_reviewers_avg": [ 76.75, 77.43182485257596 ], "wc_reply_authors_avg": [ 669.5, 471.1950233183708 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9gGV0vsVgD8J:scholar.google.com/&scioq=Revisiting+Locality-Sensitive+Binary+Codes+from+Random+Fourier+Features&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "THMafOyRVpE", "title": "Fully Online Meta-Learning Without Task Boundaries", "track": "main", "status": "Reject",
"tldr": "", "abstract": "While deep networks can learn complex classifiers and models, many applications require models that continually adapt to changing input distributions, changing tasks, and changing environmental conditions. Indeed, this ability to continuously accrue knowledge and use the past experience to learn new tasks quickly in continual settings is one of the key properties of an intelligent system. For complex and high-dimensional problems, simply updating the model continually with standard learning algorithms such as gradient descent may result in slow adaptation. Meta-learning can provide a powerful tool to accelerate adaptation but is conventionally studied in batch settings. In this paper, we study how meta-learning can be applied to tackle online problems of this nature, simultaneously adapting to online to changing tasks and input distributions and meta-training the model in order to adapt more quickly in the future. Extending meta-learning into the online setting presents its own challenges, and although several prior methods have studied related problems, they generally require a discrete notion of tasks, with known ground-truth task boundaries. Such methods typically adapt to each task in sequence, resetting the model between tasks, rather than adapting continuously across tasks. In many real-world settings, such discrete boundaries are unavailable, and may not even exist. To address these settings, we propose a Fully Online Meta-Learning (FOML) algorithm, which does not require any ground truth knowledge about the task boundaries and stays fully online without resetting back to pre-trained weights. Our experiments show that FOML was able to learn new tasks faster than the state-of-the-art online learning methods on Rainbow-MNIST, and CIFAR100 datasets.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/78f17afe1d301f81b1c3f330f85084589258d562.zip", "author": "Jathushan Rajasegaran;Chelsea Finn;Sergey Levine", "authorids": "~Jathushan_Rajasegaran2;~Chelsea_Finn1;~Sergey_Levine1", "gender": "M;F;M", "homepage": "https://brjathu.github.io/;https://ai.stanford.edu/~cbfinn/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "211/4065;131/1783;80/7594", "google_scholar": ";vfPE6hgAAAAJ;8R35rCwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jathushan_Rajasegaran2;~Chelsea_Finn1;~Sergey_Levine1", "aff": "University of California, Berkeley;Google;Google", "aff_domain": "berkeley.edu;google.com;google.com", "position": "PhD student;Research Scientist;Research Scientist", "bibtex": "@misc{\nrajasegaran2022fully,\ntitle={Fully Online Meta-Learning Without Task Boundaries},\nauthor={Jathushan Rajasegaran and Chelsea Finn and Sergey Levine},\nyear={2022},\nurl={https://openreview.net/forum?id=THMafOyRVpE}\n}", "github": "", "project": "", "reviewers": "dcpm;me6U;XzKp;dEVK", "site": "https://openreview.net/forum?id=THMafOyRVpE", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;4;3;5", "correctness": "3;2;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "115;94;138;102", "wc_summary_review": "49;133;45;125", "wc_main_review": "439;320;429;279", "wc_review": "603;547;612;506", "wc_reply_reviewers": "68;0;0;100", "wc_reply_authors": "803;508;612;479", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], 
"empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 112.25, 16.64894891577243 ], "wc_summary_review_avg": [ 88.0, 41.12177038990418 ], "wc_main_review_avg": [ 366.75, 68.8853213681986 ], "wc_review_avg": [ 567.0, 43.1335136523794 ], "wc_reply_reviewers_avg": [ 42.0, 43.497126341863094 ], "wc_reply_authors_avg": [ 600.5, 126.9419158513058 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2982258306798324452&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Progressive Distillation for Fast Sampling of Diffusion Models", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6537", "id": "TIdIXIpzhoI", "poster": "", "openreview": "https://openreview.net/forum?id=TIdIXIpzhoI", "slides": "https://iclr.cc/virtual/2022/poster/6537", "video": "https://iclr.cc/virtual/2022/poster/6537", "author_site": "Tim Salimans, Jonathan Ho", "tldr": "", "abstract": "Diffusion models have recently shown great promise for generative modeling, outperforming GANs on perceptual quality and autoregressive models at density estimation. A remaining downside is their slow sampling time: generating high quality samples takes many hundreds or thousands of model evaluations. Here we make two contributions to help eliminate this downside: First, we present new parameterizations of diffusion models that provide increased stability when using few sampling steps, compared to models in the literature. Second, we present a method to distill a trained deterministic diffusion sampler, using many steps, into a new diffusion model that takes half as many sampling steps. We then keep progressively applying this distillation procedure to our model, halving the number of required sampling steps each time. On standard image generation benchmarks like CIFAR-10, ImageNet, and LSUN, we start out with (near) state-of-the-art samplers taking 1024 or 8192 steps, and are able to distill down to models taking as little as 4 steps without losing much perceptual quality; achieving, for example, a FID of 3.0 on CIFAR-10 in 4 steps. 
Finally, we show that the full progressive distillation procedure does not take more time than it takes to train the original model, thus representing an efficient solution for generative modeling using diffusion at both train and test time.", "keywords": "Diffusion Models;Generative Models;fast sampling", "primary_area": "", "supplementary_material": "/attachment/1cadccb57b83fb0be6ce5eedae57e5c115462506.zip", "author": "Tim Salimans;Jonathan Ho", "authorids": "~Tim_Salimans1;~Jonathan_Ho1", "gender": "M;", "homepage": ";", "dblp": "116/2791;80/8677", "google_scholar": ";iVLAQysAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Tim_Salimans1;~Jonathan_Ho1", "aff": "Google;Google", "aff_domain": "google.com;google.com", "position": "Research Scientist;Researcher", "bibtex": "@inproceedings{\nsalimans2022progressive,\ntitle={Progressive Distillation for Fast Sampling of Diffusion Models},\nauthor={Tim Salimans and Jonathan Ho},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TIdIXIpzhoI}\n}", "github": "", "project": "", "reviewers": "HZnL;orkh;2wPQ;3Sjb", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;3;4", "correctness": "4;3;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "158;120;145;65", "wc_summary_review": "95;68;130;13", "wc_main_review": "807;381;454;180", "wc_review": "1060;569;729;258", "wc_reply_reviewers": "139;43;0;0", "wc_reply_authors": "1204;847;495;221", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 122.0, 35.63004350263974 ], "wc_summary_review_avg": [ 76.5, 42.746344872983 ], "wc_main_review_avg": [ 455.5, 226.3873892247534 ], "wc_review_avg": [ 654.0, 289.1807393309589 ], "wc_reply_reviewers_avg": [ 45.5, 56.764865894318824 ], "wc_reply_authors_avg": [ 691.75, 369.73596998398733 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1365, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5194434213555432016&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=TIdIXIpzhoI", "email": "google.com;google.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "TJF4wbKTxJf", "title": "Learning Lightweight Neural Networks via Channel-Split Recurrent Convolution", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Lightweight neural networks refer to deep networks with small numbers of parameters, which are allowed to be implemented in resource-limited hardware such as embedded systems. 
To learn such lightweight networks effectively and efficiently, in this paper we propose a novel convolutional layer, namely {\\em Channel-Split Recurrent Convolution (CSR-Conv)}, where we split the output channels to generate data sequences with length $T$ as the input to the recurrent layers with shared weights. As a consequence, we can construct lightweight convolutional networks by simply replacing (some) linear convolutional layers with CSR-Conv layers. We prove that under mild conditions the model size decreases with the rate of $O(\\frac{1}{T^2})$. Empirically we demonstrate the state-of-the-art performance using VGG-16, ResNet-50, ResNet-56, ResNet-110, DenseNet-40, MobileNet, and EfficientNet as backbone networks on CIFAR-10 and ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/2838f8c5f14b506c2e3b6a8d2c20525fba0dc5f9.zip", "author": "Guojun Wu;Yun Yue;Yanhua Li;Ziming Zhang", "authorids": "~Guojun_Wu1;~Yun_Yue1;~Yanhua_Li1;~Ziming_Zhang4", "gender": ";;M;M", "homepage": "https://users.wpi.edu/~gwu/;https://yunyuny.github.io/;http://www.wpi.edu/~yli15/;https://zimingzhang.wordpress.com/", "dblp": ";;;", "google_scholar": ";Xuz8JrkAAAAJ;https://scholar.google.com.tw/citations?user=ICOWtt0AAAAJ;2yqx3oIAAAAJ", "orcid": ";;0000-0001-8972-503x;", "linkedin": ";;;", "or_profile": "~Guojun_Wu1;~Yun_Yue1;~Yanhua_Li1;~Ziming_Zhang1", "aff": ";Worcester Polytechnic Institute;Worcester Polytechnic Institute;Worcester Polytechnic Institute", "aff_domain": ";wpi.edu;wpi.edu;wpi.edu", "position": ";PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nwu2022learning,\ntitle={Learning Lightweight Neural Networks via Channel-Split Recurrent Convolution},\nauthor={Guojun Wu and Yun Yue and Yanhua Li and Ziming Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=TJF4wbKTxJf}\n}", "github": "", "project": "", "reviewers": "XPXb;t3xY;Rod8;G1g1", "site": "https://openreview.net/forum?id=TJF4wbKTxJf", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "63;35;94;43", "wc_summary_review": "56;22;38;99", "wc_main_review": "231;216;260;24", "wc_review": "350;273;392;166", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 58.75, 22.76373211931646 ], "wc_summary_review_avg": [ 53.75, 28.760867511255636 ], "wc_main_review_avg": [ 182.75, 93.0090721381522 ], "wc_review_avg": [ 295.25, 85.96329158425705 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18374736025439006403&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;0", "aff_unique_norm": "Worcester Polytechnic Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.wpi.edu", "aff_unique_abbr": "WPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { 
"id": "TKMJ9eqtpgP", "title": "DiffusionCLIP: Text-guided Image Manipulation Using Diffusion Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Diffusion models are recent generative models that have shown great success in image generation with the state-of-the-art performance. However, only a few researches have been conducted for image manipulation with diffusion models. Here, we present a novel DiffusionCLIP which performs text-driven image manipulation with diffusion models using Contrastive Language\u2013Image Pre-training (CLIP) loss. Our method has a performance comparable to that of the modern GAN-based image processing methods for in and out-of-domain image processing tasks, with the advantage of almost perfect inversion even without additional encoders or optimization. Furthermore, our method can be easily used for various novel applications, enabling image translation from an unseen domain to another unseen domain or stroke-conditioned image generation in an unseen domain, etc. Finally, we present a novel multiple attribute control with DiffusionCLIP by combining multiple fine-tuned diffusion models.", "keywords": "Diffusion models;CLIP;Image manipulation;Image to image translation", "primary_area": "", "supplementary_material": "", "author": "Gwanghyun Kim;Jong Chul Ye", "authorids": "~Gwanghyun_Kim1;~Jong_Chul_Ye1", "gender": ";M", "homepage": "https://gwang-kim.github.io/;https://bispl.weebly.com/", "dblp": "02/7013;15/5613", "google_scholar": "https://scholar.google.co.kr/citations?user=SCLtNC4AAAAJ;HNMjoNEAAAAJ", "orcid": "0000-0001-6570-236X;", "linkedin": "gwanghyun-bradley-kim/;", "or_profile": "~Gwanghyun_Kim1;~Jong_Chul_Ye1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "MS student;Full Professor", "bibtex": "@misc{\nkim2022diffusionclip,\ntitle={Diffusion{CLIP}: Text-guided Image Manipulation Using Diffusion Models},\nauthor={Gwanghyun Kim and Jong Chul Ye},\nyear={2022},\nurl={https://openreview.net/forum?id=TKMJ9eqtpgP}\n}", "github": "", "project": "", "reviewers": "Y3Xi;4iGG;1ZWo", "site": "https://openreview.net/forum?id=TKMJ9eqtpgP", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "85;44;32", "wc_summary_review": "41;33;35", "wc_main_review": "612;195;240", "wc_review": "738;272;307", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 53.666666666666664, 22.691163233490013 ], "wc_summary_review_avg": [ 36.333333333333336, 3.39934634239519 ], "wc_main_review_avg": [ 349.0, 186.87428929630744 ], "wc_review_avg": [ 439.0, 211.9072123988862 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6631579048472018606&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced 
Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "TKrlyiqKWB", "title": "Prototype Based Classification from Hierarchy to Fairness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Artificial neural nets can represent and classify many types of high-dimensional data but are often tailored to particular applications -- e.g., for ``fair'' or ``hierarchical'' classification. Once an architecture has been selected, it is often difficult for humans to adjust models for a new task; for example, a hierarchical classifier cannot be easily transformed into a fair classifier that shields a protected field. Our contribution in this work is a new neural network architecture, the concept subspace network (CSN), which generalizes existing specialized classifiers to produce a unified model capable of learning a spectrum of multi-concept relationships. We demonstrate that CSNs reproduce state-of-the-art results in fair classification when enforcing concept independence, may be transformed into hierarchical classifiers, or may even reconcile fairness and hierarchy within a single classifier. The CSN is inspired by and matches the performance of existing prototype-based classifiers that promote interpretability.", "keywords": "prototypes;fairness;hierarchy;neural network;encoding", "primary_area": "", "supplementary_material": "", "author": "Mycal Tucker;Julie Shah", "authorids": "~Mycal_Tucker1;~Julie_Shah1", "gender": "M;", "homepage": "http://mycaltucker.com;", "dblp": "256/5146;", "google_scholar": "V1kgcxIAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Mycal_Tucker1;~Julie_Shah1", "aff": "Massachusetts Institute of Technology;Association for the Advancement of Artificial Intelligence", "aff_domain": "mit.edu;", "position": "PhD student;", "bibtex": "@misc{\ntucker2022prototype,\ntitle={Prototype Based Classification from Hierarchy to Fairness},\nauthor={Mycal Tucker and Julie Shah},\nyear={2022},\nurl={https://openreview.net/forum?id=TKrlyiqKWB}\n}", "github": "", "project": "", "reviewers": "aHhN;BhqD;fPSK;z2bK", "site": "https://openreview.net/forum?id=TKrlyiqKWB", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "3;3;4;3", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "31;65;106;84", "wc_summary_review": "15;11;61;10", "wc_main_review": "320;441;686;235", "wc_review": "366;517;853;329", "wc_reply_reviewers": "0;0;41;268", "wc_reply_authors": "778;798;434;913", "reply_reviewers": "0;0;1;2", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 71.5, 27.518175811634027 ], "wc_summary_review_avg": [ 24.25, 21.299941314473145 ], "wc_main_review_avg": [ 420.5, 169.86833136285293 ], "wc_review_avg": [ 516.25, 206.78415679156853 ], "wc_reply_reviewers_avg": [ 77.25, 111.39428845322367 ], "wc_reply_authors_avg": [ 730.75, 178.90692412536748 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 1.0, "gs_citation": 6, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=11530419927101336822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Association for the Advancement of Artificial Intelligence", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.aaai.org", "aff_unique_abbr": "MIT;AAAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "TLgW66V2CbP", "title": "Self-Supervised Learning by Estimating Twin Class Distributions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present TWIST, a novel self-supervised representation learning method by classifying large-scale unlabeled datasets in an end-to-end way. We employ a siamese network terminated by a softmax operation to produce twin class distributions of two augmented images. Without supervision, we enforce the class distributions of different augmentations to be consistent. In the meantime, we regularize the class distributions to make them sharp and diverse. Specifically, we minimize the entropy of the distribution for each sample to make the class prediction for each sample assertive and maximize the entropy of the mean distribution to make the predictions of different samples diverse. In this way, TWIST can naturally avoid the trivial solutions without specific designs such as asymmetric network, stop-gradient operation, or momentum encoder. Different from the clustering-based methods which alternate between clustering and learning, our method is a single learning process guided by a unified loss function. As a result, TWIST outperforms state-of-the-art methods on a wide range of tasks, including unsupervised classification, linear classification, semi-supervised learning, transfer learning, and some dense prediction tasks such as detection and segmentation.", "keywords": "self-supervised learning;unsupervised learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Feng Wang;Tao Kong;Rufeng Zhang;Huaping Liu;Hang Li", "authorids": "~Feng_Wang12;~Tao_Kong3;~Rufeng_Zhang1;~Huaping_Liu3;~Hang_Li4", "gender": "M;M;M;M;M", "homepage": ";http://www.taokong.org;https://github.com/zzzzzz0407;https://sites.google.com/site/thuliuhuaping/;https://hangli-hl.github.io/", "dblp": "90/4225-34;01/2492;;69/1097-1;https://dblp.org/pers/hd/l/Li_0001:Hang", "google_scholar": "bKG4Un8AAAAJ;kSUXLPkAAAAJ;THbhxeYAAAAJ;https://scholar.google.com.hk/citations?user=HXnkIkwAAAAJ;nTl5mSwAAAAJ", "orcid": ";;0000-0001-5984-728X;;0000-0001-9628-3487", "linkedin": ";;;;hang-li-84aa6314/", "or_profile": "~Feng_Wang12;~Tao_Kong3;~Rufeng_Zhang1;~Huaping_Liu3;~Hang_Li4", "aff": "Tsinghua University;Bytedance;Tongji University;Tsinghua University;ByteDance Technology", "aff_domain": "tsinghua.edu.cn;bytedance.com;tongji.edu.cn;tsinghua.edu.cn;bytedance.com", "position": "PhD student;Researcher;PhD student;Full Professor;Head of Research", "bibtex": "@misc{\nwang2022selfsupervised,\ntitle={Self-Supervised Learning by Estimating Twin Class Distributions},\nauthor={Feng Wang and Tao Kong and Rufeng Zhang and Huaping Liu and Hang Li},\nyear={2022},\nurl={https://openreview.net/forum?id=TLgW66V2CbP}\n}", "github": "", "project": "", "reviewers": "33at;H2Vk;GuYv;83yH", "site": "https://openreview.net/forum?id=TLgW66V2CbP", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;5;4;4", "correctness": "3;4;3;3", 
"technical_novelty": "1;3;2;2", "empirical_novelty": "1;0;2;2", "wc_summary_paper": "63;68;68;224", "wc_summary_review": "23;107;59;69", "wc_main_review": "140;124;421;408", "wc_review": "226;299;548;701", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 105.75, 68.3021778569322 ], "wc_summary_review_avg": [ 64.5, 29.912372022292047 ], "wc_main_review_avg": [ 273.25, 141.43792808154396 ], "wc_review_avg": [ 443.5, 190.6652826290093 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=248902349160897339&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "Tsinghua University;ByteDance;Tongji University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.bytedance.com;https://www.tongji.edu.cn", "aff_unique_abbr": "THU;Bytedance;Tongji", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "TLnReGgZEdW", "title": "Generalization in Deep RL for TSP Problems via Equivariance and Local Search", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep reinforcement learning (RL) has proved to be a competitive heuristic for solving small-sized instances of traveling salesman problems (TSP), but its performance on larger-sized instances is insufficient. Since training on large instances is impractical, we design a novel deep RL approach with a focus on generalizability. Our proposition consisting of a simple deep learning architecture that learns with novel RL training techniques exploits two main ideas. First, we exploit equivariance to facilitate training. Second, we interleave efficient local search heuristics with the usual RL training to smooth the value landscape. In order to validate the whole approach, we empirically evaluate our proposition on random and realistic TSP problems against relevant state-of-the-art deep RL methods. 
Moreover, we present an ablation study to understand the contribution of each of its components.", "keywords": "Deep Reinforcement Learning;Travelling salesman problem;Curriculum Learning;Equivariance;Local Search", "primary_area": "", "supplementary_material": "", "author": "Wenbin Ouyang;Yisen Wang;Paul Weng;Shaochen Han", "authorids": "~Wenbin_Ouyang1;~Yisen_Wang2;~Paul_Weng1;~Shaochen_Han1", "gender": "M;M;M;", "homepage": ";;http://weng.fr;", "dblp": "169/1321;;http://dblp.uni-trier.de/pers/hd/w/Weng:Paul;", "google_scholar": ";;_Hd6AeQAAAAJ;", "orcid": ";;;my-orcid?orcid=0000-0002-4575-0767", "linkedin": "wenbin-ouyang-996180208/;yisen-wang-7308751a0/;paul-weng-69a15980/;", "or_profile": "~Wenbin_Ouyang1;~Yisen_Wang2;~Paul_Weng1;~Shaochen_Han1", "aff": "Shanghai Jiao Tong University, Tsinghua University;University of Michigan;Shanghai Jiaotong University;Shanghai Jiao Tong University, Tsinghua University", "aff_domain": "sjtu.edu.cn;umich.edu;sjtu.edu.cn;sjtu.edu.cn", "position": "Undergrad student;MS student;Associate Professor;Undergrad student", "bibtex": "@misc{\nouyang2022generalization,\ntitle={Generalization in Deep {RL} for {TSP} Problems via Equivariance and Local Search},\nauthor={Wenbin Ouyang and Yisen Wang and Paul Weng and Shaochen Han},\nyear={2022},\nurl={https://openreview.net/forum?id=TLnReGgZEdW}\n}", "github": "", "project": "", "reviewers": "n7h8;XHWq;13UN;eA3s", "site": "https://openreview.net/forum?id=TLnReGgZEdW", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;5", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "94;146;102;139", "wc_summary_review": "76;94;132;242", "wc_main_review": "274;338;216;842", "wc_review": "444;578;450;1223", "wc_reply_reviewers": "0;39;31;0", "wc_reply_authors": "803;325;752;1306", "reply_reviewers": "0;1;1;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 120.25, 22.56518335843961 ], "wc_summary_review_avg": [ 136.0, 64.45153217728807 ], "wc_main_review_avg": [ 417.5, 248.85487738840885 ], "wc_review_avg": [ 673.75, 321.5947566425796 ], "wc_reply_reviewers_avg": [ 17.5, 17.727097901235837 ], "wc_reply_authors_avg": [ 796.5, 347.82359034430084 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5262350664909200956&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;University of Michigan", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.umich.edu", "aff_unique_abbr": "SJTU;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "TN-W4p7H2pK", "title": "Conditional Generative Quantile Networks via Optimal Transport and Convex Potentials", "track": "main", "status": "Reject", "tldr": "", "abstract": "Quantile regression has a natural extension to generative modelling by leveraging stronger pointwise convergence rather than convergence in distribution. 
While the pinball quantile loss works in the scalar case, it does not have a provable extension to the vector case. In this work, we consider a quantile approach to generative modelling using optimal transport with provable guarantees. We show that by optimizing smooth functions with respect to the dual of the correlation maximization problem, the optimum is almost surely convex, and we hence construct a Brenier map as our generative quantile network. Furthermore, we introduce conditional generative modelling with a Kantorovich dual objective by constructing an affine latent model with respect to the covariates. Through extensive experiments on synthetic and real datasets for conditional generative and probabilistic forecasting tasks, we demonstrate the efficacy and versatility of our theoretically motivated model as a distribution estimator and conditioner.", "keywords": "Optimal Transport;Generative Models;Quantile Functions;Time-Series Forecasting;Image Generation", "primary_area": "", "supplementary_material": "/attachment/ada34caaaf2e5da7ba6626688364d12a69adc069.zip", "author": "Jesse Sun;Dihong Jiang;Yaoliang Yu", "authorids": "~Jesse_Sun1;~Dihong_Jiang1;~Yaoliang_Yu1", "gender": ";M;M", "homepage": "https://github.com/sunjesse;https://dihjiang.github.io/;https://cs.uwaterloo.ca/~y328yu/", "dblp": ";234/8064;90/4989", "google_scholar": ";Cen4GYkAAAAJ;https://scholar.google.ca/citations?user=zbXIQMsAAAAJ", "orcid": ";;0000-0002-3823-0720", "linkedin": ";;", "or_profile": "~Jesse_Sun1;~Dihong_Jiang1;~Yaoliang_Yu1", "aff": "University of Waterloo;University of Waterloo;University of Waterloo", "aff_domain": "uwaterloo.ca;uwaterloo.ca;uwaterloo.ca", "position": "Undergrad student;PhD student;Associate Professor", "bibtex": "@misc{\nsun2022conditional,\ntitle={Conditional Generative Quantile Networks via Optimal Transport and Convex Potentials},\nauthor={Jesse Sun and Dihong Jiang and Yaoliang Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=TN-W4p7H2pK}\n}", "github": "", "project": "", "reviewers": "8X3c;7Kv2;Vf6H;jUXa", "site": "https://openreview.net/forum?id=TN-W4p7H2pK", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "2;4;4;4", "correctness": "2;4;3;4", "technical_novelty": "3;1;1;3", "empirical_novelty": "2;2;1;3", "wc_summary_paper": "52;76;148;37", "wc_summary_review": "24;34;77;46", "wc_main_review": "836;377;440;117", "wc_review": "912;487;665;200", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 1.0 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 78.25, 42.60501731017135 ], "wc_summary_review_avg": [ 45.25, 19.917015338649513 ], "wc_main_review_avg": [ 442.5, 257.43397211712363 ], "wc_review_avg": [ 566.0, 259.66998286286383 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Mjw_u0to19gJ:scholar.google.com/&scioq=Conditional+Generative+Quantile+Networks+via+Optimal+Transport+and+Convex+Potentials&hl=en&as_sdt=0,5", "gs_version_total": 0, 
"aff_unique_index": "0;0;0", "aff_unique_norm": "University of Waterloo", "aff_unique_dep": "", "aff_unique_url": "https://uwaterloo.ca", "aff_unique_abbr": "UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "TNBTpPO0QX", "title": "Monotone deep Boltzmann machines", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Boltzmann machines refer to deep multi-layered probabilistic models, governed by a pairwise energy function that describes the likelihood of all variables in the network. Due to the difficulty of inference in such systems, they have given way largely to \\emph{restricted} deep Boltzmann machines (which do not permit intra-layer or skip connections). In this paper, we propose a class of model that allows for \\emph{exact, efficient} mean-field inference and learning in \\emph{general} deep Boltzmann machines. To do so, we use the tools of the recently proposed monotone Deep Equilibrium (DEQ) Model, an implicit-depth deep network that always guarantees the existence and uniqueness of its fixed points. We show that, for a class of general deep Boltzmann machine, the mean-field fixed point can be considered as the equivalent fixed point of a monotone DEQ, which gives us a recipe for deriving an efficient mean-field inference procedure with global convergence guarantees. In addition, we show that our procedure outperforms existing mean-field approximation methods while avoiding any issue of local optima. We apply this approach to simple deep convolutional Boltzmann architectures and demonstrate that it allows for tasks such as the joint completion and classification of images, all within a single deep probabilistic setting. ", "keywords": "Deep Boltzmann machine;mean-field inference;deep equilibrium model", "primary_area": "", "supplementary_material": "/attachment/cfea1eac320e9e8bddf07788881506630cbbf203.zip", "author": "Zhili Feng;Ezra Winston;J Zico Kolter", "authorids": "~Zhili_Feng1;~Ezra_Winston1;~J_Zico_Kolter1", "gender": ";;M", "homepage": "https://zhilif.github.io/;https://ezrawinston.github.io;http://www.zicokolter.com", "dblp": "189/7590;66/9442;67/2526", "google_scholar": "_lnL4aQAAAAJ;;UXh1I6UAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhili_Feng1;~Ezra_Winston1;~Zico_Kolter1", "aff": "Carnegie Mellon University;Machine Learning Department, School of Computer Science;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;mld.cs.cmu.edu;cmu.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nfeng2022monotone,\ntitle={Monotone deep Boltzmann machines},\nauthor={Zhili Feng and Ezra Winston and J Zico Kolter},\nyear={2022},\nurl={https://openreview.net/forum?id=TNBTpPO0QX}\n}", "github": "", "project": "", "reviewers": "ihMA;fJmJ;8YQo;1tpT", "site": "https://openreview.net/forum?id=TNBTpPO0QX", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;5;4", "correctness": "4;4;4;4", "technical_novelty": "4;2;3;3", "empirical_novelty": "1;2;2;0", "wc_summary_paper": "56;49;97;89", "wc_summary_review": "19;14;94;19", "wc_main_review": "304;536;668;813", "wc_review": "379;599;859;921", "wc_reply_reviewers": "82;65;69;125", "wc_reply_authors": "105;349;387;276", "reply_reviewers": "1;1;1;2", "reply_authors": "1;2;1;3", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], 
"empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 72.75, 20.595812681222366 ], "wc_summary_review_avg": [ 36.5, 33.26033673912518 ], "wc_main_review_avg": [ 580.25, 187.17955951438714 ], "wc_review_avg": [ 689.5, 216.17296315682034 ], "wc_reply_reviewers_avg": [ 85.25, 23.79469478686373 ], "wc_reply_authors_avg": [ 279.25, 108.22286033920929 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4970944090679646985&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "TNmJgFmz2k", "title": "Leveraging Relational Information for Learning Weakly Disentangled Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Disentanglement is a difficult property to enforce in neural representations. This might be due, in part, to a formalization of the disentanglement problem that focuses too heavily on separating relevant factors of variation of the data in single isolated dimensions of the neural representation. We argue that such a definition might be too restrictive and not necessarily beneficial in terms of downstream tasks. In this work, we present an alternative view over learning (weakly) disentangled representations, which leverages concepts from relational learning. We identify the regions of the latent space that correspond to specific instances of generative factors, and we learn the relationships among these regions in order to perform controlled changes to the latent codes. We also introduce a compound generative model that implements such a weak disentanglement approach. 
Our experiments show that the learned representations can separate the relevant factors of variation in the data, while preserving the information needed for effectively generating high-quality data samples.", "keywords": "representation learning;disentangled representations;generative models", "primary_area": "", "supplementary_material": "/attachment/614e000f11d713129345374d6362b4b941fdf0b7.zip", "author": "Andrea Valenti;Davide Bacciu", "authorids": "~Andrea_Valenti1;~Davide_Bacciu1", "gender": "M;M", "homepage": ";http://pages.di.unipi.it/bacciu/", "dblp": ";07/6626", "google_scholar": "https://scholar.google.it/citations?user=5LFFoiQAAAAJ;https://scholar.google.it/citations?user=1d5n2WkAAAAJ", "orcid": ";0000-0001-5213-2468", "linkedin": ";bacciu/", "or_profile": "~Andrea_Valenti1;~Davide_Bacciu1", "aff": "University of Pisa;University of Pisa", "aff_domain": "unipi.it;unipi.it", "position": "PhD student;Full Professor", "bibtex": "@misc{\nvalenti2022leveraging,\ntitle={Leveraging Relational Information for Learning Weakly Disentangled Representations},\nauthor={Andrea Valenti and Davide Bacciu},\nyear={2022},\nurl={https://openreview.net/forum?id=TNmJgFmz2k}\n}", "github": "", "project": "", "reviewers": "UkzY;NY7Z;3m6X", "site": "https://openreview.net/forum?id=TNmJgFmz2k", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;3", "correctness": "2;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "25;201;47", "wc_summary_review": "24;99;28", "wc_main_review": "349;380;218", "wc_review": "398;680;293", "wc_reply_reviewers": "0;540;0", "wc_reply_authors": "781;582;143", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 91.0, 78.29857384822962 ], "wc_summary_review_avg": [ 50.333333333333336, 34.451253807211266 ], "wc_main_review_avg": [ 315.6666666666667, 70.21079372543481 ], "wc_review_avg": [ 457.0, 163.40746616969494 ], "wc_reply_reviewers_avg": [ 180.0, 254.55844122715712 ], "wc_reply_authors_avg": [ 502.0, 266.534550605858 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5907873900366097692&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pisa", "aff_unique_dep": "", "aff_unique_url": "https://www.unipi.it", "aff_unique_abbr": "UNIP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "id": "TNxKD3z_tPZ", "title": "Persistent Homology Captures the Generalization of Neural Networks Without A Validation Set", "track": "main", "status": "Reject", "tldr": "", "abstract": "The training of neural networks is usually monitored with a validation (holdout) set to estimate the generalization of the model. This is done instead of measuring intrinsic properties of the model to determine whether it is learning appropriately. 
In this work, we suggest studying the training of neural networks with Algebraic Topology, specifically Persistent Homology (PH). Using simplicial complex representations of neural networks, we study the evolution of PH diagram distances along the neural network learning process, across different architectures and several datasets. Results show that the PH diagram distance between consecutive neural network states correlates with the validation accuracy, implying that the generalization error of a neural network could be intrinsically estimated without any holdout set.", "keywords": "Neural Networks;Topological Data Analysis;learning;evolution;Persistent Homology", "primary_area": "", "supplementary_material": "/attachment/f9dd9dd7e8111a367ff920e80e94f192b25ad4ca.zip", "author": "Asier Guti\u00e9rrez-Fandi\u00f1o;David P\u00e9rez Fern\u00e1ndez;Jordi Armengol-Estap\u00e9;Marta Villegas", "authorids": "~Asier_Guti\u00e9rrez-Fandi\u00f1o1;~David_P\u00e9rez_Fern\u00e1ndez1;~Jordi_Armengol-Estap\u00e91;~Marta_Villegas2", "gender": "M;M;M;", "homepage": ";;https://jordiae.com/;", "dblp": "281/8078;;263/2700.html;", "google_scholar": "https://scholar.google.es/citations?user=sYKGw0wAAAAJ;;https://scholar.google.es/citations?user=CiHoJfcAAAAJ;", "orcid": "0000-0002-7368-6950;0000-0002-2214-0245;0000-0001-8893-6185;", "linkedin": "asier-gutierrez-fandino/;;jordiae;", "or_profile": "~Asier_Guti\u00e9rrez-Fandi\u00f1o1;~David_P\u00e9rez_Fern\u00e1ndez1;~Jordi_Armengol-Estap\u00e91;~Marta_Villegas2", "aff": "Barcelona Supercomputing Center;;Barcelona Supercomputing Center;", "aff_domain": "bsc.es;;bsc.es;", "position": "Researcher;;Researcher;", "bibtex": "@misc{\nguti{\\'e}rrez-fandi{\\~n}o2022persistent,\ntitle={Persistent Homology Captures the Generalization of Neural Networks Without A Validation Set},\nauthor={Asier Guti{\\'e}rrez-Fandi{\\~n}o and David P{\\'e}rez Fern{\\'a}ndez and Jordi Armengol-Estap{\\'e} and Marta Villegas},\nyear={2022},\nurl={https://openreview.net/forum?id=TNxKD3z_tPZ}\n}", "github": "", "project": "", "reviewers": "eLVu;JhC7;Fo4q;nK7m", "site": "https://openreview.net/forum?id=TNxKD3z_tPZ", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;4;5;4", "correctness": "1;1;2;1", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "133;74;87;41", "wc_summary_review": "92;38;228;37", "wc_main_review": "1472;1010;1481;218", "wc_review": "1697;1122;1796;296", "wc_reply_reviewers": "1747;0;98;0", "wc_reply_authors": "3457;746;2312;0", "reply_reviewers": "4;0;2;0", "reply_authors": "5;1;4;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 1.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.75, 33.0104150231408 ], "wc_summary_review_avg": [ 98.75, 77.86968280402843 ], "wc_main_review_avg": [ 1045.25, 514.1932394538069 ], "wc_review_avg": [ 1227.75, 596.3314409789241 ], "wc_reply_reviewers_avg": [ 461.25, 743.4054664178896 ], "wc_reply_authors_avg": [ 1628.75, 1345.4927303779832 ], "reply_reviewers_avg": [ 1.5, 1.6583123951777 ], "reply_authors_avg": [ 2.5, 2.0615528128088303 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1994025422791750451&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Barcelona Supercomputing Center", "aff_unique_dep": "", "aff_unique_url": "https://www.bsc.es", "aff_unique_abbr": "BSC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Spain" }, { "id": "TQ75Md-FqQp", "title": "Efficient and Modular Implicit Differentiation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Automatic differentiation (autodiff) has revolutionized machine learning. It allows expressing complex computations by composing elementary ones in creative ways and removes the tedious burden of computing their derivatives by hand. More recently, differentiation of optimization problem solutions has attracted a great deal of research, with applications as a layer in a neural network, and in bi-level optimization, including hyper-parameter optimization. However, the formulae for these derivatives often involves a tedious manual derivation and implementation. In this paper, we propose a unified, efficient and modular approach for implicit differentiation of optimization problems. In our approach, the user defines directly in Python a function $F$ capturing the optimality conditions of the problem to be differentiated. Once this is done, we leverage autodiff of $F$ to automatically differentiate the optimization problem. This way, our approach combines the benefits of implicit differentiation and autodiff. We show that seemingly simple principles allow to recover all recently proposed implicit differentiation methods and create new ones easily. We describe in details a JAX implementation of our framework and demonstrate the ease of differentiating through optimization problems thanks to it on four diverse tasks: hyperparameter optimization of multiclass SVMs, dataset distillation, task-driven dictionary learning and sensitivity analysis of molecular dynamics.", "keywords": "implicit differentiation;bilevel optimization;autodiff;jax", "primary_area": "", "supplementary_material": "/attachment/43a0bf641acc44ae41cb32194c1d835ac3ab01a5.zip", "author": "Mathieu Blondel;Quentin Berthet;marco cuturi;Roy Frostig;Stephan Hoyer;Felipe Llinares-L\u00f3pez;Fabian Pedregosa;Jean-Philippe Vert", "authorids": "~Mathieu_Blondel1;~Quentin_Berthet2;~marco_cuturi2;~Roy_Frostig1;~Stephan_Hoyer1;~Felipe_Llinares-L\u00f3pez1;~Fabian_Pedregosa1;~Jean-Philippe_Vert1", "gender": ";M;M;;M;M;;M", "homepage": "http://www.mblondel.org;http://q-berthet.github.io/;http://marcocuturi.net;https://cs.stanford.edu/~rfrostig/;http://stephanhoyer.com;http://fa.bianp.net;http://cbio.mines-paristech.fr/~jvert;", "dblp": "05/8614.html;129/1262;85/5102;136/9091;;11/9764;http://dblp.uni-trier.de/pers/hd/v/Vert:Jean=Philippe;157/7706", "google_scholar": "C0EKzrUAAAAJ;bHwGZjcAAAAJ;https://scholar.google.fr/citations?user=kQEydDMAAAAJ;UoATnWEAAAAJ;bWTG5FgAAAAJ;https://scholar.google.fr/citations?hl=en;https://scholar.google.fr/citations?user=pqpxh7IAAAAJ;zzjTWUUAAAAJ", "orcid": ";;;;0000-0002-5207-0380;0000-0003-4025-3953;0000-0001-9510-8441;", "linkedin": ";;;;;http://www.linkedin.com/in/fabianpedregosa;djipay/;", "or_profile": "~Mathieu_Blondel1;~Quentin_Berthet2;~marco_cuturi2;~Roy_Frostig1;~Stephan_Hoyer1;~Fabian_Pedregosa1;~Jean-Philippe_Vert1;~Felipe_Llinares-Lopez1", "aff": "Google;Google;Google brain;Google;Google;Google AI;MINES ParisTech;Google LLC", "aff_domain": "google.com;google.com;google.com;google.com;google.com;google.com;mines-paristech.fr;google.com", "position": 
"Research scientist;Researcher;Research scientist;Research scientist;Researcher;Research Scientist;Full Professor;Research Scientist", "bibtex": "@misc{\nblondel2022efficient,\ntitle={Efficient and Modular Implicit Differentiation},\nauthor={Mathieu Blondel and Quentin Berthet and marco cuturi and Roy Frostig and Stephan Hoyer and Felipe Llinares-L{\\'o}pez and Fabian Pedregosa and Jean-Philippe Vert},\nyear={2022},\nurl={https://openreview.net/forum?id=TQ75Md-FqQp}\n}", "github": "", "project": "", "reviewers": "h9WU;b1rc;Q3Lr", "site": "https://openreview.net/forum?id=TQ75Md-FqQp", "pdf_size": 0, "recommendation": "3;8;10", "confidence": "4;3;3", "correctness": "4;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "58;77;153", "wc_summary_review": "51;55;76", "wc_main_review": "281;303;463", "wc_review": "390;435;692", "wc_reply_reviewers": "1079;96;171", "wc_reply_authors": "1598;544;504", "reply_reviewers": "2;1;1", "reply_authors": "3;2;1", "recommendation_avg": [ 7.0, 2.943920288775949 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 96.0, 41.04469108991645 ], "wc_summary_review_avg": [ 60.666666666666664, 10.96458946893235 ], "wc_main_review_avg": [ 349.0, 81.10898018509829 ], "wc_review_avg": [ 505.6666666666667, 133.0321598545087 ], "wc_reply_reviewers_avg": [ 448.6666666666667, 446.76342235634684 ], "wc_reply_authors_avg": [ 882.0, 506.55174135192414 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9607689228305227, "corr_recommendation_correctness": -0.24019223070763068, "gs_citation": 314, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3695618240103498432&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;0;0;0;0;1;0", "aff_unique_norm": "Google;MINES ParisTech", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.mines-paristech.fr", "aff_unique_abbr": "Google;MPT", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0;1;0", "aff_country_unique": "United States;France" }, { "id": "TSlidmTs80", "title": "Knowledge-driven Scene Priors for Semantic Audio-Visual Embodied Navigation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Generalisation to unseen contexts remains a challenge for embodied navigation agents. In the context of semantic audio-visual navigation (SAVi) tasks, generalisation includes both generalising to unseen indoor visual scenes as well as generalising to unheard sounding objects. Previous SAVi task definitions do not include evaluation conditions on truly novel sounding objects, resorting instead to evaluating agents on unheard sound clips of known objects; meanwhile, previous SAVi methods do not include explicit mechanisms for incorporating domain knowledge about object and region semantics. These weaknesses limit the development and assessment of models' abilities to generalise their learned experience. 
In this work, we introduce the use of knowledge-driven scene priors in the semantic audio-visual embodied navigation task: we combine semantic information from our novel knowledge graph that encodes object-region relations, spatial knowledge from dual Graph Convolutional Networks, and background knowledge from a series of pre-training tasks---all within a reinforcement learning framework for audio-visual navigation. We define a new audio-visual navigation sub-task, where agents are evaluated on novel sounding objects, as opposed to unheard clips of known objects. We show state-of-the-art results on multiple semantic audio-visual navigation benchmarks, within the Habitat-Matterport3D simulator, where we also show improvements in generalisation to unseen regions and novel sounding objects. We release our code, knowledge graph, and dataset in the supplementary material.", "keywords": "Scene Priors;Modular Training;Reinforcement Learning;Audio-Visual;Robot Navigation;Embodied", "primary_area": "", "supplementary_material": "/attachment/9def573933e329ff0625c0f05b59a40e63ca254c.zip", "author": "Gyan Tatiya;Jonathan Francis;Ingrid Navarro;Nariaki Kitamura;Eric Nyberg;Jivko Sinapov;Jean Oh", "authorids": "~Gyan_Tatiya1;~Jonathan_Francis1;~Ingrid_Navarro1;~Nariaki_Kitamura1;~Eric_Nyberg1;~Jivko_Sinapov1;~Jean_Oh2", "gender": "M;;F;M;;;F", "homepage": "https://www.eecs.tufts.edu/~gtatiya/;;https://navars.xyz;https://www.ri.cmu.edu/ri-people/nariaki-kitamura/;https://www.cs.cmu.edu/~ehn;https://www.eecs.tufts.edu/~jsinapov/;http://www.cs.cmu.edu/~jeanoh/", "dblp": "246/7980;;;;05/595;05/6973;62/4860", "google_scholar": "5uHD964AAAAJ;;;;https://scholar.google.com/citations?hl=en;-mHoWKEAAAAJ;", "orcid": ";;;;;;", "linkedin": "gtatiya;;;;eric-nyberg-08620/;;", "or_profile": "~Gyan_Tatiya1;~Jonathan_Francis1;~Ingrid_Navarro1;~Nariaki_Kitamura1;~Eric_Nyberg1;~Jivko_Sinapov1;~Jean_Oh2", "aff": "Bosch;;Carnegie Mellon University;;Carnegie Mellon University;Tufts University;Carnegie Mellon University", "aff_domain": "bosch.com;;cmu.edu;;cmu.edu;tufts.edu;cmu.edu", "position": "Intern;;MS student;;Full Professor;Assistant Professor;Associate Professor", "bibtex": "@misc{\ntatiya2022knowledgedriven,\ntitle={Knowledge-driven Scene Priors for Semantic Audio-Visual Embodied Navigation},\nauthor={Gyan Tatiya and Jonathan Francis and Ingrid Navarro and Nariaki Kitamura and Eric Nyberg and Jivko Sinapov and Jean Oh},\nyear={2022},\nurl={https://openreview.net/forum?id=TSlidmTs80}\n}", "github": "", "project": "", "reviewers": "dK3e;ENmU;fL7L;r52z;Jwas", "site": "https://openreview.net/forum?id=TSlidmTs80", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "3;4;4;3;4", "correctness": "3;2;2;4;4", "technical_novelty": "2;3;3;2;3", "empirical_novelty": "2;2;3;2;4", "wc_summary_paper": "51;123;145;43;218", "wc_summary_review": "48;33;173;31;42", "wc_main_review": "253;900;451;226;247", "wc_review": "352;1056;769;300;507", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 116.0, 64.57243994151065 ], "wc_summary_review_avg": [ 65.4, 54.150161587939884 ], "wc_main_review_avg": [ 415.4, 255.6204999603905 ], "wc_review_avg": [ 596.8, 281.63763952994634 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.16666666666666663, "corr_recommendation_correctness": 0.45643546458763845, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1410972531908204978&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Robert Bosch GmbH;Carnegie Mellon University;Tufts University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.bosch.com;https://www.cmu.edu;https://www.tufts.edu", "aff_unique_abbr": "Bosch;CMU;Tufts", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Germany;United States" }, { "id": "TTnjervir3J", "title": "DATA-DRIVEN EVALUATION OF TRAINING ACTION SPACE FOR REINFORCEMENT LEARNING", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training action space selection for reinforcement learning (RL) is conflict-prone due to complex state-action relationships. To address this challenge, this paper proposes a Shapely-inspired methodology for training action space categorization and ranking. To reduce exponential-time Shapely computations, the methodology includes a Monte Carlo simulation to avoid unnecessary explorations. The effectiveness of the methodology is illustrated using a cloud infrastructure resource tuning case study. It reduces the search space by 80% and categorizes the training action sets into dispensable and indispensable groups. Additionally, it ranks different training actions to facilitate superior RL model performance and lower cost. The proposed data-driven methodology is extensible to different domains, use cases, and machine learning algorithms.", "keywords": "Reinforcement Learning;Action Selection;Cost Optimization;Shapely", "primary_area": "", "supplementary_material": "", "author": "Rajat Ghosh;Debojyoti Dutta;Aroosh Sohi;Akshay Khole", "authorids": "~Rajat_Ghosh2;~Debojyoti_Dutta1;aroosh@nutanix.com;akshay@nutanix.com", "gender": "M;M;;", "homepage": ";;;", "dblp": ";96/2340;;", "google_scholar": "n5SEuIYAAAAJ;;;", "orcid": ";;;", "linkedin": "i-am-rajat/;http://linkedin.com/in/dedutta;;", "or_profile": "~Rajat_Ghosh2;~Debojyoti_Dutta1;aroosh@nutanix.com;akshay@nutanix.com", "aff": "Nutanix;;;", "aff_domain": "nutanix.com;;;", "position": "Staff Engineer;;;", "bibtex": "@misc{\nghosh2022datadriven,\ntitle={{DATA}-{DRIVEN} {EVALUATION} {OF} {TRAINING} {ACTION} {SPACE} {FOR} {REINFORCEMENT} {LEARNING}},\nauthor={Rajat Ghosh and Debojyoti Dutta and Aroosh Sohi and Akshay Khole},\nyear={2022},\nurl={https://openreview.net/forum?id=TTnjervir3J}\n}", "github": "", "project": "", "reviewers": "ASBS;TWbe;5emL;LXKk", "site": "https://openreview.net/forum?id=TTnjervir3J", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "4;4;4;3", "correctness": "2;1;2;3", "technical_novelty": "2;1;1;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "118;63;45;152", "wc_summary_review": "20;41;27;80", "wc_main_review": "292;247;275;215", "wc_review": "430;351;347;447", "wc_reply_reviewers": "46;29;33;56", "wc_reply_authors": "97;55;90;26", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], 
"empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 94.5, 42.72294465506796 ], "wc_summary_review_avg": [ 42.0, 23.205602771744587 ], "wc_main_review_avg": [ 257.25, 29.20937349550654 ], "wc_review_avg": [ 393.75, 45.17396927435091 ], "wc_reply_reviewers_avg": [ 41.0, 10.700467279516348 ], "wc_reply_authors_avg": [ 67.0, 28.521921393903323 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9075637173751992251&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Nutanix", "aff_unique_dep": "", "aff_unique_url": "https://www.nutanix.com", "aff_unique_abbr": "Nutanix", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "TVHS5Y4dNvM", "title": "Patches Are All You Need?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although convolutional networks have been the dominant architecture for vision tasks for many years, recent experiments have shown that Transformer-based models, most notably the Vision Transformer (ViT), may exceed their performance in some settings. However, due to the quadratic runtime of the self-attention layers in Transformers, ViTs require the use of patch embeddings, which group together small regions of the image into single input features, in order to be applied to larger image sizes. This raises a question: Is the performance of ViTs due to the inherently-more-powerful Transformer architecture, or is it at least partly due to using patches as the input representation? In this paper, we present some evidence for the latter: specifically, we propose the ConvMixer, an extremely simple model that is similar in spirit to the ViT and the even-more-basic MLP-Mixer in that it operates directly on patches as input, separates the mixing of spatial and channel dimensions, and maintains equal size and resolution throughout the network. In contrast, however, the ConvMixer uses only standard convolutions to achieve the mixing steps. Despite its simplicity, we show that the ConvMixer outperforms the ViT, MLP-Mixer, and some of their variants for similar parameter counts and data set sizes, in addition to outperforming classical vision models such as the ResNet. 
Our code is available at https://github.com/tmp-iclr/convmixer.", "keywords": "computer vision;vision transformer;mixer;patch embeddings;convolution;convolutional neural network", "primary_area": "", "supplementary_material": "", "author": "Asher Trockman;J Zico Kolter", "authorids": "~Asher_Trockman1;~J_Zico_Kolter1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Asher_Trockman1;~J_Zico_Kolter1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntrockman2022patches,\ntitle={Patches Are All You Need?},\nauthor={Asher Trockman and J Zico Kolter},\nyear={2022},\nurl={https://openreview.net/forum?id=TVHS5Y4dNvM}\n}", "github": "", "project": "", "reviewers": "J6jX;jSQ1;RgZh", "site": "https://openreview.net/forum?id=TVHS5Y4dNvM", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;5;4", "correctness": "3;2;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "73;48;66", "wc_summary_review": "20;39;71", "wc_main_review": "542;215;747", "wc_review": "635;302;884", "wc_reply_reviewers": "141;0;0", "wc_reply_authors": "2181;1019;1271", "reply_reviewers": "2;0;0", "reply_authors": "4;3;2", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 62.333333333333336, 10.530379332620875 ], "wc_summary_review_avg": [ 43.333333333333336, 21.044925490219462 ], "wc_main_review_avg": [ 501.3333333333333, 219.0834442753618 ], "wc_review_avg": [ 607.0, 238.42399208133395 ], "wc_reply_reviewers_avg": [ 47.0, 66.46803743153546 ], "wc_reply_authors_avg": [ 1490.3333333333333, 499.09340029920475 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 585, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15188717593606933557&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "TVs3zZOOZ8t", "title": "Continuous Deep Q-Learning in Optimal Control Problems: Normalized Advantage Functions Analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the most effective continuous deep reinforcement learning algorithms is normalized advantage functions (NAF). The main idea of NAF is to approximate the Q-function by functions that are quadratic with respect to the action variable. This idea makes it possible to apply the algorithm to continuous reinforcement learning problems, but it also raises the question of the classes of problems in which this approximation is acceptable. This paper describes one such class. We consider reinforcement learning problems obtained by the time-discretization of certain optimal control problems. Based on the idea of NAF, we present a new family of quadratic functions and prove that it has suitable approximation properties. Taking these properties into account, we provide several ways to improve NAF. 
The experimental results confirm the efficiency of our improvements.", "keywords": "continuous reinforcement learning;deep q-learning;optimal control problems;normalized advantage functions", "primary_area": "", "supplementary_material": "/attachment/33168435d008b45cbae38ca058a0a98d8f12ef5a.zip", "author": "Anton Plaksin;Stepan Martyanov", "authorids": "~Anton_Plaksin1;~Stepan_Martyanov1", "gender": "M;M", "homepage": "https://orcid.org/0000-0002-1468-201X;https://www.instagram.com/pionerstepansky/", "dblp": "276/9860;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;", "orcid": "0000-0002-1468-201X;", "linkedin": ";", "or_profile": "~Anton_Plaksin1;~Stepan_Evgen'evich_Mart'yanov1", "aff": "N.N. Krasovskii Institute of Mathematics and Mechanics of the Ural Branch of the Russian Academy of Sciences (IMM UB RAS);Ural Federal University", "aff_domain": "imm.uran.ru;urfu.ru", "position": "Researcher;MS student", "bibtex": "@misc{\nplaksin2022continuous,\ntitle={Continuous Deep Q-Learning in Optimal Control Problems: Normalized Advantage Functions Analysis},\nauthor={Anton Plaksin and Stepan Martyanov},\nyear={2022},\nurl={https://openreview.net/forum?id=TVs3zZOOZ8t}\n}", "github": "", "project": "", "reviewers": "J1Yy;8xxB;6AfP", "site": "https://openreview.net/forum?id=TVs3zZOOZ8t", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;5;2", "correctness": "4;3;3", "technical_novelty": "2;1;2", "empirical_novelty": "2;0;2", "wc_summary_paper": "93;43;71", "wc_summary_review": "90;156;28", "wc_main_review": "482;360;199", "wc_review": "665;559;298", "wc_reply_reviewers": "75;0;48", "wc_reply_authors": "500;712;411", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 69.0, 20.46134567096374 ], "wc_summary_review_avg": [ 91.33333333333333, 52.264285660052366 ], "wc_main_review_avg": [ 347.0, 115.89938164919892 ], "wc_review_avg": [ 507.3333333333333, 154.21701880430996 ], "wc_reply_reviewers_avg": [ 41.0, 31.016124838541646 ], "wc_reply_authors_avg": [ 541.0, 126.25635297547076 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6686297270848327292&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "N.N. 
Krasovskii Institute of Mathematics and Mechanics;Ural Federal University", "aff_unique_dep": "Institute of Mathematics and Mechanics;", "aff_unique_url": ";https://urfu.ru", "aff_unique_abbr": "IMM UB RAS;UFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Russian Federation" }, { "title": "VOS: Learning What You Don't Know by Virtual Outlier Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6052", "id": "TW7d65uYu5M", "poster": "", "openreview": "https://openreview.net/forum?id=TW7d65uYu5M", "slides": "https://iclr.cc/virtual/2022/poster/6052", "video": "https://iclr.cc/virtual/2022/poster/6052", "author_site": "Xuefeng Du, Zhaoning Wang, Mu Cai, Yixuan Li", "tldr": "", "abstract": "Out-of-distribution (OOD) detection has received much attention lately due to its importance in the safe deployment of neural networks. One of the key challenges is that models lack supervision signals from unknown data, and as a result, can produce overconfident predictions on OOD data. Previous approaches rely on real outlier datasets for model regularization, which can be costly and sometimes infeasible to obtain in practice. In this paper, we present VOS, a novel framework for OOD detection by adaptively synthesizing virtual outliers that can meaningfully regularize the model's decision boundary during training. Specifically, VOS samples virtual outliers from the low-likelihood region of the class-conditional distribution estimated in the feature space. Alongside, we introduce a novel unknown-aware training objective, which contrastively shapes the uncertainty space between the ID data and synthesized outlier data. VOS achieves competitive performance on both object detection and image classification models, reducing the FPR95 by up to 9.36% compared to the previous best method on object detectors. 
Code is available at https://github.com/deeplearning-wisc/vos.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuefeng Du;Zhaoning Wang;Mu Cai;Yixuan Li", "authorids": "~Xuefeng_Du1;~Zhaoning_Wang2;~Mu_Cai1;~Yixuan_Li1", "gender": "M;M;M;F", "homepage": "https://d12306.github.io/;https://www.zhaoningwang.com/;https://pages.cs.wisc.edu/~mucai/;http://pages.cs.wisc.edu/~sharonli/", "dblp": "34/3557;;279/6529;144/6087-1", "google_scholar": "GE_aEh4AAAAJ;;euruCPEAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0009-0008-7967-9752;", "linkedin": "xuefeng-du-094723192/;;mu-cai/;liyixuan", "or_profile": "~Xuefeng_Du1;~Zhaoning_Wang2;~Mu_Cai1;~Yixuan_Li1", "aff": "University of Wisconsin, Madison;;Department of Computer Science, University of Wisconsin, Madison;Cornell University", "aff_domain": "wisc.edu;;cs.wisc.edu;cornell.edu", "position": "PhD student;;PhD student;Graduate Student", "bibtex": "@inproceedings{\ndu2022towards,\ntitle={Towards Unknown-aware Learning with Virtual Outlier Synthesis},\nauthor={Xuefeng Du and Zhaoning Wang and Mu Cai and Sharon Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TW7d65uYu5M}\n}", "github": "", "project": "", "reviewers": "yx7X;XCrg;rXCn;jkoD", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;3;3;5", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "63;75;79;193", "wc_summary_review": "27;31;107;10", "wc_main_review": "298;172;140;308", "wc_review": "388;278;326;511", "wc_reply_reviewers": "0;0;0;144", "wc_reply_authors": "793;370;287;949", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;2;3", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 102.5, 52.580890064737396 ], "wc_summary_review_avg": [ 43.75, 37.35890121510535 ], "wc_main_review_avg": [ 229.5, 74.44964741353716 ], "wc_review_avg": [ 375.75, 87.28222900453449 ], "wc_reply_reviewers_avg": [ 36.0, 62.353829072479584 ], "wc_reply_authors_avg": [ 599.75, 278.35173342373855 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.19245008972987526, "corr_recommendation_correctness": -0.5555555555555555, "gs_citation": 385, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2027738849340009189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=TW7d65uYu5M", "email": "wisc.edu;;cs.wisc.edu;cornell.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Wisconsin;University of Wisconsin-Madison;Cornell University", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu;https://www.cornell.edu", "aff_unique_abbr": "UW;UW-Madison;Cornell", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "TWANKAJ1ZCr", "title": "Learn Together, Stop Apart: a Novel Approach to Ensemble Pruning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Gradient boosting is the most popular method of constructing ensembles that allow 
getting state-of-the-art results on many tasks. One of the critical parameters affecting the quality of the learned model is the number of models in the ensemble, or the number of boosting iterations. Unfortunately, the problem of selecting the optimal number of models still remains open and understudied. In this paper, we propose a new look at the hyperparameter selection problem in ensemble models. In contrast to the classical approaches that select the universal size of the ensemble from a hold-out validation subsample, our algorithm uses the hypothesis of heterogeneity of the sample space to adaptively set the required number of steps in one common ensemble for each group of objects individually. Experiments on popular implementations of gradient boosting show that the proposed method does not affect the complexity of learning algorithms and significantly increases quality on most standard benchmarks up to 1.5\\%.", "keywords": "ensemble;boosting;regularization;clusterization", "primary_area": "", "supplementary_material": "", "author": "Bulat Ibragimov;Gleb Gennadjevich Gusev", "authorids": "~Bulat_Ibragimov4;~Gleb_Gennadjevich_Gusev1", "gender": "M;M", "homepage": ";https://sberlabs.com/laboratories/sber-ai-lab", "dblp": ";117/9143.html", "google_scholar": "oQXXE_0AAAAJ;https://scholar.google.ru/citations?user=RWX4sYcAAAAJ", "orcid": "0000-0001-8540-0684;0009-0003-7298-1848", "linkedin": ";gleb-gusev-55a6a0ab/", "or_profile": "~Bulat_Ibragimov4;~Gleb_Gennadjevich_Gusev1", "aff": "Moscow Institute of Physics and Technology;ARTIFICIAL INTELLIGENCE RESEARCH INSTITUTE (AIRI)", "aff_domain": "phystech.edu;airi.net", "position": "PhD student;Principal Researcher", "bibtex": "@misc{\nibragimov2022learn,\ntitle={Learn Together, Stop Apart: a Novel Approach to Ensemble Pruning},\nauthor={Bulat Ibragimov and Gleb Gennadjevich Gusev},\nyear={2022},\nurl={https://openreview.net/forum?id=TWANKAJ1ZCr}\n}", "github": "", "project": "", "reviewers": "xGqg;EFwa;meXP", "site": "https://openreview.net/forum?id=TWANKAJ1ZCr", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "5;4;4", "correctness": "4;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;0;3", "wc_summary_paper": "164;96;64", "wc_summary_review": "80;39;74", "wc_main_review": "413;1038;399", "wc_review": "657;1173;537", "wc_reply_reviewers": "153;703;18", "wc_reply_authors": "428;982;152", "reply_reviewers": "1;1;1", "reply_authors": "2;2;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 108.0, 41.697322056298376 ], "wc_summary_review_avg": [ 64.33333333333333, 18.080068829760823 ], "wc_main_review_avg": [ 616.6666666666666, 297.98247524905815 ], "wc_review_avg": [ 789.0, 275.9130297756885 ], "wc_reply_reviewers_avg": [ 291.3333333333333, 296.2637713629903 ], "wc_reply_authors_avg": [ 520.6666666666666, 345.1234883664428 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:xR01GR76vdoJ:scholar.google.com/&scioq=Learn+Together,+Stop+Apart:+a+Novel+Approach+to+Ensemble+Pruning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Moscow Institute of Physics and Technology;", "aff_unique_dep": ";", "aff_unique_url": "https://www.mipt.ru/en;", "aff_unique_abbr": "MIPT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", "aff_country_unique": "Russian Federation;" }, { "id": "TWTTKlwrUP0", "title": "Generating High-Fidelity Privacy-Conscious Synthetic Patient Data for Causal Effect Estimation with Multiple Treatments", "track": "main", "status": "Reject", "tldr": "", "abstract": "A causal effect can be defined as the comparison of outcomes from two or more alternative treatments. Knowing this treatment effect is critically important in healthcare because it makes it possible to identify the best treatment for a person when more than one option exists. In the past decade, there has been exponentially growing interest in the use of observational data collected as a part of routine healthcare practice to determine the effect of a treatment with causal inference models. Validation of these models, however, has been a challenge because the ground truth is unknown: only one treatment-outcome pair for each person can be observed. There have been multiple efforts to fill this void using synthetic data where the ground truth can be generated. However, to date, these datasets have been severely limited in their utility either by being modeled after small non-representative patient populations, being dissimilar to real target populations, or only providing known effects for two cohorts (treated vs control). In this work, we produced a large-scale and realistic synthetic dataset that supports multiple hypertension treatments, by modeling after a nationwide cohort of more than 250,000 hypertension patients' multi-year history of diagnoses, medications, and laboratory values. We designed a data generation process by combining an adapted ADS-GAN model for fictitious patient information generation and a neural network for treatment outcome generation. Wasserstein distance of 0.35 demonstrates that our synthetic data follows a nearly identical joint distribution to the patient cohort used to generate the data. Our dataset provides ground truth effects for about 30 hypertension treatments on blood pressure outcomes. Patient privacy was a primary concern for this study; the $\\epsilon$-identifiability metric, which estimates the probability of actual patients being identified, is 0.008%, ensuring that our synthetic data cannot be used to identify any actual patients. Using our dataset, we tested the bias in causal effect estimation of three well-established models: propensity sore stratification, doubly robust approach (DR) with logistic regression, DR with random forest (RF) classification. Interestingly, we found that while the RF DR outperformed the logistic DR as expected, the best performance actually came from propensity score stratification, despite the theoretical strength of statistical properties of the DR family of models. We believe this dataset will facilitate the additional development, evaluation, and comparison of real-world causal models. 
The approach we used can be readily extended to other types of diseases in the clinical domain, and to datasets in other domains as well.", "keywords": "synthetic data;causal inference;EHR;healthcare;deep generative modeling;treatment effects;model validation;observational patient data;patient privacy", "primary_area": "", "supplementary_material": "/attachment/cb783ede4361c64f85f3c8df38f03594b700b9be.zip", "author": "Jingpu Shi;Dong Wang;Gino Tesei;Beau Norgeot", "authorids": "~Jingpu_Shi1;~Dong_Wang2;~Gino_Tesei1;~Beau_Norgeot1", "gender": "M;F;M;", "homepage": ";https://jelly007.github.io/;;", "dblp": ";;91/4937.html;https://dblp.uni-trier.de/pers/hd/n/Norgeot:Beau", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=JB5mlIMAAAAJ;NsMP1kkAAAAJ;jPYdk7MAAAAJ", "orcid": ";;0000-0002-0882-5125;", "linkedin": ";;ginotesei/;", "or_profile": "~Jingpu_Shi1;~Dong_Wang2;~Gino_Tesei1;~Beau_Norgeot1", "aff": ";;Anthem;", "aff_domain": ";;anthem.com;", "position": ";;AI Principal Data Scientist;", "bibtex": "@misc{\nshi2022generating,\ntitle={Generating High-Fidelity Privacy-Conscious Synthetic Patient Data for Causal Effect Estimation with Multiple Treatments},\nauthor={Jingpu Shi and Dong Wang and Gino Tesei and Beau Norgeot},\nyear={2022},\nurl={https://openreview.net/forum?id=TWTTKlwrUP0}\n}", "github": "", "project": "", "reviewers": "Jr99;46uW;eqMK", "site": "https://openreview.net/forum?id=TWTTKlwrUP0", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "42;125;52", "wc_summary_review": "64;81;16", "wc_main_review": "386;404;326", "wc_review": "492;610;394", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "693;603;618", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 73.0, 36.995495221265344 ], "wc_summary_review_avg": [ 53.666666666666664, 27.523727137790686 ], "wc_main_review_avg": [ 372.0, 33.34666400106613 ], "wc_review_avg": [ 498.6666666666667, 88.30754340497884 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 638.0, 39.370039370059054 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5319175081356458832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Anthem, Inc.", "aff_unique_dep": "", "aff_unique_url": "https://www.antheminc.com", "aff_unique_abbr": "Anthem", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "TXqemS7XEH", "title": "M6-10T: A Sharing-Delinking Paradigm for Efficient Multi-Trillion Parameter Pretraining", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent expeditious developments in deep learning algorithms, distributed training, and even hardware design for large models have enabled training extreme-scale models, say GPT-3 and Switch Transformer possessing hundreds of billions or even trillions of parameters. 
However, under limited resources, extreme-scale model training that requires enormous amounts of compute and memory suffers from frustratingly low efficiency in model convergence. In this paper, we propose a simple training strategy called \u201cPseudo-to-Real\u201d for large models that require a high memory footprint. Pseudo-to-Real is compatible with large models with an architecture of sequential layers. We demonstrate the pretraining of an unprecedented 10-trillion-parameter model, an order of magnitude larger than the state-of-the-art, on solely 512 GPUs within 10 days. Besides demonstrating the application of Pseudo-to-Real, we also provide a technique, Granular CPU offloading, to manage CPU memory for training large models and maintain high GPU utilization. Fast training of extreme-scale models on a decent amount of resources can bring a much smaller carbon footprint and contribute to greener AI.", "keywords": "Extreme-Scale Pretraining;Language Modeling;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Junyang Lin;An Yang;Jinze Bai;Chang Zhou;Le Jiang;Xianyan Jia;Ang Wang;Jie Zhang;Yong Li;Wei Lin;Jingren Zhou;Hongxia Yang", "authorids": "~Junyang_Lin1;~An_Yang1;~Jinze_Bai1;~Chang_Zhou2;~Le_Jiang1;~Xianyan_Jia1;~Ang_Wang1;~Jie_Zhang17;~Yong_Li12;~Wei_Lin5;~Jingren_Zhou1;~Hongxia_Yang2", "gender": "M;M;M;M;M;;;M;;M;M;F", "homepage": ";;;;https://scholar.google.com/citations?user=dTTfxwMAAAAJ&hl=zh-CN;https://github.com/SeaOfOcean;;https://github.com/adoda;;;;https://www4.comp.polyu.edu.hk/~hongxyang/", "dblp": "215/3823;63/10551;210/0939;;;;;;;;84/2644;", "google_scholar": "qp6IwtgAAAAJ;vO9FZekAAAAJ;;QeSoG3sAAAAJ;dTTfxwMAAAAJ;;;;;LXSkrXkAAAAJ;;iJlC5mMAAAAJ", "orcid": ";;;;;;;;;;;", "linkedin": ";;;;;;;;;;;", "or_profile": "~Junyang_Lin1;~An_Yang1;~Jinze_Bai1;~Chang_Zhou2;~Le_Jiang1;~Xianyan_Jia1;~Ang_Wang1;~Jie_Zhang17;~Yong_Li12;~Wei_Lin5;~Jingren_Zhou1;~Hongxia_Yang2", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;;Alibaba Group;;;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;;alibaba-inc.com;;;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Researcher;Researcher;Researcher;Algorithm Engineer;Software Engineer;;Researcher;;;Researcher;Principal Researcher", "bibtex": "@misc{\nlin2022mt,\ntitle={M6-10T: A Sharing-Delinking Paradigm for Efficient Multi-Trillion Parameter Pretraining},\nauthor={Junyang Lin and An Yang and Jinze Bai and Chang Zhou and Le Jiang and Xianyan Jia and Ang Wang and Jie Zhang and Yong Li and Wei Lin and Jingren Zhou and Hongxia Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=TXqemS7XEH}\n}", "github": "", "project": "", "reviewers": "pBTW;YjPt;gmHp;3GYN;BFav", "site": "https://openreview.net/forum?id=TXqemS7XEH", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "5;4;4;4;4", "correctness": "2;4;2;4;3", "technical_novelty": "3;2;3;3;2", "empirical_novelty": "2;1;2;3;2", "wc_summary_paper": "109;72;107;58;75", "wc_summary_review": "55;67;50;47;45", "wc_main_review": "150;300;825;645;444", "wc_review": "314;439;982;750;564", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "183;247;541;566;188", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 2.6, 
0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 84.2, 20.272148381461694 ], "wc_summary_review_avg": [ 52.8, 7.8587530817554 ], "wc_main_review_avg": [ 472.8, 240.2443755845285 ], "wc_review_avg": [ 609.8, 235.35369128186625 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 345.0, 171.9034612798707 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 12, 0 ], "corr_recommendation_confidence": -0.39528470752104744, "corr_recommendation_correctness": 0.35355339059327373, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7672552784787824127&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Trigger Hunting with a Topological Prior for Trojan Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6614", "id": "TXsjU8BaibT", "poster": "", "openreview": "https://openreview.net/forum?id=TXsjU8BaibT", "slides": "https://iclr.cc/virtual/2022/poster/6614", "video": "https://iclr.cc/virtual/2022/poster/6614", "author_site": "Xiaoling Hu, Xiao Lin, Michael Cogswell, Yi Yao, Susmit Jha, Chao Chen", "tldr": "", "abstract": "Despite their success and popularity, deep neural networks (DNNs) are vulnerable when facing backdoor attacks. This impedes their wider adoption, especially in mission critical applications. This paper tackles the problem of Trojan detection, namely, identifying Trojaned models \u2013 models trained with poisoned data. One popular approach is reverse engineering, i.e., recovering the triggers on a clean image by manipulating the model\u2019s prediction. One major challenge of reverse engineering approach is the enormous search space of triggers. To this end, we propose innovative priors such as diversity and topological simplicity to not only increase the chances of finding the appropriate triggers but also improve the quality of the found triggers. Moreover, by encouraging a diverse set of trigger candidates, our method can perform effectively in cases with unknown target labels. 
We demonstrate that these priors can significantly improve the quality of the recovered triggers, resulting in substantially improved Trojan detection accuracy as validated on both synthetic and publicly available TrojAI benchmarks.", "keywords": "Trojan detection;diversity loss;topological prior", "primary_area": "", "supplementary_material": "", "author": "Xiaoling Hu;Xiao Lin;Michael Cogswell;Yi Yao;Susmit Jha;Chao Chen", "authorids": "~Xiaoling_Hu1;~Xiao_Lin2;~Michael_Cogswell1;~Yi_Yao1;~Susmit_Jha1;~Chao_Chen1", "gender": "M;;M;F;;M", "homepage": "https://huxiaoling.github.io/;;http://mcogswell.io/;;http://susmitjha.github.io/;https://chaochen.github.io/", "dblp": "59/11113-2;09/1280;59/1410;59/179;;66/3019-12", "google_scholar": "6MfwhCAAAAAJ;;9e2wRsoAAAAJ;iD6QaXcAAAAJ;https://scholar.google.com/citations?hl=en;J-iIIFAAAAAJ", "orcid": ";;0000-0003-1647-0325;;0000-0001-5983-9095;0000-0003-1703-6483", "linkedin": "xiaoling-hu-1329337b/;;;;susmitjha/;", "or_profile": "~Xiaoling_Hu1;~Xiao_Lin2;~Michael_Cogswell1;~Yi_Yao1;~Susmit_Jha1;~Chao_Chen1", "aff": "Stony Brook University;SRI International;SRI International;;SRI International;State University of New York, Stony Brook", "aff_domain": "stonybrook.edu;sri.com;sri.com;;sri.com;stonybrook.edu", "position": "PhD student;Computer Scientist;Researcher;;Principal Scientist;Assistant Professor", "bibtex": "@inproceedings{\nhu2022trigger,\ntitle={Trigger Hunting with a Topological Prior for Trojan Detection},\nauthor={Xiaoling Hu and Xiao Lin and Michael Cogswell and Yi Yao and Susmit Jha and Chao Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TXsjU8BaibT}\n}", "github": "", "project": "", "reviewers": "momo;Tmo4;6VNg;3UGX", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "5;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "40;110;179;62", "wc_summary_review": "52;52;64;61", "wc_main_review": "249;696;287;61", "wc_review": "341;858;530;184", "wc_reply_reviewers": "21;314;0;0", "wc_reply_authors": "313;1119;154;22", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.75, 53.30279073369424 ], "wc_summary_review_avg": [ 57.25, 5.356071321407137 ], "wc_main_review_avg": [ 323.25, 231.5948779658134 ], "wc_review_avg": [ 478.25, 251.15172207253528 ], "wc_reply_reviewers_avg": [ 83.75, 133.21106372970678 ], "wc_reply_authors_avg": [ 402.0, 426.58938102114075 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14905363852634937357&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=TXsjU8BaibT", "email": "stonybrook.edu;sri.com;sri.com;;sri.com;stonybrook.edu", "author_num": 6, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Stony Brook University;SRI International;State University of New York", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.stonybrook.edu;https://www.sri.com;https://www.stonybrook.edu", "aff_unique_abbr": "SBU;SRI;SUNY Stony Brook", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stony Brook", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "TYqb6EXphrr", "title": "Space Time Recurrent Memory Network", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformers have recently been popular for learning and inference in the spatial-temporal domain. However, their performance relies on storing and applying attention to the feature tensor of each frame. Hence, their space and time complexity increase linearly as the sequence length grows, which could be very costly for long videos.\nWe propose a novel visual memory network architecture for the learning and inference problem in the spatial-temporal domain. We maintain a fixed set of memory slots in our memory network and explore different designs to input new information into the memory, combine the information in different memory slots and decide how to discard old information. \nFinally, this architecture is benchmarked on the video object segmentation and video prediction problems. Through the experiments, we show that our memory architecture can achieve competitive results compared to state-of-the-art transformer-based methods while maintaining constant memory capacity independent of sequence length.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/b4c463679384327be0c702c8d3e2f2b9c940aec4.zip", "author": "Hung Nguyen;Chanho Kim;Li Fuxin", "authorids": "~Hung_Nguyen3;~Chanho_Kim2;~Li_Fuxin1", "gender": ";M;M", "homepage": ";;http://web.engr.oregonstate.edu/~lif/", "dblp": ";135/4905.html;03/2783", "google_scholar": "r5VhqGIAAAAJ;xARSfT4AAAAJ;snDpfA0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hung_Nguyen3;~Chanho_Kim2;~Fuxin_Li1", "aff": "Oregon State University;Oregon State University;Apple", "aff_domain": "oregonstate.edu;oregonstate.edu;apple.com", "position": "PhD student;Postdoc;Researcher", "bibtex": "@misc{\nnguyen2022space,\ntitle={Space Time Recurrent Memory Network},\nauthor={Hung Nguyen and Chanho Kim and Li Fuxin},\nyear={2022},\nurl={https://openreview.net/forum?id=TYqb6EXphrr}\n}", "github": "", "project": "", "reviewers": "qTp1;3fWH;ttjH;8Erh", "site": "https://openreview.net/forum?id=TYqb6EXphrr", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;5;5;4", "correctness": "3;2;2;4", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "40;50;106;63", "wc_summary_review": "26;14;22;69", "wc_main_review": "262;349;815;203", "wc_review": "328;413;943;335", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.75, 25.17315037892556 ], "wc_summary_review_avg": [ 32.75, 21.370248009791556 ], "wc_main_review_avg": [ 407.25, 241.07506611012263 ], "wc_review_avg": [ 504.75, 255.21400333837482 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 
0.8703882797784891, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7075572165049446024&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1", "aff_unique_norm": "Oregon State University;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "https://oregonstate.edu;https://www.apple.com", "aff_unique_abbr": "OSU;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Network Augmentation for Tiny Deep Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6878", "id": "TYw3-OlrRm-", "poster": "", "openreview": "https://openreview.net/forum?id=TYw3-OlrRm-", "slides": "https://iclr.cc/virtual/2022/poster/6878", "video": "https://iclr.cc/virtual/2022/poster/6878", "author_site": "Han Cai, Chuang Gan, Ji Lin, Song Han", "tldr": "", "abstract": "We introduce Network Augmentation (NetAug), a new training method for improving the performance of tiny neural networks. Existing regularization techniques (e.g., data augmentation, dropout) have shown much success on large neural networks by adding noise to overcome over-fitting. However, we found these techniques hurt the performance of tiny neural networks. We argue that training tiny models are different from large models: rather than augmenting the data, we should augment the model, since tiny models tend to suffer from under-fitting rather than over-fitting due to limited capacity. To alleviate this issue, NetAug augments the network (reverse dropout) instead of inserting noise into the dataset or the network. It puts the tiny model into larger models and encourages it to work as a sub-model of larger models to get extra supervision, in addition to functioning as an independent model. At test time, only the tiny model is used for inference, incurring zero inference overhead. We demonstrate the effectiveness of NetAug on image classification and object detection. NetAug consistently improves the performance of tiny models, achieving up to 2.2% accuracy improvement on ImageNet. 
On object detection, achieving the same level of performance, NetAug requires 41% fewer MACs on Pascal VOC and 38% fewer MACs on COCO than the baseline.", "keywords": "Tiny Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Han Cai;Chuang Gan;Ji Lin;Song Han", "authorids": "~Han_Cai1;~Chuang_Gan1;~Ji_Lin1;~Song_Han5", "gender": "M;M;M;", "homepage": "http://hancai.ai/;http://people.csail.mit.edu/ganchuang/;http://linji.me;", "dblp": "22/1915;139/6993;02/8200;", "google_scholar": "x-AvvrYAAAAJ;PTeSCbIAAAAJ;dVtzVVAAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Han_Cai1;~Chuang_Gan1;~Ji_Lin1;~Song_Han5", "aff": "Massachusetts Institute of Technology;MIT-IBM Watson AI Lab;Massachusetts Institute of Technology;", "aff_domain": "mit.edu;ibm.com;mit.edu;", "position": "PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\ncai2022network,\ntitle={Network Augmentation for Tiny Deep Learning},\nauthor={Han Cai and Chuang Gan and Ji Lin and Song Han},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TYw3-OlrRm-}\n}", "github": "", "project": "", "reviewers": "XSG2;ME7S;PXn6;co48", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;5;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "96;49;60;84", "wc_summary_review": "56;42;27;89", "wc_main_review": "89;196;667;125", "wc_review": "241;287;754;298", "wc_reply_reviewers": "0;32;393;7", "wc_reply_authors": "678;658;2405;355", "reply_reviewers": "0;1;3;1", "reply_authors": "3;2;8;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 72.25, 18.659782956937093 ], "wc_summary_review_avg": [ 53.5, 22.91833327273168 ], "wc_main_review_avg": [ 269.25, 232.84584492749704 ], "wc_review_avg": [ 395.0, 208.36866367090806 ], "wc_reply_reviewers_avg": [ 108.0, 164.97424041346576 ], "wc_reply_authors_avg": [ 1024.0, 807.5261605669504 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 3.5, 2.692582403567252 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7018888209290577838&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=TYw3-OlrRm-", "email": "mit.edu;ibm.com;mit.edu;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Bridging Recommendation and Marketing via Recurrent Intensity Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6924", "id": "TZeArecH2Nf", "poster": "", "openreview": "https://openreview.net/forum?id=TZeArecH2Nf", "slides": "https://iclr.cc/virtual/2022/poster/6924", "video": "https://iclr.cc/virtual/2022/poster/6924", "author_site": "Yifei Ma, Ge Liu, Anoop Deoras", "tldr": "", "abstract": "This paper studies some under-explored connections between personalized 
recommendation and marketing systems. Obviously, these two systems are different, in two main ways. Firstly, personalized item-recommendation (ItemRec) is user-centric, whereas marketing recommends the best user-state segments (UserRec) on behalf of its item providers. (We treat different temporal states of the same user as separate marketing opportunities.) To overcome this difference, we realize a novel connection to Marked-Temporal Point Processes (MTPPs), where we view both problems as different projections from a unified temporal intensity model for all user-item pairs. Correspondingly, we derive Recurrent Intensity Models (RIMs) to extend from recurrent ItemRec models with minimal changes. The second difference between recommendation and marketing is in the temporal domains where they operate. While recommendation demands immediate responses in real-time, marketing campaigns are often long-term, setting goals to cover a given percentage of all opportunities for a given item in a given period of time. We formulate both considerations into a constrained optimization problem we call online match (OnlnMtch) and derive a solution we call Dual algorithm. Simply put, Dual modifies the real-time ItemRec scores such that the marketing constraints can be met with least compromises in user-centric utilities. Finally, our connections between recommendation and marketing may lead to novel applications. We run experiments where we use marketing as an alternative to cold-start item exploration, by setting a minimal-exposure constraint for every item in the audience base. Our experiments are available at \\url{https://github.com/awslabs/recurrent-intensity-model-experiments}", "keywords": "Recommender systems;marketing;push notifications;temporal point processes;sequence models", "primary_area": "", "supplementary_material": "/attachment/74abd4565c71d116b82834c84c70b72962ac9834.zip", "author": "Yifei Ma;Ge Liu;Anoop Deoras", "authorids": "~Yifei_Ma1;~Ge_Liu2;~Anoop_Deoras1", "gender": ";F;M", "homepage": "http://yma.io;http://www.mit.edu/~geliu/;", "dblp": ";;55/8761", "google_scholar": "ZVMcyxYAAAAJ;P6EahzcAAAAJ;QF_rhCIAAAAJ", "orcid": ";0000-0001-9383-5186;", "linkedin": "yifei-ma-48503620;;anoopdeoras/", "or_profile": "~Yifei_Ma1;~Ge_Liu2;~Anoop_Deoras1", "aff": "Amazon;Amazon AWS AI;Amazon", "aff_domain": "amazon.com;amazon.com;amazon.com", "position": "Applied Scientist;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nma2022bridging,\ntitle={Bridging Recommendation and Marketing via Recurrent Intensity Modeling},\nauthor={Yifei Ma and Ge Liu and Anoop Deoras},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TZeArecH2Nf}\n}", "github": "", "project": "", "reviewers": "CMef;cWJc;9NaL", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "3;3;4", "technical_novelty": "2;3;4", "empirical_novelty": "2;3;2", "wc_summary_paper": "42;74;134", "wc_summary_review": "59;114;30", "wc_main_review": "197;192;160", "wc_review": "298;380;324", "wc_reply_reviewers": "57;0;0", "wc_reply_authors": "1182;267;107", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 
83.33333333333333, 38.134265722866914 ], "wc_summary_review_avg": [ 67.66666666666667, 34.83612429010374 ], "wc_main_review_avg": [ 183.0, 16.391054470858997 ], "wc_review_avg": [ 334.0, 34.215006454283575 ], "wc_reply_reviewers_avg": [ 19.0, 26.870057685088806 ], "wc_reply_authors_avg": [ 518.6666666666666, 473.57388253811274 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.944911182523068, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5079057721276604154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=TZeArecH2Nf", "email": "amazon.com;amazon.com;amazon.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon.com, Inc.", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "T_8wHvOkEi9", "title": "Self-Organized Polynomial-time Coordination Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Coordination graph is a promising approach to model agent collaboration in multi-agent reinforcement learning. It factorizes a large multi-agent system into a suite of overlapping groups that represent the underlying coordination dependencies. One critical challenge in this paradigm is the complexity of computing maximum-value actions for a graph-based value factorization. It refers to the decentralized constraint optimization problem (DCOP), which, along with its constant-ratio approximation, is NP-hard. To bypass this fundamental hardness, this paper proposes a novel method, named Self-Organized Polynomial-time Coordination Graphs (SOP-CG), which uses structured graph classes to guarantee the optimality of the induced DCOPs with sufficient function expressiveness. We extend the graph topology to be state-dependent, formulate the graph selection as an imaginary agent, and finally derive an end-to-end learning paradigm from the unified Bellman optimality equation. 
In experiments, we show that our approach learns interpretable graph topologies, induces effective coordination, and improves performance across a variety of cooperative multi-agent tasks.", "keywords": "Multi-Agent Reinforcement Learning;Coordination Graphs;Polynomial-time DCOP", "primary_area": "", "supplementary_material": "/attachment/b963d3c08714f4fb069ab677256e3f78eb7fbd82.zip", "author": "Weijun Dong;Qianlan Yang;Zhizhou Ren;Jianhao Wang;Tonghan Wang;Chongjie Zhang", "authorids": "~Weijun_Dong1;~Qianlan_Yang1;~Zhizhou_Ren1;~Jianhao_Wang1;~Tonghan_Wang1;~Chongjie_Zhang1", "gender": "M;M;M;M;M;", "homepage": "https://github.com/dwjshift;https://github.com/yanQval;;http://group.iiis.tsinghua.edu.cn/~milab/;https://tonghanwang.github.io/;", "dblp": ";294/4952;https://dblp.uni-trier.de/pid/239/5714.html;https://dblp.uni-trier.de/pid/239/5945;175/6039-1.html;29/6693", "google_scholar": ";iV5nuc4AAAAJ;xgpMeDgAAAAJ;;-AR1yc4AAAAJ;LjxqXycAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Weijun_Dong1;~Qianlan_Yang1;~Zhizhou_Ren1;~Jianhao_Wang1;~Tonghan_Wang1;~Chongjie_Zhang1", "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University;Tsinghua University;University of Illinois, Urbana Champaign;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;illinois.edu;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Undergrad student;Undergrad student;PhD student;PhD student;MS student;Assistant Professor", "bibtex": "@misc{\ndong2022selforganized,\ntitle={Self-Organized Polynomial-time Coordination Graphs},\nauthor={Weijun Dong and Qianlan Yang and Zhizhou Ren and Jianhao Wang and Tonghan Wang and Chongjie Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=T_8wHvOkEi9}\n}", "github": "", "project": "", "reviewers": "xmMA;Jx51;h3B4", "site": "https://openreview.net/forum?id=T_8wHvOkEi9", "pdf_size": 0, "recommendation": "3;3;8", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;0;3", "wc_summary_paper": "107;40;48", "wc_summary_review": "23;57;20", "wc_main_review": "206;451;250", "wc_review": "336;548;318", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1004;2529;1190", "reply_reviewers": "0;0;0", "reply_authors": "2;5;2", "recommendation_avg": [ 4.666666666666667, 2.357022603955158 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 65.0, 29.87752778706208 ], "wc_summary_review_avg": [ 33.333333333333336, 16.779617264870957 ], "wc_main_review_avg": [ 302.3333333333333, 106.64687316351828 ], "wc_review_avg": [ 400.6666666666667, 104.43924336931764 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1574.3333333333333, 679.308635468608 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10295867697115976866&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Tsinghua University;University of Illinois Urbana-Champaign", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;", 
"aff_unique_url": "https://www.tsinghua.edu.cn;https://illinois.edu", "aff_unique_abbr": "Tsinghua;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "RegionViT: Regional-to-Local Attention for Vision Transformers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6345", "id": "T__V3uLix7V", "poster": "", "openreview": "https://openreview.net/forum?id=T__V3uLix7V", "slides": "https://iclr.cc/virtual/2022/poster/6345", "video": "https://iclr.cc/virtual/2022/poster/6345", "author_site": "Chun-Fu (Richard) Chen, Rameswar Panda, Quanfu Fan", "tldr": "", "abstract": "Vision transformer (ViT) has recently shown its strong capability in achieving comparable results to convolutional neural networks (CNNs) on image classification. However, vanilla ViT simply inherits the same architecture from the natural language processing directly, which is often not optimized for vision applications. Motivated by this, in this paper, we propose a new architecture that adopts the pyramid structure and employ novel regional-to-local attention rather than global self-attention in vision transformers. More specifically, our model first generates regional tokens and local tokens from an image with different patch sizes, where each regional token is associated with a set of local tokens based on the spatial location. The regional-to-local attention includes two steps: first, the regional self-attention extracts global information among all regional tokens and then the local self-attention exchanges the information among one regional token and the associated local tokens via self-attention. Therefore, even though local self-attention confines the scope in a local region but it can still receive global information.\nExtensive experiments on four vision tasks, including image classification, object and keypoint detection, semantics segmentation and action recognition, show that our approach outperforms or is on par with state-of-the-art ViT variants including many concurrent works. 
Our source codes and models are available at \\url{https://github.com/IBM/RegionViT}.", "keywords": "vision transformer;image recognition;multi-scale feature", "primary_area": "", "supplementary_material": "/attachment/af4a70e3fc07b53c39dea613593d6634e7c9c517.zip", "author": "Chun-Fu Chen;Rameswar Panda;Quanfu Fan", "authorids": "~Chun-Fu_Chen1;~Rameswar_Panda1;~Quanfu_Fan1", "gender": "M;M;M", "homepage": ";https://rpand002.github.io/;", "dblp": "48/915;126/0986;66/3950", "google_scholar": "9gqd5cYAAAAJ;_ySuu6gAAAAJ;kCxHiwUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chun-Fu_Chen1;~Rameswar_Panda1;~Quanfu_Fan1", "aff": "JPMorganChase, GTAR;MIT-IBM Watson AI Lab;MIT-IBM Watson AI Lab", "aff_domain": "jpmchase.com;ibm.com;us.ibm.com", "position": "Executive Director;Research Scientist;Researcher", "bibtex": "@inproceedings{\nchen2022regionvit,\ntitle={RegionViT: Regional-to-Local Attention for Vision Transformers},\nauthor={Chun-Fu Chen and Rameswar Panda and Quanfu Fan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=T__V3uLix7V}\n}", "github": "", "project": "", "reviewers": "rREV;1u9G;iQks;YwKN", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "32;142;54;122", "wc_summary_review": "45;98;38;82", "wc_main_review": "75;786;57;227", "wc_review": "152;1026;149;431", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "306;651;172;1022", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 87.5, 45.724719791377616 ], "wc_summary_review_avg": [ 65.75, 25.02373872945448 ], "wc_main_review_avg": [ 286.25, 295.99102604639893 ], "wc_review_avg": [ 439.5, 357.4566407272356 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 537.75, 329.69863132867266 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 256, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17393879915811894634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=T__V3uLix7V", "email": "jpmchase.com;ibm.com;us.ibm.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "JPMorgan Chase;Massachusetts Institute of Technology", "aff_unique_dep": "Global Technology, Analytics, and Research (GTAR);IBM Watson AI Lab", "aff_unique_url": "https://www.jpmorganchase.com;https://www.mitibmwatsonailab.org", "aff_unique_abbr": "JPM;MIT-IBM AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "T_p1vd88T87", "title": "Neural Implicit Representations for Physical Parameter Inference from a Single Video", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks have recently been used to model the dynamics of diverse physical systems. While existing methods achieve impressive results, they are limited by their strong demand for training data and their weak generalization abilities. 
To overcome these limitations, in this work we propose to combine neural implicit representations for appearance modeling with neural ordinary differential equations (ODEs) in order to obtain interpretable physical models directly from visual observations. Our proposed model combines several unique advantages: (i) It is trained from a single video, and thus overcomes the need for large training datasets. (ii) The use of neural implicit representation enables the processing of high-resolution videos and the synthesis of photo-realistic imagery. (iii) The embedded neural ODE has a known parametric form that allows for the identification of interpretable physical parameters, and (iv) long-term prediction in state space. (v) Furthermore, the photo-realistic rendering of novel scenes with modified physical parameters becomes possible. ", "keywords": "neural implicit representations;physics learning;video interpretation;physical parameter estimation", "primary_area": "", "supplementary_material": "/attachment/d74ee6a9c3212172b795cbf298c23a6881e160a3.zip", "author": "Florian Hofherr;Lukas Koestler;Florian Bernard;Daniel Cremers", "authorids": "~Florian_Hofherr1;~Lukas_Koestler1;~Florian_Bernard3;~Daniel_Cremers1", "gender": "M;M;;M", "homepage": "https://florianhofherr.github.io/;https://lukaskoestler.com;https://florianbernard.net;https://vision.in.tum.de/members/cremers", "dblp": "203/8712;276/1000;134/8112;c/DanielCremers", "google_scholar": "https://scholar.google.com/citations?hl=de;vepdJTQAAAAJ;https://scholar.google.de/citations?user=9GrQ2KYAAAAJ;cXQciMEAAAAJ", "orcid": "0000-0002-8688-3056;;;", "linkedin": "florian-hofherr/;;;", "or_profile": "~Florian_Hofherr1;~Lukas_Koestler1;~Florian_Bernard3;~Daniel_Cremers1", "aff": "Technical University Munich;Technical University Munich;University of Bonn;Technical University Munich", "aff_domain": "tum.de;tum.de;uni-bonn.de;tum.de", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nhofherr2022neural,\ntitle={Neural Implicit Representations for Physical Parameter Inference from a Single Video},\nauthor={Florian Hofherr and Lukas Koestler and Florian Bernard and Daniel Cremers},\nyear={2022},\nurl={https://openreview.net/forum?id=T_p1vd88T87}\n}", "github": "", "project": "", "reviewers": "SZKC;s2K9;PZv8;fvJG;LTRa", "site": "https://openreview.net/forum?id=T_p1vd88T87", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "3;4;3;3;2", "correctness": "3;2;3;3;3", "technical_novelty": "2;2;2;3;4", "empirical_novelty": "2;1;2;3;3", "wc_summary_paper": "65;51;45;101;121", "wc_summary_review": "24;21;35;18;156", "wc_main_review": "35;411;200;184;526", "wc_review": "124;483;280;303;803", "wc_reply_reviewers": "0;70;0;11;251", "wc_reply_authors": "179;684;484;263;390", "reply_reviewers": "0;1;0;1;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 76.6, 29.513386793114748 ], "wc_summary_review_avg": [ 50.8, 52.912758385856236 ], "wc_main_review_avg": [ 271.2, 174.8889933643624 ], "wc_review_avg": [ 398.6, 232.0660250876892 ], "wc_reply_reviewers_avg": [ 66.4, 95.89285687682894 ], "wc_reply_authors_avg": [ 400.0, 176.3076855953818 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 
4, 0 ], "corr_recommendation_confidence": -0.7499999999999999, "corr_recommendation_correctness": 0.39528470752104744, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10442616381094332414&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Technical University of Munich;University of Bonn", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.uni-bonn.de/", "aff_unique_abbr": "TUM;UBonn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "T_p2GaXuGeA", "title": "Local Calibration: Metrics and Recalibration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Probabilistic classifiers output confidence scores along with their predictions, and these confidence scores should be calibrated, i.e., they should reflect the reliability of the prediction. Confidence scores that minimize standard metrics such as the expected calibration error (ECE) accurately measure the reliability on average across the entire population. However, it is in general impossible to measure the reliability of an individual prediction. In this work, we propose the local calibration error (LCE) to span the gap between average and individual reliability. For each individual prediction, the LCE measures the average reliability of a set of similar predictions, where similarity is quantified by a kernel function on a pretrained feature space and by a binning scheme over predicted model confidences. We show theoretically that the LCE can be estimated sample-efficiently from data, and empirically find that it reveals miscalibration modes that are more fine-grained than the ECE can detect. Our key result is a novel local recalibration method LoRe, to improve confidence scores for individual predictions and decrease the LCE. 
Experimentally, we show that our recalibration method produces more accurate confidence scores, which improves decision making and fairness on classification tasks using both image and tabular data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rachel Luo;Aadyot Bhatnagar;Yu Bai;Shengjia Zhao;Huan Wang;Caiming Xiong;Silvio Savarese;Stefano Ermon;Edward Schmerling;Marco Pavone", "authorids": "~Rachel_Luo1;~Aadyot_Bhatnagar1;~Yu_Bai1;~Shengjia_Zhao1;~Huan_Wang1;~Caiming_Xiong1;~Silvio_Savarese1;~Stefano_Ermon1;~Edward_Schmerling1;~Marco_Pavone1", "gender": "F;M;;M;M;M;M;M;M;M", "homepage": "https://rsluo.github.io/;https://linkedin.com/in/abhatnagar6;https://yubai.org;http://sjzhao.me;http://www.cs.yale.edu/homes/wang-huan/;http://cmxiong.com/;;http://cs.stanford.edu/~ermon/;;https://web.stanford.edu/~pavone/", "dblp": "182/0443;;03/6325-17.html;173/5122;70/6155-16.html;80/7282;50/3578;47/8135;143/7326;91/3382-1.html", "google_scholar": "9TPpYBMAAAAJ;o0qh7IUAAAAJ;owqhKD8AAAAJ;;7NpTttkAAAAJ;vaSdahkAAAAJ;ImpbxLsAAAAJ;;b4Kj6MIAAAAJ;RhOpyXcAAAAJ", "orcid": ";;;;;;;;;", "linkedin": ";;;;huanwangyale/;caiming-xiong-150a1417;;;;", "or_profile": "~Rachel_Luo1;~Aadyot_Bhatnagar1;~Yu_Bai1;~Shengjia_Zhao1;~Huan_Wang1;~Caiming_Xiong1;~Silvio_Savarese1;~Stefano_Ermon1;~Edward_Schmerling1;~Marco_Pavone1", "aff": "Stanford University;Salesforce;Salesforce Research;Stanford University;Salesforce.com;Salesforce Research;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;salesforce.com;salesforce.com;stanford.edu;salesforce.com;salesforce.com;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Researcher;Research Scientist;PhD student;Researcher;Research Scientist;Adjunct Professor;Assistant Professor;Researcher;Associate Professor", "bibtex": "@misc{\nluo2022local,\ntitle={Local Calibration: Metrics and Recalibration },\nauthor={Rachel Luo and Aadyot Bhatnagar and Yu Bai and Shengjia Zhao and Huan Wang and Caiming Xiong and Silvio Savarese and Stefano Ermon and Edward Schmerling and Marco Pavone},\nyear={2022},\nurl={https://openreview.net/forum?id=T_p2GaXuGeA}\n}", "github": "", "project": "", "reviewers": "784d;Es9F;76PS;fisU;hPxn", "site": "https://openreview.net/forum?id=T_p2GaXuGeA", "pdf_size": 0, "recommendation": "5;5;5;5;6", "confidence": "5;4;4;5;4", "correctness": "3;3;4;3;4", "technical_novelty": "2;2;2;2;4", "empirical_novelty": "2;2;2;2;4", "wc_summary_paper": "44;55;110;56;54", "wc_summary_review": "44;60;58;30;41", "wc_main_review": "228;397;502;301;442", "wc_review": "316;512;670;387;537", "wc_reply_reviewers": "114;111;362;0;172", "wc_reply_authors": "641;281;861;802;1033", "reply_reviewers": "1;1;1;0;1", "reply_authors": "1;1;2;1;2", "recommendation_avg": [ 5.2, 0.39999999999999997 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 63.8, 23.498085028359228 ], "wc_summary_review_avg": [ 46.6, 11.16422858956229 ], "wc_main_review_avg": [ 374.0, 98.16516693817618 ], "wc_review_avg": [ 484.4, 123.14316871024556 ], "wc_reply_reviewers_avg": [ 151.8, 118.93594914911135 ], "wc_reply_authors_avg": [ 723.6, 254.35062413919886 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.408248290463863, 
"corr_recommendation_correctness": 0.6123724356957947, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17343114239658011497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;0;1;1;0;0;0;0", "aff_unique_norm": "Stanford University;Salesforce", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.salesforce.com", "aff_unique_abbr": "Stanford;Salesforce", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "T_uSMSAlgoy", "title": "On the Latent Holes \ud83e\uddc0 of VAEs for Text Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we provide the first focused study on the discontinuities (aka. holes) in the latent space of Variational Auto-Encoders (VAEs), a phenomenon which has been shown to have a detrimental effect on model capacity. When investigating la- tent holes, existing works are exclusively centred around the encoder network and they merely explore the existence of holes. We tackle these limitations by proposing a highly efficient Tree-based Decoder-Centric (TDC) algorithm for latent hole identification, with a focal point on the text domain. In contrast to past studies, our approach pays attention to the decoder network, as a decoder has a direct impact on the model\u2019s output quality. Furthermore, we provide, for the first time, in-depth empirical analysis of the latent hole phenomenon, investigating several important aspects such as how the holes impact VAE algorithms\u2019 performance on text generation, and how the holes are distributed in the latent space.", "keywords": "Latent Discontinuity;Variational Auto-Encoder;Natural Language Generation;Generative Model", "primary_area": "", "supplementary_material": "", "author": "Ruizhe Li;Xutan Peng;Chenghua Lin", "authorids": "~Ruizhe_Li2;~Xutan_Peng2;~Chenghua_Lin1", "gender": "M;;", "homepage": "https://www.ruizhe.space/;;", "dblp": "14/10102-1;;", "google_scholar": "f_5wLsUAAAAJ;;", "orcid": "0000-0003-2512-845X;;", "linkedin": "ruizhe-li-3490b4b3/;;", "or_profile": "~Ruizhe_Li2;~Xutan_Peng2;~Chenghua_Lin1", "aff": "University College London;;", "aff_domain": "ucl.ac.uk;;", "position": "Postdoc;;", "bibtex": "@misc{\nli2022on,\ntitle={On the Latent Holes \ud83e\uddc0 of {VAE}s for Text Generation},\nauthor={Ruizhe Li and Xutan Peng and Chenghua Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=T_uSMSAlgoy}\n}", "github": "", "project": "", "reviewers": "6sDa;PV7H;gj1W", "site": "https://openreview.net/forum?id=T_uSMSAlgoy", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;3;3", "correctness": "2;2;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "60;160;116", "wc_summary_review": "55;79;17", "wc_main_review": "653;560;238", "wc_review": "768;799;371", "wc_reply_reviewers": "0;60;0", "wc_reply_authors": "21;138;18", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 112.0, 40.92269134192748 ], "wc_summary_review_avg": [ 50.333333333333336, 25.525586292102197 ], "wc_main_review_avg": [ 
483.6666666666667, 177.81326034791545 ], "wc_review_avg": [ 646.0, 194.86576576368324 ], "wc_reply_reviewers_avg": [ 20.0, 28.284271247461902 ], "wc_reply_authors_avg": [ 59.0, 55.87486017879597 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11106664463603083358&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Missingness Bias in Model Debugging", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6930", "id": "Te5ytkqsnl", "poster": "", "openreview": "https://openreview.net/forum?id=Te5ytkqsnl", "slides": "https://iclr.cc/virtual/2022/poster/6930", "video": "https://iclr.cc/virtual/2022/poster/6930", "author_site": "Saachi Jain, Hadi Salman, Eric Wong, Pengchuan Zhang, Vibhav Vineet, Sai Vemprala, Aleksander Madry", "tldr": "", "abstract": "Missingness, or the absence of features from an input, is a concept fundamental to many model debugging tools. However, in computer vision, pixels cannot simply be removed from an image. One thus tends to resort to heuristics such as blacking out pixels, which may in turn introduce bias into the debugging process. We study such biases and, in particular, show how transformer-based architectures can enable a more natural implementation of missingness, which side-steps these issues and improves the reliability of model debugging in practice.\n", "keywords": "model debugging;vision transformers;missingness", "primary_area": "", "supplementary_material": "", "author": "Saachi Jain;Hadi Salman;Eric Wong;Pengchuan Zhang;Vibhav Vineet;Sai Vemprala;Aleksander Madry", "authorids": "~Saachi_Jain1;~Hadi_Salman1;~Eric_Wong1;~Pengchuan_Zhang1;~Vibhav_Vineet5;~Sai_Vemprala1;~Aleksander_Madry1", "gender": "F;M;M;M;;M;M", "homepage": "http://people.csail.mit.edu/saachij/;https://hadisalman.com/;http://riceric22.github.io/;https://pzzhang.github.io/pzzhang/;;https://www.saihv.com;https://people.csail.mit.edu/madry/", "dblp": "227/2617;192/3204;64/1811-1.html;;;190/8334.html;67/2454", "google_scholar": "6hsn3EYAAAAJ;Kr8JjF0AAAAJ;pWnTMRkAAAAJ;3VZ_E64AAAAJ;;PnaHFhUAAAAJ;SupjsEUAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Saachi_Jain1;~Hadi_Salman1;~Eric_Wong1;~Pengchuan_Zhang1;~Vibhav_Vineet5;~Sai_Vemprala1;~Aleksander_Madry1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Microsoft Research;;Microsoft;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;research.microsoft.com;;microsoft.com;mit.edu", "position": "PhD student;PhD Student;Postdoc;Researcher;;Senior Researcher;Professor", "bibtex": "@inproceedings{\njain2022missingness,\ntitle={Missingness Bias in Model Debugging},\nauthor={Saachi Jain and Hadi Salman and Eric Wong and Pengchuan Zhang and Vibhav Vineet and Sai Vemprala and Aleksander Madry},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Te5ytkqsnl}\n}", "github": "", "project": "", "reviewers": "hiFn;SQBL;sYZg", 
"pdf_size": 0, "recommendation": "5;5;6", "confidence": "5;4;4", "correctness": "4;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "246;117;64", "wc_summary_review": "105;60;92", "wc_main_review": "529;453;291", "wc_review": "880;630;447", "wc_reply_reviewers": "0;523;0", "wc_reply_authors": "414;1331;248", "reply_reviewers": "0;3;0", "reply_authors": "1;3;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 142.33333333333334, 76.43006970790721 ], "wc_summary_review_avg": [ 85.66666666666667, 18.909139471577113 ], "wc_main_review_avg": [ 424.3333333333333, 99.25500267268929 ], "wc_review_avg": [ 652.3333333333334, 177.47550691730834 ], "wc_reply_reviewers_avg": [ 174.33333333333334, 246.54456437370956 ], "wc_reply_authors_avg": [ 664.3333333333334, 476.25086759909306 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2038886342850944148&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Te5ytkqsnl", "email": "mit.edu;mit.edu;mit.edu;research.microsoft.com;;microsoft.com;mit.edu", "author_num": 7, "aff_unique_index": "0;0;0;1;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MIT;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "SURF: Semi-supervised Reward Learning with Data Augmentation for Feedback-efficient Preference-based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6054", "id": "TfhfZLQ2EJO", "poster": "", "openreview": "https://openreview.net/forum?id=TfhfZLQ2EJO", "slides": "https://iclr.cc/virtual/2022/poster/6054", "video": "https://iclr.cc/virtual/2022/poster/6054", "author_site": "Jongjin Park, Younggyo Seo, Jinwoo Shin, Honglak Lee, Pieter Abbeel, Kimin Lee", "tldr": "", "abstract": "Preference-based reinforcement learning (RL) has shown potential for teaching agents to perform the target tasks without a costly, pre-defined reward function by learning the reward with a supervisor\u2019s preference between the two agent behaviors. However, preference-based learning often requires a large amount of human feedback, making it difficult to apply this approach to various applications. This data-efficiency problem, on the other hand, has been typically addressed by using unlabeled samples or data augmentation techniques in the context of supervised learning. Motivated by the recent success of these approaches, we present SURF, a semi-supervised reward learning framework that utilizes a large amount of unlabeled samples with data augmentation. 
In order to leverage unlabeled samples for reward learning, we infer pseudo-labels of the unlabeled samples based on the confidence of the preference predictor. To further improve the label-efficiency of reward learning, we introduce a new data augmentation that temporally crops consecutive subsequences from the original behaviors. Our experiments demonstrate that our approach significantly improves the feedback-efficiency of the state-of-the-art preference-based method on a variety of locomotion and robotic manipulation tasks.", "keywords": "preference-based reinforcement learning;human-in-the-loop reinforcement learning;deep reinforcement learning;semi-supervised learning", "primary_area": "", "supplementary_material": "/attachment/ea8664f4948105bfdacb400acbef1e1e97f21a05.zip", "author": "Jongjin Park;Younggyo Seo;Jinwoo Shin;Honglak Lee;Pieter Abbeel;Kimin Lee", "authorids": "~Jongjin_Park1;~Younggyo_Seo1;~Jinwoo_Shin1;~Honglak_Lee2;~Pieter_Abbeel2;~Kimin_Lee1", "gender": "M;M;M;M;M;M", "homepage": ";https://younggyo.me/;https://sites.google.com/site/mijirim/;https://people.eecs.berkeley.edu/~pabbeel/;https://sites.google.com/view/kiminlee;http://web.eecs.umich.edu/~honglak", "dblp": "30/1783;265/5586;31/7062;;183/6849;58/2562", "google_scholar": "F9DGEgEAAAAJ;tI1-YwIAAAAJ;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;92M8xv4AAAAJ;fmSHtE8AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Jongjin_Park1;~Younggyo_Seo1;~Jinwoo_Shin1;~Pieter_Abbeel2;~Kimin_Lee1;~Honglak_Lee1", "aff": "Korea Advanced Institute of Science & Technology;University of California, Berkeley;Korea Advanced Institute of Science & Technology;Covariant;University of California, Berkeley;University of Michigan", "aff_domain": "kaist.ac.kr;berkeley.edu;kaist.ac.kr;covariant.ai;berkeley.edu;umich.edu", "position": "PhD student;Intern;Associate Professor;Founder;Postdoc;Associate Professor", "bibtex": "@inproceedings{\npark2022surf,\ntitle={{SURF}: Semi-supervised Reward Learning with Data Augmentation for Feedback-efficient Preference-based Reinforcement Learning},\nauthor={Jongjin Park and Younggyo Seo and Jinwoo Shin and Honglak Lee and Pieter Abbeel and Kimin Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TfhfZLQ2EJO}\n}", "github": "", "project": "", "reviewers": "JjVp;cS2w;qrsj", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;4", "correctness": "3;4;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;2", "wc_summary_paper": "66;73;70", "wc_summary_review": "59;53;49", "wc_main_review": "375;249;274", "wc_review": "500;375;393", "wc_reply_reviewers": "102;19;0", "wc_reply_authors": "800;547;759", "reply_reviewers": "1;1;0", "reply_authors": "3;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 69.66666666666667, 2.8674417556808756 ], "wc_summary_review_avg": [ 53.666666666666664, 4.109609335312651 ], "wc_main_review_avg": [ 299.3333333333333, 54.46915538989832 ], "wc_review_avg": [ 422.6666666666667, 55.174470747096635 ], "wc_reply_reviewers_avg": [ 40.333333333333336, 44.28945196720722 ], "wc_reply_authors_avg": [ 702.0, 110.87229891486271 ], 
"reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=573650694598627366&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=TfhfZLQ2EJO", "email": "kaist.ac.kr;berkeley.edu;kaist.ac.kr;covariant.ai;berkeley.edu;umich.edu", "author_num": 6, "aff_unique_index": "0;1;0;2;1;3", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of California, Berkeley;Covariant;University of Michigan", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.berkeley.edu;;https://www.umich.edu", "aff_unique_abbr": "KAIST;UC Berkeley;;UM", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "South Korea;United States;" }, { "id": "TfwF7pqwqdm", "title": "On the exploitative behavior of adversarial training against adversarial attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial attacks have been developed as intentionally designed perturbations added to the inputs in order to fool deep neural network classifiers. Adversarial training has been shown to be an effective approach to improving the robustness of the classifiers against such attacks especially in the white-box setting. In this work, we demonstrate that some geometric consequences of adversarial training on the decision boundary of deep networks give an edge to certain types of black-box attacks. In particular, we introduce a highly parallelizable black-box attack against the classifiers equipped with an $\\ell_2$ norm similarity detector, which exploits the low mean curvature of the decision boundary. We use this black-box attack to demonstrate that adversarially-trained networks might be easier to fool in certain scenarios. 
Moreover, we define a metric called robustness gain to show that while adversarial training is an effective method to improve the robustness in the white-box attack setting, it may not provide such a good robustness gain against the more realistic decision-based black-box attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ali Rahmati;Seyed-Mohsen Moosavi-Dezfooli;Huaiyu Dai", "authorids": "~Ali_Rahmati1;~Seyed-Mohsen_Moosavi-Dezfooli1;~Huaiyu_Dai1", "gender": "M;M;M", "homepage": "https://thisisalirah.github.io/;;https://ece.ncsu.edu/people/hdai/", "dblp": ";;09/5360.html", "google_scholar": ";https://scholar.google.ch/citations?user=qosS83IAAAAJ;HOSH65oAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ali_Rahmati1;~Seyed-Mohsen_Moosavi-Dezfooli1;~Huaiyu_Dai2", "aff": ";Swiss Federal Institute of Technology;North Carolina State University", "aff_domain": ";ethz.ch;ncsu.edu", "position": ";Postdoc;Full Professor", "bibtex": "@misc{\nrahmati2022on,\ntitle={On the exploitative behavior of adversarial training against adversarial attacks},\nauthor={Ali Rahmati and Seyed-Mohsen Moosavi-Dezfooli and Huaiyu Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=TfwF7pqwqdm}\n}", "github": "", "project": "", "reviewers": "ozXg;Q3xq;jHBK;knUq;S9ch", "site": "https://openreview.net/forum?id=TfwF7pqwqdm", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;4;4;4;3", "correctness": "2;2;2;3;3", "technical_novelty": "2;2;3;3;2", "empirical_novelty": "2;2;3;3;2", "wc_summary_paper": "53;54;259;62;52", "wc_summary_review": "21;57;89;57;8", "wc_main_review": "255;469;623;529;196", "wc_review": "329;580;971;648;256", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 96.0, 81.57695753091065 ], "wc_summary_review_avg": [ 46.4, 28.841636569376575 ], "wc_main_review_avg": [ 414.4, 162.93262411193163 ], "wc_review_avg": [ 556.8, 254.0829785719618 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:S6YMXS4vlsAJ:scholar.google.com/&scioq=On+the+exploitative+behavior+of+adversarial+training+against+adversarial+attacks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;North Carolina State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ncsu.edu", "aff_unique_abbr": "ETH Zurich;NCSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;United States" }, { "id": "Ti2i204vZON", "title": "Learning Representations for Pixel-based Control: What Matters and Why?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning representations for pixel-based control has garnered significant attention recently in reinforcement learning. 
A wide range of methods have been proposed to enable efficient learning, leading to sample complexities similar to those in the full state setting. However, moving beyond carefully curated pixel data sets (centered crop, appropriate lighting, clear background, etc.) remains challenging. In this paper, we adopt a more difficult setting, incorporating background distractors, as a first step towards addressing this challenge. We present a simple baseline approach that can learn meaningful representations with no metric-based learning, no data augmentations, no world-model learning, and no contrastive learning. We then analyze when and why previously proposed methods are likely to fail or reduce to the same performance as the baseline in this harder setting and why we should think carefully about extending such methods beyond the well-curated environments. Our results show that finer categorization of benchmarks on the basis of characteristics like the density of reward, planning horizon of the problem, presence of task-irrelevant components, etc., is crucial in evaluating algorithms. Based on these observations, we propose different metrics to consider when evaluating an algorithm on benchmark tasks. We hope such a data-centric view can motivate researchers to rethink representation learning when investigating how to best apply RL to real-world tasks.", "keywords": "Reinforcement Learning;Representation Learning;Pixel-based Control", "primary_area": "", "supplementary_material": "/attachment/d1da387a7bbb5d3e7164bf06995a83d7100e638c.zip", "author": "Manan Tomar;Utkarsh Aashu Mishra;Amy Zhang;Matthew E. Taylor", "authorids": "~Manan_Tomar1;~Utkarsh_Aashu_Mishra2;~Amy_Zhang1;~Matthew_E._Taylor2", "gender": "M;M;F;M", "homepage": "https://manantomar.github.io/;http://utkarshmishra04.github.io/;;https://irll.ca", "dblp": "241/6227;274/2706;43/2754;46/4287.html", "google_scholar": ";10HbT44AAAAJ;;edQgLXcAAAAJ", "orcid": ";0000-0002-4977-5187;;0000-0001-8946-0211", "linkedin": ";utkarshamishra/;;", "or_profile": "~Manan_Tomar1;~Utkarsh_Aashu_Mishra2;~Amy_Zhang2;~Matthew_Taylor1", "aff": "Microsoft;Indian Institute of Science;University of California, Berkeley;Washington State University, Pullman", "aff_domain": "microsoft.com;iisc.ac.in;berkeley.edu;wsu.edu", "position": "Intern;Intern;Postdoc;Adjunct Professor", "bibtex": "@misc{\ntomar2022learning,\ntitle={Learning Representations for Pixel-based Control: What Matters and Why?},\nauthor={Manan Tomar and Utkarsh Aashu Mishra and Amy Zhang and Matthew E. 
Taylor},\nyear={2022},\nurl={https://openreview.net/forum?id=Ti2i204vZON}\n}", "github": "", "project": "", "reviewers": "61FY;KvBB;RjsF;7euW", "site": "https://openreview.net/forum?id=Ti2i204vZON", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "77;95;56;194", "wc_summary_review": "96;44;129;89", "wc_main_review": "513;378;689;768", "wc_review": "686;517;874;1051", "wc_reply_reviewers": "631;0;49;191", "wc_reply_authors": "1782;590;555;767", "reply_reviewers": "2;0;1;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 105.5, 52.92683629313205 ], "wc_summary_review_avg": [ 89.5, 30.30264014900352 ], "wc_main_review_avg": [ 587.0, 151.92267770152026 ], "wc_review_avg": [ 782.0, 200.16618095972157 ], "wc_reply_reviewers_avg": [ 217.75, 248.68793195489 ], "wc_reply_authors_avg": [ 923.5, 502.12772279570464 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5508316500824510645&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Microsoft;Indian Institute of Science;University of California, Berkeley;Washington State University", "aff_unique_dep": "Microsoft Corporation;;;", "aff_unique_url": "https://www.microsoft.com;https://www.iisc.ac.in;https://www.berkeley.edu;https://wsu.edu", "aff_unique_abbr": "Microsoft;IISc;UC Berkeley;WSU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Pullman", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;India" }, { "id": "TlPNpabaoV", "title": "On the Efficiency of Deep Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The efficiency of neural networks is very important in large-scale deployment scenarios such as mobile applications, internet of things, and edge computing. For given performance requirement, an efficient neural network should use the simplest network architecture with minimal number of parameters and connections. In this paper, we discuss several key issues and a new procedure for obtaining efficient networks that minimize total number of parameters and computation requirement. Our first contribution is identifying and analyzing several key components in training efficient networks with the backpropagation (BP) algorithm: 1) softmax normalization in output layers may be one major cause of parameter explosion; 2) using log likelihood ratio (LLR) representation in output layers can reduce overfitting; 3) weight decaying and structural regularization can effectively reduce overfitting when ReLU activation is used. The second contribution is discovering that a well-trained network without overfitting can be effectively pruned using a simple snapshot-based procedure -- after pruning unimportant weights and connections, simply adjust remaining non-weight parameters using the BP algorithm. The snapshot-based pruning method could also be used to evaluate and analyze the efficiency of neural networks. 
Finally, we hypothesize that there exist lower-bounds of total number of bits for representing parameters and connections with regard to performance metrics for a given optimization problem. Rather than focusing on improving the sole accuracy metric with more complex network architectures, we should also explore the trade-offs between accuracy and total number of representation bits when comparing different network architectures and implementations.", "keywords": "Deep learning;Neural networks;Computation Efficiency;Weight pruning;Overfitting;Softmax;Log likelihood ratio (LLR)", "primary_area": "", "supplementary_material": "", "author": "Yibin Liang;Yang Yi;Lingjia Liu", "authorids": "~Yibin_Liang1;~Yang_Yi2;~Lingjia_Liu1", "gender": ";F;M", "homepage": "https://www.mics.ece.vt.edu/People/Graduate/yibin-liang.html;https://www.yangyi.ece.vt.edu/;http://www.lingjialiu.ece.vt.edu", "dblp": ";;", "google_scholar": ";ceRq6DEAAAAJ;https://scholar.google.com.tw/citations?user=_d2M0xMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yibin_Liang1;~Yang_Yi2;~Lingjia_Liu1", "aff": "Virginia Tech;Virginia Tech;Virginia Tech", "aff_domain": "vt.edu;vt.edu;vt.edu", "position": "PhD student;Associate Professor;Associate Professor", "bibtex": "@misc{\nliang2022on,\ntitle={On the Efficiency of Deep Neural Networks},\nauthor={Yibin Liang and Yang Yi and Lingjia Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=TlPNpabaoV}\n}", "github": "", "project": "", "reviewers": "gGLe;Zr5D;dB3y;MC8n", "site": "https://openreview.net/forum?id=TlPNpabaoV", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;3;4;4", "correctness": "2;2;2;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "39;29;117;67", "wc_summary_review": "65;98;152;53", "wc_main_review": "290;510;535;171", "wc_review": "394;637;804;291", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "456;414;267;267", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 34.14674215792775 ], "wc_summary_review_avg": [ 92.0, 38.360135557633264 ], "wc_main_review_avg": [ 376.5, 152.19806174849927 ], "wc_review_avg": [ 531.5, 201.3287113155995 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 351.0, 85.30240324867758 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;0", "aff_unique_norm": "Virginia Tech", "aff_unique_dep": "", "aff_unique_url": "https://www.vt.edu", "aff_unique_abbr": "VT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Distributionally Robust Models at Scale via Composite Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6784", "id": "To-R742x7se", "poster": "", "openreview": "https://openreview.net/forum?id=To-R742x7se", "slides": "https://iclr.cc/virtual/2022/poster/6784", "video": "https://iclr.cc/virtual/2022/poster/6784", "author_site": "Farzin Haddadpour, Mohammad Mahdi 
Kamani, Mehrdad Mahdavi, Amin Karbasi", "tldr": "", "abstract": "To train machine learning models that are robust to distribution shifts in the data, distributionally robust optimization (DRO) has been proven very effective. However, the existing approaches to learning a distributionally robust model either require solving complex optimization problems such as semidefinite programming or a first-order method whose convergence scales linearly with the number of data samples-- which hinders their scalability to large datasets. In this paper, we show how different variants of DRO are simply instances of a finite-sum composite optimization for which we provide scalable methods. We also provide empirical results that demonstrate the effectiveness of our proposed algorithm with respect to the prior art in order to learn robust models from very large datasets. ", "keywords": "Composite Optimization;Distributionally Robust Optimization", "primary_area": "", "supplementary_material": "", "author": "Farzin Haddadpour;Mohammad Mahdi Kamani;Mehrdad Mahdavi;amin karbasi", "authorids": "~Farzin_Haddadpour1;~Mohammad_Mahdi_Kamani2;~Mehrdad_Mahdavi2;~amin_karbasi1", "gender": ";;M;M", "homepage": "https://sites.google.com/view/farzinhdp/home;https://mmkamani.com;http://www.cse.psu.edu/~mzm616/;http://seas.yale.edu/faculty-research/faculty-directory/amin-karbasi", "dblp": "https://dblp.org/pers/hd/h/Haddadpour:Farzin.html;194/7523.html;88/4321;49/7411", "google_scholar": "JdaQ0VMAAAAJ;jUXXvNIAAAAJ;HzxnwocAAAAJ;https://scholar.google.com.tw/citations?user=VusVB38AAAAJ", "orcid": ";0000-0003-3930-4151;;", "linkedin": "farzinhdp/;mm-kamani7/;;", "or_profile": "~Farzin_Haddadpour1;~Mohammad_Mahdi_Kamani2;~Mehrdad_Mahdavi2;~amin_karbasi1", "aff": ";Wyze Labs;Toyota Technological Institute at Chicago;Google", "aff_domain": ";wyze.com;ttic.edu;google.com", "position": ";Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nhaddadpour2022learning,\ntitle={Learning Distributionally Robust Models at Scale via Composite Optimization},\nauthor={Farzin Haddadpour and Mohammad Mahdi Kamani and Mehrdad Mahdavi and amin karbasi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=To-R742x7se}\n}", "github": "", "project": "", "reviewers": "Wug8;rHQv;Q3eH", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "2;2;4", "correctness": "4;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;4", "wc_summary_paper": "75;50;43", "wc_summary_review": "106;18;52", "wc_main_review": "57;138;122", "wc_review": "238;206;217", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "276;617;314", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 56.0, 13.73559851869101 ], "wc_summary_review_avg": [ 58.666666666666664, 36.23380864453651 ], "wc_main_review_avg": [ 105.66666666666667, 35.0269737329517 ], "wc_review_avg": [ 220.33333333333334, 13.274871834493252 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 402.3333333333333, 152.5829464768444 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.944911182523068, 
"corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14563806568575292485&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=To-R742x7se", "email": ";wyze.com;ttic.edu;google.com", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Wyze Labs;Toyota Technological Institute at Chicago;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://wyze.com;https://www.tti-chicago.org;https://www.google.com", "aff_unique_abbr": "Wyze;TTI Chicago;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Curriculum learning as a tool to uncover learning principles in the brain", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6678", "id": "TpJMvo0_pu-", "poster": "", "openreview": "https://openreview.net/forum?id=TpJMvo0_pu-", "slides": "https://iclr.cc/virtual/2022/poster/6678", "video": "https://iclr.cc/virtual/2022/poster/6678", "author_site": "Daniel R Kepple, Rainer Engelken, Kanaka Rajan", "tldr": "", "abstract": "We present a novel approach to use curricula to identify principles by which a system learns. Previous work in curriculum learning has focused on how curricula can be designed to improve learning of a model on particular tasks. We consider the inverse problem: what can a curriculum tell us about how a learning system acquired a task? Using recurrent neural networks (RNNs) and models of common experimental neuroscience tasks, we demonstrate that curricula can be used to differentiate learning principles using target-based and a representation-based loss functions as use cases. In particular, we compare the performance of RNNs using target-based learning rules versus those using representational learning rules on three different curricula in the context of two tasks. We show that the learned state-space trajectories of RNNs trained by these two learning rules under all curricula tested are indistinguishable. However, by comparing learning times during different curricula, we can disambiguate the learning rules and challenge traditional approaches of interrogating learning systems. Although all animals in neuroscience lab settings are trained by curriculum-based procedures called shaping, almost no behavioral or neural data are collected or published on the relative successes or training times under different curricula. Our results motivate the systematic collection and curation of data during shaping by demonstrating curriculum learning in RNNs as a tool to probe and differentiate learning principles used by biological systems, over conventional statistical analyses of learned state spaces.", "keywords": "curriculum learning;neuroscience", "primary_area": "", "supplementary_material": "", "author": "Daniel R. 
Kepple;Rainer Engelken;Kanaka Rajan", "authorids": "~Daniel_R._Kepple1;~Rainer_Engelken1;~Kanaka_Rajan1", "gender": ";M;F", "homepage": ";https://ctn.zuckermaninstitute.columbia.edu/people/rainer-engelken;https://www.rajanlab.com/", "dblp": "217/8270;312/6447;94/10452", "google_scholar": ";HvZqeGQAAAAJ;IC6n33kAAAAJ", "orcid": ";0000-0001-7118-2129;0000-0003-2749-2917", "linkedin": ";;rajankdr", "or_profile": "~Daniel_R._Kepple1;~Rainer_Engelken1;~Kanaka_Rajan1", "aff": ";Center for Theoretical Neuroscience, Columbia University;Icahn School of Medicine at Mount Sinai", "aff_domain": ";ctn.zuckermaninstitute.columbia.edu;mssm.edu", "position": ";Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nkepple2022curriculum,\ntitle={Curriculum learning as a tool to uncover learning principles in the brain },\nauthor={Daniel R. Kepple and Rainer Engelken and Kanaka Rajan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TpJMvo0_pu-}\n}", "github": "", "project": "", "reviewers": "wnW4;Nbow;cUgC;cCVe", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "90;72;123;54", "wc_summary_review": "21;64;101;71", "wc_main_review": "728;483;299;310", "wc_review": "839;619;523;435", "wc_reply_reviewers": "193;97;0;197", "wc_reply_authors": "901;619;121;666", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 84.75, 25.488968201949643 ], "wc_summary_review_avg": [ 64.25, 28.577744837547975 ], "wc_main_review_avg": [ 455.0, 173.69081725871405 ], "wc_review_avg": [ 604.0, 150.4759116935332 ], "wc_reply_reviewers_avg": [ 121.75, 80.89306212525274 ], "wc_reply_authors_avg": [ 576.75, 283.9880059087003 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9690834003086052426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=TpJMvo0_pu-", "email": ";ctn.zuckermaninstitute.columbia.edu;mssm.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Columbia University;Icahn School of Medicine at Mount Sinai", "aff_unique_dep": "Center for Theoretical Neuroscience;School of Medicine", "aff_unique_url": "https://www.columbia.edu;https://icahn.mssm.edu", "aff_unique_abbr": "Columbia;ISMMS", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Axiomatic Explanations for Visual Search, Retrieval, and Similarity Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6983", "id": "TqNsv1TuCX9", "poster": "", "openreview": "https://openreview.net/forum?id=TqNsv1TuCX9", "slides": "https://iclr.cc/virtual/2022/poster/6983", "video": "https://iclr.cc/virtual/2022/poster/6983", "author_site": "Mark Hamilton, Scott Lundberg, Stephanie Fu, Lei Zhang, William Freeman", "tldr": "", "abstract": "Visual 
search, recommendation, and contrastive similarity learning power technologies that impact billions of users worldwide. Modern model architectures can be complex and difficult to interpret, and there are several competing techniques one can use to explain a search engine's behavior. We show that the theory of fair credit assignment provides a unique axiomatic solution that generalizes several existing recommendation- and metric-explainability techniques in the literature. Using this formalism, we show when existing approaches violate \"fairness\" and derive methods that sidestep these shortcomings and naturally handle counterfactual information. More specifically, we show existing approaches implicitly approximate second-order Shapley-Taylor indices and extend CAM, GradCAM, LIME, SHAP, SBSM, and other methods to search engines. These extensions can extract pairwise correspondences between images from trained opaque-box models. We also introduce a fast kernel-based method for estimating Shapley-Taylor indices that require orders of magnitude fewer function evaluations to converge. Finally, we show that these game-theoretic measures yield more consistent explanations for image similarity architectures. ", "keywords": "Model Interpretability;Shapley Values;Search Engines;Information Retrieval;Visual Search;Similarity Learning;Metric Learning;Black-box explanations", "primary_area": "", "supplementary_material": "", "author": "Mark Hamilton;Scott Lundberg;Stephanie Fu;Lei Zhang;William T. Freeman", "authorids": "~Mark_Hamilton1;~Scott_Lundberg1;~Stephanie_Fu1;~Lei_Zhang23;~William_T._Freeman1", "gender": "M;;F;M;M", "homepage": "https://mhamilton.net;http://scottlundberg.com;https://stephanie-fu.github.io/;https://billf.mit.edu/;https://www.leizhang.org/", "dblp": "91/631;;270/1541;86/6650;z/LeiZhang", "google_scholar": "kgZtMGsAAAAJ;ESRugcEAAAAJ;Rx-h05AAAAAJ;https://scholar.google.com.tw/citations?user=0zZnyMEAAAAJ;fIlGZToAAAAJ", "orcid": ";0000-0001-6280-0941;0000-0001-6591-6026;;", "linkedin": ";;stephanie-fu/;;", "or_profile": "~Mark_Hamilton1;~Scott_Lundberg1;~Stephanie_Fu1;~William_T._Freeman1;~Lei_Zhang1", "aff": "Massachusetts Institute of Technology;Microsoft;Massachusetts Institute of Technology;Massachusetts Institute of Technology;International Digital Economy Academy", "aff_domain": "mit.edu;microsoft.com;mit.edu;mit.edu;idea.edu.cn", "position": "PhD student;Researcher;Undergrad student;Professor;Chief Scientist", "bibtex": "@inproceedings{\nhamilton2022axiomatic,\ntitle={Axiomatic Explanations for Visual Search, Retrieval, and Similarity Learning},\nauthor={Mark Hamilton and Scott Lundberg and Stephanie Fu and Lei Zhang and William T. 
Freeman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TqNsv1TuCX9}\n}", "github": "", "project": "", "reviewers": "erSL;T6UD;EVSE", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;5;2", "correctness": "3;3;4", "technical_novelty": "4;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "75;64;56", "wc_summary_review": "42;110;14", "wc_main_review": "489;604;120", "wc_review": "606;778;190", "wc_reply_reviewers": "387;0;0", "wc_reply_authors": "1254;726;599", "reply_reviewers": "2;0;0", "reply_authors": "2;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 65.0, 7.788880963698615 ], "wc_summary_review_avg": [ 55.333333333333336, 40.30991055421593 ], "wc_main_review_avg": [ 404.3333333333333, 206.46280267614526 ], "wc_review_avg": [ 524.6666666666666, 246.84318008718725 ], "wc_reply_reviewers_avg": [ 129.0, 182.43354954612926 ], "wc_reply_authors_avg": [ 859.6666666666666, 283.61515395964926 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8304938842901557841&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=TqNsv1TuCX9", "email": "mit.edu;microsoft.com;mit.edu;mit.edu;idea.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft;International Digital Economy Academy", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com;", "aff_unique_abbr": "MIT;Microsoft;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "title": "Memorizing Transformers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6064", "id": "TrjbxzRcnf-", "poster": "", "openreview": "https://openreview.net/forum?id=TrjbxzRcnf-", "slides": "https://iclr.cc/virtual/2022/poster/6064", "video": "https://iclr.cc/virtual/2022/poster/6064", "author_site": "Yuhuai Wu, Markus Rabe, DeLesley Hutchins, Christian Szegedy", "tldr": "", "abstract": "Language models typically need to be trained or finetuned in order to acquire new knowledge, which involves updating their weights. \nWe instead envision language models that can simply read and memorize new data at inference time, thus acquiring new knowledge immediately. In this work, we extend language models with the ability to memorize the internal representations of past inputs. We demonstrate that an approximate $k$NN lookup into a non-differentiable memory of recent (key, value) pairs improves language modeling across various benchmarks and tasks, including generic webtext (C4), math papers (arXiv), books (PG-19), code (Github), as well as formal theorems (Isabelle). We show that the performance steadily improves when we increase the size of memory up to 262K tokens. 
\nOn benchmarks including code and mathematics, we find that the model is capable of making use of newly defined functions and theorems during test time.", "keywords": "Transformer;architecture;memorization.", "primary_area": "", "supplementary_material": "", "author": "Yuhuai Wu;Markus Norman Rabe;DeLesley Hutchins;Christian Szegedy", "authorids": "~Yuhuai_Wu1;~Markus_Norman_Rabe1;~DeLesley_Hutchins1;~Christian_Szegedy1", "gender": "M;M;M;", "homepage": "http://www.cs.toronto.edu/~ywu/;https://people.eecs.berkeley.edu/~rabe/;;", "dblp": ";88/1112-2;93/5511.html;78/1537", "google_scholar": "https://scholar.google.ca/citations?user=bOQGfFIAAAAJ;https://scholar.google.com/citations?hl=en;C6CJkqcAAAAJ;3QeF7mAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yuhuai_Wu1;~Markus_Norman_Rabe1;~DeLesley_Hutchins1;~Christian_Szegedy1", "aff": "Stanford University;Google;Google;Google", "aff_domain": "stanford.edu;google.com;google.com;google.com", "position": "Postdoc;Researcher/Software Engineer;Senior Software Engineer;Research Scientist", "bibtex": "@inproceedings{\nwu2022memorizing,\ntitle={Memorizing Transformers},\nauthor={Yuhuai Wu and Markus Norman Rabe and DeLesley Hutchins and Christian Szegedy},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TrjbxzRcnf-}\n}", "github": "", "project": "", "reviewers": "eMDi;zRRc;gVad;75xx", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;5", "correctness": "3;4;4;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "213;67;71;262", "wc_summary_review": "376;83;30;97", "wc_main_review": "272;79;356;963", "wc_review": "861;229;457;1322", "wc_reply_reviewers": "167;0;13;65", "wc_reply_authors": "952;86;419;497", "reply_reviewers": "1;0;1;1", "reply_authors": "3;1;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 153.25, 86.02434248513615 ], "wc_summary_review_avg": [ 146.5, 134.837865601618 ], "wc_main_review_avg": [ 417.5, 330.56958420278175 ], "wc_review_avg": [ 717.25, 416.08435142408325 ], "wc_reply_reviewers_avg": [ 61.25, 65.72052571305254 ], "wc_reply_authors_avg": [ 488.5, 308.92596200384327 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": -0.2294157338705618, "gs_citation": 302, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12149100013599717090&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=TrjbxzRcnf-", "email": "stanford.edu;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "TscS0R8QzfG", "title": "PDAML: A Pseudo Domain Adaptation Paradigm for Subject-independent EEG-based Emotion Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Domain adaptation 
(DA) and domain generalization (DG) methods have been successfully adopted to alleviate the domain shift problem caused by the subject variability of EEG signals in subject-independent affective brain-computer interfaces (aBCIs). Usually, the DA methods give more promising results than the DG methods but require additional computation resources each time a new subject arrives. In this paper, we first propose a new paradigm called Pseudo Domain Adaptation (PDA), which is more suitable for subject-independent aBCIs. Then we propose the pseudo domain adaptation via meta-learning (PDAML) based on PDA. The PDAML consists of a feature extractor, a classifier, and a sum-decomposable structure called domain shift governor. We prove that a network with a sum-decomposable structure can compute the divergence between different domains effectively in theory. By taking advantage of adversarial learning and meta-learning, the governor helps PDAML quickly generalize to a new domain using the target data through a few self-adaptation steps in the test phase. Experimental results on the public aBCIs dataset demonstrate that our proposed method not only avoids the additional computation resources of the DA methods but also reaches a generalization performance similar to that of the state-of-the-art DA methods.", "keywords": "aBCIs;EEG-based emotion recognition;domain adaptation;domain generalization;meta-learning;adversarial learning", "primary_area": "", "supplementary_material": "/attachment/2df2b6850acd3a1ca6ecc14fa2ef8dc93902c3d0.zip", "author": "Yun Luo;Gengchen Wei;Bao-liang Lu", "authorids": "~Yun_Luo3;~Gengchen_Wei1;~Bao-liang_Lu1", "gender": "M;;M", "homepage": "https://bcmi.sjtu.edu.cn/home/luoyun/;https://wei-gongzi.github.io/;http://bcmi.sjtu.edu.cn/~blu/", "dblp": ";;09/3116.html", "google_scholar": ";;https://scholar.google.com.tw/citations?user=709il6EAAAAJ", "orcid": ";;0000-0001-8359-0058", "linkedin": ";;", "or_profile": "~Yun_Luo3;~Gengchen_Wei1;~Bao-liang_Lu1", "aff": "Shanghai Jiaotong University;Fudan University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;fudan.edu.cn;sjtu.edu.cn", "position": "PhD student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nluo2022pdaml,\ntitle={{PDAML}: A Pseudo Domain Adaptation Paradigm for Subject-independent {EEG}-based Emotion Recognition},\nauthor={Yun Luo and Gengchen Wei and Bao-liang Lu},\nbooktitle={Submitted to The Tenth International Conference on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=TscS0R8QzfG},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "cXdg;tCfS;eiC6;dNMt", "site": "https://openreview.net/forum?id=TscS0R8QzfG", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;5;5", "correctness": "2;3;3;3", "technical_novelty": "3;1;2;2", "empirical_novelty": "2;1;3;2", "wc_summary_paper": "39;60;74;170", "wc_summary_review": "24;1;72;68", "wc_main_review": "379;134;232;101", "wc_review": "442;195;378;339", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 85.75, 50.21142798208392 ], "wc_summary_review_avg": [ 41.25, 29.911327285829362 ], "wc_main_review_avg": [ 211.5, 108.04281558715508 ], "wc_review_avg": [
338.5, 90.6435325878245 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16541275286769026697&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Fudan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.fudan.edu.cn", "aff_unique_abbr": "SJTU;Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "Tu6SpFYWTA", "title": "Antonymy-Synonymy Discrimination through the Repelling Parasiamese Neural Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "Antonymic and synonymic pairs may both occur nearby in word embedding spaces because they have similar distributional information. Different methods have been used in order to distinguish antonyms from synonyms, making the antonymy-synonymy discrimination a popular NLP task. In this work, we propose the repelling parasiamese neural network, a model which considers a siamese network for synonymy and a parasiamese network for antonymy, both sharing the same base network. Relying on the antagonism between synonymy and antonymy, the model attempts to repel siamese and parasiamese outputs, making use of contrastive loss functions. We experimentally show that the repelling parasiamese network achieves state-of-the-art results on this task.", "keywords": "antitransitivity;parasiamese network;antonymy-synonymy discrimination", "primary_area": "", "supplementary_material": "", "author": "Mathias Etcheverry;Dina Wonsever", "authorids": "~Mathias_Etcheverry1;~Dina_Wonsever1", "gender": "M;F", "homepage": "https://www.fing.edu.uy/~mathiase/;", "dblp": ";83/5755", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Mathias_Etcheverry1;~Dina_Wonsever1", "aff": "Facultad de Ingenier\u00eda;Facultad de Ingenier\u00eda", "aff_domain": "fing.edu.uy;fing.edu.uy", "position": "PhD student;Full Professor", "bibtex": "@misc{\netcheverry2022antonymysynonymy,\ntitle={Antonymy-Synonymy Discrimination through the Repelling Parasiamese Neural Network},\nauthor={Mathias Etcheverry and Dina Wonsever},\nyear={2022},\nurl={https://openreview.net/forum?id=Tu6SpFYWTA}\n}", "github": "", "project": "", "reviewers": "hoTU;3PGk;w5dj", "site": "https://openreview.net/forum?id=Tu6SpFYWTA", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;3;3", "correctness": "3;4;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;3;2", "wc_summary_paper": "79;86;55", "wc_summary_review": "28;34;27", "wc_main_review": "287;89;281", "wc_review": "394;209;363", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "14;113;231", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 73.33333333333333, 13.27487183449325 ], "wc_summary_review_avg": [ 29.666666666666668, 3.0912061651652345 ],
"wc_main_review_avg": [ 219.0, 91.9565114605812 ], "wc_review_avg": [ 322.0, 80.89911412782384 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 119.33333333333333, 88.70300007453086 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Cg0SZXTjtVIJ:scholar.google.com/&scioq=Antonymy-Synonymy+Discrimination+through+the+Repelling+Parasiamese+Neural+Network&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Facultad de Ingenier\u00eda", "aff_unique_dep": "Ingenier\u00eda", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "id": "TuR3pmKgERp", "title": "Hyperspherical embedding for novel class classification", "track": "main", "status": "Reject", "tldr": "", "abstract": " Deep neural networks proved to be useful to learn representations and perform classification on many different modalities of data. Traditional approaches work well on the closed set problem. For learning tasks involving novel classes, known as the open set problem, the metric learning approach has been proposed. However, while promising, common metric learning approaches require pairwise learning, which significantly increases training cost while adding additional challenges. In this paper we present a method in which the similarity of samples projected onto a feature space is enforced by a metric learning approach without requiring\npairwise evaluation. We compare our approach against known methods in different datasets, achieving results up to $81\\%$ more accurate.", "keywords": "Metric Learning;open set;deep learning", "primary_area": "", "supplementary_material": "", "author": "Rafael S. Pereira;alexis joly;Patrick Valduriez;F\u00e1bio Porto", "authorids": "~Rafael_S._Pereira1;~alexis_joly1;~Patrick_Valduriez1;~F\u00e1bio_Porto1", "gender": "M;M;M;", "homepage": "http://dexl.lncc.br/rafael-silva-pereira;http://www-sop.inria.fr/members/Alexis.Joly/wiki/pmwiki.php;http://dexl.lncc.br/fabio-porto;http://www-sop.inria.fr/members/Patrick.Valduriez/", "dblp": ";http://dblp.uni-trier.de/pers/hd/j/Joly:Alexis;p/FabioPorto.html;", "google_scholar": ";https://scholar.google.fr/citations?user=kbpkTGgAAAAJ;https://scholar.google.com/citations?hl=pt-BR;", "orcid": ";0000-0002-2161-9940;0000-0002-4597-4832;", "linkedin": ";;;", "or_profile": "~Rafael_S._Pereira1;~alexis_joly1;~F\u00e1bio_Porto1;~Patrick_Valduriez2", "aff": "Laboratorio Nacional De Computa\u00e7\u00e3o Cientifica;;Laboratorio Nacional de Computa\u00e7\u00e3o Cient\u00edfica;INRIA", "aff_domain": "lncc.br;;lncc.br;inria.fr", "position": "Researcher;;Researcher;Principal Researcher", "bibtex": "@misc{\npereira2022hyperspherical,\ntitle={Hyperspherical embedding for novel class classification},\nauthor={Rafael S. 
Pereira and alexis joly and Patrick Valduriez and F{\'a}bio Porto},\nyear={2022},\nurl={https://openreview.net/forum?id=TuR3pmKgERp}\n}", "github": "", "project": "", "reviewers": "AHvJ;Tp5p;t3Uk;qMdp", "site": "https://openreview.net/forum?id=TuR3pmKgERp", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;3", "correctness": "3;3;2;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "66;113;27;55", "wc_summary_review": "44;40;8;113", "wc_main_review": "411;373;106;495", "wc_review": "521;526;141;663", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.25, 31.01914731258743 ], "wc_summary_review_avg": [ 51.25, 38.284298348017295 ], "wc_main_review_avg": [ 346.25, 145.5633453174253 ], "wc_review_avg": [ 462.75, 194.30436819587973 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4397848882327306103&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Laboratorio Nacional de Computa\u00e7\u00e3o Cient\u00edfica;INRIA", "aff_unique_dep": ";", "aff_unique_url": "http://www.lncc.br;https://www.inria.fr", "aff_unique_abbr": "LNCC;INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Brazil;France" }, { "id": "Tubzedlc4P", "title": "A Statistical Manifold Framework for Point Cloud Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "A large class of problems in machine learning involves data sets in which each data point is a point cloud in $\\mathbb{R}^D$. The reason that most machine learning algorithms designed for point cloud data tend to be ad hoc, and that their performance is difficult to measure in a uniform and quantitative way, can be traced to the lack of a rigorous mathematical characterization of this space of point cloud data. The primary contribution of this paper is a Riemannian geometric structure for point cloud data. By interpreting the point cloud data as a set of samples from some underlying probability distribution, the set of point cloud data can be given the structure of a statistical manifold, with the Fisher information metric acting as a natural Riemannian metric; this structure then leads to, e.g., distance metrics, volume forms, and other coordinate-invariant, geometrically well-defined measures needed for applications. The only requirement on the part of the user is the choice of a meaningful underlying probability distribution, which is more intuitive and natural to make than what is required in existing ad hoc formulations. Two autoencoder case studies involving point cloud data are presented to demonstrate the advantages of our statistical manifold framework: (i) interpolating between two 3D point cloud data sets to smoothly deform one object into another; (ii) transforming the latent coordinates into another with less distortion.
Experiments with synthetic and large-scale standard benchmark point cloud data show more natural and intuitive shape evolutions, and improved classification accuracy for linear SVM vis-\\`{a}-vis existing methods.", "keywords": "Riemannian Geometry;Point Cloud;Autoencoders", "primary_area": "", "supplementary_material": "", "author": "Yonghyeon Lee;Seungyeon Kim;Jinwon Choi;Frank C. Park", "authorids": "~Yonghyeon_Lee2;~Seungyeon_Kim2;~Jinwon_Choi1;~Frank_C._Park1", "gender": "M;M;M;M", "homepage": "https://www.gabe-yhlee.com;https://seungyeon-k.github.io/;;http://robotics.snu.ac.kr", "dblp": "182/6796;74/7997-3;;p/FrankChongwooPark", "google_scholar": ";https://scholar.google.com/citations?hl=en;;u-h3PJIAAAAJ", "orcid": ";0000-0001-6708-5684;;0000-0002-0293-6975", "linkedin": ";seungyeon-kim-45a20b263/;choi-jinwon-73033b1ab/;", "or_profile": "~Yonghyeon_Lee2;~Seungyeon_Kim2;~Jinwon_Choi1;~Frank_C._Park1", "aff": "Seoul National University;Seoul National University;;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;;snu.ac.kr", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@misc{\nlee2022a,\ntitle={A Statistical Manifold Framework for Point Cloud Data},\nauthor={Yonghyeon Lee and Seungyeon Kim and Jinwon Choi and Frank C. Park},\nyear={2022},\nurl={https://openreview.net/forum?id=Tubzedlc4P}\n}", "github": "", "project": "", "reviewers": "21tj;yY6L;PyPd;FLA8", "site": "https://openreview.net/forum?id=Tubzedlc4P", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;4;3;3", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "229;52;130;90", "wc_summary_review": "101;45;95;13", "wc_main_review": "414;616;464;224", "wc_review": "744;713;689;327", "wc_reply_reviewers": "0;0;0;27", "wc_reply_authors": "1377;2329;893;1017", "reply_reviewers": "0;0;0;1", "reply_authors": "2;4;2;3", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 125.25, 65.9445790038878 ], "wc_summary_review_avg": [ 63.5, 36.36963018783666 ], "wc_main_review_avg": [ 429.5, 140.03838759425932 ], "wc_review_avg": [ 618.25, 169.2799087310718 ], "wc_reply_reviewers_avg": [ 6.75, 11.691342951089922 ], "wc_reply_authors_avg": [ 1404.0, 562.8596627934888 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9198662110077999, "corr_recommendation_correctness": -0.16012815380508713, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10640933730705692613&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "TvMrYbWpa7", "title": "Instance-Adaptive Video Compression: Improving Neural Codecs by Training on the Test Set", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a video compression algorithm based on instance-adaptive learning. On each video sequence to be transmitted, we finetune a pretrained compression model. 
The optimal parameters are transmitted to the receiver along with the latent code. By entropy-coding the parameter updates under a suitable mixture model prior, we ensure that the network parameters can be encoded efficiently. This instance-adaptive compression algorithm is agnostic about the choice of base model and has the potential to improve any neural video codec. On UVG, HEVC, and Xiph datasets, our codec improves the performance of a low-latency scale-space flow model by between 24% and 26% BD-rate savings, and that of a state-of-the-art B-frame model by 17 to 20% BD-rate savings. We also demonstrate that instance-adaptive finetuning improves the robustness to domain shift. Finally, our approach reduces the capacity requirements on compression models. We show that it enables a state-of-the-art performance even after reducing the network size by 72%.", "keywords": "Instance-Adaptive Video Compression: Improving Neural Codecs by Training on the Test Set", "primary_area": "", "supplementary_material": "/attachment/30b8b6b762ff4801dbaee558591a631820feccc3.zip", "author": "Ties van Rozendaal;Johann Brehmer;Yunfan Zhang;Reza Pourreza;Taco Cohen", "authorids": "~Ties_van_Rozendaal1;~Johann_Brehmer1;~Yunfan_Zhang1;~Reza_Pourreza1;~Taco_Cohen1", "gender": "M;M;M;;M", "homepage": "http://tivaro.nl;https://johannbrehmer.github.io;;;http://www.ta.co.nl", "dblp": "247/1149.html;220/5763;;;142/2903", "google_scholar": "3S-DgGcAAAAJ;ZdUMvCsAAAAJ;MbFH6kIAAAAJ;;a3q4YxEAAAAJ", "orcid": ";0000-0003-3344-4209;;;", "linkedin": "tivaro;johannbrehmer;yunfan-gerry-zhang-8290a792/;;", "or_profile": "~Ties_van_Rozendaal1;~Johann_Brehmer1;~Yunfan_Zhang1;~Reza_Pourreza1;~Taco_Cohen1", "aff": "Qualcomm AI Research;Qualcomm AI Research;;;Qualcomm Inc, QualComm", "aff_domain": "qualcomm.com;qualcomm.com;;;qti.qualcomm.com", "position": "Senior Researcher;Researcher;;;Principal Researcher", "bibtex": "@misc{\nrozendaal2022instanceadaptive,\ntitle={Instance-Adaptive Video Compression: Improving Neural Codecs by Training on the Test Set},\nauthor={Ties van Rozendaal and Johann Brehmer and Yunfan Zhang and Reza Pourreza and Taco Cohen},\nyear={2022},\nurl={https://openreview.net/forum?id=TvMrYbWpa7}\n}", "github": "", "project": "", "reviewers": "8iiZ;TVKU;pT9R;TzFr", "site": "https://openreview.net/forum?id=TvMrYbWpa7", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;4;4;4", "correctness": "4;3;4;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "74;33;151;48", "wc_summary_review": "29;21;120;39", "wc_main_review": "214;163;300;392", "wc_review": "317;217;571;479", "wc_reply_reviewers": "0;0;61;95", "wc_reply_authors": "810;789;782;635", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.5, 45.445021729558015 ], "wc_summary_review_avg": [ 52.25, 39.63190003015248 ], "wc_main_review_avg": [ 267.25, 87.09011137896196 ], "wc_review_avg": [ 396.0, 137.6553667678816 ], "wc_reply_reviewers_avg": [ 39.0, 40.81053785482372 ], "wc_reply_authors_avg": [ 754.0, 69.47301634447723 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 25, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=6986426152386403343&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Qualcomm;Qualcomm Incorporated", "aff_unique_dep": "Qualcomm AI Research;", "aff_unique_url": "https://www.qualcomm.com/research;https://www.qualcomm.com", "aff_unique_abbr": "QAI;Qualcomm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "TxIXgcP3yp-", "title": "Decouple and Reconstruct: Mining Discriminative Features for Cross-domain Object Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "In recent years, a great progress has been witnessed for cross-domain object detection. Most state-of-the-art methods strive to handle the relation between local regions by calibrating cross-channel and spatial information to enable better alignment. They succeed in improving the generalization of the model, but implicitly drive networks to pay more attention on the shared attributes and ignore the domain-specific feature, which limits the performance of the algorithm. In order to search for the equilibrium between transferability and discriminability, we propose a novel adaptation framework for cross-domain object detection. Specifically, we adopt a style-aware feature fusion method and design two plug-and-play feature component regularization modules, which repositions the focus of the model on domain-specific features by restructuring the style and content of features. Our key insight is that while it is difficult to extract discriminative features in target domain, it is feasible to assign the underlying details to the model via feature style transfer. Without bells and whistles, our method significantly boosts the performance of existing Domain Adaptive Faster R-CNN detectors, and achieves state-of-the-art results on several benchmark datasets for cross-domain object detection.", "keywords": "domain adaptation;object detection;discriminative feature mining", "primary_area": "", "supplementary_material": "", "author": "Jiawei Wang;Konghuai Shen;Shao Ming;Jun Yin;Ming Liu", "authorids": "~Jiawei_Wang4;~Konghuai_Shen1;~Shao_Ming1;~Jun_Yin8;~Ming_Liu14", "gender": ";M;M;;", "homepage": "https://github.com/dakun09;https://github.com/abesft;https://github.com/shaoming20798;https://github.com/liucw94;https://github.com/dakun09", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Jiawei_Wang4;~Konghuai_Shen1;~Shao_Ming1;~Jun_Yin8;~Ming_Liu14", "aff": ";;Zhejiang Dahua Tech;ZHEJIANGDAHUATECHNOLOGYCO.,LTD;", "aff_domain": ";;dahuatech.com;dahuatech.com;", "position": ";;software engineer;Software enginner;", "bibtex": "@misc{\nwang2022decouple,\ntitle={Decouple and Reconstruct: Mining Discriminative Features for Cross-domain Object Detection},\nauthor={Jiawei Wang and Konghuai Shen and Shao Ming and Jun Yin and Ming Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=TxIXgcP3yp-}\n}", "github": "", "project": "", "reviewers": "1yU8;VfW6;FLN9;5g9D", "site": "https://openreview.net/forum?id=TxIXgcP3yp-", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;3;4;2", "correctness": "4;2;2;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "4;2;2;2", "wc_summary_paper": "63;47;87;71", "wc_summary_review": "17;64;102;65", "wc_main_review": "326;154;648;414", "wc_review": "406;265;837;550", "wc_reply_reviewers": "0;0;82;0", "wc_reply_authors": "0;321;0;0", 
"reply_reviewers": "0;0;1;0", "reply_authors": "0;1;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 67.0, 14.422205101855956 ], "wc_summary_review_avg": [ 62.0, 30.157917700000443 ], "wc_main_review_avg": [ 385.5, 178.08074011526344 ], "wc_review_avg": [ 514.5, 211.71265904522573 ], "wc_reply_reviewers_avg": [ 20.5, 35.50704155516198 ], "wc_reply_authors_avg": [ 80.25, 138.9970773074024 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Fi67xW9sKNkJ:scholar.google.com/&scioq=Decouple+and+Reconstruct:+Mining+Discriminative+Features+for+Cross-domain+Object+Detection&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Zhejiang Dahua Technology Co., Ltd.;Zhejiang Dahu\u0430 Technology Co., Ltd.", "aff_unique_dep": ";", "aff_unique_url": "http://www.dahuatech.com;", "aff_unique_abbr": "Dahua Tech;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Backdoor Defense via Decoupling the Training Process", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6519", "id": "TySnJ-0RdKI", "poster": "", "openreview": "https://openreview.net/forum?id=TySnJ-0RdKI", "slides": "https://iclr.cc/virtual/2022/poster/6519", "video": "https://iclr.cc/virtual/2022/poster/6519", "author_site": "Kunzhe Huang, Yiming Li, Baoyuan Wu, Zhan Qin, Kui Ren", "tldr": "", "abstract": "Recent studies have revealed that deep neural networks (DNNs) are vulnerable to backdoor attacks, where attackers embed hidden backdoors in the DNN model by poisoning a few training samples. The attacked model behaves normally on benign samples, whereas its prediction will be maliciously changed when the backdoor is activated. We reveal that poisoned samples tend to cluster together in the feature space of the attacked DNN model, which is mostly due to the end-to-end supervised training paradigm. Inspired by this observation, we propose a novel backdoor defense via decoupling the original end-to-end training process into three stages. Specifically, we first learn the backbone of a DNN model via \\emph{self-supervised learning} based on training samples without their labels. The learned backbone will map samples with the same ground-truth label to similar locations in the feature space. Then, we freeze the parameters of the learned backbone and train the remaining fully connected layers via standard training with all (labeled) training samples. Lastly, to further alleviate side-effects of poisoned samples in the second stage, we remove labels of some `low-credible' samples determined based on the learned model and conduct a \\emph{semi-supervised fine-tuning} of the whole model. Extensive experiments on multiple benchmark datasets and DNN models verify that the proposed defense is effective in reducing backdoor threats while preserving high accuracy in predicting benign samples. 
Our code is available at \\url{https://github.com/SCLBD/DBD}.", "keywords": "Backdoor Defense;Backdoor Learning", "primary_area": "", "supplementary_material": "", "author": "Kunzhe Huang;Yiming Li;Baoyuan Wu;Zhan Qin;Kui Ren", "authorids": "hkunzhe@zju.edu.cn;~Yiming_Li1;~Baoyuan_Wu1;qinzhan@zju.edu.cn;kuiren@zju.edu.cn", "gender": ";M;M;;", "homepage": ";http://liyiming.tech;https://sites.google.com/site/baoyuanwu2015/;;", "dblp": ";l/YimingLi-4;73/7781;;", "google_scholar": ";mSW7kU8AAAAJ;JNTG1KoAAAAJ;;", "orcid": ";0000-0002-2258-265X;0000-0003-2183-5990;;", "linkedin": ";yiming-li-thu/;;;", "or_profile": "hkunzhe@zju.edu.cn;~Yiming_Li1;~Baoyuan_Wu1;qinzhan@zju.edu.cn;kuiren@zju.edu.cn", "aff": ";Tsinghua University;The Chinese University of Hong Kong, Shenzhen;;", "aff_domain": ";mails.tsinghua.edu.cn;cuhk.edu.cn;;", "position": ";PhD student;Associate Professor;;", "bibtex": "@inproceedings{\nhuang2022backdoor,\ntitle={Backdoor Defense via Decoupling the Training Process},\nauthor={Kunzhe Huang and Yiming Li and Baoyuan Wu and Zhan Qin and Kui Ren},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=TySnJ-0RdKI}\n}", "github": "", "project": "", "reviewers": "MY71;h7wA;RsZj;G82o", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "32;68;106;70", "wc_summary_review": "33;16;85;9", "wc_main_review": "192;146;318;344", "wc_review": "257;230;509;423", "wc_reply_reviewers": "0;177;66;0", "wc_reply_authors": "1375;1834;1567;1113", "reply_reviewers": "0;1;1;0", "reply_authors": "3;4;3;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.0, 26.1725046566048 ], "wc_summary_review_avg": [ 35.75, 29.74369681125734 ], "wc_main_review_avg": [ 250.0, 83.12640976248163 ], "wc_review_avg": [ 354.75, 115.72461924759139 ], "wc_reply_reviewers_avg": [ 60.75, 72.32349203405488 ], "wc_reply_authors_avg": [ 1472.25, 263.79857372624286 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 233, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11519386362177505857&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=TySnJ-0RdKI", "email": ";mails.tsinghua.edu.cn;cuhk.edu.cn;;", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Tsinghua University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.cuhk.edu.cn", "aff_unique_abbr": "THU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "TytZk4tWO5", "title": "Reference-Limited Compositional Learning: A Realistic Assessment for Human-level Compositional Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "To narrow the considerable gap between artificial and human intelligence, we propose a new task, namely reference-limited compositional learning (RLCL), which reproduces three core challenges to mimic 
human perception: compositional learning, few-shot, and few referential compositions. Building upon the setting, we propose two benchmarks that consist of multiple datasets with diverse compositional labels, providing a suitable and realistic platform for systematically assessing progress on the task. Moreover, we extend popular few-shot and compositional learning approaches to serve as baselines, and also introduce a simple method that achieves better performance in recognizing unseen compositions. Extensive experiments demonstrate that existing solutions struggle with the challenges imposed by the RLCL task, revealing substantial research space for pursuing human-level compositional generalization ability.", "keywords": "compositional learning;few-shot;few referential compositions", "primary_area": "", "supplementary_material": "", "author": "Siteng Huang;Qiyao Wei;Donglin Wang", "authorids": "~Siteng_Huang1;~Qiyao_Wei1;~Donglin_Wang1", "gender": ";M;M", "homepage": "https://kyonhuang.top/;https://qiyaowei.github.io;https://milab.westlake.edu.cn/", "dblp": "251/9544.html;327/3121;", "google_scholar": "mhpkWSYAAAAJ;;https://scholar.google.ca/citations?user=-fo6wdwAAAAJ", "orcid": "0000-0002-9735-1186;;0000-0002-8188-3735", "linkedin": ";qiyaowei;", "or_profile": "~Siteng_Huang1;~Qiyao_Wei1;~Donglin_Wang1", "aff": "Westlake University;;Westlake University", "aff_domain": "westlake.edu.cn;;westlake.edu.cn", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\nhuang2022referencelimited,\ntitle={Reference-Limited Compositional Learning: A Realistic Assessment for Human-level Compositional Generalization},\nauthor={Siteng Huang and Qiyao Wei and Donglin Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=TytZk4tWO5}\n}", "github": "", "project": "", "reviewers": "nsvK;nYmZ;mn6T;eic1", "site": "https://openreview.net/forum?id=TytZk4tWO5", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;3;4", "correctness": "3;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "178;91;58;53", "wc_summary_review": "85;50;38;42", "wc_main_review": "145;408;119;230", "wc_review": "408;549;215;325", "wc_reply_reviewers": "87;255;0;0", "wc_reply_authors": "778;630;578;787", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.0, 50.094909921068826 ], "wc_summary_review_avg": [ 53.75, 18.552290963651902 ], "wc_main_review_avg": [ 225.5, 113.08072337936294 ], "wc_review_avg": [ 374.25, 121.92492567149672 ], "wc_reply_reviewers_avg": [ 85.5, 104.10691619676379 ], "wc_reply_authors_avg": [ 693.25, 91.17942476238814 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:t-_ltgGKRuAJ:scholar.google.com/&scioq=Reference-Limited+Compositional+Learning:+A+Realistic+Assessment+for+Human-level+Compositional+Generalization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Westlake University", "aff_unique_dep": "", "aff_unique_url": "https://www.westlake.edu.cn", "aff_unique_abbr": "WU", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "U-GB_gONqbo", "title": "Scalable Hierarchical Embeddings of Complex Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph representation learning has become important in order to understand and predict intrinsic structures in complex networks. A variety of embedding methods has in recent years been developed including the Latent Distance Modeling (LDM) approach. A major challenge is scaling network embedding approaches to very large networks and a drawback of LDM is the computational cost invoked evaluating the full likelihood having O(N^2) complexity, making such analysis of large networks infeasible. We propose a novel multiscale hierarchical estimate of the full likelihood of LDMs providing high details where the likelihood approximation is most important while scaling in complexity at O(NlogN). The approach relies on a clustering procedure approximating the Euclidean norm of every node pair according to the multiscale hierarchical structure imposed. We demonstrate the accuracy of our approximation and for the first time embed very large networks in the order of a million nodes using LDM and contrast the predictive performance to prominent scalable graph embedding approaches. We find that our approach significantly outperforms these existing scalable approaches in the ability to perform link prediction, node clustering, and classification utilizing a surprisingly low embedding dimensionality of two to three dimensions whereas the extracted hierarchical structure facilitates network visualization and interpretation. The developed scalable hierarchical embedding approach enables accurate low dimensional representations of very large networks providing detailed visualizations that can further our understanding of their properties and structure.", "keywords": "Graph Representation Learning;Latent Space Model;Complex Networks;Scalable Network embeddings;Link prediction;Low dimension graph representations", "primary_area": "", "supplementary_material": "/attachment/08fe6542fb5f1082aee145e0836a8aeec83052e2.zip", "author": "Nikolaos Nakis;Abdulkadir CELIKKANAT;Sune Lehmann;Morten M\u00f8rup", "authorids": "~Nikolaos_Nakis1;~Abdulkadir_CELIKKANAT1;~Sune_Lehmann1;~Morten_M\u00f8rup1", "gender": ";M;M;M", "homepage": "https://nicknakis.github.io/nnaknik/;https://abdcelikkanat.github.io;https://sunelehmann.com;http://www.compute.dtu.dk/~mmor", "dblp": "318/2791.html;228/7764;33/7479;69/1866", "google_scholar": "https://scholar.google.com/citations?hl=en;;wvkUbiUAAAAJ;RQonsgMAAAAJ", "orcid": ";;0000-0001-6099-2345;0000-0003-4985-4368", "linkedin": "nikolaos-nakis-67a07a147/;;sunelehmann/;", "or_profile": "~Nikolaos_Nakis1;~Abdulkadir_CELIKKANAT1;~Sune_Lehmann1;~Morten_M\u00f8rup1", "aff": ";Technical University of Denmark;Technical University of Denmark;Technical University of Denmark", "aff_domain": ";dtu.dk;dtu.dk;dtu.dk", "position": ";Postdoc;Professor;Professor", "bibtex": "@misc{\nnakis2022scalable,\ntitle={Scalable Hierarchical Embeddings of Complex Networks},\nauthor={Nikolaos Nakis and Abdulkadir CELIKKANAT and Sune Lehmann and Morten M{\\o}rup},\nyear={2022},\nurl={https://openreview.net/forum?id=U-GB_gONqbo}\n}", "github": "", "project": "", "reviewers": "2m7B;BVVN;MkFe;ApfC", "site": "https://openreview.net/forum?id=U-GB_gONqbo", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;4", "correctness": "4;4;3;2", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;3;2;2", 
"wc_summary_paper": "318;15;185;61", "wc_summary_review": "32;13;161;33", "wc_main_review": "434;444;579;156", "wc_review": "784;472;925;250", "wc_reply_reviewers": "184;0;123;33", "wc_reply_authors": "1842;1169;1408;593", "reply_reviewers": "1;0;1;1", "reply_authors": "5;2;3;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 144.75, 117.77600562083943 ], "wc_summary_review_avg": [ 59.75, 58.99735163547598 ], "wc_main_review_avg": [ 403.25, 153.80730639342204 ], "wc_review_avg": [ 607.75, 263.6876703602199 ], "wc_reply_reviewers_avg": [ 85.0, 72.7564430136603 ], "wc_reply_authors_avg": [ 1253.0, 450.9994456759343 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:epOZW3kPfvQJ:scholar.google.com/&scioq=Scalable+Hierarchical+Embeddings+of+Complex+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technical University of Denmark", "aff_unique_dep": "", "aff_unique_url": "https://www.tek.dk", "aff_unique_abbr": "DTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Denmark" }, { "id": "U-_89RnR8F", "title": "Meaningfully Explaining Model Mistakes Using Conceptual Counterfactuals", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding and explaining the mistakes made by trained models is critical to many machine learning objectives, such as improving robustness, addressing concept drift, and mitigating biases. However, this is often an ad hoc process that involves manually looking at the model's mistakes on many test samples and guessing at the underlying reasons for those incorrect predictions. In this paper, we propose a systematic approach, \\textit{conceptual counterfactual explanations} (CCE), that explains why a classifier makes a mistake on a particular test sample(s) in terms of human-understandable concepts (e.g. this zebra is misclassified as a dog because of faint \\emph{stripes}). We base CCE on two prior ideas: counterfactual explanations and concept activation vectors, and validate our approach on well-known pretrained models, showing that it explains the models' mistakes meaningfully. In addition, for new models trained on data with spurious correlations, CCE accurately identifies the spurious correlation as the cause of model mistakes from a single misclassified test sample. On two challenging medical applications, CCE generated useful insights, confirmed by clinicians, into biases and mistakes the model makes in real-world settings. The code for CCE is publicly available and can easily be applied to explain mistakes in new models. 
", "keywords": "interpretability;concept-based explanations;counterfactual explanations", "primary_area": "", "supplementary_material": "", "author": "Abubakar Abid;Mert Yuksekgonul;James Zou", "authorids": "~Abubakar_Abid1;~Mert_Yuksekgonul2;~James_Zou1", "gender": "M;;M", "homepage": ";;https://cs.stanford.edu/~merty", "dblp": ";;249/5558", "google_scholar": ";23ZXZvEAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Abubakar_Abid1;~James_Zou1;~Mert_Yuksekgonul1", "aff": ";Stanford University;Stanford University", "aff_domain": ";stanford.edu;stanford.edu", "position": ";Assistant Professor;PhD student", "bibtex": "@misc{\nabid2022meaningfully,\ntitle={Meaningfully Explaining Model Mistakes Using Conceptual Counterfactuals},\nauthor={Abubakar Abid and Mert Yuksekgonul and James Zou},\nyear={2022},\nurl={https://openreview.net/forum?id=U-_89RnR8F}\n}", "github": "", "project": "", "reviewers": "KwpD;GEvb;nQZu;oufr", "site": "https://openreview.net/forum?id=U-_89RnR8F", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;4", "correctness": "4;4;3;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "97;35;255;87", "wc_summary_review": "358;23;155;88", "wc_main_review": "388;155;421;614", "wc_review": "843;213;831;789", "wc_reply_reviewers": "580;5;181;0", "wc_reply_authors": "2466;293;1429;742", "reply_reviewers": "3;1;1;0", "reply_authors": "5;1;4;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 118.5, 82.24810028201259 ], "wc_summary_review_avg": [ 156.0, 125.61647981057263 ], "wc_main_review_avg": [ 394.5, 163.00690169437613 ], "wc_review_avg": [ 669.0, 264.0340887082575 ], "wc_reply_reviewers_avg": [ 191.5, 235.84793829923552 ], "wc_reply_authors_avg": [ 1232.5, 819.052043523487 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11182808676291825043&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Deep Learning without Shortcuts: Shaping the Kernel with Tailored Rectifiers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6162", "id": "U0k7XNTiFEq", "poster": "", "openreview": "https://openreview.net/forum?id=U0k7XNTiFEq", "slides": "https://iclr.cc/virtual/2022/poster/6162", "video": "https://iclr.cc/virtual/2022/poster/6162", "author_site": "Guodong Zhang, Aleksandar Botev, James Martens", "tldr": "", "abstract": "Training very deep neural networks is still an extremely challenging task. The common solution is to use shortcut connections and normalization layers, which are both crucial ingredients in the popular ResNet architecture. 
However, there is strong evidence to suggest that ResNets behave more like ensembles of shallower networks than truly deep ones. Recently, it was shown that deep vanilla networks (i.e.~networks without normalization layers or shortcut connections) can be trained as fast as ResNets by applying certain transformations to their activation functions. However, this method (called Deep Kernel Shaping) isn't fully compatible with ReLUs, and produces networks that overfit significantly more than ResNets on ImageNet. In this work, we rectify this situation by developing a new type of transformation that is fully compatible with a variant of ReLUs -- Leaky ReLUs. We show in experiments that our method, which introduces negligible extra computational cost, achieves validation accuracies with deep vanilla networks that are competitive with ResNets (of the same width/depth), and significantly higher than those obtained with the Edge of Chaos (EOC) method. And unlike with EOC, the validation accuracies we obtain do not get worse with depth.", "keywords": "Neural Network Training;Kernel Approximation for Neural Networks;Neural Network Initialization;Generalization", "primary_area": "", "supplementary_material": "/attachment/ebf7676418732eb9b378910fa6bde6cc64c5ce61.zip", "author": "Guodong Zhang;Aleksandar Botev;James Martens", "authorids": "~Guodong_Zhang1;~Aleksandar_Botev1;~James_Martens1", "gender": "M;M;M", "homepage": "http://www.cs.toronto.edu/~gdzhang/;;http://www.cs.toronto.edu/~jmartens/", "dblp": "28/4937;183/6489;12/8412", "google_scholar": "B_TZBtwAAAAJ;8k7RD8QAAAAJ;", "orcid": ";;", "linkedin": ";aleksandarbotev/;", "or_profile": "~Guodong_Zhang1;~Aleksandar_Botev1;~James_Martens1", "aff": "Department of Computer Science, University of Toronto;Google DeepMind;Google DeepMind", "aff_domain": "cs.toronto.edu;deepmind.com;google.com", "position": "PhD student;Researcher;Research Scientist", "bibtex": "@inproceedings{\nzhang2022deep,\ntitle={Deep Learning without Shortcuts: Shaping the Kernel with Tailored Rectifiers},\nauthor={Guodong Zhang and Aleksandar Botev and James Martens},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=U0k7XNTiFEq}\n}", "github": "", "project": "", "reviewers": "HqiL;TjGY;vixN;8mde", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;2;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "61;59;31;79", "wc_summary_review": "43;67;38;47", "wc_main_review": "217;393;125;219", "wc_review": "321;519;194;345", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "414;823;500;254", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 57.5, 17.168284713389397 ], "wc_summary_review_avg": [ 48.75, 11.008519428151999 ], "wc_main_review_avg": [ 238.5, 96.94715055121527 ], "wc_review_avg": [ 344.75, 115.81531634460099 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 497.75, 207.49743974324116 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 37, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=3445605992837467130&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=U0k7XNTiFEq", "email": "cs.toronto.edu;deepmind.com;google.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Toronto;Google", "aff_unique_dep": "Department of Computer Science;Google DeepMind", "aff_unique_url": "https://www.utoronto.ca;https://deepmind.com", "aff_unique_abbr": "U of T;DeepMind", "aff_campus_unique_index": "0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Canada;United Kingdom" }, { "id": "U1edbV4kNu_", "title": "SWARM Parallelism: Training Large Models Can Be Surprisingly Communication-Efficient", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many deep learning applications benefit from using large models with billions of parameters. These models can only be trained with specialized distributed training algorithms that require low-latency and high-bandwidth interconnect. As a result, large models are typically trained in dedicated GPU clusters that can be extremely costly to deploy and operate. In contrast, there are more affordable distributed training setups, such as using cheap \"preemptible\" instances or pooling together existing resources from multiple regions. However, both these setups come with unique challenges that make it impractical to train large models using conventional model parallelism. In this work, we carefully analyze these challenges and find configurations where training larger models becomes less communication-intensive. Based on these observations, we propose SWARM Parallelism (Stochastically Wired Adaptively Rebalanced Model Parallelism) \u2014 a model-parallel training algorithm designed for swarms of poorly connected, heterogeneous unreliable devices. SWARM creates temporary randomized pipelines between available nodes that are rebalanced in case of failure. To further reduce the network usage of our approach, we develop several compression-aware architecture modifications and evaluate their tradeoffs. 
Finally, we combine our insights to train a large Transformer language model with 1.1B shared parameters (approximately 13B before sharing) on a swarm of preemptible T4 GPUs with less than 400Mb/s network throughput.", "keywords": "distributed training;model-parallel;model parallelism;pipeline;fault tolerance;communication efficiency;volunteer computing", "primary_area": "", "supplementary_material": "", "author": "Max Ryabinin;Tim Dettmers;Michael Diskin;Alexander Borzunov", "authorids": "~Max_Ryabinin1;~Tim_Dettmers2;~Michael_Diskin1;~Alexander_Borzunov1", "gender": "Not Specified;M;M;M", "homepage": "https://mryab.github.io/;https://timdettmers.com/;;https://github.com/borzunov", "dblp": "276/0192;172/1045;295/8914.html;295/8854", "google_scholar": "930PERsAAAAJ;lHI3w5kAAAAJ;LRKQhcYAAAAJ;https://scholar.google.ru/citations?user=HdwzsCMAAAAJ", "orcid": ";;0000-0001-8902-513X;", "linkedin": ";;https://www.linkedin.com/m/in/yhn112/;", "or_profile": "~Max_Ryabinin1;~Tim_Dettmers2;~Michael_Diskin1;~Alexander_Borzunov1", "aff": "Yandex;University of Washington;Yandex;HSE University", "aff_domain": "yandex-team.ru;cs.washington.edu;yandex-team.ru;hse.ru", "position": "Research Scientist;PhD student;Researcher;Instructor", "bibtex": "@misc{\nryabinin2022swarm,\ntitle={{SWARM} Parallelism: Training Large Models Can Be Surprisingly Communication-Efficient},\nauthor={Max Ryabinin and Tim Dettmers and Michael Diskin and Alexander Borzunov},\nyear={2022},\nurl={https://openreview.net/forum?id=U1edbV4kNu_}\n}", "github": "", "project": "", "reviewers": "RSKK;5Pxy;8gAi;XtNW", "site": "https://openreview.net/forum?id=U1edbV4kNu_", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "5;4;5;4", "correctness": "2;3;3;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "0;3;2;2", "wc_summary_paper": "94;96;395;124", "wc_summary_review": "47;58;157;56", "wc_main_review": "537;399;466;313", "wc_review": "678;553;1018;493", "wc_reply_reviewers": "0;36;330;0", "wc_reply_authors": "1449;1448;1378;990", "reply_reviewers": "0;1;1;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 177.25, 126.27623489794111 ], "wc_summary_review_avg": [ 79.5, 44.93606569338264 ], "wc_main_review_avg": [ 428.75, 82.74773410795005 ], "wc_review_avg": [ 685.5, 203.23939086702657 ], "wc_reply_reviewers_avg": [ 91.5, 138.48014298086207 ], "wc_reply_authors_avg": [ 1316.25, 190.5470742362632 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18445653334642104295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Yandex;University of Washington;Higher School of Economics", "aff_unique_dep": ";;", "aff_unique_url": "https://yandex.com;https://www.washington.edu;https://hse.ru", "aff_unique_abbr": "Yandex;UW;HSE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Russian Federation;United States" }, { "title": "T-WaveNet: A Tree-Structured Wavelet Neural Network for Time Series Signal Analysis", "status": "Poster", 
"track": "main", "site": "https://iclr.cc/virtual/2022/poster/6803", "id": "U4uFaLyg7PV", "poster": "", "openreview": "https://openreview.net/forum?id=U4uFaLyg7PV", "slides": "https://iclr.cc/virtual/2022/poster/6803", "video": "https://iclr.cc/virtual/2022/poster/6803", "author_site": "Minhao LIU, Ailing Zeng, Qiuxia LAI, Ruiyuan Gao, Min Li, Jing Qin, Qiang Xu", "tldr": "", "abstract": "Time series signal analysis plays an essential role in many applications, e.g., activity recognition and healthcare monitoring.\nRecently, features extracted with deep neural networks (DNNs) have shown to be more effective than conventional hand-crafted ones.\nHowever, most existing solutions rely solely on the network to extract information carried in the raw signal, regardless of its inherent physical and statistical properties, leading to sub-optimal performance particularly under a limited amount of training data.\nIn this work, we propose a novel tree-structured wavelet neural network for time series signal analysis, namely \\emph{T-WaveNet}, taking advantage of an inherent property of various types of signals, known as the \\emph{dominant frequency range}. Specifically, with \\emph{T-WaveNet}, we first conduct frequency spectrum energy analysis of the signals to get a set of dominant frequency subbands. Then, we construct a tree-structured network that iteratively decomposes the input signal into various frequency subbands with similar energies. Each node on the tree is built with an invertible neural network (INN) based wavelet transform unit. Such a disentangled representation learning method facilitates a more effective extraction of the discriminative features, as demonstrated with the comprehensive experiments on various real-life time series classification datasets. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1f5132768675a9d90eaa3c6a4fcf7c3578ee7975.zip", "author": "Minhao LIU;Ailing Zeng;Qiuxia LAI;Ruiyuan Gao;Min Li;Jing Qin;Qiang Xu", "authorids": "~Minhao_LIU1;~Ailing_Zeng1;~Qiuxia_LAI1;~Ruiyuan_Gao2;~Min_Li7;~Jing_Qin3;~Qiang_Xu1", "gender": "M;F;F;;M;M;M", "homepage": "https://scholar.google.com/citations?user=MUTHUDAAAAAJ&hl=en&oi=ao;https://ailingzeng.site/;https://ashleylqx.github.io/;;;https://harry-qinjing.github.io/;https://github.com/cure-lab", "dblp": "79/10137;226/4720;210/4586.html;;82/0-19;00/1015-1;43/1230-1", "google_scholar": "MUTHUDAAAAAJ;Tn7fzS8AAAAJ;LwIItp4AAAAJ;;X5gRH80AAAAJ;X3Wi7wkAAAAJ;https://scholar.google.com.tw/citations?user=eSiKPqUAAAAJ", "orcid": ";;0000-0001-6872-5540;;;0000-0002-7059-0929;", "linkedin": ";%E7%88%B1%E7%8E%B2-%E6%9B%BE-65504112a/;%E7%A7%8B%E9%9C%9E-%E8%B5%96-11813b169/;;;;", "or_profile": "~Minhao_LIU1;~Ailing_Zeng1;~Qiuxia_LAI1;~Ruiyuan_Gao2;~Min_Li7;~Jing_Qin3;~Qiang_Xu1", "aff": ";The Chinese University of Hong Kong;Communication University of China;;Chinese University of Hong Kong;Hong Kong Polytechnic University;The Chinese University of Hong Kong", "aff_domain": ";cuhk.edu.hk;cuc.edu.cn;;cuhk.hk;polyu.edu.hk;cuhk.edu.hk", "position": ";PhD student;Assistant Professor;;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nliu2022twavenet,\ntitle={T-WaveNet: A Tree-Structured Wavelet Neural Network for Time Series Signal Analysis},\nauthor={Minhao LIU and Ailing Zeng and Qiuxia LAI and Ruiyuan Gao and Min Li and Jing Qin and Qiang Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=U4uFaLyg7PV}\n}", "github": "", "project": "", "reviewers": "4oWV;jwqd;jjB7;VTwR", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;2;3", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;0;3;3", "wc_summary_paper": "46;63;52;123", "wc_summary_review": "176;44;37;27", "wc_main_review": "308;82;178;174", "wc_review": "530;189;267;324", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 71.0, 30.63494736408078 ], "wc_summary_review_avg": [ 71.0, 60.92208138269736 ], "wc_main_review_avg": [ 185.5, 80.47825793343193 ], "wc_review_avg": [ 327.5, 126.35367030680193 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14855473382118595550&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=U4uFaLyg7PV", "email": ";cuhk.edu.hk;cuc.edu.cn;;cuhk.hk;polyu.edu.hk;cuhk.edu.hk", "author_num": 7, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Chinese University of Hong Kong;Communication University of China;Hong Kong Polytechnic University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.cuc.edu.cn/;https://www.polyu.edu.hk", "aff_unique_abbr": 
"CUHK;CUC;PolyU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Differentiable Gradient Sampling for Learning Implicit 3D Scene Reconstructions from a Single Image", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6075", "id": "U8pbd00cCWB", "poster": "", "openreview": "https://openreview.net/forum?id=U8pbd00cCWB", "slides": "https://iclr.cc/virtual/2022/poster/6075", "video": "https://iclr.cc/virtual/2022/poster/6075", "author_site": "Shizhan Zhu, Sayna Ebrahimi, Angjoo Kanazawa, trevor darrell", "tldr": "", "abstract": "Implicit shape models are promising 3D representations for modeling arbitrary locations, with Signed Distance Functions (SDFs) particularly suitable for clear mesh surface reconstruction. Existing approaches for single object reconstruction impose supervision signals based on the loss of the signed distance value from all locations in a scene, posing difficulties when extending to real-world scenarios. The spatial gradient of the signed distance field, rather than the SDF value itself, has not been typically employed as a source of supervision for single-view reconstruction, in part due to the difficulties of differentiable sampling a spatial gradient from the feature map. In this study, we derive a novel closed-form gradient sampling solution for Differentialble Gradient Sampling (DGS) that enables backpropagation of the loss of the spatial gradient back to the feature map pixels, thus allowing the imposition of the loss efficiently on the spatial gradient. As a result, we achieve high-quality single view indoor scene reconstruction results learning directly from a real-world scanned dataset (e.g. ScannetV2). Our model also performs well when generalizing to unseen images downloaded directly from the internet (Fig. 1). We comfortably advanced the state-of-the-art results with several established datasets including ShapeNet and ScannetV2; extensive quantitative analysis confirmed that our proposed DGS module plays an essential role in achieving this performance improvement. 
Full codes are available in MaskedURL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shizhan Zhu;Sayna Ebrahimi;Angjoo Kanazawa;Trevor Darrell", "authorids": "~Shizhan_Zhu1;~Sayna_Ebrahimi1;~Angjoo_Kanazawa1;~Trevor_Darrell2", "gender": "M;F;F;M", "homepage": "https://zhusz.github.io/homepage/;https://saynaebrahimi.github.io/;https://people.eecs.berkeley.edu/~kanazawa/;https://people.eecs.berkeley.edu/~trevor/", "dblp": "151/6544;207/7584;119/1305;d/TrevorDarrell", "google_scholar": "XzeytVAAAAAJ;wRyjJfMAAAAJ;Ci-_QYIAAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ", "orcid": ";;;", "linkedin": ";saynaebrahimi/;;", "or_profile": "~Shizhan_Zhu1;~Sayna_Ebrahimi1;~Angjoo_Kanazawa1;~trevor_darrell1", "aff": "UC Berkeley;Google;University of California, Berkeley;Electrical Engineering & Computer Science Department", "aff_domain": "berkeley.edu;google.com;berkeley.edu;eecs.berkeley.edu", "position": "Graduate Student;Research Scientist;Assistant Professor;Professor", "bibtex": "@inproceedings{\nzhu2022differentiable,\ntitle={Differentiable Gradient Sampling for Learning Implicit 3D Scene Reconstructions from a Single Image},\nauthor={Shizhan Zhu and Sayna Ebrahimi and Angjoo Kanazawa and Trevor Darrell},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=U8pbd00cCWB}\n}", "github": "", "project": "", "reviewers": "ZTzy;BEgr;BwV5;SoBW", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;4;3;4", "wc_summary_paper": "79;71;58;141", "wc_summary_review": "67;30;30;30", "wc_main_review": "859;238;151;93", "wc_review": "1005;339;239;264", "wc_reply_reviewers": "175;0;0;114", "wc_reply_authors": "2703;527;573;174", "reply_reviewers": "1;0;0;1", "reply_authors": "6;1;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 87.25, 31.92471613029629 ], "wc_summary_review_avg": [ 39.25, 16.021469970012117 ], "wc_main_review_avg": [ 335.25, 306.7591685671351 ], "wc_review_avg": [ 461.75, 315.7969086295811 ], "wc_reply_reviewers_avg": [ 72.25, 75.40018236052218 ], "wc_reply_authors_avg": [ 994.25, 998.5502929246979 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 2.0615528128088303 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3729432650786353902&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=U8pbd00cCWB", "email": "berkeley.edu;google.com;berkeley.edu;eecs.berkeley.edu", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of California, Berkeley;Google;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";Google;Electrical Engineering & Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com;", "aff_unique_abbr": "UC Berkeley;Google;", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Berkeley;Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "U9zTUXVdoIr", "title": "GSmooth: Certified Robustness against 
Semantic Transformations via Generalized Randomized Smoothing", "track": "main", "status": "Reject", "tldr": "", "abstract": "The vulnerability of deep learning models to adversarial examples and semantic transformations has limited their application in risk-sensitive areas. The recent development of certified defense approaches like randomized smoothing provides a promising direction towards building reliable machine learning systems. However, current certified defenses cannot handle complex semantic transformations like rotational blur and defocus blur, which are common in practical applications. In this paper, we propose a generalized randomized smoothing framework (GSmooth) for certified robustness against semantic transformations. We provide both a unified and rigorous theoretical framework and scalable algorithms for certified robustness on complex semantic transformations. Specifically, our key idea is to use a surrogate image-to-image neural network to approximate a transformation, which provides a powerful tool for studying the properties of semantic transformations, and to certify the transformation based on this neural network. Experiments on multiple types of semantic perturbations and corruptions using multiple datasets demonstrate the effectiveness of our approach.", "keywords": "Randomized Smoothing;Adversarial Robustness;Semantic Transformations;Machine Learning", "primary_area": "", "supplementary_material": "/attachment/7b40d168722043564321384b53fa210e7ba051fe.zip", "author": "Hao Zhongkai;Chengyang Ying;Yinpeng Dong;Hang Su;Jun Zhu", "authorids": "~Hao_Zhongkai1;~Chengyang_Ying1;~Yinpeng_Dong2;~Hang_Su3;~Jun_Zhu2", "gender": "M;M;M;M;M", "homepage": "https://haozhongkai.github.io/;https://yingchengyang.github.io/;https://dongyp13.github.io;http://ml.cs.tsinghua.edu.cn/~jun;", "dblp": "270/0220.html;296/2065;183/0980;50/2644-1;26/5371-6", "google_scholar": "dfSzq27ZiVoC;vM6KE18AAAAJ;6_4ad84AAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ", "orcid": ";;;;", "linkedin": ";%E9%93%96%E9%98%B3-%E5%BA%94-9b682a203/;;;", "or_profile": "~Hao_Zhongkai1;~Chengyang_Ying1;~Yinpeng_Dong2;~Jun_Zhu2;~Hang_Su2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;Professor;Associate Professor", "bibtex": "@misc{\nzhongkai2022gsmooth,\ntitle={{GS}mooth: Certified Robustness against Semantic Transformations via Generalized Randomized Smoothing},\nauthor={Hao Zhongkai and Chengyang Ying and Yinpeng Dong and Hang Su and Jun Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=U9zTUXVdoIr}\n}", "github": "", "project": "", "reviewers": "HKx4;oUek;bwyb;r3hs", "site": "https://openreview.net/forum?id=U9zTUXVdoIr", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "3;3;5;4", "correctness": "2;4;3;3", "technical_novelty": "3;2;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "38;83;84;153", "wc_summary_review": "28;27;20;124", "wc_main_review": "546;699;176;420", "wc_review": "612;809;280;697", "wc_reply_reviewers": "62;109;36;139", "wc_reply_authors": "1487;991;260;874", "reply_reviewers": "1;1;1;2", "reply_authors": "3;3;1;3", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ],
"wc_summary_paper_avg": [ 89.5, 41.100486615124154 ], "wc_summary_review_avg": [ 49.75, 42.97891925118639 ], "wc_main_review_avg": [ 460.25, 191.55465930120312 ], "wc_review_avg": [ 599.5, 197.25174270459564 ], "wc_reply_reviewers_avg": [ 86.5, 40.04060439104285 ], "wc_reply_authors_avg": [ 903.0, 436.77511375993026 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8528028654224419, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1944882134693943289&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "UECzHrGio7i", "title": "Robust Imitation Learning from Corrupted Demonstrations", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider offline Imitation Learning from corrupted demonstrations where a constant fraction of data can be noise or even arbitrary outliers. Classical approaches such as Behavior Cloning assumes that demonstrations are collected by an presumably optimal expert, hence\nmay fail drastically when learning from corrupted demonstrations. We propose a novel robust algorithm by minimizing a Median-of-Means (MOM) objective which guarantees the accurate estimation of policy, even in the presence of constant fraction of outliers. \nOur theoretical analysis shows that our robust method in the corrupted setting enjoys nearly the same error scaling and sample complexity guarantees as the classical Behavior Cloning in the expert demonstration setting. 
Our experiments on continuous-control benchmarks validate that existing algorithms are fragile under corrupted demonstrations, while our method exhibits the predicted robustness and effectiveness.", "keywords": "Robust Estimation;Imitation Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/cf731377a48745ca815c5567a5a0b8372a6f647f.zip", "author": "Liu Liu;Ziyang Tang;Lanqing Li;Dijun Luo", "authorids": "~Liu_Liu3;~Ziyang_Tang1;~Lanqing_Li1;~Dijun_Luo1", "gender": "M;M;M;M", "homepage": "http://liuliuforph.github.io;http://www.cs.utexas.edu/~ztang/;https://lanqingli1993.github.io/;https://sites.google.com/site/dijunluo/", "dblp": "74/7037;146/1303;275/9979;", "google_scholar": "hP8aSTAAAAAJ;u8y0FJ4AAAAJ;n8IjgKkAAAAJ;y_1aniIAAAAJ", "orcid": ";;0000-0003-1998-4022;", "linkedin": ";;lanqing-li-%EF%BC%88%E6%9D%8E%E8%93%9D%E9%9D%92%EF%BC%89-49209a83/;", "or_profile": "~Liu_Liu3;~Ziyang_Tang1;~Lanqing_Li1;~Dijun_Luo1", "aff": "Tencent AI Lab;University of Texas, Austin;Tencent AI Lab;Tencent AI Lab", "aff_domain": "tencent.com;utexas.edu;tencent.com;tencent.com", "position": "Researcher;PhD student;Research Scientist;Researcher", "bibtex": "@misc{\nliu2022robust,\ntitle={Robust Imitation Learning from Corrupted Demonstrations},\nauthor={Liu Liu and Ziyang Tang and Lanqing Li and Dijun Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=UECzHrGio7i}\n}", "github": "", "project": "", "reviewers": "9dLk;rs1D;Sr3Z;tcU1", "site": "https://openreview.net/forum?id=UECzHrGio7i", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;3", "correctness": "2;2;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "102;106;145;159", "wc_summary_review": "159;43;55;31", "wc_main_review": "1391;263;301;109", "wc_review": "1652;412;501;299", "wc_reply_reviewers": "141;0;0;0", "wc_reply_authors": "437;132;175;118", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 128.0, 24.545875417267155 ], "wc_summary_review_avg": [ 72.0, 50.941142507800116 ], "wc_main_review_avg": [ 516.0, 510.27149636247566 ], "wc_review_avg": [ 716.0, 545.1206288520001 ], "wc_reply_reviewers_avg": [ 35.25, 61.054790966802926 ], "wc_reply_authors_avg": [ 215.5, 129.59648914997658 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12759134138723636881&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Tencent;University of Texas at Austin", "aff_unique_dep": "Tencent AI Lab;", "aff_unique_url": "https://ai.tencent.com;https://www.utexas.edu", "aff_unique_abbr": "Tencent AI Lab;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "UF5cHSBycOt", "title": "Learning to Pool in Graph Neural Networks for Extrapolation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) are one of the most popular approaches to using deep learning on graph-structured 
data, and they have shown state-of-the-art performance on a variety of tasks. However, according to a recent study, a careful choice of pooling functions, which are used for the aggregation and readout operations in GNNs, is crucial for enabling GNNs to extrapolate. Without proper choices of pooling functions, which vary across tasks, GNNs completely fail to generalize to out-of-distribution data, while the number of possible choices grows exponentially with the number of layers. In this paper, we present GNP, an $L^p$ norm-like pooling function that is trainable end-to-end for any given task. Notably, GNP generalizes most of the widely-used pooling functions. We verify experimentally that simply using GNP for every aggregation and readout operation enables GNNs to extrapolate well on many node-level, graph-level, and set-related tasks; and GNP sometimes performs even better than the best-performing choices among existing pooling functions.", "keywords": "Graph Neural Network;Pooling;Extrapolation", "primary_area": "", "supplementary_material": "/attachment/24f7fcc04177342077fd9a06e60066952b51198e.zip", "author": "Jihoon Ko;Taehyung Kwon;Kijung Shin;Juho Lee", "authorids": "~Jihoon_Ko2;~Taehyung_Kwon1;~Kijung_Shin2;~Juho_Lee2", "gender": ";M;M;M", "homepage": ";https://kbrother.github.io/;https://kijungs.github.io/;https://juho.lee.github.io", "dblp": "127/7344;175/2163;153/2052;55/3410-1", "google_scholar": "_m0bPIQAAAAJ;https://scholar.google.co.kr/citations?user=Ld_e3xIAAAAJ;https://scholar.google.co.kr/citations?user=Yp3Cz5AAAAAJ;Py4URJUAAAAJ", "orcid": ";0000-0002-6177-7329;0000-0002-2872-1526;", "linkedin": ";;kijungshin/;", "or_profile": "~Jihoon_Ko2;~Taehyung_Kwon1;~Kijung_Shin2;~Juho_Lee2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;MS student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nko2022learning,\ntitle={Learning to Pool in Graph Neural Networks for Extrapolation},\nauthor={Jihoon Ko and Taehyung Kwon and Kijung Shin and Juho Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=UF5cHSBycOt}\n}", "github": "", "project": "", "reviewers": "NEG6;17q4;oxmV;hW8Y", "site": "https://openreview.net/forum?id=UF5cHSBycOt", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "71;37;67;100", "wc_summary_review": "41;47;13;73", "wc_main_review": "467;349;178;333", "wc_review": "579;433;258;506", "wc_reply_reviewers": "98;114;73;73", "wc_reply_authors": "502;484;411;324", "reply_reviewers": "1;1;2;1", "reply_authors": "1;1;3;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 68.75, 22.320114247019436 ], "wc_summary_review_avg": [ 43.5, 21.324868112136123 ], "wc_main_review_avg": [ 331.75, 102.75060827070563 ], "wc_review_avg": [ 444.0, 119.14906629932103 ], "wc_reply_reviewers_avg": [ 89.5, 17.44276354251241 ], "wc_reply_authors_avg": [ 430.25, 70.17255517650757 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 19, 0 ],
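As a rough illustration of the trainable $L^p$-norm-like pooling idea in the GNP entry above (a simplified sketch, not the paper's full parameterization; the softplus reparameterization and the constraint p > 1 are assumptions):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GeneralizedNormPooling(nn.Module):
    """Trainable L^p-norm-like aggregator (simplified sketch of the GNP idea)."""

    def __init__(self, init_p_raw: float = 1.0):
        super().__init__()
        self.p_raw = nn.Parameter(torch.tensor(init_p_raw))

    def forward(self, x: torch.Tensor, dim: int = 0) -> torch.Tensor:
        # Softplus keeps the learned exponent smooth and constrains p > 1:
        # p near 1 behaves like sum pooling, large p approaches max pooling.
        p = F.softplus(self.p_raw) + 1.0
        return x.abs().clamp_min(1e-8).pow(p).sum(dim=dim).pow(1.0 / p)
```

Used in place of a fixed sum/mean/max over the node dimension for aggregation and readout, the exponent is learned jointly with the GNN weights.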
"authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5909761746673289151&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "UFYYol-bRq", "title": "ANCER: Anisotropic Certification via Sample-wise Volume Maximization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Randomized smoothing has recently emerged as an effective tool that enables certification of deep neural network classifiers at scale. All prior art on randomized smoothing has focused on isotropic $\\ell_p$ certification, which has the advantage of yielding certificates that can be easily compared among isotropic methods via $\\ell_p$-norm radius. However, isotropic certification limits the region that can be certified around an input to worst-case adversaries, i.e., it cannot reason about other \"close\", potentially large, constant prediction safe regions. To alleviate this issue, (i) we theoretically extend the isotropic randomized smoothing $\\ell_1$ and $\\ell_2$ certificates to their generalized anisotropic counterparts following a simplified analysis. Moreover, (ii) we propose evaluation metrics allowing for the comparison of general certificates - a certificate is superior to another if it certifies a superset region - with the quantification of each certificate through the volume of the certified region. We introduce ANCER, a framework for obtaining anisotropic certificates for a given test set sample via volume maximization. We achieve it by generalizing memory-based certification of data-dependent classifiers. Our empirical results demonstrate that ANCER achieves state-of-the-art $\\ell_1$ and $\\ell_2$ certified accuracy on CIFAR-10 and ImageNet, while certifying larger regions in terms of volume, highlighting the benefits of moving away from isotropic analysis.", "keywords": "randomized smoothing;anisotropic certification;deep neural network certification;certified defenses", "primary_area": "", "supplementary_material": "/attachment/d8e73d24cdb1c83425c4795e7b2f14d671b95e0a.zip", "author": "Francisco Eiras;Motasem Alfarra;Philip Torr;M. Pawan Kumar;Puneet K. 
Dokania;Bernard Ghanem;Adel Bibi", "authorids": "~Francisco_Eiras1;~Motasem_Alfarra1;~Philip_Torr1;~M._Pawan_Kumar1;~Puneet_K._Dokania1;~Bernard_Ghanem1;~Adel_Bibi1", "gender": "M;M;;;M;M;M", "homepage": "https://fgirbal.github.io;https://motasemalfarra.netlify.app/;http://www.robots.ox.ac.uk/~tvg/;;https://ivul.kaust.edu.sa;http://adelbibi.com;http://puneetkdokania.github.io/", "dblp": "218/5843;255/5192;;45/2527;37/2516;176/0964;150/4211", "google_scholar": "O_iJTgYAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;rVsGTeEAAAAJ;Q4j2laYAAAAJ;https://scholar.google.fr/citations?user=WsM7ybkAAAAJ", "orcid": ";;;;0000-0002-5534-587X;0000-0002-6169-3918;", "linkedin": "franciscogirbaleiras/;;;;bernardghanem/;adel-bibi-ba3671ab/;", "or_profile": "~Francisco_Eiras1;~Motasem_Alfarra1;~Philip_Torr1;~M._Pawan_Kumar1;~Bernard_Ghanem1;~Adel_Bibi1;~Puneet_Dokania1", "aff": "University of Oxford;KAUST;University of Oxford;Google DeepMind;King Abdullah University of Science and Technology;University of Oxford;University of Oxford", "aff_domain": "ox.ac.uk;kaust.edu.sa;ox.ac.uk;deepmind.com;kaust.edu.sa;ox.ac.uk;oxford.ac.uk", "position": "PhD student;PhD student;Full Professor;Researcher;Associate Professor;Postdoc;Senior Researcher", "bibtex": "@misc{\neiras2022ancer,\ntitle={{ANCER}: Anisotropic Certification via Sample-wise Volume Maximization},\nauthor={Francisco Eiras and Motasem Alfarra and Philip Torr and M. Pawan Kumar and Puneet K. Dokania and Bernard Ghanem and Adel Bibi},\nyear={2022},\nurl={https://openreview.net/forum?id=UFYYol-bRq}\n}", "github": "", "project": "", "reviewers": "fXE9;xsin;xapX;Fomp", "site": "https://openreview.net/forum?id=UFYYol-bRq", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;5;3;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "119;38;47;54", "wc_summary_review": "57;48;33;52", "wc_main_review": "551;717;182;353", "wc_review": "727;803;262;459", "wc_reply_reviewers": "416;200;0;0", "wc_reply_authors": "1484;2009;310;497", "reply_reviewers": "2;1;0;0", "reply_authors": "3;5;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.5, 31.972644557496334 ], "wc_summary_review_avg": [ 47.5, 8.958236433584458 ], "wc_main_review_avg": [ 450.75, 201.69330058284038 ], "wc_review_avg": [ 562.75, 215.58800407258286 ], "wc_reply_reviewers_avg": [ 154.0, 171.89531698100447 ], "wc_reply_authors_avg": [ 1075.0, 699.8117604041819 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 1.6583123951777 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17782604003861205737&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0;2;1;0;0", "aff_unique_norm": "University of Oxford;King Abdullah University of Science and Technology;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://www.ox.ac.uk;https://www.kaust.edu.sa;https://deepmind.com", "aff_unique_abbr": "Oxford;KAUST;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0;0", 
"aff_country_unique": "United Kingdom;Saudi Arabia" }, { "id": "UGINpaICVOt", "title": "Neural networks with trainable matrix activation functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "The training process of neural networks usually optimize weights and bias parameters of linear transformations, while nonlinear activation functions are pre-specified and fixed. This work develops a systematic approach to constructing matrix activation functions whose entries are generalized from ReLU. The activation is based on matrix-vector multiplications using only scalar multiplications and comparisons. The proposed activation functions depend on parameters that are trained along with the weights and bias vectors. Neural networks based on this approach are simple and efficient and are shown to be robust in numerical experiments.", "keywords": "neural networks;trainable activation function;function approximation;image classification", "primary_area": "", "supplementary_material": "", "author": "Yuwen Li;Zhengqi Liu;Ludmil Zikatanov", "authorids": "~Yuwen_Li1;~Zhengqi_Liu1;~Ludmil_Zikatanov1", "gender": "M;M;", "homepage": "https://sites.google.com/view/liyuwen;https://github.com/liuzhengqi1996;http://www.personal.psu.edu/ltz1/", "dblp": ";;75/3099.html", "google_scholar": "7kpkBewAAAAJ;;7QW688MAAAAJ", "orcid": ";;my-orcid?orcid=0000-0002-5189-4230", "linkedin": ";;", "or_profile": "~Yuwen_Li1;~Zhengqi_Liu1;~Ludmil_Zikatanov1", "aff": ";Pennsylvania State University;Pennsylvania State University", "aff_domain": ";psu.edu;psu.edu", "position": ";PhD student;Full Professor", "bibtex": "@misc{\nli2022neural,\ntitle={Neural networks with trainable matrix activation functions},\nauthor={Yuwen Li and Zhengqi Liu and Ludmil Zikatanov},\nyear={2022},\nurl={https://openreview.net/forum?id=UGINpaICVOt}\n}", "github": "", "project": "", "reviewers": "FB5F;MSbu;MkDq;mR17;iGq4", "site": "https://openreview.net/forum?id=UGINpaICVOt", "pdf_size": 0, "recommendation": "1;1;3;3;3", "confidence": "4;4;5;3;4", "correctness": "2;3;2;1;2", "technical_novelty": "2;1;2;1;2", "empirical_novelty": "1;1;1;1;1", "wc_summary_paper": "49;117;68;40;70", "wc_summary_review": "46;19;31;33;20", "wc_main_review": "824;240;224;318;194", "wc_review": "919;376;323;391;284", "wc_reply_reviewers": "309;56;69;0;52", "wc_reply_authors": "523;403;300;198;391", "reply_reviewers": "1;1;1;0;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 2.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.0, 0.6324555320336759 ], "technical_novelty_avg": [ 1.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.0, 0.0 ], "wc_summary_paper_avg": [ 68.8, 26.63381309538685 ], "wc_summary_review_avg": [ 29.8, 9.867117106835208 ], "wc_main_review_avg": [ 360.0, 235.58947344904865 ], "wc_review_avg": [ 458.6, 233.33803804780737 ], "wc_reply_reviewers_avg": [ 97.2, 108.48299405897683 ], "wc_reply_authors_avg": [ 363.0, 108.79154378902801 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.6454972243679028, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gQAJp6NSPYgJ:scholar.google.com/&scioq=Neural+networks+with+trainable+matrix+activation+functions&hl=en&as_sdt=0,33", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Pennsylvania State University", "aff_unique_dep": "", "aff_unique_url": 
"https://www.psu.edu", "aff_unique_abbr": "PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "UI4K-I2ypG", "title": "A Survey on Evidential Deep Learning For Single-Pass Uncertainty Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Popular approaches for quantifying predictive uncertainty in deep neural networks often involve a set of weights or models, for instance via ensembling or Monte Carlo Dropout. These techniques usually produce overhead by having to train multiple model instances or do not produce very diverse predictions. This survey aims to familiarize the reader with an alternative class of models based on the concept of Evidential Deep Learning: For unfamiliar data, they admit \u201cwhat they don\u2019t know\u201d and fall back onto a prior belief. Furthermore, they allow uncertainty estimation in a single model and forward pass by parameterizing distributions over distributions. This survey recapitulates existing works, focusing on the implementation in a classification setting. Finally, we survey the application of the same paradigm to regression problems. We also provide a reflection on the strengths and weaknesses of the mentioned approaches compared to existing ones and provide the most central theoretical results in order to inform future research.", "keywords": "uncertainty estimation;prior networks;posterior networks;conjugate priors;classification;regression;evidential deep learning;dirichlet", "primary_area": "", "supplementary_material": "", "author": "Dennis Thomas Ulmer", "authorids": "~Dennis_Thomas_Ulmer1", "gender": "Non-Binary", "homepage": "http://dennisulmer.eu/", "dblp": "", "google_scholar": "rn9WoaEAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Dennis_Thomas_Ulmer1", "aff": "IT University of Copenhagen", "aff_domain": "itu.dk", "position": "PhD student", "bibtex": "@misc{\nulmer2022a,\ntitle={A Survey on Evidential Deep Learning For Single-Pass Uncertainty Estimation},\nauthor={Dennis Thomas Ulmer},\nyear={2022},\nurl={https://openreview.net/forum?id=UI4K-I2ypG}\n}", "github": "", "project": "", "reviewers": "vS5D;H7TT;8cR8;eCYd", "site": "https://openreview.net/forum?id=UI4K-I2ypG", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "3;4;4;4", "correctness": "4;3;4;3", "technical_novelty": "1;1;1;2", "empirical_novelty": "0;0;1;0", "wc_summary_paper": "64;74;79;38", "wc_summary_review": "49;133;97;59", "wc_main_review": "254;301;244;604", "wc_review": "367;508;420;701", "wc_reply_reviewers": "0;45;107;0", "wc_reply_authors": "359;130;332;97", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.75, 15.81731646013318 ], "wc_summary_review_avg": [ 84.5, 33.237779709240506 ], "wc_main_review_avg": [ 350.75, 147.78933486554433 ], "wc_review_avg": [ 499.0, 127.033460159125 ], "wc_reply_reviewers_avg": [ 38.0, 43.8691235380877 ], "wc_reply_authors_avg": [ 229.5, 116.97542476947882 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 36, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=15653345888379663016&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "IT University of Copenhagen", "aff_unique_dep": "", "aff_unique_url": "https://itu.dk", "aff_unique_abbr": "ITU", "aff_country_unique_index": "0", "aff_country_unique": "Denmark" }, { "id": "UIQxciuYcon", "title": "Revisiting Contrastive Learning through the Lens of Neighborhood Component Analysis: an Integrated Framework", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As a seminal tool in self-supervised representation learning, contrastive learning has gained unprecedented attention in recent years. In essence, contrastive learning aims to leverage pairs of positive and negative samples for representation learning, which relates to exploiting neighborhood information in a feature space. By investigating the connection between contrastive learning and neighborhood component analysis (NCA), we provide a novel stochastic nearest neighbor viewpoint of contrastive learning and subsequently propose a series of contrastive losses that outperform the existing ones. Under our proposed framework, we show a principled way to design integrated contrastive losses that simultaneously achieve good accuracy and robustness on downstream tasks.", "keywords": "representation learning", "primary_area": "", "supplementary_material": "/attachment/6e1421ceeadf9faadaf4a5caa0e1271223b832a8.zip", "author": "Ching-Yun Ko;Jeet Mohapatra;Pin-Yu Chen;Sijia Liu;Luca Daniel;Tsui-Wei Weng", "authorids": "~Ching-Yun_Ko1;~Jeet_Mohapatra1;~Pin-Yu_Chen1;~Sijia_Liu1;~Luca_Daniel1;~Tsui-Wei_Weng1", "gender": "F;M;M;M;;F", "homepage": ";;http://www.pinyuchen.com;https://lsjxjtu.github.io/;https://www.mit.edu/~dluca/;https://lilywenglab.github.io", "dblp": "206/6472;210/2304;39/8969;128/6972-1;35/5202;177/9197", "google_scholar": ";;jxwlCUUAAAAJ;C7dO_UgAAAAJ;;v8GM4xoAAAAJ", "orcid": ";;0000-0003-1039-8369;;0000-0002-5880-3151;", "linkedin": ";;pin-yu-chen-940062a2;;;", "or_profile": "~Ching-Yun_Ko1;~Jeet_Mohapatra1;~Pin-Yu_Chen1;~Sijia_Liu1;~Luca_Daniel1;~Tsui-Wei_Weng1", "aff": "Massachusetts Institute of Technology;;International Business Machines;Michigan State University;;University of California, San Diego", "aff_domain": "mit.edu;;ibm.com;msu.edu;;ucsd.edu", "position": "PhD student;;Research Staff Member;Assistant Professor;;Assistant Professor", "bibtex": "@misc{\nko2022revisiting,\ntitle={Revisiting Contrastive Learning through the Lens of Neighborhood Component Analysis: an Integrated Framework },\nauthor={Ching-Yun Ko and Jeet Mohapatra and Pin-Yu Chen and Sijia Liu and Luca Daniel and Tsui-Wei Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=UIQxciuYcon}\n}", "github": "", "project": "", "reviewers": "9BWx;waF4;DLDG;jY8a;6SgW", "site": "https://openreview.net/forum?id=UIQxciuYcon", "pdf_size": 0, "recommendation": "3;5;5;5;6", "confidence": "4;4;4;5;3", "correctness": "3;3;2;2;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "1;2;3;2;3", "wc_summary_paper": "44;53;66;29;102", "wc_summary_review": "53;53;148;59;43", "wc_main_review": "610;464;1097;122;490", "wc_review": "707;570;1311;210;635", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], 
"empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 58.8, 24.733782565551916 ], "wc_summary_review_avg": [ 71.2, 38.74222502644886 ], "wc_main_review_avg": [ 556.6, 315.20507610125827 ], "wc_review_avg": [ 686.6, 356.0048314278895 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3227486121839514, "corr_recommendation_correctness": -0.16666666666666663, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6422514382686875606&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Massachusetts Institute of Technology;International Business Machines Corporation;Michigan State University;University of California, San Diego", "aff_unique_dep": ";;;", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com;https://www.msu.edu;https://www.ucsd.edu", "aff_unique_abbr": "MIT;IBM;MSU;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "UJ9_wmscwk", "title": "Learning Graph Representations for Influence Maximization", "track": "main", "status": "Reject", "tldr": "", "abstract": "As the field of machine learning for combinatorial optimization advances, traditional problems are resurfaced and readdressed through this new perspective. The overwhelming majority of the literature focuses on small graph problems, while several real-world problems are devoted to large graphs. Here, we focus on two such problems: influence estimation, a #P-hard counting problem, and influence maximization, an NP-hard problem. We develop Glie, a Graph Neural Network (GNN) that inherently parameterizes an upper bound of influence estimation and train it on small simulated graphs. Experiments show that Glie provides accurate influence estimation for real graphs up to 10 times larger than the train set. More importantly, it can be used for influence maximization on considerably larger graphs, as the predictions ranking is not affected by the drop of accuracy. We develop a version of Cost Effective Lazy Forward optimization with Glie instead of simulated influence estimation, surpassing the benchmark for influence maximization, although with a computational overhead. To balance the time complexity and quality of influence, we propose two different approaches. The first is a Q-network that learns to choose seeds sequentially using Glie's predictions. The second defines a provably submodular function based on Glie's representations to rank nodes fast while building the seed set. The latter provides the best combination of time efficiency and influence spread, outperforming SOTA benchmarks.", "keywords": "influence maximization;graph neural networks", "primary_area": "", "supplementary_material": "/attachment/6d8ce4a922847e5c20b8ba562a6103a99d6eb4b6.zip", "author": "George Panagopoulos;Nikolaos Tziortziotis;Fragkiskos D. 
Malliaros;Michalis Vazirgiannis", "authorids": "~George_Panagopoulos1;~Nikolaos_Tziortziotis1;~Fragkiskos_D._Malliaros1;~Michalis_Vazirgiannis2", "gender": "M;M;M;", "homepage": "https://giorgospanagopoulos.github.io/;https://ntziortziotis.github.io;http://fragkiskos.me/;", "dblp": ";34/9922;22/9458;", "google_scholar": "https://scholar.google.gr/citations?user=6PLvpVMAAAAJ;Rfxpz04AAAAJ;_7heOKcAAAAJ;", "orcid": ";;;", "linkedin": "giorgospanagopoulos/;;;", "or_profile": "~George_Panagopoulos1;~Nikolaos_Tziortziotis1;~Fragkiskos_D._Malliaros1;~Michalis_Vazirgiannis2", "aff": "Ecole polytechnique;Jellyfish Inc;CentraleSup\u00e9lec, Inria, Paris-Saclay University;", "aff_domain": "polytechnique.edu;jellyfish.com;centralesupelec.fr;", "position": "PhD student;Senior Data Scientist R&D;Assistant Professor;", "bibtex": "@misc{\npanagopoulos2022learning,\ntitle={Learning Graph Representations for Influence Maximization},\nauthor={George Panagopoulos and Nikolaos Tziortziotis and Fragkiskos D. Malliaros and Michalis Vazirgiannis},\nyear={2022},\nurl={https://openreview.net/forum?id=UJ9_wmscwk}\n}", "github": "", "project": "", "reviewers": "et7w;caQs;F6tB;GuP1", "site": "https://openreview.net/forum?id=UJ9_wmscwk", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;3;2", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "75;78;106;53", "wc_summary_review": "14;41;45;29", "wc_main_review": "641;283;402;161", "wc_review": "730;402;553;243", "wc_reply_reviewers": "752;95;48;21", "wc_reply_authors": "1536;1171;835;315", "reply_reviewers": "3;1;1;1", "reply_authors": "3;2;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.0, 18.828170383762732 ], "wc_summary_review_avg": [ 32.25, 12.07010770457331 ], "wc_main_review_avg": [ 371.75, 177.27291812344038 ], "wc_review_avg": [ 482.0, 180.32332073251092 ], "wc_reply_reviewers_avg": [ 229.0, 303.11301522699415 ], "wc_reply_authors_avg": [ 964.25, 449.40926503578004 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.986440050415621, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14015849108086375443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Ecole Polytechnique;Jellyfish Inc;CentraleSup\u00e9lec", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polytechnique.edu;;https://www.centralesupelec.fr", "aff_unique_abbr": "X;;CentraleSup\u00e9lec", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris-Saclay", "aff_country_unique_index": "0;0", "aff_country_unique": "France;" }, { "title": "Maximum n-times Coverage for Vaccine Design", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6394", "id": "ULfq0qR25dY", "poster": "", "openreview": "https://openreview.net/forum?id=ULfq0qR25dY", "slides": "https://iclr.cc/virtual/2022/poster/6394", "video": "https://iclr.cc/virtual/2022/poster/6394", "author_site": "Ge Liu, Alexander Dimitrakakis, Brandon Carter, David Gifford", "tldr": "", "abstract": "We introduce the maximum $n$-times coverage problem 
that selects $k$ overlays to maximize the summed coverage of weighted elements, where each element must be covered at least $n$ times. We also define the min-cost $n$-times coverage problem where the objective is to select the minimum set of overlays such that the sum of the weights of elements that are covered at least $n$ times is at least $\\tau$. Maximum $n$-times coverage is a generalization of the multi-set multi-cover problem, is NP-complete, and is not submodular. We introduce two new practical solutions for $n$-times coverage based on integer linear programming and sequential greedy optimization. We show that maximum $n$-times coverage is a natural way to frame peptide vaccine design, and find that it produces a pan-strain COVID-19 vaccine design that is superior to 29 other published designs in predicted population coverage and the expected number of peptides displayed by each individual's HLA molecules.", "keywords": "computational biology;vaccine design;COVID-19;maximum n-times coverage;combinatorial optimization;integer linear programming", "primary_area": "", "supplementary_material": "/attachment/5e8db7a31cf8f7eef84a53079c32f0d8e8efb459.zip", "author": "Ge Liu;Alexander Dimitrakakis;Brandon Carter;David Gifford", "authorids": "~Ge_Liu2;~Alexander_Dimitrakakis1;~Brandon_Carter1;~David_Gifford1", "gender": "F;M;;M", "homepage": "http://www.mit.edu/~geliu/;https://www.linkedin.com/in/alexdimi/;;http://giffordlab.mit.edu", "dblp": ";;;g/DavidKGifford", "google_scholar": "P6EahzcAAAAJ;;MwzQlyIAAAAJ;", "orcid": "0000-0001-9383-5186;;;", "linkedin": ";;;", "or_profile": "~Ge_Liu2;~Alexander_Dimitrakakis1;~Brandon_Carter1;~David_Gifford1", "aff": "Amazon AWS AI;Massachusetts Institute of Technology;;Massachusetts Institute of Technology", "aff_domain": "amazon.com;mit.edu;;mit.edu", "position": "Researcher;Master's student;;Full Professor", "bibtex": "@inproceedings{\nliu2022maximum,\ntitle={Maximum n-times Coverage for Vaccine Design},\nauthor={Ge Liu and Alexander Dimitrakakis and Brandon Carter and David Gifford},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ULfq0qR25dY}\n}", "github": "", "project": "", "reviewers": "csaP;4qAy;Hy3s;Z6bA", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;4;3", "correctness": "3;3;2;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "117;145;99;115", "wc_summary_review": "33;71;26;17", "wc_main_review": "140;225;399;79", "wc_review": "290;441;524;211", "wc_reply_reviewers": "0;0;55;0", "wc_reply_authors": "323;503;1486;0", "reply_reviewers": "0;0;2;0", "reply_authors": "1;1;3;0", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 119.0, 16.55294535724685 ], "wc_summary_review_avg": [ 36.75, 20.571521577170707 ], "wc_main_review_avg": [ 210.75, 120.420876512339 ], "wc_review_avg": [ 366.5, 122.87086717363071 ], "wc_reply_reviewers_avg": [ 13.75, 23.81569860407206 ], "wc_reply_authors_avg": [ 578.0, 554.3460110797226 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 1.0897247358851685 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 11, 
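A toy greedy sketch of the sequential selection idea from the n-times coverage entry above (the paper also proposes an integer-linear-programming solution; in the vaccine framing, overlays would be candidate peptide sets, elements genotypes weighted by population frequency, and n the minimum number of displayed peptides, with all names here illustrative):

```python
import numpy as np

def greedy_n_times_coverage(cover, weights, k, n):
    """Sequential greedy heuristic for maximum n-times coverage (toy sketch).

    cover: (num_overlays, num_elements) boolean matrix; cover[j, e] says
    overlay j covers element e. weights: per-element weights. Objective:
    total weight of elements covered at least n times by the k chosen
    overlays. Since the objective is not submodular, greedy carries no
    (1 - 1/e) guarantee here; it is only a practical baseline.
    """
    counts = np.zeros(cover.shape[1], dtype=int)
    chosen = []
    for _ in range(k):
        base = weights[counts >= n].sum()
        gains = [-np.inf if j in chosen
                 else weights[(counts + cover[j]) >= n].sum() - base
                 for j in range(cover.shape[0])]
        best = int(np.argmax(gains))
        chosen.append(best)
        counts += cover[best]
    return chosen, weights[counts >= n].sum()
```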
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=17184876342921372695&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ULfq0qR25dY", "email": "amazon.com;mit.edu;;mit.edu", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;Massachusetts Institute of Technology", "aff_unique_dep": "Amazon Web Services AI;", "aff_unique_url": "https://aws.amazon.com;https://web.mit.edu", "aff_unique_abbr": "AWS;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "UMQ4PFd35i", "title": "Time Delay Estimation of Traffic Congestion Based on Statistical Causality", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Considering how congestion will propagate in the near future, understanding traffic congestion propagation has become crucial in GPS navigation systems for providing users with a more accurate estimated time of arrival (ETA). However, providing the exact ETA during congestion is a challenge owing to the complex propagation process between roads and high uncertainty regarding the future behavior of the process. To aid in accurate ETA calculation during congestion, we propose a novel time delay estimation method for the propagation of traffic congestion due to traffic accidents using lag-specific transfer entropy (TE). Nonlinear normalization with a sliding window is used to effectively reveal the causal relationship between the source and target time series in calculating the TE. Moreover, Markov bootstrap techniques were adopted to quantify the uncertainty in the time delay estimator. To the best of our knowledge, the proposed method is the first to estimate the time delay based on the causal relationship between adjacent roads. 
The proposed method was validated using simulated data as well as real user trajectory data obtained from a major GPS navigation system applied in South Korea.", "keywords": "time delay estimation;transfer entropy;traffic congestion", "primary_area": "", "supplementary_material": "", "author": "YongKyung Oh;Ji-In Kwak;JuYeong Lee;Sungil Kim", "authorids": "~YongKyung_Oh1;~Ji-In_Kwak1;~JuYeong_Lee1;~Sungil_Kim1", "gender": ";F;M;M", "homepage": ";https://github.com/Ji-In-Kwak;https://github.com/passionlee428;http://analytics.unist.ac.kr", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~YongKyung_Oh1;~Ji-In_Kwak1;~JuYeong_Lee1;~Sungil_Kim1", "aff": ";;;Ulsan National Institute of Science and Technology", "aff_domain": ";;;unist.ac.kr", "position": ";;;Associate Professor", "bibtex": "@misc{\noh2022time,\ntitle={Time Delay Estimation of Traffic Congestion Based on Statistical Causality},\nauthor={YongKyung Oh and Ji-In Kwak and JuYeong Lee and Sungil Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=UMQ4PFd35i}\n}", "github": "", "project": "", "reviewers": "vQ9m;Su9w;bcVg", "site": "https://openreview.net/forum?id=UMQ4PFd35i", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;3", "correctness": "4;3;3", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "44;52;13", "wc_summary_review": "30;23;23", "wc_main_review": "347;578;57", "wc_review": "421;653;93", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 36.333333333333336, 16.81930108205715 ], "wc_summary_review_avg": [ 25.333333333333332, 3.299831645537222 ], "wc_main_review_avg": [ 327.3333333333333, 213.1514849949574 ], "wc_review_avg": [ 389.0, 229.73608046335835 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yLi522CHaAgJ:scholar.google.com/&scioq=Time+Delay+Estimation+of+Traffic+Congestion+Based+on+Statistical+Causality&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Ulsan National Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.unist.ac.kr", "aff_unique_abbr": "UNIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "Provably Robust Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6160", "id": "UMfhoMtIaP5", "poster": "", "openreview": "https://openreview.net/forum?id=UMfhoMtIaP5", "slides": "https://iclr.cc/virtual/2022/poster/6160", "video": "https://iclr.cc/virtual/2022/poster/6160", "author_site": "Dimitar I. 
Dimitrov, Gagandeep Singh, Timon Gehr, Martin Vechev", "tldr": "", "abstract": "We introduce the concept of provably robust adversarial examples for deep neural networks \u2013 connected input regions constructed from standard adversarial examples which are guaranteed to be robust to a set of real-world perturbations (such as changes in pixel intensity and geometric transformations). We present a novel method called PARADE for generating these regions in a scalable manner which works by iteratively refining the region initially obtained via sampling until a refined region is certified to be adversarial with existing state-of-the-art verifiers. At each step, a novel optimization procedure is applied to maximize the region's volume under the constraint that the convex relaxation of the network behavior with respect to the region implies a chosen bound on the certification objective. Our experimental evaluation shows the effectiveness of PARADE: it successfully finds large provably robust regions including ones containing $\\approx 10^{573}$ adversarial examples for pixel intensity and $\\approx 10^{599}$ for geometric perturbations. The provability enables our robust examples to be significantly more effective against state-of-the-art defenses based on randomized smoothing than the individual attacks used to construct the regions.", "keywords": "Adversarial attacks;Robustness Certification;Abstract Interpretation;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/124d5d0906843e2d2ca88b3100ea79ddf6d90df2.zip", "author": "Dimitar Iliev Dimitrov;Gagandeep Singh;Timon Gehr;Martin Vechev", "authorids": "~Dimitar_Iliev_Dimitrov2;~Gagandeep_Singh1;~Timon_Gehr1;~Martin_Vechev1", "gender": "M;M;;M", "homepage": "https://www.sri.inf.ethz.ch/people/dimitadi;https://ggndpsngh.github.io/;https://www.sri.inf.ethz.ch/people/timon;https://www.sri.inf.ethz.ch/people/martin", "dblp": "271/0915;64/3747-1;165/2836;93/2189.html", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.ch/citations?user=m4b2ruEAAAAJ;HcL76tsAAAAJ;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": "0000-0001-9813-0900;0000-0002-9299-2961;;", "linkedin": ";gagandeep-singh-1bb01b49/;;", "or_profile": "~Dimitar_Iliev_Dimitrov2;~Gagandeep_Singh1;~Timon_Gehr1;~Martin_Vechev1", "aff": "Swiss Federal Institute of Technology;University of Illinois, Urbana Champaign;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;illinois.edu;ethz.ch;ethz.ch", "position": "PhD student;Assistant Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\ndimitrov2022provably,\ntitle={Provably Robust Adversarial Examples},\nauthor={Dimitar Iliev Dimitrov and Gagandeep Singh and Timon Gehr and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=UMfhoMtIaP5}\n}", "github": "", "project": "", "reviewers": "XdXU;N75e;A11V", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;5", "correctness": "3;3;4", "technical_novelty": "2;3;4", "empirical_novelty": "3;3;3", "wc_summary_paper": "25;88;163", "wc_summary_review": "2;55;150", "wc_main_review": "119;649;1046", "wc_review": "146;792;1359", "wc_reply_reviewers": "0;274;415", "wc_reply_authors": "483;902;2708", "reply_reviewers": "0;1;3", "reply_authors": "2;3;5", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 
3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 92.0, 56.40921910468182 ], "wc_summary_review_avg": [ 69.0, 61.22635598062869 ], "wc_main_review_avg": [ 604.6666666666666, 379.74231028715013 ], "wc_review_avg": [ 765.6666666666666, 495.5551320376865 ], "wc_reply_reviewers_avg": [ 229.66666666666666, 172.29883600561232 ], "wc_reply_authors_avg": [ 1364.3333333333333, 965.3912275457149 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2578760386117257823&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=UMfhoMtIaP5", "email": "ethz.ch;illinois.edu;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Swiss Federal Institute of Technology;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://illinois.edu", "aff_unique_abbr": "ETH Zurich;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Switzerland;United States" }, { "id": "UORhn0DGIT", "title": "Heterogeneous Wasserstein Discrepancy for Incomparable Distributions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimal Transport (OT) metrics allow for defining discrepancies between two probability measures. The Wasserstein distance has long been the celebrated OT distance frequently used in the literature; it requires the probability distributions to be supported on the $\text{\it same}$ metric space. Because of its high computational complexity, several approximate Wasserstein distances have been proposed based on entropy regularization or on slicing, and one-dimensional Wasserstein computation. In this paper, we propose a novel extension of the Wasserstein distance to compare two incomparable distributions, that hinges on the idea of $\text{\it distributional slicing}$, embeddings, and on computing the closed-form Wasserstein distance between the sliced distributions. We provide a theoretical analysis of this new divergence, called $\text{\it heterogeneous Wasserstein discrepancy (HWD)}$, and we show that it preserves several interesting properties including rotation-invariance. We show that the embeddings involved in HWD can be efficiently learned. Finally, we provide a large set of experiments illustrating the behavior of HWD as a divergence in the context of generative modeling and in a query framework.", "keywords": "Optimal transport;Wasserstein distance;Incomparable distributions;Generative models", "primary_area": "", "supplementary_material": "/attachment/e06259752592f6e2ac0a5c17b011589fb2d8a505.zip", "author": "Mokhtar Z. 
Alaya;Gilles Gasso;Maxime Berar;Alain Rakotomamonjy", "authorids": "~Mokhtar_Z._Alaya1;~Gilles_Gasso1;~Maxime_Berar1;~Alain_Rakotomamonjy1", "gender": "M;M;M;", "homepage": "http://mzalaya.github.io/;http://asi.insa-rouen.fr/enseignants/~gasso/;;", "dblp": "167/0364;;89/1783;", "google_scholar": "bmbmbusAAAAJ;https://scholar.google.fr/citations?user=wPTfsEQAAAAJ;J8-mCe4AAAAJ;", "orcid": "0000-0002-1103-6944;;;", "linkedin": "mzalaya/;;;", "or_profile": "~Mokhtar_Z._Alaya1;~Gilles_Gasso1;~Maxime_Berar1;~Alain_Rakotomamonjy1", "aff": "Universit\u00e9 de Technologie de Compi\u00e8gne;INSA Rouen Normandy;Universit\u00e9 de Rouen Normandie;", "aff_domain": "utc.fr;insa-rouen.fr;univ-rouen.fr;", "position": "Assistant Professor;Full Professor;Assistant Professor;", "bibtex": "@misc{\nalaya2022heterogeneous,\ntitle={Heterogeneous Wasserstein Discrepancy for Incomparable Distributions},\nauthor={Mokhtar Z. Alaya and Gilles Gasso and Maxime Berar and Alain Rakotomamonjy},\nyear={2022},\nurl={https://openreview.net/forum?id=UORhn0DGIT}\n}", "github": "", "project": "", "reviewers": "rgFL;xwxk;MjvX", "site": "https://openreview.net/forum?id=UORhn0DGIT", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "5;3;3", "correctness": "3;2;4", "technical_novelty": "1;2;3", "empirical_novelty": "2;0;0", "wc_summary_paper": "35;71;90", "wc_summary_review": "33;40;91", "wc_main_review": "189;443;373", "wc_review": "257;554;554", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.9428090415820634 ], "wc_summary_paper_avg": [ 65.33333333333333, 22.80838052607467 ], "wc_summary_review_avg": [ 54.666666666666664, 25.84999462712173 ], "wc_main_review_avg": [ 335.0, 107.11987055008359 ], "wc_review_avg": [ 455.0, 140.0071426749364 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11365442240737867799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e9 de Technologie de Compi\u00e8gne;INSA Rouen Normandy;Universit\u00e9 de Rouen Normandie", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utc.fr;https://www.insa-rouen.fr;https://www.univ-rouen.fr", "aff_unique_abbr": "UTC;INSA Rouen;UR Normandie", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "id": "UOj0MV__Cr", "title": "A Two-Stage Neural-Filter Pareto Front Extractor and the need for Benchmarking", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pareto solutions are optimal trade-offs between multiple competing objectives over the feasible set satisfying imposed constraints. Fixed-point iterative strategies do not always converge and might only return one solution point per run. Consequently, multiple runs of a scalarization problem are required to retrieve a Pareto front, where all instances converge. 
Recently proposed Multi-Task Learning (MTL) solvers claim to achieve Pareto solutions combining Linear Scalarization and domain decomposition. We demonstrate key shortcomings of MTL solvers that limit their usability for real-world applications. Issues include unjustified convexity assumptions on practical problems, incomplete and often wrong inferences on datasets that violate the Pareto definition, and lack of proper benchmarking and verification. We propose a two-stage Pareto framework, the Hybrid Neural Pareto Front (HNPF), that is accurate and handles non-convex functions and constraints. The Stage-1 neural network efficiently extracts the \textit{weak} Pareto front, using Fritz-John Conditions (FJC) as the discriminator, with no assumptions of convexity on the objectives or constraints. An FJC guided diffusive manifold is used to bound the error between the true and the Stage-1 extracted \textit{weak} Pareto front. The Stage-2, low-cost Pareto filter then extracts the strong Pareto subset from this weak front. Numerical experiments demonstrate the accuracy and efficiency of our approach.", "keywords": "Pareto Optimality;Neural nets;Pareto Filter;Interpretability;Benchmarking", "primary_area": "", "supplementary_material": "/attachment/b7a51bea0e8d40405a2790b70ac5bfb7eeebd00c.zip", "author": "Soumyajit Gupta;Gurpreet Singh;Matthew Lease", "authorids": "~Soumyajit_Gupta1;~Gurpreet_Singh3;~Matthew_Lease1", "gender": "M;M;", "homepage": ";;https://mattlease.com/", "dblp": ";;29/239.html", "google_scholar": ";ou3UDckAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-0056-2834", "linkedin": "soumyajit-gupta/;grpt-singh;mattlease", "or_profile": "~Soumyajit_Gupta1;~Gurpreet_Singh3;~Matthew_Lease1", "aff": "University of Texas, Austin;University of Texas at Austin;University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu", "position": "PhD student;Research Scientist;Full Professor", "bibtex": "@misc{\ngupta2022a,\ntitle={A Two-Stage Neural-Filter Pareto Front Extractor and the need for Benchmarking},\nauthor={Soumyajit Gupta and Gurpreet Singh and Matthew Lease},\nyear={2022},\nurl={https://openreview.net/forum?id=UOj0MV__Cr}\n}", "github": "", "project": "", "reviewers": "yaU2;gpGn;pPPS;GeVa", "site": "https://openreview.net/forum?id=UOj0MV__Cr", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;5;4;3", "correctness": "2;2;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;1;1;3", "wc_summary_paper": "55;80;105;34", "wc_summary_review": "8;23;106;49", "wc_main_review": "206;565;1232;305", "wc_review": "269;668;1443;388", "wc_reply_reviewers": "0;0;50;92", "wc_reply_authors": "445;855;899;720", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 68.5, 26.63174797117155 ], "wc_summary_review_avg": [ 46.5, 37.35304539123952 ], "wc_main_review_avg": [ 577.0, 400.2480480901812 ], "wc_review_avg": [ 692.0, 457.1438504453494 ], "wc_reply_reviewers_avg": [ 35.5, 38.48051454957431 ], "wc_reply_authors_avg": [ 729.75, 177.13748191729496 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 1.0, "gs_citation": 0, 
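The Stage-2 step described in the entry above reduces a weak front to the strong Pareto subset; a minimal non-dominated filter in that spirit follows (minimization convention assumed; written O(n^2) for clarity, and not the paper's Stage-1 neural FJC discriminator):

```python
import numpy as np

def strong_pareto_subset(points):
    """Low-cost Pareto filter: keep only the non-dominated points.

    A point is dominated if some other point is no worse in every
    objective and strictly better in at least one (minimization).
    """
    points = np.asarray(points, dtype=float)
    keep = np.ones(len(points), dtype=bool)
    for i, p in enumerate(points):
        if not keep[i]:
            continue
        # Drop every point that p dominates.
        dominated = np.all(points >= p, axis=1) & np.any(points > p, axis=1)
        keep &= ~dominated
    return points[keep]

pts = np.array([[0.1, 0.9], [0.5, 0.5], [0.9, 0.1], [0.6, 0.7]])
print(strong_pareto_subset(pts))  # drops [0.6, 0.7], dominated by [0.5, 0.5]
```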
"gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Yfkmr8cjeI0J:scholar.google.com/&scioq=A+Two-Stage+Neural-Filter+Pareto+Front+Extractor+and+the+need+for+Benchmarking&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "UPJ4Hvu6pu", "title": "Adaptive Early-Learning Correction for Segmentation from Noisy Annotations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep learning in the presence of noisy annotations has been studied extensively in classification, but much less in segmentation tasks. In this work, we study the learning dynamics of deep segmentation networks trained on inaccurately-annotated data. We discover a phenomenon that has been previously reported in the context of classification: the networks tend to first fit the clean pixel-level labels during an \"early-learning\" phase, before eventually memorizing the false annotations. However, in contrast to classification, memorization in segmentation does not arise simultaneously for all semantic categories. Inspired by these findings, we propose a new method for segmentation from noisy annotations with two key elements. First, we detect the beginning of the memorization phase separately for each category during training. This allows us to adaptively correct the noisy annotations in order to exploit early learning. Second, we incorporate a regularization term that enforces consistency across scales to boost robustness against annotation noise. Our method outperforms standard approaches on a medical-imaging segmentation task where noises are synthesized to mimic human annotation errors. 
It also provides robustness to realistic noisy annotations present in weakly-supervised semantic segmentation, achieving state-of-the-art results on PASCAL VOC 2012.", "keywords": "semantic segmentation;segmentation from noisy annotations;weakly supervised semantic segmentation", "primary_area": "", "supplementary_material": "/attachment/2db9d47ad3aac107da47d4db36b9270aaaf0a21d.zip", "author": "Sheng Liu;Kangning Liu;Weicheng Zhu;Yiqiu Shen;Carlos Fernandez-Granda", "authorids": "~Sheng_Liu2;~Kangning_Liu1;~Weicheng_Zhu1;~Yiqiu_Shen1;~Carlos_Fernandez-Granda1", "gender": ";M;M;M;", "homepage": "https://shengliu66.github.io/;https://kangning-liu.github.io/;;https://seyiqi.github.io/;https://cims.nyu.edu/~cfgranda/", "dblp": ";259/1458;180/5811;https://dblp.uni-trier.de/pers/hd/s/Shen:Yiqiu;77/11141", "google_scholar": "rzhzR-cAAAAJ;F3F2qAkAAAAJ;Glw83HYAAAAJ;XaeN2zgAAAAJ;GX-PtukAAAAJ", "orcid": ";;;0000-0002-7726-2514;", "linkedin": ";;;yiqiu-shen-a2317782/;", "or_profile": "~Sheng_Liu2;~Kangning_Liu1;~Weicheng_Zhu1;~Yiqiu_Shen1;~Carlos_Fernandez-Granda1", "aff": "New York University;Google;New York University;New York University;New York University", "aff_domain": "nyu.edu;google.com;nyu.edu;nyu.edu;nyu.edu", "position": "PhD student;Intern;PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nliu2022adaptive,\ntitle={Adaptive Early-Learning Correction for Segmentation from Noisy Annotations},\nauthor={Sheng Liu and Kangning Liu and Weicheng Zhu and Yiqiu Shen and Carlos Fernandez-Granda},\nyear={2022},\nurl={https://openreview.net/forum?id=UPJ4Hvu6pu}\n}", "github": "", "project": "", "reviewers": "FebZ;ene2;dgVD;zR7K", "site": "https://openreview.net/forum?id=UPJ4Hvu6pu", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;5;3;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "136;247;121;36", "wc_summary_review": "109;205;67;21", "wc_main_review": "600;678;359;70", "wc_review": "845;1130;547;127", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 135.0, 75.0699673637867 ], "wc_summary_review_avg": [ 100.5, 67.88777504087169 ], "wc_main_review_avg": [ 426.75, 237.17227388546073 ], "wc_review_avg": [ 662.25, 371.47097800501183 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 1.0, "gs_citation": 149, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=530625488460245511&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "New York University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.nyu.edu;https://www.google.com", "aff_unique_abbr": "NYU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "UPwD79EleQ", "title": "Cyclic Test Time Augmentation with Entropy Weight Method", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In the 
recent studies of data augmentation for neural networks, the application of test time augmentation has been studied to extract optimal transformation policies to enhance performance with minimum cost. The policy search method with the best level of input data dependency involves training a loss predictor network to estimate suitable transformations for each given input image in an independent manner, resulting in instance-level transformation extraction. In this work, we propose a method to utilize and modify the loss prediction pipeline to further improve the performance with the cyclic search for suitable transformations and the use of the entropy weight method. The cyclic usage of the loss predictor allows refining each input image with multiple transformations with a more flexible transformation magnitude. For cases where multiple augmentations are generated, we implement the entropy weight method to reflect the data uncertainty of each augmentation to force the final result to focus on augmentations with low uncertainty. The experimental results show convincing qualitative outcomes and robust performance under corrupted data conditions.", "keywords": "Data Augmentation;Test Time Augmentation;Uncertainty Estimation", "primary_area": "", "supplementary_material": "", "author": "Sewhan Chun;Jae Young Lee;Junmo Kim", "authorids": "~Sewhan_Chun1;mcneato@kaist.ac.kr;~Junmo_Kim1", "gender": "M;;M", "homepage": ";;https://siit.kaist.ac.kr/Faculty", "dblp": "308/7195;;40/240-2.html", "google_scholar": ";;https://scholar.google.com.tw/citations?user=GdQtWNQAAAAJ", "orcid": ";;", "linkedin": "jeff-chun-69a764221;;", "or_profile": "~Sewhan_Chun1;mcneato@kaist.ac.kr;~Junmo_Kim1", "aff": "Korea Advanced Institute of Science & Technology;;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;;kaist.ac.kr", "position": "MS student;;Associate Professor", "bibtex": "@misc{\nchun2022cyclic,\ntitle={Cyclic Test Time Augmentation with Entropy Weight Method},\nauthor={Sewhan Chun and Jae Young Lee and Junmo Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=UPwD79EleQ}\n}", "github": "", "project": "", "reviewers": "EKvF;GGc3;h9u6;Stct", "site": "https://openreview.net/forum?id=UPwD79EleQ", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;5;2;3", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "63;93;42;38", "wc_summary_review": "49;40;23;28", "wc_main_review": "164;494;386;206", "wc_review": "276;627;451;272", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 59.0, 21.805962487356524 ], "wc_summary_review_avg": [ 35.0, 10.173494974687902 ], "wc_main_review_avg": [ 312.5, 133.92068548211662 ], "wc_review_avg": [ 406.5, 146.3907442429336 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1725783248374401360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 
7, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "UQBEkRO0_-M", "title": "Softmax Gradient Tampering: Decoupling the Backward Pass for Improved Fitting", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Softmax Gradient Tampering, a technique for modifying the gradients in the backward pass of neural networks in order to enhance their accuracy. Our approach transforms the predicted probability values using a power-based probability transformation and then recomputes the gradients in the backward pass. This modification results in a smoother gradient profile, which we demonstrate empirically and theoretically. We do a grid search for the transform parameters on residual networks. We demonstrate that modifying the softmax gradients in ConvNets may result in increased training accuracy, thus increasing the fit across the training data and maximally utilizing\u00a0the learning capacity of neural networks. We get better test metrics and lower generalization gaps when combined with regularization techniques such as label smoothing. Softmax gradient tampering improves ResNet-50's test accuracy by $0.52\\%$ over the baseline on the ImageNet dataset. Our approach is very generic and may be used across\u00a0a wide range of different network architectures and datasets.", "keywords": "gradient tampering;smoothing;softmax;prediction;image classification;neural networks", "primary_area": "", "supplementary_material": "/attachment/ae399c97b9045b4dc166f5011ae7215bfe02ad57.zip", "author": "Bishshoy Das;Milton Mondal;Brejesh Lall;Shiv Dutt Joshi;Sumantra Dutta Roy", "authorids": "bishshoy.das@ee.iitd.ac.in;milton.mondal@ee.iitd.ac.in;~Brejesh_Lall1;sdjoshi@ee.iitd.ac.in;sumantra@ee.iitd.ac.in", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndas2022softmax,\ntitle={Softmax Gradient Tampering: Decoupling the Backward Pass for Improved Fitting},\nauthor={Bishshoy Das and Milton Mondal and Brejesh Lall and Shiv Dutt Joshi and Sumantra Dutta Roy},\nyear={2022},\nurl={https://openreview.net/forum?id=UQBEkRO0_-M}\n}", "github": "", "project": "", "reviewers": "x726;JYfd;9KKt;cjju", "site": "https://openreview.net/forum?id=UQBEkRO0_-M", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "3;4;4;4", "correctness": "2;3;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "45;37;32;75", "wc_summary_review": "5;5;111;35", "wc_main_review": "167;458;540;207", "wc_review": "217;500;683;317", "wc_reply_reviewers": "0;0;66;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 47.25, 16.67895380412093 ], "wc_summary_review_avg": [ 39.0, 43.3358973600409 ], "wc_main_review_avg": [ 343.0, 159.30003138731644 ], "wc_review_avg": [ 429.25, 178.21668692914253 ], "wc_reply_reviewers_avg": [ 16.5, 28.578838324886476 
], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6194593178756657546&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "UQQgMRq58O", "title": "Understanding Generalized Label Smoothing when Learning with Noisy Labels", "track": "main", "status": "Reject", "tldr": "", "abstract": "Label smoothing (LS) is an arising learning paradigm that uses the positively weighted average of both the hard training labels and uniformly distributed soft labels. It was shown that LS serves as a regularizer for training data with hard labels and therefore improves the generalization of the model. Later it was reported LS even helps with improving robustness when learning with noisy labels. However, we observe that the advantage of LS vanishes when we operate in a high label noise regime. Puzzled by the observation, we proceeded to discover that several proposed learning-with-noisy-labels solutions in the literature instead relate more closely to $\\textit{negative label smoothing}$ (NLS), which defines as using a negative weight to combine the hard and soft labels! We show that NLS differs substantially from LS in their achieved model confidence. To differentiate the two cases, we will call LS the positive label smoothing (PLS), and this paper unifies PLS and NLS into $\\textit{generalized label smoothing}$ (GLS). We provide understandings for the properties of GLS when learning with noisy labels. Among other established properties, we theoretically show NLS is considered more beneficial when the label noise rates are high. 
We also provide extensive experimental results on multiple benchmarks to support our findings.", "keywords": "Learning with noisy labels;label smoothing;model confidence", "primary_area": "", "supplementary_material": "/attachment/e5094664991b8a1add9c1d390965a54fbf7d07ce.zip", "author": "Jiaheng Wei;Hangyu Liu;Tongliang Liu;Gang Niu;Yang Liu", "authorids": "~Jiaheng_Wei1;~Hangyu_Liu1;~Tongliang_Liu1;~Gang_Niu1;~Yang_Liu3", "gender": "M;M;M;M;M", "homepage": "https://sites.google.com/ucsc.edu/jiahengwei;https://sites.google.com/brown.edu/cedric/home;https://tongliang-liu.github.io/;https://niug1984.github.io;http://www.yliuu.com", "dblp": "270/8936;;150/6667;26/3367-1;51/3710-18", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;jKrIVCIAAAAJ", "orcid": ";;;;0000-0001-8420-6011", "linkedin": "jiahengwei/;;;;", "or_profile": "~Jiaheng_Wei1;~Hangyu_Liu1;~Tongliang_Liu1;~Gang_Niu1;~Yang_Liu3", "aff": "Google;;University of Sydney;RIKEN;University of California, Santa Cruz", "aff_domain": "google.com;;sydney.edu.au;riken.jp;ucsc.edu", "position": "Intern;;Lecturer;Research Scientist (tenured);Assistant Professor", "bibtex": "@misc{\nwei2022understanding,\ntitle={Understanding Generalized Label Smoothing when Learning with Noisy Labels},\nauthor={Jiaheng Wei and Hangyu Liu and Tongliang Liu and Gang Niu and Yang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=UQQgMRq58O}\n}", "github": "", "project": "", "reviewers": "ENj2;Ww4B;2zXX;nStq", "site": "https://openreview.net/forum?id=UQQgMRq58O", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;5", "correctness": "2;3;4;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "1;2;4;1", "wc_summary_paper": "78;33;240;55", "wc_summary_review": "69;281;48;19", "wc_main_review": "379;281;204;352", "wc_review": "526;595;492;426", "wc_reply_reviewers": "259;0;0;0", "wc_reply_authors": "1680;992;429;571", "reply_reviewers": "1;0;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 101.5, 81.53066907612128 ], "wc_summary_review_avg": [ 104.25, 103.57937777376344 ], "wc_main_review_avg": [ 304.0, 67.9301111437336 ], "wc_review_avg": [ 509.75, 60.952337937112794 ], "wc_reply_reviewers_avg": [ 64.75, 112.1502897900848 ], "wc_reply_authors_avg": [ 918.0, 486.2226856081481 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6210908166320374825&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Google;University of Sydney;RIKEN;University of California, Santa Cruz", "aff_unique_dep": "Google;;;", "aff_unique_url": "https://www.google.com;https://www.sydney.edu.au;https://www.riken.jp;https://www.ucsc.edu", "aff_unique_abbr": "Google;USYD;RIKEN;UCSC", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Mountain View;;Santa Cruz", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "United States;Australia;Japan" }, { 
"id": "URNZQmbxpwh", "title": "Fishr: Invariant Gradient Variances for Out-of-distribution Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning robust models that generalize well under changes in the data distribution is critical for real-world applications. To this end, there has been a growing surge of interest to learn simultaneously from multiple training domains - while enforcing different types of invariance across those domains. Yet, all existing approaches fail to show systematic benefits under controlled evaluation protocols. In this paper, we introduce a new regularization - named Fishr - that enforces domain invariance in the space of the gradients of the loss: specifically, the domain-level variances of gradients are matched across training domains. Our approach is based on the close relations between the gradient covariance, the Fisher Information and the Hessian of the loss: in particular, we show that Fishr eventually aligns the domain-level loss landscapes locally around the final weights. Extensive experiments demonstrate the effectiveness of Fishr for out-of-distribution generalization. Notably, Fishr improves the state of the art on the DomainBed benchmark and performs consistently better than Empirical Risk Minimization. Our code is available anonymously at https://anonymous.4open.science/r/fishr-anonymous-EBB6/.", "keywords": "Deep Learning;Computer Vision;Domain Generalization", "primary_area": "", "supplementary_material": "/attachment/d60a5384ba37b9a17f0063d1d7d15833f502ffd6.zip", "author": "Alexandre Rame;Corentin Dancette;Matthieu Cord", "authorids": "~Alexandre_Rame1;~Corentin_Dancette1;~Matthieu_Cord1", "gender": "M;M;M", "homepage": "https://alexrame.github.io/;https://cdancette.fr;https://cord.isir.upmc.fr/", "dblp": ";;68/3117", "google_scholar": "7znwivwAAAAJ;https://scholar.google.fr/citations?user=2zReQdQAAAAJ;SpAotDcAAAAJ", "orcid": ";;", "linkedin": "alexandre-ram%C3%A9-05259587;;", "or_profile": "~Alexandre_Rame1;~Corentin_Dancette1;~Matthieu_Cord1", "aff": "Universit\u00e9 Pierre et Marie Curie - Paris 6, Sorbonne Universit\u00e9 - Facult\u00e9 des Sciences (Paris VI);Sorbonne Universite;Sorbonne Universit\u00e9", "aff_domain": "isir.upmc.fr;sorbonne-universite.fr;isir.upmc.fr", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nrame2022fishr,\ntitle={Fishr: Invariant Gradient Variances for Out-of-distribution Generalization },\nauthor={Alexandre Rame and Corentin Dancette and Matthieu Cord},\nyear={2022},\nurl={https://openreview.net/forum?id=URNZQmbxpwh}\n}", "github": "", "project": "", "reviewers": "ZMd5;iWJv;KDTu;Mhdk", "site": "https://openreview.net/forum?id=URNZQmbxpwh", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "5;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;4;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "41;31;197;68", "wc_summary_review": "41;54;187;87", "wc_main_review": "156;128;527;534", "wc_review": "238;213;911;689", "wc_reply_reviewers": "0;0;0;147", "wc_reply_authors": "980;603;1537;690", "reply_reviewers": "0;0;0;2", "reply_authors": "2;1;3;2", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.25, 66.48825084178407 ], "wc_summary_review_avg": [ 92.25, 57.21614719639903 ], "wc_main_review_avg": [ 336.25, 
194.51783337267563 ], "wc_review_avg": [ 512.75, 297.9113752443837 ], "wc_reply_reviewers_avg": [ 36.75, 63.65286717815624 ], "wc_reply_authors_avg": [ 952.5, 365.1893344554301 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8164965809277261, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 267, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12989883752146186165&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e9 Pierre et Marie Curie - Paris 6;Sorbonne University;Sorbonne Universit\u00e9", "aff_unique_dep": "Facult\u00e9 des Sciences;;", "aff_unique_url": "https://www.upmc.fr;https://www.sorbonne-universite.fr;https://www.sorbonne-universite.fr", "aff_unique_abbr": "UPMC;Sorbonne;Sorbonne U", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "EntQA: Entity Linking as Question Answering", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6200", "id": "US2rTP5nm_", "poster": "", "openreview": "https://openreview.net/forum?id=US2rTP5nm_", "slides": "https://iclr.cc/virtual/2022/poster/6200", "video": "https://iclr.cc/virtual/2022/poster/6200", "author_site": "Wenzheng Zhang, Wenyue Hua, Karl Stratos", "tldr": "", "abstract": "A conventional approach to entity linking is to first find mentions in a given document and then infer their underlying entities in the knowledge base. A well-known limitation of this approach is that it requires finding mentions without knowing their entities, which is unnatural and difficult. We present a new model, called $\textbf{EntQA}$, which stands for $\mbox{\textbf{Ent}ity}$ linking as $\mbox{\textbf{Q}uestion}$ $\mbox{\textbf{A}nswering}$, that does not suffer from this limitation. EntQA first proposes candidate entities with a fast retrieval module, and then scrutinizes the document to find mentions of each candidate with a powerful reader module. Our approach combines progress in entity linking with that in open-domain question answering and capitalizes on pretrained models for dense entity retrieval and reading comprehension. Unlike in previous works, we do not rely on a mention-candidates dictionary or large-scale weak supervision. 
EntQA achieves strong results on the GERBIL benchmarking platform.\n", "keywords": "Entity linking;open-domain question answering;dense retrieval;reading comprehension;information extraction;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Wenzheng Zhang;Wenyue Hua;Karl Stratos", "authorids": "wenzheng.zhang@rutgers.edu;wenyue.hua@rutgers.edu;~Karl_Stratos2", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "wenzheng.zhang@rutgers.edu;wenyue.hua@rutgers.edu;~Karl_Stratos2", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhang2022entqa,\ntitle={Ent{QA}: Entity Linking as Question Answering},\nauthor={Wenzheng Zhang and Wenyue Hua and Karl Stratos},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=US2rTP5nm_}\n}", "github": "", "project": "", "reviewers": "Uses;SZGo;97R7", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;3", "correctness": "3;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "82;61;93", "wc_summary_review": "13;35;32", "wc_main_review": "243;320;239", "wc_review": "338;416;364", "wc_reply_reviewers": "9;104;0", "wc_reply_authors": "377;204;157", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.66666666666667, 13.27487183449325 ], "wc_summary_review_avg": [ 26.666666666666668, 9.741092797468305 ], "wc_main_review_avg": [ 267.3333333333333, 37.27674282385138 ], "wc_review_avg": [ 372.6666666666667, 32.42769735204082 ], "wc_reply_reviewers_avg": [ 37.666666666666664, 47.04843839656695 ], "wc_reply_authors_avg": [ 246.0, 94.59739249401468 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8005658916202648918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=US2rTP5nm_", "email": ";;", "author_num": 3 }, { "title": "Information Gain Propagation: a New Way to Graph Active Learning with Soft Labels", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6074", "id": "USC0-nvGPK", "poster": "", "openreview": "https://openreview.net/forum?id=USC0-nvGPK", "slides": "https://iclr.cc/virtual/2022/poster/6074", "video": "https://iclr.cc/virtual/2022/poster/6074", "author_site": "Wentao Zhang, Yexin Wang, Zhenbang You, Meng Cao, Ping Huang, Jiulong Shan, Zhi Yang, Bin CUI", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have achieved great success in various tasks, but their performance highly relies on a large number of labeled nodes, which typically requires considerable human effort. GNN-based Active Learning (AL) methods are proposed to improve the labeling efficiency by selecting the most valuable nodes to label. Existing methods assume an oracle can correctly categorize all the selected nodes and thus just focus on the node selection. 
However, such an exact labeling task is costly, especially when the categorization is out of the domain of the individual expert (oracle). The paper goes further, presenting a soft-label approach to AL on GNNs. Our key innovations are: i) relaxed queries where a domain expert (oracle) only judges the correctness of the predicted labels (a binary question) rather than identifying the exact class (a multi-class question), and ii) a new criterion of maximizing information gain propagation for the active learner with relaxed queries and soft labels. Empirical studies on public datasets demonstrate that our method significantly outperforms the state-of-the-art GNN-based AL methods in terms of both accuracy and labeling cost. ", "keywords": "Active Learning;Graph;Information Gain", "primary_area": "", "supplementary_material": "/attachment/bc5526647bcea7f5991ea3c0561946d6f68b8129.zip", "author": "Wentao Zhang;Yexin Wang;Zhenbang You;Meng Cao;Ping Huang;Jiulong Shan;Zhi Yang;Bin CUI", "authorids": "~Wentao_Zhang1;~Yexin_Wang2;~Zhenbang_You1;~Meng_Cao2;~Ping_Huang1;~Jiulong_Shan2;~Zhi_Yang4;~Bin_CUI2", "gender": ";M;;M;M;;M;M", "homepage": ";https://wyxpkueecs.github.io/;;https://www.linkedin.com/in/caomeng/;;;https://yangzhihome.github.io/;https://cuibinpku.github.io/index.html", "dblp": ";;;;;;90/5587-1;55/5031.html", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;;;;IJAU8KoAAAAJ", "orcid": ";;;;;;;0000-0003-1681-4677", "linkedin": ";;;caomeng/;ping-huang-82845138/;;;", "or_profile": "~Wentao_Zhang1;~Yexin_Wang2;~Zhenbang_You1;~Meng_Cao2;~Ping_Huang1;~Jiulong_Shan2;~Zhi_Yang4;~Bin_CUI2", "aff": ";;Peking University;Apple;Apple;;Peking University;Peking University", "aff_domain": ";;pku.edu.cn;apple.com;apple.com;;pku.edu.cn;pku.edu.cn", "position": ";;Undergrad student;Researcher;Researcher;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2022information,\ntitle={Information Gain Propagation: a New Way to Graph Active Learning with Soft Labels},\nauthor={Wentao Zhang and Yexin Wang and Zhenbang You and Meng Cao and Ping Huang and Jiulong Shan and Zhi Yang and Bin CUI},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=USC0-nvGPK}\n}", "github": "", "project": "", "reviewers": "tKPB;owkR;5Qmn;QqqC", "pdf_size": 0, "recommendation": "1;5;6;8", "confidence": "4;4;4;5", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "76;52;153;43", "wc_summary_review": "338;61;53;70", "wc_main_review": "325;177;263;139", "wc_review": "739;290;469;252", "wc_reply_reviewers": "159;70;10;0", "wc_reply_authors": "1677;914;667;271", "reply_reviewers": "2;2;1;0", "reply_authors": "4;3;1;1", "recommendation_avg": [ 5.0, 2.5495097567963922 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 81.0, 43.2839462156583 ], "wc_summary_review_avg": [ 130.5, 119.95103167542996 ], "wc_main_review_avg": [ 226.0, 72.69800547470336 ], "wc_review_avg": [ 437.5, 192.39347702040212 ], "wc_reply_reviewers_avg": [ 59.75, 63.24703550365029 ], "wc_reply_authors_avg": [ 882.25, 512.9802018596819 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.6793662204867574, 
"corr_recommendation_correctness": 0.6793662204867574, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4290124558616540696&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=USC0-nvGPK", "email": ";;pku.edu.cn;apple.com;apple.com;;pku.edu.cn;pku.edu.cn", "author_num": 8, "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "Peking University;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "http://www.pku.edu.cn;https://www.apple.com", "aff_unique_abbr": "Peking U;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Graph-based Nearest Neighbor Search in Hyperbolic Spaces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6589", "id": "USIgIY6TNDe", "poster": "", "openreview": "https://openreview.net/forum?id=USIgIY6TNDe", "slides": "https://iclr.cc/virtual/2022/poster/6589", "video": "https://iclr.cc/virtual/2022/poster/6589", "author_site": "Liudmila Prokhorenkova, Dmitry Baranchuk, Nikolay Bogachev, Yury Demidovich, Alexander Kolpakov", "tldr": "", "abstract": "The nearest neighbor search (NNS) problem is widely studied in Euclidean space, and graph-based algorithms are known to outperform other approaches for this task. However, hyperbolic geometry often allows for better data representation in various domains, including graphs, words, and images. In this paper, we show that graph-based approaches are also well suited for hyperbolic geometry. From a theoretical perspective, we rigorously analyze the time and space complexity of graph-based NNS, assuming that an $n$-element dataset is uniformly distributed within a $d$-dimensional ball of radius $R$ in the hyperbolic space of curvature $-1$. Under some conditions on $R$ and $d$, we derive the time and space complexity of graph-based NNS and compare the obtained results with known guarantees for the Euclidean case. Interestingly, in the dense setting ($d \\ll \\log n$) and under some assumptions on the radius $R$, graph-based NNS has lower time complexity in the hyperbolic space. This agrees with our experiments: we consider datasets embedded in hyperbolic and Euclidean spaces and show that graph-based NNS can be more efficient in the hyperbolic space. We also demonstrate that graph-based methods outperform other existing baselines for hyperbolic NNS. 
Overall, our theoretical and empirical analysis suggests that graph-based NNS can be considered a default approach for similarity search in hyperbolic spaces.", "keywords": "similarity search;nearest neighbor search;hyperbolic space;graph-based nearest neighbor search", "primary_area": "", "supplementary_material": "/attachment/db70a3c0809ee20ee70c1c18c5c93f32406c2ff9.zip", "author": "Liudmila Prokhorenkova;Dmitry Baranchuk;Nikolay Bogachev;Yury Demidovich;Alexander Kolpakov", "authorids": "~Liudmila_Prokhorenkova1;~Dmitry_Baranchuk2;~Nikolay_Bogachev1;~Yury_Demidovich1;~Alexander_Kolpakov1", "gender": "F;M;M;M;", "homepage": ";;https://nvbogachev.netlify.app/;;https://sashakolpakov.wordpress.com", "dblp": "45/11468;215/3712;;326/7284;", "google_scholar": "https://scholar.google.ru/citations?user=6JyZlSEAAAAJ;NiPmk8oAAAAJ;;https://scholar.google.com/citations?hl=ru;", "orcid": ";0000-0001-7660-3666;;;", "linkedin": ";;;yuradem/;", "or_profile": "~Liudmila_Prokhorenkova1;~Dmitry_Baranchuk2;~Nikolay_Bogachev1;~Yury_Demidovich1;~Alexander_Kolpakov1", "aff": "Moscow Institute of Physics and Technology;Meta;Moscow Institute of Physics and Technology;Huawei Technologies Ltd.;University of Neuchatel", "aff_domain": "mipt.edu;meta.com;phystech.edu;huawei.com;unine.ch", "position": "Researcher;Intern;Associate Professor;Researcher;Professor", "bibtex": "@inproceedings{\nprokhorenkova2022graphbased,\ntitle={Graph-based Nearest Neighbor Search in Hyperbolic Spaces},\nauthor={Liudmila Prokhorenkova and Dmitry Baranchuk and Nikolay Bogachev and Yury Demidovich and Alexander Kolpakov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=USIgIY6TNDe}\n}", "github": "", "project": "", "reviewers": "1ri4;VNYQ;6ftD;zLTN;XgPX", "pdf_size": 0, "recommendation": "5;6;6;6;6", "confidence": "4;4;3;4;3", "correctness": "4;4;3;3;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;3;0;1;3", "wc_summary_paper": "174;52;71;37;80", "wc_summary_review": "314;9;16;27;22", "wc_main_review": "993;88;272;450;241", "wc_review": "1481;149;359;514;343", "wc_reply_reviewers": "74;0;0;137;0", "wc_reply_authors": "1573;102;599;1307;558", "reply_reviewers": "1;0;0;3;0", "reply_authors": "4;3;2;4;2", "recommendation_avg": [ 5.8, 0.39999999999999997 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 1.8, 1.1661903789690602 ], "wc_summary_paper_avg": [ 82.8, 47.980829505126316 ], "wc_summary_review_avg": [ 77.6, 118.35303122438394 ], "wc_main_review_avg": [ 408.8, 313.93336872655004 ], "wc_review_avg": [ 569.2, 470.39234687651964 ], "wc_reply_reviewers_avg": [ 42.2, 55.39097399396403 ], "wc_reply_authors_avg": [ 827.8, 536.0751439863632 ], "reply_reviewers_avg": [ 0.8, 1.1661903789690602 ], "reply_authors_avg": [ 3.0, 0.8944271909999159 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": -0.408248290463863, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9059847835311944540&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=USIgIY6TNDe", "email": "mipt.edu;meta.com;phystech.edu;huawei.com;unine.ch", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Moscow Institute of Physics and Technology;Meta;Huawei;University of Neuchatel", "aff_unique_dep": 
";Meta Platforms, Inc.;Huawei Technologies;", "aff_unique_url": "https://www.mipt.ru/en;https://meta.com;https://www.huawei.com;https://www.unine.ch", "aff_unique_abbr": "MIPT;Meta;Huawei;UNINE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;3", "aff_country_unique": "Russian Federation;United States;China;Switzerland" }, { "id": "UTTrevGchy", "title": "Learning Diverse Options via InfoMax Termination Critic", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the problem of autonomously learning reusable temporally extended actions, or options, in reinforcement learning. While options can speed up transfer learning by serving as reusable building blocks, learning reusable options for unknown task distribution remains challenging. Motivated by the recent success of mutual information (MI) based skill learning, we hypothesize that more diverse options are more reusable. To this end, we propose a method for learning termination conditions of options by maximizing MI between options and corresponding state transitions. We derive a scalable approximation of this MI maximization via gradient ascent, yielding the InfoMax Termination Critic (IMTC) algorithm. Our experiments demonstrate that IMTC significantly improves the diversity of learned options without rewards, combined with an intrinsic option learning method. Moreover, we test the reusability of learned options by transferring options into various tasks, confirming that IMTC helps quick adaptation, especially in complex domains where an agent needs to manipulate objects.", "keywords": "reinforcement learning;hierachical reinforcement learning;options;life long reinforcement learning;skill transfer", "primary_area": "", "supplementary_material": "", "author": "Yuji Kanagawa;Tomoyuki Kaneko", "authorids": "~Yuji_Kanagawa1;~Tomoyuki_Kaneko1", "gender": "M;M", "homepage": "https://kngwyu.github.io/;", "dblp": ";16/5774", "google_scholar": ";", "orcid": ";0000-0001-8051-2388", "linkedin": ";", "or_profile": "~Yuji_Kanagawa1;~Tomoyuki_Kaneko1", "aff": "Okinawa Institute of Science and Technology (OIST);The University of Tokyo", "aff_domain": "oist.jp;u-tokyo.ac.jp", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nkanagawa2022learning,\ntitle={Learning Diverse Options via InfoMax Termination Critic},\nauthor={Yuji Kanagawa and Tomoyuki Kaneko},\nyear={2022},\nurl={https://openreview.net/forum?id=UTTrevGchy}\n}", "github": "", "project": "", "reviewers": "XKs3;SWkr;YG8c;rEv8", "site": "https://openreview.net/forum?id=UTTrevGchy", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;4", "correctness": "4;3;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "49;91;71;80", "wc_summary_review": "68;95;49;32", "wc_main_review": "199;380;351;383", "wc_review": "316;566;471;495", "wc_reply_reviewers": "0;0;17;90", "wc_reply_authors": "622;433;499;787", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 72.75, 15.433324334050653 ], "wc_summary_review_avg": [ 61.0, 23.39871791359518 ], "wc_main_review_avg": [ 328.25, 75.66166466580022 ], "wc_review_avg": [ 462.0, 91.2441778964554 ], "wc_reply_reviewers_avg": [ 26.75, 37.17105728924051 ], "wc_reply_authors_avg": [ 
585.25, 134.78941909512037 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.7071067811865475, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15144722458293940254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Okinawa Institute of Science and Technology;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.oist.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "OIST;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "UTdxT0g6ZuC", "title": "Automatic Forecasting via Meta-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we develop techniques for fast automatic selection of the best forecasting model for a new unseen time-series dataset, without having to first train (or evaluate) all the models on the new time-series data to select the best one. In particular, we develop a forecasting meta-learning approach called AutoForecast that allows for the quick inference of the best time-series forecasting model for an unseen dataset. Our approach learns both the performance of forecasting models over the time horizon of the same dataset and task similarity across different datasets. The experiments demonstrate the effectiveness of the approach over state-of-the-art (SOTA) single and ensemble methods and several SOTA meta-learners (adapted to our problem) in terms of selecting better forecasting models (i.e., 2X gain) for unseen tasks on univariate and multivariate testbeds. 
\n", "keywords": "Forecasting Model Selection;Time-series Forecasting;Meta-features", "primary_area": "", "supplementary_material": "", "author": "Mustafa Abdallah;Ryan Rossi;Kanak Mahadik;Sungchul Kim;Handong Zhao;Haoliang Wang;Saurabh Bagchi", "authorids": "~Mustafa_Abdallah1;~Ryan_Rossi1;mahadik@adobe.com;~Sungchul_Kim1;~Handong_Zhao3;~Haoliang_Wang1;~Saurabh_Bagchi1", "gender": ";M;;M;;;M", "homepage": "https://web.ics.purdue.edu/~abdalla0/;http://ryanrossi.com;;https://sites.google.com/site/subright;;;https://saurabhbagchi.us", "dblp": ";17/5085;;61/1573;;;57/95.html", "google_scholar": "NK4Ok1IAAAAJ;_Dc6lbQAAAAJ;;v8ISLgIAAAAJ;;;https://scholar.google.com.tw/citations?user=3EfsOvYAAAAJ", "orcid": ";0000-0001-9758-0635;;0000-0003-3580-5290;;;", "linkedin": "mustafa-abdallah-12315aa6/;;;;;;", "or_profile": "~Mustafa_Abdallah1;~Ryan_Rossi1;mahadik@adobe.com;~Sungchul_Kim1;~Handong_Zhao3;~Haoliang_Wang1;~Saurabh_Bagchi1", "aff": "Purdue University;Adobe Research;;Adobe Systems;;;Purdue University", "aff_domain": "purdue.edu;adobe.com;;adobe.com;;;purdue.edu", "position": "PhD student;Senior Research Scientist;;Researcher;;;Full Professor", "bibtex": "@misc{\nabdallah2022automatic,\ntitle={Automatic Forecasting via Meta-Learning},\nauthor={Mustafa Abdallah and Ryan Rossi and Kanak Mahadik and Sungchul Kim and Handong Zhao and Haoliang Wang and Saurabh Bagchi},\nyear={2022},\nurl={https://openreview.net/forum?id=UTdxT0g6ZuC}\n}", "github": "", "project": "", "reviewers": "sRFp;cQ6U;M5ZB;ZkNj", "site": "https://openreview.net/forum?id=UTdxT0g6ZuC", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;3", "correctness": "3;2;2;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "104;118;79;49", "wc_summary_review": "74;40;51;28", "wc_main_review": "1011;176;472;239", "wc_review": "1189;334;602;316", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;704;541", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 87.5, 26.253571185650152 ], "wc_summary_review_avg": [ 48.25, 16.946607330082326 ], "wc_main_review_avg": [ 474.5, 328.78602464216755 ], "wc_review_avg": [ 610.25, 352.8160816912971 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 311.25, 316.54018307317637 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.19245008972987526, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Purdue University;Adobe", "aff_unique_dep": ";Adobe Research", "aff_unique_url": "https://www.purdue.edu;https://research.adobe.com", "aff_unique_abbr": "Purdue;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "UVtVRcurOYv", "title": "KGRefiner: Knowledge Graph Refinement for Improving Accuracy of Translational Link Prediction Methods", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The Link Prediction is the task of predicting missing relations between entities of the knowledge graph. 
Recent work in link prediction has attempted to provide a model for increasing link prediction accuracy by using more layers in the neural network architecture. In this paper, we propose a novel method of refining the knowledge graph so that the link prediction operation can be performed more accurately using relatively fast translational models. Translational link prediction models, such as TransE, TransH, and TransD, have lower complexity than deep learning approaches. Our method uses the hierarchy of relationships and entities in the knowledge graph to add the entity information as auxiliary nodes to the graph and connect them to the nodes which contain this information in their hierarchy. Our experiments show that our method can significantly increase the performance of translational link prediction methods in H@10, MR, and MRR.", "keywords": "Knowledge Graph Embedding;Link Prediction;Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Mohammad Javad Saeedizade;Mohammad Javad Saeedizade;Najmeh Torabian", "authorids": "~Mohammad_Javad_Saeedizade1;saeedizade74@gmail.com;najmeh.torabian@gmail.com", "gender": "M;;", "homepage": "https://saeedizade.github.io/;;", "dblp": ";;", "google_scholar": "E-T5yjkAAAAJ;;", "orcid": ";;", "linkedin": "saeedizade/;;", "or_profile": "~Mohammad_Javad_Saeedizade1;saeedizade74@gmail.com;najmeh.torabian@gmail.com", "aff": "Link\u00f6ping University;;", "aff_domain": "liu.se;;", "position": "PhD student;;", "bibtex": "@misc{\nsaeedizade2022kgrefiner,\ntitle={{KGR}efiner: Knowledge Graph Refinement for Improving Accuracy of Translational Link Prediction Methods},\nauthor={Mohammad Javad Saeedizade and Mohammad Javad Saeedizade and Najmeh Torabian},\nyear={2022},\nurl={https://openreview.net/forum?id=UVtVRcurOYv}\n}", "github": "", "project": "", "reviewers": "wWdF;a5cr;LdBo;pAx4;ttSw", "site": "https://openreview.net/forum?id=UVtVRcurOYv", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "4;5;4;4;3", "correctness": "2;2;2;3;2", "technical_novelty": "2;2;1;2;2", "empirical_novelty": "2;2;1;3;2", "wc_summary_paper": "19;48;49;108;84", "wc_summary_review": "38;60;2;27;23", "wc_main_review": "241;237;149;295;390", "wc_review": "298;345;200;430;497", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "190;136;130;190;207", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.2, 0.39999999999999997 ], "technical_novelty_avg": [ 1.8, 0.4 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 61.6, 31.026440337234952 ], "wc_summary_review_avg": [ 30.0, 19.00526242912736 ], "wc_main_review_avg": [ 262.4, 79.12926133864767 ], "wc_review_avg": [ 354.0, 103.10965037279489 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 170.6, 31.3789738519283 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7905694150420948, "corr_recommendation_correctness": -0.25, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11033625067501476493&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Link\u00f6ping University", "aff_unique_dep": "", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "id": "UXrVIKDbsb_", "title": "Unleash the 
Potential of Adaptation Models via Dynamic Domain Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we propose an embarrassingly simple yet highly effective adversarial domain adaptation (ADA) method for effectively training models for alignment. We view the ADA problem primarily from a neural network memorization perspective and point out a fundamental dilemma, in that the real-world data often exhibits an imbalanced distribution where the majority data clusters typically dominate and bias the adaptation process. Unlike prior works that either attempt loss re-weighting or data re-sampling for alleviating this defect, we introduce a new concept of dynamic domain labels (DDLs) to replace the original immutable domain labels on the fly. DDLs adaptively and promptly transfer the model's attention from over-memorized aligned data to those easily overlooked samples, which allows each sample to be well studied and fully unleashes the potential of the adaptation model. Albeit simple, this dynamic adversarial domain adaptation (DADA) framework with DDLs effectively promotes adaptation. We demonstrate through empirical results on real and synthetic data as well as toy games that our method leads to efficient training without bells and whistles, while being robust to different backbones.", "keywords": "Adversarial Domain Adaptation;Dynamic Domain Labels", "primary_area": "", "supplementary_material": "/attachment/0e91f3b321d396aeed04f74ab307198079ac8e72.zip", "author": "Xin Jin;Tianyu He;Xu Shen;Songhua Wu;Tongliang Liu;Xinchao Wang;Jianqiang Huang;Zhibo Chen;Xian-Sheng Hua", "authorids": "~Xin_Jin8;~Tianyu_He1;~Xu_Shen1;~Songhua_Wu1;~Tongliang_Liu1;~Xinchao_Wang1;~Jianqiang_Huang2;~Zhibo_Chen1;~Xian-Sheng_Hua1", "gender": "M;M;M;M;M;M;M;M;M", "homepage": "http://home.ustc.edu.cn/~jinxustc/;https://www.microsoft.com/en-us/research/people/tianyuhe/;;https://scifancier.github.io/;https://tongliang-liu.github.io/;https://scholar.google.com.hk/citations?user=UqAybqgAAAAJ&hl=en;https://faculty.ustc.edu.cn/chenzhibo;;https://sites.google.com/site/sitexinchaowang/", "dblp": "68/3340-14;198/4010;09/10130-1.html;;150/6667;;54/6561.html;56/5807-1;", "google_scholar": "byaSC-kAAAAJ;P08KU1YAAAAJ;38jwGs8AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.com.hk/citations?user=UqAybqgAAAAJ;1ayDJfsAAAAJ;https://scholar.google.co.uk/citations?user=6G-l4o0AAAAJ;https://scholar.google.com.tw/citations?user=w69Buq0AAAAJ", "orcid": "0000-0002-1820-8358;0000-0002-4828-3228;;;;0000-0001-5735-2910;;;", "linkedin": ";;;;;;;xshua;", "or_profile": "~Xin_Jin8;~Tianyu_He1;~Xu_Shen1;~Songhua_Wu1;~Tongliang_Liu1;~Jianqiang_Huang2;~Zhibo_Chen1;~Xian-Sheng_Hua1;~Xinchao_WANG3", "aff": "University of Science and Technology of China;Alibaba Group;Alibaba Group;University of Sydney;University of Sydney;Alibaba Group;University of Science and Technology of China;Alibaba Group;National University of Singapore", "aff_domain": "ustc.edu.cn;alibaba-inc.com;alibaba-inc.com;sydney.edu.au;sydney.edu.au;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com;nus.edu", "position": "PhD student;Researcher;Researcher;PhD student;Lecturer;Researcher;Full Professor;Distinguished Engineer;Assistant Professor", "bibtex": "@misc{\njin2022unleash,\ntitle={Unleash the Potential of Adaptation Models via Dynamic Domain Labels},\nauthor={Xin Jin and Tianyu He and Xu Shen and Songhua Wu and Tongliang Liu and Xinchao Wang and Jianqiang Huang and Zhibo Chen and 
Xian-Sheng Hua},\nyear={2022},\nurl={https://openreview.net/forum?id=UXrVIKDbsb_}\n}", "github": "", "project": "", "reviewers": "w7Qx;nnj1;yjdx", "site": "https://openreview.net/forum?id=UXrVIKDbsb_", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;5;3", "correctness": "1;3;3", "technical_novelty": "1;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "42;50;79", "wc_summary_review": "39;35;36", "wc_main_review": "217;284;327", "wc_review": "298;369;442", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 57.0, 15.895492023421818 ], "wc_summary_review_avg": [ 36.666666666666664, 1.699673171197595 ], "wc_main_review_avg": [ 276.0, 45.2621990922521 ], "wc_review_avg": [ 369.6666666666667, 58.78964383479647 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18070500685492083880&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;2;1;0;1;3", "aff_unique_norm": "University of Science and Technology of China;Alibaba Group;University of Sydney;National University of Singapore", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.alibaba.com;https://www.sydney.edu.au;https://www.nus.edu.sg", "aff_unique_abbr": "USTC;Alibaba;USYD;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0;0;0;2", "aff_country_unique": "China;Australia;Singapore" }, { "id": "UXwlFxVWks", "title": "Divergent representations of ethological visual inputs emerge from supervised, unsupervised, and reinforcement learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Artificial neural systems trained using reinforcement, supervised, and unsupervised learning all acquire internal representations of high dimensional input. To what extent these representations depend on the different learning objectives is largely unknown. Here we compare the representations learned by eight different convolutional neural networks, each with identical ResNet architectures and trained on the same family of egocentric images, but embedded within different learning systems. Specifically, the representations are trained to guide action in a compound reinforcement learning task; to predict one or a combination of three task-related targets with supervision; or using one of three different unsupervised objectives. Using representational similarity analysis, we find that the network trained with reinforcement learning differs most from the other networks. Through further analysis using metrics inspired by the neuroscience literature, we find that the model trained with reinforcement learning has a high-dimensional representation wherein individual images are represented with very different patterns of neural activity. 
These representations seem to arise in order to guide long-term behavior and goal-seeking in the RL agent. Our results provide insights into how the properties of neural representations are influenced by objective functions and can inform transfer learning approaches.", "keywords": "reinforcement learning;transfer learning;representations;dimensionality;sparsity;RSA", "primary_area": "", "supplementary_material": "/attachment/8175a6502106da3f94325817756ab60a62d501d6.zip", "author": "Grace W Lindsay;Josh Merel;Thomas D. Mrsic-Flogel;Maneesh Sahani", "authorids": "~Grace_W_Lindsay1;~Josh_Merel1;t.mrsic-flogel@ucl.ac.uk;~Maneesh_Sahani1", "gender": "F;;;", "homepage": "https://lindsay-lab.github.io;;;http://www.gatsby.ucl.ac.uk/~maneesh", "dblp": "172/1328;139/1361;;44/3197", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=K4OcFXUAAAAJ;;https://scholar.google.co.uk/citations?user=rwxX83UAAAAJ", "orcid": ";;;0000-0001-5560-3341", "linkedin": ";;;", "or_profile": "~Grace_W_Lindsay1;~Josh_Merel1;t.mrsic-flogel@ucl.ac.uk;~Maneesh_Sahani1", "aff": "University College London;Meta Reality Labs;;University College London", "aff_domain": "ucl.ac.uk;fb.com;;ucl.ac.uk", "position": "Postdoc;Research Scientist;;Full Professor", "bibtex": "@misc{\nlindsay2022divergent,\ntitle={Divergent representations of ethological visual inputs emerge from supervised, unsupervised, and reinforcement learning},\nauthor={Grace W Lindsay and Josh Merel and Thomas D. Mrsic-Flogel and Maneesh Sahani},\nyear={2022},\nurl={https://openreview.net/forum?id=UXwlFxVWks}\n}", "github": "", "project": "", "reviewers": "RDPx;mWVA;vGGV;WoDx", "site": "https://openreview.net/forum?id=UXwlFxVWks", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;3;3", "correctness": "3;4;2;3", "technical_novelty": "3;1;1;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "209;56;318;76", "wc_summary_review": "80;42;124;58", "wc_main_review": "678;324;1295;511", "wc_review": "967;422;1737;645", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 1.0 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 164.75, 106.23882294152172 ], "wc_summary_review_avg": [ 76.0, 30.822070014844883 ], "wc_main_review_avg": [ 702.0, 364.5510938126506 ], "wc_review_avg": [ 942.75, 497.8093887222297 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12501658453892356070&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University College London;Meta", "aff_unique_dep": ";Meta Reality Labs", "aff_unique_url": "https://www.ucl.ac.uk;https://www.meta.com", "aff_unique_abbr": "UCL;MRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "UYDtmk6BMf5", "title": "Decomposing Texture and Semantics for Out-of-distribution Detection", "track": "main", "status": "Reject", "tldr": "", 
"abstract": "Out-of-distribution (OOD) detection has made significant progress in recent years because the distribution mismatch between the training and testing can severely deteriorate the reliability of a machine learning system.Nevertheless, the lack of precise interpretation of the in-distribution limits the application of OOD detection methods to real-world system pipielines. To tackle this issue, we decompose the definition of the in-distribution into texture and semantics, motivated by real-world scenarios. In addition, we design new benchmarks to measure the robustness that OOD detection methods should have. To achieve a good balance between the OOD detection performance and robustness, our method takes a divide-and-conquer approach. That is, the model first tackles each component of the texture and semantics separately, and then combines them later. Such design philosophy is empirically proven by a series of benchmarks including not only ours but also the conventional counterpart.", "keywords": "Out-of-distribution detection;Fourier analysis;Normailzing flow model", "primary_area": "", "supplementary_material": "/attachment/91e6a60c34cd78913615b90b2e0e9bf496a3289b.zip", "author": "Jeong-Hyeon Moon;Namhyuk Ahn;Kyung-Ah Sohn", "authorids": "~Jeong-Hyeon_Moon1;~Namhyuk_Ahn1;~Kyung-Ah_Sohn1", "gender": "M;M;F", "homepage": ";https://nmhkahn.github.io;https://sites.google.com/site/kasohn", "dblp": ";217/1998;65/3835", "google_scholar": ";cFSb6QQAAAAJ;-QsSytMAAAAJ", "orcid": "0000-0002-2805-7063;;0000-0001-8941-1188", "linkedin": ";;", "or_profile": "~Jeong-Hyeon_Moon1;~Namhyuk_Ahn1;~Kyung-Ah_Sohn1", "aff": "Ajou University;NAVER WEBTOON Corp.;Ajou University", "aff_domain": "ajou.ac.kr;webtoonscorp.com;ajou.ac.kr", "position": "PhD student;Researcher;Full Professor", "bibtex": "@misc{\nmoon2022decomposing,\ntitle={Decomposing Texture and Semantics for Out-of-distribution Detection},\nauthor={Jeong-Hyeon Moon and Namhyuk Ahn and Kyung-Ah Sohn},\nyear={2022},\nurl={https://openreview.net/forum?id=UYDtmk6BMf5}\n}", "github": "", "project": "", "reviewers": "q2kp;sj54;nEb7;7WrH", "site": "https://openreview.net/forum?id=UYDtmk6BMf5", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "28;87;32;28", "wc_summary_review": "84;83;9;31", "wc_main_review": "268;490;93;124", "wc_review": "380;660;134;183", "wc_reply_reviewers": "115;238;0;0", "wc_reply_authors": "756;501;59;574", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 43.75, 25.02373872945448 ], "wc_summary_review_avg": [ 51.75, 32.69078616368839 ], "wc_main_review_avg": [ 243.75, 156.7583729821154 ], "wc_review_avg": [ 339.25, 206.81075286357816 ], "wc_reply_reviewers_avg": [ 88.25, 98.38286182054271 ], "wc_reply_authors_avg": [ 472.5, 256.15864225124244 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16179099271050242422&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", 
"aff_unique_norm": "Ajou University;NAVER WEBTOON Corp.", "aff_unique_dep": ";", "aff_unique_url": "https://www.ajou.ac.kr;https://www.webtoons.com", "aff_unique_abbr": "Ajou;Naver Webtoon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "UYfK9egx2I", "title": "A General Framework for Defending against Backdoor Attacks via Influence Graph", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this work, we propose a new and general framework to defend against backdoor attacks, inspired by the fact that attack triggers usually follow a \\textsc{specific} type of attacking pattern, and therefore, they have greater connections with and larger impacts on each other during training. We introduce the notion of the {\\it influence graph}, which consists of nodes and edges respectively representative of individual training points and associated pair-wise influences. The influence between a pair of training points represents the degree to which removing one training point would impact the prediction the model makes on the other training point, approximated by influence function. Malicious training points are extracted by finding the maximum average sub-graph subject to a particular size. Extensive experiments on computer vision and natural language processing tasks demonstrate the effectiveness and generality of the proposed framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaofei Sun;Jiwei Li;Tianwei Zhang;Han Qiu;Fei Wu;Chun Fan", "authorids": "~Xiaofei_Sun1;~Jiwei_Li1;~Tianwei_Zhang1;~Han_Qiu3;~Fei_Wu1;~Chun_Fan1", "gender": "M;M;M;M;M;M", "homepage": ";https://nlp.stanford.edu/~bdlijiwei/;https://personal.ntu.edu.sg/tianwei.zhang/index.html;https://qiuhan.info;https://person.zju.edu.cn/wufei;", "dblp": ";73/5746-1;77/7902-4;15/4507-1;84/3254-1;", "google_scholar": "hIokU_IAAAAJ;PwU16JEAAAAJ;9vpiYDIAAAAJ;https://scholar.google.fr/citations?user=6JWNv6gAAAAJ;XJLn4MYAAAAJ;", "orcid": ";;;;;", "linkedin": ";;;;;chunfan/", "or_profile": "~Xiaofei_Sun1;~Jiwei_Li1;~Tianwei_Zhang1;~Han_Qiu3;~Fei_Wu1;~Chun_Fan1", "aff": ";Zhejiang University;Nanyang Technological University;Tsinghua University;Zhejiang University;Peking University", "aff_domain": ";zju.edu.cn;ntu.edu.sg;tsinghua.edu.cn;zju.edu.cn;pku.edu.cn", "position": ";Assistant Professor;Assistant Professor;Assistant Professor;Full Professor;Researcher", "bibtex": "@misc{\nsun2022a,\ntitle={A General Framework for Defending against Backdoor Attacks via Influence Graph},\nauthor={Xiaofei Sun and Jiwei Li and Tianwei Zhang and Han Qiu and Fei Wu and Chun Fan},\nyear={2022},\nurl={https://openreview.net/forum?id=UYfK9egx2I}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=UYfK9egx2I", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 
], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3140592797940864746&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Zhejiang University;Nanyang Technological University;Tsinghua University;Peking University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.ntu.edu.sg;https://www.tsinghua.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "ZJU;NTU;THU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;Singapore" }, { "title": "Fine-Tuning can Distort Pretrained Features and Underperform Out-of-Distribution", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5945", "id": "UYneFzXSJWh", "poster": "", "openreview": "https://openreview.net/forum?id=UYneFzXSJWh", "slides": "https://iclr.cc/virtual/2022/poster/5945", "video": "https://iclr.cc/virtual/2022/poster/5945", "author_site": "Ananya Kumar, Aditi Raghunathan, Robbie Jones, Tengyu Ma, Percy Liang", "tldr": "", "abstract": "When transferring a pretrained model to a downstream task, two popular methods are full fine-tuning (updating all the model parameters) and linear probing (updating only the last linear layer---the \"head\"). It is well known that fine-tuning leads to better accuracy in-distribution (ID). However, in this paper, we find that fine-tuning can achieve worse accuracy than linear probing out-of-distribution (OOD) when the pretrained features are good and the distribution shift is large. On 10 distribution shift datasets (BREEDS-Living17, BREEDS-Entity30, DomainNet, CIFAR $\\to$ STL, CIFAR-10.1, FMoW, ImageNetV2, ImageNet-R, ImageNet-A, ImageNet-Sketch), fine-tuning obtains on average 2% higher accuracy ID but 7% lower accuracy OOD than linear probing. We show theoretically that this tradeoff between ID and OOD accuracy arises even in a simple setting: fine-tuning overparameterized two-layer linear networks. We prove that the OOD error of fine-tuning is high when we initialize with a fixed or random head---this is because while fine-tuning learns the head, the lower layers of the neural network change simultaneously and distort the pretrained features. Our analysis suggests that the easy two-step strategy of linear probing then full fine-tuning (LP-FT), sometimes used as a fine-tuning heuristic, combines the benefits of both fine-tuning and linear probing. 
Empirically, LP-FT outperforms both fine-tuning and linear probing on the above datasets (1% better ID, 10% better OOD than full fine-tuning).", "keywords": "fine-tuning theory;transfer learning theory;fine-tuning;distribution shift;implicit regularization", "primary_area": "", "supplementary_material": "/attachment/34f81a5b333e131206e26a14e9de6125508dcbc8.zip", "author": "Ananya Kumar;Aditi Raghunathan;Robbie Matthew Jones;Tengyu Ma;Percy Liang", "authorids": "~Ananya_Kumar1;~Aditi_Raghunathan1;~Robbie_Matthew_Jones1;~Tengyu_Ma1;~Percy_Liang1", "gender": "M;F;M;M;", "homepage": "https://ananyakumar.wordpress.com/;https://www.cs.cmu.edu/~aditirag/;;http://ai.stanford.edu/~tengyuma/;https://cs.stanford.edu/~pliang/", "dblp": "192/0474;166/1409;;54/9061;04/1701", "google_scholar": "tP5IBFkAAAAJ;Ch9iRwQAAAAJ;dXzqCT4AAAAJ;i38QlUwAAAAJ;pouyVyUAAAAJ", "orcid": ";;;;", "linkedin": ";;robbie-jones-96;;", "or_profile": "~Ananya_Kumar1;~Aditi_Raghunathan1;~Robbie_Matthew_Jones1;~Tengyu_Ma1;~Percy_Liang1", "aff": "Microsoft;Carnegie Mellon University;;Facebook AI Research;Stanford University", "aff_domain": "microsoft.com;cmu.edu;;fb.com;stanford.edu", "position": "Intern;Assistant Professor;;Visiting Scientist;Associate Professor", "bibtex": "@inproceedings{\nkumar2022finetuning,\ntitle={Fine-Tuning can Distort Pretrained Features and Underperform Out-of-Distribution},\nauthor={Ananya Kumar and Aditi Raghunathan and Robbie Matthew Jones and Tengyu Ma and Percy Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=UYneFzXSJWh}\n}", "github": "", "project": "", "reviewers": "VgHa;5UET;VT2b;Gpdo", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;2;0;3", "wc_summary_paper": "99;146;59;98", "wc_summary_review": "109;38;73;18", "wc_main_review": "501;472;166;46", "wc_review": "709;656;298;162", "wc_reply_reviewers": "209;0;0;0", "wc_reply_authors": "1584;1769;745;160", "reply_reviewers": "3;0;0;0", "reply_authors": "4;5;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 100.5, 30.826125283596703 ], "wc_summary_review_avg": [ 59.5, 34.70230539892127 ], "wc_main_review_avg": [ 296.25, 195.19269325464003 ], "wc_review_avg": [ 456.25, 232.0607409709794 ], "wc_reply_reviewers_avg": [ 52.25, 90.49965469547384 ], "wc_reply_authors_avg": [ 1064.5, 649.3075157427334 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 2.75, 1.7853571071357126 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 807, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15252191125521850551&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=UYneFzXSJWh", "email": "microsoft.com;cmu.edu;;fb.com;stanford.edu", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Microsoft;Carnegie Mellon University;Meta;Stanford University", "aff_unique_dep": "Microsoft Corporation;;Facebook AI Research;", "aff_unique_url": "https://www.microsoft.com;https://www.cmu.edu;https://research.facebook.com;https://www.stanford.edu", "aff_unique_abbr": "Microsoft;CMU;FAIR;Stanford", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "U_Jog0t3fAu", "title": "Iterative Sketching and its Application to Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Johnson-Lindenstrauss lemma is one of the most valuable tools in machine learning, since it enables the reduction to the dimension of various learning problems. In this paper, we exploit the power of Fast-JL transform or so-called sketching technique and apply it to federated learning settings. Federated learning is an emerging learning scheme which allows multiple clients to train models without data exchange. Though most federated learning frameworks only require clients and the server to send gradient information over the network, they still face the challenges of communication efficiency and data privacy. We show that by iteratively applying independent sketches combined with additive noises, one can achieve the above two goals simultaneously. In our designed framework, each client only passes a sketched gradient to the server, and de-sketches the average-gradient information received from the server to synchronize. Such framework enjoys several benefits: 1). Better privacy, since we only exchange randomly sketched gradients with low-dimensional noises, which is more robust against emerging gradient attacks; 2). Lower communication cost per round, since our framework only communicates low-dimensional sketched gradients, which is particularly valuable in a small-bandwidth channel; 3). No extra overall communication cost. We provably show that the introduced randomness does not increase the overall communication at all.\n", "keywords": "Federated learning;optimization;sketching;differential privacy", "primary_area": "", "supplementary_material": "", "author": "Zhao Song;Zheng Yu;Lichen Zhang", "authorids": "~Zhao_Song6;~Zheng_Yu1;~Lichen_Zhang2", "gender": "M;M;M", "homepage": "https://sites.google.com/view/zhengyu/;https://lczh.github.io/;https://www.youtube.com/@zhaosong2031", "dblp": "28/4466;00/6357-3;76/4051-2", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;yDZct7UAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zheng_Yu1;~Lichen_Zhang2;~Zhao_Song3", "aff": "Princeton University;Carnegie Mellon University;Adobe", "aff_domain": "princeton.edu;cmu.edu;adobe.com", "position": "PhD student;MS student;Researcher", "bibtex": "@misc{\nsong2022iterative,\ntitle={Iterative Sketching and its Application to Federated Learning},\nauthor={Zhao Song and Zheng Yu and Lichen Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=U_Jog0t3fAu}\n}", "github": "", "project": "", "reviewers": "YCQM;RjL4;NaBa;ujZn", "site": "https://openreview.net/forum?id=U_Jog0t3fAu", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;3;4", "correctness": "3;2;4;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "0;0;2;0", "wc_summary_paper": "73;65;38;133", "wc_summary_review": "54;19;43;64", "wc_main_review": "677;390;78;613", "wc_review": "804;474;159;810", "wc_reply_reviewers": "145;37;0;42", "wc_reply_authors": "623;788;45;707", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 77.25, 34.701404870696514 ], "wc_summary_review_avg": [ 
45.0, 16.748134224444225 ], "wc_main_review_avg": [ 439.5, 234.32936222334578 ], "wc_review_avg": [ 561.75, 269.36070147666305 ], "wc_reply_reviewers_avg": [ 56.0, 53.884134956404374 ], "wc_reply_authors_avg": [ 540.75, 292.1064660359301 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17671755946664886544&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Princeton University;Carnegie Mellon University;Adobe", "aff_unique_dep": ";;Adobe Inc.", "aff_unique_url": "https://www.princeton.edu;https://www.cmu.edu;https://www.adobe.com", "aff_unique_abbr": "Princeton;CMU;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "UajXTGRjuKB", "title": "Sampling Before Training: Rethinking the Effect of Edges in the Process of Training Graph Neural Networks", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) demonstrate excellent performance on many graph-based tasks; however, they also impose a heavy computational burden when trained on a large-scale graph. Although various sampling methods have been proposed to speed up GNN training by shrinking the scale of the graph during training, they become inapplicable if we need to perform sampling before training. In this paper, we quantify the importance of every edge for training in the graph with the extra information they convey in addition to the node features, inspired by a manifold learning algorithm called the diffusion map. Based on this calculation, we propose Graph Diffusion Sampling (GDS), a simple but effective sampling method for shrinking the size of the edge set before training. GDS prefers to sample edges with high importance, and edges dropped by GDS will never be used in the training procedure. We empirically show that GDS preserves the edges crucial for training in a variety of models (GCN, GraphSAGE, GAT, and JKNet).
Compared to training on the full graph, GDS can guarantee the performance of the model while sampling only a small fraction of the edges.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c1ac687e0a9a056e6f15d173072ddd6ca4550642.zip", "author": "Hengyuan Ma;Qi Yang;Bowen Sun;Long Shun;Junkui Li;Jianfeng Feng", "authorids": "~Hengyuan_Ma1;xianmu.yq@antgroup.com;wenxi.sbw@antgroup.com;shunlong.wxd@antgroup.com;kui.lijk@antgroup.com;~Jianfeng_Feng2", "gender": "M;;;;;", "homepage": ";;;;;", "dblp": "268/5474;;;;;", "google_scholar": "xl1WCzUAAAAJ;;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Hengyuan_Ma1;xianmu.yq@antgroup.com;wenxi.sbw@antgroup.com;shunlong.wxd@antgroup.com;kui.lijk@antgroup.com;~Jianfeng_Feng2", "aff": "Fudan University;;;;;", "aff_domain": "fudan.edu.cn;;;;;", "position": "PhD student;;;;;", "bibtex": "@misc{\nma2022sampling,\ntitle={Sampling Before Training: Rethinking the Effect of Edges in the Process of Training Graph Neural Networks},\nauthor={Hengyuan Ma and Qi Yang and Bowen Sun and Long Shun and Junkui Li and Jianfeng Feng},\nyear={2022},\nurl={https://openreview.net/forum?id=UajXTGRjuKB}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=UajXTGRjuKB", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1451397554555636461&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "UarYhFFxQ2B", "title": "Towards Robust Active Feature Acquisition", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Truly intelligent systems are expected to make critical decisions with incomplete and uncertain data. Active feature acquisition (AFA), where features are sequentially acquired to improve the prediction, is a step towards this goal. However, current AFA methods lack robustness in two key areas that limit their applicability in the real world. First, current AFA models only consider a small set of candidate features and have difficulty scaling to a large feature space. Second, they are ignorant about the valid domains where they can predict confidently, thus they can be vulnerable to out-of-distribution (OOD) inputs. In order to remedy these deficiencies and bring AFA models closer to practical use, we propose several techniques to advance the current AFA approaches.
Our framework can easily handle a large number of features using a hierarchical acquisition policy and is more robust to OOD inputs with the help of an OOD detector for partially observed data. Extensive experiments demonstrate the efficacy of our framework over strong baselines.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/71cfa3fd34d7a75c6c8811f617558593f80c0041.zip", "author": "Yang Li;Siyuan Shan;Qin Liu;Junier Oliva", "authorids": "~Yang_Li19;~Siyuan_Shan1;~Qin_Liu3;~Junier_Oliva1", "gender": ";M;M;M", "homepage": ";https://shansiliu95.github.io/;https://sites.google.com/cs.unc.edu/qinliu/home;http://lupalab.com", "dblp": ";190/7476;06/2123-8;137/8390", "google_scholar": ";CHEENg8AAAAJ;o209D9kAAAAJ;", "orcid": ";;0000-0001-6342-5311;", "linkedin": ";;;", "or_profile": "~Yang_Li19;~Siyuan_Shan1;~Qin_Liu3;~Junier_Oliva1", "aff": ";Department of Computer Science, University of North Carolina, Chapel Hill;University of North Carolina at Chapel Hill;", "aff_domain": ";cs.unc.edu;cs.unc.edu;", "position": ";PhD student;PhD student;", "bibtex": "@misc{\nli2022towards,\ntitle={Towards Robust Active Feature Acquisition},\nauthor={Yang Li and Siyuan Shan and Qin Liu and Junier Oliva},\nyear={2022},\nurl={https://openreview.net/forum?id=UarYhFFxQ2B}\n}", "github": "", "project": "", "reviewers": "wMB2;mB1B;Ki8N;EYjM", "site": "https://openreview.net/forum?id=UarYhFFxQ2B", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "3;5;3;3", "correctness": "2;2;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "31;110;475;127", "wc_summary_review": "279;69;220;35", "wc_main_review": "51;674;2328;318", "wc_review": "361;853;3023;480", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 185.75, 170.8820865392274 ], "wc_summary_review_avg": [ 150.75, 101.64244930146066 ], "wc_main_review_avg": [ 842.75, 885.5329962796418 ], "wc_review_avg": [ 1179.25, 1079.8537806110603 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8528028654224418, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16494375070600222656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of North Carolina", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.unc.edu", "aff_unique_abbr": "UNC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Ub1BQTKiwqg", "title": "Learning sparse DNNs with soft thresholding of weights during training", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes a new and simple way of training sparse neural networks. Our method is based on a differentiation of the forward and backward paths: the weights in the forward path are a thresholded version of the weights maintained in the backward path. 
This decoupling allows micro-updates, produced by gradient descent, to stack up, leading to the possible re-activation of weights that were set to zero in earlier training steps. At the end of training, links with zero weights are pruned away.\n\nAdditional critical specificities of our approach lie (i) in the progressive increase of the zeroed weight ratio during training, and (ii) in the use of soft-thresholding rather than hard-thresholding to derive the forward-path weights from the ones maintained in the backward path.\nAt constant accuracy, our approach reduces the number of training cycles to 1 compared to the state-of-the-art recursive pruning methods. At high pruning rates, it also improves the model accuracy compared to other single-cycle pruning approaches (66.18% top-1 accuracy when training a ResNet-50 on ImageNet at 98% sparsity).\n", "keywords": "pruning;sparse;DNN", "primary_area": "", "supplementary_material": "/attachment/d93f9e610c52ab4af92e3a155539e905b1305db9.zip", "author": "Antoine Vanderschueren;Christophe De Vleeschouwer", "authorids": "~Antoine_Vanderschueren1;~Christophe_De_Vleeschouwer1", "gender": "M;M", "homepage": "https://sites.uclouvain.be/ispgroup/Category/AntoineVanderschueren;", "dblp": ";", "google_scholar": ";xb3Zc3cAAAAJ", "orcid": ";0000-0001-5049-2929", "linkedin": ";", "or_profile": "~Antoine_Vanderschueren1;~Christophe_De_Vleeschouwer1", "aff": "UCLouvain;UCLouvain", "aff_domain": "uclouvain.be;uclouvain.be", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nvanderschueren2022learning,\ntitle={Learning sparse {DNN}s with soft thresholding of weights during training},\nauthor={Antoine Vanderschueren and Christophe De Vleeschouwer},\nyear={2022},\nurl={https://openreview.net/forum?id=Ub1BQTKiwqg}\n}", "github": "", "project": "", "reviewers": "2DCW;U2rk;EscA;u8ib", "site": "https://openreview.net/forum?id=Ub1BQTKiwqg", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "58;35;70;83", "wc_summary_review": "50;38;31;74", "wc_main_review": "1041;201;381;711", "wc_review": "1149;274;482;868", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.5, 17.67059704707229 ], "wc_summary_review_avg": [ 48.25, 16.345871038277526 ], "wc_main_review_avg": [ 583.5, 321.27674985905844 ], "wc_review_avg": [ 693.25, 338.61584059225584 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3froRq2bvD4J:scholar.google.com/&scioq=Learning+sparse+DNNs+with+soft+thresholding+of+weights+during+training&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Universit\u00e9 catholique de Louvain", "aff_unique_dep": "", "aff_unique_url": "https://www.uclouvain.be", "aff_unique_abbr": "UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Belgium" }, {
"title": "Transform2Act: Learning a Transform-and-Control Policy for Efficient Agent Design", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6196", "id": "UcDUxjPYWSr", "poster": "", "openreview": "https://openreview.net/forum?id=UcDUxjPYWSr", "slides": "https://iclr.cc/virtual/2022/poster/6196", "video": "https://iclr.cc/virtual/2022/poster/6196", "author_site": "Ye Yuan, Yuda Song, Zhengyi Luo, Wen Sun, Kris Kitani", "tldr": "", "abstract": "An agent's functionality is largely determined by its design, i.e., skeletal structure and joint attributes (e.g., length, size, strength). However, finding the optimal agent design for a given function is extremely challenging since the problem is inherently combinatorial and the design space is prohibitively large. Additionally, it can be costly to evaluate each candidate design which requires solving for its optimal controller. To tackle these problems, our key idea is to incorporate the design procedure of an agent into its decision-making process. Specifically, we learn a conditional policy that, in an episode, first applies a sequence of transform actions to modify an agent's skeletal structure and joint attributes, and then applies control actions under the new design. To handle a variable number of joints across designs, we use a graph-based policy where each graph node represents a joint and uses message passing with its neighbors to output joint-specific actions. Using policy gradient methods, our approach enables joint optimization of agent design and control as well as experience sharing across different designs, which improves sample efficiency substantially. Experiments show that our approach, Transform2Act, outperforms prior methods significantly in terms of convergence speed and final performance. Notably, Transform2Act can automatically discover plausible designs similar to giraffes, squids, and spiders. Code and videos are available at https://sites.google.com/view/transform2act.", "keywords": "Agent Design;Morphology Optimization;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Ye Yuan;Yuda Song;Zhengyi Luo;Wen Sun;Kris M. Kitani", "authorids": "~Ye_Yuan5;~Yuda_Song2;~Zhengyi_Luo1;~Wen_Sun1;~Kris_M._Kitani1", "gender": "M;M;M;;M", "homepage": "https://www.ye-yuan.com;https://yudasong.github.io/;https://zhengyiluo.github.io/;https://wensun.github.io;http://www.cs.cmu.edu/~kkitani/", "dblp": "33/6315-7;250/4880-1;;;42/163", "google_scholar": "EEp82sIAAAAJ;0QDCG8IAAAAJ;lHPTxGsAAAAJ;iOLC30YAAAAJ;yv3sH74AAAAJ", "orcid": ";;;;0000-0002-9389-4060", "linkedin": ";;zhengyi-zen-luo-726156105/;;", "or_profile": "~Ye_Yuan5;~Yuda_Song2;~Zhengyi_Luo1;~Wen_Sun1;~Kris_M._Kitani1", "aff": "Robotics Institute, Carnegie Mellon University;Carnegie Mellon University;NVIDIA;Cornell University;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;andrew.cmu.edu;nvidia.com;cornell.edu;cmu.edu", "position": "PhD student;PhD student;Intern;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nyuan2022transformact,\ntitle={Transform2Act: Learning a Transform-and-Control Policy for Efficient Agent Design},\nauthor={Ye Yuan and Yuda Song and Zhengyi Luo and Wen Sun and Kris M. 
Kitani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=UcDUxjPYWSr}\n}", "github": "", "project": "", "reviewers": "42PW;ZAQQ;mvHK;487L", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;5;4", "correctness": "3;4;4;3", "technical_novelty": "4;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "101;99;89;120", "wc_summary_review": "125;51;23;37", "wc_main_review": "655;203;593;172", "wc_review": "881;353;705;329", "wc_reply_reviewers": "228;0;49;47", "wc_reply_authors": "896;412;1240;401", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 102.25, 11.211043662389331 ], "wc_summary_review_avg": [ 59.0, 39.370039370059054 ], "wc_main_review_avg": [ 405.75, 219.6216917792958 ], "wc_review_avg": [ 567.0, 234.56342425877057 ], "wc_reply_reviewers_avg": [ 81.0, 87.10625695092173 ], "wc_reply_authors_avg": [ 737.25, 352.42401663337307 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=871690359216860608&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=UcDUxjPYWSr", "email": "cs.cmu.edu;andrew.cmu.edu;nvidia.com;cornell.edu;cmu.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;NVIDIA;Cornell University", "aff_unique_dep": "Robotics Institute;NVIDIA Corporation;", "aff_unique_url": "https://www.cmu.edu;https://www.nvidia.com;https://www.cornell.edu", "aff_unique_abbr": "CMU;NVIDIA;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "What Makes Better Augmentation Strategies? Augment Difficult but Not too Different", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6037", "id": "Ucx3DQbC9GH", "poster": "", "openreview": "https://openreview.net/forum?id=Ucx3DQbC9GH", "slides": "https://iclr.cc/virtual/2022/poster/6037", "video": "https://iclr.cc/virtual/2022/poster/6037", "author_site": "Jaehyung Kim, Dongyeop Kang, Sungsoo Ahn, Jinwoo Shin", "tldr": "", "abstract": "The practice of data augmentation has been extensively used to boost the performance of deep neural networks for various NLP tasks. It is more effective when only a limited number of labeled samples is available, e.g., in low-data or class-imbalanced regimes. Most current augmentation techniques rely on parameter tuning or inherent randomness; hence, their effectiveness largely varies across tasks. To efficiently find the best augmentation strategy for each task, learning a data augmentation policy is a promising solution, but the question of what makes a good augmentation in NLP tasks and how to design the reward function for learning a good policy remains under-explored. To answer this, we hypothesize that good data augmentation should construct more diverse and challenging samples for providing informative training signals, while avoiding the risk of losing the semantics of original samples.
Therefore, we design a novel reward function for updating the augmentation policy to construct difficult but not too different samples (DND). Particularly, we jointly optimize a data augmentation policy while training the model, to construct augmented samples with low confidence but high semantic similarity to the original ones. In addition, we introduce a sample re-weighting scheme to focus on difficult augmented samples after the original ones are learned confidently, for more effective learning from the augmented ones. Our learning-based augmentation outperforms the recent state-of-the-art augmentation schemes on various text classification tasks and the GLUE benchmark by successfully discovering the effective augmentations for each task. Remarkably, our method is more effective in the challenging low-data and class-imbalanced regimes, and the learned augmentation policy transfers well to different tasks and models.", "keywords": "NLP;data augmentation;learning augmentation policy;text classification", "primary_area": "", "supplementary_material": "/attachment/4121bfd41ea185360c1b2ebf964a64b1ad1f816f.zip", "author": "Jaehyung Kim;Dongyeop Kang;Sungsoo Ahn;Jinwoo Shin", "authorids": "~Jaehyung_Kim1;~Dongyeop_Kang2;~Sungsoo_Ahn1;~Jinwoo_Shin1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/jaehyungkim;https://sungsooahn.super.site/;https://sites.google.com/site/mijirim/;https://dykang.github.io/", "dblp": "02/7206-1;90/5164;31/7062;69/9056", "google_scholar": "https://scholar.google.co.kr/citations?user=6OYOsGsAAAAJ;XTenHs0AAAAJ;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ;https://scholar.google.co.kr/citations?user=fMKZOjwAAAAJ", "orcid": ";;;0000-0002-9021-1789", "linkedin": ";;;dongyeop-kang-30ba0611/", "or_profile": "~Jaehyung_Kim1;~Sungsoo_Ahn1;~Jinwoo_Shin1;~dongyeop_kang1", "aff": "Korea Advanced Institute of Science & Technology;Pohang University of Science and Technology;Korea Advanced Institute of Science & Technology;University of Minnesota", "aff_domain": "kaist.ac.kr;postech.ac.kr;kaist.ac.kr;umn.edu", "position": "PhD student;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nkim2022what,\ntitle={What Makes Better Augmentation Strategies?
Augment Difficult but Not too Different},\nauthor={Jaehyung Kim and Dongyeop Kang and Sungsoo Ahn and Jinwoo Shin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ucx3DQbC9GH}\n}", "github": "", "project": "", "reviewers": "A1ft;SErr;XAnt;PytR", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "83;89;68;72", "wc_summary_review": "40;33;50;125", "wc_main_review": "458;129;541;510", "wc_review": "581;251;659;707", "wc_reply_reviewers": "58;0;0;0", "wc_reply_authors": "2040;683;2303;1322", "reply_reviewers": "1;0;0;0", "reply_authors": "4;1;4;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.0, 8.396427811873332 ], "wc_summary_review_avg": [ 62.0, 36.871398129173244 ], "wc_main_review_avg": [ 409.5, 164.63975826026956 ], "wc_review_avg": [ 549.5, 178.1088150541685 ], "wc_reply_reviewers_avg": [ 14.5, 25.11473670974872 ], "wc_reply_authors_avg": [ 1587.0, 633.5033543715456 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4076110530054541528&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Ucx3DQbC9GH", "email": "kaist.ac.kr;postech.ac.kr;kaist.ac.kr;umn.edu", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Pohang University of Science and Technology;University of Minnesota", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.postech.ac.kr;https://www.minnesota.edu", "aff_unique_abbr": "KAIST;POSTECH;UMN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "South Korea;United States" }, { "id": "Ud7G0LtrHVD", "title": "Are Vision Transformers Robust to Patch-wise Perturbations?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The recent advances in Vision Transformer (ViT) have demonstrated its impressive performance in image classification, which makes it a promising alternative to Convolutional Neural Network (CNN). Unlike CNNs, ViT represents an input image as a sequence of image patches. The patch-wise input image representation makes the following question interesting: How does ViT perform when individual input image patches are perturbed with natural corruptions or adversarial perturbations, compared to CNNs? In this work, we conduct a comprehensive study on the robustness of vision transformers to patch-wise perturbations. Surprisingly, we find that vision transformers are more robust to naturally corrupted patches than CNNs, whereas they are more vulnerable to adversarial patches. Based on extensive qualitative and quantitative experiments, we discover that ViT's stronger robustness to naturally corrupted patches and higher vulnerability against adversarial patches are both caused by the attention mechanism.
Specifically, the attention mechanism can help improve the robustness of vision transformers by effectively ignoring naturally corrupted patches. However, when vision transformers are attacked by an adversary, the attention mechanism can be easily fooled into focusing more on the adversarially perturbed patches, causing a mistake.", "keywords": "Vision Transformer;Adversarial Robustness", "primary_area": "", "supplementary_material": "", "author": "Jindong Gu;Volker Tresp;Yao Qin", "authorids": "~Jindong_Gu1;~Volker_Tresp1;~Yao_Qin1", "gender": ";M;", "homepage": ";https://www.dbs.ifi.lmu.de/~tresp/;https://yaoqin1.github.io", "dblp": ";t/VolkerTresp;66/10420-1", "google_scholar": ";xIJHTUwAAAAJ;https://scholar.google.com/citations?view_op=list_works", "orcid": ";0000-0001-9428-3686;", "linkedin": ";volker-tresp-8110a118/;", "or_profile": "~Jindong_Gu1;~Volker_Tresp1;~Yao_Qin1", "aff": ";Siemens Corporate Research;Google", "aff_domain": ";siemens.com;google.com", "position": ";Principal Researcher;Researcher", "bibtex": "@misc{\ngu2022are,\ntitle={Are Vision Transformers Robust to Patch-wise Perturbations?},\nauthor={Jindong Gu and Volker Tresp and Yao Qin},\nyear={2022},\nurl={https://openreview.net/forum?id=Ud7G0LtrHVD}\n}", "github": "", "project": "", "reviewers": "uJQr;C5qG;BsoZ;JbW3", "site": "https://openreview.net/forum?id=Ud7G0LtrHVD", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "4;2;2;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;1;2;3", "wc_summary_paper": "62;64;56;132", "wc_summary_review": "42;11;23;52", "wc_main_review": "492;128;259;208", "wc_review": "596;203;338;392", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 78.5, 31.028212968200407 ], "wc_summary_review_avg": [ 32.0, 15.98436736314578 ], "wc_main_review_avg": [ 271.75, 135.46286391480137 ], "wc_review_avg": [ 382.25, 141.3088373032628 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16503398445472541087&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1", "aff_unique_norm": "Siemens AG;Google", "aff_unique_dep": "Corporate Research;Google", "aff_unique_url": "https://www.siemens.com/research;https://www.google.com", "aff_unique_abbr": "Siemens;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;United States" }, { "title": "Minimax Optimization with Smooth Algorithmic Adversaries", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6354", "id": "UdxJ2fJx7N0", "poster": "", "openreview": "https://openreview.net/forum?id=UdxJ2fJx7N0", "slides": "https://iclr.cc/virtual/2022/poster/6354", "video": "https://iclr.cc/virtual/2022/poster/6354", "author_site": "Tanner Fiez, Chi Jin, Praneeth Netrapalli, Lillian J Ratliff", "tldr": "", "abstract": "This paper considers
minimax optimization $\\min_x \\max_y f(x, y)$ in the challenging setting where $f$ can be both nonconvex in $x$ and nonconcave in $y$. Though such optimization problems arise in many machine learning paradigms including training generative adversarial networks (GANs) and adversarially robust models, from a theoretical point of view, two fundamental issues remain: (i) the absence of simple and efficiently computable optimality notions, and (ii) cyclic or diverging behavior of existing algorithms. This paper proposes a new theoretical framework for nonconvex-nonconcave minimax optimization that addresses both of the above issues. The starting point of this paper is the observation that, under a computational budget, the max-player can not fully maximize $f(x,\\cdot)$ since nonconcave maximization is NP-hard in general. So, we propose a new framework, and a corresponding algorithm, for the min-player to play against \\emph{smooth algorithms} deployed by the adversary (i.e., the max-player) instead of against full maximization. Our algorithm is guaranteed to make monotonic progress (thus having no limit cycles or diverging behavior), and to find an appropriate ``stationary point'' in a polynomial number of iterations. Our framework covers practically relevant settings where the smooth algorithms deployed by the adversary are multi-step stochastic gradient ascent, and its accelerated version. We further present experimental results that confirm our theoretical findings and demonstrate the effectiveness of the proposed approach in practice on simple, conceptual settings.", "keywords": "Minimax optimization;two player zero sum games;generative adversarial networks;adversarial training", "primary_area": "", "supplementary_material": "/attachment/4e2ba35603ae73bda8e10822707db81c5277ec74.zip", "author": "Tanner Fiez;Chi Jin;Praneeth Netrapalli;Lillian J Ratliff", "authorids": "~Tanner_Fiez1;~Chi_Jin1;~Praneeth_Netrapalli1;~Lillian_J_Ratliff1", "gender": ";M;M;F", "homepage": ";https://sites.google.com/view/cjin/home;http://praneethnetrapalli.org/;https://faculty.washington.edu/ratliffl/", "dblp": "195/5645;126/1802-1;http://dblp.uni-trier.de/pers/hd/n/Netrapalli:Praneeth;127/7426", "google_scholar": "_B6SVAcAAAAJ;GINhGvwAAAAJ;https://scholar.google.co.in/citations?user=mim8FQkAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0001-8936-0229", "linkedin": "tannerfiez/;;;", "or_profile": "~Tanner_Fiez1;~Chi_Jin1;~Praneeth_Netrapalli1;~Lillian_Ratliff1", "aff": "Amazon;Princeton University;Google;University of Washington, Seattle", "aff_domain": "amazon.com;princeton.edu;google.com;uw.edu", "position": "Researcher;Assistant Professor;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nfiez2022minimax,\ntitle={Minimax Optimization with Smooth Algorithmic Adversaries},\nauthor={Tanner Fiez and Chi Jin and Praneeth Netrapalli and Lillian J Ratliff},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=UdxJ2fJx7N0}\n}", "github": "", "project": "", "reviewers": "WVv5;nxeu;BUCz;9PEU", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "77;72;39;36", "wc_summary_review": "54;15;92;12", "wc_main_review": "421;132;92;66", "wc_review": "552;219;223;114", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "476;178;36;47", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", 
"recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 56.0, 18.614510468986285 ], "wc_summary_review_avg": [ 43.25, 32.66018217952864 ], "wc_main_review_avg": [ 177.75, 142.39447847441275 ], "wc_review_avg": [ 277.0, 164.6769564936151 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 184.25, 177.4632004106767 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11061521782135546152&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=UdxJ2fJx7N0", "email": "amazon.com;princeton.edu;google.com;uw.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Amazon;Princeton University;Google;University of Washington", "aff_unique_dep": "Amazon.com, Inc.;;Google;", "aff_unique_url": "https://www.amazon.com;https://www.princeton.edu;https://www.google.com;https://www.washington.edu", "aff_unique_abbr": "Amazon;Princeton;Google;UW", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "UeE41VsK1KJ", "title": "Subjective Learning for Open-Ended Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conventional supervised learning typically assumes that the learning task can be solved by learning a single function since the data is sampled from a fixed distribution. However, this assumption is invalid in open-ended environments where no task-level data partitioning is available. In this paper, we present a novel supervised learning framework of learning from open-ended data, which is modeled as data implicitly sampled from multiple domains with the data in each domain obeying a domain-specific target function. Since different domains may possess distinct target functions, open-ended data inherently requires multiple functions to capture all its input-output relations, rendering training a single global model problematic. To address this issue, we devise an Open-ended Supervised Learning (OSL) framework, of which the key component is a subjective function that allocates the data among multiple candidate models to resolve the \"conflict'' between the data from different domains, exhibiting a natural hierarchy. 
We theoretically analyze the learnability and the generalization error of OSL, and empirically validate its efficacy in both open-ended regression and classification tasks.", "keywords": "Open-ended data;machine learning;supervised learning;data conflict", "primary_area": "", "supplementary_material": "/attachment/7efe325c7cb47447a3120215c470953bd5cb6833.zip", "author": "Tianren Zhang;Yizhou Jiang;Xin Su;Shangqi Guo;Feng Chen", "authorids": "~Tianren_Zhang1;~Yizhou_Jiang1;~Xin_Su1;~Shangqi_Guo2;~Feng_Chen1", "gender": "M;M;;M;M", "homepage": ";;;https://shangiguo.github.io/;", "dblp": ";201/8247;;232/1375;21/3047-7", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;oM8ue_UAAAAJ;https://scholar.google.com.hk/citations?user=I_oE7zUAAAAJ;https://scholar.google.com.hk/citations?user=A0dr0aQAAAAJ;", "orcid": "0000-0001-9687-5263;;;0000-0003-3181-6881;0000-0003-4813-2494", "linkedin": "%E5%A4%A9%E4%BB%BB-%E7%AB%A0-622b30110/;;;;", "or_profile": "~Tianren_Zhang1;~Yizhou_Jiang1;~Xin_Su1;~Shangqi_Guo2;~Feng_Chen1", "aff": "Tsinghua University;Tsinghua University;WeChat, Tencent;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tencent.com;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;PhD student;Researcher;Postdoc;Full Professor", "bibtex": "@misc{\nzhang2022subjective,\ntitle={Subjective Learning for Open-Ended Data},\nauthor={Tianren Zhang and Yizhou Jiang and Xin Su and Shangqi Guo and Feng Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=UeE41VsK1KJ}\n}", "github": "", "project": "", "reviewers": "WErS;E4u9;R5W1;tMyf", "site": "https://openreview.net/forum?id=UeE41VsK1KJ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;2;3;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "62;55;54;160", "wc_summary_review": "49;41;106;32", "wc_main_review": "142;570;273;168", "wc_review": "253;666;433;360", "wc_reply_reviewers": "0;7;15;0", "wc_reply_authors": "988;589;1278;105", "reply_reviewers": "0;1;1;0", "reply_authors": "2;1;4;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.75, 44.70668294561787 ], "wc_summary_review_avg": [ 57.0, 28.922309727959142 ], "wc_main_review_avg": [ 288.25, 169.90052236529468 ], "wc_review_avg": [ 428.0, 151.58990731575767 ], "wc_reply_reviewers_avg": [ 5.5, 6.18465843842649 ], "wc_reply_authors_avg": [ 740.0, 440.73064336394856 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ohE8RXWZ9LQJ:scholar.google.com/&scioq=Subjective+Learning+for+Open-Ended+Data&hl=en&as_sdt=0,5", "gs_version_total": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Tsinghua University;Tencent", "aff_unique_dep": ";WeChat", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.tencent.com", "aff_unique_abbr": "THU;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "UeRmyymo3kb", "title": "GARNET: A Spectral Approach to Robust and Scalable Graph Neural 
Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) have been increasingly deployed in various applications that involve learning on non-Euclidean data. However, recent studies show that GNNs are vulnerable to graph adversarial attacks. Although there are several defense methods to improve GNN adversarial robustness, they fail to perform well on low homophily graphs. In addition, few of those defense models can scale to large graphs due to their high computational complexity and memory usage. In this paper, we propose GARNET, a scalable spectral method to boost the adversarial robustness of GNN models for both homophilic and heterophilic graphs. GARNET first computes a reduced-rank yet sparse approximation of the adversarial graph by exploiting an efficient spectral graph embedding and sparsification scheme. Next, GARNET trains an adaptive graph filter on the reduced-rank graph for node representation refinement, which is subsequently leveraged to guide label propagation for further enhancing the quality of node embeddings. GARNET has been evaluated on both homophilic and heterophilic datasets, including a large graph with millions of nodes. Our extensive experiment results show that GARNET increases adversarial accuracy over state-of-the-art GNN (defense) models by up to $9.96\\%$ and $15.17\\%$ on homophilic and heterophilic graphs, respectively.", "keywords": "graph neural networks;adversarial robustness;low-rank approximation;spectral graph theory", "primary_area": "", "supplementary_material": "", "author": "Chenhui Deng;Xiuyu Li;Zhuo Feng;Zhiru Zhang", "authorids": "~Chenhui_Deng1;~Xiuyu_Li1;~Zhuo_Feng3;~Zhiru_Zhang2", "gender": "M;Not Specified;M;M", "homepage": "https://chenhui1016.github.io;https://xiuyuli.com/;https://web.stevens.edu/facultyprofile/?id=2371;https://www.csl.cornell.edu/~zhiruz", "dblp": "250/2396;279/5847;81/4441.html;81/4227", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?;;https://scholar.google.com.tw/citations?user=x05pUHsAAAAJ", "orcid": ";;;", "linkedin": "chenhui-deng-113b0b16a;;;", "or_profile": "~Chenhui_Deng1;~Xiuyu_Li1;~Zhuo_Feng3;~Zhiru_Zhang2", "aff": "Cornell University;University of California, Berkeley;;Cornell University", "aff_domain": "cornell.edu;berkeley.edu;;cornell.edu", "position": "PhD;PhD student;;Associate Professor", "bibtex": "@misc{\ndeng2022garnet,\ntitle={{GARNET}: A Spectral Approach to Robust and Scalable Graph Neural Networks},\nauthor={Chenhui Deng and Xiuyu Li and Zhuo Feng and Zhiru Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=UeRmyymo3kb}\n}", "github": "", "project": "", "reviewers": "HF71;F9Qe;EL8y", "site": "https://openreview.net/forum?id=UeRmyymo3kb", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;4", "wc_summary_paper": "46;77;101", "wc_summary_review": "36;25;40", "wc_main_review": "371;425;358", "wc_review": "453;527;499", "wc_reply_reviewers": "212;270;83", "wc_reply_authors": "1025;853;404", "reply_reviewers": "2;1;1", "reply_authors": "3;2;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 74.66666666666667, 22.51419305435771 ], 
"wc_summary_review_avg": [ 33.666666666666664, 6.342099196813483 ], "wc_main_review_avg": [ 384.6666666666667, 29.00957696271277 ], "wc_review_avg": [ 493.0, 30.506829836393468 ], "wc_reply_reviewers_avg": [ 188.33333333333334, 78.15511641743119 ], "wc_reply_authors_avg": [ 760.6666666666666, 261.79423641902855 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qD0kJLU3PuMJ:scholar.google.com/&scioq=GARNET:+A+Spectral+Approach+to+Robust+and+Scalable+Graph+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Cornell University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.berkeley.edu", "aff_unique_abbr": "Cornell;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Finding an Unsupervised Image Segmenter in each of your Deep Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6368", "id": "Ug-bgjgSlKV", "poster": "", "openreview": "https://openreview.net/forum?id=Ug-bgjgSlKV", "slides": "https://iclr.cc/virtual/2022/poster/6368", "video": "https://iclr.cc/virtual/2022/poster/6368", "author_site": "Luke Melas-Kyriazi, Christian Rupprecht, Iro Laina, Andrea Vedaldi", "tldr": "", "abstract": "Recent research has shown that numerous human-interpretable directions exist in the latent space of GANs. In this paper, we develop an automatic procedure for finding directions that lead to foreground-background image separation, and we use these directions to train an image segmentation model without human supervision. Our method is generator-agnostic, producing strong segmentation results with a wide range of different GAN architectures. Furthermore, by leveraging GANs pretrained on large datasets such as ImageNet, we are able to segment images from a range of domains without further training or finetuning. Evaluating our method on image segmentation benchmarks, we compare favorably to prior work while using neither human supervision nor access to the training data. 
Broadly, our results demonstrate that automatically extracting foreground-background structure from pretrained deep generative models can serve as a remarkably effective substitute for human supervision.", "keywords": "unsupervised;generative;deep learning;segmentation;object segmentation", "primary_area": "", "supplementary_material": "/attachment/6fd52cbfdf689282af64c262e039f69c270fc1d3.zip", "author": "Luke Melas-Kyriazi;Christian Rupprecht;Iro Laina;Andrea Vedaldi", "authorids": "~Luke_Melas-Kyriazi1;~Christian_Rupprecht1;~Iro_Laina1;~Andrea_Vedaldi1", "gender": "M;M;M;", "homepage": "https://lukemelas.github.io/;http://chrirupp.github.io;https://www.robots.ox.ac.uk/~vedaldi/;", "dblp": "228/5680;https://dblp.uni-trier.de/pid/76/744-1;99/2825;182/2070", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.de/citations?user=IrYlproAAAAJ;bRT7t28AAAAJ;n9nXAPcAAAAJ", "orcid": ";;0000-0003-1374-2858;0000-0001-8857-7709", "linkedin": ";;;", "or_profile": "~Luke_Melas-Kyriazi1;~Christian_Rupprecht1;~Andrea_Vedaldi1;~Iro_Laina2", "aff": "University of Oxford, University of Oxford;University of Oxford;Meta;University of Oxford", "aff_domain": "robots.ox.ac.uk;ox.ac.uk;meta.com;ox.ac.uk", "position": "PhD student;Lecturer;Researcher;Postdoc", "bibtex": "@inproceedings{\nmelas-kyriazi2022finding,\ntitle={Finding an Unsupervised Image Segmenter in each of your Deep Generative Models},\nauthor={Luke Melas-Kyriazi and Christian Rupprecht and Iro Laina and Andrea Vedaldi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ug-bgjgSlKV}\n}", "github": "", "project": "", "reviewers": "JFoE;zTVT;XSNi;YWmx", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "97;67;76;106", "wc_summary_review": "75;33;46;79", "wc_main_review": "126;258;339;141", "wc_review": "298;358;461;326", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "645;1037;814;484", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 86.5, 15.660459763365825 ], "wc_summary_review_avg": [ 58.25, 19.356846334049358 ], "wc_main_review_avg": [ 216.0, 87.48999942850611 ], "wc_review_avg": [ 360.75, 61.64971613884366 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 745.0, 205.02804686188668 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15465667626420403073&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=Ug-bgjgSlKV", "email": "robots.ox.ac.uk;ox.ac.uk;meta.com;ox.ac.uk", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Oxford;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.ox.ac.uk;https://meta.com", "aff_unique_abbr": "Oxford;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": 
"United Kingdom;United States" }, { "id": "UgBo_nhiHl", "title": "Gradient Boosting Neural Networks: GrowNet", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A novel gradient boosting framework is proposed where shallow neural networks are employed as ``weak learners''. General loss functions are considered under this unified framework with specific examples presented for classification, regression and learning to rank. A fully corrective step is incorporated to remedy the pitfall of the greedy function approximation of the classic gradient boosting decision tree. The proposed model rendered outperforming results against state-of-the-art boosting methods in all three tasks on multiple datasets. An ablation study is performed to shed light on the effect of each model component and model hyperparameters.", "keywords": "Deep Neural Networks;Gradient Boosting classifiers;NN architecture optimization", "primary_area": "", "supplementary_material": "/attachment/a50da14a1b0114fc2ae6877bc848200e23a375c2.zip", "author": "Sarkhan Badirli;Xuanqing Liu;Zhengming Xing;Avradeep Bhowmik;Khoa D Doan;Sathiya Keerthi", "authorids": "~Sarkhan_Badirli1;~Xuanqing_Liu1;~Zhengming_Xing1;~Avradeep_Bhowmik1;~Khoa_D_Doan1;~Sathiya_Keerthi1", "gender": "M;M;M;M;M;", "homepage": "https://github.com/sbadirli;;https://zmxing.github.io;;http://www.keerthis.com;https://khoadoan.me", "dblp": ";205/2594;https://dblp.uni-trier.de/pers/hd/x/Xing:Zhengming;147/2054;;238/4276.html", "google_scholar": "Mkaq4VYAAAAJ;;oGCF8pEAAAAJ;;Sr7jln4AAAAJ;Zz2hMgcAAAAJ", "orcid": ";;;;;", "linkedin": "sarkhan-badirli-08a5bb85/;;;;;", "or_profile": "~Sarkhan_Badirli1;~Xuanqing_Liu1;~Zhengming_Xing1;~Avradeep_Bhowmik1;~Sathiya_Keerthi1;~Khoa_Doan1", "aff": ";Amazon;Criteo;;;VinUniversity", "aff_domain": ";amazon.com;criteo.com;;;vinuni.edu.vn", "position": ";Researcher;Research Scientist;;;Assistant Professor", "bibtex": "@misc{\nbadirli2022gradient,\ntitle={Gradient Boosting Neural Networks: GrowNet},\nauthor={Sarkhan Badirli and Xuanqing Liu and Zhengming Xing and Avradeep Bhowmik and Khoa D Doan and Sathiya Keerthi},\nyear={2022},\nurl={https://openreview.net/forum?id=UgBo_nhiHl}\n}", "github": "", "project": "", "reviewers": "VoQC;xoDb;YkVQ;mPg7", "site": "https://openreview.net/forum?id=UgBo_nhiHl", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;5;4", "correctness": "3;2;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "59;67;55;61", "wc_summary_review": "25;50;12;99", "wc_main_review": "371;325;137;470", "wc_review": "455;442;204;630", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 60.5, 4.330127018922194 ], "wc_summary_review_avg": [ 46.5, 33.24530041975858 ], "wc_main_review_avg": [ 325.75, 120.9160349167967 ], "wc_review_avg": [ 432.75, 151.50474415014205 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12043421628759186206&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Amazon;Criteo;VinUniversity", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.criteo.com;https://vinuni.edu.vn", "aff_unique_abbr": "Amazon;Criteo;VinUni", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;France;Vietnam" }, { "id": "UgNQM-LcVpN", "title": "A Modulation Layer to Increase Neural Network Robustness Against Data Quality Issues", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data quality is a common problem in machine learning, especially in high-stakes settings such as healthcare. Missing data affects accuracy, calibration, and feature attribution in complex patterns. Developers often train models on carefully curated datasets to minimize missing data bias; however, this reduces the usability of such models in production environments, such as real-time healthcare records. Making machine learning models robust to missing data is therefore crucial for practical application. While some classifiers naturally handle missing data, others, such as deep neural networks, are not designed for unknown values. We propose a novel neural network modification to mitigate the impacts of missing data. The approach is inspired by neuromodulation that is performed by biological neural networks. Our proposal replaces the fixed weights of a fully-connected layer with a function of an additional input (reliability score) at each input, mimicking the ability of cortex to up- and down-weight inputs based on the presence of other data. The modulation function is jointly learned with the main task using a multi-layer perceptron. We tested our modulating fully connected layer on multiple classification, regression, and imputation problems, and it either improved performance or generated comparable performance to conventional neural network architectures concatenating reliability to the inputs. Models with modulating layers were more robust against degradation of data quality by introducing additional missingness at evaluation time. These results suggest that explicitly accounting for reduced information quality with a modulating fully connected layer can enable the deployment of artificial intelligence systems in real-time settings.\n", "keywords": "missing data;modulation;DNN layer;neuromodulation;robustness", "primary_area": "", "supplementary_material": "", "author": "Mohamed Abdelhack;Jiaming Zhang;Sandhya Tripathi;Bradley A Fritz;Michael Avidan;Yixin Chen;Christopher Ryan King", "authorids": "~Mohamed_Abdelhack1;~Jiaming_Zhang2;~Sandhya_Tripathi1;~Bradley_A_Fritz1;~Michael_Avidan1;~Yixin_Chen1;~Christopher_Ryan_King1", "gender": "M;M;;M;;M;M", "homepage": "https://mabdelhack.github.io/;;;;;https://www.cse.wustl.edu/~yixin.chen/;", "dblp": "261/9549;;214/7186;;;59/983;", "google_scholar": "S1TWer4AAAAJ;;gww9XxIAAAAJ;;;NByrsK0AAAAJ;", "orcid": "0000-0002-6753-3237;;;0000-0002-7239-8877;;;0000-0002-4574-8616", "linkedin": "mabdelhack;http://www.linkedin.com/in/jiaming-zhang-a371a2152;;;;;", "or_profile": "~Mohamed_Abdelhack1;~Jiaming_Zhang2;~Sandhya_Tripathi1;~Bradley_A_Fritz1;~Michael_Avidan1;~Yixin_Chen1;~Christopher_Ryan_King1", "aff": "Centre for Addiction and Mental Health;;;Washington University, St. Louis;Washington University;Washington University, Saint Louis;Washington University, St. 
Louis", "aff_domain": "camh.ca;;;wustl.edu;;wustl.edu;wustl.edu", "position": "Postdoc;;;Instructor;;Full Professor;Assistant Professor", "bibtex": "@misc{\nabdelhack2022a,\ntitle={A Modulation Layer to Increase Neural Network Robustness Against Data Quality Issues},\nauthor={Mohamed Abdelhack and Jiaming Zhang and Sandhya Tripathi and Bradley A Fritz and Michael Avidan and Yixin Chen and Christopher Ryan King},\nyear={2022},\nurl={https://openreview.net/forum?id=UgNQM-LcVpN}\n}", "github": "", "project": "", "reviewers": "g2de;P1JV;4EPR;fZmE", "site": "https://openreview.net/forum?id=UgNQM-LcVpN", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;3;5;4", "correctness": "3;3;1;2", "technical_novelty": "2;2;2;1", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "93;98;46;65", "wc_summary_review": "33;54;26;83", "wc_main_review": "611;240;110;167", "wc_review": "737;392;182;315", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.5, 21.17191535974013 ], "wc_summary_review_avg": [ 49.0, 22.169799277395363 ], "wc_main_review_avg": [ 282.0, 195.45715643076363 ], "wc_review_avg": [ 406.5, 205.06889086353397 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=302956930628889344&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Centre for Addiction and Mental Health;Washington University in St. Louis", "aff_unique_dep": ";", "aff_unique_url": "https://camh.ca;https://wustl.edu", "aff_unique_abbr": "CAMH;WUSTL", "aff_campus_unique_index": "1;1;2;1", "aff_campus_unique": ";St. Louis;Saint Louis", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Canada;United States" }, { "id": "UjynxfqnGWG", "title": "Inductive Biases and Variable Creation in Self-Attention Mechanisms", "track": "main", "status": "Reject", "tldr": "", "abstract": "Self-attention, an architectural motif designed to model long-range interactions in sequential data, has driven numerous recent breakthroughs in natural language processing and beyond. This work provides a theoretical analysis of the inductive biases of self-attention modules, where our focus is to rigorously establish which functions and long-range dependencies self-attention blocks prefer to represent. We show that bounded-norm Transformer layers create sparse variables: they can represent sparse Lipschitz functions of the input sequence, with sample complexity scaling only logarithmically with the context length. We propose new experimental protocols to support the analysis and guide the practice of training Transformers, built around the rich theory of learning sparse Boolean functions.", "keywords": "transformers;attention", "primary_area": "", "supplementary_material": "", "author": "Benjamin L. Edelman;Surbhi Goel;Sham M. 
Kakade;Cyril Zhang", "authorids": "~Benjamin_L._Edelman1;~Surbhi_Goel1;~Sham_M._Kakade1;~Cyril_Zhang1", "gender": "F;M;;M", "homepage": "https://www.surbhigoel.com;https://shamulent.github.io;https://cyrilzhang.com;https://www.benjaminedelman.com/", "dblp": "190/7815;s/SMKakade;203/4448;241/9410", "google_scholar": "https://scholar.google.co.in/citations?user=Zqz4CQoAAAAJ;https://scholar.google.com.tw/citations?user=wb-DKCIAAAAJ;sXtjq8IAAAAJ;mQSj2C0AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Surbhi_Goel1;~Sham_M._Kakade1;~Cyril_Zhang1;~Benjamin_L_Edelman1", "aff": "Microsoft Research;Harvard University;Microsoft;Harvard University", "aff_domain": "microsoft.com;harvard.edu;microsoft.com;harvard.edu", "position": "Postdoc;Full Professor;Senior Researcher;PhD student", "bibtex": "@misc{\nedelman2022inductive,\ntitle={Inductive Biases and Variable Creation in Self-Attention Mechanisms},\nauthor={Benjamin L. Edelman and Surbhi Goel and Sham M. Kakade and Cyril Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=UjynxfqnGWG}\n}", "github": "", "project": "", "reviewers": "yc2n;YFzD;k53Q;8Th6", "site": "https://openreview.net/forum?id=UjynxfqnGWG", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "3;4;3;4", "correctness": "3;4;3;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "89;41;96;77", "wc_summary_review": "52;25;25;59", "wc_main_review": "252;145;177;720", "wc_review": "393;211;298;856", "wc_reply_reviewers": "0;206;24;0", "wc_reply_authors": "532;829;282;780", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.75, 21.182244923520265 ], "wc_summary_review_avg": [ 40.25, 15.449514555480375 ], "wc_main_review_avg": [ 323.5, 232.1901160687078 ], "wc_review_avg": [ 439.5, 248.93221969042094 ], "wc_reply_reviewers_avg": [ 57.5, 86.29455370995322 ], "wc_reply_authors_avg": [ 605.75, 218.2067540201265 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5547001962252291, "corr_recommendation_correctness": 0.5547001962252291, "gs_citation": 158, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1755481428186925773&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Microsoft;Harvard University", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.harvard.edu", "aff_unique_abbr": "MSR;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "UkgBSwjxwe", "title": "Neuro-Symbolic Forward Reasoning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reasoning is an essential part of human intelligence and thus has been a long-standing goal in artificial intelligence research. With the recent success of deep learning, incorporating reasoning with deep learning systems i.e. neuro-symbolic AI has become a major field of interest. We propose Neuro-Symbolic Forward Reasoner (NS-FR), a new approach for reasoning tasks taking advantage of differentiable forward-chaining using first-order logic. 
The key idea is to combine differentiable forward-chaining reasoning with object-centric learning. Differentiable forward-chaining reasoning computes logical entailments smoothly, i.e., it deduces new facts from given facts and rules in a differentiable manner. The object-centric learning approach factorizes raw inputs into representations in terms of objects. This allows us to provide a consistent framework to perform the forward-chaining inference from raw inputs. NS-FR factorizes the raw inputs into the object-centric representations, then converts them into probabilistic ground atoms and finally performs differentiable forward-chaining inference using weighted rules. Our comprehensive experimental evaluations on object-centric reasoning data sets, 2D Kandinsky patterns and 3D CLEVR-Hans, and a variety of tasks show the effectiveness and advantage of our approach.", "keywords": "neuro-symbolic AI;differentiable logic;object-centric reasoning", "primary_area": "", "supplementary_material": "/attachment/5db9f7ec27358c3b38845595d89fcee021076b59.zip", "author": "Hikaru Shindo;Devendra Singh Dhami;Kristian Kersting", "authorids": "~Hikaru_Shindo1;~Devendra_Singh_Dhami1;~Kristian_Kersting1", "gender": "M;M;M", "homepage": "https://www.hikarushindo.com/;https://sites.google.com/view/devendradhami;http://www.ml.informatik.tu-darmstadt.de/", "dblp": "227/1466;201/2130;40/3793", "google_scholar": "Ws03zBoAAAAJ;aVlaHfkAAAAJ;QY-earAAAAAJ", "orcid": ";;0000-0002-2873-9152", "linkedin": "hkrsnd;;", "or_profile": "~Hikaru_Shindo1;~Devendra_Singh_Dhami1;~Kristian_Kersting1", "aff": "TU Darmstadt;CS Department, TU Darmstadt, TU Darmstadt;TU Darmstadt", "aff_domain": "tu-darmstadt.de;cs.tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;Postdoctoral researcher;Full Professor", "bibtex": "@misc{\nshindo2022neurosymbolic,\ntitle={Neuro-Symbolic Forward Reasoning},\nauthor={Hikaru Shindo and Devendra Singh Dhami and Kristian Kersting},\nyear={2022},\nurl={https://openreview.net/forum?id=UkgBSwjxwe}\n}", "github": "", "project": "", "reviewers": "Jwkf;ZLoi;hJBE;iyKK", "site": "https://openreview.net/forum?id=UkgBSwjxwe", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;4;4", "correctness": "2;4;2;4", "technical_novelty": "2;2;3;1", "empirical_novelty": "2;1;0;1", "wc_summary_paper": "54;19;175;73", "wc_summary_review": "72;15;118;64", "wc_main_review": "821;250;762;380", "wc_review": "947;284;1055;517", "wc_reply_reviewers": "170;0;229;0", "wc_reply_authors": "1376;815;2761;842", "reply_reviewers": "1;0;1;0", "reply_authors": "3;2;4;2", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 80.25, 58.03178008643195 ], "wc_summary_review_avg": [ 67.25, 36.533375152044194 ], "wc_main_review_avg": [ 553.25, 243.5378564001909 ], "wc_review_avg": [ 700.75, 313.67847790372866 ], "wc_reply_reviewers_avg": [ 99.75, 101.90774013783252 ], "wc_reply_authors_avg": [ 1448.5, 790.10711299165 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13219309705614054654&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darmstadt;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "Ul3o26VB6KZ", "title": "Spiking Graph Convolutional Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph Convolutional Networks (GCNs) achieve an impressive performance due to the remarkable representation ability in learning the graph information. However, GCNs, when implemented on a deep network, require expensive computation power, which makes them difficult to be deployed on battery-powered devices. In contrast, Spiking Neural Networks (SNNs), which perform a bio-fidelity inference process, offer an energy-efficient neural architecture. In this work, we propose SpikingGCN, an end-to-end framework that aims to integrate the embedding of GCNs with the biofidelity characteristics of SNNs. In particular, the original graph data are encoded into spike trains based on the incorporation of graph convolution. We further model biological information processing by utilizing a fully connected layer combined with neuron nodes. In a wide range of scenarios, including citation networks, image graph classification, and recommender systems, our experimental results show that the proposed method could gain competitive performance against state-of-art (SOTA) approaches. Furthermore, we show that SpikingGCN on a neuromorphic chip can bring a clear advantage of energy efficiency into graph data analysis, which demonstrates its great potential to construct environment-friendly machine learning models.", "keywords": "Graph;spike;energy;neural network", "primary_area": "", "supplementary_material": "", "author": "Zulun Zhu;Jiaying Peng;Jintang Li;Liang Chen;Qi Yu;Zibin Zheng", "authorids": "~Zulun_Zhu1;pengjy36@mail2.sysu.edu.cn;lijt55@mail2.sysu.edu.cn;chenliang6@mail.sysu.edu.cn;~Qi_Yu1;zbzheng@inpluslab.com", "gender": "M;;;;M;", "homepage": "https://zulunzhu.github.io/;;;;https://www.rit.edu/mining/;", "dblp": "219/4427;;;;58/6957-1;", "google_scholar": ";;;;L3gWdfEAAAAJ;", "orcid": "0000-0002-5176-6378;;;;0000-0002-0426-5407;", "linkedin": ";;;;;", "or_profile": "~Zulun_Zhu1;pengjy36@mail2.sysu.edu.cn;lijt55@mail2.sysu.edu.cn;chenliang6@mail.sysu.edu.cn;~Qi_Yu1;zbzheng@inpluslab.com", "aff": "Sun Yat-Sen University;;;;Rochester Institute of Technology;", "aff_domain": "sysu.edu;;;;rit.edu;", "position": "Researcher;;;;Professor;", "bibtex": "@misc{\nzhu2022spiking,\ntitle={Spiking Graph Convolutional Networks},\nauthor={Zulun Zhu and Jiaying Peng and Jintang Li and Liang Chen and Qi Yu and Zibin Zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=Ul3o26VB6KZ}\n}", "github": "", "project": "", "reviewers": "GoPN;Ko6H;cCCF;M75L", "site": "https://openreview.net/forum?id=Ul3o26VB6KZ", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;2;3;4", "correctness": "2;1;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "107;60;55;15", "wc_summary_review": "113;20;38;29", "wc_main_review": "457;116;244;207", "wc_review": "677;196;337;251", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.25, 
0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 59.25, 32.62188682464581 ], "wc_summary_review_avg": [ 50.0, 36.92560087527351 ], "wc_main_review_avg": [ 256.0, 125.04599153911332 ], "wc_review_avg": [ 365.25, 186.87211536235148 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5321589456296045729&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Sun Yat-sen University;Rochester Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.rit.edu", "aff_unique_abbr": "SYSU;RIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "Ulj0tR-k7q", "title": "On strong convergence of the two-tower model for recommender system", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "A recommender system is capable of predicting preferred items for a user by integrating information from similar users or items. A popular model in recommender systems is the so-called two-tower model, which employs two deep neural networks to embed users and items into a low-dimensional space, and predicts ratings via the geometrical relationship of the embeddings of user and item in the embedded space. Even though it is widely used for recommendations, its theoretical properties remain largely unknown. In this paper, we establish some asymptotic results of the two-tower model in terms of its strong convergence to the optimal recommender system, showing that it achieves a fast convergence rate depending on the intrinsic dimensions of input features. To the best of our knowledge, this is among the first attempts to establish the statistical guarantee of the two-tower model. Through numerical experiments, we also demonstrate that the two-tower model is capable of capturing the effects of users' and items' features on ratings, leading to higher prediction accuracy over its competitors in both simulated examples and a real application data set. 
", "keywords": "Artificial neural networks;Collaborative filtering;Empirical process;Recommender system;Two-tower model", "primary_area": "", "supplementary_material": "/attachment/b35254e6171f4f699c2e06cc859ea5218bb1c348.zip", "author": "SHIRONG XU;Junhui Wang", "authorids": "~SHIRONG_XU1;~Junhui_Wang3", "gender": "M;", "homepage": "https://scholars.cityu.edu.hk/en/persons/shirong-xu(b5961dbc-abb0-4dc7-87fe-1e6b5839fefc).html;https://sites.google.com/site/junhuiwang", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~SHIRONG_XU1;~Junhui_Wang3", "aff": ";City University of Hong Kong", "aff_domain": ";cityu.edu.hk", "position": ";Full Professor", "bibtex": "@misc{\nxu2022on,\ntitle={On strong convergence of the two-tower model for recommender system},\nauthor={SHIRONG XU and Junhui Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=Ulj0tR-k7q}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=Ulj0tR-k7q", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7481773551127368222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "City University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cityu.edu.hk", "aff_unique_abbr": "CityU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "UoNqm70g9HY", "title": "Equivalent Distance Geometry Error for Molecular Conformation Comparison", "track": "main", "status": "Reject", "tldr": "", "abstract": "\\textit{Straight-forward} conformation generation models, which generate 3-D structures directly from input molecular graphs, play an important role in various molecular tasks with machine learning, such as 3D-QSAR and virtual screening in drug design. However, existing loss functions in these models either cost overmuch time or fail to guarantee the equivalence during optimization, which means treating different items unfairly, resulting in poor local geometry in generated conformation. So, we propose \\textbf{E}quivalent \\textbf{D}istance \\textbf{G}eometry \\textbf{E}rror (EDGE) to calculate the differential discrepancy between conformations where the essential factors of three kinds in conformation geometry (i.e. bond lengths, bond angles and dihedral angles) are equivalently optimized with certain weights. And in the improved version of our method, the optimization features minimizing linear transformations of atom-pair distances within 3-hop. 
Extensive experiments show that, compared with existing loss functions, EDGE performs effectively and efficiently in two tasks under the same backbones.", "keywords": "molecule;molecular conformation;loss function", "primary_area": "", "supplementary_material": "/attachment/10534ea9da3f59f273b10ea8ffa9da53a7886e55.zip", "author": "Shuwen Yang;Tianyu Wen;Ziyao Li;Guojie Song", "authorids": "~Shuwen_Yang1;~Tianyu_Wen1;~Ziyao_Li1;~Guojie_Song1", "gender": "M;M;M;M", "homepage": ";;;http://sai.pku.edu.cn/info/1022/2212.htm", "dblp": ";163/5023;230/4058;37/2900", "google_scholar": "mGpZECcAAAAJ;UCRUPCEAAAAJ;KzJYwbMAAAAJ;https://scholar.google.com.tw/citations?user=a832IIMAAAAJ", "orcid": "0009-0008-1358-9594;0000-0001-8653-2187;;0000-0001-8295-2520", "linkedin": ";tianyu-wen-54512b273/en;ziyao-li-3a4594146/;", "or_profile": "~Shuwen_Yang1;~Tianyu_Wen1;~Ziyao_Li1;~Guojie_Song1", "aff": "Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "MS student;Undergrad student;PhD student;Associate Professor", "bibtex": "@misc{\nyang2022equivalent,\ntitle={Equivalent Distance Geometry Error for Molecular Conformation Comparison},\nauthor={Shuwen Yang and Tianyu Wen and Ziyao Li and Guojie Song},\nyear={2022},\nurl={https://openreview.net/forum?id=UoNqm70g9HY}\n}", "github": "", "project": "", "reviewers": "AM3U;fdL6;AFf9;vUYf", "site": "https://openreview.net/forum?id=UoNqm70g9HY", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;4;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "74;92;113;50", "wc_summary_review": "48;36;52;15", "wc_main_review": "318;320;139;87", "wc_review": "440;448;304;152", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 82.25, 23.177305710543667 ], "wc_summary_review_avg": [ 37.75, 14.394009170484782 ], "wc_main_review_avg": [ 216.0, 104.6303015383211 ], "wc_review_avg": [ 336.0, 120.66482503198685 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W8fHFyi1UMkJ:scholar.google.com/&scioq=Equivalent+Distance+Geometry+Error+for+Molecular+Conformation+Comparison&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "Uozyxz3eKY", "title": "DM-CT: Consistency Training with Data and Model Perturbation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Consistency training has been widely adopted and shown great promise in deep learning. 
The common approach to consistency training is performed at the data level, which typically utilizes the data augmentation strategy (or adversarial training) to make the predictions from the augmented input and the original input consistent, so that the model is more robust and attains better generalization ability. Recently, consistency training has also been incorporated at the model level, in which the randomness existing in the model (e.g., dropout) is constrained during the training stage, and the inference model can be more consistent with the training phase. In this work, we investigate these two aspects and propose an integrated framework, DM-CT, that incorporates both the data-level and model-level consistency training. Concretely, the input data is first augmented, and the output distributions of different sub-models generated by model variance are forced to be consistent (model-level). Meanwhile, the predictions of the original input and the augmented one are constrained to be consistent (data-level). We study different data augmentation strategies and model variances in the DM-CT framework. Experiments on different tasks, including neural machine translation ($4$ IWSLT14 translation tasks, a multilingual translation task, and WMT16 Romanian$\\to$English translation), natural language understanding (GLUE benchmark), and image classification (CIFAR-100 dataset), clearly demonstrate the superiority of DM-CT by obtaining significant and consistent performance improvements. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "xiaobo liang;Runze Mao;Lijun Wu;Juntao Li;Weiqing Liu;Qing Li;Min Zhang", "authorids": "~xiaobo_liang1;~Runze_Mao1;~Lijun_Wu1;~Juntao_Li2;~Weiqing_Liu1;~Qing_Li5;~Min_Zhang1", "gender": "M;M;M;M;;M;", "homepage": ";;https://apeterswu.github.io/;https://lijuntaopku.github.io/;;https://www4.comp.polyu.edu.hk/~csqli/;", "dblp": ";;68/1284-3;;;(2024-11-14-1812689);83/5342", "google_scholar": ";;https://scholar.google.com/citations?hl=en;sZSygsYAAAAJ;;https://scholar.google.co.in/citations?user=D1LEg-YAAAAJ;", "orcid": "0009-0001-1550-2877;0000-0002-1555-2436;0000-0002-3530-590X;0000-0002-6286-7529;;0000-0003-3370-471X;", "linkedin": ";;lijun-wu-59340478/;;weiqing-liu-09646b91/;;", "or_profile": "~xiaobo_liang1;~Runze_Mao1;~Lijun_Wu1;~Juntao_Li2;~Weiqing_Liu1;~Qing_Li5;~Min_Zhang1", "aff": "Soochow University, China;City University of Hong Kong;Microsoft Research;Soochow University, China;;Hong Kong Polytechnic University;", "aff_domain": "suda.edu.cn;cityu.edu.hk;microsoft.com;suda.edu.cn;;polyu.edu.hk;", "position": "PhD student;PhD student;Researcher;Associate Professor;;Full Professor;", "bibtex": "@misc{\nliang2022dmct,\ntitle={{DM}-{CT}: Consistency Training with Data and Model Perturbation},\nauthor={xiaobo liang and Runze Mao and Lijun Wu and Juntao Li and Weiqing Liu and Qing Li and Min Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=Uozyxz3eKY}\n}", "github": "", "project": "", "reviewers": "tubW;K43Q;X8YZ;499Q", "site": "https://openreview.net/forum?id=Uozyxz3eKY", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;4", "correctness": "4;2;4;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "61;67;42;52", "wc_summary_review": "67;67;40;28", "wc_main_review": "147;447;180;100", "wc_review": "275;581;262;180", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 
0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 55.5, 9.447221813845593 ], "wc_summary_review_avg": [ 50.5, 17.03672503740082 ], "wc_main_review_avg": [ 218.5, 134.95276951585691 ], "wc_review_avg": [ 324.5, 152.50327865328012 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fUOssKi1eQMJ:scholar.google.com/&scioq=DM-CT:+Consistency+Training+with+Data+and+Model+Perturbation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Soochow University;City University of Hong Kong;Microsoft;Hong Kong Polytechnic University", "aff_unique_dep": ";;Microsoft Research;", "aff_unique_url": "https://www.soochow.edu.cn;https://www.cityu.edu.hk;https://www.microsoft.com/en-us/research;https://www.polyu.edu.hk", "aff_unique_abbr": "Soochow U;CityU;MSR;PolyU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "UquMPXFTpgp", "title": "Cluster Tree for Nearest Neighbor Search", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Tree-based algorithms are an important and widely used class of algorithms for Nearest Neighbor Search (NNS), with the random partition (RP) tree being arguably the most well-studied. However, in spite of possessing theoretical guarantees and strong practical performance, a major drawback of the RP tree is its lack of adaptability to the input dataset.\n\nInspired by recent theoretical and practical works for NNS, we attempt to remedy this by introducing ClusterTree, a new tree-based algorithm. Our approach utilizes randomness as in RP trees while adapting to the underlying cluster structure of the dataset to create well-balanced and meaningful partitions. Experimental evaluations on real-world datasets demonstrate improvements over RP trees and other tree-based methods for NNS while maintaining efficient construction time. 
In addition, we show theoretically and empirically that ClusterTree finds partitions which are superior to those found by RP trees in preserving the cluster structure of the input dataset.", "keywords": "Nearest neighbor search;tree algorithms;graph cuts;random projections", "primary_area": "", "supplementary_material": "", "author": "Dan Kushnir;Sandeep Silwal", "authorids": "~Dan_Kushnir1;~Sandeep_Silwal1", "gender": "M;M", "homepage": ";https://sandeepsilwal.com", "dblp": "87/231;225/4637", "google_scholar": ";MnDnUvcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Dan_Kushnir1;~Sandeep_Silwal1", "aff": "Nokia networks GmbH;Massachusetts Institute of Technology", "aff_domain": "nokia-bell-labs.com;mit.edu", "position": "Researcher;PhD student", "bibtex": "@misc{\nkushnir2022cluster,\ntitle={Cluster Tree for Nearest Neighbor Search},\nauthor={Dan Kushnir and Sandeep Silwal},\nyear={2022},\nurl={https://openreview.net/forum?id=UquMPXFTpgp}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=UquMPXFTpgp", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1", "aff_unique_norm": "Nokia Networks;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://networks.nokia.com;https://web.mit.edu", "aff_unique_abbr": "Nokia;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;United States" }, { "title": "MIDI-DDSP: Detailed Control of Musical Performance via Hierarchical Modeling", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6966", "id": "UseMOjWENv", "poster": "", "openreview": "https://openreview.net/forum?id=UseMOjWENv", "slides": "https://iclr.cc/virtual/2022/poster/6966", "video": "https://iclr.cc/virtual/2022/poster/6966", "author_site": "Yusong Wu, Ethan Manilow, Yi Deng, Rigel Swavely, Kyle Kastner, Timotheus Cooijmans, Aaron Courville, Anna Huang, Jesse Engel", "tldr": "", "abstract": "Musical expression requires control of both what notes are played and how they are performed. Conventional audio synthesizers provide detailed expressive controls, but at the cost of realism. Black-box neural audio synthesis and concatenative samplers can produce realistic audio, but have few mechanisms for control. In this work, we introduce MIDI-DDSP, a hierarchical model of musical instruments that enables both realistic neural audio synthesis and detailed user control. 
Starting from interpretable Differentiable Digital Signal Processing (DDSP) synthesis parameters, we infer musical notes and high-level properties of their expressive performance (such as timbre, vibrato, dynamics, and articulation). This creates a 3-level hierarchy (notes, performance, synthesis) that affords individuals the option to intervene at each level, or utilize trained priors (performance given notes, synthesis given performance) for creative assistance. Through quantitative experiments and listening tests, we demonstrate that this hierarchy can reconstruct high-fidelity audio, accurately predict performance attributes for a note sequence, independently manipulate the attributes of a given performance, and as a complete system, generate realistic audio from a novel note sequence. By utilizing an interpretable hierarchy, with multiple levels of granularity, MIDI-DDSP opens the door to assistive tools to empower individuals across a diverse range of musical experience.", "keywords": "Audio Synthesis;Generative Model;Hierarchical;DDSP;Music;Audio;Structured Models", "primary_area": "", "supplementary_material": "", "author": "Yusong Wu;Ethan Manilow;Yi Deng;Rigel Swavely;Kyle Kastner;Tim Cooijmans;Aaron Courville;Cheng-Zhi Anna Huang;Jesse Engel", "authorids": "~Yusong_Wu1;~Ethan_Manilow1;~Yi_Deng4;rigeljs@google.com;~Kyle_Kastner1;~Tim_Cooijmans1;~Aaron_Courville3;~Cheng-Zhi_Anna_Huang1;~Jesse_Engel1", "gender": "M;;;;Unspecified;M;;F;M", "homepage": "http://lukewys.github.io/;https://ethman.github.io/;;;;;;;", "dblp": "255/5686;210/6197;;;http://dblp.uni-trier.de/pers/hd/k/Kastner:Kyle;153/5756;56/1688;59/9006;", "google_scholar": ";;;;https://scholar.google.ca/citations?user=0XtGoMUAAAAJ;https://scholar.google.ca/citations?user=Ec6vKzwAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;NRz_EVgAAAAJ;Sc7qOfcAAAAJ", "orcid": ";;;;;;;;", "linkedin": ";;%E6%BC%AA-%E9%82%93-b4082318a/;;;;;;", "or_profile": "~Yusong_Wu1;~Ethan_Manilow1;~Yi_Deng4;rigeljs@google.com;~Kyle_Kastner1;~Tim_Cooijmans1;~Aaron_Courville3;~Cheng-Zhi_Anna_Huang1;~Jesse_Engel1", "aff": "University of Montreal, Mila;Northwestern University;New York University;;Universit\u00e9 de Montr\u00e9al;University of Montreal;Universit\u00e9 de Montr\u00e9al;Google;Google Brain", "aff_domain": "umontreal.ca;u.northwestern.edu;nyu.edu;;umontreal.ca;umontreal.ca; ;google.com;google.com", "position": "MS student;PhD student;MS student;;PhD student;PhD student;Assistant Professor;Researcher;Research Scientist", "bibtex": "@inproceedings{\nwu2022mididdsp,\ntitle={{MIDI}-{DDSP}: Detailed Control of Musical Performance via Hierarchical Modeling},\nauthor={Yusong Wu and Ethan Manilow and Yi Deng and Rigel Swavely and Kyle Kastner and Tim Cooijmans and Aaron Courville and Cheng-Zhi Anna Huang and Jesse Engel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=UseMOjWENv}\n}", "github": "", "project": "", "reviewers": "FkaH;Z9iV;6Ata", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;3;5", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "4;3;4", "wc_summary_paper": "136;112;33", "wc_summary_review": "85;39;23", "wc_main_review": "354;121;418", "wc_review": "575;272;474", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "465;385;505", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 
0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 93.66666666666667, 44.00252518006463 ], "wc_summary_review_avg": [ 49.0, 26.280537792569366 ], "wc_main_review_avg": [ 297.6666666666667, 127.62531967529885 ], "wc_review_avg": [ 440.3333333333333, 125.96913202138936 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 451.6666666666667, 49.88876515698589 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13729627625392909520&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=UseMOjWENv", "email": "umontreal.ca;u.northwestern.edu;nyu.edu;;umontreal.ca;umontreal.ca; ;google.com;google.com", "author_num": 9, "aff_unique_index": "0;1;2;3;0;3;4;4", "aff_unique_norm": "University of Montreal;Northwestern University;New York University;Universit\u00e9 de Montr\u00e9al;Google", "aff_unique_dep": "Mila;;;;Google", "aff_unique_url": "https://www.mila.quebec;https://www.northwestern.edu;https://www.nyu.edu;https://www.umontreal.ca;https://www.google.com", "aff_unique_abbr": "UM;NU;NYU;UdeM;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;0;0;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Measuring CLEVRness: Black-box Testing of Visual Reasoning Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6011", "id": "UtGtoS4CYU", "poster": "", "openreview": "https://openreview.net/forum?id=UtGtoS4CYU", "slides": "https://iclr.cc/virtual/2022/poster/6011", "video": "https://iclr.cc/virtual/2022/poster/6011", "author_site": "Spyridon Mouselinos, Henryk Michalewski, Mateusz Malinowski", "tldr": "", "abstract": "How can we measure the reasoning capabilities of intelligence systems? Visual question answering provides a convenient framework for testing the model's abilities by interrogating the model through questions about the scene. However, despite scores of various visual QA datasets and architectures, which sometimes yield even a super-human performance, the question of whether those architectures can actually reason remains open to debate.\nTo answer this, we extend the visual question answering framework and propose the following behavioral test in the form of a two-player game. We consider black-box neural models of CLEVR. These models are trained on a diagnostic dataset benchmarking reasoning. Next, we train an adversarial player that re-configures the scene to fool the CLEVR model. We show that CLEVR models, which otherwise could perform at a ``human-level'', can easily be fooled by our agent. Our results \nput in doubt whether data-driven approaches can do reasoning without exploiting the numerous biases that are often present in those datasets. 
Finally, we also propose a controlled experiment measuring the efficiency of such models to learn and perform reasoning.", "keywords": "Visual Reasoning;Visual Question Answering;Black Box Testing;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Spyridon Mouselinos;Henryk Michalewski;Mateusz Malinowski", "authorids": "~Spyridon_Mouselinos1;~Henryk_Michalewski1;~Mateusz_Malinowski1", "gender": "M;M;", "homepage": "https://spyrosmouselinos.github.io;https://www.mimuw.edu.pl/~henrykm/;http://mateuszmalinowski.com/", "dblp": ";https://dblp.uni-trier.de/pers/hd/m/Michalewski:Henryk;http://dblp.uni-trier.de/pers/hd/m/Malinowski:Mateusz", "google_scholar": "D6TDBuUAAAAJ;YdHW1ycAAAAJ;https://scholar.google.de/citations?user=IqJ3zskAAAAJ", "orcid": ";;", "linkedin": "spyridon-mouselinos/;henryk-michalewski-8a230a27/;", "or_profile": "~Spyridon_Mouselinos1;~Henryk_Michalewski1;~Mateusz_Malinowski1", "aff": "University of Warsaw;Google DeepMind;Google DeepMind", "aff_domain": "mimuw.edu.pl;google.com;deepmind.com", "position": "PhD student;Researcher;Research Scientist", "bibtex": "@inproceedings{\nmouselinos2022measuring,\ntitle={Measuring {CLEVR}ness: Black-box Testing of Visual Reasoning Models},\nauthor={Spyridon Mouselinos and Henryk Michalewski and Mateusz Malinowski},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=UtGtoS4CYU}\n}", "github": "", "project": "", "reviewers": "8VmJ;u1a9;ELDV", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "71;151;94", "wc_summary_review": "48;133;155", "wc_main_review": "156;277;1092", "wc_review": "275;561;1341", "wc_reply_reviewers": "231;21;83", "wc_reply_authors": "763;971;1691", "reply_reviewers": "1;1;1", "reply_authors": "2;2;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 105.33333333333333, 33.62869145371091 ], "wc_summary_review_avg": [ 112.0, 46.13747572924495 ], "wc_main_review_avg": [ 508.3333333333333, 415.66038487635024 ], "wc_review_avg": [ 725.6666666666666, 450.49996917005694 ], "wc_reply_reviewers_avg": [ 111.66666666666667, 88.09590733336532 ], "wc_reply_authors_avg": [ 1141.6666666666667, 397.6106414851371 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15061476273048645969&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=UtGtoS4CYU", "email": "mimuw.edu.pl;google.com;deepmind.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Warsaw;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.uw.edu.pl;https://deepmind.com", "aff_unique_abbr": "UW;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Poland;United Kingdom" }, { "id": "UvNXZgJAOAP", "title": "Sharp Attention for Sequence to Sequence Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Attention mechanism has been widely 
applied to tasks that output some sequence from an input image. Its success comes from the ability to align relevant parts of the encoded image with the target output. However, most of the existing methods fail to build a clear alignment because the aligned parts are unable to represent the target well. In this paper, we seek clear alignment in the attention mechanism through a \\emph{sharpener} module. Since it deliberately locates the target in an image region and refines the representation to be target-specific, the alignment and interpretability of attention can be significantly improved. Experiments on synthetic handwritten digit datasets as well as real-world scene text recognition datasets show that our approach outperforms the mainstream ones such as soft and hard attention.", "keywords": "Attention mechanism;sequence to sequence learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Pei Zhang;Hua Liu", "authorids": "~Pei_Zhang5;~Hua_Liu1", "gender": "M;F", "homepage": ";", "dblp": ";", "google_scholar": "_ztkIyIAAAAJ;", "orcid": ";", "linkedin": ";\u534e-\u5218-a199a5127", "or_profile": "~Pei_Zhang5;~Hua_Liu1", "aff": "Autohome Inc.;Autohome Inc.", "aff_domain": "autohome.com.cn;autohome.com.cn", "position": "Principal Researcher;Researcher", "bibtex": "@misc{\nzhang2022sharp,\ntitle={Sharp Attention for Sequence to Sequence Learning},\nauthor={Pei Zhang and Hua Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=UvNXZgJAOAP}\n}", "github": "", "project": "", "reviewers": "1RPC;mESb;aDJW;5Hay", "site": "https://openreview.net/forum?id=UvNXZgJAOAP", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "136;57;37;123", "wc_summary_review": "89;30;60;47", "wc_main_review": "626;147;181;332", "wc_review": "851;234;278;502", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.25, 42.10329559547566 ], "wc_summary_review_avg": [ 56.5, 21.569654610122992 ], "wc_main_review_avg": [ 321.5, 189.09058675671827 ], "wc_review_avg": [ 466.25, 244.2789133347371 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:THZDSRqOIq4J:scholar.google.com/&scioq=Sharp+Attention+for+Sequence+to+Sequence+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Autohome Inc.", "aff_unique_dep": "", "aff_unique_url": "https://www.autohome.com.cn", "aff_unique_abbr": "Autohome", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "UxBH9j8IE_H", "title": "Revisiting the Lottery Ticket Hypothesis: A Ramanujan Graph Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks often yield to weight pruning, resulting in a sparse subnetwork that
is adequate for a given task. Retraining these `lottery ticket' subnetworks from their initialization minimizes the computational burden while preserving the test set accuracy of the original network. To the best of our knowledge, the existing literature only confirms that pruning is needed and that it can be achieved up to a certain sparsity. We analyze the pruned network in the context of the properties of Ramanujan expander graphs. We consider the feed-forward network (both multi-layer perceptron and convolutional network) as a series of bipartite graphs which establish the connection from input to output. Now, as the fraction of remaining weights reduces with increasingly aggressive pruning, two distinct regimes are observed: initially, no significant decrease in accuracy is demonstrated, and then the accuracy starts dropping rapidly. We empirically show that in the first regime the pruned lottery ticket sub-network remains a Ramanujan graph. Subsequently, with the loss of the Ramanujan graph property, accuracy begins to reduce sharply. This characterizes an absence of resilient connectivity in the pruned sub-network. We also propose a new magnitude-based pruning algorithm to preserve the above property. We perform experiments on MNIST and CIFAR10 datasets using different established feed-forward architectures and show that the winning ticket obtained from the proposed algorithm is much more robust.", "keywords": "Deep Neural Networks;Network Pruning;Ramanujan Graphs;Eigenvalue bounds;Spectral Gap", "primary_area": "", "supplementary_material": "", "author": "BITHIKA PAL;Arindam Biswas;Pabitra Mitra;BISWAJIT BASU", "authorids": "~BITHIKA_PAL1;~Arindam_Biswas1;~Pabitra_Mitra1;~BISWAJIT_BASU1", "gender": "F;;M;M", "homepage": "https://sites.google.com/view/bithikapal/;;http://cse.iitkgp.ac.in/~pabitra/;https://people.tcd.ie/Profile?Username=basub", "dblp": "217/5181;;m/PabitraMitra;", "google_scholar": "x7-BaeQAAAAJ;;https://scholar.google.com.tw/citations?user=5bXSZPYAAAAJ;Gjeu9sgAAAAJ", "orcid": ";;0000-0002-1908-9813;", "linkedin": ";;pabitra-mitra-8028235/;", "or_profile": "~BITHIKA_PAL1;~Arindam_Biswas1;~Pabitra_Mitra1;~BISWAJIT_BASU1", "aff": "Indian Institute of Technology Kharagpur;;Indian Institute of Technology Kharagpur;Trinity College, Dublin", "aff_domain": "iitkgp.ac.in;;iitkgp.ac.in;tcd.ie", "position": "PhD student;;Full Professor;Full Professor", "bibtex": "@misc{\npal2022revisiting,\ntitle={Revisiting the Lottery Ticket Hypothesis: A Ramanujan Graph Perspective},\nauthor={BITHIKA PAL and Arindam Biswas and Pabitra Mitra and BISWAJIT BASU},\nyear={2022},\nurl={https://openreview.net/forum?id=UxBH9j8IE_H}\n}", "github": "", "project": "", "reviewers": "M3ko;u9fu;rGMS;GGCK", "site": "https://openreview.net/forum?id=UxBH9j8IE_H", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "79;62;25;116", "wc_summary_review": "20;25;61;28", "wc_main_review": "237;579;248;245", "wc_review": "336;666;334;389", "wc_reply_reviewers": "0;192;229;0", "wc_reply_authors": "587;2075;887;270", "reply_reviewers": "0;3;1;0", "reply_authors": "1;5;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 70.5, 32.72995569810628 ], "wc_summary_review_avg": [ 33.5,
16.132265804901678 ], "wc_main_review_avg": [ 327.25, 145.4035333133277 ], "wc_review_avg": [ 431.25, 137.31601326866434 ], "wc_reply_reviewers_avg": [ 105.25, 106.05982981317668 ], "wc_reply_authors_avg": [ 954.75, 682.5820005684299 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NRw7pSfUxuAJ:scholar.google.com/&scioq=Revisiting+the+Lottery+Ticket+Hypothesis:+A+Ramanujan+Graph+Perspective&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Indian Institute of Technology Kharagpur;Trinity College Dublin", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitkgp.ac.in;https://www.tcd.ie", "aff_unique_abbr": "IIT Kharagpur;TCD", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Kharagpur;Dublin", "aff_country_unique_index": "0;0;1", "aff_country_unique": "India;Ireland" }, { "id": "UxTR9Z2DW8R", "title": "Reinforcement Learning State Estimation for High-Dimensional Nonlinear Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "In high-dimensional nonlinear systems such as fluid flows, the design of state estimators such as Kalman filters relies on a reduced-order model (ROM) of the dynamics. However, ROMs are prone to large errors, which negatively affects the performance of the estimator. Here, we introduce the reinforcement learning reduced-order estimator (RL-ROE), a ROM-based estimator in which the data assimilation feedback term is given by a nonlinear stochastic policy trained through reinforcement learning. The flexibility of the nonlinear policy enables the RL-ROE to compensate for errors of the ROM, while still taking advantage of the imperfect knowledge of the dynamics. 
We show that the trained RL-ROE is able to outperform a Kalman filter designed using the same ROM, and displays robust estimation performance with respect to different reference trajectories and initial state estimates.", "keywords": "Reinforcement learning;partial differential equation;reduced order modeling;closure models;state prediction;state estimation;dynamic mode decomposition.", "primary_area": "", "supplementary_material": "", "author": "Saviz Mowlavi;Mouhacine Benosman;Saleh Nabi", "authorids": "~Saviz_Mowlavi1;~Mouhacine_Benosman1;~Saleh_Nabi1", "gender": ";M;", "homepage": ";;", "dblp": ";;192/3210", "google_scholar": "CHVIWXoAAAAJ;cs7AJxcAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Saviz_Mowlavi1;~Mouhacine_Benosman1;~Saleh_Nabi1", "aff": "Massachusetts Institute of Technology;Mitsubishi Electric Research Labs;Mitsubishi Electric Research Labs", "aff_domain": "mit.edu;merl.com;merl.com", "position": "PhD student;Researcher;Principal Research Scientist ", "bibtex": "@misc{\nmowlavi2022reinforcement,\ntitle={Reinforcement Learning State Estimation for High-Dimensional Nonlinear Systems},\nauthor={Saviz Mowlavi and Mouhacine Benosman and Saleh Nabi},\nyear={2022},\nurl={https://openreview.net/forum?id=UxTR9Z2DW8R}\n}", "github": "", "project": "", "reviewers": "1zd8;Cjmb;EDfb;WTxT", "site": "https://openreview.net/forum?id=UxTR9Z2DW8R", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;3;4;4", "correctness": "3;3;1;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "199;48;60;77", "wc_summary_review": "32;18;42;34", "wc_main_review": "334;102;252;366", "wc_review": "565;168;354;477", "wc_reply_reviewers": "427;0;0;0", "wc_reply_authors": "1436;695;629;808", "reply_reviewers": "3;0;0;0", "reply_authors": "4;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.0, 60.35312750802563 ], "wc_summary_review_avg": [ 31.5, 8.645808232895291 ], "wc_main_review_avg": [ 263.5, 102.0918703913294 ], "wc_review_avg": [ 391.0, 148.9714737793783 ], "wc_reply_reviewers_avg": [ 106.75, 184.89642370797765 ], "wc_reply_authors_avg": [ 892.0, 320.53470950897037 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49374193110101877, "corr_recommendation_correctness": 0.3083035200691658, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1062651888305221384&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Mitsubishi Electric Research Laboratories", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.merl.com", "aff_unique_abbr": "MIT;MERL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Uxppuphg5ZL", "title": "Constraint-based graph network simulator", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the rapidly advancing area of learned physical simulators, nearly all methods train a forward model that directly predicts future states from input states. 
However, many traditional simulation engines use a constraint-based approach instead of direct prediction. Here we present a framework for constraint-based learned simulation, where a scalar constraint function is implemented as a trainable function approximator, and future predictions are computed as the solutions to a constraint satisfaction problem. We implement our method using a graph neural network as the constraint function and gradient descent as the constraint solver. The architecture can be trained by standard backpropagation. We test the model on a variety of challenging physical domains, including simulated ropes, bouncing balls, colliding irregular shapes and splashing fluids. Our model achieves better or comparable performance to top learned simulators. A key advantage of our model is the ability to generalize to more solver iterations at test time to improve the simulation accuracy. We also show how hand-designed constraints can be added at test time to satisfy objectives which were not present in the training data, which is not possible with forward approaches. Our constraint-based framework is applicable to any setting in which forward learned simulators are used, and more generally demonstrates key ways that learned models can leverage popular techniques from numerical methods.", "keywords": "Physical simulations;graph neural network", "primary_area": "", "supplementary_material": "", "author": "Yulia Rubanova;Alvaro Sanchez-Gonzalez;Tobias Pfaff;Peter Battaglia", "authorids": "~Yulia_Rubanova2;~Alvaro_Sanchez-Gonzalez1;~Tobias_Pfaff1;~Peter_Battaglia1", "gender": "M;M;M;F", "homepage": ";http://tobiaspfaff.com;;https://yuliarubanova.github.io/", "dblp": "222/1889;67/7591;41/3400;222/3085", "google_scholar": "https://scholar.google.co.uk/citations?user=d1oQ8NcAAAAJ;3oUgDKQAAAAJ;https://scholar.google.co.uk/citations?user=nQ7Ij30AAAAJ;u_HzE9wAAAAJ", "orcid": ";;;", "linkedin": ";;;https://linkedin.com/in/yulia-rubanova-031702100", "or_profile": "~Alvaro_Sanchez-Gonzalez1;~Tobias_Pfaff1;~Peter_Battaglia1;~Yulia_Rubanova1", "aff": "Google DeepMind;Deepmind;Google DeepMind;Google DeepMind", "aff_domain": "google.com;google.com;google.com;deepmind.com", "position": "Senior Research Engineer;Research scientist;Researcher;Research Scientist", "bibtex": "@misc{\nrubanova2022constraintbased,\ntitle={Constraint-based graph network simulator},\nauthor={Yulia Rubanova and Alvaro Sanchez-Gonzalez and Tobias Pfaff and Peter Battaglia},\nyear={2022},\nurl={https://openreview.net/forum?id=Uxppuphg5ZL}\n}", "github": "", "project": "", "reviewers": "AXAA;TMn2;WhyX;kLRf", "site": "https://openreview.net/forum?id=Uxppuphg5ZL", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "94;55;101;129", "wc_summary_review": "14;16;48;100", "wc_main_review": "599;746;358;627", "wc_review": "707;817;507;856", "wc_reply_reviewers": "0;39;0;0", "wc_reply_authors": "2832;2247;929;2445", "reply_reviewers": "0;1;0;0", "reply_authors": "6;4;2;5", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 94.75, 26.423237878806603 ], "wc_summary_review_avg": [ 44.5, 34.767082132384935 ], "wc_main_review_avg": [ 582.5, 140.87671915543746 ], "wc_review_avg": [ 721.75,
135.49054395049126 ], "wc_reply_reviewers_avg": [ 9.75, 16.887495373796554 ], "wc_reply_authors_avg": [ 2113.25, 715.3664707686544 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.25, 1.479019945774904 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1543304278279933928&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://deepmind.com", "aff_unique_abbr": "DeepMind;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "Uy6YEI9-6v", "title": "Object-Centric Neural Scene Rendering", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a method for composing photorealistic scenes from captured images of objects. Traditional computer graphics methods are unable to model objects from observations only; instead, they rely on underlying computer graphics models. Our work builds upon neural radiance fields (NeRFs), which implicitly model the volumetric density and directionally-emitted radiance of a scene. While NeRFs synthesize realistic pictures, they only model static scenes and are closely tied to specific imaging conditions. This property makes NeRFs hard to generalize to new scenarios, including new lighting or new arrangements of objects. Instead of learning a scene radiance field as a NeRF does, we propose to learn object-centric neural scattering functions (OSFs), a representation that models per-object light transport implicitly using a lighting- and view-dependent neural network. This enables rendering scenes even when objects or lights move, without retraining. Combined with a volumetric path tracing procedure, our framework is capable of rendering light transport effects including occlusions, specularities, shadows, and indirect illumination, both within individual objects and between different objects. 
We evaluate our approach on synthetic and real world datasets and generalize to novel scene configurations, producing photorealistic, physically accurate renderings of multi-object scenes.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/a8851bc3afffc29dfce712a94ee2a15cf526f944.zip", "author": "Michelle Guo;Alireza Fathi;Jiajun Wu;Thomas Funkhouser", "authorids": "~Michelle_Guo1;~Alireza_Fathi1;~Jiajun_Wu1;~Thomas_Funkhouser1", "gender": "F;M;M;M", "homepage": "https://shellguo.com;http://ai.stanford.edu/~alireza/;https://jiajunwu.com;http://www.cs.princeton.edu/~funk/", "dblp": "185/0671;70/3898;117/4768;f/TAFunkhouser", "google_scholar": "lyjjpNMAAAAJ;luv0xMIAAAAJ;2efgcS0AAAAJ;https://scholar.google.com.tw/citations?user=BghVDhgAAAAJ", "orcid": "0000-0002-6574-6669;;0000-0002-4176-343X;", "linkedin": ";alireza-fathi-04338411/;jiajunwu/;", "or_profile": "~Michelle_Guo1;~Alireza_Fathi1;~Jiajun_Wu1;~Thomas_Funkhouser1", "aff": "Computer Science Department, Stanford University;Google;Stanford University;Google", "aff_domain": "cs.stanford.edu;google.com;stanford.edu;google.com", "position": "PhD student;researcher;Assistant Professor;Senior Research Scientist", "bibtex": "@misc{\nguo2022objectcentric,\ntitle={Object-Centric Neural Scene Rendering},\nauthor={Michelle Guo and Alireza Fathi and Jiajun Wu and Thomas Funkhouser},\nyear={2022},\nurl={https://openreview.net/forum?id=Uy6YEI9-6v}\n}", "github": "", "project": "", "reviewers": "6eq6;TPuY;KXhU;4NH2", "site": "https://openreview.net/forum?id=Uy6YEI9-6v", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;3;3;4", "correctness": "4;3;3;2", "technical_novelty": "3;3;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "94;57;133;107", "wc_summary_review": "143;63;52;46", "wc_main_review": "858;176;465;591", "wc_review": "1095;296;650;744", "wc_reply_reviewers": "45;47;0;98", "wc_reply_authors": "431;85;148;240", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.75, 27.39867697535777 ], "wc_summary_review_avg": [ 76.0, 39.15992849840255 ], "wc_main_review_avg": [ 522.5, 245.26567228211943 ], "wc_review_avg": [ 696.25, 284.43837205974864 ], "wc_reply_reviewers_avg": [ 47.5, 34.6878941419049 ], "wc_reply_authors_avg": [ 226.0, 130.56224569147085 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12782584135694000323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": "Computer Science Department;Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "UyBxDoukIB", "title": "Teamwork makes von Neumann work:Min-Max Optimization in Two-Team Zero-Sum Games", "track": "main", "status": "Reject", "tldr": "", "abstract": "Motivated by recent advances in both theoretical and applied aspects 
of multiplayer games, spanning from e-sports to multi-agent generative adversarial networks, we focus on min-max optimization in team zero-sum games. In this class of games, players are split into two teams with payoffs equal within the same team and of opposite sign across the opponent team. Unlike in textbook two-player zero-sum games, finding a Nash equilibrium in our class can be shown to be $\\textsf{CLS}$-hard, i.e., it is unlikely that a polynomial-time algorithm for computing Nash equilibria exists. Moreover, in this generalized framework, we establish that even asymptotic last-iterate or time-average convergence to a Nash Equilibrium is not possible using Gradient Descent Ascent (GDA), its optimistic variant, and extra gradient. Specifically, we present a family of team games whose induced utility is non-multilinear with non-attractive $\\textit{per-se}$ mixed Nash Equilibria, as strict saddle points of the underlying optimization landscape. Leveraging techniques from control theory, we complement these negative results by designing a modified GDA that converges locally to Nash equilibria. Finally, we discuss connections of our framework with AI architectures with team competition structure like multi-agent generative adversarial networks.", "keywords": "Min-max Optimization;Non-convex Optimization;Multi-agent learning;Multi-agent GANs;Game Theory;Duality Gap", "primary_area": "", "supplementary_material": "/attachment/1a2d6b287a96185fbf4774003f5fda4a1f1bbbd9.zip", "author": "Fivos Kalogiannis;Ioannis Panageas;Emmanouil-Vasileios Vlatakis-Gkaragkounis", "authorids": "~Fivos_Kalogiannis1;~Ioannis_Panageas1;~Emmanouil-Vasileios_Vlatakis-Gkaragkounis1", "gender": "M;M;M", "homepage": "https://fivoskal.github.io/;https://panageas.github.io;http://www.cs.columbia.edu/~emvlatakis/", "dblp": "305/7347;139/3829;251/8372", "google_scholar": "FVEj9MIAAAAJ;5NiFWuwAAAAJ;MKutDKcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Fivos_Kalogiannis1;~Ioannis_Panageas1;~Emmanouil-Vasileios_Vlatakis-Gkaragkounis1", "aff": "National Technical University of Athens;Donald Bren School of Information and Computer Sciences, University of California, Irvine;Columbia University", "aff_domain": "ntua.gr;ics.uci.edu;columbia.edu", "position": "Undergrad student;Assistant Professor;PhD student", "bibtex": "@misc{\nkalogiannis2022teamwork,\ntitle={Teamwork makes von Neumann work:Min-Max Optimization in Two-Team Zero-Sum Games},\nauthor={Fivos Kalogiannis and Ioannis Panageas and Emmanouil-Vasileios Vlatakis-Gkaragkounis},\nyear={2022},\nurl={https://openreview.net/forum?id=UyBxDoukIB}\n}", "github": "", "project": "", "reviewers": "FpwP;iJde;rW5j;89w7", "site": "https://openreview.net/forum?id=UyBxDoukIB", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;3;4", "correctness": "4;3;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;0", "wc_summary_paper": "58;47;66;92", "wc_summary_review": "117;33;44;35", "wc_main_review": "465;255;729;379", "wc_review": "640;335;839;506", "wc_reply_reviewers": "225;22;286;0", "wc_reply_authors": "3282;288;1167;1586", "reply_reviewers": "2;1;1;0", "reply_authors": "9;1;4;3", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 65.75, 16.588776326179094 ], "wc_summary_review_avg": [ 57.25, 34.744603897583865 ], "wc_main_review_avg": [ 457.0,
173.87926845946873 ], "wc_review_avg": [ 580.0, 184.5142270937393 ], "wc_reply_reviewers_avg": [ 133.25, 124.38121843751169 ], "wc_reply_authors_avg": [ 1580.75, 1088.1946919094946 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 4.25, 2.947456530637899 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14286859676017090888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "National Technical University of Athens;University of California, Irvine;Columbia University", "aff_unique_dep": ";Donald Bren School of Information and Computer Sciences;", "aff_unique_url": "https://www.ntua.gr;https://www.uci.edu;https://www.columbia.edu", "aff_unique_abbr": "NTUA;UCI;Columbia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Greece;United States" }, { "id": "UzOEYQM-xTg", "title": "Robust Long-Tailed Learning under Label Noise", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Long-tailed learning has attracted much attention recently, with the goal of improving generalisation for tail classes. Most existing works use supervised learning without considering the prevailing noise in the training dataset. To move long-tailed learning towards more realistic scenarios, this work investigates the label noise problem under long-tailed label distribution. We first observe the negative impact of noisy labels on the performance of existing methods, revealing the intrinsic challenges of this problem. As the most commonly used approach to cope with noisy labels in previous literature, we then find that the small-loss trick fails under long-tailed label distribution. The reason is that deep neural networks cannot distinguish correctly-labeled and mislabeled examples on tail classes. To overcome this limitation, we establish a new prototypical noise detection method by designing a distance-based metric that is resistant to label noise. Based on the above findings, we propose a robust framework,~\\algo, that realizes noise detection for long-tailed learning, followed by soft pseudo-labeling via both label smoothing and diverse label guessing. Moreover, our framework can naturally leverage semi-supervised learning algorithms to further improve the generalisation. Extensive experiments on both benchmark and real-world datasets demonstrate substantial improvement over many existing methods. 
For example, \\algo\\ outperforms baselines by more than 5\\% in test accuracy.", "keywords": "weakly-supervised learning;long-tailed learning;learning with noisy labels;semi-supervised learning;multi-label learning", "primary_area": "", "supplementary_material": "/attachment/37b8bdd776dce1467cc476eee7a080b49026eff2.zip", "author": "Tong Wei;Jiang-Xin Shi;Wei-Wei Tu;Yu-Feng Li", "authorids": "~Tong_Wei1;~Jiang-Xin_Shi1;~Wei-Wei_Tu1;~Yu-Feng_Li1", "gender": "M;;M;M", "homepage": "https://palm.seu.edu.cn/weit/;http://www.lamda.nju.edu.cn/shijx;;https://cs.nju.edu.cn/liyf/index.htm", "dblp": "49/933-1;299/5485.html;229/4363;57/413", "google_scholar": "EFCZuW4AAAAJ;KEgtGncAAAAJ;NrSit7IAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-2766-8209;0000-0002-0318-0911;0000-0002-2407-0252;0000-0002-2220-5248", "linkedin": ";;wei-wei-tu/;", "or_profile": "~Tong_Wei1;~Jiang-Xin_Shi1;~Wei-Wei_Tu1;~Yu-feng_Li2", "aff": "Southeast University;Nanjing University;4Paradigm Inc.;Nanjing University", "aff_domain": "seu.edu.cn;nju.edu.cn;4paradigm.com;nju.edu.cn", "position": "Associate Professor;PhD student;Vice President;Assistant Professor", "bibtex": "@misc{\nwei2022robust,\ntitle={Robust Long-Tailed Learning under Label Noise},\nauthor={Tong Wei and Jiang-Xin Shi and Wei-Wei Tu and Yu-Feng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=UzOEYQM-xTg}\n}", "github": "", "project": "", "reviewers": "kRzC;Qf4S;MdVZ", "site": "https://openreview.net/forum?id=UzOEYQM-xTg", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;5", "correctness": "3;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "89;185;62", "wc_summary_review": "82;74;62", "wc_main_review": "904;596;273", "wc_review": "1075;855;397", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 112.0, 52.78257288158659 ], "wc_summary_review_avg": [ 72.66666666666667, 8.219218670625303 ], "wc_main_review_avg": [ 591.0, 257.62893212266874 ], "wc_review_avg": [ 775.6666666666666, 282.41970343601895 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16820883929253164498&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Southeast University;Nanjing University;4Paradigm", "aff_unique_dep": ";;", "aff_unique_url": "https://www.seu.edu.cn/;https://www.nju.edu.cn;https://www.4paradigm.com/", "aff_unique_abbr": "SEU;Nanjing U;4Paradigm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "V09OhBn8iR", "title": "Mitigating Dataset Bias Using Per-Sample Gradients From A Biased Classifier", "track": "main", "status": "Reject", "tldr": "", "abstract": "The performance of deep neural networks (DNNs) primarily depends on the configuration of the training set. 
Specifically, biased training sets can instill unintended prejudice in the trained model, which causes severe errors at inference time. Although several studies have addressed biased training using human supervision, few studies have been conducted without human knowledge because biased information cannot be easily extracted without human involvement. This study proposes a simple method to remove prejudice from a biased model without additional information and reconstruct a balanced training set based on the biased training set. The novel training method consists of three steps: (1) training biased DNNs, (2) measuring the contribution to the prejudicial training and generating balanced data batches to prevent the prejudice, (3) training de-biased DNNs with the balanced data. We test the training method based on various synthetic and real-world biased sets and discuss how gradients can efficiently detect minority samples. The experiments demonstrate that the gradient-based detection method helps erase prejudice, improving inference accuracy by up to 19.58\\% compared to the other state-of-the-art algorithm.", "keywords": "dataset bias;debiasing;representation bias", "primary_area": "", "supplementary_material": "/attachment/80a7beb0677cd5699306f7981683f87f405f89c2.zip", "author": "Sumyeong Ahn;Se-Young Yun", "authorids": "~Sumyeong_Ahn1;~Se-Young_Yun1", "gender": "M;M", "homepage": "https://sumyeongahn.github.io;https://fbsqkd.github.io", "dblp": "217/5462;23/8862", "google_scholar": "krxhvIYAAAAJ;X_IAjb8AAAAJ", "orcid": ";", "linkedin": ";seyoung-yun-395130ab/", "or_profile": "~Sumyeong_Ahn1;~Se-Young_Yun1", "aff": "Korea Advanced Institute of Science & Technology;KAIST", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nahn2022mitigating,\ntitle={Mitigating Dataset Bias Using Per-Sample Gradients From A Biased Classifier},\nauthor={Sumyeong Ahn and Se-Young Yun},\nyear={2022},\nurl={https://openreview.net/forum?id=V09OhBn8iR}\n}", "github": "", "project": "", "reviewers": "sFXQ;BR78;3HUA;WQAv", "site": "https://openreview.net/forum?id=V09OhBn8iR", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "81;68;143;108", "wc_summary_review": "82;31;81;59", "wc_main_review": "543;451;134;500", "wc_review": "706;550;358;667", "wc_reply_reviewers": "145;0;109;107", "wc_reply_authors": "1279;1338;891;873", "reply_reviewers": "1;0;2;1", "reply_authors": "3;3;4;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.0, 28.71410803072246 ], "wc_summary_review_avg": [ 63.25, 20.765054779605084 ], "wc_main_review_avg": [ 407.0, 160.9425363289643 ], "wc_review_avg": [ 570.25, 135.32253138335832 ], "wc_reply_reviewers_avg": [ 90.25, 54.25576006287259 ], "wc_reply_authors_avg": [ 1095.25, 214.3622809637927 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:cPYZTl8onUcJ:scholar.google.com/&scioq=Mitigating+Dataset+Bias+Using+Per-Sample+Gradients+From+A+Biased+Classifier&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Tuformer: Data-driven Design of Transformers for Improved Generalization or Efficiency", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6003", "id": "V0A5g83gdQ_", "poster": "", "openreview": "https://openreview.net/forum?id=V0A5g83gdQ_", "slides": "https://iclr.cc/virtual/2022/poster/6003", "video": "https://iclr.cc/virtual/2022/poster/6003", "author_site": "Xiaoyu Liu, Jiahao Su, Furong Huang", "tldr": "", "abstract": "Transformers are neural network architectures that achieve remarkable performance in many areas. However, the core component of Transformers, multi-head self-attention (MHSA), is mainly derived from heuristics, and the interactions across its components are not well understood. To address the problem, we first introduce a mathematically rigorous and yet intuitive tensor diagram representation of MHSA. Guided by tensor diagram representations, we propose a novel design, namely Tunable Transformers (Tuformers), by allowing data-driven weights across heads, whereas MHSA adopts pre-defined and fixed weights across heads, as will be explained in our paper. Tuformers naturally reveal a flexible design space that a user, depending on the needs, can choose a structure that has either improved performance (generalization error) or higher model efficiency. Any pre-trained Transformer can be an initialization of the corresponding Tuformer with trainable number of heads for efficient training and fine-tuning. 
Tuformers universally outperform Transformers on various tasks across multiple domains under a wide range of model sizes.", "keywords": "Attention Modules;Transformers;Data-driven Model Design;Trainable Heads;Expressive Power;Tensor Methods.", "primary_area": "", "supplementary_material": "/attachment/b4371d590b920b120ea336c316ea3bdc3b123faf.zip", "author": "Xiaoyu Liu;Jiahao Su;Furong Huang", "authorids": "~Xiaoyu_Liu3;~Jiahao_Su1;~Furong_Huang1", "gender": "F;M;F", "homepage": ";;https://furong-huang.com", "dblp": ";;72/8513", "google_scholar": ";z4AEqYkAAAAJ;13yyuCcAAAAJ", "orcid": "0000-0003-3385-4726;;", "linkedin": ";jiahaosu-umd/;", "or_profile": "~Xiaoyu_Liu3;~Jiahao_Su1;~Furong_Huang1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland", "aff_domain": "umd.edu;umd.edu;cs.umd.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2022tuformer,\ntitle={Tuformer: Data-driven Design of Transformers for Improved Generalization or Efficiency},\nauthor={Xiaoyu Liu and Jiahao Su and Furong Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=V0A5g83gdQ_}\n}", "github": "", "project": "", "reviewers": "hfb5;5BZt;R5aP;cD8e", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;2;3;2", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "16;66;41;68", "wc_summary_review": "29;21;160;67", "wc_main_review": "247;355;330;1075", "wc_review": "292;442;531;1210", "wc_reply_reviewers": "130;107;131;381", "wc_reply_authors": "848;1128;311;1776", "reply_reviewers": "1;1;1;2", "reply_authors": "2;2;1;5", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 47.75, 21.194043974664204 ], "wc_summary_review_avg": [ 69.25, 55.20133603455626 ], "wc_main_review_avg": [ 501.75, 333.37169570915887 ], "wc_review_avg": [ 618.75, 351.881638480896 ], "wc_reply_reviewers_avg": [ 187.25, 112.27282618692735 ], "wc_reply_authors_avg": [ 1015.75, 528.0607801191071 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7602797263795576985&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=V0A5g83gdQ_", "email": "umd.edu;umd.edu;cs.umd.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "V0LnyelKACB", "title": "Accelerating HEP simulations with Neural Importance Sampling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Virtually all high-energy-physics (HEP) simulations for the LHC rely on Monte Carlo using importance sampling by means of the VEGAS algorithm. However, complex high-precision calculations have become a challenge for the standard toolbox.\nAs a result, there has been keen interest in HEP for modern machine learning to power adaptive sampling. 
Despite previous work proving that normalizing-flow-powered neural importance sampling (NIS) sometimes outperforms VEGAS, existing research has still left major questions open, which we intend to solve by introducing Z\u00fcNIS, a fully automated NIS library.\nWe first show how to extend the original formulation of NIS to reuse samples over multiple gradient steps, yielding a significant improvement for slow functions. We then benchmark Z\u00fcNIS over a range of problems and show high performance with limited fine-tuning. This is crucial for Z\u00fcNIS to be a mature tool for the wider HEP public. We outline how the library allows non-experts to employ it with minimal effort, an essential condition for widely assessing the value of NIS for LHC simulations.", "keywords": "Importance Sampling;Normalizing Flows;High-Energy-Physics", "primary_area": "", "supplementary_material": "", "author": "Nicolas Deutschmann;Niklas G\u00f6tz", "authorids": "~Nicolas_Deutschmann1;~Niklas_G\u00f6tz1", "gender": "M;", "homepage": "https://ndeutschmann.github.io;https://www.niklasgoetz.com/", "dblp": "325/4842;", "google_scholar": "GBAFB2AAAAAJ;", "orcid": "0000-0003-0432-1429;", "linkedin": ";niklasgoetz/", "or_profile": "~Nicolas_Deutschmann1;~Niklas_G\u00f6tz1", "aff": "International Business Machines;FIAS, Goethe University", "aff_domain": "ibm.com;fias.uni-frankfurt.de", "position": "Postdoc;PhD student", "bibtex": "@misc{\ndeutschmann2022accelerating,\ntitle={Accelerating {HEP} simulations with Neural Importance Sampling},\nauthor={Nicolas Deutschmann and Niklas G{\\\"o}tz},\nyear={2022},\nurl={https://openreview.net/forum?id=V0LnyelKACB}\n}", "github": "", "project": "", "reviewers": "TzbE;dDs7;9xQC;wAEt", "site": "https://openreview.net/forum?id=V0LnyelKACB", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;3;4", "correctness": "4;4;3;4", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;2;1;3", "wc_summary_paper": "49;17;48;151", "wc_summary_review": "43;15;4;14", "wc_main_review": "107;249;59;317", "wc_review": "199;281;111;482", "wc_reply_reviewers": "0;0;117;0", "wc_reply_authors": "248;151;531;731", "reply_reviewers": "0;0;2;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 66.25, 50.59335430666759 ], "wc_summary_review_avg": [ 19.0, 14.508618128546908 ], "wc_main_review_avg": [ 183.0, 104.2401074443038 ], "wc_review_avg": [ 268.25, 137.27231148341605 ], "wc_reply_reviewers_avg": [ 29.25, 50.66248612138966 ], "wc_reply_authors_avg": [ 415.25, 229.6174808240871 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14536335789097018109&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1", "aff_unique_norm": "International Business Machines Corporation;Goethe University", "aff_unique_dep": ";Frankfurt Institute for Advanced Studies", "aff_unique_url": "https://www.ibm.com;https://www.fias.uni-frankfurt.de", "aff_unique_abbr": "IBM;FIAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1",
"aff_country_unique": "United States;Germany" }, { "id": "V1MBgNBx5E", "title": "Mask and Understand: Evaluating the Importance of Parameters", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Influence functions are classic techniques from robust statistics based on first-order Taylor approximations that have been widely used in the machine learning community to estimate small perturbations of datasets accurately to the model. However, existing researches concentrate on the estimate the perturbations of the training or pre-training points. In this paper, we introduce the influence functions to predict the effects of removing features or parameters. It is worth emphasizing that our method can be applied to explore the influence of any combination of parameters disturbance on the model whether they belong to the same layer or whether are related. The validation and experiments also demonstrate that the influence functions for parameters can be used in many fields such as understanding model structure, model pruning, feature importance ranking, and any other strategies of masking parameters as you can imagine when you want to evaluate the importance of a group of parameters.", "keywords": "influence function;interpretability;model pruning;feature importance ranking", "primary_area": "", "supplementary_material": "", "author": "Bowei Zhu;Yong Liu", "authorids": "~Bowei_Zhu1;~Yong_Liu7", "gender": ";M", "homepage": ";https://iie-liuyong.github.io", "dblp": "304/1543;29/4867-18", "google_scholar": ";vVhmzbAAAAAJ", "orcid": ";0000-0002-6739-621X", "linkedin": ";", "or_profile": "~Bowei_Zhu1;~Yong_Liu7", "aff": "Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "MS student;Associate Professor", "bibtex": "@misc{\nzhu2022mask,\ntitle={Mask and Understand: Evaluating the Importance of Parameters},\nauthor={Bowei Zhu and Yong Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=V1MBgNBx5E}\n}", "github": "", "project": "", "reviewers": "CJuR;nmeH;h7yJ", "site": "https://openreview.net/forum?id=V1MBgNBx5E", "pdf_size": 0, "recommendation": "1;3;8", "confidence": "5;5;5", "correctness": "1;3;4", "technical_novelty": "1;2;4", "empirical_novelty": "1;2;3", "wc_summary_paper": "128;21;172", "wc_summary_review": "119;37;73", "wc_main_review": "834;464;247", "wc_review": "1081;522;492", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 2.943920288775949 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 107.0, 63.40872705445731 ], "wc_summary_review_avg": [ 76.33333333333333, 33.559234529741914 ], "wc_main_review_avg": [ 515.0, 242.33998156859437 ], "wc_review_avg": [ 698.3333333333334, 270.863229119216 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9078412990032039, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UQO-0hEl7Q8J:scholar.google.com/&scioq=Mask+and+Understand:+Evaluating+the+Importance+of+Parameters&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": 
"Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "V2WidtMGSRG", "title": "Provable Identifiability of ReLU Neural Networks via Lasso Regularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "LASSO regularization is a popular regression tool to enhance the prediction accuracy of statistical models by performing variable selection through the $\\ell_1$ penalty, initially formulated for the linear model and its variants. In this paper, the territory of LASSO is extended to the neural network model, a fashionable and powerful nonlinear regression model. Specifically, given a neural network whose output $y$ depends only on a small subset of input $\\boldsymbol{x}$, denoted by $\\mathcal{S}^{\\star}$, we prove that the LASSO estimator can stably reconstruct the neural network and identify $\\mathcal{S}^{\\star}$ when the number of samples scales logarithmically with the input dimension. This challenging regime has been well understood for linear models while barely studied for neural networks. Our theory lies in an extended Restricted Isometry Property (RIP)-based analysis framework for two-layer ReLU neural networks, which may be of independent interest to other LASSO or neural network settings. Based on the result, we further propose a neural network-based variable selection method. Experiments on simulated and real-world datasets show the promising performance of our variable selection approach compared with classical techniques.", "keywords": "Lasso;nonlinear regression;model selection", "primary_area": "", "supplementary_material": "/attachment/e567f111d7835c5a718bf83ef8f4c413e2644fc2.zip", "author": "Gen Li;Ganghua Wang;Yuantao Gu;Jie Ding", "authorids": "~Gen_Li2;~Ganghua_Wang1;~Yuantao_Gu1;~Jie_Ding2", "gender": "M;M;;M", "homepage": ";https://gwang.umn.edu;;http://jding.org", "dblp": "28/538-5.html;200/9632;;94/1825-2", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;;ZyqvoqcAAAAJ", "orcid": "0000-0002-3078-9191;0000-0002-0888-167X;;", "linkedin": ";;;", "or_profile": "~Gen_Li2;~Ganghua_Wang1;~Yuantao_Gu1;~Jie_Ding2", "aff": "The Wharton School, University of Pennsylvania;University of Minnesota, Minneapolis;;University of Minnesota, Minneapolis", "aff_domain": "wharton.upenn.edu;umn.edu;;umn.edu", "position": "Postdoc;PhD student;;Assistant Professor", "bibtex": "@misc{\nli2022provable,\ntitle={Provable Identifiability of Re{LU} Neural Networks via Lasso Regularization},\nauthor={Gen Li and Ganghua Wang and Yuantao Gu and Jie Ding},\nyear={2022},\nurl={https://openreview.net/forum?id=V2WidtMGSRG}\n}", "github": "", "project": "", "reviewers": "ftar;jMTG;BNVe;nbEF", "site": "https://openreview.net/forum?id=V2WidtMGSRG", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;3;2", "correctness": "2;3;4;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "1;2;3;0", "wc_summary_paper": "59;71;63;73", "wc_summary_review": "19;14;53;56", "wc_main_review": "257;94;416;126", "wc_review": "335;179;532;255", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], 
"empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 66.5, 5.722761571129799 ], "wc_summary_review_avg": [ 35.5, 19.11151485361639 ], "wc_main_review_avg": [ 223.25, 126.9397002517337 ], "wc_review_avg": [ 325.25, 131.49596001398675 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:o0F4sdlH-AcJ:scholar.google.com/&scioq=Provable+Identifiability+of+ReLU+Neural+Networks+via+Lasso+Regularization&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Pennsylvania;University of Minnesota", "aff_unique_dep": "The Wharton School;", "aff_unique_url": "https://www.wharton.upenn.edu;https://www.minnesota.edu", "aff_unique_abbr": "UPenn Wharton;UMN", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "V37YFd_fFgN", "title": "Leveraging Redundancy in Attention with Reuse Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pairwise dot product-based attention allows Transformers to exchange information between tokens in an input-dependent way, and is key to their success across diverse applications in language and vision. However, a typical Transformer model computes such pairwise attention scores repeatedly for the same sequence, in multiple heads in multiple layers. We systematically analyze the empirical similarity of these scores across heads and layers and find them to be considerably redundant, especially adjacent layers showing high similarity. Motivated by these findings, we propose a novel architecture that reuses attention scores computed in one layer in multiple subsequent layers. 
Experiments on a number of standard benchmarks show that reusing attention delivers performance equivalent to or better than standard transformers, while reducing both compute and memory usage.", "keywords": "Transformers;attention;redundancy;reuse;efficient", "primary_area": "", "supplementary_material": "", "author": "Srinadh Bhojanapalli;Ayan Chakrabarti;Andreas Veit;Michal Lukasik;Himanshu Jain;Frederick Liu;Yin-Wen Chang;Sanjiv Kumar", "authorids": "~Srinadh_Bhojanapalli1;~Ayan_Chakrabarti1;~Andreas_Veit1;~Michal_Lukasik1;~Himanshu_Jain3;~Frederick_Liu1;~Yin-Wen_Chang1;~Sanjiv_Kumar1", "gender": "M;M;;;M;M;;", "homepage": "https://bsrinadh.github.io/;https://projects.ayanc.org/;http://andreasveit.eu/;https://mlukasik.github.io/;;https://frederick0329.github.io/;;http://www.sanjivk.com/", "dblp": "131/6700;68/5758;133/1801;72/11338;;;48/9362;", "google_scholar": "bpSF_9EAAAAJ;0v5utcwAAAAJ;UA9Hb2EAAAAJ;https://scholar.google.co.uk/citations?user=cLZLZCQAAAAJ;JtrH9jQAAAAJ;iJbdUkQAAAAJ;ohIxH_QAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-4843-740X;;;;;;", "linkedin": ";ayan-chakrabarti/;;;;;;", "or_profile": "~Srinadh_Bhojanapalli1;~Ayan_Chakrabarti1;~Andreas_Veit1;~Michal_Lukasik1;~Himanshu_Jain3;~Frederick_Liu1;~Yin-Wen_Chang1;~Sanjiv_Kumar1", "aff": "Google;Google;Google;Google Research;Google;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com;google.com;google.com;google.com", "position": "Research Scientist;Research Scientist;Senior Research Scientist;Research Scientist;Researcher;Software Engineer;Software Engineer;Research Scientist", "bibtex": "@misc{\nbhojanapalli2022leveraging,\ntitle={Leveraging Redundancy in Attention with Reuse Transformers},\nauthor={Srinadh Bhojanapalli and Ayan Chakrabarti and Andreas Veit and Michal Lukasik and Himanshu Jain and Frederick Liu and Yin-Wen Chang and Sanjiv Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=V37YFd_fFgN}\n}", "github": "", "project": "", "reviewers": "CyCB;6tZH;FSdD", "site": "https://openreview.net/forum?id=V37YFd_fFgN", "pdf_size": 0, "recommendation": "1;5;6", "confidence": "5;4;4", "correctness": "3;3;4", "technical_novelty": "1;3;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "115;75;118", "wc_summary_review": "5;47;75", "wc_main_review": "414;369;212", "wc_review": "534;491;405", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 2.160246899469287 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 102.66666666666667, 19.601587237318874 ], "wc_summary_review_avg": [ 42.333333333333336, 28.76726534718856 ], "wc_main_review_avg": [ 331.6666666666667, 86.58842622172754 ], "wc_review_avg": [ 476.6666666666667, 53.63042254375983 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9819805060619656, "corr_recommendation_correctness": 0.654653670707977, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4128728582933261530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": 
"Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Exploring the Limits of Large Scale Pre-training", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6231", "id": "V3C8p78sDa", "poster": "", "openreview": "https://openreview.net/forum?id=V3C8p78sDa", "slides": "https://iclr.cc/virtual/2022/poster/6231", "video": "https://iclr.cc/virtual/2022/poster/6231", "author_site": "Samira Abnar, Mostafa Dehghani, Behnam Neyshabur, Hanie Sedghi", "tldr": "", "abstract": "Recent developments in large-scale machine learning suggest that by scaling up data, model size and training time properly, one might observe that improvements in pre-training would transfer favorably to most downstream tasks. In this work we systematically study this phenomena and establish that, as we increase the upstream accuracy, performance of downstream tasks \\emph{saturates}. In particular, we investigate more than 4800 experiments on Vision Transformers, MLP-Mixers and ResNets with number of parameters ranging from ten million to ten billion, trained on the largest scale of available image data (JFT, ImageNet21K) and evaluated on more than 20 downstream image recognition tasks. We propose a model for downstream performance that reflects the saturation phenomena and captures the nonlinear relationship in performance of upstream and downstream tasks. Delving deeper to understand the reasons that give rise to these phenomena, we show that the observed saturation behavior is closely related to the way that representations evolve through the layers of the models. We showcase an even more extreme scenario where performance on upstream and downstream are at odds with each other. 
That is, in order to have a better downstream performance, we need to hurt upstream accuracy.", "keywords": "Scaling law;Pre-training;Transfer learning;Large Scale;Vision Transformer;Few Shot;Empirical Investigation", "primary_area": "", "supplementary_material": "", "author": "Samira Abnar;Mostafa Dehghani;Behnam Neyshabur;Hanie Sedghi", "authorids": "~Samira_Abnar1;~Mostafa_Dehghani1;~Behnam_Neyshabur1;~Hanie_Sedghi1", "gender": "Unspecified;M;M;F", "homepage": "https://samiraabnar.github.io/;http://mostafadehghani.com/;https://www.neyshabur.net;https://haniesedghi.com/", "dblp": "150/5405;125/4062;131/9898;66/8332", "google_scholar": "https://scholar.google.nl/citations?user=jbxwjgMAAAAJ;https://scholar.google.nl/citations?user=MiHOX3QAAAAJ;e1ucbCYAAAAJ;_9GX96fDWAMC", "orcid": ";;;", "linkedin": ";;;hanie-sedghi-71bb2582", "or_profile": "~Samira_Abnar1;~Mostafa_Dehghani1;~Behnam_Neyshabur1;~Hanie_Sedghi1", "aff": "Apple;Google DeepMind;Google;Google Research, Brain team", "aff_domain": "apple.com;google.com;google.com;google.com", "position": "Researcher;Research Scientist;Research Scientist;Senior Research Scientist", "bibtex": "@inproceedings{\nabnar2022exploring,\ntitle={Exploring the Limits of Large Scale Pre-training},\nauthor={Samira Abnar and Mostafa Dehghani and Behnam Neyshabur and Hanie Sedghi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=V3C8p78sDa}\n}", "github": "", "project": "", "reviewers": "SKn8;3WYy;2EkD;pqjL", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;3;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "47;38;119;402", "wc_summary_review": "31;21;132;37", "wc_main_review": "163;303;308;697", "wc_review": "241;362;559;1136", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "761;410;583;729", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 151.5, 147.99408771974643 ], "wc_summary_review_avg": [ 55.25, 44.6787141712919 ], "wc_main_review_avg": [ 367.75, 198.8031375506936 ], "wc_review_avg": [ 574.5, 343.4752538393414 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 620.75, 138.95030586508258 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11447367171581256986&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=V3C8p78sDa", "email": "apple.com;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Apple;Google", "aff_unique_dep": "Apple Inc.;Google DeepMind", "aff_unique_url": "https://www.apple.com;https://deepmind.com", "aff_unique_abbr": "Apple;DeepMind", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "V3NZqmGA6yk", "title": "Beyond Pixels: A Sample Based Method for understanding the decisions of Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": 
"Interpretability in deep learning is one of the largest obstacles to more widespread adoption of deep learning in critical applications. A variety of methods have been introduced to understand and explain decisions made by large neural networks. A class of these methods are algorithms that attempt to highlight which input or feature subset was most influential to model predictions. We identify two key weaknesses in existing methods. First, most existing methods do not provide a formal measure of which features are important on their own, and which are important due to correlations with others. Second, many of these methods are only applied to the most granular component of input features (e.g., pixels). We partially tackle these problems by proposing a novel Morris Screening based sensitivity analysis method using input-partitioning (MoSIP). MoSIP allows us to quantify local and global importance of less granular aspects of input space, and helps highlight which parts of inputs are individually important and which are potentially important due to correlations. Through experiments on both MNIST with spurious correlations (Biased-MNIST), and the large scale ImageNet-1K dataset, we reveal several new and interesting findings. Our key finding is that newer CNN architectures (e.g., ResNet) compared to older architectures (e.g., VGG) do not extract fundamentally more relevant features, but simply make stronger use of non-linearities and feature interactions. This can manifest itself in the use of spurious correlations in the data to make decisions.", "keywords": "Machine Learning Interpretability;Bias;ImageNet;AlexNet;ResNet;VGG-16;Inception;CNNs;MNIST", "primary_area": "", "supplementary_material": "", "author": "Ohi Dibua;Mackenzie Austin;Kushal Kafle", "authorids": "~Ohi_Dibua1;maustin@adobe.com;~Kushal_Kafle2", "gender": "M;;M", "homepage": ";;https://kushalkafle.com", "dblp": ";;188/6388", "google_scholar": ";;M_iwxCQAAAAJ", "orcid": ";;0000-0002-0847-7861", "linkedin": "ohi-dibua-2bb72032;;kushalkafle/", "or_profile": "~Ohi_Dibua1;maustin@adobe.com;~Kushal_Kafle2", "aff": "Adobe Systems;;Adobe Systems", "aff_domain": "adobe.com;;adobe.com", "position": "Research Engineer;;Researcher", "bibtex": "@misc{\ndibua2022beyond,\ntitle={Beyond Pixels: A Sample Based Method for understanding the decisions of Neural Networks},\nauthor={Ohi Dibua and Mackenzie Austin and Kushal Kafle},\nyear={2022},\nurl={https://openreview.net/forum?id=V3NZqmGA6yk}\n}", "github": "", "project": "", "reviewers": "FoaC;mmzL;F1hA;qA1T", "site": "https://openreview.net/forum?id=V3NZqmGA6yk", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "3;2;4;3", "correctness": "2;3;3;2", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "90;39;79;55", "wc_summary_review": "44;29;53;57", "wc_main_review": "924;53;338;404", "wc_review": "1058;121;470;516", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 65.75, 19.967160539245434 ], "wc_summary_review_avg": [ 45.75, 10.755812382149477 ], "wc_main_review_avg": [ 429.75, 314.3663269181354 ], "wc_review_avg": [ 541.25, 335.1696398840444 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], 
"reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:olrcomJcFPwJ:scholar.google.com/&scioq=Beyond+Pixels:+A+Sample+Based+Method+for+understanding+the+decisions+of+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Adobe", "aff_unique_dep": "Adobe Systems Incorporated", "aff_unique_url": "https://www.adobe.com", "aff_unique_abbr": "Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "V70cjLuGACn", "title": "Closed-loop Control for Online Continual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Online class-incremental continual learning (CL) deals with the sequential task learning problem in a realistic non-stationary setting with a single-pass through of data. Replay-based CL methods have shown promising results in several online class-incremental continual learning benchmarks. However, these replay methods typically assume pre-defined and fixed replay dynamics, which is suboptimal. This paper introduces a closed-loop continual learning framework, which obtains a real-time feedback learning signal via an additional test memory and then adapts the replay dynamics accordingly. More specifically, we propose a reinforcement learning-based method to dynamically adjust replay hyperparameters online to balance the stability and plasticity trade-off in continual learning. To address the non-stationarity in the continual learning environment, we employ a Q function with task-specific and task-shared components to support fast adaptation. 
The proposed method is applied to improve state-of-the-art replay-based methods and achieves superior performance on popular benchmarks.", "keywords": "Continual Learning;Reinforcement learning;Class-incremental Continual Learning;Online Learning", "primary_area": "", "supplementary_material": "", "author": "Yaqian Zhang;Eibe Frank;Bernhard Pfahringer;Albert Bifet;Nick Jin Sean Lim;Alvin Jia", "authorids": "~Yaqian_Zhang5;~Eibe_Frank2;~Bernhard_Pfahringer1;~Albert_Bifet1;~Nick_Jin_Sean_Lim1;alvin.jia@waikato.ac.nz", "gender": "F;;M;M;M;", "homepage": ";;https://profiles.waikato.ac.nz/bernhard.pfahringer;https://albertbifet.com/;https://www.nicklim.com;", "dblp": ";;10/140;48/1070;;", "google_scholar": "DezNG3MAAAAJ;;https://scholar.google.co.nz/citations?user=PEv3OQUAAAAJ;https://scholar.google.com/citations?hl=en;;", "orcid": "0000-0002-8594-4697;;0000-0002-3732-5787;0000-0002-8339-7773;0000-0003-4690-5780;", "linkedin": ";;;abifet/;;", "or_profile": "~Yaqian_Zhang5;~Eibe_Frank2;~Bernhard_Pfahringer1;~Albert_Bifet1;~Nick_Jin_Sean_Lim1;alvin.jia@waikato.ac.nz", "aff": "The University of Waikato;;The University of Waikato;T\u00e9l\u00e9com Paris;The University of Waikato;", "aff_domain": "waikato.ac.nz;;waikato.ac.nz;telecom-paris.fr;waikato.ac.nz;", "position": "Postdoc;;Full Professor;Full Professor;Postdoc;", "bibtex": "@misc{\nzhang2022closedloop,\ntitle={Closed-loop Control for Online Continual Learning},\nauthor={Yaqian Zhang and Eibe Frank and Bernhard Pfahringer and Albert Bifet and Nick Jin Sean Lim and Alvin Jia},\nyear={2022},\nurl={https://openreview.net/forum?id=V70cjLuGACn}\n}", "github": "", "project": "", "reviewers": "GhFg;vSnQ;ng9G;hbHU", "site": "https://openreview.net/forum?id=V70cjLuGACn", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "2;3;4;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "111;46;127;75", "wc_summary_review": "121;76;44;57", "wc_main_review": "1674;235;262;527", "wc_review": "1906;357;433;659", "wc_reply_reviewers": "696;50;0;0", "wc_reply_authors": "806;238;512;716", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.75, 31.506943679132064 ], "wc_summary_review_avg": [ 74.5, 29.159046623646667 ], "wc_main_review_avg": [ 674.5, 588.2331595549506 ], "wc_review_avg": [ 838.75, 626.108766509462 ], "wc_reply_reviewers_avg": [ 186.5, 294.8673430544658 ], "wc_reply_authors_avg": [ 568.0, 218.27963716297495 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:G5rBfvkb1FcJ:scholar.google.com/&scioq=Closed-loop+Control+for+Online+Continual+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Waikato;T\u00e9l\u00e9com Paris", "aff_unique_dep": ";", "aff_unique_url": "https://www.waikato.ac.nz;https://www.telecom-paris.fr", "aff_unique_abbr": "UoW;T\u00e9l\u00e9com Paris", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "New Zealand;France" }, { "id":
"V7eSbSAz-O8", "title": "Benchmarking Machine Learning Robustness in Covid-19 Spike Sequence Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "The rapid spread of the COVID-19 pandemic has resulted in an unprecedented amount of sequence data of the SARS-CoV-2 viral genome --- millions of sequences and counting. This amount of data, while being orders of magnitude beyond the capacity of traditional approaches to understanding the diversity, dynamics and evolution of viruses, is nonetheless a rich resource for machine learning (ML) and deep learning (DL) approaches as alternatives for extracting such important information from these data. It is of hence utmost importance to design a framework for testing and benchmarking the robustness of these ML and DL approaches.\n\nThis paper the first (to our knowledge) to explore such a framework. In this paper, we introduce several ways to perturb SARS-CoV-2 spike protein sequences in ways that mimic the error profiles of common sequencing platforms such as Illumina and PacBio. We show from experiments on a wide array of ML approaches from naive Bayes to logistic regression, that DL approaches are more robust (and accurate) to such adverarial attacks to the input sequences, while $k$-mer based feature vector representations are more robust than the baseline one-hot embedding. Our benchmarking framework may developers of futher ML and DL techniques to properly assess their approaches towards understanding the behaviour of the SARS-CoV-2 virus, or towards avoiding possible future pandemics.", "keywords": "COVID-19;Sequence Classification;Spike Sequences;k-mers;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Sarwan Ali;Bikram Sahoo;Pin-Yu Chen;Murray Patterson", "authorids": "~Sarwan_Ali1;~Bikram_Sahoo1;~Pin-Yu_Chen1;~Murray_Patterson1", "gender": "M;M;M;M", "homepage": "https://sarwanpasha.github.io/;;http://www.pinyuchen.com;", "dblp": "243/0133;;39/8969;33/387", "google_scholar": "9dtXSoAAAAAJ;wyH2kiIAAAAJ;jxwlCUUAAAAJ;", "orcid": "0000-0001-8121-2168;my-orcid?orcid=0000-0001-6481-2583;0000-0003-1039-8369;", "linkedin": "sarwan-ali/;bikram-sahoo-b091bb20/;pin-yu-chen-940062a2;", "or_profile": "~Sarwan_Ali1;~Bikram_Sahoo1;~Pin-Yu_Chen1;~Murray_Patterson1", "aff": "Georgia State University;Georgia State University;International Business Machines;Georgia State University", "aff_domain": "gsu.edu;gsu.edu;ibm.com;gsu.edu", "position": "PhD student;PhD student;Research Staff Member;Assistant Professor", "bibtex": "@misc{\nali2022benchmarking,\ntitle={Benchmarking Machine Learning Robustness in Covid-19 Spike Sequence Classification},\nauthor={Sarwan Ali and Bikram Sahoo and Pin-Yu Chen and Murray Patterson},\nyear={2022},\nurl={https://openreview.net/forum?id=V7eSbSAz-O8}\n}", "github": "", "project": "", "reviewers": "xx9i;uALA;YVK3", "site": "https://openreview.net/forum?id=V7eSbSAz-O8", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "5;4;5", "correctness": "2;1;2", "technical_novelty": "2;1;1", "empirical_novelty": "2;2;1", "wc_summary_paper": "75;38;30", "wc_summary_review": "51;34;31", "wc_main_review": "270;367;236", "wc_review": "396;439;297", "wc_reply_reviewers": "43;32;0", "wc_reply_authors": "565;866;409", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 1.6666666666666667, 0.4714045207910317 ], "technical_novelty_avg": [ 
1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 47.666666666666664, 19.601587237318874 ], "wc_summary_review_avg": [ 38.666666666666664, 8.806563209081938 ], "wc_main_review_avg": [ 291.0, 55.50375362681939 ], "wc_review_avg": [ 377.3333333333333, 59.45493157753097 ], "wc_reply_reviewers_avg": [ 25.0, 18.239152027072603 ], "wc_reply_authors_avg": [ 613.3333333333334, 189.6739893138282 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KRiN2y8mLiMJ:scholar.google.com/&scioq=Benchmarking+Machine+Learning+Robustness+in+Covid-19+Spike+Sequence+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Georgia State University;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.gsu.edu;https://www.ibm.com", "aff_unique_abbr": "GSU;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "V8UTvwzUOcX", "title": "Biased Multi-Domain Adversarial Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Several recent studies have shown that the use of extra in-distribution data can lead to a high level of adversarial robustness. However, there is no guarantee that it will always be possible to obtain sufficient extra data for a selected dataset. In this paper, we propose a biased multi-domain adversarial training (BiaMAT) method that induces training data amplification using freely available auxiliary datasets. The proposed method can achieve increased adversarial robustness on a primary dataset by leveraging auxiliary datasets via multi-domain learning. Specifically, data amplification on both robust and non-robust features can be accomplished through the application of BiaMAT as demonstrated through an additional analysis based on shuffle testing. Our experimental results indicate that BiaMAT can effectively utilize the robust and non-robust features present in various auxiliary datasets. 
Moreover, we demonstrate that while existing methods are vulnerable to negative transfer due to the distributional discrepancy between auxiliary and primary data, the proposed method enables neural networks to flexibly leverage diverse image datasets for adversarial training by successfully handling the domain discrepancy through the application of a confidence-based selection strategy.", "keywords": "adversarial training;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Saehyung Lee;Hyungyu Lee;Sanghyuk Chun;Sungroh Yoon", "authorids": "~Saehyung_Lee1;~Hyungyu_Lee1;~Sanghyuk_Chun1;~Sungroh_Yoon1", "gender": "M;M;;M", "homepage": ";https://sanghyukchun.github.io/home/;http://ailab.snu.ac.kr;https://snu.ac.kr", "dblp": "260/0442;213/1095.html;99/1474;", "google_scholar": "nS24h74AAAAJ;https://scholar.google.co.kr/citations?user=4_uj0xcAAAAJ;Bphl_fIAAAAJ;", "orcid": ";0000-0002-4533-2610;0000-0002-2367-197X;", "linkedin": ";https://kr.linkedin.com/in/sanghyukchun/en;;", "or_profile": "~Saehyung_Lee1;~Sanghyuk_Chun1;~Sungroh_Yoon1;~Hyungyu_Lee2", "aff": "Seoul National University;NAVER AI Lab;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;navercorp.com;snu.ac.kr;snu.ac.kr", "position": "PhD student;Lead research scientist;Full Professor;PhD student", "bibtex": "@misc{\nlee2022biased,\ntitle={Biased Multi-Domain Adversarial Training},\nauthor={Saehyung Lee and Hyungyu Lee and Sanghyuk Chun and Sungroh Yoon},\nyear={2022},\nurl={https://openreview.net/forum?id=V8UTvwzUOcX}\n}", "github": "", "project": "", "reviewers": "CPQu;R8fS;K8pu", "site": "https://openreview.net/forum?id=V8UTvwzUOcX", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;3;4", "correctness": "3;4;3", "technical_novelty": "3;2;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "135;56;37", "wc_summary_review": "218;35;18", "wc_main_review": "484;232;89", "wc_review": "837;323;144", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 76.0, 42.43426288586461 ], "wc_summary_review_avg": [ 90.33333333333333, 90.54035318881606 ], "wc_main_review_avg": [ 268.3333333333333, 163.2918314620245 ], "wc_review_avg": [ 434.6666666666667, 293.7281433949124 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ri7KWQEzOuMJ:scholar.google.com/&scioq=Biased+Multi-Domain+Adversarial+Training&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Seoul National University;NAVER Corporation", "aff_unique_dep": ";NAVER AI Lab", "aff_unique_url": "https://www.snu.ac.kr;https://www.naver.com", "aff_unique_abbr": "SNU;NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "VABfTTrrOv", "title": "Conjugation Invariant Learning with Neural Networks", "track": "main", 
"status": "Reject", "tldr": "", "abstract": "Machine learning under the constraint of symmetries, given by group invariances or equivariances, has emerged as a topic of active interest in recent years. Natural settings for such applications include the multi-reference alignment and cryo electron microscopy, multi-object tracking, spherical images, and so on. A fundamental paradigm among such symmetries is the action of a group by symmetries, which often pertains to change of basis or relabelling of objects in pure and applied mathematics. Thus, a naturally significant class of functions consists of those that are intrinsic to the problem, in the sense of being independent of such base change or relabelling; in other words invariant under the conjugation action by a group. In this work, we investigate such functions, known as class functions, leveraging tools from group representation theory. A fundamental ingredient in our approach are given by the so-called irreducible characters of the group, which are canonical tracial class functions related to its irreducible representations. Such functions form an orthogonal basis for the class functions, extending ideas from Fourier analysis to this domain, and accord a very explicit structure. Exploiting a tensorial structure on representations, which translates into a multiplicative algebra structure for irreducible characters, we propose to efficiently approximate class functions using polynomials in a small number of such characters. Thus, our approach provides a global, non-linear coordinate system to describe functions on the group that is intrinsic in nature, in the sense that it is independent of local charts, and can be easily computed in concrete models. We demonstrate that such non-linear approximation using a small dictionary can be effectively implemented using a deep neural network paradigm. 
This allows us to learn a class function efficiently from a dataset of its outputs.", "keywords": "Learning under group actions;Neural networks;Group representations;Characters;Class functions", "primary_area": "", "supplementary_material": "/attachment/eb5fcda752526c1176497e37bd0acbc6a36ae96e.zip", "author": "Aaron Yi Rui Low;Subhroshekhar Ghosh;Yong Sheng Soh", "authorids": "~Aaron_Yi_Rui_Low1;~Subhroshekhar_Ghosh1;~Yong_Sheng_Soh1", "gender": "M;;", "homepage": "https://aaronlyr94.wixsite.com/mysite;https://subhro-ghosh.github.io/;https://yssoh.github.io/", "dblp": ";;123/9574.html", "google_scholar": ";RpGHEzsAAAAJ;OPntcXsAAAAJ", "orcid": "0000-0002-7693-4745;;0000-0003-3367-1401", "linkedin": "aaron-yi-rui-low-2703021ba/;;", "or_profile": "~Aaron_Yi_Rui_Low1;~Subhroshekhar_Ghosh1;~Yong_Sheng_Soh1", "aff": "National University of Singapore;National University of Singapore;Institute of High Performance Computing, Singapore, A*STAR", "aff_domain": "nus.edu.sg;nus.edu.sg;ihpc.a-star.edu.sg", "position": "Instructor;Assistant Professor;Researcher", "bibtex": "@misc{\nlow2022conjugation,\ntitle={Conjugation Invariant Learning with Neural Networks},\nauthor={Aaron Yi Rui Low and Subhroshekhar Ghosh and Yong Sheng Soh},\nyear={2022},\nurl={https://openreview.net/forum?id=VABfTTrrOv}\n}", "github": "", "project": "", "reviewers": "qL2S;zG1H;Y8tG;5x3M;E1Av", "site": "https://openreview.net/forum?id=VABfTTrrOv", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "1;4;4;4;2", "correctness": "3;3;4;4;4", "technical_novelty": "2;2;2;3;4", "empirical_novelty": "2;1;1;2;1", "wc_summary_paper": "19;16;96;119;73", "wc_summary_review": "64;41;85;122;62", "wc_main_review": "155;164;771;547;186", "wc_review": "238;221;952;788;321", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 3.0, 1.2649110640673518 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 1.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 64.6, 41.1271200061468 ], "wc_summary_review_avg": [ 74.8, 27.40364939200617 ], "wc_main_review_avg": [ 364.6, 250.79920255056635 ], "wc_review_avg": [ 504.0, 305.1865003567491 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3952847075210474, "corr_recommendation_correctness": 0.4082482904638631, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16475298486657000768&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "National University of Singapore;Institute of High Performance Computing", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.ihpc.a-star.edu.sg", "aff_unique_abbr": "NUS;IHPC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "VAmkgdMztWs", "title": "Network robustness as a mathematical property: training, evaluation and attack", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks are widely used in AI for their ability to detect general patterns in noisy data. Paradoxically, by default they are also known to not be particularly robust, i.e. 
moving a small distance in the input space can result in the network's output changing significantly.\nMany methods for improving neural network robustness have been proposed recently. This growing body of research has given rise to numerous explicit or implicit notions of robustness. Connections between these notions are often subtle, and a systematic comparison of these different definitions was lacking in the literature.\nIn this paper we attempt to address this gap by performing an in-depth comparison of the different definitions of robustness, analysing their relationships, assumptions, interpretability and verifiability.\nBy abstracting robustness as a stand-alone mathematical property, we are able to show that, having a choice of several definitions of robustness, one can combine them in a modular way when defining training modes, evaluation metrics, and attacks on neural networks.\nWe also perform experiments to compare the applicability and efficacy of different training methods for ensuring the network obeys these different definitions.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/206b1dc7f1b4c8cd402bddf6e010532ba114426c.zip", "author": "Marco Casadio;Matthew L Daggitt;Ekaterina Komendantskaya;Wen Kokke;Robert Stewart", "authorids": "~Marco_Casadio2;~Matthew_L_Daggitt1;~Ekaterina_Komendantskaya1;~Wen_Kokke1;~Robert_Stewart4", "gender": "M;M;F;F;M", "homepage": "https://github.com/Tgl70;;https://www.macs.hw.ac.uk/~ek19/;https://wen.works;https://www.macs.hw.ac.uk/~rs46/", "dblp": "158/5741;222/4171;62/4310;;https://dblp.uni-trier.de/pid/13/10106.html", "google_scholar": ";https://scholar.google.com.au/citations?hl=en;https://scholar.google.co.uk/citations?user=-HbSa84AAAAJ;;https://scholar.google.co.uk/citations?user=ArZWHf4AAAAJ", "orcid": ";;;;0000-0003-0365-693X", "linkedin": ";;;;rob-stewart-94949888/", "or_profile": "~Marco_Casadio2;~Matthew_L_Daggitt1;~Ekaterina_Komendantskaya1;~Wen_Kokke1;~Robert_Stewart4", "aff": "Heriot-Watt University;Heriot-Watt University;Heriot-Watt University;University of Edinburgh;", "aff_domain": "hw.ac.uk;hw.ac.uk;hw.ac.uk;ed.ac.uk;", "position": "PhD student;Postdoc;Professor;PhD student;", "bibtex": "@misc{\ncasadio2022network,\ntitle={Network robustness as a mathematical property: training, evaluation and attack},\nauthor={Marco Casadio and Matthew L Daggitt and Ekaterina Komendantskaya and Wen Kokke and Robert Stewart},\nyear={2022},\nurl={https://openreview.net/forum?id=VAmkgdMztWs}\n}", "github": "", "project": "", "reviewers": "6o25;DHPy;2Rrw", "site": "https://openreview.net/forum?id=VAmkgdMztWs", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "4;5;3", "correctness": "2;1;4", "technical_novelty": "1;2;1", "empirical_novelty": "1;2;2", "wc_summary_paper": "96;23;40", "wc_summary_review": "21;49;56", "wc_main_review": "598;548;349", "wc_review": "715;620;445", "wc_reply_reviewers": "19;39;0", "wc_reply_authors": "532;544;19", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 1.247219128924647 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 53.0, 31.187604375242845 ], "wc_summary_review_avg": [ 42.0, 15.121728296285006 ], "wc_main_review_avg": [ 498.3333333333333, 107.54947182059468 ], "wc_review_avg": [ 593.3333333333334,
111.82824131477501 ], "wc_reply_reviewers_avg": [ 19.333333333333332, 15.923427883328248 ], "wc_reply_authors_avg": [ 365.0, 244.7079892443236 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.9449111825230679, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W2R-569uNuoJ:scholar.google.com/&scioq=Network+robustness+as+a+mathematical+property:+training,+evaluation+and+attack&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Heriot-Watt University;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.hw.ac.uk;https://www.ed.ac.uk", "aff_unique_abbr": "HWU;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "The Unreasonable Effectiveness of Random Pruning: Return of the Most Naive Baseline for Sparse Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6925", "id": "VBZJ_3tz-t", "poster": "", "openreview": "https://openreview.net/forum?id=VBZJ_3tz-t", "slides": "https://iclr.cc/virtual/2022/poster/6925", "video": "https://iclr.cc/virtual/2022/poster/6925", "author_site": "Shiwei Liu, Tianlong Chen, Xiaohan Chen, Li Shen, Decebal Mocanu, Zhangyang Wang, Mykola Pechenizkiy", "tldr": "", "abstract": "Random pruning is arguably the most naive way to attain sparsity in neural networks, but has been deemed uncompetitive by either post-training pruning or sparse training. In this paper, we focus on sparse training and highlight a perhaps counter-intuitive finding, that random pruning at initialization can be quite powerful for the sparse training of modern neural networks. Without any delicate pruning criteria or carefully pursued sparsity structures, we empirically demonstrate that sparsely training a randomly pruned network from scratch can match the performance of its dense equivalent. There are two key factors that contribute to this revival: (i) $the network sizes matter$: as the original dense networks grow wider and deeper, the performance of training a randomly pruned sparse network will quickly grow to match that of its dense equivalent, even at high sparsity ratios; (ii) $appropriate layer-wise sparsity ratios$ can be pre-chosen for sparse training, which proves to be another important performance booster. Simple as it looks, a randomly pruned subnetwork of Wide ResNet-50 can be sparsely trained to outperform a dense Wide ResNet-50 on ImageNet. We also observed that such randomly pruned networks outperform dense counterparts in other favorable aspects, such as out-of-distribution detection, uncertainty estimation, and adversarial robustness. Overall, our results strongly suggest there is larger-than-expected room for sparse training at scale, and the benefits of sparsity might be more universal beyond carefully designed pruning. 
Our source code can be found at https://github.com/VITA-Group/Random_Pruning.\n", "keywords": "random pruning;sparse training;static sparse training;layer-wise sparsities;dynamic sparse training", "primary_area": "", "supplementary_material": "/attachment/c273399dc2db71f26a07314992eb81c16cb9f3a0.zip", "author": "Shiwei Liu;Tianlong Chen;Xiaohan Chen;Li Shen;Decebal Constantin Mocanu;Zhangyang Wang;Mykola Pechenizkiy", "authorids": "~Shiwei_Liu2;~Tianlong_Chen1;~Xiaohan_Chen1;~Li_Shen1;~Decebal_Constantin_Mocanu1;~Zhangyang_Wang1;~Mykola_Pechenizkiy1", "gender": "M;M;M;M;M;M;M", "homepage": "https://shiweiliuiiiiiii.github.io/;https://tianlong-chen.github.io;http://xiaohanchen.com;https://sites.google.com/site/mathshenli/home;https://wwwen.uni.lu/recherche/fstm/dcs/members/decebal_constantin_mocanu;https://vita-group.github.io;http://www.win.tue.nl/~mpechen/", "dblp": "234/8697-3.html;;94/3802;91/3680-8;133/7764;119/4026;37/4649", "google_scholar": "73IbXtsAAAAJ;LE3ctn0AAAAJ;https://scholar.google.com/citations?authuser=1;yVhgENIAAAAJ;RlQgUwEAAAAJ;pxFyKAIAAAAJ;https://scholar.google.com.tw/citations?user=F0uFT_kAAAAJ", "orcid": ";0000-0001-7774-8197;0000-0002-0360-0402;;0000-0002-5636-7683;;0000-0003-4955-0743", "linkedin": ";tianlong-chen-783862167/;xiaohan-chen-400b00147/;;;;mpechen/", "or_profile": "~Shiwei_Liu2;~Tianlong_Chen1;~Xiaohan_Chen1;~Li_Shen1;~Decebal_Constantin_Mocanu1;~Zhangyang_Wang1;~Mykola_Pechenizkiy1", "aff": "Eindhoven University of Technology;University of Texas, Austin;University of Texas, Austin;JD Explore Academy;University of Twente;University of Texas, Austin;Eindhoven University of Technology", "aff_domain": "tue.nl;utexas.edu;utexas.edu;jd.com;utwente.nl;utexas.edu;tue.nl", "position": "PhD student;PhD student;PhD student;Researcher;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nliu2022the,\ntitle={The Unreasonable Effectiveness of Random Pruning: Return of the Most Naive Baseline for Sparse Training},\nauthor={Shiwei Liu and Tianlong Chen and Xiaohan Chen and Li Shen and Decebal Constantin Mocanu and Zhangyang Wang and Mykola Pechenizkiy},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VBZJ_3tz-t}\n}", "github": "", "project": "", "reviewers": "WXbx;Gr4w;GL4s;nAhU", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;5;4;4", "correctness": "4;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "158;67;89;45", "wc_summary_review": "177;17;41;46", "wc_main_review": "644;649;220;386", "wc_review": "979;733;350;477", "wc_reply_reviewers": "266;151;29;0", "wc_reply_authors": "1143;1288;281;583", "reply_reviewers": "2;1;1;0", "reply_authors": "3;4;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 89.75, 42.363752194535365 ], "wc_summary_review_avg": [ 70.25, 62.599420923839226 ], "wc_main_review_avg": [ 474.75, 181.50946944994357 ], "wc_review_avg": [ 634.75, 241.9342627657356 ], "wc_reply_reviewers_avg": [ 111.5, 105.67520995957378 ], "wc_reply_authors_avg": [ 823.75, 409.2635917107702 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 
-0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15333598630551716586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=VBZJ_3tz-t", "email": "tue.nl;utexas.edu;utexas.edu;jd.com;utwente.nl;utexas.edu;tue.nl", "author_num": 7, "aff_unique_index": "0;1;1;2;3;1;0", "aff_unique_norm": "Eindhoven University of Technology;University of Texas at Austin;JD;University of Twente", "aff_unique_dep": ";;JD Explore Academy;", "aff_unique_url": "https://www.tue.nl;https://www.utexas.edu;;https://www.utwente.nl", "aff_unique_abbr": "TU/e;UT Austin;;UT", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1;0;1;0", "aff_country_unique": "Netherlands;United States;" }, { "id": "VCD05OEn7r", "title": "CAGE: Probing Causal Relationships in Deep Generative Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep generative models excel at generating complex, high-dimensional data, often exhibiting impressive generalization beyond the training distribution. The learning principle for these models is however purely based on statistical objectives and it is unclear to what extent such models have internalized the causal relationships present in the training data, if at all. With increasing real-world deployments, such a causal understanding of generative models is essential for interpreting and controlling their use in high-stake applications that require synthetic data generation. We propose CAGE, a framework for inferring the cause-effect relationships governing deep generative models. CAGE employs careful geometrical manipulations within the latent space of a generative model for generating counterfactuals and estimating unit-level generative causal effects. CAGE does not require any modifications to the training procedure and can be used with any existing pretrained latent variable model. Moreover, the pretraining can be completely unsupervised and does not require any treatment or outcome labels. 
Empirically, we demonstrate the use of CAGE for: (a) inferring cause-effect relationships within a deep generative model trained on both synthetic and high resolution images, and (b) guiding data augmentations for robust classification where CAGE achieves improvements over current default approaches on image datasets.", "keywords": "Generative Models;Causality", "primary_area": "", "supplementary_material": "", "author": "Joey Bose;Ricardo Pio Monti;Aditya Grover", "authorids": "~Joey_Bose1;~Ricardo_Pio_Monti1;~Aditya_Grover1", "gender": "M;M;M", "homepage": "https://joeybose.github.io/;;https://aditya-grover.github.io", "dblp": "174/3372;;162/5052", "google_scholar": "ybPyI7IAAAAJ;https://scholar.google.co.uk/citations?hl=en;oOhnPUgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Joey_Bose1;~Ricardo_Pio_Monti1;~Aditya_Grover1", "aff": "McGill University and Mila;University College London;University of California, Los Angeles", "aff_domain": "mcgill.ca;ucl.ac.uk;ucla.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@misc{\nbose2022cage,\ntitle={{CAGE}: Probing Causal Relationships in Deep Generative Models},\nauthor={Joey Bose and Ricardo Pio Monti and Aditya Grover},\nyear={2022},\nurl={https://openreview.net/forum?id=VCD05OEn7r}\n}", "github": "", "project": "", "reviewers": "xP5d;TgM5;Ak2p;AcYm", "site": "https://openreview.net/forum?id=VCD05OEn7r", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;4;4", "correctness": "2;3;4;3", "technical_novelty": "3;1;3;4", "empirical_novelty": "3;0;3;4", "wc_summary_paper": "145;104;68;68", "wc_summary_review": "140;22;16;14", "wc_main_review": "354;483;423;149", "wc_review": "639;609;507;231", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1515;2014;1190;898", "reply_reviewers": "0;0;0;0", "reply_authors": "4;5;3;3", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 96.25, 31.751968442917047 ], "wc_summary_review_avg": [ 48.0, 53.19774431308154 ], "wc_main_review_avg": [ 352.25, 125.9114272018231 ], "wc_review_avg": [ 496.5, 160.9060284762507 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1404.25, 414.20186805469626 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AQs2Y0Vhk8AJ:scholar.google.com/&scioq=CAGE:+Probing+Causal+Relationships+in+Deep+Generative+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "McGill University;University College London;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.mcgill.ca;https://www.ucl.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "McGill;UCL;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Canada;United Kingdom;United States" }, { "id": "VDdDvnwFoyM", "title": "TimeVAE: A Variational Auto-Encoder for Multivariate Time Series Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent work in synthetic data generation in the time-series domain has focused on the use of Generative Adversarial 
Networks. We propose a novel architecture for synthetically generating time-series data with the use of Variational Auto-Encoders (VAEs). The proposed architecture has several distinct properties: interpretability, ability to encode domain knowledge, and reduced training times. We evaluate data generation quality by similarity and predictability against four multivariate datasets. We experiment with varying sizes of training data to measure the impact of data availability on generation quality for our VAE method as well as several state-of-the-art data generation methods. Our results on similarity tests show that the VAE approach is able to accurately represent the temporal attributes of the original data. On next-step prediction tasks using generated data, the proposed VAE architecture consistently meets or exceeds performance of state-of-the-art data generation methods. While noise reduction may cause the generated data to deviate from original data, we demonstrate the resulting de-noised data can significantly improve performance for next-step prediction using generated data. Finally, the proposed architecture can incorporate domain-specific time-patterns such as polynomial trends and seasonalities to provide interpretable outputs. Such interpretability can be highly advantageous in applications requiring transparency of model outputs or where users desire to inject prior knowledge of time-series patterns into the generative model.", "keywords": "VAE;Variational Auto Encoder;Time Series;Data Generation;GAN;Generative Adversarial Network", "primary_area": "", "supplementary_material": "/attachment/571600156cf59eaed6b996a818f72f319804b239.zip", "author": "Abhyuday Desai;Cynthia Freeman;Zuhui Wang;Ian Beaver", "authorids": "~Abhyuday_Desai1;~Cynthia_Freeman1;wzhings@gmail.com;~Ian_Beaver1", "gender": ";F;;", "homepage": "https://www.linkedin.com/in/abhyuday-desai-ai/;;;", "dblp": ";178/3290;;130/3669", "google_scholar": "n3B2zmwAAAAJ;;;zz2PF00AAAAJ", "orcid": ";;;0000-0003-0865-1214", "linkedin": "abhyuday-desai-ai/;;;", "or_profile": "~Abhyuday_Desai1;~Cynthia_Freeman1;wzhings@gmail.com;~Ian_Beaver1", "aff": "Ready Tensor, Inc.;;;Verint Systems Inc", "aff_domain": "readytensor.ai;;;verint.com", "position": "CEO;;;Principal Researcher", "bibtex": "@misc{\ndesai2022timevae,\ntitle={Time{VAE}: A Variational Auto-Encoder for Multivariate Time Series Generation},\nauthor={Abhyuday Desai and Cynthia Freeman and Zuhui Wang and Ian Beaver},\nyear={2022},\nurl={https://openreview.net/forum?id=VDdDvnwFoyM}\n}", "github": "", "project": "", "reviewers": "f3YU;CnbM;X2k3;Gi2q", "site": "https://openreview.net/forum?id=VDdDvnwFoyM", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "3;2;1;3", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "35;111;46;56", "wc_summary_review": "71;111;23;77", "wc_main_review": "113;601;301;225", "wc_review": "219;823;370;358", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.0, 29.248931604419333 ], "wc_summary_review_avg": [ 70.5, 31.38072656902641 ], "wc_main_review_avg": [ 310.0, 180.8286481727937 ], "wc_review_avg": [ 442.5, 227.5571356824479 ], "wc_reply_reviewers_avg": [ 0, 
0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 153, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14206760136936331755&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Ready Tensor, Inc.;Verint Systems", "aff_unique_dep": ";", "aff_unique_url": ";https://www.verint.com", "aff_unique_abbr": ";Verint", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Fast Samplers for Diffusion Models by Differentiating Through Sample Quality", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6174", "id": "VFBjuF8HEp", "poster": "", "openreview": "https://openreview.net/forum?id=VFBjuF8HEp", "slides": "https://iclr.cc/virtual/2022/poster/6174", "video": "https://iclr.cc/virtual/2022/poster/6174", "author_site": "Daniel Watson, William Chan, Jonathan Ho, Mohammad Norouzi", "tldr": "", "abstract": "Diffusion models have emerged as an expressive family of generative models rivaling GANs in sample quality and autoregressive models in likelihood scores. Standard diffusion models typically require hundreds of forward passes through the model to generate a single high-fidelity sample. We introduce Differentiable Diffusion Sampler Search (DDSS): a method that optimizes fast samplers for any pre-trained diffusion model by differentiating through sample quality scores. We also present Generalized Gaussian Diffusion Models (GGDM), a family of flexible non-Markovian samplers for diffusion models. We show that optimizing the degrees of freedom of GGDM samplers by maximizing sample quality scores via gradient descent leads to improved sample quality. Our optimization procedure backpropagates through the sampling process using the reparametrization trick and gradient rematerialization. DDSS achieves strong results on unconditional image generation across various datasets (e.g., FID scores on LSUN church 128x128 of 11.6 with only 10 inference steps, and 4.82 with 20 steps, compared to 51.1 and 14.9 with strongest DDPM/DDIM baselines). 
Our method is compatible with any pre-trained diffusion model without fine-tuning or re-training required.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Watson;William Chan;Jonathan Ho;Mohammad Norouzi", "authorids": "~Daniel_Watson1;~William_Chan1;~Jonathan_Ho1;~Mohammad_Norouzi1", "gender": ";;;M", "homepage": ";http://williamchan.ca;;https://norouzi.github.io/", "dblp": ";58/2301;80/8677;https://dblp.org/pers/hd/n/Norouzi_0002:Mohammad", "google_scholar": "_pKKv2QAAAAJ;Nla9qfUAAAAJ;iVLAQysAAAAJ;Lncr-VoAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Daniel_Watson1;~William_Chan1;~Jonathan_Ho1;~Mohammad_Norouzi1", "aff": "Google;Google Brain;Google;Google Brain", "aff_domain": "google.com;google.com;google.com;google.com", "position": "Researcher;Research Scientist;Researcher;Research Scientist", "bibtex": "@inproceedings{\nwatson2022learning,\ntitle={Learning Fast Samplers for Diffusion Models by Differentiating Through Sample Quality},\nauthor={Daniel Watson and William Chan and Jonathan Ho and Mohammad Norouzi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VFBjuF8HEp}\n}", "github": "", "project": "", "reviewers": "vLgy;u4ae;owpU;zfCn", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;3", "correctness": "4;3;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "4;2;3;3", "wc_summary_paper": "119;84;428;227", "wc_summary_review": "95;245;100;77", "wc_main_review": "494;275;557;364", "wc_review": "708;604;1085;668", "wc_reply_reviewers": "0;0;370;0", "wc_reply_authors": "359;445;644;249", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 214.5, 134.06062061619735 ], "wc_summary_review_avg": [ 129.25, 67.37349256198613 ], "wc_main_review_avg": [ 422.5, 109.97840697155056 ], "wc_review_avg": [ 766.25, 187.7316901857542 ], "wc_reply_reviewers_avg": [ 92.5, 160.21469970012114 ], "wc_reply_authors_avg": [ 424.25, 144.64676802472982 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=348650318710260128&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=VFBjuF8HEp", "email": "google.com;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "VFDDn-7_NRZ", "title": "Sliced Recursive Transformer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a neat yet effective recursive operation on vision transformers that can improve parameter utilization without involving additional parameters. This is achieved by sharing weights across depth of transformer networks. 
The proposed method can obtain a substantial gain (~2%) simply by using a naive recursive operation, requires no special or sophisticated knowledge of network design principles, and introduces minimal computational overhead to the training procedure. To reduce the additional computation caused by the recursive operation while maintaining the superior accuracy, we propose an approximating method through multiple sliced group self-attentions across recursive layers, which can reduce the computational cost by 10~30% with minimal performance loss. We call our model Sliced Recursive Transformer (SReT), which is compatible with a broad range of other designs for efficient vision transformers. Our best model establishes significant improvement on ImageNet over state-of-the-art methods while containing fewer parameters. The proposed sliced recursive operation allows us to build a transformer with more than 100 or even 1000 layers effortlessly under a still small size (13~15M), to avoid difficulties in optimization when the model size is too large. The flexible scalability has shown great potential for scaling up and constructing extremely deep and large-dimensionality vision transformers.", "keywords": "Recursive Operation;Vision Transformer;Efficient Model;Approximating Self-Attention;Sliced Group Self-Attention", "primary_area": "", "supplementary_material": "", "author": "Zhiqiang Shen;Zechun Liu;Eric Xing", "authorids": "~Zhiqiang_Shen1;~Zechun_Liu1;~Eric_Xing1", "gender": ";;M", "homepage": ";;http://www.cs.cmu.edu/~epxing/", "dblp": ";;36/3855", "google_scholar": ";;https://scholar.google.com.tw/citations?user=5pKTRxEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhiqiang_Shen1;~Zechun_Liu1;~Eric_Xing1", "aff": ";;School of Computer Science, Carnegie Mellon University", "aff_domain": ";;cs.cmu.edu", "position": ";;Full Professor", "bibtex": "@misc{\nshen2022sliced,\ntitle={Sliced Recursive Transformer},\nauthor={Zhiqiang Shen and Zechun Liu and Eric Xing},\nyear={2022},\nurl={https://openreview.net/forum?id=VFDDn-7_NRZ}\n}", "github": "", "project": "", "reviewers": "jogr;7hxC;hbdy;WxDB;wt6c", "site": "https://openreview.net/forum?id=VFDDn-7_NRZ", "pdf_size": 0, "recommendation": "3;5;5;5;6", "confidence": "4;4;3;4;4", "correctness": "2;3;3;2;3", "technical_novelty": "2;3;2;2;2", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "58;97;73;85;79", "wc_summary_review": "55;24;16;28;13", "wc_main_review": "428;332;255;337;153", "wc_review": "541;453;344;450;245", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.8, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 78.4, 12.92439553712281 ], "wc_summary_review_avg": [ 27.2, 14.905032707109367 ], "wc_main_review_avg": [ 301.0, 92.09343081892432 ], "wc_review_avg": [ 406.6, 102.11092008203627 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.10206207261596575, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6881440757906382227&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, 
"aff_unique_index": "0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Sparse Attention with Learning to Hash", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5999", "id": "VGnOJhd5Q1q", "poster": "", "openreview": "https://openreview.net/forum?id=VGnOJhd5Q1q", "slides": "https://iclr.cc/virtual/2022/poster/5999", "video": "https://iclr.cc/virtual/2022/poster/5999", "author_site": "Zhiqing Sun, Yiming Yang, Shinjae Yoo", "tldr": "", "abstract": "Transformer has become ubiquitous in sequence modeling tasks. As a key component of Transformer, self-attention does not scale to long sequences due to its quadratic time and space complexity with respect to the sequence length. To tackle this problem, recent work developed dynamic attention sparsification techniques based on Approximate Nearest Neighbor (ANN) methods, where similar queries and keys are allocated to the same hash bucket with high probability. However, the effectiveness of those ANN methods relies on the assumption that queries and keys should lie in the same space, which is not well justified. Besides, some of the ANN methods such as Locality-Sensitive Hashing (LSH) are randomized and cannot fully utilize the available real data distributions. To overcome these issues, this paper proposes a new strategy for sparse attention, namely LHA (Learning-to-Hash Attention), which directly learns separate parameterized hash functions for queries and keys, respectively. Another advantage of LHA is that it does not impose extra constraints for queries and keys, which makes it applicable to the wide range of pre-trained Transformer models. Our experiments on evaluation of the WikiText-103 dataset for language modeling, the GLUE benchmark for natural language understanding, and the Lang-Range-Arena benchmark for multiple tasks (text/image classification, retrieval, etc.) 
show the superior performance of LHA over other strong Transformer variants.", "keywords": "Sparse Attention;Transformer;Learning-to-Hash;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Zhiqing Sun;Yiming Yang;Shinjae Yoo", "authorids": "~Zhiqing_Sun1;~Yiming_Yang1;~Shinjae_Yoo1", "gender": "M;F;M", "homepage": "https://www.cs.cmu.edu/~zhiqings/;http://www.cs.cmu.edu/~yiming/;", "dblp": "211/7692;25/1666;69/1062", "google_scholar": "https://scholar.google.com/citations?hl=en;MlZq4XwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0001-8322-607X;", "linkedin": "zhiqing-sun-5781b3100/;yiming-yang-24100924/;", "or_profile": "~Zhiqing_Sun1;~Yiming_Yang1;~Shinjae_Yoo1", "aff": "Carnegie Mellon University;School of Computer Science, Carnegie Mellon University;Brookhaven National Lab", "aff_domain": "cs.cmu.edu;cs.cmu.edu;bnl.gov", "position": "PhD student;Full Professor;Scientist", "bibtex": "@inproceedings{\nsun2022sparse,\ntitle={Sparse Attention with Learning to Hash},\nauthor={Zhiqing Sun and Yiming Yang and Shinjae Yoo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VGnOJhd5Q1q}\n}", "github": "", "project": "", "reviewers": "wDdo;fhAP;zEkG", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "20;56;142", "wc_summary_review": "45;69;67", "wc_main_review": "195;314;660", "wc_review": "260;439;869", "wc_reply_reviewers": "0;501;278", "wc_reply_authors": "360;550;1246", "reply_reviewers": "0;1;1", "reply_authors": "2;1;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 72.66666666666667, 51.181593913784624 ], "wc_summary_review_avg": [ 60.333333333333336, 10.873004286866726 ], "wc_main_review_avg": [ 389.6666666666667, 197.23139258805182 ], "wc_review_avg": [ 522.6666666666666, 255.5651689013109 ], "wc_reply_reviewers_avg": [ 259.6666666666667, 204.94281045100254 ], "wc_reply_authors_avg": [ 718.6666666666666, 380.8633467385499 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3273268353539886, "corr_recommendation_correctness": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5211673172629196703&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=VGnOJhd5Q1q", "email": "cs.cmu.edu;cs.cmu.edu;bnl.gov", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Carnegie Mellon University;Brookhaven National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.bnl.gov", "aff_unique_abbr": "CMU;BNL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "VINWzIM6_6", "title": "Contrastive Representation Learning for 3D Protein Structures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning from 3D protein structures has gained a lot of attention in the fields of protein modeling and structural bioinformatics. 
Unfortunately, the number of available structures is orders of magnitude lower than the number of available protein sequences. Moreover, this number is reduced even more when only annotated protein structures are considered. This makes the training of existing models difficult and prone to overfitting. To address this limitation, we introduce a new representation learning framework for 3D protein structures. Our framework uses unsupervised contrastive learning to learn meaningful representations of protein structures making use of annotated and un-annotated proteins from the Protein Data Bank. We show how these representations can be used to directly solve different tasks in the field of structural bioinformatics, such as protein function and protein structural similarity prediction. Moreover, we show how fine-tuned networks, pre-trained with our algorithm, lead to significantly improved task performance.", "keywords": "representation learning;structural bioinformatics;proteins", "primary_area": "", "supplementary_material": "", "author": "Pedro Hermosilla;Timo Ropinski", "authorids": "~Pedro_Hermosilla1;~Timo_Ropinski2", "gender": "M;M", "homepage": "https://phermosilla.github.io/;https://viscom.uni-ulm.de/members/timo-ropinski/", "dblp": "170/7065;92/5590", "google_scholar": "C7F4B6MAAAAJ;FuY-lbcAAAAJ", "orcid": ";0000-0002-7857-5512", "linkedin": ";", "or_profile": "~Pedro_Hermosilla1;~Timo_Ropinski2", "aff": "Ulm University;Ulm University", "aff_domain": "uni-ulm.de;uni-ulm.de", "position": "Postdoc;Full Professor", "bibtex": "@misc{\nhermosilla2022contrastive,\ntitle={Contrastive Representation Learning for 3D Protein Structures},\nauthor={Pedro Hermosilla and Timo Ropinski},\nyear={2022},\nurl={https://openreview.net/forum?id=VINWzIM6_6}\n}", "github": "", "project": "", "reviewers": "xGdi;c8cK;ncDp;Ayvu", "site": "https://openreview.net/forum?id=VINWzIM6_6", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;4;4", "correctness": "2;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "58;37;100;76", "wc_summary_review": "38;62;13;53", "wc_main_review": "457;313;560;325", "wc_review": "553;412;673;454", "wc_reply_reviewers": "55;0;0;0", "wc_reply_authors": "640;495;807;548", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.75, 23.177305710543667 ], "wc_summary_review_avg": [ 41.5, 18.553975315279473 ], "wc_main_review_avg": [ 413.75, 101.59570611005172 ], "wc_review_avg": [ 523.0, 100.60069582264329 ], "wc_reply_reviewers_avg": [ 13.75, 23.81569860407206 ], "wc_reply_authors_avg": [ 622.5, 118.48312116077969 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": 0.7385489458759963, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3122018777985929046&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Ulm University", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-ulm.de/", "aff_unique_abbr": "U Ulm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", 
"aff_country_unique": "Germany" }, { "id": "VKtGrkUvCR", "title": "Only tails matter: Average-Case Universality and Robustness in the Convex Regime", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent works have studied the average convergence properties of first-order optimization methods on distributions of quadratic problems. The average-case framework allows a more fine-grained and representative analysis of convergence than usual worst-case results, in exchange for a more precise hypothesis over the data generating process, namely assuming knowledge of the expected spectral distribution (e.s.d) of the random matrix associated with the problem. In this work, we show that a problem's asymptotic average complexity is determined by the concentration of eigenvalues near the edges of the e.s.d. We argue that having \u00e0 priori information on this concentration is a more grounded assumption than complete knowledge of the e.s.d., and that basing our analysis on the approximate concentration is effectively a middle ground between the coarseness of the worst-case convergence and this more unrealistic hypothesis. We introduce the Generalized Chebyshev method, asymptotically optimal under a hypothesis on this concentration, and globally optimal when the e.s.d. follows a Beta distribution. We compare its performance to classical optimization algorithms, such as Gradient Descent or Nesterov's scheme, and we show that, asymptotically, Nesterov's method is universally nearly-optimal in the average-case.", "keywords": "optimization;average-case;first-order;random matrix theory;nesterov", "primary_area": "", "supplementary_material": "", "author": "Leonardo Cunha;Gauthier Gidel;Fabian Pedregosa;Courtney Paquette;Damien Scieur", "authorids": "~Leonardo_Cunha1;~Gauthier_Gidel1;~Fabian_Pedregosa1;~Courtney_Paquette1;~Damien_Scieur1", "gender": "M;M;M;F;M", "homepage": ";https://gauthiergidel.github.io/;http://fa.bianp.net;https://cypaquette.github.io/;https://damienscieur.com/", "dblp": ";188/6326;11/9764;https://dblp.uni-trier.de/pers/hd/p/Paquette:Courtney;191/6712", "google_scholar": ";https://scholar.google.fr/citations?user=bDrXQPUAAAAJ;https://scholar.google.fr/citations?hl=en;EkeZG30AAAAJ;https://scholar.google.fr/citations?user=hNscQzgAAAAJ", "orcid": ";;0000-0003-4025-3953;;", "linkedin": "leonardo-cunha-845589b7/;;http://www.linkedin.com/in/fabianpedregosa;;damien-scieur-6873ba82/", "or_profile": "~Leonardo_Cunha1;~Gauthier_Gidel1;~Fabian_Pedregosa1;~Courtney_Yumiko_Paquette1;~Damien_Scieur3", "aff": ";Mila - Quebec Artificial Intelligence Institute;Google AI;Google;Samsung", "aff_domain": ";mila.quebec;google.com;google.com;samsung.com", "position": ";Assistant Professor;Research Scientist;Research Scientist;Researcher", "bibtex": "@misc{\ncunha2022only,\ntitle={Only tails matter: Average-Case Universality and Robustness in the Convex Regime},\nauthor={Leonardo Cunha and Gauthier Gidel and Fabian Pedregosa and Courtney Paquette and Damien Scieur},\nyear={2022},\nurl={https://openreview.net/forum?id=VKtGrkUvCR}\n}", "github": "", "project": "", "reviewers": "Tq5M;poH2;VZRV;KTu2", "site": "https://openreview.net/forum?id=VKtGrkUvCR", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "3;4;4;4", "correctness": "4;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "32;259;39;79", "wc_summary_review": "32;46;50;50", "wc_main_review": "105;393;209;342", "wc_review": "169;698;298;471", "wc_reply_reviewers": "0;80;0;0", 
"wc_reply_authors": "226;318;414;298", "reply_reviewers": "0;1;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 102.25, 92.25880716766287 ], "wc_summary_review_avg": [ 44.5, 7.399324293474371 ], "wc_main_review_avg": [ 262.25, 112.93665259781697 ], "wc_review_avg": [ 409.0, 198.29649517830617 ], "wc_reply_reviewers_avg": [ 20.0, 34.64101615137755 ], "wc_reply_authors_avg": [ 314.0, 67.11184694225007 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=936811545281060495&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Quebec Artificial Intelligence Institute;Google;Samsung", "aff_unique_dep": "Artificial Intelligence;Google AI;Samsung", "aff_unique_url": "https://mila.quebec;https://ai.google;https://www.samsung.com", "aff_unique_abbr": "Mila;Google AI;Samsung", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "Canada;United States;South Korea" }, { "title": "Implicit Bias of MSE Gradient Optimization in Underparameterized Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7090", "id": "VLgmhQDVBV", "poster": "", "openreview": "https://openreview.net/forum?id=VLgmhQDVBV", "slides": "https://iclr.cc/virtual/2022/poster/7090", "video": "https://iclr.cc/virtual/2022/poster/7090", "author_site": "Benjamin Bowman, Guido Montufar", "tldr": "", "abstract": "We study the dynamics of a neural network in function space when optimizing the mean squared error via gradient flow. We show that in the underparameterized regime the network learns eigenfunctions of an integral operator $T_K$ determined by the Neural Tangent Kernel at rates corresponding to their eigenvalues. For example, for uniformly distributed data on the sphere $S^{d - 1}$ and rotation invariant weight distributions, the eigenfunctions of $T_K$ are the spherical harmonics. Our results can be understood as describing a spectral bias in the underparameterized regime. The proofs use the concept of ``Damped Deviations'' where deviations of the NTK matter less for eigendirections with large eigenvalues. Aside from the underparameterized regime, the damped deviations point-of-view allows us to extend certain results in the literature in the overparameterized setting. 
", "keywords": "underparameterized regime;spectral bias;neural tangent kernel;implicit bias;implicit regularization;gradient flow", "primary_area": "", "supplementary_material": "", "author": "Benjamin Bowman;Guido Montufar", "authorids": "~Benjamin_Bowman1;~Guido_Montufar1", "gender": ";M", "homepage": "https://www.benjamin-bowman.com/;http://www.math.ucla.edu/~montufar/", "dblp": "248/2643;", "google_scholar": "zYZ_FNEAAAAJ;https://scholar.google.de/citations?user=pDIuuVwAAAAJ", "orcid": ";0000-0002-0131-2669", "linkedin": "benjamin-bowman314;", "or_profile": "~Benjamin_Bowman1;~Guido_Montufar1", "aff": "University of California, Los Angeles;UCLA", "aff_domain": "ucla.edu;math.ucla.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbowman2022implicit,\ntitle={Implicit Bias of {MSE} Gradient Optimization in Underparameterized Neural Networks},\nauthor={Benjamin Bowman and Guido Montufar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VLgmhQDVBV}\n}", "github": "", "project": "", "reviewers": "hz1D;VUVP;Q2fJ;bKHH", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;3;3", "correctness": "4;4;4;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "0;0;0;4", "wc_summary_paper": "51;63;149;117", "wc_summary_review": "53;38;94;69", "wc_main_review": "564;421;443;217", "wc_review": "668;522;686;403", "wc_reply_reviewers": "324;86;177;20", "wc_reply_authors": "2092;1469;1275;882", "reply_reviewers": "2;1;1;1", "reply_authors": "5;4;3;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 1.7320508075688772 ], "wc_summary_paper_avg": [ 95.0, 39.87480407475377 ], "wc_summary_review_avg": [ 63.5, 20.74246851269154 ], "wc_main_review_avg": [ 411.25, 124.66830992678132 ], "wc_review_avg": [ 569.75, 115.38278684448561 ], "wc_reply_reviewers_avg": [ 151.75, 114.00520821436186 ], "wc_reply_authors_avg": [ 1429.5, 437.06206653060156 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13886748612780727924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=VLgmhQDVBV", "email": "ucla.edu;math.ucla.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "VMuenFh7IpP", "title": "What Doesn't Kill You Makes You Robust(er): How to Adversarially Train against Data Poisoning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data poisoning is a threat model in which a malicious actor tampers with training data to manipulate outcomes at inference time. A variety of defenses against this threat model have been proposed, but each suffers from at least one of the following flaws: they are easily overcome by adaptive attacks, they severely reduce testing performance, or they cannot generalize to diverse data poisoning threat models. 
Adversarial training, and its variants, are currently considered the only empirically strong defense against (inference-time) adversarial attacks. In this work, we extend the adversarial training framework to defend against (training-time) data poisoning. Our method desensitizes networks to the effects of such attacks by creating poisons during training and injecting them into training batches. \nWe show that this defense withstands adaptive attacks, generalizes to diverse threat models, and incurs a better performance trade-off than previous defenses.", "keywords": "Data Poisoning;Poisoning Defenses;Adversarial Training;Empirical Defenses;Robustness;Security", "primary_area": "", "supplementary_material": "/attachment/43c9e200ab7ed6cedc75a9d89437b158663a82b8.zip", "author": "Jonas Geiping;Liam H Fowl;Gowthami Somepalli;Micah Goldblum;Michael Moeller;Tom Goldstein", "authorids": "~Jonas_Geiping1;~Liam_H_Fowl1;~Gowthami_Somepalli1;~Micah_Goldblum1;~Michael_Moeller1;~Tom_Goldstein1", "gender": "M;;F;;M;M", "homepage": "https://jonasgeiping.github.io/;;https://somepago.github.io/;;http://vsa.informatik.uni-siegen.de;https://www.cs.umd.edu/~tomg/", "dblp": "190/7229;241/6940;286/5012;241/7231;08/5840-1;25/8184", "google_scholar": "https://scholar.google.de/citations?user=206vNCEAAAAJ;IXv3ToAAAAAJ;T2ezBDsAAAAJ;pGDKzuUAAAAJ;https://scholar.google.de/citations?user=sxzdAGUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Jonas_Geiping1;~Liam_H_Fowl1;~Gowthami_Somepalli1;~Micah_Goldblum1;~Michael_Moeller1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;New York University;University of Siegen;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umd.edu;nyu.edu;uni-siegen.de;umd.edu", "position": "Postdoc;PhD student;PhD student;Postdoc;Full Professor;Associate Professor", "bibtex": "@misc{\ngeiping2022what,\ntitle={What Doesn't Kill You Makes You Robust(er): How to Adversarially Train against Data Poisoning},\nauthor={Jonas Geiping and Liam H Fowl and Gowthami Somepalli and Micah Goldblum and Michael Moeller and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=VMuenFh7IpP}\n}", "github": "", "project": "", "reviewers": "E8z9;dQnr;xUH6;pSWq", "site": "https://openreview.net/forum?id=VMuenFh7IpP", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;2;3;3", "correctness": "2;4;4;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "80;30;74;48", "wc_summary_review": "59;38;57;34", "wc_main_review": "685;225;134;274", "wc_review": "824;293;265;356", "wc_reply_reviewers": "212;132;0;196", "wc_reply_authors": "913;399;133;626", "reply_reviewers": "1;1;0;2", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 58.0, 20.149441679609886 ], "wc_summary_review_avg": [ 47.0, 11.113055385446435 ], "wc_main_review_avg": [ 329.5, 211.3060576509817 ], "wc_review_avg": [ 434.5, 227.28011351633913 ], "wc_reply_reviewers_avg": [ 135.0, 83.49251463454674 ], "wc_reply_authors_avg": [ 517.75, 287.26066124688913 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], 
"corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": 0.5488604301969737, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14904439968501323227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "University of Maryland;New York University;University of Siegen", "aff_unique_dep": ";;", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu;https://www.uni-siegen.de", "aff_unique_abbr": "UMD;NYU;Uni Siegen", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;Germany" }, { "id": "VNXYZjGcsty", "title": "Chaining Data - A Novel Paradigm in Artificial Intelligence Exemplified with NMF based Clustering", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the era of artificial intelligence there is an acceleration of high quality inference from the fusion of data and we have overcome the linking challenge associated with higher order features. We have fundamentally linked together tables of databases for clustering algorithms and expect this paradigm and those related to it to produce many new insights. We propose linked view clustering that is an extension of multi-view clustering by adding complementary and consensus information across linked views of each datapoint. While there are many methods, we focus on non-negative matrix factorization combined with the fusion of linking data in a manner that corresponds to extracting knowledge from the multiple tables of a relational database. It is commonplace to identify hashtag communities on social media by word usage, however there exists troves of data not included but could be. We can incorporate locations by hashtag to improve community detection, this is multiNMF or multiview clustering, but we extend this method to beyond the first link. A general artificial intelligence method to incorporate any table that can be chained backwards has not been done before to our knowledge. We call this linked view NMF or chained view clustering and give the algorithms to perform multiplicative updates and the general solution that can be solved using automatic differentiation such as JAX. 
We demonstrate how the equations can be interpreted on synthetic data and how information flows through the links; as a proof of concept on real data, we incorporate word vectors using the method on an authorship clustering dataset.", "keywords": "NMF;clustering;linking data;chaining data", "primary_area": "", "supplementary_material": "", "author": "Norman J Mapes;Sumeet Dua", "authorids": "~Norman_J_Mapes1;~Sumeet_Dua1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "Ve_JjBwAAAAJ;", "orcid": "0000-0003-1890-6452;", "linkedin": ";", "or_profile": "~Norman_J_Mapes1;~Sumeet_Dua1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmapes2022chaining,\ntitle={Chaining Data - A Novel Paradigm in Artificial Intelligence Exemplified with {NMF} based Clustering},\nauthor={Norman J Mapes and Sumeet Dua},\nyear={2022},\nurl={https://openreview.net/forum?id=VNXYZjGcsty}\n}", "github": "", "project": "", "reviewers": "QKqU;k2WG;RkXc", "site": "https://openreview.net/forum?id=VNXYZjGcsty", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "84;67;69", "wc_summary_review": "63;33;59", "wc_main_review": "147;730;163", "wc_review": "294;830;291", "wc_reply_reviewers": "133;321;0", "wc_reply_authors": "164;801;755", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 73.33333333333333, 7.586537784494029 ], "wc_summary_review_avg": [ 51.666666666666664, 13.299958228840001 ], "wc_main_review_avg": [ 346.6666666666667, 271.1362920910605 ], "wc_review_avg": [ 471.6666666666667, 253.38288988976524 ], "wc_reply_reviewers_avg": [ 151.33333333333334, 131.687340149141 ], "wc_reply_authors_avg": [ 573.3333333333334, 290.05095337811866 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oE7ko7ZjT84J:scholar.google.com/&scioq=Chaining+Data+-+A+Novel+Paradigm+in+Artificial+Intelligence+Exemplified+with+NMF+based+Clustering&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "VNdFPD5wqjh", "title": "Generalizable Person Re-identification Without Demographics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generalizable Person Re-Identification (DG ReID) aims to learn ready-to-use cross-domain representations for direct cross-data evaluation. It typically fully exploits demographic information, e.g., domain information and camera IDs, to learn features that are domain-invariant. However, the protected demographic features are often not accessible due to privacy and regulation issues. Under this more realistic setting, distributionally robust optimization (DRO) provides a promising way for learning robust models that are able to perform well on a collection of possible data distributions (the \u201cuncertainty set\u201d) without demographics. 
However, the convex condition of KL DRO may not hold for overparameterized neural networks, so applying KL DRO often fails to generalize under distribution shifts in real scenarios. Instead, by applying the change-of-measure technique and the analytical solution of KL DRO, we propose a simple yet efficient approach, Unit DRO. Unit DRO minimizes the loss over a reweighted dataset where important samples (i.e., samples on which models perform poorly) are upweighted and others are downweighted. Empirical results show that Unit DRO achieves superior performance on large-scale DG ReID and cross-domain ReID benchmarks compared to standard baselines.", "keywords": "Generalizable Person Re-Identification;Distributionally robust optimization", "primary_area": "", "supplementary_material": "/attachment/5705632df9d9223ab0c13efbda20ae462e4cebae.zip", "author": "YiFan Zhang;Feng Li;Zhang Zhang;Liang Wang;Dacheng Tao;Tieniu Tan", "authorids": "~YiFan_Zhang8;~Feng_Li9;~Zhang_Zhang1;~Liang_Wang3;~Dacheng_Tao1;~Tieniu_Tan1", "gender": ";M;;M;;", "homepage": ";https://fengli-ust.github.io/;https://zhangzhang80.github.io/;;;", "dblp": ";92/2954-40.html;94/2468-1;56/4499-1;;", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;rnRNwEMAAAAJ;;;", "orcid": ";;0000-0001-9425-3065;;;", "linkedin": ";;;;;", "or_profile": "~YiFan_Zhang8;~Feng_Li9;~Zhang_Zhang1;~Liang_Wang3;~Dacheng_Tao1;~Tieniu_Tan1", "aff": ";Hong Kong University of Science and Technology;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, CAS, China;;", "aff_domain": ";ust.hk;ia.ac.cn;ia.ac.cn;;", "position": ";PhD student;Associate Professor;Full Professor;;", "bibtex": "@misc{\nzhang2022generalizable,\ntitle={Generalizable Person Re-identification Without Demographics},\nauthor={YiFan Zhang and Feng Li and Zhang Zhang and Liang Wang and Dacheng Tao and Tieniu Tan},\nyear={2022},\nurl={https://openreview.net/forum?id=VNdFPD5wqjh}\n}", "github": "", "project": "", "reviewers": "Cuay;umsc;3Xjf;eLS6", "site": "https://openreview.net/forum?id=VNdFPD5wqjh", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;3;4", "correctness": "2;2;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "47;72;76;37", "wc_summary_review": "75;86;30;44", "wc_main_review": "400;117;180;166", "wc_review": "522;275;286;247", "wc_reply_reviewers": "732;246;0;0", "wc_reply_authors": "1937;1934;450;824", "reply_reviewers": "1;1;0;0", "reply_authors": "3;4;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 58.0, 16.446884203398525 ], "wc_summary_review_avg": [ 58.75, 22.64260364887395 ], "wc_main_review_avg": [ 215.75, 108.91826063613024 ], "wc_review_avg": [ 332.5, 110.32792031031855 ], "wc_reply_reviewers_avg": [ 244.5, 298.8390034784616 ], "wc_reply_authors_avg": [ 1286.25, 662.579193379931 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.16012815380508713, "corr_recommendation_correctness": 0.9198662110077999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12525195099936833347&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;1", "aff_unique_norm": "Hong Kong 
University of Science and Technology;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "https://www.ust.hk;http://www.ia.cas.cn", "aff_unique_abbr": "HKUST;CAS", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Decoupled Adaptation for Cross-Domain Object Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6215", "id": "VNqaB1g9393", "poster": "", "openreview": "https://openreview.net/forum?id=VNqaB1g9393", "slides": "https://iclr.cc/virtual/2022/poster/6215", "video": "https://iclr.cc/virtual/2022/poster/6215", "author_site": "Junguang Jiang, baixu chen, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Cross-domain object detection is more challenging than object classification since multiple objects exist in an image and the location of each object is unknown in the unlabeled target domain. As a result, when we adapt features of different objects to enhance the transferability of the detector, the features of the foreground and the background are easily confused, which may hurt the discriminability of the detector. Besides, previous methods focused on category adaptation but ignored another important part of object detection, i.e., the adaptation of bounding box regression. To this end, we propose D-adapt, namely Decoupled Adaptation, to decouple the adversarial adaptation and the training of the detector. Moreover, we fill the gap of regression domain adaptation in object detection by introducing a bounding box adaptor. Experiments show that D-adapt achieves state-of-the-art results on four cross-domain object detection tasks and yields 17\% and 21\% relative improvement on benchmark datasets Clipart1k and Comic2k in particular.", "keywords": "Object Detection;Domain Adaptation;Object Localization;Deep Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Junguang Jiang;Baixu Chen;Jianmin Wang;Mingsheng Long", "authorids": "~Junguang_Jiang2;~Baixu_Chen2;~Jianmin_Wang1;~Mingsheng_Long5", "gender": "M;M;M;M", "homepage": "https://github.com/tsingcbx99;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;https://junguangjiang.github.io/;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": "279/4076;06/3456-1.html;276/3175;74/9023", "google_scholar": ";https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;dXS9TPUAAAAJ;_MjXpXkAAAAJ", "orcid": ";0000-0001-6841-7943;;0000-0002-5412-9120", "linkedin": ";;;", "or_profile": "~Baixu_Chen2;~Jianmin_Wang1;~junguang_jiang1;~Mingsheng_Long2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Undergrad student;Full Professor;MS student;Associate Professor", "bibtex": "@inproceedings{\njiang2022decoupled,\ntitle={Decoupled Adaptation for Cross-Domain Object Detection},\nauthor={Junguang Jiang and Baixu Chen and Jianmin Wang and Mingsheng Long},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VNqaB1g9393}\n}", "github": "", "project": "", "reviewers": "PJVS;Kz22;LmKN;HwgG", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;4;4", "wc_summary_paper": "103;53;93;52", "wc_summary_review": 
"124;42;17;73", "wc_main_review": "375;268;76;351", "wc_review": "602;363;186;476", "wc_reply_reviewers": "373;126;0;0", "wc_reply_authors": "1441;769;75;307", "reply_reviewers": "1;2;0;0", "reply_authors": "4;3;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 75.25, 23.025800746119558 ], "wc_summary_review_avg": [ 64.0, 39.91866731242415 ], "wc_main_review_avg": [ 267.5, 117.47446531055164 ], "wc_review_avg": [ 406.75, 152.94014352026744 ], "wc_reply_reviewers_avg": [ 124.75, 152.27832248879025 ], "wc_reply_authors_avg": [ 648.0, 521.5601595214113 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15741647354170922060&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=VNqaB1g9393", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "VO7bAwdWRjg", "title": "Fourier Features in Reinforcement Learning with Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In classic Reinforcement Learning (RL), encoding the inputs with a Fourier feature mapping is a standard way to facilitate generalization and add prior domain knowledge. In Deep RL, such input encodings are less common, since they could, in principle, be learned by the network and may therefore seem less beneficial. In this paper, we present experiments on Multilayer Perceptron (MLP) that indicate that even in Deep RL, Fourier features can lead to significant performance gains, in both rewards and sample efficiency. Furthermore, we observe that they increase the robustness with respect to hyperparameters, lead to smoother policies, and benefit the training process by reducing learning interference, encouraging sparsity, and increasing the expressiveness of the learned features. According to our experiments, other input preprocessings, such as random Fourier features or Polynomial features, do not give similar advantages.But a major bottleneck with conventional Fourier features is that they exponentially increase the number of features with the state dimension. We remedy this by proposing a simple, light version that only has a linear number of features, yet still maintains the benefits. Our experiments cover both shallow/deep, discrete/continuous, and on/off-policy RL settings. To the best of our knowledge, this is the first reported application of Fourier features in Deep RL. 
", "keywords": "Deep Reinforcement Learning;Fourier features;interference;sparsity;expressiveness;preprocessing", "primary_area": "", "supplementary_material": "", "author": "David Brellmann;Goran Frehse;David Filliat", "authorids": "~David_Brellmann1;~Goran_Frehse1;~David_Filliat1", "gender": "M;M;M", "homepage": ";https://sites.google.com/site/frehseg/;https://perso.ensta-paris.fr/~filliat/en/", "dblp": ";95/3625;13/5289", "google_scholar": "https://scholar.google.com/citations?hl=fr;IgZwd6MAAAAJ;https://scholar.google.fr/citations?user=Wzq_c20AAAAJ", "orcid": ";0000-0002-5441-0481;0000-0002-5739-1618", "linkedin": "david-brellmann;goran-frehse-84b8311/;", "or_profile": "~David_Brellmann1;~Goran_Frehse1;~David_Filliat1", "aff": "ENSTA Paris;ENSTA Paris;ENSTA Paris", "aff_domain": "ensta-paris.fr;ensta-paris.fr;ensta-paris.fr", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@misc{\nbrellmann2022fourier,\ntitle={Fourier Features in Reinforcement Learning with Neural Networks},\nauthor={David Brellmann and Goran Frehse and David Filliat},\nyear={2022},\nurl={https://openreview.net/forum?id=VO7bAwdWRjg}\n}", "github": "", "project": "", "reviewers": "cDko;R2VE;jmFP;4bfq", "site": "https://openreview.net/forum?id=VO7bAwdWRjg", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;4;4", "correctness": "2;2;2;4", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "80;105;90;126", "wc_summary_review": "59;108;66;85", "wc_main_review": "340;826;566;287", "wc_review": "479;1039;722;498", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 100.25, 17.32591988899868 ], "wc_summary_review_avg": [ 79.5, 19.00657780874821 ], "wc_main_review_avg": [ 504.75, 213.0203922163322 ], "wc_review_avg": [ 684.5, 225.8810527689297 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5670253867571904078&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "\u00c9cole Nationale Sup\u00e9rieure de Techniques Avanc\u00e9es", "aff_unique_dep": "", "aff_unique_url": "https://www.ensta.fr", "aff_unique_abbr": "ENSTA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Self-Supervised Inference in State-Space Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6606", "id": "VPjw9KPWRSK", "poster": "", "openreview": "https://openreview.net/forum?id=VPjw9KPWRSK", "slides": "https://iclr.cc/virtual/2022/poster/6606", "video": "https://iclr.cc/virtual/2022/poster/6606", "author_site": "David Ruhe, Patrick Forr\u00e9", "tldr": "", "abstract": "We perform approximate inference in state-space models with nonlinear state transitions. 
Without parameterizing a generative model, we apply Bayesian update formulas using a local linearity approximation parameterized by neural networks. It is accompanied by a maximum likelihood objective that requires no supervision via uncorrupted observations or ground truth latent states. The optimization backpropagates through a recursion similar to the classical Kalman filter and smoother. Additionally, using an approximate conditional independence, we can perform smoothing without having to parameterize a separate model. In scientific applications, domain knowledge can give a linear approximation of the latent transition maps, which we can easily incorporate into our model. Usage of such domain knowledge is reflected in excellent results (despite our model's simplicity) on the chaotic Lorenz system compared to fully supervised and variational inference methods. Finally, we show competitive results on an audio denoising experiment.\n", "keywords": "self-supervision;inference;state-space model;Kalman filter;recurrent neural network", "primary_area": "", "supplementary_material": "/attachment/bc99a5668a801964b3b7d57eb65d64e2594f690f.zip", "author": "David Ruhe;Patrick Forr\u00e9", "authorids": "~David_Ruhe1;~Patrick_Forr\u00e91", "gender": ";", "homepage": ";", "dblp": "243/3507;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~David_Ruhe1;~Patrick_Forr\u00e91", "aff": "Microsoft;", "aff_domain": "microsoft.com;", "position": "Intern;", "bibtex": "@inproceedings{\nruhe2022selfsupervised,\ntitle={Self-Supervised Inference in State-Space Models},\nauthor={David Ruhe and Patrick Forr{\\'e}},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VPjw9KPWRSK}\n}", "github": "", "project": "", "reviewers": "q53Q;d1x8;SVPv;7PDK", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;3;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "71;33;66;50", "wc_summary_review": "38;37;45;44", "wc_main_review": "481;328;235;89", "wc_review": "590;398;346;183", "wc_reply_reviewers": "34;0;46;25", "wc_reply_authors": "944;640;635;160", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.0, 14.882876066137216 ], "wc_summary_review_avg": [ 41.0, 3.5355339059327378 ], "wc_main_review_avg": [ 283.25, 142.45064934916934 ], "wc_review_avg": [ 379.25, 145.24698792057617 ], "wc_reply_reviewers_avg": [ 26.25, 16.887495373796554 ], "wc_reply_authors_avg": [ 594.75, 280.46869254874065 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4020918215203548179&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=VPjw9KPWRSK", "email": "microsoft.com;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Corporation", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", 
"aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "VQhFC3Ki5C", "title": "DEEP GRAPH TREE NETWORKS", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose Graph Tree Networks (GTree), a self-interpretive deep graph neural network architecture which originates from the tree representation of the graphs. In the tree representation, each node forms its own tree where the node itself is the root node and all its neighbors up to hop-k are the subnodes. Under the tree representation, the message propagates upward from the leaf nodes to the root node naturally and straightforwardly to update the root node's hidden features. This message passing (or neighborhood aggregation) scheme is essentially different from that in the vanilla GCN, GAT and many of their derivatives, and is demonstrated experimentally a superior message passing scheme. Models adopting this scheme has the capability of going deep. Two scalable graph learning models are proposed within this GTree network architecture - Graph Tree Convolution Network (GTCN) and Graph Tree Attention Network (GTAN), with demonstrated state-of-the-art performances on several benchmark datasets. The deep capability is also demonstrated for both models.", "keywords": "graph tree networks;graph tree convolution networks;graph tree attention networks;GNNs", "primary_area": "", "supplementary_material": "/attachment/b652b62f6039b2ff4d60563f58b75c2ce0cc59a3.zip", "author": "Nan Wu;Chaofan Wang", "authorids": "~Nan_Wu3;chaofan.wang1989@gmail.com", "gender": "F;", "homepage": "https://github.com/nanw16;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Nan_Wu3;chaofan.wang1989@gmail.com", "aff": "University of Delaware;", "aff_domain": "udel.edu;", "position": "PhD student;", "bibtex": "@misc{\nwu2022deep,\ntitle={{DEEP} {GRAPH} {TREE} {NETWORKS}},\nauthor={Nan Wu and Chaofan Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=VQhFC3Ki5C}\n}", "github": "", "project": "", "reviewers": "2Sv9;RxA8;T8A2;3AAc", "site": "https://openreview.net/forum?id=VQhFC3Ki5C", "pdf_size": 0, "recommendation": "1;5;5;6", "confidence": "5;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "21;39;63;41", "wc_summary_review": "5;72;45;130", "wc_main_review": "63;69;332;33", "wc_review": "89;180;440;204", "wc_reply_reviewers": "137;0;33;0", "wc_reply_authors": "1370;254;1056;302", "reply_reviewers": "1;0;1;0", "reply_authors": "3;1;2;1", "recommendation_avg": [ 4.25, 1.920286436967152 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 41.0, 14.89966442575134 ], "wc_summary_review_avg": [ 63.0, 45.436769251345325 ], "wc_main_review_avg": [ 124.25, 120.7173869001479 ], "wc_review_avg": [ 228.25, 129.5615201361886 ], "wc_reply_reviewers_avg": [ 42.5, 56.198309583118245 ], "wc_reply_authors_avg": [ 745.5, 480.80011439266525 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9771398364036774, "corr_recommendation_correctness": 0.5261522196019801, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uOecSAlp5tcJ:scholar.google.com/&scioq=DEEP+GRAPH+TREE+NETWORKS&hl=en&as_sdt=0,5", 
"gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Delaware", "aff_unique_dep": "", "aff_unique_url": "https://www.udel.edu", "aff_unique_abbr": "UD", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "VQyHD2R3Aq", "title": "SPIDE: A Purely Spike-based Method for Training Feedback Spiking Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Spiking neural networks (SNNs) with event-based computation are promising brain-inspired models for energy-efficient applications on neuromorphic hardware. However, most supervised SNN training methods require complex computation or impractical neuron models, which hinders them from spike-based energy-efficient training. Among them, the recently proposed method, implicit differentiation on the equilibrium state (IDE), for training feedback SNNs is a promising way that is possible for generalization to locally spike-based learning with flexible network structures. In this paper, we study spike-based implicit differentiation on the equilibrium state (SPIDE) that extends the IDE method for supervised local learning with spikes, which could be possible for energy-efficient training on neuromorphic hardware. Specifically, we first introduce ternary spiking neuron couples that can realize ternary outputs with the common neuron model, and we prove that implicit differentiation can be solved by spikes based on this design. With this approach, the whole training procedure can be made as event-driven spike computation and weights are updated locally with two-stage average firing rates. Then to reduce the approximation error of spikes due to the finite simulation time steps, we propose to modify the resting membrane potential. Based on it, the average firing rate, when viewed as a stochastic estimator, achieves an unbiased estimation of iterative solution for implicit differentiation and the variance of this estimator is reduced. With these key components, we can train SNNs with either feedback or feedforward structures in a small number of time steps. Further, the firing sparsity during training demonstrates the great potential for energy efficiency. Meanwhile, even with these constraints, our trained models could still achieve competitive results on MNIST, CIFAR-10 and CIFAR-100. 
Our proposed method demonstrates the great potential for energy-efficient training of SNNs on neuromorphic hardware.", "keywords": "spiking neural network;equilibrium state;spike-based training method;neuromorphic engineering", "primary_area": "", "supplementary_material": "", "author": "Mingqing Xiao;Qingyan Meng;Zongpeng Zhang;Yisen Wang;Zhouchen Lin", "authorids": "~Mingqing_Xiao1;~Qingyan_Meng1;~Zongpeng_Zhang1;~Yisen_Wang1;~Zhouchen_Lin1", "gender": "M;M;;M;M", "homepage": "https://pkuxmq.github.io/;https://zero-lab-pku.github.io/personwise/mengqingyan/;;https://yisenwang.github.io/;https://zhouchenlin.github.io", "dblp": "19/2900-2;83/8497;303/0919;172/1346-1;l/ZhouchenLin", "google_scholar": "Hvj-WrwAAAAJ;48VBXzUAAAAJ;;uMWPDboAAAAJ;https://scholar.google.com.tw/citations?user=TanjFwoAAAAJ", "orcid": "0000-0001-6191-7726;;0000-0002-4742-1852;;0000-0003-1493-7569", "linkedin": ";;;;", "or_profile": "~Mingqing_Xiao1;~Qingyan_Meng1;~Zongpeng_Zhang1;~Yisen_Wang1;~Zhouchen_Lin1", "aff": "Peking University;The Chinese University of Hong Kong, Shenzhen;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;edu.cn;stu.pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;PhD student;MS student;Assistant Professor;Professor", "bibtex": "@misc{\nxiao2022spide,\ntitle={{SPIDE}: A Purely Spike-based Method for Training Feedback Spiking Neural Networks},\nauthor={Mingqing Xiao and Qingyan Meng and Zongpeng Zhang and Yisen Wang and Zhouchen Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=VQyHD2R3Aq}\n}", "github": "", "project": "", "reviewers": "PJRp;dKfz;FWiT", "site": "https://openreview.net/forum?id=VQyHD2R3Aq", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;2;4", "correctness": "3;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "3;3;2", "wc_summary_paper": "87;103;31", "wc_summary_review": "25;46;34", "wc_main_review": "309;369;798", "wc_review": "421;518;863", "wc_reply_reviewers": "0;316;348", "wc_reply_authors": "1116;2107;2755", "reply_reviewers": "0;1;1", "reply_authors": "3;6;5", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 73.66666666666667, 30.8688984074406 ], "wc_summary_review_avg": [ 35.0, 8.602325267042627 ], "wc_main_review_avg": [ 492.0, 217.7567450160844 ], "wc_review_avg": [ 600.6666666666666, 189.67750408405198 ], "wc_reply_reviewers_avg": [ 221.33333333333334, 157.0505934050836 ], "wc_reply_authors_avg": [ 1992.6666666666667, 673.9853278983321 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 4.666666666666667, 1.247219128924647 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14752066082223758260&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Peking University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.cuhk.edu.cn", "aff_unique_abbr": "Peking U;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": 
"VSu5WrtLK3q", "title": "A Geometric Perspective on Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose a geometrical interpretation of the Variational Autoencoder framework. We show that VAEs naturally unveil a Riemannian structure of the learned latent space. Moreover, we show that using these geometrical considerations can significantly improve the generation from the vanilla VAE which can now compete with more advanced VAE models on four benchmark data sets. In particular, we propose a new way to generate samples consisting in sampling from the uniform distribution deriving intrinsically from the Riemannian manifold learned by a VAE. We also stress the proposed method's robustness in the low data regime which is known as very challenging for deep generative models. Finally, we validate the method on a complex neuroimaging data set combining both high dimensional data and low sample sizes.", "keywords": "Variational Autoencoders;Riemannian geometry", "primary_area": "", "supplementary_material": "/attachment/b5fdb71a62aec33013959887b1ab507a8bd25258.zip", "author": "Cl\u00e9ment Chadebec;Stephanie Allassonniere", "authorids": "~Cl\u00e9ment_Chadebec1;~Stephanie_Allassonniere1", "gender": "M;F", "homepage": "https://clementchadebec.github.io/;https://sites.google.com/site/stephanieallassonniere/", "dblp": "277/1188;", "google_scholar": "c6VZwnwAAAAJ;https://scholar.google.fr/citations?user=9ubMya8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Cl\u00e9ment_Chadebec1;~Stephanie_Allassonniere1", "aff": "Universite de Paris;University Paris Descartes", "aff_domain": "etu.u-paris.fr;parisdescartes.fr", "position": "PhD student;Full Professor", "bibtex": "@misc{\nchadebec2022a,\ntitle={A Geometric Perspective on Variational Autoencoders},\nauthor={Cl{\\'e}ment Chadebec and Stephanie Allassonniere},\nyear={2022},\nurl={https://openreview.net/forum?id=VSu5WrtLK3q}\n}", "github": "", "project": "", "reviewers": "v2S3;BQpw;yrKH;NpUX;UafD", "site": "https://openreview.net/forum?id=VSu5WrtLK3q", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "4;3;3;3;3", "correctness": "2;3;4;3;4", "technical_novelty": "3;4;2;2;4", "empirical_novelty": "2;3;2;2;4", "wc_summary_paper": "57;54;62;138;72", "wc_summary_review": "170;46;68;186;61", "wc_main_review": "287;484;226;686;230", "wc_review": "514;584;356;1010;363", "wc_reply_reviewers": "70;278;115;0;16", "wc_reply_authors": "289;753;296;806;173", "reply_reviewers": "1;1;1;0;1", "reply_authors": "1;3;1;1;1", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 3.0, 0.8944271909999159 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 76.6, 31.302396074422163 ], "wc_summary_review_avg": [ 106.2, 59.27022861437266 ], "wc_main_review_avg": [ 382.6, 178.48428502251957 ], "wc_review_avg": [ 565.4, 238.94401017811686 ], "wc_reply_reviewers_avg": [ 95.8, 99.7765503512724 ], "wc_reply_authors_avg": [ 463.4, 262.3025733766255 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.4, 0.8000000000000002 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.2500000000000001, "corr_recommendation_correctness": 0.5345224838248488, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12467131101745386543&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1", 
"aff_unique_norm": "Universit\u00e9 de Paris;University Paris Descartes", "aff_unique_dep": ";", "aff_unique_url": "https://www.universitedeparis.fr;https://www.univ-paris5.fr", "aff_unique_abbr": "UP;UPD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "VTGygqhwRXX", "title": "An Optics Controlling Environment and Reinforcement Learning Benchmarks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep reinforcement learning has the potential to address various scientific problems. In this paper, we implement an optics simulation environment for reinforcement learning based controllers. The environment incorporates nonconvex and nonlinear optical phenomena as well as more realistic time-dependent noise. Then we provide the benchmark results of several state-of-the-art reinforcement learning algorithms on the proposed simulation environment. In the end, we discuss the difficulty of controlling the real-world optical environment with reinforcement learning algorithms. We will make the code of the paper publicly available. ", "keywords": "Reinforcement learning;Optical simulation;Machine Learning for Optics", "primary_area": "", "supplementary_material": "/attachment/94310cf932a980c705ecd6106b3da71c125e84b9.zip", "author": "ABULIKEMU ABUDUWEILI", "authorids": "~ABULIKEMU_ABUDUWEILI1", "gender": "M", "homepage": "https://walleclipse.github.io/", "dblp": "245/8652", "google_scholar": "6Oro5g8AAAAJ", "orcid": "", "linkedin": "", "or_profile": "~ABULIKEMU_ABUDUWEILI1", "aff": "Carnegie Mellon University", "aff_domain": "andrew.cmu.edu", "position": "PhD student", "bibtex": "@misc{\nabuduweili2022an,\ntitle={An Optics Controlling Environment and Reinforcement Learning Benchmarks},\nauthor={ABULIKEMU ABUDUWEILI},\nyear={2022},\nurl={https://openreview.net/forum?id=VTGygqhwRXX}\n}", "github": "", "project": "", "reviewers": "VNgq;BJW7;WkxX;wk56;zVLc", "site": "https://openreview.net/forum?id=VTGygqhwRXX", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "3;3;3;4;2", "correctness": "4;2;2;3;3", "technical_novelty": "2;3;2;3;2", "empirical_novelty": "2;3;2;0;0", "wc_summary_paper": "168;73;87;49;70", "wc_summary_review": "52;66;45;441;19", "wc_main_review": "217;316;539;100;130", "wc_review": "437;455;671;590;219", "wc_reply_reviewers": "244;254;0;0;0", "wc_reply_authors": "503;438;318;184;274", "reply_reviewers": "2;2;0;0;0", "reply_authors": "3;2;1;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.4, 1.2000000000000002 ], "wc_summary_paper_avg": [ 89.4, 41.13684479879321 ], "wc_summary_review_avg": [ 124.6, 158.9347035735116 ], "wc_main_review_avg": [ 260.4, 158.31942395044265 ], "wc_review_avg": [ 474.4, 154.284931214944 ], "wc_reply_reviewers_avg": [ 99.6, 122.02557109065296 ], "wc_reply_authors_avg": [ 343.4, 114.27265639688262 ], "reply_reviewers_avg": [ 0.8, 0.9797958971132713 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.2635231383473649, "corr_recommendation_correctness": -0.13363062095621214, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:azgLgkM-NYQJ:scholar.google.com/&scioq=An+Optics+Controlling+Environment+and+Reinforcement+Learning+Benchmarks&hl=en&as_sdt=0,5", "gs_version_total": 3, 
"aff_unique_index": "0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Why Propagate Alone? Parallel Use of Labels and Features on Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6772", "id": "VTNjxbFRKly", "poster": "", "openreview": "https://openreview.net/forum?id=VTNjxbFRKly", "slides": "https://iclr.cc/virtual/2022/poster/6772", "video": "https://iclr.cc/virtual/2022/poster/6772", "author_site": "Yangkun Wang, Jiarui Jin, Weinan Zhang, Yang Yongyi, Jiuhai Chen, Quan Gan, Yong Yu, Zheng Zhang, Zengfeng Huang, David Wipf", "tldr": "", "abstract": "One of the challenges of graph-based semi-supervised learning over ordinary supervised learning for classification tasks lies in label utilization. The direct use of ground-truth labels in graphs for training purposes can result in a parametric model learning trivial degenerate solutions (e.g., an identity mapping from input to output). In addressing this issue, a label trick has recently been proposed in the literature and applied to a wide range of graph neural network (GNN) architectures, achieving state-of-the-art results on various datasets. The essential idea is to randomly split the observed labels on the graph and use a fraction of them as input to the model (along with original node features), and predict the remaining fraction. Despite its success in enabling GNNs to propagate features and labels simultaneously, this approach has never been analyzed from a theoretical perspective, nor fully explored across certain natural use cases. In this paper, we demonstrate that under suitable settings, this stochastic trick can be reduced to a more interpretable deterministic form, allowing us to better explain its behavior, including an emergent regularization effect, and motivate broader application scenarios. 
Our experimental results corroborate these analyses while also demonstrating improved node classification performance applying the label trick in new domains.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/8c1d7aa49d4f0aec97df66f4789ca9a817d13a19.zip", "author": "Yangkun Wang;Jiarui Jin;Weinan Zhang;Yang Yongyi;Jiuhai Chen;Quan Gan;Yong Yu;Zheng Zhang;Zengfeng Huang;David Wipf", "authorids": "~Yangkun_Wang1;~Jiarui_Jin1;~Weinan_Zhang1;~Yang_Yongyi1;~Jiuhai_Chen1;~Quan_Gan1;~Yong_Yu1;~Zheng_Zhang1;~Zengfeng_Huang1;~David_Wipf1", "gender": ";M;M;M;M;M;;M;M;M", "homepage": ";https://jinjiarui.github.io/;http://wnzhang.net;https://fftyyy.github.io;https://www.linkedin.com/in/jiuhai-chen-6a486715a/;;https://apex.sjtu.edu.cn/members/yyu;https://shanghai.nyu.edu/academics/faculty/directory/zheng-zhang;https://zengfenghuang.github.io/;http://www.davidwipf.com/", "dblp": ";241/9563;28/10261-1;05/3653;;72/3872;43/5685.html;;97/9726;81/6421", "google_scholar": ";unCPHQEAAAAJ;Qzss0GEAAAAJ;EmL0jD0AAAAJ;;;;https://scholar.google.com.hk/citations?user=k0KiE4wAAAAJ;https://scholar.google.com.hk/citations?user=FwNBuXUAAAAJ;YJx1WSgAAAAJ", "orcid": ";0000-0001-6458-1586;0000-0002-0127-2425;;;0009-0002-0986-457X;0000-0003-4457-2820;;0000-0003-2671-7483;", "linkedin": ";jiarui-jerry-jin-ba4a84176/;;yongyi-yang-528922218/?originalSubdomain=cn;;quan-gan-231992136/;;;;", "or_profile": "~Yangkun_Wang1;~Jiarui_Jin1;~Weinan_Zhang1;~Yang_Yongyi1;~Jiuhai_Chen1;~Quan_Gan1;~Yong_Yu1;~Zheng_Zhang1;~Zengfeng_Huang1;~David_Wipf1", "aff": ";Shanghai Jiaotong University;Shanghai Jiaotong University;Fudan University;University of Maryland, College Park;Amazon;Shanghai Jiaotong University;Amazon;Fudan University;Amazon AI Research Lab", "aff_domain": ";sjtu.edu.cn;sjtu.edu.cn;fudan.edu.cn;umd.edu;amazon.com;sjtu.edu.cn;amazon.com;fudan.edu.cn;amazon.com", "position": ";PhD student;Associate Professor;Undergrad student;PhD student;Researcher;Full Professor;Senior Principal Scientist;Associate Professor;Principal Research Scientist", "bibtex": "@inproceedings{\nwang2022why,\ntitle={Why Propagate Alone? 
Parallel Use of Labels and Features on Graphs},\nauthor={Yangkun Wang and Jiarui Jin and Weinan Zhang and Yang Yongyi and Jiuhai Chen and Quan Gan and Yong Yu and Zheng Zhang and Zengfeng Huang and David Wipf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VTNjxbFRKly}\n}", "github": "", "project": "", "reviewers": "n4P3;bkGb;xaW4;Ugm6;RzBD", "pdf_size": 0, "recommendation": "5;5;5;6;8", "confidence": "4;4;3;4;2", "correctness": "4;3;3;3;4", "technical_novelty": "3;3;2;3;4", "empirical_novelty": "2;3;2;1;3", "wc_summary_paper": "247;84;68;116;74", "wc_summary_review": "55;18;34;36;22", "wc_main_review": "773;527;319;235;100", "wc_review": "1075;629;421;387;196", "wc_reply_reviewers": "590;471;118;0;0", "wc_reply_authors": "3551;2932;1884;1389;56", "reply_reviewers": "3;2;2;0;0", "reply_authors": "9;6;4;2;1", "recommendation_avg": [ 5.8, 1.16619037896906 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 117.8, 66.68852974837576 ], "wc_summary_review_avg": [ 33.0, 12.96148139681572 ], "wc_main_review_avg": [ 390.8, 236.059653477675 ], "wc_review_avg": [ 541.6, 300.0130663821161 ], "wc_reply_reviewers_avg": [ 235.8, 247.32844559411276 ], "wc_reply_authors_avg": [ 1962.4, 1219.2677474615655 ], "reply_reviewers_avg": [ 1.4, 1.2000000000000002 ], "reply_authors_avg": [ 4.4, 2.870540018881465 ], "replies_avg": [ 35, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.7717436331412899, "corr_recommendation_correctness": 0.4900980294098034, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4667146621050849805&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=VTNjxbFRKly", "email": ";sjtu.edu.cn;sjtu.edu.cn;fudan.edu.cn;umd.edu;amazon.com;sjtu.edu.cn;amazon.com;fudan.edu.cn;amazon.com", "author_num": 10, "aff_unique_index": "0;0;1;2;3;0;3;1;3", "aff_unique_norm": "Shanghai Jiao Tong University;Fudan University;University of Maryland;Amazon", "aff_unique_dep": ";;;Amazon.com, Inc.", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.fudan.edu.cn;https://www.umd.edu;https://www.amazon.com", "aff_unique_abbr": "SJTU;Fudan;UMD;Amazon", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;1;1;0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "VUcI0pKic8l", "title": "Attacking Perceptual Similarity Metrics", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Perceptual similarity metrics have progressively become more correlated with human judgments on perceptual similarity; however, despite recent advances, the addition of an imperceptible distortion can still compromise these metrics. To the best of our knowledge, no study to date has systematically examined the robustness of these metrics to imperceptible adversarial perturbations. Following the two-alternative forced choice experimental design with two distorted images, and one reference image, we perturb the distorted image closer to the reference via an adversarial attack until the metric flips its judgment. We first show that all metrics are susceptible to perturbations generated via common adversarial attacks such as FGSM, PGD, and the One-pixel attack.
Next, we attack the widely adopted LPIPS metric using FlowAdv, our flow-based spatial attack, in a white-box setting to craft adversarial examples that can effectively transfer to other similarity metrics in a black-box setting. In addition, we combine the spatial attack FlowAdv with PGD ($l_\\infty$-bounded) attack, to increase transferability and use these adversarial examples to benchmark the robustness of both traditional and recently developed metrics. Our benchmark provides a good starting point for discussion and further research on the robustness of metrics to imperceptible adversarial perturbations.", "keywords": "perceptual similarity metrics;computer vision;adversarial robustness;image quality assessment;transferable adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Abhijay Ghildyal;Feng Liu", "authorids": "~Abhijay_Ghildyal1;~Feng_Liu6", "gender": "M;", "homepage": "https://abhijay9.github.io/;", "dblp": "325/4739;77/1318-15", "google_scholar": "8Sdd57YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-1940-9626;", "linkedin": "abhijay-ghildyal/;", "or_profile": "~Abhijay_Ghildyal1;~Feng_Liu6", "aff": "Portland State University;Portland State University", "aff_domain": "pdx.edu;pdx.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nghildyal2022attacking,\ntitle={Attacking Perceptual Similarity Metrics},\nauthor={Abhijay Ghildyal and Feng Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=VUcI0pKic8l}\n}", "github": "", "project": "", "reviewers": "D3dY;f8Rz;3Huq;g7UX", "site": "https://openreview.net/forum?id=VUcI0pKic8l", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;3;5;4", "correctness": "4;4;2;4", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "60;46;95;45", "wc_summary_review": "65;124;65;13", "wc_main_review": "342;246;712;229", "wc_review": "467;416;872;287", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.5, 20.22992832414391 ], "wc_summary_review_avg": [ 66.75, 39.283425257988895 ], "wc_main_review_avg": [ 382.25, 195.19525480912696 ], "wc_review_avg": [ 510.5, 218.77899807796908 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.34554737023254406, "corr_recommendation_correctness": -0.07053456158585983, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14209973457867571373&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Portland State University", "aff_unique_dep": "", "aff_unique_url": "https://www.pdx.edu", "aff_unique_abbr": "PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "VXqNHWh3LL", "title": "Shift-tolerant Perceptual Similarity Metric", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing perceptual similarity metrics assume an image and its reference are well aligned. 
As a result, these metrics are often sensitive to a small alignment error that is imperceptible to the human eyes. This paper studies the effect of small misalignment, specifically a small shift between the input and reference image, on existing metrics, and accordingly develops a shift-tolerant similarity metric. This paper builds upon LPIPS, a widely used learned perceptual similarity metric and explores architectural design considerations to make it robust against imperceptible misalignment. Specifically, we study a wide spectrum of neural network elements, such as anti-aliasing filtering, pooling, striding, padding, and skip connection, and discuss their roles in making a robust metric. Based on our studies, we develop a new deep neural network-based perceptual similarity metric. Our experiments show that our metric is tolerant to imperceptible shifts while being consistent with the human similarity judgment.", "keywords": "Computer Vision;Perceptual Similarity Metric;Image Quality Assessment;Robustness;Convolutional Neural Networks;Anti-aliasing", "primary_area": "", "supplementary_material": "", "author": "Abhijay Ghildyal;Feng Liu", "authorids": "~Abhijay_Ghildyal1;~Feng_Liu6", "gender": "M;", "homepage": "https://abhijay9.github.io/;", "dblp": "325/4739;77/1318-15", "google_scholar": "8Sdd57YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-1940-9626;", "linkedin": "abhijay-ghildyal/;", "or_profile": "~Abhijay_Ghildyal1;~Feng_Liu6", "aff": "Portland State University;Portland State University", "aff_domain": "pdx.edu;pdx.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nghildyal2022shifttolerant,\ntitle={Shift-tolerant Perceptual Similarity Metric},\nauthor={Abhijay Ghildyal and Feng Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=VXqNHWh3LL}\n}", "github": "", "project": "", "reviewers": "2NrG;GQvy;LHuY", "site": "https://openreview.net/forum?id=VXqNHWh3LL", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "5;5;4", "correctness": "2;3;4", "technical_novelty": "2;1;2", "empirical_novelty": "2;3;4", "wc_summary_paper": "10;162;107", "wc_summary_review": "6;39;19", "wc_main_review": "370;447;142", "wc_review": "386;648;268", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "946;1322;561", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 93.0, 62.838417124134075 ], "wc_summary_review_avg": [ 21.333333333333332, 13.572848714334887 ], "wc_main_review_avg": [ 319.6666666666667, 129.5024667289735 ], "wc_review_avg": [ 434.0, 158.80386225361985 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 943.0, 310.684191208157 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17919204630676311579&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0", "aff_unique_norm": "Portland State University", "aff_unique_dep": "", "aff_unique_url": "https://www.pdx.edu", "aff_unique_abbr": "PSU", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "VZAgsLaP3or", "title": "Practical No-box Adversarial Attacks with Training-free Hybrid Image Transformation", "track": "main", "status": "Reject", "tldr": "", "abstract": "\t In recent years, the adversarial vulnerability of deep neural networks (DNNs) has raised increasing attention. \n\t\tAmong all the threat models, no-box attacks are the most practical but extremely challenging since they neither rely on any knowledge of the target model or similar substitute model, nor access the dataset for training a new substitute model. Although a recent method has attempted such an attack in a loose sense, its performance is not good enough and the computational overhead of training is expensive.\n\t\tIn this paper, we move a step forward and show the existence of a \\textbf{training-free} adversarial perturbation under the no-box threat model, which can be successfully used to attack different DNNs in real-time.\n\t\tMotivated by our observation that high-frequency component (HFC) domains in low-level features and plays a crucial role in classification, we attack an image mainly by manipulating its frequency components. Specifically, the perturbation is combined by the suppression of the original HFC and the adding of noisy HFC.\n\t\tWe empirically and experimentally analyze the requirements of effective noisy HFC and show that it should be regionally homogeneous, repeating and dense.\n\t\tExtensive experiments on the ImageNet dataset demonstrate the effectiveness of our proposed no-box method. It attacks ten well-known models with a success rate of \\textbf{98.13\\%} on average, which outperforms state-of-the-art no-box attacks by \\textbf{29.39\\%}. Furthermore, our method is even competitive to mainstream transfer-based black-box attacks. Our code is available in our appendix. 
", "keywords": "no-box attack;training-free;hybrid image transformation", "primary_area": "", "supplementary_material": "/attachment/a822f1d203d487f85e65e65ef7550e7e97aec0a7.zip", "author": "Qilong Zhang;Chaoning Zhang;Jingkuan Song;Lianli Gao", "authorids": "~Qilong_Zhang2;~Chaoning_Zhang1;~Jingkuan_Song3;~Lianli_Gao1", "gender": "M;M;M;F", "homepage": ";;https://cfm.uestc.edu.cn/~songjingkuan/;https://lianligao.github.io/", "dblp": "22/3730;;70/10575;123/9849.html", "google_scholar": "IgPyQWYAAAAJ;https://scholar.google.co.kr/citations?user=lvhxhyQAAAAJ;F5Zy9V4AAAAJ;https://scholar.google.com.au/citations?user=zsm2dpYAAAAJ", "orcid": "0009-0005-2591-5762;;;", "linkedin": ";;;", "or_profile": "~Qilong_Zhang2;~Chaoning_Zhang1;~Jingkuan_Song3;~Lianli_Gao1", "aff": "University of Electronic Science and Technology of China;Korea Advanced Institute of Science & Technology;University of Electronic Science and Technology of China,;University of Electronic Science and Technology of China", "aff_domain": "uestc.edu;kaist.ac.kr;uestc.edu.cn;uestc.edu.cn", "position": "MS student;Postdoc;Full Professor;Full Professor", "bibtex": "@misc{\nzhang2022practical,\ntitle={Practical No-box Adversarial Attacks with Training-free Hybrid Image Transformation},\nauthor={Qilong Zhang and Chaoning Zhang and Jingkuan Song and Lianli Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=VZAgsLaP3or}\n}", "github": "", "project": "", "reviewers": "cY6Z;h2W3;usLh;jdwq", "site": "https://openreview.net/forum?id=VZAgsLaP3or", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "5;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "41;60;56;61", "wc_summary_review": "30;27;115;62", "wc_main_review": "358;108;254;521", "wc_review": "429;195;425;644", "wc_reply_reviewers": "0;0;38;102", "wc_reply_authors": "1619;534;834;1045", "reply_reviewers": "0;0;1;1", "reply_authors": "3;2;3;3", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 54.5, 8.0156097709407 ], "wc_summary_review_avg": [ 58.5, 35.387144558440994 ], "wc_main_review_avg": [ 310.25, 150.63594358585203 ], "wc_review_avg": [ 423.25, 158.79605631123212 ], "wc_reply_reviewers_avg": [ 35.0, 41.677331968349414 ], "wc_reply_authors_avg": [ 1008.0, 396.7499212350268 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8164965809277261, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2172384165022561961&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Electronic Science and Technology of China;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.uestc.edu.cn;https://www.kaist.ac.kr", "aff_unique_abbr": "UESTC;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;South Korea" }, { "id": "VZC5Lzyl0le", "title": "Automated Mobile Attention KPConv Networks via A Wide & Deep Predictor", "track": "main", "status": "Reject", "tldr": "", "abstract": "Kernel Point Convolution 
(KPConv) achieves cutting-edge performance on 3D point cloud applications. Unfortunately, the large size of KPConv network limits its usage in mobile scenarios. In addition, we observe that KPConv ignores the kernel relationship and treats each kernel point equally when formulating neighbor-kernel correlation via Euclidean distance. This leads to a weak representation power. To mitigate the above issues, we propose a module named Mobile Attention Kernel Point Convolution (MAKPConv) to improve the efficiency and quality of KPConv. MAKPConv employs a depthwise kernel to reduce resource consumption and re-calibrates the contribution of kernel points towards each neighbor point via Neighbor-Kernel attention to improve representation power. Furthermore, we capitalize Inverted Residual Bottleneck (IRB) to craft a design space and employ a predictor-based Neural Architecture Search (NAS) approach to automate the design of efficient 3D networks based on MAKPConv. To fully exploit the immense design space via an accurate predictor, we identify the importance of carrying feature engineering on searchable features to improve neural architecture representations and propose a Wide & Deep Predictor to unify dense and sparse neural architecture representations for lower error in performance prediction. Experimental evaluations show that our NAS-crafted MAKPConv network uses 96% fewer parameters on 3D point cloud classification and segmentation benchmarks with better performance. Compared with state-of-the-art NAS-crafted model SPVNAS, our NAS-crafted MAKPConv network achieves ~1% better mIOU with 83% fewer parameters and 52% fewer Multiply-Accumulates.", "keywords": "3D Point Cloud Classification and segmentation;Neural Architecture Search", "primary_area": "", "supplementary_material": "", "author": "Tunhou Zhang;Mingyuan Ma;Feng Yan;Hai Li;Yiran Chen", "authorids": "~Tunhou_Zhang1;~Mingyuan_Ma1;~Feng_Yan2;~Hai_Li1;~Yiran_Chen1", "gender": "M;M;;F;M", "homepage": ";https://cei.pratt.duke.edu;http://www.cs.uh.edu/~fyan/;https://ece.duke.edu/faculty/hai-helen-li;https://ece.duke.edu/people/yiran-chen/", "dblp": ";;62/3960-1.html;30/5330-1;80/1641", "google_scholar": "https://scholar.google.com/citations?hl=en;;iLE0_VAAAAAJ;E6Tpfq8AAAAJ;", "orcid": ";;;0000-0003-3228-6544;0000-0002-1486-8412", "linkedin": ";;;;", "or_profile": "~Tunhou_Zhang1;~Mingyuan_Ma1;~Feng_Yan2;~Hai_Li1;~Yiran_Chen1", "aff": "Duke University;Duke University;University of Nevada, Reno;Duke University;Duke University", "aff_domain": "duke.edu;duke.edu;unr.edu;duke.edu;duke.edu", "position": "PhD student;PhD student;Assistant Professor;Professor;Professor", "bibtex": "@misc{\nzhang2022automated,\ntitle={Automated Mobile Attention {KPC}onv Networks via A Wide \\& Deep Predictor},\nauthor={Tunhou Zhang and Mingyuan Ma and Feng Yan and Hai Li and Yiran Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=VZC5Lzyl0le}\n}", "github": "", "project": "", "reviewers": "Jg8J;8cJa;xh8F;epb3", "site": "https://openreview.net/forum?id=VZC5Lzyl0le", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;2;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "50;67;89;134", "wc_summary_review": "59;49;27;88", "wc_main_review": "234;354;405;289", "wc_review": "343;470;521;511", "wc_reply_reviewers": "697;0;108;90", "wc_reply_authors": "1683;442;1166;393", "reply_reviewers": "4;0;1;1", "reply_authors": "5;1;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], 
"confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.0, 31.488092987667578 ], "wc_summary_review_avg": [ 55.75, 21.924586655168667 ], "wc_main_review_avg": [ 320.5, 64.6857789626128 ], "wc_review_avg": [ 461.25, 70.89560987818639 ], "wc_reply_reviewers_avg": [ 223.75, 276.27737420932607 ], "wc_reply_authors_avg": [ 921.0, 535.9323651357511 ], "reply_reviewers_avg": [ 1.5, 1.5 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5477225575051661, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W_o34ge7WLUJ:scholar.google.com/&scioq=Automated+Mobile+Attention+KPConv+Networks+via+A+Wide+%26+Deep+Predictor&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Duke University;University of Nevada", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://www.unr.edu", "aff_unique_abbr": "Duke;UNR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Reno", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Vc5wUmpwR7x", "title": "Minimizing Memorization in Meta-learning: A Causal Perspective", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Meta-learning has emerged as a potent paradigm for quick learning of few-shot tasks, by leveraging the meta-knowledge learned from meta-training tasks. Well-generalized meta-knowledge that facilitates fast adaptation in each task is preferred; however, recent evidence suggests the undesirable memorization effect where the meta-knowledge simply memorizing all meta-training tasks discourages task-specific adaptation and poorly generalizes. There have been several solutions to mitigating the effect, including both regularizer-based and augmentation-based methods, while a systematic understanding of these methods in a single framework is still lacking. In this paper, we offer a novel causal perspective of meta-learning. Through the lens of causality, we conclude the universal label space as a confounder to be the causing factor of memorization and frame the two lines of prevailing methods as different deconfounder approaches. Remarkably, derived from the causal inference principle of front-door adjustment, we propose two frustratingly easy but effective deconfounder algorithms, i.e., sampling multiple versions of the meta-knowledge via Dropout and grouping the meta-knowledge into multiple bins. 
The proposed causal perspective not only brings in the two deconfounder algorithms that surpass previous works in four benchmark datasets towards combating memorization, but also opens a promising direction for meta-learning.", "keywords": "meta-learning;causality;intervention;memorization;overfitting", "primary_area": "", "supplementary_material": "", "author": "Yinjie Jiang;Zhengyu Chen;Luotian Yuan;Ying Wei;Kun Kuang;Xinhai Ye;Zhihua Wang;Fei Wu", "authorids": "~Yinjie_Jiang1;~Zhengyu_Chen3;~Luotian_Yuan1;~Ying_Wei1;~Kun_Kuang1;~Xinhai_Ye1;~Zhihua_Wang4;~Fei_Wu1", "gender": "M;;M;F;M;M;M;M", "homepage": ";;https://github.com/yuanluotian;https://wei-ying.net/;http://kunkuang.github.io;https://person.zju.edu.cn/0621591?fulltext=%E5%8F%B6%E6%98%95%E6%B5%B7;;https://person.zju.edu.cn/wufei", "dblp": "314/2114;;323/8896;14/4899-1;194/4245;;;84/3254-1", "google_scholar": ";;;5UpFdKsAAAAJ;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ;;oDdO4JIAAAAJ;XJLn4MYAAAAJ", "orcid": "0000-0003-3058-269X;;;;0009-0000-7528-8131;;0000-0002-1593-1321;", "linkedin": ";;;;;;;", "or_profile": "~Yinjie_Jiang1;~Zhengyu_Chen3;~Luotian_Yuan1;~Ying_Wei1;~Kun_Kuang1;~Xinhai_Ye1;~Zhihua_Wang4;~Fei_Wu1", "aff": "Zhejiang University;;Zhejiang University;City University of Hong Kong;Zhejiang University;Zhejiang University;Shanghai Institute for Advanced Study of Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;;zju.edu.cn;cityu.edu.hk;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "PhD student;;Undergrad student;Assistant Professor;Associate Professor;Postdoc;Researcher;Full Professor", "bibtex": "@misc{\njiang2022minimizing,\ntitle={Minimizing Memorization in Meta-learning: A Causal Perspective},\nauthor={Yinjie Jiang and Zhengyu Chen and Luotian Yuan and Ying Wei and Kun Kuang and Xinhai Ye and Zhihua Wang and Fei Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=Vc5wUmpwR7x}\n}", "github": "", "project": "", "reviewers": "7TwH;6sbp;5gbq;STG8", "site": "https://openreview.net/forum?id=Vc5wUmpwR7x", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "152;88;50;63", "wc_summary_review": "50;34;45;43", "wc_main_review": "512;113;327;92", "wc_review": "714;235;422;198", "wc_reply_reviewers": "0;0;99;0", "wc_reply_authors": "1218;313;1271;198", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;4;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 88.25, 39.25796097608738 ], "wc_summary_review_avg": [ 43.0, 5.787918451395113 ], "wc_main_review_avg": [ 261.0, 171.62604697422825 ], "wc_review_avg": [ 392.25, 204.2478580059042 ], "wc_reply_reviewers_avg": [ 24.75, 42.868257487329714 ], "wc_reply_authors_avg": [ 750.0, 496.52240634235227 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Aqrpg9gGDoQJ:scholar.google.com/&scioq=Minimizing+Memorization+in+Meta-learning:+A+Causal+Perspective&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;0;0", "aff_unique_norm": 
"Zhejiang University;City University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.cityu.edu.hk", "aff_unique_abbr": "ZJU;CityU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Shanghai", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "VdYTmPf6BZ-", "title": "Adversarial Robustness via Adaptive Label Smoothing", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Adversarial training (AT) has become a dominant defense paradigm by enforcing the model's predictions to be locally invariant to adversarial examples. Being a simple technique, Label smoothing (LS) has shown its potential for improving model robustness. However, the prior study shows the benefit of directly combining two techniques together is limited. In this paper, we aim to better understand the behavior of LS and explore new algorithms for more effective LS on improving adversarial robustness. We first show both theoretically and empirically that strong smoothing in AT increases local smoothness of the loss surface which is beneficial for robustness but sacrifices the training loss which influences the accuracy of samples near the decision boundary. Based on this result, we propose \\textit{surface smoothing adversarial training} (SSAT). Specifically, much stronger smoothness is used on the perturbed examples farther away from the decision boundary to achieve better robustness, while weaker smoothness is on those closer to the decision boundary to avoid incorrect classification on clean samples. Meanwhile, LS builds a different representation space among data classes in which SSAT differs from other AT methods. We study such a distinction and further propose a cooperative defense strategy termed by Co-SSAT. 
Experimental results show that our Co-SSAT achieves the state-of-the-art performances on CIFAR-10 with $\\ell_{\\infty}$ adversaries and also has a good generalization ability of unseen attacks, i.e., other $\\ell_p$ norms, or larger perturbations due to the smoothness property of the loss surface.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qibing Ren;Liangliang Shi;Lanjun Wang;Junchi Yan", "authorids": "~Qibing_Ren1;~Liangliang_Shi1;wang.lanjun@outlook.com;~Junchi_Yan2", "gender": ";M;;", "homepage": ";;;", "dblp": ";89/8730;;", "google_scholar": ";Qf1k8lUAAAAJ;;", "orcid": ";0000-0001-7033-4207;;", "linkedin": ";;;", "or_profile": "~Qibing_Ren1;~Liangliang_Shi1;wang.lanjun@outlook.com;~Junchi_Yan2", "aff": ";Shanghai Jiaotong University;;", "aff_domain": ";sjtu.edu.cn;;", "position": ";PhD student;;", "bibtex": "@misc{\nren2022adversarial,\ntitle={Adversarial Robustness via Adaptive Label Smoothing},\nauthor={Qibing Ren and Liangliang Shi and Lanjun Wang and Junchi Yan},\nyear={2022},\nurl={https://openreview.net/forum?id=VdYTmPf6BZ-}\n}", "github": "", "project": "", "reviewers": "NTaY;xv5Q;cKvC;kk6P;uRGP", "site": "https://openreview.net/forum?id=VdYTmPf6BZ-", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "4;4;4;4;4", "correctness": "1;2;3;2;4", "technical_novelty": "1;3;2;2;2", "empirical_novelty": "1;0;2;1;3", "wc_summary_paper": "97;34;104;72;79", "wc_summary_review": "38;26;38;24;43", "wc_main_review": "451;240;320;480;142", "wc_review": "586;300;462;576;264", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.4, 1.019803902718557 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 1.4, 1.019803902718557 ], "wc_summary_paper_avg": [ 77.2, 24.522642598219303 ], "wc_summary_review_avg": [ 33.8, 7.4404300950953095 ], "wc_main_review_avg": [ 326.6, 126.98598347849263 ], "wc_review_avg": [ 437.6, 134.79109762888646 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7844645405527362, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xs73WGupLBMJ:scholar.google.com/&scioq=Adversarial+Robustness+via+Adaptive+Label+Smoothing&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "DEGREE: Decomposition Based Explanation for Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6059", "id": "Ve0Wth3ptT_", "poster": "", "openreview": "https://openreview.net/forum?id=Ve0Wth3ptT_", "slides": "https://iclr.cc/virtual/2022/poster/6059", "video": "https://iclr.cc/virtual/2022/poster/6059", "author_site": "Qizhang Feng, Ninghao Liu, Fan Yang, Ruixiang Tang, Mengnan Du, Xia Hu", "tldr": "", "abstract": "Graph Neural Networks (GNNs) are gaining extensive attention for their application in graph data. However, the black-box nature of GNNs prevents users from understanding and trusting the models, thus hampering their applicability. 
Whereas explaining GNNs remains a challenge, most existing methods fall into approximation-based and perturbation-based approaches, which suffer from faithfulness problems and unnatural artifacts, respectively. To tackle these problems, we propose DEGREE (Decomposition based Explanation for GRaph nEural nEtworks) to provide a faithful explanation for GNN predictions. By decomposing the information generation and aggregation mechanism of GNNs, DEGREE allows tracking the contributions of specific components of the input graph to the final prediction. Based on this, we further design a subgraph level interpretation algorithm to reveal complex interactions between graph nodes that are overlooked by previous methods. The efficiency of our algorithm can be further improved by utilizing GNN characteristics. Finally, we conduct quantitative and qualitative experiments on synthetic and real-world datasets to demonstrate the effectiveness of DEGREE on node classification and graph classification tasks.", "keywords": "XAI;GNN", "primary_area": "", "supplementary_material": "", "author": "Qizhang Feng;Ninghao Liu;Fan Yang;Ruixiang Tang;Mengnan Du;Xia Hu", "authorids": "~Qizhang_Feng1;~Ninghao_Liu2;~Fan_Yang27;~Ruixiang_Tang1;~Mengnan_Du1;~Xia_Hu4", "gender": "M;M;M;;M;M", "homepage": ";https://yangfan.sites.wfu.edu/;https://www.ruixiangtang.net/;https://mengnandu.com/;https://cobweb.cs.uga.edu/~ninghaoliu/;https://cs.rice.edu/~xh37/index.html", "dblp": "323/5667.html;;239/1928;183/5606;145/4489;256/9406.html", "google_scholar": ";RXFeW-8AAAAJ;T575jsoAAAAJ;0i-Js2gAAAAJ;Nir-EDYAAAAJ;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": "0000-0002-2574-0270;0000-0003-3442-754X;;;0000-0002-9170-2424;", "linkedin": "qizhang-feng-355478197/;;ruixiang-tang-91660717b/;;;", "or_profile": "~Qizhang_Feng1;~Fan_Yang27;~Ruixiang_Tang1;~Mengnan_Du1;~Ninghao_Liu1;~Xia_Hu2", "aff": "Texas A&M;Rice University;Rice University;Texas A&M University;University of Georgia;Rice University", "aff_domain": "tamu.edu;rice.edu;rice.edu;tamu.edu;uga.edu;rice.edu", "position": "PhD student;PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nfeng2022degree,\ntitle={{DEGREE}: Decomposition Based Explanation for Graph Neural Networks},\nauthor={Qizhang Feng and Ninghao Liu and Fan Yang and Ruixiang Tang and Mengnan Du and Xia Hu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Ve0Wth3ptT_}\n}", "github": "", "project": "", "reviewers": "XaeP;81sK;gPGV;eChJ", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "92;72;51;74", "wc_summary_review": "61;153;46;37", "wc_main_review": "451;403;176;129", "wc_review": "604;628;273;240", "wc_reply_reviewers": "21;0;21;0", "wc_reply_authors": "1307;792;718;352", "reply_reviewers": "1;0;1;0", "reply_authors": "3;2;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.25, 14.53229162933362 ], "wc_summary_review_avg": [ 74.25, 46.267564232408 ], "wc_main_review_avg": [ 289.75, 139.28994041207713 ], "wc_review_avg": [ 436.25, 180.3279997670911 ], "wc_reply_reviewers_avg": [ 10.5, 10.5 ], "wc_reply_authors_avg": [
792.25, 340.6980884889142 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17130362446685501668&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Ve0Wth3ptT_", "email": "tamu.edu;rice.edu;rice.edu;tamu.edu;uga.edu;rice.edu", "author_num": 6, "aff_unique_index": "0;1;1;0;2;1", "aff_unique_norm": "Texas A&M University;Rice University;University of Georgia", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tamu.edu;https://www.rice.edu;https://www.uga.edu", "aff_unique_abbr": "TAMU;Rice;UGA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "VgxHf-qUZ3D", "title": "Self-evolutionary optimization for Pareto front learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multi-task learning (MTL), which aims to improve performance by learning multiple tasks simultaneously, inherently presents an optimization challenge due to multiple objectives. Hence, multi-objective optimization (MOO) approaches have been proposed for multitasking problems. Recent MOO methods approximate multiple optimal solutions (Pareto front) with a single unified model, which is collectively referred to as Pareto front learning (PFL). In this paper, we show that PFL can be re-formulated into another MOO problem with multiple objectives, each of which corresponds to different preference weights for the tasks. We leverage an evolutionary algorithm (EA) to propose a method for PFL called self-evolutionary optimization (SEO) by directly maximizing the hypervolume. By using SEO, the neural network learns to approximate the Pareto front conditioned on multiple hyper-parameters that drastically affect the hypervolume. Then, by generating a population of approximations simply by inferencing the network, the hyper-parameters of the network can be optimized by EA. Utilizing SEO for PFL, we also introduce self-evolutionary Pareto networks (SEPNet), enabling the unified model to approximate the entire Pareto front set that maximizes the hypervolume. 
Extensive experimental results confirm that SEPNet can find a better Pareto front than the current state-of-the-art methods while minimizing the increase in model size and training cost.", "keywords": "Pareto optimal;Multi-objective optimization;Multi-task learning;Evolutionary strategy", "primary_area": "", "supplementary_material": "", "author": "Simyung Chang;KiYoon Yoo;Jiho Jang;Nojun Kwak", "authorids": "~Simyung_Chang1;~KiYoon_Yoo2;geographic@snu.ac.kr;~Nojun_Kwak1", "gender": "M;;;M", "homepage": ";;;http://mipal.snu.ac.kr", "dblp": "206/6540;;;49/2806", "google_scholar": "https://scholar.google.co.kr/citations?user=0-tF1dwAAAAJ;;;h_8-1M0AAAAJ", "orcid": ";;;0000-0002-1792-0327", "linkedin": ";;;", "or_profile": "~Simyung_Chang1;~KiYoon_Yoo2;geographic@snu.ac.kr;~Nojun_Kwak1", "aff": "Seoul National University;;;Seoul National University", "aff_domain": "snu.ac.kr;;;snu.ac.kr", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\nchang2022selfevolutionary,\ntitle={Self-evolutionary optimization for Pareto front learning},\nauthor={Simyung Chang and KiYoon Yoo and Jiho Jang and Nojun Kwak},\nyear={2022},\nurl={https://openreview.net/forum?id=VgxHf-qUZ3D}\n}", "github": "", "project": "", "reviewers": "qFpD;6oDC;TEvS;rxMa", "site": "https://openreview.net/forum?id=VgxHf-qUZ3D", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;2", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "46;33;90;80", "wc_summary_review": "37;29;15;31", "wc_main_review": "410;293;265;123", "wc_review": "493;355;370;234", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 62.25, 23.47738273317535 ], "wc_summary_review_avg": [ 28.0, 8.06225774829855 ], "wc_main_review_avg": [ 272.75, 102.14297577415688 ], "wc_review_avg": [ 363.0, 91.72513286989559 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9806782180917819121&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "What Do We Mean by Generalization in Federated Learning?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6652", "id": "VimqQq-i_Q", "poster": "", "openreview": "https://openreview.net/forum?id=VimqQq-i_Q", "slides": "https://iclr.cc/virtual/2022/poster/6652", "video": "https://iclr.cc/virtual/2022/poster/6652", "author_site": "Honglin Yuan, Warren Morningstar, Lin Ning, Karan Singhal", "tldr": "", "abstract": "Federated learning data is drawn from a distribution of distributions: clients are drawn from a meta-distribution, and their data are drawn from local
data distributions. Generalization studies in federated learning should separate performance gaps from unseen client data (out-of-sample gap) from performance gaps from unseen client distributions (participation gap). In this work, we propose a framework for disentangling these performance gaps. Using this framework, we observe and explain differences in behavior across natural and synthetic federated datasets, indicating that dataset synthesis strategy can be important for realistic simulations of generalization in federated learning. We propose a semantic synthesis strategy that enables realistic simulation without naturally partitioned data. Informed by our findings, we call out community suggestions for future federated learning works.", "keywords": "Federated Learning;generalization;heterogeneity", "primary_area": "", "supplementary_material": "/attachment/b9e9c96273103043abf82025fc71cad9a6f7e026.zip", "author": "Honglin Yuan;Warren Richard Morningstar;Lin Ning;Karan Singhal", "authorids": "~Honglin_Yuan1;~Warren_Richard_Morningstar1;~Lin_Ning1;~Karan_Singhal1", "gender": ";M;F;", "homepage": "https://hongliny.github.io;;;https://karansinghal.com", "dblp": "125/3654;260/0779;38/3526-1;", "google_scholar": "6rQZU7MAAAAJ;https://scholar.google.com/citations?view_op=search_authors;FCY4vUEAAAAJ;nMfflL0AAAAJ", "orcid": ";;0000-0001-9458-7946;", "linkedin": "yuanhl/;;;karan1149/", "or_profile": "~Honglin_Yuan1;~Warren_Richard_Morningstar1;~Lin_Ning1;~Karan_Singhal1", "aff": "Stanford University;Google;Google;Google Research", "aff_domain": "stanford.edu;google.com;google.com;google.com", "position": "PhD Candidate;Software Engineer;Software Engineer;Researcher", "bibtex": "@inproceedings{\nyuan2022what,\ntitle={What Do We Mean by Generalization in Federated Learning?},\nauthor={Honglin Yuan and Warren Richard Morningstar and Lin Ning and Karan Singhal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VimqQq-i_Q}\n}", "github": "", "project": "", "reviewers": "dKHn;bfMQ;TBV6;b9Rh", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "71;123;62;317", "wc_summary_review": "40;92;23;63", "wc_main_review": "216;1370;181;491", "wc_review": "327;1585;266;871", "wc_reply_reviewers": "0;157;0;0", "wc_reply_authors": "1115;3608;700;1164", "reply_reviewers": "0;1;0;0", "reply_authors": "2;7;1;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 143.25, 102.9814910554319 ], "wc_summary_review_avg": [ 54.5, 25.889186931999237 ], "wc_main_review_avg": [ 564.5, 480.3012075770787 ], "wc_review_avg": [ 762.25, 530.2006106182829 ], "wc_reply_reviewers_avg": [ 39.25, 67.98299419707844 ], "wc_reply_authors_avg": [ 1646.75, 1146.586537292323 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 2.345207879911715 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7455517891491181404&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=VimqQq-i_Q", "email":
"stanford.edu;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Proving the Lottery Ticket Hypothesis for Convolutional Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5981", "id": "Vjki79-619-", "poster": "", "openreview": "https://openreview.net/forum?id=Vjki79-619-", "slides": "https://iclr.cc/virtual/2022/poster/5981", "video": "https://iclr.cc/virtual/2022/poster/5981", "author_site": "Arthur da Cunha, Emanuele Natale, Laurent Viennot", "tldr": "", "abstract": "The lottery ticket hypothesis states that a randomly-initialized neural network contains a small subnetwork which, when trained in isolation, can compete with the performance of the original network. Recent theoretical works proved an even stronger version: every sufficiently overparameterized (dense) neural network contains a subnetwork that, even without training, achieves accuracy comparable to that of the trained large network. These works left as an open problem to extend the result to convolutional neural networks (CNNs).\nIn this work we provide such generalization by showing that, with high probability, it is possible to approximate any CNN by pruning a random CNN whose size is larger by a logarithmic factor.", "keywords": "lottery ticket hypothesis;convolutional neural network;network pruning;random subset sum;random neural network", "primary_area": "", "supplementary_material": "/attachment/1f32548502745eb0169a1544d59ad80fcae3ad3e.zip", "author": "Arthur da Cunha;Emanuele Natale;Laurent Viennot", "authorids": "~Arthur_da_Cunha1;~Emanuele_Natale1;~Laurent_Viennot1", "gender": ";M;", "homepage": ";https://www-sop.inria.fr/members/Emanuele.Natale/;", "dblp": ";126/5223;v/LaurentViennot", "google_scholar": ";https://scholar.google.it/citations?user=m2P3BH4AAAAJ;", "orcid": ";0000-0002-8755-3892;", "linkedin": ";;", "or_profile": "~Arthur_da_Cunha1;~Emanuele_Natale1;~Laurent_Viennot1", "aff": ";CNRS;INRIA", "aff_domain": ";cnrs.fr;inria.fr", "position": ";Researcher;Researcher", "bibtex": "@inproceedings{\ncunha2022proving,\ntitle={Proving the Lottery Ticket Hypothesis for Convolutional Neural Networks},\nauthor={Arthur da Cunha and Emanuele Natale and Laurent Viennot},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Vjki79-619-}\n}", "github": "", "project": "", "reviewers": "DkZV;ifdf;dycZ;7cVT", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;2;3;4", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;0;2;2", "wc_summary_paper": "51;97;79;63", "wc_summary_review": "40;45;46;60", "wc_main_review": "367;121;221;228", "wc_review": "458;263;346;351", "wc_reply_reviewers": "512;0;150;0", "wc_reply_authors": "1743;400;741;585", "reply_reviewers": "3;0;2;0", "reply_authors": "4;1;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 72.5, 17.284386017443605 ], 
"wc_summary_review_avg": [ 47.75, 7.428828979051813 ], "wc_main_review_avg": [ 234.25, 87.55391196285863 ], "wc_review_avg": [ 354.5, 69.22607890094599 ], "wc_reply_reviewers_avg": [ 165.5, 209.21460274082207 ], "wc_reply_authors_avg": [ 867.25, 519.8232271647738 ], "reply_reviewers_avg": [ 1.25, 1.299038105676658 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.058025885318565944, "corr_recommendation_correctness": 0.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5931056693782794538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=Vjki79-619-", "email": ";cnrs.fr;inria.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Centre National de la Recherche Scientifique;INRIA", "aff_unique_dep": ";", "aff_unique_url": "https://www.cnrs.fr;https://www.inria.fr", "aff_unique_abbr": "CNRS;INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "VjoSeYLAiZN", "title": "A NEW BACKBONE FOR HYPERSPECTRAL IMAGE RECONSTRUCTION", "track": "main", "status": "Reject", "tldr": "", "abstract": "As the inverse process of snapshot compressive imaging, the hyperspectral image (HSI) reconstruction takes the 2D measurement as input and posteriorly retrieves the captured 3D spatial-spectral signal. Built upon several assumptions, numerous sophisticated neural networks have come to the fore in this task. Despite their prosperity under experimental settings, it's still extremely challenging for existing networks to achieve high-fidelity reconstructive quality while maximizing the reconstructive efficiency (computational efficiency and power occupation), which prohibits their further deployment in practical applications. In this paper, we firstly conduct a retrospective analysis on aforementioned assumptions, through which we indicate the imminent aspiration for an authentically practical-oriented network in reconstructive community. By analysing the effectiveness and limitations of the widely-used reconstructive backbone U-Net, we propose a Simple Reconstruction Network, namely SRN, just based on some popular techniques, e.g., scale/spectral-invariant learning and identity connection. It turns out, under current conditions, such a pragmatic solution outperforms existing reconstructive methods by an obvious margin and maximize the reconstructive efficiency concretely. We hope the proposed SRN can further contribute to the cutting-edge reconstructive methods as a promising backbone, and also benefit the realistic tasks, i.e., real-time/high-resolution HSI reconstruction, solely as a baseline. 
\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiamian Wang;Yulun Zhang;Xin Yuan;Yun Fu;ZHIQIANG TAO", "authorids": "~Jiamian_Wang1;~Yulun_Zhang1;~Xin_Yuan4;~Yun_Fu1;~ZHIQIANG_TAO2", "gender": "M;M;M;M;", "homepage": "https://jiamian-wang.github.io/;http://yulunzhang.com/;https://en.westlake.edu.cn/faculty/xin-yuan.html;http://www1.ece.neu.edu/~yunfu/;http://ztao.cc/", "dblp": "291/6309;166/2763-1.html;78/713-2;00/5815-1;135/5229.html", "google_scholar": "MGSkEscAAAAJ;ORmLjWoAAAAJ;cS9CbWkAAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ;sEKglOkAAAAJ", "orcid": ";0000-0002-2288-5079;0000-0002-8311-7524;0000-0002-5098-2853;", "linkedin": "%E5%8A%A0%E5%86%95-%E7%8E%8B-5928b81ba/;yulun-zhang-1116b5b9/;xin-yuan-0024bb31/;furaymond/;", "or_profile": "~Jiamian_Wang1;~Yulun_Zhang1;~Xin_Yuan4;~Yun_Fu1;~ZHIQIANG_TAO2", "aff": "Santa Clara University;Swiss Federal Institute of Technology;Westlake University;Northeastern University;Santa Clara University", "aff_domain": "scu.edu;ethz.ch;westlake.edu.cn;northeastern.edu;scu.edu", "position": "PhD student;Postdoc;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nwang2022a,\ntitle={A {NEW} {BACKBONE} {FOR} {HYPERSPECTRAL} {IMAGE} {RECONSTRUCTION}},\nauthor={Jiamian Wang and Yulun Zhang and Xin Yuan and Yun Fu and ZHIQIANG TAO},\nyear={2022},\nurl={https://openreview.net/forum?id=VjoSeYLAiZN}\n}", "github": "", "project": "", "reviewers": "ms6v;gU7n;NYcV;dXDL", "site": "https://openreview.net/forum?id=VjoSeYLAiZN", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;3", "correctness": "2;2;3;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "3;0;3;3", "wc_summary_paper": "72;31;78;58", "wc_summary_review": "75;17;28;12", "wc_main_review": "546;285;558;249", "wc_review": "693;333;664;319", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 59.75, 18.115946014492316 ], "wc_summary_review_avg": [ 33.0, 24.92990172463582 ], "wc_main_review_avg": [ 409.5, 143.130185495583 ], "wc_review_avg": [ 502.25, 176.61734767570258 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12774669566807278138&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Santa Clara University;Swiss Federal Institute of Technology;Westlake University;Northeastern University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.scu.edu;https://www.ethz.ch;https://www.westlake.edu.cn;https://www.northeastern.edu", "aff_unique_abbr": "SCU;ETH Zurich;WU;NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0", "aff_country_unique": "United States;Switzerland;China" }, { "id": "VnurXbqxr0B", "title": "STRIC: Stacked Residuals of Interpretable Components for Time Series Anomaly Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a 
residual-style architecture for interpretable forecasting and anomaly detection in multivariate time series. \nOur architecture is composed of stacked residual blocks designed to separate components of the signal such as trends, seasonality, and linear dynamics. \nThese are followed by a Temporal Convolutional Network (TCN) that can freely model the remaining components and can aggregate global statistics from different time series as context for the local predictions of each time series. The architecture can be trained end-to-end and automatically adapts to the time scale of the signals. \nAfter modeling the signals, we use an anomaly detection system based on the classic CUMSUM algorithm and a variational approximation of the $f$-divergence to detect both isolated point anomalies and change-points in statistics of the signals. \nOur method outperforms state-of-the-art robust statistical methods on typical time series benchmarks where deep networks usually underperform. To further illustrate the general applicability of our method, we show that it can be successfully employed on complex data such as text embeddings of newspaper articles.", "keywords": "Anomaly Detection;Time-Series forecasting;Residual Temporal Convolutional Networks", "primary_area": "", "supplementary_material": "", "author": "Luca Zancato;Alessandro Achille;Giovanni Paolini;Alessandro Chiuso;Stefano Soatto", "authorids": "~Luca_Zancato1;~Alessandro_Achille1;~Giovanni_Paolini1;~Alessandro_Chiuso2;~Stefano_Soatto3", "gender": "M;M;M;M;", "homepage": ";;http://giovannipaolini.org;http://automatica.dei.unipd.it/people/chiuso.html;https://www.cs.ucla.edu/~soatto", "dblp": "274/1481;190/7328;150/6260;;08/1262", "google_scholar": "Z2Mhh2UAAAAJ;;https://scholar.google.it/citations?user=xGI18C0AAAAJ;;lH1PdF8AAAAJ", "orcid": ";;0000-0002-3964-9101;;0000-0003-2902-6362", "linkedin": ";;g-paolini/;;stefano-soatto-5765aa6/", "or_profile": "~Luca_Zancato1;~Alessandro_Achille1;~Giovanni_Paolini1;~Alessandro_Chiuso2;~Stefano_Soatto2", "aff": "Amazon Web Services;California Institute of Technology;Amazon;Universita' degli studi di Padova;UCLA Computer Science Department, University of California, Los Angeles", "aff_domain": "amazon.it;caltech.edu;amazon.com;unipd.it;cs.ucla.edu", "position": "Applied Scientist;Postdoc;Applied Scientist;Professor;Professor", "bibtex": "@misc{\nzancato2022stric,\ntitle={{STRIC}: Stacked Residuals of Interpretable Components for Time Series Anomaly Detection},\nauthor={Luca Zancato and Alessandro Achille and Giovanni Paolini and Alessandro Chiuso and Stefano Soatto},\nyear={2022},\nurl={https://openreview.net/forum?id=VnurXbqxr0B}\n}", "github": "", "project": "", "reviewers": "rnBY;zX4p;TtBt", "site": "https://openreview.net/forum?id=VnurXbqxr0B", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "5;4;4", "correctness": "3;2;3", "technical_novelty": "3;2;4", "empirical_novelty": "2;2;2", "wc_summary_paper": "58;63;119", "wc_summary_review": "104;24;61", "wc_main_review": "297;257;404", "wc_review": "459;344;584", "wc_reply_reviewers": "46;16;30", "wc_reply_authors": "512;795;769", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 80.0, 27.65260686927485 ], "wc_summary_review_avg": [ 63.0, 
32.69046751985457 ], "wc_main_review_avg": [ 319.3333333333333, 62.055530687352025 ], "wc_review_avg": [ 462.3333333333333, 98.00793618659438 ], "wc_reply_reviewers_avg": [ 30.666666666666668, 12.256517540566824 ], "wc_reply_authors_avg": [ 692.0, 127.72105020969201 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1741969519961563053&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Amazon;California Institute of Technology;University of Padova;University of California, Los Angeles", "aff_unique_dep": "Amazon Web Services;;;Computer Science Department", "aff_unique_url": "https://aws.amazon.com;https://www.caltech.edu;https://www.unipd.it;https://www.ucla.edu", "aff_unique_abbr": "AWS;Caltech;Unipd;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Pasadena;Los Angeles", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Italy" }, { "title": "Discovering Nonlinear PDEs from Scarce Data with Physics-encoded Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6855", "id": "Vog_3GXsgmb", "poster": "", "openreview": "https://openreview.net/forum?id=Vog_3GXsgmb", "slides": "https://iclr.cc/virtual/2022/poster/6855", "video": "https://iclr.cc/virtual/2022/poster/6855", "author_site": "Chengping Rao, Pu Ren, Yang Liu, Hao Sun", "tldr": "", "abstract": "There has been growing interest in leveraging experimental measurements to discover the underlying partial differential equations (PDEs) that govern complex physical phenomena. Although past research attempts have achieved great success in data-driven PDE discovery, the robustness of the existing methods cannot be guaranteed when dealing with low-quality measurement data. To overcome this challenge, we propose a novel physics-encoded discrete learning framework for discovering spatiotemporal PDEs from scarce and noisy data. The general idea is to (1) first introduce a novel deep convolutional-recurrent network, which can encode prior physics knowledge (e.g., known terms, assumed PDE structure, initial/boundary conditions, etc.) while remaining flexible in representation capability, to accurately reconstruct high-fidelity data, and (2) then perform sparse regression with the reconstructed data to identify the analytical form of the governing PDEs. We validate our proposed framework on three high-dimensional PDE systems. 
The effectiveness and superiority of the proposed method over baselines are demonstrated.", "keywords": "Data-driven equation discovery;dynamical system modeling;physics-encoded learning", "primary_area": "", "supplementary_material": "", "author": "Chengping Rao;Pu Ren;Yang Liu;Hao Sun", "authorids": "~Chengping_Rao1;~Pu_Ren1;~Yang_Liu52;~Hao_Sun4", "gender": "M;M;F;", "homepage": ";https://paulpuren.github.io/;;", "dblp": ";;;", "google_scholar": "29DpfrEAAAAJ;FiuAyGwAAAAJ;34upg6YAAAAJ;", "orcid": ";0000-0002-6354-385X;0000-0003-0127-4030;", "linkedin": "chengping-rao-532754161/;;;", "or_profile": "~Chengping_Rao1;~Pu_Ren1;~Yang_Liu52;~Hao_Sun4", "aff": ";Northeastern University;Northeastern University;", "aff_domain": ";northeastern.edu;northeastern.edu;", "position": ";PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nrao2022discovering,\ntitle={Discovering Nonlinear {PDE}s from Scarce Data with Physics-encoded Learning},\nauthor={Chengping Rao and Pu Ren and Yang Liu and Hao Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Vog_3GXsgmb}\n}", "github": "", "project": "", "reviewers": "zdB8;x24u;53sM;t1Ji;yjyY", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "5;5;4;2;3", "correctness": "2;3;3;3;4", "technical_novelty": "2;3;2;3;3", "empirical_novelty": "2;2;2;3;0", "wc_summary_paper": "28;48;54;66;64", "wc_summary_review": "31;64;75;206;27", "wc_main_review": "967;682;832;260;281", "wc_review": "1026;794;961;532;372", "wc_reply_reviewers": "0;1914;304;0;0", "wc_reply_authors": "1532;5977;1701;755;806", "reply_reviewers": "0;8;2;0;0", "reply_authors": "2;14;4;2;2", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 3.8, 1.16619037896906 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.8, 0.9797958971132712 ], "wc_summary_paper_avg": [ 52.0, 13.682105101189656 ], "wc_summary_review_avg": [ 80.6, 65.36849394012378 ], "wc_main_review_avg": [ 604.4, 287.2285501129719 ], "wc_review_avg": [ 737.0, 249.87837041248687 ], "wc_reply_reviewers_avg": [ 443.6, 744.5679552599615 ], "wc_reply_authors_avg": [ 2154.2, 1948.421248087795 ], "reply_reviewers_avg": [ 2.0, 3.0983866769659336 ], "reply_authors_avg": [ 4.8, 4.66476151587624 ], "replies_avg": [ 42, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.910182054618206, "corr_recommendation_correctness": 0.6454972243679027, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15902631994502947274&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Vog_3GXsgmb", "email": ";northeastern.edu;northeastern.edu;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "VppWsjXgBY6", "title": "TLDR: Twin Learning for Dimensionality Reduction", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Dimensionality reduction methods are unsupervised approaches which learn low-dimensional spaces where some properties of the initial space, typically the notion of ``neighborhood'', are preserved. They are a crucial component of diverse tasks like visualization, compression, indexing, and retrieval. 
Aiming for a totally different goal, self-supervised visual representation learning has been shown to produce transferable representation functions by learning models that encode invariance to artificially created distortions, e.g., a set of hand-crafted image transformations. Unlike manifold learning methods that usually require propagation on large k-NN graphs or complicated solvers, self-supervised learning approaches rely on simpler and more scalable frameworks for learning.\nIn this paper, we unify these two families of approaches from the angle of manifold learning and propose TLDR, a dimensionality reduction method for generic input spaces that ports a simple self-supervised learning framework to a setting where it is hard or impossible to define an appropriate set of distortions by hand. We propose to use nearest neighbors to build pairs from a training set and a redundancy reduction loss borrowed from the self-supervised literature to learn an encoder that produces representations invariant across such pairs. TLDR is a method that is simple, easy to implement and train, and of broad applicability; it consists of an offline nearest neighbor computation step that can be highly approximated, and a straightforward learning process that does not require mining negative samples to contrast, eigendecompositions, or cumbersome optimization solvers. Aiming for scalability, the Achilles' heel of manifold learning, we focus on improving linear dimensionality reduction, a technique that is still an integral part of many large-scale systems. By simply replacing PCA with TLDR, we are able to increase the performance of GeM-AP, a state-of-the-art landmark recognition method, by 4% mAP for 128 dimensions, and to retain its performance with 16 times fewer dimensions.", "keywords": "dimensionality reduction;manifold learning;image retrieval;document retrieval;PCA", "primary_area": "", "supplementary_material": "", "author": "Yannis Kalantidis;Carlos Eduardo Rosar Kos Lassance;Jon Almaz\u00e1n;Diane Larlus", "authorids": "~Yannis_Kalantidis2;~Carlos_Eduardo_Rosar_Kos_Lassance1;~Jon_Almaz\u00e1n1;~Diane_Larlus1", "gender": "M;M;F;M", "homepage": "https://www.skamalas.com/;https://cadurosar.github.io/;https://dlarlus.github.io/;", "dblp": "33/8693;;48/4033;https://dblp1.uni-trier.de/pers/hd/a/Almaz=aacute=n:Jon", "google_scholar": "QJZQgN8AAAAJ;https://scholar.google.ca/citations?user=UnUYWp0AAAAJ;https://scholar.google.fr/citations?user=nI2oJqkAAAAJ;0dUAE_EAAAAJ", "orcid": ";;;", "linkedin": ";;;jalmazan/", "or_profile": "~Yannis_Kalantidis2;~Carlos_Eduardo_Rosar_Kos_Lassance1;~Diane_Larlus1;~Jon_Almazan2", "aff": "Naver Labs Europe;Naver Labs Europe;NAVER LABS Europe;Naver Labs Europe", "aff_domain": "naverlabs.com;naverlabs.com;naverlabs.com;naverlabs.com", "position": "Research Scientist;Postdoc;Principal Researcher;Research Scientist", "bibtex": "@misc{\nkalantidis2022tldr,\ntitle={{TLDR}: Twin Learning for Dimensionality Reduction},\nauthor={Yannis Kalantidis and Carlos Eduardo Rosar Kos Lassance and Jon Almaz{\\'a}n and Diane Larlus},\nyear={2022},\nurl={https://openreview.net/forum?id=VppWsjXgBY6}\n}", "github": "", "project": "", "reviewers": "i8W9;TKH8;X3Uy;7YvM", "site": "https://openreview.net/forum?id=VppWsjXgBY6", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;4;3", "correctness": "4;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "40;50;85;62", "wc_summary_review": "43;52;31;24", "wc_main_review": "168;298;194;100", 
"wc_review": "251;400;310;186", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "194;395;107;85", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 59.25, 16.78354849249705 ], "wc_summary_review_avg": [ 37.5, 10.781929326423912 ], "wc_main_review_avg": [ 190.0, 71.17583859709698 ], "wc_review_avg": [ 286.75, 78.73174391565324 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 195.25, 122.3159331403722 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.19245008972987526, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7842262493890317862&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "NAVER LABS", "aff_unique_dep": "", "aff_unique_url": "https://labs.naver.com", "aff_unique_abbr": "NLE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Unknown;France" }, { "id": "Vq_QHT5kcAK", "title": "Greedy Bayesian Posterior Approximation with Deep Ensembles", "track": "main", "status": "Reject", "tldr": "", "abstract": "Ensembles of independently trained neural networks are a state-of-the-art approach to estimate predictive uncertainty in Deep Learning, and can be interpreted as an approximation of the posterior distribution via a mixture of delta functions. The training of ensembles relies on non-convexity of the loss landscape and random initialization of their individual members, making the resulting posterior approximation uncontrolled. This paper proposes a novel and principled method to tackle this limitation, minimizing an $f$-divergence between the true posterior and a kernel density estimator in a function space. We analyze this objective from a combinatorial point of view, and show that it is submodular with respect to mixture components for any $f$. Subsequently, we consider the problem of greedy ensemble construction, and from the marginal gain of the total objective, we derive a novel diversity term for ensemble methods. The performance of our approach is demonstrated on computer vision out-of-distribution detection benchmarks in a range of architectures trained on multiple datasets. The source code of our method is made publicly available.", "keywords": "Bayesian posterior;deep ensembles;submodular optimization", "primary_area": "", "supplementary_material": "", "author": "Aleksei Tiulpin;Matthew B. 
Blaschko", "authorids": "~Aleksei_Tiulpin1;~Matthew_B._Blaschko1", "gender": "M;M", "homepage": "https://aleksei.tiulpin.ai;http://homes.esat.kuleuven.be/~mblaschk/", "dblp": "https://dblp.org/pers/hd/t/Tiulpin:Aleksei;12/5233", "google_scholar": "https://scholar.google.fi/citations?user=EFE2gpQAAAAJ;EmmO7LcAAAAJ", "orcid": "0000-0002-7852-4141;0000-0002-2640-181X", "linkedin": ";matthew-blaschko-5b7a51b0/", "or_profile": "~Aleksei_Tiulpin1;~Matthew_Blaschko1", "aff": "Aalto University;KU Leuven", "aff_domain": "aalto.fi;esat.kuleuven.be", "position": "Postdoc;Associate Professor", "bibtex": "@misc{\ntiulpin2022greedy,\ntitle={Greedy Bayesian Posterior Approximation with Deep Ensembles},\nauthor={Aleksei Tiulpin and Matthew B. Blaschko},\nyear={2022},\nurl={https://openreview.net/forum?id=Vq_QHT5kcAK}\n}", "github": "", "project": "", "reviewers": "dcPg;ZKEY;xECp", "site": "https://openreview.net/forum?id=Vq_QHT5kcAK", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;3;3", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;4", "wc_summary_paper": "70;77;98", "wc_summary_review": "171;28;19", "wc_main_review": "542;202;235", "wc_review": "783;307;352", "wc_reply_reviewers": "0;39;0", "wc_reply_authors": "850;268;422", "reply_reviewers": "0;1;0", "reply_authors": "2;1;1", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 81.66666666666667, 11.897712198383164 ], "wc_summary_review_avg": [ 72.66666666666667, 69.62917651546815 ], "wc_main_review_avg": [ 326.3333333333333, 153.09329036752575 ], "wc_review_avg": [ 480.6666666666667, 214.56985394556764 ], "wc_reply_reviewers_avg": [ 13.0, 18.384776310850235 ], "wc_reply_authors_avg": [ 513.3333333333334, 246.22121941231808 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14995119736695069760&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1", "aff_unique_norm": "Aalto University;Katholieke Universiteit Leuven", "aff_unique_dep": ";", "aff_unique_url": "https://www.aalto.fi;https://www.kuleuven.be", "aff_unique_abbr": "Aalto;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Finland;Belgium" }, { "title": "Optimizer Amalgamation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6427", "id": "VqzXzA9hjaX", "poster": "", "openreview": "https://openreview.net/forum?id=VqzXzA9hjaX", "slides": "https://iclr.cc/virtual/2022/poster/6427", "video": "https://iclr.cc/virtual/2022/poster/6427", "author_site": "Tianshu Huang, Tianlong Chen, Sijia Liu, Shiyu Chang, Lisa Amini, Zhangyang Wang", "tldr": "", "abstract": "Selecting an appropriate optimizer for a given problem is of major interest for researchers and practitioners. Many analytical optimizers have been proposed using a variety of theoretical and empirical approaches; however, none can offer a universal advantage over other competitive optimizers. 
We are thus motivated to study a new problem named Optimizer Amalgamation: how can we best combine a pool of \"teacher\" optimizers into a single \"student\" optimizer that can have stronger problem-specific performance? In this paper, we draw inspiration from the field of \"learning to optimize\" to use a learnable amalgamation target. First, we define three differentiable amalgamation mechanisms to amalgamate a pool of analytical optimizers by gradient descent. Then, in order to reduce variance of the amalgamation process, we also explore methods to stabilize the amalgamation process by perturbing the amalgamation target. Finally, we present experiments showing the superiority of our amalgamated optimizer compared to its amalgamated components and learning to optimize baselines, and the efficacy of our variance reducing perturbations.\n", "keywords": "Learning to Optimize;Knowledge Amalgamation;Stability-Aware Training", "primary_area": "", "supplementary_material": "/attachment/0b380a955ad88e5ca5ed7eb72c61d91ff50d01d7.zip", "author": "Tianshu Huang;Tianlong Chen;Sijia Liu;Shiyu Chang;Lisa Amini;Zhangyang Wang", "authorids": "~Tianshu_Huang1;~Tianlong_Chen1;~Sijia_Liu1;~Shiyu_Chang2;~Lisa_Amini1;~Zhangyang_Wang1", "gender": ";M;M;Unspecified;F;M", "homepage": "https://tianshu.io/;https://tianlong-chen.github.io;https://lsjxjtu.github.io/;http://people.csail.mit.edu/chang87/;;https://vita-group.github.io", "dblp": ";;128/6972-1;28/9988;51/2221;119/4026", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;LE3ctn0AAAAJ;C7dO_UgAAAAJ;r21asW4AAAAJ;wstakvUAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0001-7774-8197;;;;", "linkedin": ";tianlong-chen-783862167/;;;;", "or_profile": "~Tianshu_Huang1;~Tianlong_Chen1;~Sijia_Liu1;~Shiyu_Chang2;~Lisa_Amini1;~Zhangyang_Wang1", "aff": "Bosch;University of Texas, Austin;Michigan State University;University of California, Santa Barbara;;University of Texas, Austin", "aff_domain": "bosch.com;utexas.edu;msu.edu;ucsb.edu;;utexas.edu", "position": "Intern;PhD student;Assistant Professor;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nhuang2022optimizer,\ntitle={Optimizer Amalgamation},\nauthor={Tianshu Huang and Tianlong Chen and Sijia Liu and Shiyu Chang and Lisa Amini and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VqzXzA9hjaX}\n}", "github": "", "project": "", "reviewers": "kuqL;h9GD;ZiUc;Zbiy", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "86;51;64;57", "wc_summary_review": "60;24;53;79", "wc_main_review": "395;93;243;391", "wc_review": "541;168;360;527", "wc_reply_reviewers": "16;0;128;0", "wc_reply_authors": "1021;130;886;516", "reply_reviewers": "1;0;1;0", "reply_authors": "5;1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.5, 13.238202294873727 ], "wc_summary_review_avg": [ 54.0, 19.761072845369505 ], "wc_main_review_avg": [ 280.5, 124.3814696809778 ], "wc_review_avg": [ 399.0, 151.18697033805526 ], "wc_reply_reviewers_avg": [ 36.0, 53.51635264103861 ], "wc_reply_authors_avg": [ 638.25, 346.8215499359865 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 
1.6393596310755 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12945586216189211719&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=VqzXzA9hjaX", "email": "bosch.com;utexas.edu;msu.edu;ucsb.edu;;utexas.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Robert Bosch GmbH;University of Texas at Austin;Michigan State University;University of California, Santa Barbara", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.bosch.com;https://www.utexas.edu;https://www.msu.edu;https://www.ucsb.edu", "aff_unique_abbr": "Bosch;UT Austin;MSU;UCSB", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Austin;Santa Barbara", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Germany;United States" }, { "title": "Hindsight: Posterior-guided training of retrievers for improved open-ended generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6736", "id": "Vr_BTpw3wz", "poster": "", "openreview": "https://openreview.net/forum?id=Vr_BTpw3wz", "slides": "https://iclr.cc/virtual/2022/poster/6736", "video": "https://iclr.cc/virtual/2022/poster/6736", "author_site": "Ashwin Paranjape, Omar Khattab, Christopher Potts, Peter Bailis, Christopher Manning", "tldr": "", "abstract": "Many text generation systems benefit from retrieving passages from a textual knowledge corpus (e.g., Wikipedia) and using them to generate the output. For open-ended generation tasks, like generating informative utterances in conversations, many varied passages $z$ are relevant to the context $x$ but few are relevant to the observed next utterance $y$ (label). For such tasks, existing methods (that jointly train the retriever and generator) underperform: during training the top-k context-relevant retrieved passages might not contain the label-relevant passage and the generator may hence not learn a preference to ground its generated output in them. We propose using an additional guide-retriever that also conditions on the observed label $y$ and \u201cin hindsight\u201d retrieves label-relevant passages during training. We maximize the evidence lower bound (ELBo) to jointly train the guide-retriever $Q(z|x,y)$ with the standard retriever $P_\\eta(z|x)$ and the generator $P_\\theta(y|x,z)$ and find that ELBo has better inductive biases than prior work. For informative conversations from the Wizard of Wikipedia dataset, with our posterior-guided training, the retriever finds passages with higher relevance in the top-10 (23% relative improvement), the generator\u2019s responses are more grounded in the retrieved passage (19% relative improvement) and the end-to-end system produces better overall output (6.4% relative improvement). 
", "keywords": "retrieval;generation;retrieval-augmented generation;open-ended generation;informative conversations;free-form QA;posterior distribution;ELBo", "primary_area": "", "supplementary_material": "", "author": "Ashwin Paranjape;Omar Khattab;Christopher Potts;Matei Zaharia;Christopher D Manning", "authorids": "~Ashwin_Paranjape1;~Omar_Khattab1;~Christopher_Potts1;~Matei_Zaharia1;~Christopher_D_Manning1", "gender": "M;M;M;M;M", "homepage": "http://stanford.edu/~ashwinpp;https://scholar.google.com/citations?hl=en&user=Lwr5ozgAAAAJ;http://web.stanford.edu/~cgpotts/;https://cs.stanford.edu/~matei/;https://nlp.stanford.edu/~manning/", "dblp": "160/8414;129/7815;13/2617;36/2133;m/ChristopherDManning", "google_scholar": "-ru71hwAAAAJ;;3j08YoAAAAAJ;I1EvjZsAAAAJ;1zmDOdwAAAAJ", "orcid": ";;0000-0002-7978-6055;0000-0002-7547-7204;0000-0001-6155-649X", "linkedin": "ashwinparanjape/;;;mateizaharia/;christopher-manning-011575/", "or_profile": "~Ashwin_Paranjape1;~Omar_Khattab1;~Christopher_Potts1;~Matei_Zaharia1;~Christopher_D_Manning1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University;Computer Science Department, Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu;cs.stanford.edu", "position": "PhD student;PhD student;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nparanjape2022hindsight,\ntitle={Hindsight: Posterior-guided training of retrievers for improved open-ended generation},\nauthor={Ashwin Paranjape and Omar Khattab and Christopher Potts and Matei Zaharia and Christopher D Manning},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Vr_BTpw3wz}\n}", "github": "", "project": "", "reviewers": "uUxg;hgu8;eXap;kJHS", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "125;232;118;193", "wc_summary_review": "46;42;92;48", "wc_main_review": "99;249;636;366", "wc_review": "270;523;846;607", "wc_reply_reviewers": "0;12;186;0", "wc_reply_authors": "176;1228;1339;750", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 167.0, 47.60777247467056 ], "wc_summary_review_avg": [ 57.0, 20.322401432901575 ], "wc_main_review_avg": [ 337.5, 196.61447047458128 ], "wc_review_avg": [ 561.5, 205.83063426030637 ], "wc_reply_reviewers_avg": [ 49.5, 78.96043312951113 ], "wc_reply_authors_avg": [ 873.25, 459.37967684694104 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5860346961260362500&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Vr_BTpw3wz", "email": "stanford.edu;stanford.edu;stanford.edu;stanford.edu;cs.stanford.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", 
"aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Entroformer: A Transformer-based Entropy Model for Learned Image Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7022", "id": "VrjOFfcnSV8", "poster": "", "openreview": "https://openreview.net/forum?id=VrjOFfcnSV8", "slides": "https://iclr.cc/virtual/2022/poster/7022", "video": "https://iclr.cc/virtual/2022/poster/7022", "author_site": "Yichen Qian, Xiuyu Sun, Ming Lin, Zhiyu Tan, Rong Jin", "tldr": "", "abstract": "One critical component in lossy deep image compression is the entropy model, which predicts the probability distribution of the quantized latent representation in the encoding and decoding modules. Previous works build entropy models upon convolutional neural networks which are inefficient in capturing global dependencies. In this work, we propose a novel transformer-based entropy model, termed Entroformer, to capture long-range dependencies in probability distribution estimation effectively and efficiently. Different from vision transformers in image classification, the Entroformer is highly optimized for image compression, including a top-k self-attention and a diamond relative position encoding. Meanwhile, we further expand this architecture with a parallel bidirectional context model to speed up the decoding process. The experiments show that the Entroformer achieves state-of-the-art performance on image compression while being time-efficient.", "keywords": "Image compression;Entropy Model;Global Dependencies", "primary_area": "", "supplementary_material": "", "author": "Yichen Qian;Xiuyu Sun;Ming Lin;Zhiyu Tan;Rong Jin", "authorids": "~Yichen_Qian1;~Xiuyu_Sun1;~Ming_Lin4;~Zhiyu_Tan2;~Rong_Jin1", "gender": "M;M;M;M;M", "homepage": ";https://sites.google.com/view/sunxiuyu/home;https://minglin-home.github.io/;https://scholar.google.com/citations?user=XprTQQ8AAAAJ&hl=en;https://www.cse.msu.edu/~rongjin/", "dblp": "183/8088;40/8845;;136/4997;j/RongJin", "google_scholar": "JjTDAOsAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;XprTQQ8AAAAJ;", "orcid": ";0000-0002-7208-8078;;;", "linkedin": ";;;;", "or_profile": "~Yichen_Qian1;~Xiuyu_Sun1;~Ming_Lin4;~Zhiyu_Tan2;~Rong_Jin3", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba DAMO Academy;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "Algorithm Engineer;Staff Algorithm Engineer;Algorithm Engineer;Researcher;Researcher", "bibtex": "@inproceedings{\nqian2022entroformer,\ntitle={Entroformer: A Transformer-based Entropy Model for Learned Image Compression},\nauthor={Yichen Qian and Xiuyu Sun and Ming Lin and Zhiyu Tan and Rong Jin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=VrjOFfcnSV8}\n}", "github": "", "project": "", "reviewers": "JhCm;FHR4;3XoD", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;5;3", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "107;80;40", "wc_summary_review": "38;39;31", "wc_main_review": "367;194;47", "wc_review": "512;313;118", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "107;355;74", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], 
"correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 75.66666666666667, 27.52372713779069 ], "wc_summary_review_avg": [ 36.0, 3.559026084010437 ], "wc_main_review_avg": [ 202.66666666666666, 130.78311137995695 ], "wc_review_avg": [ 314.3333333333333, 160.85258952082668 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 178.66666666666666, 125.41220922311442 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 169, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12131488115519581306&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=VrjOFfcnSV8", "email": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Encoding Weights of Irregular Sparsity for Fixed-to-Fixed Model Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6553", "id": "Vs5NK44aP9P", "poster": "", "openreview": "https://openreview.net/forum?id=Vs5NK44aP9P", "slides": "https://iclr.cc/virtual/2022/poster/6553", "video": "https://iclr.cc/virtual/2022/poster/6553", "author_site": "baeseong park, Se Jung Kwon, Daehwan Oh, Byeonguk Kim, Dongsoo Lee", "tldr": "", "abstract": "Even though fine-grained pruning techniques achieve a high compression ratio, conventional sparsity representations (such as CSR) associated with irregular sparsity degrade parallelism significantly. Practical pruning methods, thus, usually lower pruning rates (by structured pruning) to improve parallelism. In this paper, we study fixed-to-fixed (lossless) encoding architecture/algorithm to support fine-grained pruning methods such that sparse neural networks can be stored in a highly regular structure. We first estimate the maximum compression ratio of encoding-based compression using entropy. Then, as an effort to push the compression ratio to the theoretical maximum (by entropy), we propose a sequential fixed-to-fixed encoding scheme. 
We demonstrate that our proposed compression scheme achieves almost the maximum compression ratio for the Transformer and ResNet-50 pruned by various fine-grained pruning methods.", "keywords": "Sparse Neural Network;Fixed-to-fixed data compression;Unstructured Pruning", "primary_area": "", "supplementary_material": "", "author": "Bae Seong Park;Se Jung Kwon;Daehwan Oh;Byeongwook Kim;Dongsoo Lee", "authorids": "~Bae_Seong_Park1;~Se_Jung_Kwon1;~Daehwan_Oh1;~Byeongwook_Kim1;~Dongsoo_Lee1", "gender": "M;M;;;M", "homepage": "https://baeseong.tistory.com/;;;;", "dblp": "241/6925.html;119/5676;;220/5405;11/9680", "google_scholar": "https://scholar.google.co.kr/citations?user=RMmyMJsAAAAJ;https://scholar.google.co.kr/citations?user=8eTxKOkAAAAJ;;https://scholar.google.co.kr/citations?user=OjfC7gUAAAAJ;ALiieEkAAAAJ", "orcid": ";;;;", "linkedin": "baeseong-park/;se-jung-kwon-305503175/;daehwan-oh-1004b1144/;;", "or_profile": "~Bae_Seong_Park1;~Se_Jung_Kwon1;~Daehwan_Oh1;~Byeongwook_Kim1;~Dongsoo_Lee1", "aff": "NAVER Clova;NAVER Cloud;Samsung Research;NAVER CLOUD;NAVER CLOVA", "aff_domain": "navercorp.com;navercorp.com;samsung.com;navercorp.com;navercorp.com", "position": "Software Engineer;AI Researcher;Researcher;Researcher;Executive Officer", "bibtex": "@inproceedings{\npark2022encoding,\ntitle={Encoding Weights of Irregular Sparsity for Fixed-to-Fixed Model Compression},\nauthor={Bae Seong Park and Se Jung Kwon and Daehwan Oh and Byeongwook Kim and Dongsoo Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Vs5NK44aP9P}\n}", "github": "", "project": "", "reviewers": "ieBc;Qcma;dSLA;NvPC", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;3;3", "correctness": "4;3;3;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "37;35;24;78", "wc_summary_review": "55;169;33;41", "wc_main_review": "581;394;110;192", "wc_review": "673;598;167;311", "wc_reply_reviewers": "646;0;0;0", "wc_reply_authors": "1326;788;308;323", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 43.5, 20.524375751773793 ], "wc_summary_review_avg": [ 74.5, 55.12485827646181 ], "wc_main_review_avg": [ 319.25, 183.08382642931625 ], "wc_review_avg": [ 437.25, 206.39328356320127 ], "wc_reply_reviewers_avg": [ 161.5, 279.7262054223737 ], "wc_reply_authors_avg": [ 686.25, 416.73035346612323 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.2294157338705618, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14143984971869968117&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Vs5NK44aP9P", "email": "navercorp.com;navercorp.com;samsung.com;navercorp.com;navercorp.com", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "NAVER Corporation;Samsung", "aff_unique_dep": "Clova;Samsung Research", "aff_unique_url": "https://www.naver.com;https://research.samsung.com", "aff_unique_abbr": "NAVER;Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "Vt1lpp5Vebd", "title": "Maximum Likelihood Estimation for Multimodal Learning with Missing Modality", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multimodal learning has achieved great successes in many scenarios. Compared with unimodal learning, it can effectively combine the information from different modalities to improve the performance of learning tasks. In reality, the multimodal data may have missing modalities due to various reasons, such as sensor failure and data transmission error. In previous works, the information of the modality-missing data has not been well exploited. To address this problem, we propose an efficient approach based on maximum likelihood estimation to incorporate the knowledge in the modality-missing data. Specifically, we design a likelihood function to characterize the conditional distributions of the modality-complete data and the modality-missing data, which is theoretically optimal. Moreover, we develop a generalized form of the softmax function to effectively implement maximum likelihood estimation in an end-to-end manner. Such training strategy guarantees the computability of our algorithm capably. Finally, we conduct a series of experiments on real-world multimodal datasets. Our results demonstrate the effectiveness of the proposed approach, even when 95% of the training data has missing modality.", "keywords": "multimodal learning;missing modality;maximum likelihood estimation", "primary_area": "", "supplementary_material": "/attachment/25cd05ce53a6de934c9a5ce912ca17de1fce0061.zip", "author": "Fei Ma;Xiangxiang Xu;Shao-Lun Huang;Lin Zhang", "authorids": "~Fei_Ma3;~Xiangxiang_Xu1;~Shao-Lun_Huang3;~Lin_Zhang5", "gender": ";M;M;M", "homepage": ";https://xiangxiangxu.com/;https://sites.google.com/view/slhuang/home;https://www.tbsi.edu.cn/index.php?s=/cms/172.html", "dblp": ";147/5345-1;64/2243;", "google_scholar": ";u-BAw9sAAAAJ;;", "orcid": ";0000-0002-4178-0934;;", "linkedin": ";xiangxiangxu/;;", "or_profile": "~Fei_Ma3;~Xiangxiang_Xu1;~Shao-Lun_Huang3;~Lin_Zhang5", "aff": ";Massachusetts Institute of Technology;Tsinghua University;Tsinghua University", "aff_domain": ";mit.edu;tsinghua.edu.cn;tsinghua.edu.cn", "position": ";Postdoc;Associate Professor;Full Professor", "bibtex": "@misc{\nma2022maximum,\ntitle={Maximum Likelihood Estimation for Multimodal Learning with Missing Modality},\nauthor={Fei Ma and Xiangxiang Xu and Shao-Lun Huang and Lin Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=Vt1lpp5Vebd}\n}", "github": "", "project": "", "reviewers": "APG5;wkEE;cK5q", "site": "https://openreview.net/forum?id=Vt1lpp5Vebd", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "3;3;2", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "53;66;102", "wc_summary_review": "32;88;24", "wc_main_review": "949;208;321", "wc_review": "1034;362;447", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 73.66666666666667, 20.725722075613085 ], "wc_summary_review_avg": [ 48.0, 28.472208672083497 ], "wc_main_review_avg": [ 492.6666666666667, 325.9573932620983 ], 
"wc_review_avg": [ 614.3333333333334, 298.7711870683353 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3782242172807135986&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MIT;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "id": "VuEqOs9Yp7Q", "title": "Temporal Action Localization with Global Segmentation Mask Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Inspired by the promising results of Transformers in object detection in images, it is interesting to formulate Transformer based methods for temporal action localization (TAL) in videos. Nonetheless, this is non-trivial to adapt recent object detection transformers due to two unique challenges with TAL: (1) more complex spatio-temporal visual observations, and (2) less training data availability . In this paper, to address the above two challenges, a novel {\\em Global Segmentation Mask Transformer} (GSMT) is proposed. Compared to object detection transformers, it is architecturally reformulated with the core idea to drive the transformer to learn {\\em global segmentation masks} of all action instances jointly at the full video length. Supervised by such global temporal structure signals, GSMT allows to more effectively train from limited complex video data. Due to modeling TAL holistically rather than locally to each individual proposal, our model also differs significantly to the conventional proposal-based TAL methods that learn to detect local start and end points of action instances using more complex architectures. Extensive experiments show that despite its simpler design, GSMT outperforms existing TAL methods, achieving new state-of-the-art performance on two benchmarks. Importantly, it is around $\\bf{100\\times}$ faster to train and twice as efficient for inference. 
", "keywords": "Temporal Action Localization;Transformer;Global Contextual Learning;Self-attention Learning", "primary_area": "", "supplementary_material": "", "author": "Sauradip Nag;Xiatian Zhu;Yi-Zhe Song;Tao Xiang", "authorids": "~Sauradip_Nag1;~Xiatian_Zhu3;~Yi-Zhe_Song2;~Tao_Xiang1", "gender": "M;;M;M", "homepage": ";https://x-up-lab.github.io;http://personal.ee.surrey.ac.uk/Personal/Y.Song/;https://www.surrey.ac.uk/people/tao-xiang", "dblp": "222/2994;128/7935;98/1684;22/4460-2.html", "google_scholar": "hlkMCO4AAAAJ;ZbA-z1cAAAAJ;https://scholar.google.co.uk/citations?user=irZFP_AAAAAJ;MeS5d4gAAAAJ", "orcid": ";0000-0002-9284-2955;;0000-0002-2530-1059", "linkedin": ";;;", "or_profile": "~Sauradip_Nag1;~Xiatian_Zhu3;~Yi-Zhe_Song2;~Tao_Xiang1", "aff": "University of Surrey;Samsung AI Centre, Cambridge, UK;University of Surrey;University of Surrey", "aff_domain": "surrey.ac.uk;samsung.com;surrey.ac.uk;surrey.ac.uk", "position": "PhD student;Researcher;Professor;Full Professor", "bibtex": "@misc{\nnag2022temporal,\ntitle={Temporal Action Localization with Global Segmentation Mask Transformers},\nauthor={Sauradip Nag and Xiatian Zhu and Yi-Zhe Song and Tao Xiang},\nyear={2022},\nurl={https://openreview.net/forum?id=VuEqOs9Yp7Q}\n}", "github": "", "project": "", "reviewers": "KEho;CdYK;531h;mkGj", "site": "https://openreview.net/forum?id=VuEqOs9Yp7Q", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;4;3", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "75;49;61;45", "wc_summary_review": "121;34;54;50", "wc_main_review": "688;336;468;235", "wc_review": "884;419;583;330", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 57.5, 11.6940155635265 ], "wc_summary_review_avg": [ 64.75, 33.32697856091968 ], "wc_main_review_avg": [ 431.75, 169.45261136966877 ], "wc_review_avg": [ 554.0, 211.03435739234502 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.986440050415621, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14841093837601547810&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Surrey;Samsung", "aff_unique_dep": ";AI Centre", "aff_unique_url": "https://www.surrey.ac.uk;https://www.samsung.com/uk/", "aff_unique_abbr": "Surrey;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "VuW5ojKGI43", "title": "Protecting Your NLG Models with Semantic and Robust Watermarks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Natural language generation (NLG) applications have gained great popularity due to the powerful deep learning techniques and large training corpus. The deployed NLG models may be stolen or used without authorization, while watermark has become a useful tool to protect Intellectual Property (IP). 
However, existing watermark technologies are easily detected or harmful to the applications. In this paper, we propose a semantic and robust watermarking scheme for NLG models that utilizes pair-matched phrases as watermarks for IP protection. The watermarks give NLG models a personal preference for some special phrase combinations. When the key phrase appears behind a specific prefix phrase, the model gives a congenial prediction for the key phrase. We use word-tag n-grams to generate semantic watermarks that are syntactically correct. For the key phrase's prediction, we choose the original model's second prediction, which causes almost no harm to the task and remains undetectable. Extensive experimental results demonstrate the effectiveness, robustness, and undetectability of the proposed scheme.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tao Xiang;Chunlong Xie;Shangwei Guo;Jiwei Li;Tianwei Zhang", "authorids": "~Tao_Xiang2;bluedask@cqu.edu.dn;~Shangwei_Guo1;~Jiwei_Li1;~Tianwei_Zhang1", "gender": "M;;M;M;M", "homepage": ";;http://www.cs.cqu.edu.cn/info/1332/5290.htm;https://nlp.stanford.edu/~bdlijiwei/;https://personal.ntu.edu.sg/tianwei.zhang/index.html", "dblp": "22/4460-1.html;;176/6479;73/5746-1;77/7902-4", "google_scholar": "https://scholar.google.com/citations?hl=en;;wQrVkBYAAAAJ;PwU16JEAAAAJ;9vpiYDIAAAAJ", "orcid": "0000-0002-9439-4623;;;;", "linkedin": ";;;;", "or_profile": "~Tao_Xiang2;bluedask@cqu.edu.dn;~Shangwei_Guo1;~Jiwei_Li1;~Tianwei_Zhang1", "aff": "Chongqing University;;Chongqing University;Zhejiang University;Nanyang Technological University", "aff_domain": "cqu.edu.cn;;cqu.edu.cn;zju.edu.cn;ntu.edu.sg", "position": "Full Professor;;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nxiang2022protecting,\ntitle={Protecting Your {NLG} Models with Semantic and Robust Watermarks},\nauthor={Tao Xiang and Chunlong Xie and Shangwei Guo and Jiwei Li and Tianwei Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=VuW5ojKGI43}\n}", "github": "", "project": "", "reviewers": "2u8x;FEV9;Lyy7", "site": "https://openreview.net/forum?id=VuW5ojKGI43", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;3;4", "correctness": "2;3;3", "technical_novelty": "3;2;4", "empirical_novelty": "4;2;3", "wc_summary_paper": "134;91;80", "wc_summary_review": "38;48;41", "wc_main_review": "421;233;232", "wc_review": "593;372;353", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 101.66666666666667, 23.299976156401723 ], "wc_summary_review_avg": [ 42.333333333333336, 4.189935029992179 ], "wc_main_review_avg": [ 295.3333333333333, 88.86068997156292 ], "wc_review_avg": [ 439.3333333333333, 108.93525090111507 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9999999999999997, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2355458185487406971&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 3, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Chongqing University;Zhejiang University;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cqu.edu.cn;https://www.zju.edu.cn;https://www.ntu.edu.sg", "aff_unique_abbr": "CQU;ZJU;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;Singapore" }, { "id": "Vvb-eicR8N", "title": "Learning-Augmented Sketches for Hessians", "track": "main", "status": "Reject", "tldr": "", "abstract": "Sketching is a dimensionality reduction technique where one compresses a matrix by linear combinations that are typically chosen at random. A line of work has shown how to sketch the Hessian to speed up each iteration in a second order method, but such sketches usually depend only on the matrix at hand, and in a number of cases are even oblivious to the input matrix. One could instead hope to learn a distribution on sketching matrices that is optimized for the specific distribution of input matrices. We show how to design learned sketches for the Hessian in the context of second order methods. We prove that a smaller sketching dimension of the column space of a tall matrix is possible, assuming the knowledge of the indices of the rows of large leverage scores. This would lead to faster convergence of the iterative Hessian sketch procedure. We also design a new objective to learn the sketch, whereby we optimize the subspace embedding property of the sketch. We show empirically that learned sketches, compared with their \"non-learned\" counterparts, do improve the approximation accuracy for important problems, including LASSO and matrix estimation with nuclear norm constraints.", "keywords": "least squares;convex optimization;iterative Hessian sketch;subspace embedding;learning-augmented sketch", "primary_area": "", "supplementary_material": "/attachment/58ca3e24310ac5061ef25c0cbb0ba3fee7eaeeab.zip", "author": "Yi Li;Honghao Lin;David Woodruff", "authorids": "~Yi_Li8;~Honghao_Lin1;~David_Woodruff1", "gender": "M;M;M", "homepage": ";https://honghlin.github.io;http://www.cs.cmu.edu/~dwoodruf/", "dblp": "59/871-2;https://dblp.uni-trier.de/pid/264/2663.html;w/DPWoodruff", "google_scholar": ";;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yi_Li8;~Honghao_Lin1;~David_Woodruff1", "aff": "Nanyang Technological University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "ntu.edu.sg;cmu.edu;cmu.edu", "position": "Assistant Professor;PhD student;Associate Professor", "bibtex": "@misc{\nli2022learningaugmented,\ntitle={Learning-Augmented Sketches for Hessians },\nauthor={Yi Li and Honghao Lin and David Woodruff},\nyear={2022},\nurl={https://openreview.net/forum?id=Vvb-eicR8N}\n}", "github": "", "project": "", "reviewers": "SAjC;kP9y;zU27;tMi8", "site": "https://openreview.net/forum?id=Vvb-eicR8N", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;3;3;3", "correctness": "4;4;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "59;62;137;34", "wc_summary_review": "55;104;77;33", "wc_main_review": "134;765;358;436", "wc_review": "248;931;572;503", "wc_reply_reviewers": "0;113;172;165", "wc_reply_authors": "242;1336;787;1284", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], 
"technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 73.0, 38.516230345141516 ], "wc_summary_review_avg": [ 67.25, 26.309456474811487 ], "wc_main_review_avg": [ 423.25, 226.31766943833617 ], "wc_review_avg": [ 563.5, 244.09475619111527 ], "wc_reply_reviewers_avg": [ 112.5, 68.83494751941052 ], "wc_reply_authors_avg": [ 912.25, 442.3473606793647 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18215798707448204443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Nanyang Technological University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.cmu.edu", "aff_unique_abbr": "NTU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Singapore;United States" }, { "id": "Vvmj4zGU_z3", "title": "To Smooth or not to Smooth? On Compatibility between Label Smoothing and Knowledge Distillation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This work investigates the compatibility between label smoothing (LS) and knowledge distillation (KD). Contemporary findings addressing this thesis statement take dichotomous standpoints. Specifically, Muller et al. [1] claim that LS erases relative information in the logits; therefore a LS-trained teacher can hurt KD. On the contrary, Shen et al. [2] claim that LS enlarges the distance between semantically similar classes; therefore a LS-trained teacher is compatible with KD. Critically, there is no effort to understand and resolve these contradictory findings, leaving the primal question $-$ to smooth or not to smooth a teacher network? $-$ unanswered. \n\nIn this work, we establish a foundational understanding on the compatibility between LS and KD. We begin by meticulously scrutinizing these contradictory findings under a unified empirical consistency. Through our profound investigation, we discover that in the presence of a LS-trained teacher, KD at higher temperatures systematically diffuses penultimate layer representations learnt by the student towards semantically similar classes. This systematic diffusion essentially curtails the benefits of distilling from a LS-trained teacher, thereby rendering KD at increased temperatures ineffective. We show this systematic diffusion qualitatively by visualizing penultimate layer representations, and quantitatively using our proposed relative distance metric called diffusion index ($\\eta$). \n\nImportantly, our discovered systematic diffusion was the missing concept which is instrumental in understanding and resolving these contradictory findings. Our discovery is comprehensively supported by large-scale experiments and analyses including image classification (standard, fine-grained), neural machine translation and compact student network distillation tasks spanning across multiple datasets and teacher-student architectures. Finally, we shed light on the question $-$ to smooth or not to smooth a teacher network? 
$-$ in order to help practitioners make informed decisions.", "keywords": "label smoothing;knowledge distillation;systematic diffusion;semantically similar classes", "primary_area": "", "supplementary_material": "/attachment/c9b95152b7b2f024b9434428dfaf550cb28b1af3.zip", "author": "Keshigeyan Chandrasegaran;Ngoc-Trung Tran;Yunqing ZHAO;Ngai-man Cheung", "authorids": "~Keshigeyan_Chandrasegaran1;~Ngoc-Trung_Tran2;~Yunqing_ZHAO1;~Ngai-man_Cheung1", "gender": "M;;;M", "homepage": "https://keshik6.github.io/;;;https://sites.google.com/site/mancheung0407/", "dblp": "289/0842;131/3529.html;;82/3605", "google_scholar": "vh2Ywj8AAAAJ;https://scholar.google.com.sg/citations?user=9SE3GYMAAAAJ;;https://scholar.google.com.sg/citations?hl=en", "orcid": ";0000-0002-1308-9142;;0000-0003-0135-3791", "linkedin": "keshigeyan-chandrasegaran/;;;", "or_profile": "~Keshigeyan_Chandrasegaran1;~Ngoc-Trung_Tran2;~Yunqing_ZHAO1;~Ngai-man_Cheung1", "aff": "Singapore University of Technology and Design;SeeingMachines Inc.;;Singapore University of Technology and Design", "aff_domain": "sutd.edu.sg;seeingmachines.com;;sutd.edu.sg", "position": "Researcher;Sensior Scientist;;Associate Professor", "bibtex": "@misc{\nchandrasegaran2022to,\ntitle={To Smooth or not to Smooth? On Compatibility between Label Smoothing and Knowledge Distillation},\nauthor={Keshigeyan Chandrasegaran and Ngoc-Trung Tran and Yunqing ZHAO and Ngai-man Cheung},\nyear={2022},\nurl={https://openreview.net/forum?id=Vvmj4zGU_z3}\n}", "github": "", "project": "", "reviewers": "StsE;gUs8;VCvi;hCXC", "site": "https://openreview.net/forum?id=Vvmj4zGU_z3", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;4;3;4", "correctness": "4;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;0;4", "wc_summary_paper": "91;133;53;143", "wc_summary_review": "30;72;242;26", "wc_main_review": "238;316;463;237", "wc_review": "359;521;758;406", "wc_reply_reviewers": "131;79;595;142", "wc_reply_authors": "1982;4082;6280;2253", "reply_reviewers": "1;1;3;1", "reply_authors": "5;10;13;6", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 105.0, 35.805027579936315 ], "wc_summary_review_avg": [ 92.5, 88.17454281140334 ], "wc_main_review_avg": [ 313.5, 92.07198270918249 ], "wc_review_avg": [ 511.0, 154.30327281039763 ], "wc_reply_reviewers_avg": [ 236.75, 208.19987391927017 ], "wc_reply_authors_avg": [ 3649.25, 1720.272852631233 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 8.5, 3.2015621187164243 ], "replies_avg": [ 48, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16464524205402216378&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Singapore University of Technology and Design;SeeingMachines", "aff_unique_dep": ";", "aff_unique_url": "https://www.sutd.edu.sg;https://www.seeingmachines.com", "aff_unique_abbr": "SUTD;SeeingMachines", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;Australia" }, { "id": "VwSHZgruNEc", "title": "Safe Opponent-Exploitation Subgame Refinement", "track": "main", "status": "Reject", "tldr": "", "abstract": "Search algorithms 
have been playing a vital role in the success of superhuman AI in both perfect information and imperfect information games. Specifically, search algorithms can generate a refinement of Nash equilibrium (NE) approximation in games such as Texas hold'em with theoretical guarantees. However, when confronted with opponents of limited rationality, an NE strategy tends to be overly conservative, because it prefers to maintain its low exploitability rather than actively exploit the weaknesses of opponents. In this paper, we investigate the dilemma of safety and opponent exploitation. We present a new real-time search framework that smoothly interpolates between the two extremes of strategy search, hence unifying safe search and opponent exploitation. We provide our new strategy with a theoretically upper-bounded exploitability and lower-bounded reward against an opponent. Our method can exploit the weakness of its opponent without significantly sacrificing its exploitability. Empirical results show that our method significantly outperforms NE baselines when opponents play non-NE strategies and maintains low exploitability at the same time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingyang Liu;Chengjie Wu;Qihan Liu;Yansen Jing;Jun Yang;Pingzhong Tang;Chongjie Zhang", "authorids": "~Mingyang_Liu1;~Chengjie_Wu1;~Qihan_Liu1;jingys19@mails.tsinghua.edu.cn;~Jun_Yang6;~Pingzhong_Tang1;~Chongjie_Zhang1", "gender": "M;M;M;;M;;", "homepage": "https://liumy.netlify.app/;;https://github.com/liuqh16;;;;", "dblp": "133/7678;70/6141;;;;96/3886;29/6693", "google_scholar": "EqobCqwAAAAJ;fXL69VsAAAAJ;a3J4_OQAAAAJ;;ZrgN9ssAAAAJ;;LjxqXycAAAAJ", "orcid": ";;0000-0001-6637-8346;;;;", "linkedin": ";;;;;;", "or_profile": "~Mingyang_Liu1;~Chengjie_Wu1;~Qihan_Liu1;jingys19@mails.tsinghua.edu.cn;~Jun_Yang6;~Pingzhong_Tang1;~Chongjie_Zhang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Undergrad student;PhD student;PhD student;;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@misc{\nliu2022safe,\ntitle={Safe Opponent-Exploitation Subgame Refinement},\nauthor={Mingyang Liu and Chengjie Wu and Qihan Liu and Yansen Jing and Jun Yang and Pingzhong Tang and Chongjie Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=VwSHZgruNEc}\n}", "github": "", "project": "", "reviewers": "f5FT;hmTT;Wp7Z;SWZo", "site": "https://openreview.net/forum?id=VwSHZgruNEc", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;3;4", "correctness": "4;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "63;59;80;81", "wc_summary_review": "16;103;35;37", "wc_main_review": "177;321;338;795", "wc_review": "256;483;453;913", "wc_reply_reviewers": "453;211;255;339", "wc_reply_authors": "862;1110;274;877", "reply_reviewers": "1;1;1;1", "reply_authors": "2;4;1;3", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 70.75, 9.858372076565177 ], "wc_summary_review_avg": [ 47.75, 32.93459427410637 ], "wc_main_review_avg": [ 407.75, 232.16306230750834 ], "wc_review_avg": [ 526.25, 239.71167576903716 ], "wc_reply_reviewers_avg": [ 314.5,
92.24288590455093 ], "wc_reply_authors_avg": [ 780.75, 308.6530212066618 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.16012815380508713, "corr_recommendation_correctness": -0.8320502943378437, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6781739090143445422&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "Vx8l4vwv94", "title": "JOINTLY LEARNING TOPIC SPECIFIC WORD AND DOCUMENT EMBEDDING", "track": "main", "status": "Reject", "tldr": "", "abstract": "Document embedding generally ignores underlying topics, which fails to capture polysemous terms that can lead to improper thematic representation. Moreover, embedding a new document during the test process requires a complex and expensive inference method. Some models first learn word embeddings and later learn underlying topics using a clustering algorithm for document representation; those methods miss the mutual interaction between the two paradigms. To this end, we propose a novel document-embedding method based on the weighted averaging of jointly learned topic-specific word embeddings, called TDE: Topical Document Embedding, which efficiently captures syntactic and semantic properties by utilizing three levels of knowledge, i.e., word, topic, and document. TDE obtains document vectors on the fly during the joint learning process of the topical word embeddings.
Experiments demonstrate that the proposed method yields better topical word embeddings when using the document vector as a global context, and better document classification results on the obtained document embeddings than recent related models.", "keywords": "Language modeling \u00b7 Document embedding \u00b7 Natural language processing \u00b7 Machine learning", "primary_area": "", "supplementary_material": "/attachment/8a780ca27a6122c96951b1273319e6eb24741886.zip", "author": "Farid Uddin;Zuping Zhang", "authorids": "~Farid_Uddin1;~Zuping_Zhang1", "gender": "M;M", "homepage": ";http://faculty.csu.edu.cn/zhangzuping/zh_CN/index.htm", "dblp": ";220/2325.html", "google_scholar": "https://scholar.google.com/citations?view_op=new_articles;Uu6GaEsAAAAJ", "orcid": ";0000-0002-2528-7808", "linkedin": ";", "or_profile": "~Farid_Uddin1;~Zuping_Zhang1", "aff": "Central South University, China;Central South University, China", "aff_domain": "csu.edu.cn;csu.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@misc{\nuddin2022jointly,\ntitle={{JOINTLY} {LEARNING} {TOPIC} {SPECIFIC} {WORD} {AND} {DOCUMENT} {EMBEDDING}},\nauthor={Farid Uddin and Zuping Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=Vx8l4vwv94}\n}", "github": "", "project": "", "reviewers": "iEk8;Nesz;PLvo;a6Hd", "site": "https://openreview.net/forum?id=Vx8l4vwv94", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;4", "correctness": "2;2;3;4", "technical_novelty": "2;1;1;2", "empirical_novelty": "2;1;1;2", "wc_summary_paper": "76;68;110;81", "wc_summary_review": "28;40;49;15", "wc_main_review": "261;85;450;411", "wc_review": "365;193;609;507", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 83.75, 15.848895860595462 ], "wc_summary_review_avg": [ 33.0, 12.786711852544421 ], "wc_main_review_avg": [ 301.75, 143.6617120181992 ], "wc_review_avg": [ 418.5, 156.39293462301933 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ScwdeYXzfSIJ:scholar.google.com/&scioq=JOINTLY+LEARNING+TOPIC+SPECIFIC+WORD+AND+DOCUMENT+EMBEDDING&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Central South University", "aff_unique_dep": "", "aff_unique_url": "http://www.csu.edu.cn", "aff_unique_abbr": "CSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "Vy5WbmrVPaD", "title": "Pretext Tasks Selection for Multitask Self-Supervised Speech Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Through solving pretext tasks, self-supervised learning leverages unlabeled data to extract useful latent representations replacing traditional input features in the downstream task. In audio/speech signal processing, a wide range of features were engineered through decades of research efforts.
As it turns out, learning to predict such features (a.k.a pseudo-labels) has proven to be a particularly relevant pretext task, leading to useful self-supervised representations which prove to be effective for downstream tasks. However, methods and common practices for combining such pretext tasks for better performance on the downstream task have not been explored and understood properly. In fact, the process relies almost exclusively on a computationally heavy experimental procedure, which becomes intractable with the increase of the number of pretext tasks. This paper introduces a method to select a group of pretext tasks among a set of candidates. The method we propose estimates calibrated weights for the partial losses corresponding to the considered pretext tasks during the self-supervised training process. The experiments conducted on automatic speech recognition, speaker and emotion recognition validate our approach, as the groups selected and weighted with our method perform better than classic baselines, thus facilitating the selection and combination of relevant pseudo-labels for self-supervised representation learning.\n", "keywords": "Self-Supervised Learning;Speech Processing;Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Salah Zaiem;Titouan Parcollet;Slim Essid;Abdelwahab HEBA", "authorids": "~Salah_Zaiem1;~Titouan_Parcollet1;~Slim_Essid1;~Abdelwahab_HEBA1", "gender": "M;M;Not Specified;", "homepage": ";http://www.darnault-parcollet.fr/Parcollet/index.html;https://perso.telecom-paris.fr/essid/;", "dblp": "232/3285;https://dblp.org/pers/hd/p/Parcollet:Titouan;53/6904;", "google_scholar": ";;5dP_Pv0AAAAJ;mR6_x0MAAAAJ", "orcid": ";;;", "linkedin": ";titouan-parcollet-b233a698;;abdel-heba/", "or_profile": "~Salah_Zaiem1;~Titouan_Parcollet1;~Slim_Essid1;~Abdelwahab_HEBA1", "aff": "T\u00e9l\u00e9com ParisTech;Avignon University;T\u00e9l\u00e9com ParisTech;", "aff_domain": "enst.fr;univ-avignon.fr;telecom-paristech.fr;", "position": "PhD student;Associate Professor;Full Professor;", "bibtex": "@misc{\nzaiem2022pretext,\ntitle={Pretext Tasks Selection for Multitask Self-Supervised Speech Representation Learning},\nauthor={Salah Zaiem and Titouan Parcollet and Slim Essid and Abdelwahab HEBA},\nyear={2022},\nurl={https://openreview.net/forum?id=Vy5WbmrVPaD}\n}", "github": "", "project": "", "reviewers": "zWkh;Q1kK;ybV2", "site": "https://openreview.net/forum?id=Vy5WbmrVPaD", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "3;5;3", "correctness": "2;2;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "133;78;58", "wc_summary_review": "79;34;19", "wc_main_review": "358;139;100", "wc_review": "570;251;177", "wc_reply_reviewers": "0;176;0", "wc_reply_authors": "796;1716;232", "reply_reviewers": "0;1;0", "reply_authors": "1;3;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 89.66666666666667, 31.710495984067414 ], "wc_summary_review_avg": [ 44.0, 25.495097567963924 ], "wc_main_review_avg": [ 199.0, 113.55175031676086 ], "wc_review_avg": [ 332.6666666666667, 170.51751294873563 ], "wc_reply_reviewers_avg": [ 58.666666666666664, 82.96719565922157 ], "wc_reply_authors_avg": [ 914.6666666666666, 611.623704213265 ], "reply_reviewers_avg": [ 
0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 1.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14546011024435351066&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;0", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech;Avignon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.telecom-paristech.fr;https://www.univ-avignon.fr", "aff_unique_abbr": "TP;U. Avignon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "id": "VyZRObZ19kt", "title": "Learned Index with Dynamic $\\epsilon$", "track": "main", "status": "Reject", "tldr": "", "abstract": "Index structures are a fundamental component of databases and facilitate broad data retrieval applications. Recent learned index methods show superior performance by learning the hidden yet useful data distribution with the help of machine learning, and provide a guarantee that the prediction error is no more than a pre-defined $\\epsilon$. However, existing learned index methods adopt a fixed $\\epsilon$ for all the learned segments, neglecting the diverse characteristics of different data localities. In this paper, we propose a mathematically-grounded learned index framework with dynamic $\\epsilon$, which is efficient and pluggable into existing learned index methods. We theoretically analyze prediction error bounds that link $\\epsilon$ with data characteristics for an illustrative learned index method. Under the guidance of the derived bounds, we learn how to vary $\\epsilon$ and improve the index performance with a better space-time trade-off.
Experiments with real-world datasets and several state-of-the-art methods demonstrate the efficiency, effectiveness and usability of the proposed framework.", "keywords": "Learned Index;Dynamic $\\epsilon$", "primary_area": "", "supplementary_material": "", "author": "Daoyuan Chen;Wuchao Li;Yaliang Li;Bolin Ding;Kai Zeng;Defu Lian;Jingren Zhou", "authorids": "~Daoyuan_Chen1;~Wuchao_Li1;~Yaliang_Li1;~Bolin_Ding3;~Kai_Zeng4;~Defu_Lian1;~Jingren_Zhou1", "gender": "M;M;M;M;M;M;M", "homepage": "https://yxdyc.github.io/;https://scholar.google.com/citations?user=3WjhtxYAAAAJ&hl=en&oi=ao;https://sites.google.com/site/yaliangli/;https://bolinding.github.io/;https://kai-zeng.github.io/;https://faculty.ustc.edu.cn/liandefu/en/index.htm;", "dblp": "217/4891;238/4953;https://dblp.org/pers/hd/l/Li:Yaliang;46/3522.html;80/1651-2.html;87/10734;84/2644", "google_scholar": "https://scholar.google.com.hk/citations?user=1GdfinUAAAAJ;https://scholar.google.com/citations?view_op=list_works;CCPBcdYAAAAJ;AjYkTi8AAAAJ;sEl5MeQAAAAJ;QW0ad4sAAAAJ;", "orcid": "0000-0002-8015-2121;0009-0004-8789-2319;0000-0002-4204-6096;;;0000-0002-3507-9607;", "linkedin": ";liwuchao;;bolin-ding-50a0119/;;;", "or_profile": "~Daoyuan_Chen1;~Wuchao_Li1;~Yaliang_Li1;~Bolin_Ding3;~Kai_Zeng4;~Defu_Lian1;~Jingren_Zhou1", "aff": "Alibaba Group;University of Science and Technology of China;Alibaba Group;Alibaba Group;Alibaba Group;University of Science and Technology of China;Alibaba Group", "aff_domain": "alibaba-inc.com;ustc.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com", "position": "Staff;PhD student;Staff Engineer;Senior Director;Researcher;Full Professor;Researcher", "bibtex": "@misc{\nchen2022learned,\ntitle={Learned Index with Dynamic \\${\\textbackslash}epsilon\\$},\nauthor={Daoyuan Chen and Wuchao Li and Yaliang Li and Bolin Ding and Kai Zeng and Defu Lian and Jingren Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=VyZRObZ19kt}\n}", "github": "", "project": "", "reviewers": "it4h;Fjr7;bkBs", "site": "https://openreview.net/forum?id=VyZRObZ19kt", "pdf_size": 0, "recommendation": "3;3;8", "confidence": "3;4;3", "correctness": "3;2;3", "technical_novelty": "2;3;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "267;60;68", "wc_summary_review": "29;21;22", "wc_main_review": "260;245;141", "wc_review": "556;326;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "582;809;284", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 4.666666666666667, 2.357022603955158 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 131.66666666666666, 95.75083405496905 ], "wc_summary_review_avg": [ 24.0, 3.559026084010437 ], "wc_main_review_avg": [ 215.33333333333334, 52.91712598225855 ], "wc_review_avg": [ 371.0, 136.44290625263986 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 558.3333333333334, 214.98268664140272 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.5000000000000001, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;0;0;0;1;0", "aff_unique_norm": "Alibaba Group;University 
of Science and Technology of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;http://www.ustc.edu.cn", "aff_unique_abbr": "Alibaba;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "ExT5: Towards Extreme Multi-Task Scaling for Transfer Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7043", "id": "Vzh1BFUCiIX", "poster": "", "openreview": "https://openreview.net/forum?id=Vzh1BFUCiIX", "slides": "https://iclr.cc/virtual/2022/poster/7043", "video": "https://iclr.cc/virtual/2022/poster/7043", "author_site": "Vamsi Aribandi, Yi Tay, Tal Schuster, Jinfeng Rao, Huaixiu Steven Zheng, Sanket Vaibhav Mehta, Honglei Zhuang, Vinh Tran, Dara Bahri, Jianmo Ni, Jai Gupta, Kai Hui, Sebastian Ruder, Donald Metzler", "tldr": "", "abstract": "Despite the recent success of multi-task learning and transfer learning for natural language processing (NLP), few works have systematically studied the effect of scaling up the number of tasks during pre-training. Towards this goal, this paper introduces ExMix (Extreme Mixture): a massive collection of 107 supervised NLP tasks across diverse domains and task-families. Using ExMix, we study the effect of multi-task pre-training at the largest scale to date, and analyze co-training transfer amongst common families of tasks. Through this analysis, we show that manually curating an ideal set of tasks for multi-task pre-training is not straightforward, and that multi-task scaling can vastly improve models on its own. Finally, we propose ExT5: a model pre-trained using a multi-task objective of self-supervised span denoising and supervised ExMix. Via extensive experiments, we show that ExT5 outperforms strong T5 baselines on SuperGLUE, GEM, Rainbow, Closed-Book QA tasks, and several tasks outside of ExMix. ExT5 also significantly improves sample efficiency while pre-training.", "keywords": "Natural Language Processing;Transfer Learning;Multi-task Learning", "primary_area": "", "supplementary_material": "", "author": "Vamsi Aribandi;Yi Tay;Tal Schuster;Jinfeng Rao;Huaixiu Steven Zheng;Sanket Vaibhav Mehta;Honglei Zhuang;Vinh Q. 
Tran;Dara Bahri;Jianmo Ni;Jai Gupta;Kai Hui;Sebastian Ruder;Donald Metzler", "authorids": "~Vamsi_Aribandi1;~Yi_Tay1;~Tal_Schuster1;~Jinfeng_Rao2;~Huaixiu_Steven_Zheng1;~Sanket_Vaibhav_Mehta2;~Honglei_Zhuang1;~Vinh_Q._Tran1;~Dara_Bahri1;~Jianmo_Ni2;~Jai_Gupta1;~Kai_Hui1;~Sebastian_Ruder2;~Donald_Metzler1", "gender": "M;M;Not Specified;;M;M;M;M;M;;M;M;;M", "homepage": "https://vamsi-aribandi.github.io/;http://yitay.net;https://people.csail.mit.edu/tals/;;;https://sanketvmehta.github.io;https://hongleizhuang.github.io/;https://vqtran.github.io;http://www.dara.run;;;https://khui.github.io/;;https://research.google/people/DonaldMetzler/", "dblp": "286/8454;;190/7491;;307/3201;225/7804;10/9988;77/2885-2.html;231/7656;161/2449;154/6787-1;37/10077;;95/2272", "google_scholar": "P1sLApYAAAAJ;VBclY_cAAAAJ;oo8QRmIAAAAJ;;PyK4x4wAAAAJ;H4pn-ogAAAAJ;FxEDj4wAAAAJ;ot3WsOwAAAAJ;j5PpTOwAAAAJ;VECFLiAAAAAJ;;VorTj3AAAAAJ;;bmXpOd8AAAAJ", "orcid": "0000-0002-4345-1763;;;;;0000-0003-1809-4685;0000-0001-8134-1509;;;;;0000-0002-3110-7404;;0000-0003-4276-6269", "linkedin": "vamsi-aribandi/;;;;;sanketvmehta/;;vinh-tran-32597468/;;;;;;donmetzler/", "or_profile": "~Vamsi_Aribandi1;~Yi_Tay1;~Tal_Schuster1;~Jinfeng_Rao2;~Huaixiu_Steven_Zheng1;~Sanket_Vaibhav_Mehta2;~Honglei_Zhuang1;~Vinh_Q._Tran1;~Dara_Bahri1;~Jianmo_Ni2;~Jai_Gupta1;~Kai_Hui1;~Sebastian_Ruder2;~Donald_Metzler1", "aff": "Google;Google;Google;;Google;Carnegie Mellon University;Google DeepMind;Google;Google Research;Google;Google Inc;Google;;Google", "aff_domain": "google.com;google.com;google.com;;google.com;cmu.edu;google.com;google.com;google.com;google.com;google.com;google.com;;google.com", "position": "AI Resident;Research Scientist;Researcher;;Software Engineer;PhD student;Research Scientist;Researcher;Research Scientist;Software engineer;Researcher;Software Engineer;;Research Scientist", "bibtex": "@inproceedings{\naribandi2022ext,\ntitle={ExT5: Towards Extreme Multi-Task Scaling for Transfer Learning},\nauthor={Vamsi Aribandi and Yi Tay and Tal Schuster and Jinfeng Rao and Huaixiu Steven Zheng and Sanket Vaibhav Mehta and Honglei Zhuang and Vinh Q. 
Tran and Dara Bahri and Jianmo Ni and Jai Gupta and Kai Hui and Sebastian Ruder and Donald Metzler},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Vzh1BFUCiIX}\n}", "github": "", "project": "", "reviewers": "ri6r;4oQF;6JD9;u5L1", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;5;4", "correctness": "3;4;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "99;102;420;312", "wc_summary_review": "73;100;41;46", "wc_main_review": "1136;303;517;112", "wc_review": "1308;505;978;470", "wc_reply_reviewers": "0;21;0;0", "wc_reply_authors": "1281;436;559;54", "reply_reviewers": "0;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 233.25, 138.13648142326485 ], "wc_summary_review_avg": [ 65.0, 23.590252224170897 ], "wc_main_review_avg": [ 517.0, 385.0266224561621 ], "wc_review_avg": [ 815.25, 348.11734731265545 ], "wc_reply_reviewers_avg": [ 5.25, 9.093266739736606 ], "wc_reply_authors_avg": [ 582.5, 444.19280723577685 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 14, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 222, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6680726853502237450&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Vzh1BFUCiIX", "email": "google.com;google.com;google.com;;google.com;cmu.edu;google.com;google.com;google.com;google.com;google.com;google.com;;google.com", "author_num": 14, "aff_unique_index": "0;0;0;0;1;0;0;0;0;0;0;0", "aff_unique_norm": "Google;Carnegie Mellon University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.cmu.edu", "aff_unique_abbr": "Google;CMU", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;1;0;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "W08IqLMlMer", "title": "Offline Pre-trained Multi-Agent Decision Transformer", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning leverages static datasets to learn optimal policies with no necessity to access the environment. This is desirable for multi-agent systems due to the expensiveness of agents' online interactions and the demand for sample numbers. Yet, in multi-agent reinforcement learning (MARL), the paradigm of offline pre-training with online fine-tuning has never been reported, nor are datasets or benchmarks for offline MARL research available. In this paper, we intend to investigate whether offline training is able to learn policy representations that elevate performance on downstream MARL tasks. We introduce the first offline dataset based on StarCraftII with diverse quality levels and propose a multi-agent decision transformer (MADT) for effective offline learning. MADT integrates the powerful temporal representation learning ability of Transformer into both offline and online multi-agent learning, which promotes generalisation across agents and scenarios.
The proposed method outperforms the state-of-the-art algorithms in offline MARL. Furthermore, when applied to online tasks, the pre-trained MADT largely improves sample efficiency, even in zero-shot task transfer. To the best of our knowledge, this is the first work to demonstrate the effectiveness of pre-trained models in terms of sample efficiency and generalisability enhancement in MARL.", "keywords": "Multi-Agent Reinforcement Learning;Offline Reinforcement Learning;Machine Learning", "primary_area": "", "supplementary_material": "/attachment/587f5a0e3647f13fe2611d5852a700b40d55546c.zip", "author": "Linghui Meng;Muning Wen;Yaodong Yang;chenyang le;Xi yun Li;Haifeng Zhang;Ying Wen;Weinan Zhang;Jun Wang;Bo XU", "authorids": "~Linghui_Meng1;~Muning_Wen2;~Yaodong_Yang1;~chenyang_le1;~Xi_yun_Li1;~Haifeng_Zhang3;~Ying_Wen1;~Weinan_Zhang1;~Jun_Wang2;~Bo_XU10", "gender": "M;M;M;M;M;;M;M;M;M", "homepage": "https://reinholdm.github.io/Homepage/;https://github.com/morning9393;https://www.yangyaodong.com;https://github.com/nethermanpro;https://lixiyun98.github.io/;https://pkuzhf.github.io;https://yingwen.io;http://wnzhang.net;http://www0.cs.ucl.ac.uk/staff/jun.wang/;", "dblp": "257/9511-1;295/0261;170/1496-1;301/7724;271/5693.html;93/7133-2;41/4203-1;28/10261-1;w/JunWang12;", "google_scholar": ";Zt1WFtQAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;;HRULU6QAAAAJ;;_A1CxG8AAAAJ;Qzss0GEAAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;", "orcid": ";0009-0000-7868-1262;0000-0001-8132-5613;;;;0000-0003-1247-2382;0000-0002-0127-2425;;", "linkedin": ";;yaodong-yang;;;;wenying45;;;%E6%B3%A2-%E5%BE%90-74210b115/?midToken=AQH1EMB1ZoboJA&midSig=2Q5MzMXmNEH9M1&trk=eml-email_pymk_02-header-22-profile&trkEmail=eml-email_pymk_02-header-22-profile-null-7ydrhe~kpggjoav~k9-null-neptune/profile~vanity.view", "or_profile": "~Linghui_Meng1;~Muning_Wen2;~Yaodong_Yang1;~chenyang_le1;~Xi_yun_Li1;~Haifeng_Zhang3;~Ying_Wen1;~Weinan_Zhang1;~Jun_Wang2;~Bo_XU10", "aff": "University of Chinese Academy of Sciences;Shanghai Jiaotong University;King's College London;Shanghai Jiaotong University;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Shanghai Jiaotong University;Shanghai Jiaotong University;University College London;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ucas.edu.cn;sjtu.edu.cn;kcl.ac.uk;sjtu.edu.cn;ia.ac.cn;ia.ac.cn;sjtu.edu.cn;sjtu.edu.cn;ucl.ac.uk;ia.ac.cn", "position": "PhD student;PhD student;Assistant Professor;Undergrad student;PhD student;Associate Professor;Assistant Professor;Associate Professor;Professor;Full Professor", "bibtex": "@misc{\nmeng2022offline,\ntitle={Offline Pre-trained Multi-Agent Decision Transformer},\nauthor={Linghui Meng and Muning Wen and Yaodong Yang and chenyang le and Xi yun Li and Haifeng Zhang and Ying Wen and Weinan Zhang and Jun Wang and Bo XU},\nyear={2022},\nurl={https://openreview.net/forum?id=W08IqLMlMer}\n}", "github": "", "project": "", "reviewers": "nCt4;mKDU;3RgR;yY9Q", "site": "https://openreview.net/forum?id=W08IqLMlMer", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;3", "correctness": "3;3;1;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "3;2;0;3", "wc_summary_paper": "36;39;52;98", "wc_summary_review": "23;52;20;200", "wc_main_review": "280;112;150;498", "wc_review": "339;203;222;796", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0",
"recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 56.25, 24.843258642939738 ], "wc_summary_review_avg": [ 73.75, 73.95395526947831 ], "wc_main_review_avg": [ 260.0, 150.8708056583513 ], "wc_review_avg": [ 390.0, 240.1197617856556 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8278382785351493118&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2;1;3;3;1;1;4;3", "aff_unique_norm": "University of Chinese Academy of Sciences;Shanghai Jiao Tong University;King's College London;Chinese Academy of Sciences;University College London", "aff_unique_dep": ";;;Institute of Automation;", "aff_unique_url": "http://www.ucas.ac.cn;https://www.sjtu.edu.cn;https://www.kcl.ac.uk;http://www.ia.cas.cn;https://www.ucl.ac.uk", "aff_unique_abbr": "UCAS;SJTU;KCL;CAS;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "id": "W0KJGRBH60o", "title": "Dynamic Differential-Privacy Preserving SGD", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Differentially-Private Stochastic Gradient Descent (DP-SGD) prevents training-data privacy breaches by adding noise to the clipped gradient during SGD training to satisfy the differential privacy (DP) definition. On the other hand, the same clipping operation and additive noise across training steps results in unstable updates and even a ramp-up period, which significantly reduces the model's accuracy. In this paper, we extend the Gaussian DP central limit theorem to calibrate the clipping value and the noise power for each individual step separately. We, therefore, are able to propose the dynamic DP-SGD, which has a lower privacy cost than the DP-SGD during updates until they achieve the same target privacy budget at a target number of updates.\nDynamic DP-SGD, in particular, improves model accuracy without sacrificing privacy by gradually lowering both clipping value and noise power while adhering to a total privacy budget constraint. 
Extensive experiments on a variety of deep learning tasks, including image classification, natural language processing, and federated learning, show that the proposed dynamic DP-SGD algorithm stabilizes updates and, as a result, significantly improves model accuracy in the strong privacy protection region when compared to DP-SGD.", "keywords": "Differential Privacy;Deep Learning;DP-SGD", "primary_area": "", "supplementary_material": "/attachment/6756e319431f2d3fb6adcdf7598bd9ddb1950234.zip", "author": "Jian Du;Song Li;Fengran Mo;Siheng Chen", "authorids": "~Jian_Du3;~Song_Li4;~Fengran_Mo1;~Siheng_Chen1", "gender": "Not Specified;M;M;M", "homepage": ";https://github.com/Antoine-ls;https://fengranmark.github.io/;https://siheng-chen.github.io/", "dblp": "58/2985-1.html;;278/7940;136/4945", "google_scholar": "fl77g-UAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-0838-6994;", "linkedin": ";;fengran-mo-7bb771185/;", "or_profile": "~Jian_Du3;~Song_Li4;~Fengran_Mo1;~Siheng_Chen2", "aff": ";Shanghai Jiaotong University;Universit\u00e9 de Montr\u00e9al;Shanghai Jiaotong University", "aff_domain": ";sjtu.edu.cn;umontreal.ca;sjtu.edu.cn", "position": ";MS student;MS student;Associate Professor", "bibtex": "@misc{\ndu2022dynamic,\ntitle={Dynamic Differential-Privacy Preserving {SGD}},\nauthor={Jian Du and Song Li and Fengran Mo and Siheng Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=W0KJGRBH60o}\n}", "github": "", "project": "", "reviewers": "Udru;3meH;GVsS;4bPH", "site": "https://openreview.net/forum?id=W0KJGRBH60o", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "78;30;155;69", "wc_summary_review": "118;26;46;41", "wc_main_review": "191;69;242;143", "wc_review": "387;125;443;253", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 83.0, 45.3155602414888 ], "wc_summary_review_avg": [ 57.75, 35.55541449624796 ], "wc_main_review_avg": [ 161.25, 63.73529242107547 ], "wc_review_avg": [ 302.0, 123.32477447779907 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11488420498577202910&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.umontreal.ca", "aff_unique_abbr": "SJTU;UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Canada" }, { "id": "W2gO9bYYG5P", "title": "Can Vision Transformers Perform Convolution?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Several recent studies have demonstrated that attention-based networks, such as Vision Transformer (ViT), can 
outperform Convolutional Neural Networks (CNNs) on several computer vision tasks without using convolutional layers. This naturally leads to the following question: Can a self-attention layer of ViT express any convolution operation? In this work, we prove that a single ViT layer with image patches as the input can perform any convolution operation constructively, where the multi-head attention mechanism and the relative positional encoding play essential roles. We further provide a lower bound on the number of heads for Vision Transformers to express CNNs. Consistent with our analysis, experimental results show that the construction in our proof can help inject convolutional bias into Transformers and significantly improve the performance of ViT in low data regimes. ", "keywords": "Vision Transformers;CNN;expressive power;multi-head self-attention", "primary_area": "", "supplementary_material": "/attachment/fd3217358d1e0291d9606970755540d4ca00454c.zip", "author": "Shanda Li;Xiangning Chen;Di He;Cho-Jui Hsieh", "authorids": "~Shanda_Li1;~Xiangning_Chen1;~Di_He1;~Cho-Jui_Hsieh1", "gender": "M;M;M;M", "homepage": "https://lithiumda.github.io/;;https://dihe-pku.github.io/;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": "295/9278;56/7393;74/184;14/2770", "google_scholar": ";vNcBx1sAAAAJ;https://scholar.google.co.jp/citations?user=orVoz4IAAAAJ;Wy89g4IAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Shanda_Li1;~Xiangning_Chen1;~Di_He1;~Cho-Jui_Hsieh1", "aff": "Peking University;University of California, Los Angeles;Microsoft;University of California, Los Angeles", "aff_domain": "pku.edu.cn;cs.ucla.edu;microsoft.com;ucla.edu", "position": "Undergrad student;PhD student;Senior Researcher;Assistant Professor", "bibtex": "@misc{\nli2022can,\ntitle={Can Vision Transformers Perform Convolution?},\nauthor={Shanda Li and Xiangning Chen and Di He and Cho-Jui Hsieh},\nyear={2022},\nurl={https://openreview.net/forum?id=W2gO9bYYG5P}\n}", "github": "", "project": "", "reviewers": "LrVC;utEW;ENrv", "site": "https://openreview.net/forum?id=W2gO9bYYG5P", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "5;5;4", "correctness": "1;4;3", "technical_novelty": "2;3;2", "empirical_novelty": "1;3;2", "wc_summary_paper": "60;57;55", "wc_summary_review": "31;39;59", "wc_main_review": "445;235;172", "wc_review": "536;331;286", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "247;47;47", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 57.333333333333336, 2.0548046676563256 ], "wc_summary_review_avg": [ 43.0, 11.775681155103795 ], "wc_main_review_avg": [ 284.0, 116.71332400373147 ], "wc_review_avg": [ 384.3333333333333, 108.80665829299643 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 113.66666666666667, 94.28090415820634 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.6546536707079771, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=843075943760736543&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1", "aff_unique_norm":
"Peking University;University of California, Los Angeles;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "http://www.pku.edu.cn;https://www.ucla.edu;https://www.microsoft.com", "aff_unique_abbr": "Peking U;UCLA;Microsoft", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "id": "W3-hiLnUYl", "title": "On the Practicality of Deterministic Epistemic Uncertainty", "track": "main", "status": "Reject", "tldr": "", "abstract": "A set of novel approaches for estimating epistemic uncertainty in deep neural networks with a single forward pass has recently emerged as a valid alternative to Bayesian Neural Networks. On the premise of informative representations, these deterministic uncertainty methods (DUMs) achieve strong performance on detecting out-of-distribution (OOD) data while adding negligible computational costs at inference time. However, it remains unclear whether DUMs are well calibrated and can seamlessly scale to real-world applications - both prerequisites for their practical deployment. To this end, we first provide a taxonomy of DUMs and evaluate their calibration under continuous distributional shifts. Then, we extend them to semantic segmentation. We find that, while DUMs scale to realistic vision tasks and perform well on OOD detection, the practicality of current methods is undermined by poor calibration under distributional shifts.", "keywords": "uncertainty;epistemic uncertainty;uncertainty calibration", "primary_area": "", "supplementary_material": "/attachment/e443569bc657149d98e76f6c0adfc697155c6f02.zip", "author": "Janis Postels;Mattia Segu;TAO SUN;Luca Sieber;Luc Van Gool;Fisher Yu;Federico Tombari", "authorids": "~Janis_Postels1;~Mattia_Segu1;~TAO_SUN9;~Luca_Sieber1;~Luc_Van_Gool1;~Fisher_Yu2;~Federico_Tombari1", "gender": ";M;M;;M;M;M", "homepage": "https://janispostels.github.io/;https://taosun.io;;;https://www.yf.io/;https://federicotombari.github.io/;https://mattiasegu.github.io/", "dblp": "246/4950;;;61/5017;117/6314;16/3539;245/2565", "google_scholar": "_z8NnVsAAAAJ;x1O-cHYAAAAJ;;https://scholar.google.be/citations?user=TwMib_QAAAAJ;-XCiamcAAAAJ;TFsE4BIAAAAJ;dwX7yWkAAAAJ", "orcid": ";;;;;0000-0001-5598-5212;", "linkedin": "https://linkedin.com/in/jgpostels;;luca-daniel-sieber;;;fedet/;mattia-segu/?originalSubdomain=ch", "or_profile": "~Janis_Postels1;~TAO_SUN9;~Luca_Sieber1;~Luc_Van_Gool1;~Fisher_Yu2;~Federico_Tombari1;~Mattia_Seg\u00f91", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;;KU Leuven;Swiss Federal Institute of Technology;Technical University Munich (TUM);Saarland Informatics Campus, Max-Planck Institute", "aff_domain": "ethz.ch;ethz.ch;;kuleuven.be;ethz.ch;in.tum.de;mpi-inf.mpg.de", "position": "PhD student;MS student;;Emeritus;Assistant Professor;Lecturer;PhD student", "bibtex": "@misc{\npostels2022on,\ntitle={On the Practicality of Deterministic Epistemic Uncertainty},\nauthor={Janis Postels and Mattia Segu and TAO SUN and Luca Sieber and Luc Van Gool and Fisher Yu and Federico Tombari},\nyear={2022},\nurl={https://openreview.net/forum?id=W3-hiLnUYl}\n}", "github": "", "project": "", "reviewers": "WCgZ;JspH;ZZs6;z2yi", "site": "https://openreview.net/forum?id=W3-hiLnUYl", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;4;4;5", "correctness": "1;3;3;4", "technical_novelty": "2;1;1;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "32;30;83;87", 
"wc_summary_review": "60;37;52;35", "wc_main_review": "455;180;401;742", "wc_review": "547;247;536;864", "wc_reply_reviewers": "396;0;102;0", "wc_reply_authors": "1330;121;1132;377", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.0, 1.224744871391589 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 58.0, 27.046256672597043 ], "wc_summary_review_avg": [ 46.0, 10.41633332799983 ], "wc_main_review_avg": [ 444.5, 200.2929105085849 ], "wc_review_avg": [ 548.5, 218.2893721645651 ], "wc_reply_reviewers_avg": [ 124.5, 162.1873916184609 ], "wc_reply_authors_avg": [ 740.0, 504.15622578720576 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.8892972917998875, "corr_recommendation_correctness": 0.9316142209946916, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10237983835645354047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1;0;2;3", "aff_unique_norm": "Swiss Federal Institute of Technology;Katholieke Universiteit Leuven;Technical University Munich;Max-Planck Institute", "aff_unique_dep": ";;;Informatics", "aff_unique_url": "https://www.ethz.ch;https://www.kuleuven.be;https://www.tum.de;https://www.mpi-sws.org", "aff_unique_abbr": "ETH Zurich;KU Leuven;TUM;MPI-SWS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Saarland", "aff_country_unique_index": "0;0;1;0;2;2", "aff_country_unique": "Switzerland;Belgium;Germany" }, { "id": "W5PbuwQFzZx", "title": "Locality-Based Mini Batching for Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training graph neural networks on large graphs is challenging since there is no clear way of how to extract mini batches from connected data. To solve this, previous methods have primarily relied on sampling. While this often leads to good convergence, it introduces significant overhead and requires expensive random data accesses. In this work we propose locality-based mini batching (LBMB), which circumvents sampling by using fixed mini batches based on node locality. LBMB first partitions the training/validation nodes into batches, and then selects the most important auxiliary nodes for each batch using local clustering. Thanks to precomputed batches and consecutive memory accesses, LBMB accelerates training by up to 20x per epoch compared to previous methods, and thus provides significantly better convergence per runtime. 
Moreover, it accelerates inference by up to 100x, at little to no cost in accuracy.", "keywords": "GNN;graph neural network;graphs;scalability;batching;local clustering", "primary_area": "", "supplementary_material": "/attachment/8ef2f17ab630809ef65e727d10ac8a06bb197525.zip", "author": "Johannes Klicpera;Chendi Qian;Stephan G\u00fcnnemann", "authorids": "~Johannes_Klicpera1;~Chendi_Qian1;~Stephan_G\u00fcnnemann1", "gender": "M;;M", "homepage": ";https://github.com/chendiqian;http://www.daml.in.tum.de", "dblp": "228/7897;322/9379;43/3011", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Johannes_Klicpera1;~Chendi_Qian1;~Stephan_G\u00fcnnemann1", "aff": "Technical University Munich;Technische Universit\u00e4t M\u00fcnchen;Technical University Munich", "aff_domain": "tum.de;tum.de;tum.de", "position": "PhD student;MS student;Professor", "bibtex": "@misc{\nklicpera2022localitybased,\ntitle={Locality-Based Mini Batching for Graph Neural Networks},\nauthor={Johannes Klicpera and Chendi Qian and Stephan G{\\\"u}nnemann},\nyear={2022},\nurl={https://openreview.net/forum?id=W5PbuwQFzZx}\n}", "github": "", "project": "", "reviewers": "AWEr;WNRe;myta", "site": "https://openreview.net/forum?id=W5PbuwQFzZx", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "2;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "70;53;244", "wc_summary_review": "21;45;155", "wc_main_review": "516;297;898", "wc_review": "607;395;1297", "wc_reply_reviewers": "443;0;418", "wc_reply_authors": "2594;785;1406", "reply_reviewers": "2;0;1", "reply_authors": "5;2;3", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 122.33333333333333, 86.31080787222163 ], "wc_summary_review_avg": [ 73.66666666666667, 58.33999961909115 ], "wc_main_review_avg": [ 570.3333333333334, 248.34697949083701 ], "wc_review_avg": [ 766.3333333333334, 385.08988849646806 ], "wc_reply_reviewers_avg": [ 287.0, 203.19612857204407 ], "wc_reply_authors_avg": [ 1595.0, 750.5158226180179 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:06_RaLx_O7kJ:scholar.google.com/&scioq=Locality-Based+Mini+Batching+for+Graph+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Technical University of Munich;Technische Universit\u00e4t M\u00fcnchen", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.tum.de", "aff_unique_abbr": "TUM;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "W6BpshgRi0q", "title": "Ask2Mask: Guided Data Selection for Masked Speech Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Masked speech modeling (MSM) methods such as wav2vec2 or w2v-BERT learn representations over speech frames which are randomly masked within an utterance.
While these methods improve the performance of Automatic Speech Recognition (ASR) systems, they have one major limitation. They treat all unsupervised speech samples with equal weight, which hinders learning as not all samples have relevant information to learn meaningful representations. In this work, we address this limitation. We propose ask2mask (ATM), a novel approach to focus on specific samples during MSM pre-training. ATM employs an external ASR model or \textit{scorer} to weight unsupervised input samples in two different ways: 1) A fine-grained data selection is performed by masking over the highly confident input frames as chosen by the scorer. This allows the model to learn meaningful representations. 2) ATM is further extended to focus at the utterance level by weighting the final MSM loss with the utterance-level confidence score. We conduct fine-tuning experiments on two well-benchmarked corpora: LibriSpeech (matching the pre-training data) and AMI (not matching the pre-training data). The results substantiate the efficacy of ATM in significantly improving the recognition performance under mismatched conditions (up to 11.6\% relative) while still yielding modest improvements under matched conditions.", "keywords": "Masked speech modeling (MSM);Data selection;Self-supervision;ASR;Speech recognition", "primary_area": "", "supplementary_material": "", "author": "Murali Karthick Baskar;Andrew Rosenberg;Bhuvana Ramabhadran;Yu Zhang;Pedro Moreno", "authorids": "~Murali_Karthick_Baskar1;~Andrew_Rosenberg1;~Bhuvana_Ramabhadran2;~Yu_Zhang2;pedro@google.com", "gender": ";M;;M;", "homepage": "https://github.com/creatorscan/Resume-and-publications;;https://research.google/people/BhuvanaRamabhadran/;;", "dblp": ";21/6080;39/1849;50/671-33;", "google_scholar": "https://scholar.google.cz/citations?user=VqT0OL8AAAAJ;40bq19cAAAAJ;jecEO0EAAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Murali_Karthick_Baskar1;~Andrew_Rosenberg1;~Bhuvana_Ramabhadran2;~Yu_Zhang2;pedro@google.com", "aff": "Research, Google;Google;Google;Google;", "aff_domain": "research.google.com;google.com;google.com;google.com;", "position": "Researcher;Research Scientist;Senior Research Manager;Research Scientist;", "bibtex": "@misc{\nbaskar2022askmask,\ntitle={Ask2Mask: Guided Data Selection for Masked Speech Modeling},\nauthor={Murali Karthick Baskar and Andrew Rosenberg and Bhuvana Ramabhadran and Yu Zhang and Pedro Moreno},\nyear={2022},\nurl={https://openreview.net/forum?id=W6BpshgRi0q}\n}", "github": "", "project": "", "reviewers": "UPo9;EBBL;YoKo;fRrU", "site": "https://openreview.net/forum?id=W6BpshgRi0q", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;5;4;5", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "58;100;133;33", "wc_summary_review": "28;130;90;31", "wc_main_review": "260;186;332;370", "wc_review": "346;416;555;434", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "772;754;768;829", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.0, 38.399218742052554 ], "wc_summary_review_avg": [ 69.75, 42.67537346058028 ], "wc_main_review_avg": [ 287.0, 70.43436661176133 ], "wc_review_avg": [ 437.75, 75.25415270933559 ], "wc_reply_reviewers_avg": [ 0, 0 ],
"wc_reply_authors_avg": [ 780.75, 28.647643882176418 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9812016131456853330&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "W6lWkLqOss", "title": "Class-Weighted Evaluation Metrics for Imbalanced Data Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Class distribution skews in imbalanced datasets may lead to models with prediction bias towards majority classes, making fair assessment of classifiers a challenging task. Metrics such as Balanced Accuracy are commonly used to evaluate a classifier\u2019s prediction performance under such scenarios. However, these metrics fall short when classes vary in importance. In this paper, we propose a simple and general-purpose evaluation framework for imbalanced data classification that is sensitive to arbitrary skews in class cardinalities and importances. Experiments with several state-of-the-art classifiers tested on real-world datasets from three different domains show the effectiveness of our framework \u2013 not only in evaluating and ranking classifiers, but also training them.", "keywords": "Imbalanced data classification;Evaluation metrics;Log parsing;Sentiment analysis;URL classification", "primary_area": "", "supplementary_material": "/attachment/c00820ecec5ac60b0ac3da246fe51298e2d214b1.zip", "author": "Min Du;Nesime Tatbul;Brian Rivers;Akhilesh Kumar Gupta;Lucas Hu;Wei Wang;Ryan Marcus;Shengtian Zhou;Insup Lee;Justin Gottschlich", "authorids": "~Min_Du1;~Nesime_Tatbul1;brian.j.rivers@intel.com;~Akhilesh_Kumar_Gupta1;lucashu1998@gmail.com;wewang@paloaltonetworks.com;~Ryan_Marcus1;~Shengtian_Zhou1;~Insup_Lee1;~Justin_Gottschlich1", "gender": ";;;M;;;M;M;;", "homepage": ";https://people.csail.mit.edu/tatbul/;;;;;https://rmarcus.info;;https://www.cis.upenn.edu/~lee/;", "dblp": ";t/NesimeTatbul;;;;;https://dblp.uni-trier.de/pid/175/1473.html;;l/InsupLee.html;", "google_scholar": ";YlsHgYQAAAAJ;;;;;vPOl-IwAAAAJ;2z2FiKAAAAAJ;qPlUgrgAAAAJ;", "orcid": ";0000-0002-0416-7022;;;;;0000-0002-1279-1124;;0000-0003-2672-1132;", "linkedin": ";nesime-tatbul-0724964;;akhilesh-gupta-embedded/;;;;shengtian-zhou/;;", "or_profile": "~Min_Du1;~Nesime_Tatbul1;brian.j.rivers@intel.com;~Akhilesh_Kumar_Gupta1;lucashu1998@gmail.com;wewang@paloaltonetworks.com;~Ryan_Marcus1;~Shengtian_Zhou1;~Insup_Lee1;~Justin_Gottschlich1", "aff": ";Massachusetts Institute of Technology;;;;;;Intel;University of Pennsylvania;", "aff_domain": ";mit.edu;;;;;;intel.com;upenn.edu;", "position": ";Sr. 
Research Scientist;;;;;;Researcher;Full Professor;", "bibtex": "@misc{\ndu2022classweighted,\ntitle={Class-Weighted Evaluation Metrics for Imbalanced Data Classification},\nauthor={Min Du and Nesime Tatbul and Brian Rivers and Akhilesh Kumar Gupta and Lucas Hu and Wei Wang and Ryan Marcus and Shengtian Zhou and Insup Lee and Justin Gottschlich},\nyear={2022},\nurl={https://openreview.net/forum?id=W6lWkLqOss}\n}", "github": "", "project": "", "reviewers": "QJaY;WMHg;ircg;xLmW", "site": "https://openreview.net/forum?id=W6lWkLqOss", "pdf_size": 0, "recommendation": "1;3;6;6", "confidence": "5;5;4;3", "correctness": "2;4;3;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "123;63;71;94", "wc_summary_review": "113;25;268;23", "wc_main_review": "415;219;268;78", "wc_review": "651;307;607;195", "wc_reply_reviewers": "232;18;0;0", "wc_reply_authors": "320;248;308;69", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.0, 2.1213203435596424 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 87.75, 23.31710745354149 ], "wc_summary_review_avg": [ 107.25, 99.670394300414 ], "wc_main_review_avg": [ 245.0, 120.409717215846 ], "wc_review_avg": [ 440.0, 193.72919243108407 ], "wc_reply_reviewers_avg": [ 62.5, 98.13638468987942 ], "wc_reply_authors_avg": [ 236.25, 100.34035828120209 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.8528028654224419, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16914361965458291751&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Intel;University of Pennsylvania", "aff_unique_dep": ";Intel Corporation;", "aff_unique_url": "https://web.mit.edu;https://www.intel.com;https://www.upenn.edu", "aff_unique_abbr": "MIT;Intel;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "How to Robustify Black-Box ML Models? A Zeroth-Order Optimization Perspective", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6612", "id": "W9G_ImpHlQd", "poster": "", "openreview": "https://openreview.net/forum?id=W9G_ImpHlQd", "slides": "https://iclr.cc/virtual/2022/poster/6612", "video": "https://iclr.cc/virtual/2022/poster/6612", "author_site": "Yimeng Zhang, Yuguang Yao, Jinghan Jia, Jinfeng Yi, Mingyi Hong, Shiyu Chang, Sijia Liu", "tldr": "", "abstract": "The lack of adversarial robustness has been recognized as an important issue for state-of-the-art machine learning (ML) models, e.g., deep neural networks (DNNs). Therefore, robustifying ML models against adversarial attacks is now a major focus of research. However, nearly all existing defense methods, particularly for robust training, make the white-box assumption that the defender has access to the details of an ML model (or its surrogate alternatives if available), e.g., its architectures and parameters.
Beyond existing works, in this paper we aim to address the problem of black-box defense: How to robustify a black-box model using just input queries and output feedback? Such a problem arises in practical scenarios, where the owner of the predictive model is reluctant to share model information in order to preserve privacy. To this end, we propose a general notion of defensive operation that can be applied to black-box models, and design it through the lens of denoised smoothing (DS), a first-order (FO) certified defense technique. To allow a design that relies only on model queries, we further integrate DS with zeroth-order (gradient-free) optimization. However, a direct implementation of zeroth-order (ZO) optimization suffers from a high variance of gradient estimates, and thus leads to an ineffective defense. To tackle this problem, we next propose to prepend an autoencoder (AE) to a given (black-box) model so that DS can be trained using variance-reduced ZO optimization. We term the eventual defense ZO-AE-DS. In practice, we empirically show that ZO-AE-DS can achieve improved accuracy, certified robustness, and query complexity over existing baselines. The effectiveness of our approach is justified on both image classification and image reconstruction tasks.", "keywords": "Zeroth-Order Optimization;Black-Box Defense;Gradient-Free;Adversarial Robustness;Certified Defense", "primary_area": "", "supplementary_material": "", "author": "Yimeng Zhang;Yuguang Yao;Jinghan Jia;Jinfeng Yi;Mingyi Hong;Shiyu Chang;Sijia Liu", "authorids": "~Yimeng_Zhang2;~Yuguang_Yao1;~Jinghan_Jia1;~Jinfeng_Yi1;~Mingyi_Hong1;~Shiyu_Chang2;~Sijia_Liu1", "gender": "M;M;M;M;M;Unspecified;M", "homepage": "https://damon-demon.github.io;https://www.cse.msu.edu/~yaoyugua/;https://jinghanjia.netlify.app/;http://jinfengyi.net/;http://people.ece.umn.edu/~mhong/mingyi.html;http://people.csail.mit.edu/chang87/;https://lsjxjtu.github.io/", "dblp": ";238/9467;286/5392;117/4898;57/8053;28/9988;128/6972-1", "google_scholar": "https://scholar.google.com/citations?hl=en;-chIdAkAAAAJ;bqP_zxYAAAAJ;lZxRZ84AAAAJ;qRnP-p0AAAAJ;r21asW4AAAAJ;C7dO_UgAAAAJ", "orcid": "0000-0003-1608-2541;;;;;;", "linkedin": ";tonyyaomsu/;jinghan-jia-5194451ba/;https://www.linkedin.com/nhome/?trk=;;;", "or_profile": "~Yimeng_Zhang2;~Yuguang_Yao1;~Jinghan_Jia1;~Jinfeng_Yi1;~Mingyi_Hong1;~Shiyu_Chang2;~Sijia_Liu1", "aff": "Intel;Michigan State University;Michigan State University;JD AI Research;University of Minnesota, Minneapolis;University of California, Santa Barbara;Michigan State University", "aff_domain": "intel.com;msu.edu;msu.edu;jd.com;umn.edu;ucsb.edu;msu.edu", "position": "Intern;PhD student;PhD student;Senior Director;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2022how,\ntitle={How to Robustify Black-Box {ML} Models? 
A Zeroth-Order Optimization Perspective},\nauthor={Yimeng Zhang and Yuguang Yao and Jinghan Jia and Jinfeng Yi and Mingyi Hong and Shiyu Chang and Sijia Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=W9G_ImpHlQd}\n}", "github": "", "project": "", "reviewers": "RU4Z;ATPZ;ULvk;Vo8C", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;5;4", "correctness": "3;4;3;3", "technical_novelty": "3;4;4;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "42;84;65;100", "wc_summary_review": "22;68;87;64", "wc_main_review": "159;166;155;379", "wc_review": "223;318;307;543", "wc_reply_reviewers": "16;0;48;150", "wc_reply_authors": "463;608;608;1142", "reply_reviewers": "1;0;1;2", "reply_authors": "1;2;2;4", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.75, 21.649191670822262 ], "wc_summary_review_avg": [ 60.25, 23.731571797923543 ], "wc_main_review_avg": [ 214.75, 94.91147190935351 ], "wc_review_avg": [ 347.75, 118.56511923833249 ], "wc_reply_reviewers_avg": [ 53.5, 58.33309523760933 ], "wc_reply_authors_avg": [ 705.25, 259.0129099099116 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8309073291494301716&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=W9G_ImpHlQd", "email": "intel.com;msu.edu;msu.edu;jd.com;umn.edu;ucsb.edu;msu.edu", "author_num": 7, "aff_unique_index": "0;1;1;2;3;4;1", "aff_unique_norm": "Intel;Michigan State University;JD;University of Minnesota;University of California, Santa Barbara", "aff_unique_dep": "Intel Corporation;;JD AI Research;;", "aff_unique_url": "https://www.intel.com;https://www.msu.edu;https://www.jd.com;https://www.minnesota.edu;https://www.ucsb.edu", "aff_unique_abbr": "Intel;MSU;JD AI;UMN;UCSB", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Minneapolis;Santa Barbara", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Sparse Communication via Mixed Distributions", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5895", "id": "WAid50QschI", "poster": "", "openreview": "https://openreview.net/forum?id=WAid50QschI", "slides": "https://iclr.cc/virtual/2022/poster/5895", "video": "https://iclr.cc/virtual/2022/poster/5895", "author_site": "Ant\u00f3nio Farinhas, Wilker Aziz, Vlad Niculae, Andre Martins", "tldr": "", "abstract": "Neural networks and other machine learning models compute continuous representations, while humans communicate mostly through discrete symbols. Reconciling these two forms of communication is desirable for generating human-readable interpretations or learning discrete latent variable models, while maintaining end-to-end differentiability. Some existing approaches (such as the Gumbel-Softmax transformation) build continuous relaxations that are discrete approximations in the zero-temperature limit, while others (such as sparsemax transformations and the Hard Concrete distribution) produce discrete/continuous hybrids. 
In this paper, we build rigorous theoretical foundations for these hybrids, which we call \"mixed random variables\". Our starting point is a new \"direct sum\" base measure defined on the face lattice of the probability simplex. From this measure, we introduce new entropy and Kullback-Leibler divergence functions that subsume the discrete and differential cases and have interpretations in terms of code optimality. Our framework suggests two strategies for representing and sampling mixed random variables, an extrinsic one (\"sample-and-project\") and an intrinsic one (based on face stratification). We experiment with both approaches on an emergent communication benchmark and on modeling MNIST and Fashion-MNIST data with variational auto-encoders with mixed latent variables.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/92722c31871512945eb4d2c789e94cd2ba73b3b0.zip", "author": "Ant\u00f3nio Farinhas;Wilker Aziz;Vlad Niculae;Andre Martins", "authorids": "~Ant\u00f3nio_Farinhas1;~Wilker_Aziz1;~Vlad_Niculae2;~Andre_Martins1", "gender": "M;M;M;M", "homepage": ";http://wilkeraziz.github.io;https://vene.ro;https://andre-martins.github.io/", "dblp": "267/5345;51/10489;40/10489;m/AndreFTMartins", "google_scholar": "yK5wIPkAAAAJ;phgBJXYAAAAJ;7_3UAgQAAAAJ;https://scholar.google.pt/citations?user=mT7ppvwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ant\u00f3nio_Farinhas1;~Wilker_Aziz1;~Vlad_Niculae2;~Andre_Martins1", "aff": "Instituto Superior T\u00e9cnico;University of Amsterdam;University of Amsterdam;Unbabel", "aff_domain": "tecnico.ulisboa.pt;uva.nl;uva.nl;unbabel.com", "position": "PhD student;Assistant Professor;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nfarinhas2022sparse,\ntitle={Sparse Communication via Mixed Distributions},\nauthor={Ant{\\'o}nio Farinhas and Wilker Aziz and Vlad Niculae and Andre Martins},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WAid50QschI}\n}", "github": "", "project": "", "reviewers": "ASbt;F3KA;ePyA;B5yQ", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "2;3;3;5", "correctness": "2;4;4;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "33;89;67;111", "wc_summary_review": "63;54;30;60", "wc_main_review": "106;155;87;364", "wc_review": "202;298;184;535", "wc_reply_reviewers": "108;0;0;0", "wc_reply_authors": "724;50;156;187", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 75.0, 28.809720581775867 ], "wc_summary_review_avg": [ 51.75, 12.968712349342937 ], "wc_main_review_avg": [ 178.0, 110.21569761154714 ], "wc_review_avg": [ 304.75, 139.82019703891137 ], "wc_reply_reviewers_avg": [ 27.0, 46.76537180435969 ], "wc_reply_authors_avg": [ 279.25, 261.75310408856666 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9090566515327405784&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=WAid50QschI", "email": 
"tecnico.ulisboa.pt;uva.nl;uva.nl;unbabel.com", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Instituto Superior T\u00e9cnico;University of Amsterdam;Unbabel", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ist.utl.pt;https://www.uva.nl;https://www.unbabel.com", "aff_unique_abbr": "IST;UvA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Portugal;Netherlands" }, { "id": "WDBo7y8lcJm", "title": "Teacher's pet: understanding and mitigating biases in distillation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Knowledge distillation is widely used as a means of improving the performance of a relatively simple \u201cstudent\u201d model using the predictions from a complex \u201cteacher\u201d model. Several works have shown that distillation significantly boosts the student\u2019s overall performance; however, are these gains uniform across all data sub-groups? In this paper, we show that distillation can harm performance on certain subgroups, e.g., classes with few associated samples, compared to the vanilla student trained using the one-hot labels. We trace this behavior to errors made by the teacher distribution being transferred to and amplified by the student model. To mitigate this problem, we present techniques which soften the teacher influence for subgroups where it is less reliable. Experiments on several image classification benchmarks show that these modifications of distillation maintain boost in overall accuracy, while additionally ensuring improvement in subgroup performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michal Lukasik;Srinadh Bhojanapalli;Aditya Krishna Menon;Sanjiv Kumar", "authorids": "~Michal_Lukasik1;~Srinadh_Bhojanapalli1;~Aditya_Krishna_Menon1;~Sanjiv_Kumar1", "gender": ";M;;M", "homepage": "https://mlukasik.github.io/;https://bsrinadh.github.io/;http://www.sanjivk.com/;https://akmenon.github.io/", "dblp": "72/11338;131/6700;;89/3514", "google_scholar": "https://scholar.google.co.uk/citations?user=cLZLZCQAAAAJ;bpSF_9EAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Michal_Lukasik1;~Srinadh_Bhojanapalli1;~Sanjiv_Kumar1;~Aditya_Menon1", "aff": "Google Research;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com", "position": "Research Scientist;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nlukasik2022teachers,\ntitle={Teacher's pet: understanding and mitigating biases in distillation},\nauthor={Michal Lukasik and Srinadh Bhojanapalli and Aditya Krishna Menon and Sanjiv Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=WDBo7y8lcJm}\n}", "github": "", "project": "", "reviewers": "QhBi;t1Uw;bvA3;pMdP", "site": "https://openreview.net/forum?id=WDBo7y8lcJm", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;5;4;5", "correctness": "3;4;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "77;171;109;34", "wc_summary_review": "33;43;65;99", "wc_main_review": "208;787;213;255", "wc_review": "318;1001;387;388", "wc_reply_reviewers": "0;458;43;135", "wc_reply_authors": "532;513;398;204", "reply_reviewers": "0;2;1;1", "reply_authors": "1;3;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": 
[ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 97.75, 49.96686402006834 ], "wc_summary_review_avg": [ 60.0, 25.317977802344327 ], "wc_main_review_avg": [ 365.75, 243.89277869588514 ], "wc_review_avg": [ 523.5, 277.1412094943659 ], "wc_reply_reviewers_avg": [ 159.0, 179.38366703799986 ], "wc_reply_authors_avg": [ 411.75, 130.44227650574027 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9486543015968022500&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Program to Build E(N)-Equivariant Steerable CNNs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6098", "id": "WE4qe9xlnQw", "poster": "", "openreview": "https://openreview.net/forum?id=WE4qe9xlnQw", "slides": "https://iclr.cc/virtual/2022/poster/6098", "video": "https://iclr.cc/virtual/2022/poster/6098", "author_site": "Gabriele Cesa, Leon Lang, Maurice Weiler", "tldr": "", "abstract": "Equivariance is becoming an increasingly popular design choice to build data efficient neural networks by exploiting prior knowledge about the symmetries of the problem at hand. Euclidean steerable CNNs are one of the most common classes of equivariant networks. While the constraints these architectures need to satisfy are understood, existing approaches are tailored to specific (classes of) groups. No generally applicable method that is practical for implementation has been described so far. In this work, we generalize the Wigner-Eckart theorem proposed in Lang & Weiler (2020), which characterizes general $G$-steerable kernel spaces for compact groups $G$ over their homogeneous spaces, to arbitrary $G$-spaces. This enables us not only to directly parameterize filters in terms of a band-limited basis on the whole space rather than on $G$'s orbits, but also to easily implement steerable CNNs equivariant to a large number of groups. To demonstrate its generality, we instantiate our method on a variety of isometry groups acting on the Euclidean space $\\mathbb{R}^3$. Our framework allows us to build $E(3)$ and $SE(3)$-steerable CNNs like previous works, but also CNNs with arbitrary $G\\leq O(3)$-steerable kernels. For example, we build 3D CNNs equivariant to the symmetries of platonic solids or choose $G=SO(2)$ when working with 3D data having only azimuthal symmetries. 
We compare these models on 3D shapes and molecular datasets, observing improved performance by matching the model's symmetries to the ones of the data.", "keywords": "equivariance;3D;geometric deep learning;isometries;steerable CNN", "primary_area": "", "supplementary_material": "/attachment/8e59589617d819e70c68a8002c689047b6750a52.zip", "author": "Gabriele Cesa;Leon Lang;Maurice Weiler", "authorids": "~Gabriele_Cesa1;~Leon_Lang1;~Maurice_Weiler1", "gender": "M;M;", "homepage": "https://github.com/Gabri95;https://langleon.github.io/;https://maurice-weiler.gitlab.io/", "dblp": "254/1536;255/5021;210/0855", "google_scholar": "hTplhaMAAAAJ;E3ae_sMAAAAJ;uQePx6EAAAAJ", "orcid": ";0000-0002-1950-2831;", "linkedin": ";leon-lang/;maurice-weiler-78b6931a6/", "or_profile": "~Gabriele_Cesa1;~Leon_Lang1;~Maurice_Weiler1", "aff": "Qualcomm Inc, QualComm;University of Amsterdam;University of Amsterdam", "aff_domain": "qti.qualcomm.com;uva.nl;uva.nl", "position": "Researcher;PhD student;PhD student", "bibtex": "@inproceedings{\ncesa2022a,\ntitle={A Program to Build E(N)-Equivariant Steerable {CNN}s },\nauthor={Gabriele Cesa and Leon Lang and Maurice Weiler},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WE4qe9xlnQw}\n}", "github": "", "project": "", "reviewers": "3dsB;RMPC;onv5;oCia", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "2;3;3;2", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "18;87;81;73", "wc_summary_review": "38;141;194;49", "wc_main_review": "189;609;492;256", "wc_review": "245;837;767;378", "wc_reply_reviewers": "0;0;108;68", "wc_reply_authors": "145;403;597;498", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.75, 27.444261695297982 ], "wc_summary_review_avg": [ 105.5, 64.88643926122006 ], "wc_main_review_avg": [ 386.5, 170.78714822843082 ], "wc_review_avg": [ 556.75, 250.94060552250207 ], "wc_reply_reviewers_avg": [ 44.0, 46.216880033165374 ], "wc_reply_authors_avg": [ 410.75, 168.0660212535538 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12842661194453189795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=WE4qe9xlnQw", "email": "qti.qualcomm.com;uva.nl;uva.nl", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Qualcomm Incorporated;University of Amsterdam", "aff_unique_dep": ";", "aff_unique_url": "https://www.qualcomm.com;https://www.uva.nl", "aff_unique_abbr": "Qualcomm;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Netherlands" }, { "id": "WGhT5zCamoC", "title": "Seq2Tok: Deep Sequence Tokenizer for Retrieval", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Search over sequences is a fundamental problem. Very efficient solutions exist for text sequences, which are made up of discrete tokens chosen from a finite alphabet. 
Sequences, such as audio, video or sensor readings, are made up of continuous-valued samples with a large sampling rate, making similarity search inefficient. This paper proposes Seq2Tok, a deep sequence tokenizer that converts continuous-valued sequences to discrete tokens that are easier to retrieve via sequence queries. The only information available for training Seq2Tok is pairs of similar sequences, i.e., depending on how we form the pairs, the similarity semantics are learnt. Seq2Tok compresses the query and target sequences into short sequences of tokens that are faster to match. Experiments show consistent performance of Seq2Tok across various audio retrieval tasks, namely, music search (query by humming) and speech keyword search via audio query.", "keywords": "sequence representation learning;audio search;music retrieval", "primary_area": "", "supplementary_material": "", "author": "Adhiraj Banerjee;Vipul Arora", "authorids": "adhiraj@iitk.ac.in;~Vipul_Arora1", "gender": ";M", "homepage": ";https://home.iitk.ac.in/~vipular", "dblp": ";", "google_scholar": ";https://scholar.google.co.in/citations?user=SC9YYPAAAAAJ", "orcid": ";0000-0002-1207-1258", "linkedin": ";", "or_profile": "adhiraj@iitk.ac.in;~Vipul_Arora1", "aff": ";IIT Kanpur", "aff_domain": ";iitk.ac.in", "position": ";Associate Professor", "bibtex": "@misc{\nbanerjee2022seqtok,\ntitle={Seq2Tok: Deep Sequence Tokenizer for Retrieval},\nauthor={Adhiraj Banerjee and Vipul Arora},\nyear={2022},\nurl={https://openreview.net/forum?id=WGhT5zCamoC}\n}", "github": "", "project": "", "reviewers": "cfU5;37Kx;BBHm;dyUG", "site": "https://openreview.net/forum?id=WGhT5zCamoC", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;3;3;3", "correctness": "2;3;2;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "48;106;62;37", "wc_summary_review": "49;66;45;17", "wc_main_review": "161;327;403;432", "wc_review": "258;499;510;486", "wc_reply_reviewers": "52;0;67;0", "wc_reply_authors": "352;753;452;790", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.25, 26.223796445213647 ], "wc_summary_review_avg": [ 44.25, 17.597940220378067 ], "wc_main_review_avg": [ 330.75, 105.23871673485951 ], "wc_review_avg": [ 438.25, 104.41354078853949 ], "wc_reply_reviewers_avg": [ 29.75, 30.21899237234756 ], "wc_reply_authors_avg": [ 586.75, 188.5568548210327 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RJ9xDp5YIzAJ:scholar.google.com/&scioq=Seq2Tok:+Deep+Sequence+Tokenizer+for+Retrieval&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Indian Institute of Technology Kanpur", "aff_unique_dep": "", "aff_unique_url": "https://www.iitk.ac.in", "aff_unique_abbr": "IITK", "aff_campus_unique_index": "0", "aff_campus_unique": "Kanpur", "aff_country_unique_index": "0", "aff_country_unique": "India" }, { "title": "Learning Prototype-oriented Set Representations for Meta-Learning", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6720", "id": "WH6u2SvlLp4", "poster": "", "openreview": "https://openreview.net/forum?id=WH6u2SvlLp4", "slides": "https://iclr.cc/virtual/2022/poster/6720", "video": "https://iclr.cc/virtual/2022/poster/6720", "author_site": "Dandan Guo, Long Tian, Minghe Zhang, Mingyuan Zhou, Hongyuan Zha", "tldr": "", "abstract": "Learning from set-structured data is a fundamental problem that has recently attracted increasing attention, where a series of summary networks are introduced to deal with the set input. In fact, many meta-learning problems can be treated as set-input tasks. Most existing summary networks aim to design different architectures for the input set in order to enforce permutation invariance. However, scant attention has been paid to the common cases where different sets in a meta distribution are closely related and share certain statistical properties. Viewing each set as a distribution over a set of global prototypes, this paper provides a novel prototype-oriented optimal transport (POT) framework to improve existing summary networks. To learn the distribution over the global prototypes, we minimize its regularized optimal transport distance to the set empirical distribution over data points, providing a natural unsupervised way to improve the summary network. Since our plug-and-play framework can be applied to many meta learning problems, we further instantiate it to the cases of few-shot classification and implicit meta generative modeling. Extensive experiments demonstrate that our framework significantly improves the existing summary networks on learning more powerful summary statistics from sets and can be successfully integrated into metric-based few-shot classification and generative modeling applications, providing a promising tool for addressing set-input and meta-learning problems.", "keywords": "Summary Networks;Distribution Matching;Optimal Transport;Few-shot Classification;Meta Generative Models", "primary_area": "", "supplementary_material": "/attachment/89608051254b08a2a2a6f0efa2b547ff278996aa.zip", "author": "Dan dan Guo;Long Tian;Minghe Zhang;Mingyuan Zhou;Hongyuan Zha", "authorids": "~Dan_dan_Guo1;tianlong_xidian@163.com;~Minghe_Zhang1;~Mingyuan_Zhou1;zhahy@cuhksz.edu.cn", "gender": "F;;M;M;", "homepage": "https://github.com/Dan123dan;;https://minghe0zhang.github.io/;http://mingyuanzhou.github.io;", "dblp": "121/1618;;;;", "google_scholar": "https://scholar.google.com.hk/citations?user=QLOY4JkAAAAJ;;;LXwCIisAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Dan_dan_Guo1;tianlong_xidian@163.com;~Minghe_Zhang1;~Mingyuan_Zhou1;zhahy@cuhksz.edu.cn", "aff": "The Chinese University of Hong Kong(ShenZhen);;Georgia Institute of Technology;The University of Texas at Austin;", "aff_domain": "cuhk.edu.hk;;gatech.edu;utexas.edu;", "position": "Postdoc;;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nguo2022learning,\ntitle={Learning Prototype-oriented Set Representations for Meta-Learning },\nauthor={Dan dan Guo and Long Tian and Minghe Zhang and Mingyuan Zhou and Hongyuan Zha},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WH6u2SvlLp4}\n}", "github": "", "project": "", "reviewers": "AAZ1;tuTq;TfNJ;Nrfc", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;2;3;3", "correctness": "4;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "40;75;81;95", "wc_summary_review": "24;28;90;91", 
"wc_main_review": "159;298;107;711", "wc_review": "223;401;278;897", "wc_reply_reviewers": "0;19;0;0", "wc_reply_authors": "606;1231;333;553", "reply_reviewers": "0;1;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.75, 20.253086184579377 ], "wc_summary_review_avg": [ 58.25, 32.2829289253624 ], "wc_main_review_avg": [ 318.75, 236.98562720131363 ], "wc_review_avg": [ 449.75, 266.14035300946006 ], "wc_reply_reviewers_avg": [ 4.75, 8.227241335952167 ], "wc_reply_authors_avg": [ 680.75, 333.7711603778853 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5362026226688886341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=WH6u2SvlLp4", "email": "cuhk.edu.hk;;gatech.edu;utexas.edu;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Chinese University of Hong Kong;Georgia Institute of Technology;University of Texas at Austin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.gatech.edu;https://www.utexas.edu", "aff_unique_abbr": "CUHK;Georgia Tech;UT Austin", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Shenzhen;;Austin", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Federated Learning from Only Unlabeled Data with Class-conditional-sharing Clients", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6108", "id": "WHA8009laxu", "poster": "", "openreview": "https://openreview.net/forum?id=WHA8009laxu", "slides": "https://iclr.cc/virtual/2022/poster/6108", "video": "https://iclr.cc/virtual/2022/poster/6108", "author_site": "Nan Lu, Zhao Wang, Xiaoxiao Li, Gang Niu, Qi Dou, Masashi Sugiyama", "tldr": "", "abstract": "Supervised federated learning (FL) enables multiple clients to share the trained model without sharing their labeled data. However, potential clients might even be reluctant to label their own data, which could limit the applicability of FL in practice. In this paper, we show the possibility of unsupervised FL whose model is still a classifier for predicting class labels, if the class-prior probabilities are shifted while the class-conditional distributions are shared among the unlabeled data owned by the clients. We propose federation of unsupervised learning (FedUL), where the unlabeled data are transformed into surrogate labeled data for each of the clients, a modified model is trained by supervised FL, and the wanted model is recovered from the modified model. FedUL is a very general solution to unsupervised FL: it is compatible with many supervised FL methods, and the recovery of the wanted model can be theoretically guaranteed as if the data have been labeled. Experiments on benchmark and real-world datasets demonstrate the effectiveness of FedUL. 
Code is available at https://github.com/lunanbit/FedUL.", "keywords": "unsupervised federated learning;unlabeled data;class prior shift", "primary_area": "", "supplementary_material": "", "author": "Nan Lu;Zhao Wang;Xiaoxiao Li;Gang Niu;Qi Dou;Masashi Sugiyama", "authorids": "~Nan_Lu1;~Zhao_Wang3;~Xiaoxiao_Li1;~Gang_Niu1;~Qi_Dou2;~Masashi_Sugiyama1", "gender": "F;M;Unspecified;M;F;M", "homepage": ";http://kyfafyd.wang;https://xxlya.github.io/;https://niug1984.github.io;https://www.cse.cuhk.edu.hk/~qdou;http://www.ms.k.u-tokyo.ac.jp/sugi/", "dblp": ";86/981-6;71/8042;26/3367-1;165/7846;35/1228", "google_scholar": "https://scholar.google.co.jp/citations?user=KQUQlG4AAAAJ;1kEufdwAAAAJ;sdENOQ4AAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.com.hk/citations?user=iHh7IJQAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ", "orcid": ";;;;0000-0002-3416-9950;0000-0001-6658-6743", "linkedin": ";;;;;", "or_profile": "~Nan_Lu1;~Zhao_Wang3;~Xiaoxiao_Li1;~Gang_Niu1;~Qi_Dou2;~Masashi_Sugiyama1", "aff": "Eberhard-Karls-Universit\u00e4t T\u00fcbingen;The Chinese University of Hong Kong;University of British Columbia;RIKEN;The Chinese University of Hong Kong;The University of Tokyo", "aff_domain": "uni-tuebingen.de;cuhk.edu.hk;ece.ubc.ca;riken.jp;cuhk.edu.hk;u-tokyo.ac.jp", "position": "Postdoc;PhD student;Assistant Professor;Research Scientist (tenured);Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nlu2022federated,\ntitle={Federated Learning from Only Unlabeled Data with Class-conditional-sharing Clients},\nauthor={Nan Lu and Zhao Wang and Xiaoxiao Li and Gang Niu and Qi Dou and Masashi Sugiyama},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WHA8009laxu}\n}", "github": "", "project": "", "reviewers": "U3W9;mco5;4sTZ;SBTk", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "2;3;4;4", "correctness": "3;4;4;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "104;27;108;28", "wc_summary_review": "60;241;16;19", "wc_main_review": "144;48;425;406", "wc_review": "308;316;549;453", "wc_reply_reviewers": "0;0;6;0", "wc_reply_authors": "358;639;698;639", "reply_reviewers": "0;0;1;0", "reply_authors": "5;5;6;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 66.75, 39.27706073524342 ], "wc_summary_review_avg": [ 84.0, 92.29572037749097 ], "wc_main_review_avg": [ 255.75, 163.45393081844193 ], "wc_review_avg": [ 406.5, 100.45023643575958 ], "wc_reply_reviewers_avg": [ 1.5, 2.598076211353316 ], "wc_reply_authors_avg": [ 583.5, 132.4018504402412 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.75, 1.0897247358851685 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10078372194856107683&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=WHA8009laxu", "email": "uni-tuebingen.de;cuhk.edu.hk;ece.ubc.ca;riken.jp;cuhk.edu.hk;u-tokyo.ac.jp", "author_num": 6, "aff_unique_index": "0;1;2;3;1;4", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;Chinese 
University of Hong Kong;University of British Columbia;RIKEN;University of Tokyo", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.cuhk.edu.hk;https://www.ubc.ca;https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Uni T\u00fcbingen;CUHK;UBC;RIKEN;UTokyo", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "T\u00fcbingen;Hong Kong SAR;", "aff_country_unique_index": "0;1;2;3;1;3", "aff_country_unique": "Germany;China;Canada;Japan" }, { "id": "WIJVRV7jnTX", "title": "Calibrated ensembles - a simple way to mitigate ID-OOD accuracy tradeoffs", "track": "main", "status": "Reject", "tldr": "", "abstract": "We often see undesirable tradeoffs in robust machine learning where out-of-distribution (OOD) accuracy is at odds with in-distribution (ID) accuracy. A \u2018robust\u2019 classifier obtained via specialized techniques like removing spurious features has better OOD but worse ID accuracy compared to a \u2018standard\u2019 classifier trained via vanilla ERM. On six distribution shift datasets, we find that simply ensembling the standard and robust models is a strong baseline---we match the ID accuracy of a standard model with only a small drop in OOD accuracy compared to the robust model. However, calibrating these models in-domain surprisingly improves the OOD accuracy of the ensemble and completely eliminates the tradeoff: we achieve the best of both ID and OOD accuracy over the original models.", "keywords": "distribution shift;calibration;ensembles", "primary_area": "", "supplementary_material": "", "author": "Ananya Kumar;Aditi Raghunathan;Tengyu Ma;Percy Liang", "authorids": "~Ananya_Kumar1;~Aditi_Raghunathan1;~Tengyu_Ma1;~Percy_Liang1", "gender": "M;F;M;", "homepage": "https://ananyakumar.wordpress.com/;https://www.cs.cmu.edu/~aditirag/;http://ai.stanford.edu/~tengyuma/;https://cs.stanford.edu/~pliang/", "dblp": "192/0474;166/1409;54/9061;04/1701", "google_scholar": "tP5IBFkAAAAJ;Ch9iRwQAAAAJ;i38QlUwAAAAJ;pouyVyUAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ananya_Kumar1;~Aditi_Raghunathan1;~Tengyu_Ma1;~Percy_Liang1", "aff": "Microsoft;Carnegie Mellon University;Facebook AI Research;Stanford University", "aff_domain": "microsoft.com;cmu.edu;fb.com;stanford.edu", "position": "Intern;Assistant Professor;Visiting Scientist;Associate Professor", "bibtex": "@misc{\nkumar2022calibrated,\ntitle={Calibrated ensembles - a simple way to mitigate {ID}-{OOD} accuracy tradeoffs},\nauthor={Ananya Kumar and Aditi Raghunathan and Tengyu Ma and Percy Liang},\nyear={2022},\nurl={https://openreview.net/forum?id=WIJVRV7jnTX}\n}", "github": "", "project": "", "reviewers": "EkrF;u3S8;7Ez2;VdiR", "site": "https://openreview.net/forum?id=WIJVRV7jnTX", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "72;171;48;105", "wc_summary_review": "80;34;96;60", "wc_main_review": "350;106;528;178", "wc_review": "502;311;672;343", "wc_reply_reviewers": "44;0;14;146", "wc_reply_authors": "1551;1053;550;253", "reply_reviewers": "1;0;1;1", "reply_authors": "4;5;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 99.0, 46.233105022267324 ], "wc_summary_review_avg": [ 67.5, 23.16786567640619 ], "wc_main_review_avg": [ 290.5, 
163.2811991626715 ], "wc_review_avg": [ 457.0, 143.6680201019002 ], "wc_reply_reviewers_avg": [ 51.0, 57.105166141076936 ], "wc_reply_authors_avg": [ 851.75, 494.7238497384172 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5173396076145170661&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Microsoft;Carnegie Mellon University;Meta;Stanford University", "aff_unique_dep": "Microsoft Corporation;;Facebook AI Research;", "aff_unique_url": "https://www.microsoft.com;https://www.cmu.edu;https://research.facebook.com;https://www.stanford.edu", "aff_unique_abbr": "Microsoft;CMU;FAIR;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "WKWAkkXGpWN", "title": "Efficient Training and Inference of Hypergraph Reasoning Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the problem of hypergraph reasoning in large domains, e.g., predicting the relationship between several entities based on the input facts. We observe that in logical reasoning, logical rules (e.g., my parent's parent is my grandparent) usually apply locally (e.g., only three people are involved in a grandparent rule), and sparsely (e.g., the grandparent relationship is sparse across all pairs of people in the world). Inspired by these observations, we propose Sparse and Local Neural Logic Machines (SpaLoc), a structured neural network for hypergraph reasoning. To leverage the sparsity in hypergraph neural networks, SpaLoc represents the grounding of relationships such as parent and grandparent as sparse tensors and uses neural networks and finite-domain quantification operations to infer new facts based on the input. We further introduce a sparsification loss to regularize the number of hyperedges in intermediate layers of a SpaLoc model. To enable training on large-scale graphs such as real-world knowledge graphs, SpaLoc performs training- and inference-time sub-sampling of the input graphs. To remedy the information loss in sampled sub-graphs, we propose a novel sampling and label calibration paradigm based on an information-theoretic measure, information sufficiency. 
Our SpaLoc shows superior accuracy and efficiency on synthetic datasets compared with prior art and achieves state-of-the-art performance on several real-world knowledge graph reasoning benchmarks.", "keywords": "Relational Rule Induction;Hypergraph Network;Efficient Learning", "primary_area": "", "supplementary_material": "", "author": "Guangxuan Xiao;Leslie Pack Kaelbling;Jiajun Wu;Jiayuan Mao", "authorids": "~Guangxuan_Xiao1;~Leslie_Pack_Kaelbling1;~Jiajun_Wu1;~Jiayuan_Mao1", "gender": ";F;M;F", "homepage": ";http://people.csail.mit.edu/lpk/;https://jiajunwu.com;http://jiayuanm.com", "dblp": ";k/LesliePackKaelbling;117/4768;200/8283", "google_scholar": ";IcasIiwAAAAJ;2efgcS0AAAAJ;-xaOIZIAAAAJ", "orcid": ";0000-0001-6054-7145;0000-0002-4176-343X;0000-0003-4798-3748", "linkedin": ";;jiajunwu/;", "or_profile": "~Guangxuan_Xiao1;~Leslie_Pack_Kaelbling1;~Jiajun_Wu1;~Jiayuan_Mao1", "aff": ";Massachusetts Institute of Technology;Stanford University;Massachusetts Institute of Technology", "aff_domain": ";mit.edu;stanford.edu;mit.edu", "position": ";Full Professor;Assistant Professor;PhD student", "bibtex": "@misc{\nxiao2022efficient,\ntitle={Efficient Training and Inference of Hypergraph Reasoning Networks},\nauthor={Guangxuan Xiao and Leslie Pack Kaelbling and Jiajun Wu and Jiayuan Mao},\nyear={2022},\nurl={https://openreview.net/forum?id=WKWAkkXGpWN}\n}", "github": "", "project": "", "reviewers": "QNYc;GMNn;emhK;joDw;Rg6r", "site": "https://openreview.net/forum?id=WKWAkkXGpWN", "pdf_size": 0, "recommendation": "3;3;6;6;6", "confidence": "3;4;3;4;3", "correctness": "1;3;3;3;3", "technical_novelty": "3;2;2;3;3", "empirical_novelty": "2;1;2;3;3", "wc_summary_paper": "143;80;198;104;126", "wc_summary_review": "255;26;577;74;62", "wc_main_review": "711;341;65;285;237", "wc_review": "1109;447;840;463;425", "wc_reply_reviewers": "0;115;30;35;0", "wc_reply_authors": "560;622;554;369;337", "reply_reviewers": "0;1;1;1;0", "reply_authors": "1;2;1;1;1", "recommendation_avg": [ 4.8, 1.469693845669907 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 2.6, 0.8000000000000002 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 130.2, 39.96198193283211 ], "wc_summary_review_avg": [ 198.8, 205.1062163855596 ], "wc_main_review_avg": [ 327.8, 212.67665598273825 ], "wc_review_avg": [ 656.8, 273.2591444032569 ], "wc_reply_reviewers_avg": [ 36.0, 42.118879377305376 ], "wc_reply_authors_avg": [ 488.4, 113.54047736380186 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.16666666666666669, "corr_recommendation_correctness": 0.6123724356957944, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YQ_rmiVKBhEJ:scholar.google.com/&scioq=Efficient+Training+and+Inference+of+Hypergraph+Reasoning+Networks&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.stanford.edu", "aff_unique_abbr": "MIT;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Graph Condensation for Graph Neural Networks", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6573", "id": "WLEx3Jo4QaB", "poster": "", "openreview": "https://openreview.net/forum?id=WLEx3Jo4QaB", "slides": "https://iclr.cc/virtual/2022/poster/6573", "video": "https://iclr.cc/virtual/2022/poster/6573", "author_site": "Wei Jin, Lingxiao Zhao, Shichang Zhang, Yozen Liu, Jiliang Tang, Neil Shah", "tldr": "", "abstract": "Given the prevalence of large-scale graphs in real-world applications, the storage and time for training neural models have raised increasing concerns. To alleviate the concerns, we propose and study the problem of graph condensation for graph neural networks (GNNs). Specifically, we aim to condense the large, original graph into a small, synthetic and highly-informative graph, such that GNNs trained on the small graph and large graph have comparable performance. We approach the condensation problem by imitating the GNN training trajectory on the original graph through the optimization of a gradient matching loss and design a strategy to condense node futures and structural information simultaneously. Extensive experiments have demonstrated the effectiveness of the proposed framework in condensing different graph datasets into informative smaller graphs. In particular, we are able to approximate the original test accuracy by 95.3\\% on Reddit, 99.8\\% on Flickr and 99.0\\% on Citeseer, while reducing their graph size by more than 99.9\\%, and the condensed graphs can be used to train various GNN architectures. ", "keywords": "data-efficient learning;graph generation;graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Wei Jin;Lingxiao Zhao;Shichang Zhang;Yozen Liu;Jiliang Tang;Neil Shah", "authorids": "~Wei_Jin4;~Lingxiao_Zhao1;~Shichang_Zhang2;~Yozen_Liu1;~Jiliang_Tang1;~Neil_Shah2", "gender": ";M;M;;M;M", "homepage": "http://www.cs.emory.edu/~wjin30/;http://lingxiaozhao.com/;https://shichangzh.github.io/;https://www.linkedin.com/in/yozen-liu-531a67130/;https://www.cse.msu.edu/~tangjili/;http://nshah.net", "dblp": "66/2173-9;;234/4118;242/8056.html;64/10812;71/7771", "google_scholar": "eWow24EAAAAJ;QKslW6EAAAAJ;TYqG0x4AAAAJ;i3U2JjEAAAAJ;WtzKMWAAAAAJ;Qut69OgAAAAJ", "orcid": ";;0000-0003-0954-5018;;0000-0001-7125-3898;0000-0003-3261-8430", "linkedin": ";;shichang-zhang-4430a4106/;;;", "or_profile": "~Wei_Jin4;~Lingxiao_Zhao1;~Shichang_Zhang2;~Yozen_Liu1;~Jiliang_Tang1;~Neil_Shah2", "aff": "Michigan State University;Carnegie Mellon University;University of California, Los Angeles;Snap Inc.;Michigan State University;Snap Inc.", "aff_domain": "msu.edu;andrew.cmu.edu;cs.ucla.edu;snapchat.com;msu.edu;snap.com", "position": "PhD student;PhD student;PhD student;Researcher;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\njin2022graph,\ntitle={Graph Condensation for Graph Neural Networks},\nauthor={Wei Jin and Lingxiao Zhao and Shichang Zhang and Yozen Liu and Jiliang Tang and Neil Shah},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WLEx3Jo4QaB}\n}", "github": "", "project": "", "reviewers": "XqrK;peGb;R5cV;cTj2", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;4", "correctness": "4;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "4;2;3;3", "wc_summary_paper": "58;175;62;63", "wc_summary_review": "25;73;41;100", "wc_main_review": "261;1146;204;265", "wc_review": "344;1394;307;428", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1105;2325;1100;1135", "reply_reviewers": "0;0;0;0", 
"reply_authors": "2;3;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 89.5, 49.39888662712956 ], "wc_summary_review_avg": [ 59.75, 28.960101864461734 ], "wc_main_review_avg": [ 469.0, 391.6101377645885 ], "wc_review_avg": [ 618.25, 450.0202078796018 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1416.25, 524.8377725545295 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -1.0, "gs_citation": 206, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14491892748486687067&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=WLEx3Jo4QaB", "email": "msu.edu;andrew.cmu.edu;cs.ucla.edu;snapchat.com;msu.edu;snap.com", "author_num": 6, "aff_unique_index": "0;1;2;3;0;3", "aff_unique_norm": "Michigan State University;Carnegie Mellon University;University of California, Los Angeles;Snap Inc.", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.msu.edu;https://www.cmu.edu;https://www.ucla.edu;https://www.snapinc.com", "aff_unique_abbr": "MSU;CMU;UCLA;Snap", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "WLZ_2JjCz2a", "title": "Sparse Unbalanced GAN Training with In-Time Over-Parameterization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Generative adversarial networks (GANs) have received an upsurging interest since being proposed due to the high quality of the generated data. While GANs achieving increasingly impressive results, the resource demands associated with the large model size hinders its usage in resource-limited scenarios. For inference, the existing model compression techniques can reduce the model complexity with comparable performance. However, the training efficiency of GANs has less be explored due to the fragile training process of GANs. In this paper, we for the first time explore the possibility of directly training sparse GAN from scratch without involving any dense or pre-training steps. Even more unconventionally, our proposed method enables training sparse unbalanced GANs with an extremely sparse generator in an end-to-end way, chasing high training and inference efficiency gains. Instead of training full GANs, we start by training a sparse subnetwork and periodically explore the sparse connectivity during training, while maintaining a fixed parameter count. Extensive experiments with modern GAN architectures validate the efficiency of our method. Our sparsified GANs, trained from scratch in one single run, outperform the ones learned by expensive iterative pruning and retraining. Perhaps most importantly, we find instead of inheriting parameters from expensive pre-trained GANs, directly training sparse GANs from scratch can be a much more efficient solution. For example, only training with a 80% sparse generator and a 50% sparse discriminator, our method can achieve even better performance than the dense BigGAN. 
", "keywords": "sparse unbalance GAN training;GAN training;dynamic sparse training;sparse training;bigGAN", "primary_area": "", "supplementary_material": "", "author": "Shiwei Liu;Yuesong Tian;Tianlong Chen;Li Shen", "authorids": "~Shiwei_Liu2;~Yuesong_Tian1;~Tianlong_Chen1;~Li_Shen1", "gender": "M;M;M;M", "homepage": "https://shiweiliuiiiiiii.github.io/;;https://tianlong-chen.github.io;https://sites.google.com/site/mathshenli/home", "dblp": "234/8697-3.html;245/4028;;91/3680-8", "google_scholar": "73IbXtsAAAAJ;0LROy2EAAAAJ;LE3ctn0AAAAJ;yVhgENIAAAAJ", "orcid": ";;0000-0001-7774-8197;", "linkedin": ";;tianlong-chen-783862167/;", "or_profile": "~Shiwei_Liu2;~Yuesong_Tian1;~Tianlong_Chen1;~Li_Shen1", "aff": "Eindhoven University of Technology;Zhejiang University;University of Texas, Austin;JD Explore Academy", "aff_domain": "tue.nl;zju.edu.cn;utexas.edu;jd.com", "position": "PhD student;PhD student;PhD student;Researcher", "bibtex": "@misc{\nliu2022sparse,\ntitle={Sparse Unbalanced {GAN} Training with In-Time Over-Parameterization},\nauthor={Shiwei Liu and Yuesong Tian and Tianlong Chen and Li Shen},\nyear={2022},\nurl={https://openreview.net/forum?id=WLZ_2JjCz2a}\n}", "github": "", "project": "", "reviewers": "w8qj;XVra;krGJ;xat3", "site": "https://openreview.net/forum?id=WLZ_2JjCz2a", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;3;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "46;56;63;97", "wc_summary_review": "27;17;115;38", "wc_main_review": "107;160;290;192", "wc_review": "180;233;468;327", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.5, 19.1637678967368 ], "wc_summary_review_avg": [ 49.25, 38.68058298423125 ], "wc_main_review_avg": [ 187.25, 66.63848362620506 ], "wc_review_avg": [ 302.0, 109.34578181164558 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3194054350564533042&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Eindhoven University of Technology;Zhejiang University;University of Texas at Austin;JD", "aff_unique_dep": ";;;JD Explore Academy", "aff_unique_url": "https://www.tue.nl;https://www.zju.edu.cn;https://www.utexas.edu;", "aff_unique_abbr": "TU/e;ZJU;UT Austin;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Netherlands;China;United States;" }, { "id": "WN2Sup7qLdw", "title": "Multi-Resolution Continuous Normalizing Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent work has shown that Neural Ordinary Differential Equations (ODEs) can serve as generative models of images using the perspective of Continuous Normalizing Flows (CNFs). Such models offer exact likelihood calculation, and invertible generation/density estimation. 
In this work we introduce a Multi-Resolution variant of such models (MRCNF), by characterizing the conditional distribution over the additional information required to generate a fine image that is consistent with the coarse image. We introduce a transformation between resolutions that allows for no change in the log likelihood. We show that this approach yields comparable likelihood values for various image datasets, using orders of magnitude fewer parameters than the prior methods, in significantly less training time, using only one GPU.", "keywords": "Normalizing flows;generative models;neural ode;continuous normalizing flows;computer vision", "primary_area": "", "supplementary_material": "/attachment/d60979cfef0ea89f3293e24d6d8c0d7be7ac44b1.zip", "author": "Vikram Voleti;Chris Finlay;Adam M Oberman;Christopher Pal", "authorids": "~Vikram_Voleti1;~Chris_Finlay1;~Adam_M_Oberman1;~Christopher_Pal1", "gender": "M;M;M;", "homepage": "https://voletiv.github.io;https://cfinlay.github.io/;https://www.adamoberman.net/;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao", "dblp": "243/6609;227/1604;31/8186;45/1217", "google_scholar": "PPCRqZUAAAAJ;https://scholar.google.ca/citations?user=OnvqhuIAAAAJ;https://scholar.google.ca/citations?user=LPAZlL8AAAAJ;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ", "orcid": ";0000-0002-8962-2206;;", "linkedin": "vikram-voleti-45372222;chfinlay/;adam-oberman-527348107/;", "or_profile": "~Vikram_Voleti1;~Chris_Finlay1;~Adam_M_Oberman1;~Christopher_Pal1", "aff": "Unity Technologies;Deep Render;McGill University;Polytechnique Montreal", "aff_domain": "unity.com;deeprender.ai;mcgill.ca;polymtl.ca", "position": "Intern;Researcher;Full Professor;Full Professor", "bibtex": "@misc{\nvoleti2022multiresolution,\ntitle={Multi-Resolution Continuous Normalizing Flows},\nauthor={Vikram Voleti and Chris Finlay and Adam M Oberman and Christopher Pal},\nyear={2022},\nurl={https://openreview.net/forum?id=WN2Sup7qLdw}\n}", "github": "", "project": "", "reviewers": "cSSK;hXQn;efZN;K92k;McBP", "site": "https://openreview.net/forum?id=WN2Sup7qLdw", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;3;3;4;4", "correctness": "2;2;3;3;3", "technical_novelty": "2;2;2;2;4", "empirical_novelty": "2;2;2;2;4", "wc_summary_paper": "53;61;60;48;208", "wc_summary_review": "42;48;85;62;70", "wc_main_review": "350;486;782;252;634", "wc_review": "445;595;927;362;912", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 86.0, 61.18496547355404 ], "wc_summary_review_avg": [ 61.4, 15.409088227406578 ], "wc_main_review_avg": [ 500.8, 190.618362179513 ], "wc_review_avg": [ 648.2, 233.81650925458624 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.2721655269759087, "corr_recommendation_correctness": 0.9525793444156803, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9937347803434269994&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Unity Technologies;Deep Render;McGill University;Polytechnique 
Montreal", "aff_unique_dep": ";;;", "aff_unique_url": "https://unity.com;;https://www.mcgill.ca;https://www.polymtl.ca", "aff_unique_abbr": "Unity;;McGill;PolyMTL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;2;2", "aff_country_unique": "United States;;Canada" }, { "id": "WNTscnQd1s", "title": "Sparsistent Model Discovery", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discovering the partial differential equations underlying spatio-temporal datasets from very limited and highly noisy observations is of paramount interest in many scientific fields. However, it remains an open question to know when model discovery algorithms based on sparse regression can actually recover the underlying physical processes. In this work, we show the design matrices used to infer the equations by sparse regression can violate the irrepresentability condition (IRC) of the Lasso, even when derived from analytical PDE solutions (i.e. without additional noise). Sparse regression techniques which can recover the true underlying model under violated IRC conditions are therefore required, leading to the introduction of the randomised adaptive Lasso. We show once the latter is integrated within the deep learning model discovery framework DeepMod, a wide variety of nonlinear and chaotic canonical PDEs can be recovered: (1) up to $\\mathcal{O}(2)$ higher noise-to-sample ratios than state-of-the-art algorithms, (2) with a single set of hyperparameters, which paves the road towards truly automated model discovery.", "keywords": "model discovery;sparse regression;sparsistency;physics informed deep learning;partial differential equations", "primary_area": "", "supplementary_material": "", "author": "Georges Tod;Gert-Jan Both;Remy Kusters", "authorids": "~Georges_Tod1;~Gert-Jan_Both1;~Remy_Kusters1", "gender": "M;M;", "homepage": "https://github.com/georgestod/;;", "dblp": ";;", "google_scholar": ";w8oKI8wAAAAJ;https://scholar.google.fr/citations?user=442FIp8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Georges_Tod1;~Gert-Jan_Both1;~Remy_Kusters1", "aff": ";Pasqal;International Business Machines", "aff_domain": ";pasqal.com;ibm.com", "position": ";Researcher;Researcher", "bibtex": "@misc{\ntod2022sparsistent,\ntitle={Sparsistent Model Discovery},\nauthor={Georges Tod and Gert-Jan Both and Remy Kusters},\nyear={2022},\nurl={https://openreview.net/forum?id=WNTscnQd1s}\n}", "github": "", "project": "", "reviewers": "1UJP;pXwU;dRBM;kL72", "site": "https://openreview.net/forum?id=WNTscnQd1s", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;3;3", "correctness": "4;2;3;2", "technical_novelty": "2;3;2;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "52;139;63;224", "wc_summary_review": "51;19;58;85", "wc_main_review": "701;1183;119;226", "wc_review": "804;1341;240;535", "wc_reply_reviewers": "0;148;0;0", "wc_reply_authors": "309;423;83;293", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 119.5, 69.00905737655022 ], "wc_summary_review_avg": [ 53.25, 23.498670175139697 ], "wc_main_review_avg": [ 557.25, 422.4975591645471 ], "wc_review_avg": [ 730.0, 405.25362428977735 ], "wc_reply_reviewers_avg": [ 37.0, 64.08587988004847 ], "wc_reply_authors_avg": [ 277.0, 122.71104269787622 
], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10832115340917139632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Pasqal;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.pasqal.com;https://www.ibm.com", "aff_unique_abbr": ";IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "France;United States" }, { "title": "Learning Curves for SGD on Structured Features", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6344", "id": "WPI2vbkAl3Q", "poster": "", "openreview": "https://openreview.net/forum?id=WPI2vbkAl3Q", "slides": "https://iclr.cc/virtual/2022/poster/6344", "video": "https://iclr.cc/virtual/2022/poster/6344", "author_site": "Blake Bordelon, Cengiz Pehlevan", "tldr": "", "abstract": "The generalization performance of a machine learning algorithm such as a neural network depends in a non-trivial way on the structure of the data distribution. To analyze the influence of data structure on test loss dynamics, we study an exactly solveable model of stochastic gradient descent (SGD) on the square loss which predicts test error when training on features with arbitrary covariance structure. We solve the theory exactly for both Gaussian features and arbitrary features and we show that the simpler Gaussian model accurately predicts test loss of nonlinear random-feature models and neural networks in the kernel regime trained with SGD on real datasets such as MNIST and CIFAR-10. We show that the optimal batch size at a fixed compute budget is typically small and depends on the feature correlation structure, demonstrating the computational benefits of SGD with small batch sizes. 
Lastly, we extend our theory to the more usual setting of stochastic gradient descent on a fixed subsampled training set, showing that both training and test error can be accurately predicted in our framework on real data.", "keywords": "Stochastic Gradient Descent;Generalization", "primary_area": "", "supplementary_material": "/attachment/1d264da6817d352d660eb5ca22cfc7c51991e4a2.zip", "author": "Blake Bordelon;Cengiz Pehlevan", "authorids": "~Blake_Bordelon1;~Cengiz_Pehlevan2", "gender": "M;", "homepage": "https://blakebordelon.github.io/;https://pehlevan.seas.harvard.edu/", "dblp": "228/6993;145/3480", "google_scholar": "yeQ8_pgAAAAJ;veDLTPEAAAAJ", "orcid": "0000-0003-0455-9445;0000-0001-9767-6063", "linkedin": ";", "or_profile": "~Blake_Bordelon1;~Cengiz_Pehlevan2", "aff": "Harvard University;School of Engineering and Applied Sciences, Harvard University", "aff_domain": "harvard.edu;seas.harvard.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbordelon2022learning,\ntitle={Learning Curves for {SGD} on Structured Features},\nauthor={Blake Bordelon and Cengiz Pehlevan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WPI2vbkAl3Q}\n}", "github": "", "project": "", "reviewers": "AoHd;My4V;Ack9;FVgD", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;4;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "54;63;203;109", "wc_summary_review": "69;26;83;40", "wc_main_review": "274;420;646;289", "wc_review": "397;509;932;438", "wc_reply_reviewers": "533;0;35;0", "wc_reply_authors": "1718;503;1090;464", "reply_reviewers": "5;0;1;0", "reply_authors": "7;1;2;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 107.25, 59.08627167117587 ], "wc_summary_review_avg": [ 54.5, 22.610838109190027 ], "wc_main_review_avg": [ 407.25, 149.08282094191804 ], "wc_review_avg": [ 569.0, 213.3740846494719 ], "wc_reply_reviewers_avg": [ 142.0, 226.19571171885642 ], "wc_reply_authors_avg": [ 943.75, 511.1929063474962 ], "reply_reviewers_avg": [ 1.5, 2.0615528128088303 ], "reply_authors_avg": [ 2.75, 2.48746859276655 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16931573474353829992&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=WPI2vbkAl3Q", "email": "harvard.edu;seas.harvard.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "WQIdU90Gsu", "title": "Compound Multi-branch Feature Fusion for Real Image Restoration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Image restoration is a challenging, ill-posed, and long-standing problem. However, most learning-based restoration methods target only one degradation type, which means they lack generalization. 
In this paper, we propose a multi-branch restoration model inspired by the Human Visual System (i.e., Retinal Ganglion Cells) that can perform multiple restoration tasks in a general framework. The experiments show that the proposed multi-branch architecture, called CMFNet, achieves competitive results on four datasets, including image dehazing, deraindrop, and deblurring, which are very common applications for autonomous cars. The source code and pretrained models of three restoration tasks are available at https://github.com/publish_after_accepting/CMFNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chi-Mao Fan;Tsung-Jung Liu;Kuan-Hsien Liu", "authorids": "qaz5517359@gmail.com;~Tsung-Jung_Liu1;khliu@nutc.edu.tw", "gender": ";M;", "homepage": ";http://www.ee.nchu.edu.tw/en/main.asp?un=29&sn=83;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "qaz5517359@gmail.com;~Tsung-Jung_Liu1;khliu@nutc.edu.tw", "aff": ";National Chung Hsing University;", "aff_domain": ";nchu.edu.tw;", "position": ";Associate Professor;", "bibtex": "@misc{\nfan2022compound,\ntitle={Compound Multi-branch Feature Fusion for Real Image Restoration},\nauthor={Chi-Mao Fan and Tsung-Jung Liu and Kuan-Hsien Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=WQIdU90Gsu}\n}", "github": "", "project": "", "reviewers": "ttvV;sRLq;MkTj;eNZ2;WqhR", "site": "https://openreview.net/forum?id=WQIdU90Gsu", "pdf_size": 0, "recommendation": "1;3;3;3;6", "confidence": "5;4;4;5;4", "correctness": "2;3;2;2;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "1;2;2;2;3", "wc_summary_paper": "80;64;69;37;99", "wc_summary_review": "142;54;19;45;49", "wc_main_review": "876;439;406;475;322", "wc_review": "1098;557;494;557;470", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.2, 1.6 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 69.8, 20.33125672456083 ], "wc_summary_review_avg": [ 61.8, 41.882693323137666 ], "wc_main_review_avg": [ 503.6, 192.96486726863 ], "wc_review_avg": [ 635.2, 233.94136017386924 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6123724356957946, "corr_recommendation_correctness": 0.6634034720037775, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13528383967076139409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "WQVouCWioh", "title": "Design in the Dark: Learning Deep Generative Models for De Novo Protein Design", "track": "main", "status": "Reject", "tldr": "", "abstract": "The design of novel protein sequences is providing paths towards the development of novel therapeutics and materials.
\nGenerative modelling approaches to design are emerging and to date have required conditioning on 3D protein structure-derived information, and unconditional models of protein sequences have so far performed poorly.\nThus, it is unknown if unconditional generative models can learn a distribution of sequences that captures structure information without it being explicitly provided, and so be of use in important tasks like de novo protein sequence design, where it is not possible to condition on structure.\nHere, we demonstrate that it is possible to use unconditioned generative models to produce realistic samples of protein sequences. \nWe progressively grow a dataset of over half a million synthetic sequences for training autoregressive language models, using an iterative framework we call DARK.\nIt begins by training an autoregressive model on an initial sample of synthetic sequences, sampling from it, and refining the samples thus generated, which are then used for subsequent rounds of training.\nUsing the confidence measures provided by AlphaFold and other measures of sample quality, we show that our approach matches or exceeds the performance of prior methods that use weak conditioning on explicit structural information, and improves after each iteration of DARK.\nCrucially, the DARK framework and the trained models are entirely unsupervised; strong structural signal is an objective, but no model is ever conditioned on any specific structural state.\nThe trained model indirectly learns to incorporate a structural signal into its learned sequence distribution, as this signal is strongly represented in the makeup of the training set at each step.\nOur work demonstrates a way of unconditionally sampling sequences and structures jointly, and in an unsupervised way.", "keywords": "generative models;sequence design;language models;proteins", "primary_area": "", "supplementary_material": "", "author": "Lewis Moffat;Shaun M. Kandathil;David T. Jones", "authorids": "~Lewis_Moffat1;s.kandathil@cs.ucl.ac.uk;dtj@cs.ucl.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": "0000-0002-9378-1250;;", "linkedin": ";;", "or_profile": "~Lewis_Moffat1;s.kandathil@cs.ucl.ac.uk;dtj@cs.ucl.ac.uk", "aff": "Department of Computer Science, University College London;;", "aff_domain": "cs.ucl.ac.uk;;", "position": "PhD student;;", "bibtex": "@misc{\nmoffat2022design,\ntitle={Design in the Dark: Learning Deep Generative Models for De Novo Protein Design},\nauthor={Lewis Moffat and Shaun M. Kandathil and David T. 
Jones},\nyear={2022},\nurl={https://openreview.net/forum?id=WQVouCWioh}\n}", "github": "", "project": "", "reviewers": "g4tX;mSTZ;Jhfe;jJy9;sSh9", "site": "https://openreview.net/forum?id=WQVouCWioh", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "5;4;4;4;3", "correctness": "1;3;2;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "1;2;2;2;2", "wc_summary_paper": "125;17;28;85;447", "wc_summary_review": "87;43;611;58;98", "wc_main_review": "813;431;618;248;1021", "wc_review": "1025;491;1257;391;1566", "wc_reply_reviewers": "317;216;0;0;15", "wc_reply_authors": "3293;1624;0;614;1953", "reply_reviewers": "1;1;0;0;1", "reply_authors": "6;3;0;1;3", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.4, 0.8 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 1.8, 0.4000000000000001 ], "wc_summary_paper_avg": [ 140.4, 158.22212234703466 ], "wc_summary_review_avg": [ 179.4, 216.69573138389228 ], "wc_main_review_avg": [ 626.2, 272.75146195758515 ], "wc_review_avg": [ 946.0, 447.75261026598156 ], "wc_reply_reviewers_avg": [ 109.6, 132.14325559785487 ], "wc_reply_authors_avg": [ 1496.8, 1137.5745074499516 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.6, 2.0591260281974 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": 0.6123724356957946, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1840427416112743408&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_campus_unique_index": "0", "aff_campus_unique": "London", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "WQX6Zel-ZS1", "title": "Camera Bias Regularization for Person Re-identification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Person re-identification (Re-ID) aims to match persons captured by non-overlapping cameras. Due to the discrepancies between cameras caused by illumination, background, or viewpoint, the underlying difficulty for Re-ID is the camera bias problem, which leads to a large gap between within-identity features from different cameras. With limited cross-camera annotated data, Re-ID models tend to learn camera-related features instead of identity-related features. Consequently, Re-ID models suffer from poor transfer ability from seen domains to unseen domains. In this paper, we investigate the camera bias problem in supervised learning, unsupervised learning, and their variants. In particular, we propose a novel Camera Bias Regularization (CBR) term to reduce the feature distribution gap between cameras by enlarging the intra-camera distance and reducing the inter-camera distance simultaneously.
Extensive experiments on person Re-ID tasks validate the effectiveness and universality of the proposed CBR.", "keywords": "Person Re-identification;Image Retrieval", "primary_area": "", "supplementary_material": "", "author": "Tao He;Tongkun Xu;Weihua Chen;Yuchen Guo;Guiguang Ding;Zhenhua Guo", "authorids": "~Tao_He2;~Tongkun_Xu1;~Weihua_Chen1;~Yuchen_Guo1;~Guiguang_Ding1;~Zhenhua_Guo3", "gender": "M;M;M;M;M;M", "homepage": ";;https://cwhgn.github.io;;http://ise.thss.tsinghua.edu.cn/MIG/dgg.html;https://www-en.sz.tsinghua.edu.cn/INFORMATIONSCIENCE/108561.jhtml", "dblp": ";221/0691;;;51/740;41/294-1", "google_scholar": "2U82fIsAAAAJ;gHfL5v0AAAAJ;KWVlYaMAAAAJ;PNMUgAoAAAAJ;https://scholar.google.com.tw/citations?user=B7F3yt4AAAAJ;dbR6bD0AAAAJ", "orcid": ";;0000-0003-4141-7833;;0000-0003-0137-9975;0000-0002-8201-0864", "linkedin": ";;;;;zhenhua-guo-71589020/", "or_profile": "~Tao_He2;~Tongkun_Xu1;~Weihua_Chen1;~Yuchen_Guo1;~Guiguang_Ding1;~Zhenhua_Guo1", "aff": "Tsinghua University;Shandong University;Alibaba Group;Tsinghua University;Tsinghua University;Alibaba Group", "aff_domain": "tsinghua.edu.cn;sdu.edu.cn;alibaba-inc.com;tsinghua.edu.cn;tsinghua.edu.cn;alibaba-inc.com", "position": "PhD student;MS student;Algorithm Engineer;Researcher;Associate Professor;Researcher", "bibtex": "@misc{\nhe2022camera,\ntitle={Camera Bias Regularization for Person Re-identification},\nauthor={Tao He and Tongkun Xu and Weihua Chen and Yuchen Guo and Guiguang Ding and Zhenhua Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=WQX6Zel-ZS1}\n}", "github": "", "project": "", "reviewers": "c2od;6jz1;YjWh;cUf5", "site": "https://openreview.net/forum?id=WQX6Zel-ZS1", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;0;3;2", "wc_summary_paper": "43;69;65;55", "wc_summary_review": "41;49;77;49", "wc_main_review": "417;212;242;204", "wc_review": "501;330;384;308", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 58.0, 10.04987562112089 ], "wc_summary_review_avg": [ 54.0, 13.674794331177344 ], "wc_main_review_avg": [ 268.75, 86.75648390754434 ], "wc_review_avg": [ 380.75, 74.73076675640361 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15613335160778211597&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;0;0;2", "aff_unique_norm": "Tsinghua University;Shandong University;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.sdu.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "THU;SDU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "CodeTrek: Flexible Modeling of Code using an Extensible Relational Representation", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6737", "id": "WQc075jmBmf", "poster": "", "openreview": "https://openreview.net/forum?id=WQc075jmBmf", "slides": "https://iclr.cc/virtual/2022/poster/6737", "video": "https://iclr.cc/virtual/2022/poster/6737", "author_site": "Pardis Pashakhanloo, Aaditya Naik, Yuepeng Wang, Hanjun Dai, Petros Maniatis, Mayur Naik", "tldr": "", "abstract": "Designing a suitable representation for code-reasoning tasks is challenging in aspects such as the kinds of program information to model, how to combine them, and how much context to consider. We propose CodeTrek, a deep learning approach that addresses these challenges by representing codebases as databases that conform to rich relational schemas. The relational representation not only allows CodeTrek to uniformly represent diverse kinds of program information, but also to leverage program-analysis queries to derive new semantic relations, which can be readily incorporated without further architectural engineering. CodeTrek embeds this relational representation using a set of walks that can traverse different relations in an unconstrained fashion, and incorporates all relevant attributes along the way. We evaluate CodeTrek on four diverse and challenging Python tasks: variable misuse, exception prediction, unused definition, and variable shadowing. CodeTrek achieves an accuracy of 91%, 63%, 98%, and 94% on these tasks respectively, and outperforms state-of-the-art neural models by 2-19% points.", "keywords": "relational database;code representation;knowledge graph reasoning;program understanding", "primary_area": "", "supplementary_material": "/attachment/5cf08b9b3e094fe4a95688d7975f8bea63508637.zip", "author": "Pardis Pashakhanloo;Aaditya Naik;Yuepeng Wang;Hanjun Dai;Petros Maniatis;Mayur Naik", "authorids": "~Pardis_Pashakhanloo1;~Aaditya_Naik1;~Yuepeng_Wang1;~Hanjun_Dai1;~Petros_Maniatis1;~Mayur_Naik1", "gender": "F;M;M;M;M;M", "homepage": "https://pardisp.github.io/;https://www.seas.upenn.edu/~asnaik;https://www.cs.sfu.ca/~yuepeng;https://hanjun-dai.github.io;https://ai.google/research/people/PetrosManiatis;http://www.cis.upenn.edu/~mhnaik/", "dblp": ";269/9481;;144/7311;m/PetrosManiatis;92/6794", "google_scholar": "h7nE8esAAAAJ;EfE0jh4AAAAJ;;obpl7GQAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=fmsV6nEAAAAJ", "orcid": ";;;;;", "linkedin": ";;;hanjun-dai;;ai4code/", "or_profile": "~Pardis_Pashakhanloo1;~Aaditya_Naik1;~Yuepeng_Wang1;~Hanjun_Dai1;~Petros_Maniatis1;~Mayur_Naik1", "aff": "School of Engineering and Applied Science, University of Pennsylvania;University of Pennsylvania;Simon Fraser University;Google Research;Google Research, Brain Team;University of Pennsylvania", "aff_domain": "seas.upenn.edu;upenn.edu;sfu.ca;google.com;google.com;upenn.edu", "position": "PhD student;PhD student;Assistant Professor;Researcher;Research Scientist;Professor", "bibtex": "@inproceedings{\npashakhanloo2022codetrek,\ntitle={CodeTrek: Flexible Modeling of Code using an Extensible Relational Representation},\nauthor={Pardis Pashakhanloo and Aaditya Naik and Yuepeng Wang and Hanjun Dai and Petros Maniatis and Mayur Naik},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WQc075jmBmf}\n}", "github": "", "project": "", "reviewers": "okqQ;3Z8h;yHJY;n7bP", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "5;4;5;5", "correctness": "3;2;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", 
"wc_summary_paper": "59;131;108;80", "wc_summary_review": "41;45;137;278", "wc_main_review": "394;292;428;597", "wc_review": "494;468;673;955", "wc_reply_reviewers": "328;78;0;880", "wc_reply_authors": "993;937;1027;1991", "reply_reviewers": "2;1;0;2", "reply_authors": "2;3;2;5", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 94.5, 27.31757675929547 ], "wc_summary_review_avg": [ 125.25, 96.18829190707152 ], "wc_main_review_avg": [ 427.75, 109.78700970515592 ], "wc_review_avg": [ 647.5, 194.28651522944148 ], "wc_reply_reviewers_avg": [ 321.5, 344.4629878521058 ], "wc_reply_authors_avg": [ 1237.0, 436.5065864336986 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10059664661976088389&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=WQc075jmBmf", "email": "seas.upenn.edu;upenn.edu;sfu.ca;google.com;google.com;upenn.edu", "author_num": 6, "aff_unique_index": "0;0;1;2;2;0", "aff_unique_norm": "University of Pennsylvania;Simon Fraser University;Google", "aff_unique_dep": "School of Engineering and Applied Science;;Google Research", "aff_unique_url": "https://www.upenn.edu;https://www.sfu.ca;https://research.google", "aff_unique_abbr": "UPenn;SFU;Google Research", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "WRORN3GUCu", "title": "VISCOS Flows: Variational Schur Conditional Sampling with Normalizing Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a method for conditional sampling for pre-trained normalizing flows when only part of an observation is available. We derive a lower bound to the conditioning variable log-probability using Schur complement properties in the spirit of Gaussian conditional sampling. Our derivation relies on partitioning flow's domain in such a way that the flow restrictions to subdomains remain bijective, which is crucial for the Schur complement application. Simulation from the variational conditional flow then amends to solving an equality constraint. Our contribution is three-fold: a) we provide detailed insights on the choice of variational distributions; b) we discuss how to partition the input space of the flow to preserve bijectivity property; c) we propose a set of methods to optimise the variational distribution. 
Our numerical results indicate that our sampling method can be successfully applied to invertible residual networks for inference and classification.", "keywords": "Normalizing Flows;Conditional Sampling;Implicit Methods", "primary_area": "", "supplementary_material": "", "author": "Vincent Moens;Aivar Sootla;Haitham Bou Ammar;Jun Wang", "authorids": "~Vincent_Moens3;~Aivar_Sootla1;~Haitham_Bou_Ammar1;~Jun_Wang2", "gender": "M;M;M;M", "homepage": "https://github.com/vmoens;;;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "220/5625;66/9184;;w/JunWang12", "google_scholar": "8l-tvFoAAAAJ;https://scholar.google.co.uk/citations?hl=en;https://scholar.google.co.uk/citations?user=AE5suDoAAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;;", "linkedin": "vincent-moens-9bb91972/;;;", "or_profile": "~Vincent_Moens3;~Aivar_Sootla1;~Haitham_Bou_Ammar1;~Jun_Wang2", "aff": "Meta;Huawei R&D UK;Huawei R&D UK;University College London", "aff_domain": "fb.com;huawei.com;huawei.com;ucl.ac.uk", "position": "Applied ML Scientist;Research scientist;Principal Researcher;Professor", "bibtex": "@misc{\nmoens2022viscos,\ntitle={{VISCOS} Flows: Variational Schur Conditional Sampling with Normalizing Flows},\nauthor={Vincent Moens and Aivar Sootla and Haitham Bou Ammar and Jun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=WRORN3GUCu}\n}", "github": "", "project": "", "reviewers": "puL5;urKg;7gsd", "site": "https://openreview.net/forum?id=WRORN3GUCu", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;4;3", "correctness": "4;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "0;1;1", "wc_summary_paper": "74;46;58", "wc_summary_review": "43;39;32", "wc_main_review": "304;415;240", "wc_review": "421;500;330", "wc_reply_reviewers": "137;0;15", "wc_reply_authors": "273;149;99", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.4714045207910317 ], "wc_summary_paper_avg": [ 59.333333333333336, 11.469767022723502 ], "wc_summary_review_avg": [ 38.0, 4.546060565661952 ], "wc_main_review_avg": [ 319.6666666666667, 72.29722601102264 ], "wc_review_avg": [ 417.0, 69.4598205199716 ], "wc_reply_reviewers_avg": [ 50.666666666666664, 61.35325763333372 ], "wc_reply_authors_avg": [ 173.66666666666666, 73.14521325570267 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12667169551825478844&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Meta;Huawei;University College London", "aff_unique_dep": "Meta Platforms, Inc.;R&D;", "aff_unique_url": "https://meta.com;https://www.huawei.com/uk;https://www.ucl.ac.uk", "aff_unique_abbr": "Meta;Huawei;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "WTXMNULQ3Uu", "title": "Generating Scenes with Latent Object Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a structured 
latent variable model that learns the underlying data-generating process for a dataset of scenes. Our goals are to obtain a compositional scene representation and to perform scene generation by modeling statistical relationships between scenes as well as between objects within a scene. To make inference tractable, we take inspiration from visual topic models and introduce an interpretable hierarchy of scene-level and object-level latent variables (i.e., slots). Since generating scenes requires modeling dependencies between objects, we cannot make a bag-of-words assumption to simplify inference. Moreover, assuming that slots are generated with an autoregressive prior requires decomposing scenes sequentially during inference which has known limitations. Our approach is to assume that the assignment of objects to slots during generation is a deterministic function of the scene latent variable. This removes the need for sequential scene decomposition and enables us to propose an inference algorithm that uses orderless scene decomposition to indirectly estimate an ordered slot posterior. Qualitative and quantitative analysis establishes that our approach successfully learns a smoothly traversable scene-level latent space. The hierarchy of scene and slot variables improves the ability of slot-based models to generate samples displaying complex object relations. We also demonstrate that the learned hierarchy of representations can be used for a scene-retrieval application with object-centric re-ranking.", "keywords": "deep generative models;slots;scene generation;object-centric;VAEs", "primary_area": "", "supplementary_material": "/attachment/39ca7559933d9c5677c5d4ec25478a2ed1931c47.zip", "author": "Patrick Emami;Pan He;Sanjay Ranka;Anand Rangarajan", "authorids": "~Patrick_Emami1;~Pan_He1;~Sanjay_Ranka1;~Anand_Rangarajan1", "gender": "M;M;;M", "homepage": "http://pemami4911.github.io;http://bestsonny.github.io/;;http://www.cise.ufl.edu/~anand", "dblp": "153/7716;26/8730;;90/6511-1", "google_scholar": "WSU6_r0AAAAJ;Y_ABdTgAAAAJ;;6JEDxqcAAAAJ", "orcid": ";;;0000-0001-8695-8436", "linkedin": ";;;", "or_profile": "~Patrick_Emami1;~Pan_He1;~Sanjay_Ranka1;~Anand_Rangarajan1", "aff": "National Renewable Energy Lab;University of Florida;University of Florida;University of Florida", "aff_domain": "nrel.gov;ufl.edu;ufl.edu;ufl.edu", "position": "Postdoc;PhD student;;Professor", "bibtex": "@misc{\nemami2022generating,\ntitle={Generating Scenes with Latent Object Models},\nauthor={Patrick Emami and Pan He and Sanjay Ranka and Anand Rangarajan},\nyear={2022},\nurl={https://openreview.net/forum?id=WTXMNULQ3Uu}\n}", "github": "", "project": "", "reviewers": "4aYc;T7nj;tg1E;XQMo", "site": "https://openreview.net/forum?id=WTXMNULQ3Uu", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;4;4", "correctness": "1;2;3;3", "technical_novelty": "2;1;3;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "107;123;159;143", "wc_summary_review": "99;78;88;83", "wc_main_review": "353;768;901;294", "wc_review": "559;969;1148;520", "wc_reply_reviewers": "118;252;0;115", "wc_reply_authors": "1659;1452;851;901", "reply_reviewers": "1;1;0;1", "reply_authors": "4;2;2;3", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 133.0, 19.697715603592208 ], "wc_summary_review_avg": [ 87.0, 
7.7781745930520225 ], "wc_main_review_avg": [ 579.0, 260.6271282886722 ], "wc_review_avg": [ 799.0, 267.46121214112526 ], "wc_reply_reviewers_avg": [ 121.25, 89.22828867573332 ], "wc_reply_authors_avg": [ 1215.75, 347.9923670139907 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ejzabUjMvwoJ:scholar.google.com/&scioq=Generating+Scenes+with+Latent+Object+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "National Renewable Energy Laboratory;University of Florida", "aff_unique_dep": ";", "aff_unique_url": "https://www.nrel.gov;https://www.ufl.edu", "aff_unique_abbr": "NREL;UF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Learning Meets Generative Models: Can Proxy Distributions Improve Adversarial Robustness?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6374", "id": "WVX0NNVBBkV", "poster": "", "openreview": "https://openreview.net/forum?id=WVX0NNVBBkV", "slides": "https://iclr.cc/virtual/2022/poster/6374", "video": "https://iclr.cc/virtual/2022/poster/6374", "author_site": "Vikash Sehwag, Saeed Mahloujifar, Tinashe Handina, Sihui Dai, Chong Xiang, Mung Chiang, Prateek Mittal", "tldr": "", "abstract": "While additional training data improves the robustness of deep neural networks against adversarial examples, it presents the challenge of curating a large number of specific real-world samples. We circumvent this challenge by using additional data from proxy distributions learned by advanced generative models. We first seek to formally understand the transfer of robustness from classifiers trained on proxy distributions to the real data distribution. We prove that the difference between the robustness of a classifier on the two distributions is upper bounded by the conditional Wasserstein distance between them. Next, we use proxy distributions to significantly improve the performance of adversarial training on five different datasets. For example, we improve robust accuracy by up to $7.5$% and $6.7$% in the $\ell_{\infty}$ and $\ell_2$ threat models over baselines that do not use proxy distributions on the CIFAR-10 dataset. We also improve certified robust accuracy by $7.6$% on the CIFAR-10 dataset. We further demonstrate that different generative models bring disparate improvements in robust training performance.
We propose a robust discrimination approach to characterize the impact and further provide a deeper understanding of why diffusion-based generative models are a better choice for proxy distribution than generative adversarial networks.", "keywords": "adversarial robustness;certified adversarial robustness;adversarial attacks;generative models;proxy distribution", "primary_area": "", "supplementary_material": "/attachment/ddf8f50e25ae557f137880f544fba3f294cb3f3f.zip", "author": "Vikash Sehwag;Saeed Mahloujifar;Tinashe Handina;Sihui Dai;Chong Xiang;Mung Chiang;Prateek Mittal", "authorids": "~Vikash_Sehwag1;~Saeed_Mahloujifar1;~Tinashe_Handina1;~Sihui_Dai1;~Chong_Xiang1;~Mung_Chiang2;~Prateek_Mittal1", "gender": "M;M;;F;;M;", "homepage": "https://vsehwag.github.io/;https://www.cs.virginia.edu/~sm5fd/;;;https://xiangchong.xyz/;https://www.purdue.edu/president/about/mung-chiang/;http://www.princeton.edu/~pmittal/", "dblp": "187/5613;208/0825;;244/9642;133/5336-1;;", "google_scholar": "JAkeEG8AAAAJ;kW-hl3YAAAAJ;;;aLXNz30AAAAJ;;https://scholar.google.com.tw/citations?user=xTKD8J4AAAAJ", "orcid": ";;;;;;0000-0002-4057-0118", "linkedin": ";;tinashe-handina02250/;;;;", "or_profile": "~Vikash_Sehwag1;~Saeed_Mahloujifar1;~Tinashe_Handina1;~Sihui_Dai1;~Chong_Xiang1;~Mung_Chiang2;~Prateek_Mittal1", "aff": "Princeton University;Princeton University;California Institute of Technology;Princeton University;Princeton University;Purdue University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;caltech.edu;princeton.edu;princeton.edu;purdue.edu;princeton.edu", "position": "PhD student;Postdoc;PhD student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nsehwag2022robust,\ntitle={Robust Learning Meets Generative Models: Can Proxy Distributions Improve Adversarial Robustness?},\nauthor={Vikash Sehwag and Saeed Mahloujifar and Tinashe Handina and Sihui Dai and Chong Xiang and Mung Chiang and Prateek Mittal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WVX0NNVBBkV}\n}", "github": "", "project": "", "reviewers": "sAeN;NSbG;fLTj;c1MR", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "41;33;117;115", "wc_summary_review": "35;23;32;66", "wc_main_review": "215;110;282;213", "wc_review": "291;166;431;394", "wc_reply_reviewers": "138;81;5;5", "wc_reply_authors": "1818;965;705;268", "reply_reviewers": "2;1;1;1", "reply_authors": "4;2;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.5, 39.607448794387146 ], "wc_summary_review_avg": [ 39.0, 16.20185174601965 ], "wc_main_review_avg": [ 205.0, 61.477638210978796 ], "wc_review_avg": [ 320.5, 102.8992225432243 ], "wc_reply_reviewers_avg": [ 57.25, 56.00167408212008 ], "wc_reply_authors_avg": [ 939.0, 565.3127453012182 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 177, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15097099690109904849&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": 
"https://openreview.net/pdf?id=WVX0NNVBBkV", "email": "princeton.edu;princeton.edu;caltech.edu;princeton.edu;princeton.edu;purdue.edu;princeton.edu", "author_num": 7, "aff_unique_index": "0;0;1;0;0;2;0", "aff_unique_norm": "Princeton University;California Institute of Technology;Purdue University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://www.caltech.edu;https://www.purdue.edu", "aff_unique_abbr": "Princeton;Caltech;Purdue", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "WXwg_9eRQ0T", "title": "MergeBERT: Program Merge Conflict Resolution via Neural Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Collaborative software development is an integral part of the modern software development life cycle, essential to the success of large-scale software projects. When multiple developers make concurrent changes around the same lines of code, a merge conflict may occur. \nSuch conflicts stall pull requests and continuous integration pipelines for hours to several days, seriously hurting developer productivity.\n\nIn this paper, we introduce MergeBERT, a novel neural program merge framework based on the token-level three-way differencing and a transformer encoder model. Exploiting restricted nature of merge conflict resolutions, we reformulate the task of generating the resolution sequence as a classification task over a set of primitive merge patterns extracted from real-world merge commit data.\n\nOur model achieves 63--68\\% accuracy of merge resolution synthesis, yielding nearly a 3$\\times$ performance improvement over existing structured, and 2$\\times$ improvement over neural program merge tools. 
Finally, we demonstrate that MergeBERT is sufficiently flexible to work with source code files in Java, JavaScript, TypeScript, and C\\# programming languages, and can generalize zero-shot to unseen languages.", "keywords": "Software evolution;program merge;ml4code", "primary_area": "", "supplementary_material": "", "author": "Alexey Svyatkovskiy;Todd Mytkowicz;Negar Ghorbani;Sarah Fakhoury;Elizabeth A Dinella;Christian Bird;Neel Sundaresan;Shuvendu Lahiri", "authorids": "~Alexey_Svyatkovskiy1;~Todd_Mytkowicz1;negargh@uci.edu;sarah.fakhoury@wsu.edu;~Elizabeth_A_Dinella1;cbird@microsoft.com;~Neel_Sundaresan3;shuvendu@microsoft.com", "gender": "M;M;;;F;;;", "homepage": "https://www.microsoft.com/en-us/research/people/alsvyatk/;https://www.microsoft.com/en-us/research/people/toddm/;;;https://www.seas.upenn.edu/~edinella/;;https://www.linkedin.com/in/neel-sundaresan-a964a2/;", "dblp": "198/0454;;;;214/8020;;s/NeelSundaresan.html;", "google_scholar": "0Oj4J4wAAAAJ;;;;DxQ4wV0AAAAJ;;;", "orcid": "0000-0001-7714-4481;;;;;;;", "linkedin": ";;;;;;neel-sundaresan-a964a2/;", "or_profile": "~Alexey_Svyatkovskiy1;~Todd_Mytkowicz1;negargh@uci.edu;sarah.fakhoury@wsu.edu;~Elizabeth_A_Dinella1;cbird@microsoft.com;~Neel_Sundaresan3;shuvendu@microsoft.com", "aff": "Microsoft;Microsoft;;;University of Pennsylvania;;University of California, Santa Cruz;", "aff_domain": "microsoft.com;microsoft.com;;;cis.upenn.edu;;ucsc.edu;", "position": "Principal Researcher;Researcher;;;PhD student;;Full Professor (adjunct);", "bibtex": "@misc{\nsvyatkovskiy2022mergebert,\ntitle={Merge{BERT}: Program Merge Conflict Resolution via Neural Transformers},\nauthor={Alexey Svyatkovskiy and Todd Mytkowicz and Negar Ghorbani and Sarah Fakhoury and Elizabeth A Dinella and Christian Bird and Neel Sundaresan and Shuvendu Lahiri},\nyear={2022},\nurl={https://openreview.net/forum?id=WXwg_9eRQ0T}\n}", "github": "", "project": "", "reviewers": "TUUK;k5J6;vcTN", "site": "https://openreview.net/forum?id=WXwg_9eRQ0T", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;4;4", "correctness": "3;3;2", "technical_novelty": "2;3;2", "empirical_novelty": "2;3;2", "wc_summary_paper": "91;50;231", "wc_summary_review": "124;17;13", "wc_main_review": "570;233;723", "wc_review": "785;300;967", "wc_reply_reviewers": "0;145;0", "wc_reply_authors": "1533;1375;788", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 124.0, 77.48978427293927 ], "wc_summary_review_avg": [ 51.333333333333336, 51.40903509003926 ], "wc_main_review_avg": [ 508.6666666666667, 204.6889238712138 ], "wc_review_avg": [ 684.0, 281.5113970457798 ], "wc_reply_reviewers_avg": [ 48.333333333333336, 68.3536555146996 ], "wc_reply_authors_avg": [ 1232.0, 320.5131302562606 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13918545667459734632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Microsoft;University of 
Pennsylvania;University of California, Santa Cruz", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.upenn.edu;https://www.ucsc.edu", "aff_unique_abbr": "Microsoft;UPenn;UCSC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "WXy4C-RjET", "title": "Logit Attenuating Weight Normalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training over-parameterized deep networks using gradient-based optimizers is a popular way of solving classification and ranking problems. Without appropriately tuned regularization, such networks have the tendency to make output scores (logits) and network weights large, causing training loss to become too small and the network to lose its adaptivity (ability to move around and escape regions of poor generalization) in the weight space. Adaptive optimizers like Adam, being aggressive at optimizing the train loss, are particularly affected by this. It is well known that, even with weight decay (WD) and normal hyper-parameter tuning, adaptive optimizers lag well behind SGD in terms of generalization performance, mainly in the image classification domain.\n\nAn alternative to WD for improving a network's adaptivity is to directly control the magnitude of the weights and hence the logits. We propose a method called Logit Attenuating Weight Normalization (LAWN) that can be stacked onto any gradient-based optimizer. LAWN starts training in a free (unregularized) mode and, after some initial epochs, constrains the weight norms of layers, thereby controlling the logits and improving adaptivity. This is a new regularization approach that does not use WD anywhere; instead, the number of initial free epochs becomes the new hyper-parameter. The resulting LAWN variant of adaptive optimizers gives a solid lift to generalization performance, making their performance equal to or even better than SGD's on benchmark image classification and recommender datasets. Another important feature is that LAWN also greatly improves the adaptive optimizers when used with large batch sizes.\n", "keywords": "deep learning;gradient methods;stochastic optimization;generalization gap;imagenet;adam;large batch training", "primary_area": "", "supplementary_material": "/attachment/c05248a2bc20c38687cae03b9050491caa06c93e.zip", "author": "Aman Gupta;Rohan Ramanath;Jun Shi;Anika Ramachandran;Sirou Zhu;Mingzhou Zhou;Sathiya Keerthi", "authorids": "~Aman_Gupta1;~Rohan_Ramanath1;~Jun_Shi4;~Anika_Ramachandran1;~Sirou_Zhu2;~Mingzhou_Zhou1;~Sathiya_Keerthi1", "gender": "M;;;F;F;M;M", "homepage": ";http://ramanath.me/;;;;https://www.linkedin.com/in/mingzhouzhou/;http://www.keerthis.com", "dblp": "359/1957.html;;;;;;", "google_scholar": "vuGUPbkAAAAJ;;;;;;Sr7jln4AAAAJ", "orcid": ";;;;;;", "linkedin": "aman-gupta1/;;jun-shi-5a3207149/;anika-ramachandran/;sirouz-97/;mingzhouzhou/;", "or_profile": "~Aman_Gupta1;~Rohan_Ramanath1;~Jun_Shi4;~Anika_Ramachandran1;~Sirou_Zhu2;~Mingzhou_Zhou1;~Sathiya_Keerthi1", "aff": "LinkedIn;LinkedIn;LinkedIn;University of California, Berkeley;LinkedIn;LinkedIn;", "aff_domain": "linkedin.com;linkedin.com;linkedin.com;berkeley.edu;linkedin.com;linkedin.com;", "position": "Researcher;Sr.
Staff Engineer;Engineer;Undergrad student;Engineer;Researcher;", "bibtex": "@misc{\ngupta2022logit,\ntitle={Logit Attenuating Weight Normalization},\nauthor={Aman Gupta and Rohan Ramanath and Jun Shi and Anika Ramachandran and Sirou Zhu and Mingzhou Zhou and Sathiya Keerthi},\nyear={2022},\nurl={https://openreview.net/forum?id=WXy4C-RjET}\n}", "github": "", "project": "", "reviewers": "Xf6G;qsYk;7vWF;dDDW", "site": "https://openreview.net/forum?id=WXy4C-RjET", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "54;114;76;62", "wc_summary_review": "21;33;30;40", "wc_main_review": "227;365;399;231", "wc_review": "302;512;505;333", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1102;432;666;689", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.5, 23.038012067016545 ], "wc_summary_review_avg": [ 31.0, 6.819090848492928 ], "wc_main_review_avg": [ 305.5, 77.45159778855437 ], "wc_review_avg": [ 413.0, 96.15872295325057 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 722.25, 241.20776832432242 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=199008801261262399&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "LinkedIn Corporation;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.linkedin.com;https://www.berkeley.edu", "aff_unique_abbr": "LinkedIn;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "WYDzDksK5b", "title": "DiBB: Distributing Black-Box Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a novel framework for Distributing Black-Box Optimization (DiBB). DiBB can encapsulate any Black Box Optimization (BBO) method, making it of particular interest for scaling and distributing modern Evolution Strategies (ES), such as CMA-ES and its variants, which maintain a sampling covariance matrix throughout the run. Due to their high algorithmic complexity, however, such methods alone are unsuitable for high-dimensional problems, e.g. for sophisticated Reinforcement Learning (RL) control. This limits the applicable methods to simpler ES, which trade off faster updates for lowered sample efficiency. DiBB overcomes this limitation by means of problem decomposition, leveraging expert knowledge of the problem structure, such as a known topology for a neural network controller. This makes it possible to distribute the workload across an arbitrary number of nodes in a cluster, while maintaining the feasibility of second-order (covariance) learning on high-dimensional problems.
The computational complexity per node is bounded by the (arbitrary) size of blocks of variables, which is independent of the problem size.", "keywords": "Black Box Optimization;Distributed Computing;Evolutionary Computation", "primary_area": "", "supplementary_material": "", "author": "Giuseppe Cuccu;Luca Sven Rolshoven;Fabien Vorpe;Philippe Cudre-Mauroux;Tobias Glasmachers", "authorids": "~Giuseppe_Cuccu1;luca.rolshoven@students.unibe.ch;fabien.vorpe@unine.ch;~Philippe_Cudre-Mauroux1;~Tobias_Glasmachers1", "gender": "M;;;M;", "homepage": "https://exascale.info/members/giuseppe-cuccu/;;;https://exascale.info/phil/;", "dblp": "20/7185;;;71/5578;", "google_scholar": "http://scholar.google.com/citations?user=-BLm8NQAAAAJ;;;https://scholar.google.ch/citations?user=NpccCXwAAAAJ;", "orcid": "0000-0002-1005-5246;;;0000-0003-2588-4212;", "linkedin": "giuse;;;p-c-m/;", "or_profile": "~Giuseppe_Cuccu1;luca.rolshoven@students.unibe.ch;fabien.vorpe@unine.ch;~Philippe_Cudre-Mauroux1;~Tobias_Glasmachers1", "aff": "Fribourg University;;;University of Fribourg;", "aff_domain": "unifr.ch;;;unifr.ch;", "position": "Senior Researcher;;;Full Professor;", "bibtex": "@misc{\ncuccu2022dibb,\ntitle={Di{BB}: Distributing Black-Box Optimization},\nauthor={Giuseppe Cuccu and Luca Sven Rolshoven and Fabien Vorpe and Philippe Cudre-Mauroux and Tobias Glasmachers},\nyear={2022},\nurl={https://openreview.net/forum?id=WYDzDksK5b}\n}", "github": "", "project": "", "reviewers": "CxhD;6N3W;mpL3;wFNK", "site": "https://openreview.net/forum?id=WYDzDksK5b", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;4;3", "correctness": "3;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "69;119;50;68", "wc_summary_review": "58;22;30;62", "wc_main_review": "345;416;324;443", "wc_review": "472;557;404;573", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 76.5, 25.675864152935535 ], "wc_summary_review_avg": [ 43.0, 17.291616465790582 ], "wc_main_review_avg": [ 382.0, 49.01530373260988 ], "wc_review_avg": [ 501.5, 68.13405903070799 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8020783918702693640&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Fribourg", "aff_unique_dep": "", "aff_unique_url": "https://www.unifr.ch", "aff_unique_abbr": "UNIFR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "An Unconstrained Layer-Peeled Perspective on Neural Collapse", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6209", "id": "WZ3yjh8coDg", "poster": "", "openreview": "https://openreview.net/forum?id=WZ3yjh8coDg", "slides": "https://iclr.cc/virtual/2022/poster/6209", "video": "https://iclr.cc/virtual/2022/poster/6209", "author_site": "Wenlong Ji, Yiping Lu, 
Yiliang Zhang, Zhun Deng, Weijie J Su", "tldr": "", "abstract": "Neural collapse is a highly symmetric geometry of neural networks that emerges during the terminal phase of training, with profound implications on the generalization performance and robustness of the trained networks. To understand how the last-layer features and classifiers exhibit this recently discovered implicit bias, in this paper, we introduce a surrogate model called the unconstrained layer-peeled model (ULPM). We prove that gradient flow on this model converges to critical points of a minimum-norm separation problem exhibiting neural collapse in its global minimizer. Moreover, we show that the ULPM with the cross-entropy loss has a benign global landscape for its loss function, which allows us to prove that all the critical points are strict saddle points except the global minimizers that exhibit the neural collapse phenomenon. Empirically, we show that our results also hold during the training of neural networks in real-world tasks when explicit regularization or weight decay is not used.", "keywords": "neural collapse;uncostrained model;implicit regularization", "primary_area": "", "supplementary_material": "/attachment/bfb832d58bd1c82bce12a9295410d61d104b7f57.zip", "author": "Wenlong Ji;Yiping Lu;Yiliang Zhang;Zhun Deng;Weijie J Su", "authorids": "~Wenlong_Ji1;~Yiping_Lu1;~Yiliang_Zhang1;~Zhun_Deng1;~Weijie_J_Su1", "gender": "M;M;;M;M", "homepage": "https://wenlong2000.github.io/;https://2prime.github.io/;;https://www.zhundeng.org/;http://stat.wharton.upenn.edu/~suw/", "dblp": ";93/683-1;;204/4353;228/9127", "google_scholar": ";NmhvVBgAAAAJ;;nkmi-moAAAAJ;Uhf4nBkAAAAJ", "orcid": ";;;;", "linkedin": ";;yiliang-zhang-b75412197/;;", "or_profile": "~Wenlong_Ji1;~Yiping_Lu1;~Yiliang_Zhang1;~Zhun_Deng1;~Weijie_J_Su1", "aff": "Peking University;Stanford University;University of Pennsylvania;Harvard University;University of Pennsylvania", "aff_domain": "pku.edu.cn;stanford.edu;sas.upenn.edu;harvard.edu;upenn.edu", "position": "Undergrad student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nji2022an,\ntitle={An Unconstrained Layer-Peeled Perspective on Neural Collapse},\nauthor={Wenlong Ji and Yiping Lu and Yiliang Zhang and Zhun Deng and Weijie J Su},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WZ3yjh8coDg}\n}", "github": "", "project": "", "reviewers": "sXBT;PTW6;ez35;Au7e", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;3;4", "correctness": "4;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "68;46;147;110", "wc_summary_review": "32;40;2;32", "wc_main_review": "266;247;187;415", "wc_review": "366;333;336;557", "wc_reply_reviewers": "67;0;0;0", "wc_reply_authors": "677;450;303;617", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.75, 38.854697270729055 ], "wc_summary_review_avg": [ 26.5, 14.517231140957975 ], "wc_main_review_avg": [ 278.75, 83.89390621493315 ], "wc_review_avg": [ 398.0, 92.70113267916417 ], "wc_reply_reviewers_avg": [ 16.75, 29.011851026778693 ], "wc_reply_authors_avg": [ 511.75, 146.43663305334496 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 
11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2914030200506365394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=WZ3yjh8coDg", "email": "pku.edu.cn;stanford.edu;sas.upenn.edu;harvard.edu;upenn.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Peking University;Stanford University;University of Pennsylvania;Harvard University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu;https://www.upenn.edu;https://www.harvard.edu", "aff_unique_abbr": "Peking U;Stanford;UPenn;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "WZR7ckBkzPY", "title": "Variational Wasserstein gradient flow", "track": "main", "status": "Reject", "tldr": "", "abstract": "The gradient flow of a function over the space of probability densities with respect to the Wasserstein metric often exhibits nice properties and has been utilized in several machine learning applications. The standard approach to compute the Wasserstein gradient flow is the finite difference method, which discretizes the underlying space over a grid, and is not scalable. In this work, we propose a scalable proximal gradient type algorithm for Wasserstein gradient flow. The key to our method is a variational formulation of the objective function, which makes it possible to realize the JKO proximal map through a primal-dual optimization. This primal-dual problem can be efficiently solved by alternately updating the parameters in the inner and outer loops. Our framework covers all the classical Wasserstein gradient flows including the heat equation and the porous medium equation.
We demonstrate the performance and scalability of our algorithm with several numerical examples.", "keywords": "Wasserstein gradient flow;JKO;f-divergence", "primary_area": "", "supplementary_material": "/attachment/7453fb73ad702a964cdcf10414e183259308a1c0.zip", "author": "Jiaojiao Fan;Amirhossein Taghvaei;Yongxin Chen", "authorids": "~Jiaojiao_Fan1;~Amirhossein_Taghvaei1;~Yongxin_Chen1", "gender": "F;M;M", "homepage": "https://sbyebss.github.io;https://amirtag.github.io/;https://yongxin.ae.gatech.edu/", "dblp": "78/10176;158/4926;", "google_scholar": "zse9JEwAAAAJ;l96zhjwAAAAJ;X8BYiV4AAAAJ", "orcid": ";;", "linkedin": "jiaojiao-fan-9a1a14162/?locale=en_US;;", "or_profile": "~Jiaojiao_Fan1;~Amirhossein_Taghvaei1;~Yongxin_Chen1", "aff": "Microsoft;University of Washington, Seattle;Georgia Institute of Technology", "aff_domain": "microsoft.com;uw.edu;gatech.edu", "position": "Intern;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nfan2022variational,\ntitle={Variational Wasserstein gradient flow},\nauthor={Jiaojiao Fan and Amirhossein Taghvaei and Yongxin Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=WZR7ckBkzPY}\n}", "github": "", "project": "", "reviewers": "CWde;S3mH;72fj;pvCT", "site": "https://openreview.net/forum?id=WZR7ckBkzPY", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;4", "correctness": "2;4;4;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;3;1", "wc_summary_paper": "48;76;153;119", "wc_summary_review": "45;23;38;56", "wc_main_review": "790;333;156;633", "wc_review": "883;432;347;808", "wc_reply_reviewers": "0;0;31;0", "wc_reply_authors": "69;0;25;0", "reply_reviewers": "0;0;1;0", "reply_authors": "1;0;1;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 99.0, 40.1434926233381 ], "wc_summary_review_avg": [ 40.5, 11.968709203585824 ], "wc_main_review_avg": [ 478.0, 248.03124803137206 ], "wc_review_avg": [ 617.5, 231.4956803052705 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 23.5, 28.182441342083905 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4247639090058922494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2", "aff_unique_norm": "Microsoft;University of Washington;Georgia Institute of Technology", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.washington.edu;https://www.gatech.edu", "aff_unique_abbr": "Microsoft;UW;Georgia Tech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "WZeI0Vro15y", "title": "Generative Posterior Networks for Approximately Bayesian Epistemic Uncertainty Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Ensembles of neural networks are often used to estimate epistemic uncertainty in high-dimensional problems because of their scalability and ease of use. 
These methods, however, are expensive to sample from as each sample requires a new neural network to be trained from scratch. We propose a new method, Generative Posterior Networks (GPNs), a generative model that, given a prior distribution over functions, approximates the posterior distribution directly by regularizing the network towards samples from the prior. This allows our method to quickly sample from the posterior and construct confidence bounds. We prove theoretically that our method indeed approximates the Bayesian posterior and show empirically that it improves epistemic uncertainty estimation over competing methods.", "keywords": "Uncertainty;Bayesian;Neural Networks;Generative Models", "primary_area": "", "supplementary_material": "", "author": "Melrose Roderick;Felix Berkenkamp;Fatemeh Sheikholeslami;J Zico Kolter", "authorids": "~Melrose_Roderick1;~Felix_Berkenkamp1;~Fatemeh_Sheikholeslami1;~J_Zico_Kolter1", "gender": "M;M;;M", "homepage": "https://melroderick.github.io/;https://berkenkamp.me;https://fatemehsheikholeslami.github.io/;http://www.zicokolter.com", "dblp": "181/3909;168/8558;;67/2526", "google_scholar": "PYrd2GMAAAAJ;https://scholar.google.ch/citations?user=N_tCEl8AAAAJ;XKFQX4gAAAAJ;UXh1I6UAAAAJ", "orcid": ";;;", "linkedin": "https://linkedin.com/in/melrose-roderick-4b74b199;berkenkamp/;;", "or_profile": "~Melrose_Roderick1;~Felix_Berkenkamp1;~Fatemeh_Sheikholeslami1;~Zico_Kolter1", "aff": "Carnegie Mellon University;Bosch;Bosch Center for AI;Carnegie Mellon University", "aff_domain": "cmu.edu;bosch.com;us.bosch.com;cmu.edu", "position": "PhD student;Research Scientist;Machine Learning Research Scientist;Full Professor", "bibtex": "@misc{\nroderick2022generative,\ntitle={Generative Posterior Networks for Approximately Bayesian Epistemic Uncertainty Estimation},\nauthor={Melrose Roderick and Felix Berkenkamp and Fatemeh Sheikholeslami and J Zico Kolter},\nyear={2022},\nurl={https://openreview.net/forum?id=WZeI0Vro15y}\n}", "github": "", "project": "", "reviewers": "vyt5;15Ce;v6ij;ZgaQ", "site": "https://openreview.net/forum?id=WZeI0Vro15y", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "2;4;4;3", "correctness": "2;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "84;102;39;64", "wc_summary_review": "39;49;42;24", "wc_main_review": "322;382;587;483", "wc_review": "445;533;668;571", "wc_reply_reviewers": "0;86;737;0", "wc_reply_authors": "609;521;650;232", "reply_reviewers": "0;1;3;0", "reply_authors": "2;2;4;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 72.25, 23.434749838647733 ], "wc_summary_review_avg": [ 38.5, 9.12414379544733 ], "wc_main_review_avg": [ 443.5, 100.86748732867296 ], "wc_review_avg": [ 554.25, 80.01054617986307 ], "wc_reply_reviewers_avg": [ 205.75, 308.72024148085916 ], "wc_reply_authors_avg": [ 503.0, 163.2559340422271 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6225430174794673, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:njaGl6CstJwJ:scholar.google.com/&scioq=Generative+Posterior+Networks+for+Approximately+Bayesian+Epistemic+Uncertainty+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;Robert Bosch GmbH;Bosch Center for AI", "aff_unique_dep": ";;Center for AI", "aff_unique_url": "https://www.cmu.edu;https://www.bosch.com;https://www.bosch-ai.com", "aff_unique_abbr": "CMU;Bosch;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;Germany" }, { "id": "WcZUevpX3H3", "title": "Personalized Neural Architecture Search for Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Learning (FL) is a recently proposed learning paradigm for decentralized devices to collaboratively train a predictive model without exchanging private data. Existing FL frameworks, however, assume a one-size-fit-all model architecture to be collectively trained by local devices, which is determined prior to observing their data. Even with good engineering acumen, this often falls apart when local tasks are different and require diverging choices of architecture modelling to learn effectively. This motivates us to develop a novel personalized neural architecture search (NAS) algorithm for FL. Our algorithm, FedPNAS, learns a base architecture that can be structurally personalized for quick adaptation to each local task. We empirically show that FedPNAS significantly outperforms other NAS and FL benchmarks on several real-world datasets.", "keywords": "Personalized Learning;Federated Learning", "primary_area": "", "supplementary_material": "/attachment/d3c98d8dcf9bbd6855fa4b743d506c760bc83e4e.zip", "author": "Minh Hoang;Carl Kingsford", "authorids": "~Minh_Hoang1;~Carl_Kingsford1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "56Mb6DY0_NUC;", "orcid": ";", "linkedin": ";", "or_profile": "~Minh_Hoang1;~Carl_Kingsford1", "aff": "Carnegie Mellon University;", "aff_domain": "cmu.edu;", "position": "PhD student;", "bibtex": "@misc{\nhoang2022personalized,\ntitle={Personalized Neural Architecture Search for Federated Learning},\nauthor={Minh Hoang and Carl Kingsford},\nyear={2022},\nurl={https://openreview.net/forum?id=WcZUevpX3H3}\n}", "github": "", "project": "", "reviewers": "EuZf;RES6;MieJ;1UyE", "site": "https://openreview.net/forum?id=WcZUevpX3H3", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "41;52;34;70", "wc_summary_review": "10;34;31;59", "wc_main_review": "250;197;196;263", "wc_review": "301;283;261;392", "wc_reply_reviewers": "0;0;81;0", "wc_reply_authors": "150;169;167;215", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 49.25, 13.589977924926883 ], "wc_summary_review_avg": [ 33.5, 17.38533865071371 ], "wc_main_review_avg": [ 226.5, 30.35210042155238 ], "wc_review_avg": [ 309.25, 49.83159138538524 ], "wc_reply_reviewers_avg": [ 20.25, 35.074028853269766 ], "wc_reply_authors_avg": [ 175.25, 24.107830678018296 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 
], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6713655448196219147&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "Wf5EN11MvQ3", "title": "Free Hyperbolic Neural Networks with Limited Radii", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Non-Euclidean geometry with constant negative curvature, i.e., hyperbolic space, has attracted sustained attention in the community of machine learning. Hyperbolic space, owing to its ability to embed hierarchical structures continuously with low distortion, has been applied for learning data with tree-like structures. Hyperbolic Neural Networks (HNNs) that operate directly in hyperbolic space have also been proposed recently to further exploit the potential of hyperbolic representations. While HNNs have achieved better performance than Euclidean neural networks (ENNs) on datasets with implicit hierarchical structure, they still perform poorly on standard classification benchmarks such as CIFAR and ImageNet. The traditional wisdom is that it is critical for the data to respect the hyperbolic geometry when applying HNNs. In this paper, we first conduct an empirical study showing that the inferior performance of HNNs on standard recognition datasets can be attributed to the notorious vanishing gradient problem. We further discovered that this problem stems from the hybrid architecture of HNNs. Our analysis leads to a simple yet effective solution called Feature Clipping, which regularizes the hyperbolic embedding whenever its norm exceeding a given threshold. Our thorough experiments show that the proposed method can successfully avoid the vanishing gradient problem when training HNNs with backpropagation. 
The improved HNNs are able to achieve comparable performance with ENNs on standard image recognition datasets including MNIST, CIFAR10, CIFAR100 and ImageNet, while demonstrating more adversarial robustness and stronger out-of-distribution detection capability.", "keywords": "Geometric deep learning;Hyperbolic neural network;Vanishing gradient problem", "primary_area": "", "supplementary_material": "", "author": "Yunhui Guo;Xudong Wang;Yubei Chen;Stella Yu", "authorids": "~Yunhui_Guo2;~Xudong_Wang4;~Yubei_Chen1;~Stella_Yu2", "gender": "M;M;M;F", "homepage": "https://yunhuiguo.github.io/;http://people.eecs.berkeley.edu/~xdwang/;https://redwood.berkeley.edu/people/yubei-chen/;http://www.eecs.umich.edu/~stellayu", "dblp": "165/3105;;30/10064;58/5089", "google_scholar": "BxIXuZYAAAAJ;Azf07WcAAAAJ;WeyLqFUAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;yubei-chen-05998a39/;", "or_profile": "~Yunhui_Guo2;~Xudong_Wang4;~Yubei_Chen1;~Stella_Yu2", "aff": "University of California, Berkeley;FAIR Labs, Meta;Facebook AI Research;University of California, Berkeley", "aff_domain": "berkeley.edu;fb.com;facebook.com;berkeley.edu", "position": "Postdoc;Research Intern;Postdoc Researcher;Director, ICSI Vision Group", "bibtex": "@misc{\nguo2022free,\ntitle={Free Hyperbolic Neural Networks with Limited Radii},\nauthor={Yunhui Guo and Xudong Wang and Yubei Chen and Stella Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=Wf5EN11MvQ3}\n}", "github": "", "project": "", "reviewers": "38HB;RbTY;1DSc;uSkW", "site": "https://openreview.net/forum?id=Wf5EN11MvQ3", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;4;3", "correctness": "4;3;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "126;45;55;50", "wc_summary_review": "56;30;57;40", "wc_main_review": "305;344;254;217", "wc_review": "487;419;366;307", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 69.0, 33.09833832687073 ], "wc_summary_review_avg": [ 45.75, 11.321991874224253 ], "wc_main_review_avg": [ 280.0, 48.38904834774083 ], "wc_review_avg": [ 394.75, 66.3791194578536 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.14002800840280097, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2026130507914578640&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of California, Berkeley;Meta", "aff_unique_dep": ";FAIR Labs", "aff_unique_url": "https://www.berkeley.edu;https://meta.com", "aff_unique_abbr": "UC Berkeley;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Model Zoo: A Growing Brain That Learns Continually", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6649", "id": "WfvgGBcgbE7", "poster": "", "openreview": "https://openreview.net/forum?id=WfvgGBcgbE7", "slides": 
"https://iclr.cc/virtual/2022/poster/6649", "video": "https://iclr.cc/virtual/2022/poster/6649", "author_site": "Rahul Ramesh, Pratik A Chaudhari", "tldr": "", "abstract": "This paper argues that continual learning methods can benefit by splitting the capacity of the learner across multiple models. We use statistical learning theory and experimental analysis to show how multiple tasks can interact with each other in a non-trivial fashion when a single model is trained on them. The generalization error on a particular task can improve when it is trained with synergistic tasks, but can also deteriorate when trained with competing tasks. This theory motivates our method named Model Zoo which, inspired from the boosting literature, grows an ensemble of small models, each of which is trained during one episode of continual learning. We demonstrate that Model Zoo obtains large gains in accuracy on a wide variety of continual learning benchmark problems.", "keywords": "Continual Learning;Learning Theory", "primary_area": "", "supplementary_material": "/attachment/eff66ec7507a03e05044d3a479987d2279e8aaf6.zip", "author": "Rahul Ramesh;Pratik Chaudhari", "authorids": "~Rahul_Ramesh2;~Pratik_Chaudhari1", "gender": "M;M", "homepage": "https://cis.upenn.edu/~rahulram;https://pratikac.github.io/", "dblp": "168/7029;", "google_scholar": "wCa6nygAAAAJ;c_z5hWEAAAAJ", "orcid": ";", "linkedin": ";pratik-chaudhari-59508765", "or_profile": "~Rahul_Ramesh2;~Pratik_Chaudhari1", "aff": "University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania", "aff_domain": "upenn.edu;seas.upenn.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nramesh2022model,\ntitle={Model Zoo: A Growing Brain That Learns Continually},\nauthor={Rahul Ramesh and Pratik Chaudhari},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WfvgGBcgbE7}\n}", "github": "", "project": "", "reviewers": "R6VS;Cn5V;KWuM;LD4g", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;5;3", "correctness": "2;3;3;4", "technical_novelty": "4;3;2;3", "empirical_novelty": "4;3;2;3", "wc_summary_paper": "113;262;89;162", "wc_summary_review": "50;100;110;54", "wc_main_review": "782;862;792;123", "wc_review": "945;1224;991;339", "wc_reply_reviewers": "1854;0;788;0", "wc_reply_authors": "7262;1700;3624;357", "reply_reviewers": "5;0;1;0", "reply_authors": "12;3;5;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 156.5, 66.34945365261119 ], "wc_summary_review_avg": [ 78.5, 26.77218706045511 ], "wc_main_review_avg": [ 639.75, 299.9336384935841 ], "wc_review_avg": [ 874.75, 326.899353777275 ], "wc_reply_reviewers_avg": [ 660.5, 760.4635099727008 ], "wc_reply_authors_avg": [ 3235.75, 2598.420517833863 ], "reply_reviewers_avg": [ 1.5, 2.0615528128088303 ], "reply_authors_avg": [ 5.25, 4.14578098794425 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7739075327593015355&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=WfvgGBcgbE7", "email": "upenn.edu;seas.upenn.edu", 
"author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "WlPPBKnOB4w", "title": "One Stage Autoencoders for Multi-Domain Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Autoencoders (AEs) are widely being used for representation learning. Empirically AEs are capable of capturing hidden representations of a given domain precisely. However, in principle AEs\u2019 latent representation might be misleading, especially in the presence of weak encoding constraints. In this paper, we introduce one stage autoencoders (OSAs) to induce searching for patterns while training artificial neural networks. We propose two different frameworks for OSAs; Autoclave Restricted Boltzmann Machines (ACRBMs) and Local Observer Convolution (LOC). Both frameworks are compatible with artificial neural networks and trained via direct backpropagation (end-to-end training). Furthermore, they are scalable and require significantly less number of parameters than traditional AEs. ACRBMs are extensions of RBMs that are able to describe a given domain symmetrically. LOC is a density based clustering algorithm that implicitly draws a spatial graph from input domains. Unlike standard clustering algorithms that require specifying the expected number of clusters, we believe that LOC is the first neural network compatible algorithm capable of dynamically choosing the appropriate number of clusters that best fit a given domain. Both ACRBMs and LOC were evaluated in terms of unsupervised learning. Experiments showed that both structures of shallow ACRBMs and AE-based ACRBMs outperformed K-means for image clustering using the same number of clusters. 
Similarly, LOC outperformed K-means in terms of unsupervised image segmentation.", "keywords": "energy-based models;probabilistic models;autoencoders;optimization;learning representations;unsupervised learning", "primary_area": "", "supplementary_material": "/attachment/d68ea7fdd967f621c39d3c51b4ef69838018e7e0.zip", "author": "Mohamed Zayan;Dina Khattab", "authorids": "~Mohamed_Zayan1;~Dina_Khattab1", "gender": ";M", "homepage": "https://www.asu.edu.eg;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";m-zayan/", "or_profile": "~Dina_Khattab1;~Mohamed_Osama_Zayan1", "aff": "Fcis;Ain Shams University", "aff_domain": "cis.asu.edu.eg;asu.edu.eg", "position": "Lecturer;Undergrad student", "bibtex": "@misc{\nzayan2022one,\ntitle={One Stage Autoencoders for Multi-Domain Learning},\nauthor={Mohamed Zayan and Dina Khattab},\nyear={2022},\nurl={https://openreview.net/forum?id=WlPPBKnOB4w}\n}", "github": "", "project": "", "reviewers": "sfM7;GAXA;1HSs;a1Zd", "site": "https://openreview.net/forum?id=WlPPBKnOB4w", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "4;2;3;3", "correctness": "1;2;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "0;1;1;2", "wc_summary_paper": "95;55;51;39", "wc_summary_review": "57;30;25;29", "wc_main_review": "283;769;365;194", "wc_review": "435;854;441;262", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 60.0, 21.047565179849187 ], "wc_summary_review_avg": [ 35.25, 12.695963925594622 ], "wc_main_review_avg": [ 402.75, 219.93223388125716 ], "wc_review_avg": [ 498.0, 217.74411587916674 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eR4KhhYu_qQJ:scholar.google.com/&scioq=One+Stage+Autoencoders+for+Multi-Domain+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Faculty of Computers and Information Sciences;Ain Shams University", "aff_unique_dep": ";", "aff_unique_url": ";https://www.ashams.edu.eg", "aff_unique_abbr": "FCIS;ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";Egypt" }, { "title": "Scene Transformer: A unified architecture for predicting future trajectories of multiple agents", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5969", "id": "Wm3EA5OlHsG", "poster": "", "openreview": "https://openreview.net/forum?id=Wm3EA5OlHsG", "slides": "https://iclr.cc/virtual/2022/poster/5969", "video": "https://iclr.cc/virtual/2022/poster/5969", "author_site": "Jiquan Ngiam, Vijay Vasudevan, Benjamin Caine, Zhengdong Zhang, Hao-Tien (Lewis) Chiang, Jeffrey Ling, Rebecca Roelofs, Alex Bewley, Chenxi Liu, Ashish Venugopal, David Weiss, Ben Sapp, Zhifeng Chen, Jonathon Shlens", "tldr": "", "abstract": "Predicting the motion of multiple agents is necessary for planning in dynamic environments. 
This task is challenging for autonomous driving since agents (e.g., vehicles and pedestrians) and their associated behaviors may be diverse and influence one another. Most prior work has focused on predicting independent futures for each agent based on all past motion, and planning against these independent predictions. However, planning against independent predictions can make it challenging to represent the future interaction possibilities between different agents, leading to sub-optimal planning. In this work, we formulate a model for predicting the behavior of all agents jointly, producing consistent futures that account for interactions between agents. Inspired by recent language modeling approaches, we use a masking strategy as the query to our model, enabling one to invoke a single model to predict agent behavior in many ways, such as potentially conditioned on the goal or full future trajectory of the autonomous vehicle or the behavior of other agents in the environment. Our model architecture employs attention to combine features across road elements, agent interactions, and time steps. We evaluate our approach on autonomous driving datasets for both marginal and joint motion prediction, and achieve state-of-the-art performance across two popular datasets. Through combining a scene-centric approach, agent permutation equivariant model, and a sequence masking strategy, we show that our model can unify a variety of motion prediction tasks from joint motion predictions to conditioned prediction.", "keywords": "trajectory prediction;motion forecasting;multi-task learning;attention;autonomous vehicles", "primary_area": "", "supplementary_material": "", "author": "Jiquan Ngiam;Vijay Vasudevan;Benjamin Caine;Zhengdong Zhang;Hao-Tien Lewis Chiang;Jeffrey Ling;Rebecca Roelofs;Alex Bewley;Chenxi Liu;Ashish Venugopal;David J Weiss;Benjamin Sapp;Zhifeng Chen;Jonathon Shlens", "authorids": "~Jiquan_Ngiam1;~Vijay_Vasudevan1;~Benjamin_Caine1;~Zhengdong_Zhang3;~Hao-Tien_Lewis_Chiang1;~Jeffrey_Ling1;~Rebecca_Roelofs1;~Alex_Bewley1;~Chenxi_Liu1;~Ashish_Venugopal1;~David_J_Weiss1;~Benjamin_Sapp3;~Zhifeng_Chen1;~Jonathon_Shlens1", "gender": "M;;M;M;M;;F;Unspecified;;M;;M;M;", "homepage": "http://ngi.am/;https://vijay.vasu.org;;https://zhengdong-mit.github.io/;https://sites.google.com/view/lewispro/home;https://jeffreyling.github.io;;https://alex.bewley.ai/;;;;;;", "dblp": "72/8781;08/2793;;;;157/8170;145/2224;39/9969;146/8008;16/3259;06/7944;54/5582;61/5154;", "google_scholar": "n0pk_jEAAAAJ;;KS-nDCMAAAAJ;https://scholar.google.com/citations?hl=en;megAxigAAAAJ;pfKbmL4AAAAJ;;https://scholar.google.co.uk/citations?user=UO32CB0AAAAJ;;;27RgVF4AAAAJ;aPqcyU4AAAAJ;;", "orcid": ";;;;;;;0000-0002-8428-9264;;;;;;", "linkedin": "jngiam;;;;hao-tien-lewis-chiang-22598a79/;;;;;;;;;", "or_profile": "~Jiquan_Ngiam1;~Vijay_Vasudevan1;~Benjamin_Caine1;~Zhengdong_Zhang3;~Hao-Tien_Lewis_Chiang1;~Jeffrey_Ling1;~Rebecca_Roelofs1;~Alex_Bewley1;~Chenxi_Liu1;~Ashish_Venugopal1;~David_J_Weiss1;~Benjamin_Sapp3;~Zhifeng_Chen1;~Jonathon_Shlens1", "aff": "Google;Google;Google Brain;;Google Deepmind;Waymo;Google;Google;Waymo;;Waymo;Waymo;Google;", "aff_domain": "google.com;google.com;google.com;;deepmind.com;waymo.com;google.com;google.com;waymo.com;;waymo.com;waymo.com;google.com;", "position": "Research Engineer;Software Engineer;Research Software Engineer;;Researcher;Research Engineer;Research scientist;Research Scientist;Researcher;;Software Engineer;Researcher;Engineer;", "bibtex":
"@inproceedings{\nngiam2022scene,\ntitle={Scene Transformer: A unified architecture for predicting future trajectories of multiple agents},\nauthor={Jiquan Ngiam and Vijay Vasudevan and Benjamin Caine and Zhengdong Zhang and Hao-Tien Lewis Chiang and Jeffrey Ling and Rebecca Roelofs and Alex Bewley and Chenxi Liu and Ashish Venugopal and David J Weiss and Benjamin Sapp and Zhifeng Chen and Jonathon Shlens},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Wm3EA5OlHsG}\n}", "github": "", "project": "", "reviewers": "LaWD;evQV;RDXN;ADF1", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;5;5;4", "correctness": "3;4;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "30;47;242;74", "wc_summary_review": "34;106;59;47", "wc_main_review": "1057;392;976;151", "wc_review": "1121;545;1277;272", "wc_reply_reviewers": "0;285;9;0", "wc_reply_authors": "519;1116;1062;76", "reply_reviewers": "0;2;1;0", "reply_authors": "1;3;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 98.25, 84.46411960116556 ], "wc_summary_review_avg": [ 61.5, 27.17075633838705 ], "wc_main_review_avg": [ 644.0, 383.192510365221 ], "wc_review_avg": [ 803.75, 410.585785798778 ], "wc_reply_reviewers_avg": [ 73.5, 122.16484764448406 ], "wc_reply_authors_avg": [ 693.25, 426.04423185861816 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 14, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.19245008972987526, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=Wm3EA5OlHsG", "email": "google.com;google.com;google.com;;deepmind.com;waymo.com;google.com;google.com;waymo.com;;waymo.com;waymo.com;google.com;", "author_num": 14, "aff_unique_index": "0;0;0;1;2;0;0;2;2;2;0", "aff_unique_norm": "Google;DeepMind;Waymo", "aff_unique_dep": "Google;DeepMind;", "aff_unique_url": "https://www.google.com;https://deepmind.com;https://www.waymo.com", "aff_unique_abbr": "Google;DeepMind;Waymo", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1;0;0;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "WnOLO1f50MH", "title": "Exploiting Redundancy: Separable Group Convolutional Networks on Lie Groups", "track": "main", "status": "Reject", "tldr": "", "abstract": "Group convolutional neural networks (G-CNNs) have been shown to increase parameter efficiency and model accuracy by incorporating geometric inductive biases. In this work, we investigate the properties of representations learned by regular G-CNNs, and show considerable parameter redundancy in group convolution kernels. This finding motivates further weight-tying by sharing convolution kernels over subgroups. To this end, we introduce convolution kernels that are separable over the subgroup and channel dimensions. In order to obtain equivariance to arbitrary affine Lie groups we provide a continuous parameterisation of separable convolution kernels. We evaluate our approach across several vision datasets, and show that our weight sharing leads to improved performance and computational efficiency. 
In many settings, separable G-CNNs outperform their non-separable counterpart, while only using a fraction of their training time. In addition, thanks to the increase in computational efficiency, we are able to implement G-CNNs equivariant to the $\\mathrm{Sim(2)}$ group; the group of dilations, rotations and translations. $\\mathrm{Sim(2)}$-equivariance further improves performance on all tasks considered.", "keywords": "Group equivariance;separable convolutions;group equivariant neural networks", "primary_area": "", "supplementary_material": "", "author": "David M Knigge;David W. Romero;Erik J Bekkers", "authorids": "~David_M_Knigge1;~David_W._Romero1;~Erik_J_Bekkers1", "gender": "M;;M", "homepage": "https://davidwromero.xyz/;https://erikbekkers.bitbucket.io/;", "dblp": "254/1396;43/5596;304/8106", "google_scholar": "7tdzmVoAAAAJ;https://scholar.google.nl/citations?user=yeWrfR4AAAAJ;Csnj-pQAAAAJ", "orcid": ";;0000-0001-5272-3313", "linkedin": "david-w-romero-05893567/;;https://linkedin.com/in/david-knigge", "or_profile": "~David_W._Romero1;~Erik_J_Bekkers1;~David_Mattanja_Knigge1", "aff": "Vrije Universiteit Amsterdam;University of Amsterdam;University of Amsterdam", "aff_domain": "vu.nl;uva.nl;uva.nl", "position": "PhD student;Assistant Professor;PhD student", "bibtex": "@misc{\nknigge2022exploiting,\ntitle={Exploiting Redundancy: Separable Group Convolutional Networks on Lie Groups},\nauthor={David M Knigge and David W. Romero and Erik J Bekkers},\nyear={2022},\nurl={https://openreview.net/forum?id=WnOLO1f50MH}\n}", "github": "", "project": "", "reviewers": "4A1c;m8cZ;xJAv;hC39", "site": "https://openreview.net/forum?id=WnOLO1f50MH", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;5;3;3", "correctness": "4;2;4;4", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;1;1;3", "wc_summary_paper": "65;55;63;36", "wc_summary_review": "45;80;39;27", "wc_main_review": "538;506;186;166", "wc_review": "648;641;288;229", "wc_reply_reviewers": "282;0;0;0", "wc_reply_authors": "2800;2061;1204;1013", "reply_reviewers": "1;0;0;0", "reply_authors": "5;4;2;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 54.75, 11.453711188955307 ], "wc_summary_review_avg": [ 47.75, 19.715159142142372 ], "wc_main_review_avg": [ 349.0, 173.51368822084325 ], "wc_review_avg": [ 451.5, 194.13976923855657 ], "wc_reply_reviewers_avg": [ 70.5, 122.10958193360585 ], "wc_reply_authors_avg": [ 1769.5, 713.9651602144183 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 1.299038105676658 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7385489458759963, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15152080644760721791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Vrije Universiteit Amsterdam;University of Amsterdam", "aff_unique_dep": ";", "aff_unique_url": "https://www.vu.nl;https://www.uva.nl", "aff_unique_abbr": "VU Amsterdam;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Top-label calibration and multiclass-to-binary reductions", "status": "Poster", "track": 
"main", "site": "https://iclr.cc/virtual/2022/poster/6278", "id": "WqoBaaPHS-", "poster": "", "openreview": "https://openreview.net/forum?id=WqoBaaPHS-", "slides": "https://iclr.cc/virtual/2022/poster/6278", "video": "https://iclr.cc/virtual/2022/poster/6278", "author_site": "Chirag Gupta, Aaditya Ramdas", "tldr": "", "abstract": "We propose a new notion of multiclass calibration called top-label calibration. A classifier is said to be top-label calibrated if the reported probability for the predicted class label---the top-label---is calibrated, conditioned on the top-label. This conditioning is essential for practical utility of the calibration property, since the top-label is always reported and we must condition on what is reported. However, the popular notion of confidence calibration erroneously skips this conditioning. Furthermore, we outline a multiclass-to-binary (M2B) reduction framework that unifies confidence, top-label, and class-wise calibration, among others. As its name suggests, M2B works by reducing multiclass calibration to different binary calibration problems; various types of multiclass calibration can then be achieved using simple binary calibration routines. We instantiate the M2B framework with the well-studied histogram binning (HB) binary calibrator, and prove that the overall procedure is multiclass calibrated without making any assumptions on the underlying data distribution. In an empirical evaluation with four deep net architectures on CIFAR-10 and CIFAR-100, we find that the M2B + HB procedure achieves lower top-label and class-wise calibration error than other approaches such as temperature scaling. Code for this work is available at https://github.com/aigen/df-posthoc-calibration.", "keywords": "calibration;multiclass;uncertainty quantification;distribution-free;histogram binning", "primary_area": "", "supplementary_material": "/attachment/8c0bb4e26bc860bba21f67effcdb9613904271cd.zip", "author": "Chirag Gupta;Aaditya Ramdas", "authorids": "~Chirag_Gupta1;~Aaditya_Ramdas2", "gender": "M;M", "homepage": "https://aigen.github.io;http://stat.cmu.edu/~aramdas", "dblp": "64/4756;117/3518", "google_scholar": "2ALBM1sAAAAJ;ZvFaPxUAAAAJ", "orcid": ";0000-0003-0497-311X", "linkedin": ";", "or_profile": "~Chirag_Gupta1;~Aaditya_Ramdas2", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\ngupta2022toplabel,\ntitle={Top-label calibration and multiclass-to-binary reductions},\nauthor={Chirag Gupta and Aaditya Ramdas},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WqoBaaPHS-}\n}", "github": "", "project": "", "reviewers": "xjQt;8zku;NPUy;ZncC", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "210;124;107;42", "wc_summary_review": "31;55;11;18", "wc_main_review": "633;361;195;117", "wc_review": "874;540;313;177", "wc_reply_reviewers": "203;146;53;0", "wc_reply_authors": "1319;1318;288;86", "reply_reviewers": "2;1;1;0", "reply_authors": "4;3;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 120.75, 59.930689133364716 ], "wc_summary_review_avg": [ 28.75, 
16.768646337734005 ], "wc_main_review_avg": [ 326.5, 197.68345909559557 ], "wc_review_avg": [ 476.0, 263.8512838702893 ], "wc_reply_reviewers_avg": [ 100.5, 78.95093413000254 ], "wc_reply_authors_avg": [ 752.75, 570.2400262170308 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5210721734640980720&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=WqoBaaPHS-", "email": "cmu.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Wsif-S7ggTM", "title": "Cross-Stage Transformer for Video Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformer networks have proved efficient in modeling long-range dependencies in video learning. However, videos contain rich contextual information in both spatial and temporal dimensions, e.g., scenes and temporal reasoning. In traditional transformer networks, stacked transformer blocks work in a sequential and independent way, which may lead to the inefficient propagation of such contextual information. To address this problem, we propose a cross-stage transformer paradigm, which allows us to fuse self-attentions and features from different blocks. By inserting the proposed cross-stage mechanism in existing spatial and temporal transformer blocks, we build a separable transformer network for video learning based on the ViT structure, in which self-attentions and features are progressively aggregated from one block to the next. Extensive experiments show that our approach outperforms existing ViT-based video transformer approaches with the same pre-training dataset on mainstream video action recognition datasets of Kinetics-400 (Top-1 accuracy 81.8%) and Kinetics-600 (Top-1 accuracy 84.0%). Due to the effectiveness of the cross-stage transformer, our proposed method achieves comparable performance with other ViT-based approaches with much lower computation cost (e.g., 8.6% of ViViT\u2019s FLOPs) in the inference process.
As an independent module, our proposed method can be conveniently added to other video transformer frameworks.", "keywords": "Transformer;video recognition", "primary_area": "", "supplementary_material": "", "author": "Leroy Lin;Xun Guo;Yan Lu", "authorids": "~Leroy_Lin1;~Xun_Guo1;~Yan_Lu7", "gender": "M;M;M", "homepage": "https://yuanze-lin.me/;;https://www.microsoft.com/en-us/research/people/yanlu/", "dblp": "299/9155;32/5851;15/4830-1", "google_scholar": "0WFC2w0AAAAJ;Ow4R8-EAAAAJ;djk5l-4AAAAJ", "orcid": ";;0000-0001-5383-6424", "linkedin": ";;", "or_profile": "~Leroy_Lin1;~Xun_Guo1;~Yan_Lu7", "aff": ";Microsoft Research Asia;Microsoft Research Asia", "aff_domain": ";microsoft.com;microsoft.com", "position": ";Principal Researcher;Partner Research Manager", "bibtex": "@misc{\nlin2022crossstage,\ntitle={Cross-Stage Transformer for Video Learning},\nauthor={Leroy Lin and Xun Guo and Yan Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=Wsif-S7ggTM}\n}", "github": "", "project": "", "reviewers": "hZyt;WNnz;a4AF;h49B", "site": "https://openreview.net/forum?id=Wsif-S7ggTM", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "5;4;4;5", "correctness": "2;4;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "96;63;87;40", "wc_summary_review": "25;26;46;36", "wc_main_review": "373;140;182;516", "wc_review": "494;229;315;592", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.5, 21.823152842795196 ], "wc_summary_review_avg": [ 33.25, 8.525696452489967 ], "wc_main_review_avg": [ 302.75, 151.22727101948246 ], "wc_review_avg": [ 407.5, 143.12669212973518 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7sKHIGheHHEJ:scholar.google.com/&scioq=Cross-Stage+Transformer+for+Video+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Research", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "MSR Asia", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Asia", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "WtPHnvDUk5X", "title": "GANet: Glyph-Attention Network for Few-Shot Font Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Font generation is a valuable but challenging task; it is time-consuming and costly to design font libraries which cover all glyphs with various styles. The time and cost of such a task will be greatly reduced if the complete font library can be generated from only a few custom samples. Inspired by font characteristics and the global and local attention mechanism of Wang et al. (2018), we propose a glyph-attention network (GANet) to tackle this problem. Firstly, a content encoder and a style encoder are trained to extract features as keys and values from a content glyph set and a style glyph set, respectively.
Secondly, a query vector generated from a single glyph sample by the query encoder is applied to draw out proper features from the content and style (key, value) pairs via glyph-attention modules. Next, a decoder is used to recover a glyph from the queried features. Lastly, adversarial losses Goodfellow et al. (2014) with a multi-task glyph discriminator are employed to stabilize the training process. Experimental results demonstrate that our method is able to create robust results with superior fidelity. Fewer samples are needed and better performance is achieved when compared to other state-of-the-art few-shot font generation methods, without utilizing supervision on locality such as component, skeleton, or strokes, etc.", "keywords": "font generation;GANet;glyph-attention;few-shot;GAN", "primary_area": "", "supplementary_material": "", "author": "Mingtao Guo;Wei Xiong;Zheng Wang;Yong Tang;Ting Wu", "authorids": "~Mingtao_Guo1;~Wei_Xiong8;~Zheng_Wang17;~Yong_Tang4;~Ting_Wu1", "gender": "M;M;;M;M", "homepage": "https://github.com/MingtaoGuo;https://www.pingpongx.com;http://www.workerwonder.com;;https://github.com/wuutiing", "dblp": ";;;;", "google_scholar": ";;;ubVWrRwAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Mingtao_Guo1;~Wei_Xiong8;~Zheng_Wang17;~Yong_Tang4;~Ting_Wu1", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nguo2022ganet,\ntitle={{GAN}et: Glyph-Attention Network for Few-Shot Font Generation},\nauthor={Mingtao Guo and Wei Xiong and Zheng Wang and Yong Tang and Ting Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=WtPHnvDUk5X}\n}", "github": "", "project": "", "reviewers": "GLJu;6xeo;oLEK;ihg3", "site": "https://openreview.net/forum?id=WtPHnvDUk5X", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;5;5;4", "correctness": "4;3;3;3", "technical_novelty": "1;3;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "41;80;77;55", "wc_summary_review": "46;30;39;54", "wc_main_review": "220;276;836;240", "wc_review": "307;386;952;349", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.25, 16.068213964221414 ], "wc_summary_review_avg": [ 42.25, 8.842369591913696 ], "wc_main_review_avg": [ 393.0, 256.5521389503506 ], "wc_review_avg": [ 498.5, 263.3158749487011 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PcEBVinHAG4J:scholar.google.com/&scioq=GANet:+Glyph-Attention+Network+for+Few-Shot+Font+Generation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Model-augmented Prioritized Experience Replay", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6032", "id": "WuEiafqdy9H", "poster": "", "openreview": "https://openreview.net/forum?id=WuEiafqdy9H", "slides": "https://iclr.cc/virtual/2022/poster/6032", "video": "https://iclr.cc/virtual/2022/poster/6032", "author_site": "Youngmin Oh, Jinwoo Shin, Eunho Yang, Sung Ju Hwang",
"tldr": "", "abstract": "Experience replay is an essential component in off-policy model-free reinforcement learning (MfRL). Due to its effectiveness, various methods for calculating priority scores on experiences have been proposed for sampling. Since critic networks are crucial to policy learning, TD-error, directly correlated to $Q$-values, is one of the most frequently used features to compute the scores. However, critic networks often under- or overestimate $Q$-values, so it is often ineffective to learn to predict $Q$-values by sampled experiences based heavily on TD-error. Accordingly, it is valuable to find auxiliary features, which positively support TD-error in calculating the scores for efficient sampling. Motivated by this, we propose a novel experience replay method, which we call model-augmented prioritized experience replay (MaPER), that employs new learnable features driven from components in model-based RL (MbRL) to calculate the scores on experiences. The proposed MaPER brings the effect of curriculum learning for predicting $Q$-values better by the critic network with negligible memory and computational overhead compared to the vanilla PER. Indeed, our experimental results on various tasks demonstrate that MaPER can significantly improve the performance of the state-of-the-art off-policy MfRL and MbRL which includes off-policy MfRL algorithms in its policy optimization procedure.", "keywords": "RL;Reinforcement Learning;Replay Buffer", "primary_area": "", "supplementary_material": "/attachment/c2a384a11b9cb27425cf957305bfd14ad2c69a7c.zip", "author": "Youngmin Oh;Jinwoo Shin;Eunho Yang;Sung Ju Hwang", "authorids": "~Youngmin_Oh2;~Jinwoo_Shin1;~Eunho_Yang1;~Sung_Ju_Hwang1", "gender": "M;M;M;", "homepage": "https://sites.google.com/view/youngmin0oh;https://sites.google.com/site/mijirim/;https://sites.google.com/site/hleehome2/;", "dblp": ";31/7062;96/2621;", "google_scholar": "_6sDqYYAAAAJ;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ;;", "orcid": "0000-0002-9279-0155;;;", "linkedin": "youngmin-oh-124593166/;;;", "or_profile": "~Youngmin_Oh2;~Jinwoo_Shin1;~Eunho_Yang1;~Sung_Ju_Hwang1", "aff": "Samsung;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;", "aff_domain": "samsung.com;kaist.ac.kr;kaist.ac.kr;", "position": "Researcher;Associate Professor;Associate Professor;", "bibtex": "@inproceedings{\noh2022modelaugmented,\ntitle={Model-augmented Prioritized Experience Replay},\nauthor={Youngmin Oh and Jinwoo Shin and Eunho Yang and Sung Ju Hwang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WuEiafqdy9H}\n}", "github": "", "project": "", "reviewers": "noEm;K56x;rZAM;mk5C", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;5;4;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "75;47;115;96", "wc_summary_review": "4;24;44;18", "wc_main_review": "591;194;214;186", "wc_review": "670;265;373;300", "wc_reply_reviewers": "284;0;0;0", "wc_reply_authors": "1901;311;587;344", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 83.25, 25.262373206015305 ], "wc_summary_review_avg": [ 22.5, 
14.378803844548406 ], "wc_main_review_avg": [ 296.25, 170.47928759822994 ], "wc_review_avg": [ 402.0, 159.5603334165481 ], "wc_reply_reviewers_avg": [ 71.0, 122.97560733739029 ], "wc_reply_authors_avg": [ 785.75, 652.6512755675882 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.058025885318565944, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8781721173927203460&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=WuEiafqdy9H", "email": "samsung.com;kaist.ac.kr;kaist.ac.kr;", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Samsung;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Samsung;", "aff_unique_url": "https://www.samsung.com;https://www.kaist.ac.kr", "aff_unique_abbr": "Samsung;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Assessing Generalization of SGD via Disagreement", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6300", "id": "WvOGCEAQhxl", "poster": "", "openreview": "https://openreview.net/forum?id=WvOGCEAQhxl", "slides": "https://iclr.cc/virtual/2022/poster/6300", "video": "https://iclr.cc/virtual/2022/poster/6300", "author_site": "Yiding Jiang, Vaishnavh Nagarajan, Christina Baek, Zico Kolter", "tldr": "", "abstract": "We empirically show that the test error of deep networks can be estimated by training the same architecture on the same training set but with two different runs of Stochastic Gradient Descent (SGD), and then measuring the disagreement rate between the two networks on unlabeled test data. This builds on -- and is a stronger version of -- the observation in Nakkiran&Bansal 20, which requires the runs to be on separate training sets. We further theoretically show that this peculiar phenomenon arises from the well-calibrated nature of ensembles of SGD-trained models. 
This finding not only provides a simple empirical measure to directly predict the test error using unlabeled test data, but also establishes a new conceptual connection between generalization and calibration.", "keywords": "Generalization;Deep Learning;Empirical Phenomenon;Accuracy Estimation;Stochastic Gradient Descent", "primary_area": "", "supplementary_material": "", "author": "Yiding Jiang;Vaishnavh Nagarajan;Christina Baek;J Zico Kolter", "authorids": "~Yiding_Jiang2;~Vaishnavh_Nagarajan3;~Christina_Baek2;~J_Zico_Kolter1", "gender": "M;;M;M", "homepage": "https://yidingjiang.github.io/;https://kebaek.github.io;http://www.zicokolter.com;https://vaishnavh.github.io/", "dblp": ";202/7238;67/2526;161/0079", "google_scholar": "x9qzWg8AAAAJ;;UXh1I6UAAAAJ;https://scholar.google.nl/citations?user=LrsjJfwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yiding_Jiang2;~Christina_Baek2;~Zico_Kolter1;~Vaishnavh_Nagarajan1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Google", "aff_domain": "andrew.cmu.edu;cmu.edu;cmu.edu;google.com", "position": "PhD student;PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\njiang2022assessing,\ntitle={Assessing Generalization of {SGD} via Disagreement},\nauthor={Yiding Jiang and Vaishnavh Nagarajan and Christina Baek and J Zico Kolter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WvOGCEAQhxl}\n}", "github": "", "project": "", "reviewers": "APKH;DeNB;5dWv;sYb8", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "65;110;121;75", "wc_summary_review": "25;84;33;14", "wc_main_review": "354;418;321;158", "wc_review": "444;612;475;247", "wc_reply_reviewers": "0;172;22;0", "wc_reply_authors": "467;1628;177;319", "reply_reviewers": "0;2;1;0", "reply_authors": "1;4;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.75, 23.34925052330374 ], "wc_summary_review_avg": [ 39.0, 26.842131062939096 ], "wc_main_review_avg": [ 312.75, 95.9097883430049 ], "wc_review_avg": [ 444.5, 130.37733698768355 ], "wc_reply_reviewers_avg": [ 48.5, 71.86619511286234 ], "wc_reply_authors_avg": [ 647.75, 575.1614447266089 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5961133293362778913&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=WvOGCEAQhxl", "email": "andrew.cmu.edu;cmu.edu;cmu.edu;google.com", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "WwKv20NrsfB", "title": "Apollo: An Adaptive Parameter-wised Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization", "track": "main", 
"status": "Reject", "tldr": "", "abstract": "In this paper, we introduce Apollo, a quasi-Newton method for nonconvex stochastic optimization, which dynamically incorporates the curvature of the loss function by approximating the Hessian via a diagonal matrix. Importantly, the update and storage of the diagonal approximation of Hessian is as efficient as adaptive first-order optimization methods with linear complexity for both time and memory. To handle nonconvexity, we replace the Hessian with its rectified absolute value, which is guaranteed to be positive-definite. Experiments on three tasks of vision and language show that Apollo achieves significant improvements over other stochastic optimization methods, including SGD and variants of Adam, in term of both convergence speed and generalization performance.", "keywords": "Optimization for Neural Networks;Optimization for Representation Learning;Stochastic Optimization;Nonconvex;Quasi-Newton;Optimization for Deep Learning", "primary_area": "", "supplementary_material": "/attachment/584d9bc4830486de19bafce51aff9032f40ad5ce.zip", "author": "Xuezhe Ma", "authorids": "~Xuezhe_Ma1", "gender": "M", "homepage": "https://xuezhemax.github.io/", "dblp": "127/0230", "google_scholar": "6_MQLIcAAAAJ", "orcid": "", "linkedin": "xuezhe-ma-b5354731", "or_profile": "~Xuezhe_Ma1", "aff": "USC/ISI", "aff_domain": "isi.edu", "position": "Assistant Professor", "bibtex": "@misc{\nma2022apollo,\ntitle={Apollo: An Adaptive Parameter-wised Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization},\nauthor={Xuezhe Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=WwKv20NrsfB}\n}", "github": "", "project": "", "reviewers": "XwAE;LyjG;p4ji;qbeR", "site": "https://openreview.net/forum?id=WwKv20NrsfB", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "77;64;98;53", "wc_summary_review": "37;19;33;56", "wc_main_review": "216;289;338;141", "wc_review": "330;372;469;250", "wc_reply_reviewers": "23;0;0;76", "wc_reply_authors": "20;38;193;306", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.0, 16.748134224444225 ], "wc_summary_review_avg": [ 36.25, 13.216939887886303 ], "wc_main_review_avg": [ 246.0, 74.56205469271887 ], "wc_review_avg": [ 355.25, 78.95370478957906 ], "wc_reply_reviewers_avg": [ 24.75, 31.04331651096577 ], "wc_reply_authors_avg": [ 139.25, 117.43801556565914 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13451820388334628730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://isi.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0", "aff_campus_unique": "ISI", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "WxBFVNbDUT6", "title": "Benchmarking Sample Selection Strategies for Batch Reinforcement Learning", 
"track": "main", "status": "Reject", "tldr": "", "abstract": "Training sample section techniques, such as prioritized experience replay (PER), have been recognized as of significant importance for online reinforcement learning algorithms. Efficient sample selection can help further improve the learning efficiency and the final learning performance. However, the impact of sample selection for batch reinforcement learning algorithms, where we aim to learn a near-optimal policy exclusively from the offline logged dataset, has not been well studied. In this work, we investigate the application of non-uniform sampling techniques in batch reinforcement learning. In particular, we compare six variants of PER based on various heuristic priority metrics that focus on different aspects of the offline learning setting. These metrics include temporal-difference error, n-step return, self-imitation learning objective, pseudo-count, uncertainty, and likelihood. Through extensive experiments on the standard batch RL datasets, we find that non-uniform sampling is also effective in batch RL settings. Furthermore, there is no single metric that works in all situations. Our findings also show that it is insufficient to avoid the bootstrapping error in batch reinforcement learning by only changing the sampling scheme.", "keywords": "Reinforcement Learning;Experience Replay", "primary_area": "", "supplementary_material": "", "author": "Yuwei Fu;Di Wu;Benoit Boulet", "authorids": "~Yuwei_Fu1;~Di_Wu11;~Benoit_Boulet1", "gender": "M;M;M", "homepage": "http://mcgillialab.com/people/;http://mcgillialab.com/people/;https://www.mcgill.ca/ece/benoit-boulet", "dblp": "200/1646;52/328-44.html;", "google_scholar": ";https://scholar.google.ca/citations?user=IbcoTsgAAAAJ;https://scholar.google.ca/citations?user=kkGyLY4AAAAJ", "orcid": ";;0000-0002-3191-3967", "linkedin": ";;benoit-boulet-97078012/", "or_profile": "~Yuwei_Fu1;~Di_Wu11;~Benoit_Boulet1", "aff": "McGill University;McGill University;McGill University", "aff_domain": "mcgill.ca;mcgill.ca;mcgill.ca", "position": "PhD student;Adjunct Professor;Full Professor", "bibtex": "@misc{\nfu2022benchmarking,\ntitle={Benchmarking Sample Selection Strategies for Batch Reinforcement Learning},\nauthor={Yuwei Fu and Di Wu and Benoit Boulet},\nyear={2022},\nurl={https://openreview.net/forum?id=WxBFVNbDUT6}\n}", "github": "", "project": "", "reviewers": "iELi;YFrP;VhFt;8KRS", "site": "https://openreview.net/forum?id=WxBFVNbDUT6", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;4", "correctness": "4;3;4;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "3;2;4;2", "wc_summary_paper": "40;92;74;50", "wc_summary_review": "37;54;101;75", "wc_main_review": "390;129;322;390", "wc_review": "467;275;497;515", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "94;231;106;426", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 64.0, 20.346989949375804 ], "wc_summary_review_avg": [ 66.75, 23.920441049445557 ], "wc_main_review_avg": [ 307.75, 106.86995602132528 ], "wc_review_avg": [ 438.5, 95.9413883576843 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 214.25, 133.50725635709844 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 
], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14382563296372998869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "McGill University", "aff_unique_dep": "", "aff_unique_url": "https://www.mcgill.ca", "aff_unique_abbr": "McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Expressivity of Emergent Languages is a Trade-off between Contextual Complexity and Unpredictability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6566", "id": "WxuE_JWxjkW", "poster": "", "openreview": "https://openreview.net/forum?id=WxuE_JWxjkW", "slides": "https://iclr.cc/virtual/2022/poster/6566", "video": "https://iclr.cc/virtual/2022/poster/6566", "author_site": "Shangmin Guo, YI REN, Kory Mathewson, Simon Kirby, Stefano Albrecht, Kenny Smith", "tldr": "", "abstract": "Researchers are using deep learning models to explore the emergence of language in various language games, where agents interact and develop an emergent language to solve tasks. We focus on the factors that determine the expressivity of emergent languages, which reflects the amount of information about input spaces those languages are capable of encoding. We measure the expressivity of emergent languages based on the generalisation performance across different games, and demonstrate that the expressivity of emergent languages is a trade-off between the complexity and unpredictability of the context those languages emerged from. Another contribution of this work is the discovery of message type collapse, i.e. the number of unique messages is lower than that of inputs. We also show that using the contrastive loss proposed by Chen et al. 
(2020) can alleviate this problem.", "keywords": "Emergent Language;Expressivity", "primary_area": "", "supplementary_material": "/attachment/9d25f73d43130742bdd6e4b09e1654d9e93e7c5d.zip", "author": "Shangmin Guo;Yi Ren;Kory Wallace Mathewson;Simon Kirby;Stefano V Albrecht;Kenny Smith", "authorids": "~Shangmin_Guo1;~Yi_Ren6;~Kory_Wallace_Mathewson1;simon.kirby@ed.ac.uk;~Stefano_V_Albrecht1;~Kenny_Smith1", "gender": "M;M;M;;;M", "homepage": ";https://joshua-ren.github.io/;https://korymathewson.com/;;https://agents-lab.org/stefano-albrecht/;http://www.ling.ed.ac.uk/~kenny", "dblp": "183/0949;;182/1971;;118/3975;58/6224", "google_scholar": "cpOrbSoAAAAJ;5QNce38AAAAJ;K8MFvX4AAAAJ;;https://scholar.google.co.uk/citations?user=ceSFqCcAAAAJ;", "orcid": "0000-0003-1716-0994;;0000-0002-5688-6221;;0000-0002-8735-1465;0000-0002-4530-6914", "linkedin": ";;korymath/?originalSubdomain=ca;;;", "or_profile": "~Shangmin_Guo1;~Yi_Ren6;~Kory_Wallace_Mathewson1;simon.kirby@ed.ac.uk;~Stefano_V_Albrecht1;~Kenny_Smith1", "aff": "University of Edinburgh;University of British Columbia;;;University of Edinburgh;University of Edinburgh", "aff_domain": "ed.ac.uk;ubc.ca;;;ed.ac.uk;ed.ac.uk", "position": "PhD student;PhD student;;;Associate Professor;Professor", "bibtex": "@inproceedings{\nguo2022expressivity,\ntitle={Expressivity of Emergent Languages is a Trade-off between Contextual Complexity and Unpredictability},\nauthor={Shangmin Guo and Yi Ren and Kory Wallace Mathewson and Simon Kirby and Stefano V Albrecht and Kenny Smith},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=WxuE_JWxjkW}\n}", "github": "", "project": "", "reviewers": "27ev;MdRY;jyzs;B1Wi", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;4;3", "correctness": "2;4;4;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;0;3;4", "wc_summary_paper": "84;67;128;225", "wc_summary_review": "46;108;84;37", "wc_main_review": "1073;617;297;146", "wc_review": "1203;792;509;408", "wc_reply_reviewers": "0;57;110;0", "wc_reply_authors": "2229;574;582;210", "reply_reviewers": "0;1;1;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 126.0, 61.33922073192649 ], "wc_summary_review_avg": [ 68.75, 28.71737279069936 ], "wc_main_review_avg": [ 533.25, 355.007306262843 ], "wc_review_avg": [ 728.0, 308.2539537459333 ], "wc_reply_reviewers_avg": [ 41.75, 45.7622934302904 ], "wc_reply_authors_avg": [ 898.75, 782.5814254759692 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.49374193110101877, "corr_recommendation_correctness": 0.9169493006161777, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11902843079487263363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=WxuE_JWxjkW", "email": "ed.ac.uk;ubc.ca;;;ed.ac.uk;ed.ac.uk", "author_num": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Edinburgh;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.ed.ac.uk;https://www.ubc.ca", "aff_unique_abbr": "Edinburgh;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "HyperDQN: A Randomized Exploration Method for Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6658", "id": "X0nrKAXu7g-", "poster": "", "openreview": "https://openreview.net/forum?id=X0nrKAXu7g-", "slides": "https://iclr.cc/virtual/2022/poster/6658", "video": "https://iclr.cc/virtual/2022/poster/6658", "author_site": "Ziniu Li, Yingru Li, Yushun Zhang, Tong Zhang, Zhi-Quan Luo", "tldr": "", "abstract": "Randomized least-square value iteration (RLSVI) is a provably efficient exploration method. However, it is limited to the case where (1) a good feature is known in advance and (2) this feature is fixed during the training. If otherwise, RLSVI suffers an unbearable computational burden to obtain the posterior samples. In this work, we present a practical algorithm named HyperDQN to address the above issues under deep RL. In addition to a non-linear neural network (i.e., base model) that predicts Q-values, our method employs a probabilistic hypermodel (i.e., meta model), which outputs the parameter of the base model. When both models are jointly optimized under a specifically designed objective, three purposes can be achieved. First, the hypermodel can generate approximate posterior samples regarding the parameter of the Q-value function. As a result, diverse Q-value functions are sampled to select exploratory action sequences. This retains the punchline of RLSVI for efficient exploration. Second, a good feature is learned to approximate Q-value functions. This addresses limitation (1). Third, the posterior samples of the Q-value function can be obtained in a more efficient way than the existing methods, and the changing feature does not affect the efficiency. This deals with limitation (2). On the Atari suite, HyperDQN with 20M frames outperforms DQN with 200M frames in terms of the maximum human-normalized score. 
For SuperMarioBros, HyperDQN outperforms several exploration bonus and randomized exploration methods on 5 out of 9 games.", "keywords": "exploration;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ziniu Li;Yingru Li;Yushun Zhang;Tong Zhang;Zhi-Quan Luo", "authorids": "~Ziniu_Li1;~Yingru_Li1;~Yushun_Zhang1;~Tong_Zhang2;~Zhi-Quan_Luo1", "gender": "M;M;M;M;M", "homepage": "http://www.liziniu.org/;https://richardli.xyz;https://zyushun.github.io/;http://tongzhang-ml.org;", "dblp": "254/0986;156/7684;276/8662;07/4227-1;", "google_scholar": "80UnKQQAAAAJ;OOhB7fcAAAAJ;https://scholar.google.com/citations?hl=en;LurWtuYAAAAJ;dW3gcXoAAAAJ", "orcid": ";;;0000-0002-5511-2558;", "linkedin": ";;;;", "or_profile": "~Ziniu_Li1;~Yingru_Li1;~Yushun_Zhang1;~Tong_Zhang2;~Zhi-Quan_Luo1", "aff": "The Chinese University of Hong Kong, Shenzhen;The Chinese University of Hong Kong, Shenzhen, China;The Chinese University of Hong Kong, Shenzhen;Hong Kong University of Science and Technology;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;cuhk.edu.cn;cuhk.edu.cn;ust.hk;cuhk.edu.cn", "position": "PhD student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2022hyperdqn,\ntitle={Hyper{DQN}: A Randomized Exploration Method for Deep Reinforcement Learning},\nauthor={Ziniu Li and Yingru Li and Yushun Zhang and Tong Zhang and Zhi-Quan Luo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=X0nrKAXu7g-}\n}", "github": "", "project": "", "reviewers": "TPoj;2epQ;GmJU;C9G4", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "139;72;89;106", "wc_summary_review": "91;40;76;50", "wc_main_review": "436;418;445;296", "wc_review": "666;530;610;452", "wc_reply_reviewers": "0;125;0;0", "wc_reply_authors": "2487;1079;1192;862", "reply_reviewers": "0;1;0;0", "reply_authors": "5;3;2;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.5, 24.763884994079586 ], "wc_summary_review_avg": [ 64.25, 20.27775875189366 ], "wc_main_review_avg": [ 398.75, 60.1139542868376 ], "wc_review_avg": [ 564.5, 80.96141056083448 ], "wc_reply_reviewers_avg": [ 31.25, 54.12658773652741 ], "wc_reply_authors_avg": [ 1405.0, 635.8494318626069 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.14002800840280097, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9038373559931716364&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=X0nrKAXu7g-", "email": "cuhk.edu.cn;cuhk.edu.cn;cuhk.edu.cn;ust.hk;cuhk.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Chinese University of Hong Kong;Hong Kong University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.ust.hk", "aff_unique_abbr": "CUHK;HKUST", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Shenzhen;Hong Kong SAR", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "X1y1ur-NCh_", "title": "Did I do that? Blame as a means to identify controlled effects in reinforcement learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Identifying controllable aspects of the environment has proven to be an extraordinary intrinsic motivator to reinforcement learning agents. Despite repeatedly achieving State-of-the-Art results, this approach has only been studied as a proxy to a reward-based task and has not yet been evaluated on its own. We show that solutions relying on action-prediction fail to model critical controlled events. Humans, on the other hand, assign blame to their actions to decide what they controlled. This work proposes Controlled Effect Network (CEN), an unsupervised method based on counterfactual measures of blame to identify effects on the environment controlled by the agent. CEN is evaluated in a wide range of environments showing that it can accurately identify controlled effects. Moreover, we demonstrate CEN's capabilities as intrinsic motivator by integrating it in the state-of-the-art exploration method, achieving substantially better performance than action-prediction models.", "keywords": "reinforcement learning;unsupervised reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Oriol Corcoll Andreu;Youssef Sherif Mansour Mohamed;Raul Vicente", "authorids": "~Oriol_Corcoll_Andreu1;~Youssef_Sherif_Mansour_Mohamed2;~Raul_Vicente1", "gender": ";M;", "homepage": "http://ocorcoll.com;https://www.linkedin.com/in/youssef-sherif-mohamed/;", "dblp": ";;02/4965", "google_scholar": "PH0oQ5UAAAAJ;;", "orcid": ";;", "linkedin": "ocorcoll;;", "or_profile": "~Oriol_Corcoll_Andreu1;~Youssef_Sherif_Mansour_Mohamed2;~Raul_Vicente1", "aff": "University of Tartu;;", "aff_domain": "ut.ee;;", "position": "PhD student;;", "bibtex": "@misc{\nandreu2022did,\ntitle={Did I do that? 
Blame as a means to identify controlled effects in reinforcement learning},\nauthor={Oriol Corcoll Andreu and Youssef Sherif Mansour Mohamed and Raul Vicente},\nyear={2022},\nurl={https://openreview.net/forum?id=X1y1ur-NCh_}\n}", "github": "", "project": "", "reviewers": "hgSF;MeF5;Y2tc;fGPn", "site": "https://openreview.net/forum?id=X1y1ur-NCh_", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;4;4;3", "wc_summary_paper": "108;102;167;135", "wc_summary_review": "35;103;156;77", "wc_main_review": "356;523;684;250", "wc_review": "499;728;1007;462", "wc_reply_reviewers": "473;241;671;241", "wc_reply_authors": "848;1602;1675;687", "reply_reviewers": "2;2;3;1", "reply_authors": "2;3;3;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 128.0, 25.71964229922337 ], "wc_summary_review_avg": [ 92.75, 43.84275880918079 ], "wc_main_review_avg": [ 453.25, 164.98390072973788 ], "wc_review_avg": [ 674.0, 217.58561533336712 ], "wc_reply_reviewers_avg": [ 406.5, 179.6962715250375 ], "wc_reply_authors_avg": [ 1203.0, 439.96193017123653 ], "reply_reviewers_avg": [ 2.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10141094237793272341&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "University of Tartu", "aff_unique_dep": "", "aff_unique_url": "https://www.ut.ee", "aff_unique_abbr": "UT", "aff_country_unique_index": "0", "aff_country_unique": "Estonia" }, { "id": "X2V7RW3Sul", "title": "Improving Hyperparameter Optimization by Planning Ahead", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hyperparameter optimization (HPO) is generally treated as a bi-level optimization problem that involves fitting a (probabilistic) surrogate model to a set of observed hyperparameter responses, e.g. validation loss, and consequently maximizing an acquisition function using a surrogate model to identify good hyperparameter candidates for evaluation. The choice of a surrogate and/or acquisition function can be further improved via knowledge transfer across related tasks. In this paper, we propose a novel transfer learning approach, defined within the context of model-based reinforcement learning, where we represent the surrogate as an ensemble of probabilistic models that allows trajectory sampling. We further propose a new variant of model predictive control which employs a simple look-ahead strategy as a policy that optimizes a sequence of actions, representing hyperparameter candidates to expedite HPO. Our experiments on three meta-datasets comparing to state-of-the-art HPO algorithms including a model-free reinforcement learning approach show that the proposed method can outperform all baselines by exploiting a simple planning-based policy. 
", "keywords": "model-based reinforcement learning;hyperparameter optimization;model predictive control;meta-learning;transfer learning", "primary_area": "", "supplementary_material": "/attachment/82b718c590b9ddc74abc7ac80c65d7c0929f79ba.zip", "author": "Hadi Samer Jomaa;Jonas Falkner;Lars Schmidt-Thieme", "authorids": "~Hadi_Samer_Jomaa1;~Jonas_Falkner1;~Lars_Schmidt-Thieme1", "gender": "M;;M", "homepage": "https://www.ismll.uni-hildesheim.de/personen/hsjomaa.html;;https://www.ismll.uni-hildesheim.de/personen/lst_en.html", "dblp": ";;s/LarsSchmidtThieme", "google_scholar": "QLSZWNkAAAAJ;;https://scholar.google.de/citations?user=l3taTdYAAAAJ", "orcid": ";;0000-0001-5729-6023", "linkedin": "hadisamerjomaa/;;", "or_profile": "~Hadi_Samer_Jomaa1;~Jonas_Falkner1;~Lars_Schmidt-Thieme1", "aff": "University of Hildesheim;;University of Hildesheim", "aff_domain": "uni-hildesheim.de;;uni-hildesheim.de", "position": "PhD student;;Full Professor", "bibtex": "@misc{\njomaa2022improving,\ntitle={Improving Hyperparameter Optimization by Planning Ahead},\nauthor={Hadi Samer Jomaa and Jonas Falkner and Lars Schmidt-Thieme},\nyear={2022},\nurl={https://openreview.net/forum?id=X2V7RW3Sul}\n}", "github": "", "project": "", "reviewers": "8eE4;cKwe;yo76;Vs7P", "site": "https://openreview.net/forum?id=X2V7RW3Sul", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;2;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "87;27;189;103", "wc_summary_review": "42;78;50;35", "wc_main_review": "665;554;341;210", "wc_review": "794;659;580;348", "wc_reply_reviewers": "0;0;0;31", "wc_reply_authors": "964;880;224;61", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 101.5, 57.92020372892347 ], "wc_summary_review_avg": [ 51.25, 16.330569494050106 ], "wc_main_review_avg": [ 442.5, 177.69144605185699 ], "wc_review_avg": [ 595.25, 161.96508111318315 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 532.25, 395.10528660092615 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5525321015128181, "corr_recommendation_correctness": 0.49374193110101877, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xD_sf_lS6Q4J:scholar.google.com/&scioq=Improving+Hyperparameter+Optimization+by+Planning+Ahead&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Hildesheim", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-hildesheim.de/", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "X3WxnuzAYyE", "title": "PKCAM: Previous Knowledge Channel Attention Module", "track": "main", "status": "Reject", "tldr": "", "abstract": "Attention mechanisms have been explored with CNNs, both across the spatial and channel dimensions. \nHowever, all the existing methods devote the attention modules to capture local interactions from the current feature map only, disregarded the valuable previous knowledge that is acquired by the earlier layers. 
\nThis paper tackles the following question: Can one incorporate previous knowledge aggregation while learning channel attention more efficiently? To this end, we propose a Previous Knowledge Channel Attention Module (PKCAM), which captures channel-wise relations across different layers to model the global context. \nOur proposed module PKCAM is easily integrated into any feed-forward CNN architecture and trained in an end-to-end fashion with a negligible footprint due to its lightweight property. We validate our novel architecture through extensive experiments on image classification and object detection tasks with different backbones. \nOur experiments show consistent improvements in performance over their counterparts. We also conduct experiments that probe the robustness of the learned representations.", "keywords": "Channel Attention;Attention;Deep Learning;Computer Vision;Neural Networks.", "primary_area": "", "supplementary_material": "", "author": "Eslam Mohamed BAKR;Ahmad A. Al Sallab;Mohsen Rashwan", "authorids": "~Eslam_Mohamed_BAKR1;~Ahmad_A._Al_Sallab1;~Mohsen_Rashwan1", "gender": "M;M;M", "homepage": ";;http://eece.cu.edu.eg/content/mohsen-rashwan", "dblp": "330/8100;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;", "orcid": ";;", "linkedin": "eslam-bakr-a693a0124/;;", "or_profile": "~Eslam_Mohamed_BAKR1;~Ahmad_A._Al_Sallab1;~Mohsen_Rashwan1", "aff": "Valeo;;", "aff_domain": "valeo.com;;", "position": "Researcher;;", "bibtex": "@misc{\nbakr2022pkcam,\ntitle={{PKCAM}: Previous Knowledge Channel Attention Module},\nauthor={Eslam Mohamed BAKR and Ahmad A. Al Sallab and Mohsen Rashwan},\nyear={2022},\nurl={https://openreview.net/forum?id=X3WxnuzAYyE}\n}", "github": "", "project": "", "reviewers": "HH21;2Qrr;cAgA;BFC2", "site": "https://openreview.net/forum?id=X3WxnuzAYyE", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;4", "correctness": "2;2;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "73;71;46;53", "wc_summary_review": "19;19;18;50", "wc_main_review": "272;377;238;147", "wc_review": "364;467;302;250", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 60.75, 11.54068888758379 ], "wc_summary_review_avg": [ 26.5, 13.573871960498227 ], "wc_main_review_avg": [ 258.5, 82.27545198903498 ], "wc_review_avg": [ 345.75, 80.80338792402209 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1663216934288968339&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Valeo", "aff_unique_dep": "", "aff_unique_url": "https://www.valeo.com", "aff_unique_abbr": "", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "id": "X4vBK5BBtQY", "title": "Enhancing the Transferability of Adversarial Attacks via Scale Ensemble", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "There is a line of work on adversarial example 
generation in computer vision, which makes deep learning suffer a lot. Driven by the transferability decrease among models with different input sizes, we present a novel attack method by using a scale input ensemble framework to enhance the transferability of adversarial images, which is named the Scale Ensemble Method (SEM). Our method can preserve the characteristic textures of the original image by zooming the surrogate model's input in and out in a specific sequence while generating adversarial examples. The superior texture feature highlights the important attacking region and increases the diversity of adversarial perturbations to assist a more aggressive attack. The experiments on ImageNet show that our method successfully mitigates the transferability gap between models with different input sizes and achieves an approximately 8% higher success rate compared with the state-of-the-art input transformation methods. We also demonstrate that our method can integrate with existing methods and bypass a variety of defense methods with a success rate of over 90%.", "keywords": "adversarial examples;adversarial attack;transferability;scale ensemble;image classification", "primary_area": "", "supplementary_material": "", "author": "Xianfeng Gao;Zhikai Chen;Bo Zhang", "authorids": "~Xianfeng_Gao2;~Zhikai_Chen1;~Bo_Zhang12", "gender": "M;M;M", "homepage": ";https://blade.tencent.com;", "dblp": ";;", "google_scholar": "https://scholar.google.com.hk/citations?view_op=list_works;;vwlckBgAAAAJ", "orcid": ";;", "linkedin": ";;\u5fd7\u51ef-\u9648-ba7410106", "or_profile": "~Xianfeng_Gao2;~Bo_Zhang12;~Zhi_kai_Chen1", "aff": "Tencent Blade Team;Tencent Blade Team;Tencent Blade Team", "aff_domain": "tencent.com;tencent.com;tencent.com", "position": "Researcher;Researcher;Researcher", "bibtex": "@misc{\ngao2022enhancing,\ntitle={Enhancing the Transferability of Adversarial Attacks via Scale Ensemble},\nauthor={Xianfeng Gao and Zhikai Chen and Bo Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=X4vBK5BBtQY}\n}", "github": "", "project": "", "reviewers": "trBT;Qrxb;Thoq;nMK8", "site": "https://openreview.net/forum?id=X4vBK5BBtQY", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;3;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "72;119;49;51", "wc_summary_review": "61;19;78;45", "wc_main_review": "435;349;342;137", "wc_review": "568;487;469;233", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.75, 28.181332473820326 ], "wc_summary_review_avg": [ 50.75, 21.72987574745884 ], "wc_main_review_avg": [ 315.75, 109.50656373021664 ], "wc_review_avg": [ 439.25, 124.78055737974566 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.42640143271122083, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zxF32CZO3LEJ:scholar.google.com/&scioq=Enhancing+the+Transferability+of+Adversarial+Attacks+via+Scale+Ensemble&hl=en&as_sdt=0,5", "gs_version_total": 0, 
"aff_unique_index": "0;0;0", "aff_unique_norm": "Tencent", "aff_unique_dep": "Blade Team", "aff_unique_url": "https://www.tencent.com", "aff_unique_abbr": "Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "X59kvde4v1Y", "title": "DSDF: Coordinated look-ahead strategy in stochastic multi-agent reinforcement learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multi-Agent reinforcement learning has received lot of attention in recent years and have applications in many different areas. Existing methods involving Centralized Training and Decentralized execution, attempts to train the agents towards learning a pattern of coordinated actions to arrive at optimal joint policy. However if some agents are stochastic in their action to varying degrees, the above methods provides poor coordination among agents. In this paper we show how the stochasticity of agents, which could be a result of malfunction or aging of robots, can add to the uncertainty in coordination and thereby contribute to unsatisfactory global rewards. In such a scenario, the deterministic agents have to understand the behavior and limitations of the stochastic agents while the stochastic agents have to plan taking in cognizance their own limitations. Our proposed method, Deep Stochastic Discounted Factor (DSDF), tunes the discounted factor for the agents by using a learning representation of uncertainty to update the utility networks of individual agents. DSDF also helps in imparting an extent of reliability in coordination thereby granting stochastic agents tasks which are immediate and of shorter trajectory with deterministic ones taking the tasks which involve longer planning. 
Results on benchmark environments show the efficacy of the proposed approach when compared with existing deterministic approaches.", "keywords": "reinforcement learning;multi-agent reinforcement learning;stochastic actions;poor co-ordination", "primary_area": "", "supplementary_material": "", "author": "Satheesh K Perepu;Kaushik Dey", "authorids": "~Satheesh_K_Perepu1;~Kaushik_Dey1", "gender": "M;M", "homepage": ";", "dblp": "232/1491;", "google_scholar": "https://scholar.google.co.in/citations?user=eotZW25NVFsC;", "orcid": ";", "linkedin": "satheesh-kumar-perepu-29049723/;kaushikdey/", "or_profile": "~Satheesh_K_Perepu1;~Kaushik_Dey1", "aff": "Ericsson Research;Ericsson", "aff_domain": "ericsson.com;ericsson.com", "position": "Researcher;Principal Researcher", "bibtex": "@misc{\nperepu2022dsdf,\ntitle={{DSDF}: Coordinated look-ahead strategy in stochastic multi-agent reinforcement learning},\nauthor={Satheesh K Perepu and Kaushik Dey},\nyear={2022},\nurl={https://openreview.net/forum?id=X59kvde4v1Y}\n}", "github": "", "project": "", "reviewers": "j88F;Kino;7seM;HdYi", "site": "https://openreview.net/forum?id=X59kvde4v1Y", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "3;3;2;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "57;44;141;41", "wc_summary_review": "58;22;48;68", "wc_main_review": "206;501;209;240", "wc_review": "321;567;398;349", "wc_reply_reviewers": "0;131;169;36", "wc_reply_authors": "686;838;956;576", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 70.75, 41.002286521607545 ], "wc_summary_review_avg": [ 49.0, 17.11724276862369 ], "wc_main_review_avg": [ 289.0, 123.11986029881614 ], "wc_review_avg": [ 408.75, 95.43158544213755 ], "wc_reply_reviewers_avg": [ 84.0, 68.54560525664647 ], "wc_reply_authors_avg": [ 764.0, 144.71351008112546 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.18411492357966466, "corr_recommendation_correctness": 0.22549380840084865, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0IHwdu7bJk4J:scholar.google.com/&scioq=DSDF:+Coordinated+look-ahead+strategy+in+stochastic+multi-agent+reinforcement+learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Ericsson", "aff_unique_dep": "Research", "aff_unique_url": "https://www.ericsson.com/research", "aff_unique_abbr": "Ericsson", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Sweden" }, { "id": "X5S3pEGPZv8", "title": "Revisiting Skeleton-based Action Recognition", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Human skeleton, as a compact representation of human action, has received increasing attention in recent years. Many skeleton-based action recognition methods adopt GCNs to extract features on top of human skeletons. Despite the positive results shown in these attempts, GCN-based methods are subject to limitations in robustness, interoperability, and scalability. In this work, we propose PoseConv3D, a new approach to skeleton-based action recognition. 
PoseConv3D relies on a 3D heatmap stack instead of a graph sequence as the base representation of human skeletons. Compared to GCN-based methods, PoseConv3D is more effective in learning spatiotemporal features, more robust against pose estimation noises, and generalizes better in cross-dataset settings. Also, PoseConv3D can handle multiple-person scenarios without additional computation cost, and its features can be easily integrated with other modalities at early fusion stages, providing a great design space to boost the performance. PoseConv3D achieves the state-of-the-art on five of six standard skeleton-based action recognition benchmarks. Once fused with other modalities, it achieves the state-of-the-art on all eight multi-modality action recognition benchmarks. \n", "keywords": "action;skeleton;video;recognition", "primary_area": "", "supplementary_material": "/attachment/de9a1e5e91aed90db45bdd9da5ebe636ee78d391.zip", "author": "Haodong Duan;Yue Zhao;Kai Chen;Dahua Lin;Bo Dai", "authorids": "~Haodong_Duan1;~Yue_Zhao4;~Kai_Chen4;~Dahua_Lin1;~Bo_Dai2", "gender": "M;M;M;M;M", "homepage": "https://kennymckormick.github.io;https://zhaoyue-zephyrus.github.io/;https://chenkai.site/;http://dahua.site;http://daibo.info/", "dblp": "211/7919;48/76-6;181/2839-26;53/6088;64/2903-2", "google_scholar": "vi3W-m8AAAAJ;https://scholar.google.com.hk/citations?user=6_U35tAAAAAJ;https://scholar.google.com.hk/citations?user=eGD0b7IAAAAJ;GMzzRRUAAAAJ;https://scholar.google.com.hk/citations?user=KNWTvgEAAAAJ", "orcid": "0000-0002-3052-4177;0000-0003-2753-5921;0000-0002-6820-2325;;0000-0003-0777-9232", "linkedin": "haodong-duan-bb9349166/;;;;", "or_profile": "~Haodong_Duan1;~Yue_Zhao4;~Kai_Chen4;~Dahua_Lin1;~Bo_Dai2", "aff": "The Chinese University of Hong Kong;University of Texas, Austin;SenseTime;The Chinese University of Hong Kong;Nanyang Technological University", "aff_domain": "ie.cuhk.edu;utexas.edu;sensetime.com;cuhk.edu.hk;ntu.edu.sg", "position": "PhD student;PhD student;Researcher;Associate Professor;Research Assistant Professor", "bibtex": "@misc{\nduan2022revisiting,\ntitle={Revisiting Skeleton-based Action Recognition},\nauthor={Haodong Duan and Yue Zhao and Kai Chen and Dahua Lin and Bo Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=X5S3pEGPZv8}\n}", "github": "", "project": "", "reviewers": "aEYZ;Ao4Q;CLvT;U3T2", "site": "https://openreview.net/forum?id=X5S3pEGPZv8", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;4;3", "correctness": "3;2;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "41;100;101;57", "wc_summary_review": "123;44;66;54", "wc_main_review": "350;238;811;84", "wc_review": "514;382;978;195", "wc_reply_reviewers": "0;0;371;0", "wc_reply_authors": "792;480;704;0", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 74.75, 26.366408553308887 ], "wc_summary_review_avg": [ 71.75, 30.597181242722343 ], "wc_main_review_avg": [ 370.75, 271.1543610196967 ], "wc_review_avg": [ 517.25, 289.15339787040375 ], "wc_reply_reviewers_avg": [ 92.75, 160.64771240201338 ], "wc_reply_authors_avg": [ 494.0, 307.05699796617563 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.7071067811865476 ], "replies_avg": [ 10, 0 ], 
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.0, "gs_citation": 836, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3596130811806048765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Chinese University of Hong Kong;University of Texas at Austin;SenseTime;Nanyang Technological University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.utexas.edu;https://www.sensetime.com;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;UT Austin;SenseTime;NTU", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Hong Kong SAR;Austin;", "aff_country_unique_index": "0;1;0;0;2", "aff_country_unique": "China;United States;Singapore" }, { "title": "Planning in Stochastic Environments with a Learned Model", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6832", "id": "X6D9bAHhBQ1", "poster": "", "openreview": "https://openreview.net/forum?id=X6D9bAHhBQ1", "slides": "https://iclr.cc/virtual/2022/poster/6832", "video": "https://iclr.cc/virtual/2022/poster/6832", "author_site": "Ioannis Antonoglou, Julian Schrittwieser, Sherjil Ozair, Thomas Hubert, David Silver", "tldr": "", "abstract": "Model-based reinforcement learning has proven highly successful. However, learning a model in isolation from its use during planning is problematic in complex environments. To date, the most effective techniques have instead combined value-equivalent model learning with powerful tree-search methods. This approach is exemplified by MuZero, which has achieved state-of-the-art performance in a wide range of domains, from board games to visually rich environments, with discrete and continuous action spaces, in online and offline settings. However, previous instantiations of this approach were limited to the use of deterministic models. This limits their performance in environments that are inherently stochastic, partially observed, or so large and complex that they appear stochastic to a finite agent. In this paper we extend this approach to learn and plan with stochastic models. Specifically, we introduce a new algorithm, Stochastic MuZero, that learns a stochastic model incorporating afterstates, and uses this model to perform a stochastic tree search. 
Stochastic MuZero matched or exceeded the state of the art in a set of canonical single and multi-agent environments, including 2048 and backgammon, while maintaining the same performance as standard MuZero in the game of Go.", "keywords": "model-based reinforcement learning;deep reinforcement learning;tree based search;MCTS", "primary_area": "", "supplementary_material": "", "author": "Ioannis Antonoglou;Julian Schrittwieser;Sherjil Ozair;Thomas K Hubert;David Silver", "authorids": "~Ioannis_Antonoglou1;~Julian_Schrittwieser1;~Sherjil_Ozair1;~Thomas_K_Hubert1;~David_Silver1", "gender": "M;;M;M;", "homepage": ";http://www.furidamu.org;http://sherjil.ozair.io;;", "dblp": "139/0830;;139/0736;;34/3601", "google_scholar": ";;O7MZStwAAAAJ;WXG0QfMAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Ioannis_Antonoglou1;~Julian_Schrittwieser1;~Sherjil_Ozair1;~Thomas_K_Hubert1;~David_Silver1", "aff": "Google DeepMind;Google DeepMind;Google;;Google DeepMind", "aff_domain": "deepmind.com;deepmind.com;google.com;;deepmind.com", "position": "Researcher;Researcher;Intern;;Full Professor", "bibtex": "@inproceedings{\nantonoglou2022planning,\ntitle={Planning in Stochastic Environments with a Learned Model},\nauthor={Ioannis Antonoglou and Julian Schrittwieser and Sherjil Ozair and Thomas K Hubert and David Silver},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=X6D9bAHhBQ1}\n}", "github": "", "project": "", "reviewers": "1oV1;TrM2;qs8f;4qYA", "pdf_size": 0, "recommendation": "5;8;8;10", "confidence": "4;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;4;4;4", "wc_summary_paper": "311;71;32;102", "wc_summary_review": "101;112;20;46", "wc_main_review": "429;582;543;163", "wc_review": "841;765;595;311", "wc_reply_reviewers": "506;430;0;0", "wc_reply_authors": "755;843;821;321", "reply_reviewers": "3;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 7.75, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 129.0, 107.96527219434961 ], "wc_summary_review_avg": [ 69.75, 38.08132744535043 ], "wc_main_review_avg": [ 429.25, 163.6770830018668 ], "wc_review_avg": [ 628.0, 203.54115063052976 ], "wc_reply_reviewers_avg": [ 234.0, 235.5376827601053 ], "wc_reply_authors_avg": [ 685.0, 212.63583893596112 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8468410037462646095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=X6D9bAHhBQ1", "email": "deepmind.com;deepmind.com;google.com;;deepmind.com", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Learning-Augmented $k$-means Clustering", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7143", "id": 
"X8cLTHexYyY", "poster": "", "openreview": "https://openreview.net/forum?id=X8cLTHexYyY", "slides": "https://iclr.cc/virtual/2022/poster/7143", "video": "https://iclr.cc/virtual/2022/poster/7143", "author_site": "Jon Ergun, Zhili Feng, Sandeep Silwal, David Woodruff, Samson Zhou", "tldr": "", "abstract": "$k$-means clustering is a well-studied problem due to its wide applicability. Unfortunately, there exist strong theoretical limits on the performance of any algorithm for the $k$-means problem on worst-case inputs. To overcome this barrier, we consider a scenario where ``advice'' is provided to help perform clustering. Specifically, we consider the $k$-means problem augmented with a predictor that, given any point, returns its cluster label in an approximately optimal clustering up to some, possibly adversarial, error. We present an algorithm whose performance improves along with the accuracy of the predictor, even though na\\\"{i}vely following the accurate predictor can still lead to a high clustering cost. Thus if the predictor is sufficiently accurate, we can retrieve a close to optimal clustering with nearly optimal runtime, breaking known computational barriers for algorithms that do not have access to such advice. We evaluate our algorithms on real datasets and show significant improvements in the quality of clustering.", "keywords": "clustering;learning-augmented algorithms", "primary_area": "", "supplementary_material": "/attachment/5909fc23c1364bc92b1b7f261715bb13083672b5.zip", "author": "Jon C. Ergun;Zhili Feng;Sandeep Silwal;David Woodruff;Samson Zhou", "authorids": "~Jon_Ergun1;~Zhili_Feng1;~Sandeep_Silwal1;~David_Woodruff1;~Samson_Zhou1", "gender": "M;;M;M;", "homepage": ";https://zhilif.github.io/;https://sandeepsilwal.com;http://www.cs.cmu.edu/~dwoodruf/;https://samsonzhou.github.io/", "dblp": ";189/7590;225/4637;w/DPWoodruff;179/2683", "google_scholar": ";_lnL4aQAAAAJ;MnDnUvcAAAAJ;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ;NpjsgocAAAAJ", "orcid": ";;;;", "linkedin": "jon-ergun-059229233/;;;;", "or_profile": "~Jon_Ergun1;~Zhili_Feng1;~Sandeep_Silwal1;~David_Woodruff1;~Samson_Zhou1", "aff": "Georgetown Day School;Carnegie Mellon University;Massachusetts Institute of Technology;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University", "aff_domain": "gds.org;andrew.cmu.edu;mit.edu;cmu.edu;cs.cmu.edu", "position": "High shool Student;PhD student;PhD student;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nergun2022learningaugmented,\ntitle={Learning-Augmented \\$k\\$-means Clustering},\nauthor={Jon C. 
Ergun and Zhili Feng and Sandeep Silwal and David Woodruff and Samson Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=X8cLTHexYyY}\n}", "github": "", "project": "", "reviewers": "dm82;Ftwn;4io3", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;5;3", "correctness": "4;3;4", "technical_novelty": "2;4;2", "empirical_novelty": "2;4;2", "wc_summary_paper": "204;395;65", "wc_summary_review": "81;35;2", "wc_main_review": "303;634;207", "wc_review": "588;1064;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 221.33333333333334, 135.27831393916108 ], "wc_summary_review_avg": [ 39.333333333333336, 32.396844839514166 ], "wc_main_review_avg": [ 381.3333333333333, 182.91042130568238 ], "wc_review_avg": [ 642.0, 324.7686355956601 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1674496592859124064&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=X8cLTHexYyY", "email": "gds.org;andrew.cmu.edu;mit.edu;cmu.edu;cs.cmu.edu", "author_num": 5, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Georgetown Day School;Carnegie Mellon University;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.georgetownday.org;https://www.cmu.edu;https://web.mit.edu", "aff_unique_abbr": ";CMU;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "XC-nkaS4rcS", "title": "Accelerated Gradient-Free Method for Heavily Constrained Nonconvex Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The zeroth-order (ZO) method has been shown to be a powerful method for solving optimization problems where an explicit expression of the gradients is difficult or infeasible to obtain. Recently, due to the practical value of constrained problems, many ZO Frank-Wolfe or projected ZO methods have been proposed. However, in many applications, we may have a very large number of nonconvex white/black-box constraints, which makes the existing zeroth-order methods extremely inefficient (or even inapplicable) since they need to query the function values of all the constraints and project the solution onto the complicated feasible set. In this paper, to solve the nonconvex problem with a large number of white/black-box constraints, we propose a doubly stochastic zeroth-order gradient method (DSZOG). Specifically, we reformulate the problem by using the penalty method with distribution probability and sample a mini-batch of constraints to calculate the stochastic zeroth/first-order gradient of the penalty function to update the parameters and distribution, alternately. 
To further speed up our method, we propose an accelerated doubly stochastic zeroth-order gradient method (ADSZOG) by using the exponential moving average method and an adaptive stepsize. Theoretically, we prove that DSZOG and ADSZOG can converge to an $\\epsilon$-stationary point of the constrained problem. We also compare the performance of our method with several ZO methods in two applications, and the experimental results demonstrate the superiority of our method in terms of training time and accuracy.", "keywords": "Constrained optimization;nonconvex;zeroth-order", "primary_area": "", "supplementary_material": "/attachment/ac86744276b4c29f113b6169bb817bc8bc58218d.zip", "author": "Wanli Shi;Hongchang Gao;Bin Gu", "authorids": "~Wanli_Shi1;~Hongchang_Gao1;~Bin_Gu1", "gender": "M;;M", "homepage": ";;https://mbzuai.ac.ae/study/faculty/bin-gu/", "dblp": "245/9064;166/5141.html;29/1758-1", "google_scholar": "Li38vbwAAAAJ;;Vo8OgCgAAAAJ", "orcid": ";;0000-0001-6049-1815", "linkedin": ";;", "or_profile": "~Wanli_Shi1;~Hongchang_Gao1;~Bin_Gu1", "aff": "Nanjing University of Information Science and Technology;Temple University;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "nuist.edu.cn;temple.edu;mbzuai.ac.ae", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nshi2022accelerated,\ntitle={Accelerated Gradient-Free Method for Heavily Constrained Nonconvex Optimization},\nauthor={Wanli Shi and Hongchang Gao and Bin Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=XC-nkaS4rcS}\n}", "github": "", "project": "", "reviewers": "8Koo;CMcj;dYM3;N2oE;61Mt", "site": "https://openreview.net/forum?id=XC-nkaS4rcS", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;4;3;4;2", "correctness": "3;3;3;3;4", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "3;3;0;2;2", "wc_summary_paper": "169;96;15;87;83", "wc_summary_review": "42;41;236;89;45", "wc_main_review": "692;586;39;320;162", "wc_review": "903;723;290;496;290", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 90.0, 48.908077042549934 ], "wc_summary_review_avg": [ 90.6, 74.89352441967196 ], "wc_main_review_avg": [ 359.8, 247.0404015540778 ], "wc_review_avg": [ 540.4, 241.74085298103833 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.37500000000000017, "corr_recommendation_correctness": 0.2500000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TYr-FzEQ3goJ:scholar.google.com/&scioq=Accelerated+Gradient-Free+Method+for+Heavily+Constrained+Nonconvex+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Nanjing University of Information Science and Technology;Temple University;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "http://www.nuist.edu.cn;https://www.temple.edu;https://mbzuai.ac.ae", "aff_unique_abbr": ";Temple;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", 
"aff_country_unique": "China;United States;United Arab Emirates" }, { "id": "XCS9lvsr5wg", "title": "Federated causal discovery", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Causal discovery aims to learn a causal graph from observational data. To date, most causal discovery methods require data to be stored in a central server. However, data owners gradually refuse to share personalized data to avoid privacy leakage, making this task more troublesome by cutting off the first step. The puzzle, \\textit{how do we infer causal relations from decentralized data?} arises. In this paper, we take a first step in developing a gradient-based learning framework named DAG-Shared Federated Causal Discovery (DS-FCD), which can learn the causal graph without directly touching local data and naturally handle the data heterogeneity. DS-FCD benefits from a two-level structure of each local model. The first level learns the causal graph and communicates with the server to get training information from other clients, while the second level approximates causal mechanisms and personally updates from its own data to accommodate the data heterogeneity. Moreover, DS-FCD formulates the overall learning task as a continuous optimization problem by taking advantage of an equality acyclicity constraint, which can be naturally solved by gradient descent methods. Extensive experiments on both synthetic and real-world data sets verify the efficacy of the proposed method.", "keywords": "Causal discovery;Data heterogeneity;Decentralized data", "primary_area": "", "supplementary_material": "", "author": "Erdun Gao;Junjia Chen;Li Shen;Tongliang Liu;Mingming Gong;Howard Bondell", "authorids": "~Erdun_Gao1;~Junjia_Chen1;~Li_Shen1;~Tongliang_Liu1;~Mingming_Gong1;~Howard_Bondell2", "gender": "M;M;M;M;M;", "homepage": ";https://github.com/cjj19970505;https://sites.google.com/site/mathshenli/home;https://tongliang-liu.github.io/;https://mingming-gong.github.io/;", "dblp": "246/5884;;91/3680-8;150/6667;98/8479;", "google_scholar": ";;yVhgENIAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;", "orcid": "0000-0003-1736-2764;;;;0000-0001-7147-5589;", "linkedin": ";;;;;", "or_profile": "~Erdun_Gao1;~Junjia_Chen1;~Li_Shen1;~Tongliang_Liu1;~Mingming_Gong1;~Howard_Bondell2", "aff": "University of Melbourne;Xi'an Jiaotong University;JD Explore Academy;University of Sydney;University of Melbourne;", "aff_domain": "unimelb.edu.au;xjtu.edu.cn;jd.com;sydney.edu.au;unimelb.edu.au;", "position": "PhD student;MS student;Researcher;Lecturer;Assistant Professor;", "bibtex": "@misc{\ngao2022federated,\ntitle={Federated causal discovery},\nauthor={Erdun Gao and Junjia Chen and Li Shen and Tongliang Liu and Mingming Gong and Howard Bondell},\nyear={2022},\nurl={https://openreview.net/forum?id=XCS9lvsr5wg}\n}", "github": "", "project": "", "reviewers": "rvax;tbd8;FQuA;XsX3", "site": "https://openreview.net/forum?id=XCS9lvsr5wg", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "44;12;52;29", "wc_summary_review": "21;18;12;33", "wc_main_review": "324;418;742;244", "wc_review": "389;448;806;306", "wc_reply_reviewers": "25;0;0;0", "wc_reply_authors": "2822;3413;3456;2052", "reply_reviewers": "1;0;0;0", "reply_authors": "5;6;6;4", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], 
"correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 34.25, 15.270478054075452 ], "wc_summary_review_avg": [ 21.0, 7.648529270389178 ], "wc_main_review_avg": [ 432.0, 189.27757394894937 ], "wc_review_avg": [ 487.25, 190.8184673976814 ], "wc_reply_reviewers_avg": [ 6.25, 10.825317547305483 ], "wc_reply_authors_avg": [ 2935.75, 568.4146263952046 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 5.25, 0.82915619758885 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3299431951753987713&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Melbourne;Xi'an Jiao Tong University;JD;University of Sydney", "aff_unique_dep": ";;JD Explore Academy;", "aff_unique_url": "https://www.unimelb.edu.au;https://www.xjtu.edu.cn;;https://www.sydney.edu.au", "aff_unique_abbr": "UniMelb;XJTU;;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Australia;China;" }, { "title": "Training invariances and the low-rank phenomenon: beyond linear networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6638", "id": "XEW8CQgArno", "poster": "", "openreview": "https://openreview.net/forum?id=XEW8CQgArno", "slides": "https://iclr.cc/virtual/2022/poster/6638", "video": "https://iclr.cc/virtual/2022/poster/6638", "author_site": "Thien Le, Stefanie Jegelka", "tldr": "", "abstract": "The implicit bias induced by the training of neural networks has become a topic of rigorous study. In the limit of gradient flow and gradient descent with appropriate step size, it has been shown that when one trains a deep linear network with logistic or exponential loss on linearly separable data, the weights converge to rank-$1$ matrices. In this paper, we extend this theoretical result to the last few linear layers of the much wider class of nonlinear ReLU-activated feedforward networks containing fully-connected layers and skip connections. Similar to the linear case, the proof relies on specific local training invariances, sometimes referred to as alignment, which we show to hold for submatrices where neurons are stably-activated in all training examples, and it reflects empirical results in the literature. We also show this is not true in general for the full matrix of ReLU fully-connected layers. 
Our proof relies on a specific decomposition of the network into a multilinear function and another ReLU network whose weights are constant under a certain parameter directional convergence.", "keywords": "deep learning;nonsmooth analysis;Clarke subdifferential;implicit regularization;low rank bias;alignment;training invariance", "primary_area": "", "supplementary_material": "", "author": "Thien Le;Stefanie Jegelka", "authorids": "~Thien_Le1;~Stefanie_Jegelka3", "gender": "M;F", "homepage": "https://steven-le-thien.github.io;http://people.csail.mit.edu/stefje/", "dblp": "194/5549;38/7003", "google_scholar": "WhFGh74AAAAJ;gTWUZlsAAAAJ", "orcid": "0000-0001-5476-8451;", "linkedin": ";", "or_profile": "~Thien_Le1;~Stefanie_Jegelka3", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nle2022training,\ntitle={Training invariances and the low-rank phenomenon: beyond linear networks},\nauthor={Thien Le and Stefanie Jegelka},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XEW8CQgArno}\n}", "github": "", "project": "", "reviewers": "eeoo;udhX;p57R;6P7z", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "0;3;0;4", "wc_summary_paper": "40;68;75;81", "wc_summary_review": "42;17;42;23", "wc_main_review": "325;258;97;300", "wc_review": "407;343;214;404", "wc_reply_reviewers": "12;0;0;24", "wc_reply_authors": "553;261;8;421", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.7853571071357126 ], "wc_summary_paper_avg": [ 66.0, 15.700318468107582 ], "wc_summary_review_avg": [ 31.0, 11.20267825120404 ], "wc_main_review_avg": [ 245.0, 88.73837952092657 ], "wc_review_avg": [ 342.0, 78.1888738888085 ], "wc_reply_reviewers_avg": [ 9.0, 9.9498743710662 ], "wc_reply_authors_avg": [ 310.75, 203.08418820774796 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17303466148092399563&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=XEW8CQgArno", "email": "mit.edu;mit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "CDTrans: Cross-domain Transformer for Unsupervised Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6251", "id": "XGzk5OKWFFc", "poster": "", "openreview": "https://openreview.net/forum?id=XGzk5OKWFFc", "slides": "https://iclr.cc/virtual/2022/poster/6251", "video": "https://iclr.cc/virtual/2022/poster/6251", "author_site": "Tongkun Xu, Weihua Chen, Pichao WANG, Fan Wang, Li Hao, Rong Jin", "tldr": "", "abstract": "Unsupervised domain adaptation (UDA) aims to transfer 
knowledge learned from a labeled source domain to a different unlabeled target domain. Most existing UDA methods focus on learning domain-invariant feature representations, either from the domain level or category level, using convolutional neural network (CNN)-based frameworks. One fundamental problem for category-level UDA is the production of pseudo labels for samples in the target domain, which are usually too noisy for accurate domain alignment, inevitably compromising the UDA performance. With the success of Transformer in various tasks, we find that the cross-attention in Transformer is robust to the noisy input pairs for better feature alignment, thus in this paper the Transformer is adopted for the challenging UDA task. Specifically, to generate accurate input pairs, we design a two-way center-aware labeling algorithm to produce pseudo labels for target samples. Along with the pseudo labels, a weight-sharing triple-branch transformer framework is proposed to apply self-attention and cross-attention for source/target feature learning and source-target domain alignment, respectively. \nSuch a design explicitly enforces the framework to learn discriminative domain-specific and domain-invariant representations simultaneously. The proposed method is dubbed CDTrans (cross-domain transformer), and it provides one of the first attempts to solve UDA tasks with a pure transformer solution. Experiments show that our proposed method achieves the best performance on public UDA datasets, e.g., VisDA-2017 and DomainNet. Code and models are available at https://github.com/CDTrans/CDTrans.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/62f785dd36c2c426747623fbb98295fc4f21bcff.zip", "author": "Tongkun Xu;Weihua Chen;Pichao WANG;Fan Wang;Hao Li;Rong Jin", "authorids": "~Tongkun_Xu1;~Weihua_Chen1;~Pichao_WANG3;~Fan_Wang6;~Hao_Li16;~Rong_Jin1", "gender": "M;M;M;F;M;M", "homepage": ";https://cwhgn.github.io;https://wangpichao.github.io/;;;https://www.cse.msu.edu/~rongjin/", "dblp": "221/0691;;;;17/5705-30;j/RongJin", "google_scholar": "gHfL5v0AAAAJ;KWVlYaMAAAAJ;;WCRGTHsAAAAJ;pHN-QIwAAAAJ;", "orcid": ";0000-0003-4141-7833;;0000-0001-7320-1119;;", "linkedin": ";;;;%E6%98%8A-%E6%9D%8E-392547a5/detail/recent-activity/;", "or_profile": "~Tongkun_Xu1;~Weihua_Chen1;~Pichao_WANG3;~Fan_Wang6;~Li_Hao1;~Rong_Jin3", "aff": "Shandong University;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group", "aff_domain": "sdu.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "MS student;Algorithm Engineer;Researcher;Senior Staff Algorithm Engineer;Researcher;Researcher", "bibtex": "@inproceedings{\nxu2022cdtrans,\ntitle={{CDT}rans: Cross-domain Transformer for Unsupervised Domain Adaptation},\nauthor={Tongkun Xu and Weihua Chen and Pichao WANG and Fan Wang and Hao Li and Rong Jin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XGzk5OKWFFc}\n}", "github": "", "project": "", "reviewers": "1DYQ;3J2D;GSc9", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;4", "correctness": "3;4;3", "technical_novelty": "2;4;3", "empirical_novelty": "3;4;3", "wc_summary_paper": "122;80;33", "wc_summary_review": "40;115;31", "wc_main_review": "157;105;198", "wc_review": "319;300;262", "wc_reply_reviewers": "0;0;288", "wc_reply_authors": "305;136;1144", "reply_reviewers": "0;0;2", "reply_authors": "1;1;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634
], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.33333333333333, 36.353205574688396 ], "wc_summary_review_avg": [ 62.0, 37.656340767525464 ], "wc_main_review_avg": [ 153.33333333333334, 38.055515004033545 ], "wc_review_avg": [ 293.6666666666667, 23.697163449568293 ], "wc_reply_reviewers_avg": [ 96.0, 135.7645019878171 ], "wc_reply_authors_avg": [ 528.3333333333334, 440.7753269965197 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 310, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9897783945226246229&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=XGzk5OKWFFc", "email": "sdu.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "author_num": 6, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "Shandong University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.sdu.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "SDU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "XHMwXYdGm6H", "title": "Rethinking Negative Sampling for Handling Missing Entity Annotations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Negative sampling is highly effective in handling missing annotations for named entity recognition (NER). One of our contributions is an analysis on how it makes sense through introducing two insightful concepts: missampling and uncertainty. Empirical studies show low missampling rate and high uncertainty are both essential for achieving promising performances with negative sampling. Based on the sparsity of named entities, we also theoretically derive a lower bound for the probability of zero missampling rate, which is only relevant to sentence length. The other contribution is an adaptive and weighted sampling distribution that further improves negative sampling via our former analysis. Experiments on synthetic datasets and well-annotated datasets (e.g., CoNLL-2003) show that our proposed approach benefits negative sampling in terms of F1 score and loss convergence. 
Besides, models with improved negative sampling have achieved new state-of-the-art results on real-world datasets (e.g., EC).", "keywords": "Missing Annotations;NER;Negative Sampling;Unlabeled Entity Problem", "primary_area": "", "supplementary_material": "", "author": "Yangming Li;lemao liu;Shuming Shi", "authorids": "~Yangming_Li1;~lemao_liu1;~Shuming_Shi1", "gender": ";M;M", "homepage": ";https://lemaoliu.github.io/homepage/;", "dblp": ";41/10887.html;s/ShumingShi", "google_scholar": ";;Lg31AKMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yangming_Li1;~lemao_liu1;~Shuming_Shi1", "aff": ";Tencent;Tencent AI Lab", "aff_domain": ";tencent.com;tencent.com", "position": ";Researcher;Principal Researcher", "bibtex": "@misc{\nli2022rethinking,\ntitle={Rethinking Negative Sampling for Handling Missing Entity Annotations},\nauthor={Yangming Li and lemao liu and Shuming Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=XHMwXYdGm6H}\n}", "github": "", "project": "", "reviewers": "nYYH;wyuD;FBCF;Tqi8;E2ZV", "site": "https://openreview.net/forum?id=XHMwXYdGm6H", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "4;4;4;4;4", "correctness": "2;4;3;3;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "155;51;87;101;71", "wc_summary_review": "21;12;123;4;47", "wc_main_review": "401;178;1210;177;337", "wc_review": "577;241;1420;282;455", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 93.0, 35.19090791667644 ], "wc_summary_review_avg": [ 41.4, 43.28787359064892 ], "wc_main_review_avg": [ 460.6, 384.89510259290125 ], "wc_review_avg": [ 595.0, 429.8450883748702 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8648471000001573630&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Tencent", "aff_unique_dep": "Tencent Holdings Limited", "aff_unique_url": "https://www.tencent.com", "aff_unique_abbr": "Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Dealing with Non-Stationarity in MARL via Trust-Region Decomposition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6464", "id": "XHUxf5aRB3s", "poster": "", "openreview": "https://openreview.net/forum?id=XHUxf5aRB3s", "slides": "https://iclr.cc/virtual/2022/poster/6464", "video": "https://iclr.cc/virtual/2022/poster/6464", "author_site": "Wenhao Li, Xiangfeng Wang, Bo Jin, Junjie Sheng, Hongyuan Zha", "tldr": "", "abstract": "Non-stationarity is a thorny issue in cooperative multi-agent reinforcement learning (MARL). One of the reasons is that agents' policies change during the learning process. Some existing works have discussed various consequences caused by non-stationarity using several kinds of measurement indicators.
This makes the objectives or goals of existing algorithms inevitably inconsistent and disparate. In this paper, we introduce a novel notion, the $\\delta$-$stationarity$ measurement, to explicitly measure the non-stationarity of a policy sequence, which can be further proved to be bounded by the KL-divergence of consecutive joint policies. A straightforward but highly non-trivial way is to control the joint policies' divergence, which is difficult to estimate accurately by imposing the trust-region constraint on the joint policy. Although it has lower computational complexity to decompose the joint policy and impose trust-region constraints on the factorized policies, simple policy factorization like mean-field approximation will lead to more considerable policy divergence, which can be considered as the trust-region decomposition dilemma. We model the joint policy as a pairwise Markov random field and propose a trust-region decomposition network (TRD-Net) based on message passing to estimate the joint policy divergence more accurately. The Multi-Agent Mirror descent policy algorithm with Trust region decomposition, called MAMT, is established by adjusting the trust-region of the local policies adaptively in an end-to-end manner. MAMT can approximately constrain the consecutive joint policies' divergence to satisfy $\\delta$-stationarity and alleviate the non-stationarity problem. Our method brings noticeable and stable performance improvements compared with baselines in cooperative tasks of different complexity.", "keywords": "Nonstationarity;Trust-Region Methods;Multi-Agent Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/d9c695cd93c7205a3899766695e25ab3c0607aa9.zip", "author": "Wenhao Li;Xiangfeng Wang;Bo Jin;Junjie Sheng;Hongyuan Zha", "authorids": "~Wenhao_Li2;~Xiangfeng_Wang1;~Bo_Jin1;~Junjie_Sheng1;~Hongyuan_Zha1", "gender": "M;M;;M;", "homepage": "https://tomaxent.com;https://xfwang87.github.io/;;;", "dblp": ";84/4695;;;z/HongyuanZha", "google_scholar": "HAtzuaYAAAAJ;YpGMkgsAAAAJ;;2A7h6E0AAAAJ;n1DQMIsAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Wenhao_Li2;~Xiangfeng_Wang1;~Bo_Jin1;~Junjie_Sheng1;~Hongyuan_Zha1", "aff": "East China Normal University;East China Normal University;;East China Normal University;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "ecnu.edu.cn;ecnu.edu.cn;;ecnu.edu.cn;cuhk.edu.cn", "position": "PhD student;Associate Professor;;PhD student;Full Professor", "bibtex": "@inproceedings{\nli2022dealing,\ntitle={Dealing with Non-Stationarity in {MARL} via Trust-Region Decomposition},\nauthor={Wenhao Li and Xiangfeng Wang and Bo Jin and Junjie Sheng and Hongyuan Zha},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XHUxf5aRB3s}\n}", "github": "", "project": "", "reviewers": "16B5;LytU;DdBz;XYrN", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;2;3;4", "correctness": "3;4;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "67;53;73;195", "wc_summary_review": "55;50;81;72", "wc_main_review": "114;192;421;346", "wc_review": "236;295;575;613", "wc_reply_reviewers": "0;0;113;0", "wc_reply_authors": "423;767;1814;1115", "reply_reviewers": "0;0;2;0", "reply_authors": "2;2;5;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ],
"empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 97.0, 57.043842787806646 ], "wc_summary_review_avg": [ 64.5, 12.539936203984453 ], "wc_main_review_avg": [ 268.25, 121.43388118643001 ], "wc_review_avg": [ 429.75, 166.11347777949868 ], "wc_reply_reviewers_avg": [ 28.25, 48.93043531382078 ], "wc_reply_authors_avg": [ 1029.75, 514.6597783973409 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7143761541203592106&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=XHUxf5aRB3s", "email": "ecnu.edu.cn;ecnu.edu.cn;;ecnu.edu.cn;cuhk.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "East China Normal University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "http://www.ecnu.edu.cn;https://www.cuhk.edu.cn", "aff_unique_abbr": "ECNU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "XHxRBwjpEQ", "title": "Was my Model Stolen? Feature Sharing for Robust and Transferable Watermarks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep Neural Networks (DNNs) are increasingly being deployed in cloud-based services via various APIs, e.g., prediction APIs. Recent studies show that these public APIs are vulnerable to the model extraction attack, where an adversary attempts to train a local copy of the private model using predictions returned by the API. Existing defenses mainly focus on perturbing prediction distribution to undermine the training objective of the attacker and thus inevitably impact the API utility. In this work, we extend the concept of watermarking to protect APIs. The main idea is to insert a watermark which is only known to defender into the protected model and the watermark will then be transferred into all stolen models. The defender can leverage the knowledge of watermarks to detect and certify stolen models. However, the effectiveness of the watermark remains limited since watermarks are distinct from the task data, and the adversary in extraction attacks only adopts inputs sampled from the task distribution. Hence the watermark tends to be discarded during the extraction attack. To bridge the gap, we propose a feature-sharing framework to improve the transferability of watermarks. For legitimate data and watermarks, we encourage the model to only show the difference in final decision layers and use the same features for all other layers. Comprehensive experiments on text and image domains indicate that the proposed framework is effective in terms of API watermarking while keeping the utility of the API. 
Besides, experimental analysis also validates the robustness of the watermark against various watermark removal attacks.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f35dc097d7e8f7e3631cede57976efd5b75e4bdd.zip", "author": "Ruixiang Tang;Hongye Jin;Curtis Wigington;Mengnan Du;Rajiv Jain;Xia Hu", "authorids": "~Ruixiang_Tang1;~Hongye_Jin1;~Curtis_Wigington1;~Mengnan_Du1;~Rajiv_Jain1;~Xia_Hu4", "gender": "M;M;;;M;", "homepage": "https://www.ruixiangtang.net/;https://github.com/Mooler0410;;https://mengnandu.com/;;", "dblp": "239/1928;268/7929;;183/5606;;", "google_scholar": "T575jsoAAAAJ;;;0i-Js2gAAAAJ;https://scholar.google.com/;", "orcid": ";;;;;", "linkedin": "ruixiang-tang-91660717b/;;;;;", "or_profile": "~Ruixiang_Tang1;~Hongye_Jin1;~Curtis_Wigington1;~Mengnan_Du1;~Rajiv_Jain1;~Xia_Hu4", "aff": "Rice University;Texas A&M;;Texas A&M University;Adobe Systems;", "aff_domain": "rice.edu;tamu.edu;;tamu.edu;adobe.com;", "position": "PhD student;PhD student;;PhD student;Senior Research Scientist;", "bibtex": "@misc{\ntang2022was,\ntitle={Was my Model Stolen? Feature Sharing for Robust and Transferable Watermarks},\nauthor={Ruixiang Tang and Hongye Jin and Curtis Wigington and Mengnan Du and Rajiv Jain and Xia Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=XHxRBwjpEQ}\n}", "github": "", "project": "", "reviewers": "FNUM;xtcG;WPw3;Wo41;FYJ4", "site": "https://openreview.net/forum?id=XHxRBwjpEQ", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;4;3;3;4", "correctness": "3;3;3;3;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;3;2;3;3", "wc_summary_paper": "99;53;145;45;79", "wc_summary_review": "52;16;86;39;34", "wc_main_review": "523;252;533;146;513", "wc_review": "674;321;764;230;626", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 84.2, 35.92436499090833 ], "wc_summary_review_avg": [ 45.4, 23.354656923192 ], "wc_main_review_avg": [ 393.4, 162.35097782274056 ], "wc_review_avg": [ 523.0, 208.87508228604005 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16546689658732651816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Rice University;Texas A&M University;Adobe", "aff_unique_dep": ";;Adobe Systems Incorporated", "aff_unique_url": "https://www.rice.edu;https://www.tamu.edu;https://www.adobe.com", "aff_unique_abbr": "Rice;TAMU;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "XIZaWGCPl0b", "title": "Tesseract: Gradient Flip Score to Secure Federated Learning against Model Poisoning Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning\u2014multi-party, distributed learning in a decentralized environment\u2014is vulnerable to model poisoning attacks, even more so than centralized 
learning approaches. This is because malicious clients can collude and send in carefully tailored model updates to make the global model inaccurate. This motivated the development of Byzantine-resilient federated learning algorithms, such as Krum, Trimmed mean, and FoolsGold. However, a recently developed targeted model poisoning attack showed that all prior defenses can be bypassed. The attack uses the intuition that simply by changing the sign of the gradient updates that the optimizer is computing, for a set of malicious clients, a model can be pushed away from the optimum to increase the test error rate. In this work, we develop TESSERACT\u2014a defense against this directed deviation attack, a state-of-the-art model poisoning attack. TESSERACT is based on the simple intuition that in a federated learning setting, certain patterns of gradient flips are indicative of an attack. This intuition is remarkably stable across different learning algorithms, models, and datasets. TESSERACT assigns reputation scores to the participating clients based on their behavior during the training phase and then takes a weighted contribution of the clients. We show that TESSERACT provides robustness against even an adaptive white-box version of the attack.", "keywords": "federated learning;aggregation;security;untargeted model poisoning attack", "primary_area": "", "supplementary_material": "/attachment/19568f9d519b1aefd89d4d879511e0327c853d3e.zip", "author": "Atul Sharma;Wei Chen;Joshua Christian Zhao;Qiang Qiu;Somali Chaterji;Saurabh Bagchi", "authorids": "~Atul_Sharma1;~Wei_Chen26;~Joshua_Christian_Zhao1;~Qiang_Qiu1;~Somali_Chaterji1;~Saurabh_Bagchi1", "gender": "M;M;M;;F;M", "homepage": "https://sharm438.github.io/;https://weichennone.github.io/myhomepage/;https://joshuaczhao.github.io;https://web.ics.purdue.edu/~qqiu/;https://schaterji.io;https://saurabhbagchi.us", "dblp": ";181/2832-124.html;280/1327;97/360;157/2828;57/95.html", "google_scholar": "0gIenGAAAAAJ;jVT7rQgAAAAJ;aKAajcUAAAAJ;jdLtt_YAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=3EfsOvYAAAAJ", "orcid": ";0000-0001-6722-4322;;;0000-0002-3651-6362;", "linkedin": "atul-sharma-8b7296137;;joshua-zhao-35638815a/;;;", "or_profile": "~Atul_Sharma1;~Wei_Chen26;~Joshua_Christian_Zhao1;~Qiang_Qiu1;~Somali_Chaterji1;~Saurabh_Bagchi1", "aff": "Purdue University;Purdue University;Purdue University;Purdue University;;Purdue University", "aff_domain": "purdue.edu;purdue.edu;purdue.edu;purdue.edu;;purdue.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;;Full Professor", "bibtex": "@misc{\nsharma2022tesseract,\ntitle={Tesseract: Gradient Flip Score to Secure Federated Learning against Model Poisoning Attacks},\nauthor={Atul Sharma and Wei Chen and Joshua Christian Zhao and Qiang Qiu and Somali Chaterji and Saurabh Bagchi},\nyear={2022},\nurl={https://openreview.net/forum?id=XIZaWGCPl0b}\n}", "github": "", "project": "", "reviewers": "EZwJ;YpRv;4KHr;VCnh", "site": "https://openreview.net/forum?id=XIZaWGCPl0b", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "85;21;108;155", "wc_summary_review": "46;6;41;161", "wc_main_review": "506;448;390;1382", "wc_review": "637;475;539;1698", "wc_reply_reviewers": "17;0;213;369", "wc_reply_authors": "1397;755;1480;1516", "reply_reviewers": "1;0;2;4", "reply_authors": "3;1;5;5", "recommendation_avg": [ 6.0, 1.224744871391589
], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 92.25, 48.25647624930772 ], "wc_summary_review_avg": [ 63.5, 58.36308764964376 ], "wc_main_review_avg": [ 681.5, 406.50799500132837 ], "wc_review_avg": [ 837.25, 500.29210217631856 ], "wc_reply_reviewers_avg": [ 149.75, 151.75535410653555 ], "wc_reply_authors_avg": [ 1287.0, 310.1668905605497 ], "reply_reviewers_avg": [ 1.75, 1.479019945774904 ], "reply_authors_avg": [ 3.5, 1.6583123951777 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.4714045207910316, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18293008064202323499&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "XJFGyJEBLuz", "title": "Born Again Neural Rankers", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Born Again neural Rankers (BAR) in the Learning to Rank (LTR) setting, where student rankers, trained in the Knowledge Distillation (KD) framework, are parameterized identically to their teachers. Unlike the existing ranking distillation work, which pursues a good trade-off between performance and efficiency, BAR adapts the idea of Born Again Networks (BAN) to ranking problems and significantly improves ranking performance of students over the teacher rankers without increasing model capacity. By examining the key differences between ranking distillation and common distillation for classification problems, we find that the key success factors of BAR lie in (1) an appropriate teacher score transformation function, and (2) a novel listwise distillation framework, both are specifically designed for ranking problems and are rarely studied in the knowledge distillation literature. Using the state-of-the-art neural ranking structures, BAR is able to push the limits of neural rankers above a recent rigorous benchmark study, and significantly outperforms strong gradient boosted decision tree based models on 7 out of 9 key metrics, the first time in the literature. In addition to the strong empirical results, we give theoretical explanations on why listwise distillation is effective for neural rankers. 
", "keywords": "learning to rank;knowledge distillation;neural networks", "primary_area": "", "supplementary_material": "", "author": "Zhen Qin;Le Yan;Yi Tay;Honglei Zhuang;Xuanhui Wang;Michael Bendersky;Marc Najork", "authorids": "~Zhen_Qin5;~Le_Yan1;~Yi_Tay1;~Honglei_Zhuang1;~Xuanhui_Wang1;~Michael_Bendersky1;~Marc_Najork1", "gender": "M;M;M;M;M;;M", "homepage": "http://alumni.cs.ucr.edu/~zqin001/;;http://yitay.net;https://hongleizhuang.github.io/;;http://bendersky.github.io/;http://marc.najork.org/", "dblp": ";67/2358;;10/9988;67/2661;80/4305;n/MarcNajork", "google_scholar": "Kv1yk3YAAAAJ;X_knTr4AAAAJ;VBclY_cAAAAJ;FxEDj4wAAAAJ;;C9mxM5IAAAAJ;7HeAnjwAAAAJ", "orcid": "0000-0001-6739-134X;;;0000-0001-8134-1509;;0000-0002-2941-6240;0000-0003-1423-0854", "linkedin": ";;;;;;najork/", "or_profile": "~Zhen_Qin5;~Le_Yan1;~Yi_Tay1;~Honglei_Zhuang1;~Xuanhui_Wang1;~Michael_Bendersky1;~Marc_Najork1", "aff": "Google Deepmind;Google;Google;Google DeepMind;Google;Google;Google Research", "aff_domain": "google.com;google.com;google.com;google.com;google.com;google.com;google.com", "position": "Researcher;Software Engineer;Research Scientist;Research Scientist;Software Engineer;Researcher;Director, Research Engineering", "bibtex": "@misc{\nqin2022born,\ntitle={Born Again Neural Rankers},\nauthor={Zhen Qin and Le Yan and Yi Tay and Honglei Zhuang and Xuanhui Wang and Michael Bendersky and Marc Najork},\nyear={2022},\nurl={https://openreview.net/forum?id=XJFGyJEBLuz}\n}", "github": "", "project": "", "reviewers": "TTzh;7kAC;wTyv", "site": "https://openreview.net/forum?id=XJFGyJEBLuz", "pdf_size": 0, "recommendation": "3;3;8", "confidence": "4;4;4", "correctness": "3;2;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "59;200;63", "wc_summary_review": "26;122;27", "wc_main_review": "177;210;304", "wc_review": "262;532;394", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "316;182;178", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.666666666666667, 2.357022603955158 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 107.33333333333333, 65.54557362798973 ], "wc_summary_review_avg": [ 58.333333333333336, 45.02098276236192 ], "wc_main_review_avg": [ 230.33333333333334, 53.80417166808619 ], "wc_review_avg": [ 396.0, 110.23611023616536 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 225.33333333333334, 64.13180871368661 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13221315490696634702&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "DeepMind;Google", "aff_unique_dep": "DeepMind;Google", "aff_unique_url": "https://deepmind.com;https://www.google.com", "aff_unique_abbr": "DeepMind;Google", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Space-Time Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6716", "id": "XJiajt89Omg", "poster": 
"", "openreview": "https://openreview.net/forum?id=XJiajt89Omg", "slides": "https://iclr.cc/virtual/2022/poster/6716", "video": "https://iclr.cc/virtual/2022/poster/6716", "author_site": "Samar Hadou, Charilaos Kanatsoulis, Alejandro Ribeiro", "tldr": "", "abstract": "We introduce space-time graph neural network (ST-GNN), a novel GNN architecture, tailored to jointly process the underlying space-time topology of time-varying network data. The cornerstone of our proposed architecture is the composition of time and graph convolutional filters followed by pointwise nonlinear activation functions. We introduce a generic definition of convolution operators that mimic the diffusion process of signals over its underlying support. On top of this definition, we propose space-time graph convolutions that are built upon a composition of time and graph shift operators. We prove that ST-GNNs with multivariate integral Lipschitz filters are stable to small perturbations in the underlying graphs as well as small perturbations in the time domain caused by time warping. Our analysis shows that small variations in the network topology and time evolution of a system does not significantly affect the performance of ST-GNNs. Numerical experiments with decentralized control systems showcase the effectiveness and stability of the proposed ST-GNNs.", "keywords": "ST-GNNs;GNNs;stability;graph-time perturbations", "primary_area": "", "supplementary_material": "", "author": "Samar Hadou;Charilaos I Kanatsoulis;Alejandro Ribeiro", "authorids": "~Samar_Hadou1;kanac@seas.upenn.edu;~Alejandro_Ribeiro1", "gender": "F;;M", "homepage": ";;https://alelab.seas.upenn.edu", "dblp": "304/2692.html;;32/15", "google_scholar": "https://scholar.google.com.eg/citations?user=FwL_RdEAAAAJ;;7mrPM4kAAAAJ", "orcid": ";;0000-0003-4230-9906", "linkedin": ";;", "or_profile": "~Samar_Hadou1;kanac@seas.upenn.edu;~Alejandro_Ribeiro1", "aff": "University of Pennsylvania;;University of Pennsylvania", "aff_domain": "seas.upenn.edu;;upenn.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nhadou2022spacetime,\ntitle={Space-Time Graph Neural Networks},\nauthor={Samar Hadou and Charilaos I Kanatsoulis and Alejandro Ribeiro},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XJiajt89Omg}\n}", "github": "", "project": "", "reviewers": "aJtV;G6p1;u6ex", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "3;3;3", "correctness": "3;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "19;55;135", "wc_summary_review": "61;91;44", "wc_main_review": "49;278;261", "wc_review": "129;424;440", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "110;804;382", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 69.66666666666667, 48.47909056719425 ], "wc_summary_review_avg": [ 65.33333333333333, 19.430788855719562 ], "wc_main_review_avg": [ 196.0, 104.17613290320709 ], "wc_review_avg": [ 331.0, 142.9848476820767 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 432.0, 285.5217446477005 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 
0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3240310455846129638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=XJiajt89Omg", "email": "seas.upenn.edu;;upenn.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "XK4GN6UCTfH", "title": "MS$^2$-Transformer: An End-to-End Model for MS/MS-assisted Molecule Identification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Mass spectrometry (MS) acts as an important technique for measuring the mass-to-charge ratios of ions and identifying the chemical structures of unknown metabolites. Practically, tandem mass spectrometry (MS/MS), which couples multiple standard MS in series and outputs fine-grained spectrum with fragmental information, has been popularly used. Manually interpreting the MS/MS spectrum into the molecules (i.e., the simplified molecular-input line-entry system, SMILES) is often costly and cumbersome, mainly due to the synthesis and labeling of isotopes and the requirement of expert knowledge. In this work, we regard molecule identification as a spectrum-to-sequence conversion problem and propose an end-to-end model, called MS$^2$-Transformer, to address this task. The chemical knowledge, defined through a fragmentation tree from the MS/MS spectrum, is incorporated into MS$^2$-Transformer. Our method achieves state-of-the-art results on two widely used benchmarks in molecule identification. 
To the best of our knowledge, MS$^2$-Transformer is the first machine learning model that can accurately identify structures (e.g., molecular graphs) from experimental MS/MS rather than chemical formulas/categories only (e.g., C$_6$H$_{12}$O$_6$/organic compound), demonstrating its great application potential in biomedical studies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mengji Zhang;Yingce Xia;Nian Wu;Kun Qian;Jianyang Zeng", "authorids": "~Mengji_Zhang1;~Yingce_Xia1;~Nian_Wu1;~Kun_Qian3;~Jianyang_Zeng2", "gender": ";M;;;M", "homepage": "https://github.com/mjDelta;https://www.microsoft.com/en-us/research/people/yinxia/;;;https://www.westlake.edu.cn/faculty/jianyang-zeng.html", "dblp": ";http://dblp.uni-trier.de/pers/hd/x/Xia:Yingce;;;", "google_scholar": ";GS5wRxYAAAAJ;;6FXooBkAAAAJ;", "orcid": ";;0000-0001-5656-5703;;0000-0003-0950-7716", "linkedin": ";;;;", "or_profile": "~Mengji_Zhang1;~Yingce_Xia1;~Nian_Wu1;~Kun_Qian3;~Jianyang_Zeng2", "aff": "Shanghai Jiaotong University;Microsoft;Tsinghua University;Shanghai Jiaotong University;Tsinghua University", "aff_domain": "sjtu.edu.cn;microsoft.com;tsinghua.edu.cn;sjtu.edu;tsinghua.edu.cn", "position": "PhD student;Researcher;Postdoc;Researcher;Associate Professor", "bibtex": "@misc{\nzhang2022mstransformer,\ntitle={{MS}\\${\\textasciicircum}2\\$-Transformer: An End-to-End Model for {MS}/{MS}-assisted Molecule Identification},\nauthor={Mengji Zhang and Yingce Xia and Nian Wu and Kun Qian and Jianyang Zeng},\nyear={2022},\nurl={https://openreview.net/forum?id=XK4GN6UCTfH}\n}", "github": "", "project": "", "reviewers": "PCDA;g6pz;ZGQp", "site": "https://openreview.net/forum?id=XK4GN6UCTfH", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "5;3;4", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "60;25;60", "wc_summary_review": "66;34;70", "wc_main_review": "216;687;199", "wc_review": "342;746;329", "wc_reply_reviewers": "155;0;0", "wc_reply_authors": "486;744;755", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 48.333333333333336, 16.49915822768611 ], "wc_summary_review_avg": [ 56.666666666666664, 16.110727964792765 ], "wc_main_review_avg": [ 367.3333333333333, 226.1449879072175 ], "wc_review_avg": [ 472.3333333333333, 193.58431984251433 ], "wc_reply_reviewers_avg": [ 51.666666666666664, 73.06770072260991 ], "wc_reply_authors_avg": [ 661.6666666666666, 124.29624111595473 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2KDslYQvOmYJ:scholar.google.com/&scioq=MS%24%5E2%24-Transformer:+An+End-to-End+Model+for+MS/MS-assisted+Molecule+Identification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft;Tsinghua University", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "SJTU;Microsoft;THU",
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "id": "XLjtkZbYUT", "title": "Mutual Information Minimization Based Disentangled Learning Framework For Causal Effect Estimation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning treatment effect from observational data is a fundamental problem in causal inference. Recently, disentangled representation learning methods, such as DR-CFR and DeR-CFR, have witnessed great success in treatment effect estimation, which aim to decompose covariates into three disjoint factors. However, we argue that these methods cannot identify underlying factors well, as they cannot obtain independent disentangled factors. Inspired by the success of mutual information minimization in disentangled representation learning, we propose a novel method called MimCE in this paper: Mutual Information Minimization based Disentangled Learning Framework for Causal Effect Estimation. MimCE mainly focuses on obtaining independent disentangled factors for treatment effect estimation and numerous experiments demonstrate that it performs better than the state-of-the-art methods both on the predictive performance and model stability.", "keywords": "causal inference;individual treatment effect;disentangled representation learning;mutual information", "primary_area": "", "supplementary_material": "", "author": "Mingyuan Cheng", "authorids": "~Mingyuan_Cheng4", "gender": "", "homepage": "", "dblp": "", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN", "orcid": "", "linkedin": "", "or_profile": "~Mingyuan_Cheng4", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ncheng2022mutual,\ntitle={Mutual Information Minimization Based Disentangled Learning Framework For Causal Effect Estimation},\nauthor={Mingyuan Cheng},\nyear={2022},\nurl={https://openreview.net/forum?id=XLjtkZbYUT}\n}", "github": "", "project": "", "reviewers": "CZbB;Pp2X;X9Kv;BhNa", "site": "https://openreview.net/forum?id=XLjtkZbYUT", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;5", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "35;85;161;63", "wc_summary_review": "38;35;89;28", "wc_main_review": "743;265;861;245", "wc_review": "816;385;1111;336", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 86.0, 46.78675026115834 ], "wc_summary_review_avg": [ 47.5, 24.23324163210527 ], "wc_main_review_avg": [ 528.5, 276.75395209463585 ], "wc_review_avg": [ 662.0, 319.5003912360672 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ygfjQRIiAzoJ:scholar.google.com/&scioq=Mutual+Information+Minimization+Based+Disentangled+Learning+Framework+For+Causal+Effect+Estimation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "GLASS: GNN with Labeling Tricks for Subgraph 
Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6916", "id": "XLxhEjKNbXj", "poster": "", "openreview": "https://openreview.net/forum?id=XLxhEjKNbXj", "slides": "https://iclr.cc/virtual/2022/poster/6916", "video": "https://iclr.cc/virtual/2022/poster/6916", "author_site": "Xiyuan Wang, Muhan Zhang", "tldr": "", "abstract": "Despite the remarkable achievements of Graph Neural Networks (GNNs) on graph representation learning, few works have tried to use them to predict properties of subgraphs in the whole graph. The existing state-of-the-art method SubGNN introduces an overly complicated subgraph-level GNN model which synthesizes three artificial channels each of which has two carefully designed subgraph-level message passing modules, yet only slightly outperforms a plain GNN which performs node-level message passing and then pools node embeddings within the subgraph. By analyzing SubGNN and plain GNNs, we find that the key for subgraph representation learning might be to distinguish nodes inside and outside the subgraph. With this insight, we propose an expressive and scalable labeling trick, namely max-zero-one, to enhance plain GNNs for subgraph tasks. The resulting model is called GLASS (GNN with LAbeling trickS for Subgraph). We theoretically characterize GLASS's expressive power. Compared with SubGNN, GLASS is more expressive, more scalable, and easier to implement. Experiments on eight benchmark datasets show that GLASS outperforms the strongest baseline by $14.8\\%$ on average. And ablation analysis shows that our max-zero-one labeling trick can boost the performance of a plain GNN by up to $105\\%$ in maximum, which illustrates the effectiveness of labeling trick on subgraph tasks. Furthermore, training a GLASS model only takes $37\\%$ time needed for a SubGNN on average. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/96c54aaacec89e6855925cba2d3bed81d7952a71.zip", "author": "Xiyuan Wang;Muhan Zhang", "authorids": "~Xiyuan_Wang1;~Muhan_Zhang1", "gender": ";M", "homepage": ";https://muhanzhang.github.io/", "dblp": "95/8542;157/5518", "google_scholar": ";https://scholar.google.com.hk/citations?user=OBBqkosAAAAJ", "orcid": ";0000-0002-7680-6401", "linkedin": "%E5%B8%8C%E5%85%83-%E7%8E%8B-969660221/;jerry-muhan-zhang-a33a1777/", "or_profile": "~Xiyuan_Wang1;~Muhan_Zhang1", "aff": "Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "position": "Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nwang2022glass,\ntitle={{GLASS}: {GNN} with Labeling Tricks for Subgraph Representation Learning},\nauthor={Xiyuan Wang and Muhan Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XLxhEjKNbXj}\n}", "github": "", "project": "", "reviewers": "VqVZ;UU3H;8Ehn;cEit", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "52;125;51;43", "wc_summary_review": "28;14;41;24", "wc_main_review": "438;383;150;193", "wc_review": "518;522;242;260", "wc_reply_reviewers": "272;0;0;156", "wc_reply_authors": "2487;1233;29;1773", "reply_reviewers": "2;0;0;2", "reply_authors": "5;3;1;5", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 67.75, 33.23683950077083 ], "wc_summary_review_avg": [ 26.75, 9.67923034130297 ], "wc_main_review_avg": [ 291.0, 122.0225389016308 ], "wc_review_avg": [ 385.5, 134.657899879658 ], "wc_reply_reviewers_avg": [ 107.0, 114.59057552870567 ], "wc_reply_authors_avg": [ 1380.5, 898.1518524169508 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 3.5, 1.6583123951777 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15747756943116527175&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=XLxhEjKNbXj", "email": "pku.edu.cn;pku.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "XNYOJD0QdBD", "title": "Personalized PageRank meets Graph Attention Networks", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "There has been a rising interest in graph neural networks (GNNs) for representation learning over the past few years. GNNs provide a general and efficient framework to learn from graph-structured data. However, GNNs typically only use the information of a very limited neighborhood for each node. A larger neighborhood would be desirable to provide the model with more information. However, increasing the size of the neighborhood is not trivial since neighborhood aggregation over many layers leads to over-smoothing. 
In this work, we incorporate the limit distribution of Personalized PageRank (PPR) into graph attention networks (GATs) to address this issue. Intuitively, message aggregation based on Personalized PageRank corresponds to infinitely many neighborhood aggregation layers. We show that our models outperform a variety of baseline models across all datasets used for our experiments. Our implementation is publicly available online.", "keywords": "GNN;Personalized PageRank;Graph Attention Network;Graph Neural Network", "primary_area": "", "supplementary_material": "", "author": "Julie Choi", "authorids": "~Julie_Choi1", "gender": "F", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "julie-choi-a5215115/", "or_profile": "~Julie_Choi1", "aff": "Amazon", "aff_domain": "amazon.com", "position": "Manager", "bibtex": "@misc{\nchoi2022personalized,\ntitle={Personalized PageRank meets Graph Attention Networks},\nauthor={Julie Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=XNYOJD0QdBD}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=XNYOJD0QdBD", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:z8aYIzZ1sZ0J:scholar.google.com/&scioq=Personalized+PageRank+meets+Graph+Attention+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon.com, Inc.", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Cross-Trajectory Representation Learning for Zero-Shot Generalization in RL", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6350", "id": "XOh5x-vxsrV", "poster": "", "openreview": "https://openreview.net/forum?id=XOh5x-vxsrV", "slides": "https://iclr.cc/virtual/2022/poster/6350", "video": "https://iclr.cc/virtual/2022/poster/6350", "author_site": "Bogdan Mazoure, Ahmed Ahmed, R Devon Hjelm, Andrey Kolobov, Patrick MacAlpine", "tldr": "", "abstract": "A highly desirable property of a reinforcement learning (RL) agent -- and a major difficulty for deep RL approaches -- is the ability to generalize policies learned on a few tasks over a high-dimensional observation space to similar tasks not seen during training. Many promising approaches to this challenge consider RL as a process of training two functions simultaneously: a complex nonlinear encoder that maps high-dimensional observations to a latent representation space, and a simple linear policy over this space. 
We posit that a superior encoder for zero-shot generalization in RL can be trained by using solely an auxiliary SSL objective if the training process encourages the encoder to map behaviorally similar observations to similar representations, as reward-based signals can cause overfitting in the encoder (Raileanu et al., 2021). We propose Cross-Trajectory Representation Learning (CTRL), a method that runs within an RL agent and conditions its encoder to recognize behavioral similarity in observations by applying a novel SSL objective to pairs of trajectories from the agent's policies. CTRL can be viewed as having the same effect as inducing a pseudo-bisimulation metric but, crucially, avoids the use of rewards and associated overfitting risks. Our experiments ablate various components of CTRL and demonstrate that in combination with PPO it achieves better generalization performance on the challenging Procgen benchmark suite (Cobbe et al., 2020). ", "keywords": "reinforcement learning;representation learning;self-supervised learning;procgen", "primary_area": "", "supplementary_material": "/attachment/57ad6e19097b0d2f44e552317e58ee1d656c8f3b.zip", "author": "Bogdan Mazoure;Ahmed M Ahmed;R Devon Hjelm;Andrey Kolobov;Patrick MacAlpine", "authorids": "~Bogdan_Mazoure1;~Ahmed_M_Ahmed1;~R_Devon_Hjelm1;~Andrey_Kolobov1;~Patrick_MacAlpine1", "gender": "M;M;M;M;", "homepage": "https://bmazoure.github.io;;;https://www.microsoft.com/en-us/research/people/akolobov/;", "dblp": ";;195/5928;95/3462;00/10482", "google_scholar": "https://scholar.google.ca/citations?user=NaxShlcAAAAJ;;https://scholar.google.ca/citations?user=68c5HfwAAAAJ;xEWgxBsAAAAJ;ZP4gs8oAAAAJ", "orcid": ";;;;", "linkedin": ";ahmed-ahmed-13914510a/;;;", "or_profile": "~Bogdan_Mazoure1;~Ahmed_M_Ahmed1;~R_Devon_Hjelm1;~Andrey_Kolobov1;~Patrick_MacAlpine1", "aff": "McGill University, McGill University;Stanford University;Microsoft;Microsoft;Sony AI", "aff_domain": "mail.mcgill.ca;stanford.edu;microsoft.com;microsoft.com;sony.com", "position": "PhD student;MS student;Researcher;Researcher;Research Scientist", "bibtex": "@inproceedings{\nmazoure2022crosstrajectory,\ntitle={Cross-Trajectory Representation Learning for Zero-Shot Generalization in {RL}},\nauthor={Bogdan Mazoure and Ahmed M Ahmed and R Devon Hjelm and Andrey Kolobov and Patrick MacAlpine},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XOh5x-vxsrV}\n}", "github": "", "project": "", "reviewers": "Kekc;7TDG;z5qk;DS9R", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;3;3;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "115;65;43;77", "wc_summary_review": "58;46;53;35", "wc_main_review": "547;338;193;905", "wc_review": "720;449;289;1017", "wc_reply_reviewers": "0;0;17;234", "wc_reply_authors": "400;342;524;461", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 75.0, 26.115129714401192 ], "wc_summary_review_avg": [ 48.0, 8.631338250816034 ], "wc_main_review_avg": [ 495.75, 267.7007424345327 ], "wc_review_avg": [ 618.75, 276.7691953595992 ], "wc_reply_reviewers_avg": [ 62.75, 99.11451710017054 ], "wc_reply_authors_avg": [ 431.75, 67.8762661023719 ], 
"reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8504220534031883718&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=XOh5x-vxsrV", "email": "mail.mcgill.ca;stanford.edu;microsoft.com;microsoft.com;sony.com", "author_num": 5, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "McGill University;Stanford University;Microsoft;Sony", "aff_unique_dep": ";;Microsoft Corporation;Sony AI", "aff_unique_url": "https://www.mcgill.ca;https://www.stanford.edu;https://www.microsoft.com;https://www.sony.com", "aff_unique_abbr": "McGill;Stanford;Microsoft;Sony AI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1;2", "aff_country_unique": "Canada;United States;Japan" }, { "id": "XSwpJ2bonX", "title": "Neural Circuit Architectural Priors for Embodied Control", "track": "main", "status": "Reject", "tldr": "", "abstract": "Artificial neural networks coupled with learning-based methods have enabled robots to tackle increasingly complex tasks, but often at the expense of requiring large amounts of learning experience. In nature, animals are born with highly structured connectivity in their brains and nervous systems that enables them to efficiently learn robust motor skills. Capturing some of this structure in artificial models may bring robots closer to matching animal performance and efficiency. In this paper, we present Neural Circuit Architectural Priors (NCAP), a set of reusable architectural components and design principles for deriving network architectures for embodied control from biological neural circuits. We apply this method to control a simulated agent performing a locomotion task and show that the NCAP architecture achieves comparable asymptotic performance with fully connected MLP architectures while dramatically improving data efficiency and requiring far fewer parameters. We further show through an ablation analysis that principled excitation/inhibition and initialization play significant roles in our NCAP architecture. Overall, our work suggests a way of advancing artificial intelligence and robotics research inspired by systems neuroscience.", "keywords": "neuroscience-inspired AI;robotics;motor control", "primary_area": "", "supplementary_material": "/attachment/9daed30384d30682bd2d51721443d34c5552df73.zip", "author": "Nikhil Xie Bhattasali;Anthony M. Zador;Tatiana A Engel", "authorids": "~Nikhil_Xie_Bhattasali1;zador@cshl.edu;~Tatiana_A_Engel1", "gender": "M;;", "homepage": ";;http://pni.princeton.edu/faculty/tatiana-engel", "dblp": ";;", "google_scholar": ";;Vq0BIO4AAAAJ", "orcid": ";;0000-0001-5842-9406", "linkedin": "nikhilbhattasali/;;", "or_profile": "~Nikhil_Xie_Bhattasali1;zador@cshl.edu;~Tatiana_A_Engel1", "aff": "Cold Spring Harbor Laboratory;;Cold Spring Harbor Laboratory", "aff_domain": "cshl.edu;;cshl.edu", "position": "Researcher;;Assistant Professor", "bibtex": "@misc{\nbhattasali2022neural,\ntitle={Neural Circuit Architectural Priors for Embodied Control},\nauthor={Nikhil Xie Bhattasali and Anthony M. 
Zador and Tatiana A Engel},\nyear={2022},\nurl={https://openreview.net/forum?id=XSwpJ2bonX}\n}", "github": "", "project": "", "reviewers": "GaKc;qSdv;AX4S;6cvx", "site": "https://openreview.net/forum?id=XSwpJ2bonX", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "2;3;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;1;3", "wc_summary_paper": "88;71;66;57", "wc_summary_review": "77;90;89;76", "wc_main_review": "480;633;346;154", "wc_review": "645;794;501;287", "wc_reply_reviewers": "130;0;0;0", "wc_reply_authors": "177;416;612;489", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 70.5, 11.280514172678478 ], "wc_summary_review_avg": [ 83.0, 6.519202405202649 ], "wc_main_review_avg": [ 403.25, 176.1240684858262 ], "wc_review_avg": [ 556.75, 187.04862335767137 ], "wc_reply_reviewers_avg": [ 32.5, 56.29165124598851 ], "wc_reply_authors_avg": [ 423.5, 158.61982852090088 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8891335267014433377&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;0", "aff_unique_norm": "Cold Spring Harbor Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.cshl.edu", "aff_unique_abbr": "CSHL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "XTzAhbVbKgq", "title": "Batched Lipschitz Bandits", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "In this paper, we study the batched Lipschitz bandit problem, where the expected reward is Lipschitz and the reward observations are collected in batches. We introduce a novel landscape-aware algorithm, called Batched Lipschitz Narrowing (BLiN), that naturally fits into the batched feedback setting. In particular, we show that for a $T$-step problem with Lipschitz reward of zooming dimension $d_z$, our algorithm achieves the theoretically optimal regret rate of $ \\widetilde{\\mathcal{O}} \\left( T^{\\frac{d_z + 1}{d_z + 2}} \\right) $ using only $ \\mathcal{O} \\left( \\frac{\\log T}{d_z} \\right) $ batches. For the lower bound, we show that in an environment with $B$ batches, for any policy $\\pi$, there exists a problem instance such that the expected regret is lower bounded by $ \\widetilde{\\Omega} \\left(R_z(T)^\\frac{1}{1-\\left(\\frac{1}{d+2}\\right)^B}\\right) $, where $R_z (T)$ is the regret lower bound for vanilla Lipschitz bandits that depends on the zooming dimension $d_z$, and $d$ is the dimension of the arm space. 
", "keywords": "Multi-armed bandits;online learning;batched bandits;Lipschitz bandits", "primary_area": "", "supplementary_material": "/attachment/21a23984f653d8b18b7db42977fdab6a94813ef1.zip", "author": "Yasong Feng;Zengfeng Huang;Tianyu Wang", "authorids": "~Yasong_Feng1;~Zengfeng_Huang1;~Tianyu_Wang4", "gender": ";M;", "homepage": ";https://zengfenghuang.github.io/;https://wangt1anyu.github.io", "dblp": "250/2394;97/9726;", "google_scholar": ";https://scholar.google.com.hk/citations?user=FwNBuXUAAAAJ;", "orcid": ";0000-0003-2671-7483;", "linkedin": ";;", "or_profile": "~Yasong_Feng1;~Zengfeng_Huang1;~Tianyu_Wang4", "aff": "Fudan University;Fudan University;Fudan University", "aff_domain": "fdu.edu;fudan.edu.cn;fudan.edu.cn", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nfeng2022batched,\ntitle={Batched Lipschitz Bandits},\nauthor={Yasong Feng and Zengfeng Huang and Tianyu Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=XTzAhbVbKgq}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=XTzAhbVbKgq", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:H-2-ZzaI-jQJ:scholar.google.com/&scioq=Batched+Lipschitz+Bandits&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Salient ImageNet: How to discover spurious features in Deep Learning?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5903", "id": "XVPqLyNxSyh", "poster": "", "openreview": "https://openreview.net/forum?id=XVPqLyNxSyh", "slides": "https://iclr.cc/virtual/2022/poster/5903", "video": "https://iclr.cc/virtual/2022/poster/5903", "author_site": "Sahil Singla, Soheil Feizi", "tldr": "", "abstract": "Deep neural networks can be unreliable in the real world especially when they heavily use {\\it spurious} features for their predictions. Focusing on image classifications, we define {\\it core features} as the set of visual features that are always a part of the object definition while {\\it spurious features} are the ones that are likely to {\\it co-occur} with the object but not a part of it (e.g., attribute ``fingers\" for class ``band aid\"). Traditional methods for discovering spurious features either require extensive human annotations (thus, not scalable), or are useful on specific models. 
In this work, we introduce a {\\it general} framework to discover a subset of spurious and core visual features used in inferences of a general model and localize them on a large number of images with minimal human supervision. Our methodology is based on this key idea: to identify spurious or core \\textit{visual features} used in model predictions, we identify spurious or core \\textit{neural features} (penultimate layer neurons of a robust model) via limited human supervision (e.g., using top 5 activating images per feature). We then show that these neural feature annotations {\\it generalize} extremely well to many more images {\\it without} any human supervision. We use the activation maps for these neural features as the soft masks to highlight spurious or core visual features. Using this methodology, we introduce the {\\it Salient Imagenet} dataset containing core and spurious masks for a large set of samples from Imagenet. Using this dataset, we show that several popular Imagenet models rely heavily on various spurious features in their predictions, indicating that standard accuracy alone is not sufficient to fully assess model performance, especially in safety-critical applications. Code is available at \\url{https://github.com/singlasahil14/salient_imagenet}.", "keywords": "interpretability;failure explanation;debugging;robustness", "primary_area": "", "supplementary_material": "/attachment/335d0a4d0307c714ec937fc7c941d3f6e1e926c2.zip", "author": "Sahil Singla;Soheil Feizi", "authorids": "~Sahil_Singla1;~Soheil_Feizi2", "gender": "M;M", "homepage": "https://singlasahil14.github.io/;https://www.cs.umd.edu/~sfeizi/", "dblp": "55/8911-2;57/2132", "google_scholar": "jjjbOI4AAAAJ;lptAmrMAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Sahil_Singla1;~Soheil_Feizi2", "aff": "University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsingla2022salient,\ntitle={Salient ImageNet: How to discover spurious features in Deep Learning?},\nauthor={Sahil Singla and Soheil Feizi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XVPqLyNxSyh}\n}", "github": "", "project": "", "reviewers": "iQgx;1LYa;DwGM", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;4", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "57;71;219", "wc_summary_review": "55;41;69", "wc_main_review": "241;294;874", "wc_review": "353;406;1162", "wc_reply_reviewers": "0;60;10", "wc_reply_authors": "851;458;1429", "reply_reviewers": "0;1;1", "reply_authors": "2;1;3", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 115.66666666666667, 73.29089681232968 ], "wc_summary_review_avg": [ 55.0, 11.430952132988164 ], "wc_main_review_avg": [ 469.6666666666667, 286.7244127884629 ], "wc_review_avg": [ 640.3333333333334, 369.5080814031301 ], "wc_reply_reviewers_avg": [ 23.333333333333332, 26.246692913372705 ], "wc_reply_authors_avg": [ 912.6666666666666, 398.8001448790553 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], 
"corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14829986418742964472&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=XVPqLyNxSyh", "email": "umd.edu;umd.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Contrastive Fine-grained Class Clustering via Generative Adversarial Networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7164", "id": "XWODe7ZLn8f", "poster": "", "openreview": "https://openreview.net/forum?id=XWODe7ZLn8f", "slides": "https://iclr.cc/virtual/2022/poster/7164", "video": "https://iclr.cc/virtual/2022/poster/7164", "author_site": "Yunji Kim, Jung-Woo Ha", "tldr": "", "abstract": "Unsupervised fine-grained class clustering is a practical yet challenging task due to the difficulty of feature representations learning of subtle object details. We introduce C3-GAN, a method that leverages the categorical inference power of InfoGAN with contrastive learning. We aim to learn feature representations that encourage a dataset to form distinct cluster boundaries in the embedding space, while also maximizing the mutual information between the latent code and its image observation. Our approach is to train a discriminator, which is also used for inferring clusters, to optimize the contrastive loss, where image-latent pairs that maximize the mutual information are considered as positive pairs and the rest as negative pairs. Specifically, we map the input of a generator, which was sampled from the categorical distribution, to the embedding space of the discriminator and let them act as a cluster centroid. In this way, C3-GAN succeeded in learning a clustering-friendly embedding space where each cluster is distinctively separable. Experimental results show that C3-GAN achieved the state-of-the-art clustering performance on four fine-grained image datasets, while also alleviating the mode collapse phenomenon. 
Code is available at https://github.com/naver-ai/c3-gan.", "keywords": "Unsupervised Fine-grained Class Clustering;Disentangled Representation Learning;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Yunji Kim;Jung-Woo Ha", "authorids": "~Yunji_Kim1;~Jung-Woo_Ha1", "gender": ";M", "homepage": ";https://aidljwha.wordpress.com/", "dblp": ";66/867-1", "google_scholar": ";https://scholar.google.co.kr/citations?user=eGj3ay4AAAAJ", "orcid": ";0000-0002-7400-7681", "linkedin": ";jung-woo-ha-b2782862?trk=hp-identity-name", "or_profile": "~Yunji_Kim1;~Jung-Woo_Ha1", "aff": ";NAVER AI Lab", "aff_domain": ";navercorp.com", "position": ";Head (Executive Director)", "bibtex": "@inproceedings{\nkim2022contrastive,\ntitle={Contrastive Fine-grained Class Clustering via Generative Adversarial Networks},\nauthor={Yunji Kim and Jung-Woo Ha},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XWODe7ZLn8f}\n}", "github": "", "project": "", "reviewers": "5wFE;imaQ;9p3B;odf8", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;3;3", "correctness": "4;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "95;93;55;122", "wc_summary_review": "22;83;31;76", "wc_main_review": "188;339;60;184", "wc_review": "305;515;146;382", "wc_reply_reviewers": "0;53;0;82", "wc_reply_authors": "556;874;21;590", "reply_reviewers": "0;1;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 91.25, 23.85765076448224 ], "wc_summary_review_avg": [ 53.0, 26.80485030736042 ], "wc_main_review_avg": [ 192.75, 98.88219000406494 ], "wc_review_avg": [ 337.0, 133.4297568010974 ], "wc_reply_reviewers_avg": [ 33.75, 35.27304211433995 ], "wc_reply_authors_avg": [ 510.25, 308.2745326815045 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2883627661337586326&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=XWODe7ZLn8f", "email": ";navercorp.com", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "NAVER Corporation", "aff_unique_dep": "NAVER AI Lab", "aff_unique_url": "https://www.naver.com", "aff_unique_abbr": "NAVER", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "XY1DWeh58WR", "title": "Deep Recurrent Neural Network Layers with Layerwise Loss", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Deep learning techniques have brought significant performance improvement to various areas of machine learning. Especially in the computer vision area, very deep networks such as ResNet have shown notable performance improvement. However, in speech recognition or language processing, such very deep networks have not been extensively employed. In this paper, we propose a very deep LSTM structure and its training strategy. In our training strategy, we first train a conventional model with several LSTM layers. 
One notable difference is that for the top LSTM layer of the initial model, the Connectionist Temporal Classification (CTC) loss is applied both to the input and output of this top LSTM layer. Once this initial model is sufficiently trained, this top layer is copied to construct a very deep LSTM stack. For this newly constructed stack, the CTC loss is applied to every output of the LSTM layer as well as the top of the stack. Experimental results show that this deep LSTM structure yields significantly better results than a conventional model with 5 ~ 6 layers and a comparable number of parameters.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chanwoo Kim", "authorids": "~Chanwoo_Kim2", "gender": "M", "homepage": "https://www.facebook.com/chanwcom", "dblp": "", "google_scholar": "pJoZXxYAAAAJ", "orcid": "", "linkedin": "chanwoo-kim-2628a622/?originalSubdomain=kr", "or_profile": "~Chanwoo_Kim2", "aff": "Samsung Research", "aff_domain": "samsung.com", "position": "Corporate Vice President", "bibtex": "@misc{\nkim2022deep,\ntitle={Deep Recurrent Neural Network Layers with Layerwise Loss},\nauthor={Chanwoo Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=XY1DWeh58WR}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=XY1DWeh58WR", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jpLfo_dN-j4J:scholar.google.com/&scioq=Deep+Recurrent+Neural+Network+Layers+with+Layerwise+Loss&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung Research", "aff_unique_url": "https://research.samsung.com", "aff_unique_abbr": "Samsung", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "EE-Net: Exploitation-Exploration Neural Networks in Contextual Bandits", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6185", "id": "X_ch3VrNSRg", "poster": "", "openreview": "https://openreview.net/forum?id=X_ch3VrNSRg", "slides": "https://iclr.cc/virtual/2022/poster/6185", "video": "https://iclr.cc/virtual/2022/poster/6185", "author_site": "Yikun Ban, Yuchen Yan, Arindam Banerjee, Jingrui He", "tldr": "", "abstract": "In this paper, we propose a novel neural exploration strategy in contextual bandits, EE-Net, distinct from the standard UCB-based and TS-based approaches. Contextual multi-armed bandits have been studied for decades with various applications. To solve the exploitation-exploration tradeoff in bandits, there are three main techniques: epsilon-greedy, Thompson Sampling (TS), and Upper Confidence Bound (UCB). 
In recent literature, linear contextual bandits have adopted ridge regression to estimate the reward function and combine it with TS or UCB strategies for exploration. However, this line of work explicitly assumes the reward is a linear function of the arm vectors, which may not be true in real-world datasets. To overcome this challenge, a series of neural bandit algorithms have been proposed, where a neural network is used to learn the underlying reward function and TS or UCB are adapted for exploration. Instead of calculating a large-deviation based statistical bound for exploration like previous methods, we propose "EE-Net", a novel neural-based exploration strategy. In addition to using a neural network (Exploitation network) to learn the reward function, EE-Net uses another neural network (Exploration network) to adaptively learn potential gains compared to the currently estimated reward for exploration. Then, a decision-maker is constructed to combine the outputs from the Exploitation and Exploration networks. We prove that EE-Net can achieve $\\mathcal{O}(\\sqrt{T\\log T})$ regret and show that EE-Net outperforms existing linear and neural contextual bandit baselines on real-world datasets. ", "keywords": "Contextual Bandits;Exploration Strategy;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Yikun Ban;Yuchen Yan;Arindam Banerjee;Jingrui He", "authorids": "~Yikun_Ban1;~Yuchen_Yan1;~Arindam_Banerjee4;~Jingrui_He1", "gender": ";;;F", "homepage": ";;https://arindam.cs.illinois.edu/;https://www.hejingrui.org", "dblp": ";;82/4807.html;34/2685", "google_scholar": ";;RY7cuPAAAAAJ;hXpZynkAAAAJ", "orcid": ";;;0000-0002-6429-6272", "linkedin": ";;;", "or_profile": "~Yikun_Ban1;~Yuchen_Yan1;~Arindam_Banerjee4;~Jingrui_He1", "aff": ";;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": ";;illinois.edu;illinois.edu", "position": ";;Professor;Associate Professor", "bibtex": "@inproceedings{\nban2022eenet,\ntitle={{EE}-Net: Exploitation-Exploration Neural Networks in Contextual Bandits},\nauthor={Yikun Ban and Yuchen Yan and Arindam Banerjee and Jingrui He},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=X_ch3VrNSRg}\n}", "github": "", "project": "", "reviewers": "KcQy;SiY7;BoiG;vjXZ", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;2;3", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "114;61;87;87", "wc_summary_review": "30;21;45;47", "wc_main_review": "163;134;289;230", "wc_review": "307;216;421;364", "wc_reply_reviewers": "0;23;0;85", "wc_reply_authors": "381;937;339;681", "reply_reviewers": "0;1;0;1", "reply_authors": "1;3;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 87.25, 18.73999733191016 ], "wc_summary_review_avg": [ 35.75, 10.755812382149477 ], "wc_main_review_avg": [ 204.0, 60.170590823092304 ], "wc_review_avg": [ 327.0, 75.70667077609475 ], "wc_reply_reviewers_avg": [ 27.0, 34.777866524558405 ], "wc_reply_authors_avg": [ 584.5, 242.5134016915354 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, 
"corr_recommendation_correctness": 1.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=959574665974730167&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=X_ch3VrNSRg", "email": ";;illinois.edu;illinois.edu", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Lossless Compression with Probabilistic Circuits", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6879", "id": "X_hByk2-5je", "poster": "", "openreview": "https://openreview.net/forum?id=X_hByk2-5je", "slides": "https://iclr.cc/virtual/2022/poster/6879", "video": "https://iclr.cc/virtual/2022/poster/6879", "author_site": "Anji Liu, Stephan Mandt, Guy Van den Broeck", "tldr": "", "abstract": "Despite extensive progress on image generation, common deep generative model architectures are not easily applied to lossless compression. For example, VAEs suffer from a compression cost overhead due to their latent variables. This overhead can only be partially eliminated with elaborate schemes such as bits-back coding, often resulting in poor single-sample compression rates. To overcome such problems, we establish a new class of tractable lossless compression models that permit efficient encoding and decoding: Probabilistic Circuits (PCs). These are a class of neural networks involving $|p|$ computational units that support efficient marginalization over arbitrary subsets of the $D$ feature dimensions, enabling efficient arithmetic coding. We derive efficient encoding and decoding schemes that both have time complexity $\\mathcal{O} (\\log(D) \\cdot |p|)$, where a naive scheme would have linear costs in $D$ and $|p|$, making the approach highly scalable. Empirically, our PC-based (de)compression algorithm runs 5-40 times faster than neural compression algorithms that achieve similar bitrates. By scaling up the traditional PC structure learning pipeline, we achieve state-of-the-art results on image datasets such as MNIST. Furthermore, PCs can be naturally integrated with existing neural compression algorithms to improve the performance of these base models on natural image datasets. 
Our results highlight the potential impact that non-standard learning architectures may have on neural data compression.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/47fa820f74007d38214ffa0555e1cfbe2ca8b6a3.zip", "author": "Anji Liu;Stephan Mandt;Guy Van den Broeck", "authorids": "~Anji_Liu1;~Stephan_Mandt1;~Guy_Van_den_Broeck1", "gender": "M;M;M", "homepage": "https://liuanji.github.io/;http://web.cs.ucla.edu/~guyvdb/;https://www.stephanmandt.com", "dblp": "227/8622;96/7521.html;147/5018", "google_scholar": "k_4zYecAAAAJ;d0KQ9z0AAAAJ;HOrGe7wAAAAJ", "orcid": ";0000-0003-3434-2503;", "linkedin": "anji-liu-7610b7190/;guyvdb;stephan-mandt-8702795a/", "or_profile": "~Anji_Liu1;~Guy_Van_den_Broek1;~Stephan_M_Mandt1", "aff": "University of California, Los Angeles;University of California, Los Angeles;University of California, Irvine", "aff_domain": "ucla.edu;ucla.edu;uci.edu", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2022lossless,\ntitle={Lossless Compression with Probabilistic Circuits},\nauthor={Anji Liu and Stephan Mandt and Guy Van den Broeck},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=X_hByk2-5je}\n}", "github": "", "project": "", "reviewers": "mSD5;WvLE;T76k;H7Ew", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;2;3;4", "correctness": "3;3;2;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "45;59;151;98", "wc_summary_review": "36;23;51;43", "wc_main_review": "275;337;757;241", "wc_review": "356;419;959;382", "wc_reply_reviewers": "160;0;189;0", "wc_reply_authors": "596;588;904;438", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.25, 41.105808591973954 ], "wc_summary_review_avg": [ 38.25, 10.280442597476044 ], "wc_main_review_avg": [ 402.5, 207.54457352578507 ], "wc_review_avg": [ 529.0, 249.26792814158824 ], "wc_reply_reviewers_avg": [ 87.25, 87.85036994799738 ], "wc_reply_authors_avg": [ 631.5, 169.44836971774026 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6488856845230502, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13531638226043466967&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=X_hByk2-5je", "email": "ucla.edu;ucla.edu;uci.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Los Angeles;University of California, Irvine", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.uci.edu", "aff_unique_abbr": "UCLA;UCI", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Los Angeles;Irvine", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Xa8sKVPnDJq", "title": "Composing Features: Compositional Model Augmentation for Steerability of Music Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Music is a combinatorial art. 
Given a starting sequence, many continuations are possible, yet often only one is written down. With generative models, we can explore many. However, finding a continuation with specific combinations of features (such as rising pitches, with block chords played in syncopated rhythm) can take many trials.\nTo tackle the combinatorial nature of composing features, we propose a compositional approach to steering music transformers, building on lightweight fine-tuning methods such as prefix tuning and bias tuning. We introduce a novel contrastive loss function that enables us to steer compositional models over logical features using supervised learning. We examine the difficulty in steering based on whether features musically follow a prime or not, using existing music as a proxy. We show that with a relatively small number of extra parameters, our method allows bias tuning to perform successful fine-tuning in both the single-feature and compositional setting.", "keywords": "applications;music;controllable generation;compositionality;transformer;finetuning", "primary_area": "", "supplementary_material": "", "author": "Halley Young;Vincent Dumoulin;Pablo Samuel Castro;Jesse Engel;Cheng-Zhi Anna Huang", "authorids": "~Halley_Young1;~Vincent_Dumoulin1;~Pablo_Samuel_Castro1;~Jesse_Engel1;~Cheng-Zhi_Anna_Huang1", "gender": "F;M;M;M;F", "homepage": "https://www.seas.upenn.edu/~halleyy/;;https://psc-g.github.io/;;", "dblp": "231/5126;133/8606;05/5455;;59/9006", "google_scholar": ";https://scholar.google.ca/citations?user=mZfgLA4AAAAJ;https://scholar.google.ca/citations?user=jn5r6TsAAAAJ;Sc7qOfcAAAAJ;NRz_EVgAAAAJ", "orcid": ";;;;", "linkedin": ";;pablo-samuel-castro-2113641b/;;", "or_profile": "~Halley_Young1;~Vincent_Dumoulin1;~Pablo_Samuel_Castro1;~Jesse_Engel1;~Cheng-Zhi_Anna_Huang1", "aff": "Google;Google;Google;Google Brain;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com", "position": "Researcher;Research Scientist;Researcher;Research Scientist;Researcher", "bibtex": "@misc{\nyoung2022composing,\ntitle={Composing Features: Compositional Model Augmentation for Steerability of Music Transformers},\nauthor={Halley Young and Vincent Dumoulin and Pablo Samuel Castro and Jesse Engel and Cheng-Zhi Anna Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=Xa8sKVPnDJq}\n}", "github": "", "project": "", "reviewers": "ezPb;qjek;ZYeZ;LG3E", "site": "https://openreview.net/forum?id=Xa8sKVPnDJq", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;3;3", "correctness": "3;2;2;4", "technical_novelty": "2;4;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "50;68;94;159", "wc_summary_review": "99;59;74;17", "wc_main_review": "251;509;607;328", "wc_review": "400;636;775;504", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "145;1478;447;54", "reply_reviewers": "0;0;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 92.75, 41.324175732856425 ], "wc_summary_review_avg": [ 62.25, 29.77729839995563 ], "wc_main_review_avg": [ 423.75, 141.29645253862532 ], "wc_review_avg": [ 578.75, 140.82857487030108 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 531.0, 565.7715970248065 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], 
"corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2066467371884208741&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Xb2YyVApEj6", "title": "MaiT: integrating spatial locality into image transformers with attention masks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Though image transformers have shown competitive results with convolutional neural networks in computer vision tasks, lacking inductive biases such as locality still poses problems in terms of model efficiency especially for embedded applications. In this work, we address this issue by introducing attention masks to incorporate spatial locality into self-attention heads of transformers. Local dependencies are captured with masked attention heads along with global dependencies captured by original unmasked attention heads. With Masked attention image Transformer \u2013 MaiT, top-1 accuracy increases by up to 1.0\\% compared to DeiT, without extra parameters, computation, or external training data. Moreover, attention masks regulate the training of attention maps, which facilitates the convergence and improves the accuracy of deeper transformers. Masked attention heads guide the model to focus on local information in early layers and promote diverse attention maps in latter layers. Deep MaiT improves the top-1 accuracy by up to 1.5\\% compared to CaiT with fewer parameters and less FLOPs. 
Encoding locality with attention masks requires no extra parameter or structural change, and thus it can be combined with other techniques for further improvement in vision transformers.", "keywords": "vision transformer;image classification;deep learning;computer vision", "primary_area": "", "supplementary_material": "", "author": "Ling Li;Ali Shafiee;Joseph H Hassoun", "authorids": "~Ling_Li7;~Ali_Shafiee1;~Joseph_H_Hassoun1", "gender": "F;;M", "homepage": ";;", "dblp": ";;", "google_scholar": "https://scholar.google.com/citations?hl=en;RutLGUMAAAAJ;", "orcid": ";;", "linkedin": "ling-li-9a035a42/;ali-shafiei-44921947/;https://www.linkedin.com/pub/dir/Joseph/Hassoun", "or_profile": "~Ling_Li7;~Ali_Shafiee1;~Joseph_H_Hassoun1", "aff": "Samsung;;", "aff_domain": "samsung.com;;", "position": "Researcher;;", "bibtex": "@misc{\nli2022mait,\ntitle={MaiT: integrating spatial locality into image transformers with attention masks },\nauthor={Ling Li and Ali Shafiee and Joseph H Hassoun},\nyear={2022},\nurl={https://openreview.net/forum?id=Xb2YyVApEj6}\n}", "github": "", "project": "", "reviewers": "71Sc;kZat;PxTP;wzwv", "site": "https://openreview.net/forum?id=Xb2YyVApEj6", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;5;5;3", "correctness": "2;3;4;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "1;3;0;2", "wc_summary_paper": "36;65;96;48", "wc_summary_review": "28;22;25;39", "wc_main_review": "382;294;96;70", "wc_review": "446;381;217;157", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "469;163;288;331", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 61.25, 22.554101622543072 ], "wc_summary_review_avg": [ 28.5, 6.422616289332565 ], "wc_main_review_avg": [ 210.5, 131.56272268389705 ], "wc_review_avg": [ 300.25, 117.4890952386646 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 312.75, 109.29861618520154 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.8528028654224417, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14784957348176128440&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung", "aff_unique_url": "https://www.samsung.com", "aff_unique_abbr": "Samsung", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "XbatFr32NRm", "title": "Generalizing MLPs With Dropouts, Batch Normalization, and Skip Connections", "track": "main", "status": "Reject", "tldr": "", "abstract": "A multilayer perceptron (MLP) is typically made of multiple fully connected layers with nonlinear activation functions. There have been several approaches to make them better (e.g., faster convergence, better convergence limit, etc.). But the research lacks structured ways to test them. We test different MLP architectures by carrying out experiments on the age and gender datasets. We empirically show that by whitening inputs before every linear layer and adding skip connections, our proposed MLP architecture can result in better performance. 
Since the whitening process includes dropouts, it can also be used to approximate Bayesian inference. We have open sourced our code, and released models and docker images at https://github.com/anonymous.", "keywords": "MLP;batch normalization;dropout;residual connections;Bayesian inference", "primary_area": "", "supplementary_material": "", "author": "Taewoon Kim", "authorids": "~Taewoon_Kim1", "gender": "", "homepage": "https://taewoon.kim/", "dblp": "00/3896-2", "google_scholar": "dJ4ksGoAAAAJ", "orcid": "0000-0003-2892-0194", "linkedin": "tae898/", "or_profile": "~Taewoon_Kim1", "aff": "Vrije Universiteit Amsterdam", "aff_domain": "vu.nl", "position": "PhD student", "bibtex": "@misc{\nkim2022generalizing,\ntitle={Generalizing {MLP}s With Dropouts, Batch Normalization, and Skip Connections},\nauthor={Taewoon Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=XbatFr32NRm}\n}", "github": "", "project": "", "reviewers": "8xR5;qoCZ;8daM;7ffG", "site": "https://openreview.net/forum?id=XbatFr32NRm", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;5;4", "correctness": "4;1;2;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "130;81;20;56", "wc_summary_review": "17;42;7;44", "wc_main_review": "167;342;48;230", "wc_review": "314;465;75;330", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 71.75, 40.01484099681017 ], "wc_summary_review_avg": [ 27.5, 15.913830462839549 ], "wc_main_review_avg": [ 196.75, 106.31880125358826 ], "wc_review_avg": [ 296.0, 140.4296977138383 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12950046067682335449&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Vrije Universiteit Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.vu.nl", "aff_unique_abbr": "VU Amsterdam", "aff_country_unique_index": "0", "aff_country_unique": "Netherlands" }, { "title": "Predicting Physics in Mesh-reduced Space with Temporal Attention", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6494", "id": "XctLdNfCmP", "poster": "", "openreview": "https://openreview.net/forum?id=XctLdNfCmP", "slides": "https://iclr.cc/virtual/2022/poster/6494", "video": "https://iclr.cc/virtual/2022/poster/6494", "author_site": "XU HAN, Han Gao, Tobias Pfaff, Jian-Xun Wang, Liping Liu", "tldr": "", "abstract": "Auto-regressive sequence models for physics prediction are often restricted to low-dimensional systems, as memory cost increases with both spatial extents and sequence length. On the other hand, graph-based next-step prediction models have recently been very successful in modeling complex high-dimensional physical systems on irregular meshes, but suffer from error accumulation and drift, due to their short temporal attention span. In this paper, we present a method that marries the strengths of both approaches. 
We use a GNN to locally summarize features and create a coarsened, compact mesh representation of the system state, onto which we apply a transformer-style temporal attention module. We use a second GNN to decode these predictions back to a full-sized graph and perform fine-scale updates. Our method outperforms a competitive GNN baseline on three complex fluid dynamics prediction tasks, from sonic shocks to vascular flow. We demonstrate stable rollouts without the need for training noise and show perfectly phase-stable predictions even for very long sequences. More broadly, we believe our approach paves the way to bringing the benefits of attention-based sequence models to solving high-dimensional complex physics tasks.", "keywords": "fluid dynamics;graph neural network;attention neural network", "primary_area": "", "supplementary_material": "/attachment/852978b8f8b6743955ef5f06733f3c3e8dddeeaf.zip", "author": "XU HAN;Han Gao;Tobias Pfaff;Jian-Xun Wang;Liping Liu", "authorids": "~XU_HAN6;~Han_Gao3;~Tobias_Pfaff1;~Jian-Xun_Wang1;~Liping_Liu1", "gender": "M;M;M;M;", "homepage": ";https://gaohan1234.github.io/;http://tobiaspfaff.com;http://sites.nd.edu/jianxun-wang/;https://www.eecs.tufts.edu/~liulp/", "dblp": ";;67/7591;163/4396;47/5615-1", "google_scholar": "eFsFAJoAAAAJ;ozQz4CQAAAAJ;3oUgDKQAAAAJ;1cXHUD4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-7733-8996;;;0000-0002-3690-3928", "linkedin": ";%E6%B6%B5-han-%E9%AB%98-gao-87038a143/;;;", "or_profile": "~XU_HAN6;~Han_Gao3;~Tobias_Pfaff1;~Jian-Xun_Wang1;~Liping_Liu1", "aff": "Tufts University;University of Notre Dame;Deepmind;University of Notre Dame;Tufts University", "aff_domain": "tufts.edu;nd.edu;google.com;nd.edu;tufts.edu", "position": "PhD student;PhD student;Research scientist;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhan2022predicting,\ntitle={Predicting Physics in Mesh-reduced Space with Temporal Attention},\nauthor={XU HAN and Han Gao and Tobias Pfaff and Jian-Xun Wang and Liping Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XctLdNfCmP}\n}", "github": "", "project": "", "reviewers": "X3ET;UQq7;7ipR;PRtV", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;5;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "90;113;36;36", "wc_summary_review": "101;84;40;16", "wc_main_review": "391;308;225;343", "wc_review": "582;505;301;395", "wc_reply_reviewers": "50;21;0;0", "wc_reply_authors": "810;778;1313;351", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 33.74444398712179 ], "wc_summary_review_avg": [ 60.25, 33.88491552298751 ], "wc_main_review_avg": [ 316.75, 60.615076507416866 ], "wc_review_avg": [ 445.75, 106.77400198550207 ], "wc_reply_reviewers_avg": [ 17.75, 20.498475553074673 ], "wc_reply_authors_avg": [ 813.0, 340.8364710532017 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 116, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=14408846050126398666&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=XctLdNfCmP", "email": "tufts.edu;nd.edu;google.com;nd.edu;tufts.edu", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Tufts University;University of Notre Dame;DeepMind", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tufts.edu;https://www.nd.edu;https://deepmind.com", "aff_unique_abbr": "Tufts;Notre Dame;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "Xd6T7cT7vwj", "title": "Strongly Self-Normalizing Neural Networks with Applications to Implicit Representation Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent studies have show that wide neural networks with orthogonal linear layers and Gaussian Poincar\u00e9 normalized activation functions avoid vanishing and exploding gradients for input vectors with the correct magnitude. This paper introduces a strengthening of the condition that the activation function must be Gaussian Poincar\u00e9 normalized which creates robustness to deviations from standard normal distribution in the pre-activations, thereby reducing the dependence on the requirement that the network is wide and that the input vector has the correct magnitude. In implicit representation learning this allows the training of deep networks of this type where the linear layers are no longer constrained to be orthogonal linear transformations. Networks of this type can be fitted to a reference image to 1/10th the mean square error achievable with previous methods. Herein is also given an improved positional encoding for implicit representation learning of two-dimensional images and a small-batch training procedure for fitting of neural networks to images which allows fitting in fewer epochs, leading to substantial improvement in training time.", "keywords": "Strongly Self-Normalizing Neural Networks with Applications to Implicit Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Marcus L\u00e5ng", "authorids": "~Marcus_L\u00e5ng1", "gender": "M", "homepage": "https://github.com/mlaang", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Marcus_L\u00e5ng1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nl{\\r{a}}ng2022strongly,\ntitle={Strongly Self-Normalizing Neural Networks with Applications to Implicit Representation Learning },\nauthor={Marcus L{\\r{a}}ng},\nyear={2022},\nurl={https://openreview.net/forum?id=Xd6T7cT7vwj}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=Xd6T7cT7vwj", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], 
"authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cvR7z163U5sJ:scholar.google.com/&scioq=Strongly+Self-Normalizing+Neural+Networks+with+Applications+to+Implicit+Representation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "XeqjsCVLk1m", "title": "Tell me why!\u2014Explanations support learning relational and causal structure", "track": "main", "status": "Reject", "tldr": "", "abstract": "Explanations play a considerable role in human learning, especially in areas that remain major challenges for AI\u2014forming abstractions, and learning about the relational and causal structure of the world. Here, we explore whether machine learning models might likewise benefit from explanations. We outline a family of relational tasks that involve selecting an object that is the odd one out in a set (i.e., unique along one of many possible feature dimensions). Odd-one-out tasks require agents to reason over multi-dimensional relationships among a set of objects. We show that agents do not learn these tasks well from reward alone, but achieve >90% performance when they are also trained to generate language explaining object properties or why a choice is correct or incorrect. In further experiments, we show how predicting explanations enables agents to generalize appropriately from ambiguous, causally-confounded training, and even to meta-learn to perform experimental interventions to identify causal structure. We show that explanations help overcome the tendency of agents to fixate on simple features, and explore which aspects of explanations make them most beneficial. Our results suggest that learning from explanations is a powerful principle that could offer a promising path towards training more robust and general machine learning systems.", "keywords": "Explanation;RL;Language;Relations;Causality", "primary_area": "", "supplementary_material": "/attachment/5e8a7c76077cd1561b71e78a09782869c8b2bb27.zip", "author": "Andrew Kyle Lampinen;Nicholas Andrew Roy;Ishita Dasgupta;Stephanie C.Y. 
Chan;Allison Tam;Chen Yan;Adam Santoro;Neil Charles Rabinowitz;Jane X Wang;Felix Hill", "authorids": "~Andrew_Kyle_Lampinen1;nroy@deepmind.com;~Ishita_Dasgupta1;~Stephanie_C.Y._Chan1;~Allison_Tam1;~Chen_Yan2;~Adam_Santoro1;~Neil_Charles_Rabinowitz1;~Jane_X_Wang1;~Felix_Hill1", "gender": "M;;;F;;;M;M;;", "homepage": "https://github.com/google/BIG-bench;;;https://scychan.github.io/;;http://example.com;;;;https://fh295.github.io/", "dblp": "https://dblp.uni-trier.de/pers/hd/l/Lampinen:Andrew_K=;;169/6218;255/7866;;;180/5951;156/0289;;116/0509", "google_scholar": "_N44XxAAAAAJ;;;https://scholar.google.com/citations?hl=en;;;;https://scholar.google.co.uk/citations?user=AgUYQMwAAAAJ;;https://scholar.google.co.uk/citations?user=4HLUnhIAAAAJ", "orcid": ";;;;;;;;;", "linkedin": ";;idasgupta6/;scychan;allison-tam/;;;;;", "or_profile": "~Andrew_Kyle_Lampinen1;nroy@deepmind.com;~Ishita_Dasgupta1;~Stephanie_C.Y._Chan1;~Allison_Tam1;~Chen_Yan2;~Adam_Santoro1;~Neil_Charles_Rabinowitz1;~Jane_X_Wang1;~Felix_Hill1", "aff": "Google DeepMind;;Google DeepMind;Google DeepMind;;Google DeepMind;Google;Google DeepMind;;Google", "aff_domain": "google.com;;deepmind.com;deepmind.com;;deepmind.com;google.com;google;;google.com", "position": "Research Scientist;;Researcher;Research Scientist;;Research Scientist;Research Scientist;Research Scientist;;Researcher", "bibtex": "@misc{\nlampinen2022tell,\ntitle={Tell me why!{\\textemdash}Explanations support learning relational and causal structure},\nauthor={Andrew Kyle Lampinen and Nicholas Andrew Roy and Ishita Dasgupta and Stephanie C.Y. Chan and Allison Tam and Chen Yan and Adam Santoro and Neil Charles Rabinowitz and Jane X Wang and Felix Hill},\nyear={2022},\nurl={https://openreview.net/forum?id=XeqjsCVLk1m}\n}", "github": "", "project": "", "reviewers": "AvzX;x1cM;DFpa;8zdV", "site": "https://openreview.net/forum?id=XeqjsCVLk1m", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;2;3", "correctness": "1;3;3;4", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "51;96;71;54", "wc_summary_review": "54;109;47;9", "wc_main_review": "456;524;131;524", "wc_review": "561;729;249;587", "wc_reply_reviewers": "354;0;0;0", "wc_reply_authors": "1315;425;211;311", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 68.0, 17.874562931719478 ], "wc_summary_review_avg": [ 54.75, 35.695763053897586 ], "wc_main_review_avg": [ 408.75, 162.74423952939165 ], "wc_review_avg": [ 531.5, 175.1877564215034 ], "wc_reply_reviewers_avg": [ 88.5, 153.28649646984564 ], "wc_reply_authors_avg": [ 565.5, 439.2980195721351 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.9271726499455307, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9093718010434750052&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", 
"aff_country_unique_index": "0;0;0;0;1;0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "Xg47v73CDaj", "title": "Non-deep Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Depth is the hallmark of deep neural networks. But more depth means more sequential computation and higher latency. This begs the question -- is it possible to build high-performing ``non-deep\" neural networks? We show that it is. To do so, we use parallel subnetworks instead of stacking one layer after another. This helps effectively reduce depth while maintaining high performance. By utilizing parallel substructures, we show, for the first time, that a network with a depth of just 12 can achieve top-1 accuracy over 80% on ImageNet, 96% on CIFAR10, and 81% on CIFAR100. We also show that a network with a low-depth (12) backbone can achieve an AP of 48% on MS-COCO. We analyze the scaling rules for our design and show how to increase performance without changing the network's depth. Finally, we provide a proof of concept for how non-deep networks could be used to build low-latency recognition systems. We will open-source our code.", "keywords": "non-deep networks", "primary_area": "", "supplementary_material": "", "author": "Ankit Goyal;Alexey Bochkovskiy;Jia Deng;Vladlen Koltun", "authorids": "~Ankit_Goyal1;alexeyab84@gmail.com;~Jia_Deng1;~Vladlen_Koltun1", "gender": "M;;M;M", "homepage": "http://imankgoyal.github.io/;;;http://vladlen.info/", "dblp": "89/10051-1;;07/6526-1.html;66/5458.html", "google_scholar": "RhN6jKIAAAAJ;;U3Eub-EAAAAJ;kg4bCpgAAAAJ", "orcid": ";;;0000-0003-0858-0970", "linkedin": ";;;vladlenkoltun/", "or_profile": "~Ankit_Goyal1;alexeyab84@gmail.com;~Jia_Deng1;~Vladlen_Koltun1", "aff": "Princeton University;;Princeton University;Apple", "aff_domain": "princeton.edu;;princeton.edu;apple.com", "position": "PhD student;;Assistant Professor;Distinguished Scientist", "bibtex": "@misc{\ngoyal2022nondeep,\ntitle={Non-deep Networks},\nauthor={Ankit Goyal and Alexey Bochkovskiy and Jia Deng and Vladlen Koltun},\nyear={2022},\nurl={https://openreview.net/forum?id=Xg47v73CDaj}\n}", "github": "", "project": "", "reviewers": "75Wm;pTPq;Z9wZ;wcwX", "site": "https://openreview.net/forum?id=Xg47v73CDaj", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;4;5;5", "correctness": "2;3;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "90;43;95;53", "wc_summary_review": "60;5;5;35", "wc_main_review": "403;332;226;171", "wc_review": "553;380;326;259", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "604;816;498;341", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 70.25, 22.598395960775623 ], "wc_summary_review_avg": [ 26.25, 23.01494079940246 ], "wc_main_review_avg": [ 283.0, 90.26904231241184 ], "wc_review_avg": [ 379.5, 108.95526604987938 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 564.75, 172.6171703510401 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 118, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12528683203894514853&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;1", "aff_unique_norm": "Princeton University;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "https://www.princeton.edu;https://www.apple.com", "aff_unique_abbr": "Princeton;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "XgS9YPYtdj", "title": "Improved Generalization Risk Bounds for Meta-Learning with PAC-Bayes-kl Analysis", "track": "main", "status": "Withdraw", "tldr": "", "abstract": " By incorporating knowledge from observed tasks, PAC-Bayes meta-learning algorithms aim to construct a hyperposterior from which an informative prior is sampled for fast adaptation to novel tasks. The goal of PAC-Bayes meta-learning theory is thus to propose an upper bound on the generalization risk over a novel task of the learned hyperposterior. In this work, we first generalize the tight PAC-Bayes-kl bound from independently and identically distributed (i.i.d.) setting to non-i.i.d. meta-learning setting. Based on the extended PAC-Bayes-kl bound, we further provide three improved PAC-Bayes generalization bounds for meta-learning, leading to better asymptotic behaviour than existing results. By minimizing objective functions derived from the improved bounds, we develop three PAC-Bayes meta-learning algorithms for classification. Moreover, we employ localized PAC-Bayes analysis for meta-learning to yield insights into the role of hyperposterior for learning a novel task. In particular, we identify that when the number of training task is large, utilizing a prior generated from an informative hyperposterior can achieve the same order of PAC-Bayes-kl bound as that obtained through setting a localized distribution-dependent prior for a novel task. Experiments with deep neural networks show that minimizing our bounds can achieve competitive performance on novel tasks w.r.t. 
previous PAC-Bayes meta-learning methods as well as PAC-Bayes single-task learning methods with localized prior.", "keywords": "PAC-Bayes bounds;meta-learning;localized PAC-Bayes analysis", "primary_area": "", "supplementary_material": "/attachment/c790a2895ff39af524526a16fd81d1034e7fb317.zip", "author": "Jiechao Guan;Zhiwu Lu;Yong Liu", "authorids": "~Jiechao_Guan2;~Zhiwu_Lu1;~Yong_Liu7", "gender": "M;M;M", "homepage": "https://gsai.ruc.edu.cn/luzhiwu;https://iie-liuyong.github.io;", "dblp": "53/5234;29/4867-18;228/8337", "google_scholar": "OUXS8doAAAAJ;vVhmzbAAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0002-6739-621X;", "linkedin": ";;", "or_profile": "~Zhiwu_Lu1;~Yong_Liu7;~Jiechao_Guan1", "aff": "Renmin University of China;Renmin University of China;School of Information, Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;ruc.edu.cn", "position": "Full Professor;Associate Professor;PhD student", "bibtex": "@misc{\nguan2022improved,\ntitle={Improved Generalization Risk Bounds for Meta-Learning with {PAC}-Bayes-kl Analysis},\nauthor={Jiechao Guan and Zhiwu Lu and Yong Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=XgS9YPYtdj}\n}", "github": "", "project": "", "reviewers": "ttmo;si6X;zEbh;3HJF", "site": "https://openreview.net/forum?id=XgS9YPYtdj", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;4;4;2", "correctness": "3;4;4;3", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "194;33;134;130", "wc_summary_review": "128;35;53;81", "wc_main_review": "1999;451;705;291", "wc_review": "2321;519;892;502", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "422;302;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;0;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 122.75, 57.68611184678683 ], "wc_summary_review_avg": [ 74.25, 35.0954056822257 ], "wc_main_review_avg": [ 861.5, 673.1231313808789 ], "wc_review_avg": [ 1058.5, 745.382619867139 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 181.0, 185.9058901702687 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2369754078139255308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "A Unified Contrastive Energy-based Model for Understanding the Generative Ability of Adversarial Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6954", "id": "XhF2VOMRHS", "poster": "", "openreview": "https://openreview.net/forum?id=XhF2VOMRHS", "slides": "https://iclr.cc/virtual/2022/poster/6954", "video": "https://iclr.cc/virtual/2022/poster/6954", "author_site": "Yifei Wang, Yisen Wang, Jiansheng Yang, Zhouchen Lin", "tldr": "", "abstract": "Adversarial Training (AT) is known as an effective approach to enhance the robustness of deep neural networks. 
Recently, researchers have noticed that robust models trained with AT have good generative ability and can synthesize realistic images, while the reason behind this is still under-explored. In this paper, we demystify this phenomenon by developing a unified probabilistic framework, called Contrastive Energy-based Models (CEM). On the one hand, we provide the first probabilistic characterization of AT through a unified understanding of robustness and generative ability. On the other hand, our unified framework can be extended to the unsupervised scenario, which interprets unsupervised contrastive learning as an importance sampling of CEM. Based on these insights, we propose principled adversarial learning and sampling methods. Experiments show that the sampling methods derived from our framework improve the sample quality in both supervised and unsupervised learning. Notably, our unsupervised adversarial sampling method achieves an Inception score of 9.61 on CIFAR-10, which is superior to previous energy-based models and comparable to state-of-the-art generative models.", "keywords": "Generative Models;Energy-based Models;Sampling;Adversarial Training", "primary_area": "", "supplementary_material": "", "author": "Yifei Wang;Yisen Wang;Jiansheng Yang;Zhouchen Lin", "authorids": "~Yifei_Wang1;~Yisen_Wang1;yjs@math.pku.edu.cn;~Zhouchen_Lin1", "gender": "M;M;;M", "homepage": "https://yifeiwang77.com;https://yisenwang.github.io/;;https://zhouchenlin.github.io", "dblp": "00/555-1;172/1346-1;;l/ZhouchenLin", "google_scholar": "-CLy6YsAAAAJ;uMWPDboAAAAJ;;https://scholar.google.com.tw/citations?user=TanjFwoAAAAJ", "orcid": ";;;0000-0003-1493-7569", "linkedin": ";;;", "or_profile": "~Yifei_Wang1;~Yisen_Wang1;yjs@math.pku.edu.cn;~Zhouchen_Lin1", "aff": "Peking University;Peking University;;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;;pku.edu.cn", "position": "PhD student;Assistant Professor;;Professor", "bibtex": "@inproceedings{\nwang2022a,\ntitle={A Unified Contrastive Energy-based Model for Understanding the Generative Ability of Adversarial Training},\nauthor={Yifei Wang and Yisen Wang and Jiansheng Yang and Zhouchen Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XhF2VOMRHS}\n}", "github": "", "project": "", "reviewers": "rJdV;MiyJ;9aEk;Pwz6", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;2;4;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;4;4", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "59;88;63;179", "wc_summary_review": "31;72;19;34", "wc_main_review": "119;107;395;123", "wc_review": "209;267;477;336", "wc_reply_reviewers": "0;0;46;0", "wc_reply_authors": "937;689;1039;120", "reply_reviewers": "0;0;1;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 97.25, 48.48904515455011 ], "wc_summary_review_avg": [ 39.0, 19.862024066041204 ], "wc_main_review_avg": [ 186.0, 120.80976781701055 ], "wc_review_avg": [ 322.25, 100.01843580060628 ], "wc_reply_reviewers_avg": [ 11.5, 19.91858428704209 ], "wc_reply_authors_avg": [ 696.25, 356.21438418458064 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ],
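As a hedged illustration of the sampling side of the CEM view above: a classifier f with logits can be read as an energy-based model with E(x) = -logsumexp(f(x)) and sampled with Langevin dynamics. This is a minimal sketch under that standard reading; the function name, step sizes, and noise scale are illustrative assumptions, not the paper's exact adversarial sampler.

```python
import torch

def langevin_sample(f, x, n_steps=20, step_size=0.01, noise_scale=0.01):
    """Langevin sampling from the energy E(x) = -logsumexp(f(x)), the standard
    way to treat a classifier f as an energy-based model. All settings here
    are assumptions for illustration, not the paper's configuration."""
    for _ in range(n_steps):
        x = x.detach().requires_grad_(True)
        energy = -torch.logsumexp(f(x), dim=1).sum()  # total energy of the batch
        grad = torch.autograd.grad(energy, x)[0]      # dE/dx
        x = x - step_size * grad + noise_scale * torch.randn_like(x)
    return x.detach()
```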
"corr_recommendation_confidence": 0.5443310539518174, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9237224064252747933&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=XhF2VOMRHS", "email": "pku.edu.cn;pku.edu.cn;;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "XhMa8XPHxpw", "title": "Low-Precision Stochastic Gradient Langevin Dynamics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Low-precision optimization is widely used to accelerate large-scale deep learning. Despite providing better uncertainty estimation and generalization, sampling methods remain mostly unexplored in this space. In this paper, we provide the first study of low-precision Stochastic Gradient Langevin Dynamics (SGLD), arguing that it is particularly suited to low-bit arithmetic due to its intrinsic ability to handle system noise. We prove the convergence of low-precision SGLD on strongly log-concave distributions, showing that with full-precision gradient accumulators, SGLD is more robust to quantization error than SGD; however, with low-precision gradient accumulators, SGLD can diverge arbitrarily far from the target distribution with small stepsizes. To remedy this issue, we develop a new quantization function that preserves the correct variance in each update step. We demonstrate that the resulting low-precision SGLD algorithm is comparable to full-precision SGLD and outperforms low-precision SGD on deep learning tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruqi Zhang;Andrew Gordon Wilson;Christopher De Sa", "authorids": "~Ruqi_Zhang1;~Andrew_Gordon_Wilson1;~Christopher_De_Sa2", "gender": "F;Not Specified;M", "homepage": "https://ruqizhang.github.io/;https://cims.nyu.edu/~andrewgw;http://cs.cornell.edu/~cdesa", "dblp": ";65/10453;154/6336", "google_scholar": "4ojpmc8AAAAJ;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ruqi_Zhang1;~Andrew_Gordon_Wilson1;~Christopher_De_Sa1", "aff": "Purdue University;New York University;Cornell University", "aff_domain": "purdue.edu;nyu.edu;cornell.edu", "position": "Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@misc{\nzhang2022lowprecision,\ntitle={Low-Precision Stochastic Gradient Langevin Dynamics},\nauthor={Ruqi Zhang and Andrew Gordon Wilson and Christopher De Sa},\nyear={2022},\nurl={https://openreview.net/forum?id=XhMa8XPHxpw}\n}", "github": "", "project": "", "reviewers": "dB5g;yqrt;gLYz;bvZD", "site": "https://openreview.net/forum?id=XhMa8XPHxpw", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "58;53;42;52", "wc_summary_review": "72;33;52;15", "wc_main_review": "173;211;427;266", "wc_review": "303;297;521;333", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "606;495;526;210", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], 
"technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 51.25, 5.80409338312195 ], "wc_summary_review_avg": [ 43.0, 21.24852936087578 ], "wc_main_review_avg": [ 269.25, 96.89265968070028 ], "wc_review_avg": [ 363.5, 91.94971451831702 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 459.25, 149.4947741561557 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5250731865302553140&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2", "aff_unique_norm": "Purdue University;New York University;Cornell University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.purdue.edu;https://www.nyu.edu;https://www.cornell.edu", "aff_unique_abbr": "Purdue;NYU;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "XizHAfgfd3J", "title": "Semantic-aware Representation Learning Via Probability Contrastive Loss", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent feature contrastive learning (FCL) has shown promising performance in unsupervised representation learning. For the close-set representation learning where labeled data and unlabeled data belong to the same semantic space, however, FCL cannot show overwhelming gains due to not involving the class semantics during optimization. Consequently, the produced features do not guarantee to be easily classified by the class weights learned from labeled data although they are information-rich. To tackle this issue, we propose a novel probability contrastive learning (PCL) in this paper, which not only produces rich features but also enforces them to be distributed around the class prototypes. Specifically, we propose to use the output probabilities after softmax to perform contrastive learning instead of the extracted features in FCL. Evidently, such a way can exploit the class semantics during optimization. Moreover, we propose to remove the L2 normalization in the traditional FCL and directly use the L1-normalized probability for contrastive learning. Our proposed PCL is simple and effective. We conduct extensive experiments on three close-set image classification tasks, \\textit{i.e.}, unsupervised domain adaptation, semi-supervised learning, and semi-supervised domain adaptation. 
The results on multiple datasets demonstrate that our PCL can consistently achieve considerable gains and state-of-the-art performance on all three tasks.", "keywords": "contrastive learning;semi-supervised learning;unsupervised domain adaptation;semi-supervised domain adaptation", "primary_area": "", "supplementary_material": "/attachment/11710ea66a592d34b6f9322d8f85e83dcb3a464a.zip", "author": "Junjie Li;Yixin Zhang;Zilei Wang;Keyu Tu", "authorids": "~Junjie_Li3;~Yixin_Zhang1;~Zilei_Wang1;~Keyu_Tu3", "gender": "M;M;M;", "homepage": ";http://home.ustc.edu.cn/~zhyx12/;;", "dblp": "83/5144-2.html;34/3337;49/1878;", "google_scholar": "https://scholar.google.com.hk/citations?user=3fXVH5oAAAAJ;F24AuKYAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;", "orcid": ";0000-0002-4513-1106;;", "linkedin": ";;;", "or_profile": "~Junjie_Li3;~Yixin_Zhang1;~Zilei_Wang1;~Keyu_Tu3", "aff": "University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;", "aff_domain": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;", "position": "PhD student;Postdoc;Associate Professor;", "bibtex": "@misc{\nli2022semanticaware,\ntitle={Semantic-aware Representation Learning Via Probability Contrastive Loss},\nauthor={Junjie Li and Yixin Zhang and Zilei Wang and Keyu Tu},\nyear={2022},\nurl={https://openreview.net/forum?id=XizHAfgfd3J}\n}", "github": "", "project": "", "reviewers": "SPTz;gmp5;Xyfv;pwSh;k1DX", "site": "https://openreview.net/forum?id=XizHAfgfd3J", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;4;4;4;4", "correctness": "2;3;3;3;4", "technical_novelty": "2;3;2;2;2", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "54;73;67;95;51", "wc_summary_review": "4;58;48;121;52", "wc_main_review": "159;607;183;329;332", "wc_review": "217;738;298;545;435", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 68.0, 15.748015748023622 ], "wc_summary_review_avg": [ 56.6, 37.446495163099044 ], "wc_main_review_avg": [ 322.0, 159.53933684204657 ], "wc_review_avg": [ 446.6, 184.13538497529476 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.790569415042095, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5872644314824100187&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn", "aff_unique_abbr": "USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "Xk1kE26xYS9", "title": "Learning Pessimism for Robust and Efficient Off-Policy Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Popular off-policy deep reinforcement learning algorithms compensate for overestimation bias during temporal-difference learning by utilizing pessimistic estimates of the expected target
returns. In this work, we propose a novel learnable penalty to enact such pessimism, based on a new way to quantify the critic's epistemic uncertainty. Furthermore, we propose to learn the penalty alongside the critic with dual TD-learning, a strategy to estimate and minimize the bias magnitude in the target returns. Our method enables us to accurately counteract overestimation bias throughout training without incurring the downsides of overly pessimistic targets. Empirically, by integrating our method and other orthogonal improvements with popular off-policy algorithms, we achieve state-of-the-art results in continuous control tasks from both proprioceptive and pixel observations. ", "keywords": "Reinforcement learning;Off-policy learning;Continuous control;Machine learning", "primary_area": "", "supplementary_material": "/attachment/c24274488d4e72a685bab6b14f64c16f07969cca.zip", "author": "Edoardo Cetin;Oya Celiktutan", "authorids": "~Edoardo_Cetin1;~Oya_Celiktutan2", "gender": ";F", "homepage": "https://aladoro.github.io/;https://nms.kcl.ac.uk/oya.celiktutan/", "dblp": "287/4615;05/4947", "google_scholar": "https://scholar.google.it/citations?hl=en;https://scholar.google.co.uk/citations?user=CCCoMqcAAAAJ", "orcid": ";0000-0002-7213-6359", "linkedin": "edoardo-cetin-916b68195/;oya-celiktutan-5249104/?originalSubdomain=uk", "or_profile": "~Edoardo_Cetin1;~Oya_Celiktutan2", "aff": "Twitter;King's College London", "aff_domain": "twitter.com;kcl.ac.uk", "position": "Intern;Assistant Professor", "bibtex": "@misc{\ncetin2022learning,\ntitle={Learning Pessimism for Robust and Efficient Off-Policy Reinforcement Learning},\nauthor={Edoardo Cetin and Oya Celiktutan},\nyear={2022},\nurl={https://openreview.net/forum?id=Xk1kE26xYS9}\n}", "github": "", "project": "", "reviewers": "e6qv;Karr;WGQn;BFjL", "site": "https://openreview.net/forum?id=Xk1kE26xYS9", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "45;100;58;47", "wc_summary_review": "169;73;21;66", "wc_main_review": "629;250;181;304", "wc_review": "843;423;260;417", "wc_reply_reviewers": "0;0;16;0", "wc_reply_authors": "1813;716;1156;834", "reply_reviewers": "0;0;1;0", "reply_authors": "3;1;2;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.5, 22.20923231451281 ], "wc_summary_review_avg": [ 82.25, 53.91370419475924 ], "wc_main_review_avg": [ 341.0, 171.89677134838803 ], "wc_review_avg": [ 485.75, 216.36470946066967 ], "wc_reply_reviewers_avg": [ 4.0, 6.928203230275509 ], "wc_reply_authors_avg": [ 1129.75, 426.0800247606076 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.4714045207910316, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4663700050221147547&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Twitter, Inc.;King's College London", "aff_unique_dep": ";", "aff_unique_url": "https://twitter.com;https://www.kcl.ac.uk", "aff_unique_abbr": "Twitter;KCL", "aff_campus_unique_index": "", "aff_campus_unique": "", 
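One common way to enact a learnable pessimism penalty of the kind described in the Learning Pessimism abstract above is to subtract a scaled twin-critic disagreement from the TD target. The sketch below assumes that form; the paper's precise uncertainty measure and the dual TD-learning update for `beta` are not reproduced here, so treat this as a hedged illustration rather than the authors' method.

```python
import torch

def pessimistic_td_target(reward, discount, q1_next, q2_next, beta):
    """TD target penalized by critic disagreement, a common proxy for epistemic
    uncertainty. `beta` is the learnable pessimism weight; in the paper it is
    adapted during training rather than fixed by hand."""
    q_mean = 0.5 * (q1_next + q2_next)
    disagreement = (q1_next - q2_next).abs()
    return reward + discount * (q_mean - beta * disagreement)
```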
"aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "An Agnostic Approach to Federated Learning with Class Imbalance", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6104", "id": "Xo0lbDt975", "poster": "", "openreview": "https://openreview.net/forum?id=Xo0lbDt975", "slides": "https://iclr.cc/virtual/2022/poster/6104", "video": "https://iclr.cc/virtual/2022/poster/6104", "author_site": "Zebang Shen, Juan Cervino, Hamed Hassani, Alejandro Ribeiro", "tldr": "", "abstract": "Federated Learning (FL) has emerged as the tool of choice for training deep models over heterogeneous and decentralized datasets. \nAs a reflection of the experiences from different clients, severe class imbalance issues are observed in real-world FL problems.\nMoreover, there exists a drastic mismatch between the imbalances from the local and global perspectives, i.e. a local majority class can be the minority of the population. Additionally, the privacy requirement of FL poses an extra challenge, as one should handle class imbalance without identifying the minority class. In this paper we propose a novel agnostic constrained learning formulation to tackle the class imbalance problem in FL, without requiring further information beyond the standard FL objective. A meta algorithm, CLIMB, is designed to solve the target optimization problem, with its convergence property analyzed under certain oracle assumptions. Through an extensive empirical study over various data heterogeneity and class imbalance configurations, we showcase that CLIMB considerably improves the performance in the minority class without compromising the overall accuracy of the classifier, which significantly outperforms previous arts. \nIn fact, we observe the greatest performance boost in the most difficult scenario where every client only holds data from one class. 
The code can be found here https://github.com/shenzebang/Federated-Learning-Pytorch.", "keywords": "Federated Learning;Class Imbalance", "primary_area": "", "supplementary_material": "/attachment/f8b50e1828ed0dbda6b241f7e5894d032b025e04.zip", "author": "Zebang Shen;Juan Cervino;Hamed Hassani;Alejandro Ribeiro", "authorids": "~Zebang_Shen1;~Juan_Cervino1;~Hamed_Hassani2;~Alejandro_Ribeiro1", "gender": "M;M;M;M", "homepage": ";https://juancervino.github.io/;https://www.seas.upenn.edu/~hassani/;https://alelab.seas.upenn.edu", "dblp": "165/3377;;73/4984;32/15", "google_scholar": "klqzFvgAAAAJ;lbyYN_sAAAAJ;;7mrPM4kAAAAJ", "orcid": ";;;0000-0003-4230-9906", "linkedin": ";;;", "or_profile": "~Zebang_Shen1;~Juan_Cervino1;~Hamed_Hassani2;~Alejandro_Ribeiro1", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;upenn.edu;upenn.edu;upenn.edu", "position": "Postdoc;PhD student;;Full Professor", "bibtex": "@inproceedings{\nshen2022an,\ntitle={An Agnostic Approach to Federated Learning with Class Imbalance},\nauthor={Zebang Shen and Juan Cervino and Hamed Hassani and Alejandro Ribeiro},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Xo0lbDt975}\n}", "github": "", "project": "", "reviewers": "Mp3G;u6Lr;8nbc;obo5", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;4;3", "correctness": "3;3;2;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "127;58;51;39", "wc_summary_review": "50;44;50;51", "wc_main_review": "311;242;233;259", "wc_review": "488;344;334;349", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "311;556;832;692", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 34.310166131920724 ], "wc_summary_review_avg": [ 48.75, 2.7726341266023544 ], "wc_main_review_avg": [ 261.25, 30.202441954252638 ], "wc_review_avg": [ 378.75, 63.306299054675435 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 597.75, 192.17488779754757 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7867989990896103176&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=Xo0lbDt975", "email": "upenn.edu;upenn.edu;upenn.edu;upenn.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "XpmTU4k-5uf", "title": "TIME-LAPSE: Learning to say \u201cI don't know\u201d through spatio-temporal uncertainty scoring", "track": "main", "status": "Reject", "tldr": "", "abstract": "Safe deployment of trained ML models requires determining when input samples go out-of-distribution (OOD) and refraining from making uncertain predictions on them. 
Existing approaches inspect test samples in isolation to estimate their corresponding predictive uncertainty. However, in the real world, deployed models typically see test inputs consecutively and predict labels continuously over time during inference. In this work, we propose TIME-LAPSE, a spatio-temporal framework for uncertainty scoring that examines the sequence of predictions prior to the current sample to determine its predictive uncertainty. Our key insight is that in-distribution samples will be more \u201csimilar\u201d to each other compared to OOD samples, not just over the encoding latent-space but also across time. Specifically, (a) our spatial uncertainty score estimates how different OOD latent-space representations are from those of an in-distribution set using metrics such as Mahalanobis distance and cosine similarity and (b) our temporal uncertainty score determines deviations in correlations over time using representations of past inputs in a non-parametric, sliding-window based algorithm. We evaluate TIME-LAPSE on both audio and vision tasks using public datasets and further benchmark our approach on a challenging, real-world, electroencephalogram (EEG) dataset for seizure detection. We achieve state-of-the-art results for OOD detection in the audio and EEG domain and observe considerable gains in semantically corrected vision benchmarks. We show that TIME-LAPSE is more driven by semantic content compared to other methods, i.e., it is more robust to dataset statistics. We also propose a sequential OOD detection evaluation framework to emulate real-life drift settings and show that TIME-LAPSE outperforms spatial methods significantly. ", "keywords": "out-of-distribution detection;OOD detection;spatio-temporal;latent-space;sequential;outlier;anomaly", "primary_area": "", "supplementary_material": "", "author": "Nandita Bhaskhar;Daniel Rubin;Christopher Lee-Messer", "authorids": "~Nandita_Bhaskhar1;~Daniel_Rubin1;~Christopher_Lee-Messer1", "gender": "F;;M", "homepage": "https://web.stanford.edu/~nanbhas/;http://rubin.web.stanford.edu;", "dblp": ";;", "google_scholar": "https://scholar.google.com/scholar?hl=en;;zEAw56MAAAAJ", "orcid": ";;0000-0002-2938-6184", "linkedin": "nanditabhaskhar/;;", "or_profile": "~Nandita_Bhaskhar1;~Daniel_Rubin1;~Christopher_Lee-Messer1", "aff": "Stanford University;Stanford University;", "aff_domain": "stanford.edu;stanford.edu;", "position": "PhD student;Full Professor;", "bibtex": "@misc{\nbhaskhar2022timelapse,\ntitle={{TIME}-{LAPSE}: Learning to say {\textquotedblleft}I don't know{\textquotedblright} through spatio-temporal uncertainty scoring},\nauthor={Nandita Bhaskhar and Daniel Rubin and Christopher Lee-Messer},\nyear={2022},\nurl={https://openreview.net/forum?id=XpmTU4k-5uf}\n}", "github": "", "project": "", "reviewers": "CrvH;qgxp;o7sa;54Yv", "site": "https://openreview.net/forum?id=XpmTU4k-5uf", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;3", "correctness": "2;1;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "30;65;106;63", "wc_summary_review": "84;150;43;57", "wc_main_review": "575;320;314;359", "wc_review": "689;535;463;479", "wc_reply_reviewers": "0;55;0;0", "wc_reply_authors": "1093;1355;1087;1405", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg":
[ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.0, 26.953663943887108 ], "wc_summary_review_avg": [ 83.5, 41.12481002995637 ], "wc_main_review_avg": [ 392.0, 107.05839528033287 ], "wc_review_avg": [ 541.5, 89.25665241313949 ], "wc_reply_reviewers_avg": [ 13.75, 23.81569860407206 ], "wc_reply_authors_avg": [ 1235.0, 146.08901396066713 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8717982471505739156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Xr6-DAhePa", "title": "Understanding Self-supervised Learning via Information Bottleneck Principle", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Self-supervised learning alleviates the massive demands for annotations in deep learning, and recent advances are mainly dominated by contrastive learning. Existing contrastive learning methods narrow the distance between positive pairs while neglecting the redundancy shared by positive pairs. To address this issue, we introduce the information bottleneck principle and propose the Self-supervised Variational Information Bottleneck (SVIB) learning framework. Specifically, we apply a Gaussian Mixture Model (GMM) as the parametric approximate posterior distribution to the real feature distribution and introduce a categorical latent variable. Features from different augmentations are forced to infer each other and the latent variable together. Then, \nwe propose a variational information bottleneck as our objective, which is composed of two parts. The first is maximizing the mutual information between the inferred feature and the latent variable. The second is minimizing the mutual information between the other feature and the latent variable. Compared to previous works, SVIB provides the self-supervised learning field with a novel perspective from the variational information bottleneck, while also highlighting a long-neglected issue.
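Read literally, the two-part SVIB objective above can be transcribed as follows. This is our hedged reading, with $z_1$ and $z_2$ the features of the two augmentations, $c$ the categorical latent of the GMM posterior, and $\beta$ an assumed trade-off weight, since the abstract gives no coefficient:

```latex
% One plausible rendering of the stated objective, not the paper's exact form:
\max_{\theta} \; I(z_1; c) \;-\; \beta \, I(z_2; c)
```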
Experiments show that SVIB outperforms current SOTA methods on multiple benchmarks.", "keywords": "contrastive learning;unsupervised learning;variational information bottleneck", "primary_area": "", "supplementary_material": "", "author": "Jin Li;Yaoming Wang;Dongsheng Jiang;XIAOPENG ZHANG;Wenrui Dai;Hongkai Xiong", "authorids": "~Jin_Li10;~Yaoming_Wang1;~Dongsheng_Jiang1;~XIAOPENG_ZHANG7;~Wenrui_Dai1;~Hongkai_Xiong1", "gender": ";;M;M;;M", "homepage": ";;https://sites.google.com/site/dongshengjiangbme/;https://sites.google.com/site/zxphistory/;;http://min.sjtu.edu.cn", "dblp": ";;85/8729;;16/5135.html;21/3569", "google_scholar": ";;-eGIgsoAAAAJ;Ud6aBAcAAAAJ;Xg8MhyAAAAAJ;bB16iN4AAAAJ", "orcid": ";;;;;0000-0003-4552-0029", "linkedin": ";;;;;", "or_profile": "~Jin_Li10;~Yaoming_Wang1;~Dongsheng_Jiang1;~XIAOPENG_ZHANG7;~Wenrui_Dai1;~Hongkai_Xiong1", "aff": ";;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": ";;huawei.com;huawei.com;sjtu.edu.cn;sjtu.edu.cn", "position": ";;Principal Researcher;Principal Researcher;Associate Professor;Full Professor", "bibtex": "@misc{\nli2022understanding,\ntitle={Understanding Self-supervised Learning via Information Bottleneck Principle},\nauthor={Jin Li and Yaoming Wang and Dongsheng Jiang and XIAOPENG ZHANG and Wenrui Dai and Hongkai Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=Xr6-DAhePa}\n}", "github": "", "project": "", "reviewers": "3Pez;3ETK;G5sW;sKSY;h6ND", "site": "https://openreview.net/forum?id=Xr6-DAhePa", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "5;3;2;4;3", "correctness": "2;2;2;2;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "1;2;4;2;3", "wc_summary_paper": "148;61;42;150;45", "wc_summary_review": "41;38;49;44;49", "wc_main_review": "212;333;607;471;354", "wc_review": "401;432;698;665;448", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.4, 1.019803902718557 ], "correctness_avg": [ 2.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 1.019803902718557 ], "wc_summary_paper_avg": [ 89.2, 49.2560656163279 ], "wc_summary_review_avg": [ 44.2, 4.354308211415448 ], "wc_main_review_avg": [ 395.4, 133.9650700742548 ], "wc_review_avg": [ 528.8, 126.02444207375012 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4576043153224293, "corr_recommendation_correctness": 0.6666666666666666, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17069205388918731590&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Huawei;Shanghai Jiao Tong University", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "Huawei;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "XuS18b_H0DW", "title": "Tactics on Refining Decision Boundary for Improving Certification-based Robust Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "In verification-based robust training, existing methods utilize relaxation-based techniques to
bound the worst-case performance of neural networks under a given perturbation. However, these certification-based methods treat all the examples equally regardless of their vulnerability and true adversarial distribution, limiting the model's potential in achieving optimal verifiable accuracy. In this paper, we propose new methods that introduce a customized weight distribution over examples and automatic tuning of the perturbation schedule. These methods are generally applicable to all verification-based robust training methods with almost no additional computational cost. Our results show improvement on MNIST with $\\epsilon = 0.3$ and on CIFAR with $\\epsilon = 8/255$ for both IBP and CROWN-IBP based methods. ", "keywords": "adversarial robustness;certifiable training;deep learning", "primary_area": "", "supplementary_material": "/attachment/cd55ecb24c215199ef33483dd2004bfc144a0e7e.zip", "author": "Wang Zhang;Lam M. Nguyen;Subhro Das;Pin-Yu Chen;Sijia Liu;Alexandre Megretski;Luca Daniel;Tsui-Wei Weng", "authorids": "~Wang_Zhang2;~Lam_M._Nguyen1;~Subhro_Das1;~Pin-Yu_Chen1;~Sijia_Liu1;~Alexandre_Megretski1;~Luca_Daniel1;~Tsui-Wei_Weng1", "gender": ";;;M;M;M;;F", "homepage": ";;;http://www.pinyuchen.com;https://lsjxjtu.github.io/;http://www.mit.edu/~ameg/;https://www.mit.edu/~dluca/;https://lilywenglab.github.io", "dblp": ";;;39/8969;128/6972-1;;35/5202;177/9197", "google_scholar": ";;;jxwlCUUAAAAJ;C7dO_UgAAAAJ;;;v8GM4xoAAAAJ", "orcid": ";;;0000-0003-1039-8369;;;0000-0002-5880-3151;", "linkedin": ";;;pin-yu-chen-940062a2;;;;", "or_profile": "~Wang_Zhang2;~Lam_M._Nguyen1;~Subhro_Das1;~Pin-Yu_Chen1;~Sijia_Liu1;~Alexandre_Megretski1;~Luca_Daniel1;~Tsui-Wei_Weng1", "aff": ";;;International Business Machines;Michigan State University;Massachusetts Institute of Technology;;University of California, San Diego", "aff_domain": ";;;ibm.com;msu.edu;mit.edu;;ucsd.edu", "position": ";;;Research Staff Member;Assistant Professor;Full Professor;;Assistant Professor", "bibtex": "@misc{\nzhang2022tactics,\ntitle={Tactics on Refining Decision Boundary for Improving Certification-based Robust Training},\nauthor={Wang Zhang and Lam M.
Nguyen and Subhro Das and Pin-Yu Chen and Sijia Liu and Alexandre Megretski and Luca Daniel and Tsui-Wei Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=XuS18b_H0DW}\n}", "github": "", "project": "", "reviewers": "p9xi;2goi;jhkj;onQv", "site": "https://openreview.net/forum?id=XuS18b_H0DW", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;5;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "61;38;43;58", "wc_summary_review": "36;36;92;60", "wc_main_review": "305;611;519;281", "wc_review": "402;685;654;399", "wc_reply_reviewers": "470;209;506;148", "wc_reply_authors": "1487;1428;1445;1043", "reply_reviewers": "2;1;3;1", "reply_authors": "5;4;5;4", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 50.0, 9.72111104761179 ], "wc_summary_review_avg": [ 56.0, 22.978250586152114 ], "wc_main_review_avg": [ 429.0, 140.09282636880448 ], "wc_review_avg": [ 535.0, 134.9499907373098 ], "wc_reply_reviewers_avg": [ 333.25, 156.76315734253376 ], "wc_reply_authors_avg": [ 1350.75, 178.9725886832953 ], "reply_reviewers_avg": [ 1.75, 0.82915619758885 ], "reply_authors_avg": [ 4.5, 0.5 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.5547001962252291, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BiuEKawOBp0J:scholar.google.com/&scioq=Tactics+on+Refining+Decision+Boundary+for+Improving+Certification-based+Robust+Training&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "International Business Machines Corporation;Michigan State University;Massachusetts Institute of Technology;University of California, San Diego", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ibm.com;https://www.msu.edu;https://web.mit.edu;https://www.ucsd.edu", "aff_unique_abbr": "IBM;MSU;MIT;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "XuxAEYYGhV-", "title": "Improving Out-of-Distribution Robustness of Classifiers Through Interpolated Generative Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Out-of-distribution (OoD) generalization is one of the major challenges for deploying machine learning systems in the real world. Learning representations that disentangle the underlying structure of data is of key importance for improving OoD generalization. Recent works suggest the properties of disentangled representations in the latent space of GAN models. In this work, we investigate when and how GAN models can be used to improve OoD robustness in classifiers. Generative models are expected to be able to generate realistic images and increase the diversity of the training set to improve the model's ability to generalize. However, training the conventional GAN models for data augmentation preserves the correlations in the training data. This hampers training a robust classifier against distribution shifts since spurious correlations from the biased training data are unrelated to the causal features of interest. Besides, training GAN models directly on multiple source domains is fallible and suffers from mode collapse.
In this paper, we employ interpolated generative models to generate OoD samples at training time via data augmentation. Specifically, we use the StyleGAN2 model as the source of generative augmentation, which is pre-trained on one source training domain. We then fine-tune it on other source domains with frozen lower layers of the discriminator. Then, we apply linear interpolation in the parameter space of the multiple correlated networks on multiple source domains and control the augmentation at training time via the interpolation coefficients. A style-mixing mechanism is further introduced to improve the diversity of the generated OoD samples. Our experiments show that our proposed framework explicitly increases the diversity of training domains and achieves consistent improvements over baselines on both synthesized MNIST and many real-world OoD datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyue Bai;Ceyuan Yang;Yinghao Xu;S.-H. Gary Chan;Bolei Zhou", "authorids": "~Haoyue_Bai1;~Ceyuan_Yang2;~Yinghao_Xu1;~S.-H._Gary_Chan2;~Bolei_Zhou5", "gender": "F;M;M;M;M", "homepage": "https://haoyuebaizju.github.io/;https://ceyuan.me/;https://justimyhxu.github.io/;https://boleizhou.github.io/;https://home.cse.ust.hk/~gchan/", "dblp": "150/3371.html;218/2676;232/2482;46/8066;c/ShuengHanGaryChan", "google_scholar": "https://scholar.google.com/citations?view_op=search_authors;Rfj4jWoAAAAJ;https://scholar.google.com/citations?hl=en;9D4aG8AAAAAJ;https://scholar.google.com.tw/citations?user=uiCSOycAAAAJ", "orcid": "0000-0001-8139-0431;;;;0000-0003-4207-764X", "linkedin": "haoyue-bai-a2234a257/;;;;", "or_profile": "~Haoyue_Bai1;~Ceyuan_Yang2;~Yinghao_Xu1;~Bolei_Zhou5;~S.-H._Chan1", "aff": "Hong Kong University of Science and Technology;The Chinese University of Hong Kong;Chinese University of Hong Kong;University of California, Los Angeles;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;cuhk.edu.hk;ie.cuhk.edu.hk;ucla.edu;ust.hk", "position": "MS student;PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nbai2022improving,\ntitle={Improving Out-of-Distribution Robustness of Classifiers Through Interpolated Generative Models},\nauthor={Haoyue Bai and Ceyuan Yang and Yinghao Xu and S.-H.
Gary Chan and Bolei Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=XuxAEYYGhV-}\n}", "github": "", "project": "", "reviewers": "L1Tr;9B1Q;M9Aq;F3k1", "site": "https://openreview.net/forum?id=XuxAEYYGhV-", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "4;2;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "188;127;195;64", "wc_summary_review": "24;27;160;40", "wc_main_review": "231;459;1159;234", "wc_review": "443;613;1514;338", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 143.5, 52.974050251042726 ], "wc_summary_review_avg": [ 62.75, 56.46846465063487 ], "wc_main_review_avg": [ 520.75, 379.91997512634157 ], "wc_review_avg": [ 727.0, 464.8499757986441 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GBkB6L4JmEoJ:scholar.google.com/&scioq=Improving+Out-of-Distribution+Robustness+of+Classifiers+Through+Interpolated+Generative+Models&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Chinese University of Hong Kong;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ust.hk;https://www.cuhk.edu.hk;https://www.ucla.edu", "aff_unique_abbr": "HKUST;CUHK;UCLA", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Hong Kong SAR;Los Angeles", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "XwOnGWENp62", "title": "RitzNet: A Deep Neural Network Method for Linear Stress Problems", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning-based methods for physics-related computation have attracted significant attention recently. Effort has been devoted to learning a surrogate model which simulates system behavior from existing data. This paper presents RitzNet, an unsupervised learning method which takes any point in the computation domain as input, and learns a neural network model to output its corresponding function value satisfying the underlying governing PDEs. We focus on the linear elastic boundary value problem and formulate it as the natural minimization of its associated energy functional, whose discrete version is further utilized as the loss function of RitzNet. A standard fully connected deep neural network structure is explored in this study to model the solutions of a system of elliptic PDEs. Numerical studies on problems with analytical solutions or unknown solutions show that the proposed RitzNet is capable of approximating linear elasticity problems accurately. A parametric sensitivity study sheds light on the potential of RitzNet due to its meshless characteristics.
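The energy-minimization loss at the heart of RitzNet is easy to sketch. The following minimal example is a hypothetical 1D scalar analogue rather than the paper's linear elasticity energy (which has the same minimize-a-functional structure); it uses Monte Carlo quadrature over random collocation points and omits boundary-condition handling, and `model` and `f` are assumed names.

```python
import torch

def ritz_energy_loss(model, f, n_samples=1024):
    """Monte Carlo estimate of J(u) = int_0^1 ( 0.5*|u'(x)|^2 - f(x)*u(x) ) dx,
    whose minimizer solves the associated elliptic problem. RitzNet minimizes
    the linear elasticity energy instead; the structure is identical."""
    x = torch.rand(n_samples, 1, requires_grad=True)  # random collocation points in (0, 1)
    u = model(x)                                       # network approximation of u(x)
    du = torch.autograd.grad(u.sum(), x, create_graph=True)[0]  # u'(x) via autograd
    return (0.5 * du.pow(2) - f(x) * u).mean()         # mean approximates the integral
```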
", "keywords": "Linear Elasticity;Deep Neural Network;Ritz Method;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Min Liu;Zhiqiang Cai;Karthik Ramani", "authorids": "~Min_Liu1;~Zhiqiang_Cai1;~Karthik_Ramani1", "gender": "F;;M", "homepage": ";;https://engineering.purdue.edu/~ramani/", "dblp": "99/76;;01/6965.html", "google_scholar": ";;wQ6njfUAAAAJ", "orcid": ";;0000-0001-8639-5135", "linkedin": ";;karthikramani1/", "or_profile": "~Min_Liu1;~Zhiqiang_Cai1;~Karthik_Ramani1", "aff": "Purdue University;Purdue University;Purdue University", "aff_domain": "purdue.edu;ecn.purdue.edu;purdue.edu", "position": "Researcher;;Full Professor", "bibtex": "@misc{\nliu2022ritznet,\ntitle={RitzNet: A Deep Neural Network Method for Linear Stress Problems},\nauthor={Min Liu and Zhiqiang Cai and Karthik Ramani},\nyear={2022},\nurl={https://openreview.net/forum?id=XwOnGWENp62}\n}", "github": "", "project": "", "reviewers": "ZpZT;tAtn;tnoP;6drr", "site": "https://openreview.net/forum?id=XwOnGWENp62", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "5;4;3;4", "correctness": "2;3;3;4", "technical_novelty": "1;1;2;2", "empirical_novelty": "0;1;2;1", "wc_summary_paper": "54;41;152;47", "wc_summary_review": "23;23;22;84", "wc_main_review": "278;143;174;263", "wc_review": "355;207;348;394", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 73.5, 45.554911919572405 ], "wc_summary_review_avg": [ 38.0, 26.56124997058685 ], "wc_main_review_avg": [ 214.5, 57.308376351105956 ], "wc_review_avg": [ 326.0, 70.90486584149215 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:t6hX9ItD02oJ:scholar.google.com/&scioq=RitzNet:+A+Deep+Neural+Network+Method+for+Linear+Stress+Problems&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Xx4MNjSmQQ9", "title": "Robust Generalization of Quadratic Neural Networks via Function Identification", "track": "main", "status": "Reject", "tldr": "", "abstract": "A key challenge facing deep learning is that neural networks are often not robust to shifts in the underlying data distribution. We study this problem from the perspective of the statistical concept of parameter identification. Generalization bounds from learning theory often assume that the test distribution is close to the training distribution. In contrast, if we can identify the ``true'' parameters, then the model generalizes to arbitrary distribution shifts. However, neural networks are typically overparameterized, making parameter identification impossible. 
We show that for quadratic neural networks, we can identify the function represented by the model even though we cannot identify its parameters. Thus, we can obtain robust generalization bounds even in the overparameterized setting. We leverage this result to obtain new bounds for contextual bandits and transfer learning with quadratic neural networks. Overall, our results suggest that we can improve robustness of neural networks by designing models that can represent the true data generating process. In practice, the true data generating process is often very complex; thus, we study how our framework might connect to neural module networks, which are designed to break down complex tasks into compositions of simpler ones. We prove robust generalization bounds when individual neural modules are identifiable.", "keywords": "neural network;function identification;robust generalization", "primary_area": "", "supplementary_material": "", "author": "Kan Xu;Hamsa Bastani;Osbert Bastani", "authorids": "~Kan_Xu2;~Hamsa_Bastani1;~Osbert_Bastani1", "gender": "M;M;F", "homepage": "https://kanxu526.github.io/;http://obastani.github.io;https://hamsabastani.github.io/", "dblp": ";21/11275;199/1777", "google_scholar": "SaEf5CUAAAAJ;cxYepGkAAAAJ;ZbUfUMoAAAAJ", "orcid": "0000-0001-8738-6564;;", "linkedin": "kan-xu-8170b953/;;", "or_profile": "~Kan_Xu2;~Osbert_Bastani1;~Hamsa_Sridhar_Bastani1", "aff": "University of Pennsylvania;University of Pennsylvania;Macro-Eyes", "aff_domain": "upenn.edu;upenn.edu;macro-eyes.com", "position": "PhD student;Assistant Professor;Researcher", "bibtex": "@misc{\nxu2022robust,\ntitle={Robust Generalization of Quadratic Neural Networks via Function Identification},\nauthor={Kan Xu and Hamsa Bastani and Osbert Bastani},\nyear={2022},\nurl={https://openreview.net/forum?id=Xx4MNjSmQQ9}\n}", "github": "", "project": "", "reviewers": "QEKW;LNcd;hpBY", "site": "https://openreview.net/forum?id=Xx4MNjSmQQ9", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;4", "correctness": "3;4;4", "technical_novelty": "2;3;4", "empirical_novelty": "1;3;0", "wc_summary_paper": "85;142;137", "wc_summary_review": "54;119;54", "wc_main_review": "267;393;157", "wc_review": "406;654;348", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "362;140;240", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 121.33333333333333, 25.77250904010361 ], "wc_summary_review_avg": [ 75.66666666666667, 30.64129385141706 ], "wc_main_review_avg": [ 272.3333333333333, 96.4203759009935 ], "wc_review_avg": [ 469.3333333333333, 132.7085361065955 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 247.33333333333334, 90.77934175179334 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12381499498827458127&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Pennsylvania;Macro-Eyes", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.upenn.edu;https://www.macro-eyes.com", "aff_unique_abbr": "UPenn;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "XyVXPuuO_P", "title": "Meta-Learning an Inference Algorithm for Probabilistic Programs", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a meta-algorithm for learning a posterior-inference algorithm for restricted probabilistic programs. Our meta-algorithm takes a training set of probabilistic programs that describe models with observations, and attempts to learn an efficient method for inferring the posterior of a similar program.\nA key feature of our approach is the use of what we call a white-box inference algorithm that extracts information directly from model descriptions themselves, given as programs. Concretely, our white-box inference algorithm is equipped with multiple neural networks, one for each type of atomic command, and computes an approximate posterior of a given probabilistic program by analysing individual atomic commands in the program using these networks. The parameters of these networks are then learnt from a training set by our meta-algorithm.\nWe empirically demonstrate that the learnt inference algorithm generalises well to programs that are new in terms of both parameters and structures, and report cases where our approach\nhas advantages over alternative approaches such as HMC in terms of test-time efficiency.\nThe overall results show the promise as well as remaining challenges of our approach.", "keywords": "Probabilistic Programming;Approximate Posterior Inference;Meta Learning", "primary_area": "", "supplementary_material": "", "author": "Gwonsoo Che;Hongseok Yang", "authorids": "~Gwonsoo_Che1;~Hongseok_Yang2", "gender": "M;M", "homepage": "https://sites.google.com/view/gwonsoo-che;https://sites.google.com/view/hongseokyang/home", "dblp": ";82/5808", "google_scholar": ";cLuwH14AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Gwonsoo_Che1;~Hongseok_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Institute for Basic Science", "aff_domain": "kaist.ac.kr;ibs.re.kr", "position": "PhD student;Visiting Research Fellow", "bibtex": "@misc{\nche2022metalearning,\ntitle={Meta-Learning an Inference Algorithm for Probabilistic Programs},\nauthor={Gwonsoo Che and Hongseok Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=XyVXPuuO_P}\n}", "github": "", "project": "", "reviewers": "BQrG;mFVo;mmqX;VGsf", "site": "https://openreview.net/forum?id=XyVXPuuO_P", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;3;2", "correctness": "2;2;3;3", "technical_novelty": "1;3;3;3", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "87;53;74;73", "wc_summary_review": "56;65;72;36", "wc_main_review": "378;330;355;71", "wc_review": "521;448;501;180", "wc_reply_reviewers": "133;0;0;0", "wc_reply_authors": "366;163;608;0", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 71.75, 12.152674602736633 ], "wc_summary_review_avg": [ 57.25, 13.516193990913271 ], "wc_main_review_avg": [ 283.5, 123.85576288570508 ], "wc_review_avg": [ 412.5, 136.85850357212007 ], "wc_reply_reviewers_avg": [ 33.25, 57.59068935166517 ], "wc_reply_authors_avg": [ 
284.25, 227.48447749242143 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9467292624062574, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3142345858753882964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Institute for Basic Science", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.ibs.re.kr", "aff_unique_abbr": "KAIST;IBS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Coordination Among Neural Modules Through a Shared Global Workspace", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6382", "id": "XzTtHjgPDsT", "poster": "", "openreview": "https://openreview.net/forum?id=XzTtHjgPDsT", "slides": "https://iclr.cc/virtual/2022/poster/6382", "video": "https://iclr.cc/virtual/2022/poster/6382", "author_site": "Anirudh Goyal, Aniket Didolkar, Alex Lamb, Kartikeya Badola, Nan Rosemary Ke, Nasim Rahaman, Jonathan Binas, Charles Blundell, Michael Mozer, Yoshua Bengio", "tldr": "", "abstract": " Deep learning has seen a movement away from representing examples with a monolithic hidden state towards a richly structured state. For example, Transformers segment by position, and object-centric architectures decompose images into entities. In all these architectures, interactions between different elements are modeled via pairwise interactions: Transformers make use of self-attention to incorporate information from other positions and object-centric architectures make use of graph neural networks to model interactions among entities. We consider how to improve on pairwise interactions in terms of global coordination and a coherent, integrated representation that can be used for downstream tasks. In cognitive science, a global workspace architecture has been proposed in which functionally specialized components share information through a common, bandwidth-limited communication channel. We explore the use of such a communication channel in the context of deep learning for modeling the structure of complex environments. The proposed method includes a shared workspace through which communication among different specialist modules takes place but due to limits on the communication bandwidth, specialist modules must compete for access. 
We show that capacity limitations have a rational basis in that (1) they encourage specialization and compositionality and (2) they facilitate the synchronization of otherwise independent specialists.\n", "keywords": "slot based recurrent architectures;attention;transformers;latent bottleneck.", "primary_area": "", "supplementary_material": "/attachment/05665d6b26ef6ebbf7a57aa77ddae775cc08aafa.zip", "author": "Anirudh Goyal;Aniket Rajiv Didolkar;Alex Lamb;Kartikeya Badola;Nan Rosemary Ke;Nasim Rahaman;Jonathan Binas;Charles Blundell;Michael Curtis Mozer;Yoshua Bengio", "authorids": "~Anirudh_Goyal1;~Aniket_Rajiv_Didolkar1;~Alex_Lamb1;~Kartikeya_Badola1;~Nan_Rosemary_Ke1;~Nasim_Rahaman1;~Jonathan_Binas1;~Charles_Blundell1;~Michael_Curtis_Mozer1;~Yoshua_Bengio1", "gender": "M;M;M;F;M;;;M;M;M", "homepage": "https://anirudh9119.github.io/;https://github.com/dido1998/;https://kartikeya-badola.github.io/;https://nke001.github.io/;;;http://www.gatsby.ucl.ac.uk/~ucgtcbl/;https://www.cs.colorado.edu/~mozer;http://yoshuabengio.org;", "dblp": "172/1039;245/8589;;120/5291;222/3165;116/4760;35/8396;m/MichaelCMozer;56/953;", "google_scholar": "krrh6OUAAAAJ;https://scholar.google.ca/citations?user=ekvl5o0AAAAJ;1bXieIsAAAAJ;https://scholar.google.ca/citations?user=dxwPYhQAAAAJ;https://scholar.google.de/citations?user=iH9DuY0AAAAJ;https://scholar.google.ca/citations?user=oD1W8a4AAAAJ;https://scholar.google.co.uk/citations?user=f31mvPsAAAAJ;lmjR_qMAAAAJ;kukA0LcAAAAJ;https://scholar.google.ca/citations?user=BFzFy1YAAAAJ", "orcid": ";;0000-0002-2020-9173;;;;;;;", "linkedin": ";aniket-didolkar-7a9b8912a;;;https://de.linkedin.com/in/nasim-rahaman/de;;;;yoshuabengio/?originalSubdomain=ca;", "or_profile": "~Anirudh_Goyal1;~Aniket_Rajiv_Didolkar1;~Kartikeya_Badola1;~Nan_Rosemary_Ke1;~Nasim_Rahaman1;~Jonathan_Binas1;~Charles_Blundell1;~Michael_Curtis_Mozer1;~Yoshua_Bengio1;~Alex_Matthew_Lamb1", "aff": "University of Montreal;Manipal Institute of Technology;Indian Institute of Technology, Delhi;Mila;Max Planck Institute for Intelligent Systems, Max-Planck Institute;Montreal Institute for Learning Algorithms, University of Montreal;Google DeepMind;Google DeepMind;University of Montreal;Microsoft Research NYC", "aff_domain": "umontreal.ca;manipal.edu;iitd.ac.in;mila.quebec;tuebingen.mpg.de;mila.umontreal.ca;google.com;google.com;umontreal.ca;microsoft.com", "position": "PhD student;Undergrad student;Undergrad student;PhD student;PhD student;Postdoc;Research Scientist;Research Scientist;Full Professor;Researcher", "bibtex": "@inproceedings{\ngoyal2022coordination,\ntitle={Coordination Among Neural Modules Through a Shared Global Workspace},\nauthor={Anirudh Goyal and Aniket Rajiv Didolkar and Alex Lamb and Kartikeya Badola and Nan Rosemary Ke and Nasim Rahaman and Jonathan Binas and Charles Blundell and Michael Curtis Mozer and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=XzTtHjgPDsT}\n}", "github": "", "project": "", "reviewers": "pjX4;EPxc;2GPN;Cdd9", "pdf_size": 0, "recommendation": "6;6;8;10", "confidence": "4;3;3;3", "correctness": "4;2;4;4", "technical_novelty": "4;3;4;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "37;359;67;155", "wc_summary_review": "71;172;51;68", "wc_main_review": "452;830;276;89", "wc_review": "560;1361;394;312", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "405;721;590;38", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 1.6583123951777 ], 
"confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 154.5, 125.78056288632199 ], "wc_summary_review_avg": [ 90.5, 47.668123520860355 ], "wc_main_review_avg": [ 411.75, 273.47246205056916 ], "wc_review_avg": [ 656.75, 416.29879593868634 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 438.5, 257.04133908770393 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9755111905599234560&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=XzTtHjgPDsT", "email": "umontreal.ca;manipal.edu;iitd.ac.in;mila.quebec;tuebingen.mpg.de;mila.umontreal.ca;google.com;google.com;umontreal.ca;microsoft.com", "author_num": 10, "aff_unique_index": "0;1;2;3;4;0;5;5;0;6", "aff_unique_norm": "University of Montreal;Manipal Institute of Technology;Indian Institute of Technology Delhi;Mila;Max Planck Institute for Intelligent Systems;Google;Microsoft", "aff_unique_dep": ";;;Quebec Artificial Intelligence Institute;Intelligent Systems;Google DeepMind;Microsoft Research", "aff_unique_url": "https://wwwumontreal.ca;https://mit manipal.edu;https://www.iitdelhi.ac.in;https://mila.quebec;https://www.mpi-is.mpg.de;https://deepmind.com;https://www.microsoft.com/en-us/research/group/microsoft-research-new-york-city", "aff_unique_abbr": "UM;MIT Manipal;IIT Delhi;Mila;MPI-IS;DeepMind;MSR NYC", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Delhi;Montreal;New York City", "aff_country_unique_index": "0;1;1;0;2;0;3;3;0;4", "aff_country_unique": "Canada;India;Germany;United Kingdom;United States" }, { "id": "Y-8bEgodif", "title": "Learning Dense NeRF Correspondence Through Generative Structural Priors", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural radiance field (NeRF), a kind of 3D shape representation, has shown promising results over building geometry and textures from images. However, unlike mesh or signed distance function (SDF) based representation, it remains an open problem to build correspondences across radiance fields, limiting its application in many downstream tasks.\nAssumptions of prior arts on the availability of either correspondence annotations or 3D shapes as supervision signals do not apply to NeRF.\nThis paper shows that by leveraging rich structural priors encapsulated in a pretrained NeRF generative adversarial network (GAN), we can learn correspondence in a self-supervised manner without using any correspondence or 3D supervision.\nTo exploit the priors, we devise a novel Bijective Deformation Field (BDF), a way to establish a bijective shape deformation field for 3D radiance fields.\nOur experiments demonstrate that the GAN-derived priors are discriminative enough to guide the learning of \naccurate, smooth and robust 3D dense correspondence.\nWe also show that BDF can produce high-quality dense correspondences across different shapes belonging to the same object category.\nWe further demonstrate how the accurate correspondences facilitate downstream applications such as texture transfer, segmentation transfer, and deformation transfer. Code and models will be released. 
", "keywords": "Dense Correspondence;Generative Model;Neural Radiance Field", "primary_area": "", "supplementary_material": "", "author": "Yushi LAN;Chen Change Loy;Bo Dai", "authorids": "~Yushi_LAN1;~Chen_Change_Loy2;~Bo_Dai2", "gender": "M;M;M", "homepage": "https://nirvanalan.github.io/;https://www.mmlab-ntu.com/person/ccloy/index.html;http://daibo.info/", "dblp": "259/2752;01/5855;64/2903-2", "google_scholar": "dTNZCUcAAAAJ;https://scholar.google.co.uk/citations?user=559LF80AAAAJ;https://scholar.google.com.hk/citations?user=KNWTvgEAAAAJ", "orcid": ";0000-0001-5345-1591;0000-0003-0777-9232", "linkedin": ";;", "or_profile": "~Yushi_LAN1;~Chen_Change_Loy2;~Bo_Dai2", "aff": "Nanyang Technological University;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Full Professor;Research Assistant Professor", "bibtex": "@misc{\nlan2022learning,\ntitle={Learning Dense Ne{RF} Correspondence Through Generative Structural Priors},\nauthor={Yushi LAN and Chen Change Loy and Bo Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=Y-8bEgodif}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=Y-8bEgodif", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:T3abSpltvwEJ:scholar.google.com/&scioq=Learning+Dense+NeRF+Correspondence+Through+Generative+Structural+Priors&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "Y03EQLbqBjP", "title": "Contrastive Learning Through Time", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive learning has emerged as a powerful form of unsupervised representation learning for images. The utility of learned representations for downstream tasks depends strongly on the chosen augmentation operations. Taking inspiration from biology, we here study contrastive learning through time (CLTT), that works completely without any augmentation operations. Instead, positive pairs of images are generated from temporally close video frames during extended naturalistic interaction with objects. To this end, we develop a new data set using a near-photorealistic training environment based on ThreeDWorld (TDW). 
We propose a family of CLTT algorithms based on state-of-the-art contrastive learning methods and demonstrate that CLTT allows linear classification performance that approaches that of the fully supervised setting. We also consider temporal correlations resulting from one object being seen systematically before or after another object. We show that this leads to increased representational similarity between these objects, matching classic biological findings. We argue that this \"close in time, will align\" effect is generically useful for learning abstract representations. The data sets, code and pre-trained models for this paper can be downloaded at: (link will be added in the final version)", "keywords": "contrastive learning;object recognition;virtual environment;temporal coherence", "primary_area": "", "supplementary_material": "", "author": "Felix Schneider;Xia Xu;Markus Roland Ernst;Zhengyang Yu;Jochen Triesch", "authorids": "fschneider@fias.uni-frankfurt.de;xiaxu@fias.uni-frankfurt.de;mernst@fias.uni-frankfurt.de;zhyu@fias.uni-frankfurt.de;~Jochen_Triesch1", "gender": ";;;;M", "homepage": ";;;;http://fias.uni-frankfurt.de/~triesch/", "dblp": ";;;;24/2918", "google_scholar": ";;;;AgEdsugAAAAJ", "orcid": ";;;;0000-0001-8166-2441", "linkedin": ";;;;", "or_profile": "fschneider@fias.uni-frankfurt.de;xiaxu@fias.uni-frankfurt.de;mernst@fias.uni-frankfurt.de;zhyu@fias.uni-frankfurt.de;~Jochen_Triesch1", "aff": ";;;;Goethe University", "aff_domain": ";;;;uni-frankfurt.de", "position": ";;;;Full Professor", "bibtex": "@misc{\nschneider2022contrastive,\ntitle={Contrastive Learning Through Time},\nauthor={Felix Schneider and Xia Xu and Markus Roland Ernst and Zhengyang Yu and Jochen Triesch},\nyear={2022},\nurl={https://openreview.net/forum?id=Y03EQLbqBjP}\n}", "github": "", "project": "", "reviewers": "52HK;Dixg;HxEB;xWvq", "site": "https://openreview.net/forum?id=Y03EQLbqBjP", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "59;64;79;134", "wc_summary_review": "18;394;154;97", "wc_main_review": "199;559;642;808", "wc_review": "276;1017;875;1039", "wc_reply_reviewers": "0;0;252;0", "wc_reply_authors": "0;0;248;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;1;0", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 84.0, 29.790938219532464 ], "wc_summary_review_avg": [ 165.75, 140.35023156375624 ], "wc_main_review_avg": [ 552.0, 222.65107230821954 ], "wc_review_avg": [ 801.75, 309.99949596733217 ], "wc_reply_reviewers_avg": [ 63.0, 109.11920087683927 ], "wc_reply_authors_avg": [ 62.0, 107.38715006927039 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.16012815380508713, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16863776989093640775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Goethe University Frankfurt", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-frankfurt.de", "aff_unique_abbr": "GU", "aff_campus_unique_index": "0", "aff_campus_unique": "Frankfurt", 
"aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "id": "Y0cGpgUhSvp", "title": "Prioritized training on points that are learnable, worth learning, and not yet learned", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce reducible held-out loss selection (RHOLS), a technique for faster model training which selects a sequence of training points that are \u201cjust right\u201d. We propose a tractable information-theoretic acquisition function\u2014the reducible heldout loss\u2014to efficiently choose training points that maximize information about a holdout set. We show that the \u201chard\u201d (e.g. high loss) points usually selected in the optimization literature are typically noisy, leading to deterioration on real-world datasets. At the same time, \u201ceasy\u201d (e.g. low noise) samples, often prioritized for curriculum learning, confer less information. In contrast, RHOLS chooses points that are \u201cjust right\u201d and trains in fewer steps than the above approaches.", "keywords": "Data selection;subset selection;deep learning;active learning", "primary_area": "", "supplementary_material": "/attachment/ce715eb04c8cdc50e22b6a09f1ceee6f00786612.zip", "author": "S\u00f6ren Mindermann;Muhammed Razzak;Mrinank Sharma;Jan M. Brauner;Winnie Xu;Andreas Kirsch;Aidan Gomez;Benedikt H\u00f6ltgen;Sebastian Farquhar;Yarin Gal", "authorids": "~S\u00f6ren_Mindermann1;~Muhammed_Razzak1;~Mrinank_Sharma1;~Jan_M._Brauner1;~Winnie_Xu1;~Andreas_Kirsch1;~Aidan_Gomez1;~Benedikt_H\u00f6ltgen1;~Sebastian_Farquhar1;~Yarin_Gal1", "gender": "M;;M;F;;Unspecified;M;;;", "homepage": "https://www.soren-mindermann.com/;https://mrinanksharma.github.io/;;https://winniexu.ca;https://www.blackhc.net;http://gom.ai;https://fm.ls/ben;https://sebastianfarquhar.com/;http://www.cs.ox.ac.uk/people/yarin.gal/website//;https://oatml.cs.ox.ac.uk//members/muhammed_razzak/", "dblp": "211/7976;254/2914;271/0265;285/6560;56/2914-2;;;215/5432;67/9076;296/3692", "google_scholar": "slBPlrQAAAAJ;https://scholar.google.co.uk/citations?user=5gslw-MAAAAJ;https://scholar.google.de/citations?user=tNZUnjcAAAAJ;k4l-zNYAAAAJ;WYQVZpYAAAAJ;https://scholar.google.ca/citations?user=2oq9614AAAAJ;;bvShhTEAAAAJ;https://scholar.google.co.uk/citations?user=SIayDoQAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-0315-9821;0000-0002-4304-7963;0000-0002-1588-5724;;0000-0001-8244-7700;;;;;", "linkedin": ";;;https://linkedin.com/in/winnie-xu;blackhc;;benedikt-h%C3%B6ltgen-b010aa203/;;;", "or_profile": "~S\u00f6ren_Mindermann1;~Mrinank_Sharma1;~Jan_M._Brauner1;~Winnie_Xu1;~Andreas_Kirsch1;~Aidan_Gomez1;~Benedikt_H\u00f6ltgen1;~Sebastian_Farquhar1;~Yarin_Gal1;~Muhammed_T_Razzak1", "aff": "University of Oxford;University of Oxford;University of Oxford;University of Toronto;University of Oxford;;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of Oxford;University of Oxford;University of Oxford", "aff_domain": "ox.ac.uk;ox.ac.uk;ox.ac.uk;utoronto.ca;ox.ac.uk;;uni-tuebingen.de;ox.ac.uk;ox.ac.uk;ox.ac.uk", "position": "PhD student;PhD student;PhD student;Undergrad student;PhD student;;PhD student;PhD student;Associate Professor;PhD student", "bibtex": "@misc{\nmindermann2022prioritized,\ntitle={Prioritized training on points that are learnable, worth learning, and not yet learned},\nauthor={S{\\\"o}ren Mindermann and Muhammed Razzak and Mrinank Sharma and Jan M. 
Brauner and Winnie Xu and Andreas Kirsch and Aidan Gomez and Benedikt H{\\\"o}ltgen and Sebastian Farquhar and Yarin Gal},\nyear={2022},\nurl={https://openreview.net/forum?id=Y0cGpgUhSvp}\n}", "github": "", "project": "", "reviewers": "BBTj;NdhY;DHeZ;Yofy", "site": "https://openreview.net/forum?id=Y0cGpgUhSvp", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "4;2;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "43;62;37;57", "wc_summary_review": "41;37;108;23", "wc_main_review": "263;185;491;191", "wc_review": "347;284;636;271", "wc_reply_reviewers": "168;70;175;0", "wc_reply_authors": "2079;1909;2505;1230", "reply_reviewers": "1;1;1;0", "reply_authors": "4;4;4;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 49.75, 10.133484099755622 ], "wc_summary_review_avg": [ 52.25, 32.87381176559846 ], "wc_main_review_avg": [ 282.5, 124.2286198909092 ], "wc_review_avg": [ 384.5, 148.02111335887187 ], "wc_reply_reviewers_avg": [ 103.25, 72.64081153181041 ], "wc_reply_authors_avg": [ 1930.75, 459.1472394559288 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 0.8660254037844386 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5784378723216835078&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0;1;0;2;0;0;0", "aff_unique_norm": "University of Oxford;University of Toronto;Eberhard Karls University of T\u00fcbingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://www.utoronto.ca;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Oxford;U of T;Uni T\u00fcbingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;0;0;1;0;2;0;0;0", "aff_country_unique": "United Kingdom;Canada;Germany" }, { "id": "Y1O-K5itG09", "title": "Deep Ensemble as a Gaussian Process Posterior", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Ensemble (DE) is a flexible, feasible, and effective alternative to Bayesian neural networks (BNNs) for uncertainty estimation in deep learning. However, DE is broadly criticized for lacking a proper Bayesian justification. Some attempts try to fix this issue, while they are typically coupled with a regression likelihood or rely on restrictive assumptions. In this work, we propose to define a Gaussian process (GP) approximate posterior with the ensemble members, based on which we perform variational inference directly in the function space. We further develop a function-space posterior regularization mechanism to properly incorporate prior knowledge. We demonstrate the algorithmic benefits of variational inference in the GP family, and provide strategies to make the training feasible. As a result, our method consumes only marginally added training cost than the standard Deep Ensemble. 
Empirically, our approach achieves better uncertainty estimation than the existing Deep Ensemble and its variants across diverse scenarios.", "keywords": "Deep ensemble;Bayesian deep learning;Gaussian process;functional variational inference;uncertainty estimation", "primary_area": "", "supplementary_material": "/attachment/8f7f3c85a2ea215242582ad322f428847a288a28.zip", "author": "Zhijie Deng;Feng Zhou;Jianfei Chen;Guoqiang Wu;Jun Zhu", "authorids": "~Zhijie_Deng1;~Feng_Zhou9;~Jianfei_Chen1;~Guoqiang_Wu2;~Jun_Zhu2", "gender": "M;;M;M;M", "homepage": "https://thudzj.github.io/;;http://ml.cs.tsinghua.edu.cn/~jianfei;https://guoqiangwoodrowwu.github.io/;http://ml.cs.tsinghua.edu.cn/~jun", "dblp": "209/4959;;48/6809-1;98/4857;50/2644-1", "google_scholar": "J3dR0sUAAAAJ;;di5RZ1MAAAAJ;KCTX-_0AAAAJ;axsP38wAAAAJ", "orcid": "0000-0002-0932-1631;;;0000-0003-4486-7944;", "linkedin": ";;;;", "or_profile": "~Zhijie_Deng1;~Feng_Zhou9;~Jianfei_Chen1;~Guoqiang_Wu2;~Jun_Zhu2", "aff": "Tsinghua University;;Tsinghua University;Shandong University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;tsinghua.edu.cn;sdu.edu.cn;mail.tsinghua.edu.cn", "position": "PhD student;;Assistant Professor;Associate Professor;Professor", "bibtex": "@misc{\ndeng2022deep,\ntitle={Deep Ensemble as a Gaussian Process Posterior},\nauthor={Zhijie Deng and Feng Zhou and Jianfei Chen and Guoqiang Wu and Jun Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=Y1O-K5itG09}\n}", "github": "", "project": "", "reviewers": "hBTZ;Bycy;zZzc;uL2a;5XQ4", "site": "https://openreview.net/forum?id=Y1O-K5itG09", "pdf_size": 0, "recommendation": "5;5;5;5;8", "confidence": "4;3;3;4;4", "correctness": "3;3;3;4;4", "technical_novelty": "2;2;2;3;4", "empirical_novelty": "2;2;0;3;3", "wc_summary_paper": "73;90;123;33;71", "wc_summary_review": "88;61;57;51;34", "wc_main_review": "497;947;442;591;112", "wc_review": "658;1098;622;675;217", "wc_reply_reviewers": "0;72;35;877;19", "wc_reply_authors": "601;1642;199;1535;564", "reply_reviewers": "0;1;1;2;1", "reply_authors": "1;3;2;4;1", "recommendation_avg": [ 5.6, 1.2 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 78.0, 29.216433731720237 ], "wc_summary_review_avg": [ 58.2, 17.52027397046062 ], "wc_main_review_avg": [ 517.8, 268.4856793201455 ], "wc_review_avg": [ 654.0, 279.13652573606345 ], "wc_reply_reviewers_avg": [ 200.6, 339.0271965491854 ], "wc_reply_authors_avg": [ 908.2, 573.9600682974383 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 2.2, 1.16619037896906 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.408248290463863, "corr_recommendation_correctness": 0.6123724356957946, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_fLCukE7Bk8J:scholar.google.com/&scioq=Deep+Ensemble+as+a+Gaussian+Process+Posterior&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tsinghua University;Shandong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.sdu.edu.cn", "aff_unique_abbr": "THU;SDU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "Y2eS8eWCsyG", "title": "A Broad Dataset is All You Need for One-Shot Object Detection", "track": "main", "status": 
"Reject", "tldr": "", "abstract": "Is it possible to detect arbitrary objects from a single example? A central problem of all existing attempts at one-shot object detection is the generalization gap: Object categories used during training are detected much more reliably than novel ones. We here show that this generalization gap can be nearly closed by increasing the number of object categories used during training. Doing so allows us to improve generalization from seen to unseen classes from 45% to 89% and improve the state-of-the-art on COCO by 5.4 AP50 (from 22.0 to 27.5). \nWe verify that the effect is caused by the number of categories and not the number of training samples, and that it holds for different models, backbones and datasets. This result suggests that the key to strong few-shot detection models may not lie in sophisticated metric learning approaches, but instead simply in scaling the number of categories. We hope that our findings will help to better understand the challenges of few-shot learning and encourage future data annotation efforts to focus on wider datasets with a broader set of categories rather than gathering more samples per category.", "keywords": "One-Shot Learning;Object Detection;Generalization;Instance Segmentation", "primary_area": "", "supplementary_material": "", "author": "Claudio Michaelis;Matthias Bethge;Alexander S Ecker", "authorids": "~Claudio_Michaelis1;~Matthias_Bethge1;~Alexander_S_Ecker1", "gender": ";M;M", "homepage": ";https://bethgelab.org;http://eckerlab.org", "dblp": "217/2962;77/3005;26/7228", "google_scholar": "https://scholar.google.de/citations?hl=de;https://scholar.google.com/citations?hl=en;VgYU_m8AAAAJ", "orcid": ";;0000-0003-2392-5105", "linkedin": ";;alexecker/", "or_profile": "~Claudio_Michaelis1;~Matthias_Bethge1;~Alexander_S_Ecker1", "aff": "University of Tuebingen;University of Tuebingen;Max Planck Institute for Dynamics and Self-Organization", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de;ds.mpg.de", "position": "PhD student;Full Professor;Principal Researcher", "bibtex": "@misc{\nmichaelis2022a,\ntitle={A Broad Dataset is All You Need for One-Shot Object Detection},\nauthor={Claudio Michaelis and Matthias Bethge and Alexander S Ecker},\nyear={2022},\nurl={https://openreview.net/forum?id=Y2eS8eWCsyG}\n}", "github": "", "project": "", "reviewers": "wPts;b9Xw;djNN;fCN3", "site": "https://openreview.net/forum?id=Y2eS8eWCsyG", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;2;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "83;85;70;140", "wc_summary_review": "21;43;47;31", "wc_main_review": "515;201;120;270", "wc_review": "619;329;237;441", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 94.5, 26.893307717720408 ], "wc_summary_review_avg": [ 35.5, 10.23474474522936 ], "wc_main_review_avg": [ 276.5, 147.5779455067728 ], "wc_review_avg": [ 406.5, 142.3753841083493 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 
0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1613537824128137212&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Tuebingen;Max Planck Institute for Dynamics and Self-Organization", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.mpids.org", "aff_unique_abbr": "Uni T\u00fcbingen;MPIDS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "Y3cm4HJ3Ncs", "title": "Learning-to-Count by Learning-to-Rank: Weakly Supervised Object Counting & Localization Using Only Pairwise Image Rankings", "track": "main", "status": "Reject", "tldr": "", "abstract": "Object counting and localization in dense scenes is a challenging class of image analysis problems that typically requires labour intensive annotations to learn to solve. We propose a form of weak supervision that only requires object-based pairwise image rankings. These annotations can be collected rapidly with a single click per image pair and supply a weak signal for object quantity. However, the problem of actually extracting object counts and locations from rankings is challenging. Thus, we introduce adversarial density map generation, a strategy for regularizing the features of a ranking network such that the features correspond to an object proposal map where each proposal must be a Gaussian blob that integrates to 1. This places a soft integer and soft localization constraint on the representation, which encourages the network to satisfy the provided ranking constraints by detecting objects. We then demonstrate the effectiveness of our method for exploiting pairwise image rankings as a weakly supervised signal for object counting and localization on several datasets, and show results with a performance that approaches that of fully supervised methods on many counting benchmark datasets while relying on data that can be collected with a fraction of the annotation burden.", "keywords": "Object Counting;Weak Supervision;Ranking", "primary_area": "", "supplementary_material": "/attachment/d97592f494881d3f369a9162015ca85039f5de55.zip", "author": "Adriano C. D'Alessandro;Ali Mahdavi Amiri;Ghassan Hamarneh", "authorids": "~Adriano_C._D'Alessandro1;~Ali_Mahdavi_Amiri1;~Ghassan_Hamarneh1", "gender": "M;M;M", "homepage": "https://adrian-dalessandro.github.io/;https://www.sfu.ca/~amahdavi;http://www.medicalimageanalysis.com", "dblp": "358/6034;33/10499.html;h/GhassanHamarneh", "google_scholar": "FvPiYsEAAAAJ;https://scholar.google.ca/citations?user=M9eTADwAAAAJ;https://scholar.google.ca/citations?user=61DdlkAAAAAJ", "orcid": "0009-0004-1791-8843;;0000-0001-5040-7448", "linkedin": ";;ghassanhamarneh/", "or_profile": "~Adriano_C._D'Alessandro1;~Ali_Mahdavi_Amiri1;~Ghassan_Hamarneh1", "aff": "Simon Fraser University;Simon Fraser University;Simon Fraser University", "aff_domain": "sfu.ca;sfu.ca;sfu.ca", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nd'alessandro2022learningtocount,\ntitle={Learning-to-Count by Learning-to-Rank: Weakly Supervised Object Counting \\& Localization Using Only Pairwise Image Rankings},\nauthor={Adriano C. 
D'Alessandro and Ali Mahdavi Amiri and Ghassan Hamarneh},\nyear={2022},\nurl={https://openreview.net/forum?id=Y3cm4HJ3Ncs}\n}", "github": "", "project": "", "reviewers": "yR54;MGT7;GhpV", "site": "https://openreview.net/forum?id=Y3cm4HJ3Ncs", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "5;4;4", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;2", "wc_summary_paper": "57;49;72", "wc_summary_review": "56;23;239", "wc_main_review": "476;345;274", "wc_review": "589;417;585", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "811;812;688", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 59.333333333333336, 9.533566430716727 ], "wc_summary_review_avg": [ 106.0, 95.00526301210897 ], "wc_main_review_avg": [ 365.0, 83.6699866539171 ], "wc_review_avg": [ 530.3333333333334, 80.15540461434206 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 770.3333333333334, 58.219889690341695 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2vWD_fh84CEJ:scholar.google.com/&scioq=Learning-to-Count+by+Learning-to-Rank:+Weakly+Supervised+Object+Counting+%26+Localization+Using+Only+Pairwise+Image+Rankings&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Simon Fraser University", "aff_unique_dep": "", "aff_unique_url": "https://www.sfu.ca", "aff_unique_abbr": "SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Pessimistic Bootstrapping for Uncertainty-Driven Offline Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6030", "id": "Y4cs1Z3HnqL", "poster": "", "openreview": "https://openreview.net/forum?id=Y4cs1Z3HnqL", "slides": "https://iclr.cc/virtual/2022/poster/6030", "video": "https://iclr.cc/virtual/2022/poster/6030", "author_site": "Chenjia Bai, Lingxiao Wang, Zhuoran Yang, Zhihong Deng, Animesh Garg, Peng Liu, Zhaoran Wang", "tldr": "", "abstract": "Offline Reinforcement Learning (RL) aims to learn policies from previously collected datasets without exploring the environment. Directly applying off-policy algorithms to offline RL usually fails due to the extrapolation error caused by the out-of-distribution (OOD) actions. Previous methods tackle such problem by penalizing the Q-values of OOD actions or constraining the trained policy to be close to the behavior policy. Nevertheless, such methods typically prevent the generalization of value functions beyond the offline data and also lack precise characterization of OOD data. In this paper, we propose Pessimistic Bootstrapping for offline RL (PBRL), a purely uncertainty-driven offline algorithm without explicit policy constraints. Specifically, PBRL conducts uncertainty quantification via the disagreement of bootstrapped Q-functions, and performs pessimistic updates by penalizing the value function based on the estimated uncertainty. 
To tackle the extrapolating error, we further propose a novel OOD sampling method. We show that such OOD sampling and pessimistic bootstrapping yields provable uncertainty quantifier in linear MDPs, thus providing the theoretical underpinning for PBRL. Extensive experiments on D4RL benchmark show that PBRL has better performance compared to the state-of-the-art algorithms.", "keywords": "Pessimistic Bootstrapping;Bootstrapped Q-functions;Uncertainty Estimation;Offline Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Chenjia Bai;Lingxiao Wang;Zhuoran Yang;Zhi-Hong Deng;Animesh Garg;Peng Liu;Zhaoran Wang", "authorids": "~Chenjia_Bai2;~Lingxiao_Wang6;~Zhuoran_Yang1;~Zhi-Hong_Deng2;~Animesh_Garg1;~Peng_Liu5;~Zhaoran_Wang1", "gender": "M;M;M;;M;M;Not Specified", "homepage": "https://baichenjia.github.io/;;https://zhuoranyang.github.io/;https://familyld.github.io/;http://animesh.garg.tech;https://homepage.hit.edu.cn/liupeng;https://zhaoranwang.github.io/", "dblp": "247/1943;140/1229;;;123/5728;21/6121-8;117/2756", "google_scholar": "Rm_1y2kAAAAJ;;;e8D8_NwAAAAJ;zp8V7ZMAAAAJ;;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ", "orcid": ";;;;0000-0003-0482-4296;;", "linkedin": ";;;;animeshgarg/;;", "or_profile": "~Chenjia_Bai2;~Lingxiao_Wang6;~Zhuoran_Yang1;~Zhi-Hong_Deng2;~Animesh_Garg1;~Peng_Liu5;~Zhaoran_Wang1", "aff": "Harbin institute of technology;Northwestern University;University of California, Berkeley;University of Technology Sydney;University of Toronto;Harbin Institute of Technology;", "aff_domain": "hit.edu.cn;northwestern.edu;berkeley.edu;uts.edu.au;toronto.edu;hit.edu.cn;", "position": "PhD student;PhD student;Postdoc;PhD student;Assistant Professor;Professor;", "bibtex": "@inproceedings{\nbai2022pessimistic,\ntitle={Pessimistic Bootstrapping for Uncertainty-Driven Offline Reinforcement Learning},\nauthor={Chenjia Bai and Lingxiao Wang and Zhuoran Yang and Zhi-Hong Deng and Animesh Garg and Peng Liu and Zhaoran Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Y4cs1Z3HnqL}\n}", "github": "", "project": "", "reviewers": "73UJ;wgNz;hc93;HkNt", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "130;76;49;100", "wc_summary_review": "25;88;22;18", "wc_main_review": "328;337;296;150", "wc_review": "483;501;367;268", "wc_reply_reviewers": "16;245;0;282", "wc_reply_authors": "1204;1922;1000;1052", "reply_reviewers": "1;2;0;3", "reply_authors": "2;5;2;4", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 88.75, 29.877876430563134 ], "wc_summary_review_avg": [ 38.25, 28.830322578840494 ], "wc_main_review_avg": [ 277.75, 75.31392633504112 ], "wc_review_avg": [ 404.75, 94.22413438180263 ], "wc_reply_reviewers_avg": [ 135.75, 128.54255132056466 ], "wc_reply_authors_avg": [ 1294.5, 369.96047086141516 ], "reply_reviewers_avg": [ 1.5, 1.118033988749895 ], "reply_authors_avg": [ 3.25, 1.299038105676658 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 191, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=8122293342821829012&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Y4cs1Z3HnqL", "email": "hit.edu.cn;northwestern.edu;berkeley.edu;uts.edu.au;toronto.edu;hit.edu.cn;", "author_num": 7, "aff_unique_index": "0;1;2;3;4;0", "aff_unique_norm": "Harbin Institute of Technology;Northwestern University;University of California, Berkeley;University of Technology Sydney;University of Toronto", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.hit.edu.cn/;https://www.northwestern.edu;https://www.berkeley.edu;https://www.uts.edu.au;https://www.utoronto.ca", "aff_unique_abbr": "HIT;NU;UC Berkeley;UTS;U of T", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Harbin;;Berkeley", "aff_country_unique_index": "0;1;1;2;3;0", "aff_country_unique": "China;United States;Australia;Canada" }, { "id": "Y77aWEc17ln", "title": "Neural Shape Mating: Self-Supervised Object Assembly with Adversarial Shape Priors", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning to autonomously assemble shapes is a crucial skill for many robotic applications. Whereas the majority of existing part assembly methods focus on correctly posing semantic parts to recreate a whole object, we interpret assembly more literally: as mating geometric parts together to achieve a snug fit. By focusing on shape alignment, rather than semantic cues, we can achieve across category generalization and scaling. In this paper, we introduce a novel task, pairwise 3D geometric shape assembly, and propose Neural Shape Mating (NSM) to tackle this problem. Given point clouds of two object parts, NSM learns to reason about their geometric structure and fit in order to predict a pair of 3D poses that tightly mate them together. In addition, we couple the training of NSM with an implicit shape reconstruction task, making NSM more robust to imperfect point cloud observations. To train NSM, we present a self-supervised data collection pipeline that generates pairwise shape assembly data with ground truth by randomly cutting an object mesh into two parts, resulting in a dataset that consists of 19,226 shape assembly pairs with numerous object meshes and diverse cut types. We train NSM on the collected dataset and compare it with several point cloud registration methods and one part assembly baseline approach. Extensive experimental results and ablation studies under various settings demonstrate the effectiveness of the proposed algorithm. 
Additional material available at: neural-shape-mating.github.io.", "keywords": "Geometric Shape Assembly;Shape Matching;Pose Estimation;Implicit Representations", "primary_area": "", "supplementary_material": "", "author": "Yun-Chun Chen;Haoda Li;Dylan Turpin;Alec Jacobson;Animesh Garg", "authorids": "~Yun-Chun_Chen1;~Haoda_Li1;~Dylan_Turpin1;~Alec_Jacobson1;~Animesh_Garg1", "gender": "M;M;;M;M", "homepage": "https://yunchunchen.github.io;https://haoda-li.github.io/;http://www.cs.toronto.edu/~dylanturpin/;http://www.cs.toronto.edu/~jacobson/;http://animesh.garg.tech", "dblp": "214/6606;;;33/8698.html;123/5728", "google_scholar": "https://scholar.google.com/citations?hl=en;e1Bif78AAAAJ;;https://scholar.google.ca/citations?user=lSJavJUAAAAJ;zp8V7ZMAAAAJ", "orcid": ";0000-0001-6845-4437;;0000-0003-4603-7143;0000-0003-0482-4296", "linkedin": ";haoda-li-515b26165/;;;animeshgarg/", "or_profile": "~Yun-Chun_Chen1;~Haoda_Li1;~Dylan_Turpin1;~Alec_Jacobson1;~Animesh_Garg1", "aff": "University of Toronto;Toronto University;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;University of Toronto", "aff_domain": "cs.toronto.edu;utoronto.ca;cs.toronto.edu;cs.toronto.edu;toronto.edu", "position": "PhD student;Undergrad student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nchen2022neural,\ntitle={Neural Shape Mating: Self-Supervised Object Assembly with Adversarial Shape Priors},\nauthor={Yun-Chun Chen and Haoda Li and Dylan Turpin and Alec Jacobson and Animesh Garg},\nyear={2022},\nurl={https://openreview.net/forum?id=Y77aWEc17ln}\n}", "github": "", "project": "", "reviewers": "GPbx;jDm2;qchj;3quw", "site": "https://openreview.net/forum?id=Y77aWEc17ln", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "33;89;72;155", "wc_summary_review": "52;70;67;85", "wc_main_review": "727;557;564;504", "wc_review": "812;716;703;744", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 87.25, 44.070256409510485 ], "wc_summary_review_avg": [ 68.5, 11.715374513859981 ], "wc_main_review_avg": [ 588.0, 83.5374167663808 ], "wc_review_avg": [ 743.75, 42.097357399247755 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5168476826085235286&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "id": "Y8Ivdg7typR", "title": "Wakening Past Concepts without Past Data: Class-incremental Learning from Placebos", "track": "main", "status": "Reject", "tldr": "", "abstract": "Not forgetting 
knowledge about previous classes is one of the key challenges in class-incremental learning (CIL). A common technique to address this challenge is knowledge distillation (KD), which penalizes inconsistencies across models of subsequent phases. As old-class data is scarce, the KD loss mainly uses new-class data. However, we empirically observe that this both harms the learning of new classes and fails to effectively distill old-class knowledge from the previous-phase model. To address this issue, we propose to compute the KD loss using placebo data chosen from a free image stream (e.g., Google Images), which is both simple and surprisingly effective even when there is no class overlap between the placebos and the old data. When the image stream is available, we use an evaluation function to quickly judge the quality of candidate images (good or bad placebos) and collect good ones. For training this function, we sample pseudo CIL tasks from the data in the 0-th phase and design a reinforcement learning algorithm. Our method does not require any additional supervision or memory budget, and can significantly improve a number of top-performing CIL methods, in particular on higher-resolution benchmarks, e.g., ImageNet-1k and ImageNet-Subset, and with a lower memory budget for old-class exemplars, e.g., five exemplars per class.", "keywords": "incremental learning;continual learning;class-incremental learning", "primary_area": "", "supplementary_material": "", "author": "Yaoyao Liu;Bernt Schiele;Qianru Sun", "authorids": "~Yaoyao_Liu1;~Bernt_Schiele1;~Qianru_Sun2", "gender": ";M;F", "homepage": "https://yaoyaoliu.web.illinois.edu/;http://www.mpi-inf.mpg.de/~schiele;https://qianrusun.com/", "dblp": "12/10033-1;s/BerntSchiele;127/6132.html", "google_scholar": "Qi2PSmEAAAAJ;https://scholar.google.de/citations?user=z76PBfYAAAAJ;https://scholar.google.de/citations?user=fNfrGMIAAAAJ", "orcid": "0000-0002-5316-3028;0000-0001-9683-5237;0000-0003-2689-317X", "linkedin": ";;", "or_profile": "~Yaoyao_Liu1;~Bernt_Schiele1;~Qianru_Sun2", "aff": "Max Planck Institute for Informatics;Amazon;Singapore Management University", "aff_domain": "mpi-inf.mpg.de;amazon.com;smu.edu.sg", "position": "PhD student;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nliu2022wakening,\ntitle={Wakening Past Concepts without Past Data: Class-incremental Learning from Placebos},\nauthor={Yaoyao Liu and Bernt Schiele and Qianru Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=Y8Ivdg7typR}\n}", "github": "", "project": "", "reviewers": "9D1K;Dwjy;4jmy;vLx7", "site": "https://openreview.net/forum?id=Y8Ivdg7typR", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "3;1;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "68;60;40;70", "wc_summary_review": "59;43;33;38", "wc_main_review": "307;406;387;296", "wc_review": "434;509;460;404", "wc_reply_reviewers": "0;182;0;0", "wc_reply_authors": "358;2063;603;968", "reply_reviewers": "0;1;0;0", "reply_authors": "1;4;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 59.5, 11.863810517704673 ], "wc_summary_review_avg": [ 43.25, 9.756408150543928 ], "wc_main_review_avg": [ 349.0, 48.130032204435516 ], "wc_review_avg": [ 451.75, 38.53813046840752 ], "wc_reply_reviewers_avg": [ 45.5,
78.80831174438391 ], "wc_reply_authors_avg": [ 998.0, 652.0640305982228 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13826390472753680766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;2", "aff_unique_norm": "Max Planck Institute for Informatics;Amazon;Singapore Management University", "aff_unique_dep": ";Amazon.com, Inc.;", "aff_unique_url": "https://mpi-inf.mpg.de;https://www.amazon.com;https://www.smu.edu.sg", "aff_unique_abbr": "MPII;Amazon;SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Germany;United States;Singapore" }, { "id": "Y8KfxdZl-rI", "title": "Weakly Supervised Label Learning Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "Supervised learning usually requires a large amount of labelled data. However, attaining ground-truth labels is costly for many tasks. Alternatively, weakly supervised methods learn with only cheap weak signals that only approximately label some data. Many existing weakly supervised learning methods learn a deterministic function that estimates labels given the input data and weak signals. In this paper, we develop label learning flow (LLF), a general framework for weakly supervised learning problems. Our method is a generative model based on normalizing flows. The main idea of LLF is to optimize the conditional likelihoods of all possible labelings of the data within a constrained space defined by weak signals. We develop a training method for LLF that trains the conditional flow inversely and avoids estimating the labels. Once a model is trained, we can make predictions with a sampling algorithm. We apply LLF to three weakly supervised learning problems. 
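As a concrete illustration of the constrained-likelihood idea above, here is a minimal sketch (our own, not the authors' implementation): a single label-conditioned affine flow stands in for LLF's conditional flow, the labeling is kept as a soft probability q, and the weak signals are assumed to take the form of per-example interval constraints [weak_lo, weak_hi] enforced by a penalty.

```python
import math
import torch
import torch.nn as nn

class ConditionalAffineFlow(nn.Module):
    """A single label-conditioned affine transform with a standard-normal
    base density; log p(x|y) follows from the change-of-variables formula."""
    def __init__(self, x_dim, y_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(y_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 2 * x_dim))

    def log_prob(self, x, y):
        shift, log_scale = self.net(y).chunk(2, dim=-1)
        z = (x - shift) * torch.exp(-log_scale)          # inverse transform
        log_base = -0.5 * (z ** 2).sum(-1) - 0.5 * x.shape[-1] * math.log(2 * math.pi)
        return log_base - log_scale.sum(-1)              # Jacobian correction

def llf_style_loss(flow, x, q_logits, weak_lo, weak_hi, lam=10.0):
    """Expected conditional log-likelihood under a soft binary labeling q,
    plus a penalty keeping q inside the weak-signal interval [weak_lo, weak_hi].
    (The interval form of the weak signals is our assumption.)"""
    q = torch.sigmoid(q_logits)                          # P(y=1|x), shape (B,)
    y1 = x.new_tensor([0.0, 1.0]).expand(x.shape[0], 2)  # one-hot labels
    y0 = x.new_tensor([1.0, 0.0]).expand(x.shape[0], 2)
    exp_ll = q * flow.log_prob(x, y1) + (1 - q) * flow.log_prob(x, y0)
    violation = torch.relu(weak_lo - q) + torch.relu(q - weak_hi)
    return -exp_ll.mean() + lam * violation.mean()
```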
Experiment results show that our method outperforms many state-of-the-art alternatives.", "keywords": "Weakly Supervised Learning;Deep Generative Flows;Deep Learning;Deep Generative Models;Machine Learning", "primary_area": "", "supplementary_material": "", "author": "You Lu;Chidubem Gibson Arachie;Bert Huang", "authorids": "~You_Lu1;~Chidubem_Gibson_Arachie1;~Bert_Huang1", "gender": "M;M;M", "homepage": ";http://people.cs.vt.edu/achid17/;http://berthuang.com", "dblp": "48/7828-3;220/4292;93/10793", "google_scholar": "TNrf52wAAAAJ;cAFyusAAAAAJ;TrqegZIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~You_Lu1;~Chidubem_Gibson_Arachie1;~Bert_Huang1", "aff": "Motional;Virginia Tech;Tufts University", "aff_domain": "motional.com;vt.edu;tufts.edu", "position": "Researcher;PhD student;Tufts University", "bibtex": "@misc{\nlu2022weakly,\ntitle={Weakly Supervised Label Learning Flows},\nauthor={You Lu and Chidubem Gibson Arachie and Bert Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=Y8KfxdZl-rI}\n}", "github": "", "project": "", "reviewers": "W3Vi;fLEx;tWTj;ii99;mY6j", "site": "https://openreview.net/forum?id=Y8KfxdZl-rI", "pdf_size": 0, "recommendation": "5;5;6;6;8", "confidence": "3;4;4;3;4", "correctness": "2;2;3;3;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;2;4;3;3", "wc_summary_paper": "172;103;95;89;39", "wc_summary_review": "162;95;21;19;38", "wc_main_review": "1170;723;184;210;189", "wc_review": "1504;921;300;318;266", "wc_reply_reviewers": "391;1477;0;0;0", "wc_reply_authors": "1619;1557;286;267;219", "reply_reviewers": "2;3;0;0;0", "reply_authors": "4;3;1;1;1", "recommendation_avg": [ 6.0, 1.0954451150103321 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 99.6, 42.56571390215369 ], "wc_summary_review_avg": [ 67.0, 54.89990892524322 ], "wc_main_review_avg": [ 495.2, 394.76342282435434 ], "wc_review_avg": [ 661.8, 486.2593546657997 ], "wc_reply_reviewers_avg": [ 373.6, 572.1057944121874 ], "wc_reply_authors_avg": [ 789.6, 652.5511780695825 ], "reply_reviewers_avg": [ 1.0, 1.2649110640673518 ], "reply_authors_avg": [ 2.0, 1.2649110640673518 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.372677996249965, "corr_recommendation_correctness": 0.9759000729485333, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;2", "aff_unique_norm": "Motional;Virginia Tech;Tufts University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.motional.com;https://www.vt.edu;https://www.tufts.edu", "aff_unique_abbr": "Motional;VT;Tufts", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Y9FNtYulBE0", "title": "CheXT: Knowledge-Guided Cross-Attention Transformer for Abnormality Classification and Localization in Chest X-rays", "track": "main", "status": "Reject", "tldr": "", "abstract": "Classical chest X-ray analysis has designed radiomic features to indicate the characteristics of abnormality of the chest X-rays. However, extracting reliable radiomic features heavily hinges on pathology localization, which is often absent in real-world image data. 
Although the past decade has witnessed the promising performance of convolutional neural networks (CNNs) in analyzing chest X-rays, most of them ignored domain knowledge such as radiomics. Recently, the surge of Transformers in computer vision has suggested a promising substitute for CNNs. They can encode highly expressive and generalizable representations and avoid costly manual annotations via a unique implementation of the self-attention mechanism. Moreover, Transformers naturally suit feature extraction and fusion across different input modalities. Inspired by their recent success, this paper proposes \textbf{CheXT}, the first Transformer-based chest X-ray model. CheXT targets (semi-supervised) abnormality classification and localization from chest X-rays, enhanced by baked-in auxiliary knowledge guidance using radiomics. Specifically, CheXT consists of an image branch and a radiomics branch that interact through cross-attention layers. During training, the image branch leverages its learned attention to estimate pathology localization, which is then utilized to extract radiomic features from images in the radiomics branch. Therefore, the two branches in CheXT are deeply fused and constitute an end-to-end optimization loop that can bootstrap accurate pathology localization from image data without any bounding box used for training. Extensive experiments on the NIH chest X-ray dataset demonstrate that CheXT significantly outperforms existing baselines in disease classification (by 1.1\% in average AUCs) and localization (by a \textbf{significant average margin of 3.6\%} over different IoU thresholds). Codes and models will be publicly released.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yan Han;Ying Ding;Ahmed Tewfik;Yifan Peng;Zhangyang Wang", "authorids": "~Yan_Han2;~Ying_Ding4;~Ahmed_Tewfik1;~Yifan_Peng5;~Zhangyang_Wang1", "gender": "M;F;M;M;M", "homepage": "https://yannhan.github.io;https://yingding.ischool.utexas.edu/;;https://vivo.weill.cornell.edu/display/cwid-yip4002;https://vita-group.github.io", "dblp": "79/4311-1.html;38/6013-1.html;;;119/4026", "google_scholar": "swtJHJEAAAAJ;riuIGwIAAAAJ;h_33tYMAAAAJ;IwGLficAAAAJ;pxFyKAIAAAAJ", "orcid": "0000-0001-7164-2295;;;0000-0001-9309-8331;", "linkedin": ";ying-ding-6a63bb/;;;", "or_profile": "~Yan_Han2;~Ying_Ding4;~Ahmed_Tewfik1;~Yifan_Peng5;~Zhangyang_Wang1", "aff": "Amazon;University of Texas, Austin;University of Texas at Austin;Weill Cornell Medicine, Cornell University;University of Texas, Austin", "aff_domain": "amazon.com;utexas.edu;utexas.edu;med.cornell.edu;utexas.edu", "position": "Intern;Full Professor;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@misc{\nhan2022chext,\ntitle={Che{XT}: Knowledge-Guided Cross-Attention Transformer for Abnormality Classification and Localization in Chest X-rays},\nauthor={Yan Han and Ying Ding and Ahmed Tewfik and Yifan Peng and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=Y9FNtYulBE0}\n}", "github": "", "project": "", "reviewers": "ZMa3;LbVM;euAn", "site": "https://openreview.net/forum?id=Y9FNtYulBE0", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "5;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "92;21;57", "wc_summary_review": "86;38;57", "wc_main_review": "392;283;536", "wc_review": "570;342;650", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "652;350;415", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "recommendation_avg": [
5.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 56.666666666666664, 28.986586936412888 ], "wc_summary_review_avg": [ 60.333333333333336, 19.73716179078328 ], "wc_main_review_avg": [ 403.6666666666667, 103.61574311957725 ], "wc_review_avg": [ 520.6666666666666, 130.48967604969965 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 472.3333333333333, 129.7852927808934 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8yjuMKkxTVcJ:scholar.google.com/&scioq=CheXT:+Knowledge-Guided+Cross-Attention+Transformer+for+Abnormality+Classification+and+Localization+in+Chest+X-rays&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Amazon;University of Texas at Austin;Cornell University", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.utexas.edu;https://www.weill.cornell.edu", "aff_unique_abbr": "Amazon;UT Austin;Cornell", "aff_campus_unique_index": "1;1;2;1", "aff_campus_unique": ";Austin;Weill Cornell Medicine", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "YDqIYJBQTQs", "title": "Unsupervised Object Learning via Common Fate", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning generative object models from unlabelled videos is a long standing problem and is required for causal scene modeling. We decompose this problem into three easier subtasks, and provide candidate solutions for each of them. Inspired by the Common Fate Principle of Gestalt Psychology, we first extract (noisy) masks of moving objects via unsupervised motion segmentation. Second, generative models are trained on the masks of the background and the moving objects, respectively. Third, background and foreground models are combined in a conditional ``dead leaves scene model to sample novel scene configurations where occlusions and depth layering arise naturally. To evaluate the individual stages, we introduce the Fishbowl dataset positioned between complex real-world scenes and common object-centric benchmarks of simplistic objects. 
We show that our approach allows learning generative models that generalize beyond the occlusions present in the input videos, and represent scenes in a modular fashion that allows sampling plausible scenes outside the training distribution by permitting, for instance, object numbers or densities not observed in the training set.", "keywords": "object learning;scene modeling;scene generation;causal modeling;causal representation learning;generative modeling;common fate", "primary_area": "", "supplementary_material": "", "author": "Matthias Tangemann;Steffen Schneider;Julius Von K\u00fcgelgen;Francesco Locatello;Peter Vincent Gehler;Thomas Brox;Matthias Kuemmerer;Matthias Bethge;Bernhard Sch\u00f6lkopf", "authorids": "~Matthias_Tangemann1;~Steffen_Schneider1;~Julius_Von_K\u00fcgelgen1;~Francesco_Locatello1;~Peter_Vincent_Gehler1;~Thomas_Brox1;~Matthias_Kuemmerer1;~Matthias_Bethge1;~Bernhard_Sch\u00f6lkopf1", "gender": "M;;M;M;;M;;M;", "homepage": ";https://stes.io;https://sites.google.com/view/julius-von-kuegelgen/home;https://twitter.com/FrancescoLocat8;;https://lmb.informatik.uni-freiburg.de/people/brox/index.en.html;;https://bethgelab.org;", "dblp": "277/6683;16/8643.html;223/5666;195/6074;;97/4586;151/6291.html;77/3005;", "google_scholar": ";https://scholar.google.de/citations?user=KR5dj44AAAAJ;6EOl3hAAAAAJ;;;https://scholar.google.com/citations?hl=de;https://scholar.google.de/citations?user=y5Ej2qYAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0001-9734-8692;0000-0003-2327-6459;0000-0001-6469-4118;;;0000-0002-6282-8861;0000-0001-9644-4703;;", "linkedin": ";https://linkedin.com/in/steffen-schneider;julius-von-k%C3%BCgelgen/;;;;;;", "or_profile": "~Matthias_Tangemann1;~Steffen_Schneider1;~Julius_Von_K\u00fcgelgen1;~Francesco_Locatello1;~Peter_Vincent_Gehler1;~Thomas_Brox1;~Matthias_Kuemmerer1;~Matthias_Bethge1;~Bernhard_Sch\u00f6lkopf1", "aff": "University of T\u00fcbingen;Meta;, Max Planck Institute for Intelligent Systems;Amazon;;University of Freiburg;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of Tuebingen;", "aff_domain": "uni-tuebingen.de;meta.com;is.tuebingen.mpg.de;amazon.com;;uni-freiburg.de;uni-tuebingen.de;uni-tuebingen.de;", "position": "PhD student;Intern;PhD student;Senior Applied Scientist;;Full Professor;Postdoc;Full Professor;", "bibtex": "@misc{\ntangemann2022unsupervised,\ntitle={Unsupervised Object Learning via Common Fate},\nauthor={Matthias Tangemann and Steffen Schneider and Julius Von K{\\\"u}gelgen and Francesco Locatello and Peter Vincent Gehler and Thomas Brox and Matthias Kuemmerer and Matthias Bethge and Bernhard Sch{\\\"o}lkopf},\nyear={2022},\nurl={https://openreview.net/forum?id=YDqIYJBQTQs}\n}", "github": "", "project": "", "reviewers": "dAqW;GUfQ;3iwv", "site": "https://openreview.net/forum?id=YDqIYJBQTQs", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;3", "correctness": "2;2;3", "technical_novelty": "1;3;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "52;57;154", "wc_summary_review": "31;44;121", "wc_main_review": "149;422;481", "wc_review": "232;523;756", "wc_reply_reviewers": "54;475;65", "wc_reply_authors": "314;1256;587", "reply_reviewers": "1;2;1", "reply_authors": "2;2;2", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 
87.66666666666667, 46.949145063805176 ], "wc_summary_review_avg": [ 65.33333333333333, 39.71845358967989 ], "wc_main_review_avg": [ 350.6666666666667, 144.6198080792838 ], "wc_review_avg": [ 503.6666666666667, 214.35847441973354 ], "wc_reply_reviewers_avg": [ 198.0, 195.92005172178438 ], "wc_reply_authors_avg": [ 719.0, 395.73475965601 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4030686879805286993&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;3;4;5;6", "aff_unique_norm": "University of T\u00fcbingen;Meta;Max Planck Institute for Intelligent Systems;Amazon;University of Freiburg;Eberhard Karls University of T\u00fcbingen;University of Tuebingen", "aff_unique_dep": ";Meta Platforms, Inc.;;Amazon.com, Inc.;;;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://meta.com;https://www.mpi-is.mpg.de;https://www.amazon.com;https://www.uni-freiburg.de;https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Meta;MPI-IS;Amazon;UoF;Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;1;0;1;0;0;0", "aff_country_unique": "Germany;United States" }, { "id": "YDud6vPh2V", "title": "Xi-learning: Successor Feature Transfer Learning for General Reward Functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transfer in Reinforcement Learning aims to improve learning performance on target tasks using knowledge from experienced source tasks. Successor features (SF) are a prominent transfer mechanism in domains where the reward function changes between tasks. They reevaluate the expected return of previously learned policies in a new target task in order to transfer their knowledge. A limiting factor of the SF framework is its assumption that rewards linearly decompose into successor features and a reward weight vector. We propose a novel SF mechanism, $\\xi$-learning, based on learning the cumulative discounted probability of successor features. Crucially, $\\xi$-learning makes it possible to reevaluate the expected return of policies for general reward functions. We introduce two $\\xi$-learning variations, prove their convergence, and provide a guarantee on their transfer performance.
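The re-evaluation this enables can be sketched as follows (our own simplification, not the paper's algorithm: we assume the learned $\\xi$-function has been discretized into a per-policy table over feature bins, and we act by generalized policy improvement):

```python
import numpy as np

def gpi_action(xi_tables, reward_per_bin, state):
    """Generalized policy improvement with xi-tables: re-evaluate every
    stored source policy under a new reward defined on discretized
    features, then act greedily across policies and actions."""
    best_q, best_action = -np.inf, 0
    for xi in xi_tables:                   # xi: (n_states, n_actions, n_bins)
        # Q^pi(s, a) = sum_phi xi^pi(s, a, phi) * R(phi); unlike classic SFs,
        # R need not be linear in the features themselves.
        q_sa = xi[state] @ reward_per_bin  # shape (n_actions,)
        a = int(np.argmax(q_sa))
        if q_sa[a] > best_q:
            best_q, best_action = q_sa[a], a
    return best_action
```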
Experimental evaluations based on $\\xi$-learning with function approximation demonstrate the prominent advantage of $\\xi$-learning over available mechanisms not only for general reward functions, but also in the case of linearly decomposable reward functions.", "keywords": "reinforcement learning;transfer learning;meta learning;successor features", "primary_area": "", "supplementary_material": "", "author": "Chris Reinke;Xavier Alameda-Pineda", "authorids": "~Chris_Reinke1;~Xavier_Alameda-Pineda1", "gender": ";M", "homepage": ";http://xavirema.eu", "dblp": ";22/10486", "google_scholar": ";https://scholar.google.fr/citations?user=ukI2bz8AAAAJ", "orcid": ";0000-0002-5354-1084", "linkedin": ";xavier-alameda-pineda-4a47271a/", "or_profile": "~Chris_Reinke1;~Xavier_Alameda-Pineda1", "aff": "INRIA;INRIA", "aff_domain": "inria.fr;inria.fr", "position": "Postdoc;Researcher", "bibtex": "@misc{\nreinke2022xilearning,\ntitle={Xi-learning: Successor Feature Transfer Learning for General Reward Functions},\nauthor={Chris Reinke and Xavier Alameda-Pineda},\nyear={2022},\nurl={https://openreview.net/forum?id=YDud6vPh2V}\n}", "github": "", "project": "", "reviewers": "Ccnn;wZjF;wA2K;UrNN", "site": "https://openreview.net/forum?id=YDud6vPh2V", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;3;3", "correctness": "4;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "74;128;68;86", "wc_summary_review": "76;55;77;38", "wc_main_review": "592;321;429;149", "wc_review": "742;504;574;273", "wc_reply_reviewers": "173;0;0;0", "wc_reply_authors": "1123;582;995;623", "reply_reviewers": "1;0;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 89.0, 23.430749027719962 ], "wc_summary_review_avg": [ 61.5, 16.163229875244614 ], "wc_main_review_avg": [ 372.75, 161.22713016114875 ], "wc_review_avg": [ 523.25, 168.39147098353882 ], "wc_reply_reviewers_avg": [ 43.25, 74.91119742735394 ], "wc_reply_authors_avg": [ 830.75, 233.144134603468 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=368889743250818259&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "YHm6xV3JODS", "title": "Stop just recalling memorized relations: Extracting Unseen Relational Triples from the context", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The ability to extract entities and their relations from unstructured text is essential for automated maintenance of large-scale knowledge graphs. To keep a knowledge graph up-to-date, it is required of an extractor to possess not only the ability to recall the triples encountered during training, but also the triples it has never seen before. 
In this paper, we show that although existing extraction models are able to memorize and recall already seen triples, they cannot generalize effectively for unseen triples. This alarming observation was previously unknown due to the composition of the test sets of the go-to benchmark datasets, which turns out to contain only 2\\% unseen data, rendering them incapable to measure the generalization performance. To combat memorization and promote generalization, we present a simple yet effective noising framework that can be combined with existing models. By carefully noising the entities and their surrounding context, we refrain the model from simply memorizing the entities and their context, and promote generalization. To properly evaluate the generalization performance, we propose test set augmentation and train set sifting to emphasize unseen data. Experiments show that our model not only outperforms the current state-of-the-art in terms of generalization on the newly augmented unseen test data, but is also able to retain its memorization capabilities - achieving competitive results on the standard test data.", "keywords": "Relational Triple Extraction;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Juhyuk Lee;Min-Joong Lee;June Yong Yang;Eunho Yang", "authorids": "~Juhyuk_Lee1;~Min-Joong_Lee1;~June_Yong_Yang1;~Eunho_Yang1", "gender": "M;M;;M", "homepage": ";;http://mli.kaist.ac.kr/people/;https://sites.google.com/site/hleehome2/", "dblp": ";90/9436;277/5624;96/2621", "google_scholar": "https://scholar.google.co.kr/citations?user=InhSbJAAAAAJ;ZzFeopcAAAAJ;nkLNWg0AAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Juhyuk_Lee1;~Min-Joong_Lee1;~June_Yong_Yang1;~Eunho_Yang1", "aff": "Samsung Research;;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "samsung.com;;kaist.ac.kr;kaist.ac.kr", "position": "Researcher;;PhD student;Associate Professor", "bibtex": "@misc{\nlee2022stop,\ntitle={Stop just recalling memorized relations: Extracting Unseen Relational Triples from the context},\nauthor={Juhyuk Lee and Min-Joong Lee and June Yong Yang and Eunho Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=YHm6xV3JODS}\n}", "github": "", "project": "", "reviewers": "1CnV;fH2Q;MfiT;54pq", "site": "https://openreview.net/forum?id=YHm6xV3JODS", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "2;1;4;4", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "95;109;168;246", "wc_summary_review": "30;43;93;105", "wc_main_review": "360;624;319;194", "wc_review": "485;776;580;545", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.299038105676658 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 154.5, 59.508402768012516 ], "wc_summary_review_avg": [ 67.75, 31.869852525545205 ], "wc_main_review_avg": [ 374.25, 156.6211591707838 ], "wc_review_avg": [ 596.5, 109.0607628801486 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 
0.9622504486493763, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pDAf5j6ZmoUJ:scholar.google.com/&scioq=Stop+just+recalling+memorized+relations:+Extracting+Unseen+Relational+Triples+from+the+context&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Samsung;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Samsung Research;", "aff_unique_url": "https://research.samsung.com;https://www.kaist.ac.kr", "aff_unique_abbr": "Samsung;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Reinforcement Learning with Sparse Rewards using Guidance from Offline Demonstration", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6479", "id": "YJ1WzgMVsMt", "poster": "", "openreview": "https://openreview.net/forum?id=YJ1WzgMVsMt", "slides": "https://iclr.cc/virtual/2022/poster/6479", "video": "https://iclr.cc/virtual/2022/poster/6479", "author_site": "Desik Rengarajan, Gargi Vaidya, Akshay Sarvesh, Dileep Kalathil, Srinivas Shakkottai", "tldr": "", "abstract": "A major challenge in real-world reinforcement learning (RL) is the sparsity of reward feedback. Often, what is available is an intuitive but sparse reward function that only indicates whether the task is completed partially or fully. However, the lack of carefully designed, fine grain feedback implies that most existing RL algorithms fail to learn an acceptable policy in a reasonable time frame. This is because of the large number of exploration actions that the policy has to perform before it gets any useful feedback that it can learn from. In this work, we address this challenging problem by developing an algorithm that exploits the offline demonstration data generated by {a sub-optimal behavior policy} for faster and efficient online RL in such sparse reward settings. The proposed algorithm, which we call the Learning Online with Guidance Offline (LOGO) algorithm, merges a policy improvement step with an additional policy guidance step by using the offline demonstration data. The key idea is that by obtaining guidance from - not imitating - the offline {data}, LOGO orients its policy in the manner of the sub-optimal {policy}, while yet being able to learn beyond and approach optimality. We provide a theoretical analysis of our algorithm, and provide a lower bound on the performance improvement in each learning episode. We also extend our algorithm to the even more challenging incomplete observation setting, where the demonstration data contains only a censored version of the true state observation. We demonstrate the superior performance of our algorithm over state-of-the-art approaches on a number of benchmark environments with sparse rewards {and censored state}. 
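Schematically, the merged improvement and guidance steps can be written as one loss (our own simplification; the paper builds on a trust-region policy-gradient method rather than this vanilla surrogate, and policy.log_prob is a hypothetical helper):

```python
import torch

def logo_style_loss(policy, online_batch, demo_batch, advantages, beta):
    """One combined update: a policy-gradient improvement term on online data
    plus a guidance term pulling the policy toward demonstration actions.
    Decaying beta over training lets the policy move beyond the demonstrator."""
    logp = policy.log_prob(online_batch["obs"], online_batch["act"])
    improvement = -(logp * advantages.detach()).mean()   # standard PG surrogate
    guidance = -policy.log_prob(demo_batch["obs"], demo_batch["act"]).mean()
    return improvement + beta * guidance
```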
Further, we demonstrate the value of our approach via implementing LOGO on a mobile robot for trajectory tracking and obstacle avoidance, where it shows excellent performance.", "keywords": "Reinforcement Learning;Sparse Rewards;Learning from Demonstrations", "primary_area": "", "supplementary_material": "/attachment/e9a8a1a43323e97b23825086eb46414675c9d0b4.zip", "author": "Desik Rengarajan;Gargi Vaidya;Akshay Sarvesh;Dileep Kalathil;Srinivas Shakkottai", "authorids": "~Desik_Rengarajan1;gargivaidya@tamu.edu;~Akshay_Sarvesh1;~Dileep_Kalathil1;~Srinivas_Shakkottai1", "gender": "M;;;M;", "homepage": "https://sites.google.com/view/desik-rengarajan/home;;https://akshaysarvesh25.github.io/;http://people.tamu.edu/~dileep.kalathil/;https://cesg.tamu.edu/faculty/sshakkot/", "dblp": "218/1345;;;44/8356;03/353.html", "google_scholar": "ygOY_E4AAAAJ;;;S24XFwwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-8538-6023;;;;0000-0002-5882-6433", "linkedin": "desik-rengarajan-109868100/;;;;", "or_profile": "~Desik_Rengarajan1;gargivaidya@tamu.edu;~Akshay_Sarvesh1;~Dileep_Kalathil1;~Srinivas_Shakkottai1", "aff": "Texas A&M University;;Texas A&M;Texas A&M University;Texas A&M", "aff_domain": "tamu.edu;;tamu.edu;tamu.edu;tamu.edu", "position": "PhD student;;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nrengarajan2022reinforcement,\ntitle={Reinforcement Learning with Sparse Rewards using Guidance from Offline Demonstration},\nauthor={Desik Rengarajan and Gargi Vaidya and Akshay Sarvesh and Dileep Kalathil and Srinivas Shakkottai},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YJ1WzgMVsMt}\n}", "github": "", "project": "", "reviewers": "FWYZ;KhX3;DXFH;FZbN;DFHZ", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "4;4;3;4;4", "correctness": "4;3;4;4;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "3;2;2;4;3", "wc_summary_paper": "90;112;60;109;82", "wc_summary_review": "59;73;91;68;91", "wc_main_review": "613;680;318;821;159", "wc_review": "762;865;469;998;332", "wc_reply_reviewers": "0;112;0;100;0", "wc_reply_authors": "882;722;254;778;479", "reply_reviewers": "0;1;0;1;0", "reply_authors": "2;1;1;2;1", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 90.6, 19.011575421305828 ], "wc_summary_review_avg": [ 76.4, 12.737346662472527 ], "wc_main_review_avg": [ 518.2, 243.29356752696935 ], "wc_review_avg": [ 685.2, 248.01725746407243 ], "wc_reply_reviewers_avg": [ 42.4, 52.067648304873536 ], "wc_reply_authors_avg": [ 623.0, 227.07003324965626 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.16666666666666663, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6148886566095169606&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=YJ1WzgMVsMt", "email": "tamu.edu;;tamu.edu;tamu.edu;tamu.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "YJVMboHZCtW", "title": "Decision boundary variability and generalization in neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing works suggest that the generalizability is guaranteed when the margin between data and decision boundaries is sufficiently large. However, the existence of adversarial examples in neural networks shows that excellent generalization and small margin can exist simultaneously, which casts shadows to the current understanding. This paper discovers that the neural network with lower decision boundary (DB) variability has better generalizability. Two new notions, algorithm DB variability and $(\\epsilon, \\eta)$-data DB variability, are proposed to measure the decision boundary variability from the algorithm and data perspectives. Extensive experiments show significant negative correlations between the decision boundary variability and the generalizability. From the theoretical view, we prove two lower bounds and two upper bounds of the generalization error based on the decision boundary variability, which is consistent with our empirical results. Moreover, the bounds do not explicitly depend on the network size, which is usually prohibitively large in deep learning.", "keywords": "explainability of deep learning", "primary_area": "", "supplementary_material": "", "author": "Shiye Lei;Fengxiang He;Yancheng Yuan;Dacheng Tao", "authorids": "~Shiye_Lei1;~Fengxiang_He1;~Yancheng_Yuan1;~Dacheng_Tao1", "gender": ";;;", "homepage": ";https://fengxianghe.github.io/;;", "dblp": ";225/4682;;", "google_scholar": ";QSx-Yu0AAAAJ;;", "orcid": ";;;", "linkedin": ";fengxiang-he-35b173122;;", "or_profile": "~Shiye_Lei1;~Fengxiang_He1;~Yancheng_Yuan1;~Dacheng_Tao1", "aff": ";JD.com, Inc.;;", "aff_domain": ";jd.com;;", "position": ";Algorithm Scientist;;", "bibtex": "@misc{\nlei2022decision,\ntitle={Decision boundary variability and generalization in neural networks},\nauthor={Shiye Lei and Fengxiang He and Yancheng Yuan and Dacheng Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=YJVMboHZCtW}\n}", "github": "", "project": "", "reviewers": "kiZ8;3dWe;pU8o;fXBi", "site": "https://openreview.net/forum?id=YJVMboHZCtW", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;3;3", "correctness": "4;3;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "33;66;68;55", "wc_summary_review": "36;33;41;36", "wc_main_review": "614;141;504;125", "wc_review": "683;240;613;216", "wc_reply_reviewers": "344;117;217;14", "wc_reply_authors": "2052;1278;2043;385", "reply_reviewers": "4;2;1;1", "reply_authors": "7;4;4;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 55.5, 13.901438774457844 ], "wc_summary_review_avg": [ 36.5, 2.8722813232690143 ], "wc_main_review_avg": [ 346.0, 216.59524463847308 ], "wc_review_avg": [ 438.0, 211.62348640923577 ], "wc_reply_reviewers_avg": [ 173.0, 122.05941176328845 ], "wc_reply_authors_avg": [ 1439.5, 685.0950663958981 ], "reply_reviewers_avg": [ 2.0, 1.224744871391589 ], "reply_authors_avg": [ 4.25, 1.7853571071357126 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 
-0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XE2fo676ZegJ:scholar.google.com/&scioq=Decision+boundary+variability+and+generalization+in+neural+networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "JD.com", "aff_unique_dep": "", "aff_unique_url": "https://www.jd.com", "aff_unique_abbr": "JD.com", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "YKAVWfKSKU", "title": "Deep Dirichlet Process Mixture Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper we propose the deep Dirichlet process mixture (DDPM) model, which is an unsupervised method that simultaneously performs clustering and feature learning. As a member of the Bayesian nonparametrics family, the traditional Dirichlet process mixture model is able to adapt the number of its mixture components. However its modelling capacity is restricted since the clustering is performed in the raw feature space, rendering it inapplicable to complex domains like images and texts. Our method alleviates this limitation by using the flow-based generative model, which is a deep invertible neural network, to learn more expressive features. These two seemly orthogonal models are unified by the Monte Carlo expectation-maximization algorithm, and during its iterations Gibbs sampling is used to generate samples from the posterior. This combination allows our method to exploit the mutually beneficial relation between clustering and feature learning. We conducted comparison experiments on four clustering benchmark datasets. The clustering performance of DDPM shows a significant gain over DPM in most cases and is competitive compared to other popular methods. 
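The alternation described above can be sketched as follows (a heavily simplified illustration of ours: unit-variance Gaussian clusters and a plain Chinese-restaurant-process Gibbs sweep stand in for the full posterior updates; the omitted M-step would refit the flow to the data given these assignments):

```python
import numpy as np

def crp_gibbs_sweep(z, assign, alpha=1.0):
    """One Gibbs sweep over cluster assignments of flow latents z (N, D),
    under a CRP prior with concentration alpha; unit-variance Gaussian
    clusters are an assumption made to keep the sketch short."""
    for i in range(len(z)):
        assign[i] = -1                                    # remove point i
        labels, counts = np.unique(assign[assign >= 0], return_counts=True)
        logp = [np.log(c) - 0.5 * np.sum((z[i] - z[assign == k].mean(axis=0)) ** 2)
                for k, c in zip(labels, counts)]          # existing clusters
        logp.append(np.log(alpha) - 0.5 * np.sum(z[i] ** 2))  # open a new one
        logp = np.array(logp)
        p = np.exp(logp - logp.max())
        p /= p.sum()
        j = np.random.choice(len(p), p=p)
        assign[i] = labels[j] if j < len(labels) else assign.max() + 1
    return assign
```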
Furthermore, the learned representation of DDPM is shown to be efficient and universal to boost other methods' performance.", "keywords": "Dirichlet process;Bayesian nonparametrics;flow-based generative model;clustering", "primary_area": "", "supplementary_material": "", "author": "Naiqi Li;Wenjie Li;Yong Jiang;Shu-Tao Xia", "authorids": "~Naiqi_Li1;~Wenjie_Li3;~Yong_Jiang3;~Shu-Tao_Xia1", "gender": "M;M;M;M", "homepage": "https://naiqili.github.io/;https://still2009.github.io;;https://www.sigs.tsinghua.edu.cn/xst/list.htm", "dblp": "117/4912;33/3999-8;74/1552-1.html;03/6195", "google_scholar": "5K2l_wUAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.hk/citations?user=koAXTXgAAAAJ", "orcid": ";;;0000-0002-8639-982X", "linkedin": ";;;", "or_profile": "~Naiqi_Li1;~Wenjie_Li3;~Yong_Jiang3;~Shu-Tao_Xia1", "aff": "Tsinghua University;Tencent;Tsinghua University;Shenzhen International Graduate School, Tsinghua University", "aff_domain": "tsinghua.edu.cn;tencent.com;tsinghua.edu.cn;sz.tsinghua.edu.cn", "position": "PhD student;Research Intern;Full Professor;Full Professor", "bibtex": "@misc{\nli2022deep,\ntitle={Deep Dirichlet Process Mixture Models},\nauthor={Naiqi Li and Wenjie Li and Yong Jiang and Shu-Tao Xia},\nyear={2022},\nurl={https://openreview.net/forum?id=YKAVWfKSKU}\n}", "github": "", "project": "", "reviewers": "oyrV;iduD;sHc8;96ak", "site": "https://openreview.net/forum?id=YKAVWfKSKU", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "43;120;52;48", "wc_summary_review": "77;29;19;82", "wc_main_review": "583;315;207;241", "wc_review": "703;464;278;371", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.75, 31.483130403439873 ], "wc_summary_review_avg": [ 51.75, 28.030117730755254 ], "wc_main_review_avg": [ 336.5, 147.5762514769907 ], "wc_review_avg": [ 454.0, 158.08700136317344 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17936712230563689136&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Tsinghua University;Tencent", "aff_unique_dep": ";Tencent Holdings Limited", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.tencent.com", "aff_unique_abbr": "THU;Tencent", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "YLglAn-USkf", "title": "Are BERT Families Zero-Shot Learners? A Study on Their Potential and Limitations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Starting from the resurgence of deep learning, language models (LMs) have never been so popular. 
Through simply increasing model scale and data size, large LMs pre-trained with self-supervision objectives demonstrate awe-inspiring results on both task performance and generalization. At the early stage, supervised fine-tuning is indispensable in adapting pre-trained language models (PLMs) to downstream tasks. Later on, the sustained growth of model capacity and data size, as well as newly presented pre-training techniques, make the PLMs perform well under the few-shot setting, especially in the recent paradigm of prompt-based learning. After witnessing the success of PLMs for few-shot tasks, we propose to further study the potential and limitations of PLMs for the zero-shot setting. We utilize three models from the popular BERT family to conduct an empirical study on 20 different datasets. We are surprised to find that a simple Multi-Null Prompting (without manually/automatically created prompts) strategy can yield very promising results on a few widely-used datasets, e.g., $86.59\\%(\\pm0.59)$ accuracy on the IMDB dataset, and $86.22\\%(\\pm2.71)$ accuracy on the Amazon dataset, outperforming manually created prompts without engineering in both accuracy and stability; the latter achieve only $74.06\\%(\\pm13.04)$ and $75.54\\%(\\pm11.77)$ accuracy for comparison. However, we also observe some limitations of PLMs under the zero-shot setting, particularly for the language understanding tasks (e.g., GLUE).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue Wang;Lijun Wu;xiaobo liang;Juntao Li;Min Zhang", "authorids": "~Yue_Wang17;~Lijun_Wu1;~xiaobo_liang1;~Juntao_Li2;~Min_Zhang1", "gender": "M;M;M;M;", "homepage": "https://wangyuenlp.github.io/;https://apeterswu.github.io/;;https://lijuntaopku.github.io/;", "dblp": "33/4822-39.html;68/1284-3;;;83/5342", "google_scholar": ";https://scholar.google.com/citations?hl=en;;sZSygsYAAAAJ;", "orcid": ";0000-0002-3530-590X;0009-0001-1550-2877;0000-0002-6286-7529;", "linkedin": ";lijun-wu-59340478/;;;", "or_profile": "~Yue_Wang17;~Lijun_Wu1;~xiaobo_liang1;~Juntao_Li2;~Min_Zhang1", "aff": "Soochow University, China;Microsoft Research;Soochow University, China;Soochow University, China;", "aff_domain": "suda.edu.cn;microsoft.com;suda.edu.cn;suda.edu.cn;", "position": "PhD student;Researcher;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nwang2022are,\ntitle={Are {BERT} Families Zero-Shot Learners?
A Study on Their Potential and Limitations},\nauthor={Yue Wang and Lijun Wu and xiaobo liang and Juntao Li and Min Zhang},\nbooktitle={Submitted to The Tenth International Conference on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=YLglAn-USkf},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "B5Rv;9k3X;S18q;sxHN", "site": "https://openreview.net/forum?id=YLglAn-USkf", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "949;60;159;102", "wc_summary_review": "66;63;53;35", "wc_main_review": "379;387;416;112", "wc_review": "1394;510;628;249", "wc_reply_reviewers": "0;217;111;0", "wc_reply_authors": "1637;1802;980;480", "reply_reviewers": "0;1;1;0", "reply_authors": "3;3;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 317.5, 366.2857491085341 ], "wc_summary_review_avg": [ 54.25, 12.111461513789324 ], "wc_main_review_avg": [ 323.5, 122.88307450580817 ], "wc_review_avg": [ 695.25, 426.0958665605664 ], "wc_reply_reviewers_avg": [ 82.0, 90.15819430312477 ], "wc_reply_authors_avg": [ 1224.75, 528.6120387391873 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:A63VO7DGqakJ:scholar.google.com/&scioq=Are+BERT+Families+Zero-Shot+Learners%3F+A+Study+on+Their+Potential+and+Limitations&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Soochow University;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.soochow.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Soochow U;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "YRDlrT00BP", "title": "On Transportation of Mini-batches: A Hierarchical Approach", "track": "main", "status": "Reject", "tldr": "", "abstract": "Mini-batch optimal transport (m-OT) has been successfully used in practical applications that involve probability measures with a very high number of supports. The m-OT solves several smaller optimal transport problems and then returns the average of their costs and transportation plans. Despite its scalability advantage, the m-OT does not consider the relationship between mini-batches which leads to undesirable estimation. Moreover, the m-OT does not approximate a proper metric between probability measures since the identity property is not satisfied. To address these problems, we propose a novel mini-batching scheme for optimal transport, named Batch of Mini-batches Optimal Transport (BoMb-OT), that finds the optimal coupling between mini-batches and it can be seen as an approximation to a well-defined distance on the space of probability measures. Furthermore, we show that the m-OT is a limit of the entropic regularized version of the BoMb-OT when the regularized parameter goes to infinity. 
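The contrast between m-OT and BoMb-OT can be sketched with the POT library (our own illustration, assuming uniform weights and a squared-Euclidean ground cost; the entropic-regularized variant mentioned above would swap ot.emd2 for a Sinkhorn solver):

```python
import numpy as np
import ot  # POT: Python Optimal Transport

def mot_cost(batches_x, batches_y):
    """m-OT: average the OT costs of independently paired mini-batches."""
    costs = [ot.emd2(np.full(len(bx), 1 / len(bx)), np.full(len(by), 1 / len(by)),
                     ot.dist(bx, by))
             for bx, by in zip(batches_x, batches_y)]
    return float(np.mean(costs))

def bomb_ot_cost(batches_x, batches_y):
    """BoMb-OT: an outer OT problem over mini-batches whose ground cost is
    the inner OT cost, i.e. an optimal coupling of mini-batches rather than
    a plain average over arbitrary pairings."""
    k, m = len(batches_x), len(batches_y)
    C = np.zeros((k, m))
    for i, bx in enumerate(batches_x):
        for j, by in enumerate(batches_y):
            a = np.full(len(bx), 1 / len(bx))
            b = np.full(len(by), 1 / len(by))
            C[i, j] = ot.emd2(a, b, ot.dist(bx, by))
    return float(ot.emd2(np.full(k, 1 / k), np.full(m, 1 / m), C))
```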
Finally, we present the new algorithms of the BoMb-OT in various applications, such as deep generative models and deep domain adaptation. From extensive experiments, we observe that the BoMb-OT achieves a favorable performance in deep learning models such as deep generative models and deep domain adaptation. In other applications such as approximate Bayesian computation, color transfer, and gradient flow, the BoMb-OT also yields either a lower quantitative result or a better qualitative result than the m-OT.", "keywords": "Deep Generative Models;Deep Domain Adaptation;Color Transfer;Approximate Bayesian Computation;Gradient Flow;Optimal Transport", "primary_area": "", "supplementary_material": "/attachment/044b29486bd4b44b5f58c45c92e1769b8b2550d1.zip", "author": "Khai Nguyen;Dang Nguyen;Nguyen Dinh Quoc;Tung Pham;Hung Bui;Dinh Phung;Trung Le;Nhat Ho", "authorids": "~Khai_Nguyen1;~Dang_Nguyen2;~Nguyen_Dinh_Quoc1;~Tung_Pham1;v.hungbh1@vinai.io;~Dinh_Phung2;~Trung_Le2;~Nhat_Ho1", "gender": "M;M;M;M;;;M;M", "homepage": "https://khainb.com;https://hsgser.github.io/;;;;;;https://nhatptnk8912.github.io/", "dblp": "120/4308;;;38/10862-1;;;;203/4479", "google_scholar": "im5fNaQAAAAJ;https://scholar.google.co.jp/citations?user=WIqAtrcAAAAJ;https://scholar.google.com.vn/citations?user=nGMSVVAAAAAJ;KcUuEKsAAAAJ;;;https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ", "orcid": ";;;;;;;", "linkedin": ";dang-nguyen-50b7a7a0/;dinh-quoc-nguyen-7ab95a146/;;;;;nhat-pham-minh-ho-267b8164/", "or_profile": "~Khai_Nguyen1;~Dang_Nguyen2;~Nguyen_Dinh_Quoc1;~Tung_Pham1;v.hungbh1@vinai.io;~Dinh_Phung2;~Trung_Le2;~Nhat_Ho1", "aff": "University of Texas, Austin;;VinAI;VinAI Research;;;Monash University;University of Texas, Austin", "aff_domain": "utexas.edu;;vinai.io;vinai.io;;;monash.edu;utexas.edu", "position": "PhD student;;AI Resident;Researcher;;;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nnguyen2022on,\ntitle={On Transportation of Mini-batches: A Hierarchical Approach},\nauthor={Khai Nguyen and Dang Nguyen and Nguyen Dinh Quoc and Tung Pham and Hung Bui and Dinh Phung and Trung Le and Nhat Ho},\nyear={2022},\nurl={https://openreview.net/forum?id=YRDlrT00BP}\n}", "github": "", "project": "", "reviewers": "ScyT;9B4c;C31a", "site": "https://openreview.net/forum?id=YRDlrT00BP", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;3;3", "correctness": "3;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "60;19;93", "wc_summary_review": "84;11;26", "wc_main_review": "664;293;240", "wc_review": "808;323;359", "wc_reply_reviewers": "608;0;0", "wc_reply_authors": "2872;1785;788", "reply_reviewers": "4;0;0", "reply_authors": "8;4;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 57.333333333333336, 30.26916289265731 ], "wc_summary_review_avg": [ 40.333333333333336, 31.478387647541428 ], "wc_main_review_avg": [ 399.0, 188.62838245255315 ], "wc_review_avg": [ 496.6666666666667, 220.63594952369442 ], "wc_reply_reviewers_avg": [ 202.66666666666666, 286.6139486409473 ], "wc_reply_authors_avg": [ 1815.0, 851.0538565018471 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "reply_authors_avg": [ 
4.666666666666667, 2.494438257849294 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9449111825230683, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5470461083042798465&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Texas at Austin;VinAI;VinAI Research;Monash University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.utexas.edu;https://www.vinai.co;https://www.vinai.io/;https://www.monash.edu", "aff_unique_abbr": "UT Austin;VinAI;VinAI;Monash", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;1;2;0", "aff_country_unique": "United States;Vietnam;Australia" }, { "title": "A Relational Intervention Approach for Unsupervised Dynamics Generalization in Model-Based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6135", "id": "YRq0ZUnzKoZ", "poster": "", "openreview": "https://openreview.net/forum?id=YRq0ZUnzKoZ", "slides": "https://iclr.cc/virtual/2022/poster/6135", "video": "https://iclr.cc/virtual/2022/poster/6135", "author_site": "Jiaxian Guo, Mingming Gong, Dacheng Tao", "tldr": "", "abstract": "The generalization of model-based reinforcement learning (MBRL) methods to environments with unseen transition dynamics is an important yet challenging problem.\nExisting methods try to extract environment-specified information $Z$ from past transition segments to make the dynamics prediction model generalizable to different dynamics. However, because environments are not labelled, the extracted information inevitably contains redundant information unrelated to the dynamics in transition segments and thus fails to maintain a crucial property of $Z$: $Z$ should be similar in the same environment and dissimilar in different ones. As a result, the learned dynamics prediction function will deviate from the true one, which undermines the generalization ability. To tackle this problem, we introduce an interventional prediction module to estimate the probability of two estimated $\\hat{z}_i, \\hat{z}_j$ belonging to the same environment.\nFurthermore, by utilizing the $Z$'s invariance within a single environment, a relational head is proposed to enforce the similarity between $\\hat{{Z}}$ from the same environment. As a result, the redundant information will be reduced in $\\hat{Z}$. We empirically show that $\\hat{{Z}}$ estimated by our method enjoy less redundant information than previous methods, and such $\\hat{{Z}}$ can significantly reduce dynamics prediction errors and improve the performance of model-based RL methods on zero-shot new environments with unseen dynamics. 
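The relational head can be sketched as a contrastive objective (our own illustration; treating two segments drawn from the same trajectory as a positive pair is our assumption, a common proxy when environment labels are unavailable):

```python
import torch
import torch.nn.functional as F

def relational_loss(z_a, z_b, temperature=0.1):
    """InfoNCE-style relational head: z_a[i] and z_b[i] are context estimates
    from the same trajectory (positives); all other rows act as negatives,
    which squeezes environment-irrelevant information out of z."""
    z_a = F.normalize(z_a, dim=-1)
    z_b = F.normalize(z_b, dim=-1)
    logits = z_a @ z_b.t() / temperature               # (B, B) similarities
    targets = torch.arange(z_a.shape[0], device=z_a.device)
    return F.cross_entropy(logits, targets)
```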
The codes of this method are available at \\url{https://github.com/CR-Gjx/RIA}.", "keywords": "Model-Based Reinforcement Learning;Unsupervised Dynamics Generalization", "primary_area": "", "supplementary_material": "", "author": "Jiaxian Guo;Mingming Gong;Dacheng Tao", "authorids": "~Jiaxian_Guo2;~Mingming_Gong1;~Dacheng_Tao1", "gender": "M;M;", "homepage": ";https://mingming-gong.github.io/;", "dblp": "206/6264;98/8479;", "google_scholar": "wQgPocEAAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;", "orcid": ";0000-0001-7147-5589;", "linkedin": ";;", "or_profile": "~Jiaxian_Guo2;~Mingming_Gong1;~Dacheng_Tao1", "aff": "University of Sydney;University of Melbourne;", "aff_domain": "sydney.edu.au;unimelb.edu.au;", "position": "PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nguo2022a,\ntitle={A Relational Intervention Approach for Unsupervised Dynamics Generalization in Model-Based Reinforcement Learning},\nauthor={Jiaxian Guo and Mingming Gong and Dacheng Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YRq0ZUnzKoZ}\n}", "github": "", "project": "", "reviewers": "q6e8;RE3u;3e12;db3k", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "2;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "278;141;140;106", "wc_summary_review": "150;72;106;18", "wc_main_review": "270;356;538;204", "wc_review": "698;569;784;328", "wc_reply_reviewers": "16;30;56;0", "wc_reply_authors": "1255;1459;1368;126", "reply_reviewers": "1;1;1;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 166.25, 66.03928754915516 ], "wc_summary_review_avg": [ 86.5, 48.257123826436235 ], "wc_main_review_avg": [ 342.0, 125.33953885346794 ], "wc_review_avg": [ 594.75, 171.97001918939242 ], "wc_reply_reviewers_avg": [ 25.5, 20.56088519495209 ], "wc_reply_authors_avg": [ 1052.0, 539.4881833738344 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16171191146892627821&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=YRq0ZUnzKoZ", "email": "sydney.edu.au;unimelb.edu.au;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Sydney;University of Melbourne", "aff_unique_dep": ";", "aff_unique_url": "https://www.sydney.edu.au;https://www.unimelb.edu.au", "aff_unique_abbr": "USYD;UniMelb", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "YTtMaJUN_uc", "title": "Learning Universal User Representations via Self-Supervised Lifelong Behaviors Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Universal user representation is an important research topic in industry, and is widely used in diverse downstream user analysis tasks, such as user profiling and user preference prediction. 
With the rapid development of Internet service platforms, extremely long user behavior sequences have been accumulated. However, existing research has little ability to model universal user representation based on lifelong behavior sequences since user registration. In this study, we propose a novel framework called Lifelong User Representation Model (LURM) to tackle this challenge. Specifically, LURM consists of two cascaded sub-models: (i) Bag of Interests (BoI) encodes user behaviors in any time period into a sparse vector with super-high dimension (e.g., 10^5); (ii) Self-supervised Multi-anchor Encoder Network (SMEN) maps sequences of BoI features to multiple low-dimensional user representations by contrastive learning. SMEN achieves almost lossless dimensionality reduction, aided mainly by a novel multi-anchor module which can learn different aspects of user preferences. Experiments on several benchmark datasets show that our approach can outperform state-of-the-art unsupervised representation methods in downstream tasks.", "keywords": "universal user representation;extremely long sequence modeling;self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Bei Yang;Ke Liu;Xiaoxiao Xu;Renjun Xu;Hong Liu;huan xu", "authorids": "~Bei_Yang1;~Ke_Liu3;~Xiaoxiao_Xu1;~Renjun_Xu1;~Hong_Liu10;~huan_xu1", "gender": ";;M;;M;M", "homepage": ";;https://orcid.org/0000-0003-0189-8601;https://person.zju.edu.cn/en/rux;https://c.liepin.com/resume/getdefaultresume/;", "dblp": ";;;269/4621;;35/2843", "google_scholar": ";;;;;", "orcid": "0000-0002-1997-234X;;0000-0003-0189-8601;0000-0002-7566-7948;;", "linkedin": ";;;;;", "or_profile": "~Bei_Yang1;~Ke_Liu3;~Xiaoxiao_Xu1;~Renjun_Xu1;~Hong_Liu10;~huan_xu1", "aff": "Alibaba Group;;;Zhejiang University;Alibaba Group;Georgia Institute of Technology", "aff_domain": "alibaba-inc.com;;;zju.edu.cn;alibaba-inc.com;gatech.edu", "position": "Researcher;;;Principal Researcher;Researcher;Assistant Professor", "bibtex": "@misc{\nyang2022learning,\ntitle={Learning Universal User Representations via Self-Supervised Lifelong Behaviors Modeling},\nauthor={Bei Yang and Ke Liu and Xiaoxiao Xu and Renjun Xu and Hong Liu and huan xu},\nyear={2022},\nurl={https://openreview.net/forum?id=YTtMaJUN_uc}\n}", "github": "", "project": "", "reviewers": "Rubm;2k8K;FiJs", "site": "https://openreview.net/forum?id=YTtMaJUN_uc", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;3", "correctness": "3;3;2", "technical_novelty": "3;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "59;65;109", "wc_summary_review": "64;23;201", "wc_main_review": "327;396;330", "wc_review": "450;484;640", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1267;1215;1307", "reply_reviewers": "0;0;0", "reply_authors": "3;2;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 77.66666666666667, 22.29100466306732 ], "wc_summary_review_avg": [ 96.0, 76.10957013849617 ], "wc_main_review_avg": [ 351.0, 31.843366656181317 ], "wc_review_avg": [ 524.6666666666666, 82.72578531241366 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1263.0, 37.66519171153476 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ],
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14821162064290577088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Alibaba Group;Zhejiang University;Georgia Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.alibaba.com;https://www.zju.edu.cn;https://www.gatech.edu", "aff_unique_abbr": "Alibaba;ZJU;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Scale Mixtures of Neural Network Gaussian Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6289", "id": "YVPBh4k78iZ", "poster": "", "openreview": "https://openreview.net/forum?id=YVPBh4k78iZ", "slides": "https://iclr.cc/virtual/2022/poster/6289", "video": "https://iclr.cc/virtual/2022/poster/6289", "author_site": "Hyungi Lee, Eunggu Yun, Hongseok Yang, Juho Lee", "tldr": "", "abstract": "Recent works have revealed that infinitely-wide feed-forward or recurrent neural networks of any architecture correspond to Gaussian processes referred to as NNGP. While these works have extended the class of neural networks converging to Gaussian processes significantly, however, there has been little focus on broadening the class of stochastic processes that such neural networks converge to. In this work, inspired by the scale mixture of Gaussian random variables, we propose the scale mixture of NNGP for which we introduce a prior distribution on the scale of the last-layer parameters. We show that simply introducing a scale prior on the last-layer parameters can turn infinitely-wide neural networks of any architecture into a richer class of stochastic processes. With certain scale priors, we obtain heavy-tailed stochastic processes, and in the case of inverse gamma priors, we recover Student\u2019s $t$ processes. We further analyze the distributions of the neural networks initialized with our prior setting and trained with gradient descents and obtain similar results as for NNGP. We present a practical posterior-inference algorithm for the scale mixture of NNGP and empirically demonstrate its usefulness on regression and classification tasks. 
In particular, we show that in both tasks, the heavy-tailed stochastic processes obtained from our framework are robust to out-of-distribution data.", "keywords": "Neural Network Gaussian Processes;Infinitely-wide Neural Networks;Scale Mixtures of Gaussians;Heavy-tailed Stochastic Processes", "primary_area": "", "supplementary_material": "/attachment/8eb4aee476f11ba80d33da315e70f386c2ad16c4.zip", "author": "Hyungi Lee;Eunggu Yun;Hongseok Yang;Juho Lee", "authorids": "~Hyungi_Lee1;~Eunggu_Yun1;~Hongseok_Yang2;~Juho_Lee2", "gender": "M;M;M;M", "homepage": ";https://juho.lee.github.io;https://yuneg11.github.io;https://sites.google.com/view/hongseokyang/home", "dblp": "221/7959;55/3410-1;;82/5808", "google_scholar": ";Py4URJUAAAAJ;r7-847MAAAAJ;cLuwH14AAAAJ", "orcid": ";;0000-0002-4648-1415;", "linkedin": "hyungi-lee-a8b161149/;;yuneg/;", "or_profile": "~Hyungi_Lee1;~Juho_Lee2;~EungGu_Yun1;~Hongseok_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Institute for Basic Science", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;ibs.re.kr", "position": "MS student;Assistant Professor;MS student;Visiting Research Fellow", "bibtex": "@inproceedings{\nlee2022scale,\ntitle={Scale Mixtures of Neural Network Gaussian Processes},\nauthor={Hyungi Lee and Eunggu Yun and Hongseok Yang and Juho Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YVPBh4k78iZ}\n}", "github": "", "project": "", "reviewers": "C7Q4;QZea;Z24X;axPe", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;2;2;5", "correctness": "4;4;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;0;0;3", "wc_summary_paper": "104;99;200;131", "wc_summary_review": "21;98;104;8", "wc_main_review": "111;189;471;68", "wc_review": "236;386;775;207", "wc_reply_reviewers": "125;34;147;0", "wc_reply_authors": "240;356;949;138", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 133.5, 40.277164746292655 ], "wc_summary_review_avg": [ 57.75, 43.54523510098436 ], "wc_main_review_avg": [ 209.75, 156.94485496504817 ], "wc_review_avg": [ 401.0, 226.36364549105494 ], "wc_reply_reviewers_avg": [ 76.5, 61.198447692731555 ], "wc_reply_authors_avg": [ 420.75, 314.58653420005123 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4856618642571828, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1361989022651133185&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=YVPBh4k78iZ", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;ibs.re.kr", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Institute for Basic Science", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.ibs.re.kr", "aff_unique_abbr": "KAIST;IBS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": 
"South Korea" }, { "id": "YVa8X_2I1b", "title": "INFERNO: Inferring Object-Centric 3D Scene Representations without Supervision", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose INFERNO, a method to infer object-centric representations of visual scenes without relying on annotations. Our method learns to decompose a scene into multiple objects, each object having a structured representation that disentangles its shape, appearance and 3D pose. To impose this structure we rely on recent advances in neural 3D rendering. Each object representation defines a localized neural radiance field that is used to generate 2D views of the scene through a differentiable rendering process. Our model is subsequently trained by minimizing a reconstruction loss between inputs and corresponding rendered scenes. We empirically show that INFERNO discovers objects in a scene without supervision. We also validate the interpretability of the learned representations by manipulating inferred scenes and showing the corresponding effect in the rendered output. Finally, we demonstrate the usefulness of our 3D object representations in a visual reasoning task using the CATER dataset.", "keywords": "object discovery;scene representation;object-centric representations;3D rendering", "primary_area": "", "supplementary_material": "/attachment/b13cbfce21ef6e4c042dd0a4701792852cd2fe21.zip", "author": "Lluis Castrejon;Nicolas Ballas;Aaron Courville", "authorids": "~Lluis_Castrejon1;~Nicolas_Ballas1;~Aaron_Courville3", "gender": ";;", "homepage": ";;", "dblp": "183/6532;120/9066;56/1688", "google_scholar": "https://scholar.google.ca/citations?user=XWhajuQAAAAJ;euUV4iUAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lluis_Castrejon1;~Nicolas_Ballas1;~Aaron_Courville3", "aff": "Universit\u00e9 de Montr\u00e9al;Meta;Universit\u00e9 de Montr\u00e9al", "aff_domain": "umontreal.ca;meta.com; ", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@misc{\ncastrejon2022inferno,\ntitle={{INFERNO}: Inferring Object-Centric 3D Scene Representations without Supervision},\nauthor={Lluis Castrejon and Nicolas Ballas and Aaron Courville},\nyear={2022},\nurl={https://openreview.net/forum?id=YVa8X_2I1b}\n}", "github": "", "project": "", "reviewers": "3rCh;G78k;k83T;noTD", "site": "https://openreview.net/forum?id=YVa8X_2I1b", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;5", "correctness": "3;3;3;2", "technical_novelty": "2;2;3;3", "empirical_novelty": "0;3;3;0", "wc_summary_paper": "165;121;112;59", "wc_summary_review": "7;17;31;29", "wc_main_review": "418;291;345;288", "wc_review": "590;429;488;376", "wc_reply_reviewers": "0;285;94;138", "wc_reply_authors": "478;852;313;352", "reply_reviewers": "0;1;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.5 ], "wc_summary_paper_avg": [ 114.25, 37.67874069020885 ], "wc_summary_review_avg": [ 21.0, 9.695359714832659 ], "wc_main_review_avg": [ 335.5, 52.75651618520692 ], "wc_review_avg": [ 470.75, 79.43354140411971 ], "wc_reply_reviewers_avg": [ 129.25, 102.81384877534738 ], "wc_reply_authors_avg": [ 498.75, 212.87011885184825 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14498468949985227877&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.umontreal.ca;https://meta.com", "aff_unique_abbr": "UdeM;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;United States" }, { "title": "Neural Structured Prediction for Inductive Node Classification", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5947", "id": "YWNAX0caEjI", "poster": "", "openreview": "https://openreview.net/forum?id=YWNAX0caEjI", "slides": "https://iclr.cc/virtual/2022/poster/5947", "video": "https://iclr.cc/virtual/2022/poster/5947", "author_site": "Meng Qu, Huiyu Cai, Jian Tang", "tldr": "", "abstract": "This paper studies node classification in the inductive setting, i.e., aiming to learn a model on labeled training graphs and generalize it to infer node labels on unlabeled test graphs. This problem has been extensively studied with graph neural networks (GNNs) by learning effective node representations, as well as traditional structured prediction methods for modeling the structured output of node labels, e.g., conditional random fields (CRFs). In this paper, we present a new approach called the Structured Proxy Network (SPN), which combines the advantages of both worlds. SPN defines flexible potential functions of CRFs with GNNs. However, learning such a model is nontrivial as it involves optimizing a maximin game with high-cost inference. Inspired by the underlying connection between joint and marginal distributions defined by Markov networks, we propose to solve an approximate version of the optimization problem as a proxy, which yields a near-optimal solution, making learning more efficient. 
Extensive experiments in two settings show that our approach outperforms many competitive baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Meng Qu;Huiyu Cai;Jian Tang", "authorids": "~Meng_Qu2;~Huiyu_Cai1;~Jian_Tang1", "gender": "M;M;", "homepage": "https://mnqu.github.io/;https://hui2000ji.github.io;http://www.jian-tang.com", "dblp": "14/8543.html;237/9501;181/2667-5", "google_scholar": "92UwQYkAAAAJ;ZQ2VZ0sAAAAJ;https://scholar.google.ca/citations?user=1ir6WUEAAAAJ", "orcid": ";0000-0001-8506-537X;", "linkedin": ";;", "or_profile": "~Meng_Qu2;~Huiyu_Cai1;~Jian_Tang1", "aff": "University of Montreal;Mila - Quebec AI Institute;Mila, HEC Montreal", "aff_domain": "umontreal.ca;mila.quebec;hec.ca", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nqu2022neural,\ntitle={Neural Structured Prediction for Inductive Node Classification},\nauthor={Meng Qu and Huiyu Cai and Jian Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YWNAX0caEjI}\n}", "github": "", "project": "", "reviewers": "jqoo;LsFg;DQ8r;RSMV", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "4;4;3;4", "correctness": "4;3;4;4", "technical_novelty": "3;3;2;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "70;144;87;157", "wc_summary_review": "38;62;58;123", "wc_main_review": "165;1045;118;241", "wc_review": "273;1251;263;521", "wc_reply_reviewers": "0;146;0;94", "wc_reply_authors": "290;1599;92;195", "reply_reviewers": "0;2;0;1", "reply_authors": "1;3;1;2", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 114.5, 36.78654645383282 ], "wc_summary_review_avg": [ 70.25, 31.78344694963087 ], "wc_main_review_avg": [ 392.25, 379.41229223629534 ], "wc_review_avg": [ 577.0, 402.62389397550663 ], "wc_reply_reviewers_avg": [ 60.0, 62.75348595894893 ], "wc_reply_authors_avg": [ 544.0, 613.116220630314 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2079533968187968682&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=YWNAX0caEjI", "email": "umontreal.ca;mila.quebec;hec.ca", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Montreal;Quebec AI Institute;HEC Montreal", "aff_unique_dep": ";AI Institute;HEC Business School", "aff_unique_url": "https://www.umontreal.ca;https://mila.quebec;https://www.hec.ca", "aff_unique_abbr": "UM;Mila;HEC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "A Johnson-Lindenstrauss Framework for Randomly Initialized CNNs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7014", "id": "YX0lrvdPQc", "poster": "", "openreview": "https://openreview.net/forum?id=YX0lrvdPQc", "slides": "https://iclr.cc/virtual/2022/poster/7014", "video": "https://iclr.cc/virtual/2022/poster/7014", "author_site": "Ido Nachum, Jan H\u0105z\u0142a, Michael
Gastpar, Anatoly Khina", "tldr": "", "abstract": "How does the geometric representation of a dataset change after the application of each randomly initialized layer of a neural network? The celebrated Johnson-Lindenstrauss lemma answers this question for linear fully-connected neural networks (FNNs), stating that the geometry is essentially preserved. For FNNs with the ReLU activation, the angle between two inputs contracts according to a known mapping. The question for non-linear convolutional neural networks (CNNs) becomes much more intricate. To answer this question, we introduce a geometric framework. For linear CNNs, we show that the Johnson-Lindenstrauss lemma continues to hold, namely, that the angle between two inputs is preserved. For CNNs with ReLU activation, on the other hand, the behavior is richer: The angle between the outputs contracts, where the level of contraction depends on the nature of the inputs. In particular, after one layer, the geometry of natural images is essentially preserved, whereas for Gaussian correlated inputs, CNNs exhibit the same contracting behavior as FNNs with ReLU activation. ", "keywords": "convolutional neural networks;Johnson-Lindenstrauss lemma;initialization;isometry;theory.", "primary_area": "", "supplementary_material": "", "author": "Ido Nachum;Jan Hazla;Michael Gastpar;Anatoly Khina", "authorids": "~Ido_Nachum1;jan.hazla@epfl.ch;~Michael_Gastpar1;~Anatoly_Khina1", "gender": "M;;;M", "homepage": "https://idonachum.wordpress.com;;https://people.epfl.ch/michael.gastpar;http://www.eng.tau.ac.il/~anatolyk/", "dblp": ";;;", "google_scholar": "jpY2NNcAAAAJ;;https://scholar.google.ch/citations?user=IQ3hcw4AAAAJ;RhnwFLAAAAAJ", "orcid": ";;0000-0002-5499-5336;0000-0003-2359-1678", "linkedin": ";;;anatoly-khina/", "or_profile": "~Ido_Nachum1;jan.hazla@epfl.ch;~Michael_Gastpar1;~Anatoly_Khina1", "aff": "Swiss Federal Institute of Technology Lausanne;;School of Computer and Communication Sciences, EPFL - EPF Lausanne;Tel Aviv University", "aff_domain": "epfl.ch;;ic.epfl.ch;tau.ac.il", "position": "Postdoc;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nnachum2022a,\ntitle={A Johnson-Lindenstrauss Framework for Randomly Initialized {CNN}s},\nauthor={Ido Nachum and Jan Hazla and Michael Gastpar and Anatoly Khina},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YX0lrvdPQc}\n}", "github": "", "project": "", "reviewers": "DLps;g6qL;Mtqu", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "2;4;4", "correctness": "4;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "0;3;3", "wc_summary_paper": "62;103;170", "wc_summary_review": "47;12;76", "wc_main_review": "121;175;916", "wc_review": "230;290;1162", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "75;422;809", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 111.66666666666667, 44.51466674654047 ], "wc_summary_review_avg": [ 45.0, 26.166135875720485 ], "wc_main_review_avg": [ 404.0, 362.7092499509766 ], "wc_review_avg": [ 560.6666666666666, 425.91183229500547 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 435.3333333333333, 299.8025276003449 ],
"reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15814639999884665060&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=YX0lrvdPQc", "email": "epfl.ch;;ic.epfl.ch;tau.ac.il", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;EPFL;Tel Aviv University", "aff_unique_dep": ";School of Computer and Communication Sciences;", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.tau.ac.il", "aff_unique_abbr": "EPFL;EPFL;TAU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;Israel" }, { "id": "YYHXJOawkPb", "title": "The Evolution of Out-of-Distribution Robustness Throughout Fine-Tuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although machine learning models typically experience a drop in performance on out-of-distribution data, accuracies on in- versus out-of-distribution data are widely observed to follow a single linear trend when evaluated across a testbed of models. Models that are more accurate on the out-of-distribution data relative to this baseline exhibit \u201ceffective robustness\u201d and are exceedingly rare. Identifying such models, and understanding their properties, is key to improving out-of-distribution performance. We conduct a thorough empirical investigation of effective robustness during fine-tuning and surprisingly find that models pre-trained on larger datasets exhibit effective robustness during training that vanishes at convergence. We study how properties of the data influence effective robustness, and we show that it increases with the larger size, more diversity, and higher example difficulty of the dataset. We also find that models that display effective robustness are able to correctly classify 10% of the examples that no other current testbed model gets correct. 
Finally, we discuss several strategies for scaling effective robustness to the high-accuracy regime to improve the out-of-distribution accuracy of state-of-the-art models.", "keywords": "out-of-distribution;generalization;robustness", "primary_area": "", "supplementary_material": "/attachment/5917ebb2c5b7a8dfaaeaa1c71f7c90b0438bff5f.zip", "author": "Anders Johan Andreassen;Yasaman Bahri;Behnam Neyshabur;Rebecca Roelofs", "authorids": "~Anders_Johan_Andreassen1;~Yasaman_Bahri1;~Behnam_Neyshabur1;~Rebecca_Roelofs1", "gender": "M;F;M;F", "homepage": ";https://yasamanb.github.io/;https://www.neyshabur.net;", "dblp": ";;131/9898;145/2224", "google_scholar": ";p2_vHmAAAAAJ;e1ucbCYAAAAJ;", "orcid": "0000-0003-3504-3919;;;", "linkedin": ";yasamanbahri;;", "or_profile": "~Anders_Johan_Andreassen1;~Yasaman_Bahri1;~Behnam_Neyshabur1;~Rebecca_Roelofs1", "aff": "Google;Google Brain;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com", "position": "Research Scientist;Research Scientist;Research Scientist;Research scientist", "bibtex": "@misc{\nandreassen2022the,\ntitle={The Evolution of Out-of-Distribution Robustness Throughout Fine-Tuning},\nauthor={Anders Johan Andreassen and Yasaman Bahri and Behnam Neyshabur and Rebecca Roelofs},\nyear={2022},\nurl={https://openreview.net/forum?id=YYHXJOawkPb}\n}", "github": "", "project": "", "reviewers": "XPbn;cHRW;kcNT;AB8m", "site": "https://openreview.net/forum?id=YYHXJOawkPb", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "4;4;3;3", "correctness": "2;3;4;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "126;42;46;92", "wc_summary_review": "124;46;12;71", "wc_main_review": "552;454;346;194", "wc_review": "802;542;404;357", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 76.5, 34.68068626771967 ], "wc_summary_review_avg": [ 63.25, 40.84957160118084 ], "wc_main_review_avg": [ 386.5, 132.89375455603624 ], "wc_review_avg": [ 526.25, 173.11610988004554 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14349606461730528071&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "YYULSFvKru9", "title": "StARformer: Transformer with State-Action-Reward Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Reinforcement Learning (RL) can be considered a sequence modeling task, i.e., given a sequence of past state-action-reward experiences, a model autoregressively predicts a sequence of future actions. Recently, Transformers have been successfully adopted to model this problem.
In this work, we propose the State-Action-Reward Transformer (StARformer), which explicitly models strongly related local causal relations to help improve action prediction in long sequences. StARformer first extracts local representations (i.e., StAR-representations) from each group of state-action-reward tokens within a very short time span. A sequence of such local representations, combined with state representations, is then used to make action predictions over a long time span. Our experiments show that StARformer outperforms the state-of-the-art Transformer-based method on Atari (image) and Gym (state vector) benchmarks, in both offline-RL and imitation learning settings. StARformer also scales better to longer input sequences than the baseline. The code will be released online.", "keywords": "Representation Learning;Reinforcement Learning;Transformer", "primary_area": "", "supplementary_material": "", "author": "Jinghuan Shang;Michael S Ryoo", "authorids": "~Jinghuan_Shang1;~Michael_S_Ryoo1", "gender": "M;M", "homepage": "https://www.cs.stonybrook.edu/~jishang;http://michaelryoo.com/", "dblp": "218/7364;r/MichaelSRyoo", "google_scholar": "gMvLIDUAAAAJ;vcw0TJIAAAAJ", "orcid": "0000-0001-7301-5981;", "linkedin": ";", "or_profile": "~Jinghuan_Shang1;~Michael_S_Ryoo1", "aff": "Department of Computer Science, State University of New York, Stony Brook;Google DeepMind", "aff_domain": "cs.stonybrook.edu;google.com", "position": "PhD student;Research Scientist", "bibtex": "@misc{\nshang2022starformer,\ntitle={St{AR}former: Transformer with State-Action-Reward Representations},\nauthor={Jinghuan Shang and Michael S Ryoo},\nyear={2022},\nurl={https://openreview.net/forum?id=YYULSFvKru9}\n}", "github": "", "project": "", "reviewers": "a4zG;Crmx;XFEb;z9Aw", "site": "https://openreview.net/forum?id=YYULSFvKru9", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "47;119;112;111", "wc_summary_review": "76;38;33;85", "wc_main_review": "1154;557;532;512", "wc_review": "1277;714;677;708", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 97.25, 29.17511782324109 ], "wc_summary_review_avg": [ 58.0, 22.792542640082964 ], "wc_main_review_avg": [ 688.75, 269.0849076035295 ], "wc_review_avg": [ 844.0, 250.38670092478952 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1", "aff_unique_norm": "State University of New York;Google", "aff_unique_dep": "Department of Computer Science;Google DeepMind", "aff_unique_url": "https://www.stonybrook.edu;https://deepmind.com", "aff_unique_abbr": "SUNY Stony Brook;DeepMind", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Generative Planning for Temporally Coordinated
Exploration in Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6673", "id": "YZHES8wIdE", "poster": "", "openreview": "https://openreview.net/forum?id=YZHES8wIdE", "slides": "https://iclr.cc/virtual/2022/poster/6673", "video": "https://iclr.cc/virtual/2022/poster/6673", "author_site": "Haichao Zhang, Wei Xu, Haonan Yu", "tldr": "", "abstract": "Standard model-free reinforcement learning algorithms optimize a policy that generates the action to be taken in the current time step in order to maximize expected future return. While flexible, it faces difficulties arising from the inefficient exploration due to its single step nature. In this work, we present Generative Planning method (GPM), which can generate actions not only for the current step, but also for a number of future steps (thus termed as generative planning). This brings several benefits to GPM. Firstly, since GPM is trained by maximizing value, the plans generated from it can be regarded as intentional action sequences for reaching high value regions. GPM can therefore leverage its generated multi-step plans for temporally coordinated exploration towards high value regions, which is potentially more effective than a sequence of actions generated by perturbing each action at single step level, whose consistent movement decays exponentially with the number of exploration steps. Secondly, starting from a crude initial plan generator, GPM can refine it to be adaptive to the task, which, in return, benefits future explorations. This is potentially more effective than commonly used action-repeat strategy, which is non-adaptive in its form of plans. Additionally, since the multi-step plan can be interpreted as the intent of the agent from now to a span of time period into the future, it offers a more informative and intuitive signal for interpretation. 
Experiments are conducted on several benchmark environments and the results demonstrate its effectiveness compared with several baseline methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haichao Zhang;Wei Xu;Haonan Yu", "authorids": "~Haichao_Zhang4;~Wei_Xu13;~Haonan_Yu5", "gender": "M;;M", "homepage": ";;https://sites.google.com/site/hczhang1/", "dblp": ";;", "google_scholar": "Gxz1fqwAAAAJ;Army5cEAAAAJ;_OsT-RgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Wei_Xu13;~Haonan_Yu5;~Haichao_Zhang2", "aff": "Horizon Robotics;Horizon Robotics;Horizon Robotics", "aff_domain": "horizon.auto;horizon.auto;horizon.ai", "position": "Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nzhang2022generative,\ntitle={Generative Planning for Temporally Coordinated Exploration in Reinforcement Learning},\nauthor={Haichao Zhang and Wei Xu and Haonan Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YZHES8wIdE}\n}", "github": "", "project": "", "reviewers": "ZK8i;356g;3VLM;nNjd", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;3", "correctness": "3;4;4;3", "technical_novelty": "2;4;4;3", "empirical_novelty": "2;3;4;2", "wc_summary_paper": "206;90;111;93", "wc_summary_review": "104;27;54;31", "wc_main_review": "771;239;464;319", "wc_review": "1081;356;629;443", "wc_reply_reviewers": "45;17;11;31", "wc_reply_authors": "1324;271;954;571", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 125.0, 47.44997365647319 ], "wc_summary_review_avg": [ 54.0, 30.651264247988205 ], "wc_main_review_avg": [ 448.25, 203.0435605972275 ], "wc_review_avg": [ 627.25, 279.91818000980214 ], "wc_reply_reviewers_avg": [ 26.0, 13.152946437965905 ], "wc_reply_authors_avg": [ 780.0, 396.5394053558864 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14730527943022398215&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=YZHES8wIdE", "email": "horizon.auto;horizon.auto;horizon.ai", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Horizon Robotics", "aff_unique_dep": "", "aff_unique_url": "https://www.horizon-robotics.com/", "aff_unique_abbr": "Horizon Robotics", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "Ybx635VOYoM", "title": "ContraQA: Question Answering under Contradicting Contexts", "track": "main", "status": "Reject", "tldr": "", "abstract": "With a rise in false, inaccurate, and misleading information in propaganda, news, and social media, real-world Question Answering (QA) systems face the challenge of synthesizing and reasoning over contradicting information to derive correct answers. This urgency gives rise to the need to make QA systems robust to misinformation, a topic previously unexplored.
We study the risk of misinformation to QA models by investigating the behavior of the QA model under contradicting contexts that are mixed with both real and fake information. We create the first large-scale dataset for this problem, namely ContraQA, which contains over 10K human-written and model-generated contradicting pairs of contexts. Experiments show that QA models are vulnerable to contradicting contexts introduced by misinformation. To defend against such a threat, we build a misinformation-aware QA system as a counter-measure that integrates question answering and misinformation detection in a joint fashion. ", "keywords": "Question Answering;Misinformation Detection;Robustness;Text Generation;Contradicting Contexts", "primary_area": "", "supplementary_material": "", "author": "Liangming Pan;Wenhu Chen;Min-Yen Kan;William Yang Wang", "authorids": "~Liangming_Pan1;~Wenhu_Chen3;~Min-Yen_Kan1;~William_Yang_Wang2", "gender": "M;M;M;M", "homepage": "https://liangmingpan.bio;https://www.comp.nus.edu.sg/~kanmy/;https://wenhuchen.github.io/;https://www.cs.ucsb.edu/~william/", "dblp": "186/9707;k/MinYenKan;136/0957.html;08/9282", "google_scholar": "JcjjOTUAAAAJ;https://scholar.google.com.tw/citations?user=aNVcd3EAAAAJ;https://scholar.google.co.jp/citations?user=U8ShbhUAAAAJ;gf8Ms_8AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Liangming_Pan1;~Min-Yen_Kan1;~wenhu_chen1;~William_Wang1", "aff": "National University of Singapore;National University of Singapore;Google;UC Santa Barbara", "aff_domain": "u.nus.edu;nus.edu.sg;google.com;ucsb.edu", "position": "PhD student;Associate Professor;Researcher;Full Professor", "bibtex": "@misc{\npan2022contraqa,\ntitle={Contra{QA}: Question Answering under Contradicting Contexts},\nauthor={Liangming Pan and Wenhu Chen and Min-Yen Kan and William Yang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=Ybx635VOYoM}\n}", "github": "", "project": "", "reviewers": "4eU9;2VLR;g2wC;nPFz", "site": "https://openreview.net/forum?id=Ybx635VOYoM", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;3", "correctness": "2;3;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "298;226;208;94", "wc_summary_review": "66;43;57;43", "wc_main_review": "320;541;307;335", "wc_review": "684;810;572;472", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 206.5, 73.1624903895432 ], "wc_summary_review_avg": [ 52.25, 9.781998773256925 ], "wc_main_review_avg": [ 375.75, 95.92021424079493 ], "wc_review_avg": [ 634.5, 126.05851815724314 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5045108225804908171&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "National University of Singapore;Google;University of California, Santa Barbara", "aff_unique_dep": ";Google;", "aff_unique_url":
"https://www.nus.edu.sg;https://www.google.com;https://www.ucsb.edu", "aff_unique_abbr": "NUS;Google;UCSB", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Santa Barbara", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "Singapore;United States" }, { "id": "Yc64t25hseP", "title": "GUIDED MCMC FOR SPARSE BAYESIAN MODELS TO DETECT RARE EVENTS IN IMAGES SANS LABELED DATA", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Detection of rare events in images is a challenging task because of two main problems, the first problem is the lack of labeled data for rare category class and the second problem is a highly imbalanced data problem. Training models in this scenario becomes hard. Unsupervised methods do not apply as we need to detect rare events automatically. Rule-based methods seem to be the only viable solution, but it is tedious to come up with a set of rules covering all corner cases. Even the recently popular zero-shot learning techniques required to be pre-trained on auxiliary datasets. In the given scenario, we propose an approach to provide little guidance from experts as an input into a hierarchical Bayesian model. The guidance influences the Markov chain Monte Carlo (MCMC) based inference technique of the model. After the steady-state is obtained for the underlying Markov chain, it is possible to compute the posterior probability of the presence of the rare event in a given image. The proposed method neither needs any labeled data nor required pre-training, unlike zero-shot learning. The proposed technique has been observed to outperform the state-of-the-art unsupervised image classification techniques.", "keywords": "MCMC;hierarchical Bayesian models;image classification;rare events", "primary_area": "", "supplementary_material": "", "author": "Gaurav Jain;Mrinal Das", "authorids": "~Gaurav_Jain4;~Mrinal_Das1", "gender": ";M", "homepage": "https://gauravjain10.github.io/resume/;http://nmrinl.github.io", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "gaurav-jain-b12848aa/;", "or_profile": "~Gaurav_Jain4;~Mrinal_Kanti_Das1", "aff": ";Indian Institute of Technology Palakkad", "aff_domain": ";iitpkd.ac.in", "position": ";Assistant Professor", "bibtex": "@misc{\njain2022guided,\ntitle={{GUIDED} {MCMC} {FOR} {SPARSE} {BAYESIAN} {MODELS} {TO} {DETECT} {RARE} {EVENTS} {IN} {IMAGES} {SANS} {LABELED} {DATA}},\nauthor={Gaurav Jain and Mrinal Das},\nyear={2022},\nurl={https://openreview.net/forum?id=Yc64t25hseP}\n}", "github": "", "project": "", "reviewers": "vmfV;WoCJ;Jxqk;dVLQ", "site": "https://openreview.net/forum?id=Yc64t25hseP", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;3;1;4", "correctness": "2;2;4;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "61;41;59;80", "wc_summary_review": "49;19;24;79", "wc_main_review": "299;253;96;189", "wc_review": "409;313;179;348", "wc_reply_reviewers": "0;0;0;24", "wc_reply_authors": "802;0;177;516", "reply_reviewers": "0;0;0;1", "reply_authors": "1;0;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.25, 1.479019945774904 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 60.25, 13.808964479641476 ], "wc_summary_review_avg": [ 42.75, 23.81569860407206 ], "wc_main_review_avg": [ 209.25, 76.16552697907368 ], "wc_review_avg": [ 312.25, 84.25370911716587 ], "wc_reply_reviewers_avg": [ 6.0, 
10.392304845413264 ], "wc_reply_authors_avg": [ 373.75, 309.04398958724306 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.50709255283711, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:S4xhQCd49d0J:scholar.google.com/&scioq=GUIDED+MCMC+FOR+SPARSE+BAYESIAN+MODELS+TO+DETECT+RARE+EVENTS+IN+IMAGES+SANS+LABELED+DATA&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Indian Institute of Technology Palakkad", "aff_unique_dep": "", "aff_unique_url": "https://www.iitpkd.ac.in", "aff_unique_abbr": "IIT Palakkad", "aff_campus_unique_index": "0", "aff_campus_unique": "Palakkad", "aff_country_unique_index": "0", "aff_country_unique": "India" }, { "title": "On the Convergence of Certified Robust Training with Interval Bound Propagation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7086", "id": "YeShU5mLfLt", "poster": "", "openreview": "https://openreview.net/forum?id=YeShU5mLfLt", "slides": "https://iclr.cc/virtual/2022/poster/7086", "video": "https://iclr.cc/virtual/2022/poster/7086", "author_site": "Yihan Wang, Zhouxing Shi, Quanquan Gu, Cho-Jui Hsieh", "tldr": "", "abstract": "Interval Bound Propagation (IBP) is so far the base of state-of-the-art methods for training neural networks with certifiable robustness guarantees when potential adversarial perturbations are present, yet the convergence of IBP training remains unknown in the existing literature. In this paper, we present a theoretical analysis of the convergence of IBP training under an overparameterization assumption.
We show that when using IBP training to train a randomly initialized two-layer ReLU neural network with logistic loss, gradient descent can linearly converge to zero robust training error with high probability, given a sufficiently small perturbation radius and a sufficiently large network width.", "keywords": "Certified robustness;Adversarial robustness;Convergence", "primary_area": "", "supplementary_material": "", "author": "Yihan Wang;Zhouxing Shi;Quanquan Gu;Cho-Jui Hsieh", "authorids": "~Yihan_Wang2;~Zhouxing_Shi1;~Quanquan_Gu1;~Cho-Jui_Hsieh1", "gender": "F;;M;M", "homepage": "https://yihanwang617.github.io;https://shizhouxing.github.io;http://web.cs.ucla.edu/~qgu/;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": ";232/2169;50/4597;14/2770", "google_scholar": ";YFIr4PwAAAAJ;GU9HgNAAAAAJ;Wy89g4IAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yihan_Wang2;~Zhouxing_Shi1;~Quanquan_Gu1;~Cho-Jui_Hsieh1", "aff": "University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu", "position": "MS student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2022on,\ntitle={On the Convergence of Certified Robust Training with Interval Bound Propagation},\nauthor={Yihan Wang and Zhouxing Shi and Quanquan Gu and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YeShU5mLfLt}\n}", "github": "", "project": "", "reviewers": "n3Ki;Ysed;YCZg", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;3;2", "correctness": "3;3;4", "technical_novelty": "2;4;4", "empirical_novelty": "1;0;0", "wc_summary_paper": "83;33;30", "wc_summary_review": "101;33;56", "wc_main_review": "949;377;271", "wc_review": "1133;443;357", "wc_reply_reviewers": "617;0;68", "wc_reply_authors": "2080;590;152", "reply_reviewers": "1;0;1", "reply_authors": "5;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 0.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 48.666666666666664, 24.30820620467271 ], "wc_summary_review_avg": [ 63.333333333333336, 28.241026106633512 ], "wc_main_review_avg": [ 532.3333333333334, 297.78888868159976 ], "wc_review_avg": [ 644.3333333333334, 347.3186177304957 ], "wc_reply_reviewers_avg": [ 228.33333333333334, 276.2273620689224 ], "wc_reply_authors_avg": [ 940.6666666666666, 825.2358585411944 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9819805060619659, "corr_recommendation_correctness": 0.944911182523068, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4306297357131381375&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=YeShU5mLfLt", "email": "ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles",
"aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "YedA6OCN6X", "title": "Evaluating generative networks using Gaussian mixtures of image features", "track": "main", "status": "Reject", "tldr": "", "abstract": "We develop a measure for evaluating the performance of generative networks given two sets of images. A popular performance measure currently used to do this is the Fr\u00e9chet Inception Distance (FID). However, FID assumes that images featurized using the penultimate layer of Inception-v3 follow a Gaussian distribution. This assumption allows FID to be easily computed, since FID uses the 2-Wasserstein distance of two Gaussian distributions fitted to the featurized images. However, we show that Inception-v3 features of the ImageNet dataset are not Gaussian; in particular, each marginal is not Gaussian. To remedy this problem, we model the featurized images using Gaussian mixture models (GMMs) and compute the $2$-Wasserstein distance restricted to GMMs. We define a performance measure, which we call WaM, on two sets of images by using Inception-v3 (or another classifier) to featurize the images, estimate two GMMs, and use the restricted 2-Wasserstein distance to compare the GMMs. We experimentally show the advantages of WaM over FID, including how FID is more sensitive than WaM to image perturbations. By modelling the non-Gaussian features obtained from Inception-v3 as GMMs and using a GMM metric, we can more accurately evaluate generative network performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lorenzo Luzi;Carlos Ortiz Marrero;Nile N Wynar;Richard Baraniuk;Michael J. Henry", "authorids": "~Lorenzo_Luzi1;~Carlos_Ortiz_Marrero1;~Nile_N_Wynar1;~Richard_Baraniuk1;~Michael_J._Henry1", "gender": "M;;F;;M", "homepage": ";https://cmortiz.github.io/;;http://richb.rice.edu/;", "dblp": ";;;32/2804;", "google_scholar": "https://scholar.google.com/citations?hl=en;CCElNRAAAAAJ;;https://scholar.google.com.tw/citations?user=N-BBA20AAAAJ;IZmLYDgAAAAJ", "orcid": ";;;;", "linkedin": ";carlos-m-ortiz-marrero-0a7784b9;nilewynar;richard-baraniuk;cv-mjh/", "or_profile": "~Lorenzo_Luzi1;~Carlos_Ortiz_Marrero1;~Nile_N_Wynar1;~Richard_Baraniuk1;~Michael_J._Henry1", "aff": "Rice University;Pacific Northwest National Laboratory;Pacific Northwest National Laboratory;William Marsh Rice University;Pacific Northwest National Laboratory", "aff_domain": "rice.edu;pnnl.gov;pnnl.gov;rice.edu;pnnl.gov", "position": "PhD student;Researcher;Data Scientist;C. Sidney Burrus Professor;Researcher", "bibtex": "@misc{\nluzi2022evaluating,\ntitle={Evaluating generative networks using Gaussian mixtures of image features},\nauthor={Lorenzo Luzi and Carlos Ortiz Marrero and Nile N Wynar and Richard Baraniuk and Michael J. 
Henry},\nyear={2022},\nurl={https://openreview.net/forum?id=YedA6OCN6X}\n}", "github": "", "project": "", "reviewers": "3jkq;KkXg;Dqh4;xevY", "site": "https://openreview.net/forum?id=YedA6OCN6X", "pdf_size": 0, "recommendation": "1;3;5;8", "confidence": "5;4;3;4", "correctness": "3;2;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "74;95;52;49", "wc_summary_review": "4;56;27;52", "wc_main_review": "374;807;316;512", "wc_review": "452;958;395;613", "wc_reply_reviewers": "304;0;0;129", "wc_reply_authors": "641;727;534;627", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.25, 2.5860201081971503 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 67.5, 18.580904176062045 ], "wc_summary_review_avg": [ 34.75, 20.94486810653149 ], "wc_main_review_avg": [ 502.25, 189.8056571865022 ], "wc_review_avg": [ 604.5, 219.19226719936998 ], "wc_reply_reviewers_avg": [ 108.25, 124.6843514640069 ], "wc_reply_authors_avg": [ 632.25, 68.43747146118126 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5468687416197306, "corr_recommendation_correctness": 0.6835859270246631, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2885722444850622951&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Rice University;Pacific Northwest National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.rice.edu;https://www.pnnl.gov", "aff_unique_abbr": "Rice;PNNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding Dimensional Collapse in Contrastive Self-supervised Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6792", "id": "YevsQ05DEN7", "poster": "", "openreview": "https://openreview.net/forum?id=YevsQ05DEN7", "slides": "https://iclr.cc/virtual/2022/poster/6792", "video": "https://iclr.cc/virtual/2022/poster/6792", "author_site": "Li Jing, Pascal Vincent, Yann LeCun, Yuandong Tian", "tldr": "", "abstract": "Self-supervised visual representation learning aims to learn useful representations without relying on human annotations. The joint embedding approach is based on maximizing the agreement between embedding vectors from different views of the same image. Various methods have been proposed to solve the collapsing problem where all embedding vectors collapse to a trivial constant solution. Among these methods, contrastive learning prevents collapse via negative sample pairs. It has been shown that non-contrastive methods suffer from a lesser collapse problem of a different nature: dimensional collapse, whereby the embedding vectors end up spanning a lower-dimensional subspace instead of the entire available embedding space. Here, we show that dimensional collapse also happens in contrastive learning. In this paper, we shed light on the dynamics at play in contrastive learning that leads to dimensional collapse. Inspired by our theory, we propose a novel contrastive learning method, called DirectCLR, which directly optimizes the representation space without relying on a trainable projector.
Experiments show that DirectCLR outperforms SimCLR with a trainable linear projector on ImageNet. ", "keywords": "self-supervised learning;contrastive learning;implicit regularization;dimensional collapse", "primary_area": "", "supplementary_material": "/attachment/c38a47fdd124927be812b8c10aabaf900a38363b.zip", "author": "Li Jing;Pascal Vincent;Yann LeCun;Yuandong Tian", "authorids": "~Li_Jing1;~Pascal_Vincent1;~Yann_LeCun1;~Yuandong_Tian1", "gender": "M;M;M;M", "homepage": "http://jingli.io/;http://www.iro.umontreal.ca/~vincentp;http://yann.lecun.com;http://yuandong-tian.com", "dblp": "59/6222;43/861;l/YannLeCun;t/YuandongTian", "google_scholar": "VhxDLwcAAAAJ;WBCKQMsAAAAJ;WLN3QrAAAAAJ;0mgEF28AAAAJ", "orcid": ";;;0000-0003-4202-4847", "linkedin": "li-jing-568b3765/;;;yuandongtian", "or_profile": "~Li_Jing1;~Pascal_Vincent1;~Yann_LeCun1;~Yuandong_Tian1", "aff": "Facebook AI Research;Facebook A.I. Research;New York University;Meta AI (FAIR)", "aff_domain": "fb.com;fb.com;nyu.edu;meta.com", "position": "Postdoc;Research Scientist;Full Professor;Research Scientist", "bibtex": "@inproceedings{\njing2022understanding,\ntitle={Understanding Dimensional Collapse in Contrastive Self-supervised Learning},\nauthor={Li Jing and Pascal Vincent and Yann LeCun and Yuandong Tian},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YevsQ05DEN7}\n}", "github": "", "project": "", "reviewers": "z9oW;9VqB;fU6b;HfRQ;t7MP", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "4;4;3;4;3", "correctness": "3;3;2;3;4", "technical_novelty": "2;3;3;3;4", "empirical_novelty": "3;2;2;3;4", "wc_summary_paper": "44;37;66;66;92", "wc_summary_review": "31;27;51;2;159", "wc_main_review": "195;204;279;176;583", "wc_review": "270;268;396;244;834", "wc_reply_reviewers": "0;0;251;0;124", "wc_reply_authors": "987;516;1491;401;783", "reply_reviewers": "0;0;2;0;1", "reply_authors": "3;1;4;1;2", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 61.0, 19.3700800204852 ], "wc_summary_review_avg": [ 54.0, 54.76495229615379 ], "wc_main_review_avg": [ 287.4, 151.89022351685443 ], "wc_review_avg": [ 402.4, 222.2625474523317 ], "wc_reply_reviewers_avg": [ 75.0, 100.25168327763879 ], "wc_reply_authors_avg": [ 835.6, 386.3312568250206 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 2.2, 1.16619037896906 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6666666666666667, "corr_recommendation_correctness": 0.6454972243679028, "gs_citation": 452, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15289790182345311933&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=YevsQ05DEN7", "email": "fb.com;fb.com;nyu.edu;meta.com", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Meta;New York University", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.nyu.edu", "aff_unique_abbr": "FAIR;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "YfFWrndRGQx", "title": "Multi-Objective Online Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": 
"This paper presents a systematic study of multi-objective online learning. We first formulate the framework of Multi-Objective Online Convex Optimization, which encompasses a novel multi-objective dynamic regret in the unconstrained max-min form. We show that it is equivalent to the regret commonly used in the zero-order multi-objective bandit setting and overcomes the problem that the latter is hard to optimize via first-order gradient-based methods. Then we propose the Online Mirror Multiple Descent algorithm with two variants, which computes the composite gradient using either the vanilla min-norm solver or a newly designed $L_1$-regularized min-norm solver. We further derive regret bounds of both variants and show that the $L_1$-regularized variant enjoys a lower bound. Extensive experiments demonstrate the effectiveness of the proposed algorithm and verify the theoretical advantage of the $L_1$-regularized variant.", "keywords": "online algorithm;online learning;multi-objective optimization", "primary_area": "", "supplementary_material": "/attachment/b583a5c1ae427759574f73d83a960c0277631e4b.zip", "author": "Jiyan Jiang;Wenpeng Zhang;Shiji Zhou;Lihong Gu;Xiaodong Zeng;Wenwu Zhu", "authorids": "~Jiyan_Jiang1;~Wenpeng_Zhang1;~Shiji_Zhou1;~Lihong_Gu1;~Xiaodong_Zeng2;~Wenwu_Zhu1", "gender": "M;M;M;M;M;M", "homepage": ";;https://arnoldshijizhou.github.io;;;http://media.cs.tsinghua.edu.cn/en/zww", "dblp": ";203/4474.html;294/8684;128/4619;;97/6308-1.html", "google_scholar": ";EMMkuFMAAAAJ;Do5jf8oAAAAJ;;RaSueKIAAAAJ;https://scholar.google.com.tw/citations?user=7t2jzpgAAAAJ", "orcid": "0000-0002-1083-2834;;0009-0000-0677-7396;0000-0002-0706-3448;;0000-0003-2236-9290", "linkedin": "JiyanJiang/;;shiji-zhou-05b766ba/;lihong-gu-026172a6/;;", "or_profile": "~Jiyan_Jiang1;~Wenpeng_Zhang1;~Shiji_Zhou1;~Lihong_Gu1;~Xiaodong_Zeng2;~Wenwu_Zhu1", "aff": "Tsinghua University;Ant Group;Tsinghua University;;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;ant.com;mails.tsinghua.edu.cn;;;tsinghua.edu.cn", "position": "PhD student;Researcher;PhD student;;;Full Professor", "bibtex": "@misc{\njiang2022multiobjective,\ntitle={Multi-Objective Online Learning},\nauthor={Jiyan Jiang and Wenpeng Zhang and Shiji Zhou and Lihong Gu and Xiaodong Zeng and Wenwu Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=YfFWrndRGQx}\n}", "github": "", "project": "", "reviewers": "Qnvq;3rTx;9N6u;ieZJ", "site": "https://openreview.net/forum?id=YfFWrndRGQx", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;3;4", "correctness": "4;3;4;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;0;3;4", "wc_summary_paper": "102;200;181;54", "wc_summary_review": "60;18;135;36", "wc_main_review": "422;145;330;314", "wc_review": "584;363;646;404", "wc_reply_reviewers": "0;0;75;0", "wc_reply_authors": "748;726;707;301", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 134.25, 59.137023090446476 ], "wc_summary_review_avg": [ 62.25, 44.56666355023674 ], "wc_main_review_avg": [ 302.75, 99.96843251747023 ], "wc_review_avg": [ 499.25, 118.6957770942168 ], "wc_reply_reviewers_avg": [ 18.75, 32.47595264191645 ], "wc_reply_authors_avg": [ 620.5, 185.0331051460792 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], 
"authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13767632467884928189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Tsinghua University;Ant Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "THU;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Towards Building A Group-based Unsupervised Representation Disentanglement Framework", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5931", "id": "YgPqNctmyd", "poster": "", "openreview": "https://openreview.net/forum?id=YgPqNctmyd", "slides": "https://iclr.cc/virtual/2022/poster/5931", "video": "https://iclr.cc/virtual/2022/poster/5931", "author_site": "Tao Yang, Xuanchi Ren, Yuwang Wang, Wenjun Zeng, Nanning Zheng", "tldr": "", "abstract": "Disentangled representation learning is one of the major goals of deep learning, and is a key step for achieving explainable and generalizable models. The key idea of the state-of-the-art VAE-based unsupervised representation disentanglement methods is to minimize the total correlation of the joint distribution of the latent variables. However, it has been proved that their goal can not be achieved without introducing other inductive biases. The Group Theory based definition of representation disentanglement mathematically connects the data transformations to the representations using the formalism of group. In this paper, built on the group-based definition and inspired by the \\emph{n-th dihedral group}, we first propose a theoretical framework towards achieving unsupervised representation disentanglement. We then propose a model based on existing VAE-based methods to tackle the unsupervised learning problem of the framework. In the theoretical framework, we prove three sufficient conditions on model, group structure, and data respectively in an effort to achieve, in an unsupervised way, disentangled representation per group-based definition. With these conditions, we offer an option, from the perspective of the group-based definition, for the inductive bias that existing VAE-based models lack. Experimentally, we train 1800 models covering the most prominent VAE-based methods on five datasets to verify the effectiveness of our theoretical framework. 
Compared to the original VAE-based methods, these Groupified VAEs consistently achieve better mean performance with smaller variances.", "keywords": "Disentangled representation learning;Group theory;VAE", "primary_area": "", "supplementary_material": "", "author": "Tao Yang;Xuanchi Ren;Yuwang Wang;Wenjun Zeng;Nanning Zheng", "authorids": "~Tao_Yang9;~Xuanchi_Ren1;~Yuwang_Wang3;~Wenjun_Zeng3;~Nanning_Zheng1", "gender": "M;M;M;M;M", "homepage": "https://github.com/ThomasMrY;https://xuanchiren.com/;;https://www.eias.ac.cn/h-col-187.html;", "dblp": ";255/5432;161/2633;57/145;07/256-1", "google_scholar": "https://scholar.google.com.hk/citations?user=qT5psCEAAAAJ;fDHUk18AAAAJ;;_cUfvYQAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Tao_Yang9;~Xuanchi_Ren1;~Yuwang_Wang3;~Wenjun_Zeng3;~Nanning_Zheng1", "aff": "Xi'an Jiaotong University;Hong Kong University of Science and Technology;Microsoft Research Asia;Eastern Institute for Advanced Study;Xi'an Jiaotong University", "aff_domain": "xjtu.edu.cn;hkust.edu;microsoft.com;eias.ac.cn;xjtu.edu.cn", "position": "PhD student;Undergrad student;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyang2022towards,\ntitle={Towards Building A Group-based Unsupervised Representation Disentanglement Framework},\nauthor={Tao Yang and Xuanchi Ren and Yuwang Wang and Wenjun Zeng and Nanning Zheng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YgPqNctmyd}\n}", "github": "", "project": "", "reviewers": "M8Aa;PA1y;YTd1;AKvy", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "2;4;4;4", "correctness": "2;4;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "125;52;47;77", "wc_summary_review": "59;38;85;62", "wc_main_review": "233;254;541;293", "wc_review": "417;344;673;432", "wc_reply_reviewers": "708;59;501;242", "wc_reply_authors": "1859;592;1934;394", "reply_reviewers": "3;1;2;1", "reply_authors": "7;3;6;3", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 75.25, 30.889925542156945 ], "wc_summary_review_avg": [ 61.0, 16.658331248957683 ], "wc_main_review_avg": [ 330.25, 123.56653066263534 ], "wc_review_avg": [ 466.5, 123.78307638768719 ], "wc_reply_reviewers_avg": [ 377.5, 247.12598001828945 ], "wc_reply_authors_avg": [ 1194.75, 705.7313139573729 ], "reply_reviewers_avg": [ 1.75, 0.82915619758885 ], "reply_authors_avg": [ 4.75, 1.7853571071357126 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8892972917998875, "corr_recommendation_correctness": 0.8866206949335731, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12379032527618028840&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=YgPqNctmyd", "email": "xjtu.edu.cn;hkust.edu;microsoft.com;eias.ac.cn;xjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Xi'an Jiao Tong University;Hong Kong University of Science and Technology;Microsoft;Eastern Institute for Advanced Study", "aff_unique_dep": ";;Research;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.ust.hk;https://www.microsoft.com/en-us/research/group/asia;", "aff_unique_abbr": 
"XJTU;HKUST;MSR Asia;", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Asia", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China;" }, { "id": "YgR1rRWETI", "title": "Connectivity Matters: Neural Network Pruning Through the Lens of Effective Sparsity", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network pruning is a fruitful area of research with surging interest in high sparsity regimes. Benchmarking in this domain heavily relies on faithful representation of the sparsity of subnetworks, which has been traditionally computed as the fraction of removed connections (direct sparsity). This definition, however, fails to recognize unpruned parameters that detached from input or output layers of underlying subnetworks, potentially underestimating actual effective sparsity: the fraction of inactivated connections. While this effect might be negligible for moderately pruned networks (up to $10\\times-100\\times$ compression rates), we find that it plays an increasing role for thinner subnetworks, greatly distorting comparison between different pruning algorithms. For example, we show that effective compression of a randomly pruned LeNet-300-100 can be orders of magnitude larger than its direct counterpart, while no discrepancy is ever observed when using SynFlow for pruning (Tanaka et al., 2020). In this work, we adopt the lens of effective sparsity to reevaluate several recent pruning algorithms on common benchmark architectures (e.g., LeNet-300-100, VGG-19, ResNet-18) and discover that their absolute and relative performance changes dramatically in this new, and as we argue, more appropriate framework. To aim for effective, rather than direct, sparsity, we develop a low-cost extension to most pruning algorithms. Further, equipped with effective sparsity as a reference frame, we partially reconfirm that random pruning with appropriate sparsity allocation across layers performs as well or better than more sophisticated algorithms for pruning at initialization (Su et al., 2020). 
In response to this observation, using a simple analogy of pressure distribution in coupled cylinders from thermodynamics, we design novel layerwise sparsity quotas that outperform all existing baselines in the context of random pruning.", "keywords": "Neural Networks;Pruning;Sparsity", "primary_area": "", "supplementary_material": "/attachment/f0f1b65abda768277a271c0f1956c293c579c5f9.zip", "author": "Artem M Vysogorets;Julia Kempe", "authorids": "~Artem_M_Vysogorets1;~Julia_Kempe1", "gender": "M;", "homepage": "https://artem.vysogorets.org;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "avysogorets/;", "or_profile": "~Artem_M_Vysogorets1;~Julia_Kempe1", "aff": "Bloomberg;", "aff_domain": "bloomberg.com;", "position": "Intern;", "bibtex": "@misc{\nvysogorets2022connectivity,\ntitle={Connectivity Matters: Neural Network Pruning Through the Lens of Effective Sparsity},\nauthor={Artem M Vysogorets and Julia Kempe},\nyear={2022},\nurl={https://openreview.net/forum?id=YgR1rRWETI}\n}", "github": "", "project": "", "reviewers": "Ne1A;xGEE;B6nv;B4nR", "site": "https://openreview.net/forum?id=YgR1rRWETI", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "1;2;1;3", "empirical_novelty": "2;2;1;3", "wc_summary_paper": "64;67;60;106", "wc_summary_review": "83;67;53;20", "wc_main_review": "381;262;795;408", "wc_review": "528;396;908;534", "wc_reply_reviewers": "456;107;0;122", "wc_reply_authors": "457;316;1162;339", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 74.25, 18.498310733685926 ], "wc_summary_review_avg": [ 55.75, 23.209642392764263 ], "wc_main_review_avg": [ 461.5, 200.2279950456479 ], "wc_review_avg": [ 591.5, 190.87364930759824 ], "wc_reply_reviewers_avg": [ 171.25, 170.99908625486862 ], "wc_reply_authors_avg": [ 568.5, 346.80722310817 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10514056194055911863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "Bloomberg", "aff_unique_dep": "", "aff_unique_url": "https://www.bloomberg.com", "aff_unique_abbr": "Bloomberg", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Permutation-Based SGD: Is Random Optimal?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6386", "id": "YiBa9HKTyXE", "poster": "", "openreview": "https://openreview.net/forum?id=YiBa9HKTyXE", "slides": "https://iclr.cc/virtual/2022/poster/6386", "video": "https://iclr.cc/virtual/2022/poster/6386", "author_site": "Shashank Rajput, Kangwook Lee, Dimitris Papailiopoulos", "tldr": "", "abstract": "A recent line of ground-breaking results for permutation-based SGD has corroborated a widely observed phenomenon: random permutations offer faster convergence than with-replacement sampling. However, is random optimal? 
We show that this depends heavily on what functions we are optimizing, and the convergence gap between optimal and random permutations can vary from exponential to nonexistent. We first show that for 1-dimensional strongly convex functions, with smooth second derivatives, there exist optimal permutations that offer exponentially faster convergence compared to random. However, for general strongly convex functions, random permutations are optimal. Finally, we show that for quadratic, strongly-convex functions, there are easy-to-construct permutations that lead to accelerated convergence compared to random. Our results suggest that a general convergence characterization of optimal permutations cannot capture the nuances of individual function classes, and can mistakenly indicate that one cannot do much better than random.", "keywords": "Convex Optimization;Stochastic Optimization;Large Scale Learning", "primary_area": "", "supplementary_material": "/attachment/ed23add48779386139f866e6ee6e8a15a7c4a141.zip", "author": "Shashank Rajput;Kangwook Lee;Dimitris Papailiopoulos", "authorids": "~Shashank_Rajput1;~Kangwook_Lee1;~Dimitris_Papailiopoulos1", "gender": "M;M;M", "homepage": "https://pages.cs.wisc.edu/~srajput/;http://kangwooklee.com/;http://papail.io", "dblp": "241/5361;88/9826-1;", "google_scholar": "qEXxyDQAAAAJ;sCEl8r-n5VEC;hYi6i9sAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Shashank_Rajput1;~Kangwook_Lee1;~Dimitris_Papailiopoulos1", "aff": "University of Wisconsin, Madison;KRAFTON;University of Wisconsin, Madison", "aff_domain": "wisc.edu;krafton.com;wisc.edu", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nrajput2022permutationbased,\ntitle={Permutation-Based {SGD}: Is Random Optimal?},\nauthor={Shashank Rajput and Kangwook Lee and Dimitris Papailiopoulos},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YiBa9HKTyXE}\n}", "github": "", "project": "", "reviewers": "53pE;jCFt;JbYX;D2Ry", "pdf_size": 0, "recommendation": "6;6;6;10", "confidence": "3;4;2;4", "correctness": "3;4;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "113;127;126;405", "wc_summary_review": "29;51;30;133", "wc_main_review": "170;260;123;737", "wc_review": "312;438;279;1275", "wc_reply_reviewers": "0;0;0;98", "wc_reply_authors": "430;771;565;815", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.0, 1.7320508075688772 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 192.75, 122.66697803402512 ], "wc_summary_review_avg": [ 60.75, 42.628482262449836 ], "wc_main_review_avg": [ 322.5, 244.3220211114831 ], "wc_review_avg": [ 576.0, 407.90623922661445 ], "wc_reply_reviewers_avg": [ 24.5, 42.4352447854375 ], "wc_reply_authors_avg": [ 645.25, 156.04546613086842 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9197780273484525148&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=YiBa9HKTyXE", "email": 
"wisc.edu;krafton.com;wisc.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Wisconsin;KRAFTON Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.krafton.com", "aff_unique_abbr": "UW;KRAFTON", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;South Korea" }, { "title": "Generalized Demographic Parity for Group Fairness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6839", "id": "YigKlMJwjye", "poster": "", "openreview": "https://openreview.net/forum?id=YigKlMJwjye", "slides": "https://iclr.cc/virtual/2022/poster/6839", "video": "https://iclr.cc/virtual/2022/poster/6839", "author_site": "Zhimeng Jiang, Xiaotian Han, Chao Fan, Fan Yang, Ali Mostafavi, Xia Hu", "tldr": "", "abstract": "This work aims to generalize demographic parity to continuous sensitive attributes while preserving tractable computation. Current fairness metrics for continuous sensitive attributes largely rely on intractable statistical independence between variables, such as Hirschfeld-Gebelein-Renyi (HGR) and mutual information. Statistical fairness metrics estimation relying on either tractable bounds or neural network approximation, however, are not sufficiently trustful to rank algorithms prediction bias due to lack of estimation accuracy guarantee. \nTo make fairness metrics trustable, we propose \\textit{\\underline{G}eneralized \\underline{D}emographic \\underline{P}arity} (GDP), a group fairness metric for continuous and discrete attributes. We show the understanding of GDP from the probability perspective and theoretically reveal the connection between GDP regularizer and adversarial debiasing. To estimate GDP, we adopt hard and soft group strategies via the one-hot or the soft group indicator, representing the membership of each sample in different groups of the sensitive attribute. We provably and numerically show that the soft group strategy achieves a faster estimation error convergence rate. 
Experiments show the better bias mitigation performance of GDP regularizer, compared with adversarial debiasing, for regression and classification tasks in tabular and graph benchmarks.", "keywords": "Generalized demographic parity;estimation error analysis", "primary_area": "", "supplementary_material": "/attachment/842b40ecf91b9a8b70942e78295167d95058bbec.zip", "author": "Zhimeng Jiang;Xiaotian Han;Chao Fan;Fan Yang;Ali Mostafavi;Xia Hu", "authorids": "~Zhimeng_Jiang1;~Xiaotian_Han1;~Chao_Fan2;~Fan_Yang27;~Ali_Mostafavi2;~Xia_Hu4", "gender": "M;M;;M;M;M", "homepage": "http://www.zhimengjiang.com/;https://ahxt.github.io/;https://fanchaolab.com;https://yangfan.sites.wfu.edu/;;https://cs.rice.edu/~xh37/index.html", "dblp": "217/3235;;;;;256/9406.html", "google_scholar": "5Es3Yk4AAAAJ;Uromx98AAAAJ;3k_B_zUAAAAJ;RXFeW-8AAAAJ;DFNvQPYAAAAJ;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": "0000-0001-6933-3952;;;0000-0003-3442-754X;;", "linkedin": ";;;;;", "or_profile": "~Zhimeng_Jiang1;~Xiaotian_Han1;~Chao_Fan2;~Fan_Yang27;~Ali_Mostafavi2;~Xia_Hu2", "aff": "Texas A&M University;Texas A&M University;Texas A&M;Rice University;Texas A&M;Rice University", "aff_domain": "tamu.edu;tamu.edu;tamu.edu;rice.edu;tamu.edu;rice.edu", "position": "PhD student;PhD student;Postdoc;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\njiang2022generalized,\ntitle={Generalized Demographic Parity for Group Fairness},\nauthor={Zhimeng Jiang and Xiaotian Han and Chao Fan and Fan Yang and Ali Mostafavi and Xia Hu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YigKlMJwjye}\n}", "github": "", "project": "", "reviewers": "dM4B;Lp8E;s7Ek;zy7G", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;4;3", "correctness": "4;4;3;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "44;185;4;66", "wc_summary_review": "12;104;24;74", "wc_main_review": "292;166;514;128", "wc_review": "348;455;542;268", "wc_reply_reviewers": "280;29;129;0", "wc_reply_authors": "1756;1049;1403;621", "reply_reviewers": "1;1;1;0", "reply_authors": "4;3;5;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.75, 67.42171386133698 ], "wc_summary_review_avg": [ 53.5, 37.29276069158732 ], "wc_main_review_avg": [ 275.0, 150.74813431681335 ], "wc_review_avg": [ 403.25, 104.01291987056223 ], "wc_reply_reviewers_avg": [ 109.5, 109.454328374898 ], "wc_reply_authors_avg": [ 1207.25, 420.7661910134891 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1671616400535986934&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=YigKlMJwjye", "email": "tamu.edu;tamu.edu;tamu.edu;rice.edu;tamu.edu;rice.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;0;1", "aff_unique_norm": "Texas A&M University;Rice University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tamu.edu;https://www.rice.edu", "aff_unique_abbr": "TAMU;Rice", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "YmONQIWli--", "title": "Gotta Go Fast When Generating Data with Score-Based Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Score-based (denoising diffusion) generative models have recently gained a lot of success in generating realistic and diverse data. These approaches define a forward diffusion process for transforming data to noise and generate data by reversing it (thereby going from noise to data). Unfortunately, current score-based models generate data very slowly due to the sheer number of score network evaluations required by numerical SDE solvers. \n \nIn this work, we aim to accelerate this process by devising a more efficient SDE solver. Existing approaches rely on the Euler-Maruyama (EM) solver, which uses a fixed step size. We found that naively replacing it with other SDE solvers fares poorly - they either result in low-quality samples or become slower than EM. To get around this issue, we carefully devise an SDE solver with adaptive step sizes tailored to score-based generative models piece by piece. Our solver requires only two score function evaluations, rarely rejects samples, and leads to high-quality samples. Our approach generates data 2 to 10 times faster than EM while achieving better or equal sample quality. For high-resolution images, our method leads to significantly higher quality samples than all other methods tested. Our SDE solver has the benefit of requiring no step size tuning.", "keywords": "score-based;generative model;denoising diffusion;SDE;diffusion process", "primary_area": "", "supplementary_material": "", "author": "Alexia Jolicoeur-Martineau;Ke Li;R\u00e9mi Pich\u00e9-Taillefer;Tal Kachman;Ioannis Mitliagkas", "authorids": "~Alexia_Jolicoeur-Martineau1;~Ke_Li1;~R\u00e9mi_Pich\u00e9-Taillefer1;~Tal_Kachman1;~Ioannis_Mitliagkas1", "gender": "F;M;;;M", "homepage": "https://ajolicoeur.wordpress.com;http://www.sfu.ca/~keli/;;;http://mitliagkas.github.io/", "dblp": "223/4753;75/6627-11;;;83/8757", "google_scholar": "0qytQ1oAAAAJ;vQc8tI4AAAAJ;BhtSX08AAAAJ;;K757SxgAAAAJ", "orcid": "0000-0003-2169-4008;;;;", "linkedin": ";;;;", "or_profile": "~Alexia_Jolicoeur-Martineau1;~Ke_Li1;~R\u00e9mi_Pich\u00e9-Taillefer1;~Tal_Kachman1;~Ioannis_Mitliagkas1", "aff": "University of Montreal;Simon Fraser University;;;University of Montreal", "aff_domain": "umontreal.ca;sfu.ca;;;umontreal.ca", "position": "PhD student;Assistant Professor;;;Assistant Professor", "bibtex": "@misc{\njolicoeur-martineau2022gotta,\ntitle={Gotta Go Fast When Generating Data with Score-Based Models},\nauthor={Alexia Jolicoeur-Martineau and Ke Li and R{\\'e}mi Pich{\\'e}-Taillefer and Tal Kachman and Ioannis Mitliagkas},\nyear={2022},\nurl={https://openreview.net/forum?id=YmONQIWli--}\n}", "github": "", "project": "", "reviewers": "Px4w;ybV4;RPn6;4pA6", "site": "https://openreview.net/forum?id=YmONQIWli--", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;3;3;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "52;60;44;35", "wc_summary_review": "124;115;25;26", "wc_main_review": "303;269;219;208", "wc_review": "479;444;288;269", "wc_reply_reviewers": "0;0;17;19", "wc_reply_authors": "694;849;773;329", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.4330127018922193 
], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 47.75, 9.283722313813572 ], "wc_summary_review_avg": [ 72.5, 47.10891635348875 ], "wc_main_review_avg": [ 249.75, 38.38863764188565 ], "wc_review_avg": [ 370.0, 92.57699498255492 ], "wc_reply_reviewers_avg": [ 9.0, 9.027735042633894 ], "wc_reply_authors_avg": [ 661.25, 199.4998433583345 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": 0.0, "gs_citation": 253, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=960292549817161760&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Montreal;Simon Fraser University", "aff_unique_dep": ";", "aff_unique_url": "https://wwwumontreal.ca;https://www.sfu.ca", "aff_unique_abbr": "UM;SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "Yn4CPz_LRKO", "title": "Conditional GANs with Auxiliary Discriminative Classifier", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conditional generative models aim to learn the underlying joint distribution of data and labels, and thus realize conditional generation. Among them, auxiliary classifier generative adversarial networks (AC-GAN) have been widely used, but suffer from the problem of low intra-class diversity on generated samples. In this paper, we point out that the fundamental reason is that the classifier of AC-GAN is generator-agnostic, and therefore cannot provide informative guidance to the generator to approximate the target distribution, resulting in minimization of conditional entropy that decreases the intra-class diversity. Motivated by this observation, we propose a novel conditional GAN with auxiliary \\textit{discriminative} classifier (ADC-GAN) to resolve the problem of AC-GAN. Specifically, the proposed auxiliary \\textit{discriminative} classifier becomes generator-aware by recognizing the labels of the real data and the generated data \\textit{discriminatively}. Our theoretical analysis reveals that the generator can faithfully replicate the target distribution even without the original discriminator, making the proposed ADC-GAN robust to the hyper-parameter and stable during the training process. 
Extensive experimental results on synthetic and real-world datasets demonstrate the superiority of ADC-GAN on conditional generative modeling compared to competing methods.", "keywords": "conditional generative adversarial networks;conditional image generation", "primary_area": "", "supplementary_material": "/attachment/92628413225657119e161c3eb2172cde831a94ad.zip", "author": "Liang Hou;Qi Cao;Huawei Shen;Xueqi Cheng", "authorids": "~Liang_Hou1;~Qi_Cao1;~Huawei_Shen1;~Xueqi_Cheng1", "gender": "M;F;M;M", "homepage": "https://liang-hou.github.io/;https://caoqi92.github.io/biography/;https://www.ict.ac.cn/sourcedb/cn/jssrck/201402/t20140221_4037648.html;https://people.ucas.ac.cn/~cxq?language=en", "dblp": ";40/5905;;44/912", "google_scholar": "X48pntMAAAAJ;FflWb1gAAAAJ;;hY8aLqAAAAAJ", "orcid": ";;0000-0002-1081-8119;", "linkedin": ";;;", "or_profile": "~Liang_Hou1;~Qi_Cao1;~Huawei_Shen1;~Xueqi_Cheng1", "aff": "Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences, China;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy", "aff_domain": "ict.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn", "position": "PhD student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@misc{\nhou2022conditional,\ntitle={Conditional {GAN}s with Auxiliary Discriminative Classifier},\nauthor={Liang Hou and Qi Cao and Huawei Shen and Xueqi Cheng},\nyear={2022},\nurl={https://openreview.net/forum?id=Yn4CPz_LRKO}\n}", "github": "", "project": "", "reviewers": "DPgR;mZT7;ebJs;uPwH", "site": "https://openreview.net/forum?id=Yn4CPz_LRKO", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "143;64;41;84", "wc_summary_review": "86;30;51;31", "wc_main_review": "297;632;497;453", "wc_review": "526;726;589;568", "wc_reply_reviewers": "340;66;173;65", "wc_reply_authors": "834;1278;689;763", "reply_reviewers": "3;1;1;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.0, 37.83516882478523 ], "wc_summary_review_avg": [ 49.5, 22.677080940897135 ], "wc_main_review_avg": [ 469.75, 119.57293799183827 ], "wc_review_avg": [ 602.25, 74.96123998440794 ], "wc_reply_reviewers_avg": [ 161.0, 112.27867117133155 ], "wc_reply_authors_avg": [ 891.0, 229.241139414373 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=868024013198158367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Institute of Computing Technology", "aff_unique_url": "http://www.ict.ac.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "Yp4sR6rmgFt", "title": "Transductive Universal Transport for Zero-Shot Action Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "This 
work addresses the problem of recognizing action categories in videos for which no training examples are available. The current state-of-the-art enables such a zero-shot recognition by learning universal mappings from videos to a shared semantic space, either trained on large-scale seen actions or on objects. While effective, universal action and object models are biased to their seen categories. Such biases are further amplified due to biases between seen and unseen categories in the semantic space. The amplified biases result in many unseen action categories simply never being selected during inference, hampering zero-shot progress. We seek to address this limitation and introduce transductive universal transport for zero-shot action recognition. Our proposal is to re-position unseen action embeddings through transduction, i.e., by using the distribution of the unlabelled test set. For universal action models, we first find an optimal mapping from unseen actions to the mapped test videos in the shared hyperspherical space. We then define target embeddings as weighted Fr\'echet means, with the weights given by the transport couplings. Finally, we re-position unseen action embeddings along the geodesic between the original and target, as a form of semantic regularization. For universal object models, we outline a weighted transport variant from unseen action embeddings to object embeddings directly. Empirically, we show that our approach directly boosts universal action and object models, resulting in state-of-the-art performance for zero-shot classification and spatio-temporal localization.", "keywords": "zero-shot learning;action recognition;action localization;optimal transport", "primary_area": "", "supplementary_material": "", "author": "Pascal Mettes", "authorids": "~Pascal_Mettes1", "gender": "M", "homepage": "https://staff.fnwi.uva.nl/p.s.m.mettes/", "dblp": "147/4008", "google_scholar": "https://scholar.google.nl/citations?user=sMQxA3AAAAAJ", "orcid": "0000-0001-9275-5942", "linkedin": "", "or_profile": "~Pascal_Mettes1", "aff": "University of Amsterdam", "aff_domain": "uva.nl", "position": "Assistant Professor", "bibtex": "@misc{\nmettes2022transductive,\ntitle={Transductive Universal Transport for Zero-Shot Action Recognition},\nauthor={Pascal Mettes},\nyear={2022},\nurl={https://openreview.net/forum?id=Yp4sR6rmgFt}\n}", "github": "", "project": "", "reviewers": "Gn5W;EmnY;2kAP;cLxn", "site": "https://openreview.net/forum?id=Yp4sR6rmgFt", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;4;2;2", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "62;95;108;78", "wc_summary_review": "46;45;19;85", "wc_main_review": "162;154;232;117", "wc_review": "270;294;359;280", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "227;319;247;241", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 2.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.75, 17.354754391808605 ], "wc_summary_review_avg": [ 48.75, 23.562417108607512 ], "wc_main_review_avg": [ 166.25, 41.583500333665995 ], "wc_review_avg": [ 300.75, 34.694199803425356 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 258.5, 35.67562192870644 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 
], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-s5xo0eL72kJ:scholar.google.com/&scioq=Transductive+Universal+Transport+for+Zero-Shot+Action+Recognition&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_country_unique_index": "0", "aff_country_unique": "Netherlands" }, { "id": "YpBHDlalKDG", "title": "Complex Locomotion Skill Learning via Differentiable Physics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Differentiable physics enables efficient gradient-based optimizations of neural network (NN) controllers. However, existing work typically only delivers NN controllers with limited capability and generalizability. We present a practical learning framework that outputs unified NN controllers capable of tasks with significantly improved complexity and diversity. To systematically improve training robustness and efficiency, we investigated a suite of improvements over the baseline approach, including periodic activation functions, and tailored loss functions. In addition, we find our adoption of batching and a modified Adam optimizer effective in training complex locomotion tasks. We evaluate our framework on differentiable mass-spring and material point method (MPM) simulations, with challenging locomotion tasks and multiple robot designs. Experiments show that our learning framework, based on differentiable physics, delivers better results than reinforcement learning and converges much faster. We demonstrate that users can interactively control soft robot locomotion and switch among multiple goals with specified velocity, height, and direction instructions using a unified NN controller trained in our system.", "keywords": "differentiable physics;locomotion skill learning;physical simulation", "primary_area": "", "supplementary_material": "/attachment/59de33a9d1ab4b88254b6fc58d0986f0453480a1.zip", "author": "Jiancheng Liu;Yu Fang;Mingrui Zhang;Jiasheng Zhang;Yidong Ma;Minchen Li;Yuanming Hu;Chenfanfu Jiang;Tiantian Liu", "authorids": "~Jiancheng_Liu2;~Yu_Fang2;~Mingrui_Zhang4;~Jiasheng_Zhang1;~Yidong_Ma1;~Minchen_Li1;~Yuanming_Hu1;~Chenfanfu_Jiang3;~Tiantian_Liu1", "gender": "M;M;M;M;M;M;M;;F", "homepage": "https://ljcc0930.github.io/;http://squarefk.com/;https://erizmr.github.io/;;http://www.ma-yidong.com;https://www.cs.cmu.edu/~minchenl/;http://taichi.graphics/me;;https://tiantianliu.cn/", "dblp": "74/3002;;;;;175/5809;204/4110;132/7630;", "google_scholar": "ReWNzl4AAAAJ;;IN75qZcAAAAJ;;;https://scholar.google.ca/citations?user=Gk7K7VkAAAAJ;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;;;0000-0001-9868-7311;;;", "linkedin": ";;;jiasheng-zhang-43858a193;;minchenli/;;;", "or_profile": "~Jiancheng_Liu2;~Yu_Fang2;~Mingrui_Zhang4;~Jiasheng_Zhang1;~Yidong_Ma1;~Minchen_Li1;~Yuanming_Hu1;~Chenfanfu_Jiang3;~Tiantian_Liu1", "aff": "Taichi Graphics Inc.;Department of Computer and Information Science, School of Engineering and Applied Science;Imperial College London;;;University of California, Los Angeles;Massachusetts Institute of Technology;University of California, Los Angeles;", "aff_domain": "taichi.graphics;cis.upenn.edu;imperial.ac.uk;;;ucla.edu;mit.edu;ucla.edu;", "position": "Researcher;PhD student;PhD student;;;Postdoc;PhD student;Associate Professor;", "bibtex": 
"@misc{\nliu2022complex,\ntitle={Complex Locomotion Skill Learning via Differentiable Physics},\nauthor={Jiancheng Liu and Yu Fang and Mingrui Zhang and Jiasheng Zhang and Yidong Ma and Minchen Li and Yuanming Hu and Chenfanfu Jiang and Tiantian Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=YpBHDlalKDG}\n}", "github": "", "project": "", "reviewers": "cp7c;kPtv;2nYD;38Wd", "site": "https://openreview.net/forum?id=YpBHDlalKDG", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "69;73;102;59", "wc_summary_review": "41;51;60;63", "wc_main_review": "265;275;205;461", "wc_review": "375;399;367;583", "wc_reply_reviewers": "0;0;72;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 75.75, 15.990231392947383 ], "wc_summary_review_avg": [ 53.75, 8.584142356694699 ], "wc_main_review_avg": [ 301.5, 95.8996871736295 ], "wc_review_avg": [ 431.0, 88.54377448471462 ], "wc_reply_reviewers_avg": [ 18.0, 31.176914536239792 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5250715052868212006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;3;4;3", "aff_unique_norm": "Taichi Graphics Inc.;School of Engineering and Applied Science;Imperial College London;University of California, Los Angeles;Massachusetts Institute of Technology", "aff_unique_dep": ";Department of Computer and Information Science;;;", "aff_unique_url": ";;https://www.imperial.ac.uk;https://www.ucla.edu;https://web.mit.edu", "aff_unique_abbr": ";;ICL;UCLA;MIT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;2;0;0;0", "aff_country_unique": "United States;;United Kingdom" }, { "title": "Universalizing Weak Supervision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7091", "id": "YpPiNigTzMT", "poster": "", "openreview": "https://openreview.net/forum?id=YpPiNigTzMT", "slides": "https://iclr.cc/virtual/2022/poster/7091", "video": "https://iclr.cc/virtual/2022/poster/7091", "author_site": "Changho Shin, Winfred Li, Harit Vishwakarma, Nicholas Roberts, Frederic Sala", "tldr": "", "abstract": "Weak supervision (WS) frameworks are a popular way to bypass hand-labeling large datasets for training data-hungry models.\nThese approaches synthesize multiple noisy but cheaply-acquired estimates of labels into a set of high-quality pseudo-labels for downstream training. However, the synthesis technique is specific to a particular kind of label, such as binary labels or sequences, and each new label type requires manually designing a new synthesis algorithm. Instead, we propose a universal technique that enables weak supervision over any label type while still offering desirable properties, including practical flexibility, computational efficiency, and theoretical guarantees. 
We apply this technique to important problems previously not tackled by WS frameworks, including learning to rank, regression, and learning in hyperbolic space. Theoretically, our synthesis approach produces consistent estimators for learning some challenging but important generalizations of the exponential family model. Experimentally, we validate our framework and show improvement over baselines in diverse settings including real-world learning-to-rank and regression problems along with learning on hyperbolic manifolds.", "keywords": "Weak supervision", "primary_area": "", "supplementary_material": "/attachment/fd1ee4735ecb1725affb1b44878a640320daa2d4.zip", "author": "Changho Shin;Winfred Li;Harit Vishwakarma;Nicholas Carl Roberts;Frederic Sala", "authorids": "~Changho_Shin2;~Winfred_Li1;~Harit_Vishwakarma1;~Nicholas_Carl_Roberts1;~Frederic_Sala1", "gender": ";;M;M;M", "homepage": ";;https://harit7.github.io;https://nick11roberts.science/;https://pages.cs.wisc.edu/~fredsala/", "dblp": ";;207/7622;;133/3602", "google_scholar": "VpvIQAcAAAAJ;;pJF_ZZUAAAAJ;https://scholar.google.com/citations?hl=en;9KhIkNkAAAAJ", "orcid": ";;;0000-0002-0625-9182;", "linkedin": ";winfred-li-a2435b156;harit7;nick11roberts/;", "or_profile": "~Changho_Shin2;~Winfred_Li1;~Harit_Vishwakarma1;~Nicholas_Carl_Roberts1;~Frederic_Sala1", "aff": "University of Wisconsin, Madison;University of Wisconsin, Madison;University of Wisconsin, Madison;University of Wisconsin-Madison;University of Wisconsin, Madison", "aff_domain": "wisc.edu;wisc.edu;wisc.edu;wisc.edu;wisc.edu", "position": "PhD student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nshin2022universalizing,\ntitle={Universalizing Weak Supervision},\nauthor={Changho Shin and Winfred Li and Harit Vishwakarma and Nicholas Carl Roberts and Frederic Sala},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YpPiNigTzMT}\n}", "github": "", "project": "", "reviewers": "NJ4o;xXw5;NZTU;AswL", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "4;3;3;2", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "19;105;48;146", "wc_summary_review": "34;48;33;74", "wc_main_review": "295;263;191;329", "wc_review": "348;416;272;549", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "585;1046;161;417", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.5, 49.30770730829005 ], "wc_summary_review_avg": [ 47.25, 16.543503256565703 ], "wc_main_review_avg": [ 269.5, 50.977936403899285 ], "wc_review_avg": [ 396.25, 101.84393698203148 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 552.25, 322.5797382043702 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8333333333333334, "corr_recommendation_correctness": 0.8164965809277261, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13783191610599324767&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=YpPiNigTzMT", "email": "wisc.edu;wisc.edu;wisc.edu;wisc.edu;wisc.edu", 
"author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Wisconsin;University of Wisconsin-Madison", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW;UW-Madison", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "How Low Can We Go: Trading Memory for Error in Low-Precision Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6320", "id": "YpSxqy_RE84", "poster": "", "openreview": "https://openreview.net/forum?id=YpSxqy_RE84", "slides": "https://iclr.cc/virtual/2022/poster/6320", "video": "https://iclr.cc/virtual/2022/poster/6320", "author_site": "Chengrun Yang, Ziyang Wu, Jerry Chee, Christopher De Sa, Madeleine Udell", "tldr": "", "abstract": "Low-precision arithmetic trains deep learning models using less energy, less memory and less time. However, we pay a price for the savings: lower precision may yield larger round-off error and hence larger prediction error. As applications proliferate, users must choose which precision to use to train a new model, and chip manufacturers must decide which precisions to manufacture. We view these precision choices as a hyperparameter tuning problem, and borrow ideas from meta-learning to learn the tradeoff between memory and error. In this paper, we introduce Pareto Estimation to Pick the Perfect Precision (PEPPP). We use matrix factorization to find non-dominated configurations (the Pareto frontier) with a limited number of network evaluations. For any given memory budget, the precision that minimizes error is a point on this frontier. Practitioners can use the frontier to trade memory for error and choose the best precision for their goals.", "keywords": "low-precision training;meta-learning;Pareto frontier;error-memory tradeoff;active learning;matrix factorization", "primary_area": "", "supplementary_material": "", "author": "Chengrun Yang;Ziyang Wu;Jerry Chee;Christopher De Sa;Madeleine Udell", "authorids": "~Chengrun_Yang1;~Ziyang_Wu1;~Jerry_Chee1;~Christopher_De_Sa2;~Madeleine_Udell1", "gender": "M;;;F;M", "homepage": "https://chengrunyang.github.io/;https://robinwu218.github.io/;http://jerry-chee.github.io/;https://people.orie.cornell.edu/mru8;http://cs.cornell.edu/~cdesa", "dblp": "225/4721;236/5238;207/8369;153/2166;154/6336", "google_scholar": "XYYhXe4AAAAJ;9RAHYd0AAAAJ;qyQpUAkAAAAJ;tZ9pEDMAAAAJ;", "orcid": ";;;0000-0002-3985-915X;", "linkedin": ";;;;", "or_profile": "~Chengrun_Yang1;~Ziyang_Wu1;~Jerry_Chee1;~Madeleine_Udell1;~Christopher_De_Sa1", "aff": "Cornell University;University of California, Berkeley;Cornell University;Cornell University;Cornell University", "aff_domain": "cornell.edu;berkeley.edu;cornell.edu;cornell.edu;cornell.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2022how,\ntitle={How Low Can We Go: Trading Memory for Error in Low-Precision Training},\nauthor={Chengrun Yang and Ziyang Wu and Jerry Chee and Christopher De Sa and Madeleine Udell},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=YpSxqy_RE84}\n}", "github": "", "project": "", "reviewers": "ANon;HPBj;ezUH;M7yh", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;4;3;3", 
"wc_summary_paper": "80;89;236;65", "wc_summary_review": "34;80;199;27", "wc_main_review": "276;112;373;181", "wc_review": "390;281;808;273", "wc_reply_reviewers": "0;13;28;26", "wc_reply_authors": "640;270;819;340", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 117.5, 68.95106960736722 ], "wc_summary_review_avg": [ 85.0, 68.89484741256054 ], "wc_main_review_avg": [ 235.5, 98.44922549212868 ], "wc_review_avg": [ 438.0, 218.56234808401928 ], "wc_reply_reviewers_avg": [ 16.75, 11.255554184490428 ], "wc_reply_authors_avg": [ 517.25, 222.86248562734824 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=652848499450213393&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=YpSxqy_RE84", "email": "cornell.edu;berkeley.edu;cornell.edu;cornell.edu;cornell.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Cornell University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.berkeley.edu", "aff_unique_abbr": "Cornell;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "YqHW0o9wXae", "title": "Assisted Learning for Organizations with Limited Imbalanced Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "We develop an assisted learning framework for assisting organization-level learners to improve their learning performance with limited and imbalanced data. In particular, learners at the organization level usually have sufficient computation resource, but are subject to stringent collaboration policy and information privacy. Their limited imbalanced data often cause biased inference and sub-optimal decision-making. In our assisted learning framework, an organizational learner purchases assistance service from a service provider and aims to enhance its model performance within a few assistance rounds. We develop effective stochastic training algorithms for assisted deep learning and assisted reinforcement learning. 
Different from existing distributed algorithms that need to frequently transmit gradients or models, our framework allows the learner to only occasionally share information with the service provider, and still achieve a near-oracle model as if all the data were centralized.", "keywords": "Assisted learning;Deep learning;Reinforcement learning;Optimization;Heterogeneous learner;Imbalanced data", "primary_area": "", "supplementary_material": "/attachment/e6dd245fffeac8265a4799a2b7351262a370f396.zip", "author": "Cheng Chen;Jiaying Zhou;Jie Ding;Yi Zhou;Bhavya Kailkhura", "authorids": "~Cheng_Chen7;~Jiaying_Zhou1;~Jie_Ding2;~Yi_Zhou2;~Bhavya_Kailkhura1", "gender": ";F;M;M;M", "homepage": ";;http://jding.org;https://sites.google.com/site/yizhouhomepage/home;https://people.llnl.gov/kailkhura1", "dblp": ";;94/1825-2;;132/8938", "google_scholar": ";;ZyqvoqcAAAAJ;4fK8bYIAAAAJ;SQpJmOgAAAAJ", "orcid": ";;;;", "linkedin": ";jiaying-zhou-9b4493161/;;;", "or_profile": "~Cheng_Chen7;~Jiaying_Zhou1;~Jie_Ding2;~Yi_Zhou2;~Bhavya_Kailkhura1", "aff": ";University of Minnesota, Minneapolis;University of Minnesota, Minneapolis;University of Utah;Lawrence Livermore National Laboratory", "aff_domain": ";umn.edu;umn.edu;utah.edu;llnl.gov", "position": ";PhD student;Assistant Professor;Assistant Professor;Research Staff", "bibtex": "@misc{\nchen2022assisted,\ntitle={Assisted Learning for Organizations with Limited Imbalanced Data},\nauthor={Cheng Chen and Jiaying Zhou and Jie Ding and Yi Zhou and Bhavya Kailkhura},\nyear={2022},\nurl={https://openreview.net/forum?id=YqHW0o9wXae}\n}", "github": "", "project": "", "reviewers": "cvfa;D8UY;Bk5z", "site": "https://openreview.net/forum?id=YqHW0o9wXae", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;3;3", "correctness": "2;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "105;26;80", "wc_summary_review": "27;45;18", "wc_main_review": "306;184;125", "wc_review": "438;255;223", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "767;855;379", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 70.33333333333333, 32.96799795087486 ], "wc_summary_review_avg": [ 30.0, 11.224972160321824 ], "wc_main_review_avg": [ 205.0, 75.37019746999916 ], "wc_review_avg": [ 305.3333333333333, 94.71477650762255 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 667.0, 206.79136023215926 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11801604042902484937&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Minnesota;University of Utah;Lawrence Livermore National Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.minnesota.edu;https://www.utah.edu;https://www.llnl.gov", "aff_unique_abbr": "UMN;Utah;LLNL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Minneapolis;", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "Yr_1QZaRqmv", "title": "Decision Tree Algorithms for MDP", "track": "main", "status": "Reject", "tldr": "", "abstract": "Decision trees are robust modeling tools in machine learning with human-interpretable representations. The curse of dimensionality of Markov Decision Process (MDP) makes exact solution methods computationally intractable in practice for large state-action spaces. In this paper, we show that even for problems with large state space, when the solution policy of the MDP can be represented by a tree-like structure, our proposed algorithm retrieves a tree of the solution policy of the MDP in computationally tractable time. Our algorithm uses a tree growing strategy to incrementally disaggregate the state space solving smaller MDP instances with Linear Programming. These ideas can be extended to experience based RL problems as an alternative to black-box based policies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elioth Sanabria;David Yao;Henry Lam", "authorids": "~Elioth_Sanabria1;~David_Yao1;~Henry_Lam1", "gender": "M;;", "homepage": "http://www.columbia.edu/~ems2268/;http://www.columbia.edu/~yao;http://www.columbia.edu/~khl2114/", "dblp": ";y/DavidDYao.html;35/9508", "google_scholar": ";;Bnj50x0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Elioth_Sanabria1;~David_Yao1;~Henry_Lam1", "aff": "Columbia University;Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;columbia.edu", "position": "PhD student;Full Professor;Associate Professor", "bibtex": "@misc{\nsanabria2022decision,\ntitle={Decision Tree Algorithms for {MDP}},\nauthor={Elioth Sanabria and David Yao and Henry Lam},\nyear={2022},\nurl={https://openreview.net/forum?id=Yr_1QZaRqmv}\n}", "github": "", "project": "", "reviewers": "HjwV;MpCH;vkFy;qMvr", "site": "https://openreview.net/forum?id=Yr_1QZaRqmv", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "2;4;4;3", "correctness": "3;4;3;4", "technical_novelty": "2;1;2;3", "empirical_novelty": "0;1;2;3", "wc_summary_paper": "99;61;60;88", "wc_summary_review": "52;38;35;23", "wc_main_review": "92;152;379;595", "wc_review": "243;251;474;706", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "245;591;678;90", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 77.0, 16.95582495781317 ], "wc_summary_review_avg": [ 37.0, 10.319883720275147 ], "wc_main_review_avg": [ 304.5, 198.96796224518158 ], "wc_review_avg": [ 418.5, 190.12693128539155 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 401.0, 241.80880877255072 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.19245008972987526, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ome6y7cwaXMJ:scholar.google.com/&scioq=Decision+Tree+Algorithms+for+MDP&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Ysu4E5DhQIw", "title": "Cascaded Fast and Slow Models for Efficient Semantic Code Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The goal of natural language semantic code search is to retrieve a semantically relevant code snippet from a fixed set of candidates using a natural language query. Existing approaches are neither effective nor efficient enough towards a practical semantic code search system. In this paper, we propose an efficient and accurate semantic code search framework with cascaded fast and slow models, in which a fast transformer encoder model is learned to optimize a scalable index for fast retrieval followed by learning a slow classification-based re-ranking model to improve the performance of the top K results from the fast retrieval. To further reduce the high memory cost of deploying two separate models in practice, we propose to jointly train the fast and slow model based on a single transformer encoder with shared parameters. The proposed cascaded approach is not only efficient and scalable, but also achieves state-of-the-art results with an average mean reciprocal ranking (MRR) score of 0.7795 (across 6 programming languages) as opposed to the previous state-of-the-art result of 0.713 MRR on the CodeSearchNet benchmark.", "keywords": "code retrieval;code search;fast and slow;transformer;mean reciprocal ranking;recall", "primary_area": "", "supplementary_material": "", "author": "Akhilesh Deepak Gotmare;Junnan Li;Shafiq Joty;Steven Hoi", "authorids": "~Akhilesh_Deepak_Gotmare1;~Junnan_Li2;~Shafiq_Joty1;~Steven_Hoi2", "gender": "M;M;M;M", "homepage": "https://akhileshgotmare.github.io/;https://raihanjoty.github.io/;http://stevenhoi.com;https://sites.google.com/site/junnanlics/", "dblp": "156/0933;62/2078;;193/6773-1.html", "google_scholar": "https://scholar.google.ch/citations?user=2S-aFwIAAAAJ;hR249csAAAAJ;JoLjflYAAAAJ;MuUhwi0AAAAJ", "orcid": ";;;", "linkedin": "akhilesh-gotmare/;;;", "or_profile": "~Akhilesh_Deepak_Gotmare1;~Shafiq_Joty1;~Steven_Hoi2;~Junnan_li1", "aff": "SalesForce.com;SalesForce.com;Singapore Management University;Salesforce Research", "aff_domain": "salesforce.com;salesforce.com;smu.edu.sg;salesforce.com", "position": "Researcher;Principal Researcher;Associate Professor;Research Scientist", "bibtex": "@misc{\ngotmare2022cascaded,\ntitle={Cascaded Fast and Slow Models for Efficient Semantic Code Search},\nauthor={Akhilesh Deepak Gotmare and Junnan Li and Shafiq Joty and Steven Hoi},\nyear={2022},\nurl={https://openreview.net/forum?id=Ysu4E5DhQIw}\n}", "github": "", "project": "", "reviewers": "cXHD;DPco;wD3o;PcBf", "site": "https://openreview.net/forum?id=Ysu4E5DhQIw", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;2;3", "correctness": "3;3;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "134;115;61;57", "wc_summary_review": "58;90;21;87", "wc_main_review": "215;675;156;609", "wc_review": "407;880;238;753", "wc_reply_reviewers": "329;158;0;0", "wc_reply_authors": "257;728;297;364", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 91.75, 33.46173187388842 ], "wc_summary_review_avg": [ 64.0, 27.793884219374593 
], "wc_main_review_avg": [ 413.75, 230.38595334785495 ], "wc_review_avg": [ 569.5, 258.06055490911433 ], "wc_reply_reviewers_avg": [ 121.75, 135.93449709326916 ], "wc_reply_authors_avg": [ 411.5, 186.6875732339997 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7543365091413573, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18033598175224385412&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Salesforce;Singapore Management University", "aff_unique_dep": ";", "aff_unique_url": "https://www.salesforce.com;https://www.smu.edu.sg", "aff_unique_abbr": "Salesforce;SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Singapore" }, { "id": "YtdASzotUEW", "title": "Label Smoothed Embedding Hypothesis for Out-of-Distribution Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Detecting out-of-distribution (OOD) examples is critical in many applications. We propose an unsupervised method to detect OOD samples using a $k$-NN density estimate with respect to a classification model's intermediate activations on in-distribution samples. We leverage a recent insight about label smoothing, which we call the {\\it Label Smoothed Embedding Hypothesis}, and show that one of the implications is that the $k$-NN density estimator performs better as an OOD detection method both theoretically and empirically when the model is trained with label smoothing. Finally, we show that our proposal outperforms many OOD baselines and we also provide new finite-sample high-probability statistical results for $k$-NN density estimation's ability to detect OOD examples.", "keywords": "deep k-nn;label smoothing;out-of-distribution detection;robustness", "primary_area": "", "supplementary_material": "", "author": "Dara Bahri;Heinrich Jiang;Yi Tay;Donald Metzler", "authorids": "~Dara_Bahri1;~Heinrich_Jiang1;~Yi_Tay1;~Donald_Metzler1", "gender": "M;M;M;M", "homepage": "http://www.dara.run;;http://yitay.net;https://research.google/people/DonaldMetzler/", "dblp": "231/7656;182/2472;;95/2272", "google_scholar": "j5PpTOwAAAAJ;;VBclY_cAAAAJ;bmXpOd8AAAAJ", "orcid": ";;;0000-0003-4276-6269", "linkedin": ";;;donmetzler/", "or_profile": "~Dara_Bahri1;~Heinrich_Jiang1;~Yi_Tay1;~Donald_Metzler1", "aff": "Google Research;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com", "position": "Research Scientist;Research scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nbahri2022label,\ntitle={Label Smoothed Embedding Hypothesis for Out-of-Distribution Detection},\nauthor={Dara Bahri and Heinrich Jiang and Yi Tay and Donald Metzler},\nyear={2022},\nurl={https://openreview.net/forum?id=YtdASzotUEW}\n}", "github": "", "project": "", "reviewers": "E77y;swL5;pUXL;orsU", "site": "https://openreview.net/forum?id=YtdASzotUEW", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;3;4", "correctness": "2;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "90;48;96;160", "wc_summary_review": "70;60;32;41", "wc_main_review": "533;207;283;252", "wc_review": "693;315;411;453", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", 
"recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 98.5, 40.03436024217197 ], "wc_summary_review_avg": [ 50.75, 15.022899187573616 ], "wc_main_review_avg": [ 318.75, 126.61432580873303 ], "wc_review_avg": [ 468.0, 139.20129309744217 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.2294157338705618, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8357190317615770922&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "YxQiIOLKgEf", "title": "Counterfactual Graph Learning for Link Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning to predict missing links is important for many graph-based applications. Existing methods were designed to learn the association between two sets of variables: (1) the observed graph structure (e.g., clustering effect) and (2) the existence of link between a pair of nodes. However, the causal relationship between these variables was ignored. We visit the possibility of learning it by asking a counterfactual question: \u201cwould the link exist or not if the observed graph structure became different?\u201d To answer this question, we leverage causal models considering the information of the node pair (i.e., learned graph representations) as context, global graph structural properties as treatment, and link existence as outcome. In this work, we propose a novel link prediction method that enhances graph learning by counterfactual inference. It creates counterfactual links from the observed ones, and learns representations from both the observed and counterfactual links. 
Experiments on benchmark datasets show that this novel graph learning method achieves state-of-the-art performance on link prediction.", "keywords": "Link Prediction;Graph Representation Learning;Graph Neural Networks.", "primary_area": "", "supplementary_material": "/attachment/63a0517486fbbb05744d2a97bf3f65ce69063c26.zip", "author": "Tong Zhao;Gang Liu;Daheng Wang;Wenhao Yu;Meng Jiang", "authorids": "~Tong_Zhao3;~Gang_Liu6;~Daheng_Wang1;~Wenhao_Yu2;~Meng_Jiang3", "gender": "M;M;M;M;M", "homepage": "https://tzhao.io/;https://liugangcode.github.io/;https://dahengwang0705.github.io/;https://wyu97.github.io/;http://www.meng-jiang.com/", "dblp": "94/6503-3;37/2109-25;223/3148;159/8117-2.html;69/339-1", "google_scholar": "05cRc-MAAAAJ;zdF3vTYAAAAJ;KcHNH7kAAAAJ;z4qSdX8AAAAJ;LZIPfCkAAAAJ", "orcid": "0000-0001-7660-1732;0000-0003-4204-731X;0000-0001-9474-952X;0000-0002-4075-5980;0000-0002-3009-519X", "linkedin": ";;;;meng-jiang-94b10916/", "or_profile": "~Tong_Zhao3;~Gang_Liu6;~Daheng_Wang1;~Wenhao_Yu2;~Meng_Jiang3", "aff": "University of Notre Dame;University of Notre Dame;Amazon;University of Notre Dame;University of Notre Dame", "aff_domain": "nd.edu;nd.edu;amazon.com;nd.edu;nd.edu", "position": "PhD student;PhD student;Researcher;PhD student;Assistant Professor", "bibtex": "@misc{\nzhao2022counterfactual,\ntitle={Counterfactual Graph Learning for Link Prediction},\nauthor={Tong Zhao and Gang Liu and Daheng Wang and Wenhao Yu and Meng Jiang},\nyear={2022},\nurl={https://openreview.net/forum?id=YxQiIOLKgEf}\n}", "github": "", "project": "", "reviewers": "LKEp;WwN8;y1dx;Q6cy", "site": "https://openreview.net/forum?id=YxQiIOLKgEf", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;5;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;3;0", "wc_summary_paper": "65;90;131;113", "wc_summary_review": "122;44;40;38", "wc_main_review": "260;249;183;295", "wc_review": "447;383;354;446", "wc_reply_reviewers": "0;103;0;60", "wc_reply_authors": "1142;734;579;524", "reply_reviewers": "0;1;0;1", "reply_authors": "4;3;2;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 99.75, 24.772716847370617 ], "wc_summary_review_avg": [ 61.0, 35.2845575287547 ], "wc_main_review_avg": [ 246.75, 40.5362492098122 ], "wc_review_avg": [ 407.5, 40.326790102858425 ], "wc_reply_reviewers_avg": [ 40.75, 43.49353400219393 ], "wc_reply_authors_avg": [ 744.75, 241.93322942497997 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9658162943248256248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Notre Dame;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.nd.edu;https://www.amazon.com", "aff_unique_abbr": "Notre Dame;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "YxWU4YZ4Cr", "title": "Generalization to Out-of-Distribution transformations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Humans 
understand a set of canonical geometric transformations (such as translation, rotation and scaling) that support generalization by being untethered to any specific object. We explored inductive biases that allowed artificial neural networks to learn these transformations in pixel space in a way that could generalize out-of-distribution (OOD). Unsurprisingly, we found that convolution and high training diversity were important contributing factors to OOD generalization of translation to untrained shapes, sizes, time-points and locations; however, these weren\u2019t sufficient for rotation and scaling. To remedy this, we show that two more principal components are needed: 1) iterative training, where outputs are fed back as inputs, and 2) applying convolutions after conversion to log-polar space. We propose POLARAE, which exploits all four components and outperforms standard convolutional autoencoders and variational autoencoders trained iteratively with high diversity with respect to OOD generalization to larger shapes in larger grids and new locations.", "keywords": "Out of distribution;generalization;convolution;polar transformation", "primary_area": "", "supplementary_material": "", "author": "Shanka Subhra Mondal;Zack Dulberg;Jonathan Cohen", "authorids": "~Shanka_Subhra_Mondal1;~Zack_Dulberg1;~Jonathan_Cohen1", "gender": "M;;M", "homepage": "https://sites.google.com/view/shankasubhramondal/;;https://jdc.princeton.edu", "dblp": "241/7065;;31/5509-3", "google_scholar": "5V-xQYUAAAAJ;4KjRP1MAAAAJ;https://scholar.google.com.tw/citations?user=NCkkQAMAAAAJ", "orcid": ";;0000-0003-2316-0763", "linkedin": "shanka-subhra-mondal-057622147;;", "or_profile": "~Shanka_Subhra_Mondal1;~Zack_Dulberg1;~Jonathan_Cohen1", "aff": "Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nmondal2022generalization,\ntitle={Generalization to Out-of-Distribution transformations},\nauthor={Shanka Subhra Mondal and Zack Dulberg and Jonathan Cohen},\nyear={2022},\nurl={https://openreview.net/forum?id=YxWU4YZ4Cr}\n}", "github": "", "project": "", "reviewers": "3Ri2;sNFP;JcBj;31wD", "site": "https://openreview.net/forum?id=YxWU4YZ4Cr", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "3;3;4;3", "correctness": "1;3;2;4", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "24;34;217;62", "wc_summary_review": "43;17;72;41", "wc_main_review": "245;282;894;171", "wc_review": "312;333;1183;274", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.25, 77.89857187394388 ], "wc_summary_review_avg": [ 43.25, 19.49839737004044 ], "wc_main_review_avg": [ 398.0, 289.14096907909817 ], "wc_review_avg": [ 525.5, 380.1963308607804 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.7745966692414834, "gs_citation": 1, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=7288242414176080335&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Z0XiFAb_WDr", "title": "Communicating Natural Programs to Humans and Machines", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Abstraction and Reasoning Corpus (ARC) is a set of procedural tasks that tests an agent's ability to flexibly solve novel problems. While most ARC tasks are easy for humans, they are challenging for state-of-the-art AI. What makes building intelligent systems that can generalize to novel situations such as ARC difficult?\nWe posit that the answer might be found by studying the difference of \\emph{language}: While humans readily generate and interpret instructions in a general language, computer systems are shackled to a narrow domain-specific language that they can precisely execute.\nWe present LARC, the \\textit{Language-complete ARC}: a collection of natural language descriptions by a group of human participants who instruct each other on how to solve ARC tasks using language alone, which contains successful instructions for 88\\% of the ARC tasks.\nWe analyze the collected instructions as `natural programs', finding that while they resemble computer programs, they are distinct in two ways: First, they contain a wide range of primitives; Second, they frequently leverage communicative strategies beyond directly executable codes. We demonstrate that these two distinctions prevent current program synthesis techniques from leveraging LARC to its full potential, and give concrete suggestions on how to build the next-generation program synthesizers.", "keywords": "program synthesis;communication;cognition", "primary_area": "", "supplementary_material": "/attachment/23d393c42e14dca740b973dca8b2311d64b5ad98.zip", "author": "Sam Acquaviva;Yewen Pu;Marta Kryven;Catherine Wong;Theodoros Sechopoulos;Gabrielle Ecanow;Maxwell Nye;Michael Henry Tessler;Joshua B. 
Tenenbaum", "authorids": "~Sam_Acquaviva1;~Yewen_Pu1;~Marta_Kryven1;~Catherine_Wong3;~Theodoros_Sechopoulos1;~Gabrielle_Ecanow1;~Maxwell_Nye1;~Michael_Henry_Tessler1;~Joshua_B._Tenenbaum1", "gender": "M;M;;M;;M;M;;Non-Binary", "homepage": "https://samacquaviva.com;http://www.mit.edu/~yewenpu;https://marta-kryven.github.io;;;https://maxwellnye.github.io/;https://www.mit.edu/~tessler/;;https://web.mit.edu/zyzzyva/www/academic.html", "dblp": ";53/10322;134/5551;;;224/0047;;t/JoshuaBTenenbaum;339/3594", "google_scholar": ";LJnNKXMAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;NsuX8R8AAAAJ;DQjm2rAAAAAJ;;KssJcIAAAAAJ", "orcid": ";;0000-0002-2764-8611;;;;;;", "linkedin": "sam-acquaviva/;;marta-kryven/;;gabrielle-ecanow/;;;;", "or_profile": "~Sam_Acquaviva1;~Yewen_Pu1;~Marta_Kryven1;~Theodoros_Sechopoulos1;~Gabrielle_Ecanow1;~Maxwell_Nye1;~Michael_Henry_Tessler1;~Joshua_B._Tenenbaum1;~Catherine_Wong1", "aff": "Massachusetts Institute of Technology;Autodesk;Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Google DeepMind;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;autodesk.com;mit.edu;;mit.edu;mit.edu;deepmind.com;mit.edu;mit.edu", "position": "Undergrad student;Principal Researcher;Postdoc;;Undergrad student;PhD student;Researcher;Professor;PhD student", "bibtex": "@misc{\nacquaviva2022communicating,\ntitle={Communicating Natural Programs to Humans and Machines},\nauthor={Sam Acquaviva and Yewen Pu and Marta Kryven and Catherine Wong and Theodoros Sechopoulos and Gabrielle Ecanow and Maxwell Nye and Michael Henry Tessler and Joshua B. Tenenbaum},\nyear={2022},\nurl={https://openreview.net/forum?id=Z0XiFAb_WDr}\n}", "github": "", "project": "", "reviewers": "iQUC;4uxi;doKU", "site": "https://openreview.net/forum?id=Z0XiFAb_WDr", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;3", "correctness": "4;3;3", "technical_novelty": "1;3;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "93;58;62", "wc_summary_review": "60;45;60", "wc_main_review": "293;458;213", "wc_review": "446;561;335", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "338;660;480", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 71.0, 15.641824275533422 ], "wc_summary_review_avg": [ 55.0, 7.0710678118654755 ], "wc_main_review_avg": [ 321.3333333333333, 102.00762498732904 ], "wc_review_avg": [ 447.3333333333333, 92.26893060806306 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 492.6666666666667, 131.76072589694127 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13381039702346039142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;0;0;0;2;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Autodesk;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://web.mit.edu;https://www.autodesk.com;https://deepmind.com", "aff_unique_abbr": 
"MIT;Autodesk;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6707", "id": "Z1Qlm11uOM", "poster": "", "openreview": "https://openreview.net/forum?id=Z1Qlm11uOM", "slides": "https://iclr.cc/virtual/2022/poster/6707", "video": "https://iclr.cc/virtual/2022/poster/6707", "author_site": "Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, Abdelrahman Mohamed", "tldr": "", "abstract": "Video recordings of speech contain correlated audio and visual information, providing a strong signal for speech representation learning from the speaker\u2019s lip movements and the produced sound. We introduce Audio-Visual Hidden Unit BERT (AV-HuBERT), a self-supervised representation learning framework for audio-visual speech, which masks multi-stream video input and predicts automatically discovered and iteratively refined multimodal hidden units. AV-HuBERT learns powerful audio-visual speech representation benefiting both lip-reading and automatic speech recognition. On the largest public lip-reading benchmark LRS3 (433 hours), AV-HuBERT achieves 32.5% WER with only 30 hours of labeled data, outperforming the former state-of-the-art approach (33.6%) trained with a thousand times more transcribed video data (31K hours) (Makino et al., 2019). The lip-reading WER is further reduced to 26.9% when using all 433 hours of labeled data from LRS3 and combined with self-training. Using our audio-visual representation on the same benchmark for audio-only speech recognition leads to a 40% relative WER reduction over the state-of-the-art performance (1.3% vs 2.3%). 
Our code and models are available at https://github.com/facebookresearch/av_hubert.", "keywords": "audio-visual speech recognition;lip reading;speech recognition;self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Bowen Shi;Wei-Ning Hsu;Kushal Lakhotia;Abdelrahman Mohamed", "authorids": "~Bowen_Shi1;~Wei-Ning_Hsu2;~Kushal_Lakhotia1;~Abdelrahman_Mohamed2", "gender": "M;;M;M", "homepage": ";;http://www.cs.toronto.edu/~asamir;https://wnhsu.github.io/", "dblp": ";232/3341;28/8759;160/9923", "google_scholar": "xqyoorYAAAAJ;w9W6zXUAAAAJ;https://scholar.google.ca/citations?user=tJ_PrzgAAAAJ;https://scholar.google.com/citations?authorid=N5HDmqoAAAAJ", "orcid": ";;;", "linkedin": ";;abdel-rahman-mohamed-a5808210;", "or_profile": "~Bowen_Shi1;~Kushal_Lakhotia1;~Abdelrahman_Mohamed2;~Wei-Ning_Hsu1", "aff": "Toyota Technological Institute at Chicago;Outreach;Meta Facebook;Meta Facebook", "aff_domain": "ttic.edu;outreach.io;fb.com;fb.com", "position": "PhD student;Researcher;research scientist;Researcher", "bibtex": "@inproceedings{\nshi2022learning,\ntitle={Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction},\nauthor={Bowen Shi and Wei-Ning Hsu and Kushal Lakhotia and Abdelrahman Mohamed},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Z1Qlm11uOM}\n}", "github": "", "project": "", "reviewers": "vjDT;BjAV;9sxE;3PrM", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;3;5;4", "correctness": "4;3;3;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "81;61;93;134", "wc_summary_review": "25;49;28;14", "wc_main_review": "174;249;533;129", "wc_review": "280;359;654;277", "wc_reply_reviewers": "20;51;51;20", "wc_reply_authors": "687;915;1169;186", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 92.25, 26.677471769266294 ], "wc_summary_review_avg": [ 29.0, 12.668859459319927 ], "wc_main_review_avg": [ 271.25, 157.083377541992 ], "wc_review_avg": [ 392.5, 154.51618038250882 ], "wc_reply_reviewers_avg": [ 35.5, 15.5 ], "wc_reply_authors_avg": [ 739.25, 362.07345594506097 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": 0.0, "gs_citation": 368, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10092601406427600448&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Z1Qlm11uOM", "email": "ttic.edu;outreach.io;fb.com;fb.com", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Toyota Technological Institute at Chicago;Outreach;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.tti-chicago.org;;https://meta.com", "aff_unique_abbr": "TTI Chicago;;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "The Hidden Convex Optimization Landscape of Regularized Two-Layer ReLU Networks: an Exact Characterization of Optimal Solutions", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7124", "id": 
"Z7Lk2cQEG8a", "poster": "", "openreview": "https://openreview.net/forum?id=Z7Lk2cQEG8a", "slides": "https://iclr.cc/virtual/2022/poster/7124", "video": "https://iclr.cc/virtual/2022/poster/7124", "author_site": "Yifei Wang, Jonathan Lacotte, Mert Pilanci", "tldr": "", "abstract": "We prove that finding all globally optimal two-layer ReLU neural networks can be performed by solving a convex optimization program with cone constraints. Our analysis is novel, characterizes all optimal solutions, and does not leverage duality-based analysis which was recently used to lift neural network training into convex spaces. Given the set of solutions of our convex optimization program, we show how to construct exactly the entire set of optimal neural networks. We provide a detailed characterization of this optimal set and its invariant transformations. As additional consequences of our convex perspective, (i) we establish that Clarke stationary points found by stochastic gradient descent correspond to the global optimum of a subsampled convex problem (ii) we provide a polynomial-time algorithm for checking if a neural network is a global minimum of the training loss (iii) we provide an explicit construction of a continuous path between any neural network and the global minimum of its sublevel set and (iv) characterize the minimal size of the hidden layer so that the neural network optimization landscape has no spurious valleys.\nOverall, we provide a rich framework for studying the landscape of neural network training loss through convexity.", "keywords": "Neural networks;global optimization;convex optimization;convex analysis", "primary_area": "", "supplementary_material": "/attachment/f17e392702c46d321772e0ae0d51d86683a951a1.zip", "author": "Yifei Wang;Jonathan Lacotte;Mert Pilanci", "authorids": "~Yifei_Wang2;~Jonathan_Lacotte1;~Mert_Pilanci3", "gender": "M;M;M", "homepage": "http://web.stanford.edu/~wangyf18/;http://web.stanford.edu/~lacotte/;https://stanford.edu/~pilanci/", "dblp": ";https://dblp.uni-trier.de/pers/l/Lacotte:Jonathan.html;45/8056", "google_scholar": ";lDjk14QAAAAJ;aSAS-aAAAAAJ", "orcid": ";;", "linkedin": ";;mert-pilanci-ba615743/", "or_profile": "~Yifei_Wang2;~Jonathan_Lacotte1;~Mert_Pilanci3", "aff": "Stanford University;;Stanford University", "aff_domain": "stanford.edu;;stanford.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nwang2022the,\ntitle={The Hidden Convex Optimization Landscape of Regularized Two-Layer Re{LU} Networks: an Exact Characterization of Optimal Solutions},\nauthor={Yifei Wang and Jonathan Lacotte and Mert Pilanci},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Z7Lk2cQEG8a}\n}", "github": "", "project": "", "reviewers": "DhNM;P5NY;FbD2;VYzk", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "4;3;4;3", "technical_novelty": "4;3;3;3", "empirical_novelty": "0;3;3;0", "wc_summary_paper": "133;72;119;153", "wc_summary_review": "64;86;45;2", "wc_main_review": "228;217;251;197", "wc_review": "425;375;415;352", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "81;200;159;108", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.5 ], "wc_summary_paper_avg": [ 119.25, 29.83601012199855 ], "wc_summary_review_avg": [ 49.25, 
30.898017735770686 ], "wc_main_review_avg": [ 223.25, 19.49839737004044 ], "wc_review_avg": [ 391.75, 29.608909132218972 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 137.0, 45.90751572455212 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17815823735661649294&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=Z7Lk2cQEG8a", "email": "stanford.edu;;stanford.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Z7VhFVRVqeU", "title": "Neural Bootstrapping Attention for Neural Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural Processes (NP) learn to fit a broad class of stochastic processes with neural networks. Modeling functional uncertainty is an important aspect of learning stochastic processes. Recently, Bootstrapping (Attentive) Neural Processes (B(A)NP) propose a bootstrap method to capture the functional uncertainty which can replace the latent variable in (Attentive) Neural Processes ((A)NP), thus overcoming the limitations of Gaussian assumption on the latent variable. However, B(A)NP conduct bootstrapping in a non-parallelizable and memory-inefficient way and fail to capture diverse patterns in the stochastic processes. Furthermore, we found that ANP and BANP both tend to overfit in some cases. To resolve these problems, we propose an efficient and easy-to-implement approach, Neural Bootstrapping Attentive Neural Processes (NeuBANP). NeuBANP learns to generate the bootstrap distribution of random functions by injecting multiple random weights into the encoder and the loss function. We evaluate our models in benchmark experiments including Bayesian optimization and contextual multi-armed bandit. 
NeuBANP achieves state-of-the-art performance in both of the sequential decision-making tasks, and this empirically shows that our method greatly improves the quality of functional uncertainty modeling.", "keywords": "Neural Process;Bootstrapping", "primary_area": "", "supplementary_material": "", "author": "Minsub Lee;Junhyun Park;Sojin Jang;Chanhui Lee;Hyungjoo Cho;Minsuk Shin;Sungbin Lim", "authorids": "~Minsub_Lee1;~Junhyun_Park1;~Sojin_Jang2;~Chanhui_Lee2;~Hyungjoo_Cho1;~Minsuk_Shin1;~Sungbin_Lim1", "gender": "M;M;F;M;M;M;M", "homepage": "https://msublee.github.io;https://jundev1l2l.github.io;http://soj2n.github.io;https://chless.github.io/;;https://sites.google.com/site/minsukshin0/;https://www.sungbin-lim.net", "dblp": ";;;361/2606;;;206/6907", "google_scholar": ";;;;Pl95pGIAAAAJ;;https://scholar.google.com/citations?hl=ko", "orcid": ";;;0009-0003-2092-3670;;;0000-0003-2684-2022", "linkedin": ";;;;;;sungbin-lim-43b739b5/", "or_profile": "~Minsub_Lee1;~Junhyun_Park1;~Sojin_Jang2;~Chanhui_Lee2;~Hyungjoo_Cho1;~Minsuk_Shin1;~Sungbin_Lim1", "aff": "Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Seoul National University;University of South Carolina;Ulsan National Institute of Science and Technology", "aff_domain": "unist.ac.kr;unist.ac.kr;unist.ac.kr;unist.ac.kr;snu.ac.kr;sc.edu;unist.ac.kr", "position": "MS student;MS student;MS student;MS student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nlee2022neural,\ntitle={Neural Bootstrapping Attention for Neural Processes},\nauthor={Minsub Lee and Junhyun Park and Sojin Jang and Chanhui Lee and Hyungjoo Cho and Minsuk Shin and Sungbin Lim},\nyear={2022},\nurl={https://openreview.net/forum?id=Z7VhFVRVqeU}\n}", "github": "", "project": "", "reviewers": "B7VB;Me1R;1P97;fWXh", "site": "https://openreview.net/forum?id=Z7VhFVRVqeU", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;3;4", "correctness": "4;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "115;70;53;71", "wc_summary_review": "74;61;86;68", "wc_main_review": "314;109;590;466", "wc_review": "503;240;729;605", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "702;665;1187;890", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.25, 22.93877721239735 ], "wc_summary_review_avg": [ 72.25, 9.175374651751284 ], "wc_main_review_avg": [ 369.75, 179.49425478270885 ], "wc_review_avg": [ 519.25, 179.99496520736352 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 861.0, 206.64825186775715 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=484933070290675946&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1;2;0", "aff_unique_norm": "Ulsan National Institute of Science and Technology;Seoul National University;University of South Carolina", "aff_unique_dep": ";;", 
"aff_unique_url": "https://www.unist.ac.kr;https://www.snu.ac.kr;https://www.sc.edu", "aff_unique_abbr": "UNIST;SNU;USC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "SUMNAS: Supernet with Unbiased Meta-Features for Neural Architecture Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6140", "id": "Z8FzvVU6_Kj", "poster": "", "openreview": "https://openreview.net/forum?id=Z8FzvVU6_Kj", "slides": "https://iclr.cc/virtual/2022/poster/6140", "video": "https://iclr.cc/virtual/2022/poster/6140", "author_site": "Hyeonmin Ha, Ji Hoon Kim, Semin Park, Byung-Gon Chun", "tldr": "", "abstract": "One-shot Neural Architecture Search (NAS) usually constructs an over-parameterized network, which we call a supernet, and typically adopts sharing parameters among the sub-models to improve computational efficiency. One-shot NAS often repeatedly samples sub-models from the supernet and trains them to optimize the shared parameters. However, this training strategy suffers from multi-model forgetting. Training a sampled sub-model overrides the previous knowledge learned by the other sub-models, resulting in an unfair performance evaluation between the sub-models. We propose Supernet with Unbiased Meta-Features for Neural Architecture Search (SUMNAS), a supernet learning strategy based on meta-learning to tackle the knowledge forgetting issue. During the training phase, we explicitly address the multi-model forgetting problem and help the supernet learn unbiased meta-features, independent from the sampled sub-models. Once training is over, sub-models can be instantly compared to get the overall ranking or the best sub-model. 
Our evaluation on the NAS-Bench-201 and MobileNet-based search spaces demonstrates that SUMNAS shows improved ranking ability and finds architectures whose performance is on par with existing state-of-the-art NAS algorithms.", "keywords": "Neural architecture search", "primary_area": "", "supplementary_material": "", "author": "Hyeonmin Ha;Ji-Hoon Kim;Semin Park;Byung-Gon Chun", "authorids": "~Hyeonmin_Ha1;~Ji-Hoon_Kim2;~Semin_Park1;~Byung-Gon_Chun1", "gender": "M;;M;", "homepage": ";;;", "dblp": "198/6798.html;51/4060;;34/3515", "google_scholar": "IG6eO5cAAAAJ;1KdhN5QAAAAJ;;", "orcid": ";0000-0002-5212-1686;;", "linkedin": ";;semin-park-002b49a6;", "or_profile": "~Hyeonmin_Ha1;~Ji-Hoon_Kim2;~Semin_Park1;~Byung-Gon_Chun1", "aff": "Seoul National University;NAVER;;FriendliAI", "aff_domain": "snu.ac.kr;navercorp.com;;friendli.ai", "position": "PhD student;Research Scientist;;Chief Executive Officer", "bibtex": "@inproceedings{\nha2022sumnas,\ntitle={{SUMNAS}: Supernet with Unbiased Meta-Features for Neural Architecture Search},\nauthor={Hyeonmin Ha and Ji-Hoon Kim and Semin Park and Byung-Gon Chun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Z8FzvVU6_Kj}\n}", "github": "", "project": "", "reviewers": "LNNQ;m3L7;z8FZ;Yfxi", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;5", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "47;35;55;34", "wc_summary_review": "19;24;38;32", "wc_main_review": "175;144;224;159", "wc_review": "241;203;317;225", "wc_reply_reviewers": "141;0;48;24", "wc_reply_authors": "571;381;341;607", "reply_reviewers": "1;0;1;1", "reply_authors": "3;1;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 42.75, 8.728545125048045 ], "wc_summary_review_avg": [ 28.25, 7.292976072907411 ], "wc_main_review_avg": [ 175.5, 30.07074990750979 ], "wc_review_avg": [ 246.5, 42.88064831599448 ], "wc_reply_reviewers_avg": [ 53.25, 53.42927568290628 ], "wc_reply_authors_avg": [ 475.0, 115.57681428383462 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6959271681113228692&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=Z8FzvVU6_Kj", "email": "snu.ac.kr;navercorp.com;;friendli.ai", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Seoul National University;NAVER Corporation;FriendliAI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.snu.ac.kr;https://www.naver.com;https://www.friendliai.com", "aff_unique_abbr": "SNU;NAVER;FriendliAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "South Korea;United States" }, { "id": "ZAA0Ol4z2i4", "title": "Explaining Off-Policy Actor-Critic From A Bias-Variance Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "Off-policy Actor-Critic algorithms have demonstrated phenomenal experimental performance but still require better explanations.
To this end, we show that its policy evaluation error on the distribution of transitions decomposes into a Bellman error, a bias from policy mismatch, and a variance term from sampling. By comparing the magnitude of bias and variance, we explain the success of Emphasizing Recent Experience sampling and 1/age-weighted sampling. Both sampling strategies yield smaller bias and variance and are hence preferable to uniform sampling.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/60e0215368e232156b817ee30e08592533ba679f.zip", "author": "Ting-Han Fan;Peter Ramadge", "authorids": "~Ting-Han_Fan1;~Peter_Ramadge1", "gender": "M;M", "homepage": ";http://ee.princeton.edu/people/faculty/peter-j-ramadge", "dblp": "213/0948;77/3256", "google_scholar": "1mQ3kTEAAAAJ;BOMboVoAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Ting-Han_Fan1;~Peter_Ramadge1", "aff": "Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nfan2022explaining,\ntitle={Explaining Off-Policy Actor-Critic From A Bias-Variance Perspective},\nauthor={Ting-Han Fan and Peter Ramadge},\nyear={2022},\nurl={https://openreview.net/forum?id=ZAA0Ol4z2i4}\n}", "github": "", "project": "", "reviewers": "b574;e5zQ;VgPP;RYz4", "site": "https://openreview.net/forum?id=ZAA0Ol4z2i4", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "3;3;5;4", "correctness": "2;2;2;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "36;57;102;98", "wc_summary_review": "10;116;162;32", "wc_main_review": "261;787;376;585", "wc_review": "307;960;640;715", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "403;953;239;694", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.25, 27.797257058925798 ], "wc_summary_review_avg": [ 80.0, 61.69278726074872 ], "wc_main_review_avg": [ 502.25, 201.29005812508476 ], "wc_review_avg": [ 655.5, 233.42718350697717 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 572.25, 273.63056755413857 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18000305305865197345&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "ZB8vwY8cg6Y", "title": "Using a Cross-Task Grid of Linear Probes to Interpret CNN Model Predictions On Retinal Images", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We analyze the relationships and shared structure among different prediction tasks on a dataset of retinal images using linear probes: linear regression models trained on some \"target\" task, using embeddings from a deep convolutional (CNN) model trained on some \"source\" task as input.
We use this method across all possible pairings of 101 tasks in the UK Biobank dataset of retinal images, leading to $\\sim$193k different models. We analyze the performance of these linear probes by source and target task and by layer depth. \n\nWe observe that representations from the middle layers of the network are more generalizable. \nWe find that some target tasks are easily predicted irrespective of the source task, and that some other target tasks are more accurately predicted from correlated source tasks than from embeddings trained on the same task.\n\nWe then try to understand the principles that might be at work using synthetic experiments: images generated based on a \"dead leaves\" model.\n", "keywords": "linear probes;interpretability;medical imaging", "primary_area": "", "supplementary_material": "", "author": "Katy Blumer;Subhashini Venugopalan;Michael Brenner;Jon Kleinberg", "authorids": "~Katy_Blumer1;~Subhashini_Venugopalan2;~Michael_Brenner1;~Jon_Kleinberg3", "gender": "F;;;M", "homepage": ";https://vsubhashini.github.io;https://brennergroup.seas.harvard.edu;http://www.cs.cornell.edu/home/kleinber/", "dblp": "120/8723;21/11044;;https://dblp.uni-trier.de/pid/k/JonMKleinberg.html", "google_scholar": "fkpwCJ0AAAAJ;https://scholar.google.com/citations?hl=en;;VX7d5EQAAAAJ", "orcid": "0000-0002-1360-0940;0000-0003-3729-8456;;0000-0002-1929-2512", "linkedin": ";;;", "or_profile": "~Katy_Blumer1;~Subhashini_Venugopalan2;~Michael_Brenner1;~Jon_Kleinberg3", "aff": "Cornell University;Google;Harvard University;Cornell University", "aff_domain": "cornell.edu;google.com;fas.harvard.edu;cornell.edu", "position": "PhD student;Staff Research Scientist ;Professor;Full Professor", "bibtex": "@misc{\nblumer2022using,\ntitle={Using a Cross-Task Grid of Linear Probes to Interpret {CNN} Model Predictions On Retinal Images},\nauthor={Katy Blumer and Subhashini Venugopalan and Michael Brenner and Jon Kleinberg},\nyear={2022},\nurl={https://openreview.net/forum?id=ZB8vwY8cg6Y}\n}", "github": "", "project": "", "reviewers": "c5ZM;hN9e;9Jwo;itAv", "site": "https://openreview.net/forum?id=ZB8vwY8cg6Y", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "1;1;1;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "77;57;75;72", "wc_summary_review": "16;88;32;21", "wc_main_review": "178;244;248;104", "wc_review": "271;389;355;197", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 70.25, 7.854139036202504 ], "wc_summary_review_avg": [ 39.25, 28.734778579275673 ], "wc_main_review_avg": [ 193.5, 58.67495206644825 ], "wc_review_avg": [ 303.0, 74.76630257007497 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5487688166374690814&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Cornell University;Google;Harvard University", "aff_unique_dep": ";Google;", 
"aff_unique_url": "https://www.cornell.edu;https://www.google.com;https://www.harvard.edu", "aff_unique_abbr": "Cornell;Google;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Stochastic Training is Not Necessary for Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6236", "id": "ZBESeIUB5k", "poster": "", "openreview": "https://openreview.net/forum?id=ZBESeIUB5k", "slides": "https://iclr.cc/virtual/2022/poster/6236", "video": "https://iclr.cc/virtual/2022/poster/6236", "author_site": "Jonas Geiping, Micah Goldblum, Phil Pope, Michael Moeller, Tom Goldstein", "tldr": "", "abstract": "It is widely believed that the implicit regularization of SGD is fundamental to the impressive generalization behavior we observe in neural networks. In this work, we demonstrate that non-stochastic full-batch training can achieve comparably strong performance to SGD on CIFAR-10 using modern architectures. To this end, we show that the implicit regularization of SGD can be completely replaced with explicit regularization. Our observations indicate that the perceived difficulty of full-batch training may be the result of its optimization properties and the disproportionate time and effort spent by the ML community tuning optimizers and hyperparameters for small-batch training.", "keywords": "Optimization;Generalization;Stochasticity;SGD;full-batch;implicit regularization;implicit bias", "primary_area": "", "supplementary_material": "/attachment/9c5f0533736ab60237cf287e95c4f57d56c96fe9.zip", "author": "Jonas Geiping;Micah Goldblum;Phil Pope;Michael Moeller;Tom Goldstein", "authorids": "~Jonas_Geiping1;~Micah_Goldblum1;~Phil_Pope1;~Michael_Moeller1;~Tom_Goldstein1", "gender": "M;;;M;M", "homepage": "https://jonasgeiping.github.io/;;https://ppope.github.io/;http://vsa.informatik.uni-siegen.de;https://www.cs.umd.edu/~tomg/", "dblp": "190/7229;241/7231;254/1952;08/5840-1;25/8184", "google_scholar": "https://scholar.google.de/citations?user=206vNCEAAAAJ;pGDKzuUAAAAJ;w_Y1qcwAAAAJ;https://scholar.google.de/citations?user=sxzdAGUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Jonas_Geiping1;~Micah_Goldblum1;~Phil_Pope1;~Michael_Moeller1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;New York University;University of Maryland, College Park;University of Siegen;University of Maryland, College Park", "aff_domain": "umd.edu;nyu.edu;umd.edu;uni-siegen.de;umd.edu", "position": "Postdoc;Postdoc;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\ngeiping2022stochastic,\ntitle={Stochastic Training is Not Necessary for Generalization},\nauthor={Jonas Geiping and Micah Goldblum and Phil Pope and Michael Moeller and Tom Goldstein},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZBESeIUB5k}\n}", "github": "", "project": "", "reviewers": "Tx6S;o29S;g1Z8;UbWk;XRTg", "pdf_size": 0, "recommendation": "5;6;6;8;10", "confidence": "3;3;4;4;5", "correctness": "2;4;3;3;4", "technical_novelty": "2;3;3;3;4", "empirical_novelty": "2;4;2;3;4", "wc_summary_paper": "238;29;52;97;40", "wc_summary_review": "41;187;223;41;28", "wc_main_review": "218;327;359;340;217", "wc_review": "497;543;634;478;285", "wc_reply_reviewers": "36;359;0;184;0", "wc_reply_authors": "288;710;780;502;430", "reply_reviewers": "1;2;0;1;0", "reply_authors": "1;2;1;2;1", 
"recommendation_avg": [ 7.0, 1.7888543819998317 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 3.0, 0.8944271909999159 ], "wc_summary_paper_avg": [ 91.2, 76.95557159816306 ], "wc_summary_review_avg": [ 104.0, 83.38345159562537 ], "wc_main_review_avg": [ 292.2, 61.836558765830425 ], "wc_review_avg": [ 487.4, 114.65530951508526 ], "wc_reply_reviewers_avg": [ 115.8, 139.2729693802785 ], "wc_reply_authors_avg": [ 542.0, 180.846896572764 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8964214570007952, "corr_recommendation_correctness": 0.5976143046671968, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16676804811575846883&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=ZBESeIUB5k", "email": "umd.edu;nyu.edu;umd.edu;uni-siegen.de;umd.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "University of Maryland;New York University;University of Siegen", "aff_unique_dep": ";;", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu;https://www.uni-siegen.de", "aff_unique_abbr": "UMD;NYU;Uni Siegen", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Germany" }, { "id": "ZC1s7bdR9bD", "title": "Path Integrals for the Attribution of Model Uncertainties", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding model uncertainties is of key importance in Bayesian machine learning applications. This often requires to meaningfully attribute a model's predictive uncertainty to its input features, however, popular attribution methods are primarily targeted at model scores for classification and regression tasks. Thus, in order to explain uncertainties, state-of-the-art alternatives commonly procure counterfactual feature vectors associated with low uncertainty and proceed by making direct comparisons. Here, we present a novel algorithm for uncertainty attribution in differentiable models, via path integrals which leverage in-distribution curves connecting feature vectors to counterfactual counterparts. 
We validate our method on benchmark image data sets with varying resolution, and demonstrate that (i) it produces meaningful attributions that significantly simplify interpretability over the existing alternatives and (ii) it retains desirable properties from popular attribution methods.", "keywords": "bayesian neural networks;path integrals;uncertainty attribution", "primary_area": "", "supplementary_material": "", "author": "Iker Perez Lopez;Piotr Skalski;Alec Barns-Graham;Jason Wong;David Sutton", "authorids": "~Iker_Perez_Lopez1;~Piotr_Skalski1;alec.barns-graham@featurespace.co.uk;jason.wong@featurespace.co.uk;david.sutton@featurespace.co.uk", "gender": "Not Specified;M;;;", "homepage": "https://ikerperez.wordpress.com/;;;;", "dblp": ";297/5489;;;", "google_scholar": "https://scholar.google.co.uk/citations?user=B1Z6c1wAAAAJ;;;;", "orcid": "0000-0001-9400-4229;0000-0003-3102-9837;;;", "linkedin": "ikerperez/;https://uk.linkedin.com/in/piotrekskalski;;;", "or_profile": "~Iker_Perez_Lopez1;~Piotr_Skalski1;alec.barns-graham@featurespace.co.uk;jason.wong@featurespace.co.uk;david.sutton@featurespace.co.uk", "aff": "Featurespace;Featurespace;;;", "aff_domain": "featurespace.co.uk;featurespace.co.uk;;;", "position": "Researcher;Researcher;;;", "bibtex": "@misc{\nlopez2022path,\ntitle={Path Integrals for the Attribution of Model Uncertainties},\nauthor={Iker Perez Lopez and Piotr Skalski and Alec Barns-Graham and Jason Wong and David Sutton},\nyear={2022},\nurl={https://openreview.net/forum?id=ZC1s7bdR9bD}\n}", "github": "", "project": "", "reviewers": "2e8D;DBLf;FBxT;n3ei", "site": "https://openreview.net/forum?id=ZC1s7bdR9bD", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;3;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "57;160;183;99", "wc_summary_review": "33;91;63;61", "wc_main_review": "280;404;271;1224", "wc_review": "370;655;517;1384", "wc_reply_reviewers": "0;0;0;410", "wc_reply_authors": "0;0;0;112", "reply_reviewers": "0;0;0;1", "reply_authors": "0;0;0;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 124.75, 49.721097131901665 ], "wc_summary_review_avg": [ 62.0, 20.518284528683193 ], "wc_main_review_avg": [ 544.75, 395.6711860876402 ], "wc_review_avg": [ 731.5, 389.9682679398415 ], "wc_reply_reviewers_avg": [ 102.5, 177.5352077758099 ], "wc_reply_authors_avg": [ 28.0, 48.49742261192856 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KBg2e9WvXWIJ:scholar.google.com/&scioq=Path+Integrals+for+the+Attribution+of+Model+Uncertainties&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Featurespace Ltd.", "aff_unique_dep": "", "aff_unique_url": "https://www.featurespace.co.uk", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "ZCB_kzXYhvB", "title": "An Improved Composite Functional Gradient Learning by Wasserstein Regularization for Generative adversarial networks", "track": "main",
"status": "Withdraw", "tldr": "", "abstract": "Generative adversarial networks (GANs) are usually trained by a minimax game which is notoriously and empirically known to be unstable. Recently, a totally new methodology called \nComposite Functional Gradient Learning (CFG) provides an alternative\ntheoretical foundation for training GANs more stablely by employing\na strong discriminator with logistic regression and functional gradient learning for the generator. \nHowever, the discriminator using logistic regression from the CFG framework is gradually \nhard to discriminate between real and fake images while the training steps go on.\nTo address this problem, our key idea and contribution are to introduce\nthe Wasserstein distance regularization into the CFG framework for the discriminator. This gives us a novel\nimproved CFG formulation with more competitive generate image quality. In particular, we provide an intuitive explanation using logistic regression with Wasserstein regularization. The method helps to enhance the model gradients \nin training GANs to archives better image quality. Empirically, we compare our improved CFG with the original\nversion. We show that the standard CFG is easy to stick into mode collapse problem, while our improved CFG works much better\nthanks to the newly added Wasserstein distance regularization. We conduct extensive\nexperiments for image generation on different benchmarks, and it shows\nthe efficacy of our improved CFG method.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chang Wan;Yanwei Fu;Ke Fan;Jinshan Zeng;Ming Zhong;Riheng Jia;MingLu Li;ZhongLong Zheng", "authorids": "~Chang_Wan2;~Yanwei_Fu2;21110980004@m.fudan.edu.cn;jinshanzeng@jxnu.edu.cn;tracym@zjnu.edu.cn;~Riheng_Jia1;mlli@zjnu.edu.cn;zhonglong@zjnu.edu.cn", "gender": "M;M;;;;M;;", "homepage": ";http://yanweifu.github.io;;;;http://mypage.zjnu.edu.cn/JRH1/zh_CN/index.htm;;", "dblp": ";63/9065;;;;;;", "google_scholar": "tinKicEAAAAJ;https://scholar.google.co.uk/citations?user=Vg54TcsAAAAJ;;;;;;", "orcid": "0009-0006-2899-3560;0000-0002-6595-6893;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Chang_Wan2;~Yanwei_Fu2;21110980004@m.fudan.edu.cn;jinshanzeng@jxnu.edu.cn;tracym@zjnu.edu.cn;~Riheng_Jia1;mlli@zjnu.edu.cn;zhonglong@zjnu.edu.cn", "aff": "Zhejiang Normal University;Fudan University,;;;;Zhejiang Normal University;;", "aff_domain": "zjnu.edu.cn;fudan.edu.cn;;;;zjnu.edu.cn;;", "position": "PhD student;Professor;;;;Associate Professor;;", "bibtex": "@misc{\nwan2022an,\ntitle={An Improved Composite Functional Gradient Learning by Wasserstein Regularization for Generative adversarial networks},\nauthor={Chang Wan and Yanwei Fu and Ke Fan and Jinshan Zeng and Ming Zhong and Riheng Jia and MingLu Li and ZhongLong Zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=ZCB_kzXYhvB}\n}", "github": "", "project": "", "reviewers": "7Pfn;XM5Z;PhTu", "site": "https://openreview.net/forum?id=ZCB_kzXYhvB", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "5;4;4", "correctness": "1;3;2", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "53;59;76", "wc_summary_review": "13;60;17", "wc_main_review": "136;444;365", "wc_review": "202;563;458", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.816496580927726 ], 
"technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 62.666666666666664, 9.741092797468305 ], "wc_summary_review_avg": [ 30.0, 21.275964529643932 ], "wc_main_review_avg": [ 315.0, 130.61648696342536 ], "wc_review_avg": [ 407.6666666666667, 151.61427666578396 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UKqDw4zd8MMJ:scholar.google.com/&scioq=An+Improved+Composite+Functional+Gradient+Learning+by+Wasserstein+Regularization+for+Generative+adversarial+networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Zhejiang Normal University;Fudan University", "aff_unique_dep": ";", "aff_unique_url": "http://www.zjnu.edu.cn;https://www.fudan.edu.cn", "aff_unique_abbr": "ZJNU;Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "ZDYhm_o8MX", "title": "Neural Manifold Clustering and Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "Given a union of non-linear manifolds, non-linear subspace clustering or manifold clustering aims to cluster data points based on manifold structures and also learn to parameterize each manifold as a linear subspace in a feature space. Deep neural networks have the potential to achieve this goal under highly non-linear settings given their large capacity and flexibility. We argue that achieving manifold clustering with neural networks requires two essential ingredients: a domain-specific constraint that ensures the identification of the manifolds, and a learning algorithm for embedding each manifold to a linear subspace in the feature space. This work shows that many constraints can be implemented by data augmentation. For subspace feature learning, Maximum Coding Rate Reduction (MCR$^2$) objective can be used. Putting them together yields Neural Manifold Clustering and Embedding (NMCE), a novel method for general purpose manifold clustering, which significantly outperforms autoencoder-based deep subspace clustering and achieve state-of-the-art performance on several important benchmarks. Further, on more challenging natural image datasets, NMCE can also outperform other algorithms specifically designed for clustering. Qualitatively, we demonstrate that NMCE learns a meaningful and interpretable feature space. 
As the formulation of NMCE is closely related to several important self-supervised learning (SSL) methods, we believe this work can help build a deeper understanding of SSL representation learning.", "keywords": "Self-supervised Learning;Clustering;Subspace Clustering;Manifold Learning;Deep Subspace Clustering", "primary_area": "", "supplementary_material": "/attachment/75cf85b111777cccdec1603a018317e6b48b1a3f.zip", "author": "ZENGYI LI;Yubei Chen;Yann LeCun;Friedrich Sommer", "authorids": "~ZENGYI_LI1;~Yubei_Chen1;~Yann_LeCun1;~Friedrich_Sommer1", "gender": "M;M;M;M", "homepage": ";https://redwood.berkeley.edu/people/yubei-chen/;http://yann.lecun.com;https://www.rctn.org/wiki/Fritz_Sommer", "dblp": ";30/10064;l/YannLeCun;", "google_scholar": "rstPxpcAAAAJ;WeyLqFUAAAAJ;WLN3QrAAAAAJ;lA-oLkgAAAAJ", "orcid": ";;;", "linkedin": ";yubei-chen-05998a39/;;", "or_profile": "~ZENGYI_LI1;~Yubei_Chen1;~Yann_LeCun1;~Friedrich_Sommer1", "aff": "University of California, Berkeley;Facebook AI Research;New York University;University of California, Berkeley", "aff_domain": "berkeley.edu;facebook.com;nyu.edu;berkeley.edu", "position": "PhD student;Postdoc Researcher;Full Professor;Full Professor", "bibtex": "@misc{\nli2022neural,\ntitle={Neural Manifold Clustering and Embedding},\nauthor={ZENGYI LI and Yubei Chen and Yann LeCun and Friedrich Sommer},\nyear={2022},\nurl={https://openreview.net/forum?id=ZDYhm_o8MX}\n}", "github": "", "project": "", "reviewers": "HTFw;aRVQ;SD19;dgFk;zSx8", "site": "https://openreview.net/forum?id=ZDYhm_o8MX", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;3;4;4;3", "correctness": "3;3;2;3;3", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;3;2;3;2", "wc_summary_paper": "31;54;115;57;134", "wc_summary_review": "34;39;26;37;43", "wc_main_review": "514;340;364;306;923", "wc_review": "579;433;505;400;1100", "wc_reply_reviewers": "0;106;141;0;210", "wc_reply_authors": "692;498;568;177;777", "reply_reviewers": "0;1;1;0;1", "reply_authors": "2;2;2;1;2", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 78.2, 39.32124107909109 ], "wc_summary_review_avg": [ 35.8, 5.706137047074843 ], "wc_main_review_avg": [ 489.4, 228.16450205936943 ], "wc_review_avg": [ 603.4, 255.85042505338936 ], "wc_reply_reviewers_avg": [ 91.4, 81.78899681497505 ], "wc_reply_authors_avg": [ 542.4, 206.66939783141575 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 0.4 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.372677996249965, "corr_recommendation_correctness": 0.0, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14865743586679367971&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, Berkeley;Meta;New York University", "aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.berkeley.edu;https://research.facebook.com;https://www.nyu.edu", "aff_unique_abbr": "UC Berkeley;FAIR;NYU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Case-based reasoning for better generalization in textual reinforcement learning", "status": "Poster", "track":
"main", "site": "https://iclr.cc/virtual/2022/poster/6875", "id": "ZDaSIkWT-AP", "poster": "", "openreview": "https://openreview.net/forum?id=ZDaSIkWT-AP", "slides": "https://iclr.cc/virtual/2022/poster/6875", "video": "https://iclr.cc/virtual/2022/poster/6875", "author_site": "Mattia Atzeni, Shehzaad Dhuliawala, Keerthiram Murugesan, Mrinmaya Sachan", "tldr": "", "abstract": "Text-based games (TBG) have emerged as promising environments for driving research in grounded language understanding and studying problems like generalization and sample efficiency. Several deep reinforcement learning (RL) methods with varying architectures and learning schemes have been proposed for TBGs. However, these methods fail to generalize efficiently, especially under distributional shifts. In a departure from deep RL approaches, in this paper, we propose a general method inspired by case-based reasoning to train agents and generalize out of the training distribution. The case-based reasoner collects instances of positive experiences from the agent's interaction with the world and later reuses the collected experiences to act efficiently. The method can be used in conjunction with any existing on-policy neural agent introduced in the literature for TBGs. Our experiments show that the proposed approach consistently improves existing methods, obtains good out-of-distribution generalization and achieves new state-of-the-art results on widely used environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mattia Atzeni;Shehzaad Zuzar Dhuliawala;Keerthiram Murugesan;Mrinmaya Sachan", "authorids": "~Mattia_Atzeni1;~Shehzaad_Zuzar_Dhuliawala3;~Keerthiram_Murugesan1;~Mrinmaya_Sachan3", "gender": ";M;M;M", "homepage": ";https://keerthi166.github.io;https://sites.google.com/site/mrinsachan/;https://shehzaadzd.github.io", "dblp": "204/8455.html;178/2877;86/10440.html;184/8733", "google_scholar": "GxcjDq0AAAAJ;-698GEMAAAAJ;Tpp9ZjoAAAAJ;7O33ij4AAAAJ", "orcid": ";0000-0001-6847-522X;;", "linkedin": ";https://linkedin.com/in/keerthiram;;", "or_profile": "~Mattia_Atzeni1;~Keerthiram_Murugesan1;~MRINMAYA_SACHAN2;~Shehzaad_Zuzar_Dhuliawala1", "aff": "International Business Machines;International Business Machines;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ibm.com;ibm.com;ethz.ch;ethz.ch", "position": "Researcher;Researcher;Assistant Professor;PhD student", "bibtex": "@inproceedings{\natzeni2022casebased,\ntitle={Case-based reasoning for better generalization in textual reinforcement learning},\nauthor={Mattia Atzeni and Shehzaad Zuzar Dhuliawala and Keerthiram Murugesan and MRINMAYA SACHAN},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZDaSIkWT-AP}\n}", "github": "", "project": "", "reviewers": "HscQ;eX6x;aw9P;Da5u", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;5;4;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "107;86;83;101", "wc_summary_review": "58;67;50;46", "wc_main_review": "227;310;472;420", "wc_review": "392;463;605;567", "wc_reply_reviewers": "0;0;125;117", "wc_reply_authors": "400;413;885;821", "reply_reviewers": "0;0;1;1", "reply_authors": "1;2;3;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 
0.4330127018922193 ], "wc_summary_paper_avg": [ 94.25, 10.034316120194738 ], "wc_summary_review_avg": [ 55.25, 8.042853971072706 ], "wc_main_review_avg": [ 357.25, 95.26640278713163 ], "wc_review_avg": [ 506.75, 84.2091889285249 ], "wc_reply_reviewers_avg": [ 60.5, 60.5660796155736 ], "wc_reply_authors_avg": [ 629.75, 224.44083296049317 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1304573593498282003&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=ZDaSIkWT-AP", "email": "ibm.com;ibm.com;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "International Business Machines Corporation;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.ethz.ch", "aff_unique_abbr": "IBM;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;Switzerland" }, { "id": "ZFIT_sGjPJ", "title": "Data-Dependent Randomized Smoothing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Randomized smoothing is a recent technique that achieves state-of-the-art performance in training certifiably robust deep neural networks. While the smoothing family of distributions is often connected to the choice of the norm used for certification, the parameters of these distributions are always set as global hyperparameters independent of the input data on which a network is certified. In this work, we revisit Gaussian randomized smoothing and show that the variance of the Gaussian distribution can be optimized at each input so as to maximize the certification radius for the construction of the smooth classifier. We also propose a simple memory-based approach to certifying the resultant smooth classifier. This new approach is generic, parameter-free, and easy to implement. In fact, we show that our data-dependent framework can be seamlessly incorporated into 3 randomized smoothing approaches, leading to consistently improved certified accuracy. When this framework is used in the training routine of these approaches followed by a data-dependent certification, we achieve 9% and 6% improvement over the certified accuracy of the strongest baseline for a radius of 0.5 on CIFAR10 and ImageNet.
", "keywords": "Network Certification;Randomized Smoothing", "primary_area": "", "supplementary_material": "/attachment/6a9930925ec1a6517d6492ccb448085943258c81.zip", "author": "Motasem Alfarra;Adel Bibi;Philip Torr;Bernard Ghanem", "authorids": "~Motasem_Alfarra1;~Adel_Bibi1;~Philip_Torr1;~Bernard_Ghanem1", "gender": "M;M;;M", "homepage": "https://motasemalfarra.netlify.app/;http://adelbibi.com;http://www.robots.ox.ac.uk/~tvg/;https://ivul.kaust.edu.sa", "dblp": "255/5192;176/0964;;37/2516", "google_scholar": "https://scholar.google.com/citations?hl=en;Q4j2laYAAAAJ;;rVsGTeEAAAAJ", "orcid": ";0000-0002-6169-3918;;0000-0002-5534-587X", "linkedin": ";adel-bibi-ba3671ab/;;bernardghanem/", "or_profile": "~Motasem_Alfarra1;~Adel_Bibi1;~Philip_Torr1;~Bernard_Ghanem1", "aff": "KAUST;University of Oxford;University of Oxford;King Abdullah University of Science and Technology", "aff_domain": "kaust.edu.sa;ox.ac.uk;ox.ac.uk;kaust.edu.sa", "position": "PhD student;Postdoc;Full Professor;Associate Professor", "bibtex": "@misc{\nalfarra2022datadependent,\ntitle={Data-Dependent Randomized Smoothing},\nauthor={Motasem Alfarra and Adel Bibi and Philip Torr and Bernard Ghanem},\nyear={2022},\nurl={https://openreview.net/forum?id=ZFIT_sGjPJ}\n}", "github": "", "project": "", "reviewers": "viFi;Xsdx;DGVf;y9v6", "site": "https://openreview.net/forum?id=ZFIT_sGjPJ", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;1", "correctness": "2;3;4;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "62;62;110;38", "wc_summary_review": "115;42;3;22", "wc_main_review": "376;399;179;82", "wc_review": "553;503;292;142", "wc_reply_reviewers": "249;176;0;0", "wc_reply_authors": "735;887;646;98", "reply_reviewers": "1;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 1.224744871391589 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.0, 26.153393661244042 ], "wc_summary_review_avg": [ 45.5, 42.42935304715357 ], "wc_main_review_avg": [ 259.0, 133.24601307356255 ], "wc_review_avg": [ 372.5, 165.24300287758027 ], "wc_reply_reviewers_avg": [ 106.25, 109.33978004367852 ], "wc_reply_authors_avg": [ 591.5, 297.6680197804259 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6666666666666665, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15497611913511940623&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "King Abdullah University of Science and Technology;University of Oxford", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaust.edu.sa;https://www.ox.ac.uk", "aff_unique_abbr": "KAUST;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Saudi Arabia;United Kingdom" }, { "id": "ZFWwI5ahxud", "title": "Learning to Adapt to Semantic Shift", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Machine learning systems are typically trained and tested on the same distribution of data. However, in the real world, models and agents must adapt to data distributions that change over time. 
Previous work in computer vision has proposed using image corruptions to model this change. \nIn contrast, we propose studying models under a setting more similar to what an agent might encounter in the real world. In this setting, models must adapt online without labels to a test distribution that changes in semantics.\nWe define two types of semantic distribution shift, one or both of which can occur: \\emph{static shift}, where the test set contains labels unseen at train time, and \\emph{continual shift}, where the distribution of labels changes throughout the test phase.\nUsing a dataset that contains both class and attribute labels for image instances, we generate shifts by changing the joint distribution of class and attribute labels. We compare to previously proposed methods for distribution adaptation that optimize a fixed self-supervised criterion at test time or a meta-learning criterion at train time. Surprisingly, these provide little improvement in this more difficult setting, with some even underperforming a static model that does not change parameters at test time.\nIn this setting, we introduce two models that ``learn to adapt''---via recurrence and learned Hebbian update rules. These models outperform both previous work and static models under both \\emph{static} and \\emph{continual} semantic shifts, suggesting that ``learning to adapt'' is a useful capability for models and agents in a changing world.", "keywords": "Adaptation;Incremental Learning;Deep Learning;Hebbian Learning;Synaptic Plasticity;Domain Adaptation;Continual Learning", "primary_area": "", "supplementary_material": "/attachment/c97f103076d3c1de238b098c0d4a8bbbe599cc09.zip", "author": "Ryan Y Benmalek;Sabhya Chhabria;Pedro O. Pinheiro;Claire Cardie;Serge Belongie", "authorids": "~Ryan_Y_Benmalek1;~Sabhya_Chhabria1;~Pedro_O._Pinheiro1;~Claire_Cardie1;~Serge_Belongie1", "gender": "M;M;M;F;M", "homepage": ";;;https://www.cs.cornell.edu/home/cardie/;https://di.ku.dk/english/staff/?pure=en%2Fpersons%2Fserge-belongie(0ce65383-3761-4b17-948a-83b461e371e2)%2Fpublications.html", "dblp": ";;223/9937;c/ClaireCardie;http://dblp.uni-trier.de/pers/hd/b/Belongie:Serge_J=", "google_scholar": ";;https://scholar.google.ca/citations?user=BU6f7L4AAAAJ;ex9BQiIAAAAJ;ORr4XJYAAAAJ", "orcid": ";;;;0000-0002-0388-5217", "linkedin": ";sabhyachhabria/;;;sergebelongie", "or_profile": "~Ryan_Y_Benmalek1;~Sabhya_Chhabria1;~Pedro_O._Pinheiro1;~Claire_Cardie1;~Serge_Belongie1", "aff": "Cornell University;Department of Computer Science, Cornell University;Deep Genomics;Cornell University;University of Copenhagen", "aff_domain": "cornell.edu;cs.cornell.edu;deepgenomics.com;cornell.edu;ku.dk", "position": "PhD student;Undergrad student;Researcher;Full Professor;Full Professor", "bibtex": "@misc{\nbenmalek2022learning,\ntitle={Learning to Adapt to Semantic Shift },\nauthor={Ryan Y Benmalek and Sabhya Chhabria and Pedro O. 
Pinheiro and Claire Cardie and Serge Belongie},\nyear={2022},\nurl={https://openreview.net/forum?id=ZFWwI5ahxud}\n}", "github": "", "project": "", "reviewers": "bjbH;Zzbr;QuUH;597Q", "site": "https://openreview.net/forum?id=ZFWwI5ahxud", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;4;4;4", "correctness": "3;2;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "40;113;90;333", "wc_summary_review": "20;48;20;89", "wc_main_review": "110;531;282;563", "wc_review": "170;692;392;985", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 144.0, 112.26531075982464 ], "wc_summary_review_avg": [ 44.25, 28.252212302756046 ], "wc_main_review_avg": [ 371.5, 186.08129943656348 ], "wc_review_avg": [ 559.75, 307.56005511119287 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15148370332552960545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Cornell University;Deep Genomics;University of Copenhagen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cornell.edu;https://www.deepgenomics.com;https://www.ku.dk", "aff_unique_abbr": "Cornell;Deep Genomics;UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2", "aff_country_unique": "United States;Canada;Denmark" }, { "title": "It Takes Two to Tango: Mixup for Deep Metric Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6337", "id": "ZKy2X3dgPA", "poster": "", "openreview": "https://openreview.net/forum?id=ZKy2X3dgPA", "slides": "https://iclr.cc/virtual/2022/poster/6337", "video": "https://iclr.cc/virtual/2022/poster/6337", "author_site": "Shashanka Venkataramanan, Bill Psomas, Ewa Kijak, laurent amsaleg, Konstantinos Karantzalos, Yannis Avrithis", "tldr": "", "abstract": "Metric learning involves learning a discriminative representation such that embeddings of similar classes are encouraged to be close, while embeddings of dissimilar classes are pushed far apart. State-of-the-art methods focus mostly on sophisticated loss functions or mining strategies. On the one hand, metric learning losses consider two or more examples at a time. On the other hand, modern data augmentation methods for classification consider two or more examples at a time. The combination of the two ideas is under-studied.\n\nIn this work, we aim to bridge this gap and improve representations using mixup, which is a powerful data augmentation approach interpolating two or more examples and corresponding target labels at a time. This task is challenging because, unlike classification, the loss functions used in metric learning are not additive over examples, so the idea of interpolating target labels is not straightforward. To the best of our knowledge, we are the first to investigate mixing both examples and target labels for deep metric learning. 
We develop a generalized formulation that encompasses existing metric learning loss functions and modify it to accommodate mixup, introducing Metric Mix, or Metrix. We also introduce a new metric---utilization---to demonstrate that by mixing examples during training, we are exploring areas of the embedding space beyond the training classes, thereby improving representations. To validate the effect of improved representations, we show that mixing inputs, intermediate representations or embeddings along with target labels significantly outperforms state-of-the-art metric learning methods on four benchmark deep metric learning datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shashanka Venkataramanan;Bill Psomas;Ewa Kijak;laurent amsaleg;Konstantinos Karantzalos;Yannis Avrithis", "authorids": "~Shashanka_Venkataramanan2;~Bill_Psomas2;~Ewa_Kijak1;~laurent_amsaleg1;~Konstantinos_Karantzalos1;~Yannis_Avrithis2", "gender": "M;M;;;M;", "homepage": "https://shashankvkt.github.io/;http://users.ntua.gr/psomasbill/;;;http://users.ntua.gr/karank/;https://avrithis.net/", "dblp": "218/8893;294/8365;;a/LAmsaleg;50/2661;a/YSAvrithis", "google_scholar": "CbfH47IAAAAJ;qiDVfC4AAAAJ;;;https://scholar.google.gr/citations?user=U6t7QpsAAAAJ;AF2SxG0AAAAJ", "orcid": ";0000-0001-5381-0312;;;0000-0001-8730-6245;0000-0001-7476-4482", "linkedin": "shashank-venkataramanan-1b2b9993/;bill-psomas/;;;https://gr.linkedin.com/in/konstantinos-karantzalos-b8768928;yannisavrithis/", "or_profile": "~Shashanka_Venkataramanan2;~Bill_Psomas2;~Ewa_Kijak1;~laurent_amsaleg1;~Konstantinos_Karantzalos1;~Yannis_Avrithis2", "aff": "INRIA;IARAI;;IRISA;National Technical University of Athens;Athena RC", "aff_domain": "inria.fr;iarai.ac.at;;irisa.fr;ntua.gr;athenarc.gr", "position": "PhD student;Intern;;researcher;Full Professor;Research Director", "bibtex": "@inproceedings{\nvenkataramanan2022it,\ntitle={It Takes Two to Tango: Mixup for Deep Metric Learning},\nauthor={Shashanka Venkataramanan and Bill Psomas and Ewa Kijak and laurent amsaleg and Konstantinos Karantzalos and Yannis Avrithis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZKy2X3dgPA}\n}", "github": "", "project": "", "reviewers": "c8hM;S9kx;LaZ6;EXEY;gPed", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "3;4;2;4;4", "correctness": "4;3;3;4;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "3;2;3;4;3", "wc_summary_paper": "85;69;56;43;138", "wc_summary_review": "61;49;40;60;76", "wc_main_review": "149;302;256;224;372", "wc_review": "295;420;352;327;586", "wc_reply_reviewers": "16;167;0;0;0", "wc_reply_authors": "382;709;315;333;356", "reply_reviewers": "1;2;0;0;0", "reply_authors": "2;2;1;1;1", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 78.2, 32.98120676991671 ], "wc_summary_review_avg": [ 57.2, 12.155657119218196 ], "wc_main_review_avg": [ 260.6, 74.75185616424518 ], "wc_review_avg": [ 396.0, 103.51231810755664 ], "wc_reply_reviewers_avg": [ 36.6, 65.49381650201796 ], "wc_reply_authors_avg": [ 419.0, 146.7310464761974 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence":
0.37500000000000006, "corr_recommendation_correctness": -0.408248290463863, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11528364689956817661&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=ZKy2X3dgPA", "email": "inria.fr;iarai.ac.at;;irisa.fr;ntua.gr;athenarc.gr", "author_num": 6, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "INRIA;Institute of Advanced Research in Artificial Intelligence;Institut de Recherche en Informatique et Automatique;National Technical University of Athens;Athena RC", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.inria.fr;https://www.iarai.ac.at;https://www.irisa.fr;https://www.ntua.gr;", "aff_unique_abbr": "INRIA;IARAI;IRISA;NTUA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "France;Austria;Greece;" }, { "id": "ZN5fOmir9Uk", "title": "TexRel: a Green Family of Datasets for Emergent Communication with Relations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a new dataset TexRel as a playground for the study of emergent communications, in particular for relations.\nTexRel provides rapid training and experimentation, whilst being sufficiently large to avoid overfitting.\nWe use TexRel to compare Sender models, compare with a related dataset, examine the effect of changing meaning space size, and perform a case study of using TexRel in place of symbolic input.", "keywords": "emergent communication;compositionality;metrics;language model;dataset;green;relations", "primary_area": "", "supplementary_material": "/attachment/ef9b76d48991eb0a63b9fa56c60343ab4ed1ba1f.zip", "author": "Hugh Perkins", "authorids": "~Hugh_Perkins2", "gender": "", "homepage": "https://github.com/hughperkins", "dblp": "136/5700", "google_scholar": "https://scholar.google.co.uk/citations?user=eT5s06MAAAAJ", "orcid": "", "linkedin": "hughperkins/", "or_profile": "~Hugh_Nicholas_Perkins1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nperkins2022texrel,\ntitle={TexRel: a Green Family of Datasets for Emergent Communication with Relations},\nauthor={Hugh Perkins},\nyear={2022},\nurl={https://openreview.net/forum?id=ZN5fOmir9Uk}\n}", "github": "", "project": "", "reviewers": "KKXF;Rhmj;scAk;stt3;3fqj", "site": "https://openreview.net/forum?id=ZN5fOmir9Uk", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;5;4;4;4", "correctness": "2;3;2;3;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "97;252;35;55;105", "wc_summary_review": "54;358;57;71;91", "wc_main_review": "336;1055;114;558;565", "wc_review": "487;1665;206;684;761", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 108.8, 76.15878150285756 ], "wc_summary_review_avg": [ 126.2, 116.63515764982701 ], "wc_main_review_avg": [ 525.6, 312.48334355610064 ], "wc_review_avg": [ 760.6, 491.17109035447106 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.39528470752104744,
"corr_recommendation_correctness": 0.6454972243679028, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1892789175731539810&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Generalisation in Lifelong Reinforcement Learning through Logical Composition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6562", "id": "ZOcX-eybqoL", "poster": "", "openreview": "https://openreview.net/forum?id=ZOcX-eybqoL", "slides": "https://iclr.cc/virtual/2022/poster/6562", "video": "https://iclr.cc/virtual/2022/poster/6562", "author_site": "Geraud Nangue Tasse, Steven James, Benjamin Rosman", "tldr": "", "abstract": "We leverage logical composition in reinforcement learning to create a framework that enables an agent to autonomously determine whether a new task can be immediately solved using its existing abilities, or whether a task-specific skill should be learned. In the latter case, the proposed algorithm also enables the agent to learn the new task faster by generating an estimate of the optimal policy. Importantly, we provide two main theoretical results: we bound the performance of the transferred policy on a new task, and we give bounds on the necessary and sufficient number of tasks that need to be learned throughout an agent's lifetime to generalise over a distribution. We verify our approach in a series of experiments, where we perform transfer learning both after learning a set of base tasks, and after learning an arbitrary set of tasks. We also demonstrate that, as a side effect of our transfer learning approach, an agent can produce an interpretable Boolean expression of its understanding of the current task. Finally, we demonstrate our approach in the full lifelong setting where an agent receives tasks from an unknown distribution. 
Starting from scratch, an agent is able to quickly generalise over the task distribution after learning only a few tasks, which are sub-logarithmic in the size of the task space.", "keywords": "Reinforcement Learning;Lifelong learning;Multi task learning;Transfer learning;Logical composition;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/73712897744101e96675f8da95110567d360b743.zip", "author": "Geraud Nangue Tasse;Steven James;Benjamin Rosman", "authorids": "~Geraud_Nangue_Tasse1;~Steven_James1;~Benjamin_Rosman1", "gender": "M;M;M", "homepage": "https://geraudnt.github.io/;;http://www.raillab.org", "dblp": "256/0971;195/8202;45/4591", "google_scholar": "CAfsMIsAAAAJ;;https://scholar.google.co.za/citations?user=pWJ0SocAAAAJ", "orcid": "0000-0002-6152-8429;;", "linkedin": "geraud-nangue-tasse-264281a5/;;", "or_profile": "~Geraud_Nangue_Tasse1;~Steven_James1;~Benjamin_Rosman1", "aff": "University of the Witwatersrand;University of the Witwatersrand;University of the Witwatersrand", "aff_domain": "wits.ac.za;wits.ac.za;wits.ac.za", "position": "PhD student;Lecturer;Full Professor", "bibtex": "@inproceedings{\ntasse2022generalisation,\ntitle={Generalisation in Lifelong Reinforcement Learning through Logical Composition },\nauthor={Geraud Nangue Tasse and Steven James and Benjamin Rosman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZOcX-eybqoL}\n}", "github": "", "project": "", "reviewers": "KyZj;AVki;1zPe;BUDa;td5N;ovqB", "pdf_size": 0, "recommendation": "5;5;5;6;6;8", "confidence": "3;2;3;3;3;3", "correctness": "3;2;3;4;4;3", "technical_novelty": "3;4;2;2;3;3", "empirical_novelty": "3;3;2;3;2;3", "wc_summary_paper": "76;43;126;152;77;161", "wc_summary_review": "37;35;117;61;134;81", "wc_main_review": "1266;1294;320;292;243;1531", "wc_review": "1379;1372;563;505;454;1773", "wc_reply_reviewers": "1646;310;353;0;0;334", "wc_reply_authors": "4394;1991;1820;588;994;1705", "reply_reviewers": "2;2;3;0;0;2", "reply_authors": "6;4;4;2;3;4", "recommendation_avg": [ 5.833333333333333, 1.0671873729054748 ], "confidence_avg": [ 2.8333333333333335, 0.3726779962499649 ], "correctness_avg": [ 3.1666666666666665, 0.6871842709362768 ], "technical_novelty_avg": [ 2.8333333333333335, 0.6871842709362768 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 105.83333333333333, 43.30287391027169 ], "wc_summary_review_avg": [ 77.5, 37.602083275620075 ], "wc_main_review_avg": [ 824.3333333333334, 546.3078090437864 ], "wc_review_avg": [ 1007.6666666666666, 518.5414373241758 ], "wc_reply_reviewers_avg": [ 440.5, 559.3656377242587 ], "wc_reply_authors_avg": [ 1915.3333333333333, 1211.9895030715224 ], "reply_reviewers_avg": [ 1.5, 1.118033988749895 ], "reply_authors_avg": [ 3.8333333333333335, 1.2133516482134197 ], "replies_avg": [ 40, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3492151478847891, "corr_recommendation_correctness": 0.2651439066774996, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5906819650459140976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=ZOcX-eybqoL", "email": "wits.ac.za;wits.ac.za;wits.ac.za", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of the Witwatersrand", "aff_unique_dep": "", "aff_unique_url": "https://www.wits.ac.za", "aff_unique_abbr": "Wits", "aff_campus_unique_index": "", "aff_campus_unique": 
"", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Africa" }, { "id": "ZOjKx9dEmLB", "title": "NAS-Bench-360: Benchmarking Diverse Tasks for Neural Architecture Search", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most existing neural architecture search (NAS) benchmarks and algorithms prioritize performance on well-studied tasks, e.g., image classification on CIFAR and ImageNet. This makes the applicability of NAS approaches in more diverse areas inadequately understood.\nIn this paper, we present NAS-Bench-360, a benchmark suite for evaluating state-of-the-art NAS methods for convolutional neural networks (CNNs). To construct it, we curate a collection of ten tasks spanning a diverse array of application domains, dataset sizes, problem dimensionalities, and learning objectives. By carefully selecting tasks that can both interoperate with modern CNN-based search methods but that are also far-afield from their original development domain, we can use NAS-Bench-360 to investigate the following central question: do existing state-of-the-art NAS methods perform well on diverse tasks? Our experiments show that a modern NAS procedure designed for image classification can indeed find good architectures for tasks with other dimensionalities and learning objectives; however, the same method struggles against more task-specific methods and performs catastrophically poorly on classification in non-vision domains. The case for NAS robustness becomes even more dire in a resource-constrained setting, where a recent NAS method provides little-to-no benefit over much simpler baselines. These results demonstrate the need for a benchmark such as NAS-Bench-360 to help develop NAS approaches that work well on a variety of tasks, a crucial component of a truly robust and automated pipeline. We conclude with a demonstration of the kind of future research our suite of tasks will enable. 
All data and code is made publicly available.", "keywords": "automated machine learning;neural architecture search", "primary_area": "", "supplementary_material": "/attachment/ea5c61261baca7804d7c5359d98d3b20b3483a37.zip", "author": "Renbo Tu;Mikhail Khodak;Nicholas Carl Roberts;Ameet Talwalkar", "authorids": "~Renbo_Tu1;~Mikhail_Khodak1;~Nicholas_Carl_Roberts1;~Ameet_Talwalkar1", "gender": ";;M;M", "homepage": "https://rtu715.github.io;;https://nick11roberts.science/;http://www.cs.cmu.edu/~atalwalk/", "dblp": ";;;56/5528", "google_scholar": ";;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=TW7U1W0AAAAJ", "orcid": ";;0000-0002-0625-9182;", "linkedin": ";;nick11roberts/;", "or_profile": "~Renbo_Tu1;~Mikhail_Khodak1;~Nicholas_Carl_Roberts1;~Ameet_Talwalkar1", "aff": "University of Toronto;;University of Wisconsin-Madison;Carnegie Mellon University", "aff_domain": "toronto.edu;;wisc.edu;cmu.edu", "position": "PhD student;;PhD student;Associate Professor", "bibtex": "@misc{\ntu2022nasbench,\ntitle={{NAS}-Bench-360: Benchmarking Diverse Tasks for Neural Architecture Search},\nauthor={Renbo Tu and Mikhail Khodak and Nicholas Carl Roberts and Ameet Talwalkar},\nyear={2022},\nurl={https://openreview.net/forum?id=ZOjKx9dEmLB}\n}", "github": "", "project": "", "reviewers": "ofGn;oZbZ;zB8j;JYCo;8TV7", "site": "https://openreview.net/forum?id=ZOjKx9dEmLB", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "5;5;4;4;5", "correctness": "4;3;3;3;4", "technical_novelty": "2;1;4;2;1", "empirical_novelty": "2;1;2;3;3", "wc_summary_paper": "52;26;100;59;80", "wc_summary_review": "14;57;41;42;51", "wc_main_review": "247;146;378;206;551", "wc_review": "313;229;519;307;682", "wc_reply_reviewers": "95;0;219;215;529", "wc_reply_authors": "236;184;549;555;663", "reply_reviewers": "1;0;1;1;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.6, 0.48989794855663565 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 1.0954451150103321 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 63.4, 25.152335875619983 ], "wc_summary_review_avg": [ 41.0, 14.737706741552431 ], "wc_main_review_avg": [ 305.6, 144.42243593015596 ], "wc_review_avg": [ 410.0, 166.51966850795733 ], "wc_reply_reviewers_avg": [ 211.6, 178.42600707296006 ], "wc_reply_authors_avg": [ 437.4, 190.76330884108714 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.06804138174397723, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2777184194230461065&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Toronto;University of Wisconsin-Madison;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utoronto.ca;https://www.wisc.edu;https://www.cmu.edu", "aff_unique_abbr": "U of T;UW-Madison;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Accelerated Policy Learning with Parallel Differentiable Simulation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6923", "id": "ZSKRQMvttc", "poster": "", "openreview": "https://openreview.net/forum?id=ZSKRQMvttc", 
"slides": "https://iclr.cc/virtual/2022/poster/6923", "video": "https://iclr.cc/virtual/2022/poster/6923", "author_site": "Jie Xu, Viktor Makoviychuk, Yashraj Narang, Fabio Ramos, Wojciech Matusik, Animesh Garg, Miles Macklin", "tldr": "", "abstract": "Deep reinforcement learning can generate complex control policies, but requires large amounts of training data to work effectively. Recent work has attempted to address this issue by leveraging differentiable simulators. However, inherent problems such as local minima and exploding/vanishing numerical gradients prevent these methods from being generally applied to control tasks with complex contact-rich dynamics, such as humanoid locomotion in classical RL benchmarks. In this work we present a high-performance differentiable simulator and a new policy learning algorithm (SHAC) that can effectively leverage simulation gradients, even in the presence of non-smoothness. Our learning algorithm alleviates problems with local minima through a smooth critic function, avoids vanishing/exploding gradients through a truncated learning window, and allows many physical environments to be run in parallel. We evaluate our method on classical RL control tasks, and show substantial improvements in sample efficiency and wall-clock time over state-of-the-art RL and differentiable simulation-based algorithms. In addition, we demonstrate the scalability of our method by applying it to the challenging high-dimensional problem of muscle-actuated locomotion with a large action space, achieving a greater than $17\\times$ reduction in training time over the best-performing established RL algorithm. More visual results are provided at: https://short-horizon-actor-critic.github.io/.", "keywords": "Robot Control;Policy Learning;Differentiable Simulation;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/6e91626fdb66dc4d9a2be62ea0642fd595f6b723.zip", "author": "Jie Xu;Viktor Makoviychuk;Yashraj Narang;Fabio Ramos;Wojciech Matusik;Animesh Garg;Miles Macklin", "authorids": "~Jie_Xu7;~Viktor_Makoviychuk1;~Yashraj_Narang1;~Fabio_Ramos1;~Wojciech_Matusik2;~Animesh_Garg1;~Miles_Macklin1", "gender": "M;M;M;M;M;M;M", "homepage": "https://people.csail.mit.edu/jiex;;;https://fabioramos.github.io/;https://cdfg.mit.edu/wojciech;http://animesh.garg.tech;https://mmacklin.com", "dblp": "37/5126-28;;215/6022.html;22/2488;;123/5728;", "google_scholar": "3Tj5lWEAAAAJ;rmAcDNkAAAAJ;M3NuG7AAAAAJ;https://scholar.google.com.au/citations?user=T_mJiHoAAAAJ;https://scholar.google.com/citations?hl=en;zp8V7ZMAAAAJ;", "orcid": ";;0000-0001-5445-3759;;0000-0003-0212-5643;0000-0003-0482-4296;", "linkedin": ";;;fabio-ramos-3256b421/;wojciech-matusik-67238126/;animeshgarg/;", "or_profile": "~Jie_Xu7;~Viktor_Makoviychuk1;~Yashraj_Narang1;~Fabio_Ramos1;~Wojciech_Matusik2;~Animesh_Garg1;~Miles_Macklin1", "aff": "Massachusetts Institute of Technology;NVIDIA;NVIDIA;NVIDIA;Massachusetts Institute of Technology;University of Toronto;NVIDIA", "aff_domain": "mit.edu;nvidia.com;nvidia.com;nvidia.com;mit.edu;toronto.edu;nvidia.com", "position": "PhD student;Senior Research Scientist;Researcher;Principal Research Scientist;Full Professor;Assistant Professor;Principal Engineer", "bibtex": "@inproceedings{\nxu2022accelerated,\ntitle={Accelerated Policy Learning with Parallel Differentiable Simulation},\nauthor={Jie Xu and Miles Macklin and Viktor Makoviychuk and Yashraj Narang and Animesh Garg and Fabio Ramos and Wojciech Matusik},\nbooktitle={International Conference on Learning 
Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZSKRQMvttc}\n}", "github": "", "project": "", "reviewers": "Hcaa;5Q2y;nSwn;4PEt", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;4", "correctness": "4;4;4;4", "technical_novelty": "2;3;2;2", "empirical_novelty": "3;3;4;4", "wc_summary_paper": "162;340;95;92", "wc_summary_review": "70;387;134;28", "wc_main_review": "463;1259;651;169", "wc_review": "695;1986;880;289", "wc_reply_reviewers": "238;290;448;0", "wc_reply_authors": "888;1646;1635;576", "reply_reviewers": "2;2;2;0", "reply_authors": "2;3;3;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 172.25, 100.81263561677177 ], "wc_summary_review_avg": [ 154.75, 139.3007088998473 ], "wc_main_review_avg": [ 635.5, 398.8643253037303 ], "wc_review_avg": [ 962.5, 628.3941836140751 ], "wc_reply_reviewers_avg": [ 244.0, 160.70469812671936 ], "wc_reply_authors_avg": [ 1186.25, 467.46784648786274 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10548121622562610559&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=ZSKRQMvttc", "email": "mit.edu;nvidia.com;nvidia.com;nvidia.com;mit.edu;toronto.edu;nvidia.com", "author_num": 7, "aff_unique_index": "0;1;1;1;0;2;1", "aff_unique_norm": "Massachusetts Institute of Technology;NVIDIA;University of Toronto", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://web.mit.edu;https://www.nvidia.com;https://www.utoronto.ca", "aff_unique_abbr": "MIT;NVIDIA;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "id": "ZTZa78mCbie", "title": "For Manifold Learning, Deep Neural Networks Can be Locality Sensitive Hash Functions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "It is well established that training deep neural networks gives useful representations that capture essential features of the inputs. However, these representations are poorly understood in theory and practice. In the context of supervised learning, an important question is whether these representations capture features informative for classification, while filtering out non-informative noisy ones. We present a formal framework to study this question by considering a generative process where each class is associated with a high-dimensional manifold and different classes define different manifolds. Under this model, each input is produced using two latent vectors: (i) a ``manifold identifier'' $\\gamma$; and (ii) a ``transformation parameter'' $\\theta$ that shifts examples along the surface of a manifold. E.g., $\\gamma$ might represent a canonical image of a dog, and $\\theta$ might stand for variations in pose, background or lighting. We provide theoretical evidence that neural representations can be viewed as LSH-like functions that map each input to an embedding that is a function of solely the informative $\\gamma$ and invariant to $\\theta$, effectively recovering the manifold identifier.
We formally show that we get one-shot learning to unseen classes as an important consequence of this behavior.", "keywords": "theory of deep learning;theory of representation learning;manifold learning;locality sensitive hash functions;interpretability", "primary_area": "", "supplementary_material": "", "author": "Nishanth Dikkala;Gal Kaplun;Rina Panigrahy", "authorids": "~Nishanth_Dikkala1;~Gal_Kaplun1;~Rina_Panigrahy1", "gender": "M;M;", "homepage": "http://people.csail.mit.edu/nishanthd/;http://www.galkaplun.com;", "dblp": "138/8092;237/9816;p/RinaPanigrahy", "google_scholar": "CMZoOTIAAAAJ;y4BzFYsAAAAJ;", "orcid": ";;", "linkedin": ";gal-kaplun-865496151/;", "or_profile": "~Nishanth_Dikkala1;~Gal_Kaplun1;~Rina_Panigrahy1", "aff": "Google;Harvard University;Google", "aff_domain": "google.com;harvard.edu;google.com", "position": "Google Research;PhD student;Research Scientist", "bibtex": "@misc{\ndikkala2022for,\ntitle={For Manifold Learning, Deep Neural Networks Can be Locality Sensitive Hash Functions},\nauthor={Nishanth Dikkala and Gal Kaplun and Rina Panigrahy},\nyear={2022},\nurl={https://openreview.net/forum?id=ZTZa78mCbie}\n}", "github": "", "project": "", "reviewers": "MTcC;1Drx;Xmdv", "site": "https://openreview.net/forum?id=ZTZa78mCbie", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "3;3;3", "correctness": "3;4;3", "technical_novelty": "3;4;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "67;99;179", "wc_summary_review": "41;26;176", "wc_main_review": "854;253;593", "wc_review": "962;378;948", "wc_reply_reviewers": "31;43;209", "wc_reply_authors": "808;762;434", "reply_reviewers": "1;1;1", "reply_authors": "2;3;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 115.0, 47.10272462041518 ], "wc_summary_review_avg": [ 81.0, 67.45368781616021 ], "wc_main_review_avg": [ 566.6666666666666, 246.0627742851179 ], "wc_review_avg": [ 762.6666666666666, 272.0604507988783 ], "wc_reply_reviewers_avg": [ 94.33333333333333, 81.22944184359648 ], "wc_reply_authors_avg": [ 668.0, 166.52527335712938 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17426263495625532856&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;Harvard University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.harvard.edu", "aff_unique_abbr": "Google;Harvard", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning to Extend Molecular Scaffolds with Structural Motifs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7209", "id": "ZTsoE8G3GG", "poster": "", "openreview": "https://openreview.net/forum?id=ZTsoE8G3GG", "slides": "https://iclr.cc/virtual/2022/poster/7209", "video": "https://iclr.cc/virtual/2022/poster/7209", "author_site": "Krzysztof Maziarz, Henry Jackson-Flux, Pashmina Cameron, Finton Sirockin, Nadine Schneider, Nikolaus Stiefl, Marwin Segler, Marc Brockschmidt", 
"tldr": "", "abstract": "Recent advancements in deep learning-based modeling of molecules promise to accelerate in silico drug discovery. A plethora of generative models is available, building molecules either atom-by-atom and bond-by-bond or fragment-by-fragment. However, many drug discovery projects require a fixed scaffold to be present in the generated molecule, and incorporating that constraint has only recently been explored. Here, we propose MoLeR, a graph-based model that naturally supports scaffolds as initial seed of the generative procedure, which is possible because it is not conditioned on the generation history. Our experiments show that MoLeR performs comparably to state-of-the-art methods on unconstrained molecular optimization tasks, and outperforms them on scaffold-based tasks, while being an order of magnitude faster to train and sample from than existing approaches. Furthermore, we show the influence of a number of seemingly minor design choices on the overall performance.", "keywords": "molecules;graph neural networks;scaffold;generative model", "primary_area": "", "supplementary_material": "", "author": "Krzysztof Maziarz;Henry Richard Jackson-Flux;Pashmina Cameron;Finton Sirockin;Nadine Schneider;Nikolaus Stiefl;Marwin Segler;Marc Brockschmidt", "authorids": "~Krzysztof_Maziarz1;~Henry_Richard_Jackson-Flux1;~Pashmina_Cameron1;~Finton_Sirockin1;~Nadine_Schneider1;~Nikolaus_Stiefl1;~Marwin_Segler2;~Marc_Brockschmidt1", "gender": "M;M;F;M;F;;M;", "homepage": ";http://www.jackson-flux.com;https://www.microsoft.com/en-us/research/people/pcameron/;;;;;", "dblp": "194/2971;;94/8938;;;;80/8292;185/0993", "google_scholar": "BA8bBVkAAAAJ;Kc83yK8AAAAJ;https://scholar.google.com/citations?hl=en;;CVgOH54AAAAJ;;https://scholar.google.co.uk/citations?user=pF27eLMAAAAJ;imsL94QAAAAJ", "orcid": ";;0009-0009-0444-1755;0000-0003-2536-7485;;;;", "linkedin": ";henryrj/;pashmina-cameron-7424b51/;https://ch.linkedin.com/in/finton-sirockin-9673844;;nikolaus-stiefl-39583b25/;;", "or_profile": "~Krzysztof_Maziarz1;~Henry_Richard_Jackson-Flux1;~Pashmina_Cameron1;~Finton_Sirockin1;~Nadine_Schneider1;~Nikolaus_Stiefl1;~Marc_Brockschmidt1;~Marwin_Segler1", "aff": "Microsoft Research;;Microsoft;Novartis;Novartis;Novartis Institutes for Biomedical Research;Microsoft Research;Microsoft", "aff_domain": "microsoft.com;;microsoft.com;novartis.com;novartis.com;novartis.com;microsoft.com;microsoft.com", "position": "Senior Researcher;;Principal Scientist;Principal Researcher;Researcher;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nmaziarz2022learning,\ntitle={Learning to Extend Molecular Scaffolds with Structural Motifs},\nauthor={Krzysztof Maziarz and Henry Richard Jackson-Flux and Pashmina Cameron and Finton Sirockin and Nadine Schneider and Nikolaus Stiefl and Marwin Segler and Marc Brockschmidt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZTsoE8G3GG}\n}", "github": "", "project": "", "reviewers": "WYph;2My6;9yrY;V5A5", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "5;3;3;4", "correctness": "2;4;3;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "128;60;85;157", "wc_summary_review": "42;48;13;106", "wc_main_review": "339;259;403;382", "wc_review": "509;367;501;645", "wc_reply_reviewers": "0;0;23;13", "wc_reply_authors": "1222;570;598;609", "reply_reviewers": "0;0;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], 
"confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.5, 37.526657191921586 ], "wc_summary_review_avg": [ 52.25, 33.737034546622496 ], "wc_main_review_avg": [ 345.75, 55.14242921743655 ], "wc_review_avg": [ 505.5, 98.32980219648567 ], "wc_reply_reviewers_avg": [ 9.0, 9.669539802906858 ], "wc_reply_authors_avg": [ 749.75, 273.02415186206514 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.6998739952495694, "corr_recommendation_correctness": 0.5183210553488161, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16834575414277010470&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=ZTsoE8G3GG", "email": "microsoft.com;;microsoft.com;novartis.com;novartis.com;novartis.com;microsoft.com;microsoft.com", "author_num": 8, "aff_unique_index": "0;0;1;1;2;0;0", "aff_unique_norm": "Microsoft;Novartis;Novartis Institutes for BioMedical Research", "aff_unique_dep": "Microsoft Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.novartis.com;https://www.nibr.com", "aff_unique_abbr": "MSR;Novartis;NIBR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1;0;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Bregman Gradient Policy Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6662", "id": "ZU-zFnTum1N", "poster": "", "openreview": "https://openreview.net/forum?id=ZU-zFnTum1N", "slides": "https://iclr.cc/virtual/2022/poster/6662", "video": "https://iclr.cc/virtual/2022/poster/6662", "author_site": "Feihu Huang, Shangqian Gao, Heng Huang", "tldr": "", "abstract": "In the paper, we design a novel Bregman gradient policy optimization framework for reinforcement learning based on Bregman divergences and momentum techniques. Specifically, we propose a Bregman gradient policy optimization (BGPO) algorithm based on the basic momentum technique and mirror descent iteration. Meanwhile, we further propose an accelerated Bregman gradient policy optimization (VR-BGPO) algorithm based on the variance reduced technique. Moreover, we provide a convergence analysis framework for our Bregman gradient policy optimization under the nonconvex setting. We prove that our BGPO achieves a sample complexity of $O(\\epsilon^{-4})$ for finding $\\epsilon$-stationary policy only requiring one trajectory at each iteration, and our VR-BGPO reaches the best known sample complexity of $O(\\epsilon^{-3})$, which also only requires one trajectory at each iteration. In particular, by using different Bregman divergences, our BGPO framework unifies many existing policy optimization algorithms such as the existing (variance reduced) policy gradient algorithms such as natural policy gradient algorithm. Extensive experimental results on multiple reinforcement learning tasks demonstrate the efficiency of our new algorithms. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feihu Huang;Shangqian Gao;Heng Huang", "authorids": "~Feihu_Huang1;~Shangqian_Gao1;~Heng_Huang1", "gender": "M;;M", "homepage": ";;https://www.cs.umd.edu/~heng/", "dblp": "169/6247;195/2523;03/281", "google_scholar": "tRQwlHUAAAAJ;9mNI83oAAAAJ;4OqLaDwAAAAJ", "orcid": "0000-0003-0806-6074;;", "linkedin": ";;", "or_profile": "~Feihu_Huang1;~Shangqian_Gao1;~Heng_Huang1", "aff": "University of Pittsburgh;University of Pittsburgh;University of Pittsburgh", "aff_domain": "pitt.edu;pitt.edu;pitt.edu", "position": "Senior Postdoc;PhD student;Full Professor", "bibtex": "@inproceedings{\nhuang2022bregman,\ntitle={Bregman Gradient Policy Optimization},\nauthor={Feihu Huang and Shangqian Gao and Heng Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZU-zFnTum1N}\n}", "github": "", "project": "", "reviewers": "mQZe;pxbY;s4n4", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;5;3", "correctness": "4;4;3", "technical_novelty": "1;3;3", "empirical_novelty": "1;0;3", "wc_summary_paper": "42;82;48", "wc_summary_review": "23;383;37", "wc_main_review": "212;86;146", "wc_review": "277;551;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "674;195;393", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 57.333333333333336, 17.613126418163876 ], "wc_summary_review_avg": [ 147.66666666666666, 166.50392054109585 ], "wc_main_review_avg": [ 148.0, 51.45872132107443 ], "wc_review_avg": [ 353.0, 141.26098777322304 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 420.6666666666667, 196.5270690996253 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17535380024235547901&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=ZU-zFnTum1N", "email": "pitt.edu;pitt.edu;pitt.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pittsburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.pitt.edu", "aff_unique_abbr": "Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "ZUXZKjfptc9", "title": "Bit-aware Randomized Response for Local Differential Privacy in Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we develop BitRand, a bit-aware randomized response algorithm, to preserve local differential privacy (LDP) in federated learning (FL). We encode embedded features extracted from clients' local data into binary encoding bits, in which different bits have different impacts on the embedded features. 
Based upon that, we randomize all the bits to preserve LDP with three key advantages: (1) Bit-aware: Bits with a more substantial influence on the model utility have smaller randomization probabilities, and vice versa, under the same privacy protection; (2) Dimension-elastic: Increasing the dimensions of embedded features, gradients, model outcomes, and training rounds marginally affects the randomization probabilities of binary encoding bits under the same privacy protection; and (3) LDP protection is achieved for both embedded features and labels with tight privacy loss and expected error bounds, ensuring high model utility. Extensive theoretical and experimental results show that our BitRand significantly outperforms various baseline approaches in text and image classification.", "keywords": "local differential privacy;federated learning;bit-aware", "primary_area": "", "supplementary_material": "/attachment/0c706a9e536e4bf6a2a7ba5aa3b107c612d4c1f2.zip", "author": "Phung Lai;Hai Phan;Li Xiong;Khang Tran;My Thai;Tong Sun;Franck Dernoncourt;Jiuxiang Gu;Nikolaos Barmpalios;Rajiv Jain", "authorids": "~Phung_Lai1;~Hai_Phan1;~Li_Xiong1;~Khang_Tran1;~My_Thai1;~Tong_Sun1;~Franck_Dernoncourt1;~Jiuxiang_Gu2;~Nikolaos_Barmpalios1;~Rajiv_Jain1", "gender": "F;Not Specified;;M;F;F;;M;M;M", "homepage": "https://www.linkedin.com/in/phunglai/;https://sites.google.com/site/ihaiphan/;http://www.cs.emory.edu/~lxiong/;;http://www.cise.ufl.edu/~mythai;https://research.adobe.com/person/tong-sun/;http://francky.me;http://gujiuxiang.com;;", "dblp": ";153/5204;39/3530-1.html;47/7216;63/4711;;132/4043;173/4935.html;;", "google_scholar": ";nsEbWjAAAAAJ;jJ8BLgsAAAAJ;https://scholar.google.com/citations?hl=en;zLLJimcAAAAJ;https://scholar.google.com/citations?hl=en;kz2aIc8AAAAJ;https://scholar.google.com.sg/citations?user=zPxKV9EAAAAJ;Yp4dul4AAAAJ;https://scholar.google.com/", "orcid": ";;0000-0001-7354-0428;;0000-0003-0503-2012;;0000-0002-1119-1346;;;", "linkedin": ";;li-xiong-32472513/;khangtran16/;;tong-sun/?trk=hb_tab_pro_top;franckdernoncourt;;;", "or_profile": "~Phung_Lai1;~Hai_Phan1;~Li_Xiong1;~Khang_Tran1;~My_Thai1;~Tong_Sun1;~Franck_Dernoncourt1;~Jiuxiang_Gu2;~Nikolaos_Barmpalios1;~Rajiv_Jain1", "aff": "New Jersey Institute of Technology;New Jersey Institute of Technology;Emory University;New Jersey Institute of Technology;University of Florida;Adobe Systems;Adobe Systems;Adobe Systems;Adobe Systems;Adobe Systems", "aff_domain": "njit.edu;njit.edu;emory.edu;njit.edu;ufl.edu;adobe.com;adobe.com;adobe.com;adobe.com;adobe.com", "position": "PhD student;Assistant Professor;Professor;PhD student;Full Professor;Director, Document Intelligence Lab;Researcher;Researcher;Senior Machine Learning Scientist;Senior Research Scientist", "bibtex": "@misc{\nlai2022bitaware,\ntitle={Bit-aware Randomized Response for Local Differential Privacy in Federated Learning},\nauthor={Phung Lai and Hai Phan and Li Xiong and Khang Tran and My Thai and Tong Sun and Franck Dernoncourt and Jiuxiang Gu and Nikolaos Barmpalios and Rajiv Jain},\nyear={2022},\nurl={https://openreview.net/forum?id=ZUXZKjfptc9}\n}", "github": "", "project": "", "reviewers": "yGHK;S5g9;Sanq;UqE3", "site": "https://openreview.net/forum?id=ZUXZKjfptc9", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;0;2", "wc_summary_paper": "78;85;48;33", "wc_summary_review": "45;62;25;59", "wc_main_review": "361;372;381;396", "wc_review": "484;519;454;488",
"wc_reply_reviewers": "141;102;18;157", "wc_reply_authors": "1192;796;2049;1462", "reply_reviewers": "1;1;1;1", "reply_authors": "3;3;5;4", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 61.0, 21.319005605327842 ], "wc_summary_review_avg": [ 47.75, 14.618053906043718 ], "wc_main_review_avg": [ 377.5, 12.816005617976296 ], "wc_review_avg": [ 486.25, 23.025800746119558 ], "wc_reply_reviewers_avg": [ 104.5, 53.79823417176441 ], "wc_reply_authors_avg": [ 1374.75, 455.67936918407884 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14553217584569473667&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;2;3;3;3;3;3", "aff_unique_norm": "New Jersey Institute of Technology;Emory University;University of Florida;Adobe", "aff_unique_dep": ";;;Adobe Systems Incorporated", "aff_unique_url": "https://www.njit.edu;https://www.emory.edu;https://www.ufl.edu;https://www.adobe.com", "aff_unique_abbr": "NJIT;Emory;UF;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "ZUinrZwKnHb", "title": "Attend to Who You Are: Supervising Self-Attention for Keypoint Detection and Instance-Aware Association", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bottom-up multi-person pose estimation models need to detect keypoints and learn associative information between keypoints. \nWe argue that these problems can be entirely solved by the Transformer model. Specifically, the self-attention in Transformer measures the pairwise dependencies between locations, which can play a role in providing association information for keypoints grouping.\nHowever, the naive attention patterns are still not subjectively controlled, so there is no guarantee that the keypoints will always attend to the instances to which they belong.\nTo address it we propose a novel approach of multi-person keypoint detection and instance association using instance masks to supervise self-attention. By supervising self-attention to be instance-aware, we can assign the detected keypoints to the correct human instances based on the pairwise attention scores, without using pre-defined offset vector fields or embedding like CNN-based bottom-up models. 
An additional benefit of our method is that the instance segmentation results of any number of people can be directly obtained from the supervised attention matrix, thereby simplifying the pixel assignment pipeline.\nThe experiments on the COCO multi-person keypoint detection challenge and person instance segmentation task demonstrate the effectiveness and simplicity of the proposed method.", "keywords": "human pose estimation;bottom-up;self-attention;transformer;instance segmentation", "primary_area": "", "supplementary_material": "", "author": "Sen Yang;Zhicheng Wang;Ze Chen;Yanjie Li;Shoukui Zhang;Zhibin Quan;Shu-Tao Xia;Yiping Bao;Erjin Zhou;Wankou Yang", "authorids": "~Sen_Yang3;~Zhicheng_Wang3;~Ze_Chen1;~Yanjie_Li1;~Shoukui_Zhang1;~Zhibin_Quan1;~Shu-Tao_Xia1;~Yiping_Bao1;~Erjin_Zhou1;~Wankou_Yang1", "gender": "M;M;;M;;;M;M;;M", "homepage": ";;;;;;https://www.sigs.tsinghua.edu.cn/xst/list.htm;;;https://automation.seu.edu.cn/ywk/list.htm", "dblp": ";;;;;;03/6195;;;99/3602", "google_scholar": "z5O3DLcAAAAJ;0QBBNGoAAAAJ;;8xZwThIAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=koAXTXgAAAAJ;EB9_W4kAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;;;0000-0002-8639-982X;;;", "linkedin": ";;;;;;;;;", "or_profile": "~Sen_Yang3;~Zhicheng_Wang3;~Ze_Chen1;~Yanjie_Li1;~Shoukui_Zhang1;~Zhibin_Quan1;~Shu-Tao_Xia1;~Yiping_Bao1;~Erjin_Zhou1;~Wankou_Yang1", "aff": "Tencent Inc.;Nreal;;Tsinghua University;Sankuai Technology Inc.;;Shenzhen International Graduate School, Tsinghua University;;;Southeast University", "aff_domain": "tencent.com;nreal.ai;;tsinghua.edu.cn;meituan.com;;sz.tsinghua.edu.cn;;;seu.edu.cn", "position": "Intern;Principal Researcher;;MS student;Researcher;;Full Professor;;;Full Professor", "bibtex": "@misc{\nyang2022attend,\ntitle={Attend to Who You Are: Supervising Self-Attention for Keypoint Detection and Instance-Aware Association},\nauthor={Sen Yang and Zhicheng Wang and Ze Chen and Yanjie Li and Shoukui Zhang and Zhibin Quan and Shu-Tao Xia and Yiping Bao and Erjin Zhou and Wankou Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=ZUinrZwKnHb}\n}", "github": "", "project": "", "reviewers": "YSvp;cESx;mi8h;oDDC;PtRJ", "site": "https://openreview.net/forum?id=ZUinrZwKnHb", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "4;4;4;4;3", "correctness": "2;3;4;3;3", "technical_novelty": "3;2;2;2;3", "empirical_novelty": "2;3;2;2;2", "wc_summary_paper": "91;49;105;124;177", "wc_summary_review": "21;9;13;54;52", "wc_main_review": "364;152;184;356;509", "wc_review": "476;210;302;534;738", "wc_reply_reviewers": "81;0;0;229;106", "wc_reply_authors": "598;420;442;651;767", "reply_reviewers": "1;0;0;1;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 109.2, 41.92564847441241 ], "wc_summary_review_avg": [ 29.8, 19.343215865000317 ], "wc_main_review_avg": [ 313.0, 130.71189693367623 ], "wc_review_avg": [ 452.0, 184.4776409216033 ], "wc_reply_reviewers_avg": [ 83.2, 84.411847509695 ], "wc_reply_authors_avg": [ 575.6, 130.29290080430323 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.6123724356957946, 
"corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10413214431065134893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;2;4", "aff_unique_norm": "Tencent;Nreal;Tsinghua University;Sankuai Technology Inc.;Southeast University", "aff_unique_dep": "Tencent;;;;", "aff_unique_url": "https://www.tencent.com;https://www.nreal.ai;https://www.tsinghua.edu.cn;https://www.sankuai.com;https://www.seu.edu.cn/", "aff_unique_abbr": "Tencent;Nreal;THU;Sankuai;SEU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "ZV3PZXrRDQ", "title": "Towards a Game-Theoretic View of Baseline Values in the Shapley Value", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper aims to formulate the problem of estimating optimal baseline values, which are used to compute the Shapley value in game theory. In the computation of Shapley values, people usually set an input variable to its baseline value to represent the absence of this variable. However, there are no studies on how to ensure that baseline values represent the absence states of variables without bringing in additional information, which ensures the trustworthiness of the Shapley value. To this end, previous studies usually determine baseline values in an empirical manner, which are not reliable. Therefore, we revisit the feature representation of a deep model in game theory, and formulate the absence state of an input variable. From the perspective of game-theoretic interaction, we learn the optimal baseline value of each input variable. Experimental results have demonstrated the effectiveness of our method. 
The code will be released when the paper is accepted.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jie Ren;Zhanpeng Zhou;Qirui Chen;Quanshi Zhang", "authorids": "~Jie_Ren1;~Zhanpeng_Zhou1;~Qirui_Chen1;~Quanshi_Zhang1", "gender": "F;M;;M", "homepage": "https://jie-ren.github.io/;https://zzp1012.github.io/;;http://qszhang.com", "dblp": "r/JieRen-18;;;http://dblp.uni-trier.de/pers/hd/z/Zhang:Quanshi", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;idxXY3UAAAAJ;;iFFhHK0AAAAJ", "orcid": "0000-0001-9918-3000;;;", "linkedin": ";;;", "or_profile": "~Jie_Ren1;~Zhanpeng_Zhou1;~Qirui_Chen1;~Quanshi_Zhang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;;sjtu.edu.cn", "position": "PhD student;Undergrad student;;Associate Professor", "bibtex": "@misc{\nren2022towards,\ntitle={Towards a Game-Theoretic View of Baseline Values in the Shapley Value},\nauthor={Jie Ren and Zhanpeng Zhou and Qirui Chen and Quanshi Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=ZV3PZXrRDQ}\n}", "github": "", "project": "", "reviewers": "wtdf;zZtc;J4Bu;7PfT", "site": "https://openreview.net/forum?id=ZV3PZXrRDQ", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;2", "correctness": "4;2;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "59;116;119;108", "wc_summary_review": "8;87;71;23", "wc_main_review": "403;518;667;163", "wc_review": "470;721;857;294", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.5, 24.295061226512683 ], "wc_summary_review_avg": [ 47.25, 32.683137854251385 ], "wc_main_review_avg": [ 437.75, 184.18112688329387 ], "wc_review_avg": [ 585.5, 218.16564807503494 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": -0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RJgKC9s5OboJ:scholar.google.com/&scioq=Towards+a+Game-Theoretic+View+of+Baseline+Values+in+the+Shapley+Value&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "ZV7MoEj44Et", "title": "Measuring the Effectiveness of Self-Supervised Learning using Calibrated Learning Curves", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Self-supervised learning has witnessed remarkable progress in recent years, in particular with the introduction of augmentation-based contrastive methods. While a number of large-scale empirical studies on the performance of self-supervised pre-training have been conducted, there isn't yet an agreed upon set of control baselines, evaluation practices, and metrics to report. 
We identify this as an important angle of investigation and propose an evaluation standard that aims to quantify and communicate transfer learning performance in an informative yet accessible setup. This is done by baking in a number of key control baselines in the evaluation method, particularly the blind guess (quantifying the dataset bias), the scratch model (quantifying the architectural contribution), and the gold standard (quantifying the upper-bound). We further provide a number of experiments to demonstrate how the proposed evaluation can be employed in empirical studies of basic questions -- for example, whether the effectiveness of existing self-supervised learning methods is skewed towards image classification versus other tasks, such as dense pixel-wise predictions. \n", "keywords": "Self-Supervised Learning;Transfer Learning;Metric;Evaluation", "primary_area": "", "supplementary_material": "", "author": "Andrei Atanov;Shijian Xu;Onur Beker;Andrey Filatov;Amir Zamir", "authorids": "~Andrei_Atanov1;~Shijian_Xu1;onur.beker@epfl.ch;~Andrey_Filatov1;~Amir_Zamir1", "gender": "M;M;;;M", "homepage": "https://andrewatanov.github.io;https://shijianxu.github.io/;;;https://amirzamir.com/", "dblp": "215/4857;166/5100;;;76/8610", "google_scholar": "https://scholar.google.ru/citations?user=XriU_R8AAAAJ;;;;RKjEFukAAAAJ", "orcid": ";;;;", "linkedin": ";;;andrei-filatov;", "or_profile": "~Andrei_Atanov1;~Shijian_Xu1;onur.beker@epfl.ch;~Andrey_Filatov1;~Amir_Zamir1", "aff": "Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology Lausanne;;Skolkovo Institute of Science and Technology;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;epfl.ch;;skoltech.ru;epfl.ch", "position": "PhD student;MS student;;MS student;Assistant Professor", "bibtex": "@misc{\natanov2022measuring,\ntitle={Measuring the Effectiveness of Self-Supervised Learning using Calibrated Learning Curves},\nauthor={Andrei Atanov and Shijian Xu and Onur Beker and Andrey Filatov and Amir Zamir},\nyear={2022},\nurl={https://openreview.net/forum?id=ZV7MoEj44Et}\n}", "github": "", "project": "", "reviewers": "bGT2;hWBB;7aPs;ZaD4", "site": "https://openreview.net/forum?id=ZV7MoEj44Et", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;2", "correctness": "2;3;4;3", "technical_novelty": "1;2;2;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "65;26;54;45", "wc_summary_review": "46;36;54;38", "wc_main_review": "448;102;283;170", "wc_review": "559;164;391;253", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 47.5, 14.291605927956452 ], "wc_summary_review_avg": [ 43.5, 7.123903424387503 ], "wc_main_review_avg": [ 250.75, 130.95299729292185 ], "wc_review_avg": [ 341.75, 149.2437184607781 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:eorQLtrETRwJ:scholar.google.com/&scioq=Measuring+the+Effectiveness+of+Self-Supervised+Learning+using+Calibrated+Learning+Curves&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Skolkovo Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.skoltech.ru", "aff_unique_abbr": "EPFL;Skoltech", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Switzerland;Russian Federation" }, { "id": "ZVqsBl2HapR", "title": "Error-based or target-based? A unifying framework for learning in recurrent spiking networks", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Learning in biological or artificial networks means changing the laws governing the network dynamics in order to better behave in a specific situation. In the field of supervised learning, two complementary approaches stand out: error-based and target-based learning. However, there exists no consensus on which is better suited for which task, and what is the most biologically plausible. Here we propose a comprehensive theoretical framework that includes these two frameworks as special cases. This novel theoretical formulation offers major insights into the differences between the two approaches. In particular, we show how target-based naturally emerges from error-based when the number of constraints on the target dynamics, and as a consequence on the internal network dynamics, is comparable to the degrees of freedom of the network. Moreover, given the experimental evidences on the relevance that spikes have in biological networks, we investigate the role of coding with specific patterns of spikes by introducing a parameter that defines the tolerance to precise spike timing during learning. Our approach naturally lends itself to Imitation Learning (and Behavioral Cloning in particular) and we apply it to solve relevant closed-loop tasks such as the button-and-food task, and the 2D Bipedal Walker. We show that a high dimensionality feedback structure is extremely important when it is necessary to solve a task that requires retaining memory for a long time (button-and-food). On the other hand, we find that coding with specific patterns of spikes enables optimal performances in a motor task (the 2D Bipedal Walker). 
Finally, we show that our theoretical formulation suggests protocols to deduce the structure of learning feedback in biological networks.", "keywords": "target-based;error-based;recurrent neural network;spiking neural network", "primary_area": "", "supplementary_material": "/attachment/c4370f1cef559c0b196000cff113ad63e4a830e4.zip", "author": "Cristiano Capone;Paolo Muratore;Pier Stanislao Paolucci", "authorids": "~Cristiano_Capone1;~Paolo_Muratore1;~Pier_Stanislao_Paolucci1", "gender": "M;M;", "homepage": ";;https://sites.google.com/site/pierstanislaopaolucci/", "dblp": "228/9143;321/1726;95/2151.html", "google_scholar": "wyHFdf4AAAAJ;HGEzIUcAAAAJ;https://scholar.google.it/citations?user=jhvLaT8AAAAJ", "orcid": ";0000-0003-4520-5950;0000-0003-1937-6086", "linkedin": ";;", "or_profile": "~Cristiano_Capone1;~Paolo_Muratore1;~Pier_Stanislao_Paolucci1", "aff": "INFN;SISSA/ISAS;INFN (Istituto Nazionale di Fisica Nucleare)", "aff_domain": "infn.it;sissa.it;infn.it", "position": "Postdoc;PhD student;Principal Researcher", "bibtex": "@misc{\ncapone2022errorbased,\ntitle={Error-based or target-based? A unifying framework for learning in recurrent spiking networks},\nauthor={Cristiano Capone and Paolo Muratore and Pier Stanislao Paolucci},\nyear={2022},\nurl={https://openreview.net/forum?id=ZVqsBl2HapR}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=ZVqsBl2HapR", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7190654748846958688&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;0", "aff_unique_norm": "Istituto Nazionale di Fisica Nucleare;Scuola Internazionale Superiore di Studi Avanzati", "aff_unique_dep": ";", "aff_unique_url": "https://www.infn.it;https://www.sissa.it", "aff_unique_abbr": "INFN;SISSA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "id": "ZWjEkv9rjo", "title": "Lattice Quantization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Low bit quantization of weights in increasingly large deep convolutional neural networks (DCNNs) can be critical for their implementation in memory constrained hardware systems. Post-training quantization consists in quantizing a model without retraining, which is user-friendly, fast and data frugal. In this paper, we propose LatticeQ, a new post-training weight quantization method designed for DCNNs. 
Instead of the standard scalar rounding widely used in state-of-the-art quantization methods, LatticeQ uses a quantizer based on lattices - discrete algebraic structures - which we show are able to exploit the inner correlations between the model parameters. LatticeQ allows us to achieve state-of-the-art results in post-training quantization, enabling us to approach full precision accuracies for bitwidths previously not accessible to post-training quantization methods. In particular, we achieve ImageNet classification results close to full precision on the popular Resnet-18/50, with only 0.5% and 5% accuracy drops for the 4-bit and 3-bit weight model architectures, respectively.", "keywords": "Convolutional neural networks;quantization;post-training", "primary_area": "", "supplementary_material": "", "author": "Cl\u00e9ment Metz;Thibault Allenet;Johannes Christian Thiele;Antoine Dupret;Olivier BICHLER", "authorids": "~Cl\u00e9ment_Metz1;~Thibault_Allenet1;~Johannes_Christian_Thiele1;antoine.dupret@cea.fr;~Olivier_BICHLER1", "gender": "M;;M;;M", "homepage": ";;;;", "dblp": ";;;;75/9434", "google_scholar": ";;;;https://scholar.google.fr/citations?user=mocqL0kAAAAJ", "orcid": ";;;;", "linkedin": "clement-metz-231b8a1a8/;;;;", "or_profile": "~Cl\u00e9ment_Metz1;~Thibault_Allenet1;~Johannes_Christian_Thiele1;antoine.dupret@cea.fr;~Olivier_BICHLER1", "aff": "CEA;CEA;CEA;;CEA", "aff_domain": "cea.fr;cea.fr;cea.fr;;cea.fr", "position": "PhD student;PhD student;PhD student;;Researcher", "bibtex": "@misc{\nmetz2022lattice,\ntitle={Lattice Quantization},\nauthor={Cl{\\'e}ment Metz and Thibault Allenet and Johannes Christian Thiele and Antoine Dupret and Olivier BICHLER},\nyear={2022},\nurl={https://openreview.net/forum?id=ZWjEkv9rjo}\n}", "github": "", "project": "", "reviewers": "ZUQP;Vo3A;Voe8", "site": "https://openreview.net/forum?id=ZWjEkv9rjo", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "2;2;4", "technical_novelty": "1;3;3", "empirical_novelty": "1;3;2", "wc_summary_paper": "81;50;94", "wc_summary_review": "38;22;24", "wc_main_review": "467;389;163", "wc_review": "586;461;281", "wc_reply_reviewers": "0;0;279", "wc_reply_authors": "125;497;157", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 75.0, 18.457157599876172 ], "wc_summary_review_avg": [ 28.0, 7.118052168020874 ], "wc_main_review_avg": [ 339.6666666666667, 128.91685520348204 ], "wc_review_avg": [ 442.6666666666667, 125.18874638809281 ], "wc_reply_reviewers_avg": [ 93.0, 131.52186130069785 ], "wc_reply_authors_avg": [ 259.6666666666667, 168.32772267877394 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17293715891392282647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Commissariat \u00e0 l'\u00c9nergie Atomique et aux \u00c9nergies Alternatives", "aff_unique_dep": "", "aff_unique_url": "https://www.cea.fr", "aff_unique_abbr": "CEA", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "id": "ZWykq5n4zx", "title": "Boosting the Confidence of Near-Tight Generalization Bounds for Uniformly Stable Randomized Algorithms", "track": "main", "status": "Reject", "tldr": "", "abstract": "High probability generalization bounds of uniformly stable learning algorithms have recently been actively studied with a series of near-tight results established by~\\citet{feldman2019high,bousquet2020sharper}. However, for randomized algorithms with on-average uniform stability, such as stochastic gradient descent (SGD) with time decaying learning rates, it still remains less well understood if these deviation bounds still hold with high confidence over the internal randomness of algorithm. This paper addresses this open question and makes progress towards answering it inside a classic framework of confidence-boosting. To this end, we first establish an in-expectation first moment generalization error bound for randomized learning algorithm with on-average uniform stability, based on which we then show that a properly designed subbagging process leads to near-tight high probability generalization bounds over the randomness of data and algorithm. We further substantialize these generic results to SGD to derive improved high probability generalization bounds for convex or non-convex optimization with natural time decaying learning rates, which have not been possible to prove with the existing uniform stability results. Specially for deterministic uniformly stable algorithms, our confidence-boosting results improve upon the best known generalization bounds in terms of a logarithmic factor on sample size, which moves a step forward towards resolving an open question raised by~\\citet{bousquet2020sharper}.", "keywords": "Uniform stability;Randomized learning algorithms;Bagging;Generalization bounds;Stochastic gradient methods", "primary_area": "", "supplementary_material": "", "author": "Xiaotong Yuan;Ping Li", "authorids": "~Xiaotong_Yuan1;~Ping_Li3", "gender": "M;M", "homepage": "https://sites.google.com/site/xtyuan1980/;http://www.stat.rutgers.edu/home/pingli/", "dblp": "64/5926;62/5860-1", "google_scholar": "yzU6g24AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Xiaotong_Yuan1;~Ping_Li3", "aff": ";LinkedIn", "aff_domain": ";linkedin.com", "position": ";Engineer", "bibtex": "@misc{\nyuan2022boosting,\ntitle={Boosting the Confidence of Near-Tight Generalization Bounds for Uniformly Stable Randomized Algorithms},\nauthor={Xiaotong Yuan and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=ZWykq5n4zx}\n}", "github": "", "project": "", "reviewers": "Ufmg;LQgS;jM1w;F7yX", "site": "https://openreview.net/forum?id=ZWykq5n4zx", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;4;2;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;2;4", "empirical_novelty": "0;3;0;0", "wc_summary_paper": "91;73;84;98", "wc_summary_review": "39;23;54;26", "wc_main_review": "189;228;54;198", "wc_review": "319;324;192;322", "wc_reply_reviewers": "0;0;0;25", "wc_reply_authors": "422;434;114;434", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 86.5, 9.233092656309694 ], "wc_summary_review_avg": [ 35.5, 
12.257650672131263 ], "wc_main_review_avg": [ 167.25, 66.96034274105831 ], "wc_review_avg": [ 289.25, 56.17550622824862 ], "wc_reply_reviewers_avg": [ 6.25, 10.825317547305483 ], "wc_reply_authors_avg": [ 351.0, 136.91968448692833 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.2581988897471611, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gaxhlxyT7DMJ:scholar.google.com/&scioq=Boosting+the+Confidence+of+Near-Tight+Generalization+Bounds+for+Uniformly+Stable+Randomized+Algorithms&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "LinkedIn Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.linkedin.com", "aff_unique_abbr": "LinkedIn", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "ZaI7Rd11G4S", "title": "Embedding Compression with Hashing for Efficient Representation Learning in Graph", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) are deep learning models designed specifically for graph data, and they typically rely on node features as the input node representation to the first layer. When applying such networks to graphs without node features, one can extract simple graph-based node features (e.g., node degree) or learn the input node representation (i.e., embeddings) when training the network. While the latter approach, which trains node embeddings, is more likely to lead to better performance, the number of parameters associated with the embeddings grows linearly with the number of nodes. It is therefore impractical to train the input node embeddings together with GNNs within graphics processing unit (GPU) memory in an end-to-end fashion when dealing with industrial-scale graph data. Inspired by the embedding compression methods developed for natural language processing (NLP) models, we develop a node embedding compression method where each node is compactly represented with a bit vector instead of a floating-point vector. The parameters utilized in the compression method can be trained together with GNNs. 
We show that the proposed node embedding compression method achieves superior performance compared to the alternatives.", "keywords": "embedding compression;hashing;graph", "primary_area": "", "supplementary_material": "", "author": "Chin-Chia Michael Yeh;Mengting Gu;Yan Zheng;Huiyuan Chen;Javid Ebrahimi;Zhongfang Zhuang;Junpeng Wang;Liang Wang;Wei Zhang", "authorids": "~Chin-Chia_Michael_Yeh1;~Mengting_Gu1;~Yan_Zheng2;~Huiyuan_Chen1;~Javid_Ebrahimi1;zzhuang@visa.com;~Junpeng_Wang1;~Liang_Wang11;~Wei_Zhang52", "gender": "Unspecified;F;F;M;;;M;M;M", "homepage": "https://mcyeh.github.io/;;https://usa.visa.com/about-visa/visa-research/yan-zheng.html;;;;https://junpengw.github.io/;;", "dblp": "117/5435;;10/2381-1;204/5464;116/5290;;172/6642-1;;10/4661-189.html", "google_scholar": "F4d7Sv4AAAAJ;EB0q5vgAAAAJ;fCDg0VQAAAAJ;j3y4dJwAAAAJ;kiXM564AAAAJ;;6_6MH5wAAAAJ;SNToU-gAAAAJ;", "orcid": "0000-0002-9807-2963;;;0000-0002-6360-558X;;;0000-0002-1130-9914;;", "linkedin": ";;;;;;;liang-wang-7472712/;zhangwei0119/", "or_profile": "~Chin-Chia_Michael_Yeh1;~Mengting_Gu1;~Yan_Zheng2;~Huiyuan_Chen1;~Javid_Ebrahimi1;zzhuang@visa.com;~Junpeng_Wang1;~Liang_Wang11;~Wei_Zhang52", "aff": "VISA;;VISA;;VISA;;VISA;VISA;VISA", "aff_domain": "visa.com;;visa.com;;visa.com;;visa.com;visa.com;visa.com", "position": "Research Scientist;;Principal Researcher;;Researcher;;Researcher;Principal Scientist;Principal Researcher", "bibtex": "@misc{\nyeh2022embedding,\ntitle={Embedding Compression with Hashing for Efficient Representation Learning in Graph},\nauthor={Chin-Chia Michael Yeh and Mengting Gu and Yan Zheng and Huiyuan Chen and Javid Ebrahimi and Zhongfang Zhuang and Junpeng Wang and Liang Wang and Wei Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=ZaI7Rd11G4S}\n}", "github": "", "project": "", "reviewers": "J5SK;hQPL;WEHB;u7vW", "site": "https://openreview.net/forum?id=ZaI7Rd11G4S", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;3;3", "correctness": "4;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;0;4", "wc_summary_paper": "89;66;95;73", "wc_summary_review": "27;32;68;101", "wc_main_review": "189;170;420;408", "wc_review": "305;268;583;582", "wc_reply_reviewers": "0;0;8;0", "wc_reply_authors": "413;541;971;689", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 80.75, 11.712706775122479 ], "wc_summary_review_avg": [ 57.0, 29.924906014890006 ], "wc_main_review_avg": [ 296.75, 117.5188814616613 ], "wc_review_avg": [ 434.5, 148.57742089563945 ], "wc_reply_reviewers_avg": [ 2.0, 3.4641016151377544 ], "wc_reply_authors_avg": [ 653.5, 207.70351465490418 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11899773386583455010&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "VISA", "aff_unique_dep": "", "aff_unique_url": "https://www.visa.com", "aff_unique_abbr": "VISA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "FedChain: 
Chained Algorithms for Near-optimal Communication Cost in Federated Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6628", "id": "ZaVVVlcdaN", "poster": "", "openreview": "https://openreview.net/forum?id=ZaVVVlcdaN", "slides": "https://iclr.cc/virtual/2022/poster/6628", "video": "https://iclr.cc/virtual/2022/poster/6628", "author_site": "Charlie Hou, Kiran Thekumparampil, Giulia Fanti, Sewoong Oh", "tldr": "", "abstract": "Federated learning (FL) aims to minimize the communication complexity of training a model over heterogeneous data distributed across many clients. A common approach is local methods, where clients take multiple optimization steps over local data before communicating with the server (e.g., FedAvg). Local methods can exploit similarity between clients' data. However, in existing analyses, this comes at the cost of slow convergence in terms of the dependence on the number of communication rounds R. On the other hand, global methods, where clients simply return a gradient vector in each round (e.g., SGD), converge faster in terms of R but fail to exploit the similarity between clients even when clients are homogeneous. We propose FedChain, an algorithmic framework that combines the strengths of local methods and global methods to achieve fast convergence in terms of R while leveraging the similarity between clients. Using FedChain, we instantiate algorithms that improve upon previously known rates in the general convex and PL settings, and are near-optimal (via an algorithm-independent lower bound that we show) for problems that satisfy strong convexity. Empirical results support this theoretical gain over existing methods. ", "keywords": "Federated Learning;Optimization;Distributed Optimization", "primary_area": "", "supplementary_material": "", "author": "Charlie Hou;Kiran Koshy Thekumparampil;Giulia Fanti;Sewoong Oh", "authorids": "~Charlie_Hou1;~Kiran_Koshy_Thekumparampil1;~Giulia_Fanti1;~Sewoong_Oh1", "gender": ";M;;M", "homepage": "https://www.andrew.cmu.edu/user/charlieh/;http://thekump2.web.engr.illinois.edu;https://www.andrew.cmu.edu/user/gfanti/;https://homes.cs.washington.edu/~sewoong/", "dblp": ";142/2840;141/9910;80/4366", "google_scholar": "92wmC6gAAAAJ;0gJQCIgAAAAJ;Rn_BmTYAAAAJ;55TAOdgAAAAJ", "orcid": ";;0000-0002-7671-2624;", "linkedin": "charlie-hou-027a19113/;;;", "or_profile": "~Charlie_Hou1;~Kiran_Koshy_Thekumparampil1;~Giulia_Fanti1;~Sewoong_Oh1", "aff": "Carnegie Mellon University;University of Illinois, Urbana Champaign;Carnegie Mellon University;University of Washington", "aff_domain": "andrew.cmu.edu;illinois.edu;andrew.cmu.edu;uw.edu", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nhou2022fedchain,\ntitle={FedChain: Chained Algorithms for Near-optimal Communication Cost in Federated Learning},\nauthor={Charlie Hou and Kiran Koshy Thekumparampil and Giulia Fanti and Sewoong Oh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZaVVVlcdaN}\n}", "github": "", "project": "", "reviewers": "y87n;ZoaG;AN2w;BBMC", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;3;1", "wc_summary_paper": "84;213;22;61", "wc_summary_review": "68;31;39;57", "wc_main_review": "550;646;644;851", "wc_review": "702;890;705;969", "wc_reply_reviewers": "0;55;0;0", "wc_reply_authors": "976;1438;491;1574", 
"reply_reviewers": "0;1;0;0", "reply_authors": "3;3;1;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 95.0, 71.64146843832837 ], "wc_summary_review_avg": [ 48.75, 14.566657131957214 ], "wc_main_review_avg": [ 672.75, 109.98039598037461 ], "wc_review_avg": [ 816.5, 116.40554110522402 ], "wc_reply_reviewers_avg": [ 13.75, 23.81569860407206 ], "wc_reply_authors_avg": [ 1119.75, 425.3283290588578 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14527378219360321519&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=ZaVVVlcdaN", "email": "andrew.cmu.edu;illinois.edu;andrew.cmu.edu;uw.edu", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Carnegie Mellon University;University of Illinois Urbana-Champaign;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://illinois.edu;https://www.washington.edu", "aff_unique_abbr": "CMU;UIUC;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Zae_OHNq-y", "title": "Imbalanced Adversarial Training with Reweighting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial training has been empirically proven to be one of the most effective and reliable defense methods against adversarial attacks. However, the majority of existing studies are focused on balanced datasets, where each class has a similar amount of training examples. Research on adversarial training with imbalanced training datasets is rather limited. As the initial effort to investigate this problem, we reveal the facts that adversarially trained models present two distinguished behaviors from naturally trained models in imbalanced datasets: (1) Compared to natural training, adversarially trained models can suffer much worse performance on under-represented classes, when the training dataset is extremely imbalanced. (2) Traditional reweighting strategies may lose efficacy to deal with the imbalance issue for adversarial training. For example, upweighting under-represented classes will drastically hurt the model\u2019s performance on well-represented classes, and as a result, finding an optimal reweighting value can be tremendously challenging. In this paper, to further understand our observations, we theoretically show that the poor data separability is one key reason causing this strong tension between under-represented and well-represented classes. Motivated by this finding, we propose Separable Reweighted Adversarial Training (SRAT) to facilitate adversarial training under imbalanced scenarios, by learning more separable features for different classes. 
Extensive experiments on various datasets verify the effectiveness of the proposed framework.", "keywords": "imbalanced data;robustness;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Wentao Wang;Han Xu;Xiaorui Liu;Yaxin Li;Bhavani Thuraisingham;Jiliang Tang", "authorids": "~Wentao_Wang3;~Han_Xu1;~Xiaorui_Liu1;~Yaxin_Li3;~Bhavani_Thuraisingham1;~Jiliang_Tang1", "gender": ";M;M;;F;M", "homepage": "http://www.cse.msu.edu/~wangw116/;https://cse.msu.edu/~xuhan1/;https://sites.google.com/ncsu.edu/xiaorui/;http://cse.msu.edu/~liyaxin1/;http://www.utdallas.edu/~bhavani.thuraisingham/;https://www.cse.msu.edu/~tangjili/", "dblp": ";32/34-2;172/0995;;t/BMThuraisingham;64/10812", "google_scholar": ";mX2rL3IAAAAJ;NhvN1KoAAAAJ;;https://scholar.google.com.tw/citations?user=o_xUNWkAAAAJ;WtzKMWAAAAAJ", "orcid": ";0000-0002-4016-6748;0000-0001-8217-5688;;0000-0003-4653-2080;0000-0001-7125-3898", "linkedin": ";;;;dr-bhavani-thuraisingham-aka-dr-bhavani-75305127/;", "or_profile": "~Wentao_Wang3;~Han_Xu1;~Xiaorui_Liu1;~Yaxin_Li3;~Bhavani_Thuraisingham1;~Jiliang_Tang1", "aff": "Michigan State University;VISA;Michigan State University;Michigan State University;University of Texas at Dallas;Michigan State University", "aff_domain": "msu.edu;visa.com;msu.edu;msu.edu;utd.edu;msu.edu", "position": "PhD student;Intern;PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@misc{\nwang2022imbalanced,\ntitle={Imbalanced Adversarial Training with Reweighting},\nauthor={Wentao Wang and Han Xu and Xiaorui Liu and Yaxin Li and Bhavani Thuraisingham and Jiliang Tang},\nyear={2022},\nurl={https://openreview.net/forum?id=Zae_OHNq-y}\n}", "github": "", "project": "", "reviewers": "9erX;mB3v;2Vzh;6i9F", "site": "https://openreview.net/forum?id=Zae_OHNq-y", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;4", "wc_summary_paper": "89;86;16;89", "wc_summary_review": "63;199;29;31", "wc_main_review": "529;452;453;142", "wc_review": "681;737;498;262", "wc_reply_reviewers": "82;0;0;7", "wc_reply_authors": "1378;693;763;182", "reply_reviewers": "1;0;0;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 70.0, 31.20096152364539 ], "wc_summary_review_avg": [ 80.5, 69.73342096871485 ], "wc_main_review_avg": [ 394.0, 148.80692188201462 ], "wc_review_avg": [ 544.5, 185.51078135784994 ], "wc_reply_reviewers_avg": [ 22.25, 34.61484508126535 ], "wc_reply_authors_avg": [ 754.0, 424.3707105821513 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=44205133538750333&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0;2;0", "aff_unique_norm": "Michigan State University;VISA;University of Texas at Dallas", "aff_unique_dep": ";;", "aff_unique_url": "https://www.msu.edu;https://www.visa.com;https://www.utdallas.edu", "aff_unique_abbr": "MSU;VISA;UT Dallas", "aff_campus_unique_index": "1", "aff_campus_unique": ";Dallas", "aff_country_unique_index": "0;0;0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "Zca3NK3X8G", "title": "WaveCorr: Deep Reinforcement Learning with Permutation Invariant Policy Networks for Portfolio Management", "track": "main", "status": "Reject", "tldr": "", "abstract": "The problem of portfolio management represents an important and challenging class of dynamic decision making problems, where rebalancing decisions need to be made over time with the consideration of many factors such as investors\u2019 preferences, trading environment, and market conditions. In this paper, we present a new portfolio policy network architecture for deep reinforcement learning (DRL) that can exploit more effectively cross-asset dependency information and achieve better performance than state-of-the-art architectures. In doing so, we introduce a new form of permutation invariance property for policy networks and derive general theory for verifying its applicability. Our portfolio policy network, named WaveCorr, is the first convolutional neural network architecture that preserves this invariance property when treating asset correlation information. Finally, in a set of experiments conducted using data from both Canadian (TSX) and American stock markets (S\\&P 500), WaveCorr consistently outperforms other architectures with an impressive 3\\%-25\\% absolute improvement in terms of average annual return, and up to more than 200\\% relative improvement in average Sharpe ratio. We also measured an improvement of a factor of up to 5 in the stability of performance under random choices of initial asset ordering and weights. The stability of the network has been found as particularly valuable by our industrial partner.", "keywords": "permutation invariance;portfolio management;deep reinforcement learning;policy network", "primary_area": "", "supplementary_material": "/attachment/18c609cb73905ed31f3456ae645ea4019df91978.zip", "author": "Saeed Marzban;Erick Delage;Jonathan Li", "authorids": "~Saeed_Marzban1;~Erick_Delage2;~Jonathan_Li1", "gender": "M;M;", "homepage": ";http://web.hec.ca/pages/erick.delage/;", "dblp": ";26/1546;85/6906", "google_scholar": ";https://scholar.google.ca/citations?user=ciH2ROgAAAAJ;", "orcid": ";0000-0002-6740-3600;", "linkedin": "saeed-marzban-07891a56/;erick-delage-2105361/?originalSubdomain=ca;", "or_profile": "~Saeed_Marzban1;~Erick_Delage2;~Jonathan_Li1", "aff": "HEC Montreal;Computer Science Department;University of Ottawa", "aff_domain": "hec.ca;cs.stanford.edu;uottawa.ca", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@misc{\nmarzban2022wavecorr,\ntitle={WaveCorr: Deep Reinforcement Learning with Permutation Invariant Policy Networks for Portfolio Management},\nauthor={Saeed Marzban and Erick Delage and Jonathan Li},\nyear={2022},\nurl={https://openreview.net/forum?id=Zca3NK3X8G}\n}", "github": "", "project": "", "reviewers": "iRgh;4KRB;y3fW;ruHv", "site": "https://openreview.net/forum?id=Zca3NK3X8G", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "2;3;3;3", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "89;71;94;46", "wc_summary_review": "31;33;22;5", "wc_main_review": "244;219;282;115", "wc_review": "364;323;398;166", "wc_reply_reviewers": "74;0;0;0", "wc_reply_authors": "742;766;321;341", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 
], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 75.0, 18.801595676963167 ], "wc_summary_review_avg": [ 22.75, 11.053845484717073 ], "wc_main_review_avg": [ 215.0, 61.9394865978077 ], "wc_review_avg": [ 312.75, 88.79013177149812 ], "wc_reply_reviewers_avg": [ 18.5, 32.04293994002423 ], "wc_reply_authors_avg": [ 542.5, 211.78821969127557 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": -0.5773502691896258, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qjh2BayzDG8J:scholar.google.com/&scioq=WaveCorr:+Deep+Reinforcement+Learning+with+Permutation+Invariant+Policy+Networks+for+Portfolio+Management&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "HEC Montreal;Computer Science Department;University of Ottawa", "aff_unique_dep": ";Computer Science;", "aff_unique_url": "https://www.hec.ca;;https://www.uottawa.ca", "aff_unique_abbr": "HEC;;U Ottawa", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada;" }, { "id": "ZeE81SFTsl", "title": "DAdaQuant: Doubly-adaptive quantization for communication-efficient Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Learning (FL) is a powerful technique for training a model on a\nserver with data from several clients in a privacy-preserving manner. In FL,\na server sends the model to every client, who then train the model locally\nand send it back to the server. The server aggregates the updated models and\nrepeats the process for several rounds. FL incurs significant communication\ncosts, in particular when transmitting the updated local models from the\nclients back to the server. Recently proposed algorithms quantize the model\nparameters to efficiently compress FL communication. These algorithms\ntypically have a quantization level that controls the compression factor. We\nfind that dynamic adaptations of the quantization level can boost\ncompression without sacrificing model quality. First, we introduce a\ntime-adaptive quantization algorithm that increases the quantization level\nas training progresses. Second, we introduce a client-adaptive quantization\nalgorithm that assigns each individual client the optimal quantization level\nat every round. Finally, we combine both algorithms into DAdaQuant, the\ndoubly-adaptive quantization algorithm. Our experiments show that DAdaQuant\nconsistently improves client$\\rightarrow$server compression, outperforming\nthe strongest non-adaptive baselines by up to $2.8\\times$.", "keywords": "federated learning;gradient compression;quantization;communication efficiency", "primary_area": "", "supplementary_material": "/attachment/b30f470c792c25f0b454a566db6250cb5bfe89bf.zip", "author": "Robert H\u00f6nig;Yiren Zhao;Robert D. 
Mullins", "authorids": "~Robert_H\u00f6nig2;~Yiren_Zhao2;~Robert_D._Mullins1", "gender": "M;M;M", "homepage": "https://aaronzhao.me;https://www.csat.cam.ac.uk/~rdm34;", "dblp": "https://dblp.uni-trier.de/pers/hd/z/Zhao:Yiren;31/789;305/3644", "google_scholar": "lOOmgEgAAAAJ;zjXO2HMAAAAJ;", "orcid": ";;", "linkedin": "yiren-aaron-zhao-baa8b5116/;;robert-h%C3%B6nig-b19370185/", "or_profile": "~Yiren_Zhao2;~Robert_Mullins1;~Robert_H\u00f6nig1", "aff": "Imperial College London;University of Cambridge;Swiss Federal Institute of Technology", "aff_domain": "ic.ac.uk;cam.ac.uk;ethz.ch", "position": "Assistant Professor;Associate Professor;MS student", "bibtex": "@misc{\nh{\\\"o}nig2022dadaquant,\ntitle={{DA}daQuant: Doubly-adaptive quantization for communication-efficient Federated Learning},\nauthor={Robert H{\\\"o}nig and Yiren Zhao and Robert D. Mullins},\nyear={2022},\nurl={https://openreview.net/forum?id=ZeE81SFTsl}\n}", "github": "", "project": "", "reviewers": "2tGb;eicQ;CKYb;d8ve", "site": "https://openreview.net/forum?id=ZeE81SFTsl", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "44;50;57;69", "wc_summary_review": "28;52;31;45", "wc_main_review": "194;124;449;184", "wc_review": "266;226;537;298", "wc_reply_reviewers": "0;0;261;5", "wc_reply_authors": "609;482;857;609", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.0, 9.300537618869138 ], "wc_summary_review_avg": [ 39.0, 9.874208829065749 ], "wc_main_review_avg": [ 237.75, 124.86868102130333 ], "wc_review_avg": [ 331.75, 121.21545899760476 ], "wc_reply_reviewers_avg": [ 66.5, 112.31317821164176 ], "wc_reply_authors_avg": [ 639.25, 135.98965953336304 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8866206949335731, "corr_recommendation_correctness": 0.0, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13114481689249347847&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2", "aff_unique_norm": "Imperial College London;University of Cambridge;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.imperial.ac.uk;https://www.cam.ac.uk;https://www.ethz.ch", "aff_unique_abbr": "ICL;Cambridge;ETH Zurich", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;Switzerland" }, { "title": "Attacking deep networks with surrogate-based adversarial black-box methods is easy", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6599", "id": "Zf4ZdI4OQPV", "poster": "", "openreview": "https://openreview.net/forum?id=Zf4ZdI4OQPV", "slides": "https://iclr.cc/virtual/2022/poster/6599", "video": "https://iclr.cc/virtual/2022/poster/6599", "author_site": "Nicholas A. Lord, Romain Mueller, Luca Bertinetto", "tldr": "", "abstract": "A recent line of work on black-box adversarial attacks has revived the use of transfer from surrogate models by integrating it into query-based search. 
However, we find that existing approaches of this type underperform their potential, and can be overly complicated besides. Here, we provide a short and simple algorithm which achieves state-of-the-art results through a search which uses the surrogate network's class-score gradients, with no need for other priors or heuristics. The guiding assumption of the algorithm is that the studied networks are in a fundamental sense learning similar functions, and that a transfer attack from one to the other should thus be fairly \"easy\". This assumption is validated by the extremely low query counts and failure rates achieved: e.g. an untargeted attack on a VGG-16 ImageNet network using a ResNet-152 as the surrogate yields a median query count of 6 at a success rate of 99.9%. Code is available at https://github.com/fiveai/GFCS.", "keywords": "adversarial attacks;black-box attacks;network robustness;network analysis", "primary_area": "", "supplementary_material": "", "author": "Nicholas A. Lord;Romain Mueller;Luca Bertinetto", "authorids": "~Nicholas_A._Lord1;~Romain_Mueller1;~Luca_Bertinetto1", "gender": "M;Not Specified;", "homepage": ";http://www.robots.ox.ac.uk/~luca;", "dblp": "206/6157;154/1351;34/183", "google_scholar": "https://scholar.google.co.uk/citations?user=6K_Z_9sAAAAJ;https://scholar.google.it/citations?user=zEy5CTkAAAAJ;https://scholar.google.co.uk/citations?user=WKmXsyEAAAAJ", "orcid": "0000-0002-7483-8434;;", "linkedin": ";lucabertinetto/;", "or_profile": "~Romain_Mueller1;~Luca_Bertinetto1;~Nicholas_A_Lord1", "aff": "FiveAI;FiveAI;Five", "aff_domain": "five.ai;five.ai;five.ai", "position": "Research engineer;Researcher;Researcher", "bibtex": "@inproceedings{\nlord2022attacking,\ntitle={Attacking deep networks with surrogate-based adversarial black-box methods is easy},\nauthor={Nicholas A. 
Lord and Romain Mueller and Luca Bertinetto},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Zf4ZdI4OQPV}\n}", "github": "", "project": "", "reviewers": "wR15;SLVp;KEn1;tmFJ", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;4;2", "correctness": "2;3;4;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "33;61;64;135", "wc_summary_review": "89;44;29;30", "wc_main_review": "206;465;597;213", "wc_review": "328;570;690;378", "wc_reply_reviewers": "0;179;256;0", "wc_reply_authors": "793;2045;1869;50", "reply_reviewers": "0;1;2;0", "reply_authors": "3;5;6;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.25, 37.64555086593899 ], "wc_summary_review_avg": [ 48.0, 24.40286868382486 ], "wc_main_review_avg": [ 370.25, 167.40575706946282 ], "wc_review_avg": [ 491.5, 145.92720788118987 ], "wc_reply_reviewers_avg": [ 108.75, 112.10569789265843 ], "wc_reply_authors_avg": [ 1189.25, 813.82933561036 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.75, 1.920286436967152 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9504422673038646416&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=Zf4ZdI4OQPV", "email": "five.ai;five.ai;five.ai", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "FiveAI;", "aff_unique_dep": ";", "aff_unique_url": "https://www.five.ai;", "aff_unique_abbr": "FiveAI;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom;" }, { "id": "ZfcosR9vZ-j", "title": "Pyramid Mini-Batching for Optimal Transport", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Optimal transport theory provides a useful tool to measure the differences between two distributions.\nAligning distributions by minimizing optimal transport distances has been shown to be effective in a variety of machine learning settings, including generative modeling and domain adaptation. However, computing optimal transport distances over large numbers of data points is very time-consuming and intractable for measuring the distances between discrete distributions with large numbers of data points. In this work we propose a geometric sampling scheme which partitions the datasets into pyramid-based encodings. Our approach, Pyramid Mini-Batching, significantly improves the quality of optimal transport approximations and downstream alignments with minimal computational overhead. We perform experiments over the Discrete Optimal Transport benchmark to demonstrate the effectiveness of this strategy over multiple established optimal transport settings and see that our approach improves estimates of OT distances by nearly $30\\%$ for single pass estimation. Furthermore, we see that when attempting to minimize optimal transport distance our approach is ten times more effective than with random mini-batch sampling. 
To highlight the practical benefits of this approach, we use optimal transport distance in domain adaptation settings and show our approach produces state-of-the-art results on the large-scale domain adaptation problems VisDA17 and DomainNet. Ablation studies indicate that our sampling approach could be combined with conventional distribution alignment approaches and offer substantial improvements to their results.", "keywords": "optimal transport;machine learning", "primary_area": "", "supplementary_material": "", "author": "Devin Guillory;Kuniaki Saito;Eric Tzeng;Yannik Pitcan;Kate Saenko;Trevor Darrell", "authorids": "~Devin_Guillory1;~Kuniaki_Saito2;~Eric_Tzeng1;~Yannik_Pitcan1;~Kate_Saenko1;~Trevor_Darrell2", "gender": "M;M;M;M;F;M", "homepage": "https://www.devinguillory.com/;;;;http://ai.bu.edu;https://people.eecs.berkeley.edu/~trevor/", "dblp": "188/1061;182/1957;136/5767;;88/2754;d/TrevorDarrell", "google_scholar": "t4dSV4YAAAAJ;https://scholar.google.co.jp/citations?user=2X0cwhkAAAAJ;;;https://scholar.google.com.tw/citations?user=9xDADY4AAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ", "orcid": ";;;;0000-0002-5704-7614;", "linkedin": "devin-guillory-78528958/;;;yannik-pitcan;;", "or_profile": "~Devin_Guillory1;~Kuniaki_Saito2;~Eric_Tzeng1;~Yannik_Pitcan1;~Kate_Saenko1;~trevor_darrell1", "aff": "University of California, Berkeley;Boston University;University of California, Berkeley;;Boston University, Boston University;Electrical Engineering & Computer Science Department", "aff_domain": "berkeley.edu;bu.edu;berkeley.edu;;bu.edu;eecs.berkeley.edu", "position": "PhD student;PhD student;PhD student;;Full Professor;Professor", "bibtex": "@misc{\nguillory2022pyramid,\ntitle={Pyramid Mini-Batching for Optimal Transport},\nauthor={Devin Guillory and Kuniaki Saito and Eric Tzeng and Yannik Pitcan and Kate Saenko and Trevor Darrell},\nyear={2022},\nurl={https://openreview.net/forum?id=ZfcosR9vZ-j}\n}", "github": "", "project": "", "reviewers": "NaKp;yPjE;Uoc5;rYXq", "site": "https://openreview.net/forum?id=ZfcosR9vZ-j", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;3;3;4", "correctness": "2;2;2;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "62;17;42;92", "wc_summary_review": "58;50;106;14", "wc_main_review": "451;635;781;265", "wc_review": "571;702;929;371", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 53.25, 27.471576219794887 ], "wc_summary_review_avg": [ 57.0, 32.78719262151 ], "wc_main_review_avg": [ 533.0, 193.94329068054918 ], "wc_review_avg": [ 643.25, 202.75893938369276 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4264014327112209, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-RTyxtizxnkJ:scholar.google.com/&scioq=Pyramid+Mini-Batching+for+Optimal+Transport&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;1;2", "aff_unique_norm": "University of California, Berkeley;Boston 
University;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";;Electrical Engineering & Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://www.bu.edu;", "aff_unique_abbr": "UC Berkeley;BU;", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Berkeley;;Boston", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "id": "ZgV2C9NKk6Q", "title": "TorchGeo: deep learning with geospatial data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Remotely sensed geospatial data are critical for earth observation applications including precision agriculture, urban planning, disaster monitoring and response, and climate change research, among others. Deep learning methods are particularly promising for modeling many earth observation tasks given the success of deep neural networks in similar computer vision tasks and the sheer volume of remotely sensed imagery available. However, the variance in data collection methods and handling of geospatial metadata make the application of deep learning methodology to remotely sensed data nontrivial. For example, satellite imagery often includes additional spectral bands beyond red, green, and blue and must be joined to other geospatial data sources that can have differing coordinate systems, bounds, and resolutions. To help realize the potential of deep learning for remote sensing applications, we introduce TorchGeo, a Python library for integrating geospatial data into the PyTorch deep learning ecosystem. TorchGeo provides data loaders for a variety of benchmark datasets, composable datasets for generic geospatial data sources, samplers for geospatial data, and transforms that work with multispectral imagery. TorchGeo is also the first library to provide pre-trained models for multispectral satellite imagery, allowing for advances in transfer learning on downstream earth observation tasks with limited labeled data. We use TorchGeo to create reproducible benchmark results on existing datasets, benchmark our proposed method for preprocessing geospatial imagery on-the-fly, and investigate the differences between ImageNet pre-training and in-domain self-supervised pre-training on model performance across several datasets. 
We aim for TorchGeo to become a new standard for reproducibility and for driving progress at the intersection of deep learning and remotely sensed geospatial data.", "keywords": "deep learning;remote sensing;geospatial data", "primary_area": "", "supplementary_material": "", "author": "Adam J Stewart;Caleb Robinson;Isaac A Corley;Anthony Ortiz;Juan M Lavista Ferres;Arindam Banerjee", "authorids": "~Adam_J_Stewart1;~Caleb_Robinson1;~Isaac_A_Corley1;~Anthony_Ortiz1;jlavista@microsoft.com;~Arindam_Banerjee4", "gender": "M;M;;M;;", "homepage": "https://github.com/adamjstewart;http://calebrob.com;;http://utminers.utep.edu/amortizcepeda/;;https://arindam.cs.illinois.edu/", "dblp": ";194/7729;;210/2608;;82/4807.html", "google_scholar": "IQ19q4AAAAAJ;cjYgLT0AAAAJ;;Ix7TigcAAAAJ;;RY7cuPAAAAAJ", "orcid": "0000-0002-0468-5006;;;;;", "linkedin": ";;;;;", "or_profile": "~Adam_J_Stewart1;~Caleb_Robinson1;~Isaac_A_Corley1;~Anthony_Ortiz1;jlavista@microsoft.com;~Arindam_Banerjee4", "aff": "University of Illinois Urbana-Champaign;Microsoft;;Microsoft;;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;microsoft.com;;microsoft.com;;illinois.edu", "position": "PhD student;Principal Researcher;;Researcher;;Professor", "bibtex": "@misc{\nstewart2022torchgeo,\ntitle={TorchGeo: deep learning with geospatial data},\nauthor={Adam J Stewart and Caleb Robinson and Isaac A Corley and Anthony Ortiz and Juan M Lavista Ferres and Arindam Banerjee},\nyear={2022},\nurl={https://openreview.net/forum?id=ZgV2C9NKk6Q}\n}", "github": "", "project": "", "reviewers": "6sDp;Xe57;stxS;xaxq;W3nK", "site": "https://openreview.net/forum?id=ZgV2C9NKk6Q", "pdf_size": 0, "recommendation": "5;5;5;5;6", "confidence": "3;4;4;5;4", "correctness": "4;3;3;4;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "2;3;4;0;3", "wc_summary_paper": "45;87;78;26;76", "wc_summary_review": "20;161;57;35;11", "wc_main_review": "27;1133;405;502;90", "wc_review": "92;1381;540;563;177", "wc_reply_reviewers": "0;40;68;0;0", "wc_reply_authors": "69;2499;879;147;397", "reply_reviewers": "0;2;1;0;0", "reply_authors": "1;5;2;1;1", "recommendation_avg": [ 5.2, 0.39999999999999997 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 1.3564659966250536 ], "wc_summary_paper_avg": [ 62.4, 23.07032726252491 ], "wc_summary_review_avg": [ 56.8, 54.377936702306016 ], "wc_main_review_avg": [ 431.4, 394.4660188153094 ], "wc_review_avg": [ 550.6, 456.0090349982114 ], "wc_reply_reviewers_avg": [ 21.6, 27.89695323866031 ], "wc_reply_authors_avg": [ 798.2, 896.2147956823743 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 2.0, 1.5491933384829668 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.408248290463863, "gs_citation": 102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5937555831120576793&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com", "aff_unique_abbr": "UIUC;Microsoft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "ZgrmzzYjMc4", "title": "What can 
multi-cloud configuration learn from AutoML?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-cloud computing has become increasingly popular with enterprises looking to avoid vendor lock-in. While most cloud providers offer similar functionality, they may differ significantly in terms of performance and/or cost. A customer looking to benefit from such differences will naturally want to solve the multi-cloud configuration problem: given a workload, which cloud provider should be chosen and how should its nodes be configured in order to minimize runtime or cost? In this work, we consider this multi-cloud optimization problem and publish a new offline benchmark dataset, MOCCA, comprising 60 different multi-cloud configuration tasks across 3 public cloud providers, to enable further research in this area. Furthermore, we identify an analogy between multi-cloud configuration and the selection-configuration problems that are commonly studied in the automated machine learning (AutoML) field. Inspired by this connection, we propose an algorithm for solving multi-cloud configuration, CloudBandit (CB). It treats the outer problem of cloud provider selection as a best-arm identification problem, in which each arm pull corresponds to running an arbitrary black-box optimizer on the inner problem of node configuration. Extensive experiments on MOCCA indicate that CB achieves (a) significantly lower regret relative to its component black-box optimizers and (b) competitive or lower regret relative to state-of-the-art AutoML methods, whilst also being cheaper and faster.", "keywords": "Cloud;AutoML;Multi-armed bandit;Black box optimizers", "primary_area": "", "supplementary_material": "", "author": "Malgorzata Lazuka;Thomas Parnell;Andreea Anghel;Haralampos Pozidis", "authorids": "~Malgorzata_Lazuka1;~Thomas_Parnell1;~Andreea_Anghel1;~Haralampos_Pozidis2", "gender": "F;F;M;", "homepage": ";https://researcher.watson.ibm.com/researcher/view.php?person=zurich-AAN;https://researcher.watson.ibm.com/researcher/view.php?person=zurich-HAP;https://researcher.watson.ibm.com/researcher/view.php?person=zurich-TPA", "dblp": "267/9249.html;57/10370;;91/1768", "google_scholar": ";https://scholar.google.ch/citations?user=gT8Xhk4AAAAJ;https://scholar.google.ch/citations?user=FhioCGgAAAAJ;h10yh7sAAAAJ", "orcid": ";;;", "linkedin": "https://ch.linkedin.com/in/mlazuka;https://ch.linkedin.com/in/andreeaanghel;haris-pozidis-4505bb8/?originalSubdomain=ch;", "or_profile": "~Malgorzata_Lazuka1;~Andreea_Anghel1;~Haralampos_Pozidis2;~Thomas_P._Parnell1", "aff": "IBM Research;International Business Machines;International Business Machines;IBM Research", "aff_domain": "zurich.ibm.com;ibm.com;ibm.com;zurich.ibm.com", "position": "PhD student;Researcher;Principal Researcher;Research Scientist", "bibtex": "@misc{\nlazuka2022what,\ntitle={What can multi-cloud configuration learn from Auto{ML}?},\nauthor={Malgorzata Lazuka and Thomas Parnell and Andreea Anghel and Haralampos Pozidis},\nyear={2022},\nurl={https://openreview.net/forum?id=ZgrmzzYjMc4}\n}", "github": "", "project": "", "reviewers": "FdKV;5pqw;mcUn;bhwc", "site": "https://openreview.net/forum?id=ZgrmzzYjMc4", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;5;3", "correctness": "2;4;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "83;75;70;47", "wc_summary_review": "62;99;63;80", "wc_main_review": "176;213;205;269", "wc_review": "321;387;338;396", "wc_reply_reviewers": "0;0;206;0", "wc_reply_authors": 
"650;661;737;343", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.75, 13.386093530227555 ], "wc_summary_review_avg": [ 76.0, 15.083103128998356 ], "wc_main_review_avg": [ 215.75, 33.6851228289285 ], "wc_review_avg": [ 360.5, 31.737202145116697 ], "wc_reply_reviewers_avg": [ 51.5, 89.20061658979718 ], "wc_reply_authors_avg": [ 597.75, 150.84656940083192 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Nely6FrJJ-MJ:scholar.google.com/&scioq=What+can+multi-cloud+configuration+learn+from+AutoML%3F&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "IBM;International Business Machines Corporation", "aff_unique_dep": "IBM Research;", "aff_unique_url": "https://www.ibm.com/research;https://www.ibm.com", "aff_unique_abbr": "IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "Zk3TwMJNj7", "title": "Directional Bias Helps Stochastic Gradient Descent to Generalize in Nonparametric Model", "track": "main", "status": "Reject", "tldr": "", "abstract": " This paper studies the Stochastic Gradient Descent (SGD) algorithm in kernel regression. The main finding is that SGD with moderate and annealing step size converges in the direction of the eigenvector that corresponds to the largest eigenvalue of the gram matrix. On the contrary, the Gradient Descent (GD) with a moderate or small step size converges along the direction that corresponds to the smallest eigenvalue. For a general squared risk minimization problem, we show that directional bias towards a larger eigenvalue of the Hessian (which is the gram matrix in our case) results in an estimator that is closer to the ground truth. Adopting this result to kernel regression, the directional bias helps the SGD estimator generalize better. This result gives one way to explain how noise helps in generalization when learning with a nontrivial step size, which may be useful for promoting further understanding of stochastic algorithms in deep learning. The correctness of our theory is supported by simulations and experiments of Neural Network on the FashionMNIST dataset. 
", "keywords": "directional bias;SGD;RKHS;nonparametric regression", "primary_area": "", "supplementary_material": "/attachment/e9349fa755bb372987b2ec2eafa8fcc66975b97f.zip", "author": "Yiling Luo;Xiaoming Huo;Yajun Mei", "authorids": "~Yiling_Luo1;~Xiaoming_Huo1;~Yajun_Mei1", "gender": "F;M;M", "homepage": ";https://www.isye.gatech.edu/users/xiaoming-huo;http://www.isye.gatech.edu/~ymei/", "dblp": ";67/3392;", "google_scholar": ";https://scholar.google.com/citations?hl=en;cRoarqgAAAAJ", "orcid": ";0000-0003-0101-1206;", "linkedin": "yiling-luo-b3773914a/;xiaoming-huo-9653374/;", "or_profile": "~Yiling_Luo1;~Xiaoming_Huo1;~Yajun_Mei1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@misc{\nluo2022directional,\ntitle={Directional Bias Helps Stochastic Gradient Descent to Generalize in Nonparametric Model},\nauthor={Yiling Luo and Xiaoming Huo and Yajun Mei},\nyear={2022},\nurl={https://openreview.net/forum?id=Zk3TwMJNj7}\n}", "github": "", "project": "", "reviewers": "aXTx;EcVL;RGP1", "site": "https://openreview.net/forum?id=Zk3TwMJNj7", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;5;4", "correctness": "4;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "88;37;49", "wc_summary_review": "16;21;23", "wc_main_review": "318;81;391", "wc_review": "422;139;463", "wc_reply_reviewers": "0;0;135", "wc_reply_authors": "175;0;304", "reply_reviewers": "0;0;1", "reply_authors": "1;0;2", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 58.0, 21.77154105707724 ], "wc_summary_review_avg": [ 20.0, 2.943920288775949 ], "wc_main_review_avg": [ 263.3333333333333, 132.32871528466106 ], "wc_review_avg": [ 341.3333333333333, 144.04706021143076 ], "wc_reply_reviewers_avg": [ 45.0, 63.63961030678928 ], "wc_reply_authors_avg": [ 159.66666666666666, 124.58018390668006 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4NX1vg3MHkkJ:scholar.google.com/&scioq=Directional+Bias+Helps+Stochastic+Gradient+Descent+to+Generalize+in+Nonparametric+Model&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding and Preventing Capacity Loss in Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6597", "id": "ZkC8wKoLbQ7", "poster": "", "openreview": "https://openreview.net/forum?id=ZkC8wKoLbQ7", "slides": "https://iclr.cc/virtual/2022/poster/6597", "video": "https://iclr.cc/virtual/2022/poster/6597", "author_site": "Clare Lyle, Mark Rowland, Will 
Dabney", "tldr": "", "abstract": "The reinforcement learning (RL) problem is rife with sources of non-stationarity that can destabilize or inhibit learning progress.\nWe identify a key mechanism by which this occurs in agents using neural networks as function approximators: \\textit{capacity loss}, whereby networks trained to predict a sequence of target values lose their ability to quickly fit new functions over time.\nWe demonstrate that capacity loss occurs in a broad range of RL agents and environments, and is particularly damaging to learning progress in sparse-reward tasks. We then present a simple regularizer, Initial Feature Regularization (InFeR), that mitigates this phenomenon by regressing a subspace of features towards its value at initialization, improving performance over a state-of-the-art model-free algorithm in the Atari 2600 suite. Finally, we study how this regularization affects different notions of capacity and evaluate other mechanisms by which it may improve performance.", "keywords": "Reinforcement learning;representation learning", "primary_area": "", "supplementary_material": "/attachment/b137b21a1ef08a10027eb583d895e8a4e7300e3d.zip", "author": "Clare Lyle;Mark Rowland;Will Dabney", "authorids": "~Clare_Lyle1;~Mark_Rowland1;~Will_Dabney1", "gender": ";M;M", "homepage": ";http://sites.google.com/view/markrowland;", "dblp": "192/1910;86/4090;https://dblp.uni-trier.de/pers/hd/d/Dabney:Will", "google_scholar": ";https://scholar.google.co.uk/citations?user=-0U84zMAAAAJ;https://scholar.google.co.uk/citations?user=dR-7QW8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Clare_Lyle1;~Mark_Rowland1;~Will_Dabney1", "aff": "University of Oxford;Google DeepMind;Google DeepMind", "aff_domain": "ox.ac.uk;google.com;google.com", "position": "PhD student;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nlyle2022understanding,\ntitle={Understanding and Preventing Capacity Loss in Reinforcement Learning},\nauthor={Clare Lyle and Mark Rowland and Will Dabney},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ZkC8wKoLbQ7}\n}", "github": "", "project": "", "reviewers": "vuGv;Mqcv;zrcc;KLqH", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;5;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "57;130;111;67", "wc_summary_review": "21;122;44;69", "wc_main_review": "314;533;229;671", "wc_review": "392;785;384;807", "wc_reply_reviewers": "150;1008;21;0", "wc_reply_authors": "2886;3219;991;1048", "reply_reviewers": "1;2;1;0", "reply_authors": "6;6;2;2", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 91.25, 30.21899237234756 ], "wc_summary_review_avg": [ 64.0, 37.54330832518626 ], "wc_main_review_avg": [ 436.75, 174.9033661768692 ], "wc_review_avg": [ 592.0, 204.16782312597644 ], "wc_reply_reviewers_avg": [ 294.75, 415.7808166570459 ], "wc_reply_authors_avg": [ 2036.0, 1023.493771353788 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 4.0, 2.0 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49374193110101877, "corr_recommendation_correctness": 0.8551861104941366, "gs_citation": 126, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=1822647811392761328&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=ZkC8wKoLbQ7", "email": "ox.ac.uk;google.com;google.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Oxford;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.ox.ac.uk;https://deepmind.com", "aff_unique_abbr": "Oxford;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "ZnUHvSyjstv", "title": "On the Capacity and Superposition of Minima in Neural Network Loss Function Landscapes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Minima of the loss function landscape of a neural network are locally optimal sets of\nweights that extract and process information from the input data to make outcome predictions. \nIn underparameterised networks, the capacity of the weights may be insufficient to fit all the relevant information. \nWe demonstrate that different local minima specialise in certain aspects of the learning problem, and process the input\ninformation differently. This effect can be exploited using a meta-network in\nwhich the predictive power from multiple minima of the LFL is combined to produce a better\nclassifier. With this approach, we can increase the area under the receiver operating characteristic curve\n(AUC) by around $20\\%$ for a complex learning problem. \nWe propose a theoretical basis for combining minima and show how a meta-network can\nbe trained to select the representative that is used for classification of a\nspecific data item. Finally, we present an analysis of symmetry-equivalent\nsolutions to machine learning problems, which provides a systematic means to improve the\nefficiency of this approach.", "keywords": "ensemble learning;interpretability;loss function landscape;theoretical chemistry", "primary_area": "", "supplementary_material": "", "author": "Maximilian Paul Niroomand;John William Roger Morgan;Conor T Cafolla;David John Wales", "authorids": "~Maximilian_Paul_Niroomand1;~John_William_Roger_Morgan1;~Conor_T_Cafolla1;~David_John_Wales1", "gender": ";M;M;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";https://scholar.google.co.uk/citations?user=B8ps65sAAAAJ;;", "orcid": ";0000-0002-9157-9278;0000-0003-2021-974X;", "linkedin": ";;;", "or_profile": "~Maximilian_Paul_Niroomand1;~John_William_Roger_Morgan1;~Conor_T_Cafolla1;~David_John_Wales1", "aff": ";;University of Cambridge;", "aff_domain": ";;cam.ac.uk;", "position": ";;PhD student;", "bibtex": "@misc{\nniroomand2022on,\ntitle={On the Capacity and Superposition of Minima in Neural Network Loss Function Landscapes},\nauthor={Maximilian Paul Niroomand and John William Roger Morgan and Conor T Cafolla and David John Wales},\nyear={2022},\nurl={https://openreview.net/forum?id=ZnUHvSyjstv}\n}", "github": "", "project": "", "reviewers": "yRSp;saVg;XRAh;DYC3", "site": "https://openreview.net/forum?id=ZnUHvSyjstv", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;3", "correctness": "2;2;3;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "77;64;62;79", "wc_summary_review": "2;95;18;45", "wc_main_review": "386;587;260;182", "wc_review": "465;746;340;306", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 
], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 70.5, 7.566372975210778 ], "wc_summary_review_avg": [ 40.0, 35.27747156472527 ], "wc_main_review_avg": [ 353.75, 153.07902370997797 ], "wc_review_avg": [ 464.25, 173.1074449583264 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=35629709876383835&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "ZnUwk6i_iTR", "title": "Symmetric Machine Theory of Mind", "track": "main", "status": "Reject", "tldr": "", "abstract": "Theory of mind (ToM), the ability to understand others' thoughts and desires, is a cornerstone of human intelligence. Because of this, a number of previous works have attempted to measure the ability of machines to develop a theory of mind, with one agent attempting to understand anothers' internal \"mental state''. However, ToM agents are often tested as passive observers or in tasks with specific predefined roles, such as speaker-listener scenarios. In this work, we propose to model machine theory of mind in a more flexible and symmetric scenario; a multi-agent environment SymmToM where all agents can speak, listen, see other agents, and move freely through a grid world. An effective strategy to solve SymmToM requires developing theory of mind to maximize each agent's rewards. We show that multi-agent deep-reinforcement learning models that model the mental states of other agents achieve significant performance improvements over agents with no such ToM model. 
At the same time, our best agents fail to achieve performance comparable to agents with access to the gold-standard mental state of other agents, demonstrating that the modeling of theory of mind in multi-agent scenarios is very much an open challenge.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Melanie Sclar;Graham Neubig;Yonatan Bisk", "authorids": "~Melanie_Sclar1;~Graham_Neubig1;~Yonatan_Bisk1", "gender": "F;M;M", "homepage": "https://msclar.github.io;http://phontron.com;http://www.YonatanBisk.com", "dblp": "274/6796;03/8155;38/9282", "google_scholar": "4uNPtZgAAAAJ;wlosgkoAAAAJ;bWoGh8UAAAAJ", "orcid": ";;0000-0002-2111-9081", "linkedin": "melanie-sclar-077047b5/;;yonatanbisk/", "or_profile": "~Melanie_Sclar1;~Graham_Neubig1;~Yonatan_Bisk1", "aff": "University of Washington, Seattle;Carnegie Mellon University;Meta", "aff_domain": "uw.edu;cmu.edu;meta.com", "position": "PhD student;Associate Professor;Visiting Professor", "bibtex": "@misc{\nsclar2022symmetric,\ntitle={Symmetric Machine Theory of Mind},\nauthor={Melanie Sclar and Graham Neubig and Yonatan Bisk},\nyear={2022},\nurl={https://openreview.net/forum?id=ZnUwk6i_iTR}\n}", "github": "", "project": "", "reviewers": "n7Fh;VGt1;sDZt;cEGn;t9A3", "site": "https://openreview.net/forum?id=ZnUwk6i_iTR", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;4;4;2;4", "correctness": "3;4;3;2;4", "technical_novelty": "3;2;2;2;2", "empirical_novelty": "3;2;2;2;4", "wc_summary_paper": "77;96;139;75;225", "wc_summary_review": "42;28;58;107;72", "wc_main_review": "308;649;454;222;678", "wc_review": "427;773;651;404;975", "wc_reply_reviewers": "46;230;85;0;38", "wc_reply_authors": "830;659;490;659;548", "reply_reviewers": "1;1;1;0;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 122.4, 56.226684056593626 ], "wc_summary_review_avg": [ 61.4, 27.185290140073917 ], "wc_main_review_avg": [ 462.2, 180.55735930722958 ], "wc_review_avg": [ 646.0, 214.90463001061656 ], "wc_reply_reviewers_avg": [ 79.8, 79.80576420284439 ], "wc_reply_authors_avg": [ 637.2, 116.40171820037709 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.25000000000000006, "corr_recommendation_correctness": -0.08908708063747484, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14676997486280015283&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Washington;Carnegie Mellon University;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.washington.edu;https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "UW;CMU;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "ZncyIXXAB-0", "title": "IIT-GAN: Irregular and Intermittent Time-series Synthesis with Generative Adversarial Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Time-series data is one of the most popular data types in the field of machine learning. For various reasons, there is a strong motivation to synthesize fake time-series data. 
Several disparate settings for time-series synthesis have been previously solved, ranging from synthesizing time-series without any missing values to time-series of multiple signals with different frequencies. In this paper, we solve the problem of synthesizing irregular and intermittent time-series where values can be missing and may not have specific frequencies, which is far more challenging than existing settings. To this end, we adopt various state-of-the-art deep learning concepts, such as autoencoders (AEs), generative adversarial networks (GANs), neural ordinary differential equations (NODEs), neural controlled differential equations (NCDEs), and so on. Our contribution lies in carefully re-designing those heterogeneous technologies and proposing our unified framework. Our method achieves the state-of-the-art synthesis performance for the irregular and intermittent time-series synthesis task.", "keywords": "time-series synthesis;GANs;differential equations", "primary_area": "", "supplementary_material": "/attachment/faa61944853e7a689d2e49f1d157488ce66e8881.zip", "author": "Jinsung Jeon;Jeonghak Kim;Haryong Song;Noseong Park", "authorids": "~Jinsung_Jeon1;kimhaggie@yonsei.ac.kr;thomas783789@yonsei.ac.kr;~Noseong_Park1", "gender": ";;;", "homepage": "https://sites.google.com/view/npark/home?authuser=0;;;", "dblp": "294/0098;;;", "google_scholar": "0R6W6lsAAAAJ;;;", "orcid": "0000-0002-9693-2739;;;", "linkedin": "jinsung-jeon-994942289/;;;", "or_profile": "~Jinsung_Jeon1;kimhaggie@yonsei.ac.kr;thomas783789@yonsei.ac.kr;~Noseong_Park1", "aff": "Yonsei University;;;", "aff_domain": "yonsei.ac.kr;;;", "position": "PhD student;;;", "bibtex": "@misc{\njeon2022iitgan,\ntitle={{IIT}-{GAN}: Irregular and Intermittent Time-series Synthesis with Generative Adversarial Networks},\nauthor={Jinsung Jeon and Jeonghak Kim and Haryong Song and Noseong Park},\nyear={2022},\nurl={https://openreview.net/forum?id=ZncyIXXAB-0}\n}", "github": "", "project": "", "reviewers": "mhRP;q91a;KVwT;x1ug", "site": "https://openreview.net/forum?id=ZncyIXXAB-0", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;3;3;4", "correctness": "2;3;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "162;75;148;136", "wc_summary_review": "65;37;64;20", "wc_main_review": "942;277;366;91", "wc_review": "1169;389;578;247", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 130.25, 33.199209327934305 ], "wc_summary_review_avg": [ 46.5, 18.980252896102307 ], "wc_main_review_avg": [ 419.0, 317.8387956181561 ], "wc_review_avg": [ 595.75, 351.1775726039463 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7385489458759963, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:o_iUq9Gh9-sJ:scholar.google.com/&scioq=IIT-GAN:+Irregular+and+Intermittent+Time-series+Synthesis+with+Generative+Adversarial+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", 
"aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "ZocWLFKDN3a", "title": "Variational Disentangled Attention for Regularized Visual Dialog", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "One of the most important challenges in a visual dialog is to effectively extract the information from a given image and its historical conversation which are related to the current question. Many studies adopt the soft attention mechanism in different information sources due to its simplicity and ease of optimization. However, some of visual dialogs are observed in a single round. This implies that there is no substantial correlation between individual rounds of questions and answers. This paper presents a unified approach to disentangled attention to deal with context-free visual dialogs. The question is disentangled in latent representation. In particular, an informative regularization is imposed to strengthen the dependence between vision and language by pretraining on the visual question answering before transferring to visual dialog. Importantly, a novel variational attention mechanism is developed and implemented by a local reparameterization trick which carries out a discrete attention to identify the relevant conversations in a visual dialog. A set of experiments are evaluated to illustrate the merits of the proposed attention and regularization schemes for context-free visual dialogs.", "keywords": "attention mechanism;latent disentanglement;visual dialog;model regularization", "primary_area": "", "supplementary_material": "/attachment/1c4ecc6b317b4f13140e87d395cb4b1246f552b1.zip", "author": "Jen-Tzung Chien;Hsiu-Wei Tien", "authorids": "~Jen-Tzung_Chien1;hwtien.ee08@nycu.edu.tw", "gender": "M;", "homepage": "http://chien.cm.nctu.edu.tw;", "dblp": "03/3569;", "google_scholar": ";", "orcid": "0000-0003-3466-8941;", "linkedin": "jen-tzung-chien-23a79158/;", "or_profile": "~Jen-Tzung_Chien1;hwtien.ee08@nycu.edu.tw", "aff": "National Yang Ming Chiao Tung University;", "aff_domain": "nycu.edu.tw;", "position": "Full Professor;", "bibtex": "@misc{\nchien2022variational,\ntitle={Variational Disentangled Attention for Regularized Visual Dialog},\nauthor={Jen-Tzung Chien and Hsiu-Wei Tien},\nyear={2022},\nurl={https://openreview.net/forum?id=ZocWLFKDN3a}\n}", "github": "", "project": "", "reviewers": "PB5u;en7d;dh8Q", "site": "https://openreview.net/forum?id=ZocWLFKDN3a", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;2", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "42;53;60", "wc_summary_review": "62;43;19", "wc_main_review": "403;98;228", "wc_review": "507;194;307", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 51.666666666666664, 7.408703590297623 ], "wc_summary_review_avg": [ 41.333333333333336, 17.594190960528863 ], "wc_main_review_avg": [ 243.0, 124.96666222103664 ], "wc_review_avg": [ 336.0, 129.4166398368721 ], "wc_reply_reviewers_avg": [ 0, 0 ], 
"wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7559289460184546, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VnCJ7NqwN7QJ:scholar.google.com/&scioq=Variational+Disentangled+Attention+for+Regularized+Visual+Dialog&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "National Yang Ming Chiao Tung University", "aff_unique_dep": "", "aff_unique_url": "https://www.nycu.edu.tw", "aff_unique_abbr": "NYCU", "aff_campus_unique_index": "0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "FastSHAP: Real-Time Shapley Value Estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6282", "id": "Zq2G_VTV53T", "poster": "", "openreview": "https://openreview.net/forum?id=Zq2G_VTV53T", "slides": "https://iclr.cc/virtual/2022/poster/6282", "video": "https://iclr.cc/virtual/2022/poster/6282", "author_site": "Neil Jethani, Mukund Sudarshan, Ian Covert, Su-In Lee, Rajesh Ranganath", "tldr": "", "abstract": "Although Shapley values are theoretically appealing for explaining black-box models, they are costly to calculate and thus impractical in settings that involve large, high-dimensional models. To remedy this issue, we introduce FastSHAP, a new method for estimating Shapley values in a single forward pass using a learned explainer model. To enable efficient training without requiring ground truth Shapley values, we develop an approach to train FastSHAP via stochastic gradient descent using a weighted least-squares objective function. 
In our experiments with tabular and image datasets, we compare FastSHAP to existing estimation approaches and find that it generates accurate explanations with an orders-of-magnitude speedup.", "keywords": "interpretability;shapley;amortization;explainability;game theory", "primary_area": "", "supplementary_material": "/attachment/5d5c5c73c5f4d1be803abed07dcc8be48953f8ce.zip", "author": "Neil Jethani;Mukund Sudarshan;Ian Connick Covert;Su-In Lee;Rajesh Ranganath", "authorids": "~Neil_Jethani1;~Mukund_Sudarshan1;~Ian_Connick_Covert1;~Su-In_Lee2;~Rajesh_Ranganath2", "gender": "M;M;M;F;", "homepage": "https://neiljethani.github.io;;https://iancovert.com;http://suinlee.cs.washington.edu/;", "dblp": ";;262/3443;17/1784;97/7057", "google_scholar": ";;Np8Ek3cAAAAJ;;", "orcid": ";;;;", "linkedin": "neiljethani;;ian-covert/;;", "or_profile": "~Neil_Jethani1;~Mukund_Sudarshan1;~Ian_Connick_Covert1;~Su-In_Lee2;~Rajesh_Ranganath2", "aff": "NYU Langone;New York University;University of Washington;University of Washington;New York University", "aff_domain": "nyumc.org;nyu.edu;uw.edu;uw.edu;nyu.edu", "position": "MD Student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\njethani2022fastshap,\ntitle={Fast{SHAP}: Real-Time Shapley Value Estimation},\nauthor={Neil Jethani and Mukund Sudarshan and Ian Connick Covert and Su-In Lee and Rajesh Ranganath},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Zq2G_VTV53T}\n}", "github": "", "project": "", "reviewers": "zqFs;Asf6;wDJz;GNuc", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;2;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;0;4", "wc_summary_paper": "60;36;36;14", "wc_summary_review": "8;32;25;34", "wc_main_review": "229;217;96;408", "wc_review": "297;285;157;456", "wc_reply_reviewers": "0;0;0;41", "wc_reply_authors": "481;810;193;546", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 36.5, 16.27114009527298 ], "wc_summary_review_avg": [ 24.75, 10.231690964840562 ], "wc_main_review_avg": [ 237.5, 111.33844798630885 ], "wc_review_avg": [ 298.75, 106.08104213289008 ], "wc_reply_reviewers_avg": [ 10.25, 17.75352077758099 ], "wc_reply_authors_avg": [ 507.5, 219.43165222911665 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12598717957999821688&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=Zq2G_VTV53T", "email": "nyumc.org;nyu.edu;uw.edu;uw.edu;nyu.edu", "author_num": 5, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "NYU Langone Health;New York University;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://nyulangone.org;https://www.nyu.edu;https://www.washington.edu", "aff_unique_abbr": "NYU Langone;NYU;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Constructing Orthogonal Convolutions in an Explicit Manner", 
"status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6994", "id": "Zr5W2LSRhD", "poster": "", "openreview": "https://openreview.net/forum?id=Zr5W2LSRhD", "slides": "https://iclr.cc/virtual/2022/poster/6994", "video": "https://iclr.cc/virtual/2022/poster/6994", "author_site": "Tan Yu, Jun Li, YUNFENG CAI, Ping Li", "tldr": "", "abstract": "Convolutions with orthogonal input-output Jacobian matrix, i.e., orthogonal convolution, have recently attracted substantial attention. A convolution layer with an orthogonal Jacobian matrix is 1-Lipschitz in the 2-norm, making the output robust to the perturbation in input. Meanwhile, an orthogonal Jacobian matrix preserves the gradient norm in back-propagation, which is critical for stable training deep networks. Nevertheless, existing orthogonal convolutions are burdened by high computational costs for preserving orthogonality.\nIn this work, we exploit the relation between the singular values of the convolution layer's Jacobian and the structure of the convolution kernel. To achieve orthogonality, we explicitly construct the convolution kernel for enforcing all singular values of the convolution layer's Jacobian to be $1$s. After training, the explicitly constructed orthogonal (ECO) convolution is constructed only once, and their weights are stored. Then, in evaluation, we only need to load the stored weights of the trained ECO convolution, and the computational cost of ECO convolution is the same as the standard dilated convolution. It is more efficient than the recent state-of-the-art approach, skew orthogonal convolution (SOC) in evaluation. Experiments on CIFAR-10 and CIFAR-100 demonstrate that the proposed ECO convolution is faster than SOC in evaluation while leading to competitive standard and certified robust accuracies. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/4fb1b2a2a0083de95407a8a6e9cd4f3d9553c365.zip", "author": "Tan Yu;Jun Li;YUNFENG CAI;Ping Li", "authorids": "~Tan_Yu3;~Jun_Li13;~YUNFENG_CAI1;~Ping_Li3", "gender": "M;M;M;M", "homepage": "https://junli-galios.github.io/;https://www.bimsa.cn/detail/yfcai.html;http://www.stat.rutgers.edu/home/pingli/;https://sites.google.com/site/tanyuspersonalwebsite/", "dblp": "116/1011-98;133/8201;62/5860-1;", "google_scholar": "fyQZYz8AAAAJ;https://scholar.google.com/citations?hl=en;;xrCW6z8AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jun_Li13;~YUNFENG_CAI1;~Ping_Li3;~YU_TAN1", "aff": "Baidu;Baidu Research;LinkedIn;Baidu", "aff_domain": "baidu.com;baidu.com;linkedin.com;baidu.com", "position": "Postdoc;Resseacher;Engineer;Research Scientist", "bibtex": "@inproceedings{\nyu2022constructing,\ntitle={Constructing Orthogonal Convolutions in an Explicit Manner},\nauthor={Tan Yu and Jun Li and YUNFENG CAI and Ping Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=Zr5W2LSRhD}\n}", "github": "", "project": "", "reviewers": "mniG;6wTV;htXp;YtM2", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;3;4;4", "correctness": "3;4;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "13;66;64;73", "wc_summary_review": "41;11;22;57", "wc_main_review": "384;152;128;398", "wc_review": "438;229;214;528", "wc_reply_reviewers": "741;9;0;1003", "wc_reply_authors": "4621;257;82;3933", "reply_reviewers": "4;1;0;2", "reply_authors": "11;2;1;8", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 54.0, 23.90606617576384 ], "wc_summary_review_avg": [ 32.75, 17.640507362318125 ], "wc_main_review_avg": [ 265.5, 125.88387505951665 ], "wc_review_avg": [ 352.25, 134.67066310076595 ], "wc_reply_reviewers_avg": [ 438.25, 443.54220486893917 ], "wc_reply_authors_avg": [ 2223.25, 2069.030011261316 ], "reply_reviewers_avg": [ 1.75, 1.479019945774904 ], "reply_authors_avg": [ 5.5, 4.153311931459037 ], "replies_avg": [ 36, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15801297280684943051&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=Zr5W2LSRhD", "email": "baidu.com;baidu.com;linkedin.com;baidu.com", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "ZumkmSpY9G4", "title": "Bypassing Logits Bias in Online Class-Incremental Learning with a Generative Framework", "track": "main", "status": "Reject", "tldr": "", "abstract": "Continual learning requires the model to maintain the learned knowledge while learning from a non-i.i.d data stream continually. 
Due to the single-pass training setting, online continual learning is very challenging, but it is closer to real-world scenarios, where quick adaptation to new data is appealing. In this paper, we focus on the online class-incremental learning setting in which new classes emerge over time. Almost all existing methods are replay-based with a softmax classifier. However, the inherent logits bias problem in the softmax classifier is a major cause of catastrophic forgetting, while existing solutions are not applicable to online settings. To bypass this problem, we abandon the softmax classifier and propose a novel generative framework based on the feature space. In our framework, a generative classifier which utilizes replay memory is used for inference, and the training objective is a pair-based metric learning loss which is proven theoretically to optimize the feature space in a generative way. In order to improve the ability to learn new data, we further propose a hybrid of generative and discriminative loss to train the model. Extensive experiments on several benchmarks, including newly introduced task-free datasets, show that our method beats a series of state-of-the-art replay-based methods with discriminative classifiers, and consistently reduces catastrophic forgetting by a remarkable margin.", "keywords": "continual learning;online class-incremental learning;catastrophic forgetting;deep learning", "primary_area": "", "supplementary_material": "", "author": "Gehui Shen;Shibo Jie;Ziheng Li;Zhi-Hong Deng", "authorids": "~Gehui_Shen1;~Shibo_Jie1;~Ziheng_Li1;~Zhi-Hong_Deng1", "gender": "M;M;M;M", "homepage": ";;https://www.researchgate.net/profile/Ziheng_Li4;http://www.cis.pku.edu.cn/jzyg/szdw/dzh.htm", "dblp": ";318/9497;;161/4814-1", "google_scholar": "TrYXrQkAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.tw/citations?user=tRoAxlsAAAAJ", "orcid": ";;;0000-0002-0263-8142", "linkedin": ";;;", "or_profile": "~Gehui_Shen1;~Shibo_Jie1;~Ziheng_Li1;~Zhi-Hong_Deng1", "aff": ";Peking University;Peking University;Peking University", "aff_domain": ";pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": ";PhD student;Undergrad student;Full Professor", "bibtex": "@misc{\nshen2022bypassing,\ntitle={Bypassing Logits Bias in Online Class-Incremental Learning with a Generative Framework},\nauthor={Gehui Shen and Shibo Jie and Ziheng Li and Zhi-Hong Deng},\nyear={2022},\nurl={https://openreview.net/forum?id=ZumkmSpY9G4}\n}", "github": "", "project": "", "reviewers": "TwHY;6SSs;hxsn;aZyF", "site": "https://openreview.net/forum?id=ZumkmSpY9G4", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "132;89;42;63", "wc_summary_review": "139;28;29;16", "wc_main_review": "438;532;469;151", "wc_review": "709;649;540;230", "wc_reply_reviewers": "507;165;250;0", "wc_reply_authors": "457;579;577;268", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 81.5, 33.57454392839909 ], "wc_summary_review_avg": [ 53.0, 49.914927626913375 ], "wc_main_review_avg": [ 397.5, 146.29166073293447 ], "wc_review_avg": [ 532.0, 184.58466891917107 ], "wc_reply_reviewers_avg": [ 230.5, 183.2027565294802 ], "wc_reply_authors_avg": [ 470.25, 
126.78993453740719 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11514847493149151509&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "Zwy3usE9RxT", "title": "Training Deep Generative Models via Auxiliary Supervised Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep generative modeling has long been viewed as a challenging unsupervised learning problem, partly due to the lack of labels and the high dimensionality of the data. Although various latent variable models have been proposed to tackle these difficulties, the latent variable only serves as a device to model the observed data, and is typically averaged out during training. In this article, we show that by introducing a properly pre-trained encoder, the latent variable can play a more important role, decomposing a deep generative model into a supervised learning problem and a much simpler unsupervised learning task. With this new training method, which we call the auxiliary supervised learning (ASL) framework, deep generative models can benefit from the enormous success of deep supervised learning and representation learning techniques. By evaluating on various synthetic and real data sets, we demonstrate that ASL is a stable, efficient, and accurate training framework for deep generative models.", "keywords": "deep generative models;supervised learning;representation learning;mutual information;latent variable model", "primary_area": "", "supplementary_material": "", "author": "Yijia Zheng;Yixuan Qiu", "authorids": "~Yijia_Zheng1;~Yixuan_Qiu1", "gender": "F;", "homepage": "https://amberyzheng.com;https://statr.me", "dblp": "371/9933;209/7159", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Yijia_Zheng1;~Yixuan_Qiu1", "aff": "Shanghai University of Finance and Economics;Shanghai University of Finance and Economics", "aff_domain": "sufe.edu.cn;sufe.edu.cn", "position": "Undergrad student;Associate Professor", "bibtex": "@misc{\nzheng2022training,\ntitle={Training Deep Generative Models via Auxiliary Supervised Learning},\nauthor={Yijia Zheng and Yixuan Qiu},\nyear={2022},\nurl={https://openreview.net/forum?id=Zwy3usE9RxT}\n}", "github": "", "project": "", "reviewers": "nUPN;6DAm;QNg7;chEF", "site": "https://openreview.net/forum?id=Zwy3usE9RxT", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;4;3", "correctness": "2;3;2;2", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;0;2", "wc_summary_paper": "82;55;335;156", "wc_summary_review": "31;21;96;3", "wc_main_review": "534;580;821;548", "wc_review": "647;656;1252;707", "wc_reply_reviewers": "643;0;359;371", "wc_reply_authors": "957;573;476;884", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 157.0, 
109.21767256263979 ], "wc_summary_review_avg": [ 37.75, 35.0954056822257 ], "wc_main_review_avg": [ 620.75, 116.81047684176279 ], "wc_review_avg": [ 815.5, 253.0498962655389 ], "wc_reply_reviewers_avg": [ 343.25, 228.41231906357416 ], "wc_reply_authors_avg": [ 722.5, 202.5987413583806 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MoexCtooIMQJ:scholar.google.com/&scioq=Training+Deep+Generative+Models+via+Auxiliary+Supervised+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Shanghai University of Finance and Economics", "aff_unique_dep": "", "aff_unique_url": "http://www.sufe.edu.cn", "aff_unique_abbr": "SUFE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "ZzwfldvDLpC", "title": "Let Your Heart Speak in its Mother Tongue: Multilingual Captioning of Cardiac Signals", "track": "main", "status": "Reject", "tldr": "", "abstract": "Cardiac signals convey a significant amount of information about the health status of a patient. Upon recording these signals, cardiologists are expected to manually generate an accompanying report to share with physicians and patients. Generating these reports, however, can be time-consuming and error-prone, while also exhibiting a high degree of intra- and inter-physician variability. To address this, we design a neural, multilingual, cardiac signal captioning framework. In the process, we propose a discriminative multilingual representation learning method, RTLP, which randomly replaces tokens with those from a different language and tasks a network with identifying the language of all tokens. We show that RTLP performs on par with state-of-the-art pre-training methods such as MLM and MARGE, while generating more clinically accurate reports than MLM. We also show that, with RTLP, multilingual fine-tuning can be preferable to its monolingual counterpart, a phenomenon we refer to as the \\textit{blessing of multilinguality}.", "keywords": "multilingual representation learning;cardiac signal captioning", "primary_area": "", "supplementary_material": "", "author": "Dani Kiyasseh;Tingting Zhu;David A. Clifton", "authorids": "~Dani_Kiyasseh1;~Tingting_Zhu1;~David_A._Clifton1", "gender": ";F;M", "homepage": "https://danikiyasseh.github.io/;https://eng.ox.ac.uk/people/tingting-zhu/;http://www.eng.ox.ac.uk/chi", "dblp": ";29/7666-1;89/6424", "google_scholar": "UD1oO4MAAAAJ;https://scholar.google.com.vn/citations?user=fjGMIl0AAAAJ;", "orcid": ";0000-0002-1552-5630;", "linkedin": ";;", "or_profile": "~Dani_Kiyasseh1;~Tingting_Zhu1;~David_A._Clifton1", "aff": "California Institute of Technology;University of Oxford;University of Oxford", "aff_domain": "caltech.edu;eng.ox.ac.uk;ox.ac.uk", "position": "Postdoc;RAEng Research Fellow;Full Professor", "bibtex": "@misc{\nkiyasseh2022let,\ntitle={Let Your Heart Speak in its Mother Tongue: Multilingual Captioning of Cardiac Signals},\nauthor={Dani Kiyasseh and Tingting Zhu and David A. 
Clifton},\nyear={2022},\nurl={https://openreview.net/forum?id=ZzwfldvDLpC}\n}", "github": "", "project": "", "reviewers": "VuBG;KZQE;VEwE;HBxE", "site": "https://openreview.net/forum?id=ZzwfldvDLpC", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "4;5;4;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "101;67;53;72", "wc_summary_review": "37;60;30;37", "wc_main_review": "719;367;207;423", "wc_review": "857;494;290;532", "wc_reply_reviewers": "330;0;0;147", "wc_reply_authors": "1669;906;478;456", "reply_reviewers": "1;0;0;1", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 73.25, 17.469616481193857 ], "wc_summary_review_avg": [ 41.0, 11.335784048754634 ], "wc_main_review_avg": [ 429.0, 185.24578267804102 ], "wc_review_avg": [ 543.25, 203.1789543727401 ], "wc_reply_reviewers_avg": [ 119.25, 135.67124787514854 ], "wc_reply_authors_avg": [ 877.25, 491.05670497407937 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5443310539518174, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15075419675553666915&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "California Institute of Technology;University of Oxford", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;https://www.ox.ac.uk", "aff_unique_abbr": "Caltech;Oxford", "aff_campus_unique_index": "0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "_2CLeIIYMPd", "title": "Discovering Latent Network Topology in Contextualized Representations with Randomized Dynamic Programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "The discovery of large-scale discrete latent structures is crucial for understanding the fundamental generative processes of language. In this work, we use structured latent variables to study the representation space of contextualized embeddings and gain insight into the hidden topology of pretrained language models. However, existing methods are severely limited by issues of scalability and efficiency, since working with large combinatorial spaces incurs prohibitive memory consumption. We address this challenge by proposing a Randomized Dynamic Programming (RDP) algorithm for the approximate inference of structured models with DP-style exact computation (e.g., Forward-Backward). Our technique samples a subset of DP paths, reducing memory complexity to as little as one percent. We use RDP to analyze the representation space of pretrained language models, discovering a large-scale latent network in a fully unsupervised way. The induced latent states not only serve as anchors marking the topology of the space (neighbors and connectivity), but also reveal linguistic properties related to syntax, morphology, and semantics. 
We also show that traversing this latent network yields unsupervised paraphrase generation.", "keywords": "latent structures;dynamic programming;approximate inference;randomization;memory efficiency;contextualized representations;network topology;paraphrase generation;bertology", "primary_area": "", "supplementary_material": "/attachment/85eebdacb1459ec014c976e03148522a48c8ce34.zip", "author": "Yao Fu;Mirella Lapata", "authorids": "~Yao_Fu3;~Mirella_Lapata1", "gender": "M;F", "homepage": "https://franxyao.github.io/;https://homepages.inf.ed.ac.uk/mlap/", "dblp": ";59/6701", "google_scholar": "liSP4cEAAAAJ;j67B9Q4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yao_Fu3;~Mirella_Lapata1", "aff": "Allen Institute for Artificial Intelligence;Edinburgh University, University of Edinburgh", "aff_domain": "allenai.org;inf.ed.ac.uk", "position": "Intern;Full Professor", "bibtex": "@misc{\nfu2022discovering,\ntitle={Discovering Latent Network Topology in Contextualized Representations with Randomized Dynamic Programming},\nauthor={Yao Fu and Mirella Lapata},\nyear={2022},\nurl={https://openreview.net/forum?id=_2CLeIIYMPd}\n}", "github": "", "project": "", "reviewers": "irMJ;61KB;cUAQ;Cp5o", "site": "https://openreview.net/forum?id=_2CLeIIYMPd", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;4", "correctness": "3;4;2;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "76;151;111;74", "wc_summary_review": "61;30;142;34", "wc_main_review": "359;306;709;648", "wc_review": "496;487;962;756", "wc_reply_reviewers": "45;81;0;0", "wc_reply_authors": "2099;1004;2540;1512", "reply_reviewers": "1;1;0;0", "reply_authors": "4;2;5;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 103.0, 31.376742979474464 ], "wc_summary_review_avg": [ 66.75, 45.05205322735025 ], "wc_main_review_avg": [ 505.5, 175.34323482815068 ], "wc_review_avg": [ 675.25, 197.68330101452676 ], "wc_reply_reviewers_avg": [ 31.5, 33.974254958718376 ], "wc_reply_authors_avg": [ 1788.75, 581.6044080816445 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.25, 1.299038105676658 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.3458572319330373, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pe4xo-XtTBUJ:scholar.google.com/&scioq=Discovering+Latent+Network+Topology+in+Contextualized+Representations+with+Randomized+Dynamic+Programming&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Allen Institute for Artificial Intelligence;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://allenai.org;https://www.ed.ac.uk", "aff_unique_abbr": "AI2;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "_3bwD_KXl5K", "title": "WaveSense: Efficient Temporal Convolutions with Spiking Neural Networks for Keyword Spotting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Ultra-low power local signal processing is a crucial aspect for edge applications on always-on devices. 
\nNeuromorphic processors emulating spiking neural networks show great computational power while meeting the limited power budgets needed in this domain.\nIn this work, we propose spiking neural dynamics as a natural alternative to dilated temporal convolutions. We extend this idea to WaveSense, a spiking neural network inspired by the WaveNet architecture.\nWaveSense uses simple neural dynamics, fixed time-constants, and a simple feed-forward architecture, and hence is particularly well suited for a neuromorphic implementation.\nWe test the capabilities of this model on several datasets for keyword-spotting. The results show that the proposed network beats the state of the art among spiking neural networks and approaches the performance of state-of-the-art artificial neural networks such as CNNs and LSTMs.", "keywords": "spiking;keyword spotting;temporal processing;streaming;audio;neuromorphic;wavenet;wavesense;always-on;low-power;temporal convolution", "primary_area": "", "supplementary_material": "", "author": "Philipp Weidel;Sadique Sheik", "authorids": "~Philipp_Weidel1;~Sadique_Sheik1", "gender": ";M", "homepage": ";", "dblp": ";99/9846", "google_scholar": "yaCN3UQAAAAJ;J5Z-oUwAAAAJ", "orcid": ";0000-0003-0302-8511", "linkedin": ";", "or_profile": "~Philipp_Weidel1;~Sadique_Sheik1", "aff": "SynSense AG;SynSense AI", "aff_domain": "synsense.ai;synsense.ai", "position": "Researcher;Researcher", "bibtex": "@misc{\nweidel2022wavesense,\ntitle={WaveSense: Efficient Temporal Convolutions with Spiking Neural Networks for Keyword Spotting},\nauthor={Philipp Weidel and Sadique Sheik},\nyear={2022},\nurl={https://openreview.net/forum?id=_3bwD_KXl5K}\n}", "github": "", "project": "", "reviewers": "kAcR;u5kv;4EoH;SLVL;nmri", "site": "https://openreview.net/forum?id=_3bwD_KXl5K", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "4;4;4;4;4", "correctness": "2;2;2;2;3", "technical_novelty": "2;3;2;3;2", "empirical_novelty": "2;2;1;2;3", "wc_summary_paper": "37;111;51;103;117", "wc_summary_review": "19;94;10;54;222", "wc_main_review": "110;739;226;352;393", "wc_review": "166;944;287;509;732", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;31;31;31;31", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;1;1;1;1", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 83.8, 33.09622334950017 ], "wc_summary_review_avg": [ 79.8, 77.01532315065619 ], "wc_main_review_avg": [ 364.0, 212.21215799289163 ], "wc_review_avg": [ 527.6, 284.30870545940024 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 24.8, 12.399999999999999 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.8, 0.4000000000000001 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15151752104791996437&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "SynSense AG;SynSense AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.synsense.ag;https://www.synsense.ai", "aff_unique_abbr": ";SynSense AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;China" }, { "id": "_4D8IVs7yO8", "title": "Dense-to-Sparse Gate for 
Mixture-of-Experts", "track": "main", "status": "Reject", "tldr": "", "abstract": "Mixture-of-experts (MoE) is becoming popular due to its success in improving the model quality, especially in Transformers. By routing tokens with a sparse gate to a few experts that each only contains part of the full model, MoE keeps the model size unchanged and significantly reduces per-token computation, which effectively scales neural networks. However, we found that the current approach of jointly training experts and the sparse gate introduces a negative impact on model accuracy, diminishing the efficiency of expensive large-scale model training. In this work, we proposed $\\texttt{Dense-To-Sparse}$ gate (DTS-Gate) for MoE training. Specifically, instead of using a permanent sparse gate, DTS-Gate begins as a dense gate that routes tokens to all experts, then gradually and adaptively becomes sparser while routes to fewer experts. MoE with DTS-Gate naturally decouples the training of experts and the sparse gate by training all experts at first and then learning the sparse gate. Our code is available at https://anonymous.4open.science/r/MoE-3D0D/README.md/README.moe.md.", "keywords": "Deep Learning;Transformer;Mixture of Experts.", "primary_area": "", "supplementary_material": "", "author": "Xiaonan Nie;Shijie Cao;Xupeng Miao;Lingxiao Ma;Jilong Xue;Youshan Miao;Zichao Yang;Zhi Yang;Bin CUI", "authorids": "~Xiaonan_Nie1;~Shijie_Cao1;~Xupeng_Miao1;~Lingxiao_Ma1;jxue@microsoft.com;yomia@microsoft.com;~Zichao_Yang1;~Zhi_Yang4;~Bin_CUI2", "gender": "M;M;M;M;;;M;M;M", "homepage": "https://codecaution.github.io/;https://www.microsoft.com/en-us/research/people/shijiecao/;https://hsword.github.io;https://xysmlx.github.io/;;;;https://yangzhihome.github.io/;https://cuibinpku.github.io/index.html", "dblp": "295/3397;;243/2364;57/3203;;;07/8707;90/5587-1;55/5031.html", "google_scholar": "99LfmxYAAAAJ;StqnQfsAAAAJ;aCAgdYkAAAAJ;AQq30wIAAAAJ;;;https://scholar.google.co.uk/citations?user=siCYLcUAAAAJ;;IJAU8KoAAAAJ", "orcid": ";;0000-0002-9371-8358;;;;;;0000-0003-1681-4677", "linkedin": "https://www.linkedin.com/mwlite/in/\u5c0f\u6960-\u8042-a80b01163;;;;;;;;", "or_profile": "~Xiaonan_Nie1;~Shijie_Cao1;~Xupeng_Miao1;~Lingxiao_Ma1;jxue@microsoft.com;yomia@microsoft.com;~Zichao_Yang1;~Zhi_Yang4;~Bin_CUI2", "aff": "Peking University;Microsoft Research Asia;Peking University;Microsoft Research;;;;Peking University;Peking University", "aff_domain": "pku.edu.cn;microsoft.com;pku.edu.cn;microsoft.com;;;;pku.edu.cn;pku.edu.cn", "position": "PhD student;Researcher;PhD student;Senior Researcher;;;;Associate Professor;Full Professor", "bibtex": "@misc{\nnie2022densetosparse,\ntitle={Dense-to-Sparse Gate for Mixture-of-Experts},\nauthor={Xiaonan Nie and Shijie Cao and Xupeng Miao and Lingxiao Ma and Jilong Xue and Youshan Miao and Zichao Yang and Zhi Yang and Bin CUI},\nyear={2022},\nurl={https://openreview.net/forum?id=_4D8IVs7yO8}\n}", "github": "", "project": "", "reviewers": "hQQS;XPcr;prfk;gUQG;wm4x", "site": "https://openreview.net/forum?id=_4D8IVs7yO8", "pdf_size": 0, "recommendation": "5;5;5;5;6", "confidence": "3;4;3;5;4", "correctness": "2;2;3;3;4", "technical_novelty": "2;2;2;2;4", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "62;63;49;72;65", "wc_summary_review": "28;39;49;163;129", "wc_main_review": "438;575;189;502;359", "wc_review": "528;677;287;737;553", "wc_reply_reviewers": "169;103;75;225;176", "wc_reply_authors": "504;540;833;752;1041", "reply_reviewers": "1;1;1;1;1", "reply_authors": "1;1;2;2;2", 
"recommendation_avg": [ 5.2, 0.39999999999999997 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 62.2, 7.467261881037788 ], "wc_summary_review_avg": [ 81.6, 54.07994082836999 ], "wc_main_review_avg": [ 412.6, 132.57541250171542 ], "wc_review_avg": [ 556.4, 155.277300337171 ], "wc_reply_reviewers_avg": [ 149.6, 53.842733957331696 ], "wc_reply_authors_avg": [ 734.0, 197.43859805012798 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6, 0.4898979485566356 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.1336306209562122, "corr_recommendation_correctness": 0.8017837257372732, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7471219084952011269&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;1;0;0", "aff_unique_norm": "Peking University;Microsoft", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Peking U;MSR Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Capacity of Group-invariant Linear Readouts from Equivariant Representations: How Many Objects can be Linearly Classified Under All Possible Views?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6767", "id": "_4GFbtOuWq-", "poster": "", "openreview": "https://openreview.net/forum?id=_4GFbtOuWq-", "slides": "https://iclr.cc/virtual/2022/poster/6767", "video": "https://iclr.cc/virtual/2022/poster/6767", "author_site": "Matthew Farrell, Blake Bordelon, Shubhendu Trivedi, Cengiz Pehlevan", "tldr": "", "abstract": "Equivariance has emerged as a desirable property of representations of objects subject to identity-preserving transformations that constitute a group, such as translations and rotations. However, the expressivity of a representation constrained by group equivariance is still not fully understood. We address this gap by providing a generalization of Cover's Function Counting Theorem that quantifies the number of linearly separable and group-invariant binary dichotomies that can be assigned to equivariant representations of objects. We find that the fraction of separable dichotomies is determined by the dimension of the space that is fixed by the group action. We show how this relation extends to operations such as convolutions, element-wise nonlinearities, and global and local pooling. While other operations do not change the fraction of separable dichotomies, local pooling decreases the fraction, despite being a highly nonlinear operation. 
Finally, we test our theory on intermediate representations of randomly initialized and fully trained convolutional neural networks and find perfect agreement.", "keywords": "representation learning;perceptron capacity;perceptual manifolds;equivariance;cover's theorem;vc dimension", "primary_area": "", "supplementary_material": "/attachment/057dd8a28ab2af3f95979aee3b2e09f91c7ac806.zip", "author": "Matthew Farrell;Blake Bordelon;Shubhendu Trivedi;Cengiz Pehlevan", "authorids": "~Matthew_Farrell1;~Blake_Bordelon1;~Shubhendu_Trivedi2;~Cengiz_Pehlevan2", "gender": "M;;M;M", "homepage": "https://blakebordelon.github.io/;https://pehlevan.seas.harvard.edu/;;https://ttic.uchicago.edu/~shubhendu/", "dblp": "228/6993;145/3480;;97/9735", "google_scholar": "yeQ8_pgAAAAJ;veDLTPEAAAAJ;H3qtlSEAAAAJ;EbyGwncAAAAJ", "orcid": "0000-0003-0455-9445;0000-0001-9767-6063;0000-0001-8359-8666;", "linkedin": ";;;", "or_profile": "~Blake_Bordelon1;~Cengiz_Pehlevan2;~Matthew_Stuart_Farrell1;~Shubhendu_Trivedi1", "aff": "Harvard University;School of Engineering and Applied Sciences, Harvard University;Harvard University;Massachusetts Institute of Technology", "aff_domain": "harvard.edu;seas.harvard.edu;harvard.edu;mit.edu", "position": "PhD student;Assistant Professor;Postdoc;Research Associate", "bibtex": "@inproceedings{\nfarrell2022capacity,\ntitle={Capacity of Group-invariant Linear Readouts from Equivariant Representations: How Many Objects can be Linearly Classified Under All Possible Views?},\nauthor={Matthew Farrell and Blake Bordelon and Shubhendu Trivedi and Cengiz Pehlevan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_4GFbtOuWq-}\n}", "github": "", "project": "", "reviewers": "zVT9;s9Dq;hH77;bab9", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;3;3", "correctness": "4;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "0;3;2;3", "wc_summary_paper": "91;148;62;79", "wc_summary_review": "73;40;37;42", "wc_main_review": "154;460;194;552", "wc_review": "318;648;293;673", "wc_reply_reviewers": "0;323;18;65", "wc_reply_authors": "840;2472;1101;2059", "reply_reviewers": "0;2;1;1", "reply_authors": "2;6;2;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 95.0, 32.28776858192588 ], "wc_summary_review_avg": [ 48.0, 14.543039572248986 ], "wc_main_review_avg": [ 340.0, 169.74687036879354 ], "wc_review_avg": [ 483.0, 177.93959649274245 ], "wc_reply_reviewers_avg": [ 101.5, 130.06632923243433 ], "wc_reply_authors_avg": [ 1618.0, 670.1436413187847 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.25, 1.6393596310755 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14636029131782047015&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=_4GFbtOuWq-", "email": "harvard.edu;seas.harvard.edu;harvard.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Harvard University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.harvard.edu;https://web.mit.edu", "aff_unique_abbr": "Harvard;MIT", 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "_55bCXzj3D9", "title": "Exploring and Evaluating Personalized Models for Code Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large Transformer models achieved the state-of-the-art status for Natural Language Understanding and are increasingly the baseline architecture for source code generation models. Transformers are usually pre-trained on a large unsupervised corpus, learning token representations and transformations relevant to modeling generally available text, and then fine-tuned on a particular task of interest. While fine-tuning is a tried-and-true method for adapting a model to a new domain, for example question-answering on a given topic or a source code generation model, generalization remains an on-going challenge. Here we explore the ability of various levels of model fine-tuning to improve generalization by personalized fine-tuning. In the context of generating unit tests for Java methods, here we evaluate learning to personalize to a specific project using several methods to personalize transformer models for unit test generation for a specific Java project. We consider three fine-tuning approaches: (i) custom fine-tuning, which allows all the model parameters to be tuned; (ii) lightweight fine-tuning, which freezes most of the model's parameters, allowing a tuning of the token embeddings and softmax layer or the final layer alone; (iii) prefix tuning, which keeps language model parameters frozen, but optimizes a small project-specific prefix vector. Each of these techniques offers a different trade-off in total compute cost and prediction performance, which we evaluate by code and task-specific metrics, training time, and total computational operations. 
We compare these fine-tuning strategies for code generation and discuss the potential generalization and cost benefits of each in deployment scenarios.", "keywords": "code generation;custom models;NLP", "primary_area": "", "supplementary_material": "", "author": "Andrei Zlotchevski;Dawn Drain;Alexey Svyatkovskiy;Colin Clement;Neel Sundaresan;Michele Tufano", "authorids": "andrei.zlotchevski@mail.mcgill.ca;dawn@anthropic.com;~Alexey_Svyatkovskiy1;~Colin_Clement1;~Neel_Sundaresan3;~Michele_Tufano1", "gender": ";;M;;;M", "homepage": ";;https://www.microsoft.com/en-us/research/people/alsvyatk/;https://cbclement.com;https://www.linkedin.com/in/neel-sundaresan-a964a2/;https://tufanomichele.com/", "dblp": ";;198/0454;;s/NeelSundaresan.html;166/4957", "google_scholar": ";;0Oj4J4wAAAAJ;J2aZLEYAAAAJ;;KmeqxSEAAAAJ", "orcid": ";;0000-0001-7714-4481;0000-0002-3727-7308;;", "linkedin": ";;;colin-b-clement/;neel-sundaresan-a964a2/;michele-tufano-3a854050/", "or_profile": "andrei.zlotchevski@mail.mcgill.ca;dawn@anthropic.com;~Alexey_Svyatkovskiy1;~Colin_Clement1;~Neel_Sundaresan3;~Michele_Tufano1", "aff": ";;Microsoft;Microsoft;University of California, Santa Cruz;Microsoft", "aff_domain": ";;microsoft.com;microsoft.com;ucsc.edu;microsoft.com", "position": ";;Principal Researcher;Senior Research Manager;Full Professor (adjunct);Senior Research Scientist", "bibtex": "@misc{\nzlotchevski2022exploring,\ntitle={Exploring and Evaluating Personalized Models for Code Generation},\nauthor={Andrei Zlotchevski and Dawn Drain and Alexey Svyatkovskiy and Colin Clement and Neel Sundaresan and Michele Tufano},\nyear={2022},\nurl={https://openreview.net/forum?id=_55bCXzj3D9}\n}", "github": "", "project": "", "reviewers": "v2C4;bTU6;BJx7;1jM4", "site": "https://openreview.net/forum?id=_55bCXzj3D9", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;5;5", "correctness": "2;3;3;3", "technical_novelty": "1;1;1;1", "empirical_novelty": "2;1;1;2", "wc_summary_paper": "74;49;70;136", "wc_summary_review": "85;14;39;94", "wc_main_review": "743;399;226;652", "wc_review": "902;462;335;882", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 82.25, 32.45285041410076 ], "wc_summary_review_avg": [ 58.0, 32.8709598277872 ], "wc_main_review_avg": [ 505.0, 204.52994890724437 ], "wc_review_avg": [ 645.25, 250.90174869856926 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12971368750694612529&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Microsoft;University of California, Santa Cruz", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.ucsc.edu", "aff_unique_abbr": "Microsoft;UCSC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Towards Evaluating the Robustness of Neural Networks 
Learned by Transduction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6995", "id": "_5js_8uTrx1", "poster": "", "openreview": "https://openreview.net/forum?id=_5js_8uTrx1", "slides": "https://iclr.cc/virtual/2022/poster/6995", "video": "https://iclr.cc/virtual/2022/poster/6995", "author_site": "Jiefeng Chen, Xi Wu, Yang Guo, Yingyu Liang, Somesh Jha", "tldr": "", "abstract": "There has been emerging interest in using transductive learning for adversarial robustness (Goldwasser et al., NeurIPS 2020; Wu et al., ICML 2020; Wang et al., ArXiv 2021). Compared to traditional defenses, these defense mechanisms \"dynamically learn\" the model based on test-time input; and theoretically, attacking these defenses reduces to solving a bilevel optimization problem, which poses difficulty in crafting adaptive attacks. In this paper, we examine these defense mechanisms from a principled threat analysis perspective. We formulate and analyze threat models for transductive-learning based defenses, and point out important subtleties. We propose the principle of attacking model space for solving bilevel attack objectives, and present Greedy Model Space Attack (GMSA), an attack framework that can serve as a new baseline for evaluating transductive-learning based defenses. Through systematic evaluation, we show that GMSA, even with weak instantiations, can break previous transductive-learning based defenses, which were resilient to previous attacks, such as AutoAttack (Croce and Hein, ICML 2020). On the positive side, we report a somewhat surprising empirical result of \"transductive adversarial training\": Adversarially retraining the model using fresh randomness at the test time gives a significant increase in robustness against attacks we consider.", "keywords": "adversarial robustness;transductive learning;test-time defense;dynamic defense;attacking model spaces", "primary_area": "", "supplementary_material": "/attachment/e8d0e1b51f2f9e01fb2986f13326f11fa0b11c05.zip", "author": "Jiefeng Chen;Xi Wu;Yang Guo;Yingyu Liang;Somesh Jha", "authorids": "~Jiefeng_Chen2;~Xi_Wu1;~Yang_Guo4;~Yingyu_Liang1;~Somesh_Jha1", "gender": "M;M;M;;M", "homepage": "https://jfc43.github.io/;http://andrewxiwu.github.io/;;;", "dblp": "199/3381;37/4465-1;;;j/SomeshJha", "google_scholar": "5mOfQfAAAAAJ;OmmxazMAAAAJ;BbQQEPcAAAAJ;;BaI7l8QAAAAJ", "orcid": ";;;;", "linkedin": "jiefeng-chen-aa1769122/;;;;", "or_profile": "~Jiefeng_Chen2;~Xi_Wu1;~Yang_Guo4;~Yingyu_Liang1;~Somesh_Jha1", "aff": "University of Wisconsin, Madison;Google;Department of Computer Science, University of Wisconsin, Madison;;Department of Computer Science, University of Wisconsin, Madison", "aff_domain": "wisc.edu;google.com;cs.wisc.edu;;cs.wisc.edu", "position": "PhD student;Software Engineer;PhD student;;Full Professor", "bibtex": "@inproceedings{\nchen2022towards,\ntitle={Towards Evaluating the Robustness of Neural Networks Learned by Transduction},\nauthor={Jiefeng Chen and Xi Wu and Yang Guo and Yingyu Liang and Somesh Jha},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_5js_8uTrx1}\n}", "github": "", "project": "", "reviewers": "WKKJ;2Lpz;q4Dr;7duz", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "38;73;76;159", "wc_summary_review": "21;35;13;96", "wc_main_review": "163;468;175;414", "wc_review": "222;576;264;669", 
"wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "544;1126;135;550", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.5, 44.44378471732578 ], "wc_summary_review_avg": [ 41.25, 32.575872973720905 ], "wc_main_review_avg": [ 305.0, 137.39905385409318 ], "wc_review_avg": [ 432.75, 193.14939166355146 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 588.75, 352.8564686951339 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10802124604610826531&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=_5js_8uTrx1", "email": "wisc.edu;google.com;cs.wisc.edu;;cs.wisc.edu", "author_num": 5, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "University of Wisconsin;Google;University of Wisconsin-Madison", "aff_unique_dep": ";Google;Department of Computer Science", "aff_unique_url": "https://www.wisc.edu;https://www.google.com;https://www.wisc.edu", "aff_unique_abbr": "UW;Google;UW-Madison", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Madison;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "_67HnXYixmN", "title": "Nested Policy Reinforcement Learning for Clinical Decision Support", "track": "main", "status": "Reject", "tldr": "", "abstract": "Off-policy reinforcement learning (RL) has proven to be a powerful framework for guiding agents' actions in environments with stochastic rewards and unknown or noisy state dynamics. In many real-world settings, these agents must operate in multiple environments, each with slightly different dynamics. For example, we may be interested in developing policies to guide medical treatment for patients with and without a given disease, or policies to navigate curriculum design for students with and without a learning disability. Here, we introduce nested policy fitted Q-iteration (NFQI), an RL framework that finds optimal policies in environments that exhibit such a structure. Our approach develops a nested $Q$-value function that takes advantage of the shared structure between two groups of observations from two separate environments while allowing their policies to be distinct from one another. We find that NFQI yields policies that rely on relevant features and perform at least as well as a policy that does not consider group structure. We demonstrate NFQI's performance using an OpenAI Gym environment and a clinical decision making RL task. Our results suggest that NFQI can develop policies that are better suited to many real-world clinical environments. 
", "keywords": "machine learning for healthcare;reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/3688860e608282d0971298872b9abcb9ba6eec78.zip", "author": "Aishwarya Mandyam;Andrew Jones;Krzysztof Laudanski;Barbara Engelhardt", "authorids": "~Aishwarya_Mandyam1;~Andrew_Jones6;~Krzysztof_Laudanski1;~Barbara_Engelhardt1", "gender": "F;M;;F", "homepage": "http://aishwarya-rm.github.io;https://andrewcharlesjones.github.io/;https://www.med.upenn.edu/apps/faculty/index.php/g275/p8399133;https://beehive.stanford.edu", "dblp": "298/6915;;;27/2355", "google_scholar": "Fd9lH-sAAAAJ;;;https://scholar.google.com.tw/citations?user=VEGtG7YAAAAJ", "orcid": ";;;", "linkedin": "aishwaryamandyam/;;;", "or_profile": "~Aishwarya_Mandyam1;~Andrew_Jones6;~Krzysztof_Laudanski1;~Barbara_Engelhardt1", "aff": "Princeton University;Princeton University;University of Pennsylvania;Princeton University", "aff_domain": "princeton.edu;princeton.edu;upenn.edu;princeton.edu", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nmandyam2022nested,\ntitle={Nested Policy Reinforcement Learning for Clinical Decision Support},\nauthor={Aishwarya Mandyam and Andrew Jones and Krzysztof Laudanski and Barbara Engelhardt},\nyear={2022},\nurl={https://openreview.net/forum?id=_67HnXYixmN}\n}", "github": "", "project": "", "reviewers": "rPvd;KSWV;gMUj;EMFG", "site": "https://openreview.net/forum?id=_67HnXYixmN", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;2", "correctness": "2;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "80;142;175;134", "wc_summary_review": "105;31;83;87", "wc_main_review": "583;1626;1275;98", "wc_review": "768;1799;1533;319", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 132.75, 34.11286414243166 ], "wc_summary_review_avg": [ 76.5, 27.54541704167864 ], "wc_main_review_avg": [ 895.5, 593.9934763951537 ], "wc_review_avg": [ 1104.75, 590.8013096634096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Of3bPcx6-nMJ:scholar.google.com/&scioq=Nested+Policy+Reinforcement+Learning+for+Clinical+Decision+Support&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Princeton University;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.upenn.edu", "aff_unique_abbr": "Princeton;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "_7YnfGdDVML", "title": "DCoM: A Deep Column Mapper for Semantic Data Type Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Detection of semantic data types is a very crucial task in data science for automated data cleaning, schema matching, data discovery, semantic data type normalization and sensitive data 
identification. Existing methods include regular-expression-based or dictionary-lookup-based methods that are not robust to dirty or unseen data and can predict only a small number of semantic data types. Existing Machine Learning methods extract a large number of engineered features from data and build logistic regression, random forest, or feedforward neural network models for this purpose. In this paper, we introduce DCoM, a collection of multi-input NLP-based deep neural networks to detect semantic data types, where, instead of extracting a large number of features from the data, we feed the raw values of columns (or instances) to the model as text. We train DCoM on 686,765 data columns extracted from the VizNet corpus with 78 different semantic data types. DCoM outperforms other contemporary methods by a significant margin on the same dataset, achieving a support-weighted F1 score of 0.925. ", "keywords": "Semantic Data Type Detection;Machine Learning;Natural Language Processing;Semantic Column Tagging;Sensitive Data Detection;Column Search", "primary_area": "", "supplementary_material": "/attachment/9c7852e32872fba92c78fb82ea32d09c3ed39c6e.zip", "author": "Subhadip Maji;Swapna sourav Rout;Sudeep Choudhary", "authorids": "~Subhadip_Maji1;~Swapna_sourav_Rout1;~Sudeep_Choudhary1", "gender": "M;M;M", "homepage": ";;", "dblp": "259/1375;;", "google_scholar": "https://scholar.google.co.in/citations?hl=en;4Kk-P-AAAAAJ;", "orcid": "0000-0002-8802-1572;;0000-0002-8285-9008", "linkedin": "subhadip-maji-66283810a/;;https://linkedin.com/in/sudeep-choudhary-9b9aa7138", "or_profile": "~Subhadip_Maji1;~Swapna_sourav_Rout1;~Sudeep_Choudhary1", "aff": "Indian Institute of Technology, Kanpur;International Institute of Information Technology Bangalore;", "aff_domain": "iitk.ac.in;iiitb.ac.in;", "position": "PhD student;MS student;", "bibtex": "@misc{\nmaji2022dcom,\ntitle={{DC}oM: A Deep Column Mapper for Semantic Data Type Detection},\nauthor={Subhadip Maji and Swapna sourav Rout and Sudeep Choudhary},\nyear={2022},\nurl={https://openreview.net/forum?id=_7YnfGdDVML}\n}", "github": "", "project": "", "reviewers": "MWJi;S7C6;S4in", "site": "https://openreview.net/forum?id=_7YnfGdDVML", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "5;4;4", "correctness": "2;2;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "69;141;87", "wc_summary_review": "37;32;43", "wc_main_review": "323;164;369", "wc_review": "429;337;499", "wc_reply_reviewers": "308;0;120", "wc_reply_authors": "230;51;217", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 99.0, 30.59411708155671 ], "wc_summary_review_avg": [ 37.333333333333336, 4.4969125210773475 ], "wc_main_review_avg": [ 285.3333333333333, 87.82684985558548 ], "wc_review_avg": [ 421.6666666666667, 66.33919572084734 ], "wc_reply_reviewers_avg": [ 142.66666666666666, 126.75786190826807 ], "wc_reply_authors_avg": [ 166.0, 81.49028571962836 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 1.0, 
"gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10153125146425145115&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Indian Institute of Technology Kanpur;International Institute of Information Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitk.ac.in;https://iiitb.ac.in", "aff_unique_abbr": "IIT Kanpur;IIITB", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Kanpur;Bangalore", "aff_country_unique_index": "0;0", "aff_country_unique": "India" }, { "id": "_B8Jd7Nqs7R", "title": "Improved Generalization Bound for Deep Neural Networks Using Geometric Functional Analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding how a neural network behaves in multiple domains is the key to further its explainability, generalizability, and robustness. In this paper, we prove a novel generalization bound using the fundamental concepts of geometric functional analysis. Specifically, by leveraging the covering number of the training dataset and applying certain geometric inequalities we show that a sharp bound can be obtained. To the best of our knowledge this is the first approach which utilizes covering numbers to estimate such generalization bounds.", "keywords": "Generalization bounds;Geometric functional analysis", "primary_area": "", "supplementary_material": "/attachment/1eca2c545764321e8a25f65ebed8933fda741476.zip", "author": "Phani raj Chinnalingu;Rajarshi Banerjee", "authorids": "~Phani_raj_Chinnalingu1;~Rajarshi_Banerjee2", "gender": "M;M", "homepage": "https://ece.iisc.ac.in/~nextgenwrl/Members.html;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";rajarshi-banerjee-88170b191/", "or_profile": "~Phani_raj_Chinnalingu1;~Rajarshi_Banerjee2", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchinnalingu2022improved,\ntitle={Improved Generalization Bound for Deep Neural Networks Using Geometric Functional Analysis},\nauthor={Phani raj Chinnalingu and Rajarshi Banerjee},\nyear={2022},\nurl={https://openreview.net/forum?id=_B8Jd7Nqs7R}\n}", "github": "", "project": "", "reviewers": "YPEk;3Prr;FW5M;SWDb;nLh5", "site": "https://openreview.net/forum?id=_B8Jd7Nqs7R", "pdf_size": 0, "recommendation": "1;3;3;5;6", "confidence": "4;4;3;4;3", "correctness": "1;3;2;4;4", "technical_novelty": "2;2;2;2;4", "empirical_novelty": "2;2;0;2;2", "wc_summary_paper": "44;66;70;54;22", "wc_summary_review": "64;41;17;28;49", "wc_main_review": "422;259;548;103;183", "wc_review": "530;366;635;185;254", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;146;0;74;134", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;1;0;1;1", "recommendation_avg": [ 3.6, 1.7435595774162693 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 1.16619037896906 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 1.6, 0.8 ], "wc_summary_paper_avg": [ 51.2, 17.232527382830416 ], "wc_summary_review_avg": [ 39.8, 16.31441080762649 ], "wc_main_review_avg": [ 303.0, 161.5685612982922 ], "wc_review_avg": [ 394.0, 167.71523484764288 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 70.8, 62.74519901952659 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6, 0.48989794855663565 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.42146361521176234, "corr_recommendation_correctness": 0.9442673704375604, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:Pm4CfWbszmIJ:scholar.google.com/&scioq=Improved+Generalization+Bound+for+Deep+Neural+Networks+Using+Geometric+Functional+Analysis&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "PriorGrad: Improving Conditional Denoising Diffusion Models with Data-Dependent Adaptive Prior", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6445", "id": "_BNiN4IjC5", "poster": "", "openreview": "https://openreview.net/forum?id=_BNiN4IjC5", "slides": "https://iclr.cc/virtual/2022/poster/6445", "video": "https://iclr.cc/virtual/2022/poster/6445", "author_site": "Sang-gil Lee, Heeseung Kim, Chaehun Shin, Xu Tan, Chang Liu, Qi Meng, Tao Qin, Wei Chen, Sungroh Yoon, Tie-Yan Liu", "tldr": "", "abstract": "Denoising diffusion probabilistic models have been recently proposed to generate high-quality samples by estimating the gradient of the data density. The framework assumes the prior noise as a standard Gaussian distribution, whereas the corresponding data distribution may be more complicated than the standard Gaussian distribution, which potentially introduces inefficiency in denoising the prior noise into the data sample because of the discrepancy between the data and the prior. In this paper, we propose PriorGrad to improve the efficiency of the conditional diffusion model (for example, a vocoder using a mel-spectrogram as the condition) by applying an adaptive prior derived from the data statistics based on the conditional information. We formulate the training and sampling procedures of PriorGrad and demonstrate the advantages of an adaptive prior through a theoretical analysis. Focusing on the audio domain, we consider the recently proposed diffusion-based audio generative models based on both the spectral and time domains and show that PriorGrad achieves faster convergence and superior performance, leading to an improved perceptual quality and tolerance to a smaller network capacity, and thereby demonstrating the efficiency of a data-dependent adaptive prior.", "keywords": "diffusion-based model;generative model;speech synthesis", "primary_area": "", "supplementary_material": "", "author": "Sang-gil Lee;Heeseung Kim;Chaehun Shin;Xu Tan;Chang Liu;Qi Meng;Tao Qin;Wei Chen;Sungroh Yoon;Tie-Yan Liu", "authorids": "~Sang-gil_Lee1;~Heeseung_Kim1;~Chaehun_Shin1;~Xu_Tan1;~Chang_Liu10;~Qi_Meng1;~Tao_Qin1;~Wei_Chen1;~Sungroh_Yoon1;~Tie-Yan_Liu1", "gender": "M;M;M;M;M;F;M;F;;M", "homepage": "https://github.com/L0SG;https://gmltmd789.github.io;https://github.com/chaehunshin;https://tan-xu.github.io/;https://changliu00.github.io/;;https://www.microsoft.com/en-us/research/people/taoqin/;https://weichen-cas.github.io/;http://ailab.snu.ac.kr;http://member.acm.org/~tieyanliu", "dblp": "190/7789;294/8710;287/9294;96/10484-3;52/5716-30;;14/6841;;99/1474;l/TieYanLiu", "google_scholar": "P93s2UQAAAAJ;4ojbJpoAAAAJ;M8RX0MEAAAAJ;tob-U1oAAAAJ;rYd0GEsAAAAJ;t-z3K34AAAAJ;Bl4SRU0AAAAJ;https://scholar.google.com/citations?hl=en;Bphl_fIAAAAJ;Nh832fgAAAAJ", "orcid": "0000-0002-1981-056X;;;0000-0001-5631-0639;0000-0001-5207-5440;;;;0000-0002-2367-197X;0000-0002-0476-8020", "linkedin": "sang-gil-lee/;gmltmd789/;;;chang-liu-9ab479168/;;;;;", "or_profile": "~Sang-gil_Lee1;~Heeseung_Kim1;~Chaehun_Shin1;~Xu_Tan1;~Chang_Liu10;~Qi_Meng1;~Tao_Qin1;~Wei_Chen1;~Sungroh_Yoon1;~Tie-Yan_Liu1", "aff": "NVIDIA;Seoul National University;Seoul National University;Microsoft;Microsoft;Microsoft;Microsoft Research Asia; Chinese Academy of Sciences;Seoul National 
University;Microsoft", "aff_domain": "nvidia.com;snu.ac.kr;snu.ac.kr;microsoft.com;microsoft.com;microsoft.com;microsoft.com;ict.ac.cn;snu.ac.kr;microsoft.com", "position": "Internship;PhD student;PhD student;Principal Researcher;Researcher;associate researcher;Principal Researcher;Full Professor;Full Professor;Distinguished Scientist", "bibtex": "@inproceedings{\nlee2022priorgrad,\ntitle={PriorGrad: Improving Conditional Denoising Diffusion Models with Data-Dependent Adaptive Prior},\nauthor={Sang-gil Lee and Heeseung Kim and Chaehun Shin and Xu Tan and Chang Liu and Qi Meng and Tao Qin and Wei Chen and Sungroh Yoon and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_BNiN4IjC5}\n}", "github": "", "project": "", "reviewers": "518v;Wdw2;spyq;D8iM", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;3;4", "correctness": "3;3;3;1", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "88;161;72;19", "wc_summary_review": "90;123;32;83", "wc_main_review": "646;760;482;247", "wc_review": "824;1044;586;349", "wc_reply_reviewers": "29;0;0;146", "wc_reply_authors": "1265;669;1088;1525", "reply_reviewers": "1;0;0;2", "reply_authors": "2;1;2;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.0, 50.76908508137605 ], "wc_summary_review_avg": [ 82.0, 32.58066911528982 ], "wc_main_review_avg": [ 533.75, 192.8034945222726 ], "wc_review_avg": [ 700.75, 259.7627523337401 ], "wc_reply_reviewers_avg": [ 43.75, 60.20952997657431 ], "wc_reply_authors_avg": [ 1136.75, 311.58977438292163 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 140, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=279374286400814704&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=_BNiN4IjC5", "email": "nvidia.com;snu.ac.kr;snu.ac.kr;microsoft.com;microsoft.com;microsoft.com;microsoft.com;ict.ac.cn;snu.ac.kr;microsoft.com", "author_num": 10, "aff_unique_index": "0;1;1;2;2;2;2;3;1;2", "aff_unique_norm": "NVIDIA;Seoul National University;Microsoft;Chinese Academy of Sciences", "aff_unique_dep": "NVIDIA Corporation;;Microsoft Corporation;", "aff_unique_url": "https://www.nvidia.com;https://www.snu.ac.kr;https://www.microsoft.com;https://www.cas.cn", "aff_unique_abbr": "NVIDIA;SNU;Microsoft;CAS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;0;0;0;2;2;1;0", "aff_country_unique": "United States;South Korea;China" }, { "title": "F8Net: Fixed-Point 8-bit Only Multiplication for Network Quantization", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5943", "id": "_CfpJazzXT2", "poster": "", "openreview": "https://openreview.net/forum?id=_CfpJazzXT2", "slides": "https://iclr.cc/virtual/2022/poster/5943", "video": "https://iclr.cc/virtual/2022/poster/5943", "author_site": "Qing Jin, Jian Ren, Richard Zhuang, Sumant Hanumante, Zhengang Li, Zhiyu Chen, Yanzhi Wang, Kaiyuan Yang, Sergey Tulyakov", "tldr": "", "abstract": "Neural network quantization is a promising compression technique 
to reduce memory footprint and save energy consumption, potentially leading to real-time inference. However, there is a performance gap between quantized and full-precision models. To reduce it, existing quantization approaches require high-precision INT32 or full-precision multiplication during inference for scaling or dequantization. This introduces a noticeable cost in terms of memory, speed, and required energy. To tackle these issues, we present F8Net, a novel quantization framework consisting of only fixed-point 8-bit multiplication. To derive our method, we first discuss the advantages of fixed-point multiplication with different formats of fixed-point numbers and study the statistical behavior of the associated fixed-point numbers. Second, based on the statistical and algorithmic analysis, we apply different fixed-point formats for weights and activations of different layers. We introduce a novel algorithm to automatically determine the right format for each layer during training. Third, we analyze a previous quantization algorithm\u2014parameterized clipping activation (PACT)\u2014and reformulate it using fixed-point arithmetic. Finally, we unify the recently proposed method for quantization fine-tuning and our fixed-point approach to show the potential of our method. We verify F8Net on ImageNet for MobileNet V1/V2 and ResNet18/50. Our approach achieves comparable or better performance when compared not only to existing quantization techniques with INT32 multiplication or floating point arithmetic, but also to the full-precision counterparts, achieving state-of-the-art performance.", "keywords": "Neural Network Quantization;Fixed-Point Arithmetic", "primary_area": "", "supplementary_material": "", "author": "Qing Jin;Jian Ren;Richard Zhuang;Sumant Hanumante;Zhengang Li;Zhiyu Chen;Yanzhi Wang;Kaiyuan Yang;Sergey Tulyakov", "authorids": "~Qing_Jin1;~Jian_Ren2;rzhuang@snapchat.com;shanumante@snapchat.com;~Zhengang_Li2;~Zhiyu_Chen3;~Yanzhi_Wang3;~Kaiyuan_Yang1;~Sergey_Tulyakov1", "gender": ";M;;;M;M;M;;M", "homepage": ";https://alanspike.github.io/;;;;;https://web.northeastern.edu/yanzhiwang/;https://vlsi.rice.edu;http://www.stulyakov.com/", "dblp": "37/11144;59/2180-5;;;https://dblp.uni-trier.de/pers/hd/l/Li:Zhengang;;;;40/6115", "google_scholar": "X9iggBcAAAAJ;https://scholar.google.co.jp/citations?user=vDALiU4AAAAJ;;;;;https://scholar.google.com/citations?hl=en;;mgzXR0sAAAAJ", "orcid": "0000-0001-8795-9297;;;;;;;;", "linkedin": ";;;;;zhiyu-chen-720445179/;;;sergeytulyakov/", "or_profile": "~Qing_Jin1;~Jian_Ren2;rzhuang@snapchat.com;shanumante@snapchat.com;~Zhengang_Li2;~Zhiyu_Chen3;~Yanzhi_Wang3;~Kaiyuan_Yang1;~Sergey_Tulyakov1", "aff": "Northeastern University;Snap Inc.;;;Northeastern University;Rice University;Northeastern University;Rice University;", "aff_domain": "northeastern.edu;snapchat.com;;;neu.edu;rice.edu;northeastern.edu;rice.edu;", "position": "PhD Student;Research Scientist;;;PhD student;PhD student;Associate Professor;Assistant Professor;", "bibtex": "@inproceedings{\njin2022fnet,\ntitle={F8Net: Fixed-Point 8-bit Only Multiplication for Network Quantization},\nauthor={Qing Jin and Jian Ren and Richard Zhuang and Sumant Hanumante and Zhengang Li and Zhiyu Chen and Yanzhi Wang and Kaiyuan Yang and Sergey Tulyakov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_CfpJazzXT2}\n}", "github": "", "project": "", "reviewers": "1v2J;xQht;jedd;7TRc", "pdf_size": 
0, "recommendation": "5;5;6;10", "confidence": "5;4;4;5", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;2;0;4", "wc_summary_paper": "95;68;109;105", "wc_summary_review": "48;46;166;49", "wc_main_review": "294;329;291;346", "wc_review": "437;443;566;500", "wc_reply_reviewers": "115;408;52;0", "wc_reply_authors": "1760;2513;1463;195", "reply_reviewers": "1;3;1;0", "reply_authors": "5;8;4;1", "recommendation_avg": [ 6.5, 2.0615528128088303 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 94.25, 15.990231392947383 ], "wc_summary_review_avg": [ 77.25, 51.2512194976861 ], "wc_main_review_avg": [ 315.0, 23.313086453749534 ], "wc_review_avg": [ 486.5, 52.06966487312935 ], "wc_reply_reviewers_avg": [ 143.75, 157.90562846206592 ], "wc_reply_authors_avg": [ 1482.75, 836.2076222446193 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 4.5, 2.5 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.48507125007266594, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9661231870650652462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=_CfpJazzXT2", "email": "northeastern.edu;snapchat.com;;;neu.edu;rice.edu;northeastern.edu;rice.edu;", "author_num": 9, "aff_unique_index": "0;1;0;2;0;2", "aff_unique_norm": "Northeastern University;Snap Inc.;Rice University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northeastern.edu;https://www.snapinc.com;https://www.rice.edu", "aff_unique_abbr": "NEU;Snap;Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "_DqUHcsQfaE", "title": "Inference-Time Personalized Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In Federated learning (FL), multiple clients collaborate to learn a model through a central server but keep the data decentralized. Personalized federated learning (PFL) further extends FL to handle data heterogeneity between clients by learning personalized models. In both FL and PFL, all clients participate in the training process and their labeled data is used for training. However, in reality, novel clients may wish to join a prediction service after it has been deployed, obtaining predictions for their own unlabeled data.\n \nHere, we defined a new learning setup, Inference-Time PFL (IT-PFL), where a model trained on a set of clients, needs to be later evaluated on novel unlabeled clients at inference time. We propose a novel approach to this problem IT-PFL-HN, based on a hypernetwork module and an encoder module. Specifically, we train an encoder network that learns a representation for a client given its unlabeled data. That client representation is fed to a hypernetwork that generates a personalized model for that client. Evaluated on four benchmark datasets, we find that IT-PFL-HN generalizes better than current FL and PFL methods, especially when the novel client has a large domain shift. We also analyzed the generalization error for the novel client, showing how it can be bounded using results from multi-task learning and domain adaptation. 
Finally, since novel clients do not contribute their data to training, they can potentially have better control over their data privacy; indeed, we show analytically and experimentally how novel clients can apply differential privacy to their data. ", "keywords": "Federated learning;Personalized federated learning;hypernetworks", "primary_area": "", "supplementary_material": "", "author": "Ohad Amosy;Gal Eyal;Gal Chechik", "authorids": "~Ohad_Amosy1;galeyal10@gmail.com;~Gal_Chechik1", "gender": ";;", "homepage": ";;https://chechiklab.biu.ac.il/~gal/", "dblp": ";;c/GalChechik", "google_scholar": ";;Wk2gAZUAAAAJ", "orcid": ";;0000-0001-9164-5303", "linkedin": ";;", "or_profile": "~Ohad_Amosy1;galeyal10@gmail.com;~Gal_Chechik1", "aff": "Bar Ilan University, Technion;;NVIDIA", "aff_domain": "biu.ac.il;;nvidia.com", "position": "PhD student;;Principal Researcher", "bibtex": "@misc{\namosy2022inferencetime,\ntitle={Inference-Time Personalized Federated Learning},\nauthor={Ohad Amosy and Gal Eyal and Gal Chechik},\nyear={2022},\nurl={https://openreview.net/forum?id=_DqUHcsQfaE}\n}", "github": "", "project": "", "reviewers": "pXWe;GyWf;MpJV;o4oc", "site": "https://openreview.net/forum?id=_DqUHcsQfaE", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;4", "correctness": "4;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "59;47;129;49", "wc_summary_review": "50;4;70;31", "wc_main_review": "236;121;244;261", "wc_review": "345;172;443;341", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 71.0, 33.793490497431605 ], "wc_summary_review_avg": [ 38.75, 24.34517405975977 ], "wc_main_review_avg": [ 215.5, 55.30144663568938 ], "wc_review_avg": [ 325.25, 97.45351455950679 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8124057957124636770&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Bar-Ilan University;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.biu.ac.il;https://www.nvidia.com", "aff_unique_abbr": "BIU;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Israel;United States" }, { "id": "_ERVcPna8IP", "title": "Can network pruning benefit deep learning under label noise?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Network pruning is a widely-used technique to reduce the computational cost of over-parameterized neural networks. Conventional wisdom also regards pruning as a way to improve generalization: by zeroing out parameters, pruning reduces model capacity and prevents overfitting. However, this wisdom has been challenged by a line of recent studies, which show that over-parameterization actually helps generalization. 
In this work, we demonstrate the existence of a novel double descent phenomenon in sparse regimes, namely, in the presence of label noise, medium sparsity induced by pruning hurts model performance, while high sparsity benefits it. Through extensive experiments on noisy versions of MNIST, CIFAR-10 and CIFAR-100, we show that proper pruning consistently provides non-trivial robustness against label noise, which offers a new lens for studying network pruning. Further, we reassess some common beliefs concerning the generalization of sparse networks, and hypothesize that it is the distance from initialization that is key to robustness rather than sharpness/flatness. Experimental results support this hypothesis. Together, our study provides valuable insight into whether, when, and why network pruning benefits deep learning under label noise.\n", "keywords": "network pruning;label noise;double descent;sparse loss landscape", "primary_area": "", "supplementary_material": "/attachment/93d6d7ee5296452529313f7a6dc9630e73ace790.zip", "author": "Zheng He;Quanzhi Zhu;Zengchang Qin", "authorids": "~Zheng_He1;~Quanzhi_Zhu2;~Zengchang_Qin1", "gender": "F;;M", "homepage": "https://github.com/he-zh;https://github.com/zqzsdu;", "dblp": ";;05/1860", "google_scholar": ";;gl_tc8IAAAAJ", "orcid": ";;0000-0002-8084-6721", "linkedin": ";;zengchangqin/", "or_profile": "~Zheng_He1;~Quanzhi_Zhu2;~Zengchang_Qin1", "aff": "Beihang University;Beihang University;Beihang University", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "position": "MS student;MS student;Associate Professor", "bibtex": "@misc{\nhe2022can,\ntitle={Can network pruning benefit deep learning under label noise?},\nauthor={Zheng He and Quanzhi Zhu and Zengchang Qin},\nyear={2022},\nurl={https://openreview.net/forum?id=_ERVcPna8IP}\n}", "github": "", "project": "", "reviewers": "Zqq9;2GQs;bSpe;7bEQ", "site": "https://openreview.net/forum?id=_ERVcPna8IP", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "109;119;31;146", "wc_summary_review": "98;84;59;122", "wc_main_review": "464;1238;469;703", "wc_review": "671;1441;559;971", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 101.25, 42.75730931665368 ], "wc_summary_review_avg": [ 90.75, 22.818577957445115 ], "wc_main_review_avg": [ 718.5, 315.09562040752013 ], "wc_review_avg": [ 910.5, 341.3220619883807 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FUZFAYrfUCEJ:scholar.google.com/&scioq=Can+network+pruning+benefit+deep+learning+under+label+noise%3F&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "", "aff_unique_url": "http://www.buaa.edu.cn/", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Spread Spurious Attribute: Improving Worst-group Accuracy with Spurious Attribute Estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5900", "id": "_F9xpOrqyX9", "poster": "", "openreview": "https://openreview.net/forum?id=_F9xpOrqyX9", "slides": "https://iclr.cc/virtual/2022/poster/5900", "video": "https://iclr.cc/virtual/2022/poster/5900", "author_site": "Junhyun Nam, Jaehyung Kim, Jaeho Lee, Jinwoo Shin", "tldr": "", "abstract": "The paradigm of worst-group loss minimization has shown its promise in avoiding to learn spurious correlations, but requires costly additional supervision on spurious attributes. To resolve this, recent works focus on developing weaker forms of supervision---e.g., hyperparameters discovered with a small number of validation samples with spurious attribute annotation---but none of the methods retain comparable performance to methods using full supervision on the spurious attribute. In this paper, instead of searching for weaker supervisions, we ask: Given access to a fixed number of samples with spurious attribute annotations, what is the best achievable worst-group loss if we ''fully exploit'' them? To this end, we propose a pseudo-attribute-based algorithm, coined Spread Spurious Attribute (SSA), for improving the worst-group accuracy. In particular, we leverage samples both with and without spurious attribute annotations to train a model to predict the spurious attribute, then use the pseudo-attribute predicted by the trained model as supervision on the spurious attribute to train a new robust model having minimal worst-group loss. Our experiments on various benchmark datasets show that our algorithm consistently outperforms the baseline methods using the same number of validation samples with spurious attribute annotations. 
We also demonstrate that the proposed SSA can achieve performance comparable to methods using full (100%) spurious attribute supervision by using a much smaller number of annotated samples---from 0.6% up to 1.5%, depending on the dataset.", "keywords": "worst-group loss minimization;spurious correlation", "primary_area": "", "supplementary_material": "/attachment/6b693e71064507cd3a8cd859ba3b1899c1cb3cc6.zip", "author": "Junhyun Nam;Jaehyung Kim;Jaeho Lee;Jinwoo Shin", "authorids": "~Junhyun_Nam1;~Jaehyung_Kim1;~Jaeho_Lee3;~Jinwoo_Shin1", "gender": "M;M;M;M", "homepage": "https://alinlab.kaist.ac.kr;https://sites.google.com/view/jaehyungkim;https://jaeho-lee.github.io;https://sites.google.com/site/mijirim/", "dblp": ";02/7206-1;78/6080-1;31/7062", "google_scholar": ";https://scholar.google.co.kr/citations?user=6OYOsGsAAAAJ;t91zoQMAAAAJ;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Junhyun_Nam1;~Jaehyung_Kim1;~Jaeho_Lee3;~Jinwoo_Shin1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nnam2022spread,\ntitle={Spread Spurious Attribute: Improving Worst-group Accuracy with Spurious Attribute Estimation },\nauthor={Junhyun Nam and Jaehyung Kim and Jaeho Lee and Jinwoo Shin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_F9xpOrqyX9}\n}", "github": "", "project": "", "reviewers": "a6uc;dgzg;qZ2x;kQbw", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;5;3;4", "correctness": "3;4;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "88;54;62;95", "wc_summary_review": "31;65;52;37", "wc_main_review": "45;373;393;531", "wc_review": "164;492;507;663", "wc_reply_reviewers": "41;0;54;13", "wc_reply_authors": "47;1067;1258;833", "reply_reviewers": "1;0;1;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.75, 17.166464400102893 ], "wc_summary_review_avg": [ 46.25, 13.254716141811564 ], "wc_main_review_avg": [ 335.5, 178.41174288706446 ], "wc_review_avg": [ 456.5, 181.66521406147078 ], "wc_reply_reviewers_avg": [ 27.0, 21.50581316760657 ], "wc_reply_authors_avg": [ 801.25, 460.74525228156176 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15529682602522509204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=_F9xpOrqyX9", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "_HFPHFbJrP-", "title": "Certified Adversarial Robustness Under the Bounded Support Set", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks (DNNs) have revealed severe vulnerability to adversarial perturbations, beside empirical adversarial training for robustness, the design of provably robust classifiers attracts more and more attention. Randomized smoothing method provides the certified robustness with agnostic architecture, which is further extended to a provable robustness framework using $f$-divergence. While these methods cannot be applied to smoothing measures with bounded support set such as uniform probability measure due to the use of likelihood ratio in their certification methods. In this paper, we introduce a framework that is able to deal with robustness properties of arbitrary smoothing measures including those with bounded support set by using Wasserstein distance as well as total variation distance. By applying our methodology to uniform probability measures with support set $B_{2}(O,r)$, we obtain certified robustness properties with respect to $l_{p}$-perturbations. And by applying to uniform probability measures with support set $B_{\\infty}(O,r)$, we obtain certified robustness properties with respect to $l_{1},l_{2},l_{\\infty}$-perturbations. We present experimental results on CIFAR-10 dataset with ResNet to validate our theory. It is worth mentioning that our certification procedure only costs constant computation time which is an improvement upon the state-of-the-art methods in terms of the computation time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiwen Kou;Qinyuan Zheng;Yisen Wang", "authorids": "~Yiwen_Kou1;~Qinyuan_Zheng1;~Yisen_Wang1", "gender": "F;M;M", "homepage": "https://evankou.github.io/;https://github.com/Jeff0115;https://yisenwang.github.io/", "dblp": "323/9058;;172/1346-1", "google_scholar": "https://scholar.google.com/citations?hl=en;;uMWPDboAAAAJ", "orcid": ";;", "linkedin": "yiwen-kou-5a444916b/;;", "or_profile": "~Yiwen_Kou1;~Qinyuan_Zheng1;~Yisen_Wang1", "aff": "Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@misc{\nkou2022certified,\ntitle={Certified Adversarial Robustness Under the Bounded Support Set},\nauthor={Yiwen Kou and Qinyuan Zheng and Yisen Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=_HFPHFbJrP-}\n}", "github": "", "project": "", "reviewers": "neZn;hLLe;55UR", "site": "https://openreview.net/forum?id=_HFPHFbJrP-", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;4;3", "correctness": "4;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;0;2", "wc_summary_paper": "63;24;48", "wc_summary_review": "42;38;25", "wc_main_review": "270;176;252", "wc_review": "375;238;325", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "387;446;754", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 45.0, 16.06237840420901 ], "wc_summary_review_avg": [ 
35.0, 7.2571803523590805 ], "wc_main_review_avg": [ 232.66666666666666, 40.737642979872504 ], "wc_review_avg": [ 312.6666666666667, 56.60584971734125 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 529.0, 160.91198422325996 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": -1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14521301205175113544&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "_J-pKtWbDKc", "title": "Explainable Automatic Hypothesis Generation via High-order Graph Walks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we study the automatic hypothesis generation (HG) problem, focusing on explainability. Given pairs of biomedical terms, we focus on link prediction to explain how the prediction was made. This more transparent process encourages the biomedical community to trust automatic hypothesis generation systems. We use a reinforcement learning strategy to formulate the HG problem as a guided node-pair embedding-based link prediction problem via a directed graph walk. Given nodes in a node-pair, the model starts a graph walk, simultaneously aggregating information from the visited nodes and their neighbors for an improved node-pair representation. Then at the end of the walk, it infers the probability of a link from the gathered information. This guided walk framework allows for explainability via the walk trajectory information. By evaluating our model on predicting the links between millions of biomedical terms in both transductive and inductive settings, we verify that our proposed model obtains higher prediction accuracy than baselines and helps explain why a link is predicted. 
", "keywords": "Hypothesis generation;Edge embedding;Reinforcement learning;Graph walk;Link prediction", "primary_area": "", "supplementary_material": "", "author": "Uchenna Akujuobi;Xiangliang Zhang;Sucheendra Palaniappan;Michael Spranger", "authorids": "~Uchenna_Akujuobi1;~Xiangliang_Zhang1;suchee@sbx-corp.com;~Michael_Spranger2", "gender": "M;F;;", "homepage": ";https://sites.nd.edu/xiangliang-zhang/;;", "dblp": ";74/1890-1;;", "google_scholar": "a8Af3L4AAAAJ;BhRJe4wAAAAJ;;", "orcid": ";0000-0002-3574-5665;;", "linkedin": ";;;", "or_profile": "~Uchenna_Akujuobi1;~Xiangliang_Zhang1;suchee@sbx-corp.com;~Michael_Spranger2", "aff": "Sony Research;University of Notre Dame;;", "aff_domain": "sony.com;nd.edu;;", "position": "Research Scientist;Associate Professor;;", "bibtex": "@misc{\nakujuobi2022explainable,\ntitle={Explainable Automatic Hypothesis Generation via High-order Graph Walks},\nauthor={Uchenna Akujuobi and Xiangliang Zhang and Sucheendra Palaniappan and Michael Spranger},\nyear={2022},\nurl={https://openreview.net/forum?id=_J-pKtWbDKc}\n}", "github": "", "project": "", "reviewers": "mPD9;jH1y;NwbJ;6CYy", "site": "https://openreview.net/forum?id=_J-pKtWbDKc", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;3", "correctness": "4;4;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "74;62;99;82", "wc_summary_review": "39;30;51;3", "wc_main_review": "112;118;252;171", "wc_review": "225;210;402;256", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 79.25, 13.442005058770064 ], "wc_summary_review_avg": [ 30.75, 17.66882848408462 ], "wc_main_review_avg": [ 163.25, 56.14879784999854 ], "wc_review_avg": [ 273.25, 76.16224458352052 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": -0.6882472016116854, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R-i7TSZiO3wJ:scholar.google.com/&scioq=Explainable+Automatic+Hypothesis+Generation+via+High-order+Graph+Walks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Sony;University of Notre Dame", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.sony.com;https://www.nd.edu", "aff_unique_abbr": "Sony;Notre Dame", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Japan;United States" }, { "id": "_K6rwRjW9WO", "title": "RieszNet and ForestRiesz: Automatic Debiased Machine Learning with Neural Nets and Random Forests", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many causal and policy effects of interest are defined by linear functionals of high-dimensional or non-parametric regression functions. $\\sqrt{n}$-consistent and asymptotically normal estimation of the object of interest requires debiasing to reduce the effects of regularization and/or model selection on the object of interest. 
Debiasing is typically achieved by adding to the plug-in estimator of the functional a correction term that is derived from a functional-specific theoretical derivation of what is known as the influence function, and which leads to properties such as double robustness and Neyman orthogonality. We instead implement an automatic debiasing procedure based on automatically learning the Riesz representation of the linear functional using Neural Nets and Random Forests. Our method solely requires value query oracle access to the linear functional. We propose a multi-tasking Neural Net debiasing method with stochastic gradient descent minimization of a combined Riesz representer and regression loss, while sharing representation layers for the two functions. We also propose a random forest method that learns a locally linear representation of the Riesz function. Even though our methodology applies to arbitrary functionals, we experimentally find that it beats the state-of-the-art performance of the prior neural-net-based estimator of Shi et al. (2019) for the case of the average treatment effect functional. We also evaluate our method on the more challenging problem of estimating average marginal effects with continuous treatments, using semi-synthetic data of gasoline price changes on gasoline demand.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/abbbe8b4b1c96095b553ec35d031e43c8febd914.zip", "author": "Victor Quintas-Martinez;Victor Chernozhukov;Vasilis Syrgkanis;Whitney Newey", "authorids": "~Victor_Quintas-Martinez1;~Victor_Chernozhukov1;~Vasilis_Syrgkanis1;~Whitney_Newey1", "gender": "M;Not Specified;;M", "homepage": "https://economics.mit.edu/faculty;https://www.victorchernozhukov.com/;https://www.vsyrgkanis.com;https://economics.mit.edu/people/faculty/whitney-newey", "dblp": ";;;", "google_scholar": ";6VW1kJgAAAAJ;G1WMpcUAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Victor_Quintas-Martinez1;~Victor_Chernozhukov1;~Vasilis_Syrgkanis1;~Whitney_Newey1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Microsoft;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;microsoft.com;mit.edu", "position": "PhD student;Full Professor;Researcher;Full Professor", "bibtex": "@misc{\nquintas-martinez2022riesznet,\ntitle={RieszNet and ForestRiesz: Automatic Debiased Machine Learning with Neural Nets and Random Forests},\nauthor={Victor Quintas-Martinez and Victor Chernozhukov and Vasilis Syrgkanis and Whitney Newey},\nyear={2022},\nurl={https://openreview.net/forum?id=_K6rwRjW9WO}\n}", "github": "", "project": "", "reviewers": "WPay;DMWH;S7Am;47Pi", "site": "https://openreview.net/forum?id=_K6rwRjW9WO", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;2", "correctness": "4;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "69;169;131;107", "wc_summary_review": "150;40;36;61", "wc_main_review": "417;792;326;1308", "wc_review": "636;1001;493;1476", "wc_reply_reviewers": "0;0;0;817", "wc_reply_authors": "576;636;344;905", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 119.0, 36.359317925395686 ], "wc_summary_review_avg": [ 71.75, 46.16478636363435 ], "wc_main_review_avg": [ 710.75, 
386.53290610244295 ], "wc_review_avg": [ 901.5, 379.90558037491365 ], "wc_reply_reviewers_avg": [ 204.25, 353.7713774459432 ], "wc_reply_authors_avg": [ 615.25, 199.6889769115962 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": -0.6622661785325219, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9961128829212907766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com", "aff_unique_abbr": "MIT;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "_Ko4kT3ckWy", "title": "Increase and Conquer: Training Graph Neural Networks on Growing Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) use graph convolutions to exploit network invariances and learn meaningful features from network data. However, on large-scale graphs, convolutions incur a high computational cost, leading to scalability limitations. Leveraging the graphon --- the limit object of a graph --- in this paper we consider the problem of learning a graphon neural network (WNN) --- the limit object of a GNN --- by training GNNs on graphs Bernoulli-sampled from the graphon. Under smoothness conditions, we show that: (i) the expected distance between the learning steps on the GNN and on the WNN decreases asymptotically with the size of the graph, and (ii) when training on a sequence of growing graphs, gradient descent follows the learning direction of the WNN. Inspired by these results, we propose a novel algorithm to learn GNNs on large-scale graphs that, starting from a moderate number of nodes, successively increases the size of the graph during training. 
This algorithm is benchmarked on both a recommendation system and a decentralized control problem, where it is shown to retain performance comparable to its large-scale counterpart at a reduced computational cost.", "keywords": "Machine Learning;Graph Neural Networks", "primary_area": "", "supplementary_material": "/attachment/cf516ee4e8e977c8ac60b5cac752316a8a41c35b.zip", "author": "Juan Cervino;Luana Ruiz;Alejandro Ribeiro", "authorids": "~Juan_Cervino1;~Luana_Ruiz1;~Alejandro_Ribeiro1", "gender": "M;F;M", "homepage": "https://juancervino.github.io/;https://sites.google.com/view/luana-ruiz/home;https://alelab.seas.upenn.edu", "dblp": ";;32/15", "google_scholar": "lbyYN_sAAAAJ;J-rZew8AAAAJ;7mrPM4kAAAAJ", "orcid": ";;0000-0003-4230-9906", "linkedin": ";;", "or_profile": "~Juan_Cervino1;~Luana_Ruiz1;~Alejandro_Ribeiro1", "aff": "University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;seas.upenn.edu;upenn.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\ncervino2022increase,\ntitle={Increase and Conquer: Training Graph Neural Networks on Growing Graphs},\nauthor={Juan Cervino and Luana Ruiz and Alejandro Ribeiro},\nyear={2022},\nurl={https://openreview.net/forum?id=_Ko4kT3ckWy}\n}", "github": "", "project": "", "reviewers": "3Wdv;n2bA;TUDu", "site": "https://openreview.net/forum?id=_Ko4kT3ckWy", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;4", "correctness": "3;2;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "122;78;88", "wc_summary_review": "92;54;65", "wc_main_review": "829;410;344", "wc_review": "1043;542;497", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 96.0, 18.83259585576738 ], "wc_summary_review_avg": [ 70.33333333333333, 15.965240019770727 ], "wc_main_review_avg": [ 527.6666666666666, 214.77171342820935 ], "wc_review_avg": [ 694.0, 247.46312856666142 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=900327663452910868&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "_L0nSXXUDDR", "title": "Learning with Neighbor Consistency for Noisy Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent advances in deep learning have relied on large, labelled datasets to train high-capacity models. However, collecting large datasets in a time- and cost-efficient manner often results in label noise. 
We present a method for learning from noisy labels that leverages similarities between training examples in feature space, encouraging the prediction of each example to be similar to its nearest neighbours. Compared to training algorithms that use multiple models or distinct stages, our approach takes the form of a simple, additional regularization term. It can be interpreted as an inductive version of the classical, transductive label propagation algorithm. We compare our approach to relevant baselines under both synthetic and realistic noise, and demonstrate that our simple approach achieves state-of-the-art accuracy under the realistic conditions of mini-ImageNet-Red, mini-WebVision and Clothing1M.\n\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ahmet Iscen;Jack Valmadre;Anurag Arnab;Cordelia Schmid", "authorids": "~Ahmet_Iscen3;~Jack_Valmadre1;~Anurag_Arnab1;~Cordelia_Schmid1", "gender": "M;M;;F", "homepage": ";https://jack.valmadre.net/;;https://cordeliaschmid.github.io/", "dblp": "140/7520;50/8535;;s/CordeliaSchmid", "google_scholar": "wIjyqzAAAAAJ;_VSBqL0AAAAJ;;IvqCXP4AAAAJ", "orcid": ";;;", "linkedin": ";;;cordelia-schmid-47985a9", "or_profile": "~Ahmet_Iscen3;~Jack_Valmadre1;~Anurag_Arnab1;~Cordelia_Schmid1", "aff": "Google;University of Adelaide;;Inria", "aff_domain": "google.com;adelaide.edu.au;;inria.fr", "position": "Researcher;Lecturer;;Researcher", "bibtex": "@misc{\niscen2022learning,\ntitle={Learning with Neighbor Consistency for Noisy Labels},\nauthor={Ahmet Iscen and Jack Valmadre and Anurag Arnab and Cordelia Schmid},\nyear={2022},\nurl={https://openreview.net/forum?id=_L0nSXXUDDR}\n}", "github": "", "project": "", "reviewers": "Lsqe;5Cna;NLEL", "site": "https://openreview.net/forum?id=_L0nSXXUDDR", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "36;72;92", "wc_summary_review": "42;93;33", "wc_main_review": "252;374;670", "wc_review": "330;539;795", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 66.66666666666667, 23.170862929310353 ], "wc_summary_review_avg": [ 56.0, 26.419689627245813 ], "wc_main_review_avg": [ 432.0, 175.50688495516826 ], "wc_review_avg": [ 554.6666666666666, 190.15841349312478 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6760478343448133888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Google;University of Adelaide;INRIA", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.adelaide.edu.au;https://www.inria.fr", "aff_unique_abbr": "Google;Adelaide;Inria", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;Australia;France" }, { "id": "_LNdXw0BSx", "title": "Towards Coherent and Consistent Use 
of Entities in Narrative Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large pre-trained language models (LMs) have demonstrated impressive capabilities in generating long, fluent text; however, there is little to no analysis of their ability to maintain entity coherence and consistency. In this work, we focus on the end task of narrative generation and systematically analyse the long-range entity coherence and consistency in generated stories. First, we propose a set of automatic metrics for measuring model performance in terms of entity usage. Given these metrics, we quantify the limitations of current LMs. Next, we propose augmenting a pre-trained LM with a dynamic entity memory in an end-to-end manner by using an auxiliary entity-related loss for guiding the reads and writes to the memory. We demonstrate that the dynamic entity memory increases entity coherence according to both automatic and human judgment and helps preserve entity-related information, especially in settings with a limited context window. Finally, we also validate that our automatic metrics are correlated with human ratings and serve as a good indicator of the quality of generated stories. ", "keywords": "language modeling;narrative generation;entity memory;dynamic representations", "primary_area": "", "supplementary_material": "", "author": "Pinelopi Papalampidi;Kris Cao;Tom\u00e1\u0161 Ko\u010disk\u00fd", "authorids": "~Pinelopi_Papalampidi1;~Kris_Cao2;~Tom\u00e1\u0161_Ko\u010disk\u00fd1", "gender": "F;M;M", "homepage": "https://ppapalampidi.github.io/;https://kriscao.github.io;", "dblp": "203/9741.html;;", "google_scholar": "https://scholar.google.gr/citations?user=3VE4eWAAAAAJ;;LuLM2EoAAAAJ", "orcid": ";;", "linkedin": ";;https://uk.linkedin.com/in/tomaskocisky", "or_profile": "~Pinelopi_Papalampidi1;~Kris_Cao1;~Tomas_Kocisky1", "aff": "University of Edinburgh;Google DeepMind;Google DeepMind", "aff_domain": "ed.ac.uk;google.com;google.com", "position": "PhD student;Researcher;Research Scientist", "bibtex": "@misc{\npapalampidi2022towards,\ntitle={Towards Coherent and Consistent Use of Entities in Narrative Generation},\nauthor={Pinelopi Papalampidi and Kris Cao and Tom{\\'a}{\\v{s}} Ko{\\v{c}}isk{\\'y}},\nyear={2022},\nurl={https://openreview.net/forum?id=_LNdXw0BSx}\n}", "github": "", "project": "", "reviewers": "XC2i;2yed;sZJR;Q7Fu", "site": "https://openreview.net/forum?id=_LNdXw0BSx", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;3;4;4", "correctness": "4;3;3;3", "technical_novelty": "4;2;2;3", "empirical_novelty": "3;2;2;0", "wc_summary_paper": "87;107;56;171", "wc_summary_review": "79;48;34;91", "wc_main_review": "402;248;359;354", "wc_review": "568;403;449;616", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "411;270;460;117", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 105.25, 42.08547849318099 ], "wc_summary_review_avg": [ 63.0, 22.94558781116753 ], "wc_main_review_avg": [ 340.75, 56.70703219178376 ], "wc_review_avg": [ 509.0, 86.26412927746966 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 314.5, 133.66843307228524 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 
0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3888239152813411015&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Edinburgh;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.ed.ac.uk;https://deepmind.com", "aff_unique_abbr": "Edinburgh;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "_MO2xzOZXv", "title": "Count-GNN: Graph Neural Networks for Subgraph Isomorphism Counting", "track": "main", "status": "Reject", "tldr": "", "abstract": "The prevalence of graph structures has attracted a surge of research interest in graph data. As many graph-based tasks exploit recurring subgraph patterns on graphs, subgraph isomorphism counting becomes an important problem. Classical methods usually boil down to a backtracking framework that needs to navigate a huge search space with prohibitive computational cost due to the NP-completeness of the problem. Some recent studies resort to graph neural networks (GNNs) to learn a low-dimensional representation for both the query subgraph and the input graph, in order to predict the number of query subgraph isomorphisms on the input graph. However, typical GNNs employ a node-centric message passing mechanism that receives and aggregates messages on nodes. While effective on node-oriented tasks, they become inadequate in complex structure matching for isomorphism counting. Moreover, given an input graph, the space of query subgraph is enormous, and thus expecting a single model to fit the diverse range of query subgraphs is unrealistic. In this paper, we propose a novel GNN called Count-GNN for subgraph isomorphic counting, to deal with the above challenges at two levels. At the edge level, we resort to an edge-centric message passing scheme, where messages on edges are propagated and aggregated based on the edge adjacency. By treating edges as first-class citizens, Count-GNN is able to preserve finer-grained structural information, given that an edge is an atomic unit of encoding graph structures. At the graph level, we modulate the graph representation conditioned on the query subgraph, so that the model can be adapted to each unique query for better matching with the input graph. To demonstrate the effectiveness and efficiency of Count-GNN, we conduct extensive experiments on a number of benchmark graphs. 
Results show that Count-GNN achieves superior performance in comparison to state-of-the-art baselines.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/e269a78ab896db081c7d0c449f7b55e475c9f0f8.zip", "author": "Xingtong Yu;Zemin Liu;Yuan Fang;Xinming Zhang", "authorids": "1317125762@qq.com;~Zemin_Liu1;~Yuan_Fang1;xinming@ustc.edu.cn", "gender": ";M;M;", "homepage": ";https://zemin-liu.github.io/;http://www.yfang.site;", "dblp": ";17/964.html;22/981-1;", "google_scholar": ";IxHO1nkAAAAJ;XkBJjPUAAAAJ;", "orcid": ";0000-0001-6262-9435;0000-0002-4265-5289;", "linkedin": ";;;", "or_profile": "1317125762@qq.com;~Zemin_Liu1;~Yuan_Fang1;xinming@ustc.edu.cn", "aff": ";Singapore Management University;Singapore Management University;", "aff_domain": ";smu.edu.sg;smu.edu.sg;", "position": ";Postdoc;Assistant Professor;", "bibtex": "@misc{\nyu2022countgnn,\ntitle={Count-{GNN}: Graph Neural Networks for Subgraph Isomorphism Counting},\nauthor={Xingtong Yu and Zemin Liu and Yuan Fang and Xinming Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=_MO2xzOZXv}\n}", "github": "", "project": "", "reviewers": "Pcig;GT7Q;PD36;s5Qd;AQWQ", "site": "https://openreview.net/forum?id=_MO2xzOZXv", "pdf_size": 0, "recommendation": "3;3;5;5;8", "confidence": "4;3;4;4;4", "correctness": "3;2;3;3;4", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "3;2;2;3;3", "wc_summary_paper": "132;64;96;71;75", "wc_summary_review": "69;78;29;35;22", "wc_main_review": "348;303;568;235;210", "wc_review": "549;445;693;341;307", "wc_reply_reviewers": "0;0;40;0;14", "wc_reply_authors": "401;495;246;763;48", "reply_reviewers": "0;0;1;0;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.8, 1.8330302779823362 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 87.6, 24.630062931304092 ], "wc_summary_review_avg": [ 46.6, 22.526428922490133 ], "wc_main_review_avg": [ 332.8, 127.34425782107334 ], "wc_review_avg": [ 467.0, 141.2515486640766 ], "wc_reply_reviewers_avg": [ 10.8, 15.57433786714543 ], "wc_reply_authors_avg": [ 390.6, 239.98883307354114 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.49099025303098287, "corr_recommendation_correctness": 0.8625819491779426, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1V7JD9MQ7tkJ:scholar.google.com/&scioq=Count-GNN:+Graph+Neural+Networks+for+Subgraph+Isomorphism+Counting&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Singapore Management University", "aff_unique_dep": "", "aff_unique_url": "https://www.smu.edu.sg", "aff_unique_abbr": "SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "id": "_MRiKN8-sw", "title": "Tabular Data Imputation: Choose KNN over Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "As databases are ubiquitous nowadays, missing values constitute a pervasive problem for data analysis. Over the last 70 years, various imputation algorithms for tabular data have been developed and shown to be useful for estimating missing values. 
Besides, the recent infatuation with Artificial Neural Networks has led to the development of complex and powerful algorithms for data imputation. \nThis study is the first to compare state-of-the-art deep-learning models with the well-established KNN algorithm (1951). By using real-world and generated datasets in various missing data scenarios, we claim that the good old KNN algorithm is still competitive with (nay, better than) powerful deep-learning algorithms for tabular data imputation.\nThis work advocates for an appropriate and reasonable use of machine learning, in a world where overconsumption, performance, and rapidity unfortunately often prevail over sustainability and common sense.", "keywords": "data imputation;knn;deep learning;artificial neural networks;digital sobriety", "primary_area": "", "supplementary_material": "/attachment/32b3af7be3180ad068662dd40f7c1941bfd27046.zip", "author": "Florian Lalande;Kenji Doya", "authorids": "~Florian_Lalande1;~Kenji_Doya1", "gender": "M;M", "homepage": ";https://groups.oist.jp/ncu", "dblp": ";00/100", "google_scholar": "rZ_nO54AAAAJ;https://scholar.google.co.jp/citations?user=SHufeXQAAAAJ", "orcid": ";0000-0002-2446-6820", "linkedin": "florian-lalande-166b99129/;", "or_profile": "~Florian_Lalande1;~Kenji_Doya1", "aff": "Okinawa Institute of Science and Technology (OIST);Okinawa Institute of Science and Technology Graduate University", "aff_domain": "oist.jp;oist.jp", "position": "PhD student;Full Professor", "bibtex": "@misc{\nlalande2022tabular,\ntitle={Tabular Data Imputation: Choose {KNN} over Deep Learning},\nauthor={Florian Lalande and Kenji Doya},\nyear={2022},\nurl={https://openreview.net/forum?id=_MRiKN8-sw}\n}", "github": "", "project": "", "reviewers": "zJAq;m2kd;VqfN", "site": "https://openreview.net/forum?id=_MRiKN8-sw", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "5;4;5", "correctness": "2;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "40;257;96", "wc_summary_review": "44;51;34", "wc_main_review": "641;339;662", "wc_review": "725;647;792", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "505;270;506", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 131.0, 91.98188227399277 ], "wc_summary_review_avg": [ 43.0, 6.97614984548545 ], "wc_main_review_avg": [ 547.3333333333334, 147.56316914761925 ], "wc_review_avg": [ 721.3333333333334, 59.252754272598075 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 427.0, 111.01651528789158 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lcbW9IDAozQJ:scholar.google.com/&scioq=Tabular+Data+Imputation:+Choose+KNN+over+Deep+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Okinawa Institute of Science and Technology;Okinawa Institute of Science and Technology Graduate University", "aff_unique_dep": ";", "aff_unique_url": "https://www.oist.jp;https://www.oist.jp", "aff_unique_abbr": "OIST;OIST", "aff_campus_unique_index": "", "aff_campus_unique": 
"", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "CrossFormer: A Versatile Vision Transformer Hinging on Cross-scale Attention", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6267", "id": "_PHymLIxuI", "poster": "", "openreview": "https://openreview.net/forum?id=_PHymLIxuI", "slides": "https://iclr.cc/virtual/2022/poster/6267", "video": "https://iclr.cc/virtual/2022/poster/6267", "author_site": "Wenxiao Wang, Lu Yao, Long Chen, Binbin Lin, Deng Cai, Xiaofei He, Wei Liu", "tldr": "", "abstract": "Transformers have made great progress in dealing with computer vision tasks. However, existing vision transformers have not yet possessed the ability of building the interactions among features of different scales, which is perceptually important to visual inputs. The reasons are two-fold: (1) Input embeddings of each layer are equal-scale, so no cross-scale feature can be extracted; (2) to lower the computational cost, some vision transformers merge adjacent embeddings inside the self-attention module, thus sacrificing small-scale (fine-grained) features of the embeddings and also disabling the cross-scale interactions. To this end, we propose Cross-scale Embedding Layer (CEL) and Long Short Distance Attention (LSDA). On the one hand, CEL blends each embedding with multiple patches of different scales, providing the self-attention module itself with cross-scale features. On the other hand, LSDA splits the self-attention module into a short-distance one and a long-distance counterpart, which not only reduces the computational burden but also keeps both small-scale and large-scale features in the embeddings. Through the above two designs, we achieve cross-scale attention. Besides, we put forward a dynamic position bias for vision transformers to make the popular relative position bias apply to variable-sized images. Hinging on the cross-scale attention module, we construct a versatile vision architecture, dubbed CrossFormer, which accommodates variable-sized inputs. 
Extensive experiments show that CrossFormer outperforms the other vision transformers on image classification, object detection, instance segmentation, and semantic segmentation tasks.", "keywords": "vision transformers;architecture", "primary_area": "", "supplementary_material": "/attachment/4f3263d9cf91a43a9bc6b4900188fa10f25bbb04.zip", "author": "Wenxiao Wang;Lu Yao;Long Chen;Binbin Lin;Deng Cai;Xiaofei He;Wei Liu", "authorids": "~Wenxiao_Wang2;~Lu_Yao2;~Long_Chen8;~Binbin_Lin3;~Deng_Cai4;~Xiaofei_He2;~Wei_Liu3", "gender": "M;F;M;M;M;M;M", "homepage": "https://wenxiaowang.com;https://github.com/clarissayl;https://zjuchenlong.github.io/;https://www.linkedin.com/in/binbin-lin-03598b31/;http://www.cad.zju.edu.cn/home/dengcai/;https://person.zju.edu.cn/0007101;https://sites.google.com/view/cuweiliu", "dblp": "243/5853-1;;64/5725-16;51/8073;c/DCai;h/XiaofeiHe.html;49/3283-5", "google_scholar": "https://scholar.google.com.hk/citations?user=rcxOjikAAAAJ;;https://scholar.google.com.sg/citations?user=-gtmMpIAAAAJ;Zmvq4KYAAAAJ;vzxDyJoAAAAJ;QLLFowsAAAAJ;AjxoEpIAAAAJ", "orcid": ";;0000-0001-6148-9709;0000-0002-0330-6406;;0009-0001-9107-2354;0000-0002-3865-8145", "linkedin": ";;;;;;", "or_profile": "~Wenxiao_Wang2;~Lu_Yao2;~Long_Chen8;~Binbin_Lin3;~Deng_Cai4;~Xiaofei_He2;~Wei_Liu3", "aff": "Zhejiang University;Zhejiang University;Columbia University;Zhejiang University;Zhejiang University;Zhejiang University;Tencent", "aff_domain": "zju.edu.cn;zju.edu.cn;columbia.edu;zju.edu.cn;zju.edu.cn;zju.edu.cn;tencent.com", "position": "PhD student;MS student;Postdoc;Researcher;Professor;Professor;Distinguished Scientist", "bibtex": "@inproceedings{\nwang2022crossformer,\ntitle={CrossFormer: A Versatile Vision Transformer Hinging on Cross-scale Attention},\nauthor={Wenxiao Wang and Lu Yao and Long Chen and Binbin Lin and Deng Cai and Xiaofei He and Wei Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_PHymLIxuI}\n}", "github": "", "project": "", "reviewers": "K7wS;euuQ;GtDH;BmmR", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;4;5;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "83;104;87;58", "wc_summary_review": "50;43;65;105", "wc_main_review": "256;139;556;477", "wc_review": "389;286;708;640", "wc_reply_reviewers": "12;21;35;56", "wc_reply_authors": "739;368;997;1266", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 83.0, 16.446884203398525 ], "wc_summary_review_avg": [ 65.75, 24.014318645341575 ], "wc_main_review_avg": [ 357.0, 167.12719706858007 ], "wc_review_avg": [ 505.75, 173.8165340236653 ], "wc_reply_reviewers_avg": [ 31.0, 16.59819267269783 ], "wc_reply_authors_avg": [ 842.5, 331.3174459638369 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 394, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8825090207674893811&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "pdf": "https://openreview.net/pdf?id=_PHymLIxuI", "email": 
"zju.edu.cn;zju.edu.cn;columbia.edu;zju.edu.cn;zju.edu.cn;zju.edu.cn;tencent.com", "author_num": 7, "aff_unique_index": "0;0;1;0;0;0;2", "aff_unique_norm": "Zhejiang University;Columbia University;Tencent", "aff_unique_dep": ";;Tencent Holdings Limited", "aff_unique_url": "https://www.zju.edu.cn;https://www.columbia.edu;https://www.tencent.com", "aff_unique_abbr": "ZJU;Columbia;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "China;United States" }, { "id": "_PlNmPOsUS9", "title": "PARL: Enhancing Diversity of Ensemble Networks to Resist Adversarial Attacks via Pairwise Adversarially Robust Loss Function", "track": "main", "status": "Reject", "tldr": "", "abstract": "The security of Deep Learning classifiers is a critical field of study because of the existence of adversarial attacks. Such attacks usually rely on the principle of transferability, where an adversarial example crafted on a surrogate classifier tends to mislead the target classifier trained on the same dataset even if both classifiers have quite different architecture. Ensemble methods against adversarial attacks demonstrate that an adversarial example is less likely to mislead multiple classifiers in an ensemble having diverse decision boundaries. However, recent ensemble methods have either been shown to be vulnerable to stronger adversaries or shown to lack an end-to-end evaluation. This paper attempts to develop a new ensemble methodology that constructs multiple diverse classifiers using a Pairwise Adversarially Robust Loss (PARL) function during the training procedure. PARL utilizes gradients of each layer with respect to input in every classifier within the ensemble simultaneously. The proposed training procedure enables PARL to achieve higher robustness with high clean example accuracy against black-box transfer attacks compared to the previous ensemble methods. We also evaluate the robustness in the presence of white-box attacks, where adversarial examples are crafted on the target classifier. 
We present extensive experiments on standard image classification datasets like CIFAR-10 and CIFAR-100, using a standard ResNet20 classifier, against state-of-the-art adversarial attacks to demonstrate the robustness of the proposed ensemble methodology.", "keywords": "Adversarial Attack;Ensemble-based Defence;Model Diversity", "primary_area": "", "supplementary_material": "/attachment/10ab5e753891f233d6005cb8d448836ddecdc72f.zip", "author": "Manaar Alam;Shubhajit Datta;Debdeep Mukhopadhyay;Arijit Mondal;Partha Pratim Chakrabarti", "authorids": "~Manaar_Alam1;~Shubhajit_Datta1;~Debdeep_Mukhopadhyay2;~Arijit_Mondal1;~Partha_Pratim_Chakrabarti1", "gender": "M;M;M;M;M", "homepage": "https://manaaralam.github.io;;https://sites.google.com/view/debdeepmukhopadhyay/;;http://www.iitkgp.ac.in/department/CS/faculty/cs-ppchak", "dblp": "192/5163;;;08/5656.html;c/PPChakrabarti.html", "google_scholar": "46jmlGgAAAAJ;mgOVQYQAAAAJ;2ELnl9IAAAAJ;;", "orcid": ";;;0000-0001-5060-1427;0000-0002-3553-8834", "linkedin": ";;;;", "or_profile": "~Manaar_Alam1;~Shubhajit_Datta1;~Debdeep_Mukhopadhyay2;~Arijit_Mondal1;~Partha_Pratim_Chakrabarti1", "aff": "Indian Institute of Technology Kharagpur;Indian Institute of Technology Kharagpur;Indian Institute of Technology Kharagpur;Indian Institute of Technology, Patna;Indian Institute of Technology Kharagpur", "aff_domain": "iitkgp.ac.in;iitkgp.ac.in;iitkgp.ac.in;iitp.ac.in;iitkgp.ac.in", "position": "PhD student;PhD student;Full Professor;Assistant Professor;Professor", "bibtex": "@misc{\nalam2022parl,\ntitle={{PARL}: Enhancing Diversity of Ensemble Networks to Resist Adversarial Attacks via Pairwise Adversarially Robust Loss Function},\nauthor={Manaar Alam and Shubhajit Datta and Debdeep Mukhopadhyay and Arijit Mondal and Partha Pratim Chakrabarti},\nyear={2022},\nurl={https://openreview.net/forum?id=_PlNmPOsUS9}\n}", "github": "", "project": "", "reviewers": "BGda;Dge8;SqzR;BhHc", "site": "https://openreview.net/forum?id=_PlNmPOsUS9", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;5;5;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "69;85;50;23", "wc_summary_review": "8;31;67;23", "wc_main_review": "180;404;331;167", "wc_review": "257;520;448;213", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 56.75, 23.09085316743407 ], "wc_summary_review_avg": [ 32.25, 21.695333599647643 ], "wc_main_review_avg": [ 270.5, 100.4800975317998 ], "wc_review_avg": [ 359.5, 128.02441173463754 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3015113445777637, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FIo-yG2dA3gJ:scholar.google.com/&scioq=PARL:+Enhancing+Diversity+of+Ensemble+Networks+to+Resist+Adversarial+Attacks+via+Pairwise+Adversarially+Robust+Loss+Function&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Indian Institute of Technology Kharagpur;Indian Institute of Technology Patna", 
"aff_unique_dep": ";", "aff_unique_url": "https://www.iitkgp.ac.in;https://www.iitp.ac.in", "aff_unique_abbr": "IIT Kharagpur;IIT Patna", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Kharagpur;Patna", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "India" }, { "title": "Efficient Split-Mix Federated Learning for On-Demand and In-Situ Customization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6081", "id": "_QLmakITKg", "poster": "", "openreview": "https://openreview.net/forum?id=_QLmakITKg", "slides": "https://iclr.cc/virtual/2022/poster/6081", "video": "https://iclr.cc/virtual/2022/poster/6081", "author_site": "Junyuan Hong, Haotao Wang, Zhangyang Wang, Jiayu Zhou", "tldr": "", "abstract": "Federated learning (FL) provides a distributed learning framework for multiple participants to collaborate learning without sharing raw data. In many practical FL scenarios, participants have heterogeneous resources due to disparities in hardware and inference dynamics that require quickly loading models of different sizes and levels of robustness. The heterogeneity and dynamics together impose significant challenges to existing FL approaches and thus greatly limit FL's applicability. In this paper, we propose a novel Split-Mix FL strategy for heterogeneous participants that, once training is done, provides in-situ customization of model sizes and robustness. Specifically, we achieve customization by learning a set of base sub-networks of different sizes and robustness levels, which are later aggregated on-demand according to inference requirements. This split-mix strategy achieves customization with high efficiency in communication, storage, and inference. Extensive experiments demonstrate that our method provides better in-situ customization than the existing heterogeneous-architecture FL methods. 
Codes and pre-trained models are available: https://github.com/illidanlab/SplitMix.", "keywords": "federated learning", "primary_area": "", "supplementary_material": "", "author": "Junyuan Hong;Haotao Wang;Zhangyang Wang;Jiayu Zhou", "authorids": "~Junyuan_Hong1;~Haotao_Wang1;~Zhangyang_Wang1;~Jiayu_Zhou1", "gender": "M;;M;M", "homepage": "https://jyhong.gitlab.io/;;https://vita-group.github.io;http://jiayuzhou.github.io/", "dblp": "185/1316;236/5090;119/4026;73/1353", "google_scholar": "7Cbv6doAAAAJ;aMIJhlEAAAAJ;pxFyKAIAAAAJ;https://scholar.google.com.tw/citations?user=yQKlLTQAAAAJ", "orcid": "0000-0002-5718-5187;;;0000-0003-4336-6777", "linkedin": ";;;jiayuzhou/", "or_profile": "~Junyuan_Hong1;~Haotao_Wang1;~Zhangyang_Wang1;~Jiayu_Zhou1", "aff": "Sony AI;University of Texas, Austin;University of Texas, Austin;Michigan State University", "aff_domain": "sony.com;utexas.edu;utexas.edu;msu.edu", "position": "Intern;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nhong2022efficient,\ntitle={Efficient Split-Mix Federated Learning for On-Demand and In-Situ Customization},\nauthor={Junyuan Hong and Haotao Wang and Zhangyang Wang and Jiayu Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_QLmakITKg}\n}", "github": "", "project": "", "reviewers": "emNn;7P9j;Yvwo;82wJ", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "3;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "49;195;150;130", "wc_summary_review": "28;383;174;64", "wc_main_review": "509;611;551;394", "wc_review": "586;1189;875;588", "wc_reply_reviewers": "322;0;0;38", "wc_reply_authors": "1929;2204;1147;726", "reply_reviewers": "2;0;0;1", "reply_authors": "3;4;2;2", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 131.0, 52.87248812000434 ], "wc_summary_review_avg": [ 162.25, 138.33360943747547 ], "wc_main_review_avg": [ 516.25, 79.34536848487126 ], "wc_review_avg": [ 809.5, 248.65890291722917 ], "wc_reply_reviewers_avg": [ 90.0, 134.8406466908254 ], "wc_reply_authors_avg": [ 1501.5, 592.3117844513986 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.23570226039551587, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1074497134544260795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=_QLmakITKg", "email": "sony.com;utexas.edu;utexas.edu;msu.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Sony;University of Texas at Austin;Michigan State University", "aff_unique_dep": "Sony AI;;", "aff_unique_url": "https://www.sony.com;https://www.utexas.edu;https://www.msu.edu", "aff_unique_abbr": "Sony AI;UT Austin;MSU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Japan;United States" }, { "id": "_S7yM35SUCy", "title": "Generalizing Cross Entropy Loss with a Beta Proper Composite Loss: An Improved Loss Function for Open Set Recognition", "track": "main", "status": "Withdraw", "tldr": "", 
"abstract": "Open set recognition involves identifying data instances encountered during test time that do not belong to known classes in the training set. The majority of recent deep learning approaches to open set recognition use a cross entropy loss to train their networks. Surprisingly, other loss functions are seldom used. In our work, we explore generalizing cross entropy with a Beta loss. This Beta loss is a proper composite loss with a Beta weight function. This weight function adds the flexibility of putting more emphasis on different parts of the observation-conditioned class probability (i.e. $P(Y|X)$) range during training. We show that the flexibility gained through this is Beta loss function produces consistent improvements over cross entropy loss for open set recognition and produces state of the art results relative to recent methods.", "keywords": "Proper Composite Loss;Open Set Recognition in deep learning;Out-of-distribution detection in deep learning", "primary_area": "", "supplementary_material": "", "author": "Matthew Lyle Olson;Neale Ratzlaff;Weng-Keen Wong", "authorids": "~Matthew_Lyle_Olson1;~Neale_Ratzlaff1;~Weng-Keen_Wong1", "gender": "M;M;M", "homepage": "https://neale.io;http://www.eecs.oregonstate.edu/~wong;http://web.engr.oregonstate.edu/~olsomatt/", "dblp": "218/5264;19/1015;21/10701", "google_scholar": "l3PrX7MAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";0000-0002-6673-343X;", "linkedin": ";;", "or_profile": "~Neale_Ratzlaff1;~Weng-Keen_Wong1;~Matthew_Olson1", "aff": "HRL Laboratories;Oregon State University;Oregon State University", "aff_domain": "hrl.com;oregonstate.edu;oregonstate.edu", "position": "Researcher;Full Professor;PhD student", "bibtex": "@misc{\nolson2022generalizing,\ntitle={Generalizing Cross Entropy Loss with a Beta Proper Composite Loss: An Improved Loss Function for Open Set Recognition},\nauthor={Matthew Lyle Olson and Neale Ratzlaff and Weng-Keen Wong},\nyear={2022},\nurl={https://openreview.net/forum?id=_S7yM35SUCy}\n}", "github": "", "project": "", "reviewers": "T3Gj;D9h4;oFpc;Hm9L", "site": "https://openreview.net/forum?id=_S7yM35SUCy", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "87;64;54;85", "wc_summary_review": "56;79;11;18", "wc_main_review": "813;424;254;177", "wc_review": "956;567;319;280", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 72.5, 13.97318861248212 ], "wc_summary_review_avg": [ 41.0, 27.829840100151493 ], "wc_main_review_avg": [ 417.0, 245.47606807996578 ], "wc_review_avg": [ 530.5, 269.1955608846476 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:oC9yKh4IQXsJ:scholar.google.com/&scioq=Generalizing+Cross+Entropy+Loss+with+a+Beta+Proper+Composite+Loss:+An+Improved+Loss+Function+for+Open+Set+Recognition&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "HRL Laboratories;Oregon State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.hrl.com;https://oregonstate.edu", "aff_unique_abbr": "HRL;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Mastering Visual Continuous Control: Improved Data-Augmented Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6275", "id": "_SJ-_yyes8", "poster": "", "openreview": "https://openreview.net/forum?id=_SJ-_yyes8", "slides": "https://iclr.cc/virtual/2022/poster/6275", "video": "https://iclr.cc/virtual/2022/poster/6275", "author_site": "Denis Yarats, Rob Fergus, Alessandro Lazaric, Lerrel Pinto", "tldr": "", "abstract": "We present DrQ-v2, a model-free reinforcement learning (RL) algorithm for visual continuous control. DrQ-v2 builds on DrQ, an off-policy actor-critic approach that uses data augmentation to learn directly from pixels. We introduce several improvements that yield state-of-the-art results on the DeepMind Control Suite. Notably, DrQ-v2 is able to solve complex humanoid locomotion tasks directly from pixel observations, previously unattained by model-free RL. DrQ-v2 is conceptually simple, easy to implement, and provides significantly better computational footprint compared to prior work, with the majority of tasks taking just 8 hours to train on a single GPU. Finally, we publicly release DrQ-v2 's implementation to provide RL practitioners with a strong and computationally efficient baseline.", "keywords": "Image-based RL;Data augmentation in RL;Continuous Control", "primary_area": "", "supplementary_material": "/attachment/af7d3bb82221879ccdfe8f02e60cc5718d4dcce4.zip", "author": "Denis Yarats;Rob Fergus;Alessandro Lazaric;Lerrel Pinto", "authorids": "~Denis_Yarats1;~Rob_Fergus1;~Alessandro_Lazaric2;~Lerrel_Pinto1", "gender": "M;M;M;M", "homepage": "http://denis-yarats.info/;http://cs.nyu.edu/fergus/;;https://www.lerrelpinto.com/", "dblp": "200/8142;77/3763;36/321;168/8304", "google_scholar": "7kaXqgMAAAAJ;https://scholar.google.com.tw/citations?user=GgQ9GEkAAAAJ;6JZ3R6wAAAAJ;pmVPj94AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Denis_Yarats1;~Rob_Fergus1;~Alessandro_Lazaric2;~Lerrel_Pinto1", "aff": "New York University;Google;Meta Facebook;New York University", "aff_domain": "cs.nyu.edu;google.com;fb.com;cs.nyu.edu", "position": "PhD student;Research scientist;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nyarats2022mastering,\ntitle={Mastering Visual Continuous Control: Improved Data-Augmented Reinforcement Learning},\nauthor={Denis Yarats and Rob Fergus and Alessandro Lazaric and Lerrel Pinto},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_SJ-_yyes8}\n}", "github": "", "project": "", "reviewers": "ncsH;uMh5;K4wT;7c6j", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;5;3;5", "correctness": "1;3;4;3", "technical_novelty": "2;2;1;1", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "89;13;56;64", "wc_summary_review": "47;53;89;82", "wc_main_review": "150;65;293;460", "wc_review": "286;131;438;606", 
"wc_reply_reviewers": "16;0;10;39", "wc_reply_authors": "596;346;464;328", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 55.5, 27.390691849604675 ], "wc_summary_review_avg": [ 67.75, 18.046814123273947 ], "wc_main_review_avg": [ 242.0, 149.93165109475717 ], "wc_review_avg": [ 365.25, 176.3567052878909 ], "wc_reply_reviewers_avg": [ 16.25, 14.324367350776788 ], "wc_reply_authors_avg": [ 433.5, 107.38132984834934 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.8388704928078611, "gs_citation": 393, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6421326850849903033&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=_SJ-_yyes8", "email": "cs.nyu.edu;google.com;fb.com;cs.nyu.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "New York University;Google;Meta", "aff_unique_dep": ";Google;Meta Platforms, Inc.", "aff_unique_url": "https://www.nyu.edu;https://www.google.com;https://meta.com", "aff_unique_abbr": "NYU;Google;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "_V7e0PfB3jM", "title": "Noisy $\\ell^{0}$-Sparse Subspace Clustering on Dimensionality Reduced Data", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "High-dimensional data often lie in or close to low-dimensional subspaces. Sparse subspace clustering methods with sparsity induced by $\\ell^{0}$-norm, such as $\\ell^{0}$-Sparse Subspace Clustering ($\\ell^{0}$-SSC) \\citep{YangFJYH16-L0SSC}, are demonstrated to be more effective than its $\\ell^{1}$ counterpart such as Sparse Subspace Clustering (SSC) \\citep{ElhamifarV13}. However, the theoretical analysis of $\\ell^{0}$-SSC is restricted to clean data that lie exactly in subspaces. Real data often suffer from noise and they may lie close to subspaces. In this paper, we show that an optimal solution to the optimization problem of noisy $\\ell^{0}$-SSC achieves Subspace Detection Property (SDP), a key element with which data from different subspaces are separated, under deterministic and randomized models. Our results provide theoretical guarantee on the correctness of noisy $\\ell^{0}$-SSC in terms of SDP on noisy data for the first time. In order to improve the efficiency of noisy $\\ell^{0}$-SSC, we propose Noisy-DR-$\\ell^{0}$-SSC and Noisy-DR-$\\ell^{0}$-SSC-OSNAP which provably recover the subspaces on dimensionality reduced data. Both algorithms first project the data onto a lower dimensional space by linear transformation, then perform noisy $\\ell^{0}$-SSC on the dimensionality reduced data for improved efficiency. 
Experimental results demonstrate the effectiveness of Noisy-DR-$\\ell^{0}$-SSC and Noisy-DR-$\\ell^{0}$-SSC-OSNAP.", "keywords": "$\\ell^{0}$-Sparse Subspace Clustering;Dimensionality Reduction", "primary_area": "", "supplementary_material": "", "author": "Yingzhen Yang", "authorids": "~Yingzhen_Yang1", "gender": "M", "homepage": "http://yingzhenyang.com", "dblp": "66/3838.html", "google_scholar": "", "orcid": "", "linkedin": "yingzhen-yang-9b869122", "or_profile": "~Yingzhen_Yang1", "aff": "Arizona State University", "aff_domain": "asu.edu", "position": "Assistant Professor", "bibtex": "@misc{\nyang2022noisy,\ntitle={Noisy \\${\\textbackslash}ell{\\textasciicircum}\\{0\\}\\$-Sparse Subspace Clustering on Dimensionality Reduced Data},\nauthor={Yingzhen Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=_V7e0PfB3jM}\n}", "github": "", "project": "", "reviewers": "2Rby;qxv1;Zz2w;Sb2W", "site": "https://openreview.net/forum?id=_V7e0PfB3jM", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;5", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "157;47;83;113", "wc_summary_review": "56;37;63;42", "wc_main_review": "589;430;445;229", "wc_review": "802;514;591;384", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.0, 40.36087214122113 ], "wc_summary_review_avg": [ 49.5, 10.452272480183437 ], "wc_main_review_avg": [ 423.25, 128.18419364336617 ], "wc_review_avg": [ 572.75, 151.63010090348155 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13650560534455093718&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "_Vn-mKDipa1", "title": "Hierarchically Regularized Deep Forecasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hierarchical forecasting is a key problem in many practical multivariate forecasting applications - the goal is to simultaneously predict a large number of correlated time series that are arranged in a pre-specified aggregation hierarchy. The main challenge is to exploit the hierarchical correlations to simultaneously obtain good prediction accuracy for time series at different levels of the hierarchy. In this paper, we propose a new approach for hierarchical forecasting which consists of two components: first, decomposing the time series along a global set of basis time series and modeling hierarchical constraints using the coefficients of the basis decomposition; and second, using a linear autoregressive model with coefficients that vary with time. 
Unlike past methods, our approach is scalable (inference for a specific time series only needs access to its own history) while also modeling the hierarchical structure via (approximate) coherence constraints among the time series forecasts. We experiment on several public datasets and demonstrate significantly improved overall performance on forecasts at different levels of the hierarchy, compared to existing state-of-the-art hierarchical models.", "keywords": "hierarchical time series;deep learning", "primary_area": "", "supplementary_material": "/attachment/4d57b41a4a50b5afd7fecb66412143d1a48c6cda.zip", "author": "Biswajit Paria;Rajat Sen;Amr Ahmed;Abhimanyu Das", "authorids": "~Biswajit_Paria1;~Rajat_Sen1;~Amr_Ahmed1;~Abhimanyu_Das2", "gender": "M;M;M;M", "homepage": ";http://rajatsen91.github.io;https://research.google/people/AmrAhmed/;https://sites.google.com/site/abhidas/", "dblp": "166/5945;http://dblp.uni-trier.de/pers/hd/s/Sen:Rajat;49/2951;83/6359", "google_scholar": "8tgfu84AAAAJ;YzsCLBoAAAAJ;ivUi2T0AAAAJ;", "orcid": ";;;", "linkedin": ";rajat-sen-a8702417/;amr-ahmed-b998965/;", "or_profile": "~Biswajit_Paria1;~Rajat_Sen1;~Amr_Ahmed1;~Abhimanyu_Das2", "aff": "Carnegie Mellon University;Google;;Research, Google", "aff_domain": "cs.cmu.edu;google.com;;research.google.com", "position": "PhD student;Research Scientist;;Researcher", "bibtex": "@misc{\nparia2022hierarchically,\ntitle={Hierarchically Regularized Deep Forecasting},\nauthor={Biswajit Paria and Rajat Sen and Amr Ahmed and Abhimanyu Das},\nyear={2022},\nurl={https://openreview.net/forum?id=_Vn-mKDipa1}\n}", "github": "", "project": "", "reviewers": "iRbQ;UnFM;U9mT", "site": "https://openreview.net/forum?id=_Vn-mKDipa1", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "82;92;52", "wc_summary_review": "57;86;43", "wc_main_review": "469;481;244", "wc_review": "608;659;339", "wc_reply_reviewers": "321;0;0", "wc_reply_authors": "1445;464;459", "reply_reviewers": "1;0;0", "reply_authors": "3;2;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 75.33333333333333, 16.99673171197595 ], "wc_summary_review_avg": [ 62.0, 17.90716802475106 ], "wc_main_review_avg": [ 398.0, 109.00458705944443 ], "wc_review_avg": [ 535.3333333333334, 140.38122698170469 ], "wc_reply_reviewers_avg": [ 107.0, 151.32085117392117 ], "wc_reply_authors_avg": [ 789.3333333333334, 463.63083973734484 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1321926212657137257&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": 
"CURVATURE-GUIDED DYNAMIC SCALE NETWORKS FOR MULTI-VIEW STEREO", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6520", "id": "_Wzj0J2xs2D", "poster": "", "openreview": "https://openreview.net/forum?id=_Wzj0J2xs2D", "slides": "https://iclr.cc/virtual/2022/poster/6520", "video": "https://iclr.cc/virtual/2022/poster/6520", "author_site": "Khang Truong Giang, Soohwan Song, Sungho Jo", "tldr": "", "abstract": "Multi-view stereo (MVS) is a crucial task for precise 3D reconstruction. Most recent studies tried to improve the performance of matching cost volume in MVS by introducing a skilled design to cost formulation or cost regularization. In this paper, we focus on learning robust feature extraction to enhance the performance of matching costs, without need of heavy computation in the other steps. In particular, we present a dynamic scale feature extraction network, namely, CDSFNet. It is composed of multiple novel convolution layers, each of which can select a proper patch scale for each pixel guided by the normal curvature of image surface. As a result, CDFSNet can estimate the optimal patch scales to learn discriminative features for accurate matching computation between reference and source images. By combining the robust extracted features with an appropriate cost formulation strategy, our final MVS architecture can estimate depth maps more precisely. Extensive experiments showed that the proposed method outperforms other state-of-the-art methods on complex outdoor scenes. It significantly improves the completeness of reconstructed models. Moreover, the method can process the high resolution with faster run-time and lower memory compared to the other MVS methods. ", "keywords": "multi-view stereo;3D reconstruction;dynamic scale", "primary_area": "", "supplementary_material": "/attachment/99881fcea29ea2d22dd91e54ccf305d6e9f88d07.zip", "author": "Khang Truong Giang;Soohwan Song;Sungho Jo", "authorids": "~Khang_Truong_Giang1;dramanet30@naver.com;~Sungho_Jo1", "gender": ";;M", "homepage": ";;http://nmail.kaist.ac.kr/wordpress/index.php/professor-jo-sungho/", "dblp": ";;18/3943", "google_scholar": ";;", "orcid": ";;0000-0002-7618-362X", "linkedin": ";;", "or_profile": "~Khang_Truong_Giang1;dramanet30@naver.com;~Sungho_Jo1", "aff": ";;Korea Advanced Institute of Science & Technology", "aff_domain": ";;kaist.ac.kr", "position": ";;Full Professor", "bibtex": "@inproceedings{\ngiang2022curvatureguided,\ntitle={{CURVATURE}-{GUIDED} {DYNAMIC} {SCALE} {NETWORKS} {FOR} {MULTI}-{VIEW} {STEREO}},\nauthor={Khang Truong Giang and Soohwan Song and Sungho Jo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_Wzj0J2xs2D}\n}", "github": "", "project": "", "reviewers": "6P6T;aAob;68iw;Qeua", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;5;4", "correctness": "3;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;2;0", "wc_summary_paper": "105;44;104;85", "wc_summary_review": "25;59;204;35", "wc_main_review": "366;208;1080;210", "wc_review": "496;311;1388;330", "wc_reply_reviewers": "62;88;331;59", "wc_reply_authors": "1460;638;2049;662", "reply_reviewers": "1;1;1;1", "reply_authors": "4;1;4;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 84.5, 24.70323865407125 ], "wc_summary_review_avg": [ 
80.75, 72.22317841247366 ], "wc_main_review_avg": [ 466.0, 360.2415856061041 ], "wc_review_avg": [ 631.25, 442.79644025217726 ], "wc_reply_reviewers_avg": [ 135.0, 113.721150187641 ], "wc_reply_authors_avg": [ 1202.25, 590.2687417609034 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9045340337332909, "corr_recommendation_correctness": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4920966031938804836&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=_Wzj0J2xs2D", "email": ";;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "A Class of Short-term Recurrence Anderson Mixing Methods and Their Applications", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5953", "id": "_X90SIKbHa", "poster": "", "openreview": "https://openreview.net/forum?id=_X90SIKbHa", "slides": "https://iclr.cc/virtual/2022/poster/5953", "video": "https://iclr.cc/virtual/2022/poster/5953", "author_site": "Fuchao Wei, Chenglong Bao, Yang Liu", "tldr": "", "abstract": "Anderson mixing (AM) is a powerful acceleration method for fixed-point iterations, but its computation requires storing many historical iterations. The extra memory footprint can be prohibitive when solving high-dimensional problems in a resource-limited machine. To reduce the memory overhead, we propose a novel class of short-term recurrence AM methods (ST-AM). The ST-AM methods only store two previous iterations with cheap corrections. We prove that the basic version of ST-AM is equivalent to the full-memory AM in strongly convex quadratic optimization, and with minor changes it has local linear convergence for solving general nonlinear fixed-point problems. We further analyze the convergence properties of the regularized ST-AM for nonconvex (stochastic) optimization. Finally, we apply ST-AM to several applications including solving root-finding problems and training neural networks. Experimental results show that ST-AM is competitive with the long-memory AM and outperforms many existing optimizers. 
", "keywords": "Anderson mixing;sequence acceleration;fixed-point iteration;nonconvex optimization;stochastic optimization", "primary_area": "", "supplementary_material": "/attachment/648d8dc52d7101780f7d130381e9405f823de846.zip", "author": "Fuchao Wei;Chenglong Bao;Yang Liu", "authorids": "~Fuchao_Wei1;~Chenglong_Bao3;~Yang_Liu19", "gender": ";M;M", "homepage": ";https://matbc.github.io/;http://nlp.csai.tsinghua.edu.cn/~ly/", "dblp": "303/4842;;51/3710-5", "google_scholar": ";;https://scholar.google.com.hk/citations?user=lVhoKNcAAAAJ", "orcid": "0000-0003-3536-371X;;0000-0002-3087-242X", "linkedin": ";;", "or_profile": "~Fuchao_Wei1;~Chenglong_Bao3;~Yang_Liu19", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Assistant Professor;Professor", "bibtex": "@inproceedings{\nwei2022a,\ntitle={A Class of Short-term Recurrence Anderson Mixing Methods and Their Applications},\nauthor={Fuchao Wei and Chenglong Bao and Yang Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_X90SIKbHa}\n}", "github": "", "project": "", "reviewers": "Axgn;xxcF;i7Ws", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;2", "correctness": "3;3;3", "technical_novelty": "3;3;4", "empirical_novelty": "0;3;3", "wc_summary_paper": "98;56;39", "wc_summary_review": "25;28;29", "wc_main_review": "183;348;815", "wc_review": "306;432;883", "wc_reply_reviewers": "0;0;169", "wc_reply_authors": "430;716;1273", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 64.33333333333333, 24.796953217863052 ], "wc_summary_review_avg": [ 27.333333333333332, 1.699673171197595 ], "wc_main_review_avg": [ 448.6666666666667, 267.6519298558401 ], "wc_review_avg": [ 540.3333333333334, 247.7018817494575 ], "wc_reply_reviewers_avg": [ 56.333333333333336, 79.66736401368435 ], "wc_reply_authors_avg": [ 806.3333333333334, 350.0307922962715 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11507840632767837877&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=_X90SIKbHa", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Temporal Efficient Training of Spiking Neural Network via Gradient Re-weighting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6033", "id": "_XNtisL32jv", "poster": "", "openreview": "https://openreview.net/forum?id=_XNtisL32jv", "slides": "https://iclr.cc/virtual/2022/poster/6033", "video": 
"https://iclr.cc/virtual/2022/poster/6033", "author_site": "Shikuang Deng, Yuhang Li, Shanghang Zhang, Shi Gu", "tldr": "", "abstract": "Recently, brain-inspired spiking neuron networks (SNNs) have attracted widespread research interest because of their event-driven and energy-efficient characteristics. It is difficult to efficiently train deep SNNs due to the non-differentiability of its activation function, which disables the typically used gradient descent approaches for traditional artificial neural networks (ANNs). Although the adoption of surrogate gradient (SG) formally allows for the back-propagation of losses, the discrete spiking mechanism actually differentiates the loss landscape of SNNs from that of ANNs, failing the surrogate gradient methods to achieve comparable accuracy as for ANNs. In this paper, we first analyze why the current direct training approach with surrogate gradient results in SNNs with poor generalizability. Then we introduce the temporal efficient training (TET) approach to compensate for the loss of momentum in the gradient descent with SG so that the training process can converge into flatter minima with better generalizability. Meanwhile, we demonstrate that TET improves the temporal scalability of SNN and induces a temporal inheritable training for acceleration. Our method consistently outperforms the SOTA on all reported mainstream datasets, including CIFAR-10/100 and ImageNet. Remarkably on DVS-CIFAR10, we obtained 83% top-1 accuracy, over 10% improvement compared to existing state of the art.", "keywords": "Spiking Neural Networks;Direct Training;Surrogate Gradient;Generalizability", "primary_area": "", "supplementary_material": "/attachment/ff7589425476597733582208d6252eae70b09008.zip", "author": "Shikuang Deng;Yuhang Li;Shanghang Zhang;Shi Gu", "authorids": "~Shikuang_Deng1;~Yuhang_Li1;~Shanghang_Zhang4;~Shi_Gu1", "gender": "M;M;;F", "homepage": "https://www.guslab.org/;;https://nangongwubu.github.io/;https://www.shanghangzhang.com/", "dblp": "286/8188;;175/1269;95/11531", "google_scholar": "rtlmA3gAAAAJ;3UzXL-AAAAAJ;9_jlOXUAAAAJ;voqw10cAAAAJ", "orcid": ";;0000-0003-2303-6770;", "linkedin": ";;;", "or_profile": "~Shikuang_Deng1;~Yuhang_Li1;~Shi_Gu1;~Shanghang_Zhang1", "aff": "University of Electronic Science and Technology of China;Yale University;University of Electronic Science and Technology of China, Tsinghua University;University of California, Berkeley", "aff_domain": "uestc.edu.cn;yale.edu;uestc.edu.cn;berkeley.edu", "position": "PhD student;PhD student;Full Professor;Postdoc Research Fellow", "bibtex": "@inproceedings{\ndeng2022temporal,\ntitle={Temporal Efficient Training of Spiking Neural Network via Gradient Re-weighting},\nauthor={Shikuang Deng and Yuhang Li and Shanghang Zhang and Shi Gu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_XNtisL32jv}\n}", "github": "", "project": "", "reviewers": "5F8z;A3wt;KVXN;HqdL", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;5;4;4", "correctness": "3;1;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;1;1;3", "wc_summary_paper": "13;89;51;91", "wc_summary_review": "181;17;14;62", "wc_main_review": "62;524;269;37", "wc_review": "256;630;334;190", "wc_reply_reviewers": "0;163;0;0", "wc_reply_authors": "333;1613;520;0", "reply_reviewers": "0;2;0;0", "reply_authors": "1;4;1;0", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], 
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 61.0, 31.96873472629156 ], "wc_summary_review_avg": [ 68.5, 67.67754428168918 ], "wc_main_review_avg": [ 223.0, 195.72557318858463 ], "wc_review_avg": [ 352.5, 168.12718400068442 ], "wc_reply_reviewers_avg": [ 40.75, 70.58107040843176 ], "wc_reply_authors_avg": [ 616.5, 604.7249374715748 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 1.5 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 338, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7413408769468810617&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=_XNtisL32jv", "email": "uestc.edu.cn;yale.edu;uestc.edu.cn;berkeley.edu", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Electronic Science and Technology of China;Yale University;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.yale.edu;https://www.berkeley.edu", "aff_unique_abbr": "UESTC;Yale;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "_Xaf6zMDsHL", "title": "Momentum Contrastive Autoencoder: Using Contrastive Learning for Latent Space Distribution Matching in WAE", "track": "main", "status": "Reject", "tldr": "", "abstract": "Wasserstein autoencoder (WAE) shows that matching two distributions is equivalent to minimizing a simple autoencoder (AE) loss under the constraint that the latent space of this AE matches a pre-specified prior distribution. This latent space distribution matching is a core component of WAE, and a challenging task. In this paper, we propose to use the contrastive learning framework that has been shown to be effective for self-supervised representation learning, as a means to resolve this problem. We do so by exploiting the fact that contrastive learning objectives optimize the latent space distribution to be uniform over the unit hyper-sphere, which can be easily sampled from. \nWe show that using the contrastive learning framework to optimize the WAE loss achieves faster convergence and more stable optimization compared with existing popular algorithms for WAE. 
This is also reflected in the FID scores on the CelebA and CIFAR-10 datasets, and in the realistic quality of images generated on the CelebA-HQ dataset.", "keywords": "Wasserstein autoencoder;contrastive learning", "primary_area": "", "supplementary_material": "/attachment/f8d7c27468b97631da9b855dc8f8ce052a387b91.zip", "author": "Devansh Arpit;Aadyot Bhatnagar;Huan Wang;Caiming Xiong", "authorids": "~Devansh_Arpit2;~Aadyot_Bhatnagar1;~Huan_Wang1;~Caiming_Xiong1", "gender": "M;M;M;M", "homepage": ";https://linkedin.com/in/abhatnagar6;http://www.cs.yale.edu/homes/wang-huan/;http://cmxiong.com/", "dblp": "120/8494;;70/6155-16.html;80/7282", "google_scholar": "https://scholar.google.ca/citations?hl=en;o0qh7IUAAAAJ;7NpTttkAAAAJ;vaSdahkAAAAJ", "orcid": ";;;", "linkedin": ";;huanwangyale/;caiming-xiong-150a1417", "or_profile": "~Devansh_Arpit2;~Aadyot_Bhatnagar1;~Huan_Wang1;~Caiming_Xiong1", "aff": "Salesforce Research;Salesforce;Salesforce.com;Salesforce Research", "aff_domain": "salesforce.com;salesforce.com;salesforce.com;salesforce.com", "position": "Senior Research Scientist;Researcher;Researcher;Research Scientist", "bibtex": "@misc{\narpit2022momentum,\ntitle={Momentum Contrastive Autoencoder: Using Contrastive Learning for Latent Space Distribution Matching in {WAE}},\nauthor={Devansh Arpit and Aadyot Bhatnagar and Huan Wang and Caiming Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=_Xaf6zMDsHL}\n}", "github": "", "project": "", "reviewers": "nhFp;pPeS;ZJcJ", "site": "https://openreview.net/forum?id=_Xaf6zMDsHL", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;4", "correctness": "4;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;2", "wc_summary_paper": "47;60;13", "wc_summary_review": "51;43;83", "wc_main_review": "227;377;167", "wc_review": "325;480;263", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "447;755;544", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 40.0, 19.8158185969358 ], "wc_summary_review_avg": [ 59.0, 17.281975195754296 ], "wc_main_review_avg": [ 257.0, 88.31760866327846 ], "wc_review_avg": [ 356.0, 91.26152895205442 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 582.0, 128.57941774120252 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a7-WyEwFCsYJ:scholar.google.com/&scioq=Momentum+Contrastive+Autoencoder:+Using+Contrastive+Learning+for+Latent+Space+Distribution+Matching+in+WAE&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Salesforce", "aff_unique_dep": "Salesforce Research", "aff_unique_url": "https://research.salesforce.com", "aff_unique_abbr": "Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "_YkSZbA7ptn", "title": "Structural Optimization Makes Graph Classification Simpler and Better", "track": "main", "status": "Withdraw", "tldr": "", 
"abstract": "In deep neural networks, better results can often be obtained by increasing the complexity of previously developed basic models. However, it is unclear whether there is a way to boost performance by decreasing the complexity of such models. Here, based on an optimization method, we investigate the feasibility of improving graph classification performance while simplifying the model learning process. Inspired by progress in structural information assessment, we optimize the given data sample from graphs to encoding trees. In particular, we minimize the structural entropy of the transformed encoding tree to decode the key structure underlying a graph. This transformation is denoted as structural optimization. Furthermore, we propose a novel feature combination scheme, termed hierarchical reporting, for encoding trees. In this scheme, features are transferred from leaf nodes to root nodes by following the hierarchical structures of encoding trees. We then present an implementation of the scheme in a tree kernel and a convolutional network to perform graph classification. The tree kernel follows label propagation in the Weisfeiler-Lehman (WL) subtree kernel, but it has a lower runtime complexity $O(n)$. The convolutional network is a special implementation of our tree kernel in the deep learning field and is called Encoding Tree Learning (ETL). We empirically validate our tree kernel and convolutional network with several graph classification benchmarks and demonstrate that our methods achieve better performance and lower computational consumption than competing approaches.", "keywords": "Structural Optimization;Graph Classification;Encoding Tree Kernel;Encoding Tree Learning", "primary_area": "", "supplementary_material": "", "author": "Junran Wu;Jianhao Li;Yicheng Pan;Ke Xu", "authorids": "~Junran_Wu1;lijianhao@buaa.edu.cn;~Yicheng_Pan1;kexu@buaa.edu.cn", "gender": ";;M;", "homepage": "https://github.com/Wu-Junran;;http://scse.buaa.edu.cn/info/1080/7261.htm;", "dblp": "241/7211;;14/721-1;", "google_scholar": "pbjk-2UAAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Junran_Wu1;lijianhao@buaa.edu.cn;~Yicheng_Pan1;kexu@buaa.edu.cn", "aff": "Beihang University;;Beihang University;", "aff_domain": "buaa.edu.cn;;buaa.edu.cn;", "position": "PhD student;;Assistant Professor;", "bibtex": "@misc{\nwu2022structural,\ntitle={Structural Optimization Makes Graph Classification Simpler and Better},\nauthor={Junran Wu and Jianhao Li and Yicheng Pan and Ke Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=_YkSZbA7ptn}\n}", "github": "", "project": "", "reviewers": "Jr2S;txn6;VVCZ;MB5d", "site": "https://openreview.net/forum?id=_YkSZbA7ptn", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;3;3", "correctness": "3;2;2;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "86;165;54;66", "wc_summary_review": "35;47;9;64", "wc_main_review": "305;261;240;199", "wc_review": "426;473;303;329", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.75, 43.251445062564095 ], "wc_summary_review_avg": [ 38.75, 20.029665498954294 ], "wc_main_review_avg": [ 251.25, 38.21239981995373 ], "wc_review_avg": [ 382.75, 69.39875719348294 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dl73zLGCcg0J:scholar.google.com/&scioq=Structural+Optimization+Makes+Graph+Classification+Simpler+and+Better&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "", "aff_unique_url": "http://www.buaa.edu.cn/", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "_ZoDJyBBp7z", "title": "Feature Flow Regularization: Improving Structured Sparsity in Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pruning is a model compression method that removes redundant parameters and accelerates the inference speed of deep neural networks (DNNs) while maintaining accuracy. Most available pruning methods impose various conditions on parameters or features directly. In this paper, we propose a simple and effective regularization strategy to improve the structured sparsity and structured pruning in DNNs from a new perspective of evolution of features. In particular, we consider the trajectories connecting features of adjacent hidden layers, namely feature flow. We propose feature flow regularization (FFR) to penalize the length and the total absolute curvature of the trajectories, which implicitly increases the structured sparsity of the parameters. The principle behind FFR is that short and straight trajectories will lead to an efficient network that avoids redundant parameters. 
Experiments on CIFAR-10 and ImageNet datasets show that FFR improves structured sparsity and achieves pruning results comparable to or even better than those of state-of-the-art methods.", "keywords": "structured pruning", "primary_area": "", "supplementary_material": "", "author": "Yue Wu;Yuan Lan;Luchan Zhang;Yang Xiang", "authorids": "~Yue_Wu20;~Yuan_Lan1;~Luchan_Zhang1;~Yang_Xiang3", "gender": ";;F;", "homepage": ";;http://math.szu.edu.cn/info/1107/2938.htm;", "dblp": ";;256/1734;", "google_scholar": ";;zcoF2hUAAAAJ;", "orcid": "0000-0002-3863-8961;;0000-0003-2913-0055;", "linkedin": ";;;", "or_profile": "~Yue_Wu20;~Yuan_Lan1;~Luchan_Zhang1;~Yang_Xiang3", "aff": "Hong Kong University of Science and Technology;;Shenzhen University;", "aff_domain": "ust.hk;;szu.edu.cn;", "position": "PhD student;;Assistant Professor;", "bibtex": "@misc{\nwu2022feature,\ntitle={Feature Flow Regularization: Improving Structured Sparsity in Deep Neural Networks},\nauthor={Yue Wu and Yuan Lan and Luchan Zhang and Yang Xiang},\nyear={2022},\nurl={https://openreview.net/forum?id=_ZoDJyBBp7z}\n}", "github": "", "project": "", "reviewers": "D39H;wyu7;Ac8a", "site": "https://openreview.net/forum?id=_ZoDJyBBp7z", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "4;2;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "49;56;60", "wc_summary_review": "37;11;22", "wc_main_review": "258;340;188", "wc_review": "344;407;270", "wc_reply_reviewers": "9;0;44", "wc_reply_authors": "1807;687;2110", "reply_reviewers": "1;0;1", "reply_authors": "5;1;4", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 55.0, 4.546060565661952 ], "wc_summary_review_avg": [ 23.333333333333332, 10.656244908763854 ], "wc_main_review_avg": [ 262.0, 62.1181669615795 ], "wc_review_avg": [ 340.3333333333333, 55.99007848618261 ], "wc_reply_reviewers_avg": [ 17.666666666666668, 18.979521127315678 ], "wc_reply_authors_avg": [ 1534.6666666666667, 612.0219676522149 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.699673171197595 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Shenzhen University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.szu.edu.cn", "aff_unique_abbr": "HKUST;SZU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "__ObYt4753c", "title": "A Simple Approach to Adversarial Robustness in Few-shot Image Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": " Few-shot image classification, where the goal is to generalize to tasks with limited labeled data, has seen great progress over the years. However, the classifiers are vulnerable to adversarial examples, posing a question regarding their generalization capabilities. Recent works have tried to combine meta-learning approaches with adversarial training to improve the robustness of few-shot classifiers.
We show that a simple transfer-learning based approach can be used to train adversarially robust few-shot classifiers. We also present a method for the novel classification task based on calibrating the centroid of the few-shot category towards the base classes. We show that standard adversarial training on base categories along with a centroid-based classifier for the novel categories outperforms or is on par with state-of-the-art methods on standard benchmarks such as the Mini-ImageNet, CIFAR-FS, and CUB datasets. Our method is simple and easy to scale, and with little effort can lead to robust few-shot classifiers.", "keywords": "Few-shot learning;Robustness;Image Classification", "primary_area": "", "supplementary_material": "/attachment/9b7490a7f74dfabe1d70d01f97c584e102cdfdfa.zip", "author": "Akshayvarun Subramanya;Hamed Pirsiavash", "authorids": "~Akshayvarun_Subramanya2;~Hamed_Pirsiavash1", "gender": "M;", "homepage": "https://web.cs.ucdavis.edu/~hpirsiav/;https://aksvarun.github.io", "dblp": "07/6340;190/7249", "google_scholar": "https://scholar.google.com.tw/citations?user=c9XXy4MAAAAJ;2_3SWFwAAAAJ", "orcid": ";", "linkedin": "hpirsiav/;", "or_profile": "~Hamed_Pirsiavash1;~Akshayvarun_Subramanya1", "aff": "University of California, Davis;University of Maryland, Baltimore County", "aff_domain": "ucdavis.edu;umbc.edu", "position": "Associate Professor;PhD student", "bibtex": "@misc{\nsubramanya2022a,\ntitle={A Simple Approach to Adversarial Robustness in Few-shot Image Classification },\nauthor={Akshayvarun Subramanya and Hamed Pirsiavash},\nyear={2022},\nurl={https://openreview.net/forum?id=__ObYt4753c}\n}", "github": "", "project": "", "reviewers": "nGCx;QZee;iADS", "site": "https://openreview.net/forum?id=__ObYt4753c", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "2;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;1;3", "wc_summary_paper": "93;39;172", "wc_summary_review": "68;19;45", "wc_main_review": "56;125;197", "wc_review": "217;183;414", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "264;499;682", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 101.33333333333333, 54.61583002105606 ], "wc_summary_review_avg": [ 44.0, 20.016659728003237 ], "wc_main_review_avg": [ 126.0, 57.56735185849702 ], "wc_review_avg": [ 271.3333333333333, 101.83101470355462 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 481.6666666666667, 171.08737209066277 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5422710077294131377&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Davis;University of Maryland, Baltimore County", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucdavis.edu;https://www.umbc.edu", "aff_unique_abbr": "UC Davis;UMBC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Davis;Baltimore County", "aff_country_unique_index": "0;0", "aff_country_unique": "United 
States" }, { "id": "_cz2R6QnpQJ", "title": "Noise Reconstruction and Removal Network: A New Way to Denoise FIB-SEM Images", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent advances in Focused Ion Beam-Scanning Electron Microscopy (FIB-SEM) allow the imaging and analysis of cellular ultrastructure at nanoscale resolution, but the collection of labels and/or noise-free data sets has several challenges, often immutable. Reasons range from time consuming manual annotations, requiring highly trained specialists, to introducing imaging artifacts from the prolonged scanning during acquisition. We propose a fully unsupervised Noise Reconstruction and Removal Network for denoising scanning electron microscopy images. The architecture, inspired by gated recurrent units, reconstructs and removes the noise by synthesizing the sequential data. At the same time, the fully unsupervised training guides the network in distinguishing true signal from noise and gives comparable/even better results than supervised approaches on 3D electron microscopy data sets. We provide detailed performance analysis using numerical as well as empirical metrics.", "keywords": "Neural Network;CNN;LSTM;Unsupervised learning;Denoising;FIB-SEM", "primary_area": "", "supplementary_material": "/attachment/9814252f5658455fcf19d7ac56c4f69838ad3eaf.zip", "author": "Katya Giannios;Abhishek Chaurasia;Bambi DeLaRosa;Guillaume THIBAULT;Jessica L. Riesterer;Erin S Stempinski;Terence P Lo;Joe W Gray", "authorids": "~Katya_Giannios1;~Abhishek_Chaurasia2;bdelarosa@micron.com;~Guillaume_THIBAULT1;~Jessica_L._Riesterer1;stempins@ohsu.edu;~Terence_P_Lo1;grayjo@ohsu.edu", "gender": "F;M;;M;F;;M;", "homepage": "https://github.com/katyadimova;https://codeac29.github.io/;;https://www.thibault.biz;;;;", "dblp": ";182/1871;;;;;;", "google_scholar": ";https://scholar.google.co.in/citations?user=r8tMFqMAAAAJ;;seovZWoAAAAJ;;;;", "orcid": ";;;0000-0002-0734-2041;0000-0003-1084-2773;;0000-0001-7131-665X;", "linkedin": "katyadimova/;;;;;;;", "or_profile": "~Katya_Giannios1;~Abhishek_Chaurasia2;bdelarosa@micron.com;~Guillaume_THIBAULT1;~Jessica_L._Riesterer1;stempins@ohsu.edu;~Terence_P_Lo1;grayjo@ohsu.edu", "aff": ";Micron Technology Inc;;Oregon Health and Science University;Oregon Health and Science University;;Oregon Health and Science University;", "aff_domain": ";micron.com;;ohsu.edu;ohsu.edu;;ohsu.edu;", "position": ";Researcher;;Researcher;Researcher;;Alliance Manager;", "bibtex": "@misc{\ngiannios2022noise,\ntitle={Noise Reconstruction and Removal Network: A New Way to Denoise {FIB}-{SEM} Images},\nauthor={Katya Giannios and Abhishek Chaurasia and Bambi DeLaRosa and Guillaume THIBAULT and Jessica L. 
Riesterer and Erin S Stempinski and Terence P Lo and Joe W Gray},\nyear={2022},\nurl={https://openreview.net/forum?id=_cz2R6QnpQJ}\n}", "github": "", "project": "", "reviewers": "xdev;4BoC;34aJ;aPxW;Uq3h", "site": "https://openreview.net/forum?id=_cz2R6QnpQJ", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "5;3;3;4;4", "correctness": "3;2;3;3;3", "technical_novelty": "1;2;2;2;2", "empirical_novelty": "1;2;2;2;2", "wc_summary_paper": "79;119;113;108;74", "wc_summary_review": "96;21;216;31;52", "wc_main_review": "521;464;763;250;472", "wc_review": "696;604;1092;389;598", "wc_reply_reviewers": "0;0;0;11;13", "wc_reply_authors": "0;0;0;85;296", "reply_reviewers": "0;0;0;1;1", "reply_authors": "0;0;0;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 1.8, 0.4000000000000001 ], "empirical_novelty_avg": [ 1.8, 0.4000000000000001 ], "wc_summary_paper_avg": [ 98.6, 18.445595680270127 ], "wc_summary_review_avg": [ 83.2, 71.22752277034489 ], "wc_main_review_avg": [ 494.0, 163.71316379570703 ], "wc_review_avg": [ 675.8, 231.12455516452596 ], "wc_reply_reviewers_avg": [ 4.8, 5.912698199637793 ], "wc_reply_authors_avg": [ 76.2, 114.72471398961952 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 0.4, 0.48989794855663565 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.21821789023599233, "corr_recommendation_correctness": 0.6123724356957947, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=460300598332654571&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Micron Technology;Oregon Health & Science University", "aff_unique_dep": ";", "aff_unique_url": "https://www.micron.com;https://www.ohsu.edu", "aff_unique_abbr": "MTI;OHSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "_dDmyNX8aZV", "title": "RNAS: Robust Network Architecture Search beyond DARTS", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The vulnerability of Deep Neural Networks (DNNs) (i.e., susceptibility to adversarial attacks) severely limits the application of DNNs. Most existing methods improve the robustness of the model through weight optimization, such as adversarial training and regularization. However, the architecture of DNNs is also a key factor in robustness, which is often neglected or underestimated. We propose a Robust Network Architecture Search (RNAS) to address this problem. In our method, we define a network vulnerability metric based on the features\u2019 deviation between clean examples and adversarial examples. By constraining this vulnerability, we search for a robust architecture and solve the search problem by iterative optimization. 
Extensive experiments conducted on CIFAR-10/100 and SVHN show that our model achieves the best performance under various adversarial attacks compared with extensive baselines and state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yaguan Qian;Shenghui Huang;Yuqi Wang;Simin Li", "authorids": "~Yaguan_Qian1;~Shenghui_Huang2;~Yuqi_Wang2;~Simin_Li4", "gender": "M;M;M;M", "homepage": ";https://github.com/hye999;https://github.com/Rocher97;https://github.com/lsm140", "dblp": "03/8585;;;", "google_scholar": ";;;", "orcid": "0000-0003-4056-9755;;;", "linkedin": ";;rocherwang;", "or_profile": "~Yaguan_Qian1;~Shenghui_Huang2;~Yuqi_Wang2;~Simin_Li4", "aff": "Zhejiang University Science and Technology;;Zhejiang University of Science and Technology;", "aff_domain": "zust.edu.cn;;zust.edu.cn;", "position": "Full Professor;;MS student;", "bibtex": "@misc{\nqian2022rnas,\ntitle={{RNAS}: Robust Network Architecture Search beyond {DARTS}},\nauthor={Yaguan Qian and Shenghui Huang and Yuqi Wang and Simin Li},\nyear={2022},\nurl={https://openreview.net/forum?id=_dDmyNX8aZV}\n}", "github": "", "project": "", "reviewers": "9pzJ;hzxx;G8Ep", "site": "https://openreview.net/forum?id=_dDmyNX8aZV", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;2;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "0;2;2", "wc_summary_paper": "30;71;37", "wc_summary_review": "53;13;2", "wc_main_review": "254;29;144", "wc_review": "337;113;183", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 46.0, 17.90716802475106 ], "wc_summary_review_avg": [ 22.666666666666668, 21.913973218524802 ], "wc_main_review_avg": [ 142.33333333333334, 91.86342519680446 ], "wc_review_avg": [ 211.0, 93.56637572689597 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bTqFXSRBK-AJ:scholar.google.com/&scioq=RNAS:+Robust+Network+Architecture+Search+beyond+DARTS&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Zhejiang University;Zhejiang University of Science and Technology", "aff_unique_dep": "Science and Technology;", "aff_unique_url": "http://www.zju.edu.cn;http://www.zjust.edu.cn", "aff_unique_abbr": "ZJU;ZUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "_dE5DwHlnQR", "title": "Informative Robust Causal Representation for Generalizable Deep Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In many real-world scenarios, such as image classification and recommender systems, there is evidence that representation learning can improve a model's performance over multiple downstream tasks. 
Existing learning approaches rely on establishing the correlation (or its proxy) between features and the downstream task (labels), which typically results in a representation containing cause, effect, and spuriously correlated variables of the label. Its generalizability may deteriorate because of the instability of the non-causal parts. In this paper, we propose to learn causal representations from observational data by regularizing the learning procedure with mutual information measures according to our hypothetical causal graph. The optimization involves a counterfactual loss, based on which we deduce a theoretical guarantee that the causality-inspired learning achieves reduced sample complexity and better generalization ability. Extensive experiments show that the models trained on causal representations learned by our approach are robust under adversarial attacks and distribution shift. ", "keywords": "Causal Representation;Mutual Information;Robust Representation", "primary_area": "", "supplementary_material": "", "author": "Mengyue Yang;Furui Liu;Xu Chen;Zhitang Chen;Jianye HAO;Jun Wang", "authorids": "~Mengyue_Yang1;~Furui_Liu1;~Xu_Chen13;~Zhitang_Chen1;~Jianye_HAO1;~Jun_Wang2", "gender": "F;M;M;M;M;M", "homepage": "https://ymy4323460.github.io/;;https://gsai.ruc.edu.cn/chenxu;;http://www.icdai.org/jianye.html;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "262/3824.html;116/7289;83/6331-17;06/10875;21/7664.html;w/JunWang12", "google_scholar": "kJJkqdcAAAAJ;https://scholar.google.com.hk/citations?user=DJY8NXMAAAAJ;loPoqy0AAAAJ;;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;0000-0003-0144-1775;;0000-0002-0422-8235;", "linkedin": ";;;;;", "or_profile": "~Mengyue_Yang1;~Furui_Liu1;~Xu_Chen13;~Zhitang_Chen1;~Jianye_HAO1;~Jun_Wang2", "aff": "University College London;Huawei Technologies Ltd.;Renmin University of China;Huawei Technologies Ltd.;Tianjin University;University College London", "aff_domain": "ucl.ac.uk;huawei.com;ruc.edu.cn;huawei.com;tju.edu.cn;ucl.ac.uk", "position": "PhD student;Senior Researcher;Associate Professor;Researcher;Associate Professor;Professor", "bibtex": "@misc{\nyang2022informative,\ntitle={Informative Robust Causal Representation for Generalizable Deep Learning},\nauthor={Mengyue Yang and Furui Liu and Xu Chen and Zhitang Chen and Jianye HAO and Jun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=_dE5DwHlnQR}\n}", "github": "", "project": "", "reviewers": "vUUL;3Akd;6Bvg;VnTh", "site": "https://openreview.net/forum?id=_dE5DwHlnQR", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;2;3", "correctness": "2;3;4;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "70;96;82;68", "wc_summary_review": "69;55;37;18", "wc_main_review": "681;501;228;330", "wc_review": "820;652;347;416", "wc_reply_reviewers": "253;0;0;0", "wc_reply_authors": "829;460;143;347", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 79.0, 11.180339887498949 ], "wc_summary_review_avg": [ 44.75, 19.162137145944865 ], "wc_main_review_avg": [ 435.0, 172.29770747168982 ], "wc_review_avg": [ 558.75, 188.52237930813413 ], "wc_reply_reviewers_avg": [ 63.25, 109.55221357873148 ], "wc_reply_authors_avg": [ 444.75, 249.24322959711463 ], 
"reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.48420012470625223, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1UyRL2NubXIJ:scholar.google.com/&scioq=Informative+Robust+Causal+Representation+for+Generalizable+Deep+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;3;0", "aff_unique_norm": "University College London;Huawei;Renmin University of China;Tianjin University", "aff_unique_dep": ";Huawei Technologies;;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.huawei.com;http://www.ruc.edu.cn;http://www.tju.edu.cn", "aff_unique_abbr": "UCL;Huawei;RUC;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0", "aff_country_unique": "United Kingdom;China" }, { "id": "_dXmN3FV--0", "title": "Lottery Ticket Structured Node Pruning for Tabular Datasets", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we presented two pruning approaches on tabular neural networks based on the lottery ticket hypothesis that went beyond masking nodes by resizing the models accordingly. We showed top performing models in 6 of 8 datasets tested in terms of F1/RMSE. We also showed in 6 of 8 datasets a total reduction of over 85% of nodes and many over 98% reduced with minimal affect to accuracy. In one dataset the model reached a total size of one node per layer while still improving RMSE compared to the larger model used for pruning. We presented results for two approaches, iterative pruning using two styles, and oneshot pruning. Iterative pruning gradually reduces nodes in each layers based on norm pruning until we reach the smallest state, while oneshot will prune the model directly to the smallest state. 
We showed that the iterative approach obtains the best result more consistently than oneshot pruning.", "keywords": "Lottery Ticket Hypothesis;Tabular;Pruning", "primary_area": "", "supplementary_material": "", "author": "Ryan Bluteau;Robin Gras;Mitchel Paulin;Zachary Innes", "authorids": "~Ryan_Bluteau1;rgras@uwindsor.ca;paulinm@uwindsor.ca;innesz@uwindsor.ca", "gender": "M;;;", "homepage": "https://www.researchgate.net/profile/Ryan-Bluteau;;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": "0000-0001-9777-0711;;;", "linkedin": ";;;", "or_profile": "~Ryan_Bluteau1;rgras@uwindsor.ca;paulinm@uwindsor.ca;innesz@uwindsor.ca", "aff": "University of Windsor;;;", "aff_domain": "uwindsor.ca;;;", "position": "PhD student;;;", "bibtex": "@misc{\nbluteau2022lottery,\ntitle={Lottery Ticket Structured Node Pruning for Tabular Datasets},\nauthor={Ryan Bluteau and Robin Gras and Mitchel Paulin and Zachary Innes},\nyear={2022},\nurl={https://openreview.net/forum?id=_dXmN3FV--0}\n}", "github": "", "project": "", "reviewers": "EcHe;k3Jq;EuY8;Y7kX", "site": "https://openreview.net/forum?id=_dXmN3FV--0", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;3;3;4", "correctness": "3;2;3;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "35;91;53;89", "wc_summary_review": "421;35;21;53", "wc_main_review": "31;467;339;248", "wc_review": "487;593;413;390", "wc_reply_reviewers": "9;0;0;14", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "1;0;0;1", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 67.0, 23.874672772626646 ], "wc_summary_review_avg": [ 132.5, 166.95134021624384 ], "wc_main_review_avg": [ 271.25, 159.0351769263643 ], "wc_review_avg": [ 470.75, 79.15925403893091 ], "wc_reply_reviewers_avg": [ 5.75, 6.015604707757983 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4264014327112209, "corr_recommendation_correctness": 0.5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15068889753854592650&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Windsor", "aff_unique_dep": "", "aff_unique_url": "https://www.uwindsor.ca", "aff_unique_abbr": "UWindsor", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "_fLxZ6VpXTH", "title": "Stabilized Likelihood-based Imitation Learning via Denoising Continuous Normalizing Flow", "track": "main", "status": "Reject", "tldr": "", "abstract": "State-of-the-art imitation learning (IL) approaches, e.g., GAIL, apply adversarial training to minimize the discrepancy between expert and learner behaviors, which is prone to unstable training and mode collapse. In this work, we propose SLIL \u2013 Stabilized Likelihood-based Imitation Learning \u2013 a novel IL approach that directly maximizes the likelihood of observing the expert demonstrations. SLIL is a two-stage optimization framework, where in stage one the expert state distribution is estimated via a new method for denoising continuous normalizing flow, and in stage two the learner policy is trained to match both the expert\u2019s policy and state distribution. 
Experimental evaluation of SLIL compared with several baselines in ten different physics-based control tasks reveals superior results in terms of learner policy performance, training stability, and mode distribution preservation.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/d8c88ec53ef1a151b2c99941519caea9b3181161.zip", "author": "Xin Zhang;Yanhua Li;Ziming Zhang;Christopher Brinton;Zhenming Liu;Zhi-Li Zhang;Hui Lu;Zhihong Tian", "authorids": "~Xin_Zhang11;~Yanhua_Li1;~Ziming_Zhang4;~Christopher_Brinton1;~Zhenming_Liu1;~Zhi-Li_Zhang1;~Hui_Lu1;~Zhihong_Tian2", "gender": "F;M;;M;M;M;M;M", "homepage": "https://xinzhang525.netlify.app;http://www.wpi.edu/~yli15/;https://www.cbrinton.net/;http://www.wm.edu/as/computerscience/faculty/liu_zhenming.php/;https://www-users.cs.umn.edu/~zhang089/;;;https://zimingzhang.wordpress.com/", "dblp": "76/1584-98;;;51/2717;07/5905;65/4062.html;73/5444;", "google_scholar": "gPIQUXsAAAAJ;https://scholar.google.com.tw/citations?user=ICOWtt0AAAAJ;vWmHA5MAAAAJ;https://scholar.google.com.tw/citations?user=ozfkg2sAAAAJ;https://scholar.google.com.hk/citations?hl=en;;;2yqx3oIAAAAJ", "orcid": ";0000-0001-8972-503x;;;0000-0001-8584-2319;;;", "linkedin": ";;;;;;;", "or_profile": "~Xin_Zhang11;~Yanhua_Li1;~Christopher_Brinton1;~Zhenming_Liu1;~Zhi-Li_Zhang1;~Hui_Lu1;~Zhihong_Tian2;~Ziming_Zhang1", "aff": "Worcester Polytechnic Institute;Worcester Polytechnic Institute;Purdue University;College of William and Mary;University of Minnesota, Minneapolis;Guangzhou University, China, Tsinghua University;Guangzhou University;Worcester Polytechnic Institute", "aff_domain": "wpi.edu;wpi.edu;purdue.edu;wm.edu;umn.edu;gzhu.edu.cn;gzhu.edu.cn;wpi.edu", "position": "PhD student;Associate Professor;Assistant Professor;Assistant Professor;Full Professor;Full Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nzhang2022stabilized,\ntitle={Stabilized Likelihood-based Imitation Learning via Denoising Continuous Normalizing Flow},\nauthor={Xin Zhang and Yanhua Li and Ziming Zhang and Christopher Brinton and Zhenming Liu and Zhi-Li Zhang and Hui Lu and Zhihong Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=_fLxZ6VpXTH}\n}", "github": "", "project": "", "reviewers": "SsHp;JhT1;kBN4;nLuT", "site": "https://openreview.net/forum?id=_fLxZ6VpXTH", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "5;4;4;4", "correctness": "2;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "172;137;129;232", "wc_summary_review": "67;42;72;40", "wc_main_review": "572;157;216;533", "wc_review": "811;336;417;805", "wc_reply_reviewers": "899;0;0;16", "wc_reply_authors": "2333;203;317;1072", "reply_reviewers": "4;0;0;1", "reply_authors": "6;2;2;3", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 167.5, 40.59864529759583 ], "wc_summary_review_avg": [ 55.25, 14.376630342329875 ], "wc_main_review_avg": [ 369.5, 184.70043313430534 ], "wc_review_avg": [ 592.25, 217.6526762987306 ], "wc_reply_reviewers_avg": [ 228.75, 387.0241433037479 ], "wc_reply_authors_avg": [ 981.25, 848.876426519196 ], "reply_reviewers_avg": [ 1.25, 1.6393596310755 ], "reply_authors_avg": [ 3.25, 1.6393596310755 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.3333333333333333, 
"corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13480990007221212811&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;3;4;4;0", "aff_unique_norm": "Worcester Polytechnic Institute;Purdue University;College of William and Mary;University of Minnesota;Guangzhou University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.wpi.edu;https://www.purdue.edu;https://www.wm.edu;https://www.minnesota.edu;http://www.gzhu.edu.cn", "aff_unique_abbr": "WPI;Purdue;WM;UMN;GU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0;0;0;0;1;1;0", "aff_country_unique": "United States;China" }, { "id": "_faKHAwA8O", "title": "Representation Consolidation from Multiple Expert Teachers", "track": "main", "status": "Reject", "tldr": "", "abstract": "A library of diverse expert models transfers better to a novel task than a single generalist model. However, growing such a library indefinitely is impractical. Hence, we explore the problem of learning a consolidated image feature representation from a collection of related task-specific teachers that transfer well on novel recognition tasks. This differs from traditional knowledge distillation in which a student model is trained to emulate the input/output functionality of a teacher. Indeed, we observe experimentally that standard distillation of task-specific teachers, or using these teacher representations directly, **reduces** downstream transferability compared to a task-agnostic generalist model. We show that a simple multi-head, multi-task distillation method using an unlabeled proxy dataset and adding a generalist teacher is sufficient to consolidate representations from task-specific teacher(s). We improve downstream performance, outperforming the teacher (or best of all teachers) as well as the strong baseline of ImageNet pre-trained features. 
Our method almost reaches the performance of a multi-task joint training oracle, reaping the benefit of the teachers without replaying their training data.", "keywords": "transfer learning;distillation;pretraining;model merging", "primary_area": "", "supplementary_material": "", "author": "Zhizhong Li;Avinash Ravichandran;Charless Fowlkes;Marzia Polito;Rahul Bhotika;Stefano Soatto", "authorids": "~Zhizhong_Li1;~Avinash_Ravichandran1;~Charless_Fowlkes1;~Marzia_Polito1;~Rahul_Bhotika1;~Stefano_Soatto1", "gender": "M;M;M;F;M;", "homepage": "http://zli115.web.engr.illinois.edu/;;https://www.ics.uci.edu/~fowlkes;;;https://www.cs.ucla.edu/~soatto", "dblp": ";90/4314;90/4157;86/750;28/2609;08/1262", "google_scholar": "qIdGcLUAAAAJ;28p_eLYAAAAJ;yLQF4mkAAAAJ;8qsuHEoAAAAJ;KFeN73wAAAAJ;lH1PdF8AAAAJ", "orcid": "0000-0002-6068-7209;;;;;0000-0003-2902-6362", "linkedin": ";;;marzia-polito-89b0a33/;rahul-bhotika/;stefano-soatto-5765aa6/", "or_profile": "~Zhizhong_Li1;~Avinash_Ravichandran1;~Charless_Fowlkes1;~Marzia_Polito1;~Rahul_Bhotika3;~Stefano_Soatto2", "aff": "Amazon;Amazon;University of California, Irvine;Amazon;Amazon;UCLA Computer Science Department, University of California, Los Angeles", "aff_domain": "amazon.com;amazon.com;uci.edu;amazon.com;amazon.com;cs.ucla.edu", "position": "Applied Scientist;Research Scientist;Professor;Senior Manager, Applied Science;Director of Computer Vision, Amazon Web Services;Professor", "bibtex": "@misc{\nli2022representation,\ntitle={Representation Consolidation from Multiple Expert Teachers},\nauthor={Zhizhong Li and Avinash Ravichandran and Charless Fowlkes and Marzia Polito and Rahul Bhotika and Stefano Soatto},\nyear={2022},\nurl={https://openreview.net/forum?id=_faKHAwA8O}\n}", "github": "", "project": "", "reviewers": "Swfw;HSTC;vU3z;6iCZ", "site": "https://openreview.net/forum?id=_faKHAwA8O", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "62;108;66;100", "wc_summary_review": "38;36;50;26", "wc_main_review": "287;410;286;135", "wc_review": "387;554;402;261", "wc_reply_reviewers": "0;20;0;0", "wc_reply_authors": "605;654;434;568", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 84.0, 20.248456731316587 ], "wc_summary_review_avg": [ 37.5, 8.52936105461599 ], "wc_main_review_avg": [ 279.5, 97.47948502120843 ], "wc_review_avg": [ 401.0, 103.93026508192885 ], "wc_reply_reviewers_avg": [ 5.0, 8.660254037844387 ], "wc_reply_authors_avg": [ 565.25, 81.68651969572458 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RsP7Sfw7PlsJ:scholar.google.com/&scioq=Representation+Consolidation+from+Multiple+Expert+Teachers&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;2", "aff_unique_norm": "Amazon;University of California, Irvine;University of California, Los Angeles", "aff_unique_dep": "Amazon.com, Inc.;;Computer Science Department", "aff_unique_url": "https://www.amazon.com;https://www.uci.edu;https://www.ucla.edu", "aff_unique_abbr": 
"Amazon;UCI;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Irvine;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "_gZ8dG4vOr9", "title": "Pruning Compact ConvNets For Efficient Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network pruning is frequently used to compress over-parameterized networks by large amounts, while incurring only marginal drops in generalization performance. However, the impact of pruning on networks that have been highly optimized for efficient inference has not received the same level of attention. In this paper, we analyze the effect of pruning for computer vision, and study state-of-the-art FBNetV3 family of models. We show that model pruning approaches can be used to further optimize networks trained through NAS (Neural Architecture Search). The resulting family of pruned models can consistently obtain better performance than existing FBNetV3 models at the same level of computation, and thus provide state-of-the-art results when trading off between computational complexity and generalization performance on the ImageNet benchmark. In addition to better generalization performance, we also demonstrate that when limited computation resources are available, pruning FBNetV3 models incur only a fraction of GPU-hours involved in running a full-scale NAS (Neural Architecture Search).", "keywords": "pruning;neural networks;computations;latency;imagenet", "primary_area": "", "supplementary_material": "", "author": "Sayan Ghosh;Karthik Prasad;Xiaoliang Dai;Peizhao Zhang;Bichen Wu;Graham Cormode;Peter Vajda", "authorids": "~Sayan_Ghosh1;~Karthik_Prasad1;~Xiaoliang_Dai1;~Peizhao_Zhang1;~Bichen_Wu1;~Graham_Cormode1;~Peter_Vajda1", "gender": ";;M;M;M;M;", "homepage": ";https://ai.facebook.com/people/karthik-prasad;;;;http://dimacs.rutgers.edu/~graham/;https://sites.google.com/site/vajdap", "dblp": "67/6126-4;;192/3904;23/8011.html;130/1371;c/GrahamCormode;44/5953", "google_scholar": "WC_NlykAAAAJ;O7Qb6I8AAAAJ;u4olrOcAAAAJ;eqQQkM4AAAAJ;K3QJPdMAAAAJ;https://scholar.google.co.uk/citations?user=gpLVKmEAAAAJ;k8QB5VUAAAAJ", "orcid": ";;;;;0000-0002-0698-0922;", "linkedin": ";prasadkarthik/;;;bichenwu/;;p%C3%A9ter-vajda-9a03aaa/", "or_profile": "~Sayan_Ghosh1;~Karthik_Prasad1;~Xiaoliang_Dai1;~Peizhao_Zhang1;~Bichen_Wu1;~Graham_Cormode1;~Peter_Vajda1", "aff": "Meta Facebook;Meta;Meta Facebook;Meta;Meta Facebook;The university of Warwick;Meta", "aff_domain": "fb.com;meta.com;fb.com;meta.com;fb.com;warwick.ac.uk;meta.com", "position": "Research Scientist;Researcher;Research Scientist;Research Scientist;Research Scientist;Full Professor;Researcher", "bibtex": "@misc{\nghosh2022pruning,\ntitle={Pruning Compact ConvNets For Efficient Inference},\nauthor={Sayan Ghosh and Karthik Prasad and Xiaoliang Dai and Peizhao Zhang and Bichen Wu and Graham Cormode and Peter Vajda},\nyear={2022},\nurl={https://openreview.net/forum?id=_gZ8dG4vOr9}\n}", "github": "", "project": "", "reviewers": "haMF;F7aS;tgou;gdjt", "site": "https://openreview.net/forum?id=_gZ8dG4vOr9", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;4;4;4", "correctness": "1;2;3;4", "technical_novelty": "1;1;1;1", "empirical_novelty": "1;2;0;3", "wc_summary_paper": "43;62;79;38", "wc_summary_review": "25;66;17;42", "wc_main_review": "172;446;212;199", "wc_review": "240;574;308;279", "wc_reply_reviewers": "62;80;0;0", "wc_reply_authors": "176;336;329;370", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", 
"recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 55.5, 16.25576820700886 ], "wc_summary_review_avg": [ 37.5, 18.76832437912346 ], "wc_main_review_avg": [ 257.25, 109.92582726547934 ], "wc_review_avg": [ 350.25, 131.4160853929229 ], "wc_reply_reviewers_avg": [ 35.5, 36.065911883661 ], "wc_reply_authors_avg": [ 302.75, 74.80432808334021 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7745966692414834, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11027090178441911601&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Meta;University of Warwick", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://warwick.ac.uk", "aff_unique_abbr": "Meta;Warwick", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "_gZf4NEuf0H", "title": "Towards Understanding the Condensation of Neural Networks at Initial Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Implicit regularization is important for understanding the learning of neural networks (NNs). Empirical works show that input weights of hidden neurons (the input weight of a hidden neuron consists of the weight from its input layer to the hidden neuron and its bias term) condense on isolated orientations with a small initialization. The condensation dynamics implies that the training implicitly regularizes a NN towards one with much smaller effective size. In this work, we utilize multilayer networks to show that the maximal number of condensed orientations in the initial training stage is twice the multiplicity of the activation function, where ``multiplicity'' is multiple roots of activation function at origin. Our theoretical analysis confirms experiments for two cases, one is for the activation function of multiplicity one, which contains many common activation functions, and the other is for the layer with one-dimensional input. 
This work takes a step towards understanding how small initialization implicitly leads NNs to condensation at the initial training stage, which lays a foundation for the future study of the nonlinear dynamics of NNs and their implicit regularization effect at a later stage of training.", "keywords": "neural networks;training;condensation dynamics;implicit regularization", "primary_area": "", "supplementary_material": "/attachment/49efb56400fb7ca178dabd9abf3730ed6a94f30d.zip", "author": "Zhiqin Xu;Hanxu Zhou;Tao Luo;Yaoyu Zhang", "authorids": "~Zhiqin_Xu1;~Hanxu_Zhou1;~Tao_Luo3;~Yaoyu_Zhang1", "gender": "M;;;", "homepage": "https://ins.sjtu.edu.cn/people/xuzhiqin/;;;https://ins.sjtu.edu.cn/peoples/zhangyaoyu", "dblp": "223/4493.html;;;", "google_scholar": "EjLvG5cAAAAJ;https://scholar.google.com.hk/citations?user=ypD3aL8AAAAJ;;", "orcid": "0000-0002-0122-0879;;;", "linkedin": ";;;", "or_profile": "~Zhiqin_Xu1;~Hanxu_Zhou1;~Tao_Luo3;~Yaoyu_Zhang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;;sjtu.edu.cn", "position": "Associate Professor;PhD student;;Associate Professor", "bibtex": "@misc{\nxu2022towards,\ntitle={Towards Understanding the Condensation of Neural Networks at Initial Training},\nauthor={Zhiqin Xu and Hanxu Zhou and Tao Luo and Yaoyu Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=_gZf4NEuf0H}\n}", "github": "", "project": "", "reviewers": "KucV;ejGJ;jDJ5;zuZq", "site": "https://openreview.net/forum?id=_gZf4NEuf0H", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;2;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "37;118;60;118", "wc_summary_review": "37;71;55;24", "wc_main_review": "276;134;186;410", "wc_review": "350;323;301;552", "wc_reply_reviewers": "0;19;18;43", "wc_reply_authors": "1031;988;606;1538", "reply_reviewers": "0;1;1;1", "reply_authors": "2;3;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.25, 35.688758734369 ], "wc_summary_review_avg": [ 46.75, 17.80975856096876 ], "wc_main_review_avg": [ 251.5, 104.66494159937223 ], "wc_review_avg": [ 381.5, 99.9562404254982 ], "wc_reply_reviewers_avg": [ 20.0, 15.280706789936126 ], "wc_reply_authors_avg": [ 1040.75, 331.3392332640371 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13785790388400550245&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Anomaly Detection for Tabular Data with Internal Contrastive Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7127", "id": "_hszZbt46bT", "poster": "", "openreview": "https://openreview.net/forum?id=_hszZbt46bT", "slides": 
"https://iclr.cc/virtual/2022/poster/7127", "video": "https://iclr.cc/virtual/2022/poster/7127", "author_site": "Tom Shenkar, Lior Wolf", "tldr": "", "abstract": " We consider the task of finding out-of-class samples in tabular data, where little can be assumed on the structure of the data. In order to capture the structure of the samples of the single training class, we learn mappings that maximize the mutual information between each sample and the part that is masked out. The mappings are learned by employing a contrastive loss, which considers only one sample at a time. Once learned, we can score a test sample by measuring whether the learned mappings lead to a small contrastive loss using the masked parts of this sample. Our experiments show that our method leads by a sizable accuracy gap in comparison to the literature and that the same default set of hyperparameters provides state-of-the-art results across benchmarks.", "keywords": "Anomaly detection;Tabular data", "primary_area": "", "supplementary_material": "/attachment/f470e517aa77f89e6cbbf43efaea0ae20ba46248.zip", "author": "Tom Shenkar;Lior Wolf", "authorids": "~Tom_Shenkar1;~Lior_Wolf1", "gender": "M;M", "homepage": ";http://www.cs.tau.ac.il/~wolf", "dblp": ";83/4103", "google_scholar": ";UbFrXTsAAAAJ", "orcid": ";0000-0001-5578-8892", "linkedin": "tom-shenkar-993799195/;", "or_profile": "~Tom_Shenkar1;~Lior_Wolf1", "aff": "Tel Aviv University;Tel Aviv University", "aff_domain": "tau.post.ac.il;tau.ac.il", "position": "MS student;Full Professor", "bibtex": "@inproceedings{\nshenkar2022anomaly,\ntitle={Anomaly Detection for Tabular Data with Internal Contrastive Learning},\nauthor={Tom Shenkar and Lior Wolf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_hszZbt46bT}\n}", "github": "", "project": "", "reviewers": "e9Pz;yptt;uGiC;iWxT", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "53;169;120;156", "wc_summary_review": "244;59;124;110", "wc_main_review": "424;240;274;286", "wc_review": "721;468;518;552", "wc_reply_reviewers": "111;0;0;232", "wc_reply_authors": "1588;330;614;1058", "reply_reviewers": "1;0;0;2", "reply_authors": "4;1;1;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 124.5, 45.01388674620311 ], "wc_summary_review_avg": [ 134.25, 67.82468208550631 ], "wc_main_review_avg": [ 306.0, 70.18546858146635 ], "wc_review_avg": [ 564.75, 95.02992949592249 ], "wc_reply_reviewers_avg": [ 85.75, 95.82894917507966 ], "wc_reply_authors_avg": [ 897.5, 475.6519210515185 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15358518546419142220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=_hszZbt46bT", "email": "tau.post.ac.il;tau.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "_ixHFNR-FZ", "title": "Adversarially Robust Models may not Transfer Better: Sufficient Conditions for Domain Transferability from the View of Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning (ML) robustness and generalization are fundamentally correlated: they essentially concern about data distribution shift under adversarial and natural settings, respectively. Thus, it is critical to uncover their underlying connections to tackle one based on the other. On the one hand, recent studies show that more robust (adversarially trained) models are more generalizable to other domains. On the other hand, there lacks of theoretical understanding of such phenomenon and it is not clear whether there are counterexamples. In this paper, we aim to provide sufficient conditions for this phenomenon considering different factors that could affect both, such as the norm of last layer norm, Jacobian norm, and data augmentations (DA). In particular, we propose a general theoretical framework indicating factors that can be reformed as a function class regularization process, which could lead to the improvement of domain generalization. Our analysis, for the first time, shows that ``robustness\" is actually not the causation for domain generalization; rather, robustness induced by adversarial training is a by-product of such function class regularization. We then discuss in details about different properties of DA and we prove that under certain conditions, DA can be viewed as regularization and therefore improve generalization. We conduct extensive experiments to verify our theoretical findings, and show several counterexamples where robustness and generalization are negatively correlated when the sufficient conditions are not satisfied.", "keywords": "Domain transferability;model regularization", "primary_area": "", "supplementary_material": "/attachment/6761851f3b40ffb8d2096c0f38a2d31d91f5b839.zip", "author": "Xiaojun Xu;Jacky Y. Zhang;Evelyn Ma;Danny Son;Oluwasanmi O Koyejo;Bo Li", "authorids": "~Xiaojun_Xu1;~Jacky_Y._Zhang1;alphabetaluca@outlook.com;hhson2@illinois.edu;~Oluwasanmi_O_Koyejo1;~Bo_Li19", "gender": "M;;;;M;F", "homepage": ";;;;https://cs.stanford.edu/~sanmi/;http://boli.cs.illinois.edu/", "dblp": ";;;;14/8885;50/3402-26", "google_scholar": "rdMZZQwAAAAJ;;;;EaaOeJwAAAAJ;K8vJkTcAAAAJ", "orcid": ";;;;0000-0002-4023-419X;", "linkedin": ";;;;sanmi-koyejo-984754/;", "or_profile": "~Xiaojun_Xu1;~Jacky_Y._Zhang1;alphabetaluca@outlook.com;hhson2@illinois.edu;~Oluwasanmi_O_Koyejo1;~Bo_Li19", "aff": "University of Illinois, Urbana Champaign;;;;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;;;;illinois.edu;illinois.edu", "position": "PhD student;;;;Associate Professor;Assistant Professor", "bibtex": "@misc{\nxu2022adversarially,\ntitle={Adversarially Robust Models may not Transfer Better: Sufficient Conditions for Domain Transferability from the View of Regularization},\nauthor={Xiaojun Xu and Jacky Y. 
Zhang and Evelyn Ma and Danny Son and Oluwasanmi O Koyejo and Bo Li},\nyear={2022},\nurl={https://openreview.net/forum?id=_ixHFNR-FZ}\n}", "github": "", "project": "", "reviewers": "ijfD;MEJz;nSDW;5SD9", "site": "https://openreview.net/forum?id=_ixHFNR-FZ", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "243;40;178;62", "wc_summary_review": "59;12;86;79", "wc_main_review": "389;181;440;124", "wc_review": "691;233;704;265", "wc_reply_reviewers": "0;0;0;29", "wc_reply_authors": "922;580;388;97", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 130.75, 83.35878777909382 ], "wc_summary_review_avg": [ 59.0, 28.88771365130858 ], "wc_main_review_avg": [ 283.5, 133.76191535710007 ], "wc_review_avg": [ 473.25, 224.5822510796434 ], "wc_reply_reviewers_avg": [ 7.25, 12.55736835487436 ], "wc_reply_authors_avg": [ 496.75, 299.74770641324346 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.19245008972987526, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15666112236501144638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "_j4hwbj6Opj", "title": "3D Meta-Registration: Meta-learning 3D Point Cloud Registration Functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning robust 3D point cloud registration functions with deep neural networks has emerged as a powerful paradigm in recent years, offering promising performance in producing spatial geometric transformations for each pair of 3D point clouds. However, 3D point cloud registration functions are often generalized from extensive training over a large volume of data to learn the ability to predict the desired geometric transformation to register 3D point clouds. Generalizing across 3D point cloud registration functions requires robust learning of priors over the respective function space and enables consistent registration in the presence of significant 3D structure variations. In this paper, we propose to formalize the learning of a 3D point cloud registration function space as a meta-learning problem, aiming to predict a 3D registration model that can be quickly adapted to new point clouds with no or limited training data. Specifically, we define each task as the learning of the 3D registration function which takes points in 3D space as input and predicts the geometric transformation that aligns the source point cloud with the target one. Also, we introduce an auxiliary deep neural network named 3D registration meta-learner that is trained to predict the prior over the respective 3D registration function space. 
After training, the 3D registration meta-learner, which is trained with the distribution of the 3D registration function space, is able to uniquely parameterize the 3D registration function with an optimal initialization to rapidly adapt to new registration tasks. We tested our model on the synthetic datasets ModelNet and FlyingThings3D, as well as the real-world dataset KITTI. Experimental results demonstrate that 3D Meta-Registration achieves superior performance over previous techniques (e.g., FlowNet3D).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Hao;Yi Fang", "authorids": "~Yu_Hao1;~Yi_Fang2", "gender": "M;M", "homepage": ";http://mmvc.engineering.nyu.edu/", "dblp": "33/32703;96/361-6", "google_scholar": ";j-cyhzwAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yu_Hao1;~Yi_Fang2", "aff": "New York University;New York University", "aff_domain": "nyu.edu;nyu.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nhao2022d,\ntitle={3D Meta-Registration: Meta-learning 3D Point Cloud Registration Functions},\nauthor={Yu Hao and Yi Fang},\nyear={2022},\nurl={https://openreview.net/forum?id=_j4hwbj6Opj}\n}", "github": "", "project": "", "reviewers": "xfRw;woQH;eJC8;t8Fa", "site": "https://openreview.net/forum?id=_j4hwbj6Opj", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "3;2;4;2", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "28;34;59;89", "wc_summary_review": "41;105;47;21", "wc_main_review": "234;521;344;124", "wc_review": "303;660;450;234", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.5, 24.06761309311748 ], "wc_summary_review_avg": [ 53.5, 31.252999856013822 ], "wc_main_review_avg": [ 305.75, 146.6089611858702 ], "wc_review_avg": [ 411.75, 163.1814557478882 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eDd8d9XgEIIJ:scholar.google.com/&scioq=3D+Meta-Registration:+Meta-learning+3D+Point+Cloud+Registration+Functions&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Generalizing Few-Shot NAS with Gradient Matching", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6058", "id": "_jMtny3sMKU", "poster": "", "openreview": "https://openreview.net/forum?id=_jMtny3sMKU", "slides": "https://iclr.cc/virtual/2022/poster/6058", "video": "https://iclr.cc/virtual/2022/poster/6058", "author_site": "Shoukang Hu, Ruochen Wang, Lanqing HONG, Zhenguo Li, Cho-Jui Hsieh, Jiashi Feng", "tldr": "", "abstract": "Efficient performance estimation of architectures drawn from large search spaces is essential to Neural 
Architecture Search. One-Shot methods tackle this challenge by training one supernet to approximate the performance of every architecture in the search space via weight-sharing, thereby drastically reducing the search cost. However, due to coupled optimization between child architectures caused by weight-sharing, the One-Shot supernet's performance estimation could be inaccurate, leading to degraded search outcomes. To address this issue, Few-Shot NAS reduces the level of weight-sharing by splitting the One-Shot supernet into multiple separated sub-supernets via edge-wise (layer-wise) exhaustive partitioning. Since not all partitions of the supernet are equally important, a more effective splitting criterion is needed. In this work, we propose a gradient matching score (GM) that leverages gradient information at the shared weight for making informed splitting decisions. Intuitively, gradients from different child models can be used to identify whether they agree on how to update the shared modules, and subsequently to decide if they should share weight. Compared with exhaustive partitioning, the proposed criterion significantly reduces the branching factor per edge. This allows us to split more edges (layers) for a given budget, resulting in substantially improved performance as NAS search spaces usually include dozens of edges (layers). Extensive empirical evaluations of the proposed method on a wide range of search spaces (NASBench-201, DARTS, MobileNet Space), datasets (cifar10, cifar100, ImageNet) and search algorithms (DARTS, SNAS, RSPS, ProxylessNAS, OFA) demonstrate that it significantly outperforms its Few-Shot counterparts while surpassing previous comparable methods in terms of the accuracy of derived architectures. \nOur code is available at https://github.com/skhu101/GM-NAS.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/bc0a1c7150e1628b9a4195e784c50d443932ce50.zip", "author": "Shoukang Hu;Ruochen Wang;Lanqing HONG;Zhenguo Li;Cho-Jui Hsieh;Jiashi Feng", "authorids": "~Shoukang_Hu1;~Ruochen_Wang2;~Lanqing_HONG1;~Zhenguo_Li1;~Cho-Jui_Hsieh1;~Jiashi_Feng1", "gender": "M;M;F;M;M;M", "homepage": "https://skhu101.github.io/;https://ruocwang.github.io/;https://racheltechie.github.io/;http://www.ee.columbia.edu/~zgli/;http://web.cs.ucla.edu/~chohsieh/index.html;https://sites.google.com/site/jshfeng/", "dblp": "226/1865;33/120;226/4258;23/6479;14/2770;56/8278", "google_scholar": "9cUPotAAAAAJ;8fXrlRAAAAAJ;https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ;XboZC1AAAAAJ;Wy89g4IAAAAJ;https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ", "orcid": ";;;;;0000-0001-6843-0064", "linkedin": ";ruochen-wang-1699b1113/;;;;", "or_profile": "~Shoukang_Hu1;~Ruochen_Wang2;~Lanqing_HONG1;~Zhenguo_Li1;~Cho-Jui_Hsieh1;~Jiashi_Feng2", "aff": "Nanyang Technological University;University of California, Los Angeles;Huawei Technologies Ltd.;Huawei Noah's Ark Lab;University of California, Los Angeles;ByteDance", "aff_domain": "ntu.edu;ucla.edu;huawei.com;huawei.com;ucla.edu;bytedance.com", "position": "Postdoc;PhD student;Researcher;Principal Researcher;Assistant Professor;Research Lead", "bibtex": "@inproceedings{\nhu2022generalizing,\ntitle={Generalizing Few-Shot {NAS} with Gradient Matching},\nauthor={Shoukang Hu and Ruochen Wang and Lanqing HONG and Zhenguo Li and Cho-Jui Hsieh and Jiashi Feng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_jMtny3sMKU}\n}", "github": "", 
"project": "", "reviewers": "dVgX;Y8nY;tbrG;x5Hy", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;3;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "85;77;77;108", "wc_summary_review": "45;89;64;26", "wc_main_review": "201;277;184;334", "wc_review": "331;443;325;468", "wc_reply_reviewers": "29;12;0;0", "wc_reply_authors": "565;772;290;619", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 86.75, 12.695963925594622 ], "wc_summary_review_avg": [ 56.0, 23.313086453749534 ], "wc_main_review_avg": [ 249.0, 60.28681447878964 ], "wc_review_avg": [ 391.75, 64.39477851503179 ], "wc_reply_reviewers_avg": [ 10.25, 11.882234638316145 ], "wc_reply_authors_avg": [ 561.5, 174.17017540325324 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10558207332757804678&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=_jMtny3sMKU", "email": "ntu.edu;ucla.edu;huawei.com;huawei.com;ucla.edu;bytedance.com", "author_num": 6, "aff_unique_index": "0;1;2;2;1;3", "aff_unique_norm": "Nanyang Technological University;University of California, Los Angeles;Huawei;ByteDance", "aff_unique_dep": ";;Huawei Technologies;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.ucla.edu;https://www.huawei.com;https://www.bytedance.com", "aff_unique_abbr": "NTU;UCLA;Huawei;ByteDance", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;2;2;1;2", "aff_country_unique": "Singapore;United States;China" }, { "id": "_kJXRDyaU0X", "title": "What Would the Expert $do(\\cdot)$?: Causal Imitation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We develop algorithms for imitation learning from data that was corrupted by unobserved confounders. Sources of such confounding include (a) persistent perturbations to actions or (b) the expert responding to a part of the state that the learner does not have access to. When a confounder affects multiple timesteps of recorded data, it can manifest as spurious correlations between states and actions that a learner might latch onto, leading to poor policy performance. By utilizing the effect of past states on current states, we are able to break up these spurious correlations, an application of the econometric technique of instrumental variable regression. This insight leads to two novel algorithms, one of a generative-modeling flavor ($\\texttt{DoubIL}$) that can utilize access to a simulator and one of a game-theoretic flavor ($\\texttt{ResiduIL}$) that can be run offline. Both approaches are able to find policies that match the result of a query to an unconfounded expert. 
We find both algorithms compare favorably to non-causal approaches on simulated control problems.", "keywords": "imitation learning;causal inference;reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/bdcf23bf1dd2c2247c233608680746ca31ff687f.zip", "author": "Gokul Swamy;Sanjiban Choudhury;Drew Bagnell;Steven Wu", "authorids": "~Gokul_Swamy1;schoudhury@aurora.tech;~Drew_Bagnell2;~Steven_Wu1", "gender": ";;;", "homepage": "https://gokul.dev/;;https://robotwhisperer.org/;", "dblp": "31/11509;;;", "google_scholar": "Sbpra_AAAAAJ;;7t4jbPQAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Gokul_Swamy1;schoudhury@aurora.tech;~Drew_Bagnell2;~Steven_Wu1", "aff": "Carnegie Mellon University;;Carnegie Mellon University;", "aff_domain": "cmu.edu;;cmu.edu;", "position": "PhD student;;Associate Professor;", "bibtex": "@misc{\nswamy2022what,\ntitle={What Would the Expert \\$do({\\textbackslash}cdot)\\$?: Causal Imitation Learning},\nauthor={Gokul Swamy and Sanjiban Choudhury and Drew Bagnell and Steven Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=_kJXRDyaU0X}\n}", "github": "", "project": "", "reviewers": "Jzke;aRw4;X9N9;A5Fo", "site": "https://openreview.net/forum?id=_kJXRDyaU0X", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;3;4;3", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "52;17;89;60", "wc_summary_review": "75;69;38;38", "wc_main_review": "315;195;337;188", "wc_review": "442;281;464;286", "wc_reply_reviewers": "203;0;35;23", "wc_reply_authors": "500;494;476;255", "reply_reviewers": "2;0;1;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 54.5, 25.656383221335 ], "wc_summary_review_avg": [ 55.0, 17.131841699011815 ], "wc_main_review_avg": [ 258.75, 67.7435421276449 ], "wc_review_avg": [ 368.25, 85.12454111476902 ], "wc_reply_reviewers_avg": [ 65.25, 80.51824327442819 ], "wc_reply_authors_avg": [ 431.25, 102.14052819522719 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AUAl4gXx0zgJ:scholar.google.com/&scioq=What+Would+the+Expert+%24do(%5Ccdot)%24%3F:+Causal+Imitation+Learning&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "The Boltzmann Policy Distribution: Accounting for Systematic Suboptimality in Human Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7128", "id": "_l_QjPGN5ye", "poster": "", "openreview": "https://openreview.net/forum?id=_l_QjPGN5ye", "slides": "https://iclr.cc/virtual/2022/poster/7128", "video": "https://iclr.cc/virtual/2022/poster/7128", "author_site": "Cassidy Laidlaw, Anca Dragan", "tldr": "", "abstract": "Models of human behavior for prediction and collaboration tend to fall 
into two categories: ones that learn from large amounts of data via imitation learning, and ones that assume human behavior to be noisily-optimal for some reward function. The former are very useful, but only when it is possible to gather a lot of human data in the target environment and distribution. The advantage of the latter type, which includes Boltzmann rationality, is the ability to make accurate predictions in new environments without extensive data when humans are actually close to optimal. However, these models fail when humans exhibit systematic suboptimality, i.e. when their deviations from optimal behavior are not independent, but instead consistent over time. Our key insight is that systematic suboptimality can be modeled by predicting policies, which couple action choices over time, instead of trajectories. We introduce the Boltzmann policy distribution (BPD), which serves as a prior over human policies and adapts via Bayesian inference to capture systematic deviations by observing human actions during a single episode. The BPD is difficult to compute and represent because policies lie in a high-dimensional continuous space, but we leverage tools from generative and sequence modeling to enable efficient sampling and inference. We show that the BPD enables prediction of human behavior and human-AI collaboration equally as well as imitation learning-based human models while using far less data.", "keywords": "human model;boltzmann rationality;suboptimality;HRI;human-robot collaboration;generative models;reinforcement learning;deep RL", "primary_area": "", "supplementary_material": "", "author": "Cassidy Laidlaw;Anca Dragan", "authorids": "~Cassidy_Laidlaw1;~Anca_Dragan1", "gender": "M;F", "homepage": "https://cassidylaidlaw.com;http://www.ancadragan.com/", "dblp": "241/5375;", "google_scholar": "DzeJ67UAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Cassidy_Laidlaw1;~Anca_Dragan1", "aff": "University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nlaidlaw2022the,\ntitle={The Boltzmann Policy Distribution: Accounting for Systematic Suboptimality in Human Models},\nauthor={Cassidy Laidlaw and Anca Dragan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_l_QjPGN5ye}\n}", "github": "", "project": "", "reviewers": "Bv5s;9Tye;Cv9m;Hd8J", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;2;4", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "358;248;80;90", "wc_summary_review": "64;101;34;94", "wc_main_review": "382;21;229;661", "wc_review": "804;370;343;845", "wc_reply_reviewers": "0;0;0;199", "wc_reply_authors": "661;39;377;1466", "reply_reviewers": "0;0;0;2", "reply_authors": "1;1;1;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 194.0, 115.78428218026832 ], "wc_summary_review_avg": [ 73.25, 26.58359456506964 ], "wc_main_review_avg": [ 323.25, 233.3263540622876 ], "wc_review_avg": [ 590.5, 234.64281365513838 ], "wc_reply_reviewers_avg": [ 49.75, 86.16952767655164 ], "wc_reply_authors_avg": [ 635.75, 527.4975710844553 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], 
"reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=403926585745142626&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=_l_QjPGN5ye", "email": "berkeley.edu;berkeley.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "_lmjQL6kcG", "title": "Improving the Transferability of Supervised Pretraining with an MLP Projector", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The pretrain-finetune paradigm is a classical pipeline in visual learning. Recent progress on unsupervised pretraining methods showed superior transfer performance to their supervised counterparts. While a few works attempted to explore the underlying mechanisms, the reasons behind the transferability gaps still have not been fully explored. This paper reveals that the multilayer perceptron (MLP) projector is a key factor for the better transferability of unsupervised pretraining. Based on this observation, we attempt to close the transferability gap between supervised pretraining and unsupervised pretraining by adding an MLP projector before the classifier of supervised pretraining. Our analysis indicates that the MLP projector can help retain intra-class variation of visual features, decrease the feature distribution distance between pretraining dataset and evaluation dataset, and reduce feature redundancy for effective adaptation to new tasks. 
Extensive experiments demonstrate that the added MLP projector significantly boosts the transferability of supervised pretraining, \\emph{e.g.,} \\textbf{+7.2\\%} top-1 accuracy on the unseen class generalization task and \\textbf{+5.7\\%} top-1 accuracy on 12-domain classification tasks, making supervised pretraining even better than unsupervised pretraining.", "keywords": "transferability;supervised learning;MLP projector", "primary_area": "", "supplementary_material": "", "author": "Yizhou Wang;SHIXIANG TANG;Feng Zhu;LEI BAI;Rui Zhao;Donglian Qi;Wanli Ouyang", "authorids": "~Yizhou_Wang5;~SHIXIANG_TANG1;~Feng_Zhu1;~LEI_BAI1;~Rui_Zhao6;~Donglian_Qi1;~Wanli_Ouyang1", "gender": "M;M;M;M;M;F;", "homepage": "https://yizhouwang98.github.io/;;http://home.ustc.edu.cn/~zhufengx/;http://leibai.site/;http://zhaorui.xyz/;https://person.zju.edu.cn/0004117;", "dblp": "71/3387-7;260/6757;71/2791-6;119/1223-1;26/2578-1;;", "google_scholar": "CQGaGMAAAAAJ;TJ4ihdkAAAAJ;oO53gjEAAAAJ;https://scholar.google.com.au/citations?user=sakOO04AAAAJ;1c9oQNMAAAAJ;;", "orcid": "0009-0009-6819-5872;;;0000-0003-3378-7201;;;", "linkedin": "yizhouwang98/;shixiang-tang-80441a1a3/;;lei-bai-641370153/;;;", "or_profile": "~Yizhou_Wang5;~SHIXIANG_TANG1;~Feng_Zhu1;~LEI_BAI1;~Rui_Zhao6;~Donglian_Qi1;~Wanli_Ouyang1", "aff": "Zhejiang University;University of Sydney;SenseTime Group LTD;University of Sydney;SenseTime Research;Zhejiang University;", "aff_domain": "zju.edu.cn;sydney.edu.au;sensetime.com;sydney.edu.au;sensetime.com;zju.edu.cn;", "position": "MS student;PhD student;Researcher;Postdoc;Researcher;Full Professor;", "bibtex": "@misc{\nwang2022improving,\ntitle={Improving the Transferability of Supervised Pretraining with an {MLP} Projector},\nauthor={Yizhou Wang and SHIXIANG TANG and Feng Zhu and LEI BAI and Rui Zhao and Donglian Qi and Wanli Ouyang},\nyear={2022},\nurl={https://openreview.net/forum?id=_lmjQL6kcG}\n}", "github": "", "project": "", "reviewers": "F4Qx;qLZk;4jXb;HMuL", "site": "https://openreview.net/forum?id=_lmjQL6kcG", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;4;4", "correctness": "3;2;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "55;143;50;85", "wc_summary_review": "32;27;45;32", "wc_main_review": "176;126;411;97", "wc_review": "263;296;506;214", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 83.25, 37.002533697032156 ], "wc_summary_review_avg": [ 34.0, 6.670832032063167 ], "wc_main_review_avg": [ 202.5, 123.64970683345756 ], "wc_review_avg": [ 319.75, 111.41897280086548 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zmOz48GPmdsJ:scholar.google.com/&scioq=Improving+the+Transferability+of+Supervised+Pretraining+with+an+MLP+Projector&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;1;2;1;3;0", "aff_unique_norm": "Zhejiang University;University of 
Sydney;SenseTime Group;SenseTime", "aff_unique_dep": ";;;SenseTime Research", "aff_unique_url": "https://www.zju.edu.cn;https://www.sydney.edu.au;https://www.sensetime.com;https://www.sensetime.com", "aff_unique_abbr": "ZJU;USYD;SenseTime;SenseTime", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;0", "aff_country_unique": "China;Australia" }, { "id": "_qc3iqcq-ps", "title": "On the Evolution of Neuron Communities in a Deep Learning Architecture", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning techniques are increasingly being adopted for classification tasks over the past decade, yet explaining how deep learning architectures can achieve state-of-the-art performance is still an elusive goal. While all the training information is embedded deeply in a trained model, we still do not understand much about its performance by only analyzing the model. This paper examines the neuron activation patterns of deep learning-based classification models and explores whether the models' performances can be explained through neurons' activation behavior. We propose two approaches: one that models neurons' activation behavior as a graph and examines whether the neurons form meaningful communities, and the other examines the predictability of neurons' behavior using entropy. Our comprehensive experimental study reveals that both the community quality and entropy can provide new insights into the deep learning models' performances, thus paves a novel way of explaining deep learning models directly from the neurons' activation pattern\n.", "keywords": "explainable ai;deep learning", "primary_area": "", "supplementary_material": "/attachment/4c5cdb4e6611d37de77dfd2323b80601717f437c.zip", "author": "Sakib Mostafa;Debajyoti Mondal", "authorids": "~Sakib_Mostafa1;~Debajyoti_Mondal1", "gender": "M;", "homepage": ";https://www.cs.usask.ca/faculty/dmondal/", "dblp": ";https://dblp.uni-trier.de/pid/90/8236", "google_scholar": "iDKl2-AAAAAJ;DYu56mwAAAAJ", "orcid": "my-orcid?orcid=0000-0002-4777-7832;", "linkedin": "sakib-mostafa-414151ba/;", "or_profile": "~Sakib_Mostafa1;~Debajyoti_Mondal1", "aff": "University of Saskatchewan;University of Saskatchewan", "aff_domain": "usask.ca;usask.ca", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nmostafa2022on,\ntitle={On the Evolution of Neuron Communities in a Deep Learning Architecture},\nauthor={Sakib Mostafa and Debajyoti Mondal},\nyear={2022},\nurl={https://openreview.net/forum?id=_qc3iqcq-ps}\n}", "github": "", "project": "", "reviewers": "RKXh;8T7r;Gvju;Xmnm", "site": "https://openreview.net/forum?id=_qc3iqcq-ps", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "3;4;3;4", "correctness": "2;4;3;3", "technical_novelty": "1;1;3;3", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "36;77;73;60", "wc_summary_review": "48;34;28;29", "wc_main_review": "305;305;588;197", "wc_review": "389;416;689;286", "wc_reply_reviewers": "0;0;85;0", "wc_reply_authors": "368;324;448;109", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 1.0 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 61.5, 16.00781059358212 ], "wc_summary_review_avg": [ 34.75, 7.980444849756184 ], "wc_main_review_avg": [ 348.75, 144.99719824879375 ], "wc_review_avg": [ 445.0, 148.9916105020682 ], 
"wc_reply_reviewers_avg": [ 21.25, 36.80607966083864 ], "wc_reply_authors_avg": [ 312.25, 125.48381369722551 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3665083330689157, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13192223421127204577&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Saskatchewan", "aff_unique_dep": "", "aff_unique_url": "https://www.usask.ca", "aff_unique_abbr": "U of S", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "_qjEae4op-", "title": "MoFE: Mixture of Factual Experts for Controlling Hallucinations in Abstractive Summarization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural abstractive summarization models are susceptible to generating factually inconsistent content, a phenomenon known as hallucination. This limits the usability and adoption of these systems in real-world applications. To reduce the presence of hallucination, we propose the Mixture of Factual Experts (MoFE) model, which combines multiple summarization experts that each target a specific type of error. We train our experts using reinforcement learning (RL) to minimize the error defined by two factual consistency metrics: entity overlap and dependency arc entailment. We construct MoFE by combining the experts using two ensembling strategies (weights and logits) and evaluate them on two summarization datasets (XSUM and CNN/DM). Our experiments on BART models show that the MoFE improves performance according to both entity overlap and dependency arc entailment, without a significant performance drop on standard ROUGE metrics. 
The performance improvement also transfers to unseen factual consistency metrics, such as a question-answering-based factuality evaluation metric and BERTScore precision with respect to the source document.", "keywords": "abstractive summarization;content hallucinations", "primary_area": "", "supplementary_material": "", "author": "Prafulla Kumar Choubey;Jesse Vig;Wenhao Liu;Nazneen Rajani", "authorids": "~Prafulla_Kumar_Choubey2;~Jesse_Vig1;~Wenhao_Liu1;~Nazneen_Rajani1", "gender": "M;;;", "homepage": ";https://jessevig.com;;", "dblp": "203/8260;24/2880;;", "google_scholar": "k7aMOCsAAAAJ;OWwZXTwAAAAJ;;", "orcid": ";;;", "linkedin": ";jesse-vig;;", "or_profile": "~Prafulla_Kumar_Choubey2;~Jesse_Vig1;~Wenhao_Liu1;~Nazneen_Rajani1", "aff": "SalesForce.com;Salesforce Research;;", "aff_domain": "salesforce.com;salesforce.com;;", "position": "Researcher;Research Scientist;;", "bibtex": "@misc{\nchoubey2022mofe,\ntitle={Mo{FE}: Mixture of Factual Experts for Controlling Hallucinations in Abstractive Summarization},\nauthor={Prafulla Kumar Choubey and Jesse Vig and Wenhao Liu and Nazneen Rajani},\nyear={2022},\nurl={https://openreview.net/forum?id=_qjEae4op-}\n}", "github": "", "project": "", "reviewers": "hLri;15Sz;ui9y;RuGC", "site": "https://openreview.net/forum?id=_qjEae4op-", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;0;2;3", "wc_summary_paper": "145;96;74;101", "wc_summary_review": "35;34;13;26", "wc_main_review": "406;478;121;236", "wc_review": "586;608;208;363", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "435;0;108;264", "reply_reviewers": "0;0;0;0", "reply_authors": "1;0;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 104.0, 25.758493744782516 ], "wc_summary_review_avg": [ 27.0, 8.803408430829505 ], "wc_main_review_avg": [ 310.25, 140.21122458633616 ], "wc_review_avg": [ 441.25, 165.29273275011215 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 201.75, 164.14380128411796 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9129312184830076917&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Salesforce", "aff_unique_dep": "", "aff_unique_url": "https://www.salesforce.com", "aff_unique_abbr": "Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Path Integral Sampler: A Stochastic Control Approach For Sampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7195", "id": "_uCb2ynRu7Y", "poster": "", "openreview": "https://openreview.net/forum?id=_uCb2ynRu7Y", "slides": "https://iclr.cc/virtual/2022/poster/7195", "video": "https://iclr.cc/virtual/2022/poster/7195", "author_site": "Qinsheng Zhang, Yongxin Chen", "tldr": "", "abstract": "We present Path Integral Sampler~(PIS), a novel algorithm to draw samples from unnormalized probability density functions. 
The PIS is built on the Schr\\\"odinger bridge problem which aims to recover the most likely evolution of a diffusion process given its initial distribution and terminal distribution. The PIS draws samples from the initial distribution and then propagates the samples through the Schr\\\"odinger bridge to reach the terminal distribution. Applying the Girsanov theorem, with a simple prior diffusion, we formulate the PIS as a stochastic optimal control problem whose running cost is the control energy and whose terminal cost is chosen according to the target distribution. By modeling the control as a neural network, we establish a sampling algorithm that can be trained end-to-end. We provide theoretical justification of the sampling quality of PIS in terms of Wasserstein distance when sub-optimal control is used. Moreover, path integral theory is used to compute importance weights of the samples to compensate for the bias induced by the sub-optimality of the controller and the time-discretization. We experimentally demonstrate the advantages of PIS compared with other state-of-the-art sampling methods on a variety of tasks.", "keywords": "Sampling;Path Integral;Stochastic Differential Equation;MCMC", "primary_area": "", "supplementary_material": "/attachment/484a2b361b234755970542e97f1eff4bb74bec3a.zip", "author": "Qinsheng Zhang;Yongxin Chen", "authorids": "~Qinsheng_Zhang1;~Yongxin_Chen1", "gender": "M;M", "homepage": "https://qsh-zh.github.io/;https://yongxin.ae.gatech.edu/", "dblp": ";", "google_scholar": ";X8BYiV4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Qinsheng_Zhang1;~Yongxin_Chen1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2022path,\ntitle={Path Integral Sampler: A Stochastic Control Approach For Sampling},\nauthor={Qinsheng Zhang and Yongxin Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_uCb2ynRu7Y}\n}", "github": "", "project": "", "reviewers": "1wV8;d7Mk;ZcgD;dEAt", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;5;4;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "80;94;78;38", "wc_summary_review": "26;106;95;29", "wc_main_review": "308;1777;273;108", "wc_review": "414;1977;446;175", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1510;2039;1172;246", "reply_reviewers": "0;0;0;0", "reply_authors": "3;4;3;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.5, 20.850659461993043 ], "wc_summary_review_avg": [ 64.0, 36.72192805395708 ], "wc_main_review_avg": [ 616.5, 674.2582962040586 ], "wc_review_avg": [ 753.0, 714.3930990708127 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1241.75, 652.6769396110146 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.058025885318565944, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17129588743049853976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": 
"https://openreview.net/pdf?id=_uCb2ynRu7Y", "email": "gatech.edu;gatech.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "_uOnt-62ll", "title": "Scaling Laws for the Few-Shot Adaptation of Pre-trained Image Classifiers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Empirical science of neural scaling laws is a rapidly growing area of significant importance to the future of machine learning, particularly in the light of recent breakthroughs achieved by large-scale pre-trained models such as GPT-3, CLIP and DALL-e. Accurately predicting the neural network performance with increasing resources such as data, compute and model size provides a more comprehensive evaluation of different approaches across multiple scales, as opposed to traditional point-wise comparisons of fixed-size models on fixed-size benchmarks, and, most importantly, allows for focus on the best-scaling, and thus most promising in the future, approaches. In this work, we consider a challenging problem of few-shot learning in image classification, especially when the target data distribution in the few-shot phase is different from the source, training, data distribution, in a sense that it includes new image classes not encountered during training. Our current main goal is to investigate how the amount of pre-training data affects the few-shot generalization performance of standard image classifiers. Our key observations are that (1) such performance improvements are well-approximated by power laws (linear log-log plots) as the training set size increases, (2) this applies to both cases of target data coming from either the same or from a different domain (i.e., new classes) as the training data, and (3) few-shot performance on new classes converges at a faster rate than the standard classification performance on previously seen classes. 
Our findings shed new light on the relationship between scale and generalization.", "keywords": "scaling;scale;law;laws;few-shot;one-shot;out-of-distribution;ood;generalization;image;vision", "primary_area": "", "supplementary_material": "", "author": "Gabriele Prato;Simon Guiroy;Ethan Caballero;Irina Rish;Sarath Chandar", "authorids": "~Gabriele_Prato1;~Simon_Guiroy1;~Ethan_Caballero1;~Irina_Rish1;~Sarath_Chandar1", "gender": ";Not Specified;;F;M", "homepage": ";https://simonguiroy.github.io/;http://ethancaballero.github.io/;http://irina-rish.com;http://sarathchandar.in/", "dblp": ";;;;45/8542", "google_scholar": ";;KvLJAf0AAAAJ;Avse5gIAAAAJ;https://scholar.google.co.in/citations?user=yxWtZLAAAAAJ", "orcid": ";;;;", "linkedin": ";;;irina-rish-8b2162;", "or_profile": "~Gabriele_Prato1;~Simon_Guiroy1;~Ethan_Caballero1;~Irina_Rish1;~Sarath_Chandar1", "aff": ";Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Mila;University of Montreal;\u00c9cole Polytechnique de Montr\u00e9al", "aff_domain": ";mila.umontreal.ca;mila.quebec;mila.quebec;polymtl.ca", "position": ";PhD student;PhD student;Professor;Assistant Professor", "bibtex": "@misc{\nprato2022scaling,\ntitle={Scaling Laws for the Few-Shot Adaptation of Pre-trained Image Classifiers},\nauthor={Gabriele Prato and Simon Guiroy and Ethan Caballero and Irina Rish and Sarath Chandar},\nyear={2022},\nurl={https://openreview.net/forum?id=_uOnt-62ll}\n}", "github": "", "project": "", "reviewers": "cpeP;b4Sf;GtXw;79jf", "site": "https://openreview.net/forum?id=_uOnt-62ll", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "2;3;2;3", "technical_novelty": "1;2;2;1", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "34;131;71;292", "wc_summary_review": "359;90;71;184", "wc_main_review": "673;898;606;1014", "wc_review": "1066;1119;748;1490", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 132.0, 98.64836541980814 ], "wc_summary_review_avg": [ 176.0, 113.98903456034708 ], "wc_main_review_avg": [ 797.75, 165.1853126037542 ], "wc_review_avg": [ 1105.75, 263.3385416151612 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11510385341951771284&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Montreal;Mila;\u00c9cole Polytechnique de Montr\u00e9al", "aff_unique_dep": "Montreal Institute for Learning Algorithms;Quebec Artificial Intelligence Institute;", "aff_unique_url": "https://www.umontreal.ca;https://mila.quebec;https://www.polymtl.ca", "aff_unique_abbr": "UM;Mila;Polytechnique Montr\u00e9al", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Montreal;;Montr\u00e9al", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Geometric and Physical Quantities improve E(3) Equivariant Message Passing", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6225", "id": 
"_xwr8gOBeV1", "poster": "", "openreview": "https://openreview.net/forum?id=_xwr8gOBeV1", "slides": "https://iclr.cc/virtual/2022/poster/6225", "video": "https://iclr.cc/virtual/2022/poster/6225", "author_site": "Johannes Brandstetter, Rob Hesselink, Elise van der Pol, Erik Bekkers, Max Welling", "tldr": "", "abstract": "Including covariant information, such as position, force, velocity or spin is important in many tasks in computational physics and chemistry. We introduce Steerable E($3$) Equivariant Graph Neural Networks (SEGNNs) that generalise equivariant graph networks, such that node and edge attributes are not restricted to invariant scalars, but can contain covariant information, such as vectors or tensors. Our model, composed of steerable MLPs, is able to incorporate geometric and physical information in both the message and update functions.\nThrough the definition of steerable node attributes, the MLPs provide a new class of activation functions for general use with steerable feature fields. We discuss ours and related work through the lens of equivariant non-linear convolutions, which further allows us to pin-point the successful components of SEGNNs: non-linear message aggregation improves upon classic linear (steerable) point convolutions; steerable messages improve upon recent equivariant graph networks that send invariant messages. We demonstrate the effectiveness of our method on several tasks in computational physics and chemistry and provide extensive ablation studies.", "keywords": "equivariant graph neural networks;steerable message passing;non-linear convolutions;molecular modeling;covariant information", "primary_area": "", "supplementary_material": "/attachment/6d3c1183d7b9d9a521fab1347ddcf929b55234db.zip", "author": "Johannes Brandstetter;Rob Hesselink;Elise van der Pol;Erik J Bekkers;Max Welling", "authorids": "~Johannes_Brandstetter1;~Rob_Hesselink1;~Elise_van_der_Pol1;~Erik_J_Bekkers1;~Max_Welling1", "gender": "M;F;;M;M", "homepage": ";http://elisevanderpol.nl;https://erikbekkers.bitbucket.io/;https://staff.fnwi.uva.nl/m.welling/;https://amlab.science.uva.nl/", "dblp": "251/8691;186/8470.html;43/5596;16/2286;", "google_scholar": "KiRvOHcAAAAJ;https://scholar.google.nl/citations?user=564o-vIAAAAJ;https://scholar.google.nl/citations?user=yeWrfR4AAAAJ;https://scholar.google.nl/citations?user=8200InoAAAAJ;", "orcid": ";;;0000-0003-1484-2121;", "linkedin": ";;;;", "or_profile": "~Johannes_Brandstetter1;~Elise_van_der_Pol1;~Erik_J_Bekkers1;~Max_Welling1;~Rob_Daniel_Hesselink1", "aff": "Microsoft;University of Amsterdam;University of Amsterdam;University of Amsterdam;University of Amsterdam", "aff_domain": "microsoft.com;uva.nl;uva.nl;uva.nl;uva.nl", "position": "Researcher;PhD student;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nbrandstetter2022geometric,\ntitle={Geometric and Physical Quantities improve E(3) Equivariant Message Passing},\nauthor={Johannes Brandstetter and Rob Hesselink and Elise van der Pol and Erik J Bekkers and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=_xwr8gOBeV1}\n}", "github": "", "project": "", "reviewers": "vbV1;k14d;zF5H;SAGC;aA3f;Mqkb", "pdf_size": 0, "recommendation": "6;6;6;6;8;10", "confidence": "3;3;3;4;4;5", "correctness": "3;3;4;4;4;4", "technical_novelty": "3;3;4;2;2;4", "empirical_novelty": "3;3;4;2;2;4", "wc_summary_paper": "104;122;82;80;48;129", "wc_summary_review": "33;54;24;50;160;67", "wc_main_review": 
"413;484;323;416;824;679", "wc_review": "550;660;429;546;1032;875", "wc_reply_reviewers": "22;0;140;51;55;0", "wc_reply_authors": "643;795;1571;774;566;818", "reply_reviewers": "1;0;1;1;1;0", "reply_authors": "1;1;3;1;1;1", "recommendation_avg": [ 7.0, 1.5275252316519468 ], "confidence_avg": [ 3.6666666666666665, 0.7453559924999298 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 94.16666666666667, 27.58270392031129 ], "wc_summary_review_avg": [ 64.66666666666667, 44.86894496444308 ], "wc_main_review_avg": [ 523.1666666666666, 173.20836841471862 ], "wc_review_avg": [ 682.0, 208.23144175011933 ], "wc_reply_reviewers_avg": [ 44.666666666666664, 47.85278350199588 ], "wc_reply_authors_avg": [ 861.1666666666666, 329.7531484139142 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.74535599249993 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8783100656536799, "corr_recommendation_correctness": 0.46291004988627577, "gs_citation": 268, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10039670233060190176&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=_xwr8gOBeV1", "email": "microsoft.com;uva.nl;uva.nl;uva.nl;uva.nl", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Microsoft;University of Amsterdam", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.uva.nl", "aff_unique_abbr": "Microsoft;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Netherlands" }, { "id": "_xxbJ7oSJXX", "title": "Offline Reinforcement Learning with Resource Constrained Online Deployment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning is used to train policies in scenarios where real-time access to the environment is expensive or impossible.\nAs a natural consequence of these harsh conditions, an agent may lack the resources to fully observe the online environment before taking an action. We dub this situation the resource-constrained setting. This leads to situations where the offline dataset (available for training) can contain fully processed features (using powerful language models, image models, complex sensors, etc.) which are not available when actions are actually taken online.\nThis disconnect leads to an interesting and unexplored problem in offline RL: Is it possible to use a richly processed offline dataset to train a policy which has access to fewer features in the online environment? \nIn this work, we introduce and formalize this novel resource-constrained problem setting. We highlight the performance gap between policies trained using the full offline dataset and policies trained using limited features. \nWe address this performance gap with a policy transfer algorithm which first trains a teacher agent using the offline dataset where features are fully available, and then transfers this knowledge to a student agent that only uses the resource-constrained features. To better capture the challenge of this setting, we propose a data collection procedure: Resource Constrained-Datasets for RL (RC-D4RL). 
We evaluate our transfer algorithm on RC-D4RL and the popular D4RL benchmarks and observe consistent improvement over the baseline (TD3+BC without transfer).", "keywords": "Offline Reinforcement Learning;Reinforcement Learning;Transfer Learning;Knowledge Transfer;Resource Constraints", "primary_area": "", "supplementary_material": "", "author": "Jayanth Reddy Regatti;Aniket Anand Deshmukh;Young Hun Jung;Frank Cheng;Abhishek Gupta;Urun Dogan", "authorids": "~Jayanth_Reddy_Regatti1;~Aniket_Anand_Deshmukh1;~Young_Hun_Jung2;~Frank_Cheng1;~Abhishek_Gupta3;~Urun_Dogan1", "gender": "M;M;M;;M;M", "homepage": ";http://www-personal.umich.edu/~aniketde/;;https://chengfulun.github.io/;;", "dblp": "248/5721;;;;;", "google_scholar": "zZlhJWUAAAAJ;a4cD32QAAAAJ;ajqlbHUAAAAJ;;pha24HQAAAAJ;1ETBduMAAAAJ", "orcid": ";;;;;", "linkedin": "jayanth-reddy-regatti-b9019550/;aniket2305/;young-hun-jung/;;;", "or_profile": "~Jayanth_Reddy_Regatti1;~Aniket_Anand_Deshmukh1;~Young_Hun_Jung2;~Frank_Cheng1;~Abhishek_Gupta3;~Urun_Dogan1", "aff": "Ohio State University;Microsoft;;Microsoft;The Ohio State University;Microsoft", "aff_domain": "osu.edu;microsoft.com;;microsoft.com;osu.edu;microsoft.com", "position": "PhD student;Data & Applied Scientist;;Applied Scientist;Assistant Professor;Researcher", "bibtex": "@misc{\nregatti2022offline,\ntitle={Offline Reinforcement Learning with Resource Constrained Online Deployment},\nauthor={Jayanth Reddy Regatti and Aniket Anand Deshmukh and Young Hun Jung and Frank Cheng and Abhishek Gupta and Urun Dogan},\nyear={2022},\nurl={https://openreview.net/forum?id=_xxbJ7oSJXX}\n}", "github": "", "project": "", "reviewers": "oGRd;QjjH;CV2Z;pgM4", "site": "https://openreview.net/forum?id=_xxbJ7oSJXX", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;4;4;3", "correctness": "4;3;4;4", "technical_novelty": "1;4;2;3", "empirical_novelty": "3;4;3;2", "wc_summary_paper": "103;46;153;82", "wc_summary_review": "23;70;78;78", "wc_main_review": "319;664;523;330", "wc_review": "445;780;754;490", "wc_reply_reviewers": "438;302;276;0", "wc_reply_authors": "862;2310;1349;798", "reply_reviewers": "1;2;2;0", "reply_authors": "3;6;4;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 96.0, 38.71046370169182 ], "wc_summary_review_avg": [ 62.25, 22.895141405983935 ], "wc_main_review_avg": [ 459.0, 143.49390230947097 ], "wc_review_avg": [ 617.25, 150.8730840806272 ], "wc_reply_reviewers_avg": [ 254.0, 159.02829936838285 ], "wc_reply_authors_avg": [ 1329.75, 604.7331539613155 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 3.75, 1.479019945774904 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ueHTVPxNcUoJ:scholar.google.com/&scioq=Offline+Reinforcement+Learning+with+Resource+Constrained+Online+Deployment&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Ohio State University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.osu.edu;https://www.microsoft.com", "aff_unique_abbr": "OSU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "_ysluXvD1M", "title": "Equal Experience in Recommender Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "We explore the fairness issue that arises in recommender systems. Biased data due to inherent stereotypes of particular groups (e.g., male students' average rating on mathematics is often higher than that on humanities, and vice versa for females) may yield a limited scope of suggested items to a certain group of users. Our main contribution lies in the introduction of a novel fairness notion (that we call equal experience), which can serve to regulate such unfairness in the presence of biased data. The notion captures the degree of the equal experience of item recommendations across distinct groups. We propose an optimization framework that incorporates the fairness notion as a regularization term, as well as introduce computationally-efficient algorithms that solve the optimization. Experiments on synthetic and benchmark real datasets demonstrate that the proposed framework can indeed mitigate such unfairness while exhibiting a minor degradation of recommendation accuracy.", "keywords": "Fairness;Recommender systems", "primary_area": "", "supplementary_material": "/attachment/03af87f5946fb8ca2edcea10cc8e09a198c0c66e.zip", "author": "Jaewoong Cho;Moonseok Choi;Changho Suh", "authorids": "~Jaewoong_Cho1;~Moonseok_Choi1;~Changho_Suh1", "gender": ";M;M", "homepage": "https://sites.google.com/view/jaewoongcho;;https://csuh.kaist.ac.kr", "dblp": "184/3848;331/2083;75/1420", "google_scholar": ";i-pOb1IAAAAJ;https://scholar.google.com.tw/citations?user=B1guGw8AAAAJ", "orcid": ";;0000-0002-3101-4291", "linkedin": ";moonseok-choi/;changho-suh-584aa732/?originalSubdomain=kr", "or_profile": "~Jaewoong_Cho1;~Moonseok_Choi1;~Changho_Suh1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\ncho2022equal,\ntitle={Equal Experience in Recommender Systems},\nauthor={Jaewoong Cho and Moonseok Choi and Changho Suh},\nyear={2022},\nurl={https://openreview.net/forum?id=_ysluXvD1M}\n}", "github": "", "project": "", "reviewers": "dddS;zEp7;TDXy;85io", "site": "https://openreview.net/forum?id=_ysluXvD1M", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;3;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "81;53;39;62", "wc_summary_review": "36;98;60;58", "wc_main_review": "133;557;132;91", "wc_review": "250;708;231;211", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "289;696;186;172", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 58.75, 15.237699957670777 ], "wc_summary_review_avg": [ 63.0, 22.293496809607955 ], "wc_main_review_avg": [ 228.25, 190.5588819761493 ], "wc_review_avg": [ 350.0, 207.15091117347276 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 335.75, 212.84075620049842 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": -0.40824829046386296, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:y_kiOJN8g7QJ:scholar.google.com/&scioq=Equal+Experience+in+Recommender+Systems&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "_zL5mZ95FV6", "title": "BLUnet: Arithmetic-free Inference with Bit-serialised Table Lookup Operation for Efficient Deep Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks (DNNs) are both computation and memory intensive. Large amounts of costly arithmetic multiply-accumulate (MAC) operations and data movement hinder its application to edge AI where DNN models are required to run on energy-constrained platforms. Table lookup operations have potential advantages over traditional arithmetic multiplication and addition operations in terms of both energy consumption and latency in hardware implementations for DNN design. Moreover, the integration of weights into the table lookup operation eliminates costly weight movements. However, the challenge of using table lookups is in scaling. In particular, the size and lookup times of tables grow exponentially with the fan-in of the tables. In this paper, we propose BLUnet, a table lookup-based DNN model with bit-serialized input to overcome this challenge. Using binarized time series inputs, we successfully solve the fan-in issue of lookup tables. BLUnet not only achieves high efficiency but also the same accuracies as MAC-based neural networks. We experimented with popular models in computer vision applications to confirm this. Our experimental results show that compared to MAC-based baseline designs as well as the state-of-the-art solutions, BLUnet achieves orders of magnitude improvement in energy efficiencies. 
", "keywords": "Efficient Inference;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Tao Luo;Zhehui Wang;Daniel Gerlinghoff;Rick Siow Mong Goh;Weng-Fai Wong", "authorids": "~Tao_Luo2;~Zhehui_Wang2;~Daniel_Gerlinghoff1;~Rick_Siow_Mong_Goh1;~Weng-Fai_Wong1", "gender": "M;;;M;", "homepage": ";;https://sites.google.com/view/rickgoh/home;https://www.comp.nus.edu.sg/~wongwf/;http://zhehui-wang.github.io", "dblp": "43/4720-14;;https://dblp.uni-trier.de/pers/g/Goh:Rick_Siow_Mong;37/1143;", "google_scholar": "d4KZI8MAAAAJ;;https://scholar.google.com.sg/citations?user=fBsBJjoAAAAJ;https://scholar.google.com.tw/citations?user=SL1cTsIAAAAJ;", "orcid": "0000-0002-3415-3676;0000-0001-7332-1663;0000-0001-9116-1595;0000-0002-4281-2053;", "linkedin": ";;rickgoh/;;", "or_profile": "~Tao_Luo2;~Daniel_Gerlinghoff1;~Rick_Siow_Mong_Goh1;~Weng-Fai_Wong1;~zhehui_wang1", "aff": "Institute of High Performance Computing, Singapore, A*STAR;Institute of High Performance Computing, Singapore, A*STAR;Institute of High Performance Computing, Singapore, A*STAR;National University of Singapore;Institute of High Performance Computing, Singapore, A*STAR", "aff_domain": "ihpc.a-star.edu.sg;ihpc.a-star.edu.sg;ihpc.a-star.edu.sg;nus.edu.sg;ihpc.a-star.edu.sg", "position": "Researcher;Researcher;Director;Associate Professor;Researcher", "bibtex": "@misc{\nluo2022blunet,\ntitle={{BLU}net: Arithmetic-free Inference with Bit-serialised Table Lookup Operation for Efficient Deep Neural Networks},\nauthor={Tao Luo and Zhehui Wang and Daniel Gerlinghoff and Rick Siow Mong Goh and Weng-Fai Wong},\nyear={2022},\nurl={https://openreview.net/forum?id=_zL5mZ95FV6}\n}", "github": "", "project": "", "reviewers": "Hkok;4boW;4pUJ;sonT", "site": "https://openreview.net/forum?id=_zL5mZ95FV6", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;3;5;4", "correctness": "2;2;2;2", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;1;0", "wc_summary_paper": "86;77;38;123", "wc_summary_review": "82;42;36;72", "wc_main_review": "325;657;157;502", "wc_review": "493;776;231;697", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 81.0, 30.224162519414826 ], "wc_summary_review_avg": [ 58.0, 19.44222209522358 ], "wc_main_review_avg": [ 410.25, 187.55449208163478 ], "wc_review_avg": [ 549.25, 210.7680893778752 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8184024530013013554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Institute of High Performance Computing;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "https://www.ihpc.a-star.edu.sg;https://www.nus.edu.sg", "aff_unique_abbr": "IHPC;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "id": "a0SRWViFYW", "title": "Stochastic Projective Splitting: Solving Saddle-Point 
Problems with Multiple Regularizers", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a new, stochastic variant of the projective splitting (PS) family of algorithms for monotone inclusion problems. It can solve min-max and noncooperative game formulations arising in applications such as robust ML without the convergence issues associated with gradient descent-ascent, the current de facto standard approach in ML applications. Our proposal is the first version of PS able to use stochastic gradient oracles. It can solve min-max games while handling multiple constraints and nonsmooth regularizers via projection and proximal operators. Unlike other stochastic splitting methods that can solve such problems, our method does not rely on a product-space reformulation of the original problem. We prove almost-sure convergence of the iterates to the solution and a convergence rate for the expected residual. By working with monotone inclusions rather than variational inequalities, our analysis avoids the drawbacks of measuring convergence through the restricted gap function. We close with numerical experiments on a distributionally robust sparse logistic regression problem.", "keywords": "convex optimization;min-max games;saddle-point problems;first-order stochastic methods;proximal methods;operator splitting", "primary_area": "", "supplementary_material": "/attachment/7360fef113d991247626b0761f946907be1d134e.zip", "author": "Patrick R. Johnstone;Jonathan Eckstein;Thomas Flynn;Shinjae Yoo", "authorids": "~Patrick_R._Johnstone1;~Jonathan_Eckstein1;~Thomas_Flynn1;~Shinjae_Yoo1", "gender": ";M;M;M", "homepage": "https://sites.google.com/site/proycejohnstone/home?authuser=0;http://eckstein.rutgers.edu;https://thomasflynn.org;", "dblp": ";;63/4858;69/1062", "google_scholar": ";vCYxzygAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-6871-2691;0000-0003-4083-7086;", "linkedin": ";;;", "or_profile": "~Patrick_R._Johnstone1;~Jonathan_Eckstein1;~Thomas_Flynn1;~Shinjae_Yoo1", "aff": "Brookhaven National Lab;Rutgers University;Brookhaven National Laboratory;Brookhaven National Lab", "aff_domain": "bnl.gov;rutgers.edu;bnl.gov;bnl.gov", "position": "Postdoc;Full Professor;Researcher;Scientist", "bibtex": "@misc{\njohnstone2022stochastic,\ntitle={Stochastic Projective Splitting: Solving Saddle-Point Problems with Multiple Regularizers},\nauthor={Patrick R. 
Johnstone and Jonathan Eckstein and Thomas Flynn and Shinjae Yoo},\nyear={2022},\nurl={https://openreview.net/forum?id=a0SRWViFYW}\n}", "github": "", "project": "", "reviewers": "vqpx;LTCL;H9EN", "site": "https://openreview.net/forum?id=a0SRWViFYW", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "4;4;3", "technical_novelty": "3;2;4", "empirical_novelty": "2;3;4", "wc_summary_paper": "68;87;56", "wc_summary_review": "104;39;49", "wc_main_review": "934;308;544", "wc_review": "1106;434;649", "wc_reply_reviewers": "905;76;35", "wc_reply_authors": "3389;684;759", "reply_reviewers": "2;1;1", "reply_authors": "7;3;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 70.33333333333333, 12.762793146051099 ], "wc_summary_review_avg": [ 64.0, 28.577380332470412 ], "wc_main_review_avg": [ 595.3333333333334, 258.1283057361634 ], "wc_review_avg": [ 729.6666666666666, 280.2098420033735 ], "wc_reply_reviewers_avg": [ 338.6666666666667, 400.80779544426133 ], "wc_reply_authors_avg": [ 1610.6666666666667, 1257.8442758236101 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.6666666666666665, 2.494438257849294 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8213211010442516161&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Brookhaven National Laboratory;Rutgers University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bnl.gov;https://www.rutgers.edu", "aff_unique_abbr": "BNL;Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "a1m8Jba-N6l", "title": "$k$-Mixup Regularization for Deep Learning via Optimal Transport", "track": "main", "status": "Reject", "tldr": "", "abstract": "Mixup is a popular regularization technique for training deep neural networks that can improve generalization and increase adversarial robustness. It perturbs input training data in the direction of other randomly-chosen instances in the training set. To better leverage the structure of the data, we extend mixup to $k$-mixup by perturbing $k$-batches of training points in the direction of other $k$-batches using displacement interpolation, i.e. interpolation under the Wasserstein metric. We demonstrate theoretically and in simulations that $k$-mixup preserves cluster and manifold structures, and we extend theory studying the efficacy of standard mixup to the $k$-mixup case. 
Our empirical results show that training with $k$-mixup further improves generalization and robustness across several network architectures and benchmark datasets of differing modalities.", "keywords": "Neural networks;Classification;Data augmentation;Optimal Transport", "primary_area": "", "supplementary_material": "", "author": "Kristjan Greenewald;Anming Gu;Mikhail Yurochkin;Justin Solomon;Edward Chien", "authorids": "~Kristjan_Greenewald1;~Anming_Gu1;~Mikhail_Yurochkin1;~Justin_Solomon1;~Edward_Chien1", "gender": ";M;M;M;M", "homepage": "https://researcher.watson.ibm.com/researcher/view.php?person=ibm-Kristjan.H.Greenewald;https://anminggu.github.io/;https://moonfolk.github.io/;http://people.csail.mit.edu/jsolomon/;https://cs-people.bu.edu/edchien/", "dblp": "146/0563;;191/6719;80/5094;182/9382", "google_scholar": "L3zNUG4AAAAJ;https://scholar.google.com/citations?hl=en;QjBF9sUAAAAJ;pImSVwoAAAAJ;YZ1HFYIAAAAJ", "orcid": ";;;0000-0002-7701-7586;", "linkedin": ";;mikhail-yurochkin-a45659114/;justin-solomon-8a587914/;", "or_profile": "~Kristjan_Greenewald1;~Anming_Gu1;~Mikhail_Yurochkin1;~Justin_Solomon1;~Edward_Chien1", "aff": "MIT-IBM Watson AI Lab, IBM Research;Boston University;IBM Research;Massachusetts Institute of Technology;Boston University", "aff_domain": "ibm.com;bu.edu;ibm.com;mit.edu;bu.edu", "position": "Research Scientist;Undergrad student;Researcher;Associate Professor;Assistant Professor", "bibtex": "@misc{\ngreenewald2022kmixup,\ntitle={\\$k\\$-Mixup Regularization for Deep Learning via Optimal Transport},\nauthor={Kristjan Greenewald and Anming Gu and Mikhail Yurochkin and Justin Solomon and Edward Chien},\nyear={2022},\nurl={https://openreview.net/forum?id=a1m8Jba-N6l}\n}", "github": "", "project": "", "reviewers": "8N9h;P7xo;iZL9;K2Mi;EGaJ", "site": "https://openreview.net/forum?id=a1m8Jba-N6l", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "4;4;4;2;3", "correctness": "4;3;2;4;3", "technical_novelty": "3;2;3;3;2", "empirical_novelty": "2;3;2;2;2", "wc_summary_paper": "93;75;17;74;148", "wc_summary_review": "58;31;18;39;41", "wc_main_review": "1543;1071;607;181;293", "wc_review": "1694;1177;642;294;482", "wc_reply_reviewers": "56;319;0;82;0", "wc_reply_authors": "2286;1526;1227;509;467", "reply_reviewers": "1;1;0;1;0", "reply_authors": "4;3;2;1;1", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 81.4, 41.983806401992666 ], "wc_summary_review_avg": [ 37.4, 13.093509842666329 ], "wc_main_review_avg": [ 739.0, 506.8222568119912 ], "wc_review_avg": [ 857.8, 511.23082849139684 ], "wc_reply_reviewers_avg": [ 91.4, 118.19576980586065 ], "wc_reply_authors_avg": [ 1203.0, 678.3989976407689 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.2, 1.16619037896906 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7740702698132101, "corr_recommendation_correctness": -0.11821656093586504, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2900884832698460006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0;2;1", "aff_unique_norm": "IBM;Boston University;Massachusetts Institute of Technology", "aff_unique_dep": "AI Lab;;", "aff_unique_url": "https://www.ibmwatsonai.org/;https://www.bu.edu;https://web.mit.edu", "aff_unique_abbr": 
"MIT-IBM AI Lab;BU;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Distributionally Robust Models with Parametric Likelihood Ratios", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6425", "id": "a34GrNaYEcS", "poster": "", "openreview": "https://openreview.net/forum?id=a34GrNaYEcS", "slides": "https://iclr.cc/virtual/2022/poster/6425", "video": "https://iclr.cc/virtual/2022/poster/6425", "author_site": "Paul Michel, Tatsunori Hashimoto, Graham Neubig", "tldr": "", "abstract": "As machine learning models are deployed ever more broadly, it becomes increasingly important that they are not only able to perform well on their training distribution, but also yield accurate predictions when confronted with distribution shift. The Distributionally Robust Optimization (DRO) framework proposes to address this issue by training models to minimize their expected risk under a collection of distributions, to imitate test-time shifts. This is most commonly achieved by instance-level re-weighting of the training objective to emulate the likelihood ratio with possible test distributions, which allows for estimating their empirical risk via importance sampling (assuming that they are subpopulations of the training distribution). However, re-weighting schemes in the literature are usually limited due to the difficulty of keeping the optimization problem tractable and the complexity of enforcing normalization constraints. In this paper, we show that three simple ideas -- mini-batch level normalization, a KL penalty and simultaneous gradient updates -- allow us to train models with DRO using a broader class of parametric likelihood ratios. 
In a series of experiments on both image and text classification benchmarks, we find that models trained with the resulting parametric adversaries are consistently more robust to subpopulation shifts when compared to other DRO approaches, and that the method performs reliably well with little hyper-parameter tuning.", "keywords": "distributionally robust optimization;fairness;deep learning;robustness;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "Paul Michel;Tatsunori Hashimoto;Graham Neubig", "authorids": "~Paul_Michel1;~Tatsunori_Hashimoto1;~Graham_Neubig1", "gender": "M;M;M", "homepage": "https://pmichel31415.github.io/;https://thashim.github.io;http://phontron.com", "dblp": "185/1024;;03/8155", "google_scholar": "oyyIf0YAAAAJ;5ygiTwsAAAAJ;wlosgkoAAAAJ", "orcid": ";;", "linkedin": "paul-michel-4954b799/;;", "or_profile": "~Paul_Michel1;~Tatsunori_Hashimoto1;~Graham_Neubig1", "aff": "Ecole Normale Sup\u00e9rieure de Paris;Stanford University;Carnegie Mellon University", "aff_domain": "ens.fr;stanford.edu;cmu.edu", "position": "Postdoc;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nmichel2022distributionally,\ntitle={Distributionally Robust Models with Parametric Likelihood Ratios},\nauthor={Paul Michel and Tatsunori Hashimoto and Graham Neubig},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=a34GrNaYEcS}\n}", "github": "", "project": "", "reviewers": "hEDd;iYtW;5Xhf;UdKV", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "46;35;145;58", "wc_summary_review": "13;73;91;34", "wc_main_review": "399;285;1120;223", "wc_review": "458;393;1356;315", "wc_reply_reviewers": "0;101;650;0", "wc_reply_authors": "462;870;934;109", "reply_reviewers": "0;1;2;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 71.0, 43.49137845596527 ], "wc_summary_review_avg": [ 52.75, 30.84132779242813 ], "wc_main_review_avg": [ 506.75, 359.6431390976338 ], "wc_review_avg": [ 630.5, 421.9161646583359 ], "wc_reply_reviewers_avg": [ 187.75, 270.0466394902925 ], "wc_reply_authors_avg": [ 593.75, 333.3259478348483 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2606416541563470801&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=a34GrNaYEcS", "email": "ens.fr;stanford.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure de Paris;Stanford University;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ens.fr;https://www.stanford.edu;https://www.cmu.edu", "aff_unique_abbr": "ENS Paris;Stanford;CMU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Paris;Stanford;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "France;United States" }, { "id": "a3NaSCJ20V", "title": "Equivariant Grasp learning In Real Time", "track": "main", "status": 
"Withdraw", "tldr": "", "abstract": "Visual grasp detection is a key problem in robotics where the agent must learn to model the grasp function, a mapping from an image of a scene onto a set of feasible grasp poses. In this paper, we recognize that the grasp function is $\\mathrm{SE}(2)$-equivariant and that it can be modeled using an equivariant convolutional neural network. As a result, we are able to significantly improve the sample efficiency of grasp learning to the point where we can learn a good approximation of the grasp function within only 500 grasp experiences. This is fast enough that we can learn to grasp completely on a physical robot in about an hour. ", "keywords": "Robotic Grasping;Equivariance;Reinforcement Leanring", "primary_area": "", "supplementary_material": "", "author": "Xupeng Zhu;Dian Wang;Ondrej Biza;Robert Platt", "authorids": "~Xupeng_Zhu1;~Dian_Wang1;~Ondrej_Biza1;~Robert_Platt1", "gender": "M;M;M;", "homepage": "https://zxp-s-works.github.io/;https://pointw.github.io/;https://sites.google.com/view/obiza;http://www.ccs.neu.edu/home/rplatt/", "dblp": "257/4426;191/1369-1;230/8616.html;39/5434", "google_scholar": "mwxz-8MAAAAJ;CckjtfQAAAAJ;Gi9Xq8YAAAAJ;Z4Y5S2oAAAAJ", "orcid": ";;0000-0003-3390-8050;", "linkedin": "xupengzhu-skunk;dianwang1007;ond%C5%99ej-b%C3%AD%C5%BEa-a9405353/;", "or_profile": "~Xupeng_Zhu1;~Dian_Wang1;~Ondrej_Biza1;~Robert_Platt1", "aff": "Northeastern University;Northeastern University;Google Brain;Northeastern University", "aff_domain": "northeastern.edu;northeastern.edu;google.com;neu.edu", "position": "PhD student;PhD student;Intern;Associate Professor", "bibtex": "@misc{\nzhu2022equivariant,\ntitle={Equivariant Grasp learning In Real Time},\nauthor={Xupeng Zhu and Dian Wang and Ondrej Biza and Robert Platt},\nyear={2022},\nurl={https://openreview.net/forum?id=a3NaSCJ20V}\n}", "github": "", "project": "", "reviewers": "3wvJ;oxnd;wiXu;YeYi", "site": "https://openreview.net/forum?id=a3NaSCJ20V", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;4", "correctness": "2;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "42;57;93;164", "wc_summary_review": "32;69;76;52", "wc_main_review": "458;263;727;192", "wc_review": "532;389;896;408", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 89.0, 47.10095540432275 ], "wc_summary_review_avg": [ 57.25, 16.990806337546196 ], "wc_main_review_avg": [ 410.0, 207.31980127329854 ], "wc_review_avg": [ 556.25, 203.6963119450129 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13858331612592962666&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Northeastern University;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": "https://www.northeastern.edu;https://brain.google.com", "aff_unique_abbr": "NEU;Google Brain", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "a3hQPNqIFk6", "title": "Brittle interpretations: The Vulnerability of TCAV and Other Concept-based Explainability Tools to Adversarial Attack", "track": "main", "status": "Reject", "tldr": "", "abstract": "Methods for model explainability have become increasingly critical for testing the fairness and soundness of deep learning. A number of explainability techniques have been developed which use a set of examples to represent a human-interpretable concept in a model's activations. In this work we show that these explainability methods can suffer the same vulnerability to adversarial attacks as the models they are meant to analyze. We demonstrate this phenomenon on two well-known concept-based approaches to the explainability of deep learning models: TCAV and faceted feature visualization. We show that by carefully perturbing the examples of the concept that is being investigated, we can radically change the output of the interpretability method, e.g. showing that stripes are not an important factor in identifying images of a zebra. Our work highlights the fact that in safety-critical applications, there is need for security around not only the machine learning pipeline but also the model interpretation process. ", "keywords": "interpretability;adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Davis Brown;Henry Kvinge", "authorids": "~Davis_Brown1;~Henry_Kvinge1", "gender": ";", "homepage": "https://davisrbrown.com/;https://hkvinge.github.io/", "dblp": "304/3144;223/4356", "google_scholar": "https://scholar.google.com/citations?hl=en;vfFn_QsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Davis_Brown1;~Henry_Kvinge1", "aff": "Pacific Northwest National Laboratory;Pacific Northwest National Laboratory", "aff_domain": "pnnl.gov;pnnl.gov", "position": "Researcher;Principal Researcher", "bibtex": "@misc{\nbrown2022brittle,\ntitle={Brittle interpretations: The Vulnerability of {TCAV} and Other Concept-based Explainability Tools to Adversarial Attack},\nauthor={Davis Brown and Henry Kvinge},\nyear={2022},\nurl={https://openreview.net/forum?id=a3hQPNqIFk6}\n}", "github": "", "project": "", "reviewers": "YeZA;jcjC;6y3k;oy63", "site": "https://openreview.net/forum?id=a3hQPNqIFk6", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;4", "correctness": "2;3;3;2", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "49;61;64;50", "wc_summary_review": "5;38;116;66", "wc_main_review": "164;390;519;560", "wc_review": "218;489;699;676", "wc_reply_reviewers": "0;235;0;171", "wc_reply_authors": "373;1096;630;1023", "reply_reviewers": "0;1;0;1", "reply_authors": "1;2;1;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 56.0, 6.59545297913646 ], "wc_summary_review_avg": [ 56.25, 40.69628361410904 ], "wc_main_review_avg": [ 408.25, 154.3411400113398 ], "wc_review_avg": [ 520.5, 192.7050855582177 ], "wc_reply_reviewers_avg": [ 101.5, 103.99158619811509 ], "wc_reply_authors_avg": [ 780.5, 294.55602183625444 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, 
"gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=697051669163471501&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Pacific Northwest National Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.pnnl.gov", "aff_unique_abbr": "PNNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "a3mRgptHKZd", "title": "Faster No-Regret Learning Dynamics for Extensive-Form Correlated Equilibrium", "track": "main", "status": "Reject", "tldr": "", "abstract": "A recent emerging trend in the literature on learning in games has been concerned with providing accelerated learning dynamics for correlated and coarse correlated equilibria in normal-form games. Much less is known about the significantly more challenging setting of extensive-form games, which can capture sequential and simultaneous moves, as well as imperfect information. In this paper, we develop faster no-regret learning dynamics for \\textit{extensive-form correlated equilibrium (EFCE)} in multiplayer general-sum imperfect-information extensive-form games. When all agents play $T$ repetitions of the game according to the accelerated dynamics, the correlated distribution of play is an $O(T^{-3/4})$-approximate EFCE. This significantly improves over the best prior rate of $O(T^{-1/2})$. One of our conceptual contributions is to connect predictive (that is, optimistic) regret minimization with the framework of $\\Phi$-regret. One of our main technical contributions is to characterize the stability of certain fixed point strategies through a refined perturbation analysis of a structured Markov chain, which may be of independent interest. 
\nFinally, experiments on standard benchmarks corroborate our findings.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/567264c36ae98d84cbcb927795705d35d59bd461.zip", "author": "Ioannis Anagnostides;Gabriele Farina;Christian Kroer;Tuomas Sandholm", "authorids": "~Ioannis_Anagnostides1;~Gabriele_Farina1;~Christian_Kroer1;~Tuomas_Sandholm1", "gender": "M;M;M;M", "homepage": ";http://www.cs.cmu.edu/~gfarina/about/;http://www.columbia.edu/~ck2945/;http://www.cs.cmu.edu/~sandholm", "dblp": "273/7648;;64/10660;s/TuomasSandholm", "google_scholar": "QVwDo_sAAAAJ;sktDNcEAAAAJ;https://scholar.google.ch/citations?user=ckHwjPAAAAAJ;0DpK1EMAAAAJ", "orcid": ";;0000-0002-9009-8683;", "linkedin": ";;;", "or_profile": "~Ioannis_Anagnostides1;~Gabriele_Farina1;~Christian_Kroer1;~Tuomas_Sandholm1", "aff": "Carnegie Mellon University;School of Computer Science, Carnegie Mellon University;Columbia University;Carnegie Mellon University", "aff_domain": "cmu.edu;cs.cmu.edu;columbia.edu;cmu.edu", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nanagnostides2022faster,\ntitle={Faster No-Regret Learning Dynamics for Extensive-Form Correlated Equilibrium},\nauthor={Ioannis Anagnostides and Gabriele Farina and Christian Kroer and Tuomas Sandholm},\nyear={2022},\nurl={https://openreview.net/forum?id=a3mRgptHKZd}\n}", "github": "", "project": "", "reviewers": "mWDx;zmM5;NoAR;4e5F", "site": "https://openreview.net/forum?id=a3mRgptHKZd", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;2", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "0;3;2;0", "wc_summary_paper": "97;231;57;143", "wc_summary_review": "133;108;72;3", "wc_main_review": "620;641;331;467", "wc_review": "850;980;460;613", "wc_reply_reviewers": "328;317;0;0", "wc_reply_authors": "1118;957;260;717", "reply_reviewers": "2;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 132.0, 64.7533782902483 ], "wc_summary_review_avg": [ 79.0, 48.94384537406108 ], "wc_main_review_avg": [ 514.75, 125.55949784862952 ], "wc_review_avg": [ 725.75, 202.1241883100585 ], "wc_reply_reviewers_avg": [ 161.25, 161.29689240651848 ], "wc_reply_authors_avg": [ 763.0, 323.5683853530811 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Sqk3TVksLQUJ:scholar.google.com/&scioq=Faster+No-Regret+Learning+Dynamics+for+Extensive-Form+Correlated+Equilibrium&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Carnegie Mellon University;Columbia University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.columbia.edu", "aff_unique_abbr": "CMU;Columbia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "a43otnDilz2", "title": "KNIFE: Kernelized-Neural Differential Entropy Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Estimation 
of (differential) entropy and the related mutual information has been pursued with significant efforts by the machine learning community. To address shortcomings in previously proposed estimators for differential entropy, here we introduce KNIFE, a fully parameterized, differentiable kernel-based estimator of differential entropy. The flexibility of our approach also allows us to construct KNIFE-based estimators for conditional (on either discrete or continuous variables) differential entropy, as well as mutual information. We empirically validate our method on high-dimensional synthetic data and further apply it to guide the training of neural networks for real-world tasks. Our experiments on a large variety of tasks, including visual domain adaptation, textual fair classification, and textual fine-tuning demonstrate the effectiveness of KNIFE-based estimation.", "keywords": "differential entropy estimation;differential entropy;mutual information;kernel estimation", "primary_area": "", "supplementary_material": "/attachment/0c484dac19d614789c2a3c4f190488fc7c083c2a.zip", "author": "Georg Pichler;Pierre Colombo;Malik Boudiaf;G\u00fcnther Koliander;Pablo Piantanida", "authorids": "~Georg_Pichler1;~Pierre_Colombo2;~Malik_Boudiaf1;~G\u00fcnther_Koliander1;~Pablo_Piantanida2", "gender": "M;M;;M;M", "homepage": ";https://pierrecolombo.github.io/;;;https://www.pablo-piantanida.org", "dblp": "155/0692.html;;;126/5116;44/1416", "google_scholar": ";yPoMt8gAAAAJ;;;https://scholar.google.fr/citations?user=QyBEFv0AAAAJ", "orcid": "0000-0001-5696-4472;;;;", "linkedin": ";;malik-boudiaf/;;pablo-piantanida-60a51bb5/?locale=en_US", "or_profile": "~Georg_Pichler1;~Pierre_Colombo2;~Malik_Boudiaf1;~G\u00fcnther_Koliander1;~Pablo_Piantanida2", "aff": "TU Wien Vienna University of Technology;CentraleSupelec;\u00c9cole de technologie sup\u00e9rieure;Austrian Acedemy of Sciences;Universit\u00e9 Paris-Saclay, CNRS ", "aff_domain": "tuwien.ac.at;centralesupelec.fr;etsmtl.ca;oeaw.ac.at;centralesupelec.fr", "position": "Postdoc;Postdoc;PhD student;Principal Researcher;Full Professor", "bibtex": "@misc{\npichler2022knife,\ntitle={{KNIFE}: Kernelized-Neural Differential Entropy Estimation},\nauthor={Georg Pichler and Pierre Colombo and Malik Boudiaf and G{\\\"u}nther Koliander and Pablo Piantanida},\nyear={2022},\nurl={https://openreview.net/forum?id=a43otnDilz2}\n}", "github": "", "project": "", "reviewers": "Y1rk;5zVE;ug32;ABE4;8iox", "site": "https://openreview.net/forum?id=a43otnDilz2", "pdf_size": 0, "recommendation": "5;5;6;6;6", "confidence": "5;3;3;3;3", "correctness": "4;3;4;3;4", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "3;2;3;3;3", "wc_summary_paper": "55;82;99;54;49", "wc_summary_review": "27;19;45;29;15", "wc_main_review": "485;267;261;624;292", "wc_review": "567;368;405;707;356", "wc_reply_reviewers": "0;0;60;104;0", "wc_reply_authors": "371;560;263;551;386", "reply_reviewers": "0;0;1;1;0", "reply_authors": "1;1;1;2;1", "recommendation_avg": [ 5.6, 0.48989794855663565 ], "confidence_avg": [ 3.4, 0.8000000000000002 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 67.8, 19.405153954555473 ], "wc_summary_review_avg": [ 27.0, 10.353743284435827 ], "wc_main_review_avg": [ 385.8, 144.95985651207027 ], "wc_review_avg": [ 480.6, 136.11847780518264 ], "wc_reply_reviewers_avg": [ 32.8, 42.51305681787655 ], "wc_reply_authors_avg": [ 426.2, 113.81985766991629 ], 
"reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6123724356957944, "corr_recommendation_correctness": 0.16666666666666666, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7445294954304353052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Vienna University of Technology;CentraleSup\u00e9lec;\u00c9cole de technologie sup\u00e9rieure;Austrian Academy of Sciences;Universit\u00e9 Paris-Saclay", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.tuwien.ac.at;https://www.centralesupelec.fr;https://www.etsmtl.ca;https://www.oeaw.ac.at;https://www.universite-paris-saclay.fr", "aff_unique_abbr": "TU Wien;CS;ETS;OEAW;UPS", "aff_campus_unique_index": "0", "aff_campus_unique": "Vienna;", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "Austria;France;Canada" }, { "id": "a4W0tSTN9Kn", "title": "MULTI-LEVEL APPROACH TO ACCURATE AND SCALABLE HYPERGRAPH EMBEDDING", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Many problems such as node classification and link prediction in network data\ncan be solved using graph embeddings, and a number of algorithms are known for constructing\nsuch embeddings. However, it is difficult to use graphs to capture \nnon-binary relations such as communities of nodes. These kinds of \ncomplex relations are expressed more naturally as hypergraphs.\nWhile hypergraphs are a generalization of graphs, state-of-the-art graph\nembedding techniques are not adequate for solving prediction and\nclassification tasks on large hypergraphs accurately in reasonable time.\nIn this paper, we introduce NetVec, a novel multi-level framework for scalable unsupervised hypergraph embedding, which outperforms state-of-the-art hypergraph embedding systems by up to \n15% in accuracy. 
\nWe also show that NetVec is capable of generating high quality embeddings for real-world hypergraphs with millions of nodes and hyperedges in only a couple of minutes while existing hypergraph systems either fail for such large hypergraphs or may take days to produce the embeddings.", "keywords": "network embedding;hypergraph embedding;hyperedge classification;multi-level hypergraph embedding", "primary_area": "", "supplementary_material": "/attachment/fde41e8cf29a7df2eb658347345a7309736d9a34.zip", "author": "Sepideh Maleki;Donya Saless;Dennis Wall;Keshav Pingali", "authorids": "~Sepideh_Maleki2;~Donya_Saless2;~Dennis_Wall1;~Keshav_Pingali1", "gender": ";M;F;F", "homepage": "https://wall-lab.stanford.edu/;;https://www.linkedin.com/in/donya-saless/;https://www.cs.utexas.edu/~smaleki/", "dblp": ";;;", "google_scholar": ";02UU6wgAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;donya-saless/edit/forms/language/1435902547/;", "or_profile": "~Dennis_Wall1;~Keshav_Pingali1;~Donya_Saless1;~sepideh_maleki1", "aff": "Stanford University;;University of Tehran, University of Tehran;University of Texas, Austin", "aff_domain": "stanford.edu;;ut.ac.ir;utexas.edu", "position": "Professor;;Undergrad student;PhD student", "bibtex": "@misc{\nmaleki2022multilevel,\ntitle={{MULTI}-{LEVEL} {APPROACH} {TO} {ACCURATE} {AND} {SCALABLE} {HYPERGRAPH} {EMBEDDING}},\nauthor={Sepideh Maleki and Donya Saless and Dennis Wall and Keshav Pingali},\nyear={2022},\nurl={https://openreview.net/forum?id=a4W0tSTN9Kn}\n}", "github": "", "project": "", "reviewers": "zUxB;vdRS;Ry87;YeSQ;RAtD", "site": "https://openreview.net/forum?id=a4W0tSTN9Kn", "pdf_size": 0, "recommendation": "1;3;3;5;5", "confidence": "4;3;5;4;3", "correctness": "3;2;4;3;3", "technical_novelty": "1;3;2;2;3", "empirical_novelty": "1;3;2;2;3", "wc_summary_paper": "13;50;42;103;27", "wc_summary_review": "54;100;30;48;28", "wc_main_review": "884;526;245;500;129", "wc_review": "951;676;317;651;184", "wc_reply_reviewers": "534;130;16;308;0", "wc_reply_authors": "825;415;454;578;0", "reply_reviewers": "1;1;1;1;0", "reply_authors": "2;2;2;2;0", "recommendation_avg": [ 3.4, 1.4966629547095767 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 47.0, 30.744105126023754 ], "wc_summary_review_avg": [ 52.0, 26.015380066414558 ], "wc_main_review_avg": [ 456.8, 261.32998297172105 ], "wc_review_avg": [ 555.8, 273.84477354881176 ], "wc_reply_reviewers_avg": [ 197.6, 200.9712417237849 ], "wc_reply_authors_avg": [ 454.4, 268.57148024315615 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.2857142857142857, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7SfuP9D3xFMJ:scholar.google.com/&scioq=MULTI-LEVEL+APPROACH+TO+ACCURATE+AND+SCALABLE+HYPERGRAPH+EMBEDDING&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Stanford University;University of Tehran;University of Texas at Austin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://ut.ac.ir;https://www.utexas.edu", "aff_unique_abbr": "Stanford;UT;UT Austin", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Austin", "aff_country_unique_index": "0;1;0", 
"aff_country_unique": "United States;Iran" }, { "id": "a61qArWbjw_", "title": "Scalable multimodal variational autoencoders with surrogate joint posterior", "track": "main", "status": "Reject", "tldr": "", "abstract": "To obtain a joint representation from multimodal data in variational autoencoders (VAEs), it is important to infer the representation from arbitrary subsets of modalities after learning. A scalable way to achieve this is to aggregate the inferences of each modality as experts. A state-of-the-art approach to learning this aggregation of experts is to encourage all modalities to be reconstructed and cross-generated from arbitrary subsets. However, this learning may be insufficient if cross-generation is difficult. Furthermore, to evaluate its objective function, exponential generation paths concerning the number of modalities are required. To alleviate these problems, we propose to explicitly minimize the divergence between inferences from arbitrary subsets and the surrogate joint posterior that approximates the true joint posterior. We also proposed using a gradient origin network, a deep generative model that learns inferences without using an inference network, thereby reducing the need for additional parameters by introducing the surrogate posterior. We demonstrate that our method performs better than existing scalable multimodal VAEs in inference and generation. \n", "keywords": "Deep generative models;multimodal learning", "primary_area": "", "supplementary_material": "", "author": "Masahiro Suzuki;Yutaka Matsuo", "authorids": "~Masahiro_Suzuki1;~Yutaka_Matsuo1", "gender": "M;M", "homepage": ";http://ymatsuo.com", "dblp": ";m/YMatsuo.html", "google_scholar": "r2nt5kUAAAAJ;Dy8iau4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Masahiro_Suzuki1;~Yutaka_Matsuo1", "aff": "The University of Tokyo, Tokyo Institute of Technology;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "Assistant Professor;Associate Professor", "bibtex": "@misc{\nsuzuki2022scalable,\ntitle={Scalable multimodal variational autoencoders with surrogate joint posterior},\nauthor={Masahiro Suzuki and Yutaka Matsuo},\nyear={2022},\nurl={https://openreview.net/forum?id=a61qArWbjw_}\n}", "github": "", "project": "", "reviewers": "yNag;FDzr;yRW1;MNSw", "site": "https://openreview.net/forum?id=a61qArWbjw_", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "3;4;3;2", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "99;64;146;90", "wc_summary_review": "36;54;47;54", "wc_main_review": "158;312;156;434", "wc_review": "293;430;349;578", "wc_reply_reviewers": "0;0;104;91", "wc_reply_authors": "133;556;484;499", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 99.75, 29.634228520412 ], "wc_summary_review_avg": [ 47.75, 7.361215932167728 ], "wc_main_review_avg": [ 265.0, 116.29703349613007 ], "wc_review_avg": [ 412.5, 107.24854311364794 ], "wc_reply_reviewers_avg": [ 48.75, 48.96618731328793 ], "wc_reply_authors_avg": [ 418.0, 166.72282387243806 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5883484054145521, 
"corr_recommendation_correctness": 0.39223227027636803, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9StevQ1YTCgJ:scholar.google.com/&scioq=Scalable+multimodal+variational+autoencoders+with+surrogate+joint+posterior&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "a68yJLSKY-P", "title": "Adaptive Differentially Private Empirical Risk Minimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose an adaptive (stochastic) gradient perturbation method for differentially private empirical risk minimization. At each iteration, the random noise added to the gradient is optimally adapted to the stepsize; we name this process adaptive differentially private (ADP) learning. Given the same privacy budget, we prove that the ADP method considerably improves the utility guarantee compared to the standard differentially private method in which vanilla random noise is added. Our method is particularly useful for gradient-based algorithms with non-constant learning rate, including variants of AdaGrad (Duchi et al., 2011). We provide extensive numerical experiments to demonstrate the effectiveness of the proposed adaptive differentially private algorithm.", "keywords": "Non-convex optimization;Gradient Perturbation;Differentially Private Learning;Adaptive Gradient Methods;Stochastic Gradient Descent;Theory", "primary_area": "", "supplementary_material": "", "author": "Xiaoxia Wu;Lingxiao Wang;Irina Cristali;Quanquan Gu;Rebecca Willett", "authorids": "~Xiaoxia_Wu1;~Lingxiao_Wang5;~Irina_Cristali1;~Quanquan_Gu1;~Rebecca_Willett1", "gender": "F;M;F;M;F", "homepage": "https://sites.google.com/view/xwu/home;https://lingxiaowang-ai.github.io/;https://irinacristali.github.io;http://web.cs.ucla.edu/~qgu/;https://willett.psd.uchicago.edu/", "dblp": "63/1016;140/1229-1;304/2932;50/4597;w/RebeccaWillett", "google_scholar": "Ry0Bdt8AAAAJ;VPyxd6kAAAAJ;Qt07hBUAAAAJ;GU9HgNAAAAAJ;bGRVPl8AAAAJ", "orcid": ";;;;0000-0002-8109-7582", "linkedin": ";;irina-cristali-aa05aa180/;;rebecca-willett-90b95973/", "or_profile": "~Xiaoxia_Wu1;~Lingxiao_Wang5;~Irina_Cristali1;~Quanquan_Gu1;~Rebecca_Willett1", "aff": "Microsoft;Toyota Technological Institute at Chicago;University of Chicago;University of California, Los Angeles;University of Chicago", "aff_domain": "microsoft.com;ttic.edu;uchicago.edu;cs.ucla.edu;uchicago.edu", "position": "Researcher;Research assistant professor;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nwu2022adaptive,\ntitle={Adaptive Differentially Private Empirical Risk Minimization },\nauthor={Xiaoxia Wu and Lingxiao Wang and Irina Cristali and Quanquan Gu and Rebecca Willett},\nyear={2022},\nurl={https://openreview.net/forum?id=a68yJLSKY-P}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=a68yJLSKY-P", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], 
"technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13313666125242942946&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Microsoft;Toyota Technological Institute at Chicago;University of Chicago;University of California, Los Angeles", "aff_unique_dep": "Microsoft Corporation;;;", "aff_unique_url": "https://www.microsoft.com;https://www.tti-chicago.org;https://www.uchicago.edu;https://www.ucla.edu", "aff_unique_abbr": "Microsoft;TTI Chicago;UChicago;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Memory Replay with Data Compression for Continual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5928", "id": "a7H7OucbWaU", "poster": "", "openreview": "https://openreview.net/forum?id=a7H7OucbWaU", "slides": "https://iclr.cc/virtual/2022/poster/5928", "video": "https://iclr.cc/virtual/2022/poster/5928", "author_site": "Liyuan Wang, Xingxing Zhang, Kuo Yang, Longhui Yu, Chongxuan Li, Lanqing HONG, Shifeng Zhang, Zhenguo Li, Yi Zhong, Jun Zhu", "tldr": "", "abstract": "Continual learning needs to overcome catastrophic forgetting of the past. Memory replay of representative old training samples has been shown as an effective solution, and achieves the state-of-the-art (SOTA) performance. However, existing work is mainly built on a small memory buffer containing a few original data, which cannot fully characterize the old data distribution. In this work, we propose memory replay with data compression to reduce the storage cost of old training samples and thus increase their amount that can be stored in the memory buffer. Observing that the trade-off between the quality and quantity of compressed data is highly nontrivial for the efficacy of memory replay, we propose a novel method based on determinantal point processes (DPPs) to efficiently determine an appropriate compression quality for currently-arrived training samples. In this way, using a naive data compression algorithm with a properly selected quality can largely boost recent strong baselines by saving more compressed data in a limited storage space. 
We extensively validate this across several benchmarks of class-incremental learning and in a realistic scenario of object detection for autonomous driving.", "keywords": "Continual Learning;Memory Replay;Data Compression", "primary_area": "", "supplementary_material": "/attachment/6bc530a049dae5f81022889b174599d9ccc7c1cb.zip", "author": "Liyuan Wang;Xingxing Zhang;Kuo Yang;Longhui Yu;Chongxuan Li;Lanqing HONG;Shifeng Zhang;Zhenguo Li;Yi Zhong;Jun Zhu", "authorids": "~Liyuan_Wang1;~Xingxing_Zhang3;~Kuo_Yang1;~Longhui_Yu1;~Chongxuan_Li1;~Lanqing_HONG1;~Shifeng_Zhang5;~Zhenguo_Li1;~Yi_Zhong1;~Jun_Zhu2", "gender": "M;F;;M;M;F;M;M;M;M", "homepage": "https://lywang3081.github.io/;https://indussky8.github.io/;;https://yulonghui.github.io/;http://ml.cs.tsinghua.edu.cn/~chongxuan;https://racheltechie.github.io/;https://github.com/zsffq999;http://www.ee.columbia.edu/~zgli/;http://life.tsinghua.edu.cn/publish/smkx/11230/2018/20180205194642525261278/20180205194642525261278_.html;http://ml.cs.tsinghua.edu.cn/~jun", "dblp": "121/6094;;;313/9946;161/9965;226/4258;;23/6479;;50/2644-1", "google_scholar": "UAgdoY4AAAAJ;https://scholar.google.com.hk/citations?user=RKjiLyAAAAAJ;;https://scholar.google.com.hk/citations?user=3eHjDDgAAAAJ;UKMcQn4AAAAJ;https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ;;XboZC1AAAAAJ;;axsP38wAAAAJ", "orcid": ";0000-0002-2909-1589;;;0000-0002-0912-9076;;;;;", "linkedin": ";;kuo-yang/;%E9%BE%99%E8%BE%89-%E8%99%9E-71655a154/;;;;;;", "or_profile": "~Liyuan_Wang1;~Xingxing_Zhang3;~Kuo_Yang1;~Longhui_Yu1;~Chongxuan_Li1;~Lanqing_HONG1;~Shifeng_Zhang5;~Zhenguo_Li1;~Yi_Zhong1;~Jun_Zhu2", "aff": "Tsinghua University;Tsinghua University;Huawei Technologies Ltd.;Peking University;Renmin University of China;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Noah's Ark Lab;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;huawei.com;pku.edu.cn;ruc.edu.cn;huawei.com;huawei.com;huawei.com;tsinghua.edu.cn;mail.tsinghua.edu.cn", "position": "PhD student;Postdoc;Researcher;MS student;Assistant Professor;Researcher;Researcher;Principal Researcher;Full Professor;Professor", "bibtex": "@inproceedings{\nwang2022memory,\ntitle={Memory Replay with Data Compression for Continual Learning},\nauthor={Liyuan Wang and Xingxing Zhang and Kuo Yang and Longhui Yu and Chongxuan Li and Lanqing HONG and Shifeng Zhang and Zhenguo Li and Yi Zhong and Jun Zhu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=a7H7OucbWaU}\n}", "github": "", "project": "", "reviewers": "iR3u;gA5x;kRF2;iLt4", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "1;4;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "75;87;96;102", "wc_summary_review": "303;49;15;27", "wc_main_review": "49;160;79;235", "wc_review": "427;296;190;364", "wc_reply_reviewers": "90;0;39;443", "wc_reply_authors": "1081;996;422;946", "reply_reviewers": "1;0;1;2", "reply_authors": "3;2;2;3", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 90.0, 10.173494974687902 ], "wc_summary_review_avg": [ 98.5, 118.696040372036 ], "wc_main_review_avg": [ 130.75, 72.60294415517872 ], "wc_review_avg": [ 319.25, 87.83329380138262 ], "wc_reply_reviewers_avg": [ 143.0, 176.12069725049352 ], 
"wc_reply_authors_avg": [ 861.25, 258.15245011426873 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -1.0, "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18195224691973743635&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=a7H7OucbWaU", "email": "tsinghua.edu.cn;tsinghua.edu.cn;huawei.com;pku.edu.cn;ruc.edu.cn;huawei.com;huawei.com;huawei.com;tsinghua.edu.cn;mail.tsinghua.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;2;3;1;1;1;0;0", "aff_unique_norm": "Tsinghua University;Huawei;Peking University;Renmin University of China", "aff_unique_dep": ";Huawei Technologies;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com;http://www.pku.edu.cn;http://www.ruc.edu.cn", "aff_unique_abbr": "THU;Huawei;Peking U;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "aBAgwom5pTn", "title": "Dynamic and Efficient Gray-Box Hyperparameter Optimization for Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Gray-box hyperparameter optimization techniques have recently emerged as a promising direction for tuning Deep Learning methods. However, the multi-budget search mechanisms of existing prior works can suffer from the poor correlation among the performances of hyperparameter configurations at different budgets. As a remedy, we introduce DyHPO, a method that learns to dynamically decide which configuration to try next, and for what budget. Our technique is a modification to the classical Bayesian optimization for a gray-box setup. Concretely, we propose a new surrogate for Gaussian Processes that embeds the learning curve dynamics and a new acquisition function that incorporates multi-budget information. 
We demonstrate the significant superiority of DyHPO against state-of-the-art hyperparameter optimization baselines through large-scale experiments comprising 50 datasets (Tabular, Image, NLP) and diverse neural networks (MLP, CNN/NAS, RNN).", "keywords": "hyperparameter optimization", "primary_area": "", "supplementary_material": "", "author": "Martin Wistuba;Arlind Kadra;Josif Grabocka", "authorids": "~Martin_Wistuba1;~Arlind_Kadra1;~Josif_Grabocka1", "gender": "M;M;M", "homepage": ";;https://www.utn.de/departments/department-engineering/machine-learning-lab/", "dblp": "https://dblp.uni-trier.de/pers/hd/w/Wistuba:Martin;252/5295;117/4936", "google_scholar": "https://scholar.google.co.uk/citations?user=pTULHVsAAAAJ;bMa0KUcAAAAJ;KRy27XcAAAAJ", "orcid": ";0000-0001-9308-6576;", "linkedin": "https://linkedin.com/in/wistuba/;;", "or_profile": "~Martin_Wistuba1;~Arlind_Kadra1;~Josif_Grabocka1", "aff": "Amazon;Universit\u00e4t Freiburg;Universit\u00e4t Freiburg", "aff_domain": "amazon.com;uni-freiburg.de;uni-freiburg.de", "position": "Researcher;PhD student;Assistant Professor", "bibtex": "@misc{\nwistuba2022dynamic,\ntitle={Dynamic and Efficient Gray-Box Hyperparameter Optimization for Deep Learning},\nauthor={Martin Wistuba and Arlind Kadra and Josif Grabocka},\nyear={2022},\nurl={https://openreview.net/forum?id=aBAgwom5pTn}\n}", "github": "", "project": "", "reviewers": "RUAE;cMi3;ty1k;oS3h;j16y", "site": "https://openreview.net/forum?id=aBAgwom5pTn", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "5;4;3;3;3", "correctness": "2;1;3;3;3", "technical_novelty": "2;1;2;3;3", "empirical_novelty": "2;1;3;3;3", "wc_summary_paper": "84;132;63;74;68", "wc_summary_review": "198;22;43;133;66", "wc_main_review": "1176;202;153;290;541", "wc_review": "1458;356;259;497;675", "wc_reply_reviewers": "803;0;0;0;155", "wc_reply_authors": "2484;974;665;314;1148", "reply_reviewers": "1;0;0;0;2", "reply_authors": "6;3;1;1;4", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 2.4, 0.8 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 84.2, 24.90301186603741 ], "wc_summary_review_avg": [ 92.4, 64.64240094550945 ], "wc_main_review_avg": [ 472.4, 376.34059042309 ], "wc_review_avg": [ 649.0, 428.0677516468626 ], "wc_reply_reviewers_avg": [ 191.6, 311.53850484330184 ], "wc_reply_authors_avg": [ 1117.0, 740.163765662708 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 3.0, 1.8973665961010275 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8846517369293829, "corr_recommendation_correctness": 0.8846517369293829, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;University of Freiburg", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.uni-freiburg.de", "aff_unique_abbr": "Amazon;Uni Freiburg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Germany" }, { "title": "Mirror Descent Policy Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6402", "id": "aBO5SvgSt1", "poster": "", "openreview": "https://openreview.net/forum?id=aBO5SvgSt1", "slides": "https://iclr.cc/virtual/2022/poster/6402", "video": "https://iclr.cc/virtual/2022/poster/6402", "author_site": "Manan Tomar, Lior 
Shani, Yonathan Efroni, Mohammad Ghavamzadeh", "tldr": "", "abstract": "Mirror descent (MD), a well-known first-order method in constrained convex optimization, has recently been shown as an important tool to analyze trust-region algorithms in reinforcement learning (RL). However, there remains a considerable gap between such theoretically analyzed algorithms and the ones used in practice. Inspired by this, we propose an efficient RL algorithm, called {\\em mirror descent policy optimization} (MDPO). MDPO iteratively updates the policy by {\\em approximately} solving a trust-region problem, whose objective function consists of two terms: a linearization of the standard RL objective and a proximity term that restricts two consecutive policies to be close to each other. Each update performs this approximation by taking multiple gradient steps on this objective function. We derive {\\em on-policy} and {\\em off-policy} variants of MDPO, while emphasizing important design choices motivated by the existing theory of MD in RL. We highlight the connections between on-policy MDPO and two popular trust-region RL algorithms: TRPO and PPO, and show that explicitly enforcing the trust-region constraint is in fact {\\em not} a necessity for high performance gains in TRPO. We then show how the popular soft actor-critic (SAC) algorithm can be derived by slight modifications of off-policy MDPO. Overall, MDPO is derived from the MD principles, offers a unified approach to viewing a number of popular RL algorithms, and performs better than or on-par with TRPO, PPO, and SAC in a number of continuous and discrete control tasks.", "keywords": "Reinforcement Learning;Policy Optimization", "primary_area": "", "supplementary_material": "/attachment/02907fadef589f40ff66fa22a7710ef314c56951.zip", "author": "Manan Tomar;Lior Shani;Yonathan Efroni;Mohammad Ghavamzadeh", "authorids": "~Manan_Tomar1;~Lior_Shani2;~Yonathan_Efroni2;~Mohammad_Ghavamzadeh2", "gender": "M;M;M;M", "homepage": "https://manantomar.github.io/;;https://sites.google.com/view/yonathan-efroni/;https://mohammadghavamzadeh.github.io/", "dblp": "241/6227;https://dblp.uni-trier.de/pers/s/Shani:Lior;215/3475;88/6389", "google_scholar": ";https://scholar.google.co.il/citations?user=TrQLB1gAAAAJ;pfTInEgAAAAJ;https://scholar.google.ca/citations?user=LHIPpCsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Manan_Tomar1;~Lior_Shani2;~Yonathan_Efroni2;~Mohammad_Ghavamzadeh1", "aff": "Microsoft;Technion, Technion;Microsoft;Google Research", "aff_domain": "microsoft.com;technion.ac.il;microsoft.com;google.com", "position": "Intern;PhD student;Postdoc;Senior Staff Research Scientist", "bibtex": "@inproceedings{\ntomar2022mirror,\ntitle={Mirror Descent Policy Optimization},\nauthor={Manan Tomar and Lior Shani and Yonathan Efroni and Mohammad Ghavamzadeh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aBO5SvgSt1}\n}", "github": "", "project": "", "reviewers": "qkZc;wxw9;ELQd;fuGH", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;2;4;2", "correctness": "3;3;4;4", "technical_novelty": "1;2;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "97;44;74;44", "wc_summary_review": "39;18;17;27", "wc_main_review": "172;133;420;110", "wc_review": "308;195;511;181", "wc_reply_reviewers": "55;11;0;0", "wc_reply_authors": "271;214;731;38", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 1.0 
], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 64.75, 22.286486937155438 ], "wc_summary_review_avg": [ 25.25, 8.842369591913696 ], "wc_main_review_avg": [ 208.75, 123.96244390943573 ], "wc_review_avg": [ 298.75, 132.06508811945722 ], "wc_reply_reviewers_avg": [ 16.5, 22.677080940897135 ], "wc_reply_authors_avg": [ 313.5, 255.88718217214398 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2587999722409846316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=aBO5SvgSt1", "email": "microsoft.com;technion.ac.il;microsoft.com;google.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Microsoft;Technion - Israel Institute of Technology;Google", "aff_unique_dep": "Microsoft Corporation;;Google Research", "aff_unique_url": "https://www.microsoft.com;https://www.technion.ac.il/en/;https://research.google", "aff_unique_abbr": "Microsoft;Technion;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Israel" }, { "title": "Unrolling PALM for Sparse Semi-Blind Source Separation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6592", "id": "aBVxf5NaaRt", "poster": "", "openreview": "https://openreview.net/forum?id=aBVxf5NaaRt", "slides": "https://iclr.cc/virtual/2022/poster/6592", "video": "https://iclr.cc/virtual/2022/poster/6592", "author_site": "Mohammad Fahes, Christophe Kervazo, Jerome Bobin, Florence Tupin", "tldr": "", "abstract": "Sparse Blind Source Separation (BSS) has become a well established tool for a wide range of applications \u2013 for instance, in astrophysics and remote sensing. Classical sparse BSS methods, such as the Proximal Alternating Linearized Minimization (PALM) algorithm, nevertheless often suffer from a difficult hyper-parameter choice, which undermines their results. To bypass this pitfall, we propose in this work to build on the thriving field of algorithm unfolding/unrolling. Unrolling PALM enables to leverage the data-driven knowledge stemming from realistic simulations or ground-truth data by learning both PALM hyper-parameters and variables. In contrast to most existing unrolled algorithms, which assume a fixed known dictionary during the training and testing phases, this article further emphasizes on the ability to deal with variable mixing matrices (a.k.a. dictionaries). The proposed Learned PALM (LPALM) algorithm thus enables to perform semi-blind source separation, which is key to increase the generalization of the learnt model in real-world applications. We illustrate the relevance of LPALM in astrophysical multispectral imaging: the algorithm not only needs up to $10^4\u221210^5$ times less iterations than PALM, but also improves the separation quality, while avoiding the cumbersome hyper-parameter and initialization choice of PALM. 
We further show that LPALM outperforms other unrolled source separation methods in the semi-blind setting.", "keywords": "Algorithm Unrolling/Unfolding;Blind Source Separation;Sparse Representations;Multi-Convex Optimization;Hyper-parameter Choice", "primary_area": "", "supplementary_material": "", "author": "Mohammad Fahes;Christophe Kervazo;J\u00e9r\u00f4me Bobin;Florence Tupin", "authorids": "~Mohammad_Fahes2;~Christophe_Kervazo1;jerome.bobin@cea.fr;~Florence_Tupin2", "gender": "M;;;F", "homepage": ";https://sites.google.com/view/christophekervazo/;;https://perso.telecom-paristech.fr/tupin/", "dblp": "308/6867;;;", "google_scholar": "AcWj3loAAAAJ;;;", "orcid": ";;;", "linkedin": "mohammad-fahes-1463b1190;;;", "or_profile": "~Mohammad_Fahes2;~Christophe_Kervazo1;jerome.bobin@cea.fr;~Florence_Tupin2", "aff": "INRIA;Telecom Paris;;T\u00e9l\u00e9com ParisTech", "aff_domain": "inria.fr;telecom-paris.fr;;telecom-paristech.fr", "position": "PhD student;Assistant Professor;;Full Professor", "bibtex": "@inproceedings{\nfahes2022unrolling,\ntitle={Unrolling {PALM} for Sparse Semi-Blind Source Separation},\nauthor={Mohammad Fahes and Christophe Kervazo and J{\\'e}r{\\^o}me Bobin and Florence Tupin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aBVxf5NaaRt}\n}", "github": "", "project": "", "reviewers": "7bHu;rsVe;fMrf;bWys", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "2;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "144;136;88;39", "wc_summary_review": "52;66;161;248", "wc_main_review": "614;377;324;290", "wc_review": "810;579;573;577", "wc_reply_reviewers": "0;31;0;0", "wc_reply_authors": "1638;1378;791;694", "reply_reviewers": "0;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.75, 42.08547849318099 ], "wc_summary_review_avg": [ 131.75, 79.14030262767511 ], "wc_main_review_avg": [ 401.25, 126.68341446298328 ], "wc_review_avg": [ 634.75, 101.20369311443136 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 1125.25, 395.12490113886776 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.7777777777777777, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17855454750763330141&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=aBVxf5NaaRt", "email": "inria.fr;telecom-paris.fr;;telecom-paristech.fr", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "INRIA;Telecom Paris;T\u00e9l\u00e9com ParisTech", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;https://www.telecom-paris.fr;https://www.telecom-paristech.fr", "aff_unique_abbr": "INRIA;Telecom Paris;TP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Bundle Networks: Fiber Bundles, Local Trivializations, and a Generative Approach to Exploring Many-to-one Maps", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6976", "id": "aBXzcPPOuX", "poster": "", "openreview": "https://openreview.net/forum?id=aBXzcPPOuX", "slides": "https://iclr.cc/virtual/2022/poster/6976", "video": "https://iclr.cc/virtual/2022/poster/6976", "author_site": "Nico Courts, Henry Kvinge", "tldr": "", "abstract": "Many-to-one maps are ubiquitous in machine learning, from the image recognition model that assigns a multitude of distinct images to the concept of \u201ccat\u201d to the time series forecasting model which assigns a range of distinct time-series to a single scalar regression value. While the primary use of such models is naturally to associate correct output to each input, in many problems it is also useful to be able to explore, understand, and sample from a model's fibers, which are the set of input values $x$ such that $f(x) = y,$ for fixed $y$ in the output space. In this paper we show that popular generative architectures are ill-suited to such tasks. Motivated by this, we introduce a novel generative architecture, Bundle Networks, based on the concept of a fiber bundle from (differential) topology. BundleNets exploit the idea of a local trivialization wherein a space can be locally decomposed into a product space that cleanly encodes the many-to-one nature of the map. By enforcing this decomposition in BundleNets and by utilizing state-of-the-art invertible components, investigating a network's fibers becomes natural.", "keywords": "generative models;applications of topology to deep learning;many-to-one maps;invertible neural nets", "primary_area": "", "supplementary_material": "", "author": "Nico Courts;Henry Kvinge", "authorids": "~Nico_Courts1;~Henry_Kvinge1", "gender": "M;", "homepage": "https://nicocourts.com;https://hkvinge.github.io/", "dblp": ";223/4356", "google_scholar": ";vfFn_QsAAAAJ", "orcid": "0000-0001-7662-2939;", "linkedin": "nico-courts-41159149;", "or_profile": "~Nico_Courts1;~Henry_Kvinge1", "aff": "University of Washington, Seattle;Pacific Northwest National Laboratory", "aff_domain": "uw.edu;pnnl.gov", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\ncourts2022bundle,\ntitle={Bundle Networks: Fiber Bundles, Local Trivializations, and a Generative Approach to Exploring Many-to-one Maps},\nauthor={Nico Courts and Henry Kvinge},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aBXzcPPOuX}\n}", "github": "", "project": "", "reviewers": "Hz2R;tPZF;6GTJ;asbk", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;4;2;4", "empirical_novelty": "3;0;3;3", "wc_summary_paper": "66;53;226;77", "wc_summary_review": "43;144;135;24", "wc_main_review": "204;399;839;267", "wc_review": "313;596;1200;368", "wc_reply_reviewers": "0;91;124;49", "wc_reply_authors": "394;605;980;439", "reply_reviewers": "0;1;2;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 105.5, 70.08744538075275 ], "wc_summary_review_avg": [ 86.5, 53.518688324733816 ], "wc_main_review_avg": [ 427.25, 247.91971986915442 ], "wc_review_avg": [ 619.25, 351.6840734238615 ], "wc_reply_reviewers_avg": [ 66.0, 46.459659921269335 ], "wc_reply_authors_avg": [ 604.5, 230.59759322247922 ], 
"reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=792839043857596844&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=aBXzcPPOuX", "email": "uw.edu;pnnl.gov", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Washington;Pacific Northwest National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.washington.edu;https://www.pnnl.gov", "aff_unique_abbr": "UW;PNNL", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6268", "id": "aBsCjcPu_tE", "poster": "", "openreview": "https://openreview.net/forum?id=aBsCjcPu_tE", "slides": "https://iclr.cc/virtual/2022/poster/6268", "video": "https://iclr.cc/virtual/2022/poster/6268", "author_site": "Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Junyan Zhu, Stefano Ermon", "tldr": "", "abstract": "Guided image synthesis enables everyday users to create and edit photo-realistic images with minimum effort. The key challenge is balancing faithfulness to the user inputs (e.g., hand-drawn colored strokes) and realism of the synthesized images. Existing GAN-based methods attempt to achieve such balance using either conditional GANs or GAN inversions, which are challenging and often require additional training data or loss functions for individual applications. To address these issues, we introduce a new image synthesis and editing method, Stochastic Differential Editing (SDEdit), based on a diffusion model generative prior, which synthesizes realistic images by iteratively denoising through a stochastic differential equation (SDE). Given an input image with user guide in a form of manipulating RGB pixels, SDEdit first adds noise to the input, then subsequently denoises the resulting image through the SDE prior to increase its realism. SDEdit does not require task-specific training or inversions and can naturally achieve the balance between realism and faithfulness. 
SDEdit outperforms state-of-the-art GAN-based methods by up to 98.09% on realism and 91.72% on overall satisfaction scores, according to a human perception study, on multiple tasks, including stroke-based image synthesis and editing as well as image compositing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenlin Meng;Yutong He;Yang Song;Jiaming Song;Jiajun Wu;Jun-Yan Zhu;Stefano Ermon", "authorids": "~Chenlin_Meng1;~Yutong_He1;~Yang_Song1;~Jiaming_Song1;~Jiajun_Wu1;~Jun-Yan_Zhu1;~Stefano_Ermon1", "gender": "F;F;M;M;M;M;M", "homepage": "https://chenlin9.github.io/;https://kellyyutonghe.github.io/;https://yang-song.net;http://tsong.me;https://jiajunwu.com;https://www.cs.cmu.edu/~junyanz/;http://cs.stanford.edu/~ermon/", "dblp": "227/2517;;;173/5104;117/4768;117/4782.html;47/8135", "google_scholar": "nEFU7wIAAAAJ;uNF3hk0AAAAJ;o_J2CroAAAAJ;;2efgcS0AAAAJ;UdpacsMAAAAJ;", "orcid": ";;;;0000-0002-4176-343X;0000-0001-8504-3410;", "linkedin": ";yutong-he-b7608b12b/;;jiamings/;jiajunwu/;jun-yan-zhu-99b18814;", "or_profile": "~Chenlin_Meng1;~Yutong_He1;~Yang_Song1;~Jiaming_Song1;~Jiajun_Wu1;~Jun-Yan_Zhu1;~Stefano_Ermon1", "aff": "Stanford University;Computer Science Department, Stanford University;Stanford University;Computer Science Department, Stanford University;Stanford University;Carnegie Mellon University;Stanford University", "aff_domain": "stanford.edu;cs.stanford.edu;stanford.edu;cs.stanford.edu;stanford.edu;cmu.edu;stanford.edu", "position": "PhD student;MS student;PhD student;Postdoc;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nmeng2022sdedit,\ntitle={{SDE}dit: Guided Image Synthesis and Editing with Stochastic Differential Equations},\nauthor={Chenlin Meng and Yutong He and Yang Song and Jiaming Song and Jiajun Wu and Jun-Yan Zhu and Stefano Ermon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aBsCjcPu_tE}\n}", "github": "", "project": "", "reviewers": "Hvki;N9v6;CnCY", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "2;3;4", "correctness": "2;3;4", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "56;44;34", "wc_summary_review": "48;41;39", "wc_main_review": "241;128;163", "wc_review": "345;213;236", "wc_reply_reviewers": "0;0;19", "wc_reply_authors": "551;364;437", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 44.666666666666664, 8.993825042154693 ], "wc_summary_review_avg": [ 42.666666666666664, 3.8586123009300755 ], "wc_main_review_avg": [ 177.33333333333334, 47.23228650921834 ], "wc_review_avg": [ 264.6666666666667, 57.575071766250446 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 8.956685895029603 ], "wc_reply_authors_avg": [ 450.6666666666667, 76.95164426804378 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 1666, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2574908324079451158&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=aBsCjcPu_tE", "email": "stanford.edu;cs.stanford.edu;stanford.edu;cs.stanford.edu;stanford.edu;cmu.edu;stanford.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Stanford University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.cmu.edu", "aff_unique_abbr": "Stanford;CMU", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Conditional Object-Centric Learning from Video", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7059", "id": "aD7uesX1GF_", "poster": "", "openreview": "https://openreview.net/forum?id=aD7uesX1GF_", "slides": "https://iclr.cc/virtual/2022/poster/7059", "video": "https://iclr.cc/virtual/2022/poster/7059", "author_site": "Thomas Kipf, Gamaleldin Elsayed, Aravindh Mahendran, Austin Stone, Sara Sabour, Georg Heigold, Rico Jonschkowski, Alexey Dosovitskiy, Klaus Greff", "tldr": "", "abstract": "Object-centric representations are a promising path toward more systematic generalization by providing flexible abstractions upon which compositional world models can be built. Recent work on simple 2D and 3D datasets has shown that models with object-centric inductive biases can learn to segment and represent meaningful objects from the statistical structure of the data alone without the need for any supervision. However, such fully-unsupervised methods still fail to scale to diverse realistic data, despite the use of increasingly complex inductive biases such as priors for the size of objects or the 3D geometry of the scene. In this paper, we instead take a weakly-supervised approach and focus on how 1) using the temporal dynamics of video data in the form of optical flow and 2) conditioning the model on simple object location cues can be used to enable segmenting and tracking objects in significantly more realistic synthetic data. We introduce a sequential extension to Slot Attention which we train to predict optical flow for realistic looking synthetic scenes and show that conditioning the initial state of this model on a small set of hints, such as center of mass of objects in the first frame, is sufficient to significantly improve instance segmentation. These benefits generalize beyond the training distribution to novel objects, novel backgrounds, and to longer video sequences. 
We also find that such initial-state-conditioning can be used during inference as a flexible interface to query the model for specific objects or parts of objects, which could pave the way for a range of weakly-supervised approaches and allow more effective interaction with trained models.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/dc4c20bb657ae995416042d310bc4a52027c0095.zip", "author": "Thomas Kipf;Gamaleldin Fathy Elsayed;Aravindh Mahendran;Austin Stone;Sara Sabour;Georg Heigold;Rico Jonschkowski;Alexey Dosovitskiy;Klaus Greff", "authorids": "~Thomas_Kipf2;~Gamaleldin_Fathy_Elsayed1;~Aravindh_Mahendran2;~Austin_Stone1;~Sara_Sabour1;~Georg_Heigold1;~Rico_Jonschkowski1;~Alexey_Dosovitskiy1;~Klaus_Greff1", "gender": "M;M;;F;;;;M;M", "homepage": "http://www.columbia.edu/~gfa2109/;https://aravindhm.github.io/;;https://scholar.google.ca/citations?user=l8wQ39EAAAAJ&hl=en;;;;http://qwlouse.github.io/;http://tkipf.github.io/", "dblp": "https://dblp.uni-trier.de/pers/hd/e/Elsayed:Gamaleldin_F=;131/5343;202/1823;;46/2236;165/1321;135/4956.html;76/11430;186/8206", "google_scholar": "7PrTPzsAAAAJ;lAjGbLMAAAAJ;IU4ZllQAAAAJ;;;5ErX8dMAAAAJ;FXNJRDoAAAAJ;https://scholar.google.ch/citations?user=OcownLgAAAAJ;83HL5FwAAAAJ", "orcid": "0000-0002-4676-4220;0000-0002-2650-9871;;;;;;0000-0001-6982-0937;", "linkedin": "gamaleldin-elsayed-83668820/;;austin-charles-stone-1ba33b138/;;;;;;thomas-kipf-6b260410a", "or_profile": "~Gamaleldin_Fathy_Elsayed1;~Aravindh_Mahendran2;~Austin_Stone1;~Sara_Sabour1;~Georg_Heigold1;~Rico_Jonschkowski1;~Alexey_Dosovitskiy1;~Klaus_Greff1;~Thomas_N._Kipf1", "aff": "Google Research, Brain Team;Google;Google;Department of Computer Science, University of Toronto;Google;Google;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;cs.toronto.edu;google.com;google.com;google.com;google.com;google.com", "position": "Research Scientist;Researcher;Research Engineer;PhD student;Researcher;Research Scientist;Researcher;Researcher;Research Scientist", "bibtex": "@inproceedings{\nkipf2022conditional,\ntitle={Conditional Object-Centric Learning from Video},\nauthor={Thomas Kipf and Gamaleldin Fathy Elsayed and Aravindh Mahendran and Austin Stone and Sara Sabour and Georg Heigold and Rico Jonschkowski and Alexey Dosovitskiy and Klaus Greff},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aD7uesX1GF_}\n}", "github": "", "project": "", "reviewers": "r9zz;o4c6;TdU6;1UV9", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;3;5;4", "correctness": "4;3;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "41;63;149;221", "wc_summary_review": "135;250;55;21", "wc_main_review": "426;59;317;295", "wc_review": "602;372;521;537", "wc_reply_reviewers": "48;59;0;31", "wc_reply_authors": "1009;658;407;515", "reply_reviewers": "1;1;0;1", "reply_authors": "3;2;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 118.5, 71.62925380038521 ], "wc_summary_review_avg": [ 115.25, 88.12030129317534 ], "wc_main_review_avg": [ 274.25, 133.8083984658661 ], "wc_review_avg": [ 508.0, 84.17541208690338 ], "wc_reply_reviewers_avg": [ 34.5, 22.276669409945463 ], "wc_reply_authors_avg": [ 647.25, 227.04005703840016 ], "reply_reviewers_avg": [ 
0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 239, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13987153077190983503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=aD7uesX1GF_", "email": "google.com;google.com;google.com;cs.toronto.edu;google.com;google.com;google.com;google.com;google.com", "author_num": 9, "aff_unique_index": "0;0;0;1;0;0;0;0;0", "aff_unique_norm": "Google;University of Toronto", "aff_unique_dep": "Google Research;Department of Computer Science", "aff_unique_url": "https://research.google;https://www.utoronto.ca", "aff_unique_abbr": "Google;U of T", "aff_campus_unique_index": "0;0;0;1;0;0;0;0;0", "aff_campus_unique": "Mountain View;Toronto", "aff_country_unique_index": "0;0;0;1;0;0;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "aJ9BXxg352", "title": "Intriguing Properties of Input-dependent Randomized Smoothing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Randomized smoothing is currently considered the state-of-the-art method to obtain certifiably robust classifiers. Despite its remarkable performance, the method is associated with various serious problems such as ``certified accuracy waterfalls'', certification vs.\\ accuracy trade-off, or even fairness issues. Input-dependent smoothing approaches have been proposed to overcome these flaws. However, we demonstrate that these methods lack formal guarantees and so the resulting certificates are not justified. We show that the input-dependent smoothing, in general, suffers from the curse of dimensionality, forcing the variance function to have low semi-elasticity. On the other hand, we provide a theoretical and practical framework that enables the usage of input-dependent smoothing even in the presence of the curse of dimensionality, under strict restrictions. We present one concrete design of the smoothing variance and test it on CIFAR10 and MNIST. 
Our design solves some of the problems of classical smoothing and is formally grounded, yet further improvement of the design is still necessary.", "keywords": "randomized smoothing;certifiable robustness;deep learning;machine learning", "primary_area": "", "supplementary_material": "", "author": "Peter S\u00faken\u00edk;Aleksei Kuvshinov;Stephan G\u00fcnnemann", "authorids": "~Peter_S\u00faken\u00edk1;~Aleksei_Kuvshinov1;~Stephan_G\u00fcnnemann1", "gender": "M;M;M", "homepage": "https://research-explorer.app.ist.ac.at/person/d64d6a8d-eb8e-11eb-b029-96fd216dec3c;https://www.cs.cit.tum.de/daml/team/aleksei-kuvshinov/;http://www.daml.in.tum.de", "dblp": "304/2274;289/8401;43/3011", "google_scholar": "qEhrUDAAAAAJ;https://scholar.google.de/citations?user=tjVXaLIAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Peter_S\u00faken\u00edk1;~Aleksei_Kuvshinov1;~Stephan_G\u00fcnnemann1", "aff": "Institute of Science and Technology;Technical University Munich;Technical University Munich", "aff_domain": "ist.ac.at;tum.de;tum.de", "position": "PhD student;PhD student;Professor", "bibtex": "@misc{\ns{\\'u}ken{\\'\\i}k2022intriguing,\ntitle={Intriguing Properties of Input-dependent Randomized Smoothing},\nauthor={Peter S{\\'u}ken{\\'\\i}k and Aleksei Kuvshinov and Stephan G{\\\"u}nnemann},\nyear={2022},\nurl={https://openreview.net/forum?id=aJ9BXxg352}\n}", "github": "", "project": "", "reviewers": "jyXD;Q6Nf;j688;UXiJ", "site": "https://openreview.net/forum?id=aJ9BXxg352", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;2;4;4", "correctness": "4;3;4;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "1;1;1;3", "wc_summary_paper": "58;90;91;38", "wc_summary_review": "39;53;95;32", "wc_main_review": "506;314;354;504", "wc_review": "603;457;540;574", "wc_reply_reviewers": "363;0;353;16", "wc_reply_authors": "2233;858;1891;969", "reply_reviewers": "1;0;2;1", "reply_authors": "4;2;5;2", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 69.25, 22.398381637966615 ], "wc_summary_review_avg": [ 54.75, 24.437420076595647 ], "wc_main_review_avg": [ 419.5, 86.66458330829266 ], "wc_review_avg": [ 543.5, 54.69232121605372 ], "wc_reply_reviewers_avg": [ 183.0, 175.12709670407946 ], "wc_reply_authors_avg": [ 1487.75, 588.1527756459201 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.25, 1.299038105676658 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": 0.08084520834544431, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6155870339923015137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Institute of Science and Technology;Technical University of Munich", "aff_unique_dep": ";", "aff_unique_url": ";https://www.tum.de", "aff_unique_abbr": ";TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";Germany" }, { "id": "aJORhCrlYqu", "title": "ARMCMC: Online Bayesian Density Estimation of Model Parameters", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Although the Bayesian paradigm provides a rigorous framework to estimate the full probability distribution over unknown 
parameters, its online implementation can be challenging due to heavy computational costs. This paper proposes Adaptive Recursive Markov Chain Monte Carlo (ARMCMC), which estimates the full probability density of model parameters while alleviating shortcomings of conventional online approaches. These shortcomings include being able to account solely for Gaussian noise, being applicable only to systems with a linear-in-the-parameters (LIP) constraint, or requiring persistent excitation (PE). In ARMCMC, we propose a variable jump distribution, which depends on a temporal forgetting factor. This allows one to adjust the trade-off between exploitation and exploration, depending on whether there is an abrupt change to the parameter being estimated. We prove that ARMCMC requires fewer samples to achieve the same precision and reliability compared to conventional MCMC approaches. We demonstrate our approach on two challenging benchmarks: the estimation of parameters in a soft bending actuator and the Hunt-Crossley dynamic model. Our method shows at least 70\\% improvement in parameter point estimation accuracy and approximately 55\\% reduction in tracking error of the value of interest compared to recursive least squares and conventional MCMC.", "keywords": "Bayesian;Probabilistic approaches;MCMC;Hunt Crossley;parameter identification.", "primary_area": "", "supplementary_material": "/attachment/2fc0d21dadf1d455a982ead7621e67212fdb063d.zip", "author": "Pedram Agand;Mo Chen;Hamid Taghirad", "authorids": "~Pedram_Agand1;~Mo_Chen1;~Hamid_Taghirad1", "gender": "M;M;", "homepage": "https://upaspro.com/pedram-agand/;http://www.sfu.ca/~mochen/;", "dblp": "207/0639;;", "google_scholar": "https://scholar.google.ca/citations?user=URfHnY4AAAAJ;https://scholar.google.ca/citations?user=19UAgLUAAAAJ;", "orcid": ";0000-0001-8506-3665;", "linkedin": "agand/;;", "or_profile": "~Pedram_Agand1;~Mo_Chen1;~Hamid_Taghirad1", "aff": "Simon Fraser University;Simon Fraser University;", "aff_domain": "sfu.ca;sfu.ca;", "position": "PhD student;Assistant Professor;", "bibtex": "@misc{\nagand2022armcmc,\ntitle={{ARMCMC}: Online Bayesian Density Estimation of Model Parameters},\nauthor={Pedram Agand and Mo Chen and Hamid Taghirad},\nyear={2022},\nurl={https://openreview.net/forum?id=aJORhCrlYqu}\n}", "github": "", "project": "", "reviewers": "D12a;5LQM;6qvf;ZCbj", "site": "https://openreview.net/forum?id=aJORhCrlYqu", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;5;4;3", "correctness": "3;2;1;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "62;90;43;76", "wc_summary_review": "20;44;18;28", "wc_main_review": "557;262;304;509", "wc_review": "639;396;365;613", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 67.75, 17.383541066192468 ], "wc_summary_review_avg": [ 27.5, 10.23474474522936 ], "wc_main_review_avg": [ 408.0, 127.01771529987461 ], "wc_review_avg": [ 503.25, 123.58069226218147 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, 
"corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3mJKzUlr8-cJ:scholar.google.com/&scioq=ARMCMC:+Online+Bayesian+Density+Estimation+of+Model+Parameters&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Simon Fraser University", "aff_unique_dep": "", "aff_unique_url": "https://www.sfu.ca", "aff_unique_abbr": "SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "aJ_GcB4vcT0", "title": "Unsupervised Learning of Neurosymbolic Encoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a framework for the unsupervised learning of neurosymbolic encoders, i.e., encoders obtained by composing neural networks with symbolic programs from a domain-specific language. Such a framework can naturally incorporate symbolic expert knowledge into the learning process and lead to more interpretable and factorized latent representations than fully neural encoders. Also, models learned this way can have downstream impact, as many analysis workflows can benefit from having clean programmatic descriptions. We ground our learning algorithm in the variational autoencoding (VAE) framework, where we aim to learn a neurosymbolic encoder in conjunction with a standard decoder. Our algorithm integrates standard VAE-style training with modern program synthesis techniques. We evaluate our method on learning latent representations for real-world trajectory data from animal biology and sports analytics. We show that our approach offers significantly better separation than standard VAEs and leads to practical gains on downstream tasks.", "keywords": "unsupervised learning;representation learning;neurosymbolic program synthesis", "primary_area": "", "supplementary_material": "/attachment/6a16338b120615c04d8eca65310f639ae630f4d0.zip", "author": "Eric Zhan;Jennifer J. Sun;Ann Kennedy;Yisong Yue;Swarat Chaudhuri", "authorids": "~Eric_Zhan1;~Jennifer_J._Sun1;~Ann_Kennedy1;~Yisong_Yue1;~Swarat_Chaudhuri1", "gender": "M;F;M;M;F", "homepage": ";http://www.kennedylab.org;http://www.yisongyue.com;http://www.cs.utexas.edu/~swarat;https://jenjsun.com/", "dblp": ";148/5435;28/1244;37/6100;232/1563", "google_scholar": ";Wl4nCbQAAAAJ;tEk4qo8AAAAJ;9j6RBYQAAAAJ;", "orcid": ";0000-0002-3782-0518;0000-0001-9127-1989;0000-0002-6859-1391;", "linkedin": ";;yisongyue/;swarat-chaudhuri-609b3092/;jennifer-sun-224778a3/", "or_profile": "~Eric_Zhan1;~Ann_Kennedy1;~Yisong_Yue1;~Swarat_Chaudhuri1;~Jennifer_Jianing_Sun1", "aff": "California Institute of Technology;Northwestern University;Argo AI;University of Texas, Austin;California Institute of Technology", "aff_domain": "caltech.edu;northwestern.edu;argo.ai;utexas.edu;caltech.edu", "position": "PhD student;Assistant Professor;Principal Researcher;Associate Professor;PhD student", "bibtex": "@misc{\nzhan2022unsupervised,\ntitle={Unsupervised Learning of Neurosymbolic Encoders},\nauthor={Eric Zhan and Jennifer J. 
Sun and Ann Kennedy and Yisong Yue and Swarat Chaudhuri},\nyear={2022},\nurl={https://openreview.net/forum?id=aJ_GcB4vcT0}\n}", "github": "", "project": "", "reviewers": "G4ix;a6JR;cxLd;zt4F", "site": "https://openreview.net/forum?id=aJ_GcB4vcT0", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;2;0", "wc_summary_paper": "65;43;77;51", "wc_summary_review": "42;56;106;36", "wc_main_review": "301;345;427;189", "wc_review": "408;444;610;276", "wc_reply_reviewers": "135;0;0;0", "wc_reply_authors": "742;510;623;363", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 59.0, 13.038404810405298 ], "wc_summary_review_avg": [ 60.0, 27.53179979587241 ], "wc_main_review_avg": [ 315.5, 85.89965075598387 ], "wc_review_avg": [ 434.5, 119.07455647618428 ], "wc_reply_reviewers_avg": [ 33.75, 58.45671475544961 ], "wc_reply_authors_avg": [ 559.5, 140.00089285429576 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8192863692497647267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "California Institute of Technology;Northwestern University;Argo AI;University of Texas at Austin", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.caltech.edu;https://www.northwestern.edu;https://www.argo.ai;https://www.utexas.edu", "aff_unique_abbr": "Caltech;NU;Argo AI;UT Austin", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Pasadena;;Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "aKZeBGUJXlH", "title": "Gradient Broadcast Adaptation: Defending against the backdoor attack in pre-trained models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-trained language models (e.g., BERT, GPT-3) have revolutionized NLP research, and fine-tuning has become an indispensable step of downstream adaptation. However, covert attacks are an emerging threat to the pre-train-then-fine-tune learning paradigm. The backdoor attack is a typical example, in which the victim model fails on trigger-activated samples while behaving normally on others. These backdoors can survive the subsequent fine-tuning stage, continually posing a threat to the application of pre-trained models. In this paper, we propose a Gradient Broadcast Adaptation (GBA) method that prevents the model from producing attacker-controlled outputs in a trigger-anchor-free manner. We design prompt-based tuning that flexibly accesses rare tokens while providing a fair measure of distance in the word embedding space. The gradient broadcast alleviates lazy updating of potential triggers and purges the underlying abnormal weights. The GBA defense method is evaluated on five text-classification tasks against three state-of-the-art backdoor attacks. 
We find that our method can neutralize nearly 100% of embedded backdoors with negligible performance loss on clean data.", "keywords": "backdoor attacks;deep learning security;pre-trained models", "primary_area": "", "supplementary_material": "/attachment/c5cdd821bc42458cbf3f6e96b76e5a4384ad2383.zip", "author": "Tianyu Chen;Haoyi Zhou;He Mingrui;Jianxin Li", "authorids": "~Tianyu_Chen1;~Haoyi_Zhou1;~He_Mingrui1;~Jianxin_Li2", "gender": "M;M;M;M", "homepage": "https://github.com/Tarpelite;https://www.zhouhaoyi.com/;https://github.com/reynolds9808;http://myjianxin.github.io", "dblp": ";162/1287;;l/JianxinLi-2.html", "google_scholar": ";mbrFlN0AAAAJ;;EY2lqD0AAAAJ", "orcid": ";0000-0002-2393-3634;;0000-0001-5152-0055", "linkedin": ";haoyi-zhou-54a7a69a/;;", "or_profile": "~Tianyu_Chen1;~Haoyi_Zhou1;~He_Mingrui1;~Jianxin_Li3", "aff": "Beihang University;Beihang University;Beihang University;Beihang University ", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "position": "PhD student;Assistant Professor;MS student;Full Professor", "bibtex": "@misc{\nchen2022gradient,\ntitle={Gradient Broadcast Adaptation: Defending against the backdoor attack in pre-trained models},\nauthor={Tianyu Chen and Haoyi Zhou and He Mingrui and Jianxin Li},\nyear={2022},\nurl={https://openreview.net/forum?id=aKZeBGUJXlH}\n}", "github": "", "project": "", "reviewers": "KqXG;gVmg;pjTM", "site": "https://openreview.net/forum?id=aKZeBGUJXlH", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;3;4", "correctness": "2;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "43;70;96", "wc_summary_review": "97;21;68", "wc_main_review": "352;258;482", "wc_review": "492;349;646", "wc_reply_reviewers": "0;0;18", "wc_reply_authors": "665;653;444", "reply_reviewers": "0;0;1", "reply_authors": "2;2;2", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 69.66666666666667, 21.63844315615664 ], "wc_summary_review_avg": [ 62.0, 31.31559781748812 ], "wc_main_review_avg": [ 364.0, 91.84044134621014 ], "wc_review_avg": [ 495.6666666666667, 121.27745966263568 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 587.3333333333334, 101.47030216877361 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.1147078669352809, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7AoArEuhCUIJ:scholar.google.com/&scioq=Gradient+Broadcast+Adaptation:+Defending+against+the+backdoor+attack+in+pre-trained+models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "", "aff_unique_url": "http://www.buaa.edu.cn/", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "aM7l2S2s5pk", "title": "Offline-Online Reinforcement Learning: Extending Batch and Online RL", "track": "main", "status": "Reject", "tldr": "", "abstract": "Batch RL has seen a surge in popularity and is applicable in many practical scenarios where 
past data is available. Unfortunately, the performance of batch RL agents is limited in both theory and practice without strong assumptions on the data-collection process, e.g., sufficient coverage or a good policy. To enable better performance, we investigate the offline-online setting: The agent has access to a batch of data to train on but is also allowed to learn during the evaluation phase in an online manner. This is an extension to batch RL, allowing the agent to adapt to new situations without having to precommit to a policy. In our experiments, we find that agents trained in an offline-online manner can outperform agents trained only offline or online, sometimes by a large margin, for different dataset sizes and data-collection policies. Furthermore, we investigate the use of optimism vs. pessimism for value functions in the offline-online setting, motivated by their use in batch and online RL.", "keywords": "Reinforcement Learning;Batch RL;Online RL;Offline RL", "primary_area": "", "supplementary_material": "", "author": "Maryam Hashemzadeh;Wesley Chung;Martha White", "authorids": "~Maryam_Hashemzadeh1;~Wesley_Chung1;~Martha_White1", "gender": "F;;F", "homepage": ";;http://marthawhite.ca", "dblp": ";225/7749.html;60/7057", "google_scholar": "3h6myHkAAAAJ;https://scholar.google.ca/citations?user=y5e1qjQAAAAJ;t5zdD_IAAAAJ", "orcid": ";;0000-0002-5356-2950", "linkedin": "maryam-hashemzadeh-b76a7155/;;", "or_profile": "~Maryam_Hashemzadeh1;~Wesley_Chung1;~Martha_White1", "aff": "Montreal Institute for Learning Algorithms, Universit\u00e9 de Montr\u00e9al;McGill University;University of Alberta", "aff_domain": "mila.umontreal.ca;mcgill.ca;ualberta.ca", "position": "Researcher;PhD student;Associate Professor", "bibtex": "@misc{\nhashemzadeh2022offlineonline,\ntitle={Offline-Online Reinforcement Learning: Extending Batch and Online {RL}},\nauthor={Maryam Hashemzadeh and Wesley Chung and Martha White},\nyear={2022},\nurl={https://openreview.net/forum?id=aM7l2S2s5pk}\n}", "github": "", "project": "", "reviewers": "Vqyy;FHnd;NS43;aP1R", "site": "https://openreview.net/forum?id=aM7l2S2s5pk", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;5;3;3", "correctness": "3;4;3;4", "technical_novelty": "1;1;1;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "176;54;57;106", "wc_summary_review": "26;9;54;2", "wc_main_review": "254;380;217;240", "wc_review": "456;443;328;348", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 98.25, 49.40837479618207 ], "wc_summary_review_avg": [ 22.75, 20.04214309898021 ], "wc_main_review_avg": [ 272.75, 63.31419667025714 ], "wc_review_avg": [ 393.75, 56.38428415791053 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AXexeGy9xU8J:scholar.google.com/&scioq=Offline-Online+Reinforcement+Learning:+Extending+Batch+and+Online+RL&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e9 
de Montr\u00e9al;McGill University;University of Alberta", "aff_unique_dep": "Montreal Institute for Learning Algorithms;;", "aff_unique_url": "https://www.udemontreal.ca;https://www.mcgill.ca;https://www.ualberta.ca", "aff_unique_abbr": "UdeM;McGill;UAlberta", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "aMaQjwz5IXI", "title": "Style Equalization: Unsupervised Learning of Controllable Generative Sequence Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Controllable generative sequence models with the capability to extract and replicate the style of specific examples enable many applications, including narrating audiobooks in different voices, auto-completing and auto-correcting written handwriting, and generating missing training samples for downstream recognition tasks. However, typical training algorithms for these controllable sequence generative models suffer from the training-inference mismatch, where the same sample is used as content and style input during training but different samples are given during inference. In this paper, we tackle the training-inference mismatch encountered during unsupervised learning of controllable generative sequence models. By introducing a style transformation module that we call style equalization, we enable training using different content and style samples and thereby mitigate the training-inference mismatch. To demonstrate its generality, we applied style equalization to text-to-speech and text-to-handwriting synthesis on three datasets. Our models achieve state-of-the-art style replication with a similar mean style opinion score as the real data. Moreover, the proposed method enables style interpolation between sequences and generates novel styles.", "keywords": "Controllable sequence models;Text to speech;Text to handwriting", "primary_area": "", "supplementary_material": "/attachment/3be2c8a4cfc2cc12f8953835ac59ab30785bdb32.zip", "author": "Jen-Hao Rick Chang;Ashish Shrivastava;Hema Swetha Koppula;Xiaoshuai Zhang;Oncel Tuzel", "authorids": "~Jen-Hao_Rick_Chang1;~Ashish_Shrivastava1;~Hema_Swetha_Koppula2;~Xiaoshuai_Zhang1;~Oncel_Tuzel2", "gender": "M;M;F;M;M", "homepage": "https://rick-chang.github.io;;;https://i.buriedjet.com;http://www.onceltuzel.net", "dblp": "169/4938;09/7436-1.html;;175/5693;73/2943.html", "google_scholar": "F5Z9kN4AAAAJ;8t3gvfAAAAAJ;;cTGxuQQAAAAJ;Fe7NTe0AAAAJ", "orcid": ";;;;", "linkedin": ";;hemakoppula;;", "or_profile": "~Jen-Hao_Rick_Chang1;~Ashish_Shrivastava1;~Hema_Swetha_Koppula2;~Xiaoshuai_Zhang1;~Oncel_Tuzel2", "aff": "Apple;Apple;Apple;Google;Apple", "aff_domain": "apple.com;apple.com;apple.com;google.com;apple.com", "position": "Researcher;Researcher;Research Scientist;Intern;Principal Researcher", "bibtex": "@misc{\nchang2022style,\ntitle={Style Equalization: Unsupervised Learning of Controllable Generative Sequence Models },\nauthor={Jen-Hao Rick Chang and Ashish Shrivastava and Hema Swetha Koppula and Xiaoshuai Zhang and Oncel Tuzel},\nyear={2022},\nurl={https://openreview.net/forum?id=aMaQjwz5IXI}\n}", "github": "", "project": "", "reviewers": "A5yD;E4io;K825", "site": "https://openreview.net/forum?id=aMaQjwz5IXI", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "4;4;4", "correctness": "2;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "26;204;129", "wc_summary_review": "37;144;38", "wc_main_review": "470;439;161", "wc_review": "533;787;328", 
"wc_reply_reviewers": "283;43;27", "wc_reply_authors": "2106;754;84", "reply_reviewers": "1;1;1", "reply_authors": "3;1;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 119.66666666666667, 72.96726815649755 ], "wc_summary_review_avg": [ 73.0, 50.20624131187941 ], "wc_main_review_avg": [ 356.6666666666667, 138.93483684407195 ], "wc_review_avg": [ 549.3333333333334, 187.74154811572447 ], "wc_reply_reviewers_avg": [ 117.66666666666667, 117.09065813386745 ], "wc_reply_authors_avg": [ 981.3333333333334, 840.9840796484926 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9338198586933521323&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Apple;Google", "aff_unique_dep": "Apple Inc.;Google", "aff_unique_url": "https://www.apple.com;https://www.google.com", "aff_unique_abbr": "Apple;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "aNCZ8151BjY", "title": "Design and Evaluation for Robust Continual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Continual learning is the ability to learn from new experiences without forgetting\nprevious experiences. Different continual learning methods are each motivated\nby their own interpretation of the continual learning scenario, resulting in a wide\nvariety of experiment protocols, which hinders understanding and comparison of\nresults. Existing works emphasize differences in accuracy without considering\nthe effects of experimental settings. However, understanding the effects of experimental\nassumptions is the most crucial part of any evaluation, as the experimental\nprotocol may supply implicit information. We propose six rules as a guideline for\nexperimental design and execution to conduct robust continual learning evaluation\nfor better understanding of the methods. Using these rules, we demonstrate the\nimportance of experimental choices regarding the sequence of incoming data and\nthe sequence of the task oracle. Even when task oracle-based methods are desired,\nthe rules can guide experimental design to support better evaluation and understanding\nof the continual learning methods. 
Consistent application of these rules in evaluating continual learning methods makes explicit the effect and validity of many assumptions, thereby avoiding misleading conclusions.", "keywords": "Continual Learning;robust experimental protocol;task oracle;task identifier", "primary_area": "", "supplementary_material": "", "author": "Yeu-Shin Fu;Josh Milthorpe", "authorids": "~Yeu-Shin_Fu1;josh.milthorpe@anu.edu.au", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfu2022design,\ntitle={Design and Evaluation for Robust Continual Learning},\nauthor={Yeu-Shin Fu and Josh Milthorpe},\nyear={2022},\nurl={https://openreview.net/forum?id=aNCZ8151BjY}\n}", "github": "", "project": "", "reviewers": "vS9u;Ajqh;rfDE;tqw3", "site": "https://openreview.net/forum?id=aNCZ8151BjY", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "2;5;4;3", "correctness": "2;3;4;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "14;71;63;57", "wc_summary_review": "36;48;46;72", "wc_main_review": "372;271;341;343", "wc_review": "422;390;450;472", "wc_reply_reviewers": "0;82;153;0", "wc_reply_authors": "1457;1557;1093;695", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 51.25, 22.072324299900995 ], "wc_summary_review_avg": [ 50.5, 13.219304066402286 ], "wc_main_review_avg": [ 331.75, 37.157603528753036 ], "wc_review_avg": [ 433.5, 30.736785778607366 ], "wc_reply_reviewers_avg": [ 58.75, 63.888085743744114 ], "wc_reply_authors_avg": [ 1200.5, 339.1058094459604 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.40451991747794525, "corr_recommendation_correctness": 0.8528028654224417, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KNEYhUx52U0J:scholar.google.com/&scioq=Design+and+Evaluation+for+Robust+Continual+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Divisive Feature Normalization Improves Image Recognition Performance in AlexNet", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6633", "id": "aOX3a9q3RVV", "poster": "", "openreview": "https://openreview.net/forum?id=aOX3a9q3RVV", "slides": "https://iclr.cc/virtual/2022/poster/6633", "video": "https://iclr.cc/virtual/2022/poster/6633", "author_site": "Michelle Miller, SueYeon Chung, Ken Miller", "tldr": "", "abstract": "Local divisive normalization provides a phenomenological description of many nonlinear response properties of neurons across visual cortical areas. To gain insight into the utility of this operation, we studied the effects on AlexNet of a local divisive normalization between features, with learned parameters. Developing features were arranged in a line topology, with the influence between features determined by an exponential function of the distance between them. We compared an AlexNet model with no normalization or with canonical normalizations (Batch, Group, Layer) to the same models with divisive normalization added. 
Divisive normalization always improved performance for models with batch or group or no normalization, generally by 1-2 percentage points, on both the CIFAR-100 and ImageNet databases. To gain insight into mechanisms underlying the improved performance, we examined several aspects of network representations. In the early layers both canonical and divisive normalizations reduced manifold capacities and increased average dimension of the individual categorical manifolds. In later layers the capacity was higher and manifold dimension lower for models roughly in order of their performance improvement. Examining the sparsity of activations across a given layer, divisive normalization layers increased sparsity, while the canonical normalization layers decreased it. Nonetheless, in the final layer, the sparseness of activity increased in the order of no normalization, divisive, combined, and canonical. We also investigated how the receptive fields (RFs) in the first convolutional layer (where RFs are most interpretable) change with normalization. Divisive normalization enhanced RF Fourier power at low wavelengths, while divisive+canonical enhanced power at mid (batch, group) or low (layer) wavelengths, compared to canonical alone or no normalization. In conclusion, divisive normalization enhances image recognition performance, most strongly when combined with canonical normalization, and in doing so it reduces manifold capacity and sparsity in early layers while increasing them in final layers, and increases low- or mid-wavelength power in the first-layer receptive fields.", "keywords": "divisive normalization;AlexNet;ImageNet;CIFAR-100;manifold capacity;sparsity;receptive fields;Batch Normalization;Group Normalization;Layer Normalization", "primary_area": "", "supplementary_material": "", "author": "Michelle Miller;SueYeon Chung;Kenneth D. Miller", "authorids": "~Michelle_Miller3;~SueYeon_Chung1;~Kenneth_D._Miller2", "gender": "F;F;M", "homepage": "http://michellecmiller.com/;https://sites.google.com/site/sueyeonchung/;https://www.neurotheory.columbia.edu/Ken", "dblp": ";173/5418;89/1759", "google_scholar": ";h7yVv0QAAAAJ;-5ZxgGsAAAAJ", "orcid": ";;0000-0002-1433-0647", "linkedin": ";;", "or_profile": "~Michelle_Miller3;~SueYeon_Chung1;~Kenneth_Miller2", "aff": "University of Chicago;Flatiron Institute / Simons Foundation;Columbia University", "aff_domain": "uchicago.edu;simonsfoundation.org;columbia.edu", "position": "PhD student;Principal Investigator;Full Professor", "bibtex": "@inproceedings{\nmiller2022divisive,\ntitle={Divisive Feature Normalization Improves Image Recognition Performance in AlexNet},\nauthor={Michelle Miller and SueYeon Chung and Kenneth D. 
Miller},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aOX3a9q3RVV}\n}", "github": "", "project": "", "reviewers": "nq4A;1XvT;h6d9;Qotw", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "50;90;59;44", "wc_summary_review": "46;280;30;86", "wc_main_review": "398;545;267;1477", "wc_review": "494;915;356;1607", "wc_reply_reviewers": "73;192;216;0", "wc_reply_authors": "1886;1689;956;1102", "reply_reviewers": "1;1;4;0", "reply_authors": "4;3;4;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 60.75, 17.711225254058512 ], "wc_summary_review_avg": [ 110.5, 99.96374342730468 ], "wc_main_review_avg": [ 671.75, 475.1985769128523 ], "wc_review_avg": [ 843.0, 486.78794151046924 ], "wc_reply_reviewers_avg": [ 120.25, 88.04650759683771 ], "wc_reply_authors_avg": [ 1408.25, 389.0323733315776 ], "reply_reviewers_avg": [ 1.5, 1.5 ], "reply_authors_avg": [ 3.25, 0.82915619758885 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5893004841560835108&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=aOX3a9q3RVV", "email": "uchicago.edu;simonsfoundation.org;columbia.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Chicago;Flatiron Institute;Columbia University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uchicago.edu;https://flatironinstitute.org;https://www.columbia.edu", "aff_unique_abbr": "UChicago;Flatiron;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On the Pitfalls of Heteroscedastic Uncertainty Estimation with Probabilistic Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6755", "id": "aPOpXlnV1T", "poster": "", "openreview": "https://openreview.net/forum?id=aPOpXlnV1T", "slides": "https://iclr.cc/virtual/2022/poster/6755", "video": "https://iclr.cc/virtual/2022/poster/6755", "author_site": "Maximilian Seitzer, Arash Tavakoli, Dimitrije Antic, Georg Martius", "tldr": "", "abstract": "Capturing aleatoric uncertainty is a critical part of many machine learning systems. In deep learning, a common approach to this end is to train a neural network to estimate the parameters of a heteroscedastic Gaussian distribution by maximizing the logarithm of the likelihood function under the observed data. In this work, we examine this approach and identify potential hazards associated with the use of log-likelihood in conjunction with gradient-based optimizers. First, we present a synthetic example illustrating how this approach can lead to very poor but stable parameter estimates. Second, we identify the culprit to be the log-likelihood loss, along with certain conditions that exacerbate the issue. Third, we present an alternative formulation, termed $\\beta$-NLL, in which each data point's contribution to the loss is weighted by the $\\beta$-exponentiated variance estimate. 
We show that using an appropriate $\\beta$ largely mitigates the issue in our illustrative example. Fourth, we evaluate this approach on a range of domains and tasks and show that it achieves considerable improvements and performs more robustly concerning hyperparameters, both in predictive RMSE and log-likelihood criteria.", "keywords": "Uncertainty Estimation;Probabilistic Neural Networks;Aleatoric Uncertainty;Heteroscedastic Uncertainty;Analysis", "primary_area": "", "supplementary_material": "", "author": "Maximilian Seitzer;Arash Tavakoli;Dimitrije Antic;Georg Martius", "authorids": "~Maximilian_Seitzer1;~Arash_Tavakoli1;~Dimitrije_Antic1;~Georg_Martius1", "gender": ";M;M;M", "homepage": ";https://atavakol.github.io;https://anticdimi.github.io;https://uni-tuebingen.de/de/264672", "dblp": ";177/8682;316/9873;47/2706", "google_scholar": ";https://scholar.google.co.uk/citations?user=Jwq-Qx0AAAAJ;SXtAPAIAAAAJ;https://scholar.google.de/citations?user=b-JF-UIAAAAJ", "orcid": ";0000-0001-8481-3284;0009-0008-3135-1475;", "linkedin": ";arashtavakoli;dimitrije-anti\u0107-3986a422/;", "or_profile": "~Maximilian_Seitzer1;~Arash_Tavakoli1;~Dimitrije_Antic1;~Georg_Martius1", "aff": ";Max Planck Institute for Intelligent Systems;University of Tuebingen;Max Planck Institute for Intelligent Systems", "aff_domain": ";mpg.de;uni-tuebingen.de;tuebingen.mpg.de", "position": ";Postdoc;MS student;Assistant Professor", "bibtex": "@inproceedings{\nseitzer2022on,\ntitle={On the Pitfalls of Heteroscedastic Uncertainty Estimation with Probabilistic Neural Networks},\nauthor={Maximilian Seitzer and Arash Tavakoli and Dimitrije Antic and Georg Martius},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aPOpXlnV1T}\n}", "github": "", "project": "", "reviewers": "gjks;5Jtp;5nZv", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;5;4", "correctness": "4;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "186;89;92", "wc_summary_review": "108;89;23", "wc_main_review": "346;424;254", "wc_review": "640;602;369", "wc_reply_reviewers": "0;12;70", "wc_reply_authors": "633;1004;931", "reply_reviewers": "0;1;1", "reply_authors": "2;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 122.33333333333333, 45.03578823804711 ], "wc_summary_review_avg": [ 73.33333333333333, 36.42648609032841 ], "wc_main_review_avg": [ 341.3333333333333, 69.48061280354078 ], "wc_review_avg": [ 537.0, 119.80261544167834 ], "wc_reply_reviewers_avg": [ 27.333333333333332, 30.5650490302604 ], "wc_reply_authors_avg": [ 856.0, 160.47637416974084 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12019013391257516150&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=aPOpXlnV1T", "email": ";mpg.de;uni-tuebingen.de;tuebingen.mpg.de", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Tuebingen", "aff_unique_dep": "Intelligent 
Systems;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/", "aff_unique_abbr": "MPI-IS;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "aQE7-2-0Ud5", "title": "Boosting Semantic Segmentation via Feature Enhancement", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Semantic segmentation aims to map each pixel of an image into its correspond-ing semantic label. Most existing methods either mainly concentrate on high-levelfeatures or simple combination of low-level and high-level features from backboneconvolutional networks, which may weaken or even ignore the compensation be-tween different levels. To effectively take advantages from both shallow (textural)and deep (semantic) features, this paper proposes a novel plug-and-play module,namelyfeature enhancement module(FEM). The proposed FEM first aligns fea-tures from different stages through a learnable filter to extract desired information,and then enhances target features by taking in the extracted message. Two types ofFEM,i.e.detail FEM and semantic FEM, are customized. Concretely, the formertype strengthens textural information to protect key but tiny/low-contrast detailsfrom suppression/removal, while the other one highlights structural informationto boost segmentation performance. By equipping a given backbone network withFEMs, there might contain two information flows,i.e.detail flow and seman-tic flow. Extensive experiments on Cityscapes, PASCAL Context, and ADE20Kdatasets are conducted to validate the effectiveness of our design, and reveal itssuperiority over other state-of-the-art alternatives.", "keywords": "Deep learning;computer vision;semantic segmentation;feature enhancement", "primary_area": "", "supplementary_material": "", "author": "Liu Zhi;Xiaojie Guo;zhang yi", "authorids": "~Liu_Zhi1;~Xiaojie_Guo2;yizhang@tju.edu.cn", "gender": "M;M;", "homepage": "https://zhiliu.com;https://sites.google.com/view/xjguo;", "dblp": ";43/8066-1;", "google_scholar": ";RL7jPuQAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Liu_Zhi1;~Xiaojie_Guo2;yizhang@tju.edu.cn", "aff": "Tianjin University;;", "aff_domain": "tju.edu;;", "position": "MS student;;", "bibtex": "@misc{\nzhi2022boosting,\ntitle={Boosting Semantic Segmentation via Feature Enhancement},\nauthor={Liu Zhi and Xiaojie Guo and zhang yi},\nyear={2022},\nurl={https://openreview.net/forum?id=aQE7-2-0Ud5}\n}", "github": "", "project": "", "reviewers": "R2ZG;3kFj;MN6i;TNmL", "site": "https://openreview.net/forum?id=aQE7-2-0Ud5", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;4;5", "correctness": "3;3;3;1", "technical_novelty": "1;1;2;2", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "42;25;49;54", "wc_summary_review": "16;6;18;23", "wc_main_review": "298;136;107;252", "wc_review": "356;167;174;329", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 42.5, 10.965856099730654 ], "wc_summary_review_avg": [ 15.75, 6.179603547154137 ], "wc_main_review_avg": [ 198.25, 79.12134667711364 ], "wc_review_avg": [ 256.5, 86.5635604628183 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], 
"reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1115944615219217023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Tianjin University", "aff_unique_dep": "", "aff_unique_url": "http://www.tju.edu.cn", "aff_unique_abbr": "TJU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Do Not Escape From the Manifold: Discovering the Local Coordinates on the Latent Space of GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6326", "id": "aTzMi4yV_RO", "poster": "", "openreview": "https://openreview.net/forum?id=aTzMi4yV_RO", "slides": "https://iclr.cc/virtual/2022/poster/6326", "video": "https://iclr.cc/virtual/2022/poster/6326", "author_site": "Jaewoong Choi, Junho Lee, Changyeon Yoon, Jung Ho Park, Geonho Hwang, Myungjoo Kang", "tldr": "", "abstract": "The discovery of the disentanglement properties of the latent space in GANs motivated a lot of research to find the semantically meaningful directions on it. In this paper, we suggest that the disentanglement property is closely related to the geometry of the latent space. In this regard, we propose an unsupervised method for finding the semantic-factorizing directions on the intermediate latent space of GANs based on the local geometry. Intuitively, our proposed method, called $\\textit{Local Basis}$, finds the principal variation of the latent space in the neighborhood of the base latent variable. Experimental results show that the local principal variation corresponds to the semantic factorization and traversing along it provides strong robustness to image traversal. Moreover, we suggest an explanation for the limited success in finding the global traversal directions in the latent space, especially $\\mathcal{W}$-space of StyleGAN2. We show that $\\mathcal{W}$-space is warped globally by comparing the local geometry, discovered from Local Basis, through the metric on Grassmannian Manifold. 
The global warpage implies that the latent space is not well-aligned globally and therefore the global traversal directions are bound to show limited success on it.", "keywords": "generative adversarial network;disentanglement;semantic factorization;latent space control;image manipulation;grassmannian", "primary_area": "", "supplementary_material": "/attachment/9bb84002ff6250be9187cacf3f1f70785c8e90f5.zip", "author": "Jaewoong Choi;Junho Lee;Changyeon Yoon;Jung Ho Park;Geonho Hwang;Myungjoo Kang", "authorids": "~Jaewoong_Choi1;~Junho_Lee2;~Changyeon_Yoon1;~Jung_Ho_Park1;~Geonho_Hwang1;~Myungjoo_Kang1", "gender": "M;M;M;M;;", "homepage": ";https://sites.google.com/view/junopage;https://github.com/shinypond;https://github.com/forallexist;;http://ncia.snu.ac.kr/", "dblp": "63/11483;;;29/8446;;64/5657.html", "google_scholar": "e4ZLjREAAAAJ;s_orZYMAAAAJ;;;UJ_Mw6YAAAAJ;", "orcid": ";;;;;", "linkedin": ";junho-lee-457748229/;;;;", "or_profile": "~Jaewoong_Choi1;~Junho_Lee2;~Changyeon_Yoon1;~Jung_Ho_Park1;~Geonho_Hwang1;~Myungjoo_Kang1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nchoi2022do,\ntitle={Do Not Escape From the Manifold: Discovering the Local Coordinates on the Latent Space of {GAN}s},\nauthor={Jaewoong Choi and Junho Lee and Changyeon Yoon and Jung Ho Park and Geonho Hwang and Myungjoo Kang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aTzMi4yV_RO}\n}", "github": "", "project": "", "reviewers": "Pg5m;v9kk;sKqG", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;5;3", "correctness": "4;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;0", "wc_summary_paper": "46;75;105", "wc_summary_review": "53;27;243", "wc_main_review": "668;142;301", "wc_review": "767;244;649", "wc_reply_reviewers": "258;88;3", "wc_reply_authors": "1702;448;519", "reply_reviewers": "3;1;1", "reply_authors": "4;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 75.33333333333333, 24.087802353519553 ], "wc_summary_review_avg": [ 107.66666666666667, 96.28199323976536 ], "wc_main_review_avg": [ 370.3333333333333, 220.26398303449935 ], "wc_review_avg": [ 553.3333333333334, 223.97370877453946 ], "wc_reply_reviewers_avg": [ 116.33333333333333, 106.01362595859501 ], "wc_reply_authors_avg": [ 889.6666666666666, 575.1372782987457 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": -0.9999999999999998, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4704378958785987295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=aTzMi4yV_RO", "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", 
"aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "aUkOeKsGe2X", "title": "Autoencoder for Synthetic to Real Generalization: From Simple to More Complex Scenes", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning on synthetic data and transferring the resulting properties to their real counterparts is an important challenge for reducing costs and increasing safety in machine learning. In this work, we focus on autoencoder architectures and aim at learning latent space representations that are invariant to inductive biases caused by the domain shift between simulated and real images showing the same scenario. We train on synthetic images only, present approaches to increase generalizability and improve the preservation of the semantics to real datasets of increasing visual complexity. We show that pre-trained feature extractors (e.g. VGG) can be sufficient for generalization on images of lower complexity, but additional improvements are required for visually more complex scenes. To this end, we demonstrate that a sampling technique, which matches semantically important parts of the image, while randomizing the other parts, leads to salient feature extraction and a neglection of unimportant parts. This helps the generalization to real data and can further be improved via triplet-loss structuring of the latent space. We show that our approach outperforms classification models fine-tuned on the same data. ", "keywords": "Autoencoder;sim2real;mpi3d;sviro", "primary_area": "", "supplementary_material": "/attachment/c1e67e4f1fccfce7fb1ff84892875279a789c097.zip", "author": "Steve Dias Da Cruz;Bertram Taetz;Thomas Stifter;Didier Stricker", "authorids": "~Steve_Dias_Da_Cruz1;~Bertram_Taetz1;~Thomas_Stifter1;~Didier_Stricker1", "gender": "M;;;", "homepage": "https://cruz.lu/;;https://www.iee-sensing.com;", "dblp": ";;;", "google_scholar": "https://scholar.google.de/citations?user=qgFaB1YAAAAJ;;;", "orcid": "0000-0002-8322-934X;;;", "linkedin": "stevediasdacruz;;;", "or_profile": "~Steve_Dias_Da_Cruz1;~Bertram_Taetz1;~Thomas_Stifter1;~Didier_Stricker1", "aff": "TU Kaiserslautern;;IEE S.A.;", "aff_domain": "uni-kl.de;;iee-sensing.com;", "position": "PhD student;;Researcher;", "bibtex": "@misc{\ncruz2022autoencoder,\ntitle={Autoencoder for Synthetic to Real Generalization: From Simple to More Complex Scenes},\nauthor={Steve Dias Da Cruz and Bertram Taetz and Thomas Stifter and Didier Stricker},\nyear={2022},\nurl={https://openreview.net/forum?id=aUkOeKsGe2X}\n}", "github": "", "project": "", "reviewers": "nNTc;NBTc;SVvn;2Msi", "site": "https://openreview.net/forum?id=aUkOeKsGe2X", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;3", "correctness": "3;4;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "106;348;99;43", "wc_summary_review": "102;85;41;62", "wc_main_review": "528;287;336;500", "wc_review": "736;720;476;605", "wc_reply_reviewers": "53;77;110;94", "wc_reply_authors": "626;697;699;923", "reply_reviewers": "1;2;1;1", "reply_authors": "2;3;2;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], 
"wc_summary_paper_avg": [ 149.0, 117.45850331074375 ], "wc_summary_review_avg": [ 72.5, 23.070543990118654 ], "wc_main_review_avg": [ 412.75, 103.19732312419737 ], "wc_review_avg": [ 634.25, 104.4087520277874 ], "wc_reply_reviewers_avg": [ 83.5, 21.12463017427761 ], "wc_reply_authors_avg": [ 736.25, 111.75727045700427 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2674536800983647642&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Technische Universit\u00e4t Kaiserslautern;IEE S.A.", "aff_unique_dep": ";", "aff_unique_url": "https://www.tu-kl.de;", "aff_unique_abbr": "TU Kaiserslautern;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;Unknown" }, { "id": "aUoV6qhY_e", "title": "Specialized Transformers: Faster, Smaller and more Accurate NLP Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transformers have greatly advanced the state-of-the-art in Natural Language Processing (NLP) in recent years, but are especially demanding in terms of their computation and storage requirements. Transformers are first pre-trained on a large dataset, and subsequently fine-tuned for different downstream tasks. We observe that this design process leads to models that are not only over-parameterized for downstream tasks, but also contain elements that adversely impact accuracy of the downstream tasks.\nWe propose a Specialization framework to create optimized transformer models for a given downstream task. Our framework systematically uses accuracy-driven pruning, i.e., it identifies and prunes parts of the pre-trained Transformer that hinder performance on the downstream task. We also replace the dense soft-attention in selected layers with sparse hard-attention to help the model focus on the relevant parts of the input. In effect, our framework leads to models that are not only faster and smaller, but also more accurate. The large number of parameters contained in Transformers presents a challenge in the form of a large pruning design space. Further, the traditional iterative prune-retrain approach is not applicable to Transformers, since the fine-tuning data is often very small and re-training quickly leads to overfitting. To address these challenges, we propose a hierarchical, re-training-free pruning method with model- and task- specific heuristics. Our experiments on GLUE and SQUAD show that Specialized models are consistently more accurate (by up to 4.5\\%), while also being up to 2.5$\\times$ faster and up to 3.2$\\times$ smaller than the conventional fine-tuned models. In addition, we demonstrate that Specialization can be combined with previous efforts such as distillation or quantization to achieve further benefits.\nFor example, Specialized Q8BERT and DistilBERT models exceed the performance of BERT-Base, while being up to 3.7$\\times$ faster and up to 12.1$\\times$ smaller. \n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amrit Nagarajan;Sanchari Sen;Jacob R. 
Stevens;Anand Raghunathan", "authorids": "~Amrit_Nagarajan1;~Sanchari_Sen1;~Jacob_R._Stevens1;~Anand_Raghunathan1", "gender": "M;F;M;", "homepage": ";;;https://engineering.purdue.edu/~araghu/", "dblp": ";;;74/3747.html", "google_scholar": ";InjzEk8AAAAJ;CagpctsAAAAJ;OP7F8jEAAAAJ", "orcid": ";;;", "linkedin": "https://in.linkedin.com/in/amrit-nagarajan-8a99b0152;;jacobrstevens/;", "or_profile": "~Amrit_Nagarajan1;~Sanchari_Sen1;~Jacob_R._Stevens1;~Anand_Raghunathan1", "aff": "Purdue University;International Business Machines;Purdue University;Purdue University", "aff_domain": "purdue.edu;ibm.com;purdue.edu;purdue.edu", "position": "PhD student;Researcher;PhD student;Full Professor", "bibtex": "@misc{\nnagarajan2022specialized,\ntitle={Specialized Transformers: Faster, Smaller and more Accurate {NLP} Models},\nauthor={Amrit Nagarajan and Sanchari Sen and Jacob R. Stevens and Anand Raghunathan},\nyear={2022},\nurl={https://openreview.net/forum?id=aUoV6qhY_e}\n}", "github": "", "project": "", "reviewers": "Hc1r;GDsL;bSmL;ccHS", "site": "https://openreview.net/forum?id=aUoV6qhY_e", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "4;3;4;3", "correctness": "3;2;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "71;49;47;71", "wc_summary_review": "52;39;32;27", "wc_main_review": "361;241;205;114", "wc_review": "484;329;284;212", "wc_reply_reviewers": "28;79;25;0", "wc_reply_authors": "1782;1470;529;256", "reply_reviewers": "1;2;1;0", "reply_authors": "5;4;1;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 59.5, 11.521718621802913 ], "wc_summary_review_avg": [ 37.5, 9.394147114027968 ], "wc_main_review_avg": [ 230.25, 88.54765666012851 ], "wc_review_avg": [ 327.25, 99.65785217432693 ], "wc_reply_reviewers_avg": [ 33.0, 28.696689704563486 ], "wc_reply_authors_avg": [ 1009.25, 633.9279828971111 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 1.7853571071357126 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.23570226039551587, "corr_recommendation_correctness": 0.5000000000000001, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8855326963960857152&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Purdue University;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www.ibm.com", "aff_unique_abbr": "Purdue;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "aWA3-vIQDv", "title": "Universality of Deep Neural Network Lottery Tickets: A Renormalization Group Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "Foundational work on the Lottery Ticket Hypothesis has suggested an exciting corollary: winning tickets found in the context of one task can be transferred to similar tasks, possibly even across different architectures. While this has become of broad practical and theoretical interest, to date, there exists no detailed understanding of why winning ticket universality exists, or any way of knowing a priori whether a given ticket can be transferred to a given task. 
To address these outstanding open questions, we make use of renormalization group theory, one of the most successful tools in theoretical physics. We find that iterative magnitude pruning, the method used for discovering winning tickets, is a renormalization group scheme. This opens the door to a wealth of existing numerical and theoretical tools, some of which we leverage here to examine winning ticket universality in large-scale lottery ticket experiments, and sheds new light on the success iterative magnitude pruning has found in the field of sparse machine learning.", "keywords": "lottery ticket hypothesis;winning tickets;renormalization group", "primary_area": "", "supplementary_material": "", "author": "William T Redman;Tianlong Chen;Akshunna S. Dogra;Zhangyang Wang", "authorids": "~William_T_Redman1;~Tianlong_Chen1;~Akshunna_S._Dogra1;~Zhangyang_Wang1", "gender": "M;M;;M", "homepage": "https://wredman4.wixsite.com/wtredman;https://tianlong-chen.github.io;https://profiles.imperial.ac.uk/a.dogra21;https://vita-group.github.io", "dblp": "266/7985;;257/4975;119/4026", "google_scholar": "-SOfw0AAAAAJ;LE3ctn0AAAAJ;TL86PJMAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0001-7774-8197;0000-0002-1326-8976;", "linkedin": ";tianlong-chen-783862167/;;", "or_profile": "~William_T_Redman1;~Tianlong_Chen1;~Akshunna_S._Dogra1;~Zhangyang_Wang1", "aff": "UC Santa Barbara;University of Texas, Austin;Imperial College London;University of Texas, Austin", "aff_domain": "ucsb.edu;utexas.edu;imperial.ac.uk;utexas.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nredman2022universality,\ntitle={Universality of Deep Neural Network Lottery Tickets: A Renormalization Group Perspective},\nauthor={William T Redman and Tianlong Chen and Akshunna S.
Dogra and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=aWA3-vIQDv}\n}", "github": "", "project": "", "reviewers": "i8Gz;iuDr;cSyR;ud5a", "site": "https://openreview.net/forum?id=aWA3-vIQDv", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;5;3", "correctness": "2;2;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "105;69;239;119", "wc_summary_review": "75;25;52;67", "wc_main_review": "270;310;412;153", "wc_review": "450;404;703;339", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "231;392;361;192", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 133.0, 63.85922016435841 ], "wc_summary_review_avg": [ 54.75, 19.057478846898924 ], "wc_main_review_avg": [ 286.25, 92.73180414507205 ], "wc_review_avg": [ 474.0, 137.96919946132905 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 294.0, 84.35935040053354 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1872155372598713649&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of California, Santa Barbara;University of Texas at Austin;Imperial College London", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsb.edu;https://www.utexas.edu;https://www.imperial.ac.uk", "aff_unique_abbr": "UCSB;UT Austin;ICL", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Santa Barbara;Austin;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "aY5zi3TampL", "title": "Mimicking Randomized Controlled Trials to Learn End-to-End Patient Representations through Self-Supervised Covariate Balancing for Causal Treatment Effect Estimation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A causal effect can be defined as a comparison of outcomes that result from two or more alternative actions, with only one of the action-outcome pairs actually being observed. The gold standard for causal effect measurements is Randomized Controlled Trials (RCTs), in which a target population is explicitly defined and each study sample is randomly assigned to either the treatment or control cohorts. The great potential to derive actionable insights from causal relationships has led to a growing body of machine-learning research applying causal effect estimators to Real World Data (RWD) in the fields of healthcare, education, and economics. The primary difference between causal effect studies utilizing RWD and RCTs is that for RWD the study occurs after the treatment, and therefore we do not have control over the treatment assignment mechanism. This can lead to massive differences in covariate distributions between control and treatment samples, making a comparison of causal effects confounded and unreliable. Classical approaches have sought to solve this problem piecemeal, first by estimating treatment assignment and then treatment effect separately.
Recent work extended part of these approaches to a new family of representation-learning-based algorithms, revealing that the lower bound of the expected treatment effect estimation error is determined by two factors: the standard generalization error of the representation and the distance between the treated and control distributions induced by the representation. Here we argue that to achieve minimal dissimilarity in learning such distributions, as happens in RCTs, a specific auto-balancing self-supervised objective should be used. Experiments on real and simulated data revealed that our approach consistently produces less biased errors than previously published state-of-the-art methods. We demonstrate that our reduction in error can be directly attributed to the ability to learn representations that explicitly reduce such dissimilarity. Additionally, we show that error improvements between our approach and previously published state-of-the-art methods widen as a function of sample dissimilarity between treated and untreated covariate distributions. Thus, by learning representations that induce distributions analogous to RCTs, we provide empirical evidence to support the error bound dissimilarity hypothesis as well as providing a new state-of-the-art model for causal effect estimation.", "keywords": "causal treatment effect estimation;representation learning;self-supervised learning;end-to-end causal effect estimation;randomized controlled trials", "primary_area": "", "supplementary_material": "/attachment/25a6288c21510a61e9f9515b05217593bafab2f0.zip", "author": "Gino Tesei;Stefanos Giampanis;Beau Norgeot", "authorids": "~Gino_Tesei1;stefanos.giampanis@anthem.com;~Beau_Norgeot1", "gender": "M;;", "homepage": ";;", "dblp": "91/4937.html;;https://dblp.uni-trier.de/pers/hd/n/Norgeot:Beau", "google_scholar": "NsMP1kkAAAAJ;;jPYdk7MAAAAJ", "orcid": "0000-0002-0882-5125;;", "linkedin": "ginotesei/;;", "or_profile": "~Gino_Tesei1;stefanos.giampanis@anthem.com;~Beau_Norgeot1", "aff": "Anthem;;", "aff_domain": "anthem.com;;", "position": "AI Principal Data Scientist;;", "bibtex": "@misc{\ntesei2022mimicking,\ntitle={Mimicking Randomized Controlled Trials to Learn End-to-End Patient Representations through Self-Supervised Covariate Balancing for Causal Treatment Effect Estimation},\nauthor={Gino Tesei and Stefanos Giampanis and Beau Norgeot},\nyear={2022},\nurl={https://openreview.net/forum?id=aY5zi3TampL}\n}", "github": "", "project": "", "reviewers": "FRNd;t7dB;b7Dh;uhL3", "site": "https://openreview.net/forum?id=aY5zi3TampL", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;3;5;3", "correctness": "3;2;2;2", "technical_novelty": "1;1;1;2", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "38;33;204;151", "wc_summary_review": "49;32;22;4", "wc_main_review": "139;40;122;1305", "wc_review": "226;105;348;1460", "wc_reply_reviewers": "0;0;0;554", "wc_reply_authors": "692;608;722;1189", "reply_reviewers": "0;0;0;2", "reply_authors": "1;1;1;2", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 106.5, 73.45236551670749 ], "wc_summary_review_avg": [ 26.75, 16.29992331270304 ], "wc_main_review_avg": [ 401.5, 522.9772939621757 ], "wc_review_avg": [ 534.75, 541.0579335893708 ], "wc_reply_reviewers_avg": [ 138.5, 239.8890368482895 ], "wc_reply_authors_avg": [ 802.75, 226.8825411969815
], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KTjexBpL8S0J:scholar.google.com/&scioq=Mimicking+Randomized+Controlled+Trials+to+Learn+End-to-End+Patient+Representations+through+Self-Supervised+Covariate+Balancing+for+Causal+Treatment+Effect+Estimation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Anthem, Inc.", "aff_unique_dep": "", "aff_unique_url": "https://www.antheminc.com", "aff_unique_abbr": "Anthem", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Rethinking Class-Prior Estimation for Positive-Unlabeled Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7212", "id": "aYAA-XHKyk", "poster": "", "openreview": "https://openreview.net/forum?id=aYAA-XHKyk", "slides": "https://iclr.cc/virtual/2022/poster/7212", "video": "https://iclr.cc/virtual/2022/poster/7212", "author_site": "Yu Yao, Tongliang Liu, Bo Han, Mingming Gong, Gang Niu, Masashi Sugiyama, Dacheng Tao", "tldr": "", "abstract": "Given only positive (P) and unlabeled (U) data, PU learning can train a binary classifier without any negative data. It has two building blocks: PU class-prior estimation (CPE) and PU classification; the latter has been well studied while the former has received less attention. Hitherto, the distributional-assumption-free CPE methods rely on a critical assumption that the support of the positive data distribution cannot be contained in the support of the negative data distribution. If this is violated, those CPE methods will systematically overestimate the class prior; it is even worse that we cannot verify the assumption based on the data. In this paper, we rethink CPE for PU learning\u2014can we remove the assumption to make CPE always valid? We show an affirmative answer by proposing Regrouping CPE (ReCPE) that builds an auxiliary probability distribution such that the support of the positive data distribution is never contained in the support of the negative data distribution. ReCPE can work with any CPE method by treating it as the base method. Theoretically, ReCPE does not affect its base if the assumption already holds for the original probability distribution; otherwise, it reduces the positive bias of its base. 
Empirically, ReCPE improves all state-of-the-art CPE methods on various datasets, implying that the assumption has indeed been violated here.", "keywords": "Positive-Unlabeled Learning;Class-Prior Estimation", "primary_area": "", "supplementary_material": "/attachment/f6da1273d3affae318189d1e6f7dbc0b737e2070.zip", "author": "Yu Yao;Tongliang Liu;Bo Han;Mingming Gong;Gang Niu;Masashi Sugiyama;Dacheng Tao", "authorids": "~Yu_Yao3;~Tongliang_Liu1;~Bo_Han1;~Mingming_Gong1;~Gang_Niu1;~Masashi_Sugiyama1;~Dacheng_Tao1", "gender": "M;M;;M;M;M;", "homepage": "https://a5507203.github.io/;https://tongliang-liu.github.io/;;https://mingming-gong.github.io/;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/;", "dblp": "230/9625;150/6667;;98/8479;26/3367-1;35/1228;", "google_scholar": "OkcaMKAAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;", "orcid": ";;;0000-0001-7147-5589;;0000-0001-6658-6743;", "linkedin": "yu-yao-150377134/;;;;;;", "or_profile": "~Yu_Yao3;~Tongliang_Liu1;~Bo_Han1;~Mingming_Gong1;~Gang_Niu1;~Masashi_Sugiyama1;~Dacheng_Tao1", "aff": "University of Sydney;University of Sydney;;University of Melbourne;RIKEN;The University of Tokyo;", "aff_domain": "uni.sydney.edu.au;sydney.edu.au;;unimelb.edu.au;riken.jp;u-tokyo.ac.jp;", "position": "PhD student;Lecturer;;Assistant Professor;Research Scientist (tenured);Full Professor;", "bibtex": "@inproceedings{\nyao2022rethinking,\ntitle={Rethinking Class-Prior Estimation for Positive-Unlabeled Learning},\nauthor={Yu Yao and Tongliang Liu and Bo Han and Mingming Gong and Gang Niu and Masashi Sugiyama and Dacheng Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aYAA-XHKyk}\n}", "github": "", "project": "", "reviewers": "8SWo;6Jj8;JBHN;EaHM", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;4", "correctness": "4;4;2;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;0", "wc_summary_paper": "111;146;166;103", "wc_summary_review": "90;62;84;57", "wc_main_review": "277;255;470;144", "wc_review": "478;463;720;304", "wc_reply_reviewers": "0;62;0;21", "wc_reply_authors": "499;267;1448;70", "reply_reviewers": "0;1;0;1", "reply_authors": "2;1;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 131.5, 25.656383221335 ], "wc_summary_review_avg": [ 73.25, 14.02453207775575 ], "wc_main_review_avg": [ 286.5, 117.32540219406879 ], "wc_review_avg": [ 491.25, 148.62936284597333 ], "wc_reply_reviewers_avg": [ 20.75, 25.31180554602931 ], "wc_reply_authors_avg": [ 571.0, 528.6137531317172 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3812122096693998256&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=aYAA-XHKyk", "email": "uni.sydney.edu.au;sydney.edu.au;;unimelb.edu.au;riken.jp;u-tokyo.ac.jp;", "author_num": 7, 
"aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "University of Sydney;University of Melbourne;RIKEN;University of Tokyo", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.unimelb.edu.au;https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "USYD;UniMelb;RIKEN;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "Australia;Japan" }, { "id": "aYSlxlHKEA", "title": "Fully Decentralized Model-based Policy Optimization with Networked Agents", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model-based RL is an effective approach for reducing sample complexity. However, when it comes to multi-agent setting where the number of agent is large, the model estimation can be problematic due to the exponential increased interactions. In this paper, we propose a decentralized model-based reinforcement learning algorithm for networked multi-agent systems, where agents are cooperative and communicate locally with their neighbors. We analyze our algorithm theoretically and derive an upper bound of performance discrepancy caused by model usage, and provide a sufficient condition of monotonic policy improvement. In our experiments, we compare our algorithm against other strong multi-agent baselines and demonstrate that our algorithm not only matches the asymptotic performance of model-free methods but also largely increases its sample efficiency.", "keywords": "Reinforcement learning;model-based;multi-agent;deep learning;networked system control.", "primary_area": "", "supplementary_material": "", "author": "Yuchen Liu;Yali Du;Runji Lin;Hangrui Bi;Mingdong Wu;Jun Wang;Hao Dong", "authorids": "~Yuchen_Liu3;~Yali_Du1;~Runji_Lin1;~Hangrui_Bi1;~Mingdong_Wu1;~Jun_Wang2;~Hao_Dong3", "gender": "M;;;M;M;M;M", "homepage": ";;;https://github.com/20171130;https://aaronanima.github.io/;http://www0.cs.ucl.ac.uk/staff/jun.wang/;https://zsdonghao.github.io", "dblp": ";;;282/0828.html;315/5136;w/JunWang12;14/1525-3.html", "google_scholar": ";;;LndxL6MAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ;xLFL4sMAAAAJ", "orcid": ";;;;;;0000-0003-2261-9122", "linkedin": "%E9%9B%A8%E8%BE%B0-%E5%88%98-280007212/;;;;;;", "or_profile": "~Yuchen_Liu3;~Yali_Du1;~Runji_Lin1;~Hangrui_Bi1;~Mingdong_Wu1;~Jun_Wang2;~Hao_Dong3", "aff": "Peking University;;;;Center on Frontiers of Computing Studies,Peking University;University College London;Peking University", "aff_domain": "pku.edu.cn;;;;pku.edu.cn;ucl.ac.uk;pku.edu.cn", "position": "Undergrad student;;;;PhD student;Professor;Assistant Professor", "bibtex": "@misc{\nliu2022fully,\ntitle={Fully Decentralized Model-based Policy Optimization with Networked Agents},\nauthor={Yuchen Liu and Yali Du and Runji Lin and Hangrui Bi and Mingdong Wu and Jun Wang and Hao Dong},\nyear={2022},\nurl={https://openreview.net/forum?id=aYSlxlHKEA}\n}", "github": "", "project": "", "reviewers": "QuxM;2Aak;mC6k", "site": "https://openreview.net/forum?id=aYSlxlHKEA", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "3;4;3", "correctness": "3;4;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;1", "wc_summary_paper": "21;22;28", "wc_summary_review": "68;19;11", "wc_main_review": "563;185;271", "wc_review": "652;226;310", "wc_reply_reviewers": "66;0;0", "wc_reply_authors": "554;240;406", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 
3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 23.666666666666668, 3.0912061651652345 ], "wc_summary_review_avg": [ 32.666666666666664, 25.197001585285676 ], "wc_main_review_avg": [ 339.6666666666667, 161.77625151905193 ], "wc_review_avg": [ 396.0, 184.23897524682448 ], "wc_reply_reviewers_avg": [ 22.0, 31.11269837220809 ], "wc_reply_authors_avg": [ 400.0, 128.26015229472742 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bjwvVDqqJXAJ:scholar.google.com/&scioq=Fully+Decentralized+Model-based+Policy+Optimization+with+Networked+Agents&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Peking University;University College London", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.ucl.ac.uk", "aff_unique_abbr": "Peking U;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "id": "a_ASZbWsQp_", "title": "RVFR: Robust Vertical Federated Learning via Feature Subspace Recovery", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Vertical Federated Learning (VFL) is a distributed learning paradigm that allows multiple agents to jointly train a global model when each agent holds a different subset of features for the same sample(s). VFL is known to be vulnerable to backdoor attacks, where data from malicious agents are manipulated during training, and vulnerable to test-time attacks, where malicious agents manipulate the test data. However, unlike in standard horizontal federated learning, improving the robustness of VFL remains challenging. To this end, we propose RVFR, a novel robust VFL training and inference framework. The key to our approach is to ensure that, with a low-rank feature subspace, a small number of attacked samples, and other mild assumptions, RVFR recovers the underlying uncorrupted features with guarantees, thus sanitizing the model against a vast range of backdoor attacks. Further, RVFR also defends against test-time adversarial and missing feature attacks.
We conduct extensive experiments on several datasets and show that RVFR outperforms various baselines in robustness against diverse types of attacks.", "keywords": "Vertical Federated Learning;Adversarial Attacks;Backdoor Attacks;Feature Recovery;Robustness", "primary_area": "", "supplementary_material": "/attachment/ecf83a0689e3c3ac5f27454530159c998c359308.zip", "author": "Jing Liu;Chulin Xie;Krishnaram Kenthapadi;Oluwasanmi O Koyejo;Bo Li", "authorids": "~Jing_Liu13;~Chulin_Xie1;~Krishnaram_Kenthapadi1;~Oluwasanmi_O_Koyejo1;~Bo_Li19", "gender": "M;F;M;M;F", "homepage": "https://sites.google.com/a/eng.ucsd.edu/l0-magic/;;https://cs.stanford.edu/people/kngk/;https://cs.stanford.edu/~sanmi/;http://boli.cs.illinois.edu/", "dblp": "72/2590-9;245/4284;29/4781;14/8885;50/3402-26", "google_scholar": ";WeJnzAgAAAAJ;av5rGaEAAAAJ;EaaOeJwAAAAJ;K8vJkTcAAAAJ", "orcid": ";;0000-0003-1237-087X;0000-0002-4023-419X;", "linkedin": ";;krishnaramkenthapadi/;sanmi-koyejo-984754/;", "or_profile": "~Jing_Liu13;~Chulin_Xie1;~Krishnaram_Kenthapadi1;~Oluwasanmi_O_Koyejo1;~Bo_Li19", "aff": "Mitsubishi Electric Research Labs;University of Illinois, Urbana Champaign;Fiddler AI;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "merl.com;illinois.edu;fiddler.ai;illinois.edu;illinois.edu", "position": "Researcher;PhD student;Chief Scientist;Associate Professor;Assistant Professor", "bibtex": "@misc{\nliu2022rvfr,\ntitle={{RVFR}: Robust Vertical Federated Learning via Feature Subspace Recovery},\nauthor={Jing Liu and Chulin Xie and Krishnaram Kenthapadi and Oluwasanmi O Koyejo and Bo Li},\nyear={2022},\nurl={https://openreview.net/forum?id=a_ASZbWsQp_}\n}", "github": "", "project": "", "reviewers": "gmgW;KYnS;t2fE;YQ1h", "site": "https://openreview.net/forum?id=a_ASZbWsQp_", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "44;63;87;143", "wc_summary_review": "25;221;36;83", "wc_main_review": "744;212;298;1301", "wc_review": "813;496;421;1527", "wc_reply_reviewers": "0;0;42;0", "wc_reply_authors": "1106;1103;751;968", "reply_reviewers": "0;0;1;0", "reply_authors": "2;5;1;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.25, 37.18450618201081 ], "wc_summary_review_avg": [ 91.25, 78.01402117568354 ], "wc_main_review_avg": [ 638.75, 432.39991616557927 ], "wc_review_avg": [ 814.25, 437.0190928323384 ], "wc_reply_reviewers_avg": [ 10.5, 18.186533479473212 ], "wc_reply_authors_avg": [ 982.0, 144.54584047975922 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16152748825841521233&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Mitsubishi Electric Research Laboratories;University of Illinois Urbana-Champaign;Fiddler AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.merl.com;https://illinois.edu;https://www.fiddler.ai", "aff_unique_abbr": "MERL;UIUC;Fiddler AI", "aff_campus_unique_index": "1;1;1", "aff_campus_unique":
";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "a_nR4BPPJF1", "title": "Blessing of Class Diversity in Pre-training", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper presents a new statistical analysis aiming to explain the recent superior achievements of the pre-training techniques in natural language processing (NLP).\nWe prove that when the classes of the pre-training task (e.g., different words in masked language model task) are sufficiently diverse, in the sense that the least singular value of the last linear layer in pre-training is large, then pre-training can significantly improve the sample efficiency of downstream tasks. Inspired by our theory, we propose a new regularization technique that targets the multi-class pre-training: a \\emph{diversity regularizer only to the last linear layer} in the pre-training phase.\nOur empirical results show that this technique consistently boosts the performance of the pre-trained BERT model on different downstream tasks.", "keywords": "representation learning;statistical learning theory", "primary_area": "", "supplementary_material": "", "author": "Yulai Zhao;Jianshu Chen;Simon Shaolei Du", "authorids": "~Yulai_Zhao1;~Jianshu_Chen1;~Simon_Shaolei_Du1", "gender": "M;M;M", "homepage": "https://yulaizhao.com/;https://chenjianshu.github.io/;http://simonshaoleidu.com", "dblp": "64/6357-2;11/3124;176/5602", "google_scholar": "r-mWYj0AAAAJ;jQeFWdoAAAAJ;OttawxUAAAAJ", "orcid": "0000-0002-6930-3590;;", "linkedin": "yulaizhao/;;", "or_profile": "~Yulai_Zhao1;~Jianshu_Chen1;~Simon_Shaolei_Du1", "aff": "Tsinghua University;Tencent AI Lab;Meta Facebook", "aff_domain": "tsinghua.edu.cn;tencent.com;fb.com", "position": "Undergrad student;Principal Researcher;Visiting Professor", "bibtex": "@inproceedings{\nzhao2022blessing,\ntitle={Blessing of Class Diversity in Pre-training},\nauthor={Yulai Zhao and Jianshu Chen and Simon Shaolei Du},\nbooktitle={Submitted to The Tenth International Conference on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=a_nR4BPPJF1},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "AkLr;tQpS;2YHt;4MVe", "site": "https://openreview.net/forum?id=a_nR4BPPJF1", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;3;2;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "81;74;60;95", "wc_summary_review": "26;98;34;99", "wc_main_review": "563;352;161;320", "wc_review": "670;524;255;514", "wc_reply_reviewers": "657;129;0;0", "wc_reply_authors": "786;224;105;74", "reply_reviewers": "2;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.5, 12.619429464123963 ], "wc_summary_review_avg": [ 64.25, 34.368408458932166 ], "wc_main_review_avg": [ 349.0, 143.1694799878801 ], "wc_review_avg": [ 490.75, 149.46132442876316 ], "wc_reply_reviewers_avg": [ 196.5, 271.0355142781108 ], "wc_reply_authors_avg": [ 297.25, 287.6815730977568 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4383570037596046, "corr_recommendation_correctness": 
0.8892972917998875, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5036162941136586381&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2", "aff_unique_norm": "Tsinghua University;Tencent;Meta", "aff_unique_dep": ";Tencent AI Lab;Meta Platforms, Inc.", "aff_unique_url": "https://www.tsinghua.edu.cn;https://ai.tencent.com;https://meta.com", "aff_unique_abbr": "THU;Tencent AI Lab;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "id": "ab7fanwXWu", "title": "Accelerating Optimization using Neural Reparametrization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We tackle the problem of accelerating certain optimization problems related to steady states in ODE and energy minimization problems common in physics. \nWe reparametrize the optimization variables as the output of a neural network. \nWe then find the conditions under which this neural reparameterization could speed up convergence rates during gradient descent.\nWe find that to get the maximum speed up the neural network needs to be a special graph convolutional network (GCN) with its aggregation function constructed from the gradients of the loss function.\nWe show the utility of our method on two different optimization problems on graphs and point-clouds. ", "keywords": "optimization;graph neural networks;neural reparameterization;neural tangent kernel", "primary_area": "", "supplementary_material": "/attachment/394cf265cac0a05582b4b2ee7c17c27ad2a6abf7.zip", "author": "Nima Dehmamy;Csaba Both;Jianzhi Long;Rose Yu", "authorids": "~Nima_Dehmamy1;both.c@northeastern.edu;jlong@ucsd.edu;~Rose_Yu1", "gender": "M;;;F", "homepage": ";;;http://roseyu.com", "dblp": "198/1338;;;164/7314", "google_scholar": "gvHpUtgAAAAJ;;;", "orcid": "0000-0003-1617-5502;;;", "linkedin": "nima-dehmamy-57770a4a/;;;", "or_profile": "~Nima_Dehmamy1;both.c@northeastern.edu;jlong@ucsd.edu;~Rose_Yu1", "aff": "Northwestern University;;;University of California, San Diego", "aff_domain": "northwestern.edu;;;ucsd.edu", "position": "Research Assistant Professor;;;Assistant Professor", "bibtex": "@misc{\ndehmamy2022accelerating,\ntitle={Accelerating Optimization using Neural Reparametrization},\nauthor={Nima Dehmamy and Csaba Both and Jianzhi Long and Rose Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=ab7fanwXWu}\n}", "github": "", "project": "", "reviewers": "BGYL;ue9h;e51J;WWWw", "site": "https://openreview.net/forum?id=ab7fanwXWu", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "3;3;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "51;131;78;65", "wc_summary_review": "34;74;33;52", "wc_main_review": "268;750;153;266", "wc_review": "353;955;264;383", "wc_reply_reviewers": "0;0;0;18", "wc_reply_authors": "1209;1885;269;528", "reply_reviewers": "0;0;0;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 81.25, 30.26858932953434 ], "wc_summary_review_avg": [ 48.25, 16.67895380412093 ], "wc_main_review_avg": [ 359.25, 230.3512263913522 ], "wc_review_avg": [ 488.75, 272.723646756199 ], "wc_reply_reviewers_avg": [ 4.5, 7.794228634059948 ], "wc_reply_authors_avg": [ 
972.75, 628.7051673877032 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.676481425202546, "corr_recommendation_correctness": 0.676481425202546, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:81GfAo5vbD8J:scholar.google.com/&scioq=Accelerating+Optimization+using+Neural+Reparametrization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Northwestern University;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.northwestern.edu;https://www.ucsd.edu", "aff_unique_abbr": "NU;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "ab7lBP7Fb60", "title": "Enforcing fairness in private federated learning via the modified method of differential multipliers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning with differential privacy, or private federated learning, provides a strategy to train machine learning models while respecting users' privacy. However, differential privacy can disproportionately degrade the performance of the models on under-represented groups, as these parts of the distribution are difficult to learn in the presence of noise. Existing approaches for enforcing fairness in machine learning models have considered the centralized setting, in which the algorithm has access to the users' data. This paper introduces an algorithm to enforce group fairness in private federated learning, where users' data does not leave their devices. First, the paper extends the modified method of differential multipliers to empirical risk minimization with fairness constraints, thus providing an algorithm to enforce fairness in the central setting. Then, this algorithm is extended to the private federated learning setting. The proposed algorithm, FPFL, is tested on a federated version of the Adult dataset and an \"unfair\" version of the FEMNIST dataset. 
The experiments on these datasets show how private federated learning accentuates unfairness in the trained models, and how FPFL is able to mitigate such unfairness.", "keywords": "Private federated learning;fairness", "primary_area": "", "supplementary_material": "", "author": "Borja Rodr\u00edguez G\u00e1lvez;Filip Granqvist;Rogier van Dalen;Matt Seigel", "authorids": "~Borja_Rodr\u00edguez_G\u00e1lvez1;~Filip_Granqvist1;~Rogier_van_Dalen2;~Matt_Seigel1", "gender": "M;;M;M", "homepage": "https://burklight.github.io/;;https://www.linkedin.com/in/matthew-seigel-92a8445/?originalSubdomain=uk;https://www.vandalen.uk", "dblp": "254/2966;;;97/4127", "google_scholar": "sq1BkUMAAAAJ;;;https://scholar.google.co.uk/citations?hl=en", "orcid": "0000-0002-0862-1333;;;0000-0002-9603-5771", "linkedin": "borja-rodriguez/;filip-granqvist-112017149/;matthew-seigel-92a8445/?originalSubdomain=uk;rogier-van-dalen/", "or_profile": "~Borja_Rodr\u00edguez_G\u00e1lvez1;~Filip_Granqvist1;~Matt_Seigel1;~Rogier_C._van_Dalen1", "aff": "Apple;Apple;Apple;Apple", "aff_domain": "apple.com;apple.com;apple.com;apple.com", "position": "Intern;Researcher;Researcher;Researcher", "bibtex": "@misc{\ng{\\'a}lvez2022enforcing,\ntitle={Enforcing fairness in private federated learning via the modified method of differential multipliers},\nauthor={Borja Rodr{\\'\\i}guez G{\\'a}lvez and Filip Granqvist and Rogier van Dalen and Matt Seigel},\nyear={2022},\nurl={https://openreview.net/forum?id=ab7lBP7Fb60}\n}", "github": "", "project": "", "reviewers": "cNjj;18Rw;jvmJ;FrAm", "site": "https://openreview.net/forum?id=ab7lBP7Fb60", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;3;3", "correctness": "2;3;2;4", "technical_novelty": "3;4;2;4", "empirical_novelty": "3;4;2;3", "wc_summary_paper": "19;82;59;91", "wc_summary_review": "52;59;28;42", "wc_main_review": "313;726;211;331", "wc_review": "384;867;298;464", "wc_reply_reviewers": "0;0;0;131", "wc_reply_authors": "1198;1360;923;864", "reply_reviewers": "0;0;0;2", "reply_authors": "2;2;2;3", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 62.75, 27.82422505659412 ], "wc_summary_review_avg": [ 45.25, 11.648497757221744 ], "wc_main_review_avg": [ 395.25, 196.36493449697173 ], "wc_review_avg": [ 503.25, 218.06120127156962 ], "wc_reply_reviewers_avg": [ 32.75, 56.72466394788073 ], "wc_reply_authors_avg": [ 1086.25, 202.15881751731732 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2013377171524139099&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Apple", "aff_unique_dep": "Apple Inc.", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "aboaN31kfW", "title": "Causal Triple Attention Time Series Forecasting", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Time series forecasting has historically been a key area of academic research and 
industrial applications. In multi-horizon and multi-series forecasting tasks, accurately capturing the local information in a sequence and effectively sharing global information across different sequences are very challenging, due to the complex dependencies over time in a long sequence and the heterogeneous nature across multiple time series. In this paper, from the perspective of causal inference, we give a theoretical analysis of these difficulties and establish a causal graph to identify the confounding relationship that generates harmful bias and misleads the time series model into capturing spurious correlations. We propose a causal triple attention time series forecasting model with three interpretable attention modules, which leverages the front-door adjustment to remove the confounding effect and help the model effectively utilize the local and global temporal information. We evaluate the performance of our model on four benchmark datasets and the results demonstrate its superiority over state-of-the-art methods.\n", "keywords": "Time series forecasting;causal inference;multi-horizon;multi-series forecasting tasks", "primary_area": "", "supplementary_material": "", "author": "Zhixuan Chu;tan yan;yue wu;yi xu;cheng zhang;Yulin kang", "authorids": "~Zhixuan_Chu1;yantan.yt@antgroup.com;yuyue.wy@antfin.com;haolin.xy@antgroup.com;chongye.zc@alibaba-inc.com;yulin.kyl@antgroup.com", "gender": "M;;;;;", "homepage": ";;;;;", "dblp": "258/1233;;;;;", "google_scholar": "a4IuTngAAAAJ;;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Zhixuan_Chu1;yantan.yt@antgroup.com;yuyue.wy@antfin.com;haolin.xy@antgroup.com;chongye.zc@alibaba-inc.com;yulin.kyl@antgroup.com", "aff": "Ant Group;;;;;", "aff_domain": "antgroup.com;;;;;", "position": "Researcher;;;;;", "bibtex": "@misc{\nchu2022causal,\ntitle={Causal Triple Attention Time Series Forecasting},\nauthor={Zhixuan Chu and tan yan and yue wu and yi xu and cheng zhang and Yulin kang},\nyear={2022},\nurl={https://openreview.net/forum?id=aboaN31kfW}\n}", "github": "", "project": "", "reviewers": "xLTc;negq;R8Tt", "site": "https://openreview.net/forum?id=aboaN31kfW", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "4;3;2", "correctness": "1;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "0;1;3", "wc_summary_paper": "47;107;80", "wc_summary_review": "39;70;32", "wc_main_review": "558;539;314", "wc_review": "644;716;426", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 78.0, 24.535688292770594 ], "wc_summary_review_avg": [ 47.0, 16.51262143533445 ], "wc_main_review_avg": [ 470.3333333333333, 110.81616408368512 ], "wc_review_avg": [ 595.3333333333334, 123.29188492714714 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:E7vYjPNSgqcJ:scholar.google.com/&scioq=Causal+Triple+Attention+Time+Series+Forecasting&hl=en&as_sdt=0,33",
"gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Ant Group", "aff_unique_dep": "", "aff_unique_url": "https://www.antgroup.com", "aff_unique_abbr": "Ant Group", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "acD4xGc7u7", "title": "Self-Supervised Learning of Motion-Informed Latents", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Siamese network architectures trained for self-supervised instance recognition can learn powerful visual representations that are useful in various tasks. Many such approaches work by simply maximizing the similarity between representations of augmented images of the same object. In this paper, we further expand on the success of these methods by studying an unusual training scheme for learning motion-informed representations. Our goal is to show that common Siamese networks can effectively be trained on video sequences to disentangle attributes related to pose and motion that are useful for video and non-video tasks, yet typically suppressed in usual training schemes. Unlike parallel efforts that focus on introducing new image-space operators for data augmentation, we argue that extending the augmentation strategy by using different frames of a video leads to more powerful representations. To show the effectiveness of this approach, we use the Objectron and UCF101 datasets to learn representations and evaluate them on pose estimation, action recognition, and object re-identification. We show that self-supervised learning using in-domain video sequences yields better results on different task than fine-tuning pre-trained networks on still images. Furthermore, we carefully validate our method against a number of baselines.", "keywords": "Representation learning;self-supervised learning;video representation learning;pose estimation", "primary_area": "", "supplementary_material": "/attachment/4a4b959c9f7df4973dad43460f9fd69edf51db54.zip", "author": "Rapha\u00ebl Jean;Pierre-Luc St-Charles;Soren Pirk;Simon Brodeur", "authorids": "raphael.jean@rocketmail.com;~Pierre-Luc_St-Charles3;~Soren_Pirk2;~Simon_Brodeur1", "gender": ";;;M", "homepage": ";;;https://simonbrodeur.com/", "dblp": ";;;", "google_scholar": ";30mr9vYAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "raphael.jean@rocketmail.com;~Pierre-Luc_St-Charles3;~Soren_Pirk2;~Simon_Brodeur1", "aff": ";Mila;;Menya Solutions, the AI Division of Levio", "aff_domain": ";mila.quebec;;levio.ca", "position": ";Researcher;;Researcher", "bibtex": "@misc{\njean2022selfsupervised,\ntitle={Self-Supervised Learning of Motion-Informed Latents},\nauthor={Rapha{\\\"e}l Jean and Pierre-Luc St-Charles and Soren Pirk and Simon Brodeur},\nyear={2022},\nurl={https://openreview.net/forum?id=acD4xGc7u7}\n}", "github": "", "project": "", "reviewers": "Dt6Y;94i4;1o42;hjGY", "site": "https://openreview.net/forum?id=acD4xGc7u7", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;4;5;4", "correctness": "2;2;2;3", "technical_novelty": "1;1;1;1", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "35;11;27;61", "wc_summary_review": "36;18;17;63", "wc_main_review": "49;431;344;623", "wc_review": "120;460;388;747", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], 
"wc_summary_paper_avg": [ 33.5, 18.07622748252522 ], "wc_summary_review_avg": [ 33.5, 18.634645153584223 ], "wc_main_review_avg": [ 361.75, 206.8663517829809 ], "wc_review_avg": [ 428.75, 223.18532097788153 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TU6hHe3L2v0J:scholar.google.com/&scioq=Self-Supervised+Learning+of+Motion-Informed+Latents&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Mila;Menya Solutions", "aff_unique_dep": "Quebec Artificial Intelligence Institute;AI Division", "aff_unique_url": "https://mila.quebec;", "aff_unique_abbr": "Mila;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", "aff_country_unique": "Canada;" }, { "id": "ad_F_z27pCx", "title": "A Discussion On the Validity of Manifold Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Dimensionality reduction (DR) and manifold learning (ManL) have been applied extensively in many machine learning tasks, including signal processing, speech recognition, and neuroinformatics. However, the understanding of whether DR and ManL models can generate valid learning results remains unclear. In this work, we investigate the validity of learning results of some widely used DR and ManL methods through the chart mapping function of a manifold. We identify a fundamental problem of these methods: the mapping functions induced by these methods violate the basic settings of manifolds, and hence they are not learning manifold in the mathematical sense. To address this problem, we provide a provably correct algorithm called fixed points Laplacian mapping (FPLM), that has the geometric guarantee to find a valid manifold representation (up to a homeomorphism). Combining one additional condition (orientation preserving), we discuss a sufficient condition for an algorithm to be bijective for any -simplex decomposition result on a -manifold. 
However, constructing such a mapping function and its computational method satisfying these conditions is still an open problem in mathematics.", "keywords": "Manifold learning;Dimensionality Reduction;Computational Geometry;Simplicial Complex", "primary_area": "", "supplementary_material": "", "author": "Dai Shi;Andi Han;Yi Guo;Junbin Gao", "authorids": "~Dai_Shi1;~Andi_Han1;~Yi_Guo3;~Junbin_Gao1", "gender": "M;M;;", "homepage": "https://github.com/EEthanShi;https://github.com/andyjm3;;https://www.sydney.edu.au/business/about/our-people/academic-staff/junbin-gao.html", "dblp": "96/8513;268/7976.html;24/3508-1;30/3983", "google_scholar": ";AKHQHs0AAAAJ;;https://scholar.google.com.au/citations?user=3-KJN8IAAAAJ", "orcid": "0000-0002-6600-4325;0000-0003-4655-655X;;0000-0001-9803-0256", "linkedin": ";;;", "or_profile": "~Dai_Shi1;~Andi_Han1;~Yi_Guo3;~Junbin_Gao1", "aff": "University of Sydney;University of Sydney;Western Sydney University;University of Sydney", "aff_domain": "sydney.edu.au;sydney.edu.au;wsu.edu.au;sydney.edu.au", "position": "Researcher;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nshi2022a,\ntitle={A Discussion On the Validity of Manifold Learning},\nauthor={Dai Shi and Andi Han and Yi Guo and Junbin Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=ad_F_z27pCx}\n}", "github": "", "project": "", "reviewers": "eseD;7AHA;dNQi", "site": "https://openreview.net/forum?id=ad_F_z27pCx", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;3", "correctness": "2;2;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;0;2", "wc_summary_paper": "96;110;98", "wc_summary_review": "15;101;71", "wc_main_review": "549;440;375", "wc_review": "660;651;544", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 101.33333333333333, 6.18241233033047 ], "wc_summary_review_avg": [ 62.333333333333336, 35.64017707899642 ], "wc_main_review_avg": [ 454.6666666666667, 71.78826892714126 ], "wc_review_avg": [ 618.3333333333334, 52.689868307125444 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=915226326014692706&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Sydney;Western Sydney University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sydney.edu.au;https://www.westernsydney.edu.au", "aff_unique_abbr": "USYD;WSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "id": "adjl32ogfqD", "title": "Learning Stochastic Shortest Path with Linear Function Approximation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the stochastic shortest path (SSP) problem in reinforcement learning with linear function approximation, where the transition kernel is 
represented as a linear mixture of unknown models. We call this class of SSP problems linear mixture SSP. We propose a novel algorithm for learning the linear mixture SSP, which can attain a $\\tilde O(dB_{\\star}^{1.5}\\sqrt{K/c_{\\min}})$ regret. Here $K$ is the number of episodes, $d$ is the dimension of the feature mapping in the mixture model, $B_{\\star}$ bounds the expected cumulative cost of the optimal policy, and $c_{\\min}>0$ is the lower bound of the cost function. Our algorithm also applies to the case when $c_{\\min} = 0$, where a $\\tilde O(K^{2/3})$ regret is guaranteed. To the best of our knowledge, this is the first algorithm with a sublinear regret guarantee for learning linear mixture SSP. To complement the regret upper bounds, we also prove a lower bound of $\\Omega(dB_{\\star} \\sqrt{K})$, which nearly matches our upper bound.", "keywords": "reinforcement learning;stochastic shortest path", "primary_area": "", "supplementary_material": "/attachment/f3597f58488117aeee65fb1e9796eec8dad23ffc.zip", "author": "Yifei Min;Jiafan He;Tianhao Wang;Quanquan Gu", "authorids": "~Yifei_Min1;~Jiafan_He1;~Tianhao_Wang1;~Quanquan_Gu1", "gender": ";M;M;M", "homepage": ";https://sites.google.com/g.ucla.edu/jiafan-he-homepage;https://tianhaowang.ttic.edu;http://web.cs.ucla.edu/~qgu/", "dblp": ";214/5785;145/3288-2;50/4597", "google_scholar": ";F3AXNBwAAAAJ;m45LD1kAAAAJ;GU9HgNAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yifei_Min1;~Jiafan_He1;~Tianhao_Wang1;~Quanquan_Gu1", "aff": ";University of California, Los Angeles;Yale University;University of California, Los Angeles", "aff_domain": ";ucla.edu;yale.edu;cs.ucla.edu", "position": ";PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nmin2022learning,\ntitle={Learning Stochastic Shortest Path with Linear Function Approximation},\nauthor={Yifei Min and Jiafan He and Tianhao Wang and Quanquan Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=adjl32ogfqD}\n}", "github": "", "project": "", "reviewers": "wXVK;q9Tm;sn7Z", "site": "https://openreview.net/forum?id=adjl32ogfqD", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "3;3;3", "correctness": "4;4;4", "technical_novelty": "2;3;2", "empirical_novelty": "0;1;0", "wc_summary_paper": "69;225;133", "wc_summary_review": "161;34;45", "wc_main_review": "365;309;353", "wc_review": "595;568;531", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1492;137;1108", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 0.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 142.33333333333334, 64.02777175222917 ], "wc_summary_review_avg": [ 80.0, 57.45142876088171 ], "wc_main_review_avg": [ 342.3333333333333, 24.073960113690386 ], "wc_review_avg": [ 564.6666666666666, 26.23398982660133 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 912.3333333333334, 570.216528074107 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12589791277500777414&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;0", "aff_unique_norm":
"University of California, Los Angeles;Yale University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.yale.edu", "aff_unique_abbr": "UCLA;Yale", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "ae7BJIOxkxH", "title": "Stingy Teacher: Sparse Logits Suffice to Fail Knowledge Distillation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Knowledge distillation (KD) aims to transfer the discrimination power of pre-trained teacher models to (more lightweight) student models. However, KD also poses the risk of intellectual properties (IPs) leakage of teacher models. Even if the teacher model is released as a black box, it can still be cloned through KD by imitating input-output behaviors. To address this unwanted effect of KD, the concept of Nasty Teacher was proposed recently. It is a special network that achieves nearly the same accuracy as a normal one, but significantly degrades the accuracy of student models trying to imitate it. Previous work builds the nasty teacher by retraining a new model and distorting its output distribution from the normal one via an adversarial loss. With this design, the ``nasty\" teacher tends to produce sparse and noisy logits. However, it is unclear why the distorted distribution of the logits is catastrophic to the student model. In addition, the retraining process used in Nasty Teacher is undesirable, not only degrading the performance of the teacher model but also limiting its applicability to large datasets. \n\nIn this paper, we provide a theoretical analysis of why the sparsity of logits is key to Nasty Teacher. We further propose Stingy Teacher, a much simpler yet more effective algorithm to prevent imitation through KD without incurring accuracy drop or requiring retraining. Stingy Teacher directly manipulates the logits of a standard pre-trained network by maintaining the values for a small subset of classes while zeroing out the rest. Extensive experiments on large-scale datasets and various teacher-student pairs demonstrate that our stingy teacher is highly effective and more catastrophic to student models than the Nasty Teacher. 
Code and pre-trained models will be released upon acceptance.", "keywords": "Knowledge Distillation;avoid knowledge leaking", "primary_area": "", "supplementary_material": "/attachment/5e9338b8b88fba199b186a3709828ee11dbcf445.zip", "author": "Haoyu Ma;Yifan Huang;Tianlong Chen;Hao Tang;Chenyu You;Zhangyang Wang;Xiaohui Xie", "authorids": "~Haoyu_Ma1;~Yifan_Huang2;~Tianlong_Chen1;~Hao_Tang8;~Chenyu_You1;~Zhangyang_Wang1;~Xiaohui_Xie2", "gender": "M;M;M;M;M;;", "homepage": "https://www.ics.uci.edu/~haoyum3/;https://huangyifan0501.github.io/;https://tianlong-chen.github.io;https://chenyuyou.me/;https://vita-group.github.io;https://www.ics.uci.edu/~xhx/;https://tanghaotommy.github.io/", "dblp": "144/1634;;;191/9432;119/4026;;", "google_scholar": "8jugwosAAAAJ;;LE3ctn0AAAAJ;hy_wB7cAAAAJ;pxFyKAIAAAAJ;1CR0meYAAAAJ;2X3D1-4AAAAJ", "orcid": "0000-0001-6646-2644;;0000-0001-7774-8197;0000-0001-8365-7822;;;", "linkedin": "haoyu-ma-53517915a/;;tianlong-chen-783862167/;chenyu-you-b07475a4/;;;", "or_profile": "~Haoyu_Ma1;~Yifan_Huang2;~Tianlong_Chen1;~Chenyu_You1;~Zhangyang_Wang1;~Xiaohui_Xie2;~Hao_Tang14", "aff": "Meta Platforms, Inc;Southeast University;University of Texas, Austin;Yale University;University of Texas, Austin;University of California, Irvine;Meta Platforms", "aff_domain": "fb.com;seu.edu.cn;utexas.edu;yale.edu;utexas.edu;uci.edu;meta.com", "position": "Intern;Undergrad student;PhD student;PhD student;Assistant Professor;Full Professor;Researcher", "bibtex": "@misc{\nma2022stingy,\ntitle={Stingy Teacher: Sparse Logits Suffice to Fail Knowledge Distillation},\nauthor={Haoyu Ma and Yifan Huang and Tianlong Chen and Hao Tang and Chenyu You and Zhangyang Wang and Xiaohui Xie},\nyear={2022},\nurl={https://openreview.net/forum?id=ae7BJIOxkxH}\n}", "github": "", "project": "", "reviewers": "7dGf;zuiu;AXLb;dEfv", "site": "https://openreview.net/forum?id=ae7BJIOxkxH", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;4;4", "correctness": "4;3;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;2;4", "wc_summary_paper": "73;58;148;28", "wc_summary_review": "39;41;277;18", "wc_main_review": "322;312;81;388", "wc_review": "434;411;506;434", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 76.75, 44.21184795956849 ], "wc_summary_review_avg": [ 93.75, 106.18233139275102 ], "wc_main_review_avg": [ 275.75, 116.16878883762196 ], "wc_review_avg": [ 446.25, 35.75174820900371 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.19245008972987526, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12605012673501458019&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;2;4;0", "aff_unique_norm": "Meta;Southeast University;University of Texas at Austin;Yale University;University of California, Irvine", "aff_unique_dep": "Meta Platforms, Inc;;;;", "aff_unique_url": "https://www.meta.com;https://www.seu.edu.cn/;https://www.utexas.edu;https://www.yale.edu;https://www.uci.edu", 
"aff_unique_abbr": "Meta;SEU;UT Austin;Yale;UCI", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Austin;Irvine", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "id": "aedexcMXbKK", "title": "Larger Model Causes Lower Classification Accuracy Under Differential Privacy: Reason and Solution", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Differential privacy (DP) is an essential technique for privacy-preserving, which works by adding random noise to the data. In deep learning, DP-stochastic gradient descent (SGD) is a popular technique to build privacy-preserving models. With a small noise, however, the large model (such as ResNet50) trained by DP-SGD cannot perform better than the small model (such as ResNet18). To better understand this phenomenon, we study high dimensional DP learning from the viewpoint of generalization. Theoretically, we first demonstrate that for the Gaussian mixture model with even small DP noise, if excess features are used, classification can be as bad as the random guessing since the noise accumulation for the estimation in high dimensional feature space. Then we propose a robust measure to select the important features, which trades off the model accuracy and privacy preserving. Moreover, the conditions under which important features can be selected by the proposed measure are established. Simulation on the real data (such as CIFAR-10) supports our theoretical results and reveals the advantage of the proposed classification and privacy preserving procedure.\n", "keywords": "Differential privacy;feature selection;generalization;high dimension.", "primary_area": "", "supplementary_material": "/attachment/b4d76b43c69cdd165c8a424f3c6f6b09b31a5fcd.zip", "author": "Yinchen Shen;Zhiguo Wang;Ruoyu Sun;Xiaojing Shen", "authorids": "~Yinchen_Shen1;~Zhiguo_Wang2;~Ruoyu_Sun1;shenxj@scu.edu.cn", "gender": "M;;;", "homepage": ";;https://ruoyus.github.io/;", "dblp": ";;30/9879-1;", "google_scholar": ";Xgk30P0AAAAJ;PsfzbCMAAAAJ;", "orcid": "0000-0001-7237-1343;;;", "linkedin": ";;;", "or_profile": "~Yinchen_Shen1;~Zhiguo_Wang2;~Ruoyu_Sun1;shenxj@scu.edu.cn", "aff": "Sichuan University;Sichuan University;University of Illinois, Urbana-Champaign;", "aff_domain": "scu.edu.cn;scu.edu.cn;uiuc.edu;", "position": "MS student;Associate Professor;Assistant Professor;", "bibtex": "@misc{\nshen2022larger,\ntitle={Larger Model Causes Lower Classification Accuracy Under Differential Privacy: Reason and Solution},\nauthor={Yinchen Shen and Zhiguo Wang and Ruoyu Sun and Xiaojing Shen},\nyear={2022},\nurl={https://openreview.net/forum?id=aedexcMXbKK}\n}", "github": "", "project": "", "reviewers": "Um8Y;Keai;buWr;V39p", "site": "https://openreview.net/forum?id=aedexcMXbKK", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;2", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "54;64;67;74", "wc_summary_review": "18;38;1;3", "wc_main_review": "126;234;232;179", "wc_review": "198;336;300;256", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.75, 7.189401922274203 ], "wc_summary_review_avg": [ 15.0, 14.815532390029054 ], 
"wc_main_review_avg": [ 192.75, 44.40368790990226 ], "wc_review_avg": [ 272.5, 51.504854140168185 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7608859102526822, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3lhphzJiyPwJ:scholar.google.com/&scioq=Larger+Model+Causes+Lower+Classification+Accuracy+Under+Differential+Privacy:+Reason+and+Solution&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Sichuan University;University of Illinois", "aff_unique_dep": ";", "aff_unique_url": "https://www.scu.edu.cn;https://illinois.edu", "aff_unique_abbr": "SCU;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "Evading Adversarial Example Detection Defenses with Orthogonal Projected Gradient Descent", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6295", "id": "af1eUDdUVz", "poster": "", "openreview": "https://openreview.net/forum?id=af1eUDdUVz", "slides": "https://iclr.cc/virtual/2022/poster/6295", "video": "https://iclr.cc/virtual/2022/poster/6295", "author_site": "Oliver Bryniarski, Nabeel Hingun, Pedro Pachuca, Vincent Wang, Nicholas Carlini", "tldr": "", "abstract": "Evading adversarial example detection defenses requires finding adversarial examples that must simultaneously (a) be misclassified by the model and (b) be detected as non-adversarial. We find that existing attacks that attempt to satisfy multiple simultaneous constraints often over-optimize against one constraint at the cost of satisfying another. We introduce Selective Projected Gradient Descent and Orthogonal Projected Gradient Descent, improved attack techniques to generate adversarial examples that avoid this problem by orthogonalizing the gradients when running standard gradient-based attacks. 
We use our technique to evade four state-of-the-art detection defenses, reducing their accuracy to 0% while maintaining a 0% detection rate.", "keywords": "Adversarial examples;adversarial attacks", "primary_area": "", "supplementary_material": "", "author": "Oliver Bryniarski;Nabeel Hingun;Pedro Pachuca;Vincent Wang;Nicholas Carlini", "authorids": "~Oliver_Bryniarski1;~Nabeel_Hingun1;~Pedro_Pachuca1;~Vincent_Wang1;~Nicholas_Carlini1", "gender": ";M;M;M;", "homepage": "https://obryniarski.github.io/;;;;http://nicholas.carlini.com", "dblp": ";;;;145/1806", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": "oliver-bryniarski/;nabeel-hingun-044a40198/;pedropachuca/;v-wang;", "or_profile": "~Oliver_Bryniarski1;~Nabeel_Hingun1;~Pedro_Pachuca1;~Vincent_Wang1;~Nicholas_Carlini1", "aff": ";University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California Berkeley;Google", "aff_domain": ";berkeley.edu;berkeley.edu;eecs.berkeley.edu;google.com", "position": ";Undergrad student;Undergrad student;Undergrad student;Researcher", "bibtex": "@inproceedings{\nbryniarski2022evading,\ntitle={Evading Adversarial Example Detection Defenses with Orthogonal Projected Gradient Descent},\nauthor={Oliver Bryniarski and Nabeel Hingun and Pedro Pachuca and Vincent Wang and Nicholas Carlini},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=af1eUDdUVz}\n}", "github": "", "project": "", "reviewers": "K9MM;ndnu;AXWZ;Tn7g", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;4;3", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "54;73;35;144", "wc_summary_review": "89;16;85;46", "wc_main_review": "931;95;79;192", "wc_review": "1074;184;199;382", "wc_reply_reviewers": "29;0;0;0", "wc_reply_authors": "620;421;124;334", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 76.5, 41.22196016688192 ], "wc_summary_review_avg": [ 59.0, 29.974989574643725 ], "wc_main_review_avg": [ 324.25, 352.9655613512457 ], "wc_review_avg": [ 459.75, 363.10354928036713 ], "wc_reply_reviewers_avg": [ 7.25, 12.55736835487436 ], "wc_reply_authors_avg": [ 374.75, 178.06090952255636 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6627043113889326245&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=af1eUDdUVz", "email": ";berkeley.edu;berkeley.edu;eecs.berkeley.edu;google.com", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "RelViT: Concept-guided Vision Transformer for 
Visual Relational Reasoning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6087", "id": "afoV8W3-IYp", "poster": "", "openreview": "https://openreview.net/forum?id=afoV8W3-IYp", "slides": "https://iclr.cc/virtual/2022/poster/6087", "video": "https://iclr.cc/virtual/2022/poster/6087", "author_site": "Xiaojian Ma, Weili Nie, Zhiding Yu, Huaizu Jiang, Chaowei Xiao, Yuke Zhu, Song-Chun Zhu, Anima Anandkumar", "tldr": "", "abstract": "Reasoning about visual relationships is central to how humans interpret the visual world. This task remains challenging for current deep learning algorithms since it requires addressing three key technical problems jointly: 1) identifying object entities and their properties, 2) inferring semantic relations between pairs of entities, and 3) generalizing to novel object-relation combinations, i.e., systematic generalization. In this work, we use vision transformers (ViTs) as our base model for visual reasoning and make better use of concepts defined as object entities and their relations to improve the reasoning ability of ViTs. Specifically, we introduce a novel concept-feature dictionary to allow flexible image feature retrieval at training time with concept keys. This dictionary enables two new concept-guided auxiliary tasks: 1) a global task for promoting relational reasoning, and 2) a local task for facilitating semantic object-centric correspondence learning. To examine the systematic generalization of visual reasoning models, we introduce systematic splits for the standard HICO and GQA benchmarks. We show the resulting model, Concept-guided Vision Transformer (or RelViT for short) significantly outperforms prior approaches on HICO and GQA by 16% and 13% in the original split, and by 43% and 18% in the systematic split. 
Our ablation analyses also reveal our model's compatibility with multiple ViT variants and robustness to hyper-parameters.", "keywords": "visual relational reasoning;representation learning;systematic generalization", "primary_area": "", "supplementary_material": "", "author": "Xiaojian Ma;Weili Nie;Zhiding Yu;Huaizu Jiang;Chaowei Xiao;Yuke Zhu;Song-Chun Zhu;Anima Anandkumar", "authorids": "~Xiaojian_Ma1;~Weili_Nie1;~Zhiding_Yu1;~Huaizu_Jiang1;~Chaowei_Xiao2;~Yuke_Zhu1;~Song-Chun_Zhu1;~Anima_Anandkumar1", "gender": ";M;;M;;M;M;", "homepage": ";https://weilinie.github.io/;;http://jianghz.me;;https://cs.utexas.edu/~yukez/;https://zhusongchun.net/;", "dblp": ";147/4786;;128/7890;;133/1772;10/10313;", "google_scholar": ";zW7BH7oAAAAJ;;0hHqYoAAAAAJ;;mWGyYMsAAAAJ;https://scholar.google.com.tw/citations?user=Al8dyb4AAAAJ;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Xiaojian_Ma1;~Weili_Nie1;~Zhiding_Yu1;~Huaizu_Jiang1;~Chaowei_Xiao2;~Yuke_Zhu1;~Song-Chun_Zhu1;~Anima_Anandkumar1", "aff": ";NVIDIA;;Northeastern University;;Computer Science Department, University of Texas, Austin;Peking University;", "aff_domain": ";nvidia.com;;northeastern.edu;;cs.utexas.edu;pku.edu.cn;", "position": ";Research Scientist;;Assistant Professor;;Assistant Professor;Full Professor;", "bibtex": "@inproceedings{\nma2022relvit,\ntitle={RelViT: Concept-guided Vision Transformer for Visual Relational Reasoning},\nauthor={Xiaojian Ma and Weili Nie and Zhiding Yu and Huaizu Jiang and Chaowei Xiao and Yuke Zhu and Song-Chun Zhu and Anima Anandkumar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=afoV8W3-IYp}\n}", "github": "", "project": "", "reviewers": "RiEe;fbWL;dCw4", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;2;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "76;70;125", "wc_summary_review": "91;10;67", "wc_main_review": "352;381;712", "wc_review": "519;461;904", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 90.33333333333333, 24.63511495586917 ], "wc_summary_review_avg": [ 56.0, 33.97057550292606 ], "wc_main_review_avg": [ 481.6666666666667, 163.29999659794512 ], "wc_review_avg": [ 628.0, 196.59264143570243 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10463631265009137162&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=afoV8W3-IYp", "email": ";nvidia.com;;northeastern.edu;;cs.utexas.edu;pku.edu.cn;", "author_num": 8, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "NVIDIA;Northeastern University;University of Texas at Austin;Peking University", "aff_unique_dep": "NVIDIA Corporation;;Computer Science Department;", "aff_unique_url": "https://www.nvidia.com;https://www.northeastern.edu;https://www.utexas.edu;http://www.pku.edu.cn", 
"aff_unique_abbr": "NVIDIA;NEU;UT Austin;Peking U", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;China" }, { "id": "agBJ7SYcUVb", "title": "DFSSATTEN: Dynamic Fine-grained Structured Sparse Attention Mechanism", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transformers are becoming mainstream solutions for various tasks like NLP and Computer vision. Despite their success, the quadratic complexity of their attention mechanism hinders them from applying to latency sensitive tasks. Tremendous efforts have been made to alleviate this problem, and many of them successfully reduce the asymptotic complexity to linear. Nevertheless, few of them achieve practical speedup over the original full attention, especially under the moderate sequence length. In this paper, we present DFSSATTEN, an attention mechanism that dynamically prunes the full attention weight matrix to the 50% fine-grained structured sparse pattern used by the sparse tensor core on NVIDIA A100 GPU. We provide both theoretical and empirical evidences that demonstrate DFSSAT- TEN is a good approximation of the full attention mechanism and can achieve speedups in wall-clock time under arbitrary sequence length. We evaluate our method on tasks from various domains under different sequence lengths from 256 to 4096. DFSSATTEN achieves 1.27 \u223c 1.89\u00d7 speedups over the full-attention mechanism with no accuracy loss.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/fcf42ba8d3e2ba12f8cd0b619a721d9f2601380d.zip", "author": "Zhaodong Chen;Liu Liu;Yuying Quan;Zheng Qu;Yufei Ding;Yuan Xie", "authorids": "~Zhaodong_Chen1;~Liu_Liu2;yuying_quan@ucsb.edu;~Zheng_Qu2;~Yufei_Ding1;~Yuan_Xie1", "gender": "M;M;;;;M", "homepage": "https://apuaachen.github.io/Zhaodong-Chen/;https://liuliu-cs.github.io;;;;https://www.ece.ucsb.edu/~yuanxie/", "dblp": ";74/7037-17;;;;", "google_scholar": "https://scholar.google.com;https://scholar.google.com/citations?hl=en;;mV2ceTMAAAAJ;;", "orcid": ";0000-0003-0792-8146;;;;", "linkedin": ";;;;;", "or_profile": "~Zhaodong_Chen1;~Liu_Liu2;yuying_quan@ucsb.edu;~Zheng_Qu2;~Yufei_Ding1;~Yuan_Xie1", "aff": "UC Santa Barbara;Rensselaer Polytechnic Institute;;UC Santa Barbara;;", "aff_domain": "ucsb.edu;rpi.edu;;ucsb.edu;;", "position": "PhD student;Assistant Professor;;PhD student;;", "bibtex": "@misc{\nchen2022dfssatten,\ntitle={{DFSSATTEN}: Dynamic Fine-grained Structured Sparse Attention Mechanism},\nauthor={Zhaodong Chen and Liu Liu and Yuying Quan and Zheng Qu and Yufei Ding and Yuan Xie},\nyear={2022},\nurl={https://openreview.net/forum?id=agBJ7SYcUVb}\n}", "github": "", "project": "", "reviewers": "V8Q5;cVkD;kbgy;zwV3", "site": "https://openreview.net/forum?id=agBJ7SYcUVb", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;3;5;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "21;74;76;92", "wc_summary_review": "36;60;28;47", "wc_main_review": "295;392;149;439", "wc_review": "352;526;253;578", "wc_reply_reviewers": "0;0;28;0", "wc_reply_authors": "2173;1796;998;1300", "reply_reviewers": "0;0;1;0", "reply_authors": "4;3;3;3", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.75, 26.76167969317322 ], 
"wc_summary_review_avg": [ 42.75, 12.028611723719408 ], "wc_main_review_avg": [ 318.75, 110.91071859833927 ], "wc_review_avg": [ 427.25, 130.8651500591353 ], "wc_reply_reviewers_avg": [ 7.0, 12.12435565298214 ], "wc_reply_authors_avg": [ 1566.75, 451.31107619911126 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.7071067811865476, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vB6-a8d1OdsJ:scholar.google.com/&scioq=DFSSATTEN:+Dynamic+Fine-grained+Structured+Sparse+Attention+Mechanism&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Santa Barbara;Rensselaer Polytechnic Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsb.edu;https://www.rpi.edu", "aff_unique_abbr": "UCSB;RPI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "WeakM3D: Towards Weakly Supervised Monocular 3D Object Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5913", "id": "ahi2XSHpAUZ", "poster": "", "openreview": "https://openreview.net/forum?id=ahi2XSHpAUZ", "slides": "https://iclr.cc/virtual/2022/poster/5913", "video": "https://iclr.cc/virtual/2022/poster/5913", "author_site": "Liang Peng, Senbo Yan, Boxi Wu, Zheng Yang, Xiaofei He, Deng Cai", "tldr": "", "abstract": "\tMonocular 3D object detection is one of the most challenging tasks in 3D scene understanding. Due to the ill-posed nature of monocular imagery, existing monocular 3D detection methods highly rely on training with the manually annotated 3D box labels on the LiDAR point clouds. This annotation process is very laborious and expensive. To dispense with the reliance on 3D box labels, in this paper we explore the weakly supervised monocular 3D detection. Specifically, we first detect 2D boxes on the image. Then, we adopt the generated 2D boxes to select corresponding RoI LiDAR points as the weak supervision. Eventually, we adopt a network to predict 3D boxes which can tightly align with associated RoI LiDAR points. This network is learned by minimizing our newly-proposed 3D alignment loss between the 3D box estimates and the corresponding RoI LiDAR points. We will illustrate the potential challenges of the above learning problem and resolve these challenges by introducing several effective designs into our method. 
Codes are available at https://github.com/SPengLiang/WeakM3D.\n", "keywords": "Computer vision;monocular 3D object detection;weakly supervised", "primary_area": "", "supplementary_material": "", "author": "Liang Peng;Senbo Yan;Boxi Wu;Zheng Yang;Xiaofei He;Deng Cai", "authorids": "~Liang_Peng3;~Senbo_Yan1;~Boxi_Wu1;~Zheng_Yang2;~Xiaofei_He2;~Deng_Cai4", "gender": "M;M;M;M;M;M", "homepage": "https://spengliang.github.io/;;http://wiki.zjulearning.org:8081/wiki/User:Wuboxi;https://www.linkedin.com/in/zheng-yang-5455774b/;https://person.zju.edu.cn/0007101;http://www.cad.zju.edu.cn/home/dengcai/", "dblp": "57/3505-1;;;59/5806-8;h/XiaofeiHe.html;c/DCai", "google_scholar": "_sJpS34AAAAJ;;;y8b7ARgAAAAJ;QLLFowsAAAAJ;vzxDyJoAAAAJ", "orcid": ";0000-0002-5051-0506;;0009-0009-2840-2494;0009-0001-9107-2354;", "linkedin": ";;;;;", "or_profile": "~Liang_Peng3;~Senbo_Yan1;~Boxi_Wu1;~Zheng_Yang2;~Xiaofei_He2;~Deng_Cai4", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;Fabu Inc;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;fabu.ai;zju.edu.cn;zju.edu.cn", "position": "PhD student;PhD student;PhD student;CTO;Professor;Professor", "bibtex": "@inproceedings{\npeng2022weakmd,\ntitle={WeakM3D: Towards Weakly Supervised Monocular 3D Object Detection},\nauthor={Liang Peng and Senbo Yan and Boxi Wu and Zheng Yang and Xiaofei He and Deng Cai},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ahi2XSHpAUZ}\n}", "github": "", "project": "", "reviewers": "D4LN;Tcr1;1Zpn;fzTD;Pqqk", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "4;5;4;4;4", "correctness": "4;3;3;3;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "3;3;2;2;3", "wc_summary_paper": "183;16;78;86;82", "wc_summary_review": "30;58;49;38;89", "wc_main_review": "415;329;543;178;557", "wc_review": "628;403;670;302;728", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 89.0, 53.56117997206559 ], "wc_summary_review_avg": [ 52.8, 20.448960853794013 ], "wc_main_review_avg": [ 404.4, 141.10364984648697 ], "wc_review_avg": [ 546.2, 164.4437897884867 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.2500000000000001, "corr_recommendation_correctness": -0.2500000000000001, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1602406100270508731&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=ahi2XSHpAUZ", "email": "zju.edu.cn;zju.edu.cn;zju.edu.cn;fabu.ai;zju.edu.cn;zju.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Zhejiang University;Fabu Inc", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;", "aff_unique_abbr": "ZJU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Neural Stochastic Dual Dynamic Programming", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6655", "id": "aisKPsMM3fg", "poster": "", "openreview": "https://openreview.net/forum?id=aisKPsMM3fg", "slides": "https://iclr.cc/virtual/2022/poster/6655", "video": "https://iclr.cc/virtual/2022/poster/6655", "author_site": "Hanjun Dai, Yuan Xue, Zia Syed, Dale Schuurmans, Bo Dai", "tldr": "", "abstract": "Stochastic dual dynamic programming (SDDP) is a state-of-the-art method for solving multi-stage stochastic optimization, widely used for modeling real-world process optimization tasks. Unfortunately, SDDP has a worst-case complexity that scales exponentially in the number of decision variables, which severely limits applicability to only low dimensional problems. To overcome this limitation, we extend SDDP by introducing a trainable neural model that learns to map problem instances to a piece-wise linear value function within intrinsic low-dimension space, which is architected specifically to interact with a base SDDP solver, so that can accelerate optimization performance on new instances. The proposed Neural Stochastic Dual Dynamic Programming ($$\\nu$$-SDDP) continually self-improves by solving successive problems. An empirical investigation demonstrates that $$\\nu$$-SDDP can significantly reduce problem solving cost without sacrificing solution quality over competitors such as SDDP and reinforcement learning algorithms, across a range of synthetic and real-world process optimization problems.", "keywords": "data-driven algorithm design;learning to optimize;multi-stage stochastic optimization;primal-dual dynamic programming", "primary_area": "", "supplementary_material": "/attachment/6b962962b0cd1d1b2d322ce2469fe12b573e4587.zip", "author": "Hanjun Dai;Yuan Xue;Zia Syed;Dale Schuurmans;Bo Dai", "authorids": "~Hanjun_Dai1;~Yuan_Xue5;~Zia_Syed1;~Dale_Schuurmans1;~Bo_Dai1", "gender": "M;F;M;;", "homepage": "https://hanjun-dai.github.io;;;;https://bo-dai.github.io/", "dblp": "144/7311;;;;64/2903", "google_scholar": "obpl7GQAAAAJ;jcatRRIAAAAJ;;;TIKl_foAAAAJ", "orcid": ";;;;0009-0002-8070-574X", "linkedin": "hanjun-dai;yuan-emily-xue-3483012;ziamsyed/;;", "or_profile": "~Hanjun_Dai1;~Yuan_Xue5;~Zia_Syed1;~Dale_Schuurmans1;~Bo_Dai1", "aff": "Google Research;Google;;;Google Brain", "aff_domain": "google.com;google.com;;;google.com", "position": "Researcher;Researcher;;;Research Scientist", "bibtex": "@inproceedings{\ndai2022neural,\ntitle={Neural Stochastic Dual Dynamic Programming},\nauthor={Hanjun Dai and Yuan Xue and Zia Syed and Dale Schuurmans and Bo Dai},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=aisKPsMM3fg}\n}", "github": "", "project": "", "reviewers": "RrLt;meaA;hKKH;doAC", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;2;5", "correctness": "3;4;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "88;86;54;111", "wc_summary_review": "55;77;19;65", "wc_main_review": "307;221;158;335", "wc_review": "450;384;231;511", "wc_reply_reviewers": "9;0;10;46", "wc_reply_authors": "843;819;273;595", "reply_reviewers": "1;0;1;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.75, 20.29008378494283 ], "wc_summary_review_avg": [ 54.0, 21.656407827707714 ], "wc_main_review_avg": [ 255.25, 
70.12266038877874 ], "wc_review_avg": [ 394.0, 104.27607587553341 ], "wc_reply_reviewers_avg": [ 16.25, 17.612140698961042 ], "wc_reply_authors_avg": [ 632.5, 228.9863533051697 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9501248800846618828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=aisKPsMM3fg", "email": "google.com;google.com;;;google.com", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "ajIC9wlTd52", "title": "Learning to Generalize Compositionally by Transferring Across Semantic Parsing Tasks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network models often generalize poorly to mismatched domains or distributions. In NLP, this issue arises in particular when models are expected to generalize compositionally, that is, to novel combinations of familiar words and constructions. We investigate learning representations that facilitate transfer learning from one compositional task to another: the representation and the task-specific layers of the models are strategically trained differently on a pre-finetuning task such that they generalize well on mismatched splits that require compositionality. We apply this method to semantic parsing, using three very different datasets, COGS, GeoQuery and SCAN, used alternately as the pre-finetuning and target task. Our method significantly improves compositional generalization over baselines on the test set of the target task, which is held out during fine-tuning. Ablation studies characterize the utility of the major steps in the proposed algorithm and support our hypothesis. 
", "keywords": "transfer learning;compositional generalization", "primary_area": "", "supplementary_material": "", "author": "Wang Zhu;Peter Shaw;Tal Linzen;Fei Sha", "authorids": "~Wang_Zhu1;~Peter_Shaw1;~Tal_Linzen1;~Fei_Sha3", "gender": "M;M;M;M", "homepage": "https://billzhu.me;http://www.ptshaw.com;http://tallinzen.net;http://feisha.org", "dblp": "223/4711-1;217/1471;169/3438;13/3601", "google_scholar": "dMkqNF8AAAAJ;SmGaQicAAAAJ;5mJDXjoAAAAJ;HDHOS0QAAAAJ", "orcid": "0000-0002-6821-4115;;;", "linkedin": ";;;", "or_profile": "~Wang_Zhu1;~Peter_Shaw1;~Tal_Linzen1;~Fei_Sha2", "aff": "University of Southern California;Google DeepMind;New York University;Google", "aff_domain": "usc.edu;google.com;nyu.edu;google.com", "position": "PhD student;Research Scientist;Assistant Professor;research scientist", "bibtex": "@misc{\nzhu2022learning,\ntitle={Learning to Generalize Compositionally by Transferring Across Semantic Parsing Tasks},\nauthor={Wang Zhu and Peter Shaw and Tal Linzen and Fei Sha},\nyear={2022},\nurl={https://openreview.net/forum?id=ajIC9wlTd52}\n}", "github": "", "project": "", "reviewers": "uYpx;K5oD;gjvP", "site": "https://openreview.net/forum?id=ajIC9wlTd52", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;4", "correctness": "3;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "146;163;179", "wc_summary_review": "33;148;52", "wc_main_review": "364;762;757", "wc_review": "543;1073;988", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "884;733;662", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 162.66666666666666, 13.474255287605159 ], "wc_summary_review_avg": [ 77.66666666666667, 50.33443707399096 ], "wc_main_review_avg": [ 627.6666666666666, 186.45166189182177 ], "wc_review_avg": [ 868.0, 232.41485896273213 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 759.6666666666666, 92.57189398276107 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2018359435978831281&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Southern California;Google;New York University", "aff_unique_dep": ";Google DeepMind;", "aff_unique_url": "https://www.usc.edu;https://deepmind.com;https://www.nyu.edu", "aff_unique_abbr": "USC;DeepMind;NYU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Los Angeles;;Mountain View", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "ajOSNLwqssu", "title": "Generating Antimicrobial Peptides from Latent Secondary Structure Space", "track": "main", "status": "Reject", "tldr": "", "abstract": "Antimicrobial peptides (AMPs) have shown promising results in broad-spectrum antibiotics and resistant infection treatments, which makes it attract plenty of attention in drug discovery. Recently, many researchers bring deep generative models to AMP design. 
However, few studies consider structure information during the generation, though it has shown crucial influence on antimicrobial activity in all AMP mechanism theories. In this paper, we propose LSSAMP that uses the multi-scale VQ-VAE to learn the positional latent spaces modeling the secondary structure. By sampling in the latent secondary structure space, we can generate peptides with ideal amino acids and secondary structures at the same time. Experimental results show that our LSSAMP can generate peptides with multiply ideal physical attributes and a high probability of being predicted as AMPs by public AMP prediction models.", "keywords": "Antimicrobial Peptides;Drug Discovery;Secondary Structure;VQ-VAE", "primary_area": "", "supplementary_material": "", "author": "Danqing Wang;Zeyu Wen;Lei Li;Hao Zhou", "authorids": "~Danqing_Wang1;~Zeyu_Wen2;~Lei_Li11;~Hao_Zhou5", "gender": "F;M;M;M", "homepage": ";;https://www.cs.cmu.edu/~leili;https://zhouh.github.io/", "dblp": "226/6524.html;;13/7007-5.html;63/778-12", "google_scholar": "https://scholar.google.com/citations?hl=en-US;Eq2pVG0AAAAJ;BYXqAlwAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;0000-0003-3095-9776;", "linkedin": ";;;", "or_profile": "~Danqing_Wang1;~Zeyu_Wen2;~Lei_Li11;~Hao_Zhou5", "aff": "ByteDance AI Lab;Huazhong University of Science and Technology;Computer Science Department, UC Santa Barbara;Bytedance", "aff_domain": "bytedance.com;hust.edu;cs.ucsb.edu;bytedance.com", "position": "Researcher;PhD student;Assistant Professor;Researcher", "bibtex": "@misc{\nwang2022generating,\ntitle={Generating Antimicrobial Peptides from Latent Secondary Structure Space},\nauthor={Danqing Wang and Zeyu Wen and Lei Li and Hao Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=ajOSNLwqssu}\n}", "github": "", "project": "", "reviewers": "n6Pg;s64x;bThw", "site": "https://openreview.net/forum?id=ajOSNLwqssu", "pdf_size": 0, "recommendation": "1;5;6", "confidence": "4;4;4", "correctness": "2;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "1;2;2", "wc_summary_paper": "53;126;121", "wc_summary_review": "23;84;29", "wc_main_review": "489;716;359", "wc_review": "565;926;509", "wc_reply_reviewers": "97;159;0", "wc_reply_authors": "975;674;459", "reply_reviewers": "1;1;0", "reply_authors": "3;2;1", "recommendation_avg": [ 4.0, 2.160246899469287 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 100.0, 33.29664647778612 ], "wc_summary_review_avg": [ 45.333333333333336, 27.45096638655105 ], "wc_main_review_avg": [ 521.3333333333334, 147.52702200689276 ], "wc_review_avg": [ 666.6666666666666, 184.79598360233794 ], "wc_reply_reviewers_avg": [ 85.33333333333333, 65.43359653538506 ], "wc_reply_authors_avg": [ 702.6666666666666, 211.6291305300121 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9819805060619656, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9301267409501667803&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "ByteDance;Huazhong University of Science and Technology;University of California, Santa Barbara", 
"aff_unique_dep": "AI Lab;;Computer Science Department", "aff_unique_url": "https://www.bytedance.com;http://www.hust.edu.cn;https://www.ucsb.edu", "aff_unique_abbr": "ByteDance;HUST;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Meta-Learning with Fewer Tasks through Task Interpolation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7140", "id": "ajXWF7bVR8d", "poster": "", "openreview": "https://openreview.net/forum?id=ajXWF7bVR8d", "slides": "https://iclr.cc/virtual/2022/poster/7140", "video": "https://iclr.cc/virtual/2022/poster/7140", "author_site": "Huaxiu Yao, Linjun Zhang, Chelsea Finn", "tldr": "", "abstract": "Meta-learning enables algorithms to quickly learn a newly encountered task with just a few labeled examples by transferring previously learned knowledge. However, the bottleneck of current meta-learning algorithms is the requirement of a large number of meta-training tasks, which may not be accessible in real-world scenarios. To address the challenge that available tasks may not densely sample the space of tasks, we propose to augment the task set through interpolation. By meta-learning with task interpolation (MLTI), our approach effectively generates additional tasks by randomly sampling a pair of tasks and interpolating the corresponding features and labels. Under both gradient-based and metric-based meta-learning settings, our theoretical analysis shows MLTI corresponds to a data-adaptive meta-regularization and further improves the generalization. Empirically, in our experiments on eight datasets from diverse domains including image recognition, pose prediction, molecule property prediction, and medical image classification, we find that the proposed general MLTI framework is compatible with representative meta-learning algorithms and consistently outperforms other state-of-the-art strategies.", "keywords": "meta-learning;task interpolation;meta-regularization", "primary_area": "", "supplementary_material": "", "author": "Huaxiu Yao;Linjun Zhang;Chelsea Finn", "authorids": "~Huaxiu_Yao1;~Linjun_Zhang1;~Chelsea_Finn1", "gender": "M;M;F", "homepage": "http://huaxiuyao.mystrikingly.com;;https://ai.stanford.edu/~cbfinn/", "dblp": "197/1635;;131/1783", "google_scholar": "A20BZnQAAAAJ;TUAzs3sAAAAJ;vfPE6hgAAAAJ", "orcid": ";;", "linkedin": "huaxiuyao/;;", "or_profile": "~Huaxiu_Yao1;~Linjun_Zhang1;~Chelsea_Finn1", "aff": "Computer Science Department, Stanford University;Rutgers University;Google", "aff_domain": "cs.stanford.edu;rutgers.edu;google.com", "position": "Postdoc;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nyao2022metalearning,\ntitle={Meta-Learning with Fewer Tasks through Task Interpolation},\nauthor={Huaxiu Yao and Linjun Zhang and Chelsea Finn},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ajXWF7bVR8d}\n}", "github": "", "project": "", "reviewers": "id95;A6CS;hkv8;maUk;a9eU", "pdf_size": 0, "recommendation": "8;8;8;8;8", "confidence": "3;4;3;4;3", "correctness": "3;4;3;3;4", "technical_novelty": "3;3;3;3;3", "empirical_novelty": "3;3;3;3;2", "wc_summary_paper": "116;79;99;66;78", "wc_summary_review": "44;73;53;82;47", "wc_main_review": "456;307;185;338;165", "wc_review": "616;459;337;486;290", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "430;715;208;794;744", "reply_reviewers": "0;0;0;0;0", 
"reply_authors": "1;1;1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 87.6, 17.715529910222838 ], "wc_summary_review_avg": [ 59.8, 15.011995203836165 ], "wc_main_review_avg": [ 290.2, 106.58217486990965 ], "wc_review_avg": [ 437.6, 115.35441040549772 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 578.2, 224.4035650340698 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17468967265592568520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=ajXWF7bVR8d", "email": "cs.stanford.edu;rutgers.edu;google.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Stanford University;Rutgers University;Google", "aff_unique_dep": "Computer Science Department;;Google", "aff_unique_url": "https://www.stanford.edu;https://www.rutgers.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Rutgers;Google", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "alGr3g3L9Jo", "title": "The Details Matter: Preventing Class Collapse in Supervised Contrastive Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Supervised contrastive learning optimizes a loss that pushes together embeddings of points from the same class while pulling apart embeddings of points from different classes. Class collapse\u2014when every point from the same class has the same embedding\u2014minimizes this loss but loses critical information that is not encoded in the class labels. For instance, the \u201ccat\u201d label does not capture unlabeled categories such as breeds, poses, or backgrounds (which we call \u201cstrata\u201d). As a result, class collapse produces embeddings that are less useful for downstream applications such as transfer learning and achieves sub-optimal generalization error when there are strata. We explore a simple modification to supervised contrastive loss that prevents class collapse by uniformly pulling apart individual points from the same class. More importantly, we introduce a theoretical framing to analyze this loss through a view of how it embeds strata of different sizes. We show that our loss maintains distinctions between strata in embedding space, even though it does not explicitly use strata labels. We empirically explore several downstream implications of this insight. Our loss produces embeddings that achieve lift on three downstream applications by distinguishing strata: 4.4 points on coarse-to-fine transfer learning, 2.5 points on worst-group robustness, and 1.0 points on minimal coreset construction. 
Our loss also produces more accurate models, with up to 4.0 points of lift across 9 tasks.", "keywords": "contrastive learning;supervised contrastive learning;transfer learning;robustness;noisy labels;coresets", "primary_area": "", "supplementary_material": "", "author": "Daniel Yang Fu;Mayee F Chen;Michael Zhang;Kayvon Fatahalian;Christopher R\u00e9", "authorids": "~Daniel_Yang_Fu1;~Mayee_F_Chen1;~Michael_Zhang4;~Kayvon_Fatahalian1;~Christopher_R\u00e91", "gender": ";;M;;", "homepage": ";;https://michaelzhang.xyz/;;", "dblp": ";;;;", "google_scholar": ";;DG_asaIAAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Daniel_Yang_Fu1;~Mayee_F_Chen1;~Michael_Zhang4;~Kayvon_Fatahalian1;~Christopher_R\u00e91", "aff": ";;Stanford University;;", "aff_domain": ";;stanford.edu;;", "position": ";;PhD student;;", "bibtex": "@misc{\nfu2022the,\ntitle={The Details Matter: Preventing Class Collapse in Supervised Contrastive Learning},\nauthor={Daniel Yang Fu and Mayee F Chen and Michael Zhang and Kayvon Fatahalian and Christopher R{\\'e}},\nyear={2022},\nurl={https://openreview.net/forum?id=alGr3g3L9Jo}\n}", "github": "", "project": "", "reviewers": "XuHp;Nc1Z;4ztP;eXgx", "site": "https://openreview.net/forum?id=alGr3g3L9Jo", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;2;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "131;128;56;26", "wc_summary_review": "39;117;38;40", "wc_main_review": "327;429;481;242", "wc_review": "497;674;575;308", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.25, 45.515793962096275 ], "wc_summary_review_avg": [ 58.5, 33.78239186321774 ], "wc_main_review_avg": [ 369.75, 92.24254712441542 ], "wc_review_avg": [ 513.5, 134.20599837563148 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14744237019303970983&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "alaQzRbCY9w", "title": "Bolstering Stochastic Gradient Descent with Model Building", "track": "main", "status": "Reject", "tldr": "", "abstract": "The stochastic gradient descent method and its variants constitute the core optimization algorithms that achieve good convergence rates for solving machine learning problems. These rates are obtained especially when these algorithms are fine-tuned for the application at hand. Although this tuning process can require large computational costs, recent work has shown that these costs can be reduced by line search methods that iteratively adjust the stepsize. We propose an alternative approach to stochastic line search by using a new algorithm based on forward step model building. This model building step incorporates second-order information that allows adjusting not only the stepsize but also the search direction. Noting that deep learning model parameters come in groups (layers of tensors), our method builds its model and calculates a new step for each parameter group. This novel diagonalization approach makes the selected step lengths adaptive. We provide a convergence rate analysis and experimentally show that the proposed algorithm achieves faster convergence and better generalization in most problems. Moreover, our experiments show that the proposed method is quite robust as it converges for a wide range of initial stepsizes.", "keywords": "Stochastic Line Search;Stochastic Model Building;Non-convex Stochastic Optimization;Unconstrained Optimization", "primary_area": "", "supplementary_material": "/attachment/a8a9f4846404b2f51b7d17e4999ef6eb858f2ae2.zip", "author": "Ilker Birbil;\u00d6zg\u00fcr Martin;G\u00f6nenc Onay;Figen \u00d6ztoprak", "authorids": "~Ilker_Birbil1;~\u00d6zg\u00fcr_Martin1;~G\u00f6nenc_Onay1;~Figen_\u00d6ztoprak1", "gender": "M;M;;", "homepage": ";;https://onayg.com;", "dblp": "29/1843;;213/5231;17/3444", "google_scholar": "https://scholar.google.com.tr/citations?user=aRzaChQAAAAJ;GjYt5_cAAAAJ;;", "orcid": "0000-0001-7472-7032;0000-0003-1605-1593;;", "linkedin": ";;;", "or_profile": "~Ilker_Birbil1;~\u00d6zg\u00fcr_Martin1;~G\u00f6nenc_Onay1;~Figen_\u00d6ztoprak1", "aff": "University of Amsterdam;Mimar Sinan Fine Arts University;D4C;", "aff_domain": "uva.nl;msgsu.edu.tr;d4c.ai;", "position": "Full Professor;Associate Professor;Researcher;", "bibtex": "@misc{\nbirbil2022bolstering,\ntitle={Bolstering Stochastic Gradient Descent with Model Building},\nauthor={Ilker Birbil and {\\\"O}zg{\\\"u}r Martin and G{\\\"o}nenc Onay and Figen {\\\"O}ztoprak},\nyear={2022},\nurl={https://openreview.net/forum?id=alaQzRbCY9w}\n}", "github": "", "project": "", "reviewers": "hEnG;4z74;8Xqy;xM1f", "site": "https://openreview.net/forum?id=alaQzRbCY9w", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;5", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;0;2", "wc_summary_paper": "50;114;85;36", "wc_summary_review": "39;49;49;16", "wc_main_review": "267;1208;175;444", "wc_review": "356;1371;309;496", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "47;211;122;147", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 71.25, 30.457962834043908 ], "wc_summary_review_avg": [ 38.25, 13.47915056670857 ], "wc_main_review_avg": [ 523.5, 406.84917352748795 ], "wc_review_avg": [ 633.0, 431.6010889698959 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 131.75, 58.717012015258405 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=849562719806869055&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff_unique_index": "0;1;2", "aff_unique_norm":
"University of Amsterdam;Mimar Sinan Fine Arts University;D4C", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uva.nl;https://www.msfau.edu.tr;", "aff_unique_abbr": "UvA;MSFAU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Netherlands;T\u00fcrkiye;" }, { "id": "anWCFENEc5H", "title": "Modeling Adversarial Noise for Adversarial Defense", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks have been demonstrated to be vulnerable to adversarial noise, promoting the development of defense against adversarial attacks. Motivated by the fact that adversarial noise contains well-generalizing features and that the relationship between adversarial data and natural data can help infer natural data and make reliable predictions, in this paper, we study to model adversarial noise by learning the transition relationship between adversarial labels (i.e. the flipped labels used to generate adversarial data) and natural labels (i.e. the ground truth labels of the natural data). Specifically, we introduce an instance-dependent transition matrix to relate adversarial labels and natural labels, which can be seamlessly embedded with the target model (enabling us to model stronger adaptive adversarial noise). Empirical evaluations demonstrate that our method could effectively improve adversarial accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dawei Zhou;Nannan Wang;Bo Han;Tongliang Liu", "authorids": "~Dawei_Zhou3;~Nannan_Wang1;~Bo_Han1;~Tongliang_Liu1", "gender": "M;M;M;M", "homepage": ";https://tongliang-liu.github.io/;https://bhanml.github.io/;", "dblp": "10/8359-1;150/6667;241/0472-3;39/3130-4", "google_scholar": "SRBn7oUAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ;https://scholar.google.com.hk/citations?user=7H-LIigAAAAJ", "orcid": ";;;0000-0002-0694-3603", "linkedin": ";;;", "or_profile": "~Nannan_Wang1;~Tongliang_Liu1;~bo_han2;~Zhou_Dawei1", "aff": "Xidian University;University of Sydney;Microsoft Research;Xidian University", "aff_domain": "xidian.edu.cn;sydney.edu.au;microsoft.com;xidian.edu.cn", "position": "Full Professor;Lecturer;Researcher;PhD student", "bibtex": "@misc{\nzhou2022modeling,\ntitle={Modeling Adversarial Noise for Adversarial Defense},\nauthor={Dawei Zhou and Nannan Wang and Bo Han and Tongliang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=anWCFENEc5H}\n}", "github": "", "project": "", "reviewers": "Ee6e;aEYo;9XAu", "site": "https://openreview.net/forum?id=anWCFENEc5H", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;3;5", "correctness": "3;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;2", "wc_summary_paper": "88;58;58", "wc_summary_review": "28;21;69", "wc_main_review": "284;285;246", "wc_review": "400;364;373", "wc_reply_reviewers": "67;260;103", "wc_reply_authors": "960;1173;671", "reply_reviewers": "1;1;1", "reply_authors": "4;3;3", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 68.0, 14.142135623730951 ], "wc_summary_review_avg": [ 39.333333333333336, 21.171259344267224 ], "wc_main_review_avg": [ 271.6666666666667, 18.153665072253467 ], "wc_review_avg": [ 379.0, 
15.297058540778355 ], "wc_reply_reviewers_avg": [ 143.33333333333334, 83.79472271900872 ], "wc_reply_authors_avg": [ 934.6666666666666, 205.72203468650497 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8660254037844387, "corr_recommendation_correctness": 0.5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1454560527652334703&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Xidian University;University of Sydney;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.sydney.edu.au;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Xidian;USYD;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "China;Australia;United States" }, { "id": "an_ndI09oVZ", "title": "Deep banach space kernels", "track": "main", "status": "Reject", "tldr": "", "abstract": "The recent success of deep learning has encouraged many researchers to explore the deep/concatenated variants of classical kernel methods. These include MLMKL, DGP, and DKL. Although these methods have proven to be quite useful in various real-world settings, they still suffer from the limitation of only utilizing kernels from Hilbert spaces. In this paper, we address these shortcomings by introducing a new class of concatenated kernel learning methods that use kernels from reproducing kernel Banach spaces (RKBSs). These spaces are among the most general spaces in which a reproducing kernel exists. We propose a construction framework for these deep RKBS models and then provide a representer theorem for regularized learning problems. We also describe the relationship with its deep RKHS variant as well as standard Deep Gaussian Processes.
In the end, we construct and implement a two-layer deep RKBS model and demonstrate it on a range of machine learning tasks.", "keywords": "RKBS;RKHS;concatenated kernel learning;representation learning;deep learning;MLMKL;Deep Gaussian Processes;gaussian processes;kernel machines", "primary_area": "", "supplementary_material": "", "author": "Mrityunjay Bhardwaj", "authorids": "~Mrityunjay_Bhardwaj1", "gender": "M", "homepage": "https://mrityunjay.ml", "dblp": "", "google_scholar": "https://scholar.google.com/citations?view_op=list_works", "orcid": "", "linkedin": "mrityunjay-bhardwaj-53b00317b/", "or_profile": "~Mrityunjay_Bhardwaj1", "aff": "Jupiter AI Labs", "aff_domain": "juppiterailabs.com", "position": "Researcher", "bibtex": "@misc{\nbhardwaj2022deep,\ntitle={Deep banach space kernels},\nauthor={Mrityunjay Bhardwaj},\nyear={2022},\nurl={https://openreview.net/forum?id=an_ndI09oVZ}\n}", "github": "", "project": "", "reviewers": "57b6;5Wv8;kbJr", "site": "https://openreview.net/forum?id=an_ndI09oVZ", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "5;3;4", "correctness": "2;4;4", "technical_novelty": "1;1;3", "empirical_novelty": "1;1;2", "wc_summary_paper": "17;25;37", "wc_summary_review": "13;6;46", "wc_main_review": "375;79;321", "wc_review": "405;110;404", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 26.333333333333332, 8.219218670625303 ], "wc_summary_review_avg": [ 21.666666666666668, 17.441967269268172 ], "wc_main_review_avg": [ 258.3333333333333, 128.7098373172083 ], "wc_review_avg": [ 306.3333333333333, 138.82923163208662 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xpUfglbHn1UJ:scholar.google.com/&scioq=Deep+banach+space+kernels&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Jupiter AI Labs", "aff_unique_dep": "", "aff_unique_url": "https://jupiter.ai", "aff_unique_abbr": "Jupiter AI", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Boosted Curriculum Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6802", "id": "anbBFlX1tJ1", "poster": "", "openreview": "https://openreview.net/forum?id=anbBFlX1tJ1", "slides": "https://iclr.cc/virtual/2022/poster/6802", "video": "https://iclr.cc/virtual/2022/poster/6802", "author_site": "Pascal Klink, Carlo D'Eramo, Jan Peters, Joni Pajarinen", "tldr": "", "abstract": "Curriculum value-based reinforcement learning (RL) solves a complex target task by reusing action-values across a tailored sequence of related tasks of increasing difficulty. However, finding an exact way of reusing action-values in this setting is still a poorly understood problem. 
In this paper, we introduce the concept of boosting to curriculum value-based RL, by approximating the action-value function as a sum of residuals trained on each task. This approach, which we refer to as boosted curriculum reinforcement learning (BCRL), has the benefit of naturally increasing the representativeness of the functional space by adding a new residual each time a new task is presented. This procedure allows reusing previous action-values while promoting expressiveness of the action-value function. We theoretically study BCRL as an approximate value iteration algorithm, discussing advantages over regular curriculum RL in terms of approximation accuracy and convergence to the optimal action-value function. Finally, we provide detailed empirical evidence of the benefits of BCRL in problems requiring curricula for accurate action-value estimation and targeted exploration.", "keywords": "reinforcement learning;curriculum learning;boosting;residual learning", "primary_area": "", "supplementary_material": "", "author": "Pascal Klink;Carlo D'Eramo;Jan Peters;Joni Pajarinen", "authorids": "~Pascal_Klink2;~Carlo_D'Eramo2;~Jan_Peters3;~Joni_Pajarinen2", "gender": "M;M;M;", "homepage": ";https://carloderamo.wixsite.com/home;https://www.jan-peters.net;", "dblp": ";182/8953;p/JanPeters1;23/8355", "google_scholar": "https://scholar.google.de/citations?user=ZjqU_KwAAAAJ;https://scholar.google.it/citations?user=1Rt_86gAAAAJ;https://scholar.google.de/citations?user=-kIVAcAAAAAJ;https://scholar.google.fi/citations?user=-2fJStwAAAAJ", "orcid": ";0000-0003-2712-118X;0000-0002-5266-8091;0000-0003-4469-8191", "linkedin": ";carlo-d-eramo-6438a289/;janrpeters/;", "or_profile": "~Pascal_Klink2;~Carlo_D'Eramo2;~Jan_Peters3;~Joni_Pajarinen2", "aff": "TU Darmstadt;TU Darmstadt;TU Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;Postdoc;Full Professor;Researcher", "bibtex": "@inproceedings{\nklink2022boosted,\ntitle={Boosted Curriculum Reinforcement Learning},\nauthor={Pascal Klink and Carlo D'Eramo and Jan Peters and Joni Pajarinen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=anbBFlX1tJ1}\n}", "github": "", "project": "", "reviewers": "YXe6;N1P2;TFaS;hq3v", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "69;52;157;103", "wc_summary_review": "20;43;52;88", "wc_main_review": "479;326;753;355", "wc_review": "568;421;962;546", "wc_reply_reviewers": "238;260;162;123", "wc_reply_authors": "1456;614;859;823", "reply_reviewers": "2;1;2;2", "reply_authors": "4;2;3;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.25, 40.10221315588455 ], "wc_summary_review_avg": [ 50.75, 24.468091466234142 ], "wc_main_review_avg": [ 478.25, 168.71481114590978 ], "wc_review_avg": [ 624.25, 202.89945169960416 ], "wc_reply_reviewers_avg": [ 195.75, 55.55346523845295 ], "wc_reply_authors_avg": [ 938.0, 313.3552297313705 ], "reply_reviewers_avg": [ 1.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], 
"corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7828839611681580854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=anbBFlX1tJ1", "email": "tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Darmstadt;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "apop1GvnJZb", "title": "Why does Negative Sampling not Work Well? Analysis of Convexity in Negative Sampling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A negative sampling (NS) loss function is widely used in various tasks because we can choose an appropriate noise distribution considering properties for a targeting task. In particular, since the NS loss function does not have a normalization term, it is useful for classification problems with a large number of labels to be considered, such as knowledge graph embedding in terms of computational efficiency. On the other hand, properties of the NS loss function that are considered important for learning, such as the relationship between the noise distribution and the number of negative samples, have not been investigated theoretically. By analyzing the gradient of the NS loss function, we show that the NS loss function is non-convex and has a partial convex domain. We investigated the conditions of noise distribution and the number of samples required for efficient learning under this property. As a result, we found that the NS loss function behaves as a convex loss function when our induced conditions are satisfied and combined with a scoring method that handles only non-negative values, which enables efficient learning. Experimental results in FB15k-237, WN18RR, and YAGO3-10 showed that NS loss satisfying the conditions we proposed can improve the performance of KG completion by utilizing TransE and RotatE, which are non-negative scoring methods.", "keywords": "Knowledge Graph Embedding;KGE;Negative Sampling;Convexity", "primary_area": "", "supplementary_material": "", "author": "Hidetaka Kamigaito;Katsuhiko Hayashi", "authorids": "~Hidetaka_Kamigaito2;~Katsuhiko_Hayashi2", "gender": "M;M", "homepage": "https://sites.google.com/site/hidetakakamigaito/;", "dblp": "124/2384;23/9282", "google_scholar": "https://scholar.google.co.jp/citations?user=cyZpch8AAAAJ;", "orcid": "0000-0002-5249-5813;", "linkedin": ";", "or_profile": "~Hidetaka_Kamigaito2;~Katsuhiko_Hayashi2", "aff": "Tokyo Institute of Technology;Hokkaido University", "aff_domain": "titech.ac.jp;hokudai.ac.jp", "position": "Assistant Professor;Associate Professor", "bibtex": "@misc{\nkamigaito2022why,\ntitle={Why does Negative Sampling not Work Well? 
Analysis of Convexity in Negative Sampling},\nauthor={Hidetaka Kamigaito and Katsuhiko Hayashi},\nyear={2022},\nurl={https://openreview.net/forum?id=apop1GvnJZb}\n}", "github": "", "project": "", "reviewers": "ERwS;fZRf;RhnX;QiVE;EcLo;qLs5;LfBR", "site": "https://openreview.net/forum?id=apop1GvnJZb", "pdf_size": 0, "recommendation": "3;3;3;5;6;6;8", "confidence": "4;2;4;3;3;4;3", "correctness": "2;3;3;3;4;3;4", "technical_novelty": "3;4;2;2;3;3;4", "empirical_novelty": "2;4;2;2;2;3;4", "wc_summary_paper": "72;129;146;60;127;150;47", "wc_summary_review": "52;105;44;60;35;50;27", "wc_main_review": "516;507;312;157;319;206;118", "wc_review": "640;741;502;277;481;406;192", "wc_reply_reviewers": "338;153;147;0;0;0;0", "wc_reply_authors": "742;784;442;448;379;267;56", "reply_reviewers": "1;1;1;0;0;0;0", "reply_authors": "1;1;1;1;1;1;1", "recommendation_avg": [ 4.857142857142857, 1.8070158058105026 ], "confidence_avg": [ 3.2857142857142856, 0.6998542122237652 ], "correctness_avg": [ 3.142857142857143, 0.6388765649999398 ], "technical_novelty_avg": [ 3.0, 0.7559289460184544 ], "empirical_novelty_avg": [ 2.7142857142857144, 0.880630571852711 ], "wc_summary_paper_avg": [ 104.42857142857143, 40.07442056566095 ], "wc_summary_review_avg": [ 53.285714285714285, 23.432055490071626 ], "wc_main_review_avg": [ 305.0, 147.4759060224512 ], "wc_review_avg": [ 462.7142857142857, 177.98211329530062 ], "wc_reply_reviewers_avg": [ 91.14285714285714, 120.1860802154352 ], "wc_reply_authors_avg": [ 445.42857142857144, 236.19232910179494 ], "reply_reviewers_avg": [ 0.42857142857142855, 0.4948716593053935 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.08068715304598781, "corr_recommendation_correctness": 0.7601397897755385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m3-aBLT_er4J:scholar.google.com/&scioq=Why+does+Negative+Sampling+not+Work+Well%3F+Analysis+of+Convexity+in+Negative+Sampling&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Tokyo Institute of Technology;Hokkaido University", "aff_unique_dep": ";", "aff_unique_url": "https://www.titech.ac.jp;https://www.hokudai.ac.jp", "aff_unique_abbr": "Titech;Hokkaido U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Ab-Initio Potential Energy Surfaces by Pairing GNNs with Neural Wave Functions", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7108", "id": "apv504XsysP", "poster": "", "openreview": "https://openreview.net/forum?id=apv504XsysP", "slides": "https://iclr.cc/virtual/2022/poster/7108", "video": "https://iclr.cc/virtual/2022/poster/7108", "author_site": "Nicholas Gao, Stephan G\u00fcnnemann", "tldr": "", "abstract": "Solving the Schr\u00f6dinger equation is key to many quantum mechanical properties. However, an analytical solution is only tractable for single-electron systems. Recently, neural networks succeeded at modelling wave functions of many-electron systems. Together with the variational Monte-Carlo (VMC) framework, this led to solutions on par with the best known classical methods. Still, these neural methods require tremendous amounts of computational resources as one has to train a separate model for each molecular geometry. 
In this work, we combine a Graph Neural Network (GNN) with a neural wave function to simultaneously solve the Schr\u00f6dinger equation for multiple geometries via VMC. This enables us to model continuous subsets of the potential energy surface with a single training pass. Compared to existing state-of-the-art networks, our Potential Energy Surface Network (PESNet) speeds up training for multiple geometries by up to 40 times while matching or surpassing their accuracy. This may open the path to accurate and orders of magnitude cheaper quantum mechanical calculations.", "keywords": "Graph Neural Networks;Computational Physics;Self-Generative Learning;Machine Learning for Science", "primary_area": "", "supplementary_material": "", "author": "Nicholas Gao;Stephan G\u00fcnnemann", "authorids": "~Nicholas_Gao1;~Stephan_G\u00fcnnemann1", "gender": "M;M", "homepage": ";http://www.daml.in.tum.de", "dblp": ";43/3011", "google_scholar": "3GIKgWoAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Nicholas_Gao1;~Stephan_G\u00fcnnemann1", "aff": "Technical University Munich;Technical University Munich", "aff_domain": "tum.de;tum.de", "position": "PhD student;Professor", "bibtex": "@inproceedings{\ngao2022abinitio,\ntitle={Ab-Initio Potential Energy Surfaces by Pairing {GNN}s with Neural Wave Functions},\nauthor={Nicholas Gao and Stephan G{\\\"u}nnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=apv504XsysP}\n}", "github": "", "project": "", "reviewers": "3XzF;bjEg;Zkwk", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;2", "correctness": "3;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;0", "wc_summary_paper": "38;149;84", "wc_summary_review": "31;74;39", "wc_main_review": "424;218;179", "wc_review": "493;441;302", "wc_reply_reviewers": "6;111;58", "wc_reply_authors": "1003;713;353", "reply_reviewers": "1;1;1", "reply_authors": "2;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 90.33333333333333, 45.536310297997964 ], "wc_summary_review_avg": [ 48.0, 18.672618098881223 ], "wc_main_review_avg": [ 273.6666666666667, 107.4874669696684 ], "wc_review_avg": [ 412.0, 80.62671186813131 ], "wc_reply_reviewers_avg": [ 58.333333333333336, 42.866718506967096 ], "wc_reply_authors_avg": [ 689.6666666666666, 265.8738213681236 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16901851478491451308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=apv504XsysP", "email": "tum.de;tum.de", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "aq6mqSkwApo", "title": "Meta-OLE: Meta-learned Orthogonal Low-Rank Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Meta-OLE, a 
new geometry-regularized method for fast adaptation to novel tasks in few-shot image classification. The proposed method learns to adapt for each few-shot classification task a feature space with simultaneous inter-class orthogonality and intra-class low-rankness. Specifically, a deep feature extractor is trained by explicitly imposing orthogonal low-rank subspace structures among features corresponding to different classes within a given task. To adapt to novel tasks with unseen categories, we further meta-learn a light-weight transformation to enhance the inter-class margins. As an additional benefit, this light-weight transformation lets us exploit the query data for label propagation from labeled to unlabeled data without any auxiliary network components. The explicitly geometry-regularized feature subspaces allow the classifiers on novel tasks to be inferred in a closed form, with an adaptive subspace truncation that selectively discards non-discriminative dimensions. We perform experiments on standard few-shot image classification tasks, and observe performance superior to state-of-the-art meta-learning methods. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ze Wang;Yue Lu;Qiang Qiu", "authorids": "~Ze_Wang3;~Yue_Lu1;~Qiang_Qiu1", "gender": "M;M;", "homepage": ";https://lu.seas.harvard.edu;https://web.ics.purdue.edu/~qqiu/", "dblp": ";39/6975;97/360", "google_scholar": "80Jw_w8AAAAJ;wc0FCZUAAAAJ;jdLtt_YAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ze_Wang3;~Yue_Lu1;~Qiang_Qiu1", "aff": "Purdue University;School of Engineering and Applied Sciences, Harvard University;Purdue University", "aff_domain": "purdue.edu;seas.harvard.edu;purdue.edu", "position": "PhD student;Professor;Assistant Professor", "bibtex": "@misc{\nwang2022metaole,\ntitle={Meta-{OLE}: Meta-learned Orthogonal Low-Rank Embedding},\nauthor={Ze Wang and Yue Lu and Qiang Qiu},\nyear={2022},\nurl={https://openreview.net/forum?id=aq6mqSkwApo}\n}", "github": "", "project": "", "reviewers": "jCT7;jSyC;HpAs", "site": "https://openreview.net/forum?id=aq6mqSkwApo", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "82;51;154", "wc_summary_review": "41;20;58", "wc_main_review": "256;270;141", "wc_review": "379;341;353", "wc_reply_reviewers": "50;142;148", "wc_reply_authors": "377;482;559", "reply_reviewers": "1;1;1", "reply_authors": "2;1;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 95.66666666666667, 43.14574782705192 ], "wc_summary_review_avg": [ 39.666666666666664, 15.542057635833022 ], "wc_main_review_avg": [ 222.33333333333334, 57.79465565449994 ], "wc_review_avg": [ 357.6666666666667, 15.860503004493758 ], "wc_reply_reviewers_avg": [ 113.33333333333333, 44.850368510811094 ], "wc_reply_authors_avg": [ 472.6666666666667, 74.59371436134698 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 5, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=3249392109701933093&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Purdue University;Harvard University", "aff_unique_dep": ";School of Engineering and Applied Sciences", "aff_unique_url": "https://www.purdue.edu;https://www.harvard.edu", "aff_unique_abbr": "Purdue;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Graph Neural Network Guided Local Search for the Traveling Salesperson Problem", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6596", "id": "ar92oEosBIg", "poster": "", "openreview": "https://openreview.net/forum?id=ar92oEosBIg", "slides": "https://iclr.cc/virtual/2022/poster/6596", "video": "https://iclr.cc/virtual/2022/poster/6596", "author_site": "Benjamin Hudson, Qingbiao Li, Matthew Malencia, Amanda Prorok", "tldr": "", "abstract": "Solutions to the Traveling Salesperson Problem (TSP) have practical applications to processes in transportation, logistics, and automation, yet must be computed with minimal delay to satisfy the real-time nature of the underlying tasks. However, solving large TSP instances quickly without sacrificing solution quality remains challenging for current approximate algorithms. To close this gap, we present a hybrid data-driven approach for solving the TSP based on Graph Neural Networks (GNNs) and Guided Local Search (GLS). Our model predicts the regret of including each edge of the problem graph in the solution; GLS uses these predictions in conjunction with the original problem graph to find solutions. Our experiments demonstrate that this approach converges to optimal solutions at a faster rate than three recent learning based approaches for the TSP. Notably, we reduce the mean optimality gap on the 100-node problem set from 1.534% to 0.705%, a 2x improvement. 
When generalizing from 20-node instances to the 100-node problem set, we reduce the optimality gap from 18.845% to 2.622%, a 7x improvement.", "keywords": "Traveling Salesman Problem;Graph Neural Network;Metaheuristic;Guided Local Search;Hybrid", "primary_area": "", "supplementary_material": "", "author": "Benjamin Hudson;Qingbiao Li;Matthew Malencia;Amanda Prorok", "authorids": "~Benjamin_Hudson1;~Qingbiao_Li1;~Matthew_Malencia1;~Amanda_Prorok1", "gender": ";M;;", "homepage": ";https://qingbiaoli.github.io/;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;0000-0001-5445-361X;", "linkedin": ";;;", "or_profile": "~Benjamin_Hudson1;~Qingbiao_Li1;~Matthew_Malencia1;~Amanda_Prorok1", "aff": ";University of Cambridge;University of Pennsylvania;", "aff_domain": ";cam.ac.uk;upenn.edu;", "position": ";PhD student;PhD student;", "bibtex": "@inproceedings{\nhudson2022graph,\ntitle={Graph Neural Network Guided Local Search for the Traveling Salesperson Problem},\nauthor={Benjamin Hudson and Qingbiao Li and Matthew Malencia and Amanda Prorok},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ar92oEosBIg}\n}", "github": "", "project": "", "reviewers": "u4N8;un7S;2Y2p;v3Sn;aECP", "pdf_size": 0, "recommendation": "3;3;6;8;8", "confidence": "5;4;5;3;3", "correctness": "2;2;3;3;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;3;2;3;3", "wc_summary_paper": "76;95;181;46;137", "wc_summary_review": "39;75;342;18;136", "wc_main_review": "234;316;1637;188;136", "wc_review": "349;486;2160;252;409", "wc_reply_reviewers": "293;0;1006;0;0", "wc_reply_authors": "1126;491;1804;385;385", "reply_reviewers": "2;0;4;0;0", "reply_authors": "6;2;3;1;1", "recommendation_avg": [ 5.6, 2.244994432064365 ], "confidence_avg": [ 4.0, 0.8944271909999159 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 107.0, 47.33286384743691 ], "wc_summary_review_avg": [ 122.0, 117.05554237198682 ], "wc_main_review_avg": [ 502.2, 570.4729266144011 ], "wc_review_avg": [ 731.2, 718.4865760750162 ], "wc_reply_reviewers_avg": [ 259.8, 389.97558897961807 ], "wc_reply_authors_avg": [ 838.2, 556.2241994016441 ], "reply_reviewers_avg": [ 1.2, 1.5999999999999999 ], "reply_authors_avg": [ 2.6, 1.8547236990991407 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6972166887783963, "corr_recommendation_correctness": 0.9047619047619047, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7438825804269654854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ar92oEosBIg", "email": ";cam.ac.uk;upenn.edu;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Cambridge;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.upenn.edu", "aff_unique_abbr": "Cambridge;UPenn", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "auLXcGlEOZ7", "title": "On Margin Maximization in Linear and ReLU Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The implicit bias of neural networks has been extensively studied in recent years. 
Lyu and Li [2019] showed that in homogeneous networks trained with the exponential or the logistic loss, gradient flow converges to a KKT point of the max margin problem in the parameter space. However, that leaves open the question of whether this point will generally be an actual optimum of the max margin problem. In this paper, we study this question in detail, for several neural network architectures involving linear and ReLU activations. Perhaps surprisingly, we show that in many cases, the KKT point is not even a local optimum of the max margin problem. On the flip side, we identify multiple settings where a local or global optimum can be guaranteed. Finally, we answer a question posed in Lyu and Li [2019] by showing that for non-homogeneous networks, the normalized margin may strictly decrease over time.", "keywords": "Implicit bias;Homogeneous neural networks;Exponential loss;Logistic loss;Maximum margin;Linear networks;ReLU networks", "primary_area": "", "supplementary_material": "", "author": "Gal Vardi;Ohad Shamir;Nathan Srebro", "authorids": "~Gal_Vardi1;~Ohad_Shamir1;~Nathan_Srebro1", "gender": "M;;M", "homepage": "https://sites.google.com/view/galvardi/home;http://www.wisdom.weizmann.ac.il/~shamiro/;http://ttic.uchicago.edu/~nati/", "dblp": "https://dblp.uni-trier.de/pid/167/9638.html;12/5897;50/3633", "google_scholar": "https://scholar.google.co.il/citations?hl=en;all0DHsAAAAJ;https://scholar.google.com.tw/citations?user=ZnT-QpMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Gal_Vardi1;~Ohad_Shamir1;~Nathan_Srebro1", "aff": "Weizmann Institute;Weizmann Institute;University of Chicago", "aff_domain": "weizmann.ac.il;weizmann.ac.il;uchicago.edu", "position": "Postdoc;Associate Professor;Full Professor", "bibtex": "@misc{\nvardi2022on,\ntitle={On Margin Maximization in Linear and Re{LU} Networks},\nauthor={Gal Vardi and Ohad Shamir and Nathan Srebro},\nyear={2022},\nurl={https://openreview.net/forum?id=auLXcGlEOZ7}\n}", "github": "", "project": "", "reviewers": "AttS;frEy;7GJf;sBmy", "site": "https://openreview.net/forum?id=auLXcGlEOZ7", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "2;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "41;152;213;153", "wc_summary_review": "11;22;148;58", "wc_main_review": "83;218;807;308", "wc_review": "135;392;1168;519", "wc_reply_reviewers": "0;0;0;150", "wc_reply_authors": "295;140;376;1022", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 139.75, 62.13443087371124 ], "wc_summary_review_avg": [ 59.75, 53.83481680102571 ], "wc_main_review_avg": [ 354.0, 273.52422196215093 ], "wc_review_avg": [ 553.5, 380.7968618568173 ], "wc_reply_reviewers_avg": [ 37.5, 64.9519052838329 ], "wc_reply_authors_avg": [ 458.25, 336.34533964364664 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10167009318174181439&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;1", "aff_unique_norm": "Weizmann Institute of 
Science;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.weizmann.org.il;https://www.uchicago.edu", "aff_unique_abbr": "Weizmann;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Israel;United States" }, { "title": "Hybrid Memoised Wake-Sleep: Approximate Inference at the Discrete-Continuous Interface", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6869", "id": "auOPcdAcoy", "poster": "", "openreview": "https://openreview.net/forum?id=auOPcdAcoy", "slides": "https://iclr.cc/virtual/2022/poster/6869", "video": "https://iclr.cc/virtual/2022/poster/6869", "author_site": "Tuan Anh Le, Katherine Collins, Luke Hewitt, Kevin Ellis, Siddharth N, Samuel Gershman, Joshua B Tenenbaum", "tldr": "", "abstract": "Modeling complex phenomena typically involves the use of both discrete and continuous variables. Such a setting applies across a wide range of problems, from identifying trends in time-series data to performing effective compositional scene understanding in images. Here, we propose Hybrid Memoised Wake-Sleep (HMWS), an algorithm for effective inference in such hybrid discrete-continuous models. Prior approaches to learning suffer as they need to perform repeated expensive inner-loop discrete inference. We build on a recent approach, Memoised Wake-Sleep (MWS), which alleviates part of the problem by memoising discrete variables, and extend it to allow for a principled and effective way to handle continuous variables by learning a separate recognition model used for importance-sampling based approximate inference and marginalization. We evaluate HMWS in the GP-kernel learning and 3D scene understanding domains, and show that it outperforms current state-of-the-art inference methods.", "keywords": "wake-sleep;variational inference;neuro-symbolic generative models", "primary_area": "", "supplementary_material": "", "author": "Tuan Anh Le;Katherine M. Collins;Luke Hewitt;Kevin Ellis;Siddharth N;Samuel Gershman;Joshua B. Tenenbaum", "authorids": "~Tuan_Anh_Le1;~Katherine_M._Collins1;~Luke_Hewitt1;~Kevin_Ellis1;~Siddharth_N1;~Samuel_Gershman1;~Joshua_B._Tenenbaum1", "gender": "M;F;;M;M;M;", "homepage": "https://www.tuananhle.co.uk;https://collinskatie.github.io/;;https://www.cs.cornell.edu/~ellisk/;https://homepages.inf.ed.ac.uk/snaraya3/;http://gershmanlab.com/;", "dblp": "76/10097-1;284/4959.html;;;67/8366;44/10432;t/JoshuaBTenenbaum", "google_scholar": "https://scholar.google.co.uk/citations?user=tkceMM0AAAAJ;48ZphCEAAAAJ;;L7XI6asAAAAJ;V7D7hxMAAAAJ;0HuMHFwAAAAJ;", "orcid": ";0000-0002-7032-716X;;;0000-0003-4911-7333;;", "linkedin": ";katie-collins-474121175/;;;;;", "or_profile": "~Tuan_Anh_Le1;~Katherine_M._Collins1;~Luke_Hewitt1;~Kevin_Ellis1;~Siddharth_N1;~Samuel_Gershman1;~Joshua_B._Tenenbaum1", "aff": "Massachusetts Institute of Technology;University of Cambridge;;Cornell University;University of Edinburgh;Harvard University;Massachusetts Institute of Technology", "aff_domain": "mit.edu;cam.ac.uk;;cornell.edu;ed.ac.uk;harvard.edu;mit.edu", "position": "Postdoc;PhD student;;Assistant Professor;Reader (Associate Professor);Professor;Professor", "bibtex": "@inproceedings{\nle2022hybrid,\ntitle={Hybrid Memoised Wake-Sleep: Approximate Inference at the Discrete-Continuous Interface},\nauthor={Tuan Anh Le and Katherine M. Collins and Luke Hewitt and Kevin Ellis and Siddharth N and Samuel Gershman and Joshua B. 
Tenenbaum},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=auOPcdAcoy}\n}", "github": "", "project": "", "reviewers": "pAhD;FY4y;G5jQ", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;3", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "53;66;60", "wc_summary_review": "16;41;138", "wc_main_review": "228;510;218", "wc_review": "297;617;416", "wc_reply_reviewers": "30;0;17", "wc_reply_authors": "679;1030;298", "reply_reviewers": "1;0;1", "reply_authors": "1;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 59.666666666666664, 5.312459150169743 ], "wc_summary_review_avg": [ 65.0, 52.61812108643434 ], "wc_main_review_avg": [ 318.6666666666667, 135.35467811970477 ], "wc_review_avg": [ 443.3333333333333, 132.06143351570216 ], "wc_reply_reviewers_avg": [ 15.666666666666666, 12.283683848458853 ], "wc_reply_authors_avg": [ 669.0, 298.92139434975206 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16807529631857721058&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=auOPcdAcoy", "email": "mit.edu;cam.ac.uk;;cornell.edu;ed.ac.uk;harvard.edu;mit.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;4;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of Cambridge;Cornell University;University of Edinburgh;Harvard University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://web.mit.edu;https://www.cam.ac.uk;https://www.cornell.edu;https://www.ed.ac.uk;https://www.harvard.edu", "aff_unique_abbr": "MIT;Cambridge;Cornell;Edinburgh;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Asymmetry Learning for Counterfactually-invariant Classification in OOD Tasks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6946", "id": "avgclFZ221l", "poster": "", "openreview": "https://openreview.net/forum?id=avgclFZ221l", "slides": "https://iclr.cc/virtual/2022/poster/6946", "video": "https://iclr.cc/virtual/2022/poster/6946", "author_site": "S Chandra Mouli, Bruno Ribeiro", "tldr": "", "abstract": "Generalizing from observed to new, related environments (out-of-distribution) is central to the reliability of classifiers. However, most classifiers fail to predict label $Y$ from input $X$ when the change in environment is due to a (stochastic) input transformation $T^\text{te} \circ X'$ not observed in training, as in training we observe $T^\text{tr} \circ X'$, where $X'$ is a hidden variable.
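This failure mode is easy to reproduce in a toy setting. Below is a minimal sketch of the setup just described (my own illustration, not the authors' method; it assumes numpy and scikit-learn are available): a linear classifier is fit under a training-time symmetry transformation (the identity rotation) of the hidden features $X'$ and then evaluated under an unseen test-time rotation, where its accuracy drops to chance.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X_hidden = rng.normal(size=(2000, 2))        # hidden variable X'
y = (X_hidden[:, 0] > 0).astype(int)         # label depends only on X'

def rotate(X, theta):
    """The symmetry transformation T: rotation by angle theta."""
    R = np.array([[np.cos(theta), -np.sin(theta)],
                  [np.sin(theta),  np.cos(theta)]])
    return X @ R.T

clf = LogisticRegression().fit(rotate(X_hidden, 0.0), y)      # train-time T = identity
print("ID acc :", clf.score(rotate(X_hidden, 0.0), y))        # ~1.0
print("OOD acc:", clf.score(rotate(X_hidden, np.pi / 2), y))  # ~0.5, chance level
```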
This work argues that when the transformations in train $T^\text{tr}$ and test $T^\text{te}$ are (arbitrary) symmetry transformations induced by a collection of $m$ known equivalence relations, the task of finding a robust OOD classifier can be defined as finding the simplest causal model that establishes a causal connection between the target labels and the symmetry transformations that are associated with label changes. We then propose a new learning paradigm, asymmetry learning, that identifies which symmetries the classifier must break in order to correctly predict $Y$ in both train and test. Asymmetry learning performs a causal model search that, under certain identifiability conditions, finds classifiers that perform equally well in-distribution and out-of-distribution. Finally, we show how to learn counterfactually-invariant representations with asymmetry learning in two physics tasks.", "keywords": "out-of-distribution classification;symmetries;counterfactual invariances;geometric deep learning", "primary_area": "", "supplementary_material": "", "author": "S Chandra Mouli;Bruno Ribeiro", "authorids": "~S_Chandra_Mouli1;~Bruno_Ribeiro1", "gender": "M;M", "homepage": "https://www.cs.purdue.edu/homes/chandr/;https://www.cs.purdue.edu/homes/ribeirob/", "dblp": "167/6021;15/606", "google_scholar": "https://scholar.google.com/citations?hl=en;KIEleCsAAAAJ", "orcid": ";0000-0002-3527-6192", "linkedin": ";", "or_profile": "~S_Chandra_Mouli1;~Bruno_Ribeiro1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nmouli2022asymmetry,\ntitle={Asymmetry Learning for Counterfactually-invariant Classification in {OOD} Tasks},\nauthor={S Chandra Mouli and Bruno Ribeiro},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=avgclFZ221l}\n}", "github": "", "project": "", "reviewers": "XTXH;RsMV;GzS3", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "2;3;4", "correctness": "4;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "140;106;206", "wc_summary_review": "7;48;29", "wc_main_review": "194;300;411", "wc_review": "341;454;646", "wc_reply_reviewers": "0;11;0", "wc_reply_authors": "900;720;1433", "reply_reviewers": "0;1;0", "reply_authors": "3;2;3", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 150.66666666666666, 41.51572660517404 ], "wc_summary_review_avg": [ 28.0, 16.753109164172084 ], "wc_main_review_avg": [ 301.6666666666667, 88.59771755274261 ], "wc_review_avg": [ 480.3333333333333, 125.9003132994072 ], "wc_reply_reviewers_avg": [ 3.6666666666666665, 5.185449728701348 ], "wc_reply_authors_avg": [ 1017.6666666666666, 302.73897814600764 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11174304914119710081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=avgclFZ221l", "email": "purdue.edu;purdue.edu", "author_num": 2,
"aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "b-VKxdc5cY", "title": "Distribution Matching in Deep Generative Models with Kernel Transfer Operators", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative models which use explicit density modeling (e.g., variational autoencoders, flow-based generative models) involve finding a mapping from a known distribution, e.g. Gaussian, to the unknown input distribution. This often requires searching over a class of non-linear functions (e.g., representable by a deep neural network). While effective in practice, the associated runtime/memory costs can increase rapidly, usually as a function of the performance desired in an application. We propose a substantially cheaper (and simpler) distribution matching strategy based on adapting known results on kernel transfer operators. We show that our formulation enables highly efficient distribution approximation and sampling, and offers surprisingly good empirical performance that compares favorably with powerful baselines, but with significant runtime savings. We show that the algorithm also performs well in small sample size settings (in brain imaging). ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/b94916f5514703967a217c73b5e04493b9f9b4ec.zip", "author": "Zhichun Huang;Rudrasis Chakraborty;Vikas Singh", "authorids": "~Zhichun_Huang1;~Rudrasis_Chakraborty1;~Vikas_Singh1", "gender": "M;M;M", "homepage": ";;http://vsingh-www.cs.wisc.edu/", "dblp": "247/6016.html;http://dblp.uni-trier.de/pers/hd/c/Chakraborty:Rudrasis;", "google_scholar": "qaI1g_MAAAAJ;TB2Z8sgAAAAJ;d32BmwcAAAAJ", "orcid": ";;", "linkedin": "zhichun-huang-59563a132/;;", "or_profile": "~Zhichun_Huang1;~Rudrasis_Chakraborty1;~Vikas_Singh1", "aff": "Carnegie Mellon University;Lawrence Livermore National Labs;University of Wisconsin, Madison", "aff_domain": "andrew.cmu.edu;llnl.gov;wisc.edu", "position": "MS student;Researcher;Professor", "bibtex": "@misc{\nhuang2022distribution,\ntitle={Distribution Matching in Deep Generative Models with Kernel Transfer Operators},\nauthor={Zhichun Huang and Rudrasis Chakraborty and Vikas Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=b-VKxdc5cY}\n}", "github": "", "project": "", "reviewers": "wZ8o;mptM;UugV;hWmF", "site": "https://openreview.net/forum?id=b-VKxdc5cY", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;3;3", "correctness": "3;4;3;4", "technical_novelty": "3;2;2;4", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "51;119;55;180", "wc_summary_review": "24;127;37;35", "wc_main_review": "763;352;164;702", "wc_review": "838;598;256;917", "wc_reply_reviewers": "287;52;42;198", "wc_reply_authors": "1830;855;517;1247", "reply_reviewers": "1;1;1;1", "reply_authors": "4;2;1;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.25, 52.869532814277825 ], "wc_summary_review_avg": [ 55.75, 41.43292772662825 ], "wc_main_review_avg": [ 495.25, 247.3270860621618 ], "wc_review_avg": [ 652.25, 257.1734580006265 ], "wc_reply_reviewers_avg": [ 144.75, 102.75060827070563 ], 
"wc_reply_authors_avg": [ 1112.25, 488.31924752153685 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aj22Gd5Akh8J:scholar.google.com/&scioq=Distribution+Matching+in+Deep+Generative+Models+with+Kernel+Transfer+Operators&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Carnegie Mellon University;Lawrence Livermore National Laboratory;University of Wisconsin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.llnl.gov;https://www.wisc.edu", "aff_unique_abbr": "CMU;LLNL;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "b-ZaBVGx8Q", "title": "DP-REC: Private & Communication-Efficient Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Privacy and communication efficiency are important challenges in federated training of neural networks, and combining them is still an open problem. In this work, we develop a method that unifies highly compressed communication and differential privacy (DP). We introduce a compression technique based on Relative Entropy Coding (REC) to the federated setting. With a minor modification to REC, we obtain a provably differentially private learning algorithm, DP-REC, and show how to compute its privacy guarantees. Our experiments demonstrate that DP-REC drastically reduces communication costs while providing privacy guarantees comparable to the state-of-the-art.", "keywords": "Federated learning;differential privacy;compression;communication efficiency", "primary_area": "", "supplementary_material": "", "author": "Aleksei Triastcyn;Matthias Reisser;Christos Louizos", "authorids": "~Aleksei_Triastcyn1;~Matthias_Reisser1;~Christos_Louizos1", "gender": ";M;", "homepage": ";http://matthiasreisser.github.io/;", "dblp": ";228/6851;", "google_scholar": "https://scholar.google.ch/citations?user=BCWx7iQAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Aleksei_Triastcyn1;~Matthias_Reisser1;~Christos_Louizos1", "aff": "Qualcomm Inc, QualComm;Qualcomm Inc, QualComm;", "aff_domain": "qti.qualcomm.com;qti.qualcomm.com;", "position": "Researcher;Senior Engineer;", "bibtex": "@misc{\ntriastcyn2022dprec,\ntitle={{DP}-{REC}: Private \\& Communication-Efficient Federated Learning},\nauthor={Aleksei Triastcyn and Matthias Reisser and Christos Louizos},\nyear={2022},\nurl={https://openreview.net/forum?id=b-ZaBVGx8Q}\n}", "github": "", "project": "", "reviewers": "qyw3;5TiV;Kc1E;Uy67", "site": "https://openreview.net/forum?id=b-ZaBVGx8Q", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;3;4", "correctness": "4;3;4;4", "technical_novelty": "3;4;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "53;212;49;172", "wc_summary_review": "88;25;10;44", "wc_main_review": "498;190;157;293", "wc_review": "639;427;216;509", "wc_reply_reviewers": "0;205;22;16", "wc_reply_authors": "943;1469;249;325", "reply_reviewers": "0;2;1;1", "reply_authors": "3;3;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 121.5, 
71.91835648845154 ], "wc_summary_review_avg": [ 41.75, 29.294837429144405 ], "wc_main_review_avg": [ 284.5, 133.079863240086 ], "wc_review_avg": [ 447.75, 153.67721854588598 ], "wc_reply_reviewers_avg": [ 60.75, 83.67011115087634 ], "wc_reply_authors_avg": [ 746.5, 496.43403388567145 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14531387670217241531&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Qualcomm Incorporated", "aff_unique_dep": "", "aff_unique_url": "https://www.qualcomm.com", "aff_unique_abbr": "Qualcomm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Bootstrapped Meta-Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6252", "id": "b-ny3x071E5", "poster": "", "openreview": "https://openreview.net/forum?id=b-ny3x071E5", "slides": "https://iclr.cc/virtual/2022/poster/6252", "video": "https://iclr.cc/virtual/2022/poster/6252", "author_site": "Sebastian Flennerhag, Yannick Schroecker, Tom Zahavy, Hado van Hasselt, David Silver, Satinder Singh", "tldr": "", "abstract": "Meta-learning empowers artificial intelligence to increase its efficiency by learning how to learn. Unlocking this potential involves overcoming a challenging meta-optimisation problem. We propose an algorithm that tackles this problem by letting the meta-learner teach itself. The algorithm first bootstraps a target from the meta-learner, then optimises the meta-learner by minimising the distance to that target under a chosen (pseudo-)metric. Focusing on meta-learning with gradients, we establish conditions that guarantee performance improvements and show that the metric can be used to control meta-optimisation. Meanwhile, the bootstrapping mechanism can extend the effective meta-learning horizon without requiring backpropagation through all updates. We achieve a new state of the art for model-free agents on the Atari ALE benchmark and demonstrate that it yields both performance and efficiency gains in multi-task meta-learning.
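A minimal numeric sketch of the bootstrapping idea, under strong simplifying assumptions of my own (a 1-D quadratic inner loss and a single meta-parameter, the step size): the K-step iterate is matched to a target built from L further updates that are treated as a constant, so no gradient flows through the extended horizon.

```python
# Toy sketch of bootstrapped meta-learning (assumed setup: inner loss
# f(w) = 0.5 * w^2, so one SGD step is w <- (1 - lr) * w).
K, L = 5, 20                 # differentiable steps vs. bootstrap horizon
meta_lr, lr, w0 = 0.01, 0.1, 1.0

for _ in range(200):
    w_K = (1.0 - lr) ** K * w0          # iterate after K inner steps
    target = (1.0 - lr) ** L * w_K      # L extra steps; treated as a constant
    # Meta-gradient of (w_K - target)^2 w.r.t. lr, with the target detached:
    dwK_dlr = -K * (1.0 - lr) ** (K - 1) * w0
    lr -= meta_lr * 2.0 * (w_K - target) * dwK_dlr
    lr = min(max(lr, 0.0), 1.0)         # keep the step size in range

print("meta-learned step size:", lr)    # grows toward faster convergence
```

Because the target is produced by running the current meta-learner forward and then detached, the meta-gradient only traverses the K differentiable steps, which is what lets the effective horizon grow without backpropagating through all updates.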
Finally, we explore how bootstrapping opens up new possibilities and find that it can meta-learn efficient exploration in an epsilon-greedy Q-learning agent - without backpropagating through the update rule.", "keywords": "meta-learning;meta-gradients;meta-reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Sebastian Flennerhag;Yannick Schroecker;Tom Zahavy;Hado van Hasselt;David Silver;Satinder Singh", "authorids": "~Sebastian_Flennerhag1;~Yannick_Schroecker1;~Tom_Zahavy2;~Hado_van_Hasselt1;~David_Silver1;~Satinder_Singh2", "gender": ";M;M;M;;", "homepage": "http://flennerhag.com;;http://tomzahavy.wixsite.com/zahavy;http://hadovanhasselt.com;;", "dblp": "https://dblp.uni-trier.de/pers/hd/f/Flennerhag:Sebastian;180/1434;149/0142;https://dblp.uni-trier.de/pers/h/Hasselt:Hado_van.html;34/3601;", "google_scholar": "https://scholar.google.co.uk/citations?user=SeMQQkcAAAAJ;dNqsv5MAAAAJ;https://scholar.google.co.il/citations?user=9dXN6cMAAAAJ;;;", "orcid": "0000-0003-2354-4193;;;;;", "linkedin": "https://linkedin.com/in/flennerhag;;tomzahavy/;;;", "or_profile": "~Sebastian_Flennerhag1;~Yannick_Schroecker1;~Tom_Zahavy2;~Hado_van_Hasselt1;~David_Silver1;~Satinder_Baveja2", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind", "aff_domain": "google.com;google.com;deepmind.com;google.com;deepmind.com;google.com", "position": "Research Scientist;Research Scientist;Research Scientist;Research scientist;Full Professor;Research Scientist", "bibtex": "@inproceedings{\nflennerhag2022bootstrapped,\ntitle={Bootstrapped Meta-Learning},\nauthor={Sebastian Flennerhag and Yannick Schroecker and Tom Zahavy and Hado van Hasselt and David Silver and Satinder Singh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=b-ny3x071E5}\n}", "github": "", "project": "", "reviewers": "SXZx;gG1V;N6Le;u9p2", "pdf_size": 0, "recommendation": "8;8;10;10", "confidence": "3;3;5;4", "correctness": "4;4;4;4", "technical_novelty": "4;4;3;4", "empirical_novelty": "4;4;3;4", "wc_summary_paper": "294;97;261;89", "wc_summary_review": "143;1;91;24", "wc_main_review": "719;265;241;496", "wc_review": "1156;363;593;609", "wc_reply_reviewers": "0;0;236;0", "wc_reply_authors": "261;385;691;368", "reply_reviewers": "0;0;4;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 9.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 185.25, 93.02788560426384 ], "wc_summary_review_avg": [ 64.75, 55.98381462530041 ], "wc_main_review_avg": [ 430.25, 194.17952389477114 ], "wc_review_avg": [ 680.25, 291.4081115892281 ], "wc_reply_reviewers_avg": [ 59.0, 102.19099764656376 ], "wc_reply_authors_avg": [ 426.25, 160.07400632207592 ], "reply_reviewers_avg": [ 1.0, 1.7320508075688772 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9045340337332909, "corr_recommendation_correctness": 0.0, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12149908180561162592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=b-ny3x071E5", "email": "google.com;google.com;deepmind.com;google.com;deepmind.com;google.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": 
"Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "b30Yre8MzuN", "title": "NeuroSED: Learning Subgraph Similarity via Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Subgraph similarity search is a fundamental operator in graph analysis. In this framework, given a query graph and a graph database, the goal is to identify subgraphs of the database graphs that are structurally similar to the query. Subgraph edit distance (SED) is one of the most expressive measures of subgraph similarity. In this work, we study the problem of learning SED from a training set of graph pairs and their SED values. Towards that end, we design a novel siamese graph neural network called NeuroSED, which learns an embedding space with a rich structure reminiscent of SED. With the help of a specially crafted inductive bias, NeuroSED not only enables high accuracy but also ensures that the predicted SED, like true SED, satisfies triangle inequality. The design is generic enough to also model graph edit distance (GED), while ensuring that the predicted GED space is metric, like the true GED space. Extensive experiments on real graph datasets, for both SED and GED, establish that NeuroSED achieves $\\approx 2$ times lower RMSE than the state of the art and is $\\approx 18$ times faster than the fastest baseline. Further, owing to its pair-independent embeddings and theoretical properties, NeuroSED allows orders-of-magnitude faster graph/subgraph retrieval.", "keywords": "Subgraph similarity;graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Rishabh Ranjan;Siddharth Grover;Sourav Medya;Venkatesan Chakaravarthy;Yogish Sabharwal;Sayan Ranu", "authorids": "~Rishabh_Ranjan1;~Siddharth_Grover1;~Sourav_Medya1;~Venkatesan_Chakaravarthy1;~Yogish_Sabharwal1;~Sayan_Ranu2", "gender": "M;M;M;M;M;M", "homepage": "https://rishabh-ranjan.github.io;;https://souravmedya.github.io/;https://dblp.org/pid/c/VTChakaravarthy.html;https://www.cse.iitd.ac.in/~yogish;https://www.cse.iitd.ac.in/~sayan/index.html", "dblp": ";;178/3021;;57/3685.html;38/768", "google_scholar": "NNzQUrcAAAAJ;;RCFhOM4AAAAJ;https://scholar.google.co.in/citations?user=_3I7KHAAAAAJ;https://scholar.google.co.in/citations?user=vkw-hvEAAAAJ;K4w5qYUAAAAJ", "orcid": ";;0000-0003-0996-2807;;;0000-0003-4147-9372", "linkedin": ";siddharth-grover-173853184;sourav-medya-35987a49/;;;", "or_profile": "~Rishabh_Ranjan1;~Siddharth_Grover1;~Sourav_Medya1;~Venkatesan_Chakaravarthy1;~Yogish_Sabharwal1;~Sayan_Ranu2", "aff": "Indian Institute of Technology Delhi, Dhirubhai Ambani Institute Of Information and Communication Technology;Indian Institute of Technology Delhi;Northwestern University;;;Indian Institute of Technology Delhi", "aff_domain": "iitd.ac.in;iitd.ac.in;northwestern.edu;;;iitd.ac.in", "position": "Undergrad student;Undergrad student;Postdoc;;;Associate Professor", "bibtex": "@misc{\nranjan2022neurosed,\ntitle={Neuro{SED}: Learning Subgraph Similarity via Graph Neural Networks},\nauthor={Rishabh Ranjan and Siddharth Grover and Sourav Medya and Venkatesan Chakaravarthy and Yogish Sabharwal and Sayan Ranu},\nyear={2022},\nurl={https://openreview.net/forum?id=b30Yre8MzuN}\n}", "github": "", "project": "", "reviewers": "Y8L2;L1VP;gqTK;ECCJ", "site": 
"https://openreview.net/forum?id=b30Yre8MzuN", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "82;98;27;108", "wc_summary_review": "46;33;46;44", "wc_main_review": "523;325;240;179", "wc_review": "651;456;313;331", "wc_reply_reviewers": "0;0;0;62", "wc_reply_authors": "1772;1439;517;496", "reply_reviewers": "0;0;0;1", "reply_authors": "3;3;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 78.75, 31.283981524096323 ], "wc_summary_review_avg": [ 42.25, 5.402545696243577 ], "wc_main_review_avg": [ 316.75, 129.8775865959943 ], "wc_review_avg": [ 437.75, 134.8765639390328 ], "wc_reply_reviewers_avg": [ 15.5, 26.846787517317598 ], "wc_reply_authors_avg": [ 1056.0, 562.0200174371016 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yLKtUUInyJoJ:scholar.google.com/&scioq=NeuroSED:+Learning+Subgraph+Similarity+via+Graph+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Indian Institute of Technology Delhi;Northwestern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitd.ac.in;https://www.northwestern.edu", "aff_unique_abbr": "IIT Delhi;NU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Delhi;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "India;United States" }, { "id": "b4jq1xzirPS", "title": "An Attention-LSTM Hybrid Model for the Coordinated Routing of Multiple Vehicles", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Reinforcement learning has recently shown promise in learning quality solutions in a number of combinatorial optimization problems. In particular, the attention-based encoder-decoder models show high effectiveness on various routing problems, including the Traveling Salesman Problem (TSP). Unfortunately, they perform poorly for the TSP with Drones (TSP-D), requiring routing a heterogeneous fleet of vehicles in coordination. In TSP-D, two different types of vehicles are moving in tandem and may need to wait at a node for the other vehicle to join. State-less attention-based decoder fails to make such coordination between vehicles. We propose an attention encoder-LSTM decoder hybrid model, in which the decoder's hidden state can represent the sequence of actions made. We empirically demonstrate that such a hybrid model improves upon a purely attention-based model for both solution quality and computational efficiency. 
Our experiments on the min-max Capacitated Vehicle Routing Problem (mmCVRP) also confirm that the hybrid model is more suitable for coordinated routing of multiple vehicles than the attention-based model.", "keywords": "Reinforcement Learning;Combinatorial Optimization;Traveling Salesman Problem with Drones;Vehicle Routing Problem", "primary_area": "", "supplementary_material": "", "author": "Aigerim Bogyrbayeva;Taehyun Yoon;Hanbum Ko;Sungbin Lim;Hyokun Yun;Changhyun Kwon", "authorids": "~Aigerim_Bogyrbayeva1;~Taehyun_Yoon1;~Hanbum_Ko1;~Sungbin_Lim1;~Hyokun_Yun1;~Changhyun_Kwon1", "gender": "F;;M;M;M;M", "homepage": ";https://thyoon.com;https://hanbumko.github.io/about;https://www.sungbin-lim.net;http://bikestra.github.io/;https://www.chkwon.net", "dblp": "276/0439;63/4723;309/8650;206/6907;45/9671;49/34-1", "google_scholar": "Du9nwvYAAAAJ;aVfQYSsAAAAJ;;https://scholar.google.com/citations?hl=ko;W4oOmZEAAAAJ;HFiBSkgAAAAJ", "orcid": ";;;0000-0003-2684-2022;;0000-0001-8455-6396", "linkedin": ";th-yoon/;;sungbin-lim-43b739b5/;hyokun-yun-b4439b7/;chkwon/", "or_profile": "~Aigerim_Bogyrbayeva1;~Taehyun_Yoon1;~Hanbum_Ko1;~Sungbin_Lim1;~Hyokun_Yun1;~Changhyun_Kwon1", "aff": "Suleyman Demirel University;Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Amazon;Korea Advanced Institute of Science & Technology", "aff_domain": "sdu.edu.kz;unist.ac.kr;unist.ac.kr;unist.ac.kr;amazon.com;kaist.ac.kr", "position": "Assistant Professor;MS student;MS student;Assistant Professor;Machine Learning Scientist;Visiting Professor", "bibtex": "@misc{\nbogyrbayeva2022an,\ntitle={An Attention-{LSTM} Hybrid Model for the Coordinated Routing of Multiple Vehicles},\nauthor={Aigerim Bogyrbayeva and Taehyun Yoon and Hanbum Ko and Sungbin Lim and Hyokun Yun and Changhyun Kwon},\nyear={2022},\nurl={https://openreview.net/forum?id=b4jq1xzirPS}\n}", "github": "", "project": "", "reviewers": "BaqZ;6ehT;Sr9R;2UiK", "site": "https://openreview.net/forum?id=b4jq1xzirPS", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "53;58;32;71", "wc_summary_review": "21;91;4;115", "wc_main_review": "578;306;84;459", "wc_review": "652;455;120;645", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 53.5, 14.044571905188139 ], "wc_summary_review_avg": [ 57.75, 46.42938186105863 ], "wc_main_review_avg": [ 356.75, 184.64476028309062 ], "wc_review_avg": [ 468.0, 215.90391381352956 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vR1gooG40-EJ:scholar.google.com/&scioq=An+Attention-LSTM+Hybrid+Model+for+the+Coordinated+Routing+of+Multiple+Vehicles&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;2;3", "aff_unique_norm": "Suleyman Demirel University;Ulsan 
National Institute of Science and Technology;Amazon;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";;Amazon.com, Inc.;", "aff_unique_url": "https://www.duaz.kz;https://www.unist.ac.kr;https://www.amazon.com;https://www.kaist.ac.kr", "aff_unique_abbr": "SDU;UNIST;Amazon;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;2;1", "aff_country_unique": "Kazakhstan;South Korea;United States" }, { "id": "b8mo34uDObn", "title": "Ensembles and Cocktails: Robust Finetuning for Natural Language Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "When finetuning a pretrained language model for natural language generation tasks, one is currently faced with a tradeoff. Lightweight finetuning (e.g., prefix-tuning, adapters), which freezes all or most of the parameters of the pretrained model, has been shown to achieve stronger out-of-distribution (OOD) performance than full finetuning, which tunes all of the parameters. However, lightweight finetuning can underperform full finetuning in-distribution (ID). In this work, we present methods to combine the benefits of full and lightweight finetuning, achieving strong performance both ID and OOD. First, we show that an ensemble of the lightweight and full finetuning models achieves the best of both worlds: performance matching the better of full and lightweight finetuning, both ID and OOD. Second, we show that we can achieve similar improvements using a single model instead of two with our proposed cocktail finetuning, which augments full finetuning via distillation from a lightweight model. Finally, we provide some explanatory theory in a multiclass logistic regression setting with a large number of classes, describing how distillation on ID data can transfer the OOD behavior of one model to another.", "keywords": "finetuning;pretrained language models;natural language generation;robustness;prefix-tuning", "primary_area": "", "supplementary_material": "", "author": "John Hewitt;Xiang Lisa Li;Sang Michael Xie;Benjamin Newman;Percy Liang", "authorids": "~John_Hewitt1;~Xiang_Lisa_Li1;~Sang_Michael_Xie1;~Benjamin_Newman1;~Percy_Liang1", "gender": "M;F;;;", "homepage": "https://nlp.stanford.edu/~johnhew/;https://xiangli1999.github.io;https://cs.stanford.edu/~eix/;http://blnewman.com;https://cs.stanford.edu/~pliang/", "dblp": "205/9025;40/1491-63;220/3987;126/5109;04/1701", "google_scholar": "7C27kVMAAAAJ;nzA4P0oAAAAJ;EBNa5IEAAAAJ;QehvrDoAAAAJ;pouyVyUAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~John_Hewitt1;~Xiang_Lisa_Li1;~Sang_Michael_Xie1;~Benjamin_Newman1;~Percy_Liang1", "aff": "Google DeepMind;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "deepmind.com;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "Research Scientist Intern;PhD student;PhD student;MS student;Associate Professor", "bibtex": "@misc{\nhewitt2022ensembles,\ntitle={Ensembles and Cocktails: Robust Finetuning for Natural Language Generation},\nauthor={John Hewitt and Xiang Lisa Li and Sang Michael Xie and Benjamin Newman and Percy Liang},\nyear={2022},\nurl={https://openreview.net/forum?id=b8mo34uDObn}\n}", "github": "", "project": "", "reviewers": "K9aD;rP9V;ioou;Sg93", "site": "https://openreview.net/forum?id=b8mo34uDObn", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;4", "correctness": "2;3;2;3", "technical_novelty": "1;3;4;3", "empirical_novelty": "0;3;2;3", "wc_summary_paper": "92;62;102;75", 
"wc_summary_review": "48;106;43;39", "wc_main_review": "358;276;402;152", "wc_review": "498;444;547;266", "wc_reply_reviewers": "317;0;255;0", "wc_reply_authors": "1193;146;268;490", "reply_reviewers": "1;0;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 82.75, 15.384651442265437 ], "wc_summary_review_avg": [ 59.0, 27.3221521846285 ], "wc_main_review_avg": [ 297.0, 95.14725429564429 ], "wc_review_avg": [ 438.75, 106.18233139275102 ], "wc_reply_reviewers_avg": [ 143.0, 144.67031485415382 ], "wc_reply_authors_avg": [ 524.25, 405.3198582601153 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15964745744916448392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Google;Stanford University", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.stanford.edu", "aff_unique_abbr": "DeepMind;Stanford", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "bB6YLDJewoK", "title": "Simpler Calibration for Survival Analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Survival analysis, also known as time-to-event analysis, is the problem to predict the distribution of the time of the occurrence of an event. This problem has applications in various fields such as healthcare, security, and finance. While there have been many neural network models proposed for survival analysis, none of them are calibrated. This means that the average of the predicted distribution is different from the actual distribution in the dataset. Therefore, X-CAL has recently been proposed for the calibration, which is supposed to be used as a regularization term in the loss function of a neural network. X-CAL is formulated on the basis of the widely used definition of calibration for distribution regression. 
In this work, we propose new calibration definitions for distribution regression and survival analysis, and demonstrate a simpler alternative to X-CAL based on the new calibration definition for survival analysis.\n", "keywords": "survival analysis;time-to-event analysis;calibration", "primary_area": "", "supplementary_material": "", "author": "Hiroki Yanagisawa;Toshiya Iwamori;Akira Koseki;Michiharu Kudo;Mohamed Ghalwash;Prithwish Chakraborty", "authorids": "~Hiroki_Yanagisawa1;~Toshiya_Iwamori1;~Akira_Koseki1;~Michiharu_Kudo1;~Mohamed_Ghalwash1;~Prithwish_Chakraborty1", "gender": "M;M;M;M;M;", "homepage": ";;;https://researcher.watson.ibm.com/researcher/view.php?person=jp-KUDO;https://m-fakhry.github.io/;https://prithwi.github.io/", "dblp": "79/687;244/7862.html;86/2676;58/3839;123/5916;25/8076", "google_scholar": ";;;https://scholar.google.com/citations?hl=ja;EXFJTaUAAAAJ;xKSM8cQAAAAJ", "orcid": ";;;0000-0003-1575-6305;;", "linkedin": ";;ak-110061175/;michiharu-kudo-26576512/;;", "or_profile": "~Hiroki_Yanagisawa1;~Toshiya_Iwamori1;~Akira_Koseki1;~Michiharu_Kudo1;~Mohamed_Ghalwash1;~Prithwish_Chakraborty1", "aff": "International Business Machines;International Business Machines;International Business Machines;;International Business Machines;International Business Machines", "aff_domain": "ibm.com;ibm.com;ibm.com;;ibm.com;ibm.com", "position": "Researcher;Researcher;Researcher;;Research Scientist;Research Staff Member", "bibtex": "@misc{\nyanagisawa2022simpler,\ntitle={Simpler Calibration for Survival Analysis},\nauthor={Hiroki Yanagisawa and Toshiya Iwamori and Akira Koseki and Michiharu Kudo and Mohamed Ghalwash and Prithwish Chakraborty},\nyear={2022},\nurl={https://openreview.net/forum?id=bB6YLDJewoK}\n}", "github": "", "project": "", "reviewers": "sM7d;xdbC;KZxz;MTaz", "site": "https://openreview.net/forum?id=bB6YLDJewoK", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;3", "correctness": "3;3;4;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "42;68;62;92", "wc_summary_review": "42;63;52;61", "wc_main_review": "447;609;592;405", "wc_review": "531;740;706;558", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.0, 17.832554500127006 ], "wc_summary_review_avg": [ 54.5, 8.32165848854662 ], "wc_main_review_avg": [ 513.25, 88.70844097378783 ], "wc_review_avg": [ 633.75, 90.5604080158653 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MLDzmgK4UfsJ:scholar.google.com/&scioq=Simpler+Calibration+for+Survival+Analysis&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "International Business Machines Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": 
"bBrmOMYVrh", "title": "LiST: Lite Self-training Makes Efficient Few-shot Learners", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a new method LiST for efficient fine-tuning of large pre-trained language models (PLMs) in few-shot learning settings. LiST significantly improves over recent methods that adopt prompt-tuning using two key techniques. The first one is the use of self-training to leverage large amounts of unlabeled data for prompt tuning to significantly boost the model performance in few-shot settings. We use self-training in conjunction with meta-learning for re-weighting noisy pseudo-prompt labels. However, traditional self-training is also quite expensive as it requires updating all the model parameters repetitively. Therefore, we introduce a second technique for light-weight fine-tuning where we only update a small number of the model parameters. To this end, we introduce a small number of task-specific adapter parameters that are tuned during self-training while keeping the PLM encoder frozen. This also significantly reduces the overall model footprint across several tasks that can now share a common PLM encoder as backbone for inference. Combining the above techniques, LiST not only improves the model performance for few-shot learning on target domains but also reduces the model memory footprint. We present a comprehensive study on six NLU tasks to validate the effectiveness of LiST . The results show that LiST improves by 35% over classic fine-tuning and 6% over prompt-tuning with 96% reduction in the number of trainable parameters when fine-tuned with only 30 labeled examples from the target domain.", "keywords": "Prompt fine-tuning;Semi-supervised Learning;Few-shot;NLP", "primary_area": "", "supplementary_material": "/attachment/355545fb8cd13076605ca59ae25c83a831bf8d3e.zip", "author": "Yaqing Wang;Subhabrata Mukherjee;Xiaodong Liu;Jing Gao;Ahmed Hassan Awadallah;Jianfeng Gao", "authorids": "~Yaqing_Wang1;~Subhabrata_Mukherjee2;~Xiaodong_Liu1;~Jing_Gao1;~Ahmed_Hassan_Awadallah1;~Jianfeng_Gao1", "gender": "M;;;M;M;F", "homepage": "https://yaqingwang.github.io/;https://subhomukherjee.com/;;https://www.microsoft.com/en-us/research/people/hassanam/publications/;https://www.microsoft.com/en-us/research/people/jfgao/;https://engineering.purdue.edu/~jinggao/", "dblp": "147/1393;37/11030.html;65/622;147/9148;92/5339;67/4834-4", "google_scholar": "_Rfg2CAAAAAJ;T4iBN5cAAAAJ;NIewcxMAAAAJ;sNGk-9MAAAAJ;https://scholar.google.com/citations?hl=en;Ftj1h4cAAAAJ", "orcid": ";;;;;", "linkedin": ";subho87;;ahmed-hassan-awadallah-a355a27/;;", "or_profile": "~Yaqing_Wang1;~Subhabrata_Mukherjee2;~Xiaodong_Liu1;~Ahmed_Hassan_Awadallah1;~Jianfeng_Gao1;~Jing_Gao2", "aff": "Purdue University;Microsoft;Microsoft Research;Microsoft Research;Microsoft Research;Purdue University", "aff_domain": "purdue.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;purdue.edu", "position": "PhD student;Principal Researcher;Researcher;Principal Researcher;Principal Researcher;Associate Professor", "bibtex": "@misc{\nwang2022list,\ntitle={Li{ST}: Lite Self-training Makes Efficient Few-shot Learners},\nauthor={Yaqing Wang and Subhabrata Mukherjee and Xiaodong Liu and Jing Gao and Ahmed Hassan Awadallah and Jianfeng Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=bBrmOMYVrh}\n}", "github": "", "project": "", "reviewers": "xbnK;QNKo;whFs;pyKD", "site": "https://openreview.net/forum?id=bBrmOMYVrh", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "5;3;4;4", 
"correctness": "3;3;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "81;70;76;56", "wc_summary_review": "45;47;22;40", "wc_main_review": "810;376;92;274", "wc_review": "936;493;190;370", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "3143;1218;773;216", "reply_reviewers": "0;0;0;0", "reply_authors": "6;2;2;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 70.75, 9.364160400164021 ], "wc_summary_review_avg": [ 38.5, 9.86154146165801 ], "wc_main_review_avg": [ 388.0, 264.0265138201086 ], "wc_review_avg": [ 497.25, 275.27928999472516 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1337.5, 1101.1962813231798 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.75, 1.920286436967152 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.39605901719066966, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17574747323819041261&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Purdue University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.purdue.edu;https://www.microsoft.com", "aff_unique_abbr": "Purdue;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Features with Parameter-Free Layers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7139", "id": "bCrdi4iVvv", "poster": "", "openreview": "https://openreview.net/forum?id=bCrdi4iVvv", "slides": "https://iclr.cc/virtual/2022/poster/7139", "video": "https://iclr.cc/virtual/2022/poster/7139", "author_site": "Dongyoon Han, YoungJoon Yoo, Beomyoung Kim, Byeongho Heo", "tldr": "", "abstract": "Trainable layers such as convolutional building blocks are the standard network design choices by learning parameters to capture the global context through successive spatial operations. When designing an efficient network, trainable layers such as the depthwise convolution is the source of efficiency in the number of parameters and FLOPs, but there was little improvement to the model speed in practice. This paper argues that simple built-in parameter-free operations can be a favorable alternative to the efficient trainable layers replacing spatial operations in a network architecture. We aim to break the stereotype of organizing the spatial operations of building blocks into trainable layers. Extensive experimental analyses based on layer-level studies with fully-trained models and neural architecture searches are provided to investigate whether parameter-free operations such as the max-pool are functional. The studies eventually give us a simple yet effective idea for redesigning network architectures, where the parameter-free operations are heavily used as the main building block without sacrificing the model accuracy as much. Experimental results on the ImageNet dataset demonstrate that the network architectures with parameter-free operations could enjoy the advantages of further efficiency in terms of model speed, the number of the parameters, and FLOPs. 
Code and ImageNet pretrained models are available at https://github.com/naver-ai/PfLayer.\n\n", "keywords": "ImageNet;efficient network architecture;network design;image classification", "primary_area": "", "supplementary_material": "", "author": "Dongyoon Han;YoungJoon Yoo;Beomyoung Kim;Byeongho Heo", "authorids": "~Dongyoon_Han1;~YoungJoon_Yoo1;~Beomyoung_Kim1;~Byeongho_Heo1", "gender": "M;M;M;M", "homepage": "https://dongyoonhan.github.io/;http://sites.google.com/view/yjyoo3312;https://beomyoung-kim.github.io/;https://sites.google.com/view/byeongho-heo/home", "dblp": "151/8876;146/4031;264/6398;142/2705", "google_scholar": "jcP7m1QAAAAJ;YGVqRuIAAAAJ;n_TR1LcAAAAJ;https://scholar.google.co.kr/citations?user=4_7rLDIAAAAJ", "orcid": "0000-0002-9130-8195;;;", "linkedin": "https://linkedin.com/in/dongyoon-han-04961a120/en;;beomyoung-kim/;byeongho-heo-1a7756122/", "or_profile": "~Dongyoon_Han1;~YoungJoon_Yoo1;~Beomyoung_Kim1;~Byeongho_Heo1", "aff": "NAVER;NAVER;NAVER CLOVA;NAVER AI Lab", "aff_domain": "navercorp.com;navercorp.com;navercorp.com;navercorp.com", "position": "Research Scientist;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nhan2022learning,\ntitle={Learning Features with Parameter-Free Layers},\nauthor={Dongyoon Han and YoungJoon Yoo and Beomyoung Kim and Byeongho Heo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bCrdi4iVvv}\n}", "github": "", "project": "", "reviewers": "PJUA;KvAV;DuSH;HgeU", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "58;87;75;49", "wc_summary_review": "137;85;22;7", "wc_main_review": "264;842;161;118", "wc_review": "459;1014;258;174", "wc_reply_reviewers": "0;78;0;0", "wc_reply_authors": "827;1561;1174;144", "reply_reviewers": "0;1;0;0", "reply_authors": "2;4;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 67.25, 14.737282653189496 ], "wc_summary_review_avg": [ 62.75, 51.90556328564406 ], "wc_main_review_avg": [ 346.25, 291.09652608713833 ], "wc_review_avg": [ 476.25, 327.2845665472174 ], "wc_reply_reviewers_avg": [ 19.5, 33.77499074759311 ], "wc_reply_authors_avg": [ 926.5, 521.0693331985677 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18180829610140817876&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=bCrdi4iVvv", "email": "navercorp.com;navercorp.com;navercorp.com;navercorp.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "NAVER Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.naver.com", "aff_unique_abbr": "NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "bE239PSGIGZ", "title": "Synthesising Audio Adversarial Examples for Automatic Speech Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial examples in automatic speech 
recognition (ASR) sound natural to humans yet are capable of fooling well-trained ASR models into transcribing incorrectly. Existing audio adversarial examples are typically constructed by adding constrained perturbations on benign audio inputs. Such attacks are therefore generated under an audio-dependent assumption. For the first time, we propose the Speech Synthesising based Attack (SSA), a novel threat model that constructs audio adversarial examples entirely from scratch (i.e., without depending on any existing audio) to fool cutting-edge ASR models. To this end, we introduce a conditional variational auto-encoder (CVAE) as the speech synthesiser. Meanwhile, an adaptive sign gradient descent algorithm is proposed to solve the adversarial audio synthesis task. Experiments on three datasets (i.e., Audio Mnist, Common Voice, and Librispeech) show that our method can synthesise audio adversarial examples that sound natural but mislead state-of-the-art ASR models. The project webpage containing generated audio demos is at https://sites.google.com/view/ssa-asr/home. ", "keywords": "Adversarial Attack;Speech Synthesise;Automatic Speech Recognition", "primary_area": "", "supplementary_material": "", "author": "Xinghua Qu;pengfei wei;Mingyong Gao;Zhu Sun;Yew-Soon Ong;Zejun MA", "authorids": "~Xinghua_Qu1;~pengfei_wei2;~Mingyong_Gao1;~Zhu_Sun1;~Yew-Soon_Ong1;~Zejun_MA1", "gender": "M;M;;F;M;M", "homepage": "https://xinghua-qu.github.io/;https://pengfei-wei.com/;;https://sites.google.com/view/zhusun/home;;http://www.ntu.edu.sg/home/asysong/", "dblp": "18/1099;29/11273-1;195/8137;163/5129-1.html;;64/4136", "google_scholar": "https://scholar.google.com.sg/citations?user=2PxlmU0AAAAJ;https://scholar.google.com.sg/citations?user=a94WthkAAAAJ;;https://scholar.google.com.sg/citations?user=kJy0fd8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.tw/citations?user=h9oWOsEAAAAJ", "orcid": "0000-0001-8072-2019;;;0000-0002-3350-7022;;0000-0002-4480-169X", "linkedin": "xinghua-qu/;;;;zejun-ma-58614365/;", "or_profile": "~Xinghua_Qu1;~pengfei_wei2;~Mingyong_Gao1;~Zhu_Sun1;~Zejun_MA1;~Yew_Soon_Ong1", "aff": "Bytedance Seed;AI LAB Bytedance;University of Science and Technology of China, Tsinghua University;Institute of High Performance Computing, Singapore, A*STAR;ByteDance Inc.;Nanyang Technological University", "aff_domain": "bytedance.com;bytedance.com;ustc.edu.cn;ihpc.a-star.edu.sg;bytedance.com;ntu.edu.sg", "position": "Research Scientist;Researcher;PhD student;Researcher;Principal Researcher;Full Professor", "bibtex": "@misc{\nqu2022synthesising,\ntitle={Synthesising Audio Adversarial Examples for Automatic Speech Recognition},\nauthor={Xinghua Qu and pengfei wei and Mingyong Gao and Zhu Sun and Yew-Soon Ong and Zejun MA},\nyear={2022},\nurl={https://openreview.net/forum?id=bE239PSGIGZ}\n}", "github": "", "project": "", "reviewers": "4HAP;Hbzp;aMq5;Krqb", "site": "https://openreview.net/forum?id=bE239PSGIGZ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "71;69;100;90", "wc_summary_review": "60;87;88;11", "wc_main_review": "328;582;415;212", "wc_review": "459;738;603;313", "wc_reply_reviewers": "231;0;0;25", "wc_reply_authors": "1292;890;1001;278", "reply_reviewers": "1;0;0;1", "reply_authors": "3;2;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25,
0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 82.5, 13.009611831257688 ], "wc_summary_review_avg": [ 61.5, 31.244999599935987 ], "wc_main_review_avg": [ 384.25, 134.98587889108992 ], "wc_review_avg": [ 528.25, 158.67478533150754 ], "wc_reply_reviewers_avg": [ 64.0, 96.95617566715387 ], "wc_reply_authors_avg": [ 865.25, 369.4654077176915 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=95502913691872848&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;2;0;3", "aff_unique_norm": "ByteDance;University of Science and Technology of China;Institute of High Performance Computing;Nanyang Technological University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.bytedance.com;http://www.ustc.edu.cn/;https://www.ihpc.a-star.edu.sg;https://www.ntu.edu.sg", "aff_unique_abbr": "Bytedance;USTC;IHPC;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Policy improvement by planning with Gumbel", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6418", "id": "bERaNdoegnO", "poster": "", "openreview": "https://openreview.net/forum?id=bERaNdoegnO", "slides": "https://iclr.cc/virtual/2022/poster/6418", "video": "https://iclr.cc/virtual/2022/poster/6418", "author_site": "Ivo Danihelka, Arthur Guez, Julian Schrittwieser, David Silver", "tldr": "", "abstract": "AlphaZero is a powerful reinforcement learning algorithm based on approximate policy iteration and tree search. However, AlphaZero can fail to improve its policy network, if not visiting all actions at the root of a search tree. To address this issue, we propose a policy improvement algorithm based on sampling actions without replacement. Furthermore, we use the idea of policy improvement to replace the more heuristic mechanisms by which AlphaZero selects and uses actions, both at root nodes and at non-root nodes. 
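The sampling-without-replacement step rests on the well-known Gumbel-Top-k trick: perturbing the policy logits with Gumbel(0, 1) noise and taking the top-k indices draws k distinct actions from the softmax policy. A short sketch of just that trick (not the full search algorithm), assuming numpy:

```python
import numpy as np

rng = np.random.default_rng(0)
logits = np.array([2.0, 1.0, 0.5, 0.0, -1.0])  # root policy logits
k = 3

g = rng.gumbel(size=logits.shape)              # Gumbel(0, 1) noise per action
topk = np.argsort(-(logits + g))[:k]           # k distinct actions, no replacement
print("root actions to search:", topk)
```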
Our new algorithms, Gumbel AlphaZero and Gumbel MuZero, respectively without and with model-learning, match the state of the art on Go, chess, and Atari, and significantly improve prior performance when planning with few simulations.", "keywords": "AlphaZero;MuZero;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ivo Danihelka;Arthur Guez;Julian Schrittwieser;David Silver", "authorids": "~Ivo_Danihelka1;~Arthur_Guez1;~Julian_Schrittwieser1;~David_Silver1", "gender": "M;M;;", "homepage": ";https://www.gatsby.ucl.ac.uk/~aguez/;http://www.furidamu.org;", "dblp": "26/2791;;;34/3601", "google_scholar": "https://scholar.google.co.uk/citations?user=1TTFBEkAAAAJ;https://scholar.google.co.uk/citations?user=iyD9aw8AAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ivo_Danihelka1;~Arthur_Guez1;~Julian_Schrittwieser1;~David_Silver1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind", "aff_domain": "deepmind.com;google.com;deepmind.com;deepmind.com", "position": "Research Scientist;Research Scientist;Researcher;Full Professor", "bibtex": "@inproceedings{\ndanihelka2022policy,\ntitle={Policy improvement by planning with Gumbel},\nauthor={Ivo Danihelka and Arthur Guez and Julian Schrittwieser and David Silver},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bERaNdoegnO}\n}", "github": "", "project": "", "reviewers": "Qmbr;Qbi3;dky3;urs9", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "4;3;4;3", "wc_summary_paper": "60;111;66;117", "wc_summary_review": "86;46;46;35", "wc_main_review": "900;761;618;302", "wc_review": "1046;918;730;454", "wc_reply_reviewers": "0;13;26;0", "wc_reply_authors": "135;235;341;121", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 88.5, 25.675864152935535 ], "wc_summary_review_avg": [ 53.25, 19.434183800715687 ], "wc_main_review_avg": [ 645.25, 221.84383583953826 ], "wc_review_avg": [ 787.0, 222.69934889891348 ], "wc_reply_reviewers_avg": [ 9.75, 10.779030568655049 ], "wc_reply_authors_avg": [ 208.0, 88.48163651289458 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7251499641538462070&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=bERaNdoegnO", "email": "deepmind.com;google.com;deepmind.com;deepmind.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "bHqI0DvSIId", "title": "Neural Simulated Annealing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Simulated annealing (SA) is a stochastic global optimisation technique applicable to a wide range of discrete and continuous 
variable problems. Despite its simplicity, the development of an effective SA optimiser for a given problem hinges on a handful of carefully handpicked components; namely, neighbour proposal distribution and temperature annealing schedule. In this work, we view SA from a reinforcement learning perspective and frame the proposal distribution as a policy, which can be optimised for higher solution quality given a fixed computational budget. We demonstrate that this Neural SA with such a learnt proposal distribution outperforms SA baselines with hand-selected parameters on a number of problems: Rosenbrock's function, the Knapsack problem, the Bin Packing problem, and the Travelling Salesperson problem. We also show that Neural SA scales well to large problems while again outperforming popular off-the-shelf solvers in terms of solution quality and wall clock time.", "keywords": "Combinatorial Optimization;Reinforcement Learning;Evolution Strategies;Simulated Annealing", "primary_area": "", "supplementary_material": "", "author": "Alvaro Correia;Daniel E. Worrall;Roberto Bondesan", "authorids": "~Alvaro_Correia1;~Daniel_E._Worrall1;~Roberto_Bondesan1", "gender": "M;M;M", "homepage": "https://danielewworrall.github.io/;https://www.imperial.ac.uk/people/r.bondesan;https://alcorreia.github.io/", "dblp": "187/1680;242/9104;222/2873", "google_scholar": "613GPbQAAAAJ;l2z7p3oAAAAJ;E9h9QKEAAAAJ", "orcid": ";;0000-0001-5291-0653", "linkedin": "daniel-worrall-46a43238/;;", "or_profile": "~Daniel_E._Worrall1;~Roberto_Bondesan1;~Alvaro_Henrique_Chaim_Correia1", "aff": "Qualcomm Inc, QualComm;Qualcomm AI Research;Eindhoven University of Technology", "aff_domain": "qti.qualcomm.com;qualcomm.com;tue.nl", "position": "Postdoc;Deep Learning Research Engineer;PhD student", "bibtex": "@misc{\ncorreia2022neural,\ntitle={Neural Simulated Annealing},\nauthor={Alvaro Correia and Daniel E. 
Worrall and Roberto Bondesan},\nyear={2022},\nurl={https://openreview.net/forum?id=bHqI0DvSIId}\n}", "github": "", "project": "", "reviewers": "4KAJ;C2r3;cPBg;HRDu", "site": "https://openreview.net/forum?id=bHqI0DvSIId", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "63;166;105;266", "wc_summary_review": "38;137;47;139", "wc_main_review": "206;844;203;387", "wc_review": "307;1147;355;792", "wc_reply_reviewers": "0;577;8;116", "wc_reply_authors": "463;2661;792;1063", "reply_reviewers": "0;2;1;1", "reply_authors": "1;4;1;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 150.0, 76.33151380655306 ], "wc_summary_review_avg": [ 90.25, 47.861127232859864 ], "wc_main_review_avg": [ 410.0, 261.41442194339623 ], "wc_review_avg": [ 650.25, 343.45551021930044 ], "wc_reply_reviewers_avg": [ 175.25, 236.4311474827291 ], "wc_reply_authors_avg": [ 1244.75, 844.8243530462412 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.7385489458759963, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8620156498699286178&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Qualcomm Incorporated;Qualcomm;Eindhoven University of Technology", "aff_unique_dep": ";Qualcomm AI Research;", "aff_unique_url": "https://www.qualcomm.com;https://www.qualcomm.com/research;https://www.tue.nl", "aff_unique_abbr": "Qualcomm;QAI;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Netherlands" }, { "id": "bM45i3LQBdl", "title": "Combining Differential Privacy and Byzantine Resilience in Distributed SGD", "track": "main", "status": "Reject", "tldr": "", "abstract": "Privacy and Byzantine resilience (BR) are two crucial requirements of modern-day distributed machine learning. The two concepts have been extensively studied individually but the question of how to combine them effectively remains unanswered. This paper contributes to addressing this question by studying the extent to which the distributed SGD algorithm, in the standard parameter-server architecture, can learn an accurate model despite (a) a fraction of the workers being malicious (Byzantine), and (b) the other fraction, whilst being honest, providing noisy information to the server to ensure differential privacy (DP). We first observe that the integration of standard practices in DP and BR is not straightforward. In fact, we show that many existing results on the convergence of distributed SGD under Byzantine faults, especially those relying on $(\\alpha,f)$-Byzantine resilience, are rendered invalid when honest workers enforce DP. To circumvent this shortcoming, we revisit the theory of $(\\alpha,f)$-BR to obtain an approximate convergence guarantee. Our analysis provides key insights on how to improve this guarantee through hyperparameter optimization. 
Essentially, our theoretical and empirical results show that (1) an imprudent combination of standard approaches to DP and BR might be fruitless, but (2) by carefully re-tuning the learning algorithm, we can obtain reasonable learning accuracy while simultaneously guaranteeing DP and BR. ", "keywords": "Distributed SGD;Byzantine resilience", "primary_area": "", "supplementary_material": "/attachment/cd6103053c152ef50bbb62e403a8a8dd5e75140f.zip", "author": "Rachid Guerraoui;Nirupam Gupta;Rafael Pinot;S\u00e9bastien Rouault;John Stephan", "authorids": "~Rachid_Guerraoui1;~Nirupam_Gupta1;~Rafael_Pinot1;~S\u00e9bastien_Rouault1;~John_Stephan1", "gender": "M;;;M;", "homepage": "https://lpdwww.epfl.ch/rachid/;;;https://sebastien.rouau.lt;", "dblp": "g/RachidGuerraoui;;;203/8639;", "google_scholar": ";;;5pSk6VAAAAAJ;I58gv9UAAAAJ", "orcid": ";;;;", "linkedin": ";;;;john-stephan-b3353159/", "or_profile": "~Rachid_Guerraoui1;~Nirupam_Gupta1;~Rafael_Pinot1;~S\u00e9bastien_Rouault1;~John_Stephan1", "aff": ";;;Calicarpa;EPFL", "aff_domain": ";;;calicarpa.com;epfl.ch", "position": ";;;CTO;PhD student", "bibtex": "@misc{\nguerraoui2022combining,\ntitle={Combining Differential Privacy and Byzantine Resilience in Distributed {SGD}},\nauthor={Rachid Guerraoui and Nirupam Gupta and Rafael Pinot and S{\\'e}bastien Rouault and John Stephan},\nyear={2022},\nurl={https://openreview.net/forum?id=bM45i3LQBdl}\n}", "github": "", "project": "", "reviewers": "NXde;Vukg;RoTT;tyNc", "site": "https://openreview.net/forum?id=bM45i3LQBdl", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "3;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "82;107;65;40", "wc_summary_review": "53;45;14;63", "wc_main_review": "630;406;61;95", "wc_review": "765;558;140;198", "wc_reply_reviewers": "21;408;0;0", "wc_reply_authors": "1143;653;131;138", "reply_reviewers": "1;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.5, 24.438698819699873 ], "wc_summary_review_avg": [ 43.75, 18.32177666057525 ], "wc_main_review_avg": [ 298.0, 234.12923781535702 ], "wc_review_avg": [ 415.25, 257.71241239024556 ], "wc_reply_reviewers_avg": [ 107.25, 173.8496117338201 ], "wc_reply_authors_avg": [ 516.25, 419.22748896034955 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1", "aff_unique_norm": "Calicarpa;EPFL", "aff_unique_dep": ";", "aff_unique_url": ";https://www.epfl.ch", "aff_unique_abbr": ";EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";Switzerland" }, { "id": "bM5L3GLi6bG", "title": "Open Set Domain Adaptation with Zero-shot Learning on Graph", "track": "main", "status": "Reject", "tldr": "", "abstract": "Open set domain adaptation focuses on transferring the information from a richly labeled domain called \\emph{source domain} to a scarcely labeled domain called \\emph{target domain} while classifying the unseen target samples as one \\emph{unknown} class in an unsupervised way. 
Compared with the close set domain adaptation, where the source domain and the target domain share the same class space, the classification of the unknown class makes it easy to adapt to the realistic environment. Particularly, after the recognition of the unknown samples, the robot can either ask for manually labeling or further develop the classification ability of the unknown classes based on pre-stored knowledge. Inspired by this idea, in this paper we propose a model for open set domain adaptation with zero-shot learning on the unknown classes. We utilize adversarial learning to align the two domains while rejecting the unknown classes. Then the knowledge graph is introduced to generate the classifiers for the unknown classes with the employment of the graph convolution network (GCN). Thus the classification ability of the source domain is transferred to the target domain and the model can distinguish the unknown classes with prior knowledge. We evaluate our model on digits datasets and the result shows superior performance.", "keywords": "open set domain adaptation;zero-shot learning;knowledge graph;graph convolutional network;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "Xinyue Zhang;Xu Yang;Zhi-yong Liu", "authorids": "~Xinyue_Zhang1;~Xu_Yang1;~Zhi-yong_Liu1", "gender": "F;M;M", "homepage": ";http://people.ucas.ac.cn/~XuYang;https://people.ucas.ac.cn/~liuzhiyong", "dblp": ";63/1534-4.html;16/5205-1.html", "google_scholar": ";https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0003-3152-8900;0000-0003-0553-4581;", "linkedin": ";;", "or_profile": "~Xinyue_Zhang1;~Xu_Yang1;~Zhi-yong_Liu1", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of Automation of Chinese academy of science;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;ia.ac.cn", "position": "MS student;Associate Professor;Full Professor", "bibtex": "@misc{\nzhang2022open,\ntitle={Open Set Domain Adaptation with Zero-shot Learning on Graph},\nauthor={Xinyue Zhang and Xu Yang and Zhi-yong Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=bM5L3GLi6bG}\n}", "github": "", "project": "", "reviewers": "fmyC;ymSC;4Ng6", "site": "https://openreview.net/forum?id=bM5L3GLi6bG", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;4", "correctness": "3;2;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;0;4", "wc_summary_paper": "60;118;59", "wc_summary_review": "69;25;26", "wc_main_review": "606;174;250", "wc_review": "735;317;335", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.632993161855452 ], "wc_summary_paper_avg": [ 79.0, 27.58018612458347 ], "wc_summary_review_avg": [ 40.0, 20.51016008388688 ], "wc_main_review_avg": [ 343.3333333333333, 188.30707781587913 ], "wc_review_avg": [ 462.3333333333333, 192.9444364462359 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.5, 
"gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1035697092170605472&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Institute of Automation", "aff_unique_url": "http://www.ia.cas.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "bOcUqfdH3S8", "title": "Provably Calibrated Regression Under Distribution Drift", "track": "main", "status": "Reject", "tldr": "", "abstract": "Accurate uncertainty quantification is a key building block of trustworthy machine learning systems. Uncertainty is typically represented by probability distributions over the possible outcomes, and these probabilities should be calibrated, \\textit{e.g}. the 90\\% credible interval should contain the true outcome 90\\% of the times. In the online prediction setup, existing conformal methods can provably achieve calibration assuming no distribution shift; however, the assumption is difficult to verify, and unlikely to hold in many applications such as time series prediction. Inspired by control theory, we propose a prediction algorithm that guarantees calibration even under distribution shift, and achieves strong performance on metrics such as sharpness and proper scores. We compare our method with baselines on 19 time-series and regression datasets, and our method achieves approximately 2x reduction in calibration error, comparable sharpness, and improved downstream decision utility. ", "keywords": "calibration;online prediction;distribution shift;uncertainty quantification", "primary_area": "", "supplementary_material": "", "author": "Shengjia Zhao;YUSUKE TASHIRO;Danny Tse;Stefano Ermon", "authorids": "~Shengjia_Zhao1;~YUSUKE_TASHIRO1;~Danny_Tse1;~Stefano_Ermon1", "gender": "M;;M;M", "homepage": "http://sjzhao.me;;;http://cs.stanford.edu/~ermon/", "dblp": "173/5122;;;47/8135", "google_scholar": ";oxUYUmEAAAAJ;;", "orcid": ";;;", "linkedin": ";;danny-tse/;", "or_profile": "~Shengjia_Zhao1;~YUSUKE_TASHIRO1;~Danny_Tse1;~Stefano_Ermon1", "aff": "Stanford University;Mitsubishi UFJ Trust Investment Technology Institute;Stanford University;Stanford University", "aff_domain": "stanford.edu;mtec-institute.co.jp;stanford.edu;stanford.edu", "position": "PhD student;Researcher;Undergrad student;Assistant Professor", "bibtex": "@misc{\nzhao2022provably,\ntitle={Provably Calibrated Regression Under Distribution Drift},\nauthor={Shengjia Zhao and YUSUKE TASHIRO and Danny Tse and Stefano Ermon},\nyear={2022},\nurl={https://openreview.net/forum?id=bOcUqfdH3S8}\n}", "github": "", "project": "", "reviewers": "Kb8i;3r9U;DfEt;Lt92", "site": "https://openreview.net/forum?id=bOcUqfdH3S8", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;4;3;3", "correctness": "3;3;2;2", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "76;19;102;77", "wc_summary_review": "40;31;33;137", "wc_main_review": "348;110;317;274", "wc_review": "464;160;452;488", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.5, 30.41792234851026 ], "wc_summary_review_avg": [ 60.25, 
44.437456047798236 ], "wc_main_review_avg": [ 262.25, 91.74523148371254 ], "wc_review_avg": [ 391.0, 133.99626860476377 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DSrX_ehrn5sJ:scholar.google.com/&scioq=Provably+Calibrated+Regression+Under+Distribution+Drift&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Stanford University;Mitsubishi UFJ Trust and Banking Corporation", "aff_unique_dep": ";Investment Technology Institute", "aff_unique_url": "https://www.stanford.edu;https://www.mufg.jp", "aff_unique_abbr": "Stanford;MUTB", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Japan" }, { "id": "bPadTQyLb2_", "title": "Efficient representations for privacy-preserving inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks have a wide range of applications across multiple domains such as computer vision and medicine. In many cases, the input of a model at inference time can consist of sensitive user data, which raises questions concerning the levels of privacy and trust guaranteed by such services. Much existing work has leveraged homomorphic encryption (HE) schemes that enable computation on encrypted data to achieve private inference for multi-layer perceptrons and CNNs. An early work along this direction was CryptoNets, which takes 250 seconds for one MNIST inference. The main limitation of such approaches is that of compute, which is due to the costly nature of the NTT (number theoretic transform) operations that constitute HE operations. Others have proposed the use of model pruning and efficient data representations to reduce the number of HE operations required. In this paper, we focus on improving upon existing work by proposing changes to the representations of intermediate tensors during CNN inference. 
We construct and evaluate private CNNs on the MNIST and CIFAR-10 datasets, and achieve over a two-fold reduction in the number of operations used for inferences of the CryptoNets architecture.", "keywords": "Deep neural networks;Homomorphic encryption", "primary_area": "", "supplementary_material": "", "author": "Han Xuanyuan;Francisco Vargas;Stephen Cummins", "authorids": "~Han_Xuanyuan1;~Francisco_Vargas1;sac92@cam.ac.uk", "gender": "M;M;", "homepage": "https://github.com/xuyhan;;", "dblp": ";79/7431-1;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Han_Xuanyuan1;~Francisco_Vargas1;sac92@cam.ac.uk", "aff": "University of Cambridge;University of Cambridge;", "aff_domain": "cam.ac.uk;cam.ac.uk;", "position": "MS student;PhD student;", "bibtex": "@misc{\nxuanyuan2022efficient,\ntitle={Efficient representations for privacy-preserving inference},\nauthor={Han Xuanyuan and Francisco Vargas and Stephen Cummins},\nyear={2022},\nurl={https://openreview.net/forum?id=bPadTQyLb2_}\n}", "github": "", "project": "", "reviewers": "9nHn;7wWC;uGUc;gqrN", "site": "https://openreview.net/forum?id=bPadTQyLb2_", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;2;3;3", "correctness": "2;3;4;4", "technical_novelty": "1;1;2;3", "empirical_novelty": "1;0;2;3", "wc_summary_paper": "74;45;103;19", "wc_summary_review": "70;32;45;15", "wc_main_review": "121;386;143;36", "wc_review": "265;463;291;70", "wc_reply_reviewers": "0;58;11;0", "wc_reply_authors": "904;704;280;23", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 60.25, 31.427495923156208 ], "wc_summary_review_avg": [ 40.5, 20.081085628023203 ], "wc_main_review_avg": [ 171.5, 130.12782177536056 ], "wc_review_avg": [ 272.25, 139.36889000060236 ], "wc_reply_reviewers_avg": [ 17.25, 23.951774464536026 ], "wc_reply_authors_avg": [ 477.75, 345.9699228256699 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5726562866782, "corr_recommendation_correctness": 0.9198662110077999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ISMOYiIBrvcJ:scholar.google.com/&scioq=Efficient+representations+for+privacy-preserving+inference&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "bRbZoK2HQw8", "title": "Attention-based Interpretation and Response to The Trade-Off of Adversarial Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "To boost the robustness of a model against adversarial examples, adversarial training has been regarded as a benchmark method. However, it is commonly considered to be easily suffering from the trade-off dilemma between robustness and generalization in practice. This paper tries to make an intuitive explanation for this phenomenon in the perspective of model attention and provides an attention expansion viewpoint to learn a reliable model. 
To be specific, we argue that adversarial training does enable one model to concentrate on exact semantic information of input, which is beneficial to avoid adversarial accumulation. But it also easily make the model to cover fewer spatial region so that the model usually ignores some inherent features of the input. This may be one main reason to result in weak generalization on unseen inputs. To address this issue, we propose an Attention-Extended Learning Framework (AELF) built on the cascade structure of deep models. AELF advocates that clean high-level features (from natural inputs) are used to guide the robustness learning rather than hand-crafted labels, so as to ensure broad spatial attention of model to input space. In addition, we provide a very simple solution to implement AELF under the efficient softmax-based training manner, which avoids checking the difference between high-dimensional embedding vectors via additional regularization loss. Experimental observations verify the rationality of our interpretation, and remarkable improvements on multiple datasets also demonstrate the superiority of AELF.", "keywords": "Adversarial training;Trade-off", "primary_area": "", "supplementary_material": "", "author": "Changbin Shao;Wenbin Li;Zhenhua Feng;Jing Huo;Yang Gao", "authorids": "~Changbin_Shao1;~Wenbin_Li5;~Zhenhua_Feng1;~Jing_Huo2;~Yang_Gao3", "gender": "M;M;M;F;M", "homepage": ";https://cs.nju.edu.cn/liwenbin/;https://ai.jiangnan.edu.cn/info/1013/3771.htm;https://cs.nju.edu.cn/huojing;https://cs.nju.edu.cn/gaoyang/", "dblp": "99/9413.html;27/1736-6.html;348/7584-1;38/9090;89/4402-1", "google_scholar": "VLko2nYAAAAJ;K-kC4yYAAAAJ;Y6KtijIAAAAJ;HKK1BdgAAAAJ;https://scholar.google.com.tw/citations?user=CJwLwzQAAAAJ", "orcid": ";;0000-0002-4485-4249;0000-0002-8504-455X;", "linkedin": ";;;;", "or_profile": "~Changbin_Shao1;~Wenbin_Li5;~Zhenhua_Feng1;~Jing_Huo2;~Yang_Gao3", "aff": "Jiangsu University of Science and Technology;Nanjing University;University of Surrey;Nanjing University;Nanjing University", "aff_domain": "just.edu;nju.edu.cn;surrey.ac.uk;nju.edu.cn;nju.edu.cn", "position": "Associate Professor;Assistant Professor;Lecturer;Associate Professor;Full Professor", "bibtex": "@misc{\nshao2022attentionbased,\ntitle={Attention-based Interpretation and Response to The Trade-Off of Adversarial Training},\nauthor={Changbin Shao and Wenbin Li and Zhenhua Feng and Jing Huo and Yang Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=bRbZoK2HQw8}\n}", "github": "", "project": "", "reviewers": "MRRz;9UHh;x48d", "site": "https://openreview.net/forum?id=bRbZoK2HQw8", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;5;3", "correctness": "2;4;2", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "84;155;44", "wc_summary_review": "39;24;97", "wc_main_review": "169;230;417", "wc_review": "292;409;558", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 94.33333333333333, 45.900859348043674 ], "wc_summary_review_avg": [ 53.333333333333336, 31.47838764754143 ], "wc_main_review_avg": [ 272.0, 105.51145277488442 ], "wc_review_avg": [ 419.6666666666667, 108.85566386530174 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.5000000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rolshJiUftcJ:scholar.google.com/&scioq=Attention-based+Interpretation+and+Response+to+The+Trade-Off+of+Adversarial+Training&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Jiangsu University of Science and Technology;Nanjing University;University of Surrey", "aff_unique_dep": ";;", "aff_unique_url": "http://www.just.edu.cn/;https://www.nju.edu.cn;https://www.surrey.ac.uk", "aff_unique_abbr": "JUST;Nanjing U;Surrey", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Evaluating Distributional Distortion in Neural Language Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5993", "id": "bTteFbU99ye", "poster": "", "openreview": "https://openreview.net/forum?id=bTteFbU99ye", "slides": "https://iclr.cc/virtual/2022/poster/5993", "video": "https://iclr.cc/virtual/2022/poster/5993", "author_site": "Benjamin LeBrun, Alessandro Sordoni, Timothy O'Donnell", "tldr": "", "abstract": "A fundamental characteristic of natural language is the high rate at which speakers produce novel expressions. Because of this novelty, a heavy-tail of rare events accounts for a significant amount of the total probability mass of distributions in language (Baayen, 2001). Standard language modeling metrics such as perplexity quantify the performance of language models (LM) in aggregate. As a result, we have relatively little understanding of whether neural LMs accurately estimate the probability of sequences in this heavy-tail of rare events. To address this gap, we develop a controlled evaluation scheme which uses generative models trained on natural data as artificial languages from which we can exactly compute sequence probabilities. Training LMs on generations from these artificial languages, we compare the sequence-level probability estimates given by LMs to the true probabilities in the target language. Our experiments reveal that LSTM and Transformer language models (i) systematically underestimate the probability of sequences drawn from the target language, and (ii) do so more severely for less-probable sequences. Investigating where this probability mass went, (iii) we find that LMs tend to overestimate the probability of ill formed (perturbed) sequences. In addition, we find that this underestimation behaviour (iv) is weakened, but not eliminated by greater amounts of training data, and (v) is exacerbated for target distributions with lower entropy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Benjamin LeBrun;Alessandro Sordoni;Timothy J. 
O'Donnell", "authorids": "~Benjamin_LeBrun1;~Alessandro_Sordoni2;~Timothy_J._O'Donnell1", "gender": ";;M", "homepage": "https://benlebrun.github.io/;https://mcqll.org/;", "dblp": ";89/3188;57/7642", "google_scholar": ";iYjXhYwAAAAJ;", "orcid": ";0000-0002-5711-977X;", "linkedin": ";timothy-o-donnell-698b1b3/;", "or_profile": "~Benjamin_LeBrun1;~Timothy_J._O'Donnell1;~Alessandro_Sordoni1", "aff": "McGill University, McGill University;McGill University, Mila;Microsoft", "aff_domain": "mail.mcgill.ca;mcgill.ca;microsoft.com", "position": "MS student;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nlebrun2022evaluating,\ntitle={Evaluating Distributional Distortion in Neural Language Modeling},\nauthor={Benjamin LeBrun and Alessandro Sordoni and Timothy J. O'Donnell},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bTteFbU99ye}\n}", "github": "", "project": "", "reviewers": "Msuw;AtM6;V22X", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;4;5", "correctness": "3;3;4", "technical_novelty": "2;3;4", "empirical_novelty": "3;2;4", "wc_summary_paper": "131;86;124", "wc_summary_review": "71;92;41", "wc_main_review": "402;983;171", "wc_review": "604;1161;336", "wc_reply_reviewers": "31;22;0", "wc_reply_authors": "557;822;47", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 113.66666666666667, 19.770910168449223 ], "wc_summary_review_avg": [ 68.0, 20.92844953645635 ], "wc_main_review_avg": [ 518.6666666666666, 341.60828769545714 ], "wc_review_avg": [ 700.3333333333334, 343.6241486792737 ], "wc_reply_reviewers_avg": [ 17.666666666666668, 13.021349989749739 ], "wc_reply_authors_avg": [ 475.3333333333333, 321.61916747745136 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=949734267120846607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=bTteFbU99ye", "email": "mail.mcgill.ca;mcgill.ca;microsoft.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "McGill University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.mcgill.ca;https://www.microsoft.com", "aff_unique_abbr": "McGill;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "id": "bUAdXW8wN6", "title": "Domain Invariant Adversarial Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The phenomenon of adversarial examples illustrates one of the most basic vulnerabilities of deep neural networks. Among the variety of techniques introduced to surmount this inherent weakness, adversarial training has emerged as the most effective strategy to achieve robustness. Typically, this is achieved by balancing robust and natural objectives. 
In this work, we aim to further reduce the trade-off between robust and standard accuracy by enforcing a domain-invariant feature representation. We present a new adversarial training method, Domain Invariant Adversarial Learning (DIAL), which learns a feature representation which is both robust and domain invariant. DIAL uses a variant of Domain Adversarial Neural Network (DANN) on the natural domain and its corresponding adversarial domain. In a case where the source domain consists of natural examples and the target domain is the adversarially perturbed examples, our method learns a feature representation constrained not to discriminate between the natural and adversarial examples, and can therefore achieve a more robust representation. Our experiments indicate that our method improves both robustness and standard accuracy, when compared to other state-of-the-art adversarial training methods.", "keywords": "adversarial Training;Robustness;Domain-invariant representation;domain adaptation", "primary_area": "", "supplementary_material": "/attachment/74b3fdc498751f842f2d5a52dfe12b701bf46188.zip", "author": "Matan Levi;Idan Attias;Aryeh Kontorovich", "authorids": "~Matan_Levi1;~Idan_Attias1;~Aryeh_Kontorovich1", "gender": "M;M;", "homepage": ";https://www.idanattias.com;http://www.cs.bgu.ac.il/~karyeh/", "dblp": "209/9793;228/6803;20/10289", "google_scholar": "https://scholar.google.co.il/citations?user=c96qB5UAAAAJ;-L6uUy0AAAAJ;https://scholar.google.co.il/citations?user=UNVQ5DsAAAAJ", "orcid": "0000-0003-0716-2929;;", "linkedin": "matan-levi-582106117/;;prof-aryeh-kontorovich-7b236055/", "or_profile": "~Matan_Levi1;~Idan_Attias1;~Aryeh_Kontorovich1", "aff": "Ben Gurion University of the Negev;Tel Aviv University;Ben Gurion University of the Negev", "aff_domain": "bgu.ac.il;tau.ac.il;bgu.ac.il", "position": "PhD student;PhD student;Professor", "bibtex": "@misc{\nlevi2022domain,\ntitle={Domain Invariant Adversarial Learning},\nauthor={Matan Levi and Idan Attias and Aryeh Kontorovich},\nyear={2022},\nurl={https://openreview.net/forum?id=bUAdXW8wN6}\n}", "github": "", "project": "", "reviewers": "FZTb;8wyr;FjGF;iRX5", "site": "https://openreview.net/forum?id=bUAdXW8wN6", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "57;32;68;208", "wc_summary_review": "56;34;48;73", "wc_main_review": "136;34;143;255", "wc_review": "249;100;259;536", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "299;206;421;300", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 91.25, 68.65629978377804 ], "wc_summary_review_avg": [ 52.75, 14.095655359010449 ], "wc_main_review_avg": [ 142.0, 78.2144487930459 ], "wc_review_avg": [ 286.0, 157.47539490345787 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 306.5, 76.33642642932665 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49374193110101877, "corr_recommendation_correctness": 0.49374193110101877, "gs_citation": 14, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12447175214192233682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Ben Gurion University of the Negev;Tel Aviv University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bgu.ac.il;https://www.tau.ac.il", "aff_unique_abbr": "BGU;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "id": "bUKyC0UiZcr", "title": "Temporal abstractions-augmented temporally contrastive learning: an alternative to the Laplacian in RL", "track": "main", "status": "Reject", "tldr": "", "abstract": "In reinforcement learning (RL), the graph Laplacian has proved to be a valuable tool in the task-agnostic setting, with applications ranging from option discovery to dynamics-aware metric learning. Conveniently, learning the Laplacian representation has recently been framed as the optimization of a temporally-contrastive objective to overcome its computational limitations in large or even continuous state spaces (Wu et al., 2019). However, this approach relies on a uniform access to the state space S, and overlooks the exploration problem that emerges during the representation learning process. In this work, we reconcile such representation learning with exploration in a non-uniform prior setting, while recovering the expressive potential afforded by a uniform prior. Our approach leverages the learned representation to build a skill-based covering policy which in turn provides a better training distribution to extend and refine the representation. We also propose to integrate temporal abstractions captured by the learned skills into the representation, which encourages exploration and improves the representation\u2019s dynamics-awareness. We find that our method scales better to challenging environments, and that the learned skills can solve difficult continuous navigation tasks with sparse rewards, where standard skill discovery methods are limited.", "keywords": "Representation learning;Laplacian;self-supervised;exploration", "primary_area": "", "supplementary_material": "", "author": "Akram Erraqabi;Marlos C. 
Machado;Harry Zhao;Sainbayar Sukhbaatar;Ludovic Denoyer;Alessandro Lazaric;Yoshua Bengio", "authorids": "~Akram_Erraqabi1;~Marlos_C._Machado1;~Harry_Zhao1;~Sainbayar_Sukhbaatar1;~Ludovic_Denoyer1;~Alessandro_Lazaric2;~Yoshua_Bengio1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://webdocs.cs.ualberta.ca/~machado/;http://cs.mcgill.ca/~mzhao36;;;;http://yoshuabengio.org", "dblp": "https://dblp.org/pers/e/Erraqabi:Akram.html;21/10949;;56/10550;54/5551;36/321;56/953", "google_scholar": ";https://scholar.google.ca/citations?user=xf_n4xUAAAAJ;TZh189YAAAAJ;ri1sE34AAAAJ;9PLqulwAAAAJ;6JZ3R6wAAAAJ;kukA0LcAAAAJ", "orcid": ";;;;;;", "linkedin": ";cholodovskis/;harry-zhao-992a16106/;;;;yoshuabengio/?originalSubdomain=ca", "or_profile": "~Akram_Erraqabi1;~Marlos_C._Machado1;~Harry_Zhao1;~Sainbayar_Sukhbaatar1;~Ludovic_Denoyer1;~Alessandro_Lazaric2;~Yoshua_Bengio1", "aff": "University of Montreal;University of Alberta;McGill University;Meta Facebook;Meta Facebook;Meta Facebook;University of Montreal", "aff_domain": "umontreal.ca;ualberta.ca;mail.mcgill.ca;fb.com;fb.com;fb.com;umontreal.ca", "position": "PhD student;Adjunct Professor;PhD student;Research Scientist;Research Scientist;Research Scientist;Full Professor", "bibtex": "@misc{\nerraqabi2022temporal,\ntitle={Temporal abstractions-augmented temporally contrastive learning: an alternative to the Laplacian in {RL}},\nauthor={Akram Erraqabi and Marlos C. Machado and Harry Zhao and Sainbayar Sukhbaatar and Ludovic Denoyer and Alessandro Lazaric and Yoshua Bengio},\nyear={2022},\nurl={https://openreview.net/forum?id=bUKyC0UiZcr}\n}", "github": "", "project": "", "reviewers": "XcUD;73Mt;2jVQ", "site": "https://openreview.net/forum?id=bUKyC0UiZcr", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "143;67;49", "wc_summary_review": "90;25;44", "wc_main_review": "297;227;742", "wc_review": "530;319;835", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 86.33333333333333, 40.737642979872504 ], "wc_summary_review_avg": [ 53.0, 27.28858125052797 ], "wc_main_review_avg": [ 422.0, 228.07162617622268 ], "wc_review_avg": [ 561.3333333333334, 211.81805609741792 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13275347078784894756&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3;3;3;0", "aff_unique_norm": "University of Montreal;University of Alberta;McGill University;Meta", "aff_unique_dep": ";;;Meta Platforms, Inc.", "aff_unique_url": "https://www.umontreal.ca;https://www.ualberta.ca;https://www.mcgill.ca;https://meta.com", "aff_unique_abbr": "UM;UAlberta;McGill;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;1;0", "aff_country_unique": "Canada;United States" }, { "id": "bUi8963hi5l", "title": "Calibrating Probabilistic Embeddings for
Cross-Modal Retrieval", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The core of cross-modal retrieval is to measure the content similarity between data of different modalities. The main challenge focuses on learning a shared representation space for multiple modalities where the similarity measurement can reflect the semantic closeness.\nThe multiplicity of correspondences further escalates the challenge since all the possible matches should be ranked ahead of the negatives. Probabilistic embeddings are proposed to handle the multiplicity while suffering from similarity miscalibration. To address it, we propose to calibrate the similarity for probabilistic embeddings. The key idea is to estimate the density ratio between the distributions of the two modalities, and use it to calibrate the similarity measurement in the embedding space. To the best of our knowledge, we are the first to study the miscalibration in probabilistic embeddings. \nIn addition, we further evaluate three pre-training tasks of language models, \nwhich is important for cross-modal but seldom investigated in previous studies. \nExtensive experiments as well as ablation studies on two benchmarks demonstrate its superior performance in tackling the multiplicity of cross-modal retrieval.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fengchun Qiao;Xi Peng", "authorids": "~Fengchun_Qiao1;~Xi_Peng1", "gender": ";Not Specified", "homepage": "https://joffery.github.io/joffery/;https://deep-real.github.io/dr_xipeng.html", "dblp": "215/3373;149/7762-5", "google_scholar": "BY6zd_0AAAAJ;DWw4v0kAAAAJ", "orcid": "0000-0003-2714-2036;0000-0002-7772-001X", "linkedin": "fengchun-qiao-9148ba157/;xi-peng-74b540b6/", "or_profile": "~Fengchun_Qiao1;~Xi_Peng1", "aff": "University of Delaware;University of Delaware", "aff_domain": "udel.edu;udel.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nqiao2022calibrating,\ntitle={Calibrating Probabilistic Embeddings for Cross-Modal Retrieval},\nauthor={Fengchun Qiao and Xi Peng},\nyear={2022},\nurl={https://openreview.net/forum?id=bUi8963hi5l}\n}", "github": "", "project": "", "reviewers": "kZT6;MjEx;8QeY", "site": "https://openreview.net/forum?id=bUi8963hi5l", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;4;4", "correctness": "3;4;3", "technical_novelty": "1;3;3", "empirical_novelty": "2;4;3", "wc_summary_paper": "94;46;59", "wc_summary_review": "48;74;101", "wc_main_review": "270;366;241", "wc_review": "412;486;401", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 66.33333333333333, 20.27039439401436 ], "wc_summary_review_avg": [ 74.33333333333333, 21.63844315615664 ], "wc_main_review_avg": [ 292.3333333333333, 53.41868170926306 ], "wc_review_avg": [ 433.0, 37.74475681027322 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:Dy7F7vr18kkJ:scholar.google.com/&scioq=Calibrating+Probabilistic+Embeddings+for+Cross-Modal+Retrieval&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Delaware", "aff_unique_dep": "", "aff_unique_url": "https://www.udel.edu", "aff_unique_abbr": "UD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "bVT5w39X0a", "title": "Bayesian Relational Generative Model for Scalable Multi-modal Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The study of complex systems requires the integration of multiple heterogeneous and high-dimensional data types (e.g. multi-omics). However, previous generative approaches for multi-modal inputs suffer from two shortcomings. First, they are not stochastic processes, leading to poor uncertainty estimations over their predictions. This is mostly due to the computationally intensive nature of traditional stochastic processes, such as Gaussian Processes (GPs), that makes their applicability limited in multi-modal learning frameworks. Second, they are not able to effectively approximate the joint posterior distribution of multi-modal data types with various missing patterns. More precisely, their model assumptions result in miscalibrated precisions and/or computational cost of sub-sampling procedure. In this paper, we propose a class of stochastic processes that learns a graph of dependencies between samples across multi-modal data types through adopting priors over the relational structure of the given data modalities. The dependency graph in our method, multi-modal Relational Neural Process (mRNP), not only posits distributions over the functions and naturally enables rapid adaptation to new observations by its predictive distribution, but also makes mRNP scalable to large datasets through mini-batch optimization. We also introduce mixture-of-graphs (MoG) in our model construction and show that it can address the aforementioned limitations in joint posterior approximation. 
Experiments on both toy regression and classification tasks using real-world datasets demonstrate the potential of mRNP for offering higher prediction accuracies as well as more robust uncertainty estimates compared to existing baselines and state-of-the-art methods.", "keywords": "Multi-modal Learning;Bayesian Learning;Neural Processes;Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Ehsan Hajiramezanali;Talip Ucar;Lindsay Edwards", "authorids": "~Ehsan_Hajiramezanali1;~Talip_Ucar2;~Lindsay_Edwards1", "gender": "M;;M", "homepage": "http://ehsanhajiramezanali.github.io/;;", "dblp": "225/3486;;", "google_scholar": "20I_DMoAAAAJ;;jWcQDOsAAAAJ", "orcid": ";;", "linkedin": "ehsan-hajiramezanali-978a3b52/;;lindsay-edwards-7a7268a7/", "or_profile": "~Ehsan_Hajiramezanali1;~Talip_Ucar2;~Lindsay_Edwards1", "aff": "AstraZeneca;;Relation Therapeutics", "aff_domain": "astrazeneca.com;;relationrx.com", "position": "AI Research Scientist;;Principal Researcher", "bibtex": "@misc{\nhajiramezanali2022bayesian,\ntitle={Bayesian Relational Generative Model for Scalable Multi-modal Learning},\nauthor={Ehsan Hajiramezanali and Talip Ucar and Lindsay Edwards},\nyear={2022},\nurl={https://openreview.net/forum?id=bVT5w39X0a}\n}", "github": "", "project": "", "reviewers": "Vnks;bvzL;fVe5;VW19", "site": "https://openreview.net/forum?id=bVT5w39X0a", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;4;5", "correctness": "3;3;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "0;0;3;2", "wc_summary_paper": "26;86;99;56", "wc_summary_review": "14;56;61;8", "wc_main_review": "277;528;688;51", "wc_review": "317;670;848;115", "wc_reply_reviewers": "59;0;0;0", "wc_reply_authors": "510;1274;985;105", "reply_reviewers": "1;0;0;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 66.75, 28.225653225390552 ], "wc_summary_review_avg": [ 34.75, 23.909987452945266 ], "wc_main_review_avg": [ 386.0, 242.62831656671898 ], "wc_review_avg": [ 487.5, 287.7034063058691 ], "wc_reply_reviewers_avg": [ 14.75, 25.54774941164094 ], "wc_reply_authors_avg": [ 718.5, 447.06179662324087 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:x2Ez9qECnIkJ:scholar.google.com/&scioq=Bayesian+Relational+Generative+Model+for+Scalable+Multi-modal+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "AstraZeneca;Relation Therapeutics", "aff_unique_dep": ";", "aff_unique_url": "https://www.astrazeneca.com;", "aff_unique_abbr": "AZ;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom;" }, { "id": "bVkRc9NDHcK", "title": "Variable Length Variable Quality Audio Steganography", "track": "main", "status": "Reject", "tldr": "", "abstract": "Steganography is the task of hiding and recovering secret data inside a non-secret container data while making imperceptible changes to the container. 
When using steganography to hide audio inside an image, current approaches neither allow the encoding of a signal with variable length nor allow making a trade-off between secret data reconstruction quality and imperceptibility in the changes made to the container image. To address this problem, we propose VLVQ (Variable Length Variable Quality Audio Steganography), a deep learning based steganographic framework capable of hiding variable-length audio inside an image by training the network to iteratively encode and decode the audio data from the container image. Complementary to the standard reconstruction loss, we propose an optional conditional loss term that allows the users to make quality trade-offs between audio and image reconstruction on inference time, without needing to train a separate model for each trade-off setups. Our experiments on ImageNet and AudioSet demonstrate VLVQ\u2019s ability to retain reasonable image quality (28.99 $psnr$) and audio reconstruction quality (23.79 $snrseg$) while encoding 19 seconds of audio. We also show VLVQ\u2019s capability to generalize to signals longer than what is seen during training.", "keywords": "computer vision;stegaography;recurrent neural network;loss conditional training;information hiding", "primary_area": "", "supplementary_material": "/attachment/cb8e49069e2ab8051e838caf5f4e21a5d812ac51.zip", "author": "Seungmo Ku", "authorids": "~Seungmo_Ku1", "gender": "Not Specified", "homepage": "https://example.com", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Seungmo_Ku1", "aff": "(no institution, private researcher)", "aff_domain": "protonmail.com", "position": "Researcher", "bibtex": "@misc{\nku2022variable,\ntitle={Variable Length Variable Quality Audio Steganography},\nauthor={Seungmo Ku},\nyear={2022},\nurl={https://openreview.net/forum?id=bVkRc9NDHcK}\n}", "github": "", "project": "", "reviewers": "c5tq;gihE;zNiL;jZgM", "site": "https://openreview.net/forum?id=bVkRc9NDHcK", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;2;3;4", "correctness": "3;2;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "75;52;94;84", "wc_summary_review": "144;37;39;32", "wc_main_review": "588;320;367;142", "wc_review": "807;409;500;258", "wc_reply_reviewers": "0;41;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;1;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.25, 15.530212490497354 ], "wc_summary_review_avg": [ 63.0, 46.834816109385976 ], "wc_main_review_avg": [ 354.25, 158.92195411584896 ], "wc_review_avg": [ 493.5, 200.57729183534212 ], "wc_reply_reviewers_avg": [ 10.25, 17.75352077758099 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.7745966692414834, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a2nGNIkwfB8J:scholar.google.com/&scioq=Variable+Length+Variable+Quality+Audio+Steganography&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Private Researcher", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "" }, { 
"title": "Large Language Models Can Be Strong Differentially Private Learners", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6894", "id": "bVuP3ltATMz", "poster": "", "openreview": "https://openreview.net/forum?id=bVuP3ltATMz", "slides": "https://iclr.cc/virtual/2022/poster/6894", "video": "https://iclr.cc/virtual/2022/poster/6894", "author_site": "Xuechen Li, Florian Tramer, Percy Liang, Tatsunori Hashimoto", "tldr": "", "abstract": "Differentially Private (DP) learning has seen limited success for building large deep learning models of text, and straightforward attempts at applying Differentially Private Stochastic Gradient Descent (DP-SGD) to NLP tasks have resulted in large performance drops and high computational overhead.\nWe show that this performance drop can be mitigated with (1) the use of large pretrained language models; (2) non-standard hyperparameters that suit DP optimization; and (3) fine-tuning objectives which are aligned with the pretraining procedure.\nWith the above, we obtain NLP models that outperform state-of-the-art DP-trained models under the same privacy budget and strong non-private baselines---by directly fine-tuning pretrained models with DP optimization on moderately-sized corpora. \nTo address the computational challenge of running DP-SGD with large Transformers, we propose a memory saving technique that allows clipping in DP-SGD to run without instantiating per-example gradients for any linear layer in the model. \nThe technique enables privately training Transformers with almost the same memory cost as non-private training at a modest run-time overhead. \nContrary to conventional wisdom that DP optimization fails at learning high-dimensional models (due to noise that scales with dimension) empirical results reveal that private learning with pretrained language models tends to not suffer from dimension-dependent performance degradation.\nCode to reproduce results can be found at https://github.com/lxuechen/private-transformers.\n", "keywords": "language model;differential privacy;language generation;fine-tuning;NLP", "primary_area": "", "supplementary_material": "", "author": "Xuechen Li;Florian Tramer;Percy Liang;Tatsunori Hashimoto", "authorids": "~Xuechen_Li1;~Florian_Tramer1;~Percy_Liang1;~Tatsunori_Hashimoto1", "gender": "M;M;;M", "homepage": "https://www.lxuechen.com/;http://floriantramer.com;https://cs.stanford.edu/~pliang/;https://thashim.github.io", "dblp": ";158/7224;04/1701;", "google_scholar": "GaYmpIgAAAAJ;https://scholar.google.ch/citations?user=ijH0-a8AAAAJ;pouyVyUAAAAJ;5ygiTwsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xuechen_Li1;~Florian_Tramer1;~Percy_Liang1;~Tatsunori_Hashimoto1", "aff": "Computer Science Department, Stanford University;Google;Stanford University;Stanford University", "aff_domain": "cs.stanford.edu;google.com;stanford.edu;stanford.edu", "position": "PhD student;Visiting Researcher;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2022large,\ntitle={Large Language Models Can Be Strong Differentially Private Learners},\nauthor={Xuechen Li and Florian Tramer and Percy Liang and Tatsunori Hashimoto},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bVuP3ltATMz}\n}", "github": "", "project": "", "reviewers": "E6VG;uqBn;a9ro;XPKW", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;3;5", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;4", "empirical_novelty": "4;3;3;3", 
"wc_summary_paper": "77;87;102;286", "wc_summary_review": "19;15;77;69", "wc_main_review": "327;306;177;228", "wc_review": "423;408;356;583", "wc_reply_reviewers": "0;57;0;0", "wc_reply_authors": "1357;1471;211;1025", "reply_reviewers": "0;1;0;0", "reply_authors": "3;4;1;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 138.0, 85.90983645660141 ], "wc_summary_review_avg": [ 45.0, 28.178005607210743 ], "wc_main_review_avg": [ 259.5, 60.24325688406961 ], "wc_review_avg": [ 442.5, 84.8425011418216 ], "wc_reply_reviewers_avg": [ 14.25, 24.681724007856502 ], "wc_reply_authors_avg": [ 1016.0, 492.8011769466465 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 415, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12835205672391916982&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=bVuP3ltATMz", "email": "cs.stanford.edu;google.com;stanford.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": "Computer Science Department;Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "DIVA: Dataset Derivative of a Learning Task", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7196", "id": "bVvMOtLMiw", "poster": "", "openreview": "https://openreview.net/forum?id=bVvMOtLMiw", "slides": "https://iclr.cc/virtual/2022/poster/7196", "video": "https://iclr.cc/virtual/2022/poster/7196", "author_site": "Yonatan Dukler, Alessandro Achille, Giovanni Paolini, Avinash Ravichandran, Marzia Polito, Stefano Soatto", "tldr": "", "abstract": "We present a method to compute the derivative of a learning task with respect to a dataset. A learning task is a function from a training set to the validation error, which can be represented by a trained deep neural network (DNN). The ``dataset derivative'' is a linear operator, computed around the trained model, that informs how perturbations of the weight of each training sample affect the validation error, usually computed on a separate validation dataset. Our method, DIVA (Differentiable Validation) hinges on a closed-form differentiable expression of the leave-one-out cross-validation error around a pre-trained DNN. Such expression constitutes the dataset derivative. DIVA could be used for dataset auto-curation, for example removing samples with faulty annotations, augmenting a dataset with additional relevant samples, or rebalancing. More generally, DIVA can be used to optimize the dataset, along with the parameters of the model, as part of the training process without the need for a separate validation dataset, unlike bi-level optimization methods customary in AutoML. 
To illustrate the flexibility of DIVA, we report experiments on sample auto-curation tasks such as outlier rejection, dataset extension, and automatic aggregation of multi-modal data.", "keywords": "Leave one out cross validation;AutoML;dataset optimization", "primary_area": "", "supplementary_material": "/attachment/d872c8d7c427b934714a7377c281e34e34159490.zip", "author": "Yonatan Dukler;Alessandro Achille;Giovanni Paolini;Avinash Ravichandran;Marzia Polito;Stefano Soatto", "authorids": "~Yonatan_Dukler1;~Alessandro_Achille1;~Giovanni_Paolini1;~Avinash_Ravichandran1;~Marzia_Polito1;~Stefano_Soatto3", "gender": "M;M;M;M;F;", "homepage": "https://dukleryoni.github.io/;;http://giovannipaolini.org;;;https://www.cs.ucla.edu/~soatto", "dblp": "242/3844;190/7328;150/6260;90/4314;86/750;08/1262", "google_scholar": ";;https://scholar.google.it/citations?user=xGI18C0AAAAJ;28p_eLYAAAAJ;8qsuHEoAAAAJ;lH1PdF8AAAAJ", "orcid": ";;0000-0002-3964-9101;;;0000-0003-2902-6362", "linkedin": ";;g-paolini/;;marzia-polito-89b0a33/;stefano-soatto-5765aa6/", "or_profile": "~Yonatan_Dukler1;~Alessandro_Achille1;~Giovanni_Paolini1;~Avinash_Ravichandran1;~Marzia_Polito1;~Stefano_Soatto2", "aff": "AWS AI Labs;California Institute of Technology;Amazon;Amazon;Amazon;UCLA Computer Science Department, University of California, Los Angeles", "aff_domain": "amazon.com;caltech.edu;amazon.com;amazon.com;amazon.com;cs.ucla.edu", "position": "Researcher;Postdoc;Applied Scientist;Research Scientist;Senior Manager, Applied Science;Professor", "bibtex": "@inproceedings{\ndukler2022diva,\ntitle={{DIVA}: Dataset Derivative of a Learning Task},\nauthor={Yonatan Dukler and Alessandro Achille and Giovanni Paolini and Avinash Ravichandran and Marzia Polito and Stefano Soatto},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bVvMOtLMiw}\n}", "github": "", "project": "", "reviewers": "2K7t;9tXK;3qBW;7aEq", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;3;4;3", "correctness": "4;3;4;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;2;4;4", "wc_summary_paper": "102;88;56;77", "wc_summary_review": "96;77;77;32", "wc_main_review": "802;292;732;297", "wc_review": "1000;457;865;406", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1048;514;836;512", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 80.75, 16.813313177360374 ], "wc_summary_review_avg": [ 70.5, 23.542514733987108 ], "wc_main_review_avg": [ 530.75, 237.5493369807628 ], "wc_review_avg": [ 682.0, 255.64330619048096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 727.5, 227.2196074285844 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": -0.19245008972987526, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8357861389764032292&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=bVvMOtLMiw", "email": "amazon.com;caltech.edu;amazon.com;amazon.com;amazon.com;cs.ucla.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "Amazon;California Institute of 
Technology;University of California, Los Angeles", "aff_unique_dep": "AWS AI Labs;;Computer Science Department", "aff_unique_url": "https://aws.amazon.com;https://www.caltech.edu;https://www.ucla.edu", "aff_unique_abbr": "AWS;Caltech;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Pasadena;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online Adversarial Attacks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6381", "id": "bYGSzbCM_i", "poster": "", "openreview": "https://openreview.net/forum?id=bYGSzbCM_i", "slides": "https://iclr.cc/virtual/2022/poster/6381", "video": "https://iclr.cc/virtual/2022/poster/6381", "author_site": "Andjela Mladenovic, Joey Bose, Hugo Berard, William Hamilton, Simon Lacoste-Julien, Pascal Vincent, Gauthier Gidel", "tldr": "", "abstract": "Adversarial attacks expose important vulnerabilities of deep learning models, yet little attention has been paid to settings where data arrives as a stream. In this paper, we formalize the online adversarial attack problem, emphasizing two key elements found in real-world use-cases: attackers must operate under partial knowledge of the target model, and the decisions made by the attacker are irrevocable since they operate on a transient data stream. We first rigorously analyze a deterministic variant of the online threat model by drawing parallels to the well-studied $k$-secretary problem in theoretical computer science and propose Virtual+, a simple yet practical online algorithm. Our main theoretical result shows Virtual+ yields provably the best competitive ratio over all single-threshold algorithms for $k<5$---extending the previous analysis of the $k$-secretary problem. We also introduce the \\textit{stochastic $k$-secretary}---effectively reducing online blackbox transfer attacks to a $k$-secretary problem under noise---and prove theoretical bounds on the performance of Virtual+ adapted to this setting. Finally, we complement our theoretical results by conducting experiments on MNIST, CIFAR-10, and Imagenet classifiers, revealing the necessity of online algorithms in achieving near-optimal performance and also the rich interplay between attack strategies and online attack selection, enabling simple strategies like FGSM to outperform stronger adversaries.", "keywords": "Online Algorithms;Adversarial Attacks", "primary_area": "", "supplementary_material": "/attachment/6c0a2e919f8a5680a3c89a2eff430d7dfa586422.zip", "author": "Andjela Mladenovic;Joey Bose;Hugo Berard;William L. 
Hamilton;Simon Lacoste-Julien;Pascal Vincent;Gauthier Gidel", "authorids": "~Andjela_Mladenovic1;~Joey_Bose1;~Hugo_Berard2;~William_L._Hamilton1;~Simon_Lacoste-Julien1;~Pascal_Vincent1;~Gauthier_Gidel1", "gender": "F;M;M;;M;M;M", "homepage": ";https://joeybose.github.io/;;;http://www.iro.umontreal.ca/~slacoste/;http://www.iro.umontreal.ca/~vincentp;https://gauthiergidel.github.io/", "dblp": ";174/3372;205/3145;137/3314;94/446.html;43/861;188/6326", "google_scholar": "ALrei20AAAAJ;ybPyI7IAAAAJ;P5d_140AAAAJ;;oejm5IUAAAAJ;WBCKQMsAAAAJ;https://scholar.google.fr/citations?user=bDrXQPUAAAAJ", "orcid": ";;;;0000-0001-6485-6180;;", "linkedin": ";;;;simon-lacoste-julien-355b9a3;;", "or_profile": "~Andjela_Mladenovic1;~Joey_Bose1;~Hugo_Berard2;~William_L._Hamilton1;~Simon_Lacoste-Julien1;~Pascal_Vincent1;~Gauthier_Gidel1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;McGill University and Mila;University of Montreal, University of Montreal;McGill University;Samsung - SAIT AI Lab, Montreal;Facebook A.I. Research;Mila - Quebec Artificial Intelligence Institute", "aff_domain": "mila.umontreal.ca;mcgill.ca;iro.umontreal.ca;mcgill.ca;samsung.com;fb.com;mila.quebec", "position": "PhD student;PhD student;PhD student;Assistant Professor;VP Lab Director;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nmladenovic2022online,\ntitle={Online Adversarial Attacks},\nauthor={Andjela Mladenovic and Joey Bose and Hugo Berard and William L. Hamilton and Simon Lacoste-Julien and Pascal Vincent and Gauthier Gidel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bYGSzbCM_i}\n}", "github": "", "project": "", "reviewers": "XV1X;4rG9;VskD;82UN", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;3;4", "correctness": "3;2;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "41;180;73;65", "wc_summary_review": "33;27;133;54", "wc_main_review": "454;342;217;230", "wc_review": "528;549;423;349", "wc_reply_reviewers": "140;0;0;0", "wc_reply_authors": "1974;1096;1083;1253", "reply_reviewers": "1;0;0;0", "reply_authors": "6;4;4;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 89.75, 53.41991669780102 ], "wc_summary_review_avg": [ 61.75, 42.34014052881733 ], "wc_main_review_avg": [ 310.75, 95.92542676475304 ], "wc_review_avg": [ 462.25, 80.95484852681771 ], "wc_reply_reviewers_avg": [ 35.0, 60.6217782649107 ], "wc_reply_authors_avg": [ 1351.5, 365.5752316555376 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.25, 1.0897247358851685 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10843150111517715745&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=bYGSzbCM_i", "email": "mila.umontreal.ca;mcgill.ca;iro.umontreal.ca;mcgill.ca;samsung.com;fb.com;mila.quebec", "author_num": 7, "aff_unique_index": "0;1;0;1;2;3;4", "aff_unique_norm": "University of Montreal;McGill University;Samsung;Meta;Quebec Artificial Intelligence Institute", "aff_unique_dep": "Montreal Institute for Learning 
Algorithms;;SAIT AI Lab;Facebook A.I. Research;Artificial Intelligence", "aff_unique_url": "https://www.umontreal.ca;https://www.mcgill.ca;https://www.samsung.com;https://research.facebook.com;https://mila.quebec", "aff_unique_abbr": "UM;McGill;Samsung;FAIR;Mila", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "Canada;United States" }, { "id": "bYfk8y7BXS", "title": "Pessimistic Model Selection for Offline Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Reinforcement Learning (DRL) has demonstrated great potential in solving sequential decision making problems in many applications. Despite its promising performance, practical gaps exist when deploying DRL in real-world scenarios. One main barrier is the over-fitting issue that leads to poor generalizability of the policy learned by DRL. In particular, for offline DRL with observational data, model selection is a challenging task as there is no ground truth available for performance demonstration, in contrast with the online setting with simulated environments. In this work, we propose a pessimistic model selection (PMS) approach for offline DRL with a theoretical guarantee, which features a tuning-free framework for finding the best policy among a set of candidate models. Two refined approaches are also proposed to address the potential bias of the DRL model in identifying the optimal policy. Numerical studies demonstrated the superior performance of our approach over existing methods.", "keywords": "reinforcement learning theory;offline deep reinforcement learning;model selection;pessimism;tuning free", "primary_area": "", "supplementary_material": "", "author": "Chao-Han Huck Yang;Zhengling Qi;Yifan Cui;Pin-Yu Chen", "authorids": "~Chao-Han_Huck_Yang1;~Zhengling_Qi1;~Yifan_Cui1;~Pin-Yu_Chen1", "gender": "M;;M;M", "homepage": "https://huckiyang.github.io/;https://sites.google.com/view/statsqizl/home?authuser=0;https://sites.google.com/view/yifancui;http://www.pinyuchen.com", "dblp": "230/4012;173/0201;227/3562-1;39/8969", "google_scholar": "TT3XJW8AAAAJ;;;jxwlCUUAAAAJ", "orcid": "0000-0003-2879-8811;;;0000-0003-1039-8369", "linkedin": ";;;pin-yu-chen-940062a2", "or_profile": "~Chao-Han_Huck_Yang1;~Zhengling_Qi1;~Yifan_Cui1;~Pin-Yu_Chen1", "aff": "Georgia Institute of Technology;George Washington University;National University of Singapore;International Business Machines", "aff_domain": "gatech.edu;gwu.edu;nus.edu.sg;ibm.com", "position": "PhD student;Assistant Professor;Assistant Professor;Research Staff Member", "bibtex": "@misc{\nyang2022pessimistic,\ntitle={Pessimistic Model Selection for Offline Deep Reinforcement Learning},\nauthor={Chao-Han Huck Yang and Zhengling Qi and Yifan Cui and Pin-Yu Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=bYfk8y7BXS}\n}", "github": "", "project": "", "reviewers": "Y9ri;Sxiw;Wrcc;MuG4", "site": "https://openreview.net/forum?id=bYfk8y7BXS", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "50;54;70;79", "wc_summary_review": "12;64;48;53", "wc_main_review": "456;528;223;284", "wc_review": "518;646;341;416", "wc_reply_reviewers": "0;88;145;0", "wc_reply_authors": "1001;494;905;181", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 
], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 63.25, 11.776565713313877 ], "wc_summary_review_avg": [ 44.25, 19.49839737004044 ], "wc_main_review_avg": [ 372.75, 123.82926754204759 ], "wc_review_avg": [ 480.25, 114.4735231396326 ], "wc_reply_reviewers_avg": [ 58.25, 61.63754943214404 ], "wc_reply_authors_avg": [ 645.25, 328.7980953411987 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5633105735638276872&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Georgia Institute of Technology;George Washington University;National University of Singapore;International Business Machines Corporation", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.gatech.edu;https://www.gwu.edu;https://www.nus.edu.sg;https://www.ibm.com", "aff_unique_abbr": "Georgia Tech;GWU;NUS;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Singapore" }, { "title": "A NON-PARAMETRIC REGRESSION VIEWPOINT : GENERALIZATION OF OVERPARAMETRIZED DEEP RELU NETWORK UNDER NOISY OBSERVATIONS", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6935", "id": "bZJbzaj_IlP", "poster": "", "openreview": "https://openreview.net/forum?id=bZJbzaj_IlP", "slides": "https://iclr.cc/virtual/2022/poster/6935", "video": "https://iclr.cc/virtual/2022/poster/6935", "author_site": "Namjoon Suh, Hyunouk Ko, Xiaoming Huo", "tldr": "", "abstract": "We study the generalization properties of the overparameterized deep neural network (DNN) with Rectified Linear Unit (ReLU) activations.\nUnder the non-parametric regression framework, it is assumed that the ground-truth function is from a reproducing kernel Hilbert space (RKHS) induced by a neural tangent kernel (NTK) of ReLU DNN, and a dataset is given with the noises. Without a delicate adoption of early stopping, we prove that the overparametrized DNN trained by vanilla gradient descent does not recover the ground-truth function. It turns out that the estimated DNN's $L_{2}$ prediction error is bounded away from $0$. As a complement of the above result, we show that the $\\ell_{2}$-regularized gradient descent enables the overparametrized DNN achieve the minimax optimal convergence rate of the $L_{2}$ prediction error, without early stopping. 
Notably, the rate we obtained is faster than $\\mathcal{O}(n^{-1/2})$ known in the literature.", "keywords": "Overparametrized Deep Neural Network;Neural Tangent Kernel;Minimax;Non-parametric regression", "primary_area": "", "supplementary_material": "", "author": "Namjoon Suh;Hyunouk Ko;Xiaoming Huo", "authorids": "~Namjoon_Suh1;~Hyunouk_Ko1;~Xiaoming_Huo1", "gender": "M;M;M", "homepage": "https://sites.google.com/site/namjoonsuh/;https://www.isye.gatech.edu/users/hyunouk-ko;https://www.isye.gatech.edu/users/xiaoming-huo", "dblp": "254/9575;;67/3392", "google_scholar": ";;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-0101-1206", "linkedin": ";;xiaoming-huo-9653374/", "or_profile": "~Namjoon_Suh1;~Hyunouk_Ko1;~Xiaoming_Huo1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nsuh2022a,\ntitle={A {NON}-{PARAMETRIC} {REGRESSION} {VIEWPOINT} : {GENERALIZATION} {OF} {OVERPARAMETRIZED} {DEEP} {RELU} {NETWORK} {UNDER} {NOISY} {OBSERVATIONS}},\nauthor={Namjoon Suh and Hyunouk Ko and Xiaoming Huo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bZJbzaj_IlP}\n}", "github": "", "project": "", "reviewers": "XydK;ytWV;eixk;FGhp", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;0;3;0", "wc_summary_paper": "61;142;176;56", "wc_summary_review": "40;109;17;59", "wc_main_review": "258;576;215;101", "wc_review": "359;827;408;216", "wc_reply_reviewers": "43;71;8;19", "wc_reply_authors": "543;1671;434;589", "reply_reviewers": "1;2;1;1", "reply_authors": "1;3;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 108.75, 51.6980415489794 ], "wc_summary_review_avg": [ 56.25, 33.89229263416684 ], "wc_main_review_avg": [ 287.5, 176.1682434492664 ], "wc_review_avg": [ 452.5, 227.43405637678805 ], "wc_reply_reviewers_avg": [ 35.25, 24.21130934088448 ], "wc_reply_authors_avg": [ 809.25, 500.7056895023263 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9783854103180879759&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=bZJbzaj_IlP", "email": "gatech.edu;gatech.edu;gatech.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "ba81PoR_k1p", "title": "One for Many: an Instagram inspired black-box adversarial attack", "track": "main", "status": "Reject", "tldr": "", "abstract": "It is well known that deep learning models are susceptible to adversarial attacks. 
To produce more robust and effective attacks, we propose a nested evolutionary algorithm able to produce multi-network (decision-based) black-box adversarial attacks based on Instagram inspired image filters. Due to the multi-network training, the system reaches a high transferability rate of attacks and, due to the composition of image filters, it is able to bypass standard detection mechanisms. Moreover, this kind of attack is semantically robust: our filter composition cannot be distinguished from any other filter composition used extensively every day to enhance images; this raises new security issues and challenges for real-world systems. Experimental results demonstrate that the method is also effective against\nensemble-adversarially trained models and it has a low cost in terms of queries to the victim model.", "keywords": "black-box adversarial attacks;instagram-based image filters;evolutionary algorithm;multi-network attacks", "primary_area": "", "supplementary_material": "", "author": "Alina Elena Baia;Alfredo Milani;Valentina Poggioni", "authorids": "baia_alinna@yahoo.com;alfredo.milani@unipg.it;~Valentina_Poggioni1", "gender": ";;F", "homepage": ";;", "dblp": ";;37/4605", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "baia_alinna@yahoo.com;alfredo.milani@unipg.it;~Valentina_Poggioni1", "aff": ";;University of Perugia", "aff_domain": ";;unipg.it", "position": ";;Researcher", "bibtex": "@misc{\nbaia2022one,\ntitle={One for Many: an Instagram inspired black-box adversarial attack},\nauthor={Alina Elena Baia and Alfredo Milani and Valentina Poggioni},\nyear={2022},\nurl={https://openreview.net/forum?id=ba81PoR_k1p}\n}", "github": "", "project": "", "reviewers": "eN8o;chwU;1MTN;Cohx;PWpF", "site": "https://openreview.net/forum?id=ba81PoR_k1p", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "4;4;4;4;2", "correctness": "3;3;3;4;3", "technical_novelty": "2;2;3;3;2", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "69;49;48;42;84", "wc_summary_review": "27;24;27;58;34", "wc_main_review": "303;279;455;103;125", "wc_review": "399;352;530;203;243", "wc_reply_reviewers": "0;0;0;19;25", "wc_reply_authors": "682;1144;1402;219;253", "reply_reviewers": "0;0;0;1;1", "reply_authors": "1;2;2;1;2", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 58.4, 15.704776343520464 ], "wc_summary_review_avg": [ 34.0, 12.441864811996632 ], "wc_main_review_avg": [ 253.0, 128.73538752029296 ], "wc_review_avg": [ 345.4, 116.41065243352948 ], "wc_reply_reviewers_avg": [ 8.8, 10.943491216243562 ], "wc_reply_authors_avg": [ 740.0, 471.89702266490303 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.6, 0.4898979485566356 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.51604684654214, "corr_recommendation_correctness": 0.5160468465421401, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oe2-TJd5RygJ:scholar.google.com/&scioq=One+for+Many:+an+Instagram+inspired+black-box+adversarial+attack&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Perugia", "aff_unique_dep": "", "aff_unique_url": "https://www.unipg.it", "aff_unique_abbr": "Unipg", "aff_country_unique_index": "0", "aff_country_unique": "Italy" 
}, { "title": "Robust Unlearnable Examples: Protecting Data Privacy Against Adversarial Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6035", "id": "baUQQPwQiAg", "poster": "", "openreview": "https://openreview.net/forum?id=baUQQPwQiAg", "slides": "https://iclr.cc/virtual/2022/poster/6035", "video": "https://iclr.cc/virtual/2022/poster/6035", "author_site": "Shaopeng Fu, Fengxiang He, Yang Liu, Li Shen, Dacheng Tao", "tldr": "", "abstract": "The tremendous amount of accessible data in cyberspace face the risk of being unauthorized used for training deep learning models. To address this concern, methods are proposed to make data unlearnable for deep learning models by adding a type of error-minimizing noise. However, such conferred unlearnability is found fragile to adversarial training. In this paper, we design new methods to generate robust unlearnable examples that are protected from adversarial training. We first find that the vanilla error-minimizing noise, which suppresses the informative knowledge of data via minimizing the corresponding training loss, could not effectively minimize the adversarial training loss. This explains the vulnerability of error-minimizing noise in adversarial training. Based on the observation, robust error-minimizing noise is then introduced to reduce the adversarial training loss. Experiments show that the unlearnability brought by robust error-minimizing noise can effectively protect data from adversarial training in various scenarios. The code is available at \\url{https://github.com/fshp971/robust-unlearnable-examples}.", "keywords": "unlearnable examples;adversarial training;privacy", "primary_area": "", "supplementary_material": "", "author": "Shaopeng Fu;Fengxiang He;Yang Liu;Li Shen;Dacheng Tao", "authorids": "~Shaopeng_Fu1;~Fengxiang_He1;~Yang_Liu59;~Li_Shen1;~Dacheng_Tao1", "gender": ";;F;M;", "homepage": "https://shaopengfu.me;https://fengxianghe.github.io/;;https://sites.google.com/site/mathshenli/home;", "dblp": "278/8181;225/4682;;91/3680-8;", "google_scholar": "i7cWm4gAAAAJ;QSx-Yu0AAAAJ;JEieoFsAAAAJ;yVhgENIAAAAJ;", "orcid": ";;;;", "linkedin": ";fengxiang-he-35b173122;;;", "or_profile": "~Shaopeng_Fu1;~Fengxiang_He1;~Yang_Liu59;~Li_Shen1;~Dacheng_Tao1", "aff": "JD Explore Academy;JD.com, Inc.;Tsinghua University;JD Explore Academy;", "aff_domain": "jd.com;jd.com;tsinghua.edu.cn;jd.com;", "position": "Researcher;Algorithm Scientist;Associate Professor;Researcher;", "bibtex": "@inproceedings{\nfu2022robust,\ntitle={Robust Unlearnable Examples: Protecting Data Privacy Against Adversarial Learning},\nauthor={Shaopeng Fu and Fengxiang He and Yang Liu and Li Shen and Dacheng Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=baUQQPwQiAg}\n}", "github": "", "project": "", "reviewers": "ntwj;XwVk;c7i6;z2Tr", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;5;5", "correctness": "2;4;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "115;119;63;39", "wc_summary_review": "48;51;73;60", "wc_main_review": "353;131;418;242", "wc_review": "516;301;554;341", "wc_reply_reviewers": "315;0;155;17", "wc_reply_authors": "1794;183;2416;541", "reply_reviewers": "1;0;2;1", "reply_authors": "3;1;6;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], 
"empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.0, 34.10278580995987 ], "wc_summary_review_avg": [ 58.0, 9.72111104761179 ], "wc_main_review_avg": [ 286.0, 109.40063985187655 ], "wc_review_avg": [ 428.0, 108.76350490858594 ], "wc_reply_reviewers_avg": [ 121.75, 126.73471308209129 ], "wc_reply_authors_avg": [ 1233.5, 907.6856559404252 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.0, 1.8708286933869707 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.37998029782867415, "corr_recommendation_correctness": 0.8866206949335731, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12845600220253091559&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=baUQQPwQiAg", "email": "jd.com;jd.com;tsinghua.edu.cn;jd.com;", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "JD;JD.com;Tsinghua University", "aff_unique_dep": "JD Explore Academy;;", "aff_unique_url": ";https://www.jd.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": ";JD.com;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";China" }, { "id": "beUek8ku1Q", "title": "k-Median Clustering via Metric Embedding: Towards Better Initialization with Privacy", "track": "main", "status": "Reject", "tldr": "", "abstract": "In clustering algorithms, the choice of initial centers is crucial for the quality of the learned clusters. We propose a new initialization scheme for the $k$-median problem in the general metric space (e.g., discrete space induced by graphs), based on the construction of metric embedding tree structure of the data. From the tree, we can extract good initial centers that can be used subsequently for the local search algorithm. Our method, named the HST initialization, can also be easily extended to the setting of differential privacy (DP) to generate private initial centers. Theoretically, the initial centers from HST initialization can achieve lower error than those from another popular initialization method, $k$-median++, in the non-DP setting. Moreover, with privacy constraint, we show that the error of applying DP local search followed by our private HST initialization improves previous results, and approaches the known lower bound within a small factor. 
Empirically, experiments are conducted to demonstrate the effectiveness of our methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenglin Fan;Ping Li;Xiaoyun Li", "authorids": "~Chenglin_Fan1;~Ping_Li3;~Xiaoyun_Li1", "gender": "M;M;M", "homepage": ";http://www.stat.rutgers.edu/home/pingli/;https://lixiaoyun0239.github.io/cv/", "dblp": "76/8243.html;62/5860-1;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chenglin_Fan1;~Ping_Li3;~Xiaoyun_Li1", "aff": "Research, Baidu;LinkedIn;Baidu", "aff_domain": "research.baidu.com;linkedin.com;baidu.com", "position": "Visiting Researcher;Engineer;Researcher", "bibtex": "@misc{\nfan2022kmedian,\ntitle={k-Median Clustering via Metric Embedding: Towards Better Initialization with Privacy},\nauthor={Chenglin Fan and Ping Li and Xiaoyun Li},\nyear={2022},\nurl={https://openreview.net/forum?id=beUek8ku1Q}\n}", "github": "", "project": "", "reviewers": "o7Pb;xvPV;3Abh;dRWX", "site": "https://openreview.net/forum?id=beUek8ku1Q", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;3", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "71;442;78;105", "wc_summary_review": "38;82;46;51", "wc_main_review": "154;438;284;326", "wc_review": "263;962;408;482", "wc_reply_reviewers": "0;66;13;0", "wc_reply_authors": "624;760;334;495", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 174.0, 155.24979871162475 ], "wc_summary_review_avg": [ 54.25, 16.67895380412093 ], "wc_main_review_avg": [ 300.5, 101.60093503506747 ], "wc_review_avg": [ 528.75, 262.247378442569 ], "wc_reply_reviewers_avg": [ 19.75, 27.224758952100935 ], "wc_reply_authors_avg": [ 553.25, 157.4918648692687 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11567119225203703744&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "id": "beiz51zcm-H", "title": "BO-DBA: Query-Efficient Decision-Based Adversarial Attacks via Bayesian Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Decision-based attacks (DBA), wherein attackers perturb inputs to spoof learning algorithms by observing solely the output labels, are a type of severe adversarial attacks against Deep Neural Networks (DNNs) that require minimal knowledge of attackers. Most existing DBA attacks rely on zeroth-order gradient estimation and require an excessive number ($>$20,000) of queries to converge. To better understand the attack, this paper presents an efficient DBA attack technique, namely BO-DBA, that greatly improves the query efficiency. 
We achieve this by introducing dimension reduction techniques and derivative-free optimization to the process of closest decision boundary search. In BO-DBA, we adopt the Gaussian process to model the distribution of decision boundary radius over a low-dimensional search space defined by perturbation generator functions. Bayesian Optimization is then leveraged to find the optimal direction. Experimental results on pre-trained ImageNet classifiers show that BO-DBA converges within 200 queries while the state-of-the-art DBA techniques using zeroth order optimization need over 15,000 queries to achieve the same level of perturbation distortion.", "keywords": "Adversarial Attack;Bayesian Optimization", "primary_area": "", "supplementary_material": "", "author": "ZHUOSHENG ZHANG;Shucheng Yu", "authorids": "~ZHUOSHENG_ZHANG2;~Shucheng_Yu1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": ";AwOsshkAAAAJ", "orcid": ";", "linkedin": "gnbenjamin/;", "or_profile": "~ZHUOSHENG_ZHANG2;~Shucheng_Yu1", "aff": "Stevens Institute of Technology;Stevens Institute of Technology", "aff_domain": "stevens.edu;stevens.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nzhang2022bodba,\ntitle={{BO}-{DBA}: Query-Efficient Decision-Based Adversarial Attacks via Bayesian Optimization},\nauthor={ZHUOSHENG ZHANG and Shucheng Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=beiz51zcm-H}\n}", "github": "", "project": "", "reviewers": "2rc9;FQ54;9VGY;i1Je", "site": "https://openreview.net/forum?id=beiz51zcm-H", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;5;3;3", "correctness": "2;4;2;4", "technical_novelty": "3;2;1;2", "empirical_novelty": "2;2;1;0", "wc_summary_paper": "53;60;31;67", "wc_summary_review": "26;26;11;21", "wc_main_review": "338;217;336;343", "wc_review": "417;303;378;431", "wc_reply_reviewers": "97;237;78;0", "wc_reply_authors": "377;389;364;416", "reply_reviewers": "1;1;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 52.75, 13.497684986693088 ], "wc_summary_review_avg": [ 21.0, 6.123724356957945 ], "wc_main_review_avg": [ 308.5, 52.88903478037768 ], "wc_review_avg": [ 382.25, 49.7060107029321 ], "wc_reply_reviewers_avg": [ 103.0, 85.47806736233571 ], "wc_reply_authors_avg": [ 386.5, 19.189841062395487 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9181993547529393338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Stevens Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.stevens.edu", "aff_unique_abbr": "SIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Efficient Online 3D Bin Packing on Packing Configuration Trees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6490", "id": "bfuGjlCwAq", "poster": "", "openreview": "https://openreview.net/forum?id=bfuGjlCwAq", "slides": 
"https://iclr.cc/virtual/2022/poster/6490", "video": "https://iclr.cc/virtual/2022/poster/6490", "author_site": "Hang Zhao, Yang Yu, Kai Xu", "tldr": "", "abstract": "Online 3D Bin Packing Problem (3D-BPP) has widespread applications in industrial automation and has aroused enthusiastic research interest recently. Existing methods usually solve the problem with limited resolution of spatial discretization, and/or cannot deal with complex practical constraints well. We propose to enhance the practical applicability of online 3D-BPP via learning on a novel hierarchical representation \u2013 packing configuration tree (PCT). PCT is a full-fledged description of the state and action space of bin packing which can support packing policy learning based on deep reinforcement learning (DRL). The size of the packing action space is proportional to the number of leaf nodes, making the DRL model easy to train and well-performing even with continuous solution space. During training, PCT expands based on heuristic rules, however, the DRL model learns a much more effective and robust packing policy than heuristic methods. Through extensive evaluation, we demonstrate that our method outperforms all existing online BPP methods and is versatile in terms of incorporating various practical constraints.", "keywords": "Bin Packing Problem;Online 3D-BPP;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/58c94c6f61b3dafa283a2fb41693fe9b5045a884.zip", "author": "Hang Zhao;Yang Yu;Kai Xu", "authorids": "~Hang_Zhao3;~Yang_Yu5;~Kai_Xu5", "gender": "M;M;M", "homepage": ";http://kevinkaixu.net/;http://www.lamda.nju.edu.cn/yuy", "dblp": ";Xu_0004:Kai;46/2181-1", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=GuVkg-8AAAAJ;PG2lDSwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hang_Zhao3;~Kevin_Xu1;~Yang_Yu2", "aff": "National University of Defense Technology;National University of Defense Technology;Nanjing University", "aff_domain": "nudt.edu.cn;nudt.edu.cn;nju.edu.cn", "position": "PhD student;Professor;Professor", "bibtex": "@inproceedings{\nzhao2022learning,\ntitle={Learning Efficient Online 3D Bin Packing on Packing Configuration Trees},\nauthor={Hang Zhao and Kai Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bfuGjlCwAq}\n}", "github": "", "project": "", "reviewers": "65kq;vAx8;MYY8;Dhzn", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;2;3;4", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;3;0", "wc_summary_paper": "55;46;140;52", "wc_summary_review": "16;62;63;123", "wc_main_review": "532;145;390;315", "wc_review": "603;253;593;490", "wc_reply_reviewers": "0;0;25;0", "wc_reply_authors": "2076;378;929;739", "reply_reviewers": "0;0;1;0", "reply_authors": "3;1;2;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.5 ], "wc_summary_paper_avg": [ 73.25, 38.674119253061214 ], "wc_summary_review_avg": [ 66.0, 37.993420483025744 ], "wc_main_review_avg": [ 345.5, 139.54658720298394 ], "wc_review_avg": [ 484.75, 140.92262948157048 ], "wc_reply_reviewers_avg": [ 6.25, 10.825317547305483 ], "wc_reply_authors_avg": [ 1030.5, 635.2363733288578 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": 
[ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.1266600992762247, "corr_recommendation_correctness": 0.5940885257860046, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17600632586661755809&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=bfuGjlCwAq", "email": "nudt.edu.cn;nudt.edu.cn;nju.edu.cn", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "National University of Defense Technology;Nanjing University", "aff_unique_dep": ";", "aff_unique_url": "http://www.nudt.edu.cn/;https://www.nju.edu.cn", "aff_unique_abbr": "NUDT;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "bgAS1ZvveZ", "title": "Faster Reinforcement Learning with Value Target Lower Bounding", "track": "main", "status": "Reject", "tldr": "", "abstract": "We show that an arbitrary lower bound of the optimal value function can be used to improve the Bellman value target during value learning. In the tabular case, value learning under the lower bounded Bellman operator converges to the same optimal value as under the original Bellman operator, at a potentially faster speed. In practice, discounted episodic return from the training experience or discounted goal return from hindsight relabeling can serve as the value lower bound when the environment is deterministic. This is because the empirical episodic return from any state can always be repeated through the same action sequence in a deterministic environment, and is thus a lower bound of the optimal value from that state. We experiment on Atari games, FetchEnv tasks and a challenging physically simulated car push and reach task. We show that in most cases, simply lower bounding with the discounted episodic return performs at least as well as common baselines such as TD3, SAC and Hindsight Experience Replay (HER). 
It learns much faster than TD3 or HER on some of the harder continuous control tasks, requiring minimal or no parameter tuning.", "keywords": "reinforcement learning;bellman value target;lower bound;discounted return", "primary_area": "", "supplementary_material": "", "author": "Le Zhao;Wei Xu", "authorids": "~Le_Zhao2;~Wei_Xu13", "gender": ";M", "homepage": ";", "dblp": ";", "google_scholar": ";Gxz1fqwAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Le_Zhao2;~Wei_Xu13", "aff": ";Horizon Robotics", "aff_domain": ";horizon.auto", "position": ";Researcher", "bibtex": "@misc{\nzhao2022faster,\ntitle={Faster Reinforcement Learning with Value Target Lower Bounding},\nauthor={Le Zhao and Wei Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=bgAS1ZvveZ}\n}", "github": "", "project": "", "reviewers": "MXsM;hHjM;e2TJ;cupU", "site": "https://openreview.net/forum?id=bgAS1ZvveZ", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "5;5;4;4", "correctness": "3;2;4;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "28;33;87;83", "wc_summary_review": "18;18;67;50", "wc_main_review": "124;96;553;649", "wc_review": "170;147;707;782", "wc_reply_reviewers": "362;119;107;214", "wc_reply_authors": "807;588;1023;797", "reply_reviewers": "1;2;3;1", "reply_authors": "1;2;3;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 57.75, 27.343874999714288 ], "wc_summary_review_avg": [ 38.25, 21.123150806638673 ], "wc_main_review_avg": [ 355.5, 248.03275993303788 ], "wc_review_avg": [ 451.5, 294.3097857700284 ], "wc_reply_reviewers_avg": [ 200.5, 102.04043316254591 ], "wc_reply_authors_avg": [ 803.75, 153.84631129799635 ], "reply_reviewers_avg": [ 1.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10931768291379180304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Horizon Robotics", "aff_unique_dep": "", "aff_unique_url": "https://www.horizon-robotics.com/", "aff_unique_abbr": "Horizon Robotics", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "bglU8l_Pq8Q", "title": "In defense of dual-encoders for neural ranking", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transformer-based models such as BERT have proven successful in the information retrieval problem, which seeks to identify relevant documents for a given query. There are two broad flavours of such models: cross-attention (CA) models, which learn a joint embedding for the query and document, and dual-encoder (DE) models, which learn separate embeddings for the query and document. Empirically, CA models are often found to be more accurate, which has motivated a series of works seeking to bridge this gap. However, a more fundamental question remains less explored: does this performance gap reflect an inherent limitation in the capacity of DE models, or a limitation in the training of such models? And does such an understanding suggest a principled means of improving DE models? In this paper, we study these questions, with three contributions. 
First, we establish theoretically that with a sufficiently large embedding dimension, DE models have the capacity to model a broad class of score distributions. Second, we show empirically that on real-world problems, DE models may overfit to spurious correlations in the training set, and thus under-perform on test samples. To mitigate this behaviour, we propose a suitable distillation strategy, and confirm its practical efficacy on the MSMARCO-Passage and Natural Questions benchmarks.", "keywords": "cross-attention;dual encoder;neural ranking;distillation", "primary_area": "", "supplementary_material": "", "author": "Aditya Krishna Menon;Sadeep Jayasumana;Seungyeon Kim;Ankit Singh Rawat;Sashank J. Reddi;Sanjiv Kumar", "authorids": "~Aditya_Krishna_Menon1;~Sadeep_Jayasumana1;~Seungyeon_Kim1;~Ankit_Singh_Rawat1;~Sashank_J._Reddi1;~Sanjiv_Kumar1", "gender": ";;;M;M;", "homepage": ";;https://www.seungyeon.ai;https://ankitsrawat.github.io/home/;;http://www.sanjivk.com/", "dblp": ";;74/7997-1.html;https://dblp.org/pers/hd/r/Rawat:Ankit_Singh;50/10452;", "google_scholar": ";;zbcN_QIAAAAJ;http://scholar.google.com/citations?user=U0_ab4cAAAAJ;70lgwYwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Aditya_Krishna_Menon1;~Sadeep_Jayasumana1;~Seungyeon_Kim1;~Ankit_Singh_Rawat1;~Sashank_J._Reddi1;~Sanjiv_Kumar1", "aff": ";;Google;Google;Google;Google", "aff_domain": ";;google.com;google.com;google.com;google.com", "position": ";;Researcher;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nmenon2022in,\ntitle={In defense of dual-encoders for neural ranking},\nauthor={Aditya Krishna Menon and Sadeep Jayasumana and Seungyeon Kim and Ankit Singh Rawat and Sashank J. Reddi and Sanjiv Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=bglU8l_Pq8Q}\n}", "github": "", "project": "", "reviewers": "4c6M;FgFj;oYj4;yXc2", "site": "https://openreview.net/forum?id=bglU8l_Pq8Q", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;3;2;4", "correctness": "4;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "85;76;197;133", "wc_summary_review": "54;49;58;14", "wc_main_review": "212;357;148;173", "wc_review": "351;482;403;320", "wc_reply_reviewers": "0;17;8;0", "wc_reply_authors": "524;745;238;342", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 122.75, 48.03319164910864 ], "wc_summary_review_avg": [ 43.75, 17.469616481193857 ], "wc_main_review_avg": [ 222.5, 80.9336147716139 ], "wc_review_avg": [ 389.0, 61.33922073192649 ], "wc_reply_reviewers_avg": [ 6.25, 7.013380069552769 ], "wc_reply_authors_avg": [ 462.25, 192.6841651511613 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7302967433402214, "corr_recommendation_correctness": -0.40824829046386296, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1752802016858903204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": 
"0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "bi9j5yi-Vrv", "title": "A General Theory of Relativity in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a new general theory measuring the relativity between two arbitrary Markov Decision Processes (MDPs) from the perspective of reinforcement learning (RL). Considering two MDPs, tasks such as policy transfer, dynamics modeling, environment design, and simulation to reality (sim2real), etc., are all closely related. The proposed theory deeply investigates the connection between any two cumulative expected returns defined on different policies and environment dynamics, and the theoretical results suggest two new general algorithms referred to as Relative Policy Optimization (RPO) and Relative Transition Optimization (RTO), which can offer fast policy transfer and dynamics modeling. RPO updates the policy using the \\emph{relative policy gradient} to transfer the policy evaluated in one environment to maximize the return in another, while RTO updates the parameterized dynamics model (if there exists) using the \\emph{relative transition gradient} to reduce the gap between the dynamics of the two environments. Then, integrating the two algorithms offers the complete algorithm Relative Policy-Transition Optimization (RPTO), in which the policy interacts with the two environments simultaneously, such that data collections from the two environments, policy and transition updates are all completed in a closed loop to form a principled learning framework for policy transfer. We demonstrate the effectiveness of RPO, RTO and RPTO in the OpenAI gym's classic control tasks by creating policy transfer problems.", "keywords": "Reinforcement Learning;General RL Theory;Policy Transfer;Dynamics Modeling", "primary_area": "", "supplementary_material": "/attachment/c5ce0fa0c1fb68e1ca1f1d706e09572cdda3d431.zip", "author": "Lei Han;Cheng Zhou;Yizheng Zhang", "authorids": "~Lei_Han1;~Cheng_Zhou1;~Yizheng_Zhang1", "gender": "M;M;M", "homepage": "https://www.leihan.org;;", "dblp": "75/2307-1;61/3491;", "google_scholar": "Tz4_zi8AAAAJ;;", "orcid": ";;0000-0002-0488-9869", "linkedin": ";;", "or_profile": "~Lei_Han1;~Cheng_Zhou1;~Yizheng_Zhang1", "aff": "Tencent Robotics X;;Tencent Robotics X", "aff_domain": "tencent.com;;tencent.com", "position": "Principal Researcher;;Researcher", "bibtex": "@misc{\nhan2022a,\ntitle={A General Theory of Relativity in Reinforcement Learning},\nauthor={Lei Han and Cheng Zhou and Yizheng Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=bi9j5yi-Vrv}\n}", "github": "", "project": "", "reviewers": "vBa4;G2Gj;U1Ad;gbpy", "site": "https://openreview.net/forum?id=bi9j5yi-Vrv", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;2;4;4", "correctness": "3;3;3;2", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;0", "wc_summary_paper": "21;33;141;242", "wc_summary_review": "41;2;21;46", "wc_main_review": "149;242;432;209", "wc_review": "211;277;594;497", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "242;143;305;476", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 109.25, 
89.7674077825577 ], "wc_summary_review_avg": [ 27.5, 17.44276354251241 ], "wc_main_review_avg": [ 258.0, 105.84658709660883 ], "wc_review_avg": [ 394.75, 156.35276620514267 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 291.5, 121.16620816052634 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IJ6pJHW7dfkJ:scholar.google.com/&scioq=A+General+Theory+of+Relativity+in+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Tencent", "aff_unique_dep": "Tencent Robotics X", "aff_unique_url": "https://www.tencent.com", "aff_unique_abbr": "Tencent Robotics X", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "bidTZROu2y", "title": "Physics Informed Machine Learning of SPH: Machine Learning Lagrangian Turbulence", "track": "main", "status": "Reject", "tldr": "", "abstract": "Smoothed particle hydrodynamics (SPH) is a mesh-free Lagrangian method for obtaining approximate numerical solutions of the equations of fluid dynamics, which has been widely applied to weakly- and strongly compressible turbulence in astrophysics and engineering applications. We present a learn-able hierarchy of parameterized and \"physics-explainable\" SPH informed fluid simulators using both physics based parameters and Neural Networks as universal function approximators. Our learning algorithm develops a mixed mode approach, mixing forward and reverse mode automatic differentiation with forward and adjoint based sensitivity analyses to efficiently perform gradient based optimization. We show that our physics informed learning method is capable of: (a) solving inverse problems over the physically interpretable parameter space, as well as over the space of Neural Network parameters; (b) learning Lagrangian statistics of turbulence; (c) combining Lagrangian trajectory based, probabilistic, and Eulerian field based loss functions; and (d) extrapolating beyond training sets into more complex regimes of interest. 
Furthermore, our hierarchy of models gradually introduces more physical structure, which we show improves interpretability, generalizability (over larger ranges of time scales and Reynolds numbers), and preservation of physical symmetries, while requiring less training data.", "keywords": "Physics Informed Machine Learning;Smoothed Particle Hydrodynamics;Sensitivity Analysis;Differentiable Programming;Mixed Mode Automatic Differentiation;Deep Learning;Turbulence;Lagrangian Fluid Simulation", "primary_area": "", "supplementary_material": "/attachment/dbaeb1115d8903ac26ebb8251f47b06324bab0aa.zip", "author": "Michael J Woodward;Yifeng Tian;Criston Hyett;Chris Fryer;Daniel Livescu;Misha Stepanov;Michael Chertkov", "authorids": "~Michael_J_Woodward1;~Yifeng_Tian1;~Criston_Hyett1;~Chris_Fryer1;~Daniel_Livescu1;~Misha_Stepanov1;~Michael_Chertkov2", "gender": "M;M;M;M;M;;M", "homepage": "https://sites.google.com/math.arizona.edu/mwoodward/;https://www.researchgate.net/profile/Yifeng-Tian-2;https://sites.google.com/math.arizona.edu/cmhyett/;;https://public.lanl.gov/livescu;https://www.math.arizona.edu/~stepanov/;https://sites.google.com/site/mchertkov/", "dblp": ";;;;;;00/2960", "google_scholar": "ls8lMyEAAAAJ;Pk57n7YAAAAJ;;ZZZorWwAAAAJ;9Oek18AAAAAJ;;k4UNBd4AAAAJ", "orcid": ";;;;0000-0003-2367-1547;;0000-0002-6758-515X", "linkedin": ";;;;;;", "or_profile": "~Michael_J_Woodward1;~Yifeng_Tian1;~Criston_Hyett1;~Chris_Fryer1;~Daniel_Livescu1;~Misha_Stepanov1;~Michael_Chertkov1", "aff": "University of Arizona;Los Alamos National Laboratory;University of Arizona;Los Alamos National Laboratory;Los Alamos National Laboratory;University of Arizona;", "aff_domain": "arizona.edu;lanl.gov;arizona.edu;lanl.gov;lanl.gov;arizona.edu;", "position": "PhD student;Postdoc;PhD student;Researcher;Principal Researcher;Associate Professor;", "bibtex": "@misc{\nwoodward2022physics,\ntitle={Physics Informed Machine Learning of {SPH}: Machine Learning Lagrangian Turbulence},\nauthor={Michael J Woodward and Yifeng Tian and Criston Hyett and Chris Fryer and Daniel Livescu and Misha Stepanov and Michael Chertkov},\nyear={2022},\nurl={https://openreview.net/forum?id=bidTZROu2y}\n}", "github": "", "project": "", "reviewers": "sFNH;Xkyx;mpMw;jmEb;hB3G", "site": "https://openreview.net/forum?id=bidTZROu2y", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;4;3;3;3", "correctness": "2;3;4;3;4", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;2;3;3;2", "wc_summary_paper": "56;177;143;83;116", "wc_summary_review": "50;39;136;75;93", "wc_main_review": "483;365;387;110;277", "wc_review": "589;581;666;268;486", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 115.0, 42.74108094094018 ], "wc_summary_review_avg": [ 78.6, 34.35462123208463 ], "wc_main_review_avg": [ 324.4, 125.64648821196715 ], "wc_review_avg": [ 518.0, 137.43216508517938 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": 0.42257712736425823, "gs_citation": 8, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=18066825925437757938&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;1;1;0", "aff_unique_norm": "University of Arizona;Los Alamos National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.arizona.edu;https://www.lanl.gov", "aff_unique_abbr": "UA;LANL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "bilHNPhT6-", "title": "On Multi-objective Policy Optimization as a Tool for Reinforcement Learning: Case Studies in Offline RL and Finetuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many advances that have improved the robustness and efficiency of deep reinforcement learning (RL) algorithms can, in one way or another, be understood as introducing additional objectives or constraints in the policy optimization step. This includes ideas as far ranging as exploration bonuses, entropy regularization, and regularization toward teachers or data priors. Often, the task reward and auxiliary objectives are in conflict, and in this paper we argue that this makes it natural to treat these cases as instances of multi-objective (MO) optimization problems. We demonstrate how this perspective allows us to develop novel and more effective RL algorithms. In particular, we focus on offline RL and finetuning as case studies, and show that existing approaches can be understood as MO algorithms relying on linear scalarization. We hypothesize that replacing linear scalarization with a better algorithm can improve performance. We introduce Distillation of a Mixture of Experts (DiME), a new MORL algorithm that outperforms linear scalarization and can be applied to these non-standard MO problems. We demonstrate that for offline RL, DiME leads to a simple new algorithm that outperforms state-of-the-art. 
For finetuning, we derive new algorithms that learn to outperform the teacher policy.", "keywords": "offline RL;learning from experts;finetuning;multi-objective RL;deep RL;continuous control", "primary_area": "", "supplementary_material": "", "author": "Abbas Abdolmaleki;Sandy Huang;Giulia Vezzani;Bobak Shahriari;Jost Tobias Springenberg;Shruti Mishra;Dhruva Tirumala;Arunkumar Byravan;Konstantinos Bousmalis;Andr\u00e1s Gy\u00f6rgy;Csaba Szepesvari;raia hadsell;Nicolas Heess;Martin Riedmiller", "authorids": "~Abbas_Abdolmaleki3;~Sandy_Huang1;~Giulia_Vezzani1;~Bobak_Shahriari1;~Jost_Tobias_Springenberg1;~Shruti_Mishra1;~Dhruva_Tirumala1;~Arunkumar_Byravan1;~Konstantinos_Bousmalis1;~Andr\u00e1s_Gy\u00f6rgy2;~Csaba_Szepesvari1;~raia_hadsell1;~Nicolas_Heess1;~Martin_Riedmiller1", "gender": ";F;F;M;M;;M;M;M;F;;M;;", "homepage": ";https://shhuang.github.io/;;;http://www.springenberg-tobias.de;;https://homes.cs.washington.edu/~barun/;;https://sites.ualberta.ca/~szepesva/;http://www.raiahadsell.com;;https://www.riedmiller.me/;http://www.cs.bme.hu/~gya;", "dblp": ";153/7841;;;;;151/9400;http://dblp.org/pers/hd/b/Bousmalis:Konstantinos;http://dblp.uni-trier.de/pers/hd/s/Szepesv=aacute=ri:Csaba;http://dblp.uni-trier.de/pers/hd/h/Hadsell:Raia;76/9181;;72/251-1;190/7697.html", "google_scholar": ";eurA6WgAAAAJ;https://scholar.google.it/citations?user=Zlpuln8AAAAJ;https://scholar.google.co.uk/citations?user=Vwas7kAAAAAJ;;JYoisp4AAAAJ;obYwWiMAAAAJ;wtRVnsYAAAAJ;https://scholar.google.ca/citations?user=zvC19mQAAAAJ;EWQnacoAAAAJ;79k7bGEAAAAJ;1gVfqpcAAAAJ;https://scholar.google.com/citations?hl=en;HqKq-2YAAAAJ", "orcid": ";;;;;;;;;;;;0000-0003-0586-4337;", "linkedin": ";;;;;;;;csaba-szepesvari-09376b1?trk=hp-identity-name;;;;;", "or_profile": "~Abbas_Abdolmaleki3;~Sandy_Huang1;~Giulia_Vezzani1;~Bobak_Shahriari1;~Jost_Tobias_Springenberg1;~Shruti_Mishra1;~Arunkumar_Byravan1;~Konstantinos_Bousmalis1;~Csaba_Szepesvari1;~raia_hadsell1;~Nicolas_Heess1;~Martin_Riedmiller1;~Andras_Gyorgy1;~Dhruva_TB1", "aff": "Google;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;;Google;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;;Google DeepMind;University College London", "aff_domain": "google.com;deepmind.com;deepmind.com;deepmind.com;google.com;;google.com;google.com;google.com;deepmind.com;google.com;;deepmind.com;ucl.ac.uk", "position": "research scientist;Research Scientist;Researcher;Research Scientist;Researcher;;Research Scientist;Research Scientist;Research Scientist;Research Scientist;Research Scientist;;Research Scientist;PhD student", "bibtex": "@misc{\nabdolmaleki2022on,\ntitle={On Multi-objective Policy Optimization as a Tool for Reinforcement Learning: Case Studies in Offline {RL} and Finetuning},\nauthor={Abbas Abdolmaleki and Sandy Huang and Giulia Vezzani and Bobak Shahriari and Jost Tobias Springenberg and Shruti Mishra and Dhruva Tirumala and Arunkumar Byravan and Konstantinos Bousmalis and Andr{\\'a}s Gy{\\\"o}rgy and Csaba Szepesvari and raia hadsell and Nicolas Heess and Martin Riedmiller},\nyear={2022},\nurl={https://openreview.net/forum?id=bilHNPhT6-}\n}", "github": "", "project": "", "reviewers": "gxGj;kKtt;JB6Q;gFyL", "site": "https://openreview.net/forum?id=bilHNPhT6-", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;3;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "75;69;42;136", "wc_summary_review": "49;24;30;42", "wc_main_review": "462;178;188;247", "wc_review": "586;271;260;425", 
"wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "505;489;406;181", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.5, 34.3693177121688 ], "wc_summary_review_avg": [ 36.25, 9.807522622966516 ], "wc_main_review_avg": [ 268.75, 114.64592229992309 ], "wc_review_avg": [ 385.5, 132.8730597224283 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 395.25, 129.27949373353843 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 14, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17971056041563345318&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;1", "aff_unique_norm": "Google;University College London", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.ucl.ac.uk", "aff_unique_abbr": "Google;UCL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;1;1;0;1;1;1;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "biyvmQe5jM", "title": "How to decay your learning rate", "track": "main", "status": "Reject", "tldr": "", "abstract": "Complex learning rate schedules have become an integral part of deep learning. We find empirically that common fine-tuned schedules decay the learning rate after the weight norm bounces. This leads to the proposal of ABEL: an automatic scheduler which decays the learning rate by keeping track of the weight norm. ABEL's performance matches that of tuned schedules, is more robust with respect to its parameters and does not depend on the time budget. Through extensive experiments in vision, NLP, and RL, we show that if the weight norm does not bounce, we can simplify schedules even further with no loss in performance. In such cases, a complex schedule has similar performance to a constant learning rate with a decay at the end of training. 
", "keywords": "learning rates;hyperparameter tuning;schedules", "primary_area": "", "supplementary_material": "/attachment/963504882463458337315f601d568d75c8371ab5.zip", "author": "Aitor Lewkowycz", "authorids": "~Aitor_Lewkowycz2", "gender": "M", "homepage": "https://scholar.google.com/citations?user=Yum1ah0AAAAJ&hl=en&authuser=1", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Aitor_Lewkowycz2", "aff": "Google", "aff_domain": "google.com", "position": "Postdoc", "bibtex": "@misc{\nlewkowycz2022how,\ntitle={How to decay your learning rate},\nauthor={Aitor Lewkowycz},\nyear={2022},\nurl={https://openreview.net/forum?id=biyvmQe5jM}\n}", "github": "", "project": "", "reviewers": "y2w2;iadJ;oLgq;rqFL", "site": "https://openreview.net/forum?id=biyvmQe5jM", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;4;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "188;144;165;126", "wc_summary_review": "64;45;106;21", "wc_main_review": "1106;391;521;318", "wc_review": "1358;580;792;465", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "756;317;323;208", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 155.75, 23.177305710543667 ], "wc_summary_review_avg": [ 59.0, 31.12073263919087 ], "wc_main_review_avg": [ 584.0, 310.02338621465316 ], "wc_review_avg": [ 798.75, 343.52829213909007 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 401.0, 210.00833316799597 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1481251574701367585&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "bjYunHo6LWR", "title": "Classification and Uncertainty Quantification of Corrupted Data using Semi-Supervised Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Parametric and non-parametric classifiers often have to deal with real-world data, where corruptions like noise, occlusions, and blur are unavoidable \u2013 posing significant challenges. We present a probabilistic approach to classify strongly corrupted data and quantify uncertainty, despite the model only having been trained with uncorrupted data. A semi-supervised autoencoder trained on uncorrupted data is the underlying architecture. We use the decoding part as a generative model for realistic data and extend it by convolutions, masking, and additive Gaussian noise to describe imperfections. This constitutes a statistical inference task in terms of the optimal latent space activations of the underlying uncorrupted datum. We solve this problem approximately with Metric Gaussian Variational Inference (MGVI). 
The supervision of the autoencoder\u2019s latent space allows us to classify corrupted data directly under uncertainty with the statistically inferred latent space activations. Furthermore, we demonstrate that the model uncertainty strongly depends on whether the classification is correct or wrong, setting a basis for a statistical \"lie detector\" of the classification. Independent of that, we show that the generative model can optimally restore the uncorrupted datum by decoding the inferred latent space activations.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/e44334ea46d0d114d95c4b2a719befc1be24ba7d.zip", "author": "Philipp Joppich;Sebastian Dorn;Oliver De Candido;Wolfgang Utschick;Jakob Knollm\u00fcller", "authorids": "~Philipp_Joppich1;~Sebastian_Dorn1;~Oliver_De_Candido1;~Wolfgang_Utschick1;~Jakob_Knollm\u00fcller1", "gender": ";M;;M;M", "homepage": ";https://www.audi.de;https://www.ei.tum.de/msv/people/oliver-de-candido/;https://www.ce.cit.tum.de/msv/;", "dblp": ";;173/7050;34/5115;", "google_scholar": ";;https://scholar.google.com/citations?hl=en;qflRi8QAAAAJ;DIM4kDIAAAAJ", "orcid": "0000-0002-1542-7598;;0000-0002-9523-7777;0000-0002-2871-4246;", "linkedin": ";;decandido/;;", "or_profile": "~Philipp_Joppich1;~Sebastian_Dorn1;~Oliver_De_Candido1;~Wolfgang_Utschick1;~Jakob_Knollm\u00fcller1", "aff": "AUDI AG;Max-Planck Institute;Technical University Munich;Technical University Munich;Technical University Munich", "aff_domain": "audi.de;mpg.de;tum.de;tum.de;tum.de", "position": "Employee;Researcher;PhD student;Full Professor;Postdoc", "bibtex": "@misc{\njoppich2022classification,\ntitle={Classification and Uncertainty Quantification of Corrupted Data using Semi-Supervised Autoencoders},\nauthor={Philipp Joppich and Sebastian Dorn and Oliver De Candido and Wolfgang Utschick and Jakob Knollm{\\\"u}ller},\nyear={2022},\nurl={https://openreview.net/forum?id=bjYunHo6LWR}\n}", "github": "", "project": "", "reviewers": "hL2w;SCcY;Hjos;kh1g", "site": "https://openreview.net/forum?id=bjYunHo6LWR", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;4;3", "correctness": "4;2;1;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "99;97;35;31", "wc_summary_review": "47;33;109;15", "wc_main_review": "275;118;74;242", "wc_review": "421;248;218;288", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "44;51;37;211", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.5, 32.53843880704789 ], "wc_summary_review_avg": [ 51.0, 35.35533905932738 ], "wc_main_review_avg": [ 177.25, 83.54452405753473 ], "wc_review_avg": [ 293.75, 77.55119276967956 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 85.75, 72.48232543179061 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.4472135954999579, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16044405502053798067&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "AUDI AG;Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.;Technical University of Munich", "aff_unique_dep": ";;", 
"aff_unique_url": "https://www.audi.com;https://www.mpg.de;https://www.tum.de", "aff_unique_abbr": "AUDI;MPG;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Scattering Networks on the Sphere for Scalable and Rotationally Equivariant Spherical CNNs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6521", "id": "bjy5Zb2fo2", "poster": "", "openreview": "https://openreview.net/forum?id=bjy5Zb2fo2", "slides": "https://iclr.cc/virtual/2022/poster/6521", "video": "https://iclr.cc/virtual/2022/poster/6521", "author_site": "Jason McEwen, Christopher Wallis, Augustine Mavor-Parker", "tldr": "", "abstract": "Convolutional neural networks (CNNs) constructed natively on the sphere have been developed recently and shown to be highly effective for the analysis of spherical data. While an efficient framework has been formulated, spherical CNNs are nevertheless highly computationally demanding; typically they cannot scale beyond spherical signals of thousands of pixels. We develop scattering networks constructed natively on the sphere that provide a powerful representational space for spherical data. Spherical scattering networks are computationally scalable and exhibit rotational equivariance, while their representational space is invariant to isometries and provides efficient and stable signal representations. By integrating scattering networks as an additional type of layer in the generalized spherical CNN framework, we show how they can be leveraged to scale spherical CNNs to the high-resolution data typical of many practical applications, with spherical signals of many tens of megapixels and beyond.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jason McEwen;Christopher Wallis;Augustine N. Mavor-Parker", "authorids": "~Jason_McEwen1;christophergrwallis@gmail.com;~Augustine_N._Mavor-Parker1", "gender": "M;;M", "homepage": "http://www.jasonmcewen.org/;;https://self-supervisor.github.io/", "dblp": "19/6924;;", "google_scholar": "https://scholar.google.co.uk/citations?user=V19kdRg7j1YC;;J7XkuPwAAAAJ", "orcid": ";;", "linkedin": "jason-mcewen-57300029/;;", "or_profile": "~Jason_McEwen1;christophergrwallis@gmail.com;~Augustine_N._Mavor-Parker1", "aff": "University College London;;", "aff_domain": "ucl.ac.uk;;", "position": "Full Professor;;", "bibtex": "@inproceedings{\nmcewen2022scattering,\ntitle={Scattering Networks on the Sphere for Scalable and Rotationally Equivariant Spherical {CNN}s},\nauthor={Jason McEwen and Christopher Wallis and Augustine N. 
Mavor-Parker},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bjy5Zb2fo2}\n}", "github": "", "project": "", "reviewers": "74vz;hrny;FtPm;Zc9b", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;3", "correctness": "4;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "78;27;78;153", "wc_summary_review": "50;23;52;57", "wc_main_review": "320;107;465;604", "wc_review": "448;157;595;814", "wc_reply_reviewers": "119;0;19;103", "wc_reply_authors": "1372;501;1053;1375", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.0, 44.94997219131509 ], "wc_summary_review_avg": [ 45.5, 13.238202294873727 ], "wc_main_review_avg": [ 374.0, 183.97418297141584 ], "wc_review_avg": [ 503.5, 238.70745694259324 ], "wc_reply_reviewers_avg": [ 60.25, 51.50424739766615 ], "wc_reply_authors_avg": [ 1075.25, 356.4297791992134 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2963386100287050530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=bjy5Zb2fo2", "email": "ucl.ac.uk;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "bl9zYxOVwa", "title": "Understanding the robustness-accuracy tradeoff by rethinking robust fairness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although current adversarial training (AT) methods can effectively improve robustness to adversarial examples, they usually lead to a decrease in accuracy, called the robustness-accuracy trade-off. In addition, researchers have recently discovered a robust fairness phenomenon in AT models; that is, not all categories of the dataset experience a serious decline in accuracy when AT methods are introduced. In this paper, we explore the relationship between the robustness-accuracy tradeoff and robust fairness for the first time. Empirically, we find that AT causes a substantial increase in inter-class similarity, which could be the root cause of these two phenomena. We argue that label smoothing (LS) is more than a trick in AT. The smoothness learned from LS can help reduce the excessive inter-class similarity caused by AT, and also reduce the intra-class variance, thereby significantly improving accuracy. We then explore the effect of another classic smoothing regularizer, namely maximum entropy (ME), and find that ME can also help reduce both inter-class similarity and intra-class variance. Additionally, we reveal that TRADES implicitly performs the function of ME, which can explain why TRADES usually performs better than PGD-AT on robustness.
Finally, we propose the maximum entropy PGD-AT (ME-AT) and the maximum entropy TRADES (ME-TRADES), and experimental results show that our methods can significantly mitigate both the tradeoff and the robust fairness problem.", "keywords": "Adversarial training;Adversarial robustness", "primary_area": "", "supplementary_material": "/attachment/559bc1748d3fdb4d6de624e02e674b54d1d1d2bf.zip", "author": "Zihui Wu;Haichang Gao;Shudong Zhang;Yipeng Gao", "authorids": "~Zihui_Wu1;hchgao@xidian.edu.cn;sdong_zhang@163.com;ypg100500@163.com", "gender": "M;;;", "homepage": ";;;", "dblp": "222/6329;;;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Zihui_Wu1;hchgao@xidian.edu.cn;sdong_zhang@163.com;ypg100500@163.com", "aff": "Xidian University;;;", "aff_domain": "xidian.edu;;;", "position": "MS student;;;", "bibtex": "@misc{\nwu2022understanding,\ntitle={Understanding the robustness-accuracy tradeoff by rethinking robust fairness},\nauthor={Zihui Wu and Haichang Gao and Shudong Zhang and Yipeng Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=bl9zYxOVwa}\n}", "github": "", "project": "", "reviewers": "EY7N;NCTP;94sP;VwP1", "site": "https://openreview.net/forum?id=bl9zYxOVwa", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "44;117;105;64", "wc_summary_review": "25;48;75;92", "wc_main_review": "475;550;152;247", "wc_review": "544;715;332;403", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.5, 29.669007398293594 ], "wc_summary_review_avg": [ 60.0, 25.583197610932064 ], "wc_main_review_avg": [ 356.0, 162.24518482839483 ], "wc_review_avg": [ 498.5, 146.4453823102661 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WnFD4d1dwhQJ:scholar.google.com/&scioq=Understanding+the+robustness-accuracy+tradeoff+by+rethinking+robust+fairness&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Xidian University", "aff_unique_dep": "", "aff_unique_url": "http://www.xidian.edu.cn/", "aff_unique_abbr": "Xidian", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "bmGLlsX_iJl", "title": "EMFlow: Data Imputation in Latent Space via EM and Deep Flow Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "The presence of missing values within high-dimensional data is a ubiquitous problem for many applied sciences. A serious limitation of many available data mining and machine learning methods is their inability to handle partially missing values, and so an integrated approach that combines imputation and model estimation is vital for downstream analysis.
A computationally fast algorithm, called EMFlow, is introduced that performs imputation in a latent space via an online version of the Expectation-Maximization (EM) algorithm, using a normalizing flow (NF) model that maps the data space to a latent space. The proposed EMFlow algorithm is iterative, involving alternately updating the parameters of the online EM and the NF model. Extensive experimental results for high-dimensional multivariate and image datasets are presented to illustrate the superior performance of EMFlow compared to a couple of recently available methods in terms of both predictive accuracy and speed of algorithmic convergence.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/549b6b12b1052ff1899acaea3ef689d7ebf5a30a.zip", "author": "Qi Ma;Sujit K Ghosh", "authorids": "~Qi_Ma2;~Sujit_K_Ghosh1", "gender": "M;M", "homepage": ";https://ci.lib.ncsu.edu/profiles/sghosh2", "dblp": ";", "google_scholar": ";nDj-zN0AAAAJ", "orcid": "0000-0002-0379-8279;0000-0001-8351-408X", "linkedin": ";ghoshbabu", "or_profile": "~Qi_Ma2;~Sujit_K_Ghosh1", "aff": ";North Carolina State University", "aff_domain": ";ncsu.edu", "position": ";Full Professor", "bibtex": "@misc{\nma2022emflow,\ntitle={{EMF}low: Data Imputation in Latent Space via {EM} and Deep Flow Models},\nauthor={Qi Ma and Sujit K Ghosh},\nyear={2022},\nurl={https://openreview.net/forum?id=bmGLlsX_iJl}\n}", "github": "", "project": "", "reviewers": "Twou;L7zK;zdAh;tdhx", "site": "https://openreview.net/forum?id=bmGLlsX_iJl", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;5;4", "correctness": "2;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "72;120;37;113", "wc_summary_review": "79;36;43;78", "wc_main_review": "295;216;106;578", "wc_review": "446;372;186;769", "wc_reply_reviewers": "601;0;0;55", "wc_reply_authors": "1145;589;579;761", "reply_reviewers": "3;0;0;1", "reply_authors": "3;2;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.5, 33.47013594235912 ], "wc_summary_review_avg": [ 59.0, 19.6596032513375 ], "wc_main_review_avg": [ 298.75, 174.63873424873418 ], "wc_review_avg": [ 443.25, 210.57940901237234 ], "wc_reply_reviewers_avg": [ 164.0, 253.29923016069355 ], "wc_reply_authors_avg": [ 768.5, 229.09550410254673 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.28867513459481287, "corr_recommendation_correctness": 0.9847319278346618, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3674910851219521862&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "North Carolina State University", "aff_unique_dep": "", "aff_unique_url": "https://www.ncsu.edu", "aff_unique_abbr": "NCSU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Subspace Regularizers for Few-Shot Class Incremental Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7151", "id": "boJy41J-tnQ", "poster": "", "openreview": "https://openreview.net/forum?id=boJy41J-tnQ", "slides": "https://iclr.cc/virtual/2022/poster/7151", "video": "https://iclr.cc/virtual/2022/poster/7151",
"author_site": "Afra Feyza Aky\u00fcrek, Ekin Aky\u00fcrek, Derry Wijaya, Jacob Andreas", "tldr": "", "abstract": "Few-shot class incremental learning---the problem of updating a trained classifier to discriminate among an expanded set of classes with limited labeled data---is a key challenge for machine learning systems deployed in non-stationary environments. Existing approaches to the problem rely on complex model architectures and training procedures that are difficult to tune and re-use. In this paper, we present an extremely simple approach that enables the use of ordinary logistic regression classifiers for few-shot incremental learning. The key to this approach is a new family of \\textit{subspace regularization} schemes that encourage weight vectors for new classes to lie close to the subspace spanned by the weights of existing classes. When combined with pretrained convolutional feature extractors, logistic regression models trained with subspace regularization outperform specialized, state-of-the-art approaches to few-shot incremental image classification by up to 23\\% on the \\textit{mini}ImageNet dataset. Because of its simplicity, subspace regularization can be straightforwardly configured to incorporate additional background information about the new classes (including class names and descriptions specified in natural language); this offers additional control over the trade-off between existing and new classes. Our results show that simple geometric regularization of class representations offers an effective tool for continual learning.", "keywords": "few-shot class incremental learning;incremental learning;incremental classification;subspace regularization;manifold regularization;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Afra Feyza Aky\u00fcrek;Ekin Aky\u00fcrek;Derry Wijaya;Jacob Andreas", "authorids": "~Afra_Feyza_Aky\u00fcrek1;~Ekin_Aky\u00fcrek1;~Derry_Wijaya1;~Jacob_Andreas1", "gender": "F;M;F;M", "homepage": "https://derrywijaya.github.io/;http://web.mit.edu/jda/www;https://feyzaakyurek.github.io;https://www.ekinakyurek.me/", "dblp": "https://dblp.org/pers/w/Wijaya:Derry;97/8154;268/0913.html;216/3446", "google_scholar": "8lmWWD0AAAAJ;dnZ8udEAAAAJ;https://scholar.google.com/citations?hl=en;FQHeASwAAAAJ", "orcid": "0000-0002-0848-4703;;;0000-0002-5166-4689", "linkedin": "derry-wijaya-577b80178/;;afrafeyzaakyurek/;", "or_profile": "~Derry_Wijaya1;~Jacob_Andreas1;~Afra_Feyza_Akyurek1;~EKIN_AKYUREK1", "aff": "Boston University;Microsoft;Boston University;Massachusetts Institute of Technology", "aff_domain": "bu.edu;microsoft.com;bu.edu;mit.edu", "position": "Assistant Professor;Researcher;PhD student;PhD student", "bibtex": "@inproceedings{\naky{\\\"u}rek2022subspace,\ntitle={Subspace Regularizers for Few-Shot Class Incremental Learning},\nauthor={Afra Feyza Aky{\\\"u}rek and Ekin Aky{\\\"u}rek and Derry Wijaya and Jacob Andreas},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=boJy41J-tnQ}\n}", "github": "", "project": "", "reviewers": "EoLF;oQAd;Vrap;qioC", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;5;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "67;140;129;94", "wc_summary_review": "27;69;147;51", "wc_main_review": "309;266;327;214", "wc_review": "403;475;603;359", "wc_reply_reviewers": "0;31;0;0", "wc_reply_authors": "540;624;823;170", "reply_reviewers": "0;1;0;0", 
"reply_authors": "2;2;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 107.5, 28.90069203323685 ], "wc_summary_review_avg": [ 73.5, 44.97499305169485 ], "wc_main_review_avg": [ 279.0, 43.583253664681806 ], "wc_review_avg": [ 460.0, 92.36341266973628 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 539.25, 236.6657717119229 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=740038996677769193&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=boJy41J-tnQ", "email": "bu.edu;microsoft.com;bu.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Boston University;Microsoft;Massachusetts Institute of Technology", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.bu.edu;https://www.microsoft.com;https://web.mit.edu", "aff_unique_abbr": "BU;Microsoft;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sequence Approximation using Feedforward Spiking Neural Network for Spatiotemporal Learning: Theory and Optimization Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6640", "id": "bp-LJ4y_XC", "poster": "", "openreview": "https://openreview.net/forum?id=bp-LJ4y_XC", "slides": "https://iclr.cc/virtual/2022/poster/6640", "video": "https://iclr.cc/virtual/2022/poster/6640", "author_site": "Xueyuan She, Saurabh Dash, Saibal Mukhopadhyay", "tldr": "", "abstract": "A dynamical system of spiking neurons with only feedforward connections can classify spatiotemporal patterns without recurrent connections. However, the theoretical construct of a feedforward spiking neural network (SNN) for approximating a temporal sequence remains unclear, making it challenging to optimize SNN architectures for learning complex spatiotemporal patterns. In this work, we establish a theoretical framework to understand and improve sequence approximation using a feedforward SNN. Our framework shows that a feedforward SNN with one neuron per layer and skip-layer connections can approximate the mapping function between any arbitrary pairs of input and output spike train on a compact domain. Moreover, we prove that heterogeneous neurons with varying dynamics and skip-layer connections improve sequence approximation using feedforward SNN. Consequently, we propose SNN architectures incorporating the preceding constructs that are trained using supervised backpropagation-through-time (BPTT) and unsupervised spiking-timing-dependent plasticity (STDP) algorithms for classification of spatiotemporal data. A dual-search-space Bayesian optimization method is developed to optimize architecture and parameters of the proposed SNN with heterogeneous neuron dynamics and skip-layer connections. 
", "keywords": "spiking neural network;spatiotemporal processing;feedforward network", "primary_area": "", "supplementary_material": "/attachment/07bfc11fcbf1f0d165886d9670fb0314518d9a36.zip", "author": "Xueyuan She;Saurabh Dash;Saibal Mukhopadhyay", "authorids": "~Xueyuan_She1;~Saurabh_Dash1;~Saibal_Mukhopadhyay1", "gender": "M;M;M", "homepage": ";https://saurabhdash.com;https://greenlab.ece.gatech.edu", "dblp": ";190/7336;66/1210", "google_scholar": "mm-MTxkAAAAJ;bboszRcAAAAJ;5KRtMEkAAAAJ", "orcid": ";;0000-0002-8894-3390", "linkedin": ";;", "or_profile": "~Xueyuan_She1;~Saurabh_Dash1;~Saibal_Mukhopadhyay2", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nshe2022sequence,\ntitle={Sequence Approximation using Feedforward Spiking Neural Network for Spatiotemporal Learning: Theory and Optimization Methods},\nauthor={Xueyuan She and Saurabh Dash and Saibal Mukhopadhyay},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bp-LJ4y_XC}\n}", "github": "", "project": "", "reviewers": "himw;EDAS;oV8W", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "69;64;101", "wc_summary_review": "70;7;122", "wc_main_review": "475;275;562", "wc_review": "614;346;785", "wc_reply_reviewers": "228;33;72", "wc_reply_authors": "1546;886;985", "reply_reviewers": "1;1;1", "reply_authors": "3;3;3", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.0, 16.391054470858997 ], "wc_summary_review_avg": [ 66.33333333333333, 47.02009026882965 ], "wc_main_review_avg": [ 437.3333333333333, 120.15637958741748 ], "wc_review_avg": [ 581.6666666666666, 180.67343160766302 ], "wc_reply_reviewers_avg": [ 111.0, 84.24962907930218 ], "wc_reply_authors_avg": [ 1139.0, 290.6165858997039 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4516583275484285931&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=bp-LJ4y_XC", "email": "gatech.edu;gatech.edu;gatech.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "bpUHBc9HCU8", "title": "A General Unified Graph Neural Network Framework Against Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Neural Networks (GNNs) are powerful tools in representation learning for graphs. However, they are reported to be vulnerable to adversarial attacks, raising numerous concerns for applying it in some risk-sensitive domains. 
Therefore, it is essential to develop a robust GNN model to defend against adversarial attacks. Existing studies address this issue only by cleaning the perturbed graph structure, and almost none of them simultaneously consider denoising the features. As the graph and features are interrelated and influence each other, we propose a General Unified Graph Neural Network (GUGNN) framework to jointly clean the graph and denoise the features of the data. On this basis, we further extend it by introducing two operations and develop a robust GNN model (R-GUGNN) to defend against adversarial attacks. One operation is reconstructing the graph with its intrinsic properties, including the similarity of adjacent nodes\u2019 features, the sparsity of real-world graphs, and the fact that many slight noises in perturbed graphs have small eigenvalues. The other is a convolution operation on the features that finds the optimal solution by adopting Laplacian smoothness and the prior knowledge that nodes with many neighbors are difficult to attack. Experiments on four real-world datasets demonstrate that R-GUGNN greatly improves overall robustness over the state-of-the-art baselines. ", "keywords": "Graph Neural Networks;general unified framework;against adversarial attacks;robust model;graph reconstruction operation", "primary_area": "", "supplementary_material": "/attachment/669f5d0fe7254bf0be7b57413d424552df20cac2.zip", "author": "Yujie Gu;Yangkun Cao;Qiang Huang;Huiyan Sun", "authorids": "~Yujie_Gu1;caoyk20@mails.jlu.edu.cn;~Qiang_Huang4;~Huiyan_Sun1", "gender": "M;;M;F", "homepage": "https://github.com/guyujie98/;;https://15754311016.github.io/qianghuang.github.io/;", "dblp": ";;80/2732-1.html;", "google_scholar": ";;https://scholar.google.com/citations?hl=en;A9t1pDwAAAAJ", "orcid": ";;0000-0003-0046-0923;", "linkedin": ";;;", "or_profile": "~Yujie_Gu1;caoyk20@mails.jlu.edu.cn;~Qiang_Huang4;~Huiyan_Sun1", "aff": "Jilin University;;Jilin University, China;Jilin University, China", "aff_domain": "jlu.edu;;jlu.edu.cn;jlu.edu.cn", "position": "MS student;;PhD student;Associate Professor", "bibtex": "@misc{\ngu2022a,\ntitle={A General Unified Graph Neural Network Framework Against Adversarial Attacks},\nauthor={Yujie Gu and Yangkun Cao and Qiang Huang and Huiyan Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=bpUHBc9HCU8}\n}", "github": "", "project": "", "reviewers": "Hj7t;4H1G;z4si;V2ZH", "site": "https://openreview.net/forum?id=bpUHBc9HCU8", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;4;4;5", "correctness": "3;4;2;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "79;98;86;84", "wc_summary_review": "39;10;128;60", "wc_main_review": "435;266;1299;344", "wc_review": "553;374;1513;488", "wc_reply_reviewers": "13;24;0;0", "wc_reply_authors": "90;107;149;115", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 86.75, 6.977642868476432 ], "wc_summary_review_avg": [ 59.25, 43.48203652084387 ], "wc_main_review_avg": [ 586.0, 415.97295585169957 ], "wc_review_avg": [ 732.0, 455.4398972422157 ], "wc_reply_reviewers_avg": [ 9.25, 10.034316120194738 ], "wc_reply_authors_avg": [ 115.25, 21.47527648250425 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ],
"corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18054358963951086474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Jilin University", "aff_unique_dep": "", "aff_unique_url": "http://www.jlu.edu.cn", "aff_unique_abbr": "JLU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "bq7smM1OJIX", "title": "Determining the Ethno-nationality of Writers Using Written English Text", "track": "main", "status": "Reject", "tldr": "", "abstract": "Ethno-nationality is where nations are defined by a shared heritage, for instance it can be a membership of a common language, nationality, religion or an ethnic ancestry. The main goal of this research is to determine a person\u2019s country-of-origin using English text written in less controlled environments, employing Machine Learning (ML) and Natural Language Processing (NLP) techniques. The current literature mainly focuses on determining the native language of English writers and a minimal number of researches have been conducted in determining the country-of-origin of English writers.\n\nFurther, most experiments in the literature are mainly based on the TOEFL, ICLE datasets which were collected in more controlled environments (i.e., standard exam answers). Hence, most of the writers try to follow some guidelines and patterns of writing. Subsequently, the creativity, freedom of writing and the insights of writers could be hidden. Thus, we believe it hides the real nativism of the writers. Further, those corpora are not freely available as it involves a high cost of licenses. Thus, the main data corpus used for this research was the International Corpus of English (ICE corpus). Up to this point, none of the researchers have utilised the ICE corpus for the purpose of determining the writers\u2019 country-of-origin, even though there is a true potential. \n\nFor this research, an overall accuracy of 0.7636 for the flat classification (for all ten countries) and accuracy of 0.6224~1.000 for sub-categories were received. 
In addition, the best ML model obtained for the flat classification strategy is a linear SVM with an SGD optimizer trained on a word (1,1) uni-gram model.", "keywords": "Ethno-nationality;Native Language Identification;Natural Language Processing;Machine Learning;Linear SVM;Less-controlled environments;ICE corpus", "primary_area": "", "supplementary_material": "/attachment/e29cf1801c70697b21565846a6823e0693846dd5.zip", "author": "Deenuka Niroshini Perera;Ruvan Weerasinghe;Randhil Pushpananda", "authorids": "~Deenuka_Niroshini_Perera1;arw@ucsc.cmb.ac.lk;rpn@ucsc.cmb.ac.lk", "gender": "F;;", "homepage": ";;", "dblp": ";;", "google_scholar": "yGS7KSkAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Deenuka_Niroshini_Perera1;arw@ucsc.cmb.ac.lk;rpn@ucsc.cmb.ac.lk", "aff": "University of Colombo School of Computing;;", "aff_domain": "ucsc.cmb.ac.lk;;", "position": "Lecturer;;", "bibtex": "@misc{\nperera2022determining,\ntitle={Determining the Ethno-nationality of Writers Using Written English Text},\nauthor={Deenuka Niroshini Perera and Ruvan Weerasinghe and Randhil Pushpananda},\nyear={2022},\nurl={https://openreview.net/forum?id=bq7smM1OJIX}\n}", "github": "", "project": "", "reviewers": "k2RW;Ak7m;bgYR;hVHm", "site": "https://openreview.net/forum?id=bq7smM1OJIX", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;4;4;4", "correctness": "3;1;2;3", "technical_novelty": "1;1;1;1", "empirical_novelty": "1;1;2;1", "wc_summary_paper": "59;32;145;89", "wc_summary_review": "26;15;7;53", "wc_main_review": "142;211;71;406", "wc_review": "227;258;223;548", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.25, 41.96650450061335 ], "wc_summary_review_avg": [ 25.25, 17.383541066192468 ], "wc_main_review_avg": [ 207.5, 124.83689358518978 ], "wc_review_avg": [ 314.0, 135.7773913433308 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2_Ml_n3kkEMJ:scholar.google.com/&scioq=Determining+the+Ethno-nationality+of+Writers+Using+Written+English+Text&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Colombo", "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.soc.lk", "aff_unique_abbr": "UoC", "aff_campus_unique_index": "0", "aff_campus_unique": "Colombo", "aff_country_unique_index": "0", "aff_country_unique": "Sri Lanka" }, { "id": "bsr02xd-utn", "title": "Pairwise Adversarial Training for Unsupervised Class-imbalanced Domain Adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Unsupervised domain adaptation (UDA) has become an appealing approach for knowledge transfer from a labeled source domain to an unlabeled target domain. However, when the classes in the source and target domains are imbalanced, most existing UDA methods experience a significant performance drop, as the decision boundary usually favors the majority classes. 
Some recent class-imbalanced domain adaptation (CDA) methods aim to tackle the challenge of biased label distribution by exploiting pseudo-labeled target data during the training process. However, these methods may suffer from unreliable pseudo labels and error accumulation during training. In this paper, we propose a pairwise adversarial training approach to augment training data for unsupervised class-imbalanced domain adaptation. Unlike conventional adversarial training in which the adversarial samples are obtained from the $\\ell_p$ ball of the original data, we obtain the semantic adversarial samples from the interpolated line between the aligned pairwise samples from the source and target domains. Experimental results and an ablation study show that our method can achieve considerable improvements on the CDA benchmarks compared with the state-of-the-art methods focusing on the same problem.\n", "keywords": "domain adaptation;adversarial training;imbalanced class distribution", "primary_area": "", "supplementary_material": "", "author": "Weili Shi;Ronghang Zhu;Sheng Li", "authorids": "~Weili_Shi2;~Ronghang_Zhu2;~Sheng_Li3", "gender": ";;M", "homepage": ";;http://sheng-li.org", "dblp": ";;23/3439-1", "google_scholar": ";;DEncVcYAAAAJ", "orcid": ";;0000-0003-1205-8632", "linkedin": ";;sheng-li-15a70022/", "or_profile": "~Weili_Shi2;~Ronghang_Zhu2;~Sheng_Li3", "aff": ";;University of Georgia", "aff_domain": ";;uga.edu", "position": ";;Assistant Professor", "bibtex": "@misc{\nshi2022pairwise,\ntitle={Pairwise Adversarial Training for Unsupervised Class-imbalanced Domain Adaptation},\nauthor={Weili Shi and Ronghang Zhu and Sheng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=bsr02xd-utn}\n}", "github": "", "project": "", "reviewers": "sUHK;Cnqr;p4yb;orbW", "site": "https://openreview.net/forum?id=bsr02xd-utn", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;5;5;5", "correctness": "3;4;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;0;2", "wc_summary_paper": "16;67;88;49", "wc_summary_review": "23;61;37;24", "wc_main_review": "101;274;88;138", "wc_review": "140;402;213;211", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "655;685;675;929", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;3", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 55.0, 26.410225292488512 ], "wc_summary_review_avg": [ 36.25, 15.31951369985353 ], "wc_main_review_avg": [ 150.25, 73.76440537278125 ], "wc_review_avg": [ 241.5, 97.21753956977105 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 736.0, 111.95088208674373 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16290933726591020855&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Georgia", "aff_unique_dep": "", "aff_unique_url": "https://www.uga.edu", "aff_unique_abbr": "UGA", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Generalized Natural Gradient Flows in Hidden Convex-Concave Games and GANs", "status": "Poster", "track": 
"main", "site": "https://iclr.cc/virtual/2022/poster/7040", "id": "bsycpMi00R1", "poster": "", "openreview": "https://openreview.net/forum?id=bsycpMi00R1", "slides": "https://iclr.cc/virtual/2022/poster/7040", "video": "https://iclr.cc/virtual/2022/poster/7040", "author_site": "Andjela Mladenovic, Iosif Sakos, Gauthier Gidel, Georgios Piliouras", "tldr": "", "abstract": "Game-theoretic formulations in machine learning have recently risen in prominence, whereby entire modeling paradigms are best captured as zero-sum games. Despite their popularity, however, their dynamics are still poorly understood. This lack of theory is often substantiated with painful empirical observations of volatile training dynamics and even divergence. Such results highlight the need to develop an appropriate theory with convergence guarantees that are powerful enough to inform practice. This paper studies the generalized Gradient Descent-Ascent (GDA) flow in a large class of non-convex non-concave Zero-Sum games dubbed Hidden Convex-Concave games, a class of games that includes GANs. We focus on two specific geometries: a novel geometry induced by the hidden convex-concave structure that we call the hidden mapping geometry and the Fisher information geometry. For the hidden mapping geometry, we prove global convergence under mild assumptions. In the case of Fisher information geometry, we provide a complete picture of the dynamics in an interesting special setting of team competition via invariant function analysis.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/78797d649101f0173bb8b2e22fbd6908eb573b3d.zip", "author": "Andjela Mladenovic;Iosif Sakos;Gauthier Gidel;Georgios Piliouras", "authorids": "~Andjela_Mladenovic1;~Iosif_Sakos1;~Gauthier_Gidel1;~Georgios_Piliouras1", "gender": "F;M;M;", "homepage": ";;https://gauthiergidel.github.io/;", "dblp": ";271/1082;188/6326;62/1236", "google_scholar": "ALrei20AAAAJ;https://scholar.google.gr/citations?user=69xvSfQAAAAJ;https://scholar.google.fr/citations?user=bDrXQPUAAAAJ;", "orcid": ";0000-0002-1871-9078;;", "linkedin": ";joseph-sakos-3b3a6a200?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3BP9xevRgnRfKhbYYoPyDf3Q%3D%3D;;", "or_profile": "~Andjela_Mladenovic1;~Iosif_Sakos1;~Gauthier_Gidel1;~Georgios_Piliouras1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Singapore University of Technology and Design;Mila - Quebec Artificial Intelligence Institute;Singapore University of Technology and Design", "aff_domain": "mila.umontreal.ca;sutd.edu.sg;mila.quebec;sutd.edu.sg", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nmladenovic2022generalized,\ntitle={Generalized Natural Gradient Flows in Hidden Convex-Concave Games and {GAN}s},\nauthor={Andjela Mladenovic and Iosif Sakos and Gauthier Gidel and Georgios Piliouras},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bsycpMi00R1}\n}", "github": "", "project": "", "reviewers": "qQLu;5G2k;Wrtu;PUzM", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "2;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;2", "empirical_novelty": "0;3;2;2", "wc_summary_paper": "163;27;62;216", "wc_summary_review": "53;11;30;63", "wc_main_review": "772;251;373;231", "wc_review": "988;289;465;510", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", 
"reply_authors": "0;0;0;0", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 117.0, 75.89795781178832 ], "wc_summary_review_avg": [ 39.25, 20.22838352414745 ], "wc_main_review_avg": [ 406.75, 217.76865591723708 ], "wc_review_avg": [ 563.0, 258.8986288105829 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9068976418863969186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=bsycpMi00R1", "email": "mila.umontreal.ca;sutd.edu.sg;mila.quebec;sutd.edu.sg", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Montreal;Singapore University of Technology and Design;Quebec Artificial Intelligence Institute", "aff_unique_dep": "Montreal Institute for Learning Algorithms;;Artificial Intelligence", "aff_unique_url": "https://www.umontreal.ca;https://www.sutd.edu.sg;https://mila.quebec", "aff_unique_abbr": "UM;SUTD;Mila", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Canada;Singapore" }, { "id": "buSCIu6izBY", "title": "Occupy & Specify: Investigations into a Maximum Credit Assignment Occupancy Objective for Data-efficient Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The capability to widely sample the state and action spaces is a key ingredient toward building effective reinforcement learning algorithms. The trade-off between exploration and exploitation generally requires the use of a data model, from which novelty bonuses are estimated and used to bias the return toward wider exploration. Surprisingly, little is known about the optimization objective followed when novelty (or entropy) bonuses are considered. Following the ``probability matching'' principle, we interpret here returns (cumulative rewards) as set points that fixate the occupancy of the state space, that is the frequency at which the different states are expected to be visited during trials. The circular dependence of the rewards sampling on the occupancy/policy makes it difficult to evaluate. We provide here a variational formulation for the matching objective, named MaCAO (Maximal Credit Assignment Occupancy) that interprets rewards as a log-likelihood on occupancy, that operates anticausally from the effects toward the causes. It is, broadly speaking, an estimation of the contribution of a state toward reaching a (future) goal. It is constructed so as to provide better convergence guaranties, with a complementary term serving as a regularizer, that, in principle, may reduce the greediness. In the absence of an explicit target occupancy, a uniform prior is used, making the regularizer consistent with a MaxEnt (Maximum Entropy) objective on states. Optimizing the entropy on states in known to be more tricky than optimizing the entropy on actions, because of an external sampling through the (unknown) environment, that prevents the propagation of a gradient. 
In our practical implementations, the MaxEnt regularizer is interpreted as a TD-error rather than a reward, making it possible to define an update in both the discrete and continuous cases. It is implemented on an actor-critic off-policy setup with a replay buffer, using gradient descent on a multi-layered neural network, and is shown to provide a significant increase in sampling efficacy, which is reflected in reduced training time and higher returns on a set of classical motor-learning benchmarks, in both the dense and sparse reward cases. ", "keywords": "Reinforcement Learning;Intrinsic reward;MaxEnt;Probability matching;Motor control;Variational inference", "primary_area": "", "supplementary_material": "", "author": "Emmanuel Dauc\u00e9", "authorids": "~Emmanuel_Dauc\u00e91", "gender": "", "homepage": "http://emmanuel.dauce.free.fr/", "dblp": "40/652.html", "google_scholar": "https://scholar.google.fr/citations?user=n5-JYWMAAAAJ", "orcid": "0000-0001-6596-8168", "linkedin": "", "or_profile": "~Emmanuel_Dauc\u00e91", "aff": "Ecole Centrale de Marseille", "aff_domain": "centrale-marseille.fr", "position": "Associate Professor", "bibtex": "@misc{\ndauc{\\'e}2022occupy,\ntitle={Occupy \\& Specify: Investigations into a Maximum Credit Assignment Occupancy Objective for Data-efficient Reinforcement Learning},\nauthor={Emmanuel Dauc{\\'e}},\nyear={2022},\nurl={https://openreview.net/forum?id=buSCIu6izBY}\n}", "github": "", "project": "", "reviewers": "yV1o;6SWm;FQij", "site": "https://openreview.net/forum?id=buSCIu6izBY", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "3;4;3", "correctness": "3;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "97;28;76", "wc_summary_review": "167;21;59", "wc_main_review": "915;530;678", "wc_review": "1179;579;813", "wc_reply_reviewers": "0;69;11", "wc_reply_authors": "724;907;484", "reply_reviewers": "0;1;1", "reply_authors": "1;2;1", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 67.0, 28.879058156387302 ], "wc_summary_review_avg": [ 82.33333333333333, 61.845686097648624 ], "wc_main_review_avg": [ 707.6666666666666, 158.5692978549827 ], "wc_review_avg": [ 857.0, 246.9169900999119 ], "wc_reply_reviewers_avg": [ 26.666666666666668, 30.269162892657306 ], "wc_reply_authors_avg": [ 705.0, 173.2108541633578 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R7egk90nv5MJ:scholar.google.com/&scioq=Occupy+%26+Specify:+Investigations+into+a+Maximum+Credit+Assignment+Occupancy+Objective+for+Data-efficient+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Ecole Centrale de Marseille", "aff_unique_dep": "", "aff_unique_url": "https://www.ecm.fr", "aff_unique_abbr": "ECM", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "title": "How Does SimSiam Avoid Collapse Without Negative Samples? 
A Unified Understanding with Self-supervised Contrastive Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6629", "id": "bwq6O4Cwdl", "poster": "", "openreview": "https://openreview.net/forum?id=bwq6O4Cwdl", "slides": "https://iclr.cc/virtual/2022/poster/6629", "video": "https://iclr.cc/virtual/2022/poster/6629", "author_site": "Chaoning Zhang, Kang Zhang, Chenshuang Zhang, Trung X. Pham, Chang Yoo, In Kweon", "tldr": "", "abstract": "To avoid collapse in self-supervised learning (SSL), a contrastive loss is widely used but often requires a large number of negative samples. A recent work~\\citep{chen2021exploring}, which achieves competitive performance without negative samples, has attracted significant attention for providing a minimalist simple Siamese (SimSiam) method to avoid collapse. However, how SimSiam avoids collapse without negative samples remains not fully clear, and our investigation starts by revisiting the explanatory claims in the original SimSiam paper. After refuting their claims, we introduce vector decomposition for analyzing the collapse based on the gradient analysis of the $l_2$-normalized representation vector. This yields a unified perspective on how negative samples and SimSiam alleviate collapse. Such a unified perspective is timely for understanding the recent progress in SSL. ", "keywords": "SimSiam;Negative samples;SSL;Collapse;Covariance", "primary_area": "", "supplementary_material": "", "author": "Chaoning Zhang;Kang Zhang;Chenshuang Zhang;Trung X. Pham;Chang D. Yoo;In So Kweon", "authorids": "~Chaoning_Zhang1;~Kang_Zhang6;~Chenshuang_Zhang2;~Trung_X._Pham1;~Chang_D._Yoo1;~In_So_Kweon2", "gender": "M;M;F;M;M;M", "homepage": ";;https://chenshuang-zhang.github.io/;https://trungpx.github.io/;https://sanctusfactory.com/family.php;https://ee.kaist.ac.kr/en/professor-s2/2/", "dblp": ";29/177-8;165/5102.html;228/6857;31/7819;74/4917.html", "google_scholar": "https://scholar.google.co.kr/citations?user=lvhxhyQAAAAJ;nj19btQAAAAJ;HbqjLHYAAAAJ;4DkPIIAAAAAJ;gFWgUQEAAAAJ;XA8EOlEAAAAJ", "orcid": ";0000-0003-2761-9383;;0000-0003-4177-7054;0000-0002-0756-7179;", "linkedin": ";;;;;", "or_profile": "~Chaoning_Zhang1;~Kang_Zhang6;~Chenshuang_Zhang2;~Trung_X._Pham1;~Chang_D._Yoo1;~In-So_Kweon1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Kyung Hee University;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;khu.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "Postdoc;PhD student;Researcher;PhD student;Full Professor;Emeritus", "bibtex": "@inproceedings{\nzhang2022how,\ntitle={How Does SimSiam Avoid Collapse Without Negative Samples? A Unified Understanding with Self-supervised Contrastive Learning},\nauthor={Chaoning Zhang and Kang Zhang and Chenshuang Zhang and Trung X. Pham and Chang D. 
Yoo and In So Kweon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=bwq6O4Cwdl}\n}", "github": "", "project": "", "reviewers": "D1d5;d9ds;q4XL;S8KB;TeJe", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "3;3;3;3;4", "correctness": "3;2;3;4;3", "technical_novelty": "3;2;2;4;3", "empirical_novelty": "2;2;2;4;3", "wc_summary_paper": "69;50;70;50;121", "wc_summary_review": "63;16;23;18;73", "wc_main_review": "314;367;360;50;436", "wc_review": "446;433;453;118;630", "wc_reply_reviewers": "49;0;33;0;0", "wc_reply_authors": "1265;724;723;62;758", "reply_reviewers": "1;0;1;0;0", "reply_authors": "3;2;4;2;2", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 3.2, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 72.0, 26.007691170113507 ], "wc_summary_review_avg": [ 38.6, 24.319539469323836 ], "wc_main_review_avg": [ 305.4, 133.51793886965152 ], "wc_review_avg": [ 416.0, 165.62487735844516 ], "wc_reply_reviewers_avg": [ 16.4, 20.71328076379983 ], "wc_reply_authors_avg": [ 706.4, 382.2389828366542 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 2.6, 0.8 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.6123724356957946, "corr_recommendation_correctness": 0.6454972243679028, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1421493250983119439&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=bwq6O4Cwdl", "email": "kaist.ac.kr;kaist.ac.kr;khu.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 6, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Kyung Hee University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;http://www.khu.ac.kr", "aff_unique_abbr": "KAIST;KHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "bxiDvWZm6zU", "title": "Influence-Based Reinforcement Learning for Intrinsically-Motivated Agents", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discovering successful coordinated behaviors is a central challenge in Multi-Agent Reinforcement Learning (MARL) since it requires exploring a joint action space that grows exponentially with the number of agents. In this paper, we propose a mechanism for achieving sufficient exploration and coordination in a team of agents. Specifically, agents are rewarded for contributing to a more diversified team behavior by employing proper intrinsic motivation functions. To learn meaningful coordination protocols, we structure agents\u2019 interactions by introducing a novel framework, where at each timestep, an agent simulates counterfactual rollouts of its policy and, through a sequence of computations, assesses the gap between other agents\u2019 current behaviors and their targets. Actions that minimize the gap are considered highly influential and are rewarded. We evaluate our approach on a set of challenging tasks with sparse rewards and partial observability that require learning complex cooperative strategies under a proper exploration scheme, such as the StarCraft Multi-Agent Challenge. 
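
As an illustrative aside, the influence computation described above can be sketched roughly as follows (hypothetical names and a toy gap function, not the authors' code): an action is rewarded in proportion to how much it closes the gap between other agents' simulated behaviors and their targets, estimated with counterfactual rollouts.

import numpy as np

def influence_reward(gap_fn, state, action, n_rollouts=8):
    """Intrinsic reward for `action`: expected reduction of the behavior-target
    gap, comparing counterfactual rollouts with and without the action."""
    gap_without = np.mean([gap_fn(state, None) for _ in range(n_rollouts)])
    gap_with = np.mean([gap_fn(state, action) for _ in range(n_rollouts)])
    return max(gap_without - gap_with, 0.0)  # reward only gap-reducing (influential) actions

# toy gap function: the action closes roughly 40% of the behavior-target gap
rng = np.random.default_rng(1)
toy_gap = lambda s, a: rng.normal(1.0 if a is None else 0.6, 0.05)
print(influence_reward(toy_gap, state=0, action=1))
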
Our methods show significantly improved performance over different baselines across all tasks.", "keywords": "Multi-Agent Reinforcement Learning;Coordination;Intrinsic Motivation;Coordinated Exploration", "primary_area": "", "supplementary_material": "/attachment/f7cc0aa45aaefafb5b7de2c79749b3c2b7a22cd7.zip", "author": "Ammar Fayad;Majd Ibrahim", "authorids": "~Ammar_Fayad1;~Majd_Ibrahim1", "gender": ";M", "homepage": ";", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Ammar_Fayad1;~Majd_Ibrahim1", "aff": "Massachusetts Institute of Technology;Higher Institute for Applied Sciences and Technology", "aff_domain": "mit.edu;hiast.edu.sy", "position": "Undergrad student;Undergrad student", "bibtex": "@misc{\nfayad2022influencebased,\ntitle={Influence-Based Reinforcement Learning for Intrinsically-Motivated Agents},\nauthor={Ammar Fayad and Majd Ibrahim},\nyear={2022},\nurl={https://openreview.net/forum?id=bxiDvWZm6zU}\n}", "github": "", "project": "", "reviewers": "sFW9;Pto8;HYex;dZZd", "site": "https://openreview.net/forum?id=bxiDvWZm6zU", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "2;3;3;2", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "75;96;107;76", "wc_summary_review": "34;30;58;64", "wc_main_review": "1101;640;497;568", "wc_review": "1210;766;662;708", "wc_reply_reviewers": "44;0;140;43", "wc_reply_authors": "329;364;151;176", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.5, 13.573871960498227 ], "wc_summary_review_avg": [ 46.5, 14.722431864335457 ], "wc_main_review_avg": [ 701.5, 236.12761380236748 ], "wc_review_avg": [ 836.5, 218.76642795456527 ], "wc_reply_reviewers_avg": [ 56.75, 51.24146270355678 ], "wc_reply_authors_avg": [ 255.0, 92.75505377067063 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1142398221941556144&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Higher Institute for Applied Sciences and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;", "aff_unique_abbr": "MIT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", "aff_country_unique": "United States;" }, { "title": "SketchODE: Learning neural sketch representation in continuous time", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6760", "id": "c-4HSDAWua5", "poster": "", "openreview": "https://openreview.net/forum?id=c-4HSDAWua5", "slides": "https://iclr.cc/virtual/2022/poster/6760", "video": "https://iclr.cc/virtual/2022/poster/6760", "author_site": "Ayan Das, Yongxin Yang, Timothy Hospedales, Tao Xiang, Yi-Zhe Song", "tldr": "", "abstract": "Learning meaningful representations for chirographic drawing data such as sketches, handwriting, and flowcharts is a gateway for understanding and emulating human creative expression. 
Despite the inherently continuous-time nature of such data, existing works have treated it as discrete-time sequences, disregarding its true nature. In this work, we model such data as continuous-time functions and learn compact representations by virtue of Neural Ordinary Differential Equations. To this end, we introduce the first continuous-time Seq2Seq model and demonstrate some remarkable properties that set it apart from traditional discrete-time analogues. We also provide solutions to some practical challenges of such models, including a family of parameterized ODE dynamics & a continuous-time data augmentation scheme particularly suitable for the task. Our models are validated on several datasets including VectorMNIST, DiDi and Quick, Draw!.", "keywords": "Chirography;Sketch;Free-form;Neural ODE", "primary_area": "", "supplementary_material": "", "author": "Ayan Das;Yongxin Yang;Timothy Hospedales;Tao Xiang;Yi-Zhe Song", "authorids": "~Ayan_Das1;~Yongxin_Yang1;~Timothy_Hospedales1;~Tao_Xiang1;~Yi-Zhe_Song2", "gender": "M;M;M;M;M", "homepage": "https://ayandas.me/;http://homepages.inf.ed.ac.uk/thospeda/;https://www.surrey.ac.uk/people/tao-xiang;http://personal.ee.surrey.ac.uk/Personal/Y.Song/;", "dblp": "269/9613;32/3545;22/4460-2.html;98/1684;150/4258", "google_scholar": "x-WI_EgAAAAJ;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ;MeS5d4gAAAAJ;https://scholar.google.co.uk/citations?user=irZFP_AAAAAJ;https://scholar.google.co.uk/citations?user=F7PtrL8AAAAJ", "orcid": "0000-0002-7764-1346;0000-0003-4867-7486;0000-0002-2530-1059;;", "linkedin": "ayan-das-a49928a7/;timothyhospedales/;;;", "or_profile": "~Ayan_Das1;~Timothy_Hospedales1;~Tao_Xiang1;~Yi-Zhe_Song2;~Yongxin_Yang3", "aff": "University of Surrey;Samsung AI Research Centre;University of Surrey;University of Surrey;Queen Mary University of London", "aff_domain": "surrey.ac.uk;samsung.com;surrey.ac.uk;surrey.ac.uk;qmul.ac.uk", "position": "PhD student;Principal Researcher;Full Professor;Professor;Assistant Professor", "bibtex": "@inproceedings{\ndas2022sketchode,\ntitle={Sketch{ODE}: Learning neural sketch representation in continuous time},\nauthor={Ayan Das and Yongxin Yang and Timothy Hospedales and Tao Xiang and Yi-Zhe Song},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=c-4HSDAWua5}\n}", "github": "", "project": "", "reviewers": "KvGm;Q3GY;zrrF;S7jh", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;3;3;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;4", "wc_summary_paper": "63;26;308;33", "wc_summary_review": "217;37;49;33", "wc_main_review": "712;254;829;174", "wc_review": "992;317;1186;240", "wc_reply_reviewers": "160;0;200;0", "wc_reply_authors": "1707;530;440;220", "reply_reviewers": "1;0;1;0", "reply_authors": "3;1;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 107.5, 116.59009391882314 ], "wc_summary_review_avg": [ 84.0, 77.01298591796062 ], "wc_main_review_avg": [ 492.25, 282.7263473749838 ], "wc_review_avg": [ 683.75, 411.91405353544326 ], "wc_reply_reviewers_avg": [ 90.0, 91.10433579144299 ], "wc_reply_authors_avg": [ 724.25, 578.4887099157597 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 
0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6606956618063660201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=c-4HSDAWua5", "email": "surrey.ac.uk;samsung.com;surrey.ac.uk;surrey.ac.uk;qmul.ac.uk", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "University of Surrey;Samsung;Queen Mary University of London", "aff_unique_dep": ";AI Research;", "aff_unique_url": "https://www.surrey.ac.uk;https://www.samsung.com/global/researchers/samsung-ai-research-centre/;https://www.qmul.ac.uk", "aff_unique_abbr": "Surrey;SARC;QMUL", "aff_campus_unique_index": "1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United Kingdom;South Korea" }, { "id": "c0AD3ll9Wyv", "title": "Can Label-Noise Transition Matrix Help to Improve Sample Selection and Label Correction?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Existing methods for learning with noisy labels can be generally divided into two categories: (1) sample selection and label correction based on the memorization effect of neural networks; (2) loss correction with the transition matrix. So far, the two categories of methods have been studied independently because they are designed according to different philosophies, i.e., the memorization effect is a property of the neural networks independent of label noise while the transition matrix is exploited to model the distribution of label noise. In this paper, we take a first step in unifying these two paradigms by showing that modelling the distribution of label noise with the transition matrix can also help sample selection and label correction, which leads to better robustness against different types of noise. More specifically, we first train a network with the loss corrected by the transition matrix and then use the confidence of the estimated clean class posterior from the network to select and re-label instances. 
Our proposed method demonstrates strong robustness on multiple benchmark datasets under various types of noise.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/463ca52b283203c56eb6c8c4f643aafba132920d.zip", "author": "Yu Yao;Xuefeng Li;Tongliang Liu;Alan Blair;Mingming Gong;Bo Han;Gang Niu;Masashi Sugiyama", "authorids": "~Yu_Yao3;~Xuefeng_Li1;~Tongliang_Liu1;~Alan_Blair1;~Mingming_Gong1;~Bo_Han1;~Gang_Niu1;~Masashi_Sugiyama1", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://a5507203.github.io/;http://www.cse.unsw.edu.au/~z5085453/;https://tongliang-liu.github.io/;https://www.cse.unsw.edu.au/~blair/;https://mingming-gong.github.io/;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/;https://bhanml.github.io/", "dblp": "230/9625;59/3654;150/6667;79/3180;98/8479;26/3367-1;35/1228;241/0472-3", "google_scholar": "OkcaMKAAAAAJ;BSh0CXIAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;oYi8fBIAAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;nTNjqHwAAAAJ", "orcid": ";;;0000-0002-1039-4766;0000-0001-7147-5589;;0000-0001-6658-6743;", "linkedin": "yu-yao-150377134/;;;;;;;", "or_profile": "~Yu_Yao3;~Xuefeng_Li1;~Tongliang_Liu1;~Alan_Blair1;~Mingming_Gong1;~Gang_Niu1;~Masashi_Sugiyama1;~bo_han2", "aff": "University of Sydney;University of New South Wales;University of Sydney;;University of Melbourne;RIKEN;The University of Tokyo;Microsoft Research", "aff_domain": "uni.sydney.edu.au;unsw.edu.au;sydney.edu.au;;unimelb.edu.au;riken.jp;u-tokyo.ac.jp;microsoft.com", "position": "PhD student;PhD student;Lecturer;;Assistant Professor;Research Scientist (tenured);Full Professor;Researcher", "bibtex": "@misc{\nyao2022can,\ntitle={Can Label-Noise Transition Matrix Help to Improve Sample Selection and Label Correction?},\nauthor={Yu Yao and Xuefeng Li and Tongliang Liu and Alan Blair and Mingming Gong and Bo Han and Gang Niu and Masashi Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=c0AD3ll9Wyv}\n}", "github": "", "project": "", "reviewers": "wyQv;qHcV;E3xv;HYCn;zo6S", "site": "https://openreview.net/forum?id=c0AD3ll9Wyv", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "5;5;4;5;4", "correctness": "2;3;4;4;3", "technical_novelty": "3;2;2;4;4", "empirical_novelty": "3;2;3;4;3", "wc_summary_paper": "100;172;112;112;59", "wc_summary_review": "57;202;45;34;51", "wc_main_review": "492;769;277;300;154", "wc_review": "649;1143;434;446;264", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 4.6, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 3.0, 0.8944271909999159 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 111.0, 36.18839593018734 ], "wc_summary_review_avg": [ 77.8, 62.56324799752647 ], "wc_main_review_avg": [ 398.4, 214.59599250684994 ], "wc_review_avg": [ 587.2, 303.51237207072796 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5417363388859615, "corr_recommendation_correctness": 0.6698938453032357, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:vsZmPuDhdP4J:scholar.google.com/&scioq=Can+Label-Noise+Transition+Matrix+Help+to+Improve+Sample+Selection+and+Label+Correction%3F&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;3;4;5", "aff_unique_norm": "University of Sydney;University of New South Wales;University of Melbourne;RIKEN;University of Tokyo;Microsoft", "aff_unique_dep": ";;;;;Microsoft Research", "aff_unique_url": "https://www.sydney.edu.au;https://www.unsw.edu.au;https://www.unimelb.edu.au;https://www.riken.jp;https://www.u-tokyo.ac.jp;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "USYD;UNSW;UniMelb;RIKEN;UTokyo;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;1;2", "aff_country_unique": "Australia;Japan;United States" }, { "id": "c4iTLTkpY5", "title": "Personalized Heterogeneous Federated Learning with Gradient Similarity", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the conventional federated learning (FL), the local models of multiple clients are trained independently by their privacy data, and the center server generates the shared global model by aggregating local models. However, the global model often fails to adapt to each client due to statistical and systems heterogeneities, such as non-IID data and inconsistencies in clients' hardware and bandwidth. To address these problems, we propose the Subclass Personalized FL (SPFL) algorithm for non-IID data in synchronous FL and the Personalized Leap Gradient Approximation (PLGA) algorithm for the asynchronous FL. In SPFL, the server uses the Softmax Normalized Gradient Similarity (SNGS) to weight the relationship between clients, and sends the personalized global model to each client. In PLGA, the server also applies the SNGS to weight the relationship between client and itself, and uses the first-order Taylor expansion of gradient to approximate the model of the delayed clients. To the best of our knowledge, this is one of the few studies investigating explicitly on personalization in asynchronous FL. The stage strategy of ResNet is further applied to improve the performance of FL. 
The experimental results show that (1) in synchronous FL, the SPFL algorithm used on non-IID data outperforms the vanilla FedAvg, PerFedAvg, and FedUpdate algorithms, improving the accuracy by $1.81\\!\\sim\\!18.46\\%$ on four datasets (CIFAR10, CIFAR100, MNIST, EMNIST), while still maintaining state-of-the-art performance on IID data; (2) in asynchronous FL, compared with the vanilla FedAvg, PerFedAvg, and FedAsync algorithms, the PLGA algorithm improves the accuracy by $0.23\\!\\sim\\!12.63\\%$ on the same four datasets of non-IID data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jing Xie;Xiang Yin;Xiyi Zhang;Juan Chen;Quan Wen;Qiang Yang;Xuan Mo", "authorids": "~Jing_Xie3;~Xiang_Yin1;~Xiyi_Zhang1;~Juan_Chen1;~Quan_Wen1;~Qiang_Yang4;~Xuan_Mo1", "gender": "F;M;M;F;M;M;", "homepage": "https://github.com/MondayCat;https://github.com/yinxiangcsu;https://github.com/lizardlove;https://www.scse.uestc.edu.cn/info/1081/10954.htm;https://www.en.scse.uestc.edu.cn/sznr.jsp?urltype=news.NewsContentUrl&wbtreeid=1022&wbnewsid=1142;;https://github.com/DOCNULL", "dblp": ";;;;;82/6362-1;", "google_scholar": ";;;;;;", "orcid": ";;;0000-0001-8114-8725;;;", "linkedin": ";;;;;;", "or_profile": "~Jing_Xie3;~Xiang_Yin1;~Xiyi_Zhang1;~Juan_Chen1;~Quan_Wen1;~Qiang_Yang4;~Xuan_Mo1", "aff": "University of Electronic Science and Technology of China;;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China", "aff_domain": "uestc.edu.cn;;uestc.edu.cn;uestc.edu.cn;;uestc.edu.cn;uestc.edu.cn", "position": "MS student;;MS student;Associate Professor;;MS student;MS student", "bibtex": "@misc{\nxie2022personalized,\ntitle={Personalized Heterogeneous Federated Learning with Gradient Similarity},\nauthor={Jing Xie and Xiang Yin and Xiyi Zhang and Juan Chen and Quan Wen and Qiang Yang and Xuan Mo},\nyear={2022},\nurl={https://openreview.net/forum?id=c4iTLTkpY5}\n}", "github": "", "project": "", "reviewers": "3fbQ;7GZS;1psQ;HSpu", "site": "https://openreview.net/forum?id=c4iTLTkpY5", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "10;68;68;31", "wc_summary_review": "10;91;54;18", "wc_main_review": "141;498;188;239", "wc_review": "161;657;310;288", "wc_reply_reviewers": "0;0;0;12", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;1", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 44.25, 24.883478454589103 ], "wc_summary_review_avg": [ 43.25, 32.16655872175325 ], "wc_main_review_avg": [ 266.5, 138.07697128775675 ], "wc_review_avg": [ 354.0, 183.94972139147154 ], "wc_reply_reviewers_avg": [ 3.0, 5.196152422706632 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14022604026144124462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, 
"aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Electronic Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "https://www.uestc.edu.cn", "aff_unique_abbr": "UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "c60vFLXEwED", "title": "PIVQGAN: Posture and Identity Disentangled Image-to-Image Translation via Vector Quantization", "track": "main", "status": "Reject", "tldr": "", "abstract": "One popular objective for the image-to-image translation task is to independently control the coarse-level object arrangements (posture) and the fine-grained level styling (identity) of the generated image from two exemplar sources. To approach this objective, we propose PIVQGAN with two novel techniques in the framework of StyleGAN2. First, we propose a Vector-Quantized Spatial Normalization (VQSN) module for the generator for better pose-identity disentanglement. The VQSN module automatically learns to encode the shaping and composition information from the commonly shared objects inside the training-set images. Second, we design a joint-training scheme with self-supervision methods for the GAN-Inversion encoder and the generator. Specifically, we let the encoder and generator reconstruct images from two differently augmented variants of the original ones, one defining the pose and the other for identity. The VQSN module facilitates a more delicate separation of posture and identity, while the training scheme ensures the VQSN module learns the pose-related representations. Comprehensive experiments conducted on various datasets show better synthesis image quality and disentangling scores of our model. Moreover, we present model applications beyond posture-identity disentangling, thanks to the latent-space reducing feature of the leveraged VQSN module. 
", "keywords": "deep learning;generative model;image synthesis;generative adversarial network;self-supervised learning;image-to-image translation", "primary_area": "", "supplementary_material": "/attachment/547c508c210fe8510d91dc1cf96efb18d51bab81.zip", "author": "Bingchen Liu;Yizhe Zhu;Xiao Yang;Ahmed Elgammal", "authorids": "~Bingchen_Liu2;~Yizhe_Zhu2;~Xiao_Yang1;~Ahmed_Elgammal1", "gender": "M;M;M;M", "homepage": ";http://yzzhu.net/;;https://www.cs.rutgers.edu/~elgammal/Home.html", "dblp": ";http://dblp.uni-trier.de/pers/hd/z/Zhu:Yizhe;57/3385-2.html;e/AhmedMElgammal", "google_scholar": "uKdv6SUAAAAJ;hPXUR0cAAAAJ;_MAKSLkAAAAJ;https://scholar.google.com.tw/citations?user=DxQiCiIAAAAJ", "orcid": "0000-0002-2886-8915;;;", "linkedin": "bingchen-liu-71b38611a/;yizhe-ethan-zhu-171a06126/;;", "or_profile": "~Bingchen_Liu2;~Yizhe_Zhu2;~Xiao_Yang1;~Ahmed_Elgammal1", "aff": "Bytedance Inc.;;Bytedance;Rutgers University, new brunswick", "aff_domain": "bytedance.com;;bytedance.com;rutgers.edu", "position": "Researcher;;Research Scientist;Full Professor", "bibtex": "@misc{\nliu2022pivqgan,\ntitle={{PIVQGAN}: Posture and Identity Disentangled Image-to-Image Translation via Vector Quantization},\nauthor={Bingchen Liu and Yizhe Zhu and Xiao Yang and Ahmed Elgammal},\nyear={2022},\nurl={https://openreview.net/forum?id=c60vFLXEwED}\n}", "github": "", "project": "", "reviewers": "SbZB;a2FQ;SuTz;fSLH;dkA5", "site": "https://openreview.net/forum?id=c60vFLXEwED", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "4;3;4;5;3", "correctness": "3;2;3;4;3", "technical_novelty": "2;3;2;3;2", "empirical_novelty": "2;3;3;2;2", "wc_summary_paper": "50;78;205;54;104", "wc_summary_review": "78;36;173;16;20", "wc_main_review": "659;83;956;269;264", "wc_review": "787;197;1334;339;388", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "788;123;578;276;347", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 98.2, 56.78873127654817 ], "wc_summary_review_avg": [ 64.6, 58.47939808171763 ], "wc_main_review_avg": [ 446.2, 316.76199266957514 ], "wc_review_avg": [ 609.0, 412.0179607735566 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 422.4, 234.36433175720234 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.2182178902359924, "corr_recommendation_correctness": 0.6454972243679027, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hMOCGuiypY0J:scholar.google.com/&scioq=PIVQGAN:+Posture+and+Identity+Disentangled+Image-to-Image+Translation+via+Vector+Quantization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Bytedance Inc.;ByteDance;Rutgers University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.bytedance.com;https://www.bytedance.com;https://www.rutgers.edu", "aff_unique_abbr": "Bytedance;Bytedance;Rutgers", "aff_campus_unique_index": "1", "aff_campus_unique": ";New Brunswick", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "id": "c7S4WIlmu5", "title": "Contrastive Pre-training for Zero-Shot Information Retrieval", "track": "main", "status": "Reject", "tldr": "", "abstract": 
"Information retrieval is an important component in natural language processing, for knowledge intensive tasks such as question answering and fact checking. Recently, information retrieval has seen the emergence of dense retrievers, based on neural networks, as an alternative to classical sparse methods based on term-frequency. Neural retrievers work well on the problems for which they were specifically trained, but they do not generalize as well as term-frequency methods to new domains or applications. By contrast, in many other NLP tasks, conventional self-supervised pre-training based on masking leads to strong generalization with small number of training examples. We believe this is not yet the case for information retrieval, because these pre-training methods are not well adapted to this task. In this work, we consider contrastive learning as a more natural pre-training technique for retrieval and show that it leads to models that are competitive with BM25 on many domains or applications, even without training on supervised data. Our dense pre-trained models also compare favorably against BERT pre-trained models in the few-shot setting, and achieves state-of-the-art performance on the BEIR benchmark when fine-tuned on MS-MARCO.", "keywords": "information retrieval;contrastive pretraining", "primary_area": "", "supplementary_material": "", "author": "Gautier Izacard;Mathilde Caron;Lucas Hosseini;Sebastian Riedel;Piotr Bojanowski;Armand Joulin;Edouard Grave", "authorids": "~Gautier_Izacard1;~Mathilde_Caron1;~Lucas_Hosseini1;~Sebastian_Riedel1;~Piotr_Bojanowski1;~Armand_Joulin1;~Edouard_Grave1", "gender": "Unspecified;F;M;M;M;;", "homepage": ";;;https://www.riedelcastro.org/;;;", "dblp": "222/3621;223/4085;;18/3348-1.html;142/2542;68/8653;50/10261", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;;https://scholar.google.com.tw/citations?user=AcCtcrsAAAAJ;https://scholar.google.fr/citations?user=lJ_oh2EAAAAJ;kRJkDakAAAAJ;7UV4ET4AAAAJ", "orcid": ";;;;;;", "linkedin": ";;https://fr.linkedin.com/in/lucas-hosseini-73126a41;;piotr-bojanowski-9a94402a;;edouard-grave-63099823/", "or_profile": "~Gautier_Izacard1;~Mathilde_Caron1;~Lucas_Hosseini1;~Sebastian_Riedel1;~Piotr_Bojanowski1;~Armand_Joulin1;~Edouard_Grave1", "aff": "Meta Facebook;Google;Meta Facebook;Meta Facebook;Meta;Meta Facebook;Meta Facebook", "aff_domain": "fb.com;google.com;fb.com;fb.com;meta.com;fb.com;fb.com", "position": "PhD student;Researcher;Research Engineer;Researcher;Researcher;Full Professor;Research Scientist", "bibtex": "@misc{\nizacard2022contrastive,\ntitle={Contrastive Pre-training for Zero-Shot Information Retrieval},\nauthor={Gautier Izacard and Mathilde Caron and Lucas Hosseini and Sebastian Riedel and Piotr Bojanowski and Armand Joulin and Edouard Grave},\nyear={2022},\nurl={https://openreview.net/forum?id=c7S4WIlmu5}\n}", "github": "", "project": "", "reviewers": "cPwk;F8YA;NWVD;tnYn", "site": "https://openreview.net/forum?id=c7S4WIlmu5", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "60;118;110;60", "wc_summary_review": "17;183;46;86", "wc_main_review": "86;1013;205;118", "wc_review": "163;1314;361;264", "wc_reply_reviewers": "0;143;0;0", "wc_reply_authors": "154;439;140;236", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 
0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 87.0, 27.147743920996454 ], "wc_summary_review_avg": [ 83.0, 62.717621128355944 ], "wc_main_review_avg": [ 355.5, 382.09717350433255 ], "wc_review_avg": [ 525.5, 460.59228174167225 ], "wc_reply_reviewers_avg": [ 35.75, 61.92081637058736 ], "wc_reply_authors_avg": [ 242.25, 119.36577189462648 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6093638363236498815&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0;0;0", "aff_unique_norm": "Meta;Google", "aff_unique_dep": "Meta Platforms, Inc.;Google", "aff_unique_url": "https://meta.com;https://www.google.com", "aff_unique_abbr": "Meta;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "c7zS_oS5gU", "title": "Federated Distillation of Natural Language Understanding with Confident Sinkhorns", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Enhancing the user experience is an essential task for application service providers. For instance, two users living far apart may have different tastes in food. A food recommender mobile application installed on an edge device might want to learn from user feedback (reviews) to satisfy the client's needs pertaining to distinct domains. Retrieving user data comes at the cost of privacy, while collecting model parameters trained on user devices becomes space-inefficient at a large scale. In this work, we propose an approach to learn a central (global) model from the federation of (local) models which are trained on user devices, without disclosing the local data or model parameters to the server. We propose a federation mechanism for problems with a natural similarity metric between the labels, which commonly appear in natural language understanding (NLU) tasks. To learn the global model, the objective is to minimize the optimal transport cost of the global model's predictions from the confident sum of soft-targets assigned by local models. The confidence (a model weighting scheme) score of a model is defined as the L2 distance of a model's prediction from its probability bias. 
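
A toy sketch of this confidence weighting (illustrative assumptions throughout: "probability bias" is read here as a uniform reference distribution, and all names are hypothetical, not the authors' code): decisive local models, far from the reference, dominate the distillation target.

import numpy as np

def confidence_score(pred, bias):
    """Confidence of a local model = L2 distance between its prediction
    and its probability bias (assumed here to be uniform)."""
    return np.linalg.norm(pred - bias)

def confident_soft_target(preds, biases):
    """Confidence-weighted combination of local soft targets; the result
    is itself a distribution since the weights sum to one."""
    c = np.array([confidence_score(p, b) for p, b in zip(preds, biases)])
    w = c / c.sum()
    return sum(wi * pi for wi, pi in zip(w, preds))

preds = [np.array([0.7, 0.2, 0.1]), np.array([0.34, 0.33, 0.33])]
biases = [np.full(3, 1 / 3), np.full(3, 1 / 3)]
print(confident_soft_target(preds, biases))  # the decisive first model dominates
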
The method improves the global model's performance over the baseline on three NLU tasks with intrinsic label-space semantics, i.e., fine-grained sentiment analysis, emotion recognition in conversation, and natural language inference.", "keywords": "Sinkhorn;NLP;Wasserstein;Random probability skew;Federated Distillation", "primary_area": "", "supplementary_material": "", "author": "Rishabh Bhardwaj;Tushar Vaidya;Soujanya Poria", "authorids": "~Rishabh_Bhardwaj1;~Tushar_Vaidya1;~Soujanya_Poria1", "gender": "M;M;M", "homepage": "https://www.rishabh.ai/;https://sites.google.com/view/tusharvfm;https://soujanyaporia.github.io", "dblp": "245/1413.html;199/1756.html;116/4904", "google_scholar": "nomHn1sAAAAJ;shwb3OMAAAAJ;https://scholar.google.co.in/citations?user=oS6gRc4AAAAJ", "orcid": "0000-0003-3833-4754;0000-0002-2264-2595;", "linkedin": "rishabh-bhardwaj-nlp/;tushar-vaidya-730a9713/;", "or_profile": "~Rishabh_Bhardwaj1;~Tushar_Vaidya1;~Soujanya_Poria1", "aff": "Singapore University of Technology and Design;Singapore University of Technology and Design;Singapore University of Technology and Design", "aff_domain": "sutd.edu.sg;sutd.edu.sg;sutd.edu.sg", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@misc{\nbhardwaj2022federated,\ntitle={Federated Distillation of Natural Language Understanding with Confident Sinkhorns},\nauthor={Rishabh Bhardwaj and Tushar Vaidya and Soujanya Poria},\nyear={2022},\nurl={https://openreview.net/forum?id=c7zS_oS5gU}\n}", "github": "", "project": "", "reviewers": "R6vr;3jqn;H6ku;steV", "site": "https://openreview.net/forum?id=c7zS_oS5gU", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "3;3;3;4", "correctness": "2;2;3;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "129;121;57;448", "wc_summary_review": "173;10;40;149", "wc_main_review": "331;274;174;593", "wc_review": "633;405;271;1190", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 188.75, 152.25697849359813 ], "wc_summary_review_avg": [ 93.0, 69.34334863561176 ], "wc_main_review_avg": [ 343.0, 154.8918977868113 ], "wc_review_avg": [ 624.75, 351.0714848859132 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.676481425202546, "corr_recommendation_correctness": 0.911322376865767, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1693840286248854875&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Singapore University of Technology and Design", "aff_unique_dep": "", "aff_unique_url": "https://www.sutd.edu.sg", "aff_unique_abbr": "SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Orchestrated Value Mapping for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6000", "id": "c87d0TS4yX", "poster": "", "openreview": "https://openreview.net/forum?id=c87d0TS4yX", "slides": 
"https://iclr.cc/virtual/2022/poster/6000", "video": "https://iclr.cc/virtual/2022/poster/6000", "author_site": "Mehdi Fatemi, Arash Tavakoli", "tldr": "", "abstract": "We present a general convergent class of reinforcement learning algorithms that is founded on two distinct principles: (1) mapping value estimates to a different space using arbitrary functions from a broad class, and (2) linearly decomposing the reward signal into multiple channels. The first principle enables incorporating specific properties into the value estimator that can enhance learning. The second principle, on the other hand, allows for the value function to be represented as a composition of multiple utility functions. This can be leveraged for various purposes, e.g. dealing with highly varying reward scales, incorporating a priori knowledge about the sources of reward, and ensemble learning. Combining the two principles yields a general blueprint for instantiating convergent algorithms by orchestrating diverse mapping functions over multiple reward channels. This blueprint generalizes and subsumes algorithms such as Q-Learning, Log Q-Learning, and Q-Decomposition. In addition, our convergence proof for this general class relaxes certain required assumptions in some of these algorithms. Based on our theory, we discuss several interesting configurations as special cases. Finally, to illustrate the potential of the design space that our theory opens up, we instantiate a particular algorithm and evaluate its performance on the Atari suite.", "keywords": "Reinforcement Learning;Value Mapping;Reward Decomposition", "primary_area": "", "supplementary_material": "", "author": "Mehdi Fatemi;Arash Tavakoli", "authorids": "~Mehdi_Fatemi1;~Arash_Tavakoli1", "gender": ";M", "homepage": ";https://atavakol.github.io", "dblp": ";177/8682", "google_scholar": "X9_mSpYAAAAJ;https://scholar.google.co.uk/citations?user=Jwq-Qx0AAAAJ", "orcid": "0000-0001-9598-6164;0000-0001-8481-3284", "linkedin": "fatemi/;arashtavakoli", "or_profile": "~Mehdi_Fatemi1;~Arash_Tavakoli1", "aff": "Microsoft;Max Planck Institute for Intelligent Systems", "aff_domain": "microsoft.com;mpg.de", "position": "Senior Researcher;Postdoc", "bibtex": "@inproceedings{\nfatemi2022orchestrated,\ntitle={Orchestrated Value Mapping for Reinforcement Learning},\nauthor={Mehdi Fatemi and Arash Tavakoli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=c87d0TS4yX}\n}", "github": "", "project": "", "reviewers": "159F;1Ecs;x9kQ", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;3", "correctness": "4;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "71;99;57", "wc_summary_review": "70;45;20", "wc_main_review": "394;526;172", "wc_review": "535;670;249", "wc_reply_reviewers": "18;184;0", "wc_reply_authors": "1210;1058;815", "reply_reviewers": "1;1;0", "reply_authors": "2;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 75.66666666666667, 17.46106780494506 ], "wc_summary_review_avg": [ 45.0, 20.412414523193153 ], "wc_main_review_avg": [ 364.0, 146.0684770920817 ], "wc_review_avg": [ 484.6666666666667, 175.51891319424493 ], "wc_reply_reviewers_avg": [ 67.33333333333333, 
82.82243389055621 ], "wc_reply_authors_avg": [ 1027.6666666666667, 162.67827827408988 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11063352245318082342&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=c87d0TS4yX", "email": "microsoft.com;mpg.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Microsoft Corporation;Intelligent Systems", "aff_unique_url": "https://www.microsoft.com;https://www.mpi-is.mpg.de", "aff_unique_abbr": "Microsoft;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Germany" }, { "id": "c8AvdRAyVkz", "title": "Perturbation Deterioration: The Other Side of Catastrophic Overfitting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Our goal is to understand why the robust accuracy abruptly drops to zero after conducting FGSM-style adversarial training for too long. While this phenomenon is commonly explained as overfitting, we observe that it is a twin process: not only does the model catastrophically overfit to one type of perturbation, but the perturbations also deteriorate into random noise. For example, at the same epoch when the FGSM-trained model catastrophically overfits, its generated perturbations deteriorate into random noise. Intuitively, once the generated perturbations become weak and inadequate, models would be misguided to overfit those weak attacks and fail to defend against strong ones. In light of our analyses, we propose APART, an adaptive adversarial training method, which parameterizes perturbation generation and progressively strengthens the generated perturbations. In our experiments, APART successfully prevents perturbation deterioration and catastrophic overfitting. 
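
One hypothetical way to monitor the perturbation deterioration described above (a diagnostic sketch, not the paper's procedure): structured adversarial perturbations tend to be mutually correlated across a batch, while deteriorated ones look like independent noise, so a batch-level similarity score falls toward zero at the deterioration point.

import numpy as np

def deterioration_score(perturbations):
    """Mean absolute pairwise cosine similarity between flattened
    perturbations; high for structured attacks, near zero for noise."""
    p = perturbations.reshape(len(perturbations), -1)
    p = p / (np.linalg.norm(p, axis=1, keepdims=True) + 1e-12)
    sims = np.abs(p @ p.T)
    return (sims.sum() - len(p)) / (len(p) * (len(p) - 1))  # exclude self-similarity

rng = np.random.default_rng(0)
structured = rng.normal(size=(1, 64)) + 0.1 * rng.normal(size=(32, 64))  # shared direction
noise = rng.normal(size=(32, 64))                                        # pure noise
print(deterioration_score(structured), deterioration_score(noise))
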
Also, APART significantly improves the model's robustness while maintaining the same efficiency as FGSM-style methods: e.g., on the CIFAR-10 dataset, APART achieves 53.89% accuracy under the PGD-20 attack and 49.05% accuracy under AutoAttack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zichao Li;Liyuan Liu;Chengyu Dong;Jingbo Shang", "authorids": "~Zichao_Li2;~Liyuan_Liu3;~Chengyu_Dong1;~Jingbo_Shang2", "gender": "M;;M;M", "homepage": ";https://www.chengyu-dong.me/;https://shangjingbo1226.github.io/;https://liyuanlucasliu.github.io/", "dblp": "95/147-2;14/3155;151/3145.html;06/1624", "google_scholar": ";Ppfi7j0AAAAJ;0SkFI4MAAAAJ;RmvbkzYAAAAJ", "orcid": ";;;", "linkedin": "%E5%AD%90%E8%B6%85-%E6%9D%8E-95676a193/;;;", "or_profile": "~Zichao_Li2;~Chengyu_Dong1;~Jingbo_Shang2;~Liyuan_Liu1", "aff": ";University of California, San Diego;University of California, San Diego;University of Illinois, Urbana Champaign", "aff_domain": ";ucsd.edu;ucsd.edu;illinois.edu", "position": ";PhD student;Assistant Professor;PhD student", "bibtex": "@misc{\nli2022perturbation,\ntitle={Perturbation Deterioration: The Other Side of Catastrophic Overfitting},\nauthor={Zichao Li and Liyuan Liu and Chengyu Dong and Jingbo Shang},\nyear={2022},\nurl={https://openreview.net/forum?id=c8AvdRAyVkz}\n}", "github": "", "project": "", "reviewers": "Bmpq;996w;amVQ;Z8eC", "site": "https://openreview.net/forum?id=c8AvdRAyVkz", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;3;5;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "184;100;60;89", "wc_summary_review": "4;46;14;97", "wc_main_review": "793;359;169;397", "wc_review": "981;505;243;583", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 1.299038105676658 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.25, 46.11060073345391 ], "wc_summary_review_avg": [ 40.25, 36.25172409693089 ], "wc_main_review_avg": [ 429.5, 226.94657961731875 ], "wc_review_avg": [ 578.0, 264.5694615786183 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4074074074074074, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m800Q67AB_cJ:scholar.google.com/&scioq=Perturbation+Deterioration:+The+Other+Side+of+Catastrophic+Overfitting&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, San Diego;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://illinois.edu", "aff_unique_abbr": "UCSD;UIUC", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "San Diego;Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "c8JDlJMBeyh", "title": "Towards Generic Interface for Human-Neural Network Knowledge Exchange", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural Networks (NNs) outperform humans in multiple domains. 
Yet they suffer from a lack of transparency and interpretability, which hinders intuitive and effective human interaction with them. Especially when an NN makes mistakes, humans can hardly locate the reason for the error, and correcting it is even harder. While recent advances in explainable AI have substantially improved the explainability of NNs, effective knowledge exchange between humans and NNs is still under-explored. To fill this gap, we propose Human-NN-Interface (HNI), a framework using a structural representation of visual concepts as a \u201clanguage\u201d for humans and NNs to communicate, interact, and exchange knowledge. Taking image classification as an example, HNI visualizes the reasoning logic of an NN with class-specific Structural Concept Graphs (c-SCG), which are human-interpretable. On the other hand, humans can effectively provide feedback and guidance to the NN by modifying the c-SCG and transferring the knowledge back to the NN through HNI. We demonstrate the efficacy of HNI with image classification tasks and 3 different types of interactions: (1) explaining the reasoning logic of NNs so humans can intuitively identify and locate errors of the NN; (2) human users can correct the errors and improve the NN\u2019s performance by modifying the c-SCG and distilling the knowledge back to the original NN; (3) human users can intuitively guide the NN and provide a new solution for zero-shot learning.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/eb223febc0634b7174dce22a2bf8aad005fc08b7.zip", "author": "Yunhao Ge;Yao Xiao;Zhi Xu;Linwei Li;Ziyan Wu;Laurent Itti", "authorids": "~Yunhao_Ge1;~Yao_Xiao3;~Zhi_Xu2;~Linwei_Li2;~Ziyan_Wu2;~Laurent_Itti1", "gender": "M;;M;F;M;M", "homepage": "https://gyhandy.github.io/;;https://github.com/zhix9767;https://github.com/FredericaLee;http://wuziyan.com;http://ilab.usc.edu", "dblp": "204/1908;;;;;31/3256", "google_scholar": "https://scholar.google.ca/citations?user=QhjGr4oAAAAJ;;https://scholar.google.com/citations?hl=en;;CkPUb-4AAAAJ;xhUvqK8AAAAJ", "orcid": ";;;;0000-0002-9774-7770;0000-0002-0168-2977", "linkedin": "yunhao-ge-720727135/;;;;;", "or_profile": "~Yunhao_Ge1;~Yao_Xiao3;~Zhi_Xu2;~Linwei_Li2;~Ziyan_Wu2;~Laurent_Itti1", "aff": "University of Southern California;;Northeastern University;;United Imaging Intelligence;University of Southern California", "aff_domain": "usc.edu;;neu.edu;;uii-ai.com;usc.edu", "position": "PhD student;;PhD student;;Principal Expert Scientist;Professor", "bibtex": "@misc{\nge2022towards,\ntitle={Towards Generic Interface for Human-Neural Network Knowledge Exchange},\nauthor={Yunhao Ge and Yao Xiao and Zhi Xu and Linwei Li and Ziyan Wu and Laurent Itti},\nyear={2022},\nurl={https://openreview.net/forum?id=c8JDlJMBeyh}\n}", "github": "", "project": "", "reviewers": "GNsq;KrBb;9inm;yaEq", "site": "https://openreview.net/forum?id=c8JDlJMBeyh", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;4", "correctness": "4;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "69;102;61;74", "wc_summary_review": "38;47;24;27", "wc_main_review": "135;510;211;313", "wc_review": "242;659;296;414", "wc_reply_reviewers": "271;0;0;0", "wc_reply_authors": "1658;792;517;1839", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": 
[ 76.5, 15.435349040433131 ], "wc_summary_review_avg": [ 34.0, 9.137833441248533 ], "wc_main_review_avg": [ 292.25, 140.69004051460075 ], "wc_review_avg": [ 402.75, 160.48890148542984 ], "wc_reply_reviewers_avg": [ 67.75, 117.34644221279143 ], "wc_reply_authors_avg": [ 1201.5, 559.2470384365034 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5sKeJKSQaPQJ:scholar.google.com/&scioq=Towards+Generic+Interface+for+Human-Neural+Network+Knowledge+Exchange&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Southern California;Northeastern University;United Imaging Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usc.edu;https://www.northeastern.edu;https://www.united-imaging.com", "aff_unique_abbr": "USC;NEU;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "c9IvZqZ8SNI", "title": "Learning Structure from the Ground up---Hierarchical Representation Learning by Chunking", "track": "main", "status": "Reject", "tldr": "", "abstract": "From learning to play the piano to speaking a new language, reusing and recombining previously acquired representations enables us to master complex skills and easily adapt to new environments. Inspired by the Gestalt principle of grouping by proximity and theories of chunking in cognitive science, we propose a hierarchical chunking model (HCM). HCM learns representations from non-i.i.d. sequential data from the ground up by first discovering the minimal atomic sequential units as chunks. As learning progresses, a hierarchy of chunk representations is acquired by chunking previously learned representations into more complex representations guided by sequential dependence. We provide learning guarantees on an idealized version of HCM, and demonstrate that HCM learns meaningful and interpretable representations in visual, temporal, and visual-temporal domains, as well as in language data. Furthermore, the interpretability of the learned chunks enables flexible transfer between environments that share partial representational structure. 
Taken together, our results show how cognitive science in general and theories of chunking in particular could inform novel and more interpretable approaches to representation learning.", "keywords": "representation learning;interpretability;cognitive science", "primary_area": "", "supplementary_material": "", "author": "Shuchen Wu;Noemi Elteto;Ishita Dasgupta;Eric Schulz", "authorids": "~Shuchen_Wu1;noemi.elteto@tuebingen.mpg.de;~Ishita_Dasgupta1;~Eric_Schulz1", "gender": ";;;M", "homepage": "https://swu32.github.io/;;;https://cpilab.org", "dblp": ";;169/6218;124/0016", "google_scholar": "apHyqNYAAAAJ;;;", "orcid": "0000-0002-8425-6016;;;", "linkedin": "shuchen-wu-61a69b78/;;idasgupta6/;", "or_profile": "~Shuchen_Wu1;noemi.elteto@tuebingen.mpg.de;~Ishita_Dasgupta1;~Eric_Schulz1", "aff": "Max Planck Institute For Biological Cybernetics;;Google DeepMind;Max Planck Institute for Biological Cybernetics", "aff_domain": "mpi.tuebingen.de;;deepmind.com;tuebingen.mpg.de", "position": "PhD student;;Researcher;Assistant Professor", "bibtex": "@misc{\nwu2022learning,\ntitle={Learning Structure from the Ground up---Hierarchical Representation Learning by Chunking},\nauthor={Shuchen Wu and Noemi Elteto and Ishita Dasgupta and Eric Schulz},\nyear={2022},\nurl={https://openreview.net/forum?id=c9IvZqZ8SNI}\n}", "github": "", "project": "", "reviewers": "LNDm;mjpW;zfT1;1Nwa", "site": "https://openreview.net/forum?id=c9IvZqZ8SNI", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "110;153;75;94", "wc_summary_review": "46;113;124;31", "wc_main_review": "624;453;504;419", "wc_review": "780;719;703;544", "wc_reply_reviewers": "186;120;440;0", "wc_reply_authors": "843;798;214;456", "reply_reviewers": "1;1;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.0, 28.78367592924851 ], "wc_summary_review_avg": [ 78.5, 40.5370201174186 ], "wc_main_review_avg": [ 500.0, 77.72065362566117 ], "wc_review_avg": [ 686.5, 87.14499411899688 ], "wc_reply_reviewers_avg": [ 186.5, 160.831433494824 ], "wc_reply_authors_avg": [ 577.75, 257.8782416180163 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1596867569131365166&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "Max Planck Institute for Biological Cybernetics;Google", "aff_unique_dep": "Biological Cybernetics;Google DeepMind", "aff_unique_url": "https://www.biocybernetics.mpg.de;https://deepmind.com", "aff_unique_abbr": "MPIBC;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United Kingdom" }, { "id": "cAuJrUm8lG", "title": "Implicit Equivariance in Convolutional Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Convolutional Neural Networks (CNN) are inherently equivariant under translations, however, they do not have an equivalent embedded 
mechanism to handle other transformations such as rotations and changes in scale. Several approaches have been proposed that make CNNs equivariant under other transformation groups by design. Among these, steerable CNNs have been especially effective. However, these approaches require redesigning standard networks with filters mapped from combinations of predefined bases involving complex analytical functions. We experimentally demonstrate that these restrictions in the choice of basis can lead to model weights that are sub-optimal for the primary deep learning task (e.g. classification). Moreover, such hard-baked explicit formulations make it difficult to design composite networks comprising heterogeneous feature groups. To circumvent such issues, we propose Implicitly Equivariant Networks (IEN), which induce equivariance in the different layers of a standard CNN model by optimizing a multi-objective loss function that combines the primary loss with an equivariance loss term. Through experiments with VGG and ResNet models on the Rot-MNIST, Rot-TinyImageNet, Scale-MNIST, and STL-10 datasets, we show that IEN, even with its simple formulation, performs better than steerable networks. Also, IEN facilitates the construction of heterogeneous filter groups, allowing the number of channels in CNNs to be reduced by over 30% while maintaining performance on par with baselines. The efficacy of IEN is further validated on the hard problem of visual object tracking. We show that IEN outperforms the state-of-the-art rotation-equivariant tracking method while providing faster inference.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/a5954f26081dd0983ec6bded795b8f2cb8f146b4.zip", "author": "Naman Khetan;Tushar Arora;Suraj Sharan;Samee Ur Rehman;Deepak Gupta", "authorids": "~Naman_Khetan2;~Tushar_Arora2;~Suraj_Sharan1;~Samee_Ur_Rehman1;~Deepak_Gupta2", "gender": "M;M;M;M;M", "homepage": ";;;http://dkgupta90.github.io;", "dblp": ";;;163/3197.html;", "google_scholar": ";;https://scholar.google.com/citations?hl=en;https://scholar.google.co.in/citations?user=Nsxpe_kAAAAJ;", "orcid": ";;;;", "linkedin": "tushar14;suraj-sharan/;surehman/;;https://www.linkedin.com/mwlite/in/naman-khetan-046415181", "or_profile": "~Tushar_Arora2;~Suraj_Sharan1;~Samee_Ur_Rehman1;~Deepak_Gupta2;~NAMAN_KHETAN1", "aff": ";;Transmute AI Research;UiT The Arctic University of Norway;Indian Institute Of Technology Dhanbad", "aff_domain": ";;transmute.ai;uit.no;iitism.ac.in", "position": ";;Researcher;Associate Professor;Undergrad student", "bibtex": "@misc{\nkhetan2022implicit,\ntitle={Implicit Equivariance in Convolutional Networks},\nauthor={Naman Khetan and Tushar Arora and Suraj Sharan and Samee Ur Rehman and Deepak Gupta},\nyear={2022},\nurl={https://openreview.net/forum?id=cAuJrUm8lG}\n}", "github": "", "project": "", "reviewers": "cDmd;D5FC;rnvg;EmuQ", "site": "https://openreview.net/forum?id=cAuJrUm8lG", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "22;77;78;88", "wc_summary_review": "21;64;25;63", "wc_main_review": "127;383;563;491", "wc_review": "170;524;666;642", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [
2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 66.25, 25.907286619790966 ], "wc_summary_review_avg": [ 43.25, 20.30240133580262 ], "wc_main_review_avg": [ 391.0, 165.33602148352307 ], "wc_review_avg": [ 500.5, 198.23912328296854 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13750727158362037940&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Transmute AI Research;Arctic University of Norway;Indian Institute of Technology Dhanbad", "aff_unique_dep": ";;", "aff_unique_url": "https://www.transmute.ai;https://www.uit.no;https://www.iitdh.ac.in", "aff_unique_abbr": "Transmute AI;UiT;IIT Dhanbad", "aff_campus_unique_index": "1", "aff_campus_unique": ";Dhanbad", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;Norway;India" }, { "title": "GiraffeDet: A Heavy-Neck Paradigm for Object Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6863", "id": "cBu4ElJfneV", "poster": "", "openreview": "https://openreview.net/forum?id=cBu4ElJfneV", "slides": "https://iclr.cc/virtual/2022/poster/6863", "video": "https://iclr.cc/virtual/2022/poster/6863", "author_site": "yiqi jiang, Zhiyu Tan, Junyan Wang, Xiuyu Sun, Ming Lin, Li Hao", "tldr": "", "abstract": "In conventional object detection frameworks, a backbone body inherited from image recognition models extracts deep latent features and then a neck module fuses these latent features to capture information at different scales. As the resolution in object detection is much larger than in image recognition, the computational cost of the backbone often dominates the total inference cost. This heavy-backbone design paradigm is mostly a historical legacy of transferring image recognition models to object detection, rather than the result of an end-to-end optimized design for object detection. In this work, we show that such a paradigm indeed leads to sub-optimal object detection models. To this end, we propose a novel heavy-neck paradigm, GiraffeDet, a giraffe-like network for efficient object detection. GiraffeDet uses an extremely lightweight backbone and a very deep and large neck module which encourages dense information exchange among different spatial scales as well as different levels of latent semantics simultaneously. This design paradigm allows detectors to process the high-level semantic information and low-level spatial information at the same priority even in the early stage of the network, making it more effective in detection tasks. Numerical evaluations on multiple popular object detection benchmarks show that GiraffeDet consistently outperforms previous SOTA models across a wide spectrum of resource constraints. 
The source code is available at\nhttps://github.com/jyqi/GiraffeDet.", "keywords": "Object Detection;fpn;space-to-depth;representation", "primary_area": "", "supplementary_material": "", "author": "yiqi jiang;Zhiyu Tan;Junyan Wang;Xiuyu Sun;Ming Lin;Hao Li", "authorids": "~yiqi_jiang1;~Zhiyu_Tan2;~Junyan_Wang5;~Xiuyu_Sun1;~Ming_Lin4;~Hao_Li16", "gender": "M;M;M;M;M;M", "homepage": ";https://scholar.google.com/citations?user=XprTQQ8AAAAJ&hl=en;;https://sites.google.com/view/sunxiuyu/home;https://minglin-home.github.io/;", "dblp": ";136/4997;70/4949-1;40/8845;;17/5705-30", "google_scholar": "2uuknY0AAAAJ;XprTQQ8AAAAJ;5yS_tTUAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;pHN-QIwAAAAJ", "orcid": ";;0000-0001-5409-1292;0000-0002-7208-8078;;", "linkedin": ";;;;;%E6%98%8A-%E6%9D%8E-392547a5/detail/recent-activity/", "or_profile": "~yiqi_jiang1;~Zhiyu_Tan2;~Junyan_Wang5;~Xiuyu_Sun1;~Ming_Lin4;~Li_Hao1", "aff": "Alibaba Group;Alibaba DAMO Academy;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Researcher;Research intern;Staff Algorithm Engineer;Algorithm Engineer;Researcher", "bibtex": "@inproceedings{\njiang2022giraffedet,\ntitle={GiraffeDet: A Heavy-Neck Paradigm for Object Detection},\nauthor={yiqi jiang and Zhiyu Tan and Junyan Wang and Xiuyu Sun and Ming Lin and Hao Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cBu4ElJfneV}\n}", "github": "", "project": "", "reviewers": "MDN5;ygpA;FXm4", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "5;3;5", "correctness": "3;2;4", "technical_novelty": "2;3;4", "empirical_novelty": "2;2;3", "wc_summary_paper": "75;55;73", "wc_summary_review": "81;105;46", "wc_main_review": "95;400;76", "wc_review": "251;560;195", "wc_reply_reviewers": "0;214;61", "wc_reply_authors": "490;1238;319", "reply_reviewers": "0;1;1", "reply_authors": "1;3;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.66666666666667, 8.993825042154695 ], "wc_summary_review_avg": [ 77.33333333333333, 24.225789747475496 ], "wc_main_review_avg": [ 190.33333333333334, 148.4594969081541 ], "wc_review_avg": [ 335.3333333333333, 160.4999134648434 ], "wc_reply_reviewers_avg": [ 91.66666666666667, 90.01604795195627 ], "wc_reply_authors_avg": [ 682.3333333333334, 399.0691947798304 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 143, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11647720760505575533&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=cBu4ElJfneV", "email": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "cD0O_Sc-wNy", "title": "Learn the Time to Learn: Replay Scheduling for Continual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Replay-based continual learning has shown to be successful in mitigating catastrophic forgetting. Most previous works focus on increasing the sample quality in the commonly small replay memory. However, in many real-world applications, replay memories would be limited by constraints on processing time rather than storage capacity as most organizations do store all historical data in the cloud. Inspired by human learning, we illustrate that scheduling over which tasks to revisit is critical to the final performance with finite memory resources. To this end, we propose to learn the time to learn for a continual learning system, in which we learn schedules over which tasks to replay at different times using Monte Carlo tree search. We perform extensive evaluation and show that our method can learn replay schedules that significantly improve final performance across all tasks than baselines without considering the scheduling. Furthermore, our method can be combined with any other memory selection methods leading to consistently improved performance. Our results indicate that the learned schedules are also consistent with human learning insights.", "keywords": "Continual Learning;Replay Memory;Task Incremental Learning", "primary_area": "", "supplementary_material": "/attachment/c0fe51b09925ef434751d4b72d414326f3522f22.zip", "author": "Marcus Klasson;Hedvig Kjellstrom;Cheng Zhang", "authorids": "~Marcus_Klasson1;~Hedvig_Kjellstrom1;~Cheng_Zhang1", "gender": "M;F;F", "homepage": "https://marcusklasson.github.io/;https://www.kth.se/profile/hedvig;http://cheng-zhang.org", "dblp": "207/9774;k/HedvigKjellstrom;82/6384-5", "google_scholar": "https://scholar.google.es/citations?user=H9VHxP4AAAAJ;wr3CtKAAAAAJ;r40iAwIAAAAJ", "orcid": ";0000-0002-5750-9655;", "linkedin": ";hedvig-kjellstr%C3%B6m-aaa973/;", "or_profile": "~Marcus_Klasson1;~Hedvig_Kjellstrom1;~Cheng_Zhang1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology;Microsoft", "aff_domain": "kth.se;kth.se;microsoft.com", "position": "PhD student;Full Professor;Principal Researcher", "bibtex": "@misc{\nklasson2022learn,\ntitle={Learn the Time to Learn: Replay Scheduling for Continual Learning},\nauthor={Marcus Klasson and Hedvig Kjellstrom and Cheng Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=cD0O_Sc-wNy}\n}", "github": "", "project": "", "reviewers": "dhXe;gm3R;8z4B;kfup", "site": "https://openreview.net/forum?id=cD0O_Sc-wNy", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "68;96;47;187", "wc_summary_review": "25;54;30;47", "wc_main_review": "252;330;509;511", "wc_review": "345;480;586;745", "wc_reply_reviewers": "145;107;70;0", "wc_reply_authors": "1349;1116;518;544", "reply_reviewers": "1;1;1;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 99.5, 53.42518132865812 ], "wc_summary_review_avg": [ 39.0, 11.895377253370318 ], "wc_main_review_avg": [ 400.5, 
112.92143286373938 ], "wc_review_avg": [ 539.0, 146.4257491017205 ], "wc_reply_reviewers_avg": [ 80.5, 53.50934497823721 ], "wc_reply_authors_avg": [ 881.75, 360.41113675911845 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.16012815380508713, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9306078513595845109&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "KTH Royal Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.kth.se;https://www.microsoft.com", "aff_unique_abbr": "KTH;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Sweden;United States" }, { "title": "Reversible Instance Normalization for Accurate Time-Series Forecasting against Distribution Shift", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6034", "id": "cGDAkQo1C0p", "poster": "", "openreview": "https://openreview.net/forum?id=cGDAkQo1C0p", "slides": "https://iclr.cc/virtual/2022/poster/6034", "video": "https://iclr.cc/virtual/2022/poster/6034", "author_site": "Taesung Kim, Jinhee Kim, Yunwon Tae, Cheonbok Park, Jang-Ho Choi, Jaegul Choo", "tldr": "", "abstract": "Statistical properties such as mean and variance often change over time in time series, i.e., time-series data suffer from a distribution shift problem. This change in temporal distribution is one of the main challenges that prevent accurate time-series forecasting. To address this issue, we propose a simple yet effective normalization method called reversible instance normalization (RevIN), a generally-applicable normalization-and-denormalization method with learnable affine transformation. The proposed method is symmetrically structured to remove and restore the statistical information of a time-series instance, leading to significant performance improvements in time-series forecasting, as shown in Fig. 1. 
We demonstrate the effectiveness of RevIN via extensive quantitative and qualitative analyses on various real-world datasets, addressing the distribution shift problem.", "keywords": "Time-series forecasting;Normalization;Distribution shift", "primary_area": "", "supplementary_material": "", "author": "Taesung Kim;Jinhee Kim;Yunwon Tae;Cheonbok Park;Jang-Ho Choi;Jaegul Choo", "authorids": "~Taesung_Kim1;~Jinhee_Kim1;~Yunwon_Tae1;~Cheonbok_Park1;~Jang-Ho_Choi1;~Jaegul_Choo1", "gender": "M;F;M;M;M;M", "homepage": ";https://sites.google.com/view/jinhee-kim/;;https://cbokpark.github.io/;https://github.com/jangho87;https://sites.google.com/site/jaegulchoo/", "dblp": ";;276/5054;239/8130;;07/2074", "google_scholar": "rvp49kYAAAAJ;G7JUwU8AAAAJ;;https://scholar.google.com/citations?hl=ko;;GHJYsLEAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Taesung_Kim1;~Jinhee_Kim1;~Yunwon_Tae1;~Cheonbok_Park1;~Jang-Ho_Choi1;~Jaegul_Choo1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;VUNO;NAVER;Electronics and Telecommunications Research Institute;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;vuno.co;navercorp.com;etri.re.kr;kaist.ac.kr", "position": "PhD student;PhD student;Researcher;Researcher;Senior Researcher;Associate Professor", "bibtex": "@inproceedings{\nkim2022reversible,\ntitle={Reversible Instance Normalization for Accurate Time-Series Forecasting against Distribution Shift},\nauthor={Taesung Kim and Jinhee Kim and Yunwon Tae and Cheonbok Park and Jang-Ho Choi and Jaegul Choo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cGDAkQo1C0p}\n}", "github": "", "project": "", "reviewers": "J9jm;uPmn;V5fo", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;3;4", "correctness": "3;4;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;0", "wc_summary_paper": "47;118;159", "wc_summary_review": "32;116;521", "wc_main_review": "157;157;1052", "wc_review": "236;391;1732", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "504;556;3411", "reply_reviewers": "0;0;0", "reply_authors": "1;1;7", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 108.0, 46.26733909213568 ], "wc_summary_review_avg": [ 223.0, 213.49004660639335 ], "wc_main_review_avg": [ 455.3333333333333, 421.9070461079734 ], "wc_review_avg": [ 786.3333333333334, 671.6746897783843 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1490.3333333333333, 1358.2823303308076 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 2.8284271247461903 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.18898223650461363, "corr_recommendation_correctness": -0.18898223650461363, "gs_citation": 667, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15726225809303254672&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=cGDAkQo1C0p", "email": "kaist.ac.kr;kaist.ac.kr;vuno.co;navercorp.com;etri.re.kr;kaist.ac.kr", "author_num": 6, "aff_unique_index": "0;0;1;2;3;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;VUNO;NAVER Corporation;Electronics and 
Telecommunications Research Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.vuno.co.kr;https://www.naver.com;http://www.etri.re.kr", "aff_unique_abbr": "KAIST;VUNO;NAVER;ETRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "cJPkX1g9PQS", "title": "Rethinking Self-Supervision Objectives for Generalizable Coherence Modeling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Although large-scale pre-trained neural models have shown impressive performances in a variety of tasks, their ability to generate coherent text that appropriately models discourse phenomena is harder to evaluate and less understood. Given the claims of improved text generation quality across various systems, we consider the coherence evaluation of machine generated text to be one of the principal applications of coherence models that needs to be investigated. We explore training data and self-supervision objectives that result in a model that generalizes well across tasks and can be used off-the-shelf to perform such evaluations.\nPrior work in neural coherence modeling has primarily focused on devising new architectures, and trained the model to distinguish coherent and incoherent text through pairwise self-supervision on the permuted documents task. We instead use a basic model architecture and show significant improvements over state of the art within the same training regime. We then design a harder self-supervision objective by increasing the ratio of negative samples within a contrastive learning setup, and enhance the model further through automatic hard negative mining coupled with a large global negative queue encoded by a momentum encoder. We show empirically that increasing the density of negative samples improves the basic model, and using a global negative queue further improves and stabilizes the model while training with hard negative samples. We evaluate the coherence model on task-independent test sets that resemble real-world use cases and show significant improvements in coherence evaluations of downstream applications. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Prathyusha Jwalapuram;Shafiq Joty;Xiang Lin", "authorids": "~Prathyusha_Jwalapuram1;~Shafiq_Joty1;~Xiang_Lin2", "gender": "F;M;M", "homepage": "https://pjwalapuram.com/;https://raihanjoty.github.io/;https://shawnlimn.github.io", "dblp": "214/9948;62/2078;29/6347", "google_scholar": "https://scholar.google.co.in/citations?hl=en;hR249csAAAAJ;R4ZlMwIAAAAJ", "orcid": ";;", "linkedin": "prathyusha-jwalapuram-094220154/;;", "or_profile": "~Prathyusha_Jwalapuram1;~Shafiq_Joty1;~Xiang_Lin2", "aff": "Nanyang Technological University, Singapore;SalesForce.com;Nanyang Technological University", "aff_domain": "ntu.edu.sg;salesforce.com;ntu.edu.sg", "position": "PhD student;Principal Researcher;PhD student", "bibtex": "@misc{\njwalapuram2022rethinking,\ntitle={Rethinking Self-Supervision Objectives for Generalizable Coherence Modeling},\nauthor={Prathyusha Jwalapuram and Shafiq Joty and Xiang Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=cJPkX1g9PQS}\n}", "github": "", "project": "", "reviewers": "MZes;7WP7;TbsM;faZN", "site": "https://openreview.net/forum?id=cJPkX1g9PQS", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;2;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "221;43;52;149", "wc_summary_review": "81;39;24;90", "wc_main_review": "382;338;299;489", "wc_review": "684;420;375;728", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1311;1310;893;1053", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 116.25, 73.38042995240625 ], "wc_summary_review_avg": [ 58.5, 27.69927796892908 ], "wc_main_review_avg": [ 377.0, 71.01760345153869 ], "wc_review_avg": [ 551.75, 155.84667946414515 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1141.75, 177.9794580843531 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=287106064780165053&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Nanyang Technological University;Salesforce", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.salesforce.com", "aff_unique_abbr": "NTU;Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;United States" }, { "id": "cKTBRHIVjy9", "title": "SubMix: Practical Private Prediction for Large-scale Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent data-extraction attacks have exposed that language models can memorize some training samples verbatim. This is a vulnerability that can compromise the privacy of the model\u2019s training data. In this work, we introduce SubMix a practical protocol for private next-token prediction designed to prevent privacy violations by language models that were fine-tuned on a private corpus after pre-training on a public corpus. 
We show that SubMix limits the leakage of information that is unique to any individual user in the private corpus via a relaxation of group differentially private prediction. Importantly, SubMix admits a tight, data-dependent privacy accounting mechanism, which allows it to thwart existing data-extraction attacks while maintaining the utility of the language model. SubMix is the first protocol that maintains privacy even when publicly releasing tens of thousands of next-token predictions made by large transformer-based models such as GPT-2.", "keywords": "private prediction;language models;user privacy;machine learning", "primary_area": "", "supplementary_material": "/attachment/d0cd5e024d5aff2596fece045f2a9f5fc7fb5703.zip", "author": "Tony A Ginart;Laurens van der Maaten;James Zou;Chuan Guo", "authorids": "~Tony_A_Ginart1;~Laurens_van_der_Maaten3;~James_Zou1;~Chuan_Guo1", "gender": ";;M;M", "homepage": ";;https://sites.google.com/view/chuanguo;https://lvdmaaten.github.io/", "dblp": ";;;53/2650.html", "google_scholar": ";23ZXZvEAAAAJ;0gp5M-kAAAAJ;6GDfcqEAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Tony_A_Ginart1;~James_Zou1;~Chuan_Guo1;~Laurens_van_der_Maaten1", "aff": "Stanford University;Stanford University;Meta;Meta", "aff_domain": "stanford.edu;stanford.edu;meta.com;meta.com", "position": "PhD student;Assistant Professor;Researcher;Research Scientist", "bibtex": "@misc{\nginart2022submix,\ntitle={SubMix: Practical Private Prediction for Large-scale Language Models},\nauthor={Tony A Ginart and Laurens van der Maaten and James Zou and Chuan Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=cKTBRHIVjy9}\n}", "github": "", "project": "", "reviewers": "aiFL;zjmQ;6t5J;kTn8", "site": "https://openreview.net/forum?id=cKTBRHIVjy9", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "4;5;2;3", "correctness": "2;1;3;4", "technical_novelty": "2;1;3;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "185;51;92;129", "wc_summary_review": "97;18;41;53", "wc_main_review": "555;420;167;143", "wc_review": "837;489;300;325", "wc_reply_reviewers": "174;0;0;0", "wc_reply_authors": "746;561;242;191", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 114.25, 49.29186038282589 ], "wc_summary_review_avg": [ 52.25, 28.734778579275673 ], "wc_main_review_avg": [ 321.25, 173.17386494503148 ], "wc_review_avg": [ 487.75, 214.3097932899941 ], "wc_reply_reviewers_avg": [ 43.5, 75.34421012924616 ], "wc_reply_authors_avg": [ 435.0, 228.79138969812652 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7378647873726218, "corr_recommendation_correctness": 0.9486832980505139, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12478836071719209833&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Stanford University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.stanford.edu;https://meta.com", "aff_unique_abbr": "Stanford;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "cKoY420qRuL", "title": "Group-disentangled Representation Learning with Weakly-Supervised Regularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning interpretable and human-controllable representations that uncover factors of variation in data remains an ongoing key challenge in representation learning. We investigate learning group-disentangled representations for groups of factors with weak supervision. Existing techniques to address this challenge merely constrain the approximate posterior by averaging over observations of a shared group. As a result, observations with a common set of variations are encoded to distinct latent representations, reducing their capacity to disentangle and generalize to downstream tasks. In contrast to previous works, we propose GroupVAE, a simple yet effective Kullback-Leibler (KL) divergence-based regularization across shared latent representations to enforce consistent and disentangled representations. We conduct a thorough evaluation and demonstrate that our GroupVAE significantly improves group disentanglement. Further, we demonstrate that learning group-disentangled representations improve upon downstream tasks, including fair classification and 3D shape-related tasks such as reconstruction, classification, and transfer learning, and is competitive to supervised methods. ", "keywords": "variational autoencoder;representation learning;disentanglement", "primary_area": "", "supplementary_material": "", "author": "Linh Tran;Amir Hosein Khasahmadi;Aditya Sanghi;Saeid Asgari", "authorids": "~Linh_Tran1;~Amir_Hosein_Khasahmadi1;~Aditya_Sanghi1;~Saeid_Asgari1", "gender": "F;M;;", "homepage": "http://www.linht.com;https://github.com/sanghiad;https://asgsaeid.github.io/;", "dblp": "130/8465;;201/4374.html;259/1508", "google_scholar": "https://scholar.google.co.uk/citations?user=GHIsTp8AAAAJ;q0-11e25FxIC;SuePM1sAAAAJ;cFpYRhkAAAAJ", "orcid": ";;;", "linkedin": ";;;amir-khas/", "or_profile": "~Linh_Tran1;~Aditya_Sanghi1;~Saeid_Asgari1;~Amir_Hosein_Khasahmadi2", "aff": "Autodesk;Autodesk;Autodesk;Toronto University", "aff_domain": "autodesk.com;autodesk.com;autodesk.com;utoronto.ca", "position": "Research Scientist;Researcher;Research Scientist;MS student", "bibtex": "@misc{\ntran2022groupdisentangled,\ntitle={Group-disentangled Representation Learning with Weakly-Supervised Regularization},\nauthor={Linh Tran and Amir Hosein Khasahmadi and Aditya Sanghi and Saeid Asgari},\nyear={2022},\nurl={https://openreview.net/forum?id=cKoY420qRuL}\n}", "github": "", "project": "", "reviewers": "3pNU;R7RD;K2rb;iSJY;XDsw", "site": "https://openreview.net/forum?id=cKoY420qRuL", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;4;3;4;2", "correctness": "2;3;3;3;1", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;2;3;2;2", "wc_summary_paper": "37;37;181;114;55", "wc_summary_review": "17;79;126;63;33", "wc_main_review": "322;278;325;732;89", "wc_review": "376;394;632;909;177", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 2.4, 0.8 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 84.8, 55.79390647732062 ], "wc_summary_review_avg": [ 63.6, 38.05049276947671 ], "wc_main_review_avg": [ 349.2, 210.06418066867087 ], 
"wc_review_avg": [ 497.6, 251.2437859928082 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.37500000000000017, "corr_recommendation_correctness": 0.25, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11400517788136501499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Autodesk;University of Toronto", "aff_unique_dep": ";", "aff_unique_url": "https://www.autodesk.com;https://www.utoronto.ca", "aff_unique_abbr": "Autodesk;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Canada" }, { "id": "cLcLdwOfhoe", "title": "FedLite: A Scalable Approach for Federated Learning on Resource-constrained Clients", "track": "main", "status": "Reject", "tldr": "", "abstract": "In classical federated learning, the clients contribute to the overall training by communicating local updates for the underlying model on their private data to a coordinating server. However, updating and communicating the entire model becomes prohibitively expensive when resource-constrained clients collectively aim to train a large machine learning model. Split learning provides a natural solution in such a setting, where only a (small) part of the model is stored and trained on clients while the remaining (large) part of the model only stays at the servers. Unfortunately, the model partitioning employed in split learning significantly increases the communication cost compared to the classical federated learning algorithms. This paper addresses this issue by proposing an end-to-end training framework that relies on a novel vector quantization scheme accompanied by a gradient correction method to reduce the additional communication cost associated with split learning. An extensive empirical evaluation on standard image and text benchmarks shows that the proposed method can achieve up to $490\\times$ communication cost reduction with minimal drop in accuracy, and enables a desirable performance vs. communication trade-off. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianyu Wang;Hang Qi;Ankit Singh Rawat;Sashank J. Reddi;Sagar M. Waghmare;Felix Yu;Gauri Joshi", "authorids": "~Jianyu_Wang2;~Hang_Qi1;~Ankit_Singh_Rawat1;~Sashank_J._Reddi1;~Sagar_M._Waghmare1;~Felix_Yu1;~Gauri_Joshi1", "gender": "M;;M;M;M;M;", "homepage": ";;https://ankitsrawat.github.io/home/;;;http://felixyu.org;", "dblp": ";96/1046-1;https://dblp.org/pers/hd/r/Rawat:Ankit_Singh;50/10452;;23/10574;", "google_scholar": "5nrx1YwAAAAJ;72jdrSUAAAAJ;http://scholar.google.com/citations?user=U0_ab4cAAAAJ;70lgwYwAAAAJ;l6e9JeEAAAAJ;lYvF6cUAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Jianyu_Wang2;~Hang_Qi1;~Ankit_Singh_Rawat1;~Sashank_J._Reddi1;~Sagar_M._Waghmare1;~Felix_Yu1;~Gauri_Joshi1", "aff": "Carnegie Mellon University;Google;Google;Google;Google;Google;", "aff_domain": "andrew.cmu.edu;google.com;google.com;google.com;google.com;google.com;", "position": "PhD student;Researcher;Research Scientist;Research Scientist;Researcher;Research Scientist;", "bibtex": "@misc{\nwang2022fedlite,\ntitle={FedLite: A Scalable Approach for Federated Learning on Resource-constrained Clients},\nauthor={Jianyu Wang and Hang Qi and Ankit Singh Rawat and Sashank J. Reddi and Sagar M. 
Waghmare and Felix Yu and Gauri Joshi},\nyear={2022},\nurl={https://openreview.net/forum?id=cLcLdwOfhoe}\n}", "github": "", "project": "", "reviewers": "ELQm;oo1D;WW1C;cPmB", "site": "https://openreview.net/forum?id=cLcLdwOfhoe", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;5;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;0;0", "wc_summary_paper": "79;53;99;118", "wc_summary_review": "67;7;35;57", "wc_main_review": "471;177;920;179", "wc_review": "617;237;1054;354", "wc_reply_reviewers": "83;0;307;0", "wc_reply_authors": "599;256;823;226", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 87.25, 24.107830678018296 ], "wc_summary_review_avg": [ 41.5, 23.038012067016545 ], "wc_main_review_avg": [ 436.75, 303.56578776271874 ], "wc_review_avg": [ 565.5, 313.8188171541025 ], "wc_reply_reviewers_avg": [ 97.5, 125.61150425020791 ], "wc_reply_authors_avg": [ 476.0, 248.21261047738892 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16816732136758024955&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "cMBKc-0OTY5", "title": "Kalman Filter Is All You Need: Optimization Works When Noise Estimation Fails", "track": "main", "status": "Reject", "tldr": "", "abstract": "Determining the noise parameters of a Kalman Filter (KF) has been studied for decades. A huge body of research focuses on the task of noise estimation under various conditions, since precise noise estimation is considered equivalent to minimization of the filtering errors. However, we show that even a small violation of the KF assumptions can significantly modify the effective noise, breaking the equivalence between the tasks and making noise estimation an inferior strategy. We show that such violations are common, and are often not trivial to handle or even notice. Consequently, we argue that a robust solution is needed, rather than choosing a dedicated model per problem.\nTo that end, we apply gradient-based optimization to the filtering errors directly, together with an efficient parameterization of the symmetric and positive-definite parameters of the KF. In a variety of state-estimation and tracking problems, we show that the optimization improves both the accuracy of the KF and its robustness to design decisions.\nIn addition, we demonstrate how an optimized neural network model can seem to reduce the errors significantly compared to a KF, and how this reduction vanishes once the KF is optimized similarly. 
This indicates how complicated models can be wrongly identified as superior to the KF, while in fact they were merely more optimized.", "keywords": "Kalman Filter;noise estimation;optimization;gradient descent;parameterization", "primary_area": "", "supplementary_material": "", "author": "Ido Greenberg;Shie Mannor;Netanel Yannay", "authorids": "~Ido_Greenberg1;~Shie_Mannor2;~Netanel_Yannay1", "gender": "M;M;M", "homepage": "https://idogreenberg.neocities.org/;https://shie.net.technion.ac.il;https://www.linkedin.com/in/nati-yannay-9693b524/", "dblp": ";20/1669;", "google_scholar": "LnwyFkkAAAAJ;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ;", "orcid": ";;", "linkedin": "ido-greenberg-87245852/;;", "or_profile": "~Ido_Greenberg1;~Shie_Mannor2;~Netanel_Yannay1", "aff": "Technion, Technion;Technion - Israel Institute of Technology, Technion;ELTA Systems Ltd.", "aff_domain": "technion.ac.il;technion.il;iai.co.il", "position": "PhD student;Full Professor;Researcher", "bibtex": "@misc{\ngreenberg2022kalman,\ntitle={Kalman Filter Is All You Need: Optimization Works When Noise Estimation Fails},\nauthor={Ido Greenberg and Shie Mannor and Netanel Yannay},\nyear={2022},\nurl={https://openreview.net/forum?id=cMBKc-0OTY5}\n}", "github": "", "project": "", "reviewers": "D5C1;NGZA;Eeik;Qzvj", "site": "https://openreview.net/forum?id=cMBKc-0OTY5", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;2;4;4", "correctness": "3;1;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "100;20;134;98", "wc_summary_review": "66;30;36;9", "wc_main_review": "361;96;400;227", "wc_review": "527;146;570;334", "wc_reply_reviewers": "0;0;286;0", "wc_reply_authors": "794;96;968;368", "reply_reviewers": "0;0;2;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.0, 41.78516483155236 ], "wc_summary_review_avg": [ 35.25, 20.38841582860228 ], "wc_main_review_avg": [ 271.0, 119.68918079759757 ], "wc_review_avg": [ 394.25, 168.64811739239784 ], "wc_reply_reviewers_avg": [ 71.5, 123.84163274117472 ], "wc_reply_authors_avg": [ 556.5, 343.99527613035616 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8703882797784892, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pRpbWa_uPYAJ:scholar.google.com/&scioq=Kalman+Filter+Is+All+You+Need:+Optimization+Works+When+Noise+Estimation+Fails&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Technion - Israel Institute of Technology;ELTA Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.elta.co.il", "aff_unique_abbr": "Technion;ELTA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Label Leakage and Protection in Two-party Split Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6388", "id": "cOtBRgsf2fO", "poster": "", "openreview": "https://openreview.net/forum?id=cOtBRgsf2fO", "slides": "https://iclr.cc/virtual/2022/poster/6388", 
"video": "https://iclr.cc/virtual/2022/poster/6388", "author_site": "Oscar Li, Jiankai Sun, Xin Yang, Weihao Gao, Hongyi Zhang, Junyuan Xie, Virginia Smith, Chong Wang", "tldr": "", "abstract": "Two-party split learning is a popular technique for learning a model across feature-partitioned data. In this work, we explore whether it is possible for one party to steal the private label information from the other party during split training, and whether there are methods that can protect against such attacks. Specifically, we first formulate a realistic threat model and propose a privacy loss metric to quantify label leakage in split learning. We then show that there exist two simple yet effective methods within the threat model that can allow one party to accurately recover private ground-truth labels owned by the other party. To combat these attacks, we propose several random perturbation techniques, including $\\texttt{Marvell}$, an approach that strategically finds the structure of the noise perturbation by minimizing the amount of label leakage (measured through our quantification metric) of a worst-case adversary. We empirically demonstrate the effectiveness of our protection techniques against the identified attacks, and show that $\\texttt{Marvell}$ in particular has improved privacy-utility tradeoffs relative to baseline approaches.", "keywords": "Split Learning;label leakage;privacy;privacy protection", "primary_area": "", "supplementary_material": "/attachment/7c84c98e0da2c15a881b6651e4554ff368993d66.zip", "author": "Oscar Li;Jiankai Sun;Xin Yang;Weihao Gao;Hongyi Zhang;Junyuan Xie;Virginia Smith;Chong Wang", "authorids": "~Oscar_Li1;~Jiankai_Sun5;~Xin_Yang3;~Weihao_Gao1;~Hongyi_Zhang1;~Junyuan_Xie1;~Virginia_Smith1;~Chong_Wang8", "gender": "M;M;M;M;M;;F;M", "homepage": "https://www.oscarli.one/;http://jiankai.me;;https://wgao9.github.io/;;;;https://chongw.github.io", "dblp": "160/8481;;44/1152-17;https://dblp.uni-trier.de/pers/hd/g/Gao:Weihao;;;120/0921;w/ChongWang2", "google_scholar": "rtpoh5wAAAAJ;GQ6xw-oAAAAJ;https://scholar.google.com/citations?hl=en;E__5Lr0AAAAJ;6Wg-hF4AAAAJ;qJsC_XsAAAAJ;;vRI2blsAAAAJ", "orcid": ";;;;;;;", "linkedin": ";jiankaisun/;;weihao-gao-6517b3ab/;;;;", "or_profile": "~Oscar_Li1;~Jiankai_Sun5;~Xin_Yang3;~Weihao_Gao1;~Hongyi_Zhang1;~Junyuan_Xie1;~Virginia_Smith1;~Chong_Wang1", "aff": "Amazon;ByteDance Inc.;ByteDance;;Bytedance Inc;Department of Computer Science, University of Washington;Carnegie Mellon University;Apple", "aff_domain": "amazon.com;bytedance.com;bytedance.com;;bytedance.com;cs.washington.edu;cmu.edu;apple.com", "position": "Intern;Research Scientist;Researcher;;Research Scientist;PhD student;Associate Professor;Researcher", "bibtex": "@inproceedings{\nli2022label,\ntitle={Label Leakage and Protection in Two-party Split Learning},\nauthor={Oscar Li and Jiankai Sun and Xin Yang and Weihao Gao and Hongyi Zhang and Junyuan Xie and Virginia Smith and Chong Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cOtBRgsf2fO}\n}", "github": "", "project": "", "reviewers": "SgGa;GVaD;ercK", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "2;3;4", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "564;76;86", "wc_summary_review": "57;27;42", "wc_main_review": "254;197;246", "wc_review": "875;300;374", "wc_reply_reviewers": "251;9;47", "wc_reply_authors": "1431;858;1097", "reply_reviewers": "3;1;1", "reply_authors": 
"4;2;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 242.0, 227.7249803308075 ], "wc_summary_review_avg": [ 42.0, 12.24744871391589 ], "wc_main_review_avg": [ 232.33333333333334, 25.197001585285676 ], "wc_review_avg": [ 516.3333333333334, 255.4086050930069 ], "wc_reply_reviewers_avg": [ 102.33333333333333, 106.2617313784956 ], "wc_reply_authors_avg": [ 1128.6666666666667, 234.99550823130403 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4111278201202932828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=cOtBRgsf2fO", "email": "amazon.com;bytedance.com;bytedance.com;;bytedance.com;cs.washington.edu;cmu.edu;apple.com", "author_num": 8, "aff_unique_index": "0;1;1;1;2;3;4", "aff_unique_norm": "Amazon;ByteDance;University of Washington;Carnegie Mellon University;Apple", "aff_unique_dep": "Amazon.com, Inc.;;Department of Computer Science;;Apple Inc.", "aff_unique_url": "https://www.amazon.com;https://www.bytedance.com;https://www.washington.edu;https://www.cmu.edu;https://www.apple.com", "aff_unique_abbr": "Amazon;ByteDance;UW;CMU;Apple", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;1;1;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Learning more skills through optimistic exploration", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6475", "id": "cU8rknuhxc", "poster": "", "openreview": "https://openreview.net/forum?id=cU8rknuhxc", "slides": "https://iclr.cc/virtual/2022/poster/6475", "video": "https://iclr.cc/virtual/2022/poster/6475", "author_site": "DJ Strouse, Kate Baumli, David Warde-Farley, Volodymyr Mnih, Steven Hansen", "tldr": "", "abstract": "Unsupervised skill learning objectives (Eysenbach et al., 2019; Gregor et al., 2016) allow agents to learn rich repertoires of behavior in the absence of extrinsic rewards. They work by simultaneously training a policy to produce distinguishable latent-conditioned trajectories, and a discriminator to evaluate distinguishability by trying to infer latents from trajectories. The hope is for the agent to explore and master the environment by encouraging each skill (latent) to reliably reach different states. However, an inherent exploration problem lingers: when a novel state is actually encountered, the discriminator will necessarily not have seen enough training data to produce accurate and confident skill classifications, leading to low intrinsic reward for the agent and effective penalization of the sort of exploration needed to actually maximize the objective. To combat this inherent pessimism towards exploration, we derive an information gain auxiliary objective that involves training an ensemble of discriminators and rewarding the policy for their disagreement. 
Our objective directly estimates the epistemic uncertainty that comes from the discriminator not having seen enough training examples, thus providing an intrinsic reward more tailored to the true objective compared to pseudocount-based methods (Burda et al., 2019). We call this exploration bonus discriminator disagreement intrinsic reward, or DISDAIN. We demonstrate empirically that DISDAIN improves skill learning both in a tabular grid world (Four Rooms) and the 57 games of the Atari Suite (from pixels). Thus, we encourage researchers to treat pessimism with DISDAIN.", "keywords": "intrinsic control;skill discovery;unsupervised skill learning;uncertainty estimation;optimistic exploration;variational information maximization", "primary_area": "", "supplementary_material": "", "author": "DJ Strouse;Kate Baumli;David Warde-Farley;Volodymyr Mnih;Steven Stenberg Hansen", "authorids": "~DJ_Strouse1;~Kate_Baumli1;~David_Warde-Farley1;~Volodymyr_Mnih1;~Steven_Stenberg_Hansen1", "gender": ";F;M;M;M", "homepage": "http://www.djstrouse.com;;;;", "dblp": "181/2305;266/7836;71/9421;04/1930;61/3521", "google_scholar": "K8E0T7MAAAAJ;feM7-mEAAAAJ;https://scholar.google.ca/citations?user=MOgfm8oAAAAJ;rLdfJ1gAAAAJ;hIOEWsEAAAAJ", "orcid": ";;;;", "linkedin": ";katebaumli/;;;", "or_profile": "~DJ_Strouse1;~Kate_Baumli1;~David_Warde-Farley1;~Volodymyr_Mnih1;~Steven_Stenberg_Hansen1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind", "aff_domain": "google.com;google.com;google.com;deepmind.com;google.com", "position": "Research Scientist;Researcher;Research Scientist;Researcher;Research Scientist", "bibtex": "@inproceedings{\nstrouse2022learning,\ntitle={Learning more skills through optimistic exploration},\nauthor={DJ Strouse and Kate Baumli and David Warde-Farley and Volodymyr Mnih and Steven Stenberg Hansen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cU8rknuhxc}\n}", "github": "", "project": "", "reviewers": "Upj9;7dHZ;rscN;vjRd", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;3;5", "correctness": "3;4;3;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "68;108;128;184", "wc_summary_review": "57;41;40;57", "wc_main_review": "237;470;295;336", "wc_review": "362;619;463;577", "wc_reply_reviewers": "33;56;36;0", "wc_reply_authors": "998;936;620;369", "reply_reviewers": "1;1;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 122.0, 41.80908992073375 ], "wc_summary_review_avg": [ 48.75, 8.257572282456872 ], "wc_main_review_avg": [ 334.5, 85.77441343431035 ], "wc_review_avg": [ 505.25, 100.48973828207535 ], "wc_reply_reviewers_avg": [ 31.25, 20.09197601033806 ], "wc_reply_authors_avg": [ 730.75, 253.31736517657055 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11133914389931612853&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=cU8rknuhxc", "email": 
"google.com;google.com;google.com;deepmind.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "cVak2hs06z", "title": "Correct-N-Contrast: a Contrastive Approach for Improving Robustness to Spurious Correlations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Spurious correlations pose a fundamental challenge for building robust machine learning models. For example, models trained with empirical risk minimization (ERM) may depend on correlations between class labels and spurious features to classify data, even if these relations only hold for certain data groups. This can result in poor performance on other groups that do not exhibit such relations. When group information is available during training, Sagawa et al. (2019) have shown how to improve worst-group performance by optimizing the worst-group loss (GDRO). However, when group information is unavailable, improving worst-group performance is more challenging. For this latter setting, we propose Correct-N-Contrast (CNC), a contrastive learning method to train models more robust to spurious correlations. Our motivating observation is that worst-group performance is related to a representation alignment loss, which measures the distance in feature space between different groups within each class. We prove that the gap between worst-group and average loss for each class is upper bounded by the alignment loss for that class. Thus, CNC aims to improve representation alignment via contrastive learning. First, CNC uses an ERM model to infer the group information. Second, with a careful sampling scheme, CNC trains a contrastive model to encourage similar representations for groups in the same class. We show that CNC significantly improves worst-group accuracy over existing state-of-the-art methods on popular benchmarks, e.g., achieving $7.7\\%$ absolute lift in worst-group accuracy on the CelebA data set, and performs almost as well as GDRO trained with group labels. CNC also learns better-aligned representations between different groups in each class, reducing the alignment loss substantially compared to prior methods.", "keywords": "spurious correlations;contrastive learning;robustness;group shifts", "primary_area": "", "supplementary_material": "/attachment/c3e302cf66c5754791f720d208b5f6aed123325d.zip", "author": "Michael Zhang;Nimit Sharad Sohoni;Hongyang R. 
Zhang;Chelsea Finn;Christopher Re", "authorids": "~Michael_Zhang4;~Nimit_Sharad_Sohoni1;~Hongyang_R._Zhang1;~Chelsea_Finn1;~Christopher_Re1", "gender": "M;;M;F;", "homepage": "https://michaelzhang.xyz/;;http://www.hongyangzhang.com;https://ai.stanford.edu/~cbfinn/;", "dblp": ";239/8622;264/2660;131/1783;", "google_scholar": "DG_asaIAAAAJ;;Sx-673sAAAAJ;vfPE6hgAAAAJ;", "orcid": ";;;;", "linkedin": ";;hongyang-r-zhang-5b7797157;;", "or_profile": "~Michael_Zhang4;~Nimit_Sharad_Sohoni1;~Hongyang_R._Zhang1;~Chelsea_Finn1;~Christopher_Re1", "aff": "Stanford University;Stanford University;Northeastern University;Google;", "aff_domain": "stanford.edu;stanford.edu;northeastern.edu;google.com;", "position": "PhD student;PhD student;Assistant Professor;Research Scientist;", "bibtex": "@misc{\nzhang2022correctncontrast,\ntitle={Correct-N-Contrast: a Contrastive Approach for Improving Robustness to Spurious Correlations},\nauthor={Michael Zhang and Nimit Sharad Sohoni and Hongyang R. Zhang and Chelsea Finn and Christopher Re},\nyear={2022},\nurl={https://openreview.net/forum?id=cVak2hs06z}\n}", "github": "", "project": "", "reviewers": "FMKR;GgTx;ktqR;U9kN", "site": "https://openreview.net/forum?id=cVak2hs06z", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "2;2;4;2", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "66;82;38;139", "wc_summary_review": "70;14;23;72", "wc_main_review": "423;511;396;600", "wc_review": "559;607;457;811", "wc_reply_reviewers": "1100;267;0;162", "wc_reply_authors": "5888;2428;1301;1118", "reply_reviewers": "2;2;0;1", "reply_authors": "10;4;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 81.25, 36.873940662749895 ], "wc_summary_review_avg": [ 44.75, 26.45160675649024 ], "wc_main_review_avg": [ 482.5, 80.0640368704951 ], "wc_review_avg": [ 608.5, 128.85165889502548 ], "wc_reply_reviewers_avg": [ 382.25, 425.1684225104212 ], "wc_reply_authors_avg": [ 2683.75, 1916.7809440569886 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 4.75, 3.112474899497183 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 199, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8960959356014477531&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Stanford University;Northeastern University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.stanford.edu;https://www.northeastern.edu;https://www.google.com", "aff_unique_abbr": "Stanford;NEU;Google", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "cWlMII1LwTZ", "title": "Task-aware Privacy Preservation for Multi-dimensional Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Local differential privacy (LDP), a state-of-the-art technique for privacy preservation, has been successfully deployed in a few real-world applications. In the future, LDP can be adopted to anonymize richer user data attributes that will be input to more sophisticated machine learning (ML) tasks. 
However, today's LDP approaches are largely task-agnostic and often lead to sub-optimal performance - they will simply inject noise into all data attributes according to a given privacy budget, regardless of what features are most relevant for an ultimate task. In this paper, we address how to significantly improve the ultimate task performance for multi-dimensional user data by considering a task-aware privacy preservation problem. The key idea is to use an encoder-decoder framework to learn (and anonymize) a task-relevant latent representation of user data, which gives an analytical near-optimal solution for a linear setting with mean-squared error (MSE) task loss. We also provide an approximate solution through a learning algorithm for general nonlinear cases. Extensive experiments demonstrate that our task-aware approach significantly improves ultimate task accuracy compared to a standard benchmark LDP approach while guaranteeing the same level of privacy.", "keywords": "Privacy;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/6ca5c50152802cbea390eab5dc00bfc452dfbe0d.zip", "author": "Jiangnan Cheng;Ao Tang;Sandeep P. Chinchali", "authorids": "~Jiangnan_Cheng1;~Ao_Tang1;~Sandeep_P._Chinchali1", "gender": "M;;", "homepage": "http://networks.ece.cornell.edu/jiangnan/;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jiangnan_Cheng1;~Ao_Tang1;~Sandeep_P._Chinchali1", "aff": "Cornell University;;", "aff_domain": "cornell.edu;;", "position": "PhD student;;", "bibtex": "@misc{\ncheng2022taskaware,\ntitle={Task-aware Privacy Preservation for Multi-dimensional Data},\nauthor={Jiangnan Cheng and Ao Tang and Sandeep P. Chinchali},\nyear={2022},\nurl={https://openreview.net/forum?id=cWlMII1LwTZ}\n}", "github": "", "project": "", "reviewers": "r5c5;S4hg;mz4E;mTuz", "site": "https://openreview.net/forum?id=cWlMII1LwTZ", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;4;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;2;3", "wc_summary_paper": "77;63;103;95", "wc_summary_review": "133;20;48;40", "wc_main_review": "426;315;382;122", "wc_review": "636;398;533;257", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "328;123;288;52", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 84.5, 15.580436450882884 ], "wc_summary_review_avg": [ 60.25, 43.222534631832964 ], "wc_main_review_avg": [ 311.25, 116.19245887750203 ], "wc_review_avg": [ 456.0, 142.56051346708878 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 197.75, 113.95256688640234 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12634725104863101184&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Adversarial
Robustness Through the Lens of Causality", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6330", "id": "cZAi1yWpiXQ", "poster": "", "openreview": "https://openreview.net/forum?id=cZAi1yWpiXQ", "slides": "https://iclr.cc/virtual/2022/poster/6330", "video": "https://iclr.cc/virtual/2022/poster/6330", "author_site": "Yonggang Zhang, Mingming Gong, Tongliang Liu, Gang Niu, Xinmei Tian, Bo Han, Bernhard Schoelkopf, Kun Zhang", "tldr": "", "abstract": "The adversarial vulnerability of deep neural networks has attracted significant attention in machine learning. As causal reasoning has an instinct for modeling distribution change, it is essential to incorporate causality into analyzing this specific type of distribution change induced by adversarial attacks. However, causal formulations of the intuition of adversarial attacks and the development of robust DNNs are still lacking in the literature. To bridge this gap, we construct a causal graph to model the generation process of adversarial examples and define the adversarial distribution to formalize the intuition of adversarial attacks. From the causal perspective, we study the distinction between the natural and adversarial distribution and conclude that the origin of adversarial vulnerability is models' focus on spurious correlations. Inspired by the causal understanding, we propose the \\emph{Causal}-inspired \\emph{Adv}ersarial distribution alignment method, CausalAdv, to eliminate the difference between natural and adversarial distributions by considering spurious correlations. Extensive experiments demonstrate the efficacy of the proposed method. Our work is the first attempt towards using causality to understand and mitigate the adversarial vulnerability.", "keywords": "Adversarial examples;Causality", "primary_area": "", "supplementary_material": "/attachment/e3a862284b6d989e49146e3a75afc2615170ea64.zip", "author": "Yonggang Zhang;Mingming Gong;Tongliang Liu;Gang Niu;Xinmei Tian;Bo Han;Bernhard Sch\u00f6lkopf;Kun Zhang", "authorids": "~Yonggang_Zhang1;~Mingming_Gong1;~Tongliang_Liu1;~Gang_Niu1;~Xinmei_Tian1;~Bo_Han1;~Bernhard_Sch\u00f6lkopf1;~Kun_Zhang1", "gender": "M;M;M;M;F;;;M", "homepage": "https://yonggangzhangben.github.io/index.html;https://mingming-gong.github.io/;https://tongliang-liu.github.io/;https://niug1984.github.io;https://faculty.ustc.edu.cn/tianxinmei1/zh_CN/index.htm;;;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "27/6859-3;98/8479;150/6667;26/3367-1;03/5204-1;;;96/3115-1", "google_scholar": "XSbEr98AAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.com.au/citations?hl=zh-CN;;;RGoypN4AAAAJ", "orcid": "0000-0002-4080-7592;0000-0001-7147-5589;;;0000-0002-5952-8753;;;", "linkedin": ";;;;;;;", "or_profile": "~Yonggang_Zhang1;~Mingming_Gong1;~Tongliang_Liu1;~Gang_Niu1;~Xinmei_Tian1;~Bo_Han1;~Bernhard_Sch\u00f6lkopf1;~Kun_Zhang1", "aff": "University of Science and Technology of China;University of Melbourne;University of Sydney;RIKEN;University of Science and Technology of China;;;Carnegie Mellon University", "aff_domain": "ustc.edu.cn;unimelb.edu.au;sydney.edu.au;riken.jp;ustc.edu.cn;;;cmu.edu", "position": "PhD student;Assistant Professor;Lecturer;Research Scientist (tenured);Associate Professor;;;Associate Professor", "bibtex": "@inproceedings{\nzhang2022adversarial,\ntitle={Adversarial Robustness Through the Lens of
Causality},\nauthor={Yonggang Zhang and Mingming Gong and Tongliang Liu and Gang Niu and Xinmei Tian and Bo Han and Bernhard Sch{\\\"o}lkopf and Kun Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cZAi1yWpiXQ}\n}", "github": "", "project": "", "reviewers": "aWQa;Jqu4;iRoe;65Tx", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;4;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "97;91;53;63", "wc_summary_review": "117;21;38;7", "wc_main_review": "952;273;148;311", "wc_review": "1166;385;239;381", "wc_reply_reviewers": "0;13;16;10", "wc_reply_authors": "1943;621;874;1086", "reply_reviewers": "0;1;1;1", "reply_authors": "9;1;4;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 76.0, 18.466185312619388 ], "wc_summary_review_avg": [ 45.75, 42.57566793369189 ], "wc_main_review_avg": [ 421.0, 312.4475956060472 ], "wc_review_avg": [ 542.75, 364.60689447677754 ], "wc_reply_reviewers_avg": [ 9.75, 6.015604707757983 ], "wc_reply_authors_avg": [ 1131.0, 496.8697012296081 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 4.25, 2.947456530637899 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9768134460096621020&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=cZAi1yWpiXQ", "email": "ustc.edu.cn;unimelb.edu.au;sydney.edu.au;riken.jp;ustc.edu.cn;;;cmu.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;0;4", "aff_unique_norm": "University of Science and Technology of China;University of Melbourne;University of Sydney;RIKEN;Carnegie Mellon University", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.unimelb.edu.au;https://www.sydney.edu.au;https://www.riken.jp;https://www.cmu.edu", "aff_unique_abbr": "USTC;UniMelb;USYD;RIKEN;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2;0;3", "aff_country_unique": "China;Australia;Japan;United States" }, { "id": "cav5FW0gy3C", "title": "Dataset Bias Prediction for Few-Shot Image Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "One of the obstacles which negatively affect the image classification performance is dataset bias. In particular, if each class has only a few training data samples, the data are highly likely to have dataset bias. Therefore, dataset bias can be a serious issue in few-shot learning, but has rarely been studied so far. To address this issue, we propose a bias prediction network to help improve the performance of few-shot image classification models. Once the features are extracted from an image data, the bias prediction network tries to recover the bias of the raw image such as color from the features. However, if the bias prediction network can recover it easily, we can assume that the extracted features also contain the color bias. Therefore, in our proposed framework, the full model tries to extract features that are difficult for the bias prediction network to recover from. 
We validate our method by adding the bias prediction network to several existing models and evaluating the performance improvement. Our experimental results show that the bias prediction network can suppress the negative effect of the dataset color bias, resulting in substantial improvements in existing few-shot classification models. The proposed bias prediction network, which can be integrated with other models very easily, could potentially benefit many existing models for various tasks.", "keywords": "Machine Learning;Few-Shot Learning;Image Classification;Dataset Bias", "primary_area": "", "supplementary_material": "", "author": "Jangwook Kim;Kyung-Ah Sohn", "authorids": "~Jangwook_Kim1;~Kyung-Ah_Sohn1", "gender": "M;F", "homepage": ";https://sites.google.com/site/kasohn", "dblp": ";65/3835", "google_scholar": "FbAFVuEAAAAJ;-QsSytMAAAAJ", "orcid": ";0000-0001-8941-1188", "linkedin": ";", "or_profile": "~Jangwook_Kim1;~Kyung-Ah_Sohn1", "aff": "Ajou University;Ajou University", "aff_domain": "ajou.ac.kr;ajou.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@misc{\nkim2022dataset,\ntitle={Dataset Bias Prediction for Few-Shot Image Classification},\nauthor={Jangwook Kim and Kyung-Ah Sohn},\nyear={2022},\nurl={https://openreview.net/forum?id=cav5FW0gy3C}\n}", "github": "", "project": "", "reviewers": "E1HN;8Z5k;JKBx;mice", "site": "https://openreview.net/forum?id=cav5FW0gy3C", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "5;2;3;5", "correctness": "2;1;3;4", "technical_novelty": "1;1;3;2", "empirical_novelty": "1;1;3;3", "wc_summary_paper": "80;70;79;75", "wc_summary_review": "28;26;24;100", "wc_main_review": "422;129;235;96", "wc_review": "530;225;338;271", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.75, 1.299038105676658 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.0 ], "wc_summary_paper_avg": [ 76.0, 3.9370039370059056 ], "wc_summary_review_avg": [ 44.5, 32.07413287993925 ], "wc_main_review_avg": [ 220.5, 127.16622979391974 ], "wc_review_avg": [ 341.0, 116.28198484718087 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.02505486760009429, "corr_recommendation_correctness": 0.7568892626614565, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7321437436303353687&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Ajou University", "aff_unique_dep": "", "aff_unique_url": "https://www.ajou.ac.kr", "aff_unique_abbr": "Ajou", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Towards Deployment-Efficient Reinforcement Learning: Lower Bound and Optimality", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7178", "id": "ccWaPGl9Hq", "poster": "", "openreview": "https://openreview.net/forum?id=ccWaPGl9Hq", "slides": "https://iclr.cc/virtual/2022/poster/7178", "video": "https://iclr.cc/virtual/2022/poster/7178", "author_site": "Jiawei Huang, Jinglin Chen, Li Zhao, Tao Qin, Nan Jiang, Tie-Yan Liu", "tldr": "", "abstract": "Deployment efficiency is an
important criterion for many real-world applications of reinforcement learning (RL). Despite the community's increasing interest, the problem still lacks a formal theoretical formulation. In this paper, we propose such a formulation for deployment-efficient RL (DE-RL) from an ''optimization with constraints'' perspective: we are interested in exploring an MDP and obtaining a near-optimal policy within minimal \emph{deployment complexity}, whereas in each deployment the policy can sample a large batch of data. Using finite-horizon linear MDPs as a concrete structural model, we reveal the fundamental limit in achieving deployment efficiency by establishing information-theoretic lower bounds, and provide algorithms that achieve the optimal deployment efficiency. Moreover, our formulation for DE-RL is flexible and can serve as a building block for other practically relevant settings; we give ''Safe DE-RL'' and ''Sample-Efficient DE-RL'' as two examples, which may be worth future investigation.", "keywords": "reinforcement learning theory;deployment efficiency;linear MDP", "primary_area": "", "supplementary_material": "/attachment/d59eb9b4cb070a9c284e47968b83fb293a8da603.zip", "author": "Jiawei Huang;Jinglin Chen;Li Zhao;Tao Qin;Nan Jiang;Tie-Yan Liu", "authorids": "~Jiawei_Huang3;~Jinglin_Chen2;~Li_Zhao1;~Tao_Qin1;~Nan_Jiang2;~Tie-Yan_Liu1", "gender": ";;F;M;M;M", "homepage": "https://jiaweihhuang.github.io;;https://www.microsoft.com/en-us/research/people/lizo/;https://www.microsoft.com/en-us/research/people/taoqin/;http://nanjiang.cs.illinois.edu;http://member.acm.org/~tieyanliu", "dblp": "13/4208;89/5737;97/4708-7;14/6841;06/4489-8;l/TieYanLiu", "google_scholar": "6IcfJiIAAAAJ;;b-LJkLQAAAAJ;Bl4SRU0AAAAJ;nUlanA8AAAAJ;Nh832fgAAAAJ", "orcid": ";;;;;0000-0002-0476-8020", "linkedin": ";;;;nan-jiang-28139937/;", "or_profile": "~Jiawei_Huang3;~Jinglin_Chen2;~Li_Zhao1;~Tao_Qin1;~Nan_Jiang2;~Tie-Yan_Liu1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;Microsoft;Microsoft Research Asia;University of Illinois, Urbana Champaign;Microsoft", "aff_domain": "illinois.edu;illinois.edu;microsoft.com;microsoft.com;illinois.edu;microsoft.com", "position": "PhD student;PhD student;Researcher;Principal Researcher;Assistant Professor;Distinguished Scientist", "bibtex": "@inproceedings{\nhuang2022towards,\ntitle={Towards Deployment-Efficient Reinforcement Learning: Lower Bound and Optimality},\nauthor={Jiawei Huang and Jinglin Chen and Li Zhao and Tao Qin and Nan Jiang and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ccWaPGl9Hq}\n}", "github": "", "project": "", "reviewers": "Ckdx;iHZz;5FHN;sYkB", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "67;40;132;57", "wc_summary_review": "35;18;52;29", "wc_main_review": "261;131;189;89", "wc_review": "363;189;373;175", "wc_reply_reviewers": "0;79;0;0", "wc_reply_authors": "308;285;95;35", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 74.0, 34.84967718645325 ], "wc_summary_review_avg": [ 33.5, 12.298373876248844 ], "wc_main_review_avg": [ 167.5, 64.61230532955777 ], "wc_review_avg": [
275.0, 93.19871243745806 ], "wc_reply_reviewers_avg": [ 19.75, 34.208003449485325 ], "wc_reply_authors_avg": [ 180.75, 117.9584142823224 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16228322784613377738&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=ccWaPGl9Hq", "email": "illinois.edu;illinois.edu;microsoft.com;microsoft.com;illinois.edu;microsoft.com", "author_num": 6, "aff_unique_index": "0;0;1;1;0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com", "aff_unique_abbr": "UIUC;Microsoft", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Urbana-Champaign;;Asia", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "cd2jyHoFa18", "title": "Learning Neural Processes on the Fly", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks (DNNs) have performed impressively on a wide range of tasks, but they usually require a significant number of training samples to achieve good performance. Thus, DNNs do not work well in low-data regimes because they tend to overfit a small dataset and make poor predictions. In contrast, shallow neural networks (SNNs) generally are robust against overfitting in low-data regimes and converge more quickly than DNNs, but they struggle to represent very complex systems. Hence, DNNs and SNNs have a complementary relationship, and combining their benefits can provide fast-learning capability with high asymptotic performance, as meta-learning does. However, aggregating heterogeneous methods with opposite properties is not trivial, as it can make the combined method inferior to each base method. In this paper, we propose a new algorithm called anytime neural processes that combines DNNs and SNNs and can work in both low-data and high-data regimes. To combine heterogeneous models effectively, we propose a novel aggregation method based on a generalized product-of-experts and a winner-take-all gate network. Moreover, we discuss the theoretical basis of the proposed method.
Experiments on a public dataset show that the proposed method achieves comparable performance with other state-of-the-art methods.", "keywords": "Neural Processes;Gaussian Processes;Uncertainty Quantification;Ensemble Methods;Meta-Learning", "primary_area": "", "supplementary_material": "/attachment/4207ba5b15267c439fa96754d453df4b4192ba0d.zip", "author": "Younghwa Jung;Zhenyuan Yuan;Seung-Woo Seo;Minghui Zhu;Seong-Woo Kim", "authorids": "~Younghwa_Jung1;zqy5086@psu.edu;~Seung-Woo_Seo1;~Minghui_Zhu1;~Seong-Woo_Kim1", "gender": "M;;;;M", "homepage": ";;http://vi.snu.ac.kr;;https://arisnu.squarespace.com/", "dblp": ";;;;00/653", "google_scholar": "Rs_VdmsAAAAJ;;;;VlVqpq8AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Younghwa_Jung1;zqy5086@psu.edu;~Seung-Woo_Seo1;~Minghui_Zhu1;~Seong-Woo_Kim1", "aff": ";;Seoul National University;;Seoul National University", "aff_domain": ";;snu.ac.kr;;snu.ac.kr", "position": ";;Full Professor;;Associate Professor", "bibtex": "@misc{\njung2022learning,\ntitle={Learning Neural Processes on the Fly},\nauthor={Younghwa Jung and Zhenyuan Yuan and Seung-Woo Seo and Minghui Zhu and Seong-Woo Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=cd2jyHoFa18}\n}", "github": "", "project": "", "reviewers": "fJtj;9V1G;etuz;bVPB", "site": "https://openreview.net/forum?id=cd2jyHoFa18", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;3;3", "correctness": "2;3;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "75;33;87;71", "wc_summary_review": "34;44;4;29", "wc_main_review": "1020;245;282;87", "wc_review": "1129;322;373;187", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 66.5, 20.21756661915573 ], "wc_summary_review_avg": [ 27.75, 14.737282653189496 ], "wc_main_review_avg": [ 408.5, 360.5651813472843 ], "wc_review_avg": [ 502.75, 367.8969794657194 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "cdZLe5S0ur", "title": "AQUILA: Communication Efficient Federated Learning with Adaptive Quantization of Lazily-Aggregated Gradients", "track": "main", "status": "Reject", "tldr": "", "abstract": "The development and deployment of federated learning (FL) have been bottlenecked by the heavy communication overheads of high-dimensional models between the distributed client nodes and the central server. To achieve better error-communication tradeoffs, recent efforts have been made to either adaptively reduce the communication frequency by skipping unimportant updates, a.k.a. lazily-aggregated quantization (LAQ), or adjust the quantization bits for each communication. 
In this paper, we propose a unifying communication-efficient framework for FL based on adaptive quantization of lazily-aggregated gradients (AQUILA), which adaptively adjusts two mutually-dependent factors, the communication frequency and the quantization level, in a synergistic way. Specifically, we start from a careful investigation of the classical LAQ scheme and formulate AQUILA as an optimization problem where the optimal quantization level per communication is selected by minimizing the gradient loss caused by update skipping. Meanwhile, we adjust the LAQ strategy to better fit the novel quantization criterion and thus keep the communication frequency at an appropriate level. The effectiveness and convergence of the proposed AQUILA framework are theoretically verified. The experimental results demonstrate that AQUILA can reduce around 50% of overall transmitted bits compared to existing methods while achieving the same level of model accuracy in a number of non-homogeneous FL scenarios, including Non-IID data distribution and heterogeneous model architecture. The proposed AQUILA is highly adaptive and compatible with existing FL settings.", "keywords": "Federated Learning;communication efficiency;adaptive quantization", "primary_area": "", "supplementary_material": "", "author": "Zihao Zhao;Yuzhu Mao;Muhammad Zeeshan;Yang Liu;Tian Lan;Wenbo Ding", "authorids": "~Zihao_Zhao1;~Yuzhu_Mao1;~Muhammad_Zeeshan2;~Yang_Liu59;~Tian_Lan4;~Wenbo_Ding1", "gender": "M;F;M;F;M;M", "homepage": ";;;;https://www2.seas.gwu.edu/~tlan/;http://ssr-group.net/", "dblp": ";;;;;", "google_scholar": "825UyCgAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;;JEieoFsAAAAJ;;xo2FkgIAAAAJ", "orcid": ";;;;;", "linkedin": ";;http://linkedin.com/in/muhammad-zeeshan-198553159;;;", "or_profile": "~Zihao_Zhao1;~Yuzhu_Mao1;~Muhammad_Zeeshan2;~Yang_Liu59;~Tian_Lan4;~Wenbo_Ding1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;George Washington University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;gwu.edu;tsinghua.edu.cn", "position": "MS student;MS student;PhD student;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nzhao2022aquila,\ntitle={{AQUILA}: Communication Efficient Federated Learning with Adaptive Quantization of Lazily-Aggregated Gradients},\nauthor={Zihao Zhao and Yuzhu Mao and Muhammad Zeeshan and Yang Liu and Tian Lan and Wenbo Ding},\nyear={2022},\nurl={https://openreview.net/forum?id=cdZLe5S0ur}\n}", "github": "", "project": "", "reviewers": "Dvx6;9gtf;E4L8", "site": "https://openreview.net/forum?id=cdZLe5S0ur", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;3", "correctness": "4;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;0;3", "wc_summary_paper": "127;43;27", "wc_summary_review": "43;69;82", "wc_main_review": "213;142;262", "wc_review": "383;254;371", "wc_reply_reviewers": "21;0;0", "wc_reply_authors": "509;696;483", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 65.66666666666667, 43.8583578757294 ], "wc_summary_review_avg": [ 64.66666666666667, 16.21384867602041 ], "wc_main_review_avg": [ 205.66666666666666, 49.2634640366356 ],
"wc_review_avg": [ 336.0, 58.18934610390462 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 562.6666666666666, 94.87652794846339 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4954507831936334945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Tsinghua University;George Washington University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.gwu.edu", "aff_unique_abbr": "THU;GWU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "cdwobSbmsjA", "title": "RAVE: A variational autoencoder for fast and high-quality neural audio synthesis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep generative models applied to audio have improved by a large margin the state-of-the-art in many speech and music related tasks. However, as raw waveform modelling remains an inherently difficult task, audio generative models are either computationally intensive, rely on low sampling rates, are complicated to control or restrict the nature of possible signals. Among those models, Variational AutoEncoders (VAE) give control over the generation by exposing latent variables, although they usually suffer from low synthesis quality. In this paper, we introduce a Realtime Audio Variational autoEncoder (RAVE) allowing both fast and high-quality audio waveform synthesis. We introduce a novel two-stage training procedure, namely representation learning and adversarial fine-tuning. We show that using a post-training analysis of the latent space allows a direct control between the reconstruction fidelity and the representation compactness. By leveraging a multi-band decomposition of the raw waveform, we show that our model is the first able to generate 48kHz audio signals, while simultaneously running 20 times faster than real-time on a standard laptop CPU. We evaluate synthesis quality using both quantitative and qualitative subjective experiments and show the superiority of our approach compared to existing models. Finally, we present applications of our model for timbre transfer and signal compression. 
All of our source code and audio examples are publicly available.", "keywords": "Variational Autoencoder;generative models;audio;music;deep learning;representation learning;latent space", "primary_area": "", "supplementary_material": "", "author": "Antoine Caillon;Philippe Esling", "authorids": "~Antoine_Caillon1;~Philippe_Esling1", "gender": "M;M", "homepage": "https://caillonantoine.github.io/;http://esling.github.io", "dblp": ";71/7966", "google_scholar": ";https://scholar.google.fr/citations?user=soZrPYAAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Antoine_Caillon1;~Philippe_Esling1", "aff": "IRCAM, UMR 9912;IRCAM, UMR 9912", "aff_domain": "sorbonne-universite.fr;sorbonne-universite.fr", "position": "PhD student;Associate Professor", "bibtex": "@misc{\ncaillon2022rave,\ntitle={{RAVE}: A variational autoencoder for fast and high-quality neural audio synthesis},\nauthor={Antoine Caillon and Philippe Esling},\nyear={2022},\nurl={https://openreview.net/forum?id=cdwobSbmsjA}\n}", "github": "", "project": "", "reviewers": "Pcii;4x1t;kQnp", "site": "https://openreview.net/forum?id=cdwobSbmsjA", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;3;4", "correctness": "2;3;3", "technical_novelty": "2;2;4", "empirical_novelty": "2;2;3", "wc_summary_paper": "116;49;34", "wc_summary_review": "67;41;41", "wc_main_review": "480;173;96", "wc_review": "663;263;171", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "706;456;92", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 66.33333333333333, 35.64952859280034 ], "wc_summary_review_avg": [ 49.666666666666664, 12.256517540566824 ], "wc_main_review_avg": [ 249.66666666666666, 165.87612513224707 ], "wc_review_avg": [ 365.6666666666667, 213.57486327333143 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 418.0, 252.10050905673845 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.1147078669352809, "corr_recommendation_correctness": 0.8029550685469661, "gs_citation": 181, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6074015230012735092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Institut de Recherche et Coordination Acoustique/Musique", "aff_unique_dep": "UMR 9912", "aff_unique_url": "https://www.ircam.fr", "aff_unique_abbr": "IRCAM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "cggphp7nPuI", "title": "Reasoning-Modulated Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks leverage robust internal representations in order to generalise. Learning them is difficult, and often requires a large training set that covers the data distribution densely. We study a common setting where our task is not purely opaque. Indeed, very often we may have access to information about the underlying system (e.g. that observations must obey certain laws of physics) that any \"tabula rasa\" neural network would need to re-learn from scratch, penalising data efficiency. 
We incorporate this information into a pre-trained reasoning module, and investigate its role in shaping the discovered representations in diverse self-supervised learning settings from pixels. Our approach paves the way for a new class of data-efficient representation learning.", "keywords": "representation learning;algorithmic reasoning;graph neural networks;relational learning", "primary_area": "", "supplementary_material": "", "author": "Petar Veli\u010dkovi\u0107;Matko Bo\u0161njak;Thomas Kipf;Alexander Lerchner;raia hadsell;Razvan Pascanu;Charles Blundell", "authorids": "~Petar_Veli\u010dkovi\u01071;~Matko_Bo\u0161njak2;~Thomas_Kipf2;~Alexander_Lerchner1;~raia_hadsell1;~Razvan_Pascanu1;~Charles_Blundell1", "gender": "M;M;F;M;;;M", "homepage": "https://petar-v.com;;http://www.raiahadsell.com;https://razp.info;http://www.gatsby.ucl.ac.uk/~ucgtcbl/;http://matko.info/;http://tkipf.github.io/", "dblp": "184/4786.html;21/3421;http://dblp.uni-trier.de/pers/hd/h/Hadsell:Raia;65/8368.html;35/8396;39/10827;186/8206", "google_scholar": "https://scholar.google.co.uk/citations?user=kcTK_FAAAAAJ;;EWQnacoAAAAJ;https://scholar.google.ca/citations?user=eSPY8LwAAAAJ;https://scholar.google.co.uk/citations?user=f31mvPsAAAAJ;https://scholar.google.co.uk/citations?user=JDaHecMAAAAJ;83HL5FwAAAAJ", "orcid": "0000-0002-2820-4692;;;;;;", "linkedin": "petarvelickovic;;;;;;thomas-kipf-6b260410a", "or_profile": "~Petar_Veli\u010dkovi\u01071;~Alexander_Lerchner1;~raia_hadsell1;~Razvan_Pascanu1;~Charles_Blundell1;~Matko_Bosnjak1;~Thomas_N._Kipf1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google", "aff_domain": "google.com;deepmind.com;deepmind.com;google.com;google.com;deepmind.com;google.com", "position": "Senior Staff Research Scientist;Research Scientist;Research Scientist;Research Scientist;Research Scientist;Researcher;Research Scientist", "bibtex": "@misc{\nveli{\\v{c}}kovi{\\'c}2022reasoningmodulated,\ntitle={Reasoning-Modulated Representations},\nauthor={Petar Veli{\\v{c}}kovi{\\'c} and Matko Bo{\\v{s}}njak and Thomas Kipf and Alexander Lerchner and raia hadsell and Razvan Pascanu and Charles Blundell},\nyear={2022},\nurl={https://openreview.net/forum?id=cggphp7nPuI}\n}", "github": "", "project": "", "reviewers": "HnMg;Yu3T;o4To;fio5", "site": "https://openreview.net/forum?id=cggphp7nPuI", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;3", "correctness": "4;3;4;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "117;59;351;145", "wc_summary_review": "35;50;60;193", "wc_main_review": "301;662;673;606", "wc_review": "453;771;1084;944", "wc_reply_reviewers": "95;345;253;84", "wc_reply_authors": "493;763;635;474", "reply_reviewers": "1;2;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 168.0, 110.11357772772621 ], "wc_summary_review_avg": [ 84.5, 63.27124149248219 ], "wc_main_review_avg": [ 560.5, 151.96134376873613 ], "wc_review_avg": [ 813.0, 235.56633885171286 ], "wc_reply_reviewers_avg": [ 194.25, 109.75284734347441 ], "wc_reply_authors_avg": [ 591.25, 117.06061464045027 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 
-0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14578096427958625333&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Deep ReLU Networks Preserve Expected Length", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6660", "id": "ci7LBzDn2Q", "poster": "", "openreview": "https://openreview.net/forum?id=ci7LBzDn2Q", "slides": "https://iclr.cc/virtual/2022/poster/6660", "video": "https://iclr.cc/virtual/2022/poster/6660", "author_site": "Boris Hanin, Ryan Jeong, David Rolnick", "tldr": "", "abstract": "Assessing the complexity of functions computed by a neural network helps us understand how the network will learn and generalize. One natural measure of complexity is how the network distorts length - if the network takes a unit-length curve as input, what is the length of the resulting curve of outputs? It has been widely believed that this length grows exponentially in network depth. We prove that in fact this is not the case: the expected length distortion does not grow with depth, and indeed shrinks slightly, for ReLU networks with standard random initialization. We also generalize this result by proving upper bounds both for higher moments of the length distortion and for the distortion of higher-dimensional volumes. These theoretical results are corroborated by our experiments.", "keywords": "deep learning theory;random ReLU networks;length distortion;initialization;expressivity", "primary_area": "", "supplementary_material": "", "author": "Boris Hanin;Ryan Jeong;David Rolnick", "authorids": "~Boris_Hanin1;~Ryan_Jeong1;~David_Rolnick1", "gender": ";M;M", "homepage": "https://hanin.princeton.edu;;http://www.davidrolnick.com/", "dblp": "205/2534;;37/10718", "google_scholar": ";https://scholar.google.com/citations?hl=en;P_luG3cAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Boris_Hanin1;~Ryan_Jeong1;~David_Rolnick1", "aff": "Princeton University;University of Pennsylvania;McGill University", "aff_domain": "princeton.edu;upenn.edu;cs.mcgill.ca", "position": "Assistant Professor;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nhanin2022deep,\ntitle={Deep Re{LU} Networks Preserve Expected Length},\nauthor={Boris Hanin and Ryan Jeong and David Rolnick},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ci7LBzDn2Q}\n}", "github": "", "project": "", "reviewers": "MMSS;mxm7;TwSU;VKDK", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;4;4", "correctness": "4;4;4;4", "technical_novelty": "3;2;4;3", "empirical_novelty": "0;3;2;0", "wc_summary_paper": "152;164;58;192", "wc_summary_review": "71;96;20;47", "wc_main_review": "400;341;281;824", "wc_review": "623;601;359;1063", "wc_reply_reviewers": "17;12;0;6", "wc_reply_authors": "407;195;117;808", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 
1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 141.5, 50.34630075785112 ], "wc_summary_review_avg": [ 58.5, 28.182441342083905 ], "wc_main_review_avg": [ 461.5, 213.47657951166445 ], "wc_review_avg": [ 661.5, 253.8951555268434 ], "wc_reply_reviewers_avg": [ 8.75, 6.378675411086537 ], "wc_reply_authors_avg": [ 381.75, 267.9994169769778 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7196894590859930242&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ci7LBzDn2Q", "email": "princeton.edu;upenn.edu;cs.mcgill.ca", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Princeton University;University of Pennsylvania;McGill University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://www.upenn.edu;https://www.mcgill.ca", "aff_unique_abbr": "Princeton;UPenn;McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Canada" }, { "id": "ciSap6Cw5mk", "title": "MANDERA: Malicious Node Detection in Federated Learning via Ranking", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning is a distributed learning paradigm which seeks to preserve the privacy of each participating node's data. However, federated learning is vulnerable to attacks; of specific interest to us are model integrity attacks. In this paper, we propose a novel method for malicious node detection called MANDERA. By transforming the original message matrix, which assembles the update gradients from local nodes, into a ranking matrix that encodes the relative rankings of the outputs of all local nodes in different parameter dimensions, MANDERA seeks to distinguish the malicious nodes from the benign ones with high efficiency based on key characteristics of the rank domain. We have proved, under mild conditions, that MANDERA is guaranteed to detect all malicious nodes under typical Byzantine attacks with no prior knowledge or history about the participating nodes. The effectiveness of MANDERA is further confirmed by experiments on two classic datasets, CIFAR-10 and MNIST.
Compared to the state-of-the-art methods in the literature for defending against Byzantine attacks, MANDERA is unique in identifying the malicious nodes by ranking, and is robust in effectively defending against a wide range of attacks.", "keywords": "Federated Learning;Data poisoning attack;Byzantine attack;Malicious node detection;Ranking", "primary_area": "", "supplementary_material": "/attachment/d62b5dbc367e675117ccab6958ea1961961e84fd.zip", "author": "Wanchuang Zhu;Benjamin Zi Hao Zhao;Simon Luo;Ke Deng", "authorids": "~Wanchuang_Zhu2;~Benjamin_Zi_Hao_Zhao1;~Simon_Luo1;~Ke_Deng3", "gender": "M;M;;M", "homepage": ";;;http://www.stat.tsinghua.edu.cn/kdeng/", "dblp": ";188/6037;199/2628;", "google_scholar": ";https://scholar.google.com.au/citations?user=USElgcQAAAAJ;;", "orcid": "0000-0002-9736-6062;0000-0002-2774-2675;;0000-0002-4383-8319", "linkedin": ";benjamin-zhao-75819b83/;;", "or_profile": "~Wanchuang_Zhu2;~Benjamin_Zi_Hao_Zhao1;~Simon_Luo1;~Ke_Deng3", "aff": "University of Sydney;Macquarie University;University of Sydney;Tsinghua University", "aff_domain": "sydney.edu.au;mq.edu.au;sydney.edu.au;tsinghua.edu.cn", "position": "Postdoc;Postdoc;Postdoc;Associate Professor", "bibtex": "@misc{\nzhu2022mandera,\ntitle={{MANDERA}: Malicious Node Detection in Federated Learning via Ranking},\nauthor={Wanchuang Zhu and Benjamin Zi Hao Zhao and Simon Luo and Ke Deng},\nyear={2022},\nurl={https://openreview.net/forum?id=ciSap6Cw5mk}\n}", "github": "", "project": "", "reviewers": "FjLC;5jZy;4WMU", "site": "https://openreview.net/forum?id=ciSap6Cw5mk", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "3;3;3", "correctness": "4;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "21;100;45", "wc_summary_review": "11;60;29", "wc_main_review": "223;93;446", "wc_review": "255;253;520", "wc_reply_reviewers": "386;161;104", "wc_reply_authors": "1272;434;709", "reply_reviewers": "2;2;1", "reply_authors": "2;3;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 55.333333333333336, 33.06895153396242 ], "wc_summary_review_avg": [ 33.333333333333336, 20.237478982214054 ], "wc_main_review_avg": [ 254.0, 145.7692240037885 ], "wc_review_avg": [ 342.6666666666667, 125.39626079840748 ], "wc_reply_reviewers_avg": [ 217.0, 121.74563647211345 ], "wc_reply_authors_avg": [ 805.0, 348.78168912181536 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4974715516647577085&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Sydney;Macquarie University;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sydney.edu.au;https://www.mq.edu.au;https://www.tsinghua.edu.cn", "aff_unique_abbr": "USYD;MQ;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Australia;China" }, { "id": "ciTmHV3Pt3v", "title": "Ripple Attention for Visual Perception with Sub-quadratic Complexity", "track": "main", "status": "Withdraw",
"tldr": "", "abstract": "Transformer architectures are now central to modeling in natural language processing tasks. At its heart is the attention mechanism, which enables effective modeling of long-term dependencies in a sequence. Recently, transformers have been successfully applied in the computer vision domain, where 2D images are first segmented into patches and then treated as 1D sequences. Such linearization, however, impairs the notion of spatial locality in images, which bears important visual clues. To bridge the gap, we propose ripple attention, a sub-quadratic attention mechanism for visual perception. In ripple attention, contributions of different tokens to a query are weighted with respect to their relative spatial distances in the 2D space. To favor correlations with vicinal tokens yet permit long-term dependencies, we derive the spatial weights through a stick-breaking transformation. We further design a dynamic programming algorithm that computes weighted contributions for all queries in linear observed time, taking advantage of the summed-area table and recent advances in linearized attention. Extensive experiments and analyses demonstrate the effectiveness of ripple attention on various visual tasks.", "keywords": "attention mechanism;vision transformers;summed-area tables;stick-breaking transforms;dynamic programming", "primary_area": "", "supplementary_material": "", "author": "Lin Zheng;Huijie Pan;Lingpeng Kong", "authorids": "~Lin_Zheng1;~Huijie_Pan1;~Lingpeng_Kong1", "gender": "M;M;M", "homepage": "https://lzhengisme.github.io/;;https://ikekonglp.github.io/", "dblp": ";;144/7656", "google_scholar": "3NXH0t8AAAAJ;;f1hBi5wAAAAJ", "orcid": ";;", "linkedin": ";huijie-pan-8a0a871a4/;", "or_profile": "~Lin_Zheng1;~Huijie_Pan1;~Lingpeng_Kong1", "aff": "The University of Hong Kong;The University of Hong Kong;Department of Computer Science, The University of Hong Kong", "aff_domain": "hku.hk;hku.hk;cs.hku.hk", "position": "PhD student;Undergrad student;Assistant Professor", "bibtex": "@misc{\nzheng2022ripple,\ntitle={Ripple Attention for Visual Perception with Sub-quadratic Complexity},\nauthor={Lin Zheng and Huijie Pan and Lingpeng Kong},\nyear={2022},\nurl={https://openreview.net/forum?id=ciTmHV3Pt3v}\n}", "github": "", "project": "", "reviewers": "Siyf;wmN7;MU1H", "site": "https://openreview.net/forum?id=ciTmHV3Pt3v", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "241;87;98", "wc_summary_review": "312;53;27", "wc_main_review": "790;237;461", "wc_review": "1343;377;586", "wc_reply_reviewers": "270;0;0", "wc_reply_authors": "1005;385;726", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 142.0, 70.14746372226631 ], "wc_summary_review_avg": [ 130.66666666666666, 128.66062161965314 ], "wc_main_review_avg": [ 496.0, 227.11377471801808 ], "wc_review_avg": [ 768.6666666666666, 414.98139181842305 ], "wc_reply_reviewers_avg": [ 90.0, 127.27922061357856 ], "wc_reply_authors_avg": [ 705.3333333333334, 253.53544569196282 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 
3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8391912899448350799&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "A Fine-Tuning Approach to Belief State Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6730", "id": "ckZY7DGa7FQ", "poster": "", "openreview": "https://openreview.net/forum?id=ckZY7DGa7FQ", "slides": "https://iclr.cc/virtual/2022/poster/6730", "video": "https://iclr.cc/virtual/2022/poster/6730", "author_site": "Samuel Sokota, Hengyuan Hu, David Wu, Zico Kolter, Jakob Foerster, Noam Brown", "tldr": "", "abstract": "We investigate the challenge of modeling the belief state of a partially observable Markov system, given sample-access to its dynamics model. This problem setting is often approached using parametric sequential generative modeling methods. However, these methods do not leverage any additional computation at inference time to increase their accuracy. Moreover, applying these methods to belief state modeling in certain multi-agent settings would require passing policies into the belief model---at the time of writing, there have been no successful demonstrations of this. Toward addressing these shortcomings, we propose an inference-time improvement framework for parametric sequential generative modeling methods called belief fine-tuning (BFT). BFT leverages approximate dynamic programming in the form of fine-tuning to determine the model parameters at each time step. It can improve the accuracy of the belief model at test time because it specializes the model to the space of local observations. Furthermore, because this specialization occurs after the action or policy has already been decided, BFT does not require the belief model to process it as input. As a result of the latter point, BFT enables, for the first time, approximate public belief state search in imperfect-information games where the number of possible information states is too large to track tabularly. 
We exhibit these findings on large-scale variants of the benchmark game Hanabi.", "keywords": "imperfect-information;partial observability;search;decision-time planning", "primary_area": "", "supplementary_material": "", "author": "Samuel Sokota;Hengyuan Hu;David J Wu;J Zico Kolter;Jakob Nicolaus Foerster;Noam Brown", "authorids": "~Samuel_Sokota1;~Hengyuan_Hu2;~David_J_Wu1;~J_Zico_Kolter1;~Jakob_Nicolaus_Foerster1;~Noam_Brown2", "gender": "M;;M;;M;M", "homepage": "https://ssokota.github.io/;;https://www.jakobfoerster.com;http://www.cs.cmu.edu/~noamb;;http://www.zicokolter.com", "dblp": "243/5881;;176/5095;https://dblp.uni-trier.de/pers/hd/b/Brown:Noam;;67/2526", "google_scholar": ";;6z4lQzMAAAAJ;RLDbLcUAAAAJ;oF46lMIAAAAJ;UXh1I6UAAAAJ", "orcid": ";0000-0002-5834-4936;;;;", "linkedin": "samuel-sokota-87a153149/;;;;;", "or_profile": "~Samuel_Sokota1;~David_J_Wu1;~Jakob_Nicolaus_Foerster1;~Noam_Brown2;~Hengyuan_Hu1;~Zico_Kolter1", "aff": "Carnegie Mellon University;Meta Facebook;University of Oxford, University of Oxford;Meta Facebook;Facebook AI Research;Carnegie Mellon University", "aff_domain": "cmu.edu;fb.com;eng.ox.ac.uk;facebook.com;fb.com;cmu.edu", "position": "PhD student;Researcher;Associate Professor;Research Scientist;Researcher;Full Professor", "bibtex": "@inproceedings{\nsokota2022a,\ntitle={A Fine-Tuning Approach to Belief State Modeling},\nauthor={Samuel Sokota and Hengyuan Hu and David J Wu and J Zico Kolter and Jakob Nicolaus Foerster and Noam Brown},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ckZY7DGa7FQ}\n}", "github": "", "project": "", "reviewers": "WyMC;kvNa;F3L2;Pqjr", "pdf_size": 0, "recommendation": "3;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "64;93;92;91", "wc_summary_review": "39;131;26;59", "wc_main_review": "268;721;631;566", "wc_review": "371;945;749;716", "wc_reply_reviewers": "0;30;30;36", "wc_reply_authors": "515;555;348;1303", "reply_reviewers": "0;1;1;1", "reply_authors": "2;1;1;3", "recommendation_avg": [ 6.75, 2.165063509461097 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 85.0, 12.144957801491119 ], "wc_summary_review_avg": [ 63.75, 40.5670740872447 ], "wc_main_review_avg": [ 546.5, 169.95072815377992 ], "wc_review_avg": [ 695.25, 206.65959329293185 ], "wc_reply_reviewers_avg": [ 24.0, 14.071247279470288 ], "wc_reply_authors_avg": [ 680.25, 367.83241768501045 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16127011075649173399&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=ckZY7DGa7FQ", "email": "cmu.edu;fb.com;eng.ox.ac.uk;facebook.com;fb.com;cmu.edu", "author_num": 6, "aff_unique_index": "0;1;2;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Meta;University of Oxford", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.cmu.edu;https://meta.com;https://www.ox.ac.uk", "aff_unique_abbr": "CMU;Meta;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "clwYez4n8e8", "title": "Logarithmic Unbiased Quantization: Practical 4-bit Training in Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Quantization of the weights and activations is one of the main methods to reduce the computational footprint of Deep Neural Networks (DNNs) training. Current methods enable 4-bit quantization of the forward phase. However, this constitutes only a third of the training process. Reducing the computational footprint of the entire training process requires the quantization of the neural gradients, i.e., the loss gradients with respect to the outputs of intermediate neural layers. \nIn this work, we examine the importance of having unbiased quantization in quantized neural network training, where to maintain it, and how. Based on this, we suggest a logarithmic unbiased quantization (LUQ) method to quantize both the forward and backward phase to 4-bit, achieving state-of-the-art results in 4-bit training. For example, in ResNet50 on ImageNet, we achieved a degradation of 1.18 %; we further improve this to degradation of only 0.64 % after a single epoch of high precision fine-tuning combined with a variance reduction method. Finally, we suggest a method that exploits the low precision format by avoiding multiplications during two-thirds of the training process, thus reducing by 5x the area used by the multiplier. A reference implementation is supplied in the supplementary material.", "keywords": "quantization;efficient training;4 bit training", "primary_area": "", "supplementary_material": "/attachment/27e793d4bfdd10fe56e13983a1f24d771aa7522f.zip", "author": "Brian Chmiel;Ron Banner;Elad Hoffer;Hilla Ben Yaacov;Daniel Soudry", "authorids": "~Brian_Chmiel1;~Ron_Banner1;~Elad_Hoffer1;hbyaacov@habana.ai;~Daniel_Soudry1", "gender": "M;M;M;;M", "homepage": ";;http://www.deeplearning.co.il;;https://soudry.github.io/", "dblp": "239/6051;03/5857;156/0135;;126/1779", "google_scholar": "https://scholar.google.co.il/citations?user=2U8VtKsAAAAJ;;https://scholar.google.co.il/citations?user=iEfTH7AAAAAJ;;https://scholar.google.co.il/citations?user=AEBWEm8AAAAJ", "orcid": ";;;;0000-0001-9368-6352", "linkedin": "brian-chmiel-89653893/;https://il.linkedin.com/in/ron-banner-69403a51;;;daniel-soudry-2aa3a88/", "or_profile": "~Brian_Chmiel1;~Ron_Banner1;~Elad_Hoffer1;hbyaacov@habana.ai;~Daniel_Soudry1", "aff": "Technion - Israel Institute of Technology, Technion;Intel;Habana Labs (Intel);;Technion - Israel Institute of Technology, Technion", "aff_domain": "campus.technion.ac.il;intel.com;habana.ai;;technion.ac.il", "position": "PhD student;Researcher;Researcher;;Associate Professor", "bibtex": "@misc{\nchmiel2022logarithmic,\ntitle={Logarithmic Unbiased Quantization: Practical 4-bit Training in Deep Learning},\nauthor={Brian Chmiel and Ron Banner and Elad Hoffer and Hilla Ben Yaacov and Daniel Soudry},\nyear={2022},\nurl={https://openreview.net/forum?id=clwYez4n8e8}\n}", "github": "", "project": "", "reviewers": "t7UD;9cKC;Sxpj;na7b", "site": "https://openreview.net/forum?id=clwYez4n8e8", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "52;57;56;93", "wc_summary_review": "63;242;34;72", "wc_main_review": "426;1001;230;393", "wc_review": "541;1300;320;558", "wc_reply_reviewers": "228;756;0;247", "wc_reply_authors": "1661;2986;627;1435", 
"reply_reviewers": "2;1;0;1", "reply_authors": "3;5;1;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.5, 16.560495161679196 ], "wc_summary_review_avg": [ 102.75, 81.61303511082038 ], "wc_main_review_avg": [ 512.5, 291.6337600484553 ], "wc_review_avg": [ 679.75, 370.20425105608933 ], "wc_reply_reviewers_avg": [ 307.75, 276.4456320870344 ], "wc_reply_authors_avg": [ 1677.25, 847.7618695718745 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6938482700058731350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Technion - Israel Institute of Technology;Intel;Habana Labs", "aff_unique_dep": ";Intel Corporation;", "aff_unique_url": "https://www.technion.ac.il;https://www.intel.com;https://www.habana.ai", "aff_unique_abbr": "Technion;Intel;Habana Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Israel;United States" }, { "title": "Leveraging Automated Unit Tests for Unsupervised Code Translation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6467", "id": "cmt-6KtR4c4", "poster": "", "openreview": "https://openreview.net/forum?id=cmt-6KtR4c4", "slides": "https://iclr.cc/virtual/2022/poster/6467", "video": "https://iclr.cc/virtual/2022/poster/6467", "author_site": "Baptiste Roziere, Jie Zhang, Fran\u00e7ois Charton, Mark Harman, Gabriel Synnaeve, Guillaume Lample", "tldr": "", "abstract": "With little to no parallel data available for programming languages, unsupervised methods are well-suited to source code translation. However, the majority of unsupervised machine translation approaches rely on back-translation, a method developed in the context of natural language translation and one that inherently involves training on noisy inputs. Unfortunately, source code is highly sensitive to small changes; a single token can result in compilation failures or erroneous programs, unlike natural languages where small inaccuracies may not change the meaning of a sentence. To address this issue, we propose to leverage an automated unit-testing system to filter out invalid translations, thereby creating a fully tested parallel corpus. We found that fine-tuning an unsupervised model with this filtered data set significantly reduces the noise in the translations so-generated, comfortably outperforming the state-of-the-art for all language pairs studied. 
In particular, for Java\u2192Python and Python\u2192C++ we outperform the best previous methods by more than 16% and 24% respectively, reducing the error rate by more than 35%.", "keywords": "unsupervised;translation;code;self-training;pseudo-labelling;unit tests;programming languages;deep learning;transformer", "primary_area": "", "supplementary_material": "/attachment/200f116f8517f802f4c65a6b62fb95224f70b3cc.zip", "author": "Baptiste Roziere;Jie Zhang;Francois Charton;Mark Harman;Gabriel Synnaeve;Guillaume Lample", "authorids": "~Baptiste_Roziere1;~Jie_Zhang21;~Francois_Charton1;~Mark_Harman1;~Gabriel_Synnaeve1;~Guillaume_Lample1", "gender": ";F;M;M;M;M", "homepage": ";https://sites.google.com/view/jie-zhang;;http://www0.cs.ucl.ac.uk/staff/M.Harman/;;", "dblp": ";84/6889-50;255/5318;h/MarkHarman.html;http://dblp.uni-trier.de/pers/hd/s/Synnaeve:Gabriel;", "google_scholar": "CrSf2CQAAAAJ;rPWRqf8AAAAJ;;https://scholar.google.com.tw/citations?user=IwSN8IgAAAAJ;wN9rBkcAAAAJ;H7sVDmIAAAAJ", "orcid": ";;;0000-0002-5864-4488;;", "linkedin": ";jie-zhang-5326aa187/;fran%C3%A7ois-charton-214187120/;markharman/?originalSubdomain=uk;;", "or_profile": "~Baptiste_Roziere1;~Jie_Zhang21;~Francois_Charton1;~Mark_Harman1;~Gabriel_Synnaeve1;~Guillaume_Lample1", "aff": "Meta AI;King's College London, University of London;Meta Facebook;University College London, University of London;Meta Facebook;Meta Facebook", "aff_domain": "fb.com;kcl.ac.uk;fb.com;ucl.ac.uk;fb.com;fb.com", "position": "Research assistant;Lecturer;Research Engineer;Full Professor;Research Scientist;Researcher", "bibtex": "@inproceedings{\nroziere2022leveraging,\ntitle={Leveraging Automated Unit Tests for Unsupervised Code Translation},\nauthor={Baptiste Roziere and Jie Zhang and Francois Charton and Mark Harman and Gabriel Synnaeve and Guillaume Lample},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cmt-6KtR4c4}\n}", "github": "", "project": "", "reviewers": "zd7L;a7Vt;LnDd;19G3", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;3;4;4", "correctness": "4;1;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "240;57;87;61", "wc_summary_review": "127;64;34;35", "wc_main_review": "890;237;476;312", "wc_review": "1257;358;597;408", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1318;603;798;784", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 111.25, 75.22092461542866 ], "wc_summary_review_avg": [ 65.0, 37.769034936042516 ], "wc_main_review_avg": [ 478.75, 252.67704189340193 ], "wc_review_avg": [ 655.0, 358.8126251959371 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 875.75, 266.66493489021013 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.15713484026367722, "gs_citation": 137, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7490980057630535653&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=cmt-6KtR4c4", "email": "fb.com;kcl.ac.uk;fb.com;ucl.ac.uk;fb.com;fb.com", "author_num": 
6, "aff_unique_index": "0;1;0;2;0;0", "aff_unique_norm": "Meta;King's College London;University College London", "aff_unique_dep": "Meta AI;;", "aff_unique_url": "https://meta.com;https://www.kcl.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "Meta;KCL;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "coPc74qe9s", "title": "A Transferable General-Purpose Predictor for Neural Architecture Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Understanding and modelling the performance of neural architectures is key to Neural Architecture Search (NAS). Performance predictors for neural architectures are widely used in low-cost NAS and achieve high ranking correlations between predicted and ground truth performance in several search spaces. However, existing predictors are often designed based on network encodings specific to a predefined search space and are not generalizable across search spaces or to new families of architectures. In this work, we propose a transferable neural predictor for NAS that can generalize across architecture families, by representing any given candidate Convolutional Neural Network with a computation graph that consists of only primitive operators. Further combined with Contrastive Learning, we propose a semi-supervised graph representation learning procedure that is able to leverage both labelled accuracies and unlabeled information of architectures from multiple families to train universal embeddings of computation graphs and the performance predictor. Experiments conducted on three different NAS benchmarks, including NAS-Bench-101, NAS-Bench-201, and NAS-Bench-301, demonstrate that a predictor pre-trained on other families produces superior transferability when applied to a new family of architectures with a completely different design, after fine-tuning on a small amount of data. We then show that when the proposed transferable predictor is used in NAS, it achieves search results that are comparable to the state-of-the-arts on NAS-Bench-101 at a low evaluation cost.", "keywords": "Neural Architecture Search;Performance Estimation;Neural Predictor", "primary_area": "", "supplementary_material": "", "author": "Fred X. Han;Fabian Chudak;Keith G. 
Mills;Mohammad Salameh;Parsa Riahi;Jialin Zhang;Wei Lu;SHANGLING JUI;Di Niu", "authorids": "~Fred_X._Han1;fabian.chudak@huawei.com;~Keith_G._Mills1;~Mohammad_Salameh1;priahi12@gmail.com;zhangjialin10@hisilicon.com;robin.luwei@hisilicon.com;~SHANGLING_JUI1;~Di_Niu1", "gender": ";;M;M;;;;M;M", "homepage": ";;https://kgmills.github.io/;;;;;;https://www.ualberta.ca/~dniu", "dblp": ";;299/5864;91/9402;;;;;82/4953", "google_scholar": ";;CBOD_ngAAAAJ;https://scholar.google.ca/citations?hl=en;;;;;https://scholar.google.ca/citations?user=3kC5OogAAAAJ", "orcid": ";;0000-0001-6054-1798;;;;;0000-0002-1047-4264;0000-0002-5250-7327", "linkedin": ";;kgmills/;mohammadsalameh;;;;;", "or_profile": "~Fred_X._Han1;fabian.chudak@huawei.com;~Keith_G._Mills1;~Mohammad_Salameh1;priahi12@gmail.com;zhangjialin10@hisilicon.com;robin.luwei@hisilicon.com;~SHANGLING_JUI1;~Di_Niu1", "aff": ";;Huawei Technologies Ltd.;Huawei Technologies Ltd.;;;;Huawei Technologies Ltd.;University of Alberta", "aff_domain": ";;huawei.com;huawei.com;;;;huawei.com;ualberta.ca", "position": ";;Research Intern;Principal Researcher;;;;Principal Researcher;Associate Professor", "bibtex": "@misc{\nhan2022a,\ntitle={A Transferable General-Purpose Predictor for Neural Architecture Search},\nauthor={Fred X. Han and Fabian Chudak and Keith G. Mills and Mohammad Salameh and Parsa Riahi and Jialin Zhang and Wei Lu and SHANGLING JUI and Di Niu},\nyear={2022},\nurl={https://openreview.net/forum?id=coPc74qe9s}\n}", "github": "", "project": "", "reviewers": "RPMD;hA6J;mPwn;eANR", "site": "https://openreview.net/forum?id=coPc74qe9s", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;5;3;4", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "26;59;44;78", "wc_summary_review": "32;35;26;3", "wc_main_review": "241;222;297;307", "wc_review": "299;316;367;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 51.75, 19.13602623325961 ], "wc_summary_review_avg": [ 24.0, 12.549900398011133 ], "wc_main_review_avg": [ 266.75, 36.05811281806079 ], "wc_review_avg": [ 342.5, 36.28015986734347 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gqSlLnV92-IJ:scholar.google.com/&scioq=A+Transferable+General-Purpose+Predictor+for+Neural+Architecture+Search&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Huawei;University of Alberta", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;https://www.ualberta.ca", "aff_unique_abbr": "Huawei;UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;Canada" }, { "id": "coQhmtxr5SN", "title": "H-Entropy Search: Generalizing Bayesian Optimization with a Decision-theoretic Uncertainty Measure", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Bayesian 
optimization (BO) is a popular method for efficiently inferring optima of an expensive black-box function via a sequence of queries. Existing information-theoretic BO procedures aim to make queries that most reduce the uncertainty about optima, where the uncertainty is captured by Shannon entropy. However, an optimal measure of uncertainty would, ideally, factor in how we intend to use the inferred quantity in some downstream procedure. In this paper, we instead consider the H-entropy, a generalization of Shannon entropy from work in statistical decision theory (DeGroot, 1962; Rao, 1984), which contains a broad class of uncertainty measures parameterized by a problem-specific loss function corresponding to a downstream task. We first show that special cases of the H-entropy lead to popular acquisition functions used in BO procedures such as knowledge gradient, expected improvement, and entropy search. We then show how alternative choices for the loss yield a flexible family of acquisition functions for a variety of specialized optimization tasks, including variants of top-k estimation, level set estimation, and multi-valued search. For special cases of the loss and design space, we develop gradient-based methods to efficiently optimize our proposed family of acquisition functions, and demonstrate that the resulting BO procedure shows strong empirical performance on a diverse set of optimization tasks.", "keywords": "Bayesian optimization;entropy search;knowledge gradient;Bayesian optimal experimental design", "primary_area": "", "supplementary_material": "", "author": "Willie Neiswanger;Lantao Yu;Shengjia Zhao;Chenlin Meng;Stefano Ermon", "authorids": "~Willie_Neiswanger2;~Lantao_Yu2;~Shengjia_Zhao1;~Chenlin_Meng1;~Stefano_Ermon1", "gender": "M;M;M;F;M", "homepage": "https://willieneis.github.io/;http://lantaoyu.com/;http://sjzhao.me;https://chenlin9.github.io/;http://cs.stanford.edu/~ermon/", "dblp": "120/7593.html;186/7892;173/5122;227/2517;47/8135", "google_scholar": "QwKHApEAAAAJ;Ixg9n-EAAAAJ;;nEFU7wIAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Willie_Neiswanger2;~Lantao_Yu2;~Shengjia_Zhao1;~Chenlin_Meng1;~Stefano_Ermon1", "aff": "Stanford University;Computer Science Department, Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "Postdoc;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nneiswanger2022hentropy,\ntitle={H-Entropy Search: Generalizing Bayesian Optimization with a Decision-theoretic Uncertainty Measure},\nauthor={Willie Neiswanger and Lantao Yu and Shengjia Zhao and Chenlin Meng and Stefano Ermon},\nyear={2022},\nurl={https://openreview.net/forum?id=coQhmtxr5SN}\n}", "github": "", "project": "", "reviewers": "YEC3;QqKr;NMWE;jUAT", "site": "https://openreview.net/forum?id=coQhmtxr5SN", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "3;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "0;3;3;4", "wc_summary_paper": "71;176;49;30", "wc_summary_review": "18;106;41;29", "wc_main_review": "423;315;664;405", "wc_review": "512;597;754;464", "wc_reply_reviewers": "0;267;40;0", "wc_reply_authors": "475;630;643;367", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 
2.5, 1.5 ], "wc_summary_paper_avg": [ 81.5, 56.45573487255303 ], "wc_summary_review_avg": [ 48.5, 34.1796723214252 ], "wc_main_review_avg": [ 451.75, 129.19244366448063 ], "wc_review_avg": [ 581.75, 110.26417142481051 ], "wc_reply_reviewers_avg": [ 76.75, 111.04813145658957 ], "wc_reply_authors_avg": [ 528.75, 114.40798704635966 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": -0.5773502691896258, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-gKvFZWtk7UJ:scholar.google.com/&scioq=H-Entropy+Search:+Generalizing+Bayesian+Optimization+with+a+Decision-theoretic+Uncertainty+Measure&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "FILIP: Fine-grained Interactive Language-Image Pre-Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6909", "id": "cpDhcsEDC2", "poster": "", "openreview": "https://openreview.net/forum?id=cpDhcsEDC2", "slides": "https://iclr.cc/virtual/2022/poster/6909", "video": "https://iclr.cc/virtual/2022/poster/6909", "author_site": "Lewei Yao, Runhui Huang, LU HOU, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, Chunjing Xu", "tldr": "", "abstract": "Unsupervised large-scale vision-language pre-training has shown promising advances on various downstream tasks. Existing methods often model the cross-modal interaction either via the similarity of the global feature of each modality which misses sufficient information, or finer-grained interactions using cross/self-attention upon visual and textual tokens. However, cross/self-attention suffers from inferior efficiency in both training and inference. In this paper, we introduce a large-scale Fine-grained Interactive Language-Image Pre-training (FILIP) to achieve finer-level alignment through a cross-modal late interaction mechanism, which uses a token-wise maximum similarity between visual and textual tokens to guide the contrastive objective. FILIP successfully leverages the finer-grained expressiveness between image patches and textual words by modifying only contrastive loss, while simultaneously gaining the ability to pre-compute image and text representations offline at inference, keeping both large-scale training and inference efficient. Furthermore, we construct a new large-scale image-text pair dataset called FILIP300M for pre-training. Experiments show that FILIP achieves state-of-the-art performance on multiple downstream vision-language tasks including zero-shot image classification and image-text retrieval. 
The visualization on word-patch alignment further shows that FILIP can learn meaningful fine-grained features with promising localization ability.", "keywords": "Visual-language pretraining;Language-Image Pretraining;Multi-modality model", "primary_area": "", "supplementary_material": "", "author": "Lewei Yao;Runhui Huang;Lu Hou;Guansong Lu;Minzhe Niu;Hang Xu;Xiaodan Liang;Zhenguo Li;Xin Jiang;Chunjing Xu", "authorids": "~Lewei_Yao1;huangrh9@mail2.sysu.edu.cn;~Lu_Hou2;~Guansong_Lu1;~Minzhe_Niu1;~Hang_Xu1;~Xiaodan_Liang2;~Zhenguo_Li1;~Xin_Jiang1;~Chunjing_Xu1", "gender": "M;;;M;M;M;F;M;M;M", "homepage": ";;;;https://github.com/nmzfrank;;https://www.sysu-hcp.net/;http://www.ee.columbia.edu/~zgli/;;", "dblp": "254/1943.html;;;220/3032;217/1826;;;23/6479;42/4142-2;", "google_scholar": "hqDyTg8AAAAJ;;;YIt8thUAAAAJ;;https://scholar.google.com.hk/citations?user=J_8TX6sAAAAJ;voxznZAAAAAJ;XboZC1AAAAAJ;DUfcez0AAAAJ;-CJ5LkMAAAAJ", "orcid": ";;;;;0000-0003-3645-8972;;;0000-0002-9117-8247;", "linkedin": ";;;;;;;;xin-jiang-9577b76/;", "or_profile": "~Lewei_Yao1;huangrh9@mail2.sysu.edu.cn;~Lu_Hou2;~Guansong_Lu1;~Minzhe_Niu1;~Hang_Xu1;~Xiaodan_Liang2;~Zhenguo_Li1;~Xin_Jiang1;~Chunjing_Xu1", "aff": "Hong Kong University of Science and Technology;;;Huawei;Huawei Technologies Ltd.;Huawei Noah\u2018s Ark Lab;SUN YAT-SEN UNIVERSITY;Huawei Noah's Ark Lab;Noah\u2019s Ark Lab, Huawei Technologies;", "aff_domain": "ust.hk;;;huawei.com;huawei.com;huawei.com;sysu.edu.cn;huawei.com;huawei.com;", "position": "PhD student;;;Researcher;Researcher;Researcher;Associate Professor;Principal Researcher;Principal Researcher;", "bibtex": "@inproceedings{\nyao2022filip,\ntitle={{FILIP}: Fine-grained Interactive Language-Image Pre-Training},\nauthor={Lewei Yao and Runhui Huang and Lu Hou and Guansong Lu and Minzhe Niu and Hang Xu and Xiaodan Liang and Zhenguo Li and Xin Jiang and Chunjing Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cpDhcsEDC2}\n}", "github": "", "project": "", "reviewers": "gJkL;AMdt;WLou;aFb7", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "126;88;55;58", "wc_summary_review": "28;106;23;81", "wc_main_review": "593;678;462;239", "wc_review": "747;872;540;378", "wc_reply_reviewers": "54;40;38;34", "wc_reply_authors": "2597;1312;1360;1063", "reply_reviewers": "1;1;1;1", "reply_authors": "5;2;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 81.75, 28.621451745150875 ], "wc_summary_review_avg": [ 59.5, 35.174564673923115 ], "wc_main_review_avg": [ 493.0, 165.60646122660793 ], "wc_review_avg": [ 634.25, 189.59479818813594 ], "wc_reply_reviewers_avg": [ 41.5, 7.533259586659682 ], "wc_reply_authors_avg": [ 1583.0, 596.1891478381672 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 672, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7992693933021693275&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=cpDhcsEDC2", "email": 
"ust.hk;;;huawei.com;huawei.com;huawei.com;sysu.edu.cn;huawei.com;huawei.com;", "author_num": 10, "aff_unique_index": "0;1;1;1;2;1;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei;Sun Yat-sen University", "aff_unique_dep": ";Huawei Technologies Co., Ltd.;", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com;http://www.sysu.edu.cn", "aff_unique_abbr": "HKUST;Huawei;SYSU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "cpstx0xuvRY", "title": "Information-Theoretic Generalization Bounds for Iterative Semi-Supervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider iterative semi-supervised learning (SSL) algorithms that iteratively generate pseudo-labels for a large amount unlabelled data to progressively refine the model parameters. In particular, we seek to understand the behaviour of the {\\em generalization error} of iterative SSL algorithms using information-theoretic principles. To obtain bounds that are amenable to numerical evaluation, we first work with a simple model---namely, the binary Gaussian mixture model. Our theoretical results suggest that when the class conditional variances are not too large, the upper bound on the generalization error decreases monotonically with the number of iterations, but quickly saturates. The theoretical results on the simple model are corroborated by extensive experiments on several benchmark datasets such as the MNIST and CIFAR datasets in which we notice that the generalization error improves after several pseudo-labelling iterations, but saturates afterwards.", "keywords": "Generalization error;Information theory;Semi-supervised learning", "primary_area": "", "supplementary_material": "/attachment/826ca5bc09500dc05003f14df14b4fe8583dc7b1.zip", "author": "Haiyun He;Hanshu YAN;Vincent Tan", "authorids": "~Haiyun_He1;~Hanshu_YAN2;~Vincent_Tan1", "gender": "F;M;M", "homepage": "https://haiyun-he.github.io;;https://www.ece.nus.edu.sg/stfpage/vtan/pubs.htm", "dblp": ";243/3583;60/2327", "google_scholar": ";MG817V4AAAAJ;dJoAVvAAAAAJ", "orcid": ";;0000-0002-5008-4527", "linkedin": ";;", "or_profile": "~Haiyun_He1;~Hanshu_YAN2;~Vincent_Tan1", "aff": "National University of Singapore;National University of Singapore (NUS);", "aff_domain": "u.nus.edu;u.nus.edu;", "position": "PhD student;PhD student;", "bibtex": "@misc{\nhe2022informationtheoretic,\ntitle={Information-Theoretic Generalization Bounds for Iterative Semi-Supervised Learning},\nauthor={Haiyun He and Hanshu YAN and Vincent Tan},\nyear={2022},\nurl={https://openreview.net/forum?id=cpstx0xuvRY}\n}", "github": "", "project": "", "reviewers": "7Xsu;8ena;5JBw;hqP3", "site": "https://openreview.net/forum?id=cpstx0xuvRY", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;3;3;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "103;191;68;103", "wc_summary_review": "22;64;211;46", "wc_main_review": "249;461;229;252", "wc_review": "374;716;508;401", "wc_reply_reviewers": "295;0;0;0", "wc_reply_authors": "765;388;821;883", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], 
"wc_summary_paper_avg": [ 116.25, 45.46083479215928 ], "wc_summary_review_avg": [ 85.75, 73.83215762796046 ], "wc_main_review_avg": [ 297.75, 94.66618984621701 ], "wc_review_avg": [ 499.75, 134.53322080437977 ], "wc_reply_reviewers_avg": [ 73.75, 127.7387470582047 ], "wc_reply_authors_avg": [ 714.25, 192.92922925259407 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3200619351620143297&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "id": "cqHeSMTkoBm", "title": "Learning Multi-Objective Curricula for Deep Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Various automatic curriculum learning (ACL) methods have been proposed to improve the sample efficiency and final performance of deep reinforcement learning (DRL). They are designed to control how a DRL agent collects data, which is inspired by how humans gradually adapt their learning processes to their capabilities. For example, ACL can be used for subgoal generation, reward shaping, environment generation, or initial state generation. However, prior work only considers curriculum learning following one of the aforementioned predefined paradigms. It is unclear which of these paradigms are complementary, and how the combination of them can be learned from interactions with the environment. Therefore, in this paper, we propose a unified automatic curriculum learning framework to create multi-objective but coherent curricula that are generated by a set of parametric curriculum modules. Each curriculum module is instantiated as a neural network and is responsible for generating a particular curriculum. In order to coordinate those potentially conflicting modules in unified parameter space, we propose a multi-task hyper-net learning framework that uses a single hyper-net to parameterize all those curriculum modules. In addition to existing hand-designed curricula paradigms, we further design a flexible memory mechanism to learn an abstract curriculum, which may otherwise be difficult to design manually. 
We evaluate our method on a series of robotic manipulation tasks and demonstrate its superiority over other state-of-the-art ACL methods in terms of sample efficiency and final performance.", "keywords": "Curriculum Learning;Reinforcement learning;Hyper-network", "primary_area": "", "supplementary_material": "/attachment/32b8297e098428c129150fbee7f5a08402f79b7d.zip", "author": "Jikun Kang;Miao Liu;Abhinav Gupta;Christopher Pal;Xue Liu;Jie Fu", "authorids": "~Jikun_Kang1;~Miao_Liu1;~Abhinav_Gupta2;~Christopher_Pal1;~Xue_Liu1;~Jie_Fu2", "gender": "M;M;M;;M;M", "homepage": "https://luciferkonn.github.io;https://sites.google.com/view/miaoliuhome;https://www.guabhinav.com;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao;http://www.cs.mcgill.ca/~xueliu/;https://bigaidream.github.io/", "dblp": "299/0233;;36/7024-2;45/1217;l/XueLiu;", "google_scholar": "Jikun%20Kang;7QHvAEYAAAAJ;jAaCd7YAAAAJ;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ;https://scholar.google.com.tw/citations?user=rfLIRakAAAAJ;66osleIAAAAJ", "orcid": "0009-0001-1334-7092;;;;;0000-0002-4494-843X", "linkedin": "kang-jikun-91993814b/;miao-liu-3273a32b;backpropper;;;", "or_profile": "~Jikun_Kang1;~Miao_Liu1;~Abhinav_Gupta2;~Christopher_Pal1;~Xue_Liu1;~Jie_Fu1", "aff": "McGill University;International Business Machines;Meta AI;Polytechnique Montreal;McGill University;University of Montreal", "aff_domain": "mcgill.ca;ibm.com;meta.com;polymtl.ca;mcgill.ca;umontreal.ca", "position": "PhD student;Research Staff Member;Research Intern;Full Professor;Full Professor;Postdoc", "bibtex": "@misc{\nkang2022learning,\ntitle={Learning Multi-Objective Curricula for Deep Reinforcement Learning},\nauthor={Jikun Kang and Miao Liu and Abhinav Gupta and Christopher Pal and Xue Liu and Jie Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=cqHeSMTkoBm}\n}", "github": "", "project": "", "reviewers": "iZmE;Fq4s;roms;7obo", "site": "https://openreview.net/forum?id=cqHeSMTkoBm", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "124;55;61;25", "wc_summary_review": "68;23;20;15", "wc_main_review": "527;277;442;222", "wc_review": "719;355;523;262", "wc_reply_reviewers": "1478;294;47;340", "wc_reply_authors": "3467;1416;594;483", "reply_reviewers": "3;2;1;2", "reply_authors": "8;3;2;3", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 66.25, 36.02342987556848 ], "wc_summary_review_avg": [ 31.5, 21.266170318136737 ], "wc_main_review_avg": [ 367.0, 122.83118496538246 ], "wc_review_avg": [ 464.75, 174.06087297264713 ], "wc_reply_reviewers_avg": [ 539.75, 553.0390469939713 ], "wc_reply_authors_avg": [ 1490.0, 1196.9617788384055 ], "reply_reviewers_avg": [ 2.0, 0.7071067811865476 ], "reply_authors_avg": [ 4.0, 2.345207879911715 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7526178090063818, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6528679197521556343&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0;4", "aff_unique_norm": "McGill University;International Business Machines Corporation;Meta;Polytechnique Montreal;University of Montreal", 
"aff_unique_dep": ";;Meta AI;;", "aff_unique_url": "https://www.mcgill.ca;https://www.ibm.com;https://meta.com;https://www.polymtl.ca;https://wwwumontreal.ca", "aff_unique_abbr": "McGill;IBM;Meta;PolyMTL;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;1;0;0;0", "aff_country_unique": "Canada;United States" }, { "id": "crq5s3LLESc", "title": "On Label Shift in Domain Adaptation via Wasserstein Distance", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study the label shift problem between the source and target domains in general domain adaptation (DA) settings. We consider transformations transporting the target to source domains, which enable us to align the source and target examples. Through those transformations, we define the label shift between two domains via optimal transport and develop the theory to investigate the properties of DA under various DA settings (e.g., closed-set, partial-set, open-set, and universal settings). Inspired from the developed theory, we propose Label and Data Shift Reduction via Optimal Transport (LDROT) which can mitigate the data and label shifts simultaneously. Finally, we conduct comprehensive experiments to verify9our theoretical findings and compare LDROT with state-of-the-art baselines.", "keywords": "Label shift;optimal transport;Wasserstein distance;domain adaptation", "primary_area": "", "supplementary_material": "/attachment/5b8a8eb3c2c47a459cf5d4b2ace5da2e2fa58fcb.zip", "author": "Trung Le;Dat Do;Tuan Nguyen;Huy Nguyen;Nhat Ho;Hung Bui;Dinh Phung", "authorids": "~Trung_Le2;~Dat_Do1;~Tuan_Nguyen5;~Huy_Nguyen5;~Nhat_Ho1;~Hung_Bui1;~Dinh_Phung2", "gender": "M;M;;M;M;M;M", "homepage": ";https://lsa.umich.edu/stats/people/phd-students/dodat.html;https://tuanrpt.github.io/;https://huynm99.github.io/;https://nhatptnk8912.github.io/;https://sites.google.com/site/buihhung/home;https://research.monash.edu/en/persons/dinh-phung", "dblp": ";221/4662;27/6562-4;48/6075;203/4479;;71/5859", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com.au/citations?user=DPsUJFgAAAAJ;_YYwzhQAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ;mDLwSZAAAAAJ;https://scholar.google.com.au/citations?user=OtA9SwIAAAAJ", "orcid": ";;;;;;0000-0002-9977-8247", "linkedin": ";;;huy-nguyen-081199/;nhat-pham-minh-ho-267b8164/;;https://linkedin.com/in/dinh-phung-6b537a6", "or_profile": "~Trung_Le2;~Dat_Do1;~Tuan_Nguyen5;~Huy_Nguyen5;~Nhat_Ho1;~Hung_Bui1;~Dinh_Phung1", "aff": "Monash University;University of Michigan;;VinAI Research;University of Texas, Austin;VinAI Research;Monash University", "aff_domain": "monash.edu;umich.edu;;vinai.io;utexas.edu;vinai.io;monash.edu", "position": "Assistant Professor;PhD student;;Researcher;Assistant Professor;Principal Researcher;Full Professor", "bibtex": "@misc{\nle2022on,\ntitle={On Label Shift in Domain Adaptation via Wasserstein Distance},\nauthor={Trung Le and Dat Do and Tuan Nguyen and Huy Nguyen and Nhat Ho and Hung Bui and Dinh Phung},\nyear={2022},\nurl={https://openreview.net/forum?id=crq5s3LLESc}\n}", "github": "", "project": "", "reviewers": "uzV3;rqfa;CmYM;ZKFg", "site": "https://openreview.net/forum?id=crq5s3LLESc", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "151;60;167;44", "wc_summary_review": "69;59;73;60", "wc_main_review": "1123;510;1001;448", "wc_review": "1343;629;1241;552", 
"wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 105.5, 54.09482415166907 ], "wc_summary_review_avg": [ 65.25, 5.931905258852336 ], "wc_main_review_avg": [ 770.5, 295.4881554309749 ], "wc_review_avg": [ 941.25, 353.6483953024529 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1997158912311199801&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;3;2;0", "aff_unique_norm": "Monash University;University of Michigan;VinAI Research;University of Texas at Austin", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.monash.edu;https://www.umich.edu;https://www.vinai.io/;https://www.utexas.edu", "aff_unique_abbr": "Monash;UM;VinAI;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;2;1;2;0", "aff_country_unique": "Australia;United States;Vietnam" }, { "id": "cuGIoqAJf6p", "title": "Newer is not always better: Rethinking transferability metrics, their peculiarities, stability and performance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Fine-tuning of large pre-trained image and language models on small customized datasets has become increasingly popular for improved prediction and efficient use of limited resources. Fine-tuning requires identification of best models to transfer-learn from and quantifying transferability prevents expensive re-training on all of the candidate models/tasks pairs. In this paper, we show that the statistical problems with covariance estimation drive the poor performance of H-score (Bao et al., 2019) \u2014 a common baseline for newer metrics \u2014 and propose shrinkage-based estimator. This results in up to 80% absolute gain in H-score correlation performance, making it competitive with the state-of-the-art LogME measure by You et al. (2021). Our shrinkage-based H-score is 3-55 times faster to compute compared to LogME. Additionally, we look into a less common setting of target (as opposed to source) task selection. We demonstrate previously overlooked problems in such settings with different number of labels, class-imbalance ratios etc. for some recent metrics e.g., NCE (Tran et al., 2019), LEEP (Nguyen et al., 2020) that resulted in them being misrepresented as leading measures. We propose a correction and recommend measuring correlation performance against relative accuracy in such settings. We also outline the difficulties of comparing feature-dependent metrics, both supervised (e.g. H-score) and unsupervised measures (e.g., Maximum Mean (Long et al., 2015) and Central Moment Discrepancy (Zellinger et al., 2019)), across source models/layers with widely varying feature embedding dimension. We show that dimensionality reduction methods allow for meaningful comparison across models, cheaper computation (6x) and improved correlation performance of some of these measures. 
We investigate performance of 14 different supervised and unsupervised metrics and demonstrate that even unsupervised metrics can identify the leading models for domain adaptation. We support our findings with ~65,000 (fine-tuning trials) experiments.", "keywords": "transferability metrics;fine-tuning;transfer learning;discrepancy measures;domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Shibal Ibrahim;Natalia Ponomareva;Rahul Mazumder", "authorids": "~Shibal_Ibrahim1;~Natalia_Ponomareva1;~Rahul_Mazumder1", "gender": "M;F;M", "homepage": "https://sites.google.com/view/shibal-ibrahim/;;http://www.mit.edu/~rahulmaz/", "dblp": "177/1113;71/6768-1;11/9365.html", "google_scholar": "_ADL3k8AAAAJ;eIdQR5oAAAAJ;cyCp3pIAAAAJ", "orcid": "0000-0002-3300-0213;0009-0005-6761-1468;0000-0003-1384-9743", "linkedin": "shibal-ibrahim-70097b77;;", "or_profile": "~Shibal_Ibrahim1;~Natalia_Ponomareva1;~Rahul_Mazumder1", "aff": "Massachusetts Institute of Technology;Google;Massachusetts Institute of Technology", "aff_domain": "mit.edu;google.com;mit.edu", "position": "PhD student;Software Engineer in Research;Associate Professor", "bibtex": "@misc{\nibrahim2022newer,\ntitle={Newer is not always better: Rethinking transferability metrics, their peculiarities, stability and performance},\nauthor={Shibal Ibrahim and Natalia Ponomareva and Rahul Mazumder},\nyear={2022},\nurl={https://openreview.net/forum?id=cuGIoqAJf6p}\n}", "github": "", "project": "", "reviewers": "M6p2;NC1U;7FSx", "site": "https://openreview.net/forum?id=cuGIoqAJf6p", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "3;3;4", "correctness": "3;3;3", "technical_novelty": "2;3;4", "empirical_novelty": "2;3;3", "wc_summary_paper": "165;56;40", "wc_summary_review": "66;65;54", "wc_main_review": "302;166;80", "wc_review": "533;287;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 87.0, 55.53977553669682 ], "wc_summary_review_avg": [ 61.666666666666664, 5.436502143433364 ], "wc_main_review_avg": [ 182.66666666666666, 91.39414034219529 ], "wc_review_avg": [ 331.3333333333333, 149.87624524549875 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8385846712065119125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://web.mit.edu;https://www.google.com", "aff_unique_abbr": "MIT;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "No Parameters Left Behind: Sensitivity Guided Adaptive Learning Rate for Training Large Transformer Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6667", "id": "cuvga_CiVND", "poster": "", "openreview": 
"https://openreview.net/forum?id=cuvga_CiVND", "slides": "https://iclr.cc/virtual/2022/poster/6667", "video": "https://iclr.cc/virtual/2022/poster/6667", "author_site": "Chen Liang, Haoming Jiang, Simiao Zuo, Xz W, Xiaodong Liu, Jianfeng Gao, Weizhu Chen, Tuo Zhao", "tldr": "", "abstract": "Recent research has shown the existence of significant redundancy in large Transformer models. One can prune the redundant parameters without significantly sacrificing the generalization performance. However, we question whether the redundant parameters could have contributed more if they were properly trained. To answer this question, we propose a novel training strategy that encourages all parameters to be trained sufficiently. Specifically, we adaptively adjust the learning rate for each parameter according to its sensitivity, a robust gradient-based measure reflecting this parameter's contribution to the model performance. A parameter with low sensitivity is redundant, and we improve its fitting by increasing its learning rate. In contrast, a parameter with high sensitivity is well-trained, and we regularize it by decreasing its learning rate to prevent further overfitting. We conduct extensive experiments on natural language understanding, neural machine translation, and image classification to demonstrate the effectiveness of the proposed schedule. Analysis shows that the proposed schedule indeed reduces the redundancy and improves generalization performance.", "keywords": "Training Large Transformer Models;Reducing Model Redundancy;Parameter Sensitivity;Adaptive Learning Rate Method;Model Generalization;Model Pruning", "primary_area": "", "supplementary_material": "/attachment/9f900a4dd519a5510d2149478bbee840169587c4.zip", "author": "Chen Liang;Haoming Jiang;Simiao Zuo;Pengcheng He;Xiaodong Liu;Jianfeng Gao;Weizhu Chen;Tuo Zhao", "authorids": "~Chen_Liang3;~Haoming_Jiang1;~Simiao_Zuo1;~Pengcheng_He2;~Xiaodong_Liu1;~Jianfeng_Gao1;~Weizhu_Chen1;~Tuo_Zhao1", "gender": "F;M;;M;;M;M;M", "homepage": "https://cliang1453.github.io/;https://hmjianggatech.github.io;;;;https://www.microsoft.com/en-us/research/people/jfgao/;https://www.microsoft.com/en-us/research/people/wzchen/;http://www2.isye.gatech.edu/~tzhao80", "dblp": "35/3221-6;230/3684;232/2089;116/8665;65/622;92/5339;79/2536;", "google_scholar": "https://scholar.google.com/citations?hl=en;XaFhuG8AAAAJ;J8TSTXMAAAAJ;https://scholar.google.com/citations?hl=en;NIewcxMAAAAJ;https://scholar.google.com/citations?hl=en;LG_E-4EAAAAJ;EJXN6tYAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Chen_Liang3;~Haoming_Jiang1;~Simiao_Zuo1;~Pengcheng_He2;~Xiaodong_Liu1;~Jianfeng_Gao1;~Weizhu_Chen1;~Tuo_Zhao1", "aff": "Georgia Institute of Technology;Amazon;Georgia Institute of Technology;Microsoft;Microsoft Research;Microsoft Research;Microsoft GenAI;Georgia Institute of Technology", "aff_domain": "gatech.edu;amazon.com;gatech.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;gatech.edu", "position": "PhD student;Principal Researcher;PhD student;Principal Researcher;Researcher;Principal Researcher;Vice President;Associate Professor", "bibtex": "@inproceedings{\nliang2022no,\ntitle={No Parameters Left Behind: Sensitivity Guided Adaptive Learning Rate for Training Large Transformer Models},\nauthor={Chen Liang and Haoming Jiang and Simiao Zuo and Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen and Tuo Zhao},\nbooktitle={International Conference on Learning 
Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cuvga_CiVND}\n}", "github": "", "project": "", "reviewers": "sSHP;MzBV;NSqH;4pzE", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;4;4", "correctness": "4;3;4;3", "technical_novelty": "3;4;2;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "96;60;104;122", "wc_summary_review": "31;80;48;40", "wc_main_review": "218;352;178;230", "wc_review": "345;492;330;392", "wc_reply_reviewers": "88;0;78;0", "wc_reply_authors": "305;751;361;695", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 95.5, 22.555487137279922 ], "wc_summary_review_avg": [ 49.75, 18.471261462065875 ], "wc_main_review_avg": [ 244.5, 64.98269000279997 ], "wc_review_avg": [ 389.75, 63.31024798561446 ], "wc_reply_reviewers_avg": [ 41.5, 41.650330130744464 ], "wc_reply_authors_avg": [ 528.0, 197.0 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17779998406940212088&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=cuvga_CiVND", "email": "gatech.edu;amazon.com;gatech.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;gatech.edu", "author_num": 8, "aff_unique_index": "0;1;0;2;2;2;2;0", "aff_unique_norm": "Georgia Institute of Technology;Amazon;Microsoft", "aff_unique_dep": ";Amazon.com, Inc.;Microsoft Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.amazon.com;https://www.microsoft.com", "aff_unique_abbr": "Georgia Tech;Amazon;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Group-based Interleaved Pipeline Parallelism for Large-scale DNN Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5959", "id": "cw-EmNq5zfD", "poster": "", "openreview": "https://openreview.net/forum?id=cw-EmNq5zfD", "slides": "https://iclr.cc/virtual/2022/poster/5959", "video": "https://iclr.cc/virtual/2022/poster/5959", "author_site": "PengCheng Yang, Xiaoming Zhang, Wenpeng Zhang, Ming Yang, Hong Wei", "tldr": "", "abstract": "The recent trend of using large-scale deep neural networks (DNN) to boost performance has propelled the development of the parallel pipelining technique for efficient DNN training, which has resulted in the development of several prominent pipelines such as GPipe, PipeDream, and PipeDream-2BW. However, the current leading pipeline PipeDream-2BW still suffers from two major drawbacks, i.e., the excessive memory redundancy and the delayed weight updates across all stages. In this work, we propose a novel pipeline named WPipe, which achieves better memory efficiency and fresher weight updates. WPipe uses a novel pipelining scheme that divides model partitions into two groups. 
It moves the forward pass of the next period of weight updates to the front of the backward pass of the current period of weight updates in the first group, retains the order in the second group, and updates each group alternatively. This scheme can eliminate half of the delayed gradients and memory redundancy compared to PipeDream-2BW. The experiments, which train large BERT language models, show that compared to PipeDream-2BW, WPipe achieves $1.4\\times$ acceleration and reduces the memory footprint by 36%, without nearly sacrificing any final model accuracy.", "keywords": "Model parallelism;Pipeline parallelism;Distributed training", "primary_area": "", "supplementary_material": "/attachment/6f606daa0d230a8689305e7361c07f60fb3c95a1.zip", "author": "PengCheng Yang;Xiaoming Zhang;Wenpeng Zhang;Ming Yang;Hong Wei", "authorids": "~PengCheng_Yang4;~Xiaoming_Zhang2;~Wenpeng_Zhang1;~Ming_Yang9;~Hong_Wei3", "gender": "M;M;M;M;M", "homepage": ";;;;", "dblp": ";;203/4474.html;;", "google_scholar": "https://scholar.google.com/citations?hl=en;5Wp32IUAAAAJ;EMMkuFMAAAAJ;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com/citations?view_op=list_works", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~PengCheng_Yang4;~Xiaoming_Zhang2;~Wenpeng_Zhang1;~Ming_Yang9;~Hong_Wei3", "aff": ";Ant Group;Ant Group;;", "aff_domain": ";antgroup.com;ant.com;;", "position": ";Researcher;Researcher;;", "bibtex": "@inproceedings{\nyang2022groupbased,\ntitle={Group-based Interleaved Pipeline Parallelism for Large-scale {DNN} Training},\nauthor={PengCheng Yang and Xiaoming Zhang and Wenpeng Zhang and Ming Yang and Hong Wei},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=cw-EmNq5zfD}\n}", "github": "", "project": "", "reviewers": "VWfQ;fotX;5TDo;rrtJ", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "3;4;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "59;81;63;53", "wc_summary_review": "31;31;113;24", "wc_main_review": "399;288;312;286", "wc_review": "489;400;488;363", "wc_reply_reviewers": "168;177;0;0", "wc_reply_authors": "1008;773;6;419", "reply_reviewers": "1;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.0, 10.44030650891055 ], "wc_summary_review_avg": [ 49.75, 36.62905267680288 ], "wc_main_review_avg": [ 321.25, 46.04006407467305 ], "wc_review_avg": [ 435.0, 55.07721852090935 ], "wc_reply_reviewers_avg": [ 86.25, 86.30867569369838 ], "wc_reply_authors_avg": [ 551.5, 378.34541096728003 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9169493006161777, "corr_recommendation_correctness": 0.3665083330689157, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11056763990335813261&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=cw-EmNq5zfD", "email": ";antgroup.com;ant.com;;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Ant Group", "aff_unique_dep": "", "aff_unique_url": "https://www.antgroup.com", "aff_unique_abbr": "Ant Group", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "czmQDWhGwd9", "title": "Representations of Computer Programs in the Human Brain", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present the first study relating representations of computer programs generated by unsupervised machine learning (ML) models and representations of computer programs in the human brain. We analyze recordings---brain representations---from functional magnetic resonance imaging (fMRI) studies of people comprehending Python code. We discover brain representations, in different and specific regions of the brain, that encode static and dynamic properties of code such as abstract syntax tree (AST)-related information and runtime information. We also map brain representations to representations of a suite of ML models that vary in their complexity. We find that the Multiple Demand system, a system of brain regions previously shown to respond to code, contains information about multiple specific code properties, as well as machine learned representations of code. We make all the corresponding code, data, and analysis publicly available.", "keywords": "ML for PL/SE;ML models of code;Code representations;Brain representations;Cognitive neuroscience;Multivoxel pattern analysis;Representation decoding analysis;Representation similarity analysis;fMRI analysis", "primary_area": "", "supplementary_material": "", "author": "Shashank Srikant;Benjamin Lipkin;Anna A Ivanova;Evelina Fedorenko;Una-May O'Reilly", "authorids": "~Shashank_Srikant1;lipkinb@mit.edu;~Anna_A_Ivanova1;~Evelina_Fedorenko1;~Una-May_O'Reilly1", "gender": ";;F;F;F", "homepage": ";;https://neuranna.mit.edu/;http://evlab.mit.edu;https://alfagroup.csail.mit.edu/unamay", "dblp": "52/8772;;;;o/UnaMayOReilly", "google_scholar": ";;hBUjCB0AAAAJ;1CgET20AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-1184-8299;;0000-0001-6923-8445", "linkedin": ";;;;", "or_profile": "~Shashank_Srikant1;lipkinb@mit.edu;~Anna_A_Ivanova1;~Evelina_Fedorenko1;~Una-May_O'Reilly1", "aff": "Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;;mit.edu;mit.edu;mit.edu", "position": "PhD student;;PhD student;Associate Professor;Principal Researcher", "bibtex": "@misc{\nsrikant2022representations,\ntitle={Representations of Computer Programs in the Human Brain},\nauthor={Shashank Srikant and Benjamin Lipkin and Anna A Ivanova and Evelina Fedorenko and Una-May O'Reilly},\nyear={2022},\nurl={https://openreview.net/forum?id=czmQDWhGwd9}\n}", "github": "", "project": "", "reviewers": "mt3z;ELAX;txju;vpX5", "site": "https://openreview.net/forum?id=czmQDWhGwd9", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;4;3;4", "correctness": "3;4;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "120;73;138;231", "wc_summary_review": "49;44;86;99", "wc_main_review": "941;283;311;1007", "wc_review": "1110;400;535;1337", "wc_reply_reviewers": "260;78;82;196", "wc_reply_authors": "3493;1417;2230;4893", "reply_reviewers": "2;1;1;2", "reply_authors": "6;3;4;10", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 140.5, 57.38684518249805 ], 
"wc_summary_review_avg": [ 69.5, 23.521266972678152 ], "wc_main_review_avg": [ 635.5, 339.44771320484693 ], "wc_review_avg": [ 845.5, 389.3626201884305 ], "wc_reply_reviewers_avg": [ 154.0, 77.3950902835574 ], "wc_reply_authors_avg": [ 3008.25, 1315.7711379643497 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 5.75, 2.680951323690902 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RQ_2zuZ6BhwJ:scholar.google.com/&scioq=Representations+of+Computer+Programs+in+the+Human+Brain&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "d20jtFYzyxe", "title": "A Rate-Distortion Approach to Domain Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Domain generalization deals with the difference in the distribution between the training and testing datasets, i.e., the domain shift problem, by extracting domain-invariant features. In this paper, we propose an information-theoretic approach for domain generalization. We first establish the domain transformation model, mapping a domain-free latent image into a domain. Then, we cast the domain generalization as a rate-distortion problem, and use the information bottleneck penalty to measure how well the domain-free latent image is reconstructed from a compressed representation of a domain-specific image compared to its direct prediction from the domain-specific image itself. We prove that the information bottleneck penalty guarantees that domain-invariant features can be learned. Lastly, we draw links of our proposed method with self-supervised contrastive learning without negative data pairs. Our empirical study on two different tasks verifies the improvement over recent baselines. 
\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihang Chen;Grigorios Chrysos;Volkan Cevher", "authorids": "~Yihang_Chen1;~Grigorios_Chrysos1;~Volkan_Cevher1", "gender": "M;M;M", "homepage": "https://yhangchen.github.io/;https://grigorisg9gr.github.io/;http://lions.epfl.ch", "dblp": ";75/6117-2;70/5301", "google_scholar": "HzlOQRkAAAAJ;1bU041kAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yihang_Chen1;~Grigorios_Chrysos1;~Volkan_Cevher1", "aff": "EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;Swiss Institute of Technology", "aff_domain": "epfl.ch;epfl.ch;epfl.ch", "position": "MS student;Postdoc;Associate Professor", "bibtex": "@misc{\nchen2022a,\ntitle={A Rate-Distortion Approach to Domain Generalization},\nauthor={Yihang Chen and Grigorios Chrysos and Volkan Cevher},\nyear={2022},\nurl={https://openreview.net/forum?id=d20jtFYzyxe}\n}", "github": "", "project": "", "reviewers": "t26w;DWwT;eG9q;38sD", "site": "https://openreview.net/forum?id=d20jtFYzyxe", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;4;5", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;1;2", "wc_summary_paper": "158;58;115;41", "wc_summary_review": "76;103;93;29", "wc_main_review": "396;232;348;518", "wc_review": "630;393;556;588", "wc_reply_reviewers": "0;0;0;160", "wc_reply_authors": "440;420;653;409", "reply_reviewers": "0;0;0;1", "reply_authors": "3;2;2;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 93.0, 46.47042069962354 ], "wc_summary_review_avg": [ 75.25, 28.39344114403888 ], "wc_main_review_avg": [ 373.5, 102.54145503161148 ], "wc_review_avg": [ 541.75, 89.80082126573231 ], "wc_reply_reviewers_avg": [ 40.0, 69.2820323027551 ], "wc_reply_authors_avg": [ 480.5, 100.21102733731453 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.20751433915982243, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7NiOXOUGmvYJ:scholar.google.com/&scioq=A+Rate-Distortion+Approach+to+Domain+Generalization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.ethz.ch", "aff_unique_abbr": "EPFL;EPFL;ETH Zurich", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Non-Linear Operator Approximations for Initial Value Problems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7107", "id": "d2TT6gK9qZn", "poster": "", "openreview": "https://openreview.net/forum?id=d2TT6gK9qZn", "slides": "https://iclr.cc/virtual/2022/poster/7107", "video": "https://iclr.cc/virtual/2022/poster/7107", "author_site": "Gaurav Gupta, Xiongye Xiao, Radu Balan, Paul Bogdan", "tldr": "", "abstract": "Time-evolution of partial differential equations is the key to model several 
dynamical processes and to forecast events, but the operators associated with such problems are non-linear. We propose a Pad\u00e9-approximation-based exponential neural operator scheme for efficiently learning the map between a given initial condition and activities at a later time. Multiwavelet bases are used for space discretization. By explicitly embedding the exponential operators in the model, we reduce the number of training parameters and make the model more data-efficient, which is essential in dealing with scarce real-world datasets. The Pad\u00e9 exponential operator uses a $\\textit{recurrent structure with shared parameters}$ to model the non-linearity, in contrast to recent neural operators that rely on multiple linear operator layers in succession. We show theoretically that the gradients associated with the recurrent Pad\u00e9 network are bounded across the recurrent horizon. We perform experiments on non-linear systems such as Korteweg-de Vries (KdV) and Kuramoto\u2013Sivashinsky (KS) equations to show that the proposed approach achieves the best performance and at the same time is data-efficient. We also show that urgent real-world problems like epidemic forecasting (for example, COVID-19) can be formulated as a 2D time-varying operator problem. The proposed Pad\u00e9 exponential operators yield better prediction results than state-of-the-art forecasting models ($\\textbf{53\\%}$ lower MAE than the best neural operator and $\\textbf{52\\%}$ lower than the best non-neural-operator deep learning model).", "keywords": "exponential operators;initial value problem;pade approximation;multiwavelets;partial differential equations", "primary_area": "", "supplementary_material": "/attachment/4efb22d2272d9a77a6ba7b24001ee48778f337e7.zip", "author": "Gaurav Gupta;Xiongye Xiao;Radu Balan;Paul Bogdan", "authorids": "~Gaurav_Gupta2;~Xiongye_Xiao1;~Radu_Balan1;~Paul_Bogdan1", "gender": "M;M;;M", "homepage": "http://guptagaurav.me/;;http://math.umd.edu/~rvbalan/;https://cps.usc.edu/", "dblp": ";301/0208;02/3242;05/5539", "google_scholar": "Maqaq6MAAAAJ;AvIxA64AAAAJ;YIOLxD4AAAAJ;Xw_v8-gAAAAJ", "orcid": ";0000-0002-3181-7166;0000-0002-6217-3236;0000-0003-2118-0816", "linkedin": "gaurav71531/;;;paul-bogdan-4b098a6/", "or_profile": "~Gaurav_Gupta2;~Xiongye_Xiao1;~Radu_Balan1;~Paul_Bogdan1", "aff": "University of Southern California;University of Southern California;University of Maryland, College Park;University of Southern California", "aff_domain": "usc.edu;usc.edu;umd.edu;usc.edu", "position": "PhD Candidate;PhD student;Professor;Jack Munushian Early Career Chair associate professor", "bibtex": "@inproceedings{\ngupta2022nonlinear,\ntitle={Non-Linear Operator Approximations for Initial Value Problems},\nauthor={Gaurav Gupta and Xiongye Xiao and Radu Balan and Paul Bogdan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=d2TT6gK9qZn}\n}", "github": "", "project": "", "reviewers": "QGHZ;uucn;EkGn;tvEi", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "67;65;36;68", "wc_summary_review": "28;30;27;34", "wc_main_review": "408;350;113;185", "wc_review": "503;445;176;287", "wc_reply_reviewers": "0;0;116;17", "wc_reply_authors": "669;1288;674;594", "reply_reviewers": "0;0;1;1", "reply_authors": "1;3;2;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ],
"technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 59.0, 13.322912594474229 ], "wc_summary_review_avg": [ 29.75, 2.680951323690902 ], "wc_main_review_avg": [ 264.0, 119.55542647659286 ], "wc_review_avg": [ 352.75, 129.0820959699679 ], "wc_reply_reviewers_avg": [ 33.25, 48.27719440895463 ], "wc_reply_authors_avg": [ 806.25, 279.93782791898633 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8320502943378437, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12794317138589206376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=d2TT6gK9qZn", "email": "usc.edu;usc.edu;umd.edu;usc.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Southern California;University of Maryland", "aff_unique_dep": ";", "aff_unique_url": "https://www.usc.edu;https://www/umd.edu", "aff_unique_abbr": "USC;UMD", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Los Angeles;College Park", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "d2XZsOT-_U_", "title": "Match Prediction Using Learned History Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contemporary ranking systems that are based on win/loss history, such as Elo or TrueSkill represent each player using a scalar estimate of ability (plus variance, in the latter case). While easily interpretable, this approach has a number of shortcomings: (i) latent attributes of a player cannot be represented, and (ii) it cannot seamlessly incorporate contextual information (e.g. home-field advantage). In this work, we propose a simple Transformer-based approach for pairwise competitions that recursively operates on game histories, rather than modeling players directly. By characterizing each player entirely by its history, rather than an underlying scalar skill estimate, it is able to make accurate predictions even for new players with limited history. Additionally, it is able to model both transitive and non-transitive relations and can leverage contextual information. When restricted to the same information as Elo and Glicko, our approach significantly outperforms them on predicting the outcome of real-world Chess, Baseball and Ice Hockey games. 
Further gains can be achieved when game meta-data is added.\n", "keywords": "skill ranking;skill rating;skill", "primary_area": "", "supplementary_material": "", "author": "Maxwell Goldstein;Leon Bottou;Rob Fergus", "authorids": "~Maxwell_Goldstein1;~Leon_Bottou1;~Rob_Fergus1", "gender": ";M;M", "homepage": "https://wp.nyu.edu/cilvr/;http://leon.bottou.org;http://cs.nyu.edu/fergus/", "dblp": ";30/1046;77/3763", "google_scholar": ";kbN88gsAAAAJ;https://scholar.google.com.tw/citations?user=GgQ9GEkAAAAJ", "orcid": ";0000-0002-9894-8128;", "linkedin": ";;", "or_profile": "~Maxwell_Goldstein1;~Leon_Bottou1;~Rob_Fergus1", "aff": "New York University;New York University;Google", "aff_domain": "nyu.edu;nyu.edu;google.com", "position": "PhD student;Visiting faculty;Research scientist", "bibtex": "@misc{\ngoldstein2022match,\ntitle={Match Prediction Using Learned History Embeddings},\nauthor={Maxwell Goldstein and Leon Bottou and Rob Fergus},\nyear={2022},\nurl={https://openreview.net/forum?id=d2XZsOT-_U_}\n}", "github": "", "project": "", "reviewers": "ciZW;TCF7;eXjB", "site": "https://openreview.net/forum?id=d2XZsOT-_U_", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;4", "empirical_novelty": "3;2;2", "wc_summary_paper": "66;79;96", "wc_summary_review": "48;42;73", "wc_main_review": "259;336;542", "wc_review": "373;457;711", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "283;181;29", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 80.33333333333333, 12.283683848458853 ], "wc_summary_review_avg": [ 54.333333333333336, 13.424687043734844 ], "wc_main_review_avg": [ 379.0, 119.46826635833746 ], "wc_review_avg": [ 513.6666666666666, 143.68793346075452 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 164.33333333333334, 104.36261569908812 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nSn5ObRp1WIJ:scholar.google.com/&scioq=Match+Prediction+Using+Learned+History+Embeddings&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "New York University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.nyu.edu;https://www.google.com", "aff_unique_abbr": "NYU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "d5IQ3k7ed__", "title": "Finding General Equilibria in Many-Agent Economic Simulations using Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Real economies can be seen as a sequential imperfect-information game with many heterogeneous, interacting strategic agents of various agent types, such as consumers, firms, and governments. Dynamic general equilibrium models are common economic tools to model the economic activity, interactions, and outcomes in such systems.
However, existing analytical and computational methods struggle to find explicit equilibria when all agents are strategic and interact, while joint learning is unstable and challenging. Amongst others, a key reason is that the actions of one economic agent may change the reward function of another agent, e.g., a consumer's expendable income changes when firms change prices or governments change taxes. We show that multi-agent deep reinforcement learning (RL) can discover stable solutions that are $\\epsilon$-Nash equilibria for a meta-game over agent types, in economic simulations with many agents, through the use of structured learning curricula and efficient GPU-only simulation and training. Conceptually, our approach is more flexible and does not need unrealistic assumptions, e.g., market clearing, that are commonly used for analytical tractability. Our GPU implementation enables training and analyzing economies with a large number of agents within reasonable time frames, e.g., training completes within a day. We demonstrate our approach in real-business-cycle models, a representative family of DGE models, with 100 worker-consumers, 10 firms, and a government that taxes and redistributes. We validate the learned meta-game $\\epsilon$-Nash equilibria through approximate best-response analyses, show that RL policies align with economic intuitions, and demonstrate that our approach is constructive, e.g., by explicitly learning a spectrum of meta-game $\\epsilon$-Nash equilibria in open economic models.", "keywords": "reinforcement learning;economics;simulation;multi-agent RL;equilibrium", "primary_area": "", "supplementary_material": "", "author": "Michael Curry;Alexander R Trott;Soham Phade;Yu Bai;Stephan Zheng", "authorids": "~Michael_Curry2;~Alexander_R_Trott1;sphade@salesforce.com;~Yu_Bai1;~Stephan_Zheng1", "gender": "M;M;;;M", "homepage": "https://currymj.github.io;;;https://yubai.org;http://www.stephanzheng.com", "dblp": "255/4719;;;03/6325-17.html;https://dblp.org/pers/hd/z/Zheng:Stephan", "google_scholar": "EOlowBUAAAAJ;rB4bvV0AAAAJ;;owqhKD8AAAAJ;7mnKGGEAAAAJ", "orcid": ";;;;", "linkedin": ";;;;stephanzheng", "or_profile": "~Michael_Curry2;~Alexander_R_Trott1;sphade@salesforce.com;~Yu_Bai1;~Stephan_Zheng1", "aff": "University of Zurich;Salesforce Research;;Salesforce Research;SalesForce.com", "aff_domain": "uzh.ch;salesforce.com;;salesforce.com;salesforce.com", "position": "Postdoc;Research Scientist;;Research Scientist;Lead Research Scientist", "bibtex": "@misc{\ncurry2022finding,\ntitle={Finding General Equilibria in Many-Agent Economic Simulations using Deep Reinforcement Learning},\nauthor={Michael Curry and Alexander R Trott and Soham Phade and Yu Bai and Stephan Zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=d5IQ3k7ed__}\n}", "github": "", "project": "", "reviewers": "pWNT;yaWE;JN4A;AWjh", "site": "https://openreview.net/forum?id=d5IQ3k7ed__", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;3;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "82;55;56;104", "wc_summary_review": "62;54;14;51", "wc_main_review": "464;790;109;214", "wc_review": "608;899;179;369", "wc_reply_reviewers": "0;959;0;32", "wc_reply_authors": "779;4403;104;604", "reply_reviewers": "0;2;0;1", "reply_authors": "1;8;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg":
[ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.25, 20.30240133580262 ], "wc_summary_review_avg": [ 45.25, 18.48479104561369 ], "wc_main_review_avg": [ 394.25, 262.364607940934 ], "wc_review_avg": [ 513.75, 269.4024637971969 ], "wc_reply_reviewers_avg": [ 247.75, 410.8481319173789 ], "wc_reply_authors_avg": [ 1472.5, 1709.960306556851 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.75, 3.031088913245535 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15403514154993293589&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Zurich;Salesforce", "aff_unique_dep": ";Salesforce Research", "aff_unique_url": "https://www.unizh.ch;https://research.salesforce.com", "aff_unique_abbr": "UZH;Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Objects in Semantic Topology", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6223", "id": "d5SCUJ5t1k", "poster": "", "openreview": "https://openreview.net/forum?id=d5SCUJ5t1k", "slides": "https://iclr.cc/virtual/2022/poster/6223", "video": "https://iclr.cc/virtual/2022/poster/6223", "author_site": "Shuo Yang, Peize Sun, Yi Jiang, Xiaobo Xia, Ruiheng Zhang, Zehuan Yuan, Changhu Wang, Ping Luo, Min Xu", "tldr": "", "abstract": "A more realistic object detection paradigm, Open-World Object Detection, has recently attracted increasing research interest in the community. A qualified open-world object detector can not only identify objects of known categories, but also discover unknown objects, and incrementally learn to categorize them when their annotations progressively arrive. Previous works rely on independent modules to recognize unknown categories and perform incremental learning, respectively. In this paper, we provide a unified perspective: Semantic Topology. During the life-long learning of an open-world object detector, all object instances from the same category are assigned to their corresponding pre-defined node in the semantic topology, including the `unknown' category. This constraint builds up discriminative feature representations and consistent relationships among objects, thus enabling the detector to distinguish unknown objects out of the known categories, as well as making learned features of known objects undistorted when learning new categories incrementally.
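A minimal sketch of the constraint just described, assuming pre-defined node embeddings (random vectors or language-model embeddings, per the text) and a simple cosine pull toward each instance's assigned node; the names are hypothetical, not the paper's implementation:

```python
import torch
import torch.nn.functional as F

def semantic_topology_loss(features, labels, nodes):
    # `nodes` holds one fixed embedding per category, including 'unknown'.
    f = F.normalize(features, dim=1)          # (B, D) instance features
    a = F.normalize(nodes[labels], dim=1)     # (B, D) assigned class nodes
    return (1.0 - (f * a).sum(dim=1)).mean()  # cosine distance to the node
```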
Extensive experiments demonstrate that semantic topology, either randomly-generated or derived from a well-trained language model, could outperform the current state-of-the-art open-world object detectors by a large margin, e.g., the absolute open-set error (the number of unknown instances that are wrongly labeled as known) is reduced from 7832 to 2546, exhibiting the inherent superiority of semantic topology on open-world object detection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuo Yang;Peize Sun;Yi Jiang;Xiaobo Xia;Ruiheng Zhang;Zehuan Yuan;Changhu Wang;Ping Luo;Min Xu", "authorids": "~Shuo_Yang5;~Peize_Sun1;~Yi_Jiang2;~Xiaobo_Xia1;~Ruiheng_Zhang1;~Zehuan_Yuan1;~Changhu_Wang3;~Ping_Luo2;~Min_Xu5", "gender": "M;M;M;M;M;M;F;M;", "homepage": "https://faculty.hitsz.edu.cn/yangshuo;https://peizesun.github.io/;https://enjoyyi.github.io/;https://xiaoboxia.github.io/;;https://shallowyuan.github.io/;https://www.uts.edu.au/staff/min.xu;https://changhu.wang;http://luoping.me/", "dblp": "78/1102-6;249/2345;;242/8072;;227/3298;09/0-1.html;30/3393;54/4989-2.html", "google_scholar": "mVtxxCkAAAAJ;Grkp5AQAAAAJ;https://scholar.google.com.hk/citations?user=6dikuoYAAAAJ;jRsugY0AAAAJ;lPEuNiQAAAAJ;;https://scholar.google.com.au/citations?user=Ac6VCMkAAAAJ;DsVZkjAAAAAJ;https://scholar.google.com.hk/citations?hl=en", "orcid": ";;0000-0002-2133-8719;;0000-0002-5460-7196;;0000-0001-9581-8849;;0000-0002-6685-7950", "linkedin": ";;;;;;;;", "or_profile": "~Shuo_Yang5;~Peize_Sun1;~Yi_Jiang2;~Xiaobo_Xia1;~Ruiheng_Zhang1;~Zehuan_Yuan1;~Min_Xu5;~Changhu_Wang1;~Luo_Ping2", "aff": "University of Technology Sydney, Australia;The University of Hong Kong;Bytedance;The University of Sydney;Beijing Institute of Technology;ByteDance Inc.;University of Technology Sydney;ByteDance Inc.;The University of Hong Kong", "aff_domain": "student.uts.edu.au;hku.hk;bytedance.com;sydney.edu.au;bit.edu.cn;bytedance.com;uts.edu.au;bytedance.com;hku.hk", "position": "PhD student;PhD student;Researcher;PhD student;Associate Professor;Researcher;Associate Professor;Director;Assistant Professor", "bibtex": "@inproceedings{\nyang2022objects,\ntitle={Objects in Semantic Topology},\nauthor={Shuo Yang and Peize Sun and Yi Jiang and Xiaobo Xia and Ruiheng Zhang and Zehuan Yuan and Changhu Wang and Ping Luo and Min Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=d5SCUJ5t1k}\n}", "github": "", "project": "", "reviewers": "8C8m;pp7T;JDQB;fqMP", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "3;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "223;120;105;251", "wc_summary_review": "337;48;35;244", "wc_main_review": "561;428;493;464", "wc_review": "1121;596;633;959", "wc_reply_reviewers": "0;0;238;60", "wc_reply_authors": "623;697;710;555", "reply_reviewers": "0;0;2;1", "reply_authors": "1;1;3;3", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 174.75, 63.254940518508114 ], "wc_summary_review_avg": [ 166.0, 128.85068878356842 ], "wc_main_review_avg": [ 486.5, 48.78780585351221 ], "wc_review_avg": [ 827.25, 220.71290741594612 ], "wc_reply_reviewers_avg": [ 74.5, 97.52307419272631 ], "wc_reply_authors_avg": [ 646.25, 62.26305084076751 ], "reply_reviewers_avg": [ 
0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5613182384850284042&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=d5SCUJ5t1k", "email": "student.uts.edu.au;hku.hk;bytedance.com;sydney.edu.au;bit.edu.cn;bytedance.com;uts.edu.au;bytedance.com;hku.hk", "author_num": 9, "aff_unique_index": "0;1;2;3;4;2;0;2;1", "aff_unique_norm": "University of Technology Sydney;University of Hong Kong;ByteDance;University of Sydney;Beijing Institute of Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.uts.edu.au;https://www.hku.hk;https://www.bytedance.com;https://www.sydney.edu.au;http://www.bit.edu.cn/", "aff_unique_abbr": "UTS;HKU;Bytedance;USYD;BIT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1;0;1;1;0;1;1", "aff_country_unique": "Australia;China" }, { "id": "d7-GwtDWNNJ", "title": "Learning Graph Structure from Convolutional Mixtures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning frameworks such as graph neural networks typically rely on a given, fixed graph to exploit relational inductive biases and thus effectively learn from network data. However, assuming the knowledge of said graphs may be untenable in practice, which motivates the problem of inferring graph structure from data. In this paper, we postulate a graph convolutional relationship between the observed and latent graphs, and formulate the graph learning task as a network inverse (deconvolution) problem. In lieu of eigendecomposition-based spectral methods or iterative optimization solutions, we unroll and truncate proximal gradient iterations to arrive at a parameterized neural network architecture that we call a Graph Deconvolution Network (GDN). GDNs can learn a distribution of graphs in a supervised fashion, and perform link-prediction or edge-weight regression tasks by adapting the loss function. Since layers directly operate on, combine, and refine graph objects (instead of node features), GDNs are inherently inductive and can generalize to larger-sized graphs after training.\nAlgorithm unrolling offers an explicit handle on computational complexity; we trade off training time in return for quick approximations to the inverse problem solution, obtained via a forward pass through the learnt model. We corroborate GDN's superior graph recovery performance using synthetic data in supervised settings, as well as its ability to generalize to graphs orders of magnitude larger than those seen in training.
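A toy sketch of the unroll-and-truncate idea, assuming, purely for illustration, that the observation T is a first-order convolutional mixture T = h0*I + h1*A of the latent adjacency A; GDN's actual layers are richer, and all names here are hypothetical:

```python
import torch
import torch.nn as nn

class UnrolledDeconv(nn.Module):
    """Truncated proximal-gradient iterations as learnable layers."""
    def __init__(self, n_layers=8):
        super().__init__()
        self.alpha = nn.Parameter(torch.full((n_layers,), 0.1))  # step sizes
        self.tau = nn.Parameter(torch.full((n_layers,), 0.01))   # thresholds
        self.h = nn.Parameter(torch.tensor([0.0, 1.0]))          # mixture coeffs

    def forward(self, T):
        I = torch.eye(T.shape[-1], device=T.device)
        A = torch.zeros_like(T)
        for a, t in zip(self.alpha, self.tau):
            resid = T - self.h[0] * I - self.h[1] * A      # data-fit residual
            A = torch.relu(A + a * self.h[1] * resid - t)  # prox-gradient step
        return 0.5 * (A + A.transpose(-1, -2))             # symmetrize output
```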
Using the Human Connectome Project-Young Adult neuroimaging dataset, we demonstrate the robustness and representation power of our model by inferring structural brain networks from functional connectivity estimated using fMRI signals.", "keywords": "Graph Neural Network;Graph Signal Processing;Graph Learning;Topology Inference;Algorithm Unrolling", "primary_area": "", "supplementary_material": "/attachment/3d23a51d4a19fe85b9d2d0c265d1b42b00cb971f.zip", "author": "Max Wasserman;Saurabh Sihag;Gonzalo Mateos;Alejandro Ribeiro", "authorids": "~Max_Wasserman1;~Saurabh_Sihag1;~Gonzalo_Mateos1;~Alejandro_Ribeiro1", "gender": ";M;M;M", "homepage": "https://github.com/maxwass;https://sihags.github.io/;https://www.hajim.rochester.edu/ece/sites/gmateos/;https://alelab.seas.upenn.edu", "dblp": ";172/0928;28/7822;32/15", "google_scholar": ";T8D94-QAAAAJ;4QAOifUAAAAJ;7mrPM4kAAAAJ", "orcid": ";;0000-0002-9847-6298;0000-0003-4230-9906", "linkedin": ";;;", "or_profile": "~Max_Wasserman1;~Saurabh_Sihag1;~Gonzalo_Mateos1;~Alejandro_Ribeiro1", "aff": "University of Rochester;University of Pennsylvania;University of Rochester;University of Pennsylvania", "aff_domain": "rochester.edu;upenn.edu;rochester.edu;upenn.edu", "position": "PhD student;Postdoc;Associate Professor;Full Professor", "bibtex": "@misc{\nwasserman2022learning,\ntitle={Learning Graph Structure from Convolutional Mixtures},\nauthor={Max Wasserman and Saurabh Sihag and Gonzalo Mateos and Alejandro Ribeiro},\nyear={2022},\nurl={https://openreview.net/forum?id=d7-GwtDWNNJ}\n}", "github": "", "project": "", "reviewers": "XnFE;CCxm;ADij;9UsD", "site": "https://openreview.net/forum?id=d7-GwtDWNNJ", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;5;4;2", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "58;87;45;72", "wc_summary_review": "10;42;54;58", "wc_main_review": "108;603;458;242", "wc_review": "176;732;557;372", "wc_reply_reviewers": "82;121;0;36", "wc_reply_authors": "747;1850;1003;662", "reply_reviewers": "1;2;0;1", "reply_authors": "2;4;3;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 65.5, 15.660459763365825 ], "wc_summary_review_avg": [ 41.0, 18.841443681416774 ], "wc_main_review_avg": [ 352.75, 190.96514734369725 ], "wc_review_avg": [ 459.25, 207.23823850824442 ], "wc_reply_reviewers_avg": [ 59.75, 45.77321815210287 ], "wc_reply_authors_avg": [ 1065.5, 470.00026595737154 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5639502940429474905&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Rochester;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.rochester.edu;https://www.upenn.edu", "aff_unique_abbr": "U of R;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "FedPara: Low-rank Hadamard Product for Communication-Efficient Federated Learning", "status": 
"Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6263", "id": "d71n4ftoCBy", "poster": "", "openreview": "https://openreview.net/forum?id=d71n4ftoCBy", "slides": "https://iclr.cc/virtual/2022/poster/6263", "video": "https://iclr.cc/virtual/2022/poster/6263", "author_site": "Nam Hyeon-Woo, Moon Ye-Bin, Tae-Hyun Oh", "tldr": "", "abstract": "In this work, we propose a communication-efficient parameterization, $\\texttt{FedPara}$, for federated learning (FL) to overcome the burdens on frequent model uploads and downloads. Our method re-parameterizes weight parameters of layers using low-rank weights followed by the Hadamard product. Compared to the conventional low-rank parameterization, our $\\texttt{FedPara}$ method is not restricted to low-rank constraints, and thereby it has a far larger capacity. This property enables to achieve comparable performance while requiring 3 to 10 times lower communication costs than the model with the original layers, which is not achievable by the traditional low-rank methods. The efficiency of our method can be further improved by combining with other efficient FL optimizers. In addition, we extend our method to a personalized FL application, $\\texttt{pFedPara}$, which separates parameters into global and local ones. We show that $\\texttt{pFedPara}$ outperforms competing personalized FL methods with more than three times fewer parameters.", "keywords": "Federated learning;Parameterization;Communication efficiency", "primary_area": "", "supplementary_material": "", "author": "Nam Hyeon-Woo;Moon Ye-Bin;Tae-Hyun Oh", "authorids": "~Nam_Hyeon-Woo1;~Moon_Ye-Bin1;~Tae-Hyun_Oh3", "gender": "F;M;M", "homepage": "https://sites.google.com/g.postech.edu/moon-ye-bin/\ud648;https://ami.kaist.ac.kr;https://sites.google.com/view/southhw/", "dblp": "299/7654;119/1450;299/7655", "google_scholar": "Nwq4vPAAAAAJ;dMCBjeIAAAAJ;https://scholar.google.fi/citations?user=1jQ1FNUAAAAJ", "orcid": "0000-0002-0390-6567;0000-0003-0468-1571;", "linkedin": "moon-ye-bin-451b5a245/;tae-hyun-oh-at-mit/;nam-hyeon-woo-8397b6246/", "or_profile": "~Moon_Ye-Bin1;~Tae-Hyun_Oh3;~Nam_Hyeon_Woo1", "aff": "Pohang University of Science and Technology;POSTECH;POSTECH", "aff_domain": "postech.edu;postech.ac.kr;postech.ac.kr", "position": "MS student;Assistant Professor;Master", "bibtex": "@inproceedings{\nhyeon-woo2022fedpara,\ntitle={FedPara: Low-rank Hadamard Product for Communication-Efficient Federated Learning},\nauthor={Nam Hyeon-Woo and Moon Ye-Bin and Tae-Hyun Oh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=d71n4ftoCBy}\n}", "github": "", "project": "", "reviewers": "M7KC;yBc3;5RoZ;3Rte", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "2;4;4;2", "correctness": "4;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "103;28;88;85", "wc_summary_review": "63;11;43;37", "wc_main_review": "266;186;337;138", "wc_review": "432;225;468;260", "wc_reply_reviewers": "41;0;23;5", "wc_reply_authors": "721;639;777;456", "reply_reviewers": "1;0;1;1", "reply_authors": "3;1;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.0, 28.53944638566067 ], "wc_summary_review_avg": [ 38.5, 18.567444627627143 ], "wc_main_review_avg": [ 231.75, 
76.04727148294013 ], "wc_review_avg": [ 346.25, 105.25771943187824 ], "wc_reply_reviewers_avg": [ 17.25, 16.161296358893985 ], "wc_reply_authors_avg": [ 648.25, 121.36180412304358 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18061026725686472950&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=d71n4ftoCBy", "email": "postech.edu;postech.ac.kr;postech.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "dAFxBu5OAXh", "title": "Residual Contrastive Learning: Unsupervised Representation Learning from Residuals", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In the era of deep learning, supervised residual learning (ResL) has led to many breakthroughs in low-level vision tasks such as image restoration and enhancement. However, the question of how to formalize and take advantage of unsupervised ResL remains open. \nIn this paper, we consider visual signals with additive noise and propose to build a connection between ResL and self-supervised learning (SSL) via contrastive learning. We present residual contrastive learning (RCL), an unsupervised representation learning framework for downstream low-level vision tasks with noisy inputs. While supervised image reconstruction tasks aim to minimize the residual terms directly, RCL formulates an instance-wise discrimination pretext task by using the residuals as the discriminative feature.
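A minimal sketch of such a pretext task, with the residual of each of two views (e.g., noisy input minus a denoised estimate) used as the feature in a standard InfoNCE instance-discrimination loss; the names and temperature are hypothetical, not the authors' code:

```python
import torch
import torch.nn.functional as F

def residual_infonce(res_a, res_b, temperature=0.1):
    """res_a, res_b: (B, D) residual embeddings of two views per instance."""
    za = F.normalize(res_a, dim=1)
    zb = F.normalize(res_b, dim=1)
    logits = za @ zb.t() / temperature                   # pairwise similarities
    labels = torch.arange(za.shape[0], device=za.device) # positives on diagonal
    return F.cross_entropy(logits, labels)
```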
Empirical results on low-level vision tasks show that RCL is able to learn more robust and transferable representations in comparison to other SSL frameworks when ingesting noisy images, whilst incurring significantly lower annotation costs than fully supervised alternatives.", "keywords": "Self-Supervised Representation Learning;Residual Learning;Contrastive Learning", "primary_area": "", "supplementary_material": "", "author": "Nanqing Dong;Matteo Maggioni;Yongxin Yang;Eduardo P\u00e9rez-Pellitero;Ales Leonardis;Steven McDonagh", "authorids": "~Nanqing_Dong1;~Matteo_Maggioni1;~Yongxin_Yang1;~Eduardo_P\u00e9rez-Pellitero1;~Ales_Leonardis1;~Steven_McDonagh1", "gender": ";;;M;;", "homepage": ";;;https://perezpellitero.github.io;;https://smcdonagh.github.io/", "dblp": "198/1455;;;141/9842;;159/2641", "google_scholar": "0DX2YsQAAAAJ;;;oLWr6EwAAAAJ;;https://scholar.google.co.uk/citations?user=k8-q2AoAAAAJ", "orcid": ";;;;;0000-0001-7025-5197", "linkedin": ";;;;;", "or_profile": "~Nanqing_Dong1;~Matteo_Maggioni1;~Yongxin_Yang1;~Eduardo_P\u00e9rez-Pellitero1;~Ales_Leonardis1;~Steven_McDonagh1", "aff": "University of Oxford;;;Huawei Technologies R&D (UK) Ltd.;;Huawei Technologies Ltd.", "aff_domain": "ox.ac.uk;;;huawei.com;;huawei.com", "position": "PhD student;;;Principal Researcher;;Senior Research Scientist", "bibtex": "@misc{\ndong2022residual,\ntitle={Residual Contrastive Learning: Unsupervised Representation Learning from Residuals},\nauthor={Nanqing Dong and Matteo Maggioni and Yongxin Yang and Eduardo P{\\'e}rez-Pellitero and Ales Leonardis and Steven McDonagh},\nyear={2022},\nurl={https://openreview.net/forum?id=dAFxBu5OAXh}\n}", "github": "", "project": "", "reviewers": "wMFe;Kkbe;wRf6", "site": "https://openreview.net/forum?id=dAFxBu5OAXh", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;3;3", "correctness": "2;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "88;244;54", "wc_summary_review": "67;104;34", "wc_main_review": "592;320;110", "wc_review": "747;668;198", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "789;730;409", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 128.66666666666666, 82.72578531241365 ], "wc_summary_review_avg": [ 68.33333333333333, 28.592928418676454 ], "wc_main_review_avg": [ 340.6666666666667, 197.31756693772152 ], "wc_review_avg": [ 537.6666666666666, 242.33631359928066 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 642.6666666666666, 166.97371715998366 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NaMjoE-4oJEJ:scholar.google.com/&scioq=Residual+Contrastive+Learning:+Unsupervised+Representation+Learning+from+Residuals&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Oxford;Huawei", "aff_unique_dep": ";R&D", "aff_unique_url": "https://www.ox.ac.uk;https://www.huawei.com/uk", "aff_unique_abbr": "Oxford;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "",
"aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;China" }, { "id": "dDARN-TCiA", "title": "Stochastic Reweighted Gradient Descent", "track": "main", "status": "Reject", "tldr": "", "abstract": "Importance sampling is a promising strategy for improving the convergence rate of stochastic gradient methods. It is typically used to precondition the optimization problem, but it can also be used to reduce the variance of the gradient estimator. Unfortunately, this latter point of view has yet to lead to practical methods that improve the asymptotic error of stochastic gradient methods. In this work, we propose stochastic reweighted gradient (SRG), a variance-reduced stochastic gradient method based solely on importance sampling that can improve on the asymptotic error of stochastic gradient descent (SGD) in the strongly convex and smooth case. We show that SRG can be extended to combine the benefits of both importance-sampling-based preconditioning and variance reduction. When compared to SGD, the resulting algorithm can simultaneously reduce the condition number and the asymptotic error, both by up to a factor equal to the number of component functions. We demonstrate improved convergence in practice on $\\ell_2$-regularized logistic regression problems.", "keywords": "Stochastic gradient descent;Finite-sum optimization;Variance reduction;Importance sampling", "primary_area": "", "supplementary_material": "/attachment/db505f90cec5978f222c8100e72ab993bf090f0d.zip", "author": "Ayoub El Hanchi;Chris J. Maddison;David Alan Stephens", "authorids": "~Ayoub_El_Hanchi1;~Chris_J._Maddison1;david.stephens@mcgill.ca", "gender": "M;;", "homepage": "https://www.cs.toronto.edu/~aelhan/;;", "dblp": ";;", "google_scholar": "5ZzcGmgAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ayoub_El_Hanchi1;~Chris_J._Maddison1;david.stephens@mcgill.ca", "aff": "University of Toronto;;", "aff_domain": "toronto.edu;;", "position": "PhD student;;", "bibtex": "@misc{\nhanchi2022stochastic,\ntitle={Stochastic Reweighted Gradient Descent},\nauthor={Ayoub El Hanchi and Chris J. 
Maddison and David Alan Stephens},\nyear={2022},\nurl={https://openreview.net/forum?id=dDARN-TCiA}\n}", "github": "", "project": "", "reviewers": "wvfK;i1ZH;N3uR;F7eT", "site": "https://openreview.net/forum?id=dDARN-TCiA", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;4", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "181;62;62;10", "wc_summary_review": "47;40;2;23", "wc_main_review": "89;181;176;23", "wc_review": "317;283;240;56", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "400;177;453;36", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;6;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.75, 62.735057981960935 ], "wc_summary_review_avg": [ 28.0, 17.363755354185336 ], "wc_main_review_avg": [ 117.25, 65.56818969591886 ], "wc_review_avg": [ 224.0, 100.75961492582233 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 266.5, 168.63051325308834 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.75, 1.920286436967152 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13495144079776331672&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "title": "Incremental False Negative Detection for Contrastive Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6346", "id": "dDjSKKA5TP1", "poster": "", "openreview": "https://openreview.net/forum?id=dDjSKKA5TP1", "slides": "https://iclr.cc/virtual/2022/poster/6346", "video": "https://iclr.cc/virtual/2022/poster/6346", "author_site": "Tsai-Shien Chen, Wei-Chih Hung, Hung-Yu Tseng, Shao-Yi Chien, Ming-Hsuan Yang", "tldr": "", "abstract": "Self-supervised learning has recently shown great potential in vision tasks through contrastive learning, which aims to discriminate each image, or instance, in the dataset. However, such instance-level learning ignores the semantic relationship among instances and sometimes undesirably repels the anchor from the semantically similar samples, termed \"false negatives\". In this work, we show that the unfavorable effect from false negatives is more significant for the large-scale datasets with more semantic concepts. To address the issue, we propose a novel self-supervised contrastive learning framework that incrementally detects and explicitly removes the false negative samples. Specifically, as training proceeds, our method dynamically detects an increasing number of high-quality false negatives, reflecting that the encoder gradually improves and the embedding space becomes more semantically structured. Next, we discuss two strategies to explicitly remove the detected false negatives during contrastive learning.
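One simple way to realize such a removal strategy is to mask detected false negatives out of the InfoNCE denominator; a hedged sketch with a hypothetical interface, assuming the diagonal holds the true positives and is never masked:

```python
import torch
import torch.nn.functional as F

def masked_infonce(anchor, candidates, fn_mask, temperature=0.2):
    """fn_mask[i, j] = True marks candidate j as a detected false negative
    for anchor i (hypothetical input; diagonal assumed all False)."""
    z = F.normalize(anchor, dim=1)
    c = F.normalize(candidates, dim=1)
    logits = z @ c.t() / temperature
    logits = logits.masked_fill(fn_mask, float('-inf'))  # drop false negatives
    labels = torch.arange(z.shape[0], device=z.device)   # positives on diagonal
    return F.cross_entropy(logits, labels)
```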
Extensive experiments show that our framework outperforms other self-supervised contrastive learning methods on multiple benchmarks in a limited resource setup.", "keywords": "Self-supervised learning;Contrastive learning;Representation learning;Clustering-based learning", "primary_area": "", "supplementary_material": "", "author": "Tsai-Shien Chen;Wei-Chih Hung;Hung-Yu Tseng;Shao-Yi Chien;Ming-Hsuan Yang", "authorids": "~Tsai-Shien_Chen1;~Wei-Chih_Hung1;~Hung-Yu_Tseng2;~Shao-Yi_Chien1;~Ming-Hsuan_Yang1", "gender": "M;M;;M;M", "homepage": "https://tsaishien-chen.github.io/;;https://hytseng0509.github.io/;https://www.ee.ntu.edu.tw/profile1.php?teacher_id=943013&p=3;https://faculty.ucmerced.edu/mhyang/", "dblp": "250/5742;70/2879;144/5474;;79/3711.html", "google_scholar": "KWL0P_YAAAAJ;AjaDLjYAAAAJ;hzOgd9MAAAAJ;https://scholar.google.com.tw/citations?user=QYwb_54AAAAJ;p9-ohHsAAAAJ", "orcid": "0000-0002-8085-0042;;;0000-0002-0634-6294;0000-0003-4848-2304", "linkedin": "tsaishien-chen/;;;shao-yi-chien-a6594845/;minghsuanyang/", "or_profile": "~Tsai-Shien_Chen1;~Wei-Chih_Hung1;~Hung-Yu_Tseng2;~Shao-Yi_Chien1;~Ming-Hsuan_Yang1", "aff": "National Taiwan University;Waymo;Meta;National Taiwan University;University of California at Merced", "aff_domain": "ntu.edu.tw;waymo.com;meta.com;ntu.edu.tw;umcerced.edu", "position": "MS student;Researcher;Research Scientist;Full Professor;Professor", "bibtex": "@inproceedings{\nchen2022incremental,\ntitle={Incremental False Negative Detection for Contrastive Learning},\nauthor={Tsai-Shien Chen and Wei-Chih Hung and Hung-Yu Tseng and Shao-Yi Chien and Ming-Hsuan Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dDjSKKA5TP1}\n}", "github": "", "project": "", "reviewers": "T8CE;qg7u;7ZtC", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "4;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "3;2;3", "wc_summary_paper": "85;74;59", "wc_summary_review": "49;45;32", "wc_main_review": "407;239;222", "wc_review": "541;358;313", "wc_reply_reviewers": "536;0;23", "wc_reply_authors": "1727;748;558", "reply_reviewers": "2;0;1", "reply_authors": "4;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 72.66666666666667, 10.656244908763854 ], "wc_summary_review_avg": [ 42.0, 7.2571803523590805 ], "wc_main_review_avg": [ 289.3333333333333, 83.49184923625113 ], "wc_review_avg": [ 404.0, 98.60020283954795 ], "wc_reply_reviewers_avg": [ 186.33333333333334, 247.42990028603163 ], "wc_reply_authors_avg": [ 1011.0, 512.1959260543437 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.7559289460184545, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=122478188010228753&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=dDjSKKA5TP1", "email": "ntu.edu.tw;waymo.com;meta.com;ntu.edu.tw;umcerced.edu", "author_num": 5, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "National Taiwan University;Waymo;Meta;University of California, Merced", 
"aff_unique_dep": ";;Meta Platforms, Inc.;", "aff_unique_url": "https://www.ntu.edu.tw;https://www.waymo.com;https://meta.com;https://www.ucmerced.edu", "aff_unique_abbr": "NTU;Waymo;Meta;UC Merced", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Taiwan;;Merced", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Training Data Generating Networks: Shape Reconstruction via Bi-level Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5933", "id": "dDo8druYppX", "poster": "", "openreview": "https://openreview.net/forum?id=dDo8druYppX", "slides": "https://iclr.cc/virtual/2022/poster/5933", "video": "https://iclr.cc/virtual/2022/poster/5933", "author_site": "Biao Zhang, Peter Wonka", "tldr": "", "abstract": "We propose a novel 3d shape representation for 3d shape reconstruction from a single image. Rather than predicting a shape directly, we train a network to generate a training set which will be fed into another learning algorithm to define the shape. The nested optimization problem can be modeled by bi-level optimization. Specifically, the algorithms for bi-level optimization are also being used in meta learning approaches for few-shot learning. Our framework establishes a link between 3D shape analysis and few-shot learning. We combine training data generating networks with bi-level optimization algorithms to obtain a complete framework for which all components can be jointly trained. We improve upon recent work on standard benchmarks for 3d shape reconstruction.", "keywords": "shape reconstruction single image;meta learning;few-shot learning;differentiable optimization;bi-level optimization", "primary_area": "", "supplementary_material": "", "author": "Biao Zhang;Peter Wonka", "authorids": "~Biao_Zhang5;~Peter_Wonka1", "gender": ";M", "homepage": "https://1zb.github.io;http://peterwonka.net", "dblp": "83/3266-5;98/5522", "google_scholar": "h5KukxEAAAAJ;https://scholar.google.com.tw/citations?user=0EKXSXgAAAAJ", "orcid": ";0000-0003-0627-9746", "linkedin": ";", "or_profile": "~Biao_Zhang5;~Peter_Wonka1", "aff": "KAUST;KAUST", "aff_domain": "kaust.edu.sa;kaust.edu.sa", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nzhang2022training,\ntitle={Training Data Generating Networks: Shape Reconstruction via Bi-level Optimization},\nauthor={Biao Zhang and Peter Wonka},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dDo8druYppX}\n}", "github": "", "project": "", "reviewers": "DXHG;7UAm;ABfL", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "104;62;72", "wc_summary_review": "99;57;16", "wc_main_review": "310;174;109", "wc_review": "513;293;197", "wc_reply_reviewers": "37;0;23", "wc_reply_authors": "588;271;329", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 79.33333333333333, 17.9133717900592 ], "wc_summary_review_avg": [ 57.333333333333336, 33.88542787426805 ], "wc_main_review_avg": [ 197.66666666666666, 83.74697341131532 ], "wc_review_avg": [ 334.3333333333333, 132.27580613584968 ], "wc_reply_reviewers_avg": [ 20.0, 
15.253414918196734 ], "wc_reply_authors_avg": [ 396.0, 137.81388415782595 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14326817901643490826&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=dDo8druYppX", "email": "kaust.edu.sa;kaust.edu.sa", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "King Abdullah University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaust.edu.sa", "aff_unique_abbr": "KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Saudi Arabia" }, { "id": "dEOeQgQTyvt", "title": "Structured Energy Network as a dynamic loss function: A case study with multi-label classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Structured prediction energy networks (SPENs) (Belanger & McCallum, 2016; Gygli et al., 2017) have shown that a neural network (i.e., an energy network) can learn a reasonable energy function over the candidate structured outputs. We propose SEAL, which utilizes this energy network as a trainable loss function for a simple feedforward network. We find that rather than using a SPEN as a prediction network, using it as a trainable loss function is not only computationally efficient in both training and inference time but also results in higher performance compared to SPENs. As the energy loss function is trainable, we propose a dynamic variant of SEAL that can adapt the energy function to focus on the region where the feedforward model will be affected most. We find this to be effective in an ablation study comparing SEAL to the static version (\u00a74), where the energy function is fixed after pretraining. We show the relation to previous work on the joint optimization of an energy network and a feedforward model (INFNET): INFNET is equivalent to SEAL using a margin-based loss if INFNET relaxes its loss function. Based on the unique architecture of SEAL, we further propose a variant that utilizes a noise contrastive ranking (NCE) loss, which by itself does not perform well as a structured energy network but, embodied in SEAL, shows the greatest performance among the variants we study. We demonstrate the effectiveness of SEAL on 7 feature-based and 3 text-based multi-label classification datasets. The best version of SEAL, which uses the NCE ranking method, achieves average gains of close to +2.85 and +2.23 F1 points over cross-entropy and INFNET, respectively, on the feature-based datasets, excluding one outlier that has an excessive gain of +50.0 F1 points.
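A hedged sketch of the core training signal, assuming hypothetical `predictor` and `energy_net` modules and an optimizer over the predictor's parameters only; the energy network itself would be updated separately (e.g., with the NCE ranking loss), so this is an illustration rather than the SEAL code:

```python
import torch

def feedforward_step(predictor, energy_net, x, optimizer):
    # The energy network acts as a trainable loss for the feedforward model:
    # lower energy means a better structured output for input x.
    y_hat = torch.sigmoid(predictor(x))   # multi-label probabilities
    loss = energy_net(x, y_hat).mean()    # energy used as the loss function
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                      # updates predictor parameters only
    return loss.item()
```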
Lastly, examining whether the proposed framework is effective on a large pre-trained model as well, we observe SEAL achieving a +0.87 F1 point gain on average on top of a BERT-based adapter model on the text datasets.", "keywords": "Structured Prediction;Energy network;Energy-based models;Loss-function learning;Dynamic loss function", "primary_area": "", "supplementary_material": "", "author": "Jay-Yoon Lee;Dhruvesh Patel;Purujit Goyal;Andrew McCallum", "authorids": "~Jay-Yoon_Lee1;~Dhruvesh_Patel1;~Purujit_Goyal1;~Andrew_McCallum1", "gender": ";M;M;M", "homepage": "http://dhruveshp.com;;http://www.cs.umass.edu/~mccallum;https://www.cs.cmu.edu/~jaylee", "dblp": "274/7280;301/8930;m/AndrewMcCallum;https://dblp.org/pers/l/Lee:Jay_Yoon", "google_scholar": "6F2CvwoAAAAJ;;yILa1y0AAAAJ;_USiaqwAAAAJ", "orcid": "0000-0003-3062-2292;;0009-0004-5487-2848;", "linkedin": "dhruveshp/;purujitgoyal/;andrew-mccallum-a412;", "or_profile": "~Dhruvesh_Patel1;~Purujit_Goyal1;~Andrew_McCallum1;~Jay_Yoon_Lee1", "aff": "College of Information and Computer Science, University of Massachusetts, Amherst;;University of Massachusetts Amherst;Department of Computer Science, University of Massachusetts, Amherst", "aff_domain": "cics.umass.edu;;cs.umass.edu;cs.umass.edu", "position": "PhD student;;Distinguished Professor;Postdoc", "bibtex": "@misc{\nlee2022structured,\ntitle={Structured Energy Network as a dynamic loss function. A case study with multi-label Classification},\nauthor={Jay-Yoon Lee and Dhruvesh Patel and Purujit Goyal and Andrew McCallum},\nyear={2022},\nurl={https://openreview.net/forum?id=dEOeQgQTyvt}\n}", "github": "", "project": "", "reviewers": "8Y6b;CpcQ;vt8f;5BUz", "site": "https://openreview.net/forum?id=dEOeQgQTyvt", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "53;121;51;86", "wc_summary_review": "31;13;91;35", "wc_main_review": "271;432;146;408", "wc_review": "355;566;288;529", "wc_reply_reviewers": "549;19;28;149", "wc_reply_authors": "2165;2359;1406;1639", "reply_reviewers": "3;1;1;2", "reply_authors": "4;5;3;4", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.75, 28.577744837547975 ], "wc_summary_review_avg": [ 42.5, 29.201883500897676 ], "wc_main_review_avg": [ 314.25, 114.92687892742933 ], "wc_review_avg": [ 434.5, 116.19487940524746 ], "wc_reply_reviewers_avg": [ 186.25, 215.63322448082994 ], "wc_reply_authors_avg": [ 1892.25, 384.97491801414805 ], "reply_reviewers_avg": [ 1.75, 0.82915619758885 ], "reply_authors_avg": [ 4.0, 0.7071067811865476 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:34ifAfg3DVYJ:scholar.google.com/&scioq=Structured+Energy+Network+as+a+dynamic+loss+function.+Case+study.+A+case+study+with+multi-label+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Massachusetts Amherst", "aff_unique_dep": "College of Information and Computer Science", "aff_unique_url": "https://www.umass.edu", "aff_unique_abbr": "UMass Amherst", "aff_campus_unique_index": "0;0;0", 
"aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "dEelotBE6e2", "title": "Defending Against Backdoor Attacks Using Ensembles of Weak Learners", "track": "main", "status": "Reject", "tldr": "", "abstract": "A recent line of work has shown that deep networks are susceptible to backdoor data poisoning attacks. Specifically, by injecting a small amount of malicious data into the training distribution, an adversary gains the ability to control the behavior of the model during inference. We propose an iterative training procedure for removing poisoned data from the training set. Our approach consists of two steps. We first train an ensemble of weak learners to automatically discover distinct subpopulations in the training set. We then leverage a boosting framework to exclude the poisoned data and recover the clean data. Our algorithm is based on a novel bootstrapped measure of generalization, which provably separates the clean from the dirty data under mild assumptions. Empirically, our method successfully defends against a state-of-the-art dirty label backdoor attack. We find that our approach significantly outperforms previous defenses.", "keywords": "data poisoning", "primary_area": "", "supplementary_material": "", "author": "Charles Jin;Melinda Sun;Martin Rinard", "authorids": "~Charles_Jin1;~Melinda_Sun1;~Martin_Rinard1", "gender": ";F;Not Specified", "homepage": "https://charlesjin.com;;http://people.csail.mit.edu/rinard/", "dblp": "245/5611;;", "google_scholar": "WC99LxgAAAAJ;;https://scholar.google.com.tw/citations?user=hxlxVEUAAAAJ", "orcid": "0000-0001-6871-5764;;", "linkedin": ";melinda-sun-608758212/;", "or_profile": "~Charles_Jin1;~Melinda_Sun1;~Martin_Rinard1", "aff": "Research, Google;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "research.google.com;mit.edu;mit.edu", "position": "Intern;Undergrad student;Full Professor", "bibtex": "@misc{\njin2022defending,\ntitle={Defending Against Backdoor Attacks Using Ensembles of Weak Learners },\nauthor={Charles Jin and Melinda Sun and Martin Rinard},\nyear={2022},\nurl={https://openreview.net/forum?id=dEelotBE6e2}\n}", "github": "", "project": "", "reviewers": "cvHA;21eY;gSj9;ey67", "site": "https://openreview.net/forum?id=dEelotBE6e2", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;3;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;1;3;4", "wc_summary_paper": "108;60;74;78", "wc_summary_review": "71;157;27;49", "wc_main_review": "835;673;460;136", "wc_review": "1014;890;561;263", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1501;807;618;25", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 80.0, 17.4928556845359 ], "wc_summary_review_avg": [ 76.0, 49.28488612140643 ], "wc_main_review_avg": [ 526.0, 261.5081260687706 ], "wc_review_avg": [ 682.0, 293.1253997865077 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 737.75, 526.7111993303351 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8551861104941366, "corr_recommendation_correctness": 0.8638684255813602, 
"gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1909265259046840475&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;Massachusetts Institute of Technology", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://web.mit.edu", "aff_unique_abbr": "Google;MIT", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "When should agents explore?", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7009", "id": "dEwfxt14bca", "poster": "", "openreview": "https://openreview.net/forum?id=dEwfxt14bca", "slides": "https://iclr.cc/virtual/2022/poster/7009", "video": "https://iclr.cc/virtual/2022/poster/7009", "author_site": "Miruna P\u00eeslar, David Szepesvari, Georg Ostrovski, Diana Borsa, Tom Schaul", "tldr": "", "abstract": "Exploration remains a central challenge for reinforcement learning (RL). Virtually all existing methods share the feature of a *monolithic* behaviour policy that changes only gradually (at best). In contrast, the exploratory behaviours of animals and humans exhibit a rich diversity, namely including forms of *switching* between modes. This paper presents an initial study of mode-switching, non-monolithic exploration for RL. We investigate different modes to switch between, at what timescales it makes sense to switch, and what signals make for good switching triggers. We also propose practical algorithmic components that make the switching mechanism adaptive and robust, which enables flexibility without an accompanying hyper-parameter-tuning burden. Finally, we report a promising initial study on Atari, using two-mode exploration and switching at sub-episodic time-scales.", "keywords": "exploration;mode-switching;reinforcement learning;Atari", "primary_area": "", "supplementary_material": "", "author": "Miruna Pislar;David Szepesvari;Georg Ostrovski;Diana L Borsa;Tom Schaul", "authorids": "~Miruna_Pislar1;~David_Szepesvari1;~Georg_Ostrovski1;~Diana_L_Borsa1;~Tom_Schaul2", "gender": "F;M;M;;M", "homepage": "https://github.com/MirunaPislar;;http://ostrovski.co.uk/;;http://schaul.site44.com/", "dblp": ";191/6739;133/8425;164/6204;50/254", "google_scholar": ";;;;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0001-7707-2633;;0000-0002-2961-8782", "linkedin": ";;georg-ostrovski-5690a538;diana-l-borsa-12834023;schaul/", "or_profile": "~Miruna_Pislar1;~David_Szepesvari1;~Georg_Ostrovski1;~Diana_L_Borsa1;~Tom_Schaul1", "aff": "Google;Google DeepMind;Google DeepMind;DeepMind/Google;Google DeepMind", "aff_domain": "google.com;google.com;deepmind.com;google.com;google.com", "position": "Researcher;Research Engineer;Researcher;Research Scientist;Researcher", "bibtex": "@inproceedings{\npislar2022when,\ntitle={When should agents explore?},\nauthor={Miruna Pislar and David Szepesvari and Georg Ostrovski and Diana L Borsa and Tom Schaul},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dEwfxt14bca}\n}", "github": "", "project": "", "reviewers": "AkpW;3t1W;TbGe;ZCJR", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;3;4", "correctness": "3;2;4;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "84;175;55;40", "wc_summary_review": "80;104;47;46", "wc_main_review": "307;371;191;504", "wc_review": 
"471;650;293;590", "wc_reply_reviewers": "262;0;0;19", "wc_reply_authors": "465;333;27;326", "reply_reviewers": "2;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.5, 52.38558962157437 ], "wc_summary_review_avg": [ 69.25, 24.283482040267618 ], "wc_main_review_avg": [ 343.25, 113.0317986232193 ], "wc_review_avg": [ 501.0, 136.27729084480657 ], "wc_reply_reviewers_avg": [ 70.25, 110.97831995484523 ], "wc_reply_authors_avg": [ 287.75, 160.40476146299397 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4525159361446145106&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=dEwfxt14bca", "email": "google.com;google.com;deepmind.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Equivariant Subgraph Aggregation Networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6212", "id": "dFbKQaRk15w", "poster": "", "openreview": "https://openreview.net/forum?id=dFbKQaRk15w", "slides": "https://iclr.cc/virtual/2022/poster/6212", "video": "https://iclr.cc/virtual/2022/poster/6212", "author_site": "Beatrice Bevilacqua, Fabrizio Frasca, Derek Lim, Balasubramaniam Srinivasan, Chen Cai, GOPINATH BALAMURUGAN, Michael Bronstein, Haggai Maron", "tldr": "", "abstract": "Message-passing neural networks (MPNNs) are the leading architecture for deep learning on graph-structured data, in large part due to their simplicity and scalability. Unfortunately, it was shown that these architectures are limited in their expressive power. This paper proposes a novel framework called Equivariant Subgraph Aggregation Networks (ESAN) to address this issue. Our main observation is that while two graphs may not be distinguishable by an MPNN, they often contain distinguishable subgraphs. Thus, we propose to represent each graph as a set of subgraphs derived by some predefined policy, and to process it using a suitable equivariant architecture. We develop novel variants of the 1-dimensional Weisfeiler-Leman (1-WL) test for graph isomorphism, and prove lower bounds on the expressiveness of ESAN in terms of these new WL variants. We further prove that our approach increases the expressive power of both MPNNs and more expressive architectures. Moreover, we provide theoretical results that describe how design choices such as the subgraph selection policy and equivariant neural architecture affect our architecture's expressive power. To deal with the increased computational cost, we propose a subgraph sampling scheme, which can be viewed as a stochastic version of our framework. 
A comprehensive set of experiments on real and synthetic datasets demonstrates that our framework improves the expressive power and overall performance of popular GNN architectures. ", "keywords": "Graph Neural Networks;Expressive power;Equivariance;Weisfeiler-Leman", "primary_area": "", "supplementary_material": "", "author": "Beatrice Bevilacqua;Fabrizio Frasca;Derek Lim;Balasubramaniam Srinivasan;Chen Cai;Gopinath Balamurugan;Michael M. Bronstein;Haggai Maron", "authorids": "~Beatrice_Bevilacqua1;~Fabrizio_Frasca1;~Derek_Lim1;~Balasubramaniam_Srinivasan1;~Chen_Cai1;~Gopinath_Balamurugan1;~Michael_M._Bronstein1;~Haggai_Maron1", "gender": "F;M;M;;;M;M;M", "homepage": "http://beabevi.github.io/;https://noired.github.io;https://cptq.github.io/;;https://chen-cai-osu.github.io/;;http://www.inf.usi.ch/bronstein/;https://haggaim.github.io/", "dblp": "275/2364;228/1840;267/5433;230/3792;;304/2499;07/2668;181/6629", "google_scholar": ";PT2CDA4AAAAJ;y9YTBIsAAAAJ;uM4EhgEAAAAJ;kWCCJIYAAAAJ;LC8KrlMAAAAJ;UU3N6-UAAAAJ;https://scholar.google.co.il/citations?user=4v8uJrIAAAAJ", "orcid": ";0000-0002-5165-1394;;;;;;", "linkedin": ";;;;;;mbronstein/;", "or_profile": "~Beatrice_Bevilacqua1;~Fabrizio_Frasca1;~Derek_Lim1;~Balasubramaniam_Srinivasan1;~Chen_Cai1;~Gopinath_Balamurugan1;~Michael_M._Bronstein1;~Haggai_Maron1", "aff": "Purdue University;Imperial College London;Meta Facebook;Purdue University;University of California, San Diego;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;Twitter;NVIDIA", "aff_domain": "purdue.edu;imperial.ac.uk;fb.com;purdue.edu;ucsd.edu;uni-tuebingen.de;twitter.com;nvidia.com", "position": "PhD student;PhD student;Intern;PhD student;PhD student;MS student;Head of Graph ML;Research Scientist", "bibtex": "@inproceedings{\nbevilacqua2022equivariant,\ntitle={Equivariant Subgraph Aggregation Networks},\nauthor={Beatrice Bevilacqua and Fabrizio Frasca and Derek Lim and Balasubramaniam Srinivasan and Chen Cai and Gopinath Balamurugan and Michael M. 
Bronstein and Haggai Maron},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dFbKQaRk15w}\n}", "github": "", "project": "", "reviewers": "B3oK;Ebj3;tGsP;rtwc", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;4;4", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "165;87;70;79", "wc_summary_review": "66;42;35;41", "wc_main_review": "644;211;202;212", "wc_review": "875;340;307;332", "wc_reply_reviewers": "178;0;0;25", "wc_reply_authors": "2598;2056;694;355", "reply_reviewers": "1;0;0;1", "reply_authors": "6;4;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 100.25, 37.86406607853942 ], "wc_summary_review_avg": [ 46.0, 11.853269591129697 ], "wc_main_review_avg": [ 317.25, 188.6893942435557 ], "wc_review_avg": [ 463.5, 237.8912566699331 ], "wc_reply_reviewers_avg": [ 50.75, 74.17336112109253 ], "wc_reply_authors_avg": [ 1425.75, 929.1593983273268 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 2.1213203435596424 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 234, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6011099715044788714&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=dFbKQaRk15w", "email": "purdue.edu;imperial.ac.uk;fb.com;purdue.edu;ucsd.edu;uni-tuebingen.de;twitter.com;nvidia.com", "author_num": 8, "aff_unique_index": "0;1;2;0;3;4;5;6", "aff_unique_norm": "Purdue University;Imperial College London;Meta;University of California, San Diego;Eberhard Karls University of T\u00fcbingen;Twitter, Inc.;NVIDIA", "aff_unique_dep": ";;Meta Platforms, Inc.;;;;NVIDIA Corporation", "aff_unique_url": "https://www.purdue.edu;https://www.imperial.ac.uk;https://meta.com;https://www.ucsd.edu;https://www.uni-tuebingen.de/;https://twitter.com;https://www.nvidia.com", "aff_unique_abbr": "Purdue;ICL;Meta;UCSD;Uni T\u00fcbingen;Twitter;NVIDIA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";San Diego;T\u00fcbingen", "aff_country_unique_index": "0;1;0;0;0;2;0;0", "aff_country_unique": "United States;United Kingdom;Germany" }, { "id": "dHJtoaE3yRP", "title": "NAFS: A Simple yet Tough-to-Beat Baseline for Graph Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, graph neural networks (GNNs) have shown prominent performance in graph representation learning by leveraging knowledge from both graph structure and node features. However, most of them have two major limitations. First, GNNs can learn higher-order structural information by stacking more layers but can not deal with large depth due to the over-smoothing issue. Second, it is not easy to apply these methods on large graphs due to the expensive computation cost and high memory usage. In this paper, we present node-adaptive feature smoothing (NAFS), a simple non-parametric method that constructs node representations without parameter learning. NAFS first extracts the features of each node with its neighbors of different hops by feature smoothing, and then adaptively combines the smoothed features. 
Besides, the constructed node representation can further be enhanced by the ensemble of smoothed features extracted via different smoothing strategies. We conduct experiments on four benchmark datasets on two different application scenarios: node clustering and link prediction. Remarkably, NAFS with feature ensemble outperforms the state-of-the-art GNNs on these tasks and mitigates the aforementioned two limitations of most learning-based GNN counterparts. ", "keywords": "Feature Smoothing;Graph Neural Network;Graph Representation Learning", "primary_area": "", "supplementary_material": "/attachment/ac2881583ad04f805d39d562cf7ec40e7bfd072e.zip", "author": "Wentao Zhang;Zeang Sheng;Mingyu Yang;Yang Li;Yu Shen;Zhi Yang;Zichao Yang;Bin CUI", "authorids": "~Wentao_Zhang1;~Zeang_Sheng1;~Mingyu_Yang2;~Yang_Li36;~Yu_Shen3;~Zhi_Yang4;~Zichao_Yang1;~Bin_CUI2", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://scholar.google.com/citations?user=cIaU0iIAAAAJ&hl=en;https://github.com/williamy1996;https://thomas-young-2013.github.io/;https://salty-fish-97.github.io/;https://yangzhihome.github.io/;;https://cuibinpku.github.io/index.html;https://zwt233.github.io/", "dblp": "298/0674;;37/4190-106;48/4462-3.html;90/5587-1;07/8707;55/5031.html;41/3249-1.html", "google_scholar": "cIaU0iIAAAAJ;;_4s8hFYAAAAJ;WHCihd4AAAAJ;;https://scholar.google.co.uk/citations?user=siCYLcUAAAAJ;IJAU8KoAAAAJ;JE4VON0AAAAJ", "orcid": "0009-0002-4427-3038;;;0000-0001-6503-6504;;;0000-0003-1681-4677;0000-0002-7532-5550", "linkedin": ";;yang-thomas-li-b75554107/;;;;;", "or_profile": "~Zeang_Sheng1;~Mingyu_Yang2;~Yang_Li36;~Yu_Shen3;~Zhi_Yang4;~Zichao_Yang1;~Bin_CUI2;~Zhang_wen_tao1", "aff": "Peking University;Peking University;Peking University;Peking University;Peking University;;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;PhD student;PhD student;PhD student;Associate Professor;;Full Professor;PhD student", "bibtex": "@misc{\nzhang2022nafs,\ntitle={{NAFS}: A Simple yet Tough-to-Beat Baseline for Graph Representation Learning},\nauthor={Wentao Zhang and Zeang Sheng and Mingyu Yang and Yang Li and Yu Shen and Zhi Yang and Zichao Yang and Bin CUI},\nyear={2022},\nurl={https://openreview.net/forum?id=dHJtoaE3yRP}\n}", "github": "", "project": "", "reviewers": "YM4P;w2Qg;SHxg", "site": "https://openreview.net/forum?id=dHJtoaE3yRP", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;4", "correctness": "4;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "178;73;122", "wc_summary_review": "46;52;194", "wc_main_review": "115;147;383", "wc_review": "339;272;699", "wc_reply_reviewers": "9;0;0", "wc_reply_authors": "1536;1034;1867", "reply_reviewers": "1;0;0", "reply_authors": "4;2;5", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 124.33333333333333, 42.897811391983886 ], "wc_summary_review_avg": [ 97.33333333333333, 68.39753081962989 ], "wc_main_review_avg": [ 215.0, 119.51011114824831 ], "wc_review_avg": [ 436.6666666666667, 187.50348144916018 ], "wc_reply_reviewers_avg": [ 3.0, 4.242640687119285 ], "wc_reply_authors_avg": [ 1479.0, 342.45096972656785 ], "reply_reviewers_avg": [ 0.3333333333333333, 
0.4714045207910317 ], "reply_authors_avg": [ 3.6666666666666665, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15414509502576388233&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "dHd6pU-8_fF", "title": "L-SR1 Adaptive Regularization by Cubics for Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stochastic gradient descent and other first-order variants, such as Adam and AdaGrad, are commonly used in the field of deep learning due to their computational efficiency and low-storage memory requirements. However, these methods do not exploit curvature information. Consequently, iterates can converge to saddle points and poor local minima. To avoid these points, directions of negative curvature can be utilized, which requires computing the second-derivative matrix. In Deep Neural Networks (DNNs), the number of variables ($n$) can be of the order of tens of millions, making the Hessian impractical to store ($\mathcal{O}(n^2)$) and to invert ($\mathcal{O}(n^3)$). Alternatively, quasi-Newton methods compute Hessian approximations that do not have the same computational requirements. Quasi-Newton methods re-use previously computed iterates and gradients to compute a low-rank structured update. The most widely used quasi-Newton update is the L-BFGS, which guarantees a positive semi-definite Hessian approximation, making it suitable in a line search setting. However, the loss functions in DNNs are non-convex, so the Hessian is potentially non-positive definite. In this paper, we propose using a Limited-Memory Symmetric Rank-1 quasi-Newton approach which allows for indefinite Hessian approximations, enabling directions of negative curvature to be exploited. Furthermore, we use a modified Adaptive Regularized Cubics approach, which generates a sequence of cubic subproblems that have closed-form solutions. 
We investigate the performance of our proposed method on autoencoders and feed-forward neural network models and compare our approach to state-of-the-art first-order adaptive stochastic methods as well as L-BFGS.", "keywords": "Non-convex optimization;deep learning;quasi-Newton methods;adaptive cubic regularization", "primary_area": "", "supplementary_material": "/attachment/2392f5fe7d4887d4cbfcca89894a659ec556dbbf.zip", "author": "Aditya Ranganath;Mukesh Singhal;Roummel Marcia", "authorids": "~Aditya_Ranganath1;~Mukesh_Singhal1;~Roummel_Marcia1", "gender": "M;M;", "homepage": ";;https://faculty.ucmerced.edu/rmarcia/", "dblp": ";s/MukeshSinghal;", "google_scholar": ";;", "orcid": ";;", "linkedin": "aranganath/;;", "or_profile": "~Aditya_Ranganath1;~Mukesh_Singhal1;~Roummel_Marcia1", "aff": "University of California at Merced;University of California at Merced;University of California, Merced", "aff_domain": "ucmerced.edu;ucmerced.edu;ucmerced.edu", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@misc{\nranganath2022lsr,\ntitle={L-{SR}1 Adaptive Regularization by Cubics for Deep Learning},\nauthor={Aditya Ranganath and Mukesh Singhal and Roummel Marcia},\nyear={2022},\nurl={https://openreview.net/forum?id=dHd6pU-8_fF}\n}", "github": "", "project": "", "reviewers": "cfGs;oGs7;qdob;YQSL", "site": "https://openreview.net/forum?id=dHd6pU-8_fF", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "2;2;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "119;47;23;74", "wc_summary_review": "80;2;15;99", "wc_main_review": "561;168;418;824", "wc_review": "760;217;456;997", "wc_reply_reviewers": "77;0;0;0", "wc_reply_authors": "290;91;148;151", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 65.75, 35.64670391494843 ], "wc_summary_review_avg": [ 49.0, 41.309805131469695 ], "wc_main_review_avg": [ 492.75, 237.39984730407895 ], "wc_review_avg": [ 607.5, 295.9767727373214 ], "wc_reply_reviewers_avg": [ 19.25, 33.34197804570089 ], "wc_reply_authors_avg": [ 170.0, 73.29051780414709 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:V1SLL1m-gtEJ:scholar.google.com/&scioq=L-SR1+Adaptive+Regularization+by+Cubics+for+Deep+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Merced", "aff_unique_dep": "", "aff_unique_url": "https://www.ucmerced.edu", "aff_unique_abbr": "UC Merced", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Merced", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "dIVrWHP9_1i", "title": "G-Mixup: Graph Augmentation for Graph Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work develops \emph{mixup for graph data}. Mixup has shown superiority in improving the generalization and robustness of neural networks by interpolating features and labels of two random samples. 
Traditionally, Mixup can operate on regular, grid-like, and Euclidean data such as image or tabular data. However, it is challenging to directly adopt Mixup to augment graph data because two graphs typically: 1) have different numbers of nodes; 2) are not readily aligned; and 3) have unique topologies in non-Euclidean space. To this end, we propose $\\mathcal{G}$-Mixup to augment graphs for graph classification by interpolating the generator (i.e., graphon) of different classes of graphs. Specifically, we first use graphs within the same class to estimate a graphon. Then, instead of directly manipulating graphs, we interpolate graphons of different classes in the Euclidean space to get mixed graphons, where the synthetic graphs are generated through sampling based on the new graphons.", "keywords": "graph augmentation;mixup;graph classification;graphon", "primary_area": "", "supplementary_material": "", "author": "Xiaotian Han;Zhimeng Jiang;Ninghao Liu;Xia Hu", "authorids": "~Xiaotian_Han1;~Zhimeng_Jiang1;~Ninghao_Liu2;~Xia_Hu4", "gender": "M;M;M;M", "homepage": "https://ahxt.github.io/;http://www.zhimengjiang.com/;https://cobweb.cs.uga.edu/~ninghaoliu/;https://cs.rice.edu/~xh37/index.html", "dblp": ";217/3235;145/4489;256/9406.html", "google_scholar": "Uromx98AAAAJ;5Es3Yk4AAAAJ;Nir-EDYAAAAJ;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": ";0000-0001-6933-3952;0000-0002-9170-2424;", "linkedin": ";;;", "or_profile": "~Xiaotian_Han1;~Zhimeng_Jiang1;~Ninghao_Liu1;~Xia_Hu2", "aff": "Texas A&M University;Texas A&M University;University of Georgia;Rice University", "aff_domain": "tamu.edu;tamu.edu;uga.edu;rice.edu", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@misc{\nhan2022gmixup,\ntitle={G-Mixup: Graph Augmentation for Graph Classification},\nauthor={Xiaotian Han and Zhimeng Jiang and Ninghao Liu and Xia Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=dIVrWHP9_1i}\n}", "github": "", "project": "", "reviewers": "RkbU;s4VH;vVum", "site": "https://openreview.net/forum?id=dIVrWHP9_1i", "pdf_size": 0, "recommendation": "3;3;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "124;48;102", "wc_summary_review": "22;11;50", "wc_main_review": "619;259;406", "wc_review": "765;318;558", "wc_reply_reviewers": "355;296;303", "wc_reply_authors": "2764;1739;1366", "reply_reviewers": "2;2;1", "reply_authors": "7;6;6", "recommendation_avg": [ 4.666666666666667, 2.357022603955158 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 91.33333333333333, 31.930480039541457 ], "wc_summary_review_avg": [ 27.666666666666668, 16.418147141366337 ], "wc_main_review_avg": [ 428.0, 147.7903921099068 ], "wc_review_avg": [ 547.0, 182.65267586323503 ], "wc_reply_reviewers_avg": [ 318.0, 26.318561257535844 ], "wc_reply_authors_avg": [ 1956.3333333333333, 591.059124698104 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 6.333333333333333, 0.4714045207910317 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:9DqgboubpMsJ:scholar.google.com/&scioq=G-Mixup:+Graph+Augmentation+for+Graph+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Texas A&M University;University of Georgia;Rice University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tamu.edu;https://www.uga.edu;https://www.rice.edu", "aff_unique_abbr": "TAMU;UGA;Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "dJk1vpEFYF0", "title": "Personalized Federated Learning with Clustered Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The prevalent personalized federated learning (PFL) usually pursues a trade-off between personalization and generalization by maintaining a shared global model to guide the training process of local models. However, the sole global model may easily transfer deviated knowledge (e.g., biased updates) to some local models when rich statistical diversity exists across the local datasets. Thus, we argue it is of crucial importance to maintain the diversity of generalization to provide each client with fine-grained common knowledge that can better fit the local data distributions and facilitate faster model convergence. In this paper, we propose a novel concept called clustered generalization (CG) to handle the challenge of statistical heterogeneity, and properly design a CG-based framework of PFL, dubbed CGPFL. Concretely, we maintain K global (i.e., generalized) models in the server and each local model is dynamically associated with the nearest global model to conduct \u2018push\u2019 and \u2018pull\u2019 operations during the iterative algorithm. We conduct detailed theoretical analysis, in which the convergence guarantee is presented and $\\mathcal{O}(\\sqrt{K})$ speedup over most existing methods is granted. To quantitatively study the generalization-personalization trade-off, we introduce the \u2018generalization error\u2019 measure and prove that the proposed CGPFL can achieve a better trade-off than existing solutions. Moreover, our theoretical analysis further inspires a heuristic algorithm to find a near-optimal trade-off in CGPFL. Experimental results on multiple real-world datasets show that our approach surpasses the state-of-the-art methods on test accuracy by a significant margin. 
", "keywords": "Federated Learning;Personalization;Generalization;Clustering;Convergence", "primary_area": "", "supplementary_material": "", "author": "Xueyang Tang;Song Guo;Jingcai Guo", "authorids": "~Xueyang_Tang1;~Song_Guo5;~Jingcai_Guo2", "gender": "M;M;M", "homepage": ";https://cse.hkust.edu.hk/~songguo/;https://jingcaiguo.github.io/", "dblp": ";01/267-1;192/7270", "google_scholar": "wAGIpRAAAAAJ;https://scholar.google.com/citations?hl=en;YjSHPjcAAAAJ", "orcid": "0000-0003-4284-9806;;0000-0002-0449-4525", "linkedin": ";;jingcai-guo", "or_profile": "~Xueyang_Tang1;~Song_Guo5;~Jingcai_Guo1", "aff": "The Hong Kong Polytechnic University;The Hong Kong Polytechnic University;University of Sydney", "aff_domain": "polyu.edu.hk;polyu.edu.hk;usyd.edu.au", "position": "PhD student;Full Professor;Postdoc", "bibtex": "@misc{\ntang2022personalized,\ntitle={Personalized Federated Learning with Clustered Generalization},\nauthor={Xueyang Tang and Song Guo and Jingcai Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=dJk1vpEFYF0}\n}", "github": "", "project": "", "reviewers": "mGxH;bgWZ;8KAL;13vq", "site": "https://openreview.net/forum?id=dJk1vpEFYF0", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;5;4;5", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "54;38;126;86", "wc_summary_review": "57;29;49;30", "wc_main_review": "271;435;224;435", "wc_review": "382;502;399;551", "wc_reply_reviewers": "143;72;0;155", "wc_reply_authors": "1955;2211;524;1369", "reply_reviewers": "1;1;0;1", "reply_authors": "3;3;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.0, 33.645207682521445 ], "wc_summary_review_avg": [ 41.25, 12.090802289343747 ], "wc_main_review_avg": [ 341.25, 95.21127821849679 ], "wc_review_avg": [ 458.5, 70.42904230500369 ], "wc_reply_reviewers_avg": [ 92.5, 62.114813048096664 ], "wc_reply_authors_avg": [ 1514.75, 648.3465026511673 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.899228803025897, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2285322107725577341&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Hong Kong Polytechnic University;University of Sydney", "aff_unique_dep": ";", "aff_unique_url": "https://www.polyu.edu.hk;https://www.sydney.edu.au", "aff_unique_abbr": "PolyU;USYD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;Australia" }, { "id": "dKLoUvtnq0C", "title": "Semi-supervised learning of partial differential operators and dynamical flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "The evolution of dynamical systems is generically governed by nonlinear partial differential equations (PDEs), whose solution, in a simulation framework, requires vast amounts of computational resources. 
For a growing number of specific cases, neural network-based solvers have been shown to provide comparable results to other numerical methods while utilizing fewer resources.\nIn this work, we present a novel method that combines a hyper-network solver with a Fourier Neural Operator architecture. Our method treats time and space separately. As a result, it successfully propagates initial conditions in discrete time steps by employing the general composition properties of the partial differential operators. Following previous work, supervision is provided at a specific time point. We test our method on various time evolution PDEs, including nonlinear fluid flows in one, two, and three spatial dimensions. The results show that the new method improves the learning accuracy at the supervision time point, and is also able to interpolate and extrapolate the solutions to arbitrary times.", "keywords": "Hypernetworks;Partial Differential Equations;Fluid Dynamics", "primary_area": "", "supplementary_material": "/attachment/416a718f481d777c1fa59ccd7c0800e555f115cc.zip", "author": "Michael Rotman;Amit Dekel;Ran Ilan Ber;Lior Wolf;Yaron Oz", "authorids": "~Michael_Rotman1;~Amit_Dekel1;ranber01@gmail.com;~Lior_Wolf1;~Yaron_Oz1", "gender": ";M;;M;", "homepage": "https://rotmanmichael.com;;;http://www.cs.tau.ac.il/~wolf;", "dblp": "217/3007;259/2006;;83/4103;", "google_scholar": "tzlpNi8AAAAJ;mY12KaoAAAAJ;;UbFrXTsAAAAJ;", "orcid": ";;;0000-0001-5578-8892;", "linkedin": ";;;;", "or_profile": "~Michael_Rotman1;~Amit_Dekel1;ranber01@gmail.com;~Lior_Wolf1;~Yaron_Oz1", "aff": "General Electric;Univrses;;Tel Aviv University;Tel Aviv University, Technion", "aff_domain": "ge.com;univrses.com;;tau.ac.il;tau.ac.il", "position": "Researcher;Researcher;;Full Professor;Full Professor", "bibtex": "@misc{\nrotman2022semisupervised,\ntitle={Semi-supervised learning of partial differential operators and dynamical flows},\nauthor={Michael Rotman and Amit Dekel and Ran Ilan Ber and Lior Wolf and Yaron Oz},\nyear={2022},\nurl={https://openreview.net/forum?id=dKLoUvtnq0C}\n}", "github": "", "project": "", "reviewers": "3Uae;j3Ma;5dc6", "site": "https://openreview.net/forum?id=dKLoUvtnq0C", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;4;4", "correctness": "3;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "114;138;21", "wc_summary_review": "23;26;24", "wc_main_review": "417;383;396", "wc_review": "554;547;441", "wc_reply_reviewers": "0;0;89", "wc_reply_authors": "866;1090;896", "reply_reviewers": "0;0;1", "reply_authors": "2;3;2", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 91.0, 50.457903246171455 ], "wc_summary_review_avg": [ 24.333333333333332, 1.247219128924647 ], "wc_main_review_avg": [ 398.6666666666667, 14.007934259633796 ], "wc_review_avg": [ 514.0, 51.697840058039816 ], "wc_reply_reviewers_avg": [ 29.666666666666668, 41.95500235040182 ], "wc_reply_authors_avg": [ 950.6666666666666, 99.28186586123817 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5, "gs_citation": 5, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=7638457215727254674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "General Electric;Univrses;Tel Aviv University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ge.com;;https://www.tau.ac.il", "aff_unique_abbr": "GE;;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;2", "aff_country_unique": "United States;;Israel" }, { "id": "dKVsqZOGOHL", "title": "How and When Adversarial Robustness Transfers in Knowledge Distillation?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Knowledge distillation (KD) has been widely used in teacher-student training, with applications to model compression in resource-constrained deep learning. Current works mainly focus on preserving the accuracy of the teacher model. However, other important model properties, such as adversarial robustness, can be lost during distillation. This paper studies how and when the adversarial robustness can be transferred from a teacher model to a student model in KD. We show that standard KD training fails to preserve adversarial robustness, and we propose KD with input gradient alignment (KDIGA) for remedy. Under certain assumptions, we prove that the student model using our proposed KDIGA can achieve at least the same certified robustness as the teacher model. Our experiments of KD contain a diverse set of teacher and student models with varying network architectures and sizes evaluated on ImageNet and CIFAR-10 datasets, including residual neural networks (ResNets) and vision transformers (ViTs). Our comprehensive analysis shows several novel insights that (1) With KDIGA, students can preserve or even exceed the adversarial robustness of the teacher model, even when their models have fundamentally different architectures; (2) KDIGA enables robustness transfer to pre-trained students, such as KD from an adversarially trained ResNet to a pre-trained ViT, without loss of clean accuracy; and (3) Our derived local linearity bounds for characterizing adversarial robustness in KD are consistent with the empirical results.", "keywords": "adversarial robustness;knowledge distillation;adversarial training;vision transformer", "primary_area": "", "supplementary_material": "/attachment/0cc08860da31c196411be4d643514625fd5a130e.zip", "author": "Rulin Shao;Jinfeng Yi;Cho-Jui Hsieh;Pin-Yu Chen", "authorids": "~Rulin_Shao1;~Jinfeng_Yi1;~Cho-Jui_Hsieh1;~Pin-Yu_Chen1", "gender": ";M;M;M", "homepage": "https://rulinshao.github.io/;http://jinfengyi.net/;http://web.cs.ucla.edu/~chohsieh/index.html;http://www.pinyuchen.com", "dblp": ";117/4898;14/2770;39/8969", "google_scholar": "Vdwh6bcAAAAJ;lZxRZ84AAAAJ;Wy89g4IAAAAJ;jxwlCUUAAAAJ", "orcid": ";;;0000-0003-1039-8369", "linkedin": ";https://www.linkedin.com/nhome/?trk=;;pin-yu-chen-940062a2", "or_profile": "~Rulin_Shao1;~Jinfeng_Yi1;~Cho-Jui_Hsieh1;~Pin-Yu_Chen1", "aff": ";JD AI Research;University of California, Los Angeles;International Business Machines", "aff_domain": ";jd.com;ucla.edu;ibm.com", "position": ";Senior Director;Assistant Professor;Research Staff Member", "bibtex": "@misc{\nshao2022how,\ntitle={How and When Adversarial Robustness Transfers in Knowledge Distillation?},\nauthor={Rulin Shao and Jinfeng Yi and Cho-Jui Hsieh and Pin-Yu Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=dKVsqZOGOHL}\n}", "github": "", "project": "", "reviewers": "2BVD;B9rz;FjfA", "site": 
"https://openreview.net/forum?id=dKVsqZOGOHL", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "56;169;56", "wc_summary_review": "17;147;43", "wc_main_review": "132;313;67", "wc_review": "205;629;166", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 93.66666666666667, 53.26871084938658 ], "wc_summary_review_avg": [ 69.0, 56.166419386201454 ], "wc_main_review_avg": [ 170.66666666666666, 104.0843674888576 ], "wc_review_avg": [ 333.3333333333333, 209.67329083335554 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1815626920113659953&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "JD;University of California, Los Angeles;International Business Machines Corporation", "aff_unique_dep": "JD AI Research;;", "aff_unique_url": "https://www.jd.com;https://www.ucla.edu;https://www.ibm.com", "aff_unique_abbr": "JD AI;UCLA;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "id": "dK_t8oN8G4", "title": "Neurosymbolic Deep Generative Models for Sequence Data with Relational Constraints", "track": "main", "status": "Reject", "tldr": "", "abstract": "There has been significant recent progress designing deep generative models that generate realistic sequence data such as text or music. Nevertheless, it remains difficult to incorporate high-level structure to guide the generative process, and many such models perform well on local coherence at the cost of global coherence. We propose a novel approach for incorporating global structure in the form of relational constraints between different subcomponents of an example (e.g., lines of a poem or measures of music). Our generative model has two parts: (i) one model to generate a realistic set of relational constraints, and (ii) a second model to generate realistic data satisfying these constraints. For model (i), we propose a program synthesis algorithm that infers the relational constraints present in the training data, and then learn a generative model based on the resulting constraint data. 
In our experiments, we show that our approach significantly improves over state-of-the-art in terms of capturing high-level structure in the data, while performing comparably or better in terms of low-level structure.", "keywords": "synthesis;music;generative;constraints", "primary_area": "", "supplementary_material": "/attachment/1b7123e8e145c62ce22eae7ecc81bf1a9bc525b4.zip", "author": "Halley Young;Maxwell Du;Osbert Bastani", "authorids": "~Halley_Young1;~Maxwell_Du1;~Osbert_Bastani1", "gender": "F;;M", "homepage": "https://www.seas.upenn.edu/~halleyy/;;http://obastani.github.io", "dblp": "231/5126;;21/11275", "google_scholar": ";;cxYepGkAAAAJ", "orcid": ";;", "linkedin": ";maxwell-du-5a90541b7/;", "or_profile": "~Halley_Young1;~Maxwell_Du1;~Osbert_Bastani1", "aff": "Google;School of Engineering and Applied Science, University of Pennsylvania;University of Pennsylvania", "aff_domain": "google.com;seas.upenn.edu;upenn.edu", "position": "Researcher;Undergrad student;Assistant Professor", "bibtex": "@misc{\nyoung2022neurosymbolic,\ntitle={Neurosymbolic Deep Generative Models for Sequence Data with Relational Constraints},\nauthor={Halley Young and Maxwell Du and Osbert Bastani},\nyear={2022},\nurl={https://openreview.net/forum?id=dK_t8oN8G4}\n}", "github": "", "project": "", "reviewers": "nfwh;gmsG;HkdV;33jH", "site": "https://openreview.net/forum?id=dK_t8oN8G4", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;4;3", "correctness": "2;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "92;139;55;49", "wc_summary_review": "50;57;54;28", "wc_main_review": "492;351;388;219", "wc_review": "634;547;497;296", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "128;178;677;373", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 83.75, 35.89829383132296 ], "wc_summary_review_avg": [ 47.25, 11.388041973930374 ], "wc_main_review_avg": [ 362.5, 97.65372496735596 ], "wc_review_avg": [ 493.5, 124.11788751022151 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 339.0, 215.5469786380686 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1867268826050905974&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;University of Pennsylvania", "aff_unique_dep": "Google;School of Engineering and Applied Science", "aff_unique_url": "https://www.google.com;https://www.upenn.edu", "aff_unique_abbr": "Google;UPenn", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "dLDzuxaN0Hd", "title": "Unsupervised Pose-Aware Part Decomposition for 3D Articulated Objects", "track": "main", "status": "Reject", "tldr": "", "abstract": "Articulated objects exist widely in the real world. However, previous 3D generative methods for unsupervised part decomposition are unsuitable for such objects, because they assume a spatially fixed part location, resulting in inconsistent part parsing. 
In this paper, we propose PPD (unsupervised Pose-aware Part Decomposition) to address a novel setting that explicitly targets man-made articulated objects with mechanical joints, considering the part poses. We show that category-common prior learning for both part shapes and poses facilitates the unsupervised learning of (1) part decomposition with non-primitive-based implicit representation, and (2) part pose as joint parameters under single-frame shape supervision. We evaluate our method on synthetic and real datasets, and we show that it outperforms previous works in consistent part parsing of the articulated objects, while achieving part pose estimation performance comparable to the supervised baseline.", "keywords": "unsupervised part decomposition;shape abstraction;3D shape representations;generative models;computer vision", "primary_area": "", "supplementary_material": "/attachment/0dfa102c620c0f40e4b881f3d9f953f9eee4b7ec.zip", "author": "Yuki Kawana;YUSUKE Mukuta;Tatsuya Harada", "authorids": "~Yuki_Kawana1;~YUSUKE_Mukuta1;~Tatsuya_Harada1", "gender": "M;;M", "homepage": ";https://www.mi.t.u-tokyo.ac.jp/mukuta/;https://www.mi.t.u-tokyo.ac.jp/harada/", "dblp": "165/1713;153/5464;14/5849", "google_scholar": "l2e8yqcAAAAJ;https://scholar.google.co.jp/citations?user=emo91rIAAAAJ;https://scholar.google.com/citations?hl=ja", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yuki_Kawana1;~YUSUKE_Mukuta1;~Tatsuya_Harada1", "aff": "The University of Tokyo;The University of Tokyo;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "PhD student;Lecturer;Full Professor", "bibtex": "@misc{\nkawana2022unsupervised,\ntitle={Unsupervised Pose-Aware Part Decomposition for 3D Articulated Objects},\nauthor={Yuki Kawana and YUSUKE Mukuta and Tatsuya Harada},\nyear={2022},\nurl={https://openreview.net/forum?id=dLDzuxaN0Hd}\n}", "github": "", "project": "", "reviewers": "YkRG;NczN;4Pds;bBs4", "site": "https://openreview.net/forum?id=dLDzuxaN0Hd", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;4;2;4", "correctness": "3;3;3;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "74;177;65;62", "wc_summary_review": "38;106;66;39", "wc_main_review": "358;1623;170;502", "wc_review": "470;1906;301;603", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1232;2572;290;180", "reply_reviewers": "0;0;0;0", "reply_authors": "2;4;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 94.5, 47.83565615730592 ], "wc_summary_review_avg": [ 62.25, 27.643941470058138 ], "wc_main_review_avg": [ 663.25, 566.4792030604477 ], "wc_review_avg": [ 820.0, 636.0711438196203 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1068.5, 959.5231888808107 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18355054817467843868&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "dLTXoSIcrik", "title": "Avoiding Overfitting to the Importance Weights in Offline Policy Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline policy optimization has a critical impact on many real-world decision-making problems, as online learning is costly and concerning in many applications. Importance sampling and its variants are a widely used type of estimator in offline policy evaluation, which can be helpful to remove assumptions on the chosen function approximations used to represent value functions and process models. In this paper, we identify an important overfitting phenomenon in optimizing the importance weighted return, and propose an algorithm to avoid this overfitting. We provide a theoretical justification of the proposed algorithm through a better per-state-neighborhood normalization condition and show the limitation of previous attempts at this approach through an illustrative example. We further test our proposed method in a healthcare-inspired simulator and on a logged dataset collected from real hospitals. These experiments show that the proposed method exhibits less overfitting and better test performance compared with state-of-the-art batch reinforcement learning algorithms.", "keywords": "Reinforcement Learning;Batch Reinforcement Learning;Policy Optimization;Overfitting", "primary_area": "", "supplementary_material": "/attachment/2c51e4c46784d8e2e3f7bf434d5d3880be6f5720.zip", "author": "Yao Liu;Emma Brunskill", "authorids": "~Yao_Liu1;~Emma_Brunskill2", "gender": "M;", "homepage": "http://yao-liu.com/;", "dblp": "64/424-9.html;", "google_scholar": "umAny5UAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Yao_Liu1;~Emma_Brunskill2", "aff": "ByteDance;", "aff_domain": "bytedance.com;", "position": "Researcher;", "bibtex": "@misc{\nliu2022avoiding,\ntitle={Avoiding Overfitting to the Importance Weights in Offline Policy Optimization},\nauthor={Yao Liu and Emma Brunskill},\nyear={2022},\nurl={https://openreview.net/forum?id=dLTXoSIcrik}\n}", "github": "", "project": "", "reviewers": "oG3M;EAaa;2pVL;qYyg", "site": "https://openreview.net/forum?id=dLTXoSIcrik", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;3", "correctness": "4;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;0;2;3", "wc_summary_paper": "124;62;88;40", "wc_summary_review": "147;47;47;72", "wc_main_review": "385;286;181;403", "wc_review": "656;395;316;515", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "563;402;284;677", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 78.5, 31.284980421921315 ], "wc_summary_review_avg": [ 78.25, 40.9839907768875 ], "wc_main_review_avg": [ 313.75, 88.64923857541022 ], "wc_review_avg": [ 470.5, 128.41436835494696 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 481.5, 150.1574173992081 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14147514526721180348&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "ByteDance", "aff_unique_dep": "", "aff_unique_url": "https://www.bytedance.com", "aff_unique_abbr": "ByteDance", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "The Role of Permutation Invariance in Linear Mode Connectivity of Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6604", "id": "dNigytemkL", "poster": "", "openreview": "https://openreview.net/forum?id=dNigytemkL", "slides": "https://iclr.cc/virtual/2022/poster/6604", "video": "https://iclr.cc/virtual/2022/poster/6604", "author_site": "Rahim Entezari, Hanie Sedghi, Olga Saukh, Behnam Neyshabur", "tldr": "", "abstract": "In this paper, we conjecture that if the permutation invariance of neural networks is taken into account, SGD solutions will likely have no barrier in the linear interpolation between them. Although it is a bold conjecture, we show how extensive empirical attempts fall short of refuting it. We further provide a preliminary theoretical result to support our conjecture. Our conjecture has implications for the lottery ticket hypothesis, distributed training, and ensemble methods. The source code is available at \\url{https://github.com/rahimentezari/PermutationInvariance}.", "keywords": "Permutation;Invariance;Mode Connectivity;Energy Barrier;Loss landscape;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Rahim Entezari;Hanie Sedghi;Olga Saukh;Behnam Neyshabur", "authorids": "~Rahim_Entezari1;~Hanie_Sedghi1;~Olga_Saukh1;~Behnam_Neyshabur1", "gender": "M;F;F;M", "homepage": "http://rahimentezari.github.io;https://haniesedghi.com/;http://www.olgasaukh.com;https://www.neyshabur.net", "dblp": "193/7037.html;66/8332;37/2725;131/9898", "google_scholar": "CmTeX7kAAAAJ;_9GX96fDWAMC;https://scholar.google.ch/citations?user=f-MDKlYAAAAJ;e1ucbCYAAAAJ", "orcid": ";;0000-0001-7849-3368;", "linkedin": ";hanie-sedghi-71bb2582;saukh/;", "or_profile": "~Rahim_Entezari1;~Hanie_Sedghi1;~Olga_Saukh1;~Behnam_Neyshabur1", "aff": "University of Washington;Google Research, Brain team;Complexity Science Hub;Google", "aff_domain": "uw.edu;google.com;csh.ac.at;google.com", "position": "Intern;Senior Research Scientist;Research Group Leader;Research Scientist", "bibtex": "@inproceedings{\nentezari2022the,\ntitle={The Role of Permutation Invariance in Linear Mode Connectivity of Neural Networks},\nauthor={Rahim Entezari and Hanie Sedghi and Olga Saukh and Behnam Neyshabur},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dNigytemkL}\n}", "github": "", "project": "", "reviewers": "D6tW;vZXp;EfdQ;iy5q", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "40;72;116;118", "wc_summary_review": "28;56;49;191", "wc_main_review": "77;583;372;506", "wc_review": "145;711;537;815", "wc_reply_reviewers": "128;201;261;0", "wc_reply_authors": "320;882;687;838", "reply_reviewers": "1;1;1;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.5, 32.53843880704789 ], "wc_summary_review_avg": [ 81.0, 64.33894621455966 ], "wc_main_review_avg": [ 384.5, 
192.92291206593373 ], "wc_review_avg": [ 552.0, 255.1097802907603 ], "wc_reply_reviewers_avg": [ 147.5, 97.31520949985156 ], "wc_reply_authors_avg": [ 681.75, 221.023047440759 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 237, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18352541695309676918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=dNigytemkL", "email": "uw.edu;google.com;csh.ac.at;google.com", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Washington;Google;Complexity Science Hub", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.washington.edu;https://research.google;", "aff_unique_abbr": "UW;Google;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "Optimization and Adaptive Generalization of Three layer Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6848", "id": "dPyRNUlttBv", "poster": "", "openreview": "https://openreview.net/forum?id=dPyRNUlttBv", "slides": "https://iclr.cc/virtual/2022/poster/6848", "video": "https://iclr.cc/virtual/2022/poster/6848", "author_site": "Khashayar Gatmiry, Stefanie Jegelka, Jonathan Kelner", "tldr": "", "abstract": "While there has been substantial recent work studying generalization of neural networks, \nthe ability of deep nets in automating the process of feature extraction still evades a thorough mathematical understanding. \nAs a step toward this goal, we analyze learning and generalization of a three-layer neural network with ReLU activations in a regime that goes beyond the linear approximation of the network, and is hence not captured by the common Neural Tangent Kernel. We show that despite nonconvexity of the empirical loss, a variant of SGD converges in polynomially many iterations to a good solution that generalizes. In particular, our generalization bounds are adaptive: they automatically optimize over a family of kernels that includes the Neural Tangent Kernel, to provide the tightest bound. 
", "keywords": "deep learning theory;adaptive kernel;robust deep learning;neural tangent kernel;adaptive generalization;non-convex optimization", "primary_area": "", "supplementary_material": "/attachment/ecbe39edaf5ae30cc427ed0c679a34b32f233847.zip", "author": "Khashayar Gatmiry;Stefanie Jegelka;Jonathan Kelner", "authorids": "~Khashayar_Gatmiry1;~Stefanie_Jegelka3;~Jonathan_Kelner1", "gender": "M;F;M", "homepage": "http://ce.sharif.edu/~kgatmiry/;http://people.csail.mit.edu/stefje/;https://math.mit.edu/~kelner/", "dblp": ";38/7003;64/4772.html", "google_scholar": ";gTWUZlsAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Khashayar_Gatmiry1;~Stefanie_Jegelka3;~Jonathan_Kelner1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ngatmiry2022optimization,\ntitle={Optimization and Adaptive Generalization of Three layer Neural Networks},\nauthor={Khashayar Gatmiry and Stefanie Jegelka and Jonathan Kelner},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dPyRNUlttBv}\n}", "github": "", "project": "", "reviewers": "NvLr;aHSb;nxBT;MU3r", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;1;3", "correctness": "4;3;4;4", "technical_novelty": "4;4;4;3", "empirical_novelty": "0;1;0;0", "wc_summary_paper": "64;128;103;77", "wc_summary_review": "62;89;28;73", "wc_main_review": "646;184;222;200", "wc_review": "772;401;353;350", "wc_reply_reviewers": "192;0;0;0", "wc_reply_authors": "1051;84;45;162", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 2.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 93.0, 24.606909598728564 ], "wc_summary_review_avg": [ 63.0, 22.371857321197094 ], "wc_main_review_avg": [ 313.0, 192.73038162158036 ], "wc_review_avg": [ 469.0, 176.10366265356322 ], "wc_reply_reviewers_avg": [ 48.0, 83.13843876330611 ], "wc_reply_authors_avg": [ 335.5, 415.23637846412254 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4433356447323076923&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=dPyRNUlttBv", "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Controlling the Complexity and Lipschitz Constant improves Polynomial Nets", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5906", "id": "dQ7Cy_ndl1s", "poster": "", "openreview": "https://openreview.net/forum?id=dQ7Cy_ndl1s", "slides": "https://iclr.cc/virtual/2022/poster/5906", "video": 
"https://iclr.cc/virtual/2022/poster/5906", "author_site": "Zhenyu Zhu, Fabian Latorre, Grigorios Chrysos, Volkan Cevher", "tldr": "", "abstract": "While the class of Polynomial Nets demonstrates comparable performance to neural networks (NN), it currently has neither theoretical generalization characterization nor robustness guarantees. To this end, we derive new complexity bounds for the set of Coupled CP-Decomposition (CCP) and Nested Coupled CP-decomposition (NCP) models of Polynomial Nets in terms of the $\\ell_\\infty$-operator-norm and the $\\ell_2$-operator norm. In addition, we derive bounds on the Lipschitz constant for both models to establish a theoretical certificate for their robustness. The theoretical results enable us to propose a principled regularization scheme that we also evaluate experimentally and show that it improves the accuracy as well as the robustness of the models to adversarial perturbations. We showcase how this regularization can be combined with adversarial training, resulting in further improvements.", "keywords": "Polynomial Nets;Rademacher Complexity;Lipschitz constant;Coupled CP decomposition", "primary_area": "", "supplementary_material": "", "author": "Zhenyu Zhu;Fabian Latorre;Grigorios Chrysos;Volkan Cevher", "authorids": "~Zhenyu_Zhu1;~Fabian_Latorre1;~Grigorios_Chrysos1;~Volkan_Cevher1", "gender": "M;M;M;M", "homepage": "https://zhuzhenyu1997.github.io/;https://fabianlatorre.com;https://grigorisg9gr.github.io/;http://lions.epfl.ch", "dblp": ";244/9638;75/6117-2;70/5301", "google_scholar": "rft3OB4AAAAJ;B46S5NwAAAAJ;1bU041kAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;;", "linkedin": "zhenyu-zhu-045471139/;;;", "or_profile": "~Zhenyu_Zhu1;~Fabian_Latorre1;~Grigorios_Chrysos1;~Volkan_Cevher1", "aff": "Swiss Federal Institute of Technology Lausanne;SalesForce.com;Swiss Federal Institute of Technology Lausanne;Swiss Institute of Technology", "aff_domain": "epfl.ch;salesforce.com;epfl.ch;epfl.ch", "position": "MS student;Intern;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nzhu2022controlling,\ntitle={Controlling the Complexity and Lipschitz Constant improves Polynomial Nets},\nauthor={Zhenyu Zhu and Fabian Latorre and Grigorios Chrysos and Volkan Cevher},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dQ7Cy_ndl1s}\n}", "github": "", "project": "", "reviewers": "GLBZ;opuA;SK73;7Q5M", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;3;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "41;88;105;130", "wc_summary_review": "22;29;98;120", "wc_main_review": "343;260;172;326", "wc_review": "406;377;375;576", "wc_reply_reviewers": "46;117;0;0", "wc_reply_authors": "1093;2584;331;354", "reply_reviewers": "1;1;0;0", "reply_authors": "4;5;2;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 91.0, 32.50384592629001 ], "wc_summary_review_avg": [ 67.25, 42.540421953713626 ], "wc_main_review_avg": [ 275.25, 67.1914243039988 ], "wc_review_avg": [ 433.5, 83.1820293092204 ], "wc_reply_reviewers_avg": [ 40.75, 47.861127232859864 ], "wc_reply_authors_avg": [ 1090.5, 915.1258110227249 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], 
"replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5179107583328363512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=dQ7Cy_ndl1s", "email": "epfl.ch;salesforce.com;epfl.ch;epfl.ch", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Salesforce;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.salesforce.com;https://www.ethz.ch", "aff_unique_abbr": "EPFL;Salesforce;ETH Zurich", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Switzerland;United States" }, { "id": "dS3AxHZkrZT", "title": "You May Need both Good-GAN and Bad-GAN for Anomaly Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative adversarial nets (GAN) have been successfully adapted for anomaly detection, where end-to-end anomaly scoring by so-called Bad-GAN has shown promising results. A Bad-GAN generates pseudo anomalies at the low-density area of inlier distribution, and thus the inlier/outlier distinction can be approximated. However, the generated pseudo anomalies from existing Bad-GAN approaches may (1) converge to certain patterns with limited diversity, and (2) differ from the real anomalies, making the anomaly detection hard to generalize. In this work, we propose a new model called Taichi-GAN to address the aforementioned issues of a conventional Bad-GAN. First, a new orthogonal loss is proposed to regularize the cosine distance of decentralized generated samples in a Bad-GAN. Second, we utilize few anomaly samples (when available) with a conventional GAN, i.e., so-called Good-GAN, to draw the generated pseudo anomalies closer to the real anomalies. Our Taichi-GAN incorporates Good-GAN and Bad-GAN in an adversarial manner; which generates pseudo anomalies that contributing to a more robust discriminator for anomaly scoring, and thus anomaly detection. Substantial improvements can be observed from our proposed model on multiple simulated and real-life anatomy detection tasks.", "keywords": "Anomaly Detection;GAN;Orthogonal Regularization;Bad-GAN", "primary_area": "", "supplementary_material": "", "author": "Riqiang Gao;Zhoubing Xu;Guillaume Chabin;Awais Mansoor;Florin-Cristian Ghesu;Bogdan Georgescu;Bennett A. 
Landman;Sasa Grbic", "authorids": "~Riqiang_Gao1;~Zhoubing_Xu1;guillaume.chabin@siemens-healthineers.com;awais.mansoor@siemens-healthineers.com;~Florin-Cristian_Ghesu1;bogdan.georgescu@siemens-healthineers.com;~Bennett_A._Landman1;sasa.grbic@siemens-healthineers.com", "gender": "M;M;;;M;;M;", "homepage": "https://riqianggao.github.io/;;;;;;https://my.vanderbilt.edu/masi/;", "dblp": "169/7226;;;;;;74/1206;", "google_scholar": "VjI_dtUAAAAJ;fLQ3qzAAAAAJ;;;https://scholar.google.co.uk/citations?user=Z1-KZ8RoM6YC;;tmTcH0QAAAAJ;", "orcid": "0000-0002-8729-1941;;;;;;0000-0001-5733-2127;", "linkedin": "riqiang-gao-97223b119/;;;;;;bennett-landman-96129/;", "or_profile": "~Riqiang_Gao1;~Zhoubing_Xu1;guillaume.chabin@siemens-healthineers.com;awais.mansoor@siemens-healthineers.com;~Florin-Cristian_Ghesu1;bogdan.georgescu@siemens-healthineers.com;~Bennett_A._Landman1;sasa.grbic@siemens-healthineers.com", "aff": "Vanderbilt University;Siemens Healthineers;;;Siemens Healthineers;;Vanderbilt University;", "aff_domain": "vanderbilt.edu;siemens-healthineers.com;;;siemens-healthineers.com;;vanderbilt.edu;", "position": "PhD student;Research Scientist;;;AI Research Scientist;;Full Professor;", "bibtex": "@misc{\ngao2022you,\ntitle={You May Need both Good-{GAN} and Bad-{GAN} for Anomaly Detection},\nauthor={Riqiang Gao and Zhoubing Xu and Guillaume Chabin and Awais Mansoor and Florin-Cristian Ghesu and Bogdan Georgescu and Bennett A. Landman and Sasa Grbic},\nyear={2022},\nurl={https://openreview.net/forum?id=dS3AxHZkrZT}\n}", "github": "", "project": "", "reviewers": "h6RB;LRLn;HJ5T;83KL", "site": "https://openreview.net/forum?id=dS3AxHZkrZT", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "84;57;139;78", "wc_summary_review": "115;37;26;65", "wc_main_review": "418;408;129;218", "wc_review": "617;502;294;361", "wc_reply_reviewers": "239;146;0;0", "wc_reply_authors": "1002;1032;334;432", "reply_reviewers": "1;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 89.5, 30.286135441815617 ], "wc_summary_review_avg": [ 60.75, 34.39749264117953 ], "wc_main_review_avg": [ 293.25, 123.86560256988217 ], "wc_review_avg": [ 443.5, 125.18086914540896 ], "wc_reply_reviewers_avg": [ 96.25, 101.71129484968716 ], "wc_reply_authors_avg": [ 700.0, 319.06425685118666 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Uziu012CSjEJ:scholar.google.com/&scioq=You+May+Need+both+Good-GAN+and+Bad-GAN+for+Anomaly+Detection&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Vanderbilt University;Siemens Healthineers", "aff_unique_dep": ";", "aff_unique_url": "https://www.vanderbilt.edu;https://www.siemens-healthineers.com", "aff_unique_abbr": "Vanderbilt;Siemens Healthineers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;Germany" }, { "title": "High Probability Bounds for a Class of 
Nonconvex Algorithms with AdaGrad Stepsize", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7173", "id": "dSw0QtRMJkO", "poster": "", "openreview": "https://openreview.net/forum?id=dSw0QtRMJkO", "slides": "https://iclr.cc/virtual/2022/poster/7173", "video": "https://iclr.cc/virtual/2022/poster/7173", "author_site": "Ali Kavis, Kfir Y Levy, Volkan Cevher", "tldr": "", "abstract": "In this paper, we propose a new, simplified high probability analysis of AdaGrad for smooth, non-convex problems. \nMore specifically, we focus on a particular accelerated gradient (AGD) template (Lan, 2020), through which we recover the original AdaGrad and its variant with averaging, and prove a convergence rate of $\\mathcal O (1/ \\sqrt{T})$ with high probability without the knowledge of smoothness and variance. \nWe use a particular version of Freedman's concentration bound for martingale difference sequences (Kakade & Tewari, 2008) which enables us to achieve the best-known dependence of $\\log (1 / \\delta )$ on the probability margin $\\delta$. \nWe present our analysis in a modular way and obtain a complementary $\\mathcal O (1 / T)$ convergence rate in the deterministic setting. \nTo the best of our knowledge, this is the first high probability result for AdaGrad with a truly adaptive scheme, i.e., completely oblivious to the knowledge of smoothness and uniform variance bound, which simultaneously has the best-known dependence of $\\log( 1/ \\delta)$. \nWe further prove the noise adaptation property of AdaGrad under additional noise assumptions.", "keywords": "adaptive methods;nonconvex optimization;stochastic optimization;high probability bounds", "primary_area": "", "supplementary_material": "", "author": "Ali Kavis;Kfir Yehuda Levy;Volkan Cevher", "authorids": "~Ali_Kavis1;~Kfir_Yehuda_Levy1;~Volkan_Cevher1", "gender": ";M;M", "homepage": "https://alikavis.github.io;http://kfiryehud.wixsite.com/kfir-y-levy;http://lions.epfl.ch", "dblp": "231/7697;83/11388;70/5301", "google_scholar": "sPrPq6oAAAAJ;;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ali_Kavis1;~Kfir_Yehuda_Levy1;~Volkan_Cevher1", "aff": "Swiss Federal Institute of Technology Lausanne;Technion - Israel Institute of Technology, Technion;Swiss Institute of Technology", "aff_domain": "epfl.ch;technion.ac.il;epfl.ch", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nkavis2022high,\ntitle={High Probability Bounds for a Class of Nonconvex Algorithms with AdaGrad Stepsize},\nauthor={Ali Kavis and Kfir Yehuda Levy and Volkan Cevher},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dSw0QtRMJkO}\n}", "github": "", "project": "", "reviewers": "BF1x;goz1;GqwP", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;3", "correctness": "4;4;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "105;157;28", "wc_summary_review": "20;81;25", "wc_main_review": "384;386;198", "wc_review": "509;624;251", "wc_reply_reviewers": "139;46;123", "wc_reply_authors": "755;731;1071", "reply_reviewers": "1;1;1", "reply_authors": "1;1;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], 
"wc_summary_paper_avg": [ 96.66666666666667, 52.99266196580638 ], "wc_summary_review_avg": [ 42.0, 27.65260686927485 ], "wc_main_review_avg": [ 322.6666666666667, 88.15642662651179 ], "wc_review_avg": [ 461.3333333333333, 155.9622461438095 ], "wc_reply_reviewers_avg": [ 102.66666666666667, 40.59830319388019 ], "wc_reply_authors_avg": [ 852.3333333333334, 154.93080892952037 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14815503456596540696&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=dSw0QtRMJkO", "email": "epfl.ch;technion.ac.il;epfl.ch", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Technion - Israel Institute of Technology;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.technion.ac.il;https://www.ethz.ch", "aff_unique_abbr": "EPFL;Technion;ETH Zurich", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;Israel" }, { "title": "Knowledge Removal in Sampling-based Bayesian Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6020", "id": "dTqOcTUOQO", "poster": "", "openreview": "https://openreview.net/forum?id=dTqOcTUOQO", "slides": "https://iclr.cc/virtual/2022/poster/6020", "video": "https://iclr.cc/virtual/2022/poster/6020", "author_site": "Shaopeng Fu, Fengxiang He, Dacheng Tao", "tldr": "", "abstract": "The right to be forgotten has been legislated in many countries, but its enforcement in the AI industry would cause unbearable costs. When single data deletion requests come, companies may need to delete the whole models learned with massive resources. Existing works propose methods to remove knowledge learned from data for explicitly parameterized models, which however are not appliable to the sampling-based Bayesian inference, {\\it i.e.}, Markov chain Monte Carlo (MCMC), as MCMC can only infer implicit distributions. In this paper, we propose the first machine unlearning algorithm for MCMC. We first convert the MCMC unlearning problem into an explicit optimization problem. Based on this problem conversion, an {\\it MCMC influence function} is designed to provably characterize the learned knowledge from data, which then delivers the MCMC unlearning algorithm. Theoretical analysis shows that MCMC unlearning would not compromise the generalizability of the MCMC models. Experiments on Gaussian mixture models and Bayesian neural networks confirm the effectiveness of the proposed algorithm. 
The code is available at \\url{https://github.com/fshp971/mcmc-unlearning}.", "keywords": "Bayesian inference;Markov chain Monte Carlo;machine unlearning", "primary_area": "", "supplementary_material": "/attachment/331250a7e3bdf071d5d4598ff8f4ab72e4c4e6e9.zip", "author": "Shaopeng Fu;Fengxiang He;Dacheng Tao", "authorids": "~Shaopeng_Fu1;~Fengxiang_He1;~Dacheng_Tao1", "gender": ";;", "homepage": "https://shaopengfu.me;https://fengxianghe.github.io/;", "dblp": "278/8181;225/4682;", "google_scholar": "i7cWm4gAAAAJ;QSx-Yu0AAAAJ;", "orcid": ";;", "linkedin": ";fengxiang-he-35b173122;", "or_profile": "~Shaopeng_Fu1;~Fengxiang_He1;~Dacheng_Tao1", "aff": "JD Explore Academy;JD.com, Inc.;", "aff_domain": "jd.com;jd.com;", "position": "Researcher;Algorithm Scientist;", "bibtex": "@inproceedings{\nfu2022knowledge,\ntitle={Knowledge Removal in Sampling-based Bayesian Inference},\nauthor={Shaopeng Fu and Fengxiang He and Dacheng Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dTqOcTUOQO}\n}", "github": "", "project": "", "reviewers": "2pj7;b83d;VQTF;uE21", "pdf_size": 0, "recommendation": "3;8;8;8", "confidence": "3;2;2;3", "correctness": "4;4;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "2;0;3;4", "wc_summary_paper": "63;126;101;70", "wc_summary_review": "76;57;30;15", "wc_main_review": "356;234;218;130", "wc_review": "495;417;349;215", "wc_reply_reviewers": "0;0;24;0", "wc_reply_authors": "1164;608;1028;11", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;3;1", "recommendation_avg": [ 6.75, 2.165063509461097 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 90.0, 25.228951623085727 ], "wc_summary_review_avg": [ 44.5, 23.606143268225754 ], "wc_main_review_avg": [ 234.5, 80.5527777298834 ], "wc_review_avg": [ 369.0, 102.82995672468213 ], "wc_reply_reviewers_avg": [ 6.0, 10.392304845413264 ], "wc_reply_authors_avg": [ 702.75, 448.8971903454064 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3535045679170951379&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=dTqOcTUOQO", "email": "jd.com;jd.com;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "JD;JD.com", "aff_unique_dep": "JD Explore Academy;", "aff_unique_url": ";https://www.jd.com", "aff_unique_abbr": ";JD.com", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";China" }, { "id": "dUHgnS1Tu13", "title": "Local-Global Shifting Vision Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent work has shown the potential of transformers for computer vision applications. An image is first partitioned into patches, which are then used as input tokens for the attention mechanism. Due to the expensive quadratic cost of the attention mechanism, either a large patch size is used, resulting in coarse-grained global interactions, or alternatively, attention is applied only on a local region of the image at the expense of long-range interactions. 
In this work, we propose an approach that allows for both coarse global interactions and fine-grained local interactions simultaneously. At the core of our method is the application of local and global attention layers. In the local attention layer, we apply attention to each patch and its local shifts, resulting in virtually located local patches, which are not bound to a single, specific location. These virtually located patches are then used in global attention layers, where global coarse interactions are learned, using a pyramid of attention layers applied on decreasing resolution inputs. The separation of the attention layer into local and global counterparts allows for a low computational cost in the number of patches, while still supporting data-dependent localization, as opposed to the static positioning in other visual transformers. Our method is shown to be superior to both convolutional and transformer-based methods for image classification on CIFAR10, CIFAR100, and ImageNet.", "keywords": "Visual transformers", "primary_area": "", "supplementary_material": "", "author": "Shelly Sheynin;Sagie Benaim;Adam Polyak;Lior Wolf", "authorids": "~Shelly_Sheynin1;~Sagie_Benaim1;~Adam_Polyak1;~Lior_Wolf1", "gender": "F;M;;M", "homepage": ";https://sagiebenaim.github.io/;;http://www.cs.tau.ac.il/~wolf", "dblp": "291/4069;129/1316;;83/4103", "google_scholar": "7vLSIswAAAAJ;-zSM2I8AAAAJ;;UbFrXTsAAAAJ", "orcid": ";0000-0003-0002-3467;;0000-0001-5578-8892", "linkedin": "shelly-sheynin-40663a116?originalSubdomain=il;sagie-benaim-aab47474/;;", "or_profile": "~Shelly_Sheynin1;~Sagie_Benaim1;~Adam_Polyak1;~Lior_Wolf1", "aff": "META;Tel Aviv University;;Tel Aviv University", "aff_domain": "meta.com;tau.ac.il;;tau.ac.il", "position": "Researcher;PhD student;;Full Professor", "bibtex": "@misc{\nsheynin2022localglobal,\ntitle={Local-Global Shifting Vision Transformers},\nauthor={Shelly Sheynin and Sagie Benaim and Adam Polyak and Lior Wolf},\nyear={2022},\nurl={https://openreview.net/forum?id=dUHgnS1Tu13}\n}", "github": "", "project": "", "reviewers": "UiaB;7Sp9;duoi", "site": "https://openreview.net/forum?id=dUHgnS1Tu13", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;2", "correctness": "3;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "1;1;2", "wc_summary_paper": "98;55;83", "wc_summary_review": "87;37;42", "wc_main_review": "619;157;84", "wc_review": "804;249;209", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.66666666666667, 17.82008853949821 ], "wc_summary_review_avg": [ 55.333333333333336, 22.484562605386735 ], "wc_main_review_avg": [ 286.6666666666667, 236.87737099370403 ], "wc_review_avg": [ 420.6666666666667, 271.54905429569976 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12512994796351779086&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "1;1", "aff_unique_norm": ";Tel 
Aviv University", "aff_unique_dep": ";", "aff_unique_url": ";https://www.tau.ac.il", "aff_unique_abbr": ";TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";Israel" }, { "title": "Revisiting Over-smoothing in BERT from the Perspective of Graph", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6118", "id": "dUV91uaXm3", "poster": "", "openreview": "https://openreview.net/forum?id=dUV91uaXm3", "slides": "https://iclr.cc/virtual/2022/poster/6118", "video": "https://iclr.cc/virtual/2022/poster/6118", "author_site": "Han Shi, JIAHUI GAO, Hang Xu, Xiaodan Liang, Zhenguo Li, Lingpeng Kong, Stephen Lee, James Kwok", "tldr": "", "abstract": "Recently, the over-smoothing phenomenon of Transformer-based models has been observed in both vision and language fields. However, no existing work has delved deeper to further investigate the main cause of this phenomenon. In this work, we make the attempt to analyze the over-smoothing problem from the perspective of graphs, where such a problem was first discovered and explored. Intuitively, the self-attention matrix can be seen as a normalized adjacency matrix of a corresponding graph. Based on the above connection, we provide some theoretical analysis and find that layer normalization plays a key role in the over-smoothing issue of Transformer-based models. Specifically, if the standard deviation of layer normalization is sufficiently large, the output of Transformer stacks will converge to a specific low-rank subspace and result in over-smoothing. To alleviate the over-smoothing problem, we consider hierarchical fusion strategies, which combine the representations from different layers adaptively to make the output more diverse. Extensive experimental results on various data sets illustrate the effect of our fusion method.", "keywords": "BERT;Over-smoothing;Transformer", "primary_area": "", "supplementary_material": "", "author": "Han Shi;JIAHUI GAO;Hang Xu;Xiaodan Liang;Zhenguo Li;Lingpeng Kong;Stephen M. S. Lee;James Kwok", "authorids": "~Han_Shi1;~JIAHUI_GAO1;~Hang_Xu1;~Xiaodan_Liang2;~Zhenguo_Li1;~Lingpeng_Kong1;~Stephen_M._S._Lee1;~James_Kwok1", "gender": "M;;M;F;M;M;;", "homepage": "https://han-shi.github.io/;;;https://www.sysu-hcp.net/;http://www.ee.columbia.edu/~zgli/;https://ikekonglp.github.io/;;", "dblp": ";;;;23/6479;144/7656;;", "google_scholar": "https://scholar.google.com.hk/citations?user=Johp_14AAAAJ;;https://scholar.google.com.hk/citations?user=J_8TX6sAAAAJ;voxznZAAAAAJ;XboZC1AAAAAJ;f1hBi5wAAAAJ;;", "orcid": ";;0000-0003-3645-8972;;;;0000-0003-0482-5272;", "linkedin": ";;;;;;;", "or_profile": "~Han_Shi1;~JIAHUI_GAO1;~Hang_Xu1;~Xiaodan_Liang2;~Zhenguo_Li1;~Lingpeng_Kong1;~Stephen_M._S._Lee1;~James_Kwok1", "aff": "Department of Computer Science and Engineering, The Hong Kong University of Science and Technology;;Huawei Noah\u2018s Ark Lab;SUN YAT-SEN UNIVERSITY;Huawei Noah's Ark Lab;Department of Computer Science, The University of Hong Kong;The University of Hong Kong;", "aff_domain": "cse.ust.hk;;huawei.com;sysu.edu.cn;huawei.com;cs.hku.hk;hku.hk;", "position": "PhD student;;Researcher;Associate Professor;Principal Researcher;Assistant Professor;Full Professor;", "bibtex": "@inproceedings{\nshi2022revisiting,\ntitle={Revisiting Over-smoothing in {BERT} from the Perspective of Graph},\nauthor={Han Shi and JIAHUI GAO and Hang Xu and Xiaodan Liang and Zhenguo Li and Lingpeng Kong and Stephen M. S. 
Lee and James Kwok},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dUV91uaXm3}\n}", "github": "", "project": "", "reviewers": "8SR3;9nYH;M9jn;g9XE", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;3", "correctness": "4;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "34;128;107;132", "wc_summary_review": "31;59;41;71", "wc_main_review": "110;218;247;168", "wc_review": "175;405;395;371", "wc_reply_reviewers": "16;114;0;0", "wc_reply_authors": "383;563;244;462", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.25, 39.410499869958514 ], "wc_summary_review_avg": [ 50.5, 15.5161206491829 ], "wc_main_review_avg": [ 185.75, 52.06906471216859 ], "wc_review_avg": [ 336.5, 94.05716346988144 ], "wc_reply_reviewers_avg": [ 32.5, 47.50526286633934 ], "wc_reply_authors_avg": [ 413.0, 116.57829986751393 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=733392411162340551&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=dUV91uaXm3", "email": "cse.ust.hk;;huawei.com;sysu.edu.cn;huawei.com;cs.hku.hk;hku.hk;", "author_num": 8, "aff_unique_index": "0;1;2;1;3;3", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei;Sun Yat-sen University;University of Hong Kong", "aff_unique_dep": "Department of Computer Science and Engineering;Noah's Ark Lab;;Department of Computer Science", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com;http://www.sysu.edu.cn;https://www.hku.hk", "aff_unique_abbr": "HKUST;Huawei;SYSU;HKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "dXPou9HkXcZ", "title": "Spatiotemporal Characterization of Gait from Monocular Videos with Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Human pose estimation from monocular video is a rapidly advancing field that offers great promise to human movement science and rehabilitation. This potential is tempered by the smaller body of work ensuring the outputs are clinically meaningful and properly calibrated. Gait analysis, typically performed in a dedicated lab, produces precise measurements including kinematics and step timing. Using more than 9000 monocular videos from an instrumented gait analysis lab, we evaluated the performance of existing algorithms for measuring kinematics. While they produced plausible results that resemble walking, the joint angles and step length were noisy and poorly calibrated. We trained a transformer to map 3D joint location sequences and the height of individuals onto interpretable biomechanical outputs including joint kinematics and phase within the gait cycle. This task-specific layer greatly reduced errors in the kinematics of the hip, knee and foot, and accurately detected the timing of foot down and up events. 
We show, for the first time, that accurate spatiotemporal gait parameters including walking speed, step length, cadence, double support time, and single support time can be computed on a cycle-by-cycle basis from these interpretable outputs. Our results indicate lifted 3D joint locations contain enough information for gait analysis, but their representation is not biomechanically accurate enough to use directly, suggesting room for improvement in existing algorithms.", "keywords": "human pose estimation;explainable AI;kinematics;gait;rehabilitation;activity recognition", "primary_area": "", "supplementary_material": "/attachment/8539f4007b5423e5f987c0f85e5f9d2dc18ac7c5.zip", "author": "R. James Cotton;Emoonah McClerklin;Anthony Cimorelli;Ankit Patel", "authorids": "~R._James_Cotton1;~Emoonah_McClerklin1;acimorelli@sralab.org;~Ankit_Patel1", "gender": ";F;;", "homepage": ";https://www.sralab.org/researchers/emoonah-mcclerklin-ba;;http://ankitlab.co/", "dblp": ";;;99/646", "google_scholar": ";;;Gbe5UncAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~R._James_Cotton1;~Emoonah_McClerklin1;acimorelli@sralab.org;~Ankit_Patel1", "aff": ";;;Rice University", "aff_domain": ";;;rice.edu", "position": ";;;Assistant Professor", "bibtex": "@misc{\ncotton2022spatiotemporal,\ntitle={Spatiotemporal Characterization of Gait from Monocular Videos with Transformers},\nauthor={R. James Cotton and Emoonah McClerklin and Anthony Cimorelli and Ankit Patel},\nyear={2022},\nurl={https://openreview.net/forum?id=dXPou9HkXcZ}\n}", "github": "", "project": "", "reviewers": "5G2k;Eope;J8mv;Hp5E", "site": "https://openreview.net/forum?id=dXPou9HkXcZ", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;5;4", "correctness": "3;4;2;2", "technical_novelty": "2;1;1;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "86;175;60;53", "wc_summary_review": "33;69;87;26", "wc_main_review": "245;274;167;167", "wc_review": "364;518;314;246", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "281;242;153;319", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 93.5, 48.63383595810637 ], "wc_summary_review_avg": [ 53.75, 25.193004981542 ], "wc_main_review_avg": [ 213.25, 47.37285615202022 ], "wc_review_avg": [ 360.5, 100.11368537817394 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 248.75, 61.621323419738395 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10706625957515787792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Rice University", "aff_unique_dep": "", "aff_unique_url": "https://www.rice.edu", "aff_unique_abbr": "Rice", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "dYUdt59fJ0e", "title": "Yformer: U-Net Inspired Transformer Architecture for Far Horizon Time Series Forecasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Time series data is ubiquitous in research as well as in a wide variety of industrial applications. 
Effectively analyzing the available historical data and providing insights into the far future allows us to make effective decisions. Recent research has witnessed the superior performance of transformer-based architectures, especially in the regime of far horizon time series forecasting. However, the current state-of-the-art sparse Transformer architectures fail to couple down- and upsampling procedures to produce outputs at a resolution similar to the input. We propose the Yformer model, based on a novel Y-shaped encoder-decoder architecture that (1) uses a direct connection from the downscaled encoder layer to the corresponding upsampled decoder layer in a U-Net inspired architecture, (2) combines the downscaling/upsampling with sparse attention to capture long-range effects, and (3) stabilizes the encoder-decoder stacks with the addition of an auxiliary reconstruction loss. Extensive experiments have been conducted with relevant baselines on four benchmark datasets, demonstrating an average improvement of 19.82 and 18.41 percent in MSE and 13.62 and 11.85 percent in MAE in comparison to the current state of the art for the univariate and the multivariate settings respectively.", "keywords": "Time Series Forecasting;U-Net;Transformers", "primary_area": "", "supplementary_material": "/attachment/c84631ebc470f75d6bab3221f28375dd24245d82.zip", "author": "Kiran Madhusudhanan;Johannes Burchert;Nghia Duong-Trung;Stefan Born;Lars Schmidt-Thieme", "authorids": "~Kiran_Madhusudhanan1;burchert@ismll.uni-hildesheim.de;nghia.duong-trung@tu-berlin.de;born@math.tu-berlin.de;~Lars_Schmidt-Thieme1", "gender": "M;;;;M", "homepage": "https://www.ismll.uni-hildesheim.de/personen/madhusud_en.html;;;;https://www.ismll.uni-hildesheim.de/personen/lst_en.html", "dblp": "299/1336;;;;s/LarsSchmidtThieme", "google_scholar": "nPCo70UAAAAJ;;;;https://scholar.google.de/citations?user=l3taTdYAAAAJ", "orcid": "0000-0001-6356-8646;;;;0000-0001-5729-6023", "linkedin": ";;;;", "or_profile": "~Kiran_Madhusudhanan1;burchert@ismll.uni-hildesheim.de;nghia.duong-trung@tu-berlin.de;born@math.tu-berlin.de;~Lars_Schmidt-Thieme1", "aff": "University of Hildesheim;;;;University of Hildesheim", "aff_domain": "uni-hildesheim.de;;;;uni-hildesheim.de", "position": "PhD student;;;;Full Professor", "bibtex": "@misc{\nmadhusudhanan2022yformer,\ntitle={Yformer: U-Net Inspired Transformer Architecture for Far Horizon Time Series Forecasting},\nauthor={Kiran Madhusudhanan and Johannes Burchert and Nghia Duong-Trung and Stefan Born and Lars Schmidt-Thieme},\nyear={2022},\nurl={https://openreview.net/forum?id=dYUdt59fJ0e}\n}", "github": "", "project": "", "reviewers": "HmTV;yrSd;yQSB;p7jA", "site": "https://openreview.net/forum?id=dYUdt59fJ0e", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;4;3", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "94;110;48;83", "wc_summary_review": "31;31;2;17", "wc_main_review": "295;530;329;276", "wc_review": "420;671;379;376", "wc_reply_reviewers": "49;0;0;54", "wc_reply_authors": "599;0;0;0", "reply_reviewers": "1;0;0;1", "reply_authors": "1;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 83.75, 22.76373211931646 ], "wc_summary_review_avg": [ 20.25, 11.986972094736853 ], "wc_main_review_avg": [ 357.5, 101.3866362002409 ], 
"wc_review_avg": [ 461.5, 122.19758590086795 ], "wc_reply_reviewers_avg": [ 25.75, 25.810608284191986 ], "wc_reply_authors_avg": [ 149.75, 259.3746084334394 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11647165354441728179&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "University of Hildesheim", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-hildesheim.de/", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Relational Surrogate Loss Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5918", "id": "dZPgfwaTaXv", "poster": "", "openreview": "https://openreview.net/forum?id=dZPgfwaTaXv", "slides": "https://iclr.cc/virtual/2022/poster/5918", "video": "https://iclr.cc/virtual/2022/poster/5918", "author_site": "Tao Huang, Zekang Li, Hua Lu, Yong Shan, Shusheng Yang, Yang Feng, Fei Wang, Shan You, Chang Xu", "tldr": "", "abstract": "Evaluation metrics in machine learning are often hardly taken as loss functions, as they could be non-differentiable and non-decomposable, e.g., average precision and F1 score. This paper aims to address this problem by revisiting the surrogate loss learning, where a deep neural network is employed to approximate the evaluation metrics. Instead of pursuing an exact recovery of the evaluation metric through a deep neural network, we are reminded of the purpose of the existence of these evaluation metrics, which is to distinguish whether one model is better or worse than another. In this paper, we show that directly maintaining the relation of models between surrogate losses and metrics suffices, and propose a rank correlation-based optimization method to maximize this relation and learn surrogate losses. Compared to previous works, our method is much easier to optimize and enjoys significant efficiency and performance gains. Extensive experiments show that our method achieves improvements on various tasks including image classification and neural machine translation, and even outperforms state-of-the-art methods on human pose estimation and machine reading comprehension tasks. 
Code is available at: https://github.com/hunto/ReLoss.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tao Huang;Zekang Li;Hua Lu;Yong Shan;Shusheng Yang;Yang Feng;Fei Wang;Shan You;Chang Xu", "authorids": "~Tao_Huang5;~Zekang_Li1;~Hua_Lu2;~Yong_Shan1;~Shusheng_Yang1;~Yang_Feng4;~Fei_Wang9;~Shan_You3;~Chang_Xu4", "gender": "M;M;M;;M;;M;M;", "homepage": "https://taohuang.info;https://zekangli.com/;;https://scholar.google.com/citations?user=b0xZPMsAAAAJ&hl=zh-CN;https://shushengyang.com;http://people.ucas.edu.cn/~yangfeng?language=en;;https://shanyou92.github.io/;https://sydney.edu.au/engineering/about/our-people/academic-staff/c-xu.html", "dblp": "34/808-20;243/2436;;52/9451;290/1972;07/6095-4.html;;179/2548;97/2966-2", "google_scholar": "jkcRdBgAAAAJ;ZmfOwN8AAAAJ;;b0xZPMsAAAAJ;v6dmW5cntoMC;https://scholar.google.com/citations?hl=en;ljt16JkAAAAJ;https://scholar.google.com/citations?hl=en;N4F_3eoAAAAJ", "orcid": ";;;;;;;0000-0003-1964-0430;0000-0002-4756-0609", "linkedin": ";;;;shushengyang/;;;;", "or_profile": "~Tao_Huang5;~Zekang_Li1;~Hua_Lu2;~Yong_Shan1;~Shusheng_Yang1;~Yang_Feng4;~Fei_Wang9;~Shan_You3;~Charles_Xu1", "aff": "SenseTime Research;Institute Of Computing Technology, Chinese Academy of Sciences;Huazhong University of Science and Technology, Tsinghua University;Wechat AI, Tencent Inc;;Institute of Computing Technology, Chinese Academy of Sciences;University of Science and Technology of China;SenseTime Research;University of Sydney", "aff_domain": "sensetime.com;ict.ac.cn;hust.edu.cn;tencent.com;;ict.ac.cn;mail.ustc.edu.cn;sensetime.com;sydney.edu.au", "position": "Researcher;MS student;Undergrad student;Researcher;;Full Professor;PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nhuang2022relational,\ntitle={Relational Surrogate Loss Learning},\nauthor={Tao Huang and Zekang Li and Hua Lu and Yong Shan and Shusheng Yang and Yang Feng and Fei Wang and Shan You and Chang Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dZPgfwaTaXv}\n}", "github": "", "project": "", "reviewers": "eBZN;L49G;Y6nu", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;4", "correctness": "4;4;4", "technical_novelty": "4;4;4", "empirical_novelty": "4;4;3", "wc_summary_paper": "75;61;32", "wc_summary_review": "26;38;38", "wc_main_review": "125;267;206", "wc_review": "226;366;276", "wc_reply_reviewers": "115;17;0", "wc_reply_authors": "891;180;317", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 4.0, 0.0 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 56.0, 17.90716802475106 ], "wc_summary_review_avg": [ 34.0, 5.656854249492381 ], "wc_main_review_avg": [ 199.33333333333334, 58.16260730820867 ], "wc_review_avg": [ 289.3333333333333, 57.92715732327589 ], "wc_reply_reviewers_avg": [ 44.0, 50.68201521907615 ], "wc_reply_authors_avg": [ 462.6666666666667, 307.9981962429145 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10424444268949840679&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=dZPgfwaTaXv", "email": "sensetime.com;ict.ac.cn;hust.edu.cn;tencent.com;;ict.ac.cn;mail.ustc.edu.cn;sensetime.com;sydney.edu.au", "author_num": 9, "aff_unique_index": "0;1;2;3;1;4;0;5", "aff_unique_norm": "SenseTime;Chinese Academy of Sciences;Huazhong University of Science and Technology;Tencent;University of Science and Technology of China;University of Sydney", "aff_unique_dep": "SenseTime Research;Institute Of Computing Technology;;Wechat AI;;", "aff_unique_url": "https://www.sensetime.com;http://www.ict.ac.cn;http://www.hust.edu.cn;https://www.tencent.com;http://www.ustc.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "SenseTime;CAS;HUST;Tencent;USTC;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;1", "aff_country_unique": "China;Australia" }, { "id": "dZTJQdXh3Gw", "title": "ImageNet as a Representative Basis for Deriving Generally Effective CNN Architectures", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We investigate and improve the representativeness of ImageNet as a basis for deriving generally effective convolutional neural network (CNN) architectures that perform well on a diverse set of datasets and application domains. To this end, we conduct an extensive empirical study for which we train 500 CNN architectures, sampled from the broad AnyNetX design space, on ImageNet as well as 8 other image classification datasets. We observe that the performances of the architectures are highly dataset dependent. Some datasets even exhibit a negative error correlation with ImageNet across all architectures. We show how to significantly increase these correlations by utilizing ImageNet subsets restricted to fewer classes. We also identify the cumulative width across layers as well as the total depth of the network as the most sensitive design parameter with respect to changing datasets. 
", "keywords": "ImageNet;CNN design;dataset representativeness;empirical study", "primary_area": "", "supplementary_material": "", "author": "Lukas Tuggener;Thilo Stadelmann;J\u00fcrgen Schmidhuber", "authorids": "~Lukas_Tuggener1;~Thilo_Stadelmann1;~J\u00fcrgen_Schmidhuber1", "gender": "M;M;M", "homepage": ";http://stdm.github.io;http://people.idsia.ch/~juergen/", "dblp": "218/5412;66/5566;s/JurgenSchmidhuber", "google_scholar": "26wU93QAAAAJ;https://scholar.google.ch/citations?user=6U6ZXzUAAAAJ;https://scholar.google.ch/citations?user=gLnCTgIAAAAJ", "orcid": ";0000-0002-3784-0420;", "linkedin": "lukas-tuggener-580485110/;thilo-stadelmann/;", "or_profile": "~Lukas_Tuggener1;~Thilo_Stadelmann1;~J\u00fcrgen_Schmidhuber1", "aff": "Universit\u00e0 della Svizzera Italiana;Zurich University of Applied Sciences;IDSIA", "aff_domain": "usi.ch;zhaw.ch;idsia.ch", "position": "PhD student;Full Professor;Scientific Director", "bibtex": "@misc{\ntuggener2022imagenet,\ntitle={ImageNet as a Representative Basis for Deriving Generally Effective {CNN} Architectures},\nauthor={Lukas Tuggener and Thilo Stadelmann and J{\\\"u}rgen Schmidhuber},\nyear={2022},\nurl={https://openreview.net/forum?id=dZTJQdXh3Gw}\n}", "github": "", "project": "", "reviewers": "xspT;ZP1L;QJZ1;cuGK", "site": "https://openreview.net/forum?id=dZTJQdXh3Gw", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;4;5", "correctness": "3;3;4;4", "technical_novelty": "3;2;1;2", "empirical_novelty": "2;3;0;2", "wc_summary_paper": "84;45;73;75", "wc_summary_review": "43;42;6;33", "wc_main_review": "161;445;42;167", "wc_review": "288;532;121;275", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 69.25, 14.600941750448838 ], "wc_summary_review_avg": [ 31.0, 14.949916387726054 ], "wc_main_review_avg": [ 203.75, 147.93812050989428 ], "wc_review_avg": [ 304.0, 147.11390145054273 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4825483202354164397&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e0 della Svizzera italiana;Zurich University of Applied Sciences;Institute of Digital Technologies", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usi.ch;https://www.zhawk.ch;https://www.idsia.ch", "aff_unique_abbr": "USI;ZHAW;IDSIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "id": "dZ_4XPnNl56", "title": "Training Deep Spiking Neural Networks with Bio-plausible Learning Rules", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "There exists a marked cleavage between the biological plausible approaches and the practical backpropagation-based approaches on how to train a deep spiking neural network (DSNN) with better performance. 
The well-known bio-plausible learning rule Spike-Timing-Dependent-Plasticity (STDP) cannot explain how the brain adjusts synaptic weights to encode information through accurate spike timing, while the widely applied backpropagation (BP) algorithms lack a biologically credible mechanism. In this work, we wish to answer the question of whether it is possible to train a DSNN using bio-plausible learning rules only and reach accuracy comparable to that achieved by BP-based learning rules. We observed that the STDP learning rule, calculated between the membrane potential waveform in the apical dendrite of a pyramidal cell and its input spike train with the help of local recurrent connections synthesized by somatostatin (SOM) interneurons, is able to perform supervised learning. This architecture is also supported by recent observations of the brain's cortical microcircuits. This new view of how spiking neurons may accurately adjust their complex temporal dynamics with the help of special local feedback connections bridges the performance gap between the bio-plausible approaches and the BP-based approaches and provides a possible answer to how our brain learns. We verify our observation with a simplified spiking neuron model and two different cell types on several datasets and further provide theoretical proof of the equivalence between STDP and BP under a special circumstance.", "keywords": "spiking neural network;bio-plausible;deep learning;STDP", "primary_area": "", "supplementary_material": "/attachment/bcf98e47fa60b9474fdb3792c03c9a94fd469ce1.zip", "author": "Yukun Yang;Peng Li", "authorids": "~Yukun_Yang1;~Peng_Li8", "gender": "M;M", "homepage": ";https://www.ece.ucsb.edu/~lip/", "dblp": "234/4164;83/6353-1.html", "google_scholar": "O-r0dBoAAAAJ;QYQUS7gAAAAJ", "orcid": ";0000-0003-3548-4589", "linkedin": "yangyukun/;peng-li-ucsb/", "or_profile": "~Yukun_Yang1;~Peng_Li8", "aff": "UC Santa Barbara;UC Santa Barbara", "aff_domain": "ucsb.edu;ucsb.edu", "position": "PhD student;Professor", "bibtex": "@misc{\nyang2022training,\ntitle={Training Deep Spiking Neural Networks with Bio-plausible Learning Rules},\nauthor={Yukun Yang and Peng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=dZ_4XPnNl56}\n}", "github": "", "project": "", "reviewers": "ShSQ;L6RU;m3T6;Fbu3", "site": "https://openreview.net/forum?id=dZ_4XPnNl56", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "75;37;106;69", "wc_summary_review": "119;139;57;67", "wc_main_review": "878;198;338;283", "wc_review": "1072;374;501;419", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.75, 24.488517717493643 ], "wc_summary_review_avg": [ 95.5, 34.42019755899143 ], "wc_main_review_avg": [ 424.25, 266.6780596524581 ], "wc_review_avg": [ 591.5, 281.1285293242221 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 0, 
"gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KcZEJM4gft0J:scholar.google.com/&scioq=Training+Deep+Spiking+Neural+Networks+with+Bio-plausible+Learning+Rules&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "DriPP: Driven Point Processes to Model Stimuli Induced Patterns in M/EEG Signals", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6724", "id": "d_2lcDh0Y9c", "poster": "", "openreview": "https://openreview.net/forum?id=d_2lcDh0Y9c", "slides": "https://iclr.cc/virtual/2022/poster/6724", "video": "https://iclr.cc/virtual/2022/poster/6724", "author_site": "C\u00e9dric Allain, Alexandre Gramfort, Thomas Moreau", "tldr": "", "abstract": "The quantitative analysis of non-invasive electrophysiology signals from electroencephalography (EEG) and magnetoencephalography (MEG) boils down to the identification of temporal patterns such as evoked responses, transient bursts of neural oscillations but also blinks or heartbeats for data cleaning. Several works have shown that these patterns can be extracted efficiently in an unsupervised way, e.g., using Convolutional Dictionary Learning. This leads to an event-based description of the data. Given these events, a natural question is to estimate how their occurrences are modulated by certain cognitive tasks and experimental manipulations. To address it, we propose a point process approach. While point processes have been used in neuroscience in the past, in particular for single cell recordings (spike trains), techniques such as Convolutional Dictionary Learning make them amenable to human studies based on EEG/MEG signals. We develop a novel statistical point process model \u2013 called driven temporal point processes (DriPP) \u2013 where the intensity function of the point process model is linked to a set of point processes corresponding to stimulation events. We derive a fast and principled expectation-maximization algorithm to estimate the parameters of this model. Simulations reveal that model parameters can be identified from long enough signals. 
Results on standard MEG datasets demonstrate that our methodology reveals event-related neural responses \u2013 both evoked and induced \u2013 and isolates non-task specific temporal patterns.", "keywords": "Electrophysiology;Neuroscience;Temporal point processes;Convolutional Dictionary Learning", "primary_area": "", "supplementary_material": "/attachment/d706a81153f8ccfeb8e6dacc044d32a23b0fc3d6.zip", "author": "C\u00e9dric Allain;Alexandre Gramfort;Thomas Moreau", "authorids": "~C\u00e9dric_Allain1;~Alexandre_Gramfort1;~Thomas_Moreau2", "gender": "M;M;M", "homepage": ";http://alexandre.gramfort.net;https://tommoral.github.io", "dblp": ";15/7980;150/2391-1", "google_scholar": ";fhxshS0AAAAJ;https://scholar.google.fr/citations?user=HEO_PsAAAAAJ", "orcid": ";0000-0001-9791-4404;0000-0002-1523-3419", "linkedin": "c\u00e9dricallain/;alexandregramfort/;thomasmoreau2010", "or_profile": "~C\u00e9dric_Allain1;~Alexandre_Gramfort1;~Thomas_Martin_Moreau1", "aff": "INRIA;INRIA;INRIA", "aff_domain": "inria.fr;inria.fr;inria.fr", "position": "PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\nallain2022dripp,\ntitle={Dri{PP}: Driven Point Processes to Model Stimuli Induced Patterns in M/{EEG} Signals},\nauthor={C{\\'e}dric Allain and Alexandre Gramfort and Thomas Moreau},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=d_2lcDh0Y9c}\n}", "github": "", "project": "", "reviewers": "87RT;RDNE;ZvAC;vbHx", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "3;4;4;4", "correctness": "2;3;4;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "60;100;104;63", "wc_summary_review": "29;131;46;164", "wc_main_review": "220;535;123;481", "wc_review": "309;766;273;708", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.75, 20.327014045353536 ], "wc_summary_review_avg": [ 92.5, 56.54423047491229 ], "wc_main_review_avg": [ 339.75, 172.76772702099197 ], "wc_review_avg": [ 514.0, 224.30225143765276 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9169493006161777, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6674586941995450963&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=d_2lcDh0Y9c", "email": "inria.fr;inria.fr;inria.fr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "id": "daYoG2O4TtU", "title": "Adaptive Speech Duration Modification using a Deep-Generative Framework", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose the first method to adaptively modify the duration of a given speech signal. 
Our approach uses a Bayesian framework to define a latent attention map that links frames of the input and target utterances. We train a masked convolutional encoder-decoder network to generate this attention map via a stochastic version of the mean absolute error loss function. Our model also predicts the length of the target speech signal using the encoder embeddings, which determines the number of time steps for the decoding operation. During testing, we generate the attention map as a proxy for the similarity matrix between the given input speech and an unknown target speech signal. Using this similarity matrix, we compute a warping path of alignment between the two signals. Our experiments demonstrate that this adaptive framework produces similar results to dynamic time warping, which relies on a known target signal, on both voice conversion and emotion conversion tasks. We also show that the modified speech utterances achieve high user quality ratings, thus highlighting the practical utility of our method. ", "keywords": "Prosody;Encoder-Decoder;Attention;Adaptive Duration Modification;Dynamic Time Warping", "primary_area": "", "supplementary_material": "", "author": "Ravi Shankar;Archana Venkataraman", "authorids": "~Ravi_Shankar2;~Archana_Venkataraman1", "gender": "M;F", "homepage": "https://ravi-0841.github.io;https://engineering.jhu.edu/nsa/", "dblp": "61/4380;79/7823", "google_scholar": "https://scholar.google.com/citations?hl=en;dDtlmCAAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Ravi_Shankar2;~Archana_Venkataraman1", "aff": "Johns Hopkins University;Johns Hopkins University", "aff_domain": "jhu.edu;jhu.edu", "position": "PhD student;John C. Malone Assistant Professor", "bibtex": "@misc{\nshankar2022adaptive,\ntitle={Adaptive Speech Duration Modification using a Deep-Generative Framework},\nauthor={Ravi Shankar and Archana Venkataraman},\nyear={2022},\nurl={https://openreview.net/forum?id=daYoG2O4TtU}\n}", "github": "", "project": "", "reviewers": "Lbse;rjMF;uzK2;EmGa", "site": "https://openreview.net/forum?id=daYoG2O4TtU", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "4;4;2;4", "correctness": "2;2;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "52;74;55;117", "wc_summary_review": "16;87;105;78", "wc_main_review": "786;277;83;629", "wc_review": "854;438;243;824", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 74.5, 25.947061490658243 ], "wc_summary_review_avg": [ 71.5, 33.48507130050644 ], "wc_main_review_avg": [ 443.75, 278.1181178923804 ], "wc_review_avg": [ 589.75, 258.8265587222455 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": 0.37998029782867415, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:S0adnqYlbpwJ:scholar.google.com/&scioq=Adaptive+Speech+Duration+Modification+using+a+Deep-Generative+Framework&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Johns Hopkins University", 
"aff_unique_dep": "", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "demdsohU_e", "title": "Neural Capacitance: A New Perspective of Neural Network Selection via Edge Dynamics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Efficient model selection for identifying a suitable pre-trained neural network to a downstream task is a fundamental yet challenging task in deep learning. Current practice requires expensive computational costs in model training for performance prediction. In this paper, we propose a novel framework for neural network selection by analyzing the governing dynamics over synaptic connections (edges) during training. Our framework is built on the fact that back-propagation during neural network training is equivalent to the dynamical evolution of synaptic connections. Therefore, a converged neural network is associated with an equilibrium state of a networked system composed of those edges. To this end, we construct a network mapping $\\phi$, converting a neural network $G_A$ to a directed line graph $G_B$ that is defined on those edges in $G_A$. Next, we derive a \\textit{neural capacitance} metric $\\beta_{\\rm eff}$ as a predictive measure universally capturing the generalization capability of $G_A$ on the downstream task using only a handful of early training results. We carried out extensive experiments using 17 popular pre-trained ImageNet models and five benchmark datasets, including CIFAR10, CIFAR100, SVHN, Fashion MNIST and Birds, to evaluate the fine-tuning performance of our framework. Our neural capacitance metric is shown to be a powerful indicator for model selection based only on early training results and is more efficient than state-of-the-art methods.", "keywords": "neural network selection;transfer learning;dynamical system;edge dynamics;network science", "primary_area": "", "supplementary_material": "", "author": "Chunheng Jiang;Tejaswini Pedapati;Pin-Yu Chen;Yizhou Sun;Jianxi Gao", "authorids": "~Chunheng_Jiang1;~Tejaswini_Pedapati1;~Pin-Yu_Chen1;~Yizhou_Sun1;~Jianxi_Gao1", "gender": "M;F;M;F;M", "homepage": "https://www.horsehour.com/;;http://www.pinyuchen.com;http://web.cs.ucla.edu/~yzsun/;https://www.gaojianxi.com", "dblp": "168/1904;203/8811;39/8969;37/3868;72/7131", "google_scholar": "https://scholar.google.com/citations?hl=en;nwSF2RkAAAAJ;jxwlCUUAAAAJ;https://scholar.google.com.tw/citations?user=TQgOjK0AAAAJ;qMcWVWgAAAAJ", "orcid": ";;0000-0003-1039-8369;;0000-0002-3952-208X", "linkedin": ";;pin-yu-chen-940062a2;;jianxi-gao-48a59a24/", "or_profile": "~Chunheng_Jiang1;~Tejaswini_Pedapati1;~Pin-Yu_Chen1;~Yizhou_Sun1;~Jianxi_Gao1", "aff": "Rensselaer Polytechnic Institute;International Business Machines;International Business Machines;University of California, Los Angeles;", "aff_domain": "rpi.edu;ibm.com;ibm.com;ucla.edu;", "position": "PhD student;Research Engineer;Research Staff Member;Associate Professor;", "bibtex": "@misc{\njiang2022neural,\ntitle={Neural Capacitance: A New Perspective of Neural Network Selection via Edge Dynamics},\nauthor={Chunheng Jiang and Tejaswini Pedapati and Pin-Yu Chen and Yizhou Sun and Jianxi Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=demdsohU_e}\n}", "github": "", "project": "", "reviewers": "acZh;A1LA;EL9b", "site": "https://openreview.net/forum?id=demdsohU_e", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;2;2", 
"correctness": "2;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "0;2;3", "wc_summary_paper": "39;69;69", "wc_summary_review": "56;34;1", "wc_main_review": "515;172;96", "wc_review": "610;275;166", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "2162;401;213", "reply_reviewers": "0;0;0", "reply_authors": "4;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 59.0, 14.142135623730951 ], "wc_summary_review_avg": [ 30.333333333333332, 22.60285134421958 ], "wc_main_review_avg": [ 261.0, 182.26537429436965 ], "wc_review_avg": [ 350.3333333333333, 188.92738169877745 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 925.3333333333334, 877.8171158555117 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15715242313155804699&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Rensselaer Polytechnic Institute;International Business Machines Corporation;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rpi.edu;https://www.ibm.com;https://www.ucla.edu", "aff_unique_abbr": "RPI;IBM;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "One After Another: Learning Incremental Skills for a Changing World", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6026", "id": "dg79moSRqIo", "poster": "", "openreview": "https://openreview.net/forum?id=dg79moSRqIo", "slides": "https://iclr.cc/virtual/2022/poster/6026", "video": "https://iclr.cc/virtual/2022/poster/6026", "author_site": "Nur Muhammad Shafiullah, Lerrel Pinto", "tldr": "", "abstract": "Reward-free, unsupervised discovery of skills is an attractive alternative to the bottleneck of hand-designing rewards in environments where task supervision is scarce or expensive. However, current skill pre-training methods, like many RL techniques, make a fundamental assumption -- stationary environments during training. Traditional methods learn all their skills simultaneously, which makes it difficult for them to both quickly adapt to changes in the environment, and to not forget earlier skills after such adaptation. On the other hand, in an evolving or expanding environment, skill learning must be able to adapt fast to new environment situations while not forgetting previously learned skills. These two conditions make it difficult for classic skill discovery to do well in an evolving environment. In this work, we propose a new framework for skill discovery, where skills are learned one after another in an incremental fashion. This framework allows newly learned skills to adapt to new environment or agent dynamics, while the fixed old skills ensure the agent doesn't forget a learned skill. 
We demonstrate experimentally that in both evolving and static environments, incremental skills significantly outperform current state-of-the-art skill discovery methods on both skill quality and the ability to solve downstream tasks. Videos for learned skills and code are made public on https://notmahi.github.io/disk\n", "keywords": "Skill discovery;Incremental reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/b7f8cc7442d04952ed7c8b79e67dac7dd4801ce9.zip", "author": "Nur Muhammad Mahi Shafiullah;Lerrel Pinto", "authorids": "~Nur_Muhammad_Mahi_Shafiullah1;~Lerrel_Pinto1", "gender": "M;M", "homepage": "https://www.lerrelpinto.com/;https://mahis.life", "dblp": "168/8304;308/1737", "google_scholar": "pmVPj94AAAAJ;vAOw6aQAAAAJ", "orcid": ";0000-0003-3617-1293", "linkedin": ";", "or_profile": "~Lerrel_Pinto1;~Nur_Muhammad_Shafiullah1", "aff": "New York University;New York University", "aff_domain": "cs.nyu.edu;nyu.edu", "position": "Assistant Professor;PhD student", "bibtex": "@inproceedings{\nshafiullah2022one,\ntitle={One After Another: Learning Incremental Skills for a Changing World},\nauthor={Nur Muhammad Mahi Shafiullah and Lerrel Pinto},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dg79moSRqIo}\n}", "github": "", "project": "", "reviewers": "bA27;HdqY;eZE6;N9w1", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "5;3;4;4", "correctness": "4;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "53;62;70;70", "wc_summary_review": "89;58;60;67", "wc_main_review": "538;338;981;1234", "wc_review": "680;458;1111;1371", "wc_reply_reviewers": "353;24;651;442", "wc_reply_authors": "830;509;1930;2107", "reply_reviewers": "2;1;2;1", "reply_authors": "2;2;4;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.75, 7.013380069552769 ], "wc_summary_review_avg": [ 68.5, 12.298373876248844 ], "wc_main_review_avg": [ 772.75, 353.63637751226895 ], "wc_review_avg": [ 905.0, 357.08052313168804 ], "wc_reply_reviewers_avg": [ 367.5, 225.90097388014954 ], "wc_reply_authors_avg": [ 1344.0, 686.8380449567424 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 3.0, 1.0 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7328413134619288217&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=dg79moSRqIo", "email": "cs.nyu.edu;nyu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learnability of convolutional neural networks for infinite dimensional input via mixed and anisotropic smoothness", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6811", "id": "dgxFTxuJ50e", "poster": "", "openreview": "https://openreview.net/forum?id=dgxFTxuJ50e", "slides": "https://iclr.cc/virtual/2022/poster/6811", "video": "https://iclr.cc/virtual/2022/poster/6811", "author_site": 
"Sho Okumoto, Taiji Suzuki", "tldr": "", "abstract": "Among a wide range of success of deep learning, convolutional neural networks have been extensively utilized in several tasks such as speech recognition, image processing, and natural language processing, which require inputs with large dimensions.\nSeveral studies have investigated function estimation capability of deep learning, but most of them have assumed that the dimensionality of the input is much smaller than the sample size. \nHowever, for typical data in applications such as those handled by the convolutional neural networks described above, \nthe dimensionality of inputs is relatively high or even infinite. \nIn this paper, we investigate the approximation and estimation errors of the (dilated) convolutional neural networks when the input is infinite dimensional. \nAlthough the approximation and estimation errors of neural networks are affected by the curse of dimensionality in the existing analyses for typical function spaces such as the \\Holder and Besov spaces, we show that, by considering anisotropic smoothness, they can alleviate exponential dependency on the dimensionality but they only depend on the smoothness of the target functions. \nOur theoretical analysis supports the great practical success of convolutional networks. \nFurthermore, we show that the dilated convolution is advantageous when the smoothness of the target function has a sparse structure.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sho Okumoto;Taiji Suzuki", "authorids": "lebesgue0118@gmail.com;~Taiji_Suzuki1", "gender": ";M", "homepage": ";http://ibis.t.u-tokyo.ac.jp/suzuki/", "dblp": ";08/312", "google_scholar": ";x8osrBsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "lebesgue0118@gmail.com;~Taiji_Suzuki1", "aff": ";The University of Tokyo", "aff_domain": ";tokyo.ac.jp", "position": ";Associate Professor", "bibtex": "@inproceedings{\nokumoto2022learnability,\ntitle={Learnability of convolutional neural networks for infinite dimensional input via mixed and anisotropic smoothness},\nauthor={Sho Okumoto and Taiji Suzuki},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dgxFTxuJ50e}\n}", "github": "", "project": "", "reviewers": "okX4;BeQE;iiDe;DfK7", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;5;3;3", "correctness": "4;4;4;4", "technical_novelty": "3;4;4;4", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "85;57;168;49", "wc_summary_review": "49;28;17;21", "wc_main_review": "381;217;329;124", "wc_review": "515;302;514;194", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "435;440;658;247", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 89.75, 47.11355961928582 ], "wc_summary_review_avg": [ 28.75, 12.336429791475327 ], "wc_main_review_avg": [ 262.75, 99.64530846959128 ], "wc_review_avg": [ 381.25, 138.61344631744785 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 445.0, 145.51460407807872 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 11, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4235198887343856169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=dgxFTxuJ50e", "email": ";tokyo.ac.jp", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "id": "dhLChxJwgMR", "title": "HFSP: A Hardware-friendly Soft Pruning Framework for Vision Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recently, Vision Transformer (ViT) has continuously established new milestones in the computer vision field, while the high computation and memory cost makes its propagation in industrial production difficult. Pruning, a traditional model compression paradigm for hardware efficiency, has been widely applied in various DNN structures. Nevertheless, it stays ambiguous on how to perform exclusive pruning on the ViT structure. Considering three key points: the structural characteristics, the internal data pattern of ViT, and the related edge device deployment, we leverage the input token sparsity and propose a hardware-friendly soft pruning framework (HFSP), which can be set up on vanilla Transformers of both flatten and CNN-type structures, such as Pooling-based ViT (PiT). More concretely, we design a dynamic attention-based multi-head token selector, which is a lightweight module for adaptive instance-wise token selection. We further introduce a soft pruning technique to package the pruned tokens, which integrate the less informative tokens generated by the selector module into a package token, and participates in subsequent calculations rather than being discarded completely. From a hardware standpoint, our framework is bound to the tradeoff between accuracy and specific hardware constraints through our proposed hardware-oriented progressive training, and all the operators embedded in the framework have been well-supported. Experimental results demonstrate that the proposed framework significantly reduces the computational costs of ViTs while maintaining comparable performance on image classification. For example, our method reduces the FLOPs of DeiT-S by over 42.6% while only sacrificing 0.46% top-1 accuracy. Moreover, our framework can guarantee the identified model to meet resource specifications of mobile devices and FPGA, and even achieve the real-time execution of DeiT-T on mobile platforms. 
Code will be publicly released.", "keywords": "Vision Transformers;Hardware-friendly;Soft Token Pruning", "primary_area": "", "supplementary_material": "/attachment/921281869360aa6d2f1738c3ccaf41e751f07711.zip", "author": "Zhenglun Kong;Peiyan Dong;Xiaolong Ma;Xin Meng;Mengshu Sun;Wei Niu;Bin Ren;Minghai Qin;Hao Tang;Yanzhi Wang", "authorids": "~Zhenglun_Kong1;~Peiyan_Dong1;~Xiaolong_Ma2;~Xin_Meng1;~Mengshu_Sun1;~Wei_Niu3;~Bin_Ren1;~Minghai_Qin1;~Hao_Tang6;~Yanzhi_Wang3", "gender": "M;F;M;M;;M;M;M;M;M", "homepage": "https://sites.google.com/husky.neu.edu/zlk/home?authuser=1;https://peiyanflying.github.io/Peggy_Peiyan.github.io/;https://xiaolongma2016.com;https://www.linkedin.com/in/%E9%91%AB-%E5%AD%9F-b45849175/;;https://www.niuwei.info;http://www.cs.wm.edu/~bren/;https://sites.google.com/site/minghaiqin/home;https://ha0tang.github.io/;https://web.northeastern.edu/yanzhiwang/", "dblp": "211/6323;254/1329;;;193/2457;68/828-2.html;;;07/5751-5;", "google_scholar": "XYa4NVYAAAAJ;OGU3CVoAAAAJ;https://scholar.google.com/citations?hl=en;;JKUtxEgAAAAJ;w1RoaOMAAAAJ;9Uqwy4UAAAAJ;MSgWKbYAAAAJ;9zJkeEMAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-8120-4456;;0000-0003-3753-7648;0000-0003-2228-0587;0000-0003-3540-1464;;;;0000-0002-2077-1246;", "linkedin": "zhenglun-kong-35b527150/;;xiaolong-ma-66b98910b/;%E9%91%AB-%E5%AD%9F-b45849175/;;;;;hao-tang-887475138/;", "or_profile": "~Zhenglun_Kong1;~Peiyan_Dong1;~Xiaolong_Ma2;~Xin_Meng1;~Mengshu_Sun1;~Wei_Niu3;~Bin_Ren1;~Minghai_Qin1;~Hao_Tang6;~Yanzhi_Wang3", "aff": "Northeastern University;Northeastern University;Northeastern University;Didi Auto-Driving;Northeastern University;College of William and Mary;William & Mary;Western Digital Corporation;ETH Zurich;Northeastern University", "aff_domain": "northeastern.edu;northeastern.edu;northeastern.edu;didichuxing.com;northeastern.edu;wm.edu;cs.wm.edu;wdc.com;vision.ee.ethz.ch;northeastern.edu", "position": "PhD student;PhD student;PhD student;Researcher;PhD student;PhD student;Associate Professor;senior technologist;Postdoc;Associate Professor", "bibtex": "@misc{\nkong2022hfsp,\ntitle={{HFSP}: A Hardware-friendly Soft Pruning Framework for Vision Transformers},\nauthor={Zhenglun Kong and Peiyan Dong and Xiaolong Ma and Xin Meng and Mengshu Sun and Wei Niu and Bin Ren and Minghai Qin and Hao Tang and Yanzhi Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=dhLChxJwgMR}\n}", "github": "", "project": "", "reviewers": "rSCw;SP9h;gFBC", "site": "https://openreview.net/forum?id=dhLChxJwgMR", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;3;4", "correctness": "2;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "45;110;63", "wc_summary_review": "63;35;80", "wc_main_review": "617;399;342", "wc_review": "725;544;485", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 72.66666666666667, 27.402351886086144 ], "wc_summary_review_avg": [ 59.333333333333336, 18.55322673343433 ], "wc_main_review_avg": [ 452.6666666666667, 118.50832132058163 ], "wc_review_avg": [ 584.6666666666666, 102.11213879303914 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 
0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3452344025352633443&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0;2;3;4;5;0", "aff_unique_norm": "Northeastern University;Didi Chuxing;College of William and Mary;College of William & Mary;Western Digital Corporation;ETH Zurich", "aff_unique_dep": ";Auto-Driving;;;;", "aff_unique_url": "https://www.northeastern.edu;https://www.didichuxing.com/;https://www.wm.edu;https://www.wm.edu;https://www.westerndigital.com;https://www.ethz.ch", "aff_unique_abbr": "NEU;Didi;WM;WM;WDC;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0;0;2;0", "aff_country_unique": "United States;China;Switzerland" }, { "id": "di0r7vfKrq5", "title": "Boosting Search Engines with Interactive Agents", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper presents first successful steps in designing agents that learn meta-strategies for iterative query refinement. \nOur approach uses machine reading to guide the selection of refinement terms from aggregated search results.\n\nAgents are then empowered with simple but effective search operators to exert fine-grained and transparent control over queries and search results.\n\nWe develop a novel way of generating synthetic search sessions, which leverages the power of transformer-based language models through (self-)supervised learning. We also present a reinforcement learning agent with dynamically constrained actions that learns interactive search strategies from scratch. \n\nWe obtain retrieval and answer quality performance comparable to recent neural methods using a traditional term-based BM25 ranking function. 
We provide an in-depth analysis of the search policies.", "keywords": "query refinement;reinforcement learning;self-supervised learning;question answering;search engines;large language models", "primary_area": "", "supplementary_material": "/attachment/06fe1492ca3dc143b18f5ef2e3c3b5dabadb2d0b.zip", "author": "Leonard Adolphs;Benjamin B\u00f6rschinger;Christian Buck;Michelle Chen Huebscher;Massimiliano Ciaramita;Lasse Espeholt;Thomas Hofmann;Yannic Kilcher;Sascha Rothe;Pier Giuseppe Sessa;Lierni Sestorain", "authorids": "~Leonard_Adolphs3;~Benjamin_B\u00f6rschinger1;~Christian_Buck1;~Michelle_Chen_Huebscher1;~Massimiliano_Ciaramita2;~Lasse_Espeholt1;~Thomas_Hofmann1;~Yannic_Kilcher1;~Sascha_Rothe1;~Pier_Giuseppe_Sessa1;~Lierni_Sestorain1", "gender": ";;M;F;;M;M;M;M;;F", "homepage": ";;;https://arxiv.org/search/cs?searchtype=author&query=Huebscher%2C+M+C;;;http://www.da.inf.ethz.ch/;;;;", "dblp": ";71/10489;;;31/916;164/5668;h/ThHofmann;https://dblp.org/pers/k/Kilcher:Yannic.html;148/9544;;", "google_scholar": ";;DSb_wQ8AAAAJ;;;TxLjpCYAAAAJ;T3hAyLkAAAAJ;;https://scholar.google.de/citations?user=Vu6r1BEAAAAJ;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;thomas-hofmann-1ab2402/;;sascha-rothe-53b7b066/;;", "or_profile": "~Leonard_Adolphs3;~Benjamin_B\u00f6rschinger1;~Christian_Buck1;~Michelle_Chen_Huebscher1;~Massimiliano_Ciaramita2;~Lasse_Espeholt1;~Thomas_Hofmann1;~Yannic_Kilcher1;~Sascha_Rothe1;~Pier_Giuseppe_Sessa1;~Lierni_Sestorain1", "aff": ";;Google;;Google;Google;Swiss Federal Institute of Technology;DeepJudge;Google;;", "aff_domain": ";;google.com;;google.com;google.com;ethz.ch;deepjudge.ai;google.com;;", "position": ";;Researcher;;Research Scientist;Research Engineer;Full Professor;CTO;Researcher;;", "bibtex": "@misc{\nadolphs2022boosting,\ntitle={Boosting Search Engines with Interactive Agents},\nauthor={Leonard Adolphs and Benjamin B{\\\"o}rschinger and Christian Buck and Michelle Chen Huebscher and Massimiliano Ciaramita and Lasse Espeholt and Thomas Hofmann and Yannic Kilcher and Sascha Rothe and Pier Giuseppe Sessa and Lierni Sestorain},\nyear={2022},\nurl={https://openreview.net/forum?id=di0r7vfKrq5}\n}", "github": "", "project": "", "reviewers": "13nH;NfBM;NpVB", "site": "https://openreview.net/forum?id=di0r7vfKrq5", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;3", "correctness": "3;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;0", "wc_summary_paper": "118;91;59", "wc_summary_review": "73;89;52", "wc_main_review": "530;459;94", "wc_review": "721;639;205", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "542;796;113", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 89.33333333333333, 24.115462996914562 ], "wc_summary_review_avg": [ 71.33333333333333, 15.15109090315135 ], "wc_main_review_avg": [ 361.0, 191.00959836266517 ], "wc_review_avg": [ 521.6666666666666, 226.40573216143525 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 483.6666666666667, 281.8679753990431 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5, "gs_citation": 26, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=9748266028388209494&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Google;Swiss Federal Institute of Technology;DeepJudge", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.ethz.ch;", "aff_unique_abbr": "Google;ETH Zurich;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Switzerland;" }, { "id": "djZBr4Z7jcz", "title": "On the regularization landscape for the linear recommendation models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, a wide range of recommendation algorithms inspired by deep learning techniques have emerged as the performance leaders several standard recommendation benchmarks. While these algorithms were built on different DL techniques (e.g., dropouts, autoencoder), they have similar performance and even similar cost functions. This paper studies whether the models' comparable performance are sheer coincidence, or they can be unified into a single framework. We find that all linear performance leaders effectively add only a nuclear-norm based regularizer, or a Frobenius-norm based regularizer. The former ones possess a (surprisnig) rigid structure that limits the models' predictive power but their solutions are low rank and have closed form. The latter ones are more expressive and more efficient for recommendation but their solutions are either full-rank or require executing hard-to-tune numeric procedures such as ADMM. Along this line of finding, we further propose two low-rank, closed-form solutions, derived from carefully generalizing Frobenius-norm based regularizers. The new solutions get the best of both nuclear-norm and Frobenius-norm world. 
", "keywords": "recommendation system;regularization;linear model", "primary_area": "", "supplementary_material": "/attachment/5aa140df6b7ca6a24a37e15ca0fc4a0478bea3f3.zip", "author": "Dong Li;Zhenming Liu;Ruoming Jin;Zhi Liu;Jing Gao;Bin Ren", "authorids": "~Dong_Li9;~Zhenming_Liu1;~Ruoming_Jin1;~Zhi_Liu5;~Jing_Gao3;~Bin_Ren1", "gender": "M;M;M;M;;M", "homepage": ";http://www.wm.edu/as/computerscience/faculty/liu_zhenming.php/;http://www.cs.kent.edu/~jin/;;;http://www.cs.wm.edu/~bren/", "dblp": ";51/2717;72/4662;;;", "google_scholar": "8jcTxP8AAAAJ;https://scholar.google.com.tw/citations?user=ozfkg2sAAAAJ;;hpzV9SEAAAAJ;;9Uqwy4UAAAAJ", "orcid": ";;;0000-0002-5248-4807;;", "linkedin": ";;;zhi-liu-6731666a/;jing-gao-dding/;", "or_profile": "~Dong_Li9;~Zhenming_Liu1;~Ruoming_Jin1;~Zhi_Liu5;~Jing_Gao3;~Bin_Ren1", "aff": "Kent State University;College of William and Mary;Kent State University;iLambda Inc.;iLambda Inc.;William & Mary", "aff_domain": "kent.edu;wm.edu;kent.edu;ilambda.ai;ilambda.com;cs.wm.edu", "position": "PhD student;Assistant Professor;Professor;Researcher;Senior Advisory Scientist;Associate Professor", "bibtex": "@misc{\nli2022on,\ntitle={On the regularization landscape for the linear recommendation models},\nauthor={Dong Li and Zhenming Liu and Ruoming Jin and Zhi Liu and Jing Gao and Bin Ren},\nyear={2022},\nurl={https://openreview.net/forum?id=djZBr4Z7jcz}\n}", "github": "", "project": "", "reviewers": "Uxog;bVYo;nwQT;7nvQ", "site": "https://openreview.net/forum?id=djZBr4Z7jcz", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "55;34;195;99", "wc_summary_review": "18;35;190;106", "wc_main_review": "212;237;828;1202", "wc_review": "285;306;1213;1407", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1223;901;1145;1987", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;3", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 95.75, 61.91677882448343 ], "wc_summary_review_avg": [ 87.25, 67.88731472079301 ], "wc_main_review_avg": [ 619.75, 416.8755059966944 ], "wc_review_avg": [ 802.75, 511.92009874588825 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1314.0, 406.3065345278119 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11938885342784099622&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;2;3", "aff_unique_norm": "Kent State University;College of William and Mary;iLambda Inc.;College of William & Mary", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.kent.edu;https://www.wm.edu;;https://www.wm.edu", "aff_unique_abbr": "KSU;WM;;WM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "djhu4DIZZHR", "title": "NAIL: A Challenging Benchmark for Na\\\"ive Logical Reasoning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Logical reasoning over natural text is an important capability towards human 
level intelligence.\nExisting datasets are either limited and inadequate to train and evaluate logical reasoning capability (e.g., LogiQA and ReClor),\nor not oriented for logical reasoning (e.g., SQuAD and HotpotQA).\nIn this paper, we focus on a specific category of logical reasoning, named \\emph{\\mytask}, and propose a new large-scale benchmark, named \\mydata, targeted for learning and evaluating models' capabilities towards \\mytask.\n \\mydata is sourced from standardized exams such as the Chinese National Civil Servants Examination and the Law School Admission Test.\nFurthermore, to collect more data, we propose to imitate the example of standardized exams rather than designing them from scratch.\n\\mydata is available in both Chinese and English, containing a total of $10,296 * 2$ instances.\nEmpirical results show that current state-of-the-art neural models struggle on \\mydata with very poor accuracy (the best result is 30.10\\% for \\mydata and 36.15\\% for Chinese \\mydata), while human experts achieve nearly 100\\% accuracy.\nFurther results indicate that human imitations can significantly help models learn logic from natural text.", "keywords": "Logical Reasoning;Benchmark", "primary_area": "", "supplementary_material": "", "author": "Xinbo Zhang;Changzhi Sun;Yue Zhang;Lei Li;Hao Zhou", "authorids": "~Xinbo_Zhang1;~Changzhi_Sun1;~Yue_Zhang7;~Lei_Li11;~Hao_Zhou5", "gender": "F;M;M;M;M", "homepage": ";http://www.czsun.site/;http://frcchang.github.io;https://www.cs.cmu.edu/~leili;https://zhouh.github.io/", "dblp": ";44/1920;47/722-4;13/7007-5.html;63/778-12", "google_scholar": ";raxcrcIAAAAJ;;BYXqAlwAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;0000-0002-5214-2268;0000-0003-3095-9776;", "linkedin": ";;;;", "or_profile": "~Xinbo_Zhang1;~Changzhi_Sun1;~Yue_Zhang7;~Lei_Li11;~Hao_Zhou5", "aff": "ByteDance;ByteDance;Westlake University;Computer Science Department, UC Santa Barbara;Bytedance", "aff_domain": "bytedance.com;bytedance.com;westlake.edu.cn;cs.ucsb.edu;bytedance.com", "position": "Researcher;Researcher;Associate Professor;Assistant Professor;Researcher", "bibtex": "@misc{\nzhang2022nail,\ntitle={{NAIL}: A Challenging Benchmark for Na{\\textbackslash}''ive Logical Reasoning},\nauthor={Xinbo Zhang and Changzhi Sun and Yue Zhang and Lei Li and Hao Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=djhu4DIZZHR}\n}", "github": "", "project": "", "reviewers": "xN3f;QPXV;kWbB;VxU4", "site": "https://openreview.net/forum?id=djhu4DIZZHR", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "1;3;3;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "49;186;104;209", "wc_summary_review": "141;39;110;30", "wc_main_review": "171;468;549;103", "wc_review": "361;693;763;342", "wc_reply_reviewers": "0;219;37;0", "wc_reply_authors": "752;1666;1059;359", "reply_reviewers": "0;1;1;0", "reply_authors": "2;3;2;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 137.0, 64.0663718342158 ], "wc_summary_review_avg": [ 80.0, 46.90948731333567 ], "wc_main_review_avg": [ 322.75, 189.47608688169598 ], "wc_review_avg": [ 539.75, 189.98865097684126 ], "wc_reply_reviewers_avg": [ 64.0, 90.75516514226615 ], "wc_reply_authors_avg": [ 959.0, 477.6761455212098 ], "reply_reviewers_avg": [ 
0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.39223227027636803, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1401073220598747804&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "ByteDance;Westlake University;University of California, Santa Barbara", "aff_unique_dep": ";;Computer Science Department", "aff_unique_url": "https://www.bytedance.com;https://www.westlake.edu.cn;https://www.ucsb.edu", "aff_unique_abbr": "ByteDance;WU;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "djwnKXz1B2", "title": "EP-GAN: Unsupervised Federated Learning with Expectation-Propagation Prior GAN", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative Adversarial Networks (GANs) are overwhelming in unsupervised learning tasks due to their expressive power in modeling fine-grained data distributions. However, it is challenging for GANs to model distributions of separate non-i.i.d. data partitions as it usually adopts an over-general prior, limiting its capability in capturing the latent structure of multiple data partitions and thus leading to mode collapse. In this paper, we present a new Bayesian GAN, dubbed expectation propagation prior GAN (EP-GAN), which addresses the above challenge of modeling non-i.i.d. federated data through imposing a partition-invariant prior distribution on a Bayesian GAN. Furthermore, unlike most existing algorithms for deep-learning-based EP inference that require numerical quadrature, here we propose a closed-form solution for each update step of EP, leading to a more efficient solution for federated data modeling. Experiments on both synthetic extremely non-i.i.d. image data partitions and realistic non-i.i.d. speech recognition tasks demonstrate that our framework effectively alleviates the performance deterioration caused by non-i.i.d. data. 
", "keywords": "Bayesian Deep learning;Expectation Propagation;Unsupervised Learning;Acoustic Modeling", "primary_area": "", "supplementary_material": "/attachment/5079acf344429e6384a9fd75d244fcb3e05bed58.zip", "author": "Xueyang Wu;Hengguan Huang;Hao Wang;Ye Wang;Qian Xu", "authorids": "~Xueyang_Wu1;~Hengguan_Huang1;~Hao_Wang3;~Ye_Wang3;~Qian_Xu1", "gender": ";M;M;F;M", "homepage": "http://www.cse.ust.hk/~xwuba;;https://smcnus.comp.nus.edu.sg/;;http://www.wanghao.in", "dblp": "https://dblp.uni-trier.de/pid/194/1291-1;166/6435;44/6292-7;81/5941;w/HaoWang-14", "google_scholar": "ZySbpIAAAAAJ;GQm1eZEAAAAJ;https://scholar.google.com.sg/citations?user=CdgLLL8AAAAJ;;NrOA9QoAAAAJ", "orcid": "0000-0001-5419-7273;;0000-0002-0123-1260;;", "linkedin": ";;;;", "or_profile": "~Xueyang_Wu1;~Hengguan_Huang1;~Ye_Wang3;~Qian_Xu1;~Hao_Wang4", "aff": "Department of Computer Science and Engineering, The Hong Kong University of Science and Technology;National University of Singapore;National University of Singapore;;Rutgers University", "aff_domain": "cse.ust.hk;nus.edu.sg;nus.edu.sg;;cs.rutgers.edu", "position": "PhD student;PhD student;Associate Professor;;Assistant Professor", "bibtex": "@misc{\nwu2022epgan,\ntitle={{EP}-{GAN}: Unsupervised Federated Learning with Expectation-Propagation Prior {GAN}},\nauthor={Xueyang Wu and Hengguan Huang and Hao Wang and Ye Wang and Qian Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=djwnKXz1B2}\n}", "github": "", "project": "", "reviewers": "cz2i;q41K;Nw7X;FkDz", "site": "https://openreview.net/forum?id=djwnKXz1B2", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "2;3;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "43;99;66;71", "wc_summary_review": "37;121;25;35", "wc_main_review": "212;241;305;286", "wc_review": "292;461;396;392", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "798;1215;885;462", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.75, 19.917015338649513 ], "wc_summary_review_avg": [ 54.5, 38.66199684444661 ], "wc_main_review_avg": [ 261.0, 36.61283927804562 ], "wc_review_avg": [ 385.25, 60.40436656401588 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 840.0, 268.00093283419744 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8660254037844386, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7579758770873398587&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Hong Kong University of Science and Technology;National University of Singapore;Rutgers University", "aff_unique_dep": "Department of Computer Science and Engineering;;", "aff_unique_url": "https://www.ust.hk;https://www.nus.edu.sg;https://www.rutgers.edu", "aff_unique_abbr": "HKUST;NUS;Rutgers", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "China;Singapore;United States" }, { "id": "dmq_-R2LhQk", "title": "The Manifold Hypothesis for Gradient-Based Explanations", "track": "main", "status": "Reject", 
"tldr": "", "abstract": "When are gradient-based explanations meaningful? We propose a necessary criterion: explanations need to be aligned with the tangent space of the data manifold. To test this hypothesis, we employ autoencoders to estimate and generate data manifolds. Across a range of different datasets -- MNIST, EMNIST, CIFAR10, X-ray pneumonia and Diabetic Retinopathy detection -- we demonstrate empirically that the more an explanation is aligned with the tangent space of the data, the more interpretable it tends to be. In particular, popular post-hoc explanation methods such as Integrated Gradients and SmoothGrad tend to align their results with the data manifold. The same is true for the outcome of adversarial training, which has been claimed to lead to more interpretable explanations. Empirically, alignment with the data manifold happens early during training, and to some degree even when training with random labels. However, we theoretically prove that good generalization of neural networks does not imply good or bad alignment of model gradients with the data manifold. This leads to a number of interesting follow-up questions regarding gradient-based explanations.\n\n", "keywords": "Interpretability;Explainability", "primary_area": "", "supplementary_material": "", "author": "Sebastian Bordt;Uddeshya Upadhyay;Zeynep Akata;Ulrike von Luxburg", "authorids": "~Sebastian_Bordt1;~Uddeshya_Upadhyay1;~Zeynep_Akata1;~Ulrike_von_Luxburg1", "gender": ";M;F;F", "homepage": "http://www.tml.cs.uni-tuebingen.de/team/bordt/index.php;https://udion.xyz;https://eml-unitue.de/people/zeynep-akata;", "dblp": "270/0462;;117/4838;06/1082", "google_scholar": "https://scholar.google.de/citations?user=6PnL3BgAAAAJ;Zgk0Z6kAAAAJ;jQl9RtkAAAAJ;mMifMdoAAAAJ", "orcid": ";;0000-0002-1432-7747;", "linkedin": ";;zeynep-akata-36182045/?ppe=1;", "or_profile": "~Sebastian_Bordt1;~Uddeshya_Upadhyay1;~Zeynep_Akata1;~Ulrike_von_Luxburg1", "aff": "Max Planck Institute for Intelligent Systems, Max-Planck Institute;International Max Plank Research School for Intelligent Systems;University of T\u00fcbingen;University of Tuebingen", "aff_domain": "tue.mpg.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;PhD student;Full Professor;Professor", "bibtex": "@misc{\nbordt2022the,\ntitle={The Manifold Hypothesis for Gradient-Based Explanations},\nauthor={Sebastian Bordt and Uddeshya Upadhyay and Zeynep Akata and Ulrike von Luxburg},\nyear={2022},\nurl={https://openreview.net/forum?id=dmq_-R2LhQk}\n}", "github": "", "project": "", "reviewers": "gHy3;oTn6;993V;AMGx", "site": "https://openreview.net/forum?id=dmq_-R2LhQk", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "5;5;4;3", "correctness": "2;3;1;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "59;80;51;78", "wc_summary_review": "67;61;36;66", "wc_main_review": "644;868;194;333", "wc_review": "770;1009;281;477", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 67.0, 12.349089035228468 ], "wc_summary_review_avg": [ 57.5, 12.619429464123963 ], "wc_main_review_avg": [ 509.75, 263.2986659669965 ], "wc_review_avg": [ 634.25, 277.6593011227969 ], "wc_reply_reviewers_avg": [ 
0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15039394346302237270&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of T\u00fcbingen;University of Tuebingen", "aff_unique_dep": "Intelligent Systems;;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "MPI-IS;Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "dn4B7Mes2z", "title": "The Low-Rank Simplicity Bias in Deep Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern deep neural networks are highly over-parameterized compared to the data on which they are trained, yet they often generalize remarkably well. A flurry of recent work has asked: why do deep networks not overfit to their training data? In this work, we make a series of empirical observations that investigate the hypothesis that deeper networks are inductively biased to find solutions with lower rank embeddings. We conjecture that this bias exists because the volume of functions that maps to low-rank embedding increases with depth. We show empirically that our claim holds true on finite width linear and non-linear models and show that these are the solutions that generalize well. We then show that the low-rank simplicity bias exists even after training, using a wide variety of commonly used optimizers. We found this phenomenon to be resilient to initialization, hyper-parameters, and learning methods. We further demonstrate how linear over-parameterization of deep non-linear models can be used to induce low-rank bias, improving generalization performance without changing the effective model capacity. 
Practically, we demonstrate that simply linearly over-parameterizing standard models at training time can improve performance on image classification tasks, including ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Minyoung Huh;Hossein Mobahi;Richard Zhang;Brian Cheung;Pulkit Agrawal;Phillip Isola", "authorids": "~Minyoung_Huh1;~Hossein_Mobahi2;~Richard_Zhang1;~Brian_Cheung1;~Pulkit_Agrawal1;~Phillip_Isola1", "gender": "M;M;M;M;M;M", "homepage": "https://people.csail.mit.edu/minhuh/;http://richzhang.github.io;https://briancheung.github.io/;https://people.eecs.berkeley.edu/~pulkitag/;http://web.mit.edu/phillipi/;http://people.csail.mit.edu/hmobahi/", "dblp": "220/3360;;;149/2672;36/9988;94/1490", "google_scholar": "2k18_1IAAAAJ;LW8ze_UAAAAJ;7N-ethYAAAAJ;UpZmJI0AAAAJ;ROILf3EAAAAJ;GSHmKZkAAAAJ", "orcid": ";;;;0000-0002-1411-6704;", "linkedin": ";;;;phillip-isola-a9955b20/;", "or_profile": "~Minyoung_Huh1;~Richard_Zhang1;~Brian_Cheung1;~Pulkit_Agrawal1;~Phillip_Isola1;~Hossein_Mobahi1", "aff": "Massachusetts Institute of Technology;Adobe Systems;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Google", "aff_domain": "mit.edu;adobe.com;mit.edu;mit.edu;mit.edu;google.com", "position": "PhD student;Research Scientist;Research Fellow;Assistant Professor;Assistant Professor;Research Scientist", "bibtex": "@misc{\nhuh2022the,\ntitle={The Low-Rank Simplicity Bias in Deep Networks},\nauthor={Minyoung Huh and Hossein Mobahi and Richard Zhang and Brian Cheung and Pulkit Agrawal and Phillip Isola},\nyear={2022},\nurl={https://openreview.net/forum?id=dn4B7Mes2z}\n}", "github": "", "project": "", "reviewers": "utsM;cVHG;Yhy3;Jihb", "site": "https://openreview.net/forum?id=dn4B7Mes2z", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;4", "correctness": "3;3;2;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "33;112;88;75", "wc_summary_review": "23;28;161;128", "wc_main_review": "462;142;898;459", "wc_review": "518;282;1147;662", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "817;357;1207;1191", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.0, 28.661821295933027 ], "wc_summary_review_avg": [ 85.0, 60.658882284460205 ], "wc_main_review_avg": [ 490.25, 268.93900330744145 ], "wc_review_avg": [ 652.25, 316.22015669466737 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 893.0, 346.58043799383717 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2065604695794124060&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "Massachusetts Institute of Technology;Adobe;Google", "aff_unique_dep": ";Adobe Systems Incorporated;Google", "aff_unique_url": "https://web.mit.edu;https://www.adobe.com;https://www.google.com", "aff_unique_abbr": "MIT;Adobe;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "doGDvfnHCEj", "title": "On the Expressiveness, Predictability and Interpretability of Neural Temporal Point Processes", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite the fast advance in neural temporal point processes (NTPP) which enjoys high model capacity, there are still some standing gaps to fill including model expressiveness, predictability, and interpretability, especially with the wide application of event sequence modeling. For expressiveness, we first show the incapacity of existing NTPP models for fitting time-varying especially non-terminating TPP, and propose a simple neural model for expressive intensity function modeling. To improve predictability which is not directly optimized by the TPP likelihood objective, we devise our new sampling techniques that enable error metric driven adaptive fine-tuning of the sampling hyperparameter for predictive TPP, based on the event history in training sequences. Moreover, we show how interval-based event prediction can be achieved by our prediction techniques. To achieve interpretable NTPP, we propose an influence definition from one event to the future by comparing the difference between the existence of the event and not, which enables the dependency learning among events and types. Experimental results on synthetic datasets and public benchmarks show the efficacy of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liangliang Shi;Fangyu Ding;Junchi Yan;Yanjie Duan;Guangjian Tian", "authorids": "~Liangliang_Shi1;~Fangyu_Ding1;~Junchi_Yan2;~Yanjie_Duan1;~Guangjian_Tian1", "gender": "M;Not Specified;;M;M", "homepage": ";;;;http://thinklab.sjtu.edu.cn/", "dblp": "89/8730;238/7298.html;160/8658.html;52/7695.html;60/7949.html", "google_scholar": "Qf1k8lUAAAAJ;;_8mhvIMAAAAJ;;ga230VoAAAAJ", "orcid": "0000-0001-7033-4207;;;;0000-0001-9639-7679", "linkedin": ";;;;", "or_profile": "~Liangliang_Shi1;~Fangyu_Ding1;~Yanjie_Duan1;~Guangjian_Tian1;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Huawei Noah\u2019s Ark Lab;Huawei Technologies Ltd.;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;huawei.com;huawei.com;sjtu.edu.cn", "position": "PhD student;MS student;Researcher;Researcher;Associate Professor", "bibtex": "@misc{\nshi2022on,\ntitle={On the Expressiveness, Predictability and Interpretability of Neural Temporal Point Processes},\nauthor={Liangliang Shi and Fangyu Ding and Junchi Yan and Yanjie Duan and Guangjian Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=doGDvfnHCEj}\n}", "github": "", "project": "", "reviewers": "p6bR;FtkZ;3BTn;7qu6", "site": "https://openreview.net/forum?id=doGDvfnHCEj", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;4;5;3", "correctness": "1;2;2;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "31;142;68;81", "wc_summary_review": "17;55;48;15", "wc_main_review": "422;927;781;292", "wc_review": "470;1124;897;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.5, 39.96561021678513 ], "wc_summary_review_avg": [ 33.75, 
17.93564885918544 ], "wc_main_review_avg": [ 605.5, 257.9326462470387 ], "wc_review_avg": [ 719.75, 303.0135104248654 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8528028654224418, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_z6yrAN-l_YJ:scholar.google.com/&scioq=On+the+Expressiveness,+Predictability+and+Interpretability+of+Neural+Temporal+Point+Processes&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei", "aff_unique_dep": ";Noah\u2019s Ark Lab", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SJTU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "LEARNING GUARANTEES FOR GRAPH CONVOLUTIONAL NETWORKS ON THE STOCHASTIC BLOCK MODEL", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7034", "id": "dpXL6lz4mOQ", "poster": "", "openreview": "https://openreview.net/forum?id=dpXL6lz4mOQ", "slides": "https://iclr.cc/virtual/2022/poster/7034", "video": "https://iclr.cc/virtual/2022/poster/7034", "tldr": "", "abstract": "An abundance of neural network models and algorithms for diverse tasks on graphs has been developed in the past five years. However, very few provable guarantees have been available for the performance of graph neural network models. This state of affairs is in contrast with the steady progress on the theoretical underpinnings of traditional dense and convolutional neural networks. In this paper we present the first provable guarantees for one of the best-studied families of graph neural network models, Graph Convolutional Networks (GCNs), for semi-supervised community detection tasks. We show that with high probability over the initialization and training data, a GCN will efficiently learn to detect communities on graphs drawn from a stochastic block model.
Our proof relies on a fine-grained analysis of the training dynamics in order to overcome the complexity of a non-convex optimization landscape with many poorly-performing local minima.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Lu", "authorids": "~Wei_Lu8", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "wei-lu-365320149/", "or_profile": "~Wei_Lu8", "aff": "Brandeis University", "aff_domain": "brandeis.edu", "position": "PhD student", "bibtex": "@inproceedings{\nlu2022learning,\ntitle={{LEARNING} {GUARANTEES} {FOR} {GRAPH} {CONVOLUTIONAL} {NETWORKS} {ON} {THE} {STOCHASTIC} {BLOCK} {MODEL}},\nauthor={Wei Lu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dpXL6lz4mOQ}\n}", "github": "", "project": "", "reviewers": "suTC;Pgr4;15hz;EjwU", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;3;3", "correctness": "3;2;3;3", "technical_novelty": "4;2;4;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "59;98;81;106", "wc_summary_review": "86;49;23;1066", "wc_main_review": "246;198;415;48", "wc_review": "391;345;519;1220", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "326;226;304;540", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.0, 18.01388353465182 ], "wc_summary_review_avg": [ 306.0, 439.3569164130684 ], "wc_main_review_avg": [ 226.75, 130.94536074256317 ], "wc_review_avg": [ 618.75, 352.93793717876235 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 349.0, 116.36580253665593 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10566665278405356925&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=dpXL6lz4mOQ", "email": "brandeis.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Brandeis University", "aff_unique_dep": "", "aff_unique_url": "https://www.brandeis.edu", "aff_unique_abbr": "Brandeis", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "drRnrGMZ3ze", "title": "Novel Policy Seeking with Constrained Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In problem-solving, we humans tend to come up with different novel solutions to the same problem. \nHowever, conventional reinforcement learning algorithms ignore such a feat and only aim at producing a set of monotonous policies that maximize the cumulative reward. The resulting policies usually lack diversity and novelty. In this work, we aim to equip learning algorithms with the capacity to solve the task with multiple solutions, through a practical novel policy generation workflow that can generate a set of diverse and well-performing policies. Specifically, we begin by introducing a new metric to evaluate the difference between policies.
On top of this well-defined novelty metric, we propose to rethink the novelty-seeking problem through the lens of constrained optimization. To address the dilemma between task performance and behavioral novelty in existing multi-objective optimization approaches, we then propose a practical novel policy seeking algorithm, Interior Policy Differentiation (IPD), which is derived from the interior point method commonly known in the constrained optimization literature. Experimental comparisons on benchmark environments show IPD can achieve a substantial improvement over previous novelty-seeking methods in terms of both the novelty of generated policies and their performance in the primal task.\n", "keywords": "Novel Policy Discovery;Policy Diversity in Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/1353ab10009319d65c0352a15e4a8ad6757bdffd.zip", "author": "Hao Sun;Zhenghao Peng;Bo Dai;Jian Guo;Dahua Lin;Bolei Zhou", "authorids": "~Hao_Sun3;~Zhenghao_Peng1;~Bo_Dai2;~Jian_Guo2;~Dahua_Lin1;~Bolei_Zhou5", "gender": "M;M;M;M;M;M", "homepage": "https://pengzhenghao.github.io;http://daibo.info/;https://idea.edu.cn/person/guojian/;http://dahua.site;https://boleizhou.github.io/;https://holarissun.github.io", "dblp": "220/3963;64/2903-2;96/2596-2;53/6088;46/8066;SunLLZL19", "google_scholar": "JZ8ws6IAAAAJ;https://scholar.google.com.hk/citations?user=KNWTvgEAAAAJ;;GMzzRRUAAAAJ;9D4aG8AAAAAJ;7ZNoHJkAAAAJ", "orcid": ";0000-0003-0777-9232;;;;", "linkedin": ";;;;;", "or_profile": "~Zhenghao_Peng1;~Bo_Dai2;~Jian_Guo2;~Dahua_Lin1;~Bolei_Zhou5;~Hao_Sun1", "aff": "The Chinese University of Hong Kong;Nanyang Technological University;International Digital Economy Academy, International Digital Economy Academy;The Chinese University of Hong Kong;University of California, Los Angeles;University of Cambridge", "aff_domain": "ie.cuhk.edu;ntu.edu.sg;idea.edu.cn;cuhk.edu.hk;ucla.edu;cam.ac.uk", "position": "MS student;Research Assistant Professor;Researcher;Associate Professor;Assistant Professor;PhD student", "bibtex": "@misc{\nsun2022novel,\ntitle={Novel Policy Seeking with Constrained Optimization},\nauthor={Hao Sun and Zhenghao Peng and Bo Dai and Jian Guo and Dahua Lin and Bolei Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=drRnrGMZ3ze}\n}", "github": "", "project": "", "reviewers": "UFwy;BgwA;u6Fj;X7og", "site": "https://openreview.net/forum?id=drRnrGMZ3ze", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;3;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "47;67;57;85", "wc_summary_review": "57;35;43;51", "wc_main_review": "517;324;298;520", "wc_review": "621;426;398;656", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.0, 14.035668847618199 ], "wc_summary_review_avg": [ 46.5, 8.2915619758885 ], "wc_main_review_avg": [ 414.75, 104.1618332211948 ], "wc_review_avg": [ 525.25, 114.35334494451835 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness":
0.5773502691896257, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=597737097965802218&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;0;3;4", "aff_unique_norm": "Chinese University of Hong Kong;Nanyang Technological University;International Digital Economy Academy;University of California, Los Angeles;University of Cambridge", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ntu.edu.sg;;https://www.ucla.edu;https://www.cam.ac.uk", "aff_unique_abbr": "CUHK;NTU;;UCLA;Cambridge", "aff_campus_unique_index": "0;0;2;3", "aff_campus_unique": "Hong Kong SAR;;Los Angeles;Cambridge", "aff_country_unique_index": "0;1;0;3;4", "aff_country_unique": "China;Singapore;;United States;United Kingdom" }, { "id": "drqmFn9fE9t", "title": "Self-Slimming Vision Transformer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Vision transformers (ViTs) have become popular structures and have outperformed convolutional neural networks (CNNs) on various vision tasks. However, such powerful transformers bring a huge computation burden due to the exhaustive token-to-token comparison. To make ViTs more efficient, we can prune them from two orthogonal directions: model structure and token number. However, pruning the structure decreases model capacity and struggles to speed up ViTs. Alternatively, we observe that ViTs exhibit sparse attention with high token similarity, while reducing tokens can greatly improve the throughput. Therefore, we propose a generic self-slimming learning approach for vanilla ViTs, namely SiT. Specifically, we first design a novel Token Slimming Module (TSM), which can boost the inference efficiency of ViTs by dynamic token aggregation. Different from hard token dropping, our TSM softly integrates redundant tokens into fewer informative ones, which can dynamically zoom visual attention without cutting off discriminative token relations in the image. Furthermore, we introduce a concise Dense Knowledge Distillation (DKD) framework, which densely transfers token information in a flexible auto-encoder manner. Due to the similar structure between teacher and student, our framework can effectively leverage both parameter and structure knowledge to accelerate training convergence. Finally, we conduct extensive experiments to evaluate our SiT. In most cases, our method can speed up ViTs by 3.6x while maintaining 97% of their performance.
Surprisingly, by simply arming LV-ViT with our SiT, we achieve new state-of-the-art performance on ImageNet, surpassing all the CNNs and ViTs in the recent literature.", "keywords": "Vision transformer;efficient transformer", "primary_area": "", "supplementary_material": "", "author": "Zhuofan Zong;Kunchang Li;Guanglu Song;Yali Wang;Yu Qiao;Biao Leng;Yu Liu", "authorids": "~Zhuofan_Zong1;~Kunchang_Li1;~Guanglu_Song2;~Yali_Wang1;~Yu_Qiao1;~Biao_Leng1;~Yu_Liu2", "gender": "M;M;M;M;;M;M", "homepage": "https://zongzhuofan.github.io/;https://andy1621.github.io/;;;;;http://liuyu.us", "dblp": "266/4989;;207/4745;01/773-1;;42/2913;97/2274-15", "google_scholar": "vls0YhoAAAAJ;D4tLSbsAAAAJ;Bd3v08QAAAAJ;https://scholar.google.com/citations?hl=en;;;", "orcid": ";0000-0001-5612-0341;;;;;", "linkedin": ";%E6%98%86%E6%98%8C-%E9%BB%8E-2a4a951b2/;;;;;", "or_profile": "~Zhuofan_Zong1;~Kunchang_Li1;~Guanglu_Song2;~Yali_Wang1;~Yu_Qiao1;~Biao_Leng1;~Yu_Liu2", "aff": "Beihang University;Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences;Sensetime;SIAT, Chinese Academy of Sciences;;Beihang University;SenseTime", "aff_domain": "buaa.edu.cn;siat.ac.cn;sensetime.com;siat.ac.cn;;buaa.edu.cn;sensetime.com", "position": "MS student;PhD student;Computer Vision Researcher;Associate Professor;;Full Professor;Principal Researcher", "bibtex": "@misc{\nzong2022selfslimming,\ntitle={Self-Slimming Vision Transformer},\nauthor={Zhuofan Zong and Kunchang Li and Guanglu Song and Yali Wang and Yu Qiao and Biao Leng and Yu Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=drqmFn9fE9t}\n}", "github": "", "project": "", "reviewers": "CgaS;y9p1;eBYg;4mmd", "site": "https://openreview.net/forum?id=drqmFn9fE9t", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;5", "correctness": "3;1;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "84;35;27;53", "wc_summary_review": "13;21;29;75", "wc_main_review": "355;129;100;210", "wc_review": "452;185;156;338", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.75, 21.901769334919038 ], "wc_summary_review_avg": [ 34.5, 24.057223447438815 ], "wc_main_review_avg": [ 198.5, 98.94063876891032 ], "wc_review_avg": [ 282.75, 119.70667274634276 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EUevtx2quwAJ:scholar.google.com/&scioq=Self-Slimming+Vision+Transformer&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0;2", "aff_unique_norm": "Beihang University;Chinese Academy of Sciences;SenseTime;Shenzhen Institute of Advanced Technology", "aff_unique_dep": ";Shenzhen Institutes of Advanced Technology;;", "aff_unique_url": "http://www.buaa.edu.cn/;http://www.cas.cn;https://www.sensetime.com;http://www.siat.ac.cn", "aff_unique_abbr": "BUAA;CAS;SenseTime;SIAT", "aff_campus_unique_index": "1", "aff_campus_unique": 
";Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Hidden Parameter Recurrent State Space Models For Changing Dynamics Scenarios", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6915", "id": "ds8yZOUsea", "poster": "", "openreview": "https://openreview.net/forum?id=ds8yZOUsea", "slides": "https://iclr.cc/virtual/2022/poster/6915", "video": "https://iclr.cc/virtual/2022/poster/6915", "author_site": "Vaisakh Shaj, Dieter B\u00fcchler, Rohit Sonker, Philipp Becker, Gerhard Neumann", "tldr": "", "abstract": "Recurrent State-space models (RSSMs) are highly expressive models for learning patterns in time series data and for system identification. However, these models are often based on the assumption that the dynamics are fixed and unchanging, which is rarely the case in real-world scenarios. Many control applications often exhibit tasks with similar, but not identical dynamics, that can be modelled as having a common latent structure. We introduce the Hidden Parameter Recurrent State Space Models (HiP-RSSMs), a framework that parametrizes a family of related state-space models with a low-dimensional set of latent factors. We present a simple and effective way of performing learning and inference over this Gaussian graphical model that avoids approximations like variational inference. We show that HiP-RSSMs outperforms RSSMs and competing multi-task models on several challenging robotic benchmarks both on real systems and simulations.", "keywords": "State Space Models;Changing Dynamics;Recurrent Neural Networks;Multi Task Learning", "primary_area": "", "supplementary_material": "", "author": "Vaisakh Shaj;Dieter B\u00fcchler;Rohit Sonker;Philipp Becker;Gerhard Neumann", "authorids": "~Vaisakh_Shaj1;~Dieter_B\u00fcchler1;rohitsonker96@gmail.com;~Philipp_Becker1;~Gerhard_Neumann2", "gender": "M;M;;M;", "homepage": ";http://embodied.ml/;;;", "dblp": "190/3994;181/4076.html;;66/1316;", "google_scholar": ";https://scholar.google.de/citations?user=8HYQ1tgAAAAJ;;https://scholar.google.de/citations?user=jXx-LuQAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Vaisakh_Shaj1;~Dieter_B\u00fcchler1;rohitsonker96@gmail.com;~Philipp_Becker1;~Gerhard_Neumann2", "aff": "Karlsruhe Institute of Technology;Max Planck Institute for Intelligent Systems, Max-Planck Institute;;Karlsruhe Institute of Technology;", "aff_domain": "kit.edu;tuebingen.mpg.de;;kit.edu;", "position": "PhD student;Group Leader;;PhD student;", "bibtex": "@inproceedings{\nshaj2022hidden,\ntitle={Hidden Parameter Recurrent State Space Models For Changing Dynamics Scenarios},\nauthor={Vaisakh Shaj and Dieter B{\\\"u}chler and Rohit Sonker and Philipp Becker and Gerhard Neumann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ds8yZOUsea}\n}", "github": "", "project": "", "reviewers": "8V7G;VmRd;nyLc;vXB5", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;3;4", "correctness": "2;2;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "61;57;35;55", "wc_summary_review": "41;67;52;80", "wc_main_review": "437;771;375;652", "wc_review": "539;895;462;787", "wc_reply_reviewers": "68;360;135;52", "wc_reply_authors": "1052;1364;1221;619", "reply_reviewers": "1;1;2;1", "reply_authors": "4;4;4;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], 
"technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 52.0, 10.04987562112089 ], "wc_summary_review_avg": [ 60.0, 14.781745499094482 ], "wc_main_review_avg": [ 558.75, 159.9474523085629 ], "wc_review_avg": [ 670.75, 176.590451327358 ], "wc_reply_reviewers_avg": [ 153.75, 123.08203565102423 ], "wc_reply_authors_avg": [ 1064.0, 279.6506749500169 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 0.4330127018922193 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11250070216520072781&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=ds8yZOUsea", "email": "kit.edu;tuebingen.mpg.de;;kit.edu;", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Karlsruhe Institute of Technology;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";Intelligent Systems", "aff_unique_url": "https://www.kit.edu;https://www.mpi-is.mpg.de", "aff_unique_abbr": "KIT;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "dtYnHcmQKeM", "title": "Physics-Informed Neural Operator for Learning Partial Differential Equations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning methods have recently shown promise in solving partial differential equations (PDEs). They can be classified into two broad categories: solution function approximation, and operator learning. The Physics-Informed Neural Network (PINN) is an example of the former while the Fourier neural operator (FNO) is an example of the latter. Both these approaches have shortcomings. The optimization in PINN is challenging and prone to failure, especially on multi-scale dynamic systems. FNO does not suffer from this optimization issue since it carries out supervised learning on a given dataset, but obtaining such data may be too expensive or infeasible. In this work, we propose the physics-informed neural operator (PINO), where we combine the operating-learning and function-optimization frameworks, and this improves convergence rates and accuracy over both PINN and FNO models. In the operator-learning phase, PINO learns the solution operator over multiple instances of the parametric PDE family. In the test-time optimization phase, PINO optimizes the pre-trained operator ansatz for the querying instance of the PDE. Experiments show PINO outperforms previous ML methods on many popular PDE families while retaining the extraordinary speed-up of FNO compared to solvers. In particular, PINO accurately solves long temporal transient flows and chaotic Kolmogorov flows, while PINN and other methods fail to converge to a reasonable accuracy. 
", "keywords": "Partial Differential Equations;operator learning;physics-informed;PINN;inverse problem;Navier-Stokes Equation", "primary_area": "", "supplementary_material": "/attachment/db19fa9c3ddbc2382eab49a78afffbc6a77622b5.zip", "author": "Zongyi Li;Hongkai Zheng;Nikola Borislavov Kovachki;David Jin;Haoxuan Chen;Burigede Liu;Andrew Stuart;Kamyar Azizzadenesheli;Anima Anandkumar", "authorids": "~Zongyi_Li1;~Hongkai_Zheng1;~Nikola_Borislavov_Kovachki1;djin@caltech.edu;~Haoxuan_Chen1;~Burigede_Liu1;~Andrew_Stuart1;~Kamyar_Azizzadenesheli1;~Anima_Anandkumar1", "gender": "M;;M;;M;M;M;M;", "homepage": "https://zongyi-li.github.io;;http://www.its.caltech.edu/~nkovachk/;;https://haoxuanstevec00.github.io/;;http://www.cms.caltech.edu/people;https://kamyar.page/;", "dblp": ";250/9194;;;212/7201.html;;;176/5584;", "google_scholar": ";lUDEZQMAAAAJ;;;https://scholar.google.com/citations?hl=en;GMKw0g8AAAAJ;;CxAS4SQAAAAJ;", "orcid": ";;;;0000-0002-8238-2764;;;;", "linkedin": ";;;;haoxuan-steve-chen-748b0a171/;;;;", "or_profile": "~Zongyi_Li1;~Hongkai_Zheng1;~Nikola_Borislavov_Kovachki1;djin@caltech.edu;~Haoxuan_Chen1;~Burigede_Liu1;~Andrew_Stuart1;~Kamyar_Azizzadenesheli1;~Anima_Anandkumar1", "aff": "California Institute of Technology;California Institute of Technology;California Institute of Technology;;California Institute of Technology;University of Cambridge;California Institute of Technology;Purdue University;", "aff_domain": "caltech.edu;caltech.edu;caltech.edu;;caltech.edu;cam.ac.uk;;purdue.edu;", "position": "PhD student;PhD student;PhD student;;Undergrad student;Assistant Professor;Full Professor;Assistant Professor;", "bibtex": "@misc{\nli2022physicsinformed,\ntitle={Physics-Informed Neural Operator for Learning Partial Differential Equations},\nauthor={Zongyi Li and Hongkai Zheng and Nikola Borislavov Kovachki and David Jin and Haoxuan Chen and Burigede Liu and Andrew Stuart and Kamyar Azizzadenesheli and Anima Anandkumar},\nyear={2022},\nurl={https://openreview.net/forum?id=dtYnHcmQKeM}\n}", "github": "", "project": "", "reviewers": "Tav6;oyPM;NBhE;WVp6", "site": "https://openreview.net/forum?id=dtYnHcmQKeM", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "2;2;3;3", "correctness": "2;4;3;3", "technical_novelty": "1;3;3;2", "empirical_novelty": "1;3;3;2", "wc_summary_paper": "83;50;30;99", "wc_summary_review": "29;27;33;140", "wc_main_review": "795;243;132;721", "wc_review": "907;320;195;960", "wc_reply_reviewers": "0;151;0;266", "wc_reply_authors": "1537;1065;1070;1527", "reply_reviewers": "0;1;0;1", "reply_authors": "3;2;2;3", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 65.5, 27.060118255469618 ], "wc_summary_review_avg": [ 57.25, 47.82454913535516 ], "wc_main_review_avg": [ 472.75, 289.12313553225033 ], "wc_review_avg": [ 595.5, 341.39163727308846 ], "wc_reply_reviewers_avg": [ 104.25, 111.89811213778363 ], "wc_reply_authors_avg": [ 1299.75, 232.28363588509632 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 586, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7405646521912950463&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, 
"aff_unique_index": "0;0;0;0;1;0;2", "aff_unique_norm": "California Institute of Technology;University of Cambridge;Purdue University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.caltech.edu;https://www.cam.ac.uk;https://www.purdue.edu", "aff_unique_abbr": "Caltech;Cambridge;Purdue", "aff_campus_unique_index": "0;0;0;0;1;0", "aff_campus_unique": "Pasadena;Cambridge;", "aff_country_unique_index": "0;0;0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "dtpgsBPJJW", "title": "Riemannian Manifold Embeddings for Straight-Through Estimator", "track": "main", "status": "Reject", "tldr": "", "abstract": "Quantized Neural Networks (QNNs) aim at replacing full-precision weights $\\boldsymbol{W}$ with quantized weights $\\boldsymbol{\\hat{W}}$, which make it possible to deploy large models to mobile and miniaturized devices easily. However, either infinite or zero gradients caused by non-differentiable quantization significantly affect the training of quantized models. In order to address this problem, most training-based quantization methods use Straight-Through Estimator (STE) to approximate gradients $\\nabla_{\\boldsymbol{W}}$ w.r.t. $\\boldsymbol{W}$ with gradients $\\nabla_{\\boldsymbol{\\hat{W}}}$ w.r.t. $\\boldsymbol{\\hat{W}}$ where the premise is that $\\boldsymbol{W}$ must be clipped to $[-1,+1]$. However, the simple application of STE brings with the gradient mismatch problem, which affects the stability of the training process. In this paper, we propose to revise an approximated gradient for penetrating the quantization function with manifold learning. Specifically, by viewing the parameter space as a metric tensor in the Riemannian manifold, we introduce the Manifold Quantization (ManiQuant) via revised STE to alleviate the gradient mismatch problem. 
The ablation studies and experimental results demonstrate that our proposed method has a better and more stable performance with various deep neural networks on CIFAR10/100 and ImageNet datasets.", "keywords": "Neural network quantization;Riemannian manifold;Information geometry;Mirror descent", "primary_area": "", "supplementary_material": "", "author": "Jun Chen;Hanwen Chen;Jiangning Zhang;yuang Liu;Tianxin Huang;Yong Liu", "authorids": "~Jun_Chen9;~Hanwen_Chen1;~Jiangning_Zhang1;~yuang_Liu2;~Tianxin_Huang1;~Yong_Liu11", "gender": "M;M;M;M;M;M", "homepage": ";https://april.zju.edu.cn/team/hanwen-chen/;https://www.researchgate.net/profile/Jiangning_Zhang2;https://april.zju.edu.cn/team/yuang-liu/;https://tianxinhuang.github.io/;https://person.zju.edu.cn/en/yongliu", "dblp": ";;241/9593;;251/3784;29/4867-7", "google_scholar": "YKc2O78AAAAJ;;https://scholar.google.com.hk/citations?user=2hA4X9wAAAAJ;;https://scholar.google.com.hk/citations?user=Fg7WYfcAAAAJ;https://scholar.google.com.hk/citations?user=qYcgBbEAAAAJ", "orcid": "0000-0001-6568-8801;;;;;0000-0003-4822-8939", "linkedin": ";;;;;", "or_profile": "~Jun_Chen9;~Hanwen_Chen1;~Jiangning_Zhang1;~yuang_Liu2;~Tianxin_Huang1;~Yong_Liu11", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "PhD student;MS student;PhD student;MS student;PhD student;Full Professor", "bibtex": "@misc{\nchen2022riemannian,\ntitle={Riemannian Manifold Embeddings for Straight-Through Estimator},\nauthor={Jun Chen and Hanwen Chen and Jiangning Zhang and yuang Liu and Tianxin Huang and Yong Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=dtpgsBPJJW}\n}", "github": "", "project": "", "reviewers": "Goby;WCKm;gTNq;HJGf", "site": "https://openreview.net/forum?id=dtpgsBPJJW", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "3;3;4;3", "correctness": "2;2;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "96;63;73;31", "wc_summary_review": "21;22;21;63", "wc_main_review": "155;108;164;274", "wc_review": "272;193;258;368", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.75, 23.35995505132662 ], "wc_summary_review_avg": [ 31.75, 18.046814123273947 ], "wc_main_review_avg": [ 175.25, 60.84971240687995 ], "wc_review_avg": [ 272.75, 62.55147879946564 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rT3oMcVhkbIJ:scholar.google.com/&scioq=Riemannian+Manifold+Embeddings+for+Straight-Through+Estimator&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" 
}, { "id": "dtt435G80Ng", "title": "CSQ: Centered Symmetric Quantization for Extremely Low Bit Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent advances in quantized neural networks (QNNs) are closing the performance gap with the full precision neural networks. However at very low precision (i.e., $\\le 3$-bits), QNNs often still suffer significant performance degradation. The conventional uniform symmetric quantization scheme allocates unequal numbers of positive and negative quantization levels. We show that this asymmetry in the number of positive and negative quantization levels can result in significant quantization error and performance degradation at low precision. We propose and analyze a quantizer called centered symmetric quantizer (CSQ), which preserves the symmetry of latent distribution by providing equal representations to the negative and positive sides of the distribution. We also propose a novel method to efficiently map CSQ to binarized neural network hardware using bitwise operations. Our analyses and experimental results using state-of-the-art quantization methods on ImageNet and CIFAR-10 show the importance of using CSQ for weight in place of the conventional quantization scheme at extremely low-bit precision (2$\\sim$3 bits).", "keywords": "deep learning;classification;low precision;uniform symmetric quantization;binary neural network hardware", "primary_area": "", "supplementary_material": "", "author": "Faaiz Asim;Jaewoo Park;Azat Azamat;Jongeun Lee", "authorids": "~Faaiz_Asim1;~Jaewoo_Park2;~Azat_Azamat1;~Jongeun_Lee1", "gender": "M;M;M;", "homepage": ";https://iccl.unist.ac.kr/;;https://iccl.unist.ac.kr/~jlee", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": "faaiz-asim-43b16616b/;;azatkariuly/;", "or_profile": "~Faaiz_Asim1;~Jaewoo_Park2;~Azat_Azamat1;~Jongeun_Lee1", "aff": "Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology", "aff_domain": "unist.ac.kr;unist.ac.kr;unist.ac.kr;unist.ac.kr", "position": "MS student;Undergrad student;MS student;Full Professor", "bibtex": "@misc{\nasim2022csq,\ntitle={{CSQ}: Centered Symmetric Quantization for Extremely Low Bit Neural Networks},\nauthor={Faaiz Asim and Jaewoo Park and Azat Azamat and Jongeun Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=dtt435G80Ng}\n}", "github": "", "project": "", "reviewers": "PPJy;Z3wR;hDhb;FpP9", "site": "https://openreview.net/forum?id=dtt435G80Ng", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;5;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "97;71;12;44", "wc_summary_review": "18;29;25;43", "wc_main_review": "612;216;132;275", "wc_review": "727;316;169;362", "wc_reply_reviewers": "0;0;0;34", "wc_reply_authors": "1352;923;736;831", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 56.0, 31.567388235329194 ], "wc_summary_review_avg": [ 28.75, 9.12071817347735 ], "wc_main_review_avg": [ 308.75, 182.30657558080563 ], "wc_review_avg": [ 393.5, 205.3174371552499 ], "wc_reply_reviewers_avg": [ 8.5, 14.722431864335457 ], 
"wc_reply_authors_avg": [ 960.5, 235.5042462462195 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10517423930246811365&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Ulsan National Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.unist.ac.kr", "aff_unique_abbr": "UNIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "dut7suZoRqv", "title": "SparRL: Graph Sparsification via Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph sparsification concerns data reduction where an edge-reduced graph of a similar structure is preferred. Existing methods are mostly sampling-based, which introduce high computation complexity in general and lack of flexibility for a different reduction objective. We present SparRL, the first general and effective reinforcement learning-based framework for graph sparsification. SparRL can easily adapt to different reduction goals and promise graph-size-independent complexity. Extensive experiments show that SparRL outperforms all prevailing sparsification methods in producing high-quality sparsified graphs concerning a variety of objectives. As graph representations are very versatile, SparRL carries the potential for a broad impact.", "keywords": "graph sparsification;graph theory;machine learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ryan Wickman;Xiaofei Zhang;Weizi Li", "authorids": "~Ryan_Wickman1;~Xiaofei_Zhang2;~Weizi_Li1", "gender": "M;;M", "homepage": ";http://www.cs.memphis.edu/~xzhang12/;http://weizi-li.github.io/", "dblp": ";83/4809-2;60/7775", "google_scholar": ";IBy4k-4AAAAJ;", "orcid": ";0000-0002-5605-6295;", "linkedin": "ryan-wickman-771160144/;;", "or_profile": "~Ryan_Wickman1;~Xiaofei_Zhang2;~Weizi_Li1", "aff": "University of Memphis;University of Memphis;University of Memphis", "aff_domain": "memphis.edu;memphis.edu;memphis.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nwickman2022sparrl,\ntitle={Spar{RL}: Graph Sparsification via Deep Reinforcement Learning},\nauthor={Ryan Wickman and Xiaofei Zhang and Weizi Li},\nyear={2022},\nurl={https://openreview.net/forum?id=dut7suZoRqv}\n}", "github": "", "project": "", "reviewers": "zwSi;AdUd;eytn;na7w", "site": "https://openreview.net/forum?id=dut7suZoRqv", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;4", "correctness": "3;2;4;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "115;96;289;35", "wc_summary_review": "107;41;107;36", "wc_main_review": "560;337;826;212", "wc_review": "782;474;1222;283", "wc_reply_reviewers": "111;0;91;23", "wc_reply_authors": "637;538;1128;111", "reply_reviewers": "2;0;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 133.75, 94.38054619464755 ], "wc_summary_review_avg": [ 72.75, 
34.29559009552103 ], "wc_main_review_avg": [ 483.75, 233.63045071223058 ], "wc_review_avg": [ 690.25, 354.89179689026344 ], "wc_reply_reviewers_avg": [ 56.25, 46.02920268698992 ], "wc_reply_authors_avg": [ 603.5, 361.61754658755154 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16956928765594513847&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Memphis", "aff_unique_dep": "", "aff_unique_url": "https://www.memphis.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "dvl241Sbrda", "title": "Unit Ball Model for Embedding Hierarchical Structures in the Complex Hyperbolic Space", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning the representation of data with hierarchical structures in the hyperbolic space has attracted increasing attention in recent years. Due to the constant negative curvature, the hyperbolic space resembles tree metrics and captures the tree-like properties naturally, which enables the hyperbolic embeddings to improve over traditional Euclidean models. However, most real-world hierarchically structured data such as taxonomies and multitree networks have varying local structures and are not trees; thus, they do not ubiquitously match the constant curvature property of the hyperbolic space. To address this limitation of hyperbolic embeddings, we explore the complex hyperbolic space, which has variable negative curvature, for representation learning. Specifically, we propose to learn the embeddings of hierarchically structured data in the unit ball model of the complex hyperbolic space. Embeddings based on the unit ball model have a more powerful representation capacity to capture a variety of hierarchical structures. Through experiments on synthetic and real-world data, we show that our approach improves significantly over hyperbolic embedding models. We also explore the competence of complex hyperbolic geometry on multitree and 1-N structures.", "keywords": "Complex hyperbolic embeddings;hierarchical data embeddings;taxonomy embeddings", "primary_area": "", "supplementary_material": "", "author": "Huiru Xiao;Caigao JIANG;Yangqiu Song;James Y.
Zhang;Junwu Xiong", "authorids": "~Huiru_Xiao1;~Caigao_JIANG2;~Yangqiu_Song1;~James_Y._Zhang1;~Junwu_Xiong1", "gender": "F;M;M;M;M", "homepage": "https://huiruxiao.github.io/;;https://www.cse.ust.hk/~yqsong/;https://scholar.google.com/citations?user=Ywakh_sAAAAJ;https://scholar.google.com/citations?user=c7rK9cgAAAAJ&hl=en", "dblp": "236/6078;292/3817;86/2159;151/3086;91/9300", "google_scholar": "VyFipuUAAAAJ;;MdQZ-q8AAAAJ;Ywakh_sAAAAJ;c7rK9cgAAAAJ", "orcid": ";;0000-0002-7818-6090;0000-0001-6519-676X;0009-0008-2028-510X", "linkedin": ";caigao-jiang-309710194;yqsong/;jamesymzhang/;", "or_profile": "~Huiru_Xiao1;~Caigao_JIANG2;~Yangqiu_Song1;~James_Y._Zhang1;~Junwu_Xiong1", "aff": "Hong Kong University of Science and Technology;Alibaba Group;Hong Kong University of Science and Technology;Ant Group;antgroup", "aff_domain": "ust.hk;alibaba-inc.com;ust.hk;alipay.com;antgroup.com", "position": "PhD student;Researcher;Assistant Professor;managing director;Researcher", "bibtex": "@misc{\nxiao2022unit,\ntitle={Unit Ball Model for Embedding Hierarchical Structures in the Complex Hyperbolic Space},\nauthor={Huiru Xiao and Caigao JIANG and Yangqiu Song and James Y. Zhang and Junwu Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=dvl241Sbrda}\n}", "github": "", "project": "", "reviewers": "ZxKu;PCHz;1y6j;N6rj", "site": "https://openreview.net/forum?id=dvl241Sbrda", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;3;4", "correctness": "4;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "46;110;38;45", "wc_summary_review": "41;19;56;45", "wc_main_review": "159;380;242;494", "wc_review": "246;509;336;584", "wc_reply_reviewers": "0;240;0;62", "wc_reply_authors": "282;2346;1638;1476", "reply_reviewers": "0;1;0;1", "reply_authors": "1;4;3;4", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 59.75, 29.17511782324109 ], "wc_summary_review_avg": [ 40.25, 13.442005058770064 ], "wc_main_review_avg": [ 318.75, 128.33038416524747 ], "wc_review_avg": [ 418.75, 134.2970122526931 ], "wc_reply_reviewers_avg": [ 75.5, 98.28911435148859 ], "wc_reply_authors_avg": [ 1435.5, 741.9937668201802 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3642820085735842437&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2;2", "aff_unique_norm": "Hong Kong University of Science and Technology;Alibaba Group;Ant Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ust.hk;https://www.alibaba.com;https://www.antgroup.com", "aff_unique_abbr": "HKUST;Alibaba;Ant Group", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "ViTGAN: Training GANs with Vision Transformers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6287", "id": "dwg5rXg1WS_", "poster": "", "openreview": "https://openreview.net/forum?id=dwg5rXg1WS_", "slides": "https://iclr.cc/virtual/2022/poster/6287", "video": "https://iclr.cc/virtual/2022/poster/6287", 
"author_site": "Kwonjoon Lee, Huiwen Chang, Lu Jiang, Han Zhang, Zhuowen Tu, Ce Liu", "tldr": "", "abstract": "Recently, Vision Transformers (ViTs) have shown competitive performance on image recognition while requiring less vision-specific inductive biases. In this paper, we investigate if such performance can be extended to image generation. To this end, we integrate the ViT architecture into generative adversarial networks (GANs). For ViT discriminators, we observe that existing regularization methods for GANs interact poorly with self-attention, causing serious instability during training. To resolve this issue, we introduce several novel regularization techniques for training GANs with ViTs. For ViT generators, we examine architectural choices for latent and pixel mapping layers to faciliate convergence. Empirically, our approach, named ViTGAN, achieves comparable performance to the leading CNN- based GAN models on three datasets: CIFAR-10, CelebA, and LSUN bedroom.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kwonjoon Lee;Huiwen Chang;Lu Jiang;Han Zhang;Zhuowen Tu;Ce Liu", "authorids": "~Kwonjoon_Lee1;~Huiwen_Chang2;~Lu_Jiang1;~Han_Zhang1;~Zhuowen_Tu1;~Ce_Liu1", "gender": "M;F;M;M;;M", "homepage": "https://kjunelee.github.io;;http://www.lujiang.info/;https://sites.google.com/corp/view/hanzhang;;http://people.csail.mit.edu/celiu/", "dblp": "127/7948;131/4389;22/752-4;;;61/3937-1", "google_scholar": "C6Wu8M0AAAAJ;eZQNcvcAAAAJ;jIKjjSYAAAAJ;cxEoVL4AAAAJ;;j7MW4iYAAAAJ", "orcid": "0000-0002-1433-551X;;0000-0003-0286-8439;;;", "linkedin": ";;roadjiang/;;;ce-liu-5697501a", "or_profile": "~Kwonjoon_Lee1;~Huiwen_Chang2;~Lu_Jiang1;~Han_Zhang1;~Zhuowen_Tu1;~Ce_Liu1", "aff": "University of California, San Diego;Research, Google;Google Research;Google;;Microsoft", "aff_domain": "ucsd.edu;research.google.com;google.com;google.com;;microsoft.com", "position": "PhD student;Researcher;Researcher;Researcher;;Chief Architect for Computer Vision", "bibtex": "@inproceedings{\nlee2022vitgan,\ntitle={Vi{TGAN}: Training {GAN}s with Vision Transformers},\nauthor={Kwonjoon Lee and Huiwen Chang and Lu Jiang and Han Zhang and Zhuowen Tu and Ce Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=dwg5rXg1WS_}\n}", "github": "", "project": "", "reviewers": "mS8C;2CzQ;Wrur;vpzj;oHXw", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "4;4;4;4;5", "correctness": "4;4;3;3;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "3;3;3;2;4", "wc_summary_paper": "56;87;104;88;87", "wc_summary_review": "56;91;30;101;34", "wc_main_review": "253;728;206;290;257", "wc_review": "365;906;340;479;378", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 84.4, 15.602563891873668 ], "wc_summary_review_avg": [ 62.4, 29.000689646972194 ], "wc_main_review_avg": [ 346.8, 192.47171220727475 ], "wc_review_avg": [ 493.6, 211.5567063460764 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 1.0, 
"corr_recommendation_correctness": 0.408248290463863, "gs_citation": 269, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11425422721644021530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=dwg5rXg1WS_", "email": "ucsd.edu;research.google.com;google.com;google.com;;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "University of California, San Diego;Google;Microsoft", "aff_unique_dep": ";Google Research;Microsoft Corporation", "aff_unique_url": "https://www.ucsd.edu;https://research.google;https://www.microsoft.com", "aff_unique_abbr": "UCSD;Google;Microsoft", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "San Diego;Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "dzZQEvQ6dRK", "title": "Disentangling Properties of Contrastive Methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "Disentangled representation learning is an important topic in representation learning, since it not only allows the representation to be human interpretable, but it is also robust and benefits downstream task performance. Prior methods achieved initial successes on simplistic synthetic datasets but failed to scale to complex real-world datasets. Most of the previous methods adopt image generative models, such as GAN and VAE, to learn the disentangled representation. But we observe they are hard to learn disentangled representation on real-world images. Recently, self-supervised contrastive methods such as MoCo, SimCLR, and BYOL have achieved impressive performances on large-scale visual recognition tasks. In this paper, we explored the possibility of using contrastive methods to learn a disentangled representation, a discriminative approach that is drastically different from previous approaches. Surprisingly, we find that the contrastive method learns a disentangled representation with only minor modifications. The contrastively learned representation satisfies a ``group disentanglement'' property, which is a relaxed version of the original disentanglement property. This relaxation might be useful for scaling disentanglement learning to large and complex datasets. We further find contrastive methods achieve state-of-thet-art disentanglement performance on several widely used benchmarks, such as dSprites and Car3D. 
It also achieves significantly higher performance on the real-world dataset CelebA.", "keywords": "self-supervised learning;representation disentanglement", "primary_area": "", "supplementary_material": "", "author": "Jinkun Cao;Qing Yang;Jialei Huang;Yang Gao", "authorids": "~Jinkun_Cao1;~Qing_Yang9;huangjl21@mails.tsinghua.edu.cn;~Yang_Gao1", "gender": "M;F;;M", "homepage": "https://www.jinkuncao.com;;;http://yang-gao.weebly.com", "dblp": "224/0126;;;89/4402-29", "google_scholar": "xDtTbmQAAAAJ;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";qing-yang-b3a02120b/;;yang-gao-45245348/", "or_profile": "~Jinkun_Cao1;~Qing_Yang9;huangjl21@mails.tsinghua.edu.cn;~Yang_Gao1", "aff": "Carnegie Mellon University;Shanghai Jiaotong University;;Tsinghua University", "aff_domain": "andrew.cmu.edu;en.sjtu.edu.cn;;tsinghua.edu.cn", "position": "PhD student;Undergrad student;;Assistant Professor", "bibtex": "@misc{\ncao2022disentangling,\ntitle={Disentangling Properties of Contrastive Methods},\nauthor={Jinkun Cao and Qing Yang and Jialei Huang and Yang Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=dzZQEvQ6dRK}\n}", "github": "", "project": "", "reviewers": "xhwR;K9hY;By54;DZ9Z", "site": "https://openreview.net/forum?id=dzZQEvQ6dRK", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;4;4;2", "correctness": "2;3;3;4", "technical_novelty": "2;1;4;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "53;31;28;73", "wc_summary_review": "128;77;66;69", "wc_main_review": "364;596;340;95", "wc_review": "545;704;434;237", "wc_reply_reviewers": "0;165;0;0", "wc_reply_authors": "623;2067;708;117", "reply_reviewers": "0;1;0;0", "reply_authors": "1;5;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 46.25, 18.21228980661136 ], "wc_summary_review_avg": [ 85.0, 25.149552679918582 ], "wc_main_review_avg": [ 348.75, 177.36315147177555 ], "wc_review_avg": [ 480.0, 169.97499816149434 ], "wc_reply_reviewers_avg": [ 41.25, 71.44709581221619 ], "wc_reply_authors_avg": [ 878.75, 722.2819307583432 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8892972917998875, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:X-Mu7MwPmAwJ:scholar.google.com/&scioq=Disentangling+Properties+of+Contrastive+Methods&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Carnegie Mellon University;Shanghai Jiao Tong University;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.sjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "CMU;SJTU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "id": "e-IkMkna5uJ", "title": "Spectral Bias in Practice: the Role of Function Frequency in Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite their ability to represent highly expressive functions, deep learning models trained with SGD seem to find simple, constrained solutions that generalize surprisingly well. 
Spectral bias \u2013 the tendency of neural networks to prioritize learning low frequency functions \u2013 is one possible explanation for this phenomenon,but so far spectral bias has only been observed in theoretical models and simplified experiments. In this work, we propose methodologies for measuring spectral bias in modern image classification networks. We find that these networks indeed exhibit spectral bias, and that networks that generalize well strike a balance between having enough complexity (i.e. high frequencies) to fit the data while being simple enough to avoid overfitting. For example, we experimentally show that larger models learn high frequencies faster than smaller ones, but many forms of regularization, both explicit and implicit, amplify spectral bias and delay the learning of high frequencies. We also explore the connections between function frequency and image frequency and find that spectral bias is sensitive to the low frequencies prevalent in natural images. Our work enables measuring and ultimately controlling the spectral behavior of neural networks used for image classification, and is a step towards understanding why deep models generalize well.", "keywords": "spectral bias;generalization;function frequency;image classification", "primary_area": "", "supplementary_material": "/attachment/cf4f6d58fee3324d92173ee3149981daa9f89885.zip", "author": "Sara Fridovich-Keil;Raphael Gontijo-Lopes;Rebecca Roelofs", "authorids": "~Sara_Fridovich-Keil1;~Raphael_Gontijo-Lopes1;~Rebecca_Roelofs1", "gender": "F;F;M", "homepage": "https://sarafridov.github.io;;https://raphagl.com", "dblp": "236/7023;145/2224;", "google_scholar": "9xF7M6wAAAAJ;;-wpZQY0AAAAJ", "orcid": ";;", "linkedin": "sara-fridovich-keil-3aa744160/;;raphaelgontijolopes/", "or_profile": "~Sara_Fridovich-Keil1;~Rebecca_Roelofs1;~Raphael_Gontijo_Lopes1", "aff": "University of California, Berkeley;Google;Google Brain", "aff_domain": "berkeley.edu;google.com;google.com", "position": "PhD student;Research scientist;Research Associate", "bibtex": "@misc{\nfridovich-keil2022spectral,\ntitle={Spectral Bias in Practice: the Role of Function Frequency in Generalization},\nauthor={Sara Fridovich-Keil and Raphael Gontijo-Lopes and Rebecca Roelofs},\nyear={2022},\nurl={https://openreview.net/forum?id=e-IkMkna5uJ}\n}", "github": "", "project": "", "reviewers": "axgN;9z2T;tm3W;Vhe6", "site": "https://openreview.net/forum?id=e-IkMkna5uJ", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "4;4;4;5", "correctness": "3;3;3;2", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "78;116;119;115", "wc_summary_review": "113;52;70;86", "wc_main_review": "603;405;270;1201", "wc_review": "794;573;459;1402", "wc_reply_reviewers": "625;0;73;633", "wc_reply_authors": "2614;899;400;1423", "reply_reviewers": "2;0;1;2", "reply_authors": "5;3;2;4", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 107.0, 16.80773631397161 ], "wc_summary_review_avg": [ 80.25, 22.40954037904392 ], "wc_main_review_avg": [ 619.75, 355.8703239945697 ], "wc_review_avg": [ 807.0, 364.02403766784414 ], "wc_reply_reviewers_avg": [ 332.75, 297.3855872432287 ], "wc_reply_authors_avg": [ 1334.0, 822.7852089093484 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 
31, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11957833069162665826&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "e-JV6H8lwpl", "title": "Subspace State-Space Identification and Model Predictive Control of Nonlinear Dynamical Systems Using Deep Neural Network with Bottleneck", "track": "main", "status": "Reject", "tldr": "", "abstract": "A novel nonlinear system identification method that produces a state estimator and a predictor directly usable for model predictive control (MPC) is proposed in this paper. The main feature of the proposed method is that it uses a neural network with a bottleneck layer between the state estimator and predictor to represent the input-output dynamics, and it is proven that the state of the dynamical system can be extracted from the bottleneck layer based on the observability of the target system. The training of the network is shown to be a natural nonlinear extension of the subspace state-space system identification method established for linear dynamical systems. This correspondence gives interpretability to the resulting model based on linear control theory. The usefulness of the proposed method and the interpretability of the model are demonstrated through an illustrative example of MPC.", "keywords": "System identification;Model predictive control;Subspace state-space system identification", "primary_area": "", "supplementary_material": "/attachment/70f68ddbdfc6d4df76c5da334bd17b3c036582df.zip", "author": "Ichiro Maruta;Keito Yamada;Kenji Fujimoto", "authorids": "~Ichiro_Maruta1;~Keito_Yamada1;~Kenji_Fujimoto1", "gender": ";M;M", "homepage": "https://control.kuaero.kyoto-u.ac.jp/~maruta/;;", "dblp": "25/949;;31/5425.html", "google_scholar": "uepzlPoAAAAJ;;https://scholar.google.co.jp/citations?user=fO8RYK8AAAAJ", "orcid": "my-orcid?orcid=0000-0002-2246-3570;0000-0002-0199-0427;0000-0002-1190-1088", "linkedin": ";;", "or_profile": "~Ichiro_Maruta1;~Keito_Yamada1;~Kenji_Fujimoto1", "aff": "Kyoto University;Kyoto University;Kyoto University", "aff_domain": "kuaero.kyoto-u.ac.jp;kyoto-u.ac.jp;kyoto-u.ac.jp", "position": "Associate Professor;MS student;Professor", "bibtex": "@misc{\nmaruta2022subspace,\ntitle={Subspace State-Space Identification and Model Predictive Control of Nonlinear Dynamical Systems Using Deep Neural Network with Bottleneck},\nauthor={Ichiro Maruta and Keito Yamada and Kenji Fujimoto},\nyear={2022},\nurl={https://openreview.net/forum?id=e-JV6H8lwpl}\n}", "github": "", "project": "", "reviewers": "Sng6;pfti;dZQd", "site": "https://openreview.net/forum?id=e-JV6H8lwpl", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;3;2", "correctness": "1;3;4", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;2", "wc_summary_paper": "30;51;54", "wc_summary_review": "53;20;36", "wc_main_review": "232;80;290", "wc_review": "315;151;380", "wc_reply_reviewers": "497;0;0", "wc_reply_authors": "1514;134;750", "reply_reviewers": "3;0;0", "reply_authors": "5;1;1",
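A minimal sketch of the estimator-bottleneck-predictor layout this record describes, assuming an encoder over a window of past input/output data and a predictor of future outputs; the layer sizes and names are assumptions for illustration, not the paper's implementation:

```python
import torch
import torch.nn as nn

class BottleneckSSID(nn.Module):
    """State estimator -> low-dimensional bottleneck -> predictor."""
    def __init__(self, past_dim, future_in_dim, future_out_dim, n_state=4):
        super().__init__()
        self.estimator = nn.Sequential(
            nn.Linear(past_dim, 64), nn.Tanh(),
            nn.Linear(64, n_state),          # bottleneck = estimated state
        )
        self.predictor = nn.Sequential(
            nn.Linear(n_state + future_in_dim, 64), nn.Tanh(),
            nn.Linear(64, future_out_dim),   # predicted future outputs
        )

    def forward(self, past_io, future_u):
        x = self.estimator(past_io)          # state usable by an MPC loop
        y = self.predictor(torch.cat([x, future_u], dim=-1))
        return y, x
```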
"recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 45.0, 10.677078252031311 ], "wc_summary_review_avg": [ 36.333333333333336, 13.474255287605157 ], "wc_main_review_avg": [ 200.66666666666666, 88.54879383079641 ], "wc_review_avg": [ 282.0, 96.35697518429409 ], "wc_reply_reviewers_avg": [ 165.66666666666666, 234.28804683314277 ], "wc_reply_authors_avg": [ 799.3333333333334, 564.4615920404938 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AdImwEXVZmwJ:scholar.google.com/&scioq=Subspace+State-Space+Identification+and+Model+Predictive+Control+of+Nonlinear+Dynamical+Systems+Using+Deep+Neural+Network+with+Bottleneck&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Kyoto University", "aff_unique_dep": "", "aff_unique_url": "https://www.kyoto-u.ac.jp", "aff_unique_abbr": "Kyoto U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "e0TRvNWsVIH", "title": "Learning Representation for Bayesian Optimization with Collision-free Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian Optimization has been challenged by the large-scale and high-dimensional datasets, which are common in real-world scenarios. Recent works attempt to handle such input by applying neural networks ahead of the classical Gaussian process to learn a (low-dimensional) latent representation. We show that even with proper network design, such learned representation often leads to collision in the latent space: two points with significantly different observations collide in the learned latent space, leading to degraded optimization performance. To address this issue, we propose LOCo, an efficient deep Bayesian optimization framework which employs a novel regularizer to reduce the collision in the learned latent space and encourage the mapping from the latent space to the objective value to be Lipschitz continuous. LOCo takes in pairs of data points and penalizes those too close in the latent space compared to their target space distance. We provide a rigorous theoretical justification for LOCo by inspecting the regret of this dynamic-embedding-based Bayesian optimization algorithm, where the neural network is iteratively retrained with the regularizer. 
Our empirical results further demonstrate the effectiveness of LOCo on several synthetic and real-world benchmark Bayesian optimization tasks.", "keywords": "Latent space;Bayesian Optimization;Collision", "primary_area": "", "supplementary_material": "", "author": "Fengxue Zhang;Brian Nord;Yuxin Chen", "authorids": "~Fengxue_Zhang1;~Brian_Nord1;~Yuxin_Chen1", "gender": "M;;", "homepage": ";https://iamstarnord.com;http://yuxinchen.org/", "dblp": ";;11/5123-1", "google_scholar": ";;-k1N7HAAAAAJ", "orcid": ";;", "linkedin": "fengxue-zhang-18b205146/;;", "or_profile": "~Fengxue_Zhang1;~Brian_Nord1;~Yuxin_Chen1", "aff": "University of Chicago;;University of Chicago", "aff_domain": "uchicago.edu;;uchicago.edu", "position": "Ph.D. student;;Assistant Professor", "bibtex": "@misc{\nzhang2022learning,\ntitle={Learning Representation for Bayesian Optimization with Collision-free Regularization},\nauthor={Fengxue Zhang and Brian Nord and Yuxin Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=e0TRvNWsVIH}\n}", "github": "", "project": "", "reviewers": "aX8w;DSCs;G6dp;Knwx", "site": "https://openreview.net/forum?id=e0TRvNWsVIH", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "39;82;46;75", "wc_summary_review": "49;14;79;40", "wc_main_review": "1117;887;208;470", "wc_review": "1205;983;333;585", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 60.5, 18.33712082089225 ], "wc_summary_review_avg": [ 45.5, 23.221757039466244 ], "wc_main_review_avg": [ 670.5, 353.6739317507017 ], "wc_review_avg": [ 776.5, 338.9760316010558 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14611099480124126964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.uchicago.edu", "aff_unique_abbr": "UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Signing the Supermask: Keep, Hide, Invert", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6550", "id": "e0jtGTfPihs", "poster": "", "openreview": "https://openreview.net/forum?id=e0jtGTfPihs", "slides": "https://iclr.cc/virtual/2022/poster/6550", "video": "https://iclr.cc/virtual/2022/poster/6550", "author_site": "Nils Koster, Oliver Grothe, Achim Rettinger", "tldr": "", "abstract": "The exponential growth in the number of parameters of neural networks over the past years has been accompanied by an increase in performance across several fields. However, due to their sheer size, the networks became not only difficult to interpret but also problematic to train and use in real-world applications, since hardware requirements increased accordingly.
\nTackling both issues, we present a novel approach that either drops a neural network's initial weights or inverts their respective sign. \nPut simply, a network is trained by selecting and inverting weights without changing their absolute values.\nOur contribution extends previous work on masking by additionally sign-inverting the initial weights and follows the findings of the Lottery Ticket Hypothesis.\nThrough this extension and adaptations of initialization methods, we achieve a pruning rate of up to 99%, while still matching or exceeding the performance of various baseline and previous models.\nOur approach has two main advantages.\nFirst, and most notably, signed Supermask models drastically simplify a model's structure, while still performing well on given tasks.\nSecond, by reducing the neural network to its very foundation, we gain insights into which weights matter for performance. \nThe code is available on GitHub.", "keywords": "Neural Networks;Supermask;Lottery Ticket Hypothesis;Pruning;Weight Initialization;Interpretation;Subnetworks", "primary_area": "", "supplementary_material": "/attachment/2f412381258d6b54da2d60c36d9178823d1de8f9.zip", "author": "Nils Koster;Oliver Grothe;Achim Rettinger", "authorids": "~Nils_Koster1;~Oliver_Grothe1;~Achim_Rettinger1", "gender": "M;M;M", "homepage": "https://ewifo.econ.kit.edu/21_130.php;https://as.ior.kit.edu/;https://www.uni-trier.de/index.php?id=69122", "dblp": ";;55/6363", "google_scholar": ";UGyFIhYAAAAJ;https://scholar.google.de/citations?user=a5WsBc0AAAAJ", "orcid": ";;0000-0003-4950-1167", "linkedin": "https://de.linkedin.com/in/nils-koster-98914170;;achim-rettinger", "or_profile": "~Nils_Koster1;~Oliver_Grothe1;~Achim_Rettinger1", "aff": "Karlsruhe Institute of Technology;Karlsruhe Institute of Technology;Trier University", "aff_domain": "kit.edu;kit.edu;uni-trier.de", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nkoster2022signing,\ntitle={Signing the Supermask: Keep, Hide, Invert},\nauthor={Nils Koster and Oliver Grothe and Achim Rettinger},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=e0jtGTfPihs}\n}", "github": "", "project": "", "reviewers": "okPF;UFKs;AEMP;sRRw", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;5", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "79;83;62;111", "wc_summary_review": "16;38;22;215", "wc_main_review": "636;292;218;260", "wc_review": "731;413;302;586", "wc_reply_reviewers": "0;0;0;107", "wc_reply_authors": "767;623;466;416", "reply_reviewers": "0;0;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 83.75, 17.597940220378067 ], "wc_summary_review_avg": [ 72.75, 82.520830703526 ], "wc_main_review_avg": [ 351.5, 166.33926175139771 ], "wc_review_avg": [ 508.0, 163.76354905778027 ], "wc_reply_reviewers_avg": [ 26.75, 46.332359102467464 ], "wc_reply_authors_avg": [ 568.0, 137.96195127642983 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": 0.4714045207910316,
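A rough sketch of the signed-supermask idea in this record, assuming a straight-through estimator over per-weight scores with a fixed threshold; the score parameterization and threshold are illustrative guesses, not the paper's exact formulation:

```python
import torch
import torch.nn as nn

class SignedSupermaskLinear(nn.Module):
    """Linear layer whose frozen initial weights are only kept,
    dropped, or sign-inverted via a learned ternary mask {-1, 0, +1}."""
    def __init__(self, in_f, out_f, threshold=0.1):
        super().__init__()
        w = torch.empty(out_f, in_f)
        nn.init.kaiming_uniform_(w)
        self.register_buffer('w0', w)  # initial weights stay frozen
        self.scores = nn.Parameter(torch.empty(out_f, in_f).uniform_(-0.5, 0.5))
        self.threshold = threshold

    def forward(self, x):
        s = self.scores
        # ternary mask: 0 inside [-t, t], otherwise sign(score)
        mask_hard = torch.sign(s) * (s.abs() > self.threshold).float()
        # straight-through: forward uses the hard mask,
        # backward passes gradients to the continuous scores
        mask = mask_hard.detach() + s - s.detach()
        return x @ (self.w0 * mask).t()
```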
"gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10618821989752755915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=e0jtGTfPihs", "email": "kit.edu;kit.edu;uni-trier.de", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Karlsruhe Institute of Technology;Trier University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kit.edu;https://www.uni-trier.de", "aff_unique_abbr": "KIT;UT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "e0uknAgETh", "title": "Adversarial Attacks on Spiking Convolutional Networks for Event-based Vision", "track": "main", "status": "Reject", "tldr": "", "abstract": "Event-based sensing using dynamic vision sensors is gaining traction in low-power vision applications. Spiking neural networks work well with the sparse nature of event-based data and suit deployment on low-power neuromorphic hardware. Being a nascent field, the sensitivity of spiking neural networks to potentially malicious adversarial attacks has received very little attention so far. In this work, we show how white-box adversarial attack algorithms can be adapted to the discrete and sparse nature of event-based visual data, and to the continuous-time setting of spiking neural networks. We test our methods on the N-MNIST and IBM Gestures neuromorphic vision datasets and show adversarial perturbations achieve a high success rate, while injecting a relatively small number of appropriately placed events. We also verify, for the first time, the effectiveness of these perturbations directly on neuromorphic hardware. Finally, we discuss the properties of the resulting perturbations and possible future directions.", "keywords": "spiking neural networks;neuromorphic engineering;adversarial attacks;dynamic vision sensors", "primary_area": "", "supplementary_material": "/attachment/3392591526b4f069470beb14e8cc29747c40db18.zip", "author": "Julian B\u00fcchel;Gregor Lenz;Yalun Hu;Sadique Sheik;Martino Sorbaro", "authorids": "~Julian_B\u00fcchel1;~Gregor_Lenz1;yalun.hu@synsense.ai;~Sadique_Sheik1;~Martino_Sorbaro1", "gender": "M;M;;M;M", "homepage": "https://research.ibm.com/people/julian-buchel;https://lenzgregor.com/site/;;;https://martinosorb.github.io", "dblp": ";;;99/9846;https://dblp.uni-trier.de/pid/255/4940-1", "google_scholar": "AXl8G8sAAAAJ;;;J5Z-oUwAAAAJ;pQmRdm4AAAAJ", "orcid": ";;;0000-0003-0302-8511;0000-0002-0182-7443", "linkedin": ";;;;", "or_profile": "~Julian_B\u00fcchel1;~Gregor_Lenz1;yalun.hu@synsense.ai;~Sadique_Sheik1;~Martino_Sorbaro1", "aff": "Swiss Federal Institute of Technology;SynSense;;SynSense AI;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;synsense.ai;;synsense.ai;ethz.ch", "position": "MS student;ML Engineer;;Researcher;Postdoc", "bibtex": "@misc{\nb{\\\"u}chel2022adversarial,\ntitle={Adversarial Attacks on Spiking Convolutional Networks for Event-based Vision},\nauthor={Julian B{\\\"u}chel and Gregor Lenz and Yalun Hu and Sadique Sheik and Martino Sorbaro},\nyear={2022},\nurl={https://openreview.net/forum?id=e0uknAgETh}\n}", "github": "", "project": "", "reviewers": "YwAR;FiWC;jrcY;2Mf5", "site": "https://openreview.net/forum?id=e0uknAgETh", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;2;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "73;87;87;24", "wc_summary_review": "57;41;74;24", 
"wc_main_review": "715;133;204;220", "wc_review": "845;261;365;268", "wc_reply_reviewers": "266;0;66;27", "wc_reply_authors": "691;196;349;779", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 67.75, 25.897635027160298 ], "wc_summary_review_avg": [ 49.0, 18.560711193270585 ], "wc_main_review_avg": [ 318.0, 231.53509453212487 ], "wc_review_avg": [ 434.75, 240.3979773209417 ], "wc_reply_reviewers_avg": [ 89.75, 104.42790575320373 ], "wc_reply_authors_avg": [ 503.75, 239.52178919672423 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7874396246247227200&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Swiss Federal Institute of Technology;SynSense;SynSense AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;;https://www.synsense.ai", "aff_unique_abbr": "ETH Zurich;;SynSense AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0", "aff_country_unique": "Switzerland;;China" }, { "id": "e1GzwU4W2Kh", "title": "ConCoDE: Hard-constrained Differentiable Co-Exploration Method for Neural Architectures and Hardware Accelerators", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "While DNNs achieve over-human performances in a number of areas, it is often accompanied by the skyrocketing computational costs. \nCo-exploration of an optimal neural architecture and its hardware accelerator is an approach of rising interest which addresses the computational cost problem, especially in low-profile systems (e.g., embedded, mobile). \nThe difficulty of having to search the large co-exploration space is often addressed by adopting the idea of differentiable neural architecture search. \nDespite the superior search efficiency of the differentiable co-exploration, it faces a critical challenge of not being able to systematically satisfy hard constraints, such as frame rate or power budget.\nTo handle the hard constraint problem of differentiable co-exploration, we propose ConCoDE, \nwhich searches for hard-constrained solutions without compromising the global design objectives.\nBy manipulating the gradients in the interest of the given hard constraint, high-quality solutions satisfying the constraint can be obtained.\nExperimental results show that ConCoDE is able to meet the constraints even in tight conditions. \nWe also show that the solutions searched by ConCoDE exhibit high quality compared to those searched without any constraint. 
", "keywords": "accelerator;codesign;hard constraint;NAS", "primary_area": "", "supplementary_material": "/attachment/984ed0c2dbdc28951a556615c451a949dab3d597.zip", "author": "Deokki Hong;Kanghyun Choi;Hey Yoon Lee;Joonsang Yu;Youngsok Kim;Noseong Park;Jinho Lee", "authorids": "~Deokki_Hong1;~Kanghyun_Choi1;hylee817@yonsei.ac.kr;~Joonsang_Yu1;~Youngsok_Kim1;~Noseong_Park1;~Jinho_Lee2", "gender": "M;M;;M;M;;M", "homepage": ";https://aisys.snu.ac.kr/kanghyun.html;;;https://youngsok.github.io;;http://acsys.snu.ac.kr/people.html", "dblp": "274/2871;229/7353;;180/3693;147/1001.html;;", "google_scholar": "gZaHHFUAAAAJ;n9e6qnsAAAAJ;;https://scholar.google.com/citations?hl=ko;ukhi3-QAAAAJ;;https://scholar.google.com/citations?hl=ko", "orcid": ";;;;;;", "linkedin": "dk-hong;;;joonsang-yu-22370a168/;;;", "or_profile": "~Deokki_Hong1;~Kanghyun_Choi1;hylee817@yonsei.ac.kr;~Joonsang_Yu1;~Youngsok_Kim1;~Noseong_Park1;~Jinho_Lee2", "aff": "Yonsei University;Yonsei University;;NAVER;Yonsei University;;Yonsei University", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr;;navercorp.com;yonsei.ac.kr;;yonsei.ac.kr", "position": "MS student;MS student;;Researcher;Assistant Professor;;Assistant Professor", "bibtex": "@misc{\nhong2022concode,\ntitle={ConCo{DE}: Hard-constrained Differentiable Co-Exploration Method for Neural Architectures and Hardware Accelerators},\nauthor={Deokki Hong and Kanghyun Choi and Hey Yoon Lee and Joonsang Yu and Youngsok Kim and Noseong Park and Jinho Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=e1GzwU4W2Kh}\n}", "github": "", "project": "", "reviewers": "jQhr;RaYd;bNvm;DiBQ", "site": "https://openreview.net/forum?id=e1GzwU4W2Kh", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;4;2", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "83;73;84;81", "wc_summary_review": "63;46;19;148", "wc_main_review": "277;172;118;57", "wc_review": "423;291;221;286", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "452;269;261;143", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.25, 4.322904116447646 ], "wc_summary_review_avg": [ 69.0, 48.23380557244058 ], "wc_main_review_avg": [ 156.0, 80.84243934963862 ], "wc_review_avg": [ 305.25, 73.37702297040947 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 281.25, 110.48614166491652 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EdPrfeec7VoJ:scholar.google.com/&scioq=ConCoDE:+Hard-constrained+Differentiable+Co-Exploration+Method+for+Neural+Architectures+and+Hardware+Accelerators&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Yonsei University;NAVER Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.naver.com", "aff_unique_abbr": "Yonsei;NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Hidden Convexity of Wasserstein GANs: 
Interpretable Generative Models with Closed-Form Solutions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6284", "id": "e2Lle5cij9D", "poster": "", "openreview": "https://openreview.net/forum?id=e2Lle5cij9D", "slides": "https://iclr.cc/virtual/2022/poster/6284", "video": "https://iclr.cc/virtual/2022/poster/6284", "author_site": "Arda Sahiner, Tolga Ergen, Batu Ozturkler, Burak Bartan, John M Pauly, Morteza Mardani, Mert Pilanci", "tldr": "", "abstract": "Generative Adversarial Networks (GANs) are commonly used for modeling complex distributions of data. Both the generators and discriminators of GANs are often modeled by neural networks, posing a non-transparent optimization problem which is non-convex and non-concave over the generator and discriminator, respectively. Such networks are often heuristically optimized with gradient descent-ascent (GDA), but it is unclear whether the optimization problem contains any saddle points, or whether heuristic methods can find them in practice. In this work, we analyze the training of Wasserstein GANs with two-layer neural network discriminators through the lens of convex duality, and for a variety of generators expose the conditions under which Wasserstein GANs can be solved exactly with convex optimization approaches, or can be represented as convex-concave games. Using this convex duality interpretation, we further demonstrate the impact of different activation functions of the discriminator. Our observations are verified with numerical results demonstrating the power of the convex interpretation, with an application in progressive training of convex architectures corresponding to linear generators and quadratic-activation discriminators for CelebA image generation. The code for our experiments is available at https://github.com/ardasahiner/ProCoGAN.", "keywords": "Wasserstein GAN;convex-concave game;saddle points;generative models;quadratic;polynomial activation;convex duality", "primary_area": "", "supplementary_material": "/attachment/170a0d7587ed2af69529a8c7742c9ddf9460c884.zip", "author": "Arda Sahiner;Tolga Ergen;Batu Ozturkler;Burak Bartan;John M. 
Pauly;Morteza Mardani;Mert Pilanci", "authorids": "~Arda_Sahiner1;~Tolga_Ergen1;~Batu_Ozturkler1;~Burak_Bartan1;~John_M._Pauly1;~Morteza_Mardani1;~Mert_Pilanci3", "gender": "M;M;;M;M;M;M", "homepage": "http://web.stanford.edu/~sahiner/;https://tolgaergen.github.io/;https://batuozt.github.io;http://web.stanford.edu/~bbartan/;http://www.stanford.edu/~pauly;http://web.stanford.edu/~morteza/;https://stanford.edu/~pilanci/", "dblp": "264/6371;202/7477.html;281/6970;197/1215;95/6728;74/258;45/8056", "google_scholar": "723GIZQAAAAJ;https://scholar.google.com.tr/citations?user=T1pWaCsAAAAJ;O_tiFfoAAAAJ;ij7tGwsAAAAJ;Fc6GIIQAAAAJ;H7edsyEAAAAJ;aSAS-aAAAAAJ", "orcid": ";0000-0003-4806-0224;;;;;", "linkedin": ";;;;john-pauly-69805911/;;mert-pilanci-ba615743/", "or_profile": "~Arda_Sahiner1;~Tolga_Ergen1;~Batu_Ozturkler1;~Burak_Bartan1;~John_M._Pauly1;~Morteza_Mardani1;~Mert_Pilanci3", "aff": "Stanford University;Stanford University;Microsoft;Stanford University;;NVIDIA;Stanford University", "aff_domain": "stanford.edu;stanford.edu;microsoft.com;stanford.edu;;nvidia.com;stanford.edu", "position": "PhD student;PhD student;Intern;PhD student;;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nsahiner2022hidden,\ntitle={Hidden Convexity of Wasserstein {GAN}s: Interpretable Generative Models with Closed-Form Solutions},\nauthor={Arda Sahiner and Tolga Ergen and Batu Ozturkler and Burak Bartan and John M. Pauly and Morteza Mardani and Mert Pilanci},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=e2Lle5cij9D}\n}", "github": "", "project": "", "reviewers": "RNn1;5mJ8;Hxh6;x4Dn", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;5;3;5", "correctness": "4;3;4;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "103;50;114;31", "wc_summary_review": "30;33;34;70", "wc_main_review": "130;738;115;808", "wc_review": "263;821;263;909", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 74.5, 34.87477598494362 ], "wc_summary_review_avg": [ 41.75, 16.37643123516232 ], "wc_main_review_avg": [ 447.75, 326.23333290759854 ], "wc_review_avg": [ 564.0, 302.6037012331475 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9526825653845388729&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=e2Lle5cij9D", "email": "stanford.edu;stanford.edu;microsoft.com;stanford.edu;;nvidia.com;stanford.edu", "author_num": 7, "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "Stanford University;Microsoft;NVIDIA", "aff_unique_dep": ";Microsoft Corporation;NVIDIA Corporation", "aff_unique_url": "https://www.stanford.edu;https://www.microsoft.com;https://www.nvidia.com", "aff_unique_abbr": "Stanford;Microsoft;NVIDIA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford;", 
"aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Pix2seq: A Language Modeling Framework for Object Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6245", "id": "e42KbIw6Wb", "poster": "", "openreview": "https://openreview.net/forum?id=e42KbIw6Wb", "slides": "https://iclr.cc/virtual/2022/poster/6245", "video": "https://iclr.cc/virtual/2022/poster/6245", "author_site": "Ting Chen, Saurabh Saxena, Lala Li, David Fleet, Geoffrey Hinton", "tldr": "", "abstract": "We present Pix2Seq, a simple and generic framework for object detection. Unlike existing approaches that explicitly integrate prior knowledge about the task, we cast object detection as a language modeling task conditioned on the observed pixel inputs. Object descriptions (e.g., bounding boxes and class labels) are expressed as sequences of discrete tokens, and we train a neural network to perceive the image and generate the desired sequence. Our approach is based mainly on the intuition that if a neural network knows about where and what the objects are, we just need to teach it how to read them out. Beyond the use of task-specific data augmentations, our approach makes minimal assumptions about the task, yet it achieves competitive results on the challenging COCO dataset, compared to highly specialized and well optimized detection algorithms.", "keywords": "language modeling;object detection", "primary_area": "", "supplementary_material": "", "author": "Ting Chen;Saurabh Saxena;Lala Li;David J. Fleet;Geoffrey Hinton", "authorids": "~Ting_Chen1;srbs@google.com;~Lala_Li1;~David_J._Fleet1;~Geoffrey_Hinton1", "gender": "M;;;M;M", "homepage": ";;;http://www.cs.toronto.edu/~fleet/index.html;https://www.cs.toronto.edu/~hinton/bio.html", "dblp": "19/1766;;49/7563;07/2099;10/3248", "google_scholar": "KoXUMbsAAAAJ;;;https://scholar.google.com.tw/citations?user=njOmQFsAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Ting_Chen1;srbs@google.com;~Lala_Li1;~David_J._Fleet1;~Geoffrey_Hinton1", "aff": "Google;;Google;Department of Computer Science, University of Toronto;University of Toronto", "aff_domain": "google.com;;google.com;cs.toronto.edu;utoronto.ca", "position": "Research Scientist;;Software Engineer;Full Professor;Full Professor", "bibtex": "@inproceedings{\nchen2022pixseq,\ntitle={Pix2seq: A Language Modeling Framework for Object Detection},\nauthor={Ting Chen and Saurabh Saxena and Lala Li and David J. 
Fleet and Geoffrey Hinton},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=e42KbIw6Wb}\n}", "github": "", "project": "", "reviewers": "bYa9;Wvff;JMKG;Hjdy;kHNj", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "3;3;5;5;4", "correctness": "3;4;4;4;4", "technical_novelty": "2;3;4;4;4", "empirical_novelty": "3;2;2;2;4", "wc_summary_paper": "144;138;78;98;71", "wc_summary_review": "68;89;48;82;27", "wc_main_review": "596;573;611;502;428", "wc_review": "808;800;737;682;526", "wc_reply_reviewers": "331;276;549;103;0", "wc_reply_authors": "1187;1090;1113;883;675", "reply_reviewers": "2;1;2;1;0", "reply_authors": "2;3;3;2;1", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.8944271909999159 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 3.4, 0.8 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 105.8, 30.13569312293978 ], "wc_summary_review_avg": [ 62.8, 22.727956353354784 ], "wc_main_review_avg": [ 542.0, 68.16744090839849 ], "wc_review_avg": [ 710.6, 103.02543375303014 ], "wc_reply_reviewers_avg": [ 251.8, 190.12143487781697 ], "wc_reply_authors_avg": [ 989.6, 186.87493143811452 ], "reply_reviewers_avg": [ 1.2, 0.7483314773547883 ], "reply_authors_avg": [ 2.2, 0.7483314773547882 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9128709291752769, "corr_recommendation_correctness": 0.6123724356957947, "gs_citation": 415, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17102558257176551695&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=e42KbIw6Wb", "email": "google.com;;google.com;cs.toronto.edu;utoronto.ca", "author_num": 5, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Google;University of Toronto", "aff_unique_dep": "Google;Department of Computer Science", "aff_unique_url": "https://www.google.com;https://www.utoronto.ca", "aff_unique_abbr": "Google;U of T", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Mountain View;Toronto;", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;Canada" }, { "id": "e5S8XfS7iW-", "title": "Ontology-Driven Semantic Alignment of Artificial Neurons and Visual Concepts", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Semantic alignment methods attempt to establish a link between human-level concepts and the units of an artificial neural network. Current approaches evaluate the emergence of such meaningful neurons by analyzing the effect of semantically annotated inputs on their activations. In doing so, they often understate two aspects that characterize neural representations and semantic concepts, namely the distributed nature of the former and the existence of semantic relationships binding the latter. In this work, we explicitly tackle this interrelatedness, both at a neural and a conceptual level, by providing a novel semantic alignment framework that builds on aligning a structured ontology with the distributed neural representations. The ontology introduces semantic relations between concepts, enabling the clustering of topologically related units into semantically rich and meaningful neural circuits. Our empirical analysis on notable convolutional models for image classification discusses the emergence of such neural circuits. 
It also validates their meaningfulness by studying how the selected units are pivotal for the accuracy of classes that are semantically related to the aligned concepts. We also contribute by releasing the code implementing our alignment framework.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/b206978a205518e2875bfaed60fc4d519b784508.zip", "author": "Riccardo Massidda;Davide Bacciu", "authorids": "~Riccardo_Massidda1;~Davide_Bacciu1", "gender": "M;M", "homepage": "https://pages.di.unipi.it/massidda/;http://pages.di.unipi.it/bacciu/", "dblp": "255/8428;07/6626", "google_scholar": "3PVuVisAAAAJ;https://scholar.google.it/citations?user=1d5n2WkAAAAJ", "orcid": "0000-0003-0137-7793;0000-0001-5213-2468", "linkedin": ";bacciu/", "or_profile": "~Riccardo_Massidda1;~Davide_Bacciu1", "aff": "University of Pisa;University of Pisa", "aff_domain": "unipi.it;unipi.it", "position": "PhD student;Full Professor", "bibtex": "@misc{\nmassidda2022ontologydriven,\ntitle={Ontology-Driven Semantic Alignment of Artificial Neurons and Visual Concepts},\nauthor={Riccardo Massidda and Davide Bacciu},\nyear={2022},\nurl={https://openreview.net/forum?id=e5S8XfS7iW-}\n}", "github": "", "project": "", "reviewers": "Vej2;gMwn;kSuN;81Lw", "site": "https://openreview.net/forum?id=e5S8XfS7iW-", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;5", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "85;201;38;83", "wc_summary_review": "58;74;20;72", "wc_main_review": "381;157;218;68", "wc_review": "524;432;276;223", "wc_reply_reviewers": "8;0;69;0", "wc_reply_authors": "627;429;348;191", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.75, 60.30495419117736 ], "wc_summary_review_avg": [ 56.0, 21.6794833886788 ], "wc_main_review_avg": [ 206.0, 114.25191464478834 ], "wc_review_avg": [ 363.75, 120.25883543424159 ], "wc_reply_reviewers_avg": [ 19.25, 28.908260065247788 ], "wc_reply_authors_avg": [ 398.75, 157.12156917495446 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3244428422615251, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:i4iDV794D5MJ:scholar.google.com/&scioq=Ontology-Driven+Semantic+Alignment+of+Artificial+Neurons+and+Visual+Concepts&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pisa", "aff_unique_dep": "", "aff_unique_url": "https://www.unipi.it", "aff_unique_abbr": "UNIP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "id": "e6L5E8ig792", "title": "Revisiting Linear Decision Boundaries for Few-Shot Learning with Transformer Hypernetworks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Few-shot learning (FSL) methods aim to generalize a model to new unseen classes using only a small number of support examples. 
In image classification settings, many FSL approaches utilize an architecture similar to standard supervised learning, learning a model composed of a feature extractor followed by a linear classifier head. A common choice for the classifier is ProtoNet-style nearest neighbor, but this may be suboptimal as it is context-independent. As an alternative, some methods train a parametric classifier (e.g. logistic regression, support vector machine) using embeddings from novel classes. However, task-specific training requires time and resources, and poses optimization challenges such as overfitting on only a few samples. Instead, we propose to generate linear classifiers for new classes using a transformer-based hypernetwork, performing context aggregation in a permutation-invariant manner. A transformer hypernetwork allows us to instantiate a new task-specific classifier without any additional training on novel tasks. Experiments conducted on 1-shot 5-way and 5-shot 5-way MiniImageNet, TieredImageNet, and CIFAR-FS demonstrate that transformer hypernetworks are capable of generating classifiers that achieve up to 1.4% higher accuracy than other commonly used linear classifiers. Among the group of methods that offer optimization-free meta-inference, we achieve new state-of-the-art in most cases.", "keywords": "Few-Shot Learning;Meta learning;Hypernetworks;Transformers", "primary_area": "", "supplementary_material": "", "author": "Samrudhdhi B. Rangrej;Kevin J Liang;Xi Yin;Guan Pang;Theofanis Karaletsos;Lior Wolf;Tal Hassner", "authorids": "~Samrudhdhi_B._Rangrej1;~Kevin_J_Liang1;~Xi_Yin3;~Guan_Pang2;~Theofanis_Karaletsos1;~Lior_Wolf1;~Tal_Hassner2", "gender": "F;M;F;;M;M;M", "homepage": "https://samrudhdhirangrej.github.io/;https://kevinjliang.github.io/;https://xiyinmsu.github.io/;;http://karaletsos.com/;http://www.cs.tau.ac.il/~wolf;https://talhassner.github.io/home/", "dblp": "192/2618;230/8348;147/2735;99/8792;31/11191;83/4103;62/6", "google_scholar": "https://scholar.google.com/citations?hl=en;DBqwS2YAAAAJ;FAEzhskAAAAJ;https://scholar.google.com/citations?hl=en;zrxafGsAAAAJ;UbFrXTsAAAAJ;ehe5pyIAAAAJ", "orcid": ";;;0000-0002-9922-7074;;0000-0001-5578-8892;0000-0003-2275-1406", "linkedin": ";kevin-j-liang/;xyin/;;;;talhassner/", "or_profile": "~Samrudhdhi_B._Rangrej1;~Kevin_J_Liang1;~Xi_Yin3;~Guan_Pang2;~Theofanis_Karaletsos1;~Lior_Wolf1;~Tal_Hassner2", "aff": "McGill University;Meta;Meta Facebook;Meta Facebook;Insitro;Tel Aviv University;Meta inc.", "aff_domain": "mcgill.ca;meta.com;fb.com;meta.com;insitro.com;tau.ac.il;meta.com", "position": "PhD student;Research Scientist;Researcher;Research Scientist;VP of ML;Full Professor;Researcher & Research Manager", "bibtex": "@misc{\nrangrej2022revisiting,\ntitle={Revisiting Linear Decision Boundaries for Few-Shot Learning with Transformer Hypernetworks},\nauthor={Samrudhdhi B.
Rangrej and Kevin J Liang and Xi Yin and Guan Pang and Theofanis Karaletsos and Lior Wolf and Tal Hassner},\nyear={2022},\nurl={https://openreview.net/forum?id=e6L5E8ig792}\n}", "github": "", "project": "", "reviewers": "vBio;9coH;UbNa;x2L2", "site": "https://openreview.net/forum?id=e6L5E8ig792", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;3;4;4", "correctness": "3;3;3;2", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "36;25;48;55", "wc_summary_review": "36;31;60;36", "wc_main_review": "254;32;199;316", "wc_review": "326;88;307;407", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 41.0, 11.467344941179714 ], "wc_summary_review_avg": [ 40.75, 11.299889379989523 ], "wc_main_review_avg": [ 200.25, 105.58971304061774 ], "wc_review_avg": [ 282.0, 118.13339917229166 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AcXPQjFzFvAJ:scholar.google.com/&scioq=Revisiting+Linear+Decision+Boundaries+for+Few-Shot+Learning+with+Transformer+Hypernetworks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;2;3;1", "aff_unique_norm": "McGill University;Meta;Insitro;Tel Aviv University", "aff_unique_dep": ";Meta Platforms, Inc.;;", "aff_unique_url": "https://www.mcgill.ca;https://meta.com;https://www.insitro.com;https://www.tau.ac.il", "aff_unique_abbr": "McGill;Meta;;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;2;1", "aff_country_unique": "Canada;United States;Israel" }, { "id": "e6MVRAlKWGD", "title": "Cut the CARP: Fishing for zero-shot story evaluation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent advances in large-scale language models (Raffel et al., 2019; Brown et al., 2020) have brought significant qualitative and quantitative improvements in machine-driven text generation. Despite this, generation and evaluation of machine-generated narrative text remains a challenging problem. Objective evaluation of computationally generated stories may be prohibitively expensive, may require meticulously annotated datasets, or may not adequately measure the logical coherence of a generated story\u2019s narratological structure. Informed by recent advances in contrastive learning (Radford et al., 2021), we present Contrastive Authoring and Reviewing Pairing (CARP): a scalable, efficient method for performing qualitatively superior, zero-shot evaluation of stories. We show a strong correlation between human evaluation of stories and those of CARP. Model outputs correlate significantly more with corresponding human input than those of language-model-based methods which utilize finetuning or prompt-engineering approaches. We also present and analyze the Story-Critique Dataset, a new corpus composed of 1.3 million aligned story-critique pairs derived from over 80,000 stories.
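Since CARP is described as contrastive in the style of Radford et al. (2021), a minimal sketch of a symmetric story-critique pairing loss under that assumption; the encoder outputs and logit_scale here are placeholders, not the paper's code:

```python
import torch
import torch.nn.functional as F

def carp_style_loss(story_emb, critique_emb, logit_scale=14.3):
    """CLIP-style symmetric contrastive loss over aligned
    (story, critique) pairs; row i of each batch is a positive pair."""
    s = F.normalize(story_emb, dim=1)
    c = F.normalize(critique_emb, dim=1)
    logits = logit_scale * s @ c.t()            # (n, n) pairwise scores
    labels = torch.arange(len(s), device=s.device)
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.t(), labels)) / 2
```

At inference time the same pairwise score (a row of `logits`) can serve as a zero-shot quality signal for a story given a critique, which is how a contrastive evaluator of this shape would be used.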
We expect this corpus to be of interest to NLP researchers.", "keywords": "Story Generation;Story Evaluation;Dataset;Storytelling;NLP;Evaluation;Contrastive learning;Language Models;Fine Tuning;Efficiency;Interactive Machine Learning;Narrative;Creativity;Human Centered AI;Creativity;Generative Models;World Models;Reader Models", "primary_area": "", "supplementary_material": "", "author": "Shahbuland Matiana;JR Smith;Ryan Teehan;Louis Castricato;Stella Biderman;Leo Gao;Spencer Frazier", "authorids": "~Shahbuland_Matiana1;~JR_Smith1;~Ryan_Teehan1;~Louis_Castricato2;~Stella_Biderman1;~Leo_Gao1;~Spencer_Frazier1", "gender": "M;;M;M;F;;", "homepage": ";;https://rteehas.github.io/;https://louiscastricato.com;http://www.stellabiderman.com;https://leogao.dev;http://spencerjohnfrazier.com", "dblp": "304/2425;;304/2042;;239/5641;279/3125;", "google_scholar": "JUwVT7cAAAAJ;;;;bO7H0DAAAAAJ;r6mBY50AAAAJ;VePWgDkAAAAJ", "orcid": ";;0000-0002-1426-6964;;0000-0001-8228-1042;;", "linkedin": "shahbuland/;;;;stellabiderman;;", "or_profile": "~Shahbuland_Matiana1;~JR_Smith1;~Ryan_Teehan1;~Louis_Castricato2;~Stella_Biderman1;~Leo_Gao1;~Spencer_Frazier1", "aff": "University of Waterloo;;New York University;Georgia Institute of Technology;Georgia Institute of Technology;OpenAI;Georgia Institute of Technology", "aff_domain": "uwaterloo.ca;;nyu.edu;gatech.edu;gatech.edu;openai.com;gatech.edu", "position": "Undergrad student;;PhD student;MS student;MS student;Researcher;PhD student", "bibtex": "@misc{\nmatiana2022cut,\ntitle={Cut the {CARP}: Fishing for zero-shot story evaluation},\nauthor={Shahbuland Matiana and JR Smith and Ryan Teehan and Louis Castricato and Stella Biderman and Leo Gao and Spencer Frazier},\nyear={2022},\nurl={https://openreview.net/forum?id=e6MVRAlKWGD}\n}", "github": "", "project": "", "reviewers": "ccn9;gViz;WeeN;nHtC", "site": "https://openreview.net/forum?id=e6MVRAlKWGD", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;4;4", "correctness": "2;2;3;2", "technical_novelty": "2;1;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "106;38;142;70", "wc_summary_review": "64;24;24;121", "wc_main_review": "954;686;233;602", "wc_review": "1124;748;399;793", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.0, 38.92300091205713 ], "wc_summary_review_avg": [ 58.25, 39.738992186516256 ], "wc_main_review_avg": [ 618.75, 257.8753332523295 ], "wc_review_avg": [ 766.0, 256.85891068833877 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8806903906267770959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;2;3;2", "aff_unique_norm": "University of Waterloo;New York University;Georgia Institute of Technology;OpenAI", "aff_unique_dep": ";;;", "aff_unique_url": "https://uwaterloo.ca;https://www.nyu.edu;https://www.gatech.edu;https://openai.com", "aff_unique_abbr": "UW;NYU;Georgia Tech;OpenAI", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Canada;United States" }, { "id": "e6MWIbNeW1", "title": "Trading Quality for Efficiency of Graph Partitioning: An Inductive Method across Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many applications of network systems can be formulated as several NP-hard combinatorial optimization problems regarding graph partitioning (GP), e.g., modularity maximization and NCut minimization. Due to the NP-hardness, to balance the quality and efficiency of GP remains a challenge. Existing methods use machine learning techniques to obtain high-quality solutions but usually have high complexity. Some fast GP methods adopt heuristic strategies to ensure low runtime but suffer from quality degradation. In contrast to conventional transductive GP methods applied to a static graph, we propose an inductive graph partitioning (IGP) framework across multiple evolving graph snapshots to alleviate the NP-hard challenge. IGP first conducts the offline training of a novel dual graph neural network on historical snapshots to capture the structural properties of a system. The trained model is then generalized to newly generated snapshots for fast high-quality online GP without additional optimization, where a better trade-off between quality and efficiency is achieved. IGP is also a generic framework that can capture the permutation invariant partitioning ground-truth of historical snapshots in the offline training and tackle the online GP on graphs with non-fixed number of nodes and clusters. Experiments on a set of benchmarks demonstrate that IGP achieves competitive quality and efficiency to various state-of-the-art baselines.", "keywords": "Graph Partitioning;Community Detection;Inductive Graph Embedding", "primary_area": "", "supplementary_material": "", "author": "Meng QIN;Chaorui Zhang;Bo Bai;Gong Zhang;Dit-Yan Yeung", "authorids": "~Meng_QIN1;~Chaorui_Zhang1;~Bo_Bai2;~Gong_Zhang1;~Dit-Yan_Yeung2", "gender": "M;M;M;;M", "homepage": "https://www.researchgate.net/profile/Meng-Qin-7?ev=hdr_xprf;;;;https://cse.hkust.edu.hk/faculty/dyyeung/", "dblp": "90/11201-2.html;54/8594.html;;;41/5668", "google_scholar": "vX-EtEEAAAAJ;ftame0UAAAAJ;zgk5dMoAAAAJ;xmw0OWQAAAAJ;nEsOOx8AAAAJ", "orcid": "0000-0003-3036-203X;;;;0000-0003-3716-8125", "linkedin": "%E5%AD%9F-%E8%A6%83-02673534b/;https://hk.linkedin.com/in/chaorui-zhang-281ab988;;;", "or_profile": "~Meng_QIN1;~Chaorui_Zhang1;~Bo_Bai2;~Gong_Zhang1;~Dit-Yan_Yeung2", "aff": "Hong Kong University of Science and Technology;Huawei Technologies Ltd.;;Huawei Technologies Ltd.;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;huawei.com;;huawei.com;ust.hk", "position": "PhD student;Researcher;;Principal Researcher;Chair Professor", "bibtex": "@misc{\nqin2022trading,\ntitle={Trading Quality for Efficiency of Graph Partitioning: An Inductive Method across Graphs},\nauthor={Meng QIN and Chaorui Zhang and Bo Bai and Gong Zhang and Dit-Yan Yeung},\nyear={2022},\nurl={https://openreview.net/forum?id=e6MWIbNeW1}\n}", "github": "", "project": "", "reviewers": "njQS;CKG9;BgbP;7xoz;FSan;r3jt", "site": "https://openreview.net/forum?id=e6MWIbNeW1", "pdf_size": 0, "recommendation": "3;3;5;5;6;6", "confidence": "3;3;3;3;3;1", "correctness": "3;3;4;3;4;3", "technical_novelty": "2;3;3;3;3;3", "empirical_novelty": "2;2;3;3;3;3", "wc_summary_paper": "84;51;159;145;50;69", "wc_summary_review": "39;38;98;33;55;47", "wc_main_review": "304;223;191;362;343;85", 
"wc_review": "427;312;448;540;448;201", "wc_reply_reviewers": "22;0;0;0;0;15", "wc_reply_authors": "1936;1517;2666;1655;1722;337", "reply_reviewers": "1;0;0;0;0;1", "reply_authors": "5;3;5;4;4;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 2.6666666666666665, 0.7453559924999298 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.8333333333333335, 0.3726779962499649 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 93.0, 43.45495752308744 ], "wc_summary_review_avg": [ 51.666666666666664, 21.891144835805694 ], "wc_main_review_avg": [ 251.33333333333334, 96.20579793107875 ], "wc_review_avg": [ 396.0, 109.68591523071684 ], "wc_reply_reviewers_avg": [ 6.166666666666667, 8.952032667997189 ], "wc_reply_authors_avg": [ 1638.8333333333333, 690.4311253766655 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.8333333333333335, 1.0671873729054748 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4780914437337575, "corr_recommendation_correctness": 0.4724555912615341, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uO5a1niLQdAJ:scholar.google.com/&scioq=Trading+Quality+for+Efficiency+of+Graph+Partitioning:+An+Inductive+Method+across+Graphs&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com", "aff_unique_abbr": "HKUST;Huawei", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "e8JI3SBZKa4", "title": "Online approximate factorization of a kernel matrix by a Hebbian neural network", "track": "main", "status": "Reject", "tldr": "", "abstract": "We derive an online algorithm for unsupervised learning based on representing every input $\\mathbf{x}_t$ by a high dimensional vector $\\mathbf{y}_t$ with pairwise inner products that approximately match input similarities as measured by a kernel function: $\\mathbf{y}_s \\cdot \\mathbf{y}_{t} \\approx f(\\mathbf{x}_s, \\mathbf{x}_{t})$. The approximation is formulated using the objective function for classical multidimensional scaling. We derive an upper bound for this objective which only involves correlations between output vectors and nonlinear functions of input vectors. Minimizing this upper bound leads to a minimax optimization, which can be solved via stochastic gradient descent-ascent. This online algorithm can be interpreted as a recurrent neural network with Hebbian and anti-Hebbian connections, generalizing previous work on linear similarity matching. Through numerical experiments with two datasets, we demonstrate that unsupervised learning can be aided by the nonlinearity inherent in our kernel method. We also show that heavy-tailed representation vectors emerge from the learning even though no sparseness prior is used, lending further biological plausibility to the model. 
Our upper bound employs a rank-one Nystrom approximation to the kernel function, with the novelty of leading to an online algorithm that optimizes landmark placement.", "keywords": "online kernel methods;hebbian learning;similarity matching", "primary_area": "", "supplementary_material": "/attachment/be4ddaa3b0716cf7c3fbc78a2f9a6be2d5ab9666.zip", "author": "Kyle Luther;Sebastian Seung", "authorids": "~Kyle_Luther1;~Sebastian_Seung1", "gender": "M;M", "homepage": ";https://www.cs.princeton.edu/people/profile/sseung", "dblp": ";03/4883", "google_scholar": "JX_K0-QAAAAJ;https://scholar.google.com.tw/citations?user=BD8llAEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Kyle_Luther1;~H._Sebastian_Seung1", "aff": "Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nluther2022online,\ntitle={Online approximate factorization of a kernel matrix by a Hebbian neural network},\nauthor={Kyle Luther and Sebastian Seung},\nyear={2022},\nurl={https://openreview.net/forum?id=e8JI3SBZKa4}\n}", "github": "", "project": "", "reviewers": "yzmP;mZaz;UeSq;qZPh", "site": "https://openreview.net/forum?id=e8JI3SBZKa4", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;3;4;5", "correctness": "4;4;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "76;97;109;76", "wc_summary_review": "21;143;65;20", "wc_main_review": "212;194;339;316", "wc_review": "309;434;513;412", "wc_reply_reviewers": "0;0;421;78", "wc_reply_authors": "245;292;385;444", "reply_reviewers": "0;0;2;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 89.5, 14.150971698084906 ], "wc_summary_review_avg": [ 62.25, 50.03686141236279 ], "wc_main_review_avg": [ 265.25, 63.1006141016076 ], "wc_review_avg": [ 417.0, 72.79079612148777 ], "wc_reply_reviewers_avg": [ 124.75, 173.97898580000975 ], "wc_reply_authors_avg": [ 341.5, 77.71904528492357 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9233805168766388, "corr_recommendation_correctness": -0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZpsmoCkWg4UJ:scholar.google.com/&scioq=Online+approximate+factorization+of+a+kernel+matrix+by+a+Hebbian+neural+network&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Equivariant and Stable Positional Encoding for More Powerful Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6134", "id": "e95i1IHcWj", "poster": "", "openreview": "https://openreview.net/forum?id=e95i1IHcWj", "slides": "https://iclr.cc/virtual/2022/poster/6134", "video": "https://iclr.cc/virtual/2022/poster/6134", "author_site": "Haorui Wang, Haoteng Yin, Muhan Zhang, Pan Li", "tldr": "", "abstract": "Graph neural networks (GNN) have shown great advantages in 
many graph-based learning tasks but often fail to predict accurately for tasks based on sets of nodes, such as link/motif prediction. Many works have recently proposed to address this problem by using random node features or node distance features. However, they suffer from either slow convergence, inaccurate prediction, or high complexity. In this work, we revisit GNNs that allow using positional features of nodes given by positional encoding (PE) techniques such as Laplacian Eigenmap, DeepWalk, etc. GNNs with PE often get criticized because they are not generalizable to unseen graphs (inductive) or stable. Here, we study these issues in a principled way and propose a provable solution, a class of GNN layers termed PEG with rigorous mathematical analysis. PEG uses separate channels to update the original node features and positional features. PEG imposes permutation equivariance w.r.t. the original node features and rotation equivariance w.r.t. the positional features simultaneously. Extensive link prediction experiments over 8 real-world networks demonstrate the advantages of PEG in generalization and scalability. Code is available at https://github.com/Graph-COM/PEG.", "keywords": "Graph Neural Network;Spectral Graph Theory;System Stability", "primary_area": "", "supplementary_material": "", "author": "Haorui Wang;Haoteng Yin;Muhan Zhang;Pan Li", "authorids": "~Haorui_Wang1;~Haoteng_Yin1;~Muhan_Zhang1;~Pan_Li2", "gender": "M;M;M;", "homepage": ";https://home.veritasyin.me/;https://muhanzhang.github.io/;", "dblp": ";206/6804;157/5518;https://dblp.org/pers/hd/l/Li_0005:Pan", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=OBBqkosAAAAJ;IroP0EwAAAAJ", "orcid": ";;0000-0002-7680-6401;", "linkedin": "haorui-wang-6a1a92185/;;jerry-muhan-zhang-a33a1777/;pan-li-b951105a/", "or_profile": "~Haorui_Wang1;~Haoteng_Yin1;~Muhan_Zhang1;~Pan_Li2", "aff": "Wuhan University;Purdue University;Peking University;Purdue University", "aff_domain": "whu.edu.cn;purdue.edu;pku.edu.cn;purdue.edu", "position": "Undergrad student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2022equivariant,\ntitle={Equivariant and Stable Positional Encoding for More Powerful Graph Neural Networks},\nauthor={Haorui Wang and Haoteng Yin and Muhan Zhang and Pan Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=e95i1IHcWj}\n}", "github": "", "project": "", "reviewers": "rbLd;eyVz;FBSc;KJUw;L7CF", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "4;4;5;4;3", "correctness": "3;3;1;4;4", "technical_novelty": "3;3;3;3;3", "empirical_novelty": "2;3;1;2;3", "wc_summary_paper": "99;65;210;229;98", "wc_summary_review": "96;65;14;95;10", "wc_main_review": "730;84;243;605;130", "wc_review": "925;214;467;929;238", "wc_reply_reviewers": "131;0;37;0;0", "wc_reply_authors": "1410;585;1342;1012;333", "reply_reviewers": "1;0;1;0;0", "reply_authors": "3;2;4;2;1", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.0, 1.0954451150103321 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 140.2, 66.16766581949223 ], "wc_summary_review_avg": [ 56.0, 37.63509000919222 ], "wc_main_review_avg": [ 358.4, 260.64427866346887 ], "wc_review_avg": [ 554.6, 316.6351843999652 ], "wc_reply_reviewers_avg": [ 33.6, 50.764554563198914 ],
"wc_reply_authors_avg": [ 936.4, 420.0231422195687 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 2.4, 1.019803902718557 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": 0.74535599249993, "gs_citation": 155, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16446538441027140116&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=e95i1IHcWj", "email": "whu.edu.cn;purdue.edu;pku.edu.cn;purdue.edu", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Wuhan University;Purdue University;Peking University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.whu.edu.cn/;https://www.purdue.edu;http://www.pku.edu.cn", "aff_unique_abbr": "WHU;Purdue;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "eAEcdRkcMHh", "title": "HoloFormer: Deep Compression of Pre-Trained Transforms via Unified Optimization of N:M Sparsity and Integer Quantization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In recent years, large pre-trained Transformer networks have demonstrated dramatic improvements in many Natural Language Processing (NLP) tasks. However, the huge size of these models brings significant challenges to fine-tuning and online deployment due to latency and cost constraints. Recently, hardware manufacturers have released new architectures that support efficient N:M sparsity and low-precision integer computation for fast inferencing. In contrast to unstructured sparsity, N:M sparsity specifies that out of each chunk of N contiguous weight parameters, exactly M parameters are non-zero. Moreover, these architectures also support processing data with reduced precision, such as INT8. Prior work often considers inducing N:M sparsity and integer quantization in isolation or as independent pieces of a compression pipeline. However, there lacks a systematic investigation towards how N:M sparsity and integer quantization can be effectively combined to exploit the maximum degree of redundancy and enable even faster acceleration for pre-trained Transformer networks.\n\nIn this work, we propose a unified, systematic approach to learning N:M sparsity and integer quantization for pre-trained Transformers using the Alternating Directions Method of Multipliers (ADMM). We show that both N:M sparsity and integer quantization and their combinations can be framed as non-convex constrained optimization problems and\nsolved in a unified manner. When evaluated across the GLUE suite of NLP benchmarks, our approach outperforms baselines that consider each of these problems independently, retaining 99.4\\% accuracy of the dense baseline while being able to execute on newly released hardware effectively. 
", "keywords": "Efficient Inference;N:M Sparsification;Quantization;Transformer networks", "primary_area": "", "supplementary_material": "", "author": "Minjia Zhang;Connor Holmes;Yuxiong He;Bo Wu", "authorids": "~Minjia_Zhang1;~Connor_Holmes1;~Yuxiong_He1;~Bo_Wu1", "gender": "M;M;;M", "homepage": "https://minjiazhang.github.io/;;;https://inside.mines.edu/~bwu/", "dblp": "58/9033;;https://dblp.org/pers/hd/h/He:Yuxiong;47/6534-2.html", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;SB3_eb0AAAAJ;g6Wdt1YAAAAJ", "orcid": "0000-0002-8165-166X;;;0009-0001-1696-4272", "linkedin": "minjia-zhang-05857226/;;;bo-wu-47674734/", "or_profile": "~Minjia_Zhang1;~Connor_Holmes1;~Yuxiong_He1;~Bo_Wu1", "aff": "Microsoft ;Colorado School of Mines;Microsoft;Colorado School of Mines", "aff_domain": "microsoft.com;mines.edu;microsoft.com;mines.edu", "position": "Principle Researcher;PhD student;Researcher;Associate Professor", "bibtex": "@misc{\nzhang2022holoformer,\ntitle={HoloFormer: Deep Compression of Pre-Trained Transforms via Unified Optimization of N:M Sparsity and Integer Quantization},\nauthor={Minjia Zhang and Connor Holmes and Yuxiong He and Bo Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=eAEcdRkcMHh}\n}", "github": "", "project": "", "reviewers": "EHvx;G7V8;xzLM;vTwG;cXJS", "site": "https://openreview.net/forum?id=eAEcdRkcMHh", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "5;3;3;3;4", "correctness": "3;3;2;3;3", "technical_novelty": "3;2;2;2;2", "empirical_novelty": "3;3;2;3;2", "wc_summary_paper": "63;82;49;107;70", "wc_summary_review": "108;31;65;99;45", "wc_main_review": "629;262;153;302;126", "wc_review": "800;375;267;508;241", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 74.2, 19.56936381183609 ], "wc_summary_review_avg": [ 69.6, 29.850293130889018 ], "wc_main_review_avg": [ 294.4, 179.6759304971036 ], "wc_review_avg": [ 438.2, 203.94254092758578 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8750000000000001, "corr_recommendation_correctness": -0.25000000000000006, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a36R5N34sP0J:scholar.google.com/&scioq=HoloFormer:+Deep+Compression+of+Pre-Trained+Transforms+via+Unified+Optimization+of+N:M+Sparsity+and+Integer+Quantization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Microsoft;Colorado School of Mines", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.mines.edu", "aff_unique_abbr": "Microsoft;CSM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Robust Prefix-Tuning for Text Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5987", "id": "eBCmOocUejf", "poster": "", "openreview": "https://openreview.net/forum?id=eBCmOocUejf", "slides": 
"https://iclr.cc/virtual/2022/poster/5987", "video": "https://iclr.cc/virtual/2022/poster/5987", "author_site": "Zonghan Yang, Yang Liu", "tldr": "", "abstract": "Recently, prefix-tuning has gained increasing attention as a parameter-efficient finetuning method for large-scale pretrained language models. The method keeps the pretrained models fixed and only updates the prefix token parameters for each downstream task. Despite being lightweight and modular, prefix-tuning still lacks robustness to textual adversarial attacks. However, most currently developed defense techniques necessitate auxiliary model update and storage, which inevitably hamper the modularity and low storage of prefix-tuning. In this work, we propose a robust prefix-tuning framework that preserves the efficiency and modularity of prefix-tuning. The core idea of our framework is leveraging the layerwise activations of the language model by correctly-classified training data as the standard for additional prefix finetuning. During the test phase, an extra batch-level prefix is tuned for each batch and added to the original prefix for robustness enhancement. Extensive experiments on three text classification benchmarks show that our framework substantially improves robustness over several strong baselines against five textual attacks of different types while maintaining comparable accuracy on clean texts. We also interpret our robust prefix-tuning framework from the optimal control perspective and pose several directions for future research.", "keywords": "prefix-tuning;pretrained language models;text classification;robustness in NLP;optimal control", "primary_area": "", "supplementary_material": "", "author": "Zonghan Yang;Yang Liu", "authorids": "~Zonghan_Yang1;~Yang_Liu19", "gender": "M;M", "homepage": "https://minicheshire.github.io/;http://nlp.csai.tsinghua.edu.cn/~ly/", "dblp": "222/7860;51/3710-5", "google_scholar": "rt9HOIUAAAAJ;https://scholar.google.com.hk/citations?user=lVhoKNcAAAAJ", "orcid": ";0000-0002-3087-242X", "linkedin": ";", "or_profile": "~Zonghan_Yang1;~Yang_Liu19", "aff": "Department of Computer Science and Technology, Tsinghua University;Tsinghua University", "aff_domain": "cs.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Professor", "bibtex": "@inproceedings{\nyang2022on,\ntitle={On Robust Prefix-Tuning for Text Classification},\nauthor={Zonghan Yang and Yang Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=eBCmOocUejf}\n}", "github": "", "project": "", "reviewers": "GPh4;rkLx;iz2s;yKwP", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "2;4;3;3", "correctness": "4;2;3;3", "technical_novelty": "4;3;3;3", "empirical_novelty": "4;3;3;4", "wc_summary_paper": "131;132;42;109", "wc_summary_review": "82;149;23;30", "wc_main_review": "408;653;118;567", "wc_review": "621;934;183;706", "wc_reply_reviewers": "0;164;0;51", "wc_reply_authors": "1063;1406;331;755", "reply_reviewers": "0;1;0;1", "reply_authors": "3;2;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 103.5, 36.67764987018661 ], "wc_summary_review_avg": [ 71.0, 50.472764933179555 ], "wc_main_review_avg": [ 436.5, 203.8118004434483 ], "wc_review_avg": [ 611.0, 272.32241920194525 ], "wc_reply_reviewers_avg": [ 53.75, 66.97154246394508 ], 
"wc_reply_authors_avg": [ 888.75, 395.88026914712486 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5512236602536653945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=eBCmOocUejf", "email": "cs.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Analyzing and Improving the Optimization Landscape of Noise-Contrastive Estimation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6871", "id": "eBS-3YiaIL-", "poster": "", "openreview": "https://openreview.net/forum?id=eBS-3YiaIL-", "slides": "https://iclr.cc/virtual/2022/poster/6871", "video": "https://iclr.cc/virtual/2022/poster/6871", "author_site": "Bingbin Liu, Elan Rosenfeld, Pradeep K Ravikumar, Andrej Risteski", "tldr": "", "abstract": "Noise-contrastive estimation (NCE) is a statistically consistent method for learning unnormalized probabilistic models. It has been empirically observed that the choice of the noise distribution is crucial for NCE\u2019s performance. However, such observation has never been made formal or quantitative. In fact, it is not even clear whether the difficulties arising from a poorly chosen noise distribution are statistical or algorithmic in nature.\nIn this work, we formally pinpoint reasons for NCE\u2019s poor performance when an inappropriate noise distribution is used. Namely, we prove these challenges arise due to an ill-behaved (more precisely, flat) loss landscape.\nTo address this, we introduce a variant of NCE called \\emph{eNCE} which uses an exponential loss and for which \\emph{normalized gradient descent} addresses the landscape issues \\emph{provably} when the target and noise distributions are in a given exponential family. 
", "keywords": "noise contrastive estimation;contrastive learning;unsupervised learning;theory", "primary_area": "", "supplementary_material": "/attachment/7676e3ed2e2e43b2dfee0c9bf18e331e82568113.zip", "author": "Bingbin Liu;Elan Rosenfeld;Pradeep Kumar Ravikumar;Andrej Risteski", "authorids": "~Bingbin_Liu1;~Elan_Rosenfeld1;~Pradeep_Kumar_Ravikumar1;~Andrej_Risteski2", "gender": "F;M;M;M", "homepage": "https://clarabing.github.io/;;http://www.cs.cmu.edu/~pradeepr/;", "dblp": "222/1554;236/4508;94/3594;63/11143", "google_scholar": "2ud06rQAAAAJ;f0j0K8QAAAAJ;https://scholar.google.com.tw/citations?user=Q4DTPw4AAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Bingbin_Liu1;~Elan_Rosenfeld1;~Pradeep_Kumar_Ravikumar1;~Andrej_Risteski2", "aff": "Carnegie Mellon University;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;andrew.cmu.edu;cs.cmu.edu;cmu.edu", "position": "PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2022analyzing,\ntitle={Analyzing and Improving the Optimization Landscape of Noise-Contrastive Estimation},\nauthor={Bingbin Liu and Elan Rosenfeld and Pradeep Kumar Ravikumar and Andrej Risteski},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=eBS-3YiaIL-}\n}", "github": "", "project": "", "reviewers": "6syu;rZGo;VmJ3;yXSX", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;3;2", "correctness": "3;4;4;4", "technical_novelty": "2;3;4;4", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "62;130;232;229", "wc_summary_review": "10;20;83;27", "wc_main_review": "299;165;938;262", "wc_review": "371;315;1253;518", "wc_reply_reviewers": "84;0;28;11", "wc_reply_authors": "415;170;1021;351", "reply_reviewers": "1;0;1;1", "reply_authors": "3;1;2;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 163.25, 71.4260981714667 ], "wc_summary_review_avg": [ 35.0, 28.36370920736567 ], "wc_main_review_avg": [ 416.0, 305.32359882590146 ], "wc_review_avg": [ 614.25, 376.1604544605932 ], "wc_reply_reviewers_avg": [ 30.75, 32.32162588732194 ], "wc_reply_authors_avg": [ 489.25, 319.8846471776975 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2306566866938710006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=eBS-3YiaIL-", "email": "cmu.edu;andrew.cmu.edu;cs.cmu.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "eBZsAZB8Rfh", "title": "Adaptive Unbiased Teacher for Cross-Domain Object Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We tackle the problem of domain adaptation in object detection, where 
the main challenge lies in significant domain shifts between source (one domain with supervision) and target (a domain of interest without supervision). Although the teacher-student framework (a student model learns from pseudo labels generated from a teacher model) has been adopted to enable domain adaptation and yielded accuracy gains on the target domain, the teacher model still generates a large number of low-quality pseudo labels (e.g., false positives) due to its bias toward the source domain. This leads to sub-optimal domain adaptation performance. To address this issue, we propose Adaptive Unbiased Teacher (AUT), a teacher-student framework leveraging adversarial learning (on features derived from the backbone) and weak-strong data augmentation to address domain shifts. Specifically, we employ feature-level adversarial training, ensuring features extracted from the source and target domains share similar statistics. This enables the student model to capture domain-invariant features. Furthermore, we apply weak-strong augmentation and mutual learning of the teacher for the target domain and the student model for both domains. This enables the updated teacher model to gradually benefit from the student model without suffering domain shift. We show that AUT demonstrates superiority over all existing approaches and even Oracle (fully-supervised) models by a huge margin. For example, we achieve 50.9% (49.3%) mAP on Foggy Cityscapes (Clipart1K), which is 9.2% (5.2%) and 8.2% (11.0%) higher than the previous state of the art and Oracle, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu-Jhe Li;Xiaoliang Dai;Chih-Yao Ma;Yen-Cheng Liu;Kan Chen;Bichen Wu;Zijian He;Kris M. Kitani;Peter Vajda", "authorids": "~Yu-Jhe_Li1;~Xiaoliang_Dai1;~Chih-Yao_Ma1;~Yen-Cheng_Liu1;~Kan_Chen1;~Bichen_Wu1;~Zijian_He2;~Kris_M._Kitani1;~Peter_Vajda1", "gender": "M;M;M;;M;M;M;;M", "homepage": "https://yujheli.github.io/;;https://chihyaoma.github.io/;https://ycliu93.github.io/;http://wind09.github.io/;;http://www.cs.cmu.edu/~kkitani/;https://sites.google.com/site/vajdap;", "dblp": "127/3527;192/3904;198/0963;29/7584;;130/1371;42/163;44/5953;", "google_scholar": "https://scholar.google.com.tw/citations?user=MpLiwTIAAAAJ;u4olrOcAAAAJ;HrrtgKkAAAAJ;yeAeAhsAAAAJ;https://scholar.google.com.hk/citations?user=BYrARP4AAAAJ;K3QJPdMAAAAJ;yv3sH74AAAAJ;k8QB5VUAAAAJ;G03EzSMAAAAJ", "orcid": ";;;;0000-0003-1415-5495;;0000-0002-9389-4060;;", "linkedin": ";;kevin-chih-yao-ma-9b5b3063/;;;bichenwu/;;p%C3%A9ter-vajda-9a03aaa/;", "or_profile": "~Yu-Jhe_Li1;~Xiaoliang_Dai1;~Chih-Yao_Ma1;~Yen-Cheng_Liu1;~Kan_Chen1;~Bichen_Wu1;~Kris_M._Kitani1;~Peter_Vajda1;~Zijian_He4", "aff": "Carnegie Mellon University;Meta Facebook;Meta;Georgia Institute of Technology;Waymo;Meta Facebook;Carnegie Mellon University;Meta;Meta GenAI", "aff_domain": "andrew.cmu.edu;fb.com;meta.com;gatech.edu;waymo.com;fb.com;cmu.edu;meta.com;meta.com", "position": "PhD student;Research Scientist;Research Scientist;PhD student;Research Scientist;Research Scientist;Associate Professor;Researcher;Researcher", "bibtex": "@misc{\nli2022adaptive,\ntitle={Adaptive Unbiased Teacher for Cross-Domain Object Detection},\nauthor={Yu-Jhe Li and Xiaoliang Dai and Chih-Yao Ma and Yen-Cheng Liu and Kan Chen and Bichen Wu and Zijian He and Kris M.
Kitani and Peter Vajda},\nyear={2022},\nurl={https://openreview.net/forum?id=eBZsAZB8Rfh}\n}", "github": "", "project": "", "reviewers": "6y7a;QLcT;Pqep", "site": "https://openreview.net/forum?id=eBZsAZB8Rfh", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "5;4;4", "correctness": "3;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;1", "wc_summary_paper": "68;40;31", "wc_summary_review": "116;33;13", "wc_main_review": "149;164;183", "wc_review": "333;237;227", "wc_reply_reviewers": "82;0;0", "wc_reply_authors": "428;394;634", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 46.333333333333336, 15.755069730795297 ], "wc_summary_review_avg": [ 54.0, 44.594469014292194 ], "wc_main_review_avg": [ 165.33333333333334, 13.912424503139471 ], "wc_review_avg": [ 265.6666666666667, 47.78656249988647 ], "wc_reply_reviewers_avg": [ 27.333333333333332, 38.6551707048646 ], "wc_reply_authors_avg": [ 485.3333333333333, 106.03563342365413 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YMhQQrpOndgJ:scholar.google.com/&scioq=Adaptive+Unbiased+Teacher+for+Cross-Domain+Object+Detection&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;3;1;0;1;1", "aff_unique_norm": "Carnegie Mellon University;Meta;Georgia Institute of Technology;Waymo", "aff_unique_dep": ";Meta Platforms, Inc.;;", "aff_unique_url": "https://www.cmu.edu;https://meta.com;https://www.gatech.edu;https://www.waymo.com", "aff_unique_abbr": "CMU;Meta;Georgia Tech;Waymo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "eCPCn25gat", "title": "Pretraining for Language Conditioned Imitation with Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study reinforcement learning (RL) agents which can utilize language inputs. To investigate this, we propose a new multimodal benchmark -- Text-Conditioned Frostbite -- in which an agent must complete tasks specified by text instructions in the Atari Frostbite environment. We curate and release a dataset of 5M text-labelled transitions for training and to encourage further research in this direction. On this benchmark, we evaluate Text Decision Transformer (TDT), a transformer directly operating on text, state, and action tokens, and find it improves upon other baseline architectures. 
Furthermore, we evaluate the effect of pretraining, finding unsupervised pretraining can yield improved results in low-data settings.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/cf3dc42bf459c093166677b271634da0eb1ae1f3.zip", "author": "Aaron L Putterman;Kevin Lu;Igor Mordatch;Pieter Abbeel", "authorids": "~Aaron_L_Putterman1;~Kevin_Lu2;~Igor_Mordatch5;~Pieter_Abbeel2", "gender": ";;;M", "homepage": ";http://kevinlu.ai/;;https://people.eecs.berkeley.edu/~pabbeel/", "dblp": ";17/8813;;", "google_scholar": ";E8s73dYAAAAJ;Vzr1RukAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ", "orcid": ";;;", "linkedin": "aaron-louie-putterman-0085b51a5/;;;", "or_profile": "~Aaron_L_Putterman1;~Kevin_Lu2;~Igor_Mordatch5;~Pieter_Abbeel2", "aff": "University of California, Berkeley;Facebook AI Research;Research, Google;Covariant", "aff_domain": "berkeley.edu;fb.com;research.google.com;covariant.ai", "position": "Undergrad student;AI Resident;Researcher;Founder", "bibtex": "@misc{\nputterman2022pretraining,\ntitle={Pretraining for Language Conditioned Imitation with Transformers},\nauthor={Aaron L Putterman and Kevin Lu and Igor Mordatch and Pieter Abbeel},\nyear={2022},\nurl={https://openreview.net/forum?id=eCPCn25gat}\n}", "github": "", "project": "", "reviewers": "ZRZY;ZbHJ;EZAA;hbz9", "site": "https://openreview.net/forum?id=eCPCn25gat", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "40;35;56;42", "wc_summary_review": "18;31;88;41", "wc_main_review": "390;221;260;73", "wc_review": "448;287;404;156", "wc_reply_reviewers": "0;36;49;0", "wc_reply_authors": "663;562;445;125", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 43.25, 7.790218225441442 ], "wc_summary_review_avg": [ 44.5, 26.405491853021786 ], "wc_main_review_avg": [ 236.0, 113.01106140551022 ], "wc_review_avg": [ 323.75, 113.32337578805178 ], "wc_reply_reviewers_avg": [ 21.25, 21.741377601246892 ], "wc_reply_authors_avg": [ 448.75, 202.2107502087859 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5654397426764521769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of California, Berkeley;Meta;Google;Covariant", "aff_unique_dep": ";Facebook AI Research;Google Research;", "aff_unique_url": "https://www.berkeley.edu;https://research.facebook.com;https://research.google;", "aff_unique_abbr": "UC Berkeley;FAIR;Google;", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "eDjxhFbaWX", "title": "HODA: Protecting DNNs Against Model Extraction Attacks via Hardness of Samples", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model Extraction attacks exploit the target model's prediction API to create a surrogate model in order to steal 
or reconnoiter the functionality of the target model in the black-box setting. Several recent studies have shown that a data-limited adversary who has no or limited access to the samples from the target model's training data distribution can use synthetic or semantically similar samples to conduct model extraction attacks. Since DNN-based classifiers are trained over several epochs, we can view the training process as producing a sequence of subclassifiers, with each subclassifier created at the end of an epoch. We use the sequence of subclassifiers to calculate the hardness degree of samples. In this paper, we investigate the hardness degree of samples and demonstrate that the hardness degree histogram of a data-limited adversary's sample sequences is distinguishable from the hardness degree histogram of benign users' sample sequences, consisting of normal samples. Normal samples come from the target classifier's training data distribution. We propose Hardness-Oriented Detection Approach (HODA) to detect the sample sequences of model extraction attacks. The results demonstrate that HODA can detect the sample sequences of model extraction attacks with a high success rate after observing only 100 of their samples.", "keywords": "Trustworthy Machine Learning;Model Extraction Attacks;Hardness of Samples", "primary_area": "", "supplementary_material": "/attachment/7f0b19b16889fa1c716949189ebd4600d1c71105.zip", "author": "AmirMahdi Sadeghzadeh;Faezeh Dehghan;Amir Sobhanian;Rasool Jalili", "authorids": "~AmirMahdi_Sadeghzadeh1;~Faezeh_Dehghan1;~Amir_Sobhanian1;~Rasool_Jalili1", "gender": "M;F;M;M", "homepage": "https://amsadeghzadeh.github.io/;;;http://www.sharif.edu/~jalili", "dblp": "260/0840.html;;;", "google_scholar": "SndfdlwAAAAJ;;;", "orcid": "0000-0002-1492-2714;;;", "linkedin": "amir-mahdi-sadeghzadeh-355375124/;faezeh-dehghan-766347206;amir-mohammad-sobhanian-8831b220b;", "or_profile": "~AmirMahdi_Sadeghzadeh1;~Faezeh_Dehghan1;~Amir_Sobhanian1;~Rasool_Jalili1", "aff": ";;Sharif University of Technology;Sharif University of Technology", "aff_domain": ";;sharif.edu;sharif.edu", "position": ";;MS student;Associate Professor", "bibtex": "@misc{\nsadeghzadeh2022hoda,\ntitle={{HODA}: Protecting {DNN}s Against Model Extraction Attacks via Hardness of Samples},\nauthor={AmirMahdi Sadeghzadeh and Faezeh Dehghan and Amir Sobhanian and Rasool Jalili},\nyear={2022},\nurl={https://openreview.net/forum?id=eDjxhFbaWX}\n}", "github": "", "project": "", "reviewers": "UQxC;QDdd;qQMY;Mihr", "site": "https://openreview.net/forum?id=eDjxhFbaWX", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "4;4;4;4", "correctness": "1;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "95;116;130;16", "wc_summary_review": "33;118;34;8", "wc_main_review": "540;673;316;270", "wc_review": "668;907;480;294", "wc_reply_reviewers": "43;0;47;0", "wc_reply_authors": "2226;1623;1545;710", "reply_reviewers": "1;0;1;0", "reply_authors": "5;3;4;1", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 89.25, 44.08727140570167 ], "wc_summary_review_avg": [ 48.25, 41.595522595587134 ], "wc_main_review_avg": [ 449.75, 164.45725128433833 ], "wc_review_avg": [ 587.25, 227.078593222699 ], "wc_reply_reviewers_avg": [ 22.5, 22.544400635190993 ], "wc_reply_authors_avg": [ 1526.0, 539.8208036006023 ],
"reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.25, 1.479019945774904 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8268106308031117, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:N-rq0RNhHXkJ:scholar.google.com/&scioq=HODA:+Protecting+DNNs+Against+Model+Extraction+Attacks+via+Hardness+of+Samples&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Sharif University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.sharif.edu", "aff_unique_abbr": "SUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Iran" }, { "id": "eELR-4Dk4U8", "title": "Model-based Reinforcement Learning with a Hamiltonian Canonical ODE Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model-based reinforcement learning usually suffers from a high sample complexity in training the world model, especially for the environments with complex dynamics. To make the training for general physical environments more efficient, we introduce Hamiltonian canonical ordinary differential equations into the learning process, which inspires a novel model of neural ordinary differential auto-encoder (NODA). NODA can model the physical world by nature and is flexible to impose Hamiltonian mechanics (e.g., the dimension of the physical equations) which can further accelerate training of the environment models. It can consequentially empower an RL agent with the robust\nextrapolation using a small amount of samples as well as the guarantee on the physical plausibility. Theoretically, we prove that NODA has uniform bounds for multi-step transition errors and value errors under certain conditions. 
Extensive experiments show that NODA can learn the environment dynamics effectively with a high sample efficiency, making it possible to facilitate reinforcement learning agents at the early stage.", "keywords": "Reinforcement learning;Hamiltonian canonical equation;ODE;World model;Sample efficiency", "primary_area": "", "supplementary_material": "/attachment/997deaf142c058df1b487d6bc08d9c3432e039ae.zip", "author": "Yao Feng;Yuhong Jiang;Hang Su;Dong Yan;Jun Zhu", "authorids": "~Yao_Feng2;~Yuhong_Jiang1;~Hang_Su3;~Dong_Yan1;~Jun_Zhu2", "gender": "M;M;M;M;M", "homepage": "https://github.com/yaofeng1998;https://yh-jiang.github.io/;;http://ml.cs.tsinghua.edu.cn/~jun;", "dblp": ";;20/7834;50/2644-1;26/5371-6", "google_scholar": "W2lzq5MAAAAJ;;lvztRUkAAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ", "orcid": "0000-0002-8213-5181;;0000-0003-0641-8988;;", "linkedin": ";;;;", "or_profile": "~Yao_Feng2;~Yuhong_Jiang1;~Dong_Yan1;~Jun_Zhu2;~Hang_Su2", "aff": "Carnegie Mellon University;;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "cmu.edu;;cs.tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;;Researcher;Professor;Associate Professor", "bibtex": "@misc{\nfeng2022modelbased,\ntitle={Model-based Reinforcement Learning with a Hamiltonian Canonical {ODE} Network},\nauthor={Yao Feng and Yuhong Jiang and Hang Su and Dong Yan and Jun Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=eELR-4Dk4U8}\n}", "github": "", "project": "", "reviewers": "grUX;cybw;9nPp", "site": "https://openreview.net/forum?id=eELR-4Dk4U8", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;3;5", "correctness": "2;2;2", "technical_novelty": "1;2;1", "empirical_novelty": "2;2;1", "wc_summary_paper": "39;53;100", "wc_summary_review": "37;31;48", "wc_main_review": "385;245;465", "wc_review": "461;329;613", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 64.0, 26.08958923913266 ], "wc_summary_review_avg": [ 38.666666666666664, 7.039570693980959 ], "wc_main_review_avg": [ 365.0, 90.92121131323904 ], "wc_review_avg": [ 467.6666666666667, 116.0383078508511 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9288866830372755784&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Carnegie Mellon University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "CMU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;China" }, { "id": "eENsxDifOGu", "title": "Assessing and Developing Text-Based Agents for Morally Salient Scenarios", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "When making everyday decisions, people are guided by their conscience, an internal sense of right and wrong, to behave morally. 
By contrast, artificial agents may behave immorally when trained on environments that ignore moral concerns, such as violent video games. With the advent of generally capable agents that pretrain on many environments, mitigating inherited biases towards immoral behavior will become necessary. However, prior work on aligning agents with human values and morals focuses on small-scale settings lacking in semantic complexity. To enable research in larger, more realistic settings, we introduce Jiminy Cricket, an environment suite of 25 text-based adventure games with thousands of semantically rich, morally salient scenarios. Via dense annotations for every possible action, Jiminy Cricket environments robustly evaluate whether agents can act morally while maximizing reward. To improve moral behavior, we leverage language models with commonsense moral knowledge and develop strategies to mediate this knowledge into actions. In extensive experiments, we find that our approach can steer agents towards moral behavior without sacrificing performance.", "keywords": "Transformers;RL;data bias;reward bias;machine ethics;value learning;safe exploration", "primary_area": "", "supplementary_material": "", "author": "Dan Hendrycks;Mantas Mazeika;Andy Zou;Sahil Patel;Christine Zhu;Jesus Navarro;Dawn Song;Bo Li;Jacob Steinhardt", "authorids": "~Dan_Hendrycks1;~Mantas_Mazeika3;~Andy_Zou1;sahil.patelsp@berkeley.edu;czhu43@berkeley.edu;navjesus@berkeley.edu;~Dawn_Song1;~Bo_Li8;~Jacob_Steinhardt1", "gender": ";M;;;;;F;;", "homepage": ";https://github.com/mmazeika;;;;;;;", "dblp": "182/2504;215/4447;274/2362;;;;s/DXSong;;35/10625", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;andy-zou-09ba3616a/;;;;;;", "or_profile": "~Dan_Hendrycks1;~Mantas_Mazeika3;~Andy_Zou1;sahil.patelsp@berkeley.edu;czhu43@berkeley.edu;navjesus@berkeley.edu;~Dawn_Song1;~Bo_Li8;~Jacob_Steinhardt1", "aff": "UC Berkeley;University of Illinois, Urbana-Champaign;University of California, Berkeley;;;;University of California, Berkeley;University of Illinois, Urbana Champaign;University of California, Berkeley", "aff_domain": "berkeley.edu;uiuc.edu;berkeley.edu;;;;berkeley.edu;;berkeley.edu", "position": "PhD student;PhD student;MS student;;;;Full Professor;;Assistant Professor", "bibtex": "@misc{\nhendrycks2022assessing,\ntitle={Assessing and Developing Text-Based Agents for Morally Salient Scenarios},\nauthor={Dan Hendrycks and Mantas Mazeika and Andy Zou and Sahil Patel and Christine Zhu and Jesus Navarro and Dawn Song and Bo Li and Jacob Steinhardt},\nyear={2022},\nurl={https://openreview.net/forum?id=eENsxDifOGu}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=eENsxDifOGu", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0, 
"corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2Oi36pCFcV8J:scholar.google.com/&scioq=Assessing+and+Developing+Text-Based+Agents+for+Morally+Salient+Scenarios&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;2;0", "aff_unique_norm": "University of California, Berkeley;University of Illinois;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://illinois.edu;https://illinois.edu", "aff_unique_abbr": "UC Berkeley;UIUC;UIUC", "aff_campus_unique_index": "0;1;0;0;1;0", "aff_campus_unique": "Berkeley;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "eFP90pzlIz", "title": "Towards Achieving Adversarial Robustness Beyond Perceptual Limits", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The vulnerability of Deep Neural Networks to Adversarial Attacks has fuelled research towards building robust models. While most Adversarial Training algorithms aim towards defending attacks constrained within low magnitude $\\ell_p$ norm bounds, real-world adversaries are not limited by such constraints. In this work, we aim to achieve adversarial robustness within larger bounds, against perturbations that may be perceptible, but do not change human (or Oracle) prediction. The presence of images that flip Oracle predictions and those that do not, makes this a challenging setting for adversarial robustness. We discuss the ideal goals of an adversarial defense algorithm beyond perceptual limits, and further highlight the shortcomings of naively extending existing training algorithms to higher perturbation bounds. In order to overcome these shortcomings, we propose a novel defense, Oracle-Aligned Adversarial Training (OA-AT), to align the predictions of the network with that of an Oracle during adversarial training. 
The proposed approach achieves state-of-the-art performance at large epsilon bounds (such as an $\\ell_\\infty$ bound of $16/255$ on CIFAR-10) while outperforming existing defenses (AWP, TRADES and PGD-AT) at standard perturbation bounds ($8/255$) as well.", "keywords": "Adversarial Robustness;Adversarial Defense;Adversarial Training", "primary_area": "", "supplementary_material": "/attachment/eeb1e613f5b6b0fb8de71df5642a24966e87f40d.zip", "author": "Sravanti Addepalli;Samyak Jain;Gaurang Sriramanan;Venkatesh Babu Radhakrishnan", "authorids": "~Sravanti_Addepalli1;~Samyak_Jain1;~Gaurang_Sriramanan1;~Venkatesh_Babu_Radhakrishnan2", "gender": "F;M;M;M", "homepage": ";https://samyakjain0112.github.io/;https://gaurangsriramanan.github.io/;http://cds.iisc.ac.in/faculty/venky", "dblp": "127/7715;249/4464.html;262/3916;20/6289", "google_scholar": "MOO12i0AAAAJ;https://scholar.google.co.in/citations?hl=en;t76Uk8oAAAAJ;cVg7HrEAAAAJ", "orcid": ";0000-0003-3785-4782;;0000-0002-1926-1804", "linkedin": "sravanti-addepalli/;samyak-jain-276738178/;gaurang-sriramanan-16141a1a3/;venkatesh-babu-radhakrishnan-16568939", "or_profile": "~Sravanti_Addepalli1;~Samyak_Jain1;~Gaurang_Sriramanan1;~Venkatesh_Babu_Radhakrishnan2", "aff": "Google;Indian Institute of Technology (BHU), Varanasi;University of Maryland, College Park;Indian Institute of Science", "aff_domain": "google.com;iitbhu.ac.in;umd.edu;iisc.ac.in", "position": "Student Researcher;Undergrad student;PhD student;Full Professor", "bibtex": "@misc{\naddepalli2022towards,\ntitle={Towards Achieving Adversarial Robustness Beyond Perceptual Limits},\nauthor={Sravanti Addepalli and Samyak Jain and Gaurang Sriramanan and Venkatesh Babu Radhakrishnan},\nyear={2022},\nurl={https://openreview.net/forum?id=eFP90pzlIz}\n}", "github": "", "project": "", "reviewers": "UNf3;macv;AS8k;5p75", "site": "https://openreview.net/forum?id=eFP90pzlIz", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "3;3;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "125;178;65;73", "wc_summary_review": "43;25;50;36", "wc_main_review": "229;371;448;257", "wc_review": "397;574;563;366", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1152;862;1097;555", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 110.25, 45.394795957246025 ], "wc_summary_review_avg": [ 38.5, 9.233092656309694 ], "wc_main_review_avg": [ 326.25, 88.14583087134638 ], "wc_review_avg": [ 475.0, 94.22048609511629 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 916.5, 235.42143063026356 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.2721655269759087, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7031547554525749970&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Google;Indian Institute of Technology;University of Maryland;Indian Institute of Science", "aff_unique_dep": "Google;;;", "aff_unique_url": "https://www.google.com;https://www.iitbhu.ac.in;https://www/umd.edu;https://www.iisc.ac.in", "aff_unique_abbr": 
"Google;IIT (BHU);UMD;IISc", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Mountain View;Varanasi;College Park;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;India" }, { "id": "eGd34W56KIT", "title": "SPARK: co-exploring model SPArsity and low-RanKness for compact neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Sparsification and low-rank decomposition are two important techniques for deep neural network (DNN) compression. To date, these two popular yet distinct approaches are typically used in a separate way; while their efficient integration for better compression performance is little explored. In this paper we perform systematic co-exploration on the model sparsity and low-rankness towards compact neural networks. We first investigate and analyze several important design factors for the joint pruning and low-rank factorization, including operational sequence, low-rank format, and optimization objective. Based on the observations and outcomes from our analysis, we then propose SPARK, a unified DNN compression framework that can simultaneously capture model SPArsity and low-RanKness in an efficient way. Empirical experiments demonstrate very promising performance of our proposed solution. Notably, on CIFAR-10 dataset, our approach can bring 1.25%, 1.02% and 0.16% accuracy increase over the baseline ResNet-20, ResNet-56 and DenseNet-40 models, respectively, and meanwhile the storage and computational costs are reduced by 70.4% and 71.1% (for ResNet-20), 37.5% and 39.3% (for ResNet-56) and 52.4% and 61.3% (for DenseNet-40), respectively. On ImageNet dataset, our approach can enable 0.52% accuracy increase over baseline model with 48.7% fewer parameters.", "keywords": "model compression;low-rankness;sparsity;tensor", "primary_area": "", "supplementary_material": "", "author": "Wanzhao Yang;Miao Yin;Yang Sui;Bo Yuan", "authorids": "~Wanzhao_Yang1;~Miao_Yin1;~Yang_Sui1;~Bo_Yuan3", "gender": ";;M;", "homepage": ";https://noodle-lab.github.io/;https://eclipsess.github.io/yangsui.github.io/;", "dblp": ";199/1982;77/10522;41/1662-1", "google_scholar": ";ILDdu98AAAAJ;Q2W1p6sAAAAJ;oUy9elEAAAAJ", "orcid": ";;0000-0003-3020-0612;", "linkedin": ";miao-yin-55ab64170/;yang-sui-308055117/;", "or_profile": "~Wanzhao_Yang1;~Miao_Yin1;~Yang_Sui1;~Bo_Yuan3", "aff": ";Google;Rutgers University;Rutgers University", "aff_domain": ";google.com;rutgers.edu;rutgers.edu", "position": ";Intern;PhD student;Assistant Professor", "bibtex": "@misc{\nyang2022spark,\ntitle={{SPARK}: co-exploring model {SPA}rsity and low-RanKness for compact neural networks},\nauthor={Wanzhao Yang and Miao Yin and Yang Sui and Bo Yuan},\nyear={2022},\nurl={https://openreview.net/forum?id=eGd34W56KIT}\n}", "github": "", "project": "", "reviewers": "cT6Q;WR6A;YjQo;Kmv9", "site": "https://openreview.net/forum?id=eGd34W56KIT", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;3;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;2;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "96;62;50;95", "wc_summary_review": "30;36;21;15", "wc_main_review": "630;456;230;332", "wc_review": "756;554;301;442", "wc_reply_reviewers": "0;101;0;28", "wc_reply_authors": "3098;1962;2059;1201", "reply_reviewers": "0;2;0;1", "reply_authors": "6;5;6;3", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], 
"empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 75.75, 20.20365066021485 ], "wc_summary_review_avg": [ 25.5, 8.077747210701755 ], "wc_main_review_avg": [ 412.0, 149.1509302686376 ], "wc_review_avg": [ 513.25, 166.36913024957485 ], "wc_reply_reviewers_avg": [ 32.25, 41.30602256330183 ], "wc_reply_authors_avg": [ 2080.0, 675.1536862078144 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 5.0, 1.224744871391589 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8892972917998875, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kIrgk57Na6IJ:scholar.google.com/&scioq=SPARK:+co-exploring+model+SPArsity+and+low-RanKness+for+compact+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;Rutgers University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.rutgers.edu", "aff_unique_abbr": "Google;Rutgers", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "eH8Jie3uiI", "title": "Vote for Nearest Neighbors Meta-Pruning of Self-Supervised Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pruning plays an essential role in deploying deep neural nets (DNNs) to the hardware of limited memory or computation. However, current high-quality iterative pruning can create a terrible carbon footprint when compressing a large DNN for a wide variety of devices and tasks. Can we reuse the pruning results on previous tasks to accelerate the pruning for a new task? Can we find a better initialization for a new task, e.g., a much smaller network closer to the final pruned model, by exploiting its similar tasks? We study this ``nearest neighbors meta-pruning'' problem by first investigating different choices of pre-trained models for pruning under limited iterations. Our empirical study reveals several advantages of the self-supervision pre-trained model when pruned for multiple tasks. We further study the overlap of pruned models for similar tasks and how the overlap changes for different layers. Inspired by these discoveries, we develop a simple but strong baseline ``Meta-Vote Pruning (MVP)'' that significantly reduces the pruning iterations for a new task by initializing a sub-network from the pruned models for tasks similar to it. 
In experiments, we demonstrate the advantages of MVP by extensive empirical studies and comparisons with popular pruning methods.", "keywords": "network pruning;meta pruning;self-supervision;multi-task pruning", "primary_area": "", "supplementary_material": "/attachment/73e8edd61eac11c360ee8f3d89a8c74d52d2c475.zip", "author": "Haiyan Zhao;Tianyi Zhou;Guodong Long;Jing Jiang;Liming Zhu;Chengqi Zhang", "authorids": "~Haiyan_Zhao2;~Tianyi_Zhou1;~Guodong_Long2;~Jing_Jiang6;~Liming_Zhu2;~Chengqi_Zhang1", "gender": "M;M;M;F;M;M", "homepage": "http://haiyan.tech/;https://tianyizhou.github.io/;https://www.uts.edu.au/staff/guodong.long;https://www.uts.edu.au/staff/jing.jiang;https://liming-zhu.org/;https://research.polyu.edu.hk/en/persons/chengqi-zhang", "dblp": ";88/8205-1;34/10089;68/1974-2;52/4318-1.html;71/964", "google_scholar": ";OKvgizMAAAAJ;https://scholar.google.com.au/citations?user=Pl8m7hMAAAAJ;https://scholar.google.com.au/citations?hl=en;https://scholar.google.com.au/citations?user=C1zfS7wAAAAJ;https://scholar.google.com.au/citations?user=B6lBmqEAAAAJ", "orcid": ";0000-0001-5348-0632;0000-0003-3740-9515;;0000-0001-5839-3765;0000-0001-5715-7154", "linkedin": ";tianyizhou;;;limingzhu/;chengqi-zhang-55aa8910/", "or_profile": "~Haiyan_Zhao2;~Tianyi_Zhou1;~Guodong_Long2;~Jing_Jiang6;~Liming_Zhu2;~Chengqi_Zhang1", "aff": ";University of Washington, Seattle;University of Technology Sydney;University of Technology Sydney;CSIRO;University of Technology Sydney", "aff_domain": ";uw.edu;uts.edu.au;uts.edu.au;csiro.au;uts.edu.au", "position": ";PhD student;Associate Professor;Lecturer;Research Director;Full Professor", "bibtex": "@misc{\nzhao2022vote,\ntitle={Vote for Nearest Neighbors Meta-Pruning of Self-Supervised Networks},\nauthor={Haiyan Zhao and Tianyi Zhou and Guodong Long and Jing Jiang and Liming Zhu and Chengqi Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=eH8Jie3uiI}\n}", "github": "", "project": "", "reviewers": "UQ81;7nB3;HzkJ;riss", "site": "https://openreview.net/forum?id=eH8Jie3uiI", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "111;44;71;47", "wc_summary_review": "86;40;21;110", "wc_main_review": "417;86;349;206", "wc_review": "614;170;441;363", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.25, 26.808347580557815 ], "wc_summary_review_avg": [ 64.25, 35.442735503908274 ], "wc_main_review_avg": [ 264.5, 128.14152332479898 ], "wc_review_avg": [ 397.0, 159.4600263388916 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2Qz7iDay5hkJ:scholar.google.com/&scioq=Vote+for+Nearest+Neighbors+Meta-Pruning+of+Self-Supervised+Networks&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "University of Washington;University of Technology Sydney;Commonwealth Scientific and 
Industrial Research Organisation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.washington.edu;https://www.uts.edu.au;https://www.csiro.au", "aff_unique_abbr": "UW;UTS;CSIRO", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Australia" }, { "id": "eIvzaLx6nKW", "title": "Multi-Domain Self-Supervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contrastive self-supervised learning has recently gained significant attention owing to its ability to learn improved feature representations without the use of label information. Current contrastive learning approaches, however, are only effective when trained on a particular dataset, limiting their utility in diverse multi-domain settings. In fact, training these methods on a combination of several domains often degrades the quality of learned representations compared to the models trained on a single domain. In this paper, we propose a Multi-Domain Self-Supervised Learning (MDSSL) approach that can effectively perform representation learning on multiple, diverse datasets. In MDSSL, we propose a three-level hierarchical loss for measuring the agreement between augmented views of a given sample, agreement between samples within a dataset and agreement between samples across datasets. We show that MDSSL when trained on a mixture of CIFAR-10, STL-10, SVHN and CIFAR-100 produces powerful representations, achieving up to a $25\\%$ increase in top-1 accuracy on a linear classifier compared to single-domain self-supervised encoders. Moreover, MDSSL encoders can generalize more effectively to unseen datasets compared to both single-domain and multi-domain baselines. MDSSL is also highly efficient in terms of the resource usage as it stores and trains a single model for multiple datasets leading up to $17\\%$ reduction in training time. Finally, for multi-domain datasets where domain labels are unknown, we propose a modified approach that alternates between clustering and MDSSL. Thus, for diverse multi-domain datasets (even without domain labels), MDSSL provides an efficient and generalizable self-supervised encoder without sacrificing the quality of representations in individual domains. ", "keywords": "self-supervised learning;contrastive learning;multi-domain data;unsupervised learning", "primary_area": "", "supplementary_material": "/attachment/93d20283b1fc44a2a856edd35ed54b5413631f57.zip", "author": "Neha Mukund Kalibhat;Yogesh Balaji;C. Bayan Bruss;Soheil Feizi", "authorids": "~Neha_Mukund_Kalibhat1;~Yogesh_Balaji1;~C._Bayan_Bruss1;~Soheil_Feizi2", "gender": "F;M;M;M", "homepage": "https://sites.google.com/view/nehakalibhat;https://yogeshbalaji.github.io/;https://www.cbbruss.com;https://www.cs.umd.edu/~sfeizi/", "dblp": "276/0300;185/6906;;57/2132", "google_scholar": "HYT-q5MAAAAJ;0I2qH0oAAAAJ;ClqvGRQAAAAJ;lptAmrMAAAAJ", "orcid": ";;;", "linkedin": "neha-kalibhat/;;bayan-bruss/;", "or_profile": "~Neha_Mukund_Kalibhat1;~Yogesh_Balaji1;~C._Bayan_Bruss1;~Soheil_Feizi2", "aff": "University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;Capital One;University of Maryland, College Park", "aff_domain": "umd.edu;cs.umd.edu;capitalone.com;umd.edu", "position": "PhD student;PhD student;Director of Applied Research;Assistant Professor", "bibtex": "@misc{\nkalibhat2022multidomain,\ntitle={Multi-Domain Self-Supervised Learning},\nauthor={Neha Mukund Kalibhat and Yogesh Balaji and C. 
Bayan Bruss and Soheil Feizi},\nyear={2022},\nurl={https://openreview.net/forum?id=eIvzaLx6nKW}\n}", "github": "", "project": "", "reviewers": "ZUR1;KFWz;stxw", "site": "https://openreview.net/forum?id=eIvzaLx6nKW", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;4", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;3", "wc_summary_paper": "122;68;203", "wc_summary_review": "23;38;45", "wc_main_review": "278;350;189", "wc_review": "423;456;437", "wc_reply_reviewers": "407;57;20", "wc_reply_authors": "722;443;423", "reply_reviewers": "2;1;1", "reply_authors": "3;1;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 131.0, 55.47972602672079 ], "wc_summary_review_avg": [ 35.333333333333336, 9.177266598624136 ], "wc_main_review_avg": [ 272.3333333333333, 65.8499978908293 ], "wc_review_avg": [ 438.6666666666667, 13.523641850067197 ], "wc_reply_reviewers_avg": [ 161.33333333333334, 174.3680653738586 ], "wc_reply_authors_avg": [ 529.3333333333334, 136.48036081755092 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9lNfEJvnk2YJ:scholar.google.com/&scioq=Multi-Domain+Self-Supervised+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Maryland;University of Maryland, College Park;Capital One", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www.umd.edu;https://www.umd.edu;https://www.capitalone.com", "aff_unique_abbr": "UMD;UMD;Capital One", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "eJyt4hJzOLk", "title": "Discrepancy-Optimal Meta-Learning for Domain Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work attempts to tackle the problem of domain generalization (DG) via learning to reduce domain shift with an episodic training procedure. In particular, we measure the domain shift with $\\mathcal{Y}$-discrepancy and learn to optimize $\\mathcal{Y}$-discrepancy between the unseen target domain and source domains only using source-domain samples. Theoretically, we give a PAC-style generalization bound for discrepancy-optimal meta-learning and further make comparisons with other DG bounds including ERM and domain-invariant learning. The theoretical analyses show that there is a tradeoff between classification performance and computational complexity for discrepancy-optimal meta-learning. The theoretical results also shed light on a bilevel optimization algorithm for DG. 
Empirically, we evaluate the algorithm with DomainBed and achieve state-of-the-art results on two DG benchmarks.", "keywords": "Domain generalization Meta-learning Transfer learning Generalization Bound", "primary_area": "", "supplementary_material": "", "author": "Chen Jia", "authorids": "~Chen_Jia1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\njia2022discrepancyoptimal,\ntitle={Discrepancy-Optimal Meta-Learning for Domain Generalization},\nauthor={Chen Jia},\nyear={2022},\nurl={https://openreview.net/forum?id=eJyt4hJzOLk}\n}", "github": "", "project": "", "reviewers": "tVED;JFVr;dNaV;d88q", "site": "https://openreview.net/forum?id=eJyt4hJzOLk", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;3;2", "correctness": "2;2;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "63;40;48;113", "wc_summary_review": "19;149;66;33", "wc_main_review": "565;993;247;110", "wc_review": "647;1182;361;256", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 66.0, 28.36370920736567 ], "wc_summary_review_avg": [ 66.75, 50.45976119642264 ], "wc_main_review_avg": [ 478.75, 339.69719972351845 ], "wc_review_avg": [ 611.5, 359.11732066276056 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.9233805168766388, "corr_recommendation_correctness": 0.7608859102526822, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zTVMG0ZvUNgJ:scholar.google.com/&scioq=Discrepancy-Optimal+Meta-Learning+for+Domain+Generalization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Sampling with Mirrored Stein Operators", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6304", "id": "eMudnJsb1T5", "poster": "", "openreview": "https://openreview.net/forum?id=eMudnJsb1T5", "slides": "https://iclr.cc/virtual/2022/poster/6304", "video": "https://iclr.cc/virtual/2022/poster/6304", "author_site": "Jiaxin Shi, Chang Liu, Lester Mackey", "tldr": "", "abstract": "We introduce a new family of particle evolution samplers suitable for constrained domains and non-Euclidean geometries. Stein Variational Mirror Descent and Mirrored Stein Variational Gradient Descent minimize the Kullback-Leibler (KL) divergence to constrained target distributions by evolving particles in a dual space defined by a mirror map. Stein Variational Natural Gradient exploits non-Euclidean geometry to more efficiently minimize the KL divergence to unconstrained targets. We derive these samplers from a new class of mirrored Stein operators and adaptive kernels developed in this work. We demonstrate that these new samplers yield accurate approximations to distributions on the simplex, deliver valid confidence intervals in post-selection inference, and converge more rapidly than prior methods in large-scale unconstrained posterior inference. 
Finally, we establish the convergence of our new procedures under verifiable conditions on the target distribution.", "keywords": "Stein's method;Sampling;Mirror descent;Natural gradient descent;Probabilistic inference;Bayesian inference;Post-selection inference;Stein operators", "primary_area": "", "supplementary_material": "", "author": "Jiaxin Shi;Chang Liu;Lester Mackey", "authorids": "~Jiaxin_Shi1;~Chang_Liu10;~Lester_Mackey1", "gender": "M;M;M", "homepage": "http://jiaxins.io;https://changliu00.github.io/;https://stanford.edu/~lmackey", "dblp": "151/7509;52/5716-30;05/2961", "google_scholar": "juZXbFoAAAAJ;rYd0GEsAAAAJ;erv7TP0AAAAJ", "orcid": ";0000-0001-5207-5440;0000-0002-1102-0387", "linkedin": ";chang-liu-9ab479168/;lester-mackey-5902909", "or_profile": "~Jiaxin_Shi1;~Chang_Liu10;~Lester_Mackey1", "aff": "Microsoft Research New England;Microsoft;Microsoft Research New England", "aff_domain": "microsoft.com;microsoft.com;microsoft.com", "position": "Postdoc;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nshi2022sampling,\ntitle={Sampling with Mirrored Stein Operators},\nauthor={Jiaxin Shi and Chang Liu and Lester Mackey},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=eMudnJsb1T5}\n}", "github": "", "project": "", "reviewers": "BkpK;5TzE;dVdw;Zj9c", "pdf_size": 0, "recommendation": "6;8;8;10", "confidence": "4;3;3;3", "correctness": "3;4;3;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "110;70;80;48", "wc_summary_review": "24;34;19;43", "wc_main_review": "211;154;393;128", "wc_review": "345;258;492;219", "wc_reply_reviewers": "69;28;0;0", "wc_reply_authors": "470;87;304;350", "reply_reviewers": "2;1;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 8.0, 1.4142135623730951 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.0, 22.293496809607955 ], "wc_summary_review_avg": [ 30.0, 9.246621004453464 ], "wc_main_review_avg": [ 221.5, 103.46617804867444 ], "wc_review_avg": [ 328.5, 104.83916253003932 ], "wc_reply_reviewers_avg": [ 24.25, 28.252212302756046 ], "wc_reply_authors_avg": [ 302.75, 138.52323812270632 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8093287446916276740&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=eMudnJsb1T5", "email": "microsoft.com;microsoft.com;microsoft.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/microsoft-research-new-england", "aff_unique_abbr": "MSR NE", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New England;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "eOdSD0B5TE", "title": "On the Implicit Biases of Architecture & Gradient Descent", "track": "main", "status": "Reject", "tldr": "", "abstract": "Do neural networks generalise because of bias in the functions returned by gradient descent, or bias already present in 
the network architecture? $\\textit{\u00bfPor qu\u00e9 no los dos?}$ This paper finds that while typical networks that fit the training data already generalise fairly well, gradient descent can further improve generalisation by selecting networks with a large margin. This conclusion is based on a careful study of the behaviour of infinite width networks trained by Bayesian inference and finite width networks trained by gradient descent. To measure the implicit bias of architecture, new technical tools are developed to both $\\textit{analytically bound}$ and $\\textit{consistently estimate}$ the average test error of the neural network--Gaussian process (NNGP) posterior. This error is found to be already better than chance, corroborating the findings of Valle-P\u00e9rez et al. (2019) and underscoring the importance of architecture. Going beyond this result, this paper finds that test performance can be substantially improved by selecting a function with much larger margin than is typical under the NNGP posterior. This highlights a curious fact: $\\textit{minimum a posteriori}$ functions can generalise best, and gradient descent can select for those functions. In summary, new technical tools suggest a nuanced portrait of generalisation involving both the implicit biases of architecture and gradient descent.", "keywords": "generalisation;function space;PAC-Bayes;NNGP;orthants;margin", "primary_area": "", "supplementary_material": "/attachment/536ff97a7aa5ef7982f2cfe25e14f73ecf086e92.zip", "author": "Jeremy Bernstein;Yisong Yue", "authorids": "~Jeremy_Bernstein1;~Yisong_Yue1", "gender": "M;M", "homepage": "https://jeremybernste.in;http://www.yisongyue.com", "dblp": "215/3638;28/1244", "google_scholar": ";tEk4qo8AAAAJ", "orcid": ";0000-0001-9127-1989", "linkedin": ";yisongyue/", "or_profile": "~Jeremy_Bernstein1;~Yisong_Yue1", "aff": "California Institute of Technology;Argo AI", "aff_domain": "caltech.edu;argo.ai", "position": "PhD student;Principal Researcher", "bibtex": "@misc{\nbernstein2022on,\ntitle={On the Implicit Biases of Architecture \\& Gradient Descent},\nauthor={Jeremy Bernstein and Yisong Yue},\nyear={2022},\nurl={https://openreview.net/forum?id=eOdSD0B5TE}\n}", "github": "", "project": "", "reviewers": "Lxe3;HuiV;CXBm;WQvV", "site": "https://openreview.net/forum?id=eOdSD0B5TE", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;3;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "96;62;118;147", "wc_summary_review": "47;32;31;400", "wc_main_review": "322;253;754;907", "wc_review": "465;347;903;1454", "wc_reply_reviewers": "100;479;0;378", "wc_reply_authors": "385;936;780;1099", "reply_reviewers": "1;3;0;1", "reply_authors": "2;4;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 105.75, 31.06746690671771 ], "wc_summary_review_avg": [ 127.5, 157.4555492829643 ], "wc_main_review_avg": [ 559.0, 277.9091578196012 ], "wc_review_avg": [ 792.25, 434.6029078365675 ], "wc_reply_reviewers_avg": [ 239.25, 195.80778202104227 ], "wc_reply_authors_avg": [ 800.0, 264.8216380887332 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, 
"gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1423260849191581631&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "California Institute of Technology;Argo AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;https://www.argo.ai", "aff_unique_abbr": "Caltech;Argo AI", "aff_campus_unique_index": "0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "ePI0bPbrih", "title": "Boundary Graph Neural Networks for 3D Simulations", "track": "main", "status": "Reject", "tldr": "", "abstract": "The abundance of data has given machine learning considerable momentum in natural sciences and engineering. However, the modeling of simulated physical processes remains difficult. A key problem is the correct handling of geometric boundaries. While triangularized geometric boundaries are very common in engineering applications, they are notoriously difficult to model by machine learning approaches due to their heterogeneity with respect to size and orientation. In this work, we introduce Boundary Graph Neural Networks (BGNNs), which dynamically modify graph structures to address boundary conditions. Boundary graph structures are constructed via modifying edges, augmenting node features, and dynamically inserting virtual nodes. The new BGNNs are tested on complex 3D granular flow processes of hoppers and rotating drums which are standard components of industrial machinery. Using precise simulations that are obtained by an expensive and complex discrete element method, BGNNs are evaluated in terms of computational efficiency as well as prediction accuracy of particle flows and mixing entropies. Even if complex boundaries are present, BGNNs are able to accurately reproduce 3D granular flows within simulation uncertainties over hundreds of thousands of simulation timesteps, and most notably particles completely stay within the geometric objects without using handcrafted conditions or restrictions.", "keywords": "Simulation;Graph Neural Network;Boundary Conditions;Granular Flow;Physics Application", "primary_area": "", "supplementary_material": "/attachment/e570e622d66094988037dd587e603cd3e8316a0e.zip", "author": "Andreas Mayr;Sebastian Lehner;Arno Mayrhofer;Christoph Kloss;Sepp Hochreiter;Johannes Brandstetter", "authorids": "~Andreas_Mayr2;~Sebastian_Lehner1;~Arno_Mayrhofer1;~Christoph_Kloss1;~Sepp_Hochreiter1;~Johannes_Brandstetter1", "gender": "M;;M;M;M;M", "homepage": "https://www.jku.at/institut-fuer-machine-learning/ueber-uns/team/dipl-ing-andreas-mayr-msc/;https://www.jku.at/institut-fuer-machine-learning/ueber-uns/team/dr-sebastian-lehner/;https://www.aspherix-dem.com/;https://www.dcs-computing.com;https://www.jku.at/en/institute-for-machine-learning/about-us/team/sepp-hochreiter/;", "dblp": ";292/2938;;;h/SeppHochreiter.html;251/8691", "google_scholar": "3-Iw0tgAAAAJ;gZO5TdUAAAAJ;;;https://scholar.google.at/citations?user=tvUH3WMAAAAJ;KiRvOHcAAAAJ", "orcid": ";;0000-0002-9277-4016;;0000-0001-7449-2528;", "linkedin": ";;arnomayrhofer/?lipi=urn%3Ali%3Apage%3Ad_flagship3_feed%3Bdi%2Bv6rzTTayjqGoLs2bIEg%3D%3D;;https://linkedin.com/in/sepp-hochreiter-41514846;", "or_profile": "~Andreas_Mayr2;~Sebastian_Lehner1;~Arno_Mayrhofer1;~Christoph_Kloss1;~Sepp_Hochreiter1;~Johannes_Brandstetter1", "aff": "Johannes Kepler University Linz;Johannes Kepler University Linz;;DCS Computing;Johannes Kepler University Linz;Microsoft", "aff_domain": 
"jku.at;jku.at;;dcs-computing.com;jku.at;microsoft.com", "position": "PhD student;Postdoc;;Principal Researcher;Full Professor;Researcher", "bibtex": "@misc{\nmayr2022boundary,\ntitle={Boundary Graph Neural Networks for 3D Simulations},\nauthor={Andreas Mayr and Sebastian Lehner and Arno Mayrhofer and Christoph Kloss and Sepp Hochreiter and Johannes Brandstetter},\nyear={2022},\nurl={https://openreview.net/forum?id=ePI0bPbrih}\n}", "github": "", "project": "", "reviewers": "QRXr;kKo7;93ce;bSwm", "site": "https://openreview.net/forum?id=ePI0bPbrih", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;0;2;3", "wc_summary_paper": "191;52;98;68", "wc_summary_review": "289;19;177;96", "wc_main_review": "783;287;379;273", "wc_review": "1263;358;654;437", "wc_reply_reviewers": "474;92;172;0", "wc_reply_authors": "541;401;294;296", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 102.25, 53.83481680102571 ], "wc_summary_review_avg": [ 145.25, 100.04592695357468 ], "wc_main_review_avg": [ 430.5, 207.54939171194889 ], "wc_review_avg": [ 678.0, 354.7118548906986 ], "wc_reply_reviewers_avg": [ 184.5, 177.8784697483088 ], "wc_reply_authors_avg": [ 383.0, 100.96781665461525 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=801640499274385022&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Johannes Kepler University;DCS Computing;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.jku.at;;https://www.microsoft.com", "aff_unique_abbr": "JKU;;Microsoft", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Linz;", "aff_country_unique_index": "0;0;0;2", "aff_country_unique": "Austria;;United States" }, { "id": "eR5TdQpRMCP", "title": "General Incremental Learning with Domain-aware Categorical Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Incremental learning is necessary to achieve human-like intelligence system since the model must continuously accumulate knowledge in response to real-world streaming data.\nIn this work, we consider a general and yet under-explored incremental learning problem in which both the class distribution and class-specific domain distribution vary over sequential sessions.\nApart from the challenges discussed extensively in the class incremental learning, the problem also faces an intra-class stability-plasticity dilemma and intra-class domain imbalance issue. 
\nTo address above issues, we develop a novel domain-aware learning framework.\nConcretely, we introduce a flexible class representation based on the von Mises-Fisher mixture model to capture the intra-class structure as well as a bi-level balanced memory to deal with data imbalances within and between classes.\nIn particular, we build a mixture model on deep features of each class and devise an expansion-and-reduction strategy for dynamically increasing the number of components according to the concept complexity.\nCombining with distillation loss, our design encourages the model to learn a domain-aware representation, which aids in achieving inter- and intra-class stability-plasticity trade-off. \nWe conduct exhaustive experiments on three benchmarks, each with three representative splits.\nThe results show that our method consistently outperforms other methods with a significant margin, suggesting its superiority. ", "keywords": "Incremental learning;Domain-aware;EM algorithm", "primary_area": "", "supplementary_material": "", "author": "Jiangwei Xie;Shipeng Yan;Xuming He", "authorids": "~Jiangwei_Xie1;~Shipeng_Yan1;~Xuming_He3", "gender": "M;M;M", "homepage": ";;https://faculty.sist.shanghaitech.edu.cn/faculty/hexm/index.html", "dblp": "289/1931;63/9201;03/4230", "google_scholar": ";oYILsyoAAAAJ;0KyeZ2QAAAAJ", "orcid": ";;", "linkedin": "xiejw-a1b86a1a2;;", "or_profile": "~Jiangwei_Xie1;~Shipeng_Yan1;~Xuming_He3", "aff": "Microsoft;ShanghaiTech University;ShanghaiTech University", "aff_domain": "microsoft.com;shanghaitech.edu.cn;shanghaitech.edu.cn", "position": "Intern;PhD student;Associate Professor", "bibtex": "@misc{\nxie2022general,\ntitle={General Incremental Learning with Domain-aware Categorical Representations},\nauthor={Jiangwei Xie and Shipeng Yan and Xuming He},\nyear={2022},\nurl={https://openreview.net/forum?id=eR5TdQpRMCP}\n}", "github": "", "project": "", "reviewers": "aArS;1cjX;KTDg;Ld2c", "site": "https://openreview.net/forum?id=eR5TdQpRMCP", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;4;4;4", "correctness": "3;3;2;1", "technical_novelty": "3;2;2;1", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "44;64;72;115", "wc_summary_review": "42;27;28;29", "wc_main_review": "117;181;488;269", "wc_review": "203;272;588;413", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.75, 25.907286619790966 ], "wc_summary_review_avg": [ 31.5, 6.103277807866851 ], "wc_main_review_avg": [ 263.75, 140.26648744443557 ], "wc_review_avg": [ 369.0, 147.36179966327774 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.8703882797784891, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4387392403170736641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1", "aff_unique_norm": "Microsoft;ShanghaiTech University", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.shanghaitech.edu.cn", 
"aff_unique_abbr": "Microsoft;ShanghaiTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "id": "eSHBmLnD1s8", "title": "Task Conditioned Stochastic Subsampling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Learning algorithms are designed to operate on huge volumes of high dimensional data such as images. In order to reduce the volume of data these algorithms must process, we propose a set-based two-stage end-to-end neural subsampling model that is jointly optimized with an \\textit{arbitrary} downstream task network such as a classifier. In the first stage, we efficiently subsample \\textit{candidate elements} using conditionally independent Bernoulli random variables, followed by conditionally dependent autoregressive subsampling of the candidate elements using Categorical random variables in the second stage. We apply our method to feature and instance selection and show that our method outperforms the relevant baselines under very low subsampling rates on many tasks including image classification, image reconstruction, function reconstruction and few-shot classification. Additionally, for nonparametric models such as Neural Processes that require to leverage whole training data at inference time, we show that our method enhances the scalability of these models. To ensure easy reproducibility, we provide source code in the \\textbf{Supplementary Material}.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/67f24a6cfb805af167a55daef63fa96ea3ebddca.zip", "author": "Bruno Andreis;Seanie Lee;A. Tuan Nguyen;Juho Lee;Eunho Yang;Sung Ju Hwang", "authorids": "~Bruno_Andreis1;~Seanie_Lee1;~A._Tuan_Nguyen1;~Juho_Lee2;~Eunho_Yang1;~Sung_Ju_Hwang1", "gender": "M;M;M;M;M;", "homepage": "https://andreisbruno.github.io/;https://seanie12.github.io/;https://atuannguyen.com;https://juho.lee.github.io;https://sites.google.com/site/hleehome2/;", "dblp": "225/0404;219/6771;;55/3410-1;96/2621;", "google_scholar": "WzQ_v4IAAAAJ;zrZu6GkAAAAJ;V-guxukAAAAJ;Py4URJUAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;a-tuan-nguyen/;;;", "or_profile": "~Bruno_Andreis1;~Seanie_Lee1;~A._Tuan_Nguyen1;~Juho_Lee2;~Eunho_Yang1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;University of Oxford;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;ox.ac.uk;kaist.ac.kr;kaist.ac.kr;", "position": "PhD student;MS student;PhD student;Assistant Professor;Associate Professor;", "bibtex": "@misc{\nbruno2022task,\ntitle={Task Conditioned Stochastic Subsampling},\nauthor={Bruno Andreis and Seanie Lee and A. 
Tuan Nguyen and Juho Lee and Eunho Yang and Sung Ju Hwang},\nyear={2022},\nurl={https://openreview.net/forum?id=eSHBmLnD1s8}\n}", "github": "", "project": "", "reviewers": "Lpik;oSsn;7mpS;zeQG", "site": "https://openreview.net/forum?id=eSHBmLnD1s8", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "3;5;3;3", "correctness": "2;1;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;4;2;3", "wc_summary_paper": "151;51;129;61", "wc_summary_review": "70;24;89;39", "wc_main_review": "308;45;144;173", "wc_review": "529;120;362;273", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1719;679;2354;452", "reply_reviewers": "0;0;0;0", "reply_authors": "4;2;5;2", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 98.0, 42.86023798347368 ], "wc_summary_review_avg": [ 55.5, 25.480384612481814 ], "wc_main_review_avg": [ 167.5, 93.98005107468286 ], "wc_review_avg": [ 321.0, 148.02871343087463 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1301.0, 773.1781812751832 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.25, 1.299038105676658 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.08084520834544431, "corr_recommendation_correctness": 0.5940885257860046, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kTpcLYtuiYcJ:scholar.google.com/&scioq=Task+Conditioned+Stochastic+Subsampling&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Oxford", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.ox.ac.uk", "aff_unique_abbr": "KAIST;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "South Korea;United Kingdom" }, { "id": "eV5d4I3eso", "title": "Geometric Random Walk Graph Neural Networks via Implicit Layers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks have recently attracted a lot of attention and have been applied with great success to several important graph problems. The Random Walk Graph Neural Network model was recently proposed as a more intuitive alternative to the well-studied family of message passing neural networks. This model compares each input graph against a set of latent ``hidden graphs'' using a kernel that counts common random walks up to some length. In this paper, we propose a new architecture, called Geometric Random Walk Graph Neural Network (GRWNN), that generalizes the above model such that it can count common walks of infinite length in two graphs. The proposed model retains the transparency of Random Walk Graph Neural Networks since its first layer also consists of a number of trainable ``hidden graphs'' which are compared against the input graphs using the geometric random walk kernel. To compute the kernel, we employ a fixed-point iteration approach involving implicitly defined operations. Then, we capitalize on implicit differentiation to derive an efficient training scheme which requires only constant memory, regardless of the number of fixed-point iterations. The employed random walk kernel is differentiable, and therefore, the proposed model is end-to-end trainable. 
Experiments on standard graph classification datasets demonstrate the effectiveness of the proposed approach in comparison with state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c599431dd8329e677574e46c580b0f14a36e8005.zip", "author": "Giannis Nikolentzos;Michalis Vazirgiannis", "authorids": "~Giannis_Nikolentzos1;~Michalis_Vazirgiannis1", "gender": "M;M", "homepage": "http://users.uop.gr/~nikolentzos/;", "dblp": "163/6278;v/MVazirgiannis", "google_scholar": "bdom4I8AAAAJ;https://scholar.google.gr/citations?user=aWGJYcMAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Giannis_Nikolentzos1;~Michalis_Vazirgiannis1", "aff": "Ecole polytechnique;Ecole Polytechnique, France", "aff_domain": "polytechnique.edu;polytechnique.fr", "position": "Postdoc;Full Professor", "bibtex": "@misc{\nnikolentzos2022geometric,\ntitle={Geometric Random Walk Graph Neural Networks via Implicit Layers},\nauthor={Giannis Nikolentzos and Michalis Vazirgiannis},\nyear={2022},\nurl={https://openreview.net/forum?id=eV5d4I3eso}\n}", "github": "", "project": "", "reviewers": "iyPk;Nev5;rSPG;bEUY", "site": "https://openreview.net/forum?id=eV5d4I3eso", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;5;4;4", "correctness": "3;2;4;4", "technical_novelty": "3;2;3;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "72;161;141;50", "wc_summary_review": "47;21;63;15", "wc_main_review": "253;280;217;163", "wc_review": "372;462;421;228", "wc_reply_reviewers": "0;44;9;358", "wc_reply_authors": "356;235;396;214", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 106.0, 46.211470437543966 ], "wc_summary_review_avg": [ 36.5, 19.461500456028563 ], "wc_main_review_avg": [ 228.25, 43.80282525134652 ], "wc_review_avg": [ 370.75, 88.36111984351489 ], "wc_reply_reviewers_avg": [ 102.75, 148.2824585040321 ], "wc_reply_authors_avg": [ 300.25, 77.41567993630231 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8685010607544929401&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Ecole Polytechnique", "aff_unique_dep": "", "aff_unique_url": "https://www.polytechnique.edu", "aff_unique_abbr": "X", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "id": "eVzy-BWKY6Z", "title": "Edge Rewiring Goes Neural: Boosting Network Resilience via Policy Gradient", "track": "main", "status": "Reject", "tldr": "", "abstract": "Improving the resilience of a network protects the system from natural disasters and malicious attacks.\nThis is typically achieved by introducing new edges, which however may reach beyond the maximum number of connections a node could sustain.\nMany studies then resort to the degree-preserving operation of rewiring, which swaps existing edges $AC, BD$ to new edges $AB, CD$.\nA significant line of studies focuses on this technique for theoretical and practical results while leaving three limitations: network utility loss, local 
optimality, and transductivity. \nIn this paper, we propose ResiNet, a reinforcement learning (RL)-based framework to discover Resilient Network topologies against various disasters and attacks. \nResiNet is objective agnostic which allows the utility to be balanced by incorporating it into the objective function.\nThe local optimality, typically seen in greedy algorithms, is addressed by casting the cumulative resilience gain into a sequential decision process of step-wise rewiring.\nThe transductivity, which refers to the necessity to run a computationally intensive optimization for each input graph, is lifted by our variant of RL with auto-regressive permutation-invariant variable action space.\nResiNet is armed by our technical innovation, Filtration enhanced GNN (FireGNN), which distinguishes graphs with minor differences.\nIt is thus possible for ResiNet to capture local structure changes and adapt its decision among consecutive graphs, which is known to be infeasible for GNN.\nExtensive experiments demonstrate that with a small number of rewiring operations, ResiNet achieves a near-optimal resilience gain on multiple graphs while balancing the utility, with a large margin compared to existing approaches.", "keywords": "network resilience;neural combinatorial optimization;graph neural networks;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Shanchao Yang;MA KAILI;Baoxiang Wang;Hongyuan Zha", "authorids": "~Shanchao_Yang1;~MA_KAILI1;~Baoxiang_Wang1;~Hongyuan_Zha1", "gender": "M;F;;", "homepage": "https://yangysc.github.io/;;;", "dblp": ";200/0854-1.html;;z/HongyuanZha", "google_scholar": "gakMZhcAAAAJ;;;n1DQMIsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Shanchao_Yang1;~MA_KAILI1;~Baoxiang_Wang1;~Hongyuan_Zha1", "aff": "The Chinese University of Hong Kong, Shenzhen;Department of Computer Science and Engineering, The Chinese University of Hong Kong;;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;cse.cuhk.edu.hk;;cuhk.edu.cn", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@misc{\nyang2022edge,\ntitle={Edge Rewiring Goes Neural: Boosting Network Resilience via Policy Gradient},\nauthor={Shanchao Yang and MA KAILI and Baoxiang Wang and Hongyuan Zha},\nyear={2022},\nurl={https://openreview.net/forum?id=eVzy-BWKY6Z}\n}", "github": "", "project": "", "reviewers": "Qtac;AvkM;vtoA", "site": "https://openreview.net/forum?id=eVzy-BWKY6Z", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "5;3;3", "correctness": "3;4;3", "technical_novelty": "2;1;3", "empirical_novelty": "2;1;2", "wc_summary_paper": "41;11;67", "wc_summary_review": "14;6;67", "wc_main_review": "405;92;230", "wc_review": "460;109;364", "wc_reply_reviewers": "136;0;43", "wc_reply_authors": "3423;549;1196", "reply_reviewers": "2;0;1", "reply_authors": "5;1;3", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 39.666666666666664, 22.88133640230735 ], "wc_summary_review_avg": [ 29.0, 27.067816067549053 ], "wc_main_review_avg": [ 242.33333333333334, 128.07896869596593 ], "wc_review_avg": [ 311.0, 148.11482032531384 ], "wc_reply_reviewers_avg": [ 59.666666666666664, 56.75874871379351 ], "wc_reply_authors_avg": [ 1722.6666666666667, 1230.9893943039838 ], 
"reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:T0xO7guLzl0J:scholar.google.com/&scioq=Edge+Rewiring+Goes+Neural:+Boosting+Network+Resilience+via+Policy+Gradient&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.cn", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "On Predicting Generalization using GANs", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6778", "id": "eW5R4Cek6y6", "poster": "", "openreview": "https://openreview.net/forum?id=eW5R4Cek6y6", "slides": "https://iclr.cc/virtual/2022/poster/6778", "video": "https://iclr.cc/virtual/2022/poster/6778", "author_site": "Yi Zhang, Arushi Gupta, Nikunj Umesh Saunshi, Sanjeev Arora", "tldr": "", "abstract": "Research on generalization bounds for deep networks seeks to give ways to predict test error using just the training dataset and the network parameters. While generalization bounds can give many insights about architecture design, training algorithms etc., what they do not currently do is yield good predictions for actual test error. A recently introduced Predicting Generalization in Deep Learning competition aims to encourage discovery of methods to better predict test error. The current paper investigates a simple idea: can test error be predicted using {\\em synthetic data,} produced using a Generative Adversarial Network (GAN) that was trained on the same training dataset? Upon investigating several GAN models and architectures, we find that this turns out to be the case. \n\nIn fact, using GANs pre-trained on standard datasets, the test error can be predicted without requiring any additional hyper-parameter tuning. This result is surprising because GANs have well-known limitations (e.g. mode collapse) and are known to not learn the data distribution accurately. Yet the generated samples are good enough to substitute for test data. Several additional experiments are presented to explore reasons why GANs do well at this task. 
In addition to a new approach for predicting generalization, the counter-intuitive phenomena presented in our work may also call for a better understanding of GANs' strengths and limitations.", "keywords": "generalization;generative adversarial network", "primary_area": "", "supplementary_material": "", "author": "Yi Zhang;Arushi Gupta;Nikunj Saunshi;Sanjeev Arora", "authorids": "~Yi_Zhang1;~Arushi_Gupta1;~Nikunj_Saunshi1;~Sanjeev_Arora1", "gender": "M;;;", "homepage": "https://yi-zhang.me;;https://www.nikunjsaunshi.com/;http://www.cs.princeton.edu/~arora/", "dblp": "64/6544-74;;199/2236;a/SArora", "google_scholar": "lc6CVqEAAAAJ;;F24vXggAAAAJ;RUP4S68AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yi_Zhang1;~Arushi_Gupta1;~Nikunj_Saunshi1;~Sanjeev_Arora1", "aff": "Microsoft;Department of Computer Science, Princeton University;Princeton University;Princeton University", "aff_domain": "microsoft.com;cs.princeton.edu;princeton.edu;princeton.edu", "position": "Postdoc;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhang2022on,\ntitle={On Predicting Generalization using {GAN}s},\nauthor={Yi Zhang and Arushi Gupta and Nikunj Saunshi and Sanjeev Arora},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=eW5R4Cek6y6}\n}", "github": "", "project": "", "reviewers": "6YD8;d8Mz;jdtm;1WB7", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;4;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;3;4;4", "wc_summary_paper": "35;124;70;73", "wc_summary_review": "54;42;19;41", "wc_main_review": "284;446;144;107", "wc_review": "373;612;233;221", "wc_reply_reviewers": "34;0;0;6", "wc_reply_authors": "758;382;248;196", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 75.5, 31.737202145116697 ], "wc_summary_review_avg": [ 39.0, 12.62933094031509 ], "wc_main_review_avg": [ 245.25, 133.38548459258976 ], "wc_review_avg": [ 359.75, 157.41882828937585 ], "wc_reply_reviewers_avg": [ 10.0, 14.071247279470288 ], "wc_reply_authors_avg": [ 396.0, 219.74075634711008 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6896139890020861985&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=eW5R4Cek6y6", "email": "microsoft.com;cs.princeton.edu;princeton.edu;princeton.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Microsoft;Princeton University", "aff_unique_dep": "Microsoft Corporation;Department of Computer Science", "aff_unique_url": "https://www.microsoft.com;https://www.princeton.edu", "aff_unique_abbr": "Microsoft;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "eWNpRVcfzi", "title": "MURO: Deployment Constrained Reinforcement Learning with Model-based Uncertainty Regularized Batch Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In many contemporary 
applications such as healthcare, finance, robotics, and recommendation systems, continuous deployment of new policies for data collection and online learning is either cost ineffective or impractical. We consider a setting that lies between pure offline reinforcement learning (RL) and pure online RL called deployment constrained RL in which the number of policy deployments for data sampling is limited. To solve this challenging task, we propose a novel algorithmic learning framework called Model-based Uncertainty Regularized batch Optimization (MURO). Our framework discovers novel and high quality samples for each deployment to enable efficient data collection. During each offline training session, we bootstrap the policy update by quantifying the amount of uncertainty within our collected data. In the high support region (low uncertainty), we encourage our policy by taking an aggressive update. In the low support region (high uncertainty) when the policy bootstraps into the out-of-distribution region, we downweight it by our estimated uncertainty quantification. Experimental results show that MURO achieves state-of-the-art performance in the deployment constrained RL setting.", "keywords": "Deployment Constrained Reinforcement Learning;Deep Reinforcement Learning;Model-based Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "DiJia Su;Jason D. Lee;John Mulvey;H. Vincent Poor", "authorids": "~DiJia_Su1;~Jason_D._Lee1;~John_Mulvey1;~H._Vincent_Poor1", "gender": ";M;M;M", "homepage": ";https://jasondlee88.github.io/;https://mulvey.princeton.edu;http://ee.princeton.edu/people/faculty/h-vincent-poor", "dblp": ";88/3262;;p/HVincentPoor", "google_scholar": ";GR_DsT0AAAAJ;TObKVqsAAAAJ;Dq93mOUAAAAJ", "orcid": ";;0000-0002-4290-0870;", "linkedin": ";;https://linkedin.com/in/john-mulvey-a6175823;vince-poor-974a3/", "or_profile": "~DiJia_Su1;~Jason_D._Lee1;~John_Mulvey1;~H._Vincent_Poor1", "aff": ";Princeton University;;Princeton University", "aff_domain": ";princeton.edu;;princeton.edu", "position": ";Assistant Professor;;Full Professor", "bibtex": "@misc{\nsu2022muro,\ntitle={{MURO}: Deployment Constrained Reinforcement Learning with Model-based Uncertainty Regularized Batch Optimization},\nauthor={DiJia Su and Jason D. Lee and John Mulvey and H. 
Vincent Poor},\nyear={2022},\nurl={https://openreview.net/forum?id=eWNpRVcfzi}\n}", "github": "", "project": "", "reviewers": "gigQ;QVVX;FFoW;9aMZ", "site": "https://openreview.net/forum?id=eWNpRVcfzi", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;4;3", "correctness": "2;3;3;4", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "78;87;52;85", "wc_summary_review": "68;23;51;77", "wc_main_review": "646;688;275;234", "wc_review": "792;798;378;396", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.5, 13.97318861248212 ], "wc_summary_review_avg": [ 54.75, 20.571521577170707 ], "wc_main_review_avg": [ 460.75, 207.2913107199624 ], "wc_review_avg": [ 591.0, 204.1102643180886 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hY7Na3cB8eQJ:scholar.google.com/&scioq=MURO:+Deployment+Constrained+Reinforcement+Learning+with+Model-based+Uncertainty+Regularized+Batch+Optimization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Efficient Neural Causal Discovery without Acyclicity Constraints", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6272", "id": "eYciPrLuUhG", "poster": "", "openreview": "https://openreview.net/forum?id=eYciPrLuUhG", "slides": "https://iclr.cc/virtual/2022/poster/6272", "video": "https://iclr.cc/virtual/2022/poster/6272", "author_site": "Phillip Lippe, Taco Cohen, Efstratios Gavves", "tldr": "", "abstract": "Learning the structure of a causal graphical model using both observational and interventional data is a fundamental problem in many scientific fields. A promising direction is continuous optimization for score-based methods, which, however, require constrained optimization to enforce acyclicity or lack convergence guarantees. In this paper, we present ENCO, an efficient structure learning method for directed, acyclic causal graphs leveraging observational and interventional data. ENCO formulates the graph search as an optimization of independent edge likelihoods, with the edge orientation being modeled as a separate parameter. Consequently, we provide for ENCO convergence guarantees under mild conditions, without having to constrain the score function with respect to acyclicity. 
In experiments, we show that ENCO can efficiently recover graphs with hundreds of nodes, an order of magnitude larger than what was previously possible, while handling deterministic variables and discovering latent confounders.", "keywords": "Causal discovery;structure learning", "primary_area": "", "supplementary_material": "", "author": "Phillip Lippe;Taco Cohen;Efstratios Gavves", "authorids": "~Phillip_Lippe1;~Taco_Cohen1;~Efstratios_Gavves1", "gender": "M;M;M", "homepage": "https://phlippe.github.io;http://www.ta.co.nl;https://www.egavves.com", "dblp": "267/9431;142/2903;03/8693", "google_scholar": "69hFZp4AAAAJ;a3q4YxEAAAAJ;https://scholar.google.nl/citations?user=QqfCvsgAAAAJ", "orcid": "0000-0002-3639-6938;;", "linkedin": "phillip-lippe/;;", "or_profile": "~Phillip_Lippe1;~Taco_Cohen1;~Efstratios_Gavves1", "aff": "University of Amsterdam;Qualcomm Inc, QualComm;University of Amsterdam", "aff_domain": "uva.nl;qti.qualcomm.com;uva.nl", "position": "PhD student;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nlippe2022efficient,\ntitle={Efficient Neural Causal Discovery without Acyclicity Constraints},\nauthor={Phillip Lippe and Taco Cohen and Efstratios Gavves},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=eYciPrLuUhG}\n}", "github": "", "project": "", "reviewers": "Paee;qKQN;rJAC;b5p5;H7r5", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "3;3;4;4;3", "correctness": "3;3;3;3;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;3;2;4;3", "wc_summary_paper": "74;42;64;42;169", "wc_summary_review": "52;43;43;29;27", "wc_main_review": "385;142;282;932;304", "wc_review": "511;227;389;1003;500", "wc_reply_reviewers": "152;19;256;0;53", "wc_reply_authors": "1222;200;2044;2140;516", "reply_reviewers": "2;1;2;0;1", "reply_authors": "3;1;3;4;2", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 78.2, 47.08460470259892 ], "wc_summary_review_avg": [ 38.8, 9.431860898041277 ], "wc_main_review_avg": [ 409.0, 272.9644665519672 ], "wc_review_avg": [ 526.0, 259.4609797252758 ], "wc_reply_reviewers_avg": [ 96.0, 95.63472172804185 ], "wc_reply_authors_avg": [ 1224.4, 782.4703444859747 ], "reply_reviewers_avg": [ 1.2, 0.7483314773547883 ], "reply_authors_avg": [ 2.6, 1.019803902718557 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.16666666666666663, "corr_recommendation_correctness": 0.0, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17194562638128267873&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=eYciPrLuUhG", "email": "uva.nl;qti.qualcomm.com;uva.nl", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Amsterdam;Qualcomm Incorporated", "aff_unique_dep": ";", "aff_unique_url": "https://www.uva.nl;https://www.qualcomm.com", "aff_unique_abbr": "UvA;Qualcomm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Netherlands;United States" }, { "id": "eYyvftCgtD", "title": "GroupBERT: Enhanced Transformer Architecture with Efficient Grouped Structures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Attention-based language models have become a 
critical component in state-of-the-art natural language processing systems. However, these models have significant computational requirements, due to long training times, dense operations and large parameter count. In this work, we demonstrate a set of modifications to the structure of a Transformer layer, producing a more efficient architecture. First, we rely on grouped transformations to reduce the computational cost of dense feed-forward layers, while preserving the expressivity of the model. Second, we add a grouped convolution module to complement the self-attention module, decoupling the learning of local and global interactions. We apply the resulting architecture to language representation learning and demonstrate its superior performance compared to BERT models of different scales. We further highlight its improved efficiency, both in terms of floating-point operations (FLOPs) and time-to-train.", "keywords": "Transformer;BERT;self-supervision;compute efficiency;sparsity;convolution;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Ivan Chelombiev;Daniel Justus;Douglas Orr;Anastasia S. D. Dietrich;Frithjof Gressmann;Alexandros Koliousis;Carlo Luschi", "authorids": "~Ivan_Chelombiev1;~Daniel_Justus1;~Douglas_Orr1;~Anastasia_S._D._Dietrich1;~Frithjof_Gressmann1;~Alexandros_Koliousis1;~Carlo_Luschi1", "gender": "M;M;M;F;M;M;M", "homepage": ";;https://douglasorr.github.io/;;https://frthjf.com/;https://akoliousis.com;", "dblp": ";;33/8535;;200/0179;;72/10621", "google_scholar": "https://scholar.google.com/citations?hl=en;ZMKOlBcAAAAJ;;;https://scholar.google.com/citations?hl=en;kQr7ip4AAAAJ;", "orcid": ";;;0000-0002-3839-0396;0009-0002-4155-7393;0000-0003-3006-9802;", "linkedin": "ivan-chelombiev-5a7790a9/;daniel-justus/;;;frithjof-gressmann-6a1606229/;alexandros-koliousis/;carlo-luschi-1908144/", "or_profile": "~Ivan_Chelombiev1;~Daniel_Justus1;~Douglas_Orr1;~Anastasia_S._D._Dietrich1;~Frithjof_Gressmann1;~Alexandros_Koliousis1;~Carlo_Luschi1", "aff": "Graphcore;Graphcore;Graphcore;Graphcore;University of Illinois, Urbana Champaign;New College of the Humanities, Northeastern University;Graphcore", "aff_domain": "graphcore.ai;graphcore.ai;graphcore.ai;graphcore.ai;illinois.edu;nchlondon.ac.uk;graphcore.ai", "position": "Researcher;Researcher;Researcher;Researcher;PhD student;Associate Professor;Director of Research", "bibtex": "@misc{\nchelombiev2022groupbert,\ntitle={Group{BERT}: Enhanced Transformer Architecture with Efficient Grouped Structures},\nauthor={Ivan Chelombiev and Daniel Justus and Douglas Orr and Anastasia S. D. 
Dietrich and Frithjof Gressmann and Alexandros Koliousis and Carlo Luschi},\nyear={2022},\nurl={https://openreview.net/forum?id=eYyvftCgtD}\n}", "github": "", "project": "", "reviewers": "EcxR;8smw;14JM;s6Mm", "site": "https://openreview.net/forum?id=eYyvftCgtD", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "85;77;65;24", "wc_summary_review": "86;119;21;31", "wc_main_review": "366;193;183;237", "wc_review": "537;389;269;292", "wc_reply_reviewers": "0;0;0;56", "wc_reply_authors": "836;615;398;610", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.75, 23.47738273317535 ], "wc_summary_review_avg": [ 64.25, 40.14582792769381 ], "wc_main_review_avg": [ 244.75, 72.89161474408425 ], "wc_review_avg": [ 371.75, 105.50207343934052 ], "wc_reply_reviewers_avg": [ 14.0, 24.24871130596428 ], "wc_reply_authors_avg": [ 614.75, 154.88281860813356 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11367385825717892453&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;1;2;0", "aff_unique_norm": "Graphcore;University of Illinois Urbana-Champaign;Northeastern University", "aff_unique_dep": ";;New College of the Humanities", "aff_unique_url": "https://www.graphcore.ai;https://illinois.edu;https://www.northeastern.edu", "aff_unique_abbr": "Graphcore;UIUC;NU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;1;1;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "eZ-xMLuKPc", "title": "Surgical Prediction with Interpretable Latent Representation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Given the risks and cost of surgeries, there has been significant interest in exploiting predictive models to improve perioperative care. However, due to the high dimensionality and noisiness of perioperative data, it is challenging to develop accurate, robust, and interpretable encodings for surgical applications. We propose surgical VAE (sVAE), a representation learning framework for perioperative data based on the variational autoencoder (VAE). sVAE provides a holistic approach combining two salient features tailored for surgical applications. To overcome the performance limitations of traditional VAEs, it is prediction-guided, with explicit expression of the predicted outcome in the latent representation. Furthermore, it disentangles the latent space so that it can be interpreted in a clinically meaningful fashion. We apply sVAE to two real-world perioperative datasets and the open MIMIC-III dataset to evaluate its efficacy and performance in predicting diverse outcomes including surgery duration, postoperative complication, ICU duration, and mortality. Our results show that the latent representation provided by sVAE leads to superior performance in classification, regression and multi-task predictions. 
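A hedged sketch of what a prediction-guided VAE objective of this kind could look like (not the authors' implementation): the outcome is predicted directly from the latent code and trained jointly with reconstruction, so the predicted outcome is explicitly expressed in the latent representation. The predictor head and the weight lambda_pred are assumptions.

```python
import torch
import torch.nn.functional as F

# Illustrative prediction-guided VAE loss; all names are assumptions.
def svae_loss(x, x_hat, mu, logvar, y, predictor, lambda_pred=1.0):
    recon = F.mse_loss(x_hat, x)                                   # reconstruction term
    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())  # KL to unit Gaussian
    pred = F.binary_cross_entropy_with_logits(predictor(mu), y)    # outcome read off the latent
    return recon + kl + lambda_pred * pred
```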
We further demonstrate the interpretability of the disentangled representation and its capability to capture intrinsic characteristics of surgical patients.", "keywords": "machine learning;healthcare applications;latent encoding;surgical predictions", "primary_area": "", "supplementary_material": "", "author": "Bing Xue;York Jiao;Thomas Kannampallil;Joanna Abraham;Christopher Ryan King;Bradley A Fritz;Michael Avidan;Chenyang Lu", "authorids": "~Bing_Xue1;york.jiao@wustl.edu;thomas.k@wustl.edu;~Joanna_Abraham1;~Christopher_Ryan_King1;~Bradley_A_Fritz1;~Michael_Avidan1;~Chenyang_Lu1", "gender": "M;;;;M;M;;M", "homepage": "http://xuebing1234.github.io;;;;;;;https://www.cse.wustl.edu/~lu/", "dblp": "40/6434-3;;;;;;;88/683", "google_scholar": "O78dLWMAAAAJ;;;;;;;https://scholar.google.com.tw/citations?user=tCq7Wx0AAAAJ", "orcid": "0000-0002-9162-098X;;;;0000-0002-4574-8616;0000-0002-7239-8877;;0000-0003-1709-6769", "linkedin": "bing-xue-047a6880/;;;;;;;", "or_profile": "~Bing_Xue1;york.jiao@wustl.edu;thomas.k@wustl.edu;~Joanna_Abraham1;~Christopher_Ryan_King1;~Bradley_A_Fritz1;~Michael_Avidan1;~Chenyang_Lu1", "aff": "Washington University, St. Louis;;;University of Illinois at Chicago;Washington University, St. Louis;Washington University, St. Louis;Washington University;Washington University, Saint Louis", "aff_domain": "wustl.edu;;;;wustl.edu;wustl.edu;;wustl.edu", "position": "PhD student;;;;Assistant Professor;Instructor;;Full Professor", "bibtex": "@misc{\nxue2022surgical,\ntitle={Surgical Prediction with Interpretable Latent Representation},\nauthor={Bing Xue and York Jiao and Thomas Kannampallil and Joanna Abraham and Christopher Ryan King and Bradley A Fritz and Michael Avidan and Chenyang Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=eZ-xMLuKPc}\n}", "github": "", "project": "", "reviewers": "56Ru;4Kio;CEnW", "site": "https://openreview.net/forum?id=eZ-xMLuKPc", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "3;4;3", "technical_novelty": "2;3;2", "empirical_novelty": "1;0;3", "wc_summary_paper": "178;70;78", "wc_summary_review": "108;72;29", "wc_main_review": "197;77;176", "wc_review": "483;219;283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1116;245;503", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 108.66666666666667, 49.13473539383541 ], "wc_summary_review_avg": [ 69.66666666666667, 32.293790252754306 ], "wc_main_review_avg": [ 150.0, 52.32590180780452 ], "wc_review_avg": [ 328.3333333333333, 112.44356609230942 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 621.3333333333334, 365.2965309930489 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:o49OVRocA-EJ:scholar.google.com/&scioq=Surgical+Prediction+with+Interpretable+Latent+Representation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "Washington University in St. 
Louis;University of Illinois at Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://wustl.edu;https://www.uic.edu", "aff_unique_abbr": "WUSTL;UIC", "aff_campus_unique_index": "0;1;0;0;0;2", "aff_campus_unique": "St. Louis;Chicago;Saint Louis", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "e_D6AmszH4P", "title": "ViViT: Curvature access through the generalized Gauss-Newton's low-rank structure", "track": "main", "status": "Reject", "tldr": "", "abstract": "Curvature in the form of the Hessian or its generalized Gauss-Newton (GGN) approximation is valuable for algorithms that rely on a local model for the loss to train, compress, or explain deep networks. Existing methods based on implicit multiplication via automatic differentiation or Kronecker-factored block diagonal approximations do not consider noise in the mini-batch. We present ViViT, a curvature model that leverages the GGN's low-rank structure without further approximations. It allows for efficient computation of eigenvalues and eigenvectors, as well as per-sample first- and second-order directional derivatives. The representation is computed in parallel with gradients in one backward pass and offers a fine-grained cost-accuracy trade-off, which allows it to scale. As examples of ViViT's usefulness, we investigate the directional first- and second-order derivatives during training, and how noise information can be used to improve the stability of second-order methods.", "keywords": "generalized Gauss-Newton;curvature;second-order methods;Hessian spectrum in deep learning;automatic differentiation", "primary_area": "", "supplementary_material": "/attachment/4bec68ae3fe009ea28c1f059a933c9108b6b6738.zip", "author": "Felix Dangel;Lukas Tatzel;Philipp Hennig", "authorids": "~Felix_Dangel1;~Lukas_Tatzel1;~Philipp_Hennig1", "gender": "M;M;M", "homepage": "https://f-dangel.com;https://github.com/ltatzel;http://mml.inf.uni-tuebingen.de", "dblp": "236/4218;;08/9077", "google_scholar": "9hlJ9W0AAAAJ;;https://scholar.google.de/citations?user=UeG5w08AAAAJ", "orcid": "0000-0002-1414-8554;;0000-0001-7293-6092", "linkedin": ";;", "or_profile": "~Felix_Dangel1;~Lukas_Tatzel1;~Philipp_Hennig1", "aff": "University of Tuebingen;University of T\u00fcbingen;Max Planck Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de;tuebingen.mpg.de", "position": "PhD student;PhD student;Adjunct Professor", "bibtex": "@misc{\ndangel2022vivit,\ntitle={ViViT: Curvature access through the generalized Gauss-Newton's low-rank structure},\nauthor={Felix Dangel and Lukas Tatzel and Philipp Hennig},\nyear={2022},\nurl={https://openreview.net/forum?id=e_D6AmszH4P}\n}", "github": "", "project": "", "reviewers": "9mCz;VGMZ;1dgm;GBwe", "site": "https://openreview.net/forum?id=e_D6AmszH4P", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;2;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "94;30;80;87", "wc_summary_review": "97;39;81;48", "wc_main_review": "581;95;285;261", "wc_review": "772;164;446;396", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": 
[ 72.75, 25.17315037892556 ], "wc_summary_review_avg": [ 66.25, 23.657715443381257 ], "wc_main_review_avg": [ 305.5, 175.07926776177698 ], "wc_review_avg": [ 444.5, 216.96255437286868 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1542630790580349940&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Tuebingen;University of T\u00fcbingen;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";;Intelligent Systems", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/;https://www.mpi-is.mpg.de", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "e_FK_rDajEv", "title": "Learning Neural Causal Models with Active Interventions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discovering causal structures from data is a challenging inference problem of fundamental importance in all areas of science. The appealing scaling properties of neural networks have recently led to a surge of interest in differentiable neural network-based methods for learning causal structures from data. So far, differentiable causal discovery has focused on static datasets of observational or interventional origin. In this work, we introduce an active intervention-targeting mechanism that enables quick identification of the underlying causal structure of the data-generating process. Our method significantly reduces the required number of interactions compared with random intervention targeting and is applicable to both discrete and continuous optimization formulations of learning the underlying directed acyclic graph (DAG) from data. We examine the proposed method across multiple frameworks in a wide range of settings and demonstrate superior performance on multiple benchmarks from simulated to real-world data. 
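As a hedged illustration of active intervention targeting in this spirit (the paper's exact acquisition rule is not reproduced here), one simple heuristic scores each node by the uncertainty of the edge beliefs touching it and intervenes where that uncertainty is largest; the entropy-based score below is an assumption.

```python
import torch

# Pick the next intervention target from current edge-probability estimates.
# The Bernoulli-entropy acquisition score is an illustrative assumption.
def pick_intervention_target(edge_probs: torch.Tensor) -> int:
    p = edge_probs.clamp(1e-6, 1 - 1e-6)
    entropy = -(p * p.log() + (1 - p) * (1 - p).log())     # per-edge uncertainty
    node_scores = entropy.sum(dim=0) + entropy.sum(dim=1)  # uncertainty incident to each node
    return int(node_scores.argmax())
```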
", "keywords": "neural causal discovery;causal structure learning;active learning;experimental design", "primary_area": "", "supplementary_material": "", "author": "Nino Scherrer;Olexa Bilaniuk;Yashas Annadani;Anirudh Goyal;Patrick Schwab;Bernhard Sch\u00f6lkopf;Michael Curtis Mozer;Yoshua Bengio;Stefan Bauer;Nan Rosemary Ke", "authorids": "~Nino_Scherrer1;~Olexa_Bilaniuk1;~Yashas_Annadani1;~Anirudh_Goyal1;~Patrick_Schwab1;~Bernhard_Sch\u00f6lkopf1;~Michael_Curtis_Mozer1;~Yoshua_Bengio1;~Stefan_Bauer1;~Nan_Rosemary_Ke1", "gender": "M;M;;M;;;M;M;;F", "homepage": "https://ninodimontalcino.github.io/;;https://yashasannadani.com;https://anirudh9119.github.io/;http://schwabpatrick.com;;https://www.cs.colorado.edu/~mozer;http://yoshuabengio.org;https://cifar.ca/bios/stefan-bauer/;https://nke001.github.io/", "dblp": "295/0198;158/5760;190/7411;172/1039;152/9378;;m/MichaelCMozer;56/953;;120/5291", "google_scholar": "CG9n26kAAAAJ;;ExgzcVMAAAAJ;krrh6OUAAAAJ;https://scholar.google.at/citations?hl=de;;lmjR_qMAAAAJ;kukA0LcAAAAJ;O-oICE8AAAAJ;https://scholar.google.ca/citations?user=dxwPYhQAAAAJ", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;yoshuabengio/?originalSubdomain=ca;;", "or_profile": "~Nino_Scherrer1;~Olexa_Bilaniuk1;~Yashas_Annadani1;~Anirudh_Goyal1;~Patrick_Schwab1;~Bernhard_Sch\u00f6lkopf1;~Michael_Curtis_Mozer1;~Yoshua_Bengio1;~Stefan_Bauer1;~Nan_Rosemary_Ke1", "aff": "ETH Zurich (Swiss Federal Institute of Technology);;KTH Royal Institute of Technology;University of Montreal;GlaxoSmithKline plc;;Google DeepMind;University of Montreal;KTH Royal Institute of Technology;Mila", "aff_domain": "ethz.ch;;kth.se;umontreal.ca;gsk.com;;google.com;umontreal.ca;kth.se;mila.quebec", "position": "MS student;;PhD student;PhD student;Director;;Research Scientist;Full Professor;Assistant Professor;PhD student", "bibtex": "@misc{\nscherrer2022learning,\ntitle={Learning Neural Causal Models with Active Interventions},\nauthor={Nino Scherrer and Olexa Bilaniuk and Yashas Annadani and Anirudh Goyal and Patrick Schwab and Bernhard Sch{\\\"o}lkopf and Michael Curtis Mozer and Yoshua Bengio and Stefan Bauer and Nan Rosemary Ke},\nyear={2022},\nurl={https://openreview.net/forum?id=e_FK_rDajEv}\n}", "github": "", "project": "", "reviewers": "cZsR;KxZA;ma3K", "site": "https://openreview.net/forum?id=e_FK_rDajEv", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "2;4;4", "correctness": "2;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "84;59;51", "wc_summary_review": "29;51;73", "wc_main_review": "380;486;393", "wc_review": "493;596;517", "wc_reply_reviewers": "112;0;192", "wc_reply_authors": "1598;706;755", "reply_reviewers": "1;0;1", "reply_authors": "3;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 64.66666666666667, 14.055445761538676 ], "wc_summary_review_avg": [ 51.0, 17.962924780409974 ], "wc_main_review_avg": [ 419.6666666666667, 47.204048790566915 ], "wc_review_avg": [ 535.3333333333334, 44.00252518006463 ], "wc_reply_reviewers_avg": [ 101.33333333333333, 78.74572298791146 ], "wc_reply_authors_avg": [ 1019.6666666666666, 409.4323984357966 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 
11, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17749637350560911885&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;4;2;1;5", "aff_unique_norm": "ETH Zurich;KTH Royal Institute of Technology;University of Montreal;GlaxoSmithKline;Google;Mila", "aff_unique_dep": ";;;;Google DeepMind;Quebec Artificial Intelligence Institute", "aff_unique_url": "https://www.ethz.ch;https://www.kth.se;https://wwwumontreal.ca;https://www.gsk.com;https://deepmind.com;https://mila.quebec", "aff_unique_abbr": "ETHZ;KTH;UM;GSK;DeepMind;Mila", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3;3;2;1;2", "aff_country_unique": "Switzerland;Sweden;Canada;United Kingdom" }, { "id": "ebZ0gGRJwQx", "title": "On the Convergence of Shallow Neural Network Training with Randomly Masked Neurons", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Given a dense shallow neural network, we focus on iteratively creating, training, and combining randomly selected subnetworks (surrogate functions), towards training the full model. By carefully analyzing $i)$ the subnetworks' neural tangent kernel, $ii)$ the surrogate functions' gradient, and $iii)$ how we sample and combine the surrogate functions, we prove linear convergence rate of the training error --within an error region-- for an overparameterized single-hidden layer perceptron with ReLU activations for a regression task. Our result implies that, for fixed neuron selection probability, the error term decreases as we increase the number of surrogate models, and increases as we increase the number of local training steps for each selected subnetwork. 
The considered framework generalizes and provides new insights into dropout training, multi-sample dropout training, as well as Independent Subnet Training; for each case, we provide corresponding convergence results, as corollaries of our main theorem.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/79e2cd49bb4c13dff917ff49b95c9352d563e495.zip", "author": "Fangshuo Liao;Anastasios Kyrillidis", "authorids": "~Fangshuo_Liao1;~Anastasios_Kyrillidis2", "gender": "M;M", "homepage": "https://jasperliao.github.io/;http://akyrillidis.github.io", "dblp": "308/2837;53/9879", "google_scholar": "WIwcFN8AAAAJ;TEGzkZMAAAAJ", "orcid": ";", "linkedin": "fangshuo-liao-698043141/;", "or_profile": "~Fangshuo_Liao1;~Anastasios_Kyrillidis2", "aff": "Rice University;Rice University", "aff_domain": "rice.edu;rice.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nliao2022on,\ntitle={On the Convergence of Shallow Neural Network Training with Randomly Masked Neurons},\nauthor={Fangshuo Liao and Anastasios Kyrillidis},\nyear={2022},\nurl={https://openreview.net/forum?id=ebZ0gGRJwQx}\n}", "github": "", "project": "", "reviewers": "Xapj;fUZe;3Yqj;5dbR", "site": "https://openreview.net/forum?id=ebZ0gGRJwQx", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "1;3;0;0", "wc_summary_paper": "185;37;83;96", "wc_summary_review": "94;40;30;33", "wc_main_review": "309;220;275;167", "wc_review": "588;297;388;296", "wc_reply_reviewers": "72;0;51;95", "wc_reply_authors": "655;607;393;655", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 100.25, 53.6161123171011 ], "wc_summary_review_avg": [ 49.25, 26.089988501338976 ], "wc_main_review_avg": [ 242.75, 54.04801106423806 ], "wc_review_avg": [ 392.25, 119.03019574880989 ], "wc_reply_reviewers_avg": [ 54.5, 35.103418636936205 ], "wc_reply_authors_avg": [ 577.5, 108.30858691719692 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5787256965156136082&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Rice University", "aff_unique_dep": "", "aff_unique_url": "https://www.rice.edu", "aff_unique_abbr": "Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "ebl1ssKFHBb", "title": "Differentiable Discrete Device-to-System Codesign for Optical Neural Networks via Gumbel-Softmax", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks (DNNs) have significantly improved performance in many areas, such as large-scale computer vision and natural language processing. 
While conventional DNNs implemented on digital platforms have intrinsic limitations in computation and memory requirements, optical neural networks (ONNs), such as diffractive optical neural networks (DONNs), have attracted substantial attention as they can bring significant advantages in terms of power efficiency, parallelism, and computational speed. In order to train DONNs, fully differentiable physical optical propagations have been developed, which can be used to train the physical parameters in optical systems using conventional gradient descent algorithms. However, inversely mapping algorithm-trained physical model parameters onto the applied stimulus in real-world optical devices is a non-trivial task, which can involve multiple imperfections (e.g., quantization and non-monotonicity) and is especially challenging in complex-valued domains. This work proposes a novel device-to-system hardware-software codesign framework, which enables efficient training of DONNs w.r.t. arbitrary experimentally measured optical devices across layers. Specifically, Gumbel-Softmax with a novel complex-domain regularization method is employed to enable differentiable one-to-one mapping from discrete device parameters into the forward function of DONNs, where the physical parameters in DONNs can be trained by simply minimizing the loss function of the ML task. The experimental results have demonstrated significant advantages over traditional quantization-based methods with low-precision optical devices (e.g., 8 discrete values), with ~20% accuracy improvements for MNIST and ~28% for FashionMNIST. More importantly, our framework provides high versatility in codesign even for one system implemented with mixed optical devices. In addition, we include comprehensive studies of regularization analysis, temperature scheduling exploration, and runtime complexity evaluation of the proposed framework.", "keywords": "hardware-software codesign;optical neural network;regularization", "primary_area": "", "supplementary_material": "/attachment/6730faf09a29dedeb00ad4748e741b1b6bd4d12d.zip", "author": "Yingjie Li;Ruiyang Chen;Weilu Gao;CUNXI YU", "authorids": "~Yingjie_Li1;ruiyang.chen@utah.edu;~Weilu_Gao1;~CUNXI_YU1", "gender": "F;;M;", "homepage": "https://www.ece.utah.edu;;;", "dblp": ";;;", "google_scholar": ";;5v99W3kAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yingjie_Li1;ruiyang.chen@utah.edu;~Weilu_Gao1;~CUNXI_YU1", "aff": "University of Utah;;University of Utah;", "aff_domain": "utah.edu;;utah.edu;", "position": "PhD student;;Assistant Professor;", "bibtex": "@misc{\nli2022differentiable,\ntitle={Differentiable Discrete Device-to-System Codesign for Optical Neural Networks via Gumbel-Softmax},\nauthor={Yingjie Li and Ruiyang Chen and Weilu Gao and CUNXI YU},\nyear={2022},\nurl={https://openreview.net/forum?id=ebl1ssKFHBb}\n}", "github": "", "project": "", "reviewers": "RVdT;UdJ7;gePF;RAke", "site": "https://openreview.net/forum?id=ebl1ssKFHBb", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "39;59;89;39", "wc_summary_review": "107;20;48;29", "wc_main_review": "67;310;246;276", "wc_review": "213;389;383;344", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], 
"technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 56.5, 20.463381929681123 ], "wc_summary_review_avg": [ 51.0, 33.87476937190864 ], "wc_main_review_avg": [ 224.75, 93.84928076442569 ], "wc_review_avg": [ 332.25, 70.98371292064117 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O21TGgg53s4J:scholar.google.com/&scioq=Differentiable+Discrete+Device-to-System+Codesign+for+Optical+Neural+Networks+via+Gumbel-Softmax&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "Utah", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "An Information Fusion Approach to Learning with Instance-Dependent Label Noise", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7088", "id": "ecH2FKaARUp", "poster": "", "openreview": "https://openreview.net/forum?id=ecH2FKaARUp", "slides": "https://iclr.cc/virtual/2022/poster/7088", "video": "https://iclr.cc/virtual/2022/poster/7088", "author_site": "Zhimeng Jiang, Kaixiong Zhou, Zirui Liu, Li Li, Rui Chen, Soo-Hyun Choi, Xia Hu", "tldr": "", "abstract": "Instance-dependent label noise (IDN) widely exists in real-world datasets and usually misleads the training of deep neural networks. Noise transition matrix (NTM) (i.e., the probability that clean labels flip into noisy labels) is used to characterize the label noise and can be adopted to bridge the gap between clean and noisy underlying data distributions. However, most instances are long-tail, i.e., the number of occurrences of each instance is usually limited, which leads to the gap between the underlying distribution and the empirical distribution. Therefore, the genuine problem caused by IDN is \\emph{empirical}, instead of underlying, \\emph{data distribution mismatch} during training. To directly tackle the empirical distribution mismatch problem, we propose \\emph{posterior transition matrix} (PTM) to posteriorly model label noise given limited observed noisy labels, which achieves \\emph{statistically consistent classifiers}. Note that even if an instance is corrupted by the same NTM, the intrinsic randomness incurs different noisy labels, and thus requires different correction methods. Motivated by this observation, we propose an \\textbf{I}nformation \\textbf{F}usion (IF) approach to fine-tune the NTM based on the estimated PTM. Specifically, we adopt the noisy labels and model predicted probabilities to estimate the PTM and then correct the NTM in \\emph{forward propagation}. Empirical evaluations on synthetic and real-world datasets demonstrate that our method is superior to the state-of-the-art approaches, and achieves more stable training for instance-dependent label noise. 
", "keywords": "Instance-dependent label noise;posterior transition matrix;statiscally consistent classifier", "primary_area": "", "supplementary_material": "", "author": "Zhimeng Jiang;Kaixiong Zhou;Zirui Liu;Li Li;Rui Chen;Soo-Hyun Choi;Xia Hu", "authorids": "~Zhimeng_Jiang1;~Kaixiong_Zhou1;~Zirui_Liu1;~Li_Li11;~Rui_Chen4;~Soo-Hyun_Choi1;~Xia_Hu4", "gender": "M;M;M;M;;M;", "homepage": "http://www.zhimengjiang.com/;https://kaixiong-zhou.github.io/;https://zirui-ray-liu.github.io/;;;;", "dblp": "217/3235;178/7315;196/8629-1.html;53/2189-35;;185/1826;", "google_scholar": "5Es3Yk4AAAAJ;zMspIjIAAAAJ;https://scholar.google.com/citations?hl=zh-CN;FPcI7HkAAAAJ;;Lm1BDEoAAAAJ;", "orcid": "0000-0001-6933-3952;0000-0001-5226-8736;;0000-0002-3365-8904;;0000-0001-5768-9978;", "linkedin": ";;;li-li-b8a08664/;;soo-hyun-choi-706b5297/;", "or_profile": "~Zhimeng_Jiang1;~Kaixiong_Zhou1;~Zirui_Liu1;~Li_Li11;~Rui_Chen4;~Soo-Hyun_Choi1;~Xia_Hu4", "aff": "Texas A&M University;Rice University;Rice University;Samsung;;Samsung Electronics America;", "aff_domain": "tamu.edu;rice.edu;rice.edu;samsung.com;;samsung.com;", "position": "PhD student;PhD student;PhD student;Researcher;;Principal Researcher;", "bibtex": "@inproceedings{\njiang2022an,\ntitle={An Information Fusion Approach to Learning with Instance-Dependent Label Noise},\nauthor={Zhimeng Jiang and Kaixiong Zhou and Zirui Liu and Li Li and Rui Chen and Soo-Hyun Choi and Xia Hu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ecH2FKaARUp}\n}", "github": "", "project": "", "reviewers": "d7yz;8i2r;oWz1;cR7o", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;2;4;5", "correctness": "2;3;3;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;3;4;2", "wc_summary_paper": "43;85;81;63", "wc_summary_review": "50;38;56;302", "wc_main_review": "355;228;357;70", "wc_review": "448;351;494;435", "wc_reply_reviewers": "771;102;0;55", "wc_reply_authors": "3893;947;1218;1125", "reply_reviewers": "4;1;0;1", "reply_authors": "9;2;2;3", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 68.0, 16.64331697709324 ], "wc_summary_review_avg": [ 111.5, 110.17599557072312 ], "wc_main_review_avg": [ 252.5, 117.61483749935634 ], "wc_review_avg": [ 432.0, 51.647846034466916 ], "wc_reply_reviewers_avg": [ 232.0, 313.2786299765753 ], "wc_reply_authors_avg": [ 1795.75, 1214.7566371500095 ], "reply_reviewers_avg": [ 1.5, 1.5 ], "reply_authors_avg": [ 4.0, 2.9154759474226504 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.662266178532522, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18051756974220714733&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=ecH2FKaARUp", "email": "tamu.edu;rice.edu;rice.edu;samsung.com;;samsung.com;", "author_num": 7, "aff_unique_index": "0;1;1;2;2", "aff_unique_norm": "Texas A&M University;Rice University;Samsung", "aff_unique_dep": ";;Samsung", "aff_unique_url": "https://www.tamu.edu;https://www.rice.edu;https://www.samsung.com", "aff_unique_abbr": "TAMU;Rice;Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", 
"aff_country_unique": "United States;South Korea" }, { "id": "edN_G_4njyi", "title": "On the Impact of Client Sampling on Federated Learning Convergence", "track": "main", "status": "Reject", "tldr": "", "abstract": "While clients' sampling is a central operation of current state-of-the-art federated learning (FL) approaches, the impact of this procedure on the convergence and speed of FL remains under-investigated.In this work we introduce a novel decomposition theorem for the convergence of FL, allowing to clearly quantify the impact of client sampling on the global model update. Contrarily to previous convergence analyses, our theorem provides the exact decomposition of a given convergence step, thus enabling accurate considerations about the role of client sampling and heterogeneity. First, we provide a theoretical ground for previously reported experimental results on the relationship between FL convergence and the variance of the aggregation weights. Second, we prove for the first time that the quality of FL convergence is also impacted by the resulting \\emph{covariance} between aggregation weights. Our theory is general, and is here applied to Multinomial Distribution (MD) and Uniform sampling, the two default client sampling schemes of FL, and demonstrated through a series of experiments in non-iid and unbalanced scenarios. Our results suggest that MD sampling should be used as default sampling scheme, due to the resilience to the changes in data ratio during the learning process, while Uniform sampling is superior only in the special case when clients have the same amount of data.", "keywords": "Federated learning;client sampling;bias;convergence rate;distributed optimization;data heterogeneity", "primary_area": "", "supplementary_material": "/attachment/f32c9a2234c1afd588d941d05c1947bbf0c38819.zip", "author": "Yann Fraboni;Richard Vidal;Laetitia Kameni;Marco Lorenzi", "authorids": "~Yann_Fraboni1;~Richard_Vidal1;~Laetitia_Kameni1;~Marco_Lorenzi1", "gender": "M;M;F;M", "homepage": "https://www.linkedin.com/in/yannfraboni;;;https://marcolorenzi.github.io", "dblp": ";;;http://dblp.uni-trier.de/pers/hd/l/Lorenzi:Marco", "google_scholar": ";;;gOkjmJcAAAAJ", "orcid": ";;;", "linkedin": ";richardvidal/;laetitia-kameni-7111796/;marco-lorenzi-45105785/", "or_profile": "~Yann_Fraboni1;~Richard_Vidal1;~Laetitia_Kameni1;~Marco_Lorenzi1", "aff": "Accenture;Accenture;Accenture;Inria, France", "aff_domain": "accenture.com;accenture.com;accenture.com;inria.fr", "position": "PhD student;Researcher;Researcher;Principal Researcher", "bibtex": "@misc{\nfraboni2022on,\ntitle={On the Impact of Client Sampling on Federated Learning Convergence},\nauthor={Yann Fraboni and Richard Vidal and Laetitia Kameni and Marco Lorenzi},\nyear={2022},\nurl={https://openreview.net/forum?id=edN_G_4njyi}\n}", "github": "", "project": "", "reviewers": "FJmg;c4xq;Sg3m;mcBg;LQPA", "site": "https://openreview.net/forum?id=edN_G_4njyi", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "3;4;4;3;5", "correctness": "4;4;4;3;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;2;2;3;2", "wc_summary_paper": "40;158;68;33;89", "wc_summary_review": "19;95;47;60;79", "wc_main_review": "371;294;536;356;373", "wc_review": "430;547;651;449;541", "wc_reply_reviewers": "114;816;278;215;107", "wc_reply_authors": "466;1141;819;662;523", "reply_reviewers": "1;2;1;1;2", "reply_authors": "2;2;2;2;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.8, 
0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 77.6, 44.92037399666214 ], "wc_summary_review_avg": [ 60.0, 26.21449980449751 ], "wc_main_review_avg": [ 386.0, 80.32185256827684 ], "wc_review_avg": [ 523.6, 79.2555360842383 ], "wc_reply_reviewers_avg": [ 306.0, 262.9182382414731 ], "wc_reply_authors_avg": [ 722.2, 242.36616925635477 ], "reply_reviewers_avg": [ 1.4, 0.4898979485566356 ], "reply_authors_avg": [ 1.8, 0.4 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5345224838248488, "corr_recommendation_correctness": -0.24999999999999994, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7263315350369366254&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Accenture;INRIA", "aff_unique_dep": ";", "aff_unique_url": "https://www.accenture.com;https://www.inria.fr", "aff_unique_abbr": "Accenture;Inria", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;France" }, { "title": "Surrogate Gap Minimization Improves Sharpness-Aware Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6070", "id": "edONMAnhLu-", "poster": "", "openreview": "https://openreview.net/forum?id=edONMAnhLu-", "slides": "https://iclr.cc/virtual/2022/poster/6070", "video": "https://iclr.cc/virtual/2022/poster/6070", "author_site": "Juntang Zhuang, Boqing Gong, Liangzhe Yuan, Yin Cui, Hartwig Adam, Nicha C Dvornek, sekhar tatikonda, James s Duncan, Ting Liu", "tldr": "", "abstract": "The recently proposed Sharpness-Aware Minimization (SAM) improves generalization by minimizing a perturbed loss defined as the maximum loss within a neighborhood in the parameter space. However, we show that both sharp and flat minima can have a low perturbed loss, implying that SAM does not always prefer flat minima. Instead, we define a surrogate gap, a measure equivalent to the dominant eigenvalue of Hessian at a local minimum when the radius of neighborhood (to derive the perturbed loss) is small. The surrogate gap is easy to compute and feasible for direct minimization during training. Based on the above observations, we propose Surrogate Gap Guided Sharpness-Aware Minimization (GSAM), a novel improvement over SAM with negligible computation overhead. Conceptually, GSAM consists of two steps: 1) a gradient descent like SAM to minimize the perturbed loss, and 2) an ascent step in the orthogonal direction (after gradient decomposition) to minimize the surrogate gap and yet not affect the perturbed loss. GSAM seeks a region with both small loss (by step 1) and low sharpness (by step 2), giving rise to a model with high generalization capabilities. Theoretically, we show the convergence of GSAM and provably better generalization than SAM.Empirically, GSAM consistently improves generalization (e.g., +3.2% over SAM and +5.4% over AdamW on ImageNet top-1 accuracy for ViT-B/32). 
Code is released at https://sites.google.com/view/gsam-iclr22/home", "keywords": "generalization;sharpness-aware minimization;surrogate gap;deep learning", "primary_area": "", "supplementary_material": "/attachment/210c14e034871d4f8f5976c1586aaa7425b7d225.zip", "author": "Juntang Zhuang;Boqing Gong;Liangzhe Yuan;Yin Cui;Hartwig Adam;Nicha C Dvornek;sekhar tatikonda;James s Duncan;Ting Liu", "authorids": "~Juntang_Zhuang1;~Boqing_Gong1;~Liangzhe_Yuan2;~Yin_Cui1;~Hartwig_Adam1;~Nicha_C_Dvornek1;~sekhar_tatikonda1;~James_s_Duncan1;~Ting_Liu4", "gender": "M;M;M;M;He/him;F;;M;", "homepage": "https://juntang-zhuang.github.io/;http://boqinggong.info;https://yuanliangzhe.github.io;https://ycui.me/;https://research.google/people/author37870/;https://medicine.yale.edu/people/search/nicha_chitphakdithai.profile;;https://seas.yale.edu/faculty-research/faculty-directory/james-duncan;http://tliu.org", "dblp": "220/1417;29/7457;215/4356;47/8023.html;75/948;00/8526;;96/4489;52/5150-5", "google_scholar": "78_Vob4AAAAJ;lv9ZeVUAAAAJ;1H9CkZgAAAAJ;iP5m52IAAAAJ;fWd88tEAAAAJ;HrzdtUUAAAAJ;;_xvLDPoAAAAJ;4wSfAIQAAAAJ", "orcid": ";;;0000-0003-2882-2033;0000-0003-1258-4341;0000-0002-1648-6055;;;", "linkedin": ";boqing-gong-46aa5821/;;;hartwig-adam-1873392/;nicha-dvornek-0a99b027/;;;", "or_profile": "~Juntang_Zhuang1;~Boqing_Gong1;~Liangzhe_Yuan2;~Yin_Cui1;~Hartwig_Adam1;~Nicha_C_Dvornek1;~sekhar_tatikonda1;~James_s_Duncan1;~Ting_Liu4", "aff": "OpenAI;Google;Google DeepMind;Google;Google Research;Yale University;Yale University;Yale University;Google DeepMind", "aff_domain": "openai.com;google.com;google.com;google.com;google.com;yale.edu;yale.edu;yale.edu;google.com", "position": "Researcher;Research Scientist;Researcher;Research Scientist;Principal Researcher;Assistant Professor;;Professor;Researcher", "bibtex": "@inproceedings{\nzhuang2022surrogate,\ntitle={Surrogate Gap Minimization Improves Sharpness-Aware Training},\nauthor={Juntang Zhuang and Boqing Gong and Liangzhe Yuan and Yin Cui and Hartwig Adam and Nicha C Dvornek and sekhar tatikonda and James s Duncan and Ting Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=edONMAnhLu-}\n}", "github": "", "project": "", "reviewers": "jbS6;ianU;jYWK;6UYP", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "115;122;15;79", "wc_summary_review": "27;65;30;22", "wc_main_review": "327;287;238;142", "wc_review": "469;474;283;243", "wc_reply_reviewers": "20;0;0;16", "wc_reply_authors": "1380;782;0;192", "reply_reviewers": "1;0;0;1", "reply_authors": "3;1;0;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 82.75, 42.381452311123084 ], "wc_summary_review_avg": [ 36.0, 16.98528775146303 ], "wc_main_review_avg": [ 248.5, 69.09594778277523 ], "wc_review_avg": [ 367.25, 105.21971060595064 ], "wc_reply_reviewers_avg": [ 9.0, 9.1104335791443 ], "wc_reply_authors_avg": [ 588.5, 540.2450832723978 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 1.118033988749895 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 195, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12868288481762118791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=edONMAnhLu-", "email": "openai.com;google.com;google.com;google.com;google.com;yale.edu;yale.edu;yale.edu;google.com", "author_num": 9, "aff_unique_index": "0;1;1;1;1;2;2;2;1", "aff_unique_norm": "OpenAI;Google;Yale University", "aff_unique_dep": ";Google;", "aff_unique_url": "https://openai.com;https://www.google.com;https://www.yale.edu", "aff_unique_abbr": "OpenAI;Google;Yale", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0;0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "edqz84cQ79T", "title": "Shaping latent representations using Self-Organizing Maps with Relevance Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent work indicates that Deep Clustering (DC) methods are a viable option for unsupervised representations learning of visual features. By combining representation learning and clustering, traditional approaches have been shown to build latent representations that capture essential features of the data while preserving topological characteristics. In this sense, models based on Self-Organizing Maps models with relevance learning (SOMRL) were considered as they perform well in clustering besides being able to create a map that learns the relevance of each input dimension for each cluster, preserving the original relations and topology of the data. We hypothesize that this type of model can produce a more intuitive and disentangled representation in the latent space by promoting smoother transitions between cluster points over time. This work proposes a representation learning framework that combines a new gradient-based SOMRL model and autoencoders. The SOMRL learns the relevance weights for each input dimension of each cluster. It creates a tendency to separate the information into subspaces. To achieve this, we designed a new loss function term that weighs these learned relevances and provides an estimated unsupervised error to be used in combination with a reconstruction loss. 
The model is evaluated in terms of clustering performance and quality of the learned representations and then compared with state-of-the-art models, showing competitive results.", "keywords": "Deep Clustering;Learning Prototypes;Topological Representations", "primary_area": "", "supplementary_material": "/attachment/143451ecbc3ea4a788c65fed9175390f297c0cd3.zip", "author": "Pedro Braga;Heitor Medeiros;Hansenclever Bassani", "authorids": "~Pedro_Braga1;hrm@cin.ufpe.br;~Hansenclever_Bassani1", "gender": "M;;M", "homepage": "https://phbraga.com/;;https://hfbassani.github.io/", "dblp": ";;93/6335", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.ca/citations?user=s14pJ00AAAAJ", "orcid": "0000-0003-3273-0617;;0000-0001-5307-9400", "linkedin": "pedromagalhaeshb/;;hansbassani/", "or_profile": "~Pedro_Braga1;hrm@cin.ufpe.br;~Hansenclever_Bassani1", "aff": "Universidade Federal de Pernambuco, Federal University of Pernambuco;;Universidade Federal de Pernambuco, Federal University of Pernambuco", "aff_domain": "cin.ufpe.br;;cin.ufpe.br", "position": "PhD student;;Assistant Professor", "bibtex": "@misc{\nbraga2022shaping,\ntitle={Shaping latent representations using Self-Organizing Maps with Relevance Learning},\nauthor={Pedro Braga and Heitor Medeiros and Hansenclever Bassani},\nyear={2022},\nurl={https://openreview.net/forum?id=edqz84cQ79T}\n}", "github": "", "project": "", "reviewers": "TaHG;pGQ2;EFAy", "site": "https://openreview.net/forum?id=edqz84cQ79T", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "4;5;3", "correctness": "2;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "0;2;2", "wc_summary_paper": "81;42;49", "wc_summary_review": "70;44;29", "wc_main_review": "415;601;211", "wc_review": "566;687;289", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 57.333333333333336, 16.97710877099579 ], "wc_summary_review_avg": [ 47.666666666666664, 16.937794687883333 ], "wc_main_review_avg": [ 409.0, 159.27334993651638 ], "wc_review_avg": [ 514.0, 166.5913163003002 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:k7AaJBdh2D8J:scholar.google.com/&scioq=Shaping+latent+representations+using+Self-Organizing+Maps+with+Relevance+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Universidade Federal de Pernambuco", "aff_unique_dep": "", "aff_unique_url": "https://ufpe.br", "aff_unique_abbr": "UFPE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Brazil" }, { "title": "Symbolic Learning to Optimize: Towards Interpretability and Scalability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6741", "id": "ef0nInZHKIC", "poster": "", "openreview": "https://openreview.net/forum?id=ef0nInZHKIC", "slides": 
"https://iclr.cc/virtual/2022/poster/6741", "video": "https://iclr.cc/virtual/2022/poster/6741", "author_site": "Wenqing Zheng, Tianlong Chen, Ting-Kuei Hu, Zhangyang Wang", "tldr": "", "abstract": "Recent studies on Learning to Optimize (L2O) suggest a promising path to automating and accelerating the optimization procedure for complicated tasks. Existing L2O models parameterize optimization rules by neural networks, and learn those numerical rules via meta-training. However, they face two common pitfalls: (1) scalability: the numerical rules represented by neural networks create extra memory overhead for applying L2O models, and limits their applicability to optimizing larger tasks; (2) interpretability: it is unclear what each L2O model has learned in its black-box optimization rule, nor is it straightforward to compare different L2O models in an explainable way. To avoid both pitfalls, this paper proves the concept that we can \"kill two birds by one stone\", by introducing the powerful tool of symbolic regression to L2O. In this paper, we establish a holistic symbolic representation and analysis framework for L2O, which yields a series of insights for learnable optimizers. Leveraging our findings, we further propose a lightweight L2O model that can be meta-trained on large-scale problems and outperformed human-designed and tuned optimizers. Our work is set to supply a brand-new perspective to L2O research. Codes are available at: https://github.com/VITA-Group/Symbolic-Learning-To-Optimize.", "keywords": "Symbolic Regression;Learning To Optimize;Interpretability", "primary_area": "", "supplementary_material": "/attachment/26ace5ae1da2325e4a24664d3cfb44cdb31ebb8e.zip", "author": "Wenqing Zheng;Tianlong Chen;Ting-Kuei Hu;Zhangyang Wang", "authorids": "~Wenqing_Zheng1;~Tianlong_Chen1;~Ting-Kuei_Hu1;~Zhangyang_Wang1", "gender": "M;M;M;M", "homepage": "https://wenqing-zheng.github.io;https://tianlong-chen.github.io;;https://vita-group.github.io", "dblp": ";;149/5032;119/4026", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;LE3ctn0AAAAJ;;pxFyKAIAAAAJ", "orcid": "0000-0002-8283-7511;0000-0001-7774-8197;;", "linkedin": ";tianlong-chen-783862167/;;", "or_profile": "~Wenqing_Zheng1;~Tianlong_Chen1;~Ting-Kuei_Hu1;~Zhangyang_Wang1", "aff": "University of Texas, Austin;University of Texas, Austin;;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;;utexas.edu", "position": "PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nzheng2022symbolic,\ntitle={Symbolic Learning to Optimize: Towards Interpretability and Scalability},\nauthor={Wenqing Zheng and Tianlong Chen and Ting-Kuei Hu and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ef0nInZHKIC}\n}", "github": "", "project": "", "reviewers": "ALsy;VHkg;mSQF;FzGU;jXyx", "pdf_size": 0, "recommendation": "5;6;6;6;6", "confidence": "4;3;4;4;4", "correctness": "2;3;3;2;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "1;3;2;2;3", "wc_summary_paper": "58;90;74;59;89", "wc_summary_review": "43;58;160;40;208", "wc_main_review": "438;114;926;529;356", "wc_review": "539;262;1160;628;653", "wc_reply_reviewers": "484;0;158;241;336", "wc_reply_authors": "1950;545;2330;1359;1132", "reply_reviewers": "1;0;1;2;2", "reply_authors": "5;2;4;6;5", "recommendation_avg": [ 5.8, 0.39999999999999997 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": 
[ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 74.0, 13.870832707519762 ], "wc_summary_review_avg": [ 101.8, 69.08082223019642 ], "wc_main_review_avg": [ 472.6, 265.3824410167334 ], "wc_review_avg": [ 648.4, 291.01381410510396 ], "wc_reply_reviewers_avg": [ 243.8, 163.0980073452769 ], "wc_reply_authors_avg": [ 1463.2, 624.8133801384217 ], "reply_reviewers_avg": [ 1.2, 0.7483314773547883 ], "reply_authors_avg": [ 4.4, 1.3564659966250536 ], "replies_avg": [ 36, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.2500000000000001, "corr_recommendation_correctness": 0.6123724356957948, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9878665703631985766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=ef0nInZHKIC", "email": "utexas.edu;utexas.edu;;utexas.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "egkbgeGcGtj", "title": "Multi-dataset Pretraining: A Unified Model for Semantic Segmentation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Collecting annotated data for semantic segmentation is time-consuming and hard to scale up. In this paper, we propose a unified framework, termed Multi-Dataset Pretraining (MDP), to efficiently integrate the fragmented annotations of different datasets. The highlight is that the annotations from different datasets can be shared and consistently boost performance for each specific one. Towards this goal, we propose a pixel-to-prototype contrastive learning strategy over multiple datasets regardless of their taxonomy labels. In this way, the pixel-level embeddings with the same labels are well clustered, which we find is beneficial for downstream tasks. In order to model the relationship among images and classes from different datasets, we extend the pixel-level embeddings via cross-dataset mixing and propose a pixel-to-prototype consistency regularization for better transferability. MDP can be seamlessly extended to the semi-supervised setting and utilizes the widely available unlabeled data to further boost the feature representation.
Experiments conducted on several benchmarks demonstrate its superior performance, and MDP consistently outperforms the pretrained models over ImageNet by a considerable margin.", "keywords": "Multi-dataset;semantic segmentation;contrastive learning", "primary_area": "", "supplementary_material": "", "author": "Bowen Shi;XIAOPENG ZHANG;Haohang Xu;Wenrui Dai;Junni Zou;Hongkai Xiong;Qi Tian", "authorids": "~Bowen_Shi2;~XIAOPENG_ZHANG7;~Haohang_Xu1;~Wenrui_Dai1;~Junni_Zou1;~Hongkai_Xiong1;~Qi_Tian3", "gender": "M;M;M;;F;M;M", "homepage": ";https://sites.google.com/site/zxphistory/;;;http://www.cs.sjtu.edu.cn/~zou-jn;http://min.sjtu.edu.cn;https://www.qitian1987.com/index.html", "dblp": ";;254/0948;16/5135.html;91/4613;21/3569;78/1467-1.html", "google_scholar": "lJHbpY0AAAAJ;Ud6aBAcAAAAJ;;Xg8MhyAAAAAJ;https://scholar.google.com/citations?hl=zh-CN;bB16iN4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-4715-1338;;;0000-0003-4552-0029;0000-0002-7252-5047", "linkedin": ";;;;;;", "or_profile": "~Bowen_Shi2;~XIAOPENG_ZHANG7;~Haohang_Xu1;~Wenrui_Dai1;~Junni_Zou1;~Hongkai_Xiong1;~Qi_Tian3", "aff": "Shanghai Jiaotong University;Huawei Technologies Ltd.;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Huawei Technologies Ltd.", "aff_domain": "sjtu.edu.cn;huawei.com;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;huawei.com", "position": "PhD student;Principal Researcher;PhD student;Associate Professor;Full Professor;Full Professor;Principal Researcher", "bibtex": "@misc{\nshi2022multidataset,\ntitle={Multi-dataset Pretraining: A Unified Model for Semantic Segmentation},\nauthor={Bowen Shi and XIAOPENG ZHANG and Haohang Xu and Wenrui Dai and Junni Zou and Hongkai Xiong and Qi Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=egkbgeGcGtj}\n}", "github": "", "project": "", "reviewers": "v6So;xYA7;NDJu;AayQ", "site": "https://openreview.net/forum?id=egkbgeGcGtj", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;4", "correctness": "2;3;2;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "37;70;61;42", "wc_summary_review": "71;65;100;20", "wc_main_review": "605;382;591;182", "wc_review": "713;517;752;244", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 52.5, 13.5 ], "wc_summary_review_avg": [ 64.0, 28.64437117480501 ], "wc_main_review_avg": [ 440.0, 173.17187993435886 ], "wc_review_avg": [ 556.5, 201.20201291239607 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.7608859102526822, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18414068560599533082&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0;0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SJTU;Huawei", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Fortuitous Forgetting in Connectionist Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6512", "id": "ei3SY1_zYsE", "poster": "", "openreview": "https://openreview.net/forum?id=ei3SY1_zYsE", "slides": "https://iclr.cc/virtual/2022/poster/6512", "video": "https://iclr.cc/virtual/2022/poster/6512", "author_site": "Hattie Zhou, Ankit Vani, Hugo Larochelle, Aaron Courville", "tldr": "", "abstract": "Forgetting is often seen as an unwanted characteristic in both human and machine learning. However, we propose that forgetting can in fact be favorable to learning. We introduce forget-and-relearn as a powerful paradigm for shaping the learning trajectories of artificial neural networks. In this process, the forgetting step selectively removes undesirable information from the model, and the relearning step reinforces features that are consistently useful under different conditions. The forget-and-relearn framework unifies many existing iterative training algorithms in the image classification and language emergence literature, and allows us to understand the success of these algorithms in terms of the disproportionate forgetting of undesirable information. We leverage this understanding to improve upon existing algorithms by designing more targeted forgetting operations. Insights from our analysis provide a coherent view on the dynamics of iterative training in neural networks and offer a clear path towards performance improvements.", "keywords": "Neural Networks;Generalization;Iterative Training;Compositionality;Iterated Learning", "primary_area": "", "supplementary_material": "", "author": "Hattie Zhou;Ankit Vani;Hugo Larochelle;Aaron Courville", "authorids": "~Hattie_Zhou1;~Ankit_Vani1;~Hugo_Larochelle1;~Aaron_Courville3", "gender": "F;M;M;", "homepage": "http://hattiezhou.com;https://ankitvani.com/;https://mila.quebec/en/directory/hugo-larochelle;", "dblp": ";178/2855;86/3862.html;56/1688", "google_scholar": ";KtnTuq8AAAAJ;https://scholar.google.ca/citations?user=U89FHq4AAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ", "orcid": ";;;", "linkedin": ";ankitvani/;;", "or_profile": "~Hattie_Zhou1;~Ankit_Vani1;~Hugo_Larochelle1;~Aaron_Courville3", "aff": "University of Montreal;Mila;Universit\u00e9 de Sherbrooke;Universit\u00e9 de Montr\u00e9al", "aff_domain": "umontreal.ca;mila.quebec;usherbrooke.ca; ", "position": "PhD student;PhD student;Adjunct Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhou2022fortuitous,\ntitle={Fortuitous Forgetting in Connectionist Networks},\nauthor={Hattie Zhou and Ankit Vani and Hugo Larochelle and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ei3SY1_zYsE}\n}", "github": "", "project": "", "reviewers": "hRdL;SWHW;UMJe;MVhg", "pdf_size": 0, "recommendation": "6;6;6;10", "confidence": "3;4;4;4", "correctness": "3;2;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;2;2;4", "wc_summary_paper": "234;140;75;64", "wc_summary_review": "60;55;111;74", "wc_main_review": "391;458;637;371", "wc_review": "685;653;823;509", "wc_reply_reviewers": "0;31;0;171", "wc_reply_authors": "1119;976;1502;1038", "reply_reviewers": "0;1;0;1", "reply_authors": "2;2;3;2", "recommendation_avg": [ 7.0, 1.7320508075688772 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], 
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 128.25, 67.61055760752163 ], "wc_summary_review_avg": [ 75.0, 21.920310216782973 ], "wc_main_review_avg": [ 464.25, 104.81263044118299 ], "wc_review_avg": [ 667.5, 111.60085125123375 ], "wc_reply_reviewers_avg": [ 50.5, 70.71244586351118 ], "wc_reply_authors_avg": [ 1158.75, 204.5597406627218 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=603488555859414419&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=ei3SY1_zYsE", "email": "umontreal.ca;mila.quebec;usherbrooke.ca; ", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Montreal;Mila;Universit\u00e9 de Sherbrooke;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": ";Quebec Artificial Intelligence Institute;;", "aff_unique_url": "https://wwwumontreal.ca;https://mila.quebec;https://www.usherbrooke.ca;https://www.umontreal.ca", "aff_unique_abbr": "UM;Mila;UdeS;UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "id": "eiwpbi3iwr", "title": "Neuronal Learning Analysis using Cycle-Consistent Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding how activity in neural circuits reshapes following task learning could reveal fundamental mechanisms of learning. Thanks to the recent advances in neural imaging technologies, high-quality recordings can be obtained from hundreds of neurons over multiple days or even weeks. However, the complexity and dimensionality of population responses pose significant challenges for analysis. Existing methods of studying neuronal adaptation and learning often impose strong assumptions on the data or model, resulting in biased descriptions that do not generalize. In this work, we use a variant of deep generative models called - cycle-consistent adversarial networks, to learn the unknown mapping between pre- and post-learning neuronal activities recorded $\\textit{in vivo}$. To do so, we develop an end-to-end pipeline to preprocess, train and evaluate calcium fluorescence signals, and a procedure to interpret the resulting deep learning models. To assess the validity of our method, we first test our framework on a synthetic dataset with known ground-truth transformation. Subsequently, we applied our method to neuronal activities recorded from the primary visual cortex of behaving mice, where the mice transition from novice to expert-level performance in a visual-based virtual reality experiment. We evaluate model performance on generated calcium imaging signals and their inferred spike trains. To maximize performance, we derive a novel approach to pre-sort neurons such that convolutional-based networks can take advantage of the spatial information that exists in neuronal activities. In addition, we incorporate visual explanation methods to improve the interpretability of our work and gain insights into the learning process as manifested in the cellular activities. 
Together, our results demonstrate that analyzing neuronal learning processes with data-driven deep unsupervised methods holds the potential to unravel changes in an unbiased way.", "keywords": "neuronal learning;unsupervised learning;calcium imaging;generative adversarial networks;cycle-consistent adversarial networks;explainable AI", "primary_area": "", "supplementary_material": "/attachment/8e71275f9355f3f22a608200eb7861301e8c3de9.zip", "author": "Bryan M. Li;Theoklitos Amvrosiadis;Nathalie L. Rochefort;Arno Onken", "authorids": "~Bryan_M._Li1;t.amvrosiadis@ed.ac.uk;n.rochefort@ed.ac.uk;~Arno_Onken1", "gender": "M;;;M", "homepage": "https://bryanli.io;;;https://homepages.inf.ed.ac.uk/aonken/", "dblp": "213/8145;;;15/2035", "google_scholar": "QQrzFdAAAAAJ;;;JQh31ekAAAAJ", "orcid": "0000-0003-3144-4838;;;0000-0001-7387-5535", "linkedin": ";;;", "or_profile": "~Bryan_M._Li1;t.amvrosiadis@ed.ac.uk;n.rochefort@ed.ac.uk;~Arno_Onken1", "aff": "University of Edinburgh;;;University of Edinburgh", "aff_domain": "ed.ac.uk;;;ed.ac.uk", "position": "PhD student;;;Assistant Professor", "bibtex": "@misc{\nli2022neuronal,\ntitle={Neuronal Learning Analysis using Cycle-Consistent Adversarial Networks},\nauthor={Bryan M. Li and Theoklitos Amvrosiadis and Nathalie L. Rochefort and Arno Onken},\nyear={2022},\nurl={https://openreview.net/forum?id=eiwpbi3iwr}\n}", "github": "", "project": "", "reviewers": "aJ3M;vrSP;cz5b", "site": "https://openreview.net/forum?id=eiwpbi3iwr", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;3;5", "correctness": "3;4;3", "technical_novelty": "2;3;1", "empirical_novelty": "3;2;3", "wc_summary_paper": "65;60;69", "wc_summary_review": "42;44;43", "wc_main_review": "617;671;291", "wc_review": "724;775;403", "wc_reply_reviewers": "137;78;0", "wc_reply_authors": "1023;780;397", "reply_reviewers": "1;1;0", "reply_authors": "3;2;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 64.66666666666667, 3.681787005729087 ], "wc_summary_review_avg": [ 43.0, 0.816496580927726 ], "wc_main_review_avg": [ 526.3333333333334, 167.85972980107198 ], "wc_review_avg": [ 634.0, 164.66329281293994 ], "wc_reply_reviewers_avg": [ 71.66666666666667, 56.10902086315731 ], "wc_reply_authors_avg": [ 733.3333333333334, 257.6849928799804 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5565944935183361939&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "ejFdNNTSq1", "title": "Analyzing the Implicit Position Encoding Ability of Transformer Decoder", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A common limitation of Transformer Encoder's self-attention mechanism is that it cannot automatically 
capture the information of word order, so one needs to feed the explicit position encodings into the target model. On the other hand, Transformer Decoder with the auto-regressive attention masks is naturally sensitive to word order information. In this work, based on an analysis of the implicit position encoding power of the Transformer Decoder, we obtain the condition that at least two layers are required for the Decoder to encode word positions. To examine the correlations between the implicit and explicit position encodings from the Transformer Encoder and Decoder respectively, we conduct extensive experiments on two large Wikipedia datasets, which demonstrate that all kinds of explicit position encoding mechanisms improve the performance of the Decoder, but the gap for learnable position embeddings is smaller than for the others. To make use of the power of implicit position encoding, we propose a new model, called \textit{DecBERT}, and fine-tune it on GLUE benchmarks. Experimental results show that (1) the implicit position encoding ability is strong enough to enhance language modeling and perform well on downstream tasks; and (2) our model accelerates the pre-training process and achieves superior performance to the baseline systems when pre-training with the same amount of computational resources.", "keywords": "NLP;Transformer;BERT;Position Encodings", "primary_area": "", "supplementary_material": "", "author": "Ziyang Luo;Yadong Xi;Jing Ma;Xiaoxi Mao;Changjie Fan", "authorids": "~Ziyang_Luo2;~Yadong_Xi1;majing@hkbu.edu.hk;maoxiaoxi@corp.netease.com;~Changjie_Fan1", "gender": "M;M;;;M", "homepage": "https://chiyeunglaw.github.io/;;;;", "dblp": ";;;;71/882", "google_scholar": "VI8NeJEAAAAJ;https://scholar.google.com.tw/citations?user=G6_hgq4AAAAJ;;;", "orcid": ";;;;0000-0001-5420-0516", "linkedin": "ziyang-luo-681a17192/;;;;", "or_profile": "~Ziyang_Luo2;~Yadong_Xi1;majing@hkbu.edu.hk;maoxiaoxi@corp.netease.com;~Changjie_Fan1", "aff": "NetEase, Inc.;Netease, Fuxi AILab;;;Netease, Fuxi AI Lab", "aff_domain": "netease.com;163.com;;;corp.netease.com", "position": "Intern;Researcher;;;Principal Researcher", "bibtex": "@misc{\nluo2022analyzing,\ntitle={Analyzing the Implicit Position Encoding Ability of Transformer Decoder},\nauthor={Ziyang Luo and Yadong Xi and Jing Ma and Xiaoxi Mao and Changjie Fan},\nyear={2022},\nurl={https://openreview.net/forum?id=ejFdNNTSq1}\n}", "github": "", "project": "", "reviewers": "VGv5;NpFV;EzjF;JHEg", "site": "https://openreview.net/forum?id=ejFdNNTSq1", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;3", "correctness": "2;2;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "60;78;32;80", "wc_summary_review": "5;31;80;14", "wc_main_review": "319;170;79;222", "wc_review": "384;279;191;316", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.5, 19.254869513969705 ], "wc_summary_review_avg": [ 32.5, 28.96981187374195 ], "wc_main_review_avg": [ 197.5, 86.83461291443638 ], "wc_review_avg": [ 292.5, 69.65809357138623 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ],
"replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OCK3utTCSUYJ:scholar.google.com/&scioq=Analyzing+the+Implicit+Position+Encoding+Ability+of+Transformer+Decoder&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "NetEase, Inc.;Netease", "aff_unique_dep": ";Fuxi AILab", "aff_unique_url": "https://www.163.com;https://www.netease.com", "aff_unique_abbr": "NetEase;Netease", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Differentiable Prompt Makes Pre-trained Language Models Better Few-shot Learners", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6259", "id": "ek9a0qIafW", "poster": "", "openreview": "https://openreview.net/forum?id=ek9a0qIafW", "slides": "https://iclr.cc/virtual/2022/poster/6259", "video": "https://iclr.cc/virtual/2022/poster/6259", "author_site": "Ningyu Zhang, Luoqiu Li, Xiang Chen, Shumin Deng, Zhen Bi, Chuanqi Tan, Fei Huang, Huajun Chen", "tldr": "", "abstract": "Large-scale pre-trained language models have contributed significantly to natural language processing by demonstrating remarkable abilities as few-shot learners. However, their effectiveness depends mainly on scaling the model parameters and prompt design, hindering their implementation in most real-world applications. This study proposes a novel pluggable, extensible, and efficient approach named DifferentiAble pRompT (DART), which can convert small language models into better few-shot learners. The main principle behind this approach involves reformulating potential natural language processing tasks into the task of a pre-trained language model and differentially optimizing the prompt template as well as the target label with backpropagation. Furthermore, the proposed approach can be: (i) Plugged to any pre-trained language models; (ii) Extended to widespread classification tasks. 
A comprehensive evaluation of standard NLP tasks demonstrates that the proposed approach achieves a better few-shot performance.", "keywords": "prompt-tuning;pre-trained language model;few-shot learning", "primary_area": "", "supplementary_material": "/attachment/925f8944c1fc31182868c4e6175b784943f5d78a.zip", "author": "Ningyu Zhang;Luoqiu Li;Xiang Chen;Shumin Deng;Zhen Bi;Chuanqi Tan;Fei Huang;Huajun Chen", "authorids": "~Ningyu_Zhang1;~Luoqiu_Li1;~Xiang_Chen5;~Shumin_Deng1;~Zhen_Bi1;~Chuanqi_Tan3;~Fei_Huang2;~Huajun_Chen1", "gender": "M;M;M;F;M;M;M;M", "homepage": "https://person.zju.edu.cn/en/ningyu;;https://faculty.nuaa.edu.cn/ChenXiang/zh_CN/index.htm;https://231sm.github.io/;;https://sites.google.com/view/fei-huang;;https://www.researchgate.net/profile/Bi-Zhen-2/research", "dblp": "139/4181-1.html;;64/3062-16;213/1853;148/4497;h/FeiHuang.html;94/5089;279/8441", "google_scholar": "xQDOPvsAAAAJ;;pXivdn8AAAAJ;3am3hL4AAAAJ;tOfo4ncAAAAJ;9r98PpoAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-1970-0678;;0000-0002-2594-0600;;0000-0002-6676-3057;;;", "linkedin": "ningyuzhang/;%E6%B3%BA%E7%A7%8B-%E6%9D%8E-6b4259177/;;;;fei-huang-cas-cmu;;", "or_profile": "~Ningyu_Zhang1;~Luoqiu_Li1;~Xiang_Chen5;~Shumin_Deng1;~Chuanqi_Tan3;~Fei_Huang2;~Huajun_Chen1;~Bi_Zhen1", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Alibaba Group;Alibaba Group US;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;alibaba-inc.com;alibaba-inc.com;zju.edu.cn;zju.edu.cn", "position": "Associate Professor;MS student;PhD student;PhD student;Full-time employee;Principal Researcher;Full Professor;PhD student", "bibtex": "@inproceedings{\nzhang2022differentiable,\ntitle={Differentiable Prompt Makes Pre-trained Language Models Better Few-shot Learners},\nauthor={Ningyu Zhang and Luoqiu Li and Xiang Chen and Shumin Deng and Zhen Bi and Chuanqi Tan and Fei Huang and Huajun Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ek9a0qIafW}\n}", "github": "", "project": "", "reviewers": "LUWM;H1jf;pzjK;EFFN", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "71;59;25;51", "wc_summary_review": "76;62;30;27", "wc_main_review": "123;179;191;308", "wc_review": "270;300;246;386", "wc_reply_reviewers": "0;0;0;104", "wc_reply_authors": "130;349;145;418", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 51.5, 16.874537030686206 ], "wc_summary_review_avg": [ 48.75, 20.873128658636684 ], "wc_main_review_avg": [ 200.25, 67.29552362527541 ], "wc_review_avg": [ 300.5, 52.94100490168278 ], "wc_reply_reviewers_avg": [ 26.0, 45.033320996790806 ], "wc_reply_authors_avg": [ 260.5, 125.50796787455369 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 220, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17540526705863454050&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=ek9a0qIafW", "email": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;alibaba-inc.com;alibaba-inc.com;zju.edu.cn;zju.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;1;1;0;0", "aff_unique_norm": "Zhejiang University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "ZJU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "eo1barn2Xmd", "title": "SLIM-QN: A Stochastic, Light, Momentumized Quasi-Newton Optimizer for Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose SLIM-QN, a light stochastic quasi-Newton optimizer for training large-scale deep neural networks (DNNs).\nSLIM-QN addresses two key barriers in existing second-order methods for large-scale DNNs: 1) the high computational cost of obtaining the Hessian matrix and its inverse in every iteration (e.g. KFAC); 2) convergence instability due to stochastic training (e.g. L-BFGS).\nTo tackle the first challenge,SLIM-QN directly approximates the Hessian inverse using past parameters and gradients, without explicitly constructing the Hessian matrix and then computing its inverse.\nTo achieve stable convergence, SLIM-QN introduces momentum in Hessian updates together with an adaptive damping mechanism.\nWe provide rigorous theoretical results on the convergence of SLIM-QN in a stochastic setting.\nWe also demonstrate that SLIM-QN has much less compute and memory overhead compared to existing second-order methods. \nTo better understand the limitations and benefits of SLIM-QN, we evaluate its performance on various datasets and network architectures.\nFor instance on large datasets such as ImageNet, we show that SLIM-QN achieves near optimal accuracy $1.5\\times$ faster when compared with SGD ($1.36\\times$ faster in wall-clock time) using the same compute resources.\nWe also show that SLIM-QN can readily be applied to other contemporary non-convolutional architectures such as Transformers.", "keywords": "Second-Order Methods;Stochastic Optimization;Deep Neural Networks", "primary_area": "", "supplementary_material": "/attachment/6865c37b44369a53474b9089eb30f2c1a4935599.zip", "author": "Yue Niu;Zalan Fabian;Sunwoo Lee;Mahdi Soltanolkotabi;Salman Avestimehr", "authorids": "~Yue_Niu1;~Zalan_Fabian1;~Sunwoo_Lee1;~Mahdi_Soltanolkotabi1;~Salman_Avestimehr1", "gender": ";M;M;M;", "homepage": ";https://z-fabian.github.io/;https://sites.google.com/view/sunwoolee;http://www-bcf.usc.edu/~soltanol/;", "dblp": ";192/2874;56/7811-1;75/6691;", "google_scholar": ";5EKjsXQAAAAJ;WA9KNNcAAAAJ;narJyMAAAAAJ;", "orcid": ";;0000-0001-6334-3068;;", "linkedin": ";;sunwoo-lee-90a7308a;;", "or_profile": "~Yue_Niu1;~Zalan_Fabian1;~Sunwoo_Lee1;~Mahdi_Soltanolkotabi1;~Salman_Avestimehr1", "aff": ";University of Southern California;University of Southern California;University of Southern California;", "aff_domain": ";usc.edu;usc.edu;usc.edu;", "position": ";PhD student;Postdoc;Associate Professor;", "bibtex": "@misc{\nniu2022slimqn,\ntitle={{SLIM}-{QN}: A Stochastic, Light, Momentumized Quasi-Newton Optimizer for Deep Neural Networks},\nauthor={Yue Niu and Zalan Fabian and Sunwoo Lee and Mahdi Soltanolkotabi and Salman Avestimehr},\nyear={2022},\nurl={https://openreview.net/forum?id=eo1barn2Xmd}\n}", "github": "", "project": "", "reviewers": "doqb;bMXz;aBmG;TATQ;EjsS", 
"site": "https://openreview.net/forum?id=eo1barn2Xmd", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;4;4;3;3", "correctness": "3;2;3;4;4", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;3;3;2;3", "wc_summary_paper": "126;79;140;61;68", "wc_summary_review": "56;94;15;29;56", "wc_main_review": "259;986;507;75;32", "wc_review": "441;1159;662;165;156", "wc_reply_reviewers": "0;672;0;178;0", "wc_reply_authors": "543;1218;758;795;139", "reply_reviewers": "0;2;0;3;0", "reply_authors": "1;2;1;4;1", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 94.8, 32.021242949017456 ], "wc_summary_review_avg": [ 50.0, 27.10719461692781 ], "wc_main_review_avg": [ 371.8, 349.7995997710689 ], "wc_review_avg": [ 516.6, 372.3410264797582 ], "wc_reply_reviewers_avg": [ 170.0, 260.2952170132982 ], "wc_reply_authors_avg": [ 690.6, 352.0344301343265 ], "reply_reviewers_avg": [ 1.0, 1.2649110640673518 ], "reply_authors_avg": [ 1.8, 1.1661903789690602 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9682458365518543, "corr_recommendation_correctness": 0.8451542547285165, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R_zKvyk1yRwJ:scholar.google.com/&scioq=SLIM-QN:+A+Stochastic,+Light,+Momentumized+Quasi-Newton+Optimizer+for+Deep+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "eoShjXqWkr", "title": "STransGAN: An Empirical Study on Transformer in GANs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformer becomes prevalent in computer vision, especially for high-level vision tasks. However, deploying Transformer in the generative adversarial network (GAN) framework is still an open yet challenging problem. In this paper, we conduct a comprehensive empirical study to investigate the intrinsic properties of Transformer in GAN for high-fidelity image synthesis. Our analysis highlights the importance of feature locality in image generation. We first investigate the effective ways to implement local attention. We then examine the influence of residual connections in self-attention layers and propose a novel way to reduce their negative impacts on learning discriminators and conditional generators. Our study leads to a new design of Transformers in GAN, a convolutional neural network (CNN)-free generator termed as STrans-G, which achieves competitive results in both unconditional and conditional image generations. The Transformer-based discriminator, STrans-D, also significantly reduces its gap against the CNN-based discriminators. 
Models and code will be publicly available.", "keywords": "GAN;Transformer;generative models", "primary_area": "", "supplementary_material": "", "author": "Rui Xu;Xiangyu Xu;Kai Chen;Bolei Zhou;Chen Change Loy", "authorids": "~Rui_Xu2;~Xiangyu_Xu3;~Kai_Chen4;~Bolei_Zhou5;~Chen_Change_Loy2", "gender": "M;M;M;M;M", "homepage": "https://nbei.github.io/;https://xuxy09.github.io/;https://chenkai.site/;https://boleizhou.github.io/;https://www.mmlab-ntu.com/person/ccloy/index.html", "dblp": "00/4859;172/1282-2.html;181/2839-26;46/8066;01/5855", "google_scholar": "1xn5GHgAAAAJ;Ec5Biz4AAAAJ;https://scholar.google.com.hk/citations?user=eGD0b7IAAAAJ;9D4aG8AAAAAJ;https://scholar.google.co.uk/citations?user=559LF80AAAAJ", "orcid": ";;0000-0002-6820-2325;;0000-0001-5345-1591", "linkedin": ";;;;", "or_profile": "~Rui_Xu2;~Xiangyu_Xu3;~Kai_Chen4;~Bolei_Zhou5;~Chen_Change_Loy2", "aff": "The Chinese University of Hong Kong;Sea AI Lab;SenseTime;University of California, Los Angeles;Nanyang Technological University", "aff_domain": "cuhk.edu.hk;sea.com;sensetime.com;ucla.edu;ntu.edu.sg", "position": "PhD student;Researcher;Researcher;Assistant Professor;Full Professor", "bibtex": "@misc{\nxu2022stransgan,\ntitle={{ST}rans{GAN}: An Empirical Study on Transformer in {GAN}s},\nauthor={Rui Xu and Xiangyu Xu and Kai Chen and Bolei Zhou and Chen Change Loy},\nyear={2022},\nurl={https://openreview.net/forum?id=eoShjXqWkr}\n}", "github": "", "project": "", "reviewers": "HBxC;iy9j;Yjiv;FQTj", "site": "https://openreview.net/forum?id=eoShjXqWkr", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;5;4;5", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "45;103;67;53", "wc_summary_review": "47;28;53;120", "wc_main_review": "382;352;242;215", "wc_review": "474;483;362;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.0, 22.22611077089287 ], "wc_summary_review_avg": [ 62.0, 34.734708865916815 ], "wc_main_review_avg": [ 297.75, 70.70493264263817 ], "wc_review_avg": [ 426.75, 52.65631491093922 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.899228803025897, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13057872095751233062&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Chinese University of Hong Kong;Sea AI Lab;SenseTime;University of California, Los Angeles;Nanyang Technological University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.cuhk.edu.hk;;https://www.sensetime.com;https://www.ucla.edu;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;;SenseTime;UCLA;NTU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Hong Kong SAR;;Los Angeles", "aff_country_unique_index": "0;0;2;3", "aff_country_unique": "China;;United States;Singapore" }, { "id": "eqNpg2HMNi1", "title": "Physical System Design Using Hamiltonian Monte Carlo over Learned Manifolds", "track": "main", "status": 
"Withdraw", "tldr": "", "abstract": "The design of complex physical systems entails satisfying several competing performance objectives. In practice, some design requirements are often implicit in the intuition and knowledge of designers who have many years of experience working with similar designs. Designers use this experience to sample a few promising candidates in the design space and evaluate or simulate them using detailed, typically slow multiphysics models. The goal in design is usually to generate a diverse set of high-performing design configurations that allow trade-offs across different objectives and avoid early concretization. In this paper, we develop a machine learning approach to automate physical system design. We use deep generative models to learn a manifold of the valid design space, followed by Hamiltonian Monte Carlo (HMC) with simulated annealing to explore and optimize design over the learned manifold, producing a diverse set of optimal designs. Our approach is akin to partial simulated annealing restricted to the learned design manifold, where the annealing schedule is varied to trade-off different objectives. To prevent our approach from traversing off the design manifold and proposing unreliable designs, we leverage Monte Carlo dropout as a way to detect and avoid design configurations where the learned model cannot be trusted. We demonstrate the efficacy of our proposed approach using several case studies that include the design of an SAE race vehicle, propeller, and air vehicle. Across these case studies, we successfully show how our method generates high-performing and diverse designs. ", "keywords": "Physical Design;Mechanical Design;Generative Modeling;Hamiltonian Monte Carlo", "primary_area": "", "supplementary_material": "/attachment/055ec2616eba990db8b55bb7f1c73781e67af935.zip", "author": "Adam D. Cobb;Anirban Roy;Kaushik Koneripalli;Daniel Elenius;Susmit Jha", "authorids": "~Adam_D._Cobb1;~Anirban_Roy3;~Kaushik_Koneripalli1;~Daniel_Elenius1;~Susmit_Jha1", "gender": "M;M;M;M;", "homepage": ";;https://kaushik333.github.io/;http://www.csl.sri.com/people/elenius/;http://susmitjha.github.io/", "dblp": "206/6601;;;;", "google_scholar": "XW1fyPcAAAAJ;N9eSuR4AAAAJ;faK6RPMAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";0009-0000-6889-0204;;;0000-0001-5983-9095", "linkedin": ";anirbanroylinkedin/;kaushik-koneripalli/;;susmitjha/", "or_profile": "~Adam_D._Cobb1;~Anirban_Roy3;~Kaushik_Koneripalli1;~Daniel_Elenius1;~Susmit_Jha1", "aff": "SRI International;SRI International;SRI International;SRI International;SRI International", "aff_domain": "sri.com;sri.com;sri.com;sri.com;sri.com", "position": "Researcher;Sr Scientist;Computer Scientist;Senior Software Engineer;Principal Scientist", "bibtex": "@misc{\ncobb2022physical,\ntitle={Physical System Design Using Hamiltonian Monte Carlo over Learned Manifolds},\nauthor={Adam D. 
Cobb and Anirban Roy and Kaushik Koneripalli and Daniel Elenius and Susmit Jha},\nyear={2022},\nurl={https://openreview.net/forum?id=eqNpg2HMNi1}\n}", "github": "", "project": "", "reviewers": "93BA;AQu1;fW4S;osDs", "site": "https://openreview.net/forum?id=eqNpg2HMNi1", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;3;3;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "79;32;114;59", "wc_summary_review": "154;29;65;29", "wc_main_review": "556;108;231;156", "wc_review": "789;169;410;244", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.0, 29.90819285747636 ], "wc_summary_review_avg": [ 69.25, 51.08999412800906 ], "wc_main_review_avg": [ 262.75, 174.89050145734043 ], "wc_review_avg": [ 403.0, 239.31255712979208 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16946725915461318166&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "SRI International", "aff_unique_dep": "", "aff_unique_url": "https://www.sri.com", "aff_unique_abbr": "SRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "eqRTPB134q0", "title": "Invariance in Policy Optimisation and Partial Identifiability in Reward Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "It is challenging to design a reward function for complex, real-world tasks. Reward learning algorithms let one instead infer a reward function from data. However, multiple reward functions often explain the data equally well, even in the limit of infinite data. Prior work has focused on situations where the reward function is uniquely recoverable, by introducing additional assumptions or data sources. By contrast, we formally characterise this partial identifiability for popular data sources such as demonstrations and trajectory preferences. We analyse the impact of this ambiguity on downstream tasks such as policy optimisation, including under shifts in environment dynamics. 
These results have implications for the practical design and selection of data sources for reward learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joar Max Viktor Skalse;Matthew Farrugia-Roberts;Stuart Russell;Adam Gleave", "authorids": "~Joar_Max_Viktor_Skalse1;~Matthew_Farrugia-Roberts1;~Stuart_Russell1;~Adam_Gleave1", "gender": "M;;M;M", "homepage": ";;https://people.eecs.berkeley.edu/~russell/;https://gleave.me", "dblp": "242/8125;;;189/0008.html", "google_scholar": "GuzLUmQAAAAJ;;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;lBunDH0AAAAJ", "orcid": ";;;0000-0002-3467-528X", "linkedin": ";;;adamgleave/", "or_profile": "~Joar_Max_Viktor_Skalse1;~Matthew_Farrugia-Roberts1;~Stuart_Russell1;~Adam_Gleave1", "aff": "University of Oxford;;University of California, Berkeley;University of California, Berkeley", "aff_domain": "ox.ac.uk;;berkeley.edu;berkeley.edu", "position": "PhD student;;Full Professor;PhD student", "bibtex": "@misc{\nskalse2022invariance,\ntitle={Invariance in Policy Optimisation and Partial Identifiability in Reward Learning},\nauthor={Joar Max Viktor Skalse and Matthew Farrugia-Roberts and Stuart Russell and Adam Gleave},\nyear={2022},\nurl={https://openreview.net/forum?id=eqRTPB134q0}\n}", "github": "", "project": "", "reviewers": "CDLH;zVCQ;iFN9;zcgk", "site": "https://openreview.net/forum?id=eqRTPB134q0", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "4;1;4;4", "correctness": "4;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "31;23;111;17", "wc_summary_review": "7;18;44;84", "wc_main_review": "191;61;209;458", "wc_review": "229;102;364;559", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "742;388;566;1354", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;4", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 45.5, 38.141185089087095 ], "wc_summary_review_avg": [ 38.25, 29.634228520412 ], "wc_main_review_avg": [ 229.75, 143.61994116417122 ], "wc_review_avg": [ 313.5, 169.33177492721205 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 762.5, 363.71520452133973 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13906707069114225970&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Oxford;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.berkeley.edu", "aff_unique_abbr": "Oxford;UC Berkeley", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "eqaxDZg4MHw", "title": "Understanding the Generalization Gap in Visual Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Reinforcement Learning (RL) agents have achieved superhuman performance on several video game suites. However, unlike humans, the trained policies fail to transfer between related games or even between different levels of the same game. 
Recent works have attempted to reduce this generalization gap using ideas such as data augmentation and learning domain-invariant features. However, the transfer performance still remains unsatisfactory. In this work, we use procedurally generated video games to empirically investigate several hypotheses to explain the lack of transfer. We also show that simple auxiliary tasks can improve the generalization of policies. Contrary to the belief that policy adaptation to new levels requires full policy finetuning, we find that visual features transfer across levels, and only the parameters that use these visual features to predict actions require finetuning. Finally, to inform fruitful avenues for future research, we construct simple oracle methods that close the generalization gap. ", "keywords": "Visual Reinforcement Learning;Transfer in Reinforcement Learning;Generalization in Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Anurag Ajay;Ge Yang;Ofir Nachum;Pulkit Agrawal", "authorids": "~Anurag_Ajay1;~Ge_Yang1;~Ofir_Nachum1;~Pulkit_Agrawal1", "gender": "M;M;M;M", "homepage": "https://anuragajay.github.io/;http://www.episodeyang.com;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;https://people.eecs.berkeley.edu/~pulkitag/", "dblp": "180/5483;48/4561-3;;149/2672", "google_scholar": ";vaQcF6kAAAAJ;C-ZlBWMAAAAJ;UpZmJI0AAAAJ", "orcid": ";0000-0001-7520-7055;;", "linkedin": ";;;", "or_profile": "~Anurag_Ajay1;~Ge_Yang1;~Ofir_Nachum1;~Pulkit_Agrawal1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;OpenAI;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;openai.com;mit.edu", "position": "PhD student;Postdoc;Researcher;Assistant Professor", "bibtex": "@misc{\najay2022understanding,\ntitle={Understanding the Generalization Gap in Visual Reinforcement Learning},\nauthor={Anurag Ajay and Ge Yang and Ofir Nachum and Pulkit Agrawal},\nyear={2022},\nurl={https://openreview.net/forum?id=eqaxDZg4MHw}\n}", "github": "", "project": "", "reviewers": "SFef;185P;sNjW;eZnU", "site": "https://openreview.net/forum?id=eqaxDZg4MHw", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;5;3", "correctness": "2;2;3;2", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "77;123;32;168", "wc_summary_review": "73;143;104;25", "wc_main_review": "416;1129;762;241", "wc_review": "566;1395;898;434", "wc_reply_reviewers": "113;623;48;0", "wc_reply_authors": "1052;1728;1481;127", "reply_reviewers": "1;1;1;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 100.0, 50.75923561284193 ], "wc_summary_review_avg": [ 86.25, 43.19360485071835 ], "wc_main_review_avg": [ 637.0, 340.34761641592263 ], "wc_review_avg": [ 823.25, 370.87017607243644 ], "wc_reply_reviewers_avg": [ 196.0, 249.76889317927484 ], "wc_reply_authors_avg": [ 1097.0, 610.0290976666605 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:RX5ZeX6aglcJ:scholar.google.com/&scioq=Understanding+the+Generalization+Gap+in+Visual+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;OpenAI", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://openai.com", "aff_unique_abbr": "MIT;OpenAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "euAlnAcpQtv", "title": "Federated Contrastive Learning for Privacy-Preserving Unpaired Image-to-Image Translation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The goal of an unsupervised image-to-image translation (I2I) is to convert an input image in a specific domain to a target domain using a neural network trained with unpaired data. Existing I2I methods usually require a centrally stored dataset, which can compromise data privacy. A recent proposal of federated cycleGAN (FedCycleGAN) can protect the data-privacy by splitting the loss between the server and the clients so that the data does not need to be shared, but the weights and gradients of both generator and discriminators should be exchanged, demanding significant communication cost. To address this, here we propose a novel federated contrastive unpaired translation (FedCUT) approach for privacy-preserving image-to-image translation. Similar to FedCycleGAN, our method is based on the observation that the CUT loss can be decomposed into domain-specific local objectives, but in contrast to FedCycleGAN, our method only exchanges weights and gradients of a discriminator, significantly reducing the band-width requirement. In addition, by combining it with the pre-trained VGG network, the learnable part of the discriminator can be further reduced without impairing the image quality, resulting in two order magnitude reduction in the communication cost. 
Through extensive experiments on various translation tasks, we confirm that our method shows competitive performance compared to existing approaches.\n", "keywords": "image-to-image translation;contrastive learning;federated learning", "primary_area": "", "supplementary_material": "/attachment/90f1750ca07bd0d425ffe4ecd4272b15b0a87801.zip", "author": "Joonyoung Song;Jong Chul Ye", "authorids": "~Joonyoung_Song1;~Jong_Chul_Ye1", "gender": "M;M", "homepage": ";https://bispl.weebly.com/", "dblp": ";15/5613", "google_scholar": "https://scholar.google.com/citations?hl=ko;HNMjoNEAAAAJ", "orcid": "0000-0002-2645-8760;", "linkedin": ";", "or_profile": "~Joonyoung_Song1;~Jong_Chul_Ye1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@misc{\nsong2022federated,\ntitle={Federated Contrastive Learning for Privacy-Preserving Unpaired Image-to-Image Translation},\nauthor={Joonyoung Song and Jong Chul Ye},\nyear={2022},\nurl={https://openreview.net/forum?id=euAlnAcpQtv}\n}", "github": "", "project": "", "reviewers": "jbDK;qdqa;HVqs", "site": "https://openreview.net/forum?id=euAlnAcpQtv", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "5;4;4", "correctness": "2;3;3", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "105;119;192", "wc_summary_review": "50;51;87", "wc_main_review": "493;355;264", "wc_review": "648;525;543", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 138.66666666666666, 38.14300576631172 ], "wc_summary_review_avg": [ 62.666666666666664, 17.21110752456745 ], "wc_main_review_avg": [ 370.6666666666667, 94.14291735913483 ], "wc_review_avg": [ 572.0, 54.24020648928247 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:B5xc4ajlRUEJ:scholar.google.com/&scioq=Federated+Contrastive+Learning+for+Privacy-Preserving+Unpaired+Image-to-Image+Translation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "eubJ4rgnN3", "title": "OUT-OF-DISTRIBUTION CLASSIFICATION WITH ADAPTIVE LEARNING OF LOW-LEVEL CONTEXTUAL FEATURES", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Humans can inherently identify what is unknown to them, but existing Neural Networks (NNs) are still lacking in this aspect. Out-of-Distribution (OOD) classification is an incredibly challenging problem for any NN model.
In general, any model tries to predict the OOD samples from the labels used for training only, but that is not acceptable for AGI (Artificial General Intelligence) [Fjelland(2020)]. Several kinds of research have already been done to avoid this issue and build a model to predict the OOD samples; existing baseline work includes 1) thresholding SoftMax, 2) training the model with an extra OOD class as a label, or 3) Mahalanobis distance-based approaches. All existing approaches use CNNs to get the spatial feature information and channel-wise information within the local receptive field at each layer. Here in this paper, we have proposed a method to learn the features of In-class and OOD samples with a global receptive field among channels to learn the spatial relationship with a modified SEnet block. Broadly, our model learns the interdependencies between channels with adaptive recalibration of the weights of stacked channels at each layer. To give more weight to the In-class samples, we uniformly normalized the OOD samples with the total number of known class samples and trained our model to suppress the OOD class probability with a simple and effective loss function. We did our experiments with MNIST and F-MNIST as In-class samples and EMNIST, KMNIST, not-MNIST, Omniglot, Uniform Noise, and Gaussian Noise as OOD samples.", "keywords": "Out-of-Distribution;Adaptive Learning;Contextual Features", "primary_area": "", "supplementary_material": "", "author": "Neeraj Tiwari;Tushar Bangoria;Rajan Vaja", "authorids": "~Neeraj_Tiwari1;~Tushar_Bangoria1;~Rajan_Vaja2", "gender": "M;M;M", "homepage": ";https://in.linkedin.com/in/tushar-bangoria-b59329a;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "neeraj-tiwari-b2b22177/;;https://in.linkedin.com/in/rajan-vaja-13649642", "or_profile": "~Neeraj_Tiwari1;~Tushar_Bangoria1;~Rajan_Vaja2", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntiwari2022outofdistribution,\ntitle={{OUT}-{OF}-{DISTRIBUTION} {CLASSIFICATION} {WITH} {ADAPTIVE} {LEARNING} {OF} {LOW}-{LEVEL} {CONTEXTUAL} {FEATURES}},\nauthor={Neeraj Tiwari and Tushar Bangoria and Rajan Vaja},\nyear={2022},\nurl={https://openreview.net/forum?id=eubJ4rgnN3}\n}", "github": "", "project": "", "reviewers": "ZbyT;oX1n;eYKX;vP1p", "site": "https://openreview.net/forum?id=eubJ4rgnN3", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "4;4;4;3", "correctness": "3;1;4;2", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "72;111;42;82", "wc_summary_review": "39;41;26;48", "wc_main_review": "383;346;301;204", "wc_review": "494;498;369;334", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 76.75, 24.651318423159438 ], "wc_summary_review_avg": [ 38.5, 7.952986860293433 ], "wc_main_review_avg": [ 308.5, 66.95707580233773 ], "wc_review_avg": [ 423.75, 73.31567022131081 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.4472135954999579, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:z8JMGxZAeOYJ:scholar.google.com/&scioq=OUT-OF-DISTRIBUTION+CLASSIFICATION+WITH+ADAPTIVE+LEARNING+OF+LOW-LEVEL+CONTEXTUAL+FEATURES&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "eypsJ0rvAqo", "title": "1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed", "track": "main", "status": "Reject", "tldr": "", "abstract": "To train large models (like BERT and GPT-3) on hundreds of GPUs, communication has become a major bottleneck, especially on commodity systems with limited-bandwidth TCP network. On one side large batch-size optimization such as LAMB algorithm was proposed to reduce the frequency of communication. On the other side, communication compression algorithms such as 1-bit Adam help to reduce the volume of each communication. However, we find that simply using one of the techniques is not sufficient to solve the communication challenge, especially under low network bandwidth. Motivated by this we aim to combine the power of large-batch optimization and communication compression, but we find that existing compression strategies cannot be directly applied to LAMB due to its unique adaptive layerwise learning rates. To this end, we design a new communication-efficient algorithm, 1-bit LAMB, which introduces a novel way to support adaptive layerwise learning rates under compression. In addition, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed, which improves both usability and performance. For BERT-Large pre-training task with batch sizes from 8K to 64K, our evaluations on up to 256 GPUs demonstrate that 1-bit LAMB with NCCL-based backend is able to achieve up to 4.6x communication volume reduction, up to 2.8x end-to-end time-wise speedup, and the same sample-wise convergence speed (and same fine-tuning task accuracy) compared to uncompressed LAMB.", "keywords": "optimization;communication compression;natural language processing;language model pre-training", "primary_area": "", "supplementary_material": "", "author": "Conglong Li;Ammar Ahmad Awan;Hanlin Tang;Samyam Rajbhandari;Yuxiong He", "authorids": "~Conglong_Li1;~Ammar_Ahmad_Awan1;~Hanlin_Tang2;~Samyam_Rajbhandari1;~Yuxiong_He1", "gender": ";M;;M;", "homepage": ";https://awan-10.github.io/;;https://www.snowflake.com/en/blog/authors/samyam-rajbhandari/;", "dblp": "158/7995;;;;https://dblp.org/pers/hd/h/He:Yuxiong", "google_scholar": ";JM_IZzQAAAAJ;RCGyfecAAAAJ;;SB3_eb0AAAAJ", "orcid": ";;;;", "linkedin": ";;;samyam-rajbhandari-08ba5730/;", "or_profile": "~Conglong_Li1;~Ammar_Ahmad_Awan1;~Hanlin_Tang2;~Samyam_Rajbhandari1;~Yuxiong_He1", "aff": "Microsoft;;Huawei Technologies Ltd.;Microsoft Research;Microsoft", "aff_domain": "microsoft.com;;huawei.com;research.microsoft.com;microsoft.com", "position": "Researcher;;Researcher;Principal Researcher;Researcher", "bibtex": "@misc{\nli2022bit,\ntitle={1-bit {LAMB}: Communication Efficient Large-Scale Large-Batch Training with {LAMB}'s Convergence Speed},\nauthor={Conglong Li and Ammar Ahmad Awan and Hanlin Tang and Samyam Rajbhandari and Yuxiong He},\nyear={2022},\nurl={https://openreview.net/forum?id=eypsJ0rvAqo}\n}", "github": "", "project": "", "reviewers": "amJh;otsu;5SMi", "site": "https://openreview.net/forum?id=eypsJ0rvAqo", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;3", "correctness": "4;3;4", "technical_novelty": "2;3;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "50;21;73", "wc_summary_review": 
"27;44;52", "wc_main_review": "216;179;329", "wc_review": "293;244;454", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "638;354;710", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 48.0, 21.275964529643932 ], "wc_summary_review_avg": [ 41.0, 10.424330514074594 ], "wc_main_review_avg": [ 241.33333333333334, 63.80351784101633 ], "wc_review_avg": [ 330.3333333333333, 89.70445300479174 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 567.3333333333334, 153.68654968980061 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6829158126092922152&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Microsoft;Huawei", "aff_unique_dep": "Microsoft Corporation;Huawei Technologies", "aff_unique_url": "https://www.microsoft.com;https://www.huawei.com", "aff_unique_abbr": "Microsoft;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "ezbMFmQY7L", "title": "C5T5: Controllable Generation of Organic Molecules with Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Methods for designing organic materials with desired properties have high potential impact across fields such as medicine, renewable energy, petrochemical engineering, and agriculture. However, using generative models for this task is difficult because candidate compounds must satisfy many constraints, including synthetic accessibility, intellectual property attributes, ``chemical beauty'' (Bickerton et al., 2020), and other considerations that are intuitive to domain experts but can be challenging to quantify. We propose C5T5, a novel self-supervised pretraining method that works in tandem with domain experts by making zero-shot select-and-replace edits, altering organic substances towards desired property values. C5T5 operates on IUPAC names---a standardized molecular representation that intuitively encodes rich structural information for organic chemists but that has been largely ignored by the ML community. Our technique requires no edited molecule pairs to train and only a rough estimate of molecular properties, and it has the potential to model long-range dependencies and symmetric molecular structures more easily than graph-based methods. We demonstrate C5T5's effectiveness on four physical properties relevant for drug discovery, showing that it learns successful and chemically intuitive strategies for altering molecules towards desired property values.\n", "keywords": "molecular modeling;sequence modeling;conditional sequence modeling;drug discovery", "primary_area": "", "supplementary_material": "", "author": "Daniel Rothchild;Alex Tamkin;Julie Yu;Ujval Misra;Joseph E. 
Gonzalez", "authorids": "~Daniel_Rothchild1;~Alex_Tamkin1;~Julie_Yu2;~Ujval_Misra1;~Joseph_E._Gonzalez1", "gender": "M;;F;;M", "homepage": ";;;https://people.eecs.berkeley.edu/~ujval/;http://eecs.berkeley.edu/~jegonzal", "dblp": "237/9911;;;;61/8262", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.ca/citations?hl=en;xEm-9oMAAAAJ;https://scholar.google.com.tw/citations?user=gM2WW9UAAAAJ", "orcid": "0000-0002-4605-0949;;;;0000-0003-2921-956X", "linkedin": ";;;;", "or_profile": "~Daniel_Rothchild1;~Alex_Tamkin1;~Julie_Yu2;~Ujval_Misra1;~Joseph_E._Gonzalez1", "aff": "University of California, Berkeley;;;;University of California, Berkeley", "aff_domain": "berkeley.edu;;;;berkeley.edu", "position": "PhD student;;;;Associate Professor", "bibtex": "@misc{\nrothchild2022ct,\ntitle={C5T5: Controllable Generation of Organic Molecules with Transformers},\nauthor={Daniel Rothchild and Alex Tamkin and Julie Yu and Ujval Misra and Joseph E. Gonzalez},\nyear={2022},\nurl={https://openreview.net/forum?id=ezbMFmQY7L}\n}", "github": "", "project": "", "reviewers": "gkwv;MJUv;pKJo", "site": "https://openreview.net/forum?id=ezbMFmQY7L", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;3", "correctness": "2;3;4", "technical_novelty": "3;3;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "43;79;92", "wc_summary_review": "26;21;40", "wc_main_review": "886;437;328", "wc_review": "955;537;460", "wc_reply_reviewers": "61;111;173", "wc_reply_authors": "942;946;699", "reply_reviewers": "1;1;1", "reply_authors": "2;2;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 71.33333333333333, 20.725722075613085 ], "wc_summary_review_avg": [ 29.0, 8.04155872120988 ], "wc_main_review_avg": [ 550.3333333333334, 241.48751980634987 ], "wc_review_avg": [ 650.6666666666666, 217.48001185294146 ], "wc_reply_reviewers_avg": [ 115.0, 45.81120678029194 ], "wc_reply_authors_avg": [ 862.3333333333334, 115.50565161738 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2677383469049315595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "f-KGT01Qze0", "title": "Robustmix: Improving Robustness by Regularizing the Frequency Bias of Deep Nets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep networks have achieved impressive results on a range of well curated benchmark datasets. Surprisingly, their performance remains sensitive to perturbations that have little effect on human performance. In this work, we propose a novel extension of Mixup called Robustmix that regularizes networks to classify based on lower frequency spatial features. 
We show that this type of regularization improves robustness on a range of benchmarks such as Imagenet-C and Stylized Imagenet. It adds little computational overhead and furthermore does not require a priori knowledge of a large set of image transformations. We find that this approach further complements recent advances in model architecture and data augmentation attaining a state-of-the-art mCE of 44.8 with an EfficientNet-B8 model and RandAugment, which is a reduction of 16 mCE compared to the baseline.", "keywords": "robustness;imagenet-c;mixup", "primary_area": "", "supplementary_material": "", "author": "Jonas Ngnaw\u00e9;MARIANNE NJIFON;Jonathan Heek;Yann Dauphin", "authorids": "ngnawejonas@gmail.com;nabemgnigni@aimsammi.org;~Jonathan_Heek1;~Yann_Dauphin1", "gender": ";;;M", "homepage": ";;;https://www.dauphin.io", "dblp": ";;247/1004;22/9988", "google_scholar": ";;;XSforroAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "ngnawejonas@gmail.com;nabemgnigni@aimsammi.org;~Jonathan_Heek1;~Yann_Dauphin1", "aff": ";;Google;Google", "aff_domain": ";;google.com;google.com", "position": ";;Software Engineer;Researcher", "bibtex": "@misc{\nngnaw{\\'e}2022robustmix,\ntitle={Robustmix: Improving Robustness by Regularizing the Frequency Bias of Deep Nets},\nauthor={Jonas Ngnaw{\\'e} and MARIANNE NJIFON and Jonathan Heek and Yann Dauphin},\nyear={2022},\nurl={https://openreview.net/forum?id=f-KGT01Qze0}\n}", "github": "", "project": "", "reviewers": "3E2o;5zdd;EYUe", "site": "https://openreview.net/forum?id=f-KGT01Qze0", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;3;4", "correctness": "4;2;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;2;2", "wc_summary_paper": "100;86;64", "wc_summary_review": "81;73;24", "wc_main_review": "214;410;254", "wc_review": "395;569;342", "wc_reply_reviewers": "118;0;0", "wc_reply_authors": "452;566;459", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 83.33333333333333, 14.817407180595247 ], "wc_summary_review_avg": [ 59.333333333333336, 25.197001585285676 ], "wc_main_review_avg": [ 292.6666666666667, 84.55898664377561 ], "wc_review_avg": [ 435.3333333333333, 96.96161898171646 ], "wc_reply_reviewers_avg": [ 39.333333333333336, 55.62573345334173 ], "wc_reply_authors_avg": [ 492.3333333333333, 52.16853031814828 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": -0.8660254037844385, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10957325195275543151&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "f-LuEgBQUg", "title": "Language-Driven Image Style Transfer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite having promising results, style transfer, which 
requires preparing style images in advance, may result in lack of creativity and accessibility. Following human instruction, on the other hand, is the most natural way to perform artistic style transfer that can significantly improve controllability for visual effect applications. We introduce a new task\u2014language-driven image style transfer (LDIST)\u2014to manipulate the style of a content image, guided by a text. We propose contrastive language visual artist (CLVA) that learns to extract visual semantics from style instructions and accomplish LDIST by the patch-wise style discriminator. The discriminator considers the correlation between language and patches of style images or transferred results to jointly embed style instructions. CLVA further compares contrastive pairs of content image and style instruction to improve the mutual relativeness between transfer results. The transferred results from the same content image can preserve consistent content structures. Besides, they should present analogous style patterns from style instructions that contain similar visual semantics. The experiments show that our CLVA is effective and achieves superb transferred results on LDIST.", "keywords": "Artistic Style Transfer;Language-based Image Editing", "primary_area": "", "supplementary_material": "/attachment/4b9f3da0aecd65fc04398bc120aa4224304b5205.zip", "author": "Tsu-Jui Fu;Xin Eric Wang;William Yang Wang", "authorids": "~Tsu-Jui_Fu2;~Xin_Eric_Wang2;~William_Yang_Wang2", "gender": "M;M;M", "homepage": "https://tsujuifu.github.io;https://eric-xw.github.io;https://www.cs.ucsb.edu/~william/", "dblp": "218/5366.html;10/5630-61;08/9282", "google_scholar": "https://scholar.google.com.tw/citations?user=7QRDcC0AAAAJ;YjqluE0AAAAJ;gf8Ms_8AAAAJ", "orcid": ";0000-0003-2605-5504;", "linkedin": "tsujuifu1996;;", "or_profile": "~Tsu-Jui_Fu2;~Xin_Eric_Wang2;~William_Wang1", "aff": "UC Santa Barbara;University of California, Santa Cruz;UC Santa Barbara", "aff_domain": "ucsb.edu;ucsc.edu;ucsb.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nfu2022languagedriven,\ntitle={Language-Driven Image Style Transfer},\nauthor={Tsu-Jui Fu and Xin Eric Wang and William Yang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=f-LuEgBQUg}\n}", "github": "", "project": "", "reviewers": "9uUT;k6rh;AKoh;exSe", "site": "https://openreview.net/forum?id=f-LuEgBQUg", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;5;4", "correctness": "4;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "102;89;91;64", "wc_summary_review": "56;96;37;45", "wc_main_review": "355;484;395;599", "wc_review": "513;669;523;708", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 86.5, 13.901438774457844 ], "wc_summary_review_avg": [ 58.5, 22.677080940897135 ], "wc_main_review_avg": [ 458.25, 93.72132894917785 ], "wc_review_avg": [ 603.25, 86.43024644185623 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12896523958261979683&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Santa Barbara;University of California, Santa Cruz", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsb.edu;https://www.ucsc.edu", "aff_unique_abbr": "UCSB;UCSC", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Santa Barbara;Santa Cruz", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "f2K6ofowQoq", "title": "Efficient Second-Order Optimization for Deep Learning with Kernel Machines", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Second-order optimization has been recently explored in neural network training. However, the recomputation of the Hessian matrix in the second-order optimization posts much extra computation and memory burden in the training. There have been some attempts to address the issue by approximation on the Hessian matrix, which unfortunately degrades the performance of the neural models. To address the issue, we propose Kernel Stochastic Gradient Descent (Kernel SGD) which projects the optimization problem to a transformed space with the Hessian matrix of kernel machines. Kernel SGD eliminates the recomputation of the Hessian matrix and requires a much smaller memory cost which can be controlled via the mini-batch size. The additional advantage of Kernel SGD is its ability to converge to better solutions according to our theoretical analysis. Kernel SGD is theoretically guaranteed to converge. Experimental results on tabular, image and text data show that Kernel SGD converges up to 30 times faster than the existing second-order optimization techniques, and also shows remarkable performance in generalization.", "keywords": "Second-order optimization;Deep learning;Kernel machines", "primary_area": "", "supplementary_material": "/attachment/4c818626812501e4ae131eeea34b16017b37b5dd.zip", "author": "Yawen Chen;Zeyi Wen;Yile Chen;Jian Chen;Jin Huang", "authorids": "~Yawen_Chen1;zeyi.wen@uwa.edu.au;jireh.x6@gmail.com;~Jian_Chen7;jinhuang@scnu.edu.cn", "gender": "F;;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": "PGHINZMAAAAJ;;;;", "orcid": "0000-0002-0368-2006;;;0000-0003-4769-1526;", "linkedin": ";;;;", "or_profile": "~Yawen_Chen1;zeyi.wen@uwa.edu.au;jireh.x6@gmail.com;~Jian_Chen7;jinhuang@scnu.edu.cn", "aff": "South China University of Technology;;;South China University of Technology;", "aff_domain": "scut.edu.cn;;;scut.edu.cn;", "position": "PhD student;;;Full Professor;", "bibtex": "@misc{\nchen2022efficient,\ntitle={Efficient Second-Order Optimization for Deep Learning with Kernel Machines},\nauthor={Yawen Chen and Zeyi Wen and Yile Chen and Jian Chen and Jin Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=f2K6ofowQoq}\n}", "github": "", "project": "", "reviewers": "zCMe;LNKd;bNM5;diqz", "site": "https://openreview.net/forum?id=f2K6ofowQoq", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;2;2", "correctness": "1;2;2;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "57;12;64;57", "wc_summary_review": "27;23;50;14", "wc_main_review": "285;282;912;143", "wc_review": "369;317;1026;214", "wc_reply_reviewers": "0;35;0;0", "wc_reply_authors": "637;528;797;409", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 2.0, 
0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 47.5, 20.694202086574876 ], "wc_summary_review_avg": [ 28.5, 13.275918047351754 ], "wc_main_review_avg": [ 405.5, 298.00209730805585 ], "wc_review_avg": [ 481.5, 319.2777004427337 ], "wc_reply_reviewers_avg": [ 8.75, 15.155444566227676 ], "wc_reply_authors_avg": [ 592.75, 142.85722767854625 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:I1YUOumL1lIJ:scholar.google.com/&scioq=Efficient+Second-Order+Optimization+for+Deep+Learning+with+Kernel+Machines&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "South China University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.scut.edu.cn", "aff_unique_abbr": "SCUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Scale Efficiently: Insights from Pretraining and Finetuning Transformers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5960", "id": "f2OYVDyfIB", "poster": "", "openreview": "https://openreview.net/forum?id=f2OYVDyfIB", "slides": "https://iclr.cc/virtual/2022/poster/5960", "video": "https://iclr.cc/virtual/2022/poster/5960", "author_site": "Yi Tay, Mostafa Dehghani, Jinfeng Rao, William Fedus, Samira Abnar, Hyung Won Chung, SHARAN NARANG, Dani Yogatama, Ashish Vaswani, Donald Metzler", "tldr": "", "abstract": "There remain many open questions pertaining to the scaling behaviour of Transformer architectures. These scaling decisions and findings can be critical, as training runs often come with an associated computational cost which have both financial and/or environmental impact. The goal of this paper is to present scaling insights from pretraining and finetuning Transformers. While Kaplan et al. presents a comprehensive study of the scaling behaviour of Transformer language models, the scope is only on the upstream (pretraining) loss. Therefore, it is still unclear if these set of findings transfer to downstream task within the context of the pretrain-finetune paradigm. The key findings of this paper are as follows: (1) we show that aside from only the model size, model shape matters for downstream fine-tuning, (2) scaling protocols operate differently at different compute regions, (3) widely adopted T5-base and T5-large sizes are Pareto-inefficient. To this end, we present improved scaling protocols whereby our redesigned models achieve similar downstream fine-tuning quality while having 50\\% fewer parameters and training 40\\% faster compared to the widely adopted T5-base model. 
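To make the "model shape" point concrete, the following is a hedged sketch of how one might compare a baseline-shaped encoder-decoder against a deeper, narrower variant at comparable parameter count, using the Hugging Face `transformers` T5 classes. The specific sizes below are invented for illustration and are not the configurations released with the paper.

```python
# Hedged illustration of the "shape matters" finding: trade width (d_model,
# d_ff) for depth (num_layers) and compare parameter counts. Numbers are made
# up for the example, not the paper's released checkpoints.
from transformers import T5Config, T5ForConditionalGeneration

base = T5Config(d_model=768, d_ff=3072, num_layers=12, num_heads=12)
deep_narrow = T5Config(d_model=512, d_ff=2048, num_layers=24, num_heads=8)

for cfg in (base, deep_narrow):
    n_params = T5ForConditionalGeneration(cfg).num_parameters()
    print(f"layers={cfg.num_layers:2d} d_model={cfg.d_model}: {n_params / 1e6:.0f}M params")
```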
We publicly release over 100 pretrained checkpoints of different T5 configurations to facilitate future research and analysis.", "keywords": "transformers;attention;deep learning", "primary_area": "", "supplementary_material": "", "author": "Yi Tay;Mostafa Dehghani;Jinfeng Rao;William Fedus;Samira Abnar;Hyung Won Chung;Sharan Narang;Dani Yogatama;Ashish Vaswani;Donald Metzler", "authorids": "~Yi_Tay1;~Mostafa_Dehghani1;~Jinfeng_Rao1;~William_Fedus2;~Samira_Abnar1;~Hyung_Won_Chung1;~Sharan_Narang1;~Dani_Yogatama2;~Ashish_Vaswani1;~Donald_Metzler1", "gender": "M;M;;Unspecified;M;M;M;M;;", "homepage": "http://yitay.net;http://mostafadehghani.com/;;https://samiraabnar.github.io/;;;;https://research.google/people/DonaldMetzler/;;", "dblp": ";125/4062;134/5708;150/5405;;;;95/2272;08/8178;", "google_scholar": "VBclY_cAAAAJ;https://scholar.google.nl/citations?user=MiHOX3QAAAAJ;;https://scholar.google.nl/citations?user=jbxwjgMAAAAJ;1CAlXvYAAAAJ;CWOixywAAAAJ;6rUjwXUAAAAJ;bmXpOd8AAAAJ;;", "orcid": ";;;;;;;0000-0003-4276-6269;;", "linkedin": ";;;;;;;donmetzler/;;", "or_profile": "~Yi_Tay1;~Mostafa_Dehghani1;~Jinfeng_Rao1;~Samira_Abnar1;~Hyung_Won_Chung1;~Sharan_Narang1;~Ashish_Vaswani1;~Donald_Metzler1;~Dani_Yogatama1;~William_Fedus1", "aff": "Google;Google DeepMind;;Apple;Google Brain;Google;;Google;Google DeepMind;", "aff_domain": "google.com;google.com;;apple.com;google.com;google.com;;google.com;google.com;", "position": "Research Scientist;Research Scientist;;Researcher;Researcher;Research Engineer;;Research Scientist;Research Scientist;", "bibtex": "@inproceedings{\ntay2022scale,\ntitle={Scale Efficiently: Insights from Pretraining and Finetuning Transformers},\nauthor={Yi Tay and Mostafa Dehghani and Jinfeng Rao and William Fedus and Samira Abnar and Hyung Won Chung and Sharan Narang and Dani Yogatama and Ashish Vaswani and Donald Metzler},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=f2OYVDyfIB}\n}", "github": "", "project": "", "reviewers": "SEDW;C5AB;8qst;826H", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;2;1", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "76;88;51;69", "wc_summary_review": "25;61;68;3", "wc_main_review": "204;183;182;119", "wc_review": "305;332;301;191", "wc_reply_reviewers": "0;0;21;0", "wc_reply_authors": "426;189;236;156", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 71.0, 13.397761006974262 ], "wc_summary_review_avg": [ 39.25, 26.536531423680827 ], "wc_main_review_avg": [ 172.0, 31.835514759463212 ], "wc_review_avg": [ 282.25, 54.01562274009252 ], "wc_reply_reviewers_avg": [ 5.25, 9.093266739736606 ], "wc_reply_authors_avg": [ 251.75, 104.54275441177164 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12072008243148468438&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=f2OYVDyfIB", "email": 
"google.com;google.com;;apple.com;google.com;google.com;;google.com;google.com;", "author_num": 10, "aff_unique_index": "0;0;1;0;0;0;0", "aff_unique_norm": "Google;Apple", "aff_unique_dep": "Google;Apple Inc.", "aff_unique_url": "https://www.google.com;https://www.apple.com", "aff_unique_abbr": "Google;Apple", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Bayesian Framework for Gradient Leakage", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6934", "id": "f2lrIbGx3x7", "poster": "", "openreview": "https://openreview.net/forum?id=f2lrIbGx3x7", "slides": "https://iclr.cc/virtual/2022/poster/6934", "video": "https://iclr.cc/virtual/2022/poster/6934", "author_site": "Mislav Balunovic, Dimitar I. Dimitrov, Robin Staab, Martin Vechev", "tldr": "", "abstract": "Federated learning is an established method for training machine learning models without sharing training data. However, recent work has shown that it cannot guarantee data privacy as shared gradients can still leak sensitive information. To formalize the problem of gradient leakage, we propose a theoretical framework that enables, for the first time, analysis of the Bayes optimal adversary phrased as an optimization problem. We demonstrate that existing leakage attacks can be seen as approximations of this optimal adversary with different assumptions on the probability distributions of the input data and gradients. Our experiments confirm the effectiveness of the Bayes optimal adversary when it has knowledge of the underlying distribution. Further, our experimental evaluation shows that several existing heuristic defenses are not effective against stronger attacks, especially early in the training process. 
Thus, our findings indicate that the construction of more effective defenses and their evaluation remains an open problem.\n", "keywords": "federated learning;privacy;gradient leakage", "primary_area": "", "supplementary_material": "", "author": "Mislav Balunovic;Dimitar Iliev Dimitrov;Robin Staab;Martin Vechev", "authorids": "~Mislav_Balunovic1;~Dimitar_Iliev_Dimitrov2;~Robin_Staab1;~Martin_Vechev1", "gender": "M;M;M;M", "homepage": "https://www.sri.inf.ethz.ch/people/mislav;https://www.sri.inf.ethz.ch/people/dimitadi;;https://www.sri.inf.ethz.ch/people/martin", "dblp": "231/7686;271/0915;304/3512;93/2189.html", "google_scholar": "fxkgmGwAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";0000-0001-9813-0900;;", "linkedin": ";;robin-staab-b778a51a6/;", "or_profile": "~Mislav_Balunovic1;~Dimitar_Iliev_Dimitrov2;~Robin_Staab1;~Martin_Vechev1", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nbalunovic2022bayesian,\ntitle={Bayesian Framework for Gradient Leakage},\nauthor={Mislav Balunovic and Dimitar Iliev Dimitrov and Robin Staab and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=f2lrIbGx3x7}\n}", "github": "", "project": "", "reviewers": "cPAG;CRtt;DWMa;6AUa", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;2;5;5", "correctness": "3;4;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "47;39;64;80", "wc_summary_review": "51;13;90;109", "wc_main_review": "216;64;238;450", "wc_review": "314;116;392;639", "wc_reply_reviewers": "196;0;302;458", "wc_reply_authors": "903;123;928;1014", "reply_reviewers": "1;0;2;2", "reply_authors": "2;1;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 57.5, 15.819292019556375 ], "wc_summary_review_avg": [ 65.75, 36.94167700578846 ], "wc_main_review_avg": [ 242.0, 137.5136356875201 ], "wc_review_avg": [ 365.25, 187.35444350214917 ], "wc_reply_reviewers_avg": [ 239.0, 166.50825805346713 ], "wc_reply_authors_avg": [ 742.0, 359.74365873493866 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4714045207910316, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14925580502725272742&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=f2lrIbGx3x7", "email": "ethz.ch;ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "id": "f2zGmcA0bs7", "title": "Routing with Self-Attention for Multimodal Capsule Networks", 
"track": "main", "status": "Withdraw", "tldr": "", "abstract": "The task of multimodal learning has seen a growing interest recently as it allows for training neural architectures based on different modalities such as vision, text, and audio. One challenge in training such models is that they need to jointly learn semantic concepts and their relationships across different input representations. Capsule networks have been shown to perform well in context of capturing the relation between low-level input features and higher-level concepts. However, capsules have so far mainly been used only in small-scale fully supervised settings due to the resource demand of conventional routing algorithms. We present a new multimodal capsule network that allows us to leverage the strength of capsules in the context of a multimodal learning framework on large amounts of video data. To adapt the capsules to large-scale input data, we propose a novel routing by self-attention mechanism that selects relevant capsules which are then used to generate a final joint multimodal feature representation. This allows not only for robust training with noisy video data, but also to scale up the size of the capsule network compared to traditional routing methods while still being computationally efficient. We evaluate the proposed architecture by pretraining it on a large-scale multimodal video dataset and applying it on four datasets in two challenging downstream tasks. Results show that the proposed multimodal capsule network is not only able to improve results compared to other routing techniques, but also achieves competitive performance on the task of multimodal learning.", "keywords": "multimodal;capsule networks;self-supervision;computer vision;self-attention;routing", "primary_area": "", "supplementary_material": "", "author": "Kevin Duarte;Brian Chen;Nina Shvetsova;Andrew Rouditchenko;Samuel Thomas;Alexander H. Liu;David Harwath;James R. 
Glass;Hilde Kuehne;Mubarak Shah", "authorids": "~Kevin_Duarte1;~Brian_Chen3;~Nina_Shvetsova1;~Andrew_Rouditchenko1;~Samuel_Thomas1;~Alexander_H._Liu1;~David_Harwath1;~James_R._Glass1;~Hilde_Kuehne5;~Mubarak_Shah3", "gender": "M;M;F;;;M;M;;F;M", "homepage": ";https://brian7685.github.io/;https://ninatu.github.io/;;;https://alexander-h-liu.github.io/;https://www.cs.utexas.edu/~harwath/index.html;;https://hildekuehne.github.io;https://www.crcv.ucf.edu/person/mubarak-shah/", "dblp": "220/4092;36/39-1;301/1304;218/5458;;227/2380;;;45/4963;s/MubarakShah", "google_scholar": "PxD5DrYAAAAJ;7zfiaA8AAAAJ;qZtU1L4AAAAJ;;S34WHG0AAAAJ;LIiCDa0AAAAJ;C0kDOzcAAAAJ;;pxhCcH0AAAAJ;https://scholar.google.com.tw/citations?user=p8gsO3gAAAAJ", "orcid": ";;0000-0003-0910-188X;;;;;;0000-0003-1079-4441;0000-0002-8216-1128", "linkedin": "kevin-duarte-vision/;brianchen2718/;;;;;;;hilde-kuehne-8b9aa661;mubarak-shah-b6aa68213/", "or_profile": "~Kevin_Duarte1;~Brian_Chen3;~Nina_Shvetsova1;~Andrew_Rouditchenko1;~Samuel_Thomas1;~Alexander_H._Liu1;~David_Harwath1;~James_R._Glass1;~Hilde_Kuehne5;~Mubarak_Shah3", "aff": "Adobe Systems;Columbia University;Goethe University;Massachusetts Institute of Technology;International Business Machines;Meta Facebook;University of Texas, Austin;;Goethe University Frankfurt;University of Central Florida", "aff_domain": "adobe.com;columbia.edu;uni-frankfurt.de;mit.edu;ibm.com;meta.com;utexas.edu;;uni-frankfurt.de;ucf.edu", "position": "Researcher;PhD student;PhD student;PhD student;Researcher;Intern;Assistant Professor;;Assistant Professor;Full Professor", "bibtex": "@misc{\nduarte2022routing,\ntitle={Routing with Self-Attention for Multimodal Capsule Networks},\nauthor={Kevin Duarte and Brian Chen and Nina Shvetsova and Andrew Rouditchenko and Samuel Thomas and Alexander H. Liu and David Harwath and James R. 
Glass and Hilde Kuehne and Mubarak Shah},\nyear={2022},\nurl={https://openreview.net/forum?id=f2zGmcA0bs7}\n}", "github": "", "project": "", "reviewers": "Hp4N;uXqs;ip1x;Cs1m", "site": "https://openreview.net/forum?id=f2zGmcA0bs7", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;3;0;2", "wc_summary_paper": "25;67;84;58", "wc_summary_review": "14;55;87;59", "wc_main_review": "135;351;641;217", "wc_review": "174;473;812;334", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "112;160;77;119", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 58.5, 21.47673159491453 ], "wc_summary_review_avg": [ 53.75, 26.05163142684158 ], "wc_main_review_avg": [ 336.0, 192.2316311120519 ], "wc_review_avg": [ 448.25, 235.15566652751534 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 117.0, 29.487285395573462 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5388268459903532644&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;4;5;6;2;7", "aff_unique_norm": "Adobe;Columbia University;Goethe University Frankfurt;Massachusetts Institute of Technology;International Business Machines Corporation;Meta;University of Texas at Austin;University of Central Florida", "aff_unique_dep": "Adobe Systems Incorporated;;;;;Meta Platforms, Inc.;;", "aff_unique_url": "https://www.adobe.com;https://www.columbia.edu;https://www.uni-frankfurt.de;https://web.mit.edu;https://www.ibm.com;https://meta.com;https://www.utexas.edu;https://www.ucf.edu", "aff_unique_abbr": "Adobe;Columbia;GU;MIT;IBM;Meta;UT Austin;UCF", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Frankfurt;Austin", "aff_country_unique_index": "0;0;1;0;0;0;0;1;0", "aff_country_unique": "United States;Germany" }, { "id": "f3QTgKQW0TD", "title": "Manifold Distance Judge, an Adversarial Samples Defense Strategy Based on Service Orchestration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks (DNNs) are playing an increasingly significant role in the modern world. However, they are vulnerable to adversarial examples generated by adding specially crafted perturbations. Most defenses against adversarial examples have focused on refining the DNN models, which often sacrifices the models' performance on benign samples and increases their computational cost. In this paper, we propose a manifold distance detection method that distinguishes between legitimate and adversarial samples by measuring the difference in their distances on the manifold. The manifold distance detection method neither modifies the protected models nor requires knowledge of the process used to generate adversarial samples.
Inspired by the effectiveness of manifold distance detection, we design a well-orchestrated defense strategy, named Manifold Distance Judge (MDJ), which selects the image processing method that most effectively expands the manifold distance between legitimate and adversarial samples, and thus enhances the performance of the subsequent manifold distance detection. In tests on the ImageNet dataset, MDJ is effective against most adversarial samples under whitebox, graybox, and blackbox attack scenarios. We show empirically that the orchestration strategy MDJ achieves a significantly better recall rate than Feature Squeezing. Meanwhile, MDJ achieves high detection rates against the CW and DI-FGSM attacks.", "keywords": "service orchestration;manifold distance detection;adversarial example;neural network.", "primary_area": "", "supplementary_material": "", "author": "Mengxin Zhang;Xiaofeng QIU", "authorids": "~Mengxin_Zhang1;~Xiaofeng_QIU2", "gender": "M;F", "homepage": "https://github.com/UPZmx;https://www.researchgate.net/profile/Xiaofeng-Qiu", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";xiaofeng-qiu-0800757/", "or_profile": "~Mengxin_Zhang1;~Xiaofeng_QIU2", "aff": "Beijing University of Post and Telecommunication;Beijing University of Post and Telecommunication, Tsinghua University", "aff_domain": "bupt.edu.cn;bupt.edu.cn", "position": "MS student;Associate Professor", "bibtex": "@misc{\nzhang2022manifold,\ntitle={Manifold Distance Judge, an Adversarial Samples Defense Strategy Based on Service Orchestration},\nauthor={Mengxin Zhang and Xiaofeng QIU},\nyear={2022},\nurl={https://openreview.net/forum?id=f3QTgKQW0TD}\n}", "github": "", "project": "", "reviewers": "wmbn;fqVb;Cbtd;MGG8", "site": "https://openreview.net/forum?id=f3QTgKQW0TD", "pdf_size": 0, "recommendation": "1;1;3;5", "confidence": "5;4;4;4", "correctness": "2;2;2;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "2;1;0;2", "wc_summary_paper": "46;45;58;30", "wc_summary_review": "41;49;22;29", "wc_main_review": "768;321;175;82", "wc_review": "855;415;255;141", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 1.6583123951777 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 44.75, 9.934158243152764 ], "wc_summary_review_avg": [ 35.25, 10.449282272003183 ], "wc_main_review_avg": [ 336.5, 263.28928956567904 ], "wc_review_avg": [ 416.5, 271.2319118392967 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IiC-_9Ck4J8J:scholar.google.com/&scioq=Manifold+Distance+Judge,+an+Adversarial+Samples+Defense+Strategy+Based+on+Service+Orchestration&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Beijing University of Posts and Telecommunications;Beijing University of Post and Telecommunication", "aff_unique_dep": ";", "aff_unique_url": "http://www.bupt.edu.cn/;http://www.bupt.edu.cn/", "aff_unique_abbr": "BUPT;BUPT", "aff_campus_unique_index": "0;0", 
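The record above describes the detection idea only at a high level, so the following is one plausible instantiation rather than the authors' method: measure a feature-space distance between an input and a smoothed copy of itself, on the intuition that adversarial perturbations push samples off the natural image manifold. The blur-based transform and the thresholding rule are assumptions made for the sketch.

```python
# One plausible manifold-distance detector (illustrative assumption, not the
# paper's algorithm): adversarial perturbations tend to be erased by smoothing,
# so the feature-space distance between an input and its smoothed copy is
# typically larger for adversarial inputs than for legitimate ones.
import torch
import torchvision.transforms.functional as TF

@torch.no_grad()
def manifold_distance(feature_extractor, x, blur_sigma=1.0):
    """Per-sample distance between x and a low-pass (blurred) copy in feature space."""
    x_smooth = TF.gaussian_blur(x, kernel_size=5, sigma=blur_sigma)
    f, f_s = feature_extractor(x), feature_extractor(x_smooth)
    return (f - f_s).flatten(1).norm(dim=1)

def detect(feature_extractor, x, threshold):
    return manifold_distance(feature_extractor, x) > threshold  # True = flagged
```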
"aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "f3qFAV_MH-C", "title": "Transfer and Marginalize: Explaining Away Label Noise with Privileged Information", "track": "main", "status": "Reject", "tldr": "", "abstract": "Supervised learning datasets often have privileged information, in the form of features which are available at training time but are not available at test time e.g. the ID of the annotator that provided the label. We argue that privileged information is useful for explaining away label noise, thereby reducing the harmful impact of noisy labels. We develop a simple and efficient method for supervised neural networks: it transfers the knowledge learned with privileged information via weight sharing and approximately marginalizes over privileged information at test time. Our method, TRAM (TRansfer and Marginalize), has minimal training time overhead and has the same test time cost as not using privileged information. TRAM performs strongly on CIFAR-10H, ImageNet and Civil Comments benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/52561d2ea205b21770467dacfe454879dc010951.zip", "author": "Mark Collier;Rodolphe Jenatton;Effrosyni Kokiopoulou;Jesse Berent", "authorids": "~Mark_Collier1;~Rodolphe_Jenatton3;~Effrosyni_Kokiopoulou1;jberent@google.com", "gender": "M;M;F;", "homepage": ";http://rodolphejenatton.com/;;", "dblp": ";68/8398;05/960;", "google_scholar": "U4rBrcgAAAAJ;QIR6rygAAAAJ;9om-fCsAAAAJ;", "orcid": ";;;", "linkedin": "mark-collier-aa446032/;;;", "or_profile": "~Mark_Collier1;~Rodolphe_Jenatton3;~Effrosyni_Kokiopoulou1;jberent@google.com", "aff": "Google;Google;Google DeepMind;", "aff_domain": "google.com;google.com;google.com;", "position": "Researcher;Senior research scientist;Researcher;", "bibtex": "@misc{\ncollier2022transfer,\ntitle={Transfer and Marginalize: Explaining Away Label Noise with Privileged Information},\nauthor={Mark Collier and Rodolphe Jenatton and Effrosyni Kokiopoulou and Jesse Berent},\nyear={2022},\nurl={https://openreview.net/forum?id=f3qFAV_MH-C}\n}", "github": "", "project": "", "reviewers": "KQ8f;GTi4;6G97;Nesa", "site": "https://openreview.net/forum?id=f3qFAV_MH-C", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "103;73;90;51", "wc_summary_review": "38;18;68;81", "wc_main_review": "276;231;303;224", "wc_review": "417;322;461;356", "wc_reply_reviewers": "305;93;31;27", "wc_reply_authors": "830;417;518;427", "reply_reviewers": "2;2;1;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.25, 19.472737352514155 ], "wc_summary_review_avg": [ 51.25, 24.732316915323562 ], "wc_main_review_avg": [ 258.5, 32.53075467922624 ], "wc_review_avg": [ 389.0, 53.72615750265414 ], "wc_reply_reviewers_avg": [ 114.0, 113.33578428722325 ], "wc_reply_authors_avg": [ 548.0, 167.5007462669943 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 16, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=11032101970683787492&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "f4c4JtbHJ7B", "title": "Pixab-CAM: Attend Pixel, not Channel", "track": "main", "status": "Reject", "tldr": "", "abstract": "To understand the internal behaviors of convolution neural networks (CNNs), many class activation mapping (CAM) based methods, which generate an explanation map by a linear combination of channels and corresponding weights, have been proposed. Previous CAM-based methods have tried to define a channel-wise weight that represents the importance of a channel for the target class. However, these methods have two common limitations. First, all pixels in the channel share a single scalar value. If the pixels are tied to a specific value, some of them are overestimated. Second, since the explanation map is the result of a linear combination of channels in the activation tensor, it is inevitably dependent on the activation tensor. To address these issues, we propose gradient-free Pixel-wise Ablation-CAM (Pixab-CAM), which utilizes pixel-wise weights rather than channel-wise weights to break the link between pixels in a channel. In addition, in order not to generate an explanation map dependent on the activation tensor, the explanation map is generated only with pixel-wise weights without linear combination with the activation tensor. In this paper, we also propose novel evaluation metrics to measure the quality of explanation maps using an adversarial attack. 
We demonstrate through experiments the qualitative and quantitative superiority of Pixab-CAM.", "keywords": "Explainable AI;Interpretable ML;Visual explanation of CNN;Class activation maps;Computer Vision", "primary_area": "", "supplementary_material": "/attachment/f7da32bf4fa56450f56d9fd84bcf39b23bff764f.zip", "author": "Jaeeun Jang;Seokjun Kim;Hyeoncheol Kim", "authorids": "~Jaeeun_Jang1;~Seokjun_Kim1;~Hyeoncheol_Kim1", "gender": "M;M;M", "homepage": ";https://ini.korea.ac.kr;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "seokjun7607/;;jaeeun0779/", "or_profile": "~Seokjun_Kim1;~Hyeoncheol_Kim1;~Jae_Eun_Jang1", "aff": "Korea University;Korea University;Korea University", "aff_domain": "korea.ac.kr;korea.ac.kr;korea.ac.kr", "position": "MS student;Full Professor;MS student", "bibtex": "@misc{\njang2022pixabcam,\ntitle={Pixab-{CAM}: Attend Pixel, not Channel},\nauthor={Jaeeun Jang and Seokjun Kim and Hyeoncheol Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=f4c4JtbHJ7B}\n}", "github": "", "project": "", "reviewers": "8WRU;Kcad;roRV;u8GW", "site": "https://openreview.net/forum?id=f4c4JtbHJ7B", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "2;2;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "53;22;93;53", "wc_summary_review": "44;19;42;42", "wc_main_review": "335;397;363;184", "wc_review": "432;438;498;279", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.25, 25.202926417382564 ], "wc_summary_review_avg": [ 36.75, 10.280442597476044 ], "wc_main_review_avg": [ 319.75, 81.39218328562025 ], "wc_review_avg": [ 411.75, 80.8714257324551 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:psYHx9ZVM20J:scholar.google.com/&scioq=Pixab-CAM:+Attend+Pixel,+not+Channel&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea University", "aff_unique_dep": "", "aff_unique_url": "https://www.korea.ac.kr", "aff_unique_abbr": "KU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "f5ggjj9Rfq", "title": "Faking Interpolation Until You Make It", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep over-parameterized neural networks exhibit the interpolation property on many data sets. That is, these models are able to achieve approximately zero loss on all training samples simultaneously. Recently, this property has been exploited to develop novel optimisation algorithms for this setting. These algorithms use the fact that the optimal loss value is known to employ a variation of a Polyak step-size calculated on a stochastic batch of data. We introduce a novel extension of this idea to tasks where the interpolation property does not hold. 
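For context on the interpolation setting just described, here is a minimal sketch of a stochastic Polyak-style step that uses an estimate `f_star` of the optimal loss; when interpolation holds one can simply fix `f_star = 0`. The clamping and the online update rule for `f_star` below are guesses at the described heuristic, not the paper's exact algorithm.

```python
# Stochastic Polyak step with an online estimate of the optimal loss value.
# lr = (loss - f*) / ||grad||^2, clamped for safety; f* starts at a known lower
# bound (zero) and is periodically moved toward the best loss seen so far.
import torch

def polyak_step(params, loss, f_star, max_lr=1.0, eps=1e-8):
    grads = torch.autograd.grad(loss, params)
    grad_sq = sum((g ** 2).sum() for g in grads)
    lr = torch.clamp((loss.detach() - f_star) / (grad_sq + eps), 0.0, max_lr)
    with torch.no_grad():
        for p, g in zip(params, grads):
            p -= lr * g
    return lr

f_star, best_loss = 0.0, float("inf")  # lower bound; best mini-batch loss seen
# Inside the training loop, every K steps (assumed update schedule):
#   best_loss = min(best_loss, loss.item())
#   f_star = f_star + 0.5 * (best_loss - f_star)  # move toward best iterate
```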
As we no longer have access to the optimal loss values a priori, we instead estimate these for each sample online. To realise this, we introduce a simple but highly effective heuristic for approximating the optimal value based on previous loss evaluations. This heuristic starts by setting the approximate optimal values to a known lower bound on the loss function, typically zero. It then updates them at fixed intervals through training in the direction of the best iterate visited so far. We provide rigorous experimentation on a wide range of problems including two natural language processing tasks, popular vision benchmarks and the challenging ImageNet classification data set. From our empirical analysis we demonstrate the effectiveness of our approach, which in the non-interpolating setting, outperforms state of the art baselines, namely adaptive gradient and line search methods.", "keywords": "Deep Learning;Optimisation;Step-size selection", "primary_area": "", "supplementary_material": "/attachment/1c4f2ca5e22ab1a5ab6dbe4cafcf9476283a9dd2.zip", "author": "Alasdair Paren;Rudra Poudel;M. Pawan Kumar", "authorids": "~Alasdair_Paren1;~Rudra_Poudel1;~M._Pawan_Kumar1", "gender": "M;M;", "homepage": "https://alasdair-p.github.io/Alasdair-P/;https://www.rudrapoudel.com;", "dblp": "312/6594;08/11431;45/2527", "google_scholar": "Mcq6dQIAAAAJ;https://scholar.google.co.uk/citations?user=Rw4cmbUAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0003-5933-5243;;", "linkedin": "alasdair-paren-a66b88113/;https://linkedin.com/in/rudrapoudel;", "or_profile": "~Alasdair_Paren1;~Rudra_Poudel1;~M._Pawan_Kumar1", "aff": "University of Oxford;Toshiba Europe Ltd;Google DeepMind", "aff_domain": "ox.ac.uk;crl.toshiba.co.uk;deepmind.com", "position": "PhD student;Senior Research Scientist;Researcher", "bibtex": "@misc{\nparen2022faking,\ntitle={Faking Interpolation Until You Make It},\nauthor={Alasdair Paren and Rudra Poudel and M. 
Pawan Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=f5ggjj9Rfq}\n}", "github": "", "project": "", "reviewers": "nLXD;xiix;QDW3;C5yy", "site": "https://openreview.net/forum?id=f5ggjj9Rfq", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;3;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "96;104;119;34", "wc_summary_review": "89;26;60;19", "wc_main_review": "689;323;458;345", "wc_review": "874;453;637;398", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 88.25, 32.39116391857508 ], "wc_summary_review_avg": [ 48.5, 28.0579756931964 ], "wc_main_review_avg": [ 453.75, 145.15745761069255 ], "wc_review_avg": [ 590.5, 186.07592536381486 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2154786781750101280&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Oxford;Toshiba;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://www.ox.ac.uk;https://www.toshiba.co.uk;https://deepmind.com", "aff_unique_abbr": "Oxford;Toshiba Europe;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "f6CQliwyra", "title": "A Free Lunch from the Noise: Provable and Practical Exploration for Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Representation learning lies at the heart of the empirical success of deep learning for dealing with the curse of dimensionality. However, the power of representation learning has not been fully exploited yet in reinforcement learning (RL), due to i), the trade-off between expressiveness and tractability; and ii), the coupling between exploration and representation learning. In this paper, we first reveal the fact that under some noise assumption in the stochastic control model, we can obtain the linear spectral feature of its corresponding Markov transition operator in closed-form for free. Based on this observation, we propose Spectral Dynamics Embedding (SPEDE), which breaks the trade-off and completes optimistic exploration for representation learning by exploiting the structure of the noise. 
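A small numerical sketch of the "free" spectral feature the SPEDE abstract alludes to, under the assumption of Gaussian transition noise: the transition density is then a Gaussian kernel between f(s, a) and s', which random Fourier features factorize into an approximately linear form. All sizes, the noise scale, and the dynamics stand-in below are dummies for illustration.

```python
# With s' = f(s, a) + eps, eps ~ N(0, sigma^2 I), the (unnormalized) transition
# density exp(-||f(s,a) - s'||^2 / (2 sigma^2)) is a Gaussian kernel, and random
# Fourier features give a linear factorization phi(f(s,a)) . phi(s') of it.
import numpy as np

rng = np.random.default_rng(0)
d, D, sigma = 4, 256, 0.1                 # state dim, feature dim, noise scale

W = rng.normal(scale=1.0 / sigma, size=(D, d))   # RFF frequencies for bandwidth sigma
b = rng.uniform(0.0, 2 * np.pi, size=D)

def phi(u):
    """Random Fourier feature; phi(u) . phi(v) ~= exp(-||u - v||^2 / (2 sigma^2))."""
    return np.sqrt(2.0 / D) * np.cos(W @ u + b)

f_sa = rng.normal(size=d)                 # stand-in for the dynamics output f(s, a)
s_next = f_sa + 0.01 * rng.normal(size=d)
print(phi(f_sa) @ phi(s_next),            # linear-feature approximation
      np.exp(-np.sum((f_sa - s_next) ** 2) / (2 * sigma ** 2)))  # exact kernel
```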
We provide rigorous theoretical analysis of SPEDE, and demonstrate its superior practical performance over existing state-of-the-art empirical algorithms on several benchmarks.\n", "keywords": "representation learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Tongzheng Ren;Tianjun Zhang;Csaba Szepesvari;Bo Dai", "authorids": "~Tongzheng_Ren1;~Tianjun_Zhang1;~Csaba_Szepesvari1;~Bo_Dai1", "gender": "M;;M;", "homepage": "https://www.cs.utexas.edu/~tzren/;https://tianjunz.github.io;https://sites.ualberta.ca/~szepesva/;https://bo-dai.github.io/", "dblp": "211/8004;;http://dblp.uni-trier.de/pers/hd/s/Szepesv=aacute=ri:Csaba;64/2903", "google_scholar": "VgNDYeYAAAAJ;UE9jz_MAAAAJ;https://scholar.google.ca/citations?user=zvC19mQAAAAJ;TIKl_foAAAAJ", "orcid": ";;;0009-0002-8070-574X", "linkedin": ";;csaba-szepesvari-09376b1?trk=hp-identity-name;", "or_profile": "~Tongzheng_Ren1;~Tianjun_Zhang1;~Csaba_Szepesvari1;~Bo_Dai1", "aff": "Google;University of California, Berkeley;Google DeepMind;Google Brain", "aff_domain": "google.com;berkeley.edu;google.com;google.com", "position": "Intern;PhD student;Research Scientist;Research Scientist", "bibtex": "@misc{\nren2022a,\ntitle={A Free Lunch from the Noise: Provable and Practical Exploration for Representation Learning},\nauthor={Tongzheng Ren and Tianjun Zhang and Csaba Szepesvari and Bo Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=f6CQliwyra}\n}", "github": "", "project": "", "reviewers": "8eEv;e9su;amjV;azcj", "site": "https://openreview.net/forum?id=f6CQliwyra", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;3;3", "correctness": "3;4;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "60;62;48;40", "wc_summary_review": "119;40;17;67", "wc_main_review": "462;633;327;140", "wc_review": "641;735;392;247", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "782;1191;816;307", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.5, 8.986100377805714 ], "wc_summary_review_avg": [ 60.75, 38.0024670251814 ], "wc_main_review_avg": [ 390.5, 180.7629663399005 ], "wc_review_avg": [ 503.75, 194.11513980109845 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 774.0, 313.769820091098 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7276068751089989, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7632946949412805126&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;University of California, Berkeley", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.berkeley.edu", "aff_unique_abbr": "Google;UC Berkeley", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Mountain View;Berkeley;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "f6R69En9_tH", "title": "Cross Project Software Vulnerability Detection via Domain Adaptation and Max-Margin Principle", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Software 
vulnerabilities (SVs) have become a common, serious and crucial concern due to the ubiquity of computer software. Many machine learning-based approaches have been proposed to solve the software vulnerability detection (SVD) problem. However, two significant issues remain for SVD: i) learning automatic representations to improve the predictive performance of SVD, and ii) tackling the scarcity of labeled vulnerability datasets, which conventionally require laborious labeling effort by experts. In this paper, we propose a novel end-to-end approach to tackle these two crucial issues. We first exploit automatic representation learning with deep domain adaptation for software vulnerability detection. We then propose a novel cross-domain kernel classifier leveraging the max-margin principle to significantly improve the transfer learning process of software vulnerabilities from labeled projects into unlabeled ones. The experimental results on real-world software datasets show the superiority of our proposed method over state-of-the-art baselines.", "keywords": "Cybersecurity;Cross Project Software Vulnerability Detection;Domain Adaptation;Max-Margin Principle.", "primary_area": "", "supplementary_material": "", "author": "Van Nguyen;Trung Le;John C. Grundy;Dinh Phung", "authorids": "~Van_Nguyen2;~Trung_Le2;~John_C._Grundy1;~Dinh_Phung2", "gender": "M;M;M;M", "homepage": ";;https://sites.google.com/site/johncgrundy;https://research.monash.edu/en/persons/dinh-phung", "dblp": ";;g/JohnCGrundy.html;71/5859", "google_scholar": "KPpmKZ0AAAAJ;https://scholar.google.com/citations?hl=en;bbEQGY8AAAAJ;https://scholar.google.com.au/citations?user=OtA9SwIAAAAJ", "orcid": "0000-0002-5838-3409;;0000-0003-4928-7076;0000-0002-9977-8247", "linkedin": ";;jgrundy/;https://linkedin.com/in/dinh-phung-6b537a6", "or_profile": "~Van_Nguyen2;~Trung_Le2;~John_C._Grundy1;~Dinh_Phung1", "aff": "Monash University;Monash University;Monash University;Monash University", "aff_domain": "monash.edu;monash.edu;monash.edu;monash.edu", "position": "Postdoc;Assistant Professor;ARC Laureate Professor;Full Professor", "bibtex": "@misc{\nnguyen2022cross,\ntitle={Cross Project Software Vulnerability Detection via Domain Adaptation and Max-Margin Principle},\nauthor={Van Nguyen and Trung Le and John C. 
Grundy and Dinh Phung},\nyear={2022},\nurl={https://openreview.net/forum?id=f6R69En9_tH}\n}", "github": "", "project": "", "reviewers": "fdnR;cBfp;SmQT;R9uk", "site": "https://openreview.net/forum?id=f6R69En9_tH", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "226;96;104;48", "wc_summary_review": "54;46;61;26", "wc_main_review": "227;237;735;121", "wc_review": "507;379;900;195", "wc_reply_reviewers": "106;31;0;0", "wc_reply_authors": "1222;1329;807;168", "reply_reviewers": "2;1;0;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 118.5, 65.65630205852291 ], "wc_summary_review_avg": [ 46.75, 13.102957681378658 ], "wc_main_review_avg": [ 330.0, 238.20369434582665 ], "wc_review_avg": [ 495.25, 258.66230397953234 ], "wc_reply_reviewers_avg": [ 34.25, 43.31498008772485 ], "wc_reply_authors_avg": [ 881.5, 455.749108611306 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12223186658469015290&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Monash University", "aff_unique_dep": "", "aff_unique_url": "https://www.monash.edu", "aff_unique_abbr": "Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "id": "f7cWROZYSU", "title": "Detecting Worst-case Corruptions via Loss Landscape Curvature in Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The non-robustness of neural network policies to adversarial examples poses a challenge for deep reinforcement learning. One natural approach to mitigate the impact of adversarial examples is to develop methods to detect when a given input is adversarial. In this work, we introduce a novel approach for detecting adversarial examples that is computationally efficient, is agnostic to the method used to generate adversarial examples, and is theoretically well-motivated. Our method is based on a measure of the local curvature of the neural network policy, which we show differs between adversarial and clean examples. We empirically demonstrate the effectiveness of our method in the Atari environment against a large set of state-of-the-art algorithms for generating adversarial examples. Furthermore, we exhibit the effectiveness of our detection algorithm in the presence of multiple strong detection-aware adversaries. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f380f3e77302eae0a1889209fec5ef2460c8b67a.zip", "author": "Ezgi Korkmaz;Jonah Brown-Cohen", "authorids": "~Ezgi_Korkmaz2;~Jonah_Brown-Cohen1", "gender": ";M", "homepage": "https://ezgikorkmaz.github.io/;https://jonahbc.github.io/", "dblp": "300/7830.html;157/1513", "google_scholar": ";fRc3A80AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Ezgi_Korkmaz2;~Jonah_Brown-Cohen1", "aff": "University College London, University of London;Chalmers University", "aff_domain": "ucl.ac.uk;chalmers.se", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nkorkmaz2022detecting,\ntitle={Detecting Worst-case Corruptions via Loss Landscape Curvature in Deep Reinforcement Learning},\nauthor={Ezgi Korkmaz and Jonah Brown-Cohen},\nyear={2022},\nurl={https://openreview.net/forum?id=f7cWROZYSU}\n}", "github": "", "project": "", "reviewers": "ZsRu;x4K3;LjzU;Ccck", "site": "https://openreview.net/forum?id=f7cWROZYSU", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "3;5;4;4", "correctness": "4;2;4;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;2;3;4", "wc_summary_paper": "20;56;89;66", "wc_summary_review": "40;618;27;37", "wc_main_review": "68;43;205;138", "wc_review": "128;717;321;241", "wc_reply_reviewers": "0;471;41;0", "wc_reply_authors": "97;1174;375;97", "reply_reviewers": "0;3;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 57.75, 24.863376681376163 ], "wc_summary_review_avg": [ 180.5, 252.63659671551943 ], "wc_main_review_avg": [ 113.5, 63.27124149248219 ], "wc_review_avg": [ 351.75, 221.74464480568633 ], "wc_reply_reviewers_avg": [ 128.0, 198.73726374286227 ], "wc_reply_authors_avg": [ 435.75, 441.08013727666315 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6akefgSI_8gJ:scholar.google.com/&scioq=Detecting+Worst-case+Corruptions+via+Loss+Landscape+Curvature+in+Deep+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University College London;Chalmers University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucl.ac.uk;https://www.chalmers.se", "aff_unique_abbr": "UCL;Chalmers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Sweden" }, { "id": "f9AIc3mEprf", "title": "What classifiers know what they don't know?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Being uncertain when facing the unknown is key to intelligent decision making. However, machine learning algorithms lack reliable estimates about their predictive uncertainty. This leads to wrong and overly-confident decisions when encountering classes unseen during training. Despite the importance of equipping classifiers with uncertainty estimates ready for the real world, prior work has focused on small datasets and little or no class discrepancy between training and testing data. 
To close this gap, we introduce UIMNET: a realistic, ImageNet-scale test-bed to evaluate predictive uncertainty estimates for deep image classifiers. Our benchmark provides implementations of eight state-of-the-art algorithms, six uncertainty measures, four in-domain metrics, three out-domain metrics, and a fully automated pipeline to train, calibrate, ensemble, select, and evaluate models. Our test-bed is open-source and all of our results are reproducible from a fixed commit in our repository. Adding new datasets, algorithms, measures, or metrics is a matter of a few lines of code, in the hope that UIMNET becomes a stepping stone towards realistic, rigorous, and reproducible research in uncertainty estimation. Our results show that ensembles of ERM classifiers as well as single MIMO classifiers are the two best alternatives currently available to measure uncertainty about both in-domain and out-domain classes.", "keywords": "Uncertainty quantification;neural networks;benchmark", "primary_area": "", "supplementary_material": "/attachment/1875fa9f20628049c95e05487e184b95d8236126.zip", "author": "Mohamed Ishmael Belghazi;David Lopez-Paz", "authorids": "~Mohamed_Ishmael_Belghazi1;~David_Lopez-Paz2", "gender": "M;", "homepage": "https://github.com/ishmaelbelghazi/;http://lopezpaz.org", "dblp": "https://dblp.org/pers/b/Belghazi:Mohamed_Ishmael;74/10481", "google_scholar": "https://scholar.google.com/citations?authuser=1;", "orcid": ";", "linkedin": ";", "or_profile": "~Mohamed_Ishmael_Belghazi1;~David_Lopez-Paz2", "aff": "Universit\u00e9 Claude Bernard (Lyon I);Meta Facebook", "aff_domain": "univ-lyon1.fr;fb.com", "position": "PhD student;Research Scientist", "bibtex": "@misc{\nbelghazi2022what,\ntitle={What classifiers know what they don't know?},\nauthor={Mohamed Ishmael Belghazi and David Lopez-Paz},\nyear={2022},\nurl={https://openreview.net/forum?id=f9AIc3mEprf}\n}", "github": "", "project": "", "reviewers": "tETq;R3aT;tta3", "site": "https://openreview.net/forum?id=f9AIc3mEprf", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "3;2;3", "technical_novelty": "2;1;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "67;19;35", "wc_summary_review": "15;27;38", "wc_main_review": "147;292;149", "wc_review": "229;338;222", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 40.333333333333336, 19.955506062794353 ], "wc_summary_review_avg": [ 26.666666666666668, 9.392668535736915 ], "wc_main_review_avg": [ 196.0, 67.88716128007317 ], "wc_review_avg": [ 263.0, 53.10994884827763 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:g7oU9iDjhWIJ:scholar.google.com/&scioq=What+classifiers+know+what+they+don%27t+know%3F&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Universit\u00e9 Claude Bernard;Meta", "aff_unique_dep": ";Meta Platforms, 
Inc.", "aff_unique_url": "https://www.universite-lyon1.fr;https://meta.com", "aff_unique_abbr": "UCBL;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Lyon;", "aff_country_unique_index": "0;1", "aff_country_unique": "France;United States" }, { "title": "Online Coreset Selection for Rehearsal-based Continual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6362", "id": "f9D-5WNG4Nv", "poster": "", "openreview": "https://openreview.net/forum?id=f9D-5WNG4Nv", "slides": "https://iclr.cc/virtual/2022/poster/6362", "video": "https://iclr.cc/virtual/2022/poster/6362", "author_site": "Jaehong Yoon, Divyam Madaan, Eunho Yang, Sung Ju Hwang", "tldr": "", "abstract": "A dataset is a shred of crucial evidence to describe a task. However, each data point in the dataset does not have the same potential, as some of the data points can be more representative or informative than others. This unequal importance among the data points may have a large impact in rehearsal-based continual learning, where we store a subset of the training examples (coreset) to be replayed later to alleviate catastrophic forgetting. In continual learning, the quality of the samples stored in the coreset directly affects the model's effectiveness and efficiency. The coreset selection problem becomes even more important under realistic settings, such as imbalanced continual learning or noisy data scenarios. To tackle this problem, we propose Online Coreset Selection (OCS), a simple yet effective method that selects the most representative and informative coreset at each iteration and trains them in an online manner. Our proposed method maximizes the model's adaptation to a target dataset while selecting high-affinity samples to past tasks, which directly inhibits catastrophic forgetting. We validate the effectiveness of our coreset selection mechanism over various standard, imbalanced, and noisy datasets against strong continual learning baselines, demonstrating that it improves task adaptation and prevents catastrophic forgetting in a sample-efficient manner. 
", "keywords": "Continual Learning", "primary_area": "", "supplementary_material": "/attachment/886ff124cff5783349d6831b7eee770158a15bef.zip", "author": "Jaehong Yoon;Divyam Madaan;Eunho Yang;Sung Ju Hwang", "authorids": "~Jaehong_Yoon1;~Divyam_Madaan1;~Eunho_Yang1;~Sung_Ju_Hwang1", "gender": "M;M;M;", "homepage": "https://jaehong31.github.io/;https://dmadaan.com/;https://sites.google.com/site/hleehome2/;", "dblp": "203/4449;239/4899;96/2621;", "google_scholar": "-5comoUAAAAJ;DNk4dZkAAAAJ;;", "orcid": ";;;", "linkedin": "jaehongyoon/;;;", "or_profile": "~Jaehong_Yoon1;~Divyam_Madaan1;~Eunho_Yang1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science and Technology (KAIST);New York University;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;nyu.edu;kaist.ac.kr;", "position": "PhD student;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nyoon2022online,\ntitle={Online Coreset Selection for Rehearsal-based Continual Learning},\nauthor={Jaehong Yoon and Divyam Madaan and Eunho Yang and Sung Ju Hwang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=f9D-5WNG4Nv}\n}", "github": "", "project": "", "reviewers": "btdc;iADy;mR25;XVBE", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;3", "correctness": "2;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;0;3", "wc_summary_paper": "85;32;37;56", "wc_summary_review": "337;34;57;28", "wc_main_review": "80;526;123;96", "wc_review": "502;592;217;180", "wc_reply_reviewers": "29;18;0;29", "wc_reply_authors": "1707;2375;1119;710", "reply_reviewers": "2;1;0;1", "reply_authors": "7;8;4;5", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 52.5, 20.79062288629179 ], "wc_summary_review_avg": [ 114.0, 129.20332813050908 ], "wc_main_review_avg": [ 206.25, 185.24628876174552 ], "wc_review_avg": [ 372.75, 177.6138719244643 ], "wc_reply_reviewers_avg": [ 19.0, 11.853269591129697 ], "wc_reply_authors_avg": [ 1477.75, 627.6453516915425 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 6.0, 1.5811388300841898 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.3244428422615251, "gs_citation": 180, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3307069416491959783&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=f9D-5WNG4Nv", "email": "kaist.ac.kr;nyu.edu;kaist.ac.kr;", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.nyu.edu", "aff_unique_abbr": "KAIST;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "South Korea;United States" }, { "id": "f9JwVXMJ1Up", "title": "The Needle in the haystack: Out-distribution aware Self-training in an Open-World Setting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Traditional semi-supervised learning (SSL) has focused on the closed world assumption where all unlabeled samples are task-related. 
In practice, this assumption is often violated when leveraging data from very large image databases that contain mostly non-task-relevant samples. While standard self-training and other established methods fail in this open-world setting, we demonstrate that our out-distribution-aware self-learning (ODST) with a careful sample selection strategy can leverage unlabeled datasets with millions of samples, more than 1600 times larger than the labeled datasets, and which contain only about $2\\%$ task-relevant inputs. Standard and open world SSL techniques degrade in performance when the ratio of task-relevant samples decreases and show a significant distribution shift, which is problematic regarding AI safety, while ODST outperforms them with respect to test performance, corruption robustness and out-of-distribution detection.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1deccf3d9685d63d7d0e80a60213fb906e965966.zip", "author": "Maximilian Augustin;Matthias Hein", "authorids": "~Maximilian_Augustin1;~Matthias_Hein2", "gender": "M;M", "homepage": "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/maschinelles-lernen/news/;https://uni-tuebingen.de/de/164260", "dblp": "210/2432;97/1213-1", "google_scholar": "https://scholar.google.de/citations?user=f82UrTYAAAAJ;0ZAb3tsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Maximilian_Augustin1;~Matthias_Hein2", "aff": "University of Tuebingen;University of T\u00fcbingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;Full Professor", "bibtex": "@misc{\naugustin2022the,\ntitle={The Needle in the haystack: Out-distribution aware Self-training in an Open-World Setting},\nauthor={Maximilian Augustin and Matthias Hein},\nyear={2022},\nurl={https://openreview.net/forum?id=f9JwVXMJ1Up}\n}", "github": "", "project": "", "reviewers": "EKkN;kxNQ;xS21;b8X7;3Ko3", "site": "https://openreview.net/forum?id=f9JwVXMJ1Up", "pdf_size": 0, "recommendation": "3;5;5;5;8", "confidence": "2;4;3;4;4", "correctness": "2;3;3;4;4", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "43;51;72;43;27", "wc_summary_review": "50;23;30;58;25", "wc_main_review": "152;166;405;365;331", "wc_review": "245;240;507;466;383", "wc_reply_reviewers": "0;0;422;154;71", "wc_reply_authors": "676;1445;1590;1756;235", "reply_reviewers": "0;0;2;1;1", "reply_authors": "1;3;3;3;1", "recommendation_avg": [ 5.2, 1.6 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 47.2, 14.647866738880444 ], "wc_summary_review_avg": [ 37.2, 14.133647795243801 ], "wc_main_review_avg": [ 283.8, 104.65065694968187 ], "wc_review_avg": [ 368.2, 110.14790057009712 ], "wc_reply_reviewers_avg": [ 129.4, 156.91985215389417 ], "wc_reply_authors_avg": [ 1140.4, 584.6881561995249 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 2.2, 0.9797958971132712 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.71875, "corr_recommendation_correctness": 0.8017837257372732, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1919238141083886331&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Tuebingen;University of T\u00fcbingen", "aff_unique_dep": 
";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Dynamic Token Normalization improves Vision Transformers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5898", "id": "f9MHpAGUyMn", "poster": "", "openreview": "https://openreview.net/forum?id=f9MHpAGUyMn", "slides": "https://iclr.cc/virtual/2022/poster/5898", "video": "https://iclr.cc/virtual/2022/poster/5898", "author_site": "Wenqi Shao, Yixiao Ge, Zhaoyang Zhang, XUYUAN XU, Xiaogang Wang, Ying Shan, Ping Luo", "tldr": "", "abstract": "Vision Transformer (ViT) and its variants (e.g., Swin, PVT) have achieved great success in various computer vision tasks, owing to their capability to learn long-range contextual information. Layer Normalization (LN) is an essential ingredient in these models. However, we found that the ordinary LN makes tokens at different positions similar in magnitude because it normalizes embeddings within each token. It is difficult for Transformers to capture inductive bias such as the positional context in an image with LN. We tackle this problem by proposing a new normalizer, termed Dynamic Token Normalization (DTN), where normalization is performed both within each token (intra-token) and across different tokens (inter-token). DTN has several merits. Firstly, it is built on a unified formulation and thus can represent various existing normalization methods. Secondly, DTN learns to normalize tokens in both intra-token and inter-token manners, enabling Transformers to capture both the global contextual information and the local positional context. Thirdly, by simply replacing LN layers, DTN can be readily plugged into various vision transformers, such as ViT, Swin, and PVT. Extensive experiments show that the transformer equipped with DTN consistently outperforms baseline model with minimal extra parameters and computational overhead. 
For example, DTN outperforms LN on small ViT by $1.1\\%$ top-1 accuracy on ImageNet.", "keywords": "classification;Normalization;transformer", "primary_area": "", "supplementary_material": "", "author": "Wenqi Shao;Yixiao Ge;Zhaoyang Zhang;XUYUAN XU;Xiaogang Wang;Ying Shan;Ping Luo", "authorids": "~Wenqi_Shao2;~Yixiao_Ge2;~Zhaoyang_Zhang1;~XUYUAN_XU1;~Xiaogang_Wang2;~Ying_Shan2;~Ping_Luo2", "gender": "M;F;M;M;M;M;", "homepage": "https://wqshao126.github.io/;https://geyixiao.com/;https://zzyfd.github.io/#/;;http://www.ee.cuhk.edu.hk/~xgwang/;;http://luoping.me/", "dblp": "227/3122;228/6649;;;91/6236-1.html;68/5910;54/4989-2.html", "google_scholar": "Bs9mrwwAAAAJ;TtU74NAAAAAJ;Pf6o7uAAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=-B5JgjsAAAAJ;4oXBp9UAAAAJ;https://scholar.google.com.hk/citations?hl=en", "orcid": ";;;;;0000-0001-7673-8325;0000-0002-6685-7950", "linkedin": ";;;;;YingShanProfile/;", "or_profile": "~Wenqi_Shao2;~Yixiao_Ge2;~Zhaoyang_Zhang1;~XUYUAN_XU1;~Xiaogang_Wang2;~Ying_Shan2;~Luo_Ping2", "aff": "The Chinese University of Hong Kong;Tencent;The Chinese University of Hong Kong;PCG AI Technology Center;The Chinese University of Hong Kong;Tencent PCG ARC Lab;The University of Hong Kong", "aff_domain": "cuhk.edu.hk;tencent.com;cuhk.edu.hk;tencent.com;cuhk.edu.hk;arc.tencent.com;hku.hk", "position": "PhD student;Researcher;PhD student;expert engineer;Full Professor;Director;Assistant Professor", "bibtex": "@inproceedings{\nshao2022dynamic,\ntitle={Dynamic Token Normalization improves Vision Transformers},\nauthor={Wenqi Shao and Yixiao Ge and Zhaoyang Zhang and XUYUAN XU and Xiaogang Wang and Ying Shan and Ping Luo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=f9MHpAGUyMn}\n}", "github": "", "project": "", "reviewers": "PWvF;AKdH;g5rX;j3Et", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;2;4;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "95;61;107;49", "wc_summary_review": "52;36;138;28", "wc_main_review": "341;245;574;188", "wc_review": "488;342;819;265", "wc_reply_reviewers": "0;0;108;40", "wc_reply_authors": "778;388;2034;807", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;3;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 78.0, 23.769728648009426 ], "wc_summary_review_avg": [ 63.5, 43.871972830042644 ], "wc_main_review_avg": [ 337.0, 147.35162028291376 ], "wc_review_avg": [ 478.5, 212.2763528987626 ], "wc_reply_reviewers_avg": [ 37.0, 44.12482294582042 ], "wc_reply_authors_avg": [ 1001.75, 618.5104586827938 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15420753792733755461&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=f9MHpAGUyMn", "email": "cuhk.edu.hk;tencent.com;cuhk.edu.hk;tencent.com;cuhk.edu.hk;arc.tencent.com;hku.hk", "author_num": 7, "aff_unique_index": "0;1;0;2;0;1;3", "aff_unique_norm": "Chinese University of Hong Kong;Tencent;PCG AI Technology 
Center;University of Hong Kong", "aff_unique_dep": ";Tencent Holdings Limited;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com;;https://www.hku.hk", "aff_unique_abbr": "CUHK;Tencent;;HKU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China;" }, { "title": "LORD: Lower-Dimensional Embedding of Log-Signature in Neural Rough Differential Equations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6473", "id": "fCG75wd39ze", "poster": "", "openreview": "https://openreview.net/forum?id=fCG75wd39ze", "slides": "https://iclr.cc/virtual/2022/poster/6473", "video": "https://iclr.cc/virtual/2022/poster/6473", "author_site": "Jaehoon Lee, Jeon Jinsung, Sheo yon Jhin, Jihyeon Hyeong, Jayoung Kim, Minju Jo, Kook Seungji, Noseong Park", "tldr": "", "abstract": "The problem of processing very long time-series data (e.g., a length of more than 10,000) is a long-standing research problem in machine learning. Recently, one breakthrough, called neural rough differential equations (NRDEs), has been proposed and has shown that it is able to process such data. Their main concept is to use the log-signature transform, which is known to be more efficient than the Fourier transform for irregular long time-series, to convert a very long time-series sample into a relatively shorter series of feature vectors. However, the log-signature transform causes non-trivial spatial overheads. To this end, we present the method of LOweR-Dimensional embedding of log-signature (LORD), where we define an NRDE-based autoencoder to implant the higher-depth log-signature knowledge into the lower-depth log-signature. We show that the encoder successfully combines the higher-depth and the lower-depth log-signature knowledge, which greatly stabilizes the training process and increases the model accuracy. 
In our experiments with benchmark datasets, the improvement ratio by our method is up to 75\\% in terms of various classification and forecasting evaluation metrics.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/2766ee250764ecb2befd7d8a98a36a95aef018cd.zip", "author": "JAEHOON LEE;Jinsung Jeon;Sheo yon Jhin;Jihyeon Hyeong;Jayoung Kim;Minju Jo;Kook Seungji;Noseong Park", "authorids": "~JAEHOON_LEE5;~Jinsung_Jeon1;~Sheo_yon_Jhin1;~Jihyeon_Hyeong1;~Jayoung_Kim1;~Minju_Jo1;~Kook_Seungji1;~Noseong_Park1", "gender": "M;;F;;F;F;;", "homepage": "https://sites.google.com/view/jaehoonlee/home;https://sites.google.com/view/npark/home?authuser=0;https://sheoyonj.space/;;;;;", "dblp": "95/386-2;294/0098;280/3334.html;;26/9969-2;236/8419.html;;", "google_scholar": "x521bdIAAAAJ;0R6W6lsAAAAJ;S_EBNdgAAAAJ;https://scholar.google.co.kr/citations?user=rgGIgXsAAAAJ;3qbSHGwAAAAJ;xLVtvn8AAAAJ;HVciBmAAAAAJ;", "orcid": ";0000-0002-9693-2739;;;;0000-0002-7908-5005;;", "linkedin": "jaehoon-lee-36125a1a7/;jinsung-jeon-994942289/;sheoyon-jhin/;;;;;", "or_profile": "~JAEHOON_LEE5;~Jinsung_Jeon1;~Sheo_yon_Jhin1;~Jihyeon_Hyeong1;~Jayoung_Kim1;~Minju_Jo1;~Kook_Seungji1;~Noseong_Park1", "aff": "Yonsei University;Yonsei University;Yonsei University;Yonsei University;Yonsei University;Yonsei University;Yonsei University;", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;", "position": "Undergrad student;PhD student;MS student;MS student;MS student;MS student;MS student;", "bibtex": "@inproceedings{\nlee2022lord,\ntitle={{LORD}: Lower-Dimensional Embedding of Log-Signature in Neural Rough Differential Equations},\nauthor={JAEHOON LEE and Jinsung Jeon and Sheo yon Jhin and Jihyeon Hyeong and Jayoung Kim and Minju Jo and Kook Seungji and Noseong Park},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fCG75wd39ze}\n}", "github": "", "project": "", "reviewers": "RtnZ;yePm;GYq6;ncTs", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "4;4;4;2", "wc_summary_paper": "58;35;78;74", "wc_summary_review": "58;33;51;30", "wc_main_review": "303;1153;646;222", "wc_review": "419;1221;775;326", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 61.25, 16.90229274388537 ], "wc_summary_review_avg": [ 43.0, 11.811011811017716 ], "wc_main_review_avg": [ 581.0, 366.59719038748784 ], "wc_review_avg": [ 685.25, 351.79281900004725 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9583526015589000772&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=fCG75wd39ze", "email": "yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;", "author_num": 8, "aff_unique_index": 
"0;0;0;0;0;0;0", "aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "A fast and accurate splitting method for optimal transport: analysis and implementation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6984", "id": "fCSq8yrDkc", "poster": "", "openreview": "https://openreview.net/forum?id=fCSq8yrDkc", "slides": "https://iclr.cc/virtual/2022/poster/6984", "video": "https://iclr.cc/virtual/2022/poster/6984", "author_site": "Vien Mai, Jacob Lindb\u00e4ck, Mikael Johansson", "tldr": "", "abstract": "We develop a fast and reliable method for solving large-scale optimal transport (OT) problems at an unprecedented combination of speed and accuracy. Built on the celebrated Douglas-Rachford splitting technique, our method tackles the original OT problem directly instead of solving an approximate regularized problem, as many state-of-the-art techniques do. This allows us to provide sparse transport plans and avoid numerical issues of methods that use entropic regularization. The algorithm has the same cost per iteration as the popular Sinkhorn method, and each iteration can be executed efficiently, in parallel. The proposed method enjoys an iteration complexity $O(1/\\epsilon)$ compared to the best-known $O(1/\\epsilon^2)$ of the Sinkhorn method. In addition, we establish a linear convergence rate for our formulation of the OT problem. We detail an efficient GPU implementation of the proposed method that maintains a primal-dual stopping criterion at no extra cost. Substantial experiments demonstrate the effectiveness of our method, both in terms of computation times and robustness.", "keywords": "Optimal transport;Operator splitting;Douglas-Rachford;ADMM;GPUs", "primary_area": "", "supplementary_material": "/attachment/e3e662eed75085f17bddd7cc0809b5be2d68f7b7.zip", "author": "Vien V. Mai;Jacob Lindb\u00e4ck;Mikael Johansson", "authorids": "~Vien_V._Mai2;jlindbac@kth.se;~Mikael_Johansson3", "gender": ";;M", "homepage": ";;https://people.KTH.se/~mikaelj", "dblp": ";;53/764-1", "google_scholar": ";;wQSRT18AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Vien_V._Mai2;jlindbac@kth.se;~Mikael_Johansson3", "aff": ";;KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": ";;kth.se", "position": ";;Full Professor", "bibtex": "@inproceedings{\nmai2022a,\ntitle={A fast and accurate splitting method for optimal transport: analysis and implementation},\nauthor={Vien V. 
Mai and Jacob Lindb{\\"a}ck and Mikael Johansson},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fCSq8yrDkc}\n}", "github": "", "project": "", "reviewers": "Jdt4;szP9;6N6U;qeU8", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;3;3;4", "correctness": "3;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;2;0", "wc_summary_paper": "152;93;52;78", "wc_summary_review": "28;36;43;14", "wc_main_review": "798;143;184;221", "wc_review": "978;272;279;313", "wc_reply_reviewers": "1056;163;0;0", "wc_reply_authors": "1662;310;378;397", "reply_reviewers": "5;1;0;0", "reply_authors": "5;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 93.75, 36.69042790701684 ], "wc_summary_review_avg": [ 30.25, 10.779030568655049 ], "wc_main_review_avg": [ 336.5, 267.87170436610137 ], "wc_review_avg": [ 460.5, 299.1809653036102 ], "wc_reply_reviewers_avg": [ 304.75, 438.80939768879153 ], "wc_reply_authors_avg": [ 686.75, 563.9890845574939 ], "reply_reviewers_avg": [ 1.5, 2.0615528128088303 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12225190131562700632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=fCSq8yrDkc", "email": ";;kth.se", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "id": "fE-sp8USacG", "title": "DropAttack: A Masked Weight Adversarial Training Method to Improve Generalization of Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Adversarial training has been proven to be a powerful regularization method to improve generalization of models. In this work, a novel masked weight adversarial training method, DropAttack, is proposed for improving the generalization potential of neural network models. It enhances the coverage and diversity of adversarial attacks by intentionally adding worst-case adversarial perturbations to both the input and hidden layers and randomly masking the attack perturbations on a certain proportion of weight parameters. It then improves the generalization of neural networks by minimizing the internal adversarial risk generated by exponentially different attack combinations. Further, the method is a general technique that can be applied to a wide variety of neural networks with different architectures. To validate the effectiveness of the proposed method, five public datasets were used in the fields of natural language processing (NLP) and computer vision (CV) for experimental evaluation. This study compared DropAttack with other adversarial training methods and regularization methods. It was found that the proposed method achieves state-of-the-art performance on all datasets. 
In addition, the experimental results of this study show that the DropAttack method can achieve similar performance when using only half of the training data required in standard training. Theoretical analysis revealed that DropAttack can perform gradient regularization at random on some of the input and weight parameters of the model. Further, visualization experiments of this study show that DropAttack can push the minimum risk of the neural network model to lower and flatter loss landscapes.", "keywords": "Adversarial training;random mask;regularization;generalization;neural networks", "primary_area": "", "supplementary_material": "/attachment/b9822f2f76c5170e78ae6ce408a3c1cd6dec4110.zip", "author": "Shiwen Ni;Jiawen Li;Hung-Yu Kao", "authorids": "~Shiwen_Ni1;p78073012@gs.ncku.edu.tw;~Hung-Yu_Kao1", "gender": ";;M", "homepage": ";;http://140.116.245.107/advisor.html", "dblp": "279/5319.html;;64/5833.html", "google_scholar": "https://scholar.google.com.tw/citations?user=ln4hmCwAAAAJ;;https://scholar.google.com.tw/citations?user=X5Is2lAAAAAJ", "orcid": "0000-0002-4986-4446;;0000-0002-8890-8544", "linkedin": ";;", "or_profile": "~Shiwen_Ni1;p78073012@gs.ncku.edu.tw;~Hung-Yu_Kao1", "aff": "National Cheng Kung University;;CSIE", "aff_domain": "ncku.edu.tw;;csie.ncku.edu.tw", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nni2022dropattack,\ntitle={DropAttack: A Masked Weight Adversarial Training Method to Improve Generalization of Neural Networks},\nauthor={Shiwen Ni and Jiawen Li and Hung-Yu Kao},\nyear={2022},\nurl={https://openreview.net/forum?id=fE-sp8USacG}\n}", "github": "", "project": "", "reviewers": "NJkv;QBE8;eB4n;8CGX", "site": "https://openreview.net/forum?id=fE-sp8USacG", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;5;5;4", "correctness": "2;3;2;2", "technical_novelty": "3;2;1;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "56;82;68;44", "wc_summary_review": "36;46;14;94", "wc_main_review": "515;400;314;920", "wc_review": "607;528;396;1058", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.5, 14.097872179871684 ], "wc_summary_review_avg": [ 47.5, 29.236107812087436 ], "wc_main_review_avg": [ 537.25, 232.2018249282292 ], "wc_review_avg": [ 647.25, 248.83867766084919 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6660294166158058855&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "National Cheng Kung University;College of Computer Science and Information Engineering", "aff_unique_dep": ";", "aff_unique_url": "https://www.ncku.edu.tw;", "aff_unique_abbr": "NCKU;CSIE", "aff_campus_unique_index": "0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0", "aff_country_unique": "China;" }, { "id": "fEcbkaHqlur", "title": "Beyond Target Networks: Improving Deep $Q$-learning with Functional Regularization", "track": "main", 
"status": "Reject", "tldr": "", "abstract": "A majority of recent successes in deep Reinforcement Learning are based on minimization of square Bellman error. The training is often unstable due to a fast-changing target $Q$-values, and target networks are employed to stabilize by using an additional set of lagging parameters. Despite their advantages, target networks could inhibit the propagation of newly-encountered rewards which may ultimately slow down the training. In this work, we address this issue by augmenting the squared Bellman error with a functional regularizer. Unlike target networks', the regularization here is explicit which not only enables us to use up-to-date parameters but also control the regularization. This leads to a fast yet stable training method. Across a range of Atari environments, we demonstrate empirical improvements over target-network based methods in terms of both sample efficiency and performance. In summary, our approach provides a fast and stable alternative to replace the standard squared Bellman error.", "keywords": "Q learning;regularization;deep Q learning", "primary_area": "", "supplementary_material": "/attachment/585c0dbc35f6d332b5c3ca81a773e5b9c58b8c71.zip", "author": "Alexandre Pich\u00e9;Joseph Marino;Gian Maria Marconi;Christopher Pal;Mohammad Emtiyaz Khan", "authorids": "~Alexandre_Pich\u00e91;~Joseph_Marino1;~Gian_Maria_Marconi1;~Christopher_Pal1;~Mohammad_Emtiyaz_Khan1", "gender": "M;M;;M;M", "homepage": "http://joelouismarino.github.io;;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao;https://emtiyaz.github.io/;https://github.com/AlexPiche", "dblp": "31/8756;222/3259;45/1217;58/10432;", "google_scholar": "LTprTF0AAAAJ;;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0001-6387-8062;;;;", "linkedin": ";;;;", "or_profile": "~Joseph_Marino1;~Gian_Maria_Marconi1;~Christopher_Pal1;~Mohammad_Emtiyaz_Khan1;~Alexandre_Piche1", "aff": "Google DeepMind;RIKEN;Polytechnique Montreal;RIKEN Center for AI Project;University of Montreal", "aff_domain": "deepmind.com;riken.jp;polymtl.ca;riken.jp;umontreal.ca", "position": "Research Scientist;Postdoc;Full Professor;Full Professor;PhD student", "bibtex": "@misc{\npich{\\'e}2022beyond,\ntitle={Beyond Target Networks: Improving Deep \\$Q\\$-learning with Functional Regularization},\nauthor={Alexandre Pich{\\'e} and Joseph Marino and Gian Maria Marconi and Christopher Pal and Mohammad Emtiyaz Khan},\nyear={2022},\nurl={https://openreview.net/forum?id=fEcbkaHqlur}\n}", "github": "", "project": "", "reviewers": "Gxgv;PwAu;ugJ6;yew3", "site": "https://openreview.net/forum?id=fEcbkaHqlur", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;3;4", "correctness": "3;2;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "67;76;165;72", "wc_summary_review": "104;51;35;73", "wc_main_review": "410;277;202;527", "wc_review": "581;404;402;672", "wc_reply_reviewers": "193;0;0;12", "wc_reply_authors": "818;522;346;420", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 95.0, 40.54010360124897 ], "wc_summary_review_avg": [ 65.75, 25.878321042911576 ], "wc_main_review_avg": [ 354.0, 124.59735149673126 ], 
"wc_review_avg": [ 514.75, 116.29139048098101 ], "wc_reply_reviewers_avg": [ 51.25, 81.98589817767443 ], "wc_reply_authors_avg": [ 526.5, 179.5236753188838 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7385489458759963, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16570024354423362713&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "Google;RIKEN;Polytechnique Montreal;University of Montreal", "aff_unique_dep": "Google DeepMind;;;", "aff_unique_url": "https://deepmind.com;https://www.riken.jp;https://www.polymtl.ca;https://wwwumontreal.ca", "aff_unique_abbr": "DeepMind;RIKEN;PolyMTL;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;2;1;2", "aff_country_unique": "United Kingdom;Japan;Canada" }, { "title": "Learning to Dequantise with Truncated Flows", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6123", "id": "fExcSKdDo_", "poster": "", "openreview": "https://openreview.net/forum?id=fExcSKdDo_", "slides": "https://iclr.cc/virtual/2022/poster/6123", "video": "https://iclr.cc/virtual/2022/poster/6123", "author_site": "Shawn Tan, Chin-Wei Huang, Alessandro Sordoni, Aaron Courville", "tldr": "", "abstract": "Dequantisation is a general technique used for transforming data described by a discrete random variable $x$ into a continuous (latent) random variable $z$, for the purpose of it being modeled by likelihood-based density models. Dequantisation was first introduced in the context of ordinal data, such as image pixel values. However, when the data is categorical, the dequantisation scheme is not obvious.\nWe learn such a dequantisation scheme $q(z | x)$, using variational inference with TRUncated FLows (TRUFL) --- a novel flow-based model that allows the dequantiser to have a learnable truncated support. Unlike previous work, the TRUFL dequantiser is (i) capable of embedding the data losslessly in certain cases, since the truncation allows the conditional distributions $q(z | x)$ to have non-overlapping bounded supports, while being (ii) trainable with back-propagation. Addtionally, since the support of the marginal $q(z)$ is bounded and the support of prior $p(z)$ is not, we propose renormalising the prior distribution over the support of $q(z)$. We derive a lower bound for training, and propose a rejection sampling scheme to account for the invalid samples during generation.\nExperimentally, we benchmark TRUFL on constrained generation tasks, and find that it outperforms prior approaches. 
In addition, we find that rejection sampling results in higher validity for the constrained problems.", "keywords": "variational inference;variational bayes;dequantisation;normalizing flows", "primary_area": "", "supplementary_material": "/attachment/b2cd22cbe66bf3e5020be2c1e8b0eeeeb8de523f.zip", "author": "Shawn Tan;Chin-Wei Huang;Alessandro Sordoni;Aaron Courville", "authorids": "~Shawn_Tan1;~Chin-Wei_Huang1;~Alessandro_Sordoni2;~Aaron_Courville3", "gender": "M;M;;M", "homepage": "https://blog.wtf.sg;https://chinweihuang.com/;;", "dblp": ";87/7431;56/1688;57/7642", "google_scholar": "57Nf7EYAAAAJ;0sxcBnwAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;", "orcid": ";;;", "linkedin": "tanshawn/;;;", "or_profile": "~Shawn_Tan1;~Chin-Wei_Huang1;~Aaron_Courville3;~Alessandro_Sordoni1", "aff": "Universit\u00e9 de Montr\u00e9al;University of Montreal;Universit\u00e9 de Montr\u00e9al;Microsoft", "aff_domain": "umontreal.ca;umontreal.ca; ;microsoft.com", "position": "PhD student;PhD student;Assistant Professor;Researcher", "bibtex": "@inproceedings{\ntan2022learning,\ntitle={Learning to Dequantise with Truncated Flows},\nauthor={Shawn Tan and Chin-Wei Huang and Alessandro Sordoni and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fExcSKdDo_}\n}", "github": "", "project": "", "reviewers": "zxth;Pdpw;QH7t", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;4", "correctness": "4;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "77;102;59", "wc_summary_review": "35;55;19", "wc_main_review": "486;269;189", "wc_review": "598;426;267", "wc_reply_reviewers": "172;268;0", "wc_reply_authors": "1145;1303;178", "reply_reviewers": "2;2;0", "reply_authors": "5;5;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 79.33333333333333, 17.632041540584257 ], "wc_summary_review_avg": [ 36.333333333333336, 14.727148022916348 ], "wc_main_review_avg": [ 314.6666666666667, 125.47598265095287 ], "wc_review_avg": [ 430.3333333333333, 135.16491984074696 ], "wc_reply_reviewers_avg": [ 146.66666666666666, 110.86728803193283 ], "wc_reply_authors_avg": [ 875.3333333333334, 497.29021260784486 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 4.0, 1.4142135623730951 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4545468046085577275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=fExcSKdDo_", "email": "umontreal.ca;umontreal.ca; ;microsoft.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;University of Montreal;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.umontreal.ca;https://wwwumontreal.ca;https://www.microsoft.com", "aff_unique_abbr": "UdeM;UM;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Canada;United States" }, { "id": "fG9WttDhAaa", "title": "Rethinking Positional Encoding", 
"track": "main", "status": "Withdraw", "tldr": "", "abstract": "It is well noted that coordinate based MLPs benefit greatly -- in terms of preserving high-frequency information -- through the encoding of coordinate positions as an array of Fourier features. Hitherto, the rationale for the effectiveness of these positional encodings has been solely studied through a Fourier lens. In this paper, we strive to broaden this understanding by showing that alternative non-Fourier embedding functions can indeed be used for positional encoding. Moreover, we show that their performance is entirely determined by a trade-off between the stable rank of the embedded matrix and the distance preservation between embedded coordinates. We further establish that the now ubiquitous Fourier feature mapping of position is a special case that fulfills these conditions. Consequently, we present a more general theory to analyze positional encoding in terms of shifted basis functions. To this end, we develop the necessary theoretical formulae and empirically verify that our theoretical claims hold in practice.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/ace82ca225222534b877a9e958edcaa9ecaba764.zip", "author": "Jianqiao Zheng;Sameera Ramasinghe;Simon Lucey", "authorids": "~Jianqiao_Zheng1;~Sameera_Ramasinghe1;simon.lucey@adelaide.edu.au", "gender": "M;M;", "homepage": "https://github.com/osiriszjq;;", "dblp": "296/3803;181/4514;", "google_scholar": "https://scholar.google.com.hk/citations?view_op=list_works;https://scholar.google.com.au/citations?user=-j0m9aMAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jianqiao_Zheng1;~Sameera_Ramasinghe1;simon.lucey@adelaide.edu.au", "aff": "The University of Adelaide;Amazon;", "aff_domain": "adelaide.edu.au;amazon.com;", "position": "PhD student;Researcher;", "bibtex": "@misc{\nzheng2022rethinking,\ntitle={Rethinking Positional Encoding},\nauthor={Jianqiao Zheng and Sameera Ramasinghe and Simon Lucey},\nyear={2022},\nurl={https://openreview.net/forum?id=fG9WttDhAaa}\n}", "github": "", "project": "", "reviewers": "k3Xo;9v6t;6zRL;EYpJ", "site": "https://openreview.net/forum?id=fG9WttDhAaa", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;3;4", "correctness": "3;2;3;2", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "32;67;76;138", "wc_summary_review": "18;23;38;113", "wc_main_review": "296;436;680;1050", "wc_review": "346;526;794;1301", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 78.25, 38.21239981995373 ], "wc_summary_review_avg": [ 48.0, 38.242646351945886 ], "wc_main_review_avg": [ 615.5, 286.02928171779894 ], "wc_review_avg": [ 741.75, 360.0891382699567 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16241816499025775310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Adelaide;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": 
"https://www.adelaide.edu.au;https://www.amazon.com", "aff_unique_abbr": "Adelaide;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Australia;United States" }, { "id": "fGEoHDk0C", "title": "A framework of deep neural networks via the solution operator of partial differential equations", "track": "main", "status": "Reject", "tldr": "", "abstract": "There is a close connection between deep neural networks (DNN) and partial differential equations (PDEs). Many DNN architectures can be modeled by PDEs and have been proposed in the literature. However, their neural network design space is restricted due to the specific form of PDEs, which prevents the design of more effective neural network structures. In this paper, we attempt to derive a general form of PDEs for the design of ResNet-like DNN. To achieve this goal, we first formulate DNN as an adjustment operator applied on the base classifier. Then based on several reasonable assumption, we show the adjustment operator for ResNet-like DNN is the solution operator of PDEs. To show the effectiveness for general form of PDEs, we show that several effective networks can be interpreted by our general form of PDEs and design a training method motivated by PDEs theory to train DNN models for better robustness and less chance of overfitting. Theoretically, we prove that the robustness of DNN trained with our method is certifiable and our training method reduces the generalization gap for DNN. Furthermore, we demonstrate that DNN trained with our method can achieve better generalization and is more resistant to adversarial perturbations than baseline model.", "keywords": "deep neural networks;partial differential equations;solution operator", "primary_area": "", "supplementary_material": "/attachment/43796894aa571aa48d97c4d283b87482fcbc7af0.zip", "author": "Wenqi Tao;Zuoqiang Shi", "authorids": "~Wenqi_Tao1;~Zuoqiang_Shi1", "gender": "M;M", "homepage": "http://www.aminer.cn/profile/61406b36e55422cecdad1167;https://shizqi.github.io/", "dblp": ";18/1960", "google_scholar": ";", "orcid": ";0000-0002-9122-0302", "linkedin": ";", "or_profile": "~Wenqi_Tao1;~Zuoqiang_Shi1", "aff": ";Tsinghua University", "aff_domain": ";tsinghua.edu.cn", "position": ";Associate Professor", "bibtex": "@misc{\ntao2022a,\ntitle={A framework of deep neural networks via the solution operator of partial differential equations},\nauthor={Wenqi Tao and Zuoqiang Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=fGEoHDk0C}\n}", "github": "", "project": "", "reviewers": "mSDj;zQvx;ZDUG;LaXY", "site": "https://openreview.net/forum?id=fGEoHDk0C", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "103;73;100;108", "wc_summary_review": "37;33;78;23", "wc_main_review": "136;284;1018;440", "wc_review": "276;390;1196;571", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "367;381;166;322", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.0, 13.583077707206124 ], "wc_summary_review_avg": [ 42.75, 20.980645843252777 ], "wc_main_review_avg": [ 469.5, 334.42301057194015 ], "wc_review_avg": [ 608.25, 
355.267768732262 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 309.0, 85.39028047734706 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jZdV4MTiQUMJ:scholar.google.com/&scioq=A+framework+of+deep+neural+networks+via+the+solution+operator+of+partial+differential+equations&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "fHPdmN3I0tY", "title": "Decoupled Kernel Neural Processes: Neural Network-Parameterized Stochastic Processes using Explicit Data-driven Kernel", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural Processes (NPs) are a class of stochastic processes parametrized by neural networks. Unlike traditional stochastic processes (e.g., Gaussian processes), which require specifying explicit kernel functions, NPs implicitly learn kernel functions appropriate for a given task through observed data. While this data-driven learning of stochastic processes has been shown to model various types of data, the current NPs' implicit treatment of the mean and the covariance of the output variables limits their full potential when the underlying distribution of the given data is highly complex. To address this, we introduce a new class of neural stochastic processes, Decoupled Kernel Neural Processes (DKNPs), which explicitly learn a separate mean and kernel function to directly model the covariance between output variables in a data-driven manner. By estimating kernel functions with self- and mixed attentive neural networks, DKNPs demonstrate improved uncertainty estimation in terms of conditional likelihood and diversity in generated samples in 1-D and 2-D regression tasks, compared to other concurrent NP variants. 
Also, maintaining explicit kernel functions, a key component of stochastic processes, allows the model to reveal a deeper understanding of underlying distributions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daehoon Gwak;Gyubok Lee;Jaehoon Lee;Jaesik Choi;Jaegul Choo;Edward Choi", "authorids": "~Daehoon_Gwak1;~Gyubok_Lee1;~Jaehoon_Lee2;~Jaesik_Choi1;~Jaegul_Choo1;~Edward_Choi1", "gender": "M;M;;M;M;M", "homepage": ";https://sites.google.com/view/gyuboklee;https://jaehlee.github.io;https://sailab.kaist.ac.kr/jaesik;https://sites.google.com/site/jaegulchoo/;http://mp2893.com", "dblp": "276/7016;249/4944;95/386-1.html;13/1402;07/2074;41/3886", "google_scholar": "NyQ42l8AAAAJ;UYzauyYAAAAJ;d3YhiooAAAAJ;RqMLVzUAAAAJ;GHJYsLEAAAAJ;GUlGIPkAAAAJ", "orcid": ";;;;;", "linkedin": ";gyubok-lee-104915229;eejaehoon/;;;", "or_profile": "~Daehoon_Gwak1;~Gyubok_Lee1;~Jaehoon_Lee2;~Jaesik_Choi1;~Jaegul_Choo1;~Edward_Choi1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Google;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;google.com;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Research Scientist;Associate Professor;Associate Professor;Associate Professor", "bibtex": "@misc{\ngwak2022decoupled,\ntitle={Decoupled Kernel Neural Processes: Neural Network-Parameterized Stochastic Processes using Explicit Data-driven Kernel},\nauthor={Daehoon Gwak and Gyubok Lee and Jaehoon Lee and Jaesik Choi and Jaegul Choo and Edward Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=fHPdmN3I0tY}\n}", "github": "", "project": "", "reviewers": "1GBU;Xxs5;Ngox;65De", "site": "https://openreview.net/forum?id=fHPdmN3I0tY", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;4;2;4", "correctness": "3;2;1;4", "technical_novelty": "3;2;2;4", "empirical_novelty": "3;2;3;1", "wc_summary_paper": "68;86;46;75", "wc_summary_review": "10;31;38;22", "wc_main_review": "259;670;210;177", "wc_review": "337;787;294;274", "wc_reply_reviewers": "152;53;0;0", "wc_reply_authors": "429;1096;41;326", "reply_reviewers": "1;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 68.75, 14.618053906043718 ], "wc_summary_review_avg": [ 25.25, 10.473180032826706 ], "wc_main_review_avg": [ 329.0, 199.02638016102287 ], "wc_review_avg": [ 423.0, 211.3847203560371 ], "wc_reply_reviewers_avg": [ 51.25, 62.06196500272933 ], "wc_reply_authors_avg": [ 473.0, 386.748626371187 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.18731716231633877, "corr_recommendation_correctness": 0.5477225575051661, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_CIsbsO7JYQJ:scholar.google.com/&scioq=Decoupled+Kernel+Neural+Processes:+Neural+Network-Parameterized+Stochastic+Processes+using+Explicit+Data-driven+Kernel&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Google", "aff_unique_dep": 
";Google", "aff_unique_url": "https://www.kaist.ac.kr;https://www.google.com", "aff_unique_abbr": "KAIST;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "South Korea;United States" }, { "id": "fHeK814NOMO", "title": "Trainable Learning Rate", "track": "main", "status": "Reject", "tldr": "", "abstract": "Selecting an appropriate learning rate for efficiently training deep neural networks is a difficult process that can be affected by numerous parameters, such as the dataset, the model architecture or even the batch size. In this work, we propose an algorithm for automatically adjusting the learning rate during gradient descent. The rationale behind our approach is to train the learning rate along with the model weight, akin to line-search. Contrary to existing approaches, learning rate is optimized via a simple extra gradient descent step, justified by an analysis that takes into consideration the structure of a neural network loss function. We formulate first and second-order gradients with respect to learning rate as functions of consecutive weight gradients, leading to a cost-effective implementation. We also show that the scheme can be extended to accommodate for different learning rates per layer. Extensive experimental evaluation is conducted, validating the effectiveness of the proposed method for a plethora of different settings. The proposed method has proven to be robust to both the initial learning rate and the batch size, making it ideal for an off-the-shelf optimization scheme.", "keywords": "Gradient Descent;Adaptive Step Size;Adaptive Learning Rate", "primary_area": "", "supplementary_material": "/attachment/e8b73bca98d861c88569631002a2bb3f02d8850f.zip", "author": "George Retsinas;Giorgos Sfikas;Panagiotis Filntisis;Petros Maragos", "authorids": "~George_Retsinas2;~Giorgos_Sfikas1;~Panagiotis_Filntisis1;~Petros_Maragos1", "gender": "M;M;M;M", "homepage": "http://users.iit.demokritos.gr/~georgeretsi/;http://www.cs.uoi.gr/~sfikas;http://cvsp.cs.ntua.gr/pfilntisis/index.shtm;http://robotics.ntua.gr/members/maragos/", "dblp": "171/5669;01/747;210/1927;22/4003", "google_scholar": "https://scholar.google.gr/;X73G9lYAAAAJ;Z3Z8ipkAAAAJ;A2XydgGCY9gC", "orcid": ";0000-0002-7305-2886;;", "linkedin": "george-retsinas-9b073b88/;giorgos-sfikas-15a30484/;;petros-maragos-76087b92/", "or_profile": "~George_Retsinas2;~Giorgos_Sfikas1;~Panagiotis_Filntisis1;~Petros_Maragos1", "aff": "National Technical University of Athens;University of West Attica;National Technical University of Athens;National Technical University of Athens", "aff_domain": "ntua.gr;uniwa.gr;ntua.gr;ntua.gr", "position": "Postdoc;Instructor;PhD student;Full Professor", "bibtex": "@misc{\nretsinas2022trainable,\ntitle={Trainable Learning Rate},\nauthor={George Retsinas and Giorgos Sfikas and Panagiotis Filntisis and Petros Maragos},\nyear={2022},\nurl={https://openreview.net/forum?id=fHeK814NOMO}\n}", "github": "", "project": "", "reviewers": "hkZ3;RyYD;sGYf;9YY4;Q9e6;rR3n", "site": "https://openreview.net/forum?id=fHeK814NOMO", "pdf_size": 0, "recommendation": "3;5;6;8;8;10", "confidence": "5;5;3;4;4;5", "correctness": "3;4;4;3;4;4", "technical_novelty": "2;3;3;3;3;4", "empirical_novelty": "2;3;3;3;3;4", "wc_summary_paper": "54;73;109;107;155;101", "wc_summary_review": "58;133;36;32;47;351", "wc_main_review": "406;777;385;405;276;1354", "wc_review": "518;983;530;544;478;1806", "wc_reply_reviewers": "1595;721;0;0;0;56", 
"wc_reply_authors": "1888;2183;398;748;329;74", "reply_reviewers": "3;2;0;0;0;1", "reply_authors": "3;4;1;1;1;2", "recommendation_avg": [ 6.666666666666667, 2.2852182001336816 ], "confidence_avg": [ 4.333333333333333, 0.7453559924999298 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.5773502691896257 ], "empirical_novelty_avg": [ 3.0, 0.5773502691896257 ], "wc_summary_paper_avg": [ 99.83333333333333, 31.62497255159108 ], "wc_summary_review_avg": [ 109.5, 113.16764849844087 ], "wc_main_review_avg": [ 600.5, 371.193009093652 ], "wc_review_avg": [ 809.8333333333334, 477.24711162620525 ], "wc_reply_reviewers_avg": [ 395.3333333333333, 595.7168417591998 ], "wc_reply_authors_avg": [ 936.6666666666666, 805.9767711017208 ], "reply_reviewers_avg": [ 1.0, 1.1547005383792515 ], "reply_authors_avg": [ 2.0, 1.1547005383792515 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.1304656146106883, "corr_recommendation_correctness": 0.36099743619057784, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nEbNXSEZFIIJ:scholar.google.com/&scioq=Trainable+Learning+Rate&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "National Technical University of Athens;University of West Attica", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntua.gr;https://www.uoa.gr", "aff_unique_abbr": "NTUA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Greece" }, { "title": "Perceiver IO: A General Architecture for Structured Inputs & Outputs", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6269", "id": "fILj7WpI-g", "poster": "", "openreview": "https://openreview.net/forum?id=fILj7WpI-g", "slides": "https://iclr.cc/virtual/2022/poster/6269", "video": "https://iclr.cc/virtual/2022/poster/6269", "author_site": "Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, Fengning Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Henaff, Matthew Botvinick, Andrew Zisserman, Oriol Vinyals, Joao Carreira", "tldr": "", "abstract": "A central goal of machine learning is the development of systems that can solve many problems in as many data domains as possible. Current architectures, however, cannot be applied beyond a small set of stereotyped settings, as they bake in domain & task assumptions or scale poorly to large inputs or outputs. In this work, we propose Perceiver IO, a general-purpose architecture that handles data from arbitrary settings while scaling linearly with the size of inputs and outputs. Our model augments the Perceiver with a flexible querying mechanism that enables outputs of various sizes and semantics, doing away with the need for task-specific architecture engineering. The same architecture achieves strong results on tasks spanning natural language and visual understanding, multi-task and multi-modal reasoning, and StarCraft II. 
As highlights, Perceiver IO outperforms a Transformer-based BERT baseline on the GLUE language benchmark despite removing input tokenization and achieves state-of-the-art performance on Sintel optical flow estimation with no explicit mechanisms for multiscale correspondence.", "keywords": "Perceiver;BERT;natural language processing;optical flow;computer vision;multimodal;GLUE;ImageNet;StarCraft", "primary_area": "", "supplementary_material": "/attachment/3fc874e2f3eaa32a9fb8a6eecc715613204436b1.zip", "author": "Andrew Jaegle;Sebastian Borgeaud;Jean-Baptiste Alayrac;Carl Doersch;Catalin Ionescu;David Ding;Skanda Koppula;Daniel Zoran;Andrew Brock;Evan Shelhamer;Olivier J Henaff;Matthew Botvinick;Andrew Zisserman;Oriol Vinyals;Joao Carreira", "authorids": "~Andrew_Jaegle2;~Sebastian_Borgeaud1;~Jean-Baptiste_Alayrac2;~Carl_Doersch1;~Catalin_Ionescu1;~David_Ding2;~Skanda_Koppula1;~Daniel_Zoran1;~Andrew_Brock1;~Evan_Shelhamer2;~Olivier_J_Henaff1;~Matthew_Botvinick1;~Andrew_Zisserman1;~Oriol_Vinyals1;~Joao_Carreira1", "gender": ";M;;M;M;M;;M;;;;;;;M", "homepage": ";;;;http://catalinionescu.weebly.com/;;;;https://www.github.com/ajbrock;;https://www.olivierhenaff.com/;;;;", "dblp": ";;;12/8654;38/8484;;;18/9054;;;156/0035.html;98/5712;;05/726;61/5621-1", "google_scholar": ";-KzSL30AAAAJ;;SBTxvCoAAAAJ;https://scholar.google.co.uk/citations?user=hOl-5zcAAAAJ;Un10q9gAAAAJ;;1JQDH_AAAAAJ;https://scholar.google.co.uk/citations?user=NIxD36wAAAAJ;;Sx75CVsAAAAJ;;;https://scholar.google.co.uk/citations?user=NkzyCvUAAAAJ;https://scholar.google.pt/citations?user=IUZ-7_cAAAAJ", "orcid": ";;;;;;;;;;0000-0001-8183-9489;;;;", "linkedin": ";;;;;;;;;;;;;;jo%C3%A3o-carreira-56238a7/", "or_profile": "~Andrew_Jaegle2;~Sebastian_Borgeaud1;~Jean-Baptiste_Alayrac2;~Carl_Doersch1;~Catalin_Ionescu1;~David_Ding2;~Skanda_Koppula1;~Daniel_Zoran1;~Andrew_Brock1;~Evan_Shelhamer2;~Olivier_J_Henaff1;~Matthew_Botvinick1;~Andrew_Zisserman1;~Oriol_Vinyals1;~Joao_Carreira1", "aff": ";Google DeepMind;;Google DeepMind;Google;Google DeepMind;;Google DeepMind;Google DeepMind;;Google DeepMind;Google DeepMind;;Electrical Engineering & Computer Science Department;Google DeepMind", "aff_domain": ";deepmind.com;;google.com;google.com;deepmind.com;;google.com;deepmind.com;;google.com;google.com;;eecs.berkeley.edu;google.com", "position": ";Researcher;;Research Scientist;Postdoc;Research Engineer;;Research Scientist;Research Scientist;;Research Scientist;Researcher;;Researcher;Research Scientist", "bibtex": "@inproceedings{\njaegle2022perceiver,\ntitle={Perceiver {IO}: A General Architecture for Structured Inputs \\& Outputs},\nauthor={Andrew Jaegle and Sebastian Borgeaud and Jean-Baptiste Alayrac and Carl Doersch and Catalin Ionescu and David Ding and Skanda Koppula and Daniel Zoran and Andrew Brock and Evan Shelhamer and Olivier J Henaff and Matthew Botvinick and Andrew Zisserman and Oriol Vinyals and Joao Carreira},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fILj7WpI-g}\n}", "github": "", "project": "", "reviewers": "h233;MYnu;qg6s;2doh", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;3;3;3", "correctness": "4;3;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;4;2;4", "wc_summary_paper": "60;220;160;119", "wc_summary_review": "67;80;155;31", "wc_main_review": "358;726;445;269", "wc_review": "485;1026;760;419", "wc_reply_reviewers": "0;27;257;0", "wc_reply_authors": "507;700;778;614", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;1;1", 
"recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 139.75, 58.396810700585355 ], "wc_summary_review_avg": [ 83.25, 45.14628999153751 ], "wc_main_review_avg": [ 449.5, 171.3366569068044 ], "wc_review_avg": [ 672.5, 240.85109507743576 ], "wc_reply_reviewers_avg": [ 71.0, 107.95137794396142 ], "wc_reply_authors_avg": [ 649.75, 100.78287304894617 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 15, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 698, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16141074922608388769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=fILj7WpI-g", "email": ";deepmind.com;;google.com;google.com;deepmind.com;;google.com;deepmind.com;;google.com;google.com;;eecs.berkeley.edu;google.com", "author_num": 15, "aff_unique_index": "0;0;0;0;0;0;0;0;1;0", "aff_unique_norm": "Google;Electrical Engineering & Computer Science Department", "aff_unique_dep": "Google DeepMind;Electrical Engineering & Computer Science", "aff_unique_url": "https://deepmind.com;", "aff_unique_abbr": "DeepMind;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0;0;0;0;0", "aff_country_unique": "United Kingdom;United States;" }, { "id": "fJ9iNyekd-", "title": "Positive and Unlabeled Federated Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study the problem of learning from positive and unlabeled (PU) data in the federated setting, where each client only labels a little part of their dataset due to the limitation of resources and time. Different from the settings in traditional PU learning where the negative class consists of a single class, the negative samples which cannot be identified by a client in the federated setting may come from multiple classes which are unknown to the client. Therefore, existing PU learning methods can be hardly applied in this situation. To address this problem, we propose a novel framework, namely Federated learning with Positive and Unlabeled data (FedPU), to minimize the expected risk of multiple negative classes by leveraging the labeled data in other clients. We theoretically prove that the proposed FedPU can achieve a generalization bound which is no worse than $C\\sqrt{C}$ times (where $C$ denotes the number of classes) of the fully-supervised model. Empirical experiments show that the FedPU can achieve much better performance than conventional learning methods which can only use positive data. 
", "keywords": "Positive and Unlabeled Learning;Federated Learning", "primary_area": "", "supplementary_material": "", "author": "Lin Xinyang;Hanting Chen;Yixing Xu;Chao Xu;XIAOLIN GUI;Yiping Deng;Yunhe Wang", "authorids": "~Lin_Xinyang1;~Hanting_Chen1;~Yixing_Xu2;~Chao_Xu1;~XIAOLIN_GUI1;~Yiping_Deng1;~Yunhe_Wang1", "gender": ";M;M;M;M;M;M", "homepage": "https://github.com/littleSunlxy;;;http://www.cis.pku.edu.cn/faculty/vision/xuchao/xuchao01.htm;http://gr.xjtu.edu.cn/web/xlgui;https://scholar.google.com/citations?user=niP8flkAAAAJ&hl=zh-CN;https://www.wangyunhe.site/", "dblp": ";232/2060;142/1013;;;;63/8217-1", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;32tJoOkAAAAJ;https://scholar.google.co.uk/citations?hl=zh-CN;;niP8flkAAAAJ;https://scholar.google.com.sg/citations?user=isizOkYAAAAJ", "orcid": ";;;;0000-0003-4384-9891;;0000-0002-0142-509X", "linkedin": ";;;;;;", "or_profile": "~Lin_Xinyang1;~Hanting_Chen1;~Yixing_Xu2;~Chao_Xu1;~XIAOLIN_GUI1;~Yiping_Deng1;~Yunhe_Wang1", "aff": "Xi'an Jiaotong University;Peking University;Advanced Micro Devices;Peking University;Xi'an Jiaotong University;;Huawei Noah's Ark Lab", "aff_domain": "xjtu.edu;pku.edu.cn;amd.com;pku.edu;xjtu.edu;;huawei.com", "position": "MS student;PhD student;Principal Researcher;Full Professor;Full Professor;;Principal Researcher", "bibtex": "@misc{\nxinyang2022positive,\ntitle={Positive and Unlabeled Federated Learning},\nauthor={Lin Xinyang and Hanting Chen and Yixing Xu and Chao Xu and XIAOLIN GUI and Yiping Deng and Yunhe Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=fJ9iNyekd-}\n}", "github": "", "project": "", "reviewers": "t8iW;acYi;q15M;iK1i", "site": "https://openreview.net/forum?id=fJ9iNyekd-", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;3", "correctness": "2;3;4;2", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "91;74;141;49", "wc_summary_review": "42;27;41;51", "wc_main_review": "268;494;112;242", "wc_review": "401;595;294;342", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.75, 33.662850443775554 ], "wc_summary_review_avg": [ 40.25, 8.584142356694699 ], "wc_main_review_avg": [ 279.0, 137.48090776540573 ], "wc_review_avg": [ 408.0, 114.42246282963848 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jFs9SwtZyVYJ:scholar.google.com/&scioq=Positive+and+Unlabeled+Federated+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;0;3", "aff_unique_norm": "Xi'an Jiao Tong University;Peking University;Advanced Micro Devices, Inc.;Huawei", "aff_unique_dep": ";;;Noah's Ark Lab", "aff_unique_url": "https://www.xjtu.edu.cn;http://www.pku.edu.cn;https://www.amd.com;https://www.huawei.com", "aff_unique_abbr": "XJTU;Peking U;AMD;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": 
"China;United States" }, { "id": "fJIrkNKGBNI", "title": "Effective Polynomial Filter Adaptation for Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Neural Networks (GNNs) exploit signals from node features and the input graph topology to improve node classification task performance. However, these models tend to perform poorly on heterophilic graphs, where connected nodes have different labels. Recently proposed GNNs work across graphs having varying levels of homophily. Among these, models relying on polynomial graph filters have shown promise. We observe that solutions to these polynomial graph filter models are also solutions to an overdetermined system of equations. It suggests that in some instances, the model needs to learn a reasonably high order polynomial. On investigation, we find the proposed models ineffective at learning such polynomials due to their designs. To mitigate this issue, we perform an eigendecomposition of the graph and propose to learn multiple adaptive polynomial filters acting on different subsets of the spectrum. We theoretically and empirically show that our proposed model learns a better filter, thereby improving classification accuracy. We study various aspects of our proposed model including, dependency on the number of eigencomponents utilized, latent polynomial filters learned, and performance of the individual polynomials on the node classification task. We further show that our model is scalable by evaluating over large graphs. Our model achieves performance gains of up to 10% over the state-of-the-art models and outperforms existing polynomial filter-based approaches in general.", "keywords": "Graph Neural Networks;Graph Filters", "primary_area": "", "supplementary_material": "", "author": "Vijay Lingam;Chanakya Ajit Ekbote;Manan Sharma;Rahul Ragesh;Arun Iyer;SUNDARARAJAN SELLAMANICKAM", "authorids": "~Vijay_Lingam1;~Chanakya_Ajit_Ekbote1;~Manan_Sharma1;~Rahul_Ragesh1;~Arun_Iyer1;~SUNDARARAJAN_SELLAMANICKAM2", "gender": "M;M;M;M;M;", "homepage": ";https://chanakyaekbote.netlify.app/;https://manan-s.github.io/;;;", "dblp": "219/1559.html;258/1037;;;262/6555;", "google_scholar": "FPOCruQAAAAJ;Jr2CK6QAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.co.in/citations?user=Ngm0j_EAAAAJ;https://scholar.google.co.in/citations?user=JOk66doAAAAJ", "orcid": ";;;;0000-0001-7377-7599;", "linkedin": ";chanakyaekbote/;manansharma2/;rahulragesh/;iyerarunshankar/;", "or_profile": "~Vijay_Lingam1;~Chanakya_Ajit_Ekbote1;~Manan_Sharma1;~Rahul_Ragesh1;~Arun_Iyer1;~SUNDARARAJAN_SELLAMANICKAM2", "aff": "Microsoft;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Microsoft;;Microsoft;Microsoft", "aff_domain": "microsoft.com;mila.umontreal.ca;microsoft.com;;microsoft.com;microsoft.com", "position": "Research Fellow;Intern;Researcher;;Principal Researcher;Principal Researcher", "bibtex": "@misc{\nlingam2022effective,\ntitle={Effective Polynomial Filter Adaptation for Graph Neural Networks},\nauthor={Vijay Lingam and Chanakya Ajit Ekbote and Manan Sharma and Rahul Ragesh and Arun Iyer and SUNDARARAJAN SELLAMANICKAM},\nyear={2022},\nurl={https://openreview.net/forum?id=fJIrkNKGBNI}\n}", "github": "", "project": "", "reviewers": "eu6Q;SUfF;kGGE;fpGz", "site": "https://openreview.net/forum?id=fJIrkNKGBNI", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "5;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "3;2;4;2", "empirical_novelty": "2;2;3;3", 
"wc_summary_paper": "80;74;163;64", "wc_summary_review": "351;30;113;26", "wc_main_review": "2568;161;560;341", "wc_review": "2999;265;836;431", "wc_reply_reviewers": "972;147;61;0", "wc_reply_authors": "6291;3637;2652;3150", "reply_reviewers": "8;3;1;0", "reply_authors": "14;8;7;8", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 95.25, 39.53084238920289 ], "wc_summary_review_avg": [ 130.0, 132.2365305050008 ], "wc_main_review_avg": [ 907.5, 969.0460515372838 ], "wc_review_avg": [ 1132.75, 1097.3140787851034 ], "wc_reply_reviewers_avg": [ 295.0, 394.33932089001723 ], "wc_reply_authors_avg": [ 3932.5, 1405.509605089912 ], "reply_reviewers_avg": [ 3.0, 3.082207001484488 ], "reply_authors_avg": [ 9.25, 2.7726341266023544 ], "replies_avg": [ 56, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QlgcyP7WyWMJ:scholar.google.com/&scioq=Effective+Polynomial+Filter+Adaptation+for+Graph+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Microsoft;University of Montreal", "aff_unique_dep": "Microsoft Corporation;Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.microsoft.com;https://www.mila.quebec", "aff_unique_abbr": "Microsoft;MILA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "fKv__asZk47", "title": "Learning Similarity Metrics for Volumetric Simulations with Multiscale CNNs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Simulations that produce three-dimensional data are ubiquitous in science, ranging from fluid flows to plasma physics. We propose a similarity model based on entropy, which allows for the creation of physically meaningful ground truth distances for the similarity assessment of scalar and vectorial data, produced from transport and motion-based simulations. Utilizing two data acquisition methods derived from this model, we create collections of fields from numerical PDE solvers and existing simulation data repositories, and highlight the importance of an appropriate data distribution for an effective training process. Furthermore, a multiscale CNN architecture that computes a volumetric similarity metric (VolSiM) is proposed. To the best of our knowledge this is the first learning method inherently designed to address the challenges arising for the similarity assessment of high-dimensional simulation data. Additionally, the tradeoff between a large batch size and an accurate correlation computation for correlation-based loss functions is investigated, and the metric's equivariance with respect to rotation and scale operations is analyzed. 
Finally, the robustness and generalization of VolSiM are evaluated on a wide range of test data, as well as on a particularly challenging turbulence case study that is close to potential real-world applications.", "keywords": "metric learning;PDEs;numerical simulation;physical modeling", "primary_area": "", "supplementary_material": "/attachment/5f7f1594574bbee08eb1adfb370e5670f7f41e98.zip", "author": "Georg Kohl;Liwei Chen;Nils Thuerey", "authorids": "~Georg_Kohl1;~Liwei_Chen2;~Nils_Thuerey1", "gender": "M;M;M", "homepage": "https://ge.in.tum.de/about/georg-kohl/;;https://ge.in.tum.de", "dblp": "259/1567;;42/478", "google_scholar": "https://scholar.google.de/citations?user=9gVgWocAAAAJ;;https://scholar.google.com.tw/citations?user=GEehwv8AAAAJ", "orcid": "0000-0002-9661-575X;0000-0002-0309-2284;", "linkedin": ";liwei-chen-46557017/;", "or_profile": "~Georg_Kohl1;~Liwei_Chen2;~Nils_Thuerey1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Technical University Munich;Technical University Munich", "aff_domain": "tum.de;tum.de;tum.de", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@misc{\nkohl2022learning,\ntitle={Learning Similarity Metrics for Volumetric Simulations with Multiscale {CNN}s},\nauthor={Georg Kohl and Liwei Chen and Nils Thuerey},\nyear={2022},\nurl={https://openreview.net/forum?id=fKv__asZk47}\n}", "github": "", "project": "", "reviewers": "f7VS;zceM;nzbf", "site": "https://openreview.net/forum?id=fKv__asZk47", "pdf_size": 0, "recommendation": "3;8;8", "confidence": "3;4;4", "correctness": "3;4;3", "technical_novelty": "2;4;3", "empirical_novelty": "2;4;3", "wc_summary_paper": "47;273;89", "wc_summary_review": "50;90;38", "wc_main_review": "356;167;323", "wc_review": "453;530;450", "wc_reply_reviewers": "0;8;0", "wc_reply_authors": "1283;177;778", "reply_reviewers": "0;1;0", "reply_authors": "2;1;1", "recommendation_avg": [ 6.333333333333333, 2.357022603955158 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 136.33333333333334, 98.14728161741867 ], "wc_summary_review_avg": [ 59.333333333333336, 22.23110933404409 ], "wc_main_review_avg": [ 282.0, 82.42572413997951 ], "wc_review_avg": [ 477.6666666666667, 37.02551672683163 ], "wc_reply_reviewers_avg": [ 2.6666666666666665, 3.7712361663282534 ], "wc_reply_authors_avg": [ 746.0, 452.0892242319724 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.5000000000000001, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4285194958875681657&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Technical University of Munich", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.tum.de", "aff_unique_abbr": "TUM;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "fM8VzFD_2-", "title": "Discovering the neural correlate informed nosological relation among multiple neuropsychiatric disorders through dual utilisation of diagnostic information", "track": "main", 
"status": "Reject", "tldr": "", "abstract": "The unravelled nosological relation among diverse types of neuropsychiatric disorders serves as an important precursor in advocating the dimensional approach to psychiatric classification. Leveraging high-dimensional abnormal resting-state functional connectivity, the crux of mining corresponded nosological relations is to derive a low-dimensional embedding space that preserves the diagnostic attributes of represented disorders. To accomplish this goal, we seek to exploit the available diagnostic information in learning the optimal embedding space by proposing a novel type of conditional variational auto-encoder that incorporates dual utilisation of diagnostic information. Encouraged by the achieved promising results in challenging the conventional approaches in low dimensional density estimation of synthetic functional connectivity features, we further implement our approach on two empirical neuropsychiatric neuroimaging datasets and discover a reliable nosological relation among autism spectrum disorder, major depressive disorder, and schizophrenia.", "keywords": "computational psychiatric;variational auto-encoder;fMRI analysis", "primary_area": "", "supplementary_material": "/attachment/91d7fd3fc562b9d439a83b8e1a69431a8721d55e.zip", "author": "Wenjun Bai;Tomoki Tokuda;Okito Yamashita;Junichiro Yoshimoto", "authorids": "~Wenjun_Bai1;t-tokuda@atr.jp;~Okito_Yamashita1;~Junichiro_Yoshimoto1", "gender": "M;;;M", "homepage": "https://github.com/LeonBai;;;https://researchmap.jp/jun-y?lang=en", "dblp": ";;;70/6041", "google_scholar": ";;;https://scholar.google.co.jp/citations?user=eE-w9f0AAAAJ", "orcid": "0000-0002-4448-484X;;0000-0002-6039-3657;0000-0001-7995-0321", "linkedin": ";;;", "or_profile": "~Wenjun_Bai1;t-tokuda@atr.jp;~Okito_Yamashita1;~Junichiro_Yoshimoto1", "aff": "ATR;;ATR;Fujita Health University", "aff_domain": "atr.jp;;atr.jp;fujita-hu.ac.jp", "position": "Postdoc;;Principal Researcher;Full Professor", "bibtex": "@misc{\nbai2022discovering,\ntitle={Discovering the neural correlate informed nosological relation among multiple neuropsychiatric disorders through dual utilisation of diagnostic information},\nauthor={Wenjun Bai and Tomoki Tokuda and Okito Yamashita and Junichiro Yoshimoto},\nyear={2022},\nurl={https://openreview.net/forum?id=fM8VzFD_2-}\n}", "github": "", "project": "", "reviewers": "Cbpb;94nu;odDt;bgiV;nZ9a", "site": "https://openreview.net/forum?id=fM8VzFD_2-", "pdf_size": 0, "recommendation": "1;5;6;6;8", "confidence": "5;3;2;3;4", "correctness": "2;3;3;3;3", "technical_novelty": "1;2;3;3;3", "empirical_novelty": "1;2;3;3;2", "wc_summary_paper": "57;155;55;83;143", "wc_summary_review": "167;63;23;62;70", "wc_main_review": "119;606;11;376;524", "wc_review": "343;824;89;521;737", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.2, 2.3151673805580453 ], "confidence_avg": [ 3.4, 1.019803902718557 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 98.6, 42.4904695196464 ], "wc_summary_review_avg": [ 77.0, 47.929114325219906 ], "wc_main_review_avg": [ 327.2, 228.98506501516644 ], "wc_review_avg": [ 502.8, 266.5020825434578 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 
0 ], "corr_recommendation_confidence": -0.5421393180573577, "corr_recommendation_correctness": 0.9070618468604282, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:--uTzqIn4oYJ:scholar.google.com/&scioq=Discovering+the+neural+correlate+informed+nosological+relation+among+multiple+neuropsychiatric+disorders+through+dual+utilisation+of+diagnostic+information&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Advanced Telecommunications Research Institute;Fujita Health University", "aff_unique_dep": ";", "aff_unique_url": "https://www.atr.jp;https://www.fujita-hu.ac.jp", "aff_unique_abbr": "ATR;FHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "fNCVBsB-N9p", "title": "MECATS: Mixture-of-Experts for Probabilistic Forecasts of Aggregated Time Series", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We introduce a mixture of heterogeneous experts framework called MECATS, which simultaneously forecasts the values of a set of time series that are related through an aggregation hierarchy. Different types of forecasting models can be employed as individual experts so that the form of each model can be tailored to the nature of the corresponding time series. MECATS learns hierarchical relationships during the training stage to help generalize better across all the time series being modeled and also mitigates coherency issues that arise due to constraints imposed by the hierarchy. We further build multiple quantile estimators on top of the point forecasts. The resulting probabilistic forecasts are nearly coherent, distribution-free, and independent of the choice of forecasting models. We conduct a comprehensive evaluation on both point and probabilistic forecasts and also formulate an extension for situations where change points exist in sequential data. 
In general, our method is robust, adaptive to datasets with different properties, and highly configurable and efficient for large-scale forecasting pipelines.", "keywords": "Time Series;Mixture-of-Experts;Data Aggregation;Uncertainty Estimation", "primary_area": "", "supplementary_material": "/attachment/299926e0ec2b4b725b3eb69682a2cd0eaeb77576.zip", "author": "Xing Han;Jing Hu;Joydeep Ghosh", "authorids": "~Xing_Han1;~Jing_Hu1;~Joydeep_Ghosh1", "gender": "M;F;M", "homepage": "https://aaronhan223.github.io/;;http://ideal.ece.utexas.edu/ghosh/", "dblp": "05/2143;;51/2272", "google_scholar": "Vejou24AAAAJ;;", "orcid": "0000-0003-0857-5506;;", "linkedin": "xing-han-628653b6/;hujing/;", "or_profile": "~Xing_Han1;~Jing_Hu1;~Joydeep_Ghosh1", "aff": "University of Texas at Austin;;University of Texas, Austin", "aff_domain": "utexas.edu;;utexas.edu", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nhan2022mecats,\ntitle={{MECATS}: Mixture-of-Experts for Probabilistic Forecasts of Aggregated Time Series},\nauthor={Xing Han and Jing Hu and Joydeep Ghosh},\nyear={2022},\nurl={https://openreview.net/forum?id=fNCVBsB-N9p}\n}", "github": "", "project": "", "reviewers": "tXgU;CTFm;apbf", "site": "https://openreview.net/forum?id=fNCVBsB-N9p", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;2", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "75;24;48", "wc_summary_review": "103;67;51", "wc_main_review": "577;386;122", "wc_review": "755;477;221", "wc_reply_reviewers": "283;0;0", "wc_reply_authors": "848;559;173", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 49.0, 20.83266665599966 ], "wc_summary_review_avg": [ 73.66666666666667, 21.74600857373345 ], "wc_main_review_avg": [ 361.6666666666667, 186.54817667890035 ], "wc_review_avg": [ 484.3333333333333, 218.06624885316137 ], "wc_reply_reviewers_avg": [ 94.33333333333333, 133.40747938386198 ], "wc_reply_authors_avg": [ 526.6666666666666, 276.5144159392458 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yrO_H4Iwy_UJ:scholar.google.com/&scioq=MECATS:+Mixture-of-Experts+for+Probabilistic+Forecasts+of+Aggregated+Time+Series&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Dual Lottery Ticket Hypothesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6084", "id": "fOsN52jn25l", "poster": "", "openreview": "https://openreview.net/forum?id=fOsN52jn25l", "slides": "https://iclr.cc/virtual/2022/poster/6084", "video": "https://iclr.cc/virtual/2022/poster/6084", "author_site": "Yue Bai, Huan Wang, Zhiqiang Tao, 
Kunpeng Li, Yun Fu", "tldr": "", "abstract": "Fully exploiting the learning capacity of neural networks requires overparameterized dense networks. On the other side, directly training sparse neural networks typically results in unsatisfactory performance. Lottery Ticket Hypothesis (LTH) provides a novel view to investigate sparse network training and maintain its capacity. Concretely, it claims there exist winning tickets from a randomly initialized network found by iterative magnitude pruning and preserving promising trainability (or we say being in trainable condition). In this work, we regard the winning ticket from LTH as the subnetwork which is in trainable condition and its performance as our benchmark, then go from a complementary direction to articulate the Dual Lottery Ticket Hypothesis (DLTH): Randomly selected subnetworks from a randomly initialized dense network can be transformed into a trainable condition and achieve admirable performance compared with LTH --- random tickets in a given lottery pool can be transformed into winning tickets. Specifically, by using uniform-randomly selected subnetworks to represent the general cases, we propose a simple sparse network training strategy, Random Sparse Network Transformation (RST), to substantiate our DLTH. Concretely, we introduce a regularization term to borrow learning capacity and realize information extrusion from the weights which will be masked. After finishing the transformation for the randomly selected subnetworks, we conduct the regular finetuning to evaluate the model using fair comparisons with LTH and other strong baselines. Extensive experiments on several public datasets and comparisons with competitive approaches validate our DLTH as well as the effectiveness of the proposed model RST. Our work is expected to pave a way for inspiring new research directions of sparse network training in the future. 
Our code is available at https://github.com/yueb17/DLTH.", "keywords": "Dual Lottery Ticket Hypothesis;Sparse Network Training", "primary_area": "", "supplementary_material": "", "author": "Yue Bai;Huan Wang;ZHIQIANG TAO;Kunpeng Li;Yun Fu", "authorids": "~Yue_Bai1;~Huan_Wang3;~ZHIQIANG_TAO2;~Kunpeng_Li1;~Yun_Fu1", "gender": "M;M;;M;M", "homepage": "https://yueb17.github.io/;https://huanwang.tech/;http://ztao.cc/;https://kunpengli1994.github.io/;http://www1.ece.neu.edu/~yunfu/", "dblp": "119/0848;70/6155-14;135/5229.html;;00/5815-1", "google_scholar": "https://scholar.google.com/citations?hl=en;0-On0y4AAAAJ;sEKglOkAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": ";0000-0001-6951-901X;;;0000-0002-5098-2853", "linkedin": ";huanwang-zju/;;;furaymond/", "or_profile": "~Yue_Bai1;~Huan_Wang3;~ZHIQIANG_TAO2;~Kunpeng_Li1;~Yun_Fu1", "aff": "Northeastern University;Northeastern University;Santa Clara University;Meta;Northeastern University", "aff_domain": "neu.edu;neu.edu;scu.edu;fb.com;northeastern.edu", "position": "PhD student;PhD student;Assistant Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nbai2022dual,\ntitle={Dual Lottery Ticket Hypothesis},\nauthor={Yue Bai and Huan Wang and ZHIQIANG TAO and Kunpeng Li and Yun Fu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fOsN52jn25l}\n}", "github": "", "project": "", "reviewers": "vhgZ;XCnY;jr1G;mwor;W7x5", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "4;5;4;4;4", "correctness": "2;2;3;3;3", "technical_novelty": "3;2;3;3;3", "empirical_novelty": "3;3;3;2;3", "wc_summary_paper": "87;111;70;59;47", "wc_summary_review": "80;69;51;45;44", "wc_main_review": "461;411;230;153;225", "wc_review": "628;591;351;257;316", "wc_reply_reviewers": "403;415;28;0;124", "wc_reply_authors": "1355;1698;512;542;838", "reply_reviewers": "3;1;1;0;2", "reply_authors": "4;5;2;1;3", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 74.8, 22.38213573366045 ], "wc_summary_review_avg": [ 57.8, 14.274452704044384 ], "wc_main_review_avg": [ 296.0, 118.57149741822442 ], "wc_review_avg": [ 428.6, 151.1828032548676 ], "wc_reply_reviewers_avg": [ 194.0, 180.3407885088673 ], "wc_reply_authors_avg": [ 989.0, 466.2136849128305 ], "reply_reviewers_avg": [ 1.4, 1.019803902718557 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6123724356957947, "corr_recommendation_correctness": 1.0, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3069306637615595875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=fOsN52jn25l", "email": "neu.edu;neu.edu;scu.edu;fb.com;northeastern.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Northeastern University;Santa Clara University;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.northeastern.edu;https://www.scu.edu;https://meta.com", "aff_unique_abbr": "NEU;SCU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Gradient Step 
Denoiser for convergent Plug-and-Play", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6192", "id": "fPhKeld3Okz", "poster": "", "openreview": "https://openreview.net/forum?id=fPhKeld3Okz", "slides": "https://iclr.cc/virtual/2022/poster/6192", "video": "https://iclr.cc/virtual/2022/poster/6192", "author_site": "Samuel Hurault, Arthur Leclaire, Nicolas Papadakis", "tldr": "", "abstract": "Plug-and-Play methods constitute a class of iterative algorithms for imaging problems where regularization is performed by an off-the-shelf denoiser. Although Plug-and-Play methods can lead to tremendous visual performance for various image problems, the few existing convergence guarantees are based on unrealistic (or suboptimal) hypotheses on the denoiser, or are limited to strongly convex data terms. In this work, we propose a new type of Plug-and-Play methods, based on half-quadratic splitting, for which the denoiser is realized as a gradient descent step on a functional parameterized by a deep neural network. Exploiting convergence results for proximal gradient descent algorithms in the non-convex setting, we show that the proposed Plug-and-Play algorithm is a convergent iterative scheme that targets stationary points of an explicit global functional. Moreover, experiments show that it is possible to learn such a deep denoiser while not compromising the performance in comparison to other state-of-the-art deep denoisers used in Plug-and-Play schemes. We apply our proximal gradient algorithm to various ill-posed inverse problems, e.g. deblurring, super-resolution and inpainting. For all these applications, numerical results empirically confirm the convergence results. Experiments also show that this new algorithm reaches state-of-the-art performance, both quantitatively and qualitatively.", "keywords": "Plug-and-Play;Inverse Problem;Image Restoration;Denoising", "primary_area": "", "supplementary_material": "/attachment/1e5855977beb186097e47f716e0ca0fb79449511.zip", "author": "Samuel Hurault;Arthur Leclaire;Nicolas Papadakis", "authorids": "~Samuel_Hurault1;arthur.leclaire@math.u-bordeaux.fr;~Nicolas_Papadakis3", "gender": "M;;M", "homepage": ";;https://www.math.u-bordeaux.fr/~npapadak/", "dblp": "239/3588;;70/1520", "google_scholar": "https://scholar.google.fr/citations?user=f_rtYCAAAAAJ;;https://scholar.google.fr/citations?user=hfyLiLYAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Samuel_Hurault1;arthur.leclaire@math.u-bordeaux.fr;~Nicolas_Papadakis3", "aff": "University of Bordeaux;;CNRS/IMB", "aff_domain": "u-bordeaux.fr;;u-bordeaux.fr", "position": "PhD student;;Researcher", "bibtex": "@inproceedings{\nhurault2022gradient,\ntitle={Gradient Step Denoiser for convergent Plug-and-Play},\nauthor={Samuel Hurault and Arthur Leclaire and Nicolas Papadakis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fPhKeld3Okz}\n}", "github": "", "project": "", "reviewers": "E8QG;GZzY;QQES;xYLt", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;5;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;2;4", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "204;184;53;109", "wc_summary_review": "24;59;131;73", "wc_main_review": "232;418;1132;260", "wc_review": "460;661;1316;442", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1111;880;2121;502", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 
0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 137.5, 60.28474102125678 ], "wc_summary_review_avg": [ 71.75, 38.58351331851468 ], "wc_main_review_avg": [ 510.5, 365.7632430958584 ], "wc_review_avg": [ 719.75, 354.8171183863597 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1153.5, 599.3990740733589 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3194499334349587754&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=fPhKeld3Okz", "email": "u-bordeaux.fr;;u-bordeaux.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Bordeaux;CNRS", "aff_unique_dep": ";Institut de Math\u00e9matiques de Bordeaux", "aff_unique_url": "https://www.u-bordeaux.fr;https://www.cnrs.fr", "aff_unique_abbr": "UBordeaux;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Fast Generic Interaction Detection for Model Interpretability and Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6556", "id": "fQTlgI2qZqE", "poster": "", "openreview": "https://openreview.net/forum?id=fQTlgI2qZqE", "slides": "https://iclr.cc/virtual/2022/poster/6556", "video": "https://iclr.cc/virtual/2022/poster/6556", "author_site": "Tianjian Zhang, Feng Yin, Zhi-Quan Luo", "tldr": "", "abstract": "The ability of discovering feature interactions in a black-box model is vital to explainable deep learning. We propose a principled, global interaction detection method by casting our target as a multi-arm bandits problem and solving it swiftly with the UCB algorithm. This adaptive method is free of ad-hoc assumptions and among the cutting-edge methods with outstanding detection accuracy and stability. Based on the detection outcome, a lightweight and interpretable deep learning model (called ParaACE) is further built using the alternating conditional expectation (ACE) method. Our proposed ParaACE improves the prediction performance by 26 % and reduces the model size by 100+ times as compared to its Teacher model over various datasets. Furthermore, we show the great potential of our method for scientific discovery through interpreting various real datasets in the economics and smart medicine sectors. The code is available at https://github.com/zhangtj1996/ParaACE. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/13b6c5f5d5d4a02b2b8445bbf706ffb338b8840c.zip", "author": "Tianjian Zhang;Feng Yin;Zhi-Quan Luo", "authorids": "~Tianjian_Zhang1;~Feng_Yin1;~Zhi-Quan_Luo1", "gender": "M;M;M", "homepage": "https://sse.cuhk.edu.cn/en/teacher/309;https://sse.cuhk.edu.cn/en/faculty/yinfeng;", "dblp": "190/6500;59/6917;", "google_scholar": ";4mW1N5oAAAAJ;dW3gcXoAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tianjian_Zhang1;~Feng_Yin1;~Zhi-Quan_Luo1", "aff": "The Chinese University of Hong Kong, Shenzhen;The Chinese University of Hong Kong;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;cuhk.edu.cn;cuhk.edu.cn", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2022fast,\ntitle={Fast Generic Interaction Detection for Model Interpretability and Compression},\nauthor={Tianjian Zhang and Feng Yin and Zhi-Quan Luo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fQTlgI2qZqE}\n}", "github": "", "project": "", "reviewers": "QLX7;vh8A;akne;bpD2", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "4;3;3;3", "empirical_novelty": "4;2;2;4", "wc_summary_paper": "104;23;138;123", "wc_summary_review": "54;46;155;81", "wc_main_review": "304;568;928;475", "wc_review": "462;637;1221;679", "wc_reply_reviewers": "0;46;148;0", "wc_reply_authors": "782;886;493;399", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 97.0, 44.390314258856066 ], "wc_summary_review_avg": [ 84.0, 42.99418565341132 ], "wc_main_review_avg": [ 568.75, 228.00370062786263 ], "wc_review_avg": [ 749.75, 283.9871255884675 ], "wc_reply_reviewers_avg": [ 48.5, 60.43798474469512 ], "wc_reply_authors_avg": [ 640.0, 200.23111646295138 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11957379091472502401&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=fQTlgI2qZqE", "email": "cuhk.edu.cn;cuhk.edu.cn;cuhk.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.cn", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Quadtree Attention for Vision Transformers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6822", "id": "fR-EnKWL_Zb", "poster": "", "openreview": "https://openreview.net/forum?id=fR-EnKWL_Zb", "slides": "https://iclr.cc/virtual/2022/poster/6822", "video": "https://iclr.cc/virtual/2022/poster/6822", "author_site": "Shitao Tang, Jiahui Zhang, Siyu Zhu, Ping Tan", "tldr": "", "abstract": "Transformers have been successful in many vision tasks, thanks to their capability of capturing long-range dependency. 
However, their quadratic computational complexity poses a major obstacle to applying them to vision tasks requiring dense predictions, such as object detection, feature matching, and stereo. We introduce QuadTree Attention, which reduces the computational complexity from quadratic to linear. Our quadtree transformer builds token pyramids and computes attention in a coarse-to-fine manner. At each level, the top K patches with the highest attention scores are selected, such that at the next level, attention is only evaluated within the relevant regions corresponding to these top K patches. We demonstrate that quadtree attention achieves state-of-the-art performance in various vision tasks, e.g., a 4.0% improvement in feature matching on ScanNet, an approximately 50% reduction in FLOPs in stereo matching, a 0.4-1.5% improvement in top-1 accuracy on ImageNet classification, a 1.2-1.8% improvement on COCO object detection, and a 0.7-2.4% improvement on semantic segmentation over previous state-of-the-art transformers. The code is available at https://github.com/Tangshitao/QuadtreeAttention.", "keywords": "Vision Transformer;Efficient Transformer;Feature matching;Stereo;image classification;detection;3D Vision", "primary_area": "", "supplementary_material": "", "author": "Shitao Tang;Jiahui Zhang;Siyu Zhu;Ping Tan", "authorids": "~Shitao_Tang1;~Jiahui_Zhang3;~Siyu_Zhu1;~Ping_Tan2", "gender": "M;M;M;M", "homepage": "https://tangshitao.github.io/;;https://sites.google.com/site/zhusiyucs;http://www.cs.sfu.ca/~pingtan/", "dblp": "203/8797;;81/8842-1;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;l8YDfhgAAAAJ;vNCnDiMAAAAJ;XhyKVFMAAAAJ", "orcid": ";;;0000-0002-4506-6973", "linkedin": ";;;", "or_profile": "~Shitao_Tang1;~Jiahui_Zhang3;~Siyu_Zhu1;~Ping_Tan2", "aff": "Simon Fraser University;Alibaba Group;Alibaba Group;Simon Fraser University", "aff_domain": "sfu.ca;alibaba-inc.com;alibaba-inc.com;sfu.ca", "position": "PhD student;Researcher;Director;Professor", "bibtex": "@inproceedings{\ntang2022quadtree,\ntitle={Quadtree Attention for Vision Transformers},\nauthor={Shitao Tang and Jiahui Zhang and Siyu Zhu and Ping Tan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fR-EnKWL_Zb}\n}", "github": "", "project": "", "reviewers": "sHEA;fyBu;jF1Z;PDon", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "52;42;98;89", "wc_summary_review": "51;25;84;44", "wc_main_review": "277;110;513;242", "wc_review": "380;177;695;375", "wc_reply_reviewers": "39;13;116;43", "wc_reply_authors": "1069;182;1131;264", "reply_reviewers": "1;1;2;1", "reply_authors": "2;1;3;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 70.25, 23.731571797923543 ], "wc_summary_review_avg": [ 51.0, 21.295539439046856 ], "wc_main_review_avg": [ 285.5, 145.36247796456965 ], "wc_review_avg": [ 406.75, 185.47017954377463 ], "wc_reply_reviewers_avg": [ 52.75, 38.290827883450106 ], "wc_reply_authors_avg": [ 661.5, 440.0036931663188 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, 
"corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 211, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8134043907351506595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=fR-EnKWL_Zb", "email": "sfu.ca;alibaba-inc.com;alibaba-inc.com;sfu.ca", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Simon Fraser University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.sfu.ca;https://www.alibaba.com", "aff_unique_abbr": "SFU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Canada;China" }, { "id": "fRb9LBWUo56", "title": "On the benefits of deep RL in accelerated MRI sampling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning approaches have shown great promise in accelerating magnetic resonance imaging (MRI), by reconstructing high quality images from highly undersampled data. While previous sampling methods relied on heuristics, recent work has improved the state-of-the-art (SotA) with deep reinforcement learning (RL) sampling policies, which promise the possibility of long term planning and adapting to the observations at test time. In this work, we perform a careful reproduction and comparison of SotA RL sampling methods. We find that i) a simple, easy-to-code, greedily trained fixed policy can match or outperform deep RL methods and ii) find and resolve subtle variations in the preprocessing which previously made results incomparable across different works.\nOur results cast doubt on the added value of current RL approaches over fixed masks in MRI sampling and highlight the importance of leveraging strong fixed baselines, standardized reporting as well as isolating the source of improvement in a given work via ablations. 
We conclude with recommendations for the training and evaluation of deep reconstruction and sampling systems for adaptive MRI based on our findings.\n", "keywords": "MRI reconstruction;MRI sampling;reinforcement learning;accelerated MRI;replication", "primary_area": "", "supplementary_material": "", "author": "Thomas Sanchez;Igor Krawczuk;Volkan Cevher", "authorids": "~Thomas_Sanchez1;~Igor_Krawczuk1;~Volkan_Cevher1", "gender": ";Unspecified;M", "homepage": ";https://krawczuk.eu;http://lions.epfl.ch", "dblp": ";244/7380.html;70/5301", "google_scholar": ";https://scholar.google.ch/citations?user=rLQIkUsAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";0000-0002-5281-8926;", "linkedin": ";https://linkedin.com/in/igorkrawczuk;", "or_profile": "~Thomas_Sanchez1;~Igor_Krawczuk1;~Volkan_Cevher1", "aff": ";Swiss Federal Institute of Technology Lausanne;Swiss Institute of Technology", "aff_domain": ";epfl.ch;epfl.ch", "position": ";PhD student;Associate Professor", "bibtex": "@misc{\nsanchez2022on,\ntitle={On the benefits of deep {RL} in accelerated {MRI} sampling},\nauthor={Thomas Sanchez and Igor Krawczuk and Volkan Cevher},\nyear={2022},\nurl={https://openreview.net/forum?id=fRb9LBWUo56}\n}", "github": "", "project": "", "reviewers": "zYDG;PeJ1;8cse;vo6W", "site": "https://openreview.net/forum?id=fRb9LBWUo56", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;2;2;3", "correctness": "3;4;3;3", "technical_novelty": "2;2;1;1", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "114;161;47;35", "wc_summary_review": "38;27;69;51", "wc_main_review": "132;221;387;104", "wc_review": "284;409;503;190", "wc_reply_reviewers": "0;0;301;0", "wc_reply_authors": "137;336;1823;343", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.25, 51.20729928437937 ], "wc_summary_review_avg": [ 46.25, 15.642490210960657 ], "wc_main_review_avg": [ 211.0, 110.41512577541177 ], "wc_review_avg": [ 346.5, 119.16060590648236 ], "wc_reply_reviewers_avg": [ 75.25, 130.336823269558 ], "wc_reply_authors_avg": [ 659.75, 676.6762058030414 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KUihF-t3Cf4J:scholar.google.com/&scioq=On+the+benefits+of+deep+RL+in+accelerated+MRI+sampling&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.ethz.ch", "aff_unique_abbr": "EPFL;ETH Zurich", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "fRnRsdc_nR7", "title": "Towards fast and effective single-step adversarial training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, Wong et al. 
(2020) showed adversarial training with single-step FGSM leads to a characteristic failure mode named catastrophic overfitting (CO), in which a model becomes suddenly vulnerable to multi-step attacks. Moreover, they showed adding a random perturbation prior to FGSM (RS-FGSM) seemed to be sufficient to prevent CO. However, Andriushchenko & Flammarion (2020) observed that RS-FGSM still leads to CO for larger perturbations and argue that the only contribution of the random step is to reduce the magnitude of the attacks. They suggest a regularizer (GradAlign) that avoids CO but is significantly more expensive than RS-FGSM. In this work, we methodically revisit the role of noise and clipping in single-step adversarial training. Contrary to previous intuitions, we find that not clipping the perturbation around the clean sample and using a stronger noise is highly effective in avoiding CO for large perturbation radii, despite leading to an increase in the magnitude of the attacks. Based on these observations, we propose a method called Noise-FGSM (N-FGSM), which attacks noise-augmented samples directly using a single-step. Empirical analyses on a large suite of experiments show that N-FGSM is able to match or surpass the performance of GradAlign while achieving a 3x speed-up.", "keywords": "single-step adversarial training;catastrophic overfitting;FGSM;efficient adversarial training;fast adversarial training", "primary_area": "", "supplementary_material": "", "author": "Pau de Jorge;Adel Bibi;Riccardo Volpi;Amartya Sanyal;Philip Torr;Gr\u00e9gory Rogez;Puneet K. Dokania", "authorids": "~Pau_de_Jorge1;~Adel_Bibi1;~Riccardo_Volpi1;~Amartya_Sanyal1;~Philip_Torr1;~Gr\u00e9gory_Rogez1;~Puneet_K._Dokania1", "gender": "M;M;M;M;;M;M", "homepage": "https://europe.naverlabs.com/people_user/Pau-De-Jorge/;http://adelbibi.com;https://ricvolpi.github.io;https://amartya18x.github.io;http://www.robots.ox.ac.uk/~tvg/;https://europe.naverlabs.com/people_user/gregory-rogez/;http://puneetkdokania.github.io/", "dblp": "267/5657;176/0964;194/2478;203/8807;;49/4408;150/4211", "google_scholar": "https://scholar.google.hk/citations?user=9voBw90AAAAJ;Q4j2laYAAAAJ;YkeS_SoAAAAJ;;;Atzr3VgAAAAJ;https://scholar.google.fr/citations?user=WsM7ybkAAAAJ", "orcid": ";0000-0002-6169-3918;;0000-0002-4190-0449;;;", "linkedin": "pau-de-jorge-aranda/;adel-bibi-ba3671ab/;;;;gr\u00e9gory-rogez/?originalSubdomain=fr;", "or_profile": "~Pau_de_Jorge1;~Adel_Bibi1;~Riccardo_Volpi1;~Amartya_Sanyal1;~Philip_Torr1;~Gregory_Rogez3;~Puneet_Dokania1", "aff": "University of Oxford;University of Oxford;Naver Labs Europe;Swiss Federal Institute of Technology;University of Oxford;Naver Labs Europe;University of Oxford", "aff_domain": "ox.ac.uk;ox.ac.uk;naverlabs.com;ethz.ch;ox.ac.uk;naverlabs.com;oxford.ac.uk", "position": "PhD student;Postdoc;Researcher;Postdoc;Full Professor;Group Lead - Senior Scientist;Senior Researcher", "bibtex": "@misc{\njorge2022towards,\ntitle={Towards fast and effective single-step adversarial training},\nauthor={Pau de Jorge and Adel Bibi and Riccardo Volpi and Amartya Sanyal and Philip Torr and Gr{\\'e}gory Rogez and Puneet K. 
Dokania},\nyear={2022},\nurl={https://openreview.net/forum?id=fRnRsdc_nR7}\n}", "github": "", "project": "", "reviewers": "Gq3c;iWaX;6yud;ZY47", "site": "https://openreview.net/forum?id=fRnRsdc_nR7", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;2;5", "correctness": "3;2;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "44;78;64;55", "wc_summary_review": "35;132;58;155", "wc_main_review": "282;203;358;127", "wc_review": "361;413;480;337", "wc_reply_reviewers": "0;278;278;0", "wc_reply_authors": "1062;1383;2016;564", "reply_reviewers": "0;3;1;0", "reply_authors": "2;4;4;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 60.25, 12.457427503300993 ], "wc_summary_review_avg": [ 95.0, 49.84475900232641 ], "wc_main_review_avg": [ 242.5, 86.31483070712703 ], "wc_review_avg": [ 397.75, 54.860618844486254 ], "wc_reply_reviewers_avg": [ 139.0, 139.0 ], "wc_reply_authors_avg": [ 1256.25, 526.8369648192883 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.15789473684210528, "corr_recommendation_correctness": 0.3244428422615251, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wN0kRZ0OJ8QJ:scholar.google.com/&scioq=Towards+fast+and+effective+single-step+adversarial+training&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;0;1;0", "aff_unique_norm": "University of Oxford;NAVER LABS;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://labs.naver.com;https://www.ethz.ch", "aff_unique_abbr": "Oxford;NLE;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0;1;0", "aff_country_unique": "United Kingdom;Unknown;Switzerland" }, { "id": "fSeD40P0XTI", "title": "ACCTS: an Adaptive Model Training Policy for Continuous Classification of Time Series", "track": "main", "status": "Reject", "tldr": "", "abstract": "More and more real-world applications require classifying time series at every time step. For example, critically ill patients should have their vital signs monitored and be diagnosed at all times to facilitate timely life-saving care. For this demand, we propose a new concept, Continuous Classification of Time Series (CCTS), which aims to achieve high-accuracy classification at every time step. Time series evolve dynamically, and their changing features introduce a multi-distribution form. Thus, unlike the existing one-shot classification, the key to CCTS is to model multiple distributions simultaneously. However, most models struggle to achieve this due to their independent and identically distributed premise. If a model learns a new distribution, it will likely forget old ones; and if a model repeatedly learns similar data, it will likely overfit. Thus, the two main problems are catastrophic forgetting and overfitting. In this work, we define CCTS as a continual learning task with an unclear distribution division. However, different divisions affect the two problems differently, and a fixed division rule may become invalid as the time series evolves. 
To overcome these two problems and finally achieve CCTS, we propose a novel adaptive model training policy, ACCTS. Its adaptability is reflected in two aspects: (1) an adaptive multi-distribution extraction policy: instead of relying on fixed rules and prior knowledge, ACCTS extracts data distributions adaptive to the time series evolution and the model change; (2) an adaptive importance-based replay policy: instead of reviewing all old distributions, ACCTS only replays the important samples, adaptive to the contribution of the data to the model. Experiments on four real-world datasets show that our method classifies more accurately than all baselines at every time step.", "keywords": "Continuous classification of time series;Deep learning;Model training", "primary_area": "", "supplementary_material": "", "author": "Chenxi Sun;Moxian Song;Derun Cai;Shenda Hong;Hongyan Li", "authorids": "~Chenxi_Sun2;~Moxian_Song1;~Derun_Cai1;~Shenda_Hong1;~Hongyan_Li2", "gender": "F;M;M;;F", "homepage": ";https://song-moxian.github.io/;;;", "dblp": "https://dblp.uni-trier.de/pid/166/6086.html;198/2984.html;292/2694.html;;", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;;", "orcid": "0000-0002-1762-0877;0000-0002-3847-6384;;;0000-0001-7174-2851", "linkedin": ";;;;", "or_profile": "~Chenxi_Sun2;~Moxian_Song1;~Derun_Cai1;~Shenda_Hong1;~Hongyan_Li2", "aff": "Peking University;Peking University;Peking University;;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;;pku.edu.cn", "position": "PhD student;PhD student;MS student;;Full Professor", "bibtex": "@misc{\nsun2022accts,\ntitle={{ACCTS}: an Adaptive Model Training Policy for Continuous Classification of Time Series},\nauthor={Chenxi Sun and Moxian Song and Derun Cai and Shenda Hong and Hongyan Li},\nyear={2022},\nurl={https://openreview.net/forum?id=fSeD40P0XTI}\n}", "github": "", "project": "", "reviewers": "Kj5t;kGRM;Fizc;AmKR", "site": "https://openreview.net/forum?id=fSeD40P0XTI", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "2;3;3;3", "correctness": "2;3;3;2", "technical_novelty": "3;2;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "121;63;81;103", "wc_summary_review": "32;33;30;62", "wc_main_review": "359;217;296;738", "wc_review": "512;313;407;903", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 92.0, 21.93171219946131 ], "wc_summary_review_avg": [ 39.25, 13.179055353097201 ], "wc_main_review_avg": [ 402.5, 200.1280839862312 ], "wc_review_avg": [ 533.75, 224.50765577146808 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Wr_ci9_HtOQJ:scholar.google.com/&scioq=ACCTS:+an+Adaptive+Model+Training+Policy+for+Continuous+Classification+of+Time+Series&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "fStt6fyzrK", "title": "Model-Based Robust Adaptive Semantic Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semantic image segmentation enjoys a wide range of applications such as autonomous vehicles and medical imaging while it is typically accomplished by deep neural networks (DNNs). Nevertheless, DNNs are known to be fragile to input perturbations that are adversarially crafted or occur due to natural variations, such as changes in weather or lighting conditions. This issue of lack of robustness prevents the application of learning-based semantic segmentation methods on safety-critical applications. To mitigate this challenge, in this paper, we propose model-based robust adaptive training algorithm (MRTAdapt), a new training algorithm to enhance the robustness of DNN-based semantic segmentation methods against natural variations that leverages model-based robust training algorithms and generative adversarial networks. Natural variation effects are minimized from both image and label sides. We provide extensive experimental results on both real-world and synthetic datasets demonstrating that model-based robust adaptive training algorithm outperforms multiple state-of-the-art models under various natural variations. ", "keywords": "Semantic Segmentation;Robustness;Natural Variation", "primary_area": "", "supplementary_material": "", "author": "Jun Wang;Yiannis Kantaros", "authorids": "~Jun_Wang27;~Yiannis_Kantaros1", "gender": ";M", "homepage": "https://sites.google.com/view/kantaros;https://scholar.google.com/citations?user=BgPTZ4MAAAAJ&hl=en", "dblp": "121/0062;", "google_scholar": "HuCTrEEAAAAJ;", "orcid": ";", "linkedin": ";junwang1997/", "or_profile": "~Yiannis_Kantaros1;~Jun_Wang26", "aff": "Washington University, Saint Louis;Washington University, St. 
Louis", "aff_domain": "wustl.edu;wustl.edu", "position": "Assistant Professor;PhD student", "bibtex": "@misc{\nwang2022modelbased,\ntitle={Model-Based Robust Adaptive Semantic Segmentation},\nauthor={Jun Wang and Yiannis Kantaros},\nyear={2022},\nurl={https://openreview.net/forum?id=fStt6fyzrK}\n}", "github": "", "project": "", "reviewers": "9pKw;ChGc;kE5M", "site": "https://openreview.net/forum?id=fStt6fyzrK", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;4", "correctness": "2;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "79;181;201", "wc_summary_review": "49;63;127", "wc_main_review": "382;867;422", "wc_review": "510;1111;750", "wc_reply_reviewers": "21;24;0", "wc_reply_authors": "915;1227;1120", "reply_reviewers": "1;1;0", "reply_authors": "3;4;4", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 153.66666666666666, 53.42492135906446 ], "wc_summary_review_avg": [ 79.66666666666667, 33.95421754199158 ], "wc_main_review_avg": [ 557.0, 219.81052446747555 ], "wc_review_avg": [ 790.3333333333334, 247.00922160023813 ], "wc_reply_reviewers_avg": [ 15.0, 10.677078252031311 ], "wc_reply_authors_avg": [ 1087.3333333333333, 129.4509774221715 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iyzAycuSzxQJ:scholar.google.com/&scioq=Model-Based+Robust+Adaptive+Semantic+Segmentation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Washington University in St. Louis", "aff_unique_dep": "", "aff_unique_url": "https://wustl.edu", "aff_unique_abbr": "WUSTL", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Saint Louis;St. Louis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "fTYeefgXReA", "title": "Equivariant Heterogeneous Graph Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many real-world datasets include multiple distinct types of entities and relations, and so they are naturally best represented by heterogeneous graphs. However, the most common forms of neural networks operating on graphs either assume that their input graphs are homogeneous, or they convert heterogeneous graphs into homogeneous ones, losing valuable information in the process. Any neural network that acts on graph data should be equivariant or invariant to permutations of nodes, but this is complicated when there are multiple distinct node and edge types. With this as motivation, we design graph neural networks that are composed of linear layers that are maximally expressive while being equivariant only to permutations of nodes within each type. 
We demonstrate their effectiveness on heterogeneous graph node classification and link prediction benchmarks.", "keywords": "Heterogeneous Graphs;Graph Neural Networks;GNN;Equivariance", "primary_area": "", "supplementary_material": "", "author": "Daniel Levy;Siamak Ravanbakhsh", "authorids": "~Daniel_Levy3;~Siamak_Ravanbakhsh1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Daniel_Levy3;~Siamak_Ravanbakhsh1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlevy2022equivariant,\ntitle={Equivariant Heterogeneous Graph Networks},\nauthor={Daniel Levy and Siamak Ravanbakhsh},\nyear={2022},\nurl={https://openreview.net/forum?id=fTYeefgXReA}\n}", "github": "", "project": "", "reviewers": "Nimp;cyQD;Q6T2;XzQS", "site": "https://openreview.net/forum?id=fTYeefgXReA", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;2;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;1;3;2", "wc_summary_paper": "57;37;144;44", "wc_summary_review": "40;222;56;23", "wc_main_review": "259;170;392;205", "wc_review": "356;429;592;272", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 70.5, 43.03777410601064 ], "wc_summary_review_avg": [ 85.25, 79.8103220141355 ], "wc_main_review_avg": [ 256.5, 84.41119593987518 ], "wc_review_avg": [ 412.25, 117.71230819247408 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dD4vYruIxkMJ:scholar.google.com/&scioq=Equivariant+Heterogeneous+Graph+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "fUhxuop_Q1r", "title": "Disentangling Generalization in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generalization in Reinforcement Learning (RL) is usually measured according to concepts from supervised learning. Unlike a supervised learning model however, an RL agent must generalize across states, actions and observations from limited reward-based feedback. We propose to measure an RL agent's capacity to generalize by evaluating it in a contextual decision process that combines a tabular environment with observations from a supervised learning dataset. The resulting environment, while simple, necessitates function approximation for state abstraction and provides ground-truth labels for optimal policies and value functions. The ground truth labels provided by our environment enable us to characterize generalization in RL across different axes: state-space, observation-space and action-space. Putting this method to work, we combine the MNIST dataset with various gridworld environments to rigorously evaluate generalization of DQN and QR-DQN in state, observation and action spaces for both online and offline learning. Contrary to previous reports about common regularization methods, we find that dropout does not improve observation generalization. 
We find, however, that dropout improves action generalization. Our results also corroborate recent findings that QR-DQN is able to generalize to new observations better than DQN in the offline setting. This success does not extend to state generalization, where DQN is able to generalize better than QR-DQN. These findings demonstrate the need for careful consideration of generalization in RL, and we hope that this line of research will continue to shed light on generalization claims in the literature.", "keywords": "Reinforcement learning;generalization", "primary_area": "", "supplementary_material": "", "author": "Alex Lewandowski;Dale Schuurmans;Jun Luo", "authorids": "~Alex_Lewandowski1;~Dale_Schuurmans1;~Jun_Luo1", "gender": ";;", "homepage": "https://lewandowskialex.com;;", "dblp": ";;42/2501", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Alex_Lewandowski1;~Dale_Schuurmans1;~Jun_Luo1", "aff": "University of Alberta;;Huawei Technologies Ltd.", "aff_domain": "ualberta.ca;;huawei.com", "position": "PhD student;;Researcher", "bibtex": "@misc{\nlewandowski2022disentangling,\ntitle={Disentangling Generalization in Reinforcement Learning},\nauthor={Alex Lewandowski and Dale Schuurmans and Jun Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=fUhxuop_Q1r}\n}", "github": "", "project": "", "reviewers": "1S93;74nY;iELT;NqjA", "site": "https://openreview.net/forum?id=fUhxuop_Q1r", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "108;47;73;115", "wc_summary_review": "65;92;167;65", "wc_main_review": "385;411;318;284", "wc_review": "558;550;558;464", "wc_reply_reviewers": "506;440;528;0", "wc_reply_authors": "1706;1418;1196;972", "reply_reviewers": "2;1;2;0", "reply_authors": "4;3;3;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.75, 27.453369556395078 ], "wc_summary_review_avg": [ 97.25, 41.75149697915034 ], "wc_main_review_avg": [ 349.5, 50.806003582253936 ], "wc_review_avg": [ 532.5, 39.68311983702894 ], "wc_reply_reviewers_avg": [ 368.5, 215.20397301165238 ], "wc_reply_authors_avg": [ 1323.0, 271.5897641664722 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_mlNOlS8lcwJ:scholar.google.com/&scioq=Disentangling+Generalization+in+Reinforcement+Learning&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Alberta;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.ualberta.ca;https://www.huawei.com", "aff_unique_abbr": "UAlberta;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Canada;China" }, { "title": "Efficient Self-supervised Vision Transformers for Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6312", "id": "fVu3o-YUGQK", "poster": "", "openreview": "https://openreview.net/forum?id=fVu3o-YUGQK", "slides": 
"https://iclr.cc/virtual/2022/poster/6312", "video": "https://iclr.cc/virtual/2022/poster/6312", "author_site": "Chunyuan Li, Jianwei Yang, Pengchuan Zhang, Mei Gao, Bin Xiao, Xiyang Dai, Lu Yuan, Jianfeng Gao", "tldr": "", "abstract": "This paper investigates two techniques for developing efficient self-supervised vision transformers (EsViT) for visual representation learning. First, we show through a comprehensive empirical study that multi-stage architectures with sparse self-attentions can significantly reduce modeling complexity but with a cost of losing the ability to capture fine-grained correspondences between image regions. Second, we propose a new pre-training task, non-contrastive region-matching, which allows the model to capture fine-grained region dependencies and as a result significantly improves the quality of the learned vision representations. Our results show that combining the two techniques, EsViT achieves 81.3% top-1 on the ImageNet linear probe evaluation, outperforming prior arts with around an order magnitude of higher throughput. When transferring to downstream linear classification tasks, EsViT outperforms its supervised counterpart on 17 out of 18 datasets. The code and pre-trained models are released at: https://github.com/microsoft/esvit", "keywords": "self-supervised learning;vision transformers;non-contrastive region-matching task", "primary_area": "", "supplementary_material": "/attachment/46af8eba5e24f1e74e43db2667abbb8ee5d70ee0.zip", "author": "Chunyuan Li;Jianwei Yang;Pengchuan Zhang;Mei Gao;Bin Xiao;Xiyang Dai;Lu Yuan;Jianfeng Gao", "authorids": "~Chunyuan_Li1;~Jianwei_Yang1;~Pengchuan_Zhang1;~Mei_Gao1;~Bin_Xiao2;~Xiyang_Dai2;~Lu_Yuan1;~Jianfeng_Gao1", "gender": ";M;M;M;M;M;M;F", "homepage": "http://chunyuan.li/;https://pzzhang.github.io/pzzhang/;;https://sites.google.com/site/xiyangdai/;https://www.microsoft.com/en-us/research/people/luyuan/;https://www.microsoft.com/en-us/research/people/jfgao/;https://jwyang.github.io/;", "dblp": "64/9590;;43/5134-1;176/5470;;92/5339;;", "google_scholar": "Zd7WmXUAAAAJ;3VZ_E64AAAAJ;https://scholar.google.com/citations?authuser=1;QC8RwcoAAAAJ;k9TsUVsAAAAJ;https://scholar.google.com/citations?hl=en;Cl9byD8AAAAJ;", "orcid": ";;0000-0001-6477-5911;;;;;", "linkedin": ";;;;;;;xuemei-mei-gao-b2612228/", "or_profile": "~Chunyuan_Li1;~Pengchuan_Zhang1;~Bin_Xiao2;~Xiyang_Dai2;~Lu_Yuan1;~Jianfeng_Gao1;~Jianwei_Yang2;~Xuemei_Gao1", "aff": "Microsoft Research;Microsoft Research;Microsoft;Microsoft;Microsoft;Microsoft Research;Microsoft;Microsoft", "aff_domain": "microsoft.com;research.microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com", "position": "Principal Researcher;Researcher;Principal Researcher;Researcher;Principal Research Manager;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nli2022efficient,\ntitle={Efficient Self-supervised Vision Transformers for Representation Learning},\nauthor={Chunyuan Li and Jianwei Yang and Pengchuan Zhang and Mei Gao and Bin Xiao and Xiyang Dai and Lu Yuan and Jianfeng Gao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fVu3o-YUGQK}\n}", "github": "", "project": "", "reviewers": "6vXm;nCrU;sJFu", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "5;4;4", "correctness": "4;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "58;79;76", "wc_summary_review": "51;259;21", "wc_main_review": "100;589;162", 
"wc_review": "209;927;259", "wc_reply_reviewers": "0;35;0", "wc_reply_authors": "294;519;133", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 71.0, 9.273618495495704 ], "wc_summary_review_avg": [ 110.33333333333333, 105.8342519645171 ], "wc_main_review_avg": [ 283.6666666666667, 217.38189641478633 ], "wc_review_avg": [ 465.0, 327.32043423328565 ], "wc_reply_reviewers_avg": [ 11.666666666666666, 16.49915822768611 ], "wc_reply_authors_avg": [ 315.3333333333333, 158.30420784749285 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": 0.0, "gs_citation": 245, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15469437604545198809&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=fVu3o-YUGQK", "email": "microsoft.com;research.microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "fWK3qhAtbbk", "title": "A Study of Aggregation of Long Time-series Input for LSTM Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Time series forecasting is the process of using time series data to create a prediction model. \nLong-short term memory (LSTM) models are the state-of-the-art for time-series forecasting.\nHowever, LSTMs can handle limited length input mostly since when the samples enter the model in sequence, \nthe oldest samples need to propagate through the LSTM cells self loop for each new sample and thus their data diminishes in this process.\n\nThis limits the length of the history that can be used in the training for each time epoch. The common way of handling this problem is by partitioning time records to uniform intervals, averaging each interval, and feeding the LSTM with rather short sequences, but each represents data from a longer history. \n \nIn this paper, we show that this common data aggregation method is far from optimal. We generalize the method of partitioning the data, and suggest an Exponential partitioning. We show that non-uniformly partitioning, and especially Exponential partitioning improves LSTM accuracy, significantly. Using other aggregation functions (such as median or maximum) are shown to further improve the accuracy. Overall, using 7 public datasets we show an improvement in accuracy by 6% to 27%. 
", "keywords": "LSTM;Data Aggregation;Time Series", "primary_area": "", "supplementary_material": "/attachment/c68a455231302ae47842ce68aa03dd7e5d411d6e.zip", "author": "Nitzan Farhi;Yuval Shavitt", "authorids": "~Nitzan_Farhi1;shavitt@eng.tau.ac.il", "gender": "M;", "homepage": "https://nitzanfarhi.github.io/;", "dblp": ";", "google_scholar": "yA8PEnMAAAAJ;", "orcid": "0000-0001-9751-3694;", "linkedin": "nitzan-farhi-4601a1b1/;", "or_profile": "~Nitzan_Farhi1;shavitt@eng.tau.ac.il", "aff": "Tel Aviv University;", "aff_domain": "tau.ac.il;", "position": "PhD student;", "bibtex": "@misc{\nfarhi2022a,\ntitle={A Study of Aggregation of Long Time-series Input for {LSTM} Neural Networks},\nauthor={Nitzan Farhi and Yuval Shavitt},\nyear={2022},\nurl={https://openreview.net/forum?id=fWK3qhAtbbk}\n}", "github": "", "project": "", "reviewers": "nfPs;tJW4;Fmnr;2ThG", "site": "https://openreview.net/forum?id=fWK3qhAtbbk", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;3;4", "correctness": "2;3;3;2", "technical_novelty": "2;2;1;1", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "53;132;116;61", "wc_summary_review": "40;15;65;49", "wc_main_review": "601;190;656;599", "wc_review": "694;337;837;709", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 90.5, 34.09178786746157 ], "wc_summary_review_avg": [ 42.25, 18.102140757380052 ], "wc_main_review_avg": [ 511.5, 187.02205752263555 ], "wc_review_avg": [ 644.25, 185.89160147785051 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4iCgmtL_xpwJ:scholar.google.com/&scioq=A+Study+of+Aggregation+of+Long+Time-series+Input+for+LSTM+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "id": "fWVQqtshDj", "title": "MOBA: Multi-teacher Model Based Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Although reinforcement learning (RL) shines at solving decision-making problems, it not only requires collecting a large amount of environment data but also is time-consuming for training and interaction, making it hard to apply to real applications. To reduce the time-cost and improve the data efficiency, model-based reinforcement learning uses a learned system model to predict system dynamics (i.e. states or rewards) and makes a plan accordingly, thus avoiding the frequent environment interaction. Model-based methods suffer from the model-bias problem, where certain spaces of model are inaccurate, resulting in policy learning variations and system performance degradation.\nWe propose a Multi-teacher MOdel BAsed Reinforcement Learning algorithm (MOBA), which leverages multi-teacher knowledge distillation theory to solve the model-bias problem. 
Specifically, different teachers search different spaces and learn various instances of a system. By distilling and transferring the teacher knowledge to a student, the student model is able to learn a generalized dynamics model that covers the state space. Moreover, to overcome the instability of multi-teacher knowledge transfer, we learn a set of student models and use an ensemble method to jointly predict system dynamics. We evaluate MOBA in high-dimensional control locomotion tasks. Results show that, compared with SOTA model-free methods, our method can improve the data efficiency and system performance by up to 75% and 10%, respectively. Moreover, our method outperforms other SOTA model-based approaches by up to 63.2% when exposed to high-range model-bias environments.", "keywords": "Model-based reinforcement learning;Multi-teacher knowledge distillation;Ensemble learning", "primary_area": "", "supplementary_material": "/attachment/9a96064b1edf151ea0140e8bb1f4bdc204470925.zip", "author": "Jikun Kang;Xi Chen;Ju Wang;Chengming Hu;Xue Liu;Gregory Dudek", "authorids": "~Jikun_Kang1;~Xi_Chen22;~Ju_Wang3;~Chengming_Hu1;~Xue_Liu1;~Gregory_Dudek1", "gender": "M;M;M;M;;M", "homepage": "https://luciferkonn.github.io;https://juwang7.github.io/homepage/;https://sites.google.com/view/chengminghu/home;http://www.cs.mcgill.ca/~xueliu/;http://www.cim.mcgill.ca/~dudek;https://sites.google.com/site/xichenmcgill/home", "dblp": "299/0233;;312/9310;l/XueLiu;;16/3283-9.html", "google_scholar": "Jikun%20Kang;;nldbrJ8AAAAJ;https://scholar.google.com.tw/citations?user=rfLIRakAAAAJ;;https://scholar.google.ca/citations?user=HMuDi00AAAAJ", "orcid": "0009-0001-1334-7092;;;;;0000-0001-5395-4295", "linkedin": "kang-jikun-91993814b/;;;;;", "or_profile": "~Jikun_Kang1;~Ju_Wang3;~Chengming_Hu1;~Xue_Liu1;~Gregory_Dudek1;~Xi_Chen32", "aff": "McGill University;Samsung;McGill University;McGill University;McGill University;Huawei Technologies Ltd.", "aff_domain": "mcgill.ca;samsung.com;mcgill.ca;mcgill.ca;mcgill.ca;huawei.com", "position": "PhD student;Researcher;PhD student;Full Professor;Professor;Principal Researcher", "bibtex": "@misc{\nkang2022moba,\ntitle={{MOBA}: Multi-teacher Model Based Reinforcement Learning},\nauthor={Jikun Kang and Xi Chen and Ju Wang and Chengming Hu and Xue Liu and Gregory Dudek},\nyear={2022},\nurl={https://openreview.net/forum?id=fWVQqtshDj}\n}", "github": "", "project": "", "reviewers": "1m9F;YsNH;5UvE;EPAE", "site": "https://openreview.net/forum?id=fWVQqtshDj", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;3", "correctness": "3;2;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "64;62;100;88", "wc_summary_review": "42;78;46;35", "wc_main_review": "555;1542;355;305", "wc_review": "661;1682;501;428", "wc_reply_reviewers": "39;390;0;0", "wc_reply_authors": "974;1886;554;560", "reply_reviewers": "1;3;0;0", "reply_authors": "3;5;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.5, 16.08570794214541 ], "wc_summary_review_avg": [ 50.25, 16.498105951896417 ], "wc_main_review_avg": [ 689.25, 501.1428813222832 ], "wc_review_avg": [ 818.0, 505.8987052760661 ], "wc_reply_reviewers_avg": [ 107.25, 164.02038745229203 ], "wc_reply_authors_avg": [ 993.5, 542.6829184708139 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.5, 
1.6583123951777 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Q6ONaS6j99IJ:scholar.google.com/&scioq=MOBA:+Multi-teacher+Model+Based+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "McGill University;Samsung;Huawei", "aff_unique_dep": ";Samsung;Huawei Technologies", "aff_unique_url": "https://www.mcgill.ca;https://www.samsung.com;https://www.huawei.com", "aff_unique_abbr": "McGill;Samsung;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;2", "aff_country_unique": "Canada;South Korea;China" }, { "title": "Gradient Importance Learning for Incomplete Observations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6859", "id": "fXHl76nO2AZ", "poster": "", "openreview": "https://openreview.net/forum?id=fXHl76nO2AZ", "slides": "https://iclr.cc/virtual/2022/poster/6859", "video": "https://iclr.cc/virtual/2022/poster/6859", "author_site": "Qitong Gao, Dong Wang, Joshua Amason, Siyang Yuan, Chenyang Tao, Ricardo Henao, Majda Hadziahmetovic, Lawrence Carin, Miroslav Pajic", "tldr": "", "abstract": "Though recent works have developed methods that can generate estimates (or imputations) of the missing entries in a dataset to facilitate downstream analysis, most depend on assumptions that may not align with real-world applications and could suffer from poor performance in subsequent tasks such as classification. This is particularly true if the data have large missingness rates or a small sample size. More importantly, the imputation error could be propagated into the prediction step that follows, which may constrain the capabilities of the prediction model. In this work, we introduce the gradient importance learning (GIL) method to train multilayer perceptrons (MLPs) and long short-term memories (LSTMs) to directly perform inference from inputs containing missing values without imputation. Specifically, we employ reinforcement learning (RL) to adjust the gradients used to train these models via back-propagation. This allows the model to exploit the underlying information behind missingness patterns. 
We test the approach on real-world time-series (i.e., MIMIC-III), tabular data obtained from an eye clinic, and a standard dataset (i.e., MNIST), where our imputation-free predictions outperform the traditional two-step imputation-based predictions using state-of-the-art imputation methods.", "keywords": "Missing Data;Reinforcement Learning;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/56abd6641e0629e6045cfeb1dd02edc87f195900.zip", "author": "Qitong Gao;Dong Wang;Joshua David Amason;Siyang Yuan;Chenyang Tao;Ricardo Henao;Majda Hadziahmetovic;Lawrence Carin;Miroslav Pajic", "authorids": "~Qitong_Gao1;~Dong_Wang2;~Joshua_David_Amason1;~Siyang_Yuan1;~Chenyang_Tao1;~Ricardo_Henao1;~Majda_Hadziahmetovic1;~Lawrence_Carin2;~Miroslav_Pajic2", "gender": "M;F;;F;M;M;F;M;M", "homepage": "http://qitonggao.com;https://jelly007.github.io/;;;http://cytao.wordpress.com;http://rhenaog.github.io;;https://people.ee.duke.edu/~lcarin/;http://people.duke.edu/~mp275/", "dblp": "238/5422;;;242/8930;170/6702;27/3207;;;74/7446.html", "google_scholar": "Flv4SrsAAAAJ;https://scholar.google.com.hk/citations?user=JB5mlIMAAAAJ;;;;p_mm4-YAAAAJ;FfYo7YYAAAAJ;yuxwFscAAAAJ;Fbn21-8AAAAJ", "orcid": ";;;;;0000-0003-4980-845X;;;", "linkedin": "qitong-gao;;;;;;majda-hadziahmetovic-8090a7112/;;", "or_profile": "~Qitong_Gao1;~Dong_Wang2;~Joshua_David_Amason1;~Siyang_Yuan1;~Chenyang_Tao1;~Ricardo_Henao1;~Majda_Hadziahmetovic1;~Lawrence_Carin2;~Miroslav_Pajic2", "aff": "Duke University;;;Duke University;Amazon;Duke University;Duke University, Ophthalmology;Duke University;Duke University", "aff_domain": "duke.edu;;;duke.edu;amazon.com;duke.edu;duke.edu;duke.edu;duke.edu", "position": "PhD student;;;PhD student;Researcher;Assistant Professor;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\ngao2022gradient,\ntitle={Gradient Importance Learning for Incomplete Observations},\nauthor={Qitong Gao and Dong Wang and Joshua David Amason and Siyang Yuan and Chenyang Tao and Ricardo Henao and Majda Hadziahmetovic and Lawrence Carin and Miroslav Pajic},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fXHl76nO2AZ}\n}", "github": "", "project": "", "reviewers": "3AZM;4Cb5;azSY;he3p", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;2;3", "correctness": "4;3;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;3;4;0", "wc_summary_paper": "118;59;71;94", "wc_summary_review": "53;44;64;1", "wc_main_review": "371;228;128;68", "wc_review": "542;331;263;163", "wc_reply_reviewers": "196;112;151;0", "wc_reply_authors": "1633;225;1684;35", "reply_reviewers": "2;1;1;0", "reply_authors": "4;1;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 85.5, 22.588713996153036 ], "wc_summary_review_avg": [ 40.5, 23.879907872519105 ], "wc_main_review_avg": [ 198.75, 114.70260459117743 ], "wc_review_avg": [ 324.75, 138.93591148439629 ], "wc_reply_reviewers_avg": [ 114.75, 72.61327357997297 ], "wc_reply_authors_avg": [ 894.25, 767.4084228753292 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, 
"gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3408438792226712835&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=fXHl76nO2AZ", "email": "duke.edu;;;duke.edu;amazon.com;duke.edu;duke.edu;duke.edu;duke.edu", "author_num": 9, "aff_unique_index": "0;0;1;0;0;0;0", "aff_unique_norm": "Duke University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.duke.edu;https://www.amazon.com", "aff_unique_abbr": "Duke;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "fY2-WyfrXhU", "title": "MemREIN: Rein the Domain Shift for Cross-Domain Few-Shot Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Few-shot learning aims to enable models generalize to new categories (query instances) with only limited labeled samples (support instances) from each category. Metric-based mechanism is a promising direction which compares feature embeddings via different metrics. However, it always fail to generalize to unseen domains due to the considerable domain gap challenge. In this paper, we propose a novel framework, MemREIN, which considers Memorized, Restitution, and Instance Normalization for cross-domain few-shot learning. Specifically, an instance normalization algorithm is explored to alleviate feature dissimilarity, which provides the initial model generalization ability. However, naively normalizing the feature would lose fine-grained discriminative knowledge between different classes. To this end, a memorized module is further proposed to separate the most refined knowledge and remember it. Then, a restitution module is utilized to restitute the discrimination ability from the learned knowledge. A novel reverse contrastive learning strategy is proposed to stabilize the distillation process. 
Extensive experiments on five popular benchmark datasets demonstrate that MemREIN addresses the domain shift challenge well, and significantly improves performance by up to $16.37\\%$ compared with state-of-the-art baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Xu;Lichen Wang;Yizhou Wang;Can Qin;Yulun Zhang;Yun Fu", "authorids": "~Yi_Xu9;~Lichen_Wang1;~Yizhou_Wang3;~Can_Qin1;~Yulun_Zhang1;~Yun_Fu1", "gender": "M;M;M;M;M;M", "homepage": "https://sites.google.com/view/homepage-of-yi-xu;https://sites.google.com/site/lichenwang123/;https://wyzjack.github.io/;http://canqin.tech;http://yulunzhang.com/;http://www1.ece.neu.edu/~yunfu/", "dblp": "14/5580-5;05/5102;71/3387-6;214/2488;166/2763-1.html;00/5815-1", "google_scholar": "https://scholar.google.com.hk/citations?user=12bRAdsAAAAJ;cE25iX4AAAAJ;H4kqV1MAAAAJ;QCik-YcAAAAJ;ORmLjWoAAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": "0000-0001-5857-4179;;0000-0003-1601-9649;;0000-0002-2288-5079;0000-0002-5098-2853", "linkedin": "yi-xu-884755185/;lichenabc/;yizhou-wang-786603155/;;yulun-zhang-1116b5b9/;furaymond/", "or_profile": "~Yi_Xu9;~Lichen_Wang1;~Yizhou_Wang3;~Can_Qin1;~Yulun_Zhang1;~Yun_Fu1", "aff": "Honda Research Institute;Zillow Group, Inc.;Mitsubishi Electric Research Labs;Northeastern University;Swiss Federal Institute of Technology;Northeastern University", "aff_domain": "honda-ri.de;zillow.com;merl.com;neu.edu;ethz.ch;northeastern.edu", "position": "Intern;Applied Scientist;Intern;PhD student;Postdoc;Full Professor", "bibtex": "@misc{\nxu2022memrein,\ntitle={Mem{REIN}: Rein the Domain Shift for Cross-Domain Few-Shot Learning},\nauthor={Yi Xu and Lichen Wang and Yizhou Wang and Can Qin and Yulun Zhang and Yun Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=fY2-WyfrXhU}\n}", "github": "", "project": "", "reviewers": "LXAQ;wiQ7;pZob;jpdN;kMHx", "site": "https://openreview.net/forum?id=fY2-WyfrXhU", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "5;4;3;4;4", "correctness": "4;3;3;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "80;91;83;68;94", "wc_summary_review": "94;23;41;54;96", "wc_main_review": "630;344;157;305;144", "wc_review": "804;458;281;427;334", "wc_reply_reviewers": "0;0;29;0;0", "wc_reply_authors": "1811;815;982;1317;436", "reply_reviewers": "0;0;1;0;0", "reply_authors": "3;2;2;3;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 83.2, 9.15204895091804 ], "wc_summary_review_avg": [ 61.6, 29.000689646972194 ], "wc_main_review_avg": [ 316.0, 175.70771183986204 ], "wc_review_avg": [ 460.8, 182.94414448131428 ], "wc_reply_reviewers_avg": [ 5.8, 11.6 ], "wc_reply_authors_avg": [ 1072.2, 465.78767695163435 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 2.2, 0.7483314773547882 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5270462766947298, "corr_recommendation_correctness": -0.5833333333333335, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1162032028840632615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;4;3", "aff_unique_norm": "Honda Research Institute;Zillow Group;Mitsubishi Electric Research Laboratories;Northeastern 
University;Swiss Federal Institute of Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.honda-ri.com;https://www.zillow.com;https://www.merl.com;https://www.northeastern.edu;https://www.ethz.ch", "aff_unique_abbr": "HRI;Zillow;MERL;NEU;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;2;1", "aff_country_unique": "Japan;United States;Switzerland" }, { "id": "fYor2QIp_3", "title": "An Effective GCN-based Hierarchical Multi-label classification for Protein Function Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose an effective method to improve Protein Function Prediction (PFP) utilizing hierarchical features of Gene Ontology (GO) terms. Our method consists of a language model for encoding the protein sequence and a Graph Convolutional Network (GCN) for representing GO terms. To reflect the hierarchical structure of GO in the GCN, we employ node (GO term)-wise representations containing the whole hierarchical information. Our algorithm remains effective on a large-scale graph, expanding the GO graph compared to previous models. Experimental results show that our method outperforms state-of-the-art PFP approaches.", "keywords": "Bioinformatics;Protein function prediction;Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Kyudam Choi;Yurim Lee;Cheongwon Kim", "authorids": "~Kyudam_Choi1;~Yurim_Lee1;~Cheongwon_Kim1", "gender": "F;F;M", "homepage": ";;http://home.sejong.ac.kr/~wikim/", "dblp": "287/4293.html;284/0805-1.html;", "google_scholar": "YR-xog0AAAAJ;pPYXTLoAAAAJ;", "orcid": ";0000-0002-5012-7750;", "linkedin": ";;", "or_profile": "~Kyudam_Choi1;~Yurim_Lee1;~Cheongwon_Kim1", "aff": ";Sejong University;Sejong University", "aff_domain": ";sejong.ac.kr;sejong.ac.kr", "position": ";MS student;Full Professor", "bibtex": "@misc{\nchoi2022an,\ntitle={An Effective {GCN}-based Hierarchical Multi-label classification for Protein Function Prediction},\nauthor={Kyudam Choi and Yurim Lee and Cheongwon Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=fYor2QIp_3}\n}", "github": "", "project": "", "reviewers": "k9w6;evqx;mvKR;BnVc", "site": "https://openreview.net/forum?id=fYor2QIp_3", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;5;5;4", "correctness": "2;4;2;2", "technical_novelty": "2;2;3;1", "empirical_novelty": "2;2;3;1", "wc_summary_paper": "162;56;39;61", "wc_summary_review": "202;20;24;25", "wc_main_review": "814;213;522;84", "wc_review": "1178;289;585;170", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 79.5, 48.32442446630896 ], "wc_summary_review_avg": [ 67.75, 77.53184829474917 ], "wc_main_review_avg": [ 408.25, 283.2105003349982 ], "wc_review_avg": [ 555.5, 389.8746593457954 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 12, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=17919921131627718807&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Sejong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sejong.ac.kr", "aff_unique_abbr": "Sejong", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "faMcf0MDk0f", "title": "BoolNet: Streamlining Binary Neural Networks Using Binary Feature Maps", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent works on Binary Neural Networks (BNNs) have made promising progress in narrowing the accuracy gap of BNNs to their 32-bit counterparts, often based on specialized model designs using additional 32-bit components. Furthermore, most previous BNNs use 32-bit values for feature maps and residual shortcuts, which helps to maintain the accuracy, but is not friendly to hardware accelerators with limited memory, energy, and computing resources. Thus, we raise the following question: How can accuracy and energy consumption be balanced in a BNN design? We extensively study this fundamental problem in this work and propose BoolNet: an architecture without most commonly used 32-bit components that uses 1-bit values to store feature maps. Experimental results on ImageNet demonstrate that BoolNet can achieve 63.0% Top-1 accuracy coupled with an energy reduction of 2.95x compared to recent state-of-the-art BNN architectures. Code and trained models are available at: (URL in the final version).", "keywords": "Binary Neural Networks;Hardware-Friendly Neural Architecture Design", "primary_area": "", "supplementary_material": "/attachment/907f175ac899517f0ccbaa458b46eaae99342b8f.zip", "author": "Nianhui Guo;Joseph Bethge;Haojin Yang;Kai Zhong;Xuefei Ning;Christoph Meinel;Yu Wang", "authorids": "~Nianhui_Guo1;~Joseph_Bethge1;~Haojin_Yang1;~Kai_Zhong2;~Xuefei_Ning1;~Christoph_Meinel1;~Yu_Wang3", "gender": ";M;M;Not Specified;;M;M", "homepage": "https://hpi.de/meinel/lehrstuhl/team-fotos/current-phd-students/joseph-bethge.html;https://hpi.de/meinel/lehrstuhl/team-fotos/senior-researcher/haojin-yang.html;https://nicsefc.ee.tsinghua.edu.cn/https://nicsefc.ee.tsinghua.edu.cn/;https://nics-effalg.com/ningxuefei/;;https://nicsefc.ee.tsinghua.edu.cn;", "dblp": ";94/10762;;202/9525;m/CMeinel;w/YuWang2.html;272/0605", "google_scholar": "https://scholar.google.de/citations?user=kWwB7HkAAAAJ;https://scholar.google.de/citations?user=-338Jh0AAAAJ;;oVslpJsAAAAJ;;https://scholar.google.com.hk/citations?user=j8JGVvoAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-6133-9994;;;;;0000-0001-6108-5157;", "linkedin": ";;;;;;", "or_profile": "~Joseph_Bethge1;~Haojin_Yang1;~Kai_Zhong2;~Xuefei_Ning1;~Christoph_Meinel1;~Yu_Wang3;~Guo_Nian_Hui1", "aff": "University of Potsdam;Hasso Plattner Institute;;Huawei Technologies Ltd.;;Tsinghua University;Hasso Plattner Institute", "aff_domain": "uni-potsdam.de;hpi.de;;huawei.com;;tsinghua.edu.cn;hpi.de", "position": "PhD student;Associate Professor;;Postdoc;;Full Professor;PhD student", "bibtex": "@misc{\nguo2022boolnet,\ntitle={BoolNet: Streamlining Binary Neural Networks Using Binary Feature Maps},\nauthor={Nianhui Guo and Joseph Bethge and Haojin Yang and Kai Zhong and Xuefei Ning and Christoph Meinel and Yu Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=faMcf0MDk0f}\n}", "github": "", "project": "", "reviewers": "pRtS;MNqE;eCWL;nRTL", "site": 
"https://openreview.net/forum?id=faMcf0MDk0f", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;2;4;5", "correctness": "3;3;2;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "92;73;91;25", "wc_summary_review": "41;45;99;21", "wc_main_review": "198;402;311;107", "wc_review": "331;520;501;153", "wc_reply_reviewers": "28;0;0;0", "wc_reply_authors": "704;1507;1388;188", "reply_reviewers": "1;0;0;0", "reply_authors": "1;3;2;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 70.25, 27.19719654670312 ], "wc_summary_review_avg": [ 51.5, 28.892040426387332 ], "wc_main_review_avg": [ 254.5, 111.68818200687126 ], "wc_review_avg": [ 376.25, 148.42064377976536 ], "wc_reply_reviewers_avg": [ 7.0, 12.12435565298214 ], "wc_reply_authors_avg": [ 946.75, 534.6051697280901 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.7567450038061343, "corr_recommendation_correctness": -0.07053456158585983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZgxLSTk-leYJ:scholar.google.com/&scioq=BoolNet:+Streamlining+Binary+Neural+Networks+Using+Binary+Feature+Maps&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "University of Potsdam;Hasso Plattner Institute;Huawei;Tsinghua University", "aff_unique_dep": ";;Huawei Technologies;", "aff_unique_url": "https://www.uni-potsdam.de;https://www.hpi.de;https://www.huawei.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UP;HPI;Huawei;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "Germany;China" }, { "id": "famc03Gg231", "title": "Physical Gradients for Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Solving inverse problems, such as parameter estimation and optimal control, is a vital part of science. Many experiments repeatedly collect data and employ machine learning algorithms to quickly infer solutions to the associated inverse problems. We find that state-of-the-art training techniques are not well-suited to many problems that involve physical processes since the magnitude and direction of the gradients can vary strongly. We propose a novel hybrid training approach that combines higher-order optimization methods with machine learning techniques. We replace the gradient of the physical process by a new construct, referred to as the physical gradient. This also allows us to introduce domain knowledge into training by incorporating priors about the solution space into the gradients. 
We demonstrate the capabilities of our method on a variety of canonical physical systems, showing that physical gradients yield significant improvements on a wide range of optimization and learning problems.", "keywords": "deep learning;simulation;optimization;inverse problems;physics", "primary_area": "", "supplementary_material": "", "author": "Philipp Holl;Nils Thuerey;Vladlen Koltun", "authorids": "~Philipp_Holl1;~Nils_Thuerey1;~Vladlen_Koltun1", "gender": "M;M;M", "homepage": ";https://ge.in.tum.de;http://vladlen.info/", "dblp": "256/9374;42/478;66/5458.html", "google_scholar": "LilimmEAAAAJ;https://scholar.google.com.tw/citations?user=GEehwv8AAAAJ;kg4bCpgAAAAJ", "orcid": ";;0000-0003-0858-0970", "linkedin": ";;vladlenkoltun/", "or_profile": "~Philipp_Holl1;~Nils_Thuerey1;~Vladlen_Koltun1", "aff": "Technical University Munich;Technical University Munich;Apple", "aff_domain": "tum.de;tum.de;apple.com", "position": "PhD student;Associate Professor;Distinguished Scientist", "bibtex": "@misc{\nholl2022physical,\ntitle={Physical Gradients for Deep Learning},\nauthor={Philipp Holl and Nils Thuerey and Vladlen Koltun},\nyear={2022},\nurl={https://openreview.net/forum?id=famc03Gg231}\n}", "github": "", "project": "", "reviewers": "mue1;ML7L;FRkr;pjrC", "site": "https://openreview.net/forum?id=famc03Gg231", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "2;4;4;3", "correctness": "2;2;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "34;183;134;65", "wc_summary_review": "22;50;37;51", "wc_main_review": "547;774;554;235", "wc_review": "603;1007;725;351", "wc_reply_reviewers": "222;542;229;0", "wc_reply_authors": "733;1002;659;194", "reply_reviewers": "1;2;1;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 104.0, 58.22800013739094 ], "wc_summary_review_avg": [ 40.0, 11.76860229593982 ], "wc_main_review_avg": [ 527.5, 191.9641893687466 ], "wc_review_avg": [ 671.5, 236.02701116609515 ], "wc_reply_reviewers_avg": [ 248.25, 192.9875319806956 ], "wc_reply_authors_avg": [ 647.0, 291.0214768706942 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3015113445777637, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12975192105326127835&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Technical University of Munich;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "https://www.tum.de;https://www.apple.com", "aff_unique_abbr": "TUM;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Germany;United States" }, { "id": "ffS_Y258dZs", "title": "Meta-Referential Games to Learn Compositional Learning Behaviours", "track": "main", "status": "Reject", "tldr": "", "abstract": "Referring to compositional learning behaviours as the ability to learn to generalise compositionally from a limited set of stimuli, that are combinations of supportive stimulus components, to a larger set of novel stimuli, i.e. 
novel combinations of those same stimulus components, we acknowledge compositional learning behaviours as a valuable feat of intelligence that human beings often rely on, and assume their collaborative partners to use similarly. In order to build artificial agents able to collaborate with human beings, we propose a novel benchmark to investigate state-of-the-art artificial agents abilities to exhibit compositional learning behaviours. We provide baseline results on the single-agent tasks of learning compositional learning behaviours, using state-of-the-art RL agents, and show that our proposed benchmark is a compelling challenge that we hope will spur the research community towards developing more capable artificial agents.", "keywords": "language emergence;language grounding;compositionality;systematicity;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Kevin Yandoka Denamganai;Sondess Missaoui;James Alfred Walker", "authorids": "~Kevin_Yandoka_Denamganai1;~Sondess_Missaoui1;~James_Alfred_Walker1", "gender": "M;F;M", "homepage": "https://kevindenamganai.netlify.app/;https://digitalcreativity.ac.uk/people/dr-sondess-missaoui;", "dblp": "249/7680;143/1419.html;35/3889", "google_scholar": "PPdQb4QAAAAJ;K2yUNQIAAAAJ;https://scholar.google.co.uk/citations?user=Yl5OycsAAAAJ", "orcid": "0000-0002-8776-4331;;", "linkedin": ";sondess-missaoui-03583531/;", "or_profile": "~Kevin_Yandoka_Denamganai1;~Sondess_Missaoui1;~James_Alfred_Walker1", "aff": "University of York;University of York;University of York", "aff_domain": "york.ac.uk;york.ac.uk;york.ac.uk", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@misc{\ndenamganai2022metareferential,\ntitle={Meta-Referential Games to Learn Compositional Learning Behaviours},\nauthor={Kevin Yandoka Denamganai and Sondess Missaoui and James Alfred Walker},\nyear={2022},\nurl={https://openreview.net/forum?id=ffS_Y258dZs}\n}", "github": "", "project": "", "reviewers": "VGZD;3vEd;o5wV;RCbW", "site": "https://openreview.net/forum?id=ffS_Y258dZs", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;3;3;2", "correctness": "1;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "52;137;121;58", "wc_summary_review": "43;60;46;110", "wc_main_review": "1098;457;701;121", "wc_review": "1193;654;868;289", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 92.0, 37.489998666310996 ], "wc_summary_review_avg": [ 64.75, 26.901440481877547 ], "wc_main_review_avg": [ 594.25, 356.35752763201174 ], "wc_review_avg": [ 751.0, 328.5977784465379 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3952989524077152401&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of York", "aff_unique_dep": "", "aff_unique_url": "https://www.york.ac.uk", "aff_unique_abbr": "York", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "fgcIb5gd99r", "title": "Multi-scale fusion self attention mechanism", "track": "main", "status": "Reject", "tldr": "", "abstract": "Self attention is widely used in various tasks because it can directly calculate the dependency between words, regardless of distance. However, the existing self attention lacks the ability to extract phrase level information. This is because the self attention only considers the one-to-one relationship between words and ignores the one-to-many relationship between words and phrases. Consequently, we design a multi-scale fusion self attention model for phrase information to resolve the above issues. Based on the traditional attention mechanism, multi-scale fusion self attention extracts phrase information at different scales by setting convolution kernels at different levels, and calculates the corresponding attention matrix at different scales, so that the model can better extract phrase level information. Compared with the traditional self attention model, we also designed a unique attention matrix sparsity strategy to better select the information that the model needs to pay attention to, so that our model can be more effective. Experimental results show that our model is superior to the existing baseline model in relation extraction task and GLUE task.", "keywords": "Attention;multi-scale;phrase information;sparsity scheme", "primary_area": "", "supplementary_material": "", "author": "Qibin Li;Nianmin Yao;Jian Zhao;Yanan Zhang", "authorids": "~Qibin_Li1;~Nianmin_Yao1;jzhao@dlut.edu.cn;zhangyanan@catarc.ac.cn", "gender": "M;M;;", "homepage": ";;;", "dblp": "211/6344;18/1894;;", "google_scholar": "https://scholar.google.com.hk/citations?user=aO9KXvcAAAAJ;https://scholar.google.com.hk/citations?user=ztMJF3gAAAAJ;;", "orcid": "0000-0003-1547-5757;0000-0001-9705-6649;;", "linkedin": ";;;", "or_profile": "~Qibin_Li1;~Nianmin_Yao1;jzhao@dlut.edu.cn;zhangyanan@catarc.ac.cn", "aff": "Dalian University of Technology,;Dalian University of Technology;;", "aff_domain": "dlut.edu.cn;dlut.edu.cn;;", "position": "PhD student;Full Professor;;", "bibtex": "@misc{\nli2022multiscale,\ntitle={Multi-scale fusion self attention mechanism},\nauthor={Qibin Li and Nianmin Yao and Jian Zhao and Yanan Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=fgcIb5gd99r}\n}", "github": "", "project": "", "reviewers": "c4sd;iGQt;hsJZ;cTsu", "site": "https://openreview.net/forum?id=fgcIb5gd99r", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;3;4;5", "correctness": "3;3;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "43;71;58;36", "wc_summary_review": "12;29;38;26", "wc_main_review": "159;211;169;272", "wc_review": "214;311;265;334", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 52.0, 13.546217184144066 ], "wc_summary_review_avg": [ 26.25, 9.33742469849155 ], "wc_main_review_avg": [ 202.75, 44.488060195967186 ], "wc_review_avg": [ 281.0, 45.97281805589037 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 
], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0", "aff_unique_norm": "Dalian University of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.dlut.edu.cn", "aff_unique_abbr": "DUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Pretrained Language Model in Continual Learning: A Comparative Study", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6154", "id": "figzpGMrdD", "poster": "", "openreview": "https://openreview.net/forum?id=figzpGMrdD", "slides": "https://iclr.cc/virtual/2022/poster/6154", "video": "https://iclr.cc/virtual/2022/poster/6154", "author_site": "Tongtong Wu, Massimo Caccia, Zhuang Li, Yuan-Fang Li, Guilin Qi, Gholamreza Haffari", "tldr": "", "abstract": "Continual learning (CL) is a setting in which a model learns from a stream of incoming data while avoiding to forget previously learned knowledge. Pre-trained language models (PLMs) have been successfully employed in continual learning of different natural language problems. With the rapid development of many continual learning methods and PLMs, understanding and disentangling their interactions become essential for continued improvement of continual learning performance. In this paper, we thoroughly compare the continual learning performance over the combination of 5 PLMs and 4 CL approaches on 3 benchmarks in 2 typical incremental settings. Our extensive experimental analyses reveal interesting performance differences across PLMs and across CL methods. Furthermore, our representativeness probing analyses dissect PLMs\u2019 performance characteristics in a layer-wise and task-wise manner, uncovering the extent to which their inner layers suffer from forgetting, and the effect of different CL approaches on each layer. 
Finally, our observations and analyses open up a number of important research questions that will inform and guide the design of effective continual learning techniques.", "keywords": "Continual Learning;Pre-trained Language Model", "primary_area": "", "supplementary_material": "/attachment/8c5d1ea68a211565fb9255957340c8c65a5781d1.zip", "author": "Tongtong Wu;Massimo Caccia;Zhuang Li;Yuan-Fang Li;Guilin Qi;Gholamreza Haffari", "authorids": "~Tongtong_Wu1;~Massimo_Caccia1;~Zhuang_Li1;~Yuan-Fang_Li1;gqi@seu.edu.cn;~Gholamreza_Haffari1", "gender": "M;;M;M;;M", "homepage": "https://wutong8023.site/;;https://zhuang-li.github.io/;https://users.monash.edu.au/~yli/;;https://rezahaffari.github.io/HomePage/HomePage.html", "dblp": "21/7109;43/6338.html;31/2814;20/2537;;", "google_scholar": "u1Qp8lUAAAAJ;WaE4GicAAAAJ;https://scholar.google.com.au/citations?user=4uhU0NUAAAAJ;https://scholar.google.com.tw/citations?user=wufXO1kAAAAJ;;https://scholar.google.com.tw/citations?user=Perjx5EAAAAJ", "orcid": ";;0000-0002-9808-9992;;;", "linkedin": ";;zhuang-li-68b855b1/;;;gholamrezahaffari/?originalSubdomain=au", "or_profile": "~Tongtong_Wu1;~Massimo_Caccia1;~Zhuang_Li1;~Yuan-Fang_Li1;gqi@seu.edu.cn;~Gholamreza_Haffari1", "aff": "Southeast University;University of Montreal;Monash University;Monash University;;Monash University", "aff_domain": "seu.edu.cn;umontreal.ca;monash.edu;monash.edu;;monash.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;;Full Professor", "bibtex": "@inproceedings{\nwu2022pretrained,\ntitle={Pretrained Language Model in Continual Learning: A Comparative Study},\nauthor={Tongtong Wu and Massimo Caccia and Zhuang Li and Yuan-Fang Li and Guilin Qi and Gholamreza Haffari},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=figzpGMrdD}\n}", "github": "", "project": "", "reviewers": "P6Qd;7fZV;XW45;EuaL", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;5", "correctness": "4;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "116;67;100;82", "wc_summary_review": "40;7;55;141", "wc_main_review": "159;152;476;496", "wc_review": "315;226;631;719", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "399;413;1167;141", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.25, 18.45772196128222 ], "wc_summary_review_avg": [ 60.75, 49.47916228070156 ], "wc_main_review_avg": [ 320.75, 165.41973128983133 ], "wc_review_avg": [ 472.75, 207.03426648745855 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 530.0, 383.38622823466153 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.8006407690254357, "corr_recommendation_correctness": 0.0, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13794715165574321474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=figzpGMrdD", "email": "seu.edu.cn;umontreal.ca;monash.edu;monash.edu;;monash.edu", "author_num": 6, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Southeast University;University of Montreal;Monash University", 
"aff_unique_dep": ";;", "aff_unique_url": "https://www.seu.edu.cn/;https://wwwumontreal.ca;https://www.monash.edu", "aff_unique_abbr": "SEU;UM;Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;2", "aff_country_unique": "China;Canada;Australia" }, { "id": "fkjO_FKVzw", "title": "Coarformer: Transformer for large graph via graph coarsening", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although Transformer has been generalized to graph data, its advantages are mostly observed on small graphs, such as molecular graphs. In this paper, we identify the obstacles of applying Transformer to large graphs: (1) The vast number of distant nodes distract the necessary attention of each target node from its local neighborhood; (2) The quadratic computational complexity regarding the number of nodes makes the learning procedure costly. We get rid of these obstacles by exploiting the complementary natures of GNN and Transformer, and trade the fine-grained long-range information for the efficiency of Transformer. In particular, we present Coarformer, a two-view architecture that captures fine-grained local information using a GNN-based module on the original graph and coarse yet long-range information using a Transformer-based module on the coarse graph (with far fewer nodes). Meanwhile, we design a scheme to enable message passing across these two views to enhance each other. Finally, we conduct extensive experiments on real-world datasets, where Coarformer outperforms any single-view method that solely applies a GNN or Transformer. Besides, the coarse global view and the cross-view propagation scheme enable Coarformer to perform better than the combinations of different GNN-based and Transformer-based modules while consuming the least running time and GPU memory.", "keywords": "Graph Neural Networks;Transformer;Graph Coarsening", "primary_area": "", "supplementary_material": "", "author": "Weirui Kuang;Zhen WANG;Yaliang Li;Zhewei Wei;Bolin Ding", "authorids": "~Weirui_Kuang2;~Zhen_WANG2;~Yaliang_Li1;~Zhewei_Wei1;~Bolin_Ding3", "gender": "M;M;M;M;M", "homepage": "https://joneswong.github.io/;https://sites.google.com/site/yaliangli/;http://weizhewei.com;https://bolinding.github.io/;https://weiruikuang.com/", "dblp": "78/6727-36;https://dblp.org/pers/hd/l/Li:Yaliang;94/4260;46/3522.html;318/1583", "google_scholar": "e5CqTBMAAAAJ;CCPBcdYAAAAJ;https://scholar.google.com.hk/citations?user=qZ7dj4gAAAAJ;AjYkTi8AAAAJ;51p3plEAAAAJ", "orcid": "0000-0002-8140-8782;0000-0002-4204-6096;0000-0003-3620-5086;;", "linkedin": ";;;bolin-ding-50a0119/;", "or_profile": "~Zhen_WANG2;~Yaliang_Li1;~Zhewei_Wei1;~Bolin_Ding3;~weirui_kuang1", "aff": "Alibaba Group;Alibaba Group;Renmin University of China;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;ruc.edu.cn;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Staff Engineer;Full Professor;Senior Director;Researcher", "bibtex": "@misc{\nkuang2022coarformer,\ntitle={Coarformer: Transformer for large graph via graph coarsening},\nauthor={Weirui Kuang and Zhen WANG and Yaliang Li and Zhewei Wei and Bolin Ding},\nyear={2022},\nurl={https://openreview.net/forum?id=fkjO_FKVzw}\n}", "github": "", "project": "", "reviewers": "ErXY;1q2q;QYtX;ZeUY", "site": "https://openreview.net/forum?id=fkjO_FKVzw", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;5;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": 
"21;34;161;42", "wc_summary_review": "29;23;53;58", "wc_main_review": "145;154;374;410", "wc_review": "195;211;588;510", "wc_reply_reviewers": "0;0;557;24", "wc_reply_authors": "625;622;1044;954", "reply_reviewers": "0;0;2;1", "reply_authors": "1;1;3;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.5, 56.216100896451366 ], "wc_summary_review_avg": [ 40.75, 15.006248698458919 ], "wc_main_review_avg": [ 270.75, 121.95772833240213 ], "wc_review_avg": [ 376.0, 175.27549743190005 ], "wc_reply_reviewers_avg": [ 145.25, 237.92580250994217 ], "wc_reply_authors_avg": [ 811.25, 190.4302693901366 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.19611613513818402, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2043999168618336881&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Alibaba Group;Renmin University of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;http://www.ruc.edu.cn", "aff_unique_abbr": "Alibaba;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "fpU10jwpPvw", "title": "Folded Hamiltonian Monte Carlo for Bayesian Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative Adversarial Networks (GANs) can learn complex distributions over images, audio, and data that are difficult to model. We deploy a Bayesian formulation for unsupervised and semi-supervised GAN learning. We propose Folded Hamiltonian Monte Carlo (F-HMC) within this framework to marginalise the weights of the generators and discriminators. The resulting approach improves the performance by having suitable entropy in generated candidates for generator and discriminators' weights. Our proposed model efficiently approximates the high dimensional data due to its parallel composition, increases the accuracy of generated samples and generates interpretable and diverse candidate samples. We have presented the analytical formulation as well as the mathematical proof of the F-HMC. The performance of our model in terms of autocorrelation of generated samples on converging to a high dimensional multi-modal dataset exhibits the effectiveness of the proposed solution. Experimental results on high-dimensional synthetic multi-modal data and natural image benchmarks, including CIFAR-10, SVHN and ImageNet, show that F-HMC outperforms the state-of-the-art methods in terms of test error rates, runtimes per epoch, inception score and Frechet Inception Distance scores.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Narges Pourshahrokhi;Samaneh Kouchaki;Yunpeng Li;Payam M. 
Barnaghi", "authorids": "~Narges_Pourshahrokhi1;~Samaneh_Kouchaki2;~Yunpeng_Li1;~Payam_M._Barnaghi1", "gender": "F;;;M", "homepage": ";https://www.surrey.ac.uk/people/samaneh-kouchaki;https://www.kcl.ac.uk/people/yunpeng-li;https://www.imperial.ac.uk/people/p.barnaghi", "dblp": ";;;22/4255", "google_scholar": ";https://scholar.google.co.uk/citations?user=wBXhQ-IAAAAJ;JzyKdRUAAAAJ;D6R2cnwAAAAJ", "orcid": "0000-0003-1308-1666;;0000-0003-4798-541X;0000-0001-8591-9638", "linkedin": ";;;", "or_profile": "~Narges_Pourshahrokhi1;~Samaneh_Kouchaki2;~Yunpeng_Li1;~Payam_M._Barnaghi1", "aff": "University of Surrey;University of Surrey;University of Surrey;Imperial College London", "aff_domain": "surrey.ac.uk;surrey.ac.uk;surrey.ac.uk;imperial.ac.uk", "position": "PhD student;Lecturer;Lecturer;Full Professor", "bibtex": "@misc{\npourshahrokhi2022folded,\ntitle={Folded Hamiltonian Monte Carlo for Bayesian Generative Adversarial Networks},\nauthor={Narges Pourshahrokhi and Samaneh Kouchaki and Yunpeng Li and Payam M. Barnaghi},\nyear={2022},\nurl={https://openreview.net/forum?id=fpU10jwpPvw}\n}", "github": "", "project": "", "reviewers": "PuGd;jjmZ;Hh37", "site": "https://openreview.net/forum?id=fpU10jwpPvw", "pdf_size": 0, "recommendation": "1;3;6", "confidence": "4;3;3", "correctness": "1;2;3", "technical_novelty": "1;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "20;56;32", "wc_summary_review": "34;54;46", "wc_main_review": "158;526;275", "wc_review": "212;636;353", "wc_reply_reviewers": "0;415;193", "wc_reply_authors": "556;641;482", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 36.0, 14.966629547095765 ], "wc_summary_review_avg": [ 44.666666666666664, 8.219218670625303 ], "wc_main_review_avg": [ 319.6666666666667, 153.51945226003843 ], "wc_review_avg": [ 400.3333333333333, 176.30340010586548 ], "wc_reply_reviewers_avg": [ 202.66666666666666, 169.56087074820326 ], "wc_reply_authors_avg": [ 559.6666666666666, 64.9632374672185 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.9933992677987827, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4300378884013872120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Surrey;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.surrey.ac.uk;https://www.imperial.ac.uk", "aff_unique_abbr": "Surrey;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "fuYtttFI-By", "title": "Programmable 3D snapshot microscopy with Fourier convolutional networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "3D snapshot microscopy enables fast volumetric imaging by capturing a 3D volume in a single 2D camera image and performing computational reconstruction. Fast volumetric imaging has a variety of biological applications such as whole brain imaging of rapid neural activity in larval zebrafish. 
The optimal microscope design for this optical 3D-to-2D encoding is both sample- and task-dependent, with no general solution known. Deep learning based decoders can be combined with a differentiable simulation of an optical encoder for end-to-end optimization of both the deep learning decoder and optical encoder. This technique has been used to engineer local optical encoders for other problems such as depth estimation, 3D particle localization, and lensless photography. However, 3D snapshot microscopy is known to require a highly non-local optical encoder which existing UNet-based decoders are not able to engineer. We show that a neural network architecture based on global kernel Fourier convolutional neural networks can efficiently decode information from multiple depths in a volume, globally encoded across a 3D snapshot image. We show in simulation that our proposed networks succeed in engineering and reconstructing optical encoders for 3D snapshot microscopy where the existing state-of-the-art UNet architecture fails. We also show that our networks outperform the state-of-the-art learned reconstruction algorithms for a computational photography dataset collected on a prototype lensless camera which also uses a highly non-local optical encoding.", "keywords": "computational microscopy;computational photography;computer vision;deep learning", "primary_area": "", "supplementary_material": "", "author": "Diptodip Deb;Zhenfei Jiao;Alex Bo-Yuan Chen;Misha Ahrens;Kaspar Podgorski;Srinivas C Turaga", "authorids": "~Diptodip_Deb1;~Zhenfei_Jiao2;~Alex_Bo-Yuan_Chen1;~Misha_Ahrens1;~Kaspar_Podgorski1;~Srinivas_C_Turaga1", "gender": ";M;M;M;;M", "homepage": "https://d2d.sh;;https://twitter.com/alexbchen;https://ahrenslab.org;http://www.janelia.org/lab/podgorski-lab;https://www.janelia.org/lab/turaga-lab", "dblp": "218/6098;;;88/6327;;91/747", "google_scholar": "gBcB_UUAAAAJ;https://scholar.google.com/citations?hl=en;ZHp1csYAAAAJ;nr9NPRwAAAAJ;AKKUvgEAAAAJ;V_NdI3sAAAAJ", "orcid": ";;;0000-0002-3457-4462;0000-0002-0374-2005;0000-0003-3247-6487", "linkedin": ";;;;;srini-turaga-4934923/", "or_profile": "~Diptodip_Deb1;~Zhenfei_Jiao2;~Alex_Bo-Yuan_Chen1;~Misha_Ahrens1;~Kaspar_Podgorski1;~Srinivas_C_Turaga1", "aff": "HHMI Janelia Research Campus;Huazhong University of Science and Technology;Harvard University;HHMI Janelia Research Campus;HHMI Janelia Research Campus;HHMI Janelia Research Campus", "aff_domain": "janelia.hhmi.org;hust.edu.cn;harvard.edu;janelia.hhmi.org;janelia.hhmi.org;janelia.hhmi.org", "position": "PhD student;PhD student;PhD student;Senior Group Leader;Principal Researcher;Associate Professor", "bibtex": "@misc{\ndeb2022programmable,\ntitle={Programmable 3D snapshot microscopy with Fourier convolutional networks},\nauthor={Diptodip Deb and Zhenfei Jiao and Alex Bo-Yuan Chen and Misha Ahrens and Kaspar Podgorski and Srinivas C Turaga},\nyear={2022},\nurl={https://openreview.net/forum?id=fuYtttFI-By}\n}", "github": "", "project": "", "reviewers": "3cwP;jREz;nest;cUSr", "site": "https://openreview.net/forum?id=fuYtttFI-By", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;3;3", "correctness": "4;3;3;4", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "100;98;109;85", "wc_summary_review": "104;40;86;128", "wc_main_review": "218;300;372;624", "wc_review": "422;438;567;837", "wc_reply_reviewers": "22;0;101;197", "wc_reply_authors": "244;354;750;461", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.0, 0.0 ], 
"confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 98.0, 8.573214099741124 ], "wc_summary_review_avg": [ 89.5, 32.22964473896664 ], "wc_main_review_avg": [ 378.5, 151.85107836298036 ], "wc_review_avg": [ 566.0, 166.2543232520586 ], "wc_reply_reviewers_avg": [ 80.0, 77.28842086625913 ], "wc_reply_authors_avg": [ 452.25, 188.25033200501932 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7816654038176553745&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0;0", "aff_unique_norm": "HHMI Janelia Research Campus;Huazhong University of Science and Technology;Harvard University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.janelia.org;http://www.hust.edu.cn;https://www.harvard.edu", "aff_unique_abbr": "HHMI Janelia;HUST;Harvard", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Janelia;", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "id": "fuaHYhuYIDm", "title": "MAGNEx: A Model Agnostic Global Neural Explainer", "track": "main", "status": "Reject", "tldr": "", "abstract": "Black-box decision models have been widely adopted both in industry and academia due to their excellent performance across many challenging tasks and domains. However, much criticism has been raised around modern AI systems, to a large extent due to their inability to produce explainable decisions that both their end-users and their developers can trust. The need for such decisions, i.e., decisions accompanied by a rationale for why they are made, has ignited much recent research. We propose MAGNEx, a global algorithm that leverages neural-network based explainers to produce rationales for any black-box decision model, neural or not. MAGNEx is model-agnostic, and thus easily generalizable across domains and applications. More importantly, MAGNEx is global, i.e., it learns to create rationales by optimizing for a number of instances at once, contrary to local methods that aim at explaining a single example. The global nature of MAGNEx has two advantages over local methods: i) it generalizes across instances hence producing more faithful explanations, ii) it is computationally more efficient during inference. 
Our experiments confirm that MAGNEx outperforms popular explainability algorithms both in explanation quality and in computational efficiency.", "keywords": "Explainability;Neural Explainer;Faithfulness;Global;Post-hoc", "primary_area": "", "supplementary_material": "", "author": "Nikolaos Manginas;Prodromos Malakasiotis;Eirini Spyropoulou;Ion Androutsopoulos;Georgios Paliouras", "authorids": "~Nikolaos_Manginas1;~Prodromos_Malakasiotis1;espyropoulou@iit.demokritos.gr;~Ion_Androutsopoulos1;~Georgios_Paliouras1", "gender": "M;M;;M;M", "homepage": "https://github.com/nickmagginas;http://pages.cs.aueb.gr/~rulller/;;http://www.aueb.gr/users/ion/;https://users.iit.demokritos.gr/~paliourg", "dblp": ";16/5137;;87/6723;55/2039", "google_scholar": ";https://scholar.google.gr/citations?user=36n9818AAAAJ;;https://scholar.google.com.tw/citations?user=4UJm5EQAAAAJ;-pec7wIAAAAJ", "orcid": ";0009-0008-0055-5598;;0009-0000-2969-0509;0000-0001-9629-2367", "linkedin": ";rulller/;;ion-androutsopoulos-477b6b3a/;georgios-paliouras-a203a79/", "or_profile": "~Nikolaos_Manginas1;~Prodromos_Malakasiotis1;espyropoulou@iit.demokritos.gr;~Ion_Androutsopoulos1;~Georgios_Paliouras1", "aff": "NCSR Demokritos;Athens University of Economics and Business;;Athens University of Economics and Business;NCSR \u201cDemokritos\u201d", "aff_domain": "iit.demokritos.gr;aueb.gr;;aueb.gr;demokritos.gr", "position": "Researcher;Postdoc;;Faculty;Researcher", "bibtex": "@misc{\nmanginas2022magnex,\ntitle={{MAGNE}x: A Model Agnostic Global Neural Explainer},\nauthor={Nikolaos Manginas and Prodromos Malakasiotis and Eirini Spyropoulou and Ion Androutsopoulos and Georgios Paliouras},\nyear={2022},\nurl={https://openreview.net/forum?id=fuaHYhuYIDm}\n}", "github": "", "project": "", "reviewers": "9rPK;Mi4V;eFGa;A419", "site": "https://openreview.net/forum?id=fuaHYhuYIDm", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "9;54;83;172", "wc_summary_review": "6;14;81;20", "wc_main_review": "265;186;344;170", "wc_review": "280;254;508;362", "wc_reply_reviewers": "0;0;0;11", "wc_reply_authors": "703;250;393;231", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 79.5, 59.55879448074818 ], "wc_summary_review_avg": [ 30.25, 29.71847068743612 ], "wc_main_review_avg": [ 241.25, 69.3735360205893 ], "wc_review_avg": [ 351.0, 99.02019995940222 ], "wc_reply_reviewers_avg": [ 2.75, 4.763139720814412 ], "wc_reply_authors_avg": [ 394.25, 188.93567026900982 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2Ur1YBjy9rIJ:scholar.google.com/&scioq=MAGNEx:+A+Model+Agnostic+Global+Neural+Explainer&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "National Centre for Scientific Research 'Demokritos';Athens University of Economics and Business;National Centre for Scientific Research \u201cDemokritos\u201d", "aff_unique_dep": ";;", "aff_unique_url":
"https://www.demokritos.gr;https://www.aueb.gr;https://www.demokritos.gr", "aff_unique_abbr": "NCSR Demokritos;AUEB;NCSR Demokritos", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Athens", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Greece" }, { "title": "AS-MLP: An Axial Shifted MLP Architecture for Vision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6277", "id": "fvLLcIYmXb", "poster": "", "openreview": "https://openreview.net/forum?id=fvLLcIYmXb", "slides": "https://iclr.cc/virtual/2022/poster/6277", "video": "https://iclr.cc/virtual/2022/poster/6277", "author_site": "Dongze Lian, Zehao Yu, Xing Sun, Shenghua Gao", "tldr": "", "abstract": "An Axial Shifted MLP architecture (AS-MLP) is proposed in this paper. Different from MLP-Mixer, where the global spatial feature is encoded for information flow through matrix transposition and one token-mixing MLP, we pay more attention to the local features interaction. By axially shifting channels of the feature map, AS-MLP is able to obtain the information flow from different axial directions, which captures the local dependencies. Such an operation enables us to utilize a pure MLP architecture to achieve the same local receptive field as CNN-like architecture. We can also design the receptive field size and dilation of blocks of AS-MLP, \\emph{etc}, in the same spirit of convolutional neural networks. With the proposed AS-MLP architecture, our model obtains 83.3\\% Top-1 accuracy with 88M parameters and 15.2 GFLOPs on the ImageNet-1K dataset. Such a simple yet effective architecture outperforms all MLP-based architectures and achieves competitive performance compared to the transformer-based architectures (\\emph{e.g.}, Swin Transformer) even with slightly lower FLOPs. In addition, AS-MLP is also the first MLP-based architecture to be applied to the downstream tasks (\\emph{e.g.}, object detection and semantic segmentation). The experimental results are also impressive. Our proposed AS-MLP obtains 51.5 mAP on the COCO validation set and 49.5 MS mIoU on the ADE20K dataset, which is competitive compared to the transformer-based architectures. Our AS-MLP establishes a strong baseline of MLP-based architecture. 
Code is available at \\url{https://github.com/svip-lab/AS-MLP}.", "keywords": "Architecture Design;MLP;Classification;Detection;Segmentation", "primary_area": "", "supplementary_material": "/attachment/d45563113193a9197eae5e7816797a07946560a6.zip", "author": "Dongze Lian;Zehao Yu;Xing Sun;Shenghua Gao", "authorids": "~Dongze_Lian1;~Zehao_Yu2;~Xing_Sun1;~Shenghua_Gao1", "gender": "M;M;M;M", "homepage": "https://dongzelian.com/;https://niujinshuchong.github.io;https://www.sunxing.org;", "dblp": "211/7697;168/2910;;63/7642", "google_scholar": "q-C8LqsAAAAJ;https://scholar.google.co.jp/citations?user=Z8MwnzsAAAAJ;IUtix9IAAAAJ;fe-1v0MAAAAJ", "orcid": ";;0000-0001-8132-9083;", "linkedin": ";;sunxings/;", "or_profile": "~Dongze_Lian1;~Zehao_Yu2;~Xing_Sun1;~Shenghua_Gao1", "aff": "National University of Singapore;University of Tuebingen;Tencent YouTu Lab;ShanghaiTech University", "aff_domain": "nus.edu.sg;uni-tuebingen.de;tencent.com;shanghaitech.edu.cn", "position": "Postdoc;PhD student;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nlian2022asmlp,\ntitle={{AS}-{MLP}: An Axial Shifted {MLP} Architecture for Vision},\nauthor={Dongze Lian and Zehao Yu and Xing Sun and Shenghua Gao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fvLLcIYmXb}\n}", "github": "", "project": "", "reviewers": "Wh4q;HTjG;uLzi", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;5;4", "correctness": "3;3;3", "technical_novelty": "3;1;3", "empirical_novelty": "4;2;3", "wc_summary_paper": "37;22;78", "wc_summary_review": "44;25;38", "wc_main_review": "273;255;912", "wc_review": "354;302;1028", "wc_reply_reviewers": "148;117;21", "wc_reply_authors": "1110;1383;1157", "reply_reviewers": "1;1;1", "reply_authors": "4;3;3", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 45.666666666666664, 23.66901396810231 ], "wc_summary_review_avg": [ 35.666666666666664, 7.93025150224688 ], "wc_main_review_avg": [ 480.0, 305.5585050362696 ], "wc_review_avg": [ 561.3333333333334, 330.66532257791346 ], "wc_reply_reviewers_avg": [ 95.33333333333333, 54.06374837993467 ], "wc_reply_authors_avg": [ 1216.6666666666667, 119.17027966550869 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 256, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1534689713476232636&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=fvLLcIYmXb", "email": "nus.edu.sg;uni-tuebingen.de;tencent.com;shanghaitech.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "National University of Singapore;University of Tuebingen;Tencent;ShanghaiTech University", "aff_unique_dep": ";;YouTu Lab;", "aff_unique_url": "https://www.nus.edu.sg;https://www.uni-tuebingen.de/;https://www.tencent.com;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "NUS;Uni T\u00fcbingen;Tencent;ShanghaiTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "Singapore;Germany;China" }, { "id": 
"fvybrRLv4m", "title": "Dictionary Learning Under Generative Coefficient Priors with Applications to Compression", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "There is a rich literature on recovering data from limited measurements under the assumption of sparsity in some basis, whether known (compressed sensing) or unknown (dictionary learning). In particular, classical dictionary learning assumes the given dataset is well-described by sparse combinations of an unknown basis set. However, this assumption is of limited validity on real-world data. Recent work spanning theory and computational science has sought to replace the canonical sparsity assumption with more complex data priors, demonstrating how to incorporate pretrained generative models into frameworks such as compressed sensing and phase retrieval. Typically, the dimensionality of the input space of the generative model is much smaller than that of the output space, paralleling the \u201clow description complexity,\u201d or compressibility, of sparse vectors. In this paper, we study dictionary learning under this kind of known generative prior on the coefficients, which may capture non-trivial low-dimensional structure in the coefficients. This is a distributional learning approach to compression, in which we learn a suitable dictionary given access to a small dataset of training instances and a specified generative model for the coefficients. Equivalently, it may be viewed as transfer learning for generative models, in which we learn a new linear layer (the dictionary) to fine-tune a pretrained generative model (the coefficient prior) on a new dataset. We give, to our knowledge, the first provable algorithm for recovering the unknown dictionary given a suitable initialization. 
Finally, we compare our approach to traditional dictionary learning algorithms on synthetic compression and denoising tasks, demonstrating empirically the advantages of incorporating finer-grained structure than sparsity.", "keywords": "Dictionary learning;generative priors;sparsity;alternating minimization;linear transformation;transfer learning;compression;algorithms", "primary_area": "", "supplementary_material": "/attachment/c7d923647bc90d5c489627ad801563b6962c23ed.zip", "author": "Hannah Lawrence;Ankur Moitra", "authorids": "~Hannah_Lawrence1;~Ankur_Moitra1", "gender": "F;M", "homepage": "https://hannahlawrence.github.io/;http://people.csail.mit.edu/moitra/", "dblp": "251/5474;04/952", "google_scholar": ";https://scholar.google.com.tw/citations?user=umFQktIAAAAJ", "orcid": ";", "linkedin": "hannah-lawrence-417b5a130/;", "or_profile": "~Hannah_Lawrence1;~Ankur_Moitra1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;", "bibtex": "@misc{\nlawrence2022dictionary,\ntitle={Dictionary Learning Under Generative Coefficient Priors with Applications to Compression},\nauthor={Hannah Lawrence and Ankur Moitra},\nyear={2022},\nurl={https://openreview.net/forum?id=fvybrRLv4m}\n}", "github": "", "project": "", "reviewers": "sQ71;dCoH;gYYQ;U1vb", "site": "https://openreview.net/forum?id=fvybrRLv4m", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "37;81;120;130", "wc_summary_review": "47;66;20;44", "wc_main_review": "621;331;488;537", "wc_review": "705;478;628;711", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.0, 36.65378561622251 ], "wc_summary_review_avg": [ 44.25, 16.345871038277526 ], "wc_main_review_avg": [ 494.25, 105.57313815549863 ], "wc_review_avg": [ 630.5, 93.9321563683066 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UAS2L0fovDsJ:scholar.google.com/&scioq=Dictionary+Learning+Under+Generative+Coefficient+Priors+with+Applications+to+Compression&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "fwJWhOxuzV9", "title": "Semi-supervised Offline Reinforcement Learning with Pre-trained Decision Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-training deep neural network models using large unlabelled datasets followed by fine-tuning them on small task-specific datasets has emerged as a dominant paradigm in natural language processing (NLP) and computer vision (CV). 
Despite the widespread success, such a paradigm has remained atypical in reinforcement learning (RL).\nIn this paper, we investigate how we can leverage large reward-free (i.e. task-agnostic) offline datasets of prior interactions to pre-train agents that can then be fine-tuned using a small reward-annotated dataset. To this end, we present Pre-trained Decision Transformer (PDT), a simple yet powerful algorithm for semi-supervised Offline RL. By masking reward tokens during pre-training, the transformer learns to autoregressively predict actions based on previous state and action context and effectively extracts behaviors present in the dataset. During fine-tuning, rewards are un-masked and the agent learns the set of skills that should be invoked for the desired behavior as per the reward function. We demonstrate the efficacy of this simple and flexible approach on tasks from the D4RL benchmark with limited reward annotations.", "keywords": "Multi-task RL;Decision Transformer;self-supervised RL;Pretraining", "primary_area": "", "supplementary_material": "", "author": "Catherine Cang;Kourosh Hakhamaneshi;Ryan Rudes;Igor Mordatch;Aravind Rajeswaran;Pieter Abbeel;Michael Laskin", "authorids": "~Catherine_Cang1;~Kourosh_Hakhamaneshi1;~Ryan_Rudes1;~Igor_Mordatch4;~Aravind_Rajeswaran1;~Pieter_Abbeel2;~Michael_Laskin1", "gender": "F;M;M;M;M;M;M", "homepage": "http://catherinecang.github.io;https://kouroshhakha.github.io/;https://ryanrudes.github.io/;http://aravindr93.github.io/;https://people.eecs.berkeley.edu/~pabbeel/;http://mishalaskin.com;", "dblp": ";;;164/5778;;;21/17", "google_scholar": ";;https://scholar.google.com/citations?hl=en;_EJrRVAAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;DOGDnwsAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;ryanrudes;;;mishalaskin;", "or_profile": "~Catherine_Cang1;~Kourosh_Hakhamaneshi1;~Ryan_Rudes1;~Aravind_Rajeswaran1;~Pieter_Abbeel2;~Michael_Laskin1;~Igor_Mordatch1", "aff": ";University of California, Berkeley;Half Hollow Hills High School East;Meta Facebook;Covariant;Google DeepMind;OpenAI", "aff_domain": ";berkeley.edu;hhh.k12.ny.us;meta.com;covariant.ai;deepmind.com;openai.com", "position": ";PhD student;High School Student;Research Scientist;Founder;Researcher;Research Scientist", "bibtex": "@misc{\ncang2022semisupervised,\ntitle={Semi-supervised Offline Reinforcement Learning with Pre-trained Decision Transformers},\nauthor={Catherine Cang and Kourosh Hakhamaneshi and Ryan Rudes and Igor Mordatch and Aravind Rajeswaran and Pieter Abbeel and Michael Laskin},\nyear={2022},\nurl={https://openreview.net/forum?id=fwJWhOxuzV9}\n}", "github": "", "project": "", "reviewers": "fBou;xxc6;z2Wf;qGs8", "site": "https://openreview.net/forum?id=fwJWhOxuzV9", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "256;34;45;41", "wc_summary_review": "131;34;68;35", "wc_main_review": "671;219;257;322", "wc_review": "1058;287;370;398", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 94.0, 93.61356739276631 ], "wc_summary_review_avg": [ 67.0, 39.40177660969109 ], "wc_main_review_avg": [ 367.25, 179.19594721979624
], "wc_review_avg": [ 528.25, 308.5631013261307 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3945289633525718714&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "University of California, Berkeley;Half Hollow Hills High School;Meta;Covariant;Google;OpenAI", "aff_unique_dep": ";;Meta Platforms, Inc.;;Google DeepMind;", "aff_unique_url": "https://www.berkeley.edu;https://www.hhhhs.org;https://meta.com;;https://deepmind.com;https://openai.com", "aff_unique_abbr": "UC Berkeley;;Meta;;DeepMind;OpenAI", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Berkeley;East;", "aff_country_unique_index": "0;0;0;2;0", "aff_country_unique": "United States;;United Kingdom" }, { "id": "fwsdscicqUm", "title": "Improving Fairness via Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, lots of algorithms have been proposed for learning a fair classifier from centralized data. However, how to privately train a fair classifier on decentralized data has not been fully studied yet. In this work, we first propose a new theoretical framework, with which we analyze the value of federated learning in improving fairness. Our analysis reveals that federated learning can strictly boost model fairness compared with all non-federated algorithms. We then theoretically and empirically show that the performance tradeoff of FedAvg-based fair learning algorithms is strictly worse than that of a fair classifier trained on centralized data. To resolve this, we propose FedFB, a private fair learning algorithm on decentralized data with a modified FedAvg protocol. 
Our extensive experimental results show that FedFB significantly outperforms existing approaches, sometimes achieving a tradeoff similar to that of a model trained on centralized data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Zeng;Hongxu Chen;Kangwook Lee", "authorids": "~Yuchen_Zeng1;~Hongxu_Chen4;~Kangwook_Lee1", "gender": "F;M;M", "homepage": "https://yzeng58.github.io;https://sites.google.com/view/hongxuchen;http://kangwooklee.com/", "dblp": ";;88/9826-1", "google_scholar": ";;sCEl8r-n5VEC", "orcid": "0000-0002-2766-0055;;", "linkedin": ";;", "or_profile": "~Yuchen_Zeng1;~Hongxu_Chen4;~Kangwook_Lee1", "aff": "MIT-IBM Watson AI Lab;University of Wisconsin, Madison;KRAFTON", "aff_domain": "ibm.com;wisc.edu;krafton.com", "position": "Intern;PhD student;Researcher", "bibtex": "@misc{\nzeng2022improving,\ntitle={Improving Fairness via Federated Learning},\nauthor={Yuchen Zeng and Hongxu Chen and Kangwook Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=fwsdscicqUm}\n}", "github": "", "project": "", "reviewers": "pTFp;V46Y;q1zx;6XNC", "site": "https://openreview.net/forum?id=fwsdscicqUm", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "3;3;2;4", "technical_novelty": "2;2;1;4", "empirical_novelty": "2;2;0;4", "wc_summary_paper": "18;53;83;102", "wc_summary_review": "52;37;114;35", "wc_main_review": "268;252;615;42", "wc_review": "338;342;812;179", "wc_reply_reviewers": "0;103;165;0", "wc_reply_authors": "688;683;1023;138", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;3;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 64.0, 31.788362650504666 ], "wc_summary_review_avg": [ 59.5, 32.14420632089086 ], "wc_main_review_avg": [ 294.25, 205.53877371435297 ], "wc_review_avg": [ 417.75, 236.9244341557029 ], "wc_reply_reviewers_avg": [ 67.0, 70.49468065038667 ], "wc_reply_authors_avg": [ 633.0, 317.2735412857492 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.2721655269759087, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6483361896357715557&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;University of Wisconsin;KRAFTON Inc.", "aff_unique_dep": "IBM Watson AI Lab;;", "aff_unique_url": "https://www.mitibmwatsonailab.org;https://www.wisc.edu;https://www.krafton.com", "aff_unique_abbr": "MIT-IBM AI Lab;UW;KRAFTON", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;South Korea" }, { "title": "Robbing the Fed: Directly Obtaining Private Data in Federated Learning with Modified Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7067", "id": "fwzUgo0FM9v", "poster": "", "openreview": "https://openreview.net/forum?id=fwzUgo0FM9v", "slides": "https://iclr.cc/virtual/2022/poster/7067", "video": "https://iclr.cc/virtual/2022/poster/7067", "author_site": "Liam H Fowl, Jonas Geiping, Wojciech Czaja, Micah Goldblum, Tom Goldstein", "tldr": "", "abstract": "Federated learning has quickly gained
popularity with its promises of increased user privacy and efficiency. Previous works have shown that federated gradient updates contain information that can be used to approximately recover user data in some situations. These previous attacks on user privacy have been limited in scope and do not scale to gradient updates aggregated over even a handful of data points, leaving some to conclude that data privacy is still intact for realistic training regimes. In this work, we introduce a new threat model based on minimal but malicious modifications of the shared model architecture which enable the server to directly obtain a verbatim copy of user data from gradient updates without solving difficult inverse problems. Even user data aggregated over large batches \u2013 where previous methods fail to extract meaningful content \u2013 can be reconstructed by these minimally modified models.\n", "keywords": "Privacy;Federated Learning;Gradient Inversion", "primary_area": "", "supplementary_material": "/attachment/b1757dfe33925977b460c665a138cc681a0ebe2e.zip", "author": "Liam H Fowl;Jonas Geiping;Wojciech Czaja;Micah Goldblum;Tom Goldstein", "authorids": "~Liam_H_Fowl1;~Jonas_Geiping1;~Wojciech_Czaja1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": ";M;;;M", "homepage": ";https://jonasgeiping.github.io/;;;https://www.cs.umd.edu/~tomg/", "dblp": "241/6940;190/7229;;241/7231;25/8184", "google_scholar": "IXv3ToAAAAAJ;https://scholar.google.de/citations?user=206vNCEAAAAJ;;pGDKzuUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Liam_H_Fowl1;~Jonas_Geiping1;~Wojciech_Czaja1;~Micah_Goldblum1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;University of Maryland, College Park;;New York University;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;;nyu.edu;umd.edu", "position": "PhD student;Postdoc;;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nfowl2022robbing,\ntitle={Robbing the Fed: Directly Obtaining Private Data in Federated Learning with Modified Models},\nauthor={Liam H Fowl and Jonas Geiping and Wojciech Czaja and Micah Goldblum and Tom Goldstein},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fwzUgo0FM9v}\n}", "github": "", "project": "", "reviewers": "Pq3S;AxhH;nVzn;t8bw", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;4;2;4", "correctness": "3;4;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "19;81;136;79", "wc_summary_review": "28;61;21;26", "wc_main_review": "146;100;225;330", "wc_review": "193;242;382;435", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "835;252;808;660", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 78.75, 41.390669238368204 ], "wc_summary_review_avg": [ 34.0, 15.795568998931314 ], "wc_main_review_avg": [ 200.25, 87.23638862309696 ], "wc_review_avg": [ 313.0, 98.85089782091005 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 638.75, 233.0164961971577 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 171, 
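The leakage principle behind this attack can be sketched in a few lines. The following is a minimal, self-contained illustration (not the paper's malicious imprint construction; the loss and layer sizes are arbitrary assumptions) of why shared gradient updates can contain an input verbatim: for a biased linear layer and a single contributing sample, the input is recoverable directly from the gradients.

import torch

torch.manual_seed(0)
x = torch.randn(1, 784)                      # one user's flattened "image"
layer = torch.nn.Linear(784, 10)
loss = layer(x).square().mean()              # any differentiable loss works here
loss.backward()

# For y = Wx + b: dL/dW[k] = (dL/dy[k]) * x and dL/db[k] = dL/dy[k],
# so dividing a weight-gradient row by the matching bias gradient returns x.
k = layer.bias.grad.abs().argmax()           # pick a row with a nonzero bias gradient
x_recovered = layer.weight.grad[k] / layer.bias.grad[k]
print(torch.allclose(x_recovered, x[0], atol=1e-5))   # True: verbatim recovery

The paper's contribution is to make this kind of direct read-out survive aggregation over large batches via minimal malicious architecture modifications; the snippet above only shows the single-sample core of the leakage.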
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=15885116748368204506&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=fwzUgo0FM9v", "email": "umd.edu;umd.edu;;nyu.edu;umd.edu", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Maryland;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu", "aff_unique_abbr": "UMD;NYU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "fyLvrx9M9YP", "title": "Towards Unsupervised Content Disentanglement in Sentence Representations via Syntactic Roles", "track": "main", "status": "Reject", "tldr": "", "abstract": "Linking neural representations to linguistic factors is crucial in order to build and analyze NLP models interpretable by humans. Among these factors, syntactic roles (e.g. subjects, direct objects,$\\dots$) and their realizations are essential markers since they can be understood as a decomposition of predicative structures and thus the meaning of sentences. Starting from a deep probabilistic generative model with attention, we measure the interaction between latent variables and realizations of syntactic roles, and show that it is possible to obtain, without supervision, representations of sentences where different syntactic roles correspond to clearly identified different latent variables. The probabilistic model we propose is an Attention-Driven Variational Autoencoder (ADVAE). Drawing inspiration from Transformer-based machine translation models, ADVAEs enable the analysis of the interactions between latent variables and input tokens through attention. We also develop an evaluation protocol to measure disentanglement with regard to the realizations of syntactic roles. This protocol is based on attention maxima for the encoder and on disturbing individual latent variables for the decoder. Our experiments on raw English text from the SNLI dataset show that $\\textit{i)}$ disentanglement of syntactic roles can be induced without supervision, $\\textit{ii)}$ ADVAE separates more syntactic roles than classical sequence VAEs, $\\textit{iii)}$ realizations of syntactic roles can be separately modified in sentences by mere intervention on the associated latent variables. Our work constitutes a first step towards unsupervised controllable content generation. 
The code for our work is publicly available.", "keywords": "NLP;disentanglement;unsupervised learning;controllable generation.", "primary_area": "", "supplementary_material": "/attachment/15d43e3a5640dc86121f26eda33526d7c1e5dccd.zip", "author": "Ghazi Felhi;Joseph Le Roux;Djam\u00e9 Seddah", "authorids": "~Ghazi_Felhi1;~Joseph_Le_Roux1;~Djam\u00e9_Seddah1", "gender": "M;M;M", "homepage": "https://ghazi-f.netlify.app/;https://www.lipn.fr/~leroux;http://pauillac.inria.fr/~seddah", "dblp": "245/3413;25/5993;19/1467", "google_scholar": "ODzuOSQAAAAJ;WVUs6rEAAAAJ;P7EtARsAAAAJ", "orcid": "0000-0002-8657-4640;;", "linkedin": "ghazi-felhi-862775106/;;djam\u00e9-seddah-00a1735/", "or_profile": "~Ghazi_Felhi1;~Joseph_Le_Roux1;~Djam\u00e9_Seddah1", "aff": "University Paris 13;Universit\u00e9 Paris 13;Inria Paris", "aff_domain": "lipn.univ-paris13.fr;univ-paris13.fr;almanach.inria.fr", "position": "PhD student;Associate Professor;Researcher", "bibtex": "@misc{\nfelhi2022towards,\ntitle={Towards Unsupervised Content Disentanglement in Sentence Representations via Syntactic Roles},\nauthor={Ghazi Felhi and Joseph Le Roux and Djam{\\'e} Seddah},\nyear={2022},\nurl={https://openreview.net/forum?id=fyLvrx9M9YP}\n}", "github": "", "project": "", "reviewers": "WuPD;7uFL;wDkZ;9pDc", "site": "https://openreview.net/forum?id=fyLvrx9M9YP", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;3;3;3", "correctness": "2;3;3;4", "technical_novelty": "3;2;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "91;164;83;161", "wc_summary_review": "124;63;43;125", "wc_main_review": "607;196;287;353", "wc_review": "822;423;413;639", "wc_reply_reviewers": "0;0;0;337", "wc_reply_authors": "1200;345;583;1005", "reply_reviewers": "0;0;0;2", "reply_authors": "3;1;1;3", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 124.75, 37.87066806910065 ], "wc_summary_review_avg": [ 88.75, 36.44430682562093 ], "wc_main_review_avg": [ 360.75, 152.70948726257973 ], "wc_review_avg": [ 574.25, 169.15285247373157 ], "wc_reply_reviewers_avg": [ 84.25, 145.9252805376779 ], "wc_reply_authors_avg": [ 783.25, 337.2746469866954 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7294578833882823428&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "University Paris 13;Universit\u00e9 Paris 13;INRIA", "aff_unique_dep": ";;", "aff_unique_url": "https://www.univ-paris13.fr;https://www.univ-paris13.fr;https://www.inria.fr", "aff_unique_abbr": "UP13;UP13;Inria", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Structure-Aware Transformer Policy for Inhomogeneous Multi-Task Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6945", "id": "fy_XRVHqly", "poster": "", "openreview": "https://openreview.net/forum?id=fy_XRVHqly", "slides": "https://iclr.cc/virtual/2022/poster/6945", "video": "https://iclr.cc/virtual/2022/poster/6945", "author_site": "Sunghoon Hong, Deunsol Yoon, Kee-Eung 
Kim", "tldr": "", "abstract": "Modular Reinforcement Learning, where the agent is assumed to be morphologically structured as a graph, for example composed of limbs and joints, aims to learn a policy that is transferable to a structurally similar but different agent. Compared to traditional Multi-Task Reinforcement Learning, this promising approach allows us to cope with inhomogeneous tasks where the state and action space dimensions differ across tasks. Graph Neural Networks are a natural model for representing the pertinent policies, but a recent work has shown that their multi-hop message passing mechanism is not ideal for conveying important information to other modules and thus a transformer model without morphological information was proposed. In this work, we argue that the morphological information is still very useful and propose a transformer policy model that effectively encodes such information. Specifically, we encode the morphological information in terms of the traversal-based positional embedding and the graph-based relational embedding. We empirically show that the morphological information is crucial for modular reinforcement learning, substantially outperforming prior state-of-the-art methods on multi-task learning as well as transfer learning settings with different state and action space dimensions.", "keywords": "Multitask Reinforcement Learning;Modular Reinforcement Learning;Transfer Learning;Transformer;Structural Embedding", "primary_area": "", "supplementary_material": "", "author": "Sunghoon Hong;Deunsol Yoon;Kee-Eung Kim", "authorids": "~Sunghoon_Hong2;~Deunsol_Yoon1;~Kee-Eung_Kim2", "gender": "M;M;M", "homepage": "https://sunghoonhong.github.io;;http://ailab.kaist.ac.kr", "dblp": ";225/5388.html;35/6703", "google_scholar": "C5Vy-ZAAAAAJ;;https://scholar.google.com/citations?hl=ko", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sunghoon_Hong2;~Deunsol_Yoon1;~Kee-Eung_Kim2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "MS student;MS student;Full Professor", "bibtex": "@inproceedings{\nhong2022structureaware,\ntitle={Structure-Aware Transformer Policy for Inhomogeneous Multi-Task Reinforcement Learning},\nauthor={Sunghoon Hong and Deunsol Yoon and Kee-Eung Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=fy_XRVHqly}\n}", "github": "", "project": "", "reviewers": "4eRD;BRC4;bVEF;xDqR", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;2;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "49;191;77;109", "wc_summary_review": "43;78;72;39", "wc_main_review": "68;301;400;254", "wc_review": "160;570;549;402", "wc_reply_reviewers": "0;54;58;14", "wc_reply_authors": "206;383;346;441", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 106.5, 53.20479301717093 ], "wc_summary_review_avg": [ 58.0, 17.190113437671084 ], "wc_main_review_avg": [ 255.75, 120.52878286948723 ], "wc_review_avg": [ 420.25, 163.60375148510502 ], "wc_reply_reviewers_avg": [ 31.5, 25.034975534240093 ], 
"wc_reply_authors_avg": [ 344.0, 86.57078028988765 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3674901670883664739&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=fy_XRVHqly", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "g-xTi8MYSM", "title": "Improving Learning from Demonstrations by Learning from Experience", "track": "main", "status": "Withdraw", "tldr": "", "abstract": " How to make imitation learning more general when demonstrations are relative limited has been a persistent problem in reinforcement learning (RL). Poor demonstrations leads to narrow and biased date distribution, non-Markovian human expert demonstration makes it difficult for the agent to learn, and over-reliance on sub-optimal trajectories can make it hard for the agent to improve its performance. To solve these problems we propose a new algorithm named TD3fG that can smoothly transition from learning from experts to learning from experience. Our algorithm achieve good performance in mujoco environment with limited and sub-optimal demonstrations.\n We use behavior cloning to train network as a reference action generator and utilize it in terms of both loss function and exploration noise. This innovation can help agents extract a priori knowledge from demonstrations while reducing the detrimental effects of the poor Markovian properties of the demonstrations. It has better performance compared to the BC+ fine-tuning and DDPGfD approach, especially when the demonstrations are relatively limited. 
We call our method TD3fG meaning TD3 from a generator.", "keywords": "Behavior Cloning;Learning from demonstration", "primary_area": "", "supplementary_material": "", "author": "HAOFENG LIU;Yiwen Chen;Jiayi Tan;Marcelo H Ang Jr", "authorids": "~HAOFENG_LIU1;~Yiwen_Chen1;~Jiayi_Tan1;~Marcelo_H_Ang_Jr1", "gender": ";M;M;M", "homepage": "https://www.linkedin.com/in/haofeng-liu-aa48221b6/;https://github.com/yiwc;;http://guppy.mpe.nus.edu.sg/~mpeangh/", "dblp": ";;;", "google_scholar": ";;;dMogb2EAAAAJ", "orcid": ";;;0000-0001-8277-6408", "linkedin": ";;jiayitanJayee;marcelo-ang-41370b15", "or_profile": "~HAOFENG_LIU1;~Yiwen_Chen1;~Jiayi_Tan1;~Marcelo_H_Ang_Jr1", "aff": "National University of Singapore;National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "nus.edu.sg;u.nus.edu;nus.edu.sg;nus.edu.sg", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\nliu2022improving,\ntitle={Improving Learning from Demonstrations by Learning from Experience},\nauthor={HAOFENG LIU and Yiwen Chen and Jiayi Tan and Marcelo H Ang Jr},\nyear={2022},\nurl={https://openreview.net/forum?id=g-xTi8MYSM}\n}", "github": "", "project": "", "reviewers": "8mUY;mEmc;84BZ;CWhf", "site": "https://openreview.net/forum?id=g-xTi8MYSM", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "4;5;4;3", "correctness": "1;4;4;2", "technical_novelty": "2;2;1;2", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "68;10;47;59", "wc_summary_review": "111;2;33;80", "wc_main_review": "693;15;238;497", "wc_review": "872;27;318;636", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 1.299038105676658 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 46.0, 22.079402165819616 ], "wc_summary_review_avg": [ 56.5, 41.96724913548659 ], "wc_main_review_avg": [ 360.75, 256.6927881729442 ], "wc_review_avg": [ 463.25, 319.50381453121963 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.19245008972987523, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8584932004315282204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "id": "g1D7SfQKbg", "title": "Learning with Noisy Labels by Efficient Transition Matrix Estimation to Combat Label Miscorrection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent studies on learning with noisy labels have shown remarkable performance by exploiting a small clean dataset. In particular, model agnostic meta-learning-based label correction methods further improve performance by correcting noisy labels on the fly. However, there is no safeguard on the label miscorrection, resulting in unavoidable performance degradation. 
Moreover, every training step requires at least three back-propagations, significantly slowing down the training speed. To mitigate these issues, we propose a robust and efficient meta-learning method that learns a label transition matrix on the fly. Employing the transition matrix makes the classifier skeptical about all the corrected samples, which alleviates the miscorrection issue. We also introduce a two-head architecture to efficiently learn the label transition matrix every iteration within a single back-propagation, so that the matrix estimate closely follows the shifting distribution induced by label correction. Extensive experiments demonstrate that our approach shows the best performance in training efficiency while having comparable or better accuracy than existing methods.", "keywords": "Noisy Labels;Label Correction;Meta-Learning", "primary_area": "", "supplementary_material": "", "author": "Seong Min Kye;Kwanghee Choi;Joonyoung Yi;Buru Chang", "authorids": "~Seong_Min_Kye1;~Kwanghee_Choi1;~Joonyoung_Yi1;~Buru_Chang1", "gender": "M;M;M;Not Specified", "homepage": ";;;https://sites.google.com/view/buru-chang", "dblp": ";84/3338;;221/3390", "google_scholar": "ME7p0RUAAAAJ;IGXBRggAAAAJ;DfmoSNwAAAAJ;https://scholar.google.co.kr/citations?hl=ko", "orcid": ";;;0000-0002-7595-9035", "linkedin": ";;joonyoungyi/;", "or_profile": "~Seong_Min_Kye1;~Kwanghee_Choi1;~Joonyoung_Yi1;~Buru_Chang1", "aff": "Hyperconnect;Sogang University;;Hyperconnect", "aff_domain": "hpcnt.com;sogang.ac.kr;;hpcnt.com", "position": "Researcher;Undergrad student;;Research Scientist", "bibtex": "@misc{\nkye2022learning,\ntitle={Learning with Noisy Labels by Efficient Transition Matrix Estimation to Combat Label Miscorrection},\nauthor={Seong Min Kye and Kwanghee Choi and Joonyoung Yi and Buru Chang},\nyear={2022},\nurl={https://openreview.net/forum?id=g1D7SfQKbg}\n}", "github": "", "project": "", "reviewers": "kkDx;JQAo;xY1x;vg4C", "site": "https://openreview.net/forum?id=g1D7SfQKbg", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "91;72;48;67", "wc_summary_review": "53;62;89;35", "wc_main_review": "428;170;402;142", "wc_review": "572;304;539;244", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.5, 15.305227865013967 ], "wc_summary_review_avg": [ 59.75, 19.48557158514987 ], "wc_main_review_avg": [ 285.5, 130.20272654595217 ], "wc_review_avg": [ 414.75, 142.81697203063788 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8638684255813602, "corr_recommendation_correctness": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13055133803881493745&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "Hyperconnect;Sogang University", "aff_unique_dep": ";", "aff_unique_url": ";https://www.sogang.ac.kr", "aff_unique_abbr": ";Sogang", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", 
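A hedged sketch of the transition-matrix safeguard described in this abstract (the parameterization, the near-identity initialization, and the 10-class setup are illustrative; the paper's two-head architecture and meta-update are not reproduced here): the classifier's clean posterior is pushed through a learned row-stochastic matrix T before the loss on corrected labels, so the model stays skeptical of possibly miscorrected targets, and one backward pass updates the classifier and T jointly.

import torch
import torch.nn.functional as F

n_classes = 10
T_logits = (5.0 * torch.eye(n_classes)).requires_grad_()  # near-identity start

def skeptical_loss(clean_logits, corrected_labels):
    # Model P(observed label | x) = P(clean | x) @ T, with row-stochastic T.
    T = F.softmax(T_logits, dim=1)                 # each row sums to 1
    p_clean = F.softmax(clean_logits, dim=1)
    p_noisy = p_clean @ T                          # (B, C) mixture over label flips
    return F.nll_loss(torch.log(p_noisy + 1e-8), corrected_labels)

logits = torch.randn(4, n_classes, requires_grad=True)
labels = torch.randint(0, n_classes, (4,))
loss = skeptical_loss(logits, labels)
loss.backward()                                    # single pass: grads reach logits and T
print(loss.item())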
"aff_country_unique": ";South Korea" }, { "title": "Wiring Up Vision: Minimizing Supervised Synaptic Updates Needed to Produce a Primate Ventral Stream", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6890", "id": "g1SzIRLQXMM", "poster": "", "openreview": "https://openreview.net/forum?id=g1SzIRLQXMM", "slides": "https://iclr.cc/virtual/2022/poster/6890", "video": "https://iclr.cc/virtual/2022/poster/6890", "author_site": "Franziska Geiger, Martin Schrimpf, Tiago Marques, James DiCarlo", "tldr": "", "abstract": "After training on large datasets, certain deep neural networks are surprisingly good models of the neural mechanisms of adult primate visual object recognition. Nevertheless, these models are considered poor models of the development of the visual system because they posit millions of sequential, precisely coordinated synaptic updates, each based on a labeled image. While ongoing research is pursuing the use of unsupervised proxies for labels, we here explore a complementary strategy of reducing the required number of supervised synaptic updates to produce an adult-like ventral visual stream (as judged by the match to V1, V2, V4, IT, and behavior). Such models might require less precise machinery and energy expenditure to coordinate these updates and would thus move us closer to viable neuroscientific hypotheses about how the visual system wires itself up. Relative to standard model training on labeled images in ImageNet, we here demonstrate that the total number of supervised weight updates can be substantially reduced using three complementary strategies: First, we find that only 2% of supervised updates (epochs and images) are needed to achieve 80% of the match to adult ventral stream. Specifically, training benefits predictions of higher visual cortex the most whereas early visual cortex predictions only improve marginally over the course of training. Second, by improving the random distribution of synaptic connectivity, we find that 54% of the brain match can already be achieved \u201cat birth\" (i.e. no training at all). Third, we find that, by training only 5% of model synapses, we can still achieve nearly 80% of the match to the ventral stream. This approach further improves on ImageNet performance over previous attempts in computer vision of minimizing trained components without substantially increasing the relative number of trained parameters. These results reflect first steps in modeling not just primate adult visual processing during inference, but also how the ventral visual stream might be \"wired up\" by evolution (a model's \"birth\" state) and by developmental learning (a model's updates based on visual experience).", "keywords": "computational neuroscience;primate visual ventral stream;convolutional neural networks;biologically plausible learning", "primary_area": "", "supplementary_material": "", "author": "Franziska Geiger;Martin Schrimpf;Tiago Marques;James J. 
DiCarlo", "authorids": "~Franziska_Geiger1;~Martin_Schrimpf1;~Tiago_Marques2;~James_J._DiCarlo1", "gender": "F;;M;M", "homepage": ";http://mschrimpf.com/;;http://dicarlolab.mit.edu", "dblp": ";190/7063;;80/7658", "google_scholar": ";RiZ-RdwAAAAJ;NWe5qUcAAAAJ;", "orcid": ";0000-0001-7766-7223;0000-0002-8973-0549;0000-0002-1592-5896", "linkedin": "franziska-geiger-8a8235b5/;mschrimpf/;tiago-marques-a18849b/;james-j-dicarlo/", "or_profile": "~Franziska_Geiger1;~Martin_Schrimpf1;~Tiago_Marques2;~James_J._DiCarlo1", "aff": ";Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": ";mit.edu;mit.edu;mit.edu", "position": ";PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\ngeiger2022wiring,\ntitle={Wiring Up Vision: Minimizing Supervised Synaptic Updates Needed to Produce a Primate Ventral Stream},\nauthor={Franziska Geiger and Martin Schrimpf and Tiago Marques and James J. DiCarlo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=g1SzIRLQXMM}\n}", "github": "", "project": "", "reviewers": "Rdb1;Ekkd;FEh9;pctr", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;4;4;3", "wc_summary_paper": "130;107;68;89", "wc_summary_review": "26;37;26;36", "wc_main_review": "304;316;694;368", "wc_review": "460;460;788;493", "wc_reply_reviewers": "53;35;56;0", "wc_reply_authors": "1220;989;485;249", "reply_reviewers": "1;1;1;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 98.5, 22.830900113661748 ], "wc_summary_review_avg": [ 31.25, 5.261891294962297 ], "wc_main_review_avg": [ 420.5, 159.72711103629214 ], "wc_review_avg": [ 550.25, 137.92457177747553 ], "wc_reply_reviewers_avg": [ 36.0, 22.282279955157193 ], "wc_reply_authors_avg": [ 735.75, 386.7928224515031 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8208290480111546994&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=g1SzIRLQXMM", "email": ";mit.edu;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "End-to-End Learning of Probabilistic Hierarchies on Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5940", "id": "g2LCQwG7Of", "poster": "", "openreview": "https://openreview.net/forum?id=g2LCQwG7Of", "slides": "https://iclr.cc/virtual/2022/poster/5940", "video": "https://iclr.cc/virtual/2022/poster/5940", "author_site": "Daniel Z\u00fcgner, Bertrand Charpentier, Morgane Ayle, Sascha Geringer, Stephan G\u00fcnnemann", "tldr": "", "abstract": "We propose a novel probabilistic model over hierarchies on graphs obtained by continuous relaxation 
of tree-based hierarchies. We draw connections to Markov chain theory, enabling us to perform hierarchical clustering by efficient end-to-end optimization of relaxed versions of quality metrics such as Dasgupta cost or Tree-Sampling Divergence (TSD). \nWe show that our model learns rich, high-quality hierarchies present in 11 real world graphs, including a large graph with 2.3M nodes. Our model consistently outperforms recent as well as strong traditional baselines such as average linkage. \nOur model also obtains strong results on link prediction despite not being trained on this task, highlighting the quality of the hierarchies discovered by our model.", "keywords": "hierarchical clustering;graphs;networks;graph mining;network mining;graph clustering", "primary_area": "", "supplementary_material": "/attachment/d9198f18d4990899db54ba0f02ba3f931374569c.zip", "author": "Daniel Z\u00fcgner;Bertrand Charpentier;Morgane Ayle;Sascha Geringer;Stephan G\u00fcnnemann", "authorids": "~Daniel_Z\u00fcgner1;~Bertrand_Charpentier2;~Morgane_Ayle1;~Sascha_Geringer1;~Stephan_G\u00fcnnemann1", "gender": "M;;;;M", "homepage": ";https://sharpenb.github.io/;;https://www.in.tum.de/daml/;http://www.daml.in.tum.de", "dblp": "172/6951;222/1875;;;43/3011", "google_scholar": ";0rqI-ycAAAAJ;;;", "orcid": ";;;;", "linkedin": ";bertrand-charpentier-76995ab6/;morgane-ayle-732630184/;;", "or_profile": "~Daniel_Z\u00fcgner1;~Bertrand_Charpentier2;~Morgane_Ayle1;~Sascha_Geringer1;~Stephan_G\u00fcnnemann1", "aff": "Technical University Munich;Technical University Munich;;;Technical University Munich", "aff_domain": "tum.de;tum.de;;;tum.de", "position": "PhD student;PhD student;;;Professor", "bibtex": "@inproceedings{\nz{\\\"u}gner2022endtoend,\ntitle={End-to-End Learning of Probabilistic Hierarchies on Graphs},\nauthor={Daniel Z{\\\"u}gner and Bertrand Charpentier and Morgane Ayle and Sascha Geringer and Stephan G{\\\"u}nnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=g2LCQwG7Of}\n}", "github": "", "project": "", "reviewers": "1sXn;knNt;8G7Q", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;3", "correctness": "3;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;3;0", "wc_summary_paper": "104;31;85", "wc_summary_review": "69;75;43", "wc_main_review": "166;306;96", "wc_review": "339;412;224", "wc_reply_reviewers": "37;24;0", "wc_reply_authors": "848;919;215", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 73.33333333333333, 30.922843048824312 ], "wc_summary_review_avg": [ 62.333333333333336, 13.888444437333106 ], "wc_main_review_avg": [ 189.33333333333334, 87.3053390247253 ], "wc_review_avg": [ 325.0, 77.38647599333275 ], "wc_reply_reviewers_avg": [ 20.333333333333332, 15.3260852434302 ], "wc_reply_authors_avg": [ 660.6666666666666, 316.4641457662393 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 3, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=16422732650207450432&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=g2LCQwG7Of", "email": "tum.de;tum.de;;;tum.de", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "g4nVdxU9RK", "title": "Rewardless Open-Ended Learning (ROEL)", "track": "main", "status": "Reject", "tldr": "", "abstract": "Open-ended learning algorithms aim to automatically generate challenges and solutions to an unending sequence of learning opportunities. In Reinforcement Learning (RL) recent approaches to open-ended learning, such as Paired Open-Ended Trailblazer (POET), focus on collecting a diverse set of solutions based on the novelty of an agents pre-defined reward function. In many practical RL tasks defining an effective reward function a priori is often hard and can hinder an agents ability to explore many behaviors that could ultimately be more performant. In this work we combine open-ended learning with unsupervised reinforcement learning to train agents to learn a diverse set of complex skills. We propose a procedure to combine skill-discovery via mutual information, using the POET algorithm as an open-ended framework to teach agents increasingly complex groups of diverse skills. Experimentally we demonstrate this approach yields agents capable of demonstrating identifiable skills over a range of environments, that can be extracted and utilized to solve a variety of tasks.", "keywords": "unsupervised reinforcement learning;open-ended learning;skill discovery", "primary_area": "", "supplementary_material": "/attachment/485969c7f54881dccb8334d81f0eeebc907c18b2.zip", "author": "Alexander Quessy;Thomas Stuart Richardson", "authorids": "~Alexander_Quessy1;~Thomas_Stuart_Richardson1", "gender": "M;M", "homepage": "https://aos55.github.io;https://www.bristol.ac.uk/people/person/Tom-Richardson-63e47259-1d08-4e30-9353-9b1b22e0f749/", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": "https://uk.linkedin.com/in/alexander-quessy;", "or_profile": "~Alexander_Quessy1;~Thomas_Stuart_Richardson1", "aff": "University of Bristol;University of Bristol", "aff_domain": "bris.ac.uk;bristol.ac.uk", "position": "PhD student;Full Professor", "bibtex": "@misc{\nquessy2022rewardless,\ntitle={Rewardless Open-Ended Learning ({ROEL})},\nauthor={Alexander Quessy and Thomas Stuart Richardson},\nyear={2022},\nurl={https://openreview.net/forum?id=g4nVdxU9RK}\n}", "github": "", "project": "", "reviewers": "Bne6;2GeS;HJjY;kQnm", "site": "https://openreview.net/forum?id=g4nVdxU9RK", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;3;2", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "33;46;88;166", "wc_summary_review": "26;52;45;56", "wc_main_review": "254;505;784;595", "wc_review": "313;603;917;817", "wc_reply_reviewers": "0;0;125;86", "wc_reply_authors": "139;625;537;1193", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], 
"wc_summary_paper_avg": [ 83.25, 51.92001059321926 ], "wc_summary_review_avg": [ 44.75, 11.519006033508273 ], "wc_main_review_avg": [ 534.5, 190.69674879242174 ], "wc_review_avg": [ 662.5, 231.47948073209426 ], "wc_reply_reviewers_avg": [ 52.75, 54.522357799346864 ], "wc_reply_authors_avg": [ 623.5, 376.34923940404076 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4409723982146156396&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Bristol", "aff_unique_dep": "", "aff_unique_url": "https://www.bristol.ac.uk", "aff_unique_abbr": "Bristol", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "g5odb-gVVZY", "title": "Multilevel physics informed neural networks (MPINNs)", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we introduce multilevel physics informed neural networks (MPINNs). Inspired by classical multigrid methods for the solution of linear systems arising from the discretization of PDEs, our MPINNs are based on the classical correction scheme, which represents the solution as the sum of a fine and a coarse term that are optimized in an alternate way. We show that the proposed approach allows to reproduce in the neural network training the classical acceleration effect observed for classical multigrid methods, thus providing a PINN that shows improved performance compared to the state-of-the-art. Thanks to the support of the coarse model, MPINNs provide indeed a faster and improved decrease of the approximation error in the case both of elliptic and nonlinear equations.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/2ca02be9efa4b505dc88c8e92265b049f9868214.zip", "author": "Elisa Riccietti;Valentin Mercier;Serge Gratton;Pierre Boudier", "authorids": "~Elisa_Riccietti1;~Valentin_Mercier1;~Serge_Gratton2;~Pierre_Boudier1", "gender": "F;M;M;M", "homepage": "http://perso.ens-lyon.fr/elisa.riccietti/;;http://gratton.perso.enseeiht.fr/;https://www.linkedin.com/in/pierre-boudier-a9587b1/", "dblp": "179/5701;;71/3633;", "google_scholar": "NtPpissAAAAJ;;https://scholar.google.fr/citations?user=q9HdQc4AAAAJ;", "orcid": ";;0000-0002-5021-2357;", "linkedin": ";valentin-mercier-b2a142144/;;", "or_profile": "~Elisa_Riccietti1;~Valentin_Mercier1;~Serge_Gratton2;~Pierre_Boudier1", "aff": "ENS Lyon;IRIT;University of Toulouse, IRIT;NVIDIA", "aff_domain": "ens.fr;irit.fr;irit.fr;nvidia.com", "position": "Associate Professor;PhD student;Full Professor;software architect", "bibtex": "@misc{\nriccietti2022multilevel,\ntitle={Multilevel physics informed neural networks ({MPINN}s)},\nauthor={Elisa Riccietti and Valentin Mercier and Serge Gratton and Pierre Boudier},\nyear={2022},\nurl={https://openreview.net/forum?id=g5odb-gVVZY}\n}", "github": "", "project": "", "reviewers": "Ji4t;EmLF;9RP9;Ugje", "site": "https://openreview.net/forum?id=g5odb-gVVZY", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;1", "wc_summary_paper": "140;61;78;57", "wc_summary_review": "64;79;100;118", "wc_main_review": "386;313;426;707", "wc_review": 
"590;453;604;882", "wc_reply_reviewers": "305;0;269;7", "wc_reply_authors": "259;299;1058;625", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.0, 33.279122584587476 ], "wc_summary_review_avg": [ 90.25, 20.498475553074673 ], "wc_main_review_avg": [ 458.0, 149.36030262422474 ], "wc_review_avg": [ 632.25, 155.79533850536095 ], "wc_reply_reviewers_avg": [ 145.25, 142.341798147979 ], "wc_reply_authors_avg": [ 560.25, 320.5272024337404 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7829900509189088838&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure de Lyon;Institut de Recherche en Informatique de Toulouse;University of Toulouse;NVIDIA", "aff_unique_dep": ";;Institut de Recherche en Informatique de Toulouse (IRIT);NVIDIA Corporation", "aff_unique_url": "https://www.ens-lyon.fr;https://www.irit.fr;https://www.univ-toulouse.fr;https://www.nvidia.com", "aff_unique_abbr": "ENS Lyon;IRIT;UT;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "France;United States" }, { "title": "On the Convergence of mSGD and AdaGrad for Stochastic Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6933", "id": "g5tANwND04i", "poster": "", "openreview": "https://openreview.net/forum?id=g5tANwND04i", "slides": "https://iclr.cc/virtual/2022/poster/6933", "video": "https://iclr.cc/virtual/2022/poster/6933", "author_site": "Ruinan Jin, Yu XING, Xingkang He", "tldr": "", "abstract": "As one of the most fundamental stochastic optimization algorithms, stochastic gradient descent (SGD) has been intensively developed and extensively applied in machine learning in the past decade. There have been some modified SGD-type algorithms, which outperform the SGD in many competitions and applications in terms of convergence rate and accuracy, such as momentum-based SGD (mSGD) and adaptive gradient algorithm (AdaGrad). Despite these empirical successes, the theoretical properties of these algorithms have not been well established due to technical difficulties. With this motivation, we focus on convergence analysis of mSGD and AdaGrad for any smooth (possibly non-convex) loss functions in stochastic optimization. First, we prove that the iterates of mSGD are asymptotically convergent to a connected set of stationary points with probability one, which is more general than existing works on subsequence convergence or convergence of time averages. Moreover, we prove that the loss function of mSGD decays at a certain rate faster than that of SGD. In addition, we prove the iterates of AdaGrad are asymptotically convergent to a connected set of stationary points with probability one. Also, this result extends the results from the literature on subsequence convergence and the convergence of time averages. 
Notably, the above convergence results are established under relaxed assumptions on the gradient noise, the convexity of the loss function, and the boundedness of the iterates.", "keywords": "stochastic gradient descent;adaptive gradient algorithm;asymptotic convergence", "primary_area": "", "supplementary_material": "", "author": "ruinan Jin;Yu Xing;Xingkang He", "authorids": "~ruinan_Jin2;~Yu_Xing3;~Xingkang_He1", "gender": "M;;M", "homepage": "https://scholar.google.com.hk/citations?hl=zh-CN&view_op=list_works&gmla=ABEO0YpMEhFteT3tG3yjwPnBjv4ebvgbyl-hmC7FT9L439bSws7FLcpMFODaF4dgg0F8vMhRXdeHXavir1trk1nOlso&user=tWQlpCUAAAAJ;;https://people.kth.se/~xingkang/index.html", "dblp": "index.html;;", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;MZRqhuAAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~ruinan_Jin2;~Yu_Xing3;~Xingkang_He1", "aff": "Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;;", "aff_domain": "amss.ac.cn;;", "position": "PhD student;;", "bibtex": "@inproceedings{\njin2022on,\ntitle={On the Convergence of m{SGD} and AdaGrad for Stochastic Optimization},\nauthor={ruinan Jin and Yu Xing and Xingkang He},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=g5tANwND04i}\n}", "github": "", "project": "", "reviewers": "qmsj;MDU9;ecdk", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;2", "correctness": "4;2;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;0;0", "wc_summary_paper": "47;34;7", "wc_summary_review": "61;19;43", "wc_main_review": "180;563;180", "wc_review": "288;616;230", "wc_reply_reviewers": "34;158;0", "wc_reply_authors": "321;949;172", "reply_reviewers": "1;3;0", "reply_authors": "1;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 29.333333333333332, 16.659998666133067 ], "wc_summary_review_avg": [ 41.0, 17.204650534085253 ], "wc_main_review_avg": [ 307.6666666666667, 180.54793146296515 ], "wc_review_avg": [ 378.0, 169.94901196143115 ], "wc_reply_reviewers_avg": [ 64.0, 67.90189000805992 ], "wc_reply_authors_avg": [ 480.6666666666667, 336.70197834616624 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2654312612032985154&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=g5tANwND04i", "email": "amss.ac.cn;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Academy of Mathematics and Systems Science", "aff_unique_url": "http://www.cas.cn", "aff_unique_abbr": "CAS", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Properties from mechanisms: an equivariance perspective on identifiable representation learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7092", "id": "g5ynW-jMq4M", "poster": "", "openreview": "https://openreview.net/forum?id=g5ynW-jMq4M", "slides": "https://iclr.cc/virtual/2022/poster/7092", "video":
"https://iclr.cc/virtual/2022/poster/7092", "author_site": "Kartik Ahuja, Jason Hartford, Yoshua Bengio", "tldr": "", "abstract": "A key goal of unsupervised representation learning is ``inverting'' a data generating process to recover its latent properties. Existing work that provably achieves this goal relies on strong assumptions on relationships between the latent variables (e.g., independence conditional on auxiliary information). In this paper, we take a very different perspective on the problem and ask, ``Can we instead identify latent properties by leveraging knowledge of the mechanisms that govern their evolution?'' We provide a complete characterization of the sources of non-identifiability as we vary knowledge about a set of possible mechanisms. In particular, we prove that if we know the exact mechanisms under which the latent properties evolve, then identification can be achieved up to any equivariances that are shared by the underlying mechanisms. We generalize this characterization to settings where we only know some hypothesis class over possible mechanisms, as well as settings where the mechanisms are stochastic. We demonstrate the power of this mechanism-based perspective by showing that we can leverage our results to generalize existing identifiable representation learning results. These results suggest that by exploiting inductive biases on mechanisms, it is possible to design a range of new identifiable representation learning approaches.", "keywords": "representation learning;equivariance;independent component analysis;ICA;autoencoders", "primary_area": "", "supplementary_material": "", "author": "Kartik Ahuja;Jason Hartford;Yoshua Bengio", "authorids": "~Kartik_Ahuja1;~Jason_Hartford1;~Yoshua_Bengio1", "gender": ";M;M", "homepage": ";https://jhartford.github.io;http://yoshuabengio.org", "dblp": ";191/6716;56/953", "google_scholar": ";https://scholar.google.ca/citations?user=eBNK7SsAAAAJ;kukA0LcAAAAJ", "orcid": ";;", "linkedin": ";jasonhartford1/;yoshuabengio/?originalSubdomain=ca", "or_profile": "~Kartik_Ahuja1;~Jason_Hartford1;~Yoshua_Bengio1", "aff": ";Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;University of Montreal", "aff_domain": ";mila.umontreal.ca;umontreal.ca", "position": ";Postdoc;Full Professor", "bibtex": "@inproceedings{\nahuja2022properties,\ntitle={Properties from mechanisms: an equivariance perspective on identifiable representation learning},\nauthor={Kartik Ahuja and Jason Hartford and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=g5ynW-jMq4M}\n}", "github": "", "project": "", "reviewers": "bAsz;muVw;7um5", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;2;4", "correctness": "4;4;3", "technical_novelty": "3;4;3", "empirical_novelty": "0;0;0", "wc_summary_paper": "70;43;260", "wc_summary_review": "78;9;56", "wc_main_review": "465;276;686", "wc_review": "613;328;1002", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 124.33333333333333, 96.56201231448226 ], "wc_summary_review_avg": [ 47.666666666666664, 28.778850258865834 ], 
"wc_main_review_avg": [ 475.6666666666667, 167.55165041131514 ], "wc_review_avg": [ 647.6666666666666, 276.2490824519704 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": -0.9999999999999998, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2588161739937352461&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=g5ynW-jMq4M", "email": ";mila.umontreal.ca;umontreal.ca", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Montreal", "aff_unique_dep": "Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "g6UqpVislvH", "title": "Generalized Fourier Features for Coordinate-Based Learning of Functions on Manifolds", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, positional encoding of input coordinates has been found crucial to enable learning of high-frequency functions with multilayer perceptrons taking low-dimensional coordinate values. In this setting, sinusoids are typically used as a basis for the encoding, which is commonly referred to as \"Fourier Features\". However, using sinusoids as a basis assumes that the input coordinates lie on Euclidean space. In this work, we generalize positional encoding with Fourier features to non-Euclidean manifolds. We find appropriate bases for positional encoding on manifolds through generalizations of Fourier series. By ensuring the encodings lie on a hypersphere and that the appropriate shifts on the manifold preserve inner-products between encodings, our model approximates convolutions on the manifold, according to the neural tangent kernel (NTK) assumptions. 
We demonstrate our method on various tasks on different manifolds: 1) learning panoramas on the sphere, 2) learning probability distributions on the rotation manifold, 3) learning neural radiance fields on the product of cube and sphere, and 4) learning light fields represented as the product of spheres.", "keywords": "positional encoding;fourier features;coordinate-based mlp", "primary_area": "", "supplementary_material": "", "author": "Carlos Esteves;Tianjian Lu;Mohammed Suhail;Yi-fan Chen\u200e;Ameesh Makadia", "authorids": "~Carlos_Esteves1;~Tianjian_Lu1;~Mohammed_Suhail1;yifanchen@google.com;~Ameesh_Makadia1", "gender": "M;M;M;;", "homepage": "http://machc.github.io;;https://mohammedsuhail.net;;http://www.ameeshmakadia.com/index.html", "dblp": "206/6834;;259/5076;;59/6004", "google_scholar": "cFFrCF0AAAAJ;eWEj9g0AAAAJ;Z8wBBggAAAAJ;;OT1uf7kAAAAJ", "orcid": "0000-0001-9413-1201;;;;", "linkedin": "machc;;mohammed-suhail-3653969a/;;", "or_profile": "~Carlos_Esteves1;~Tianjian_Lu1;~Mohammed_Suhail1;yifanchen@google.com;~Ameesh_Makadia1", "aff": "Google;Google;University of British Columbia;;Google", "aff_domain": "google.com;google.com;ubc.ca;;google.com", "position": "Research Scientist;Engineer;PhD student;;Research Scientist", "bibtex": "@misc{\nesteves2022generalized,\ntitle={Generalized Fourier Features for Coordinate-Based Learning of Functions on Manifolds},\nauthor={Carlos Esteves and Tianjian Lu and Mohammed Suhail and Yi-fan Chen\u200e and Ameesh Makadia},\nyear={2022},\nurl={https://openreview.net/forum?id=g6UqpVislvH}\n}", "github": "", "project": "", "reviewers": "BYCg;awek;1zs2;8frW;CgLb", "site": "https://openreview.net/forum?id=g6UqpVislvH", "pdf_size": 0, "recommendation": "3;3;5;6;10", "confidence": "4;4;4;4;4", "correctness": "3;4;2;3;4", "technical_novelty": "3;2;2;3;4", "empirical_novelty": "2;2;2;2;4", "wc_summary_paper": "102;19;36;20;35", "wc_summary_review": "159;64;63;35;10", "wc_main_review": "397;184;224;200;68", "wc_review": "658;267;323;255;113", "wc_reply_reviewers": "68;0;0;66;0", "wc_reply_authors": "633;188;272;589;11", "reply_reviewers": "1;0;0;1;0", "reply_authors": "1;1;1;2;1", "recommendation_avg": [ 5.4, 2.5768197453450252 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.4, 0.8 ], "wc_summary_paper_avg": [ 42.4, 30.6502854799103 ], "wc_summary_review_avg": [ 66.2, 50.51494828266184 ], "wc_main_review_avg": [ 214.6, 105.82929651093784 ], "wc_review_avg": [ 323.2, 181.1545196786434 ], "wc_reply_reviewers_avg": [ 26.8, 32.82925524589311 ], "wc_reply_authors_avg": [ 338.6, 238.24743440381474 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.2696654394346322, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12311416830906573750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Google;University of British Columbia", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.ubc.ca", "aff_unique_abbr": "Google;UBC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "NODE-GAM: Neural Generalized Additive Model for Interpretable Deep Learning", 
"status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6355", "id": "g8NJR6fCCl8", "poster": "", "openreview": "https://openreview.net/forum?id=g8NJR6fCCl8", "slides": "https://iclr.cc/virtual/2022/poster/6355", "video": "https://iclr.cc/virtual/2022/poster/6355", "author_site": "Chun-Hao Chang, Rich Caruana, Anna Goldenberg", "tldr": "", "abstract": "Deployment of machine learning models in real high-risk settings (e.g. healthcare) often depends not only on the model's accuracy but also on its fairness, robustness, and interpretability. Generalized Additive Models (GAMs) are a class of interpretable models with a long history of use in these high-risk domains, but they lack desirable features of deep learning such as differentiability and scalability. In this work, we propose a neural GAM (NODE-GAM) and neural GA$^2$M (NODE-GA$^2$M) that scale well and perform better than other GAMs on large datasets, while remaining interpretable compared to other ensemble and deep learning models. We demonstrate that our models find interesting patterns in the data. Lastly, we show that we are able to improve model accuracy via self-supervised pre-training, an improvement that is not possible for non-differentiable GAMs.", "keywords": "Generalized Additive Model;Deep Learning Architecture;Interpretability", "primary_area": "", "supplementary_material": "/attachment/7cbc9f15fabf2a7e90216f2567043979fc8e3994.zip", "author": "Chun-Hao Chang;Rich Caruana;Anna Goldenberg", "authorids": "~Chun-Hao_Chang1;~Rich_Caruana1;~Anna_Goldenberg1", "gender": "M;M;F", "homepage": "http://www.cs.toronto.edu/~kingsley/;;http://goldenberglab.ca/", "dblp": "88/1509;;06/3543", "google_scholar": ";https://scholar.google.com/scholar?hl=en;https://scholar.google.com.tw/citations?user=cEepZOEAAAAJ", "orcid": ";;0000-0002-2416-833X", "linkedin": ";;", "or_profile": "~Chun-Hao_Chang1;~Rich_Caruana1;~Anna_Goldenberg1", "aff": ";;University of Toronto", "aff_domain": ";;utoronto.ca", "position": ";;Associate Professor", "bibtex": "@inproceedings{\nchang2022nodegam,\ntitle={{NODE}-{GAM}: Neural Generalized Additive Model for Interpretable Deep Learning},\nauthor={Chun-Hao Chang and Rich Caruana and Anna Goldenberg},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=g8NJR6fCCl8}\n}", "github": "", "project": "", "reviewers": "5LN4;qoW7;L13g", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "2;3;4", "correctness": "2;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;0;3", "wc_summary_paper": "75;271;121", "wc_summary_review": "37;17;98", "wc_main_review": "55;125;618", "wc_review": "167;413;837", "wc_reply_reviewers": "0;10;547", "wc_reply_authors": "301;181;1123", "reply_reviewers": "0;1;2", "reply_authors": "1;2;3", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 155.66666666666666, 83.68724846447968 ], "wc_summary_review_avg": [ 50.666666666666664, 34.451253807211266 ], "wc_main_review_avg": [ 266.0, 250.5367571169282 ], "wc_review_avg": [ 472.3333333333333, 276.72529499286634 ], "wc_reply_reviewers_avg": [ 185.66666666666666, 255.5338638136941 ], "wc_reply_authors_avg": [ 535.0, 418.6549892214352 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 
0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8660254037844387, "corr_recommendation_correctness": 1.0, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3759801935043653070&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=g8NJR6fCCl8", "email": ";;utoronto.ca", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "g9B7h9gycMg", "title": "LONG-TAILED RECOGNITION BY LEARNING FROM LATENT CATEGORIES", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this work, we address the challenging task of long-tailed recognition. Previous long-tailed recognition methods commonly focus on data augmentation of tailed classes or re-balancing strategies to give more attention to tailed classes during training. However, due to the limited training images for tailed classes, the diversity of augmented images is still restricted, which results in poor feature representations. In this work, we argue that there are common latent features between the head and tailed classes that can be used to give better feature representation. We propose to learn a set of semantic and class-agnostic latent features shared by the head and tailed classes. Then, we implicitly enrich the training sample diversity by leveraging semantic data augmentation for the commonality features. We evaluate our methods on several popular long-tailed datasets and achieve new state-of-the-art performance consistently.", "keywords": "Long-Tailed Recognition;Latent Category", "primary_area": "", "supplementary_material": "", "author": "Weide Liu;Zhonghua Wu;Yiming Wang;Henghui Ding;Fayao Liu;Jie Lin;Guosheng Lin", "authorids": "~Weide_Liu2;~Zhonghua_Wu2;~Yiming_Wang3;~Henghui_Ding2;~Fayao_Liu2;~Jie_Lin1;~Guosheng_Lin2", "gender": "M;M;F;M;M;M;M", "homepage": "https://wu-zhonghua.github.io/;https://github.com/chronbird;https://sites.google.com/site/fayaoliu/;;https://guosheng.github.io/;https://henghuiding.github.io/;", "dblp": ";;91/9687;88/6731;126/4778;230/1216;261/9166", "google_scholar": "https://scholar.google.com.sg/citations?user=wMDgLCYAAAAJ;;AxY1-SIAAAAJ;;https://scholar.google.com.au/citations?user=ZudEhvcAAAAJ;WI_flSwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0001-6649-7660;;0000-0002-0329-7458;0000-0003-4868-6526;", "linkedin": ";;;;;;weide-liu-44352aa2", "or_profile": "~Zhonghua_Wu2;~Yiming_Wang3;~Fayao_Liu2;~Jie_Lin1;~Guosheng_Lin2;~Henghui_Ding1;~WEIDE_LIU1", "aff": "Nanyang Technological University;Nanyang Technological University;A*STAR;I2R, A*STAR;Nanyang Technological University;Swiss Federal Institute of Technology;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;a-star.edu.sg;i2r.a-star.edu.sg;ntu.edu.sg;ethz.ch;ntu.edu.sg", "position": "PhD student;MS student;Research Scientist;Research Scientist;Assistant Professor;Postdoc;PhD student", "bibtex": "@misc{\nliu2022longtailed,\ntitle={{LONG}-{TAILED} {RECOGNITION} {BY} {LEARNING} {FROM} {LATENT} {CATEGORIES}},\nauthor={Weide Liu and Zhonghua Wu and Yiming Wang and Henghui Ding and Fayao Liu and Jie Lin and Guosheng Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=g9B7h9gycMg}\n}", "github": "", "project": "", "reviewers": "vWgY;khAu;KYko;Gawj", "site":
"https://openreview.net/forum?id=g9B7h9gycMg", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;3", "correctness": "2;4;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "29;68;129;54", "wc_summary_review": "45;70;37;29", "wc_main_review": "334;361;306;78", "wc_review": "408;499;472;161", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 70.0, 36.817115585010185 ], "wc_summary_review_avg": [ 45.25, 15.368392889303683 ], "wc_main_review_avg": [ 269.75, 112.40190167430443 ], "wc_review_avg": [ 385.0, 133.48220855230107 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9733285267845754, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4515199496789275998&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;2;0;3;0", "aff_unique_norm": "Nanyang Technological University;Agency for Science, Technology and Research;A*STAR;Swiss Federal Institute of Technology", "aff_unique_dep": ";;Institute for Infocomm Research;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.a-star.edu.sg;https://www.a-star.edu.sg;https://www.ethz.ch", "aff_unique_abbr": "NTU;A*STAR;A*STAR;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "Singapore;Switzerland" }, { "id": "g9hjVsv3lOC", "title": "Deep Neural Networks on EEG signals to predict Attention Score using Gramian Angular Difference Field", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Auditory attention is a selective type of hearing in which people focus their attention intentionally on a specific source of a sound or spoken words whilst ignoring or inhibiting other auditory stimuli. In some sense, the auditory attention score of an individual shows the focus the person can have in auditory tasks. The recent advancements in deep learning and in the non-invasive technologies recording neural activity beg the question, can deep learning along with technologies such as electroencephalography (EEG) be used to predict the auditory attention score of an individual? In this paper, we focus on this very problem of estimating a person's auditory attention level based on their brain's electrical activity captured using 14-channeled EEG signals. More specifically, we deal with attention estimation as a regression problem. The work has been performed on the publicly available Phyaat dataset. The concept of Gramian Angular Difference Field (GADF) has been used to convert time-series EEG data into an image having 14 channels, enabling us to train various deep learning models such as 2D CNN, 3D CNN, and convolutional autoencoders. Their performances have been compared amongst themselves as well as with the work done previously. Amongst the different models we tried, 2D CNN gave the best performance. 
It outperformed the existing methods by a decent margin of 0.22 mean absolute error (MAE).", "keywords": "Auditory attention;Gramian Angular Difference Field;Electroencephalography", "primary_area": "", "supplementary_material": "", "author": "Mahak Kothari;Shreyansh Joshi;Adarsh Nandanwar;Veeky Baths", "authorids": "~Mahak_Kothari1;~Shreyansh_Joshi1;f20180396@goa.bits-pilani.ac.in;veeky@goa.bits-pilani.ac.in", "gender": "M;M;;", "homepage": ";https://shreyanshjoshi.github.io/;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": "mahak-kothari-001982167/;shreyansh-joshi-b7135018a/;;", "or_profile": "~Mahak_Kothari1;~Shreyansh_Joshi1;f20180396@goa.bits-pilani.ac.in;veeky@goa.bits-pilani.ac.in", "aff": "BITS Pilani, Dhirubhai Ambani Institute Of Information and Communication Technology;BITS Pilani, Dhirubhai Ambani Institute Of Information and Communication Technology;;", "aff_domain": "bits-pilani.ac.in;bits-pilani.ac.in;;", "position": "Undergrad student;Undergrad student;;", "bibtex": "@misc{\nkothari2022deep,\ntitle={Deep Neural Networks on {EEG} signals to predict Attention Score using Gramian Angular Difference Field},\nauthor={Mahak Kothari and Shreyansh Joshi and Adarsh Nandanwar and Veeky Baths},\nyear={2022},\nurl={https://openreview.net/forum?id=g9hjVsv3lOC}\n}", "github": "", "project": "", "reviewers": "b9Yg;zRBz;VF7Y;z5pi", "site": "https://openreview.net/forum?id=g9hjVsv3lOC", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "5;5;5;4", "correctness": "2;1;2;2", "technical_novelty": "1;1;1;2", "empirical_novelty": "1;1;1;2", "wc_summary_paper": "39;35;75;33", "wc_summary_review": "62;93;136;105", "wc_main_review": "409;247;216;129", "wc_review": "510;375;427;267", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 1.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 45.5, 17.168284713389397 ], "wc_summary_review_avg": [ 99.0, 26.504716561397142 ], "wc_main_review_avg": [ 250.25, 101.34933398893158 ], "wc_review_avg": [ 394.75, 88.08057390821202 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5nRNIl6kniAJ:scholar.google.com/&scioq=Deep+Neural+Networks+on+EEG+signals+to+predict+Attention+Score+using+Gramian+Angular+Difference+Field&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Birla Institute of Technology and Science, Pilani", "aff_unique_dep": "Dhirubhai Ambani Institute Of Information and Communication Technology", "aff_unique_url": "https://www.bits-pilani.ac.in", "aff_unique_abbr": "BITS Pilani", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pilani", "aff_country_unique_index": "0;0", "aff_country_unique": "India" }, { "id": "gCmCiclZV6Q", "title": "Inferring Offensiveness In Images From Natural Language Supervision", "track": "main", "status": "Reject", "tldr": "", "abstract": "Probing or fine-tuning (large-scale) pre-trained models results in state-of-the-art performance for many NLP 
tasks and, more recently, even for computer vision tasks when combined with image data. Unfortunately, these approaches also entail severe risks. In particular, large image datasets automatically scraped from the web may contain derogatory terms as categories and offensive images, and may also underrepresent specific classes. Consequently, there is an urgent need to carefully document datasets and curate their content. Unfortunately, this process is tedious and error-prone. We show that pre-trained transformers themselves provide a methodology for the automated curation of large-scale vision datasets. Based on human-annotated examples and the implicit knowledge of a CLIP based model, we demonstrate that one can select relevant prompts for rating the offensiveness of an image. \nIn addition to e.g. privacy violation and pornographic content previously identified in ImageNet, we demonstrate that our approach identifies further inappropriate and potentially offensive content.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/0a03628dcdc6ecf34eb0afc6b4581b4aaae15e27.zip", "author": "Patrick Schramowski;Kristian Kersting", "authorids": "~Patrick_Schramowski1;~Kristian_Kersting1", "gender": "M;M", "homepage": "https://ml-research.github.io/people/pschramowski/index.html;http://www.ml.informatik.tu-darmstadt.de/", "dblp": "217/1650;40/3793", "google_scholar": "GD481RkAAAAJ;QY-earAAAAAJ", "orcid": "0000-0003-1231-7120;0000-0002-2873-9152", "linkedin": ";", "or_profile": "~Patrick_Schramowski1;~Kristian_Kersting1", "aff": "TU Darmstadt;TU Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;Full Professor", "bibtex": "@misc{\nschramowski2022inferring,\ntitle={Inferring Offensiveness In Images From Natural Language Supervision},\nauthor={Patrick Schramowski and Kristian Kersting},\nyear={2022},\nurl={https://openreview.net/forum?id=gCmCiclZV6Q}\n}", "github": "", "project": "", "reviewers": "yFoK;zJbR;dcH1", "site": "https://openreview.net/forum?id=gCmCiclZV6Q", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;3;3", "correctness": "2;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "2;3;2", "wc_summary_paper": "40;97;89", "wc_summary_review": "81;48;51", "wc_main_review": "732;461;172", "wc_review": "853;606;312", "wc_reply_reviewers": "290;168;0", "wc_reply_authors": "629;1078;759", "reply_reviewers": "1;2;0", "reply_authors": "1;3;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 75.33333333333333, 25.197001585285676 ], "wc_summary_review_avg": [ 60.0, 14.89966442575134 ], "wc_main_review_avg": [ 455.0, 228.65840607042344 ], "wc_review_avg": [ 590.3333333333334, 221.1399757820573 ], "wc_reply_reviewers_avg": [ 152.66666666666666, 118.88743508976138 ], "wc_reply_authors_avg": [ 822.0, 188.6389850128193 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11983342493784943881&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": 
"Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darmstadt", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "gD0KBsQcGKg", "title": "Distribution-Driven Disjoint Prediction Intervals for Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper redefines prediction intervals (PIs) as the form of a union of disjoint intervals. PIs represent predictive uncertainty in the regression problem. Since previous PI methods assumed a single continuous PI (one lower and upper bound), it suffers from performance degradation in the uncertainty estimation when the conditional density function has multiple modes. This paper demonstrates that multimodality should be considered in regression uncertainty estimation. To address the issue, we propose a novel method that generates a union of disjoint PIs. Throughout UCI benchmark experiments, our method improves over current state-of-the-art uncertainty quantification methods, reducing an average PI width by over 27$\\%$. Through qualitative experiments, we visualized that the multi-mode often exists in real-world datasets and why our method produces high-quality PIs compared to the previous PI. ", "keywords": "Uncertainty;Prediction Interval;Regression Uncertainty", "primary_area": "", "supplementary_material": "/attachment/0e4974611a137acc278b505ae7cb802479a89f8e.zip", "author": "Jaehak Cho;Jae Myung Kim;Seungyub Han;Jungwoo Lee", "authorids": "~Jaehak_Cho1;~Jae_Myung_Kim1;~Seungyub_Han1;~Jungwoo_Lee1", "gender": "M;M;M;M", "homepage": "http://cml.snu.ac.kr/;https://jaemyung-kim.github.io;;https://cml.snu.ac.kr", "dblp": ";51/1888;347/8731;34/516-1", "google_scholar": ";eP6FHFAAAAAJ;ot1-XNAAAAAJ;j98IWfoAAAAJ", "orcid": ";;0009-0001-8704-8968;0000-0002-6804-980X", "linkedin": ";;;", "or_profile": "~Jaehak_Cho1;~Jae_Myung_Kim1;~Seungyub_Han1;~Jungwoo_Lee1", "aff": "Seoul National University;University of Tuebingen;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;uni-tuebingen.de;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\ncho2022distributiondriven,\ntitle={Distribution-Driven Disjoint Prediction Intervals for Deep Learning},\nauthor={Jaehak Cho and Jae Myung Kim and Seungyub Han and Jungwoo Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=gD0KBsQcGKg}\n}", "github": "", "project": "", "reviewers": "3Zfh;RmVv;8k1c", "site": "https://openreview.net/forum?id=gD0KBsQcGKg", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;4;2", "correctness": "2;4;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "120;55;167", "wc_summary_review": "55;27;171", "wc_main_review": "432;294;488", "wc_review": "607;376;826", "wc_reply_reviewers": "194;69;0", "wc_reply_authors": "1289;766;830", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 114.0, 45.92022067310507 ], "wc_summary_review_avg": [ 84.33333333333333, 62.339571880325124 ], "wc_main_review_avg": [ 404.6666666666667, 81.52436581919679 ], "wc_review_avg": [ 603.0, 
183.73350266078313 ], "wc_reply_reviewers_avg": [ 87.66666666666667, 80.2925207531118 ], "wc_reply_authors_avg": [ 961.6666666666666, 232.92965080088499 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VUCTnl5Ldg8J:scholar.google.com/&scioq=Distribution-Driven+Disjoint+Prediction+Intervals+for+Deep+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Seoul National University;University of Tuebingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.uni-tuebingen.de/", "aff_unique_abbr": "SNU;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "South Korea;Germany" }, { "title": "Finetuned Language Models are Zero-Shot Learners", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6254", "id": "gEZrGCozdqR", "poster": "", "openreview": "https://openreview.net/forum?id=gEZrGCozdqR", "slides": "https://iclr.cc/virtual/2022/poster/6254", "video": "https://iclr.cc/virtual/2022/poster/6254", "author_site": "Jason Wei, Maarten Bosma, Vincent Zhao, Kelvin Guu, Wei Yu, Brian Lester, Nan Du, Andrew Dai, Quoc V Le", "tldr": "", "abstract": "This paper explores a simple method for improving the zero-shot learning abilities of language models. We show that instruction tuning\u2014finetuning language models on a collection of datasets described via instructions\u2014substantially improves zero-shot performance on unseen tasks. We take a 137B parameter pretrained language model and instruction tune it on over 60 NLP datasets verbalized via natural language instruction templates. We evaluate this instruction-tuned model, which we call FLAN, on unseen task types. FLAN substantially improves the performance of its unmodified counterpart and surpasses zero-shot 175B GPT-3 on 20 of 25 datasets that we evaluate. FLAN even outperforms few-shot GPT-3 by a large margin on ANLI, RTE, BoolQ, AI2-ARC, OpenbookQA, and StoryCloze. Ablation studies reveal that number of finetuning datasets, model scale, and natural language instructions are key to the success of instruction tuning.", "keywords": "natural language processing;zero-shot learning;language models", "primary_area": "", "supplementary_material": "/attachment/8e92660ea3d04e3380af73ef67555377395930c1.zip", "author": "Jason Wei;Maarten Bosma;Vincent Zhao;Kelvin Guu;Adams Wei Yu;Brian Lester;Nan Du;Andrew M. 
Dai;Quoc V Le", "authorids": "~Jason_Wei1;bosma@google.com;vzhao@google.com;~Kelvin_Guu1;~Adams_Wei_Yu1;~Brian_Lester1;~Nan_Du1;~Andrew_M._Dai1;~Quoc_V_Le1", "gender": "M;;;M;M;;M;;M", "homepage": "https://jasonwei20.github.io;;;http://kelvinguu.com/;https://adamsyu.github.io/;;;;", "dblp": "02/11220.html;;;164/5838;65/10635;;;;29/6166", "google_scholar": ";;;;-hW6cvgAAAAJ;;v474hP4AAAAJ;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;dunangatech/;;", "or_profile": "~Jason_Wei1;bosma@google.com;vzhao@google.com;~Kelvin_Guu1;~Adams_Wei_Yu1;~Brian_Lester1;~Nan_Du1;~Andrew_M._Dai1;~Quoc_V_Le1", "aff": "Google;;;Google;Google Brain;;Google Brain;;Google", "aff_domain": "google.com;;;google.com;google.com;;google.com;;google.com", "position": "Researcher;;;Senior Research Scientist/Manager;Research Scientist;;Research Scientist;;Scientist", "bibtex": "@inproceedings{\nwei2022finetuned,\ntitle={Finetuned Language Models are Zero-Shot Learners},\nauthor={Jason Wei and Maarten Bosma and Vincent Zhao and Kelvin Guu and Adams Wei Yu and Brian Lester and Nan Du and Andrew M. Dai and Quoc V Le},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gEZrGCozdqR}\n}", "github": "", "project": "", "reviewers": "BGTp;K8bM;DjKD;4nJJ", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "5;3;4;4", "correctness": "3;4;4;3", "technical_novelty": "1;2;3;4", "empirical_novelty": "4;4;3;4", "wc_summary_paper": "114;27;50;18", "wc_summary_review": "56;54;17;13", "wc_main_review": "715;423;281;466", "wc_review": "885;504;348;497", "wc_reply_reviewers": "18;0;0;0", "wc_reply_authors": "512;967;548;1283", "reply_reviewers": "1;0;0;0", "reply_authors": "1;2;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.25, 37.512497917360825 ], "wc_summary_review_avg": [ 35.0, 20.062402647738878 ], "wc_main_review_avg": [ 471.25, 156.4966053944941 ], "wc_review_avg": [ 558.5, 198.53526135173067 ], "wc_reply_reviewers_avg": [ 4.5, 7.794228634059948 ], "wc_reply_authors_avg": [ 827.5, 318.04127090678026 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3956, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3582238432300098245&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=gEZrGCozdqR", "email": "google.com;;;google.com;google.com;;google.com;;google.com", "author_num": 9, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "gEynpztqZug", "title": "Mako: Semi-supervised continual learning with minimal labeled data via data programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "Lifelong machine learning (LML) is a well-known paradigm mimicking the human learning process by utilizing experiences from previous tasks. Nevertheless, an issue that has been rarely addressed is the lack of labels at the individual task level. 
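For readers unfamiliar with the data programming named in the Mako title, here is a toy sketch of weak labeling via labeling functions and majority vote, under the assumption of a Snorkel-style setup; all function names and rules are hypothetical, not Mako's actual interface. The Mako abstract continues below.

```python
import numpy as np

ABSTAIN = -1

# Toy labeling functions for a binary image task; in data programming, many such
# noisy heuristics stand in for hand labels (names and rules here are illustrative).
def lf_bright(x):
    return 1 if x.mean() > 0.6 else ABSTAIN

def lf_dark(x):
    return 0 if x.mean() < 0.4 else ABSTAIN

def lf_texture(x):
    return 1 if np.abs(np.diff(x)).mean() > 0.2 else ABSTAIN

def weak_label(x, lfs=(lf_bright, lf_dark, lf_texture)):
    """Majority vote over non-abstaining labeling functions; None if all abstain."""
    votes = [v for v in (lf(x) for lf in lfs) if v != ABSTAIN]
    return int(round(np.mean(votes))) if votes else None

x = np.random.rand(64)   # a flattened toy input
print(weak_label(x))     # pseudo-label fed to the supervised LML learner
```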
The state-of-the-art of LML largely addresses supervised learning, with a few semi-supervised continual learning exceptions which require training additional models, which in turn impose constraints on the LML methods themselves. Therefore, we propose Mako, a wrapper tool that mounts on top of supervised LML frameworks, leveraging data programming. Mako imposes no additional knowledge base overhead and enables continual semi-supervised learning with a limited amount of labeled data. This tool achieves similar performance, in terms of per-task accuracy and resistance to catastrophic forgetting, as compared to fully labeled data. We ran extensive experiments on LML task sequences created from standard image classification data sets including MNIST, CIFAR-10 and CIFAR-100, and the results show that after utilizing Mako to leverage unlabeled data, LML tools are able to achieve $97\\%$ performance of supervised learning on fully labeled data in terms of accuracy and catastrophic forgetting prevention. Moreover, when compared to baseline semi-supervised LML tools such as CNNL, ORDisCo and DistillMatch, Mako significantly outperforms them, increasing accuracy by $0.25$ on certain benchmarks.", "keywords": "lifelong machine learning;data programming;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Pengyuan Lu;Seungwon Lee;Amanda Watson;David Kent;Insup Lee;ERIC EATON;James Weimer", "authorids": "~Pengyuan_Lu1;~Seungwon_Lee2;aawatson@seas.upenn.edu;dekent@seas.upenn.edu;~Insup_Lee1;~ERIC_EATON1;~James_Weimer1", "gender": "M;M;;;;;M", "homepage": ";;;;https://www.cis.upenn.edu/~lee/;;https://jamesweimer.net", "dblp": "231/5742;;;;l/InsupLee.html;22/2336;79/11048.html", "google_scholar": "WnO2b68AAAAJ;;;;qPlUgrgAAAAJ;QIZWnnQAAAAJ;IeuLakwAAAAJ", "orcid": ";;;;0000-0003-2672-1132;;0000-0001-8167-9163", "linkedin": "pengyuan-eric-lu-778124121/;;;;;;", "or_profile": "~Pengyuan_Lu1;~Seungwon_Lee2;aawatson@seas.upenn.edu;dekent@seas.upenn.edu;~Insup_Lee1;~ERIC_EATON1;~James_Weimer1", "aff": "Department of Computer and Information Science, School of Engineering and Applied Science;Department of Computer and Information Science, School of Engineering and Applied Science;;;University of Pennsylvania;University of Pennsylvania;Vanderbilt University", "aff_domain": "cis.upenn.edu;cis.upenn.edu;;;upenn.edu;upenn.edu;vanderbilt.edu", "position": "PhD student;PhD student;;;Full Professor;Faculty;Assistant Professor", "bibtex": "@misc{\nlu2022mako,\ntitle={Mako: Semi-supervised continual learning with minimal labeled data via data programming},\nauthor={Pengyuan Lu and Seungwon Lee and Amanda Watson and David Kent and Insup Lee and ERIC EATON and James Weimer},\nyear={2022},\nurl={https://openreview.net/forum?id=gEynpztqZug}\n}", "github": "", "project": "", "reviewers": "Fba4;XNuq;ECpE;E934", "site": "https://openreview.net/forum?id=gEynpztqZug", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;3", "correctness": "3;2;4;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "50;129;70;73", "wc_summary_review": "22;73;56;71", "wc_main_review": "264;302;356;347", "wc_review": "336;504;482;491", "wc_reply_reviewers": "31;0;0;19", "wc_reply_authors": "482;461;324;345", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": 
[ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.5, 29.364093720052047 ], "wc_summary_review_avg": [ 55.5, 20.426698215815495 ], "wc_main_review_avg": [ 317.25, 36.92813967694555 ], "wc_review_avg": [ 453.25, 68.14460727012813 ], "wc_reply_reviewers_avg": [ 12.5, 13.200378782444085 ], "wc_reply_authors_avg": [ 403.0, 69.3000721500346 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MUtno463nMUJ:scholar.google.com/&scioq=Mako:+Semi-supervised+continual+learning+with+minimal+labeled+data+via+data+programming&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;1;2", "aff_unique_norm": "School of Engineering and Applied Science;University of Pennsylvania;Vanderbilt University", "aff_unique_dep": "Department of Computer and Information Science;;", "aff_unique_url": ";https://www.upenn.edu;https://www.vanderbilt.edu", "aff_unique_abbr": ";UPenn;Vanderbilt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1", "aff_country_unique": ";United States" }, { "title": "How Did the Model Change? Efficiently Assessing Machine Learning API Shifts", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6133", "id": "gFDFKC4gHL4", "poster": "", "openreview": "https://openreview.net/forum?id=gFDFKC4gHL4", "slides": "https://iclr.cc/virtual/2022/poster/6133", "video": "https://iclr.cc/virtual/2022/poster/6133", "author_site": "Lingjiao Chen, Peter Bailis, James Y Zou", "tldr": "", "abstract": "ML prediction APIs from providers like Amazon and Google have made it simple to use ML in applications. A challenge for users is that such APIs continuously change over time as the providers update models, and changes can happen silently without users knowing. It is thus important to monitor when and how much the ML APIs\u2019 performance shifts. To provide detailed change assessment, we model ML API shifts as confusion matrix differences, and propose a principled algorithmic framework, MASA, to provably assess these shifts efficiently given a sample budget constraint. MASA employs an upper-confidence bound based approach to adaptively determine on which data point to query the ML API to estimate shifts. Empirically, we observe significant ML API shifts from 2020 to 2021 among 12 out of 36 applications using commercial APIs from Google, Microsoft, Amazon, and other providers. These real-world shifts include both improvements and reductions in accuracy.
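As an illustration of the upper-confidence-bound idea described above, here is a toy sketch that treats each confusion-matrix cell as an arm and spends a query budget adaptively; this is a simplification with illustrative parameters, not MASA's actual acquisition rule. The abstract's final sentence follows below.

```python
import numpy as np

def ucb_query_schedule(num_cells, budget, true_shift_prob, c=2.0, seed=0):
    """Adaptively allocate a query budget across confusion-matrix cells.

    Each 'arm' is one cell of the confusion-matrix difference; pulling it queries
    the ML API on a data point from that cell and observes whether the prediction
    changed. The UCB bonus steers queries toward poorly estimated, shifting cells.
    """
    rng = np.random.default_rng(seed)
    counts = np.ones(num_cells)                       # one forced pull per arm
    means = rng.binomial(1, true_shift_prob) * 1.0    # initial observed shift rates
    for t in range(num_cells, budget):
        bonus = np.sqrt(c * np.log(t + 1) / counts)
        arm = int(np.argmax(means + bonus))           # cell with largest UCB on shift
        obs = rng.binomial(1, true_shift_prob[arm])   # simulated API comparison
        means[arm] += (obs - means[arm]) / (counts[arm] + 1)
        counts[arm] += 1
    return counts, means                              # per-cell shift estimates

counts, est = ucb_query_schedule(9, 500, np.linspace(0.0, 0.4, 9))
print(est.round(2))
```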
Extensive experiments show that MASA can estimate such API shifts more accurately than standard approaches given the same budget.", "keywords": "ML API performance shifts;ML as a service;ML monitoring;ML performance evaluation", "primary_area": "", "supplementary_material": "/attachment/3865c217a2dfb2f67e202e03c1c3515b86524215.zip", "author": "Lingjiao Chen;Matei Zaharia;James Zou", "authorids": "~Lingjiao_Chen1;~Matei_Zaharia1;~James_Zou1", "gender": ";M;", "homepage": ";https://cs.stanford.edu/~matei/;", "dblp": "131/6638.html;36/2133;", "google_scholar": ";I1EvjZsAAAAJ;23ZXZvEAAAAJ", "orcid": ";0000-0002-7547-7204;", "linkedin": ";mateizaharia/;", "or_profile": "~Lingjiao_Chen1;~Matei_Zaharia1;~James_Zou1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2022how,\ntitle={How Did the Model Change? Efficiently Assessing Machine Learning {API} Shifts },\nauthor={Lingjiao Chen and Matei Zaharia and James Zou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gFDFKC4gHL4}\n}", "github": "", "project": "", "reviewers": "2mEB;NY7g;Bj8e;Z5qk", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "4;4;3;3", "wc_summary_paper": "37;32;87;58", "wc_summary_review": "62;74;104;37", "wc_main_review": "269;88;337;289", "wc_review": "368;194;528;384", "wc_reply_reviewers": "39;0;46;11", "wc_reply_authors": "131;157;258;342", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 53.5, 21.66217902243447 ], "wc_summary_review_avg": [ 69.25, 24.097458372201828 ], "wc_main_review_avg": [ 245.75, 94.36995019602374 ], "wc_review_avg": [ 368.5, 118.45990882994973 ], "wc_reply_reviewers_avg": [ 24.0, 19.06567596493762 ], "wc_reply_authors_avg": [ 222.0, 83.9672555226143 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5409678907176982497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=gFDFKC4gHL4", "email": "stanford.edu;stanford.edu;stanford.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "gI7KCy4UDN9", "title": "Post-Training Quantization Is All You Need to Perform Cross-Platform Learned Image Compression", "track": "main", "status": "Reject", "tldr": "", "abstract": "It has been witnessed that learned image compression has outperformed conventional image coding techniques and tends to be practical in industrial applications.
One of the most critical issues preventing it from being practical is the non-deterministic calculation, which makes the probability prediction cross-platform inconsistent and frustrates successful decoding. We propose to solve this problem by introducing well-developed post-training quantization and making the model inference integer-arithmetic-only, which is much simpler than presently existing training and fine-tuning based approaches yet still keeps the superior rate-distortion performance of learned image compression. Based on that, we further improve the discretization of the entropy parameters and extend the deterministic inference to fit Gaussian mixture models. With our proposed methods, the current state-of-the-art image compression models can infer in a cross-platform consistent manner, which makes the further development and practice of learned image compression more promising.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dailan He;Ziming Yang;Yan Wang;Yuan Chen;Qi Zhang;Hongwei Qin", "authorids": "~Dailan_He1;~Ziming_Yang2;~Yan_Wang12;~Yuan_Chen4;~Qi_Zhang15;~Hongwei_Qin2", "gender": "M;M;;M;M;M", "homepage": ";;http://researchgate.net/profile/Yan_Wang154?ev=hdr_xprf;http://member.bitcron.com/;;http://qinhongwei.com/academic", "dblp": "246/2809;;59/2227-80;;;161/1819", "google_scholar": "f5MTTy4AAAAJ;https://scholar.google.cz/citations?user=IziPwhUAAAAJ;QOZnsYYAAAAJ;;;ZGM7HfgAAAAJ", "orcid": ";;;;;", "linkedin": "dailan-he-82741a164/;;;;%E7%90%A6-%E5%BC%A0-365687179/;", "or_profile": "~Dailan_He1;~Ziming_Yang2;~Yan_Wang12;~Yuan_Chen4;~Qi_Zhang15;~Hongwei_Qin2", "aff": "SenseTime Research;Beihang University;Tsinghua University;Beihang University;;SenseTime Co.", "aff_domain": "sensetime.com;buaa.edu.cn;tsinghua.edu.cn;buaa.edu.cn;;sensetime.com", "position": "Researcher;Undergrad student;Assistant Professor;Undergrad student;;Researcher", "bibtex": "@misc{\nhe2022posttraining,\ntitle={Post-Training Quantization Is All You Need to Perform Cross-Platform Learned Image Compression},\nauthor={Dailan He and Ziming Yang and Yan Wang and Yuan Chen and Qi Zhang and Hongwei Qin},\nyear={2022},\nurl={https://openreview.net/forum?id=gI7KCy4UDN9}\n}", "github": "", "project": "", "reviewers": "oV3R;GrpS;L7dn;eyVf;2XDr", "site": "https://openreview.net/forum?id=gI7KCy4UDN9", "pdf_size": 0, "recommendation": "3;6;6;6;6", "confidence": "4;5;4;3;3", "correctness": "2;4;4;2;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "3;3;3;3;2", "wc_summary_paper": "74;76;58;90;107", "wc_summary_review": "83;43;15;66;48", "wc_main_review": "206;296;118;313;527", "wc_review": "363;415;191;469;682", "wc_reply_reviewers": "380;0;0;26;176", "wc_reply_authors": "2155;446;308;390;538", "reply_reviewers": "2;0;0;1;1", "reply_authors": "4;1;1;1;2", "recommendation_avg": [ 5.4, 1.2 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 81.0, 16.492422502470642 ], "wc_summary_review_avg": [ 51.0, 22.882307575941724 ], "wc_main_review_avg": [ 292.0, 136.64113582666093 ], "wc_review_avg": [ 424.0, 159.1728620085723 ], "wc_reply_reviewers_avg": [ 116.4, 147.17825926406388 ], "wc_reply_authors_avg": [ 767.4, 697.8302372353894 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 1.8, 1.1661903789690602 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 
-0.1336306209562122, "corr_recommendation_correctness": 0.5590169943749475, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13214582869894597595&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "SenseTime;Beihang University;Tsinghua University", "aff_unique_dep": "SenseTime Research;;", "aff_unique_url": "https://www.sensetime.com;http://www.buaa.edu.cn/;https://www.tsinghua.edu.cn", "aff_unique_abbr": "SenseTime;BUAA;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "High Probability Generalization Bounds with Fast Rates for Minimax Problems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7058", "id": "gI7feJ9yXPz", "poster": "", "openreview": "https://openreview.net/forum?id=gI7feJ9yXPz", "slides": "https://iclr.cc/virtual/2022/poster/7058", "video": "https://iclr.cc/virtual/2022/poster/7058", "author_site": "Shaojie Li, Yong Liu", "tldr": "", "abstract": "Minimax problems are receiving an increasing amount of attention in a wide range of applications in machine learning (ML), for instance, reinforcement learning, robust optimization, adversarial learning, and distributed computing, to mention but a few. Current studies focus on the fundamental understanding of general minimax problems with an emphasis on convergence behavior. As a comparison, there is far less work to study the generalization performance. Additionally, existing generalization bounds are almost all derived in expectation, and the high probability bounds are all presented in the slow order $\\mathcal{O}(1/\\sqrt{n})$, where $n$ is the sample size. In this paper, we provide improved generalization analyses and obtain sharper high probability generalization bounds for most existing generalization measures of minimax problems. We then use the improved learning bounds to establish high probability generalization bounds with fast rates for classical empirical saddle point (ESP) solution and several popular gradient-based optimization algorithms, including gradient descent ascent (GDA), stochastic gradient descent ascent (SGDA), proximal point method (PPM), extra-gradient (EG), and optimistic gradient descent ascent (OGDA). 
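To make the slow/fast distinction concrete, these are generic shapes of such high-probability bounds, written as illustrative forms only (not the paper's exact statements), with $\epsilon_{\mathrm{gen}}$ denoting a generalization gap:

```latex
% Illustrative bound shapes; n = sample size, \delta = failure probability.
\begin{align*}
\text{slow rate:}\quad
  \epsilon_{\mathrm{gen}} \;\le\; c\,\sqrt{\frac{\log(1/\delta)}{n}}
  \;=\; \mathcal{O}\!\left(\tfrac{1}{\sqrt{n}}\right),
\qquad
\text{fast rate:}\quad
  \epsilon_{\mathrm{gen}} \;\le\; c\,\frac{\log n\,\log(1/\delta)}{n}
  \;=\; \mathcal{O}\!\left(\tfrac{1}{n}\right),
\end{align*}
% each statement holding with probability at least 1 - \delta over the sample draw.
```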
In summary, we provide a systematic analysis of sharper generalization bounds of minimax problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shaojie Li;Yong Liu", "authorids": "~Shaojie_Li2;~Yong_Liu7", "gender": "M;M", "homepage": ";https://iie-liuyong.github.io", "dblp": ";29/4867-18", "google_scholar": ";vVhmzbAAAAAJ", "orcid": ";0000-0002-6739-621X", "linkedin": ";", "or_profile": "~Shaojie_Li2;~Yong_Liu7", "aff": "Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nli2022high,\ntitle={High Probability Generalization Bounds with Fast Rates for Minimax Problems},\nauthor={Shaojie Li and Yong Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gI7feJ9yXPz}\n}", "github": "", "project": "", "reviewers": "Wpqu;RFir;jPJ9;LsE5", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;5;3", "correctness": "3;4;3;3", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;4;0;0", "wc_summary_paper": "52;123;130;69", "wc_summary_review": "36;31;151;29", "wc_main_review": "265;361;314;255", "wc_review": "353;515;595;353", "wc_reply_reviewers": "45;0;138;0", "wc_reply_authors": "635;257;1486;1078", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;4;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.7853571071357126 ], "wc_summary_paper_avg": [ 93.5, 33.63406011768428 ], "wc_summary_review_avg": [ 61.75, 51.591544849907336 ], "wc_main_review_avg": [ 298.75, 42.310607417053234 ], "wc_review_avg": [ 454.0, 104.88565202161828 ], "wc_reply_reviewers_avg": [ 45.75, 56.339928114970114 ], "wc_reply_authors_avg": [ 864.0, 461.94426070685194 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3963787003287598915&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=gI7feJ9yXPz", "email": "ruc.edu.cn;ruc.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "The Close Relationship Between Contrastive Learning and Meta-Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6024", "id": "gICys3ITSmj", "poster": "", "openreview": "https://openreview.net/forum?id=gICys3ITSmj", "slides": "https://iclr.cc/virtual/2022/poster/6024", "video": "https://iclr.cc/virtual/2022/poster/6024", "author_site": "Renkun Ni, Manli Shu, Hossein Souri, Micah Goldblum, Tom Goldstein", "tldr": "", "abstract": "Contrastive learning has recently taken off as a paradigm for learning from unlabeled data. In this paper, we discuss the close relationship between contrastive learning and meta-learning under a certain task distribution.
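A rough PyTorch-style sketch of that task distribution, assuming placeholder `encoder` and `augment` functions: each image acts as its own one-shot class, and a Prototypical-Network-style episode loss recovers a SimCLR-like objective. This illustrates the correspondence the abstract goes on to describe; it is not the authors' code.

```python
import torch
import torch.nn.functional as F

def protonet_episode_loss(encoder, augment, images, temperature=0.5):
    """Treat each image in the batch as a one-shot class: one augmented view
    is the support 'prototype', the other is the query. Classifying queries
    against prototypes mirrors a contrastive (SimCLR-like) objective."""
    support = F.normalize(encoder(augment(images)), dim=-1)  # prototypes, (B, D)
    query = F.normalize(encoder(augment(images)), dim=-1)    # queries,    (B, D)
    logits = query @ support.t() / temperature               # (B, B) similarities
    labels = torch.arange(images.size(0))                    # query i matches class i
    return F.cross_entropy(logits, labels)
```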
We complement this observation by showing that established meta-learning methods, such as Prototypical Networks, achieve comparable performance to SimCLR when paired with this task distribution. This relationship can be leveraged by taking established techniques from meta-learning, such as task-based data augmentation, and showing that they benefit contrastive learning as well. These tricks also benefit state-of-the-art self-supervised learners without using negative pairs such as BYOL, which achieves 94.6\\% accuracy on CIFAR-10 using a self-supervised ResNet-18 feature extractor trained with our meta-learning tricks. We conclude that existing advances designed for contrastive learning or meta-learning can be exploited to benefit the other, and it is better for contrastive learning researchers to take lessons from the meta-learning literature (and vice-versa) than to reinvent the wheel. ", "keywords": "meta-learning;contrastive learning;self-supervised learning", "primary_area": "", "supplementary_material": "/attachment/9609c67c9ec522719fc98618be3f0f55b313a84b.zip", "author": "Renkun Ni;Manli Shu;Hossein Souri;Micah Goldblum;Tom Goldstein", "authorids": "~Renkun_Ni1;~Manli_Shu1;~Hossein_Souri1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "M;F;M;;M", "homepage": "https://www.cs.umd.edu/~rn9zm/;https://azshue.github.io/;https://hsouri.github.io/;;https://www.cs.umd.edu/~tomg/", "dblp": "183/7067;263/3503;250/2286;241/7231;25/8184", "google_scholar": ";https://scholar.google.com/citations?hl=en;rurbhy0AAAAJ;pGDKzuUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;0000-0001-5264-798X;;", "linkedin": ";manli-shu-a804a8164/;hossein-souri-b7574795/;;", "or_profile": "~Renkun_Ni1;~Manli_Shu1;~Hossein_Souri1;~Micah_Goldblum1;~Tom_Goldstein1", "aff": "Department of Computer Science, University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;Johns Hopkins University;New York University;University of Maryland, College Park", "aff_domain": "cs.umd.edu;cs.umd.edu;jhu.edu;nyu.edu;umd.edu", "position": "PhD student;PhD student;PhD student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nni2022the,\ntitle={The Close Relationship Between Contrastive Learning and Meta-Learning},\nauthor={Renkun Ni and Manli Shu and Hossein Souri and Micah Goldblum and Tom Goldstein},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gICys3ITSmj}\n}", "github": "", "project": "", "reviewers": "KYMy;AjN4;AV99;vRLC", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;4;3", "correctness": "1;3;4;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "117;50;66;63", "wc_summary_review": "74;60;48;11", "wc_main_review": "426;320;286;63", "wc_review": "617;430;400;137", "wc_reply_reviewers": "0;0;23;0", "wc_reply_authors": "516;670;183;51", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 74.0, 25.544079548889602 ], "wc_summary_review_avg": [ 48.25, 23.39203924415313 ], "wc_main_review_avg": [ 273.75, 132.17861967807048 ], "wc_review_avg": [ 396.0, 171.09500284929422 ], "wc_reply_reviewers_avg": [ 5.75, 9.959292143521045 ], "wc_reply_authors_avg": [ 355.0, 248.5689039280658 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], 
"reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4320788425795597664&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=gICys3ITSmj", "email": "cs.umd.edu;cs.umd.edu;jhu.edu;nyu.edu;umd.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "University of Maryland, College Park;Johns Hopkins University;New York University;University of Maryland", "aff_unique_dep": "Department of Computer Science;;;", "aff_unique_url": "https://www/umd.edu;https://www.jhu.edu;https://www.nyu.edu;https://www/umd.edu", "aff_unique_abbr": "UMD;JHU;NYU;UMD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Almost Tight L0-norm Certified Robustness of Top-k Predictions against Adversarial Perturbations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6398", "id": "gJLEXy3ySpu", "poster": "", "openreview": "https://openreview.net/forum?id=gJLEXy3ySpu", "slides": "https://iclr.cc/virtual/2022/poster/6398", "video": "https://iclr.cc/virtual/2022/poster/6398", "author_site": "Jinyuan Jia, Binghui Wang, Xiaoyu Cao, Hongbin Liu, Neil Gong", "tldr": "", "abstract": "Top-$k$ predictions are used in many real-world applications such as machine learning as a service, recommender systems, and web searches. $\\ell_0$-norm adversarial perturbation characterizes an attack that arbitrarily modifies some features of an input such that a classifier makes an incorrect prediction for the perturbed input. $\\ell_0$-norm adversarial perturbation is easy to interpret and can be implemented in the physical world. Therefore, certifying robustness of top-$k$ predictions against $\\ell_0$-norm adversarial perturbation is important. However, existing studies either focused on certifying $\\ell_0$-norm robustness of top-$1$ predictions or $\\ell_2$-norm robustness of top-$k$ predictions. In this work, we aim to bridge the gap. Our approach is based on randomized smoothing, which builds a provably robust classifier from an arbitrary classifier via randomizing an input. Our major theoretical contribution is an almost tight $\\ell_0$-norm certified robustness guarantee for top-$k$ predictions. We empirically evaluate our method on CIFAR10 and ImageNet. For instance, our method can build a classifier that achieves a certified top-3 accuracy of 69.2\\% on ImageNet when an attacker can arbitrarily perturb 5 pixels of a testing image. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinyuan Jia;Binghui Wang;Xiaoyu Cao;Hongbin Liu;Neil Zhenqiang Gong", "authorids": "~Jinyuan_Jia2;~Binghui_Wang2;~Xiaoyu_Cao1;~Hongbin_Liu2;~Neil_Zhenqiang_Gong1", "gender": ";M;;M;M", "homepage": "https://jinyuan-jia.github.io/;https://wangbinghui.net;;https://scholars.duke.edu/person/hongbin.liu;http://people.duke.edu/~zg70/", "dblp": "24/5124-1.html;123/7149;146/9100;82/6141-5;03/9437", "google_scholar": "iyg4ytkAAAAJ;SoOztcEAAAAJ;X5qafxAAAAAJ;1Vitx-wAAAAJ;t6uCsYoAAAAJ", "orcid": "0000-0002-9785-7769;0000-0001-5616-060X;;;0000-0002-9900-9309", "linkedin": ";;;hongbin-liu-002387158/;", "or_profile": "~Jinyuan_Jia2;~Binghui_Wang2;~Xiaoyu_Cao1;~Hongbin_Liu2;~Neil_Gong2", "aff": "Duke University;Illinois Institute of Technology;Duke University;Duke University;Duke University", "aff_domain": "duke.edu;iit.edu;duke.edu;duke.edu;duke.edu", "position": "PhD student;Assistant Professor;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\njia2022almost,\ntitle={Almost Tight L0-norm Certified Robustness of Top-k Predictions against Adversarial Perturbations},\nauthor={Jinyuan Jia and Binghui Wang and Xiaoyu Cao and Hongbin Liu and Neil Zhenqiang Gong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gJLEXy3ySpu}\n}", "github": "", "project": "", "reviewers": "PF8K;5Exk;NNyi;AjCc", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;3;4;2", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "67;17;91;66", "wc_summary_review": "51;27;82;39", "wc_main_review": "76;122;243;225", "wc_review": "194;166;416;330", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "7;248;165;92", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 60.25, 26.901440481877547 ], "wc_summary_review_avg": [ 49.75, 20.461854754640402 ], "wc_main_review_avg": [ 166.5, 69.72266489456639 ], "wc_review_avg": [ 276.5, 101.65997245720658 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 128.0, 89.03089351455482 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6768362961798727780&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=gJLEXy3ySpu", "email": "duke.edu;iit.edu;duke.edu;duke.edu;duke.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Duke University;Illinois Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://www.iit.edu", "aff_unique_abbr": "Duke;IIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Mapping Language Models to Grounded Conceptual Spaces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5992", "id": "gJcEM8sxHK", "poster": "", "openreview": "https://openreview.net/forum?id=gJcEM8sxHK", "slides": 
"https://iclr.cc/virtual/2022/poster/5992", "video": "https://iclr.cc/virtual/2022/poster/5992", "author_site": "Roma Patel, Ellie Pavlick", "tldr": "", "abstract": "A fundamental criticism of text-only language models (LMs) is their lack of grounding---that is, the ability to tie a word for which they have learned a representation, to its actual use in the world. However, despite this limitation, large pre-trained LMs have been shown to have a remarkable grasp of the conceptual structure of language, as demonstrated by their ability to answer questions, generate fluent text, or make inferences about entities, objects, and properties that they have never physically observed. In this work we investigate the extent to which the rich conceptual structure that LMs learn indeed reflects the conceptual structure of the non-linguistic world---which is something that LMs have never observed. We do this by testing whether the LMs can learn to map an entire conceptual domain (e.g., direction or colour) onto a grounded world representation given only a small number of examples. For example, we show a model what the word ``left\" means using a textual depiction of a grid world, and assess how well it can generalise to related concepts, for example, the word ``right\", in a similar grid world. We investigate a range of generative language models of varying sizes (including GPT-2 and GPT-3), and see that although the smaller models struggle to perform this mapping, the largest model can not only learn to ground the concepts that it is explicitly taught, but appears to generalise to several instances of unseen concepts as well. Our results suggest an alternative means of building grounded language models: rather than learning grounded representations ``from scratch'', it is possible that large text-only models learn a sufficiently rich conceptual structure that could allow them to be grounded in a data-efficient way.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Roma Patel;Ellie Pavlick", "authorids": "~Roma_Patel1;~Ellie_Pavlick1", "gender": "F;F", "homepage": "http://cs.brown.edu/people/rpatel59/;http://cs.brown.edu/people/epavlick/", "dblp": "168/1595;141/4059", "google_scholar": "16OCMAQAAAAJ;sFyrSa8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Roma_Patel1;~Ellie_Pavlick1", "aff": "Brown University;Brown University", "aff_domain": "brown.edu;brown.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\npatel2022mapping,\ntitle={Mapping Language Models to Grounded Conceptual Spaces},\nauthor={Roma Patel and Ellie Pavlick},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gJcEM8sxHK}\n}", "github": "", "project": "", "reviewers": "qf1D;e6Xz;TAjy;BFAr", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;4;4;4", "correctness": "3;2;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "111;127;104;104", "wc_summary_review": "126;48;157;25", "wc_main_review": "2012;586;360;246", "wc_review": "2249;761;621;375", "wc_reply_reviewers": "482;169;0;0", "wc_reply_authors": "1046;500;362;143", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 111.5, 
9.394147114027968 ], "wc_summary_review_avg": [ 89.0, 54.244815420462075 ], "wc_main_review_avg": [ 801.0, 709.7978585484743 ], "wc_review_avg": [ 1001.5, 733.3789947905517 ], "wc_reply_reviewers_avg": [ 162.75, 196.808758697371 ], "wc_reply_authors_avg": [ 512.75, 333.15114812949395 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2595983470898586330&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=gJcEM8sxHK", "email": "brown.edu;brown.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Brown University", "aff_unique_dep": "", "aff_unique_url": "https://www.brown.edu", "aff_unique_abbr": "Brown", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Equivariant Self-Supervised Learning: Encouraging Equivariance in Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6083", "id": "gKLAAfiytI", "poster": "", "openreview": "https://openreview.net/forum?id=gKLAAfiytI", "slides": "https://iclr.cc/virtual/2022/poster/6083", "video": "https://iclr.cc/virtual/2022/poster/6083", "author_site": "Rumen R Dangovski, Li Jing, Charlotte Loh, Seungwook Han, Akash Srivastava, Brian Cheung, Pulkit Agrawal, Marin Soljacic", "tldr": "", "abstract": "In state-of-the-art self-supervised learning (SSL), pre-training produces semantically good representations by encouraging them to be invariant under meaningful transformations prescribed from human knowledge. In fact, the property of invariance is a trivial instance of a broader class called equivariance, which can be intuitively understood as the property that representations transform according to the way the inputs transform. Here, we show that rather than using only invariance, pre-training that encourages non-trivial equivariance to some transformations, while maintaining invariance to other transformations, can be used to improve the semantic quality of representations. Specifically, we extend popular SSL methods to a more general framework which we name Equivariant Self-Supervised Learning (E-SSL). In E-SSL, a simple additional pre-training objective encourages equivariance by predicting the transformations applied to the input. We demonstrate E-SSL\u2019s effectiveness empirically on several popular computer vision benchmarks, e.g. improving SimCLR to 72.5% linear probe accuracy on ImageNet. Furthermore, we demonstrate the usefulness of E-SSL for applications beyond computer vision; in particular, we show its utility on regression problems in photonics science. 
Our code, datasets and pre-trained models are available at https://github.com/rdangovs/essl to aid further research in E-SSL.", "keywords": "self-supervised learning;contrastive learning;photonics science", "primary_area": "", "supplementary_material": "/attachment/3a397d7c2ec44e7256103f9d03c9f82aedb15e7c.zip", "author": "Rumen Dangovski;Li Jing;Charlotte Loh;Seungwook Han;Akash Srivastava;Brian Cheung;Pulkit Agrawal;Marin Soljacic", "authorids": "~Rumen_Dangovski1;~Li_Jing1;~Charlotte_Loh1;~Seungwook_Han1;~Akash_Srivastava1;~Brian_Cheung1;~Pulkit_Agrawal1;~Marin_Soljacic1", "gender": "M;M;F;;M;M;M;", "homepage": "http://super-ms.mit.edu/rumen.html;http://jingli.io/;;;http://akashgit.github.io;https://briancheung.github.io/;https://people.eecs.berkeley.edu/~pulkitag/;https://www.rle.mit.edu/marin/", "dblp": "207/8546;59/6222;217/6481;119/3428;24/9528;;149/2672;131/2044", "google_scholar": ";VhxDLwcAAAAJ;https://scholar.google.com/citations?hl=en;B6tpjKkAAAAJ;https://scholar.google.co.uk/citations?user=2h6SZeEAAAAJ;7N-ethYAAAAJ;UpZmJI0AAAAJ;", "orcid": ";;;;;;;", "linkedin": ";li-jing-568b3765/;;;https://uk.linkedin.com/in/akash-srivastava-aa97361b;;;", "or_profile": "~Rumen_Dangovski1;~Li_Jing1;~Charlotte_Loh1;~Seungwook_Han1;~Akash_Srivastava1;~Brian_Cheung1;~Pulkit_Agrawal1;~Marin_Soljacic1", "aff": "Massachusetts Institute of Technology;Facebook AI Research;Massachusetts Institute of Technology;MIT-IBM Watson AI Lab;MIT-IBM Watson AI Research Lab;Massachusetts Institute of Technology;Massachusetts Institute of Technology;", "aff_domain": "mit.edu;fb.com;mit.edu;ibm.com;ibm.com;mit.edu;mit.edu;", "position": "PhD student;Postdoc;PhD student;Researcher;Research Scientist;Research Fellow;Assistant Professor;", "bibtex": "@inproceedings{\ndangovski2022equivariant,\ntitle={Equivariant Self-Supervised Learning: Encouraging Equivariance in Representations},\nauthor={Rumen Dangovski and Li Jing and Charlotte Loh and Seungwook Han and Akash Srivastava and Brian Cheung and Pulkit Agrawal and Marin Soljacic},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gKLAAfiytI}\n}", "github": "", "project": "", "reviewers": "p4FW;tezr;7QWR;9Qn8", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;2;4;3", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "87;107;48;89", "wc_summary_review": "52;25;71;27", "wc_main_review": "532;119;198;676", "wc_review": "671;251;317;792", "wc_reply_reviewers": "23;39;0;275", "wc_reply_authors": "1119;529;531;2168", "reply_reviewers": "1;1;0;2", "reply_authors": "2;2;1;5", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 82.75, 21.52179128232592 ], "wc_summary_review_avg": [ 43.75, 18.9917745353087 ], "wc_main_review_avg": [ 381.25, 230.19489025606106 ], "wc_review_avg": [ 507.75, 228.99495081769817 ], "wc_reply_reviewers_avg": [ 84.25, 110.99859233341655 ], "wc_reply_authors_avg": [ 1086.75, 668.9702441065671 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": 
"https://openreview.net/pdf?id=gKLAAfiytI", "email": "mit.edu;fb.com;mit.edu;ibm.com;ibm.com;mit.edu;mit.edu;", "author_num": 8, "aff_unique_index": "0;1;0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Meta", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://web.mit.edu;https://research.facebook.com", "aff_unique_abbr": "MIT;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "gKWxifgJVP", "title": "Fact-driven Logical Reasoning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent years have witnessed an increasing interest in training machines with reasoning ability, which deeply relies on accurate, clearly presented clue forms that are usually modeled as entity-like knowledge in existing studies. However, in real hierarchical reasoning motivated machine reading comprehension, such one-sided modeling is insufficient for those indispensable local complete facts or events when only \"global\" knowledge is really paid attention to. Thus, in view of language being a complete knowledge/clue carrier, we propose a general formalism to support representing logic units by extracting backbone constituents of the sentence such as the subject-verb-object formed \"facts\", covering both global and local knowledge pieces that are necessary as the basis for logical reasoning. Beyond building the ad-hoc graphs, we propose a more general and convenient fact-driven approach to construct a supergraph on top of our newly defined fact units, benefiting from both sides of the connections between facts and internal knowledge such as concepts or actions inside a fact. Experiments on two challenging logical reasoning benchmarks show that our proposed model, \\textsc{Focal Reasoner}, outperforms the baseline models dramatically and achieves state-of-the-art results.", "keywords": "logical reasoning;machine reading comprehension;language understanding", "primary_area": "", "supplementary_material": "/attachment/34242ee7e1c8042ae4a943e0bb1a517a4b04cc9d.zip", "author": "Siru Ouyang;Zhuosheng Zhang;hai zhao", "authorids": "~Siru_Ouyang1;~Zhuosheng_Zhang1;~hai_zhao1", "gender": "F;M;M", "homepage": "https://ozyyshr.github.io;https://bcmi.sjtu.edu.cn/~zhangzs/;http://bcmi.sjtu.edu.cn/~zhaohai/", "dblp": "https://dblp.org/search/pid/api?q=author:Siru_Ouyang:;06/9708;25/1145-1.html", "google_scholar": "fetoihAAAAAJ;https://scholar.google.co.jp/citations?user=63LTQhgAAAAJ;https://scholar.google.com.tw/citations?user=4dU5KS0AAAAJ", "orcid": "0009-0001-1331-424X;0000-0002-4183-3645;", "linkedin": ";;", "or_profile": "~Siru_Ouyang1;~Zhuosheng_Zhang1;~hai_zhao1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "Undergrad student;PhD student;Full Professor", "bibtex": "@misc{\nouyang2022factdriven,\ntitle={Fact-driven Logical Reasoning},\nauthor={Siru Ouyang and Zhuosheng Zhang and hai zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=gKWxifgJVP}\n}", "github": "", "project": "", "reviewers": "MQGH;UKpM;fjem;vNnA", "site": "https://openreview.net/forum?id=gKWxifgJVP", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;4;3", "correctness": "3;4;3;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "51;140;136;103", "wc_summary_review": "13;137;170;60", "wc_main_review": "404;615;697;171", 
"wc_review": "468;892;1003;334", "wc_reply_reviewers": "15;300;624;27", "wc_reply_authors": "488;819;777;215", "reply_reviewers": "1;1;2;1", "reply_authors": "1;3;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 107.5, 35.64056677439347 ], "wc_summary_review_avg": [ 95.0, 61.92333970321691 ], "wc_main_review_avg": [ 471.75, 203.89871873064823 ], "wc_review_avg": [ 674.25, 280.0896061977309 ], "wc_reply_reviewers_avg": [ 241.5, 248.5160960581829 ], "wc_reply_authors_avg": [ 574.75, 243.67434723417236 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "gKprVaCyQmA", "title": "There are free lunches", "track": "main", "status": "Reject", "tldr": "", "abstract": "No-Free-Lunch Theorems state that the performance of all algorithms is the same when averaged over all possible tasks. It has been argued that the necessary conditions for NFL are too restrictive to be found in practice. There must be some information for a set of tasks that ensures some algorithms perform better than others. In this paper we propose a novel idea, \"There are free lunches\" (TAFL) Theorem, which states that some algorithms can achieve the best performance in all possible tasks, in the condition that tasks are given in a specific order. Furthermore, we point out that with the number of solved tasks increasing, the difficulty of solving a new task decreases. We also present an example to explain how to combine the proposed theorem and the existing supervised learning algorithms.", "keywords": "No-Free-Lunch Theorems", "primary_area": "", "supplementary_material": "", "author": "Zhuoran Xu;hao liu;bo dong", "authorids": "~Zhuoran_Xu1;liuhao163@jd.com;dongbo5@jd.com", "gender": "M;;", "homepage": "https://www.linkedin.com/in/xu-zhuoran-68ba0133/;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhuoran_Xu1;liuhao163@jd.com;dongbo5@jd.com", "aff": "JD. 
Inc;;", "aff_domain": "jd.com;;", "position": "Researcher;;", "bibtex": "@misc{\nxu2022there,\ntitle={There are free lunches},\nauthor={Zhuoran Xu and hao liu and bo dong},\nyear={2022},\nurl={https://openreview.net/forum?id=gKprVaCyQmA}\n}", "github": "", "project": "", "reviewers": "BuWJ;SgLo;tzHo;WGUc", "site": "https://openreview.net/forum?id=gKprVaCyQmA", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "3;3;2;2", "correctness": "1;2;2;3", "technical_novelty": "4;2;2;3", "empirical_novelty": "1;0;0;2", "wc_summary_paper": "127;117;105;44", "wc_summary_review": "31;3;93;39", "wc_main_review": "406;334;250;417", "wc_review": "564;454;448;500", "wc_reply_reviewers": "191;0;0;0", "wc_reply_authors": "613;0;0;0", "reply_reviewers": "2;0;0;0", "reply_authors": "2;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 98.25, 32.275183965393595 ], "wc_summary_review_avg": [ 41.5, 32.59984662540608 ], "wc_main_review_avg": [ 351.75, 66.83702192647425 ], "wc_review_avg": [ 491.5, 46.44082255946809 ], "wc_reply_reviewers_avg": [ 47.75, 82.70542606141389 ], "wc_reply_authors_avg": [ 153.25, 265.43678625993044 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 0.5, 0.8660254037844386 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865476, "corr_recommendation_correctness": 1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0", "aff_unique_norm": "JD. Inc", "aff_unique_dep": "", "aff_unique_url": "https://www.jd.com", "aff_unique_abbr": "JD", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "gLqnSGXVJ6l", "title": "Neural Combinatorial Optimization with Reinforcement Learning : Solving theVehicle Routing Problem with Time Windows", "track": "main", "status": "Reject", "tldr": "", "abstract": "In contrast to the classical techniques for solving combinatorial optimization problems, recent advancements in reinforcement learning yield the potential to independently learn heuristics without any human interventions. In this context, the current paper aims to present a complete framework for solving the vehicle routing problem with time windows (VRPTW) relying on neural networks and reinforcement learning. Our approach is mainly based on an attention model (AM) that predicts the near-optimal distribution over different problem instances. To optimize its parameters, this model is trained in a reinforcement learning(RL) environment using a stochastic policy gradient and through a real-time evaluation of the reward, quantity to meet the problem business and logical constraints. Using synthetic data, the proposed model outperforms some existing baselines. 
This performance comparison was based on the solution quality (total tour length) and the computation time (inference time) for small and medium-sized samples.", "keywords": "reinforcement learning;neural combinatorial optimization;vehicle routing problem with time windows;attention model", "primary_area": "", "supplementary_material": "", "author": "Abdelhakim Abdellaoui;Issmail El Hallaoui;Loubna Benabbou", "authorids": "~Abdelhakim_Abdellaoui1;issmail.elhallaoui@gerad.ca;loubna_benabbou@uqar.ca", "gender": "M;;", "homepage": "https://www.gerad.ca/fr/people/abdelhakim-abdellaoui;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Abdelhakim_Abdellaoui1;issmail.elhallaoui@gerad.ca;loubna_benabbou@uqar.ca", "aff": "Polytechnique Montreal;;", "aff_domain": "polymtl.ca;;", "position": "PhD student;;", "bibtex": "@misc{\nabdellaoui2022neural,\ntitle={Neural Combinatorial Optimization with Reinforcement Learning: Solving the Vehicle Routing Problem with Time Windows},\nauthor={Abdelhakim Abdellaoui and Issmail El Hallaoui and Loubna Benabbou},\nyear={2022},\nurl={https://openreview.net/forum?id=gLqnSGXVJ6l}\n}", "github": "", "project": "", "reviewers": "wGHF;RKZQ;CMcJ;kWGX", "site": "https://openreview.net/forum?id=gLqnSGXVJ6l", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;5;4;4", "correctness": "3;1;3;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "26;35;49;75", "wc_summary_review": "41;29;9;84", "wc_main_review": "303;406;386;202", "wc_review": "370;470;444;361", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 46.25, 18.511820547963403 ], "wc_summary_review_avg": [ 40.75, 27.462474396892937 ], "wc_main_review_avg": [ 324.25, 80.45612158189084 ], "wc_review_avg": [ 411.25, 46.77272175103775 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3Hedg2I6z9IJ:scholar.google.com/&scioq=Neural+Combinatorial+Optimization+with+Reinforcement+Learning+:+Solving+theVehicle+Routing+Problem+with+Time+Windows&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Polytechnique Montreal", "aff_unique_dep": "", "aff_unique_url": "https://www.polymtl.ca", "aff_unique_abbr": "PolyMTL", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "gLtMe3vpfZa", "title": "Accelerating Stochastic Simulation with Interactive Neural Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stochastic simulations such as large-scale, spatiotemporal, age-structured epidemic models are computationally expensive at fine-grained resolution. We propose Interactive Neural Process (INP), a Bayesian active learning framework to proactively learn a deep learning surrogate model and accelerate simulation. 
Our framework is based on the novel integration of neural process, deep sequence model and active learning. In particular, we develop a novel spatiotemporal neural process model to mimic the simulator dynamics. Our model automatically infers the latent process which describes the intrinsic uncertainty of the simulator. This also gives rise to a new acquisition function based on the latent information gain. We design Bayesian active learning algorithms to iteratively query the simulator, gather more data, and continuously improve the model. We perform theoretical analysis and demonstrate that our approach reduces sample complexity compared with random sampling in high dimension. Empirically, we demonstrate our framework can faithfully imitate the behavior of a complex infectious disease simulator with a small number of examples, enabling rapid simulation and scenario exploration.", "keywords": "neural processes;bayesian active learning;stochastic process;deep sequence model;epidemic modeling", "primary_area": "", "supplementary_material": "/attachment/6c8627acbb60c1d9c2bee462f45c481e0f2a8179.zip", "author": "Dongxia Wu;Matteo Chinazzi;Alessandro Vespignani;Yian Ma;Rose Yu", "authorids": "~Dongxia_Wu1;~Matteo_Chinazzi1;~Alessandro_Vespignani1;~Yian_Ma1;~Rose_Yu1", "gender": "M;;M;M;F", "homepage": "https://dongxiaw.github.io/online-cv/;;http://www.mobs-lab.org/alessandro-vespignani.html/;https://sites.google.com/view/yianma;http://roseyu.com", "dblp": ";;;;164/7314", "google_scholar": "jZb2e8cAAAAJ;d1URvWUAAAAJ;https://scholar.google.com.tw/citations?user=U3CXAPsAAAAJ;A0TFlacAAAAJ;", "orcid": ";;;;", "linkedin": "dongxia-wu-2021/;;;;", "or_profile": "~Dongxia_Wu1;~Matteo_Chinazzi1;~Alessandro_Vespignani1;~Yian_Ma1;~Rose_Yu1", "aff": "University of California, San Diego;Northeastern University;Northeastern University;University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;northeastern.edu;;ucsd.edu;ucsd.edu", "position": "Ph.D student;Researcher;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nwu2022accelerating,\ntitle={Accelerating Stochastic Simulation with Interactive Neural Processes},\nauthor={Dongxia Wu and Matteo Chinazzi and Alessandro Vespignani and Yian Ma and Rose Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=gLtMe3vpfZa}\n}", "github": "", "project": "", "reviewers": "XP3S;YBHd;TEvq;Epnf", "site": "https://openreview.net/forum?id=gLtMe3vpfZa", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;3", "correctness": "2;3;2;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "94;101;70;77", "wc_summary_review": "113;31;30;57", "wc_main_review": "1005;564;248;473", "wc_review": "1212;696;348;607", "wc_reply_reviewers": "543;0;0;116", "wc_reply_authors": "1649;536;261;531", "reply_reviewers": "2;0;0;2", "reply_authors": "4;2;2;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.5, 12.5 ], "wc_summary_review_avg": [ 57.75, 33.6851228289285 ], "wc_main_review_avg": [ 572.5, 274.92226173956885 ], "wc_review_avg": [ 715.75, 313.7358562549075 ], "wc_reply_reviewers_avg": [ 164.75, 223.4584692957508 ], "wc_reply_authors_avg": [ 744.25, 534.0755447499913 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 3.0, 1.0 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 
], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YboYcG-LJEAJ:scholar.google.com/&scioq=Accelerating+Stochastic+Simulation+with+Interactive+Neural+Processes&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "University of California, San Diego;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.northeastern.edu", "aff_unique_abbr": "UCSD;NEU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "gMJhuI6RGmv", "title": "Neural Face Identification in a 2D Wireframe Projection of a Manifold Object", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In computer-aided design (CAD) systems, 2D line drawings are commonly used to illustrate 3D object designs. To reconstruct the 3D models depicted by a single 2D line drawing, an important key is finding the edge loops in the line drawing which correspond to the actual faces of the 3D object. In this paper, we approach the classical problem of face identification from a novel data-driven point of view. We cast it as a sequence generation problem: starting from an arbitrary edge, we adopt a variant of the popular Transformer model to predict the edges associated with the same face in a natural order. This allows us to avoid searching the space of all possible edges loops with various hand-crafted rules and heuristics as most existing methods do, deal with challenging cases such as curved surfaces and nested edge loops, and leverage additional cues such as face types. 
We further discuss how possibly imperfect predictions can be used for 3D object reconstruction.", "keywords": "Face Identification;Wireframe;Line Drawing;3D Reconstruction", "primary_area": "", "supplementary_material": "/attachment/c8edb3522f90f863dfbad7ae6f633dd927758c55.zip", "author": "Kehan Wang;Jia Zheng;Zihan Zhou", "authorids": "~Kehan_Wang1;~Jia_Zheng1;~Zihan_Zhou2", "gender": "M;M;", "homepage": ";http://bertjiazheng.github.io;", "dblp": ";80/3062-2;00/6525-1", "google_scholar": ";0LeXf0YAAAAJ;vEcgp3AAAAAJ", "orcid": ";0000-0002-4617-428X;", "linkedin": "wang-kehan/;;", "or_profile": "~Kehan_Wang1;~Jia_Zheng1;~Zihan_Zhou2", "aff": "University of California, Berkeley;Manycore Tech Inc.;Manycore Tech Inc.", "aff_domain": "berkeley.edu;qunhemail.com;qunhemail.com", "position": "Undergrad student;Research Engineer;Chief Scientist", "bibtex": "@misc{\nwang2022neural,\ntitle={Neural Face Identification in a 2D Wireframe Projection of a Manifold Object},\nauthor={Kehan Wang and Jia Zheng and Zihan Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=gMJhuI6RGmv}\n}", "github": "", "project": "", "reviewers": "RLTq;Z5f9;GTus;dgfP;fMYd", "site": "https://openreview.net/forum?id=gMJhuI6RGmv", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;3;5;3;5", "correctness": "2;4;4;4;4", "technical_novelty": "2;3;3;3;4", "empirical_novelty": "2;3;2;3;4", "wc_summary_paper": "89;99;102;62;94", "wc_summary_review": "81;59;86;29;39", "wc_main_review": "296;289;592;585;253", "wc_review": "466;447;780;676;386", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 4.0, 0.8944271909999159 ], "correctness_avg": [ 3.6, 0.8000000000000002 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 89.2, 14.3024473430249 ], "wc_summary_review_avg": [ 58.8, 22.417850030723287 ], "wc_main_review_avg": [ 403.0, 152.17752790737535 ], "wc_review_avg": [ 551.0, 150.5536449243259 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9128709291752768, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5718507503258968760&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, Berkeley;Manycore Tech Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fast Regression for Structured Inputs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7050", "id": "gNp54NxHUPJ", "poster": "", "openreview": "https://openreview.net/forum?id=gNp54NxHUPJ", "slides": "https://iclr.cc/virtual/2022/poster/7050", "video": "https://iclr.cc/virtual/2022/poster/7050", "author_site": "Raphael Meyer, Cameron Musco, Christopher Musco, David Woodruff, Samson Zhou", "tldr": "", "abstract": "We study the $\\ell_p$ regression problem, which requires finding $\\mathbf{x}\\in\\mathbb R^{d}$ that minimizes $\\|\\mathbf{A}\\mathbf{x}-\\mathbf{b}\\|_p$ for a 
matrix $\\mathbf{A}\\in\\mathbb R^{n \\times d}$ and response vector $\\mathbf{b}\\in\\mathbb R^{n}$. There has been recent interest in developing subsampling methods for this problem that can outperform standard techniques when $n$ is very large. However, all known subsampling approaches have run time that depends exponentially on $p$, typically, $d^{\\mathcal{O}(p)}$, which can be prohibitively expensive. \n\nWe improve on this work by showing that for a large class of common \\emph{structured matrices}, such as combinations of low-rank matrices, sparse matrices, and Vandermonde matrices, there are subsampling based methods for $\\ell_p$ regression that depend polynomially on $p$. For example, we give an algorithm for $\\ell_p$ regression on Vandermonde matrices that runs in time $\\mathcal{O}(n\\log^3 n+(dp^2)^{0.5+\\omega}\\cdot\\text{polylog}\\,n)$, where $\\omega$ is the exponent of matrix multiplication. The polynomial dependence on $p$ crucially allows our algorithms to extend naturally to efficient algorithms for $\\ell_\\infty$ regression, via approximation of $\\ell_\\infty$ by $\\ell_{\\mathcal{O}(\\log n)}$. Of practical interest, we also develop a new subsampling algorithm for $\\ell_p$ regression for arbitrary matrices, which is simpler than previous approaches for $p \\ge 4$.", "keywords": "regression;sublinear time algorithm;structured input", "primary_area": "", "supplementary_material": "/attachment/33c6a51bd06a695c841c65487e83bf29502afc12.zip", "author": "Raphael A Meyer;Cameron N Musco;Christopher P Musco;David Woodruff;Samson Zhou", "authorids": "~Raphael_A_Meyer1;~Cameron_N_Musco1;~Christopher_P_Musco1;~David_Woodruff1;~Samson_Zhou1", "gender": "M;M;M;M;", "homepage": "https://ram900.com/;https://people.cs.umass.edu/~cmusco/;https://www.chrismusco.com/;http://www.cs.cmu.edu/~dwoodruf/;https://samsonzhou.github.io/", "dblp": "204/4381;149/2327;149/2243;w/DPWoodruff;179/2683", "google_scholar": "Xpi5HD0AAAAJ;EeYGZCwAAAAJ;HXXSrNMAAAAJ;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ;NpjsgocAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Raphael_A_Meyer1;~Cameron_N_Musco1;~Christopher_P_Musco1;~David_Woodruff1;~Samson_Zhou1", "aff": "New York University;University of Massachusetts, Amherst;New York University;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University", "aff_domain": "nyu.edu;umass.edu;nyu.edu;cmu.edu;cs.cmu.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nmeyer2022fast,\ntitle={Fast Regression for Structured Inputs},\nauthor={Raphael A Meyer and Cameron N Musco and Christopher P Musco and David Woodruff and Samson Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gNp54NxHUPJ}\n}", "github": "", "project": "", "reviewers": "JQBd;2tK5;KU5L", "pdf_size": 0, "recommendation": "6;8;10", "confidence": "3;3;4", "correctness": "4;4;4", "technical_novelty": "2;3;4", "empirical_novelty": "0;2;3", "wc_summary_paper": "64;329;116", "wc_summary_review": "41;36;49", "wc_main_review": "335;490;167", "wc_review": "440;855;332", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 8.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 
1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 169.66666666666666, 114.64825433569506 ], "wc_summary_review_avg": [ 42.0, 5.354126134736337 ], "wc_main_review_avg": [ 330.6666666666667, 131.8997936145298 ], "wc_review_avg": [ 542.3333333333334, 225.44228135427974 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15162984764355467989&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=gNp54NxHUPJ", "email": "nyu.edu;umass.edu;nyu.edu;cmu.edu;cs.cmu.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;2", "aff_unique_norm": "New York University;University of Massachusetts Amherst;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nyu.edu;https://www.umass.edu;https://www.cmu.edu", "aff_unique_abbr": "NYU;UMass Amherst;CMU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Amherst;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Compositional Training for End-to-End Deep AUC Maximization", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6071", "id": "gPvB4pdu_Z", "poster": "", "openreview": "https://openreview.net/forum?id=gPvB4pdu_Z", "slides": "https://iclr.cc/virtual/2022/poster/6071", "video": "https://iclr.cc/virtual/2022/poster/6071", "author_site": "Zhuoning Yuan, Zhishuai Guo, Nitesh Chawla, Tianbao Yang", "tldr": "", "abstract": "Recently, deep AUC maximization (DAM) has achieved great success in different domains (e.g., medical image classification). However, the end-to-end training for deep AUC maximization remains a challenging problem. Previous studies employ an ad-hoc two-stage approach that first trains the network by optimizing a traditional loss (e.g., cross-entropy loss) and then finetunes the network by optimizing an AUC loss. This is because training a deep neural network from scratch by maximizing an AUC loss usually does not yield satisfactory performance. This phenomenon can be attributed to the degraded feature representations learned by maximizing the AUC loss from scratch. To address this issue, we propose a novel compositional training framework for end-to-end DAM, namely compositional DAM. The key idea of compositional training is to minimize a compositional objective function, where the outer function corresponds to an AUC loss and the inner function represents a gradient descent step for minimizing a traditional loss, e.g., the cross-entropy (CE) loss. To optimize the non-standard compositional objective, we propose an efficient and provable stochastic optimization algorithm. The proposed algorithm enhances the capabilities of both robust feature learning and robust classifier learning by alternately taking a gradient descent step for the CE loss and for the AUC loss in a systematic way. We conduct extensive empirical studies on imbalanced benchmark and medical image datasets, which unanimously verify the effectiveness of the proposed method. 
Our results show that the compositional training approach dramatically improves both the feature representations and the testing AUC score compared with traditional deep learning approaches, and yields better performance than the two-stage approaches for DAM as well. The proposed method is implemented in our open-sourced library LibAUC (https://www.libauc.org) and code is available at https://github.com/Optimization-AI/LibAUC.", "keywords": "Compositional Training;Imbalanced Losses;AUC optimization;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Zhuoning Yuan;Zhishuai Guo;Nitesh Chawla;Tianbao Yang", "authorids": "~Zhuoning_Yuan1;~Zhishuai_Guo1;~Nitesh_Chawla1;~Tianbao_Yang1", "gender": "M;M;M;M", "homepage": "https://zhuoning.cc;https://zhishuaiguo.github.io;http://niteshchawla.nd.edu;https://people.tamu.edu/~tianbao-yang/publications.html", "dblp": "174/6855;221/2907;c/NiteshVChawla.html;56/7047", "google_scholar": "ZjJf6tYAAAAJ;sHow-tEAAAAJ;hDLBEhkAAAAJ;https://scholar.google.com.tw/citations?user=BCxFU0EAAAAJ", "orcid": ";;;", "linkedin": ";zhishuai-guo-5850671b3/;;", "or_profile": "~Zhuoning_Yuan1;~Zhishuai_Guo1;~Nitesh_Chawla1;~Tianbao_Yang1", "aff": "University of Iowa;University of Iowa;University of Notre Dame;University of Iowa", "aff_domain": "uiowa.edu;uiowa.edu;nd.edu;uiowa.edu", "position": "PhD student;MS student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nyuan2022compositional,\ntitle={Compositional Training for End-to-End Deep {AUC} Maximization},\nauthor={Zhuoning Yuan and Zhishuai Guo and Nitesh Chawla and Tianbao Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gPvB4pdu_Z}\n}", "github": "", "project": "", "reviewers": "TF22;qWty;LxEz", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;3;3", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "91;111;134", "wc_summary_review": "61;32;19", "wc_main_review": "581;105;341", "wc_review": "733;248;494", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1104;90;571", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 112.0, 17.568911937472585 ], "wc_summary_review_avg": [ 37.333333333333336, 17.55625877635159 ], "wc_main_review_avg": [ 342.3333333333333, 194.3284733526087 ], "wc_review_avg": [ 491.6666666666667, 198.00729503957393 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 588.3333333333334, 414.145170468306 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2279114602026987436&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=gPvB4pdu_Z", "email": "uiowa.edu;uiowa.edu;nd.edu;uiowa.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Iowa;University of Notre Dame", "aff_unique_dep": ";", "aff_unique_url": "https://www.uiowa.edu;https://www.nd.edu", "aff_unique_abbr": "UIowa;Notre Dame", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Provable Adaptation across Multiway Domains via Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6899", "id": "gRCCdgpVZf", "poster": "", "openreview": "https://openreview.net/forum?id=gRCCdgpVZf", "slides": "https://iclr.cc/virtual/2022/poster/6899", "video": "https://iclr.cc/virtual/2022/poster/6899", "author_site": "Zhili Feng, Shaobo Han, Simon Du", "tldr": "", "abstract": "This paper studies zero-shot domain adaptation where each domain is indexed on a multi-dimensional array, and we only have data from a small subset of domains. Our goal is to produce predictors that perform well on \\emph{unseen} domains. We propose a model which consists of a domain-invariant latent representation layer and a domain-specific linear prediction layer with a low-rank tensor structure. Theoretically, we present explicit sample complexity bounds to characterize the prediction error on unseen domains in terms of the number of domains with training data and the number of data per domain. To our knowledge, this is the first finite-sample guarantee for zero-shot domain adaptation. In addition, we provide experiments on two-way MNIST and four-way fiber sensing datasets to demonstrate the effectiveness of our proposed model.", "keywords": "Representation learning;tensor;statistical learning theory", "primary_area": "", "supplementary_material": "", "author": "Zhili Feng;Shaobo Han;Simon Shaolei Du", "authorids": "~Zhili_Feng1;~Shaobo_Han1;~Simon_Shaolei_Du1", "gender": ";M;M", "homepage": "https://zhilif.github.io/;https://shaobohan.net/;http://simonshaoleidu.com", "dblp": "189/7590;;176/5602", "google_scholar": "_lnL4aQAAAAJ;3L333oYAAAAJ;OttawxUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhili_Feng1;~Shaobo_Han1;~Simon_Shaolei_Du1", "aff": "Carnegie Mellon University;NEC Labs America;Meta Facebook", "aff_domain": "andrew.cmu.edu;nec-labs.com;fb.com", "position": "PhD student;Researcher;Visiting Professor", "bibtex": "@inproceedings{\nfeng2022provable,\ntitle={Provable Adaptation across Multiway Domains via Representation Learning},\nauthor={Zhili Feng and Shaobo Han and Simon Shaolei Du},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gRCCdgpVZf}\n}", "github": "", "project": "", "reviewers": "Qy5s;mkWr;5mcf;b2ge", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;4;3;2", "correctness": "4;4;4;3", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "64;108;209;57", "wc_summary_review": "33;47;46;55", "wc_main_review": "55;241;373;126", "wc_review": "152;396;628;238", "wc_reply_reviewers": "0;60;0;57", "wc_reply_authors": "114;583;305;125", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 109.5, 60.681545794417595 ], "wc_summary_review_avg": [ 45.25, 7.8859051477937525 ], "wc_main_review_avg": [ 198.75, 120.52463441139327 ], "wc_review_avg": [ 353.5, 181.03797944077922 ], "wc_reply_reviewers_avg": [ 29.25, 29.269224451631786 ], "wc_reply_authors_avg": [ 281.75, 189.73847132302927 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 
], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9393364366277244, "corr_recommendation_correctness": -0.7276068751089989, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15471581355973678443&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=gRCCdgpVZf", "email": "andrew.cmu.edu;nec-labs.com;fb.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Carnegie Mellon University;NEC Labs America;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.cmu.edu;https://www.nec-labs.com;https://meta.com", "aff_unique_abbr": "CMU;NEC LA;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "PolyLoss: A Polynomial Expansion Perspective of Classification Loss Functions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6089", "id": "gSdSJoenupI", "poster": "", "openreview": "https://openreview.net/forum?id=gSdSJoenupI", "slides": "https://iclr.cc/virtual/2022/poster/6089", "video": "https://iclr.cc/virtual/2022/poster/6089", "author_site": "Zhaoqi Leng, Mingxing Tan, Chenxi Liu, Ekin Cubuk, Jay Shi, Shuyang Cheng, Dragomir Anguelov", "tldr": "", "abstract": "Cross-entropy loss and focal loss are the most common choices when training deep neural networks for classification problems. Generally speaking, however, a good loss function can take on much more flexible forms, and should be tailored for different tasks and datasets. Motivated by how functions can be approximated via Taylor expansion, we propose a simple framework, named PolyLoss, to view and design loss functions as a linear combination of polynomial functions. Our PolyLoss allows the importance of different polynomial bases to be easily adjusted depending on the targeting tasks and datasets, while naturally subsuming the aforementioned cross-entropy loss and focal loss as special cases. Extensive experimental results show that the optimal choice within the PolyLoss is indeed dependent on the task and dataset. 
Simply by introducing one extra hyperparameter and adding one line of code, our Poly-1 formulation outperforms the cross-entropy loss and focal loss on 2D image classification, instance segmentation, object detection, and 3D object detection tasks, sometimes by a large margin.", "keywords": "classification;computer vision;loss", "primary_area": "", "supplementary_material": "", "author": "Zhaoqi Leng;Mingxing Tan;Chenxi Liu;Ekin Dogus Cubuk;Jay Shi;Shuyang Cheng;Dragomir Anguelov", "authorids": "~Zhaoqi_Leng1;~Mingxing_Tan3;~Chenxi_Liu1;~Ekin_Dogus_Cubuk1;~Jay_Shi1;~Shuyang_Cheng1;~Dragomir_Anguelov1", "gender": "M;M;;M;M;F;M", "homepage": "https://waymo.com/research/;;;;;https://www.linkedin.com/in/shuyang-cheng-27326257/;", "dblp": "262/3268;11/7863;146/8008;83/7734;;https://www.linkedin.com/in/shuyang-cheng-27326257/;a/DragomirAnguelov", "google_scholar": ";6POeyBoAAAAJ;;Mu_8iOEAAAAJ;;https://www.linkedin.com/in/shuyang-cheng-27326257/;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;https://www.linkedin.com/in/shuyang-cheng-27326257/;", "linkedin": ";mingxing-tan-2724551b/;;ekin-dogus-cubuk-9148b8114/;xiaojie-jay-shi-b4391122;shuyang-cheng-27326257/;dragomiranguelov/", "or_profile": "~Zhaoqi_Leng1;~Mingxing_Tan3;~Chenxi_Liu1;~Ekin_Dogus_Cubuk1;~Jay_Shi1;~Shuyang_Cheng1;~Dragomir_Anguelov1", "aff": "Waymo LLC;Google/Waymo;Waymo;Google;Google;Waymo LLC;Waymo", "aff_domain": "waymo.com;google.com;waymo.com;google.com;google.com;waymo.com;waymo.com", "position": "Researcher;Researcher;Researcher;Staff Research Scientist;Software Engineer;Software Engineer;Researcher", "bibtex": "@inproceedings{\nleng2022polyloss,\ntitle={PolyLoss: A Polynomial Expansion Perspective of Classification Loss Functions},\nauthor={Zhaoqi Leng and Mingxing Tan and Chenxi Liu and Ekin Dogus Cubuk and Jay Shi and Shuyang Cheng and Dragomir Anguelov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gSdSJoenupI}\n}", "github": "", "project": "", "reviewers": "Vgzk;Zps3;KpJa;kfqh", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;2;3;5", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "81;62;44;110", "wc_summary_review": "26;50;65;46", "wc_main_review": "110;132;238;74", "wc_review": "217;244;347;230", "wc_reply_reviewers": "0;0;57;0", "wc_reply_authors": "502;868;1206;106", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 74.25, 24.437420076595647 ], "wc_summary_review_avg": [ 46.75, 13.91716565971678 ], "wc_main_review_avg": [ 138.5, 61.06349154773251 ], "wc_review_avg": [ 259.5, 51.41254710671316 ], "wc_reply_reviewers_avg": [ 14.25, 24.681724007856502 ], "wc_reply_authors_avg": [ 670.5, 410.1277240080217 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.7745966692414834, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 239, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11188190542633762391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=gSdSJoenupI", "email": 
"waymo.com;google.com;waymo.com;google.com;google.com;waymo.com;waymo.com", "author_num": 7, "aff_unique_index": "0;1;0;1;1;0;0", "aff_unique_norm": "Waymo;Google", "aff_unique_dep": ";Waymo", "aff_unique_url": "https://www.waymo.com;https://www.google.com", "aff_unique_abbr": "Waymo;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "gTdmGt48ht1", "title": "On the Double Descent of Random Features Models Trained with SGD", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "We study generalization properties of random features (RF) regression in high dimensions optimized by stochastic gradient descent (SGD). In this regime, we derive precise non-asymptotic error bounds of RF regression under both constant and adaptive step-size SGD setting, and observe the double descent phenomenon both theoretically and empirically. Our analysis shows how to cope with multiple randomness sources of initialization, label noise, and data sampling (as well as stochastic gradients) with no closed-form solution, and also goes beyond the commonly-used Gaussian/spherical data assumption. Our theoretical results demonstrate that, with SGD training, RF regression still generalizes well in the interpolation setting, and is able to characterize the double descent behavior by the unimodality of variance and monotonic decrease of bias. Besides, we also prove that the constant step-size SGD setting incurs no loss in convergence rate when compared to the exact minimal-norm interpolator, as a theoretical justification of using SGD in practice.", "keywords": "random features;over-parameterized model;double descent;SGD", "primary_area": "", "supplementary_material": "", "author": "Fanghui Liu;Johan Suykens;Volkan Cevher", "authorids": "~Fanghui_Liu1;~Johan_Suykens1;~Volkan_Cevher1", "gender": "M;M;M", "homepage": "http://www.lfhsgre.org;https://www.kuleuven.be/wieiswie/nl/person/00015385;http://lions.epfl.ch", "dblp": "119/1038;61/3224;70/5301", "google_scholar": "AKxBgssAAAAJ;https://scholar.google.be/citations?user=WtBmh0UAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": "0000-0003-4133-7921;0000-0002-8846-6352;", "linkedin": ";;", "or_profile": "~Fanghui_Liu1;~Johan_Suykens1;~Volkan_Cevher1", "aff": "\u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL);KU Leuven;Swiss Institute of Technology", "aff_domain": "epfl.ch;kuleuven.be;epfl.ch", "position": "Postdoc;Full Professor;Associate Professor", "bibtex": "@misc{\nliu2022on,\ntitle={On the Double Descent of Random Features Models Trained with {SGD}},\nauthor={Fanghui Liu and Johan Suykens and Volkan Cevher},\nyear={2022},\nurl={https://openreview.net/forum?id=gTdmGt48ht1}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=gTdmGt48ht1", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], 
"wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6702164791376769682&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "EPFL;Katholieke Universiteit Leuven;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.kuleuven.be;https://www.ethz.ch", "aff_unique_abbr": "EPFL;KU Leuven;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;Belgium" }, { "id": "gULyf2IVll0", "title": "Empirical Study of the Decision Region and Robustness in Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In general, the Deep Neural Networks (DNNs) is evaluated by the generalization performance measured on the unseen data excluded from the training phase. Along with the development of DNNs, the generalization performance converges to the state-of-the-art and it becomes difficult to evaluate DNNs solely based on the generalization performance. The robustness against the adversarial attack has been used as an additional metric to evaluate DNNs by measuring the vulnerability of them. However, few researches have been performed to analyze the adversarial robustness in terms of the geometry in DNNs. In this work, we perform empirical study to analyze the internal properties of DNNs which affect model robustness under adversarial attacks. Especially, we propose the novel concept Populated Region Set (PRS) where train samples populated more frequently to represent the internal properties of DNNs in the practical setting. 
From the systematic experiments with the proposed concept, we provide empirical evidence to validate that a low PRS ratio has a strong relationship with the adversarial robustness of DNNs.", "keywords": "Decision Region;Adversarial Robustness;Deep Neural Networks", "primary_area": "", "supplementary_material": "/attachment/e2a2474db0b7cfb9f03bcde4b1bc3d106026de94.zip", "author": "Seongjin Park;Haedong Jeong;Giyoung Jeon;Jaesik Choi", "authorids": "~Seongjin_Park1;~Haedong_Jeong1;~Giyoung_Jeon1;~Jaesik_Choi1", "gender": "Not Specified;M;;M", "homepage": ";;;https://sailab.kaist.ac.kr/jaesik", "dblp": ";237/4766;;13/1402", "google_scholar": "uDA5q3QAAAAJ;3Ey5CcgAAAAJ;;RqMLVzUAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Seongjin_Park1;~Haedong_Jeong1;~Giyoung_Jeon1;~Jaesik_Choi1", "aff": "Samsung;Ulsan National Institute of Science and Technology;;Korea Advanced Institute of Science & Technology", "aff_domain": "samsung.com;unist.ac.kr;;kaist.ac.kr", "position": "Researcher;PhD student;;Associate Professor", "bibtex": "@misc{\npark2022empirical,\ntitle={Empirical Study of the Decision Region and Robustness in Deep Neural Networks},\nauthor={Seongjin Park and Haedong Jeong and Giyoung Jeon and Jaesik Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=gULyf2IVll0}\n}", "github": "", "project": "", "reviewers": "pYmY;5u8a;Yvre", "site": "https://openreview.net/forum?id=gULyf2IVll0", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;2", "correctness": "3;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "69;167;117", "wc_summary_review": "8;75;51", "wc_main_review": "269;690;175", "wc_review": "346;932;343", "wc_reply_reviewers": "0;454;0", "wc_reply_authors": "2208;4289;340", "reply_reviewers": "0;7;0", "reply_authors": "4;7;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 117.66666666666667, 40.01110956832976 ], "wc_summary_review_avg": [ 44.666666666666664, 27.716822007983207 ], "wc_main_review_avg": [ 378.0, 223.93004860149222 ], "wc_review_avg": [ 540.3333333333334, 276.95286402001494 ], "wc_reply_reviewers_avg": [ 151.33333333333334, 214.01765243912837 ], "wc_reply_authors_avg": [ 2279.0, 1612.9540187701157 ], "reply_reviewers_avg": [ 2.3333333333333335, 3.2998316455372216 ], "reply_authors_avg": [ 4.0, 2.449489742783178 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2KV0KoAGXoUJ:scholar.google.com/&scioq=Empirical+Study+of+the+Decision+Region+and+Robustness+in+Deep+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Samsung;Ulsan National Institute of Science and Technology;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Samsung;;", "aff_unique_url": "https://www.samsung.com;https://www.unist.ac.kr;https://www.kaist.ac.kr", "aff_unique_abbr": "Samsung;UNIST;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Rethinking Adversarial Transferability from a Data Distribution Perspective", 
"status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7004", "id": "gVRhIEajG1k", "poster": "", "openreview": "https://openreview.net/forum?id=gVRhIEajG1k", "slides": "https://iclr.cc/virtual/2022/poster/7004", "video": "https://iclr.cc/virtual/2022/poster/7004", "author_site": "Yao Zhu, Jiacheng Sun, Zhenguo Li", "tldr": "", "abstract": "Adversarial transferability enables attackers to generate adversarial examples from the source model to attack the target model, which has raised security concerns about the deployment of DNNs in practice. In this paper, we rethink adversarial transferability from a data distribution perspective and further enhance transferability by score matching based optimization. We identify that some samples with injecting small Gaussian noise can fool different target models, and their adversarial examples under different source models have much stronger transferability. We hypothesize that these samples are in the low-density region of the ground truth distribution where models are not well trained. To improve the attack success rate of adversarial examples, we match the adversarial attacks with the directions which effectively decrease the ground truth density. We propose Intrinsic Adversarial Attack (IAA), which smooths the activation function and decreases the impact of the later layers of a given normal model, to increase the alignment of adversarial attack and the gradient of joint data distribution. We conduct comprehensive transferable attacks against multiple DNNs and show that our IAA can boost the transferability of the crafted attacks in all cases and go beyond state-of-the-art methods.", "keywords": "Adversarial Attack;Adversarial Transferability;Black-box Attack", "primary_area": "", "supplementary_material": "", "author": "Yao Zhu;Jiacheng Sun;Zhenguo Li", "authorids": "~Yao_Zhu2;~Jiacheng_Sun1;~Zhenguo_Li1", "gender": "M;M;M", "homepage": ";;http://www.ee.columbia.edu/~zgli/", "dblp": ";165/5350;23/6479", "google_scholar": "Te8bmo0AAAAJ;;XboZC1AAAAAJ", "orcid": "0000-0003-0991-1970;;", "linkedin": ";https://www.linkedin.cn/incareer/in/jiacheng-sun-ab622b131;", "or_profile": "~Yao_Zhu2;~Jiacheng_Sun1;~Zhenguo_Li1", "aff": "Zhejiang University;Huawei Noah's Ark Lab;Huawei Noah's Ark Lab", "aff_domain": "zju.edu.cn;huawei.com;huawei.com", "position": "PhD student;Senior Researcher;Principal Researcher", "bibtex": "@inproceedings{\nzhu2022rethinking,\ntitle={Rethinking Adversarial Transferability from a Data Distribution Perspective},\nauthor={Yao Zhu and Jiacheng Sun and Zhenguo Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gVRhIEajG1k}\n}", "github": "", "project": "", "reviewers": "cPB9;wLmf;f4GM", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "69;69;103", "wc_summary_review": "57;160;88", "wc_main_review": "111;377;281", "wc_review": "237;606;472", "wc_reply_reviewers": "0;212;0", "wc_reply_authors": "844;1429;641", "reply_reviewers": "0;1;0", "reply_authors": "2;3;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 80.33333333333333, 16.027753706895076 ], "wc_summary_review_avg": [ 101.66666666666667, 43.14574782705192 ], 
"wc_main_review_avg": [ 256.3333333333333, 109.98585767674355 ], "wc_review_avg": [ 438.3333333333333, 152.51302312334585 ], "wc_reply_reviewers_avg": [ 70.66666666666667, 99.93775840769871 ], "wc_reply_authors_avg": [ 971.3333333333334, 334.06220312324405 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15539129257129277210&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=gVRhIEajG1k", "email": "zju.edu.cn;huawei.com;huawei.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Zhejiang University;Huawei", "aff_unique_dep": ";Noah's Ark Lab", "aff_unique_url": "https://www.zju.edu.cn;https://www.huawei.com", "aff_unique_abbr": "ZJU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "gWGexz8hFH", "title": "Distributed Skellam Mechanism: a Novel Approach to Federated Learning with Differential Privacy", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks have strong capabilities of memorizing the underlying training data; on the flip side, unintended data memorization can be a serious privacy concern. An effective and rigorous approach to addressing this problem is to train models with \\textit{differential privacy} (\\textit{DP}), which provides information-theoretic privacy guarantees by injecting random noise to the gradients. This paper focuses on the scenario where sensitive data are distributed among individual participants, who jointly train a model through \\textit{federated learning}, using both \\textit{secure multiparty computation} (\\textit{MPC}) to ensure the confidentiality of individual gradient updates, and differential privacy to avoid data leakage in the resulting model. We point out a major challenge in this problem setting: that common mechanisms for enforcing DP in deep learning, which require injecting \\textit{real-valued noise}, are fundamentally incompatible with MPC, which exchanges \\textit{finite-field integers} among the participants. Consequently, existing DP mechanisms require rather high noise levels, leading to poor model utility.\n\nMotivated by this, we design and develop \\textit{distributed Skellam mechanism} ({\\sf DSM}), a novel solution for enforcing differential privacy on models built through an MPC-based federated learning process. Compared to existing approaches, {\\sf DSM} has the advantage that its privacy guarantee is independent of the dimensionality of the gradients; further, {\\sf DSM} allows tight privacy accounting due to the nice composition and sub-sampling properties of the Skellam distribution, which are key to accurate deep learning with DP. The theoretical analysis of {\\sf DSM} is highly non-trivial, especially considering (i) the complicated math of differentially private deep learning in general and (ii) the fact that the Skellam distribution is rather complex, and to our knowledge, has not been applied to an iterative and sampling-based process, i.e., stochastic gradient descent. 
Meanwhile, through extensive experiments on various practical settings, we demonstrate that {\\sf DSM} consistently outperforms existing solutions in terms of model utility by a large margin.", "keywords": "Differential Privacy;Federated Learning;Skellam Distribution;Renyi Divergence", "primary_area": "", "supplementary_material": "", "author": "Ergute Bao;Yizheng Zhu;Xiaokui Xiao;Yin Yang;Beng Chin Ooi;Benjamin Hong Meng Tan;Khin Mi Mi Aung", "authorids": "~Ergute_Bao1;~Yizheng_Zhu2;~Xiaokui_Xiao2;~Yin_Yang3;~Beng_Chin_Ooi1;~Benjamin_Hong_Meng_Tan1;~Khin_Mi_Mi_Aung1", "gender": ";M;;M;M;M;", "homepage": ";https://homepage.zhuyizheng.club/;;https://www.hbku.edu.qa/en/cse/staff/david-yin-yang;http://www.comp.nus.edu.sg/~ooibc/;;", "dblp": ";;;56/2998-1;o/BengChinOoi;;", "google_scholar": ";;;r1VFrkAAAAAJ;https://scholar.google.com.tw/citations?user=9560QjYAAAAJ;fGfY1iQAAAAJ;https://scholar.google.com.sg/citations?user=mrjfgDEAAAAJ", "orcid": ";;;;0000-0003-4446-1100;;", "linkedin": ";;;;beng-chin-ooi-34b0634/;;", "or_profile": "~Ergute_Bao1;~Yizheng_Zhu2;~Xiaokui_Xiao2;~Yin_Yang3;~Beng_Chin_Ooi1;~Benjamin_Hong_Meng_Tan1;~Khin_Mi_Mi_Aung1", "aff": ";National University of Singapore;;Hamad Bin Khalifa University;National University of Singapore;;", "aff_domain": ";nus.edu.sg;;hbku.edu.qa;comp.nus.edu.sg;;", "position": ";Researcher;;Full Professor;Full Professor;;", "bibtex": "@misc{\nbao2022distributed,\ntitle={Distributed Skellam Mechanism: a Novel Approach to Federated Learning with Differential Privacy},\nauthor={Ergute Bao and Yizheng Zhu and Xiaokui Xiao and Yin Yang and Beng Chin Ooi and Benjamin Hong Meng Tan and Khin Mi Mi Aung},\nyear={2022},\nurl={https://openreview.net/forum?id=gWGexz8hFH}\n}", "github": "", "project": "", "reviewers": "ecND;85EN;VptB;QVkQ", "site": "https://openreview.net/forum?id=gWGexz8hFH", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;4", "correctness": "2;4;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "28;57;112;196", "wc_summary_review": "53;27;38;24", "wc_main_review": "156;86;332;229", "wc_review": "237;170;482;449", "wc_reply_reviewers": "0;47;112;0", "wc_reply_authors": "361;676;796;458", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 98.25, 63.99365202893174 ], "wc_summary_review_avg": [ 35.5, 11.368817000902073 ], "wc_main_review_avg": [ 200.75, 91.09713222709044 ], "wc_review_avg": [ 334.5, 133.6347634412543 ], "wc_reply_reviewers_avg": [ 39.75, 45.91500299466396 ], "wc_reply_authors_avg": [ 572.75, 172.1240468383195 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7526178090063818, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8855063065743040592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;Hamad Bin Khalifa University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.hbku.edu.qa", "aff_unique_abbr": "NUS;HBKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": 
"Singapore;Qatar" }, { "id": "gX9Ub6AwAd", "title": "ANOMALY DETECTION WITH FRAME-GROUP ATTENTION IN SURVEILLANCE VIDEOS", "track": "main", "status": "Reject", "tldr": "", "abstract": "The paper proposes an end-to-end abnormal behavior detection network to detect strenuous movements in slow moving crowds, such as running, bicycling, throwing from a height. The algorithm forms continuous video frames into a frame group and uses the frame-group feature extractor to obtain the spatio-temporal information. The implicit vector based attention mechanism will work on the extracted frame-group features to highlight the important features. We use fully connected layers to transform the space and reduce the computation. Finally, the group-pooling maps the processed frame-group features to the abnormal scores. The network input is flexible to cope with the form of video streams, and the network output is the abnormal score. The designed compound loss function will help the model improve the classification performance. This paper arranges several commonly used anomaly detection datasets and tests the algorithms on the integrated dataset. The experimental results show that the proposed algorithm has significant advantages in many objective metrics compared with other anomaly detection algorithms.", "keywords": "Anomaly detection;attention mechanism;frame-group;spatial-temporal feature", "primary_area": "", "supplementary_material": "", "author": "Jinsheng Xiao;Haowen Guo;Yuanxu Wu;Yunhua Chen;Honggang Xie", "authorids": "~Jinsheng_Xiao1;~Haowen_Guo1;whuwuyuanxu@whu.edu.cn;yhchen@gdut.edu.cn;~Honggang_Xie1", "gender": "M;M;;;", "homepage": "http://jszy.whu.edu.cn/xiaojinsheng/en/index.htm;http://www.whughw.com;;;", "dblp": "28/6940;;;;", "google_scholar": ";;;;", "orcid": "0000-0002-5403-1895;;;;", "linkedin": ";;;;", "or_profile": "~Jinsheng_Xiao1;~Haowen_Guo1;whuwuyuanxu@whu.edu.cn;yhchen@gdut.edu.cn;~Honggang_Xie1", "aff": "Wuhan University;Wuhan University;;;", "aff_domain": "whu.edu.cn;whu.edu.cn;;;", "position": "Associate Professor;MS student;;;", "bibtex": "@misc{\nxiao2022anomaly,\ntitle={{ANOMALY} {DETECTION} {WITH} {FRAME}-{GROUP} {ATTENTION} {IN} {SURVEILLANCE} {VIDEOS}},\nauthor={Jinsheng Xiao and Haowen Guo and Yuanxu Wu and Yunhua Chen and Honggang Xie},\nyear={2022},\nurl={https://openreview.net/forum?id=gX9Ub6AwAd}\n}", "github": "", "project": "", "reviewers": "5zpd;HXRz;uQtZ;gGwS", "site": "https://openreview.net/forum?id=gX9Ub6AwAd", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "5;4;4;3", "correctness": "3;2;3;2", "technical_novelty": "1;1;1;2", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "20;31;77;100", "wc_summary_review": "25;4;15;43", "wc_main_review": "335;113;226;320", "wc_review": "380;148;318;463", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 57.0, 32.76430985081175 ], "wc_summary_review_avg": [ 21.75, 14.341809509263467 ], "wc_main_review_avg": [ 248.5, 88.68624470570394 ], "wc_review_avg": [ 327.25, 115.57113610240232 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 
-0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:F5p1MCZpQxYJ:scholar.google.com/&scioq=ANOMALY+DETECTION+WITH+FRAME-GROUP+ATTENTION+IN+SURVEILLANCE+VIDEOS&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn/", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "gaYko_Y2_l", "title": "Weakly Supervised Graph Clustering", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Clustering, which clusters the nodes of a graph given its collection of node features and edge connections in an unsupervised manner, has long been researched in graph learning and is essential in certain applications. While this task is common, more complex cases arise in practice\u2014can we cluster nodes better with some graph-level side information or in a weakly supervised manner, for example, by identifying potential fraud users in a social network given additional labels of fraud communities? This raises an interesting problem, which we define as Weakly Supervised Graph Clustering (WSGC). In this paper, we first formally discuss the various possible settings of WSGC. Building on this discussion, we investigate a particular task of weakly supervised graph clustering by making use of the graph labels and node features, with the assistance of a hierarchical graph that further characterizes the connections between different graphs. To address this task, we propose Gaussian Mixture Graph Convolutional Network (GMGCN), a simple yet effective framework for learning node representations under the supervision of graph labels guided by a proposed consensus loss and then inferring the category of each node via a Gaussian Mixture Layer (GML). Extensive experiments are conducted to test the rationality of the formulation of weakly supervised graph clustering.
The experimental results show that, with the assistance of graph labels, the weakly supervised graph clustering method achieves a substantial improvement over the traditional graph clustering method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tian Bian;Tingyang Xu;Yu Rong;Wenbing Huang;Xi Xiao;Peilin Zhao;Junzhou Huang;Hong Cheng", "authorids": "~Tian_Bian1;~Tingyang_Xu1;~Yu_Rong1;~Wenbing_Huang1;~Xi_Xiao1;~Peilin_Zhao2;~Junzhou_Huang2;~Hong_Cheng1", "gender": ";M;M;M;M;;M;F", "homepage": ";;https://royrong.me/;https://gsai.ruc.edu.cn/english/wenbing_huang;https://www.sigs.tsinghua.edu.cn/xx_en/main.htm;;http://ranger.uta.edu/~huang/;https://www1.se.cuhk.edu.hk/~hcheng/", "dblp": ";157/0940;24/10036-1;155/3181-1.html;;84/8411;22/1170.html;85/5637-1", "google_scholar": ";6gIs5YMAAAAJ;https://scholar.google.com.hk/citations?user=itezhEMAAAAJ;0yNkmO4AAAAJ;;https://scholar.google.com.hk/citations?user=HPeX_YcAAAAJ;https://scholar.google.com.tw/citations?user=X7KrguAAAAAJ;https://scholar.google.com.hk/citations?user=s3lQL7YAAAAJ", "orcid": ";0009-0002-0106-8376;0000-0001-7387-302X;;;0000-0001-8543-3953;0000-0002-9548-1227;0000-0002-4673-2587", "linkedin": ";;;;;;;", "or_profile": "~Tian_Bian1;~Tingyang_Xu1;~Yu_Rong1;~Wenbing_Huang1;~Xi_Xiao1;~Peilin_Zhao2;~Junzhou_Huang2;~Hong_Cheng1", "aff": ";Tencent AI Lab;Tencent AI Lab;Tsinghua University;Shenzhen International Graduate School, Tsinghua University;Tencent;University of Texas, Arlington;The Chinese University of Hong Kong", "aff_domain": ";tencent.com;tencent.com;tsinghua.edu.cn;tsinghua.edu.cn;tencent.com;uta.edu;cuhk.edu.hk", "position": ";Researcher;Senior Researcher;Researcher;Associate Professor;Researcher;Full Professor;Professor", "bibtex": "@misc{\nbian2022weakly,\ntitle={Weakly Supervised Graph Clustering},\nauthor={Tian Bian and Tingyang Xu and Yu Rong and Wenbing Huang and Xi Xiao and Peilin Zhao and Junzhou Huang and Hong Cheng},\nyear={2022},\nurl={https://openreview.net/forum?id=gaYko_Y2_l}\n}", "github": "", "project": "", "reviewers": "PUKs;r2ym;oDis;Khg3;inpd", "site": "https://openreview.net/forum?id=gaYko_Y2_l", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "3;3;4;2;2", "correctness": "3;3;2;4;3", "technical_novelty": "3;2;2;3;3", "empirical_novelty": "3;2;2;3;3", "wc_summary_paper": "40;65;48;40;151", "wc_summary_review": "49;60;50;40;47", "wc_main_review": "44;288;266;240;212", "wc_review": "133;413;364;320;410", "wc_reply_reviewers": "4;0;12;117;43", "wc_reply_authors": "663;1224;1715;1058;438", "reply_reviewers": "1;0;1;1;1", "reply_authors": "5;4;4;3;3", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 2.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 68.8, 42.10178143499393 ], "wc_summary_review_avg": [ 49.2, 6.43117407632541 ], "wc_main_review_avg": [ 210.0, 86.81013765684283 ], "wc_review_avg": [ 328.0, 103.28020139407165 ], "wc_reply_reviewers_avg": [ 35.2, 43.59541260270397 ], "wc_reply_authors_avg": [ 1019.6, 445.4788883886643 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 3.8, 0.7483314773547882 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.8728715609439693, "corr_recommendation_correctness": 0.6454972243679027, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;1;1;0;2;3",
"aff_unique_norm": "Tencent;Tsinghua University;University of Texas at Arlington;Chinese University of Hong Kong", "aff_unique_dep": "Tencent AI Lab;;;", "aff_unique_url": "https://ai.tencent.com;https://www.tsinghua.edu.cn;https://www.uta.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "Tencent AI Lab;THU;UTA;CUHK", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Shenzhen;Arlington;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Constrained Physical-Statistics Models for Dynamical System Identification and Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6791", "id": "gbe1zHyA73", "poster": "", "openreview": "https://openreview.net/forum?id=gbe1zHyA73", "slides": "https://iclr.cc/virtual/2022/poster/6791", "video": "https://iclr.cc/virtual/2022/poster/6791", "author_site": "J\u00e9r\u00e9mie DONA, Marie D\u00e9chelle, patrick gallinari, Marina Levy", "tldr": "", "abstract": "Modeling dynamical systems combining prior physical knowledge and machine learning (ML) is promising in scientific problems when the underlying processes are not fully understood, e.g. when the dynamics is partially known. A common practice to identify the respective parameters of the physical and ML components is to formulate the problem as supervised learning on observed trajectories. However, this formulation leads to an infinite number of possible decompositions. To solve this ill-posedness, we reformulate the learning problem by introducing an upper bound on the prediction error of a physical-statistical model. This allows us to control the contribution of both the physical and statistical components to the overall prediction. This framework generalizes several existing hybrid schemes proposed in the literature. We provide theoretical guarantees on the well-posedness of our formulation along with a proof of convergence in a simple affine setting. 
For more complex dynamics, we validate our framework experimentally.", "keywords": "Deep Learning;Hybrid Models;Differential Equations", "primary_area": "", "supplementary_material": "/attachment/a49c549a0ebe0ad2f87ddb84d81897e9d9af0287.zip", "author": "J\u00e9r\u00e9mie DONA;Marie D\u00e9chelle;patrick gallinari;Marina Levy", "authorids": "~J\u00e9r\u00e9mie_DONA1;~Marie_D\u00e9chelle1;~patrick_gallinari1;~Marina_Levy1", "gender": ";F;M;", "homepage": "https://www.lip6.fr/actualite/personnes-fiche.php?ident=D2261;https://www.lip6.fr/actualite/personnes-fiche.php?ident=D2349;;http://pagesperso.locean-ipsl.upmc.fr/marina/", "dblp": "271/7866;;g/PatrickGallinari;", "google_scholar": ";;rFaxB20AAAAJ;https://scholar.google.fr/citations?user=EAgdP-8AAAAJ", "orcid": ";;;0000-0003-2961-608X", "linkedin": ";;;marina-levy-b46254bb/", "or_profile": "~J\u00e9r\u00e9mie_DONA1;~Marie_D\u00e9chelle1;~patrick_gallinari1;~Marina_Levy1", "aff": "MLIA;LIP6;Sorbonne Universite;LOCEAN-IPSL", "aff_domain": "sorbonne-universite.fr;lip6.fr;sorbonne-universite.fr;sorbonne-universite.fr", "position": "PhD student;PhD student;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\ndona2022constrained,\ntitle={Constrained Physical-Statistics Models for Dynamical System Identification and Prediction},\nauthor={J{\\'e}r{\\'e}mie DONA and Marie D{\\'e}chelle and patrick gallinari and Marina Levy},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gbe1zHyA73}\n}", "github": "", "project": "", "reviewers": "GA76;TpQa;YsB3;1AX7", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "3;4;3;4", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "58;68;123;192", "wc_summary_review": "31;120;48;53", "wc_main_review": "118;370;289;432", "wc_review": "207;558;460;677", "wc_reply_reviewers": "0;0;0;16", "wc_reply_authors": "474;897;565;518", "reply_reviewers": "0;0;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 110.25, 53.29340953626443 ], "wc_summary_review_avg": [ 63.0, 33.904277016329374 ], "wc_main_review_avg": [ 302.25, 117.84391159495682 ], "wc_review_avg": [ 475.5, 173.01806264086994 ], "wc_reply_reviewers_avg": [ 4.0, 6.928203230275509 ], "wc_reply_authors_avg": [ 613.5, 166.8120199506019 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.7001400420140049, "corr_recommendation_correctness": -0.08084520834544431, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5714123612907552967&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=gbe1zHyA73", "email": "sorbonne-universite.fr;lip6.fr;sorbonne-universite.fr;sorbonne-universite.fr", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Machine Learning and Interpretation of Data Group;Laboratoire d'Informatique de Paris 6;Sorbonne University;LOCEAN-IPSL", "aff_unique_dep": ";;;", "aff_unique_url": ";http://www.lip6.fr;https://www.sorbonne-universite.fr;https://www.locean.ipsl.fr", "aff_unique_abbr": "MLIA;LIP6;Sorbonne;", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "1;1;1", "aff_country_unique": ";France" }, { "id": "gc8zLQWf2k", "title": "Towards the Memorization Effect of Neural Networks in Adversarial Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent studies suggest that \"memorization\" is one important factor for overparameterized deep neural networks (DNNs) to achieve optimal performance. Specifically, the perfectly fitted DNNs can memorize the labels of many atypical samples, generalize their memorization to correctly classify test atypical samples and enjoy better test performance. While, DNNs which are optimized via adversarial training algorithms can also achieve perfect training performance by memorizing the labels of atypical samples, as well as the adversarially perturbed atypical samples. However, adversarially trained models always suffer from poor generalization, with both relatively low clean accuracy and robustness on the test set. In this work, we study the effect of memorization in adversarial trained DNNs and disclose two important findings: (a) Memorizing atypical samples is only effective to improve DNN's accuracy on clean atypical samples, but hardly improve their adversarial robustness and (b) Memorizing certain atypical samples will even hurt the DNN's performance on typical samples. Based on these two findings, we propose Benign Adversarial Training (BAT) which can facilitate adversarial training to avoid fitting \"harmful\" atypical samples and fit as more \"benign\" atypical samples as possible. In our experiments, we validate the effectiveness of BAT, and show it can achieve better clean accuracy vs. robustness trade-off than baseline methods, in benchmark datasets such as CIFAR100 and Tiny ImageNet.", "keywords": "Adversarial training;Robustness;Overfitting;Neural networks", "primary_area": "", "supplementary_material": "", "author": "Han Xu;Xiaorui Liu;Wentao Wang;Wenbiao Ding;Zhongqin Wu;Zitao Liu;Anil Jain;Jiliang Tang", "authorids": "~Han_Xu1;~Xiaorui_Liu1;~Wentao_Wang3;~Wenbiao_Ding1;~Zhongqin_Wu2;~Zitao_Liu1;~Anil_Jain1;~Jiliang_Tang1", "gender": "M;M;;;M;M;;M", "homepage": "https://cse.msu.edu/~xuhan1/;https://sites.google.com/ncsu.edu/xiaorui/;http://www.cse.msu.edu/~wangw116/;;https://scholar.google.com/citations?user=pHnJPmcAAAAJ&hl=zh-CN;http://www.zitaoliu.com/;;https://www.cse.msu.edu/~tangjili/", "dblp": "32/34-2;172/0995;;;;210/0898;;64/10812", "google_scholar": "mX2rL3IAAAAJ;NhvN1KoAAAAJ;;;pHnJPmcAAAAJ;rRTzNm0AAAAJ;;WtzKMWAAAAAJ", "orcid": "0000-0002-4016-6748;0000-0001-8217-5688;;;;0000-0003-0491-307X;;0000-0001-7125-3898", "linkedin": ";;;;;;;", "or_profile": "~Han_Xu1;~Xiaorui_Liu1;~Wentao_Wang3;~Wenbiao_Ding1;~Zhongqin_Wu2;~Zitao_Liu1;~Anil_Jain1;~Jiliang_Tang1", "aff": "VISA;Michigan State University;Michigan State University;;;TAL Education Group;;Michigan State University", "aff_domain": "visa.com;msu.edu;msu.edu;;;100tal.com;;msu.edu", "position": "Intern;PhD student;PhD student;;;Director of Machine Learning;;Associate Professor", "bibtex": "@misc{\nxu2022towards,\ntitle={Towards the Memorization Effect of Neural Networks in Adversarial Training},\nauthor={Han Xu and Xiaorui Liu and Wentao Wang and Wenbiao Ding and Zhongqin Wu and Zitao Liu and Anil Jain and Jiliang Tang},\nyear={2022},\nurl={https://openreview.net/forum?id=gc8zLQWf2k}\n}", "github": "", "project": "", "reviewers": "ytJj;iXiX;qCTN;sm19", "site": "https://openreview.net/forum?id=gc8zLQWf2k", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": 
"4;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "37;89;59;105", "wc_summary_review": "45;96;91;38", "wc_main_review": "226;501;436;308", "wc_review": "308;686;586;451", "wc_reply_reviewers": "0;68;215;149", "wc_reply_authors": "983;1749;1500;1251", "reply_reviewers": "0;1;2;1", "reply_authors": "4;5;5;4", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 72.5, 26.320144376503713 ], "wc_summary_review_avg": [ 67.5, 26.177280225416848 ], "wc_main_review_avg": [ 367.75, 107.3274778423494 ], "wc_review_avg": [ 507.75, 142.31720732223494 ], "wc_reply_reviewers_avg": [ 108.0, 81.23115165993893 ], "wc_reply_authors_avg": [ 1370.75, 284.8107924570275 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 4.5, 0.5 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1046146272380879566&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "VISA;Michigan State University;TAL Education Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.visa.com;https://www.msu.edu;https://www.tal.com", "aff_unique_abbr": "VISA;MSU;TAL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "gccdzDu5Ur", "title": "Combining Diverse Feature Priors", "track": "main", "status": "Reject", "tldr": "", "abstract": "To improve model generalization, model designers often restrict the features that their models use, either implicitly or explicitly. In this work, we explore the design space of leveraging such feature priors by viewing them as distinct perspectives on the data. Specifically, we find that models trained with diverse sets of explicit feature priors have less overlapping failure modes, and can thus be combined more effectively. 
Moreover, we demonstrate that jointly training such models on additional (unlabeled) data allows them to correct each other's mistakes, which, in turn, leads to better generalization and resilience to spurious correlations.", "keywords": "robustness;spurious correlations;feature priors", "primary_area": "", "supplementary_material": "/attachment/bb878d7e7c054f5db14f96a8270ba278f1c3b7d0.zip", "author": "Saachi Jain;Dimitris Tsipras;Aleksander Madry", "authorids": "~Saachi_Jain1;~Dimitris_Tsipras1;~Aleksander_Madry1", "gender": "F;M;M", "homepage": "http://people.csail.mit.edu/saachij/;https://dtsipras.com;https://people.csail.mit.edu/madry/", "dblp": "227/2617;168/4752;67/2454", "google_scholar": "6hsn3EYAAAAJ;26eh1jAAAAAJ;SupjsEUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Saachi_Jain1;~Dimitris_Tsipras1;~Aleksander_Madry1", "aff": "Massachusetts Institute of Technology;Stanford University;Massachusetts Institute of Technology", "aff_domain": "mit.edu;stanford.edu;mit.edu", "position": "PhD student;Postdoc;Professor", "bibtex": "@misc{\njain2022combining,\ntitle={Combining Diverse Feature Priors},\nauthor={Saachi Jain and Dimitris Tsipras and Aleksander Madry},\nyear={2022},\nurl={https://openreview.net/forum?id=gccdzDu5Ur}\n}", "github": "", "project": "", "reviewers": "uSUu;KJot;VXFn;LSTE", "site": "https://openreview.net/forum?id=gccdzDu5Ur", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "2;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "1;1;2;3", "empirical_novelty": "2;1;3;3", "wc_summary_paper": "43;23;132;149", "wc_summary_review": "28;9;56;82", "wc_main_review": "110;60;587;152", "wc_review": "181;92;775;383", "wc_reply_reviewers": "34;0;29;23", "wc_reply_authors": "409;83;409;31", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 86.75, 54.545279355779265 ], "wc_summary_review_avg": [ 43.75, 27.698149757700424 ], "wc_main_review_avg": [ 227.25, 210.23959546193956 ], "wc_review_avg": [ 357.75, 262.9632816573447 ], "wc_reply_reviewers_avg": [ 21.5, 13.009611831257688 ], "wc_reply_authors_avg": [ 233.0, 176.95762204550556 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5443310539518174, "corr_recommendation_correctness": 0.8333333333333334, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3431368394631636693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.stanford.edu", "aff_unique_abbr": "MIT;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "gciJWCp3z1s", "title": "On the Convergence of Projected Alternating Maximization for Equitable and Optimal Transport", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper studies the equitable and optimal transport (EOT) problem, which has many applications, such as fair division problems and optimal transport with multiple agents.
In the discrete-distribution case, the EOT problem can be formulated as a linear program (LP). Since this LP is prohibitively large for general LP solvers, Scetbon et al. suggest perturbing the problem by adding an entropy regularization. They propose a projected alternating maximization algorithm (PAM) to solve the dual of the entropy-regularized EOT. In this paper, we provide the first convergence analysis of PAM. A novel rounding procedure is proposed to help construct the primal solution for the original EOT problem. We also propose a variant of PAM that incorporates an extrapolation technique to numerically improve the performance of PAM. Results in this paper may shed light on block coordinate (gradient) descent methods for general optimization problems.", "keywords": "Equitable and Optimal Transport;Fairness;Saddle Point Problem;Projected Alternating Maximization;Block Coordinate Descent;Acceleration;Rounding", "primary_area": "", "supplementary_material": "/attachment/706846675a6acf63a721e031de99c1d342927921.zip", "author": "Minhui Huang;Shiqian Ma;Lifeng Lai", "authorids": "~Minhui_Huang1;~Shiqian_Ma3;~Lifeng_Lai1", "gender": "M;;M", "homepage": "https://mhhuang95.github.io;;https://sqma.rice.edu/", "dblp": "272/9225;12/4889;64/650", "google_scholar": "5j_jAr8AAAAJ;gOhaCfUAAAAJ;kkzUrUgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Minhui_Huang1;~Lifeng_Lai1;~Shiqian_Ma2", "aff": "University of California, Davis;University of California, Davis;University of California, Davis", "aff_domain": "ucdavis.edu;ucdavis.edu;ucdavis.edu", "position": "PhD student;Full Professor;Associate Professor", "bibtex": "@misc{\nhuang2022on,\ntitle={On the Convergence of Projected Alternating Maximization for Equitable and Optimal Transport},\nauthor={Minhui Huang and Shiqian Ma and Lifeng Lai},\nyear={2022},\nurl={https://openreview.net/forum?id=gciJWCp3z1s}\n}", "github": "", "project": "", "reviewers": "9vP7;1p7i;rMWc;M8bm", "site": "https://openreview.net/forum?id=gciJWCp3z1s", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;4", "correctness": "4;3;4;4", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;1;1;4", "wc_summary_paper": "134;127;78;87", "wc_summary_review": "16;66;105;46", "wc_main_review": "284;95;126;431", "wc_review": "434;288;309;564", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "481;309;278;408", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 106.5, 24.336187047275914 ], "wc_summary_review_avg": [ 58.25, 32.3293597214668 ], "wc_main_review_avg": [ 234.0, 134.43771792172018 ], "wc_review_avg": [ 398.75, 110.53364872291152 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 369.0, 80.53881052014613 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12027948693572825256&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Davis", "aff_unique_dep": "", "aff_unique_url": "https://www.ucdavis.edu",
"aff_unique_abbr": "UC Davis", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "gdWQMQVJST", "title": "Neural Tangent Kernel Empowered Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) is a privacy-preserving paradigm where multiple participants jointly solve a machine learning problem without sharing raw data. Unlike traditional distributed learning, a unique characteristic of FL is statistical heterogeneity, namely, data distributions across participants are different from each other. Meanwhile, recent advances in the interpretation of neural networks have seen a wide use of neural tangent kernel (NTK) for convergence and generalization analyses. In this paper, we propose a novel FL paradigm empowered by the NTK framework. The proposed paradigm addresses the challenge of statistical heterogeneity by transmitting update data that are more expressive than those of the traditional FL paradigms. Specifically, sample-wise Jacobian matrices, rather than model weights/gradients, are uploaded by participants. The server then constructs an empirical kernel matrix to update a global model without explicitly performing gradient descent. We further develop a variant with improved communication efficiency and enhanced privacy. Numerical results show that the proposed paradigm can achieve the same accuracy while reducing the number of communication rounds by an order of magnitude compared to federated averaging. ", "keywords": "Federated Learning;Neural Tangent Kernel", "primary_area": "", "supplementary_material": "", "author": "Kai Yue;Richeng Jin;Ryan Pilgrim;Chau-Wai Wong;Dror Baron;Huaiyu Dai", "authorids": "~Kai_Yue1;rjin2@ncsu.edu;rzpilgrim@gmail.com;~Chau-Wai_Wong1;~Dror_Baron2;~Huaiyu_Dai1", "gender": "M;;;M;M;", "homepage": "https://kaiyue.netlify.app/;;;https://ncsu-wong.org/;https://people.engr.ncsu.edu/dzbaron/;", "dblp": ";;;24/10474;;", "google_scholar": "5uWEGF8AAAAJ;;;ggreZvcAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Kai_Yue1;rjin2@ncsu.edu;rzpilgrim@gmail.com;~Chau-Wai_Wong1;~Dror_Baron2;~Huaiyu_Dai1", "aff": "North Carolina State University;;;North Carolina State University;North Carolina State University;", "aff_domain": "ncsu.edu;;;ncsu.edu;ncsu.edu;", "position": "PhD student;;;Assistant Professor;associate;", "bibtex": "@misc{\nyue2022neural,\ntitle={Neural Tangent Kernel Empowered Federated Learning},\nauthor={Kai Yue and Richeng Jin and Ryan Pilgrim and Chau-Wai Wong and Dror Baron and Huaiyu Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=gdWQMQVJST}\n}", "github": "", "project": "", "reviewers": "4JCT;uPVB;KsCY;C84w", "site": "https://openreview.net/forum?id=gdWQMQVJST", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "115;142;73;80", "wc_summary_review": "41;55;90;46", "wc_main_review": "338;409;311;436", "wc_review": "494;606;474;562", "wc_reply_reviewers": "51;72;0;128", "wc_reply_authors": "397;1007;744;747", "reply_reviewers": "1;1;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 102.5, 27.807373122968663 ], 
"wc_summary_review_avg": [ 58.0, 19.144189719076646 ], "wc_main_review_avg": [ 373.5, 50.82568248435037 ], "wc_review_avg": [ 534.0, 52.839379254491625 ], "wc_reply_reviewers_avg": [ 62.75, 45.87687325875642 ], "wc_reply_authors_avg": [ 723.75, 216.76412872059805 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5964318838593905351&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0", "aff_unique_norm": "North Carolina State University", "aff_unique_dep": "", "aff_unique_url": "https://www.ncsu.edu", "aff_unique_abbr": "NCSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "gdegUuC_fxR", "title": "Hessian-Free High-Resolution Nesterov Acceleration for Sampling", "track": "main", "status": "Reject", "tldr": "", "abstract": "It is known (Shi et al., 2021) that Nesterov's Accelerated Gradient (NAG) for optimization starts to differ from its continuous time limit (noiseless kinetic Langevin) when its stepsize becomes finite. This work explores the sampling counterpart of this phenonemon and proposes an accelerated-gradient-based MCMC method, based on the optimizer of NAG for strongly convex functions (NAG-SC): we reformulate NAG-SC as a Hessian-Free High-Resolution ODE, change its high-resolution coefficient to a hyperparameter, inject appropriate noise, and discretize the resulting diffusion process. Accelerated sampling enabled by the new hyperparameter is quantified and it is not a false acceleration created by time-rescaling. At continuous-time level, additional acceleration over underdamped Langevin in $W_2$ distance is proved. At discrete algorithm level, a dedicated discretization is proposed to simulate the Hessian-Free High-Resolution SDE in a cost-efficient manner. For log-strong-concave-and-smooth target measures, the proposed algorithm achieves $\\tilde{\\mathcal{O}}(\\sqrt{d}/\\epsilon)$ iteration complexity in $W_2$ distance, same as underdamped Langevin dynamics, but with a reduced constant. 
Numerical experiments are conducted to verify our theoretical results.", "keywords": "Markov Chain Monte Carlo;Nesterov Accelerated Gradient;accelerated sampling", "primary_area": "", "supplementary_material": "/attachment/ce12ce56d5c537443febdb71a635bda147d71302.zip", "author": "Ruilin Li;Hongyuan Zha;Molei Tao", "authorids": "~Ruilin_Li1;~Hongyuan_Zha1;~Molei_Tao1", "gender": "M;;", "homepage": ";;http://people.math.gatech.edu/~mtao8/", "dblp": ";z/HongyuanZha;56/9263", "google_scholar": "lLjVU_cAAAAJ;n1DQMIsAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ruilin_Li1;~Hongyuan_Zha1;~Molei_Tao1", "aff": ";The Chinese University of Hong Kong, Shenzhen;Georgia Institute of Technology", "aff_domain": ";cuhk.edu.cn;gatech.edu", "position": ";Full Professor;Associate Professor", "bibtex": "@misc{\nli2022hessianfree,\ntitle={Hessian-Free High-Resolution Nesterov Acceleration for Sampling},\nauthor={Ruilin Li and Hongyuan Zha and Molei Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=gdegUuC_fxR}\n}", "github": "", "project": "", "reviewers": "97Mf;xkFf;4H3p;awRv;r34U;22pJ", "site": "https://openreview.net/forum?id=gdegUuC_fxR", "pdf_size": 0, "recommendation": "1;3;3;5;6;6", "confidence": "1;4;4;4;3;4", "correctness": "2;4;2;3;4;3", "technical_novelty": "1;3;3;3;3;2", "empirical_novelty": "1;2;2;3;3;3", "wc_summary_paper": "12;82;34;89;40;22", "wc_summary_review": "32;39;15;18;30;22", "wc_main_review": "37;493;281;255;264;92", "wc_review": "81;614;330;362;334;136", "wc_reply_reviewers": "0;1865;0;0;0;0", "wc_reply_authors": "107;5203;1180;896;377;405", "reply_reviewers": "0;10;0;0;0;0", "reply_authors": "1;12;2;2;1;1", "recommendation_avg": [ 4.0, 1.8257418583505538 ], "confidence_avg": [ 3.3333333333333335, 1.1055415967851334 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.5, 0.7637626158259734 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.7453559924999298 ], "wc_summary_paper_avg": [ 46.5, 29.027286013909972 ], "wc_summary_review_avg": [ 26.0, 8.386497083606082 ], "wc_main_review_avg": [ 237.0, 146.9410312563059 ], "wc_review_avg": [ 309.5, 172.6323164030034 ], "wc_reply_reviewers_avg": [ 310.8333333333333, 695.0444630061846 ], "wc_reply_authors_avg": [ 1361.3333333333333, 1754.2642015639747 ], "reply_reviewers_avg": [ 1.6666666666666667, 3.726779962499649 ], "reply_authors_avg": [ 3.1666666666666665, 3.975620147292188 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5780059766913392, "corr_recommendation_correctness": 0.5590169943749475, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7440675111047681058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1", "aff_unique_norm": "Chinese University of Hong Kong;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.gatech.edu", "aff_unique_abbr": "CUHK;Georgia Tech", "aff_campus_unique_index": "0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "gehXu3kDU1P", "title": "Learning Algebraic Representation for Systematic Generalization in Abstract Reasoning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Is intelligence realized by the connectionist or the classicist?
While connectionist approaches have achieved superhuman performance, there has been growing evidence that such task-specific superiority is particularly fragile in systematic generalization. This observation lies at the heart of the central debate (Fodor & McLaughlin, 1990; Fodor et al., 1988) between connectionists and classicists, wherein the latter continually advocate an algebraic treatment in cognitive architectures. In this work, we follow the classicist's call and propose a hybrid approach to improve systematic generalization in reasoning. Specifically, we showcase a prototype with algebraic representation for the abstract spatial-temporal reasoning task of Raven\u2019s Progressive Matrices (RPM) and present the ALgebra-Aware Neuro-Semi-Symbolic (ALANS) learner. The ALANS learner is motivated by abstract algebra and representation theory. It consists of a neural visual perception frontend and an algebraic abstract reasoning backend: the frontend summarizes the visual information in an object-based representation, while the backend transforms it into an algebraic structure and induces the hidden operator on the fly. The induced operator is later executed to predict the answer's representation, and the choice most similar to the prediction is selected as the solution. Extensive experiments show that by incorporating an algebraic treatment, the ALANS learner outperforms various pure connectionist models in domains requiring systematic generalization. We further show that the learned algebraic representation can be decoded by isomorphism and used to generate an answer.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1c55a576f36f4d823aadc92f3de58230a2dfdb19.zip", "author": "Chi Zhang;Sirui Xie;Baoxiong Jia;Ying Nian Wu;Song-Chun Zhu;Yixin Zhu", "authorids": "~Chi_Zhang12;~Sirui_Xie1;~Baoxiong_Jia1;~Ying_Nian_Wu1;~Song-Chun_Zhu1;~Yixin_Zhu1", "gender": ";M;M;;M;M", "homepage": ";https://www.siruixie.com;https://buzz-beater.github.io/;;https://zhusongchun.net/;https://yzhu.io/", "dblp": ";232/3072;206/8738;;10/10313;91/1103-1.html", "google_scholar": ";9GJn5FIAAAAJ;qIBUK6sAAAAJ;;https://scholar.google.com.tw/citations?user=Al8dyb4AAAAJ;qG9l6JEAAAAJ", "orcid": ";;0000-0002-4968-3290;;;0000-0001-7024-1545", "linkedin": ";;baoxiong-jia-2b6094122?trk=public_post-text;;;", "or_profile": "~Chi_Zhang12;~Sirui_Xie1;~Baoxiong_Jia1;~Ying_Nian_Wu1;~Song-Chun_Zhu1;~Yixin_Zhu1", "aff": ";University of California, Los Angeles;University of California, Los Angeles;;Peking University;Peking University", "aff_domain": ";ucla.edu;ucla.edu;;pku.edu.cn;pku.edu.cn", "position": ";PhD student;PhD student;;Full Professor;Assistant Professor", "bibtex": "@misc{\nzhang2022learning,\ntitle={Learning Algebraic Representation for Systematic Generalization in Abstract Reasoning},\nauthor={Chi Zhang and Sirui Xie and Baoxiong Jia and Ying Nian Wu and Song-Chun Zhu and Yixin Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=gehXu3kDU1P}\n}", "github": "", "project": "", "reviewers": "9uXz;HpFa;u82Y;8moe", "site": "https://openreview.net/forum?id=gehXu3kDU1P", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "3;3;3;3", "correctness": "2;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "36;149;79;75", "wc_summary_review": "22;42;46;46", "wc_main_review": "646;395;308;263", "wc_review": "704;586;433;384", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0",
"recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 84.75, 40.720848468567056 ], "wc_summary_review_avg": [ 39.0, 9.9498743710662 ], "wc_main_review_avg": [ 403.0, 148.10300469605605 ], "wc_review_avg": [ 526.75, 126.58470484225178 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4181210050035454, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12366107265154890046&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "University of California, Los Angeles;Peking University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;http://www.pku.edu.cn", "aff_unique_abbr": "UCLA;Peking U", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;China" }, { "id": "gex-2G2bLdh", "title": "Hinge Policy Optimization: Rethinking Policy Improvement and Reinterpreting PPO", "track": "main", "status": "Reject", "tldr": "", "abstract": "Policy optimization is a fundamental principle for designing reinforcement learning algorithms, and one example is the proximal policy optimization algorithm with a clipped surrogate objective (PPO-clip), which has been popularly used in deep reinforcement learning due to its simplicity and effectiveness. Despite its superior empirical performance, PPO-clip has not been justified via theoretical proof up to date. This paper proposes to rethink policy optimization and reinterpret the theory of PPO-clip based on hinge policy optimization (HPO), called to improve policy by hinge loss in this paper. Specifically, we first identify sufficient conditions of state-wise policy improvement and then rethink policy update as solving a large-margin classification problem with hinge loss. By leveraging various types of classifiers, the proposed design opens up a whole new family of policy-based algorithms, including the PPO-clip as a special case. Based on this construct, we prove that these algorithms asymptotically attain a globally optimal policy. To our knowledge, this is the first ever that can prove global convergence to an optimal policy for a variant of PPO-clip. 
We corroborate the performance of a variety of HPO algorithms through experiments and an ablation study.", "keywords": "Reinforcement learning;policy optimization;hinge loss;policy improvement;PPO-clip", "primary_area": "", "supplementary_material": "/attachment/efeffeed0e016831f639718f6a46e611e5e0300e.zip", "author": "Hsuan-Yu Yao;Ping-Chun Hsieh;Kuo-Hao Ho;Kai-Chun Hu;Liang Chun Ouyang;I-Chen Wu", "authorids": "~Hsuan-Yu_Yao1;~Ping-Chun_Hsieh1;~Kuo-Hao_Ho1;~Kai-Chun_Hu1;~Liang_Chun_Ouyang1;~I-Chen_Wu3", "gender": ";M;M;M;M;", "homepage": ";https://pinghsieh.github.io/;;;https://cgilab.nctu.edu.tw/~icwu/;", "dblp": ";163/7352;160/0321;;06/983;239/8475.html", "google_scholar": ";ix38JgoAAAAJ;https://scholar.google.com.tw/citations?user=LfH40wMAAAAJ;;;", "orcid": ";;;;0000-0003-2535-0587;", "linkedin": "\u5ba3\u7fbd-\u59da-824751212;;;%E8%89%AF%E9%9B%8B-%E6%AD%90-a2b725117/;;", "or_profile": "~Hsuan-Yu_Yao1;~Ping-Chun_Hsieh1;~Kuo-Hao_Ho1;~Liang_Chun_Ouyang1;~I-Chen_Wu3;~Kai_chun_Hu1", "aff": ";National Yang Ming Chiao Tung University;National Chiao Tung University;National Chiao Tung University;Academia Sinica;", "aff_domain": ";nycu.edu.tw;nctu.edu.tw;nctu.edu.tw;sinica.edu.tw;", "position": ";Assistant Professor;PhD student;MS student;Research Fellow;", "bibtex": "@misc{\nyao2022hinge,\ntitle={Hinge Policy Optimization: Rethinking Policy Improvement and Reinterpreting {PPO}},\nauthor={Hsuan-Yu Yao and Ping-Chun Hsieh and Kuo-Hao Ho and Kai-Chun Hu and Liang Chun Ouyang and I-Chen Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=gex-2G2bLdh}\n}", "github": "", "project": "", "reviewers": "PJFP;vtrj;FV8P;PyUp", "site": "https://openreview.net/forum?id=gex-2G2bLdh", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;4;4;2", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "44;25;105;99", "wc_summary_review": "80;56;36;25", "wc_main_review": "280;293;356;390", "wc_review": "404;374;497;514", "wc_reply_reviewers": "503;387;0;0", "wc_reply_authors": "1709;964;881;383", "reply_reviewers": "2;2;0;0", "reply_authors": "4;3;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 68.25, 34.47734763580284 ], "wc_summary_review_avg": [ 49.25, 20.94486810653149 ], "wc_main_review_avg": [ 329.75, 45.124134340727245 ], "wc_review_avg": [ 447.25, 59.512078605943515 ], "wc_reply_reviewers_avg": [ 222.5, 226.24820441276435 ], "wc_reply_authors_avg": [ 984.25, 473.770711948301 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9544271444636667, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1947644625537836883&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "National Yang Ming Chiao Tung University;National Chiao Tung University;Academia Sinica", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nycu.edu.tw;https://www.nctu.edu.tw;https://www.sinica.edu.tw", "aff_unique_abbr": "NYCU;NCTU;Academia Sinica", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { 
"id": "gf9buGzMCa", "title": "Expressiveness of Neural Networks Having Width Equal or Below the Input Dimension", "track": "main", "status": "Reject", "tldr": "", "abstract": "The understanding about the minimum width of deep neural networks needed to ensure universal approximation for different activation functions has progressively been extended \\citep{park2020minimum}. In particular, with respect to approximation on general compact sets in the input space, a network width less than or equal to the input dimension excludes universal approximation. In this work, we focus on network functions of width less than or equal to the latter critical bound. We prove a maximum principle from which we conclude that for all continuous and monotonic activation functions, universal approximation of arbitrary continuous functions is impossible on sets that coincide with the boundary of an open set plus an inner point. Conversely, we prove that in this regime, the exact fit of partially constant functions on disjoint compact sets is still possible for ReLU network functions under some conditions on the mutual location of these components. We also show that with cosine as activation function, a three layer network of width one is sufficient to approximate any function on arbitrary finite sets.", "keywords": "Neural network approximation;expressiveness of width bounded neural networks;maximum principle", "primary_area": "", "supplementary_material": "/attachment/61600cfaf782c1a47b9282e61cbea0ce4c01646d.zip", "author": "Hans-Peter Beise;Steve Dias Da Cruz", "authorids": "~Hans-Peter_Beise1;~Steve_Dias_Da_Cruz1", "gender": "M;M", "homepage": "https://www.hochschule-trier.de;https://cruz.lu/", "dblp": ";", "google_scholar": "https://scholar.google.de/citations?user=3p-O_84AAAAJ;https://scholar.google.de/citations?user=qgFaB1YAAAAJ", "orcid": "0000-0002-8949-6131;0000-0002-8322-934X", "linkedin": ";stevediasdacruz", "or_profile": "~Hans-Peter_Beise1;~Steve_Dias_Da_Cruz1", "aff": "Trier University of Applied Sciences;TU Kaiserslautern", "aff_domain": "hochschule-trier.de;uni-kl.de", "position": "Full Professor;PhD student", "bibtex": "@misc{\nbeise2022expressiveness,\ntitle={Expressiveness of Neural Networks Having Width Equal or Below the Input Dimension},\nauthor={Hans-Peter Beise and Steve Dias Da Cruz},\nyear={2022},\nurl={https://openreview.net/forum?id=gf9buGzMCa}\n}", "github": "", "project": "", "reviewers": "PCRn;oqnR;qHTG;99iL", "site": "https://openreview.net/forum?id=gf9buGzMCa", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;3;4", "correctness": "4;3;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;0;3;0", "wc_summary_paper": "247;306;84;124", "wc_summary_review": "28;67;35;76", "wc_main_review": "222;414;201;480", "wc_review": "497;787;320;680", "wc_reply_reviewers": "0;88;19;85", "wc_reply_authors": "369;297;95;196", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.5 ], "wc_summary_paper_avg": [ 190.25, 89.85648279339672 ], "wc_summary_review_avg": [ 51.5, 20.402205763103165 ], "wc_main_review_avg": [ 329.25, 120.2692292317532 ], "wc_review_avg": [ 571.0, 178.1951177782377 ], "wc_reply_reviewers_avg": [ 48.0, 39.096035604649224 ], "wc_reply_authors_avg": [ 239.25, 103.49969806719244 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], 
"reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14281976711177548734&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Trier University of Applied Sciences;Technische Universit\u00e4t Kaiserslautern", "aff_unique_dep": ";", "aff_unique_url": "https://www.fh-trier.de;https://www.tu-kl.de", "aff_unique_abbr": "FH Trier;TU Kaiserslautern", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "gfUPGPMxB7E", "title": "Data Sharing without Rewards in Multi-Task Offline Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning (RL) bears the promise to learn effective control policies from static datasets but is thus far unable to learn from large databases of heterogeneous experience. The multi-task version of offline RL enables the possibility of learning a single policy that can tackle multiple tasks and allows the algorithm to share offline data across tasks. Recent works indicate that sharing data between tasks can be highly beneficial in multi-task learning. However, these benefits come at a cost -- for data to be shared between tasks, each transition must be annotated with reward labels corresponding to other tasks. This is particularly expensive and unscalable, since the manual effort in annotating reward grows quadratically with the number of tasks. Can we retain the benefits of data sharing without requiring reward relabeling for every task pair? In this paper, we show that, perhaps surprisingly, under a binary-reward assumption, simply utilizing data from other tasks with constant reward labels can not only provide a substantial improvement over only using the single-task data and previously proposed success classifiers, but it can also reach comparable performance to baselines that take advantage of the oracle multi-task reward information. We also show that this performance can be further improved by selectively deciding which transitions to share, again without introducing any additional models or classifiers. We discuss how these approaches relate to each other and baseline strategies under various assumptions on the dataset. 
Our empirical results show that it leads to improved performance across a range of different multi-task offline RL scenarios, including robotic manipulation from visual inputs and ant-maze navigation.", "keywords": "offline reinforcement learning;multi-task reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/e597fdf06b02b55432ce1d1deac0b720efbf9561.zip", "author": "Tianhe Yu;Aviral Kumar;Yevgen Chebotar;Chelsea Finn;Sergey Levine;Karol Hausman", "authorids": "~Tianhe_Yu1;~Aviral_Kumar2;~Yevgen_Chebotar1;~Chelsea_Finn1;~Sergey_Levine1;~Karol_Hausman2", "gender": "M;M;M;F;M;M", "homepage": "https://cs.stanford.edu/~tianheyu/;https://aviralkumar2907.github.io/;;https://ai.stanford.edu/~cbfinn/;https://people.eecs.berkeley.edu/~svlevine/;https://karolhausman.github.io/", "dblp": "192/1797;202/7961;01/11424;131/1783;80/7594;135/8164", "google_scholar": ";;ADkiClQAAAAJ;vfPE6hgAAAAJ;8R35rCwAAAAJ;yy0UFOwAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;karolhausman/", "or_profile": "~Tianhe_Yu1;~Aviral_Kumar2;~Yevgen_Chebotar1;~Chelsea_Finn1;~Sergey_Levine1;~Karol_Hausman1", "aff": "Stanford University;University of California, Berkeley;Google;Google;Google;Google Brain", "aff_domain": "stanford.edu;berkeley.edu;google.com;google.com;google.com;google.com", "position": "PhD student;PhD student;Research Scientist;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nyu2022data,\ntitle={Data Sharing without Rewards in Multi-Task Offline Reinforcement Learning},\nauthor={Tianhe Yu and Aviral Kumar and Yevgen Chebotar and Chelsea Finn and Sergey Levine and Karol Hausman},\nyear={2022},\nurl={https://openreview.net/forum?id=gfUPGPMxB7E}\n}", "github": "", "project": "", "reviewers": "SFTg;UDVV;YPSx", "site": "https://openreview.net/forum?id=gfUPGPMxB7E", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;3;3", "correctness": "4;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "90;66;100", "wc_summary_review": "128;30;39", "wc_main_review": "366;147;171", "wc_review": "584;243;310", "wc_reply_reviewers": "219;0;47", "wc_reply_authors": "2879;1546;777", "reply_reviewers": "2;0;1", "reply_authors": "6;5;4", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.33333333333333, 14.2672897060218 ], "wc_summary_review_avg": [ 65.66666666666667, 44.22920101270452 ], "wc_main_review_avg": [ 228.0, 98.07140255956371 ], "wc_review_avg": [ 379.0, 147.51497099164771 ], "wc_reply_reviewers_avg": [ 88.66666666666667, 94.13583566079156 ], "wc_reply_authors_avg": [ 1734.0, 868.3735755230388 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 5.0, 0.816496580927726 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9573512284303383333&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;2;2;2", "aff_unique_norm": "Stanford University;University of California, Berkeley;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.stanford.edu;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "Stanford;UC Berkeley;Google", "aff_campus_unique_index": "0;1;2;2;2;2", 
"aff_campus_unique": "Stanford;Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Global Convergence of Multi-Agent Policy Gradient in Markov Potential Games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7008", "id": "gfwON7rAm4", "poster": "", "openreview": "https://openreview.net/forum?id=gfwON7rAm4", "slides": "https://iclr.cc/virtual/2022/poster/7008", "video": "https://iclr.cc/virtual/2022/poster/7008", "author_site": "Stefanos Leonardos, Will Overman, Ioannis Panageas, Georgios Piliouras", "tldr": "", "abstract": "Potential games are arguably one of the most important and widely studied classes of normal form games. They define the archetypal setting of multi-agent coordination in which all agents utilities are perfectly aligned via a common potential function. Can this intuitive framework be transplanted in the setting of Markov games? What are the similarities and differences between multi-agent coordination with and without state dependence? To answer these questions, we study a natural class of Markov Potential Games (MPGs) that generalize prior attempts at capturing complex stateful multi-agent coordination. Counter-intuitively, insights from normal-form potential games do not carry over as MPGs involve settings where state-games can be zero-sum games. In the opposite direction, Markov games where every state-game is a potential game are not necessarily MPGs. Nevertheless, MPGs showcase standard desirable properties such as the existence of deterministic Nash policies. In our main technical result, we prove convergence of independent policy gradient and its stochastic counterpart to Nash policies (polynomially fast in the approximation error) by adapting recent gradient dominance property arguments developed for single-agent Markov decision processes to multi-agent learning settings. 
\n", "keywords": "Multi-agent Reinforcement Learning;Markov Potential Games;Policy Gradient", "primary_area": "", "supplementary_material": "/attachment/fd730c765d9a2bc5db1c20e681f030ddcbd8784d.zip", "author": "Stefanos Leonardos;Will Overman;Ioannis Panageas;Georgios Piliouras", "authorids": "~Stefanos_Leonardos1;~Will_Overman1;~Ioannis_Panageas1;~Georgios_Piliouras1", "gender": "M;M;M;", "homepage": "https://stefanosleonardos.com/;https://willoverman.github.io/;https://panageas.github.io;", "dblp": "192/1237;294/4924;139/3829;62/1236", "google_scholar": "PtiGrVsAAAAJ;B2XPxEkAAAAJ;5NiFWuwAAAAJ;", "orcid": ";;;", "linkedin": "stefanos-leonardos/;;;", "or_profile": "~Stefanos_Leonardos1;~Will_Overman1;~Ioannis_Panageas1;~Georgios_Piliouras1", "aff": "King's College London, University of London;University of California, Irvine;Donald Bren School of Information and Computer Sciences, University of California, Irvine;Singapore University of Technology and Design", "aff_domain": "kcl.ac.uk;uci.edu;ics.uci.edu;sutd.edu.sg", "position": "Lecturer;MS student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nleonardos2022global,\ntitle={Global Convergence of Multi-Agent Policy Gradient in Markov Potential Games},\nauthor={Stefanos Leonardos and Will Overman and Ioannis Panageas and Georgios Piliouras},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gfwON7rAm4}\n}", "github": "", "project": "", "reviewers": "5gGX;43Uo;idii;awwm", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;3;3;4", "correctness": "4;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "35;56;47;39", "wc_summary_review": "29;41;16;39", "wc_main_review": "127;243;312;394", "wc_review": "191;340;375;472", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "468;464;354;538", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 44.25, 8.042853971072706 ], "wc_summary_review_avg": [ 31.25, 9.908960591303208 ], "wc_main_review_avg": [ 269.0, 97.8698114844409 ], "wc_review_avg": [ 344.5, 100.95667387547986 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 456.0, 65.83312236253116 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": -0.5555555555555555, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6411481713074192654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=gfwON7rAm4", "email": "kcl.ac.uk;uci.edu;ics.uci.edu;sutd.edu.sg", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "King's College London;University of California, Irvine;Singapore University of Technology and Design", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kcl.ac.uk;https://www.uci.edu;https://www.sutd.edu.sg", "aff_unique_abbr": "KCL;UCI;SUTD", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "United Kingdom;United States;Singapore" }, { "id": "gggnCQBT_iE", "title": "Connecting Data to 
Mechanisms with Meta Structual Causal Model", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent years have seen impressive progress in theoretical and algorithmic developments of causal inference across various disciplines in science and engineering. However, there are still some unresolved theoretical problems, especially for cyclic causal relationships. In this article, we propose a meta structural causal model (Meta-SCM) framework inspired by understanding causality as information transfer. A key feature of our framework is the introduction of the concept of \\emph{active mechanisms} to connect data and the collection of underlying causal mechanisms. We show that the Meta-SCM provides a novel approach to address the theoretical complications for modeling cyclic causal relations. In addition, we propose a \\emph{sufficient activated mechanisms} assumption, and explain its relationship with existing assumptions in causal representation learning. Finally, we summarize the main idea of the Meta-SCM framework with an emphasis on its theoretical and conceptual novelty.", "keywords": "meta-SCM;cyclic causal models;sufficient activated mechanisms", "primary_area": "", "supplementary_material": "", "author": "Gong Heyang", "authorids": "~Gong_Heyang1", "gender": "M", "homepage": "https://sites.google.com/view/causal-inference-zerotoall/home", "dblp": "", "google_scholar": "cKyArAwAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Gong_Heyang1", "aff": "Kuaishou Technology", "aff_domain": "kuaishou.com", "position": "Researcher", "bibtex": "@misc{\nheyang2022connecting,\ntitle={Connecting Data to Mechanisms with Meta Structual Causal Model},\nauthor={Gong Heyang},\nyear={2022},\nurl={https://openreview.net/forum?id=gggnCQBT_iE}\n}", "github": "", "project": "", "reviewers": "unQi;uRUL;mwX7", "site": "https://openreview.net/forum?id=gggnCQBT_iE", "pdf_size": 0, "recommendation": "3;3;8", "confidence": "3;4;4", "correctness": "4;2;4", "technical_novelty": "2;2;4", "empirical_novelty": "0;1;4", "wc_summary_paper": "39;131;70", "wc_summary_review": "58;40;24", "wc_main_review": "179;500;148", "wc_review": "276;671;242", "wc_reply_reviewers": "46;377;0", "wc_reply_authors": "621;1818;107", "reply_reviewers": "1;2;0", "reply_authors": "2;4;1", "recommendation_avg": [ 4.666666666666667, 2.357022603955158 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.699673171197595 ], "wc_summary_paper_avg": [ 80.0, 38.21866908549625 ], "wc_summary_review_avg": [ 40.666666666666664, 13.888444437333106 ], "wc_main_review_avg": [ 275.6666666666667, 159.1316715455754 ], "wc_review_avg": [ 396.3333333333333, 194.71403533272982 ], "wc_reply_reviewers_avg": [ 141.0, 167.9305411968492 ], "wc_reply_authors_avg": [ 848.6666666666666, 716.8237037251365 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.5000000000000001, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OlFi8_Yths0J:scholar.google.com/&scioq=Connecting+Data+to+Mechanisms+with+Meta+Structual+Causal+Model&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Kuaishou Technology", 
"aff_unique_dep": "", "aff_unique_url": "https://www.kuaishou.com", "aff_unique_abbr": "Kuaishou", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "ghTlLwlBS-", "title": "Feudal Reinforcement Learning by Reading Manuals", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reading to act is a prevalent but challenging task that requires the ability to follow a concise language instruction in an environment, with the help of textual knowledge about the environment. Previous works face the semantic mismatch between the low-level actions and the high-level language descriptions and require the human-designed curriculum to work properly. In this paper, we present a Feudal Reinforcement Learning (FRL) model consisting of a manager agent and a worker agent. The manager agent is a multi-hop planner, which deals with high-level abstract information and generates a series of sub-goals. The worker agent deals with the low-level perceptions and actions to achieve the sub-goals one by one. Our FRL framework effectively alleviates the mismatching between the text-level inference and the low-level perceptions and actions; and is general to various forms of environments, instructions and manuals. Our multi-hop planner contributes to the framework by further boosting the challenging tasks where multi-step reasoning from the texts is critical to achieving the instructed goals. We showcase our approach achieves competitive performance on two challenging tasks, Read to Fight Monsters (RTFM) and Messenger, without human-designed curriculum learning.", "keywords": "feudal reinforcement learning;textual instruction following;reading to act;text games;multi-hop reasoning", "primary_area": "", "supplementary_material": "", "author": "Kai Wang;Zhonghao Wang;Mo Yu;Humphrey Shi", "authorids": "~Kai_Wang10;~Zhonghao_Wang1;~Mo_Yu1;~Humphrey_Shi1", "gender": "M;M;M;M", "homepage": "https://kaiwang.com;http://researcher.ibm.com/researcher/view.php?person=us-yum;https://www.humphreyshi.com;", "dblp": "78/2022-58.html;32/7445.html;176/5516;", "google_scholar": "_yK8SN0AAAAJ;vC8DssQAAAAJ;WBvt5A8AAAAJ;opL6CL8AAAAJ", "orcid": ";;0000-0002-2922-5663;", "linkedin": ";;humphreyshi;zhonghao-wang-a4b659114/", "or_profile": "~Kai_Wang10;~Mo_Yu1;~Honghui_Shi1;~Zhonghao_Wang6", "aff": "University of Oregon;International Business Machines;University of Oregon;Google", "aff_domain": "uoregon.edu;ibm.com;uoregon.edu;google.com", "position": "PhD student;Research Staff Member;Assistant Professor;Intern", "bibtex": "@misc{\nwang2022feudal,\ntitle={Feudal Reinforcement Learning by Reading Manuals},\nauthor={Kai Wang and Zhonghao Wang and Mo Yu and Humphrey Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=ghTlLwlBS-}\n}", "github": "", "project": "", "reviewers": "RC7c;tKa7;KMX2;eXEz", "site": "https://openreview.net/forum?id=ghTlLwlBS-", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;2;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "79;259;65;84", "wc_summary_review": "15;27;29;72", "wc_main_review": "128;179;575;364", "wc_review": "222;465;669;520", "wc_reply_reviewers": "0;4;140;88", "wc_reply_authors": "232;362;307;538", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 
0.4330127018922193 ], "wc_summary_paper_avg": [ 121.75, 79.5467629762519 ], "wc_summary_review_avg": [ 35.75, 21.602951187279945 ], "wc_main_review_avg": [ 311.5, 175.65377878087338 ], "wc_review_avg": [ 469.0, 160.9549626448343 ], "wc_reply_reviewers_avg": [ 58.0, 58.957611891934704 ], "wc_reply_authors_avg": [ 359.75, 112.78380867837369 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4736842105263159, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tMw3sYdhlaoJ:scholar.google.com/&scioq=Feudal+Reinforcement+Learning+by+Reading+Manuals&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Oregon;International Business Machines Corporation;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.uoregon.edu;https://www.ibm.com;https://www.google.com", "aff_unique_abbr": "UO;IBM;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "gi4956J8g5", "title": "Second-Order Unsupervised Feature Selection via Knowledge Contrastive Distillation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Unsupervised feature selection aims to select a subset from the original features that are most useful for the downstream tasks without external guidance information. While most unsupervised feature selection methods focus on ranking features based on the intrinsic properties of data, they do not pay much attention to the relationships between features, which often leads to redundancy among the selected features. In this paper, we propose a two-stage Second-Order unsupervised Feature selection via knowledge contrastive disTillation (SOFT) model that incorporates the second-order covariance matrix with the first-order data matrix for unsupervised feature selection. In the first stage, we learn a sparse attention matrix that can represent second-order relations between features. In the second stage, we build a relational graph based on the learned attention matrix and perform graph segmentation for feature selection. 
Experimental results on 12 public datasets show that SOFT outperforms classical and recent state-of-the-art methods, which demonstrates the effectiveness of our proposed method.", "keywords": "Machine Learning;Unsupervised Feature Selection;Knowledge Distillation", "primary_area": "", "supplementary_material": "", "author": "Han Yue;Jundong Li;Hongfu Liu", "authorids": "~Han_Yue2;~Jundong_Li2;~Hongfu_Liu2", "gender": "M;M;M", "homepage": ";https://jundongli.github.io/;http://hongfuliu.com/", "dblp": ";144/7997.html;32/9075-1", "google_scholar": "BkUhc7gAAAAJ;uY6ek7sAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-4146-0436;;", "linkedin": ";;", "or_profile": "~Han_Yue2;~Jundong_Li2;~Hongfu_Liu2", "aff": "Brandeis University;University of Virginia;Brandeis University", "aff_domain": "brandeis.edu;virginia.edu;brandeis.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nyue2022secondorder,\ntitle={Second-Order Unsupervised Feature Selection via Knowledge Contrastive Distillation},\nauthor={Han Yue and Jundong Li and Hongfu Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=gi4956J8g5}\n}", "github": "", "project": "", "reviewers": "kEpS;3WQu;wcmh;ELaK;xFNs", "site": "https://openreview.net/forum?id=gi4956J8g5", "pdf_size": 0, "recommendation": "3;5;6;6;8", "confidence": "5;4;4;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "2;3;3;2;3", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "27;45;46;64;66", "wc_summary_review": "69;43;41;8;152", "wc_main_review": "271;504;228;68;198", "wc_review": "367;592;315;140;416", "wc_reply_reviewers": "797;225;0;0;0", "wc_reply_authors": "1990;2146;1207;438;494", "reply_reviewers": "2;1;0;0;0", "reply_authors": "3;4;2;1;1", "recommendation_avg": [ 5.6, 1.624807680927192 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 49.6, 14.291256067959877 ], "wc_summary_review_avg": [ 62.6, 48.713858397790666 ], "wc_main_review_avg": [ 253.8, 142.26510464622027 ], "wc_review_avg": [ 366.0, 146.46091628827125 ], "wc_reply_reviewers_avg": [ 204.4, 308.84857130963064 ], "wc_reply_authors_avg": [ 1255.0, 718.7516956501737 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 2.2, 1.16619037896906 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8000946913656628, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13168238765470710191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "Brandeis University;University of Virginia", "aff_unique_dep": ";", "aff_unique_url": "https://www.brandeis.edu;https://www.virginia.edu", "aff_unique_abbr": "Brandeis;UVA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Iterated Reasoning with Mutual Information in Cooperative and Byzantine Decentralized Teaming", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6808", "id": "giBFoa-uS12", "poster": "", "openreview": "https://openreview.net/forum?id=giBFoa-uS12", "slides": "https://iclr.cc/virtual/2022/poster/6808", "video": "https://iclr.cc/virtual/2022/poster/6808", "author_site": "Sachin Konan, Esmaeil Seraj, Matthew Gombolay", 
"tldr": "", "abstract": "Information sharing is key in building team cognition and enables coordination and cooperation. High-performing human teams also benefit from acting strategically with hierarchical levels of iterated communication and rationalizability, meaning a human agent can reason about the actions of their teammates in their decision-making. Yet, the majority of prior work in Multi-Agent Reinforcement Learning (MARL) does not support iterated rationalizability and only encourage inter-agent communication, resulting in a suboptimal equilibrium cooperation strategy. In this work, we show that reformulating an agent's policy to be conditional on the policies of its neighboring teammates inherently maximizes Mutual Information (MI) lower-bound when optimizing under Policy Gradient (PG). Building on the idea of decision-making under bounded rationality and cognitive hierarchy theory, we show that our modified PG approach not only maximizes local agent rewards but also implicitly reasons about MI between agents without the need for any explicit ad-hoc regularization terms. Our approach, InfoPG, outperforms baselines in learning emergent collaborative behaviors and sets the state-of-the-art in decentralized cooperative MARL tasks. Our experiments validate the utility of InfoPG by achieving higher sample efficiency and significantly larger cumulative reward in several complex cooperative multi-agent domains.", "keywords": "Multi-agent Reinforcement Learning;Cooperation and Coordination;Policy Gradient Optimization;Mutual Information;Iterated Reasoning", "primary_area": "", "supplementary_material": "/attachment/67d2e8568bba880cad67f8cd5c250fb8217ebb94.zip", "author": "Sachin G Konan;Esmaeil Seraj;Matthew Gombolay", "authorids": "~Sachin_G_Konan1;~Esmaeil_Seraj1;~Matthew_Gombolay1", "gender": "M;M;M", "homepage": "https://sachinkonan.github.io/;https://www.linkedin.com/in/esmaeil-seraj-70590b80/;https://core-robotics.gatech.edu/", "dblp": "310/1751.html;169/3595;144/1022", "google_scholar": "dxi6F8kAAAAJ;k0yj7xUAAAAJ;Ihyz20wAAAAJ", "orcid": ";0000-0002-0147-1037;", "linkedin": "sachin-konan;esmaeil-seraj-70590b80/;", "or_profile": "~Sachin_G_Konan1;~Esmaeil_Seraj1;~Matthew_Gombolay1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;cc.gatech.edu", "position": "Undergrad student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkonan2022iterated,\ntitle={Iterated Reasoning with Mutual Information in Cooperative and Byzantine Decentralized Teaming},\nauthor={Sachin G Konan and Esmaeil Seraj and Matthew Gombolay},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=giBFoa-uS12}\n}", "github": "", "project": "", "reviewers": "4Fxp;SyxF;7nod", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "4;3;3", "correctness": "3;3;4", "technical_novelty": "3;2;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "99;69;192", "wc_summary_review": "78;212;23", "wc_main_review": "580;459;130", "wc_review": "757;740;345", "wc_reply_reviewers": "0;476;0", "wc_reply_authors": "1094;2332;275", "reply_reviewers": "0;2;0", "reply_authors": "3;5;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 
2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 120.0, 52.364109846344185 ], "wc_summary_review_avg": [ 104.33333333333333, 79.37393918801196 ], "wc_main_review_avg": [ 389.6666666666667, 190.1408834405572 ], "wc_review_avg": [ 614.0, 190.33829532352829 ], "wc_reply_reviewers_avg": [ 158.66666666666666, 224.38855189653108 ], "wc_reply_authors_avg": [ 1233.6666666666667, 845.5539932822478 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9176629354822472, "corr_recommendation_correctness": 0.8029550685469661, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15432684675510350837&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=giBFoa-uS12", "email": "gatech.edu;gatech.edu;cc.gatech.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "gijKplIZ2Y-", "title": "Mistill: Distilling Distributed Network Protocols from Examples", "track": "main", "status": "Reject", "tldr": "", "abstract": "New applications and use-cases in data center networks require the design of Traffic Engineering (TE) algorithms that account for application-specific traffic patterns. TE makes forwarding decisions from the global state of the network. Thus, new TE algorithms require the design and implementation of effective information exchange and efficient algorithms to compute forwarding decisions. This is a challenging, labor- and time-intensive process. To automate and simplify this process, we propose MISTILL. MISTILL distills the forwarding behavior of TE policies from exemplary forwarding decisions into a Neural Network. MISTILL learns which network devices must exchange state with each other, how to process local state to send it over the network, and how to map the exchanged state into forwarding decisions. We show the ability of MISTILL to learn distributed protocols with three examples and verify their performance in simulations. 
We show that the learned protocols closely implement the desired policies.", "keywords": "communication networks;distributed protocols", "primary_area": "", "supplementary_material": "", "author": "Patrick Kr\u00e4mer;Johannes Zerwas;Andreas Blenk", "authorids": "~Patrick_Kr\u00e4mer1;johannes.zerwas@tum.de;andreas.blenk@tum.de", "gender": "M;;", "homepage": "https://www.ei.tum.de/en/lkn/team/staff/kraemer-patrick/;;", "dblp": ";;", "google_scholar": "https://scholar.google.de/citations?hl=de;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Patrick_Kr\u00e4mer1;johannes.zerwas@tum.de;andreas.blenk@tum.de", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkr{\\\"a}mer2022mistill,\ntitle={Mistill: Distilling Distributed Network Protocols from Examples},\nauthor={Patrick Kr{\\\"a}mer and Johannes Zerwas and Andreas Blenk},\nyear={2022},\nurl={https://openreview.net/forum?id=gijKplIZ2Y-}\n}", "github": "", "project": "", "reviewers": "hRA9;noo6;wBxW;qhEy", "site": "https://openreview.net/forum?id=gijKplIZ2Y-", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;4;2", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;0;0", "wc_summary_paper": "91;125;66;12", "wc_summary_review": "54;93;111;39", "wc_main_review": "366;442;634;244", "wc_review": "511;660;811;295", "wc_reply_reviewers": "0;0;35;65", "wc_reply_authors": "508;459;1130;577", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 73.5, 41.22196016688192 ], "wc_summary_review_avg": [ 74.25, 28.960101864461734 ], "wc_main_review_avg": [ 421.5, 141.56535593145662 ], "wc_review_avg": [ 569.25, 190.5811834888219 ], "wc_reply_reviewers_avg": [ 25.0, 27.15695122800054 ], "wc_reply_authors_avg": [ 668.5, 269.72439637526304 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7385489458759963, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14048154970753817539&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Spherical Message Passing for 3D Molecular Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6403", "id": "givsRXsOt9r", "poster": "", "openreview": "https://openreview.net/forum?id=givsRXsOt9r", "slides": "https://iclr.cc/virtual/2022/poster/6403", "video": "https://iclr.cc/virtual/2022/poster/6403", "author_site": "Yi Liu, Limei Wang, Meng Liu, Yuchao Lin, Xuan Zhang, Bora Oztekin, Shuiwang Ji", "tldr": "", "abstract": "We consider representation learning of 3D molecular graphs in which each atom is associated with a spatial position in 3D. This is an under-explored area of research, and a principled message passing framework is currently lacking. In this work, we conduct analyses in the spherical coordinate system (SCS) for the complete identification of 3D graph structures. Based on such observations, we propose the spherical message passing (SMP) as a novel and powerful scheme for 3D molecular learning. SMP dramatically reduces training complexity, enabling it to perform efficiently on large-scale molecules. 
In addition, SMP is capable of distinguishing almost all molecular structures, and the uncovered cases may not exist in practice. Based on meaningful physically-based representations of 3D information, we further propose the SphereNet for 3D molecular learning. Experimental results demonstrate that the use of meaningful 3D information in SphereNet leads to significant performance improvements in prediction tasks. Our results also demonstrate the advantages of SphereNet in terms of capability, efficiency, and scalability.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/3941ea8156d88c0d6e5ac6fda6dc05de8739a49f.zip", "author": "Yi Liu;Limei Wang;Meng Liu;Yuchao Lin;Xuan Zhang;Bora Oztekin;Shuiwang Ji", "authorids": "~Yi_Liu12;~Limei_Wang1;~Meng_Liu3;~Yuchao_Lin1;~Xuan_Zhang3;~Bora_Oztekin1;~Shuiwang_Ji1", "gender": ";;M;M;M;;M", "homepage": ";https://limei0307.github.io/;https://mengliu1998.github.io;https://kruskallin.github.io/;https://github.com/floatlazer;https://boraoztekin.com;http://people.tamu.edu/~sji", "dblp": ";57/2674;41/7841-15;322/5499;;276/1721.html;84/6405", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?view_op=list_works;k7tlPR0AAAAJ;BZGj6sAAAAAJ", "orcid": ";;;;;0000-0003-4766-4106;0000-0002-4205-4563", "linkedin": ";;meng-liu-4a1813197/;;;http://linkedin.com/in/boraoztekin;shuiwang-ji-9a040715/", "or_profile": "~Yi_Liu12;~Limei_Wang1;~Meng_Liu3;~Yuchao_Lin1;~Xuan_Zhang3;~Bora_Oztekin1;~Shuiwang_Ji1", "aff": ";Texas A&M;Texas A&M University - College Station;Texas A&M;Texas A&M;Texas A&M;Texas A&M University", "aff_domain": ";tamu.edu;tamu.edu;tamu.edu;tamu.edu;tamu.edu;tamu.edu", "position": ";PhD student;PhD student;PhD student;PhD student;Undergrad student;Professor", "bibtex": "@inproceedings{\nliu2022spherical,\ntitle={Spherical Message Passing for 3D Molecular Graphs},\nauthor={Yi Liu and Limei Wang and Meng Liu and Yuchao Lin and Xuan Zhang and Bora Oztekin and Shuiwang Ji},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=givsRXsOt9r}\n}", "github": "", "project": "", "reviewers": "epnb;6Y5X;xP3s", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "5;3;5", "correctness": "3;4;3", "technical_novelty": "3;3;4", "empirical_novelty": "2;3;4", "wc_summary_paper": "54;37;46", "wc_summary_review": "59;33;66", "wc_main_review": "204;362;347", "wc_review": "317;432;459", "wc_reply_reviewers": "1495;97;297", "wc_reply_authors": "4250;1021;1652", "reply_reviewers": "4;1;2", "reply_authors": "9;2;3", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 45.666666666666664, 6.944222218666553 ], "wc_summary_review_avg": [ 52.666666666666664, 14.197026292697903 ], "wc_main_review_avg": [ 304.3333333333333, 71.2101740546173 ], "wc_review_avg": [ 402.6666666666667, 61.57019480957397 ], "wc_reply_reviewers_avg": [ 629.6666666666666, 617.3066948896275 ], "wc_reply_authors_avg": [ 2307.6666666666665, 1397.386687912675 ], "reply_reviewers_avg": [ 2.3333333333333335, 1.247219128924647 ], "reply_authors_avg": [ 4.666666666666667, 3.0912061651652345 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 7, 0 ], 
"corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 228, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17315498793029338830&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=givsRXsOt9r", "email": ";tamu.edu;tamu.edu;tamu.edu;tamu.edu;tamu.edu;tamu.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Coherence-based Label Propagation over Time Series for Accelerated Active Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6012", "id": "gjNcH0hj0LM", "poster": "", "openreview": "https://openreview.net/forum?id=gjNcH0hj0LM", "slides": "https://iclr.cc/virtual/2022/poster/6012", "video": "https://iclr.cc/virtual/2022/poster/6012", "author_site": "Yooju Shin, Susik Yoon, Sundong Kim, Hwanjun Song, Jae-Gil Lee, Byung Suk Lee", "tldr": "", "abstract": "Time-series data are ubiquitous these days, but lack of the labels in time-series data is regarded as a hurdle for its broad applicability. Meanwhile, active learning has been successfully adopted to reduce the labeling efforts in various tasks. Thus, this paper addresses an important issue, time-series active learning. Inspired by the temporal coherence in time-series data, where consecutive data points tend to have the same label, our label propagation framework, called TCLP, automatically assigns a queried label to the data points within an accurately estimated time-series segment, thereby significantly boosting the impact of an individual query. 
Compared with traditional time-series active learning, TCLP is shown to improve the classification accuracy by up to 7.1 times when only 0.8% of data points in the entire time series are queried for their labels.", "keywords": "active learning;time series;pseudo labeling", "primary_area": "", "supplementary_material": "", "author": "Yooju Shin;Susik Yoon;Sundong Kim;Hwanjun Song;Jae-Gil Lee;Byung Suk Lee", "authorids": "~Yooju_Shin1;~Susik_Yoon1;~Sundong_Kim2;~Hwanjun_Song2;~Jae-Gil_Lee1;~Byung_Suk_Lee1", "gender": "M;;;M;M;", "homepage": ";http://www.susikyoon.com;;https://songhwanjun.github.io/;https://dm.kaist.ac.kr/jaegil/;", "dblp": "https://dblp.uni-trier.de/pid/242/5190;179/5307;;204/3381;28/3904;", "google_scholar": "https://scholar.google.com/citations?hl=en;tCJs1zEAAAAJ;;Ijzuc-8AAAAJ;https://scholar.google.com.tw/citations?user=h9mbv9MAAAAJ;", "orcid": "0000-0002-1395-9136;0000-0001-5596-4972;;0000-0002-1105-0818;0000-0002-8711-7732;", "linkedin": ";;;;;", "or_profile": "~Yooju_Shin1;~Susik_Yoon1;~Sundong_Kim2;~Hwanjun_Song2;~Jae-Gil_Lee1;~Byung_Suk_Lee1", "aff": "Korea Advanced Institute of Science & Technology;University of Illinois, Urbana Champaign;;NAVER CLOVA;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;illinois.edu;;navercorp.com;kaist.ac.kr;", "position": "PhD student;Postdoc;;Research Scientist;Associate Professor;", "bibtex": "@inproceedings{\nshin2022coherencebased,\ntitle={Coherence-based Label Propagation over Time Series for Accelerated Active Learning},\nauthor={Yooju Shin and Susik Yoon and Sundong Kim and Hwanjun Song and Jae-Gil Lee and Byung Suk Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gjNcH0hj0LM}\n}", "github": "", "project": "", "reviewers": "kJBx;pVjg;s4ST;hW9X", "pdf_size": 0, "recommendation": "6;6;6;10", "confidence": "3;3;2;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "4;4;3;4", "wc_summary_paper": "154;33;44;74", "wc_summary_review": "26;75;53;31", "wc_main_review": "479;178;287;179", "wc_review": "659;286;384;284", "wc_reply_reviewers": "174;0;320;0", "wc_reply_authors": "2211;1204;1580;658", "reply_reviewers": "3;0;2;0", "reply_authors": "5;2;3;2", "recommendation_avg": [ 7.0, 1.7320508075688772 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.25, 47.33061905363166 ], "wc_summary_review_avg": [ 46.25, 19.45989465541887 ], "wc_main_review_avg": [ 280.75, 122.73217793227658 ], "wc_review_avg": [ 403.25, 153.09045528706224 ], "wc_reply_reviewers_avg": [ 123.5, 133.85346465444965 ], "wc_reply_authors_avg": [ 1413.25, 565.3314846176534 ], "reply_reviewers_avg": [ 1.25, 1.299038105676658 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13195629994003166935&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=gjNcH0hj0LM", "email": "kaist.ac.kr;illinois.edu;;navercorp.com;kaist.ac.kr;", "author_num": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Illinois Urbana-Champaign;NAVER 
Corporation", "aff_unique_dep": ";;CLOVA", "aff_unique_url": "https://www.kaist.ac.kr;https://illinois.edu;https://www.naver.com", "aff_unique_abbr": "KAIST;UIUC;NAVER", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "South Korea;United States" }, { "id": "gmxgG6_BL_N", "title": "Variational Component Decoder for Source Extraction from Nonlinear Mixture", "track": "main", "status": "Reject", "tldr": "", "abstract": "In many practical scenarios of signal extraction from a nonlinear mixture, only one (signal) source is intended to be extracted. However, modern methods involving Blind Source Separation are inefficient for this task since they are designed to recover all sources in the mixture. In this paper, we propose supervised Variational Component Decoder (sVCD) as a method dedicated to extracting a single source from nonlinear mixture. sVCD leverages the sequence-to-sequence (Seq2Seq) translation ability of a specially designed neural network to approximate a nonlinear inverse of the mixture process, assisted by priors of the interested source. In order to maintain the robustness in the face of real-life samples, sVCD combines Seq2Seq with variational inference to form a deep generative model, and it is trained by optimizing a variant of variational bound on the data likelihood concerning only the interested source. We demonstrate that sVCD has superior performance on nonlinear source extraction over a state-of-the-art method on diverse datasets, including artificially generated sequences, radio frequency (RF) sensing data, and electroencephalogram (EEG) results.", "keywords": "Source Extraction;Variational Inference;Disentanglement", "primary_area": "", "supplementary_material": "/attachment/fed69df30ace6dc1112b63462959412a71b62603.zip", "author": "Shujie Zhang;Tianyue Zheng;Zhe Chen;Jun Luo;Sinno Pan", "authorids": "~Shujie_Zhang1;~Tianyue_Zheng2;~Zhe_Chen8;~Jun_Luo6;~Sinno_Pan1", "gender": ";M;M;M;M", "homepage": "https://shujiez.github.io/index.html;;https://scholar.google.com/citations?user=hhl5-78AAAAJ&hl=en;https://personal.ntu.edu.sg/junluo/;http://www.cse.cuhk.edu.hk/~sinnopan/", "dblp": ";;;42/2501-1.html;80/5412", "google_scholar": ";A-YvDJgAAAAJ;hhl5-78AAAAJ;https://scholar.google.com.sg/citations?user=qXLjz30AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-7036-5158;", "linkedin": ";;;;", "or_profile": "~Shujie_Zhang1;~Tianyue_Zheng2;~Zhe_Chen8;~Jun_Luo6;~Sinno_Pan1", "aff": "Nanyang Technological University;Nanyang Technological University;Fudan University;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;fudan.edu.cn;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;PhD student;Researcher;Associate Professor;Associate Professor", "bibtex": "@misc{\nzhang2022variational,\ntitle={Variational Component Decoder for Source Extraction from Nonlinear Mixture},\nauthor={Shujie Zhang and Tianyue Zheng and Zhe Chen and Jun Luo and Sinno Pan},\nyear={2022},\nurl={https://openreview.net/forum?id=gmxgG6_BL_N}\n}", "github": "", "project": "", "reviewers": "aZ4J;epBn;MyNd;LaJi", "site": "https://openreview.net/forum?id=gmxgG6_BL_N", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "4;4;4;2", "correctness": "3;3;4;4", "technical_novelty": "3;3;4;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "61;101;68;49", "wc_summary_review": "55;189;121;53", "wc_main_review": "578;495;59;150", "wc_review": 
"694;785;248;252", "wc_reply_reviewers": "177;598;0;0", "wc_reply_authors": "892;1837;102;132", "reply_reviewers": "1;3;0;0", "reply_authors": "3;6;1;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.75, 19.279198634798075 ], "wc_summary_review_avg": [ 104.5, 55.9352304008842 ], "wc_main_review_avg": [ 320.5, 220.34575103686478 ], "wc_review_avg": [ 494.75, 246.85965142161243 ], "wc_reply_reviewers_avg": [ 193.75, 244.3239396784523 ], "wc_reply_authors_avg": [ 740.75, 707.6755524249795 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.75, 2.0463381929681126 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5443310539518174, "corr_recommendation_correctness": 0.9428090415820635, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17609020062074971547&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Nanyang Technological University;Fudan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.fudan.edu.cn", "aff_unique_abbr": "NTU;Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Singapore;China" }, { "title": "Reverse Engineering of Imperceptible Adversarial Image Perturbations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6892", "id": "gpp7cf0xdfN", "poster": "", "openreview": "https://openreview.net/forum?id=gpp7cf0xdfN", "slides": "https://iclr.cc/virtual/2022/poster/6892", "video": "https://iclr.cc/virtual/2022/poster/6892", "author_site": "Yifan Gong, Yuguang Yao, Yize Li, Yimeng Zhang, Xiaoming Liu, Xue Lin, Sijia Liu", "tldr": "", "abstract": "It has been well recognized that neural network based image classifiers are easily fooled by images with tiny perturbations crafted by an adversary. There has been a vast volume of research to generate and defend such adversarial attacks. However, the following problem is left unexplored: How to reverse-engineer adversarial perturbations from an adversarial image? This leads to a new adversarial learning paradigm\u2014Reverse Engineering of Deceptions (RED). If successful, RED allows us to estimate adversarial perturbations and recover the original images. However, carefully crafted, tiny adversarial perturbations are difficult to recover by optimizing a unilateral RED objective. For example, the pure image denoising method may overfit to minimizing the reconstruction error but hardly preserve the classification properties of the true adversarial perturbations. To tackle this challenge, we formalize the RED problem and identify a set of principles crucial to the RED approach design. Particularly, we find that prediction alignment and proper data augmentation (in terms of spatial transformations) are two criteria to achieve a generalizable RED approach. By integrating these RED principles with image denoising, we propose a new Class-Discriminative Denoising based RED framework, termed CDD-RED. 
Extensive experiments demonstrate the effectiveness of CDD-RED under different evaluation metrics (ranging from the pixel-level, prediction-level to the attribution-level alignment) and a variety of attack generation methods (e.g., FGSM, PGD, CW, AutoAttack, and adaptive attacks).", "keywords": "Reverse Engineering of Deceptions;adversarial examples;denoising;neural networks;interpretability", "primary_area": "", "supplementary_material": "", "author": "Yifan Gong;Yuguang Yao;Yize Li;Yimeng Zhang;Xiaoming Liu;Xue Lin;Sijia Liu", "authorids": "~Yifan_Gong2;~Yuguang_Yao1;~Yize_Li1;~Yimeng_Zhang2;~Xiaoming_Liu2;~Xue_Lin1;~Sijia_Liu1", "gender": "F;M;M;M;M;F;M", "homepage": "https://yifanfanfanfan.github.io/;https://www.cse.msu.edu/~yaoyugua/;;https://damon-demon.github.io;http://www.cse.msu.edu/~liuxm/;https://coe.northeastern.edu/people/lin-xue/;https://lsjxjtu.github.io/", "dblp": "49/3073-4.html;238/9467;68/2953;;l/XiaomingLiu0002;;128/6972-1", "google_scholar": "U_gevVgAAAAJ;-chIdAkAAAAJ;_wzQ1EgAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;p87KNLIAAAAJ;C7dO_UgAAAAJ", "orcid": "0000-0002-3912-097X;;;0000-0003-1608-2541;;0000-0001-6210-8883;", "linkedin": "yifan-gong-3059b8132/;tonyyaomsu/;yize-li-bb6112192/;;xiaoming-liu-5a7807b/;;", "or_profile": "~Yifan_Gong2;~Yuguang_Yao1;~Yize_Li1;~Yimeng_Zhang2;~Xiaoming_Liu2;~Xue_Lin1;~Sijia_Liu1", "aff": "Northeastern University;Michigan State University;Northeastern University;Intel;Michigan State University;Northeastern University;Michigan State University", "aff_domain": "neu.edu;msu.edu;neu.edu;intel.com;msu.edu;neu.edu;msu.edu", "position": "PhD student;PhD student;PhD student;Intern;Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ngong2022reverse,\ntitle={Reverse Engineering of Imperceptible Adversarial Image Perturbations},\nauthor={Yifan Gong and Yuguang Yao and Yize Li and Yimeng Zhang and Xiaoming Liu and Xue Lin and Sijia Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=gpp7cf0xdfN}\n}", "github": "", "project": "", "reviewers": "e8YB;T72q;Ah1k", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;2;4", "correctness": "4;3;3", "technical_novelty": "2;2;4", "empirical_novelty": "3;2;4", "wc_summary_paper": "109;70;75", "wc_summary_review": "60;126;191", "wc_main_review": "394;159;133", "wc_review": "563;355;399", "wc_reply_reviewers": "60;26;133", "wc_reply_authors": "2639;2198;792", "reply_reviewers": "1;1;1", "reply_authors": "7;7;3", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 84.66666666666667, 17.326921891156037 ], "wc_summary_review_avg": [ 125.66666666666667, 53.48104544810453 ], "wc_main_review_avg": [ 228.66666666666666, 117.38919124954487 ], "wc_review_avg": [ 439.0, 89.50232771647153 ], "wc_reply_reviewers_avg": [ 73.0, 44.63929509598765 ], "wc_reply_authors_avg": [ 1876.3333333333333, 787.5930985533639 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 5.666666666666667, 1.8856180831641267 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 26, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=16789227603564642801&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=gpp7cf0xdfN", "email": "neu.edu;msu.edu;neu.edu;intel.com;msu.edu;neu.edu;msu.edu", "author_num": 7, "aff_unique_index": "0;1;0;2;1;0;1", "aff_unique_norm": "Northeastern University;Michigan State University;Intel", "aff_unique_dep": ";;Intel Corporation", "aff_unique_url": "https://www.northeastern.edu;https://www.msu.edu;https://www.intel.com", "aff_unique_abbr": "NEU;MSU;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "gtvM-nBZEbc", "title": "Learning Visual-Linguistic Adequacy, Fidelity, and Fluency for Novel Object Captioning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Novel object captioning (NOC) learns image captioning models for describing objects or visual concepts which are unseen (i.e., novel) in the training captions. Such captioning models need to sufficiently describe such visual data with fluent and natural language expression. In other words, we expect the produced captions being linguistically fluent, containing novel objects of interest, and fitting the visual concept of the image. The above three aspects thus correspond to fluency, fidelity, and adequacy, respectively. However, most novel object captioning models are not explicitly designed to address the aforementioned properties due to the absence of caption annotations. In this paper, we start by providing an insight into the relationship between the above properties and existing visual/language models. Then, we present VLAF2, for learning Visual-Linguistic Adequacy, Fidelity, and Fluency, which utilizes linguistics observed from captions for describing visual information of images with novel objects. More specifically, we revisit BERT and CLIP, and explain how we leverage the intrinsic language knowledge from such popular models to reward captions with precise and rich visual content associated with novel images. To validate the effectiveness of our framework, we conduct extensive experiments on the nocaps dataset. Our method not only performs favorably against state-of-the-art novel captioning models in all caption evaluation metrics, but also surpasses the SPICE scores of human baseline. We perform quantitative and qualitative analysis to demonstrate how our model generates novel object captions with improved fluency, fidelity, and adequacy. 
Implementation details and code are available in the supplementary materials.", "keywords": "Semi-supervised Image Captioning;Novel Object Captioning", "primary_area": "", "supplementary_material": "/attachment/a17cdb6ddb90c1e39b875279720a8528b7e7b10e.zip", "author": "Cheng-Fu Yang;Yao-Hung Hubert Tsai;Wan-Cyuan Fan;Yu-Chiang Frank Wang;Louis-Philippe Morency;Ruslan Salakhutdinov", "authorids": "~Cheng-Fu_Yang1;~Yao-Hung_Hubert_Tsai1;~Wan-Cyuan_Fan1;~Yu-Chiang_Frank_Wang2;~Louis-Philippe_Morency1;~Ruslan_Salakhutdinov1", "gender": "M;M;M;M;M;M", "homepage": "https://joeyy5588.github.io/;;http://vllab.ee.ntu.edu.tw/ycwang.html;https://www.cs.cmu.edu/~morency/;https://www.cs.cmu.edu/~rsalakhu/;https://sites.google.com/view/wancyuanfan", "dblp": "51/8564;154/3702;30/1690;31/739;;300/5836", "google_scholar": "https://scholar.google.com.tw/citations?user=cJ5oowQAAAAJ;;HSGvdtoAAAAJ;https://scholar.google.com.tw/citations?user=APgaFK0AAAAJ;;EIPHoLEAAAAJ", "orcid": ";;0000-0002-2333-157X;0000-0001-6376-7696;;", "linkedin": ";;;morency;;", "or_profile": "~Cheng-Fu_Yang1;~Yao-Hung_Hubert_Tsai1;~Yu-Chiang_Frank_Wang2;~Louis-Philippe_Morency1;~Russ_Salakhutdinov1;~WanCyuan_Fan1", "aff": "University of California, Los Angeles;Apple;National Taiwan University;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University;National Taiwan University", "aff_domain": "cs.ucla.edu;apple.com;ntu.edu.tw;cmu.edu;cs.cmu.edu;ntu.edu.tw", "position": "PhD student;Principal Researcher;Full Professor;Associate Professor;Full Professor;MS student", "bibtex": "@misc{\nyang2022learning,\ntitle={Learning Visual-Linguistic Adequacy, Fidelity, and Fluency for Novel Object Captioning},\nauthor={Cheng-Fu Yang and Yao-Hung Hubert Tsai and Wan-Cyuan Fan and Yu-Chiang Frank Wang and Louis-Philippe Morency and Ruslan Salakhutdinov},\nyear={2022},\nurl={https://openreview.net/forum?id=gtvM-nBZEbc}\n}", "github": "", "project": "", "reviewers": "1659;2cGW;F11g;PRhm", "site": "https://openreview.net/forum?id=gtvM-nBZEbc", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;0;4", "wc_summary_paper": "111;82;56;200", "wc_summary_review": "30;37;21;115", "wc_main_review": "263;388;264;425", "wc_review": "404;507;341;740", "wc_reply_reviewers": "52;95;0;0", "wc_reply_authors": "1696;2126;1562;1129", "reply_reviewers": "1;1;0;0", "reply_authors": "3;4;3;3", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 112.25, 54.26958171941258 ], "wc_summary_review_avg": [ 50.75, 37.525824441309744 ], "wc_main_review_avg": [ 335.0, 72.68768809090024 ], "wc_review_avg": [ 498.0, 151.76462038301284 ], "wc_reply_reviewers_avg": [ 36.75, 39.77043499887825 ], "wc_reply_authors_avg": [ 1628.25, 355.66302520785035 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.25, 
0.4330127018922193 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ePN-UuNLH7gJ:scholar.google.com/&scioq=Learning+Visual-Linguistic+Adequacy,+Fidelity,+and+Fluency+for+Novel+Object+Captioning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;3;2", "aff_unique_norm": "University of California, Los Angeles;Apple;National Taiwan University;Carnegie Mellon University", "aff_unique_dep": ";Apple Inc.;;", "aff_unique_url": "https://www.ucla.edu;https://www.apple.com;https://www.ntu.edu.tw;https://www.cmu.edu", "aff_unique_abbr": "UCLA;Apple;NTU;CMU", "aff_campus_unique_index": "0;2;3;2", "aff_campus_unique": "Los Angeles;;Taiwan;Pittsburgh", "aff_country_unique_index": "0;0;1;0;0;1", "aff_country_unique": "United States;China" }, { "id": "gxRcqTbJpVW", "title": "Structured Pruning Meets Orthogonality", "track": "main", "status": "Reject", "tldr": "", "abstract": "Several recent works empirically found finetuning learning rate is crucial to the final performance in structured neural network pruning. It is shown that the \\emph{dynamical isometry} broken by pruning answers for this phenomenon. How to develop a filter pruning method that maintains or recovers dynamical isometry \\emph{and} is scalable to modern deep networks remains elusive up to now. In this paper, we present \\emph{orthogonality preserving pruning} (OPP), a regularization-based structured pruning method that maintains the dynamical isometry during pruning. Specifically, OPP regularizes the gram matrix of convolutional kernels to encourage kernel orthogonality among the important filters meanwhile driving the unimportant weights towards zero. We also propose to regularize batch-normalization parameters for better preserving dynamical isometry for the whole network. Empirically, OPP can compete with the \\emph{ideal} dynamical isometry recovery method on linear networks. On non-linear networks (ResNet56/VGG19, CIFAR datasets), it outperforms the available solutions \\emph{by a large margin}. Moreover, OPP can also work effectively with modern deep networks (ResNets) on ImageNet, delivering encouraging performance in comparison to many recent filter pruning methods. 
To the best of our knowledge, this is the \\emph{first} method that effectively maintains dynamical isometry during pruning for \\emph{large-scale} deep neural networks.", "keywords": "network pruning;structured pruning;dynamical isometry;model compression", "primary_area": "", "supplementary_material": "", "author": "Huan Wang;Yun Fu", "authorids": "~Huan_Wang3;~Yun_Fu1", "gender": "M;M", "homepage": "https://huanwang.tech/;http://www1.ece.neu.edu/~yunfu/", "dblp": "70/6155-14;00/5815-1", "google_scholar": "0-On0y4AAAAJ;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": "0000-0001-6951-901X;0000-0002-5098-2853", "linkedin": "huanwang-zju/;furaymond/", "or_profile": "~Huan_Wang3;~Yun_Fu1", "aff": "Northeastern University;Northeastern University", "aff_domain": "neu.edu;northeastern.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nwang2022structured,\ntitle={Structured Pruning Meets Orthogonality},\nauthor={Huan Wang and Yun Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=gxRcqTbJpVW}\n}", "github": "", "project": "", "reviewers": "idn1;8kct;ipz6;QYz3", "site": "https://openreview.net/forum?id=gxRcqTbJpVW", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "5;4;5;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "58;23;35;71", "wc_summary_review": "75;33;19;24", "wc_main_review": "313;63;116;293", "wc_review": "446;119;170;388", "wc_reply_reviewers": "434;0;123;58", "wc_reply_authors": "1317;409;858;887", "reply_reviewers": "1;0;1;1", "reply_authors": "3;3;2;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 46.75, 18.819869818890883 ], "wc_summary_review_avg": [ 37.75, 22.083647796503186 ], "wc_main_review_avg": [ 196.25, 108.61255682470605 ], "wc_review_avg": [ 280.75, 138.95930159582696 ], "wc_reply_reviewers_avg": [ 153.75, 167.55055207309823 ], "wc_reply_authors_avg": [ 867.75, 321.2252908785359 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J_HQfhSHyBUJ:scholar.google.com/&scioq=Structured+Pruning+Meets+Orthogonality&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "gxk4-rVATDA", "title": "Bit-wise Training of Neural Network Weights", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose an algorithm where the individual bits representing the weights of a neural network are learned. This method allows training weights with integer values on arbitrary bit-depths and naturally uncovers sparse networks, without additional constraints or regularization techniques. We show better results than the standard training technique for fully connected networks and similar performance for residual networks. 
By training bits in a selective manner, we found that the biggest contribution to achieving high accuracy is given by the first three most significant bits, while the rest provide an intrinsic regularization. As a consequence, we show that more than 90% of a network can be used to store arbitrary codes without affecting its accuracy. These codes can be random noise, binary files, or even the weights of previously trained networks.", "keywords": "quantization;pruning;bit-wise training;resnet;lenet", "primary_area": "", "supplementary_material": "", "author": "Cristian Ivan", "authorids": "~Cristian_Ivan1", "gender": "", "homepage": "", "dblp": "241/7137", "google_scholar": "e5SSOkkAAAAJ", "orcid": "0000-0001-5719-2232", "linkedin": "", "or_profile": "~Cristian_Ivan1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nivan2022bitwise,\ntitle={Bit-wise Training of Neural Network Weights},\nauthor={Cristian Ivan},\nyear={2022},\nurl={https://openreview.net/forum?id=gxk4-rVATDA}\n}", "github": "", "project": "", "reviewers": "MjPw;LhyK;xT9U;gsV7", "site": "https://openreview.net/forum?id=gxk4-rVATDA", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;4;3", "correctness": "1;2;3;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "16;110;122;71", "wc_summary_review": "34;45;25;29", "wc_main_review": "259;225;396;193", "wc_review": "309;380;543;293", "wc_reply_reviewers": "23;0;0;13", "wc_reply_authors": "347;318;506;312", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 79.75, 41.35441330740892 ], "wc_summary_review_avg": [ 33.25, 7.495832175282475 ], "wc_main_review_avg": [ 268.25, 77.36076201796361 ], "wc_review_avg": [ 381.25, 98.96053506322609 ], "wc_reply_reviewers_avg": [ 9.0, 9.669539802906858 ], "wc_reply_authors_avg": [ 370.75, 79.20029987316967 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1799483053419260749&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "gzeruP-0J29", "title": "Revisiting and Advancing Fast Adversarial Training Through the lens of Bi-Level Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial training (AT) has become a widely recognized defense mechanism to improve the robustness of deep neural networks against adversarial attacks. It originates from solving a min-max optimization problem, where the minimizer (i.e., defender) seeks a robust model to minimize the worst-case training loss in the presence of adversarial examples crafted by the maximizer (i.e., attacker). However, the min-max nature makes AT computationally intensive and thus difficult to scale. Thus, the problem of FAST-AT arises. Nearly all the recent progress is achieved based on the following simplification: The iterative attack generation method used in the maximization step of AT is replaced by the simplest one-shot gradient sign-based PGD method. 
Nevertheless, FAST-AT is far from satisfactory, and it lacks a theoretically grounded design. For example, a FAST-AT method may suffer from robustness catastrophic overfitting when training with strong adversaries.\n\nIn this paper, we foster a technological breakthrough for designing FAST-AT through the lens of bi-level optimization (BLO) instead of min-max optimization. First, we theoretically show that the most commonly-used algorithmic specification of FAST-AT is equivalent to the linearized BLO along the direction given by the sign of the input gradient. Second, with the aid of BLO, we develop a new systematic and effective fast bi-level AT framework, termed FAST-BAT, whose algorithm is rigorously derived by leveraging the theory of implicit gradients. In contrast to FAST-AT, FAST-BAT imposes the fewest restrictions on the tradeoff between computational efficiency and adversarial robustness. For example, it is capable of defending against sign-based projected gradient descent (PGD) attacks without calling any gradient sign method or explicit robust regularization during training. Furthermore, we empirically show that our method outperforms state-of-the-art FAST-AT baselines. In particular, FAST-BAT can achieve superior model robustness without inducing robustness catastrophic overfitting or losing standard accuracy.", "keywords": "fast adversarial training;bi-level optimization;adversarial robustness;adversarial defense", "primary_area": "", "supplementary_material": "/attachment/6daee504be253cd48d2a567275ed44317ecff763.zip", "author": "Yihua Zhang;Guanhua Zhang;Prashant Khanduri;Mingyi Hong;Shiyu Chang;Sijia Liu", "authorids": "~Yihua_Zhang1;~Guanhua_Zhang1;~Prashant_Khanduri1;~Mingyi_Hong1;~Shiyu_Chang2;~Sijia_Liu1", "gender": "M;;M;M;Unspecified;M", "homepage": "https://yihua-zhang.com;;https://sites.google.com/view/khanduri-prashant/home?authuser=0;http://people.ece.umn.edu/~mhong/mingyi.html;http://people.csail.mit.edu/chang87/;https://lsjxjtu.github.io/", "dblp": ";171/0962.html;158/4888;57/8053;28/9988;128/6972-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;_hrEN-sAAAAJ;;qRnP-p0AAAAJ;r21asW4AAAAJ;C7dO_UgAAAAJ", "orcid": ";;;;;", "linkedin": "zhangyihua/;;prashant-khanduri-0497894b/;;;", "or_profile": "~Yihua_Zhang1;~Guanhua_Zhang1;~Prashant_Khanduri1;~Mingyi_Hong1;~Shiyu_Chang2;~Sijia_Liu1", "aff": "Michigan State University;;University of Minnesota, Minneapolis;University of Minnesota, Minneapolis;University of California, Santa Barbara;Michigan State University", "aff_domain": "msu.edu;;umn.edu;umn.edu;ucsb.edu;msu.edu", "position": "PhD student;;Postdoc;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nzhang2022revisiting,\ntitle={Revisiting and Advancing Fast Adversarial Training Through the lens of Bi-Level Optimization},\nauthor={Yihua Zhang and Guanhua Zhang and Prashant Khanduri and Mingyi Hong and Shiyu Chang and Sijia Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=gzeruP-0J29}\n}", "github": "", "project": "", "reviewers": "6Qs9;N6su;c58a;TZX8", "site": "https://openreview.net/forum?id=gzeruP-0J29", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;5;4;4", "correctness": "2;2;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "149;68;76;163", "wc_summary_review": "72;31;268;68", "wc_main_review": "360;197;500;547", "wc_review": "581;296;844;778", "wc_reply_reviewers": "37;83;380;0", "wc_reply_authors": "1721;1172;3381;1899", "reply_reviewers": "1;1;2;0", 
"reply_authors": "5;4;9;4", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 114.0, 42.38513890504548 ], "wc_summary_review_avg": [ 109.75, 92.75336921104268 ], "wc_main_review_avg": [ 401.0, 136.39464798884157 ], "wc_review_avg": [ 624.75, 213.0415159071114 ], "wc_reply_reviewers_avg": [ 125.0, 150.13160893029823 ], "wc_reply_authors_avg": [ 2043.25, 817.5121940986569 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 5.5, 2.0615528128088303 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13176476866209995495&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Michigan State University;University of Minnesota;University of California, Santa Barbara", "aff_unique_dep": ";;", "aff_unique_url": "https://www.msu.edu;https://www.minnesota.edu;https://www.ucsb.edu", "aff_unique_abbr": "MSU;UMN;UCSB", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Minneapolis;Santa Barbara", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "h-z_zqT2yJU", "title": "Reducing the Teacher-Student Gap via Adaptive Temperatures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Knowledge distillation aims to obtain a small and effective deep model (student) by learning the output from a larger model (teacher). Previous studies found a severe degradation problem, that student performance would degrade unexpectedly when distilled from oversized teachers. It is well known that larger models tend to have sharper outputs. Based on this observation, we found that the sharpness gap between the teacher and student output may cause this degradation problem. To solve this problem, we first propose a metric to quantify the sharpness of the model output. Based on the second-order Taylor expansion of this metric, we propose Adaptive Temperature Knowledge Distillation (ATKD), which automatically changes the temperature of the teacher and the student, to reduce the sharpness gap. We conducted extensive experiments on CIFAR100 and ImageNet and achieved significant improvements. 
Specifically, to the best of our knowledge, ATKD trains the best ResNet18 model on ImageNet (73.0% accuracy).", "keywords": "Soft Labels;Knowledge distillation", "primary_area": "", "supplementary_material": "/attachment/a4e31041a1a96970cec25e342331fa44d2d06d91.zip", "author": "Jia Guo", "authorids": "~Jia_Guo2", "gender": "M", "homepage": "https://forjiuzhou.github.io/", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Jia_Guo2", "aff": "XiaoHongshu", "aff_domain": "xiaohongshu.com", "position": "Researcher", "bibtex": "@misc{\nguo2022reducing,\ntitle={Reducing the Teacher-Student Gap via Adaptive Temperatures},\nauthor={Jia Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=h-z_zqT2yJU}\n}", "github": "", "project": "", "reviewers": "iJMG;Yenx;6R6z;iBXp", "site": "https://openreview.net/forum?id=h-z_zqT2yJU", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "48;75;103;77", "wc_summary_review": "21;46;27;65", "wc_main_review": "112;124;118;304", "wc_review": "181;245;248;446", "wc_reply_reviewers": "0;66;69;104", "wc_reply_authors": "210;323;958;707", "reply_reviewers": "0;1;1;2", "reply_authors": "1;1;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.75, 19.45989465541887 ], "wc_summary_review_avg": [ 39.75, 17.25362280797862 ], "wc_main_review_avg": [ 164.5, 80.65203035262039 ], "wc_review_avg": [ 280.0, 99.50628120877596 ], "wc_reply_reviewers_avg": [ 59.75, 37.59238619720754 ], "wc_reply_authors_avg": [ 549.5, 299.26618586134987 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7828476785254167290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Xiaohongshu", "aff_unique_dep": "", "aff_unique_url": "https://www.xiaohongshu.com", "aff_unique_abbr": "XHS", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Illiterate DALL-E Learns to Compose", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6051", "id": "h0OYV0We3oh", "poster": "", "openreview": "https://openreview.net/forum?id=h0OYV0We3oh", "slides": "https://iclr.cc/virtual/2022/poster/6051", "video": "https://iclr.cc/virtual/2022/poster/6051", "author_site": "Gautam Singh, Fei Deng, Sungjin Ahn", "tldr": "", "abstract": "Although DALL-E has shown an impressive ability of composition-based systematic generalization in image generation, it requires a dataset of text-image pairs, and the compositionality is provided by the text. In contrast, object-centric representation models like the Slot Attention model learn composable representations without the text prompt. However, unlike DALL-E, its ability to systematically generalize for zero-shot generation is significantly limited. 
In this paper, we propose a simple but novel slot-based autoencoding architecture, called SLATE, for combining the best of both worlds: learning object-centric representations that allow systematic generalization in zero-shot image generation without text. As such, this model can also be seen as an illiterate DALL-E model. Unlike the pixel-mixture decoders of existing object-centric representation models, we propose to use the Image GPT decoder conditioned on the slots for capturing complex interactions among the slots and pixels. In experiments, we show that this simple and easy-to-implement architecture not requiring a text prompt achieves significant improvement in in-distribution and out-of-distribution (zero-shot) image generation and qualitatively comparable or better slot-attention structure than the models based on mixture decoders.", "keywords": "Zero-Shot Image Generation;Compositional Representation;Object-Centric Representation;Out-of-Distribution Generalization;Image Transformers", "primary_area": "", "supplementary_material": "", "author": "Gautam Singh;Fei Deng;Sungjin Ahn", "authorids": "~Gautam_Singh3;~Fei_Deng1;~Sungjin_Ahn1", "gender": "M;M;", "homepage": "https://singhgautam.github.io;;", "dblp": "35/2642;46/10037-1;", "google_scholar": "lXpFxDwAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;", "linkedin": "gautam-singh-61302463/;;", "or_profile": "~Gautam_Singh3;~Fei_Deng1;~Sungjin_Ahn1", "aff": "Rutgers University;Rutgers University;", "aff_domain": "rutgers.edu;rutgers.edu;", "position": "PhD student;PhD student;", "bibtex": "@inproceedings{\nsingh2022illiterate,\ntitle={Illiterate {DALL}-E Learns to Compose},\nauthor={Gautam Singh and Fei Deng and Sungjin Ahn},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=h0OYV0We3oh}\n}", "github": "", "project": "", "reviewers": "8H9i;KXKd;o6Nv", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "5;4;4", "correctness": "4;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "52;30;74", "wc_summary_review": "153;14;47", "wc_main_review": "653;104;291", "wc_review": "858;148;412", "wc_reply_reviewers": "1;0;89", "wc_reply_authors": "1525;683;1001", "reply_reviewers": "1;0;1", "reply_authors": "5;2;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 52.0, 17.962924780409974 ], "wc_summary_review_avg": [ 71.33333333333333, 59.297742134268674 ], "wc_main_review_avg": [ 349.3333333333333, 227.89227474596169 ], "wc_review_avg": [ 472.6666666666667, 293.0134619584719 ], "wc_reply_reviewers_avg": [ 30.0, 41.72129751897305 ], "wc_reply_authors_avg": [ 1069.6666666666667, 347.15734505008277 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 159, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4019676252892800886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=h0OYV0We3oh", "email": "rutgers.edu;rutgers.edu;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Rutgers University", 
"aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "h4EOymDV3vV", "title": "Diffusion-Based Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Score-based methods represented as stochastic differential equations on a continuous time domain have recently proven successful as a non-adversarial generative model. Training such models relies on denoising score matching, which can be seen as multi-scale denoising autoencoders. Here, we augment the denoising score-matching framework to enable representation learning without any supervised signal. GANs and VAEs learn representations by directly transforming latent codes to data samples. In contrast, the introduced diffusion based representation learning relies on a new formulation of the denoising score-matching objective and thus encodes information needed for denoising. We illustrate how this difference allows for manual control of the level of details encoded in the representation. Using the same approach, we propose to learn an infinite-dimensional latent code which achieves improvements of state-of-the-art models on semi-supervised image classification. As a side contribution, we show how adversarial training in score-based models can improve sample quality and improve sampling speed using a new approximation of the prior at smaller noise scales.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9fcbf84926c27f65997509eaacce4e305290b141.zip", "author": "Korbinian Abstreiter;Stefan Bauer;Bernhard Sch\u00f6lkopf;Arash Mehrjou", "authorids": "~Korbinian_Abstreiter1;~Stefan_Bauer1;~Bernhard_Sch\u00f6lkopf1;~Arash_Mehrjou1", "gender": ";;;M", "homepage": "https://github.com/KorbinianBstrtr;https://cifar.ca/bios/stefan-bauer/;;https://distantvantagepoint.com", "dblp": ";;;174/1295", "google_scholar": ";O-oICE8AAAAJ;;pnypNygAAAAJ", "orcid": ";;;0000-0002-3832-7784", "linkedin": ";;;arash-mehrjou/", "or_profile": "~Korbinian_Abstreiter1;~Stefan_Bauer1;~Bernhard_Sch\u00f6lkopf1;~Arash_Mehrjou1", "aff": ";KTH Royal Institute of Technology;;GlaxoSmithKlein", "aff_domain": ";kth.se;;gsk.ai", "position": ";Assistant Professor;;Researcher", "bibtex": "@misc{\nabstreiter2022diffusionbased,\ntitle={Diffusion-Based Representation Learning},\nauthor={Korbinian Abstreiter and Stefan Bauer and Bernhard Sch{\\\"o}lkopf and Arash Mehrjou},\nyear={2022},\nurl={https://openreview.net/forum?id=h4EOymDV3vV}\n}", "github": "", "project": "", "reviewers": "XjvM;pE4h;VdM1;Wb9v", "site": "https://openreview.net/forum?id=h4EOymDV3vV", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;5;4;3", "correctness": "3;4;2;4", "technical_novelty": "3;4;2;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "75;117;56;66", "wc_summary_review": "27;94;31;72", "wc_main_review": "215;368;349;140", "wc_review": "317;579;436;278", "wc_reply_reviewers": "85;119;54;75", "wc_reply_authors": "480;824;830;269", "reply_reviewers": "1;1;1;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.5, 23.221757039466244 ], "wc_summary_review_avg": [ 56.0, 28.1336097932704 ], 
"wc_main_review_avg": [ 268.0, 94.54364071686683 ], "wc_review_avg": [ 402.5, 117.3509693185361 ], "wc_reply_reviewers_avg": [ 83.25, 23.47738273317535 ], "wc_reply_authors_avg": [ 600.75, 238.24081829107286 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7608859102526822, "corr_recommendation_correctness": 0.3458572319330373, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6726524037722471337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "KTH Royal Institute of Technology;GlaxoSmithKline", "aff_unique_dep": ";", "aff_unique_url": "https://www.kth.se;https://www.gsk.com", "aff_unique_abbr": "KTH;GSK", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Sweden;United Kingdom" }, { "id": "hB2HIO39r8G", "title": "Refining Multimodal Representations using a modality-centric self-supervised module", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Tasks that rely on multi-modal information typically include a fusion module that combines information from different modalities. In this work, we develop a self-supervised module, called REFINER, that refines multimodal representations using a decoding/defusing module applied downstream of the fused embedding. REFINER imposes a modality-centric responsibility condition, ensuring that both unimodal and fused representations are strongly encoded in the latent fusion space. Our approach provides both stronger generalization and reduced over-fitting. REFINER is only applied during training time keeping the inference time intact. The modular nature of REFINER lends itself to be combined with different fusion architectures easily. 
We demonstrate the power of REFINER on three datasets over powerful baseline fusion modules, and further show that it gives a significant performance boost in few-shot learning tasks.", "keywords": "Multimodal modeling;self-supervision;metric learning", "primary_area": "", "supplementary_material": "", "author": "Sethuraman Sankaran;David Yang;Ser-Nam Lim", "authorids": "~Sethuraman_Sankaran2;dzyang@fb.com;~Ser-Nam_Lim3", "gender": "M;;", "homepage": "http://www.sethusankaran.com/;;", "dblp": ";;", "google_scholar": "KyAReCEAAAAJ;;", "orcid": ";;", "linkedin": "sethuraman-sankaran-48a26614/;;", "or_profile": "~Sethuraman_Sankaran2;dzyang@fb.com;~Ser-Nam_Lim3", "aff": "Meta Facebook;;", "aff_domain": "facebook.com;;", "position": "Researcher;;", "bibtex": "@misc{\nsankaran2022refining,\ntitle={Refining Multimodal Representations using a modality-centric self-supervised module},\nauthor={Sethuraman Sankaran and David Yang and Ser-Nam Lim},\nyear={2022},\nurl={https://openreview.net/forum?id=hB2HIO39r8G}\n}", "github": "", "project": "", "reviewers": "RxU1;NjUK;sr5x;1vqe", "site": "https://openreview.net/forum?id=hB2HIO39r8G", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "64;106;78;64", "wc_summary_review": "40;222;23;29", "wc_main_review": "161;368;120;189", "wc_review": "265;696;221;282", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "350;1452;793;1441", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.0, 17.146428199482248 ], "wc_summary_review_avg": [ 78.5, 83.07376240426336 ], "wc_main_review_avg": [ 209.5, 94.74307362546351 ], "wc_review_avg": [ 366.0, 191.82153163813493 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1009.0, 464.7068968715657 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7119426962869259029&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "hC474P6AqN-", "title": "Unifying Categorical Models by Explicit Disentanglement of the Labels' Generative Factors", "track": "main", "status": "Reject", "tldr": "", "abstract": "In most machine learning tasks, the datasets are mainly annotated by categorical labels. For example, in emotion recognition, most datasets rely only on categorical labels, such as ``happy'' and ``sad''. Usually, different datasets use different labelling systems (e.g., different numbers of categories and different names), even when describing the same data attributes. 
As a consequence, only a small subset of all the available datasets can be used for any supervised learning task, since the labelling systems used in the training data are not compatible with each other.\nIn this paper, we propose a \\emph{multi-type continuous disentanglement variational autoencoder} to address this problem by identifying and disentangling the true dimensional generative factors that determine each categorical label. By doing so, it is possible to merge multiple datasets based on different categorical models by projecting the data points into a unified latent space.\nThe experiments performed on synthetic datasets show a perfect correlation between the disentangled latent values and the true generative factors. Also, by observing the displacement of each label's explicit distributions, we noticed that the encoded space is a simple affine transformation of the generative factors' space. As the latent structure can be autonomously learnt by the model, and each label can be explicitly decomposed into its generative factors, this framework is very promising for further exploring explainability in new and existing neural network architectures.", "keywords": "Disentanglement;explainability;latent representation", "primary_area": "", "supplementary_material": "", "author": "Luca Pinchetti;Lei Sha;Thomas Lukasiewicz", "authorids": "~Luca_Pinchetti1;~Lei_Sha1;~Thomas_Lukasiewicz2", "gender": "M;M;", "homepage": ";https://shalei120.github.io;https://www.cs.ox.ac.uk/people/thomas.lukasiewicz/", "dblp": ";93/3906;l/ThomasLukasiewicz", "google_scholar": ";https://scholar.google.com.hk/citations?user=EbZ_P6gAAAAJ;arjucpEAAAAJ", "orcid": ";;", "linkedin": "luca-pinchetti-414230222/;;", "or_profile": "~Luca_Pinchetti1;~Lei_Sha1;~Thomas_Lukasiewicz2", "aff": "Department of Computer Science, University of Oxford;Department of Computer Science, University of Oxford;Department of Computer Science, University of Oxford", "aff_domain": "cs.ox.ac.uk;cs.ox.ac.uk;cs.ox.ac.uk", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@misc{\npinchetti2022unifying,\ntitle={Unifying Categorical Models by Explicit Disentanglement of the Labels' Generative Factors},\nauthor={Luca Pinchetti and Lei Sha and Thomas Lukasiewicz},\nyear={2022},\nurl={https://openreview.net/forum?id=hC474P6AqN-}\n}", "github": "", "project": "", "reviewers": "ugco;w1er;4rCs;RNFG", "site": "https://openreview.net/forum?id=hC474P6AqN-", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "2;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "114;105;49;82", "wc_summary_review": "26;34;36;14", "wc_main_review": "122;703;149;107", "wc_review": "262;842;234;203", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "594;276;250;479", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 87.5, 25.104780421266383 ], "wc_summary_review_avg": [ 27.5, 8.645808232895291 ], "wc_main_review_avg": [ 270.25, 250.30119356487296 ], "wc_review_avg": [ 385.25, 264.5291808099817 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 399.75, 142.962189057107 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 
-0.17407765595569782, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kcD8s_1m-Y4J:scholar.google.com/&scioq=Unifying+Categorical+Models+by+Explicit+Disentanglement+of+the+Labels%27+Generative+Factors&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "hDQ-dYA8vB4", "title": "Towards Human-Understandable Visual Explanations: Human Imperceptible Cues Can Better Be Removed", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Explainable AI (XAI) methods focus on explaining what a neural network has learned - in other words, identifying the features that are the most influential to the prediction. In this paper, we call them \"distinguishing features\". However, whether a human can make sense of the generated explanation also depends on the perceptibility of these features to humans. To make sure an explanation is human-understandable, we argue that the capabilities of humans, constrained by the Human Visual System (HVS) and psychophysics, need to be taken into account. We propose the human perceptibility principle for XAI, stating that, to generate human-understandable explanations, neural networks should be steered towards focusing on human-understandable cues during training. We conduct a case study regarding the classification of real vs. fake face images, where many of the distinguishing features picked up by standard neural networks turn out not to be perceptible to humans. By applying the proposed principle, a neural network with human-understandable explanations is trained which, in a survey, is shown to better align with human intuition. This is likely to make the AI more trustworthy and open the door to humans learning from machines. 
In the case study, we specifically investigate and analyze the behaviour of the human-imperceptible high spatial frequency features in neural networks and XAI methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaili Wang;Jose Oramas;Tinne Tuytelaars", "authorids": "~Kaili_Wang1;~Jose_Oramas1;~Tinne_Tuytelaars1", "gender": "M;M;", "homepage": ";http://idlab.uantwerpen.be/~joramasmogrovejo;", "dblp": ";47/9735;", "google_scholar": ";FurBYlUAAAAJ;", "orcid": ";0000-0002-8607-5067;", "linkedin": ";https://linkedin.com/in/jos%C3%A9-oramas-m-3183501b;", "or_profile": "~Kaili_Wang1;~Jose_Oramas1;~Tinne_Tuytelaars1", "aff": ";University of Antwerp;", "aff_domain": ";uantwerpen.be;", "position": ";Associate Professor;", "bibtex": "@misc{\nwang2022towards,\ntitle={Towards Human-Understandable Visual Explanations: Human Imperceptible Cues Can Better Be Removed},\nauthor={Kaili Wang and Jose Oramas and Tinne Tuytelaars},\nyear={2022},\nurl={https://openreview.net/forum?id=hDQ-dYA8vB4}\n}", "github": "", "project": "", "reviewers": "EvDj;BjyD;W8eC;XCg5", "site": "https://openreview.net/forum?id=hDQ-dYA8vB4", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;3", "correctness": "2;2;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "69;111;48;43", "wc_summary_review": "44;57;49;19", "wc_main_review": "587;1001;210;286", "wc_review": "700;1169;307;348", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.75, 26.808347580557815 ], "wc_summary_review_avg": [ 42.25, 14.201672436723781 ], "wc_main_review_avg": [ 521.0, 310.9268402695399 ], "wc_review_avg": [ 631.0, 346.14664522424596 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EyKHs9m0Yk4J:scholar.google.com/&scioq=Towards+Human-Understandable+Visual+Explanations:+Human+Imperceptible+Cues+Can+Better+Be+Removed&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Antwerp", "aff_unique_dep": "", "aff_unique_url": "https://www.uantwerp.be", "aff_unique_abbr": "UA", "aff_country_unique_index": "0", "aff_country_unique": "Belgium" }, { "id": "hEiwVblq4P", "title": "Proper Straight-Through Estimator: Breaking symmetry promotes convergence to true minimum", "track": "main", "status": "Reject", "tldr": "", "abstract": "In a quantized network, the gradient either vanishes or diverges. The network thus cannot be trained by standard back-propagation, so an alternative approach called the Straight-Through Estimator (STE), which replaces part of the gradient with a simple differentiable function, is used. While STE is known to work well empirically for training quantized networks, it has not been established theoretically. A recent study by Yin et al. (2019) has provided theoretical support for STE. 
However, its justification is still limited to a one-hidden-layer network with binary activation, where the input data are generated by a Gaussian and the true labels are output by a teacher network with the same binary architecture. In this paper, we discuss the effectiveness of STEs in more general situations, without assumptions on the shape of the input distribution or on the labels. By considering the scale symmetry of the network and specific properties of the STEs, we find that the STE with clipped ReLU is superior to the STEs with the identity function and vanilla ReLU. The clipped ReLU STE, which breaks the scale symmetry, may pick up one of the local minima that are degenerate in scale, while the identity STE and vanilla ReLU STE, which keep the scale symmetry, may not. To confirm this observation, we further present an analysis of a simple misspecified model as an example. We find that all the stationary points coincide with the vanishing points of the cReLU STE gradient, while some of them do not coincide with the vanishing points of the identity and ReLU STEs.", "keywords": "quantization;binary network;low bit network;Straight through estimator;STE", "primary_area": "", "supplementary_material": "", "author": "Shinya Gongyo;Kohta Ishikawa", "authorids": "~Shinya_Gongyo1;~Kohta_Ishikawa1", "gender": "M;M", "homepage": ";", "dblp": "393/7106;157/8482", "google_scholar": ";", "orcid": "0000-0002-7871-2986;", "linkedin": ";", "or_profile": "~Shinya_Gongyo1;~Kohta_Ishikawa1", "aff": "DENSO IT Laboratory;Denso IT Laboratory, Inc.", "aff_domain": "core.d-itlab.co.jp;d-itlab.co.jp", "position": "Researcher;Researcher", "bibtex": "@misc{\ngongyo2022proper,\ntitle={Proper Straight-Through Estimator: Breaking symmetry promotes convergence to true minimum},\nauthor={Shinya Gongyo and Kohta Ishikawa},\nyear={2022},\nurl={https://openreview.net/forum?id=hEiwVblq4P}\n}", "github": "", "project": "", "reviewers": "1Ehv;tphS;W5Me;Tac4", "site": "https://openreview.net/forum?id=hEiwVblq4P", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;3", "correctness": "3;4;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;0;0;0", "wc_summary_paper": "80;128;116;66", "wc_summary_review": "49;52;39;64", "wc_main_review": "365;316;480;416", "wc_review": "494;496;635;546", "wc_reply_reviewers": "0;124;0;0", "wc_reply_authors": "577;583;425;875", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.5, 25.352514668174436 ], "wc_summary_review_avg": [ 51.0, 8.916277250063503 ], "wc_main_review_avg": [ 394.25, 60.83738570977553 ], "wc_review_avg": [ 542.75, 57.18992481198065 ], "wc_reply_reviewers_avg": [ 31.0, 53.693575034635195 ], "wc_reply_authors_avg": [ 615.0, 162.91715686200763 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": -0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PaOs0_NsnB4J:scholar.google.com/&scioq=Proper+Straight-Through+Estimator:+Breaking+symmetry+promotes+convergence+to+true+minimum&hl=en&as_sdt=0,33", "gs_version_total": 0, 
"aff_unique_index": "0;1", "aff_unique_norm": "DENSO Corporation;Denso IT Laboratory, Inc.", "aff_unique_dep": "IT Laboratory;", "aff_unique_url": "https://www.denso.com;https://www.denso.com", "aff_unique_abbr": "DENSO;Denso IT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Discovering Invariant Rationales for Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6555", "id": "hGXij5rfiHw", "poster": "", "openreview": "https://openreview.net/forum?id=hGXij5rfiHw", "slides": "https://iclr.cc/virtual/2022/poster/6555", "video": "https://iclr.cc/virtual/2022/poster/6555", "author_site": "Ying-Xin Wu, Xiang Wang, An Zhang, Xiangnan He, Tat-Seng Chua", "tldr": "", "abstract": "Intrinsic interpretability of graph neural networks (GNNs) is to find a small subset of the input graph's features --- rationale --- which guides the model prediction. Unfortunately, the leading rationalization models often rely on data biases, especially shortcut features, to compose rationales and make predictions without probing the critical and causal patterns. Moreover, such data biases easily change outside the training distribution. As a result, these models suffer from a huge drop in interpretability and predictive performance on out-of-distribution data. In this work, we propose a new strategy of discovering invariant rationale (DIR) to construct intrinsically interpretable GNNs. It conducts interventions on the training distribution to create multiple interventional distributions. Then it approaches the causal rationales that are invariant across different distributions while filtering out the spurious patterns that are unstable. Experiments on both synthetic and real-world datasets validate the superiority of our DIR in terms of interpretability and generalization ability on graph classification over the leading baselines. 
Code and datasets are available at https://github.com/Wuyxin/DIR-GNN.", "keywords": "Interpretability;Graph Neural Networks;Causal Discovery;Invariant Learning", "primary_area": "", "supplementary_material": "", "author": "Yingxin Wu;Xiang Wang;An Zhang;Xiangnan He;Tat-Seng Chua", "authorids": "~Yingxin_Wu1;~Xiang_Wang6;~An_Zhang2;~Xiangnan_He1;~Tat-Seng_Chua2", "gender": "F;M;M;F;M", "homepage": "https://cs.stanford.edu/~shirwu;https://github.com/xiangwang1223;http://staff.ustc.edu.cn/~hexn;https://github.com/anzhang314;http://www.comp.nus.edu.sg/~chuats/", "dblp": "79/4173-2;31/2864-10;59/1007;78/5581-3;", "google_scholar": "r2cVEucAAAAJ;https://scholar.google.com.sg/citations?user=HdhaQB0AAAAJ;https://scholar.google.com.sg/citations?user=X45Go24AAAAJ;https://scholar.google.com.sg/citations?user=BcX7GJcAAAAJ;https://scholar.google.com.tw/citations?user=Z9DWCBEAAAAJ", "orcid": ";0000-0002-6148-6329;0000-0001-8472-7992;;0000-0001-6097-7807", "linkedin": ";;;;", "or_profile": "~Yingxin_Wu1;~Xiang_Wang6;~Xiangnan_He1;~AN_ZHANG1;~Tat-seng_Chua1", "aff": "University of Science and Technology of China;National University of Singapore;University of Science and Technology of China;National University of Singapore;National University of Singapore", "aff_domain": "ustc.edu.cn;nus.edu.sg;ustc.edu.cn;nus.edu.sg;nus.edu.sg", "position": "Undergrad student;Postdoc;Professor;Postdoc;Full Professor", "bibtex": "@inproceedings{\nwu2022discovering,\ntitle={Discovering Invariant Rationales for Graph Neural Networks},\nauthor={Yingxin Wu and Xiang Wang and An Zhang and Xiangnan He and Tat-Seng Chua},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hGXij5rfiHw}\n}", "github": "", "project": "", "reviewers": "F9KM;derN;qBQC", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;3;4", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "68;136;53", "wc_summary_review": "31;45;32", "wc_main_review": "493;127;151", "wc_review": "592;308;236", "wc_reply_reviewers": "168;99;0", "wc_reply_authors": "1997;557;94", "reply_reviewers": "2;1;0", "reply_authors": "4;2;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 85.66666666666667, 36.11401697709938 ], "wc_summary_review_avg": [ 36.0, 6.377042156569663 ], "wc_main_review_avg": [ 257.0, 167.1645895517349 ], "wc_review_avg": [ 378.6666666666667, 153.68654968980064 ], "wc_reply_reviewers_avg": [ 89.0, 68.9492567037528 ], "wc_reply_authors_avg": [ 882.6666666666666, 810.3070748522165 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 332, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6763314222815951542&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=hGXij5rfiHw", "email": "ustc.edu.cn;nus.edu.sg;ustc.edu.cn;nus.edu.sg;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;1;0;1;1", "aff_unique_norm": "University of Science and Technology of China;National 
University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "USTC;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "China;Singapore" }, { "id": "hHmtmT58pSL", "title": "Don\u2019t throw away that linear head: Few-shot protein fitness prediction with generative models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Predicting the fitness, i.e. functional value, of a protein sequence is an important and challenging task in biology, particularly due to the scarcity of assay-labeled data. Traditional approaches utilize transfer learning from evolutionary data, yet discard useful information from a generative model\u2019s learned probability distribution. We propose generative fitness fine-tuning, termed gf-tuning, to utilize the generative model\u2019s log probabilities as logits for a pairwise ranking loss---allowing for the full distribution learned in unsupervised training to be repurposed for fine-tuning on assay-labeled fitness data. We demonstrate that gf-tuning achieves better performance than existing baselines across a variety of few-shot fitness prediction settings, including both low homology and highly epistatic systems as well as generalizing from single to multiple mutations. Generative fitness finetuning offers an effective strategy for few-shot fitness prediction which could enable advances to better understand and engineer proteins.", "keywords": "language modeling;proteins;fitness prediction", "primary_area": "", "supplementary_material": "", "author": "Ben Krause;Nikhil Naik;Wenhao Liu;Ali Madani", "authorids": "~Ben_Krause1;~Nikhil_Naik1;~Wenhao_Liu1;~Ali_Madani1", "gender": "M;M;;", "homepage": "https://benkrause.github.io/;mit.edu/~naik;;https://madani.ai", "dblp": ";43/7797;;", "google_scholar": "ONNif60AAAAJ;M1IgIyMAAAAJ;;ZEDT0-cAAAAJ", "orcid": ";;;0000-0002-3092-1295", "linkedin": ";;;madani-ali", "or_profile": "~Ben_Krause1;~Nikhil_Naik1;~Wenhao_Liu1;~Ali_Madani1", "aff": "SalesForce.com;MIT;;SalesForce.com", "aff_domain": "salesforce.com; ;;salesforce.com", "position": "Research Scientist;Graduate Student;;Research Scientist", "bibtex": "@misc{\nkrause2022dont,\ntitle={Don{\\textquoteright}t throw away that linear head: Few-shot protein fitness prediction with generative models},\nauthor={Ben Krause and Nikhil Naik and Wenhao Liu and Ali Madani},\nyear={2022},\nurl={https://openreview.net/forum?id=hHmtmT58pSL}\n}", "github": "", "project": "", "reviewers": "NDc4;JkFW;gSPV;FTpL", "site": "https://openreview.net/forum?id=hHmtmT58pSL", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;5;3", "correctness": "4;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "79;89;52;119", "wc_summary_review": "112;55;341;36", "wc_main_review": "434;557;59;390", "wc_review": "625;701;452;545", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.75, 23.962209831315644 ], "wc_summary_review_avg": [ 136.0, 121.616199578839 ], "wc_main_review_avg": [ 360.0, 184.24575978838698 ], "wc_review_avg": [ 580.75, 92.56450453602612 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9048661550251085492&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Salesforce;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.salesforce.com;https://web.mit.edu", "aff_unique_abbr": "Salesforce;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "hJk11f5yfy", "title": "Encoding Hierarchical Information in Neural Networks Helps in Subpopulation Shift", "track": "main", "status": "Reject", "tldr": "", "abstract": "Over the past decade, deep neural networks have proven to be adept in image classification tasks, often even surpassing humans in terms of accuracy. However, standard neural networks often fail to understand the concept of hierarchical structures and dependencies among different classes for vision related tasks. Humans on the other hand, seem to learn categories conceptually, progressively growing from understanding high-level concepts down to granular levels of categories. One of the issues arising from the inability of neural networks to encode such dependencies within its learned structure is that of subpopulation shift -- where models are queried with novel unseen classes taken from a shifted population of the training set categories. Since the neural network treats each class as independent from all others, it struggles to categorize shifting populations that are dependent at higher levels of the hierarchy. In this work, we study the aforementioned problems through the lens of a novel conditional supervised training framework. We tackle subpopulation shift by a structured learning procedure that incorporates hierarchical information conditionally through labels. Furthermore, we introduce a notion of graphical distance to model the catastrophic effect of mispredictions. 
We show that learning in this structured hierarchical manner results in networks that are more robust against subpopulation shifts, with an improvement of around 2% in terms of accuracy and around 8.5% in terms of graphical distance over standard models on subpopulation shift benchmarks.", "keywords": "Subpopulation shift;Hierarchical;Hierarchical Networks;Conditional Training;Domain Adaptation", "primary_area": "", "supplementary_material": "/attachment/2600dc3f898cb0e0d48381fe7b1ddd5e9d429a77.zip", "author": "Amitangshu Mukherjee;Isha Garg;Kaushik Roy", "authorids": "~Amitangshu_Mukherjee1;~Isha_Garg1;~Kaushik_Roy1", "gender": "M;F;M", "homepage": ";;https://engineering.purdue.edu/NRL/Group", "dblp": "239/5924;;r/KaushikRoy", "google_scholar": "aK1eTNkAAAAJ;;to4P8KgAAAAJ", "orcid": "0000-0003-2704-3580;;", "linkedin": "amitangshu-mukherjee-146837129;;", "or_profile": "~Amitangshu_Mukherjee1;~Isha_Garg1;~Kaushik_Roy1", "aff": "Purdue University;Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu;purdue.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nmukherjee2022encoding,\ntitle={Encoding Hierarchical Information in Neural Networks Helps in Subpopulation Shift},\nauthor={Amitangshu Mukherjee and Isha Garg and Kaushik Roy},\nyear={2022},\nurl={https://openreview.net/forum?id=hJk11f5yfy}\n}", "github": "", "project": "", "reviewers": "TcMm;1XvT;GFoK", "site": "https://openreview.net/forum?id=hJk11f5yfy", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;2", "wc_summary_paper": "90;81;54", "wc_summary_review": "32;23;42", "wc_main_review": "156;568;298", "wc_review": "278;672;394", "wc_reply_reviewers": "37;162;0", "wc_reply_authors": "266;109;495", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 75.0, 15.297058540778355 ], "wc_summary_review_avg": [ 32.333333333333336, 7.760297817881877 ], "wc_main_review_avg": [ 340.6666666666667, 170.8826758009392 ], "wc_review_avg": [ 448.0, 165.31989192673296 ], "wc_reply_reviewers_avg": [ 66.33333333333333, 69.31249686905112 ], "wc_reply_authors_avg": [ 290.0, 158.4950051789225 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4386135728844274241&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "hLZHO-wzuqM", "title": "Benchmarking Algorithms from Machine Learning for Low-Budget Black-Box Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Machine learning has invaded various domains of computer science, including black-box optimization. 
Recent research is particularly concerned with Bayesian Optimization and/or Monte Carlo Tree Search.\nHowever, the experiments are usually performed on rather small benchmarks and there are visible issues in the experimental setup, such as poor initialization of baselines, overfitting by specifying hyperparameters specifically for each test function, and low statistical significance. In addition, the interface is sometimes very problem-specific and has more impact on the results than the algorithm itself.\nWe compare several black-box optimization tools from the machine learning world and benchmark them on the classical BBOB benchmark, which is well known in the black-box optimization field, and on Direct Policy Search for OpenAI Gym. In particular, the benchmarks in this work include randomization of the optimum: BBOB considers 15 random instances per test function and dimension, i.e., 24 functions $\\times$ 6 dimensionalities $\\times$ 15 random instances $=2160$ cases. For OpenAI Gym, we consider tiny and larger neural networks, on a total number of 13 problems $\\times$ 8 budgets $\\times$ 10 repetitions $=1040$ and 18 problems $\\times$ 8 budgets $\\times$ 10 repetitions $=1440$ instances, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elena Raponi;Nathana\u00ebl Carraz Rakotonirina;J\u00e9r\u00e9my Rapin;Olivier Teytaud;Carola Doerr", "authorids": "~Elena_Raponi1;~Nathana\u00ebl_Carraz_Rakotonirina1;~J\u00e9r\u00e9my_Rapin1;~Olivier_Teytaud2;~Carola_Doerr1", "gender": "F;M;M;;F", "homepage": "https://webia.lip6.fr/~raponi/;https://ncarraz.github.io;;;http://www-ia.lip6.fr/~doerr/", "dblp": ";256/9877;133/8584;;https://dblp.uni-trier.de/pid/62/8086", "google_scholar": "https://scholar.google.fr/citations?user=puWIVC4AAAAJ;https://scholar.google.com/citations?hl=en;tQ8DdN8AAAAJ;;CU-V1sEAAAAJ", "orcid": "0000-0001-6841-7409;0000-0003-0181-2644;;;0000-0002-4981-3227", "linkedin": "elena-raponi-4b1765138/;;j%C3%A9r%C3%A9my-rapin-13851613/;;", "or_profile": "~Elena_Raponi1;~Nathana\u00ebl_Carraz_Rakotonirina1;~J\u00e9r\u00e9my_Rapin1;~Olivier_Teytaud2;~Carola_Doerr1", "aff": "LIP6;Universitat Pompeu Fabra;Meta Facebook;;LIP6, CNRS, Sorbonne Universit\u00e9", "aff_domain": "lip6.fr;upf.edu;meta.com;;lip6.fr", "position": "Postdoc;PhD student;Research Engineer;;CNRS researcher at Sorbonne Universit\u00e9", "bibtex": "@misc{\nraponi2022benchmarking,\ntitle={Benchmarking Algorithms from Machine Learning for Low-Budget Black-Box Optimization},\nauthor={Elena Raponi and Nathana{\\\"e}l Carraz Rakotonirina and J{\\'e}r{\\'e}my Rapin and Olivier Teytaud and Carola Doerr},\nyear={2022},\nurl={https://openreview.net/forum?id=hLZHO-wzuqM}\n}", "github": "", "project": "", "reviewers": "qvNk;L8jK;ng8j;eb2B", "site": "https://openreview.net/forum?id=hLZHO-wzuqM", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;3;4", "correctness": "3;3;2;4", "technical_novelty": "2;1;1;1", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "54;51;45;95", "wc_summary_review": "80;56;69;43", "wc_main_review": "430;218;364;332", "wc_review": "564;325;478;470", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 61.25, 
19.753164303473 ], "wc_summary_review_avg": [ 62.0, 13.874436925511608 ], "wc_main_review_avg": [ 336.0, 76.74633541739958 ], "wc_review_avg": [ 459.25, 85.82358358866169 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dTs8YmN1tJAJ:scholar.google.com/&scioq=Benchmarking+Algorithms+from+Machine+Learning+for+Low-Budget+Black-Box+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Laboratoire d'Informatique de Paris 6;Universitat Pompeu Fabra;Meta;Sorbonne Universit\u00e9", "aff_unique_dep": ";;Meta Platforms, Inc.;LIP6", "aff_unique_url": "http://www.lip6.fr;https://www.upf.edu/;https://meta.com;https://www.sorbonne-universite.fr", "aff_unique_abbr": "LIP6;UPF;Meta;SU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "France;Spain;United States" }, { "id": "hNgDQPe8Uj", "title": "Learning Graph Augmentations to Learn Graph Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Devising augmentations for graph contrastive learning is challenging due to the irregular structure of graphs, and the drastic distribution shifts and nonequivalent feature spaces across datasets. To address this, we propose LG2AR, Learning Graph Augmentations to Learn Graph Representations, which is an end-to-end automatic graph augmentation framework that helps encoders learn generalizable representations on both node and graph levels. LG2AR consists of a probabilistic policy that learns a distribution over augmentations and a set of probabilistic augmentation heads that learn distributions over augmentation parameters.
Under linear evaluation protocol, LG2AR achieves state-of-the-art results on 8 out of 8 graph classification tasks and 6 out of 7 node classification benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/597593e9979132d7c9566e9f3f1ccef242a09f43.zip", "author": "Kaveh Hassani;Amir Hosein Khasahmadi", "authorids": "~Kaveh_Hassani1;~Amir_Hosein_Khasahmadi1", "gender": "M;", "homepage": "https://kavehhassani.github.io/;", "dblp": "131/9880;259/1508", "google_scholar": "https://scholar.google.ca/citations?user=1CiEWwsAAAAJ;cFpYRhkAAAAJ", "orcid": "0000-0001-9162-9442;", "linkedin": "https://ca.linkedin.com/in/kavehhassani;amir-khas/", "or_profile": "~Kaveh_Hassani1;~Amir_Hosein_Khasahmadi2", "aff": "Autodesk Inc;Toronto University", "aff_domain": "autodesk.com;utoronto.ca", "position": "Principal AI Research Scientist;MS student", "bibtex": "@misc{\nhassani2022learning,\ntitle={Learning Graph Augmentations to Learn Graph Representations},\nauthor={Kaveh Hassani and Amir Hosein Khasahmadi},\nyear={2022},\nurl={https://openreview.net/forum?id=hNgDQPe8Uj}\n}", "github": "", "project": "", "reviewers": "EPfe;VHWB;nuiz", "site": "https://openreview.net/forum?id=hNgDQPe8Uj", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "48;42;57", "wc_summary_review": "30;12;70", "wc_main_review": "304;230;689", "wc_review": "382;284;816", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 49.0, 6.164414002968976 ], "wc_summary_review_avg": [ 37.333333333333336, 24.239545283597124 ], "wc_main_review_avg": [ 407.6666666666667, 201.21354052073357 ], "wc_review_avg": [ 494.0, 231.17670009468225 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10364604276553775514&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Autodesk;University of Toronto", "aff_unique_dep": ";", "aff_unique_url": "https://www.autodesk.com;https://www.utoronto.ca", "aff_unique_abbr": "Autodesk;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Canada" }, { "id": "hOaYDFpQk3g", "title": "Taking ROCKET on an efficiency mission: A distributed solution for fast and accurate multivariate time series classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Nowadays, with the rising number of sensors in sectors such as healthcare and industry, the problem of multivariate time series classification (MTSC) is getting increasingly relevant and is a prime target for machine and deep learning solutions. 
Their expanding adoption in real-world environments is causing a shift in focus from the pursuit of ever higher prediction accuracy with complex models towards practical, deployable solutions that balance accuracy and parameters such as prediction speed. An MTSC solution that has attracted attention recently is ROCKET, based on random convolutional kernels, both because of its very fast training process and its state-of-the-art accuracy. However, the large number of features it utilizes may be detrimental to inference time. Examining its theoretical background and limitations enables us to address potential drawbacks and present LightWaveS: a distributed solution for accurate MTSC, which is fast both during training and inference. Specifically, utilizing a wavelet scattering transformation of the time series and distributed feature selection, we manage to create a solution which employs just 2.5% of the ROCKET features, while achieving accuracy comparable to recent deep learning solutions. LightWaveS also scales well with more nodes and large numbers of channels. In addition, it can give interpretability into the nature of an MTSC problem and allows for tuning based on expert opinion. We present three versions of our algorithm and their results on training time, accuracy, inference speedup and scalability. We show that we achieve speedup ranging from 8x to 30x compared to ROCKET during inference on an edge device, on datasets with comparable accuracy.", "keywords": "distribution;time series;classification;multivariate;wavelet;scattering;feature selection;scaling", "primary_area": "", "supplementary_material": "/attachment/6cd6f2084bb06304b09135e37116172db6314e32.zip", "author": "Leonardos Pantiskas;Kees Verstoep;Mark Hoogendoorn;Henri Bal", "authorids": "~Leonardos_Pantiskas1;~Kees_Verstoep1;~Mark_Hoogendoorn2;h.e.bal@vu.nl", "gender": ";M;M;", "homepage": ";https://www.cs.vu.nl/~versto;http://www.cs.vu.nl/~mhoogen;", "dblp": ";;19/1103.html;", "google_scholar": "ZcNIVe8AAAAJ;;3s4lqHkAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Leonardos_Pantiskas1;~Kees_Verstoep1;~Mark_Hoogendoorn2;h.e.bal@vu.nl", "aff": "Vrije Universiteit Amsterdam;Vrije Universiteit Amsterdam;VU University Amsterdam;", "aff_domain": "vu.nl;cs.vu.nl;vu.nl;", "position": "PhD student;Researcher;Full Professor;", "bibtex": "@misc{\npantiskas2022taking,\ntitle={Taking {ROCKET} on an efficiency mission: A distributed solution for fast and accurate multivariate time series classification},\nauthor={Leonardos Pantiskas and Kees Verstoep and Mark Hoogendoorn and Henri Bal},\nyear={2022},\nurl={https://openreview.net/forum?id=hOaYDFpQk3g}\n}", "github": "", "project": "", "reviewers": "DUKD;uVpJ;zJvV;Vj84;u4XS;PviB;ooqH", "site": "https://openreview.net/forum?id=hOaYDFpQk3g", "pdf_size": 0, "recommendation": "3;3;3;5;5;6;6", "confidence": "4;3;4;4;3;4;4", "correctness": "2;2;2;2;3;3;3", "technical_novelty": "2;2;2;3;2;4;2", "empirical_novelty": "2;2;2;3;2;3;2", "wc_summary_paper": "48;69;53;23;151;62;75", "wc_summary_review": "46;25;80;16;203;36;5", "wc_main_review": "382;212;662;88;260;132;196", "wc_review": "476;306;795;127;614;230;276", "wc_reply_reviewers": "184;16;50;0;0;0;0", "wc_reply_authors": "352;251;398;209;298;64;65", "reply_reviewers": "1;1;1;0;0;0;0", "reply_authors": "1;1;1;1;1;1;1", "recommendation_avg": [ 4.428571428571429, 1.2936264483053452 ], "confidence_avg": [ 3.7142857142857144, 0.4517539514526256 ], "correctness_avg": [ 2.4285714285714284, 0.4948716593053935 ], "technical_novelty_avg": [
2.4285714285714284, 0.7284313590846836 ], "empirical_novelty_avg": [ 2.2857142857142856, 0.45175395145262565 ], "wc_summary_paper_avg": [ 68.71428571428571, 37.07411220674565 ], "wc_summary_review_avg": [ 58.714285714285715, 63.01408977525646 ], "wc_main_review_avg": [ 276.0, 180.10473143622374 ], "wc_review_avg": [ 403.42857142857144, 218.81359133998518 ], "wc_reply_reviewers_avg": [ 35.714285714285715, 62.88700589314204 ], "wc_reply_authors_avg": [ 233.85714285714286, 121.47360509466192 ], "reply_reviewers_avg": [ 0.42857142857142855, 0.4948716593053935 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.20952908873087348, "corr_recommendation_correctness": 0.8288497269823396, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9704257849002918450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Vrije Universiteit Amsterdam;VU University Amsterdam", "aff_unique_dep": ";", "aff_unique_url": "https://www.vu.nl;https://www.vu.nl", "aff_unique_abbr": "VU Amsterdam;VU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amsterdam", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "id": "hP-SILoczR", "title": "NAS-Bench-Zero: A Large Scale Dataset for Understanding Zero-Shot Neural Architecture Search", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Zero-shot Neural Architecture Search (ZS-NAS) is a recently developed low-cost NAS framework which identifies top-performer neural architectures from a large candidate pool without training their parameters. Despite its popularity in the recent NAS literature, the effectiveness of ZS-NAS has not been comprehensively understood. Previous works analyze ZS-NAS methods on NAS benchmark datasets such as NAS-Bench-101/201/301, which were initially designed for learning network topology with irregular connections. However, most modern state-of-the-art networks as well as popular classical ones are designed in more conventional, well-established search spaces such as ResNet (RS) and MobileNet (MB) search space. This imposes a significant gap between the benchmark dataset and real-world practice, hindering a deeper understanding of ZS-NAS. In this work, we aim to bridge the gap systematically. First, we collect a novel large-scale dataset termed NAS-Bench-Zero for benchmarking and understanding popular ZS-NAS methods in the conventional RS/MB search spaces. Then the characteristics of these ZS-NAS methods are extensively examined from various aspects. Notably, we find that: 1) the performance of ZS-NAS on NAS-Bench-101/201/301 cannot transfer to RS/MB search spaces; 2) a proxy with a higher ranking correlation score may actually perform worse in constrained NAS; 3) existing zero-shot proxies cannot outperform naive proxies such as FLOPs/params in RS/MB search spaces; 4) the best zero-shot proxies and FLOPs/params complement each other. Based on these new discoveries, we propose i) a novel hybrid zero-shot proxy which outperforms existing ones by a large margin and is transferable among popular search spaces; ii) a new index for better measuring the true performance of ZS-NAS proxies in constrained NAS.
Source code and the NAS-Bench-Zero dataset will be released after publication.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanlin Chen;Ming Lin;Xiuyu Sun;Hao Li", "authorids": "~Hanlin_Chen2;~Ming_Lin4;~Xiuyu_Sun1;~Hao_Li16", "gender": "M;M;M;M", "homepage": "https://hlinchen.github.io/;https://minglin-home.github.io/;https://sites.google.com/view/sunxiuyu/home;", "dblp": ";;40/8845;17/5705-30", "google_scholar": "fBpYOzAAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;pHN-QIwAAAAJ", "orcid": "0000-0002-3323-8213;;0000-0002-7208-8078;", "linkedin": ";;;%E6%98%8A-%E6%9D%8E-392547a5/detail/recent-activity/", "or_profile": "~Hanlin_Chen2;~Ming_Lin4;~Xiuyu_Sun1;~Li_Hao1", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Algorithm Engineer;Staff Algorithm Engineer;Researcher", "bibtex": "@misc{\nchen2022nasbenchzero,\ntitle={{NAS}-Bench-Zero: A Large Scale Dataset for Understanding Zero-Shot Neural Architecture Search},\nauthor={Hanlin Chen and Ming Lin and Xiuyu Sun and Hao Li},\nyear={2022},\nurl={https://openreview.net/forum?id=hP-SILoczR}\n}", "github": "", "project": "", "reviewers": "LccQ;P23M;Vhtm;QgE1;kJvF", "site": "https://openreview.net/forum?id=hP-SILoczR", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "4;5;4;3;4", "correctness": "2;3;2;3;3", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;3;2;0;2", "wc_summary_paper": "24;23;59;71;60", "wc_summary_review": "47;20;29;62;34", "wc_main_review": "784;180;404;274;322", "wc_review": "855;223;492;407;416", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 1.8, 0.9797958971132712 ], "wc_summary_paper_avg": [ 47.4, 19.96597105076535 ], "wc_summary_review_avg": [ 38.4, 14.677874505527017 ], "wc_main_review_avg": [ 392.8, 208.60143815419875 ], "wc_review_avg": [ 478.6, 207.9582650437342 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3286465473037009984&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "hRVZd5g-z7", "title": "A Joint Subspace View to Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Motivated by the intuition that important image regions remain important across different layers and scales in a CNN, we propose in this paper a joint subspace view to convolutional filters across network layers. 
When we construct for each layer a filter subspace by decomposing convolutional filters over a small set of layer-specific filter atoms, we observe a low-rank structure within subspace coefficients across layers. The above observation matches widely-known cross-layer filter correlation and redundancy. Thus, we propose to jointly model filter subspace across different layers by enforcing cross-layer shared subspace coefficients. In other words, a CNN is now reduced to layers of filter atoms, typically a few hundred parameters per layer, with a common block of subspace coefficients shared across layers. We further show that such subspace coefficient sharing can be easily extended to other network sub-structures, from sharing across the entire network to sharing within filter groups in a layer. While significantly reducing the parameter redundancy of a wide range of network architectures, the proposed joint subspace view also preserves the expressiveness of CNNs, and brings many additional advantages, such as easy model adaptation and better interpretation. We support our findings with extensive empirical evidence. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ze Wang;Xiuyuan Cheng;Guillermo Sapiro;Qiang Qiu", "authorids": "~Ze_Wang3;~Xiuyuan_Cheng1;~Guillermo_Sapiro1;~Qiang_Qiu1", "gender": "M;;;", "homepage": ";;;https://web.ics.purdue.edu/~qqiu/", "dblp": ";79/9747;82/5175;97/360", "google_scholar": "80Jw_w8AAAAJ;I2gwdssAAAAJ;https://scholar.google.co.il/citations?user=ISRNX3gAAAAJ;jdLtt_YAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ze_Wang3;~Xiuyuan_Cheng1;~Guillermo_Sapiro1;~Qiang_Qiu1", "aff": "Purdue University;Duke University;Duke University;Purdue University", "aff_domain": "purdue.edu;duke.edu;duke.edu;purdue.edu", "position": "PhD student;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nwang2022a,\ntitle={A Joint Subspace View to Convolutional Neural Networks},\nauthor={Ze Wang and Xiuyuan Cheng and Guillermo Sapiro and Qiang Qiu},\nyear={2022},\nurl={https://openreview.net/forum?id=hRVZd5g-z7}\n}", "github": "", "project": "", "reviewers": "vUb9;r8DN;FvfC;FSGC;v8NG", "site": "https://openreview.net/forum?id=hRVZd5g-z7", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "5;3;4;3;5", "correctness": "4;3;3;4;3", "technical_novelty": "2;3;2;2;2", "empirical_novelty": "2;3;3;3;2", "wc_summary_paper": "30;80;76;108;62", "wc_summary_review": "14;29;34;101;57", "wc_main_review": "222;294;289;265;243", "wc_review": "266;403;399;474;362", "wc_reply_reviewers": "0;35;0;61;0", "wc_reply_authors": "360;216;590;634;375", "reply_reviewers": "0;1;0;1;0", "reply_authors": "1;1;1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.8944271909999159 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 71.2, 25.4432702300628 ], "wc_summary_review_avg": [ 47.0, 30.32490725459849 ], "wc_main_review_avg": [ 262.6, 27.28076245268816 ], "wc_review_avg": [ 380.8, 67.88637565815397 ], "wc_reply_reviewers_avg": [ 19.2, 24.9110417285187 ], "wc_reply_authors_avg": [ 435.0, 155.44259390527424 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:ZdHPp1JhbLwJ:scholar.google.com/&scioq=A+Joint+Subspace+View+to+Convolutional+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Purdue University;Duke University", "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www.duke.edu", "aff_unique_abbr": "Purdue;Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scaling Laws for Neural Machine Translation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6721", "id": "hR_SMu8cxCV", "poster": "", "openreview": "https://openreview.net/forum?id=hR_SMu8cxCV", "slides": "https://iclr.cc/virtual/2022/poster/6721", "video": "https://iclr.cc/virtual/2022/poster/6721", "author_site": "Behrooz Ghorbani, Orhan Firat, Markus Freitag, Ankur Bapna, Maxim Krikun, Xavier Garcia, Ciprian Chelba, Colin Cherry", "tldr": "", "abstract": "We present an empirical study of scaling properties of encoder-decoder Transformer models used in neural machine translation (NMT). We show that cross-entropy loss as a function of model size follows a certain scaling law. Specifically (i) We propose a formula which describes the scaling behavior of cross-entropy loss as a bivariate function of encoder and decoder size, and show that it gives accurate predictions under a variety of scaling approaches and languages; we show that the total number of parameters alone is not sufficient for such purposes. (ii) We observe different power law exponents when scaling the decoder vs scaling the encoder, and provide recommendations for optimal allocation of encoder/decoder capacity based on this observation. (iii) We also report that the scaling behavior of the model is acutely influenced by composition bias of the train/test sets, which we define as any deviation from naturally generated text (either via machine generated or human translated text). We observe that natural text on the target side enjoys scaling, which manifests as successful reduction of the cross-entropy loss. (iv) Finally, we investigate the relationship between the cross-entropy loss and the quality of the generated translations. We find two different behaviors, depending on the nature of the test data. For test sets which were originally translated from target language to source language, both loss and BLEU score improve as model size increases. In contrast, for test sets originally translated from source language to target language, the loss improves, but the BLEU score stops improving after a certain threshold. 
We release generated text from all models used in this study.", "keywords": "Scaling Laws;Neural Machine Translation;NMT;Model Scaling", "primary_area": "", "supplementary_material": "", "author": "Behrooz Ghorbani;Orhan Firat;Markus Freitag;Ankur Bapna;Maxim Krikun;Xavier Garcia;Ciprian Chelba;Colin Cherry", "authorids": "~Behrooz_Ghorbani1;~Orhan_Firat1;~Markus_Freitag2;~Ankur_Bapna1;~Maxim_Krikun1;~Xavier_Garcia1;~Ciprian_Chelba2;~Colin_Cherry1", "gender": ";M;M;M;;M;M;M", "homepage": ";;;;;https://research.google/people/author6342/;https://sites.google.com/site/colinacherry/;", "dblp": "162/0166;120/2225;57/8503;200/8008;05/1775;21/2502;99/6601;", "google_scholar": ";https://scholar.google.com.tr/citations?user=dLaR9lgAAAAJ;https://scholar.google.com/citations?hl=en;6hK9IZoAAAAJ;;Rtg5ZY8AAAAJ;TNr_OWMAAAAJ;Y2Hio6MAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;markus-freitag-7b17b4101/;ankur-bapna-053b1269;;ciprian-chelba-202467/;colincherry/;", "or_profile": "~Behrooz_Ghorbani1;~Orhan_Firat1;~Markus_Freitag2;~Ankur_Bapna1;~Maxim_Krikun1;~Ciprian_Chelba2;~Colin_Cherry1;~Xavier_Garcia-rojas1", "aff": "Google;Google;Google;Google;Google;Google;Google;", "aff_domain": "google.com;google.com;google.com;google.com;google.com;google.com;google.com;", "position": "Researcher;Research Scientist;Researcher;Software Engineer;Software Engineer;Research Scientist;Researcher;", "bibtex": "@inproceedings{\nghorbani2022scaling,\ntitle={Scaling Laws for Neural Machine Translation},\nauthor={Behrooz Ghorbani and Orhan Firat and Markus Freitag and Ankur Bapna and Maxim Krikun and Xavier Garcia and Ciprian Chelba and Colin Cherry},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hR_SMu8cxCV}\n}", "github": "", "project": "", "reviewers": "TAXy;q9c2;Dbxb;afSG", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "4;4;4;5", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "4;3;3;4", "wc_summary_paper": "76;72;59;43", "wc_summary_review": "32;40;36;11", "wc_main_review": "221;142;116;149", "wc_review": "329;254;211;203", "wc_reply_reviewers": "18;37;75;0", "wc_reply_authors": "573;244;302;109", "reply_reviewers": "1;1;2;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 62.5, 12.893796958227627 ], "wc_summary_review_avg": [ 29.75, 11.188722000300123 ], "wc_main_review_avg": [ 157.0, 38.942264957241505 ], "wc_review_avg": [ 249.25, 49.96186045375012 ], "wc_reply_reviewers_avg": [ 32.5, 27.807373122968663 ], "wc_reply_authors_avg": [ 307.0, 168.7853666642935 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2903038171120656886&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=hR_SMu8cxCV", "email": "google.com;google.com;google.com;google.com;google.com;google.com;google.com;", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": 
"0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Automatic Loss Function Search for Predict-Then-Optimize Problems with Strong Ranking Property", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6648", "id": "hSktDu-h94", "poster": "", "openreview": "https://openreview.net/forum?id=hSktDu-h94", "slides": "https://iclr.cc/virtual/2022/poster/6648", "video": "https://iclr.cc/virtual/2022/poster/6648", "author_site": "Boshi Wang, Jialin Yi, Hang Dong, Bo Qiao, Chuan Luo, Qingwei Lin", "tldr": "", "abstract": "Combinatorial optimization problems with parameters to be predicted from side information are commonly seen in a variety of problems during the paradigm shift from reactive decision making to proactive decision making. Due to the misalignment between the continuous prediction results and the discrete decisions in optimization problems, it is hard to achieve a satisfactory prediction result with the ordinary $l_2$ loss in the prediction phase. To properly connect the prediction loss with the optimization goal, in this paper we propose a total group preorder (TGP) loss and its differential version called approximated total group preorder (ATGP) loss for predict-then-optimize (PTO) problems with strong ranking property. These new losses are provably more robust than the usual $l_2$ loss in a linear regression setting and have great potential to extend to other settings. We also propose an automatic searching algorithm that adapts the ATGP loss to PTO problems with different combinatorial structures. Extensive experiments on the ranking problem, the knapsack problem, and the shortest path problem have demonstrated that our proposed method can achieve a significant performance compared to the other methods designed for PTO problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boshi Wang;Jialin Yi;Hang Dong;Bo Qiao;Chuan Luo;Qingwei Lin", "authorids": "wang.13930@buckeyemail.osu.edu;j.yi8@lse.ac.uk;~Hang_Dong3;~Bo_Qiao1;~Chuan_Luo1;~Qingwei_Lin1", "gender": ";;F;;;M", "homepage": ";;;;;https://www.microsoft.com/en-us/research/people/qlin/", "dblp": ";;;167/2640;;120/0743", "google_scholar": ";;xIjWJCoAAAAJ;_6ugrdYAAAAJ;;https://scholar.google.co.jp/citations?hl=zh-CN", "orcid": ";;;;;0000-0003-2559-2383", "linkedin": ";;;;;", "or_profile": "wang.13930@buckeyemail.osu.edu;j.yi8@lse.ac.uk;~Hang_Dong3;~Bo_Qiao1;~Chuan_Luo1;~Qingwei_Lin1", "aff": ";;Microsoft Research;Microsoft Research;;Microsoft Research", "aff_domain": ";;microsoft.com;microsoft.com;;microsoft.com", "position": ";;Researcher;RSDE;;Sr. 
Principal Researcher", "bibtex": "@inproceedings{\nwang2022automatic,\ntitle={Automatic Loss Function Search for Predict-Then-Optimize Problems with Strong Ranking Property},\nauthor={Boshi Wang and Jialin Yi and Hang Dong and Bo Qiao and Chuan Luo and Qingwei Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hSktDu-h94}\n}", "github": "", "project": "", "reviewers": "R87E;8c8S;AESs", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;4", "correctness": "4;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "336;48;149", "wc_summary_review": "66;58;122", "wc_main_review": "313;325;402", "wc_review": "715;431;673", "wc_reply_reviewers": "0;6;83", "wc_reply_authors": "712;1152;624", "reply_reviewers": "0;1;1", "reply_authors": "1;2;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 177.66666666666666, 119.31005359519746 ], "wc_summary_review_avg": [ 82.0, 28.472208672083497 ], "wc_main_review_avg": [ 346.6666666666667, 39.432079439066975 ], "wc_review_avg": [ 606.3333333333334, 125.15945385343007 ], "wc_reply_reviewers_avg": [ 29.666666666666668, 37.791827452800185 ], "wc_reply_authors_avg": [ 829.3333333333334, 230.9708976376798 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12736074295134859529&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=hSktDu-h94", "email": ";;microsoft.com;microsoft.com;;microsoft.com", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "hUr6K4D9f7P", "title": "Adversarial Weight Perturbation Improves Generalization in Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "There is growing theoretical and empirical evidence that flatter local minima tend to improve generalization. An efficient and effective technique for finding such minima is Adversarial Weight Perturbation (AWP). The main idea is to minimize the loss w.r.t. a bounded worst-case perturbation of the model parameters by (approximately) solving an associated min-max problem. Intuitively, we favor local minima with a small loss in a neighborhood around them. The benefits of AWP, and more generally the connections between flatness and generalization, have been extensively studied for i.i.d. data such as images. In this paper we initiate the first study of this phenomenon for graph data. Along the way, we identify a vanishing-gradient issue with all existing formulations of AWP and we propose Weighted Truncated AWP (WT-AWP) to alleviate this issue. 
We show that regularizing graph neural networks with WT-AWP consistently improves both natural and robust generalization across many different graph learning tasks and models.", "keywords": "Graph neural networks;Adversarial weight perturbation", "primary_area": "", "supplementary_material": "", "author": "Yihan Wu;Aleksandar Bojchevski;Heng Huang", "authorids": "~Yihan_Wu1;~Aleksandar_Bojchevski1;~Heng_Huang1", "gender": "M;M;M", "homepage": "https://yihwu.github.io/;https://abojchevski.github.io/;https://www.cs.umd.edu/~heng/", "dblp": ";203/8114;03/281", "google_scholar": "cajTg_wAAAAJ;https://scholar.google.de/citations?user=F1APiN4AAAAJ;4OqLaDwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yihan_Wu1;~Aleksandar_Bojchevski1;~Heng_Huang1", "aff": "University of Pittsburgh;CISPA Helmholtz Center for Information Security;University of Pittsburgh", "aff_domain": "pitt.edu;cispa.de;pitt.edu", "position": "PhD student;Principal Researcher;Full Professor", "bibtex": "@misc{\nwu2022adversarial,\ntitle={Adversarial Weight Perturbation Improves Generalization in Graph Neural Networks},\nauthor={Yihan Wu and Aleksandar Bojchevski and Heng Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=hUr6K4D9f7P}\n}", "github": "", "project": "", "reviewers": "gQzb;MYVq;2WEe;DWF4", "site": "https://openreview.net/forum?id=hUr6K4D9f7P", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;4;2", "correctness": "3;4;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "205;57;50;62", "wc_summary_review": "30;59;36;39", "wc_main_review": "247;321;707;166", "wc_review": "482;437;793;267", "wc_reply_reviewers": "99;0;26;0", "wc_reply_authors": "543;501;662;333", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 93.5, 64.5155020130821 ], "wc_summary_review_avg": [ 41.0, 10.88577052853862 ], "wc_main_review_avg": [ 360.25, 207.56610392836302 ], "wc_review_avg": [ 494.75, 189.94785468649022 ], "wc_reply_reviewers_avg": [ 31.25, 40.53008142108772 ], "wc_reply_authors_avg": [ 509.75, 117.9011768389103 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4923659639173309, "corr_recommendation_correctness": -0.28867513459481287, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10319622101226258401&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Pittsburgh;CISPA Helmholtz Center for Information Security", "aff_unique_dep": ";", "aff_unique_url": "https://www.pitt.edu;https://www.cispa.de/", "aff_unique_abbr": "Pitt;CISPA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Germany" }, { "id": "hW2kwAcXq5w", "title": "Discriminator-Weighted Offline Imitation Learning from Suboptimal Demonstrations", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the problem of offline Imitation Learning (IL) where an agent aims to learn an optimal expert behavior policy without additional online environment interactions. 
Instead, the agent is provided with a static offline dataset of state-action-next state transition triples from both optimal and non-optimal expert behaviors. This strictly offline imitation learning problem arises in many real-world problems, where environment interactions and expert annotations are costly. Prior works that address the problem either require that expert data occupies the majority proportion of the offline dataset, or need to learn a reward function and perform offline reinforcement learning (RL) based on the learned reward function. In this paper, we propose an imitation learning algorithm to address the problem without additional steps of reward learning and offline RL training, for the case when demonstrations contain a large proportion of suboptimal data. Built upon behavioral cloning (BC), we introduce an additional discriminator to distinguish expert and non-expert data, and propose a cooperation strategy to boost the performance of both tasks. This results in a new policy learning objective which, surprisingly, we find to be equivalent to a generalized BC objective, where the outputs of the discriminator serve as the weights of the BC loss function. Experimental results show that the proposed algorithm can learn behavior policies that are much closer to the optimal policies than policies learned by baseline algorithms.", "keywords": "imitation learning;offline imitation learning", "primary_area": "", "supplementary_material": "", "author": "Haoran Xu;Xianyuan Zhan;Honglei Yin;Huiling Qin", "authorids": "~Haoran_Xu4;~Xianyuan_Zhan1;~Honglei_Yin1;~Huiling_Qin1", "gender": "M;M;F;M", "homepage": "https://ryanxhr.github.io/;http://zhanxianyuan.xyz/;;https://github.com/flag2freefish", "dblp": ";181/5081;213/0873;", "google_scholar": "iX8AJI0AAAAJ;pDMnGloAAAAJ;https://scholar.google.com.hk/citations?hl=en;", "orcid": ";0000-0002-3683-0554;;", "linkedin": ";;;", "or_profile": "~Haoran_Xu4;~Xianyuan_Zhan1;~Huiling_Qin1;~Yin_Honglei1", "aff": "JD.com;Tsinghua University;;JD Technology", "aff_domain": "jd.com;tsinghua.edu.cn;;jd.com", "position": "Researcher;Associate Professor;;Researcher", "bibtex": "@misc{\nxu2022discriminatorweighted,\ntitle={Discriminator-Weighted Offline Imitation Learning from Suboptimal Demonstrations},\nauthor={Haoran Xu and Xianyuan Zhan and Honglei Yin and Huiling Qin},\nyear={2022},\nurl={https://openreview.net/forum?id=hW2kwAcXq5w}\n}", "github": "", "project": "", "reviewers": "nHQm;jCdL;xvWD;x4DH", "site": "https://openreview.net/forum?id=hW2kwAcXq5w", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;3;4", "correctness": "2;3;2;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "167;128;90;158", "wc_summary_review": "29;79;39;70", "wc_main_review": "428;630;516;163", "wc_review": "624;837;645;391", "wc_reply_reviewers": "356;433;145;0", "wc_reply_authors": "1666;1266;720;182", "reply_reviewers": "1;2;1;0", "reply_authors": "4;3;2;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 135.75, 30.102948360584218 ], "wc_summary_review_avg": [ 54.25, 20.801141795584204 ], "wc_main_review_avg": [ 434.25, 172.2039125571774 ], "wc_review_avg": [ 624.25, 158.19193247444701 ], "wc_reply_reviewers_avg": [ 233.5, 171.14394526246028 ], "wc_reply_authors_avg": [ 958.5, 560.1220849064961 ], "reply_reviewers_avg": [
1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12184701455253705252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2", "aff_unique_norm": "JD.com;Tsinghua University;JD", "aff_unique_dep": ";;JD Technology", "aff_unique_url": "https://www.jd.com;https://www.tsinghua.edu.cn;https://www.jd.com", "aff_unique_abbr": "JD;THU;JD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "h_kn4vXQp1x", "title": "Privacy Protected Multi-Domain Collaborative Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Unsupervised domain adaptation (UDA) aims to transfer knowledge from one or more well-labeled source domains to improve model performance on the different-yet-related target domain without any annotations. However, existing UDA algorithms fail to bring any benefits to source domains and neglect privacy protection during data sharing. With these considerations, we define Privacy Protected Multi-Domain Collaborative Learning (P$^{2}$MDCL) and propose a novel Mask-Driven Federated Network (MDFNet) to reach a ``win-win'' deal for multiple domains with data protected. First, each domain is armed with an individual local model via a mask-disentangled mechanism to learn domain-invariant semantics. Second, the centralized server refines the global invariant model by integrating and exchanging local knowledge across all domains. Moreover, adaptive self-supervised optimization is deployed to learn discriminative features for unlabeled domains.
Finally, theoretical studies and experimental results illustrate the rationality and effectiveness of our method in solving P$^{2}$MDCL.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/7b4889920210a70de78d7441354d4da242298e67.zip", "author": "Haifeng Xia;TAOTAO JING;Zizhan Zheng;Zhengming Ding", "authorids": "~Haifeng_Xia2;~TAOTAO_JING1;~Zizhan_Zheng1;~Zhengming_Ding5", "gender": "M;M;M;M", "homepage": ";https://scottjingtt.github.io;https://www.cs.tulane.edu/~zzheng3/;http://www.cs.tulane.edu/~zding1/", "dblp": "191/6730.html;239/8523;23/286;122/3547", "google_scholar": "41LFIbQAAAAJ;OTPyfwkAAAAJ;B1v2AUYAAAAJ;TKbyRRsAAAAJ", "orcid": ";0000-0001-7597-6532;;0000-0002-6994-5278", "linkedin": ";;;", "or_profile": "~Haifeng_Xia2;~TAOTAO_JING1;~Zizhan_Zheng1;~Zhengming_Ding5", "aff": ";Tulane University;Tulane University;Tulane University", "aff_domain": ";tulane.edu;tulane.edu;tulane.edu", "position": ";PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nxia2022privacy,\ntitle={Privacy Protected Multi-Domain Collaborative Learning},\nauthor={Haifeng Xia and TAOTAO JING and Zizhan Zheng and Zhengming Ding},\nyear={2022},\nurl={https://openreview.net/forum?id=h_kn4vXQp1x}\n}", "github": "", "project": "", "reviewers": "sSTR;Ni1q;9FRA;vJMS", "site": "https://openreview.net/forum?id=h_kn4vXQp1x", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "4;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "79;68;62;34", "wc_summary_review": "126;41;159;27", "wc_main_review": "148;305;146;83", "wc_review": "353;414;367;144", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 60.75, 16.60383991732033 ], "wc_summary_review_avg": [ 88.25, 55.71074851408837 ], "wc_main_review_avg": [ 170.5, 81.934424999508 ], "wc_review_avg": [ 319.5, 103.81353476305486 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nlNNEOrd0gkJ:scholar.google.com/&scioq=Privacy+Protected+Multi-Domain+Collaborative+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tulane University", "aff_unique_dep": "", "aff_unique_url": "https://www.tulane.edu", "aff_unique_abbr": "Tulane", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "hbGV3vzMPzG", "title": "On the Impact of Hard Adversarial Instances on Overfitting in Adversarial Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial training is a popular method to robustify models against adversarial attacks.\nHowever, it exhibits much more severe overfitting than training on clean inputs.\nIn this work, we investigate this phenomenon from the perspective of training instances, i.e., training input-target pairs.\nTo this end, we provide a quantitative and model-agnostic metric measuring the
difficulty of an instance in the training set and analyze the model's behavior on instances of different difficulty levels.\nThis lets us show that the decay in generalization performance of adversarial training is a result of the model's attempt to fit hard adversarial instances.\nWe theoretically verify our observations for both linear and general nonlinear models, proving that models trained on hard instances have worse generalization performance than ones trained on easy instances.\nIn addition, this gap in generalization performance is larger in adversarial training.\nFinally, we investigate solutions for mitigating adversarial overfitting in several scenarios, including when relying on fast adversarial training and in the context of fine-tuning a pretrained model with additional data.\nOur results demonstrate that adaptively using training data can improve model robustness.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/fc555eb3ee2a8be08b4a4c2a176c33affc40133a.zip", "author": "Chen Liu;Zhichao Huang;Mathieu Salzmann;Tong Zhang;Sabine S\u00fcsstrunk", "authorids": "~Chen_Liu1;~Zhichao_Huang1;~Mathieu_Salzmann1;~Tong_Zhang2;~Sabine_S\u00fcsstrunk1", "gender": "M;M;M;M;", "homepage": "http://liuchen1993.cn/HomePage/index.html;;https://people.epfl.ch/mathieu.salzmann;http://tongzhang-ml.org;https://www.epfl.ch/labs/ivrl/", "dblp": "10/2639-27;;18/4533;07/4227-1;s/SSusstrunk", "google_scholar": "48PsswEAAAAJ;https://scholar.google.com/citations?hl=zh-TW;https://scholar.google.ch/citations?user=n-B0jr4AAAAJ;LurWtuYAAAAJ;https://scholar.google.com/citations?hl=de", "orcid": ";;;0000-0002-5511-2558;", "linkedin": ";;;;", "or_profile": "~Chen_Liu1;~Zhichao_Huang1;~Mathieu_Salzmann1;~Tong_Zhang2;~Sabine_S\u00fcsstrunk1", "aff": "Swiss Federal Institute of Technology Lausanne;Hong Kong University of Science and Technology;CSIRO;Hong Kong University of Science and Technology;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;ust.hk;data61.csiro.au;ust.hk;epfl.ch", "position": "PhD student;PhD student;Collaborator;Full Professor;Full Professor", "bibtex": "@misc{\nliu2022on,\ntitle={On the Impact of Hard Adversarial Instances on Overfitting in Adversarial Training},\nauthor={Chen Liu and Zhichao Huang and Mathieu Salzmann and Tong Zhang and Sabine S{\\\"u}sstrunk},\nyear={2022},\nurl={https://openreview.net/forum?id=hbGV3vzMPzG}\n}", "github": "", "project": "", "reviewers": "crXx;oU1N;dYnb", "site": "https://openreview.net/forum?id=hbGV3vzMPzG", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;5;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;1;3", "wc_summary_paper": "65;51;82", "wc_summary_review": "27;53;23", "wc_main_review": "312;213;262", "wc_review": "404;317;367", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1258;865;679", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 66.0, 12.675435561221029 ], "wc_summary_review_avg": [ 34.333333333333336, 13.299958228840001 ], "wc_main_review_avg": [ 262.3333333333333, 40.41726803676314 ], "wc_review_avg": [ 362.6666666666667, 35.64952859280034 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 934.0, 241.35865428859185 ], "reply_reviewers_avg": [ 0, 0 ],
"reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.18898223650461357, "corr_recommendation_correctness": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17805559211448797356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Hong Kong University of Science and Technology;Commonwealth Scientific and Industrial Research Organisation;EPFL", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.epfl.ch;https://www.ust.hk;https://www.csiro.au;https://www.epfl.ch", "aff_unique_abbr": "EPFL;HKUST;CSIRO;EPFL", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Lausanne;Hong Kong SAR;", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "Switzerland;China;Australia" }, { "title": "Fixed Neural Network Steganography: Train the images, not the network", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6161", "id": "hcMvApxGSzZ", "poster": "", "openreview": "https://openreview.net/forum?id=hcMvApxGSzZ", "slides": "https://iclr.cc/virtual/2022/poster/6161", "video": "https://iclr.cc/virtual/2022/poster/6161", "author_site": "Varsha Kishore, Xiangyu Chen, Yan Wang, Boyi Li, Kilian Weinberger", "tldr": "", "abstract": "Recent attempts at image steganography make use of advances in deep learning to train an encoder-decoder network pair to hide and retrieve secret messages in images. These methods are able to hide large amounts of data, but they also incur high decoding error rates (around 20%). In this paper, we propose a novel algorithm for steganography that takes advantage of the fact that neural networks are sensitive to tiny perturbations. Our method, Fixed Neural Network Steganography (FNNS), yields significantly lower error rates when compared to prior state-of-the-art methods and achieves 0% error reliably for hiding up to 3 bits per pixel (bpp) of secret information in images. FNNS also successfully evades existing statistical steganalysis systems and can be modified to evade neural steganalysis systems as well. Recovering every bit correctly, up to 3 bpp, enables novel applications that requires encryption. We introduce one specific use case for facilitating anonymized and safe image sharing. 
Our code is available at https://github.com/varshakishore/FNNS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Varsha Kishore;Xiangyu Chen;Yan Wang;Boyi Li;Kilian Q Weinberger", "authorids": "~Varsha_Kishore1;~Xiangyu_Chen1;~Yan_Wang10;~Boyi_Li1;~Kilian_Q_Weinberger1", "gender": "F;M;M;F;M", "homepage": ";https://www.cs.cornell.edu/~xchen/;https://www.cs.cornell.edu/~yanwang/;https://sites.google.com/site/boyilics/home;http://www.cs.cornell.edu/~kilian/", "dblp": "239/5696;;59/2227;;88/4801", "google_scholar": ";xBv-PMEAAAAJ;nZsD8XwAAAAJ;;jsxk8vsAAAAJ", "orcid": ";;;;0009-0008-9313-7239", "linkedin": ";;;;", "or_profile": "~Varsha_Kishore1;~Xiangyu_Chen1;~Yan_Wang10;~Boyi_Li1;~Kilian_Q_Weinberger1", "aff": "Cornell University;Cornell University;Waymo;Cornell University;ASAPP Inc.", "aff_domain": "cornell.edu;cornell.edu;waymo.com;cornell.edu;asapp.com", "position": "PhD student;PhD student;Researcher;PhD;Principal Researcher", "bibtex": "@inproceedings{\nkishore2022fixed,\ntitle={Fixed Neural Network Steganography: Train the images, not the network},\nauthor={Varsha Kishore and Xiangyu Chen and Yan Wang and Boyi Li and Kilian Q Weinberger},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hcMvApxGSzZ}\n}", "github": "", "project": "", "reviewers": "Cqai;vPjm;NkWr;3seg", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "5;4;3;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "77;46;59;103", "wc_summary_review": "76;74;11;54", "wc_main_review": "850;310;118;574", "wc_review": "1003;430;188;731", "wc_reply_reviewers": "116;52;0;667", "wc_reply_authors": "1075;382;99;904", "reply_reviewers": "1;1;0;2", "reply_authors": "2;1;1;2", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.25, 21.3819433167334 ], "wc_summary_review_avg": [ 53.75, 26.13785568863674 ], "wc_main_review_avg": [ 463.0, 275.91846621783037 ], "wc_review_avg": [ 588.0, 307.26128945898796 ], "wc_reply_reviewers_avg": [ 208.75, 267.74182994070986 ], "wc_reply_authors_avg": [ 615.0, 392.3219341306321 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5875349341202269455&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=hcMvApxGSzZ", "email": "cornell.edu;cornell.edu;waymo.com;cornell.edu;asapp.com", "author_num": 5, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Cornell University;Waymo;ASAPP Inc.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cornell.edu;https://www.waymo.com;https://www.asapp.com", "aff_unique_abbr": "Cornell;Waymo;ASAPP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Continuously Discovering Novel Strategies via Reward-Switching Policy Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6050", "id": "hcQHRHKfN_", "poster": "", 
"openreview": "https://openreview.net/forum?id=hcQHRHKfN_", "slides": "https://iclr.cc/virtual/2022/poster/6050", "video": "https://iclr.cc/virtual/2022/poster/6050", "author_site": "Zihan Zhou, Wei Fu, Bingliang Zhang, Yi Wu", "tldr": "", "abstract": "We present Reward-Switching Policy Optimization (RSPO), a paradigm to discover diverse strategies in complex RL environments by iteratively finding novel policies that are both locally optimal and sufficiently different from existing ones. To encourage the learning policy to consistently converge towards a previously undiscovered local optimum, RSPO switches between extrinsic and intrinsic rewards via a trajectory-based novelty measurement during the optimization process. When a sampled trajectory is sufficiently distinct, RSPO performs standard policy optimization with extrinsic rewards. For trajectories with high likelihood under existing policies, RSPO utilizes an intrinsic diversity reward to promote exploration. Experiments show that RSPO is able to discover a wide spectrum of strategies in a variety of domains, ranging from single-agent navigation tasks and MuJoCo control to multi-agent stag-hunt games and the StarCraft II Multi-Agent Challenge.", "keywords": "diverse behavior;deep reinforcement learning;multi-agent reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/346c31fe592463574dc5a3cb22c9ee707492775a.zip", "author": "Zihan Zhou;Wei Fu;Bingliang Zhang;Yi Wu", "authorids": "~Zihan_Zhou1;~Wei_Fu1;~Bingliang_Zhang1;~Yi_Wu1", "gender": "M;M;M;M", "homepage": ";https://garrett4wade.github.io/;https://zhangbingliang2019.github.io/;https://jxwuyi.weebly.com", "dblp": "00/6525-2;;;", "google_scholar": ";https://scholar.google.com/citations?hl=en;;dusV5HMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Zihan_Zhou1;~Wei_Fu1;~Bingliang_Zhang1;~Yi_Wu1", "aff": "Department of Computer Science, University of Toronto;Institute for Interdisciplinary Information Sciences, Tsinghua University, Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "cs.toronto.edu;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nzhou2022continuously,\ntitle={Continuously Discovering Novel Strategies via Reward-Switching Policy Optimization},\nauthor={Zihan Zhou and Wei Fu and Bingliang Zhang and Yi Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hcQHRHKfN_}\n}", "github": "", "project": "", "reviewers": "tRht;kTab;YXUG;LnVU", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "3;3;4;3", "correctness": "2;4;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "81;58;134;53", "wc_summary_review": "86;25;66;16", "wc_main_review": "632;126;344;262", "wc_review": "799;209;544;331", "wc_reply_reviewers": "1278;0;653;16", "wc_reply_authors": "3397;215;1006;452", "reply_reviewers": "7;0;2;1", "reply_authors": "11;1;2;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 32.09750769140807 ], "wc_summary_review_avg": [ 48.25, 28.81297450802329 ], "wc_main_review_avg": [ 341.0, 185.17289218457435 ], "wc_review_avg": [ 470.75, 224.25250834717545 ], 
"wc_reply_reviewers_avg": [ 486.75, 527.3155483199789 ], "wc_reply_authors_avg": [ 1267.5, 1262.5320788003764 ], "reply_reviewers_avg": [ 2.5, 2.692582403567252 ], "reply_authors_avg": [ 3.75, 4.205650960315181 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5262959814893284728&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=hcQHRHKfN_", "email": "cs.toronto.edu;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Toronto;Tsinghua University", "aff_unique_dep": "Department of Computer Science;Institute for Interdisciplinary Information Sciences", "aff_unique_url": "https://www.utoronto.ca;https://www.tsinghua.edu.cn", "aff_unique_abbr": "U of T;Tsinghua", "aff_campus_unique_index": "0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;China" }, { "title": "Fast AdvProp", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6172", "id": "hcoswsDHNAW", "poster": "", "openreview": "https://openreview.net/forum?id=hcoswsDHNAW", "slides": "https://iclr.cc/virtual/2022/poster/6172", "video": "https://iclr.cc/virtual/2022/poster/6172", "author_site": "Jieru Mei, Yucheng Han, Yutong Bai, Yixiao Zhang, Yinigwei Li, xianhang li, Alan Yuille, Cihang Xie", "tldr": "", "abstract": "Adversarial Propagation (AdvProp) is an effective way to improve recognition models, leveraging adversarial examples. Nonetheless, AdvProp suffers from the extremely slow training speed, mainly because: a) extra forward and backward passes are required for generating adversarial examples; b) both original samples and their adversarial counterparts are used for training (i.e., 2X data). In this paper, we introduce Fast AdvProp, which aggressively revamps AdvProp's costly training components, rendering the method nearly as cheap as the vanilla training. Specifically, our modifications in Fast AdvProp are guided by the hypothesis that disentangled learning with adversarial examples is the key for performance improvements, while other training recipes (e.g., paired clean and adversarial training samples, multi-step adversarial attackers) could be largely simplified. \n\nOur empirical results show that, compared to the vanilla training baseline, Fast AdvProp is able to further model performance on a spectrum of visual benchmarks, without incurring extra training cost. Additionally, our ablations find Fast AdvProp scales better if larger models are used, is compatible with existing data augmentation methods (i.e., Mixup and CutMix), and can be easily adapted to other recognition tasks like object detection. 
The code is available here: https://github.com/meijieru/fast_advprop.", "keywords": "Adversarial examples;efficient training;generalization", "primary_area": "", "supplementary_material": "", "author": "Jieru Mei;Yucheng Han;Yutong Bai;Yixiao Zhang;Yingwei Li;Xianhang Li;Alan Yuille;Cihang Xie", "authorids": "~Jieru_Mei2;~Yucheng_Han1;~Yutong_Bai1;~Yixiao_Zhang1;~Yingwei_Li4;~Xianhang_Li1;~Alan_Yuille1;~Cihang_Xie3", "gender": "M;M;F;M;M;M;M;M", "homepage": "https://meijieru.com/;https://tingxueronghua.github.io/;https://yutongbai.com/;;http://yingwei.li/;https://xhl-video.github.io/xianhangli/;;https://cihangxie.github.io/", "dblp": "198/9332.html;226/9017;216/8431;;;268/5945;y/AlanLYuille;175/3366", "google_scholar": "nHKExN0AAAAJ;LbwqJBQAAAAJ;N1-l4GsAAAAJ;lU3wroMAAAAJ;phWmJeIAAAAJ;YKpFz4YAAAAJ;;X3vVZPcAAAAJ", "orcid": ";;;;;;;", "linkedin": "meijieru/;;%E9%9B%A8%E6%A1%90-%E7%99%BD-59a44a136/;;;;;", "or_profile": "~Jieru_Mei2;~Yucheng_Han1;~Yutong_Bai1;~Yixiao_Zhang1;~Yingwei_Li4;~Xianhang_Li1;~Alan_Yuille1;~cihang_xie1", "aff": "Johns Hopkins University;Nanyang Technological University;Johns Hopkins University;Johns Hopkins University;Johns Hopkins University;University of California, Santa Cruz;Johns Hopkins University;University of California, Santa Cruz", "aff_domain": "jhu.edu;ntu.edu.sg;jhu.edu;jhu.edu;jhu.edu;ucsc.edu;johnshopkins.edu;ucsc.edu", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nmei2022fast,\ntitle={Fast AdvProp},\nauthor={Jieru Mei and Yucheng Han and Yutong Bai and Yixiao Zhang and Yingwei Li and Xianhang Li and Alan Yuille and Cihang Xie},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hcoswsDHNAW}\n}", "github": "", "project": "", "reviewers": "QrYp;7oqV;63kv;e6SS", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "4;4;4;5", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "55;55;69;110", "wc_summary_review": "24;111;46;40", "wc_main_review": "211;261;345;179", "wc_review": "290;427;460;329", "wc_reply_reviewers": "0;0;0;80", "wc_reply_authors": "513;264;179;317", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 72.25, 22.531921799970814 ], "wc_summary_review_avg": [ 55.25, 33.17661073708404 ], "wc_main_review_avg": [ 249.0, 62.65780079128216 ], "wc_review_avg": [ 376.5, 69.39200242102832 ], "wc_reply_reviewers_avg": [ 20.0, 34.64101615137755 ], "wc_reply_authors_avg": [ 318.25, 122.74236228784258 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": 1.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17518006235660748268&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=hcoswsDHNAW", "email": "jhu.edu;ntu.edu.sg;jhu.edu;jhu.edu;jhu.edu;ucsc.edu;johnshopkins.edu;ucsc.edu", "author_num": 8, "aff_unique_index": "0;1;0;0;0;2;0;2", "aff_unique_norm": "Johns Hopkins University;Nanyang Technological University;University of 
California, Santa Cruz", "aff_unique_dep": ";;", "aff_unique_url": "https://www.jhu.edu;https://www.ntu.edu.sg;https://www.ucsc.edu", "aff_unique_abbr": "JHU;NTU;UCSC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;1;0;0;0;0;0;0", "aff_country_unique": "United States;Singapore" }, { "id": "hdSn_X7Hfvz", "title": "Deep Probability Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reliable probability estimation is of crucial importance in many real-world applications where there is inherent uncertainty, such as weather forecasting, medical prognosis, or collision avoidance in autonomous vehicles. Probability-estimation models are trained on observed outcomes (e.g. whether it has rained or not, or whether a patient has died or not), because the ground-truth probabilities of the events of interest are typically unknown. The problem is therefore analogous to binary classification, with the important difference that the objective is to estimate probabilities rather than predicting the specific outcome. The goal of this work is to investigate probability estimation from high-dimensional data using deep neural networks. There exist several methods to improve the probabilities generated by these models but they mostly focus on classification problems where the probabilities are related to model uncertainty. In the case of problems with inherent uncertainty, it is challenging to evaluate performance without access to ground-truth probabilities. To address this, we build a synthetic dataset to study and compare different computable metrics. We evaluate existing methods on the synthetic data as well as on three real-world probability estimation tasks, all of which involve inherent uncertainty: precipitation forecasting from radar images, predicting cancer patient survival from histopathology images, and predicting car crashes from dashcam videos. Finally, we also propose a new method for probability estimation using neural networks, which modifies the training process to promote output probabilities that are consistent with empirical probabilities computed from the data. 
The method outperforms existing approaches on most metrics on the simulated as well as real-world data.", "keywords": "Probability estimation;calibration;uncertainty;weather forecasting;medical prognosis;car crash;benchmark datasets;deep learning;high dimensional data", "primary_area": "", "supplementary_material": "/attachment/cdb98ed379ddedad06dffde7631b93ba848414b0.zip", "author": "Weicheng Zhu;Matan Leibovich;Sheng Liu;Sreyas Mohan;Aakash Kaku;Boyang Yu;Laure Zanna;Narges Razavian;Carlos Fernandez-Granda", "authorids": "~Weicheng_Zhu1;ml7557@nyu.edu;~Sheng_Liu2;~Sreyas_Mohan1;~Aakash_Kaku1;~Boyang_Yu3;lz1955@nyu.edu;~Narges_Razavian1;~Carlos_Fernandez-Granda1", "gender": "M;;;M;;F;;;", "homepage": ";;https://shengliu66.github.io/;https://sreyas-mohan.github.io;https://aakashrkaku.github.io/;;;;https://cims.nyu.edu/~cfgranda/", "dblp": "180/5811;;;200/8516;254/2931;;;https://dblp.org/pers/hd/r/Razavian:Narges;77/11141", "google_scholar": "Glw83HYAAAAJ;;rzhzR-cAAAAJ;https://scholar.google.co.in/citations?user=jaobZDsAAAAJ;lgObq7UAAAAJ;;;;GX-PtukAAAAJ", "orcid": ";;;;0000-0002-2631-0897;;;;", "linkedin": ";;;;;boyang-yu-466537159;;;", "or_profile": "~Weicheng_Zhu1;ml7557@nyu.edu;~Sheng_Liu2;~Sreyas_Mohan1;~Aakash_Kaku1;~Boyang_Yu3;lz1955@nyu.edu;~Narges_Razavian1;~Carlos_Fernandez-Granda1", "aff": "New York University;;New York University;New York University;New York University;New York University;;New York University;New York University", "aff_domain": "nyu.edu;;nyu.edu;nyu.edu;nyu.edu;nyu.edu;;nyu.edu;nyu.edu", "position": "PhD student;;PhD student;PhD student;PhD student;PhD student;;Assistant Professor;Associate Professor", "bibtex": "@misc{\nzhu2022deep,\ntitle={Deep Probability Estimation},\nauthor={Weicheng Zhu and Matan Leibovich and Sheng Liu and Sreyas Mohan and Aakash Kaku and Boyang Yu and Laure Zanna and Narges Razavian and Carlos Fernandez-Granda},\nyear={2022},\nurl={https://openreview.net/forum?id=hdSn_X7Hfvz}\n}", "github": "", "project": "", "reviewers": "r2Pt;Lp12;vz2e;QWNy", "site": "https://openreview.net/forum?id=hdSn_X7Hfvz", "pdf_size": 0, "recommendation": "1;5;5;6", "confidence": "4;4;4;3", "correctness": "1;3;4;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "46;67;31;60", "wc_summary_review": "26;32;65;203", "wc_main_review": "321;167;392;73", "wc_review": "393;266;488;336", "wc_reply_reviewers": "281;31;257;108", "wc_reply_authors": "1880;919;1400;1064", "reply_reviewers": "2;1;2;1", "reply_authors": "5;2;3;2", "recommendation_avg": [ 4.25, 1.920286436967152 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 51.0, 13.80217374184226 ], "wc_summary_review_avg": [ 81.5, 71.70251041630272 ], "wc_main_review_avg": [ 238.25, 125.3702018025017 ], "wc_review_avg": [ 370.75, 81.27538065121566 ], "wc_reply_reviewers_avg": [ 169.25, 103.7457830468304 ], "wc_reply_authors_avg": [ 1315.75, 369.5486267056069 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.5261522196019801, "corr_recommendation_correctness": 0.8661541520797733, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15071615876185321831&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;0;0", 
"aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scalable One-Pass Optimisation of High-Dimensional Weight-Update Hyperparameters by Implicit Differentiation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6509", "id": "hfU7Ka5cfrC", "poster": "", "openreview": "https://openreview.net/forum?id=hfU7Ka5cfrC", "slides": "https://iclr.cc/virtual/2022/poster/6509", "video": "https://iclr.cc/virtual/2022/poster/6509", "author_site": "Ross Clarke, Elre Oldewage, Jos\u00e9 Miguel Hern\u00e1ndez Lobato", "tldr": "", "abstract": "Machine learning training methods depend plentifully and intricately on hyperparameters, motivating automated strategies for their optimisation. Many existing algorithms restart training for each new hyperparameter choice, at considerable computational cost. Some hypergradient-based one-pass methods exist, but these either cannot be applied to arbitrary optimiser hyperparameters (such as learning rates and momenta) or take several times longer to train than their base models. We extend these existing methods to develop an approximate hypergradient-based hyperparameter optimiser which is applicable to any continuous hyperparameter appearing in a differentiable model weight update, yet requires only one training episode, with no restarts. We also provide a motivating argument for convergence to the true hypergradient, and perform tractable gradient-based optimisation of independent learning rates for each model parameter. Our method performs competitively from varied random hyperparameter initialisations on several UCI datasets and Fashion-MNIST (using a one-layer MLP), Penn Treebank (using an LSTM) and CIFAR-10 (using a ResNet-18), in time only 2-3x greater than vanilla training.", "keywords": "Hyperparameter Optimisation", "primary_area": "", "supplementary_material": "/attachment/f57e3b46ebbdb3ef82fb39cc04d569d0db8ee8e7.zip", "author": "Ross M Clarke;Elre Talea Oldewage;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "~Ross_M_Clarke1;~Elre_Talea_Oldewage1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", "gender": "M;F;", "homepage": ";http://mlg.eng.cam.ac.uk/?portfolio=elre-oldewage;http://jmhl.org", "dblp": "304/7918;;40/6058", "google_scholar": "1joGBpgAAAAJ;;BEBccCQAAAAJ", "orcid": "0000-0001-9884-046X;0000-0002-0568-8700;0000-0001-7610-949X", "linkedin": ";;", "or_profile": "~Ross_M_Clarke1;~Elre_Talea_Oldewage1;~Jose_Miguel_Hernandez_Lobato1", "aff": "University of Cambridge;University of Cambridge;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk;cam.ac.uk", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nclarke2022scalable,\ntitle={Scalable One-Pass Optimisation of High-Dimensional Weight-Update Hyperparameters by Implicit Differentiation},\nauthor={Ross M Clarke and Elre Talea Oldewage and Jos{\\'e} Miguel Hern{\\'a}ndez-Lobato},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hfU7Ka5cfrC}\n}", "github": "", "project": "", "reviewers": "c7so;1R41;QKup;iZEW", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "59;67;93;64", "wc_summary_review": 
"61;46;104;54", "wc_main_review": "439;358;746;425", "wc_review": "559;471;943;543", "wc_reply_reviewers": "71;26;105;309", "wc_reply_authors": "1031;619;1550;1248", "reply_reviewers": "1;1;1;2", "reply_authors": "2;1;3;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 70.75, 13.160072188251856 ], "wc_summary_review_avg": [ 66.25, 22.431841208425134 ], "wc_main_review_avg": [ 492.0, 149.80821072291064 ], "wc_review_avg": [ 629.0, 184.29324458590446 ], "wc_reply_reviewers_avg": [ 127.75, 108.33137818748546 ], "wc_reply_authors_avg": [ 1112.0, 339.09806841089494 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13151691768844954794&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=hfU7Ka5cfrC", "email": "cam.ac.uk;cam.ac.uk;cam.ac.uk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "hfjbX1UKNx", "title": "MCL-GAN: Generative Adversarial Networks with Multiple Specialized Discriminators", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a generative adversarial network with multiple discriminators, which collaborate to represent a real dataset more effectively. This approach facilitates learning a generator consistent with the underlying data distribution based on real images and thus mitigates the chronic mode collapse problem. From the inspiration of multiple choice learning, we guide each discriminator to have expertise in the subset of the entire data and allow the generator to find reasonable correspondences between the latent and real data spaces automatically without the extra supervision for training examples. Despite the use of multiple discriminators, the backbone networks are shared across the discriminators and the increase of training cost is marginal. 
We demonstrate the effectiveness of our algorithm using multiple evaluation metrics in the standard datasets for diverse tasks.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9fc2909e6ac5172eba06eb44c8848d3e23cbb493.zip", "author": "Jinyoung Choi;Bohyung Han", "authorids": "~Jinyoung_Choi2;~Bohyung_Han1", "gender": "F;Not Specified", "homepage": ";http://cvlab.snu.ac.kr/~bhhan", "dblp": ";73/4880.html", "google_scholar": "https://scholar.google.com/citations?hl=en;9aaeCToAAAAJ", "orcid": ";", "linkedin": "jinyoung-choi-7b7470189/;", "or_profile": "~Jinyoung_Choi2;~Bohyung_Han1", "aff": "Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@misc{\nchoi2022mclgan,\ntitle={{MCL}-{GAN}: Generative Adversarial Networks with Multiple Specialized Discriminators},\nauthor={Jinyoung Choi and Bohyung Han},\nyear={2022},\nurl={https://openreview.net/forum?id=hfjbX1UKNx}\n}", "github": "", "project": "", "reviewers": "Qjrq;bNGB;pRtx;h6xM", "site": "https://openreview.net/forum?id=hfjbX1UKNx", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "98;122;47;84", "wc_summary_review": "130;63;16;429", "wc_main_review": "693;611;269;97", "wc_review": "921;796;332;610", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "646;448;346;603", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 87.75, 27.169606180436254 ], "wc_summary_review_avg": [ 159.5, 160.78323917622757 ], "wc_main_review_avg": [ 417.5, 243.98514299030586 ], "wc_review_avg": [ 664.75, 221.70405386460573 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 510.75, 120.29417068170844 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5532779288466758321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "A generalization of the randomized singular value decomposition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7116", "id": "hgKtwSb4S2", "poster": "", "openreview": "https://openreview.net/forum?id=hgKtwSb4S2", "slides": "https://iclr.cc/virtual/2022/poster/7116", "video": "https://iclr.cc/virtual/2022/poster/7116", "author_site": "Nicolas Boulle, Alex Townsend", "tldr": "", "abstract": "The randomized singular value decomposition (SVD) is a popular and effective algorithm for computing a near-best rank $k$ approximation of a matrix $A$ using matrix-vector products with standard Gaussian vectors. Here, we generalize the theory of randomized SVD to multivariate Gaussian vectors, allowing one to incorporate prior knowledge of $A$ into the algorithm. 
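The generalization described above amounts to drawing the sketch matrix from N(0, K) instead of N(0, I). A minimal NumPy sketch follows, with an optional user-supplied covariance K encoding prior knowledge of A; the jitter and oversampling values are conventional choices, not prescribed by the paper.

```python
# Minimal sketch of a randomized SVD with correlated Gaussian test vectors.
import numpy as np

def randomized_svd(A, k, K=None, oversample=10):
    n = A.shape[1]
    # Cholesky factor of the prior covariance; identity recovers the standard method.
    C = np.linalg.cholesky(K + 1e-10 * np.eye(n)) if K is not None else np.eye(n)
    Omega = C @ np.random.randn(n, k + oversample)   # columns ~ N(0, K)
    Q, _ = np.linalg.qr(A @ Omega)                   # approximate range of A
    U, s, Vt = np.linalg.svd(Q.T @ A, full_matrices=False)
    return (Q @ U)[:, :k], s[:k], Vt[:k]

A = np.random.randn(200, 100) @ np.random.randn(100, 100)
U, s, Vt = randomized_svd(A, k=10)
print(np.linalg.norm(A - U @ np.diag(s) @ Vt) / np.linalg.norm(A))
```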
This enables us to explore the continuous analogue of the randomized SVD for Hilbert--Schmidt (HS) operators using operator-function products with functions drawn from a Gaussian process (GP). We then construct a new covariance kernel for GPs, based on weighted Jacobi polynomials, which allows us to rapidly sample the GP and control the smoothness of the randomly generated functions. Numerical examples on matrices and HS operators demonstrate the applicability of the algorithm.", "keywords": "Low rank approximation;Randomized SVD;Hilbert--Schmidt operators;Gaussian processes", "primary_area": "", "supplementary_material": "/attachment/22307c6009ec85164c46736b5486f1003eeed7a3.zip", "author": "Nicolas Boulle;Alex Townsend", "authorids": "~Nicolas_Boulle1;~Alex_Townsend1", "gender": ";M", "homepage": "https://nboulle.github.io/;http://pi.math.cornell.edu/~ajt/", "dblp": "247/6364;130/5109", "google_scholar": "EWw0oakAAAAJ;432SChwAAAAJ", "orcid": "0000-0002-1425-8307;0000-0002-8183-7077", "linkedin": "nboulle/;", "or_profile": "~Nicolas_Boulle1;~Alex_Townsend1", "aff": "University of Oxford;Cornell University", "aff_domain": "ox.ac.uk;cornell.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nboulle2022a,\ntitle={A generalization of the randomized singular value decomposition},\nauthor={Nicolas Boulle and Alex Townsend},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hgKtwSb4S2}\n}", "github": "", "project": "", "reviewers": "FGbe;ksBY;uuEP", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "3;4;3", "correctness": "3;4;3", "technical_novelty": "3;4;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "98;30;31", "wc_summary_review": "25;26;60", "wc_main_review": "290;474;492", "wc_review": "413;530;583", "wc_reply_reviewers": "0;9;22", "wc_reply_authors": "930;510;787", "reply_reviewers": "0;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 53.0, 31.822423959633664 ], "wc_summary_review_avg": [ 37.0, 16.268579122549905 ], "wc_main_review_avg": [ 418.6666666666667, 91.27735510823895 ], "wc_review_avg": [ 508.6666666666667, 71.02268808079727 ], "wc_reply_reviewers_avg": [ 10.333333333333334, 9.030811456096044 ], "wc_reply_authors_avg": [ 742.3333333333334, 174.34894767550378 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9291959051125305483&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=hgKtwSb4S2", "email": "ox.ac.uk;cornell.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Oxford;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.cornell.edu", "aff_unique_abbr": "Oxford;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Maximizing Ensemble Diversity in 
Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6498", "id": "hjd-kcpDpf2", "poster": "", "openreview": "https://openreview.net/forum?id=hjd-kcpDpf2", "slides": "https://iclr.cc/virtual/2022/poster/6498", "video": "https://iclr.cc/virtual/2022/poster/6498", "author_site": "Hassam Sheikh, mariano Phielipp, Ladislau Boloni", "tldr": "", "abstract": "Modern deep reinforcement learning (DRL) has been successful in solving a range of challenging sequential decision-making problems. Most of these algorithms use an ensemble of neural networks as their backbone structure and benefit from the diversity among the neural networks to achieve optimal results. Unfortunately, the members of the ensemble can converge to the same point either the parametric space or representation space during the training phase, therefore, losing all the leverage of an ensemble. In this paper, we describe Maximize Ensemble Diversity in Reinforcement Learning (MED-RL), a set of regularization methods inspired from the economics and consensus optimization to improve diversity in the ensemble-based deep reinforcement learning methods by encouraging inequality between the networks during training. We integrated MED-RL in five of the most common ensemble-based deep RL algorithms for both continuous and discrete control tasks and evaluated on six Mujoco environments and six Atari games. Our results show that MED-RL augmented algorithms outperform their un-regularized counterparts significantly and in some cases achieved more than 300$\\%$ in performance gains.", "keywords": "Ensemble Based Reinforcement Learning;Ensemble Diversity", "primary_area": "", "supplementary_material": "", "author": "Hassam Sheikh;Mariano Phielipp;Ladislau Boloni", "authorids": "~Hassam_Sheikh1;~Mariano_Phielipp2;~Ladislau_Boloni1", "gender": "M;M;M", "homepage": ";https://www.intel.com/content/www/us/en/research/researchers/mariano-phielipp.html;http://www.cs.ucf.edu/~lboloni/", "dblp": ";23/4518;b/LadislauBoloni", "google_scholar": "https://scholar.google.co.uk/citations?user=QTCAAGQAAAAJ;YArRsvEAAAAJ;drG1_tsAAAAJ", "orcid": ";;0000-0001-5336-9651", "linkedin": ";mariano-phielipp-941624;lotzi-b%C3%B6l%C3%B6ni-4a3b79/", "or_profile": "~Hassam_Sheikh1;~Mariano_Phielipp2;~Ladislau_Boloni1", "aff": "Intel Labs;Intel Labs;University of Central Florida", "aff_domain": "intel.com;intel.com;ucf.edu", "position": "Research Scientist;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nsheikh2022maximizing,\ntitle={Maximizing Ensemble Diversity in Deep Reinforcement Learning},\nauthor={Hassam Sheikh and Mariano Phielipp and Ladislau Boloni},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hjd-kcpDpf2}\n}", "github": "", "project": "", "reviewers": "i4M1;TfGq;6miY;a9sA", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;3;3", "correctness": "3;2;2;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "56;52;70;23", "wc_summary_review": "41;216;32;17", "wc_main_review": "275;530;596;249", "wc_review": "372;798;698;289", "wc_reply_reviewers": "0;17;188;0", "wc_reply_authors": "469;456;529;287", "reply_reviewers": "0;1;3;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], 
"wc_summary_paper_avg": [ 50.25, 17.09349291397168 ], "wc_summary_review_avg": [ 76.5, 80.99537023805743 ], "wc_main_review_avg": [ 412.5, 152.57539120054716 ], "wc_review_avg": [ 539.25, 213.74678360153166 ], "wc_reply_reviewers_avg": [ 51.25, 79.25709747398021 ], "wc_reply_authors_avg": [ 435.25, 89.91210986291001 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14601374169084607034&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=hjd-kcpDpf2", "email": "intel.com;intel.com;ucf.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Intel;University of Central Florida", "aff_unique_dep": "Intel Labs;", "aff_unique_url": "https://www.intel.com;https://www.ucf.edu", "aff_unique_abbr": "Intel;UCF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "hjlXybdILM3", "title": "When less is more: Simplifying inputs aids neural network understanding", "track": "main", "status": "Reject", "tldr": "", "abstract": "Are all bits useful? In this work, we propose SimpleBits, a method to synthesize simplified inputs by reducing information content, and carefully measure the effect of such simplification on learning. Crucially, SimpleBits does not require any domain-specific knowledge to constrain which input features should be removed. Instead, SimpleBits learns to remove the features of inputs which are least relevant for a given task. Concretely, we jointly optimize for input simplification by reducing inputs' bits per dimension as given by a pretrained generative model, as well as for the classification performance. We apply the simplification approach to a wide range of scenarios: conventional training, dataset condensation and post-hoc explanations. In this way, we analyze what simplified inputs tell us about the decisions made by classification networks. We show that our simplification approach successfully removes superfluous information for tasks with injected distractors. When applied post-hoc, our approach provides intuition into reasons for misclassifications of conventionally trained classifiers. Finally, for dataset condensation, we find that inputs can be simplified with only minimal accuracy degradation. 
Overall, our learning-based simplification approach offers a valuable new tool to explore the basis of network decisions.", "keywords": "interpretability;compression;network training", "primary_area": "", "supplementary_material": "", "author": "Robin Tibor Schirrmeister;Rosanne Liu;Sara Hooker;Tonio Ball", "authorids": "~Robin_Tibor_Schirrmeister1;~Rosanne_Liu1;~Sara_Hooker1;~Tonio_Ball1", "gender": "M;F;;M", "homepage": ";https://rosanneliu.com/;https://www.sarahooker.me/;https://www.ieeg.uni-freiburg.de", "dblp": "198/1371;218/6453;210/2611;11/10737", "google_scholar": "https://scholar.google.de/citations?user=lpuMlzsAAAAJ;_GzrRGwAAAAJ;2xy6h3sAAAAJ;https://scholar.google.de/citations?user=UVyn5ggAAAAJ", "orcid": "0000-0002-5518-7445;;;", "linkedin": ";;;", "or_profile": "~Robin_Tibor_Schirrmeister1;~Rosanne_Liu1;~Sara_Hooker1;~Tonio_Ball1", "aff": "University of Freiburg, Albert-Ludwigs-Universit\u00e4t Freiburg;ML Collective;Google Brain;Uniklinik Freiburg", "aff_domain": "cs.uni-freiburg.de;mlcollective.org;google.com;uniklinik-freiburg.de", "position": "PhD student;Researcher;Research Scientist;Associate Professor", "bibtex": "@misc{\nschirrmeister2022when,\ntitle={When less is more: Simplifying inputs aids neural network understanding},\nauthor={Robin Tibor Schirrmeister and Rosanne Liu and Sara Hooker and Tonio Ball},\nyear={2022},\nurl={https://openreview.net/forum?id=hjlXybdILM3}\n}", "github": "", "project": "", "reviewers": "RfmX;ZQYq;agcx;eYVm", "site": "https://openreview.net/forum?id=hjlXybdILM3", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "38;135;214;78", "wc_summary_review": "103;121;250;247", "wc_main_review": "294;237;305;343", "wc_review": "435;493;769;668", "wc_reply_reviewers": "14;0;0;0", "wc_reply_authors": "1449;793;784;1107", "reply_reviewers": "1;0;0;0", "reply_authors": "3;2;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 116.25, 66.13008014512005 ], "wc_summary_review_avg": [ 180.25, 68.55426682563238 ], "wc_main_review_avg": [ 294.75, 37.97614382740828 ], "wc_review_avg": [ 591.25, 133.74672893196305 ], "wc_reply_reviewers_avg": [ 3.5, 6.06217782649107 ], "wc_reply_authors_avg": [ 1033.25, 273.0076693061937 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18041503167094259234&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Freiburg;ML Collective;Google;University Medical Center Freiburg", "aff_unique_dep": ";;Google Brain;", "aff_unique_url": "https://www.uni-freiburg.de;;https://brain.google.com;https://www.uniklinik-freiburg.de", "aff_unique_abbr": "UoF;;Google Brain;UMF", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Freiburg;;Mountain View", "aff_country_unique_index": "0;2;0", "aff_country_unique": "Germany;;United States" }, { "id": "hk3Cxc2laT-", "title": "Clustered Task-Aware Meta-Learning by Learning from Learning Paths", "track": "main", "status": "Reject", "tldr": "", "abstract": "To enable 
effective learning of new tasks with only a few samples, meta-learning acquires common knowledge from the existing tasks with a globally shared meta-learner. To further address the problem of task heterogeneity, recent developments balance between customization and generalization by incorporating task clustering to generate the task-aware modulation applied to the global meta-learner. However, these methods learn task representation mostly from the features of input data, while the task-specific optimization process with respect to the base-learner model is often neglected. In this work, we propose a Clustered Task-Aware Meta-Learning (CTML) framework with task representation learned from its own learning path. We first conduct rehearsed task learning from the common initialization, and collect a set of geometric quantities that adequately describes this learning path. By inputting this set of values into a meta path learner, we automatically abstract a path representation optimized for the downstream clustering and modulation. To further save the computational cost incurred by the additional rehearsed learning, we devise a shortcut tunnel to directly map between the path and feature cluster assignments. Extensive experiments on two real-world application domains, few-shot image classification and cold-start recommendation, demonstrate the superiority of CTML compared to state-of-the-art baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Danni Peng;Sinno Pan", "authorids": "~Danni_Peng1;~Sinno_Pan1", "gender": ";M", "homepage": ";http://www.cse.cuhk.edu.hk/~sinnopan/", "dblp": ";80/5412", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Danni_Peng1;~Sinno_Pan1", "aff": ";Nanyang Technological University", "aff_domain": ";ntu.edu.sg", "position": ";Associate Professor", "bibtex": "@misc{\npeng2022clustered,\ntitle={Clustered Task-Aware Meta-Learning by Learning from Learning Paths},\nauthor={Danni Peng and Sinno Pan},\nyear={2022},\nurl={https://openreview.net/forum?id=hk3Cxc2laT-}\n}", "github": "", "project": "", "reviewers": "oNDW;kvxn;9y7H;fGEV", "site": "https://openreview.net/forum?id=hk3Cxc2laT-", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "81;80;227;77", "wc_summary_review": "18;149;282;33", "wc_main_review": "397;246;801;91", "wc_review": "496;475;1310;201", "wc_reply_reviewers": "167;0;1970;0", "wc_reply_authors": "2738;660;4416;519", "reply_reviewers": "1;0;5;0", "reply_authors": "7;2;9;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 116.25, 63.95848262740447 ], "wc_summary_review_avg": [ 120.5, 106.13317106352754 ], "wc_main_review_avg": [ 383.75, 264.07894179581984 ], "wc_review_avg": [ 620.5, 414.7472121666401 ], "wc_reply_reviewers_avg": [ 534.25, 831.7296360596994 ], "wc_reply_authors_avg": [ 2083.25, 1608.0219797938087 ], "reply_reviewers_avg": [ 1.5, 2.0615528128088303 ], "reply_authors_avg": [ 4.75, 3.344772040064913 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=17896333807363164330&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_country_unique_index": "0", "aff_country_unique": "Singapore" }, { "id": "hkXZKTAH5g-", "title": "Image Dataset Compression Based on Matrix Product States", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Large-scale datasets have produced impressive advances in machine learning. However, storing datasets and training neural network models on large datasets have become increasingly expensive. In this paper, we present an effective dataset compression approach based on the matrix product states (MPS) from quantum many-body physics. It can decompose an original image into a sequential product of tensors which effectively retain short-range correlation information in the data for training deep neural networks from scratch. Based on the MPS structure, we propose a new dataset compression method that compresses datasets by filtering long-range correlation information in task-agnostic scenarios and uses dataset distillation to supplement the information in task-specific scenarios. Our approach boosts the model performance by information supplementation and meanwhile maximizes useful information for the downstream task. Extensive experiments have demonstrated the effectiveness of the proposed approach in dataset compression, especially obtained better model performance (3.19$\\%$ on average) than state-of-the-art methods for the same compression rate.", "keywords": "dataset compression;matrix product state", "primary_area": "", "supplementary_material": "", "author": "Ze-Feng Gao;Peiyu Liu;Xiao-Hui Zhang;Xin Zhao;Z. Y. Xie;Zhong-Yi Lu;Ji-Rong Wen", "authorids": "~Ze-Feng_Gao1;~Peiyu_Liu1;~Xiao-Hui_Zhang1;~Xin_Zhao10;~Z._Y._Xie1;~Zhong-Yi_Lu1;~Ji-Rong_Wen1", "gender": "M;M;;M;M;M;M", "homepage": "https://zfgao66.github.io/homepage/;https://peiyuliu.tech/;;https://gsai.ruc.edu.cn/addons/teacher/index/info.html?user_id=5&ruccode=20140041&ln=cn;https://arxiv.org/a/xie_z_4.html;;https://gsai.ruc.edu.cn/english/jrwen", "dblp": "239/5268.html;85/670-2;;https://dblp.uni-trier.de/pid/52/8700.html;;;w/JRWen", "google_scholar": "vB64k4IAAAAJ;0UyGs0YAAAAJ;;JNhNacoAAAAJ;;;tbxCHJgAAAAJ", "orcid": "0000-0002-6695-8209;;;0000-0002-8333-6196;;0000-0001-8866-3180;0000-0002-9777-9676", "linkedin": ";;;;;;", "or_profile": "~Ze-Feng_Gao1;~Peiyu_Liu1;~Xiao-Hui_Zhang1;~Xin_Zhao10;~Z._Y._Xie1;~Zhong-Yi_Lu1;~Ji-Rong_Wen1", "aff": "Renmin University of China;Renmin University of China;;Renmin University of China;Renmin University of China, Tsinghua University;Renmin University of China, Tsinghua University;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;;ruc.edu.cn;ruc.edu.cn;ruc.edu.cn;ruc.edu.cn", "position": "Postdoc;PhD student;;Full Professor;Associate Professor;Full Professor;Full Professor", "bibtex": "@misc{\ngao2022image,\ntitle={Image Dataset Compression Based on Matrix Product States},\nauthor={Ze-Feng Gao and Peiyu Liu and Xiao-Hui Zhang and Xin Zhao and Z. Y. 
Xie and Zhong-Yi Lu and Ji-Rong Wen},\nyear={2022},\nurl={https://openreview.net/forum?id=hkXZKTAH5g-}\n}", "github": "", "project": "", "reviewers": "MjbY;oesM;GkYX;mwL4", "site": "https://openreview.net/forum?id=hkXZKTAH5g-", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;4;3;4", "correctness": "1;1;4;2", "technical_novelty": "1;1;3;2", "empirical_novelty": "1;1;3;2", "wc_summary_paper": "82;46;18;52", "wc_summary_review": "66;16;29;15", "wc_main_review": "251;269;99;208", "wc_review": "399;331;146;275", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 1.224744871391589 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 49.5, 22.73213584333861 ], "wc_summary_review_avg": [ 31.5, 20.670026608594387 ], "wc_main_review_avg": [ 206.75, 66.03928754915516 ], "wc_review_avg": [ 287.75, 92.87457940685384 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.7385489458759963, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TD1BbbWjgRAJ:scholar.google.com/&scioq=Image+Dataset+Compression+Based+on+Matrix+Product+States&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Do We Need Anisotropic Graph Neural Networks?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6535", "id": "hl9ePdHO4_s", "poster": "", "openreview": "https://openreview.net/forum?id=hl9ePdHO4_s", "slides": "https://iclr.cc/virtual/2022/poster/6535", "video": "https://iclr.cc/virtual/2022/poster/6535", "author_site": "Shyam Tailor, Felix Opolka, Pietro Lio, Nicholas Lane", "tldr": "", "abstract": "Common wisdom in the graph neural network (GNN) community dictates that anisotropic models---in which messages sent between nodes are a function of both the source and target node---are required to achieve state-of-the-art performance. Benchmarks to date have demonstrated that these models perform better than comparable isotropic models---where messages are a function of the source node only. In this work we provide empirical evidence challenging this narrative: we propose an isotropic GNN, which we call Efficient Graph Convolution (EGC), that consistently outperforms comparable anisotropic models, including the popular GAT or PNA architectures by using spatially-varying adaptive filters. In addition to raising important questions for the GNN community, our work has significant real-world implications for efficiency. EGC achieves higher model accuracy, with lower memory consumption and latency, along with characteristics suited to accelerator implementation, while being a drop-in replacement for existing architectures. 
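A hedged toy of an isotropic layer with spatially varying adaptive filters in the spirit of EGC: messages depend only on the source node, and each node learns its own combination weights over a small set of basis aggregators. The dense adjacency and the two aggregators below are simplifications, not the paper's exact layer.

```python
# Hedged sketch of an isotropic GNN layer with per-node adaptive filter weights.
import torch
import torch.nn as nn

class IsotropicAdaptiveConv(nn.Module):
    def __init__(self, d_in, d_out, n_bases=2):
        super().__init__()
        self.bases = nn.ModuleList([nn.Linear(d_in, d_out) for _ in range(n_bases)])
        self.combine = nn.Linear(d_in, n_bases)   # per-node combination weights

    def forward(self, x, adj):                    # adj: dense (N, N) 0/1 matrix
        deg = adj.sum(1, keepdim=True).clamp(min=1)
        aggs = [adj @ self.bases[0](x),            # sum aggregation (source-only messages)
                (adj @ self.bases[1](x)) / deg]    # mean aggregation
        w = self.combine(x)                        # (N, n_bases): spatially varying
        return sum(w[:, i:i + 1] * aggs[i] for i in range(len(aggs)))

x = torch.rand(5, 8)
adj = (torch.rand(5, 5) > 0.5).float()
print(IsotropicAdaptiveConv(8, 16)(x, adj).shape)  # torch.Size([5, 16])
```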
As an isotropic model, it requires memory proportional to the number of vertices in the graph ($\\mathcal{O}(V)$); in contrast, anisotropic models require memory proportional to the number of edges ($\\mathcal{O}(E)$). We demonstrate that EGC outperforms existing approaches across 6 large and diverse benchmark datasets, and conclude by discussing questions that our work raises for the community going forward. Code and pretrained models for our experiments are provided at https://github.com/shyam196/egc.", "keywords": "graph neural networks;efficiency;latency reduction;memory reduction;architecture design;benchmarking;hardware-aware", "primary_area": "", "supplementary_material": "/attachment/7f8f481307bd6e5156cd01390f5b59064b078d33.zip", "author": "Shyam A. Tailor;Felix Opolka;Pietro Lio;Nicholas Donald Lane", "authorids": "~Shyam_A._Tailor1;~Felix_Opolka1;~Pietro_Lio1;~Nicholas_Donald_Lane1", "gender": "M;M;M;M", "homepage": "https://www.felixopolka.me;https://www.cst.cam.ac.uk/people/pl219;http://niclane.org;https://www.shyamt.com", "dblp": "239/4824;l/PietroLio.html;03/2663.html;256/9384", "google_scholar": ";https://scholar.google.co.uk/citations?user=3YrWf7EAAAAJ;https://scholar.google.co.uk/citations?hl=en;aJVp0DsAAAAJ", "orcid": ";0000-0002-0540-5053;0000-0002-2728-8273;", "linkedin": ";;niclane;", "or_profile": "~Felix_Opolka1;~Pietro_Lio1;~Nic_Lane2;~Shyam_Anil_Tailor1", "aff": "University of Cambridge;University of Cambridge;Samsung;Computer Laboratory", "aff_domain": "cam.ac.uk;cam.ac.uk;samsung.com;cl.cam.ac.uk", "position": "PhD student;Full Professor;Laboratory Director;PhD student", "bibtex": "@inproceedings{\ntailor2022adaptive,\ntitle={Adaptive Filters for Low-Latency and Memory-Efficient Graph Neural Networks},\nauthor={Shyam A. Tailor and Felix Opolka and Pietro Lio and Nicholas Donald Lane},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hl9ePdHO4_s}\n}", "github": "", "project": "", "reviewers": "SZEe;Cq1y;gVtv;x8K1", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "120;22;130;67", "wc_summary_review": "140;75;106;65", "wc_main_review": "621;522;315;254", "wc_review": "881;619;551;386", "wc_reply_reviewers": "0;0;333;0", "wc_reply_authors": "786;776;796;496", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.75, 43.42450345139251 ], "wc_summary_review_avg": [ 96.5, 29.312966414199707 ], "wc_main_review_avg": [ 428.0, 149.27323939675188 ], "wc_review_avg": [ 609.25, 178.30924681574984 ], "wc_reply_reviewers_avg": [ 83.25, 144.19322973010904 ], "wc_reply_authors_avg": [ 713.5, 125.77261228105267 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8892972917998875, "corr_recommendation_correctness": 0.0, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9237123539359397207&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=hl9ePdHO4_s", "email": "cam.ac.uk;cam.ac.uk;samsung.com;cl.cam.ac.uk", "author_num": 4,
"aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Cambridge;Samsung", "aff_unique_dep": ";Samsung", "aff_unique_url": "https://www.cam.ac.uk;https://www.samsung.com", "aff_unique_abbr": "Cambridge;Samsung", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United Kingdom;South Korea" }, { "title": "Learning 3D Representations of Molecular Chirality with Invariance to Bond Rotations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6759", "id": "hm2tNDdgaFK", "poster": "", "openreview": "https://openreview.net/forum?id=hm2tNDdgaFK", "slides": "https://iclr.cc/virtual/2022/poster/6759", "video": "https://iclr.cc/virtual/2022/poster/6759", "author_site": "Keir Adams, Lagnajit Pattanaik, Connor Coley", "tldr": "", "abstract": "Molecular chirality, a form of stereochemistry most often describing relative spatial arrangements of bonded neighbors around tetrahedral carbon centers, influences the set of 3D conformers accessible to the molecule without changing its 2D graph connectivity. Chirality can strongly alter (bio)chemical interactions, particularly protein-drug binding. Most 2D graph neural networks (GNNs) designed for molecular property prediction at best use atomic labels to na\u00efvely treat chirality, while E(3)-invariant 3D GNNs are invariant to chirality altogether. To enable representation learning on molecules with defined stereochemistry, we design an SE(3)-invariant model that processes torsion angles of a 3D molecular conformer. We explicitly model conformational flexibility by integrating a novel type of invariance to rotations about internal molecular bonds into the architecture, mitigating the need for multi-conformer data augmentation. We test our model on four benchmarks: contrastive learning to distinguish conformers of different stereoisomers in a learned latent space, classification of chiral centers as R/S, prediction of how enantiomers rotate circularly polarized light, and ranking enantiomers by their docking scores in an enantiosensitive protein pocket. We compare our model, Chiral InterRoto-Invariant Neural Network (ChIRo), with 2D and 3D GNNs to demonstrate that our model achieves state of the art performance when learning chiral-sensitive functions from molecular structures.", "keywords": "geometric deep learning;equivariance;molecules", "primary_area": "", "supplementary_material": "/attachment/8530a394c5fd12494fa781bfe7ea203a48598f29.zip", "author": "Keir Adams;Lagnajit Pattanaik;Connor W. Coley", "authorids": "~Keir_Adams1;~Lagnajit_Pattanaik1;~Connor_W._Coley1", "gender": "M;M;M", "homepage": ";;https://coley.mit.edu", "dblp": ";;206/6284", "google_scholar": "eh75v58AAAAJ;bVT6lpwAAAAJ;l015S80AAAAJ", "orcid": "0000-0001-9035-7959;;0000-0002-8271-8723", "linkedin": "keir-adams-584675167/;;", "or_profile": "~Keir_Adams1;~Lagnajit_Pattanaik1;~Connor_Coley1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nadams2022learning,\ntitle={Learning 3D Representations of Molecular Chirality with Invariance to Bond Rotations},\nauthor={Keir Adams and Lagnajit Pattanaik and Connor W. 
Coley},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hm2tNDdgaFK}\n}", "github": "", "project": "", "reviewers": "Viqv;pDCs;KqtQ;3jRF", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;3;4;5", "correctness": "3;3;4;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "108;55;31;66", "wc_summary_review": "35;28;78;28", "wc_main_review": "518;174;594;280", "wc_review": "661;257;703;374", "wc_reply_reviewers": "229;0;191;179", "wc_reply_authors": "1832;766;1462;1373", "reply_reviewers": "1;0;1;3", "reply_authors": "4;1;3;3", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 65.0, 27.865749586185547 ], "wc_summary_review_avg": [ 42.25, 20.837166314064877 ], "wc_main_review_avg": [ 391.5, 170.84130062721954 ], "wc_review_avg": [ 498.75, 188.44677630567205 ], "wc_reply_reviewers_avg": [ 149.75, 88.40637703242906 ], "wc_reply_authors_avg": [ 1358.25, 382.812209183563 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5443310539518174, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12423983501128658396&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=hm2tNDdgaFK", "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "DeSKO: Stability-Assured Robust Control with a Deep Stochastic Koopman Operator", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7057", "id": "hniLRD_XCA", "poster": "", "openreview": "https://openreview.net/forum?id=hniLRD_XCA", "slides": "https://iclr.cc/virtual/2022/poster/7057", "video": "https://iclr.cc/virtual/2022/poster/7057", "author_site": "Minghao Han, Jacob Euler-Rolle, Robert Katzschmann", "tldr": "", "abstract": "The Koopman operator theory linearly describes nonlinear dynamical systems in a high-dimensional functional space, allowing linear control methods to be applied to highly nonlinear systems. However, the Koopman operator does not account for any uncertainty in dynamical systems, causing it to perform poorly in real-world applications.\nTherefore, we propose a deep stochastic Koopman operator (DeSKO) model in a robust learning control framework to guarantee stability of nonlinear stochastic systems. The DeSKO model captures a dynamical system's uncertainty by inferring a distribution of observables. We use the inferred distribution to design a robust, stabilizing closed-loop controller for a dynamical system. Modeling and control experiments on several advanced control benchmarks show that our framework is more robust and scalable than state-of-the-art deep Koopman operators and reinforcement learning methods. Tested control benchmarks include a soft robotic arm, a legged robot, and a biological gene regulatory network.
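A minimal sketch of the stochastic-Koopman idea above: an encoder infers a distribution over observables, and a learned linear operator advances the sampled lifted state. The controller synthesis and stability machinery from the paper are omitted, and all dimensions are illustrative assumptions.

```python
# Hedged sketch of a deep stochastic Koopman model (prediction side only).
import torch
import torch.nn as nn

class StochasticKoopman(nn.Module):
    def __init__(self, x_dim=4, z_dim=16):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(x_dim, 64), nn.ReLU(), nn.Linear(64, 2 * z_dim))
        self.A = nn.Linear(z_dim, z_dim, bias=False)   # linear (Koopman) operator
        self.dec = nn.Linear(z_dim, x_dim)

    def forward(self, x):
        mu, log_std = self.enc(x).chunk(2, dim=-1)     # distribution over observables
        z = mu + log_std.exp() * torch.randn_like(mu)  # sample the lifted state
        return self.dec(self.A(z)), mu, log_std        # predicted next state

model = StochasticKoopman()
x_t, x_next = torch.rand(32, 4), torch.rand(32, 4)
pred, mu, log_std = model(x_t)
loss = ((pred - x_next) ** 2).mean()                   # one-step prediction loss
loss.backward()
```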
We also demonstrate that this robust control method resists previously unseen uncertainties, such as external disturbances, with a magnitude of up to five times the maximum control input. Our approach opens up new possibilities in learning control for high-dimensional nonlinear systems while robustly managing internal or external uncertainty.", "keywords": "Koopman Operator;Robust Control;Robotics;Model Predictive Control;Soft Robotics", "primary_area": "", "supplementary_material": "/attachment/8fd81957080cab01cd427ae09252f3266e67e4e5.zip", "author": "Minghao Han;Jacob Euler-Rolle;Robert K. Katzschmann", "authorids": "~Minghao_Han2;~Jacob_Euler-Rolle1;~Robert_K._Katzschmann1", "gender": "M;M;Not Specified", "homepage": "https://hithmh.github.io/MinghaoHan/;;http://srl.ethz.ch", "dblp": ";;139/3491", "google_scholar": "vSFTX1AAAAAJ;;https://scholar.google.ch/citations?hl=en", "orcid": ";;0000-0001-7143-7259", "linkedin": ";https://linkedin.com/in/jacob-euler-rolle-146398181;robertkatzschmann/", "or_profile": "~Minghao_Han2;~Jacob_Euler-Rolle1;~Robert_Kevin_Katzschmann1", "aff": "Harbin Institute of Technology;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "hit.edu.cn;ethz.ch;ethz.ch", "position": "PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nhan2022desko,\ntitle={De{SKO}: Stability-Assured Robust Control with a Deep Stochastic Koopman Operator},\nauthor={Minghao Han and Jacob Euler-Rolle and Robert K. Katzschmann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hniLRD_XCA}\n}", "github": "", "project": "", "reviewers": "GCEV;R938;sstK;25j7", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;3;2;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "70;151;70;159", "wc_summary_review": "43;22;43;30", "wc_main_review": "345;257;174;75", "wc_review": "458;430;287;264", "wc_reply_reviewers": "37;0;0;0", "wc_reply_authors": "987;443;0;21", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;0;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 112.5, 42.59401366389413 ], "wc_summary_review_avg": [ 34.5, 8.958236433584458 ], "wc_main_review_avg": [ 212.75, 99.90589321956938 ], "wc_review_avg": [ 359.75, 85.21846924229513 ], "wc_reply_reviewers_avg": [ 9.25, 16.021469970012117 ], "wc_reply_authors_avg": [ 362.75, 401.4065115316392 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.7071067811865476 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.2581988897471611, "corr_recommendation_correctness": 1.0, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=662372954932963003&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=hniLRD_XCA", "email": "hit.edu.cn;ethz.ch;ethz.ch", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Harbin Institute of Technology;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;https://www.ethz.ch", "aff_unique_abbr": "HIT;ETH Zurich", "aff_campus_unique_index": "0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;1;1",
"aff_country_unique": "China;Switzerland" }, { "id": "hopfHdHZGYe", "title": "TaCE: Time-aware Convolutional Embedding Learning for Temporal Knowledge Graph Completion", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Temporal knowledge graph completion (TKGC) is a challenging task to infer the missing component for quadruples. The key challenge lies at how to integrate time information into the embeddings of entities and relations. Recent TKGC methods tend to capture temporal patterns via linear or multilinear models, which are fast but not expressive enough. In this study, we propose a novel time-aware convolutional embedding model (TaCE) to represent the time-dependent facts in the task of TKGC. It highlights its novelty to feasibly convert timestamps as temporal convolutional filters to fully interact with entities and relations and learn temporal patterns in knowledge graphs (KGs). An extensive comparison proves that our model outperforms the state-of-the-art models on three public benchmark datasets of ICEWS14, ICEWS05-15 and GDELT. Results also demonstrate good temporal expressiveness and computation efficiency performed by our TaCE.", "keywords": "Representation learning;temporal knowledge graph completion;knowledge graph embedding;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Jin Luo;Hong Shen;YanFeng Hu;Chen Peng", "authorids": "~Jin_Luo3;shenhong@aircas.ac.cn;huyf@aircas.ac.cn;cpeng@mail.ie.ac.cn", "gender": ";;;", "homepage": "http://www.aircas.ac.cn/;;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jin_Luo3;shenhong@aircas.ac.cn;huyf@aircas.ac.cn;cpeng@mail.ie.ac.cn", "aff": "Aerospace Information Research Institute, Suzhou;;;", "aff_domain": "aircas.ac.cn;;;", "position": "Researcher;;;", "bibtex": "@misc{\nluo2022tace,\ntitle={Ta{CE}: Time-aware Convolutional Embedding Learning for Temporal Knowledge Graph Completion},\nauthor={Jin Luo and Hong Shen and YanFeng Hu and Chen Peng},\nyear={2022},\nurl={https://openreview.net/forum?id=hopfHdHZGYe}\n}", "github": "", "project": "", "reviewers": "9ecU;yWGz;7tga;AUM2", "site": "https://openreview.net/forum?id=hopfHdHZGYe", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "31;30;162;103", "wc_summary_review": "126;42;63;37", "wc_main_review": "95;201;230;356", "wc_review": "252;273;455;496", "wc_reply_reviewers": "0;97;0;263", "wc_reply_authors": "385;1049;163;1952", "reply_reviewers": "0;1;0;2", "reply_authors": "2;2;1;4", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 55.10217781540037 ], "wc_summary_review_avg": [ 67.0, 35.43303543305315 ], "wc_main_review_avg": [ 220.5, 92.97983652383994 ], "wc_review_avg": [ 369.0, 107.73810839252748 ], "wc_reply_reviewers_avg": [ 90.0, 107.44533493828385 ], "wc_reply_authors_avg": [ 887.25, 695.8176395435804 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=3362661291411952744&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Aerospace Information Research Institute", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "0", "aff_campus_unique": "Suzhou", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "You are AllSet: A Multiset Function Framework for Hypergraph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6302", "id": "hpBTIv2uy_E", "poster": "", "openreview": "https://openreview.net/forum?id=hpBTIv2uy_E", "slides": "https://iclr.cc/virtual/2022/poster/6302", "video": "https://iclr.cc/virtual/2022/poster/6302", "author_site": "Eli Chien, Chao Pan, Jianhao Peng, Olgica Milenkovic", "tldr": "", "abstract": "Hypergraphs are used to model higher-order interactions amongst agents and there exist many practically relevant instances of hypergraph datasets. To enable the efficient processing of hypergraph data, several hypergraph neural network platforms have been proposed for learning hypergraph properties and structure, with a special focus on node classification tasks. However, almost all existing methods use heuristic propagation rules and offer suboptimal performance on benchmarking datasets. We propose AllSet, a new hypergraph neural network paradigm that represents a highly general framework for (hyper)graph neural networks and for the first time implements hypergraph neural network layers as compositions of two multiset functions that can be efficiently learned for each task and each dataset. The proposed AllSet framework also for the first time integrates Deep Sets and Set Transformers with hypergraph neural networks for the purpose of learning multiset functions and therefore allows for significant modeling flexibility and high expressive power. To evaluate the performance of AllSet, we conduct the most extensive experiments to date involving ten known benchmarking datasets and three newly curated datasets that represent significant challenges for hypergraph node classification. 
The results demonstrate that our method has the unique ability to either match or outperform all other hypergraph neural networks across the tested datasets: As an example, the performance improvements over existing methods and a new method based on heterogeneous graph neural networks are close to $4\\%$ on the Yelp and Zoo datasets, and $3\\%$ on the Walmart dataset.", "keywords": "Hypergraph neural networks;multiset functions;deep sets;set transformer", "primary_area": "", "supplementary_material": "/attachment/420eafec1d6aa1ab58d9af3fa745a9e5bf43293f.zip", "author": "Eli Chien;Chao Pan;Jianhao Peng;Olgica Milenkovic", "authorids": "~Eli_Chien1;~Chao_Pan2;~Jianhao_Peng1;~Olgica_Milenkovic1", "gender": "M;;F;M", "homepage": ";;https://www.ece.illinois.edu/directory/profile/milenkov/;https://sites.google.com/view/eli-chien/home", "dblp": "06/7730-3;223/8281;m/OlgicaMilenkovic;222/3243", "google_scholar": "M3T3YPIAAAAJ;https://scholar.google.com/citations?hl=en;G4LSqL8AAAAJ;N3BuEnYAAAAJ", "orcid": "0000-0002-9275-7072;;;", "linkedin": "chao-pan-5abb7314b/;jianhao-peng-598a47120/;;", "or_profile": "~Chao_Pan2;~Jianhao_Peng1;~Olgica_Milenkovic1;~I_Chien2", "aff": "University of Illinois, Urbana Champaign;;;University of Illinois, Urbana-Champaign", "aff_domain": "illinois.edu;;;uiuc.edu", "position": "PhD student;;;PhD student", "bibtex": "@inproceedings{\nchien2022you,\ntitle={You are AllSet: A Multiset Function Framework for Hypergraph Neural Networks},\nauthor={Eli Chien and Chao Pan and Jianhao Peng and Olgica Milenkovic},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hpBTIv2uy_E}\n}", "github": "", "project": "", "reviewers": "LMVx;Bdsv;unoj;6wpr", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "76;75;67;13", "wc_summary_review": "44;43;74;9", "wc_main_review": "387;164;242;113", "wc_review": "507;282;383;135", "wc_reply_reviewers": "276;0;0;0", "wc_reply_authors": "1845;755;698;26", "reply_reviewers": "2;0;0;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 57.75, 26.070817018267764 ], "wc_summary_review_avg": [ 42.5, 23.005434140654682 ], "wc_main_review_avg": [ 226.5, 103.42751084696953 ], "wc_review_avg": [ 326.75, 136.40449955921542 ], "wc_reply_reviewers_avg": [ 69.0, 119.51150572225254 ], "wc_reply_authors_avg": [ 831.0, 651.8600309882482 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 1.0, "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2657795859999531247&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=hpBTIv2uy_E", "email": "illinois.edu;;;uiuc.edu", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Illinois", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://illinois.edu", "aff_unique_abbr": "UIUC;UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", 
"aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "hq7vLjZTJPk", "title": "A Communication-Efficient Distributed Gradient Clipping Algorithm for Training Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In distributed training of deep neural networks or Federated Learning (FL), people usually run Stochastic Gradient Descent (SGD) or its variants on each machine and communicate with other machines periodically. However, SGD might converge slowly in training some deep neural networks (e.g., RNN, LSTM) because of the exploding gradient issue. Gradient clipping is usually employed to address this issue in the single machine setting, but exploring this technique in the FL setting is still in its infancy: it remains mysterious whether the gradient clipping scheme can take advantage of multiple machines to enjoy parallel speedup in the FL setting. The main technical difficulty lies at dealing with nonconvex loss function, non-Lipschitz continuous gradient, and skipping communication rounds simultaneously. In this paper, we explore a relaxed-smoothness assumption of the loss landscape which LSTM was shown to satisfy in previous works, and design a communication-efficient gradient clipping algorithm. This algorithm can be run on multiple machines, where each machine employs a gradient clipping scheme and communicate with other machines after multiple steps of gradient-based updates. Our algorithm is proved to have $O\\left(\\frac{1}{N\\epsilon^4}\\right)$ iteration complexity for finding an $\\epsilon$-stationary point, where $N$ is the number of machines. This indicates that our algorithm enjoys linear speedup. Our experiments on several benchmark datasets demonstrate that our algorithm indeed exhibits fast convergence speed in practice and validate our theory.", "keywords": "Distributed Training;Federated Learning;Gradient Clipping;Communication-Efficient;Optimization", "primary_area": "", "supplementary_material": "/attachment/285f5f932d48cecf0cce4cdc6a2c3d30f682abfd.zip", "author": "Chunyang Liao;Zhenxun Zhuang;Mingrui Liu", "authorids": "~Chunyang_Liao1;~Zhenxun_Zhuang1;~Mingrui_Liu2", "gender": "M;M;", "homepage": "https://liaochunyang.github.io;http://cs-people.bu.edu/zxzhuang;https://mingrliu.github.io", "dblp": ";234/8537;", "google_scholar": "eAPnyCsAAAAJ;;KFoEnFQAAAAJ", "orcid": "0000-0001-8359-1747;;", "linkedin": ";zhenxunzhuang/;mingrui-liu-447a2aab/", "or_profile": "~Chunyang_Liao1;~Zhenxun_Zhuang1;~Mingrui_Liu2", "aff": "Texas A&M;Boston University;George Mason University", "aff_domain": "tamu.edu;bu.edu;gmu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nliao2022a,\ntitle={A Communication-Efficient Distributed Gradient Clipping Algorithm for Training Deep Neural Networks},\nauthor={Chunyang Liao and Zhenxun Zhuang and Mingrui Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=hq7vLjZTJPk}\n}", "github": "", "project": "", "reviewers": "nQu7;63cQ;oVjB;qwWt", "site": "https://openreview.net/forum?id=hq7vLjZTJPk", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;4", "correctness": "1;4;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "145;26;92;84", "wc_summary_review": "78;8;45;40", "wc_main_review": "1272;241;258;193", "wc_review": "1495;275;395;317", "wc_reply_reviewers": "1888;0;0;0", "wc_reply_authors": "3512;632;657;147", "reply_reviewers": "4;0;0;0", "reply_authors": "5;2;2;1", "recommendation_avg": [ 4.75, 
1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 86.75, 42.18634257671551 ], "wc_summary_review_avg": [ 42.75, 24.81305100143874 ], "wc_main_review_avg": [ 491.0, 451.5401421800724 ], "wc_review_avg": [ 620.5, 506.72551741549387 ], "wc_reply_reviewers_avg": [ 472.0, 817.5279811725101 ], "wc_reply_authors_avg": [ 1237.0, 1329.1115453565212 ], "reply_reviewers_avg": [ 1.0, 1.7320508075688772 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.9365858115816939, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5333604100052232790&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2", "aff_unique_norm": "Texas A&M University;Boston University;George Mason University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tamu.edu;https://www.bu.edu;https://www.gmu.edu", "aff_unique_abbr": "TAMU;BU;GMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "hqkN6lE1fFQ", "title": "Kernel Deformed Exponential Families for Sparse Continuous Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Attention mechanisms take an expectation of a data representation with respect to probability weights. This creates summary statistics that focus on important features. Recently, Martins et al. (2020, 2021) proposed continuous attention mechanisms, focusing on unimodal attention densities from the exponential and deformed exponential families: the latter has sparse support. Farinhas et al. (2021) extended this to use Gaussian mixture attention densities, which are a flexible class with dense support. In this paper, we extend this to two general flexible classes: kernel exponential families and our new sparse counterpart kernel deformed exponential families. Theoretically, we show new existence results for both kernel exponential and deformed exponential families, and that the deformed case has similar approximation capabilities to kernel exponential families. 
Experiments show that kernel deformed exponential families can attend to non-overlapping intervals of time.", "keywords": "kernel methods;attention mechanism;theory;exponential families;deformed exponential families", "primary_area": "", "supplementary_material": "", "author": "Alexander Moreno;Supriya Nagesh;Zhenke Wu;Walter Dempsey;James Matthew Rehg", "authorids": "~Alexander_Moreno1;~Supriya_Nagesh1;~Zhenke_Wu1;wdem@umich.edu;~James_Matthew_Rehg1", "gender": "M;F;M;;", "homepage": ";https://supriyanagesh94.github.io/;https://zhenkewu.com;;", "dblp": "161/6588;;259/3143;;", "google_scholar": "zoqP2-IAAAAJ;i5qnTjAAAAAJ;3ffCNrEAAAAJ;;", "orcid": ";;0000-0001-7582-669X;;", "linkedin": ";;;;", "or_profile": "~Alexander_Moreno1;~Supriya_Nagesh1;~Zhenke_Wu1;wdem@umich.edu;~James_Matthew_Rehg1", "aff": "Luminous Computing;Georgia Institute of Technology;;;", "aff_domain": "lmns.com;gatech.edu;;;", "position": "Researcher;PhD student;;;", "bibtex": "@misc{\nmoreno2022kernel,\ntitle={Kernel Deformed Exponential Families for Sparse Continuous Attention},\nauthor={Alexander Moreno and Supriya Nagesh and Zhenke Wu and Walter Dempsey and James Matthew Rehg},\nyear={2022},\nurl={https://openreview.net/forum?id=hqkN6lE1fFQ}\n}", "github": "", "project": "", "reviewers": "eE87;wuA4;w8oo", "site": "https://openreview.net/forum?id=hqkN6lE1fFQ", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "2;3;2", "correctness": "4;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "55;227;83", "wc_summary_review": "27;96;27", "wc_main_review": "166;287;164", "wc_review": "248;610;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 121.66666666666667, 75.35397947170556 ], "wc_summary_review_avg": [ 50.0, 32.526911934581186 ], "wc_main_review_avg": [ 205.66666666666666, 57.517147201701704 ], "wc_review_avg": [ 377.3333333333333, 164.86223204711123 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.18898223650461363, "corr_recommendation_correctness": -0.18898223650461363, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5287278774361865869&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Luminous Computing;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": ";https://www.gatech.edu", "aff_unique_abbr": ";Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";United States" }, { "title": "Learning Towards The Largest Margins", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5901", "id": "hqkhcFHOeKD", "poster": "", "openreview": "https://openreview.net/forum?id=hqkhcFHOeKD", "slides": "https://iclr.cc/virtual/2022/poster/5901", "video": "https://iclr.cc/virtual/2022/poster/5901", "author_site": "Xiong Zhou, Xianming Liu, Deming Zhai, Junjun Jiang, Xin Gao, Xiangyang Ji", "tldr": "", "abstract": "One of the main 
challenges for feature representation in deep learning-based classification is the design of appropriate loss functions that exhibit strong discriminative power. The classical softmax loss does not explicitly encourage discriminative learning of features. A popular direction of research is to incorporate margins in well-established losses in order to enforce extra intra-class compactness and inter-class separability, which, however, were developed through heuristic means, as opposed to rigorous mathematical principles. In this work, we attempt to address this limitation by formulating the principled optimization objective as learning towards the largest margins. Specifically, we firstly propose to employ the class margin as the measure of inter-class separability, and the sample margin as the measure of intra-class compactness. Accordingly, to encourage discriminative representation of features, the loss function should promote the largest possible margins for both classes and samples. Furthermore, we derive a generalized margin softmax loss to draw general conclusions for the existing margin-based losses. Not only does this principled framework offer new perspectives to understand and interpret existing margin-based losses, but it also provides new insights that can guide the design of new tools, including \\textit{sample margin regularization} and \\textit{largest margin softmax loss} for class balanced cases, and \\textit{zero centroid regularization} for class imbalanced cases. Experimental results demonstrate the effectiveness of our strategy for multiple tasks including visual classification, imbalanced classification, person re-identification, and face verification.", "keywords": "loss function design;margin-based loss;classification", "primary_area": "", "supplementary_material": "/attachment/27d3497792e0f4eff3040757232400ab271583da.zip", "author": "Xiong Zhou;Xianming Liu;Deming Zhai;Junjun Jiang;Xin Gao;Xiangyang Ji", "authorids": "~Xiong_Zhou3;~Xianming_Liu5;~Deming_Zhai2;~Junjun_Jiang2;~Xin_Gao1;~Xiangyang_Ji1", "gender": "M;M;F;M;M;", "homepage": "https://hitcszx.github.io/;http://homepage.hit.edu.cn/xmliu;;http://homepage.hit.edu.cn/jiangjunjun;http://cemse.kaust.edu.sa/sfb;", "dblp": ";89/58201.html;69/8937;https://dblp.uni-trier.de/pers/hd/j/Jiang:Junjun;56/2203-1.html;", "google_scholar": "BMGootgAAAAJ;;;WNH2_rgAAAAJ;https://scholar.google.ca/citations?user=wqdK8ugAAAAJ;", "orcid": "0000-0002-0856-6696;0000-0002-8857-1785;;0000-0002-5694-505X;0000-0002-7108-3574;", "linkedin": ";;;;;", "or_profile": "~Xiong_Zhou3;~Xianming_Liu5;~Deming_Zhai2;~Junjun_Jiang2;~Xin_Gao1;~Xiangyang_Ji1", "aff": "Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology;King Abdullah University of Science and Technology;", "aff_domain": "hit.edu.cn;hit.edu.cn;hit.edu.cn;hit.edu.cn;kaust.edu.sa;", "position": "PhD student;Full Professor;Associate Professor;Full Professor;Full Professor;", "bibtex": "@inproceedings{\nzhou2022learning,\ntitle={Learning Towards The Largest Margins},\nauthor={Xiong Zhou and Xianming Liu and Deming Zhai and Junjun Jiang and Xin Gao and Xiangyang Ji},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hqkhcFHOeKD}\n}", "github": "", "project": "", "reviewers": "3YiD;pU1u;pGzf;sx1i", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": 
"3;2;3;3", "wc_summary_paper": "88;96;106;56", "wc_summary_review": "25;95;197;25", "wc_main_review": "160;595;164;136", "wc_review": "273;786;467;217", "wc_reply_reviewers": "11;0;0;0", "wc_reply_authors": "354;1583;364;401", "reply_reviewers": "1;0;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.5, 18.728320800328042 ], "wc_summary_review_avg": [ 85.5, 70.43259188756296 ], "wc_main_review_avg": [ 263.75, 191.54682847805128 ], "wc_review_avg": [ 435.75, 222.48075759489853 ], "wc_reply_reviewers_avg": [ 2.75, 4.763139720814412 ], "wc_reply_authors_avg": [ 675.5, 524.2377800197158 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8954887171565551941&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=hqkhcFHOeKD", "email": "hit.edu.cn;hit.edu.cn;hit.edu.cn;hit.edu.cn;kaust.edu.sa;", "author_num": 6, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Harbin Institute of Technology;King Abdullah University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;https://www.kast.kau.edu.sa", "aff_unique_abbr": "HIT;KAUST", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;Saudi Arabia" }, { "id": "ht61oVsaya", "title": "DESTA: A Framework for Safe Reinforcement Learning with Markov Games of Intervention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Exploring in an unknown system can place an agent in dangerous situations,\nexposing to potentially catastrophic hazards. Many current approaches for tackling\nsafe learning in reinforcement learning (RL) lead to a trade-off between safe\nexploration and fulfilling the task. Though these methods possibly incur fewer\nsafety violations they often also lead to reduced task performance. In this paper, we\ntake the first step in introducing a generation of RL solvers that learn to minimise\nsafety violations while maximising the task reward to the extend that can be\ntolerated by safe policies. Our approach uses a new two-player framework for safe\nRL called DESTA. The core of DESTA is a novel game between two RL agents:\nSafety Agent that is delegated the task of minimising safety violations and Task\nAgent whose goal is to maximise the reward set by the environment task. Safety\nAgent can selectively take control of the system at any given point to prevent\nsafety violations while Task Agent is free to execute its actions at all other states.\nThis framework enables Safety Agent to learn to take actions that minimise future\nsafety violations (during and after training) by performing safe actions at certain\nstates while Task Agent performs actions that maximise the task performance\neverywhere else. 
We demonstrate DESTA\u2019s ability to tackle challenging tasks and\ncompare against state-of-the-art RL methods in Safety Gym Benchmarks which\nsimulate real-world physical systems and OpenAI\u2019s Lunar Lander.\n", "keywords": "Safe reinforcement learning;safety;Markov games;stochastic games", "primary_area": "", "supplementary_material": "", "author": "David Henry Mguni;Joel Jennings;Taher Jafferjee;Aivar Sootla;Changmin Yu;Usman Islam;Ziyan Wang;Yaodong Yang;Jun Wang", "authorids": "~David_Henry_Mguni1;~Joel_Jennings1;~Taher_Jafferjee1;~Aivar_Sootla1;~Changmin_Yu1;usman.islam.personal@gmail.com;~Ziyan_Wang3;~Yaodong_Yang1;~Jun_Wang2", "gender": "M;;Not Specified;M;M;;M;M;M", "homepage": ";;https://atlashugs.github.io/;;https://changmin-yu.github.io;;https://ziyan-wang98.github.io/;https://www.yangyaodong.com;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "217/2369;;267/1551;66/9184;266/9733;;;170/1496-1;w/JunWang12", "google_scholar": "K-_yzBsAAAAJ;;;https://scholar.google.co.uk/citations?hl=en;;;1Yu8JFIAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;;;;;;0000-0001-8132-5613;", "linkedin": ";;;;;;;yaodong-yang;", "or_profile": "~David_Henry_Mguni1;~Joel_Jennings1;~Taher_Jafferjee1;~Aivar_Sootla1;~Changmin_Yu1;usman.islam.personal@gmail.com;~Ziyan_Wang3;~Yaodong_Yang1;~Jun_Wang2", "aff": "Queen Mary University, London;;Huawei Technologies Ltd.;Huawei R&D UK;University College London;;King's College London;King's College London;University College London", "aff_domain": "qmul.ac.uk;;huawei.com;huawei.com;ucl.ac.uk;;kcl.ac.uk;kcl.ac.uk;ucl.ac.uk", "position": "Lecturer;;Researcher;Research scientist;PhD student;;PhD student;Assistant Professor;Professor", "bibtex": "@misc{\nmguni2022desta,\ntitle={{DESTA}: A Framework for Safe Reinforcement Learning with Markov Games of Intervention},\nauthor={David Henry Mguni and Joel Jennings and Taher Jafferjee and Aivar Sootla and Changmin Yu and Usman Islam and Ziyan Wang and Yaodong Yang and Jun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=ht61oVsaya}\n}", "github": "", "project": "", "reviewers": "mFYr;vp9k;7tNg;BFBF", "site": "https://openreview.net/forum?id=ht61oVsaya", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "2;3;2;2", "correctness": "2;2;2;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "83;88;52;28", "wc_summary_review": "36;60;41;35", "wc_main_review": "416;475;519;174", "wc_review": "535;623;612;237", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 2.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 62.75, 24.34517405975977 ], "wc_summary_review_avg": [ 43.0, 10.074720839804943 ], "wc_main_review_avg": [ 396.0, 133.2797809121849 ], "wc_review_avg": [ 501.75, 156.56847543487163 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11071017814996608833&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "aff_unique_index": "0;1;1;2;3;3;2", "aff_unique_norm": "Queen Mary University of London;Huawei;University College London;King's College London", "aff_unique_dep": ";Huawei Technologies;;", "aff_unique_url": "https://www.qmul.ac.uk;https://www.huawei.com;https://www.ucl.ac.uk;https://www.kcl.ac.uk", "aff_unique_abbr": "QMUL;Huawei;UCL;KCL", "aff_campus_unique_index": "0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "United Kingdom;China" }, { "title": "FALCON: Fast Visual Concept Learning by Integrating Images, Linguistic descriptions, and Conceptual Relations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6120", "id": "htWIlvDcY8", "poster": "", "openreview": "https://openreview.net/forum?id=htWIlvDcY8", "slides": "https://iclr.cc/virtual/2022/poster/6120", "video": "https://iclr.cc/virtual/2022/poster/6120", "author_site": "Lingjie Mei, Jiayuan Mao, Ziqi Wang, Chuang Gan, Joshua B Tenenbaum", "tldr": "", "abstract": "We present a meta-learning framework for learning new visual concepts quickly, from just one or a few examples, guided by multiple naturally occurring data streams: simultaneously looking at images, reading sentences that describe the objects in the scene, and interpreting supplemental sentences that relate the novel concept with other concepts. The learned concepts support downstream applications, such as answering questions by reasoning about unseen images. Our model, namely FALCON, represents individual visual concepts, such as colors and shapes, as axis-aligned boxes in a high-dimensional space (the ``box embedding space''). Given an input image and its paired sentence, our model first resolves the referential expression in the sentence and associates the novel concept with particular objects in the scene. Next, our model interprets supplemental sentences to relate the novel concept with other known concepts, such as ``X has property Y'' or ``X is a kind of Y''. Finally, it infers an optimal box embedding for the novel concept that jointly 1) maximizes the likelihood of the observed instances in the image, and 2) satisfies the relationships between the novel concepts and the known ones. We demonstrate the effectiveness of our model on both synthetic and real-world datasets.", "keywords": "Neuro-Symbolic Reasoning;Concept Learning;Meta-Learning", "primary_area": "", "supplementary_material": "", "author": "Lingjie Mei;Jiayuan Mao;Ziqi Wang;Chuang Gan;Joshua B. 
Tenenbaum", "authorids": "~Lingjie_Mei1;~Jiayuan_Mao1;~Ziqi_Wang2;~Chuang_Gan1;~Joshua_B._Tenenbaum1", "gender": "M;F;;M;", "homepage": "https://people.csail.mit.edu/jerrymei;http://jiayuanm.com;https://www.wzq016.github.io;http://people.csail.mit.edu/ganchuang/;", "dblp": "255/7024;200/8283;38/8097-3;139/6993;t/JoshuaBTenenbaum", "google_scholar": "https://scholar.google.com/citations?hl=en;-xaOIZIAAAAJ;xYRZiZkAAAAJ;PTeSCbIAAAAJ;", "orcid": "0009-0002-2022-1778;0000-0003-4798-3748;;;", "linkedin": ";;;;", "or_profile": "~Lingjie_Mei1;~Jiayuan_Mao1;~Ziqi_Wang2;~Chuang_Gan1;~Joshua_B._Tenenbaum1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Google;MIT-IBM Watson AI Lab;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;google.com;ibm.com;mit.edu", "position": "MS student;PhD student;Intern;PhD student;Professor", "bibtex": "@inproceedings{\nmei2022falcon,\ntitle={{FALCON}: Fast Visual Concept Learning by Integrating Images, Linguistic descriptions, and Conceptual Relations},\nauthor={Lingjie Mei and Jiayuan Mao and Ziqi Wang and Chuang Gan and Joshua B. Tenenbaum},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=htWIlvDcY8}\n}", "github": "", "project": "", "reviewers": "2P5Z;98FU;6cyF;W3YC", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "72;38;129;94", "wc_summary_review": "48;24;77;53", "wc_main_review": "459;91;202;290", "wc_review": "579;153;408;437", "wc_reply_reviewers": "0;24;31;34", "wc_reply_authors": "986;290;296;822", "reply_reviewers": "0;1;1;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 83.25, 33.10117067416196 ], "wc_summary_review_avg": [ 50.5, 18.82153022471871 ], "wc_main_review_avg": [ 260.5, 134.5594664079789 ], "wc_review_avg": [ 394.25, 153.58283595506367 ], "wc_reply_reviewers_avg": [ 22.25, 13.348689074212494 ], "wc_reply_authors_avg": [ 598.5, 310.9610104177049 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9622504486493761, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15109588222605519545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=htWIlvDcY8", "email": "mit.edu;mit.edu;google.com;ibm.com;mit.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://web.mit.edu;https://www.google.com", "aff_unique_abbr": "MIT;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "huXTh4GF2YD", "title": "Distance-Based Background Class Regularization for Open-Set Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "In open-set recognition (OSR), classifiers should be able to reject unknown-class samples while maintaining robust closed-set classification performance. 
To solve the OSR problem based on pre-trained Softmax classifiers, previous studies investigated offline analyses, e.g., distance-based sample rejection, which can limit the feature space of known-class data items. Since such classifiers are trained solely based on known-class samples, one can use background class regularization (BCR), which employs background-class data as surrogates of unknown-class data during the training phase, to enhance OSR performance. However, previous regularization methods have limited OSR performance, since they categorized known-class data into a single group and then aimed to distinguish them from anomalies. In this paper, we propose a novel distance-based BCR method suitable for OSR, which limits the feature space of known-class data in a class-wise manner and then forces background-class samples to lie far away from the limited feature space. Instead of conventional Softmax classifiers, we use distance-based classifiers, which utilize the principle of linear discriminant analysis. Based on the distance measure used for classification, we design a novel regularization loss function that can contrast known-class and background-class samples while maintaining robust closed-set classification performance. Through our extensive experiments, we show that the proposed method provides robust OSR results with a simple inference process.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wonwoo Cho;Jaegul Choo", "authorids": "~Wonwoo_Cho1;~Jaegul_Choo1", "gender": "M;M", "homepage": ";https://sites.google.com/site/jaegulchoo/", "dblp": "152/2536;07/2074", "google_scholar": "https://scholar.google.co.kr/citations?user=rlW4HXAAAAAJ;GHJYsLEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Wonwoo_Cho1;~Jaegul_Choo1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Associate Professor", "bibtex": "@misc{\ncho2022distancebased,\ntitle={Distance-Based Background Class Regularization for Open-Set Recognition},\nauthor={Wonwoo Cho and Jaegul Choo},\nyear={2022},\nurl={https://openreview.net/forum?id=huXTh4GF2YD}\n}", "github": "", "project": "", "reviewers": "Mkdh;ujMG;rop6;1wRX", "site": "https://openreview.net/forum?id=huXTh4GF2YD", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;5;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "62;167;105;271", "wc_summary_review": "19;16;54;72", "wc_main_review": "184;116;754;714", "wc_review": "265;299;913;1057", "wc_reply_reviewers": "19;390;0;166", "wc_reply_authors": "425;2173;1067;779", "reply_reviewers": "1;2;0;1", "reply_authors": "1;4;2;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 151.25, 78.56963471978217 ], "wc_summary_review_avg": [ 40.25, 23.64714570513744 ], "wc_main_review_avg": [ 442.0, 293.32916663707346 ], "wc_review_avg": [ 633.5, 355.37128471501467 ], "wc_reply_reviewers_avg": [ 143.75, 156.01342089704974 ], "wc_reply_authors_avg": [ 1111.0, 653.9495393377076 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence":
0.28867513459481287, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MkGxTMuPpooJ:scholar.google.com/&scioq=Distance-Based+Background+Class+Regularization+for+Open-Set+Recognition&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "hw5Kug2Go3-", "title": "Prototypical Variational Autoencoders", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Variational autoencoders are unsupervised generative models that implement latent space regularization towards a known distribution, enabling stochastic synthesis from straightforward sampling procedures. Many works propose various regularization approaches, but most struggle to balance proper regularization against good reconstruction quality. This paper proposes distributing the regularization through the latent space using prototypical anchored clusters, each with an optimal position in the latent space and following a known distribution. Such a scheme enables obtaining an appropriate number of clusters with solid regularization for better reconstruction quality and improved synthesis control. We experiment with our method using widespread exploratory benchmarks and report that regularization anchored on prototypes' coordinates or cluster centroids neutralizes the adverse effects regularization terms often have on autoencoder reconstruction quality, matching non-regularized autoencoders' performance.
We also report appealing results for interpreting data representatives with simple prototype synthesis and controlling the synthesis of samples with prototype-like characteristics by decoding white noise around prototype anchors.", "keywords": "Variational Autoencoders;Latent Space Regularization", "primary_area": "", "supplementary_material": "", "author": "Dario Augusto Borges Oliveira;Laura Elena Cue La Rosa", "authorids": "~Dario_Augusto_Borges_Oliveira1;~Laura_Elena_Cue_La_Rosa1", "gender": "M;F", "homepage": ";", "dblp": "10/1429;", "google_scholar": "https://scholar.google.com.br/citations?user=Nswjaf0AAAAJ;0YxpRNoAAAAJ", "orcid": "0000-0002-0674-5332;0000-0002-6284-9494", "linkedin": ";https://br.linkedin.com/in/lauracue", "or_profile": "~Dario_Augusto_Borges_Oliveira1;~Laura_Elena_Cue_La_Rosa1", "aff": "Technical University Munich;Pontificia Universidade Catolica, Rio de Janeiro, Brazil", "aff_domain": "tum.de;puc-rio.br", "position": "Guest Professor;PhD student", "bibtex": "@misc{\noliveira2022prototypical,\ntitle={Prototypical Variational Autoencoders},\nauthor={Dario Augusto Borges Oliveira and Laura Elena Cue La Rosa},\nyear={2022},\nurl={https://openreview.net/forum?id=hw5Kug2Go3-}\n}", "github": "", "project": "", "reviewers": "39QN;Ze5G;kZw7", "site": "https://openreview.net/forum?id=hw5Kug2Go3-", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "4;3;2", "technical_novelty": "3;3;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "54;44;116", "wc_summary_review": "26;77;80", "wc_main_review": "143;409;175", "wc_review": "223;530;371", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 71.33333333333333, 31.8468557666565 ], "wc_summary_review_avg": [ 61.0, 24.779023386727733 ], "wc_main_review_avg": [ 242.33333333333334, 118.57299673853046 ], "wc_review_avg": [ 374.6666666666667, 125.35903991690782 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4HUVteKZFFAJ:scholar.google.com/&scioq=Prototypical+Variational+Autoencoders&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Technical University of Munich;Pontifical Catholic University of Rio de Janeiro", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;http://www.puc-rio.br/", "aff_unique_abbr": "TUM;PUC-Rio", "aff_campus_unique_index": "1", "aff_campus_unique": ";Rio de Janeiro", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;Brazil" }, { "id": "hxitw01k_Ql", "title": "How memory architecture affects learning in a simple POMDP: the two-hypothesis testing problem", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement learning is generally difficult for partially observable Markov decision processes (POMDPs), which arise when the agent's observations are partial or noisy.
To seek good performance in POMDPs, one strategy is to endow the agent with a finite memory, whose update is governed by the policy. However, policy optimization is non-convex in that case and can lead to poor training performance under random initialization. The performance can be empirically improved by constraining the memory architecture, thereby sacrificing optimality to facilitate training. Here we study this trade-off in a two-hypothesis testing problem, akin to the two-armed bandit problem. We compare two extreme cases: (i) the random access memory where any transitions between $M$ memory states are allowed and (ii) a fixed memory where the agent can access its last $m$ actions and rewards. For (i), the probability $q$ of playing the worst arm is known to be exponentially small in $M$ for the optimal policy. Our main result is to show that similar performance can be reached for (ii) as well, despite the simplicity of the memory architecture: using a conjecture on Gray-ordered binary necklaces, we find policies for which $q$ is exponentially small in $2^m$, i.e. $q\\sim\\alpha^{2^m}$ with $\\alpha < 1$. In addition, we observe empirically that training from random initialization leads to very poor results for (i), and significantly better results for (ii) thanks to the constraints on the memory architecture.", "keywords": "POMDP;memory architecture;optimization;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Mario Geiger;Christophe Eloy;Matthieu Wyart", "authorids": "~Mario_Geiger1;~Christophe_Eloy1;~Matthieu_Wyart2", "gender": "M;;M", "homepage": ";https://www.irphe.univ-mrs.fr/~eloy/;http://pcsl.epfl.ch/", "dblp": "206/7093;;26/11007", "google_scholar": ";lJ8IK8kAAAAJ;https://scholar.google.ch/citations?user=1TttZYYAAAAJ", "orcid": "0000-0001-5433-0900;0000-0003-4114-7263;0000-0003-0644-0990", "linkedin": ";;", "or_profile": "~Mario_Geiger1;~Christophe_Eloy1;~Matthieu_Wyart2", "aff": "Massachusetts Institute of Technology;Ecole Centrale Marseille;Swiss Federal Institute of Technology Lausanne", "aff_domain": "mit.edu;centrale-marseille.fr;epfl.ch", "position": "Postdoc;Professor;Associate Professor", "bibtex": "@misc{\ngeiger2022how,\ntitle={How memory architecture affects learning in a simple {POMDP}: the two-hypothesis testing problem},\nauthor={Mario Geiger and Christophe Eloy and Matthieu Wyart},\nyear={2022},\nurl={https://openreview.net/forum?id=hxitw01k_Ql}\n}", "github": "", "project": "", "reviewers": "BH7m;7ywF;42t7;PEeX", "site": "https://openreview.net/forum?id=hxitw01k_Ql", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;3;3;2", "correctness": "4;2;2;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "46;275;70;20", "wc_summary_review": "84;38;54;6", "wc_main_review": "195;505;341;111", "wc_review": "325;818;465;137", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 102.75, 101.00835361493623 ], "wc_summary_review_avg": [ 45.5, 28.155816450602174 ], "wc_main_review_avg": [ 288.0, 149.8966310495336 ], "wc_review_avg": [ 436.25, 249.2422265588237 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [
0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0XZTU-reHVwJ:scholar.google.com/&scioq=How+memory+architecture+affects+learning+in+a+simple+POMDP:+the+two-hypothesis+testing+problem&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Ecole Centrale Marseille;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.ecm.fr;https://www.epfl.ch", "aff_unique_abbr": "MIT;ECM;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;France;Switzerland" }, { "id": "hxznlKsIIKk", "title": "Leveraging Attribute Conditioning for Abstractive Multi Document Summarization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Abstractive multi document summarization has evolved as a task through the basic sequence to sequence approaches to transformer and graph based techniques. Each of these approaches has primarily focused on the issues of multi document information synthesis and attention based approaches to extract salient information. A challenge that arises with multi document summarization which is not prevalent in single document summarization is the need to effectively summarize multiple documents that might have conflicting polarity, sentiment or information about a given topic. In this paper we leverage attribute conditioning in order to address the problem of conflicting information in multi document summarization and show strong gains in performance over the base abstractive multi document summarization methods. 
", "keywords": "multi document summarization", "primary_area": "", "supplementary_material": "", "author": "Aiswarya Sankar;Ankit Chadha", "authorids": "~Aiswarya_Sankar1;ankitrchadha@stanford.edu", "gender": "F;", "homepage": "http://www.aiswaryas.com/;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "sankaraiswarya/;", "or_profile": "~Aiswarya_Sankar1;ankitrchadha@stanford.edu", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsankar2022leveraging,\ntitle={Leveraging Attribute Conditioning for Abstractive Multi Document Summarization},\nauthor={Aiswarya Sankar and Ankit Chadha},\nyear={2022},\nurl={https://openreview.net/forum?id=hxznlKsIIKk}\n}", "github": "", "project": "", "reviewers": "wAhM;hu6A;VP4z;ZqF3;zf6H", "site": "https://openreview.net/forum?id=hxznlKsIIKk", "pdf_size": 0, "recommendation": "1;1;3;3;3", "confidence": "5;4;4;3;4", "correctness": "2;2;2;3;3", "technical_novelty": "2;1;2;4;2", "empirical_novelty": "2;2;2;3;2", "wc_summary_paper": "56;64;143;33;101", "wc_summary_review": "17;10;82;13;16", "wc_main_review": "70;297;319;48;246", "wc_review": "143;371;544;94;363", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 2.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.9797958971132712 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 79.4, 38.59844556455609 ], "wc_summary_review_avg": [ 27.6, 27.310071402323356 ], "wc_main_review_avg": [ 196.0, 114.5512985522207 ], "wc_review_avg": [ 303.0, 164.6730093245399 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:q_wFyPytYRsJ:scholar.google.com/&scioq=Leveraging+Attribute+Conditioning+for+Abstractive+Multi+Document+Summarization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "hyuacPZQFb0", "title": "A Systematic Evaluation of Domain Adaptation Algorithms On Time Series Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Unsupervised domain adaptation methods aim to generalize well on unlabeled test data that may have a different (shifted) distribution from the training data. Such methods are typically developed on image data, and their application to time series data is less explored. Existing works on time series domain adaptation suffer from inconsistencies in evaluation schemes, datasets, and base neural network architectures. Moreover, labeled target data are usually employed for model selection, which violates the fundamental assumption of unsupervised domain adaptation. To address these issues, we propose AdaTime, a standard framework to systematically and fairly evaluate different domain adaptation methods on time series data. Specifically, we standardize the base neural network architectures and benchmarking datasets, while also exploring more realistic model selection approaches that can work with no labeled data or few labeled samples. 
Our evaluation includes adaptations of state-of-the-art visual domain adaptation methods to time series data in addition to recent methods specifically developed for time series data. We conduct extensive experiments to evaluate 10 state-of-the-art methods on 3 representative datasets spanning 15 cross-domain scenarios. Our results suggest that with careful selection of hyper-parameters, visual domain adaptation methods are competitive with methods proposed for time series domain adaptation. In addition, we find that model selection plays a key role and different selection strategies can significantly affect performance. Our work unveils practical insights for applying domain adaptation methods on time series data and builds a solid foundation for future works in the field. ", "keywords": "domain adaptation;time series data;benchmarking", "primary_area": "", "supplementary_material": "", "author": "Mohamed Ragab;Emadeldeen Eldele;Wee Ling Tan;Chuan-Sheng Foo;Zhenghua Chen;Min Wu;Chee Kwoh;Xiaoli Li", "authorids": "~Mohamed_Ragab1;~Emadeldeen_Eldele1;~Wee_Ling_Tan1;~Chuan-Sheng_Foo1;~Zhenghua_Chen2;~Min_Wu2;~Chee_Kwoh1;~Xiaoli_Li1", "gender": "M;M;M;M;M;M;M;M", "homepage": "http://mohamed-ragab.netlify.app;https://emadeldeen24.github.io/;;http://ai.stanford.edu/~csfoo;https://zhenghuantu.github.io/;https://sites.google.com/site/wumincf/;http://www.ntu.edu.sg/home/asckkwoh/;https://personal.ntu.edu.sg/xlli/", "dblp": "237/3528-2.html;295/9208.html;;73/1823;03/7457.html;16/0-8;32/228;l/XiaoliLi.html", "google_scholar": "nNeT_NUAAAAJ;2LdeHIYAAAAJ;;AgbeqGkAAAAJ;https://scholar.google.com.sg/citations?user=WUgu3nwAAAAJ;https://scholar.google.com.sg/citations?user=Hji1uWQAAAAJ;https://scholar.google.com.tw/citations?user=jVn0wDMAAAAJ;E3yQKloAAAAJ", "orcid": "0000-0002-2138-4395;0000-0002-9282-0991;;0000-0002-4748-5792;0000-0002-1719-0328;0000-0003-0977-3600;;0000-0002-0762-6562", "linkedin": "mohamedragab1/;emadeldeen-eldele-phd-1a291a301/;weeling-tan/;;;;;li-xiaoli-41027ba/", "or_profile": "~Mohamed_Ragab1;~Emadeldeen_Eldele1;~Wee_Ling_Tan1;~Chuan-Sheng_Foo1;~Zhenghua_Chen2;~Min_Wu2;~Chee_Kwoh1;~Xiaoli_Li1", "aff": "Nanyang Technological University;Nanyang Technological University;University of Oxford;Institute for Infocomm Research, A*STAR;I2R, A*STAR;Institute for Infocomm Research (I2R), A*STAR;National Technological University;A*STAR", "aff_domain": "ntu.edu.sg;ntu.edu.sg;oxford.ac.uk;i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;ntu.edu;a-star.edu.sg", "position": "PhD student;PhD student;PhD student;Scientist;Researcher;Researcher;Associate Professor;Principal Researcher", "bibtex": "@misc{\nragab2022a,\ntitle={A Systematic Evaluation of Domain Adaptation Algorithms On Time Series Data},\nauthor={Mohamed Ragab and Emadeldeen Eldele and Wee Ling Tan and Chuan-Sheng Foo and Zhenghua Chen and Min Wu and Chee Kwoh and Xiaoli Li},\nyear={2022},\nurl={https://openreview.net/forum?id=hyuacPZQFb0}\n}", "github": "", "project": "", "reviewers": "X8V1;S7S5;uoXZ;f7Xp;oysD", "site": "https://openreview.net/forum?id=hyuacPZQFb0", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "3;5;3;4;4", "correctness": "3;4;3;2;3", "technical_novelty": "1;2;3;1;3", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "20;70;20;48;75", "wc_summary_review": "41;54;18;64;27", "wc_main_review": "222;36;269;396;226", "wc_review": "283;160;307;508;328", "wc_reply_reviewers": "0;0;0;398;0", "wc_reply_authors": "1277;124;373;2537;650", "reply_reviewers": "0;0;0;2;0", "reply_authors": "2;1;1;4;1", 
"recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.0, 0.8944271909999159 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 46.6, 23.54230235129946 ], "wc_summary_review_avg": [ 40.8, 16.868906307167634 ], "wc_main_review_avg": [ 229.8, 115.5792368896767 ], "wc_review_avg": [ 317.2, 111.80948081446404 ], "wc_reply_reviewers_avg": [ 79.6, 159.2 ], "wc_reply_authors_avg": [ 992.2, 862.7628642912257 ], "reply_reviewers_avg": [ 0.4, 0.8 ], "reply_authors_avg": [ 1.8, 1.1661903789690602 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.5345224838248488, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10126888840272706356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;3;2;4;5", "aff_unique_norm": "Nanyang Technological University;University of Oxford;Institute for Infocomm Research;A*STAR;National Technological University;Agency for Science, Technology and Research", "aff_unique_dep": ";;;Institute for Infocomm Research;;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.ox.ac.uk;https://www.i2r.a-star.edu.sg;https://www.a-star.edu.sg;https://www.ntu.edu;https://www.a-star.edu.sg", "aff_unique_abbr": "NTU;Oxford;I2R;A*STAR;NTU;A*STAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;2;0", "aff_country_unique": "Singapore;United Kingdom;United States" }, { "title": "GNN is a Counter? Revisiting GNN for Question Answering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6194", "id": "hzmQ4wOnSb", "poster": "", "openreview": "https://openreview.net/forum?id=hzmQ4wOnSb", "slides": "https://iclr.cc/virtual/2022/poster/6194", "video": "https://iclr.cc/virtual/2022/poster/6194", "author_site": "Kuan Wang, Yuyu Zhang, Diyi Yang, Le Song, Tao Qin", "tldr": "", "abstract": "Question Answering (QA) has been a long-standing research topic in AI and NLP fields, and a wealth of studies has been conducted to attempt to equip QA systems with human-level reasoning capability. To approximate the complicated human reasoning process, state-of-the-art QA systems commonly use pre-trained language models (LMs) to access knowledge encoded in LMs together with elaborately designed modules based on Graph Neural Networks (GNNs) to perform reasoning over knowledge graphs (KGs). However, many problems remain open regarding the reasoning functionality of these GNN-based modules. Can these GNN-based modules really perform a complex reasoning process? Are they under- or over-complicated for QA? To open the black box of GNN and investigate these problems, we dissect state-of-the-art GNN modules for QA and analyze their reasoning capability. We discover that even a very simple graph neural counter can outperform all the existing GNN modules on CommonsenseQA and OpenBookQA, two popular QA benchmark datasets which heavily rely on knowledge-aware reasoning. Our work reveals that existing knowledge-aware GNN modules may only carry out some simple reasoning such as counting. 
It remains a challenging open problem to build comprehensive reasoning modules for knowledge-powered QA.", "keywords": "GNN;Question Answering;QA;Reasoning;ML", "primary_area": "", "supplementary_material": "", "author": "Kuan Wang;Yuyu Zhang;Diyi Yang;Le Song;Tao Qin", "authorids": "~Kuan_Wang1;~Yuyu_Zhang1;~Diyi_Yang2;~Le_Song1;~Tao_Qin1", "gender": "M;F;M;M;M", "homepage": "http://kuanwang.me;https://cs.stanford.edu/~diyiy/;http://www.cc.gatech.edu/~lsong;https://www.microsoft.com/en-us/research/people/taoqin/;", "dblp": ";70/11145;94/3481;14/6841;", "google_scholar": "c1-_-dUAAAAJ;j9jhYqQAAAAJ;Xl4E0CsAAAAJ;Bl4SRU0AAAAJ;TIC2ujUAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Kuan_Wang1;~Diyi_Yang2;~Le_Song1;~Tao_Qin1;~Yuyu_Zhang2", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;College of Computing, Georgia Institute of Technology;Microsoft Research Asia;ByteDance", "aff_domain": "gatech.edu;gatech.edu;cc.gatech.edu;microsoft.com;bytedance.com", "position": "PhD student;Assistant Professor;Associate Professor;Principal Researcher;Research Scientist", "bibtex": "@inproceedings{\nwang2022gnn,\ntitle={{GNN} is a Counter? Revisiting {GNN} for Question Answering},\nauthor={Kuan Wang and Yuyu Zhang and Diyi Yang and Le Song and Tao Qin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=hzmQ4wOnSb}\n}", "github": "", "project": "", "reviewers": "pyaj;zYDH;nabQ;oUEP", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;5;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;4;3;4", "wc_summary_paper": "478;54;179;113", "wc_summary_review": "4;26;119;30", "wc_main_review": "391;509;470;302", "wc_review": "873;589;768;445", "wc_reply_reviewers": "54;98;0;0", "wc_reply_authors": "1170;1829;782;432", "reply_reviewers": "1;1;0;0", "reply_authors": "3;5;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 206.0, 163.14564045661777 ], "wc_summary_review_avg": [ 44.75, 43.996448720322874 ], "wc_main_review_avg": [ 418.0, 79.32527970325727 ], "wc_review_avg": [ 668.75, 164.31125189712358 ], "wc_reply_reviewers_avg": [ 38.0, 41.060930335295616 ], "wc_reply_authors_avg": [ 1053.25, 518.3981939590453 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.058025885318565944, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2205554280204185493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=hzmQ4wOnSb", "email": "gatech.edu;gatech.edu;cc.gatech.edu;microsoft.com;bytedance.com", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Georgia Institute of Technology;Microsoft;ByteDance", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.gatech.edu;https://www.microsoft.com/en-us/research/group/asia;https://www.bytedance.com", "aff_unique_abbr": "Georgia Tech;MSR Asia;ByteDance", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Atlanta;Asia", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "United States;China" }, { "id": "i--G7mhB19P", 
"title": "Depth Without the Magic: Inductive Bias of Natural Gradient Descent", "track": "main", "status": "Reject", "tldr": "", "abstract": "In gradient descent, changing how we parametrize the model can lead to drastically different optimization trajectories, giving rise a surprising range of meaningful inductive biases: identifying sparse classifiers or reconstructing low-rank matrices without explicit regularization. This implicit regularization has been hypothesised to be a contributing factor to good generalization in deep learning. However, natural gradient descent is approximately invariant to reparameterization, it always follows the same trajectory and finds the same optimum. The question naturally arises: What happens if we eliminate the role of parameterization, which solution will be found, what new properties occur? We characterize the behaviour of natural gradient flow in deep linear networks for separable classification under logistic loss and deep matrix factorization. Some of our findings extend to nonlinear neural networks with sufficient but finite over-parametrization. We demonstrate that there exist learning problems where natural gradient descent fails to generalize, while gradient descent with the right architecture peforms well.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anna Kerekes;Anna M\u00e9sz\u00e1ros;Ferenc Husz\u00e1r", "authorids": "~Anna_Kerekes1;~Anna_M\u00e9sz\u00e1ros1;~Ferenc_Husz\u00e1r1", "gender": "F;;M", "homepage": ";;", "dblp": ";;http://dblp.uni-trier.de/pers/hd/h/Huszar:Ferenc", "google_scholar": ";;https://scholar.google.co.uk/citations?user=koQCVT4AAAAJ", "orcid": ";;", "linkedin": "anna-kerekes-5a84651b8/;anna-m%C3%A9sz%C3%A1ros-2ba244222/;", "or_profile": "~Anna_Kerekes1;~Anna_M\u00e9sz\u00e1ros1;~Ferenc_Huszar1", "aff": "University of Cambridge;Eotvos Lorand University;University of Cambridge", "aff_domain": "cam.ac.uk;elte.hu;cam.ac.uk", "position": "Undergrad student;Undergrad student;Associate Professor", "bibtex": "@misc{\nkerekes2022depth,\ntitle={Depth Without the Magic: Inductive Bias of Natural Gradient Descent},\nauthor={Anna Kerekes and Anna M{\\'e}sz{\\'a}ros and Ferenc Husz{\\'a}r},\nyear={2022},\nurl={https://openreview.net/forum?id=i--G7mhB19P}\n}", "github": "", "project": "", "reviewers": "74k1;VPSX;2Yhk;CBDn;gWo5", "site": "https://openreview.net/forum?id=i--G7mhB19P", "pdf_size": 0, "recommendation": "5;5;5;5;6", "confidence": "3;3;2;3;3", "correctness": "3;4;3;3;4", "technical_novelty": "3;2;3;2;3", "empirical_novelty": "0;1;2;2;3", "wc_summary_paper": "43;95;79;39;73", "wc_summary_review": "44;56;62;23;15", "wc_main_review": "511;477;571;175;227", "wc_review": "598;628;712;237;315", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "824;563;744;297;563", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 5.2, 0.39999999999999997 ], "confidence_avg": [ 2.8, 0.39999999999999997 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.6, 1.019803902718557 ], "wc_summary_paper_avg": [ 65.8, 21.525798475317938 ], "wc_summary_review_avg": [ 40.0, 18.275666882497067 ], "wc_main_review_avg": [ 392.2, 159.8379179043571 ], "wc_review_avg": [ 498.0, 186.71154222489835 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 598.2, 181.90261130616022 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": 0.25000000000000006, "corr_recommendation_correctness": 0.6123724356957947, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17672648873398315095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Cambridge;E\u00f6tv\u00f6s Lor\u00e1nd University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.elte.hu", "aff_unique_abbr": "Cambridge;ELTE", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;Hungary" }, { "id": "i1ogYhs0ByT", "title": "Transformer with a Mixture of Gaussian Keys", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-head attention is a driving force behind state-of-the-art transformers which achieve remarkable performance across a variety of natural language processing (NLP) and computer vision tasks. It has been observed that for many applications, those attention heads learn redundant embedding, and most of them can be removed without degrading the performance of the model. Inspired by this observation, we propose Transformer with a Mixture of Gaussian Keys (Transformer-MGK), a novel transformer architecture that replaces redundant heads in transformers with a mixture of keys at each head. These mixtures of keys follow a Gaussian mixture model and allow each attention head to focus on different parts of the input sequence efficiently. Compared to its conventional transformer counterpart, Transformer-MGK accelerates training and inference, has fewer parameters, and requires fewer FLOPs to compute while achieving comparable or better accuracy across tasks. Transformer-MGK can also be easily extended to use with linear attention. We empirically demonstrate the advantage of Transformer-MGK in a range of practical applications including language modeling and tasks that involve very long sequences. On the Wikitext-103 and Long Range Arena benchmark, Transformer-MGKs with 4 heads attain comparable or better performance to the baseline transformers with 8 heads.", "keywords": "transformer;gaussian mixture model;attention heads;attention keys", "primary_area": "", "supplementary_material": "/attachment/2b513a6f9fea0d430e0d5683060b49da49352b4c.zip", "author": "Tam Minh Nguyen;Tan Minh Nguyen;Dung Duy Le;Nguyen Duy Khuong;Viet Anh TRAN;Richard Baraniuk;Nhat Ho;Stanley Osher", "authorids": "~Tam_Minh_Nguyen1;~Tan_Minh_Nguyen1;~Dung_Duy_Le1;~Nguyen_Duy_Khuong1;~Viet_Anh_TRAN1;~Richard_Baraniuk1;~Nhat_Ho1;~Stanley_Osher1", "gender": "F;M;M;M;;M;M;M", "homepage": ";https://tanmnguyen89.github.io/;https://khuongnd.github.io/;;http://richb.rice.edu/;https://nhatptnk8912.github.io/;https://www.math.ucla.edu/~sjo/;https://andrew-dungle.github.io/", "dblp": "251/1464;255/4725;;;32/2804;203/4479;;186/1477", "google_scholar": ";OizOh88AAAAJ;vAOT46YAAAAJ;IU78uEsAAAAJ;https://scholar.google.com.tw/citations?user=N-BBA20AAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;;", "linkedin": "tam-nguyen-6a3935132/;;;https://fr.linkedin.com/in/va-tran;richard-baraniuk;nhat-pham-minh-ho-267b8164/;;", "or_profile": "~Tam_Minh_Nguyen1;~Tan_Minh_Nguyen1;~Nguyen_Duy_Khuong1;~Viet_Anh_TRAN1;~Richard_Baraniuk1;~Nhat_Ho1;~Stanley_Osher1;~Dung_D._Le2", "aff": "FPT Software;University of California, Los Angeles;FPT Software Ltd. 
- FPT Corporation;Deezer;William Marsh Rice University;University of Texas, Austin;University of California, Los Angeles;VinUniversity", "aff_domain": "fsoft.com.vn;ucla.edu;fpt-software.com;deezer.com;rice.edu;utexas.edu;ucla.edu;vinuni.edu.vn", "position": "FPT AI Residency;Postdoc;Researcher;Researcher;C. Sidney Burrus Professor;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nnguyen2022transformer,\ntitle={Transformer with a Mixture of Gaussian Keys},\nauthor={Tam Minh Nguyen and Tan Minh Nguyen and Dung Duy Le and Nguyen Duy Khuong and Viet Anh TRAN and Richard Baraniuk and Nhat Ho and Stanley Osher},\nyear={2022},\nurl={https://openreview.net/forum?id=i1ogYhs0ByT}\n}", "github": "", "project": "", "reviewers": "r2tu;fA4U;qyjK;gyah;EtfP", "site": "https://openreview.net/forum?id=i1ogYhs0ByT", "pdf_size": 0, "recommendation": "5;6;6;8;8", "confidence": "5;4;3;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "1;3;2;3;2", "wc_summary_paper": "144;70;156;86;104", "wc_summary_review": "31;46;53;35;62", "wc_main_review": "325;204;394;239;337", "wc_review": "500;320;603;360;503", "wc_reply_reviewers": "0;21;40;66;123", "wc_reply_authors": "2679;717;2532;841;2509", "reply_reviewers": "0;1;1;1;1", "reply_authors": "6;3;7;3;5", "recommendation_avg": [ 6.6, 1.2 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 112.0, 33.05752561823102 ], "wc_summary_review_avg": [ 45.4, 11.394735626595292 ], "wc_main_review_avg": [ 299.8, 68.94461545327525 ], "wc_review_avg": [ 457.2, 103.40096711346561 ], "wc_reply_reviewers_avg": [ 50.0, 42.487645263064415 ], "wc_reply_authors_avg": [ 1855.6, 881.8457007889759 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 4.8, 1.5999999999999999 ], "replies_avg": [ 41, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.2635231383473649, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4125467049580768418&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;2;3;4;1;5", "aff_unique_norm": "FPT Corporation;University of California, Los Angeles;Deezer;Rice University;University of Texas at Austin;VinUniversity", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.fpt-software.com;https://www.ucla.edu;https://www.deezer.com;https://www.rice.edu;https://www.utexas.edu;https://vinuni.edu.vn", "aff_unique_abbr": "FPT;UCLA;Deezer;Rice;UT Austin;VinUni", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Los Angeles;Austin", "aff_country_unique_index": "0;1;0;2;1;1;1;0", "aff_country_unique": "Vietnam;United States;France" }, { "id": "i2baoZMYZ3", "title": "Continuous Control with Action Quantization from Demonstrations", "track": "main", "status": "Reject", "tldr": "", "abstract": "In Reinforcement Learning (RL), discrete actions, as opposed to continuous actions, result in a less complex exploration problem and the immediate derivation of the maximum of the action-value function, which is central to dynamic programming-based methods. In this paper, we propose a novel method, Action Quantization from Demonstrations (AQuaDem), to learn a discretization of continuous action spaces by leveraging the priors of demonstrations. 
This dramatically reduces the exploration problem, since the actions faced by the agent are not only finite in number but also plausible in light of the demonstrator\u2019s behavior. By discretizing the action space, we can apply any discrete-action deep RL algorithm to the continuous control problem. We evaluate the proposed method on three different setups: RL with demonstrations, RL with play data --demonstrations of a human playing in an environment but not solving any specific task-- and Imitation Learning. For all three setups, we only consider human data, which is more challenging than synthetic data. We find that AQuaDem consistently outperforms state-of-the-art continuous control methods, in terms of both performance and sample efficiency.", "keywords": "Deep Reinforcement Learning;Action Discretization;Learning from Demonstrations", "primary_area": "", "supplementary_material": "/attachment/44d55a8f6ce7a946bd72705abb747990e7c62409.zip", "author": "Robert Dadashi;Leonard Hussenot;Damien Vincent;Sertan Girgin;Anton Raichuk;Matthieu Geist;Olivier Pietquin", "authorids": "~Robert_Dadashi2;~Leonard_Hussenot1;~Damien_Vincent1;sertan@google.com;~Anton_Raichuk1;~Matthieu_Geist1;~Olivier_Pietquin1", "gender": ";;;;M;M;M", "homepage": ";;;;;;http://www.cristal.univ-lille.fr/~pietquin/", "dblp": ";241/9657;43/9268;;;38/6508;58/6269", "google_scholar": ";nTdWO9MAAAAJ;;;;ectPLEUAAAAJ;8K8-LdwAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;opietquin/", "or_profile": "~Robert_Dadashi2;~Leonard_Hussenot1;~Damien_Vincent1;sertan@google.com;~Anton_Raichuk1;~Matthieu_Geist1;~Olivier_Pietquin1", "aff": ";Google;Google;;Google;Google;Google Brain", "aff_domain": ";google.com;google.com;;google.com;google.com;google.com", "position": ";PhD student;PhD student;;SWE;Researcher;Staff Research Scientist", "bibtex": "@misc{\ndadashi2022continuous,\ntitle={Continuous Control with Action Quantization from Demonstrations},\nauthor={Robert Dadashi and Leonard Hussenot and Damien Vincent and Sertan Girgin and Anton Raichuk and Matthieu Geist and Olivier Pietquin},\nyear={2022},\nurl={https://openreview.net/forum?id=i2baoZMYZ3}\n}", "github": "", "project": "", "reviewers": "hp36;zgiZ;teiU;MjVt", "site": "https://openreview.net/forum?id=i2baoZMYZ3", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;4;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "56;97;115;244", "wc_summary_review": "67;65;43;150", "wc_main_review": "161;589;509;1057", "wc_review": "284;751;667;1451", "wc_reply_reviewers": "27;0;0;1032", "wc_reply_authors": "380;1323;816;2361", "reply_reviewers": "1;0;0;4", "reply_authors": "3;3;2;6", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 128.0, 70.30291601349121 ], "wc_summary_review_avg": [ 81.25, 40.794454279963105 ], "wc_main_review_avg": [ 579.0, 319.4557872382343 ], "wc_review_avg": [ 788.25, 421.1872356802851 ], "wc_reply_reviewers_avg": [ 264.75, 443.10911466590255 ], "wc_reply_authors_avg": [ 1220.0, 738.4622536054229 ], "reply_reviewers_avg": [ 1.25, 1.6393596310755 ], "reply_authors_avg": [ 3.5, 1.5 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 38, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=18354958382752460493&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Hierarchical Variational Memory for Few-shot Learning Across Domains", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6912", "id": "i3RI65sR7N", "poster": "", "openreview": "https://openreview.net/forum?id=i3RI65sR7N", "slides": "https://iclr.cc/virtual/2022/poster/6912", "video": "https://iclr.cc/virtual/2022/poster/6912", "author_site": "Yingjun Du, Xiantong Zhen, Ling Shao, Cees G Snoek", "tldr": "", "abstract": "Neural memory enables fast adaptation to new tasks with just a few training samples. Existing memory models store features only from the single last layer, which does not generalize well in presence of a domain shift between training and test distributions. Rather than relying on a flat memory, we propose a hierarchical alternative that stores features at different semantic levels. We introduce a hierarchical prototype model, where each level of the prototype fetches corresponding information from the hierarchical memory. The model is endowed with the ability to flexibly rely on features at different semantic levels if the domain shift circumstances so demand. We meta-learn the model by a newly derived hierarchical variational inference framework, where hierarchical memory and prototypes are jointly optimized. To explore and exploit the importance of different semantic levels, we further propose to learn the weights associated with the prototype at each level in a data-driven way, which enables the model to adaptively choose the most generalizable features. We conduct thorough ablation studies to demonstrate the effectiveness of each component in our model. The new state-of-the-art performance on cross-domain and competitive performance on traditional few-shot classification further substantiates the benefit of hierarchical variational memory.", "keywords": "Meta-learning;Variational hierarchical memory;Variational hierarchical prototype;Cross-domain few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Yingjun Du;Xiantong Zhen;Ling Shao;Cees G. M. Snoek", "authorids": "~Yingjun_Du1;~Xiantong_Zhen1;~Ling_Shao1;~Cees_G._M._Snoek1", "gender": "M;M;M;M", "homepage": "https://yingjundu.github.io/;;;http://www.ceessnoek.info", "dblp": "263/6794;78/10651;;s/CeesSnoek", "google_scholar": "oAeW6rAAAAAJ;https://scholar.google.ca/citations?user=DnBb3e0AAAAJ;z84rLjoAAAAJ;https://scholar.google.nl/citations?user=0uKdbscAAAAJ", "orcid": ";;;0000-0001-9092-1556", "linkedin": "%E8%8B%B1%E5%86%9B-%E6%9D%9C-a938a0174/;;;cgmsnoek/", "or_profile": "~Yingjun_Du1;~Xiantong_Zhen1;~Ling_Shao1;~Cees_Snoek1", "aff": "University of Amsterdam;Inception Institute of Artificial Intelligence;Terminus Group;University of Amsterdam", "aff_domain": "uva.nl;inceptioniai.org;terminusgroup.com;uva.nl", "position": "PhD student;Senior Scientist;Chief Scientist;Full Professor", "bibtex": "@inproceedings{\ndu2022hierarchical,\ntitle={Hierarchical Variational Memory for Few-shot Learning Across Domains},\nauthor={Yingjun Du and Xiantong Zhen and Ling Shao and Cees G. M. 
Snoek},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=i3RI65sR7N}\n}", "github": "", "project": "", "reviewers": "YEXZ;2Ajk;g1Bf", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;0", "wc_summary_paper": "108;100;99", "wc_summary_review": "57;41;74", "wc_main_review": "275;174;361", "wc_review": "440;315;534", "wc_reply_reviewers": "0;0;29", "wc_reply_authors": "203;500;597", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 102.33333333333333, 4.0276819911981905 ], "wc_summary_review_avg": [ 57.333333333333336, 13.474255287605157 ], "wc_main_review_avg": [ 270.0, 76.42425443971742 ], "wc_review_avg": [ 429.6666666666667, 89.70445300479173 ], "wc_reply_reviewers_avg": [ 9.666666666666666, 13.67073110293992 ], "wc_reply_authors_avg": [ 433.3333333333333, 167.6153002827871 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1702336741267321422&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=i3RI65sR7N", "email": "uva.nl;inceptioniai.org;terminusgroup.com;uva.nl", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Amsterdam;Inception Institute of Artificial Intelligence;Terminus Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uva.nl;https://www.inceptioniai.org;", "aff_unique_abbr": "UvA;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Netherlands;United Arab Emirates;" }, { "id": "i3abvoMoeCZ", "title": "Exploring Covariate and Concept Shift for Detection and Confidence Calibration of Out-of-Distribution Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Moving beyond testing on in-distribution data, works on Out-of-Distribution (OOD) detection have recently increased in popularity. A recent attempt to categorize OOD data introduces the concept of near and far OOD detection. Specifically, prior works define characteristics of OOD data in terms of detection difficulty. We propose to characterize the spectrum of OOD data using two types of distribution shifts: covariate shift and concept shift, where covariate shift corresponds to change in style, e.g., noise, and concept shift indicates change in semantics. This characterization reveals that sensitivity to each type of shift is important to the detection and model calibration of OOD data. Consequently, we investigate score functions that capture sensitivity to each type of dataset shift and methods that improve them. To this end, we theoretically derive two score functions for OOD detection, the covariate shift score and concept shift score, based on the decomposition of KL-divergence for both scores, and propose a geometrically-inspired method (Geometric ODIN) to improve OOD detection under both shifts with only in-distribution data. 
Additionally, the proposed method naturally leads to an expressive post-hoc calibration function, which yields state-of-the-art calibration performance on both in-distribution and out-of-distribution data. We are the first to propose a method that works well across both OOD detection and calibration, and under different types of shifts. Specifically, we improve the previous state-of-the-art OOD detection by a relative 7% AUROC on CIFAR100 vs. SVHN and achieve the best calibration performance of 0.084 Expected Calibration Error on the corrupted CIFAR100C dataset. ", "keywords": "out-of-distribution detection;calibration;distribution shift", "primary_area": "", "supplementary_material": "", "author": "Junjiao Tian;Yen-Chang Hsu;Yilin Shen;Hongxia Jin;Zsolt Kira", "authorids": "~Junjiao_Tian1;~Yen-Chang_Hsu1;~Yilin_Shen1;~Hongxia_Jin1;~Zsolt_Kira1", "gender": "M;M;M;;M", "homepage": ";;;;https://faculty.cc.gatech.edu/~zk15", "dblp": "246/3115.htm;172/1140;30/383;;36/4127", "google_scholar": "iHZD850AAAAJ;7QWAiigAAAAJ;9PSFMzAAAAAJ;;2a5XgNAAAAAJ", "orcid": ";;;;0000-0002-2626-2004", "linkedin": ";yenchanghsu/;;;", "or_profile": "~Junjiao_Tian1;~Yen-Chang_Hsu1;~Yilin_Shen1;~Hongxia_Jin1;~Zsolt_Kira1", "aff": "Georgia Institute of Technology;Samsung Research America;Samsung Research America;;Georgia Tech Research Institute", "aff_domain": "gatech.edu;samsung.com;gmail.com;;gtri.gatech.edu", "position": "PhD student;Research Scientist;Principal Researcher;;Senior Research Scientist", "bibtex": "@misc{\ntian2022exploring,\ntitle={Exploring Covariate and Concept Shift for Detection and Confidence Calibration of Out-of-Distribution Data},\nauthor={Junjiao Tian and Yen-Chang Hsu and Yilin Shen and Hongxia Jin and Zsolt Kira},\nyear={2022},\nurl={https://openreview.net/forum?id=i3abvoMoeCZ}\n}", "github": "", "project": "", "reviewers": "kPST;kVK3;rWjW;5ox7", "site": "https://openreview.net/forum?id=i3abvoMoeCZ", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;5;2", "correctness": "2;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "81;154;80;76", "wc_summary_review": "55;47;44;52", "wc_main_review": "691;446;583;248", "wc_review": "827;647;707;376", "wc_reply_reviewers": "456;124;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "1;1;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 97.75, 32.529794035622174 ], "wc_summary_review_avg": [ 49.5, 4.272001872658765 ], "wc_main_review_avg": [ 492.0, 165.47960599421307 ], "wc_review_avg": [ 639.25, 165.2276838184207 ], "wc_reply_reviewers_avg": [ 145.0, 186.555621732501 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4736842105263159, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3250632644378197792&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Georgia Institute of Technology;Samsung;Georgia Tech Research Institute", "aff_unique_dep": ";Samsung Research America;", "aff_unique_url": 
"https://www.gatech.edu;https://www.samsung.com/us/careers/research/;https://www.gtri.gatech.edu", "aff_unique_abbr": "Georgia Tech;SRA;GTRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "i4qKmHdq6y8", "title": "Learning to Abstain in the Presence of Uninformative Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning and decision making in domains with naturally high noise-to-signal ratios \u2013 such as Finance or Public Health \u2013 can be challenging and yet extremely important. In this paper, we study a problem of learning on datasets in which a significant proportion of samples does not contain useful information. To analyze this setting, we introduce a noisy generative process with a clear distinction between uninformative/not learnable/purely random data and a structured/informative component. This dichotomy is present both during the training and in the inference phase. We propose a novel approach to learn under these conditions via a loss inspired by the selective learning theory. By minimizing the loss, our method is guaranteed to make a near-optimal decision by simultaneously distinguishing structured data from the non-learnable and making predictions, even in a highly imbalanced setting. We build upon the strength of our theoretical guarantees by describing an iterative algorithm, which jointly optimizes both a predictor and a selector, and evaluate its empirical performance under a variety of conditions.", "keywords": "Selective learning;Uninformative data;PAC Learning;Sample Complexity", "primary_area": "", "supplementary_material": "", "author": "Yikai Zhang;Songzhu Zheng;Pengxiang Wu;Yuriy Nevmyvaka;Chao Chen", "authorids": "~Yikai_Zhang1;~Songzhu_Zheng1;~Pengxiang_Wu1;~Yuriy_Nevmyvaka1;~Chao_Chen1", "gender": ";M;;;M", "homepage": ";;https://pxiangwu.github.io/;;https://chaochen.github.io/", "dblp": ";226/4925;156/1749;92/1859;66/3019-12", "google_scholar": ";vq0hpV4AAAAJ;MXLs7GcAAAAJ;https://scholar.google.com/citations?hl=en;J-iIIFAAAAAJ", "orcid": ";;0000-0002-6929-5877;;0000-0003-1703-6483", "linkedin": ";;;;", "or_profile": "~Yikai_Zhang1;~Songzhu_Zheng1;~Pengxiang_Wu1;~Yuriy_Nevmyvaka1;~Chao_Chen1", "aff": ";State University of New York at Stony Brook;Snap Inc.;Morgan Stanley;State University of New York, Stony Brook", "aff_domain": ";stonybrook.edu;snapchat.com;morganstanley.com;stonybrook.edu", "position": ";PhD student;Researcher;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nzhang2022learning,\ntitle={Learning to Abstain in the Presence of Uninformative Data},\nauthor={Yikai Zhang and Songzhu Zheng and Pengxiang Wu and Yuriy Nevmyvaka and Chao Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=i4qKmHdq6y8}\n}", "github": "", "project": "", "reviewers": "8Gu4;zieu;QnTY;vqyY", "site": "https://openreview.net/forum?id=i4qKmHdq6y8", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;2;2;3", "correctness": "4;4;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "484;40;143;50", "wc_summary_review": "161;39;96;15", "wc_main_review": "1234;173;568;140", "wc_review": "1879;252;807;205", "wc_reply_reviewers": "0;0;185;0", "wc_reply_authors": "2519;255;1260;244", "reply_reviewers": "0;0;2;0", "reply_authors": "5;1;4;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.25, 
0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 179.25, 180.47350913638266 ], "wc_summary_review_avg": [ 77.75, 56.35323859371349 ], "wc_main_review_avg": [ 528.75, 440.62533687930386 ], "wc_review_avg": [ 785.75, 674.1303193745257 ], "wc_reply_reviewers_avg": [ 46.25, 80.10734985006057 ], "wc_reply_authors_avg": [ 1069.5, 933.0328236455564 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.75, 1.7853571071357126 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17412848403644486702&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "State University of New York at Stony Brook;Snap Inc.;Morgan Stanley;State University of New York", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stonybrook.edu;https://www.snapinc.com;https://www.morganstanley.com;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;Snap;Morgan Stanley;SUNY Stony Brook", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "i7-BqPD1e5", "title": "Adversarial Attack across Datasets", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "It has been observed that Deep Neural Networks (DNNs) are vulnerable to transfer attacks in the query-free black-box setting. However, all previous studies on transfer attacks assume that the white-box surrogate models possessed by the attacker and the black-box victim models are trained on the same dataset, which means the attacker implicitly knows the label set and the input size of the victim model. In practice, this assumption is usually unrealistic, as the attacker may not know the dataset used by the victim model, and further, the attacker needs to attack any randomly encountered images that may not come from the same dataset. Therefore, in this paper, we define a new Generalized Transferable Attack (GTA) problem where we assume the attacker has a set of surrogate models trained on different datasets (with different label sets and image sizes), and none of these datasets is equal to the dataset used by the victim model. We then propose a novel method called Image Classification Eraser (ICE) to erase classification information for any encountered images from arbitrary datasets. Extensive experiments on Cifar-10, Cifar-100, and TieredImageNet demonstrate the effectiveness of the proposed ICE on the GTA problem. Further, we show that existing transfer attack methods can be modified to tackle the GTA problem, but with significantly worse performance compared with ICE. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/4382903db544beab7bf54106e620cdce5d286a40.zip", "author": "Yunxiao Qin;Yuanhao Xiong;Jinfeng Yi;Cho-Jui Hsieh", "authorids": "~Yunxiao_Qin1;~Yuanhao_Xiong1;~Jinfeng_Yi1;~Cho-Jui_Hsieh1", "gender": "M;M;M;M", "homepage": "https://qyxqyx.github.io/homepage;https://xyh97.github.io/;http://jinfengyi.net/;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": "230/4075;232/1248;117/4898;14/2770", "google_scholar": "EMEy3gwAAAAJ;DVKxiMkAAAAJ;lZxRZ84AAAAJ;Wy89g4IAAAAJ", "orcid": "0000-0003-3209-020X;;;", "linkedin": ";;https://www.linkedin.com/nhome/?trk=;", "or_profile": "~Yunxiao_Qin1;~Yuanhao_Xiong1;~Jinfeng_Yi1;~Cho-Jui_Hsieh1", "aff": "Communication University of China;University of California, Los Angeles;JD AI Research;University of California, Los Angeles", "aff_domain": "cuc.edu.cn;cs.ucla.edu;jd.com;ucla.edu", "position": "Lecturer;PhD student;Senior Director;Assistant Professor", "bibtex": "@misc{\nqin2022adversarial,\ntitle={Adversarial Attack across Datasets},\nauthor={Yunxiao Qin and Yuanhao Xiong and Jinfeng Yi and Cho-Jui Hsieh},\nyear={2022},\nurl={https://openreview.net/forum?id=i7-BqPD1e5}\n}", "github": "", "project": "", "reviewers": "65Ly;G5E5;822A;o3fm;ggdd", "site": "https://openreview.net/forum?id=i7-BqPD1e5", "pdf_size": 0, "recommendation": "5;5;5;6;6", "confidence": "4;4;4;5;4", "correctness": "3;3;3;3;4", "technical_novelty": "3;2;3;3;3", "empirical_novelty": "3;3;2;3;2", "wc_summary_paper": "61;63;81;105;90", "wc_summary_review": "53;46;26;99;27", "wc_main_review": "406;446;265;901;356", "wc_review": "520;555;372;1105;473", "wc_reply_reviewers": "0;188;18;336;54", "wc_reply_authors": "852;781;725;2448;1218", "reply_reviewers": "0;1;1;1;1", "reply_authors": "4;6;5;9;4", "recommendation_avg": [ 5.4, 0.48989794855663565 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 80.0, 16.589153082662175 ], "wc_summary_review_avg": [ 50.2, 26.573671180324332 ], "wc_main_review_avg": [ 474.8, 221.51243757405587 ], "wc_review_avg": [ 605.0, 257.4637838609539 ], "wc_reply_reviewers_avg": [ 119.2, 126.83595704688793 ], "wc_reply_authors_avg": [ 1204.8, 644.9810539853089 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 5.6, 1.8547236990991407 ], "replies_avg": [ 38, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6123724356957945, "corr_recommendation_correctness": 0.6123724356957945, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16653147624451568838&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Communication University of China;University of California, Los Angeles;JD", "aff_unique_dep": ";;JD AI Research", "aff_unique_url": "http://www.cuc.edu.cn/;https://www.ucla.edu;https://www.jd.com", "aff_unique_abbr": "CUC;UCLA;JD AI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "i7FNvHnPvPc", "title": "Boosting the Transferability of Adversarial Attacks with Reverse Adversarial Perturbation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks (DNNs) have shown to be vulnerable to adversarial examples, which can produce erroneous predictions by 
injecting imperceptible perturbations. In this work, we study the transferability of adversarial examples, which is of significant due to its threat to real-world applications where model architecture or parameters are usually unknown. Many existing works reveal that the adversarial examples are likely to overfit the surrogate model that they are generated from, limiting its transfer attack performance against different target models. Inspired by the connection between the flatness of loss landscape and the model generalization, we propose a novel attack method, dubbed reverse adversarial perturbation (RAP) to boost the transferability of adversarial examples. Specifically, instead of purely minimizing the adversarial loss at a single adversarial point, we advocate seeking adversarial examples locating at the low-value and flat region of the loss landscape, through injecting the worst-case perturbation, the reverse adversarial perturbation, for each step of the optimization procedure. The adversarial attack with RAP is formulated as a min-max bi-level optimization problem. Comprehensive experimental comparisons demonstrate that RAP can significantly boost the adversarial transferability. Furthermore, RAP can be naturally combined with many existing black-box attack techniques, to further boost the transferability. When attacking a real-world image recognition system, Google Cloud Vision API, we obtain 22% performance improvement of targeted attacks over the compared method.", "keywords": "Adversarial Examples;Black-Box Attacks;Adversarial Transferability", "primary_area": "", "supplementary_material": "", "author": "Zeyu Qin;Yanbo Fan;Yi Liu;Yong Zhang;Jue Wang;Baoyuan Wu", "authorids": "~Zeyu_Qin1;~Yanbo_Fan1;~Yi_Liu16;~Yong_Zhang6;~Jue_Wang2;~Baoyuan_Wu1", "gender": "M;M;M;M;M;M", "homepage": "https://alan-qin.github.io/;https://sites.google.com/site/yanbofan0124/;https://yzhang2016.github.io/yongnorriszhang.github.io/;https://juewang725.github.io/;https://sites.google.com/site/baoyuanwu2015/;https://peterlau61.github.io/", "dblp": "271/5778;181/4574;66/4615-34.html;;73/7781;", "google_scholar": "3LXI4-MAAAAJ;OlOqHyUAAAAJ;a_zSeVEAAAAJ;Bt4uDWMAAAAJ;JNTG1KoAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0003-1733-7892;0000-0002-8530-485X;;;0000-0003-2183-5990;", "linkedin": "zeyu-qin-546398179/;;;;;yi-liu-801403204/", "or_profile": "~Zeyu_Qin1;~Yanbo_Fan1;~Yong_Zhang6;~Jue_Wang2;~Baoyuan_Wu1;~Yi_Liu48", "aff": "Hong Kong University of Science and Technology;Tencent AI Lab;Tencent AI Lab;Tencent AI Lab;The Chinese University of Hong Kong, Shenzhen;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "ust.hk;tencent.com;tencent.com;tencent.com;cuhk.edu.cn;cuhk.edu.cn", "position": "PhD student;Associate Professor;Researcher;Director;Associate Professor;Undergrad student", "bibtex": "@misc{\nqin2022boosting,\ntitle={Boosting the Transferability of Adversarial Attacks with Reverse Adversarial Perturbation},\nauthor={Zeyu Qin and Yanbo Fan and Yi Liu and Yong Zhang and Jue Wang and Baoyuan Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=i7FNvHnPvPc}\n}", "github": "", "project": "", "reviewers": "5ohk;PVqn;b3AS;Mzh9", "site": "https://openreview.net/forum?id=i7FNvHnPvPc", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;4;4;3", "wc_summary_paper": "40;64;92;55", "wc_summary_review": "54;14;30;11", "wc_main_review": "155;257;212;164", "wc_review": 
"249;335;334;230", "wc_reply_reviewers": "33;304;37;0", "wc_reply_authors": "1647;2569;597;709", "reply_reviewers": "1;3;1;0", "reply_authors": "4;6;2;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 62.75, 18.93904696651867 ], "wc_summary_review_avg": [ 27.25, 17.049560111627514 ], "wc_main_review_avg": [ 197.0, 40.85951541562871 ], "wc_review_avg": [ 287.0, 47.97395126524393 ], "wc_reply_reviewers_avg": [ 93.5, 122.3774897601679 ], "wc_reply_authors_avg": [ 1380.5, 798.1758891372252 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 3.25, 1.920286436967152 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=176390718812823613&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;1;1;2;2", "aff_unique_norm": "Hong Kong University of Science and Technology;Tencent;Chinese University of Hong Kong", "aff_unique_dep": ";Tencent AI Lab;", "aff_unique_url": "https://www.ust.hk;https://ai.tencent.com;https://www.cuhk.edu.cn", "aff_unique_abbr": "HKUST;Tencent AI Lab;CUHK", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Hong Kong SAR;;Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "i7O3VGpb7qZ", "title": "Code Editing from Few Exemplars by Adaptive Multi-Extent Composition", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper considers the computer source code editing with few exemplars. The editing exemplar, containing the original and modified support code snippets, showcases a certain editorial style and implies the edit intention for a query code snippet. To achieve this, we propose a machine learning approach to adapt the editorial style derived from few exemplars to a query code snippet. Our learning approach combines edit representations extracted from editing exemplars and compositionally generalizes them to the query code snippet editing via multi-extent similarities ensemble. Specifically, we parse the code snippets using language-specific grammar into abstract syntax trees. We apply the similarities measurement in multiple extents from individual nodes to collective tree representations, and ensemble them through a similarity-ranking error estimator. 
We evaluate the proposed method on two datasets in C\\# and Python languages and respectively show 8.0\\% and 10.9\\% absolute accuracy improvements compared to baselines.", "keywords": "code editing;few-shot learning;compositional generalization", "primary_area": "", "supplementary_material": "", "author": "Peizhao Li;Xuchao Zhang;Ziyu Yao;Wei Cheng;Haifeng Chen;Hongfu Liu", "authorids": "~Peizhao_Li1;~Xuchao_Zhang2;~Ziyu_Yao1;~Wei_Cheng1;~Haifeng_Chen1;~Hongfu_Liu2", "gender": "M;;F;M;;M", "homepage": "https://peizhaoli.com;https://xuczhang.github.io/;http://ziyuyao.org;https://chengw07.github.io/;https://haifengchen.gitlab.io/intro/;http://hongfuliu.com/", "dblp": "232/1771;188/3475.html;;89/2506-2.html;08/57-1.html;32/9075-1", "google_scholar": "h8UyqB4AAAAJ;;4lYrMNUAAAAJ;PRrGVmoAAAAJ;QzakB68AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0009-0007-4571-3505;;;", "linkedin": "peizhao-li-099037182/;;;wei-cheng-ml/;;", "or_profile": "~Peizhao_Li1;~Xuchao_Zhang2;~Ziyu_Yao1;~Wei_Cheng1;~Haifeng_Chen1;~Hongfu_Liu2", "aff": "Brandeis University;;George Mason University;NEC-Labs;NEC-Labs;Brandeis University", "aff_domain": "brandeis.edu;;gmu.edu;nec-labs.com;nec-labs.com;brandeis.edu", "position": "PhD student;;Assistant Professor;Principal Researcher;Researcher;Assistant Professor", "bibtex": "@misc{\nli2022code,\ntitle={Code Editing from Few Exemplars by Adaptive Multi-Extent Composition},\nauthor={Peizhao Li and Xuchao Zhang and Ziyu Yao and Wei Cheng and Haifeng Chen and Hongfu Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=i7O3VGpb7qZ}\n}", "github": "", "project": "", "reviewers": "L4AR;ZRFf;4A6k;gFho", "site": "https://openreview.net/forum?id=i7O3VGpb7qZ", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;4;3;4", "correctness": "3;4;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "93;109;75;208", "wc_summary_review": "21;88;83;36", "wc_main_review": "387;323;250;589", "wc_review": "501;520;408;833", "wc_reply_reviewers": "0;154;0;0", "wc_reply_authors": "1469;2308;932;995", "reply_reviewers": "0;1;0;0", "reply_authors": "3;4;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 121.25, 51.50910113756597 ], "wc_summary_review_avg": [ 57.0, 29.04307146291521 ], "wc_main_review_avg": [ 387.25, 126.16333659189583 ], "wc_review_avg": [ 565.5, 160.15071027004532 ], "wc_reply_reviewers_avg": [ 38.5, 66.68395609140178 ], "wc_reply_authors_avg": [ 1426.0, 549.9022640433479 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ABo5BQOhwVEJ:scholar.google.com/&scioq=Code+Editing+from+Few+Exemplars+by+Adaptive+Multi-Extent+Composition&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "Brandeis University;George Mason University;NEC Laboratories", "aff_unique_dep": ";;", "aff_unique_url": "https://www.brandeis.edu;https://www.gmu.edu;https://www.nec-labs.com", "aff_unique_abbr": "Brandeis;GMU;NEC-Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "i7h4M45tU8", "title": "Neural Temporal Logic Programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "Events across a timeline are a common data representation, seen in different temporal modalities. Individual atomic events can occur in a certain temporal ordering to compose higher level composite events. Examples of a composite event are a patient's medical symptom or a baseball player hitting a home run, caused distinct temporal orderings of patient vitals and player movements respectively. Such salient composite events are provided as labels in temporal datasets and most works optimize models to predict these composite event labels directly. We focus uncovering the underlying atomic events and their relations that lead to the composite events within a noisy temporal data setting. We propose Neural Temporal Logic Programming (Neural TLP) which first learns implicit temporal relations between atomic events and then lifts logic rules for composite events, given only the composite events labels for supervision. This is done through efficiently searching through the combinatorial space of all temporal logic rules in an end-to-end differentiable manner. We evaluate our method on video and on healthcare data where it outperforms the baseline methods for rule discovery. ", "keywords": "logic programming;time series;neuro-symbolic;video;healthcare", "primary_area": "", "supplementary_material": "/attachment/a76a2463bf92e19c81278b564457192609c74156.zip", "author": "Karan Samel;Zelin Zhao;Binghong Chen;Shuang Li;Dharmashankar Subramanian;Irfan Essa;Le Song", "authorids": "~Karan_Samel2;~Zelin_Zhao1;~Binghong_Chen1;shuangli@fas.harvard.edu;~Dharmashankar_Subramanian1;~Irfan_Essa1;~Le_Song1", "gender": "M;M;M;;M;M;M", "homepage": "https://karans.github.io;https://scholar.google.com/citations?user=UDi4V3AAAAAJ&hl=en;http://binghongchen.net/;;http://researcher.watson.ibm.com/researcher/view.php?person=us-dharmash;http://www.irfanessa.com/;http://www.cc.gatech.edu/~lsong", "dblp": ";177/4379;192/2022;;;e/IrfanAEssa;94/3481", "google_scholar": ";UDi4V3AAAAAJ;6Px5HxsAAAAJ;;j54RzcEAAAAJ;https://scholar.google.com.tw/citations?user=XM97iScAAAAJ;Xl4E0CsAAAAJ", "orcid": ";0000-0002-2638-0414;;;;0000-0002-6236-2969;", "linkedin": ";zelin-zhao-9354a6191/;binghong-chen-91b697181/;;;irfanessa/;", "or_profile": "~Karan_Samel2;~Zelin_Zhao1;~Binghong_Chen1;shuangli@fas.harvard.edu;~Dharmashankar_Subramanian1;~Irfan_Essa1;~Le_Song1", "aff": "Georgia Institute of Technology;The Chinese University of Hong Kong;Georgia Institute of Technology;;International Business Machines;Georgia Institute of Technology;College of Computing, Georgia Institute of Technology", "aff_domain": "gatech.edu;cuhk.edu.hk;gatech.edu;;ibm.com;gatech.edu;cc.gatech.edu", "position": "PhD student;PhD student;PhD student;;Principal Researcher;Full Professor;Associate Professor", "bibtex": "@misc{\nsamel2022neural,\ntitle={Neural Temporal Logic Programming},\nauthor={Karan Samel and Zelin Zhao and Binghong Chen and Shuang Li and Dharmashankar Subramanian and Irfan Essa and Le Song},\nyear={2022},\nurl={https://openreview.net/forum?id=i7h4M45tU8}\n}", "github": "", "project": "", "reviewers": "9nKB;54Fe;pPed;X4hj", "site": "https://openreview.net/forum?id=i7h4M45tU8", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;3;3", 
"wc_summary_paper": "26;45;53;130", "wc_summary_review": "40;75;25;93", "wc_main_review": "105;382;322;313", "wc_review": "171;502;400;536", "wc_reply_reviewers": "0;0;0;18", "wc_reply_authors": "539;2130;1839;1271", "reply_reviewers": "0;0;0;1", "reply_authors": "1;3;3;4", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 63.5, 39.62638010214912 ], "wc_summary_review_avg": [ 58.25, 27.049722734253674 ], "wc_main_review_avg": [ 280.5, 104.73896123219859 ], "wc_review_avg": [ 402.25, 142.58396648992482 ], "wc_reply_reviewers_avg": [ 4.5, 7.794228634059948 ], "wc_reply_authors_avg": [ 1444.75, 607.3657773533178 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BanYsoKEaIcJ:scholar.google.com/&scioq=Neural+Temporal+Logic+Programming&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;0;0", "aff_unique_norm": "Georgia Institute of Technology;Chinese University of Hong Kong;International Business Machines Corporation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.gatech.edu;https://www.cuhk.edu.hk;https://www.ibm.com", "aff_unique_abbr": "Georgia Tech;CUHK;IBM", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Atlanta", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "id": "i8d2kdxii1L", "title": "$p$-Laplacian Based Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) have demonstrated superior performance for semi-supervised node classification on graphs, as a result of their ability to exploit node features and topological information simultaneously. However, most GNNs implicitly assume that the labels of nodes and their neighbors in a graph are the same or consistent, which does not hold in heterophilic graphs, where the labels of linked nodes are likely to differ. Hence, when the topology is non-informative for label prediction, ordinary GNNs may work significantly worse than simply applying multi-layer perceptrons (MLPs) on each node. To tackle the above problem, we propose a new $p$-Laplacian based GNN model, termed as $^p$GNN, whose message passing mechanism is derived from a discrete regularization framework and could be theoretically explained as an approximation of a polynomial graph filter defined on the spectral domain of $p$-Laplacians. The spectral analysis shows that the new message passing mechanism works simultaneously as low-pass and high-pass filters, thus making $^p$GNNs effective on both homophilic and heterophilic graphs. Empirical studies on real-world and synthetic datasets validate our findings and demonstrate that $^p$GNNs significantly outperform several state-of-the-art GNN architectures on heterophilic benchmarks while achieving competitive performance on homophilic benchmarks. 
Moreover, $^p$GNNs can adaptively learn aggregation weights and are robust to noisy edges.", "keywords": "Graph neural networks;$p$-Laplacian;semi-supervised learning;node prediction", "primary_area": "", "supplementary_material": "/attachment/78cd20693c411b7aba47fdf16b23532031887db7.zip", "author": "Guoji Fu;Peilin Zhao;Yatao Bian", "authorids": "~Guoji_Fu1;~Peilin_Zhao2;~Yatao_Bian1", "gender": ";;", "homepage": ";;", "dblp": ";84/8411;", "google_scholar": ";https://scholar.google.com.hk/citations?user=HPeX_YcAAAAJ;", "orcid": ";0000-0001-8543-3953;", "linkedin": ";;", "or_profile": "~Guoji_Fu1;~Peilin_Zhao2;~Yatao_Bian1", "aff": ";Tencent;", "aff_domain": ";tencent.com;", "position": ";Researcher;", "bibtex": "@misc{\nfu2022plaplacian,\ntitle={\\$p\\$-Laplacian Based Graph Neural Networks},\nauthor={Guoji Fu and Peilin Zhao and Yatao Bian},\nyear={2022},\nurl={https://openreview.net/forum?id=i8d2kdxii1L}\n}", "github": "", "project": "", "reviewers": "Pzid;4UH6;rMNu", "site": "https://openreview.net/forum?id=i8d2kdxii1L", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;4;3", "correctness": "2;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "67;20;136", "wc_summary_review": "4;67;106", "wc_main_review": "358;448;389", "wc_review": "429;535;631", "wc_reply_reviewers": "418;567;0", "wc_reply_authors": "3101;2869;1473", "reply_reviewers": "1;2;0", "reply_authors": "8;9;3", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 74.33333333333333, 47.63985259796209 ], "wc_summary_review_avg": [ 59.0, 42.02380277890139 ], "wc_main_review_avg": [ 398.3333333333333, 37.33035702421764 ], "wc_review_avg": [ 531.6666666666666, 82.49983164965988 ], "wc_reply_reviewers_avg": [ 328.3333333333333, 240.00324071886104 ], "wc_reply_authors_avg": [ 2481.0, 719.0289748450105 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 6.666666666666667, 2.6246692913372702 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9176629354822472, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15123165040444629585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0", "aff_unique_norm": "Tencent", "aff_unique_dep": "Tencent Holdings Limited", "aff_unique_url": "https://www.tencent.com", "aff_unique_abbr": "Tencent", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "iARgLYsH2P", "title": "Disentangled Mask Attention in Transformer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The transformer conducts self-attention, which has achieved state-of-the-art performance in many applications. Multi-head attention in the transformer gathers the features from individual tokens in the input sequence to form the mapping to the output sequence. There are two weaknesses in learning representations using the transformer. First, because the attention mechanism naturally mixes up the features of different tokens in the input and output sequences, the representation of input tokens is likely to contain redundant information.
Second, the patterns of attention weights across different heads tend to be similar, so the representation capacity of the model might be bounded. To strengthen the sequential learning representation, this paper presents a new disentangled mask attention in the transformer, where redundant features are reduced and semantic information is enriched. Latent disentanglement in multi-head attention is learned. The attention weights are filtered by a mask that is optimized via semantic clustering. The proposed attention mechanism is implemented for sequential learning according to the clustered disentanglement objective. Experiments on machine translation show the merit of this disentangled transformer in sequence-to-sequence learning tasks.", "keywords": "sequential learning;mask attention;latent disentanglement;transformer", "primary_area": "", "supplementary_material": "/attachment/36e2aeaad4fb0935f6bf71a2c2c9f0e706b60e9b.zip", "author": "Jen-Tzung Chien;Yu-Han Huang", "authorids": "~Jen-Tzung_Chien1;yhhuang.ee08@nycu.edu.tw", "gender": "M;", "homepage": "http://chien.cm.nctu.edu.tw;", "dblp": "03/3569;", "google_scholar": ";", "orcid": "0000-0003-3466-8941;", "linkedin": "jen-tzung-chien-23a79158/;", "or_profile": "~Jen-Tzung_Chien1;yhhuang.ee08@nycu.edu.tw", "aff": "National Yang Ming Chiao Tung University;", "aff_domain": "nycu.edu.tw;", "position": "Full Professor;", "bibtex": "@misc{\nchien2022disentangled,\ntitle={Disentangled Mask Attention in Transformer},\nauthor={Jen-Tzung Chien and Yu-Han Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=iARgLYsH2P}\n}", "github": "", "project": "", "reviewers": "qxZa;gHzn;DGfu;5GKP", "site": "https://openreview.net/forum?id=iARgLYsH2P", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;2;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "198;83;150;73", "wc_summary_review": "76;23;42;33", "wc_main_review": "485;143;239;239", "wc_review": "759;249;431;345", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 126.0, 51.034302189801714 ], "wc_summary_review_avg": [ 43.5, 19.93113142799475 ], "wc_main_review_avg": [ 276.5, 126.5968009074479 ], "wc_review_avg": [ 446.0, 191.835867344978 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7693882523900422523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "National Yang Ming Chiao Tung University", "aff_unique_dep": "", "aff_unique_url": "https://www.nycu.edu.tw", "aff_unique_abbr": "NYCU", "aff_campus_unique_index": "0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Poisoning and Backdooring Contrastive Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6316", "id": "iC4UHbQ01Mp", "poster": "", "openreview":
"https://openreview.net/forum?id=iC4UHbQ01Mp", "slides": "https://iclr.cc/virtual/2022/poster/6316", "video": "https://iclr.cc/virtual/2022/poster/6316", "author_site": "Nicholas Carlini, Andreas Terzis", "tldr": "", "abstract": "Multimodal contrastive learning methods like CLIP train on noisy and uncurated training datasets. This is cheaper than labeling datasets manually, and even improves out-of-distribution robustness. We show that this practice makes backdoor and poisoning attacks a significant threat. By poisoning just 0.01% of a dataset (e.g., just 300 images of the 3 million-example Conceptual Captions dataset), we can cause the model to misclassify test images by overlaying a small patch. Targeted poisoning attacks, whereby the model misclassifies a particular test input with an adversarially-desired label, are even easier requiring control of 0.0001% of the dataset (e.g., just three out of the 3 million images). Our attacks call into question whether training on noisy and uncurated Internet scrapes is desirable.", "keywords": "Contrastive Learning;Poisoning attack;Backdoor attack;CLIP", "primary_area": "", "supplementary_material": "", "author": "Nicholas Carlini;Andreas Terzis", "authorids": "~Nicholas_Carlini1;~Andreas_Terzis1", "gender": ";M", "homepage": "http://nicholas.carlini.com;https://aterzis-personal.github.io/aterzis/", "dblp": "145/1806;12/6664", "google_scholar": ";NcIqQ88AAAAJ", "orcid": ";", "linkedin": ";andreas-terzis-2395371/", "or_profile": "~Nicholas_Carlini1;~Andreas_Terzis1", "aff": "Google;Google Research, Brain Team", "aff_domain": "google.com;research.google.com", "position": "Researcher;Researcher", "bibtex": "@inproceedings{\ncarlini2022poisoning,\ntitle={Poisoning and Backdooring Contrastive Learning},\nauthor={Nicholas Carlini and Andreas Terzis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iC4UHbQ01Mp}\n}", "github": "", "project": "", "reviewers": "iEU2;AV9G;wv32;HsbX", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;3", "technical_novelty": "3;4;3;2", "empirical_novelty": "3;4;3;4", "wc_summary_paper": "50;24;87;89", "wc_summary_review": "12;40;30;63", "wc_main_review": "127;297;216;472", "wc_review": "189;361;333;624", "wc_reply_reviewers": "0;19;0;20", "wc_reply_authors": "20;211;255;221", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 62.5, 27.115493725912497 ], "wc_summary_review_avg": [ 36.25, 18.417043736713012 ], "wc_main_review_avg": [ 278.0, 127.12395525627733 ], "wc_review_avg": [ 376.75, 156.9591905560168 ], "wc_reply_reviewers_avg": [ 9.75, 9.756408150543928 ], "wc_reply_authors_avg": [ 176.75, 91.95753095858979 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 201, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12638951884113766778&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=iC4UHbQ01Mp", "email": "google.com;research.google.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", 
"aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "TRGP: Trust Region Gradient Projection for Continual Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6867", "id": "iEvAf8i6JjO", "poster": "", "openreview": "https://openreview.net/forum?id=iEvAf8i6JjO", "slides": "https://iclr.cc/virtual/2022/poster/6867", "video": "https://iclr.cc/virtual/2022/poster/6867", "author_site": "Sen Lin, Li Yang, Deliang Fan, Junshan Zhang", "tldr": "", "abstract": "Catastrophic forgetting is one of the major challenges in continual learning. To address this issue, some existing methods put restrictive constraints on the optimization space of the new task for minimizing the interference to old tasks. However, this may lead to unsatisfactory performance for the new task, especially when the new task is strongly correlated with old tasks. To tackle this challenge, we propose Trust Region Gradient Projection (TRGP) for continual learning to facilitate the forward knowledge transfer based on an efficient characterization of task correlation. Particularly, we introduce a notion of 'trust region' to select the most related old tasks for the new task in a layer-wise and single-shot manner, using the norm of gradient projection onto the subspace spanned by task inputs. Then, a scaled weight projection is proposed to cleverly reuse the frozen weights of the selected old tasks in the trust region through a layer-wise scaling matrix. By jointly optimizing the scaling matrices and the model, where the model is updated along the directions orthogonal to the subspaces of old tasks, TRGP can effectively prompt knowledge transfer without forgetting. 
Extensive experiments show that our approach achieves significant improvement over related state-of-the-art methods.", "keywords": "trust region;gradient projection;scaled weight projection;continual learning;forward knowledge transfer;task correlation", "primary_area": "", "supplementary_material": "/attachment/5df19db8c9f9004aef610580629148c20f55fe0d.zip", "author": "Sen Lin;Li Yang;Deliang Fan;Junshan Zhang", "authorids": "~Sen_Lin1;~Li_Yang6;~Deliang_Fan1;~Junshan_Zhang1", "gender": ";M;M;M", "homepage": "https://slin70.github.io/;https://lyang-666.github.io/;https://faculty.engineering.asu.edu/dfan/;https://faculty.engineering.ucdavis.edu/jzhang/", "dblp": "70/9499-1.html;;129/1701;59/1232.html", "google_scholar": "94-TbUsAAAAJ;qpUT1I8AAAAJ;sAflhJUAAAAJ;UtAdFs8AAAAJ", "orcid": ";0000-0002-2839-6196;0000-0002-7989-6297;", "linkedin": ";li-yang-268710139/;;", "or_profile": "~Sen_Lin1;~Li_Yang6;~Deliang_Fan1;~Junshan_Zhang1", "aff": "Arizona State University;Arizona State University;Arizona State University;University of California, Davis", "aff_domain": "asu.edu;asu.edu;asu.edu;ucdavis.edu", "position": "Postdoc;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nlin2022trgp,\ntitle={{TRGP}: Trust Region Gradient Projection for Continual Learning},\nauthor={Sen Lin and Li Yang and Deliang Fan and Junshan Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iEvAf8i6JjO}\n}", "github": "", "project": "", "reviewers": "tgxV;aL1A;jo3a;YFLo", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;3;4", "correctness": "2;3;3;4", "technical_novelty": "2;3;4;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "82;104;143;372", "wc_summary_review": "42;269;50;112", "wc_main_review": "491;47;359;327", "wc_review": "615;420;552;811", "wc_reply_reviewers": "0;11;5;227", "wc_reply_authors": "1055;679;874;775", "reply_reviewers": "0;1;1;1", "reply_authors": "3;2;2;2", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 175.25, 115.6749216554738 ], "wc_summary_review_avg": [ 118.25, 91.15474480245118 ], "wc_main_review_avg": [ 306.0, 161.67560112769027 ], "wc_review_avg": [ 599.5, 140.93349495418042 ], "wc_reply_reviewers_avg": [ 60.75, 96.06345559056264 ], "wc_reply_authors_avg": [ 845.75, 139.0995596686057 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.49374193110101877, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6594331056177251128&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=iEvAf8i6JjO", "email": "asu.edu;asu.edu;asu.edu;ucdavis.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Arizona State University;University of California, Davis", "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.ucdavis.edu", "aff_unique_abbr": "ASU;UC Davis", "aff_campus_unique_index": "1", "aff_campus_unique": ";Davis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "VAT-Mart: Learning Visual Action Trajectory 
Proposals for Manipulating 3D ARTiculated Objects", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6298", "id": "iEx3PiooLy", "poster": "", "openreview": "https://openreview.net/forum?id=iEx3PiooLy", "slides": "https://iclr.cc/virtual/2022/poster/6298", "video": "https://iclr.cc/virtual/2022/poster/6298", "author_site": "Ruihai Wu, Yan Zhao, Kaichun Mo, Zizheng Guo, Yian Wang, Tianhao Wu, Qingnan Fan, Xuelin Chen, Leonidas Guibas, Hao Dong", "tldr": "", "abstract": "Perceiving and manipulating 3D articulated objects (e.g., cabinets, doors) in human environments is an important yet challenging task for future home-assistant robots. The space of 3D articulated objects is exceptionally rich in their myriad semantic categories, diverse shape geometry, and complicated part functionality. Previous works mostly abstract kinematic structure with estimated joint parameters and part poses as the visual representations for manipulating 3D articulated objects. In this paper, we propose object-centric actionable visual priors as a novel perception-interaction handshaking point that the perception system outputs more actionable guidance than kinematic structure estimation, by predicting dense geometry-aware, interaction-aware, and task-aware visual action affordance and trajectory proposals. We design an interaction-for-perception framework VAT-Mart to learn such actionable visual representations by simultaneously training a curiosity-driven reinforcement learning policy exploring diverse interaction trajectories and a perception module summarizing and generalizing the explored knowledge for pointwise predictions among diverse shapes. Experiments prove the effectiveness of the proposed approach using the large-scale PartNet-Mobility dataset in SAPIEN environment and show promising generalization capabilities to novel test shapes, unseen object categories, and real-world data.", "keywords": "Visual Representation Learning for Robotics;Robotic Affordance and Trajectories;3D Shape Understanding", "primary_area": "", "supplementary_material": "/attachment/6e2db40294e936e65e4708c1fa1f985553ff0ab0.zip", "author": "Ruihai Wu;Yan Zhao;Kaichun Mo;Zizheng Guo;Yian Wang;Tianhao Wu;Qingnan Fan;Xuelin Chen;Leonidas Guibas;Hao Dong", "authorids": "~Ruihai_Wu1;~Yan_Zhao5;~Kaichun_Mo1;~Zizheng_Guo1;~Yian_Wang1;~Tianhao_Wu2;~Qingnan_Fan2;~Xuelin_Chen1;~Leonidas_Guibas1;~Hao_Dong3", "gender": "M;F;M;M;M;M;M;M;M;M", "homepage": "https://warshallrho.github.io/;https://sxy7147.github.io;https://cs.stanford.edu/~kaichun/;https://guozz.cn;https://tianhaowuhz.github.io/;https://fqnchina.github.io/;https://xuelin-chen.github.io/;http://geometry.stanford.edu/;https://zsdonghao.github.io;http://wangyian-me.github.io/", "dblp": "248/8028.html;88/5320-35;172/1283;;17/1976-1;;;g/LeonidasJGuibas;14/1525-3.html;71/10046", "google_scholar": "https://scholar.google.com/citations?hl=en;iIs4TDMAAAAJ;pL7JsOsAAAAJ;aWkrs30AAAAJ;eAW0tjMAAAAJ;;C7mNbwQAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ;xLFL4sMAAAAJ;dUf3wx4AAAAJ", "orcid": ";;;;;;0009-0007-0158-9469;;0000-0003-2261-9122;", "linkedin": ";;;;;;;;;", "or_profile": "~Ruihai_Wu1;~Yan_Zhao5;~Kaichun_Mo1;~Zizheng_Guo1;~Tianhao_Wu2;~Qingnan_Fan2;~Xuelin_Chen1;~Leonidas_Guibas1;~Hao_Dong3;~\u9038\u5b89_\u738b1", "aff": "Peking University;Peking University;Stanford University;Peking University;Peking University;Tencent AI Lab;Tencent AI Lab;Stanford University;Peking University;Peking University", "aff_domain": 
"pku.edu.cn;pku.edu.cn;stanford.edu;pku.edu.cn;pku.edu.cn;tencent.com;tencent.com;stanford.edu;pku.edu.cn;pku.edu.cn", "position": "PhD student;Undergrad student;PhD student;Undergrad student;PhD student;Senior Researcher;Researcher;Full Professor;Assistant Professor;Undergrad student", "bibtex": "@inproceedings{\nwu2022vatmart,\ntitle={{VAT}-Mart: Learning Visual Action Trajectory Proposals for Manipulating 3D {ART}iculated Objects},\nauthor={Ruihai Wu and Yan Zhao and Kaichun Mo and Zizheng Guo and Yian Wang and Tianhao Wu and Qingnan Fan and Xuelin Chen and Leonidas Guibas and Hao Dong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iEx3PiooLy}\n}", "github": "", "project": "", "reviewers": "BNFC;1qco;dxTr", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;3;4", "correctness": "3;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "36;72;52", "wc_summary_review": "46;80;75", "wc_main_review": "84;155;156", "wc_review": "166;307;283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "545;594;1188", "reply_reviewers": "0;0;0", "reply_authors": "1;1;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 53.333333333333336, 14.72714802291635 ], "wc_summary_review_avg": [ 67.0, 14.98888477061141 ], "wc_main_review_avg": [ 131.66666666666666, 33.70789554721894 ], "wc_review_avg": [ 252.0, 61.59545437773797 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 775.6666666666666, 292.24913268572 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4481120337376036219&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=iEx3PiooLy", "email": "pku.edu.cn;pku.edu.cn;stanford.edu;pku.edu.cn;pku.edu.cn;tencent.com;tencent.com;stanford.edu;pku.edu.cn;pku.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;0;0;2;2;1;0;0", "aff_unique_norm": "Peking University;Stanford University;Tencent", "aff_unique_dep": ";;Tencent AI Lab", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu;https://ai.tencent.com", "aff_unique_abbr": "Peking U;Stanford;Tencent AI Lab", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;0;0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "iFf26yMjRdN", "title": "Federated Learning with Partial Model Personalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose and analyze a general framework of federated learning with partial model personalization. Compared with full model personalization, partial model personalization relies on domain knowledge to select a small portion of the model to personalize, thus imposing a much smaller on-device memory footprint. 
We propose two federated optimization algorithms for training partially personalized models, where the shared and personal parameters are updated either simultaneously or alternately on each device, but only the shared parameters are communicated and aggregated at the server. We give convergence analyses of both algorithms for minimizing smooth nonconvex functions, providing theoretical support for their use in training deep learning models. Our experiments on real-world image and text datasets demonstrate that (a) partial model personalization can obtain most of the benefit of full model personalization with a small fraction of personalized parameters, and (b) the alternating update algorithm often outperforms the simultaneous update algorithm.", "keywords": "personalization;federated learning;partial personalization;adapter modules;nonconvex minimization", "primary_area": "", "supplementary_material": "", "author": "Krishna Pillutla;Kshitiz Malik;Abdelrahman Mohamed;Michael Rabbat;Maziar Sanjabi;Lin Xiao", "authorids": "~Krishna_Pillutla1;~Kshitiz_Malik2;~Abdelrahman_Mohamed2;~Michael_Rabbat1;~Maziar_Sanjabi1;~Lin_Xiao1", "gender": "M;M;M;M;;M", "homepage": "https://krishnap25.github.io;http://www.cs.toronto.edu/~asamir;;https://sites.google.com/view/maziar;;", "dblp": "173/5185.html;28/8759;47/1744;21/8577;;", "google_scholar": "IL7N6sMAAAAJ;https://scholar.google.ca/citations?user=tJ_PrzgAAAAJ;https://scholar.google.ch/citations?user=cMPKe9UAAAAJ;bc_N2-oAAAAJ;vK0-CDcAAAAJ;pkAWLt8AAAAJ", "orcid": ";;;;0000-0002-9759-3898;", "linkedin": ";abdel-rahman-mohamed-a5808210;;;;", "or_profile": "~Krishna_Pillutla1;~Abdelrahman_Mohamed2;~Michael_Rabbat1;~Maziar_Sanjabi1;~Lin_Xiao1;~KSHITIZ_MALIK1", "aff": "University of Washington, Seattle;Meta Facebook;Mila;Meta;Meta Facebook;", "aff_domain": "uw.edu;fb.com;mila.quebec;meta.com;meta.com;", "position": "PhD student;research scientist;Associate Member;Researcher;Research Scientist;", "bibtex": "@misc{\npillutla2022federated,\ntitle={Federated Learning with Partial Model Personalization},\nauthor={Krishna Pillutla and Kshitiz Malik and Abdelrahman Mohamed and Michael Rabbat and Maziar Sanjabi and Lin Xiao},\nyear={2022},\nurl={https://openreview.net/forum?id=iFf26yMjRdN}\n}", "github": "", "project": "", "reviewers": "8i9S;Nawz;447D;mEwK", "site": "https://openreview.net/forum?id=iFf26yMjRdN", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "4;4;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "81;70;113;156", "wc_summary_review": "13;42;40;48", "wc_main_review": "296;356;123;212", "wc_review": "390;468;276;416", "wc_reply_reviewers": "0;101;0;0", "wc_reply_authors": "648;1260;186;423", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 105.0, 33.414068893207244 ], "wc_summary_review_avg": [ 35.75, 13.460590625971804 ], "wc_main_review_avg": [ 246.75, 87.86744277603623 ], "wc_review_avg": [ 387.5, 70.23353899669303 ], "wc_reply_reviewers_avg": [ 25.25, 43.73428289111415 ], "wc_reply_authors_avg": [ 629.25, 399.1261548683574 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness":
-0.5773502691896257, "gs_citation": 205, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4750968691898857474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "University of Washington;Meta;Mila", "aff_unique_dep": ";Meta Platforms, Inc.;Quebec Artificial Intelligence Institute", "aff_unique_url": "https://www.washington.edu;https://meta.com;https://mila.quebec", "aff_unique_abbr": "UW;Meta;Mila", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Canada" }, { "id": "iGffRQ9jQpQ", "title": "Enhancing semi-supervised learning via self-interested coalitional learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semi-supervised learning holds great promise for many real-world applications, due to its ability to leverage both unlabeled and expensive labeled data. However, most semi-supervised learning algorithms still heavily rely on the limited labeled data to infer and utilize the hidden information from unlabeled data. We note that any semi-supervised learning task under the self-training paradigm also hides an auxiliary task of discriminating label observability. Jointly solving these two tasks allows full utilization of information from both labeled and unlabeled data, thus alleviating the problem of over-reliance on labeled data. This naturally leads to a new learning framework, which we call Self-interested Coalitional Learning (SCL). The key idea of SCL is to construct a semi-cooperative \u201cgame\u201d, which forges cooperation between a main self-interested semi-supervised learning task and a companion task that infers label observability to facilitate main task training. We show, via theoretical deduction, its connection to loss reweighting on noisy labels.
Through comprehensive evaluation on both classification and regression tasks, we show that SCL can consistently enhance the performance of semi-supervised learning algorithms.", "keywords": "self-interested coalitional learning;semi-supervised learning;soft labeling;loss reweighting", "primary_area": "", "supplementary_material": "", "author": "Huiling Qin;Xianyuan Zhan;Yuanxun li;Haoran Xu;yu zheng", "authorids": "~Huiling_Qin1;~Xianyuan_Zhan1;liyuanxun1@jd.com;~Haoran_Xu4;msyuzheng@outlook.com", "gender": "F;M;;M;", "homepage": ";http://zhanxianyuan.xyz/;;https://ryanxhr.github.io/;", "dblp": "213/0873;181/5081;;;", "google_scholar": "https://scholar.google.com.hk/citations?hl=en;pDMnGloAAAAJ;;iX8AJI0AAAAJ;", "orcid": ";0000-0002-3683-0554;;;", "linkedin": ";;;;", "or_profile": "~Huiling_Qin1;~Xianyuan_Zhan1;liyuanxun1@jd.com;~Haoran_Xu4;msyuzheng@outlook.com", "aff": ";Tsinghua University;;JD.com;", "aff_domain": ";tsinghua.edu.cn;;jd.com;", "position": ";Associate Professor;;Researcher;", "bibtex": "@misc{\nqin2022enhancing,\ntitle={Enhancing semi-supervised learning via self-interested coalitional learning},\nauthor={Huiling Qin and Xianyuan Zhan and Yuanxun li and Haoran Xu and yu zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=iGffRQ9jQpQ}\n}", "github": "", "project": "", "reviewers": "HSTc;Qifr;3FG6;dWui", "site": "https://openreview.net/forum?id=iGffRQ9jQpQ", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;3;3", "correctness": "2;4;3;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;4;3;2", "wc_summary_paper": "52;50;51;87", "wc_summary_review": "7;17;45;19", "wc_main_review": "309;379;604;277", "wc_review": "368;446;700;383", "wc_reply_reviewers": "495;0;0;0", "wc_reply_authors": "1799;943;1227;746", "reply_reviewers": "2;0;0;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 60.0, 15.604486534327235 ], "wc_summary_review_avg": [ 22.0, 14.035668847618199 ], "wc_main_review_avg": [ 392.25, 127.69764093357402 ], "wc_review_avg": [ 474.25, 133.5821376532057 ], "wc_reply_reviewers_avg": [ 123.75, 214.34128743664857 ], "wc_reply_authors_avg": [ 1178.75, 396.8276546562752 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:n7NxcuYcdI0J:scholar.google.com/&scioq=Enhancing+semi-supervised+learning+via+self-interested+coalitional+learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Tsinghua University;JD.com", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.jd.com", "aff_unique_abbr": "THU;JD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "iJ_nnX5Qvyt", "title": "Combinatorial Reinforcement Learning Based Scheduling for DNN Execution on Edge", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The past half-decade has seen unprecedented growth in machine learning with deep neural networks (DNNs), which represent the state of the art in many real-world applications. However, DNNs have substantial computational and memory requirements, and the compilation of their computational graphs has a great impact in resource-constrained (e.g., computation-, I/O-, and memory-bounded) edge computing systems. While efficient execution of a computational graph leads to high-performance and energy-efficient execution, generating an optimal computational graph schedule is known to be an \textit{NP-hard} problem. The complexity of scheduling DNN computational graphs increases further on pipelined multi-core systems, considering memory communication costs as well as the increasing size of DNNs. This work presents a reinforcement learning based scheduling framework, which imitates the behavior of optimal optimization algorithms at the speed of inference and compiles arbitrary DNN computational graphs without re-training. Our framework has demonstrated up to $\sim$$2.5\times$ runtime speedups over the commercial Edge TPU compiler, using ten popular ImageNet models, on a physical Google Edge TPU system. More importantly, compared to exact optimization methods solved via heuristics and brute force, the proposed RL scheduling improves the scheduling runtime by several orders of magnitude.", "keywords": "Reinforcement Learning;Combinatorial Optimization;Edge Computing", "primary_area": "", "supplementary_material": "/attachment/dc5ec718aa8d72aee44b871aabb6fe03ef39605b.zip", "author": "Qiwei Yuan;Jiaqi Yin;CUNXI YU", "authorids": "~Qiwei_Yuan1;u1346826@utah.edu;~CUNXI_YU1", "gender": "M;;", "homepage": "https://joshuayy.github.io/;;", "dblp": ";;", "google_scholar": "tKAoUN4AAAAJ;;", "orcid": ";;", "linkedin": "joshua-yuan-great/;;", "or_profile": "~Qiwei_Yuan1;u1346826@utah.edu;~CUNXI_YU1", "aff": "University of Utah;;", "aff_domain": "utah.edu;;", "position": "PhD student;;", "bibtex": "@misc{\nyuan2022combinatorial,\ntitle={Combinatorial Reinforcement Learning Based Scheduling for {DNN} Execution on Edge},\nauthor={Qiwei Yuan and Jiaqi Yin and CUNXI YU},\nyear={2022},\nurl={https://openreview.net/forum?id=iJ_nnX5Qvyt}\n}", "github": "", "project": "", "reviewers": "Erzh;k5YF;NTcu", "site": "https://openreview.net/forum?id=iJ_nnX5Qvyt", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;4;3", "correctness": "2;2;2", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "43;85;122", "wc_summary_review": "46;68;168", "wc_main_review": "355;469;367", "wc_review": "444;622;657", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 83.33333333333333, 32.27313984655902 ], "wc_summary_review_avg": [ 94.0, 53.09111664550546 ], "wc_main_review_avg": [ 397.0, 51.146847410177685 ], "wc_review_avg": [ 574.3333333333334, 93.2606860126793 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FJtpoeJbhgEJ:scholar.google.com/&scioq=Combinatorial+Reinforcement+Learning+Based+Scheduling+for+DNN+Execution+on+Edge&hl=en&as_sdt=0,5", "gs_version_total":
0, "aff_unique_index": "0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "Utah", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "PAC-Bayes Information Bottleneck", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6238", "id": "iLHOIDsPv1P", "poster": "", "openreview": "https://openreview.net/forum?id=iLHOIDsPv1P", "slides": "https://iclr.cc/virtual/2022/poster/6238", "video": "https://iclr.cc/virtual/2022/poster/6238", "author_site": "Zifeng Wang, Shao-Lun Huang, Ercan Kuruoglu, Jimeng Sun, Xi Chen, Yefeng Zheng", "tldr": "", "abstract": "Understanding the source of the superior generalization ability of NNs remains one of the most important problems in ML research. There have been a series of theoretical works trying to derive non-vacuous bounds for NNs. Recently, the compression of information stored in weights (IIW) is proved to play a key role in NNs generalization based on the PAC-Bayes theorem. However, no solution of IIW has ever been provided, which builds a barrier for further investigation of the IIW's property and its potential in practical deep learning. In this paper, we propose an algorithm for the efficient approximation of IIW. Then, we build an IIW-based information bottleneck on the trade-off between accuracy and information complexity of NNs, namely PIB. From PIB, we can empirically identify the fitting to compressing phase transition during NNs' training and the concrete connection between the IIW compression and the generalization. Besides, we verify that IIW is able to explain NNs in broad cases, e.g., varying batch sizes, over-parameterization, and noisy labels. Moreover, we propose an MCMC-based algorithm to sample from the optimal weight posterior characterized by PIB, which fulfills the potential of IIW in enhancing NNs in practice.", "keywords": "information bottleneck;representation learning;generalization", "primary_area": "", "supplementary_material": "/attachment/6dd440805d8b2df78932fcf3fb4525bad404f5eb.zip", "author": "Zifeng Wang;Shao-Lun Huang;Ercan Engin Kuruoglu;Jimeng Sun;Xi Chen;Yefeng Zheng", "authorids": "~Zifeng_Wang3;~Shao-Lun_Huang3;~Ercan_Engin_Kuruoglu1;~Jimeng_Sun3;~Xi_Chen21;~Yefeng_Zheng2", "gender": "M;M;M;;M;M", "homepage": "https://zifengwang.xyz;https://sites.google.com/view/slhuang/home;https://www.isti.cnr.it/en/about/people-detail/169/Ercan_Engin_Kuruoglu;http://sunlab.org;;https://en.westlake.edu.cn/faculty/yefeng-zheng.html", "dblp": ";64/2243;;;;44/6510", "google_scholar": "kMlWwTAAAAAJ;;;9jmmp5sAAAAJ;https://scholar.google.com/citations?hl=zh-CN;vAIECxgAAAAJ", "orcid": ";;;0000-0003-1512-6426;;0000-0003-2195-2847", "linkedin": ";;;jimengsun/;;yefeng-zheng-bb45641/?originalSubdomain=cn", "or_profile": "~Zifeng_Wang3;~Shao-Lun_Huang3;~Ercan_Engin_Kuruoglu1;~Jimeng_Sun3;~Xi_Chen21;~Yefeng_Zheng2", "aff": "University of Illinois, Urbana Champaign;Tsinghua University;CNR;Georgia Institute of Technology;Tencent Content and Platform Group;Tencent Jarvis Lab", "aff_domain": "illinois.edu;tsinghua.edu.cn;isti.cnr.it;gatech.edu;tencent.com;tencent.com", "position": "PhD student;Associate Professor;Principal Researcher;Associate Professor;Researcher;Director", "bibtex": "@inproceedings{\nwang2022pacbayes,\ntitle={{PAC}-Bayes Information Bottleneck},\nauthor={Zifeng Wang and Shao-Lun Huang and Ercan Engin Kuruoglu and Jimeng Sun and Xi Chen and Yefeng Zheng},\nbooktitle={International 
Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iLHOIDsPv1P}\n}", "github": "", "project": "", "reviewers": "gEEc;HBHz;CyLM;F2VH", "pdf_size": 0, "recommendation": "6;6;8;10", "confidence": "3;3;2;3", "correctness": "3;3;4;4", "technical_novelty": "4;2;4;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "77;130;157;95", "wc_summary_review": "21;18;42;11", "wc_main_review": "806;135;211;194", "wc_review": "904;283;410;300", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1049;294;81;280", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.5, 1.6583123951777 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 114.75, 30.95460385790779 ], "wc_summary_review_avg": [ 23.0, 11.554220008291344 ], "wc_main_review_avg": [ 336.5, 272.52935621690375 ], "wc_review_avg": [ 474.25, 252.86001562129192 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 426.0, 369.4231990549592 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8594070314886177653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=iLHOIDsPv1P", "email": "illinois.edu;tsinghua.edu.cn;isti.cnr.it;gatech.edu;tencent.com;tencent.com", "author_num": 6, "aff_unique_index": "0;1;2;3;4;4", "aff_unique_norm": "University of Illinois Urbana-Champaign;Tsinghua University;Consiglio Nazionale delle Ricerche;Georgia Institute of Technology;Tencent", "aff_unique_dep": ";;;;Content and Platform Group", "aff_unique_url": "https://illinois.edu;https://www.tsinghua.edu.cn;https://www.cnr.it;https://www.gatech.edu;https://www.tencent.com", "aff_unique_abbr": "UIUC;THU;CNR;Georgia Tech;Tencent", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;1;2;0;1;1", "aff_country_unique": "United States;China;Italy" }, { "title": "Spike-inspired rank coding for fast and accurate recurrent neural networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6217", "id": "iMH1e5k7n3L", "poster": "", "openreview": "https://openreview.net/forum?id=iMH1e5k7n3L", "slides": "https://iclr.cc/virtual/2022/poster/6217", "video": "https://iclr.cc/virtual/2022/poster/6217", "author_site": "Alan Jeffares, Qinghai Guo, Pontus Stenetorp, Timoleon Moraitis", "tldr": "", "abstract": "Biological spiking neural networks (SNNs) can temporally encode information in their outputs, e.g. in the rank order in which neurons fire, whereas artificial neural networks (ANNs) conventionally do not. As a result, models of SNNs for neuromorphic computing are regarded as potentially more rapid and efficient than ANNs when dealing with temporal input. On the other hand, ANNs are simpler to train, and usually achieve superior performance. 
Here we show that temporal coding such as rank coding (RC) inspired by SNNs can also be applied to conventional ANNs such as LSTMs, and leads to computational savings and speedups.\nIn our RC for ANNs, we apply backpropagation through time using the standard real-valued activations, but only from a strategically early time step of each sequential input example, decided by a threshold-crossing event. Learning then also naturally incorporates when to produce an output, without other changes to the model or the algorithm. Both the forward and the backward training pass can be significantly shortened by skipping the remaining input sequence after that first event. RC-training also significantly reduces time-to-insight during inference, with a minimal decrease in accuracy. The desired speed-accuracy trade-off is tunable by varying the threshold or a regularization parameter that rewards output entropy. We demonstrate these benefits in two toy problems of sequence classification, and in a temporally-encoded MNIST dataset where our RC model achieves 99.19% accuracy after the first input time-step, outperforming the state of the art in temporal coding with SNNs, as well as in spoken-word classification of Google Speech Commands, outperforming non-RC-trained early inference with LSTMs.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c03cca126ba37b9d00278e5e9c2cc8f0d1497e1d.zip", "author": "Alan Jeffares;Qinghai Guo;Pontus Stenetorp;Timoleon Moraitis", "authorids": "~Alan_Jeffares1;~Qinghai_Guo1;~Pontus_Stenetorp1;~Timoleon_Moraitis1", "gender": ";M;Not Specified;M", "homepage": "https://alanjeffares.com;https://www.semanticscholar.org/author/Qinghai-Guo/47747957;https://pontus.stenetorp.se;https://www.tmoraitis.com", "dblp": "304/1985;12/8502;44/8358.html;", "google_scholar": "e65kJ08AAAAJ;;;https://scholar.google.ch/citations?user=w3KiO1MAAAAJ", "orcid": ";0000-0003-4697-9464;;0000-0002-6521-0717", "linkedin": "alanjeffares;;;timoleon-moraitis-56a81217/", "or_profile": "~Alan_Jeffares1;~Qinghai_Guo1;~Pontus_Stenetorp1;~Timoleon_Moraitis1", "aff": "University of Cambridge;Huawei Technologies Ltd.;University College London;Huawei Technologies Ltd.", "aff_domain": "cam.ac.uk;huawei.com;ucl.ac.uk;huawei.com", "position": "PhD student;Researcher;Associate Professor;Researcher", "bibtex": "@inproceedings{\njeffares2022spikeinspired,\ntitle={Spike-inspired rank coding for fast and accurate recurrent neural networks},\nauthor={Alan Jeffares and Qinghai Guo and Pontus Stenetorp and Timoleon Moraitis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iMH1e5k7n3L}\n}", "github": "", "project": "", "reviewers": "c3tt;zTtT;dNmj", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;3", "correctness": "3;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;0", "wc_summary_paper": "20;135;119", "wc_summary_review": "106;8;53", "wc_main_review": "265;174;327", "wc_review": "391;317;499", "wc_reply_reviewers": "36;179;30", "wc_reply_authors": "647;1955;753", "reply_reviewers": "1;3;1", "reply_authors": "2;6;3", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 91.33333333333333, 50.86146762421321 ], "wc_summary_review_avg": [ 55.666666666666664, 40.052743004970615 ],
"wc_main_review_avg": [ 255.33333333333334, 62.83488061755367 ], "wc_review_avg": [ 402.3333333333333, 74.73211417382015 ], "wc_reply_reviewers_avg": [ 81.66666666666667, 68.86863501543274 ], "wc_reply_authors_avg": [ 1118.3333333333333, 593.1932418885284 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 3.6666666666666665, 1.699673171197595 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11265212199264538823&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=iMH1e5k7n3L", "email": "cam.ac.uk;huawei.com;ucl.ac.uk;huawei.com", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Cambridge;Huawei;University College London", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.cam.ac.uk;https://www.huawei.com;https://www.ucl.ac.uk", "aff_unique_abbr": "Cambridge;Huawei;UCL", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United Kingdom;China" }, { "title": "MT3: Multi-Task Multitrack Music Transcription", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6969", "id": "iMSjopcOn0p", "poster": "", "openreview": "https://openreview.net/forum?id=iMSjopcOn0p", "slides": "https://iclr.cc/virtual/2022/poster/6969", "video": "https://iclr.cc/virtual/2022/poster/6969", "author_site": "Josh Gardner, Ian Simon, Ethan Manilow, Curtis Hawthorne, Jesse Engel", "tldr": "", "abstract": "Automatic Music Transcription (AMT), inferring musical notes from raw audio, is a challenging task at the core of music understanding. Unlike Automatic Speech Recognition (ASR), which typically focuses on the words of a single speaker, AMT often requires transcribing multiple instruments simultaneously, all while preserving fine-scale pitch and timing information. Further, many AMT datasets are ``low-resource'', as even expert musicians find music transcription difficult and time-consuming. Thus, prior work has focused on task-specific architectures, tailored to the individual instruments of each task. In this work, motivated by the promising results of sequence-to-sequence transfer learning for low-resource Natural Language Processing (NLP), we demonstrate that a general-purpose Transformer model can perform multi-task AMT, jointly transcribing arbitrary combinations of musical instruments across several transcription datasets. We show this unified training framework achieves high-quality transcription results across a range of datasets, dramatically improving performance for low-resource instruments (such as guitar), while preserving strong performance for abundant instruments (such as piano). 
Finally, by expanding the scope of AMT, we expose the need for more consistent evaluation metrics and better dataset alignment, and provide a strong baseline for this new direction of multi-task AMT.", "keywords": "music transcription;transformer;multi-task learning;low resource learning;music understanding;music information retrieval", "primary_area": "", "supplementary_material": "", "author": "Joshua P Gardner;Ian Simon;Ethan Manilow;Curtis Hawthorne;Jesse Engel", "authorids": "~Joshua_P_Gardner1;~Ian_Simon1;~Ethan_Manilow1;~Curtis_Hawthorne1;~Jesse_Engel1", "gender": ";M;;M;M", "homepage": ";http://iansimon.org/;https://ethman.github.io/;https://g.co/magenta;", "dblp": ";33/2787;210/6197;207/8244;", "google_scholar": ";pKqwl3wAAAAJ;;https://scholar.google.fr/citations?user=9ziPoxAAAAAJ;Sc7qOfcAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Joshua_P_Gardner1;~Ian_Simon1;~Ethan_Manilow1;~Curtis_Hawthorne1;~Jesse_Engel1", "aff": ";Google;Northwestern University;Google;Google Brain", "aff_domain": ";google.com;u.northwestern.edu;google.com;google.com", "position": ";Software Engineer;PhD student;Software Engineer;Research Scientist", "bibtex": "@inproceedings{\ngardner2022mt,\ntitle={{MT}3: Multi-Task Multitrack Music Transcription},\nauthor={Joshua P Gardner and Ian Simon and Ethan Manilow and Curtis Hawthorne and Jesse Engel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iMSjopcOn0p}\n}", "github": "", "project": "", "reviewers": "RijS;zmLf;vgm4;4Man", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;5;3;4", "correctness": "4;4;3;3", "technical_novelty": "4;2;3;3", "empirical_novelty": "4;4;3;4", "wc_summary_paper": "37;132;135;64", "wc_summary_review": "16;69;26;87", "wc_main_review": "152;411;149;715", "wc_review": "205;612;310;866", "wc_reply_reviewers": "0;0;0;168", "wc_reply_authors": "637;337;230;557", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.0, 42.59694824749773 ], "wc_summary_review_avg": [ 49.5, 29.415132160165456 ], "wc_main_review_avg": [ 356.75, 232.57727210542305 ], "wc_review_avg": [ 498.25, 259.62123853799017 ], "wc_reply_reviewers_avg": [ 42.0, 72.74613391789285 ], "wc_reply_authors_avg": [ 440.25, 163.71220937975275 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4757063593798788847&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=iMSjopcOn0p", "email": ";google.com;u.northwestern.edu;google.com;google.com", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;Northwestern University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.northwestern.edu", "aff_unique_abbr": "Google;NU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Augmented Sliced Wasserstein Distances", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6818", "id": "iMqTLyfwnOO", "poster": "", "openreview": "https://openreview.net/forum?id=iMqTLyfwnOO", "slides": "https://iclr.cc/virtual/2022/poster/6818", "video": "https://iclr.cc/virtual/2022/poster/6818", "author_site": "Xiongjie Chen, Yongxin Yang, Yunpeng Li", "tldr": "", "abstract": "While theoretically appealing, the application of the Wasserstein distance to large-scale machine learning problems has been hampered by its prohibitive computational cost. The sliced Wasserstein distance and its variants improve the computational efficiency through the random projection, yet they suffer from low accuracy if the number of projections is not sufficiently large, because the majority of projections result in trivially small values. In this work, we propose a new family of distance metrics, called augmented sliced Wasserstein distances (ASWDs), constructed by first mapping samples to higher-dimensional hypersurfaces parameterized by neural networks. It is derived from a key observation that (random) linear projections of samples residing on these hypersurfaces would translate to much more flexible nonlinear projections in the original sample space, so they can capture complex structures of the data distribution. We show that the hypersurfaces can be optimized by gradient ascent efficiently. We provide the condition under which the ASWD is a valid metric and show that this can be obtained by an injective neural network architecture. Numerical results demonstrate that the ASWD significantly outperforms other Wasserstein variants for both synthetic and real-world problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiongjie Chen;Yongxin Yang;Yunpeng Li", "authorids": "~Xiongjie_Chen1;~Yongxin_Yang1;~Yunpeng_Li1", "gender": "M;;M", "homepage": ";https://www.kcl.ac.uk/people/yunpeng-li;", "dblp": "237/8837;;150/4258", "google_scholar": "Tb9fTOsAAAAJ;JzyKdRUAAAAJ;https://scholar.google.co.uk/citations?user=F7PtrL8AAAAJ", "orcid": ";0000-0003-4798-541X;", "linkedin": ";;", "or_profile": "~Xiongjie_Chen1;~Yunpeng_Li1;~Yongxin_Yang3", "aff": "University of Surrey;University of Surrey;Queen Mary University of London", "aff_domain": "surrey.ac.uk;surrey.ac.uk;qmul.ac.uk", "position": "PhD student;Lecturer;Assistant Professor", "bibtex": "@inproceedings{\nchen2022augmented,\ntitle={Augmented Sliced Wasserstein Distances},\nauthor={Xiongjie Chen and Yongxin Yang and Yunpeng Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iMqTLyfwnOO}\n}", "github": "", "project": "", "reviewers": "6LWm;1Q3z;RRsR;ViRm", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;4;3", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "58;121;107;76", "wc_summary_review": "34;67;109;50", "wc_main_review": "252;602;354;118", "wc_review": "344;790;570;244", "wc_reply_reviewers": "0;124;13;0", "wc_reply_authors": "1227;1309;1440;1096", "reply_reviewers": "0;2;1;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 90.5, 24.84451649760969 ], "wc_summary_review_avg": [ 65.0, 27.955321496988727 ], "wc_main_review_avg": [ 331.5, 177.18563711542762 ], "wc_review_avg": [ 487.0, 211.06634028191232 ], 
"wc_reply_reviewers_avg": [ 34.25, 52.08826643304613 ], "wc_reply_authors_avg": [ 1268.0, 125.02999640086374 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=955715037092022915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=iMqTLyfwnOO", "email": "surrey.ac.uk;surrey.ac.uk;qmul.ac.uk", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Surrey;Queen Mary University of London", "aff_unique_dep": ";", "aff_unique_url": "https://www.surrey.ac.uk;https://www.qmul.ac.uk", "aff_unique_abbr": "Surrey;QMUL", "aff_campus_unique_index": "1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Phase Collapse in Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6415", "id": "iPHLcmtietq", "poster": "", "openreview": "https://openreview.net/forum?id=iPHLcmtietq", "slides": "https://iclr.cc/virtual/2022/poster/6415", "video": "https://iclr.cc/virtual/2022/poster/6415", "author_site": "Florentin Guth, John Zarka, St\u00e9phane Mallat", "tldr": "", "abstract": "Deep convolutional classifiers linearly separate image classes and improve accuracy as depth increases. They progressively reduce the spatial dimension whereas the number of channels grows with depth. Spatial variability is therefore transformed into variability along channels. A fundamental challenge is to understand the role of non-linearities together with convolutional filters in this transformation. ReLUs with biases are often interpreted as thresholding operators that improve discrimination through sparsity. This paper demonstrates that it is a different mechanism called \\emph{phase collapse} which eliminates spatial variability while linearly separating classes. We show that collapsing the phases of complex wavelet coefficients is sufficient to reach the classification accuracy of ResNets of similar depths. However, replacing the phase collapses with thresholding operators that enforce sparsity considerably degrades the performance. 
We explain these numerical results by showing that the iteration of phase collapses progressively improves separation of classes, as opposed to thresholding non-linearities.", "keywords": "phase collapse;neural collapse;concentration;classification;imagenet;deep networks;complex networks;sparsity in deep networks", "primary_area": "", "supplementary_material": "/attachment/1b7e418c72c48ae40cb2086fe8c0d683b5a117e2.zip", "author": "Florentin Guth;John Zarka;St\u00e9phane Mallat", "authorids": "~Florentin_Guth1;~John_Zarka1;~St\u00e9phane_Mallat1", "gender": ";;M", "homepage": ";;https://www.di.ens.fr/~mallat/", "dblp": "223/6081;;61/3978", "google_scholar": "opC_fpQAAAAJ;;https://scholar.google.com.tw/citations?user=g_YTmSgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Florentin_Guth1;~John_Zarka1;~St\u00e9phane_Mallat1", "aff": "Ecole Normale Sup\u00e9rieure;Ecole Normale Superieure;", "aff_domain": "ens.fr;ens.fr;", "position": "PhD student;PhD student;", "bibtex": "@inproceedings{\nguth2022phase,\ntitle={Phase Collapse in Neural Networks},\nauthor={Florentin Guth and John Zarka and St{\\'e}phane Mallat},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iPHLcmtietq}\n}", "github": "", "project": "", "reviewers": "oXQJ;GfRM;3Vif;LHxR", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "2;4;4;3", "correctness": "4;3;3;3", "technical_novelty": "3;3;4;2", "empirical_novelty": "3;3;4;2", "wc_summary_paper": "71;77;137;112", "wc_summary_review": "104;202;55;61", "wc_main_review": "199;351;783;353", "wc_review": "374;630;975;526", "wc_reply_reviewers": "0;21;0;82", "wc_reply_authors": "443;434;362;240", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 99.25, 26.836309358777335 ], "wc_summary_review_avg": [ 105.5, 58.83238903869194 ], "wc_main_review_avg": [ 421.5, 217.8594730554538 ], "wc_review_avg": [ 626.25, 220.97553597627046 ], "wc_reply_reviewers_avg": [ 25.75, 33.58850249713434 ], "wc_reply_authors_avg": [ 369.75, 81.2230724609701 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.30151134457776363, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2536829200192569115&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=iPHLcmtietq", "email": "ens.fr;ens.fr;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;Ecole Normale Superieure", "aff_unique_dep": ";", "aff_unique_url": "https://www.ens.fr;https://www.ens.fr", "aff_unique_abbr": "ENS;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "DISCOVERING AND EXPLAINING THE REPRESENTATION BOTTLENECK OF DNNS", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6622", "id": "iRCUlgmdfHJ", "poster": "", "openreview": "https://openreview.net/forum?id=iRCUlgmdfHJ", "slides": "https://iclr.cc/virtual/2022/poster/6622", "video": "https://iclr.cc/virtual/2022/poster/6622", 
"author_site": "Huiqi Deng, Qihan Ren, Hao Zhang, Quanshi Zhang", "tldr": "", "abstract": "This paper explores the bottleneck of feature representations of deep neural networks (DNNs), from the perspective of the complexity of interactions between input variables encoded in DNNs. To this end, we focus on the multi-order interaction between input variables, where the order represents the complexity of interactions. We discover that a DNN is more likely to encode both too simple and too complex interactions, but usually fails to learn interactions of intermediate complexity. Such a phenomenon is widely shared by different DNNs for different tasks. This phenomenon indicates a cognition gap between DNNs and humans, and we call it a representation bottleneck. We theoretically prove the underlying reason for the representation bottleneck. Furthermore, we propose losses to encourage/penalize the learning of interactions of specific complexities, and analyze the representation capacities of interactions of different complexities. The code is available at https://github.com/Nebularaid2000/bottleneck.", "keywords": "representation bottleneck;representation ability;interaction;explanation", "primary_area": "", "supplementary_material": "", "author": "Huiqi Deng;Qihan Ren;Hao Zhang;Quanshi Zhang", "authorids": "~Huiqi_Deng1;~Qihan_Ren1;~Hao_Zhang22;~Quanshi_Zhang1", "gender": "F;M;M;M", "homepage": ";https://nebularaid2000.github.io/;https://haozhang37.github.io;http://qszhang.com", "dblp": "229/1317;268/5838;55/2270-63;http://dblp.uni-trier.de/pers/hd/z/Zhang:Quanshi", "google_scholar": "QEjqzXgAAAAJ;ybTy_DwAAAAJ;3g6LlgwAAAAJ;iFFhHK0AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Huiqi_Deng1;~Qihan_Ren1;~Hao_Zhang22;~Quanshi_Zhang1", "aff": "Shanghai jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "Postdoc;Undergrad student;MS student;Associate Professor", "bibtex": "@inproceedings{\ndeng2022discovering,\ntitle={{DISCOVERING} {AND} {EXPLAINING} {THE} {REPRESENTATION} {BOTTLENECK} {OF} {DNNS}},\nauthor={Huiqi Deng and Qihan Ren and Hao Zhang and Quanshi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iRCUlgmdfHJ}\n}", "github": "", "project": "", "reviewers": "FMrV;4jZH;cEoR;t6sY", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "4;3;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "104;68;41;115", "wc_summary_review": "64;21;42;44", "wc_main_review": "332;500;231;365", "wc_review": "500;589;314;524", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.0, 29.368350311176826 ], "wc_summary_review_avg": [ 42.75, 15.22128443988877 ], "wc_main_review_avg": [ 357.0, 96.19511422104556 ], "wc_review_avg": [ 481.75, 102.17723572303177 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, 
"corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6321522337570019810&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=iRCUlgmdfHJ", "email": "edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "iUt2KYdXBDD", "title": "Value Refinement Network (VRN)", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Sparse rewards and long decision horizons make agent navigation tasks difficult to solve via reinforcement learning (RL) such as (deep) Q-learning. Previous work has shown that some of these tasks are efficiently solvable by value-based planning in a state space abstraction, which defines sub-goals for a policy in the original state space. However, value-based planning scales poorly with the number of state space dimensions. Consequently, the planning might not be able to consider all the state information, like other agents' behaviors. Combining the benefits of planning and learning values, we propose the Value Refinement Network (VRN), an architecture that locally refines a plan in a (simpler) state space abstraction, represented by a pre-computed value function, with respect to the full agent state. Training the VRN via RL, it can learn how to correct this initial plan effectively to solve tasks that otherwise would require a prohibitively large abstraction. Evaluating on several simulated agent navigation tasks, we demonstrate the benefits of our VRN: We show that it can successfully refine shortest path plans to match the performance of value iteration in a more complex state space. 
Furthermore, in vehicle parking tasks where considering all relevant state space dimensions in planning is infeasible, the VRN still enables high task completion rates.", "keywords": "reinforcement learning;learning to plan;navigation", "primary_area": "", "supplementary_material": "", "author": "Jan W\u00f6hlke;Felix Schmitt;Herke van Hoof", "authorids": "~Jan_W\u00f6hlke1;~Felix_Schmitt1;~Herke_van_Hoof4", "gender": "M;M;M", "homepage": ";https://www.bosch-ai.com/about-us/our-people/;https://staff.fnwi.uva.nl/h.c.vanhoof/", "dblp": "223/4463;17/7452;123/6759", "google_scholar": ";;https://scholar.google.ca/citations?user=9owUkLYAAAAJ", "orcid": "0000-0003-4702-1768;0000-0001-5451-8233;", "linkedin": ";felix-schmitt-341777135;", "or_profile": "~Jan_W\u00f6hlke1;~Felix_Schmitt1;~Herke_van_Hoof4", "aff": "University of Amsterdam;Bosch Center for Artificial Intelligence;University of Amsterdam", "aff_domain": "uva.nl;bosch.com;uva.nl", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@misc{\nw{\\\"o}hlke2022value,\ntitle={Value Refinement Network ({VRN})},\nauthor={Jan W{\\\"o}hlke and Felix Schmitt and Herke van Hoof},\nyear={2022},\nurl={https://openreview.net/forum?id=iUt2KYdXBDD}\n}", "github": "", "project": "", "reviewers": "dNKt;tfDa;kHgP;sZG3", "site": "https://openreview.net/forum?id=iUt2KYdXBDD", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;4", "correctness": "3;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "112;57;141;78", "wc_summary_review": "64;58;135;110", "wc_main_review": "219;407;678;220", "wc_review": "395;522;954;408", "wc_reply_reviewers": "216;356;690;0", "wc_reply_authors": "573;662;1168;492", "reply_reviewers": "1;2;2;0", "reply_authors": "2;2;4;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 97.0, 32.101401838549044 ], "wc_summary_review_avg": [ 91.75, 32.06536293261001 ], "wc_main_review_avg": [ 381.0, 187.78311958213922 ], "wc_review_avg": [ 569.75, 227.28217593995356 ], "wc_reply_reviewers_avg": [ 315.5, 250.66461656963074 ], "wc_reply_authors_avg": [ 723.75, 263.44105128092696 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16618096332073094404&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Amsterdam;Bosch Center for Artificial Intelligence", "aff_unique_dep": ";Center for Artificial Intelligence", "aff_unique_url": "https://www.uva.nl;https://www.bosch-ai.com", "aff_unique_abbr": "UvA;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Netherlands;Germany" }, { "title": "StyleNeRF: A Style-based 3D Aware Generator for High-resolution Image Synthesis", "status": "Poster", "track": "main", "site":
"https://iclr.cc/virtual/2022/poster/6113", "id": "iUuzzTMUw9K", "poster": "", "openreview": "https://openreview.net/forum?id=iUuzzTMUw9K", "slides": "https://iclr.cc/virtual/2022/poster/6113", "video": "https://iclr.cc/virtual/2022/poster/6113", "author_site": "Jiatao Gu, Lingjie Liu, Peng Wang, Christian Theobalt", "tldr": "", "abstract": "We propose StyleNeRF, a 3D-aware generative model for photo-realistic high-resolution image synthesis with high multi-view consistency, which can be trained on unstructured 2D images. Existing approaches either cannot synthesize high-resolution images with fine details or yield clearly noticeable 3D-inconsistent artifacts. In addition, many of them lack control on style attributes and explicit 3D camera poses. To address these issues, StyleNeRF integrates the neural radiance field (NeRF) into a style-based generator to tackle the aforementioned challenges, i.e., improving rendering efficiency and 3D consistency for high-resolution image generation. To address the first issue, we perform volume rendering only to produce a low-resolution feature map, and progressively apply upsampling in 2D. To mitigate the inconsistencies caused by 2D upsampling, we propose multiple designs including a better upsampler choice and a new regularization loss to enforce 3D consistency. With these designs, StyleNeRF is able to synthesize high-resolution images at interactive rates while preserving 3D consistency at high quality. StyleNeRF also enables control of camera poses and different levels of styles, which can generalize to unseen views. It also supports challenging tasks such as style mixing, inversion and simple semantic edits. \n", "keywords": "Neural Radiance Field;StyleGAN;high resolution image generation", "primary_area": "", "supplementary_material": "/attachment/dce26819b324a6fdf2e161a5fcd4982721e7cb9a.zip", "author": "Jiatao Gu;Lingjie Liu;Peng Wang;Christian Theobalt", "authorids": "~Jiatao_Gu1;~Lingjie_Liu1;~Peng_Wang17;~Christian_Theobalt2", "gender": "M;F;M;M", "homepage": "http://jiataogu.me;https://lingjie0206.github.io/;https://totoro97.github.io;https://www.mpi-inf.mpg.de/~theobalt/", "dblp": "164/5848.html;204/0052;95/4442-99;55/3346", "google_scholar": "https://scholar.google.com.sg/citations?user=cB1mFBsAAAAJ;https://scholar.google.de/citations?user=HZPnJ9gAAAAJ;KvXvmawAAAAJ;https://scholar.google.com.tw/citations?user=eIWg8NMAAAAJ", "orcid": ";;;", "linkedin": "jiatao-gu-204b2672/;;;", "or_profile": "~Jiatao_Gu1;~Lingjie_Liu1;~Peng_Wang17;~Christian_Theobalt2", "aff": "Meta;Saarland Informatics Campus, Max-Planck Institute;The University of Hong Kong;Max-Planck-Institute for Informatics, Saarland Informatics Campus", "aff_domain": "fb.com;mpi-inf.mpg.de;hku.hk;mpi-inf.mpg.de", "position": "Researcher;Postdoc;PhD student;Director", "bibtex": "@inproceedings{\ngu2022stylenerf,\ntitle={StyleNe{RF}: A Style-based 3D Aware Generator for High-resolution Image Synthesis},\nauthor={Jiatao Gu and Lingjie Liu and Peng Wang and Christian Theobalt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iUuzzTMUw9K}\n}", "github": "", "project": "", "reviewers": "gKfo;TLAR;MWb6;Qm5b", "pdf_size": 0, "recommendation": "6;6;8;10", "confidence": "5;4;5;3", "correctness": "4;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "48;67;77;109", "wc_summary_review": "46;34;17;73", "wc_main_review": "580;226;217;245", "wc_review": "674;327;311;427", 
"wc_reply_reviewers": "0;0;20;0", "wc_reply_authors": "355;477;97;293", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 1.6583123951777 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 75.25, 22.094965489902897 ], "wc_summary_review_avg": [ 42.5, 20.402205763103165 ], "wc_main_review_avg": [ 317.0, 152.179170716626 ], "wc_review_avg": [ 434.75, 145.10750325189943 ], "wc_reply_reviewers_avg": [ 5.0, 8.660254037844387 ], "wc_reply_authors_avg": [ 305.5, 137.3781278078865 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6363636363636364, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 653, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14395986569552995803&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=iUuzzTMUw9K", "email": "fb.com;mpi-inf.mpg.de;hku.hk;mpi-inf.mpg.de", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Meta;Max-Planck Institute;University of Hong Kong;Max-Planck-Institute for Informatics", "aff_unique_dep": "Meta Platforms, Inc.;Informatics;;", "aff_unique_url": "https://meta.com;https://www.mpi-sws.org;https://www.hku.hk;https://mpi-inf.mpg.de", "aff_unique_abbr": "Meta;MPI-SWS;HKU;MPII", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Saarland;Hong Kong SAR", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "United States;Germany;China" }, { "id": "iaqgio-pOv", "title": "Analogies and Feature Attributions for Model Agnostic Explanation of Similarity Learners", "track": "main", "status": "Reject", "tldr": "", "abstract": "Post-hoc explanations for black box models have been studied extensively in classification and regression settings. However, explanations for models that output similarity between two inputs have received comparatively lesser attention. In this paper, we provide model agnostic local explanations for similarity learners applicable to tabular and text data. We first propose a method that provides feature attributions to explain the similarity between a pair of inputs as determined by a black box similarity learner. We then propose analogies as a new form of explanation in machine learning. Here the goal is to identify diverse analogous pairs of examples that share the same level of similarity as the input pair and provide insight into (latent) factors underlying the model's prediction. The selection of analogies can optionally leverage feature attributions, thus connecting the two forms of explanation while still maintaining complementarity. We prove that our analogy objective function is submodular, making the search for good-quality analogies efficient. We apply the proposed approaches to explain similarities between sentences as predicted by a state-of-the-art sentence encoder, and between patients in a healthcare utilization application. 
Efficacy is measured through quantitative evaluations, a careful user study, and examples of explanations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Karthikeyan Natesan Ramamurthy;Amit Dhurandhar;Dennis Wei;Zaid Bin Tariq", "authorids": "~Karthikeyan_Natesan_Ramamurthy1;~Amit_Dhurandhar1;~Dennis_Wei1;~Zaid_Bin_Tariq1", "gender": ";M;M;M", "homepage": "https://nrkarthikeyan.github.io/;https://researcher.watson.ibm.com/researcher/view.php?person=us-adhuran;https://sites.google.com/site/dennislwei/;https://www.linkedin.com/in/zaid-bin-tariq/", "dblp": "58/7800;66/3289;59/8761;", "google_scholar": "mG8HuhEAAAAJ;km9vIPEAAAAJ;r4ldy4AAAAAJ;", "orcid": "0000-0002-6021-5930;;;", "linkedin": ";;dennis-wei-4886036b/;", "or_profile": "~Karthikeyan_Natesan_Ramamurthy1;~Amit_Dhurandhar1;~Dennis_Wei1;~Zaid_Bin_Tariq1", "aff": "International Business Machines;International Business Machines;International Business Machines;University of Texas, Dallas", "aff_domain": "ibm.com;ibm.com;ibm.com;utdallas.edu", "position": "Research Staff Member;Principal Researcher;Research Staff Member;PhD student", "bibtex": "@misc{\nramamurthy2022analogies,\ntitle={Analogies and Feature Attributions for Model Agnostic Explanation of Similarity Learners},\nauthor={Karthikeyan Natesan Ramamurthy and Amit Dhurandhar and Dennis Wei and Zaid Bin Tariq},\nyear={2022},\nurl={https://openreview.net/forum?id=iaqgio-pOv}\n}", "github": "", "project": "", "reviewers": "UBTb;Un2j;F17R;QDef;Lu2q", "site": "https://openreview.net/forum?id=iaqgio-pOv", "pdf_size": 0, "recommendation": "1;5;6;6;6", "confidence": "5;4;3;3;3", "correctness": "2;3;3;3;3", "technical_novelty": "2;2;3;2;2", "empirical_novelty": "1;2;3;2;3", "wc_summary_paper": "164;253;102;77;100", "wc_summary_review": "77;59;21;417;12", "wc_main_review": "652;375;193;121;348", "wc_review": "893;687;316;615;460", "wc_reply_reviewers": "890;126;0;415;0", "wc_reply_authors": "1788;2021;246;2435;557", "reply_reviewers": "2;1;0;2;0", "reply_authors": "3;5;1;5;1", "recommendation_avg": [ 4.8, 1.9390719429665317 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 139.2, 63.80407510496489 ], "wc_summary_review_avg": [ 117.2, 151.79248993280268 ], "wc_main_review_avg": [ 337.8, 183.40054525545992 ], "wc_review_avg": [ 594.2, 196.84044299889186 ], "wc_reply_reviewers_avg": [ 286.2, 337.8309636489823 ], "wc_reply_authors_avg": [ 1409.4, 854.3223279301554 ], "reply_reviewers_avg": [ 1.0, 0.8944271909999159 ], "reply_authors_avg": [ 3.0, 1.7888543819998317 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9540646527893837, "corr_recommendation_correctness": 0.9798501839458537, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11658492930130642341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "International Business Machines Corporation;University of Texas at Dallas", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.utdallas.edu", "aff_unique_abbr": "IBM;UT Dallas", "aff_campus_unique_index": "1", "aff_campus_unique": ";Dallas", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "iaxWbVx-CG_", "title": "Hierarchical Cross Contrastive Learning of Visual Representations", "track": "main", "status": 
"Reject", "tldr": "", "abstract": "The rapid progress of self-supervised learning (SSL) has greatly reduced the labeling cost in computer vision. The key idea of SSL is to learn invariant visual representations by maximizing the similarity between different views of the same input image. In most SSL methods, the representation invariant is measured by a contrastive loss which compares one of the network outputs after the projection head to its augmented version. Albeit being effective, this approach overlooks the information containing in the hidden layer of the projection head therefore could be sub-optimal. In this work, we propose a novel approach termed Hierarchical Cross Contrastive Learning(HCCL) to further distill the information mismatched by the conventional contrastive loss. The HCCL uses a hierarchical projection head to project the raw representations of the backbone into multiple latent spaces and then compares latent features across different levels and different views. By cross-level contrastive learning, HCCL not only regulates invariant on multiple hidden levels but also crosses different levels, improving the generalization ability of the learned visual representations. As a simple and generic method, HCCL can be applied to different SSL frameworks. We validate the efficacy of HCCL under classification, detection, segmentation, and few-shot learning tasks. Extensive experimental results show that HCCL outperforms most previous methods in various benchmark datasets.", "keywords": "Self-supervised Learning;Unsupervised Learning;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Hesen Chen;Ming Lin;Xiuyu Sun;Rong Jin", "authorids": "~Hesen_Chen1;~Ming_Lin4;~Xiuyu_Sun1;~Rong_Jin1", "gender": "M;M;M;M", "homepage": ";https://minglin-home.github.io/;https://sites.google.com/view/sunxiuyu/home;https://www.cse.msu.edu/~rongjin/", "dblp": "211/4075.html;;40/8845;j/RongJin", "google_scholar": "75v6J-cAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;", "orcid": "0009-0009-3434-4809;;0000-0002-7208-8078;", "linkedin": ";;;", "or_profile": "~Hesen_Chen1;~Ming_Lin4;~Xiuyu_Sun1;~Rong_Jin3", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Algorithm Engineer;Staff Algorithm Engineer;Researcher", "bibtex": "@misc{\nchen2022hierarchical,\ntitle={Hierarchical Cross Contrastive Learning of Visual Representations},\nauthor={Hesen Chen and Ming Lin and Xiuyu Sun and Rong Jin},\nyear={2022},\nurl={https://openreview.net/forum?id=iaxWbVx-CG_}\n}", "github": "", "project": "", "reviewers": "NpsJ;mW6T;TpLR;RrYz", "site": "https://openreview.net/forum?id=iaxWbVx-CG_", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;5;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "61;94;57;80", "wc_summary_review": "25;69;49;82", "wc_main_review": "218;434;252;161", "wc_review": "304;597;358;323", "wc_reply_reviewers": "0;474;0;7", "wc_reply_authors": "470;824;172;430", "reply_reviewers": "0;3;0;1", "reply_authors": "1;3;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.0, 14.916433890176299 ], 
"wc_summary_review_avg": [ 56.25, 21.533404282648853 ], "wc_main_review_avg": [ 266.25, 102.16255429461422 ], "wc_review_avg": [ 395.5, 117.93748343932052 ], "wc_reply_reviewers_avg": [ 120.25, 204.2576497955462 ], "wc_reply_authors_avg": [ 474.0, 232.1938845017241 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4722840885469417179&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "ib8vMnQPQ2", "title": "PIM-QAT: Neural Network Quantization For Processing-In-Memory (PIM) Systems", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Processing-in-memory (PIM), an increasingly studied neuromorphic hardware, promises orders of energy and throughput improvements for deep learning inference. Leveraging the massively parallel and ef\ufb01cient analog computing inside memories, PIM circumvents the bottlenecks of data movements in conventional digital hardware. However, an extra quantization step (i.e. PIM quantization), typically with limited resolution due to hardware constraints, is required to convert the analog computing results into digital domain. Meanwhile, non-ideal effects extensively exist in PIM quantization because of the imperfect analog-to-digital interface, which further compromises the inference accuracy. Due to hardware limitations, PIM systems decompose the bulky matrix multiplication into smaller subsets, making the computing \ufb02ow fundamentally different from the conventionally quantized models. In this paper, we propose a method for training quantized networks to incorporate PIM quantization, which is ubiquitous to all PIM systems. Speci\ufb01cally, we propose a PIM quantization aware training (PIM-QAT) algorithm, and introduce rescaling techniques during backward and forward propagation by analyzing the training dynamics to facilitate training convergence. We also propose two techniques, namely batch normalization (BN) calibration and adjusted precision training, to suppress the adverse effects of non-ideal linearity and stochastic thermal noise involved in real PIM chips. Our method is validated on three mainstream PIM decomposition schemes, and physically on a prototype chip. Comparing with directly deploying conventionally trained quantized model on PIM systems, which does not take into account this extra quantization step and thus fails, our method provides signi\ufb01cant improvement. 
It also achieves inference accuracy on PIM systems comparable to that of conventionally quantized models on digital hardware, across CIFAR10 and CIFAR100 datasets using various network depths for the most popular network topology.", "keywords": "Processing In-Memory System;Neuromorphic System;Deep Learning;Training Dynamics;Neural Network Quantization", "primary_area": "", "supplementary_material": "", "author": "Qing Jin;Zhiyu Chen;Jian Ren;Yanyu Li;Yanzhi Wang;Kaiyuan Yang", "authorids": "~Qing_Jin1;~Zhiyu_Chen3;~Jian_Ren2;~Yanyu_Li1;~Yanzhi_Wang3;~Kaiyuan_Yang1", "gender": ";M;M;;M;", "homepage": ";;https://alanspike.github.io/;;https://web.northeastern.edu/yanzhiwang/;https://vlsi.rice.edu", "dblp": "37/11144;;59/2180-5;194/5818;;", "google_scholar": "X9iggBcAAAAJ;;https://scholar.google.co.jp/citations?user=vDALiU4AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0001-8795-9297;;;;;", "linkedin": ";zhiyu-chen-720445179/;;;;", "or_profile": "~Qing_Jin1;~Zhiyu_Chen3;~Jian_Ren2;~Yanyu_Li1;~Yanzhi_Wang3;~Kaiyuan_Yang1", "aff": "Northeastern University;Rice University;Snap Inc.;Northeastern University;Northeastern University;Rice University", "aff_domain": "northeastern.edu;rice.edu;snapchat.com;northeastern.edu;northeastern.edu;rice.edu", "position": "PhD Student;PhD student;Research Scientist;PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\njin2022pimqat,\ntitle={{PIM}-{QAT}: Neural Network Quantization For Processing-In-Memory ({PIM}) Systems},\nauthor={Qing Jin and Zhiyu Chen and Jian Ren and Yanyu Li and Yanzhi Wang and Kaiyuan Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=ib8vMnQPQ2}\n}", "github": "", "project": "", "reviewers": "mxGB;Durj;Lf4A", "site": "https://openreview.net/forum?id=ib8vMnQPQ2", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "112;97;28", "wc_summary_review": "79;64;37", "wc_main_review": "280;305;279", "wc_review": "471;466;344", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 79.0, 36.578682316343766 ], "wc_summary_review_avg": [ 60.0, 17.378147196982766 ], "wc_main_review_avg": [ 288.0, 12.027745701779143 ], "wc_review_avg": [ 427.0, 58.72534943843814 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12374291825007681697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;0;1", "aff_unique_norm": "Northeastern University;Rice University;Snap Inc.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northeastern.edu;https://www.rice.edu;https://www.snapinc.com", "aff_unique_abbr": "NEU;Rice;Snap", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "ibNr25jJrf", "title": "Direct Evolutionary Optimization of Variational Autoencoders
With Binary Latents", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many types of data are generated at least partly by discrete causes that are sparsely\nactive. To model such data, we here investigate a deep generative model in the\nform of a variational autoencoder (VAE) which can learn a sparse, binary code\nfor its latents. Because of the latents\u2019 discrete nature, standard VAE training is\nnot possible. The goal of previous approaches has therefore been to amend (i.e.,\ntypically anneal) discrete priors in order to train discrete VAEs analogously to\nconventional ones. Here, we divert much more strongly from conventional VAE\ntraining: We ask if it is also possible to keep the discrete nature of the latents\nfully intact by applying a direct, discrete optimization for the encoding model. In\ndoing so, we (1) sidestep standard VAE mechanisms such as sampling approximation, reparameterization trick and amortization, and (2) observe a much sparser\nencoding compared to autoencoders that use annealed discrete latents. Direct optimization of VAEs is enabled by an evolutionary algorithm in conjunction with\ntruncated posteriors as variational distributions, i.e. by a combination of methods\nwhich is here for the first time applied to a deep model. We first show how the discrete variational method (A) ties into gradient ascent for network weights, and how\nit (B) uses the decoder network to select binary latent states for training. Sparse\ncodes have prominently been applied to image patches, where latents encode edge-like structure. For our VAEs, we maintain this prototypical application domain\nand observe the emergence of much sparser codes compared to more conventional\nVAEs. To allow for a broad comparison to other approaches, the emerging encoding was then evaluated on denoising and inpainting tasks, which are canonically\nbenchmarks for image patch models. For datasets with many, large images of single objects (ImageNet, CIFAR etc) deep generative models with dense codes seem\npreferable. For image patches, however, we observed advantages of sparse codes\nthat give rise to state-of-the-art performance in \u2018zero-shot\u2019 denoising and inpainting benchmarks. 
Sparse codes can consequently make VAEs competitive on tasks where they have previously been outperformed by non-generative approaches.", "keywords": "variational optimization;variational autoencoders;denoising;inpainting;evolutionary algorithms", "primary_area": "", "supplementary_material": "/attachment/330637f31d878b7ec308b743031e0a71c046eeca.zip", "author": "Enrico Guiraud;Jakob Drefs;Filippos S Panagiotou;Jorg Lucke", "authorids": "~Enrico_Guiraud1;~Jakob_Drefs1;~Filippos_S_Panagiotou1;~Jorg_Lucke1", "gender": "M;M;;M", "homepage": ";https://uol.de/en/machine-learning/;;http://uol.de/ml", "dblp": ";;;http://dblp.uni-trier.de/pers/hd/l/L=uuml=cke:J=ouml=rg", "google_scholar": ";;;h-NXaIsAAAAJ", "orcid": ";;;", "linkedin": ";;filippos-panagiotou/;", "or_profile": "~Enrico_Guiraud1;~Jakob_Drefs1;~Filippos_S_Panagiotou1;~Jorg_Lucke1", "aff": ";Carl von Ossietzky Universit\u00e4t Oldenburg;Carl von Ossietzky Universit\u00e4t Oldenburg;University of Oldenburg", "aff_domain": ";uol.de;uol.de;uni-oldenburg.de", "position": ";PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nguiraud2022direct,\ntitle={Direct Evolutionary Optimization of Variational Autoencoders With Binary Latents},\nauthor={Enrico Guiraud and Jakob Drefs and Filippos S Panagiotou and Jorg Lucke},\nyear={2022},\nurl={https://openreview.net/forum?id=ibNr25jJrf}\n}", "github": "", "project": "", "reviewers": "DBE8;hasK;XRYc;ALx1;TZnV", "site": "https://openreview.net/forum?id=ibNr25jJrf", "pdf_size": 0, "recommendation": "5;5;6;8;8", "confidence": "4;4;2;4;3", "correctness": "3;4;3;4;4", "technical_novelty": "2;3;3;4;3", "empirical_novelty": "2;3;2;4;3", "wc_summary_paper": "40;86;37;50;153", "wc_summary_review": "92;30;15;74;13", "wc_main_review": "417;355;85;519;174", "wc_review": "549;471;137;643;340", "wc_reply_reviewers": "90;0;0;22;0", "wc_reply_authors": "306;608;153;311;189", "reply_reviewers": "1;0;0;1;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 6.4, 1.3564659966250536 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 73.2, 43.54951205237551 ], "wc_summary_review_avg": [ 44.8, 32.24530973645625 ], "wc_main_review_avg": [ 310.0, 158.91884721454534 ], "wc_review_avg": [ 428.0, 176.14766532656628 ], "wc_reply_reviewers_avg": [ 22.4, 34.857423886454946 ], "wc_reply_authors_avg": [ 313.4, 160.03324654583497 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.1474419561548971, "corr_recommendation_correctness": 0.5417363388859615, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5562816539052001738&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;1", "aff_unique_norm": "Carl von Ossietzky University of Oldenburg;University of Oldenburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-oldenburg.de/;https://www.uni-oldenburg.de/", "aff_unique_abbr": "UvO;UOL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Frequency-aware SGD for Efficient Embedding Learning with Provable Benefits", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6670", "id": "ibqTBNfJmi", "poster": "", "openreview":
"https://openreview.net/forum?id=ibqTBNfJmi", "slides": "https://iclr.cc/virtual/2022/poster/6670", "video": "https://iclr.cc/virtual/2022/poster/6670", "author_site": "Yan Li, Dhruv Choudhary, Xiaohan Wei, Baichuan Yuan, Bhargav Bhushanam, Tuo Zhao, Guanghui Lan", "tldr": "", "abstract": "Embedding learning has found widespread applications in recommendation systems and natural language modeling, among other domains. To learn quality embeddings efficiently, adaptive learning rate algorithms have demonstrated superior empirical performance over SGD, largely accredited to their token-dependent learning rate. However, the underlying mechanism for the efficiency of token-dependent learning rate remains underexplored. We show that incorporating frequency information of tokens in the embedding learning problems leads to provably efficient algorithms, and demonstrate that common adaptive algorithms implicitly exploit the frequency information to a large extent. Specifically, we propose (Counter-based) Frequency-aware Stochastic Gradient Descent, which applies a frequency-dependent learning rate for each token, and exhibits provable speed-up compared to SGD when the token distribution is imbalanced. Empirically, we show the proposed algorithms are able to improve or match the performance of adaptive algorithms on benchmark recommendation tasks and a large-scale industrial recommendation system, closing the performance gap between SGD and adaptive algorithms. Our results are the first to show token-dependent learning rate provably improves convergence for non-convex embedding learning problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yan Li;Dhruv Choudhary;Xiaohan Wei;Baichuan Yuan;Bhargav Bhushanam;Tuo Zhao;Guanghui Lan", "authorids": "~Yan_Li9;~Dhruv_Choudhary1;~Xiaohan_Wei2;~Baichuan_Yuan1;bbhushanam@fb.com;~Tuo_Zhao1;~Guanghui_Lan1", "gender": "M;M;M;M;;M;M", "homepage": "https://gzliyan113.github.io/;;;https://ybcmath.github.io/;;http://www2.isye.gatech.edu/~tzhao80;", "dblp": ";28/8364.html;;215/6518;;;53/3033", "google_scholar": "wLfoeakAAAAJ;;6iZ_ddsAAAAJ;SwSL9NIAAAAJ;;EJXN6tYAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Yan_Li9;~Dhruv_Choudhary1;~Xiaohan_Wei2;~Baichuan_Yuan1;bbhushanam@fb.com;~Tuo_Zhao1;~Guanghui_Lan1", "aff": "Georgia Institute of Technology;Meta Facebook;;Meta Facebook;;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;fb.com;;facebook.com;;gatech.edu;gatech.edu", "position": "PhD student;Researcher;;Research Scientist;;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nli2022frequencyaware,\ntitle={Frequency-aware {SGD} for Efficient Embedding Learning with Provable Benefits},\nauthor={Yan Li and Dhruv Choudhary and Xiaohan Wei and Baichuan Yuan and Bhargav Bhushanam and Tuo Zhao and Guanghui Lan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ibqTBNfJmi}\n}", "github": "", "project": "", "reviewers": "Xi3e;xBEu;42Qf;8cB1", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;2;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "48;41;65;47", "wc_summary_review": "64;66;14;16", "wc_main_review": "137;279;67;174", "wc_review": "249;386;146;237", "wc_reply_reviewers": "0;265;0;0", "wc_reply_authors": "364;1002;389;218", "reply_reviewers": "0;2;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 6.5, 
0.8660254037844386 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 50.25, 8.926785535678562 ], "wc_summary_review_avg": [ 40.0, 25.019992006393608 ], "wc_main_review_avg": [ 164.25, 76.5877764398471 ], "wc_review_avg": [ 254.5, 85.73359901462203 ], "wc_reply_reviewers_avg": [ 66.25, 114.74836600143811 ], "wc_reply_authors_avg": [ 493.25, 300.8997964439325 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9866941340084117765&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ibqTBNfJmi", "email": "gatech.edu;fb.com;;facebook.com;;gatech.edu;gatech.edu", "author_num": 7, "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "Georgia Institute of Technology;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.gatech.edu;https://meta.com", "aff_unique_abbr": "Georgia Tech;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Models for Output-Space Invariance in Combinatorial Problems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6908", "id": "ibrUkC-pbis", "poster": "", "openreview": "https://openreview.net/forum?id=ibrUkC-pbis", "slides": "https://iclr.cc/virtual/2022/poster/6908", "video": "https://iclr.cc/virtual/2022/poster/6908", "author_site": "Yatin Nandwani, Vidit Jain, Mausam ., Parag Singla", "tldr": "", "abstract": "Recently, many neural models have been proposed to solve combinatorial puzzles by implicitly learning underlying constraints using their solved instances, such as sudoku or graph coloring (GCP). One drawback of the proposed architectures, which are often based on Graph Neural Networks (GNN) (Zhou et al., 2020), is that they cannot generalize across the size of the output space from which variables are assigned a value, for example, the set of colors in a GCP, or the board-size in sudoku. We call the output space for the variables the \u2018value-set\u2019. While many works have demonstrated generalization of GNNs across graph size, there has been no study on how to design a GNN for achieving value-set invariance for problems that come from the same domain. For example, learning to solve 16 x 16 sudoku after being trained on only 9 x 9 sudokus, or coloring a 7-colorable graph after training on 4-colorable graphs. In this work, we propose novel methods to extend GNN-based architectures to achieve value-set invariance. Specifically, our model builds on recently proposed Recurrent Relational Networks (RRN) (Palm et al., 2018). Our first approach exploits the graph-size invariance of GNNs by converting a multi-class node classification problem into a binary node classification problem. Our second approach works directly with multiple classes by adding multiple nodes corresponding to the values in the value-set, and then connecting variable nodes to value nodes depending on the problem initialization.
Our experimental evaluation on three different combinatorial problems demonstrates that both our models perform well on our novel problem, compared to a generic neural reasoner. Between our two models, we observe an inherent trade-off: while the binarized model gives better performance when trained on smaller value-sets, the multi-valued model is much more memory-efficient, resulting in improved performance when trained on larger value-sets, where the binarized model fails to train.", "keywords": "neural reasoning;output space invariance", "primary_area": "", "supplementary_material": "", "author": "Yatin Nandwani;Vidit Jain;Mausam .;Parag Singla", "authorids": "~Yatin_Nandwani1;~Vidit_Jain2;~Mausam_.1;~Parag_Singla1", "gender": "M;;M;M", "homepage": "http://www.cse.iitd.ac.in/~yatin;;http://www.cse.iitd.ac.in/~parags;http://www.cse.iitd.ac.in/~mausam", "dblp": "255/7046;68/5650;14/167;30/6391.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.co.in/citations?user=V49BsgMAAAAJ;https://scholar.google.co.in/citations?hl=en", "orcid": ";0000-0002-7911-1074;;0000-0003-4088-4296", "linkedin": "yatin-nandwani-0804ba9/;jvidit/;;", "or_profile": "~Yatin_Nandwani1;~Vidit_Jain2;~Parag_Singla1;~Mausam_Mausam2", "aff": "Indian Institute of Technology Delhi;Microsoft Research, India;Indian Institute of Technology, Delhi;Indian Institute of Technology Delhi", "aff_domain": "iitd.ac.in;microsoft.com;iitd.ac.in;iitd.ac.in", "position": "PhD student;Research SDE;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nnandwani2022neural,\ntitle={Neural Models for Output-Space Invariance in Combinatorial Problems},\nauthor={Yatin Nandwani and Vidit Jain and Mausam . and Parag Singla},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ibrUkC-pbis}\n}", "github": "", "project": "", "reviewers": "VvZR;XpUt;FY1P", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "3;3;3", "correctness": "2;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "99;210;153", "wc_summary_review": "120;150;62", "wc_main_review": "231;219;206", "wc_review": "450;579;421", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1084;1187;538", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 154.0, 45.32107677449864 ], "wc_summary_review_avg": [ 110.66666666666667, 36.527006751473934 ], "wc_main_review_avg": [ 218.66666666666666, 10.208928554075703 ], "wc_review_avg": [ 483.3333333333333, 68.67475680497327 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 936.3333333333334, 284.78568940325795 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3273268353539886, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=976085613099325485&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=ibrUkC-pbis", "email": "iitd.ac.in;microsoft.com;iitd.ac.in;iitd.ac.in", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Indian Institute of Technology
Delhi;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.iitd.ac.in;https://www.microsoft.com/en-us/research/group/india.aspx", "aff_unique_abbr": "IIT Delhi;MSR India", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Delhi;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "India" }, { "title": "Towards Understanding the Data Dependency of Mixup-style Training", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6526", "id": "ieNJYujcGDO", "poster": "", "openreview": "https://openreview.net/forum?id=ieNJYujcGDO", "slides": "https://iclr.cc/virtual/2022/poster/6526", "video": "https://iclr.cc/virtual/2022/poster/6526", "author_site": "Muthu Chidambaram, Xiang Wang, Yuzheng Hu, Chenwei Wu, Rong Ge", "tldr": "", "abstract": "In the Mixup training paradigm, a model is trained using convex combinations of data points and their associated labels. Despite seeing very few true data points during training, models trained using Mixup seem to still minimize the original empirical risk and exhibit better generalization and robustness on various tasks when compared to standard training. In this paper, we investigate how these benefits of Mixup training rely on properties of the data in the context of classification. For minimizing the original empirical risk, we compute a closed form for the Mixup-optimal classification, which allows us to construct a simple dataset on which minimizing the Mixup loss leads to learning a classifier that does not minimize the empirical loss on the data. On the other hand, we also give sufficient conditions for Mixup training to also minimize the original empirical risk. For generalization, we characterize the margin of a Mixup classifier, and use this to understand why the decision boundary of a Mixup classifier can adapt better to the full structure of the training data when compared to standard training. 
In contrast, we also show that, for a large class of linear models and linearly separable datasets, Mixup training leads to learning the same classifier as standard training.", "keywords": "mixup;deep learning;semi-supervised learning;empirical risk minimization;generalization;margin;counterexample", "primary_area": "", "supplementary_material": "/attachment/0b39a7f8d327f5ede92ee48b1eb15e2616e48ce2.zip", "author": "Muthu Chidambaram;Xiang Wang;Yuzheng Hu;Chenwei Wu;Rong Ge", "authorids": "~Muthu_Chidambaram1;~Xiang_Wang1;~Yuzheng_Hu1;~Chenwei_Wu1;~Rong_Ge1", "gender": "M;M;M;M;M", "homepage": "https://2014mchidamb.github.io/;https://users.cs.duke.edu/~xwang/;https://mirnegg.github.io;https://users.cs.duke.edu/~cwwu/;https://users.cs.duke.edu/~rongge/", "dblp": "304/3319;;231/2255.html;https://dblp.uni-trier.de/pers/hd/w/Wu_0002:Chenwei;89/6869-1.html", "google_scholar": "R43EbqAAAAAJ;dHjYcrgAAAAJ;cVVimVcAAAAJ;WoB6M2cAAAAJ;https://scholar.google.com.tw/citations?user=MVxcjEoAAAAJ", "orcid": ";;;0000-0002-5226-7431;", "linkedin": "muthu-chidambaram-b8803919a/;;yuzheng-hu-a74b5823b/;chenwei-wu-22754012b/;", "or_profile": "~Muthu_Chidambaram1;~Xiang_Wang1;~Yuzheng_Hu1;~Chenwei_Wu1;~Rong_Ge1", "aff": "Duke University;Duke University;University of Illinois, Urbana Champaign;Duke University;Duke University", "aff_domain": "duke.edu;duke.edu;uiuc.edu;duke.edu;duke.edu", "position": "PhD student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nchidambaram2022towards,\ntitle={Towards Understanding the Data Dependency of Mixup-style Training},\nauthor={Muthu Chidambaram and Xiang Wang and Yuzheng Hu and Chenwei Wu and Rong Ge},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ieNJYujcGDO}\n}", "github": "", "project": "", "reviewers": "mn55;NoVN;eGEK", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "3;4;2", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "41;134;144", "wc_summary_review": "60;56;62", "wc_main_review": "228;616;667", "wc_review": "329;806;873", "wc_reply_reviewers": "0;44;0", "wc_reply_authors": "841;702;1194", "reply_reviewers": "0;1;0", "reply_authors": "1;1;2", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 106.33333333333333, 46.37767662236745 ], "wc_summary_review_avg": [ 59.333333333333336, 2.494438257849294 ], "wc_main_review_avg": [ 503.6666666666667, 196.0345774488663 ], "wc_review_avg": [ 669.3333333333334, 242.20147719523834 ], "wc_reply_reviewers_avg": [ 14.666666666666666, 20.741798914805393 ], "wc_reply_authors_avg": [ 912.3333333333334, 207.09471799691616 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3973597071195132, "corr_recommendation_correctness": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13244705498491864959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ieNJYujcGDO", "email": "duke.edu;duke.edu;uiuc.edu;duke.edu;duke.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", 
"aff_unique_norm": "Duke University;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://illinois.edu", "aff_unique_abbr": "Duke;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "iedYJm92o0a", "title": "Show Your Work: Scratchpads for Intermediate Computation with Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large pre-trained language models perform remarkably well on tasks that can be done \"in one pass\", such as generating realistic text or synthesizing computer programs. However, they struggle with tasks that require unbounded multi-step computation, such as adding integers or executing programs. Surprisingly, we find that these same models are able to perform complex multi-step computations --- even in the few-shot regime --- when asked to perform the operation \"step by step\", showing the results of intermediate computations. In particular, we train transformers to perform multi-step computations by asking them to emit intermediate computation steps into a \"scratchpad\". On a series of increasingly complex tasks ranging from long addition to the execution of arbitrary programs, we show that scratchpads dramatically improve the ability of language models to perform multi-step computations. ", "keywords": "program synthesis;transformers;language models;pre-training;program induction", "primary_area": "", "supplementary_material": "", "author": "Maxwell Nye;Anders Johan Andreassen;Guy Gur-Ari;Henryk Michalewski;Jacob Austin;David Bieber;David Dohan;Aitor Lewkowycz;Maarten Bosma;David Luan;Charles Sutton;Augustus Odena", "authorids": "~Maxwell_Nye1;~Anders_Johan_Andreassen1;~Guy_Gur-Ari1;~Henryk_Michalewski1;~Jacob_Austin1;~David_Bieber1;~David_Dohan1;~Aitor_Lewkowycz2;~Maarten_Bosma1;dluan@google.com;~Charles_Sutton1;~Augustus_Odena1", "gender": "M;M;M;M;;M;;M;M;;M;", "homepage": "https://maxwellnye.github.io/;;;https://www.mimuw.edu.pl/~henrykm/;;https://davidbieber.com;;https://scholar.google.com/citations?user=Yum1ah0AAAAJ&hl=en&authuser=1;;;http://homepages.inf.ed.ac.uk/csutton/;https://www.augustusodena.com/", "dblp": "224/0047;;;https://dblp.uni-trier.de/pers/hd/m/Michalewski:Henryk;;200/8035;;;;;59/5879;175/1583", "google_scholar": "NsuX8R8AAAAJ;;mx8P4QUAAAAJ;YdHW1ycAAAAJ;NWXxqQ8AAAAJ;KVXW75wAAAAJ;;;wkeFQPgAAAAJ;;https://scholar.google.co.uk/citations?user=hYtGXD0AAAAJ;vuwLi4MAAAAJ", "orcid": ";0000-0003-3504-3919;;;;;;;;;0000-0002-0041-3820;", "linkedin": ";;;henryk-michalewski-8a230a27/;;;;;;;charles-sutton-772aa126;", "or_profile": "~Maxwell_Nye1;~Anders_Johan_Andreassen1;~Guy_Gur-Ari1;~Henryk_Michalewski1;~Jacob_Austin1;~David_Bieber1;~David_Dohan1;~Aitor_Lewkowycz2;~Maarten_Bosma1;dluan@google.com;~Charles_Sutton1;~Augustus_Odena1", "aff": "Massachusetts Institute of Technology;Google;Google;Google DeepMind;Google;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;;Google;Google;;University of Edinburgh;Google", "aff_domain": "mit.edu;google.com;google.com;google.com;google.com;mila.umontreal.ca;;google.com;google.com;;ed.ac.uk;google.com", "position": "PhD student;Research Scientist;Research Scientist;Researcher;AI Resident;PhD student;;Postdoc;Research Engineer;;Professor;Senior Research Scientist", "bibtex": "@misc{\nnye2022show,\ntitle={Show Your Work: Scratchpads for Intermediate Computation with Language 
Models},\nauthor={Maxwell Nye and Anders Johan Andreassen and Guy Gur-Ari and Henryk Michalewski and Jacob Austin and David Bieber and David Dohan and Aitor Lewkowycz and Maarten Bosma and David Luan and Charles Sutton and Augustus Odena},\nyear={2022},\nurl={https://openreview.net/forum?id=iedYJm92o0a}\n}", "github": "", "project": "", "reviewers": "WdDB;UqLx;uxzZ;PHbv", "site": "https://openreview.net/forum?id=iedYJm92o0a", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "5;4;4;4", "correctness": "1;2;3;4", "technical_novelty": "2;2;4;3", "empirical_novelty": "3;2;4;0", "wc_summary_paper": "85;185;125;152", "wc_summary_review": "90;52;88;19", "wc_main_review": "1613;311;503;294", "wc_review": "1788;548;716;465", "wc_reply_reviewers": "477;477;0;0", "wc_reply_authors": "1252;534;35;99", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 136.75, 36.663162711364656 ], "wc_summary_review_avg": [ 62.25, 29.192250684042847 ], "wc_main_review_avg": [ 680.25, 544.7418540005899 ], "wc_review_avg": [ 879.25, 532.4018101960211 ], "wc_reply_reviewers_avg": [ 238.5, 238.5 ], "wc_reply_authors_avg": [ 480.0, 485.3055738398231 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 12, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.8944271909999159, "gs_citation": 725, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14260550009277354080&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1;1;2;1;1;3;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google;University of Montreal;University of Edinburgh", "aff_unique_dep": ";Google;Montreal Institute for Learning Algorithms;", "aff_unique_url": "https://web.mit.edu;https://www.google.com;https://www.umontreal.ca;https://www.ed.ac.uk", "aff_unique_abbr": "MIT;Google;UM;Edinburgh", "aff_campus_unique_index": "1;1;1;2;1;1;1", "aff_campus_unique": ";Mountain View;Montreal", "aff_country_unique_index": "0;0;0;1;0;2;0;0;1;0", "aff_country_unique": "United States;United Kingdom;Canada" }, { "id": "iim-R8xu0TG", "title": "FitVid: High-Capacity Pixel-Level Video Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "An agent that is capable of predicting what happens next can perform a variety of tasks through planning with no additional training. Furthermore, such an agent can internally represent the complex dynamics of the real world and therefore can acquire a representation useful for a variety of visual perception tasks. This makes predicting the future frames of a video, conditioned on the observed past and potentially future actions, an interesting task which remains exceptionally challenging despite many recent advances. Existing video prediction models have shown promising results on simple, narrow benchmarks, but they generate low-quality predictions on real-life datasets with more complicated dynamics or broader domains. There is a growing body of evidence that underfitting on the training data is one of the primary causes of the low-quality predictions. 
In this paper, we argue that the inefficient use of parameters in the current video models is the main reason for underfitting. Therefore, we introduce a new architecture, named FitVid, which is capable of fitting the common benchmarks so well that it begins to suffer from overfitting -- while having a similar parameter count to the current state-of-the-art models. We analyze the consequences of overfitting, illustrating how it can produce unexpected outcomes such as generating high-quality output by repeating the training data, and how it can be mitigated using existing image augmentation techniques. As a result, FitVid outperforms the current state-of-the-art models across four different video prediction benchmarks on four different metrics. ", "keywords": "video prediction;self supervised learning;unsupervised learning;robotics", "primary_area": "", "supplementary_material": "/attachment/a294f2d3d9811eb9c7e6782257432b106998a083.zip", "author": "Mohammad Babaeizadeh;Mohammad Taghi Saffar;Suraj Nair;Sergey Levine;Chelsea Finn;Dumitru Erhan", "authorids": "~Mohammad_Babaeizadeh1;~Mohammad_Taghi_Saffar1;~Suraj_Nair1;~Sergey_Levine1;~Chelsea_Finn1;~Dumitru_Erhan1", "gender": "M;M;M;M;F;M", "homepage": ";;https://suraj-nair-1.github.io/;https://people.eecs.berkeley.edu/~svlevine/;https://ai.stanford.edu/~cbfinn/;http://dumitru.ca", "dblp": ";;;80/7594;131/1783;http://dblp.uni-trier.de/pers/hd/e/Erhan:Dumitru", "google_scholar": "3Y4egcYAAAAJ;p1cmEzsAAAAJ;EHSuFcwAAAAJ;8R35rCwAAAAJ;vfPE6hgAAAAJ;wfGiqXEAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Mohammad_Babaeizadeh1;~Mohammad_Taghi_Saffar1;~Suraj_Nair1;~Sergey_Levine1;~Chelsea_Finn1;~Dumitru_Erhan1", "aff": "Google;Google;Meta Facebook;Google;Google;Google", "aff_domain": "google.com;google.com;facebook.com;google.com;google.com;google.com", "position": "Research Engineer;Research Engineer;Student Researcher;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nbabaeizadeh2022fitvid,\ntitle={FitVid: High-Capacity Pixel-Level Video Prediction},\nauthor={Mohammad Babaeizadeh and Mohammad Taghi Saffar and Suraj Nair and Sergey Levine and Chelsea Finn and Dumitru Erhan},\nyear={2022},\nurl={https://openreview.net/forum?id=iim-R8xu0TG}\n}", "github": "", "project": "", "reviewers": "7Y5j;exmV;jPAY;3nyU", "site": "https://openreview.net/forum?id=iim-R8xu0TG", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;3;2", "correctness": "4;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "94;74;52;93", "wc_summary_review": "19;47;284;119", "wc_main_review": "181;561;48;327", "wc_review": "294;682;384;539", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "244;781;472;449", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.25, 17.122718826167766 ], "wc_summary_review_avg": [ 117.25, 102.95235548543802 ], "wc_main_review_avg": [ 279.25, 190.2582127005297 ], "wc_review_avg": [ 474.75, 148.3128028863321 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 486.5, 191.80263293291884 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 
-0.8703882797784891, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-dpjMN1CDdkJ:scholar.google.com/&scioq=FitVid:+High-Capacity+Pixel-Level+Video+Prediction&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Google;Meta", "aff_unique_dep": "Google;Meta Platforms, Inc.", "aff_unique_url": "https://www.google.com;https://meta.com", "aff_unique_abbr": "Google;Meta", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "ijygjHyhcFp", "title": "Anarchic Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Present-day federated learning (FL) systems deployed over edge networks consist of a large number of workers with high degrees of heterogeneity in data and/or computing capabilities, which call for flexible worker participation in terms of timing, effort, data heterogeneity, etc. To achieve these goals, in this work, we propose a new FL paradigm called ``Anarchic Federated Learning'' (AFL). In stark contrast to conventional FL models, each worker in AFL has complete freedom to choose i) when to participate in FL, and ii) the number of local steps to perform in each round based on its current situation (e.g., battery level, communication channels, privacy concerns). However, AFL also introduces significant challenges in algorithmic design because the server needs to handle the chaotic worker behaviors. Toward this end, we propose two Anarchic Federated Averaging (AFA) algorithms with two-sided learning rates for both cross-device and cross-silo settings, which are named AFA-CD and AFA-CS, respectively. Somewhat surprisingly, even with general worker information arrival processes, we show that both AFL algorithms achieve the same convergence rate order as the state-of-the-art algorithms for conventional FL. Moreover, they retain the highly desirable {\\em linear speedup effect} in the new AFL paradigm. 
We validate the proposed algorithms with extensive experiments on real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haibo Yang;Xin Zhang;Prashant Khanduri;Jia Liu", "authorids": "~Haibo_Yang1;~Xin_Zhang16;~Prashant_Khanduri1;~Jia_Liu1", "gender": "M;M;M;M", "homepage": "https://haibo-yang-osu.github.io/homepage/;https://xinzhang-nac.github.io/;https://sites.google.com/view/khanduri-prashant/home?authuser=0;https://kevinliu-osu.github.io/index.html", "dblp": "43/7829-1;76/1584-54.html;158/4888;", "google_scholar": "eyy22VoAAAAJ;9u5Pa0gAAAAJ;;Ofx3dScAAAAJ", "orcid": "0000-0002-3245-2728;0000-0002-0784-2038;;", "linkedin": ";;prashant-khanduri-0497894b/;", "or_profile": "~Haibo_Yang1;~Xin_Zhang16;~Prashant_Khanduri1;~Jia_Liu1", "aff": "Ohio State University;Meta Facebook;University of Minnesota, Minneapolis;The Ohio State University", "aff_domain": "osu.edu;fb.com;umn.edu;osu.edu", "position": "PhD student;Research Scientist;Postdoc;Assistant Professor", "bibtex": "@misc{\nyang2022anarchic,\ntitle={Anarchic Federated Learning},\nauthor={Haibo Yang and Xin Zhang and Prashant Khanduri and Jia Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=ijygjHyhcFp}\n}", "github": "", "project": "", "reviewers": "bjgp;8n7V;Ubxj;rgtg", "site": "https://openreview.net/forum?id=ijygjHyhcFp", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "193;65;54;102", "wc_summary_review": "62;113;87;24", "wc_main_review": "926;505;416;373", "wc_review": "1181;683;557;499", "wc_reply_reviewers": "389;0;0;0", "wc_reply_authors": "1742;1728;1412;1013", "reply_reviewers": "1;0;0;0", "reply_authors": "3;3;3;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 103.5, 54.646591842492796 ], "wc_summary_review_avg": [ 71.5, 32.82148686455262 ], "wc_main_review_avg": [ 555.0, 219.42310726083522 ], "wc_review_avg": [ 730.0, 268.74709300753375 ], "wc_reply_reviewers_avg": [ 97.25, 168.44194103607333 ], "wc_reply_authors_avg": [ 1473.75, 296.9447549629392 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6853314649307559893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Ohio State University;Meta;University of Minnesota", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.osu.edu;https://meta.com;https://www.minnesota.edu", "aff_unique_abbr": "OSU;Meta;UMN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "in1ynkrXyMH", "title": "Introspective Learning : A Two-Stage approach for Inference in Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we advocate for two stages in a neural network's decision making process. 
The first is the existing feed-forward inference framework where patterns in given data are sensed and associated with previously learned patterns. The second stage is a slower reflection stage where we ask the network to reflect on its feed-forward decision by considering and evaluating all available choices. Together, we term the two stages introspective learning. We use gradients of trained neural networks as a measurement of this reflection. We perceptually visualize the explanations from both stages to provide a visual grounding to introspection. For the application of recognition, we show that an introspective network is $4\\%$ more robust and $42\\%$ less prone to calibration errors when generalizing to noisy data. We also illustrate the value of introspective networks in downstream tasks that require generalizability and calibration, including active learning and out-of-distribution detection. Finally, we ground the proposed machine introspection in human introspection in the application of image quality assessment.", "keywords": "Reasoning;Knowledge Representation;Robustness;Recognition", "primary_area": "", "supplementary_material": "", "author": "Mohit Prabhushankar;Ghassan AlRegib", "authorids": "~Mohit_Prabhushankar1;~Ghassan_AlRegib1", "gender": "M;M", "homepage": "https://sites.google.com/view/mohit-prabhushankar;http://www.ghassanalregib.info", "dblp": "185/7435;83/1655", "google_scholar": "https://scholar.google.com/scholar?hl=en;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-8743-7058;", "linkedin": ";ghassan-alregib-0602131/", "or_profile": "~Mohit_Prabhushankar1;~Ghassan_AlRegib1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu", "position": "Postdoc;Full Professor", "bibtex": "@misc{\nprabhushankar2022introspective,\ntitle={Introspective Learning : A Two-Stage approach for Inference in Neural Networks},\nauthor={Mohit Prabhushankar and Ghassan AlRegib},\nyear={2022},\nurl={https://openreview.net/forum?id=in1ynkrXyMH}\n}", "github": "", "project": "", "reviewers": "8FMs;Q4tj;DxqE;yYCU", "site": "https://openreview.net/forum?id=in1ynkrXyMH", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;3", "correctness": "2;2;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;2;3", "wc_summary_paper": "78;25;120;69", "wc_summary_review": "67;52;67;62", "wc_main_review": "707;744;510;347", "wc_review": "852;821;697;478", "wc_reply_reviewers": "0;0;36;0", "wc_reply_authors": "1574;1563;789;453", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 73.0, 33.741665637605976 ], "wc_summary_review_avg": [ 62.0, 6.123724356957945 ], "wc_main_review_avg": [ 577.0, 159.82646839619525 ], "wc_review_avg": [ 712.0, 147.02210718119912 ], "wc_reply_reviewers_avg": [ 9.0, 15.588457268119896 ], "wc_reply_authors_avg": [ 1094.75, 488.43237761229545 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16860968089703315753&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "inA3szzFE5", "title": "Spatial Frequency Sensitivity Regularization for Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "The ability to generalize to out-of-distribution data is a major challenge for modern deep neural networks. Recent work has shown that deep neural networks latch on to superficial Fourier statistics of the training data and fail to generalize when these statistics change, such as when images are subject to common corruptions. In this paper, we study the frequency characteristics of deep neural networks in order to improve their robustness. We first propose a general measure of a model's $\\textit{\\textbf{spatial frequency sensitivity}}$ based on its input-Jacobian represented in the Fourier-basis. When applied to deep neural networks, we find that standard minibatch training consistently leads to increased sensitivity towards particular spatial frequencies independent of network architecture. We further propose a family of $\\textit{\\textbf{spatial frequency regularizers}}$ based on our proposed measure to induce specific spatial frequency sensitivities in a model. In experiments on datasets with out-of-distribution test images arising from various common image corruptions, we find that deep neural networks trained with our proposed regularizers obtain significantly improved classification accuracy while maintaining high accuracy on in-distribution clean test images.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kiran Chari;Chuan-Sheng Foo;See-Kiong Ng", "authorids": "~Kiran_Chari1;~Chuan-Sheng_Foo1;~See-Kiong_Ng1", "gender": ";M;M", "homepage": ";http://ai.stanford.edu/~csfoo;https://www.comp.nus.edu.sg/~ngsk/", "dblp": ";73/1823;00/5480", "google_scholar": ";AgbeqGkAAAAJ;https://scholar.google.com.tw/citations?user=_wsommYAAAAJ", "orcid": ";0000-0002-4748-5792;0000-0001-6565-7511", "linkedin": "kiranchari;;seekiong/?originalSubdomain=sg", "or_profile": "~Kiran_Chari1;~Chuan-Sheng_Foo1;~See-Kiong_Ng1", "aff": "National University of Singapore;Institute for Infocomm Research, A*STAR;National University of Singapore", "aff_domain": "nus.edu;i2r.a-star.edu.sg;nus.edu.sg", "position": "PhD student;Scientist;Full Professor", "bibtex": "@misc{\nchari2022spatial,\ntitle={Spatial Frequency Sensitivity Regularization for Robustness},\nauthor={Kiran Chari and Chuan-Sheng Foo and See-Kiong Ng},\nyear={2022},\nurl={https://openreview.net/forum?id=inA3szzFE5}\n}", "github": "", "project": "", "reviewers": "GEyu;womq;fjLz;pK3z", "site": "https://openreview.net/forum?id=inA3szzFE5", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;4;3;3", "correctness": "2;3;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "51;47;61;180", "wc_summary_review": "43;35;64;61", "wc_main_review": "208;234;181;298", "wc_review": "302;316;306;539", "wc_reply_reviewers": "282;0;0;83", "wc_reply_authors": "646;164;316;332", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 
0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.75, 55.22850260508608 ], "wc_summary_review_avg": [ 50.75, 12.132085558550928 ], "wc_main_review_avg": [ 230.25, 43.37265843823733 ], "wc_review_avg": [ 365.75, 100.15581610670446 ], "wc_reply_reviewers_avg": [ 91.25, 115.22450911156011 ], "wc_reply_authors_avg": [ 364.5, 175.25053494925487 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9847319278346618, "corr_recommendation_correctness": 0.9847319278346618, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CeZLG24gcAYJ:scholar.google.com/&scioq=Spatial+Frequency+Sensitivity+Regularization+for+Robustness&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;Institute for Infocomm Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.i2r.a-star.edu.sg", "aff_unique_abbr": "NUS;I2R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "inSTvgLk2YP", "title": "MeshInversion: 3D textured mesh reconstruction with generative prior", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recovering a textured 3D mesh from a single image is highly challenging, particularly for in-the-wild objects that lack 3D ground truths. Prior attempts resort to weak supervision based on 2D silhouette annotations of monocular images. Since the supervision lies in the 2D space while the output is in the 3D space, such indirect supervision often over-emphasizes the observable part of the 3D textured mesh, at the expense of the overall reconstruction quality. Although previous attempts have adopted various hand-crafted heuristics to reduce this gap, this issue is far from being solved. In this work, we present an alternative framework, \\textbf{MeshInversion}, that reduces the gap by exploiting the \\textit{generative prior} of a 3D GAN pre-trained for 3D textured mesh synthesis. Reconstruction is achieved by searching for a latent space in the 3D GAN that best resembles the target mesh in accordance with the single-view observation. Since the pre-trained GAN encapsulates rich 3D semantics in terms of mesh geometry and texture, searching within the GAN manifold naturally regularizes the realness and fidelity of the reconstruction. Importantly, such regularization is directly applied in the 3D space, providing crucial guidance for mesh parts that are unobserved in the 2D space. Experiments on standard benchmarks show that our framework obtains faithful 3D reconstructions with consistent geometry and texture across both observed and unobserved parts. 
Moreover, it generalizes well to meshes that are less commonly seen, such as the extended articulation of deformable objects.", "keywords": "Single-view 3D object reconstruction;GAN inversion", "primary_area": "", "supplementary_material": "/attachment/4c1caef06c7f3774f743c9db28a580683ddab9c9.zip", "author": "Junzhe Zhang;Daxuan Ren;Zhongang Cai;Chai Kiat Yeo;Bo Dai;Chen Change Loy", "authorids": "~Junzhe_Zhang2;~Daxuan_Ren1;~Zhongang_Cai1;~Chai_Kiat_Yeo1;~Bo_Dai2;~Chen_Change_Loy2", "gender": "M;M;M;;M;M", "homepage": "https://junzhezhang.github.io/;;https://caizhongang.com;;http://daibo.info/;https://www.mmlab-ntu.com/person/ccloy/index.html", "dblp": ";268/6752;232/3190;;64/2903-2;01/5855", "google_scholar": "https://scholar.google.com.sg/citations?user=wupwVHAAAAAJ;CJ5dYFwAAAAJ;WrDKqIAAAAAJ;;https://scholar.google.com.hk/citations?user=KNWTvgEAAAAJ;https://scholar.google.co.uk/citations?user=559LF80AAAAJ", "orcid": "0000-0003-1931-8046;0000-0002-8449-3038;0000-0002-1810-3855;;0000-0003-0777-9232;0000-0001-5345-1591", "linkedin": "zhang-junzhe-68a62240/;rendaxuan;caizhongang/;;;", "or_profile": "~Junzhe_Zhang2;~Daxuan_Ren1;~Zhongang_Cai1;~Chai_Kiat_Yeo1;~Bo_Dai2;~Chen_Change_Loy2", "aff": "Nanyang Technological University;Nanyang Technological University;Nanyang Technological University;;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;PhD student;PhD student;;Research Assistant Professor;Full Professor", "bibtex": "@misc{\nzhang2022meshinversion,\ntitle={MeshInversion: 3D textured mesh reconstruction with generative prior},\nauthor={Junzhe Zhang and Daxuan Ren and Zhongang Cai and Chai Kiat Yeo and Bo Dai and Chen Change Loy},\nyear={2022},\nurl={https://openreview.net/forum?id=inSTvgLk2YP}\n}", "github": "", "project": "", "reviewers": "LKvE;i1Mn;owCV;pz7V", "site": "https://openreview.net/forum?id=inSTvgLk2YP", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;4", "correctness": "2;2;2;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "76;82;55;81", "wc_summary_review": "60;42;42;47", "wc_main_review": "591;430;747;255", "wc_review": "727;554;844;383", "wc_reply_reviewers": "359;166;60;106", "wc_reply_authors": "1681;957;1562;598", "reply_reviewers": "1;1;1;1", "reply_authors": "3;2;3;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 73.5, 10.920164833920778 ], "wc_summary_review_avg": [ 47.75, 7.361215932167728 ], "wc_main_review_avg": [ 505.75, 183.0865573984065 ], "wc_review_avg": [ 627.0, 174.60956445739163 ], "wc_reply_reviewers_avg": [ 172.75, 113.91087524902967 ], "wc_reply_authors_avg": [ 1199.5, 442.67849507289145 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:__fTR12hVzgJ:scholar.google.com/&scioq=MeshInversion:+3D+textured+mesh+reconstruction+with+generative+prior&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", 
"aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Focus on the Common Good: Group Distributional Robustness Follows", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6216", "id": "irARV_2VFs4", "poster": "", "openreview": "https://openreview.net/forum?id=irARV_2VFs4", "slides": "https://iclr.cc/virtual/2022/poster/6216", "video": "https://iclr.cc/virtual/2022/poster/6216", "author_site": "Vihari Piratla, Praneeth Netrapalli, Sunita Sarawagi", "tldr": "", "abstract": "We consider the problem of training a classification model with group annotated training data. Recent work has established that, if there is distribution shift across different groups, models trained using the standard empirical risk minimization (ERM) objective suffer from poor performance on minority groups and that group distributionally robust optimization (Group-DRO) objective is a better alternative. The starting point of this paper is the observation that though Group-DRO performs better than ERM on minority groups for some benchmark datasets, there are several other datasets where it performs much worse than ERM. Inspired by ideas from the closely related problem of domain generalization, this paper proposes a new and simple algorithm that explicitly encourages learning of features that are shared across various groups. The key insight behind our proposed algorithm is that while Group-DRO focuses on groups with worst regularized loss, focusing instead, on groups that enable better performance even on other groups, could lead to learning of shared/common features, thereby enhancing minority performance beyond what is achieved by Group-DRO. Empirically, we show that our proposed algorithm matches or achieves better performance compared to strong contemporary baselines including ERM and Group-DRO on standard benchmarks on both minority groups and across all groups. 
Theoretically, we show that the proposed algorithm is a descent method and finds first order stationary points of smooth nonconvex functions.", "keywords": "sub-population shift;robust optimization;domain generalization", "primary_area": "", "supplementary_material": "", "author": "Vihari Piratla;Praneeth Netrapalli;Sunita Sarawagi", "authorids": "~Vihari_Piratla1;~Praneeth_Netrapalli1;~Sunita_Sarawagi1", "gender": "M;M;F", "homepage": "https://vihari.github.io/;http://praneethnetrapalli.org/;https://www.cse.iitb.ac.in/~sunita/", "dblp": "161/3626;http://dblp.uni-trier.de/pers/hd/n/Netrapalli:Praneeth;s/SunitaSarawagi", "google_scholar": "https://scholar.google.co.in/citations?user=DQddccYAAAAJ;https://scholar.google.co.in/citations?user=mim8FQkAAAAJ;https://scholar.google.com.tw/citations?user=Hg4HmTAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Vihari_Piratla1;~Praneeth_Netrapalli1;~Sunita_Sarawagi1", "aff": "Indian Institute of Technology Bombay;Google;IIT Bombay", "aff_domain": "iitb.ac.in;google.com;iitb.ac.in", "position": "PhD student;Research Scientist;Full Professor", "bibtex": "@inproceedings{\npiratla2022focus,\ntitle={Focus on the Common Good: Group Distributional Robustness Follows},\nauthor={Vihari Piratla and Praneeth Netrapalli and Sunita Sarawagi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=irARV_2VFs4}\n}", "github": "", "project": "", "reviewers": "4DaZ;cTFe;4c9A;UQv8", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;4;4", "empirical_novelty": "3;3;4;4", "wc_summary_paper": "171;76;127;51", "wc_summary_review": "89;10;60;17", "wc_main_review": "279;169;439;254", "wc_review": "539;255;626;322", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1102;139;795;313", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 106.25, 46.34314943980394 ], "wc_summary_review_avg": [ 44.0, 32.27227912620985 ], "wc_main_review_avg": [ 285.25, 97.68412102281516 ], "wc_review_avg": [ 435.5, 152.04029071269233 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 587.25, 382.18737224037113 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.08084520834544431, "corr_recommendation_correctness": 0.08084520834544431, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7624890232005107632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=irARV_2VFs4", "email": "iitb.ac.in;google.com;iitb.ac.in", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Indian Institute of Technology Bombay;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.iitb.ac.in;https://www.google.com", "aff_unique_abbr": "IIT Bombay;Google", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Bombay;Mountain View;Mumbai", "aff_country_unique_index": "0;1;0", "aff_country_unique": "India;United States" }, { "title": "The Efficiency Misnomer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6607", "id": "iulEMLYh1uR", "poster": 
"", "openreview": "https://openreview.net/forum?id=iulEMLYh1uR", "slides": "https://iclr.cc/virtual/2022/poster/6607", "video": "https://iclr.cc/virtual/2022/poster/6607", "author_site": "Mostafa Dehghani, Yi Tay, Anurag Arnab, Lucas Beyer, Ashish Vaswani", "tldr": "", "abstract": "Model efficiency is a critical aspect of developing and deploying machine learning models. \nInference time and latency directly affect the user experience, and some applications have hard requirements. In addition to inference costs, model training also have direct financial and environmental impacts.\nAlthough there are numerous well-established metrics (cost indicators) for measuring model efficiency, researchers and practitioners often assume that these metrics are correlated with each other and report only a few of them.\nIn this paper, we thoroughly discuss common cost indicators, their advantages and disadvantages, and how they can contradict each other.\nWe demonstrate how incomplete reporting of cost indicators can lead to partial conclusions and a blurred or incomplete picture of the practical considerations of different models. We further present suggestions to improve reporting of efficiency metrics.", "keywords": "Efficiency in Machine Learning;FLOPs;Number of Parameters;Throughput", "primary_area": "", "supplementary_material": "", "author": "Mostafa Dehghani;Yi Tay;Anurag Arnab;Lucas Beyer;Ashish Vaswani", "authorids": "~Mostafa_Dehghani1;~Yi_Tay1;~Anurag_Arnab1;~Lucas_Beyer1;~Ashish_Vaswani1", "gender": "M;M;;;M", "homepage": "http://mostafadehghani.com/;http://yitay.net;;http://lucasb.eyer.be;", "dblp": "125/4062;;;126/4720;", "google_scholar": "https://scholar.google.nl/citations?user=MiHOX3QAAAAJ;VBclY_cAAAAJ;;p2gwhK4AAAAJ;6rUjwXUAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Mostafa_Dehghani1;~Yi_Tay1;~Anurag_Arnab1;~Lucas_Beyer1;~Ashish_Vaswani1", "aff": "Google DeepMind;Google;;Google Brain;", "aff_domain": "google.com;google.com;;google.com;", "position": "Research Scientist;Research Scientist;;Researcher;", "bibtex": "@inproceedings{\ndehghani2022the,\ntitle={The Efficiency Misnomer},\nauthor={Mostafa Dehghani and Yi Tay and Anurag Arnab and Lucas Beyer and Ashish Vaswani},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=iulEMLYh1uR}\n}", "github": "", "project": "", "reviewers": "sinW;6vKw;AyyT;GqNw", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "66;69;83;72", "wc_summary_review": "61;28;54;25", "wc_main_review": "273;187;195;918", "wc_review": "400;284;332;1015", "wc_reply_reviewers": "86;107;112;176", "wc_reply_authors": "486;386;233;868", "reply_reviewers": "1;1;1;2", "reply_authors": "2;2;1;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.5, 6.422616289332565 ], "wc_summary_review_avg": [ 42.0, 15.732132722552274 ], "wc_main_review_avg": [ 393.25, 304.8215666582665 ], "wc_review_avg": [ 507.75, 295.7468300759959 ], "wc_reply_reviewers_avg": [ 120.25, 33.633130987167995 ], "wc_reply_authors_avg": [ 493.25, 234.37296665784643 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], 
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3107123743830524220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=iulEMLYh1uR", "email": "google.com;google.com;;google.com;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Sequential Reptile: Inter-Task Gradient Alignment for Multilingual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6199", "id": "ivQruZvXxtz", "poster": "", "openreview": "https://openreview.net/forum?id=ivQruZvXxtz", "slides": "https://iclr.cc/virtual/2022/poster/6199", "video": "https://iclr.cc/virtual/2022/poster/6199", "author_site": "Seanie Lee, Hae Beom Lee, Juho Lee, Sung Ju Hwang", "tldr": "", "abstract": "Multilingual models jointly pretrained on multiple languages have achieved remarkable performance on various multilingual downstream tasks. Moreover, models finetuned on a single monolingual downstream task have shown to generalize to unseen languages. In this paper, we first show that it is crucial for those tasks to align gradients between them in order to maximize knowledge transfer while minimizing negative transfer. Despite its importance, the existing methods for gradient alignment either have a completely different purpose, ignore inter-task alignment, or aim to solve continual learning problems in rather inefficient ways. As a result of the misaligned gradients between tasks, the model suffers from severe negative transfer in the form of catastrophic forgetting of the knowledge acquired from the pretraining. To overcome the limitations, we propose a simple yet effective method that can efficiently align gradients between tasks. Specifically, we perform each inner-optimization by sequentially sampling batches from all the tasks, followed by a Reptile outer update. Thanks to the gradients aligned between tasks by our method, the model becomes less vulnerable to negative transfer and catastrophic forgetting. 
We extensively validate our method on various multi-task learning and zero-shot cross-lingual transfer tasks, where our method largely outperforms all the relevant baselines we consider.", "keywords": "multilingual language model;gradient alignment", "primary_area": "", "supplementary_material": "", "author": "Seanie Lee;Hae Beom Lee;Juho Lee;Sung Ju Hwang", "authorids": "~Seanie_Lee1;~Hae_Beom_Lee1;~Juho_Lee2;~Sung_Ju_Hwang1", "gender": "M;M;M;", "homepage": "https://seanie12.github.io/;https://haebeom-lee.github.io;https://juho.lee.github.io;", "dblp": "219/6771;326/7260;55/3410-1;", "google_scholar": "zrZu6GkAAAAJ;;Py4URJUAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Seanie_Lee1;~Hae_Beom_Lee1;~Juho_Lee2;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;", "position": "MS student;PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nlee2022sequential,\ntitle={Sequential Reptile: Inter-Task Gradient Alignment for Multilingual Learning},\nauthor={Seanie Lee and Hae Beom Lee and Juho Lee and Sung Ju Hwang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ivQruZvXxtz}\n}", "github": "", "project": "", "reviewers": "8YbF;gNwa;vCuD;ZkcB", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "97;51;44;116", "wc_summary_review": "22;60;26;26", "wc_main_review": "571;682;172;378", "wc_review": "690;793;242;520", "wc_reply_reviewers": "0;0;0;125", "wc_reply_authors": "1431;1772;816;1585", "reply_reviewers": "0;0;0;1", "reply_authors": "4;4;3;4", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 77.0, 30.35621847332108 ], "wc_summary_review_avg": [ 33.5, 15.386682553429118 ], "wc_main_review_avg": [ 450.75, 194.2490347466365 ], "wc_review_avg": [ 561.25, 208.5106412152627 ], "wc_reply_reviewers_avg": [ 31.25, 54.12658773652741 ], "wc_reply_authors_avg": [ 1401.0, 358.6857956485035 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 0.4330127018922193 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15390935828968359036&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=ivQruZvXxtz", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "iw-ms2znSS2", "title": "The Remarkable Effectiveness of Combining Policy and Value Networks in A*-based Deep RL for AI Planning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the tremendous success of applying traditional backtrack-style combinatorial search 
methods in various NP-complete domains such as SAT and CSP as well as using deep reinforcement learning (RL) to tackle two-player games such as Go, PSPACE-hard AI planning has remained out of reach for current AI planning systems. Even carefully designed domain-specific solvers fail quickly due to the exponential combinatorial search space on hard instances. Recent work based on deep learning guided search algorithms that combine traditional search-based methods, such as A\\textsc{*} and MCTS, with deep neural networks' heuristic prediction has shown promising progress. These methods can solve a significant number of hard planning instances beyond specialized solvers. To better understand why these approaches work, we study the interplay of the policy and value networks in A\\textsc{*}-based deep RL and show the surprising effectiveness of the policy network, further enhanced by the value network, as a guiding heuristic for A\\textsc{*}. To further understand the phenomena, we study the cost distributions of deep planners and find that planning instances can have heavy-tailed runtime distributions, with tails both on the right-hand and left-hand sides. In particular, for the first time, we show the existence of {\\textit{left}} heavy tails and propose a theoretical model that can explain the appearance of these tails. We provide extensive experimental data supporting our model. The experiments show the critical role of the policy network as a powerful heuristic guiding A\\textsc{*}, which can lead to left tails with polynomial scaling by avoiding exploring exponential-size sub-trees early on in the search. Our results also demonstrate the importance of random restart strategies, as are widely used in traditional combinatorial solvers, for deep reinforcement learning and deep AI planning systems to avoid left and right heavy tails.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dieqiao Feng;Carla P Gomes;Bart Selman", "authorids": "~Dieqiao_Feng1;~Carla_P_Gomes1;~Bart_Selman1", "gender": "M;;M", "homepage": "http://dqfeng.me;;http://www.cs.cornell.edu/selman/", "dblp": "192/1458;;http://dblp.uni-trier.de/pers/hd/s/Selman:Bart", "google_scholar": "nSpffi8AAAAJ;;pJ28HA0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Dieqiao_Feng1;~Carla_P_Gomes1;~Bart_Selman1", "aff": "Cornell University;;Cornell University", "aff_domain": "cornell.edu;;cornell.edu", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nfeng2022the,\ntitle={The Remarkable Effectiveness of Combining Policy and Value Networks in A*-based Deep {RL} for {AI} Planning},\nauthor={Dieqiao Feng and Carla P Gomes and Bart Selman},\nyear={2022},\nurl={https://openreview.net/forum?id=iw-ms2znSS2}\n}", "github": "", "project": "", "reviewers": "VVfA;Lduu;whsB;4bCR", "site": "https://openreview.net/forum?id=iw-ms2znSS2", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "5;4;4;4", "correctness": "2;2;2;2", "technical_novelty": "2;2;3;4", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "145;107;98;71", "wc_summary_review": "557;76;57;103", "wc_main_review": "3155;252;791;1035", "wc_review": "3857;435;946;1209", "wc_reply_reviewers": "0;0;0;35", "wc_reply_authors": "780;425;759;812", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], 
"wc_summary_paper_avg": [ 105.25, 26.49882072847771 ], "wc_summary_review_avg": [ 198.25, 207.76835057342106 ], "wc_main_review_avg": [ 1308.25, 1103.2183317458064 ], "wc_review_avg": [ 1611.75, 1325.8316965210931 ], "wc_reply_reviewers_avg": [ 8.75, 15.155444566227676 ], "wc_reply_authors_avg": [ 694.0, 156.44967241896035 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8268106308031117, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:42DOG8x9j9sJ:scholar.google.com/&scioq=The+Remarkable+Effectiveness+of+Combining+Policy+and+Value+Networks+in+A*-based+Deep+RL+for+AI+Planning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "iy2b91gvZpf", "title": "LDDMM-Face: Large Deformation Diffeomorphic Metric Learning for Cross-annotation Face Alignment", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We innovatively propose a flexible and consistent cross-annotation face alignment framework, LDDMM-Face, the key contribution of which is a deformation layer that naturally embeds facial geometry in a diffeomorphic way. Instead of predicting facial landmarks via heatmap or coordinate regression, we formulate the face alignment task in a diffeomorphic registration manner and predict momenta that uniquely parameterize the deformation between initial boundary and true boundary. We then perform large deformation diffeomorphic metric mapping (LDDMM) simultaneously for curve and landmark to localize the facial landmarks. Due to the novel embedding of LDDMM into a deep network, LDDMM-Face can consistently annotate facial landmarks without ambiguity and flexibly handle various annotation schemes, and can even predict dense annotations from sparse ones. Our method can be easily integrated into various face alignment networks. We extensively evaluate LDDMM-Face on four benchmark datasets: 300W, WFLW, HELEN and COFW-68. LDDMM-Face distinguishes itself with outstanding performance when dealing with within-dataset cross-annotation learning (sparse-to-dense) and cross-dataset learning (different training and testing datasets). 
In addition, LDDMM-Face shows promising results on the most challenging task of cross-dataset cross-annotation learning (different training and testing datasets with different annotations).", "keywords": "Cross-annotation Face Alignment;Large Deformation Diffeomorphic Metric Mapping;Sparsely-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Huilin Yang;Junyan Lyu;Pujin Cheng;Roger Tam;Xiaoying Tang", "authorids": "~Huilin_Yang1;~Junyan_Lyu1;12032946@mail.sustech.edu.cn;~Roger_Tam1;tangxy@sustech.edu.cn", "gender": ";M;;M;", "homepage": "https://yanghuilin102.wixsite.com/huiliny;;;http://www.rogertam.ca;", "dblp": ";250/6172;;09/5693;", "google_scholar": ";OZMF8NwAAAAJ;;https://scholar.google.ca/citations?hl=en;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Huilin_Yang1;~Junyan_Lyu1;12032946@mail.sustech.edu.cn;~Roger_Tam1;tangxy@sustech.edu.cn", "aff": "University of British Columbia;University of Queensland;;University of British Columbia;", "aff_domain": "ubc.ca;uq.edu;;ubc.ca;", "position": "PhD student;PhD student;;Associate Professor;", "bibtex": "@misc{\nyang2022lddmmface,\ntitle={{LDDMM}-Face: Large Deformation Diffeomorphic Metric Learning for Cross-annotation Face Alignment},\nauthor={Huilin Yang and Junyan Lyu and Pujin Cheng and Roger Tam and Xiaoying Tang},\nyear={2022},\nurl={https://openreview.net/forum?id=iy2b91gvZpf}\n}", "github": "", "project": "", "reviewers": "9WJt;P1fL;WMcm;VUmz", "site": "https://openreview.net/forum?id=iy2b91gvZpf", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;5;5;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;2;4", "wc_summary_paper": "29;119;48;148", "wc_summary_review": "23;107;22;42", "wc_main_review": "172;726;164;76", "wc_review": "224;952;234;266", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 86.0, 49.05609034564414 ], "wc_summary_review_avg": [ 48.5, 34.70230539892127 ], "wc_main_review_avg": [ 284.5, 257.66790642220076 ], "wc_review_avg": [ 419.0, 308.11848370391544 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9169493006161777, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12205047069780256528&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of British Columbia;University of Queensland", "aff_unique_dep": ";", "aff_unique_url": "https://www.ubc.ca;https://www.uq.edu.au", "aff_unique_abbr": "UBC;UQ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;Australia" }, { "title": "TAda! 
Temporally-Adaptive Convolutions for Video Understanding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6801", "id": "izj68lUcBpt", "poster": "", "openreview": "https://openreview.net/forum?id=izj68lUcBpt", "slides": "https://iclr.cc/virtual/2022/poster/6801", "video": "https://iclr.cc/virtual/2022/poster/6801", "author_site": "Ziyuan Huang, Shiwei Zhang, Liang Pan, Zhiwu Qing, Mingqian Tang, Ziwei Liu, Marcelo Ang Jr", "tldr": "", "abstract": "Spatial convolutions are widely used in numerous deep video models. It fundamentally assumes spatio-temporal invariance, i.e., using shared weights for every location in different frames. This work presents Temporally-Adaptive Convolutions (TAdaConv) for video understanding, which shows that adaptive weight calibration along the temporal dimension is an efficient way to facilitate modelling complex temporal dynamics in videos. Specifically, TAdaConv empowers the spatial convolutions with temporal modelling abilities by calibrating the convolution weights for each frame according to its local and global temporal context. Compared to previous temporal modelling operations, TAdaConv is more efficient as it operates over the convolution kernels instead of the features, whose dimension is an order of magnitude smaller than the spatial resolutions. Further, the kernel calibration brings an increased model capacity. We construct TAda2D and TAdaConvNeXt networks by replacing the 2D convolutions in ResNet and ConvNeXt with TAdaConv, which leads to at least on par or better performance compared to state-of-the-art approaches on multiple video action recognition and localization benchmarks. We also demonstrate that as a readily plug-in operation with negligible computation overhead, TAdaConv can effectively improve many existing video models with a convincing margin.", "keywords": "Video understanding;Action classification;Dynamic networks", "primary_area": "", "supplementary_material": "", "author": "Ziyuan Huang;Shiwei Zhang;Liang Pan;Zhiwu Qing;Mingqian Tang;Ziwei Liu;Marcelo H Ang Jr", "authorids": "~Ziyuan_Huang1;~Shiwei_Zhang2;~Liang_Pan2;~Zhiwu_Qing1;~Mingqian_Tang1;~Ziwei_Liu1;~Marcelo_H_Ang_Jr1", "gender": "M;M;M;M;F;M;M", "homepage": "https://huang-ziyuan.github.io/;https://www.researchgate.net/profile/Shiwei_Zhang7/research;https://scholar.google.com/citations?user=lSDISOcAAAAJ&hl=en;;;https://liuziwei7.github.io/;http://guppy.mpe.nus.edu.sg/~mpeangh/", "dblp": ";;90/343;267/5389;;05/6300-2;", "google_scholar": "A9D-disAAAAJ;ZO3OQ-8AAAAJ;lSDISOcAAAAJ;q9refl4AAAAJ;;https://scholar.google.com.hk/citations?user=lc45xlcAAAAJ;dMogb2EAAAAJ", "orcid": ";0000-0002-6929-5295;;;0000-0002-7117-6666;;0000-0001-8277-6408", "linkedin": "ziyuan-huang-731b78177/;;;;;;marcelo-ang-41370b15", "or_profile": "~Ziyuan_Huang1;~Shiwei_Zhang2;~Liang_Pan2;~Zhiwu_Qing1;~Mingqian_Tang1;~Ziwei_Liu1;~Marcelo_H_Ang_Jr1", "aff": "National University of Singapore;Alibaba Group;Nanyang Technological University;Huazhong University of Science and Technology, Tsinghua University;Alibaba Group;Nanyang Technological University;National University of Singapore", "aff_domain": "u.nus.edu;alibaba-inc.com;ntu.eud.sg;hust.edu.cn;alibaba-inc.com;ntu.edu.sg;nus.edu.sg", "position": "PhD student;Researcher;Postdoc;PhD student;Staff Algorithm Engineer;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2022tada,\ntitle={{TA}da! 
Temporally-Adaptive Convolutions for Video Understanding},\nauthor={Ziyuan Huang and Shiwei Zhang and Liang Pan and Zhiwu Qing and Mingqian Tang and Ziwei Liu and Marcelo H Ang Jr},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=izj68lUcBpt}\n}", "github": "", "project": "", "reviewers": "jnMz;4biu;6Gv3;DgYX", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "61;109;44;155", "wc_summary_review": "83;39;40;81", "wc_main_review": "371;326;242;355", "wc_review": "515;474;326;591", "wc_reply_reviewers": "0;35;0;260", "wc_reply_authors": "1830;1241;421;1046", "reply_reviewers": "0;1;0;1", "reply_authors": "4;3;1;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.25, 43.36689405525833 ], "wc_summary_review_avg": [ 60.75, 21.26470079733077 ], "wc_main_review_avg": [ 323.5, 49.741833500585805 ], "wc_review_avg": [ 476.5, 96.5 ], "wc_reply_reviewers_avg": [ 73.75, 108.47666799823823 ], "wc_reply_authors_avg": [ 1134.5, 502.9853377584679 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1325383719378653431&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=izj68lUcBpt", "email": "u.nus.edu;alibaba-inc.com;ntu.eud.sg;hust.edu.cn;alibaba-inc.com;ntu.edu.sg;nus.edu.sg", "author_num": 7, "aff_unique_index": "0;1;2;3;1;2;0", "aff_unique_norm": "National University of Singapore;Alibaba Group;Nanyang Technological University;Huazhong University of Science and Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.alibaba.com;https://www.ntu.edu.sg;http://www.hust.edu.cn", "aff_unique_abbr": "NUS;Alibaba;NTU;HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;0;0", "aff_country_unique": "Singapore;China" }, { "title": "Unsupervised Learning of Full-Waveform Inversion: Connecting CNN and Partial Differential Equation in a Loop", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6618", "id": "izvwgBic9q", "poster": "", "openreview": "https://openreview.net/forum?id=izvwgBic9q", "slides": "https://iclr.cc/virtual/2022/poster/6618", "video": "https://iclr.cc/virtual/2022/poster/6618", "author_site": "Peng Jin, Xitong Zhang, Yinpeng Chen, Sharon Huang, Zicheng Liu, Youzuo Lin", "tldr": "", "abstract": "This paper investigates unsupervised learning of Full-Waveform Inversion (FWI), which has been widely used in geophysics to estimate subsurface velocity maps from seismic data. This problem is mathematically formulated by a second order partial differential equation (PDE), but is hard to solve. 
Moreover, acquiring velocity maps is extremely expensive, making it impractical to scale up a supervised approach to train the mapping from seismic data to velocity maps with convolutional neural networks (CNN). We address these difficulties by $\\textit{integrating PDE and CNN in a loop}$, thus shifting the paradigm to unsupervised learning that only requires seismic data. In particular, we use finite difference to approximate the forward modeling of PDE as a differentiable operator (from velocity map to seismic data) and model its inversion by CNN (from seismic data to velocity map). Hence, we transform the supervised inversion task into an unsupervised seismic data reconstruction task. We also introduce a new large-scale dataset $\\textit{OpenFWI}$ to establish a more challenging benchmark for the community. Experimental results show that our model (using seismic data alone) yields comparable accuracy to the supervised counterpart (using both seismic data and velocity maps). Furthermore, it outperforms the supervised model when more seismic data are involved.", "keywords": "Unsupervised Learning;Full-Waveform Inversion;Partial Differential Equation;Physics-Informed Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Peng Jin;Xitong Zhang;Yinpeng Chen;Sharon X Huang;Zicheng Liu;Youzuo Lin", "authorids": "~Peng_Jin6;~Xitong_Zhang1;~Yinpeng_Chen1;~Sharon_X_Huang1;~Zicheng_Liu1;~Youzuo_Lin1", "gender": ";M;M;F;M;M", "homepage": "https://ist.psu.edu/directory/pqj5125;;https://scholar.google.com/citations?user=V_VpLksAAAAJ&hl=en;https://faculty.ist.psu.edu/suh972/;https://sites.google.com/view/zichengliu/home?pli=1;https://sites.google.com/site/youzuolin044/", "dblp": ";156/9687;45/6977;293/8974;l/ZichengLiu;", "google_scholar": ";Ci9svAcAAAAJ;;iTtzc1UAAAAJ;bkALdvsAAAAJ;CMXuHYgAAAAJ", "orcid": ";;;0000-0003-2338-6535;0000-0001-5894-7828;", "linkedin": ";xitong-zhang-70118915a/;;sharon-x-huang-b570686/;;", "or_profile": "~Peng_Jin6;~Xitong_Zhang1;~Yinpeng_Chen1;~Sharon_X_Huang1;~Zicheng_Liu1;~Youzuo_Lin1", "aff": "Pennsylvania State University;Michigan State University;Microsoft;Pennsylvania State University;Microsoft;Los Alamos National Laboratory", "aff_domain": "psu.edu;msu.edu;microsoft.com;psu.edu;microsoft.com;lanl.gov", "position": "PhD student;PhD student;Researcher;Associate Professor;partner research manager;Researcher", "bibtex": "@inproceedings{\njin2022unsupervised,\ntitle={Unsupervised Learning of Full-Waveform Inversion: Connecting {CNN} and Partial Differential Equation in a Loop},\nauthor={Peng Jin and Xitong Zhang and Yinpeng Chen and Sharon X Huang and Zicheng Liu and Youzuo Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=izvwgBic9q}\n}", "github": "", "project": "", "reviewers": "ZRwQ;vGXF;q6o6", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "5;4;4", "correctness": "2;3;3", "technical_novelty": "2;2;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "40;103;86", "wc_summary_review": "77;78;19", "wc_main_review": "296;483;454", "wc_review": "413;664;559", "wc_reply_reviewers": "280;120;20", "wc_reply_authors": "1872;1522;1851", "reply_reviewers": "1;1;1", "reply_authors": "6;4;5", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 
2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 76.33333333333333, 26.61244487494943 ], "wc_summary_review_avg": [ 58.0, 27.58018612458347 ], "wc_main_review_avg": [ 411.0, 82.17461083976404 ], "wc_review_avg": [ 545.3333333333334, 102.9249996626454 ], "wc_reply_reviewers_avg": [ 140.0, 107.08252269472673 ], "wc_reply_authors_avg": [ 1748.3333333333333, 160.2712977700277 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 5.0, 0.816496580927726 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.8029550685469661, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10543109852078732427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=izvwgBic9q", "email": "psu.edu;msu.edu;microsoft.com;psu.edu;microsoft.com;lanl.gov", "author_num": 6, "aff_unique_index": "0;1;2;0;2;3", "aff_unique_norm": "Pennsylvania State University;Michigan State University;Microsoft;Los Alamos National Laboratory", "aff_unique_dep": ";;Microsoft Corporation;", "aff_unique_url": "https://www.psu.edu;https://www.msu.edu;https://www.microsoft.com;https://www.lanl.gov", "aff_unique_abbr": "PSU;MSU;Microsoft;LANL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Disentangled Representation by Exploiting Pretrained Generative Models: A Contrastive Learning View", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5952", "id": "j-63FSNcO5a", "poster": "", "openreview": "https://openreview.net/forum?id=j-63FSNcO5a", "slides": "https://iclr.cc/virtual/2022/poster/5952", "video": "https://iclr.cc/virtual/2022/poster/5952", "author_site": "Xuanchi Ren, Tao Yang, Yuwang Wang, Wenjun Zeng", "tldr": "", "abstract": "From the intuitive notion of disentanglement, the image variations corresponding to different generative factors should be distinct from each other, and the disentangled representation should reflect those variations with separate dimensions. To discover the generative factors and learn disentangled representation, previous methods typically leverage an extra regularization term when learning to generate realistic images. However, the term usually results in a trade-off between disentanglement and generation quality. For the generative models pretrained without any disentanglement term, the generated images show semantically meaningful variations when traversing along different directions in the latent space. Based on this observation, we argue that it is possible to mitigate the trade-off by (i) leveraging the pretrained generative models with high generation quality, and (ii) focusing on discovering the traversal directions as generative factors for disentangled representation learning. To achieve this, we propose Disentanglement via Contrast (DisCo) as a framework to model the variations based on the target disentangled representations, and contrast the variations to jointly discover disentangled directions and learn disentangled representations. DisCo achieves state-of-the-art disentangled representation learning and distinct direction discovery, given pretrained non-disentangled generative models including GAN, VAE, and Flow. 
Source code is at https://github.com/xrenaa/DisCo.", "keywords": "Latent space discovery;Disentangled representation learning;Generative models;Contrastive learning", "primary_area": "", "supplementary_material": "", "author": "Xuanchi Ren;Tao Yang;Yuwang Wang;Wenjun Zeng", "authorids": "~Xuanchi_Ren1;~Tao_Yang9;~Yuwang_Wang3;~Wenjun_Zeng3", "gender": "M;M;M;M", "homepage": "https://xuanchiren.com/;https://github.com/ThomasMrY;;https://www.eias.ac.cn/h-col-187.html", "dblp": "255/5432;;161/2633;57/145", "google_scholar": "fDHUk18AAAAJ;https://scholar.google.com.hk/citations?user=qT5psCEAAAAJ;;_cUfvYQAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xuanchi_Ren1;~Tao_Yang9;~Yuwang_Wang3;~Wenjun_Zeng3", "aff": "Hong Kong University of Science and Technology;Xi'an Jiaotong University;Microsoft Research Asia;Eastern Institute for Advanced Study", "aff_domain": "hkust.edu;xjtu.edu.cn;microsoft.com;eias.ac.cn", "position": "Undergrad student;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nren2022learning,\ntitle={Learning Disentangled Representation by Exploiting Pretrained Generative Models: A Contrastive Learning View},\nauthor={Xuanchi Ren and Tao Yang and Yuwang Wang and Wenjun Zeng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=j-63FSNcO5a}\n}", "github": "", "project": "", "reviewers": "sBQs;j95X;3Z9R;Go6R", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;5;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "30;70;30;95", "wc_summary_review": "518;30;17;98", "wc_main_review": "75;157;117;390", "wc_review": "623;257;164;583", "wc_reply_reviewers": "35;0;0;0", "wc_reply_authors": "1127;279;39;584", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 56.25, 27.698149757700424 ], "wc_summary_review_avg": [ 165.75, 205.68467978923468 ], "wc_main_review_avg": [ 184.75, 121.9966700365219 ], "wc_review_avg": [ 406.75, 199.4873116265794 ], "wc_reply_reviewers_avg": [ 8.75, 15.155444566227676 ], "wc_reply_authors_avg": [ 507.25, 406.61306853075934 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5205978200663209990&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=j-63FSNcO5a", "email": "hkust.edu;xjtu.edu.cn;microsoft.com;eias.ac.cn", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Hong Kong University of Science and Technology;Xi'an Jiao Tong University;Microsoft;Eastern Institute for Advanced Study", "aff_unique_dep": ";;Research;", "aff_unique_url": "https://www.ust.hk;https://www.xjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia;", "aff_unique_abbr": "HKUST;XJTU;MSR Asia;", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Hong Kong SAR;;Asia", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China;" }, { "id": "j30wC0JM39Q", "title": "Why do embedding spaces look as they do?", 
"track": "main", "status": "Reject", "tldr": "", "abstract": "The power of embedding representations is a curious phenomenon. For embeddings to work effectively as feature representations, there must exist substantial latent structure inherent in the domain to be encoded. Language vocabularies and Wikipedia topics are human-generated structures that reflect how people organize their world, and what they find important. The structure of the resulting embedding spaces reflects the human evolution of language formation and the cultural processes shaping our world.\n\nThis paper studies what the observed structure of embeddings can tell us about the natural processes that generate new knowledge or concepts. We demonstrate that word and graph embeddings trained on standard datasets using several popular algorithms consistently share two distinct properties: (1) a decreasing neighbor frequency concentration with rank, and (2) specific clustering velocities and power-law based community structures.\nWe then assess a variety of generative models of embedding spaces by these criteria, and conclude that incremental insertion processes based on the Barab\u00e1si-Albert network generation process best model the observed phenomenon on language and network data.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingzhi Guo;Baojian Zhou;Haochen Chen;Sergiy Verstyuk;Steven Skiena", "authorids": "~Xingzhi_Guo1;bjzhou@fudan.edu.cn;haocchen@cs.stonybrook.edu;sergiy.verstyuk@gmail.com;~Steven_Skiena1", "gender": "M;;;;M", "homepage": "https://www.linkedin.com/in/xingzhi-guo;;;;https://www.cs.stonybrook.edu/~skiena", "dblp": ";;;;s/StevenSkiena.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Xingzhi_Guo1;bjzhou@fudan.edu.cn;haocchen@cs.stonybrook.edu;sergiy.verstyuk@gmail.com;~Steven_Skiena1", "aff": ", State University of New York, Stony Brook;;;;State University of New York at Stony Brook", "aff_domain": "cs.stonybrook.edu;;;;stonybrook.edu", "position": "PhD student;;;;Full Professor", "bibtex": "@misc{\nguo2022why,\ntitle={Why do embedding spaces look as they do?},\nauthor={Xingzhi Guo and Baojian Zhou and Haochen Chen and Sergiy Verstyuk and Steven Skiena},\nyear={2022},\nurl={https://openreview.net/forum?id=j30wC0JM39Q}\n}", "github": "", "project": "", "reviewers": "bApa;XDaN;rVxQ;6Hzd", "site": "https://openreview.net/forum?id=j30wC0JM39Q", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;2;3;4", "correctness": "2;4;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "36;60;65;123", "wc_summary_review": "37;26;25;234", "wc_main_review": "181;214;105;415", "wc_review": "254;300;195;772", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "812;553;321;1358", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.0, 31.960913628993776 ], "wc_summary_review_avg": [ 80.5, 88.74823941915693 ], "wc_main_review_avg": [ 228.75, 114.56521068806184 ], "wc_review_avg": [ 380.25, 229.2186456202898 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 761.0, 385.9643765945246 ], "reply_reviewers_avg": [ 0, 0 ], 
"reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EWU4C9tIQZ8J:scholar.google.com/&scioq=Why+do+embedding+spaces+look+as+they+do%3F&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;1", "aff_unique_norm": "State University of New York;State University of New York at Stony Brook", "aff_unique_dep": ";", "aff_unique_url": "https://www.stonybrook.edu;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;SUNY Stony Brook", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Fooling Explanations in Text Classifiers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5998", "id": "j3krplz_4w6", "poster": "", "openreview": "https://openreview.net/forum?id=j3krplz_4w6", "slides": "https://iclr.cc/virtual/2022/poster/5998", "video": "https://iclr.cc/virtual/2022/poster/5998", "author_site": "Adam Ivankay, Ivan Girardi, Chiara Marchiori, Pascal Frossard", "tldr": "", "abstract": "State-of-the-art text classification models are becoming increasingly reliant on deep neural networks (DNNs). Due to their black-box nature, faithful and robust explanation methods need to accompany classifiers for deployment in real-life scenarios. However, it has been shown that explanation methods in vision applications are susceptible to local, imperceptible perturbations that can significantly alter the explanations without changing the predicted classes. We show here that the existence of such perturbations extends to text classifiers as well. Specifically, we introduce TextExplanationFooler (TEF), a novel explanation attack algorithm that alters text input samples imperceptibly so that the outcome of widely-used explanation methods changes considerably while leaving classifier predictions unchanged. We evaluate the attribution robustness estimation performance of TEF on five text classification datasets, utilizing three DNN architectures and a transformer architecture for each dataset. By significantly decreasing the correlation between unchanged and perturbed input attributions, we show that all models and explanation methods are susceptible to TEF perturbations. Moreover, we evaluate how the perturbations transfer to other model architectures and attribution methods, finding better than random performance in scenarios where the exact attacked model and explanation method are unknown. Finally, we introduce a semi-universal attack that is able to compute fast, computationally light perturbations with no knowledge of the attacked classifier nor explanation method. 
Overall, our work shows that explanations in text classifiers are fragile and users need to carefully address their robustness before relying on them in critical applications.", "keywords": "robustness;explainability;text classification;natural language processing", "primary_area": "", "supplementary_material": "/attachment/1d0f5af96a654304bd1af18a953c4ce2a6634845.zip", "author": "Adam Ivankay;Ivan Girardi;Chiara Marchiori;Pascal Frossard", "authorids": "~Adam_Ivankay1;~Ivan_Girardi1;chi@zurich.ibm.com;~Pascal_Frossard1", "gender": ";M;;", "homepage": ";https://ch.linkedin.com/in/ivan-girardi-phd-939606119;;", "dblp": "227/2695.html;227/3149;;", "google_scholar": "ZD-tg0kAAAAJ;;;", "orcid": "0000-0002-9140-0813;;;", "linkedin": ";;;", "or_profile": "~Adam_Ivankay1;~Ivan_Girardi1;chi@zurich.ibm.com;~Pascal_Frossard1", "aff": "Swiss Federal Institute of Technology Lausanne;International Business Machines;;", "aff_domain": "epfl.ch;ibm.com;;", "position": "PhD student;Researcher;;", "bibtex": "@inproceedings{\nivankay2022fooling,\ntitle={Fooling Explanations in Text Classifiers},\nauthor={Adam Ivankay and Ivan Girardi and Chiara Marchiori and Pascal Frossard},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=j3krplz_4w6}\n}", "github": "", "project": "", "reviewers": "p6VB;ZcSr;ZvQi", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "102;81;131", "wc_summary_review": "72;37;17", "wc_main_review": "144;211;267", "wc_review": "318;329;415", "wc_reply_reviewers": "0;0;250", "wc_reply_authors": "399;183;714", "reply_reviewers": "0;0;2", "reply_authors": "1;1;2", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 104.66666666666667, 20.49932248202906 ], "wc_summary_review_avg": [ 42.0, 22.73030282830976 ], "wc_main_review_avg": [ 207.33333333333334, 50.28143019268865 ], "wc_review_avg": [ 354.0, 43.36665385600631 ], "wc_reply_reviewers_avg": [ 83.33333333333333, 117.85113019775793 ], "wc_reply_authors_avg": [ 432.0, 218.03210772727948 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3047240786356809659&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=j3krplz_4w6", "email": "epfl.ch;ibm.com;;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.ibm.com", "aff_unique_abbr": "EPFL;IBM", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;United States" }, { "id": "j8J97VgdmsT", "title": "FLAME-in-NeRF: Neural control of Radiance Fields for Free View Face Animation", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper presents a neural rendering method for controllable portrait video 
synthesis. Recent advances in volumetric neural rendering, such as neural radiance fields (NeRF), have enabled the photorealistic novel view synthesis of static scenes with impressive results. However, modeling dynamic and controllable objects as part of a scene with such scene representations is still challenging. \nIn this work, we design a system that enables 1) novel view synthesis for portrait video, of both the human subject and the scene they are in, and 2) explicit control of the facial expressions through a low-dimensional expression representation. \nWe represent the distribution of human facial expressions using the expression parameters of a 3D Morphable Model (3DMM) and condition the NeRF volumetric function on them. \nFurthermore, we impose a spatial prior, brought by 3DMM fitting, to guide the network to learn disentangled control for static scene appearance and dynamic facial actions. We show the effectiveness of our method on free view synthesis of portrait videos with expression controls. To train on a scene, our method only requires a short video of a subject captured by a mobile device.", "keywords": "Neural Rendering;Facial Reanimation;3D Scene Priors", "primary_area": "", "supplementary_material": "/attachment/c5d717e4fab3eb01d33ac5e6ed29d2890c8b34e1.zip", "author": "ShahRukh Athar;Zhixin Shu;Dimitris Samaras", "authorids": "~ShahRukh_Athar1;~Zhixin_Shu1;~Dimitris_Samaras3", "gender": ";M;M", "homepage": "http://shahrukhathar.github.io/;https://zhixinshu.github.io/;https://www.cs.stonybrook.edu/~samaras/", "dblp": "79/9032;129/3987;s/DimitrisSamaras", "google_scholar": "mdUv8wcAAAAJ;gp6HUP0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-1373-0294", "linkedin": ";;", "or_profile": "~ShahRukh_Athar1;~Zhixin_Shu1;~Dimitris_Samaras3", "aff": "State University of New York, Stony Brook;Adobe Systems;Stony Brook University", "aff_domain": "stonybrook.edu;adobe.com;cs.stonybrook.edu", "position": "PhD student;Researcher;Full Professor", "bibtex": "@misc{\nathar2022flameinnerf,\ntitle={{FLAME}-in-Ne{RF}: Neural control of Radiance Fields for Free View Face Animation},\nauthor={ShahRukh Athar and Zhixin Shu and Dimitris Samaras},\nyear={2022},\nurl={https://openreview.net/forum?id=j8J97VgdmsT}\n}", "github": "", "project": "", "reviewers": "A3Tv;G8XN;SS8Y;z2xe", "site": "https://openreview.net/forum?id=j8J97VgdmsT", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;0;3;2", "wc_summary_paper": "68;105;67;96", "wc_summary_review": "143;26;68;60", "wc_main_review": "354;117;496;613", "wc_review": "565;248;631;769", "wc_reply_reviewers": "108;0;204;0", "wc_reply_authors": "494;0;493;386", "reply_reviewers": "1;0;2;0", "reply_authors": "2;0;3;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 84.0, 16.80773631397161 ], "wc_summary_review_avg": [ 74.25, 42.71050807471154 ], "wc_main_review_avg": [ 395.0, 184.85805365198456 ], "wc_review_avg": [ 553.25, 190.99001937274105 ], "wc_reply_reviewers_avg": [ 78.0, 85.06468127254695 ], "wc_reply_authors_avg": [ 343.25, 202.97706151188612 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 1.118033988749895 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 
0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2348174675069057607&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "State University of New York;Adobe;Stony Brook University", "aff_unique_dep": ";Adobe Systems Incorporated;", "aff_unique_url": "https://www.stonybrook.edu;https://www.adobe.com;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;Adobe;SBU", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "j97zf-nLhC", "title": "Zero-Shot Coordination via Semantic Relationships Between Actions and Observations", "track": "main", "status": "Reject", "tldr": "", "abstract": "An unaddressed challenge in zero-shot coordination is to take advantage of the semantic relationship between the features of an action and the features of observations. Humans take advantage of these relationships in highly intuitive ways. For instance, in the absence of a shared language, we might point to the object we desire or hold up fingers to indicate how many objects we want. To address this challenge, we investigate the effect of network architecture on the propensity of learning algorithms to make use of these relationships in human-compatible ways. We find that attention-based architectures that jointly process a featurized representation of the observation and the action have a better inductive bias for exploiting semantic relationships for zero-shot coordination. Excitingly, in a set of diagnostic tasks, these agents produce highly human-compatible policies, without requiring the symmetry relationships of the problems to be hard-coded.", "keywords": "multi-agent communication;multi-agent reinforcement learning;attention mechanism;zero-shot coordination", "primary_area": "", "supplementary_material": "/attachment/5e874c3241bee678467b0e681836a4746f688a1c.zip", "author": "Mingwei Ma;Jizhou Liu;Samuel Sokota;Max Kleiman-Weiner;Jakob Nicolaus Foerster", "authorids": "~Mingwei_Ma1;jliu32@chicagobooth.edu;~Samuel_Sokota1;~Max_Kleiman-Weiner1;~Jakob_Nicolaus_Foerster1", "gender": "M;;M;Unspecified;M", "homepage": ";;https://ssokota.github.io/;http://www.mit.edu/~maxkw/;https://www.jakobfoerster.com", "dblp": ";;243/5881;160/7595;176/5095", "google_scholar": ";;;SACXQKYAAAAJ;6z4lQzMAAAAJ", "orcid": ";;;;", "linkedin": "marcus-mingwei-ma/;;samuel-sokota-87a153149/;;", "or_profile": "~Mingwei_Ma1;jliu32@chicagobooth.edu;~Samuel_Sokota1;~Max_Kleiman-Weiner1;~Jakob_Nicolaus_Foerster1", "aff": "University of Chicago;;Carnegie Mellon University;Common Sense Machines;University of Oxford, University of Oxford", "aff_domain": "uchicago.edu;;cmu.edu;csm.ai;eng.ox.ac.uk", "position": "PhD student;;PhD student;Principal Researcher;Associate Professor", "bibtex": "@misc{\nma2022zeroshot,\ntitle={Zero-Shot Coordination via Semantic Relationships Between Actions and Observations},\nauthor={Mingwei Ma and Jizhou Liu and Samuel Sokota and Max Kleiman-Weiner and Jakob Nicolaus Foerster},\nyear={2022},\nurl={https://openreview.net/forum?id=j97zf-nLhC}\n}", "github": "", "project": "", "reviewers": "JS7v;CXsY;Ba9g;rNES", "site": "https://openreview.net/forum?id=j97zf-nLhC", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "2;4;3;3", "correctness": "3;4;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "137;57;64;75", 
"wc_summary_review": "22;57;37;183", "wc_main_review": "331;163;154;432", "wc_review": "490;277;255;690", "wc_reply_reviewers": "50;0;44;0", "wc_reply_authors": "1189;565;592;1280", "reply_reviewers": "1;0;1;0", "reply_authors": "3;2;2;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 83.25, 31.68891762114951 ], "wc_summary_review_avg": [ 74.75, 63.71960059510731 ], "wc_main_review_avg": [ 270.0, 117.12173154457716 ], "wc_review_avg": [ 428.0, 176.93077742439272 ], "wc_reply_reviewers_avg": [ 23.5, 23.595550427993835 ], "wc_reply_authors_avg": [ 906.5, 329.71237465403084 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DLjeth2XSYkJ:scholar.google.com/&scioq=Zero-Shot+Coordination+via+Semantic+Relationships+Between+Actions+and+Observations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Chicago;Carnegie Mellon University;Common Sense Machines;University of Oxford", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uchicago.edu;https://www.cmu.edu;;https://www.ox.ac.uk", "aff_unique_abbr": "UChicago;CMU;;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "jDK19MUBT4_", "title": "TailMix: Overcoming the Label Sparsity for Extreme Multi-label Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Extreme multi-label classification (XMC) aims at finding the most relevant labels from a huge label set at the industrial scale. The XMC problem inherently poses two challenges: data scalability and label sparsity. This work introduces a new augmentation method, namely TailMix, to address the label sparsity issue, i.e., the long-tail labels in XMC have few positive instances. TailMix utilizes the context vector generated from the label attention layer in a label-wise manner instead of using the existing Mixup methods in a sample-wise manner. In this process, TailMix selectively chooses two context vectors and augments the most plausible positive instances to improve the accuracy for long-tail labels. Despite the simplicity of TailMix, extensive experimental results show that TailMix consistently improves the baseline models without TailMix and other Mixup-based methods on three benchmark datasets. 
Notably, TailMix is effective at improving performance on long-tail labels in terms of PSP@k and PSN@k, the common metrics that reflect the propensity of labels.", "keywords": "NLP;classification;extreme multi-label learning;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Sangwoo Han;Chan Lim;Jongwuk Lee", "authorids": "~Sangwoo_Han1;~Chan_Lim1;~Jongwuk_Lee1", "gender": "M;M;M", "homepage": "https://github.com/uoo723;;", "dblp": ";58/2568;04/3445", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sangwoo_Han1;~Chan_Lim1;~Jongwuk_Lee1", "aff": "Sungkyunkwan University;Sungkyunkwan University;Sungkyunkwan University", "aff_domain": "skku.edu;skku.edu;skku.edu", "position": "MS student;MS student;Associate Professor", "bibtex": "@misc{\nhan2022tailmix,\ntitle={TailMix: Overcoming the Label Sparsity for Extreme Multi-label Classification},\nauthor={Sangwoo Han and Chan Lim and Jongwuk Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=jDK19MUBT4_}\n}", "github": "", "project": "", "reviewers": "ex5A;8mQr;XiMF;BSRL", "site": "https://openreview.net/forum?id=jDK19MUBT4_", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "93;72;107;166", "wc_summary_review": "33;33;65;42", "wc_main_review": "250;333;343;818", "wc_review": "376;438;515;1026", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 109.5, 34.91776052383658 ], "wc_summary_review_avg": [ 43.25, 13.083864108129525 ], "wc_main_review_avg": [ 436.0, 223.4826615198593 ], "wc_review_avg": [ 588.75, 257.2035915379099 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:y6WPyK2ZcLoJ:scholar.google.com/&scioq=TailMix:+Overcoming+the+Label+Sparsity+for+Extreme+Multi-label+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Sungkyunkwan University", "aff_unique_dep": "", "aff_unique_url": "https://www.skku.edu", "aff_unique_abbr": "SKKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "jE_ipyh20rb", "title": "FedProf: Selective Federated Learning with Representation Profiling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Learning (FL) has shown great potential as a privacy-preserving solution to learning from decentralized data that are only accessible to end devices (i.e., clients). In many scenarios, however, a large proportion of the clients are probably in possession of low-quality data that are biased, noisy or even irrelevant. As a result, they could significantly slow down the convergence of the global model we aim to build and also compromise its quality. 
In light of this, we propose FedProf, a novel algorithm for optimizing FL under such circumstances without breaching data privacy. The key to our approach is a data representation profiling and matching scheme that uses the global model to dynamically profile data representations and allows for low-cost, lightweight representation matching. Based on this scheme, we adaptively score each client and adjust its participation probability so as to mitigate the impact of low-value clients on the training process. We have conducted extensive experiments on public datasets using various FL settings. The results show that FedProf effectively reduces the number of communication rounds and overall time (up to 4.5x speedup) for the global model to converge and provides accuracy gains.", "keywords": "federated learning;neural network;representation learning;distributed computing", "primary_area": "", "supplementary_material": "/attachment/7c0974bf7fa88ce3a1b04d7b463ca8c770d6376c.zip", "author": "Wentai Wu;Ligang He;Weiwei Lin;carsten maple;Rui Mao", "authorids": "~Wentai_Wu1;~Ligang_He1;~Weiwei_Lin1;~carsten_maple1;~Rui_Mao2", "gender": "M;M;M;;M", "homepage": "https://wingter562.github.io/wentai_homepage/;https://www.dcs.warwick.ac.uk/~liganghe/;https://www.scholat.com/linweiwei;https://warwick.ac.uk/fac/sci/wmg/people/profile/?wmgid=1102;https://www.sics.ac.cn/mao/szu/eng/", "dblp": ";36/5655;53/282-1;05/2263.html;51/5793", "google_scholar": "hyTiOb0AAAAJ;https://scholar.google.co.uk/citations?user=g3lthZYAAAAJ;IWsha94AAAAJ;8MMdv50AAAAJ;", "orcid": ";;0000-0001-6876-1795;0000-0002-4715-212X;", "linkedin": ";;;;", "or_profile": "~Wentai_Wu1;~Ligang_He1;~Weiwei_Lin1;~carsten_maple1;~Rui_Mao2", "aff": "University of Warwick;The University of Warwick;South China University of Technology;The university of Warwick;Shenzhen University", "aff_domain": "warwick.ac.uk;warwick.ac.uk;scut.edu.cn;warwick.ac.uk;szu.edu.cn", "position": "PhD student;Assistant professor, Associate professor, Reader;Full Professor;Full Professor;Full Professor", "bibtex": "@misc{\nwu2022fedprof,\ntitle={FedProf: Selective Federated Learning with Representation Profiling},\nauthor={Wentai Wu and Ligang He and Weiwei Lin and carsten maple and Rui Mao},\nyear={2022},\nurl={https://openreview.net/forum?id=jE_ipyh20rb}\n}", "github": "", "project": "", "reviewers": "pcRb;kUXa;1UFK;5Q6Y", "site": "https://openreview.net/forum?id=jE_ipyh20rb", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;4;3", "correctness": "3;3;4;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "24;144;44;40", "wc_summary_review": "24;42;17;42", "wc_main_review": "191;343;177;134", "wc_review": "239;529;238;216", "wc_reply_reviewers": "173;102;11;56", "wc_reply_authors": "541;959;234;429", "reply_reviewers": "1;2;1;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 63.0, 47.36032094485847 ], "wc_summary_review_avg": [ 31.25, 11.031205736455105 ], "wc_main_review_avg": [ 211.25, 78.9125306906324 ], "wc_review_avg": [ 305.5, 129.36479428345257 ], "wc_reply_reviewers_avg": [ 85.5, 59.89365575751742 ], "wc_reply_authors_avg": [ 540.75, 265.2907602989595 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], 
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.2294157338705618, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6145511315104313130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "University of Warwick;South China University of Technology;Shenzhen University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.warwick.ac.uk;https://www.scut.edu.cn;https://www.szu.edu.cn", "aff_unique_abbr": "Warwick;SCUT;SZU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "United Kingdom;China" }, { "id": "jFfRcKVut98", "title": "Learning Equivariances and Partial Equivariances From Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Group equivariant Convolutional Neural Networks (G-CNNs) constrain features to respect the chosen symmetries, and lead to better generalization when these symmetries appear in the data. However, if the chosen symmetries are not present, group equivariant architectures lead to overly constrained models and worse performance. Frequently, the distribution of the data can be better represented by a subset of a group than by the group as a whole, e.g., rotations in $[-90^{\\circ}, 90^{\\circ}]$. In such cases, a model that respects equivariance partially is better suited to represent the data. Moreover, relevant symmetries may differ for low and high-level features, e.g., edge orientations in a face, and face poses relative to the camera. As a result, the optimal level of equivariance may differ per layer. In this work, we introduce Partial G-CNNs: a family of equivariant networks able to learn partial and full equivariances from data at every layer end-to-end. Partial G-CNNs retain full equivariance whenever beneficial, e.g., for rotated MNIST, but are able to restrict it whenever it becomes harmful, e.g., for 6 / 9 or natural image classification. Partial G-CNNs perform on par with G-CNNs when full equivariance is necessary, and outperform them otherwise. Our method is applicable to discrete groups, continuous groups and combinations thereof.", "keywords": "group equivariance;learning equivariances from data;partial equivariance;group convolutional networks.", "primary_area": "", "supplementary_material": "", "author": "David W. Romero;Suhas Lohit", "authorids": "~David_W._Romero1;~Suhas_Lohit1", "gender": "M;", "homepage": "https://davidwromero.xyz/;http://suhaslohit.github.io", "dblp": "254/1396;169/9097", "google_scholar": "7tdzmVoAAAAJ;GMRYY5cAAAAJ", "orcid": ";", "linkedin": "david-w-romero-05893567/;", "or_profile": "~David_W._Romero1;~Suhas_Lohit1", "aff": "Vrije Universiteit Amsterdam;Mitsubishi Electric Research Labs", "aff_domain": "vu.nl;merl.com", "position": "PhD student;Researcher", "bibtex": "@misc{\nromero2022learning,\ntitle={Learning Equivariances and Partial Equivariances From Data},\nauthor={David W. 
Romero and Suhas Lohit},\nyear={2022},\nurl={https://openreview.net/forum?id=jFfRcKVut98}\n}", "github": "", "project": "", "reviewers": "xU2c;8mxH;3sxh;RivC", "site": "https://openreview.net/forum?id=jFfRcKVut98", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;3;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "65;64;97;84", "wc_summary_review": "47;25;60;30", "wc_main_review": "671;180;631;582", "wc_review": "783;269;788;696", "wc_reply_reviewers": "259;103;244;487", "wc_reply_authors": "1965;960;3041;1922", "reply_reviewers": "1;1;1;2", "reply_authors": "3;3;5;4", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 77.5, 13.793114224133722 ], "wc_summary_review_avg": [ 40.5, 13.901438774457844 ], "wc_main_review_avg": [ 516.0, 196.5337121208471 ], "wc_review_avg": [ 634.0, 213.88431452539945 ], "wc_reply_reviewers_avg": [ 273.25, 137.59791967904167 ], "wc_reply_authors_avg": [ 1972.0, 736.4533250654789 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1", "aff_unique_norm": "Vrije Universiteit Amsterdam;Mitsubishi Electric Research Laboratories", "aff_unique_dep": ";", "aff_unique_url": "https://www.vu.nl;https://www.merl.com", "aff_unique_abbr": "VU Amsterdam;MERL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Netherlands;United States" }, { "id": "jFlWZEv6dv", "title": "AlignMix: Improving representations by interpolating aligned features", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Mixup is a powerful data augmentation method that interpolates between two or more examples in the input or feature space and between the corresponding target labels. However, how to best interpolate images is not well defined. Recent mixup methods overlay or cut-and-paste two or more objects into one image, which needs care in selecting regions. Mixup has also been connected to autoencoders, because often autoencoders generate an image that continuously deforms into another. However, such images are typically of low quality.\n\nIn this work, we revisit mixup from the deformation perspective and introduce AlignMix, where we geometrically align two images in the feature space. The correspondences allow us to interpolate between two sets of features, while keeping the locations of one set. Interestingly, this retains mostly the geometry or pose of one image and the appearance or texture of the other. We also show that an autoencoder can still improve representation learning under mixup, without the classifier ever seeing decoded images. 
AlignMix outperforms state-of-the-art mixup methods on five different benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shashanka Venkataramanan;Ewa Kijak;laurent amsaleg;Yannis Avrithis", "authorids": "~Shashanka_Venkataramanan2;~Ewa_Kijak1;~laurent_amsaleg1;~Yannis_Avrithis2", "gender": "M;;;", "homepage": "https://shashankvkt.github.io/;;;https://avrithis.net/", "dblp": "218/8893;;a/LAmsaleg;a/YSAvrithis", "google_scholar": "CbfH47IAAAAJ;;;AF2SxG0AAAAJ", "orcid": ";;;0000-0001-7476-4482", "linkedin": "shashank-venkataramanan-1b2b9993/;;;yannisavrithis/", "or_profile": "~Shashanka_Venkataramanan2;~Ewa_Kijak1;~laurent_amsaleg1;~Yannis_Avrithis2", "aff": "INRIA;;IRISA;Athena RC", "aff_domain": "inria.fr;;irisa.fr;athenarc.gr", "position": "PhD student;;researcher;Research Director", "bibtex": "@misc{\nvenkataramanan2022alignmix,\ntitle={AlignMix: Improving representations by interpolating aligned features},\nauthor={Shashanka Venkataramanan and Ewa Kijak and laurent amsaleg and Yannis Avrithis},\nyear={2022},\nurl={https://openreview.net/forum?id=jFlWZEv6dv}\n}", "github": "", "project": "", "reviewers": "q3eD;ifDL;Epcf", "site": "https://openreview.net/forum?id=jFlWZEv6dv", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;5;4", "correctness": "3;4;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "77;91;54", "wc_summary_review": "63;24;49", "wc_main_review": "216;240;526", "wc_review": "356;355;629", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 74.0, 15.253414918196734 ], "wc_summary_review_avg": [ 45.333333333333336, 16.131404843417148 ], "wc_main_review_avg": [ 327.3333333333333, 140.81982183706322 ], "wc_review_avg": [ 446.6666666666667, 128.92978278461842 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15402867223438087088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;2", "aff_unique_norm": "INRIA;Institut de Recherche en Informatique et Automatique;Athena RC", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;https://www.irisa.fr;", "aff_unique_abbr": "INRIA;IRISA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France;" }, { "id": "jGmNTfiXwGb", "title": "Learning Predictive, Online Approximations of Explanatory, Offline Algorithms", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we introduce a general methodology for approximating offline algorithms in online settings. By encoding the behavior of offline algorithms in graphs, we train a multi-task learning model to simultaneously detect behavioral structures which have already occurred and predict those that may come next. 
We demonstrate the methodology on both synthetic data and historical stock market data, where the contrast between explanation and prediction is particularly stark. Taken together, our work represents the first general and end-to-end differentiable approach for generating online approximations of offline algorithms.", "keywords": "multi-task learning;machine learning;online algorithms;offline algorithms", "primary_area": "", "supplementary_material": "", "author": "Mattson Thieme;Ammar Gilani;Han Liu", "authorids": "~Mattson_Thieme1;~Ammar_Gilani1;~Han_Liu4", "gender": "M;M;", "homepage": "https://mattsonthieme.github.io;https://www.northwestern.edu/;", "dblp": ";225/5425;", "google_scholar": "SAjppGoAAAAJ;;", "orcid": ";;", "linkedin": "mattsonthieme/;;", "or_profile": "~Mattson_Thieme1;~Ammar_Gilani1;~Han_Liu4", "aff": "Northwestern University;;Northwestern University", "aff_domain": "northwestern.edu;;u.northwestern.edu", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\nthieme2022learning,\ntitle={Learning Predictive, Online Approximations of Explanatory, Offline Algorithms},\nauthor={Mattson Thieme and Ammar Gilani and Han Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=jGmNTfiXwGb}\n}", "github": "", "project": "", "reviewers": "gii5;EgW9;s1H9;aaFn;zoKR", "site": "https://openreview.net/forum?id=jGmNTfiXwGb", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "4;4;2;2;3", "correctness": "1;1;3;4;2", "technical_novelty": "3;3;4;3;3", "empirical_novelty": "0;2;4;3;3", "wc_summary_paper": "129;158;50;53;122", "wc_summary_review": "53;105;39;81;37", "wc_main_review": "813;430;215;536;305", "wc_review": "995;693;304;670;464", "wc_reply_reviewers": "40;0;0;0;0", "wc_reply_authors": "278;168;116;351;261", "reply_reviewers": "1;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.0, 0.8944271909999159 ], "correctness_avg": [ 2.2, 1.16619037896906 ], "technical_novelty_avg": [ 3.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 1.3564659966250536 ], "wc_summary_paper_avg": [ 102.4, 43.28787359064892 ], "wc_summary_review_avg": [ 63.0, 26.229754097208 ], "wc_main_review_avg": [ 459.8, 207.52580562426448 ], "wc_review_avg": [ 625.2, 233.45697676445656 ], "wc_reply_reviewers_avg": [ 8.0, 16.0 ], "wc_reply_authors_avg": [ 234.8, 83.2235543581263 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8242255917447339, "corr_recommendation_correctness": 0.8091547798786779, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:f8oCGCgAnnEJ:scholar.google.com/&scioq=Learning+Predictive,+Online+Approximations+of+Explanatory,+Offline+Algorithms&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "jJJWwrMrEsx", "title": "Truth Table Deep Convolutional Neural Network, A New SAT-Encodable Architecture - Application To Complete Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "\nWith the expanding role of neural networks, the need for formal verification of their behavior, interpretability and human post-processing has become 
critical in many applications. In 2018, it was shown that Binary Neural Networks (BNNs) have an equivalent representation in boolean logic and can be formally analyzed using logical reasoning tools such as SAT or MaxSAT solvers. This formulation is powerful as it allows us to address a vast range of questions: existential, probabilistic, explanation generation, etc. However, to date, only BNNs can be transformed into a SAT formula, and their strong binary constraints limit their natural accuracy. Moreover, the corresponding SAT conversion method intrinsically leads to formulas with a large number of variables and clauses, impeding interpretability as well as formal verification scalability. In this work, we introduce Truth Table Deep Convolutional Neural Networks (TT-DCNNs), a new family of SAT-encodable models featuring real-valued weights and real intermediate values as well as a highly interpretable conversion method. The TT-DCNN architecture enables, for the first time, all the logical classification rules to be extracted from a performant neural network, which can then be easily interpreted by anyone familiar with the domain. Therefore, this allows integrating human knowledge in post-processing as well as enumerating all possible inputs/outputs prior to deployment in production. We believe our new architecture bridges the gap between eXplainable AI (XAI) and formal verification. First, we experimentally show that TT-DCNNs offer a better tradeoff between natural accuracy and formal verification than BNNs. Then, in the robustness verification setting, we demonstrate that TT-DCNNs outperform BNNs in verifiable accuracy with comparable computation time. Finally, we also drastically decrease the number of clauses and variables, enabling the usage of general SAT solvers and exact model counting solvers. Our real-valued network has general applications, and we believe that its demonstrated robustness constitutes a suitable response to the rising demand for functional formal verification. 
", "keywords": "AI Safety;SAT-encodable Neural Network;Formal Verification;Complete Verification Robustness;Interpretability;Logic Rules;XAI", "primary_area": "", "supplementary_material": "", "author": "Adrien Benamira;Thomas Peyrin;Bryan Hooi", "authorids": "~Adrien_Benamira1;thomas.peyrin@ntu.edu.sg;~Bryan_Hooi1", "gender": "M;;", "homepage": ";;http://bhooi.github.io", "dblp": ";;169/9975", "google_scholar": "k3jLkWUAAAAJ;;", "orcid": ";;0000-0002-5645-1754", "linkedin": ";;", "or_profile": "~Adrien_Benamira1;thomas.peyrin@ntu.edu.sg;~Bryan_Hooi1", "aff": "Nanyang Technological University;;National University of Singapore", "aff_domain": "ntu.edu.sg;;nus.edu.sg", "position": "PhD student;;Assistant Professor", "bibtex": "@misc{\nbenamira2022truth,\ntitle={Truth Table Deep Convolutional Neural Network, A New {SAT}-Encodable Architecture - Application To Complete Robustness},\nauthor={Adrien Benamira and Thomas Peyrin and Bryan Hooi},\nyear={2022},\nurl={https://openreview.net/forum?id=jJJWwrMrEsx}\n}", "github": "", "project": "", "reviewers": "wThs;RhEH;UdYz;garj", "site": "https://openreview.net/forum?id=jJJWwrMrEsx", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "3;2;2;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "156;34;116;90", "wc_summary_review": "98;52;89;86", "wc_main_review": "275;340;370;278", "wc_review": "529;426;575;454", "wc_reply_reviewers": "0;54;538;0", "wc_reply_authors": "802;709;914;802", "reply_reviewers": "0;1;2;0", "reply_authors": "1;1;3;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 99.0, 44.28317965096906 ], "wc_summary_review_avg": [ 81.25, 17.455300054711177 ], "wc_main_review_avg": [ 315.75, 40.671703922997864 ], "wc_review_avg": [ 496.0, 59.14811915860047 ], "wc_reply_reviewers_avg": [ 148.0, 226.24323194296886 ], "wc_reply_authors_avg": [ 806.75, 72.63392802265344 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4334436484667504212&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Nanyang Technological University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.nus.edu.sg", "aff_unique_abbr": "NTU;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "Defending Against Image Corruptions Through Adversarial Augmentations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6715", "id": "jJOjjiZHy3h", "poster": "", "openreview": "https://openreview.net/forum?id=jJOjjiZHy3h", "slides": "https://iclr.cc/virtual/2022/poster/6715", "video": "https://iclr.cc/virtual/2022/poster/6715", "author_site": "Dan A. Calian, Florian Stimberg, Olivia Wiles, Sylvestre-Alvise Rebuffi, Andras Gyorgy, Timothy A Mann, Sven Gowal", "tldr": "", "abstract": "Modern neural networks excel at image classification, yet they remain vulnerable to common image corruptions such as blur, speckle noise or fog. 
Recent methods that focus on this problem, such as AugMix and DeepAugment, introduce defenses that operate in expectation over a distribution of image corruptions. In contrast, the literature on Lp-norm bounded perturbations focuses on defenses against worst-case corruptions. In this work, we reconcile both approaches by proposing AdversarialAugment, a technique which optimizes the parameters of image-to-image models to generate adversarially corrupted augmented images. We theoretically motivate our method and give sufficient conditions for the consistency of its idealized version as well as that of DeepAugment. Our classifiers improve upon the state-of-the-art on common image corruption benchmarks conducted in expectation on CIFAR-10-C and improve worst-case performance against Lp-norm bounded perturbations on both CIFAR-10 and ImageNet.", "keywords": "robustness;adversarial training;image corruptions", "primary_area": "", "supplementary_material": "", "author": "Dan Andrei Calian;Florian Stimberg;Olivia Wiles;Sylvestre-Alvise Rebuffi;Andr\u00e1s Gy\u00f6rgy;Timothy A Mann;Sven Gowal", "authorids": "~Dan_Andrei_Calian1;~Florian_Stimberg1;~Olivia_Wiles1;~Sylvestre-Alvise_Rebuffi1;~Andr\u00e1s_Gy\u00f6rgy2;~Timothy_A_Mann1;~Sven_Gowal2", "gender": ";M;;M;;M;", "homepage": ";;;;;http://www.timothyamann.com;", "dblp": ";57/11107;194/3191;190/7811;;;", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=XQzHJSgAAAAJ;swP3h24AAAAJ;;sIMkUXMAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Dan_Andrei_Calian1;~Florian_Stimberg1;~Olivia_Wiles1;~Sylvestre-Alvise_Rebuffi1;~Andr\u00e1s_Gy\u00f6rgy2;~Timothy_A_Mann1;~Sven_Gowal2", "aff": ";Google DeepMind;Google;Google DeepMind;;Google DeepMind;", "aff_domain": ";deepmind.com;google.com;deepmind.com;;deepmind.com;", "position": ";Researcher;Researcher;Researcher;;Researcher;", "bibtex": "@inproceedings{\ncalian2022defending,\ntitle={Defending Against Image Corruptions Through Adversarial Augmentations},\nauthor={Dan Andrei Calian and Florian Stimberg and Olivia Wiles and Sylvestre-Alvise Rebuffi and Andr{\\'a}s Gy{\\\"o}rgy and Timothy A Mann and Sven Gowal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=jJOjjiZHy3h}\n}", "github": "", "project": "", "reviewers": "31sh;AUQw;fuXu;w3th", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;5;3", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;0;3", "wc_summary_paper": "140;71;31;68", "wc_summary_review": "283;157;33;48", "wc_main_review": "615;654;405;128", "wc_review": "1038;882;469;244", "wc_reply_reviewers": "160;574;239;39", "wc_reply_authors": "2959;3617;1391;1349", "reply_reviewers": "1;4;1;1", "reply_authors": "6;8;4;4", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 77.5, 39.37321424522006 ], "wc_summary_review_avg": [ 130.25, 100.33786673036258 ], "wc_main_review_avg": [ 450.5, 208.89530870749587 ], "wc_review_avg": [ 658.25, 316.8961777932956 ], "wc_reply_reviewers_avg": [ 253.0, 198.54596445155968 ], "wc_reply_authors_avg": [ 2329.0, 986.9255291054133 ], "reply_reviewers_avg": [ 1.75, 1.299038105676658 ], "reply_authors_avg": [ 5.5, 1.6583123951777 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 
7, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9603472890595098693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=jJOjjiZHy3h", "email": ";deepmind.com;google.com;deepmind.com;;deepmind.com;", "author_num": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "jJWK09skiNl", "title": "Zero-shot detection of daily objects in YCB video dataset", "track": "main", "status": "Reject", "tldr": "", "abstract": "For robots to manipulate objects, they must first sense the locations of those objects. With the development of visual data collection and processing technology, robots are gradually evolving to localize objects in a greater field of view rather than being limited to a small space where the object could appear. To train such a robot vision system, pictures of all the objects need to be taken under various orientations and illumination conditions. In the traditional manufacturing environment, this is feasible, since the objects involved in the production process do not change frequently. However, in the vision of smart manufacturing and high-mix-low-volume production, the parts and products that robots handle may change frequently. Thus, it is unrealistic to re-train the vision system for every new product and task. In this situation, we see the need to introduce the concept of zero-shot object detection. Zero-shot object detection is a subset of unsupervised learning, and it aims to detect novel objects in an image with knowledge learned from, and only from, seen objects. With a zero-shot object detection algorithm, considerable time can be saved on collecting training data and training the vision system. Previous works focus on detecting objects in outdoor scenes, such as bikes, cars, people, and dogs. The detection of daily objects is actually more challenging, since the knowledge that can be learned from each object is very limited. In this work, we explore the zero-shot detection of daily objects in indoor scenes, since the objects\u2019 size and environment are closely related to the manufacturing setup. The YCB Video Dataset is used in this work, which contains 21 objects in various categories. 
To the best of our knowledge, no previous work has explored zero-shot detection at this object scale or on this dataset.", "keywords": "zero-shot learning;object detection;multi-label learning;attribute vector", "primary_area": "", "supplementary_material": "", "author": "Wanqing Xia", "authorids": "~Wanqing_Xia1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "wanqing-xia-x666", "or_profile": "~Wanqing_Xia1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nxia2022zeroshot,\ntitle={Zero-shot detection of daily objects in {YCB} video dataset},\nauthor={Wanqing Xia},\nyear={2022},\nurl={https://openreview.net/forum?id=jJWK09skiNl}\n}", "github": "", "project": "", "reviewers": "Ajjn;uZSA;kFeR;8vns;TYzZ", "site": "https://openreview.net/forum?id=jJWK09skiNl", "pdf_size": 0, "recommendation": "1;1;1;3;3", "confidence": "5;5;4;4;4", "correctness": "2;1;3;2;3", "technical_novelty": "1;1;1;1;1", "empirical_novelty": "1;1;1;1;2", "wc_summary_paper": "27;87;52;101;109", "wc_summary_review": "43;26;23;71;55", "wc_main_review": "306;326;90;295;360", "wc_review": "376;439;165;467;524", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 1.8, 0.9797958971132713 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 2.2, 0.7483314773547882 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.2, 0.4 ], "wc_summary_paper_avg": [ 75.2, 31.012255641923243 ], "wc_summary_review_avg": [ 43.6, 17.97331355092878 ], "wc_main_review_avg": [ 275.4, 95.30078698520805 ], "wc_review_avg": [ 394.2, 124.1118850070371 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.6666666666666665, "corr_recommendation_correctness": 0.32732683535398854, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14305878946206982225&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "jJis-v9Pzhj", "title": "Positive-Unlabeled Learning with Uncertainty-aware Pseudo-label Selection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Positive-unlabeled (PU) learning aims at learning a binary classifier from only positive and unlabeled training data. Recent approaches address this problem via cost-sensitive learning by developing unbiased loss functions, or via iterative pseudo-labeling solutions to further improve performance. However, two-step procedures are vulnerable to incorrectly estimated pseudo-labels, as errors are propagated in later iterations when a new model is trained on erroneous predictions. To mitigate this issue, we propose \textit{PUUPL}, a new loss-agnostic training procedure for PU learning that incorporates epistemic uncertainty in pseudo-labeling. Using an ensemble of neural networks and assigning pseudo-labels based on high-confidence predictions improves the reliability of pseudo-labels, increasing the predictive performance of our method and leading to new state-of-the-art results in PU learning. With extensive experiments, we show the effectiveness of our method over different datasets, modalities, and learning tasks, as well as improved robustness to misspecifications of hyper-parameters and biased positive data. 
The source code of the method and all the experiments are available in the supplementary material.", "keywords": "positive-unlabeled learning;semi-supervised learning;pseudo-labeling;deep ensembles;uncertainty quantification", "primary_area": "", "supplementary_material": "/attachment/acc03a40e0fcdb3531718ec75f174f6a8733e18e.zip", "author": "Emilio Dorigatti;Jann Goschenhofer;Benjamin Schubert;Mina Rezaei;Bernd Bischl", "authorids": "~Emilio_Dorigatti1;~Jann_Goschenhofer1;~Benjamin_Schubert1;~Mina_Rezaei1;~Bernd_Bischl1", "gender": ";;;F;M", "homepage": ";https://www.janngoschenhofer.com;https://www.helmholtz-muenchen.de/icb/research/groups/schubert-lab/overview/index.html;https://www.compstat.statistik.uni-muenchen.de/people/minar/;https://www.slds.stat.uni-muenchen.de/", "dblp": ";;;205/2767;48/5326", "google_scholar": ";IhTZ5ikAAAAJ;https://scholar.google.de/citations?hl=de;https://scholar.google.de/citations?hl=en;https://scholar.google.de/citations?user=s34UckkAAAAJ", "orcid": ";0000-0002-1251-459X;;0000-0001-6994-6345;0000-0001-6002-6980", "linkedin": ";;;mina-rezaei-b88a3a69/;", "or_profile": "~Emilio_Dorigatti1;~Jann_Goschenhofer1;~Benjamin_Schubert1;~Mina_Rezaei1;~Bernd_Bischl1", "aff": ";Fraunhofer IIS;Institute of computational biology;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;LMU", "aff_domain": ";iis.fraunhofer.de;helmholz-muenchen.de;lmu.de;uni-muenchen.de", "position": ";PhD student;Principal Researcher;Principal Researcher;Full Professor", "bibtex": "@misc{\ndorigatti2022positiveunlabeled,\ntitle={Positive-Unlabeled Learning with Uncertainty-aware Pseudo-label Selection},\nauthor={Emilio Dorigatti and Jann Goschenhofer and Benjamin Schubert and Mina Rezaei and Bernd Bischl},\nyear={2022},\nurl={https://openreview.net/forum?id=jJis-v9Pzhj}\n}", "github": "", "project": "", "reviewers": "kfAD;3wif;9jfD;G93N", "site": "https://openreview.net/forum?id=jJis-v9Pzhj", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "40;54;50;27", "wc_summary_review": "49;49;9;27", "wc_main_review": "255;288;190;155", "wc_review": "344;391;249;209", "wc_reply_reviewers": "0;467;0;0", "wc_reply_authors": "1136;1391;910;519", "reply_reviewers": "0;2;0;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 42.75, 10.425329730996522 ], "wc_summary_review_avg": [ 33.5, 16.75559608011604 ], "wc_main_review_avg": [ 222.0, 52.340233090806926 ], "wc_review_avg": [ 298.25, 72.60638746005753 ], "wc_reply_reviewers_avg": [ 116.75, 202.2169317836664 ], "wc_reply_authors_avg": [ 989.0, 320.2943958298365 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17153639149767447390&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Fraunhofer Institute for Integrated Circuits;Institute of Computational Biology;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig Maximilian University of Munich", 
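The selection rule sketched below illustrates the uncertainty-aware pseudo-labeling idea from the PUUPL abstract above: a deep ensemble scores each unlabeled example, and a pseudo-label is assigned only where the ensemble is confident and its members agree (low epistemic uncertainty). This is a minimal sketch under assumed conventions (binary PU task, one probability per ensemble member); the function name and thresholds are illustrative, not the authors' implementation.

```python
import numpy as np

def select_pseudo_labels(ensemble_probs, confidence=0.95, max_std=0.05):
    """Assign pseudo-labels only where the ensemble is confident and agrees.

    ensemble_probs: array of shape (n_models, n_samples) with P(y=1 | x).
    Returns (indices, labels) for the selected unlabeled examples.
    """
    mean_p = ensemble_probs.mean(axis=0)           # ensemble prediction
    std_p = ensemble_probs.std(axis=0)             # disagreement ~ epistemic uncertainty
    confident = np.maximum(mean_p, 1.0 - mean_p) >= confidence
    certain = std_p <= max_std
    selected = np.where(confident & certain)[0]
    labels = (mean_p[selected] >= 0.5).astype(int)
    return selected, labels

# toy usage: 5 ensemble members scoring 8 unlabeled points
rng = np.random.default_rng(0)
probs = np.clip(rng.normal(loc=[0.97, 0.5, 0.03, 0.9, 0.6, 0.02, 0.98, 0.4],
                           scale=0.02, size=(5, 8)), 0.0, 1.0)
idx, y = select_pseudo_labels(probs)
print(idx, y)  # only high-confidence, low-variance points receive labels
```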
"aff_unique_dep": ";Computational Biology;;", "aff_unique_url": "https://www.iis.fraunhofer.de/;;https://www.lmu.de;https://www.lmu.de", "aff_unique_abbr": "Fraunhofer IIS;;LMU;LMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany;" }, { "id": "jKzjSZYsrGP", "title": "SCformer: Segment Correlation Transformer for Long Sequence Time Series Forecasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Long-term time series forecasting is widely used in real-world applications such as financial investment, electricity management and production planning. Recently, transformer-based models with strong sequence modeling ability have shown the potential in this task. However, most of these methods adopt point-wise dependencies discovery, whose complexity increases quadratically with the length of time series, which easily becomes intractable for long-term prediction. This paper proposes a new Transformer-based model called SCformer, which replaces the canonical self-attention with efficient segment correlation attention (SCAttention) mechanism. SCAttention divides time series into segments by the implicit series periodicity and utilizes correlations between segments to capture long short-term dependencies. Besides, we design a dual task that restores past series with the predicted future series to make SCformer more stable. Extensive experiments on several datasets in various fields demonstrate that our SCformer outperforms other Transformer-based methods and training with the additional dual task can enhance the generalization ability of the prediction model.", "keywords": "Transformer;time series forecasting;sparse attention", "primary_area": "", "supplementary_material": "", "author": "Dazhao Du;Bing Su;Zhewei Wei", "authorids": "~Dazhao_Du1;~Bing_Su1;~Zhewei_Wei1", "gender": "M;M;M", "homepage": "https://github.com/ddz16;https://gsai.ruc.edu.cn/bingsu;http://weizhewei.com", "dblp": ";41/5270-1;94/4260", "google_scholar": ";https://scholar.google.com.sg/citations?user=d3g2VJQAAAAJ;https://scholar.google.com.hk/citations?user=qZ7dj4gAAAAJ", "orcid": ";0000-0001-8560-1910;0000-0003-3620-5086", "linkedin": ";;", "or_profile": "~Dazhao_Du1;~Bing_Su1;~Zhewei_Wei1", "aff": "Chinese Academy of Sciences;Renmin University of China;Renmin University of China", "aff_domain": "ucas.ac.cn;ruc.edu.cn;ruc.edu.cn", "position": "MS student;Associate Professor;Full Professor", "bibtex": "@misc{\ndu2022scformer,\ntitle={{SC}former: Segment Correlation Transformer for Long Sequence Time Series Forecasting},\nauthor={Dazhao Du and Bing Su and Zhewei Wei},\nyear={2022},\nurl={https://openreview.net/forum?id=jKzjSZYsrGP}\n}", "github": "", "project": "", "reviewers": "pENX;dkGP;5TYz;SdzH", "site": "https://openreview.net/forum?id=jKzjSZYsrGP", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "62;48;45;39", "wc_summary_review": "117;2;15;46", "wc_main_review": "158;228;270;257", "wc_review": "337;278;330;342", "wc_reply_reviewers": "0;0;40;0", "wc_reply_authors": "356;1226;1086;1064", "reply_reviewers": "0;0;1;0", "reply_authors": "3;3;6;3", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 48.5, 
8.440971508067067 ], "wc_summary_review_avg": [ 45.0, 44.536501883286704 ], "wc_main_review_avg": [ 228.25, 43.31498008772485 ], "wc_review_avg": [ 321.75, 25.616157010761782 ], "wc_reply_reviewers_avg": [ 10.0, 17.320508075688775 ], "wc_reply_authors_avg": [ 933.0, 338.87608354677377 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 1.299038105676658 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12600428361968674052&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Chinese Academy of Sciences;Renmin University of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.cas.cn;http://www.ruc.edu.cn", "aff_unique_abbr": "CAS;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "jLClifZ6YER", "title": "Fast fixed-backbone protein sequence and rotamer design", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Protein fixed-backbone sequence design is an important task in computational protein design, and being able to quickly and accurately modify or redesign side-chains is a useful subroutine in the context of functional design problems such as ligand binding site, enzyme, and binder design. We present a fast and accurate learned method for protein fixed-backbone sequence and rotamer design. We find that a graph attention model for joint rotamer and sequence prediction trained on-policy via imitation learning can produce distributions of accurate sequences for target backbones. 
We show that this method generalizes to design sequences onto novel generated backbones.", "keywords": "protein design;imitation learning;attention;graph networks", "primary_area": "", "supplementary_material": "", "author": "Namrata Anand;Tudor Achim", "authorids": "~Namrata_Anand1;~Tudor_Achim1", "gender": "F;M", "homepage": "https://nanand2.github.io/;https://cs.stanford.edu/~tachim/", "dblp": ";151/9491", "google_scholar": "TtGiXw0AAAAJ;5iM8yzsAAAAJ", "orcid": ";", "linkedin": "namrata-a-427807188/;", "or_profile": "~Namrata_Anand1;~Tudor_Achim1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nanand2022fast,\ntitle={Fast fixed-backbone protein sequence and rotamer design },\nauthor={Namrata Anand and Tudor Achim},\nyear={2022},\nurl={https://openreview.net/forum?id=jLClifZ6YER}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=jLClifZ6YER", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ypXANlcZReIJ:scholar.google.com/&scioq=Fast+fixed-backbone+protein+sequence+and+rotamer+design&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "jM62SQw28f", "title": "DeepFIB: Self-Imputation for Time Series Anomaly Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Time series (TS) anomaly detection (AD) plays an essential role in various applications, e.g., fraud detection in finance and healthcare monitoring. Due to the inherently unpredictable and highly varied nature of anomalies and the lack of anomaly labels in historical data, the AD problem is typically formulated as an unsupervised learning problem. The performance of existing solutions is often not satisfactory, especially in data-scarce scenarios. To tackle this problem, we propose a novel self-supervised learning technique for AD in time series, namely DeepFIB. We model the problem as a Fill In the Blank game by masking some elements in the TS and imputing them with the rest. Considering the two common anomaly shapes (point- or sequence-outliers) in TS data, we implement two masking strategies with many self-generated training samples. The corresponding self-imputation networks can extract more robust temporal relations than existing AD solutions and effectively facilitate identifying the two types of anomalies. For continuous outliers, we also propose an anomaly localization algorithm that dramatically reduces AD errors. 
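As a rough illustration of the fill-in-the-blank self-imputation idea in the DeepFIB abstract above, the sketch below implements the two masking strategies (scattered points for point outliers, one contiguous block for sequence outliers) and a residual-based anomaly score. The imputation network is replaced by linear interpolation purely to keep the example self-contained; all names and parameters are assumptions, not the authors' code.

```python
import numpy as np

def mask_series(x, rng, point_ratio=0.1, seq_len=0):
    """Blank out entries of a 1-D series: scattered points (for point
    outliers) or one contiguous block (for sequence outliers)."""
    masked, mask = x.copy(), np.zeros_like(x, dtype=bool)
    if seq_len > 0:                                  # sequence-style mask
        start = rng.integers(0, len(x) - seq_len)
        mask[start:start + seq_len] = True
    else:                                            # point-style mask
        idx = rng.choice(len(x), size=max(1, int(point_ratio * len(x))), replace=False)
        mask[idx] = True
    masked[mask] = 0.0
    return masked, mask

def anomaly_score(x, imputed):
    """Self-imputation residual: large reconstruction error flags anomalies."""
    return np.abs(x - imputed)

rng = np.random.default_rng(1)
x = np.sin(np.linspace(0, 6 * np.pi, 200))
masked, mask = mask_series(x, rng, point_ratio=0.15)
# a learned imputation network would recover x[mask] from the masked series;
# linear interpolation stands in for it here
imputed = x.copy()
imputed[mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), x[~mask])
print(anomaly_score(x, imputed).max())
```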
Experiments on various real-world TS datasets demonstrate that DeepFIB outperforms state-of-the-art methods by a large margin, achieving up to 65.2% relative improvement in F1-score.\n", "keywords": "Time series anomaly detection;self-supervised learning;time series imputation", "primary_area": "", "supplementary_material": "/attachment/c5c1bec0bc9c265ff0ef0dc1edaf53176560830c.zip", "author": "Minhao LIU;Zhijian Xu;Qiang Xu", "authorids": "~Minhao_LIU1;~Zhijian_Xu1;~Qiang_Xu1", "gender": "M;M;M", "homepage": "https://scholar.google.com/citations?user=MUTHUDAAAAAJ&hl=en&oi=ao;http://notfornow.com;https://github.com/cure-lab", "dblp": "79/10137;72/8350;43/1230-1", "google_scholar": "MUTHUDAAAAAJ;;https://scholar.google.com.tw/citations?user=eSiKPqUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Minhao_LIU1;~Zhijian_Xu1;~Qiang_Xu1", "aff": ";The Chinese University of Hong Kong;The Chinese University of Hong Kong", "aff_domain": ";cuhk.edu.hk;cuhk.edu.hk", "position": ";PhD student;Full Professor", "bibtex": "@misc{\nliu2022deepfib,\ntitle={Deep{FIB}: Self-Imputation for Time Series Anomaly Detection},\nauthor={Minhao LIU and Zhijian Xu and Qiang Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=jM62SQw28f}\n}", "github": "", "project": "", "reviewers": "Xetw;T1aR;YYUh;uKv7;eLpw", "site": "https://openreview.net/forum?id=jM62SQw28f", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "4;5;4;4;5", "correctness": "3;2;2;2;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;2;0;2;3", "wc_summary_paper": "127;52;27;121;51", "wc_summary_review": "19;63;34;44;72", "wc_main_review": "225;593;246;907;162", "wc_review": "371;708;307;1072;285", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 4.4, 0.4898979485566356 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.8, 0.9797958971132713 ], "wc_summary_paper_avg": [ 75.6, 40.564023469079096 ], "wc_summary_review_avg": [ 46.4, 19.189580506097574 ], "wc_main_review_avg": [ 426.6, 283.4590623000083 ], "wc_review_avg": [ 548.6, 302.9036810604982 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.16666666666666669, "corr_recommendation_correctness": 0.16666666666666663, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2540091948062291019&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "jNB6vfl_680", "title": "Global Magnitude Pruning With Minimum Threshold Is All We Need", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network pruning remains a very important yet challenging problem to solve. Many pruning solutions have been proposed over the years with high degrees of algorithmic complexity. In this work, we shed light on a very simple pruning technique that achieves state-of-the-art (SOTA) performance. 
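The jNB6vfl_680 abstract (continuing below) describes one-shot global magnitude pruning combined with a per-layer Minimum Threshold. Here is a minimal sketch of that combination, assuming a dict of weight tensors; the exact MT rule (a fixed number of surviving weights per layer) is an illustrative reading of the abstract, not the authors' code.

```python
import torch

def global_magnitude_prune(weights, sparsity=0.9, min_keep_per_layer=64):
    """One-shot global magnitude pruning with a Minimum Threshold (MT):
    prune the globally smallest-magnitude weights, but always keep at
    least `min_keep_per_layer` weights alive in every tensor."""
    all_w = torch.cat([w.abs().flatten() for w in weights.values()])
    k = int(sparsity * all_w.numel())
    cutoff = all_w.kthvalue(k).values if k > 0 else torch.tensor(0.0)
    masks = {}
    for name, w in weights.items():
        mask = w.abs() > cutoff                       # global threshold
        if mask.sum() < min_keep_per_layer:           # MT: rescue over-pruned layers
            keep = min(min_keep_per_layer, w.numel())
            topk = torch.topk(w.abs().flatten(), keep).indices
            mask = torch.zeros(w.numel(), dtype=torch.bool)
            mask[topk] = True
            mask = mask.view(w.shape)
        masks[name] = mask
    return masks

# toy usage on two random layers
sd = {"layer1": torch.randn(128, 64), "layer2": torch.randn(10, 16)}
masks = global_magnitude_prune(sd, sparsity=0.95)
print({n: int(m.sum()) for n, m in masks.items()})
```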
We showcase that magnitude-based pruning, specifically global magnitude pruning (GP), is sufficient to achieve SOTA performance on a range of neural network architectures. In certain architectures, the last few layers of a network may get over-pruned. For these cases, we introduce a straightforward method to mitigate this. We preserve a certain fixed number of weights in each layer of the network to ensure no layer is over-pruned. We call this the Minimum Threshold (MT). We find that GP, combined with MT when needed, achieves SOTA performance on all datasets and architectures tested, including ResNet-50 and MobileNet-V1 on ImageNet. Code is available on GitHub.", "keywords": "Pruning;Model Compression;One-shot;Global Magnitude Pruning", "primary_area": "", "supplementary_material": "", "author": "Manas Gupta;Vishandi Rudy Keneta;Abhishek Vaidyanathan;Ritwik Kanodia;Efe Camci;Chuan-Sheng Foo;Jie Lin", "authorids": "~Manas_Gupta1;~Vishandi_Rudy_Keneta1;~Abhishek_Vaidyanathan1;~Ritwik_Kanodia1;~Efe_Camci1;~Chuan-Sheng_Foo1;~Jie_Lin1", "gender": ";M;M;M;;M;M", "homepage": ";;;;;http://ai.stanford.edu/~csfoo;", "dblp": ";;;;;73/1823;88/6731", "google_scholar": "tHLXMFoAAAAJ;;;;stNKYrQAAAAJ;AgbeqGkAAAAJ;", "orcid": ";;;;0000-0002-5342-5163;0000-0002-4748-5792;", "linkedin": "manassingapore/;vishandirudykeneta/;abhishek-vaidyanathan-3364b2196/;ritwikkanodia/;https://sg.linkedin.com/in/efecamci;;", "or_profile": "~Manas_Gupta1;~Vishandi_Rudy_Keneta1;~Abhishek_Vaidyanathan1;~Ritwik_Kanodia1;~Efe_Camci1;~Chuan-Sheng_Foo1;~Jie_Lin1", "aff": "Institute for Infocomm Research, Agency for Science, Technology and Research (A*STAR), Singapore;I2R, A*STAR;Nanyang Technological University;Nanyang Technological University;I2R, A*STAR;Institute for Infocomm Research, A*STAR;I2R, A*STAR", "aff_domain": "i2r.a-star.edu.sg;i2r.a-star.edu.sg;ntu.edu.sg;ntu.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg", "position": "Researcher;Undergrad student;Undergrad student;Undergrad student;Researcher;Scientist;Research Scientist", "bibtex": "@misc{\ngupta2022global,\ntitle={Global Magnitude Pruning With Minimum Threshold Is All We Need},\nauthor={Manas Gupta and Vishandi Rudy Keneta and Abhishek Vaidyanathan and Ritwik Kanodia and Efe Camci and Chuan-Sheng Foo and Jie Lin},\nyear={2022},\nurl={https://openreview.net/forum?id=jNB6vfl_680}\n}", "github": "", "project": "", "reviewers": "MHY2;YX1q;5P4J", "site": "https://openreview.net/forum?id=jNB6vfl_680", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;5;2", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;0;2", "wc_summary_paper": "193;46;53", "wc_summary_review": "146;34;39", "wc_main_review": "1017;135;116", "wc_review": "1356;215;208", "wc_reply_reviewers": "2251;0;0", "wc_reply_authors": "2885;740;436", "reply_reviewers": "6;0;0", "reply_authors": "7;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 97.33333333333333, 67.70688459988557 ], "wc_summary_review_avg": [ 73.0, 51.65913923660233 ], "wc_main_review_avg": [ 422.6666666666667, 420.3287073496435 ], "wc_review_avg": [ 593.0, 539.530042413457 ], "wc_reply_reviewers_avg": [ 750.3333333333334, 1061.1315763006123 ], "wc_reply_authors_avg": [ 1353.6666666666667, 1089.9052965994595 ], "reply_reviewers_avg": [ 
2.0, 2.8284271247461903 ], "reply_authors_avg": [ 3.0, 2.8284271247461903 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GSBJWdV6sU0J:scholar.google.com/&scioq=Global+Magnitude+Pruning+With+Minimum+Threshold+Is+All+We+Need&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;1;3;1", "aff_unique_norm": "Agency for Science, Technology and Research;A*STAR;Nanyang Technological University;Institute for Infocomm Research", "aff_unique_dep": "Institute for Infocomm Research;Institute for Infocomm Research;;", "aff_unique_url": "https://www.a-star.edu.sg;https://www.a-star.edu.sg;https://www.ntu.edu.sg;https://www.i2r.a-star.edu.sg", "aff_unique_abbr": "A*STAR;A*STAR;NTU;I2R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "Singapore" }, { "id": "jNsynsmDkl", "title": "Contrastively Enforcing Distinctiveness for Multi-Label Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, as an effective way of learning latent representations, contrastive learning has become increasingly popular and successful in various domains. The success of contrastive learning in single-label classification motivates us to leverage this learning framework to enhance distinctiveness for better performance in multi-label image classification. In this paper, we show that a direct application of contrastive learning can hardly improve performance in multi-label cases. Accordingly, we propose a novel framework for multi-label classification with contrastive learning in a fully supervised setting, which learns multiple representations of an image under the context of different labels. 
This facilitates a simple yet intuitive adaptation of contrastive learning into our model to boost its performance in multi-label image classification.\nExtensive experiments on two benchmark datasets show that the proposed framework achieves state-of-the-art performance in comparison with advanced multi-label classification methods.", "keywords": "Multi-label Classification;Contrastive learning", "primary_area": "", "supplementary_material": "", "author": "Son Duy Dao;He Zhao;Dinh Phung;Jianfei Cai", "authorids": "~Son_Duy_Dao1;~He_Zhao1;~Dinh_Phung2;~Jianfei_Cai1", "gender": "M;;;M", "homepage": ";;;https://jianfei-cai.github.io/", "dblp": "147/9467;;;83/6096", "google_scholar": "https://scholar.google.com.au/citations?user=AZWFq1sAAAAJ;;;https://scholar.google.com.tw/citations?user=N6czCoUAAAAJ", "orcid": ";;;", "linkedin": "son-dao-0a9804129/;;;", "or_profile": "~Son_Duy_Dao1;~He_Zhao1;~Dinh_Phung2;~Jianfei_Cai1", "aff": "Monash University;;;Monash University", "aff_domain": "monash.edu;;;monash.edu", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\ndao2022contrastively,\ntitle={Contrastively Enforcing Distinctiveness for Multi-Label Classification},\nauthor={Son Duy Dao and He Zhao and Dinh Phung and Jianfei Cai},\nyear={2022},\nurl={https://openreview.net/forum?id=jNsynsmDkl}\n}", "github": "", "project": "", "reviewers": "rsBB;2oF2;ToSu;3XCq", "site": "https://openreview.net/forum?id=jNsynsmDkl", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;0;3", "wc_summary_paper": "143;46;70;109", "wc_summary_review": "21;47;85;87", "wc_main_review": "267;238;251;356", "wc_review": "431;331;406;552", "wc_reply_reviewers": "63;93;15;0", "wc_reply_authors": "603;284;261;750", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 92.0, 37.04726710568541 ], "wc_summary_review_avg": [ 60.0, 27.586228448267445 ], "wc_main_review_avg": [ 278.0, 46.18982571952399 ], "wc_review_avg": [ 430.0, 79.47012017104291 ], "wc_reply_reviewers_avg": [ 42.75, 37.19122880465231 ], "wc_reply_authors_avg": [ 474.5, 208.7372750612597 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nfWtZsLarr8J:scholar.google.com/&scioq=Contrastively+Enforcing+Distinctiveness+for+Multi-Label+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Monash University", "aff_unique_dep": "", "aff_unique_url": "https://www.monash.edu", "aff_unique_abbr": "Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "jPwC2MMI85Y", "title": "A molecular hypergraph convolutional network with functional group information", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a Molecular Hypergraph Convolutional Network (MolHGCN) that predicts the molecular properties of a molecule using the atom and functional group information as inputs. 
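A hedged sketch of a per-label supervised contrastive loss, in the spirit of the multi-label framework described in the jNsynsmDkl abstract above: each image gets one embedding per label context, and two images form a positive pair for label c only if both carry label c. The (batch, n_labels, dim) layout and the loss details are assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def per_label_supcon_loss(z, y, temperature=0.1):
    """Supervised contrastive loss applied per label context.

    z: (batch, n_labels, dim) label-conditioned embeddings of each image
    y: (batch, n_labels) binary multi-label matrix
    """
    b, n_labels, _ = z.shape
    eye = torch.eye(b, dtype=torch.bool, device=z.device)
    total, terms = z.new_zeros(()), 0
    for c in range(n_labels):
        zc = F.normalize(z[:, c], dim=1)              # views under label c
        logits = zc @ zc.t() / temperature
        logits = logits.masked_fill(eye, float("-inf"))  # drop self-pairs
        log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
        has_c = y[:, c].bool()
        pos = (has_c.unsqueeze(0) & has_c.unsqueeze(1)) & ~eye
        for i in range(b):
            if pos[i].any():                          # anchor i has positives for c
                total = total - log_prob[i][pos[i]].mean()
                terms += 1
    return total / max(terms, 1)

# toy usage: batch of 4 images, 3 labels, 16-dim embeddings per label
z = torch.randn(4, 3, 16, requires_grad=True)
y = torch.tensor([[1, 0, 1], [1, 1, 0], [0, 1, 1], [1, 0, 0]])
per_label_supcon_loss(z, y).backward()
```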
Molecules can contain many types of functional groups, which affect the properties of the molecules. For example, the toxicity of a molecule is associated with toxicophores, such as nitroaromatic groups and thiourea. Conventional graph-based methods that consider pair-wise interactions between nodes are inefficient at flexibly expressing complex relationships among multiple nodes in a graph, and applying multi-hop aggregation may result in oversmoothing and overfitting. Hence, we propose MolHGCN to capture the substructural differences between molecules using the atom and functional group information. MolHGCN constructs a hypergraph representation of a molecule using functional group information from the input SMILES strings, extracts a hidden representation using a two-stage message passing process (atom and functional group message passing), and predicts the properties of the molecules using the extracted hidden representation. We evaluate the performance of our model using the Tox21, ClinTox, SIDER, BBBP, BACE, ESOL, FreeSolv and Lipophilicity datasets. We show that our model is able to outperform other baseline methods on most of the datasets. We particularly show that incorporating functional group information along with atom information results in better separability in the latent space, thus increasing the accuracy of molecular property prediction.", "keywords": "Molecule;Functional group;graph;hypergraph;chemistry", "primary_area": "", "supplementary_material": "", "author": "Junyoung Park;Fangying Chen;Jinkyoo Park", "authorids": "~Junyoung_Park1;~Fangying_Chen1;~Jinkyoo_Park1", "gender": ";;M", "homepage": ";;http://silab.kaist.ac.kr/", "dblp": ";;156/7535", "google_scholar": ";;sH2a0nkAAAAJ", "orcid": ";;0000-0003-2620-1479", "linkedin": ";fangying-chen-2a640a148;", "or_profile": "~Junyoung_Park1;~Fangying_Chen1;~Jinkyoo_Park1", "aff": ";;Korea Advanced Institute of Science & Technology", "aff_domain": ";;kaist.ac.kr", "position": ";;Associate Professor", "bibtex": "@misc{\npark2022a,\ntitle={A molecular hypergraph convolutional network with functional group information},\nauthor={Junyoung Park and Fangying Chen and Jinkyoo Park},\nyear={2022},\nurl={https://openreview.net/forum?id=jPwC2MMI85Y}\n}", "github": "", "project": "", "reviewers": "b9t1;MJuF;7ayR;79qs", "site": "https://openreview.net/forum?id=jPwC2MMI85Y", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;2;5", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "59;60;74;84", "wc_summary_review": "39;20;52;17", "wc_main_review": "262;205;189;329", "wc_review": "360;285;315;430", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.25, 10.37725879025863 ], "wc_summary_review_avg": [ 32.0, 14.300349646075091 ], "wc_main_review_avg": [ 246.25, 54.94258366695181 ], "wc_review_avg": [ 347.5, 54.60082416960389 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.2581988897471611, "corr_recommendation_correctness": 0.9622504486493761, 
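A toy sketch of the two-stage (atom to functional group, then functional group to atom) hypergraph message passing described in the MolHGCN abstract above. The incidence matrix H is hand-written here, whereas the paper derives functional groups from SMILES strings; the mean-aggregation choice is also an assumption.

```python
import numpy as np

# Toy hypergraph: 5 atoms, 2 functional groups (hyperedges).
# H[i, e] = 1 if atom i belongs to functional group e.
H = np.array([[1, 0],
              [1, 0],
              [1, 1],
              [0, 1],
              [0, 1]], dtype=float)
X = np.random.default_rng(0).normal(size=(5, 8))   # atom features

# Stage 1: atoms -> functional groups (mean of member atoms).
deg_e = H.sum(axis=0, keepdims=True)               # hyperedge degrees
E = (H / deg_e).T @ X                              # (2, 8) group embeddings

# Stage 2: functional groups -> atoms (each atom averages its groups).
deg_v = H.sum(axis=1, keepdims=True)               # atom degrees
X_new = (H / np.maximum(deg_v, 1.0)) @ E           # (5, 8) updated atom features
print(X_new.shape)
```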
"gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O4BvzwdJoJkJ:scholar.google.com/&scioq=A+molecular+hypergraph+convolutional+network+with+functional+group+information&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "From Intervention to Domain Transportation: A Novel Perspective to Optimize Recommendation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6653", "id": "jT1EwXu-4hj", "poster": "", "openreview": "https://openreview.net/forum?id=jT1EwXu-4hj", "slides": "https://iclr.cc/virtual/2022/poster/6653", "video": "https://iclr.cc/virtual/2022/poster/6653", "author_site": "Da Xu, Yuting Ye, Chuanwei Ruan, evren korpeoglu, Sushant Kumar, kannan achan", "tldr": "", "abstract": "The interventional nature of recommendation has attracted increasing attention in recent years. It particularly motivates researchers to formulate learning and evaluating recommendation as causal inference and data missing-not-at-random problems. However, few take seriously the consequence of violating the critical assumption of overlapping, which we prove can significantly threaten the validity and interpretation of the outcome. We find a critical piece missing in the current understanding of information retrieval (IR) systems: as interventions, recommendation not only affects the already observed data, but it also interferes with the target domain (distribution) of interest. We then rephrase optimizing recommendation as finding an intervention that best transports the patterns it learns from the observed domain to its intervention domain. Towards this end, we use domain transportation to characterize the learning-intervention mechanism of recommendation. We design a principled transportation-constraint risk minimization objective and convert it to a two-player minimax game.\nWe prove the consistency, generalization, and excessive risk bounds for the proposed objective, and elaborate how they compare to the current results. 
Finally, we carry out extensive real-data and semi-synthetic experiments to demonstrate the advantage of our approach, and launch online testing with a real-world IR system.", "keywords": "Information retrieval;Learning theory;Causal inference;Missing data;Overlapping;Reweighting;Optimal transport", "primary_area": "", "supplementary_material": "/attachment/4b6d99ed3156754a98761a428ef733a1d68bf4ed.zip", "author": "Da Xu;Yuting Ye;Chuanwei Ruan;Evren Korpeoglu;Sushant Kumar;Kannan Achan", "authorids": "~Da_Xu2;~Yuting_Ye3;~Chuanwei_Ruan1;~Evren_Korpeoglu1;~Sushant_Kumar1;~Kannan_Achan1", "gender": "M;M;M;M;M;", "homepage": ";https://elric2718.github.io/;https://www.linkedin.com/in/chuanwei-r-507832126/;;;http://www.cs.toronto.edu/~kannan/", "dblp": ";;237/9763;;166/1095.html;39/463", "google_scholar": "-jl6A84AAAAJ;;T3EUphwAAAAJ;BFRRLJUAAAAJ;https://scholar.google.com/citations?hl=en;qRXIUuMAAAAJ", "orcid": ";;0009-0004-2307-0391;;;", "linkedin": ";;chuanwei-r-507832126/;;kumarsushant/;kannanachan", "or_profile": "~Da_Xu2;~Yuting_Ye3;~Chuanwei_Ruan1;~Evren_Korpeoglu1;~Sushant_Kumar1;~Kannan_Achan1", "aff": "Walmart Labs;;Instacart;Walmart;;", "aff_domain": "walmartlabs.com;;instacart.com;walmart.com;;", "position": "Researcher;;Researcher;Senior Director of Data Science;;", "bibtex": "@inproceedings{\nxu2022from,\ntitle={From Intervention to Domain Transportation: A Novel Perspective to Optimize Recommendation},\nauthor={Da Xu and Yuting Ye and Chuanwei Ruan and Evren Korpeoglu and Sushant Kumar and Kannan Achan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=jT1EwXu-4hj}\n}", "github": "", "project": "", "reviewers": "n8vS;j5tz;ozid;Lm9j", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;3;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "111;156;66;146", "wc_summary_review": "36;174;26;27", "wc_main_review": "584;204;206;294", "wc_review": "731;534;298;467", "wc_reply_reviewers": "135;0;0;0", "wc_reply_authors": "1058;426;762;636", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 119.75, 35.24468044967921 ], "wc_summary_review_avg": [ 65.75, 62.61938597591005 ], "wc_main_review_avg": [ 322.0, 155.56991997169632 ], "wc_review_avg": [ 507.5, 155.06853323611466 ], "wc_reply_reviewers_avg": [ 33.75, 58.45671475544961 ], "wc_reply_authors_avg": [ 720.5, 228.8553036309187 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11236130005599547802&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=jT1EwXu-4hj", "email": "walmartlabs.com;;instacart.com;walmart.com;;", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Walmart;Instacart;Walmart Inc.", "aff_unique_dep": "Walmart Labs;;", "aff_unique_url": "https://www.walmart.com;https://www.instacart.com;https://www.walmart.com", "aff_unique_abbr": "Walmart Labs;Instacart;Walmart", "aff_campus_unique_index": "", 
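The jT1EwXu-4hj abstract above casts optimizing recommendation as transportation-constrained risk minimization solved as a two-player minimax game. The skeleton below shows only that generic minimax structure (adversarial per-example reweighting, with a quadratic penalty standing in for the transportation constraint); it is an illustrative sketch, not the paper's objective or code.

```python
import torch

torch.manual_seed(0)
X, y = torch.randn(256, 10), torch.randint(0, 2, (256,)).float()
model = torch.nn.Linear(10, 1)
w_logits = torch.zeros(256, requires_grad=True)        # adversary's parameters
opt_model = torch.optim.SGD(model.parameters(), lr=0.1)
opt_w = torch.optim.SGD([w_logits], lr=0.1)
bce = torch.nn.BCEWithLogitsLoss(reduction="none")

for step in range(200):
    # adversary: raise the reweighted risk, penalized for drifting
    # away from uniform weights (crude stand-in for a transport ball)
    weights = torch.softmax(w_logits, dim=0) * len(y)
    adv_obj = (weights * bce(model(X).squeeze(1), y).detach()).mean() \
              - ((weights - 1.0) ** 2).mean()
    opt_w.zero_grad()
    (-adv_obj).backward()
    opt_w.step()

    # model: lower the reweighted risk under the adversary's weights
    weights = (torch.softmax(w_logits, dim=0) * len(y)).detach()
    risk = (weights * bce(model(X).squeeze(1), y)).mean()
    opt_model.zero_grad()
    risk.backward()
    opt_model.step()
```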
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "jT5vnpqlrSN", "title": "GIR Framework: Learning Graph Positional Embeddings with Anchor Indication and Path Encoding", "track": "main", "status": "Reject", "tldr": "", "abstract": "The majority of existing graph neural networks (GNNs) following the message passing neural network (MPNN) pattern have limited power in capturing position information for a given node. To solve such problems, recent works exploit positioning nodes with selected anchors, mostly in a process that first explicitly assign distances information and then perform message passing encoding. However, this two-stage strategy may ignore potentially useful interaction between intermediate results of the distance computing and encoding stages. In this work, we propose a novel framework which follows the anchor-based idea and aims at conveying distance information implicitly along the MPNN message passing steps for encoding position information, node attributes, and graph structure in a more flexible way. Specifically, we first leverage a simple anchor indication strategy to enable the position-aware ability for well-designed MPNNs. Then, following this strategy, we propose the Graph Inference Representation (GIR) model, which acts as a generalization of MPNNs with a more specific propagation path design for position-aware scenarios. \u00a0Meanwhile, we theoretically and empirically explore the ability of the proposed framework to get position-aware embeddings, and experimental results show that our proposed method generally outperforms previous position-aware GNN methods.", "keywords": "Graph neural networks;Anchor based GNN;Node representation learning", "primary_area": "", "supplementary_material": "/attachment/3cc0be80a946870a5e2028148e17c33e876a157e.zip", "author": "Yuheng Lu;Jinpeng Chen;Chuxiong Sun;Jie Hu", "authorids": "~Yuheng_Lu2;~Jinpeng_Chen1;~Chuxiong_Sun1;~Jie_Hu5", "gender": ";M;M;F", "homepage": "https://github.com/sutakori;https://teacher.bupt.edu.cn/chenjinpeng/zh_CN/index/73905/list/index.htm;;", "dblp": ";91/10208-1;;", "google_scholar": ";https://scholar.google.com/citations?hl=en;;", "orcid": ";0000-0003-4157-5110;0000-0001-6414-1599;", "linkedin": ";;;%E5%A9%95-%E8%83%A1-53a3a1108/", "or_profile": "~Yuheng_Lu2;~Jinpeng_Chen1;~Chuxiong_Sun1;~Jie_Hu5", "aff": "Beijing University of Posts and Telecommunications;Beijing University of Post and Telecommunication;China Telecom Research Institute Emerging Technology Research Division;", "aff_domain": "bupt.edu.cn;bupt.edu.cn;chinatelecom.cn;", "position": "MS student;Associate Professor;Researcher;", "bibtex": "@misc{\nlu2022gir,\ntitle={{GIR} Framework: Learning Graph Positional Embeddings with Anchor Indication and Path Encoding},\nauthor={Yuheng Lu and Jinpeng Chen and Chuxiong Sun and Jie Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=jT5vnpqlrSN}\n}", "github": "", "project": "", "reviewers": "YkUE;C6uM;jwSq;Z48B", "site": "https://openreview.net/forum?id=jT5vnpqlrSN", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "56;54;69;91", "wc_summary_review": "18;54;59;13", "wc_main_review": "249;432;241;123", "wc_review": "323;540;369;227", "wc_reply_reviewers": "0;254;0;0", "wc_reply_authors": "885;792;666;200", "reply_reviewers": "0;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.25, 
0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.5, 14.739402972983676 ], "wc_summary_review_avg": [ 36.0, 20.65187642806338 ], "wc_main_review_avg": [ 261.25, 110.48614166491652 ], "wc_review_avg": [ 364.75, 113.41158450528764 ], "wc_reply_reviewers_avg": [ 63.5, 109.9852262806237 ], "wc_reply_authors_avg": [ 635.75, 263.311958520687 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JKxaHK28UFgJ:scholar.google.com/&scioq=GIR+Framework:+Learning+Graph+Positional+Embeddings+with+Anchor+Indication+and+Path+Encoding&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Beijing University of Posts and Telecommunications;China Telecom Research Institute", "aff_unique_dep": ";Emerging Technology Research Division", "aff_unique_url": "http://www.bupt.edu.cn/;http://www.chinatelecom.com.cn", "aff_unique_abbr": "BUPT;CTRI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "jT9EDW9_PWF", "title": "GCN-SL: Graph Convolutional Network with Structure Learning for Disassortative Graphs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In representation learning on graph-structured data, many popular GNNs may fail to capture long-range dependencies, which degrades their performance. Furthermore, this weakness is magnified when the graph in question is disassortative. To solve this issue, we propose a graph convolutional network with structure learning (GCN-SL), which can be applied to node classification. The proposed GCN-SL contains two improvements, corresponding to edges and node features, respectively. Since the original adjacency matrix may provide misleading information for the aggregation process in GNNs, especially in disassortative graphs, we build a re-connected adjacency matrix via structure learning from the perspective of edges. The structure learning module aims to learn an optimized graph structure and corresponding feature representations. Specifically, the re-connected adjacency matrix is built by using a special data preprocessing technique and similarity learning, and can be optimized directly along with the GCN-SL parameters. Through structure learning, GCN-SL can search for reliable neighboring nodes across the entire graph for aggregation. From the perspective of node features, we propose an efficient spectral clustering (ESC) algorithm and an ESC with anchors (ESC-ANCH) algorithm. The two algorithms can efficiently aggregate feature representations from similar nodes, no matter how far away these similar nodes are from the target node. Both improvements help GCN-SL capture long-range dependencies, making it capable of performing representation learning on both disassortative and assortative graphs. 
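A simplified stand-in for the re-connected adjacency idea in the GCN-SL abstract above: build an adjacency from feature similarity (cosine kNN here, in place of the learned similarity module), normalize it GCN-style, and aggregate. All names and parameters below are illustrative assumptions, not the authors' implementation.

```python
import numpy as np

def reconnect_adjacency(X, k=5):
    """Build a re-connected adjacency from feature similarity, so that
    aggregation can reach similar nodes anywhere in the graph."""
    Z = X / np.linalg.norm(X, axis=1, keepdims=True)
    S = Z @ Z.T                                       # cosine similarities
    np.fill_diagonal(S, -np.inf)                      # exclude self-similarity
    A = np.zeros_like(S)
    nn = np.argsort(-S, axis=1)[:, :k]                # top-k most similar nodes
    rows = np.repeat(np.arange(len(X)), k)
    A[rows, nn.ravel()] = 1.0
    A = np.maximum(A, A.T)                            # symmetrize
    A += np.eye(len(X))                               # add self-loops
    d = A.sum(axis=1)
    return A / np.sqrt(np.outer(d, d))                # symmetric normalization

X = np.random.default_rng(0).normal(size=(20, 16))   # toy node features
A_hat = reconnect_adjacency(X)
H = A_hat @ X                                         # one GCN-style aggregation step
print(H.shape)
```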
Experimental results on a wide range of benchmark datasets illustrate that the proposed GCN-SL outperforms state-of-the-art GNN counterparts.", "keywords": "Graph neural networks;disassortative graphs;representation learning;structure learning.", "primary_area": "", "supplementary_material": "", "author": "Mengying Jiang;Guizhong Liu;Yuanchao Su;Xinliang Wu", "authorids": "~Mengying_Jiang2;~Guizhong_Liu1;~Yuanchao_Su1;~Xinliang_Wu1", "gender": "F;M;M;M", "homepage": ";https://gr.xjtu.edu.cn/web/liugz;;", "dblp": "206/6312.html;;;00/4153.html", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;yuanchao-su-15277a116/;", "or_profile": "~Mengying_Jiang2;~Guizhong_Liu1;~Yuanchao_Su1;~Xinliang_Wu1", "aff": "Xi'an Jiaotong University;Xi'an Jiaotong University;;", "aff_domain": "xjtu.edu;xjtu.edu.cn;;", "position": "PhD student;Full Professor;;", "bibtex": "@misc{\njiang2022gcnsl,\ntitle={{GCN}-{SL}: Graph Convolutional Network with Structure Learning for Disassortative Graphs},\nauthor={Mengying Jiang and Guizhong Liu and Yuanchao Su and Xinliang Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=jT9EDW9_PWF}\n}", "github": "", "project": "", "reviewers": "VPGF;CQ7q;KG9A", "site": "https://openreview.net/forum?id=jT9EDW9_PWF", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;4;5", "correctness": "3;3;2", "technical_novelty": "2;2;1", "empirical_novelty": "2;3;2", "wc_summary_paper": "82;76;58", "wc_summary_review": "44;30;28", "wc_main_review": "516;244;521", "wc_review": "642;350;607", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 72.0, 10.198039027185569 ], "wc_summary_review_avg": [ 34.0, 7.118052168020874 ], "wc_main_review_avg": [ 427.0, 129.4166398368721 ], "wc_review_avg": [ 533.0, 130.18704492639299 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6YeJ85V7G1wJ:scholar.google.com/&scioq=GCN-SL:+Graph+Convolutional+Network+with+Structure+Learning+for+Disassortative+Graphs&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Xi'an Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "jWaLuyg6OEw", "title": "First-Order Optimization Inspired from Finite-Time Convergent Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we investigate the performance of two first-order optimization algorithms, obtained from forward Euler discretization of finite-time optimization flows. These flows are the rescaled-gradient flow (RGF) and the signed-gradient flow (SGF), and consist of non-Lipschitz or discontinuous dynamical systems that converge locally in finite time to the minima of gradient-dominated functions. 
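For concreteness, forward Euler steps of signed- and rescaled-gradient flows can look like the sketch below. The specific RGF normalization exponent is a commonly used form of the rescaled-gradient flow and an assumption on our part, not a transcription of the paper.

```python
import numpy as np

def sgf_step(x, grad, lr=1e-2):
    """Forward Euler step of the signed-gradient flow: dx/dt = -sign(grad f(x))."""
    return x - lr * np.sign(grad)

def rgf_step(x, grad, lr=1e-2, p=4, eps=1e-12):
    """Forward Euler step of a rescaled-gradient flow; one common form
    normalizes the gradient by a power of its norm."""
    g_norm = np.linalg.norm(grad) + eps
    return x - lr * grad / g_norm ** ((p - 2) / (p - 1))

# toy quadratic f(x) = 0.5 * ||x||^2, so grad f(x) = x
x = np.array([3.0, -2.0])
for _ in range(100):
    x = rgf_step(x, x)
print(x)  # approaches the minimizer at the origin
```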
We propose an Euler discretization for these first-order finite-time flows, and provide convergence guarantees in both the deterministic and stochastic settings. We then apply the proposed algorithms to academic examples, as well as deep neural network training, where we empirically test their performance on the SVHN dataset. Our results show that our schemes converge faster than standard optimization alternatives.", "keywords": "Dynamical systems;continuous-time optimization flows;first-order optimization;DNN applications", "primary_area": "", "supplementary_material": "/attachment/33875793fa3ab517206a844d901169c3d1d2c1f9.zip", "author": "Siqi Zhang;Mouhacine Benosman;Orlando Romero;Anoop Cherian", "authorids": "~Siqi_Zhang2;~Mouhacine_Benosman1;~Orlando_Romero1;~Anoop_Cherian1", "gender": "M;M;M;M", "homepage": "https://siqi-z.github.io/;;https://www.grasp.upenn.edu/people/orlando-romero/;http://users.cecs.anu.edu.au/~cherian/", "dblp": ";;;44/7734", "google_scholar": "0M171lEAAAAJ;cs7AJxcAAAAJ;UfQg91QAAAAJ;https://scholar.google.com.au/citations?hl=en", "orcid": ";;;0000-0002-5566-0351", "linkedin": ";;;anoop-cherian-4678a04/", "or_profile": "~Siqi_Zhang2;~Mouhacine_Benosman1;~Orlando_Romero1;~Anoop_Cherian2", "aff": "University of Illinois, Urbana Champaign;Mitsubishi Electric Research Labs;University of Pennsylvania;Mitsubishi Electric Research Labs", "aff_domain": "illinois.edu;merl.com;seas.upenn.edu;merl.com", "position": "PhD student;Researcher;PhD student;Researcher", "bibtex": "@misc{\nzhang2022firstorder,\ntitle={First-Order Optimization Inspired from Finite-Time Convergent Flows},\nauthor={Siqi Zhang and Mouhacine Benosman and Orlando Romero and Anoop Cherian},\nyear={2022},\nurl={https://openreview.net/forum?id=jWaLuyg6OEw}\n}", "github": "", "project": "", "reviewers": "xRJ7;rpnN;5qiS;AvzH", "site": "https://openreview.net/forum?id=jWaLuyg6OEw", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "37;111;46;46", "wc_summary_review": "39;34;20;48", "wc_main_review": "137;522;239;265", "wc_review": "213;667;305;359", "wc_reply_reviewers": "0;703;0;0", "wc_reply_authors": "968;2789;819;957", "reply_reviewers": "0;2;0;0", "reply_authors": "3;6;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 60.0, 29.67322024991558 ], "wc_summary_review_avg": [ 35.25, 10.133484099755622 ], "wc_main_review_avg": [ 290.75, 141.82449541598942 ], "wc_review_avg": [ 386.0, 170.42593699317015 ], "wc_reply_reviewers_avg": [ 175.75, 304.4079294302302 ], "wc_reply_authors_avg": [ 1383.25, 813.7310289647311 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.25, 1.6393596310755 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18306250994030501623&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Mitsubishi Electric Research Laboratories;University of Pennsylvania", "aff_unique_dep": ";;", "aff_unique_url": 
"https://illinois.edu;https://www.merl.com;https://www.upenn.edu", "aff_unique_abbr": "UIUC;MERL;UPenn", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "jWxuLQE31IL", "title": "Efficient Winning Tickets Drawing over Fine-Grained Structured Sparsity", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The fine-grained structured sparsity has been proposed as a middle-ground between unstructured sparsity, where weights are pruned independently, and coarse-grained structured sparsity, where entire blocks of weights are pruned. Specifically, N:M fine-grained structured sparsity allows for at most N nonzero weights across a group of M consecutive weights. A recent implementation of 2:4 sparsity (N=2 and M=4) in Sparse Tensor Cores of Nvidia A100 GPUs shows significant improvement in throughput compared to unstructured sparsity while maintaining similar performance (e.g., accuracy). However, despite its potential for superior computational performance, how to efficiently train DNNs with N:M fine-grained structured sparsity remains a challenging problem. In this work, we leverage the recent advance of~\\textit{Lottery Ticket Hypothesis} (LTH) and propose an iterative pruning algorithm for N:M fine-grained structured sparsity. By leveraging the N:M sparsity constraint, we can identify the unimportant weights across each group of M weights at earlier stages of iterative pruning, which significantly lowers the cost of iterative training compared to conventional unstructured pruning.", "keywords": "Pruning;Lottery Ticket Hypothesis", "primary_area": "", "supplementary_material": "/attachment/75325bce6020c5b7b408841f580591dd75bbe641.zip", "author": "Sai Qian Zhang;Bradley McDanel", "authorids": "~Sai_Qian_Zhang1;bmcdanel@fandm.edu", "gender": ";", "homepage": "https://saiqianzhang.com/;", "dblp": "164/7945;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Sai_Qian_Zhang1;bmcdanel@fandm.edu", "aff": "Meta Facebook;", "aff_domain": "facebook.com;", "position": "Researcher;", "bibtex": "@misc{\nzhang2022efficient,\ntitle={Efficient Winning Tickets Drawing over Fine-Grained Structured Sparsity},\nauthor={Sai Qian Zhang and Bradley McDanel},\nyear={2022},\nurl={https://openreview.net/forum?id=jWxuLQE31IL}\n}", "github": "", "project": "", "reviewers": "v2Xo;UFGe;yDB7", "site": "https://openreview.net/forum?id=jWxuLQE31IL", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;1;2", "empirical_novelty": "2;1;3", "wc_summary_paper": "25;82;72", "wc_summary_review": "64;26;17", "wc_main_review": "169;293;101", "wc_review": "258;401;190", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 59.666666666666664, 24.850665092821068 ], "wc_summary_review_avg": [ 35.666666666666664, 20.368821489936252 ], "wc_main_review_avg": [ 187.66666666666666, 79.4872456575407 ], "wc_review_avg": [ 283.0, 87.93558248323978 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], 
"authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:erR1RA3gvkgJ:scholar.google.com/&scioq=Efficient+Winning+Tickets+Drawing+over+Fine-Grained+Structured+Sparsity&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Byzantine-Robust Learning on Heterogeneous Datasets via Bucketing", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6400", "id": "jXKKDEi5vJt", "poster": "", "openreview": "https://openreview.net/forum?id=jXKKDEi5vJt", "slides": "https://iclr.cc/virtual/2022/poster/6400", "video": "https://iclr.cc/virtual/2022/poster/6400", "author_site": "Sai Karimireddy, Lie He, Martin Jaggi", "tldr": "", "abstract": "In Byzantine robust distributed or federated learning, a central server wants to train a machine learning model over data distributed across multiple workers. However, a fraction of these workers may deviate from the prescribed algorithm and send arbitrary messages. While this problem has received significant attention recently, most current defenses assume that the workers have identical data. For realistic cases when the data across workers are heterogeneous (non-iid), we design new attacks which circumvent current defenses, leading to significant loss of performance. We then propose a simple bucketing scheme that adapts existing robust algorithms to heterogeneous datasets at a negligible computational cost. We also theoretically and experimentally validate our approach, showing that combining bucketing with existing robust algorithms is effective against challenging attacks. 
Our work is the first to establish guaranteed convergence for the non-iid Byzantine robust problem under realistic assumptions.\n", "keywords": "Federated learning;Distributed learning;Byzantine robust optimization;Heterogeneity (Non-IID)", "primary_area": "", "supplementary_material": "/attachment/cc13add474cedff602c5984e31e80b1203484e96.zip", "author": "Sai Praneeth Karimireddy;Lie He;Martin Jaggi", "authorids": "~Sai_Praneeth_Karimireddy1;~Lie_He1;~Martin_Jaggi1", "gender": "M;M;M", "homepage": "https://spkreddy.org;https://liehe.github.io/;https://mlo.epfl.ch", "dblp": "217/3342;225/5245;17/4402", "google_scholar": "wKJeOQoAAAAJ;rIAYxaMAAAAJ;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ", "orcid": ";;0000-0003-1579-5558", "linkedin": ";;", "or_profile": "~Sai_Praneeth_Karimireddy1;~Lie_He1;~Martin_Jaggi1", "aff": "University of California, Berkeley;EPFL - EPF Lausanne;EPFL", "aff_domain": "berkeley.edu;epfl.ch;epfl.ch", "position": "Postdoc;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkarimireddy2022byzantinerobust,\ntitle={Byzantine-Robust Learning on Heterogeneous Datasets via Bucketing},\nauthor={Sai Praneeth Karimireddy and Lie He and Martin Jaggi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=jXKKDEi5vJt}\n}", "github": "", "project": "", "reviewers": "oF9Y;EczS;WV2X;ago1", "pdf_size": 0, "recommendation": "6;8;8;10", "confidence": "4;4;3;5", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "91;37;457;191", "wc_summary_review": "13;50;104;46", "wc_main_review": "243;171;283;1795", "wc_review": "347;258;844;2032", "wc_reply_reviewers": "881;231;79;107", "wc_reply_authors": "1809;196;433;866", "reply_reviewers": "5;2;1;1", "reply_authors": "6;2;1;2", "recommendation_avg": [ 8.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 194.0, 161.58279611394278 ], "wc_summary_review_avg": [ 53.25, 32.62954949122038 ], "wc_main_review_avg": [ 623.0, 677.84363978723 ], "wc_review_avg": [ 870.25, 706.9287004359068 ], "wc_reply_reviewers_avg": [ 324.5, 326.3475907678805 ], "wc_reply_authors_avg": [ 826.0, 616.2868650230995 ], "reply_reviewers_avg": [ 2.25, 1.6393596310755 ], "reply_authors_avg": [ 2.75, 1.920286436967152 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 175, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10653774941778356470&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=jXKKDEi5vJt", "email": "berkeley.edu;epfl.ch;epfl.ch", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, Berkeley;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.epfl.ch", "aff_unique_abbr": "UC Berkeley;EPFL", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Berkeley;Lausanne;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Switzerland" }, { "id": "jZQOWas0Lo3", "title": "Cycle monotonicity of adversarial attacks for optimal domain adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We reveal an intriguing connection between adversarial attacks and cycle monotone maps, also known as optimal 
transport maps. Based on this finding, we develop a novel method named source fiction for semi-supervised optimal transport-based domain adaptation. In our algorithm, instead of mapping from the target to the source domain, optimal transport maps target samples to the set of adversarial examples. The trick is that these adversarial examples are labeled target samples perturbed to look like source samples for the source domain classifier. Due to the cycle monotonicity of adversarial attacks, optimal transport can naturally approximate this transformation. We conduct experiments on various datasets and show that our method can notably improve the performance of optimal transport methods in semi-supervised domain adaptation.", "keywords": "Optimal Transport;Domain Adaptation;Adversarial Attacks", "primary_area": "", "supplementary_material": "/attachment/3a7ca46ea4b5bc960e8c6c130858fdd28ea94d0d.zip", "author": "Arip Asadulaev;Vitaly Shutov;Alexander Korotin;Alexander Panfilov;Andrey Filchenkov", "authorids": "~Arip_Asadulaev1;vitaly.shutov1@gmail.com;~Alexander_Korotin2;alexander.panfilov@student.uni-tuebingen.de;~Andrey_Filchenkov2", "gender": "M;;;;M", "homepage": ";;;;https://research.itmo.ru/en/person/188779/andrey_aleksandrovich_filchenkov.htm", "dblp": "243/2822;;;;169/0448.html", "google_scholar": "wcdrgdYAAAAJ;;;;ry63T9QAAAAJ", "orcid": ";;;;0000-0002-1133-8432", "linkedin": ";;;;", "or_profile": "~Arip_Asadulaev1;vitaly.shutov1@gmail.com;~Alexander_Korotin2;alexander.panfilov@student.uni-tuebingen.de;~Andrey_Filchenkov2", "aff": "ITMO University;;;;ITMO University", "aff_domain": "itmo.ru;;;;itmo.ru", "position": "PhD student;;;;Associate Professor", "bibtex": "@misc{\nasadulaev2022cycle,\ntitle={Cycle monotonicity of adversarial attacks for optimal domain adaptation},\nauthor={Arip Asadulaev and Vitaly Shutov and Alexander Korotin and Alexander Panfilov and Andrey Filchenkov},\nyear={2022},\nurl={https://openreview.net/forum?id=jZQOWas0Lo3}\n}", "github": "", "project": "", "reviewers": "Kkzh;kk9C;Ux4r;bUjz", "site": "https://openreview.net/forum?id=jZQOWas0Lo3", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "60;33;147;147", "wc_summary_review": "23;58;35;85", "wc_main_review": "554;495;520;648", "wc_review": "637;586;702;880", "wc_reply_reviewers": "84;61;50;135", "wc_reply_authors": "310;205;341;327", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.75, 51.14868033488254 ], "wc_summary_review_avg": [ 50.25, 23.678840765544244 ], "wc_main_review_avg": [ 554.25, 58.03608791088524 ], "wc_review_avg": [ 701.25, 111.08864703469928 ], "wc_reply_reviewers_avg": [ 82.5, 32.69938837348491 ], "wc_reply_authors_avg": [ 295.75, 53.53211652830476 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Me_dAlXoFBIJ:scholar.google.com/&scioq=Cycle+monotonicity+of+adversarial+attacks+for+optimal+domain+adaptation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0",
"aff_unique_norm": "ITMO University", "aff_unique_dep": "", "aff_unique_url": "https://www.itmo.ru", "aff_unique_abbr": "ITMO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Russian Federation" }, { "title": "Visual Correspondence Hallucination", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6318", "id": "jaLDP8Hp_gc", "poster": "", "openreview": "https://openreview.net/forum?id=jaLDP8Hp_gc", "slides": "https://iclr.cc/virtual/2022/poster/6318", "video": "https://iclr.cc/virtual/2022/poster/6318", "author_site": "Hugo Germain, Vincent Lepetit, Guillaume Bourmaud", "tldr": "", "abstract": "Given a pair of partially overlapping source and target images and a keypoint in the source image, the keypoint's correspondent in the target image can be either visible, occluded or outside the field of view. Local feature matching methods are only able to identify the correspondent's location when it is visible, while humans can also hallucinate its location when it is occluded or outside the field of view through geometric reasoning. In this paper, we bridge this gap by training a network to output a peaked probability distribution over the correspondent's location, regardless of this correspondent being visible, occluded, or outside the field of view. We experimentally demonstrate that this network is indeed able to hallucinate correspondences on pairs of images captured in scenes that were not seen at training-time. We also apply this network to an absolute camera pose estimation problem and find it is significantly more robust than state-of-the-art local feature matching-based competitors.", "keywords": "visual correspondence hallucination;camera pose estimation", "primary_area": "", "supplementary_material": "/attachment/35d511b626b0b65a11b283affbbeaae3edaa0ef7.zip", "author": "Hugo Germain;Vincent Lepetit;Guillaume Bourmaud", "authorids": "~Hugo_Germain1;~Vincent_Lepetit1;~Guillaume_Bourmaud1", "gender": "M;M;", "homepage": "https://www.hugogermain.com/;https://vincentlepetit.github.io;", "dblp": ";80/5556;127/7587", "google_scholar": "0wU5I50AAAAJ;h0a5q3QAAAAJ;d4v2IYMAAAAJ", "orcid": ";0000-0001-9985-4433;", "linkedin": ";vincent-lepetit-58a18bb/;", "or_profile": "~Hugo_Germain1;~Vincent_Lepetit1;~Guillaume_Bourmaud1", "aff": ";TU Graz;University of Bordeaux", "aff_domain": ";tugraz.at;u-bordeaux.fr", "position": ";Full Professor;Associate Professor", "bibtex": "@inproceedings{\ngermain2022visual,\ntitle={Visual Correspondence Hallucination},\nauthor={Hugo Germain and Vincent Lepetit and Guillaume Bourmaud},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=jaLDP8Hp_gc}\n}", "github": "", "project": "", "reviewers": "nobf;BNXs;uCMw", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "4;3;4", "correctness": "3;3;4", "technical_novelty": "2;3;4", "empirical_novelty": "2;3;3", "wc_summary_paper": "82;86;118", "wc_summary_review": "44;52;33", "wc_main_review": "231;230;623", "wc_review": "357;368;774", "wc_reply_reviewers": "293;54;135", "wc_reply_authors": "1261;574;640", "reply_reviewers": "1;1;2", "reply_authors": "2;1;2", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], 
"wc_summary_paper_avg": [ 95.33333333333333, 16.110727964792762 ], "wc_summary_review_avg": [ 43.0, 7.788880963698615 ], "wc_main_review_avg": [ 361.3333333333333, 185.0267247964166 ], "wc_review_avg": [ 499.6666666666667, 194.03493385356037 ], "wc_reply_reviewers_avg": [ 160.66666666666666, 99.24492710237413 ], "wc_reply_authors_avg": [ 825.0, 309.4737468671616 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4848428848671309172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=jaLDP8Hp_gc", "email": ";tugraz.at;u-bordeaux.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Graz University of Technology;University of Bordeaux", "aff_unique_dep": ";", "aff_unique_url": "https://www.tugraz.at;https://www.u-bordeaux.fr", "aff_unique_abbr": "TU Graz;UBordeaux", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Austria;France" }, { "title": "Constraining Linear-chain CRFs to Regular Languages", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6325", "id": "jbrgwbv8nD", "poster": "", "openreview": "https://openreview.net/forum?id=jbrgwbv8nD", "slides": "https://iclr.cc/virtual/2022/poster/6325", "video": "https://iclr.cc/virtual/2022/poster/6325", "author_site": "Sean Papay, Roman Klinger, Sebastian Pado", "tldr": "", "abstract": "A major challenge in structured prediction is to represent the interdependencies within output structures. When outputs are structured as sequences, linear-chain conditional random fields (CRFs) are a widely used model class which can learn local dependencies in the output. However, the CRF's Markov assumption makes it impossible for CRFs to represent distributions with nonlocal dependencies, and standard CRFs are unable to respect nonlocal constraints of the data (such as global arity constraints on output labels). We present a generalization of CRFs that can enforce a broad class of constraints, including nonlocal ones, by specifying the space of possible output structures as a regular language $\\mathcal{L}$. The resulting regular-constrained CRF (RegCCRF) has the same formal properties as a standard CRF, but assigns zero probability to all label sequences not in $\\mathcal{L}$. Notably, RegCCRFs can incorporate their constraints during training, while related models only enforce constraints during decoding. We prove that constrained training is never worse than constrained decoding, and show empirically that it can be substantially better in practice. 
Additionally, we demonstrate a practical benefit on downstream tasks by incorporating a RegCCRF into a deep neural model for semantic role labeling, exceeding state-of-the-art results on a standard dataset.", "keywords": "constrained training;probabilistic graphical models;CRF;semantic role labeling;sequence labeling", "primary_area": "", "supplementary_material": "/attachment/700225ac0d206eb1e771f378fca207d80a70e864.zip", "author": "Sean Papay;Roman Klinger;Sebastian Pado", "authorids": "~Sean_Papay1;~Roman_Klinger1;~Sebastian_Pado1", "gender": ";M;M", "homepage": "https://www.ims.uni-stuttgart.de/institut/team/Papay/;https://www.romanklinger.de;https://nlpado.de/~sebastian", "dblp": "229/3086;21/4183;p/SebastianPado", "google_scholar": ";https://scholar.google.de/citations?user=1flvefwAAAAJ;vKqag_AAAAAJ", "orcid": ";0000-0002-2014-6619;", "linkedin": ";romanklinger/;", "or_profile": "~Sean_Papay1;~Roman_Klinger1;~Sebastian_Pado1", "aff": "University of Stuttgart;University of Stuttgart;University of Stuttgart, Universit\u00e4t Stuttgart", "aff_domain": "ims.uni-stuttgart.de;uni-stuttgart.de;ims.uni-stuttgart.de", "position": "Postdoc;Associate Professor;Professor", "bibtex": "@inproceedings{\npapay2022constraining,\ntitle={Constraining Linear-chain {CRF}s to Regular Languages},\nauthor={Sean Papay and Roman Klinger and Sebastian Pado},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=jbrgwbv8nD}\n}", "github": "", "project": "", "reviewers": "qQNs;82Jb;1EHx;ephA", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "140;89;316;139", "wc_summary_review": "29;35;48;63", "wc_main_review": "378;314;228;489", "wc_review": "547;438;592;691", "wc_reply_reviewers": "0;0;0;26", "wc_reply_authors": "1006;427;140;347", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 171.0, 86.21774759294051 ], "wc_summary_review_avg": [ 43.75, 13.06474263045392 ], "wc_main_review_avg": [ 352.25, 95.21652955238392 ], "wc_review_avg": [ 567.0, 90.88729284118875 ], "wc_reply_reviewers_avg": [ 6.5, 11.258330249197702 ], "wc_reply_authors_avg": [ 480.0, 321.23745111677124 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7608859102526822, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14062623964127434433&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=jbrgwbv8nD", "email": "ims.uni-stuttgart.de;uni-stuttgart.de;ims.uni-stuttgart.de", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Stuttgart", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-stuttgart.de", "aff_unique_abbr": "USTuttgart", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Skill-based Meta-Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6702", "id": "jeLW-Fh9bV", 
"poster": "", "openreview": "https://openreview.net/forum?id=jeLW-Fh9bV", "slides": "https://iclr.cc/virtual/2022/poster/6702", "video": "https://iclr.cc/virtual/2022/poster/6702", "author_site": "Taewook Nam, Shao-Hua Sun, Karl Pertsch, Sung Ju Hwang, Joseph Lim", "tldr": "", "abstract": "While deep reinforcement learning methods have shown impressive results in robot learning, their sample inefficiency makes the learning of complex, long-horizon behaviors with real robot systems infeasible. To mitigate this issue, meta-reinforcement learning methods aim to enable fast learning on novel tasks by learning how to learn. Yet, the application has been limited to short-horizon tasks with dense rewards. To enable learning long-horizon behaviors, recent works have explored leveraging prior experience in the form of offline datasets without reward or task annotations. While these approaches yield improved sample efficiency, millions of interactions with environments are still required to solve complex tasks. In this work, we devise a method that enables meta-learning on long-horizon, sparse-reward tasks, allowing us to solve unseen target tasks with orders of magnitude fewer environment interactions. Our core idea is to leverage prior experience extracted from offline datasets during meta-learning. Specifically, we propose to (1) extract reusable skills and a skill prior from offline datasets, (2) meta-train a high-level policy that learns to efficiently compose learned skills into long-horizon behaviors, and (3) rapidly adapt the meta-trained policy to solve an unseen target task. Experimental results on continuous control tasks in navigation and manipulation demonstrate that the proposed method can efficiently solve long-horizon novel target tasks by combining the strengths of meta-learning and the usage of offline datasets, while prior approaches in RL, meta-RL, and multi-task RL require substantially more environment interactions to solve the tasks.", "keywords": "meta-RL;meta-reinforcement learning;skill-based meta-reinforcement learning;meta-learning;skill-based RL", "primary_area": "", "supplementary_material": "", "author": "Taewook Nam;Shao-Hua Sun;Karl Pertsch;Sung Ju Hwang;Joseph J Lim", "authorids": "~Taewook_Nam1;~Shao-Hua_Sun1;~Karl_Pertsch1;~Sung_Ju_Hwang1;~Joseph_J_Lim1", "gender": "M;M;;;M", "homepage": ";http://shaohua0116.github.io;https://kpertsch.github.io/;;http://people.csail.mit.edu/lim/", "dblp": ";158/9680;211/7137;;08/3086", "google_scholar": ";uXsfnaQAAAAJ;https://scholar.google.com/citations?view_op=list_works;;jTnQTBoAAAAJ", "orcid": ";0000-0001-7579-6734;;;", "linkedin": "https://www.linkedin.com/feed/;shaohua0116/;;;", "or_profile": "~Taewook_Nam1;~Shao-Hua_Sun1;~Karl_Pertsch1;~Sung_Ju_Hwang1;~Joseph_J_Lim1", "aff": "Korea Advanced Institute of Science & Technology;University of Southern California;University of Southern California;;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;usc.edu;usc.edu;;kaist.ac.kr", "position": "PhD student;PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nnam2022skillbased,\ntitle={Skill-based Meta-Reinforcement Learning},\nauthor={Taewook Nam and Shao-Hua Sun and Karl Pertsch and Sung Ju Hwang and Joseph J Lim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=jeLW-Fh9bV}\n}", "github": "", "project": "", "reviewers": "1NZY;AfGF;6qBE;jZvC", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;4", 
"correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "63;114;100;204", "wc_summary_review": "42;147;29;56", "wc_main_review": "687;414;337;585", "wc_review": "792;675;466;845", "wc_reply_reviewers": "424;290;449;0", "wc_reply_authors": "1407;748;1161;473", "reply_reviewers": "1;2;2;0", "reply_authors": "3;3;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 120.25, 51.81879485283308 ], "wc_summary_review_avg": [ 68.5, 46.31684358848301 ], "wc_main_review_avg": [ 505.75, 137.86474349883656 ], "wc_review_avg": [ 694.5, 145.55840752082995 ], "wc_reply_reviewers_avg": [ 290.75, 178.41997505884817 ], "wc_reply_authors_avg": [ 947.25, 361.13458363884234 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8367908304037497189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=jeLW-Fh9bV", "email": "kaist.ac.kr;usc.edu;usc.edu;;kaist.ac.kr", "author_num": 5, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Southern California", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.usc.edu", "aff_unique_abbr": "KAIST;USC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "South Korea;United States" }, { "id": "jf3q5f-uedA", "title": "Autonomous Shaping of Latent-Spaces from Reduced PDEs for Physical Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Numerical simulations using partial differential equations (PDEs) are a central tool for a wide variety of scientific and engineering applications. Due to their challenging nature, many numerical methods rely on a reduced representation of degrees of freedom and adopt an efficient solver that solves the PDEs in the reduced space. In general, however, it is extremely challenging to faithfully preserve the correct solutions over long timespans with reduced representations. This problem is particularly pronounced for solutions with large amounts of small scale features. To address this, data-driven methods can learn to restore the details as required for accurate solutions of the underlying PDE problem. This paper studies the training of deep neural network models that autonomously interact with a PDE solver to achieve the desired solutions. In contrast to previous work, we do not constrain the PDE solver but instead give the neural network complete freedom to shape the PDE solutions as degrees of freedom of a latent space. Surprisingly, this autonomy allows the neural network to discover new physical dynamics that allow for better performance in the given learning objectives. We showcase that this approach allows the trained encoder to transform accurate solutions into abstract yet physical reduced representations, which are significantly different from conventional down-sampling results. 
Moreover, we demonstrate that our decoder outperforms models trained with different methodologies in terms of restoration accuracy.", "keywords": "latent space representations;reduced PDE solvers;partial differential equations;differentiable physics", "primary_area": "", "supplementary_material": "/attachment/eca53dcc9a2b40f5e32ac83aee78f19afc43daf4.zip", "author": "Chlo\u00e9 Paliard;Nils Thuerey;Marco Cagnazzo;Kiwon Um", "authorids": "~Chlo\u00e9_Paliard1;~Nils_Thuerey1;marco.cagnazzo@telecom-paris.fr;~Kiwon_Um1", "gender": ";M;;M", "homepage": "https://perso.telecom-paristech.fr/cpaliard/;https://ge.in.tum.de;;https://perso.telecom-paristech.fr/kum/", "dblp": ";42/478;;71/6369", "google_scholar": ";https://scholar.google.com.tw/citations?user=GEehwv8AAAAJ;;H2Omi3wAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Chlo\u00e9_Paliard1;~Nils_Thuerey1;marco.cagnazzo@telecom-paris.fr;~Kiwon_Um1", "aff": "T\u00e9l\u00e9com Paris;Technical University Munich;;Telecom Paris", "aff_domain": "telecom-paristech.fr;tum.de;;telecom-paris.fr", "position": "PhD student;Associate Professor;;Assistant Professor", "bibtex": "@misc{\npaliard2022autonomous,\ntitle={Autonomous Shaping of Latent-Spaces from Reduced {PDE}s for Physical Neural Networks},\nauthor={Chlo{\\'e} Paliard and Nils Thuerey and Marco Cagnazzo and Kiwon Um},\nyear={2022},\nurl={https://openreview.net/forum?id=jf3q5f-uedA}\n}", "github": "", "project": "", "reviewers": "uzHf;1fKi;BFRe;C7Ld", "site": "https://openreview.net/forum?id=jf3q5f-uedA", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "4;3;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "72;103;122;134", "wc_summary_review": "38;68;73;38", "wc_main_review": "1015;403;346;369", "wc_review": "1125;574;541;541", "wc_reply_reviewers": "52;0;0;0", "wc_reply_authors": "1030;447;513;721", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 107.75, 23.41340428045439 ], "wc_summary_review_avg": [ 54.25, 16.345871038277526 ], "wc_main_review_avg": [ 533.25, 278.87665284135926 ], "wc_review_avg": [ 695.25, 248.4817649245111 ], "wc_reply_reviewers_avg": [ 13.0, 22.516660498395403 ], "wc_reply_authors_avg": [ 677.75, 227.12262656987744 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WReOjBNPE-kJ:scholar.google.com/&scioq=Autonomous+Shaping+of+Latent-Spaces+from+Reduced+PDEs+for+Physical+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "T\u00e9l\u00e9com Paris;Technical University of Munich;Telecom Paris", "aff_unique_dep": ";;", "aff_unique_url": "https://www.telecom-paris.fr;https://www.tum.de;https://www.telecom-paris.fr", "aff_unique_abbr": "T\u00e9l\u00e9com Paris;TUM;Telecom Paris", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "France;Germany" }, { "id": "jgAl403zfau", "title": "HALP: Hardware-Aware Latency 
Pruning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Structural pruning can simplify network architecture and improve the inference speed. We propose Hardware-Aware Latency Pruning (HALP) that formulates structural pruning as a global resource allocation optimization problem, aiming at maximizing the accuracy while constraining latency under a predefined budget. For filter importance ranking, HALP leverages latency lookup table to track latency reduction potential and global saliency score to gauge on accuracy drop. Both metrics can be evaluated very efficiently during pruning, allowing us to reformulate global structural pruning under a reward maximization problem given target constraint. This makes the problem solvable via our augmented knapsack solver, enabling HALP to surpass prior work in pruning efficacy and accuracy-efficiency trade-off. We examine HALP on both classification and detection tasks, over varying networks, on ImageNet1K and VOC datasets. In particular for ResNet-50/-101 pruning on ImageNet1K, HALP improves network speed by $1.60\\times$/$1.90\\times$ with $+0.3\\%$/$-0.2\\%$ top-1 accuracy changes, respectively. For SSD pruning on VOC, HALP improves throughput by $1.94\\times$ with only a $0.56$ mAP drop. HALP consistently outperforms prior art, sometimes by large margins.", "keywords": "Efficient deep learning;deep neural network pruning;latency reduction;hardware-aware pruning", "primary_area": "", "supplementary_material": "", "author": "Maying Shen;Hongxu Yin;Pavlo Molchanov;Lei Mao;Jianna Liu;Jose M. Alvarez", "authorids": "~Maying_Shen1;~Hongxu_Yin2;~Pavlo_Molchanov1;~Lei_Mao1;jiannal@nvidia.com;~Jose_M._Alvarez2", "gender": ";;M;M;;", "homepage": "https://mayings.github.io/;;;https://leimao.github.io/;;", "dblp": "195/2178;;165/8169.html;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;J9PoyoIAAAAJ;R2VUf7YAAAAJ;;", "orcid": "0009-0000-9416-680X;;;0000-0001-8579-3182;;", "linkedin": ";;;;;", "or_profile": "~Maying_Shen1;~Hongxu_Yin2;~Pavlo_Molchanov1;~Lei_Mao1;jiannal@nvidia.com;~Jose_M._Alvarez2", "aff": "NVIDIA;;NVIDIA Research;NVIDIA;;", "aff_domain": "nvidia.com;;nvidia.com;nvidia.com;;", "position": "Deep Learning R&D Engineer;;Research Scientist;Deep Learning Engineer;;", "bibtex": "@misc{\nshen2022halp,\ntitle={{HALP}: Hardware-Aware Latency Pruning},\nauthor={Maying Shen and Hongxu Yin and Pavlo Molchanov and Lei Mao and Jianna Liu and Jose M. 
Alvarez},\nyear={2022},\nurl={https://openreview.net/forum?id=jgAl403zfau}\n}", "github": "", "project": "", "reviewers": "dTnS;ywGD;DgPf;2vJd", "site": "https://openreview.net/forum?id=jgAl403zfau", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;5;5;4", "correctness": "3;3;4;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "68;61;64;58", "wc_summary_review": "33;37;23;74", "wc_main_review": "328;367;263;147", "wc_review": "429;465;350;279", "wc_reply_reviewers": "0;25;0;54", "wc_reply_authors": "353;734;752;397", "reply_reviewers": "0;1;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 62.75, 3.6996621467371855 ], "wc_summary_review_avg": [ 41.75, 19.30511590226798 ], "wc_main_review_avg": [ 276.25, 83.35878777909382 ], "wc_review_avg": [ 380.75, 71.98046610018582 ], "wc_reply_reviewers_avg": [ 19.75, 22.252808811473667 ], "wc_reply_authors_avg": [ 559.0, 184.76606831342167 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6814167406924458789&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA Corporation", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "jkpT8c7jal4", "title": "On Deep Neural Network Calibration by Regularization and its Impact on Refinement", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep neural networks have been shown to be highly miscalibrated; often they tend to be overconfident in their predictions. This poses a significant challenge for safety-critical systems to utilise deep neural networks (DNNs) reliably. Many recently proposed approaches to mitigate this have demonstrated substantial progress in improving DNN calibration. However, they hardly touch upon refinement, which historically has been an essential aspect of calibration. Refinement indicates separability of a network's correct and incorrect predictions. This paper presents a theoretically and empirically supported exposition reviewing refinement of a calibrated model. Firstly, we show the breakdown of expected calibration error (ECE) into predicted confidence and refinement under the assumption of over-confident predictions. Secondly, linking with this result, we highlight that regularisation-based calibration only focuses on naively reducing a model's confidence. This logically has a severe downside to a model's refinement as correct and incorrect predictions become tightly coupled. Lastly, connecting refinement with ECE also provides support to existing refinement-based approaches which improve calibration but do not explain the reasoning behind it. We support our claims through rigorous empirical evaluations of many state-of-the-art calibration approaches on widely used datasets and neural networks.
We find that many calibration approaches, such as label smoothing and mixup, lower the usefulness of a DNN by degrading its refinement. Even under natural data shift, this calibration-refinement trade-off holds for the majority of calibration methods.", "keywords": "calibration;refinement", "primary_area": "", "supplementary_material": "", "author": "Aditya Singh", "authorids": "~Aditya_Singh3", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "kkE_sOoAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Aditya_Singh3", "aff": "Zebra Technologies", "aff_domain": "zebra.com", "position": "Computer Vision Researcher II", "bibtex": "@misc{\nsingh2022on,\ntitle={On Deep Neural Network Calibration by Regularization and its Impact on Refinement},\nauthor={Aditya Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=jkpT8c7jal4}\n}", "github": "", "project": "", "reviewers": "bU5Z;N3vq;n7zc;upRd", "site": "https://openreview.net/forum?id=jkpT8c7jal4", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;4;3", "correctness": "2;2;1;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "52;41;58;52", "wc_summary_review": "31;139;53;127", "wc_main_review": "178;278;489;306", "wc_review": "261;458;600;485", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 50.75, 6.139014578904337 ], "wc_summary_review_avg": [ 87.5, 46.354611421087334 ], "wc_main_review_avg": [ 312.75, 112.33293150274322 ], "wc_review_avg": [ 451.0, 121.96925842194827 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14762622862834183787&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Zebra Technologies Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.zebra.com", "aff_unique_abbr": "Zebra", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "jm0Ppu7xvok", "title": "An Optimally Weighted Echo State Neural Network for Highly Chaotic Time Series Modelling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We demonstrate the development and implementation of a series of echo state neural networks in conjunction with optimally weighted averaging to produce robust, model-free predictions of highly chaotic time series. We deploy our model on simulated mass accretion data sets governing the formation of a protostar, accretion disk and gas envelope. Our methodology extends the parallel series approach of averaging all reservoir outputs by instead selecting, for each realization, an optimal set of weights that we believe construct the output signal best representing the data\u2019s underlying temporal dynamics.
The method is demonstrated by modelling hydrodynamic and stellar evolution simulations that are representative of highly chaotic systems.", "keywords": "time series;neural networks;young stellar objects", "primary_area": "", "supplementary_material": "", "author": "Gianfranco Bino;Gabriel Bruno Benigno;Lyle Muller;Shantanu Basu", "authorids": "~Gianfranco_Bino1;~Gabriel_Bruno_Benigno1;~Lyle_Muller1;~Shantanu_Basu1", "gender": "M;;M;M", "homepage": ";;http://mullerlab.ca;http://www.astro.uwo.ca/~basu/", "dblp": ";;;", "google_scholar": "voRXmW4AAAAJ;https://scholar.google.ca/citations?user=BsNdLCkAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Gianfranco_Bino1;~Gabriel_Bruno_Benigno1;~Lyle_Muller1;~Shantanu_Basu1", "aff": ";University of Western Ontario;Western University;University of Western Ontario", "aff_domain": ";uwo.ca;uwo.ca;uwo.ca", "position": ";PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nbino2022an,\ntitle={An Optimally Weighted Echo State Neural Network for Highly Chaotic Time Series Modelling},\nauthor={Gianfranco Bino and Gabriel Bruno Benigno and Lyle Muller and Shantanu Basu},\nyear={2022},\nurl={https://openreview.net/forum?id=jm0Ppu7xvok}\n}", "github": "", "project": "", "reviewers": "EEHG;joZs;LLxS;4wnp", "site": "https://openreview.net/forum?id=jm0Ppu7xvok", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "38;20;39;15", "wc_summary_review": "54;24;23;23", "wc_main_review": "94;165;86;83", "wc_review": "186;209;148;121", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 28.0, 10.653637876331258 ], "wc_summary_review_avg": [ 31.0, 13.285330255586423 ], "wc_main_review_avg": [ 107.0, 33.726843908080106 ], "wc_review_avg": [ 166.0, 33.904277016329374 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eff8Fh_wf-0J:scholar.google.com/&scioq=An+Optimally+Weighted+Echo+State+Neural+Network+for+Highly+Chaotic+Time+Series+Modelling&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Western Ontario;Western University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uwo.ca;https://www.uwo.ca", "aff_unique_abbr": "UWO;Western", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "jm1RxJFQdDN", "title": "Perturbation Diversity Certificates Robust Generalisation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Whilst adversarial training has been proven the most effective defence against adversarial attacks on deep neural networks, it suffers from overfitting on unseen adversarial data and thus may not guarantee robust generalisation.
This is possibly because conventional adversarial training methods usually generate adversarial perturbations in a supervised way, so the adversarial samples are highly biased towards the decision boundary, resulting in an inhomogeneous data distribution. To mitigate this limitation, we propose a novel adversarial training method from a perturbation diversity perspective. Specifically, we generate perturbed samples not only adversarially but also diversely, so as to certify a significant robustness improvement through a homogeneous data distribution. We provide both theoretical and empirical analyses that establish a solid foundation for the proposed method. To verify our method\u2019s effectiveness, we conduct extensive experiments over different datasets (e.g., CIFAR-10, CIFAR-100, SVHN) with different adversarial attacks (e.g., PGD, CW). Experimental results show that our method outperforms other state-of-the-art methods (e.g., PGD and Feature Scattering) in robust generalisation performance. (Source code is available in the supplementary material.)", "keywords": "adversarial example;robust generalisation;adversarial training", "primary_area": "", "supplementary_material": "/attachment/32171451ae74b6191f490f6d7600c8361218996a.zip", "author": "Zhuang QIAN;Shufei Zhang;Kaizhu Huang;Qiufeng Wang;Bin Gu;Huan Xiong;Xinping Yi", "authorids": "~Zhuang_QIAN1;~Shufei_Zhang1;~Kaizhu_Huang1;~Qiufeng_Wang2;~Bin_Gu1;~Huan_Xiong1;~Xinping_Yi1", "gender": "M;M;M;M;M;M;M", "homepage": ";;https://sites.google.com/view/kaizhu-huang-homepage;https://scholar.xjtlu.edu.cn/en/persons/QiufengWang;https://mbzuai.ac.ae/study/faculty/bin-gu/;https://scholar.google.com/citations?user=l4hm14MAAAAJ&hl=en;https://sites.google.com/site/xinpingyi00/", "dblp": "234/7908;152/7935;99/3390;86/7443-1;29/1758-1;;95/10043.html", "google_scholar": ";;https://scholar.google.com/citations?hl=en;qj66yXAAAAAJ;Vo8OgCgAAAAJ;l4hm14MAAAAJ;wAcbI5kAAAAJ", "orcid": ";;;0000-0002-0918-4606;0000-0001-6049-1815;;", "linkedin": ";;;;;;", "or_profile": "~Zhuang_QIAN1;~Shufei_Zhang1;~Kaizhu_Huang1;~Qiufeng_Wang2;~Bin_Gu1;~Huan_Xiong1;~Xinping_Yi1", "aff": "Xi'an Jiaotong-Liverpool University;Shanghai AI Lab;Xi'an Jiaotong-Liverpool University;Xi'an Jiaotong-Liverpool University;Mohamed bin Zayed University of Artificial Intelligence;;University of Liverpool", "aff_domain": "xjtlu.edu.cn;pjlab.org.cn;xjtlu.edu.cn;xjtlu.edu.cn;mbzuai.ac.ae;;liverpool.ac.uk", "position": "PhD student;Researcher;Full Professor;Associate Professor;Assistant Professor;;Assistant Professor", "bibtex": "@misc{\nqian2022perturbation,\ntitle={Perturbation Diversity Certificates Robust Generalisation},\nauthor={Zhuang QIAN and Shufei Zhang and Kaizhu Huang and Qiufeng Wang and Bin Gu and Huan Xiong and Xinping Yi},\nyear={2022},\nurl={https://openreview.net/forum?id=jm1RxJFQdDN}\n}", "github": "", "project": "", "reviewers": "Tbv6;aPPm;QmED;7z25", "site": "https://openreview.net/forum?id=jm1RxJFQdDN", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;3;3", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "26;51;28;67", "wc_summary_review": "55;15;2;21", "wc_main_review": "284;481;85;264", "wc_review": "365;547;115;352", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ],
"technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 43.0, 16.98528775146303 ], "wc_summary_review_avg": [ 23.25, 19.57517560585345 ], "wc_main_review_avg": [ 278.5, 140.257798357168 ], "wc_review_avg": [ 344.75, 153.42160050005995 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cg2acsDwqosJ:scholar.google.com/&scioq=Perturbation+Diversity+Certificates+Robust+Generalisation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;2;3", "aff_unique_norm": "Xi'an Jiao Tong-Liverpool University;Shanghai AI Lab;Mohamed bin Zayed University of Artificial Intelligence;University of Liverpool", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.xjtu.edu.cn/en;https://www.shanghaiailab.com;https://mbzuai.ac.ae;https://www.liverpool.ac.uk", "aff_unique_abbr": "XJTLU;SAIL;MBZUAI;Liv Uni", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;0;1;2", "aff_country_unique": "China;United Arab Emirates;United Kingdom" }, { "title": "Is Fairness Only Metric Deep? Evaluating and Addressing Subgroup Gaps in Deep Metric Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6112", "id": "js62_xuLDDv", "poster": "", "openreview": "https://openreview.net/forum?id=js62_xuLDDv", "slides": "https://iclr.cc/virtual/2022/poster/6112", "video": "https://iclr.cc/virtual/2022/poster/6112", "author_site": "Natalie Dullerud, Karsten Roth, Kimia Hamidieh, Nicolas Papernot, Marzyeh Ghassemi", "tldr": "", "abstract": "Deep metric learning (DML) enables learning with less supervision through its emphasis on the similarity structure of representations. There has been much work on improving generalization of DML in settings like zero-shot retrieval, but little is known about its implications for fairness. In this paper, we are the first to evaluate state-of-the-art DML methods trained on imbalanced data, and to show the negative impact these representations have on minority subgroup performance when used for downstream tasks. In this work, we first define fairness in DML through an analysis of three properties of the representation space -- inter-class alignment, intra-class alignment, and uniformity -- and propose \\textit{\\textbf{finDML}}, the \\textit{\\textbf{f}}airness \\textit{\\textbf{i}}n \\textit{\\textbf{n}}on-balanced \\textit{\\textbf{DML}} benchmark to characterize representation fairness. Utilizing \\textit{finDML}, we find bias in DML representations to propagate to common downstream classification tasks. Surprisingly, this bias is propagated even when training data in the downstream task is re-balanced. 
To address this problem, we present Partial Attribute De-correlation (\\textit{\\textbf{\\pad}}) to disentangle feature representations from sensitive attributes and reduce performance gaps between subgroups in both embedding space and downstream metrics.", "keywords": "deep metric learning;fairness;representation learning", "primary_area": "", "supplementary_material": "/attachment/9c2d8e567434fbb161734f50533b624e1ae94b8d.zip", "author": "Natalie Dullerud;Karsten Roth;Kimia Hamidieh;Nicolas Papernot;Marzyeh Ghassemi", "authorids": "~Natalie_Dullerud1;~Karsten_Roth1;~Kimia_Hamidieh1;~Nicolas_Papernot1;~Marzyeh_Ghassemi2", "gender": ";Not Specified;F;M;F", "homepage": ";https://karroth.com/;;https://www.papernot.fr;https://www.healthyml.org/", "dblp": ";234/7803;;162/1405;145/6563", "google_scholar": ";93ZjIs0AAAAJ;;cGxq0cMAAAAJ;", "orcid": ";;;;", "linkedin": "natalie-dullerud-777ba5178/;;kimia-hamidieh-956519212/;nicolaspapernot;", "or_profile": "~Natalie_Dullerud1;~Karsten_Roth1;~Kimia_Hamidieh1;~Nicolas_Papernot1;~Marzyeh_Ghassemi2", "aff": "Toronto University;University of Tuebingen;University of Toronto;Google;Massachusetts Institute of Technology", "aff_domain": "utoronto.ca;uni-tuebingen.de;cs.toronto.edu;google.com;mit.edu", "position": "MS student;PhD student;MS student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\ndullerud2022is,\ntitle={Is Fairness Only Metric Deep? Evaluating and Addressing Subgroup Gaps in Deep Metric Learning},\nauthor={Natalie Dullerud and Karsten Roth and Kimia Hamidieh and Nicolas Papernot and Marzyeh Ghassemi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=js62_xuLDDv}\n}", "github": "", "project": "", "reviewers": "vBmc;RQoJ;ryjh;atFA", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;3", "correctness": "3;3;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "77;105;101;34", "wc_summary_review": "71;62;61;48", "wc_main_review": "245;389;225;57", "wc_review": "393;556;387;139", "wc_reply_reviewers": "0;0;64;0", "wc_reply_authors": "729;1519;438;387", "reply_reviewers": "0;0;1;0", "reply_authors": "1;4;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 79.25, 28.23450902707536 ], "wc_summary_review_avg": [ 60.5, 8.200609733428363 ], "wc_main_review_avg": [ 229.0, 117.74548823628021 ], "wc_review_avg": [ 368.75, 148.97042491716266 ], "wc_reply_reviewers_avg": [ 16.0, 27.712812921102035 ], "wc_reply_authors_avg": [ 768.25, 452.65404836364826 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9843183579476580291&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=js62_xuLDDv", "email": "utoronto.ca;uni-tuebingen.de;cs.toronto.edu;google.com;mit.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "University of Toronto;University of Tuebingen;Google;Massachusetts Institute of Technology", "aff_unique_dep": ";;Google;", "aff_unique_url": 
"https://www.utoronto.ca;https://www.uni-tuebingen.de/;https://www.google.com;https://web.mit.edu", "aff_unique_abbr": "U of T;Uni T\u00fcbingen;Google;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;2;2", "aff_country_unique": "Canada;Germany;United States" }, { "id": "jxTRL-VOoQo", "title": "Evaluating Deep Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have already been widely applied in various graph mining tasks. However, most GNNs only have shallow architectures, which limits performance improvement. In this paper, we conduct a systematic experimental evaluation on the fundamental limitations of current architecture designs. Based on the experimental results, we answer the following two essential questions: (1) what actually leads to the compromised performance of deep GNNs; (2) how to build deep GNNs. The answers to the above questions provide empirical insights and guidelines for researchers to design deep GNNs. Further, we present Deep Graph Multi-Layer Perceptron (DGMLP), a powerful approach implementing our proposed guidelines. Experimental results demonstrate three advantages of DGMLP: 1) high accuracy -- it achieves state-of-the-art node classification performance on various datasets; 2) high flexibility -- it can flexibly choose different propagation and transformation depths according to certain graph properties; 3) high scalability and efficiency -- it supports fast training on large-scale graphs.", "keywords": "Deep;Graph Neural Networks", "primary_area": "", "supplementary_material": "/attachment/8714062109e35c3a8940b051a27de2bf5839548e.zip", "author": "Wentao Zhang;Zeang Sheng;Jiang Yuezihan;Yikuan Xia;Jun Gao;Zhi Yang;Bin CUI", "authorids": "~Wentao_Zhang1;~Zeang_Sheng1;~Jiang_Yuezihan1;~Yikuan_Xia1;~Jun_Gao6;~Zhi_Yang4;~Bin_CUI2", "gender": "M;F;M;M;M;M;M", "homepage": "https://scholar.google.com/citations?user=cIaU0iIAAAAJ&hl=en;https://github.com/lovelyhan;;https://cs.pku.edu.cn/info/1066/1619.htm;https://yangzhihome.github.io/;https://cuibinpku.github.io/index.html;https://zwt233.github.io/", "dblp": "298/0674;;;82/4977-3.html;90/5587-1;55/5031.html;41/3249-1.html", "google_scholar": "cIaU0iIAAAAJ;;;afqMY6UAAAAJ;;IJAU8KoAAAAJ;JE4VON0AAAAJ", "orcid": "0009-0002-4427-3038;;my-orcid?orcid=0000-0002-5483-9309;0000-0002-6750-8496;;0000-0003-1681-4677;0000-0002-7532-5550", "linkedin": ";;;;;;", "or_profile": "~Zeang_Sheng1;~Jiang_Yuezihan1;~Yikuan_Xia1;~Jun_Gao6;~Zhi_Yang4;~Bin_CUI2;~Zhang_wen_tao1", "aff": "Peking University;Peking University;;computer science department, Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;;cs.pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;MS student;;Full Professor;Associate Professor;Full Professor;PhD student", "bibtex": "@misc{\nzhang2022evaluating,\ntitle={Evaluating Deep Graph Neural Networks},\nauthor={Wentao Zhang and Zeang Sheng and Jiang Yuezihan and Yikuan Xia and Jun Gao and Zhi Yang and Bin CUI},\nyear={2022},\nurl={https://openreview.net/forum?id=jxTRL-VOoQo}\n}", "github": "", "project": "", "reviewers": "tpnS;ddQY;4Jnf;QtrD", "site": "https://openreview.net/forum?id=jxTRL-VOoQo", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;5;4", "correctness": "2;2;2;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;1;3;3", "wc_summary_paper": "88;33;61;214", "wc_summary_review": 
"42;64;43;70", "wc_main_review": "137;653;324;284", "wc_review": "267;750;428;568", "wc_reply_reviewers": "0;0;116;190", "wc_reply_authors": "382;1013;1297;539", "reply_reviewers": "0;0;1;1", "reply_authors": "1;2;3;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 99.0, 69.18453584436337 ], "wc_summary_review_avg": [ 54.75, 12.43734296383275 ], "wc_main_review_avg": [ 349.5, 188.55304293487285 ], "wc_review_avg": [ 503.25, 177.87267215623652 ], "wc_reply_reviewers_avg": [ 76.5, 80.85017006784834 ], "wc_reply_authors_avg": [ 807.75, 365.7125749820479 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.19245008972987526, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12847467981585094977&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "jxdyknFeCqO", "title": "Full-Precision Free Binary Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Binary neural networks have become a promising research topic due to their fast inference speed and low energy consumption advantages. However, most existing works focus on binary convolutional neural networks, while less attention has been paid to binary graph neural networks. A common drawback of existing works on binary graph neural networks is that they still include lots of inefficient full-precision operations and hence are not efficient enough. In this paper, we propose a novel method, called full-precision free binary graph neural networks (FFBGN), to avoid full-precision operations for binarizing graph neural networks. To address the challenges introduced by re-quantization which is a necessary procedure for avoiding full-precision operations, in FFBGN we first study the impact of different computation orders to find an effective computation order and then introduce mixture of experts to increase the model capacity. 
Experiments on three large-scale datasets show that performing re-quantization in different computation orders significantly impacts the performance of binary graph neural network models, and FFBGN can outperform other baselines to achieve state-of-the-art performance.", "keywords": "Graph Neural Networks;Binary Neural Networks;Mixture of Experts", "primary_area": "", "supplementary_material": "", "author": "Kai-Lang Yao;Wu-Jun Li", "authorids": "~Kai-Lang_Yao1;~Wu-Jun_Li1", "gender": ";M", "homepage": ";https://cs.nju.edu.cn/lwj/", "dblp": "217/1659;26/988.html", "google_scholar": "rjA8qzkAAAAJ;NCCdqdcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Kai-Lang_Yao1;~Wu-Jun_Li1", "aff": "Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@misc{\nyao2022fullprecision,\ntitle={Full-Precision Free Binary Graph Neural Networks},\nauthor={Kai-Lang Yao and Wu-Jun Li},\nyear={2022},\nurl={https://openreview.net/forum?id=jxdyknFeCqO}\n}", "github": "", "project": "", "reviewers": "y9Sq;4uJ3;Xf2a", "site": "https://openreview.net/forum?id=jxdyknFeCqO", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;2;4", "correctness": "2;3;3", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "71;39;74", "wc_summary_review": "19;20;46", "wc_main_review": "464;168;764", "wc_review": "554;227;884", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 61.333333333333336, 15.839472494022296 ], "wc_summary_review_avg": [ 28.333333333333332, 12.498888839501783 ], "wc_main_review_avg": [ 465.3333333333333, 243.3178077238811 ], "wc_review_avg": [ 555.0, 268.220058906861 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R2Z2byJXqXQJ:scholar.google.com/&scioq=Full-Precision+Free+Binary+Graph+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "k-ES3OH7eqp", "title": "De novo design of protein target specific scaffold-based Inhibitors via Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Efficient design and discovery of target-driven molecules is a critical step in facilitating lead optimization in drug discovery. Current approaches to develop molecules for a given protein target are intuition-driven, hampered by slow iterative design-test cycles due to computational challenges in utilizing 3D structural data, and ultimately limited by the expertise of the chemist \u2013 leading to bottlenecks in molecular design. 
In this contribution, we propose a novel framework, called 3D-MolGNN_RL, coupling reinforcement learning (RL) to a deep generative model based on 3D-Scaffold to generate target candidates specific to a protein pocket, building them up atom by atom from the starting core scaffold. 3D-MolGNN_RL provides an efficient way to optimize key features via a multi-objective reward function within a protein pocket using parallel graph neural network models. The agent learns to build molecules in 3D space while optimizing the binding affinity, potency, and synthetic accessibility of the candidates generated for the SARS-CoV-2 Main Protease.", "keywords": "Protein target specific molecular design;reinforcement learning;Graph Neural Networks;Lead molecule optimization;Drug Discovery;Protein-ligand interaction", "primary_area": "", "supplementary_material": "/attachment/9a915cbd96794e45fcbc71aa12a3373714d87ec4.zip", "author": "Mridula Bontha;Andrew McNaughton;Carter Knutson;Jenna Pope;Neeraj Kumar", "authorids": "mridula.bontha@pnnl.gov;andrew.mcnaughton@pnnl.gov;carter.knutson@pnnl.gov;jenna.pope@pnnl.gov;~Neeraj_Kumar4", "gender": ";;;;M", "homepage": ";;;;https://www.pnnl.gov/people/neeraj-kumar-phd", "dblp": ";;;;64/2584", "google_scholar": ";;;;https://scholar.google.ca/citations?user=OWEJf5EAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "mridula.bontha@pnnl.gov;andrew.mcnaughton@pnnl.gov;carter.knutson@pnnl.gov;jenna.pope@pnnl.gov;~Neeraj_Kumar4", "aff": ";;;;Pacific Northwest National Laboratory", "aff_domain": ";;;;pnnl.gov", "position": ";;;;Chief Data Scientist", "bibtex": "@misc{\nbontha2022de,\ntitle={De novo design of protein target specific scaffold-based Inhibitors via Reinforcement Learning},\nauthor={Mridula Bontha and Andrew McNaughton and Carter Knutson and Jenna Pope and Neeraj Kumar},\nyear={2022},\nurl={https://openreview.net/forum?id=k-ES3OH7eqp}\n}", "github": "", "project": "", "reviewers": "w8E8;TFgy;obay;CG6o", "site": "https://openreview.net/forum?id=k-ES3OH7eqp", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;4;4;4", "correctness": "2;2;1;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "35;65;43;111", "wc_summary_review": "25;67;295;36", "wc_main_review": "30;1553;61;435", "wc_review": "90;1685;399;582", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 63.5, 29.542342493444895 ], "wc_summary_review_avg": [ 105.75, 110.34349776946533 ], "wc_main_review_avg": [ 519.75, 617.4736330403105 ], "wc_review_avg": [ 689.0, 601.3247874485136 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12459942082700772909&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Pacific Northwest National Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.pnnl.gov", "aff_unique_abbr": "PNNL", "aff_country_unique_index": "0", "aff_country_unique": "United
States" }, { "id": "k-sNDIPY-1T", "title": "Modelling neuronal behaviour with time series regression: Recurrent Neural Networks on synthetic C. elegans data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Given the inner complexity of the human nervous system, insight into the dynamics of brain activity can be gained from understanding smaller and simpler organisms, such as the nematode C. elegans. The behavioural and structural biology of these organisms is well-known, making them prime candidates for benchmarking modelling and simulation techniques. In these complex neuronal collections, classical white-box modelling techniques based on intrinsic structural or behavioural information are either unable to capture the profound nonlinearities of the neuronal response to different stimuli or generate extremely complex models, which are computationally intractable. In this paper we investigate whether it is possible to generate lower complexity black-box models that can capture the system dynamics with low error using only measured or simulated input-output information. We show how the nervous system of C. elegans can be modelled and simulated with data-driven models using different neural network architectures. Specifically, we target the use of state of the art recurrent neural networks architectures such as LSTMs and GRUs and compare these architectures in terms of their properties and their RMSE, as well as the complexity of the resulting models. \nWe show that GRU models with a hidden layer size of 4 units are able to accurately reproduce the system's response to very different stimuli.", "keywords": "Data-driven models;RNNs;LSTMs;GRUs;C. Elegans;Time series regression;black-box", "primary_area": "", "supplementary_material": "", "author": "Gon\u00e7alo Leote Cardoso Mestre;Ruxandra Barbulescu;Arlindo L. Oliveira;L. Miguel Silveira", "authorids": "~Gon\u00e7alo_Leote_Cardoso_Mestre1;~Ruxandra_Barbulescu1;~Arlindo_L._Oliveira1;~L._Miguel_Silveira1", "gender": "M;F;M;M", "homepage": "https://www.inesc-id.pt/member/23477/;;http://web.tecnico.ulisboa.pt/arlindo.oliveira/;https://fenix.tecnico.ulisboa.pt/homepage/ist12270/apresentacao", "dblp": ";;o/ArlindoLOliveira;s/LuisMiguelSilveira.html", "google_scholar": ";;dqtEnaoAAAAJ;IMt587kAAAAJ", "orcid": ";0000-0001-8960-5329;0000-0001-8638-5594;", "linkedin": ";;arlindo-oliveira-4119a1a/;luis-miguel-silveira-77a3304/?originalSubdomain=pt", "or_profile": "~Gon\u00e7alo_Leote_Cardoso_Mestre1;~Ruxandra_Barbulescu1;~Arlindo_L._Oliveira1;~L._Miguel_Silveira1", "aff": "Instituto Superior T\u00e9cnico;INESC-ID: Instituto de Engenharia de Sistemas e Computadores, Investiga\u00e7\u00e3o e Desenvolvimento em Lisboa;INESC-ID;INESC ID - Systems and Computers Research Institute, R&D Lisbon", "aff_domain": "tecnico.ulisboa.pt;inesc-id.pt;inesc-id.pt;inesc-id.pt", "position": "MS student;Postdoc;Researcher;Principal Researcher", "bibtex": "@misc{\nmestre2022modelling,\ntitle={Modelling neuronal behaviour with time series regression: Recurrent Neural Networks on synthetic C. elegans data},\nauthor={Gon{\\c{c}}alo Leote Cardoso Mestre and Ruxandra Barbulescu and Arlindo L. Oliveira and L. 
Miguel Silveira},\nyear={2022},\nurl={https://openreview.net/forum?id=k-sNDIPY-1T}\n}", "github": "", "project": "", "reviewers": "mRji;Y3zd;okcY", "site": "https://openreview.net/forum?id=k-sNDIPY-1T", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "5;5;4", "correctness": "3;4;4", "technical_novelty": "1;3;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "31;87;126", "wc_summary_review": "55;42;61", "wc_main_review": "512;80;404", "wc_review": "598;209;591", "wc_reply_reviewers": "124;0;559", "wc_reply_authors": "800;0;460", "reply_reviewers": "1;0;2", "reply_authors": "1;0;3", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 81.33333333333333, 38.990027214945904 ], "wc_summary_review_avg": [ 52.666666666666664, 7.93025150224688 ], "wc_main_review_avg": [ 332.0, 183.56470248934025 ], "wc_review_avg": [ 466.0, 181.74891104671485 ], "wc_reply_reviewers_avg": [ 227.66666666666666, 239.69471324072953 ], "wc_reply_authors_avg": [ 420.0, 327.8210894171799 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 1.247219128924647 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.9176629354822472, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14128026455820047829&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Instituto Superior T\u00e9cnico;Instituto de Engenharia de Sistemas e Computadores, Investiga\u00e7\u00e3o e Desenvolvimento;INESC-ID;INESC ID", "aff_unique_dep": ";;;Systems and Computers Research Institute", "aff_unique_url": "https://www.ist.utl.pt;https://www.inesc-id.pt;https://www.inesc-id.pt;https://www.inesc-id.pt", "aff_unique_abbr": "IST;INESC-ID;INESC-ID;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lisboa", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Portugal" }, { "id": "k0pi7xDoDTC", "title": "IID-GAN: an IID Sampling Perspective for Regularizing Mode Collapse", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite their success, generative adversarial networks (GANs) still suffer from mode collapse, namely that the generator can only map latent variables to a partial set of modes of the target distribution. In this paper, we analyze and regularize this issue from an independent and identically distributed (IID) sampling perspective, emphasizing that enforcing the IID property for generations from the target distribution (i.e., the real distribution) naturally avoids mode collapse. This is based on the basic IID assumption for real data in machine learning. However, although the source samples $\mathbf{z}$ are IID, the generations $G(\mathbf{z})$ are not necessarily IID from the target distribution. Based on this observation, we propose a necessary condition for IID generation and provide a new loss that encourages closeness between the inverse source of real data and the Gaussian source in the latent space, regularizing the generations to be IID from the target distribution. The logic is that the inverse samples from target data should also be IID in the source distribution.
Experiments on both synthetic and real-world data show the effectiveness of our model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liangliang Shi;Yang Li;Alan Yuhan Xi;Junchi Yan", "authorids": "~Liangliang_Shi1;~Yang_Li32;alanyxi@gmail.com;~Junchi_Yan2", "gender": "M;M;;", "homepage": ";https://yangco-le.github.io;;", "dblp": "89/8730;;;", "google_scholar": "Qf1k8lUAAAAJ;ecE0xDIAAAAJ;;", "orcid": "0000-0001-7033-4207;0000-0002-5249-3471;;", "linkedin": ";;;", "or_profile": "~Liangliang_Shi1;~Yang_Li32;alanyxi@gmail.com;~Junchi_Yan2", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;;", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;;", "position": "PhD student;Undergrad student;;", "bibtex": "@misc{\nshi2022iidgan,\ntitle={{IID}-{GAN}: an {IID} Sampling Perspective for Regularizing Mode Collapse},\nauthor={Liangliang Shi and Yang Li and Alan Yuhan Xi and Junchi Yan},\nyear={2022},\nurl={https://openreview.net/forum?id=k0pi7xDoDTC}\n}", "github": "", "project": "", "reviewers": "GM9o;hyhz;aMta;zPja", "site": "https://openreview.net/forum?id=k0pi7xDoDTC", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;4", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;1;2", "wc_summary_paper": "63;90;34;105", "wc_summary_review": "25;103;44;5", "wc_main_review": "212;219;408;178", "wc_review": "300;412;486;288", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 73.0, 27.08320512790168 ], "wc_summary_review_avg": [ 44.25, 36.615399765672365 ], "wc_main_review_avg": [ 254.25, 90.11208298557969 ], "wc_review_avg": [ 371.5, 81.90695941127348 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12715800391165920803&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "k32ZY1CmE0", "title": "How to train RNNs on chaotic data?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recurrent neural networks (RNNs) are widespread machine learning tools for modeling sequential and time series data. They are notoriously hard to train because their loss gradients backpropagated in time tend to saturate or diverge during training. This is known as the exploding and vanishing gradient problem. Previous solutions to this issue either built on rather complicated, purpose-engineered architectures with gated memory buffers, or - more recently - imposed constraints that ensure convergence to a fixed point or restrict (the eigenspectrum of) the recurrence matrix. Such constraints, however, severely limit the expressivity of the RNN.
Essential intrinsic dynamics such as multistability or chaos are disabled. This is inherently at odds with the chaotic nature of many, if not most, time series encountered in nature and society. Here we offer a comprehensive theoretical treatment of this problem by relating the loss gradients during RNN training to the Lyapunov spectrum of RNN-generated orbits. We mathematically prove that RNNs producing stable equilibrium or cyclic behavior have bounded gradients, whereas the gradients of RNNs with chaotic dynamics always diverge. Based on these analyses and insights, we offer an effective yet simple training technique for chaotic data and guidance on how to choose relevant hyperparameters according to the Lyapunov spectrum. \n", "keywords": "dynamical systems;back-propagation through time;chaos;recurrent neural networks;LSTM;Lyapunov spectrum;time series", "primary_area": "", "supplementary_material": "", "author": "Zahra Monfared;Jonas Magdy Mikhaeil;Daniel Durstewitz", "authorids": "~Zahra_Monfared1;~Jonas_Magdy_Mikhaeil1;~Daniel_Durstewitz1", "gender": "F;M;", "homepage": ";;https://durstewitzlab.github.io", "dblp": ";;98/2120", "google_scholar": "https://scholar.google.pl/citations?user=OPUIwIoAAAAJ;;https://scholar.google.de/citations?user=2bcbKU0AAAAJ", "orcid": ";0000-0001-6745-7505;0000-0002-9340-3786", "linkedin": ";;", "or_profile": "~Zahra_Monfared1;~Jonas_Magdy_Mikhaeil1;~Daniel_Durstewitz1", "aff": "Heidelberg University (STRUCTURES);Heidelberg University;Heidelberg University", "aff_domain": "uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de", "position": "Postdoc;MS student;Full Professor", "bibtex": "@misc{\nmonfared2022how,\ntitle={How to train {RNN}s on chaotic data?},\nauthor={Zahra Monfared and Jonas Magdy Mikhaeil and Daniel Durstewitz},\nyear={2022},\nurl={https://openreview.net/forum?id=k32ZY1CmE0}\n}", "github": "", "project": "", "reviewers": "DWxP;rTPw;BxLc;F1yn", "site": "https://openreview.net/forum?id=k32ZY1CmE0", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;4", "correctness": "2;3;4;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "33;76;78;245", "wc_summary_review": "16;68;10;197", "wc_main_review": "168;487;177;633", "wc_review": "217;631;265;1075", "wc_reply_reviewers": "0;0;0;526", "wc_reply_authors": "931;1236;342;3045", "reply_reviewers": "0;0;0;2", "reply_authors": "2;3;1;8", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.0, 81.11411714368838 ], "wc_summary_review_avg": [ 72.75, 75.19765621347517 ], "wc_main_review_avg": [ 366.25, 200.53350717523494 ], "wc_review_avg": [ 547.0, 344.3341400442309 ], "wc_reply_reviewers_avg": [ 131.5, 227.76468119530736 ], "wc_reply_authors_avg": [ 1388.5, 1008.9247989815693 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.5, 2.692582403567252 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15811437095917049907&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Heidelberg University", "aff_unique_dep": "STRUCTURES", "aff_unique_url":
"https://www.uni-heidelberg.de", "aff_unique_abbr": "Uni HD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "k4jzOHrZ7F5", "title": "Interpreting Black-boxes Using Primitive Parameterized Functions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "One approach for interpreting black-box machine learning models is to find a global approximation of the model using simple interpretable functions, which is called a metamodel (a model of the model). Approximating the black-box with a metamodel can be used to 1) estimate instance-wise feature importance; 2) understand the functional form of the model; 3) analyze feature interactions. In this work, we propose a new method for finding interpretable metamodels. Our approach utilizes Kolmogorov superposition theorem, which expresses multivariate functions as a composition of univariate functions (our primitive parameterized functions). This composition can be represented in the form of a tree. Inspired by symbolic regression, we use a modified form of genetic programming to search over different tree configurations. Gradient descent is used to optimize the parameters of a given configuration. Using several experiments, we show that our method outperforms recent metamodeling approaches suggested for interpreting black-boxes.\n", "keywords": "Interpretability;Symbolic Metamodeling;Symbolic Regression", "primary_area": "", "supplementary_material": "/attachment/f2795f18984521542a3c4bc72493b7eb656a6c3f.zip", "author": "Mahed Abroshan;Saumitra Mishra;Mohammad Mahdi Khalili", "authorids": "~Mahed_Abroshan1;~Saumitra_Mishra1;~Mohammad_Mahdi_Khalili3", "gender": "M;M;M", "homepage": ";https://sites.google.com/site/saumitramishrac4dm/;https://Khalilimahdi.github.io", "dblp": ";208/1387;159/2163.html", "google_scholar": "tYSPRRwAAAAJ;https://scholar.google.co.uk/citations?user=On6E6ogAAAAJ;hSgnKecAAAAJ", "orcid": ";;0000-0002-4223-3254", "linkedin": "mahed-abroshan/;;mohammad-mahdi-khalili-aa4241127", "or_profile": "~Mahed_Abroshan1;~Saumitra_Mishra1;~Mohammad_Mahdi_Khalili3", "aff": "Alan Turing Institute;J.P. 
Morgan Chase;University of Delaware", "aff_domain": "turing.ac.uk;jpmorgan.com;udel.edu", "position": "Postdoc;Researcher;Assistant Professor", "bibtex": "@misc{\nabroshan2022interpreting,\ntitle={Interpreting Black-boxes Using Primitive Parameterized Functions},\nauthor={Mahed Abroshan and Saumitra Mishra and Mohammad Mahdi Khalili},\nyear={2022},\nurl={https://openreview.net/forum?id=k4jzOHrZ7F5}\n}", "github": "", "project": "", "reviewers": "ir7c;zK8e;eUVj;gBvW", "site": "https://openreview.net/forum?id=k4jzOHrZ7F5", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;4;5", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "115;121;115;107", "wc_summary_review": "31;36;46;100", "wc_main_review": "308;145;839;534", "wc_review": "454;302;1000;741", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "736;487;809;435", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 114.5, 4.9749371855331 ], "wc_summary_review_avg": [ 53.25, 27.52612395525385 ], "wc_main_review_avg": [ 456.5, 260.47888590056584 ], "wc_review_avg": [ 624.25, 268.1644784455988 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 616.75, 158.940830185324 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XDyMydQouh0J:scholar.google.com/&scioq=Interpreting+Black-boxes+Using+Primitive+Parameterized+Functions&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Alan Turing Institute;JPMorgan Chase & Co.;University of Delaware", "aff_unique_dep": ";;", "aff_unique_url": "https://www.turing.ac.uk;https://www.jpmorganchase.com;https://www.udel.edu", "aff_unique_abbr": "ATI;JPM;UD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "k6F-4Bw7LpV", "title": "Distributional Generalization: Structure Beyond Test Error", "track": "main", "status": "Reject", "tldr": "", "abstract": "Classifiers in machine learning are often reduced to single dimensional quantities, such as test error or loss. Here, we initiate a much richer study of classifiers by considering the entire joint distribution of their inputs and outputs. We present both new empirical behaviors of standard classifiers, as well as quantitative conjectures which capture these behaviors. Informally, our conjecture states: the output distribution of an interpolating classifier matches the distribution of true labels, when conditioned on certain subgroups of the input space. For example, if we mislabel 30% of dogs as cats in the train set of CIFAR-10, then a ResNet trained to interpolation will in fact mislabel roughly 30% of dogs as cats on the *test set* as well, while leaving other classes unaffected. This conjecture has implications for the theory of overparameterization, scaling limits, implicit bias, and statistical consistency. 
Further, it can be seen as a new kind of generalization, which goes beyond measuring single-dimensional quantities to measuring entire distributions.", "keywords": "generalization;empirical phenomena;overparameterization", "primary_area": "", "supplementary_material": "", "author": "Preetum Nakkiran;Yamini Bansal", "authorids": "~Preetum_Nakkiran1;~Yamini_Bansal1", "gender": ";F", "homepage": "http://preetum.nakkiran.org;", "dblp": "151/6343;", "google_scholar": "zithBbUAAAAJ;uj1OljkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Preetum_Nakkiran1;~Yamini_Bansal1", "aff": "University of California, San Diego;Harvard University", "aff_domain": "ucsd.edu;harvard.edu", "position": "Postdoc;PhD student", "bibtex": "@misc{\nnakkiran2022distributional,\ntitle={Distributional Generalization: Structure Beyond Test Error},\nauthor={Preetum Nakkiran and Yamini Bansal},\nyear={2022},\nurl={https://openreview.net/forum?id=k6F-4Bw7LpV}\n}", "github": "", "project": "", "reviewers": "bHpa;SzaL;4xds", "site": "https://openreview.net/forum?id=k6F-4Bw7LpV", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "3;3;3", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;4", "wc_summary_paper": "58;95;86", "wc_summary_review": "16;43;136", "wc_main_review": "432;122;399", "wc_review": "506;260;621", "wc_reply_reviewers": "440;113;186", "wc_reply_authors": "768;332;811", "reply_reviewers": "2;1;2", "reply_authors": "4;2;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 79.66666666666667, 15.755069730795299 ], "wc_summary_review_avg": [ 65.0, 51.40038910358559 ], "wc_main_review_avg": [ 317.6666666666667, 139.0115902442031 ], "wc_review_avg": [ 462.3333333333333, 150.5774071883148 ], "wc_reply_reviewers_avg": [ 246.33333333333334, 140.148334116234 ], "wc_reply_authors_avg": [ 637.0, 216.38083710593844 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15387201536811022092&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, San Diego;Harvard University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.harvard.edu", "aff_unique_abbr": "UCSD;Harvard", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Cross-Lingual Transfer with Class-Weighted Language-Invariant Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6898", "id": "k7-s5HSSPE5", "poster": "", "openreview": "https://openreview.net/forum?id=k7-s5HSSPE5", "slides": "https://iclr.cc/virtual/2022/poster/6898", "video": "https://iclr.cc/virtual/2022/poster/6898", "author_site": "Ruicheng Xian, Heng Ji, Han Zhao", "tldr": "", "abstract": "Recent advances in neural modeling have produced deep multilingual language models capable of extracting cross-lingual knowledge from non-parallel texts and enabling zero-shot downstream 
transfer. While their success is often attributed to shared representations, quantitative analyses are limited. Towards a better understanding, through empirical analyses, we show that the invariance of feature representations across languages\u2014an effect of shared representations\u2014strongly correlates with transfer performance. We also observe that distributional shifts in class priors between source and target language task data negatively affect performance, a largely overlooked issue that could cause negative transfer with existing unsupervised approaches. Based on these findings, we propose and evaluate a method for unsupervised transfer, called importance-weighted domain alignment (IWDA), that performs representation alignment with prior shift estimation and correction using unlabeled target language task data. Experiments demonstrate its superiority under large prior shifts, and show further performance gains when combined with existing semi-supervised learning techniques.", "keywords": "cross-lingual transfer;unsupervised cross-lingual learning;multilingual neural language model;domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Ruicheng Xian;Heng Ji;Han Zhao", "authorids": "~Ruicheng_Xian1;~Heng_Ji3;~Han_Zhao1", "gender": "M;F;M", "homepage": "https://rxian.github.io;http://blender.cs.illinois.edu/hengji.html;https://hanzhaoml.github.io/", "dblp": "243/3086.html;;03/3520-2", "google_scholar": "Nmk26z4AAAAJ;z7GCqT4AAAAJ;x942ipYAAAAJ", "orcid": ";;0000-0002-8579-1600", "linkedin": ";;", "or_profile": "~Ruicheng_Xian1;~Heng_Ji3;~Han_Zhao1", "aff": "University of Illinois Urbana-Champaign;University of Illinois, Urbana-Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;uiuc.edu;illinois.edu", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxian2022crosslingual,\ntitle={Cross-Lingual Transfer with Class-Weighted Language-Invariant Representations},\nauthor={Ruicheng Xian and Heng Ji and Han Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=k7-s5HSSPE5}\n}", "github": "", "project": "", "reviewers": "WfZj;aVzf;nGDq;gtqu", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;3;5", "correctness": "3;4;2;3", "technical_novelty": "4;3;2;2", "empirical_novelty": "4;3;2;3", "wc_summary_paper": "52;201;112;157", "wc_summary_review": "52;145;144;61", "wc_main_review": "98;282;143;823", "wc_review": "202;628;399;1041", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "521;554;165;752", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 130.5, 55.174722473248565 ], "wc_summary_review_avg": [ 100.5, 44.11632350955823 ], "wc_main_review_avg": [ 336.5, 288.95371601694274 ], "wc_review_avg": [ 567.5, 312.18784409390446 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 498.0, 211.58331692267234 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11000711128096490232&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=k7-s5HSSPE5", "email": "illinois.edu;uiuc.edu;illinois.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Illinois", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://illinois.edu", "aff_unique_abbr": "UIUC;UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning to Schedule Learning rate with Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6019", "id": "k7efTb0un9z", "poster": "", "openreview": "https://openreview.net/forum?id=k7efTb0un9z", "slides": "https://iclr.cc/virtual/2022/poster/6019", "video": "https://iclr.cc/virtual/2022/poster/6019", "author_site": "Yuanhao Xiong, Li-Cheng Lan, Xiangning Chen, Ruochen Wang, Cho-Jui Hsieh", "tldr": "", "abstract": "Recent decades have witnessed great development of stochastic optimization in training deep neural networks. Learning rate scheduling is one of the most important factors that influence the performance of stochastic optimizers like Adam. Traditional methods seek to find a relatively proper scheduling among a limited number of pre-defined rules and might not accommodate a particular target problem. Instead, we propose a novel Graph-Network-based Scheduler (GNS), aiming at learning a specific scheduling mechanism without restrictions to existing principles. By constructing a directed graph for the underlying neural network of the target problem, GNS encodes current dynamics with a graph message passing network and trains an agent to control the learning rate accordingly via reinforcement learning. The proposed scheduler can capture the intermediate layer information while being able to generalize to problems of varying scales. Besides, an efficient reward collection procedure is leveraged to speed up training. We evaluate our framework on benchmarking datasets, Fashion-MNIST and CIFAR10 for image classification, and GLUE for language understanding. GNS shows consistent improvement over popular baselines when training CNN and Transformer models. 
Moreover, GNS demonstrates great generalization to different datasets and network structures.", "keywords": "learning rate scheduling;graph neural networks", "primary_area": "", "supplementary_material": "/attachment/c7578d2e34717acfb6760abd1890f01ec87201a2.zip", "author": "Yuanhao Xiong;Li-Cheng Lan;Xiangning Chen;Ruochen Wang;Cho-Jui Hsieh", "authorids": "~Yuanhao_Xiong1;~Li-Cheng_Lan1;~Xiangning_Chen1;~Ruochen_Wang2;~Cho-Jui_Hsieh1", "gender": "M;M;M;M;M", "homepage": "https://xyh97.github.io/;https://lan-lc.github.io/;;https://ruocwang.github.io/;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": "232/1248;200/8672;56/7393;33/120;14/2770", "google_scholar": "DVKxiMkAAAAJ;https://scholar.google.com.tw/citations?view_op=list_works;vNcBx1sAAAAJ;8fXrlRAAAAAJ;Wy89g4IAAAAJ", "orcid": ";;;;", "linkedin": ";;;ruochen-wang-1699b1113/;", "or_profile": "~Yuanhao_Xiong1;~Li-Cheng_Lan1;~Xiangning_Chen1;~Ruochen_Wang2;~Cho-Jui_Hsieh1", "aff": "University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "cs.ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu;ucla.edu", "position": "PhD student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxiong2022learning,\ntitle={Learning to Schedule Learning rate with Graph Neural Networks},\nauthor={Yuanhao Xiong and Li-Cheng Lan and Xiangning Chen and Ruochen Wang and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=k7efTb0un9z}\n}", "github": "", "project": "", "reviewers": "NP67;RAQV;RKqo;TPKw;woPs", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "3;4;3;4;4", "correctness": "3;3;3;3;4", "technical_novelty": "3;2;3;2;3", "empirical_novelty": "3;3;2;2;3", "wc_summary_paper": "58;44;110;138;123", "wc_summary_review": "22;46;77;477;127", "wc_main_review": "137;143;473;125;267", "wc_review": "217;233;660;740;517", "wc_reply_reviewers": "0;38;0;38;31", "wc_reply_authors": "659;1172;1220;1688;261", "reply_reviewers": "0;1;0;1;1", "reply_authors": "1;4;3;4;2", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 94.6, 36.95186057561919 ], "wc_summary_review_avg": [ 149.8, 167.32411661204134 ], "wc_main_review_avg": [ 229.0, 132.40543795479095 ], "wc_review_avg": [ 473.4, 215.0949557753505 ], "wc_reply_reviewers_avg": [ 21.4, 17.658992043715294 ], "wc_reply_authors_avg": [ 1000.0, 492.69260193349766 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.8, 1.16619037896906 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.408248290463863, "corr_recommendation_correctness": 1.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9542962543224345540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=k7efTb0un9z", "email": "cs.ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu;ucla.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Los 
Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Self-Supervised Graph Neural Networks for Improved Electroencephalographic Seizure Analysis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7027", "id": "k9bx1EfHI_-", "poster": "", "openreview": "https://openreview.net/forum?id=k9bx1EfHI_-", "slides": "https://iclr.cc/virtual/2022/poster/7027", "video": "https://iclr.cc/virtual/2022/poster/7027", "author_site": "Siyi Tang, Jared Dunnmon, Khaled Saab, Xuan Zhang, Qianying Huang, Florian Dubost, Daniel Rubin, Christopher Lee-Messer", "tldr": "", "abstract": "Automated seizure detection and classification from electroencephalography (EEG) can greatly improve seizure diagnosis and treatment. However, several modeling challenges remain unaddressed in prior automated seizure detection and classification studies: (1) representing non-Euclidean data structure in EEGs, (2) accurately classifying rare seizure types, and (3) lacking a quantitative interpretability approach to measure model ability to localize seizures. In this study, we address these challenges by (1) representing the spatiotemporal dependencies in EEGs using a graph neural network (GNN) and proposing two EEG graph structures that capture the electrode geometry or dynamic brain connectivity, (2) proposing a self-supervised pre-training method that predicts preprocessed signals for the next time period to further improve model performance, particularly on rare seizure types, and (3) proposing a quantitative model interpretability approach to assess a model\u2019s ability to localize seizures within EEGs. When evaluating our approach on seizure detection and classification on a large public dataset (5,499 EEGs), we find that our GNN with self-supervised pre-training achieves 0.875 Area Under the Receiver Operating Characteristic Curve on seizure detection and 0.749 weighted F1-score on seizure classification, outperforming previous methods for both seizure detection and classification. Moreover, our self-supervised pre-training strategy significantly improves classification of rare seizure types (e.g. 47 points increase in combined tonic seizure accuracy over baselines). Furthermore, quantitative interpretability analysis shows that our GNN with self-supervised pre-training precisely localizes 25.4% focal seizures, a 21.9 point improvement over existing CNNs. 
Finally, by superimposing the identified seizure locations on both raw EEG signals and EEG graphs, our approach could provide clinicians with an intuitive visualization of localized seizure regions.", "keywords": "Graph neural network;Self-supervision;Interpretability;Visualization;Neuroscience;Electroencephalography;Seizure;Epilepsy;Time Series", "primary_area": "", "supplementary_material": "/attachment/a4cc6007cf7eeee223e5983202e49316eecb9ab4.zip", "author": "Siyi Tang;Jared Dunnmon;Khaled Kamal Saab;Xuan Zhang;Qianying Huang;Florian Dubost;Daniel Rubin;Christopher Lee-Messer", "authorids": "~Siyi_Tang1;~Jared_Dunnmon1;~Khaled_Kamal_Saab1;~Xuan_Zhang6;~Qianying_Huang1;~Florian_Dubost1;~Daniel_Rubin1;~Christopher_Lee-Messer1", "gender": "F;M;;;;M;;M", "homepage": "https://siyitang.me/;;https://web.stanford.edu/~ksaab/;;;http://floriandubost.com;http://rubin.web.stanford.edu;", "dblp": "184/7801;200/8265;176/4061;;;186/7845;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;W77CiNUAAAAJ;;;_yNBmx8AAAAJ;;zEAw56MAAAAJ", "orcid": "0000-0001-7504-5885;;0000-0003-1427-0469;;;;;0000-0002-2938-6184", "linkedin": "tangsiyi/;;khaled-saab-181034122/;https://www.linkedin.com/chatin/wnc/in/kaylee-xuan-zhang-55b33811b;louise-huang-49b247b3/;fdubost/;;", "or_profile": "~Siyi_Tang1;~Jared_Dunnmon1;~Khaled_Kamal_Saab1;~Xuan_Zhang6;~Qianying_Huang1;~Florian_Dubost1;~Daniel_Rubin1;~Christopher_Lee-Messer1", "aff": "Stanford University;;Stanford University;;;Liminal Sciences Inc;Stanford University;", "aff_domain": "stanford.edu;;stanford.edu;;;liminalsciences.com;stanford.edu;", "position": "PhD student;;PhD student;;;Researcher;Full Professor;", "bibtex": "@inproceedings{\ntang2022selfsupervised,\ntitle={Self-Supervised Graph Neural Networks for Improved Electroencephalographic Seizure Analysis},\nauthor={Siyi Tang and Jared Dunnmon and Khaled Kamal Saab and Xuan Zhang and Qianying Huang and Florian Dubost and Daniel Rubin and Christopher Lee-Messer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=k9bx1EfHI_-}\n}", "github": "", "project": "", "reviewers": "a7bX;6FoL;axL1", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "46;75;41", "wc_summary_review": "47;34;78", "wc_main_review": "115;248;165", "wc_review": "208;357;284", "wc_reply_reviewers": "0;0;27", "wc_reply_authors": "859;1370;540", "reply_reviewers": "0;0;1", "reply_authors": "2;3;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 54.0, 14.98888477061141 ], "wc_summary_review_avg": [ 53.0, 18.457157599876172 ], "wc_main_review_avg": [ 176.0, 54.85131417447231 ], "wc_review_avg": [ 283.0, 60.83310502240262 ], "wc_reply_reviewers_avg": [ 9.0, 12.727922061357855 ], "wc_reply_authors_avg": [ 923.0, 341.8547449819392 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12685516138349084049&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": 
"https://openreview.net/pdf?id=k9bx1EfHI_-", "email": "stanford.edu;;stanford.edu;;;liminalsciences.com;stanford.edu;", "author_num": 8, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Stanford University;Liminal Sciences Inc", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;", "aff_unique_abbr": "Stanford;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Attention-based Interpretability with Concept Transformers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5962", "id": "kAa9eDS0RdO", "poster": "", "openreview": "https://openreview.net/forum?id=kAa9eDS0RdO", "slides": "https://iclr.cc/virtual/2022/poster/5962", "video": "https://iclr.cc/virtual/2022/poster/5962", "author_site": "Mattia Rigotti, Christoph Miksovic, Ioana Giurgiu, Thomas Gschwind, Paolo Scotton", "tldr": "", "abstract": "Attention is a mechanism that has been instrumental in driving remarkable performance gains of deep neural network models in a host of visual, NLP and multimodal tasks.\nOne additional notable aspect of attention is that it conveniently exposes the ``reasoning'' behind each particular output generated by the model.\nSpecifically, attention scores over input regions or intermediate features have been interpreted as a measure of the contribution of the attended element to the model inference.\nWhile the debate in regard to the interpretability of attention is still not settled, researchers have pointed out the existence of architectures and scenarios that afford a meaningful interpretation of the attention mechanism.\n\nHere we propose the generalization of attention from low-level input features to high-level concepts as a mechanism to ensure the interpretability of attention scores within a given application domain.\nIn particular, we design the ConceptTransformer, a deep learning module that exposes explanations of the output of a model in which it is embedded in terms of attention over user-defined high-level concepts.\nSuch explanations are \\emph{plausible} (i.e.\\ convincing to the human user) and \\emph{faithful} (i.e.\\ truly reflective of the reasoning process of the model).\nPlausibility of such explanations is obtained by construction by training the attention heads to conform with known relations between inputs, concepts and outputs dictated by domain knowledge.\nFaithfulness is achieved by design by enforcing a linear relation between the transformer value vectors that represent the concepts and their contribution to the classification log-probabilities.\n\nWe validate our ConceptTransformer module on established explainability benchmarks and show how it can be used to infuse domain knowledge into classifiers to improve accuracy, and conversely to extract concept-based explanations of classification outputs. 
Code to reproduce our results is available at: \\url{https://github.com/ibm/concept_transformer}.", "keywords": "attention;transformer;concepts;interpretability", "primary_area": "", "supplementary_material": "", "author": "Mattia Rigotti;Christoph Miksovic;Ioana Giurgiu;Thomas Gschwind;Paolo Scotton", "authorids": "~Mattia_Rigotti1;~Christoph_Miksovic1;~Ioana_Giurgiu2;~Thomas_Gschwind1;psc@zurich.ibm.com", "gender": ";;;M;", "homepage": "http://www.matrig.net;;https://researcher.watson.ibm.com/researcher/view.php?person=zurich-IGI;http://www.gschwinds.net/tom/;", "dblp": "01/9816;;85/117;83/2798.html;", "google_scholar": "TmHt7CwAAAAJ;Y_WtyywAAAAJ;https://scholar.google.com/scholar?hl=en;qlGYL1QAAAAJ;", "orcid": "0000-0001-6466-2810;;0000-0001-7434-7873;;", "linkedin": ";;ioana-giurgiu-a739374/;;", "or_profile": "~Mattia_Rigotti1;~Christoph_Miksovic1;~Ioana_Giurgiu2;~Thomas_Gschwind1;psc@zurich.ibm.com", "aff": "International Business Machines;International Business Machines;International Business Machines;IBM Research;", "aff_domain": "ibm.com;ibm.com;ibm.com;zurich.ibm.com;", "position": "Researcher;Researcher;Research Scientist;Researcher;", "bibtex": "@inproceedings{\nrigotti2022attentionbased,\ntitle={Attention-based Interpretability with Concept Transformers},\nauthor={Mattia Rigotti and Christoph Miksovic and Ioana Giurgiu and Thomas Gschwind and Paolo Scotton},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kAa9eDS0RdO}\n}", "github": "", "project": "", "reviewers": "St45;wR5k;H9Z1;5kGA", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;4;4;4", "correctness": "2;4;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "83;68;43;101", "wc_summary_review": "112;14;19;68", "wc_main_review": "291;169;276;352", "wc_review": "486;251;338;521", "wc_reply_reviewers": "31;0;0;0", "wc_reply_authors": "714;667;513;279", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 73.75, 21.25294097295713 ], "wc_summary_review_avg": [ 53.25, 39.94605737741836 ], "wc_main_review_avg": [ 272.0, 65.92799102050661 ], "wc_review_avg": [ 399.0, 109.63348028772963 ], "wc_reply_reviewers_avg": [ 7.75, 13.423393758658799 ], "wc_reply_authors_avg": [ 543.25, 169.71501848687404 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.4923659639173309, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16544441967212254910&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=kAa9eDS0RdO", "email": "ibm.com;ibm.com;ibm.com;zurich.ibm.com;", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "International Business Machines Corporation;IBM", "aff_unique_dep": ";IBM Research", "aff_unique_url": "https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "kDF4Owotj5j", "title": "Thinking Deeper With 
Recurrent Networks: Logical Extrapolation Without Overthinking", "track": "main", "status": "Reject", "tldr": "", "abstract": "Classical machine learning systems perform best when they are trained and tested on the same distribution, and they lack a mechanism to increase model power after training is complete. In contrast, recent work has observed that recurrent networks can exhibit logical extrapolation; models trained only on small/simple problem instances can extend their abilities to solve large/complex instances at test time simply by performing more recurrent iterations. While preliminary results on these ``thinking systems'' are promising, existing recurrent systems, when iterated many times, often collapse rather than improve their performance. This ``overthinking'' phenomenon has prevented thinking systems from scaling to particularly large and complex problems. In this paper, we design a recall architecture that keeps an explicit copy of the problem instance in memory so that it cannot be forgotten. We also propose an incremental training routine that prevents the model from learning behaviors that are specific to iteration number and instead pushes it to learn behaviors that can be repeated indefinitely. Together, these design choices encourage models to converge to a steady state solution rather than deteriorate when many iterations are used. These innovations help to tackle the overthinking problem and boost deep thinking behavior on each of the benchmark tasks proposed by Schwarzschild et al. (2021a).", "keywords": "Deep learning;recurrent networks;thinking;extrapolation;generalization", "primary_area": "", "supplementary_material": "/attachment/78baafcd8c117f7c694424e25c365307ee8f5cf1.zip", "author": "Arpit Bansal;Avi Schwarzschild;Eitan Borgnia;Zeyad Emam;Furong Huang;Micah Goldblum;Tom Goldstein", "authorids": "~Arpit_Bansal1;~Avi_Schwarzschild1;~Eitan_Borgnia1;~Zeyad_Emam1;~Furong_Huang1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "M;M;M;F;;M;M", "homepage": "https://cs.umd.edu/~avi1;https://eitanborgnia.com;https://www.linkedin.com/in/zeyademam/;https://furong-huang.com;;https://www.cs.umd.edu/~tomg/;https://arpitbansal297.github.io/", "dblp": "249/9334.html;;220/1358;72/8513;241/7231;25/8184;190/9114", "google_scholar": "WNvQ7AcAAAAJ;;;13yyuCcAAAAJ;pGDKzuUAAAAJ;KmSuVtgAAAAJ;Pchxm4IAAAAJ", "orcid": ";;;;;;", "linkedin": ";;zeyademam/;;;;arpit-bansal-970865b1/", "or_profile": "~Avi_Schwarzschild1;~Eitan_Borgnia1;~Zeyad_Emam1;~Furong_Huang1;~Micah_Goldblum1;~Tom_Goldstein1;~Arpit_Amit_Bansal1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland Institute for Advanced Computer Studies, University of Maryland, College Park;University of Maryland;New York University;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umiacs.umd.edu;cs.umd.edu;nyu.edu;umd.edu;umd.edu", "position": "PhD student;Researcher;PhD student;Assistant Professor;Postdoc;Associate Professor;PhD student", "bibtex": "@misc{\nbansal2022thinking,\ntitle={Thinking Deeper With Recurrent Networks: Logical Extrapolation Without Overthinking},\nauthor={Arpit Bansal and Avi Schwarzschild and Eitan Borgnia and Zeyad Emam and Furong Huang and Micah Goldblum and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=kDF4Owotj5j}\n}", "github": "", "project": "", "reviewers": "ue13;ajNt;Ap3D;w7Ak", "site": "https://openreview.net/forum?id=kDF4Owotj5j", "pdf_size": 0, "recommendation": "5;5;6;8", 
"confidence": "4;4;4;5", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;1;3;4", "wc_summary_paper": "209;85;100;98", "wc_summary_review": "21;20;88;21", "wc_main_review": "383;542;627;429", "wc_review": "613;647;815;548", "wc_reply_reviewers": "131;0;788;0", "wc_reply_authors": "535;428;868;112", "reply_reviewers": "1;0;3;0", "reply_authors": "2;1;3;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 123.0, 49.98499774932475 ], "wc_summary_review_avg": [ 37.5, 29.159046623646667 ], "wc_main_review_avg": [ 495.25, 95.56771159758928 ], "wc_review_avg": [ 655.75, 98.5834037756863 ], "wc_reply_reviewers_avg": [ 229.75, 326.7126987124927 ], "wc_reply_authors_avg": [ 485.75, 269.98368006233267 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.9428090415820632, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MVdCGSHUPrwJ:scholar.google.com/&scioq=Thinking+Deeper+With+Recurrent+Networks:+Logical+Extrapolation+Without+Overthinking&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1;0;0", "aff_unique_norm": "University of Maryland;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu", "aff_unique_abbr": "UMD;NYU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kEvhVb452CC", "title": "Transformed CNNs: recasting pre-trained convolutional layers with self-attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Vision Transformers (ViT) have recently emerged as a powerful alternative to convolutional networks (CNNs). Although hybrid models attempt to bridge the gap between these two architectures, the self-attention layers they rely on induce a strong computational bottleneck, especially at large spatial resolutions. In this work, we explore the idea of reducing the time spent training these layers by initializing them from pre-trained convolutional layers. This enables us to transition smoothly from any pre-trained CNN to its functionally identical hybrid model, called Transformed CNN (T-CNN). With only 50 epochs of fine-tuning, the resulting T-CNNs demonstrate significant performance gains over the CNN as well as substantially improved robustness. We analyze the representations learnt by theT-CNN, providing deeper insights into the fruitful interplay between convolutions and self-attention.", "keywords": "convolutional networks;transformers;hybrid;fine-tuning", "primary_area": "", "supplementary_material": "/attachment/e0c5d593bda72644b417434ad005a7699d935c8f.zip", "author": "St\u00e9phane d'Ascoli;Levent Sagun;Giulio Biroli;Ari S. 
Morcos", "authorids": "~St\u00e9phane_d'Ascoli1;~Levent_Sagun1;~Giulio_Biroli1;~Ari_S._Morcos1", "gender": "M;Non-Binary;;M", "homepage": "https://sdascoli.github.io/;http://cims.nyu.edu/~sagun/;https://www.lpens.ens.psl.eu/giulio-biroli/;http://www.arimorcos.com", "dblp": "227/3250;155/9866;18/5547;217/3720", "google_scholar": "2GcqQgYAAAAJ;-iPZaBcAAAAJ;https://scholar.google.fr/citations?user=BadZJUsAAAAJ;v-A_7UsAAAAJ", "orcid": "0000-0002-3131-3371;0000-0001-5403-4124;;", "linkedin": "st%C3%A9phane-d-ascoli-182642130/;;;", "or_profile": "~St\u00e9phane_d'Ascoli1;~Levent_Sagun1;~Giulio_Biroli1;~Ari_Morcos1", "aff": "Ecole Normale Sup\u00e9rieure;Meta;Ecole Normale Superieure;Meta AI (FAIR)", "aff_domain": "ens.fr;meta.com;ens.fr;meta.com", "position": "PhD student;Research scientist;Full Professor;Research Scientist", "bibtex": "@misc{\nd'ascoli2022transformed,\ntitle={Transformed {CNN}s: recasting pre-trained convolutional layers with self-attention},\nauthor={St{\\'e}phane d'Ascoli and Levent Sagun and Giulio Biroli and Ari S. Morcos},\nyear={2022},\nurl={https://openreview.net/forum?id=kEvhVb452CC}\n}", "github": "", "project": "", "reviewers": "ojmG;zV42;ZjBY;Q4Pp", "site": "https://openreview.net/forum?id=kEvhVb452CC", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "2;4;3;4", "correctness": "3;4;3;1", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "37;49;102;67", "wc_summary_review": "4;24;55;62", "wc_main_review": "112;129;379;151", "wc_review": "153;202;536;280", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 63.75, 24.529319191530774 ], "wc_summary_review_avg": [ 36.25, 23.47738273317535 ], "wc_main_review_avg": [ 192.75, 108.41673071994009 ], "wc_review_avg": [ 292.75, 147.56248676408242 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2071995496889501254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;Meta;Ecole Normale Superieure", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.ens.fr;https://meta.com;https://www.ens.fr", "aff_unique_abbr": "ENS;Meta;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "France;United States" }, { "title": "Information Bottleneck: Exact Analysis of (Quantized) Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6830", "id": "kF9DZQQrU0w", "poster": "", "openreview": "https://openreview.net/forum?id=kF9DZQQrU0w", "slides": "https://iclr.cc/virtual/2022/poster/6830", "video": "https://iclr.cc/virtual/2022/poster/6830", "author_site": "Stephan Lorenzen, Christian Igel, Mads Nielsen", "tldr": "", "abstract": "The information bottleneck (IB) principle has been suggested as a way to analyze deep neural networks. 
The learning dynamics are studied by inspecting the mutual information (MI) between the hidden layers and the input and output. Notably, separate fitting and compression phases during training have been reported. This led to some controversy, including claims that the observations are not reproducible and strongly dependent on the type of activation function used as well as on the way the MI is estimated. Our study confirms that different ways of binning when computing the MI lead to qualitatively different results, either supporting or refuting IB conjectures.\nTo resolve the controversy, we study the IB principle in settings where MI is non-trivial and can be computed exactly. We monitor the dynamics of quantized neural networks, that is, we discretize the whole deep learning system so that no approximation is required when computing the MI. This allows us to quantify the information flow without measurement errors. \nIn this setting, we observed a fitting phase for all layers and a compression phase for the output layer in all experiments; the compression in the hidden layers was dependent on the type of activation function. Our study shows that the initial IB results were not artifacts of binning when computing the MI. However, the critical claim that the compression phase may not be observed for some networks also holds true.", "keywords": "information bottleneck;quantization;neural network", "primary_area": "", "supplementary_material": "/attachment/bfe7acd343967f1901729b4096b52d95c46e1ca9.zip", "author": "Stephan Sloth Lorenzen;Christian Igel;Mads Nielsen", "authorids": "~Stephan_Sloth_Lorenzen1;~Christian_Igel1;~Mads_Nielsen2", "gender": "M;M;M", "homepage": ";https://christian-igel.github.io/;https://di.ku.dk/Ansatte/?pure=da/persons/137906", "dblp": ";38/6146;", "google_scholar": ";https://scholar.google.dk/citations?user=d-jF4zIAAAAJ;https://scholar.google.dk/citations?user=2QCJXEkAAAAJ", "orcid": "0000-0001-6701-3752;0000-0003-2868-0856;0000-0003-1535-068X", "linkedin": "stephanlorenzen/;christianigel/;", "or_profile": "~Stephan_Sloth_Lorenzen1;~Christian_Igel1;~Mads_Nielsen1", "aff": "University of Copenhagen;University of Copenhagen;University of Copenhagen", "aff_domain": "ku.dk;ku.dk;diku.dk", "position": "Postdoc;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlorenzen2022information,\ntitle={Information Bottleneck: Exact Analysis of (Quantized) Neural Networks},\nauthor={Stephan Sloth Lorenzen and Christian Igel and Mads Nielsen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kF9DZQQrU0w}\n}", "github": "", "project": "", "reviewers": "KkRc;vYX6;bFPZ", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;5;3", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;4", "wc_summary_paper": "498;35;115", "wc_summary_review": "214;14;35", "wc_main_review": "913;598;156", "wc_review": "1625;647;306", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 216.0, 202.06104688105194 ], "wc_summary_review_avg": [ 87.66666666666667, 89.74160437364353 ], "wc_main_review_avg": [ 555.6666666666666, 310.4902932818065 ],
"wc_review_avg": [ 859.3333333333334, 559.0195782697498 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14219492799643625897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=kF9DZQQrU0w", "email": "ku.dk;ku.dk;diku.dk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Denmark" }, { "title": "Visual Representation Learning over Latent Domains", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5897", "id": "kG0AtPi6JI1", "poster": "", "openreview": "https://openreview.net/forum?id=kG0AtPi6JI1", "slides": "https://iclr.cc/virtual/2022/poster/5897", "video": "https://iclr.cc/virtual/2022/poster/5897", "author_site": "Lucas Deecke, Timothy Hospedales, Hakan Bilen", "tldr": "", "abstract": "A fundamental shortcoming of deep neural networks is their specialization to a single task and domain. While multi-domain learning enables the learning of compact models that span multiple visual domains, these rely on the presence of domain labels, in turn requiring laborious curation of datasets. This paper proposes a less explored, but highly realistic new setting called latent domain learning: learning over data from different domains, without access to domain annotations. Experiments show that this setting is challenging for standard models and existing multi-domain approaches, calling for new customized solutions: a sparse adaptation strategy is formulated which enhances performance by accounting for latent domains in data. Our method can be paired seamlessly with existing models, and benefits conceptually related tasks, e.g. 
empirical fairness problems and long-tailed recognition.", "keywords": "transfer learning;latent domains;computer vision", "primary_area": "", "supplementary_material": "", "author": "Lucas Deecke;Timothy Hospedales;Hakan Bilen", "authorids": "~Lucas_Deecke1;~Timothy_Hospedales1;~Hakan_Bilen1", "gender": "M;M;M", "homepage": ";http://homepages.inf.ed.ac.uk/thospeda/;http://homepages.inf.ed.ac.uk/hbilen/", "dblp": "222/9834;32/3545;97/2993", "google_scholar": "6-x0_AsAAAAJ;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ;PtBtfawAAAAJ", "orcid": ";0000-0003-4867-7486;0000-0002-6947-6918", "linkedin": ";timothyhospedales/;", "or_profile": "~Lucas_Deecke1;~Timothy_Hospedales1;~Hakan_Bilen1", "aff": "Twitter;Samsung AI Research Centre;University of Edinburgh", "aff_domain": "twitter.com;samsung.com;ed.ac.uk", "position": "Researcher;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\ndeecke2022visual,\ntitle={Visual Representation Learning over Latent Domains},\nauthor={Lucas Deecke and Timothy Hospedales and Hakan Bilen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kG0AtPi6JI1}\n}", "github": "", "project": "", "reviewers": "Yiib;LPa6;cM2p;dZE1", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "2;2;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "151;96;29;28", "wc_summary_review": "60;43;12;62", "wc_main_review": "422;483;129;207", "wc_review": "633;622;170;297", "wc_reply_reviewers": "0;19;24;0", "wc_reply_authors": "787;660;124;546", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 76.0, 51.327380607235355 ], "wc_summary_review_avg": [ 44.25, 20.029665498954294 ], "wc_main_review_avg": [ 310.25, 146.49466713843205 ], "wc_review_avg": [ 430.5, 202.08970780324267 ], "wc_reply_reviewers_avg": [ 10.75, 10.894379284750462 ], "wc_reply_authors_avg": [ 529.25, 249.0174441680743 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=75815756890514813&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=kG0AtPi6JI1", "email": "twitter.com;samsung.com;ed.ac.uk", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Twitter, Inc.;Samsung;University of Edinburgh", "aff_unique_dep": ";AI Research;", "aff_unique_url": "https://twitter.com;https://www.samsung.com/global/researchers/samsung-ai-research-centre/;https://www.ed.ac.uk", "aff_unique_abbr": "Twitter;SARC;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;South Korea;United Kingdom" }, { "id": "kHNKTO2sYH", "title": "Repairing Systematic Outliers by Learning Clean Subspaces in VAEs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data cleaning often comprises outlier detection and data repair. Systematic errors result from nearly deterministic transformations that occur repeatedly in the data, e.g. specific image pixels being set to default values or watermarks. Consequently, models with enough capacity easily overfit to these errors, making detection and repair difficult. Seeing as a systematic outlier is a combination of patterns of a clean instance and systematic error patterns, our main insight is that inliers can be modelled by a smaller representation (subspace) in a model than outliers. By exploiting this, we propose \\emph{Clean Subspace Variational Autoencoder (CLSVAE)}, a novel semi-supervised model for detection and automated repair of systematic errors. The main idea is to partition the latent space and model inlier and outlier patterns separately. CLSVAE is effective with much less labelled data compared to previous related models, often with less than 2\\% of the data. We provide experiments using three image datasets in scenarios with different levels of corruption and labelled set sizes, comparing to relevant baselines. CLSVAE provides superior repairs without human intervention, e.g. with just 0.25\\% of labelled data we see a relative error decrease of 58\\% compared to the closest baseline.", "keywords": "variational autoencoder;deep generative models;outlier detection;data repair", "primary_area": "", "supplementary_material": "", "author": "Simao Eduardo;Kai Xu;Alfredo Nazabal;Charles Sutton", "authorids": "~Simao_Eduardo3;~Kai_Xu4;~Alfredo_Nazabal1;~Charles_Sutton1", "gender": "M;M;;M", "homepage": ";https://xuk.ai;;http://homepages.inf.ed.ac.uk/csutton/", "dblp": ";;;59/5879", "google_scholar": ";https://scholar.google.ca/citations?user=kf3C60wAAAAJ;https://scholar.google.es/citations?user=IanHvT4AAAAJ;https://scholar.google.co.uk/citations?user=hYtGXD0AAAAJ", "orcid": ";;;0000-0002-0041-3820", "linkedin": ";;;charles-sutton-772aa126", "or_profile": "~Simao_Eduardo3;~Kai_Xu4;~Alfredo_Nazabal1;~Charles_Sutton1", "aff": "University of Edinburgh;Hazy;Alan Turing Institute;University of Edinburgh", "aff_domain": "ed.ac.uk;hazy.com;turing.ac.uk;ed.ac.uk", "position": "PhD student;Researcher;Postdoc;Professor", "bibtex": "@misc{\neduardo2022repairing,\ntitle={Repairing Systematic Outliers by Learning Clean Subspaces in {VAE}s},\nauthor={Simao Eduardo and Kai Xu and Alfredo Nazabal and Charles Sutton},\nyear={2022},\nurl={https://openreview.net/forum?id=kHNKTO2sYH}\n}", "github": "", "project": "", "reviewers": "5JVH;7fHt;Hvcm", "site": "https://openreview.net/forum?id=kHNKTO2sYH", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "2;4;3", "correctness": "4;2;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "124;86;158", "wc_summary_review": "26;36;107", "wc_main_review": "313;251;87", "wc_review": "463;373;352", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "580;1185;1043", "reply_reviewers": "0;0;0", "reply_authors": "1;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 122.66666666666667, 29.408993333483707 ], "wc_summary_review_avg": [ 56.333333333333336, 36.05859429071275 ], "wc_main_review_avg": [ 217.0, 95.34498763263157 ], "wc_review_avg": [ 396.0, 48.14561246884289 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 936.0, 258.3189243293388 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ],
"authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5160037889430102086&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Edinburgh;Hazy;Alan Turing Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ed.ac.uk;;https://www.turing.ac.uk", "aff_unique_abbr": "Edinburgh;;ATI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom;" }, { "id": "kHkWgqOysk_", "title": "On Pseudo-Labeling for Class-Mismatch Semi-Supervised Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semi-Supervised Learning (SSL) methods have shown superior performance when unlabeled data are drawn from the same distribution with labeled data. Among them, Pseudo-Labeling (PL) is a simple and widely used method that creates pseudo-labels for unlabeled data according to predictions of the training model itself. However, when there are unlabeled Out-Of-Distribution (OOD) data from other classes, these methods suffer from severe performance degradation and even get worse than merely training on labeled data. In this paper, we empirically analyze PL in class-mismatched SSL. We aim to answer the following questions: (1) How do OOD data influence PL? (2) What are the better pseudo-labels for OOD data? First, we show that the major problem of PL is imbalanced pseudo-labels on OOD data. Second, we find that when labeled as their ground truths, OOD data are beneficial to classification performance on In-Distribution (ID) data. Based on the findings, we propose our model which consists of two components -- Re-balanced Pseudo-Labeling (RPL) and Semantic Exploration Clustering (SEC). RPL re-balances pseudo-labels on ID classes to filter out OOD data while also addressing the imbalance problem. SEC uses balanced clustering on OOD data to create pseudo-labels on extra classes, simulating the process of training with their ground truths. 
Experiments show that our method achieves steady improvement over supervised baseline and state-of-the-art performance under all class mismatch ratios on different benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lu Han;Han-Jia Ye;De-Chuan Zhan", "authorids": "~Lu_Han2;~Han-Jia_Ye1;~De-Chuan_Zhan1", "gender": "M;M;M", "homepage": "http://www.lamda.nju.edu.cn/hanlu/;http://www.lamda.nju.edu.cn/yehj;http://www.lamda.nju.edu.cn/zhandc/", "dblp": ";165/3014;74/498", "google_scholar": "https://scholar.google.com.hk/citations?user=m-WYn7gAAAAJ;mgOYhtoAAAAJ;mYJf4TcAAAAJ", "orcid": ";;0000-0002-3533-2078", "linkedin": ";;", "or_profile": "~Lu_Han2;~Han-Jia_Ye1;~De-Chuan_Zhan1", "aff": "Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "MS student;Associate Researcher;Full Professor", "bibtex": "@misc{\nhan2022on,\ntitle={On Pseudo-Labeling for Class-Mismatch Semi-Supervised Learning},\nauthor={Lu Han and Han-Jia Ye and De-Chuan Zhan},\nyear={2022},\nurl={https://openreview.net/forum?id=kHkWgqOysk_}\n}", "github": "", "project": "", "reviewers": "ayVg;kZv3;xUsz;A13X", "site": "https://openreview.net/forum?id=kHkWgqOysk_", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "5;3;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;4;3;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "65;321;91;57", "wc_summary_review": "242;30;42;96", "wc_main_review": "75;328;367;183", "wc_review": "382;679;500;336", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1053;730;1457;873", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;3", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 133.5, 108.98050284339855 ], "wc_summary_review_avg": [ 102.5, 84.28967908350346 ], "wc_main_review_avg": [ 238.25, 116.54907764542799 ], "wc_review_avg": [ 474.25, 132.48466892437025 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1028.25, 272.7153965217219 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16959117824262416139&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "kK3DlGuusi", "title": "Quantized sparse PCA for neural network weight compression", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we introduce a novel method of weight compression. In our method, we store weight tensors as sparse, quantized matrix factors, whose product is computed on the fly during inference to generate the target model's weight tensors. The underlying matrix factorization problem can be considered as a quantized sparse PCA problem and solved through iterative projected gradient descent methods. 
Seen as a unification of weight SVD, vector quantization and sparse PCA, our method achieves or is on par with state-of-the-art trade-offs between accuracy and model size. Our method is applicable both to the moderate compression regime, unlike vector quantization, and to the extreme compression regime.", "keywords": "Model Compression;neural network quantization;sparse principal component analysis;vector quantization", "primary_area": "", "supplementary_material": "", "author": "Andrey Kuzmin;Mart Van Baalen;Markus Nagel;Arash Behboodi", "authorids": "~Andrey_Kuzmin1;~Mart_Van_Baalen1;~Markus_Nagel1;~Arash_Behboodi1", "gender": ";M;M;M", "homepage": "https://www.qualcomm.com/research/artificial-intelligence/ai-research;;;https://arashbehboodi.github.io/", "dblp": ";;38/1463;97/7718", "google_scholar": ";a-Au4JUAAAAJ;akNuBBEAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Andrey_Kuzmin1;~Mart_Van_Baalen1;~Markus_Nagel1;~Arash_Behboodi1", "aff": "Qualcomm Inc, QualComm;QualComm;Qualcomm AI Research;QualComm", "aff_domain": "qti.qualcomm.com;qualcomm.com;qualcomm.com;qualcomm.com", "position": "Senior machine learning researcher;Researcher;Researcher;Machine Learning Researcher", "bibtex": "@misc{\nkuzmin2022quantized,\ntitle={Quantized sparse {PCA} for neural network weight compression},\nauthor={Andrey Kuzmin and Mart Van Baalen and Markus Nagel and Arash Behboodi},\nyear={2022},\nurl={https://openreview.net/forum?id=kK3DlGuusi}\n}", "github": "", "project": "", "reviewers": "rRp9;zn4a;iyVU", "site": "https://openreview.net/forum?id=kK3DlGuusi", "pdf_size": 0, "recommendation": "1;5;8", "confidence": "5;3;3", "correctness": "2;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "30;71;77", "wc_summary_review": "35;56;54", "wc_main_review": "402;284;483", "wc_review": "467;411;614", "wc_reply_reviewers": "235;83;132", "wc_reply_authors": "605;894;627", "reply_reviewers": "1;1;2", "reply_authors": "2;2;2", "recommendation_avg": [ 4.666666666666667, 2.8674417556808756 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 59.333333333333336, 20.885933597094056 ], "wc_summary_review_avg": [ 48.333333333333336, 9.463379711052259 ], "wc_main_review_avg": [ 389.6666666666667, 81.7081527280003 ], "wc_review_avg": [ 497.3333333333333, 85.605036196606 ], "wc_reply_reviewers_avg": [ 150.0, 63.34561284466878 ], "wc_reply_authors_avg": [ 708.6666666666666, 131.3578657290415 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9041944301794652, "corr_recommendation_correctness": 0.996615895540124, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:smAY4IazWLMJ:scholar.google.com/&scioq=Quantized+sparse+PCA+for+neural+network+weight+compression&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Qualcomm Incorporated;Qualcomm", "aff_unique_dep": ";Qualcomm AI Research", "aff_unique_url": "https://www.qualcomm.com;https://www.qualcomm.com/research", "aff_unique_abbr": "Qualcomm;QAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Anytime Dense Prediction with
Confidence Adaptivity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6376", "id": "kNKFOXleuC", "poster": "", "openreview": "https://openreview.net/forum?id=kNKFOXleuC", "slides": "https://iclr.cc/virtual/2022/poster/6376", "video": "https://iclr.cc/virtual/2022/poster/6376", "author_site": "Zhuang Liu, Zhiqiu Xu, Hung-Ju Wang, trevor darrell, Evan Shelhamer", "tldr": "", "abstract": "Anytime inference requires a model to make a progression of predictions which might be halted at any time. Prior research on anytime visual recognition has mostly focused on image classification.We propose the first unified and end-to-end approach for anytime dense prediction. A cascade of \"exits\" is attached to the model to make multiple predictions. We redesign the exits to account for the depth and spatial resolution of the features for each exit. To reduce total computation, and make full use of prior predictions, we develop a novel spatially adaptive approach to avoid further computation on regions where early predictions are already sufficiently confident. Our full method, named anytime dense prediction with confidence (ADP-C), achieves the same level of final accuracy, and meanwhile significantly reduces total computation. We evaluate our method on Cityscapes semantic segmentation and MPII human pose estimation: ADP-C enables anytime inference without sacrificing accuracy while also reducing the total FLOPs of its base models by 44.4% and 59.1%. We compare with anytime inference by deep equilibrium networks and feature-based stochastic sampling, showing that ADP-C dominates both across the accuracy-computation curve. Our code is available at https://github.com/liuzhuang13/anytime.", "keywords": "Efficient Inference;Anytime Inference;Semantic Segmentation;Dense Prediction;Computer Vision", "primary_area": "", "supplementary_material": "/attachment/be17d20312589464e5ee7f5d3779aab2282cf1f4.zip", "author": "Zhuang Liu;Zhiqiu Xu;Hung-Ju Wang;Trevor Darrell;Evan Shelhamer", "authorids": "~Zhuang_Liu1;~Zhiqiu_Xu1;~Hung-Ju_Wang1;~Trevor_Darrell2;~Evan_Shelhamer2", "gender": "M;M;M;M;M", "homepage": "https://oscarxzq.github.io;;http://imaginarynumber.net;https://people.eecs.berkeley.edu/~trevor/;https://liuzhuang13.github.io/", "dblp": ";;150/6541;d/TrevorDarrell;56/11346-3", "google_scholar": "https://scholar.google.com/citations?hl=en;;-ltRSM0AAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ;7OTD-LEAAAAJ", "orcid": ";;;;", "linkedin": "oscar-xu-1250821a1/;hungju-wang-5a5124172/;;;zhuang-liu-19306b1b1/", "or_profile": "~Zhiqiu_Xu1;~Hung-Ju_Wang1;~Evan_G_Shelhamer1;~trevor_darrell1;~Zhuang_Liu2", "aff": "University of California, Berkeley;;Google DeepMind;Electrical Engineering & Computer Science Department;University of California, Berkeley", "aff_domain": "berkeley.edu;;deepmind.com;eecs.berkeley.edu;berkeley.edu", "position": "Undergrad student;;Research Scientist;Professor;PhD student", "bibtex": "@inproceedings{\nliu2022anytime,\ntitle={Anytime Dense Prediction with Confidence Adaptivity},\nauthor={Zhuang Liu and Zhiqiu Xu and Hung-Ju Wang and Trevor Darrell and Evan Shelhamer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kNKFOXleuC}\n}", "github": "", "project": "", "reviewers": "2wqr;ohPA;GYrC;hS3f", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": 
"103;261;77;111", "wc_summary_review": "51;63;40;35", "wc_main_review": "477;572;246;647", "wc_review": "631;896;363;793", "wc_reply_reviewers": "0;0;23;266", "wc_reply_authors": "1517;1749;879;829", "reply_reviewers": "0;0;1;1", "reply_authors": "3;3;3;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 138.0, 72.11795892841117 ], "wc_summary_review_avg": [ 47.25, 10.779030568655049 ], "wc_main_review_avg": [ 485.5, 150.82854504370187 ], "wc_review_avg": [ 670.75, 201.22919147082015 ], "wc_reply_reviewers_avg": [ 72.25, 112.2550110240073 ], "wc_reply_authors_avg": [ 1243.5, 398.43537744532676 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14058160425117298434&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=kNKFOXleuC", "email": "berkeley.edu;;deepmind.com;eecs.berkeley.edu;berkeley.edu", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, Berkeley;Google;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";Google DeepMind;Electrical Engineering & Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://deepmind.com;", "aff_unique_abbr": "UC Berkeley;DeepMind;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;United Kingdom;" }, { "id": "kO-wQWwqnO", "title": "L2BGAN: An image enhancement model for image quality improvement and image analysis tasks without paired supervision", "track": "main", "status": "Reject", "tldr": "", "abstract": "The paper presents an image enhancement model,\nL2BGAN, to translate low light images to bright images\nwithout a paired supervision. We introduce the use of geo-\nmetric and lighting consistency along with a contextual loss\ncriterion. These when combined with multiscale color, tex-\nture and edge discriminators prove to provide competitive\nresults. We perform extensive experiments on benchmark\ndatasets to compare our results visually as well as objec-\ntively. We observe the performance of L2BGAN on real time\ndriving datasets which are subject to motion blur, noise and\nother artifacts. 
We further demonstrate the application of\nimage understanding tasks on our enhanced images using\nDarkFace and ExDark datasets.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/07e3dc5f4aa79e1c365a7a56f2c3561144122df5.zip", "author": "Jhilik Bhattacharya;Gianni Ramponi;Leonardo Gregorat;Shatrughan Modi", "authorids": "~Jhilik_Bhattacharya1;ramponi@units.it;~Leonardo_Gregorat1;~Shatrughan_Modi1", "gender": "F;;;M", "homepage": "https://sites.google.com/thapar.edu/jhilikbhattacharya/home;;;", "dblp": ";;;", "google_scholar": ";;;mLJezBgAAAAJ", "orcid": ";;;", "linkedin": ";;leonardogregorat/;", "or_profile": "~Jhilik_Bhattacharya1;ramponi@units.it;~Leonardo_Gregorat1;~Shatrughan_Modi1", "aff": "TIET;;;Thapar Institute of Engineering & Technology, Patiala", "aff_domain": "thapar.edu;;;thapar.edu", "position": "Associate Professor;;;Assistant Professor", "bibtex": "@misc{\nbhattacharya2022lbgan,\ntitle={L2{BGAN}: An image enhancement model for image quality improvement and image analysis tasks without paired supervision},\nauthor={Jhilik Bhattacharya and Gianni Ramponi and Leonardo Gregorat and Shatrughan Modi},\nyear={2022},\nurl={https://openreview.net/forum?id=kO-wQWwqnO}\n}", "github": "", "project": "", "reviewers": "1V61;efio;Fu5c;Et8P", "site": "https://openreview.net/forum?id=kO-wQWwqnO", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;3", "correctness": "3;2;3;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "20;42;65;42", "wc_summary_review": "22;18;50;10", "wc_main_review": "164;303;238;111", "wc_review": "206;363;353;163", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 42.25, 15.911866640969563 ], "wc_summary_review_avg": [ 25.0, 15.066519173319364 ], "wc_main_review_avg": [ 204.0, 72.81140020628638 ], "wc_review_avg": [ 271.25, 88.14299461670224 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZYVNACk15W0J:scholar.google.com/&scioq=L2BGAN:+An+image+enhancement+model+for+image+quality+improvement+and+image+analysis+tasks+without+paired+supervision&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Thapar Institute of Engineering and Technology;Thapar Institute of Engineering & Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tiet.ac.in;https://www.thapar.edu", "aff_unique_abbr": "TIET;Thapar", "aff_campus_unique_index": "1", "aff_campus_unique": ";Patiala", "aff_country_unique_index": "0;0", "aff_country_unique": "India" }, { "id": "kOtkgUGAVTX", "title": "CIC: Contrastive Intrinsic Control for Unsupervised Skill Discovery", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Contrastive Intrinsic Control (CIC) - an algorithm for unsupervised skill discovery that maximizes the mutual information between skills and state transitions. 
In contrast to most prior approaches, CIC uses a decomposition of the mutual information that explicitly incentivizes diverse behaviors by maximizing state entropy. We derive a novel lower bound estimate for the mutual information which combines a particle estimator for state entropy to generate diverse behaviors and contrastive learning to distill these behaviors into distinct skills. We evaluate our algorithm on the Unsupervised Reinforcement Learning Benchmark, which consists of a long reward-free pre-training phase followed by a short adaptation phase to downstream tasks with extrinsic rewards. We find that CIC improves on prior unsupervised skill discovery methods by $91\\%$ and the next-leading overall exploration algorithm by $26\\%$ in terms of downstream task performance.\n", "keywords": "unsupervised learning;reinforcement learning;exploration", "primary_area": "", "supplementary_material": "/attachment/d2b9c08282e195695a0d02b0697d31da2887d885.zip", "author": "Michael Laskin;Hao Liu;Xue Bin Peng;Denis Yarats;Aravind Rajeswaran;Pieter Abbeel", "authorids": "~Michael_Laskin1;~Hao_Liu1;~Xue_Bin_Peng1;~Denis_Yarats1;~Aravind_Rajeswaran1;~Pieter_Abbeel2", "gender": "M;M;M;M;M;M", "homepage": "http://mishalaskin.com;https://xbpeng.github.io;http://denis-yarats.info/;http://aravindr93.github.io/;https://people.eecs.berkeley.edu/~pabbeel/;https://haoliu.ai", "dblp": ";;200/8142;164/5778;;09/3214-55", "google_scholar": "DOGDnwsAAAAJ;https://scholar.google.ca/citations?user=FwxfQosAAAAJ;7kaXqgMAAAAJ;_EJrRVAAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;wtK4Yh4AAAAJ", "orcid": ";;;;;", "linkedin": "mishalaskin;;;;;", "or_profile": "~Michael_Laskin1;~Xue_Bin_Peng1;~Denis_Yarats1;~Aravind_Rajeswaran1;~Pieter_Abbeel2;~Hao_Liu10", "aff": "Google DeepMind;Simon Fraser University;New York University;Meta Facebook;Covariant;University of California, Berkeley", "aff_domain": "deepmind.com;sfu.ca;cs.nyu.edu;meta.com;covariant.ai;berkeley.edu", "position": "Researcher;Assistant Professor;PhD student;Research Scientist;Founder;PhD student", "bibtex": "@misc{\nlaskin2022cic,\ntitle={{CIC}: Contrastive Intrinsic Control for Unsupervised Skill Discovery},\nauthor={Michael Laskin and Hao Liu and Xue Bin Peng and Denis Yarats and Aravind Rajeswaran and Pieter Abbeel},\nyear={2022},\nurl={https://openreview.net/forum?id=kOtkgUGAVTX}\n}", "github": "", "project": "", "reviewers": "tKs2;VTZ1;cP9K;h3mC", "site": "https://openreview.net/forum?id=kOtkgUGAVTX", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "3;4;4;3", "correctness": "2;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "70;90;92;256", "wc_summary_review": "37;182;59;28", "wc_main_review": "285;674;652;206", "wc_review": "392;946;803;490", "wc_reply_reviewers": "366;306;21;0", "wc_reply_authors": "2383;2739;1050;790", "reply_reviewers": "1;1;1;0", "reply_authors": "4;4;2;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 127.0, 74.97332859090625 ], "wc_summary_review_avg": [ 76.5, 61.945540598173814 ], "wc_main_review_avg": [ 454.25, 210.75385524350438 ], "wc_review_avg": [ 657.75, 225.24916759002684 ], "wc_reply_reviewers_avg": [ 173.25, 164.2945145158535 ], "wc_reply_authors_avg": [ 1740.5, 835.171988275469 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 
], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3665083330689157, "corr_recommendation_correctness": 0.5183210553488161, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3577114145291477646&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "Google;Simon Fraser University;New York University;Meta;Covariant;University of California, Berkeley", "aff_unique_dep": "Google DeepMind;;;Meta Platforms, Inc.;;", "aff_unique_url": "https://deepmind.com;https://www.sfu.ca;https://www.nyu.edu;https://meta.com;;https://www.berkeley.edu", "aff_unique_abbr": "DeepMind;SFU;NYU;Meta;;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;2;2;2", "aff_country_unique": "United Kingdom;Canada;United States;" }, { "title": "Filling the G_ap_s: Multivariate Time Series Imputation by Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5891", "id": "kOu3-S3wJ7", "poster": "", "openreview": "https://openreview.net/forum?id=kOu3-S3wJ7", "slides": "https://iclr.cc/virtual/2022/poster/5891", "video": "https://iclr.cc/virtual/2022/poster/5891", "author_site": "Andrea Cini, Ivan Marisca, Cesare Alippi", "tldr": "", "abstract": "Dealing with missing values and incomplete time series is a labor-intensive, tedious, inevitable task when handling data coming from real-world applications. Effective spatio-temporal representations would allow imputation methods to reconstruct missing temporal data by exploiting information coming from sensors at different locations. However, standard methods fall short in capturing the nonlinear time and space dependencies existing within networks of interconnected sensors and do not take full advantage of the available - and often strong - relational information. Notably, most state-of-the-art imputation methods based on deep learning do not explicitly model relational aspects and, in any case, do not exploit processing frameworks able to adequately represent structured spatio-temporal data. Conversely, graph neural networks have recently surged in popularity as both expressive and scalable tools for processing sequential data with relational inductive biases. In this work, we present the first assessment of graph neural networks in the context of multivariate time series imputation. In particular, we introduce a novel graph neural network architecture, named GRIN, which aims at reconstructing missing data in the different channels of a multivariate time series by learning spatio-temporal representations through message passing. 
Empirical results show that our model outperforms state-of-the-art methods in the imputation task on relevant real-world benchmarks with mean absolute error improvements often higher than 20%.", "keywords": "graph neural networks;missing data;time series analysis;time series imputation", "primary_area": "", "supplementary_material": "/attachment/b37815dd4f2fc6137bdb944c378b7d0d97268813.zip", "author": "Andrea Cini;Ivan Marisca;Cesare Alippi", "authorids": "~Andrea_Cini1;~Ivan_Marisca1;~Cesare_Alippi1", "gender": "M;M;M", "homepage": "https://andreacini.github.io/;https://marshka.github.io/;https://alippi.faculty.polimi.it/", "dblp": "249/8223;298/8039;84/6337", "google_scholar": "bQI2UIUAAAAJ;loKgz80AAAAJ;SCZObbIAAAAJ", "orcid": ";0000-0002-9713-1626;", "linkedin": ";ivanmarisca;", "or_profile": "~Andrea_Cini1;~Ivan_Marisca1;~Cesare_Alippi1", "aff": "Universita della Svizzera Italiana;Universit\u00e0 della Svizzera Italiana;Politecnico di Milano", "aff_domain": "usi.ch;usi.ch;polimi.it", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\ncini2022filling,\ntitle={Filling the G\\_ap\\_s: Multivariate Time Series Imputation by Graph Neural Networks},\nauthor={Andrea Cini and Ivan Marisca and Cesare Alippi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kOu3-S3wJ7}\n}", "github": "", "project": "", "reviewers": "9yUs;jZvK;JPtB;hJvE", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "2;3;4;3", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "97;58;135;44", "wc_summary_review": "180;27;71;33", "wc_main_review": "268;261;317;330", "wc_review": "545;346;523;407", "wc_reply_reviewers": "0;13;0;0", "wc_reply_authors": "918;648;709;598", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 83.5, 35.51408171415952 ], "wc_summary_review_avg": [ 77.75, 61.39778090452455 ], "wc_main_review_avg": [ 294.0, 29.958304357890484 ], "wc_review_avg": [ 455.25, 82.01943367275831 ], "wc_reply_reviewers_avg": [ 3.25, 5.629165124598851 ], "wc_reply_authors_avg": [ 718.25, 121.84082854281647 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14193757514570115275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=kOu3-S3wJ7", "email": "usi.ch;usi.ch;polimi.it", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universita della Svizzera Italiana;Universit\u00e0 della Svizzera italiana;Politecnico di Milano", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usi.ch;https://www.usi.ch;https://www.polimi.it", "aff_unique_abbr": "USI;USI;Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;Italy" }, { "title": "Towards Better Understanding and Better Generalization of Low-shot Classification in Histology Images with Contrastive Learning", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6718", "id": "kQ2SOflIOVC", "poster": "", "openreview": "https://openreview.net/forum?id=kQ2SOflIOVC", "slides": "https://iclr.cc/virtual/2022/poster/6718", "video": "https://iclr.cc/virtual/2022/poster/6718", "author_site": "Jiawei Yang, Hanbo Chen, Jiangpeng Yan, Xiaoyu Chen, Jianhua Yao", "tldr": "", "abstract": "Few-shot learning is an established topic in natural images for years, but few work is attended to histology images, which is of high clinical value since well-labeled datasets and rare abnormal samples are expensive to collect. Here, we facilitate the study of few-shot learning in histology images by setting up three cross-domain tasks that simulate real clinics problems. To enable label-efficient learning and better generalizability, we propose to incorporate contrastive learning (CL) with latent augmentation (LA) to build a few-shot system. CL learns useful representations without manual labels, while LA transfers semantic variations of the base dataset in an unsupervised way. These two components fully exploit unlabeled training data and can scale gracefully to other label-hungry problems. In experiments, we find i) models learned by CL generalize better than supervised learning for histology images in unseen classes, and ii) LA brings consistent gains over baselines. Prior studies of self-supervised learning mainly focus on ImageNet-like images, which only present a dominant object in their centers. Recent attention has been paid to images with multi-objects and multi-textures. Histology images are a natural choice for such a study. We show the superiority of CL over supervised learning in terms of generalization for such data and provide our empirical understanding for this observation. The findings in this work could contribute to understanding how the model generalizes in the context of both representation learning and histological image analysis. 
Code is available.", "keywords": "Few shot learning;Histology Image;Knowledge Transferring", "primary_area": "", "supplementary_material": "", "author": "Jiawei Yang;Hanbo Chen;Jiangpeng Yan;Xiaoyu Chen;Jianhua Yao", "authorids": "~Jiawei_Yang1;hanbochen@tencent.com;~Jiangpeng_Yan1;xiaoyuchen@stu.xmu.edu.cn;jianhuayao@tencent.com", "gender": "M;;;;", "homepage": "https://jiawei-yang.github.io/;;https://yjump.github.io/;;", "dblp": "96/2976;;210/5075.html;;", "google_scholar": "OYrpIa8AAAAJ;;qZXFEbMAAAAJ;;", "orcid": ";;0000-0002-0767-1726;;", "linkedin": ";;;;", "or_profile": "~Jiawei_Yang1;hanbochen@tencent.com;~Jiangpeng_Yan1;xiaoyuchen@stu.xmu.edu.cn;jianhuayao@tencent.com", "aff": "University of California, Los Angeles;;Tsinghua University;;", "aff_domain": "ucla.edu;;tsinghua.edu.cn;;", "position": "MS student;;PhD student;;", "bibtex": "@inproceedings{\nyang2022towards,\ntitle={Towards Better Understanding and Better Generalization of Low-shot Classification in Histology Images with Contrastive Learning},\nauthor={Jiawei Yang and Hanbo Chen and Jiangpeng Yan and Xiaoyu Chen and Jianhua Yao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kQ2SOflIOVC}\n}", "github": "", "project": "", "reviewers": "PqMe;BmyB;ijae;qUzb;hJxe", "pdf_size": 0, "recommendation": "5;6;6;8;8", "confidence": "3;3;4;4;4", "correctness": "4;3;3;3;4", "technical_novelty": "3;3;3;2;3", "empirical_novelty": "2;3;3;3;4", "wc_summary_paper": "62;83;91;103;148", "wc_summary_review": "28;173;55;65;54", "wc_main_review": "378;876;261;459;462", "wc_review": "468;1132;407;627;664", "wc_reply_reviewers": "563;139;70;0;251", "wc_reply_authors": "4115;1573;1356;1168;2093", "reply_reviewers": "2;1;2;0;2", "reply_authors": "8;4;3;3;5", "recommendation_avg": [ 6.6, 1.2 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 97.4, 28.611885642159272 ], "wc_summary_review_avg": [ 75.0, 50.505445250982596 ], "wc_main_review_avg": [ 487.2, 207.6953538238157 ], "wc_review_avg": [ 659.6, 254.85729340162115 ], "wc_reply_reviewers_avg": [ 204.6, 197.42603678339896 ], "wc_reply_authors_avg": [ 2061.0, 1072.6190376829977 ], "reply_reviewers_avg": [ 1.4, 0.8 ], "reply_authors_avg": [ 4.6, 1.8547236990991407 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.748455199183749, "corr_recommendation_correctness": -0.06804138174397723, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13032228234323857562&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=kQ2SOflIOVC", "email": "ucla.edu;;tsinghua.edu.cn;;", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Los Angeles;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UCLA;THU", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "id": "kQMXLDF_z20", "title": "Tackling Oversmoothing of GNNs with Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) integrate the comprehensive relation of graph data and the representation learning capability of neural 
networks, making them one of the most popular deep learning methods; they achieve state-of-the-art performance in many applications, such as natural language processing and computer vision. In real-world scenarios, increasing the depth (i.e., the number of layers) of GNNs is sometimes necessary to capture more latent knowledge of the input data to mitigate the uncertainty caused by missing values.\nHowever, involving more complex structures and more parameters can decrease the performance of GNN models. One recently proposed reason is oversmoothing, research on which remains nascent. In general, oversmoothing makes the final representations of nodes indiscriminative, hurting node classification and link prediction performance.\nIn this paper, we first survey the current de-oversmoothing methods and propose three major metrics to evaluate a de-oversmoothing method, i.e., constant divergence indicator, easy-to-determine divergence indicator, and model-agnostic strategy. Then, we propose the Topology-guided Graph Contrastive Layer, named TGCL, which is the first de-oversmoothing method satisfying all three of these metrics. Within the contrastive learning framework, we provide a theoretical analysis of the effectiveness of the proposed method. Last but not least, we design extensive experiments to illustrate the empirical performance of TGCL compared with state-of-the-art baselines.", "keywords": "graph mining;oversmoothing;contrastive learning", "primary_area": "", "supplementary_material": "/attachment/4c0f3073874e2a678011d75f3ebb0ed129b97402.zip", "author": "Lecheng Zheng;Dongqi Fu;Jingrui He", "authorids": "~Lecheng_Zheng1;~Dongqi_Fu1;~Jingrui_He1", "gender": ";M;F", "homepage": "https://sites.google.com/view/lecheng-zheng/home;https://dongqifu.github.io/;https://www.hejingrui.org", "dblp": "234/8652;273/0228;34/2685", "google_scholar": "Lp09wUoAAAAJ;WByXZAcAAAAJ;hXpZynkAAAAJ", "orcid": "0000-0002-6869-3320;0000-0002-8726-9234;0000-0002-6429-6272", "linkedin": ";;", "or_profile": "~Lecheng_Zheng1;~Dongqi_Fu1;~Jingrui_He1", "aff": "University of Illinois Urbana-Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu;illinois.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nzheng2022tackling,\ntitle={Tackling Oversmoothing of {GNN}s with Contrastive Learning},\nauthor={Lecheng Zheng and Dongqi Fu and Jingrui He},\nyear={2022},\nurl={https://openreview.net/forum?id=kQMXLDF_z20}\n}", "github": "", "project": "", "reviewers": "ax27;R5AE;5qKE;wVaG", "site": "https://openreview.net/forum?id=kQMXLDF_z20", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "45;44;70;74", "wc_summary_review": "16;26;22;34", "wc_main_review": "277;155;220;162", "wc_review": "338;225;312;270", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 58.25, 13.827056809024834 ], "wc_summary_review_avg": [ 24.5, 6.5383484153110105 ], "wc_main_review_avg": [ 203.5, 49.36851223198852 ], "wc_review_avg": [ 286.25, 42.88574938135044 ],
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2690998446053005370&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "kQns9y_JH6", "title": "Improved Fine-tuning by Leveraging Pre-training Data: Theory and Practice", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Fine-tuning a pre-trained model on the target data is widely used in many deep learning applications, especially for small data sets. However, recent studies have empirically shown that this training strategy offers almost no benefit in computer vision tasks over training from scratch. In this work, we first revisit this observation from the perspective of generalization analysis which is popular in learning theory. Our theory reveals that the final prediction precision has a weak dependency on the pre-trained model. Besides the pre-trained model, data for pre-training are also available for fine-tuning. The observation from pre-trained model inspires us to leverage pre-training data for fine-tuning. With the theoretical analysis, we find that the final performance on target data can be improved when the appropriate pre-training data are included in fine-tuning. Therefore, we propose to select a subset from pre-training data to help the optimization on the target data. A novel selection algorithm is developed according to our analysis. Extensive experiments on 8 benchmark data sets verify the effectiveness of the proposed fine-tuning pipeline.", "keywords": "pre-training;fine-tuning;generalization theory", "primary_area": "", "supplementary_material": "", "author": "Ziquan Liu;Yi Xu;Yuanhong Xu;Qi Qian;Hao Li;Antoni B. 
Chan;Rong Jin", "authorids": "~Ziquan_Liu1;~Yi_Xu8;~Yuanhong_Xu1;~Qi_Qian1;~Hao_Li16;~Antoni_B._Chan1;~Rong_Jin1", "gender": "M;M;;M;M;M;M", "homepage": "https://sites.google.com/view/ziquanliu;;http://qi-qian.com;http://www.cs.cityu.edu.hk/~abchan/;;https://www.cse.msu.edu/~rongjin/;https://yxu71.github.io", "dblp": "207/9035;223/4687;05/2084-1;55/5814;17/5705-30;j/RongJin;14/5580", "google_scholar": "https://scholar.google.com.hk/citations?user=x28OqBkAAAAJ;;Rp_40_gAAAAJ;j4vFSn8AAAAJ;pHN-QIwAAAAJ;;D4jEMqEAAAAJ", "orcid": ";;;0000-0002-2886-2513;;;0009-0000-9900-6143", "linkedin": ";%E6%B8%8A%E9%B8%BF-%E5%BE%90-37a542113/;;;%E6%98%8A-%E6%9D%8E-392547a5/detail/recent-activity/;;", "or_profile": "~Ziquan_Liu1;~Yuanhong_Xu1;~Qi_Qian1;~Antoni_B._Chan1;~Li_Hao1;~Rong_Jin3;~YI_XU3", "aff": "City University of Hong Kong;Alibaba Group;Alibaba Group;City University of Hong Kong;Alibaba Group;Alibaba Group;Dalian University of Technology", "aff_domain": "cityu.edu.hk;alibaba-inc.com;alibaba-inc.com;cityu.edu.hk;alibaba-inc.com;alibaba-inc.com;dlut.edu.cn", "position": "PhD student;Researcher;Researcher;Full Professor;Researcher;Researcher;Associate Professor", "bibtex": "@misc{\nliu2022improved,\ntitle={Improved Fine-tuning by Leveraging Pre-training Data: Theory and Practice},\nauthor={Ziquan Liu and Yi Xu and Yuanhong Xu and Qi Qian and Hao Li and Antoni B. Chan and Rong Jin},\nyear={2022},\nurl={https://openreview.net/forum?id=kQns9y_JH6}\n}", "github": "", "project": "", "reviewers": "Toui;8rSt;GhJo", "site": "https://openreview.net/forum?id=kQns9y_JH6", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;3", "correctness": "2;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "87;98;61", "wc_summary_review": "95;89;29", "wc_main_review": "1443;515;200", "wc_review": "1625;702;290", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 82.0, 15.513435037626794 ], "wc_summary_review_avg": [ 71.0, 29.79932885150268 ], "wc_main_review_avg": [ 719.3333333333334, 527.6212867409939 ], "wc_review_avg": [ 872.3333333333334, 558.1614660850588 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12087697167002285475&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;0;1;1;2", "aff_unique_norm": "City University of Hong Kong;Alibaba Group;Dalian University of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.alibaba.com;http://www.dlut.edu.cn/", "aff_unique_abbr": "CityU;Alibaba;DUT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "GATSBI: Generative Adversarial Training for Simulation-Based Inference", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6518", "id": "kR1hC6j48Tp", "poster": "", "openreview": "https://openreview.net/forum?id=kR1hC6j48Tp", "slides": "https://iclr.cc/virtual/2022/poster/6518", "video": "https://iclr.cc/virtual/2022/poster/6518", "author_site": "Poornima Ramesh, Jan-Matthis Lueckmann, Jan Boelts, \u00c1lvaro Tejero-Cantero, David Greenberg, Pedro Goncalves, Jakob Macke", "tldr": "", "abstract": "Simulation-based inference (SBI) refers to statistical inference on stochastic models for which we can generate samples, but not compute likelihoods.\nLike SBI algorithms, generative adversarial networks (GANs) do not require explicit likelihoods. We study the relationship between SBI and GANs, and introduce GATSBI, an adversarial approach to SBI. GATSBI reformulates the variational objective in an adversarial setting to learn implicit posterior distributions. Inference with GATSBI is amortised across observations, works in high-dimensional posterior spaces and supports implicit priors. We evaluate GATSBI on two common SBI benchmark problems and on two high-dimensional simulators. On a model for wave propagation on the surface of a shallow water body, we show that GATSBI can return well-calibrated posterior estimates even in high dimensions. \nOn a model of camera optics, it infers a high-dimensional posterior given an implicit prior, and performs better than a\nstate-of-the-art SBI approach. We also show how GATSBI can be extended to perform sequential posterior estimation to focus on individual observations.\nOverall, GATSBI opens up opportunities for leveraging advances in GANs to perform Bayesian inference on high-dimensional simulation-based models.", "keywords": "Machine Learning;simulation-based inference;generative adversarial networks;approximate bayesian computation;data-driven modelling;GANs;SBI;likelihood-free inference;implicit models", "primary_area": "", "supplementary_material": "/attachment/10d9479b5540c60f2de396ab3347eba4a2780237.zip", "author": "Poornima Ramesh;Jan-Matthis Lueckmann;Jan Boelts;\u00c1lvaro Tejero-Cantero;David S. Greenberg;Pedro J. Goncalves;Jakob H. Macke", "authorids": "~Poornima_Ramesh1;~Jan-Matthis_Lueckmann2;~Jan_Boelts1;~\u00c1lvaro_Tejero-Cantero1;~David_S._Greenberg1;~Pedro_J._Goncalves1;~Jakob_H._Macke1", "gender": ";M;;M;M;M;", "homepage": ";https://www.jan-matthis.de;;https://mlcolab.org;http://m-dml.org/;http://ppjgoncalves.github.io/;", "dblp": ";https://dblp.uni-trier.de/pers/hd/l/Lueckmann:Jan=Matthis;;;92/2024;209/4971;", "google_scholar": ";;;https://scholar.google.es/citations?user=VObPwpUAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=0bj8iLAAAAAJ;", "orcid": ";;;0000-0002-8768-4227;0000-0002-8515-0459;0000-0002-6987-4836;", "linkedin": ";;;;;;", "or_profile": "~Poornima_Ramesh1;~Jan-Matthis_Lueckmann2;~Jan_Boelts1;~\u00c1lvaro_Tejero-Cantero1;~David_S._Greenberg1;~Pedro_J._Goncalves1;~Jakob_H._Macke1", "aff": ";Google;;University of Tuebingen;Helmholtz Centre Hereon;University of Tuebingen;", "aff_domain": ";google.com;;uni-tuebingen.de;hereon.de;uni-tuebingen.de;", "position": ";Researcher;;Principal Researcher;Principal Researcher;Postdoc;", "bibtex": "@inproceedings{\nramesh2022gatsbi,\ntitle={{GATSBI}: Generative Adversarial Training for Simulation-Based Inference},\nauthor={Poornima Ramesh and Jan-Matthis Lueckmann and Jan Boelts and {\\'A}lvaro Tejero-Cantero and David S. Greenberg and Pedro J. Goncalves and Jakob H. 
Macke},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kR1hC6j48Tp}\n}", "github": "", "project": "", "reviewers": "cWGX;DidY;eKE2;MYcd", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;5;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "106;68;66;41", "wc_summary_review": "43;57;77;20", "wc_main_review": "228;85;218;258", "wc_review": "377;210;361;319", "wc_reply_reviewers": "467;0;19;0", "wc_reply_authors": "919;500;423;448", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 70.25, 23.220411279734044 ], "wc_summary_review_avg": [ 49.25, 20.765054779605084 ], "wc_main_review_avg": [ 197.25, 66.45816353165351 ], "wc_review_avg": [ 316.75, 65.17044959182037 ], "wc_reply_reviewers_avg": [ 121.5, 199.62527395097044 ], "wc_reply_authors_avg": [ 572.5, 201.9709137474998 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.2294157338705618, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15349002435008264502&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=kR1hC6j48Tp", "email": ";google.com;;uni-tuebingen.de;hereon.de;uni-tuebingen.de;", "author_num": 7, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Google;University of Tuebingen;Helmholtz Centre Hereon", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.uni-tuebingen.de/;https://www.hereon.de", "aff_unique_abbr": "Google;Uni T\u00fcbingen;", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;Germany" }, { "id": "kSqyNY_QrD9", "title": "Learning to Solve Multi-Robot Task Allocation with a Covariant-Attention based Neural Architecture", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper demonstrates how time-constrained multi-robot task allocation (MRTA) problems can be modeled as a Markov Decision Process (MDP) over graphs, such that approximate solutions can be modeled as a policy using Reinforcement Learning (RL) methods. \n Inspired by emerging approaches for learning to solve related combinatorial optimization (CO) problems such as multi-traveling salesman (mTSP) problems, a graph neural architecture is conceived in this paper to model the MRTA policy. The generalizability and scalability needs of the complex CO problem presented by MRTA are addressed by innovatively using the concept of Covariant Compositional Networks (CCN) to learn the local structures of graphs. The resulting learning architecture is called Covariant Attention-based Mechanism or CAM, which comprises: 1) an encoder: CCN-based embedding model to represent the task space as learnable feature vectors, 2) a decoder: an attention-based model to facilitate sequential decision outputs, and 3) context: to represent the state of the mission and the robots. To learn the feature vectors, a policy-gradient method is used. 
The CAM architecture is found to generally outperform a state-of-the-art encoder-decoder method that is purely based on the Multi-head Attention (MHA) mechanism in terms of task completion and cost function, when applied to a class of MRTA problems with time deadlines, robot ferry range constraints, and multi-tour allowance. CAM also demonstrates significantly better scalability in terms of cost function over unseen scenarios with larger task/robot spaces than those used for training. Lastly, evidence regarding the unique potential of learning-based approaches in delivering highly time-efficient solutions is provided for a benchmark vehicle routing problem -- where solutions are achieved 100-1000 times faster compared to a non-learning baseline, and for a benchmark MRTA problem with time and capacity constraints -- where solutions for larger problems are achieved 10 times faster compared to non-learning baselines.", "keywords": "MRTA;Reinforcement learning;graph learning", "primary_area": "", "supplementary_material": "", "author": "Steve Paul;Payam Ghassemi;Souma Chowdhury", "authorids": "~Steve_Paul1;~Payam_Ghassemi2;~Souma_Chowdhury1", "gender": "M;M;M", "homepage": ";http://www.payamghassemi.com/;http://adams.eng.buffalo.edu/", "dblp": "202/8465;;", "google_scholar": "zRf7acsAAAAJ;4ad1bnMAAAAJ;9UojRnIAAAAJ", "orcid": "0000-0002-8138-5242;http://orcid.org/0000-0002-9200-4046;", "linkedin": "steve-paul-67699854/;;", "or_profile": "~Steve_Paul1;~Payam_Ghassemi2;~Souma_Chowdhury1", "aff": "State University of New York, Buffalo;;State University of New York, Buffalo", "aff_domain": "buffalo.edu;;buffalo.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@misc{\npaul2022learning,\ntitle={Learning to Solve Multi-Robot Task Allocation with a Covariant-Attention based Neural Architecture},\nauthor={Steve Paul and Payam Ghassemi and Souma Chowdhury},\nyear={2022},\nurl={https://openreview.net/forum?id=kSqyNY_QrD9}\n}", "github": "", "project": "", "reviewers": "Yr3w;tBqb;uLHX;8Xfc;AMwa", "site": "https://openreview.net/forum?id=kSqyNY_QrD9", "pdf_size": 0, "recommendation": "3;5;6;6;8", "confidence": "4;2;3;3;3", "correctness": "2;3;3;3;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "3;3;3;3;0", "wc_summary_paper": "67;48;95;60;75", "wc_summary_review": "30;6;69;30;59", "wc_main_review": "320;83;407;692;219", "wc_review": "417;137;571;782;353", "wc_reply_reviewers": "0;0;0;61;0", "wc_reply_authors": "1370;543;513;1123;179", "reply_reviewers": "0;0;0;1;0", "reply_authors": "7;1;7;6;1", "recommendation_avg": [ 5.6, 1.624807680927192 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 1.2 ], "wc_summary_paper_avg": [ 69.0, 15.735310610216755 ], "wc_summary_review_avg": [ 38.8, 22.5867217630182 ], "wc_main_review_avg": [ 344.2, 204.64349488806138 ], "wc_review_avg": [ 452.0, 216.01481430679704 ], "wc_reply_reviewers_avg": [ 12.2, 24.4 ], "wc_reply_authors_avg": [ 745.6, 435.5436143487814 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 4.4, 2.8000000000000003 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3892494720807615, "corr_recommendation_correctness": 0.8000946913656628, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12655184742891113766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "State 
University of New York at Buffalo", "aff_unique_dep": "", "aff_unique_url": "https://www.buffalo.edu", "aff_unique_abbr": "SUNY Buffalo", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "PipeGCN: Efficient Full-Graph Training of Graph Convolutional Networks with Pipelined Feature Communication", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6173", "id": "kSwqMH0zn1F", "poster": "", "openreview": "https://openreview.net/forum?id=kSwqMH0zn1F", "slides": "https://iclr.cc/virtual/2022/poster/6173", "video": "https://iclr.cc/virtual/2022/poster/6173", "author_site": "Cheng Wan, Youjie Li, Cameron Wolfe, Anastasios Kyrillidis, Nam Sung Kim, Yingyan Lin", "tldr": "", "abstract": "Graph Convolutional Networks (GCNs) are the state-of-the-art method for learning graph-structured data, and training large-scale GCNs requires distributed training across multiple accelerators such that each accelerator is able to hold a partitioned subgraph. However, distributed GCN training incurs prohibitive overhead of communicating node features and feature gradients among partitions for every GCN layer during each training iteration, limiting the achievable training efficiency and model scalability. To this end, we propose PipeGCN, a simple yet effective scheme that hides the communication overhead by pipelining inter-partition communication with intra-partition computation. It is non-trivial to pipeline for efficient GCN training, as communicated node features/gradients will become stale and thus can harm the convergence, negating the pipeline benefit. Notably, little is known regarding the convergence rate of GCN training with both stale features and stale feature gradients. This work not only provides a theoretical convergence analysis but also finds the convergence rate of PipeGCN to be close to that of the vanilla distributed GCN training without any staleness. Furthermore, we develop a smoothing method to further improve PipeGCN's convergence. Extensive experiments show that PipeGCN can largely boost the training throughput (1.7\u00d7~28.5\u00d7) while achieving the same accuracy as its vanilla counterpart and existing full-graph training methods. The code is available at https://github.com/RICE-EIC/PipeGCN.", "keywords": "Graph Neural Networks;Graph Convolutional Networks;Distributed Training;Asynchronous Training;Full-Graph Training;Large-Graph Training;Stale Features", "primary_area": "", "supplementary_material": "", "author": "Cheng Wan;Youjie Li;Cameron R. 
Wolfe;Anastasios Kyrillidis;Nam Sung Kim;Yingyan Lin", "authorids": "~Cheng_Wan2;~Youjie_Li1;~Cameron_R._Wolfe1;~Anastasios_Kyrillidis2;~Nam_Sung_Kim3;~Yingyan_Lin1", "gender": "M;;M;M;Unspecified;F", "homepage": "http://cc.gatech.edu/~cwan39;https://www.liyoujie.net/;https://wolfecameron.github.io;http://akyrillidis.github.io;https://ece.illinois.edu;https://eiclab.scs.gatech.edu/", "dblp": ";85/8440;238/0394;53/9879;;120/6981", "google_scholar": "JZCbRO0AAAAJ;9NujVeYAAAAJ;jXLvrUwAAAAJ;TEGzkZMAAAAJ;iccBxJIAAAAJ;dio8IesAAAAJ", "orcid": "0000-0002-2295-3481;;;;;", "linkedin": "cheng-wan/;youjie-li-1929b2aa/;;;;yingyan-celine-lin-a281211a/", "or_profile": "~Cheng_Wan2;~Youjie_Li1;~Cameron_R._Wolfe1;~Anastasios_Kyrillidis2;~Nam_Sung_Kim3;~Yingyan_Lin1", "aff": "Rice University;University of Illinois, Urbana Champaign;Rice University;Rice University;University of Illinois, Urbana Champaign;Rice University", "aff_domain": "rice.edu;illinois.edu;rice.edu;rice.edu;illinois.edu;rice.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwan2022pipegcn,\ntitle={Pipe{GCN}: Efficient Full-Graph Training of Graph Convolutional Networks with Pipelined Feature Communication},\nauthor={Cheng Wan and Youjie Li and Cameron R. Wolfe and Anastasios Kyrillidis and Nam Sung Kim and Yingyan Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kSwqMH0zn1F}\n}", "github": "", "project": "", "reviewers": "L5WF;59N6;cPnX;KmNe", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;3;3", "correctness": "3;3;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "55;60;54;28", "wc_summary_review": "179;55;19;42", "wc_main_review": "401;198;251;175", "wc_review": "635;313;324;245", "wc_reply_reviewers": "93;102;0;0", "wc_reply_authors": "1532;1380;922;551", "reply_reviewers": "1;1;0;0", "reply_authors": "3;5;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 49.25, 12.47747971346778 ], "wc_summary_review_avg": [ 73.75, 62.118334652500145 ], "wc_main_review_avg": [ 256.25, 87.99822441390508 ], "wc_review_avg": [ 379.25, 150.7255369869353 ], "wc_reply_reviewers_avg": [ 48.75, 48.853735783458774 ], "wc_reply_authors_avg": [ 1096.25, 386.6693516429767 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5927794723979100407&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=kSwqMH0zn1F", "email": "rice.edu;illinois.edu;rice.edu;rice.edu;illinois.edu;rice.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;1;0", "aff_unique_norm": "Rice University;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.rice.edu;https://illinois.edu", "aff_unique_abbr": "Rice;UIUC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kTcRljax0x9", "title": "Assessing Deep Reinforcement Learning Policies via Natural 
Corruptions at the Edge of Imperceptibility", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep reinforcement learning algorithms have recently achieved significant success in learning high-performing policies from purely visual observations. The ability to perform end-to-end learning from raw high-dimensional input alone has led to deep reinforcement learning algorithms being deployed in a variety of fields. Thus, understanding and improving the ability of deep reinforcement learning policies to generalize to unseen data distributions is of critical importance. Much recent work has focused on assessing the generalization of deep reinforcement learning policies by introducing specifically crafted adversarial perturbations to their inputs. In this paper, we approach this problem from another perspective and propose a framework to assess the generalization skills of trained deep reinforcement learning policies. Rather than focusing on worst-case analysis of distribution shift, our approach is based on black-box perturbations that correspond to minimal semantically meaningful natural changes to the environment or the agent's visual observation system, ranging from brightness to compression artifacts. We demonstrate that the perceptual similarity distance of the minimal natural perturbations is orders of magnitude smaller than the perceptual similarity distance of the adversarial perturbations to the unperturbed observations (i.e., minimal natural perturbations are perceptually more similar to the unperturbed states than the adversarial perturbations), while causing larger degradation in the policy performance. Furthermore, we investigate state-of-the-art adversarial training methods and show that adversarially trained deep reinforcement learning policies are more sensitive to almost all of the natural perturbations compared to vanilla trained policies. Lastly, we highlight that our framework captures a diverse set of bands in the Fourier spectrum, thus providing a better overall understanding of the policy's generalization capabilities. 
We believe our work can be crucial for building resilient and generalizable deep reinforcement learning policies.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9748554c486e7f5903da8c7debe66a45cc8e48f3.zip", "author": "Ezgi Korkmaz", "authorids": "~Ezgi_Korkmaz2", "gender": "", "homepage": "https://ezgikorkmaz.github.io/", "dblp": "300/7830.html", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Ezgi_Korkmaz2", "aff": "University College London, University of London", "aff_domain": "ucl.ac.uk", "position": "PhD student", "bibtex": "@misc{\nkorkmaz2022assessing,\ntitle={Assessing Deep Reinforcement Learning Policies via Natural Corruptions at the Edge of Imperceptibility},\nauthor={Ezgi Korkmaz},\nyear={2022},\nurl={https://openreview.net/forum?id=kTcRljax0x9}\n}", "github": "", "project": "", "reviewers": "ZEDG;DYH4;La3E;VXxv;9u6b", "site": "https://openreview.net/forum?id=kTcRljax0x9", "pdf_size": 0, "recommendation": "1;3;5;5;6", "confidence": "5;4;4;4;4", "correctness": "4;4;3;4;3", "technical_novelty": "1;2;3;2;3", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "46;267;117;39;322", "wc_summary_review": "31;221;177;46;97", "wc_main_review": "288;1041;499;269;726", "wc_review": "365;1529;793;354;1145", "wc_reply_reviewers": "0;638;0;0;0", "wc_reply_authors": "479;308;0;0;0", "reply_reviewers": "0;1;0;0;0", "reply_authors": "3;2;0;0;0", "recommendation_avg": [ 4.0, 1.7888543819998317 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 158.2, 115.8989214790198 ], "wc_summary_review_avg": [ 114.4, 73.78238272108052 ], "wc_main_review_avg": [ 564.6, 290.2403142225421 ], "wc_review_avg": [ 837.2, 454.25473030008175 ], "wc_reply_reviewers_avg": [ 127.6, 255.2 ], "wc_reply_authors_avg": [ 157.4, 200.21548391670413 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 1.2649110640673518 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.8385254915624213, "corr_recommendation_correctness": -0.6846531968814576, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8183638567500501307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "kUGYDTJUcuc", "title": "Unifying Top-down and Bottom-up for Recurrent Visual Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "The idea of using the recurrent neural network for visual attention has gained popularity in the computer vision community. Although the recurrent visual attention model (RAM) leverages glimpses with larger patch sizes to increase its scope, it may result in high variance and instability. For example, we need the Gaussian policy with high variance to explore objects of interest in a large image, which may cause randomized search and unstable learning. In this paper, we propose to unify top-down and bottom-up attention for recurrent visual attention. 
Our model exploits the image pyramids and Q-learning to select regions of interest in the top-down attention mechanism, which in turn guides the policy search in the bottom-up approach. In addition, we add two additional constraints on the bottom-up recurrent neural networks for better exploration. We train our model in an end-to-end reinforcement learning framework, and evaluate our method on visual classification tasks. The experimental results show that our method outperforms a convolutional neural network (CNN) baseline and the bottom-up recurrent models with visual attention.", "keywords": "visual attention model;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "GANG CHEN", "authorids": "~GANG_CHEN1", "gender": "M", "homepage": "", "dblp": "67/6383", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~GANG_CHEN1", "aff": "State University of New York, Buffalo", "aff_domain": "buffalo.edu", "position": "PhD student", "bibtex": "@misc{\nchen2022unifying,\ntitle={Unifying Top-down and Bottom-up for Recurrent Visual Attention},\nauthor={GANG CHEN},\nyear={2022},\nurl={https://openreview.net/forum?id=kUGYDTJUcuc}\n}", "github": "", "project": "", "reviewers": "ooVw;ixZU;ARgD", "site": "https://openreview.net/forum?id=kUGYDTJUcuc", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;2", "correctness": "3;3;4", "technical_novelty": "2;2;4", "empirical_novelty": "2;2;4", "wc_summary_paper": "85;69;45", "wc_summary_review": "31;34;25", "wc_main_review": "276;192;69", "wc_review": "392;295;139", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 66.33333333333333, 16.438437341250605 ], "wc_summary_review_avg": [ 30.0, 3.7416573867739413 ], "wc_main_review_avg": [ 179.0, 85.00588214941364 ], "wc_review_avg": [ 275.3333333333333, 104.21878696067978 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZIFeRWYwPAQJ:scholar.google.com/&scioq=Unifying+Top-down+and+Bottom-up+for+Recurrent+Visual+Attention&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "State University of New York at Buffalo", "aff_unique_dep": "", "aff_unique_url": "https://www.buffalo.edu", "aff_unique_abbr": "SUNY Buffalo", "aff_campus_unique_index": "0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "kUtux8k0G6y", "title": "Avoiding Robust Misclassifications for Improved Robustness without Accuracy Loss", "track": "main", "status": "Reject", "tldr": "", "abstract": "While current methods for training robust deep learning models optimize robust accuracy, in practice, the resulting models are often both robust and inaccurate on numerous samples, providing a false sense of safety for those samples. Further, they significantly reduce natural accuracy, which hinders adoption in practice. 
In this work, we address both of these challenges by extending prior works in three main directions. First, we propose a new training method that jointly maximizes robust accuracy and minimizes robust inaccuracy. Second, since the resulting models are trained to be robust only if they are accurate, we leverage robustness as a principled abstain mechanism. Finally, this abstain mechanism allows us to combine models in a compositional architecture that significantly boosts overall robustness without sacrificing accuracy. We demonstrate the effectiveness of our approach to both empirical and certified robustness on six recent state-of-the-art models and using several datasets. Our results show that our method effectively reduces robust and inaccurate samples by up to 97.28%. Further, it successfully enhanced the $\\epsilon_\\infty = 1/255$ robustness of a state-of-the-art model from 26% to 86% while only marginally reducing its natural accuracy from 97.8% to 97.6%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yannick Merkli;Pavol Bielik;PETAR TSANKOV;Martin Vechev", "authorids": "~Yannick_Merkli1;~Pavol_Bielik1;~PETAR_TSANKOV1;~Martin_Vechev1", "gender": "M;;M;M", "homepage": ";https://www.sri.inf.ethz.ch/people/pavol;;https://www.sri.inf.ethz.ch/people/martin", "dblp": ";32/11105;;93/2189.html", "google_scholar": ";https://scholar.google.com/citations?hl=en;GydL0T8AAAAJ;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";;;", "linkedin": "ymerkli/;;petartsankov/;", "or_profile": "~Yannick_Merkli1;~Pavol_Bielik1;~PETAR_TSANKOV1;~Martin_Vechev1", "aff": ";LatticeFlow;LatticeFlow;Swiss Federal Institute of Technology", "aff_domain": ";latticeflow.ai;latticeflow.ai;ethz.ch", "position": ";CTO;CEO;Full Professor", "bibtex": "@misc{\nmerkli2022avoiding,\ntitle={Avoiding Robust Misclassifications for Improved Robustness without Accuracy Loss},\nauthor={Yannick Merkli and Pavol Bielik and PETAR TSANKOV and Martin Vechev},\nyear={2022},\nurl={https://openreview.net/forum?id=kUtux8k0G6y}\n}", "github": "", "project": "", "reviewers": "xSkN;sDbQ;wtee;7Sk1", "site": "https://openreview.net/forum?id=kUtux8k0G6y", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;3;3", "correctness": "3;4;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "92;76;33;169", "wc_summary_review": "23;35;43;46", "wc_main_review": "380;85;382;487", "wc_review": "495;196;458;702", "wc_reply_reviewers": "0;0;31;62", "wc_reply_authors": "635;325;454;404", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 92.5, 49.1553659329274 ], "wc_summary_review_avg": [ 36.75, 8.898735865278843 ], "wc_main_review_avg": [ 333.5, 149.8574322481204 ], "wc_review_avg": [ 462.75, 179.9018829806959 ], "wc_reply_reviewers_avg": [ 23.25, 25.703842125254347 ], "wc_reply_authors_avg": [ 454.5, 113.90895487186246 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7276068751089989, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:jE6hfV2W3nUJ:scholar.google.com/&scioq=Avoiding+Robust+Misclassifications+for+Improved+Robustness+without+Accuracy+Loss&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "LatticeFlow;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": ";https://www.ethz.ch", "aff_unique_abbr": ";ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";Switzerland" }, { "id": "kW05eAYtOma", "title": "Rethinking Pareto Approaches in Constrained Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Constrained Reinforcement Learning (CRL) burgeons broad interest in recent years, which pursues both goals of maximizing long-term returns and constraining costs. Although CRL can be cast as a multi-objective optimization problem, it is still largely unsolved using standard Pareto optimization approaches. The key challenge is that gradient-based Pareto optimization agents tend to stick to known Pareto-optimal solutions even when they yield poor returns (i.e., the safest self-driving car that never moves) or violates the constraints (i.e., the record breaking racer that crashes the car). In this paper, we propose a novel Pareto optimization method for CRL with two gradient recalibration techniques to overcome the challenge. First, to explore around feasible Pareto optimal solutions, we use gradient re-balancing to let the agent improve more on under-optimized objectives at each policy update. Second, to escape from infeasible solutions, we propose gradient perturbation to temporarily sacrifice return to save costs. Experiments on the SafetyGym benchmarks show that our method consistently outperforms previous CRL methods in return while satisfying the cost constraints.", "keywords": "Constrained Reinforcement Learning;Pareto optimization;Constrained Markov Decision Process", "primary_area": "", "supplementary_material": "", "author": "Mengda Huang;Feiyang Pan;Jia He;Xiang Ao;Qing He", "authorids": "~Mengda_Huang2;~Feiyang_Pan1;hejia0149@gmail.com;~Xiang_Ao2;~Qing_He2", "gender": "M;M;;M;M", "homepage": "https://github.com/DavidVillaHMD;https://feiyang.github.io/;;https://aoxaustin.github.io/;http://www.ict.cas.cn/sourcedb_2018_ict_cas/cn/jssrck/200909/t20090917_2496626.html", "dblp": ";;;71/1982-1;14/3700-3.html", "google_scholar": ";514MRR4AAAAJ;;W8wrWfMAAAAJ;tkbgSDYAAAAJ", "orcid": ";;;0000-0001-9633-8361;0000-0001-8833-5398", "linkedin": ";;;;", "or_profile": "~Mengda_Huang2;~Feiyang_Pan1;hejia0149@gmail.com;~Xiang_Ao2;~Qing_He2", "aff": "ICT, Chinese Academy of Sciences;Huawei Technologies Ltd.;;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;huawei.com;;ict.ac.cn;ict.ac.cn", "position": "MS student;Principal Researcher;;Associate Professor;Full Professor", "bibtex": "@misc{\nhuang2022rethinking,\ntitle={Rethinking Pareto Approaches in Constrained Reinforcement Learning},\nauthor={Mengda Huang and Feiyang Pan and Jia He and Xiang Ao and Qing He},\nyear={2022},\nurl={https://openreview.net/forum?id=kW05eAYtOma}\n}", "github": "", "project": "", "reviewers": "FPf5;JoyJ;Wjxi;Mp4K", "site": "https://openreview.net/forum?id=kW05eAYtOma", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;3;3;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", 
"wc_summary_paper": "95;170;127;56", "wc_summary_review": "48;110;39;65", "wc_main_review": "482;690;423;109", "wc_review": "625;970;589;230", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 112.0, 41.87481343242021 ], "wc_summary_review_avg": [ 65.5, 27.33587386567329 ], "wc_main_review_avg": [ 426.0, 208.16459833506752 ], "wc_review_avg": [ 603.5, 261.9623064488477 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:w2RvSEPixBAJ:scholar.google.com/&scioq=Rethinking+Pareto+Approaches+in+Constrained+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Institute of Computing Technology, Chinese Academy of Sciences;Huawei;Chinese Academy of Sciences", "aff_unique_dep": ";Huawei Technologies;Institute of Computing Technology", "aff_unique_url": "http://www.ict.cas.cn;https://www.huawei.com;http://www.ict.ac.cn", "aff_unique_abbr": "ICT;Huawei;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "kWuBTQmkO8_", "title": "MixRL: Data Mixing Augmentation for Regression using Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation is becoming essential for improving regression accuracy in critical applications including manufacturing and finance. Existing techniques for data augmentation largely focus on classification tasks and do not readily apply to regression tasks. In particular, the recent Mixup techniques for classification rely on the key assumption that linearity holds among training examples, which is reasonable if the label space is discrete, but has limitations when the label space is continuous as in regression. We show that mixing examples that either have a large data or label distance may have an increasingly-negative effect on model performance. Hence, we use the stricter assumption that linearity only holds within certain data or label distances for regression where the degree may vary by each example. We then propose MixRL, a data augmentation meta learning framework for regression that learns for each example how many nearest neighbors it should be mixed with for the best model performance using a small validation set. MixRL achieves these objectives using Monte Carlo policy gradient reinforcement learning. Our experiments conducted both on synthetic and real datasets show that MixRL significantly outperforms state-of-the-art data augmentation baselines. 
MixRL can also be integrated with other classification Mixup techniques for better results.", "keywords": "data augmentation;regression;mixup;reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/dacb16601398c9e1f93094845cd7754cfd6199be.zip", "author": "Seong-Hyeon Hwang;Steven Euijong Whang", "authorids": "~Seong-Hyeon_Hwang1;~Steven_Euijong_Whang1", "gender": "M;M", "homepage": "https://sites.google.com/view/seonghyeonhwang/%ED%99%88;http://www.stevenwhang.com", "dblp": "262/3295;w/StevenEuijongWhang", "google_scholar": "https://scholar.google.com/citations?hl=ko;w6hts30AAAAJ", "orcid": "0009-0005-2377-8733;0000-0001-6419-931X", "linkedin": ";steven-euijong-whang-1612b5a/", "or_profile": "~Seong-Hyeon_Hwang1;~Steven_Euijong_Whang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nhwang2022mixrl,\ntitle={Mix{RL}: Data Mixing Augmentation for Regression using Reinforcement Learning},\nauthor={Seong-Hyeon Hwang and Steven Euijong Whang},\nyear={2022},\nurl={https://openreview.net/forum?id=kWuBTQmkO8_}\n}", "github": "", "project": "", "reviewers": "XF4Y;3Wyg;aYea", "site": "https://openreview.net/forum?id=kWuBTQmkO8_", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;3;4", "correctness": "2;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;3;2", "wc_summary_paper": "63;78;67", "wc_summary_review": "35;12;48", "wc_main_review": "540;76;118", "wc_review": "638;166;233", "wc_reply_reviewers": "212;0;0", "wc_reply_authors": "994;85;173", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 69.33333333333333, 6.342099196813483 ], "wc_summary_review_avg": [ 31.666666666666668, 14.884742374510738 ], "wc_main_review_avg": [ 244.66666666666666, 209.53493476957226 ], "wc_review_avg": [ 345.6666666666667, 208.51272276663494 ], "wc_reply_reviewers_avg": [ 70.66666666666667, 99.93775840769871 ], "wc_reply_authors_avg": [ 417.3333333333333, 409.34446238942684 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15491211359354009747&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Variational methods for simulation-based inference", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6748", "id": "kZ0UYdhqkNY", "poster": "", "openreview": "https://openreview.net/forum?id=kZ0UYdhqkNY", "slides": "https://iclr.cc/virtual/2022/poster/6748", "video": 
"https://iclr.cc/virtual/2022/poster/6748", "author_site": "Manuel Gl\u00f6ckler, Michael Deistler, Jakob Macke", "tldr": "", "abstract": "We present Sequential Neural Variational Inference (SNVI), an approach to perform Bayesian inference in models with intractable likelihoods. SNVI combines likelihood-estimation (or likelihood-ratio-estimation) with variational inference to achieve a scalable simulation-based inference approach. SNVI maintains the flexibility of likelihood(-ratio) estimation to allow arbitrary proposals for simulations, while simultaneously providing a functional estimate of the posterior distribution without requiring MCMC sampling. We present several variants of SNVI and demonstrate that they are substantially more computationally efficient than previous algorithms, without loss of accuracy on benchmark tasks. We apply SNVI to a neuroscience model of the pyloric network in the crab and demonstrate that it can infer the posterior distribution with one order of magnitude fewer simulations than previously reported. SNVI vastly reduces the computational cost of simulation-based inference while maintaining accuracy and flexibility, making it possible to tackle problems that were previously inaccessible.", "keywords": "likelihood-free inference;simulation-based inference;variational inference;neural density estimation", "primary_area": "", "supplementary_material": "", "author": "Manuel Gl\u00f6ckler;Michael Deistler;Jakob H. Macke", "authorids": "~Manuel_Gl\u00f6ckler1;~Michael_Deistler1;~Jakob_H._Macke1", "gender": "M;M;M", "homepage": "https://manuelgloeckler.github.io/;https://michaeldeistler.github.io/;http://www.mackelab.org", "dblp": ";243/5747;97/11106", "google_scholar": "0Vdv0H0AAAAJ;Q24H-zYAAAAJ;FKOqtF8AAAAJ", "orcid": ";0000-0002-3573-0404;0000-0001-5154-8912", "linkedin": ";;", "or_profile": "~Manuel_Gl\u00f6ckler1;~Michael_Deistler1;~Jakob_H_Macke1", "aff": "University of Tuebingen;University of Tuebingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\ngl{\\\"o}ckler2022variational,\ntitle={Variational methods for simulation-based inference},\nauthor={Manuel Gl{\\\"o}ckler and Michael Deistler and Jakob H. 
Macke},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kZ0UYdhqkNY}\n}", "github": "", "project": "", "reviewers": "9Rvr;NhK3;JNHt;NxCC", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;3", "correctness": "4;4;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "33;261;79;67", "wc_summary_review": "49;65;38;30", "wc_main_review": "289;499;169;266", "wc_review": "371;825;286;363", "wc_reply_reviewers": "17;40;36;23", "wc_reply_authors": "766;791;283;809", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 110.0, 88.79752248796134 ], "wc_summary_review_avg": [ 45.5, 13.124404748406688 ], "wc_main_review_avg": [ 305.75, 120.31910696144648 ], "wc_review_avg": [ 461.25, 212.61746753265592 ], "wc_reply_reviewers_avg": [ 29.0, 9.354143466934854 ], "wc_reply_authors_avg": [ 662.25, 219.4918848158173 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16337891944937937425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=kZ0UYdhqkNY", "email": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Tuebingen", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "k_Zy6glYaqc", "title": "Quantum Alphatron", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Finding provably efficient algorithms for learning neural networks is a fundamental challenge in the theory of machine learning. The Alphatron of Goel and Klivans is the first provably efficient algorithm for learning neural networks with more than one nonlinear layer. The algorithm succeeds with any distribution on the $n$-dimensional unit ball and without any assumption on the structure of the network. In this work, we refine the original Alphatron by adding a pre-computing phase for its most time-consuming part, the evaluation of the kernel function. This refined algorithm improves the run time of the original Alphatron, while retaining the same learning guarantee. Based on the refined algorithm, we quantize the pre-computing phase with a provable learning guarantee in the fault-tolerant quantum computing model. In a well-defined learning model, this quantum algorithm is able to provide a quadratic speedup in the data dimension $n$. In addition, we discuss a second type of speedup, quantizing the evaluation of the gradient in the stochastic gradient descent procedure. 
Our work contributes to the study of quantum learning with kernels and from samples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siyi Yang;Patrick Rebentrost;Miklos Santha", "authorids": "~Siyi_Yang1;~Patrick_Rebentrost1;~Miklos_Santha1", "gender": ";;", "homepage": ";;https://www.quantumlah.org/people/profile/miklos", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "siyi-yang-4aa7b668/;;", "or_profile": "~Siyi_Yang1;~Patrick_Rebentrost1;~Miklos_Santha1", "aff": ";;National University of Singapore", "aff_domain": ";;nus.edu.sg", "position": ";;Principal Researcher", "bibtex": "@misc{\nyang2022quantum,\ntitle={Quantum Alphatron},\nauthor={Siyi Yang and Patrick Rebentrost and Miklos Santha},\nyear={2022},\nurl={https://openreview.net/forum?id=k_Zy6glYaqc}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=k_Zy6glYaqc", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dS2Um9SsGOYJ:scholar.google.com/&scioq=Quantum+Alphatron&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_country_unique_index": "0", "aff_country_unique": "Singapore" }, { "id": "kamUXjlAZuw", "title": "On Learning with Fairness Trade-Offs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Previous literature has shown that bias-mitigating algorithms were sometimes prone to overfitting and had poor out-of-sample generalisation. This paper is first and foremost concerned with establishing a mathematical framework to tackle the specific issue of generalisation. Throughout this work, we consider fairness trade-offs and objectives mixing statistical loss over the whole sample and fairness penalties on categories (which could stem from different values of protected attributes), encompassing partial de-biasing. We do so by adopting two different but complementary viewpoints: first, we consider a PAC-type setup and derive probabilistic upper bounds involving sample-only information; second, we leverage an asymptotic framework to derive a closed-form limiting distribution for the difference between the empirical trade-off and the true trade-off. While these results provide guarantees for learning fairness metrics across categories, they also point to the key (but asymmetric) role played by class imbalance. To summarise, learning fairness without having access to enough category-level samples is hard, and a simple numerical experiment shows that it can lead to spurious results. 
", "keywords": "fairness;statistical learning;PAC;social welfare", "primary_area": "", "supplementary_material": "", "author": "Francois Buet-Golfouse", "authorids": "~Francois_Buet-Golfouse1", "gender": "", "homepage": "", "dblp": "277/9611", "google_scholar": "vFexmxwAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Francois_Buet-Golfouse1", "aff": "University College London", "aff_domain": "ucl.ac.uk", "position": "PhD student", "bibtex": "@misc{\nbuet-golfouse2022on,\ntitle={On Learning with Fairness Trade-Offs},\nauthor={Francois Buet-Golfouse},\nyear={2022},\nurl={https://openreview.net/forum?id=kamUXjlAZuw}\n}", "github": "", "project": "", "reviewers": "9tuc;bVZW;CZ3f;myf3", "site": "https://openreview.net/forum?id=kamUXjlAZuw", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;2", "correctness": "3;4;2;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "59;12;39;69", "wc_summary_review": "125;39;27;39", "wc_main_review": "449;161;228;229", "wc_review": "633;212;294;337", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 44.75, 21.775846711436962 ], "wc_summary_review_avg": [ 57.5, 39.27785635698567 ], "wc_main_review_avg": [ 266.75, 108.77126228926463 ], "wc_review_avg": [ 369.0, 158.89776587479133 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Spatial Graph Attention and Curiosity-driven Policy for Antiviral Drug Discovery", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6351", "id": "kavTY__jxp", "poster": "", "openreview": "https://openreview.net/forum?id=kavTY__jxp", "slides": "https://iclr.cc/virtual/2022/poster/6351", "video": "https://iclr.cc/virtual/2022/poster/6351", "author_site": "Yulun Wu, Nicholas Choma, Andrew Chen, Mikaela Cashman, Erica Teixeira Prates, Veronica Melesse Vergara, Manesh Shah, Austin Clyde, Thomas Brettin, Wibe de Jong, Neeraj Kumar, Martha Head, Rick Stevens, Peter Nugent, Daniel Jacobson, James Brown", "tldr": "", "abstract": "We developed Distilled Graph Attention Policy Network (DGAPN), a reinforcement learning model to generate novel graph-structured chemical representations that optimize user-defined objectives by efficiently navigating a physically constrained domain. The framework is examined on the task of generating molecules that are designed to bind, noncovalently, to functional sites of SARS-CoV-2 proteins. We present a spatial Graph Attention (sGAT) mechanism that leverages self-attention over both node and edge attributes as well as encoding the spatial structure --- this capability is of considerable interest in synthetic biology and drug discovery. 
An attentional policy network is introduced to learn the decision rules for a dynamic, fragment-based chemical environment, and state-of-the-art policy gradient techniques are employed to train the network with stability. Exploration is driven by the stochasticity of the action space design and the innovation reward bonuses learned and proposed by random network distillation. In experiments, our framework achieved outstanding results compared to state-of-the-art algorithms, while reducing the complexity of paths to chemical synthesis.", "keywords": "reinforcement learning;graph neural network;molecule generation;drug discovery;curiosity-driven policy", "primary_area": "", "supplementary_material": "/attachment/edf7f47a13c0bb409641b00bcb5d89024085b72c.zip", "author": "Yulun Wu;Nicholas Choma;Andrew Deru Chen;Mikaela Cashman;Erica Teixeira Prates;Veronica G Melesse Vergara;Manesh B Shah;Austin Clyde;Thomas Brettin;Wibe Albert de Jong;Neeraj Kumar;Martha S Head;Rick L. Stevens;Peter Nugent;Daniel A Jacobson;James B Brown", "authorids": "~Yulun_Wu1;~Nicholas_Choma1;~Andrew_Deru_Chen1;~Mikaela_Cashman1;~Erica_Teixeira_Prates1;~Veronica_G_Melesse_Vergara1;~Manesh_B_Shah1;~Austin_Clyde1;~Thomas_Brettin1;~Wibe_Albert_de_Jong1;~Neeraj_Kumar4;~Martha_S_Head1;~Rick_L._Stevens1;~Peter_Nugent1;~Daniel_A_Jacobson1;~James_B_Brown1", "gender": ";M;;;F;F;M;;M;M;M;F;;M;M;M", "homepage": "https://github.com/yulun-rayn;;;;;https://www.olcf.ornl.gov/directory/staff-member/veronica-melesse-vergara/;https://www.researchgate.net/profile/Manesh-Shah-2;http://www.austinclyde.com;;https://crd.lbl.gov/departments/computational-science/ccmc/staff/staff-members/bert-de-jong/;https://www.pnnl.gov/people/neeraj-kumar-phd;;https://computerscience.uchicago.edu/people/profile/rick-stevens/;https://c3.lbl.gov/nugent/;;https://biosciences.lbl.gov/profiles/ben-brown/", "dblp": ";;;;;;;;;;64/2584;;;;;", "google_scholar": "5QJJxS4AAAAJ;nxto8UUAAAAJ;Sa-HjA0AAAAJ;;NDRCz6sAAAAJ;9RlQp6EAAAAJ;;;Qr7KgXYAAAAJ;3ZXF8aYAAAAJ;https://scholar.google.ca/citations?user=OWEJf5EAAAAJ;;2oSSsLYAAAAJ;xJf5H2AAAAAJ;;", "orcid": ";;;0000-0003-0620-7830;;0000-0002-4333-4145;;;;;;;;0000-0002-3389-0586;0000-0002-9822-8251;", "linkedin": "yu-lun-wu/;;andrewderu/;;;veronicamelessevergara/;;;;wadejong/;;mshead/;;;;", "or_profile": "~Yulun_Wu1;~Nicholas_Choma1;~Andrew_Deru_Chen1;~Mikaela_Cashman1;~Erica_Teixeira_Prates1;~Veronica_G_Melesse_Vergara1;~Manesh_B_Shah1;~Austin_Clyde1;~Thomas_Brettin1;~Wibe_Albert_de_Jong1;~Neeraj_Kumar4;~Martha_S_Head1;~Rick_L._Stevens1;~Peter_Nugent1;~Daniel_A_Jacobson1;~James_B_Brown1", "aff": "University of California, Berkeley;Lawrence Berkeley National Lab;Boston University, Boston University;Oak Ridge National Laboratory;;;Oak Ridge National Laboratory;University of Chicago;Argonne National Laboratory;;Pacific Northwest National Laboratory;Oak Ridge National Laboratory;University of Chicago;;Oak Ridge National Laboratory;Lawrence Berkeley National Lab", "aff_domain": "berkeley.edu;lbl.gov;bu.edu;ornl.gov;;;ornl.gov;uchicago.edu;anl.gov;;pnnl.gov;ornl.gov;uchicago.edu;;ornl.gov;lbl.gov", "position": "PhD student;Researcher;PhD student;Postdoc;;;Research Associate;PhD student;Strategic Program Manager;;Chief Data Scientist;Director;Full Professor;;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nwu2022spatial,\ntitle={Spatial Graph Attention and Curiosity-driven Policy for Antiviral Drug Discovery},\nauthor={Yulun Wu and Nicholas Choma and Andrew Deru Chen and Mikaela Cashman and Erica Teixeira Prates and 
Veronica G Melesse Vergara and Manesh B Shah and Austin Clyde and Thomas Brettin and Wibe Albert de Jong and Neeraj Kumar and Martha S Head and Rick L. Stevens and Peter Nugent and Daniel A Jacobson and James B Brown},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kavTY__jxp}\n}", "github": "", "project": "", "reviewers": "ce6h;ZQrT;wuts", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;4;4", "correctness": "4;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;0;3", "wc_summary_paper": "113;132;61", "wc_summary_review": "62;48;23", "wc_main_review": "323;547;156", "wc_review": "498;727;240", "wc_reply_reviewers": "140;97;13", "wc_reply_authors": "817;804;555", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 102.0, 30.011109054259666 ], "wc_summary_review_avg": [ 44.333333333333336, 16.131404843417148 ], "wc_main_review_avg": [ 342.0, 160.18947114797112 ], "wc_review_avg": [ 488.3333333333333, 198.93438337524483 ], "wc_reply_reviewers_avg": [ 83.33333333333333, 52.740454639257294 ], "wc_reply_authors_avg": [ 725.3333333333334, 120.56072697561544 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 16, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11129949119117457050&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "pdf": "https://openreview.net/pdf?id=kavTY__jxp", "email": "berkeley.edu;lbl.gov;bu.edu;ornl.gov;;;ornl.gov;uchicago.edu;anl.gov;;pnnl.gov;ornl.gov;uchicago.edu;;ornl.gov;lbl.gov", "author_num": 16, "aff_unique_index": "0;1;2;3;3;4;5;6;3;4;3;1", "aff_unique_norm": "University of California, Berkeley;Lawrence Berkeley National Laboratory;Boston University;Oak Ridge National Laboratory;University of Chicago;Argonne National Laboratory;Pacific Northwest National Laboratory", "aff_unique_dep": ";;;;;;", "aff_unique_url": "https://www.berkeley.edu;https://www.lbl.gov;https://www.bu.edu;https://www.ornl.gov;https://www.uchicago.edu;https://www.anl.gov;https://www.pnnl.gov", "aff_unique_abbr": "UC Berkeley;LBNL;BU;ORNL;UChicago;ANL;PNNL", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Berkeley;Boston;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kcadk-DShNO", "title": "Why be adversarial? 
Let's cooperate!: Cooperative Dataset Alignment via JSD Upper Bound", "track": "main", "status": "Reject", "tldr": "", "abstract": "Unsupervised dataset alignment estimates a transformation that maps two or more source domains to a shared aligned domain given only the domain datasets.\nThis task has many applications including generative modeling, unsupervised domain adaptation, and socially aware learning.\nMost prior works use adversarial learning (i.e., min-max optimization), which can be challenging to optimize and evaluate.\nA few recent works explore non-adversarial flow-based (i.e., invertible) approaches, but they lack a unified perspective.\nTherefore, we propose to unify and generalize previous flow-based approaches under a single non-adversarial framework, which we prove is equivalent to minimizing an upper bound on the Jensen-Shannon Divergence (JSD).\nImportantly, our problem reduces to a min-min, i.e., cooperative, problem and can provide a natural evaluation metric for unsupervised dataset alignment.\nWe present empirical results of our framework on both simulated and real-world datasets to demonstrate the benefits of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wonwoong Cho;Ziyu Gong;David I. Inouye", "authorids": "~Wonwoong_Cho1;~Ziyu_Gong1;~David_I._Inouye1", "gender": "M;M;M", "homepage": "https://wonwoongcho.github.io/;;http://davidinouye.com", "dblp": "218/5243;290/1326;76/10817", "google_scholar": "https://scholar.google.com/citations?hl=en;;SVMQ_g4AAAAJ", "orcid": "0000-0003-0898-0341;;", "linkedin": "wonwoong-cho-9730921a0/;ziyu-gong-9700471b8/;", "or_profile": "~Wonwoong_Cho1;~Ziyu_Gong1;~David_I_Inouye1", "aff": "Purdue University;Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu;purdue.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\ncho2022why,\ntitle={Why be adversarial? Let's cooperate!: Cooperative Dataset Alignment via {JSD} Upper Bound},\nauthor={Wonwoong Cho and Ziyu Gong and David I. 
Inouye},\nyear={2022},\nurl={https://openreview.net/forum?id=kcadk-DShNO}\n}", "github": "", "project": "", "reviewers": "sZ2C;fTCq;B4sF", "site": "https://openreview.net/forum?id=kcadk-DShNO", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "18;84;75", "wc_summary_review": "28;19;56", "wc_main_review": "182;267;205", "wc_review": "228;370;336", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1909;836;922", "reply_reviewers": "0;0;0", "reply_authors": "4;2;3", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 59.0, 29.223278392404914 ], "wc_summary_review_avg": [ 34.333333333333336, 15.755069730795297 ], "wc_main_review_avg": [ 218.0, 35.89800365851375 ], "wc_review_avg": [ 311.3333333333333, 60.53832578531462 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1222.3333333333333, 486.8143611503488 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3lM8wcdwEJ0J:scholar.google.com/&scioq=Why+be+adversarial%3F+Let%27s+cooperate!:+Cooperative+Dataset+Alignment+via+JSD+Upper+Bound&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "kcrIligNnl", "title": "Direct Molecular Conformation Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Molecular conformation generation, which is to generate 3 dimensional coordinates of all the atoms in a molecule, is an important task for bioinformatics and pharmacology. Most existing machine learning based methods first predict interatomic distances and then generate conformations based on them. This two-stage approach has a potential limitation that the predicted distances may conflict with each other, e.g., violating the triangle inequality. In this work, we propose a method that directly outputs the coordinates of atoms, so that there is no violation of constraints. The conformation generator of our method stacks multiple blocks, and each block outputs a conformation which is then refined by the following block. We adopt the variational auto-encoder (VAE) framework and use a latent variable to generate diverse conformations. To handle the roto-translation equivariance, we adopt a loss that is invariant to rotation and translation of molecule coordinates, by computing the minimal achievable distance after any rotation and translation. Our method outperforms strong baselines on four public datasets, which shows the effectiveness of our method and the great potential of the direct approach. The code is released at \\url{https://github.com/DirectMolecularConfGen/DMCG}. 
", "keywords": "Molecular Conformation Generation", "primary_area": "", "supplementary_material": "", "author": "Jinhua Zhu;Yingce Xia;Chang Liu;Lijun Wu;Shufang Xie;Wengang Zhou;Tao Qin;Houqiang Li;Tie-Yan Liu", "authorids": "~Jinhua_Zhu1;~Yingce_Xia1;~Chang_Liu10;~Lijun_Wu1;~Shufang_Xie1;~Wengang_Zhou1;~Tao_Qin1;~Houqiang_Li1;~Tie-Yan_Liu1", "gender": "M;M;M;M;M;M;M;M;M", "homepage": "https://github.com/teslacool;https://www.microsoft.com/en-us/research/people/yinxia/;https://changliu00.github.io/;https://apeterswu.github.io/;;http://staff.ustc.edu.cn/~zhwg/index.html;https://www.microsoft.com/en-us/research/people/taoqin/;https://staff.ustc.edu.cn/~lihq/;http://member.acm.org/~tieyanliu", "dblp": "18/1965-1;http://dblp.uni-trier.de/pers/hd/x/Xia:Yingce;52/5716-30;68/1284-3;https://dblp.uni-trier.de/pid/163/2704-3;22/4544-1;14/6841;59/7017.html;l/TieYanLiu", "google_scholar": "https://scholar.google.com.hk/citations?user=FvGy0LQAAAAJ;GS5wRxYAAAAJ;rYd0GEsAAAAJ;https://scholar.google.com/citations?hl=en;;8s1JF8YAAAAJ;Bl4SRU0AAAAJ;7sFMIKoAAAAJ;Nh832fgAAAAJ", "orcid": "0000-0003-2157-9077;;0000-0001-5207-5440;0000-0002-3530-590X;;0000-0003-1690-9836;;0000-0003-2188-3028;0000-0002-0476-8020", "linkedin": ";;chang-liu-9ab479168/;lijun-wu-59340478/;;;;;", "or_profile": "~Jinhua_Zhu1;~Yingce_Xia1;~Chang_Liu10;~Lijun_Wu1;~Shufang_Xie1;~Wengang_Zhou1;~Tao_Qin1;~Houqiang_Li1;~Tie-Yan_Liu1", "aff": "University of Science and Technology of China;Microsoft;Microsoft;Microsoft Research;Renmin University of China;University of Science and Technology of China;Microsoft Research Asia;University of Science and Technology of China;Microsoft", "aff_domain": "ustc.edu.cn;microsoft.com;microsoft.com;microsoft.com;ruc.edu.cn;ustc.edu.cn;microsoft.com;ustc.edu.cn;microsoft.com", "position": "PhD student;Researcher;Researcher;Researcher;PhD student;Full Professor;Principal Researcher;Professor;Distinguished Scientist", "bibtex": "@misc{\nzhu2022direct,\ntitle={Direct Molecular Conformation Generation},\nauthor={Jinhua Zhu and Yingce Xia and Chang Liu and Lijun Wu and Shufang Xie and Wengang Zhou and Tao Qin and Houqiang Li and Tie-Yan Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=kcrIligNnl}\n}", "github": "", "project": "", "reviewers": "P4XU;rZ2g;T1Ea;Y6xJ", "site": "https://openreview.net/forum?id=kcrIligNnl", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;5;3;5", "correctness": "3;2;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;0;2;3", "wc_summary_paper": "76;66;66;85", "wc_summary_review": "74;37;53;60", "wc_main_review": "583;359;302;200", "wc_review": "733;462;421;345", "wc_reply_reviewers": "386;37;26;0", "wc_reply_authors": "2240;978;658;259", "reply_reviewers": "1;1;1;0", "reply_authors": "6;4;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 73.25, 7.917543811056558 ], "wc_summary_review_avg": [ 56.0, 13.322912594474229 ], "wc_main_review_avg": [ 361.0, 140.2586895703792 ], "wc_review_avg": [ 490.25, 146.3034090511906 ], "wc_reply_reviewers_avg": [ 112.25, 158.6196315088394 ], "wc_reply_authors_avg": [ 1033.75, 741.5478322401058 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 2.1213203435596424 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.40824829046386296, 
"corr_recommendation_correctness": 0.28867513459481287, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5769079079964389207&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;1;2;0;1;0;1", "aff_unique_norm": "University of Science and Technology of China;Microsoft;Renmin University of China", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com;http://www.ruc.edu.cn", "aff_unique_abbr": "USTC;Microsoft;RUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;1;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Graph-Relational Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7145", "id": "kcwyXtt7yDJ", "poster": "", "openreview": "https://openreview.net/forum?id=kcwyXtt7yDJ", "slides": "https://iclr.cc/virtual/2022/poster/7145", "video": "https://iclr.cc/virtual/2022/poster/7145", "author_site": "Zihao Xu, Hao He, Guang-He Lee, Bernie Wang, Hao Wang", "tldr": "", "abstract": "Existing domain adaptation methods tend to treat every domain equally and align them all perfectly. Such uniform alignment ignores topological structures among different domains; therefore it may be beneficial for nearby domains, but not necessarily for distant domains. In this work, we relax such uniform alignment by using a domain graph to encode domain adjacency, e.g., a graph of states in the US with each state as a domain and each edge indicating adjacency, thereby allowing domains to align flexibly based on the graph structure. We generalize the existing adversarial learning framework with a novel graph discriminator using encoding-conditioned graph embeddings. Theoretical analysis shows that at equilibrium, our method recovers classic domain adaptation when the graph is a clique, and achieves non-trivial alignment for other types of graphs. 
Empirical results show that our approach successfully generalizes uniform alignment, naturally incorporates domain information represented by graphs, and improves upon existing domain adaptation methods on both synthetic and real-world datasets.", "keywords": "Graphs;Network Topology;Transfer Learning;Domain Adaptation;Adversarial Learning", "primary_area": "", "supplementary_material": "/attachment/fc5015d5bf007b274b411fd3ff306a96395d676f.zip", "author": "Zihao Xu;Hao He;Guang-He Lee;Bernie Wang;Hao Wang", "authorids": "~Zihao_Xu2;~Hao_He1;~Guang-He_Lee1;~Bernie_Wang1;~Hao_Wang3", "gender": "M;M;M;M;M", "homepage": "https://shsjxzh.github.io/;http://people.csail.mit.edu/hehaodele;https://guanghelee.github.io;http://web.mit.edu/~ywang02/www/;http://www.wanghao.in", "dblp": ";;https://dblp.org/pers/hd/l/Lee:Guang=He;43/8355-1;w/HaoWang-14", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;IKUm624AAAAJ;NrOA9QoAAAAJ", "orcid": ";;0000-0001-6561-0692;0000-0002-0291-7184;", "linkedin": ";;;;", "or_profile": "~Zihao_Xu2;~Hao_He1;~Guang-He_Lee1;~Bernie_Wang1;~Hao_Wang4", "aff": "Rutgers University;Massachusetts Institute of Technology;;Amazon;Rutgers University", "aff_domain": "rutgers.edu;mit.edu;;amazon.com;cs.rutgers.edu", "position": "PhD student;PhD student;;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nxu2022graphrelational,\ntitle={Graph-Relational Domain Adaptation},\nauthor={Zihao Xu and Hao He and Guang-He Lee and Bernie Wang and Hao Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kcwyXtt7yDJ}\n}", "github": "", "project": "", "reviewers": "rNQp;uDYW;n4Lk", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "5;3;4", "correctness": "3;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "2;3;0", "wc_summary_paper": "50;81;69", "wc_summary_review": "33;16;28", "wc_main_review": "231;174;187", "wc_review": "314;271;284", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "976;657;563", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 66.66666666666667, 12.762793146051099 ], "wc_summary_review_avg": [ 25.666666666666668, 7.133644853010899 ], "wc_main_review_avg": [ 197.33333333333334, 24.390344173235622 ], "wc_review_avg": [ 289.6666666666667, 18.00617178142601 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 732.0, 176.75029467207875 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14268209839215754091&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=kcwyXtt7yDJ", "email": "rutgers.edu;mit.edu;;amazon.com;cs.rutgers.edu", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Rutgers University;Massachusetts Institute of Technology;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.rutgers.edu;https://web.mit.edu;https://www.amazon.com", 
"aff_unique_abbr": "Rutgers;MIT;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "keQjAwuC7j-", "title": "Two Birds, One Stone: Achieving both Differential Privacy and Certified Robustness for Pre-trained Classifiers via Input Perturbation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent studies have shown that pre-trained classifiers are increasingly powerful to improve the performance on different tasks, e.g, neural language processing, image classification. However, adversarial examples from attackers can trick pre-trained classifiers to misclassify. To solve this challenge, a reconstruction network is built before the public pre-trained classifiers to offer certified robustness and defend against adversarial examples through input perturbation. On the other hand, the reconstruction network requires training on the dataset, which incurs privacy leakage of training data through inference attacks. To prevent this leakage, differential privacy (DP) is applied to offer a provable privacy guarantee on training data through gradient perturbation. Most existing works employ certified robustness and DP independently and fail to exploit the fact that input perturbation designed to achieve certified robustness can achieve (partial) DP. In this paper, we propose perturbation transformation to show how the input perturbation designed for certified robustness can be transformed into gradient perturbation during training. We propose Multivariate Gaussian mechanism to analyze the privacy guarantee of this transformed gradient perturbation and precisely quantify the level of DP achieved by input perturbation. To satisfy the overall DP requirement, we add additional gradient perturbation during training and propose Mixed Multivariate Gaussian Analysis to analyze the privacy guarantee provided by the transformed gradient perturbation and additional gradient perturbation. Moreover, we prove that Mixed Multivariate Gaussian Analysis can work with moments accountant to provide a tight DP estimation. 
Extensive experiments on benchmark datasets show that our framework significantly outperforms state-of-the-art methods and achieves better accuracy and robustness under the same privacy guarantee.", "keywords": "Differential Privacy;Certified Robustness;Pre-trained Classifiers;Input Perturbation", "primary_area": "", "supplementary_material": "/attachment/7103b5dfb0e5eed269234f85e354b24debac242d.zip", "author": "Pengfei Tang;Wenjie Wang;Xiaolan Gu;Jian Lou;Li Xiong;Ming Li", "authorids": "~Pengfei_Tang1;wang.wenjie@emory.edu;~Xiaolan_Gu1;~Jian_Lou2;~Li_Xiong1;~Ming_Li8", "gender": "M;;F;;;", "homepage": "https://www.linkedin.com/in/pengfei-tang-347a734b/;;https://xiaolangu.github.io/;https://sites.google.com/view/jianlou;http://www.cs.emory.edu/~lxiong/;", "dblp": ";;;05/4625-1;39/3530-1.html;", "google_scholar": ";;Lz1WvxEAAAAJ;;jJ8BLgsAAAAJ;", "orcid": ";;;0000-0002-4110-2068;0000-0001-7354-0428;", "linkedin": ";;;;li-xiong-32472513/;", "or_profile": "~Pengfei_Tang1;wang.wenjie@emory.edu;~Xiaolan_Gu1;~Jian_Lou2;~Li_Xiong1;~Ming_Li8", "aff": "Emory University;;University of Arizona;www.hoiying.net;Emory University;", "aff_domain": "emory.edu;;arizona.edu;hoiying.net;emory.edu;", "position": "PhD student;;PhD student;Researcher;Professor;", "bibtex": "@misc{\ntang2022two,\ntitle={Two Birds, One Stone: Achieving both Differential Privacy and Certified Robustness for Pre-trained Classifiers via Input Perturbation},\nauthor={Pengfei Tang and Wenjie Wang and Xiaolan Gu and Jian Lou and Li Xiong and Ming Li},\nyear={2022},\nurl={https://openreview.net/forum?id=keQjAwuC7j-}\n}", "github": "", "project": "", "reviewers": "s8jm;wLgg;t4Xc;K9zu", "site": "https://openreview.net/forum?id=keQjAwuC7j-", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;3;4", "correctness": "2;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "261;79;94;72", "wc_summary_review": "41;50;101;26", "wc_main_review": "837;484;299;199", "wc_review": "1139;613;494;297", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "486;388;692;301", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 126.5, 78.05927235120758 ], "wc_summary_review_avg": [ 54.5, 28.182441342083905 ], "wc_main_review_avg": [ 454.75, 243.22661758121788 ], "wc_review_avg": [ 635.75, 311.6980710559499 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 466.75, 145.5873878466126 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9551743613188770919&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Emory University;University of Arizona;Hoiying Limited", "aff_unique_dep": ";;", "aff_unique_url": "https://www.emory.edu;https://www.arizona.edu;http://www.hoiying.net", "aff_unique_abbr": "Emory;UA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "keeCvPPd3vL", "title": "Improved Image Generation via 
Sparsity", "track": "main", "status": "Reject", "tldr": "", "abstract": "The interest of the deep learning community in image synthesis has grown massively in recent years. Nowadays, deep generative methods, and especially Generative Adversarial Networks (GANs), are leading to state-of-the-art performance, capable of synthesizing images that appear realistic. While the efforts for improving the quality of the generated images are extensive, most attempts still consider the generator part as an uncorroborated ``black-box''. In this paper, we aim to provide a better understanding and design of the image generation process. We interpret existing generators as implicitly relying on sparsity-inspired models. More specifically, we show that generators can be viewed as manifestations of the Convolutional Sparse Coding (CSC) and its Multi-Layered version (ML-CSC) synthesis processes. We leverage this observation by explicitly enforcing a sparsifying regularization on appropriately chosen activation layers in the generator, and demonstrate that this leads to improved image synthesis. Furthermore, we show that the same rationale and benefits apply to generators serving inverse problems, demonstrated on the Deep Image Prior (DIP) method.", "keywords": "Sparse Modeling;Image Generation", "primary_area": "", "supplementary_material": "", "author": "Roy Ganz;Michael Elad", "authorids": "~Roy_Ganz1;~Michael_Elad1", "gender": "M;M", "homepage": "https://royg27.github.io/;https://elad.cs.technion.ac.il/", "dblp": "289/5822;e/MichaelElad", "google_scholar": "2E0FHMoAAAAJ;UpZbV44AAAAJ", "orcid": ";0000-0001-8131-6928", "linkedin": "roy-ganz-270592/;michael-elad-5553852a3/", "or_profile": "~Roy_Ganz1;~Michael_Elad1", "aff": "Technion, Technion;Verily", "aff_domain": "technion.ac.il;verily.com", "position": "MS student;Principal Researcher", "bibtex": "@misc{\nganz2022improved,\ntitle={Improved Image Generation via Sparsity},\nauthor={Roy Ganz and Michael Elad},\nyear={2022},\nurl={https://openreview.net/forum?id=keeCvPPd3vL}\n}", "github": "", "project": "", "reviewers": "tsoA;ob62;4xX8;7WVd", "site": "https://openreview.net/forum?id=keeCvPPd3vL", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "66;57;102;74", "wc_summary_review": "101;88;44;222", "wc_main_review": "860;333;203;434", "wc_review": "1027;478;349;730", "wc_reply_reviewers": "212;332;0;188", "wc_reply_authors": "350;768;272;441", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 74.75, 16.843025262701474 ], "wc_summary_review_avg": [ 113.75, 65.97111110175423 ], "wc_main_review_avg": [ 457.5, 246.3884128769046 ], "wc_review_avg": [ 646.0, 259.15728814756494 ], "wc_reply_reviewers_avg": [ 183.0, 118.90752709563849 ], "wc_reply_authors_avg": [ 457.75, 188.8443472810346 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12032778295155075806&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Technion - Israel Institute of Technology;Verily", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.verily.com", "aff_unique_abbr": "Technion;Verily", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Israel;United States" }, { "title": "Clean Images are Hard to Reblur: Exploiting the Ill-Posed Inverse Task for Dynamic Scene Deblurring", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6155", "id": "kezNJydWvE", "poster": "", "openreview": "https://openreview.net/forum?id=kezNJydWvE", "slides": "https://iclr.cc/virtual/2022/poster/6155", "video": "https://iclr.cc/virtual/2022/poster/6155", "author_site": "Seungjun Nah, Sanghyun Son, Jaerin Lee, Kyoung Mu Lee", "tldr": "", "abstract": "The goal of dynamic scene deblurring is to remove the motion blur in a given image. Typical learning-based approaches implement their solutions by minimizing the L1 or L2 distance between the output and the reference sharp image. Recent attempts adopt visual recognition features in training to improve the perceptual quality. However, those features are primarily designed to capture high-level contexts rather than low-level structures such as blurriness. Instead, we propose a more direct way to make images sharper by exploiting the inverse task of deblurring, namely, reblurring. Reblurring amplifies the remaining blur to rebuild the original blur, however, a well-deblurred clean image with zero-magnitude blur is hard to reblur. Thus, we design two types of reblurring loss functions for better deblurring. The supervised reblurring loss at training stage compares the amplified blur between the deblurred and the sharp images. The self-supervised reblurring loss at inference stage inspects if noticeable blur remains in the deblurred. 
Our experimental results on large-scale benchmarks and real images demonstrate the effectiveness of the reblurring losses in improving the perceptual quality of the deblurred images in terms of NIQE and LPIPS scores as well as visual sharpness.", "keywords": "Deblur;Reblur;Loss;Test-time adaptation;Self-supervised", "primary_area": "", "supplementary_material": "/attachment/a879a11996948496162b67206db66b6fc0c6357f.zip", "author": "Seungjun Nah;Sanghyun Son;Jaerin Lee;Kyoung Mu Lee", "authorids": "~Seungjun_Nah1;~Sanghyun_Son1;~Jaerin_Lee1;~Kyoung_Mu_Lee2", "gender": "M;M;M;M", "homepage": "https://seungjunnah.github.io/;https://sanghyun-son.github.io/;https://jaerinlee.com;https://cv.snu.ac.kr/kmlee/", "dblp": "177/3248;68/6424;275/3156;17/4029", "google_scholar": "https://scholar.google.co.kr/citations?user=hEr2AKsAAAAJ;https://scholar.google.co.kr/citations?user=nWaSdu0AAAAJ;LMMwadAAAAAJ;Hofj9kAAAAAJ", "orcid": "0000-0003-3971-9402;;0009-0005-5271-8270;", "linkedin": "seungjun-nah-956432139/;;jaerin-lee-626ab1b1/;", "or_profile": "~Seungjun_Nah1;~Sanghyun_Son1;~Jaerin_Lee1;~Kyoung_Mu_Lee1", "aff": "NVIDIA;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "nvidia.com;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "Research Scientist;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nnah2022clean,\ntitle={Clean Images are Hard to Reblur: Exploiting the Ill-Posed Inverse Task for Dynamic Scene Deblurring},\nauthor={Seungjun Nah and Sanghyun Son and Jaerin Lee and Kyoung Mu Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kezNJydWvE}\n}", "github": "", "project": "", "reviewers": "Gxxe;oSPE;M9xc", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "2;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "0;3;3", "wc_summary_paper": "126;293;97", "wc_summary_review": "54;153;45", "wc_main_review": "662;757;236", "wc_review": "842;1203;378", "wc_reply_reviewers": "0;148;16", "wc_reply_authors": "1174;1437;254", "reply_reviewers": "0;1;1", "reply_authors": "2;3;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 172.0, 86.3751507475771 ], "wc_summary_review_avg": [ 84.0, 48.92851929090027 ], "wc_main_review_avg": [ 551.6666666666666, 226.55438395424815 ], "wc_review_avg": [ 807.6666666666666, 337.67867698285136 ], "wc_reply_reviewers_avg": [ 54.666666666666664, 66.31909394904474 ], "wc_reply_authors_avg": [ 955.0, 507.1771551111768 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9819805060619659, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16355346074292084975&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=kezNJydWvE", "email": "nvidia.com;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "NVIDIA;Seoul National University", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://www.snu.ac.kr", "aff_unique_abbr": "NVIDIA;SNU", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;South Korea" }, { "id": "kfug4WKP_Jq", "title": "SS-MAIL: Self-Supervised Multi-Agent Imitation Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The current landscape of multi-agent expert imitation is broadly dominated by two families of algorithms - Behavioral Cloning (BC) and Adversarial Imitation Learning (AIL). BC approaches suffer from compounding errors, as they ignore the sequential decision-making nature of the trajectory generation problem. Furthermore, they cannot effectively model multi-modal behaviors. While AIL methods solve the issue of compounding errors and multi-modal policy training, they are plagued with instability in their training dynamics. In this work, we address this issue by introducing a novel self-supervised loss that encourages the discriminator to approximate a richer reward function. We employ our method to train a graph-based multi-agent actor-critic architecture that learns a centralized policy, conditioned on a learned latent interaction graph. We show that our method (SS-MAIL) outperforms prior state-of-the-art methods on real-world prediction tasks, as well as on custom-designed synthetic experiments. We prove that SS-MAIL is part of the family of AIL methods by providing a theoretical connection to cost-regularized apprenticeship learning. Moreover, we leverage the self-supervised formulation to introduce a novel teacher forcing-based curriculum (Trajectory Forcing) that improves sample efficiency by progressively increasing the length of the generated trajectory. The SS-MAIL framework improves multi-agent imitation capabilities by stabilizing the policy training, improving the reward shaping capabilities, as well as providing the ability for modeling multi-modal trajectories.", "keywords": "Imitation Learning;Generative Models;Multi-Agent;Time-Series Prediction", "primary_area": "", "supplementary_material": "/attachment/6267e1b78c38c97901d707a18be5dc874e36c28e.zip", "author": "Akshay Dharmavaram;Tejus Gupta;Jiachen Li;Katia P. Sycara", "authorids": "~Akshay_Dharmavaram1;~Tejus_Gupta1;~Jiachen_Li1;~Katia_P._Sycara1", "gender": "M;;M;F", "homepage": "https://akshayd.com;https://tejus-gupta.github.io/;https://jiachenli94.github.io/;", "dblp": ";228/4735;137/8316-1.html;s/KatiaPSycara", "google_scholar": ";4I5IIMgAAAAJ;1_f79vUAAAAJ;VWv6a9kAAAAJ", "orcid": ";;;", "linkedin": ";;jiachen-li/;", "or_profile": "~Akshay_Dharmavaram1;~Tejus_Gupta1;~Jiachen_Li1;~Katia_P._Sycara1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Stanford University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;stanford.edu;cmu.edu", "position": "MS student;PhD student;Postdoc;Full Professor", "bibtex": "@misc{\ndharmavaram2022ssmail,\ntitle={{SS}-{MAIL}: Self-Supervised Multi-Agent Imitation Learning},\nauthor={Akshay Dharmavaram and Tejus Gupta and Jiachen Li and Katia P. 
Sycara},\nyear={2022},\nurl={https://openreview.net/forum?id=kfug4WKP_Jq}\n}", "github": "", "project": "", "reviewers": "bwwh;3SbE;QwuG", "site": "https://openreview.net/forum?id=kfug4WKP_Jq", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "3;4;2", "correctness": "2;2;2", "technical_novelty": "3;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "131;58;77", "wc_summary_review": "58;66;35", "wc_main_review": "480;442;665", "wc_review": "669;566;777", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 88.66666666666667, 30.922843048824312 ], "wc_summary_review_avg": [ 53.0, 13.140268896284683 ], "wc_main_review_avg": [ 529.0, 97.40978732482002 ], "wc_review_avg": [ 670.6666666666666, 86.14845068575265 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xuEHhja6MhoJ:scholar.google.com/&scioq=SS-MAIL:+Self-Supervised+Multi-Agent+Imitation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Carnegie Mellon University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.stanford.edu", "aff_unique_abbr": "CMU;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "kiNEOCSEzt", "title": "Estimating and Penalizing Induced Preference Shifts in Recommender Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "The actions that a recommender system (RS) takes -- the content it exposes users to -- influence the preferences users have over what content they want. Therefore, when an RS designer chooses which system to deploy, they are implicitly \\emph{choosing how to shift} or influence user preferences. Even more, if the RS is trained via long-horizon optimization (e.g., reinforcement learning), it will have incentives to manipulate preferences, i.e., to shift them so they are easier to satisfy, and thus conducive to higher reward. While some work has argued for making systems myopic to avoid this issue, myopic systems can still influence preferences in undesirable ways. In this work, we argue that we need to enable system designers to \\textit{estimate} the shifts an RS \\emph{would} induce; \\textit{evaluate}, before deployment, whether the shifts are undesirable; and even \\textit{actively optimize} to avoid such shifts. 
These steps involve two challenging ingredients: \\emph{estimation} requires the ability to anticipate how hypothetical policies would influence user preferences if deployed -- we do this by training a user predictive model that implicitly contains their preference dynamics from historical user interaction data; \\emph{evaluation} and \\emph{optimization} additionally require metrics to assess whether such influences are manipulative or otherwise unwanted -- we introduce the notion of \u201csafe shifts\u201d, that define a trust region within which behavior is believed to be safe. We show that recommender systems that optimize for staying in the trust region can avoid manipulative behaviors (e.g., changing preferences in ways that make users more predictable), while still generating engagement.", "keywords": "recommender systems;preference shift;preference estimation;preference tampering", "primary_area": "", "supplementary_material": "", "author": "Micah Carroll;Dylan Hadfield-Menell;Stuart Russell;Anca Dragan", "authorids": "~Micah_Carroll1;~Dylan_Hadfield-Menell2;~Stuart_Russell1;~Anca_Dragan1", "gender": "M;M;M;F", "homepage": "https://micahcarroll.github.io/;http://people.csail.mit.edu/dhm/;https://people.eecs.berkeley.edu/~russell/;http://www.ancadragan.com/", "dblp": "250/9080;135/8332;;", "google_scholar": "MeNbzgIAAAAJ;4mVPFQ8AAAAJ;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;", "orcid": "0000-0002-0716-8071;0000-0002-6168-4763;;", "linkedin": "micah-carroll/;;;", "or_profile": "~Micah_Carroll1;~Dylan_Hadfield-Menell2;~Stuart_Russell1;~Anca_Dragan1", "aff": "University of California, Berkeley;Massachusetts Institute of Technology;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;mit.edu;berkeley.edu;berkeley.edu", "position": "PhD student;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@misc{\ncarroll2022estimating,\ntitle={Estimating and Penalizing Induced Preference Shifts in Recommender Systems},\nauthor={Micah Carroll and Dylan Hadfield-Menell and Stuart Russell and Anca Dragan},\nyear={2022},\nurl={https://openreview.net/forum?id=kiNEOCSEzt}\n}", "github": "", "project": "", "reviewers": "kWQ2;HPbv;krSy;vNt7", "site": "https://openreview.net/forum?id=kiNEOCSEzt", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;3;4", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "64;131;152;51", "wc_summary_review": "42;18;87;35", "wc_main_review": "331;225;289;364", "wc_review": "437;374;528;450", "wc_reply_reviewers": "85;27;0;0", "wc_reply_authors": "1145;545;1093;0", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;2;0", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 99.5, 42.8981351576033 ], "wc_summary_review_avg": [ 45.5, 25.5 ], "wc_main_review_avg": [ 302.25, 51.92001059321926 ], "wc_review_avg": [ 447.25, 54.76940295456944 ], "wc_reply_reviewers_avg": [ 28.0, 34.70590727815655 ], "wc_reply_authors_avg": [ 695.75, 465.4102357060919 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.82915619758885 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 59, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=3587500308278200758&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "UC Berkeley;MIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "kiwu8tcVf38", "title": "Momentum as Variance-Reduced Stochastic Gradient", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Stochastic gradient descent with momentum (SGD+M) is widely used to empirically improve the convergence behavior and the generalization performance of plain stochastic gradient descent (SGD) in the training of deep learning models, but our theoretical understanding for SGD+M is still very limited. Contrary to the conventional wisdom that sees the momentum in SGD+M as a way to extrapolate the iterates, this work provides an alternative view that interprets the momentum in SGD+M as a (biased) variance-reduced stochastic gradient. We rigorously prove that the momentum in SGD+M converges to the real gradient, with the variance vanishing asymptotically. This reduced variance in gradient estimation thus provides better convergence behavior and opens up a different path for future analyses of momentum methods. Because the reduction of the variance in the momentum requires neither a finite-sum structure in the objective function nor complicated hyperparameters to tune, SGD+M works on complicated deep learning models with possible involvement of data augmentation and dropout, on which many other variance reduction methods fail.\n", "keywords": "momentum;variance reduction", "primary_area": "", "supplementary_material": "", "author": "Zih-Syuan Huang;Ching-pei Lee", "authorids": "zihsyuan@stat.sinica.edu.tw;~Ching-pei_Lee2", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "zihsyuan@stat.sinica.edu.tw;~Ching-pei_Lee2", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhuang2022momentum,\ntitle={Momentum as Variance-Reduced Stochastic Gradient},\nauthor={Zih-Syuan Huang and Ching-pei Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=kiwu8tcVf38}\n}", "github": "", "project": "", "reviewers": "vHQi;DsKQ;qiFA;3jaD;Gf96", "site": "https://openreview.net/forum?id=kiwu8tcVf38", "pdf_size": 0, "recommendation": "1;3;3;3;3", "confidence": "3;4;5;3;4", "correctness": "1;3;3;3;4", "technical_novelty": "2;2;1;2;3", "empirical_novelty": "1;2;1;1;1", "wc_summary_paper": "99;42;31;47;36", "wc_summary_review": "306;42;30;63;31", "wc_main_review": "207;222;131;411;240", "wc_review": "612;306;192;521;307", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 2.6, 0.8000000000000002 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 2.8, 0.9797958971132712 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 1.2, 0.4000000000000001 ], "wc_summary_paper_avg": [ 51.0, 24.60081299469593 ], "wc_summary_review_avg": [ 94.4, 106.4642663056483 ], "wc_main_review_avg": [ 242.2, 92.20715807354655 ], "wc_review_avg": [ 387.6, 154.6384169603401 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], 
"reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5345224838248487, "corr_recommendation_correctness": 0.9185586535436916, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pJJnPx4FfrQJ:scholar.google.com/&scioq=Momentum+as+Variance-Reduced+Stochastic+Gradient&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Discriminative Similarity for Data Clustering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6248", "id": "kj0_45Y4r9i", "poster": "", "openreview": "https://openreview.net/forum?id=kj0_45Y4r9i", "slides": "https://iclr.cc/virtual/2022/poster/6248", "video": "https://iclr.cc/virtual/2022/poster/6248", "author_site": "Yingzhen Yang, Ping Li", "tldr": "", "abstract": "Similarity-based clustering methods separate data into clusters according to the pairwise similarity between the data, and the pairwise similarity is crucial for their performance. In this paper, we propose {\\em Clustering by Discriminative Similarity (CDS)}, a novel method which learns discriminative similarity for data clustering. CDS learns an unsupervised similarity-based classifier from each data partition, and searches for the optimal partition of the data by minimizing the generalization error of the learnt classifiers associated with the data partitions. By generalization analysis via Rademacher complexity, the generalization error bound for the unsupervised similarity-based classifier is expressed as the sum of discriminative similarity between the data from different classes. It is proved that the derived discriminative similarity can also be induced by the integrated squared error bound for kernel density classification. 
In order to evaluate the performance of the proposed discriminative similarity, we propose a new clustering method using a kernel as the similarity function, CDS via unsupervised kernel classification (CDSK), with its effectiveness demonstrated by experimental results.", "keywords": "Discriminative Similarity;Rademacher Complexity;Generalization Bound;Data Clustering", "primary_area": "", "supplementary_material": "", "author": "Yingzhen Yang;Ping Li", "authorids": "~Yingzhen_Yang1;~Ping_Li3", "gender": "M;M", "homepage": "http://yingzhenyang.com;http://www.stat.rutgers.edu/home/pingli/", "dblp": "66/3838.html;62/5860-1", "google_scholar": ";", "orcid": ";", "linkedin": "yingzhen-yang-9b869122;", "or_profile": "~Yingzhen_Yang1;~Ping_Li3", "aff": "Arizona State University;LinkedIn", "aff_domain": "asu.edu;linkedin.com", "position": "Assistant Professor;Engineer", "bibtex": "@inproceedings{\nyang2022discriminative,\ntitle={Discriminative Similarity for Data Clustering},\nauthor={Yingzhen Yang and Ping Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=kj0_45Y4r9i}\n}", "github": "", "project": "", "reviewers": "TUzr;AedU;qsFw;dtfu", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;4;5", "correctness": "2;3;4;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "94;30;88;111", "wc_summary_review": "8;25;63;49", "wc_main_review": "138;221;417;178", "wc_review": "240;276;568;338", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;300;2172;96", "reply_reviewers": "0;0;0;0", "reply_authors": "0;1;4;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 80.75, 30.49077729412617 ], "wc_summary_review_avg": [ 36.25, 21.22940178149163 ], "wc_main_review_avg": [ 238.5, 107.15526118674715 ], "wc_review_avg": [ 355.5, 127.59604225837101 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 642.0, 889.9640442175178 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 1.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.899228803025897, "corr_recommendation_correctness": 0.7608859102526822, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13759944954428724758&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=kj0_45Y4r9i", "email": "asu.edu;linkedin.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Arizona State University;LinkedIn Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.linkedin.com", "aff_unique_abbr": "ASU;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "kj8TBnJ0SXh", "title": "FaceDet3D: Facial Expressions with 3D Geometric Detail Hallucination", "track": "main", "status": "Reject", "tldr": "", "abstract": "Facial Expressions induce a variety of high-level details on the 3D face geometry. For example, a smile causes the wrinkling of cheeks or the formation of dimples, while being angry often causes wrinkling of the forehead. 
3D Morphable Models (3DMMs) of the human face fail to capture such fine details in their PCA-based representations and consequently cannot generate such details when used to edit expressions. In this work, we introduce FaceDet3D, a method that generates - from a single image - geometric facial details that are consistent with any desired target expression. The facial details are represented as a vertex displacement map and then used by a Neural Renderer to photo-realistically render novel images of any single image in any desired expression and view. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c0a9e142d5a0068d974842f3b8a70808c6e3caaa.zip", "author": "ShahRukh Athar;Albert Pumarola;Francesc Moreno-noguer;Dimitris Samaras", "authorids": "~ShahRukh_Athar1;~Albert_Pumarola2;~Francesc_Moreno-noguer2;~Dimitris_Samaras3", "gender": ";;M;M", "homepage": "http://shahrukhathar.github.io/;;https://www.cs.stonybrook.edu/~samaras/;https://www.albertpumarola.com/", "dblp": "79/9032;;s/DimitrisSamaras;138/0487", "google_scholar": "mdUv8wcAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.es/citations?user=PtwauFAAAAAJ", "orcid": ";;0000-0002-1373-0294;", "linkedin": ";;;", "or_profile": "~ShahRukh_Athar1;~Francesc_Moreno-noguer2;~Dimitris_Samaras3;~Albert_Pumarola_Peris1", "aff": "State University of New York, Stony Brook;;Stony Brook University;Meta Facebook", "aff_domain": "stonybrook.edu;;cs.stonybrook.edu;fb.com", "position": "PhD student;;Full Professor;Researcher", "bibtex": "@misc{\nathar2022facedetd,\ntitle={FaceDet3D: Facial Expressions with 3D Geometric Detail Hallucination},\nauthor={ShahRukh Athar and Albert Pumarola and Francesc Moreno-noguer and Dimitris Samaras},\nyear={2022},\nurl={https://openreview.net/forum?id=kj8TBnJ0SXh}\n}", "github": "", "project": "", "reviewers": "6mwj;L964;eEoU;S1aQ", "site": "https://openreview.net/forum?id=kj8TBnJ0SXh", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;0;1;2", "wc_summary_paper": "48;156;98;274", "wc_summary_review": "26;66;66;122", "wc_main_review": "172;330;265;448", "wc_review": "246;552;429;844", "wc_reply_reviewers": "0;0;37;0", "wc_reply_authors": "82;274;323;268", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 144.0, 84.22588675698226 ], "wc_summary_review_avg": [ 70.0, 34.17601498127012 ], "wc_main_review_avg": [ 303.75, 100.4449476081301 ], "wc_review_avg": [ 517.75, 217.56421465856926 ], "wc_reply_reviewers_avg": [ 9.25, 16.021469970012117 ], "wc_reply_authors_avg": [ 236.75, 91.85688596942529 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LHcwSqYKheoJ:scholar.google.com/&scioq=FaceDet3D:+Facial+Expressions+with+3D+Geometric+Detail+Hallucination&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "State University of New York;Stony Brook University;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": 
"https://www.stonybrook.edu;https://www.stonybrook.edu;https://meta.com", "aff_unique_abbr": "SUNY Stony Brook;SBU;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "kkgh_x_DBSM", "title": "Protecting Proprietary Data: Poisoning for Secure Dataset Release", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Large organizations such as social media companies continually release data, for example user images. At the same time, these organizations leverage their massive corpora of released data to train proprietary models that give them an edge over their competitors. These two behaviors can be in conflict as an organization wants to prevent competitors from using their own data to replicate the performance of their proprietary models. We solve this problem by developing a data poisoning method by which publicly released data can be minimally modified to prevent others from training models on it. Moreover, our method can be used in an online fashion so that companies can protect their data in real time as they release it. We demonstrate the success of our approach on ImageNet classification and on facial recognition. ", "keywords": "Secure Dataset Release;Data Poisoning;Availability Attack", "primary_area": "", "supplementary_material": "/attachment/c942706d4ba27caa7ec84168fa6879093b27e2c4.zip", "author": "Liam H Fowl;Ping-yeh Chiang;Micah Goldblum;Jonas Geiping;Arpit Amit Bansal;Wojciech Czaja;Tom Goldstein", "authorids": "~Liam_H_Fowl1;~Ping-yeh_Chiang1;~Micah_Goldblum1;~Jonas_Geiping1;~Arpit_Amit_Bansal1;~Wojciech_Czaja1;~Tom_Goldstein1", "gender": ";;;M;M;;M", "homepage": ";;;https://jonasgeiping.github.io/;https://arpitbansal297.github.io/;;https://www.cs.umd.edu/~tomg/", "dblp": "241/6940;236/4288;241/7231;190/7229;190/9114;;25/8184", "google_scholar": "IXv3ToAAAAAJ;WUoMq1IAAAAJ;pGDKzuUAAAAJ;https://scholar.google.de/citations?user=206vNCEAAAAJ;Pchxm4IAAAAJ;;KmSuVtgAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;arpit-bansal-970865b1/;;", "or_profile": "~Liam_H_Fowl1;~Ping-yeh_Chiang1;~Micah_Goldblum1;~Jonas_Geiping1;~Arpit_Amit_Bansal1;~Wojciech_Czaja1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;University of Maryland, College Park;New York University;University of Maryland, College Park;University of Maryland, College Park;;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;nyu.edu;umd.edu;umd.edu;;umd.edu", "position": "PhD student;PhD student;Postdoc;Postdoc;PhD student;;Associate Professor", "bibtex": "@misc{\nfowl2022protecting,\ntitle={Protecting Proprietary Data: Poisoning for Secure Dataset Release},\nauthor={Liam H Fowl and Ping-yeh Chiang and Micah Goldblum and Jonas Geiping and Arpit Amit Bansal and Wojciech Czaja and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=kkgh_x_DBSM}\n}", "github": "", "project": "", "reviewers": "Smcq;ioeQ;qT6q;61eq", "site": "https://openreview.net/forum?id=kkgh_x_DBSM", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "100;84;61;91", "wc_summary_review": "47;89;38;100", "wc_main_review": "328;285;149;705", "wc_review": "475;458;248;896", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 
0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 84.0, 14.439529078193651 ], "wc_summary_review_avg": [ 68.5, 26.48112535373072 ], "wc_main_review_avg": [ 366.75, 206.16301195898356 ], "wc_review_avg": [ 519.25, 235.17373896759815 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17982187068168302036&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "University of Maryland;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu", "aff_unique_abbr": "UMD;NYU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kl8flCo98nm", "title": "LEARNING DISTRIBUTIONS GENERATED BY SINGLE-LAYER RELU NETWORKS IN THE PRESENCE OF ARBITRARY OUTLIERS", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider a set of data samples such that a constant fraction of the samples are arbitrary outliers and the rest are the output samples of a single-layer neural network (NN) with rectified linear unit (ReLU) activation. The goal of this paper is to estimate the parameters (weight matrix and bias vector) of the NN assuming the bias vector to be non-negative. Our proposed method is a two-step algorithm. We first estimate the norms of the rows of the weight matrix and the bias vector using the gradient descent algorithm. Here, we also incorporate either the median or the trimmed mean based filters to mitigate the effect of the arbitrary outliers. Next, we estimate the angles between any two row vectors of the weight matrix. Combining the estimates of the norms and the angles, we obtain the final estimate of the weight matrix. Further, we prove that ${O}(\\frac{1}{\\epsilon p^4}\\log\\frac{d}{\\delta})$ samples are sufficient for our algorithm to estimate the NN parameters within an error of $\\epsilon$ with probability $1-\\delta$ when the probability of a sample being uncorrupted is $p$ and the problem dimension is $d$. Our theoretical and simulation results provide insights on how the estimation of the NN parameters depends on the probability of a sample being uncorrupted, the number of samples, and the problem dimension. ", "keywords": "Learning distribution;ReLU;Truncated Gaussian;Unsupervised learning", "primary_area": "", "supplementary_material": "/attachment/67f8daf9dc1358d47791162698f9be6c424f5c43.zip", "author": "Saikiran Bulusu;Geethu Joseph;M. 
Cenk Gursoy;Pramod Varshney", "authorids": "~Saikiran_Bulusu1;~Geethu_Joseph1;~M._Cenk_Gursoy1;~Pramod_Varshney1", "gender": "M;;;M", "homepage": ";https://sites.google.com/view/geethujoseph/home;;https://ecs.syr.edu/faculty/gursoy/", "dblp": ";;;", "google_scholar": "NrfiUzAAAAAJ;;;qBxl76YAAAAJ", "orcid": "0000-0002-4594-4844;;;", "linkedin": ";;;", "or_profile": "~Saikiran_Bulusu1;~Geethu_Joseph1;~Pramod_Varshney1;~Mustafa_Gursoy1", "aff": "Syracuse University;Delft University of Technology;Syracuse University;Syracuse University", "aff_domain": "syr.edu;tudelft.nl;syr.edu;syr.edu", "position": "PhD student;Assistant Professor;;Full Professor", "bibtex": "@misc{\nbulusu2022learning,\ntitle={{LEARNING} {DISTRIBUTIONS} {GENERATED} {BY} {SINGLE}-{LAYER} {RELU} {NETWORKS} {IN} {THE} {PRESENCE} {OF} {ARBITRARY} {OUTLIERS}},\nauthor={Saikiran Bulusu and Geethu Joseph and M. Cenk Gursoy and Pramod Varshney},\nyear={2022},\nurl={https://openreview.net/forum?id=kl8flCo98nm}\n}", "github": "", "project": "", "reviewers": "D2X8;CkRT;fTQC;ApGT", "site": "https://openreview.net/forum?id=kl8flCo98nm", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;3;4;3", "correctness": "3;4;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;1;0;0", "wc_summary_paper": "93;127;60;63", "wc_summary_review": "129;16;9;50", "wc_main_review": "602;194;169;257", "wc_review": "824;337;238;370", "wc_reply_reviewers": "110;37;0;26", "wc_reply_authors": "877;771;1058;836", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;2;2", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.75, 27.086666461563706 ], "wc_summary_review_avg": [ 51.0, 47.62877281643944 ], "wc_main_review_avg": [ 305.5, 174.1615629236256 ], "wc_review_avg": [ 442.25, 225.692683753816 ], "wc_reply_reviewers_avg": [ 43.25, 40.81283499096822 ], "wc_reply_authors_avg": [ 885.5, 106.52347159194541 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h5DcFWIR1mUJ:scholar.google.com/&scioq=LEARNING+DISTRIBUTIONS+GENERATED+BY+SINGLE-LAYER+RELU+NETWORKS+IN+THE+PRESENCE+OF+ARBITRARY+OUTLIERS&hl=en&as_sdt=0,5", "gs_version_total": 11, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Syracuse University;Delft University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.syracuse.edu;https://www.tudelft.nl", "aff_unique_abbr": "Syracuse;TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Netherlands" }, { "id": "kocM6lVTIfJ", "title": "Feature Shapley: A general framework to discovering useful feature interactions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "A machine learning system is typically composed of a model and data. In many applications, features are the inputs from which a model generates a meaningful prediction. 
While a large number of model-centric solutions have been proposed to improve the capabilities of models, there has been very limited exploration of how to discover useful feature interactions from a data-centric perspective. In this work, we propose a general framework named Feature Shapley with the purpose of discovering useful high-order feature interactions based on Feature Shapley values and thereby generating new features. Since computing exact Feature Shapley values is computationally infeasible, a Monte-Carlo approximation and an early-truncation trick are applied for efficient estimation of Feature Shapley values in this work. Experimental results indicate that the decisive feature interactions exploited by Feature Shapley are of vital importance for the Click-through rate (CTR) prediction and asset pricing tasks. With decisive feature interactions exploited by Feature Shapley, even simple models (e.g., linear regression (LR) or a shallow neural network) can achieve similar or even better performance compared with more complex approaches while keeping their superior interpretability.", "keywords": "Feature Learning;Shapley Value;Click-through Rate;Asset Pricing", "primary_area": "", "supplementary_material": "", "author": "Zhuoyi Lin;Biao Ye;Xu He;Shuo Sun;Rundong Wang;Rui Yin;Xu Chi;Chee Keong Kwoh", "authorids": "~Zhuoyi_Lin3;biaoye@usc.edu;~Xu_He2;shuo003@e.ntu.edu.sg;~Rundong_Wang1;yinr0002@e.ntu.edu.sg;~Xu_Chi1;asckkwoh@ntu.edu.sg", "gender": "M;;M;;M;;M;", "homepage": "https://sites.google.com/site/linzhuoyi1995/home;;https://scholar.google.com/citations?user=308KqrIAAAAJ&hl=en;;;;;", "dblp": "259/6990;;89/3991;;254/1228;;;", "google_scholar": "i04oBhIAAAAJ;;308KqrIAAAAJ;;JEVpgE8AAAAJ;;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;;;0000-0001-5480-3974;", "linkedin": ";;;;;;;", "or_profile": "~Zhuoyi_Lin3;biaoye@usc.edu;~Xu_He2;shuo003@e.ntu.edu.sg;~Rundong_Wang1;yinr0002@e.ntu.edu.sg;~Xu_Chi1;asckkwoh@ntu.edu.sg", "aff": "Nanyang Technological University;;Huawei Technologies Ltd.;;Nanyang Technological University;;Singapore Institute of Manufacturing Technology, A*STAR;", "aff_domain": "ntu.edu.sg;;huawei.com;;ntu.edu.sg;;simtech.a-star.edu.sg;", "position": "PhD student;;Researcher;;PhD student;;Researcher;", "bibtex": "@misc{\nlin2022feature,\ntitle={Feature Shapley: A general framework to discovering useful feature interactions},\nauthor={Zhuoyi Lin and Biao Ye and Xu He and Shuo Sun and Rundong Wang and Rui Yin and Xu Chi and Chee Keong Kwoh},\nyear={2022},\nurl={https://openreview.net/forum?id=kocM6lVTIfJ}\n}", "github": "", "project": "", "reviewers": "Rc5u;UDoq;6APa;QriB", "site": "https://openreview.net/forum?id=kocM6lVTIfJ", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;3;4;3", "correctness": "4;3;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "35;47;61;48", "wc_summary_review": "37;26;72;13", "wc_main_review": "430;297;520;156", "wc_review": "502;370;653;217", "wc_reply_reviewers": "0;0;42;0", "wc_reply_authors": "652;490;846;379", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 47.75, 9.202581159652981 ], "wc_summary_review_avg": [ 37.0, 21.920310216782973 ], "wc_main_review_avg": [ 350.75, 137.60700381884638 ], 
"wc_review_avg": [ 435.5, 161.05977151355953 ], "wc_reply_reviewers_avg": [ 10.5, 18.186533479473212 ], "wc_reply_authors_avg": [ 591.75, 175.98916870080384 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.4061811972299616, "corr_recommendation_correctness": -0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8rMaHpz-goMJ:scholar.google.com/&scioq=Feature+Shapley:+A+general+framework+to+discovering+useful+feature+interactions&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Nanyang Technological University;Huawei;Singapore Institute of Manufacturing Technology", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.huawei.com;https://www.simtech.a-star.edu.sg", "aff_unique_abbr": "NTU;Huawei;SIMTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Singapore;China" }, { "id": "krI-ahhgN2", "title": "Self-Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes a novel contrastive learning framework, called Self-Contrastive (SelfCon) Learning, that self-contrasts within multiple outputs from the different levels of a multi-exit network. SelfCon learning does not require additional augmented samples, which resolves the concerns of multi-viewed batch (e.g., high computational cost and generalization error). Furthermore, we prove that SelfCon loss guarantees the lower bound of label-conditional mutual information between the intermediate and the last feature. In our experiments including ImageNet-100, SelfCon surpasses cross-entropy and Supervised Contrastive (SupCon) learning without the need for a multi-viewed batch. 
We demonstrate that the success of SelfCon learning is related to the regularization effect associated with the single-view and sub-network.", "keywords": "contrastive learning;representation learning;image classification;mutual information", "primary_area": "", "supplementary_material": "/attachment/dae80537d2c088ea78e41359452515d0aac33916.zip", "author": "Sangmin Bae;Sungnyun Kim;Jongwoo Ko;Gihun Lee;SeungJong Noh;Se-Young Yun", "authorids": "~Sangmin_Bae1;~Sungnyun_Kim1;~Jongwoo_Ko1;~Gihun_Lee1;~SeungJong_Noh1;~Se-Young_Yun1", "gender": "M;M;M;M;M;M", "homepage": "https://www.raymin0223.com;https://bit.ly/sungnyunkim;https://sites.google.com/view/jongwooko;https://github.com/Lee-Gihun;;https://fbsqkd.github.io", "dblp": "91/1588;276/5441;286/1503;264/0071;127/6142.html;23/8862", "google_scholar": "T5rHY14AAAAJ;DsWny60AAAAJ;l2jkwHwAAAAJ;zsZVyckAAAAJ;;X_IAjb8AAAAJ", "orcid": ";0000-0002-3251-1812;;;;", "linkedin": "raymin0223/;sungnyun-kim-38a029242/;jongwoo-ko-8b93051b4/;gihun-l-155159197/;;seyoung-yun-395130ab/", "or_profile": "~Sangmin_Bae1;~Sungnyun_Kim1;~Jongwoo_Ko1;~Gihun_Lee1;~SeungJong_Noh1;~Se-Young_Yun1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;;KAIST", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;;kaist.ac.kr", "position": "PhD student;PhD student;PhD student;PhD student;;Assistant Professor", "bibtex": "@misc{\nbae2022selfcontrastive,\ntitle={Self-Contrastive Learning},\nauthor={Sangmin Bae and Sungnyun Kim and Jongwoo Ko and Gihun Lee and SeungJong Noh and Se-Young Yun},\nyear={2022},\nurl={https://openreview.net/forum?id=krI-ahhgN2}\n}", "github": "", "project": "", "reviewers": "MBzi;ZiPE;hbNU;qb4T", "site": "https://openreview.net/forum?id=krI-ahhgN2", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "36;109;104;79", "wc_summary_review": "47;244;40;60", "wc_main_review": "448;1071;328;610", "wc_review": "531;1424;472;749", "wc_reply_reviewers": "656;212;48;0", "wc_reply_authors": "3938;3206;1364;1972", "reply_reviewers": "7;2;1;0", "reply_authors": "11;11;3;5", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 82.0, 28.88771365130858 ], "wc_summary_review_avg": [ 97.75, 84.74188751733112 ], "wc_main_review_avg": [ 614.25, 282.0535188576806 ], "wc_review_avg": [ 794.0, 378.0800179856111 ], "wc_reply_reviewers_avg": [ 229.0, 258.75664242681773 ], "wc_reply_authors_avg": [ 2620.0, 1009.6979746439031 ], "reply_reviewers_avg": [ 2.5, 2.692582403567252 ], "reply_authors_avg": [ 7.5, 3.570714214271425 ], "replies_avg": [ 49, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5014788656592720104&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", 
"aff_country_unique": "South Korea" }, { "id": "krQLTdel74N", "title": "Robust Graph Data Learning with Latent Graph Convolutional Representation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph Convolutional Representation (GCR) has achieved impressive performance for graph data representation. However, existing GCR is generally defined on the input fixed graph which may restrict the representation capacity and also be vulnerable to the structural attacks and noises. To address this issue, we propose a novel Latent Graph Convolutional Representation (LatGCR) for robust graph data representation and learning. Our LatGCR is derived based on reformulating graph convolutional representation from the aspect of graph neighborhood reconstruction. Given an input graph $\\textbf{A}$, LatGCR aims to generate a flexible latent graph $\\tilde{\\textbf{A}}$ for graph convolutional representation which obviously enhances the representation capacity and also performs robustly w.r.t graph structural attacks and noises. Moreover, LatGCR is implemented in a self-supervised manner and thus provides a basic block for both supervised and unsupervised graph learning tasks. Experiments on several datasets demonstrate the effectiveness and robustness of LatGCR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bo Jiang;Ziyan Zhang;Bin Luo", "authorids": "~Bo_Jiang5;~Ziyan_Zhang1;~Bin_Luo4", "gender": ";F;M", "homepage": ";;", "dblp": "34/2005-2;230/7909;36/4256-1", "google_scholar": "https://scholar.google.com.hk/citations?user=n-aTwuMAAAAJ;;https://scholar.google.com.hk/citations?user=0qaDapcAAAAJ", "orcid": "0000-0002-6238-1596;;", "linkedin": ";;", "or_profile": "~Bo_Jiang5;~Ziyan_Zhang1;~Bin_Luo4", "aff": ";;Anhui University", "aff_domain": ";;ahu.edu.cn", "position": ";;Full Professor", "bibtex": "@misc{\njiang2022robust,\ntitle={Robust Graph Data Learning with Latent Graph Convolutional Representation},\nauthor={Bo Jiang and Ziyan Zhang and Bin Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=krQLTdel74N}\n}", "github": "", "project": "", "reviewers": "9ZwD;cGgK;2Evc;P7ZS", "site": "https://openreview.net/forum?id=krQLTdel74N", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;3", "correctness": "3;3;3;2", "technical_novelty": "2;1;3;2", "empirical_novelty": "2;1;0;2", "wc_summary_paper": "82;56;88;124", "wc_summary_review": "18;18;65;49", "wc_main_review": "792;106;190;311", "wc_review": "892;180;343;484", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 87.5, 24.264171117101856 ], "wc_summary_review_avg": [ 37.5, 20.303940504246953 ], "wc_main_review_avg": [ 349.75, 265.52812939498517 ], "wc_review_avg": [ 474.75, 263.8270029773298 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10790667029416492053&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": 
"0", "aff_unique_norm": "Anhui University", "aff_unique_dep": "", "aff_unique_url": "http://www.ahu.edu.cn/", "aff_unique_abbr": "AHU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "kroqZZb-6s", "title": "Cluster-based Feature Importance Learning for Electronic Health Record Time-series", "track": "main", "status": "Reject", "tldr": "", "abstract": "The recent availability of Electronic Health Records (EHR) has allowed for the development of algorithms predicting inpatient risk of deterioration and trajectory evolution. However, prediction of disease progression with EHR is challenging since these data are sparse, heterogeneous, multi-dimensional, and multi-modal time-series. As such, clustering is used to identify similar groups within the patient cohort to improve prediction. Current models \nhave shown some success in obtaining cluster representation of patient trajectories, however, they i) fail to obtain clinical interpretability for each cluster, and ii) struggle to learn meaningful cluster numbers in the context of the imbalanced distribution of disease outcomes. We propose a supervised deep learning model to cluster EHR data based on the identification of clinically understandable phenotypes with regard to both outcome prediction and patient trajectory. We introduce novel loss functions to address the problems of class imbalance and cluster collapse, and furthermore propose a feature-time attention mechanism to identify cluster-based phenotype importance across time and feature dimensions. We tested our model in over 100,000 unique trajectories from hospitalised patients with Type-II respiratory failure to predict five different outcomes. Our model yielded added interpretability to cluster formation and outperformed benchmarks by at least 5% in mean AUROC.", "keywords": "Clustering;Electronic Health Records", "primary_area": "", "supplementary_material": "", "author": "Henrique Aguiar;Mauro Santos;Peter Watkinson;Tingting Zhu", "authorids": "~Henrique_Aguiar1;~Mauro_Santos1;peter.watkinson@ndcn.ox.ac.uk;~Tingting_Zhu1", "gender": "M;;;F", "homepage": ";;;https://eng.ox.ac.uk/people/tingting-zhu/", "dblp": ";;;29/7666-1", "google_scholar": "https://scholar.google.com/citations?hl=en;;;https://scholar.google.com.vn/citations?user=fjGMIl0AAAAJ", "orcid": ";;;0000-0002-1552-5630", "linkedin": ";;;", "or_profile": "~Henrique_Aguiar1;~Mauro_Santos1;peter.watkinson@ndcn.ox.ac.uk;~Tingting_Zhu1", "aff": "University of Oxford;;;University of Oxford", "aff_domain": "ox.ac.uk;;;eng.ox.ac.uk", "position": "PhD student;;;RAEng Research Fellow", "bibtex": "@misc{\naguiar2022clusterbased,\ntitle={Cluster-based Feature Importance Learning for Electronic Health Record Time-series},\nauthor={Henrique Aguiar and Mauro Santos and Peter Watkinson and Tingting Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=kroqZZb-6s}\n}", "github": "", "project": "", "reviewers": "p3Vb;9DjJ;MGut;nD3g", "site": "https://openreview.net/forum?id=kroqZZb-6s", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;5;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;1;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "45;62;52;50", "wc_summary_review": "23;61;27;35", "wc_main_review": "476;439;145;500", "wc_review": "544;562;224;585", "wc_reply_reviewers": "0;20;25;27", "wc_reply_authors": "609;770;655;755", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], 
"correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.25, 6.179603547154137 ], "wc_summary_review_avg": [ 36.5, 14.79019945774904 ], "wc_main_review_avg": [ 390.0, 143.11009747743168 ], "wc_review_avg": [ 478.75, 147.79610109877729 ], "wc_reply_reviewers_avg": [ 18.0, 10.700467279516348 ], "wc_reply_authors_avg": [ 697.25, 67.4550776443108 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EISUn94FxQMJ:scholar.google.com/&scioq=Cluster-based+Feature+Importance+Learning+for+Electronic+Health+Record+Time-series&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "ks_uMcTPyW4", "title": "Reinforcement Learning with Efficient Active Feature Acquisition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Solving real-life sequential decision making problems under partial observability involves an exploration-exploitation problem. To be successful, an agent needs to efficiently gather valuable information about the state of the world for making rewarding decisions. However, in real-life, acquiring valuable information is often highly costly, e.g., in the medical domain, information acquisition might correspond to performing a medical test on a patient. Thus it poses a significant challenge for the agent to learn optimal task policy while efficiently reducing the cost for information acquisition. In this paper, we introduce a model-based framework to solve such exploration-exploitation problem during its execution. Key to the success is a sequential variational auto-encoder which could learn high-quality representations over the partially observed/missing features, where such representation learning serves as a prime factor to drive efficient policy training under the cost-sensitive setting. We demonstrate our proposed method could significantly outperform conventional approaches in a control domain as well as using a medical simulator. 
", "keywords": "Representation Learning;Reinforcement Learning;Active Learning", "primary_area": "", "supplementary_material": "/attachment/c8a5d137b823333849831dc5d95092f0f9891f57.zip", "author": "Haiyan Yin;Yingzhen Li;Sinno Pan;Cheng Zhang;Sebastian Tschiatschek", "authorids": "~Haiyan_Yin1;~Yingzhen_Li1;~Sinno_Pan1;~Cheng_Zhang1;~Sebastian_Tschiatschek1", "gender": ";F;M;F;M", "homepage": ";http://yingzhenli.net/home/en/;http://www.cse.cuhk.edu.hk/~sinnopan/;http://cheng-zhang.org;https://www.tschiatschek.net", "dblp": ";117/9230;80/5412;82/6384-5;33/10810", "google_scholar": ";https://scholar.google.se/citations?hl=en;https://scholar.google.com/citations?hl=en;r40iAwIAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Haiyan_Yin1;~Yingzhen_Li1;~Sinno_Pan1;~Cheng_Zhang1;~Sebastian_Tschiatschek1", "aff": ";Imperial College London;Nanyang Technological University;Microsoft;University of Vienna", "aff_domain": ";imperial.ac.uk;ntu.edu.sg;microsoft.com;univie.ac.at", "position": ";Lecturer;Associate Professor;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nyin2022reinforcement,\ntitle={Reinforcement Learning with Efficient Active Feature Acquisition},\nauthor={Haiyan Yin and Yingzhen Li and Sinno Pan and Cheng Zhang and Sebastian Tschiatschek},\nyear={2022},\nurl={https://openreview.net/forum?id=ks_uMcTPyW4}\n}", "github": "", "project": "", "reviewers": "82uH;jW1b;APEd", "site": "https://openreview.net/forum?id=ks_uMcTPyW4", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "3;3;4", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "149;70;92", "wc_summary_review": "19;475;86", "wc_main_review": "238;414;414", "wc_review": "406;959;592", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 103.66666666666667, 33.289971796657056 ], "wc_summary_review_avg": [ 193.33333333333334, 201.03786265831175 ], "wc_main_review_avg": [ 355.3333333333333, 82.96719565922157 ], "wc_review_avg": [ 652.3333333333334, 229.7568763328363 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=856870478502509596&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Imperial College London;Nanyang Technological University;Microsoft;University of Vienna", "aff_unique_dep": ";;Microsoft Corporation;", "aff_unique_url": "https://www.imperial.ac.uk;https://www.ntu.edu.sg;https://www.microsoft.com;https://univie.ac.at", "aff_unique_abbr": "ICL;NTU;Microsoft;UV", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "United Kingdom;Singapore;United States;Austria" }, { "id": "ktHKpsbsxx", "title": "WeaveNet: A Differentiable Solver for Non-linear Assignment Problems", "track": "main", "status": 
"Withdraw", "tldr": "", "abstract": "Assignment, a task to match a limited number of elements, is a fundamental problem in informatics. Traditionally, non-linear assignment is discussed as a combinatorial optimization problem with its calculation complexity. On the other hand, it is often a sub-problem of image processing tasks, such as 3D point cloud matching. This paper proposes WeaveNet, a differentiable solver for diverse non-linear assignment problems. Traditional graph convolutional networks (GCNs) suffer from an over-smoothing problem when characterizing nodes with their relationship. WeaveNet overcomes this problem by forwarding edge-wise features at each layer rather than aggregated node features.\nTo deal with the exponentially large input space of combinatorial optimization problems, we designed WeaveNet to be highly parameter efficient while characterizing edges through stacked set-encoder with cross-concatenation operations. Experimental results show that WeaveNet approximates two strongly NP-hard variants of stable matching in a comparative performance with the gold standard hand-crafted algorithms under the limited size of problem instances. We have also confirmed that it can boost 3D point cloud matching performance significantly.", "keywords": "Non-linear assignment;Deep Learning;Stable Matching;3D Point Cloud Matching", "primary_area": "", "supplementary_material": "/attachment/a25fcd19310f1721a4ca2e60c6ce20cb167aa344.zip", "author": "Shusaku Sone;Atsushi Hashimoto;Jiaxin Ma;rintaro yanagi;Naoya Chiba;Yoshitaka Ushiku", "authorids": "~Shusaku_Sone1;~Atsushi_Hashimoto1;~Jiaxin_Ma1;yanagi@lmd.ist.hokudai.ac.jp;~Naoya_Chiba1;~Yoshitaka_Ushiku3", "gender": "M;M;M;;M;", "homepage": ";https://atsushihashimoto.github.io/cv/;;;https://sites.google.com/view/n-chiba-;", "dblp": "152/7279;89/6733-1;46/8046;;202/5745;", "google_scholar": "https://scholar.google.co.jp/citations?user=OaHyzV8AAAAJ;DHIGVL8AAAAJ;;;https://scholar.google.co.jp/citations?user=TkFr708AAAAJ;", "orcid": "0009-0005-6576-6490;0000-0002-0799-4269;;;;", "linkedin": "shusaku-sone-359799b/;atsushi-hashimoto-a125464a/;jiaxin-ma-698462aa/;;;", "or_profile": "~Shusaku_Sone1;~Atsushi_Hashimoto1;~Jiaxin_Ma1;yanagi@lmd.ist.hokudai.ac.jp;~Naoya_Chiba1;~Yoshitaka_Ushiku3", "aff": "OMRON Corporation;OMRON SINIC X Corp.;OMRON SINIC X Corp.;;Waseda University;", "aff_domain": "omron.com;sinicx.com;omron.com;;waseda.jp;", "position": "Researcher;Senior Researcher;Researcher;;Postdoc;", "bibtex": "@misc{\nsone2022weavenet,\ntitle={WeaveNet: A Differentiable Solver for Non-linear Assignment Problems},\nauthor={Shusaku Sone and Atsushi Hashimoto and Jiaxin Ma and rintaro yanagi and Naoya Chiba and Yoshitaka Ushiku},\nyear={2022},\nurl={https://openreview.net/forum?id=ktHKpsbsxx}\n}", "github": "", "project": "", "reviewers": "xHX2;UQCX;yrWW;v9jL", "site": "https://openreview.net/forum?id=ktHKpsbsxx", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;3;4", "correctness": "2;3;4;3", "technical_novelty": "1;3;2;3", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "60;49;44;64", "wc_summary_review": "23;39;44;44", "wc_main_review": "434;218;158;108", "wc_review": "517;306;246;216", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "424;500;320;358", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 
2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 54.25, 8.073877630977572 ], "wc_summary_review_avg": [ 37.5, 8.616843969807043 ], "wc_main_review_avg": [ 229.5, 124.32517846357591 ], "wc_review_avg": [ 321.25, 117.56992600150771 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 400.5, 68.44523358130937 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:n4_Wt4qwqKMJ:scholar.google.com/&scioq=WeaveNet:+A+Differentiable+Solver+for+Non-linear+Assignment+Problems&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "OMRON Corporation;Waseda University", "aff_unique_dep": ";", "aff_unique_url": "https://www.omron.com;https://www.waseda.jp/top", "aff_unique_abbr": "OMRON;Waseda", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "id": "kxARp2zoqAk", "title": "Information-Aware Time Series Meta-Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Various contrastive learning approaches have been proposed in recent years and have achieved significant empirical success. While effective and prevalent, contrastive learning has been less explored for time series data. A key component of contrastive learning is to select appropriate augmentations that impose some priors to construct feasible positive samples, such that an encoder can be trained to learn robust and discriminative representations. Unlike image and language domains where ``desired'' augmented samples can be generated by rules of thumb guided by prefabricated human priors, the ad-hoc manual selection of time series augmentations is hindered by their diverse and human-unrecognizable temporal structures. How to find the desired augmentations of time series data that are meaningful for given contrastive learning tasks and datasets remains an open question. In this work, we address the problem by encouraging both high fidelity and variety based upon information theory. A theoretical analysis leads to the criteria for selecting feasible data augmentations. On top of that, we employ a meta-learning mechanism and propose an information-aware approach, InfoTS, that adaptively selects optimal time series augmentations for contrastive representation learning. The meta-learner and the encoder are jointly optimized in an end-to-end manner to avoid sub-optimal solutions. 
Experiments on various datasets show highly competitive performance with up to 11.4% reduction in MSE on the forecasting task and up to 2.8% relative improvement in accuracy on the classification task over the leading baselines.", "keywords": "Information-Aware Time Series Meta-Contrastive Learning", "primary_area": "", "supplementary_material": "/attachment/4c48f7d797e5476952c833e8e30557bf0b35d39c.zip", "author": "Dongsheng Luo;Wei Cheng;Yingheng Wang;Dongkuan Xu;Jingchao Ni;Wenchao Yu;Xuchao Zhang;Yanchi Liu;Haifeng Chen;Xiang Zhang", "authorids": "~Dongsheng_Luo1;~Wei_Cheng1;~Yingheng_Wang1;~Dongkuan_Xu2;~Jingchao_Ni1;~Wenchao_Yu1;~Xuchao_Zhang2;yanchi@nec-labs.com;~Haifeng_Chen1;~Xiang_Zhang4", "gender": "M;M;M;M;M;;;;;", "homepage": "https://users.cs.fiu.edu/~dluo/;https://chengw07.github.io/;https://isjakewong.github.io/publications/;https://dongkuanx27.github.io/;;;https://xuczhang.github.io/;;https://haifengchen.gitlab.io/intro/;", "dblp": ";89/2506-2.html;265/6357;142/8139;151/3208;07/8491;188/3475.html;;08/57-1.html;", "google_scholar": "https://scholar.google.com/citations?hl=en;PRrGVmoAAAAJ;4WEa7tMAAAAJ;https://scholar.google.com/citations?hl=en;rH9MTZMAAAAJ;;;;QzakB68AAAAJ;", "orcid": "0000-0003-4192-0826;;;0000-0002-1456-9658;;;;;;", "linkedin": ";wei-cheng-ml/;;dongkuan-dk-xu-%F0%9F%87%BA%F0%9F%87%A6-05038087/;jingchao-ni-930a3871/;;;;;", "or_profile": "~Dongsheng_Luo1;~Wei_Cheng1;~Yingheng_Wang1;~Dongkuan_Xu2;~Jingchao_Ni1;~Wenchao_Yu1;~Xuchao_Zhang2;yanchi@nec-labs.com;~Haifeng_Chen1;~Xiang_Zhang4", "aff": "Florida International University;NEC-Labs;Cornell University;Pennsylvania State University;NEC-Labs;University of California, Los Angeles;;;NEC-Labs;", "aff_domain": "fiu.edu;nec-labs.com;cornell.edu;psu.edu;nec-labs.com;ucla.edu;;;nec-labs.com;", "position": "Assistant Professor;Principal Researcher;PhD student;PhD student;Researcher;PhD student;;;Researcher;", "bibtex": "@misc{\nluo2022informationaware,\ntitle={Information-Aware Time Series Meta-Contrastive Learning},\nauthor={Dongsheng Luo and Wei Cheng and Yingheng Wang and Dongkuan Xu and Jingchao Ni and Wenchao Yu and Xuchao Zhang and Yanchi Liu and Haifeng Chen and Xiang Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=kxARp2zoqAk}\n}", "github": "", "project": "", "reviewers": "7xBr;ikmv;ZzdQ;61sZ", "site": "https://openreview.net/forum?id=kxARp2zoqAk", "pdf_size": 0, "recommendation": "3;5;6;10", "confidence": "4;4;5;3", "correctness": "2;2;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;3;4;3", "wc_summary_paper": "66;41;55;91", "wc_summary_review": "27;24;51;106", "wc_main_review": "379;356;930;88", "wc_review": "472;421;1036;285", "wc_reply_reviewers": "561;0;747;0", "wc_reply_authors": "2055;887;2367;38", "reply_reviewers": "2;0;2;0", "reply_authors": "4;2;5;1", "recommendation_avg": [ 6.0, 2.5495097567963922 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 63.25, 18.30812661087966 ], "wc_summary_review_avg": [ 52.0, 32.8861673048107 ], "wc_main_review_avg": [ 438.25, 306.0917958717613 ], "wc_review_avg": [ 553.5, 286.8348828158807 ], "wc_reply_reviewers_avg": [ 327.0, 333.54684828371563 ], "wc_reply_authors_avg": [ 1336.75, 930.9034254421883 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 
-0.5547001962252291, "corr_recommendation_correctness": 0.9460998335825322, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7518201561426789289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;1;4;1", "aff_unique_norm": "Florida International University;NEC Laboratories;Cornell University;Pennsylvania State University;University of California, Los Angeles", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.fiu.edu;https://www.nec-labs.com;https://www.cornell.edu;https://www.psu.edu;https://www.ucla.edu", "aff_unique_abbr": "FIU;NEC-Labs;Cornell;PSU;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kz6rsFehYjd", "title": "Towards General Robustness to Bad Training Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we focus on the problem of identifying bad training data when the underlying cause is unknown in advance. Our key insight is that regardless of how bad data are generated, they tend to contribute little to training a model with good prediction performance or more generally, to some utility function of the data analyst. We formulate the problem of good/bad data selection as utility optimization. We propose a theoretical framework for evaluating the worst-case performance of data selection heuristics. Remarkably, our results show that the popular heuristic based on the Shapley value may choose the worst data subset in certain practical scenarios, which sheds light on the large performance variation observed empirically in past work. We then develop an algorithmic framework, DataSifter, to detect a variety of data issues, including previously unknown ones---a step towards general robustness to bad training data. DataSifter is guided by the theoretically optimal solution to data selection and is made practical by a data utility learning technique. Our evaluation shows that DataSifter matches, and most often significantly improves upon, state-of-the-art performance over a wide range of tasks, including backdoor, poisoned, and noisy/mislabeled data detection, data summarization, and data debiasing. 
", "keywords": "General Robustness;Data Valuation;Data Utility Learning", "primary_area": "", "supplementary_material": "", "author": "Tianhao Wang;Yi Zeng;Ming Jin;Ruoxi Jia", "authorids": "~Tianhao_Wang2;~Yi_Zeng3;~Ming_Jin2;~Ruoxi_Jia1", "gender": "M;M;M;", "homepage": "https://tianhaowang.netlify.app/;https://yizeng623.github.io/;http://www.jinming.tech/;https://ruoxijia.info/", "dblp": "274/2144;75/148;;147/5355-1", "google_scholar": "nvQOtgkAAAAJ;slUNmHQAAAAJ;YdxdTtkAAAAJ;JCrug-YAAAAJ", "orcid": ";0000-0002-6901-9194;;", "linkedin": "tian-hao-wang/;chnyizeng/;;", "or_profile": "~Tianhao_Wang2;~Yi_Zeng3;~Ming_Jin2;~Ruoxi_Jia1", "aff": "Princeton University;Virginia Tech;Virginia Tech;Virginia Tech", "aff_domain": "princeton.edu;vt.edu;vt.edu;vt.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nwang2022towards,\ntitle={Towards General Robustness to Bad Training Data},\nauthor={Tianhao Wang and Yi Zeng and Ming Jin and Ruoxi Jia},\nyear={2022},\nurl={https://openreview.net/forum?id=kz6rsFehYjd}\n}", "github": "", "project": "", "reviewers": "4LgM;GEXW;s9Xw;yuTL", "site": "https://openreview.net/forum?id=kz6rsFehYjd", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;3;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "55;55;48;38", "wc_summary_review": "38;29;49;53", "wc_main_review": "295;197;292;1049", "wc_review": "388;281;389;1140", "wc_reply_reviewers": "0;0;0;398", "wc_reply_authors": "386;849;712;3524", "reply_reviewers": "0;0;0;2", "reply_authors": "3;3;3;6", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 49.0, 6.96419413859206 ], "wc_summary_review_avg": [ 42.25, 9.41740410091868 ], "wc_main_review_avg": [ 458.25, 343.33902705634847 ], "wc_review_avg": [ 549.5, 343.73863617580145 ], "wc_reply_reviewers_avg": [ 99.5, 172.33905535310328 ], "wc_reply_authors_avg": [ 1367.75, 1256.2201986514945 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.75, 1.299038105676658 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LhBvcozKW6EJ:scholar.google.com/&scioq=Towards+General+Robustness+to+Bad+Training+Data&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Princeton University;Virginia Tech", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.vt.edu", "aff_unique_abbr": "Princeton;VT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "SphereFace2: Binary Classification is All You Need for Deep Face Recognition", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6265", "id": "l3SDgUh7qZO", "poster": "", "openreview": "https://openreview.net/forum?id=l3SDgUh7qZO", "slides": "https://iclr.cc/virtual/2022/poster/6265", "video": "https://iclr.cc/virtual/2022/poster/6265", "author_site": "Yandong Wen, Weiyang Liu, Adrian Weller, Bhiksha Raj, Rita Singh", "tldr": "", "abstract": "State-of-the-art deep face recognition methods are mostly 
trained with a softmax-based multi-class classification framework. Despite being popular and effective, these methods still have a few shortcomings that limit empirical performance. In this paper, we start by identifying the discrepancy between training and evaluation in the existing multi-class classification framework and then discuss the potential limitations caused by the \"competitive\" nature of softmax normalization. Motivated by these limitations, we propose a novel binary classification training framework, termed SphereFace2. In contrast to existing methods, SphereFace2 circumvents the softmax normalization, as well as the corresponding closed-set assumption. This effectively bridges the gap between training and evaluation, enabling the representations to be improved individually by each binary classification task. Besides designing a specific well-performing loss function, we summarize a few general principles for this \"one-vs-all\" binary classification framework so that it can outperform current competitive methods. Our experiments on popular benchmarks demonstrate that SphereFace2 can consistently outperform state-of-the-art deep face recognition methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yandong Wen;Weiyang Liu;Adrian Weller;Bhiksha Raj;Rita Singh", "authorids": "~Yandong_Wen1;~Weiyang_Liu1;~Adrian_Weller1;~Bhiksha_Raj1;~Rita_Singh1", "gender": "M;M;M;M;F", "homepage": ";http://wyliu.com/;http://mlg.eng.cam.ac.uk/adrian/;https://www.cs.cmu.edu/directory/bhikshar/;http://mlsp.cs.cmu.edu/people/rsingh/index.html", "dblp": "153/2125;137/1532;73/8324;60/3996;", "google_scholar": ";DMjROf0AAAAJ;https://scholar.google.co.uk/citations?user=Ek4hM10AAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yandong_Wen1;~Weiyang_Liu1;~Adrian_Weller1;~Bhiksha_Raj1;~Rita_Singh1", "aff": "Carnegie Mellon University;University of Cambridge;University of Cambridge;Mohamed bin Zayed University of Artificial Intelligence;School of Computer Science, Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cam.ac.uk;cam.ac.uk;mbzuai.ac.ae;cs.cmu.edu", "position": "PhD student;Researcher;Principal Researcher;Full Professor;Research Professor", "bibtex": "@inproceedings{\nwen2022sphereface,\ntitle={SphereFace2: Binary Classification is All You Need for Deep Face Recognition},\nauthor={Yandong Wen and Weiyang Liu and Adrian Weller and Bhiksha Raj and Rita Singh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=l3SDgUh7qZO}\n}", "github": "", "project": "", "reviewers": "6DJe;iYQo;4dBH", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;4;5", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "44;104;89", "wc_summary_review": "53;41;84", "wc_main_review": "170;224;488", "wc_review": "267;369;661", "wc_reply_reviewers": "20;49;120", "wc_reply_authors": "192;529;1848", "reply_reviewers": "1;2;2", "reply_authors": "1;2;3", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 79.0, 25.495097567963924 ], "wc_summary_review_avg": [ 59.333333333333336, 18.116904322268255 ], "wc_main_review_avg": [ 294.0, 138.93883546366726 ], "wc_review_avg": [ 432.3333333333333, 166.96772808606525 ], 
"wc_reply_reviewers_avg": [ 63.0, 42.00793575821915 ], "wc_reply_authors_avg": [ 856.3333333333334, 714.5834839649987 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11808076263858435671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=l3SDgUh7qZO", "email": "andrew.cmu.edu;cam.ac.uk;cam.ac.uk;mbzuai.ac.ae;cs.cmu.edu", "author_num": 5, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Carnegie Mellon University;University of Cambridge;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.cam.ac.uk;https://mbzuai.ac.ae", "aff_unique_abbr": "CMU;Cambridge;MBZUAI", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Cambridge;Pittsburgh", "aff_country_unique_index": "0;1;1;2;0", "aff_country_unique": "United States;United Kingdom;United Arab Emirates" }, { "id": "l431c_2eGO2", "title": "Mix-MaxEnt: Creating High Entropy Barriers To Improve Accuracy and Uncertainty Estimates of Deterministic Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose an extremely simple approach to regularize a single deterministic neural network to obtain improved accuracy and reliable uncertainty estimates. Our approach, on top of the cross-entropy loss, simply puts an entropy maximization regularizer corresponding to the predictive distribution in the regions of the embedding space between the class clusters. This is achieved by synthetically generating between-cluster samples via the convex combination of two images from different classes and maximizing the entropy on these samples. Such a data-dependent regularization guides the maximum likelihood estimation to prefer a solution that (1) maps out-of-distribution samples to high entropy regions (creating an entropy barrier); and (2) is more robust superficial input perturbations.\nVia extensive experiments on real-world datasets (CIFAR-10 and CIFAR-100) using ResNet and Wide-ResNet architectures, we demonstrate that Mix-MaxEnt consistently provides much improved classification accuracy, better calibrated probabilities for in-distribution data, and reliable uncertainty estimates when exposed to situations involving domain-shift and out-of-distribution samples.\n", "keywords": "regularizer;maximum entropy;uncertainty estimation;data-shift robustness;calibration;out-of-distribution detection", "primary_area": "", "supplementary_material": "", "author": "Francesco Pinto;Harry Yang;Ser-Nam Lim;Philip Torr;Puneet K. 
Dokania", "authorids": "~Francesco_Pinto1;~Harry_Yang2;~Ser-Nam_Lim3;~Philip_Torr1;~Puneet_K._Dokania1", "gender": "M;;;M;M", "homepage": ";http://leehomyc.github.io;http://www.robots.ox.ac.uk/~tvg/;http://puneetkdokania.github.io/;https://sites.google.com/site/sernam", "dblp": "281/7477;;;150/4211;04/6633", "google_scholar": "rqAdo2MAAAAJ;;;https://scholar.google.fr/citations?user=WsM7ybkAAAAJ;HX0BfLYAAAAJ", "orcid": ";;;;", "linkedin": "francesco-pinto-42a389b1?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3BishkY8oUQ8OTPPeV0SSCdw%3D%3D;;;;", "or_profile": "~Francesco_Pinto1;~Harry_Yang2;~Philip_Torr1;~Puneet_Dokania1;~Ser-Nam_Lim1", "aff": "University of Oxford;Meta Facebook;University of Oxford;University of Oxford;Meta Facebook", "aff_domain": "ox.ac.uk;meta.com;ox.ac.uk;oxford.ac.uk;facebook.com", "position": "PhD student;Researcher;Full Professor;Senior Researcher;Research Scientist Manager", "bibtex": "@misc{\npinto2022mixmaxent,\ntitle={Mix-MaxEnt: Creating High Entropy Barriers To Improve Accuracy and Uncertainty Estimates of Deterministic Neural Networks},\nauthor={Francesco Pinto and Harry Yang and Ser-Nam Lim and Philip Torr and Puneet K. Dokania},\nyear={2022},\nurl={https://openreview.net/forum?id=l431c_2eGO2}\n}", "github": "", "project": "", "reviewers": "ESue;L8Yz;iva2;R5hm;qdpu", "site": "https://openreview.net/forum?id=l431c_2eGO2", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "5;3;4;4;3", "correctness": "3;4;2;3;3", "technical_novelty": "3;2;2;2;3", "empirical_novelty": "3;3;2;2;3", "wc_summary_paper": "66;46;120;120;122", "wc_summary_review": "130;27;47;71;34", "wc_main_review": "1302;135;520;377;419", "wc_review": "1498;208;687;568;575", "wc_reply_reviewers": "1743;0;595;47;15", "wc_reply_authors": "2554;1342;2989;1456;1394", "reply_reviewers": "6;0;1;1;1", "reply_authors": "4;3;4;4;2", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 94.8, 32.313464685793136 ], "wc_summary_review_avg": [ 61.8, 37.25265091238475 ], "wc_main_review_avg": [ 550.6, 396.4132187503338 ], "wc_review_avg": [ 707.2, 427.0425739899946 ], "wc_reply_reviewers_avg": [ 480.0, 669.7026205712502 ], "wc_reply_authors_avg": [ 1947.0, 688.0592997700126 ], "reply_reviewers_avg": [ 1.8, 2.1354156504062622 ], "reply_authors_avg": [ 3.4, 0.8 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7319250547113999, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7yN55pzwOC8J:scholar.google.com/&scioq=Mix-MaxEnt:+Creating+High+Entropy+Barriers+To+Improve+Accuracy+and+Uncertainty+Estimates+of+Deterministic+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "University of Oxford;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.ox.ac.uk;https://meta.com", "aff_unique_abbr": "Oxford;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Data-Efficient Graph Grammar Learning for Molecular Generation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7011", "id": "l4IHywGq6a", "poster": "", "openreview": 
"https://openreview.net/forum?id=l4IHywGq6a", "slides": "https://iclr.cc/virtual/2022/poster/7011", "video": "https://iclr.cc/virtual/2022/poster/7011", "author_site": "Minghao Guo, Veronika Thost, Beichen Li, Payel Das, Jie Chen, Wojciech Matusik", "tldr": "", "abstract": "The problem of molecular generation has received significant attention recently. Existing methods are typically based on deep neural networks and require training on large datasets with tens of thousands of samples. In practice, however, the size of class-specific chemical datasets is usually limited (e.g., dozens of samples) due to labor-intensive experimentation and data collection. Another major challenge is to generate only physically synthesizable molecules. This is a non-trivial task for neural network-based generative models since the relevant chemical knowledge can only be extracted and generalized from the limited training data. In this work, we propose a data-efficient generative model that can be learned from datasets with orders of magnitude smaller sizes than common benchmarks. At the heart of this method is a learnable graph grammar that generates molecules from a sequence of production rules. Without any human assistance, these production rules are automatically constructed from training data. Furthermore, additional chemical knowledge can be incorporated into the model by further grammar optimization. Our learned graph grammar yields state-of-the-art results on generating high-quality molecules for three monomer datasets that contain only ${\\sim}20$ samples each. Our approach also achieves remarkable performance in a challenging polymer generation task with $only$ $117$ training samples and is competitive against existing methods using $81$k data points.\n", "keywords": "molecular generation;graph grammar;data efficient generative model", "primary_area": "", "supplementary_material": "", "author": "Minghao Guo;Veronika Thost;Beichen Li;Payel Das;Jie Chen;Wojciech Matusik", "authorids": "~Minghao_Guo1;~Veronika_Thost1;~Beichen_Li1;~Payel_Das1;~Jie_Chen1;~Wojciech_Matusik2", "gender": "M;F;M;F;;M", "homepage": "https://www.minghaoguo.com/;https://mitibmwatsonailab.mit.edu/people/veronika-thost/;https://people.csail.mit.edu/beichen;;https://jiechenjiechen.github.io;https://cdfg.mit.edu/wojciech", "dblp": "145/0008/;132/3874;;56/7926;92/6289-7;", "google_scholar": "Hq2unJcAAAAJ;TyScgJ0AAAAJ;zR5wuKUAAAAJ;;Z-lkme8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0003-4984-1532;0000-0002-9271-0055;;;0000-0003-0212-5643", "linkedin": ";;beichen-li-ba9b34106;;;wojciech-matusik-67238126/", "or_profile": "~Minghao_Guo1;~Veronika_Thost1;~Beichen_Li1;~Payel_Das1;~Jie_Chen1;~Wojciech_Matusik2", "aff": "Massachusetts Institute of Technology;IBM Research;Massachusetts Institute of Technology;IBM, International Business Machines;International Business Machines;Massachusetts Institute of Technology", "aff_domain": "mit.edu;ibm.com;mit.edu;us.ibm.com;ibm.com;mit.edu", "position": "PhD student;Research Scientist;PhD student;Principal Researcher;Research Staff Member;Full Professor", "bibtex": "@inproceedings{\nguo2022dataefficient,\ntitle={Data-Efficient Graph Grammar Learning for Molecular Generation},\nauthor={Minghao Guo and Veronika Thost and Beichen Li and Payel Das and Jie Chen and Wojciech Matusik},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=l4IHywGq6a}\n}", "github": "", "project": "", "reviewers": "LtXp;Qrrj;oHTg;4wRv", 
"pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;3;4;3", "correctness": "4;4;4;3", "technical_novelty": "4;3;3;4", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "59;207;101;111", "wc_summary_review": "21;44;27;31", "wc_main_review": "105;291;411;260", "wc_review": "185;542;539;402", "wc_reply_reviewers": "0;28;21;0", "wc_reply_authors": "311;575;724;380", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 119.5, 54.1548705104167 ], "wc_summary_review_avg": [ 30.75, 8.437268515343103 ], "wc_main_review_avg": [ 266.75, 109.09256390790345 ], "wc_review_avg": [ 417.0, 145.39429149729366 ], "wc_reply_reviewers_avg": [ 12.25, 12.497499749949988 ], "wc_reply_authors_avg": [ 497.5, 162.7090962423429 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3349437997127524473&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=l4IHywGq6a", "email": "mit.edu;ibm.com;mit.edu;us.ibm.com;ibm.com;mit.edu", "author_num": 6, "aff_unique_index": "0;1;0;2;3;0", "aff_unique_norm": "Massachusetts Institute of Technology;IBM;International Business Machines;International Business Machines Corporation", "aff_unique_dep": ";IBM Research;;", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com/research;https://www.ibm.com;https://www.ibm.com", "aff_unique_abbr": "MIT;IBM;IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "l5HdwFu2Ttp", "title": "Tabula: Efficiently Computing Nonlinear Activation Functions for Private Neural Network Inference", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multiparty computation approaches to private neural network inference require significant communication between server and client, incur tremendous runtime penalties, and cost massive storage overheads. The primary source of these expenses is garbled circuits operations for nonlinear activation functions (typically ReLU), which require on the order of kilobytes of data transfer for each individual operation and tens of kilobytes of preprocessing storage per operation per inference. We propose a replacement for garbled circuits: Tabula, an algorithm to securely and efficiently perform single operand nonlinear functions for private neural network inference. Tabula performs a one time client initialization procedure with the help of a trusted third party (or via using fully homomorphic encryption), operates over smaller finite fields whose elements are representable with less than 16 bits, and employs a lookup table which stores the encrypted results of nonlinear operations over secretly shared values. We show Tabula is secure under a semi-honest threat model, allowing it to be used as a replacement for garbled circuits operations. Our results show that for private neural network inference, Tabula eliminates communication by a factor of more than $50 \\times$, enables speedups over $10 \\times$, and reduces storage costs from $O(n)$ to $O(1)$. 
", "keywords": "private neural network inference;privacy;security;performance", "primary_area": "", "supplementary_material": "/attachment/39b9cfe001cb4464a8377493191e35e066e4f69d.zip", "author": "Max Lam;Michael Mitzenmacher;Vijay Janapa Reddi;Gu-Yeon Wei;David Brooks", "authorids": "~Max_Lam1;~Michael_Mitzenmacher1;~Vijay_Janapa_Reddi1;~Gu-Yeon_Wei1;~David_Brooks1", "gender": "M;M;M;M;", "homepage": ";;https://scholar.harvard.edu/vijay-janapa-reddi;;", "dblp": ";74/838;88/2610;21/5583;30/135", "google_scholar": ";e8aRmAsAAAAJ;https://scholar.google.com/citations?view_op=search_authors;IR0yJB8AAAAJ;vXHA_XYAAAAJ", "orcid": ";;0000-0002-5259-7721;;", "linkedin": ";;vijay-janapa-reddi-63a6a173/;;", "or_profile": "~Max_Lam1;~Michael_Mitzenmacher1;~Vijay_Janapa_Reddi1;~Gu-Yeon_Wei1;~David_Brooks1", "aff": ";Harvard University;Harvard University;Samsung;Meta Facebook", "aff_domain": ";harvard.edu;harvard.edu;samsung.com;facebook.com", "position": ";Full Professor;Associate Professor;Researcher;Visiting Research Scientist", "bibtex": "@misc{\nlam2022tabula,\ntitle={Tabula: Efficiently Computing Nonlinear Activation Functions for Private Neural Network Inference},\nauthor={Max Lam and Michael Mitzenmacher and Vijay Janapa Reddi and Gu-Yeon Wei and David Brooks},\nyear={2022},\nurl={https://openreview.net/forum?id=l5HdwFu2Ttp}\n}", "github": "", "project": "", "reviewers": "yge8;JwmY;CUK7", "site": "https://openreview.net/forum?id=l5HdwFu2Ttp", "pdf_size": 0, "recommendation": "1;3;6", "confidence": "5;2;2", "correctness": "1;3;3", "technical_novelty": "1;2;3", "empirical_novelty": "1;3;0", "wc_summary_paper": "23;46;34", "wc_summary_review": "10;79;11", "wc_main_review": "180;204;131", "wc_review": "213;329;176", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 34.333333333333336, 9.392668535736913 ], "wc_summary_review_avg": [ 33.333333333333336, 32.293790252754306 ], "wc_main_review_avg": [ 171.66666666666666, 30.379086373505192 ], "wc_review_avg": [ 239.33333333333334, 65.17838769271776 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.802955068546966, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2RDB1TzQaQoJ:scholar.google.com/&scioq=Tabula:+Efficiently+Computing+Nonlinear+Activation+Functions+for+Private+Neural+Network+Inference&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Harvard University;Samsung;Meta", "aff_unique_dep": ";Samsung;Meta Platforms, Inc.", "aff_unique_url": "https://www.harvard.edu;https://www.samsung.com;https://meta.com", "aff_unique_abbr": "Harvard;Samsung;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;South Korea" }, { "title": "Demystifying Limited Adversarial Transferability in Automatic Speech Recognition Systems", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/5930", "id": "l5aSHXi8jG5", "poster": "", "openreview": "https://openreview.net/forum?id=l5aSHXi8jG5", "slides": "https://iclr.cc/virtual/2022/poster/5930", "video": "https://iclr.cc/virtual/2022/poster/5930", "author_site": "Hadi Abdullah, Aditya Karlekar, Vincent Bindschaedler, Patrick Traynor", "tldr": "", "abstract": "The targeted transferability of adversarial samples enables attackers to exploit black-box models in the real-world. The most popular method to produce these adversarial samples is optimization attacks, which have been shown to achieve a high level of transferability in some domains. However, recent research has demonstrated that these attack samples fail to transfer when applied to Automatic Speech Recognition Systems (ASRs). In this paper, we investigate factors preventing this transferability via exhaustive experimentation. To do so, we perform an ablation study on each stage of the ASR pipeline. We discover and quantify six factors (i.e., input type, MFCC, RNN, output type, and vocabulary and sequence sizes) that impact the targeted transferability of optimization attacks against ASRs. Future research can leverage our findings to build ASRs that are more robust to other transferable attack types (e.g., signal processing attacks), or to modify architectures in other domains to reduce their exposure to targeted transferability of optimization attacks.", "keywords": "optimization attacks;transferability;adversarial machine learning", "primary_area": "", "supplementary_material": "", "author": "Hadi Abdullah;Aditya Karlekar;Vincent Bindschaedler;Patrick Traynor", "authorids": "~Hadi_Abdullah1;~Aditya_Karlekar1;~Vincent_Bindschaedler1;~Patrick_Traynor1", "gender": ";M;;M", "homepage": "https://hadiabdullah.github.io/;;https://vbinds.ch;", "dblp": "205/2013;;117/2526;", "google_scholar": ";;uJMkuykAAAAJ;https://scholar.google.com/citations?view_op=list_works", "orcid": ";;;", "linkedin": ";aditya-karlekar/;;", "or_profile": "~Hadi_Abdullah1;~Aditya_Karlekar1;~Vincent_Bindschaedler1;~Patrick_Traynor1", "aff": "University of Florida;;University of Florida;University of Florida", "aff_domain": "ufl.edu;;ufl.edu;ufl.edu", "position": "PhD student;;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nabdullah2022demystifying,\ntitle={Demystifying Limited Adversarial Transferability in Automatic Speech Recognition Systems},\nauthor={Hadi Abdullah and Aditya Karlekar and Vincent Bindschaedler and Patrick Traynor},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=l5aSHXi8jG5}\n}", "github": "", "project": "", "reviewers": "qpFd;TNL7;9PZL;Vmod", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;3;4;3", "correctness": "3;3;2;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "78;80;123;51", "wc_summary_review": "40;30;64;13", "wc_main_review": "319;185;433;150", "wc_review": "437;295;620;214", "wc_reply_reviewers": "67;0;0;0", "wc_reply_authors": "597;478;672;190", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.0, 25.777897509300484 ], "wc_summary_review_avg": [ 36.75, 18.45772196128222 ], "wc_main_review_avg": [ 271.75, 112.45304575688468 ], "wc_review_avg": 
[ 391.5, 154.19225012950554 ], "wc_reply_reviewers_avg": [ 16.75, 29.011851026778693 ], "wc_reply_authors_avg": [ 484.25, 183.42897126680944 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1412449290880196776&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=l5aSHXi8jG5", "email": "ufl.edu;;ufl.edu;ufl.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Florida", "aff_unique_dep": "", "aff_unique_url": "https://www.ufl.edu", "aff_unique_abbr": "UF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Implicit Bias of Adversarial Training for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7019", "id": "l8It-0lE5e7", "poster": "", "openreview": "https://openreview.net/forum?id=l8It-0lE5e7", "slides": "https://iclr.cc/virtual/2022/poster/7019", "video": "https://iclr.cc/virtual/2022/poster/7019", "author_site": "Bochen Lyu, Zhanxing Zhu", "tldr": "", "abstract": "We provide theoretical understandings of the implicit bias imposed by adversarial training for homogeneous deep neural networks without any explicit regularization. In particular, for deep linear networks adversarially trained by gradient descent on a linearly separable dataset, we prove that the direction of the product of weight matrices converges to the direction of the max-margin solution of the original dataset. Furthermore, we generalize this result to the case of adversarial training for non-linear homogeneous deep neural networks without the linear separability of the dataset. We show that, when the neural network is adversarially trained with $\\ell_2$ or $\\ell_{\\infty}$ FGSM, FGM and PGD perturbations, the direction of the limit point of normalized parameters of the network along the trajectory of the gradient flow converges to a KKT point of a constrained optimization problem that aims to maximize the margin for adversarial examples. 
Our results theoretically justify the longstanding conjecture that adversarial training modifies the decision boundary by utilizing adversarial examples to improve robustness, and potentially provide insights for designing new robust training strategies.", "keywords": "adversarial training;adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Bochen Lv;Zhanxing Zhu", "authorids": "~Bochen_Lv1;~Zhanxing_Zhu1", "gender": ";M", "homepage": ";https://zhanxingzhu.github.io/", "dblp": ";87/7756.html", "google_scholar": ";a2sHceIAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Bochen_Lv1;~Zhanxing_Zhu1", "aff": ";Peking University", "aff_domain": ";pku.edu.cn", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nlv2022implicit,\ntitle={Implicit Bias of Adversarial Training for Deep Neural Networks},\nauthor={Bochen Lv and Zhanxing Zhu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=l8It-0lE5e7}\n}", "github": "", "project": "", "reviewers": "eZ8u;uXwk;kpiH;QB5M", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "3;4;3;3", "correctness": "3;2;4;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "0;4;4;0", "wc_summary_paper": "72;222;95;69", "wc_summary_review": "75;100;34;18", "wc_main_review": "379;602;134;256", "wc_review": "526;924;263;343", "wc_reply_reviewers": "0;0;0;34", "wc_reply_authors": "514;1194;113;689", "reply_reviewers": "0;0;0;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 2.0 ], "wc_summary_paper_avg": [ 114.5, 62.87487574540406 ], "wc_summary_review_avg": [ 56.75, 32.491345001399985 ], "wc_main_review_avg": [ 342.75, 172.93550098230264 ], "wc_review_avg": [ 514.0, 255.18914553718776 ], "wc_reply_reviewers_avg": [ 8.5, 14.722431864335457 ], "wc_reply_authors_avg": [ 627.5, 388.03898000072104 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15103374593924729487&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=l8It-0lE5e7", "email": ";pku.edu.cn", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "l9tb1bKyfMn", "title": "LMSA: Low-relation Mutil-head Self-Attention Mechanism in Visual Transformer", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Transformer backbone network with the self-attention mechanism at its core has achieved great success in the fields of natural language processing and computer vision. However, though the self-attention mechanism brings high performance, it also brings higher computational complexity compared with classic visual feature extraction methods. 
To further reduce the complexity of the self-attention mechanism and explore a lighter version of it for computer vision, in this paper we design a novel lightweight self-attention mechanism: Low-relation Mutil-head Self-Attention (LMSA), which is superior to recent self-attention mechanisms. Specifically, the proposed self-attention mechanism breaks the barrier of dimensional consistency found in the traditional self-attention mechanism, resulting in lower computational complexity and occupying less storage space. In addition, employing the new mechanism can release part of the computational consumption of the Transformer network and put it to better use. Experimental results show that the dimensional consistency inside the traditional self-attention mechanism is unnecessary. In particular, using Swin as the backbone model for training, the accuracy on the CIFAR-10 image classification task is improved by 0.43$\\%$; meanwhile, the resource consumption of a single self-attention module is reduced by 64.58$\\%$, and the number of model parameters and the model size are reduced by more than 15$\\%$. By appropriately compressing the dimensions of the self-attention relationship variables, the Transformer network can be made more efficient and even perform better. These results prompt us to rethink the reason why the self-attention mechanism works.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "JingJie Wang;Xiang Wei;Xiaoyu Liu", "authorids": "~JingJie_Wang1;~Xiang_Wei1;~Xiaoyu_Liu5", "gender": "M;M;F", "homepage": "https://github.com/wjjdeeplearning;http://faculty.bjtu.edu.cn/rjxy/9335.html;https://github.com/universelxy/", "dblp": ";37/1682-7;", "google_scholar": ";SydSP3gAAAAJ;", "orcid": ";0000-0002-8967-6423;", "linkedin": ";;", "or_profile": "~JingJie_Wang1;~Xiang_Wei1;~Xiaoyu_Liu5", "aff": "Beijing JiaoTong University;Beijing Jiaotong University;Beijing Jiaotong university", "aff_domain": "bjtu.edu.cn;bjtu.edu.cn;bjtu.edu.cn", "position": "PhD student;Assistant Professor;MS student", "bibtex": "@misc{\nwang2022lmsa,\ntitle={{LMSA}: Low-relation Mutil-head Self-Attention Mechanism in Visual Transformer},\nauthor={JingJie Wang and Xiang Wei and Xiaoyu Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=l9tb1bKyfMn}\n}", "github": "", "project": "", "reviewers": "ZniF;S5RZ;kY4w", "site": "https://openreview.net/forum?id=l9tb1bKyfMn", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "5;5;5", "correctness": "3;2;2", "technical_novelty": "1;1;1", "empirical_novelty": "1;1;1", "wc_summary_paper": "21;25;62", "wc_summary_review": "41;17;23", "wc_main_review": "415;137;168", "wc_review": "477;179;253", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.0, 0.0 ], "wc_summary_paper_avg": [ 36.0, 18.457157599876172 ], "wc_summary_review_avg": [ 27.0, 10.198039027185569 ], "wc_main_review_avg": [ 240.0, 124.38917423420202 ], "wc_review_avg": [ 303.0, 126.69122568933756 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, 
"aff_unique_index": "0;1;1", "aff_unique_norm": "Beijing JiaoTong University;Beijing Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bjtu.edu.cn;http://www.njtu.edu.cn/en", "aff_unique_abbr": "BJTU;BJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "lD8qAOTu5FJ", "title": "Addressing the Stability-Plasticity Dilemma via Knowledge-Aware Continual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Continual learning agents should incrementally learn a sequence of tasks while satisfying two main desiderata: accumulating on previous knowledge without forgetting and transferring previous relevant knowledge to help in future learning. Existing research largely focuses on alleviating the catastrophic forgetting problem. There, an agent is altered to prevent forgetting based solely on previous tasks. This hinders the balance between preventing forgetting and maximizing the forward transfer. In response to this, we investigate the stability-plasticity dilemma to determine which model components are eligible to be reused, added, fixed, or updated to achieve this balance. We address the class incremental learning scenario where the agent is prone to ambiguities between old and new classes. With our proposed Knowledge-Aware contiNual learner (KAN), we demonstrate that considering the semantic similarity between old and new classes helps in achieving this balance. We show that being aware of existing knowledge helps in: (1) increasing the forward transfer from similar knowledge, (2) reducing the required capacity by leveraging existing knowledge, (3) protecting dissimilar knowledge, and (4) increasing robustness to the class order in the sequence. 
We evaluated sequences of similar tasks, dissimilar tasks, and a mix of both constructed from the two commonly used benchmarks for class-incremental learning; CIFAR-10 and CIFAR-100.", "keywords": "Continual learning;Class incremental learning;Stability-plasticity dilemma;Sparse neural networks;Knowledge-Awareness", "primary_area": "", "supplementary_material": "/attachment/bcc1e2946ce8c35da2dec69923745d572b393bc8.zip", "author": "Ghada Sokar;Decebal Constantin Mocanu;Mykola Pechenizkiy", "authorids": "~Ghada_Sokar1;~Decebal_Constantin_Mocanu1;~Mykola_Pechenizkiy1", "gender": ";M;M", "homepage": "https://research.tue.nl/en/persons/ghada-sokar;https://wwwen.uni.lu/recherche/fstm/dcs/members/decebal_constantin_mocanu;http://www.win.tue.nl/~mpechen/", "dblp": "244/7833;133/7764;37/4649", "google_scholar": "https://scholar.google.nl/citations?user=0e6fdZsAAAAJ;RlQgUwEAAAAJ;https://scholar.google.com.tw/citations?user=F0uFT_kAAAAJ", "orcid": ";0000-0002-5636-7683;0000-0003-4955-0743", "linkedin": ";;mpechen/", "or_profile": "~Ghada_Sokar1;~Decebal_Constantin_Mocanu1;~Mykola_Pechenizkiy1", "aff": "Eindhoven University of Technology;University of Twente;Eindhoven University of Technology", "aff_domain": "tue.nl;utwente.nl;tue.nl", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nsokar2022addressing,\ntitle={Addressing the Stability-Plasticity Dilemma via Knowledge-Aware Continual Learning},\nauthor={Ghada Sokar and Decebal Constantin Mocanu and Mykola Pechenizkiy},\nyear={2022},\nurl={https://openreview.net/forum?id=lD8qAOTu5FJ}\n}", "github": "", "project": "", "reviewers": "o6Js;UadE;NFc9;xDnv", "site": "https://openreview.net/forum?id=lD8qAOTu5FJ", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;5;5;3", "correctness": "3;2;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "38;51;73;83", "wc_summary_review": "26;57;22;37", "wc_main_review": "364;562;326;257", "wc_review": "428;670;421;377", "wc_reply_reviewers": "0;78;83;0", "wc_reply_authors": "953;1879;885;1236", "reply_reviewers": "0;1;1;0", "reply_authors": "2;4;2;3", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.25, 17.725334975678173 ], "wc_summary_review_avg": [ 35.5, 13.573871960498227 ], "wc_main_review_avg": [ 377.25, 113.35205115038721 ], "wc_review_avg": [ 474.0, 114.83684077855851 ], "wc_reply_reviewers_avg": [ 40.25, 40.28880117352712 ], "wc_reply_authors_avg": [ 1238.25, 392.65721373737676 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3015113445777637, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4547530085727977934&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0", "aff_unique_norm": "Eindhoven University of Technology;University of Twente", "aff_unique_dep": ";", "aff_unique_url": "https://www.tue.nl;https://www.utwente.nl", "aff_unique_abbr": "TU/e;UT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "id": "lDvJM5XUyrx", "title": "Towards Understanding Catastrophic Overfitting in Fast 
Adversarial Training", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "After adversarial training was proposed, a series of works focus on improving the compunational efficiency of adversarial training for deep neural networks (DNNs). Recently, FGSM based single-step adversarial training has been found to be able to train a robust model with the robustness comparable to the one trained by multi-step PGD, but it is an order of magnitude faster. However, there exists a failure mode called Catastrophic Overfitting (CO) where the network loses its robustness against PGD attack suddenly and can be hardly recovered by itself during the training process. In this paper, we identify that CO is closely related to the high-order terms in Taylor expansion after rethinking and decomposing the min-max problem in adversarial training. The negative high-order terms lead to a phenomenon called Perturbation Loss Distortion, which is the underlying cause of CO. Based on the observations, we propose a simple but effective regularization method named Fast Linear Adversarial Training (FLAT) to avoid CO in the single-step adversarial training by making the loss surface flat.", "keywords": "Robustness;Fast Adversarial Training;Catastrophic Overfitting", "primary_area": "", "supplementary_material": "/attachment/b666e928d356f49500471f6e0d885cc9101a36f2.zip", "author": "Renjie Chen;Yuan Luo;Yisen Wang", "authorids": "~Renjie_Chen2;~Yuan_Luo1;~Yisen_Wang1", "gender": "M;M;M", "homepage": "http://crj1998.ml/;https://www.cs.sjtu.edu.cn/en/PeopleDetail.aspx?id=155;https://yisenwang.github.io/", "dblp": ";90/6959-3;172/1346-1", "google_scholar": ";;uMWPDboAAAAJ", "orcid": ";0000-0002-3910-5286;", "linkedin": ";;", "or_profile": "~Renjie_Chen2;~Yuan_Luo1;~Yisen_Wang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Peking University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;pku.edu.cn", "position": "MS student;Full Professor;Assistant Professor", "bibtex": "@misc{\nchen2022towards,\ntitle={Towards Understanding Catastrophic Overfitting in Fast Adversarial Training},\nauthor={Renjie Chen and Yuan Luo and Yisen Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=lDvJM5XUyrx}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=lDvJM5XUyrx", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RhK2ZNGTpXUJ:scholar.google.com/&scioq=Towards+Understanding+Catastrophic+Overfitting+in+Fast+Adversarial+Training&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Peking University", "aff_unique_dep": ";", 
"aff_unique_url": "https://www.sjtu.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "SJTU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "lEB5Dnz_MmH", "title": "A Collaborative Attention Adaptive Network for Financial Market Forecasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Forecasting the financial market with social media data and real market prices is a valuable issue for market participants, which helps traders make more appropriate trading decisions. However, taking into account the differences of different data types, how to use a fusion method adapted to financial data to fuse real market prices and tweets from social media, so that the prediction model can fully integrate different types of data remains a challenging problem. To address these problems, we propose a collaborative attention adaptive Transformer approach to financial market forecasting (CAFF), including parallel extraction of tweets and price features, parameter-level fusion and a joint feature processing module, that can successfully deeply fuse tweets and real prices in view of the fusion method. Extensive experimentation is performed on tweets and historical price of stock market, our method can achieve a better accuracy compared with the state-of-the-art methods on two evaluation metrics. Moreover, tweets play a relatively more critical role in the CAFF framework. Additional stock trading simulations show that an actual trading strategy based on our proposed model can increase profits; thus, the model has practical application value.", "keywords": "Financial market forecasting;Deep fusion;Collaborative attention", "primary_area": "", "supplementary_material": "", "author": "Qiuyue Zhang;Yunfeng Zhang;Fangxun Bao;Caiming Zhang;Peide Liu;Xunxiang Yao", "authorids": "~Qiuyue_Zhang1;~Yunfeng_Zhang2;fxbao@sdu.edu.cn;czhang@sdu.edu.cn;peide.liu@gmail.com;12967776@student.uts.edu.au", "gender": ";M;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": "0000-0001-7229-7200;0000-0002-1237-6035;;;;", "linkedin": ";;;;;", "or_profile": "~Qiuyue_Zhang1;~Yunfeng_Zhang2;fxbao@sdu.edu.cn;czhang@sdu.edu.cn;peide.liu@gmail.com;12967776@student.uts.edu.au", "aff": "Shandong University of Finance and Economics;Shandong University of Finance and Economics;;;;", "aff_domain": "sdufe.edu.cn;sdufe.edu.cn;;;;", "position": "PhD student;Full Professor;;;;", "bibtex": "@misc{\nzhang2022a,\ntitle={A Collaborative Attention Adaptive Network for Financial Market Forecasting},\nauthor={Qiuyue Zhang and Yunfeng Zhang and Fangxun Bao and Caiming Zhang and Peide Liu and Xunxiang Yao},\nyear={2022},\nurl={https://openreview.net/forum?id=lEB5Dnz_MmH}\n}", "github": "", "project": "", "reviewers": "GNAc;TVcH;K1Z9", "site": "https://openreview.net/forum?id=lEB5Dnz_MmH", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;4;3", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "46;32;57", "wc_summary_review": "27;50;38", "wc_main_review": "236;412;325", "wc_review": "309;494;420", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], 
"empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 45.0, 10.23067283548187 ], "wc_summary_review_avg": [ 38.333333333333336, 9.392668535736913 ], "wc_main_review_avg": [ 324.3333333333333, 71.85324550003911 ], "wc_review_avg": [ 407.6666666666667, 76.02777270328404 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:M_xXusd1RyMJ:scholar.google.com/&scioq=A+Collaborative+Attention+Adaptive+Network+for+Financial+Market+Forecasting&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Shandong University of Finance and Economics", "aff_unique_dep": "", "aff_unique_url": "http://www.sdufe.edu.cn", "aff_unique_abbr": "SDUFE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "lEXrEcrbmV", "title": "Data-Efficient Contrastive Learning by Differentiable Hard Sample and Hard Positive Pair Generation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive learning (CL), a self-supervised learning approach, can effectively learn visual representations from unlabeled data. However, CL requires learning on vast quantities of diverse data to achieve good performance, without which the performance of CL will greatly degrade. To tackle this problem, we propose a framework with two approaches to improve the data efficiency of CL training by generating beneficial samples. The first approach generates hard samples for the main model. In the training process, hard samples are dynamically customized to the training state of the main model, rather than fixed throughout the whole training process. With the progressively growing knowledge of the main model, the generated samples also become harder to constantly encourage the main model to learn better representations. Besides, a pair of data generators are proposed to generate similar but distinct samples as positive pairs. The hardness of positive pair is progressively increased by decreasing the similarity between a positive pair. In this way, the main model learns to cluster hard positives by pulling the representations of similar yet distinct samples together, by which the representations of similar samples are well-clustered and better representations can be learned. Comprehensive experiments show superior accuracy of the proposed approaches over the state-of-the-art on multiple datasets. For example, about 5% and 6% improvements are achieved on CIFAR-10/100 and ImageNet-100/10 with limited training data, respectively. 
Besides, about 2x data efficiency is achieved when reaching a similar accuracy as other methods.", "keywords": "Contrastive learning;Data-efficient learning;Hard sample generation", "primary_area": "", "supplementary_material": "", "author": "Yawen Wu;Zhepeng Wang;Dewen Zeng;Yiyu Shi;Jingtong Hu", "authorids": "~Yawen_Wu1;~Zhepeng_Wang1;~Dewen_Zeng1;~Yiyu_Shi1;~Jingtong_Hu1", "gender": "M;M;M;M;M", "homepage": "https://sites.google.com/view/yawenwu;;https://scholar.google.com/citations?user=RpJ5nSsAAAAJ&hl=en&authuser=1;;http://www.pitt.edu/~jthu/index.html", "dblp": "230/8649;242/8456;;94/5536;37/3401", "google_scholar": "73k09jEAAAAJ;JyPU5aEAAAAJ;RpJ5nSsAAAAJ;;OcWo8CYAAAAJ", "orcid": ";;;;0000-0003-4029-4034", "linkedin": "yawenwu06/;zhepeng-wang/;;;", "or_profile": "~Yawen_Wu1;~Zhepeng_Wang1;~Dewen_Zeng1;~Yiyu_Shi1;~Jingtong_Hu1", "aff": "University of Pittsburgh;George Mason University;University of Notre Dame;University of Notre Dame;University of Pittsburgh", "aff_domain": "pitt.edu;gmu.edu;nd.edu;nd.edu;pitt.edu", "position": "PhD student;PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@misc{\nwu2022dataefficient,\ntitle={Data-Efficient Contrastive Learning by Differentiable Hard Sample and Hard Positive Pair Generation},\nauthor={Yawen Wu and Zhepeng Wang and Dewen Zeng and Yiyu Shi and Jingtong Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=lEXrEcrbmV}\n}", "github": "", "project": "", "reviewers": "jXLZ;h1Up;Y1EM;siCV", "site": "https://openreview.net/forum?id=lEXrEcrbmV", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "69;102;70;86", "wc_summary_review": "31;64;19;4", "wc_main_review": "488;147;261;309", "wc_review": "588;313;350;399", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 81.75, 13.497684986693088 ], "wc_summary_review_avg": [ 29.5, 22.096379793984354 ], "wc_main_review_avg": [ 301.25, 122.82991288770012 ], "wc_review_avg": [ 412.5, 105.81705911619355 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yJTfFUBFd1cJ:scholar.google.com/&scioq=Data-Efficient+Contrastive+Learning+by+Differentiable+Hard+Sample+and+Hard+Positive+Pair+Generation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Pittsburgh;George Mason University;University of Notre Dame", "aff_unique_dep": ";;", "aff_unique_url": "https://www.pitt.edu;https://www.gmu.edu;https://www.nd.edu", "aff_unique_abbr": "Pitt;GMU;Notre Dame", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "lEoFUoMH2Uu", "title": "Foreground-attention in neural decoding: Guiding Loop-Enc-Dec to reconstruct visual stimulus images from fMRI", "track": "main", "status": "Reject", "tldr": 
"", "abstract": "The reconstruction of visual stimulus images from functional Magnetic Resonance Imaging (fMRI) has received extensive attention in recent years, which provides a possibility to interpret the human brain. Due to the high-dimensional and high-noise characteristics of fMRI data, how to extract stable, reliable and useful information from fMRI data for image reconstruction has become a challenging problem. Inspired by the mechanism of human visual attention, in this paper, we propose a novel method of reconstructing visual stimulus images, which first decodes the distribution of visual attention from fMRI, and then reconstructs the visual images guided by visual attention. We define visual attention as foreground attention (F-attention). Because the human brain is strongly wound into sulci and gyri, some spatially adjacent voxels are not connected in practice. Therefore, it is necessary to consider the global information when decoding fMRI, so we introduce the self-attention module for capturing global information into the process of decoding F-attention. In addition, in order to obtain more loss constraints in the training process of encoder-decoder, we also propose a new training strategy called Loop-Enc-Dec. The experimental results show that the F-attention decoder decodes the visual attention from fMRI successfully, and the Loop-Enc-Dec guided by F-attention can also well reconstruct the visual stimulus images.", "keywords": "neural decoding;visual stimulus image reconstruction;visual attention;encoder-decoder;fMRI", "primary_area": "", "supplementary_material": "", "author": "Kai Chen;Yongqiang Ma;Mingyang Sheng;Nanning Zheng", "authorids": "~Kai_Chen14;musayq@xjtu.edu.cn;smysmy2016@stu.xjtu.edu.cn;~Nanning_Zheng1", "gender": ";;;M", "homepage": ";;;", "dblp": ";;;07/256-1", "google_scholar": ";;;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Kai_Chen14;musayq@xjtu.edu.cn;smysmy2016@stu.xjtu.edu.cn;~Nanning_Zheng1", "aff": ";;;Xi'an Jiaotong University", "aff_domain": ";;;xjtu.edu.cn", "position": ";;;Full Professor", "bibtex": "@misc{\nchen2022foregroundattention,\ntitle={Foreground-attention in neural decoding: Guiding Loop-Enc-Dec to reconstruct visual stimulus images from f{MRI}},\nauthor={Kai Chen and Yongqiang Ma and Mingyang Sheng and Nanning Zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=lEoFUoMH2Uu}\n}", "github": "", "project": "", "reviewers": "UCCu;Pz1R;48tB", "site": "https://openreview.net/forum?id=lEoFUoMH2Uu", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;3;5", "correctness": "3;3;2", "technical_novelty": "2;2;2", "empirical_novelty": "2;1;3", "wc_summary_paper": "69;29;57", "wc_summary_review": "65;33;47", "wc_main_review": "384;152;539", "wc_review": "518;214;643", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1316;331;1055", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 51.666666666666664, 16.75974011996871 ], "wc_summary_review_avg": [ 48.333333333333336, 13.097921802925667 ], "wc_main_review_avg": [ 358.3333333333333, 159.03109409448484 ], "wc_review_avg": [ 458.3333333333333, 180.14870400742703 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": 
[ 900.6666666666666, 416.66959998967474 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": -1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13809246757899765864&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0", "aff_unique_norm": "Xi'an Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "lGRG9TxQ3x", "title": "Feature Grinding: Efficient Backdoor Sanitation in Deep Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Training deep neural networks (DNNs) is expensive and for this reason, third parties provide computational resources to train models. This makes DNNs vulnerable to backdoor attacks, in which the third party maliciously injects hidden functionalities in the model at training time. Removing a backdoor is challenging because although the defender has access to a clean, labeled dataset, they only have limited computational resources which are a fraction of the resources required to train a model from scratch. We propose Feature Grinding as an efficient, randomized backdoor sanitation technique against seven contemporary backdoors on CIFAR-10 and ImageNet. Feature Grinding requires at most six percent of the model's training time on CIFAR-10 and at most two percent on ImageNet for sanitizing the surveyed backdoors. We compare Feature Grinding with five other sanitation methods and find that it is often the most effective at decreasing the backdoor's success rate while preserving a high model accuracy. Our experiments include an ablation study over multiple parameters for each backdoor attack and sanitation technique to ensure a fair evaluation of all methods. 
Models suspected of containing a backdoor can be Feature Grinded using limited resources, which makes it a practical defense against backdoors that can be incorporated into any standard training procedure.", "keywords": "Backdoor Sanitation;Deep Neural Network Security;Feature Grinding", "primary_area": "", "supplementary_material": "", "author": "Nils Lukas;Charles Zhang;Florian Kerschbaum", "authorids": "~Nils_Lukas1;charles.zhang@uwaterloo.ca;~Florian_Kerschbaum1", "gender": "M;;", "homepage": "https://nilslukas.github.io;;", "dblp": ";;", "google_scholar": "https://scholar.google.com/citations?hl=de;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Nils_Lukas1;charles.zhang@uwaterloo.ca;~Florian_Kerschbaum1", "aff": "University of Waterloo;;", "aff_domain": "uwaterloo.ca;;", "position": "PhD student;;", "bibtex": "@misc{\nlukas2022feature,\ntitle={Feature Grinding: Efficient Backdoor Sanitation in Deep Neural Networks},\nauthor={Nils Lukas and Charles Zhang and Florian Kerschbaum},\nyear={2022},\nurl={https://openreview.net/forum?id=lGRG9TxQ3x}\n}", "github": "", "project": "", "reviewers": "srNE;WPdD;Y5LY;Ch82", "site": "https://openreview.net/forum?id=lGRG9TxQ3x", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;4", "correctness": "2;2;2;2", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "66;67;69;108", "wc_summary_review": "11;36;29;36", "wc_main_review": "282;419;495;531", "wc_review": "359;522;593;675", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.5, 17.64227876437735 ], "wc_summary_review_avg": [ 28.0, 10.222524150130436 ], "wc_main_review_avg": [ 431.75, 95.44468293205232 ], "wc_review_avg": [ 537.25, 116.28494098549477 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GRmpT7tjmz0J:scholar.google.com/&scioq=Feature+Grinding:+Efficient+Backdoor+Sanitation+in+Deep+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Waterloo", "aff_unique_dep": "", "aff_unique_url": "https://uwaterloo.ca", "aff_unique_abbr": "UW", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "lKcq2fe-HB", "title": "Metrics Matter: A Closer Look on Self-Paced Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Curriculum reinforcement learning (CRL) allows to solve complex tasks by generating a tailored sequence of learning tasks, starting from easy ones and subsequently increasing their difficulty. However, the generation of such task sequences is largely governed by application assumptions, often preventing a theoretical investigation of existing approaches. Recently, Klink et al. (2021) showed how self-paced learning induces a principled interpolation between task distributions in the context of RL, resulting in high learning performance. 
So far, this interpolation is unfortunately limited to Gaussian distributions. Here, we show that, on the one hand, this parametric restriction is insufficient in many learning cases, but that, on the other, the interpolation of self-paced RL (SPRL) can be degenerate when not restricted to this parametric form. We show that the introduction of concepts from optimal transport into SPRL prevents the aforementioned issues. Experiments demonstrate that the resulting introduction of metric structure into the curriculum allows for a well-behaved non-parametric version of SPRL that leads to stable learning performance across tasks.", "keywords": "Curriculum Learning;Reinforcement Learning;Self-Paced Learning", "primary_area": "", "supplementary_material": "/attachment/778ea618bfdf2ff6bcc900155e53c0097e7f82e7.zip", "author": "Pascal Klink;Haoyi Yang;Jan Peters;Joni Pajarinen", "authorids": "~Pascal_Klink2;~Haoyi_Yang1;~Jan_Peters3;~Joni_Pajarinen2", "gender": "M;M;M;", "homepage": ";https://github.com/haoyi-yang;https://www.jan-peters.net;", "dblp": ";;p/JanPeters1;23/8355", "google_scholar": "https://scholar.google.de/citations?user=ZjqU_KwAAAAJ;;https://scholar.google.de/citations?user=-kIVAcAAAAAJ;https://scholar.google.fi/citations?user=-2fJStwAAAAJ", "orcid": ";;0000-0002-5266-8091;0000-0003-4469-8191", "linkedin": ";;janrpeters/;", "or_profile": "~Pascal_Klink2;~Haoyi_Yang1;~Jan_Peters3;~Joni_Pajarinen2", "aff": "TU Darmstadt;;TU Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": "tu-darmstadt.de;;tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;;Full Professor;Researcher", "bibtex": "@misc{\nklink2022metrics,\ntitle={Metrics Matter: A Closer Look on Self-Paced Reinforcement Learning},\nauthor={Pascal Klink and Haoyi Yang and Jan Peters and Joni Pajarinen},\nyear={2022},\nurl={https://openreview.net/forum?id=lKcq2fe-HB}\n}", "github": "", "project": "", "reviewers": "5LxC;w8rz;zmCP", "site": "https://openreview.net/forum?id=lKcq2fe-HB", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "3;4;2", "correctness": "3;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "88;65;117", "wc_summary_review": "129;62;64", "wc_main_review": "763;820;254", "wc_review": "980;947;435", "wc_reply_reviewers": "302;182;0", "wc_reply_authors": "1689;1337;895", "reply_reviewers": "1;1;0", "reply_authors": "3;2;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 90.0, 21.275964529643932 ], "wc_summary_review_avg": [ 85.0, 31.123410267299864 ], "wc_main_review_avg": [ 612.3333333333334, 254.44623968313797 ], "wc_review_avg": [ 787.3333333333334, 249.5012803351696 ], "wc_reply_reviewers_avg": [ 161.33333333333334, 124.1540262022228 ], "wc_reply_authors_avg": [ 1307.0, 324.8425259516781 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8414390753437205227&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische 
Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darmstadt;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "lKrchawH4sB", "title": "Heterologous Normalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Batch Normalization has become a standard technique for training modern deep networks. However, its effectiveness diminishes when the batch size becomes smaller since the batch statistics estimation becomes inaccurate. This paper proposes Heterologous Normalization, which computes normalization's mean and standard deviation from different pixel sets to take advantage of different normalization methods. Specifically, it calculates the mean like Batch Normalization to maintain the advantage of Batch Normalization. Meanwhile, it enlarges the number of pixels from which the standard deviation is calculated, thus alleviating the problem caused by the small batch size. Experiments show that Heterologous Normalization surpasses or achieves comparable performance to existing homologous methods, with large or small batch sizes on various datasets.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9caf226e4755b5c03d32e343b1d25fb07bf6b2e2.zip", "author": "Chunjie Luo;Jianfeng Zhan;Lei Wang;Wanling Gao", "authorids": "~Chunjie_Luo1;~Jianfeng_Zhan2;~Lei_Wang17;~Wanling_Gao1", "gender": "M;M;;F", "homepage": ";https://www.zhanjianfeng.org;https://www.benchcouncil.org/wl.html;http://www.benchcouncil.org/wlgao/", "dblp": "https://dblp.uni-trier.de/pers/hd/l/Luo:Chunjie;;w/LeiWang4.html;", "google_scholar": "6zI9GL4AAAAJ;eqwfFyYAAAAJ;;knkGjEMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Chunjie_Luo1;~Jianfeng_Zhan2;~Lei_Wang17;~Wanling_Gao1", "aff": "Institute of Computing Technology, Chinese Academy of Sciences;Chinese Acaemy of Sciences and University of Chineses Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;cas.ac.cn;ict.ac.cn;ict.ac.cn", "position": "PhD student;Full Professor;Associate Professor;Associate Professor", "bibtex": "@misc{\nluo2022heterologous,\ntitle={Heterologous Normalization},\nauthor={Chunjie Luo and Jianfeng Zhan and Lei Wang and Wanling Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=lKrchawH4sB}\n}", "github": "", "project": "", "reviewers": "Zi9s;5kEH;cQjf;fNu6", "site": "https://openreview.net/forum?id=lKrchawH4sB", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "56;24;80;93", "wc_summary_review": "16;42;159;26", "wc_main_review": "133;433;309;343", "wc_review": "205;499;548;462", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 63.25, 26.261902063635834 ], "wc_summary_review_avg": [ 60.75, 57.4777130721117 ], "wc_main_review_avg": [ 304.5, 108.88870464837021 ], "wc_review_avg": [ 428.5, 132.5943060617612 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], 
"reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iNO815glVfwJ:scholar.google.com/&scioq=Heterologous+Normalization&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Institute of Computing Technology", "aff_unique_url": "http://www.ict.ac.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Open-vocabulary Object Detection via Vision and Language Knowledge Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6372", "id": "lL3lnMbR4WU", "poster": "", "openreview": "https://openreview.net/forum?id=lL3lnMbR4WU", "slides": "https://iclr.cc/virtual/2022/poster/6372", "video": "https://iclr.cc/virtual/2022/poster/6372", "author_site": "Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, Yin Cui", "tldr": "", "abstract": "We aim at advancing open-vocabulary object detection, which detects objects described by arbitrary text inputs. The fundamental challenge is the availability of training data. It is costly to further scale up the number of classes contained in existing object detection datasets. To overcome this challenge, we propose ViLD, a training method via Vision and Language knowledge Distillation. Our method distills the knowledge from a pretrained open-vocabulary image classification model (teacher) into a two-stage detector (student). Specifically, we use the teacher model to encode category texts and image regions of object proposals. Then we train a student detector, whose region embeddings of detected boxes are aligned with the text and image embeddings inferred by the teacher. We benchmark on LVIS by holding out all rare categories as novel categories that are not seen during training. ViLD obtains 16.1 mask APr with a ResNet-50 backbone, even outperforming the supervised counterpart by 3.8. When trained with a stronger teacher model ALIGN, ViLD achieves 26.3 APr. The model can directly transfer to other datasets without finetuning, achieving 72.2 AP50 on PASCAL VOC, 36.6 AP on COCO and 11.8 AP on Objects365. On COCO, ViLD outperforms the previous state-of-the-art (Zareian et al., 2021) by 4.8 on novel AP and 11.4 on overall AP. 
Code and demo are open-sourced at https://github.com/tensorflow/tpu/tree/master/models/official/detection/projects/vild.", "keywords": "Open-vocabulary recognition;Object detection;Knowledge distillation", "primary_area": "", "supplementary_material": "", "author": "Xiuye Gu;Tsung-Yi Lin;Weicheng Kuo;Yin Cui", "authorids": "~Xiuye_Gu1;~Tsung-Yi_Lin4;~Weicheng_Kuo1;~Yin_Cui1", "gender": "F;M;M;M", "homepage": "https://laoreja.github.io/;https://weichengkuo.github.io/;https://ycui.me/;https://tsungyilin.info", "dblp": "199/1920;163/2203;47/8023.html;47/8105", "google_scholar": "qCrypnoAAAAJ;;iP5m52IAAAAJ;_BPdgV0AAAAJ", "orcid": ";;0000-0003-2882-2033;0000-0003-4819-0627", "linkedin": "xiuyegu/;;;tsung-yi-lin-48a4b541/", "or_profile": "~Xiuye_Gu1;~Weicheng_Kuo1;~Yin_Cui1;~Tsung-Yi_Lin3", "aff": "Google;Google Deepmind;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com", "position": "Researcher;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\ngu2022openvocabulary,\ntitle={Open-vocabulary Object Detection via Vision and Language Knowledge Distillation},\nauthor={Xiuye Gu and Tsung-Yi Lin and Weicheng Kuo and Yin Cui},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lL3lnMbR4WU}\n}", "github": "", "project": "", "reviewers": "6g5Q;LJFG;Vqs3", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "5;5;4", "correctness": "3;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;4;4", "wc_summary_paper": "121;126;113", "wc_summary_review": "93;34;37", "wc_main_review": "741;413;705", "wc_review": "955;573;855", "wc_reply_reviewers": "51;22;219", "wc_reply_authors": "1406;638;755", "reply_reviewers": "1;1;1", "reply_authors": "3;1;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 120.0, 5.354126134736337 ], "wc_summary_review_avg": [ 54.666666666666664, 27.13341523329163 ], "wc_main_review_avg": [ 619.6666666666666, 146.87258272242494 ], "wc_review_avg": [ 794.3333333333334, 161.74328081527494 ], "wc_reply_reviewers_avg": [ 97.33333333333333, 86.84213007265285 ], "wc_reply_authors_avg": [ 933.0, 337.85499848307705 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 1091, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15143736984694807048&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=lL3lnMbR4WU", "email": "google.com;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google;DeepMind", "aff_unique_url": "https://www.google.com;https://deepmind.com", "aff_unique_abbr": "Google;DeepMind", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "lNreaMZf9X", "title": "Learning Dynamics Models for Model Predictive Agents", "track": "main", "status": "Reject", "tldr": "", "abstract": 
"Model-Based Reinforcement Learning involves learning a dynamics model from data, and then using this model to optimise behaviour, most often with an online planner. Much of the recent research along these lines presents a particular set of design choices, involving problem definition, model learning and planning. Given the multiple contributions, it is difficult to evaluate the effects of each. This paper sets out to disambiguate the role of different design choices for learning dynamics models, by comparing their performance to planning with a ground-truth model -- the simulator. First, we collect a rich dataset from the training sequence of a model-free agent on 5 domains of the DeepMind Control Suite. Second, we train feed-forward dynamics models in a supervised fashion, and evaluate planner performance while varying and analysing different model design choices, including ensembling, stochasticity, multi-step training and timestep size. Besides the quantitative analysis, we describe a set of qualitative findings, rules of thumb, and future research directions for planning with learned dynamics models. Videos of the results are available at https://sites.google.com/view/learning-better-models.", "keywords": "Model Learning;Model Based Reinforcement Learning;Control", "primary_area": "", "supplementary_material": "", "author": "Michael Lutter;Leonard Hasenclever;Arunkumar Byravan;Gabriel Dulac-Arnold;Piotr Trochim;Nicolas Heess;Josh Merel;Yuval Tassa", "authorids": "~Michael_Lutter1;~Leonard_Hasenclever1;~Arunkumar_Byravan1;~Gabriel_Dulac-Arnold1;~Piotr_Trochim1;~Nicolas_Heess1;~Josh_Merel1;~Yuval_Tassa2", "gender": "M;M;M;M;;;;M", "homepage": "http://mlutter.eu;;https://homes.cs.washington.edu/~barun/;http://gabe.squirrelsoup.net;http://deepmind.com;;;", "dblp": ";150/1667;151/9400;58/9457;;76/9181;139/1361;20/4415", "google_scholar": "https://scholar.google.de/citations?user=Wvdo5bYAAAAJ;https://scholar.google.co.uk/citations?user=dD-3S4QAAAAJ;obYwWiMAAAAJ;https://scholar.google.fr/citations?user=KxaYraAAAAAJ;;79k7bGEAAAAJ;https://scholar.google.co.uk/citations?user=K4OcFXUAAAAJ;https://scholar.google.co.uk/citations?user=CjOTm_4AAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Michael_Lutter1;~Leonard_Hasenclever1;~Arunkumar_Byravan1;~Gabriel_Dulac-Arnold1;~Piotr_Trochim1;~Nicolas_Heess1;~Josh_Merel1;~yuval_tassa1", "aff": "Boston Dynamics;Google DeepMind;Google;Google Research;;Google DeepMind;Meta Reality Labs;Google", "aff_domain": "bostondynamics.com;google.com;google.com;google.com;;google.com;fb.com;google.com", "position": "Researcher;Research Scientist;Research Scientist;Researcher;;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nlutter2022learning,\ntitle={Learning Dynamics Models for Model Predictive Agents},\nauthor={Michael Lutter and Leonard Hasenclever and Arunkumar Byravan and Gabriel Dulac-Arnold and Piotr Trochim and Nicolas Heess and Josh Merel and Yuval Tassa},\nyear={2022},\nurl={https://openreview.net/forum?id=lNreaMZf9X}\n}", "github": "", "project": "", "reviewers": "xfZq;nmSn;PwSp;sY5Q", "site": "https://openreview.net/forum?id=lNreaMZf9X", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;4;4;3", "correctness": "3;2;4;3", "technical_novelty": "2;3;1;2", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "48;110;90;36", "wc_summary_review": "877;144;136;48", "wc_main_review": "240;392;305;629", "wc_review": "1165;646;531;713", "wc_reply_reviewers": "178;137;109;223", "wc_reply_authors": "0;0;170;708", 
"reply_reviewers": "1;1;1;2", "reply_authors": "0;0;2;3", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 71.0, 30.14962686336267 ], "wc_summary_review_avg": [ 301.25, 334.5365264063104 ], "wc_main_review_avg": [ 391.5, 147.3439852861324 ], "wc_review_avg": [ 763.75, 240.63185055183365 ], "wc_reply_reviewers_avg": [ 161.75, 43.04285655018728 ], "wc_reply_authors_avg": [ 219.5, 290.4492210352784 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.28867513459481287, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17473149843131528505&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;1;2;1", "aff_unique_norm": "Boston Dynamics;Google;Meta", "aff_unique_dep": ";Google DeepMind;Meta Reality Labs", "aff_unique_url": "https://www.bostondynamics.com;https://deepmind.com;https://www.meta.com", "aff_unique_abbr": "BD;DeepMind;MRL", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "lP11WtZwquE", "title": "Language Model Pre-training on True Negatives", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discriminative pre-trained language models (PrLMs) learn to predict original texts from intentionally corrupted ones. Taking the former text as positive and the latter as negative samples, the discriminative PrLM can be trained effectively for contextualized representation. However, though the training of such a type of PrLMs highly relies on the quality of the automatically constructed samples, existing PrLMs simply treat all corrupted texts as equal negative without any examination, which actually lets the resulting model inevitably suffer from the false negative issue where training is carried out on wrong data and leads to less efficiency and less robustness in the resulting PrLMs.\nThus in this work, on the basis of defining the false negative issue in discriminative PrLMs that has been ignored for a long time, we design enhanced pre-training methods to counteract false negative predictions and encourage pre-training language models on true negatives, by correcting the harmful gradient updates subject to false negative predictions. 
Experimental results on GLUE and SQuAD benchmarks show that our counter-false-negative pre-training methods indeed bring about better performance together with stronger robustness.", "keywords": "Pre-trained Language Models;Masked Language Modeling;False Negatives;Natural Language Understanding", "primary_area": "", "supplementary_material": "", "author": "Zhuosheng Zhang;hai zhao;Masao Utiyama;Eiichiro Sumita", "authorids": "~Zhuosheng_Zhang1;~hai_zhao1;~Masao_Utiyama2;~Eiichiro_Sumita1", "gender": "M;M;M;", "homepage": "https://bcmi.sjtu.edu.cn/~zhangzs/;http://bcmi.sjtu.edu.cn/~zhaohai/;http://www2.nict.go.jp/astrec-att/member/mutiyama/;", "dblp": "06/9708;25/1145-1.html;76/5745.html;95/5465", "google_scholar": "https://scholar.google.co.jp/citations?user=63LTQhgAAAAJ;https://scholar.google.com.tw/citations?user=4dU5KS0AAAAJ;artIO6gAAAAJ;", "orcid": "0000-0002-4183-3645;;;", "linkedin": ";;;", "or_profile": "~Zhuosheng_Zhang1;~hai_zhao1;~Masao_Utiyama2;~Eiichiro_Sumita1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology;", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;nict.go.jp;", "position": "PhD student;Full Professor;Researcher;", "bibtex": "@misc{\nzhang2022language,\ntitle={Language Model Pre-training on True Negatives},\nauthor={Zhuosheng Zhang and hai zhao and Masao Utiyama and Eiichiro Sumita},\nyear={2022},\nurl={https://openreview.net/forum?id=lP11WtZwquE}\n}", "github": "", "project": "", "reviewers": "QW2m;dhbH;1R2h;jW6M;syob", "site": "https://openreview.net/forum?id=lP11WtZwquE", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;4;4;3;4", "correctness": "2;2;3;3;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;2;2;3;2", "wc_summary_paper": "133;67;163;100;87", "wc_summary_review": "31;26;13;30;43", "wc_main_review": "304;150;260;203;156", "wc_review": "468;243;436;333;286", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "481;274;405;255;148", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 110.0, 34.10571799566753 ], "wc_summary_review_avg": [ 28.6, 9.645724441429994 ], "wc_main_review_avg": [ 214.6, 59.657690200006904 ], "wc_review_avg": [ 353.2, 86.1426723523249 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 312.6, 117.30063938444665 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16373105254005293169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;National Institute of Information and Communications Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.nict.go.jp/", "aff_unique_abbr": "SJTU;NICT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;Japan" }, { "title": "Towards Model Agnostic Federated Learning Using Knowledge Distillation", "status": "Poster", 
"track": "main", "site": "https://iclr.cc/virtual/2022/poster/6644", "id": "lQI_mZjvBxj", "poster": "", "openreview": "https://openreview.net/forum?id=lQI_mZjvBxj", "slides": "https://iclr.cc/virtual/2022/poster/6644", "video": "https://iclr.cc/virtual/2022/poster/6644", "author_site": "Andrei Afonin, Sai Karimireddy", "tldr": "", "abstract": "Is it possible to design an universal API for federated learning using which an ad-hoc group of data-holders (agents) collaborate with each other and perform federated learning? Such an API would necessarily need to be model-agnostic i.e. make no assumption about the model architecture being used by the agents, and also cannot rely on having representative public data at hand. Knowledge distillation (KD) is the obvious tool of choice to design such protocols. However, surprisingly, we show that most natural KD-based federated learning protocols have poor performance.\n \n To investigate this, we propose a new theoretical framework, Federated Kernel ridge regression, which can capture both model heterogeneity as well as data heterogeneity. Our analysis shows that the degradation is largely due to a fundamental limitation of knowledge distillation under data heterogeneity. We further validate our framework by analyzing and designing new protocols based on KD. Their performance on real world experiments using neural networks, though still unsatisfactory, closely matches our theoretical predictions. ", "keywords": "Federated Learning;Knowledge Distillation;Model Agnostic Communication;Kernel Regression", "primary_area": "", "supplementary_material": "/attachment/0cd2dbddff5125736a190d6c94a898e61d277dcc.zip", "author": "Andrei Afonin;Sai Praneeth Karimireddy", "authorids": "~Andrei_Afonin1;~Sai_Praneeth_Karimireddy1", "gender": "M;M", "homepage": ";https://spkreddy.org", "dblp": ";217/3342", "google_scholar": ";wKJeOQoAAAAJ", "orcid": ";", "linkedin": "andrei-afonin-6a76141b6/;", "or_profile": "~Andrei_Afonin1;~Sai_Praneeth_Karimireddy1", "aff": "Swiss Federal Institute of Technology Lausanne;University of California, Berkeley", "aff_domain": "epfl.ch;berkeley.edu", "position": "MS student;Postdoc", "bibtex": "@inproceedings{\nafonin2022towards,\ntitle={Towards Model Agnostic Federated Learning Using Knowledge Distillation},\nauthor={Andrei Afonin and Sai Praneeth Karimireddy},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lQI_mZjvBxj}\n}", "github": "", "project": "", "reviewers": "1yWX;fTfB;w5jn;mYLp", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;3;4", "correctness": "3;4;3;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;4;2;3", "wc_summary_paper": "150;114;79;106", "wc_summary_review": "120;53;53;61", "wc_main_review": "843;147;245;397", "wc_review": "1113;314;377;564", "wc_reply_reviewers": "0;25;0;0", "wc_reply_authors": "1557;250;150;433", "reply_reviewers": "0;1;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 112.25, 25.36114153582208 ], "wc_summary_review_avg": [ 71.75, 28.047950014216724 ], "wc_main_review_avg": [ 408.0, 266.47513955339247 ], "wc_review_avg": [ 592.0, 314.5369612621067 ], "wc_reply_reviewers_avg": [ 6.25, 10.825317547305483 ], "wc_reply_authors_avg": [ 597.5, 563.1858041534783 ], 
"reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.14002800840280097, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=482276680258740092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=lQI_mZjvBxj", "email": "epfl.ch;berkeley.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.berkeley.edu", "aff_unique_abbr": "EPFL;UC Berkeley", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Lausanne;Berkeley", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;United States" }, { "id": "lTiW8Jet8t", "title": "Efficient Ensembles of Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have enabled the power of deep learning to be applied to inputs beyond the Euclidean domain, with applications ranging from social networks and product recommendation engines to the life sciences. GNNs, like other classes of machine learning models, benefit from ensemble learning, wherein multiple models are combined to provide higher accuracy and robustness than single models. However, ensembles suffer from significantly higher inference processing and storage requirements, limiting their use in practical applications. In this work, we leverage the unique characteristics of GNNs to overcome these overheads, creating efficient ensemble GNNs that are faster than even single models at inference time. We observe that during message passing, nodes that are incorrectly classified (error nodes) also end up adversely affecting the representations of other nodes in their neighborhood. This error propagation also makes GNNs more difficult to approximate (e.g., through pruning) for efficient inference. We propose a technique to create ensembles of diverse models, and further propose Error Node Isolation (ENI), which prevents error nodes from sending messages to (and thereby influencing) other nodes. In addition to improving accuracy, ENI also leads to a significant reduction in the memory footprint and the number of arithmetic operations required to evaluate the computational graphs of all neighbors of error nodes. Remarkably, these savings outweigh even the overheads of using multiple models in the ensemble. A second key benefit of ENI is that it enhances the resilience of GNNs to approximations. Consequently, we propose Edge Pruning and Network Pruning techniques that target both the input graph and the neural networks used to process the graph. Our experiments on GNNs for transductive and inductive node classification demonstrate that ensembles with ENI are simultaneously more accurate (by up to 4.6% and 3.8%) and faster (by up to 2.8$\\times$ and 5.7$\\times$) when compared to the best-performing single models and ensembles without ENI, respectively. In addition, GNN ensembles with ENI are consistently more accurate than single models and ensembles without ENI when subject to pruning, leading to additional speedups of up to 5$\\times$ with no loss in accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amrit Nagarajan;Jacob R. 
Stevens;Anand Raghunathan", "authorids": "~Amrit_Nagarajan1;~Jacob_R._Stevens1;~Anand_Raghunathan1", "gender": "M;M;", "homepage": ";;https://engineering.purdue.edu/~araghu/", "dblp": ";;74/3747.html", "google_scholar": ";CagpctsAAAAJ;OP7F8jEAAAAJ", "orcid": ";;", "linkedin": "https://in.linkedin.com/in/amrit-nagarajan-8a99b0152;jacobrstevens/;", "or_profile": "~Amrit_Nagarajan1;~Jacob_R._Stevens1;~Anand_Raghunathan1", "aff": "Purdue University;Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu;purdue.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nnagarajan2022efficient,\ntitle={Efficient Ensembles of Graph Neural Networks},\nauthor={Amrit Nagarajan and Jacob R. Stevens and Anand Raghunathan},\nyear={2022},\nurl={https://openreview.net/forum?id=lTiW8Jet8t}\n}", "github": "", "project": "", "reviewers": "WrrA;k6FD;3J8U;Ep7R", "site": "https://openreview.net/forum?id=lTiW8Jet8t", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;3", "correctness": "2;2;3;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "65;298;46;157", "wc_summary_review": "20;88;69;65", "wc_main_review": "90;1097;181;514", "wc_review": "175;1483;296;736", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 141.5, 99.63056759850362 ], "wc_summary_review_avg": [ 60.5, 24.944939366532843 ], "wc_main_review_avg": [ 470.5, 394.64699416060427 ], "wc_review_avg": [ 672.5, 512.3965749300048 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2298754274535492076&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Phenomenology of Double Descent in Finite-Width Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6861", "id": "lTqGXfn9Tv", "poster": "", "openreview": "https://openreview.net/forum?id=lTqGXfn9Tv", "slides": "https://iclr.cc/virtual/2022/poster/6861", "video": "https://iclr.cc/virtual/2022/poster/6861", "author_site": "Sidak Pal Singh, Aurelien Lucchi, Thomas Hofmann, Bernhard Schoelkopf", "tldr": "", "abstract": "`Double descent' delineates the generalization behaviour of models depending on the regime they belong to: under- or over-parameterized. The current theoretical understanding behind the occurrence of this phenomenon is primarily based on linear and kernel regression models --- with informal parallels to neural networks via the Neural Tangent Kernel. 
Therefore, such analyses do not adequately capture the mechanisms behind double descent in finite-width neural networks and disregard crucial components --- such as the choice of the loss function. We address these shortcomings by leveraging influence functions in order to derive suitable expressions of the population loss and its lower bound, while imposing minimal assumptions on the form of the parametric model. Our derived bounds bear an intimate connection with the spectrum of the Hessian at the optimum, and importantly, exhibit a double descent behaviour at the interpolation threshold. Building on our analysis, we further investigate how the loss function affects double descent --- and thus uncover interesting properties of neural networks and their Hessian spectra near the interpolation threshold.", "keywords": "double descent;generalization;neural networks;hessian;flatness", "primary_area": "", "supplementary_material": "", "author": "Sidak Pal Singh;Aurelien Lucchi;Thomas Hofmann;Bernhard Sch\u00f6lkopf", "authorids": "~Sidak_Pal_Singh1;~Aurelien_Lucchi1;~Thomas_Hofmann1;~Bernhard_Sch\u00f6lkopf1", "gender": ";M;M;", "homepage": "http://sidakpal.com/;http://people.inf.ethz.ch/alucchi/;http://www.da.inf.ethz.ch/;", "dblp": "189/9168;14/5780;h/ThHofmann;", "google_scholar": "c59mPS4AAAAJ;https://scholar.google.ch/citations?user=V1ONSgIAAAAJ;T3hAyLkAAAAJ;", "orcid": ";;;", "linkedin": ";;thomas-hofmann-1ab2402/;", "or_profile": "~Sidak_Pal_Singh1;~Aurelien_Lucchi1;~Thomas_Hofmann1;~Bernhard_Sch\u00f6lkopf1", "aff": "Max Planck Institute for Intelligent Systems;University of Basel;Swiss Federal Institute of Technology;", "aff_domain": "tuebingen.mpg.de;unibas.ch;ethz.ch;", "position": "PhD student;Assistant Professor;Full Professor;", "bibtex": "@inproceedings{\nsingh2022phenomenology,\ntitle={Phenomenology of Double Descent in Finite-Width Neural Networks},\nauthor={Sidak Pal Singh and Aurelien Lucchi and Thomas Hofmann and Bernhard Sch{\\\"o}lkopf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lTqGXfn9Tv}\n}", "github": "", "project": "", "reviewers": "E8Hk;hRBt;SyuK;mefD;hctT", "pdf_size": 0, "recommendation": "3;8;8;8;8", "confidence": "3;4;4;4;4", "correctness": "2;3;3;3;4", "technical_novelty": "1;3;4;4;3", "empirical_novelty": "2;3;2;3;3", "wc_summary_paper": "65;106;166;140;134", "wc_summary_review": "46;63;100;42;876", "wc_main_review": "418;663;692;413;27", "wc_review": "529;832;958;595;1037", "wc_reply_reviewers": "0;588;109;21;0", "wc_reply_authors": "2009;2695;1410;429;477", "reply_reviewers": "0;4;1;1;0", "reply_authors": "3;5;3;1;1", "recommendation_avg": [ 7.0, 2.0 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 3.0, 1.0954451150103321 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 122.2, 34.376736319784634 ], "wc_summary_review_avg": [ 225.4, 325.9445351589746 ], "wc_main_review_avg": [ 442.6, 238.73885314292687 ], "wc_review_avg": [ 790.2, 198.56626098106395 ], "wc_reply_reviewers_avg": [ 143.6, 225.81461423034602 ], "wc_reply_authors_avg": [ 1404.0, 876.6636755335537 ], "reply_reviewers_avg": [ 1.2, 1.4696938456699067 ], "reply_authors_avg": [ 2.6, 1.4966629547095767 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.7905694150420948, "gs_citation": 13, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=2440287666380623574&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=lTqGXfn9Tv", "email": "tuebingen.mpg.de;unibas.ch;ethz.ch;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Basel;Swiss Federal Institute of Technology", "aff_unique_dep": "Intelligent Systems;;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.unibas.ch;https://www.ethz.ch", "aff_unique_abbr": "MPI-IS;UniBas;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Germany;Switzerland" }, { "id": "lUyvp-6V9G", "title": "Multi-Vector Embedding on Networks with Taxonomies", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Networks serve as efficient tools to describe close relationships among nodes. Taxonomies consist of labels organized into hierarchical structures and are often employed to describe rich attributes of the network nodes. Existing methods that co-embed nodes and labels in a low-dimensional space all encounter an obstacle called under-fitting, which occurs when the vector of a node is obliged to fit all its labels and neighbor nodes. In this paper, we propose HIerarchical Multi-vector Embedding (HIME), which allows multiple vectors of a node to fit different sets of its labels in a Poincare ball, where the label hierarchy is well preserved. Experiments show that HIME has comprehensive advantages over existing network embedding methods in preserving both node-node and node-label relationships. ", "keywords": "network embedding;heterogeneous network embedding;hyperbolic space", "primary_area": "", "supplementary_material": "/attachment/00af1dd1887643eac2ff8e694e4ef73c8b4d9e86.zip", "author": "Yue Fan;Xiuli Ma", "authorids": "~Yue_Fan2;~Xiuli_Ma1", "gender": ";F", "homepage": "https://yuefan1014.github.io/;http://sai.pku.edu.cn/info/1362/2239.htm", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": ";", "or_profile": "~Yue_Fan2;~Xiuli_Ma1", "aff": "Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nfan2022multivector,\ntitle={Multi-Vector Embedding on Networks with Taxonomies},\nauthor={Yue Fan and Xiuli Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=lUyvp-6V9G}\n}", "github": "", "project": "", "reviewers": "ZQi4;CaDs;fEdj;R9zG", "site": "https://openreview.net/forum?id=lUyvp-6V9G", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "46;35;83;48", "wc_summary_review": "21;25;95;44", "wc_main_review": "168;279;449;153", "wc_review": "235;339;627;245", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 53.0, 18.01388353465182 ], "wc_summary_review_avg": [ 46.25, 29.45653577731095 ], "wc_main_review_avg": [ 262.25, 118.2949174732372 ], "wc_review_avg": [ 361.5, 158.56465558250994 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], 
"reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13610915998760946255&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "lVRfcp9ZEB_", "title": "IsoScore: Measuring the Uniformity of Vector Space Utilization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The recent success of distributed word representations has led to an increased interest in analyzing the properties of their spatial distribution. Current metrics suggest that contextualized word embedding models do not uniformly utilize all available dimensions when embedding tokens in vector space. Previous works argue that encouraging isotropy in embedding space corresponds to improved performance on downstream tasks. However, existing metrics---average random cosine similarity, for example---do not properly measure isotropy and tend to obscure the true spatial distribution of point clouds. To address this issue, we propose IsoScore: a novel metric that quantifies the degree to which a point cloud uniformly utilizes the ambient vector space. We demonstrate that IsoScore has several desirable properties, such as mean invariance and direct correspondence to the number of dimensions used that existing scores do not possess.\nFurthermore, IsoScore is conceptually intuitive, making it well suited for analyzing the distribution of arbitrary point clouds in vector space, not necessarily limited to point clouds of word embeddings alone. 
We conclude by using IsoScore to demonstrate that a number of recent conclusions in the NLP literature that have been derived using brittle metrics of spatial distribution may be incomplete or altogether inaccurate.", "keywords": "Contextualized Word Embeddings;Isotropy;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "William Rudman;Nate Gillman;Taylor Rayne;Carsten Eickhoff", "authorids": "~William_Rudman1;~Nate_Gillman1;~Taylor_Rayne2;~Carsten_Eickhoff1", "gender": "M;;M;F", "homepage": ";https://www.nategillman.com/;https://health-nlp.org;https://github.com/bcbi-edu/p_eickhoff_isoscore", "dblp": "299/8116;299/8280;42/8700;", "google_scholar": "https://scholar.google.com/citations?hl=en;twg9zD0AAAAJ;QQi1_rAAAAAJ;", "orcid": ";;0000-0001-9895-4061;", "linkedin": ";ngillman/;;taylor-walters-38a242167/", "or_profile": "~William_Rudman1;~Nate_Gillman1;~Carsten_Eickhoff1;~Taylor_Rayne1", "aff": "Brown University;Brown University;Brown University;Quest University Canada", "aff_domain": "brown.edu;brown.edu;brown.edu;quest.ca", "position": "PhD student;PhD student;Assistant Professor;Undergrad student", "bibtex": "@misc{\nrudman2022isoscore,\ntitle={IsoScore: Measuring the Uniformity of Vector Space Utilization},\nauthor={William Rudman and Nate Gillman and Taylor Rayne and Carsten Eickhoff},\nyear={2022},\nurl={https://openreview.net/forum?id=lVRfcp9ZEB_}\n}", "github": "", "project": "", "reviewers": "iRpp;wtTa;fSF3;UErH", "site": "https://openreview.net/forum?id=lVRfcp9ZEB_", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "65;68;98;56", "wc_summary_review": "52;47;29;31", "wc_main_review": "469;320;75;466", "wc_review": "586;435;202;553", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 71.75, 15.785673884886892 ], "wc_summary_review_avg": [ 39.75, 9.934158243152764 ], "wc_main_review_avg": [ 332.5, 160.4033977196244 ], "wc_review_avg": [ 444.0, 150.57390212118435 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4295495213081445418&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Brown University;Quest University Canada", "aff_unique_dep": ";", "aff_unique_url": "https://www.brown.edu;https://www.questu.ca", "aff_unique_abbr": "Brown;Quest U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Canada" }, { "id": "lVtq6C5_3QL", "title": "Generating Transferable Adversarial Patch by Simultaneously Optimizing its Position and Perturbations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Adversarial patches are an important form of adversarial attack in the real world and bring serious risks to the robustness of deep neural
networks. Previous methods generate adversarial patches by either optimizing their perturbation values while fixing the position on the image or manipulating the position while fixing the content of the patch.\nIn this paper, we propose a method to simultaneously optimize the position and perturbation to generate transferable adversarial patches, and thus obtain high attack success rates in the black-box setting. We adjust the transferability by taking the position, weights of surrogate models in the ensemble attack and the attack step size as parameters, and utilize the reinforcement learning framework to simultaneously solve these parameters based on the reward information obtained from the target model with a small number of queries.\nExtensive experiments are conducted on the Face Recognition (FR) task, and the results on four representative FR models demonstrate that our method can significantly improve the attack success rate and the query efficiency. Besides, experiments on the commercial FR service and physical environments confirm the practical application value of our method.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/3b529d5359634aeb071633baaa3b3289db75bc21.zip", "author": "Xingxing Wei;Ying Guo;Jie Yu;Huanqian Yan;Bo Zhang", "authorids": "~Xingxing_Wei1;~Ying_Guo1;~Jie_Yu4;~Huanqian_Yan1;~Bo_Zhang12", "gender": "M;;;M;M", "homepage": "https://sites.google.com/site/xingxingwei1988/;https://jinyugy21.github.io/;https://github.com/shighghyujie;;https://blade.tencent.com", "dblp": "57/4066;12/1208-8;;207/4666.html;", "google_scholar": "ak8D_cQAAAAJ;xrZHHi4AAAAJ;;;", "orcid": ";0000-0002-6429-9297;;0000-0002-9444-3165;", "linkedin": ";;;;", "or_profile": "~Xingxing_Wei1;~Ying_Guo1;~Jie_Yu4;~Huanqian_Yan1;~Bo_Zhang12", "aff": "Beihang University;Beihang University;Beihang University;Beihang University;Tencent Blade Team", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;scse.buaa.edu;tencent.com", "position": "Associate Professor;MS student;MS student;PhD student;Researcher", "bibtex": "@misc{\nwei2022generating,\ntitle={Generating Transferable Adversarial Patch by Simultaneously Optimizing its Position and Perturbations},\nauthor={Xingxing Wei and Ying Guo and Jie Yu and Huanqian Yan and Bo Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=lVtq6C5_3QL}\n}", "github": "", "project": "", "reviewers": "i3m8;a8Xn;oHKt;9fZG", "site": "https://openreview.net/forum?id=lVtq6C5_3QL", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "5;4;3;4", "correctness": "2;2;4;3", "technical_novelty": "1;2;2;4", "empirical_novelty": "2;0;2;4", "wc_summary_paper": "35;40;89;60", "wc_summary_review": "41;18;45;55", "wc_main_review": "278;371;140;351", "wc_review": "354;429;274;466", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "722;778;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;0;0", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 56.0, 21.224985276791124 ], "wc_summary_review_avg": [ 39.75, 13.5531361684298 ], "wc_main_review_avg": [ 285.0, 90.58973451776973 ], "wc_review_avg": [ 380.75, 73.66605391902026 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 375.0, 375.52230293286175 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 7, 0 ], 
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.7106690545187014, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9898622065500159221&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Beihang University;Tencent", "aff_unique_dep": ";Blade Team", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.tencent.com", "aff_unique_abbr": "BUAA;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Prototype memory and attention mechanisms for few shot image generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6150", "id": "lY0-7bj0Vfz", "poster": "", "openreview": "https://openreview.net/forum?id=lY0-7bj0Vfz", "slides": "https://iclr.cc/virtual/2022/poster/6150", "video": "https://iclr.cc/virtual/2022/poster/6150", "author_site": "Tianqin Li, Zijie Li, Andrew Luo, Harold Rockwell, Amir Barati Farimani, Tai Lee", "tldr": "", "abstract": "Recent discoveries indicate that the neural codes in the primary visual cortex (V1) of macaque monkeys are complex, diverse and sparse. This leads us to ponder the computational advantages and functional role of these \u201cgrandmother cells.\" Here, we propose that such cells can serve as prototype memory priors that bias and shape the distributed feature processing within the image generation process in the brain. These memory prototypes are learned by momentum online clustering and are utilized via a memory-based attention operation, which we define as Memory Concept Attention (MoCA). To test our proposal, we show in a few-shot image generation task, that having a prototype memory during attention can improve image synthesis quality, learn interpretable visual concept clusters, as well as improve the robustness of the model. Interestingly, we also find that our attentional memory mechanism can implicitly modify the horizontal connections by updating the transformation into the prototype embedding space for self-attention. 
Insofar as GANs can be seen as plausible models for reasoning about the top-down synthesis in the analysis-by-synthesis loop of the hierarchical visual cortex, our findings demonstrate a plausible computational role for these \u201cprototype concept\" neurons in visual processing in the brain.", "keywords": "neuroscience;deep learning", "primary_area": "", "supplementary_material": "", "author": "Tianqin Li;Zijie Li;Andrew Luo;Harold Rockwell;Amir Barati Farimani;Tai Sing Lee", "authorids": "~Tianqin_Li2;~Zijie_Li2;~Andrew_Luo2;~Harold_Rockwell1;~Amir_Barati_Farimani2;~Tai_Sing_Lee1", "gender": "M;M;M;M;M;M", "homepage": "https://github.com/Crazy-Jack;https://andrewluo.net/;;https://sites.google.com/view/barati;http://www.cnbc.cmu.edu/~tai/;https://zijieli-jlee.github.io/", "dblp": "294/5434;234/8054;;;21/4105;", "google_scholar": "sQjEQEUAAAAJ;bWYvvkUAAAAJ;;aH52nxkAAAAJ;9TAiIIMAAAAJ;ji7TXTMAAAAJ", "orcid": "0000-0003-2567-8283;;;0000-0002-2952-8576;;0000-0002-8566-7538", "linkedin": "tianqin-li-b16299170/;;;amir-barati-farimani-a0b74169/;;", "or_profile": "~Tianqin_Li2;~Andrew_Luo2;~Harold_Rockwell1;~Amir_Barati_Farimani2;~Tai_Sing_Lee1;~zijie_li1", "aff": "Carnegie Mellon University;Carnegie Mellon University;University of Chicago;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu;uchicago.edu;andrew.cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nli2022prototype,\ntitle={Prototype memory and attention mechanisms for few shot image generation},\nauthor={Tianqin Li and Zijie Li and Andrew Luo and Harold Rockwell and Amir Barati Farimani and Tai Sing Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lY0-7bj0Vfz}\n}", "github": "", "project": "", "reviewers": "4n7p;JpF6;9zZN", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;3;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "34;77;123", "wc_summary_review": "47;39;43", "wc_main_review": "259;479;184", "wc_review": "340;595;350", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1127;1332;235", "reply_reviewers": "0;0;0", "reply_authors": "3;3;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.0, 36.34097778908359 ], "wc_summary_review_avg": [ 43.0, 3.265986323710904 ], "wc_main_review_avg": [ 307.3333333333333, 125.18874638809282 ], "wc_review_avg": [ 428.3333333333333, 117.92181967539153 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 898.0, 476.2233369614163 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8278357415872168136&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=lY0-7bj0Vfz", "email": "andrew.cmu.edu;cmu.edu;uchicago.edu;andrew.cmu.edu;cmu.edu;cmu.edu", "author_num": 6, "aff_unique_index": 
"0;0;1;0;0;0", "aff_unique_norm": "Carnegie Mellon University;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.uchicago.edu", "aff_unique_abbr": "CMU;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Complete Verification via Multi-Neuron Relaxation Guided Branch-and-Bound", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6097", "id": "l_amHf1oaK", "poster": "", "openreview": "https://openreview.net/forum?id=l_amHf1oaK", "slides": "https://iclr.cc/virtual/2022/poster/6097", "video": "https://iclr.cc/virtual/2022/poster/6097", "author_site": "Claudio Ferrari, Mark N M\u00fcller, Nikola Jovanovi\u0107, Martin Vechev", "tldr": "", "abstract": "State-of-the-art neural network verifiers are fundamentally based on one of two paradigms: either encoding the whole verification problem via tight multi-neuron convex relaxations or applying a Branch-and-Bound (BaB) procedure leveraging imprecise but fast bounding methods on a large number of easier subproblems. The former can capture complex multi-neuron dependencies but sacrifices completeness due to the inherent limitations of convex relaxations. The latter enables complete verification but becomes increasingly ineffective on larger and more challenging networks. In this work, we present a novel complete verifier which combines the strengths of both paradigms: it leverages multi-neuron relaxations to drastically reduce the number of subproblems generated during the BaB process and an efficient GPU-based dual optimizer to solve the remaining ones. An extensive evaluation demonstrates that our verifier achieves a new state-of-the-art on both established benchmarks as well as networks with significantly higher accuracy than previously considered. 
The latter result (up to 28% certification gains) indicates meaningful progress towards creating verifiers that can handle practically relevant networks.", "keywords": "Certified Robustness;Branch-and-Bound;Convex Relaxation", "primary_area": "", "supplementary_material": "", "author": "Claudio Ferrari;Mark Niklas Mueller;Nikola Jovanovi\u0107;Martin Vechev", "authorids": "~Claudio_Ferrari2;~Mark_Niklas_Mueller2;~Nikola_Jovanovi\u01071;~Martin_Vechev1", "gender": "M;M;M;M", "homepage": "https://scholar.google.com/citations?user=HG9-UScAAAAJ&hl=en&oi=ao;https://www.sri.inf.ethz.ch/people/mark;https://www.sri.inf.ethz.ch/people/nikola;https://www.sri.inf.ethz.ch/people/martin", "dblp": ";287/4254;230/4424-1;93/2189.html", "google_scholar": "HG9-UScAAAAJ;RBpmcCAAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";0000-0002-2496-6542;;", "linkedin": ";mark-m%C3%BCller-8bb4b1140/;nikola-jovanovi%C4%87-9b599b105/;", "or_profile": "~Claudio_Ferrari2;~Mark_Niklas_Mueller2;~Nikola_Jovanovi\u01071;~Martin_Vechev1", "aff": ";Swiss Federal Institute of Technology;ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": ";ethz.ch;ethz.ch;ethz.ch", "position": ";PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nferrari2022complete,\ntitle={Complete Verification via Multi-Neuron Relaxation Guided Branch-and-Bound},\nauthor={Claudio Ferrari and Mark Niklas Mueller and Nikola Jovanovi{\\'c} and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=l_amHf1oaK}\n}", "github": "", "project": "", "reviewers": "3m7j;gB2C;Ns4C;qGFq;WZm2", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "5;4;4;3;4", "correctness": "4;2;4;4;2", "technical_novelty": "2;3;2;3;3", "empirical_novelty": "0;3;2;3;3", "wc_summary_paper": "240;120;65;40;46", "wc_summary_review": "62;38;59;15;10", "wc_main_review": "537;998;287;212;388", "wc_review": "839;1156;411;267;444", "wc_reply_reviewers": "269;174;90;0;201", "wc_reply_authors": "703;538;336;754;773", "reply_reviewers": "1;2;1;0;2", "reply_authors": "2;2;1;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.2, 0.9797958971132712 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 1.16619037896906 ], "wc_summary_paper_avg": [ 102.2, 74.4537440294308 ], "wc_summary_review_avg": [ 36.8, 21.55365398256175 ], "wc_main_review_avg": [ 484.4, 278.9312460087611 ], "wc_review_avg": [ 623.4, 326.9450106669316 ], "wc_reply_reviewers_avg": [ 146.8, 93.1523483332546 ], "wc_reply_authors_avg": [ 620.8, 164.70021250745248 ], "reply_reviewers_avg": [ 1.2, 0.7483314773547883 ], "reply_authors_avg": [ 1.6, 0.4898979485566356 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14769723255634252083&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=l_amHf1oaK", "email": ";ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Object Pursuit: Building a Space of Objects via Discriminative Weight Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6713", "id": "lbauk6wK2-y", "poster": "", "openreview": "https://openreview.net/forum?id=lbauk6wK2-y", "slides": "https://iclr.cc/virtual/2022/poster/6713", "video": "https://iclr.cc/virtual/2022/poster/6713", "author_site": "Chuanyu Pan, Yanchao Yang, Kaichun Mo, Yueqi Duan, Leonidas Guibas", "tldr": "", "abstract": "We propose a framework to continuously learn object-centric representations for visual learning and understanding. Existing object-centric representations either rely on supervisions that individualize objects in the scene, or perform unsupervised disentanglement that can hardly deal with complex scenes in the real world. To mitigate the annotation burden and relax the constraints on the statistical complexity of the data, our method leverages interactions to effectively sample diverse variations of an object and the corresponding training signals while learning the object-centric representations. Throughout learning, objects are streamed one by one in random order with unknown identities, and are associated with latent codes that can synthesize discriminative weights for each object through a convolutional hypernetwork. Moreover, re-identification of learned objects and forgetting prevention are employed to make the learning process efficient and robust. We perform an extensive study of the key features of the proposed framework and analyze the characteristics of the learned representations. Furthermore, we demonstrate the capability of the proposed framework in learning representations that can improve label efficiency in downstream tasks. 
Our code and trained models are made publicly available at: https://github.com/pptrick/Object-Pursuit.", "keywords": "object-centric;continual learning;representation learning;hypernetwork", "primary_area": "", "supplementary_material": "/attachment/312cd290b1e47b142e98a5764000271870cb606a.zip", "author": "Chuanyu Pan;Yanchao Yang;Kaichun Mo;Yueqi Duan;Leonidas Guibas", "authorids": "pancy17@mails.tsinghua.edu.cn;~Yanchao_Yang1;~Kaichun_Mo1;~Yueqi_Duan1;~Leonidas_Guibas1", "gender": ";M;M;M;M", "homepage": ";https://yanchaoyang.github.io/;https://cs.stanford.edu/~kaichun/;https://duanyueqi.github.io/;http://geometry.stanford.edu/", "dblp": ";84/8637-1;172/1283;168/8373;g/LeonidasJGuibas", "google_scholar": ";r2tKnV4AAAAJ;pL7JsOsAAAAJ;qDseo3cAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "pancy17@mails.tsinghua.edu.cn;~Yanchao_Yang1;~Kaichun_Mo1;~Yueqi_Duan1;~Leonidas_Guibas1", "aff": ";Stanford University;Stanford University;Tsinghua University;Stanford University", "aff_domain": ";stanford.edu;stanford.edu;tsinghua.edu.cn;stanford.edu", "position": ";Postdoc;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\npan2022object,\ntitle={Object Pursuit: Building a Space of Objects via Discriminative Weight Generation},\nauthor={Chuanyu Pan and Yanchao Yang and Kaichun Mo and Yueqi Duan and Leonidas Guibas},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lbauk6wK2-y}\n}", "github": "", "project": "", "reviewers": "M8h3;SKNa;jB2Q;LiL2", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;2;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "49;67;79;40", "wc_summary_review": "33;66;39;42", "wc_main_review": "440;252;96;120", "wc_review": "522;385;214;202", "wc_reply_reviewers": "37;0;18;29", "wc_reply_authors": "810;846;445;412", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 58.75, 15.20485119953497 ], "wc_summary_review_avg": [ 45.0, 12.549900398011133 ], "wc_main_review_avg": [ 227.0, 136.5686640485291 ], "wc_review_avg": [ 330.75, 132.02911610701634 ], "wc_reply_reviewers_avg": [ 21.0, 13.874436925511608 ], "wc_reply_authors_avg": [ 628.25, 200.49485654250586 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5693350228045051573&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=lbauk6wK2-y", "email": ";stanford.edu;stanford.edu;tsinghua.edu.cn;stanford.edu", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Stanford University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Stanford;THU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "ldkunzUzRWj", "title": "A Simple and 
Debiased Sampling Method for Personalized Ranking", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pairwise ranking models have been widely used to address various problems, such as recommendation. The basic idea is to learn the rank of users' preferred items through separating items into positive samples if user-item interactions exist, and negative samples otherwise. Due to the limited number of observed interactions, pairwise ranking models face a serious class-imbalance issue. Our theoretical analysis shows that current sampling-based methods cause the vertex-level imbalance problem, which drives the norm of the learned item embeddings towards infinity after a certain number of training iterations, and consequently results in vanishing gradients and degrades model performance. To this end, we propose VINS, an efficient \\emph{\\underline{Vi}tal \\underline{N}egative \\underline{S}ampler}, to alleviate the class-imbalance issue for pairwise ranking models optimized by gradient methods. The core of VINS is a bias sampler with a rejection probability that tends to accept a negative candidate with larger popularity than the given positive item. Evaluation results on several real datasets demonstrate that the proposed sampling method speeds up the training procedure by 30\\% to 50\\% for ranking models ranging from shallow to deep, while maintaining and even improving the quality of ranking results in top-N item recommendation. ", "keywords": "personalized ranking;class-imbalance;negative sampling;deep learning", "primary_area": "", "supplementary_material": "", "author": "Lu Yu;Shichao Pei;Chuxu Zhang;Xiangliang Zhang", "authorids": "~Lu_Yu1;~Shichao_Pei1;~Chuxu_Zhang2;~Xiangliang_Zhang1", "gender": "M;;;F", "homepage": ";https://scpei.github.io/;;https://sites.nd.edu/xiangliang-zhang/", "dblp": "04/1781-6.html;168/9433;;74/1890-1", "google_scholar": "ODK41KwAAAAJ;https://scholar.google.ca/citations?user=IDaNWgIAAAAJ;;BhRJe4wAAAAJ", "orcid": ";0000-0002-0802-1506;;0000-0002-3574-5665", "linkedin": ";;;", "or_profile": "~Lu_Yu1;~Shichao_Pei1;~Chuxu_Zhang2;~Xiangliang_Zhang1", "aff": "Ant Group;University of Notre Dame;;University of Notre Dame", "aff_domain": "antgroup.com;nd.edu;;nd.edu", "position": "Researcher;Postdoc;;Associate Professor", "bibtex": "@misc{\nyu2022a,\ntitle={A Simple and Debiased Sampling Method for Personalized Ranking},\nauthor={Lu Yu and Shichao Pei and Chuxu Zhang and Xiangliang Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=ldkunzUzRWj}\n}", "github": "", "project": "", "reviewers": "QcpY;A9Ak;1zJH;zgRr", "site": "https://openreview.net/forum?id=ldkunzUzRWj", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "5;4;3;4", "correctness": "2;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;3;3", "wc_summary_paper": "197;203;97;35", "wc_summary_review": "17;26;94;43", "wc_main_review": "243;376;58;449", "wc_review": "457;605;249;527", "wc_reply_reviewers": "247;0;0;0", "wc_reply_authors": "1088;773;120;277", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 133.0, 70.52659073002182 ], "wc_summary_review_avg": [ 45.0, 29.790938219532464 ], "wc_main_review_avg": [ 281.5, 148.6783440854787 ], "wc_review_avg": [ 459.5, 132.3281904962053 ], 
"wc_reply_reviewers_avg": [ 61.75, 106.95413736737817 ], "wc_reply_authors_avg": [ 564.5, 386.5750250598194 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.42640143271122094, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WJc320vmdFYJ:scholar.google.com/&scioq=A+Simple+and+Debiased+Sampling+Method+for+Personalized+Ranking&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Ant Group;University of Notre Dame", "aff_unique_dep": ";", "aff_unique_url": "https://www.antgroup.com;https://www.nd.edu", "aff_unique_abbr": "Ant Group;Notre Dame", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "id": "lf0W6tcWmh-", "title": "Towards understanding how momentum improves generalization in deep learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stochastic gradient descent (SGD) with momentum is widely used for training modern deep learning architectures. While it is well understood that using momentum can lead to faster convergence rate in various settings, it has also been observed that momentum yields higher generalization. Prior work argue that momentum stabilizes the SGD noise during training and this leads to higher generalization. In this paper, we take the opposite view to this result and first empirically show that gradient descent with momentum (GD+M) significantly improves generalization comparing to gradient descent (GD) in many deep learning tasks. From this observation, we formally study how momentum improves generalization in deep learning. We devise a binary classification setting where a two-layer (over-parameterized) convolutional neural network trained with GD+M provably generalizes better than the same network trained with vanilla GD, when both algorithms start from the same random initialization. The key insight in our analysis is that momentum is beneficial in datasets where the examples share some features but differ in their margin. Contrary to the GD model that memorizes the small margin data, GD+M can still learn the features in these data thanks to its historical gradients. 
We also empirically verify this learning process of momentum in real-world settings.", "keywords": "Deep learning theory;non-convex optimization", "primary_area": "", "supplementary_material": "/attachment/c8eb329980282a3a53a4a099b8453d339713aabf.zip", "author": "Samy Jelassi;Yuanzhi Li", "authorids": "~Samy_Jelassi1;~Yuanzhi_Li1", "gender": "M;M", "homepage": "https://sjelassi.github.io/;", "dblp": "222/3149;73/3628", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Samy_Jelassi1;~Yuanzhi_Li1", "aff": "Princeton University;Carnegie Mellon University", "aff_domain": "princeton.edu;andrew.cmu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\njelassi2022towards,\ntitle={Towards understanding how momentum improves generalization in deep learning},\nauthor={Samy Jelassi and Yuanzhi Li},\nyear={2022},\nurl={https://openreview.net/forum?id=lf0W6tcWmh-}\n}", "github": "", "project": "", "reviewers": "osph;6raf;Jrkp;Vuuz;PBhC", "site": "https://openreview.net/forum?id=lf0W6tcWmh-", "pdf_size": 0, "recommendation": "3;5;5;5;6", "confidence": "4;4;4;4;2", "correctness": "3;2;2;4;4", "technical_novelty": "2;3;3;2;4", "empirical_novelty": "2;3;2;2;4", "wc_summary_paper": "138;32;112;59;15", "wc_summary_review": "77;32;29;64;25", "wc_main_review": "647;781;229;268;118", "wc_review": "862;845;370;391;158", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "804;562;550;452;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;0", "recommendation_avg": [ 4.8, 0.9797958971132712 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 71.2, 46.841861619709356 ], "wc_summary_review_avg": [ 45.4, 21.019990485249988 ], "wc_main_review_avg": [ 408.6, 257.677783287578 ], "wc_review_avg": [ 525.2, 280.22804998786256 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 473.6, 263.65856709009097 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.8, 0.4 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6123724356957946, "corr_recommendation_correctness": 0.22821773229381923, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3413448426995846490&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Princeton University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.cmu.edu", "aff_unique_abbr": "Princeton;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "lgGKToqwtwG", "title": "Infusing Future Information into Monotonic Attention Through Language Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Simultaneous neural machine translation (SNMT) models start emitting the target sequence before they have processed the source sequence. The recent adaptive policies for SNMT use monotonic attention to perform read/write decisions based on the partial source and target sequences. The lack of sufficient information might cause the monotonic attention to take poor read/write decisions, which in turn negatively affects the performance of the SNMT model. 
On the other hand, human translators make better read/write decisions since they can anticipate the immediate future words using linguistic information and domain knowledge. In this work, we propose a framework to aid monotonic attention with an external language model to improve its decisions. We conduct experiments on the MuST-C English-German and English-French speech-to-text translation tasks to show the effectiveness of the proposed framework. It improves the quality-latency trade-off over the state-of-the-art monotonic multihead attention.", "keywords": "Simultaneous Translation;Monotonic Attention;Speech Translation", "primary_area": "", "supplementary_material": "", "author": "Sathish Reddy Indurthi;Mohd Abbas Zaidi;Beomseok Lee;Nikhil Kumar Lakumarapu;Sangha Kim", "authorids": "~Sathish_Reddy_Indurthi2;~Mohd_Abbas_Zaidi1;scarletgunn37@gmail.com;~Nikhil_Kumar_Lakumarapu1;sangha01.kim@samsung.com", "gender": "M;M;;;", "homepage": ";https://mzaidi59.github.io/;;;", "dblp": "223/2379;266/4018.html;;;", "google_scholar": "xZrGdhgAAAAJ;i_cSaKgAAAAJ;;https://scholar.google.com/citations?hl=en;", "orcid": ";0000-0003-2961-4404;;;", "linkedin": "sathishindurthi/;mazaidi/;;;", "or_profile": "~Sathish_Reddy_Indurthi2;~Mohd_Abbas_Zaidi1;scarletgunn37@gmail.com;~Nikhil_Kumar_Lakumarapu1;sangha01.kim@samsung.com", "aff": "Zoom Video Communications;Samsung Research;;Samsung;", "aff_domain": "zoom.us;samsung.com;;samsung.com;", "position": "Senior Research Scientist;Research Engineer;;Research Engineer;", "bibtex": "@misc{\nindurthi2022infusing,\ntitle={Infusing Future Information into Monotonic Attention Through Language Models},\nauthor={Sathish Reddy Indurthi and Mohd Abbas Zaidi and Beomseok Lee and Nikhil Kumar Lakumarapu and Sangha Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=lgGKToqwtwG}\n}", "github": "", "project": "", "reviewers": "B43A;3awi;WtnC", "site": "https://openreview.net/forum?id=lgGKToqwtwG", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "3;5;4", "correctness": "2;2;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "107;103;168", "wc_summary_review": "30;27;37", "wc_main_review": "531;189;103", "wc_review": "668;319;308", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 126.0, 29.743346594938952 ], "wc_summary_review_avg": [ 31.333333333333332, 4.189935029992178 ], "wc_main_review_avg": [ 274.3333333333333, 184.85549912176145 ], "wc_review_avg": [ 431.6666666666667, 167.1732301802202 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5877932945226632290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Zoom Video Communications;Samsung", "aff_unique_dep": ";Samsung Research", "aff_unique_url": "https://zoom.us;https://research.samsung.com", "aff_unique_abbr": "Zoom;Samsung", "aff_campus_unique_index": "", "aff_campus_unique":
"", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;South Korea" }, { "id": "lgOylcEZQgr", "title": "Online Unsupervised Learning of Visual Representations and Categories", "track": "main", "status": "Reject", "tldr": "", "abstract": "Real world learning scenarios involve a nonstationary distribution of classes with sequential dependencies among the samples, in contrast to the standard machine learning formulation of drawing samples independently from a fixed, typically uniform distribution. Furthermore, real world interactions demand learning on-the-fly from few or no class labels. In this work, we propose an unsupervised model that simultaneously performs online visual representation learning and few-shot learning of new categories without relying on any class labels. Our model is a prototype-based memory network with a control component that determines when to form a new class prototype. We formulate it as an online Gaussian mixture model, where components are created online with only a single new example, and assignments do not have to be balanced, which permits an approximation to natural imbalanced distributions from uncurated raw data. Learning includes a contrastive loss that encourages different views of the same image to be assigned to the same prototype. The result is a mechanism that forms categorical representations of objects in nonstationary environments. Experiments show that our method can learn from an online stream of visual input data and is significantly better at category recognition compared to state-of-the-art self-supervised learning methods.", "keywords": "Unsupervised learning;self-supervised learning;few-shot learning;visual representation learning;visual category learning", "primary_area": "", "supplementary_material": "", "author": "Mengye Ren;Tyler R. Scott;Michael Louis Iuzzolino;Michael Curtis Mozer;Richard Zemel", "authorids": "~Mengye_Ren1;~Tyler_R._Scott1;~Michael_Louis_Iuzzolino1;~Michael_Curtis_Mozer1;~Richard_Zemel1", "gender": ";M;M;M;M", "homepage": "http://www.cs.toronto.edu/~mren;https://tylersco.github.io/;http://michael-iuzzolino.github.io;https://www.cs.colorado.edu/~mozer;http://www.cs.columbia.edu/~zemel", "dblp": "163/1952;220/4343;233/0309;m/MichaelCMozer;16/6366", "google_scholar": "XcQ9WqMAAAAJ;kwjrViYAAAAJ;W7WpP24AAAAJ;lmjR_qMAAAAJ;https://scholar.google.ca/citations?user=iBeDoRAAAAAJ", "orcid": ";;;;", "linkedin": ";tyler-scott-8baa0abb/;michael-iuzzolino-3a060696/;;", "or_profile": "~Mengye_Ren1;~Tyler_R._Scott1;~Michael_Louis_Iuzzolino1;~Michael_Curtis_Mozer1;~Richard_Zemel1", "aff": "Google;University of Colorado, Boulder;University of Colorado, Boulder;Google DeepMind;Department of Computer Science, University of Toronto", "aff_domain": "google.com;colorado.edu;colorado.edu;google.com;cs.toronto.edu", "position": "Visiting Researcher;PhD student;PhD student;Research Scientist;Full Professor", "bibtex": "@misc{\nren2022online,\ntitle={Online Unsupervised Learning of Visual Representations and Categories},\nauthor={Mengye Ren and Tyler R. 
Scott and Michael Louis Iuzzolino and Michael Curtis Mozer and Richard Zemel},\nyear={2022},\nurl={https://openreview.net/forum?id=lgOylcEZQgr}\n}", "github": "", "project": "", "reviewers": "bgm4;cvrN;hYzM;LjvY", "site": "https://openreview.net/forum?id=lgOylcEZQgr", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;3;3;4", "correctness": "4;2;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "77;242;56;66", "wc_summary_review": "208;60;88;184", "wc_main_review": "843;534;163;936", "wc_review": "1128;836;307;1186", "wc_reply_reviewers": "0;273;0;304", "wc_reply_authors": "2418;1269;355;2175", "reply_reviewers": "0;1;0;1", "reply_authors": "4;2;1;4", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 110.25, 76.42766187709788 ], "wc_summary_review_avg": [ 135.0, 62.37788069500277 ], "wc_main_review_avg": [ 619.0, 302.41775741513595 ], "wc_review_avg": [ 864.25, 347.9988326129845 ], "wc_reply_reviewers_avg": [ 144.25, 144.66577860710527 ], "wc_reply_authors_avg": [ 1554.25, 814.0888695345245 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.816496580927726, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2421863458281804401&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;0;2", "aff_unique_norm": "Google;University of Colorado;University of Toronto", "aff_unique_dep": "Google;;Department of Computer Science", "aff_unique_url": "https://www.google.com;https://www.colorado.edu;https://www.utoronto.ca", "aff_unique_abbr": "Google;CU;U of T", "aff_campus_unique_index": "0;1;1;3", "aff_campus_unique": "Mountain View;Boulder;;Toronto", "aff_country_unique_index": "0;0;0;1;2", "aff_country_unique": "United States;United Kingdom;Canada" }, { "id": "liIJKb1gudP", "title": "Center Loss Regularization for Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The ability to learn different tasks sequentially is essential to the development of artificial intelligence. In general, neural networks lack this capability, the major obstacle being catastrophic forgetting. It occurs when the incrementally available information from non-stationary data distributions is continually acquired, disrupting what the model has already learned. Our approach remembers old tasks by projecting the representations of new tasks close to those of old tasks while keeping the decision boundaries unchanged. We employ the center loss as a regularization penalty that forces new tasks' features to have the same class centers as old tasks and makes the features highly discriminative. This, in turn, leads to minimal forgetting of already learned information. This method is easy to implement, requires minimal computational and memory overhead, and allows the neural network to maintain high performance across many sequentially encountered tasks. We also demonstrate that using the center loss in conjunction with memory replay outperforms other replay-based strategies.
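To make the center-loss penalty described above concrete, here is a minimal hedged sketch; the shapes, the frozen centers, and the weight lam are illustrative stand-ins rather than the authors' implementation.

import torch

def center_loss(features, labels, centers):
    # features: (B, D); labels: (B,); centers: (C, D) per-class centers
    return ((features - centers[labels]) ** 2).sum(dim=1).mean()

B, D, C = 32, 64, 10
feats = torch.randn(B, D, requires_grad=True)   # features of the new task
labels = torch.randint(0, C, (B,))
centers = torch.randn(C, D)                     # e.g. class centers frozen from the old task

task_loss = torch.zeros(())                     # stand-in for the classification loss
lam = 0.01                                      # regularization strength (illustrative)
total = task_loss + lam * center_loss(feats, labels, centers)
total.backward()                                # pulls new-task features toward old centers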
Along with standard MNIST variants for continual learning, we apply our method to continual domain adaptation scenarios with the Digits and PACS datasets. We demonstrate that our approach is scalable, effective, and gives competitive performance compared to state-of-the-art continual learning methods.", "keywords": "Continual Learning;Supervised Learning;Classification;Lifelong Learning;Catastrophic Forgetting;Domain Adaptation;Continual Domain Adaptation", "primary_area": "", "supplementary_material": "", "author": "Kaustubh Olpadkar;Ekta Gavas", "authorids": "~Kaustubh_Olpadkar1;~Ekta_Gavas1", "gender": ";F", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";ektagavas/", "or_profile": "~Kaustubh_Olpadkar1;~Ekta_Gavas1", "aff": ";International Institute of Information Technology, Hyderabad, International Institute of Information Technology Hyderabad", "aff_domain": ";research.iiit.ac.in", "position": ";MS student", "bibtex": "@misc{\nolpadkar2022center,\ntitle={Center Loss Regularization for Continual Learning},\nauthor={Kaustubh Olpadkar and Ekta Gavas},\nyear={2022},\nurl={https://openreview.net/forum?id=liIJKb1gudP}\n}", "github": "", "project": "", "reviewers": "3UPA;Xeuc;9bV2;YSGn;ekiz", "site": "https://openreview.net/forum?id=liIJKb1gudP", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "5;5;4;4;4", "correctness": "2;3;3;3;3", "technical_novelty": "4;2;2;2;3", "empirical_novelty": "3;2;2;2;3", "wc_summary_paper": "57;116;88;63;74", "wc_summary_review": "77;21;50;106;44", "wc_main_review": "539;132;295;752;257", "wc_review": "673;269;433;921;375", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.8 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 79.6, 21.039011383617815 ], "wc_summary_review_avg": [ 59.6, 29.261578904768623 ], "wc_main_review_avg": [ 395.0, 221.9900898688948 ], "wc_review_avg": [ 534.2, 234.4341272084762 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6454972243679027, "corr_recommendation_correctness": 0.39528470752104744, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2384266504765170245&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "International Institute of Information Technology, Hyderabad", "aff_unique_dep": "", "aff_unique_url": "https://www.iiit.ac.in", "aff_unique_abbr": "IIIT Hyderabad", "aff_campus_unique_index": "0", "aff_campus_unique": "Hyderabad", "aff_country_unique_index": "0", "aff_country_unique": "India" }, { "id": "liV-Re74fK", "title": "Density Estimation for Conservative Q-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Batch Reinforcement Learning algorithms aim at learning the best policy from a batch of data without interacting with the environment. Within this setting, one difficulty is to correctly assess the value of state-action pairs that are far from the dataset. Indeed, the lack of information may provoke an overestimation of the value function, leading to undesirable behaviors.
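A rough sketch of the density-modulated conservative penalty that the following sentences refine: Q-values on out-of-dataset actions are pushed down more strongly where an estimated density is low. The toy density model, weighting, and network below are illustrative stand-ins, not the D-CQL algorithm itself.

import torch

q_net = torch.nn.Linear(8 + 2, 1)               # toy Q(s, a) on concatenated inputs
states = torch.randn(64, 8)
dataset_actions = torch.randn(64, 2)
random_actions = torch.rand(64, 2) * 2 - 1      # out-of-dataset candidate actions

def density(sa):
    # stand-in for a fitted density estimator (KDE, flow, ...) over dataset pairs
    return torch.exp(-0.5 * sa.pow(2).sum(-1))

sa_data = torch.cat([states, dataset_actions], -1)
sa_ood = torch.cat([states, random_actions], -1)

w = 1.0 - density(sa_ood)                       # penalize more where density is low
penalty = (w * q_net(sa_ood).squeeze(-1)).mean() - q_net(sa_data).mean()
penalty.backward()                              # added to the usual TD loss in practice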
A compromise between enhancing the behaviour policy's performance and staying close to it must be found. To alleviate this issue, most existing approaches introduce a regularization term to favor state-action pairs from the dataset. In this paper, we refine this idea by estimating the density of these state-action pairs to distinguish neighbourhoods. The resulting regularization guides the policy toward meaningful unseen regions, improving the learning process. We hence introduce Density Conservative Q-Learning (D-CQL), a batch-RL algorithm with strong theoretical guarantees that carefully penalizes the value function based on the amount of information collected in the state-action space. The performance of our approach is illustrated on many classical benchmarks in batch-RL.", "keywords": "Offline Reinforcement Learning;Batch Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Paul Daoudi;Merwan Barlier;Ludovic Dos Santos;Aladin Virmaux", "authorids": "~Paul_Daoudi2;~Merwan_Barlier1;~Ludovic_Dos_Santos1;~Aladin_Virmaux1", "gender": "M;;;M", "homepage": ";;https://avirmaux.github.io;https://scholar.google.com/citations?user=TNPp0cwAAAAJ", "dblp": "347/7716;185/0155;192/8303;167/4759", "google_scholar": ";;5FxvLvwAAAAJ;TNPp0cwAAAAJ", "orcid": "0009-0004-2784-952X;;;", "linkedin": "paul-daoudi-83101a126/;;;", "or_profile": "~Paul_Daoudi2;~Merwan_Barlier1;~Aladin_Virmaux1;~Ludovic_DOS_SANTOS3", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.", "aff_domain": "huawei.com;huawei.com;huawei.com;huawei.com", "position": "PhD student;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\ndaoudi2022density,\ntitle={Density Estimation for Conservative Q-Learning},\nauthor={Paul Daoudi and Merwan Barlier and Ludovic Dos Santos and Aladin Virmaux},\nbooktitle={Submitted to The Tenth International Conference on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=liV-Re74fK},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "H5Uq;eRAf;Ynri;PGkj", "site": "https://openreview.net/forum?id=liV-Re74fK", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;2;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "80;46;85;137", "wc_summary_review": "97;21;34;5", "wc_main_review": "287;241;309;694", "wc_review": "464;308;428;836", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "168;243;310;662", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 87.0, 32.53459696999488 ], "wc_summary_review_avg": [ 39.25, 34.888214342382156 ], "wc_main_review_avg": [ 382.75, 181.367548089508 ], "wc_review_avg": [ 509.0, 197.43100060527476 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 345.75, 189.3705032469418 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8944271909999159, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15765619180220169319&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Huawei",
"aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "lifRwnIuAv0", "title": "PGD-2 can be better than FGSM + GradAlign", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "One major issue of adversarial training (AT) with the fast gradient sign method (FGSM AT) is the phenomenon of catastrophic overfitting, meaning that the trained model suddenly loses its robustness over a single epoch. In addition to FGSM AT, Andriushchenko & Flammarion (2020) observed that two-step projected gradient descent adversarial training (PGD-2 AT) also suffers from catastrophic overfitting for large $\\ell_\\infty$ perturbations. To prevent catastrophic overfitting, Andriushchenko & Flammarion (2020) proposed a gradient alignment regularization method (GradAlign) and claimed that GradAlign can prevent catastrophic overfitting in FGSM AT and PGD-2 AT. In this paper, we show that PGD-2 AT with random initialization (PGD-2-RS AT) and attack step size $\\alpha=1.25\\epsilon/2$ only needs approximately a half computational cost of FGSM + GradAlign AT and actually can avoid catastrophic overfitting for large $\\ell_\\infty$ perturbations. We hypothesize that, if FGSM-RS AT with $\\alpha=1.25\\epsilon/2$ can avoid catastrophic overfitting for $\\ell_\\infty$ perturbation size $\\epsilon/2$, then PGD-2-RS AT with $\\alpha=1.25\\epsilon/2$ may be able to avoid catastrophic overfitting for $\\ell_\\infty$ perturbation size $\\epsilon$. Our intuitions to justify this empirical hypothesis induce a more unexpected finding: If we apply random noise from the uniform distribution $\\mathcal{U}(-\\epsilon/2, \\epsilon/2)$ to the perturbations before each step of PGD-2 with $\\alpha=1.25\\epsilon/2$, instead of initializing the perturbations with random noise from $\\mathcal{U}(-\\epsilon, \\epsilon)$ at the beginning ({\\em i.e.,} the conventional random initialization scheme), the corresponding AT method can also avoid catastrophic overfitting and even achieve better robust accuracy in most cases. We refer to this AT method as Qusai-PGD-2-RS AT. Extensive evaluations demonstrate that PGD-2-RS AT and Qusai-PGD-2-RS AT with $\\alpha=1.25\\epsilon/2$ achieve better performance and efficiency than FGSM + GradAlign AT. 
Notably, Quasi-PGD-2-RS AT achieves robust accuracy against PGD-50-10 comparable to that of PGD-3-RS AT on CIFAR10 and SVHN, and it also achieves approximately $18\\%$ top-1 and $38\\%$ top-5 robust accuracy against PGD-50-10 at $\\epsilon=8/255$ on ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/db1ddad723d5c9bc9b0f3cdd8531dd6fa6b4e78f.zip", "author": "Tianhang Zheng;Baochun Li", "authorids": "~Tianhang_Zheng2;~Baochun_Li1", "gender": "M;M", "homepage": ";http://iqua.ece.toronto.edu/bli/", "dblp": "212/1269.html;l/BaochunLi", "google_scholar": ";https://scholar.google.com.tw/citations?user=rkb3_FgAAAAJ", "orcid": ";0000-0003-2404-0974", "linkedin": ";https://linkedin.com/in/baochun", "or_profile": "~Tianhang_Zheng2;~Baochun_Li1", "aff": "University of Toronto;University of Toronto", "aff_domain": "utoronto.ca;toronto.edu", "position": "Student;Full Professor", "bibtex": "@misc{\nzheng2022pgd,\ntitle={{PGD}-2 can be better than {FGSM} + GradAlign},\nauthor={Tianhang Zheng and Baochun Li},\nyear={2022},\nurl={https://openreview.net/forum?id=lifRwnIuAv0}\n}", "github": "", "project": "", "reviewers": "JXB1;LRRp;dGa1", "site": "https://openreview.net/forum?id=lifRwnIuAv0", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "72;97;180", "wc_summary_review": "28;82;48", "wc_main_review": "109;694;436", "wc_review": "209;873;664", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 116.33333333333333, 46.161552063258114 ], "wc_summary_review_avg": [ 52.666666666666664, 22.29100466306732 ], "wc_main_review_avg": [ 413.0, 239.37836159519514 ], "wc_review_avg": [ 582.0, 277.20870597199263 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PZr8Ks_pXe4J:scholar.google.com/&scioq=PGD-2+can+be+better+than+FGSM+%2B+GradAlign&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "ljCoTzUsdS", "title": "Distinguishing rule- and exemplar-based generalization in learning systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the increasing scale of datasets in machine learning, generalization to unseen regions of the data distribution remains crucial. Such extrapolation is by definition underdetermined and is dictated by a learner\u2019s inductive biases. Machine learning systems often do not share the same inductive biases as humans and, as a result, extrapolate in ways that are inconsistent with our expectations.
We investigate two distinct such inductive biases: feature-level bias (differences in which features are more readily learned) and exemplar-vs-rule bias (differences in how these learned features are used for generalization). Exemplar- vs. rule-based generalization has been studied extensively in cognitive psychology, and in this work we present a protocol inspired by these experimental approaches for directly probing this trade-off in learning systems. The measures we propose characterize changes in extrapolation behavior when feature coverage is manipulated in a combinatorial setting. We present empirical results across a range of models and across both expository and real-world image and language domains. We demonstrate that measuring the exemplar-rule trade-off while controlling for feature-level bias provides a more complete picture of extrapolation behavior than existing formalisms. We find that most standard neural network models have a propensity towards exemplar-based extrapolation and discuss the implications of these findings for research on data augmentation, fairness, and systematic generalization.", "keywords": "inductive bias;combinatorial generalization;cognitive psychology;robustness to spurious correlation", "primary_area": "", "supplementary_material": "/attachment/c3eb3fe0c37c0ac250e286e5f751ac9290386f4c.zip", "author": "Ishita Dasgupta;Erin Grant;Thomas L. Griffiths", "authorids": "~Ishita_Dasgupta1;~Erin_Grant1;~Thomas_L._Griffiths1", "gender": ";F;", "homepage": ";https://eringrant.github.io/;http://cocosci.princeton.edu/tom/", "dblp": "169/6218;169/3175;34/4472", "google_scholar": ";OSg3D9MAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0009-0002-8623-7254;", "linkedin": "idasgupta6/;eringrant914;", "or_profile": "~Ishita_Dasgupta1;~Erin_Grant1;~Thomas_L._Griffiths1", "aff": "Google DeepMind;University of California, Berkeley;Princeton University", "aff_domain": "deepmind.com;berkeley.edu;princeton.edu", "position": "Researcher;PhD student;Professor", "bibtex": "@misc{\ndasgupta2022distinguishing,\ntitle={Distinguishing rule- and exemplar-based generalization in learning systems},\nauthor={Ishita Dasgupta and Erin Grant and Thomas L. 
Griffiths},\nyear={2022},\nurl={https://openreview.net/forum?id=ljCoTzUsdS}\n}", "github": "", "project": "", "reviewers": "TPBn;RJtk;UbRf;yoH5", "site": "https://openreview.net/forum?id=ljCoTzUsdS", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;1;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "104;25;151;222", "wc_summary_review": "9;20;38;38", "wc_main_review": "221;235;108;355", "wc_review": "334;280;297;615", "wc_reply_reviewers": "291;0;0;98", "wc_reply_authors": "990;1330;135;2171", "reply_reviewers": "1;0;0;1", "reply_authors": "2;3;1;5", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.0, 1.224744871391589 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 125.5, 71.63274390947201 ], "wc_summary_review_avg": [ 26.25, 12.376893794486563 ], "wc_main_review_avg": [ 229.75, 87.48535591743341 ], "wc_review_avg": [ 381.5, 136.21765671160256 ], "wc_reply_reviewers_avg": [ 97.25, 118.80104166209992 ], "wc_reply_authors_avg": [ 1156.5, 729.8111056979059 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.15713484026367722, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8589429647517094065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "Google;University of California, Berkeley;Princeton University", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://www.berkeley.edu;https://www.princeton.edu", "aff_unique_abbr": "DeepMind;UC Berkeley;Princeton", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "ljnUrvex8d", "title": "Representation Topology Divergence: A Method for Comparing Neural Network Representations.", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Comparison of data representations is a complex multi-aspect problem that has not enjoyed a complete solution yet. We propose a method for comparing two data representations. We introduce the Representation Topology Divergence (RTD) score measuring the dissimilarity in multi-scale topology between two point clouds of equal size with a one-to-one correspondence between points. The data point clouds are allowed to lie in different ambient spaces. The RTD score is one of the few TDA-based practical methods applicable to real machine learning datasets. Experiments show the agreement of RTD with the intuitive assessment of data representation similarity. The proposed RTD score is sensitive to the data representation's fine topological structure. 
We use the RTD score to gain insights into neural network representations in computer vision and NLP domains for various problems: training dynamics analysis, data distribution shift, transfer learning, ensemble learning, and disentanglement assessment.", "keywords": "representation learning;understanding deep learning;topological data analysis", "primary_area": "", "supplementary_material": "/attachment/f0833ec7f51fa25ca0d7625aa53da68c8efdd17d.zip", "author": "Serguei Barannikov;Ilya Trofimov;Nikita Balabin;Evgeny Burnaev", "authorids": "~Serguei_Barannikov1;~Ilya_Trofimov1;~Nikita_Balabin1;~Evgeny_Burnaev1", "gender": ";;M;M", "homepage": ";;;http://faculty.skoltech.ru/people/evgenyburnaev", "dblp": "255/5203;130/0370;310/1857;144/7845", "google_scholar": "https://scholar.google.fr/citations?user=-soT8KcAAAAJ;https://scholar.google.ru/citations?user=V1c6KjgAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.ru/citations?user=pCRdcOwAAAAJ", "orcid": "0000-0002-9323-0651;0000-0002-2961-7368;;0000-0001-8424-0690", "linkedin": ";https://ru.linkedin.com/in/ilya-trofimov-ba122748;nikita-balabin-10455b17a/;", "or_profile": "~Serguei_Barannikov1;~Ilya_Trofimov1;~Nikita_Balabin1;~Evgeny_Burnaev1", "aff": "CNRS, Institut Mathematiques de Jussieu, Paris Diderot University;Skoltech;Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology", "aff_domain": "imj-prg.fr;skoltech.ru;skoltech.ru;skoltech.ru", "position": "Researcher;Research scientist;PhD student;Associate Professor", "bibtex": "@misc{\nbarannikov2022representation,\ntitle={Representation Topology Divergence: A Method for Comparing Neural Network Representations.},\nauthor={Serguei Barannikov and Ilya Trofimov and Nikita Balabin and Evgeny Burnaev},\nyear={2022},\nurl={https://openreview.net/forum?id=ljnUrvex8d}\n}", "github": "", "project": "", "reviewers": "8vVV;cQQo;mgv9;DKGG", "site": "https://openreview.net/forum?id=ljnUrvex8d", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;2", "correctness": "2;2;3;4", "technical_novelty": "3;1;3;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "41;44;61;98", "wc_summary_review": "50;33;23;64", "wc_main_review": "237;203;437;431", "wc_review": "328;280;521;593", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 61.0, 22.68259244442751 ], "wc_summary_review_avg": [ 42.5, 15.724185193516387 ], "wc_main_review_avg": [ 327.0, 107.69401097554126 ], "wc_review_avg": [ 430.5, 130.14703223662076 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8944271909999159, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11154211416501540994&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Paris Diderot University;Skolkovo Institute of Science and Technology", "aff_unique_dep": "Institut Mathematiques de Jussieu;", "aff_unique_url": "https://www.univ-paris-diderot.fr;https://www.skoltech.ru", "aff_unique_abbr": "Paris
Diderot;Skoltech", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "France;Russian Federation" }, { "title": "Closed-form Sample Probing for Learning Generative Models in Zero-shot Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6138", "id": "ljxWpdBl4V", "poster": "", "openreview": "https://openreview.net/forum?id=ljxWpdBl4V", "slides": "https://iclr.cc/virtual/2022/poster/6138", "video": "https://iclr.cc/virtual/2022/poster/6138", "author_site": "Samet Cetin, Orhun Baran, Ramazan Gokberk Cinbis", "tldr": "", "abstract": "Generative model based approaches have led to significant advances in zero-shot learning (ZSL) over the past few years. These approaches typically aim to learn a conditional generator that synthesizes training samples of classes conditioned on class definitions. The final zero-shot learning model is then obtained by training a supervised classification model over the real and/or synthesized training samples of seen and unseen classes, combined. Therefore, naturally, the generative model needs to produce not only relevant samples, but also those that are sufficiently rich for classifier training purposes, which is handled by various heuristics in existing works. In this paper, we introduce a principled approach for training generative models {\\em directly} for training data generation purposes. Our main observation is that the use of closed-form models opens doors to end-to-end training thanks to the differentiability of the solvers. In our approach, at each generative model update step, we fit a task-specific closed-form ZSL model from generated samples, and measure its loss on novel samples all within the compute graph, a procedure that we refer to as {\\em sample probing}. In this manner, the generator receives feedback directly based on the value of its samples for model training purposes. 
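A hedged sketch of this closed-form probing step: a ridge classifier is fitted in closed form on generated features inside the compute graph, and its loss on held-out samples backpropagates into the generator. Dimensions, the toy generator, and the random held-out features are illustrative.

import torch

D, C, N = 16, 5, 40
gen = torch.nn.Linear(C, D)                            # toy class-conditional generator

cls_ids = torch.randint(0, C, (N,))
onehot = torch.nn.functional.one_hot(cls_ids, C).float()
fake_feats = gen(onehot) + 0.1 * torch.randn(N, D)     # generated training samples

# Closed-form ridge solution W = (X^T X + lam I)^{-1} X^T Y is differentiable in X.
lam = 1e-2
X, Y = fake_feats, onehot
W = torch.linalg.solve(X.T @ X + lam * torch.eye(D), X.T @ Y)

real_feats = torch.randn(N, D)                         # stand-in for held-out novel samples
probe_loss = torch.nn.functional.cross_entropy(real_feats @ W, cls_ids)
probe_loss.backward()                                  # feedback flows into `gen` through the solver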
Our experimental results show that the proposed sample probing approach improves the ZSL results even when integrated into state-of-the-art generative models.\n", "keywords": "zero-shot learning;generative zero-shot learning;generative models", "primary_area": "", "supplementary_material": "/attachment/fe526ff8e2dc83ef303fadc220bed5fbcdc8a592.zip", "author": "Samet Cetin;Orhun Bu\u011fra Baran;Ramazan Gokberk Cinbis", "authorids": "~Samet_Cetin1;~Orhun_Bu\u011fra_Baran1;~Ramazan_Gokberk_Cinbis1", "gender": "M;M;M", "homepage": ";http://user.ceng.metu.edu.tr/~bugra;http://user.ceng.metu.edu.tr/~gcinbis/", "dblp": ";;54/2808", "google_scholar": ";;https://scholar.google.it/citations?user=Za7uka8AAAAJ", "orcid": ";;", "linkedin": "cetinsamet/;;", "or_profile": "~Samet_Cetin1;~Orhun_Bu\u011fra_Baran1;~Ramazan_G\u00f6kberk_Cinbi\u015f1", "aff": "METU;METU;METU", "aff_domain": "metu.edu.tr;metu.edu.tr;metu.edu.tr", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ncetin2022closedform,\ntitle={Closed-form Sample Probing for Learning Generative Models in Zero-shot Learning},\nauthor={Samet Cetin and Orhun Bu{\\u{g}}ra Baran and Ramazan Gokberk Cinbis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ljxWpdBl4V}\n}", "github": "", "project": "", "reviewers": "dVi5;vUDu;yvpj;NR3H;UDYK", "pdf_size": 0, "recommendation": "5;5;6;6;6", "confidence": "2;5;4;3;4", "correctness": "3;4;3;3;3", "technical_novelty": "2;2;3;3;2", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "22;76;74;177;140", "wc_summary_review": "16;18;40;240;105", "wc_main_review": "130;138;303;665;449", "wc_review": "168;232;417;1082;694", "wc_reply_reviewers": "0;0;0;295;0", "wc_reply_authors": "777;1015;650;1559;1263", "reply_reviewers": "0;0;0;1;0", "reply_authors": "2;2;1;2;2", "recommendation_avg": [ 5.6, 0.48989794855663565 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 97.8, 54.480822313911524 ], "wc_summary_review_avg": [ 83.8, 84.48999940821399 ], "wc_main_review_avg": [ 337.0, 201.8583661877803 ], "wc_review_avg": [ 518.6, 335.7282234188839 ], "wc_reply_reviewers_avg": [ 59.0, 118.0 ], "wc_reply_authors_avg": [ 1052.8, 328.68367772069246 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.8, 0.4 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.08006407690254361, "corr_recommendation_correctness": -0.6123724356957946, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11977259761302754277&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=ljxWpdBl4V", "email": "metu.edu.tr;metu.edu.tr;metu.edu.tr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Middle East Technical University", "aff_unique_dep": "", "aff_unique_url": "https://www.metu.edu.tr", "aff_unique_abbr": "METU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "T\u00fcrkiye" }, { "id": "lkQ7meEa-qv", "title": "Learning Neural Acoustic Fields", "track": "main", "status": "Reject", "tldr": "", "abstract": "Our sensory perception of the world is rich and multimodal. When we walk into a cathedral, acoustics as much as appearance inform us of the sanctuary's wide open space. 
Similarly, when we drop a wineglass, the sound immediately informs us as to whether it has shattered or not. In this vein, while recent advances in learned implicit functions have led to increasingly higher quality representations of the visual world, there have not been commensurate advances in learning auditory representations. To address this gap, we introduce Neural Acoustic Fields (NAFs), an implicit representation that captures how sounds propagate in a physical scene. By modeling the acoustic properties of the scene as a linear time-invariant system, NAFs continuously map all emitter and listener location pairs to an impulse response function that can then be applied to new sounds. We demonstrate that NAFs capture environment reverberations of a scene with high fidelity and can predict sound propagation for novel locations. Leveraging the scene structure learned by NAFs, we also demonstrate improved cross-modal generation of novel views of the scene given sparse visual views. Finally, the continuous nature of NAFs enables potential downstream applications such as sound source localization.", "keywords": "Audio-Visual Learning;Acoustic", "primary_area": "", "supplementary_material": "", "author": "Andrew Luo;Yilun Du;Michael J. Tarr;Joshua B. Tenenbaum;Antonio Torralba;Chuang Gan", "authorids": "~Andrew_Luo2;~Yilun_Du1;~Michael_J._Tarr1;~Joshua_B._Tenenbaum1;~Antonio_Torralba1;~Chuang_Gan1", "gender": "M;;;M;M;M", "homepage": "https://andrewluo.net/;https://yilundu.github.io;;http://web.mit.edu/torralba/www//;http://people.csail.mit.edu/ganchuang/;https://tarrlab.org", "dblp": "234/8054;204/4379;t/JoshuaBTenenbaum;t/AntonioBTorralba;139/6993;36/1880", "google_scholar": "bWYvvkUAAAAJ;;;https://scholar.google.com.tw/citations?user=8cxDHS4AAAAJ;PTeSCbIAAAAJ;O8ALPlkAAAAJ", "orcid": ";;;;;0000-0003-4724-1744", "linkedin": ";;;;;michael-tarr-ab078046/", "or_profile": "~Andrew_Luo2;~Yilun_Du1;~Joshua_B._Tenenbaum1;~Antonio_Torralba1;~Chuang_Gan1;~Michael_Tarr1", "aff": "Carnegie Mellon University;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;MIT-IBM Watson AI Lab;Carnegie Mellon University", "aff_domain": "cmu.edu;mit.edu;mit.edu;mit.edu;ibm.com;cmu.edu", "position": "PhD student;PhD student;Professor;Full Professor;PhD student;Full Professor", "bibtex": "@misc{\nluo2022learning,\ntitle={Learning Neural Acoustic Fields},\nauthor={Andrew Luo and Yilun Du and Michael J. Tarr and Joshua B. 
Tenenbaum and Antonio Torralba and Chuang Gan},\nyear={2022},\nurl={https://openreview.net/forum?id=lkQ7meEa-qv}\n}", "github": "", "project": "", "reviewers": "U1r6;V59y;HhWA;doFB", "site": "https://openreview.net/forum?id=lkQ7meEa-qv", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "3;4;2;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "63;75;71;98", "wc_summary_review": "97;213;67;40", "wc_main_review": "563;359;583;117", "wc_review": "723;647;721;255", "wc_reply_reviewers": "169;368;117;45", "wc_reply_authors": "1964;2134;1578;728", "reply_reviewers": "1;2;1;1", "reply_authors": "5;5;4;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 76.75, 13.007209539328564 ], "wc_summary_review_avg": [ 104.25, 65.9445790038878 ], "wc_main_review_avg": [ 405.5, 188.21995112102223 ], "wc_review_avg": [ 586.5, 193.82659776202027 ], "wc_reply_reviewers_avg": [ 174.75, 119.94660270303615 ], "wc_reply_authors_avg": [ 1601.0, 542.7973839288469 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 1.6393596310755 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5443310539518174, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13265441111858970234&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://web.mit.edu", "aff_unique_abbr": "CMU;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Inductive Bias of In-Context Learning: Rethinking Pretraining Example Design", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6999", "id": "lnEaqbTJIRz", "poster": "", "openreview": "https://openreview.net/forum?id=lnEaqbTJIRz", "slides": "https://iclr.cc/virtual/2022/poster/6999", "video": "https://iclr.cc/virtual/2022/poster/6999", "author_site": "Yoav Levine, Noam Wies, Daniel Jannai, Dan Navon, Yedid Hoshen, Amnon Shashua", "tldr": "", "abstract": "Pretraining Neural Language Models (NLMs) over a large corpus involves chunking the text into training examples, which are contiguous text segments of sizes processable by the neural architecture. We highlight a bias introduced by this common practice: we prove that the pretrained NLM can model much stronger dependencies between text segments that appeared in the same training example, than it can between text segments that appeared in different training examples. This intuitive result has a twofold role. First, it formalizes the motivation behind a broad line of recent successful NLM training heuristics, proposed for the pretraining and fine-tuning stages, which do not necessarily appear related at first glance. Second, our result clearly indicates further improvements to be made in NLM pretraining for the benefit of Natural Language Understanding tasks. 
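One such improvement, the "kNN-Pretraining" scheme proposed in the next sentence, can be sketched roughly as follows; the random vectors stand in for a real sentence encoder, and the packing scheme is an illustrative reading of the general recipe.

import numpy as np

sentences = [f"sentence {i}" for i in range(100)]      # a toy corpus
emb = np.random.default_rng(0).normal(size=(100, 32))  # stand-in sentence embeddings
emb /= np.linalg.norm(emb, axis=1, keepdims=True)

def knn_example(i, k=3):
    sims = emb @ emb[i]                                # cosine similarity to sentence i
    sims[i] = -np.inf                                  # exclude the query itself
    nn = np.argsort(-sims)[:k]                         # k nearest sentences, corpus-wide
    return [sentences[i]] + [sentences[j] for j in nn] # pack into one pretraining example

print(knn_example(0))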
As an example, we propose ``kNN-Pretraining'': we show that including semantically related non-neighboring sentences in the same pretraining example yields improved sentence representations and open domain question answering abilities. This theoretically motivated degree of freedom for pretraining example design indicates new training schemes for self-improving representations.", "keywords": "Language Modeling;Pretraining;Self-attention;Transformers;Expressivity;Separation Rank;Sentence Embeddings", "primary_area": "", "supplementary_material": "", "author": "Yoav Levine;Noam Wies;Daniel Jannai;Dan Navon;Yedid Hoshen;Amnon Shashua", "authorids": "~Yoav_Levine1;~Noam_Wies1;~Daniel_Jannai1;dan.nav@mail.huji.ac.il;~Yedid_Hoshen3;~Amnon_Shashua1", "gender": "M;M;M;;M;M", "homepage": ";;https://www.linkedin.com/in/daniel-jannai/;;https://www.cs.huji.ac.il/~ydidh/;http://www.cs.huji.ac.il/~shashua/", "dblp": "199/1895;236/6106;;;136/0280;47/1492", "google_scholar": ";https://scholar.google.co.il/citations?user=FxlR8voAAAAJ;;;https://scholar.google.co.il/citations?user=6y1-qS4AAAAJ;https://scholar.google.com.tw/citations?user=dwi5wvYAAAAJ", "orcid": ";0000-0002-1337-2298;;;;", "linkedin": ";noam-wies-a5ab1663/;;;;", "or_profile": "~Yoav_Levine1;~Noam_Wies1;~Daniel_Jannai1;dan.nav@mail.huji.ac.il;~Yedid_Hoshen3;~Amnon_Shashua1", "aff": "Hebrew University;Hebrew University of Jerusalem;Hebrew University of Jerusalem;;Hebrew University of Jerusalem;Hebrew University, Hebrew University of Jerusalem", "aff_domain": "huji.ac.il;huji.ac.il;mail.huji.ac.il;;huji.ac.il;cs.huji.ac.il", "position": "PhD student;PhD student;MS student;;Assistant Professor;Professor", "bibtex": "@inproceedings{\nlevine2022the,\ntitle={The Inductive Bias of In-Context Learning: Rethinking Pretraining Example Design},\nauthor={Yoav Levine and Noam Wies and Daniel Jannai and Dan Navon and Yedid Hoshen and Amnon Shashua},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lnEaqbTJIRz}\n}", "github": "", "project": "", "reviewers": "cz9E;twvb;Z6SX;yRBu;ADar", "pdf_size": 0, "recommendation": "8;8;8;8;8", "confidence": "3;2;3;3;3", "correctness": "3;4;3;3;3", "technical_novelty": "3;4;2;3;4", "empirical_novelty": "3;4;2;2;2", "wc_summary_paper": "105;71;129;128;189", "wc_summary_review": "41;21;59;42;55", "wc_main_review": "240;111;188;205;427", "wc_review": "386;203;376;375;671", "wc_reply_reviewers": "0;0;5;0;345", "wc_reply_authors": "604;64;967;647;1936", "reply_reviewers": "0;0;1;0;1", "reply_authors": "1;1;2;1;4", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 2.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 3.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 124.4, 38.56215761598409 ], "wc_summary_review_avg": [ 43.6, 13.32066064427737 ], "wc_main_review_avg": [ 234.2, 105.23383486312756 ], "wc_review_avg": [ 402.2, 150.74667492187015 ], "wc_reply_reviewers_avg": [ 70.0, 137.5136356875201 ], "wc_reply_authors_avg": [ 843.6, 618.4498686231569 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 1.1661903789690602 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7588785425292236580&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf":
"https://openreview.net/pdf?id=lnEaqbTJIRz", "email": "huji.ac.il;huji.ac.il;mail.huji.ac.il;;huji.ac.il;cs.huji.ac.il", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "", "aff_unique_url": "https://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Jerusalem", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Learning Long-Term Reward Redistribution via Randomized Return Decomposition", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6587", "id": "lpkGn3k2YdD", "poster": "", "openreview": "https://openreview.net/forum?id=lpkGn3k2YdD", "slides": "https://iclr.cc/virtual/2022/poster/6587", "video": "https://iclr.cc/virtual/2022/poster/6587", "author_site": "Zhizhou Ren, Ruihan Guo, Yuan Zhou, Jian Peng", "tldr": "", "abstract": "Many practical applications of reinforcement learning require agents to learn from sparse and delayed rewards. It challenges the ability of agents to attribute their actions to future outcomes. In this paper, we consider the problem formulation of episodic reinforcement learning with trajectory feedback. It refers to an extreme delay of reward signals, in which the agent can only obtain one reward signal at the end of each trajectory. A popular paradigm for this problem setting is learning with a designed auxiliary dense reward function, namely proxy reward, instead of sparse environmental signals. Based on this framework, this paper proposes a novel reward redistribution algorithm, randomized return decomposition (RRD), to learn a proxy reward function for episodic reinforcement learning. We establish a surrogate problem by Monte-Carlo sampling that scales up least-squares-based reward redistribution to long-horizon problems. We analyze our surrogate loss function by connection with existing methods in the literature, which illustrates the algorithmic properties of our approach. 
In experiments, we extensively evaluate our proposed method on a variety of benchmark tasks with episodic rewards and demonstrate substantial improvement over baseline algorithms.", "keywords": "Reinforcement Learning;Long-Term Credit Assignment;Reward Redistribution;Return Decomposition", "primary_area": "", "supplementary_material": "", "author": "Zhizhou Ren;Ruihan Guo;Yuan Zhou;Jian Peng", "authorids": "~Zhizhou_Ren1;~Ruihan_Guo1;~Yuan_Zhou1;~Jian_Peng1", "gender": "M;M;M;M", "homepage": ";https://github.com/guoruihan;http://yuanz.web.illinois.edu;http://jianpeng.web.engr.illinois.edu/", "dblp": "https://dblp.uni-trier.de/pid/239/5714.html;;40/7018;29/4181-1", "google_scholar": "xgpMeDgAAAAJ;;https://scholar.google.com.tw/citations?user=aR34e1gAAAAJ;https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Zhizhou_Ren1;~Ruihan_Guo1;~Yuan_Zhou1;~Jian_Peng1", "aff": "University of Illinois, Urbana Champaign;Shanghai Jiao Tong University, Tsinghua University;;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;sjtu.edu.cn;;illinois.edu", "position": "PhD student;Undergrad student;;Assistant Professor", "bibtex": "@inproceedings{\nren2022learning,\ntitle={Learning Long-Term Reward Redistribution via Randomized Return Decomposition},\nauthor={Zhizhou Ren and Ruihan Guo and Yuan Zhou and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lpkGn3k2YdD}\n}", "github": "", "project": "", "reviewers": "UMu2;73tN;ySc8;qFZu", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "4;3;4;2", "correctness": "3;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "63;60;114;86", "wc_summary_review": "50;51;45;37", "wc_main_review": "263;567;384;144", "wc_review": "376;678;543;267", "wc_reply_reviewers": "48;67;0;0", "wc_reply_authors": "834;950;324;354", "reply_reviewers": "1;2;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.75, 21.672274915199836 ], "wc_summary_review_avg": [ 45.75, 5.539629951540085 ], "wc_main_review_avg": [ 339.5, 156.37215225224728 ], "wc_review_avg": [ 466.0, 156.9824830992299 ], "wc_reply_reviewers_avg": [ 28.75, 29.524354353651834 ], "wc_reply_authors_avg": [ 615.5, 279.7262054223737 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12389513535108604835&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=lpkGn3k2YdD", "email": "illinois.edu;sjtu.edu.cn;;illinois.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.sjtu.edu.cn", "aff_unique_abbr": "UIUC;SJTU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "lpwzJuyFs2", 
"title": "Learning Stochastic Representations of Physical Systems", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning representations of physical systems is an important problem at the interface of statistical physics and machine learning. Recently, there has been a growing interest in devising methods to analyze high-dimensional simulation data generated by unbiased or biased samplers. As statistical physics systems consisting of $N \\gg 1$ objects tend to have many degrees of freedom, dimensionality reduction methods are of particular interest. Here, we use a new method, multiscale reweighted stochastic embedding (MRSE), to analyze handwritten digits data sets and a biased trajectory of alanine tetrapeptide, and show that we can reconstruct low-dimensional representations of these data sets while retaining the most informative characteristics of their high-dimensional representation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jakub Rydzewski;Omar Valsson", "authorids": "~Jakub_Rydzewski1;valsson@mpip-mainz.mpg.de", "gender": ";", "homepage": "https://fizyka.umk.pl/~jr;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Jakub_Rydzewski1;valsson@mpip-mainz.mpg.de", "aff": "Institute of Physics, Nicolaus Copernicus University;", "aff_domain": "fizyka.umk.pl;", "position": "Assistant Professor;", "bibtex": "@misc{\nrydzewski2022learning,\ntitle={Learning Stochastic Representations of Physical Systems},\nauthor={Jakub Rydzewski and Omar Valsson},\nyear={2022},\nurl={https://openreview.net/forum?id=lpwzJuyFs2}\n}", "github": "", "project": "", "reviewers": "sr3f;sVNN;VNt9;AZ5u", "site": "https://openreview.net/forum?id=lpwzJuyFs2", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;3;4", "correctness": "3;4;4;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "27;24;30;43", "wc_summary_review": "17;83;35;42", "wc_main_review": "203;97;309;150", "wc_review": "247;204;374;235", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 31.0, 7.245688373094719 ], "wc_summary_review_avg": [ 44.25, 24.159625411003375 ], "wc_main_review_avg": [ 189.75, 78.38805712606991 ], "wc_review_avg": [ 265.0, 64.85753618508801 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SelpAduAieYJ:scholar.google.com/&scioq=Learning+Stochastic+Representations+of+Physical+Systems&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Nicolaus Copernicus University", "aff_unique_dep": "Institute of Physics", "aff_unique_url": "https://wwwfiz.univ.gda.pl/", "aff_unique_abbr": "", "aff_country_unique_index": "0", "aff_country_unique": "Poland" }, { "title": "Approximation and Learning with Deep Convolutional Models: a Kernel Perspective", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6387", "id": "lrocYB-0ST2", "poster": "", "openreview": "https://openreview.net/forum?id=lrocYB-0ST2", "slides": "https://iclr.cc/virtual/2022/poster/6387", "video": "https://iclr.cc/virtual/2022/poster/6387", "tldr": "", "abstract": "The empirical success of deep convolutional networks on tasks involving high-dimensional data such as images or audio suggests that they can efficiently approximate certain functions that are well-suited for such tasks. In this paper, we study this through the lens of kernel methods, by considering simple hierarchical kernels with two or three convolution and pooling layers, inspired by convolutional kernel networks. These achieve good empirical performance on standard vision datasets, while providing a precise description of their functional space that yields new insights on their inductive bias. We show that the RKHS consists of additive models of interaction terms between patches, and that its norm encourages spatial similarities between these terms through pooling layers. We then provide generalization bounds which illustrate how pooling and patches yield improved sample complexity guarantees when the target function presents such regularities.", "keywords": "kernel methods;deep learning theory;convolution;approximation;generalization", "primary_area": "", "supplementary_material": "/attachment/f97f90f40e8281a4aaf207b0702056a29a294971.zip", "author": "Alberto Bietti", "authorids": "~Alberto_Bietti1", "gender": "M", "homepage": "http://alberto.bietti.me", "dblp": "166/6461", "google_scholar": "iT7Tp70AAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Alberto_Bietti1", "aff": "New York University", "aff_domain": "nyu.edu", "position": "Postdoc", "bibtex": "@inproceedings{\nbietti2022approximation,\ntitle={Approximation and Learning with Deep Convolutional Models: a Kernel Perspective},\nauthor={Alberto Bietti},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lrocYB-0ST2}\n}", "github": "", "project": "", "reviewers": "FaP6;GUNE;GwRv;nNN4", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;3;4", "correctness": "3;4;4;4", "technical_novelty": "4;3;3;4", "empirical_novelty": "4;3;3;4", "wc_summary_paper": "121;74;99;106", "wc_summary_review": "8;14;62;81", "wc_main_review": "284;150;527;375", "wc_review": "413;238;688;562", "wc_reply_reviewers": "0;0;234;0", "wc_reply_authors": "200;125;528;310", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 100.0, 16.98528775146303 ], "wc_summary_review_avg": [ 41.25, 31.059418861272984 ], "wc_main_review_avg": [ 334.0, 137.19147203817008 ], "wc_review_avg": [ 475.25, 168.0407316694378 ], "wc_reply_reviewers_avg": [ 58.5, 101.32497224277932 ], "wc_reply_authors_avg": [ 290.75, 151.95949295782742 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16497248736027137488&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=lrocYB-0ST2", "email": 
"nyu.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "lsQCDXjOl3k", "title": "Unconditional Diffusion Guidance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Classifier guidance is a recently introduced method to trade off mode coverage and sample fidelity in conditional diffusion models post training, in the same spirit as low temperature sampling or truncation in other types of generative models. Classifier guidance combines the score estimate of a diffusion model with the gradient of an image classifier and thereby requires training an image classifier separate from the diffusion model. It also raises the question of whether guidance can be performed without a classifier. We show that guidance can be indeed performed by a pure generative model without such a classifier: in what we call unconditional guidance, we jointly train a conditional and an unconditional diffusion model, and we combine the resulting conditional and unconditional score estimates to attain a trade-off between sample quality and diversity similar to that obtained using classifier guidance.", "keywords": "diffusion;score;guidance;generative", "primary_area": "", "supplementary_material": "/attachment/a66599ec407d12b6b2d99ec8e3417b721b397a23.zip", "author": "Jonathan Ho;Tim Salimans", "authorids": "~Jonathan_Ho1;~Tim_Salimans1", "gender": ";M", "homepage": ";", "dblp": "80/8677;116/2791", "google_scholar": "iVLAQysAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Jonathan_Ho1;~Tim_Salimans1", "aff": "Google;Google", "aff_domain": "google.com;google.com", "position": "Researcher;Research Scientist", "bibtex": "@misc{\nho2022unconditional,\ntitle={Unconditional Diffusion Guidance},\nauthor={Jonathan Ho and Tim Salimans},\nyear={2022},\nurl={https://openreview.net/forum?id=lsQCDXjOl3k}\n}", "github": "", "project": "", "reviewers": "4V2e;Y1QD;U6n2;zw1c", "site": "https://openreview.net/forum?id=lsQCDXjOl3k", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "4;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "33;192;91;155", "wc_summary_review": "18;21;70;61", "wc_main_review": "173;578;434;568", "wc_review": "224;791;595;784", "wc_reply_reviewers": "0;237;209;159", "wc_reply_authors": "222;491;476;683", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 117.75, 60.825056514564785 ], "wc_summary_review_avg": [ 42.5, 23.243278598338918 ], "wc_main_review_avg": [ 438.25, 163.35601458164925 ], "wc_review_avg": [ 598.5, 230.07009801362713 ], "wc_reply_reviewers_avg": [ 151.25, 91.68526326515074 ], "wc_reply_authors_avg": [ 468.0, 163.8093403930313 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jBSiVSlXCLkJ:scholar.google.com/&scioq=Unconditional+Diffusion+Guidance&hl=en&as_sdt=0,5", 
"gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "lsljy2bG3n", "title": "$m$-mix: Generating hard negatives via multiple samples mixing for contrastive learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Negative pairs are essential in contrastive learning, which plays the role of avoiding degenerate solutions. Hard negatives can improve the representation ability on the basis of common negatives. Inspired by recent hard negative mining methods via mixup operation in vision, we propose $m$-mix, which generates hard negatives dynamically. Compared with previous methods, $m$-mix mainly has three advantages: 1) adaptively chooses samples to mix; 2) simultaneously mixes multiple samples; 3) automatically and comprehensively assigns different mixing weights to the selected mixing samples. We evaluate our method on two image classification datasets, five node classification datasets (PPI, DBLP, Pubmed, etc), five graph classification datasets (IMDB, PTC\\_MR, etc), and downstream combinatorial tasks (graph edit distance and clustering). Results show that our method achieves state-of-the-art performance on most benchmarks under self-supervised settings.", "keywords": "Graph learning;self supervised learning", "primary_area": "", "supplementary_material": "", "author": "Shaofeng Zhang;Meng Liu;Junchi Yan;Hengrui Zhang;Lingxiao Huang;Pinyan Lu;Xiaokang Yang", "authorids": "~Shaofeng_Zhang1;~Meng_Liu5;~Junchi_Yan2;~Hengrui_Zhang2;~Lingxiao_Huang2;~Pinyan_Lu2;~Xiaokang_Yang1", "gender": "M;;M;M;;M;M", "homepage": "https://sherrylone.github.io;https://github.com/ruby135;https://hengruizhang98.github.io/;https://sites.google.com/site/lingxiaohuang1990;http://pinyanlu.com;https://icne.sjtu.edu.cn/info/1064/1078.htm;http://thinklab.sjtu.edu.cn/", "dblp": "132/2540;;;119/4814.html;;06/3071-1.html;60/7949.html", "google_scholar": "VoVVJIgAAAAJ;;;;;yDEavdMAAAAJ;ga230VoAAAAJ", "orcid": ";;;;;0000-0003-4029-3322;0000-0001-9639-7679", "linkedin": ";;;;;;", "or_profile": "~Shaofeng_Zhang1;~Meng_Liu5;~Hengrui_Zhang2;~Lingxiao_Huang2;~Pinyan_Lu2;~Xiaokang_Yang1;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;;Nanjing University;Institute for Theoretical Computer Science;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;;nju.edu.cn;shufe.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;MS student;;Associate Professor;Full Professor;Full Professor;Associate Professor", "bibtex": "@misc{\nzhang2022mmix,\ntitle={\\$m\\$-mix: Generating hard negatives via multiple samples mixing for contrastive learning},\nauthor={Shaofeng Zhang and Meng Liu and Junchi Yan and Hengrui Zhang and Lingxiao Huang and Pinyan Lu and Xiaokang Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=lsljy2bG3n}\n}", "github": "", "project": "", "reviewers": "AxgN;7bmf;docm;qctw", "site": "https://openreview.net/forum?id=lsljy2bG3n", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "60;58;61;77", "wc_summary_review": "84;62;32;47", "wc_main_review": "928;241;314;225", "wc_review": "1072;361;407;349", 
"wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.0, 7.582875444051551 ], "wc_summary_review_avg": [ 56.25, 19.2142525225417 ], "wc_main_review_avg": [ 427.0, 291.1915177336043 ], "wc_review_avg": [ 547.25, 303.7370367604188 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uEUDHZA9IKwJ:scholar.google.com/&scioq=%24m%24-mix:+Generating+hard+negatives+via+multiple+samples+mixing+for+contrastive+learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Nanjing University;Institute for Theoretical Computer Science", "aff_unique_dep": ";;Theoretical Computer Science", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.nju.edu.cn;https://www.tu-dresden.de/ics", "aff_unique_abbr": "SJTU;Nanjing U;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Germany" }, { "title": "Weighted Training for Cross-Task Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7203", "id": "ltM1RMZntpu", "poster": "", "openreview": "https://openreview.net/forum?id=ltM1RMZntpu", "slides": "https://iclr.cc/virtual/2022/poster/7203", "video": "https://iclr.cc/virtual/2022/poster/7203", "author_site": "Shuxiao Chen, Koby Crammer, Hangfeng He, Dan Roth, Weijie J Su", "tldr": "", "abstract": "In this paper, we introduce Target-Aware Weighted Training (TAWT), a weighted training algorithm for cross-task learning based on minimizing a representation-based task distance between the source and target tasks. We show that TAWT is easy to implement, is computationally efficient, requires little hyperparameter tuning, and enjoys non-asymptotic learning-theoretic guarantees. The effectiveness of TAWT is corroborated through extensive experiments with BERT on four sequence tagging tasks in natural language processing (NLP), including part-of-speech (PoS) tagging, chunking, predicate detection, and named entity recognition (NER). 
As a byproduct, the proposed representation-based task distance allows one to reason in a theoretically principled way about several critical aspects of cross-task learning, such as the choice of the source data and the impact of fine-tuning.", "keywords": "Cross-task learning;Natural language processing;Representation learning", "primary_area": "", "supplementary_material": "/attachment/4554ccf3b03ed539601ba4727c22f5b6646fff7a.zip", "author": "Shuxiao Chen;Koby Crammer;Hangfeng He;Dan Roth;Weijie J Su", "authorids": "~Shuxiao_Chen1;~Koby_Crammer1;~Hangfeng_He3;~Dan_Roth3;~Weijie_J_Su1", "gender": ";;M;M;M", "homepage": ";;https://hornhehhf.github.io;https://www.cis.upenn.edu/~danroth/;http://stat.wharton.upenn.edu/~suw/", "dblp": "245/7358;74/6961;190/7762-1.html;r/DanRoth;228/9127", "google_scholar": "piBuFuUAAAAJ;;BbpI6QoAAAAJ;E-bpPWgAAAAJ;Uhf4nBkAAAAJ", "orcid": ";;0000-0001-5136-1218;;", "linkedin": ";;;dan-roth-8667361/;", "or_profile": "~Shuxiao_Chen1;~Koby_Crammer1;~Hangfeng_He3;~Dan_Roth3;~Weijie_J_Su1", "aff": "The Wharton School, University of Pennsylvania;;University of Pennsylvania;Amazon;University of Pennsylvania", "aff_domain": "wharton.upenn.edu;;upenn.edu;amazon.com;upenn.edu", "position": "PhD student;;PhD student;VP and Distinguished Scientist;Assistant Professor", "bibtex": "@inproceedings{\nchen2022weighted,\ntitle={Weighted Training for Cross-Task Learning},\nauthor={Shuxiao Chen and Koby Crammer and Hangfeng He and Dan Roth and Weijie J Su},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ltM1RMZntpu}\n}", "github": "", "project": "", "reviewers": "jFGp;HcJj;wLuT;xsx7", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "3;4;4;4", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "37;289;46;46", "wc_summary_review": "26;51;14;65", "wc_main_review": "235;321;238;182", "wc_review": "298;661;298;293", "wc_reply_reviewers": "0;0;0;4", "wc_reply_authors": "155;126;0;155", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;0;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 104.5, 106.58447354094311 ], "wc_summary_review_avg": [ 39.0, 20.087309426600665 ], "wc_main_review_avg": [ 244.0, 49.72423956180728 ], "wc_review_avg": [ 387.5, 157.91849163413383 ], "wc_reply_reviewers_avg": [ 1.0, 1.7320508075688772 ], "wc_reply_authors_avg": [ 109.0, 64.03514659934808 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5570518371918150850&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ltM1RMZntpu", "email": "wharton.upenn.edu;;upenn.edu;amazon.com;upenn.edu", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Pennsylvania;Amazon", "aff_unique_dep": "The Wharton School;Amazon.com, Inc.", "aff_unique_url": "https://www.wharton.upenn.edu;https://www.amazon.com", "aff_unique_abbr": "UPenn Wharton;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "luO6l9cP6b6", "title": "Identifying the Limits of Cross-Domain Knowledge Transfer for Pretrained Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "There is growing evidence that pretrained language models improve task-specific fine-tuning even where the task examples are radically different from those seen in training. What is the nature of this surprising cross-domain transfer? We offer a partial answer via a systematic exploration of how much transfer occurs when models are denied any information about word identity via random scrambling. In four classification tasks and two sequence labeling tasks, we evaluate LSTMs using GloVe embeddings, BERT, and baseline models. Among these models, we find that only BERT shows high rates of transfer into our scrambled domains, and for classification but not sequence labeling tasks. Our analyses seek to explain why transfer succeeds for some tasks but not others, to isolate the separate contributions of pretraining versus fine-tuning, to show that the fine-tuning process is not merely learning to unscramble the scrambled inputs, and to quantify the role of word frequency. These findings help explain where and why cross-domain transfer occurs, which can guide future studies and practical fine-tuning efforts.", "keywords": "transfer learning;pretrained language model", "primary_area": "", "supplementary_material": "/attachment/093f9d57c0fd9f4e0042900632a4ba53e3bc983d.zip", "author": "Zhengxuan Wu;Nelson F. Liu;Christopher Potts", "authorids": "~Zhengxuan_Wu1;~Nelson_F._Liu1;~Christopher_Potts1", "gender": "M;M;M", "homepage": "https://cs.stanford.edu/~wuzhengx/;http://nelsonliu.me;http://web.stanford.edu/~cgpotts/", "dblp": "234/4650;203/9152;13/2617", "google_scholar": "CBvE6lwAAAAJ;ghGDz7MAAAAJ;3j08YoAAAAAJ", "orcid": ";;0000-0002-7978-6055", "linkedin": ";;", "or_profile": "~Zhengxuan_Wu1;~Nelson_F._Liu1;~Christopher_Potts1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "MS student;PhD student;Full Professor", "bibtex": "@misc{\nwu2022identifying,\ntitle={Identifying the Limits of Cross-Domain Knowledge Transfer for Pretrained Models},\nauthor={Zhengxuan Wu and Nelson F. 
Liu and Christopher Potts},\nyear={2022},\nurl={https://openreview.net/forum?id=luO6l9cP6b6}\n}", "github": "", "project": "", "reviewers": "vdgZ;6a8W;4jKu;dnE4", "site": "https://openreview.net/forum?id=luO6l9cP6b6", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;5;4", "correctness": "2;3;3;4", "technical_novelty": "3;2;1;2", "empirical_novelty": "3;2;1;3", "wc_summary_paper": "130;55;22;132", "wc_summary_review": "16;17;41;37", "wc_main_review": "222;280;351;99", "wc_review": "368;352;414;268", "wc_reply_reviewers": "399;153;148;0", "wc_reply_authors": "1363;755;1599;371", "reply_reviewers": "3;1;2;0", "reply_authors": "7;3;7;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.75, 47.704166484700266 ], "wc_summary_review_avg": [ 27.75, 11.344051304538427 ], "wc_main_review_avg": [ 238.0, 92.34446382972831 ], "wc_review_avg": [ 350.5, 52.789677021175265 ], "wc_reply_reviewers_avg": [ 175.0, 143.19043264129067 ], "wc_reply_authors_avg": [ 1022.0, 485.87549845613745 ], "reply_reviewers_avg": [ 1.5, 1.118033988749895 ], "reply_authors_avg": [ 4.5, 2.598076211353316 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13188776707872629330&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "lu_DAxnWsh", "title": "Guiding Transformers to Process in Steps", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks have matched or surpassed human abilities in many tasks that humans solve quickly and unconsciously, i.e., via Kahneman's \u201cSystem 1\u201d, but have not been as successful when applied to \u201cSystem 2\u201d tasks that involve conscious multi-step reasoning. In this work, we argue that the kind of training that works for System 1 tasks is not sufficient for System 2 tasks, propose an alternative, and empirically demonstrate its effectiveness. Specifically, while learning a direct mapping from inputs to outputs is feasible for System 1 tasks, we argue that algorithmic System 2 tasks can only be solved by learning a mapping from inputs to outputs through a series of intermediate steps. We first show that by using enough intermediate steps a 1-layer 1-head Transformer can in principle compute any finite function, proving the generality of the approach. We then show empirically that a 1-layer 1-head Transformer cannot learn to compute the sum of binary numbers directly from the inputs, but is able to compute the sum when trained to first generate a series of intermediate results. This demonstrates, at a small scale, how a fixed-size neural network can lack the expressivity to encode the direct input-output mapping for an algorithmic task and yet be fully capable of computing the outputs through intermediate steps. 
Finally, we show that a Frozen Pretrained Transformer is able to learn binary addition when trained to compute the carry bits before the sum, while it fails to learn the task without using intermediates. These results indicate that explicitly guiding the neural networks through the intermediate computations can be an effective approach for tackling algorithmic tasks.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f4349e8c1f93fbfc52beb40fb5ade2901f00d0fa.zip", "author": "Simas Sakenis;Stuart Shieber", "authorids": "~Simas_Sakenis1;~Stuart_Shieber1", "gender": ";M", "homepage": ";http://www.eecs.harvard.edu/~shieber/", "dblp": ";", "google_scholar": ";", "orcid": ";0000-0002-7733-8195", "linkedin": "simassakenis/;", "or_profile": "~Simas_Sakenis1;~Stuart_Shieber1", "aff": "Harvard University;Harvard University", "aff_domain": "harvard.edu;harvard.edu", "position": "Undergrad student;Full Professor", "bibtex": "@misc{\nsakenis2022guiding,\ntitle={Guiding Transformers to Process in Steps},\nauthor={Simas Sakenis and Stuart Shieber},\nyear={2022},\nurl={https://openreview.net/forum?id=lu_DAxnWsh}\n}", "github": "", "project": "", "reviewers": "eRnm;urkD;4ZwD;KKJC", "site": "https://openreview.net/forum?id=lu_DAxnWsh", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;3;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "47;70;120;101", "wc_summary_review": "57;34;25;114", "wc_main_review": "245;615;682;734", "wc_review": "349;719;827;949", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.5, 28.0579756931964 ], "wc_summary_review_avg": [ 57.5, 34.64462440264001 ], "wc_main_review_avg": [ 569.0, 191.75896328464023 ], "wc_review_avg": [ 711.0, 224.28107365535774 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2I95NZ2Y1yMJ:scholar.google.com/&scioq=Guiding+Transformers+to+Process+in+Steps&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "lusH5Q9Vt5_", "title": "Generalized Sampling Method for Few Shot Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Few shot learning is an important problem in machine learning as large labelled datasets take considerable time and effort to assemble. Most few-shot learning algorithms suffer from one of two limitations--- they either require the design of sophisticated models and loss functions, thus hampering interpretability; or employ statistical techniques but make assumptions that may not hold across different datasets or features. 
Building on recent work on extrapolating distributions of small sample classes from the most similar larger classes, we propose a Generalized Sampling method that learns to estimate few-shot distributions for classification as weighted random variables of all large classes. We use a form of covariance shrinkage to provide robustness against singular covariances due to overparameterized features or small datasets. We show that a single hyperparameter in our method matches the accuracies from Euclidean, Mahalanobis and other forms of distances used for estimating the weights of random variables. Our method works with arbitrary off-the-shelf feature extractors and outperforms existing state-of-the-art on miniImagenet, CUB and Stanford Dogs datasets by 3% to 5% on 5way-1shot and 5way-5shot tasks.", "keywords": "Few-shot learning;distribution estimation;sampling method", "primary_area": "", "supplementary_material": "", "author": "Shakti Kumar;Hussain Asim Zaidi", "authorids": "~Shakti_Kumar1;~Hussain_Asim_Zaidi1", "gender": "M;", "homepage": "https://www.cs.toronto.edu/~shaktik/;", "dblp": ";", "google_scholar": ";QX-8a9EAAAAJ", "orcid": "0000-0003-3981-096X;", "linkedin": "https://ca.linkedin.com/in/shakti-kumar-6170a7101;", "or_profile": "~Shakti_Kumar1;~Hussain_Asim_Zaidi1", "aff": "The Vanguard Group;University of Virginia", "aff_domain": "vanguard.com;virginia.edu", "position": "Researcher;Principal Researcher", "bibtex": "@misc{\nkumar2022generalized,\ntitle={Generalized Sampling Method for Few Shot Learning},\nauthor={Shakti Kumar and Hussain Asim Zaidi},\nyear={2022},\nurl={https://openreview.net/forum?id=lusH5Q9Vt5_}\n}", "github": "", "project": "", "reviewers": "2AEZ;R4wg;vnvn;EVho", "site": "https://openreview.net/forum?id=lusH5Q9Vt5_", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;5;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "27;207;125;47", "wc_summary_review": "67;91;21;31", "wc_main_review": "402;416;194;129", "wc_review": "496;714;340;207", "wc_reply_reviewers": "315;0;0;129", "wc_reply_authors": "823;319;286;294", "reply_reviewers": "2;0;0;1", "reply_authors": "4;2;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.5, 71.06862880343196 ], "wc_summary_review_avg": [ 52.5, 28.0490641555115 ], "wc_main_review_avg": [ 285.25, 125.96304021418347 ], "wc_review_avg": [ 439.25, 188.74503304723015 ], "wc_reply_reviewers_avg": [ 111.0, 129.01744068148307 ], "wc_reply_authors_avg": [ 430.5, 226.93666517334742 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CdBsiiFv1bkJ:scholar.google.com/&scioq=Generalized+Sampling+Method+for+Few+Shot+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Vanguard Group;University of Virginia", "aff_unique_dep": ";", "aff_unique_url": "https://www.vanguard.com;https://www.virginia.edu", "aff_unique_abbr": ";UVA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", 
"aff_country_unique": "United States" }, { "id": "lvM693mon8q", "title": "Compressed-VFL: Communication-Efficient Learning with Vertically Partitioned Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose Compressed Vertical Federated Learning (C-VFL) for communication-efficient training on vertically partitioned data. In C-VFL, a server and multiple parties collaboratively train a model on their respective features utilizing several local iterations and sharing compressed intermediate results periodically. Our work provides the first theoretical analysis of the effect message compression has on distributed training over vertically partitioned data. We prove convergence of non-convex objectives to a fixed point at a rate of $O(\\frac{1}{\\sqrt{T}})$ when the compression error is bounded over the course of training. We provide specific requirements for convergence with common compression techniques, such as quantization and top-$k$ sparsification. Finally, we experimentally show compression can reduce communication by over $90\\%$ without a significant decrease in accuracy over VFL without compression.", "keywords": "Federated Learning;Optimization;Split Learning;Cross-Silo Learning;Compression;Quantization;Sparsification", "primary_area": "", "supplementary_material": "/attachment/c2f57245fb66754358800cb7c15e8651abf08669.zip", "author": "Timothy Castiglia;Anirban Das;Shiqiang Wang;Stacy Patterson", "authorids": "~Timothy_Castiglia1;~Anirban_Das1;~Shiqiang_Wang1;~Stacy_Patterson1", "gender": "M;M;M;", "homepage": ";http://homepages.rpi.edu/~dasa2/;https://shiqiang.wang;https://www.cs.rpi.edu/~pattes3/", "dblp": ";;87/5094-1;", "google_scholar": "5zGUUmUAAAAJ;h-5uB0oAAAAJ;kA_vmOcAAAAJ;", "orcid": ";0000-0002-1417-5173;;", "linkedin": ";anirbandas01/;;", "or_profile": "~Timothy_Castiglia1;~Anirban_Das1;~Shiqiang_Wang1;~Stacy_Patterson1", "aff": "Rensselaer Polytechnic Institute;Capital One;IBM, International Business Machines;Rensselaer Polytechnic Institute", "aff_domain": "rpi.edu;capitalone.com;us.ibm.com;rpi.edu", "position": "PhD student;Researcher;Research Staff Member;Associate Professor", "bibtex": "@misc{\ncastiglia2022compressedvfl,\ntitle={Compressed-{VFL}: Communication-Efficient Learning with Vertically Partitioned Data},\nauthor={Timothy Castiglia and Anirban Das and Shiqiang Wang and Stacy Patterson},\nyear={2022},\nurl={https://openreview.net/forum?id=lvM693mon8q}\n}", "github": "", "project": "", "reviewers": "aP4B;JWDi;JhoT;2WsW", "site": "https://openreview.net/forum?id=lvM693mon8q", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;3;3;3", "correctness": "2;3;2;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "55;87;109;7", "wc_summary_review": "28;66;13;28", "wc_main_review": "246;134;230;245", "wc_review": "329;287;352;280", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "693;846;945;411", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 64.5, 38.35035853808932 ], "wc_summary_review_avg": [ 33.75, 19.60070151805797 ], "wc_main_review_avg": [ 213.75, 46.47781728954147 ], "wc_review_avg": [ 312.0, 29.740544715926102 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 723.75, 201.65239274553625 
], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7143121005409493886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Rensselaer Polytechnic Institute;Capital One;International Business Machines", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rpi.edu;https://www.capitalone.com;https://www.ibm.com", "aff_unique_abbr": "RPI;Capital One;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Variational Dropout Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6082", "id": "lyLVzukXi08", "poster": "", "openreview": "https://openreview.net/forum?id=lyLVzukXi08", "slides": "https://iclr.cc/virtual/2022/poster/6082", "video": "https://iclr.cc/virtual/2022/poster/6082", "author_site": "Insu Jeon, Youngjin Park, Gunhee Kim", "tldr": "", "abstract": "Learning to infer the conditional posterior model is a key step for robust meta-learning. This paper presents a new Bayesian meta-learning approach called Neural Variational Dropout Processes (NVDPs). NVDPs model the conditional posterior distribution based on a task-specific dropout; a low-rank product of Bernoulli experts meta-model is utilized for a memory-efficient mapping of dropout rates from a few observed contexts. It allows for a quick reconfiguration of a globally learned and shared neural network for new tasks in multi-task few-shot learning. In addition, NVDPs utilize a novel prior conditioned on the whole task data to optimize the conditional dropout posterior in the amortized variational inference. Surprisingly, this enables the robust approximation of task-specific dropout rates that can deal with a wide range of functional ambiguities and uncertainties. We compared the proposed method with other meta-learning approaches in the few-shot learning tasks such as 1D stochastic regression, image inpainting, and classification. 
The results show the excellent performance of NVDPs.", "keywords": "Meta Learning;Few-shot Learning;Bayesian Neural Networks;Variational Dropout", "primary_area": "", "supplementary_material": "", "author": "Insu Jeon;Youngjin Park;Gunhee Kim", "authorids": "~Insu_Jeon2;youngjin.create@gmail.com;~Gunhee_Kim1", "gender": ";;M", "homepage": ";;http://vision.snu.ac.kr/gunhee/", "dblp": ";;45/115", "google_scholar": ";;https://scholar.google.co.kr/citations?user=CiSdOV0AAAAJ", "orcid": ";;0000-0002-9543-7453", "linkedin": ";;", "or_profile": "~Insu_Jeon2;youngjin.create@gmail.com;~Gunhee_Kim1", "aff": ";;Seoul National University", "aff_domain": ";;snu.ac.kr", "position": ";;Full Professor", "bibtex": "@inproceedings{\njeon2022neural,\ntitle={Neural Variational Dropout Processes},\nauthor={Insu Jeon and Youngjin Park and Gunhee Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lyLVzukXi08}\n}", "github": "", "project": "", "reviewers": "uz6u;8oez;HrLQ", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;4", "correctness": "4;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "82;95;94", "wc_summary_review": "92;16;25", "wc_main_review": "140;206;584", "wc_review": "314;317;703", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "368;856;556", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 90.33333333333333, 5.90668171555645 ], "wc_summary_review_avg": [ 44.333333333333336, 33.9050963065371 ], "wc_main_review_avg": [ 310.0, 195.61186058110076 ], "wc_review_avg": [ 444.6666666666667, 182.67335754169395 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 593.3333333333334, 200.966553325561 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1259027027664167106&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=lyLVzukXi08", "email": ";;snu.ac.kr", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "lycl1GD7fVP", "title": "Neural tangent kernel eigenvalues accurately predict generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Finding a quantitative theory of neural network generalization has long been a central goal of deep learning research. We extend recent results to demonstrate that, by examining the eigensystem of a neural network's \"neural tangent kernel,\" one can predict its generalization performance when learning arbitrary functions. Our theory accurately predicts not only test mean-squared-error but all first- and second-order statistics of the network's learned function. 
Furthermore, using a measure quantifying the \"learnability\" of a given target function, we prove a new \"no free lunch\" theorem characterizing a fundamental tradeoff in the inductive bias of wide neural networks: improving a network\u2019s generalization for a given target function must worsen its generalization for orthogonal functions. We further demonstrate the utility of our theory by analytically predicting two surprising phenomena --- worse-than-chance generalization on hard-to-learn functions and nonmonotonic error curves in the small data regime --- which we subsequently observe in experiments. Though our theory is derived for infinite-width architectures, we find it agrees with networks as narrow as width 20, suggesting it is predictive of generalization in practical neural networks.", "keywords": "deep learning;generalization;neural tangent kernel;kernel regression;inductive bias", "primary_area": "", "supplementary_material": "/attachment/9867f41d310d8a0d2925d7d9d3ace57bca595e81.zip", "author": "James B Simon;Madeline Dickens;Michael R DeWeese", "authorids": "~James_B_Simon1;dickens@berkeley.edu;~Michael_R_DeWeese1", "gender": "M;;M", "homepage": "https://james-simon.github.io/;;", "dblp": "294/5406;;", "google_scholar": "zjGfh3sAAAAJ;;DZ9-LmkAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~James_B_Simon1;dickens@berkeley.edu;~Michael_R_DeWeese1", "aff": "University of California, Berkeley;;University of California, Berkeley", "aff_domain": "berkeley.edu;;berkeley.edu", "position": "PhD student;;Associate Professor", "bibtex": "@misc{\nsimon2022neural,\ntitle={Neural tangent kernel eigenvalues accurately predict generalization},\nauthor={James B Simon and Madeline Dickens and Michael R DeWeese},\nyear={2022},\nurl={https://openreview.net/forum?id=lycl1GD7fVP}\n}", "github": "", "project": "", "reviewers": "gb7t;Mosm;7tiq;q2g8", "site": "https://openreview.net/forum?id=lycl1GD7fVP", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;3;2;5", "correctness": "2;3;3;3", "technical_novelty": "1;2;3;4", "empirical_novelty": "2;3;3;1", "wc_summary_paper": "76;21;116;224", "wc_summary_review": "63;112;75;193", "wc_main_review": "272;323;234;719", "wc_review": "411;456;425;1136", "wc_reply_reviewers": "804;0;0;0", "wc_reply_authors": "1240;886;813;1073", "reply_reviewers": "2;0;0;0", "reply_authors": "3;2;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 1.299038105676658 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 109.25, 74.34169422336298 ], "wc_summary_review_avg": [ 110.75, 50.80538849374149 ], "wc_main_review_avg": [ 387.0, 194.2639956348062 ], "wc_review_avg": [ 607.0, 305.85208843491654 ], "wc_reply_reviewers_avg": [ 201.0, 348.14221232134435 ], "wc_reply_authors_avg": [ 1003.0, 166.47672509993703 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.053376051268362375, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13730121206513998962&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC 
Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "lyzRAErG6Kv", "title": "Self-Supervised Structured Representations for Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent reinforcement learning (RL) methods have found extracting high-level features from raw pixels with self-supervised learning to be effective in learning policies. However, these methods focus on learning global representations of images, and disregard local spatial structures present in the consecutively stacked frames. In this paper, we propose a novel approach that learns self-supervised structured representations ($\\mathbf{S}^3$R) for effectively encoding such spatial structures in an unsupervised manner. Given the input frames, the structured latent volumes are first generated individually using an encoder, and they are used to capture the change in terms of spatial structures, i.e., flow maps among multiple frames. To be specific, the proposed method establishes flow vectors between two latent volumes via a supervision by the image reconstruction loss. This enables for providing plenty of local samples for training the encoder of deep RL. We further attempt to leverage the structured representations in the self-predictive representations (SPR) method that predicts future representations using the action-conditioned transition model. The proposed method imposes similarity constraints on the three latent volumes; warped query representations by estimated flows, predicted target representations from the transition model, and target representations of future state. Experimental results on complex tasks in Atari Games and DeepMind Control Suite demonstrate that the RL methods are significantly boosted by the proposed self-supervised learning of structured representations.\nThe code is available at https://sites.google.com/view/iclr2022-s3r.", "keywords": "Reinforcement Learning;Representation Learning;Optical Flow Estimation;Structured representation;Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Hyesong Choi;Hunsang Lee;Wonil Song;Sangryul Jeon;Kwanghoon Sohn;Dongbo Min", "authorids": "~Hyesong_Choi1;~Hunsang_Lee1;~Wonil_Song1;~Sangryul_Jeon1;~Kwanghoon_Sohn2;~Dongbo_Min3", "gender": "F;M;M;;M;M", "homepage": ";;http://diml.yonsei.ac.kr/;https://sr-jeon.github.io/;https://diml.yonsei.ac.kr;http://cvl.ewha.ac.kr", "dblp": "275/3868;258/7814;;195/6099;21/2373;44/1149", "google_scholar": "Ll3vLUsAAAAJ;Ol5YZf4AAAAJ;;MIO6n6AAAAAJ;zEtk0QsAAAAJ;3REUPXYAAAAJ", "orcid": "0000-0003-4440-0164;0000-0002-6670-5455;;;;", "linkedin": ";;;;;", "or_profile": "~Hyesong_Choi1;~Hunsang_Lee1;~Wonil_Song1;~Sangryul_Jeon1;~Kwanghoon_Sohn2;~Dongbo_Min3", "aff": "Ewha Womans University;Yonsei University;Yonsei Univ.;Yonsei Univ.;Yonsei University;Ewha Womans University", "aff_domain": "ewha.ac.kr;yonsei.ac.kr;yonsei.ac.kr;ee.yonsei.ac.kr;yonsei.ac.kr;ewha.ac.kr", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@misc{\nchoi2022selfsupervised,\ntitle={Self-Supervised Structured Representations for Deep Reinforcement Learning},\nauthor={Hyesong Choi and Hunsang Lee and Wonil Song and Sangryul Jeon and Kwanghoon Sohn and Dongbo Min},\nyear={2022},\nurl={https://openreview.net/forum?id=lyzRAErG6Kv}\n}", "github": "", "project": "", "reviewers": "Wtop;aHc1;pMqF;iodY", "site": 
"https://openreview.net/forum?id=lyzRAErG6Kv", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "2;4;3;2", "correctness": "4;3;4;4", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "115;133;69;66", "wc_summary_review": "44;45;58;27", "wc_main_review": "376;231;111;216", "wc_review": "535;409;238;309", "wc_reply_reviewers": "0;174;0;0", "wc_reply_authors": "1779;1391;1152;734", "reply_reviewers": "0;1;0;0", "reply_authors": "4;3;2;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 95.75, 28.977361853695378 ], "wc_summary_review_avg": [ 43.5, 11.01135777277262 ], "wc_main_review_avg": [ 233.5, 94.37293044088437 ], "wc_review_avg": [ 372.75, 111.64760409431095 ], "wc_reply_reviewers_avg": [ 43.5, 75.34421012924616 ], "wc_reply_authors_avg": [ 1264.0, 379.0771689247455 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4923659639173309, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17252338828429699563&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Ewha Womans University;Yonsei University", "aff_unique_dep": ";", "aff_unique_url": "http://www.ewha.ac.kr;https://www.yonsei.ac.kr", "aff_unique_abbr": "Ewha;Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "lzg1FIdbPht", "title": "Denoised Internal Models: a Brain-Inspired Autoencoder against Adversarial Attacks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite its great success, deep learning severely suffers from robustness; that is, deep neural networks are very vulnerable to adversarial attacks, even the simplest ones. Inspired by recent advances in brain science, we propose the Denoised Internal Models (DIM), a novel generative autoencoder-based model to tackle this challenge. Simulating the pipeline in the human brain for visual signal processing, DIM adopts a two-stage approach. In the first stage, DIM uses a denoiser to reduce the noise and the dimensions of inputs, reflecting the information pre-processing in the thalamus. Inspired from the sparse coding of memory-related traces in the primary visual cortex, the second stage produces a set of internal models, one for each category. 
We evaluate DIM over 42 adversarial attacks, showing that DIM effectively defends against all the attacks and outperforms the SOTA in overall robustness.", "keywords": "adversarial attacks;robustness of neural network;autoencoder;visual cortex;engram cell", "primary_area": "", "supplementary_material": "/attachment/695f648d295d54d9c270fb7ce0c523b50da674b5.zip", "author": "Kaiyuan Liu;Xingyu Li;Yi Zhou;Jisong Guan;Yurui Lai;Hang Su;Ge Zhang;Jiachen Wang;Chunxu Guo", "authorids": "~Kaiyuan_Liu2;~Xingyu_Li2;yzhou@bsbii.cn;guanjs@shanghaitech.edu.cn;~Yurui_Lai1;~Hang_Su5;~Ge_Zhang4;~Jiachen_Wang1;~Chunxu_Guo1", "gender": "M;M;;;M;;M;;M", "homepage": "https://webvpn.tsinghua.edu.cn/http/77726476706e69737468656265737421f9f9479369247b59700f81b9991b2631506205de/render.userLayoutRootNode.uP;https://users.soe.ucsc.edu/~xli279/;;;https://github.com/laiyurui;https://me.suhang.tech/;;https://peppacat.github.io/;https://openreview.net/profile/activate?token=36c2e8330e2755a41cc9242149a319784536a726792bcbb5c2ee8fbc1f2e6b11", "dblp": ";45/2385;;;307/3251;;;;", "google_scholar": ";;;;LKGkoLcAAAAJ;yQQlNIEAAAAJ;;;", "orcid": ";0000-0002-0043-316X;;;0009-0000-4402-3798;0000-0003-3365-4361;;;", "linkedin": ";xingyu-li-588814164/;;;;%E6%9D%AD-%E8%8B%8F-4a6a05150/;%E5%93%BF-%E5%BC%A0-407862198;;", "or_profile": "~Kaiyuan_Liu2;~Xingyu_Li2;yzhou@bsbii.cn;guanjs@shanghaitech.edu.cn;~Yurui_Lai1;~Hang_Su5;~Ge_Zhang4;~Jiachen_Wang1;~Chunxu_Guo1", "aff": "Tsinghua University;Shanghai Center for Brain Science and Brain-Inspired Technology;;;ShanghaiTech University;ShanghaiTech University;;Shanghaitech University;ShanghaiTech University", "aff_domain": "tsinghua.edu.cn;bsbii.cn;;;shanghaitech.edu.cn;shanghaitech.edu.cn;;shanghaitech.edu.cn;shanghaitech.edu.cn", "position": "PhD student;Postdoc;;;MS student;PhD student;;Undergrad student;Undergrad student", "bibtex": "@misc{\nliu2022denoised,\ntitle={Denoised Internal Models: a Brain-Inspired Autoencoder against Adversarial Attacks},\nauthor={Kaiyuan Liu and Xingyu Li and Yi Zhou and Jisong Guan and Yurui Lai and Hang Su and Ge Zhang and Jiachen Wang and Chunxu Guo},\nyear={2022},\nurl={https://openreview.net/forum?id=lzg1FIdbPht}\n}", "github": "", "project": "", "reviewers": "czHs;yQ5L;kAGG;8M45", "site": "https://openreview.net/forum?id=lzg1FIdbPht", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;5;4", "correctness": "2;2;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "61;100;65;36", "wc_summary_review": "39;73;88;40", "wc_main_review": "194;219;231;401", "wc_review": "294;392;384;477", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.5, 22.808989455914087 ], "wc_summary_review_avg": [ 60.0, 21.17781858454737 ], "wc_main_review_avg": [ 261.25, 81.78133955860591 ], "wc_review_avg": [ 386.75, 64.77412677913921 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 5, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4682415831773580508&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;2;2;2", "aff_unique_norm": "Tsinghua University;Shanghai Center for Brain Science and Brain-Inspired Technology;ShanghaiTech University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "THU;;ShanghaiTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Distribution Compression in Near-Linear Time", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7193", "id": "lzupY5zjaU9", "poster": "", "openreview": "https://openreview.net/forum?id=lzupY5zjaU9", "slides": "https://iclr.cc/virtual/2022/poster/7193", "video": "https://iclr.cc/virtual/2022/poster/7193", "author_site": "Abhishek Shetty, Raaz Dwivedi, Lester Mackey", "tldr": "", "abstract": "In distribution compression, one aims to accurately summarize a probability distribution $\\mathbb{P}$ using a small number of representative points. Near-optimal thinning procedures achieve this goal by sampling $n$ points from a Markov chain and identifying $\\sqrt{n}$ points with $\\widetilde{\\mathcal{O}}(1/\\sqrt{n})$ discrepancy to $\\mathbb{P}$. Unfortunately, these algorithms suffer from quadratic or super-quadratic runtime in the sample size $n$. To address this deficiency, we introduce Compress++, a simple meta-procedure for speeding up any thinning algorithm while suffering at most a factor of $4$ in error. When combined with the quadratic-time kernel halving and kernel thinning algorithms of Dwivedi and Mackey (2021), Compress++ delivers $\\sqrt{n}$ points with $\\mathcal{O}(\\sqrt{\\log n/n})$ integration error and better-than-Monte-Carlo maximum mean discrepancy in $\\mathcal{O}(n \\log^3 n)$ time and $\\mathcal{O}( \\sqrt{n} \\log^2 n )$ space. Moreover, Compress++ enjoys the same near-linear runtime given any quadratic-time input and reduces the runtime of super-quadratic algorithms by a square-root factor. In our benchmarks with high-dimensional Monte Carlo samples and Markov chains targeting challenging differential equation posteriors, Compress++ matches or nearly matches the accuracy of its input algorithm in orders of magnitude less time.", "keywords": "Distribution compression;linear time;thinning;i.i.d. 
sampling;Markov chain Monte Carlo;maximum mean discrepancy;reproducing kernel Hilbert space", "primary_area": "", "supplementary_material": "/attachment/d8d1db0c5d670ef21d0b2c6f8f991f806c77a127.zip", "author": "Abhishek Shetty;Raaz Dwivedi;Lester Mackey", "authorids": "~Abhishek_Shetty1;~Raaz_Dwivedi1;~Lester_Mackey1", "gender": "M;M;M", "homepage": "https://ashettyv.github.io/;https://raazdwivedi.github.io/;https://stanford.edu/~lmackey", "dblp": "223/4770;180/9006;05/2961", "google_scholar": "https://scholar.google.co.in/citations?user=M-y2aLUAAAAJ;9ehX_58AAAAJ;erv7TP0AAAAJ", "orcid": ";;0000-0002-1102-0387", "linkedin": ";raaz-dwivedi;lester-mackey-5902909", "or_profile": "~Abhishek_Shetty1;~Raaz_Dwivedi1;~Lester_Mackey1", "aff": "University of California, Berkeley;Massachusetts Institute of Technology;Microsoft Research New England", "aff_domain": "berkeley.edu;mit.edu;microsoft.com", "position": "PhD student;Postdoc;Principal Researcher", "bibtex": "@inproceedings{\nshetty2022distribution,\ntitle={Distribution Compression in Near-Linear Time},\nauthor={Abhishek Shetty and Raaz Dwivedi and Lester Mackey},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=lzupY5zjaU9}\n}", "github": "", "project": "", "reviewers": "hLMP;FqWv;aMbB", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;3;3", "correctness": "4;3;4", "technical_novelty": "2;3;2", "empirical_novelty": "2;4;4", "wc_summary_paper": "164;113;109", "wc_summary_review": "29;14;18", "wc_main_review": "73;416;79", "wc_review": "266;543;206", "wc_reply_reviewers": "6;18;0", "wc_reply_authors": "375;672;296", "reply_reviewers": "1;1;0", "reply_authors": "3;5;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 128.66666666666666, 25.037749277618563 ], "wc_summary_review_avg": [ 20.333333333333332, 6.342099196813483 ], "wc_main_review_avg": [ 189.33333333333334, 160.29625350858606 ], "wc_review_avg": [ 338.3333333333333, 146.77950205060046 ], "wc_reply_reviewers_avg": [ 8.0, 7.483314773547883 ], "wc_reply_authors_avg": [ 447.6666666666667, 161.87306412398848 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10525101075406225014&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=lzupY5zjaU9", "email": "berkeley.edu;mit.edu;microsoft.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-new-england", "aff_unique_abbr": "UC Berkeley;MIT;MSR NE", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Berkeley;;New England", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "m22XrToDacC", "title": "Distributionally Robust Recourse Action", 
"track": "main", "status": "Reject", "tldr": "", "abstract": "Recourse actions, also known as counterfactual explanations, aim to explain a particular algorithmic decision by showing one or multiple ways in which the instance could be modified to receive an alternate outcome. Existing recourse recommendations often assume that the machine learning models do not change over time. However, this assumption does not always hold in practice because of data distribution shifts, and in this case, the recourse actions may become invalid. To redress this shortcoming, we propose the Distributionally Robust Recourse Action framework, which generates a recourse action that has high probability of being valid under a mixture of model shifts. We show that the robust recourse can be found efficiently using a projected gradient descent algorithm and we discuss several extensions of our framework. Numerical experiments with both synthetic and real-world datasets demonstrate the benefits of our proposed framework.", "keywords": "Algorithmic recourse;Robust optimization", "primary_area": "", "supplementary_material": "/attachment/c36bc77e8c3c97b5739bb8fd2519d595d44bd61c.zip", "author": "Duy Nguyen;Ngoc Bui;Viet Anh Nguyen", "authorids": "~Duy_Nguyen2;~Ngoc_Bui1;~Viet_Anh_Nguyen2", "gender": "M;M;M", "homepage": "https://duykhuongnguyen.github.io/;http://ngocbh.github.io;http://www.vietanhnguyen.net", "dblp": ";312/6811;", "google_scholar": "y323M_cAAAAJ;;3iyf-EoAAAAJ", "orcid": ";;", "linkedin": "duy-nguyen-89272a17b/;;", "or_profile": "~Duy_Nguyen2;~Ngoc_Bui1;~Viet_Anh_Nguyen2", "aff": "Hanoi University of Science and Technology;Hanoi University of Science and Technology;VinAI Research, Vietnam", "aff_domain": "hust.edu.vn;hust.edu.vn;vinai.io", "position": "Undergrad student;MS student;Research Scientist", "bibtex": "@misc{\nnguyen2022distributionally,\ntitle={Distributionally Robust Recourse Action},\nauthor={Duy Nguyen and Ngoc Bui and Viet Anh Nguyen},\nyear={2022},\nurl={https://openreview.net/forum?id=m22XrToDacC}\n}", "github": "", "project": "", "reviewers": "a91J;4adh;RFkz", "site": "https://openreview.net/forum?id=m22XrToDacC", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;3;3", "correctness": "3;3;4", "technical_novelty": "3;3;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "61;91;113", "wc_summary_review": "56;73;59", "wc_main_review": "824;213;189", "wc_review": "941;377;361", "wc_reply_reviewers": "0;37;0", "wc_reply_authors": "2300;1310;733", "reply_reviewers": "0;1;0", "reply_authors": "4;4;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 88.33333333333333, 21.312489817527705 ], "wc_summary_review_avg": [ 62.666666666666664, 7.408703590297622 ], "wc_main_review_avg": [ 408.6666666666667, 293.8484114112051 ], "wc_review_avg": [ 559.6666666666666, 269.72249113157443 ], "wc_reply_reviewers_avg": [ 12.333333333333334, 17.441967269268172 ], "wc_reply_authors_avg": [ 1447.6666666666667, 647.08903732193 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.49999999999999983, 
"gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4373094515805631486&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Hanoi University of Science and Technology;VinAI Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.hust.edu.vn;https://www.vin.ai", "aff_unique_abbr": "HUST;VinAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hanoi;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Vietnam" }, { "id": "m2MiIwuI0m", "title": "An Analysis of Attentive Walk-Aggregating Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph neural networks (GNNs) have been shown to possess strong representation power, which can be exploited for downstream prediction tasks on graph-structured data, such as molecules and social networks. They typically learn representations by aggregating information from the K-hop neighborhood of individual vertices or from the enumerated walks in the graph. Prior studies have demonstrated the effectiveness of incorporating weighting schemes into GNNs; however, this has been primarily limited to K-hop neighborhood GNNs so far. In this paper, we aim to extensively analyze the effect of incorporating weighting schemes into walk-aggregating GNNs. Towards this objective, we propose a novel GNN model, called AWARE, that aggregates information about the walks in the graph using attention schemes in a principled way to obtain an end-to-end supervised learning method for graph-level prediction tasks. We perform theoretical, empirical, and interpretability analyses of AWARE. Our theoretical analysis provides the first provable guarantees for weighted GNNs, demonstrating how the graph information is encoded in the representation, and how the weighting schemes in AWARE affect the representation and learning performance. We empirically demonstrate the superiority of AWARE over prior baselines in the domains of molecular property prediction (61 tasks) and social networks (4 tasks). 
Our interpretation study illustrates that AWARE can successfully learn to capture the important substructures of the input graph.", "keywords": "graph neural network;graph representation learning;attention weighting;walk aggregation;representation power;learning guarantees;interpretability", "primary_area": "", "supplementary_material": "/attachment/98feadcddee254539a2a33123bdd4d4068247b8f.zip", "author": "Mehmet F Demirel;Shengchao Liu;Siddhant Garg;Yingyu Liang", "authorids": "~Mehmet_F_Demirel1;~Shengchao_Liu1;~Siddhant_Garg2;~Yingyu_Liang1", "gender": ";M;M;", "homepage": ";https://chao1224.github.io/;https://sid7954.github.io/;", "dblp": ";;82/8467;", "google_scholar": ";F1ws3XUAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";0000-0003-2030-2367;;", "linkedin": ";;siddhant-garg-898253a6/;", "or_profile": "~Mehmet_F_Demirel1;~Shengchao_Liu1;~Siddhant_Garg2;~Yingyu_Liang1", "aff": ";MILA-UdeM;Amazon;", "aff_domain": ";mila.quebec;amazon.com;", "position": ";PhD student;Researcher;", "bibtex": "@misc{\ndemirel2022an,\ntitle={An Analysis of Attentive Walk-Aggregating Graph Neural Networks},\nauthor={Mehmet F Demirel and Shengchao Liu and Siddhant Garg and Yingyu Liang},\nyear={2022},\nurl={https://openreview.net/forum?id=m2MiIwuI0m}\n}", "github": "", "project": "", "reviewers": "EPbJ;XtiV;6wFo;KdaG", "site": "https://openreview.net/forum?id=m2MiIwuI0m", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "57;28;85;42", "wc_summary_review": "53;23;63;79", "wc_main_review": "433;260;379;369", "wc_review": "543;311;527;490", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 53.0, 21.130546609115438 ], "wc_summary_review_avg": [ 54.5, 20.414455662593603 ], "wc_main_review_avg": [ 360.25, 62.79082337412052 ], "wc_review_avg": [ 467.75, 92.51857921520413 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6642574294744789142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Mila;Amazon", "aff_unique_dep": "Montreal Institute for Learning Algorithms;Amazon.com, Inc.", "aff_unique_url": "https://mila.quebec;https://www.amazon.com", "aff_unique_abbr": "MILA;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Canada;United States" }, { "id": "m4BAEB_Imy", "title": "iPrune: A Magnitude Based Unstructured Pruning Method for Efficient Binary Networks in Hardware", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern image recognition models span millions of parameters occupying several megabytes and sometimes gigabytes of space, making it difficult to run on resource constrained edge hardware. Binary Neural Networks address this problem by reducing the memory requirements (one single bit per weight and/or activation). 
The computation requirement and power consumption are also reduced accordingly. Nevertheless, each neuron in such networks has a large number of inputs, making it difficult to implement them efficiently in binary hardware accelerators, especially LUT-based approaches.\n\nIn this work, we present a pruning algorithm and associated results on convolutional and dense layers from aforementioned binary networks. We reduce the computation by 4-70x and the memory by 190-2200x with less than 2% loss of accuracy on MNIST and less than 3% loss of accuracy on CIFAR-10 compared to full precision, fully connected equivalents. Compared to very recent work on pruning for binary networks, we still have a gain of 1% on the precision and up to 30% reduction in memory (526KiB vs 750KiB).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adithya Venkateswaran;Jean-Pierre David", "authorids": "~Adithya_Venkateswaran1;~Jean-Pierre_David1", "gender": ";", "homepage": ";https://www.polymtl.ca/expertises/en/david-jean-pierre", "dblp": ";81/170", "google_scholar": ";https://scholar.google.fr/citations?user=yVubPz4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Adithya_Venkateswaran1;~Jean-Pierre_David1", "aff": ";\u00c9cole Polytechnique de Montr\u00e9al, Universit\u00e9 de Montr\u00e9al", "aff_domain": ";polymtl.ca", "position": ";Full Professor", "bibtex": "@misc{\nvenkateswaran2022iprune,\ntitle={iPrune: A Magnitude Based Unstructured Pruning Method for Efficient Binary Networks in Hardware},\nauthor={Adithya Venkateswaran and Jean-Pierre David},\nyear={2022},\nurl={https://openreview.net/forum?id=m4BAEB_Imy}\n}", "github": "", "project": "", "reviewers": "TbnU;r65C;1172;MtDD", "site": "https://openreview.net/forum?id=m4BAEB_Imy", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;5;4;3", "correctness": "2;4;2;2", "technical_novelty": "1;1;2;2", "empirical_novelty": "0;1;0;1", "wc_summary_paper": "16;64;27;33", "wc_summary_review": "27;43;50;14", "wc_main_review": "99;105;141;222", "wc_review": "142;212;218;269", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 0.5, 0.5 ], "wc_summary_paper_avg": [ 35.0, 17.81852968120546 ], "wc_summary_review_avg": [ 33.5, 14.0089257261219 ], "wc_main_review_avg": [ 141.75, 49.037613114832574 ], "wc_review_avg": [ 210.25, 45.20163160771965 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DWZcfZabw_UJ:scholar.google.com/&scioq=iPrune:+A+Magnitude+Based+Unstructured+Pruning+Method+for+Efficient+Binary+Networks+in+Hardware&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "\u00c9cole Polytechnique de Montr\u00e9al", "aff_unique_dep": "", "aff_unique_url": "https://www.polymtl.ca", "aff_unique_abbr": "Polytechnique Montr\u00e9al", "aff_campus_unique_index": "0", "aff_campus_unique": "Montr\u00e9al", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "m5EBN92vjN", "title": "AASEG: ATTENTION AWARE NETWORK FOR REAL TIME 
SEMANTIC SEGMENTATION", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we present a new network named Attention Aware Network (AASeg)\nfor real time semantic image segmentation. Our network incorporates spatial and\nchannel information using Spatial Attention (SA) and Channel Attention (CA)\nmodules respectively. It also uses dense local multi-scale context information\nusing Multi Scale Context (MSC) module. The feature maps are concatenated\nindividually to produce the final segmentation map. We demonstrate the effectiveness of our method using a comprehensive analysis, quantitative experimental\nresults and ablation study using Cityscapes, ADE20K and Camvid datasets. Our\nnetwork performs better than most previous architectures with a 74.4% Mean IOU\non Cityscapes test dataset while running at 202.7 FPS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhinav Sagar", "authorids": "~Abhinav_Sagar1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "5ntkLcgAAAAJ", "orcid": "", "linkedin": "https://linkedin.com/in/abhinavsagar4", "or_profile": "~Abhinav_Sagar1", "aff": "University of Maryland, College Park", "aff_domain": "umd.edu", "position": "MS student", "bibtex": "@misc{\nsagar2022aaseg,\ntitle={{AASEG}: {ATTENTION} {AWARE} {NETWORK} {FOR} {REAL} {TIME} {SEMANTIC} {SEGMENTATION}},\nauthor={Abhinav Sagar},\nyear={2022},\nurl={https://openreview.net/forum?id=m5EBN92vjN}\n}", "github": "", "project": "", "reviewers": "og3N;dXYM;kwfp;2HPE", "site": "https://openreview.net/forum?id=m5EBN92vjN", "pdf_size": 0, "recommendation": "1;1;1;3", "confidence": "5;4;5;4", "correctness": "2;2;2;3", "technical_novelty": "1;1;1;2", "empirical_novelty": "2;1;1;2", "wc_summary_paper": "41;20;84;73", "wc_summary_review": "21;11;25;54", "wc_main_review": "210;127;496;338", "wc_review": "272;158;605;465", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 1.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 54.5, 25.42144763777232 ], "wc_summary_review_avg": [ 27.75, 15.990231392947383 ], "wc_main_review_avg": [ 292.75, 139.3545388568309 ], "wc_review_avg": [ 375.0, 172.26288050534856 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14779700645559112843&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "m716e-0clj", "title": "Communicate Then Adapt: An Effective Decentralized Adaptive Method for Deep Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Decentralized adaptive gradient methods, in which each node averages only with its neighbors, are critical to save communication and wall-clock training time in deep learning tasks. 
While different in concrete recursions, existing decentralized adaptive methods share the same algorithm structure: each node scales its gradient with information of the past squared gradients (which is referred to as the adaptive step) before or while it communicates with neighbors. In this paper, we identify the limitation of such adapt-then/while-communicate structure: it will make the developed algorithms highly sensitive to heterogeneous data distributions, and hence deviate their limiting points from the stationary solution. To overcome this limitation, we propose an effective decentralized adaptive method with a communicate-then-adapt structure, in which each node conducts the adaptive step after finishing the neighborhood communications. The new method is theoretically guaranteed to approach to the stationary solution in the non-convex scenario. Experimental results on a variety of CV/NLP tasks show that our method has a clear superiority to other existing decentralized adaptive methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bicheng Ying;Kun Yuan;Yiming Chen;Hanbin Hu;Yingya Zhang;Pan Pan;Wotao Yin", "authorids": "~Bicheng_Ying2;~Kun_Yuan4;~Yiming_Chen1;~Hanbin_Hu2;~Yingya_Zhang3;~Pan_Pan1;~Wotao_Yin1", "gender": "M;;M;M;M;M;M", "homepage": ";;;;;;http://wotaoyin.com", "dblp": ";;;158/7413;142/2510;;76/2265", "google_scholar": "xY7a1YwAAAAJ;;LxiMyjQAAAAJ;v26nby4AAAAJ;16RDSEUAAAAJ;;kpQGGFUAAAAJ", "orcid": "0000-0002-5246-2982;;;0000-0003-4223-5898;;;0000-0001-6697-9731", "linkedin": ";;yiming-chen/;hanbinhu/;;;", "or_profile": "~Bicheng_Ying2;~Kun_Yuan4;~Yiming_Chen1;~Hanbin_Hu2;~Yingya_Zhang3;~Pan_Pan1;~Wotao_Yin1", "aff": "Google;;Alibaba Group;Google;Alibaba Group;Alibaba Group;Alibaba Group US", "aff_domain": "google.com;;alibaba-inc.com;google.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;;Researcher;Software Engineer;Researcher;Senior Staff Algorithm Engineer;Principal Researcher", "bibtex": "@misc{\nying2022communicate,\ntitle={Communicate Then Adapt: An Effective Decentralized Adaptive Method for Deep Training},\nauthor={Bicheng Ying and Kun Yuan and Yiming Chen and Hanbin Hu and Yingya Zhang and Pan Pan and Wotao Yin},\nyear={2022},\nurl={https://openreview.net/forum?id=m716e-0clj}\n}", "github": "", "project": "", "reviewers": "XYfg;RKoY;1RX3;i1gD", "site": "https://openreview.net/forum?id=m716e-0clj", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "5;4;4;3", "correctness": "3;2;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "36;47;44;16", "wc_summary_review": "18;19;160;108", "wc_main_review": "178;306;706;184", "wc_review": "232;372;910;308", "wc_reply_reviewers": "51;0;0;0", "wc_reply_authors": "1905;1526;1762;256", "reply_reviewers": "1;0;0;0", "reply_authors": "4;2;3;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 35.75, 12.090802289343747 ], "wc_summary_review_avg": [ 76.25, 60.606827173182396 ], "wc_main_review_avg": [ 343.5, 215.43154365134183 ], "wc_review_avg": [ 455.5, 267.0444719517706 ], "wc_reply_reviewers_avg": [ 12.75, 22.083647796503186 ], "wc_reply_authors_avg": [ 1362.25, 652.8745572466429 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 
17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9901475429766743, "corr_recommendation_correctness": 0.5488604301969737, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1tznlvCuo4kJ:scholar.google.com/&scioq=Communicate+Then+Adapt:+An+Effective+Decentralized+Adaptive+Method+for+Deep+Training&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;1;1;1", "aff_unique_norm": "Google;Alibaba Group", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.alibaba.com", "aff_unique_abbr": "Google;Alibaba", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;1;1;0", "aff_country_unique": "United States;China" }, { "id": "m7S4NvprHVl", "title": "Demystifying Hyperparameter Optimization in Federated Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated Learning (FL) is a new machine learning paradigm that enables training models collaboratively across clients without sharing private data. In FL, data is non-uniformly distributed among clients (i.e., data heterogeneity) and cannot be balanced nor monitored like in conventional ML. Such data heterogeneity and privacy requirements bring unique challenges for learning hyperparameter optimization as the training dynamics change across clients even within the same training round and they are difficult to measure due to privacy constraints. State-of-the-art frameworks in FL focus on developing better aggregation algorithms and policies with the aim of mitigating these challenges. However, almost all existing FL systems adopt a ``global'' tuning method that uses a single set of learning hyperparameters across all the clients, regardless of their underlying data distributions. Our study shows that such a widely adopted global tuning method is not suitable for FL due to its data heterogeneity-oblivious nature. We demonstrate that the data quantity and distribution of the clients have a significant impact on the choice of hyperparameters, making it necessary to have customized tuning for each client. Based on these observations, we propose a first of its kind heterogeneity-aware hyperparameter optimization methodology, FedTune, that adopts a proxy data based hyperparameter customization approach to address the privacy and tuning cost challenges. Together with a Bayesian strengthened tuner, the proposed customized tuning approach is effective, lightweight, and privacy preserving. 
Extensive evaluation demonstrates that FedTune can achieve up to 7/4/4/6% better accuracy than the widely adopted globally tuned method for popular FL benchmarks FEMNIST, Cifar100, Cifar10, and Fashion-MNIST respectively.", "keywords": "Federated Learning;Data Heterogeneity;Hyperparameter Optimization", "primary_area": "", "supplementary_material": "/attachment/607b769b45059c1502621803194393caabc4cd87.zip", "author": "Syed Zawad;Jun Yi;Minjia Zhang;Cheng Li;Feng Yan;Yuxiong He", "authorids": "~Syed_Zawad1;~Jun_Yi2;~Minjia_Zhang1;~Cheng_Li10;~Feng_Yan2;~Yuxiong_He1", "gender": "M;F;;;M;M", "homepage": "https://minjiazhang.github.io/;https://chengli.netlify.app/;http://www.cs.uh.edu/~fyan/;;https://www.cse.unr.edu/~jyi/;https://www.semanticscholar.org/author/Syed-Zawad/70272436", "dblp": "58/9033;;62/3960-1.html;https://dblp.org/pers/hd/h/He:Yuxiong;;", "google_scholar": "https://scholar.google.com/citations?hl=en;da9Vl6QAAAAJ;iLE0_VAAAAAJ;SB3_eb0AAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-8165-166X;;;;;", "linkedin": "minjia-zhang-05857226/;;;;jun-yi-272a1766/;", "or_profile": "~Minjia_Zhang1;~Cheng_Li10;~Feng_Yan2;~Yuxiong_He1;~JUN_YI1;~Syed_Zawad2", "aff": "Microsoft ;Microsoft;University of Nevada, Reno;Microsoft;University of Nevada, Reno, University of Nevada, Reno;University of Nevada, Reno, University of Nevada, Reno", "aff_domain": "microsoft.com;microsoft.com;unr.edu;microsoft.com;nevada.unr.edu;nevada.unr.edu", "position": "Principle Researcher;Researcher;Assistant Professor;Researcher;PhD student;PhD student", "bibtex": "@misc{\nzawad2022demystifying,\ntitle={Demystifying Hyperparameter Optimization in Federated Learning},\nauthor={Syed Zawad and Jun Yi and Minjia Zhang and Cheng Li and Feng Yan and Yuxiong He},\nyear={2022},\nurl={https://openreview.net/forum?id=m7S4NvprHVl}\n}", "github": "", "project": "", "reviewers": "SfpY;v1tD;MWne;Z3yt", "site": "https://openreview.net/forum?id=m7S4NvprHVl", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "78;91;82;88", "wc_summary_review": "56;35;656;26", "wc_main_review": "522;511;149;130", "wc_review": "656;637;887;244", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "965;663;1052;253", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 84.75, 5.0682837331783235 ], "wc_summary_review_avg": [ 193.25, 267.3905149776259 ], "wc_main_review_avg": [ 328.0, 188.65974663398654 ], "wc_review_avg": [ 606.0, 231.01190445516005 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 733.25, 312.6038827334043 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5443310539518174, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12201203202274684824&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;2;2", "aff_unique_norm": "Microsoft;University of Nevada;University of Nevada, Reno", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": 
"https://www.microsoft.com;https://www.unr.edu;https://www.unr.edu", "aff_unique_abbr": "Microsoft;UNR;UNR", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Reno", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "m7zsaLt1Sab", "title": "Finding One Missing Puzzle of Contextual Word Embedding: Representing Contexts as Manifold", "track": "main", "status": "Reject", "tldr": "", "abstract": "The current understanding of contextual word embedding interprets the representation by associating each token to a vector that is dynamically modulated by the context. However, this \u201ctoken-centric\u201d understanding does not explain how a model represents context itself, leading to a lack of characterization from such a perspective. In this work, to establish a rigorous definition of \u201ccontext representation\u201d, we formalize this intuition using a category theory framework, which indicates the necessity of including the information from both tokens and how transitions happen among different tokens in a given context. As a practical instantiation of our theoretical understanding, we also show how to leverage a manifold learning method to characterize how a representation model (i.e., BERT) encodes different contexts and how a representation of context changes when going through different components such as attention and FFN. We hope this novel theoretic perspective sheds light on the further improvements in Transformer-based language representation models.", "keywords": "Contextual Word Embedding;Category Theory;Manifold", "primary_area": "", "supplementary_material": "", "author": "Hailin Hu;Rong Yao;Cheng LI", "authorids": "~Hailin_Hu1;~Rong_Yao1;~Cheng_LI12", "gender": ";;M", "homepage": ";;", "dblp": "209/7188-2;;", "google_scholar": "rvYUgBwAAAAJ;https://scholar.google.com/citations?view_op=list_works;K162I_IAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hailin_Hu1;~Rong_Yao1;~Cheng_LI12", "aff": "Huawei Noah's Ark Lab;;Huawei Technologies Ltd.", "aff_domain": "huawei.com;;huawei.com", "position": "Researcher;;Researcher", "bibtex": "@misc{\nhu2022finding,\ntitle={Finding One Missing Puzzle of Contextual Word Embedding: Representing Contexts as Manifold},\nauthor={Hailin Hu and Rong Yao and Cheng LI},\nyear={2022},\nurl={https://openreview.net/forum?id=m7zsaLt1Sab}\n}", "github": "", "project": "", "reviewers": "awgJ;ptkS;K1ke;3Rbc;BMjD", "site": "https://openreview.net/forum?id=m7zsaLt1Sab", "pdf_size": 0, "recommendation": "1;3;3;3;3", "confidence": "3;2;4;3;2", "correctness": "3;2;2;2;3", "technical_novelty": "1;2;2;2;2", "empirical_novelty": "1;2;2;2;2", "wc_summary_paper": "183;61;115;131;118", "wc_summary_review": "41;71;45;115;47", "wc_main_review": "294;323;1015;1023;198", "wc_review": "518;455;1175;1269;363", "wc_reply_reviewers": "372;130;224;129;59", "wc_reply_authors": "155;102;288;193;409", "reply_reviewers": "1;1;1;1;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 2.6, 0.8000000000000002 ], "confidence_avg": [ 2.8, 0.7483314773547882 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 1.8, 0.4000000000000001 ], "empirical_novelty_avg": [ 1.8, 0.4000000000000001 ], "wc_summary_paper_avg": [ 121.6, 38.95433223660752 ], "wc_summary_review_avg": [ 63.8, 27.672368890284762 ], "wc_main_review_avg": [ 570.6, 368.4565646042963 ], "wc_review_avg": [ 756.0, 384.81787900252243 ], "wc_reply_reviewers_avg": [ 182.8, 108.16912683386143 ], "wc_reply_authors_avg": [ 229.4, 108.4261960966998 
], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.13363062095621214, "corr_recommendation_correctness": -0.6123724356957944, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:w7XhXarA2pUJ:scholar.google.com/&scioq=Finding+One+Missing+Puzzle+of+Contextual+Word+Embedding:+Representing+Contexts+as+Manifold&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Noah's Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Neural Solvers for Fast and Accurate Numerical Optimal Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6968", "id": "m8bypnj7Yl5", "poster": "", "openreview": "https://openreview.net/forum?id=m8bypnj7Yl5", "slides": "https://iclr.cc/virtual/2022/poster/6968", "video": "https://iclr.cc/virtual/2022/poster/6968", "author_site": "Federico Berto, Stefano Massaroli, Michael Poli, Jinkyoo Park", "tldr": "", "abstract": "Synthesizing optimal controllers for dynamical systems often involves solving optimization problems with hard real-time constraints. These constraints determine the class of numerical methods that can be applied: computationally expensive but accurate numerical routines are replaced by fast and inaccurate methods, trading inference time for solution accuracy. This paper provides techniques to improve the quality of optimized control policies given a fixed computational budget. We achieve the above via a hypersolvers approach, which hybridizes a differential equation solver and a neural network. 
The performance is evaluated in direct and receding-horizon optimal control tasks in both low and high dimensions, where the proposed approach shows consistent Pareto improvements in solution accuracy and control performance.", "keywords": "Deep Learning;Numerical Methods;Optimal Control", "primary_area": "", "supplementary_material": "/attachment/43f9be371d40c4433861b0220f08aa0ba007b3ab.zip", "author": "Federico Berto;Stefano Massaroli;Michael Poli;Jinkyoo Park", "authorids": "~Federico_Berto1;~Stefano_Massaroli1;~Michael_Poli1;~Jinkyoo_Park1", "gender": "M;;M;M", "homepage": "https://fedebotu.github.io/;;;http://silab.kaist.ac.kr/", "dblp": "317/1711;;;156/7535", "google_scholar": "https://scholar.google.com/citations?hl=en;IwCfl4UAAAAJ;RgIBwboAAAAJ;sH2a0nkAAAAJ", "orcid": "0000-0002-7438-8365;;;0000-0003-2620-1479", "linkedin": "federicoberto/;;;", "or_profile": "~Federico_Berto1;~Stefano_Massaroli1;~Michael_Poli1;~Jinkyoo_Park1", "aff": "Korea Advanced Institute of Science & Technology;The University of Tokyo;Stanford University;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;u-tokyo.ac.jp;stanford.edu;kaist.ac.kr", "position": "MS student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nberto2022neural,\ntitle={Neural Solvers for Fast and Accurate Numerical Optimal Control},\nauthor={Federico Berto and Stefano Massaroli and Michael Poli and Jinkyoo Park},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=m8bypnj7Yl5}\n}", "github": "", "project": "", "reviewers": "J9x8;jwFx;RBHY", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "4;4;4", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;4", "wc_summary_paper": "52;90;164", "wc_summary_review": "33;61;68", "wc_main_review": "166;409;210", "wc_review": "251;560;442", "wc_reply_reviewers": "219;0;24", "wc_reply_authors": "913;982;708", "reply_reviewers": "1;0;1", "reply_authors": "2;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 102.0, 46.50448007092076 ], "wc_summary_review_avg": [ 54.0, 15.121728296285006 ], "wc_main_review_avg": [ 261.6666666666667, 105.71765331401478 ], "wc_review_avg": [ 417.6666666666667, 127.31675284720215 ], "wc_reply_reviewers_avg": [ 81.0, 98.07140255956371 ], "wc_reply_authors_avg": [ 867.6666666666666, 116.36246053698856 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3860528662060774857&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=m8bypnj7Yl5", "email": "kaist.ac.kr;u-tokyo.ac.jp;stanford.edu;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Tokyo;Stanford University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.u-tokyo.ac.jp;https://www.stanford.edu", "aff_unique_abbr": 
"KAIST;UTokyo;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "South Korea;Japan;United States" }, { "title": "Creating Training Sets via Weak Indirect Supervision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6191", "id": "m8uJvVgwRci", "poster": "", "openreview": "https://openreview.net/forum?id=m8uJvVgwRci", "slides": "https://iclr.cc/virtual/2022/poster/6191", "video": "https://iclr.cc/virtual/2022/poster/6191", "author_site": "Jieyu Zhang, Bohan Wang, Xiangchen Song, Yujing Wang, Yaming Yang, Jing Bai, Alex Ratner", "tldr": "", "abstract": "Creating labeled training sets has become one of the major roadblocks in machine learning. To address this, recent Weak Supervision (WS) frameworks synthesize training labels from multiple potentially noisy supervision sources. However, existing frameworks are restricted to supervision sources that share the same output space as the target task. To extend the scope of usable sources, we formulate Weak Indirect Supervision (WIS), a new research problem for automatically synthesizing training labels based on indirect supervision sources that have different output label spaces. To overcome the challenge of mismatched output spaces, we develop a probabilistic modeling approach, PLRM, which uses user-provided label relations to model and leverage indirect supervision sources. Moreover, we provide a theoretically-principled test of the distinguishability of PLRM for unseen labels, along with an generalization bound. On both image and text classification tasks as well as an industrial advertising application, we demonstrate the advantages of PLRM by outperforming baselines by a margin of 2%-9%.", "keywords": "weak supervision;data programming;training label synthesis", "primary_area": "", "supplementary_material": "", "author": "Jieyu Zhang;Bohan Wang;Xiangchen Song;Yujing Wang;Yaming Yang;Jing Bai;Alexander Ratner", "authorids": "~Jieyu_Zhang1;~Bohan_Wang1;~Xiangchen_Song1;~Yujing_Wang1;~Yaming_Yang1;~Jing_Bai3;~Alexander_Ratner1", "gender": "M;M;M;F;M;;M", "homepage": "https://jieyuz2.github.io/;https://bhwangfy.github.io/;https://xiangchensong.github.io/;;;;https://ajratner.github.io/", "dblp": ";202/1184;261/9024;16/4075;204/3789-1.html;;180/5513", "google_scholar": "T_INUHUAAAAJ;LfkHCEUAAAAJ;foR8BIoAAAAJ;https://scholar.google.com/citations?hl=en;;;rfwwtFYAAAAJ", "orcid": "0000-0002-1846-2436;;;;;;", "linkedin": "jieyu-zhang-3baaa8154/;;;;;;alexander-ratner-038ba239/", "or_profile": "~Jieyu_Zhang1;~Bohan_Wang1;~Xiangchen_Song1;~Yujing_Wang1;~Yaming_Yang1;~Jing_Bai3;~Alexander_Ratner1", "aff": "University of Washington;Microsoft Research Asia, University of Science and Technology of China;Carnegie Mellon University;Microsoft Research Asia;Peking University;;Department of Computer Science, University of Washington", "aff_domain": "cs.washington.edu;ustc.edu.cn;cmu.edu;microsoft.com;pku.edu.cn;;cs.washington.edu", "position": "MS student;PhD student;PhD student;Researcher;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nzhang2022creating,\ntitle={Creating Training Sets via Weak Indirect Supervision},\nauthor={Jieyu Zhang and Bohan Wang and Xiangchen Song and Yujing Wang and Yaming Yang and Jing Bai and Alexander Ratner},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=m8uJvVgwRci}\n}", "github": "", "project": "", "reviewers": 
"Zxiq;EDJu;ZVDL;Vocb", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;2;4", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "49;144;67;200", "wc_summary_review": "89;92;39;37", "wc_main_review": "227;912;303;190", "wc_review": "365;1148;409;427", "wc_reply_reviewers": "37;0;45;0", "wc_reply_authors": "251;1153;425;98", "reply_reviewers": "1;0;1;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 115.0, 60.67536567668958 ], "wc_summary_review_avg": [ 64.25, 26.280934153869037 ], "wc_main_review_avg": [ 408.0, 293.82222516344814 ], "wc_review_avg": [ 587.25, 324.5338002427482 ], "wc_reply_reviewers_avg": [ 20.5, 20.694202086574876 ], "wc_reply_authors_avg": [ 481.75, 404.44614907302554 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5297639812737287173&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=m8uJvVgwRci", "email": "cs.washington.edu;ustc.edu.cn;cmu.edu;microsoft.com;pku.edu.cn;;cs.washington.edu", "author_num": 7, "aff_unique_index": "0;1;2;1;3;0", "aff_unique_norm": "University of Washington;Microsoft;Carnegie Mellon University;Peking University", "aff_unique_dep": ";Research;;", "aff_unique_url": "https://www.washington.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-asia;https://www.cmu.edu;http://www.pku.edu.cn", "aff_unique_abbr": "UW;MSRA;CMU;Peking U", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Asia;Seattle", "aff_country_unique_index": "0;1;0;1;1;0", "aff_country_unique": "United States;China" }, { "id": "mF122BuAnnW", "title": "Localized Randomized Smoothing for Collective Robustness Certification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Models for image segmentation, node classification and many other tasks map a single input to multiple labels. By perturbing this single shared input (e.g. the image) an adversary can manipulate several predictions (e.g. misclassify several pixels). A recent collective robustness certificate provides strong guarantees on the number of predictions that are simultaneously robust. This method is however limited to strictily models, where each prediction is associated with a small receptive field. We propose a more general collective certificate for the larger class of softly local models, where each output is dependent on the entire input but assigns different levels of importance to different input regions (e.g. based on their proximity in the image). The certificate is based on our novel localized randomized smoothing approach, where the random perturbation strength for different input regions is proportional to their importance for the outputs. 
The resulting locally smoothed model yields strong collective guarantees while maintaining high prediction quality on both image segmentation and node classification tasks.", "keywords": "Adversarial robustness;Robustness certification;Robust machine learning;Randomized smoothing;Verification", "primary_area": "", "supplementary_material": "", "author": "Jan Schuchardt;Tom Wollschl\u00e4ger;Aleksandar Bojchevski;Stephan G\u00fcnnemann", "authorids": "~Jan_Schuchardt1;~Tom_Wollschl\u00e4ger1;~Aleksandar_Bojchevski1;~Stephan_G\u00fcnnemann1", "gender": ";M;M;M", "homepage": "https://www.cs.cit.tum.de/daml/team/jan-schuchardt/;https://www.linkedin.com/in/wollschlaeger/;https://abojchevski.github.io/;http://www.daml.in.tum.de", "dblp": "241/5487;332/0829;203/8114;43/3011", "google_scholar": "O-cixlwAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.de/citations?user=F1APiN4AAAAJ;", "orcid": ";;;", "linkedin": ";wollschlaeger/;;", "or_profile": "~Jan_Schuchardt1;~Tom_Wollschl\u00e4ger1;~Aleksandar_Bojchevski1;~Stephan_G\u00fcnnemann1", "aff": "Department of Informatics, Technical University Munich;Technische Universit\u00e4t M\u00fcnchen;CISPA Helmholtz Center for Information Security;Technical University Munich", "aff_domain": "in.tum.de;tum.de;cispa.de;tum.de", "position": "PhD student;PhD student;Principal Researcher;Professor", "bibtex": "@misc{\nschuchardt2022localized,\ntitle={Localized Randomized Smoothing for Collective Robustness Certification},\nauthor={Jan Schuchardt and Tom Wollschl{\\\"a}ger and Aleksandar Bojchevski and Stephan G{\\\"u}nnemann},\nyear={2022},\nurl={https://openreview.net/forum?id=mF122BuAnnW}\n}", "github": "", "project": "", "reviewers": "y7cf;Fv4q;z9Ri;ruKP", "site": "https://openreview.net/forum?id=mF122BuAnnW", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "72;45;150;158", "wc_summary_review": "65;57;2;80", "wc_main_review": "299;410;775;167", "wc_review": "436;512;927;405", "wc_reply_reviewers": "33;0;1029;0", "wc_reply_authors": "1813;2342;6678;118", "reply_reviewers": "1;0;2;0", "reply_authors": "3;4;14;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 106.25, 48.776915646645804 ], "wc_summary_review_avg": [ 51.0, 29.47032405658275 ], "wc_main_review_avg": [ 412.75, 226.14417414561004 ], "wc_review_avg": [ 570.0, 209.75819411884723 ], "wc_reply_reviewers_avg": [ 265.5, 441.0127549175874 ], "wc_reply_authors_avg": [ 2737.75, 2418.700102844501 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 5.5, 5.024937810560445 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9169493006161777, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1992399688580716259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Technical University Munich;Technische Universit\u00e4t M\u00fcnchen;CISPA Helmholtz Center for Information Security;Technical University of Munich", "aff_unique_dep": "Department of Informatics;;;", "aff_unique_url": 
"https://www.tum.de;https://www.tum.de;https://www.cispa.de/;https://www.tum.de", "aff_unique_abbr": "TUM;TUM;CISPA;TUM", "aff_campus_unique_index": "0", "aff_campus_unique": "Munich;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "mF5tmqUfdsw", "title": "Zeroth-Order Actor-Critic", "track": "main", "status": "Reject", "tldr": "", "abstract": "Evolution based zeroth-order optimization methods and policy gradient based first-order methods are two promising alternatives to solve reinforcement learning (RL) problems with complementary advantages. The former work with arbitrary policies, drive state-dependent and temporally-extended exploration, possess robustness-seeking property, but suffer from high sample complexity, while the latter are more sample efficient but restricted to differentiable policies and the learned policies are less robust. We propose Zeroth-Order Actor-Critic algorithm (ZOAC) that unifies these two methods into an on-policy actor-critic architecture to preserve the advantages from both. ZOAC conducts rollouts collection with timestep-wise perturbation in parameter space, first-order policy evaluation (PEV) and zeroth-order policy improvement (PIM) alternately in each iteration. The modified rollouts collection strategy and the introduced critic network help to reduce the variance of zeroth-order gradient estimators and improve the sample efficiency and stability of the algorithm. We evaluate our proposed method using two different types of policies, linear policies and neural networks, on a range of challenging continuous control benchmarks, where ZOAC outperforms zeroth-order and first-order baseline algorithms.", "keywords": "reinforcement learning;zeroth-order optimization;actor-critic", "primary_area": "", "supplementary_material": "", "author": "Yuheng Lei;Jianyu Chen;Shengbo Eben Li;Sifa Zheng", "authorids": "~Yuheng_Lei1;~Jianyu_Chen1;~Shengbo_Eben_Li2;~Sifa_Zheng1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/yuhenglei;http://people.iiis.tsinghua.edu.cn/~jychen/;http://www.idlab-tsinghua.com/thulab/labweb/dpeople.html?11;http://www.svm.tsinghua.edu.cn/essay/80/1835.html", "dblp": "312/6546.html;;;", "google_scholar": ";;Dxiw1K8AAAAJ;", "orcid": "0009-0006-1940-3573;;;0000-0001-5160-1365", "linkedin": "yuhenglei;;;", "or_profile": "~Yuheng_Lei1;~Jianyu_Chen1;~Shengbo_Eben_Li2;~Sifa_Zheng1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@misc{\nlei2022zerothorder,\ntitle={Zeroth-Order Actor-Critic},\nauthor={Yuheng Lei and Jianyu Chen and Shengbo Eben Li and Sifa Zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=mF5tmqUfdsw}\n}", "github": "", "project": "", "reviewers": "wk6f;eX28;cTPK", "site": "https://openreview.net/forum?id=mF5tmqUfdsw", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;3", "correctness": "3;3;1", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;2", "wc_summary_paper": "121;56;83", "wc_summary_review": "66;62;36", "wc_main_review": "284;662;256", "wc_review": "471;780;375", "wc_reply_reviewers": "108;0;0", "wc_reply_authors": "553;1239;1520", "reply_reviewers": "1;0;0", "reply_authors": "1;2;4", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 
0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 86.66666666666667, 26.662499674428293 ], "wc_summary_review_avg": [ 54.666666666666664, 13.299958228840001 ], "wc_main_review_avg": [ 400.6666666666667, 185.14378796552214 ], "wc_review_avg": [ 542.0, 172.7946758439044 ], "wc_reply_reviewers_avg": [ 36.0, 50.91168824543142 ], "wc_reply_authors_avg": [ 1104.0, 406.153501359605 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3773505299830379461&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "mFpP0THYeaX", "title": "Gradual Domain Adaptation in the Wild: When Intermediate Distributions are Absent", "track": "main", "status": "Reject", "tldr": "", "abstract": "We focus on the problem of domain adaptation when the goal is shifting the model towards the target distribution, rather than learning domain invariant representations. It is shown that self-training can be successfully applied on gradually shifted samples to adapt the model toward the target distribution under the following two assumptions: (a) access to samples from intermediate distributions, and (b) samples being annotated with the amount of change from the source distribution. We hypothesize that having (a) is enough to enable iterative self-training to slowly adapt the model to the target distribution, by making use of an implicit curriculum. In the case where (a) does not hold, we observe that iterative self-training falls short. We propose GIFT (Gradual Interpolation of Features toward Target), a method that creates virtual samples from intermediate distributions by interpolating representations of examples from source and target domains. Our analysis of various synthetic distribution shifts shows that in the presence of (a), iterative self-training naturally forms a curriculum of samples which helps the model to adapt better to the target domain. Furthermore, we show that when (a) does not hold, more iterations hurt the performance of self-training, and in these settings GIFT is advantageous. 
Additionally, we evaluate self-training, iterative self-training and GIFT on two benchmarks with different types of natural distribution shifts and show that when applied on top of other domain adaptation methods, GIFT improves the performance of the model on the target dataset.", "keywords": "gradual domain adaptation;self-training;gradual distribution shift;curriculum learning", "primary_area": "", "supplementary_material": "/attachment/544de41c685049887cc599684021c70bb8ce79e6.zip", "author": "Samira Abnar;Rianne van den Berg;Golnaz Ghiasi;Mostafa Dehghani;Nal Kalchbrenner;Hanie Sedghi", "authorids": "~Samira_Abnar1;~Rianne_van_den_Berg1;~Golnaz_Ghiasi2;~Mostafa_Dehghani1;~Nal_Kalchbrenner1;~Hanie_Sedghi1", "gender": "Unspecified;F;F;M;;F", "homepage": "https://samiraabnar.github.io/;https://research.google/people/RiannevandenBerg/;;http://mostafadehghani.com/;;https://haniesedghi.com/", "dblp": "150/5405;198/1077;17/8614;125/4062;;66/8332", "google_scholar": "https://scholar.google.nl/citations?user=jbxwjgMAAAAJ;KARgiboAAAAJ;9pNIbGkAAAAJ;https://scholar.google.nl/citations?user=MiHOX3QAAAAJ;https://scholar.google.co.uk/citations?user=LFyg0tAAAAAJ;_9GX96fDWAMC", "orcid": ";0000-0001-5076-2802;;;;", "linkedin": ";;;;;hanie-sedghi-71bb2582", "or_profile": "~Samira_Abnar1;~Rianne_van_den_Berg1;~Golnaz_Ghiasi2;~Mostafa_Dehghani1;~Nal_Kalchbrenner1;~Hanie_Sedghi1", "aff": "Apple;Microsoft;Research, Google;Google DeepMind;;Google Research, Brain team", "aff_domain": "apple.com;microsoft.com;research.google.com;google.com;;google.com", "position": "Researcher;Researcher;Researcher;Research Scientist;;Senior Research Scientist", "bibtex": "@misc{\nabnar2022gradual,\ntitle={Gradual Domain Adaptation in the Wild: When Intermediate Distributions are Absent},\nauthor={Samira Abnar and Rianne van den Berg and Golnaz Ghiasi and Mostafa Dehghani and Nal Kalchbrenner and Hanie Sedghi},\nyear={2022},\nurl={https://openreview.net/forum?id=mFpP0THYeaX}\n}", "github": "", "project": "", "reviewers": "peK3;Les7;DHFk;v3Ep;KPWG", "site": "https://openreview.net/forum?id=mFpP0THYeaX", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "5;4;3;2;4", "correctness": "3;2;4;3;4", "technical_novelty": "1;3;2;3;2", "empirical_novelty": "1;2;2;3;4", "wc_summary_paper": "27;194;77;77;42", "wc_summary_review": "72;143;51;32;35", "wc_main_review": "622;855;119;170;108", "wc_review": "721;1192;247;279;185", "wc_reply_reviewers": "170;119;0;45;13", "wc_reply_authors": "596;876;434;516;198", "reply_reviewers": "1;1;0;1;1", "reply_authors": "1;2;1;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.4, 1.019803902718557 ], "wc_summary_paper_avg": [ 83.4, 58.66719696730022 ], "wc_summary_review_avg": [ 66.6, 40.75585847458007 ], "wc_main_review_avg": [ 374.8, 306.67859397095197 ], "wc_review_avg": [ 524.8, 383.85017910638004 ], "wc_reply_reviewers_avg": [ 69.4, 65.07411159593345 ], "wc_reply_authors_avg": [ 524.0, 220.69345255353636 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5229763603684907, "corr_recommendation_correctness": 0.801783725737273, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6871956697271264534&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": 
"0;1;2;2;2", "aff_unique_norm": "Apple;Microsoft;Google", "aff_unique_dep": "Apple Inc.;Microsoft Corporation;Google Research", "aff_unique_url": "https://www.apple.com;https://www.microsoft.com;https://research.google", "aff_unique_abbr": "Apple;Microsoft;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Boosting Randomized Smoothing with Variance Reduced Classifiers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6327", "id": "mHu2vIds_-b", "poster": "", "openreview": "https://openreview.net/forum?id=mHu2vIds_-b", "slides": "https://iclr.cc/virtual/2022/poster/6327", "video": "https://iclr.cc/virtual/2022/poster/6327", "author_site": "Mikl\u00f3s Horv\u00e1th, Mark N M\u00fcller, Marc Fischer, Martin Vechev", "tldr": "", "abstract": "Randomized Smoothing (RS) is a promising method for obtaining robustness certi\ufb01cates by evaluating a base model under noise. In this work, we: (i) theoretically motivate why ensembles are a particularly suitable choice as base models for RS, and (ii) empirically con\ufb01rm this choice, obtaining state-of-the-art results in multiple settings. The key insight of our work is that the reduced variance of ensembles over the perturbations introduced in RS leads to signi\ufb01cantly more consistent classi\ufb01cations for a given input. This, in turn, leads to substantially increased certi\ufb01able radii for samples close to the decision boundary. Additionally, we introduce key optimizations which enable an up to 55-fold decrease in sample complexity of RS for predetermined radii, thus drastically reducing its computational overhead. Experimentally, we show that ensembles of only 3 to 10 classi\ufb01ers consistently improve on their strongest constituting model with respect to their average certi\ufb01ed radius (ACR) by 5% to 21% on both CIFAR10 and ImageNet, achieving a new state-of-the-art ACR of 0.86 and 1.11, respectively. We release all code and models required to reproduce our results at https://github.com/eth-sri/smoothing-ensembles.", "keywords": "adversarial robustness;certified robustness;randomized smoothing", "primary_area": "", "supplementary_material": "/attachment/d413f9e7ffc5bdf88371fbcd10f6367c7be00f90.zip", "author": "Mikl\u00f3s Z. Horv\u00e1th;Mark Niklas Mueller;Marc Fischer;Martin Vechev", "authorids": "~Mikl\u00f3s_Z._Horv\u00e1th1;~Mark_Niklas_Mueller2;~Marc_Fischer1;~Martin_Vechev1", "gender": ";M;M;M", "homepage": ";https://www.sri.inf.ethz.ch/people/mark;;https://www.sri.inf.ethz.ch/people/martin", "dblp": ";287/4254;37/9373-2;93/2189.html", "google_scholar": "KGmeFloAAAAJ;RBpmcCAAAAAJ;;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";0000-0002-2496-6542;;", "linkedin": "mzhorvath/;mark-m%C3%BCller-8bb4b1140/;;", "or_profile": "~Mikl\u00f3s_Z._Horv\u00e1th1;~Mark_Niklas_Mueller2;~Marc_Fischer1;~Martin_Vechev1", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "Master's Student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nhorv{\\'a}th2022boosting,\ntitle={Boosting Randomized Smoothing with Variance Reduced Classifiers},\nauthor={Mikl{\\'o}s Z. 
Horv{\\'a}th and Mark Niklas Mueller and Marc Fischer and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mHu2vIds_-b}\n}", "github": "", "project": "", "reviewers": "ZAuf;RVtN;CscF", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;2;0", "wc_summary_paper": "58;72;75", "wc_summary_review": "40;384;72", "wc_main_review": "328;26;215", "wc_review": "426;482;362", "wc_reply_reviewers": "61;14;0", "wc_reply_authors": "418;858;333", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 68.33333333333333, 7.408703590297623 ], "wc_summary_review_avg": [ 165.33333333333334, 155.17158961041233 ], "wc_main_review_avg": [ 189.66666666666666, 124.58553509760094 ], "wc_review_avg": [ 423.3333333333333, 49.026070162267295 ], "wc_reply_reviewers_avg": [ 25.0, 26.08958923913266 ], "wc_reply_authors_avg": [ 536.3333333333334, 230.08452553113798 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14327532718877741433&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=mHu2vIds_-b", "email": "ethz.ch;ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Swiss Federal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "id": "mJXARDIxVl6", "title": "Optimistic Policy Optimization is Provably Efficient in Non-stationary MDPs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study episodic reinforcement learning (RL) in non-stationary linear kernel Markov decision processes (MDPs). In this setting, both the reward function and the transition kernel are linear with respect to the given feature maps and are allowed to vary over time, as long as their respective parameter variations do not exceed certain variation budgets. We propose the \\underline{p}eriodically \\underline{r}estarted \\underline{o}ptimistic \\underline{p}olicy \\underline{o}ptimization algorithm (PROPO), which is an optimistic policy optimization algorithm with linear function approximation. PROPO features two mechanisms: sliding-window-based policy evaluation and periodic-restart-based policy improvement, which are tailored for policy optimization in a non-stationary environment. In addition, utilizing only the sliding-window technique, we propose a value-iteration algorithm. We establish dynamic upper bounds for the proposed methods and a matching minimax lower bound which shows the (near-) optimality of the proposed methods.
To the best of our knowledge, PROPO is the first provably efficient policy optimization algorithm that handles non-stationarity.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/04132b56560c443ad03949b0735c363fc78318e2.zip", "author": "Han Zhong;Zhuoran Yang;Zhaoran Wang;Csaba Szepesvari", "authorids": "~Han_Zhong1;~Zhuoran_Yang1;~Zhaoran_Wang1;~Csaba_Szepesvari1", "gender": ";M;Not Specified;M", "homepage": "https://hanzhong-ml.github.io/;https://zhuoranyang.github.io/;https://zhaoranwang.github.io/;https://sites.ualberta.ca/~szepesva/", "dblp": "137/8096.html;;117/2756;http://dblp.uni-trier.de/pers/hd/s/Szepesv=aacute=ri:Csaba", "google_scholar": "Bk5q_pAAAAAJ;;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ;https://scholar.google.ca/citations?user=zvC19mQAAAAJ", "orcid": ";;;", "linkedin": ";;;csaba-szepesvari-09376b1?trk=hp-identity-name", "or_profile": "~Han_Zhong1;~Zhuoran_Yang1;~Zhaoran_Wang1;~Csaba_Szepesvari1", "aff": "Peking University;University of California, Berkeley;;Google DeepMind", "aff_domain": "stu.pku.edu.cn;berkeley.edu;;google.com", "position": "PhD student;Postdoc;;Research Scientist", "bibtex": "@misc{\nzhong2022optimistic,\ntitle={Optimistic Policy Optimization is Provably Efficient in Non-stationary {MDP}s},\nauthor={Han Zhong and Zhuoran Yang and Zhaoran Wang and Csaba Szepesvari},\nyear={2022},\nurl={https://openreview.net/forum?id=mJXARDIxVl6}\n}", "github": "", "project": "", "reviewers": "tdJV;L9zz;ukq3;fRkQ", "site": "https://openreview.net/forum?id=mJXARDIxVl6", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "3;4;4;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "1;0;0;0", "wc_summary_paper": "65;64;489;51", "wc_summary_review": "19;72;121;12", "wc_main_review": "439;364;587;279", "wc_review": "523;500;1197;342", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 167.25, 185.84452507405214 ], "wc_summary_review_avg": [ 56.0, 44.11915683691156 ], "wc_main_review_avg": [ 417.25, 113.17768110365223 ], "wc_review_avg": [ 640.5, 328.7632126622442 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12264741571417829214&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Peking University;University of California, Berkeley;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "http://www.pku.edu.cn;https://www.berkeley.edu;https://deepmind.com", "aff_unique_abbr": "Peking U;UC Berkeley;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;United States;United Kingdom" }, { "title": "Deep Point Cloud Reconstruction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6776", "id": "mKDtUtxIGJ", "poster": "", "openreview": "https://openreview.net/forum?id=mKDtUtxIGJ", "slides":
"https://iclr.cc/virtual/2022/poster/6776", "video": "https://iclr.cc/virtual/2022/poster/6776", "author_site": "Jaesung Choe, ByeongIn Joung, Francois Rameau, Jaesik Park, In Kweon", "tldr": "", "abstract": "Point cloud obtained from 3D scanning is often sparse, noisy, and irregular. To cope with these issues, recent studies have been separately conducted to densify, denoise, and complete inaccurate point cloud. In this paper, we advocate that jointly solving these tasks leads to significant improvement for point cloud reconstruction. To this end, we propose a deep point cloud reconstruction network consisting of two stages: 1) a 3D sparse stacked-hourglass network as for the initial densification and denoising, 2) a refinement via transformers converting the discrete voxels into continuous 3D points. In particular, we further improve the performance of the transformers by a newly proposed module called amplified positional encoding. This module has been designed to differently amplify the magnitude of positional encoding vectors based on the points' distances for adaptive refinements. Extensive experiments demonstrate that our network achieves state-of-the-art performance among the recent studies in the ScanNet, ICL-NUIM, and ShapeNet datasets. Moreover, we underline the ability of our network to generalize toward real-world and unmet scenes.\n", "keywords": "Computer Vision;3D Geometry;Deep Learning based Point Cloud Understanding;Point Cloud Denoising;Point Cloud Upsampling", "primary_area": "", "supplementary_material": "", "author": "Jaesung Choe;ByeongIn Joung;Francois Rameau;Jaesik Park;In So Kweon", "authorids": "~Jaesung_Choe1;~ByeongIn_Joung1;~Francois_Rameau1;~Jaesik_Park3;~In_So_Kweon2", "gender": ";;M;M;", "homepage": ";https://vi.kaist.ac.kr;;http://jaesik.info;", "dblp": ";;31/10782;00/10336;", "google_scholar": ";;https://scholar.google.fr/citations?user=Hfx_pykAAAAJ;_3q6KBIAAAAJ;", "orcid": ";;;;", "linkedin": ";;fran%C3%A7ois-rameau-07b1ba77/;;", "or_profile": "~Jaesung_Choe1;~ByeongIn_Joung1;~Francois_Rameau1;~Jaesik_Park3;~In_So_Kweon2", "aff": ";Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Pohang University of Science and Technology;", "aff_domain": ";kaist.ac.kr;kaist.ac.kr;postech.edu;", "position": ";MS student;Postdoc;Assistant Professor;", "bibtex": "@inproceedings{\nchoe2022deep,\ntitle={Deep Point Cloud Reconstruction},\nauthor={Jaesung Choe and ByeongIn Joung and Francois Rameau and Jaesik Park and In So Kweon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mKDtUtxIGJ}\n}", "github": "", "project": "", "reviewers": "dx4y;1Mes;veKS;G4Aq", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "83;88;40;61", "wc_summary_review": "57;74;237;67", "wc_main_review": "578;351;791;158", "wc_review": "718;513;1068;286", "wc_reply_reviewers": "0;40;0;0", "wc_reply_authors": "1641;745;2386;434", "reply_reviewers": "0;2;0;0", "reply_authors": "7;7;14;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.0, 19.091883092036785 ], "wc_summary_review_avg": [ 108.75, 74.29123434160991 ], "wc_main_review_avg": [ 
469.5, 237.80716978257826 ], "wc_review_avg": [ 646.25, 287.47032455542256 ], "wc_reply_reviewers_avg": [ 10.0, 17.320508075688775 ], "wc_reply_authors_avg": [ 1301.5, 767.0803412941829 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 7.5, 4.272001872658765 ], "replies_avg": [ 38, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9091898933049896922&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=mKDtUtxIGJ", "email": ";kaist.ac.kr;kaist.ac.kr;postech.edu;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.postech.ac.kr", "aff_unique_abbr": "KAIST;POSTECH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "mKsMcL8FfsV", "title": "Learning Rich Nearest Neighbor Representations from Self-supervised Ensembles", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pretraining convolutional neural networks via self-supervision, and applying them in transfer learning, is an incredibly fast-growing field that is rapidly and iteratively improving performance across practically all image domains. \nMeanwhile, model ensembling is one of the most universally applicable techniques in supervised learning literature and practice, offering a simple solution to reliably improve performance. But how to optimally combine self-supervised models to maximize representation quality has largely remained unaddressed.\nIn this work, we provide a framework to perform self-supervised model ensembling via a novel method of learning representations directly through gradient descent at inference time.\nThis technique improves representation quality, as measured by k-nearest neighbors, both on the in-domain dataset and in the transfer setting, with models transferable from the former setting to the latter.\nAdditionally, this direct learning of features through backpropagation improves representations from even a single model, echoing the improvements found in self-distillation.", "keywords": "k-NN;ensemble;self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Bram Wallace;Devansh Arpit;Huan Wang;Caiming Xiong", "authorids": "~Bram_Wallace1;~Devansh_Arpit2;~Huan_Wang1;~Caiming_Xiong1", "gender": "M;M;M;M", "homepage": "https://sites.coecis.cornell.edu/bram/;;http://www.cs.yale.edu/homes/wang-huan/;http://cmxiong.com/", "dblp": ";120/8494;70/6155-16.html;80/7282", "google_scholar": "bI-VIDcAAAAJ;https://scholar.google.ca/citations?hl=en;7NpTttkAAAAJ;vaSdahkAAAAJ", "orcid": ";;;", "linkedin": ";;huanwangyale/;caiming-xiong-150a1417", "or_profile": "~Bram_Wallace1;~Devansh_Arpit2;~Huan_Wang1;~Caiming_Xiong1", "aff": "SalesForce.com;Salesforce Research;Salesforce.com;Salesforce Research", "aff_domain": "salesforce.com;salesforce.com;salesforce.com;salesforce.com", "position": "Researcher;Senior Research Scientist;Researcher;Research Scientist", "bibtex": "@misc{\nwallace2022learning,\ntitle={Learning Rich Nearest Neighbor Representations from Self-supervised Ensembles},\nauthor={Bram Wallace and Devansh Arpit and Huan Wang and Caiming
Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=mKsMcL8FfsV}\n}", "github": "", "project": "", "reviewers": "TyZ7;HHFN;94Ck;mwTd", "site": "https://openreview.net/forum?id=mKsMcL8FfsV", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;4;2;2", "wc_summary_paper": "43;64;36;56", "wc_summary_review": "157;52;42;37", "wc_main_review": "65;563;129;295", "wc_review": "265;679;207;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 49.75, 10.917302780449024 ], "wc_summary_review_avg": [ 72.0, 49.371044145328746 ], "wc_main_review_avg": [ 263.0, 192.47337478207214 ], "wc_review_avg": [ 384.75, 182.02249174209214 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3e6sH_EKD5sJ:scholar.google.com/&scioq=Learning+Rich+Nearest+Neighbor+Representations+from+Self-supervised+Ensembles&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Salesforce", "aff_unique_dep": "", "aff_unique_url": "https://www.salesforce.com", "aff_unique_abbr": "Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "mL07kYPn3E", "title": "Few-shot Learning with Big Prototypes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Using dense vectors, i.e., prototypes, to represent abstract information of classes has become a common approach in low-data machine learning scenarios. Typically, prototypes are mean output embeddings over the instances for each class. In this case, prototypes have the same dimension as example embeddings, and such tensors could be regarded as ``points'' in the feature space from the geometrical perspective. But these points may lack the expressivity of the whole class-level information due to biased sampling.\nIn this paper, we propose to use tensor fields (``areas'') to model prototypes to enhance the expressivity of class-level information. Specifically, we present \\textit{big prototypes}, where prototypes are represented by hyperspheres with dynamic sizes. A big prototype can be effectively modeled by two sets of learnable parameters: one is the center of the hypersphere, which is an embedding with the same dimension as the training examples; the other is the radius of the sphere, which is a constant. Compared with atactic manifolds with complex boundaries, representing a hypersphere with parameters is much easier.
Moreover, it is convenient to perform metric-based classification with big prototypes in few-shot learning, where we only need to calculate the distance from a data point to the surface of the hypersphere.\nExtensive experiments on few-shot learning tasks across NLP and CV demonstrate the effectiveness of big prototypes.\n", "keywords": "Prototype;Few-shot Learning;Meta-learning", "primary_area": "", "supplementary_material": "/attachment/0244f64f72a164011f6e1e4c2f6cebe9329f0884.zip", "author": "Ning Ding;Yulin Chen;Xiaobin Wang;Hai-Tao Zheng;Zhiyuan Liu;Pengjun Xie", "authorids": "~Ning_Ding5;~Yulin_Chen1;~Xiaobin_Wang1;~Hai-Tao_Zheng2;~Zhiyuan_Liu1;~Pengjun_Xie2", "gender": "M;F;M;M;M;M", "homepage": "https://www.stingning.cn/;;;https://www.sigs.tsinghua.edu.cn/fg3/105069.jhtml;http://nlp.csai.tsinghua.edu.cn/~lzy;", "dblp": ";;17/5812;20/134-2;53/3245-1;212/1755.html", "google_scholar": "uZXQuYAAAAAJ;tAiXl18AAAAJ;;https://scholar.google.com.hk/citations?user=7VPeORoAAAAJ;dT0v5u0AAAAJ;", "orcid": ";;;0000-0001-5128-5649;0000-0002-7709-2543;", "linkedin": ";;;;;", "or_profile": "~Ning_Ding5;~Yulin_Chen1;~Xiaobin_Wang1;~Hai-Tao_Zheng2;~Zhiyuan_Liu1;~Pengjun_Xie2", "aff": "Tsinghua University;Tsinghua University;Alibaba Group;Tsinghua University;Tsinghua University;Alibaba Group", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;alibaba-inc.com;tsinghua.edu.cn;tsinghua.edu.cn;alibaba-inc.com", "position": "PhD student;MS student;Intern;Associate Professor;Associate Professor;Researcher", "bibtex": "@misc{\nding2022fewshot,\ntitle={Few-shot Learning with Big Prototypes},\nauthor={Ning Ding and Yulin Chen and Xiaobin Wang and Hai-Tao Zheng and Zhiyuan Liu and Pengjun Xie},\nyear={2022},\nurl={https://openreview.net/forum?id=mL07kYPn3E}\n}", "github": "", "project": "", "reviewers": "bJMv;7SK7;6kXL;4NsC", "site": "https://openreview.net/forum?id=mL07kYPn3E", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;2", "correctness": "2;3;3;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "48;30;119;44", "wc_summary_review": "81;112;120;57", "wc_main_review": "153;501;211;201", "wc_review": "282;643;450;302", "wc_reply_reviewers": "0;373;45;0", "wc_reply_authors": "812;1600;359;288", "reply_reviewers": "0;3;1;0", "reply_authors": "3;4;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 60.25, 34.571483913769164 ], "wc_summary_review_avg": [ 92.5, 25.144581921360317 ], "wc_main_review_avg": [ 266.5, 137.15228762219024 ], "wc_review_avg": [ 419.25, 144.56378350057113 ], "wc_reply_reviewers_avg": [ 104.5, 156.10333116240665 ], "wc_reply_authors_avg": [ 764.75, 522.4458704019011 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=653568830512832540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0;1", "aff_unique_norm": "Tsinghua University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "THU;Alibaba", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "mMiKHj7Pobj", "title": "Revealing the Incentive to Cause Distributional Shift", "track": "main", "status": "Reject", "tldr": "", "abstract": "Decisions made by machine learning systems have increasing influence on the world, yet it is common for machine learning algorithms to assume that no such influence exists. An example is the use of the i.i.d. assumption in content recommendation: In fact, the (choice of) content displayed can change users\u2019 perceptions and preferences, or even drive them away, causing a shift in the distribution of users. We introduce the term auto-induced distributional shift (ADS) to describe the phenomenon of an algorithm causing change in the distribution of its own inputs. Leveraging ADS can be a means of increasing performance. But this is not always desirable, since performance metrics often underspecify what type of behaviour is desirable. When real-world conditions violate assumptions (such as i.i.d. data), this underspecification can result in unexpected behaviour. To diagnose such issues, we introduce the approach of unit tests for incentives: simple environments designed to show whether an algorithm will hide or reveal incentives to achieve performance via certain means (in our case, via ADS). We use these unit tests to demonstrate that changes to the learning algorithm (e.g. introducing meta-learning) can cause previously hidden incentives to be revealed, resulting in qualitatively different behaviour despite no change in performance metric. We further introduce a toy environment for modelling real-world issues with ADS in content recommendation, where we demonstrate that strong meta-learners achieve gains in performance via ADS. 
These experiments confirm that the unit tests work \u2013 an algorithm\u2019s failure of the unit test correctly diagnoses its propensity to reveal incentives for ADS.", "keywords": "alignment;incentives;unit testing;distributional shift;content recommendation;myopic reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "David Krueger;Tegan Maharaj;Jan Leike", "authorids": "~David_Krueger1;~Tegan_Maharaj1;~Jan_Leike1", "gender": "M;F;M", "homepage": "https://mila.umontreal.ca/en/person/david-scott-krueger/;http://teganmaharaj.com;https://jan.leike.name", "dblp": "142/2741.html;;https://dblp.uni-trier.de/pers/hd/l/Leike:Jan", "google_scholar": "https://scholar.google.ca/citations?user=5Uz70IoAAAAJ;https://scholar.google.ca/citations?user=XpscC-EAAAAJ;beiWcokAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~David_Krueger1;~Tegan_Maharaj1;~Jan_Leike1", "aff": "University of Montreal;Ecole Polytechnique de Montreal;OpenAI", "aff_domain": "umontreal.ca;polymtl.ca;openai.com", "position": "PhD student;PhD student;Alignment Team Lead", "bibtex": "@misc{\nkrueger2022revealing,\ntitle={Revealing the Incentive to Cause Distributional Shift},\nauthor={David Krueger and Tegan Maharaj and Jan Leike},\nyear={2022},\nurl={https://openreview.net/forum?id=mMiKHj7Pobj}\n}", "github": "", "project": "", "reviewers": "vVp6;yBdS;nHgz;C74h", "site": "https://openreview.net/forum?id=mMiKHj7Pobj", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;2;3", "correctness": "4;4;4;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "100;121;60;209", "wc_summary_review": "114;26;72;101", "wc_main_review": "780;396;349;885", "wc_review": "994;543;481;1195", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 122.5, 54.536684901082864 ], "wc_summary_review_avg": [ 78.25, 33.78146681244022 ], "wc_main_review_avg": [ 602.5, 233.5685124326479 ], "wc_review_avg": [ 803.25, 300.59472300757375 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WuRelFiOpHUJ:scholar.google.com/&scioq=Revealing+the+Incentive+to+Cause+Distributional+Shift&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Montreal;Ecole Polytechnique de Montreal;OpenAI", "aff_unique_dep": ";;", "aff_unique_url": "https://wwwumontreal.ca;https://www.polymtl.ca;https://openai.com", "aff_unique_abbr": "UM;Polytechnique Montreal;OpenAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "id": "mNLLDtkAy4X", "title": "Escaping Stochastic Traps with Aleatoric Mapping Agents", "track": "main", "status": "Reject", "tldr": "", "abstract": "When extrinsic rewards are sparse, artificial agents struggle to explore an environment. 
Curiosity, implemented as an intrinsic reward for prediction errors, can improve exploration but fails when faced with action-dependent noise sources. We present aleatoric mapping agents (AMAs), a neuroscience-inspired solution modeled on the cholinergic system of the mammalian brain. AMAs aim to explicitly ascertain which dynamics of the environment are unpredictable, regardless of whether those dynamics are induced by the actions of the agent. This is achieved by generating separate forward predictions for the mean and aleatoric uncertainty of future states, and by reducing intrinsic rewards for those states that are unpredictable. We show AMAs are able to effectively circumvent action-dependent stochastic traps that immobilise conventional curiosity-driven agents.", "keywords": "Curiosity;Neuroscience;Acetylcholine;Uncertainty;Reinforcement learning;Intrinsic Rewards", "primary_area": "", "supplementary_material": "/attachment/bda5b54f3b875a2ca4f7a1905671a2cc34276f4d.zip", "author": "Augustine N. Mavor-Parker;Kimberly A Young;Caswell Barry;Lewis Griffin", "authorids": "~Augustine_N._Mavor-Parker1;~Kimberly_A_Young1;~Caswell_Barry1;~Lewis_Griffin1", "gender": "M;F;;M", "homepage": "https://self-supervisor.github.io/;;;http://www.cs.ucl.ac.uk/people/L.Griffin.html/", "dblp": ";;220/3769;93/910", "google_scholar": "J7XkuPwAAAAJ;;;https://scholar.google.com.tw/citations?hl=en", "orcid": ";;;", "linkedin": ";kimberly-young-8b647753/;;lewis-griffin-290b433/", "or_profile": "~Augustine_N._Mavor-Parker1;~Kimberly_A_Young1;~Caswell_Barry1;~Lewis_Griffin1", "aff": ";Boston University;University College London;University College London, University of London", "aff_domain": ";bu.edu;ucl.ac.uk;ucl.ac.uk", "position": ";PhD student;Principal Researcher;Full Professor", "bibtex": "@misc{\nmavor-parker2022escaping,\ntitle={Escaping Stochastic Traps with Aleatoric Mapping Agents},\nauthor={Augustine N.
Mavor-Parker and Kimberly A Young and Caswell Barry and Lewis Griffin},\nyear={2022},\nurl={https://openreview.net/forum?id=mNLLDtkAy4X}\n}", "github": "", "project": "", "reviewers": "j6wR;DE7e;QuEW", "site": "https://openreview.net/forum?id=mNLLDtkAy4X", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;5", "correctness": "2;4;2", "technical_novelty": "1;2;4", "empirical_novelty": "1;2;4", "wc_summary_paper": "33;87;120", "wc_summary_review": "73;83;49", "wc_main_review": "558;600;489", "wc_review": "664;770;658", "wc_reply_reviewers": "488;1311;75", "wc_reply_authors": "1676;1348;430", "reply_reviewers": "1;3;1", "reply_authors": "3;3;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_summary_paper_avg": [ 80.0, 35.86084215408221 ], "wc_summary_review_avg": [ 68.33333333333333, 14.2672897060218 ], "wc_main_review_avg": [ 549.0, 45.760244754590204 ], "wc_review_avg": [ 697.3333333333334, 51.44144459696114 ], "wc_reply_reviewers_avg": [ 624.6666666666666, 513.7654025806806 ], "wc_reply_authors_avg": [ 1151.3333333333333, 527.3438684156246 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.7559289460184544, "corr_recommendation_correctness": 0.1889822365046137, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11884790335057377023&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Boston University;University College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://www.ucl.ac.uk", "aff_unique_abbr": "BU;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "mOXHnLqR7AC", "title": "Causal Scene BERT: Improving object detection by searching for challenging groups", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Autonomous vehicles (AV) rely on learning-based perception modules parametrized with neural networks for tasks like object detection. These modules frequently have low expected error overall but high error on atypical groups of data due to biases inherent in the training process. Multiple heuristics are employed to identify \"failures\" in AVs, a typical example being driver interventions. After identification, a human team combs through the associated data to group perception failures that share common causes. More data from these groups is then collected and annotated before retraining the model to fix the issue. In other words, error groups are found and addressed in hindsight as they appear. Our main contribution is a pseudo-automatic method to discover such groups in foresight by performing causal interventions on simulated driving scenes. To keep our interventions on the data manifold, we use masked language models. We verify that the prioritized groups found via intervention are challenging for the object detector and show that retraining with data collected from these groups helps inordinately compared to adding more IID data. 
We also release software to run interventions in simulated scenes, which we hope will benefit the causality community. ", "keywords": "object detection;computer vision;masked language modeling;autonomous vehicles;causality", "primary_area": "", "supplementary_material": "", "author": "Cinjon Resnick;Or Litany;Amlan Kar;Karsten Kreis;James Lucas;Kyunghyun Cho;Sanja Fidler", "authorids": "~Cinjon_Resnick1;~Or_Litany1;~Amlan_Kar2;~Karsten_Kreis1;~James_Lucas1;~Kyunghyun_Cho1;~Sanja_Fidler1", "gender": "M;M;M;;M;M;F", "homepage": ";http://orlitany.github.io;https://amlankar.github.io;https://karstenkreis.github.io/;http://www.cs.toronto.edu/~jlucas/;http://kyunghyuncho.me;http://www.cs.toronto.edu/~fidler/", "dblp": "199/1935;119/1476;https://dblp.uni-trier.de/pers/hd/k/Kar:Amlan;238/6834;24/2474;41/9736;08/6607", "google_scholar": ";https://scholar.google.co.il/citations?user=Ihs8dwsAAAAJ;iu-Gqo4AAAAJ;https://scholar.google.de/citations?user=rFd-DiAAAAAJ;https://scholar.google.ca/citations?user=AYaHBAQAAAAJ;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ;CUlqK5EAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;karstenkreis;;;sanja-fidler-2846a1a?trk=hp-identity-name", "or_profile": "~Cinjon_Resnick1;~Or_Litany1;~Amlan_Kar2;~Karsten_Kreis1;~James_Lucas1;~Kyunghyun_Cho1;~Sanja_Fidler1", "aff": "New York University;NVIDIA;Department of Computer Science, University of Toronto;NVIDIA;Department of Computer Science, University of Toronto;New York University;Department of Computer Science, University of Toronto", "aff_domain": "nyu.edu;nvidia.com;cs.toronto.edu;nvidia.com;cs.toronto.edu;nyu.edu;cs.toronto.edu", "position": "PhD student;Research Scientist;PhD student;Research Scientist;PhD Candidate;Associate Professor;Associate Professor", "bibtex": "@misc{\nresnick2022causal,\ntitle={Causal Scene {BERT}: Improving object detection by searching for challenging groups},\nauthor={Cinjon Resnick and Or Litany and Amlan Kar and Karsten Kreis and James Lucas and Kyunghyun Cho and Sanja Fidler},\nyear={2022},\nurl={https://openreview.net/forum?id=mOXHnLqR7AC}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=mOXHnLqR7AC", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10262133295756212155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1;2;0;2", "aff_unique_norm": "New York University;NVIDIA;University of Toronto", "aff_unique_dep": ";NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.nyu.edu;https://www.nvidia.com;https://www.utoronto.ca", "aff_unique_abbr": "NYU;NVIDIA;U of T", "aff_campus_unique_index": "1;1;1", 
"aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;1;0;1;0;1", "aff_country_unique": "United States;Canada" }, { "id": "mPlm356yMIP", "title": "Digging Into Output Representation for Monocular 3D Object Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "\tMonocular 3D object detection aims to recognize and localize objects in 3D space from a single image. Recent researches have conducted remarkable advancements, while all of them follow a typical output representation in LiDAR-based 3D detection. However, in this paper, we argue that the existing discrete output representation is not suitable for monocular 3D detection. Specifically, monocular 3D detection has only two-dimensional information input while is required to output three-dimensional detections. This characteristic indicates that monocular 3D detection is inherently different from other typical detection tasks that have the same dimensional input and output. The dimension gap causes a large lower bound for the error of estimated depth. Therefore, we propose to reformulate the existing discrete output representation as a spatial probability distribution according to depth. This probability distribution considers the uncertainty caused by the absent depth dimension, allowing us to accurately and comprehensively represent objects in 3D space. Extensive experiments exhibit the superiority of our output representation. As a result, we have applied our method to 12 SOTA monocular 3D detectors, consistently boosting their average precision (AP) by ~ 20% relative improvements. The source code will be publicly available soon.", "keywords": "Computer vision;monocular 3D object detection;output representation", "primary_area": "", "supplementary_material": "", "author": "Liang Peng;Senbo Yan;Chenxi Huang;Xiaofei He;Deng Cai", "authorids": "~Liang_Peng3;~Senbo_Yan1;~Chenxi_Huang2;~Xiaofei_He2;~Deng_Cai4", "gender": "M;M;F;M;M", "homepage": "https://spengliang.github.io/;;http://mrsempress.top;https://person.zju.edu.cn/0007101;http://www.cad.zju.edu.cn/home/dengcai/", "dblp": "57/3505-1;;88/1185-4;h/XiaofeiHe.html;c/DCai", "google_scholar": "_sJpS34AAAAJ;;e14HvOcAAAAJ;QLLFowsAAAAJ;vzxDyJoAAAAJ", "orcid": ";0000-0002-5051-0506;;0009-0001-9107-2354;", "linkedin": ";;;;", "or_profile": "~Liang_Peng3;~Senbo_Yan1;~Chenxi_Huang2;~Xiaofei_He2;~Deng_Cai4", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "PhD student;PhD student;PhD student;Professor;Professor", "bibtex": "@misc{\npeng2022digging,\ntitle={Digging Into Output Representation for Monocular 3D Object Detection},\nauthor={Liang Peng and Senbo Yan and Chenxi Huang and Xiaofei He and Deng Cai},\nyear={2022},\nurl={https://openreview.net/forum?id=mPlm356yMIP}\n}", "github": "", "project": "", "reviewers": "3SBP;8Hi4;tfTu;CemN;VSay", "site": "https://openreview.net/forum?id=mPlm356yMIP", "pdf_size": 0, "recommendation": "3;5;5;5;8", "confidence": "4;3;4;4;3", "correctness": "3;4;3;3;3", "technical_novelty": "1;2;3;3;4", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "72;52;83;103;74", "wc_summary_review": "25;79;80;83;91", "wc_main_review": "151;107;357;646;401", "wc_review": "248;238;520;832;566", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.2, 1.6 ], "confidence_avg": [ 3.6, 0.4898979485566356 
], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 1.019803902718557 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 76.8, 16.557777628655362 ], "wc_summary_review_avg": [ 71.6, 23.677837739117987 ], "wc_main_review_avg": [ 332.4, 193.58057753814043 ], "wc_review_avg": [ 480.8, 221.488058368843 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6634034720037775, "corr_recommendation_correctness": -0.0625, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2962258856183303075&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "mQDpmgFKu1P", "title": "Language Modeling using LMUs: 10x Better Data Efficiency or Improved Scaling Compared to Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent studies have demonstrated that the performance of transformers on the task of language modeling obeys a power-law relationship with model size over six orders of magnitude. While transformers exhibit impressive scaling, their performance hinges on processing large amounts of data, and their computational and memory requirements grow quadratically with sequence length. Motivated by these considerations, we construct a Legendre Memory Unit based model that introduces a general prior for sequence processing and exhibits an $O(n)$ and $O(n \\ln n)$ (or better) dependency for memory and computation respectively. Over three orders of magnitude, we show that our new architecture attains the same accuracy as transformers with 10x fewer tokens. We also show that for the same amount of training our model improves the loss over transformers about as much as transformers improve over LSTMs. 
Additionally, we demonstrate that adding global self-attention complements our architecture and the augmented model improves performance even further.", "keywords": "Recurrent Neural Network;Legendre Memory Unit;Natural Language Processing", "primary_area": "", "supplementary_material": "/attachment/51c8616421be757db2b662b97b531053c3fac387.zip", "author": "Narsimha Reddy Chilkuri;Eric Hunsberger;Aaron Russell Voelker;Gurshaant Singh Malik;Chris Eliasmith", "authorids": "~Narsimha_Reddy_Chilkuri1;eric.hunsberger@appliedbrainresearch.com;~Aaron_Russell_Voelker1;~Gurshaant_Singh_Malik1;~Chris_Eliasmith1", "gender": "M;;;;", "homepage": "http://compneuro.uwaterloo.ca/people/narsimha-r-chilkuri.html;;http://compneuro.uwaterloo.ca/people/aaron-r-voelker.html;;https://arts.uwaterloo.ca/~celiasmi/", "dblp": ";;;;68/8", "google_scholar": ";;https://scholar.google.ca/citations?user=CmFzxbYAAAAJ;ATRPt_EAAAAJ;KOBO-6QAAAAJ", "orcid": ";;0000-0002-4211-3973;;", "linkedin": ";;aaron-voelker-5b9895169;;", "or_profile": "~Narsimha_Reddy_Chilkuri1;eric.hunsberger@appliedbrainresearch.com;~Aaron_Russell_Voelker1;~Gurshaant_Singh_Malik1;~Chris_Eliasmith1", "aff": "Applied Brain Research;;;Applied Brain Research;University of Waterloo", "aff_domain": "appliedbrainresearch.ca;;;appliedbrainresearch.com;uwaterloo.ca", "position": "Machine Learning Engineering;;;AI Research Engineer;Full Professor", "bibtex": "@misc{\nchilkuri2022language,\ntitle={Language Modeling using {LMU}s: 10x Better Data Efficiency or Improved Scaling Compared to Transformers},\nauthor={Narsimha Reddy Chilkuri and Eric Hunsberger and Aaron Russell Voelker and Gurshaant Singh Malik and Chris Eliasmith},\nyear={2022},\nurl={https://openreview.net/forum?id=mQDpmgFKu1P}\n}", "github": "", "project": "", "reviewers": "ooZ6;HHvw;hsfs;9uKE", "site": "https://openreview.net/forum?id=mQDpmgFKu1P", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "2;2;3;2", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;2;1;3", "wc_summary_paper": "103;178;107;106", "wc_summary_review": "80;150;157;61", "wc_main_review": "564;187;228;369", "wc_review": "747;515;492;536", "wc_reply_reviewers": "82;0;42;0", "wc_reply_authors": "348;355;244;341", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 123.5, 31.5 ], "wc_summary_review_avg": [ 112.0, 42.11294337849113 ], "wc_main_review_avg": [ 337.0, 147.4228611850957 ], "wc_review_avg": [ 572.5, 101.9423856891725 ], "wc_reply_reviewers_avg": [ 31.0, 34.07345007480164 ], "wc_reply_authors_avg": [ 322.0, 45.304525160297175 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16823252096379226219&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Applied Brain Research;University of Waterloo", "aff_unique_dep": ";", "aff_unique_url": "https://www.appliedbrainresearch.com;https://uwaterloo.ca", "aff_unique_abbr": ";UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "Canada" }, { "title": "Regularized Autoencoders for Isometric Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5986", "id": "mQxt8l7JL04", "poster": "", "openreview": "https://openreview.net/forum?id=mQxt8l7JL04", "slides": "https://iclr.cc/virtual/2022/poster/5986", "video": "https://iclr.cc/virtual/2022/poster/5986", "author_site": "Yonghyeon Lee, Sangwoong Yoon, MinJun Son, Frank Park", "tldr": "", "abstract": "The recent success of autoencoders for representation learning can be traced in large part to the addition of a regularization term.\nSuch regularized autoencoders ``constrain\" the representation so as to prevent overfitting to the data while producing a parsimonious generative model. A regularized autoencoder should in principle learn not only the data manifold, but also a set of geometry-preserving coordinates for the latent representation space; by geometry-preserving we mean that the latent space representation should attempt to preserve actual distances and angles on the data manifold. In this paper we first formulate a hierarchy for geometry-preserving mappings (isometry, conformal mapping of degree $k$, area-preserving mappings). We then show that a conformal regularization term of degree zero -- i.e., one that attempts to preserve angles and relative distances, instead of angles and exact distances -- produces data representations that are superior to other existing methods. Applying our algorithm to an unsupervised information retrieval task for CelebA data with 40 annotations, we achieve 79\\% precision at five retrieved images, an improvement of more than 10\\% compared to recent related work. Code is available at https://github.com/Gabe-YHLee/IRVAE-public.", "keywords": "Autoencoders;Manifold Learning;Regularization;Geometry;Distortion", "primary_area": "", "supplementary_material": "", "author": "Yonghyeon Lee;Sangwoong Yoon;MinJun Son;Frank C. Park", "authorids": "~Yonghyeon_Lee2;~Sangwoong_Yoon1;~MinJun_Son1;~Frank_C._Park1", "gender": "M;M;M;M", "homepage": "https://www.gabe-yhlee.com;https://swyoon.github.io/;https://sites.google.com/robotics.snu.ac.kr/fcp/;http://robotics.snu.ac.kr", "dblp": "182/6796;237/1318;;p/FrankChongwooPark", "google_scholar": ";https://scholar.google.co.kr/citations?user=cH2rjfIAAAAJ;;u-h3PJIAAAAJ", "orcid": ";0000-0002-7251-3230;;0000-0002-0293-6975", "linkedin": ";;;", "or_profile": "~Yonghyeon_Lee2;~Sangwoong_Yoon1;~MinJun_Son1;~Frank_C._Park1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nlee2022regularized,\ntitle={Regularized Autoencoders for Isometric Representation Learning},\nauthor={Yonghyeon Lee and Sangwoong Yoon and MinJun Son and Frank C. 
Park},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mQxt8l7JL04}\n}", "github": "", "project": "", "reviewers": "MiWx;ZGHS;YhcR;i9fR;5xzF", "pdf_size": 0, "recommendation": "5;5;5;6;8", "confidence": "3;4;3;4;3", "correctness": "4;3;3;3;3", "technical_novelty": "3;2;3;3;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "66;97;140;75;154", "wc_summary_review": "93;117;18;45;24", "wc_main_review": "519;632;277;511;349", "wc_review": "678;846;435;631;527", "wc_reply_reviewers": "184;75;14;0;18", "wc_reply_authors": "1504;1374;1275;1349;1042", "reply_reviewers": "1;1;1;0;1", "reply_authors": "3;3;3;3;3", "recommendation_avg": [ 5.8, 1.16619037896906 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 106.4, 34.931933814205024 ], "wc_summary_review_avg": [ 59.4, 39.04151636399386 ], "wc_main_review_avg": [ 457.6, 127.6300904959328 ], "wc_review_avg": [ 623.4, 139.55443382422501 ], "wc_reply_reviewers_avg": [ 58.2, 67.91877501839974 ], "wc_reply_authors_avg": [ 1308.8, 152.50888498707215 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 3.0, 0.0 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.21004201260420152, "corr_recommendation_correctness": -0.3429971702850177, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8317035158200730786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=mQxt8l7JL04", "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "mRF387I4Wl", "title": "FlowX: Towards Explainable Graph Neural Networks via Message Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "We investigate the explainability of graph neural networks (GNNs) as a step towards elucidating their working mechanisms. While most current methods focus on explaining graph nodes, edges, or features, we argue that, as the inherent functional mechanism of GNNs, message flows are more natural for performing explainability. To this end, we propose a novel method here, known as FlowX, to explain GNNs by identifying important message flows. To quantify the importance of flows, we propose to employ the concept of Shapley values from cooperative game theory. To tackle the complexity of computing Shapley values, we propose an approximation scheme to compute Shapley values as initial assessments of flow importance. We then propose a learning algorithm to refine scores and improve explainability. 
Experimental studies on both synthetic and real-world datasets demonstrate that our proposed FlowX leads to improved explainability of GNNs.", "keywords": "Deep learning;Graph Neural Networks;Explainability", "primary_area": "", "supplementary_material": "", "author": "Shurui Gui;Hao Yuan;Jie Wang;Qicheng Lao;Kang Li;Shuiwang Ji", "authorids": "~Shurui_Gui1;~Hao_Yuan1;~Jie_Wang1;~Qicheng_Lao2;~Kang_Li9;~Shuiwang_Ji1", "gender": "M;M;M;;M;M", "homepage": "https://cm-bf.github.io;https://sites.google.com/site/hyuanustc;http://staff.ustc.edu.cn/~jwangx;;;http://people.tamu.edu/~sji", "dblp": "272/0674.html;;29/5259-5;222/3004;l/KangLi;84/6405", "google_scholar": "U4AjtOkAAAAJ;dooagDcAAAAJ;OugG4dUAAAAJ;;;BZGj6sAAAAAJ", "orcid": ";;;;;0000-0002-4205-4563", "linkedin": ";;;qicheng-lao-02909871;kang-li-484142b/;shuiwang-ji-9a040715/", "or_profile": "~Shurui_Gui1;~Hao_Yuan1;~Jie_Wang1;~Qicheng_Lao2;~Kang_Li9;~Shuiwang_Ji1", "aff": "Texas A&M University;;University of Science and Technology of China;Beijing University of Posts and Telecommunications;;Texas A&M University", "aff_domain": "tamu.edu;;ustc.edu.cn;bupt.edu.cn;;tamu.edu", "position": "PhD student;;Full Professor;Assistant Professor;;Professor", "bibtex": "@misc{\ngui2022flowx,\ntitle={FlowX: Towards Explainable Graph Neural Networks via Message Flows},\nauthor={Shurui Gui and Hao Yuan and Jie Wang and Qicheng Lao and Kang Li and Shuiwang Ji},\nyear={2022},\nurl={https://openreview.net/forum?id=mRF387I4Wl}\n}", "github": "", "project": "", "reviewers": "ef9t;bCqP;aAaM;ctC1", "site": "https://openreview.net/forum?id=mRF387I4Wl", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;4;5;5", "correctness": "2;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;2;0;3", "wc_summary_paper": "51;139;77;36", "wc_summary_review": "46;4;50;30", "wc_main_review": "296;362;746;200", "wc_review": "393;505;873;266", "wc_reply_reviewers": "0;0;0;17", "wc_reply_authors": "1144;924;830;143", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 75.75, 39.35336707322513 ], "wc_summary_review_avg": [ 32.5, 18.07622748252522 ], "wc_main_review_avg": [ 401.0, 207.3475343475297 ], "wc_review_avg": [ 509.25, 226.3938769048315 ], "wc_reply_reviewers_avg": [ 4.25, 7.361215932167728 ], "wc_reply_authors_avg": [ 760.25, 374.1459441180674 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10846508143473967145&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Texas A&M University;University of Science and Technology of China;Beijing University of Posts and Telecommunications", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tamu.edu;http://www.ustc.edu.cn;http://www.bupt.edu.cn/", "aff_unique_abbr": "TAMU;USTC;BUPT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;China" }, { "id": "mRc_t2b3l1-", "title": "Rethinking the limiting dynamics of SGD: modified 
loss, phase space oscillations, and anomalous diffusion", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work we explore the limiting dynamics of deep neural networks trained with stochastic gradient descent (SGD). We find empirically that long after performance has converged, networks continue to move through parameter space by a process of anomalous diffusion in which distance travelled grows as a power law in the number of gradient updates with a nontrivial exponent. We reveal an intricate interaction between the hyperparameters of optimization, the structure in the gradient noise, and the Hessian matrix at the end of training that explains this anomalous diffusion. To build this understanding, we first derive a continuous-time model for SGD with finite learning rates and batch sizes as an underdamped Langevin equation. We study this equation in the setting of linear regression, where we can derive exact, analytic expressions for the phase space dynamics of the parameters and their instantaneous velocities from initialization to stationarity. Using the Fokker-Planck equation, we show that the key ingredient driving these dynamics is not the original training loss, but rather the combination of a modified loss, which implicitly regularizes the velocity, and probability currents, which cause oscillations in phase space. We identify qualitative and quantitative predictions of this theory in the dynamics of a ResNet-18 model trained on ImageNet. Through the lens of statistical physics, we uncover a mechanistic origin for the anomalous limiting dynamics of deep neural networks trained with SGD.", "keywords": "learning dynamics;loss landscape;stochastic differential equation;modified equation analysis;hessian;geometry;physics;fokker-plank;modified loss;probability currents;diffusion", "primary_area": "", "supplementary_material": "/attachment/da86241150529959e20a623675aaf71d89d6ca36.zip", "author": "Daniel Kunin;Javier Sagastuy-Brena;Lauren Gillespie;Eshed Margalit;Hidenori Tanaka;Surya Ganguli;Daniel LK Yamins", "authorids": "~Daniel_Kunin1;~Javier_Sagastuy-Brena1;~Lauren_Gillespie2;~Eshed_Margalit1;~Hidenori_Tanaka1;~Surya_Ganguli1;~Daniel_LK_Yamins1", "gender": ";;;M;;M;M", "homepage": "https://daniel-kunin.com/;https://www.javiersagastuy.com/;https://cs.stanford.edu/~gillespl/;https://eshedmargalit.com;https://sites.google.com/view/htanaka/home;http://ganguli-gang.stanford.edu/surya.html;https://Neuroailab.stanford.edu", "dblp": "234/8632;;202/9045;;;56/10453;", "google_scholar": "qilW2ZMAAAAJ;dtlIL-IAAAAJ;XnzwIMUAAAAJ;ijttsicAAAAJ;f_pWOGIAAAAJ;;", "orcid": ";;0000-0003-2496-8035;0000-0003-0841-7444;;;", "linkedin": ";jvrsgsty/;;eshed-margalit-437222a7/;;;", "or_profile": "~Daniel_Kunin1;~Javier_Sagastuy-Brena1;~Lauren_Gillespie2;~Eshed_Margalit1;~Hidenori_Tanaka1;~Surya_Ganguli1;~Daniel_LK_Yamins1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University;Physics & Informatics Lab, NTT Research, Inc.;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu;ntt-research.com;@stanford.edu;stanford.edu", "position": "PhD student;PhD student;PhD student;PhD student;Senior Research Scientist;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nkunin2022rethinking,\ntitle={Rethinking the limiting dynamics of {SGD}: modified loss, phase space oscillations, and anomalous diffusion},\nauthor={Daniel Kunin and Javier Sagastuy-Brena and Lauren Gillespie and Eshed Margalit and Hidenori 
Tanaka and Surya Ganguli and Daniel LK Yamins},\nyear={2022},\nurl={https://openreview.net/forum?id=mRc_t2b3l1-}\n}", "github": "", "project": "", "reviewers": "4wwS;sDRt;pbf9;erMi", "site": "https://openreview.net/forum?id=mRc_t2b3l1-", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "5;5;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;4;2;3", "wc_summary_paper": "112;82;312;26", "wc_summary_review": "154;67;167;30", "wc_main_review": "491;737;705;135", "wc_review": "757;886;1184;191", "wc_reply_reviewers": "925;369;509;0", "wc_reply_authors": "1514;2085;1290;358", "reply_reviewers": "2;1;2;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 133.0, 107.8563859954523 ], "wc_summary_review_avg": [ 104.5, 57.6909871643743 ], "wc_main_review_avg": [ 517.0, 239.97083156083784 ], "wc_review_avg": [ 754.5, 360.31271140496835 ], "wc_reply_reviewers_avg": [ 450.75, 330.97007039912233 ], "wc_reply_authors_avg": [ 1311.75, 622.2806340390161 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.7385489458759963, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8787583267406023607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1;0;0", "aff_unique_norm": "Stanford University;NTT Research, Inc.", "aff_unique_dep": ";Physics & Informatics Lab", "aff_unique_url": "https://www.stanford.edu;https://www.ntt-research.com", "aff_unique_abbr": "Stanford;NTT Research", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "mTcO4-QCOB", "title": "Analyzing the Effects of Classifier Lipschitzness on Explainers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning methods are getting increasingly better at making predictions, but at the same time they are also becoming more complicated and less transparent. As a result, explanation methods are often relied on to provide interpretability to these complicated and often black-box prediction models. Since these explainer methods serve as crucial diagnostic tools, it is important that they themselves are reliable. In this paper we focus on one particular aspect of reliability, namely that an explainer should give similar explanations for similar data inputs. We formalize this notion by introducing and defining explainer astuteness, analogous to astuteness of classifiers. Our formalism is inspired by the concept of probabilistic Lipschitzness, which captures the probability of local smoothness of a function. For a variety of explainers (e.g., SHAP, RISE, CXPlain, PredDiff), we provide lower bound guarantees on the astuteness of these explainers given the Lipschitzness of the prediction function. These theoretical results imply that locally smooth prediction functions lend themselves to locally robust explanations. 
We evaluate these results empirically on simulated as well as real datasets.", "keywords": "Explainers;Explanation;Robustness;Astuteness;Lipschitz;Blackbox;Classifiers", "primary_area": "", "supplementary_material": "/attachment/931673693d2965ea186dbf59db5db3a433eb48ce.zip", "author": "Zulqarnain Khan;Aria Masoomi;Davin Hill;Jennifer Dy", "authorids": "~Zulqarnain_Khan1;~Aria_Masoomi1;~Davin_Hill1;~Jennifer_Dy1", "gender": ";M;;", "homepage": ";;;https://mllabneu.github.io/", "dblp": ";242/9324;;24/6000", "google_scholar": ";KXcX8coAAAAJ;;6h7b0fAAAAAJ", "orcid": ";;;", "linkedin": ";aria-masoomi-779a02232;;", "or_profile": "~Zulqarnain_Khan1;~Aria_Masoomi1;~Davin_Hill1;~Jennifer_Dy1", "aff": ";Northeastern University;;Northeastern University", "aff_domain": ";northeastern.edu;;northeastern.edu", "position": ";PhD student;;Full Professor", "bibtex": "@misc{\nkhan2022analyzing,\ntitle={Analyzing the Effects of Classifier Lipschitzness on Explainers},\nauthor={Zulqarnain Khan and Aria Masoomi and Davin Hill and Jennifer Dy},\nyear={2022},\nurl={https://openreview.net/forum?id=mTcO4-QCOB}\n}", "github": "", "project": "", "reviewers": "Np65;ftCD;ARPU", "site": "https://openreview.net/forum?id=mTcO4-QCOB", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;4;3", "correctness": "2;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "141;111;137", "wc_summary_review": "12;35;59", "wc_main_review": "225;391;99", "wc_review": "378;537;295", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "602;538;229", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 129.66666666666666, 13.299958228840003 ], "wc_summary_review_avg": [ 35.333333333333336, 19.189117286165672 ], "wc_main_review_avg": [ 238.33333333333334, 119.58074910099684 ], "wc_review_avg": [ 403.3333333333333, 100.4069497373342 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 456.3333333333333, 162.85849345026156 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16162534865333599608&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "mYaOK2og0tf", "title": "A Practical PAC-Bayes Generalisation Bound for Deep Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Under a PAC-Bayesian framework, we derive an implementation efficient parameterisation invariant metric to measure the difference between our true and empirical risk. We show that for solutions of low training loss, this metric can be approximated at the same cost as a single step of SGD. 
We investigate the usefulness of this metric on pathological examples, where traditional Hessian-based sharpness metrics increase while generalisation also improves, and find good experimental agreement. As a consequence of our PAC-Bayesian framework and theoretical arguments on the effect of sub-sampling the Hessian, we include a trace-of-Hessian term in our structural risk. We find that this term promotes generalisation in a variety of experiments using Wide-Residual Networks on the CIFAR datasets. ", "keywords": "generalisation;hessian;pac-bayesian", "primary_area": "", "supplementary_material": "/attachment/f7b7fc1a70cf681f957d9246f6cd112f90c50cba.zip", "author": "Diego Granziol;Mingtian Zhang;Nicholas Baskerville", "authorids": "~Diego_Granziol1;~Mingtian_Zhang1;~Nicholas_Baskerville1", "gender": "M;;M", "homepage": "http://tomo.wiki;;", "dblp": "230/8340;;", "google_scholar": ";https://scholar.google.co.uk/citations?user=ZfuJzgcAAAAJ;https://scholar.google.co.uk/citations?user=-MuqKlIAAAAJ", "orcid": ";0000-0001-5275-3598;0000-0003-3169-2081", "linkedin": ";;", "or_profile": "~Mingtian_Zhang1;~Nicholas_Baskerville1;~Diego_Marco_Granziol1", "aff": "University College London;University of Bristol;University of Oxford", "aff_domain": "ucl.ac.uk;bristol.ac.uk;oxford.ac.uk", "position": "PhD student;PhD student;Researcher", "bibtex": "@misc{\ngranziol2022a,\ntitle={A Practical {PAC}-Bayes Generalisation Bound for Deep Learning},\nauthor={Diego Granziol and Mingtian Zhang and Nicholas Baskerville},\nyear={2022},\nurl={https://openreview.net/forum?id=mYaOK2og0tf}\n}", "github": "", "project": "", "reviewers": "KdSJ;wngb;pvJW", "site": "https://openreview.net/forum?id=mYaOK2og0tf", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "4;3;3", "correctness": "1;2;2", "technical_novelty": "1;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "54;76;109", "wc_summary_review": "49;34;147", "wc_main_review": "315;635;805", "wc_review": "418;745;1061", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 1.6666666666666667, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 79.66666666666667, 22.60285134421958 ], "wc_summary_review_avg": [ 76.66666666666667, 50.108770578501414 ], "wc_main_review_avg": [ 585.0, 203.14198646923452 ], "wc_review_avg": [ 741.3333333333334, 262.516454510739 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CSR6JfFlLdEJ:scholar.google.com/&scioq=A+Practical+PAC-Bayes+Generalisation+Bound+for+Deep+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University College London;University of Bristol;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.bristol.ac.uk;https://www.ox.ac.uk", "aff_unique_abbr": "UCL;Bristol;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "mZsZy481_F", "title": "FROB: Few-shot ROBust Model for Classification with Out-of-Distribution Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Nowadays, classification and Out-of-Distribution (OoD) detection in the few-shot setting remain challenging aims mainly due to rarity and the limited samples in the few-shot setting, and because of adversarial attacks. Accomplishing these aims is important for critical systems in safety, security, and defence. In parallel, OoD detection is challenging since deep neural network classifiers set high confidence to OoD samples away from the training data. To address such limitations, we propose the Few-shot ROBust (FROB) model for classification and few-shot OoD detection. We devise a methodology for improved robustness and reliable confidence prediction for few-shot OoD detection. We generate the support boundary of the normal class distribution and combine it with few-shot Outlier Exposure (OE). We propose a self-supervised learning few-shot confidence boundary methodology based on generative and discriminative models, including classification. The main contribution of FROB is the combination of the generated boundary in a self-supervised learning manner and the imposition of low confidence at this learned boundary. FROB implicitly generates strong adversarial samples on the boundary and forces samples from OoD, including our boundary, to be less confident by the classifier. FROB achieves generalization to unseen anomalies and adversarial attacks, with applicability to unknown, in the wild, test sets that do not correlate to the training datasets. To improve robustness, FROB redesigns and streamlines OE to work even for zero-shots. By including our learned boundary, FROB effectively reduces the threshold linked to the model\u2019s few-shot robustness, and maintains the OoD performance approximately constant and independent of the number of few-shot samples. The few-shot robustness analysis evaluation of FROB on different image sets and on One-Class Classification (OCC) data shows that FROB achieves competitive state-of-the-art performance and outperforms benchmarks in terms of robustness to the outlier OoD few-shot sample population and variability.", "keywords": "Classification and Out-of-Distribution Detection;Confidence Prediction;Few-Shot Out-of-Distribution Detection;Outlier Exposure;Robustness", "primary_area": "", "supplementary_material": "", "author": "Nikolaos Dionelis;Mehrdad Yaghoobi;Sotirios A. Tsaftaris", "authorids": "~Nikolaos_Dionelis2;~Mehrdad_Yaghoobi1;~Sotirios_A._Tsaftaris1", "gender": "M;M;", "homepage": "https://philab.esa.int/team/nikolaos-dionelis-2/;https://www.eng.ed.ac.uk/about/people/dr-mehrdad-yaghoobi;https://vios.science/", "dblp": "191/1019;;14/613", "google_scholar": "https://scholar.google.com/citations?hl=en;;jC1uFnYAAAAJ", "orcid": "0000-0001-9662-8537;;", "linkedin": "https://it.linkedin.com/in/nikolaos-dionelis-60688279;;", "or_profile": "~Nikolaos_Dionelis2;~Mehrdad_Yaghoobi1;~Sotirios_A._Tsaftaris1", "aff": "University of Edinburgh;;University of Edinburgh", "aff_domain": "ed.ac.uk;;ed.ac.uk", "position": "Postdoc;;Professor in machine learning and computer vision", "bibtex": "@misc{\ndionelis2022frob,\ntitle={{FROB}: Few-shot {ROB}ust Model for Classification with Out-of-Distribution Detection},\nauthor={Nikolaos Dionelis and Mehrdad Yaghoobi and Sotirios A. 
Tsaftaris},\nyear={2022},\nurl={https://openreview.net/forum?id=mZsZy481_F}\n}", "github": "", "project": "", "reviewers": "qcc7;1PDU;r838;94aV", "site": "https://openreview.net/forum?id=mZsZy481_F", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;3;4;2", "correctness": "3;2;2;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "51;44;33;26", "wc_summary_review": "87;44;28;39", "wc_main_review": "388;379;518;12", "wc_review": "526;467;579;77", "wc_reply_reviewers": "40;0;0;0", "wc_reply_authors": "1516;2108;1929;333", "reply_reviewers": "1;0;0;0", "reply_authors": "3;3;3;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 38.5, 9.656603957913983 ], "wc_summary_review_avg": [ 49.5, 22.41093483101497 ], "wc_main_review_avg": [ 324.25, 188.48126564727858 ], "wc_review_avg": [ 412.25, 197.56944981448927 ], "wc_reply_reviewers_avg": [ 10.0, 17.320508075688775 ], "wc_reply_authors_avg": [ 1471.5, 691.4840923694485 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7745966692414834, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13413812319860460482&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Training Structured Neural Networks Through Manifold Identification and Variance Reduction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6536", "id": "mdUYT5QV0O", "poster": "", "openreview": "https://openreview.net/forum?id=mdUYT5QV0O", "slides": "https://iclr.cc/virtual/2022/poster/6536", "video": "https://iclr.cc/virtual/2022/poster/6536", "author_site": "Zih-Syuan Huang, Ching-pei Lee", "tldr": "", "abstract": "This paper proposes an algorithm, RMDA, for training neural networks (NNs) with a regularization term for promoting desired structures. RMDA incurs no additional computation beyond that of proximal SGD with momentum, and achieves variance reduction without requiring the objective function to be of the finite-sum form. Through the tool of manifold identification from nonlinear optimization, we prove that after a finite number of iterations, all iterates of RMDA possess a desired structure identical to that induced by the regularizer at the stationary point of asymptotic convergence, even in the presence of engineering tricks like data augmentation that complicate the training process. Experiments on training NNs with structured sparsity confirm that variance reduction is necessary for such an identification, and show that RMDA thus significantly outperforms existing methods for this task. For unstructured sparsity, RMDA also outperforms a state-of-the-art pruning method, validating the benefits of training structured NNs through regularization. 
\nImplementation of RMDA is available at https://www.github.com/zihsyuan1214/rmda.", "keywords": "Structured neural networks;variance reduction;manifold identification;proximal methods", "primary_area": "", "supplementary_material": "", "author": "Zih-Syuan Huang;Ching-pei Lee", "authorids": "~Zih-Syuan_Huang1;~Ching-pei_Lee2", "gender": ";Unspecified", "homepage": "https://github.com/zihsyuan1214;http://leepei.github.io", "dblp": ";", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Zih-Syuan_Huang1;~Ching-Pei_Lee1", "aff": "Academia Sinica;", "aff_domain": "sinica.edu.tw;", "position": "research assistant;", "bibtex": "@inproceedings{\nhuang2022training,\ntitle={Training Structured Neural Networks Through Manifold Identification and Variance Reduction},\nauthor={Zih-Syuan Huang and Ching-pei Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mdUYT5QV0O}\n}", "github": "", "project": "", "reviewers": "mrfp;KsXr;MELK", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;3", "correctness": "3;1;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "85;56;106", "wc_summary_review": "67;113;31", "wc_main_review": "348;758;137", "wc_review": "500;927;274", "wc_reply_reviewers": "0;331;0", "wc_reply_authors": "1508;1786;388", "reply_reviewers": "0;2;0", "reply_authors": "3;4;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 82.33333333333333, 20.49932248202906 ], "wc_summary_review_avg": [ 70.33333333333333, 33.559234529741914 ], "wc_main_review_avg": [ 414.3333333333333, 257.82466048761813 ], "wc_review_avg": [ 567.0, 270.7631191035195 ], "wc_reply_reviewers_avg": [ 110.33333333333333, 156.0348963818315 ], "wc_reply_authors_avg": [ 1227.3333333333333, 604.2523387533463 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": -0.18898223650461365, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3809096100711986966&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=mdUYT5QV0O", "email": "sinica.edu.tw;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Academia Sinica", "aff_unique_dep": "", "aff_unique_url": "https://www.sinica.edu.tw", "aff_unique_abbr": "Academia Sinica", "aff_campus_unique_index": "0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Continual Learning with Filter Atom Swapping", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6027", "id": "metRpM4Zrcb", "poster": "", "openreview": "https://openreview.net/forum?id=metRpM4Zrcb", "slides": "https://iclr.cc/virtual/2022/poster/6027", "video": "https://iclr.cc/virtual/2022/poster/6027", "author_site": "Zichen Miao, Ze Wang, Wei Chen, Qiang Qiu", "tldr": "", "abstract": "Continual learning has been widely studied in recent years to resolve the catastrophic forgetting of deep neural 
networks. In this paper, we first enforce a low-rank filter subspace by decomposing convolutional filters within each network layer over a small set of filter atoms. Then, we perform continual learning with filter atom swapping. In other words, we learn for each task a new filter subspace for each convolutional layer, i.e., hundreds of parameters as filter atoms, but keep subspace coefficients shared across tasks. By maintaining a small footprint memory of filter atoms, we can easily archive models for past tasks to avoid forgetting. The effectiveness of this simple scheme for continual learning is illustrated both empirically and theoretically. The proposed atom swapping framework further enables flexible and efficient model ensemble with members selected within a task or across tasks to improve the performance in different continual learning settings. Being validated on multiple benchmark datasets with different convolutional network structures, the proposed method outperforms the state-of-the-art methods in both accuracy and scalability.", "keywords": "continual learning", "primary_area": "", "supplementary_material": "", "author": "Zichen Miao;Ze Wang;Wei Chen;Qiang Qiu", "authorids": "~Zichen_Miao1;~Ze_Wang3;~Wei_Chen26;~Qiang_Qiu1", "gender": "M;M;M;", "homepage": "https://zichenmiao.github.io;;https://weichennone.github.io/myhomepage/;https://web.ics.purdue.edu/~qqiu/", "dblp": "206/1549;;181/2832-124.html;97/360", "google_scholar": "Kmv2KIkAAAAJ;80Jw_w8AAAAJ;jVT7rQgAAAAJ;jdLtt_YAAAAJ", "orcid": ";;0000-0001-6722-4322;", "linkedin": ";;;", "or_profile": "~Zichen_Miao1;~Ze_Wang3;~Wei_Chen26;~Qiang_Qiu1", "aff": "Purdue University;Purdue University;Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu;purdue.edu;purdue.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nmiao2022continual,\ntitle={Continual Learning with Filter Atom Swapping},\nauthor={Zichen Miao and Ze Wang and Wei Chen and Qiang Qiu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=metRpM4Zrcb}\n}", "github": "", "project": "", "reviewers": "B5kZ;WerS;GWGw;ymsc", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;4;4", "wc_summary_paper": "62;166;84;57", "wc_summary_review": "24;70;38;23", "wc_main_review": "145;910;165;128", "wc_review": "231;1146;287;208", "wc_reply_reviewers": "0;126;0;0", "wc_reply_authors": "320;820;320;273", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 92.25, 43.77427897750002 ], "wc_summary_review_avg": [ 38.75, 18.9917745353087 ], "wc_main_review_avg": [ 337.0, 331.08080584654857 ], "wc_review_avg": [ 468.0, 392.49649679965296 ], "wc_reply_reviewers_avg": [ 31.5, 54.559600438419636 ], "wc_reply_authors_avg": [ 433.25, 224.11311318171457 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 37, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=10541373769258487423&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=metRpM4Zrcb", "email": "purdue.edu;purdue.edu;purdue.edu;purdue.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Igeood: An Information Geometry Approach to Out-of-Distribution Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5915", "id": "mfwdY3U_9ea", "poster": "", "openreview": "https://openreview.net/forum?id=mfwdY3U_9ea", "slides": "https://iclr.cc/virtual/2022/poster/5915", "video": "https://iclr.cc/virtual/2022/poster/5915", "author_site": "Eduardo Dadalto C\u00e2mara Gomes, Florence Alberge, Pierre Duhamel, Pablo Piantanida", "tldr": "", "abstract": "Reliable out-of-distribution (OOD) detection is fundamental to implementing safer modern machine learning (ML) systems. In this paper, we introduce Igeood, an effective method for detecting OOD samples. Igeood applies to any pre-trained neural network, works under various degrees of access to the ML model, does not require OOD samples or assumptions on the OOD data but can also benefit (if available) from OOD samples. By building on the geodesic (Fisher-Rao) distance between the underlying data distributions, our discriminator can combine confidence scores from the logits outputs and the learned features of a deep neural network. Empirically, we show that Igeood outperforms competing state-of-the-art methods on a variety of network architectures and datasets.", "keywords": "out-of-distribution detection;anomaly detection;deep learning", "primary_area": "", "supplementary_material": "/attachment/c051c492e82c715c1205d85e6cb456deb71c00ca.zip", "author": "Eduardo Dadalto Camara Gomes;Florence Alberge;Pierre Duhamel;Pablo Piantanida", "authorids": "~Eduardo_Dadalto_Camara_Gomes1;~Florence_Alberge1;~Pierre_Duhamel1;~Pablo_Piantanida2", "gender": "M;F;M;M", "homepage": "https://edadaltocg.github.io;https://l2s.centralesupelec.fr/u/alberge-florence/;;https://www.pablo-piantanida.org", "dblp": "306/2391;;;44/1416", "google_scholar": "ImL09qAAAAAJ;8CcRfB8AAAAJ;https://scholar.google.fr/citations?user=gWj_W9YAAAAJ;https://scholar.google.fr/citations?user=QyBEFv0AAAAJ", "orcid": ";;0000-0002-7942-0934;", "linkedin": "edadaltocg/;;;pablo-piantanida-60a51bb5/?locale=en_US", "or_profile": "~Eduardo_Dadalto_Camara_Gomes1;~Florence_Alberge1;~Pierre_Duhamel1;~Pablo_Piantanida2", "aff": "Universit\u00e9 Paris-Saclay CNRS CentraleSup\u00e9lec;;;Universit\u00e9 Paris-Saclay, CNRS ", "aff_domain": "centralesupelec.fr;;;centralesupelec.fr", "position": "PhD student;;;Full Professor", "bibtex": "@inproceedings{\ngomes2022igeood,\ntitle={Igeood: An Information Geometry Approach to Out-of-Distribution Detection},\nauthor={Eduardo Dadalto Camara Gomes and Florence Alberge and Pierre Duhamel and Pablo Piantanida},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mfwdY3U_9ea}\n}", "github": "", "project": "", "reviewers": "dbMS;R2Ss;eVhF;KjND", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;4;4", "correctness": "2;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;4", 
"wc_summary_paper": "176;75;123;72", "wc_summary_review": "39;53;93;39", "wc_main_review": "418;403;714;479", "wc_review": "633;531;930;590", "wc_reply_reviewers": "0;45;113;32", "wc_reply_authors": "401;692;797;259", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 111.5, 42.38218965556169 ], "wc_summary_review_avg": [ 56.0, 22.11334438749598 ], "wc_main_review_avg": [ 503.5, 124.82087165213997 ], "wc_review_avg": [ 671.0, 153.85545164211763 ], "wc_reply_reviewers_avg": [ 47.5, 41.20982892466311 ], "wc_reply_authors_avg": [ 537.25, 216.45135134713297 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14684067719933018833&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "pdf": "https://openreview.net/pdf?id=mfwdY3U_9ea", "email": "centralesupelec.fr;;;centralesupelec.fr", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Universit\u00e9 Paris-Saclay", "aff_unique_dep": "", "aff_unique_url": "https://www.universite-paris-saclay.fr", "aff_unique_abbr": "UPS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Machine Learning For Elliptic PDEs: Fast Rate Generalization Bound, Neural Scaling Law and Minimax Optimality", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6547", "id": "mhYUBYNoGz", "poster": "", "openreview": "https://openreview.net/forum?id=mhYUBYNoGz", "slides": "https://iclr.cc/virtual/2022/poster/6547", "video": "https://iclr.cc/virtual/2022/poster/6547", "author_site": "Yiping Lu, Haoxuan Chen, Jianfeng Lu, Lexing Ying, Jose Blanchet", "tldr": "", "abstract": "In this paper, we study the statistical limits of deep learning techniques for solving elliptic partial differential equations (PDEs) from random samples using the Deep Ritz Method (DRM) and Physics-Informed Neural Networks (PINNs). To simplify the problem, we focus on a prototype elliptic PDE: the Schr\\\"odinger equation on a hypercube with zero Dirichlet boundary condition, which has wide application in the quantum-mechanical systems. We establish upper and lower bounds for both methods, which improves upon concurrently developed upper bounds for this problem via a fast rate generalization bound. We discover that the current Deep Ritz Methods is sub-optimal and propose a modified version of it. We also prove that PINN and the modified version of DRM can achieve minimax optimal bounds over Sobolev spaces. 
Empirically, following recent work showing that deep model accuracy improves with growing training sets according to a power law, we supply computational experiments showing a similar dimension-dependent power-law behavior for deep PDE solvers.", "keywords": "Numerical PDE;non-parametric statistics;computational physics", "primary_area": "", "supplementary_material": "", "author": "Yiping Lu;Haoxuan Chen;Jianfeng Lu;Lexing Ying;Jose Blanchet", "authorids": "~Yiping_Lu1;~Haoxuan_Chen1;~Jianfeng_Lu1;~Lexing_Ying1;~Jose_Blanchet1", "gender": "M;M;M;;M", "homepage": "https://2prime.github.io/;https://haoxuanstevec00.github.io/;https://services.math.duke.edu/~jianfeng/;http://web.stanford.edu/~lexing;https://web.stanford.edu/~jblanche/", "dblp": "93/683-1;212/7201.html;82/6187-1.html;68/3945;75/5093.html", "google_scholar": "NmhvVBgAAAAJ;https://scholar.google.com/citations?hl=en;ej9SRrAAAAAJ;OwA3zyMAAAAJ;https://scholar.google.co.in/citations?user=O24CcQQAAAAJ", "orcid": ";0000-0002-8238-2764;0000-0001-6255-5165;;", "linkedin": ";haoxuan-steve-chen-748b0a171/;;;jose-blanchet", "or_profile": "~Yiping_Lu1;~Haoxuan_Chen1;~Jianfeng_Lu1;~Lexing_Ying1;~Jose_Blanchet1", "aff": "Stanford University;California Institute of Technology;Duke University;Stanford University;Stanford University", "aff_domain": "stanford.edu;caltech.edu;duke.edu;stanford.edu;stanford.edu", "position": "PhD student;Undergrad student;Professor;Professor;Professor", "bibtex": "@inproceedings{\nlu2022machine,\ntitle={Machine Learning For Elliptic {PDE}s: Fast Rate Generalization Bound, Neural Scaling Law and Minimax Optimality},\nauthor={Yiping Lu and Haoxuan Chen and Jianfeng Lu and Lexing Ying and Jose Blanchet},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mhYUBYNoGz}\n}", "github": "", "project": "", "reviewers": "Er9e;7DAR;16v3;RB8r", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;3;4", "correctness": "3;4;4;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;1;2;2", "wc_summary_paper": "88;56;61;158", "wc_summary_review": "29;57;9;30", "wc_main_review": "364;305;76;207", "wc_review": "481;418;146;395", "wc_reply_reviewers": "0;0;10;0", "wc_reply_authors": "427;404;138;608", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 90.75, 40.69014008331748 ], "wc_summary_review_avg": [ 31.25, 17.06421694658152 ], "wc_main_review_avg": [ 238.0, 109.052739534594 ], "wc_review_avg": [ 360.0, 127.5009803883876 ], "wc_reply_reviewers_avg": [ 2.5, 4.330127018922194 ], "wc_reply_authors_avg": [ 394.25, 167.7205637362336 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6989530816144386888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=mhYUBYNoGz", "email": "stanford.edu;caltech.edu;duke.edu;stanford.edu;stanford.edu", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Stanford University;California Institute of Technology;Duke University", 
"aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.caltech.edu;https://www.duke.edu", "aff_unique_abbr": "Stanford;Caltech;Duke", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Stanford;Pasadena;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "mhv2gWm3sf", "title": "$f$-Divergence Thermodynamic Variational Objective: a Deformed Geometry Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose a $f$-divergence Thermodynamic Variational Objective ($f$-TVO). $f$-TVO generalizes the Thermodynamic Variational Objective (TVO) by replacing Kullback\u2013Leibler (KL) divergence with arbitary differeitiable $f$-divergence. In particular, $f$-TVO approximates dual function of model evidence $f^*(p(x))$ rather than the log model evidence $\\log p(x)$ in TVO. $f$-TVO is derived from a deformed $\\chi$-geometry perspective. By defining $\\chi$-exponential family exponential, we are able to integral $f$-TVO along the $\\chi$-path, which is the deformed geodesic between variational posterior distribution and true posterior distribution. Optimizing scheme of $f$-TVO includes reparameterization trick and Monte Carlo approximation. Experiments on VAE and Bayesian neural network show that the proposed $f$-TVO performs better than cooresponding baseline $f$-divergence variational inference.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun Li;Ping Li", "authorids": "~Jun_Li13;~Ping_Li3", "gender": "M;M", "homepage": "https://junli-galios.github.io/;http://www.stat.rutgers.edu/home/pingli/", "dblp": "116/1011-98;62/5860-1", "google_scholar": "fyQZYz8AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Jun_Li13;~Ping_Li3", "aff": "Baidu;LinkedIn", "aff_domain": "baidu.com;linkedin.com", "position": "Postdoc;Engineer", "bibtex": "@misc{\nli2022fdivergence,\ntitle={\\$f\\$-Divergence Thermodynamic Variational Objective: a Deformed Geometry Perspective},\nauthor={Jun Li and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=mhv2gWm3sf}\n}", "github": "", "project": "", "reviewers": "QFxg;BE5D;6bLb;Qe9J", "site": "https://openreview.net/forum?id=mhv2gWm3sf", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;2;4", "empirical_novelty": "2;1;3;2", "wc_summary_paper": "60;46;50;136", "wc_summary_review": "40;46;31;129", "wc_main_review": "244;430;525;233", "wc_review": "344;522;606;498", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "154;206;181;171", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 73.0, 36.72873534441391 ], "wc_summary_review_avg": [ 61.5, 39.33509883043387 ], "wc_main_review_avg": [ 358.0, 124.19138456430865 ], "wc_review_avg": [ 492.5, 94.65067353167646 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 178.0, 18.828170383762732 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:crZXnwysm5YJ:scholar.google.com/&scioq=%24f%24-Divergence+Thermodynamic+Variational+Objective:+a+Deformed+Geometry+Perspective&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "miA4AkGK00R", "title": "EF21 with Bells & Whistles: Practical Algorithmic Extensions of Modern Error Feedback", "track": "main", "status": "Reject", "tldr": "", "abstract": "First proposed by Seide et al (2014) as a heuristic, error feedback (EF) is a very popular mechanism for enforcing convergence of distributed gradient-based optimization methods enhanced with communication compression strategies based on the application of contractive compression operators. However, existing theory of EF relies on very strong assumptions (e.g., bounded gradients), and provides pessimistic convergence rates (e.g., while the best known rate for EF in the smooth nonconvex regime, and when full gradients are compressed, is $O(1/T^{2/3})$, the rate of gradient descent in the same regime is $O(1/T)$). Recently, Richt\\'{a}rik et al (2021) proposed a new error feedback mechanism, EF21, based on the construction of a Markov compressor induced by a contractive compressor. EF21 removes the aforementioned theoretical deficiencies of EF and at the same time works better in practice. In this work we propose six practical extensions of EF21: partial participation, stochastic approximation, variance reduction, proximal setting, momentum and bidirectional compression. Our extensions are supported by strong convergence theory in the smooth nonconvex and also Polyak-\u0141ojasiewicz regimes. 
Several of these techniques were never analyzed in conjunction with EF before, and in cases where they were (e.g., bidirectional compression), our rates are vastly superior.", "keywords": "EF21;error feedback;bidirectional compression;regularization;variance reduction;heavy ball momentum;stochastic approximation", "primary_area": "", "supplementary_material": "/attachment/b4be401a86c65329c81bb8a6c8cbdd97152b01ae.zip", "author": "Ilyas Fatkhullin;Igor Sokolov;Eduard Gorbunov;Zhize Li;Peter Richt\u00e1rik", "authorids": "~Ilyas_Fatkhullin1;~Igor_Sokolov3;~Eduard_Gorbunov1;~Zhize_Li1;~Peter_Richt\u00e1rik1", "gender": "Not Specified;M;M;M;M", "homepage": "https://ai.ethz.ch/people/ilyas-fatkhullin.html;https://cemse.kaust.edu.sa/people/person/igor-sokolov;https://eduardgorbunov.github.io;https://zhizeli.github.io/;https://richtarik.org", "dblp": "294/8711;202/5678-1;215/5512.html;178/3238;62/8001", "google_scholar": "UCOWHb4AAAAJ;https://scholar.google.ru/citations?user=OBbPecwAAAAJ;https://scholar.google.ru/citations?user=85j2RqQAAAAJ;uAFPPigAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-2338-0187;;;0000-0003-4380-5848", "linkedin": ";igor-sokolov-7a6b47147/;;;richtarik/", "or_profile": "~Ilyas_Fatkhullin1;~Igor_Sokolov3;~Eduard_Gorbunov1;~Zhize_Li1;~Peter_Richtarik1", "aff": "ETHZ - ETH Zurich;King Abdullah University of Science and Technology;Mohamed bin Zayed University of Artificial Intelligence;King Abdullah University of Science and Technology;King Abdullah University of Science and Technology (KAUST)", "aff_domain": "ethz.ch;kaust.edu.sa;mbzuai.ac.ae;kaust.edu.sa;kaust.edu.sa", "position": "PhD student;MS student;Postdoc;Research Scientist;Full Professor", "bibtex": "@misc{\nfatkhullin2022ef,\ntitle={{EF}21 with Bells \\& Whistles: Practical Algorithmic Extensions of Modern Error Feedback},\nauthor={Ilyas Fatkhullin and Igor Sokolov and Eduard Gorbunov and Zhize Li and Peter Richt{\\'a}rik},\nyear={2022},\nurl={https://openreview.net/forum?id=miA4AkGK00R}\n}", "github": "", "project": "", "reviewers": "yZfa;2qZ7;DvwK;YjdL", "site": "https://openreview.net/forum?id=miA4AkGK00R", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "29;18;75;46", "wc_summary_review": "15;22;15;22", "wc_main_review": "185;214;215;312", "wc_review": "229;254;305;380", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "555;430;826;600", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 42.0, 21.50581316760657 ], "wc_summary_review_avg": [ 18.5, 3.5 ], "wc_main_review_avg": [ 231.5, 48.0130190677487 ], "wc_review_avg": [ 292.0, 57.719147602853596 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 602.75, 143.1526719974168 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3198759116044975204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;1;1", 
"aff_unique_norm": "ETH Zurich;King Abdullah University of Science and Technology;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.kast.kau.edu.sa;https://mbzuai.ac.ae", "aff_unique_abbr": "ETHZ;KAUST;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;1", "aff_country_unique": "Switzerland;Saudi Arabia;United Arab Emirates" }, { "title": "What\u2019s Wrong with Deep Learning in Tree Search for Combinatorial Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6426", "id": "mk0HzdqY7i1", "poster": "", "openreview": "https://openreview.net/forum?id=mk0HzdqY7i1", "slides": "https://iclr.cc/virtual/2022/poster/6426", "video": "https://iclr.cc/virtual/2022/poster/6426", "author_site": "Maximilian B\u00f6ther, Otto Ki\u00dfig, Martin Taraz, Sarel Cohen, Karen Seidel, Tobias Friedrich", "tldr": "", "abstract": "Combinatorial optimization lies at the core of many real-world problems. Especially since the rise of graph neural networks (GNNs), the deep learning community has been developing solvers that derive solutions to NP-hard problems by learning the problem-specific solution structure. However, reproducing the results of these publications proves to be difficult. We make three contributions. First, we present an open-source benchmark suite for the NP-hard Maximum Independent Set problem, in both its weighted and unweighted variants. The suite offers a unified interface to various state-of-the-art traditional and machine learning-based solvers. Second, using our benchmark suite, we conduct an in-depth analysis of the popular guided tree search algorithm by Li et al. [NeurIPS 2018], testing various configurations on small and large synthetic and real-world graphs. By re-implementing their algorithm with a focus on code quality and extensibility, we show that the graph convolution network used in the tree search does not learn a meaningful representation of the solution structure, and can in fact be replaced by random values. Instead, the tree search relies on algorithmic techniques like graph kernelization to find good solutions. Thus, the results from the original publication are not reproducible. Third, we extend the analysis to compare the tree search implementations to other solvers, showing that the classical algorithmic solvers often are faster, while providing solutions of similar quality. 
Additionally, we analyze a recent solver based on reinforcement learning and observe that for this solver, the GNN is responsible for the competitive solution quality.", "keywords": "deep learning;combinatorial optimization;maximum independent set", "primary_area": "", "supplementary_material": "/attachment/954db6d49975c87f0bfc6818f7e3a797c02a0cb0.zip", "author": "Maximilian B\u00f6ther;Otto Ki\u00dfig;Martin Taraz;Sarel Cohen;Karen Seidel;Tobias Friedrich", "authorids": "~Maximilian_B\u00f6ther2;~Otto_Ki\u00dfig1;~Martin_Taraz1;~Sarel_Cohen1;~Karen_Seidel1;~Tobias_Friedrich1", "gender": "M;M;;;;", "homepage": "https://mboether.com;;;;https://hpi.de/friedrich/people/karen-seidel.html;", "dblp": "https://dblp.uni-trier.de/pid/273/3658;273/4154.html;;;;", "google_scholar": "Z6qj7ygAAAAJ;;;;;", "orcid": "0000-0003-4093-4361;0000-0002-9414-9206;;;;", "linkedin": "maximilian-boether/;;martin-taraz-9a34b213b/;;;", "or_profile": "~Maximilian_B\u00f6ther2;~Otto_Ki\u00dfig1;~Martin_Taraz1;~Sarel_Cohen1;~Karen_Seidel1;~Tobias_Friedrich1", "aff": "Hasso Plattner Institute;Hasso Plattner Institute;;;Hasso Plattner Institute;", "aff_domain": "hpi.de;hpi.de;;;hpi.de;", "position": "MS student;MS student;;;PhD student;", "bibtex": "@inproceedings{\nb{\\\"o}ther2022whats,\ntitle={What{\\textquoteright}s Wrong with Deep Learning in Tree Search for Combinatorial Optimization},\nauthor={Maximilian B{\\\"o}ther and Otto Ki{\\ss}ig and Martin Taraz and Sarel Cohen and Karen Seidel and Tobias Friedrich},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mk0HzdqY7i1}\n}", "github": "", "project": "", "reviewers": "oh3B;BEvM;m13R;Qd1X", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;5;3", "correctness": "3;3;4;3", "technical_novelty": "2;1;1;3", "empirical_novelty": "3;4;3;3", "wc_summary_paper": "95;130;134;97", "wc_summary_review": "48;112;85;39", "wc_main_review": "378;676;775;139", "wc_review": "521;918;994;275", "wc_reply_reviewers": "25;108;174;0", "wc_reply_authors": "755;651;953;100", "reply_reviewers": "1;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 114.0, 18.069310999592652 ], "wc_summary_review_avg": [ 71.0, 29.283100928692644 ], "wc_main_review_avg": [ 492.0, 250.77380245950732 ], "wc_review_avg": [ 677.0, 293.47487115594754 ], "wc_reply_reviewers_avg": [ 76.75, 68.92523122920953 ], "wc_reply_authors_avg": [ 614.75, 316.3719132603272 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18330070821470336106&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=mk0HzdqY7i1", "email": "hpi.de;hpi.de;;;hpi.de;", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Hasso Plattner Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.hpi.de", "aff_unique_abbr": "HPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "mk8AzPcd3x", "title": "BCDR: 
Betweenness Centrality-based Distance Resampling for Graph Shortest Distance Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "Along with unprecedented development in network analysis such as biomedical structure prediction and social relationship analysis, Shortest Distance Queries (SDQs) in graphs receive increasing attention. Approximate algorithms for SDQs with reduced complexity are of vital importance to complex graph applications. Among different approaches, embedding-based distance prediction has made a breakthrough in both efficiency and accuracy, owing to the significant performance of Graph Representation Learning (GRL). Embedding-based distance prediction usually leverages truncated random walk followed by Pointwise Mutual Information (PMI)-based optimization to embed local structural features into a dense vector on each node and integrates with a subsequent predictor for global extraction of nodes' mutual shortest distance. It has several shortcomings. A random walk, as an unconstrained node sequence, possesses limited distance exploration, failing to take into account remote nodes under the graph's shortest distance metric, while the PMI-based maximum likelihood optimization of node embeddings reflects excessively versatile local similarity, which incurs an adverse impact on the preservation of the exact shortest distance relation during the mapping from the original graph space to the embedded vector space.\n\t\t\nTo address these shortcomings, we propose in this paper a novel graph shortest distance embedding method called Betweenness Centrality-based Distance Resampling (BCDR). First, we prove from a statistical perspective that Betweenness Centrality (BC)-based random walk can occupy a wider distance range measured by the intrinsic metric in the graph domain due to its awareness of the path structure. Second, we perform Distance Resampling (DR) from original walk paths before maximum likelihood optimization instead of the PMI-based optimization and prove that this strategy preserves the distance relation with respect to any calibrated node via steering the optimization objective to reconstruct a global distance matrix. Our proposed method possesses a strong theoretical background and shows much better performance than existing methods when evaluated on a broad class of real-world graph datasets with large diameters in SDQ problems. 
It should also outperform existing methods in other graph structure-related applications.", "keywords": "graph representation learning;graph shortest path distance;shortest distance query;graph embedding;random walk", "primary_area": "", "supplementary_material": "", "author": "Haoyu Wang;Chun Yuan", "authorids": "~Haoyu_Wang1;~Chun_Yuan1", "gender": "M;M", "homepage": ";https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml", "dblp": "50/8499;", "google_scholar": ";https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Haoyu_Wang1;~Chun_Yuan1", "aff": "Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;Full Professor", "bibtex": "@misc{\nwang2022bcdr,\ntitle={{BCDR}: Betweenness Centrality-based Distance Resampling for Graph Shortest Distance Embedding},\nauthor={Haoyu Wang and Chun Yuan},\nyear={2022},\nurl={https://openreview.net/forum?id=mk8AzPcd3x}\n}", "github": "", "project": "", "reviewers": "28dr;FZhh;UCXV;7mP1", "site": "https://openreview.net/forum?id=mk8AzPcd3x", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;4;2", "correctness": "3;3;1;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "86;79;142;78", "wc_summary_review": "31;28;50;31", "wc_main_review": "487;257;174;184", "wc_review": "604;364;366;293", "wc_reply_reviewers": "72;0;74;0", "wc_reply_authors": "2038;1132;355;374", "reply_reviewers": "1;0;1;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 96.25, 26.592997198510737 ], "wc_summary_review_avg": [ 35.0, 8.74642784226795 ], "wc_main_review_avg": [ 275.5, 126.24282157810003 ], "wc_review_avg": [ 406.75, 117.61669736903855 ], "wc_reply_reviewers_avg": [ 36.5, 36.50684867254362 ], "wc_reply_authors_avg": [ 974.75, 689.2421109450582 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896258, "corr_recommendation_correctness": -0.2294157338705618, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pcCbhs5_mQUJ:scholar.google.com/&scioq=BCDR:+Betweenness+Centrality-based+Distance+Resampling+for+Graph+Shortest+Distance+Embedding&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Contact Points Discovery for Soft-Body Manipulations with Differentiable Physics", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6125", "id": "mmUA7_O9mjY", "poster": "", "openreview": "https://openreview.net/forum?id=mmUA7_O9mjY", "slides": "https://iclr.cc/virtual/2022/poster/6125", "video": "https://iclr.cc/virtual/2022/poster/6125", "author_site": "Sizhe Li, Zhiao Huang, Tao Du, Hao Su, Joshua B Tenenbaum, Chuang Gan", "tldr": "", "abstract": "Differentiable physics has recently been shown as a powerful tool for solving soft-body manipulation tasks. 
However, the differentiable physics solver often gets stuck when the initial contact points of the end effectors are sub-optimal or when performing multi-stage tasks that require contact point switching, which often leads to local minima.\nTo address this challenge, we propose a contact point discovery approach (CPDeform) that guides the stand-alone differentiable physics solver to deform various soft-body plasticines. The key idea of our approach is to integrate optimal transport-based contact points discovery into the differentiable physics solver to overcome the local minima from initial contact points or contact switching.\nOn single-stage tasks, our method can automatically find suitable initial contact points based on transport priorities. On complex multi-stage tasks, we can iteratively switch the contact points of end-effectors based on transport priorities. To evaluate the effectiveness of our method, we introduce PlasticineLab-M that extends the existing differentiable physics benchmark PlasticineLab to seven new challenging multi-stage soft-body manipulation tasks. Extensive experimental results suggest that: 1) on multi-stage tasks that are infeasible for the vanilla differentiable physics solver, our approach discovers contact points that efficiently guide the solver to completion; 2) on tasks where the vanilla solver performs sub-optimally or near-optimally, our contact point discovery method performs better than or on par with the manipulation performance obtained with handcrafted contact points.\n", "keywords": "differentiable physics;soft body manipulation", "primary_area": "", "supplementary_material": "", "author": "Sizhe Li;Zhiao Huang;Tao Du;Hao Su;Joshua B. Tenenbaum;Chuang Gan", "authorids": "~Sizhe_Li1;~Zhiao_Huang1;~Tao_Du1;~Hao_Su1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "gender": ";M;;M;;M", "homepage": "https://sizhe-li.github.io/;;https://people.iiis.tsinghua.edu.cn/~taodu/;http://ai.ucsd.edu/~haosu;;http://people.csail.mit.edu/ganchuang/", "dblp": ";172/1410;51/3026-1;09/4945-1;t/JoshuaBTenenbaum;139/6993", "google_scholar": ";;https://scholar.google.com/citations?hl=en;1P8Zu04AAAAJ;;PTeSCbIAAAAJ", "orcid": ";;0000-0001-7337-7667;;;", "linkedin": ";;;;;", "or_profile": "~Sizhe_Li1;~Zhiao_Huang1;~Tao_Du1;~Hao_Su1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "aff": "University of Rochester;University of California, San Diego, University of California, San Diego;Massachusetts Institute of Technology;University of California, San Diego;Massachusetts Institute of Technology;MIT-IBM Watson AI Lab", "aff_domain": "rochester.edu;eng.ucsd.edu;mit.edu;ucsd.edu;mit.edu;ibm.com", "position": "Undergrad student;PhD student;Postdoc;Assistant Professor;Professor;PhD student", "bibtex": "@inproceedings{\nli2022contact,\ntitle={Contact Points Discovery for Soft-Body Manipulations with Differentiable Physics},\nauthor={Sizhe Li and Zhiao Huang and Tao Du and Hao Su and Joshua B. 
Tenenbaum and Chuang Gan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mmUA7_O9mjY}\n}", "github": "", "project": "", "reviewers": "rUod;P7JN;Ept9", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;3;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "30;123;103", "wc_summary_review": "48;57;48", "wc_main_review": "463;238;183", "wc_review": "541;418;334", "wc_reply_reviewers": "199;98;0", "wc_reply_authors": "1142;735;735", "reply_reviewers": "1;1;0", "reply_authors": "3;2;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 85.33333333333333, 39.96943276499625 ], "wc_summary_review_avg": [ 51.0, 4.242640687119285 ], "wc_main_review_avg": [ 294.6666666666667, 121.1289487373775 ], "wc_review_avg": [ 431.0, 85.00588214941364 ], "wc_reply_reviewers_avg": [ 99.0, 81.24448699245178 ], "wc_reply_authors_avg": [ 870.6666666666666, 191.8616399619499 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17825405461433914787&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=mmUA7_O9mjY", "email": "rochester.edu;eng.ucsd.edu;mit.edu;ucsd.edu;mit.edu;ibm.com", "author_num": 6, "aff_unique_index": "0;1;2;1;2;2", "aff_unique_norm": "University of Rochester;University of California, San Diego;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rochester.edu;https://www.ucsd.edu;https://web.mit.edu", "aff_unique_abbr": "U of R;UCSD;MIT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "mniwiEAuzL", "title": "Sample-efficient actor-critic algorithms with an etiquette for zero-sum Markov games", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce algorithms based on natural policy gradient and two time-scale natural actor-critic, and analyze their sample complexity for solving two-player zero-sum Markov games in the tabular case. Our results improve the best-known sample complexities of policy gradient/actor-critic methods for convergence to Nash equilibrium in the multi-agent setting. We use the error propagation scheme in approximate dynamic programming, recent advances for global convergence of policy gradient methods, temporal difference learning, and techniques from the stochastic primal-dual optimization literature. Our algorithms feature two stages, requiring agents to agree on an etiquette before starting their interactions, which is feasible for instance in self-play. On the other hand, the agents only have access to the joint reward and joint next state, and not to each other's actions or policies. Our sample complexities also match the best-known results for global convergence of policy gradient and two time-scale actor-critic algorithms in the single-agent setting.
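As a minimal illustration of natural-policy-gradient self-play converging to a Nash equilibrium, the sketch below runs multiplicative weights (the form natural policy gradient takes under a softmax policy parameterization) on a one-state zero-sum matrix game and reports the exploitability of the averaged policies. This is a stand-in for the tabular Markov-game setting of the paper, not its actor-critic algorithm; the payoff matrix, step size, and iteration count are assumptions.

```python
import numpy as np

# Zero-sum matrix game: row player maximizes x^T A y, column player minimizes.
A = np.array([[ 0., -1.,  1.],
              [ 1.,  0., -1.],
              [-1.,  1.,  0.]])   # rock-paper-scissors payoff

def mwu_selfplay(A, eta=0.1, T=5000):
    m, n = A.shape
    x, y = np.ones(m) / m, np.ones(n) / n
    x_avg, y_avg = np.zeros(m), np.zeros(n)
    for _ in range(T):
        # Multiplicative-weights updates: natural policy gradient on softmax policies.
        x = x * np.exp(eta * (A @ y));   x /= x.sum()
        y = y * np.exp(-eta * (A.T @ x)); y /= y.sum()
        x_avg += x; y_avg += y
    return x_avg / T, y_avg / T

x, y = mwu_selfplay(A)
# Exploitability: how much either player could gain by deviating.
gap = (A @ y).max() - (A.T @ x).min()
print("approx. Nash:", x.round(3), y.round(3), "exploitability:", round(gap, 3))
```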
We provide numerical verification of our method for a two-player bandit environment and a two player game, Alesia. We observe improved empirical performance as compared to the recently proposed optimistic gradient descent ascent variant for Markov games.", "keywords": "zero sum Markov-games;policy gradient;actor-critic;temporal difference", "primary_area": "", "supplementary_material": "/attachment/c4261cf2e1b52e45433a6df989e30a8c1b24227c.zip", "author": "Ahmet Alacaoglu;Luca Viano;Niao He;Volkan Cevher", "authorids": "~Ahmet_Alacaoglu2;~Luca_Viano1;~Niao_He3;~Volkan_Cevher1", "gender": ";;M;", "homepage": "https://ahmetalacaoglu.github.io;https://scholar.google.com/citations?hl=en&user=e9Bpg5gAAAAJ;http://lions.epfl.ch;http://people.inf.ethz.ch/niaohe", "dblp": "209/4889;268/8179;70/5301;https://dblp.uni-trier.de/pers/h/He:Niao.html", "google_scholar": "-yRi8D4AAAAJ;E_dAUKEAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ;iNcA81MAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ahmet_Alacaoglu2;~Luca_Viano1;~Volkan_Cevher1;~Niao_He1", "aff": "University of Wisconsin-Madison;EPFL - EPF Lausanne;Swiss Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "wisc.edu;epfl.ch;epfl.ch;ethz.ch", "position": "Postdoc;PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nalacaoglu2022sampleefficient,\ntitle={Sample-efficient actor-critic algorithms with an etiquette for zero-sum Markov games},\nauthor={Ahmet Alacaoglu and Luca Viano and Niao He and Volkan Cevher},\nyear={2022},\nurl={https://openreview.net/forum?id=mniwiEAuzL}\n}", "github": "", "project": "", "reviewers": "HqK9;jDaF;JFQa;TPgg", "site": "https://openreview.net/forum?id=mniwiEAuzL", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;2", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;0;0;0", "wc_summary_paper": "70;37;111;77", "wc_summary_review": "33;21;41;15", "wc_main_review": "524;478;281;161", "wc_review": "627;536;433;253", "wc_reply_reviewers": "0;216;43;0", "wc_reply_authors": "1073;735;402;272", "reply_reviewers": "0;1;1;0", "reply_authors": "3;3;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 73.75, 26.280934153869037 ], "wc_summary_review_avg": [ 27.5, 10.136567466356647 ], "wc_main_review_avg": [ 361.0, 147.1886544540713 ], "wc_review_avg": [ 462.25, 138.9449081470782 ], "wc_reply_reviewers_avg": [ 64.75, 89.07124957021766 ], "wc_reply_authors_avg": [ 620.5, 311.0711333441276 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ST1R-STyMLgJ:scholar.google.com/&scioq=Sample-efficient+actor-critic+algorithms+with+an+etiquette+for+zero-sum+Markov+games&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "University of Wisconsin-Madison;EPFL;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://www.epfl.ch;https://www.ethz.ch", "aff_unique_abbr": "UW-Madison;EPFL;ETH Zurich", "aff_campus_unique_index": "0;1", 
"aff_campus_unique": "Madison;Lausanne;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;Switzerland" }, { "title": "Peek-a-Boo: What (More) is Disguised in a Randomly Weighted Neural Network, and How to Find It Efficiently", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6586", "id": "moHCzz6D5H3", "poster": "", "openreview": "https://openreview.net/forum?id=moHCzz6D5H3", "slides": "https://iclr.cc/virtual/2022/poster/6586", "video": "https://iclr.cc/virtual/2022/poster/6586", "author_site": "Xiaohan Chen, Jason Zhang, Zhangyang Wang", "tldr": "", "abstract": "Sparse neural networks (NNs) are intensively investigated in literature due to their appeal in saving storage, memory, and computational costs. A recent work (Ramanujan et al., 2020) showed that, different from conventional pruning-and-finetuning pipeline, there exist hidden subnetworks in randomly initialized NNs that have good performance without training the weights. However, such \"hidden subnetworks\" have mediocre performances and require an expensive edge-popup algorithm to search for them. In this work, we define an extended class of subnetworks in randomly initialized NNs called disguised subnetworks, which are not only \"hidden\" in the random networks but also \"disguised\" -- hence can only be \"unmasked\" with certain transformations on weights. We argue that the unmasking process plays an important role in enlarging the capacity of the subnetworks and thus grants two major benefits: (i) the disguised subnetworks easily outperform the hidden counterparts; (ii) the unmasking process helps to relax the quality requirement on the sparse subnetwork mask so that the expensive edge-popup algorithm can be replaced with more efficient alternatives. On top of this new concept, we propose a novel two-stage algorithm that plays a Peek-a-Boo (PaB) game to identify the disguised subnetworks with a combination of two operations: (1) searching efficiently for a subnetwork at random initialization; (2) unmasking the disguise by learning to transform the resulting subnetwork's remaining weights. Furthermore, we show that the unmasking process can be efficiently implemented (a) without referring to any latent weights or scores; and (b) by only leveraging approximated gradients, so that the whole training algorithm is computationally light. Extensive experiments with several large models (ResNet-18, ResNet-50, and WideResNet-28) and datasets (CIFAR-10, CIFAR-100 and ImageNet) demonstrate the competency of PaB over edge-popup and other counterparts. 
Our code is available at: https://github.com/VITA-Group/Peek-a-Boo.", "keywords": "Sparse Neural Network;Lottery Ticket Hypothesis;Efficient Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Xiaohan Chen;Jason Zhang;Zhangyang Wang", "authorids": "~Xiaohan_Chen1;~Jason_Zhang2;~Zhangyang_Wang1", "gender": "M;;M", "homepage": "http://xiaohanchen.com;;https://vita-group.github.io", "dblp": "94/3802;;119/4026", "google_scholar": "https://scholar.google.com/citations?authuser=1;;pxFyKAIAAAAJ", "orcid": "0000-0002-0360-0402;;", "linkedin": "xiaohan-chen-400b00147/;jason-zhang271/;", "or_profile": "~Xiaohan_Chen1;~Jason_Zhang2;~Zhangyang_Wang1", "aff": "University of Texas, Austin;Carnegie Mellon University;University of Texas, Austin", "aff_domain": "utexas.edu;cmu.edu;utexas.edu", "position": "PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nchen2022peekaboo,\ntitle={Peek-a-Boo: What (More) is Disguised in a Randomly Weighted Neural Network, and How to Find It Efficiently},\nauthor={Xiaohan Chen and Jason Zhang and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=moHCzz6D5H3}\n}", "github": "", "project": "", "reviewers": "rMZy;7V6H;jduC;QSy2", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;4;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;2;4;3", "empirical_novelty": "2;2;4;4", "wc_summary_paper": "54;253;72;78", "wc_summary_review": "42;92;47;57", "wc_main_review": "324;935;237;143", "wc_review": "420;1280;356;278", "wc_reply_reviewers": "0;568;0;17", "wc_reply_authors": "696;1683;271;543", "reply_reviewers": "0;2;0;1", "reply_authors": "2;5;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 114.25, 80.59272609857543 ], "wc_summary_review_avg": [ 59.5, 19.525624189766635 ], "wc_main_review_avg": [ 409.75, 309.93497301853495 ], "wc_review_avg": [ 583.5, 405.2564003196001 ], "wc_reply_reviewers_avg": [ 146.25, 243.59636183654302 ], "wc_reply_authors_avg": [ 798.25, 533.0062734152385 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7777777777777777, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5346163527672701724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=moHCzz6D5H3", "email": "utexas.edu;cmu.edu;utexas.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Texas at Austin;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.cmu.edu", "aff_unique_abbr": "UT Austin;CMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "morSrUyWG26", "title": "AutoOED: Automated Optimal Experimental Design Platform with Data- and Time-Efficient Multi-Objective Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present AutoOED, an Automated Optimal Experimental Design platform powered by machine learning to accelerate the discovery of solutions with
optimal objective trade-offs. To solve expensive multi-objective problems in a data-efficient manner, we implement popular multi-objective Bayesian optimization (MOBO) algorithms with state-of-the-art performance in a modular framework. To further accelerate the optimization in a time-efficient manner, we propose a novel strategy called Believer-Penalizer (BP), which allows batch experiments to be accelerated asynchronously without affecting performance. AutoOED serves as a testbed for machine learning researchers to quickly develop and evaluate their own MOBO algorithms. We also provide a graphical user interface (GUI) for users with little or no experience with coding, machine learning, or optimization to visualize and guide the experiment design intuitively. Finally, we demonstrate that AutoOED can control and guide real-world hardware experiments in a fully automated way without human intervention.", "keywords": "optimal experiment design;bayesian optimization;multi-objective optimization;software platform", "primary_area": "", "supplementary_material": "", "author": "Yunsheng Tian;Mina Konakovic Lukovic;Michael Foshey;Timothy Erps;Beichen Li;Wojciech Matusik", "authorids": "~Yunsheng_Tian1;~Mina_Konakovic_Lukovic1;~Michael_Foshey1;~Timothy_Erps1;~Beichen_Li1;~Wojciech_Matusik2", "gender": "M;F;;;M;M", "homepage": "https://www.yunshengtian.com/;http://people.csail.mit.edu/mina/;;;https://people.csail.mit.edu/beichen;https://cdfg.mit.edu/wojciech", "dblp": "224/0723;;;;;", "google_scholar": "sf6RjM4AAAAJ;32Q2ni8AAAAJ;;https://scholar.google.com/scholar?hl=en;zR5wuKUAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-2895-0206;;;0000-0002-9271-0055;0000-0003-0212-5643", "linkedin": ";;michael-foshey/;;beichen-li-ba9b34106;wojciech-matusik-67238126/", "or_profile": "~Yunsheng_Tian1;~Mina_Konakovic_Lukovic1;~Michael_Foshey1;~Timothy_Erps1;~Beichen_Li1;~Wojciech_Matusik2", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "csail.mit.edu;mit.edu;mit.edu;;mit.edu;mit.edu", "position": "PhD student;Assistant Professor;Researcher;;PhD student;Full Professor", "bibtex": "@misc{\ntian2022autooed,\ntitle={Auto{OED}: Automated Optimal Experimental Design Platform with Data- and Time-Efficient Multi-Objective Optimization},\nauthor={Yunsheng Tian and Mina Konakovic Lukovic and Michael Foshey and Timothy Erps and Beichen Li and Wojciech Matusik},\nyear={2022},\nurl={https://openreview.net/forum?id=morSrUyWG26}\n}", "github": "", "project": "", "reviewers": "QfiR;iLNv;Bmro;f9tL", "site": "https://openreview.net/forum?id=morSrUyWG26", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;5;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "115;36;33;128", "wc_summary_review": "44;9;21;44", "wc_main_review": "610;158;181;170", "wc_review": "769;203;235;342", "wc_reply_reviewers": "97;0;0;0", "wc_reply_authors": "665;211;197;74", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 78.0, 43.754999714318366 ], "wc_summary_review_avg": [ 29.5, 15.107944929738128 ], "wc_main_review_avg": [ 279.75, 
190.84335854307324 ], "wc_review_avg": [ 387.25, 226.33423846161676 ], "wc_reply_reviewers_avg": [ 24.25, 42.00223208354527 ], "wc_reply_authors_avg": [ 286.75, 224.79365538199693 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7979805672751581623&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "mqIeP6qPvta", "title": "FoveaTer: Foveated Transformer for Image Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many animals and humans process the visual field with varying spatial resolution (foveated vision) and use peripheral processing to make eye movements and point the fovea to acquire high-resolution information about objects of interest. This architecture results in computationally efficient rapid scene exploration. Recent progress in vision Transformers has brought about new alternatives to the traditionally convolution-reliant computer vision systems. However, the Transformer models do not explicitly model the foveated properties of the visual system nor the interaction between eye movements and the classification task. We propose foveated Transformer (FoveaTer) model, which uses pooling regions and eye movements to perform object classification tasks using a vision Transformer architecture. Our proposed model pools the image features using squared pooling regions, an approximation to the biologically-inspired foveated architecture, and uses the pooled features as an input to a Transformer Network. It decides on subsequent fixation locations based on the attention assigned by the Transformer to various locations from previous and present fixations. The model uses a confidence threshold to stop scene exploration, dynamically allocating more fixation/computational resources to more challenging images. After reaching the stopping criterion, the model makes the final object category decision. We construct a Foveated model using our proposed approach and compare it against a Full-resolution model, which does not contain any pooling. On the ImageNet-100 dataset, our Foveated model achieves the accuracy of the Full-resolution model using only 35% transformer computations and 73% overall computations. Finally, we demonstrate our model's robustness against adversarial attacks, where it outperforms the full-resolution model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aditya Jonnalagadda;William Yang Wang;B.S. 
Manjunath;Miguel Eckstein", "authorids": "~Aditya_Jonnalagadda1;~William_Yang_Wang2;~B.S._Manjunath1;~Miguel_Eckstein1", "gender": "M;;;M", "homepage": "https://viu.psych.ucsb.edu/people/aditya-jonnalagadda;;;https://psych.ucsb.edu/people/faculty/miguel-eckstein", "dblp": ";;;56/975", "google_scholar": ";;;G5dQztgAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Aditya_Jonnalagadda1;~William_Yang_Wang2;~B.S._Manjunath1;~Miguel_Eckstein1", "aff": "UC Santa Barbara;;;", "aff_domain": "ucsb.edu;;;", "position": "PhD student;;;", "bibtex": "@misc{\njonnalagadda2022foveater,\ntitle={FoveaTer: Foveated Transformer for Image Classification},\nauthor={Aditya Jonnalagadda and William Yang Wang and B.S. Manjunath and Miguel Eckstein},\nyear={2022},\nurl={https://openreview.net/forum?id=mqIeP6qPvta}\n}", "github": "", "project": "", "reviewers": "czVG;gFeo;EWqw;NG64", "site": "https://openreview.net/forum?id=mqIeP6qPvta", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;4;4", "correctness": "3;2;4;3", "technical_novelty": "2;3;4;4", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "55;51;94;44", "wc_summary_review": "39;173;66;41", "wc_main_review": "657;833;189;418", "wc_review": "751;1057;349;503", "wc_reply_reviewers": "476;447;58;244", "wc_reply_authors": "521;1009;116;280", "reply_reviewers": "2;1;1;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 61.0, 19.45507645834372 ], "wc_summary_review_avg": [ 79.75, 54.87884382892919 ], "wc_main_review_avg": [ 524.25, 243.22353401757815 ], "wc_review_avg": [ 665.0, 267.93655965545275 ], "wc_reply_reviewers_avg": [ 306.25, 168.91473440762945 ], "wc_reply_authors_avg": [ 481.5, 336.90094983540786 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.19611613513818402, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9670164795000389792&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "ms7xJWbf8Ku", "title": "Efficient Packing: Towards 2x NLP Speed-Up without Loss of Accuracy for BERT", "track": "main", "status": "Reject", "tldr": "", "abstract": "We find that at sequence length 512 padding tokens represent in excess of 50% of the Wikipedia dataset used for pretraining BERT (Bidirectional Encoder Representations from Transformers). Therefore by removing all padding we achieve a 2x speed-up in terms of sequences/sec. To exploit this characteristic of the dataset, we develop and contrast two deterministic packing algorithms. Both algorithms rely on the assumption that sequences are interchangeable and therefore packing can be performed on the histogram of sequence lengths, rather than per sample. This transformation of the problem leads to algorithms which are fast and have linear complexity in dataset size. 
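For intuition, here is a hedged sketch of packing on a length histogram: a greedy best-fit-decreasing pass, which is a simplification of the histogram-packing algorithms described next rather than either of them. The example lengths and the 512-token limit are illustrative assumptions.

```python
from collections import Counter

def pack_histogram(lengths, max_len=512):
    """Greedy best-fit-decreasing packing driven by a histogram of
    sequence lengths. Simplified stand-in for SPFHP/NNLSHP: lengths are
    treated as interchangeable, so only the histogram matters."""
    hist = Counter(lengths)
    packs, space = [], []            # contents and remaining space per pack
    for length in sorted(hist, reverse=True):
        for _ in range(hist[length]):
            # Place into the fitting pack with the least remaining space.
            best = None
            for i, s in enumerate(space):
                if s >= length and (best is None or s < space[best]):
                    best = i
            if best is None:
                packs.append([length]); space.append(max_len - length)
            else:
                packs[best].append(length); space[best] -= length
    return packs

lengths = [512] * 2 + [300] * 3 + [200] * 5 + [60] * 10
packs = pack_histogram(lengths)
print(len(packs), "packs; efficiency:",
      round(sum(lengths) / (len(packs) * 512), 3))
```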
The shortest-pack-first histogram-packing (SPFHP) algorithm determines the packing order for the Wikipedia dataset of over 16M sequences in 0.02 seconds. The non-negative least-squares histogram-packing (NNLSHP) algorithm converges in 28.4 seconds but produces solutions which are more depth efficient, managing to get near optimal packing by combining a maximum of 3 sequences in one sample. Using the dataset with multiple sequences per sample requires adjusting the model and the hyperparameters. We demonstrate that these changes are straightforward to implement and have relatively little impact on the achievable performance gain on modern hardware. Finally, we pretrain BERT-Large using the packed dataset, demonstrating no loss of convergence and the desired 2x speed-up.", "keywords": "deep learning;BERT;IPU;GPU;hardware-acceleration;padding;Wikipedia;NLP", "primary_area": "", "supplementary_material": "/attachment/51691df25096ababb74ff5095638e63912990ccc.zip", "author": "Matej Kosec;Sheng Fu;Mario Michael Krell", "authorids": "~Matej_Kosec1;~Sheng_Fu1;~Mario_Michael_Krell2", "gender": ";M;M", "homepage": ";https://www.linkedin.com/in/shengfu2019/;http://mmkrell.github.io/CV/index.html", "dblp": ";;142/4016", "google_scholar": ";;https://scholar.google.de/citations?user=77LZ0ckAAAAJ", "orcid": ";;", "linkedin": "matejkosec/;;", "or_profile": "~Matej_Kosec1;~Sheng_Fu1;~Mario_Michael_Krell2", "aff": "Graphcore Inc;;Graphcore", "aff_domain": "graphcore.ai;;graphcore.ai", "position": "AI Applications Specialist;;Researcher", "bibtex": "@misc{\nkosec2022efficient,\ntitle={Efficient Packing: Towards 2x {NLP} Speed-Up without Loss of Accuracy for {BERT}},\nauthor={Matej Kosec and Sheng Fu and Mario Michael Krell},\nyear={2022},\nurl={https://openreview.net/forum?id=ms7xJWbf8Ku}\n}", "github": "", "project": "", "reviewers": "4Mr1;nPLY;xs2k;EN5A;g5tJ", "site": "https://openreview.net/forum?id=ms7xJWbf8Ku", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;4;4;3;3", "correctness": "3;3;3;4;3", "technical_novelty": "2;3;2;3;3", "empirical_novelty": "1;4;2;2;2", "wc_summary_paper": "76;60;74;28;88", "wc_summary_review": "25;47;5;3;62", "wc_main_review": "264;219;548;140;271", "wc_review": "365;326;627;171;421", "wc_reply_reviewers": "56;27;98;0;139", "wc_reply_authors": "382;358;275;0;413", "reply_reviewers": "1;1;2;0;1", "reply_authors": "1;1;1;0;3", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.9797958971132712 ], "wc_summary_paper_avg": [ 65.2, 20.614557962760202 ], "wc_summary_review_avg": [ 28.4, 23.148218073968458 ], "wc_main_review_avg": [ 288.4, 137.93418720534805 ], "wc_review_avg": [ 382.0, 147.98107987171875 ], "wc_reply_reviewers_avg": [ 64.0, 49.61854492022111 ], "wc_reply_authors_avg": [ 285.6, 149.96346221663464 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 1.2, 0.9797958971132712 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.74535599249993, "corr_recommendation_correctness": 0.4564354645876385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RWKY57PJ4MYJ:scholar.google.com/&scioq=Efficient+Packing:+Towards+2x+NLP+Speed-Up+without+Loss+of+Accuracy+for+BERT&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Graphcore Inc;Graphcore", 
"aff_unique_dep": ";", "aff_unique_url": "https://www.graphcore.ai;https://www.graphcore.ai", "aff_unique_abbr": "Graphcore;Graphcore", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Learned Simulators for Turbulence", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6985", "id": "msRBojTz-Nh", "poster": "", "openreview": "https://openreview.net/forum?id=msRBojTz-Nh", "slides": "https://iclr.cc/virtual/2022/poster/6985", "video": "https://iclr.cc/virtual/2022/poster/6985", "author_site": "Kimberly Stachenfeld, Drummond Fielding, Dmitrii Kochkov, Miles Cranmer, Tobias Pfaff, Jonathan Godwin, Can Cui, Shirley Ho, Peter Battaglia, Alvaro Sanchez Gonzalez", "tldr": "", "abstract": "Turbulence simulation with classical numerical solvers requires high-resolution grids to accurately resolve dynamics. Here we train learned simulators at low spatial and temporal resolutions to capture turbulent dynamics generated at high resolution. We show that our proposed model can simulate turbulent dynamics more accurately than classical numerical solvers at the comparably low resolutions across various scientifically relevant metrics. Our model is trained end-to-end from data and is capable of learning a range of challenging chaotic and turbulent dynamics at low resolution, including trajectories generated by the state-of-the-art Athena++ engine. We show that our simpler, general-purpose architecture outperforms various more specialized, turbulence-specific architectures from the learned turbulence simulation literature. In general, we see that learned simulators yield unstable trajectories; however, we show that tuning training noise and temporal downsampling solves this problem. We also find that while generalization beyond the training distribution is a challenge for learned models, training noise, added loss constraints, and dataset augmentation can help. 
Broadly, we conclude that our learned simulator outperforms traditional solvers run on coarser grids, and emphasize that simple design choices can offer stability and robust generalization.", "keywords": "learned simulation;turbulence", "primary_area": "", "supplementary_material": "", "author": "Kim Stachenfeld;Drummond Buschman Fielding;Dmitrii Kochkov;Miles Cranmer;Tobias Pfaff;Jonathan Godwin;Can Cui;Shirley Ho;Peter Battaglia;Alvaro Sanchez-Gonzalez", "authorids": "~Kim_Stachenfeld1;~Drummond_Buschman_Fielding1;~Dmitrii_Kochkov1;~Miles_Cranmer1;~Tobias_Pfaff1;~Jonathan_Godwin1;~Can_Cui2;~Shirley_Ho2;~Peter_Battaglia1;~Alvaro_Sanchez-Gonzalez1", "gender": "F;;M;M;;F;;M;M;", "homepage": "https://neurokim.com/;https://dfielding14.github.io;;http://tobiaspfaff.com;https://www.linkedin.com/in/jonathan-godwin-12907638/;;https://www.shirleyho.space/;;;https://astroautomata.com/", "dblp": "155/1888;;;67/7591;;;162/2218;41/3400;222/1889;205/2493", "google_scholar": "jNtH2WUAAAAJ;;https://scholar.google.com/citations?hl=en;3oUgDKQAAAAJ;;;fhOi--4AAAAJ;https://scholar.google.co.uk/citations?user=nQ7Ij30AAAAJ;https://scholar.google.co.uk/citations?user=d1oQ8NcAAAAJ;10WfwCQAAAAJ", "orcid": ";0000-0003-3806-8548;;;;;;;;0000-0002-6458-3423", "linkedin": ";;dmitrii-kochkov/;;;elainecc;;;;milescranmer/", "or_profile": "~Kim_Stachenfeld1;~Drummond_Buschman_Fielding1;~Dmitrii_Kochkov1;~Tobias_Pfaff1;~Jonathan_Godwin1;~Can_Cui2;~Shirley_Ho2;~Peter_Battaglia1;~Alvaro_Sanchez-Gonzalez1;~Miles_Cranmer2", "aff": "Google DeepMind;Simons Foundation;Google;Deepmind;Google DeepMind;;Carnegie Mellon University;Google DeepMind;Google DeepMind;Princeton University", "aff_domain": "deepmind.com;simonsfoundation.org;google.com;google.com;deepmind.com;;cmu.edu;google.com;google.com;princeton.edu", "position": "Research Scientist;Postdoc;Researcher;Research scientist;Researcher;;Associate Professor;Researcher;Senior Research Engineer;PhD student", "bibtex": "@inproceedings{\nstachenfeld2022learned,\ntitle={Learned Simulators for Turbulence},\nauthor={Kim Stachenfeld and Drummond Buschman Fielding and Dmitrii Kochkov and Miles Cranmer and Tobias Pfaff and Jonathan Godwin and Can Cui and Shirley Ho and Peter Battaglia and Alvaro Sanchez-Gonzalez},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=msRBojTz-Nh}\n}", "github": "", "project": "", "reviewers": "HsnX;zinf;YTSP;1vW7", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "106;40;87;53", "wc_summary_review": "27;71;57;46", "wc_main_review": "142;206;341;292", "wc_review": "275;317;485;391", "wc_reply_reviewers": "25;0;69;180", "wc_reply_authors": "644;639;718;528", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 71.5, 26.291633650269812 ], "wc_summary_review_avg": [ 50.25, 16.08376510646683 ], "wc_main_review_avg": [ 245.25, 76.73778404410699 ], "wc_review_avg": [ 367.0, 79.78721702127478 ], "wc_reply_reviewers_avg": [ 68.5, 68.95106960736722 ], "wc_reply_authors_avg": [ 632.25, 67.83205363248263 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], 
"authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9272021876665731793&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=msRBojTz-Nh", "email": "deepmind.com;simonsfoundation.org;google.com;google.com;deepmind.com;;cmu.edu;google.com;google.com;princeton.edu", "author_num": 10, "aff_unique_index": "0;1;0;2;0;3;0;0;4", "aff_unique_norm": "Google;Simons Foundation;DeepMind;Carnegie Mellon University;Princeton University", "aff_unique_dep": "Google DeepMind;;;;", "aff_unique_url": "https://deepmind.com;https://www.simonsfoundation.org;https://deepmind.com;https://www.cmu.edu;https://www.princeton.edu", "aff_unique_abbr": "DeepMind;Simons Foundation;DeepMind;CMU;Princeton", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;0;1;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "mvq4blDaCkN", "title": "Efficient Semi-Supervised Adversarial Training without Guessing Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Adversarial training has been proved to be the most effective defensive strategy to protect models from adversarial attacks. In the practical application scenario of adversarial training, we face not only labeled data, but also an enormous amount of unlabeled data. However, existing adversarial training methods are naturally targeting supervised learning problems. To adapt to semi-supervised learning problems, they need to estimate labels for unlabeled data in advance, which inevitably degenerates the performance of the learned model due to the bias on the estimation of labels for unlabeled data. \nTo mitigate this degeneration, in this paper, we propose a new semi-supervised adversarial training framework via maximizing AUCs which is also a minimax problem but treats the unlabeled samples as both positive and negative ones, so that we do not need to guess the labels for unlabeled data. Unsurprisingly, the minimax problem can be solved via the traditional adversarial training algorithm by extending singly stochastic gradients to triply stochastic gradients, to adapt to the three (i.e. positive, negative, and unlabeled) data sources. To further accelerate the training procedure, we transform the minimax adversarial training problem into an equivalent minimization one based on the kernel perspective. For the minimization problem, we discuss scalable and efficient algorithms not only for deep neural networks but also for kernel support vector machines. 
Extensive experimental results show that our algorithms not only achieve better generalization performance against various adversarial attacks, but also enjoy efficiency and scalability when considered from the kernel perspective.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/3360e0d2bf59f7322148c2ef1b6873159abc275c.zip", "author": "Huimin Wu;Heng Huang;Bin Gu", "authorids": "~Huimin_Wu1;~Heng_Huang1;~Bin_Gu1", "gender": ";M;M", "homepage": "https://www.researchgate.net/profile/Huimin-Wu-7;https://www.cs.umd.edu/~heng/;https://mbzuai.ac.ae/study/faculty/bin-gu/", "dblp": ";03/281;29/1758-1", "google_scholar": ";4OqLaDwAAAAJ;Vo8OgCgAAAAJ", "orcid": ";;0000-0001-6049-1815", "linkedin": ";;", "or_profile": "~Huimin_Wu1;~Heng_Huang1;~Bin_Gu1", "aff": "NUIST;University of Pittsburgh;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "nuist.edu.cn;pitt.edu;mbzuai.ac.ae", "position": "MS student;Full Professor;Assistant Professor", "bibtex": "@misc{\nwu2022efficient,\ntitle={Efficient Semi-Supervised Adversarial Training without Guessing Labels},\nauthor={Huimin Wu and Heng Huang and Bin Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=mvq4blDaCkN}\n}", "github": "", "project": "", "reviewers": "b8fQ;fvdt;NyVU", "site": "https://openreview.net/forum?id=mvq4blDaCkN", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "5;4;3", "correctness": "1;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "33;117;47", "wc_summary_review": "12;53;60", "wc_main_review": "171;293;542", "wc_review": "216;463;649", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 65.66666666666667, 36.745370078721784 ], "wc_summary_review_avg": [ 41.666666666666664, 21.171259344267224 ], "wc_main_review_avg": [ 335.3333333333333, 154.3898384681525 ], "wc_review_avg": [ 442.6666666666667, 177.35526180209283 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.9819805060619659, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16500116012421067216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Nanjing University of Information Science & Technology;University of Pittsburgh;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "http://www.nuist.edu.cn/;https://www.pitt.edu;https://mbzuai.ac.ae", "aff_unique_abbr": "NUIST;Pitt;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;United States;United Arab Emirates" }, { "title": "Policy Smoothing for Provably Robust Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6719", "id": "mwdfai8NBrJ", "poster": "", "openreview": "https://openreview.net/forum?id=mwdfai8NBrJ", "slides": "https://iclr.cc/virtual/2022/poster/6719", "video": 
"https://iclr.cc/virtual/2022/poster/6719", "author_site": "Aounon Kumar, Alexander Levine, Soheil Feizi", "tldr": "", "abstract": "The study of provable adversarial robustness for deep neural networks (DNNs) has mainly focused on $\\textit{static}$ supervised learning tasks such as image classification. However, DNNs have been used extensively in real-world $\\textit{adaptive}$ tasks such as reinforcement learning (RL), making such systems vulnerable to adversarial attacks as well. Prior works in provable robustness in RL seek to certify the behaviour of the victim policy at every time-step against a non-adaptive adversary using methods developed for the static setting. But in the real world, an RL adversary can infer the defense strategy used by the victim agent by observing the states, actions, etc. from previous time-steps and adapt itself to produce stronger attacks in future steps (e.g., by focusing more on states critical to the agent's performance). We present an efficient procedure, designed specifically to defend against an adaptive RL adversary, that can directly certify the total reward without requiring the policy to be robust at each time-step. Focusing on randomized smoothing based defenses, our main theoretical contribution is to prove an $\\textit{adaptive version}$ of the Neyman-Pearson Lemma -- a key lemma for smoothing-based certificates -- where the adversarial perturbation at a particular time can be a stochastic function of current and previous observations and states as well as previous actions. Building on this result, we propose $\\textit{policy smoothing}$ where the agent adds a Gaussian noise to its observation at each time-step before passing it through the policy function. Our robustness certificates guarantee that the final total reward obtained by policy smoothing remains above a certain threshold, even though the actions at intermediate time-steps may change under the attack. We show that our certificates are $\\textit{tight}$ by constructing a worst-case scenario that achieves the bounds derived in our analysis. 
Our experiments on various environments like Cartpole, Pong, Freeway and Mountain Car show that our method can yield meaningful robustness guarantees in practice.\n", "keywords": "Reinforcement Learning;Provable Adversarial Robustness;Randomized Smoothing", "primary_area": "", "supplementary_material": "/attachment/3b9b7584c1f052d05681c01e9c2d48b380c17c27.zip", "author": "Aounon Kumar;Alexander Levine;Soheil Feizi", "authorids": "~Aounon_Kumar1;~Alexander_Levine2;~Soheil_Feizi2", "gender": "M;;M", "homepage": "https://aounon.github.io;;https://www.cs.umd.edu/~sfeizi/", "dblp": "191/8334;;57/2132", "google_scholar": "NjhpUykAAAAJ;;lptAmrMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Aounon_Kumar1;~Alexander_Levine2;~Soheil_Feizi2", "aff": "University of Maryland, College Park;;University of Maryland, College Park", "aff_domain": "umd.edu;;umd.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nkumar2022policy,\ntitle={Policy Smoothing for Provably Robust Reinforcement Learning},\nauthor={Aounon Kumar and Alexander Levine and Soheil Feizi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=mwdfai8NBrJ}\n}", "github": "", "project": "", "reviewers": "xL3X;LDzS;4eym;RqNT;6S68", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "2;2;5;3;4", "correctness": "3;4;3;4;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "40;34;53;58;173", "wc_summary_review": "34;39;43;53;39", "wc_main_review": "471;185;360;178;232", "wc_review": "545;258;456;289;444", "wc_reply_reviewers": "8;0;0;0;50", "wc_reply_authors": "1172;645;390;349;714", "reply_reviewers": "1;0;0;0;1", "reply_authors": "4;1;1;1;2", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.2, 1.16619037896906 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 71.6, 51.429952362412315 ], "wc_summary_review_avg": [ 41.6, 6.374950980203691 ], "wc_main_review_avg": [ 285.2, 113.55949982278013 ], "wc_review_avg": [ 398.4, 108.230494778505 ], "wc_reply_reviewers_avg": [ 11.6, 19.44839324982915 ], "wc_reply_authors_avg": [ 654.0, 294.8647147422017 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.8, 1.1661903789690602 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4900980294098034, "corr_recommendation_correctness": 0.5833333333333334, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8163078198036506273&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=mwdfai8NBrJ", "email": "umd.edu;;umd.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "mz7Bkl2Pz6", "title": "Global Convergence and Stability of Stochastic Gradient Descent", "track": "main", "status": "Reject", "tldr": "", "abstract": "In machine learning, stochastic gradient descent (SGD) is widely deployed to train models using highly non-convex objectives with equally complex noise models. 
Unfortunately, SGD theory often makes restrictive assumptions that fail to capture the non-convexity of real problems, and almost entirely ignore the complex noise models that exist in practice. In this work, we make substantial progress on this shortcoming. First, we establish that SGD\u2019s iterates will either globally converge to a stationary point or diverge under nearly arbitrary nonconvexity and noise models. Under a slightly more restrictive assumption on the joint behavior of the non-convexity and noise model that generalizes current assumptions in the literature, we show that the objective function cannot diverge, even if the iterates diverge. As a consequence of our results, SGD can be applied to a greater range of stochastic optimization problems with confidence about its global convergence behavior and stability.", "keywords": "Stochastic Gradient Descent;Nonconvexity;Noise Model;Global Convergence;Stability", "primary_area": "", "supplementary_material": "/attachment/d2c749d96a1ea74c4b2fe89a775d601c2936c9cb.zip", "author": "Vivak Patel;Bowen Tian;Shushu Zhang", "authorids": "~Vivak_Patel1;~Bowen_Tian1;~Shushu_Zhang1", "gender": ";M;F", "homepage": "http://vivakpatel.org;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";bowen-tian-202644221/;shushu-zhang-68b2a7193/", "or_profile": "~Vivak_Patel1;~Bowen_Tian1;~Shushu_Zhang1", "aff": "University of Wisconsin, Madison;University of Wisconsin, Madison;", "aff_domain": "wisc.edu;wisc.edu;", "position": "Assistant Professor;MS student;", "bibtex": "@misc{\npatel2022global,\ntitle={Global Convergence and Stability of Stochastic Gradient Descent},\nauthor={Vivak Patel and Bowen Tian and Shushu Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=mz7Bkl2Pz6}\n}", "github": "", "project": "", "reviewers": "tECd;TjZW;bDLV", "site": "https://openreview.net/forum?id=mz7Bkl2Pz6", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;3", "correctness": "4;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;0;0", "wc_summary_paper": "115;39;132", "wc_summary_review": "22;23;330", "wc_main_review": "237;112;97", "wc_review": "374;174;559", "wc_reply_reviewers": "5;0;4", "wc_reply_authors": "178;27;201", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 95.33333333333333, 40.43375927228247 ], "wc_summary_review_avg": [ 125.0, 144.95746502566422 ], "wc_main_review_avg": [ 148.66666666666666, 62.76056794587576 ], "wc_review_avg": [ 369.0, 157.21535124365772 ], "wc_reply_reviewers_avg": [ 3.0, 2.160246899469287 ], "wc_reply_authors_avg": [ 135.33333333333334, 77.17656523985906 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4663190547659044194&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "University of Wisconsin", "aff_unique_dep": "", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" 
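As a small numerical illustration of the regime the paper analyzes, the sketch below runs SGD with diminishing step sizes and a state-dependent noise scale on a double-well (non-convex) objective and checks that the iterate approaches a stationary point. The objective, noise model, and step schedule are illustrative assumptions, not the paper's actual conditions.

```python
import numpy as np

rng = np.random.default_rng(4)

# Double-well objective: non-convex, with stationary points at 0 and +/-1.
f = lambda x: (x**2 - 1.0) ** 2
grad = lambda x: 4.0 * x * (x**2 - 1.0)

x = 2.5
for t in range(1, 20001):
    # Stochastic gradient whose noise scale grows with |x|: a simple
    # state-dependent noise model rather than bounded-variance noise.
    g = grad(x) + (0.5 + 0.1 * abs(x)) * rng.normal()
    x -= (0.01 / np.sqrt(t)) * g          # diminishing step sizes
print(f"x = {x:.3f}, |grad| = {abs(grad(x)):.4f}, f(x) = {f(x):.5f}")
```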
}, { "title": "Efficient Sharpness-aware Minimization for Improved Training of Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6193", "id": "n0OeTdNRG0Q", "poster": "", "openreview": "https://openreview.net/forum?id=n0OeTdNRG0Q", "slides": "https://iclr.cc/virtual/2022/poster/6193", "video": "https://iclr.cc/virtual/2022/poster/6193", "author_site": "Jiawei Du, Hanshu Yan, Jiashi Feng, Joey T Zhou, Liangli Zhen, Rick Goh, Vincent Tan", "tldr": "", "abstract": "Overparametrized Deep Neural Networks (DNNs) often achieve astounding performances, but may potentially result in severe generalization error. Recently, the relation between the sharpness of the loss landscape and the generalization error has been established by Foret et al. (2020), in which the Sharpness Aware Minimizer (SAM) was proposed to mitigate the degradation of the generalization. Unfortunately, SAM\u2019s computational cost is roughly double that of base optimizers, such as Stochastic Gradient Descent (SGD). This paper thus proposes Efficient Sharpness Aware Minimizer (ESAM), which boosts SAM\u2019s efficiency at no cost to its generalization performance. ESAM includes two novel and efficient training strategies\u2014StochasticWeight Perturbation and Sharpness-Sensitive Data Selection. In the former, the sharpness measure is approximated by perturbing a stochastically chosen set of weights in each iteration; in the latter, the SAM loss is optimized using only a judiciously selected subset of data that is sensitive to the sharpness. We provide theoretical explanations as to why these strategies perform well. We also show, via extensive experiments on the CIFAR and ImageNet\ndatasets, that ESAM enhances the efficiency over SAM from requiring 100% extra computations to 40% vis-`a-vis base optimizers, while test accuracies are preserved or even improved.", "keywords": "Efficient learning;gengeralization;training algorithm", "primary_area": "", "supplementary_material": "/attachment/019d5e9ab7e5c7ea8f9e8ecd83a75759a359623f.zip", "author": "Jiawei Du;Hanshu Yan;Jiashi Feng;Joey Tianyi Zhou;Liangli Zhen;Rick Siow Mong Goh;Vincent Tan", "authorids": "~Jiawei_Du1;~Hanshu_Yan1;~Jiashi_Feng1;~Joey_Tianyi_Zhou1;~Liangli_Zhen1;~Rick_Siow_Mong_Goh1;~Vincent_Tan1", "gender": "M;M;M;;M;M;M", "homepage": ";https://joeyzhouty.github.io/;https://liangli-zhen.github.io/;https://sites.google.com/view/rickgoh/home;https://www.ece.nus.edu.sg/stfpage/vtan/pubs.htm;;https://sites.google.com/site/jshfeng/", "dblp": ";123/5110;128/3352;https://dblp.uni-trier.de/pers/g/Goh:Rick_Siow_Mong;60/2327;243/3583;56/8278", "google_scholar": "WrJKEzEAAAAJ;https://scholar.google.com.sg/citations?user=cYNqDokAAAAJ;dtv_LZkAAAAJ;https://scholar.google.com.sg/citations?user=fBsBJjoAAAAJ;dJoAVvAAAAAJ;MG817V4AAAAJ;https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ", "orcid": ";0000-0002-4675-7055;;0000-0001-9116-1595;0000-0002-5008-4527;;0000-0001-6843-0064", "linkedin": ";;;rickgoh/;;;", "or_profile": "~Jiawei_Du1;~Joey_Tianyi_Zhou1;~Liangli_Zhen1;~Rick_Siow_Mong_Goh1;~Vincent_Tan1;~Hanshu_YAN2;~Jiashi_Feng2", "aff": "National University of Singapore;A*STAR Centre for Frontier AI Research;Agency for Science, Technology and Research;Institute of High Performance Computing, Singapore, A*STAR;;National University of Singapore (NUS);ByteDance", "aff_domain": "u.nus.edu;cfar.a-star.edu.sg;ihpc.a-star.edu.sg;ihpc.a-star.edu.sg;;u.nus.edu;bytedance.com", "position": "PhD student;Principal 
Researcher;Researcher;Director;;PhD student;Research Lead", "bibtex": "@inproceedings{\ndu2022efficient,\ntitle={Efficient Sharpness-aware Minimization for Improved Training of Neural Networks},\nauthor={Jiawei Du and Hanshu Yan and Jiashi Feng and Joey Tianyi Zhou and Liangli Zhen and Rick Siow Mong Goh and Vincent Tan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=n0OeTdNRG0Q}\n}", "github": "", "project": "", "reviewers": "wp3y;cAHr;jSHm;mQEh", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;4;3;4", "empirical_novelty": "3;4;3;4", "wc_summary_paper": "92;75;41;21", "wc_summary_review": "100;71;17;38", "wc_main_review": "239;161;205;281", "wc_review": "431;307;263;340", "wc_reply_reviewers": "0;15;78;35", "wc_reply_authors": "238;516;422;790", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 57.25, 27.842189209902298 ], "wc_summary_review_avg": [ 56.5, 31.64253466459348 ], "wc_main_review_avg": [ 221.5, 44.099319722644246 ], "wc_review_avg": [ 335.25, 61.66188044489075 ], "wc_reply_reviewers_avg": [ 32.0, 29.317230428538096 ], "wc_reply_authors_avg": [ 491.5, 199.24545164193836 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15803669707023220896&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=n0OeTdNRG0Q", "email": "u.nus.edu;cfar.a-star.edu.sg;ihpc.a-star.edu.sg;ihpc.a-star.edu.sg;;u.nus.edu;bytedance.com", "author_num": 7, "aff_unique_index": "0;1;2;3;0;4", "aff_unique_norm": "National University of Singapore;A*STAR;Agency for Science, Technology and Research;Institute of High Performance Computing;ByteDance", "aff_unique_dep": ";Centre for Frontier AI Research;;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.a-star.edu.sg;https://www.a-star.edu.sg;https://www.ihpc.a-star.edu.sg;https://www.bytedance.com", "aff_unique_abbr": "NUS;A*STAR;A*STAR;IHPC;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "Singapore;China" }, { "id": "n1BMcctC12", "title": "Randomized Primal-Dual Coordinate Method for Large-scale Linearly Constrained Nonsmooth Nonconvex Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "The large-scale linearly constrained nonsmooth nonconvex optimization finds wide applications in machine learning, including non-PSD Kernel SVM, linearly constrained Lasso with nonsmooth nonconvex penalty, etc. To tackle this class of optimization problems, we propose an efficient algorithm called Nonconvex Randomized Primal-Dual Coordinate (N-RPDC) method. At each iteration, this method only randomly selects a block of primal variables to update rather than updating all the variables, which is suitable for large-scale problems. We provide two types of convergence results for N-RPDC. 
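A hedged sketch of the update structure on a convex quadratic (chosen for readability; the paper targets nonsmooth nonconvex objectives): at each iteration one randomly selected block of primal variables takes a partial-gradient step on the augmented Lagrangian, followed by a dual ascent step on the constraint residual. The problem data, penalty, and step sizes are assumptions, and this is not the paper's N-RPDC in full.

```python
import numpy as np

rng = np.random.default_rng(5)

# Linearly constrained problem: min 0.5 * ||x - c||^2  s.t.  A x = b.
n, m = 100, 10
A = rng.normal(size=(m, n)) / np.sqrt(n)
b = rng.normal(size=m)
c = rng.normal(size=n)
f_grad = lambda x: x - c

x, y = np.zeros(n), np.zeros(m)        # primal and dual variables
rho, alpha, beta = 1.0, 0.1, 0.05      # penalty and step sizes (assumed)
blocks = np.array_split(np.arange(n), 10)

for it in range(20000):
    B = blocks[rng.integers(len(blocks))]    # random primal block
    r = A @ x - b
    # Partial gradient of the augmented Lagrangian w.r.t. block B only.
    x[B] -= alpha * (f_grad(x)[B] + A[:, B].T @ (y + rho * r))
    y += beta * (A @ x - b)                  # dual ascent step

print("feasibility ||Ax - b||:", round(float(np.linalg.norm(A @ x - b)), 4))
```

Updating only a block per iteration is what makes the per-iteration cost scale with the block size rather than the full dimension, which is the point of the coordinate scheme for large-scale problems.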
We first show that any cluster point of the sequence of iterates generated by N-RPDC is almost surely (i.e., with probability 1) a stationary point. In addition, we also provide an almost sure asymptotic convergence rate of $O(1/\\sqrt{k})$. Next, we establish the expected $O(\\varepsilon^{-2})$ iteration complexity of N-RPDC in order to drive a natural stationarity measure below $\\varepsilon$ in expectation. The fundamental aspect to establishing the aforementioned convergence results is a \\emph{surrogate stationarity measure} we discovered for analyzing N-RPDC. Finally, we conduct a set of experiments to show the efficacy of N-RPDC.", "keywords": "primal-dual methods;constrained nonconvex nonsmooth optimization;coordinate descent methods;global convergence;iteration complexity", "primary_area": "", "supplementary_material": "", "author": "Lei Zhao;Daoli Zhu;Xiao Li", "authorids": "~Lei_Zhao8;~Daoli_Zhu1;~Xiao_Li5", "gender": "M;M;M", "homepage": ";http://www.sugli.sjtu.edu.cn/teacher1/2179.html;https://www.xiao-li.org/", "dblp": ";;66/2069-9", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;9qs9fEUAAAAJ;https://scholar.google.com/citations?view_op=list_works", "orcid": "0000-0001-7515-5144;;0000-0001-5577-6963", "linkedin": ";;", "or_profile": "~Lei_Zhao8;~Daoli_Zhu1;~Xiao_Li5", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;cuhk.edu.cn", "position": "Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nzhao2022randomized,\ntitle={Randomized Primal-Dual Coordinate Method for Large-scale Linearly Constrained Nonsmooth Nonconvex Optimization},\nauthor={Lei Zhao and Daoli Zhu and Xiao Li},\nyear={2022},\nurl={https://openreview.net/forum?id=n1BMcctC12}\n}", "github": "", "project": "", "reviewers": "Adyx;Wyna;NZQN;6TRm", "site": "https://openreview.net/forum?id=n1BMcctC12", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;3;3;2", "correctness": "3;4;4;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "1;3;0;3", "wc_summary_paper": "80;79;53;89", "wc_summary_review": "59;22;16;2", "wc_main_review": "655;117;157;266", "wc_review": "794;218;226;357", "wc_reply_reviewers": "934;0;0;0", "wc_reply_authors": "2740;343;433;383", "reply_reviewers": "2;0;0;0", "reply_authors": "5;2;2;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 75.25, 13.423393758658799 ], "wc_summary_review_avg": [ 24.75, 21.063890903629368 ], "wc_main_review_avg": [ 298.75, 212.7867183355202 ], "wc_review_avg": [ 398.75, 234.77582392571856 ], "wc_reply_reviewers_avg": [ 233.5, 404.4338635673328 ], "wc_reply_authors_avg": [ 974.75, 1019.6662137680154 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ixM2UPJNOdQJ:scholar.google.com/&scioq=Randomized+Primal-Dual+Coordinate+Method+for+Large-scale+Linearly+Constrained+Nonsmooth+Nonconvex+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Shanghai 
Jiao Tong University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.cuhk.edu.cn", "aff_unique_abbr": "SJTU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "n2EEbUzETI", "title": "Contextual Text Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Most existing scene text detectors focus on the detection of characters or words which, in most cases, capture only partial textual messages due to missing contextual information. For a better understanding of text in scenes, it is more desirable to detect contextual text blocks, which consist of one or multiple integral text units (e.g., characters, words, or phrases) in a specific order, delivering certain independent and complete textual messages. This paper presents Contextual Text Detection, a new setup that detects contextual text blocks for a better understanding of texts in scenes. We formulate the new setup as a dual detection task that first detects integral text units and then groups them into a contextual text block. Specifically, we design a novel scene text grouping technique which treats each integral text unit as a token and groups multiple integral tokens belonging to the same contextual text block into an ordered token sequence. To facilitate future research, we create two new datasets, SCUT-CTW-Context and ReCTS-Context, where each contextual text block is well annotated by an ordered sequence of integral text units. In addition, we introduce three evaluation metrics that measure contextual text detection in local accuracy, continuity, and global accuracy, respectively. Extensive experiments show that the proposed method detects contextual text blocks effectively. 
The code, datasets, and annotation tools will be published at http://xxxxxxx.", "keywords": "Scene Text Detection;Contextual Text Detection", "primary_area": "", "supplementary_material": "", "author": "Chuhui Xue;Jiaxing Huang;Wenqing Zhang;Shijian Lu;Song Bai;Changhu Wang", "authorids": "~Chuhui_Xue2;~Jiaxing_Huang2;~Wenqing_Zhang1;~Shijian_Lu1;~Song_Bai3;~Changhu_Wang3", "gender": "F;M;;M;;M", "homepage": ";https://jxhuang0508.github.io/;https://hannibalape.github.io/;https://personal.ntu.edu.sg/shijian.lu/;https://songbai.site/;https://changhu.wang", "dblp": "223/4745;62/6016-1.html;;42/2718;;30/3393", "google_scholar": "https://scholar.google.com.sg/citations?user=KJU5YRYAAAAJ;czirNcwAAAAJ;;https://scholar.google.com.sg/scholar?hl=en;LXuWMF4AAAAJ;DsVZkjAAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Chuhui_Xue2;~Jiaxing_Huang2;~Wenqing_Zhang1;~Shijian_Lu1;~Song_Bai3;~Changhu_Wang1", "aff": "Nanyang Technological University;Nanyang Technological University;Huazhong University of Science and Technology;Nanyang Technological University;ByteDance;ByteDance Inc.", "aff_domain": "ntu.edu.sg;ntu.edu.sg;hust.edu;ntu.edu.sg;bytedance.com;bytedance.com", "position": "PhD student;PhD student;MS student;Associate Professor;Computer Vision Lead;Director", "bibtex": "@misc{\nxue2022contextual,\ntitle={Contextual Text Detection},\nauthor={Chuhui Xue and Jiaxing Huang and Wenqing Zhang and Shijian Lu and Song Bai and Changhu Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=n2EEbUzETI}\n}", "github": "", "project": "", "reviewers": "EfP9;QJGP;ED3q;xv3K", "site": "https://openreview.net/forum?id=n2EEbUzETI", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "160;88;119;179", "wc_summary_review": "55;26;59;178", "wc_main_review": "511;287;313;455", "wc_review": "726;401;491;812", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 136.5, 35.415392133929565 ], "wc_summary_review_avg": [ 79.5, 58.27735409230587 ], "wc_main_review_avg": [ 391.5, 94.06779470148112 ], "wc_review_avg": [ 607.5, 167.3895158007215 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;1;0;2;2", "aff_unique_norm": "Nanyang Technological University;Huazhong University of Science and Technology;ByteDance", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;http://www.hust.edu.cn;https://www.bytedance.com", "aff_unique_abbr": "NTU;HUST;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1;1", "aff_country_unique": "Singapore;China" }, { "id": "n54Drs00M1", "title": "Learning affective meanings that derives the social behavior using Bidirectional Encoder Representations from Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "Cultural sentiments of a
society characterize social behaviors, but modeling sentiments to manifest every potential interaction remains an immense challenge. Affect Control Theory (ACT) offers a solution to this problem. ACT is a generative theory of culture and behavior based on a three-dimensional sentiment lexicon. Traditionally, the sentiments are quantified using survey data which is fed into a regression model to explain social behavior. The lexicons used in the survey are limited due to prohibitive cost. This paper uses a fine-tuned Bidirectional Encoder Representations from Transformers (BERT) model for developing a replacement for these surveys. This model achieves state-of-the-art accuracy in estimating affective meanings, expanding the affective lexicon, and allowing more behaviors to be explained. ", "keywords": "Affect Control Theory;Bidirectional Encoder Representations from Transformers;affective lexicon;formal theory", "primary_area": "", "supplementary_material": "", "author": "Moeen Mostafavi;Michael D. Porter;Dawn T Robinson", "authorids": "~Moeen_Mostafavi1;mdp2u@virginia.edu;~Dawn_T_Robinson1", "gender": "M;;woman", "homepage": ";;https://sociology.uga.edu/directory/people/dawn-t-robinson", "dblp": ";;", "google_scholar": ";;LZUzYjYAAAAJ", "orcid": ";;0000-0002-6809-0474", "linkedin": "moeenmostafavi/;;", "or_profile": "~Moeen_Mostafavi1;mdp2u@virginia.edu;~Dawn_T_Robinson1", "aff": "University of Virginia;;University of Georgia", "aff_domain": "virginia.edu;;uga.edu", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nmostafavi2022learning,\ntitle={Learning affective meanings that derives the social behavior using Bidirectional Encoder Representations from Transformers},\nauthor={Moeen Mostafavi and Michael D. Porter and Dawn T Robinson},\nyear={2022},\nurl={https://openreview.net/forum?id=n54Drs00M1}\n}", "github": "", "project": "", "reviewers": "4E5D;cj4C;UkRH;Ketb", "site": "https://openreview.net/forum?id=n54Drs00M1", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;2;4;3", "correctness": "2;3;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "33;53;20;53", "wc_summary_review": "38;14;22;20", "wc_main_review": "169;266;88;197", "wc_review": "240;333;130;270", "wc_reply_reviewers": "0;0;58;0", "wc_reply_authors": "882;537;699;988", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 39.75, 14.02453207775575 ], "wc_summary_review_avg": [ 23.5, 8.874119674649425 ], "wc_main_review_avg": [ 180.0, 63.776955085673386 ], "wc_review_avg": [ 243.25, 73.49617336977484 ], "wc_reply_reviewers_avg": [ 14.5, 25.11473670974872 ], "wc_reply_authors_avg": [ 776.5, 172.6477628004487 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4264014327112209, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4075204699103225987&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Virginia;University of Georgia", "aff_unique_dep": ";", "aff_unique_url": "https://www.virginia.edu;https://www.uga.edu", 
"aff_unique_abbr": "UVA;UGA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "n6Bc3YElODq", "title": "Model-Based Opponent Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "When one agent interacts with a multi-agent environment, it is challenging to deal with various opponents unseen before. Modeling the behaviors, goals, or beliefs of opponents could help the agent adjust its policy to adapt to different opponents. In addition, it is also important to consider opponents who are learning simultaneously or capable of reasoning. However, existing work usually tackles only one of the aforementioned types of opponent. In this paper, we propose model-based opponent modeling (MBOM), which employs the environment model to adapt to all kinds of opponent. MBOM simulates the recursive reasoning process in the environment model and imagines a set of improving opponent policies. To effectively and accurately represent the opponent policy, MBOM further mixes the imagined opponent policies according to the similarity with the real behaviors of opponents. Empirically, we show that MBOM achieves more effective adaptation than existing methods in competitive and cooperative environments, respectively with different types of opponent, i.e., fixed policy, naive learner, and reasoning learner.", "keywords": "multi-agent reinforcement learning;opponent modeling", "primary_area": "", "supplementary_material": "", "author": "XiaoPeng Yu;Jiechuan Jiang;Haobin Jiang;Zongqing Lu", "authorids": "~XiaoPeng_Yu1;~Jiechuan_Jiang1;~Haobin_Jiang1;~Zongqing_Lu2", "gender": ";;M;", "homepage": ";;https://github.com/SigmaBM;", "dblp": ";220/4026;199/9785;", "google_scholar": ";;5Oc2LAEAAAAJ;", "orcid": ";;0009-0009-7114-534X;", "linkedin": ";;haobin-jiang-84178b2a9;", "or_profile": "~XiaoPeng_Yu1;~Jiechuan_Jiang1;~Haobin_Jiang1;~Zongqing_Lu2", "aff": ";;Peking University;", "aff_domain": ";;pku.edu.cn;", "position": ";;PhD student;", "bibtex": "@misc{\nyu2022modelbased,\ntitle={Model-Based Opponent Modeling},\nauthor={XiaoPeng Yu and Jiechuan Jiang and Haobin Jiang and Zongqing Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=n6Bc3YElODq}\n}", "github": "", "project": "", "reviewers": "3i5i;SaHd;8g3Q;KD9g", "site": "https://openreview.net/forum?id=n6Bc3YElODq", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "123;146;43;126", "wc_summary_review": "86;45;18;22", "wc_main_review": "341;1166;628;409", "wc_review": "550;1357;689;557", "wc_reply_reviewers": "152;190;0;0", "wc_reply_authors": "716;1727;375;581", "reply_reviewers": "1;1;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 109.5, 39.39860403618382 ], "wc_summary_review_avg": [ 42.75, 27.012728481217888 ], "wc_main_review_avg": [ 636.0, 323.8510460072655 ], "wc_review_avg": [ 788.25, 333.00403526083585 ], "wc_reply_reviewers_avg": [ 85.5, 86.54911900187084 ], "wc_reply_authors_avg": [ 849.75, 520.8336466665725 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, 
"corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5538391915292261879&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "n7bD7_GSsce", "title": "Knowledge is reward: Learning optimal exploration by predictive reward cashing", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "There is a strong link between the general concept of intelligence and the ability to collect and use information. The theory of Bayes-adaptive exploration offers an attractive optimality framework for training machines to perform complex information gathering tasks. However, the computational complexity of the resulting optimal control problem has limited the diffusion of the theory to mainstream deep AI research. In this paper we exploit the inherent mathematical structure of Bayes-adaptive problems in order to dramatically simplify the problem by making the reward structure denser while simultaneously decoupling the learning of exploitation and exploration policies. The key to this simplification comes from the novel concept of cross-value (i.e. the value of being in an environment while acting optimally according to another), which we use to quantify the value of currently available information. This results in a new denser reward structure that \"cashes in\" all future rewards that can be predicted from the current information state. In a set of experiments we show that the approach makes it possible to learn challenging information gathering tasks without the use of shaping and heuristic bonuses in situations where the standard RL algorithms fail.", "keywords": "Optimal exploration;Bayes-adaptive;Belief-augmented;Information;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/b11c5ed4a63e8881fb2fc77ae6f934129558f507.zip", "author": "Luca Ambrogioni", "authorids": "~Luca_Ambrogioni1", "gender": "M", "homepage": "https://scholar.google.nl/citations?user=J9IABpQAAAAJ&hl=en", "dblp": "151/9813", "google_scholar": "https://scholar.google.nl/citations?user=J9IABpQAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Luca_Ambrogioni1", "aff": "Radboud University Nijmegen", "aff_domain": "ru.nl", "position": "Assistant Professor", "bibtex": "@misc{\nambrogioni2022knowledge,\ntitle={Knowledge is reward: Learning optimal exploration by predictive reward cashing},\nauthor={Luca Ambrogioni},\nyear={2022},\nurl={https://openreview.net/forum?id=n7bD7_GSsce}\n}", "github": "", "project": "", "reviewers": "Gvbs;dvYt;rL8W;mwD3", "site": "https://openreview.net/forum?id=n7bD7_GSsce", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;4;2;2", "correctness": "3;4;3;3", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "122;127;106;92", "wc_summary_review": "50;118;30;36", "wc_main_review": "252;1604;405;374", "wc_review": "424;1849;541;502", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 
111.75, 13.790848414800301 ], "wc_summary_review_avg": [ 58.5, 35.11053972812152 ], "wc_main_review_avg": [ 658.75, 548.7291567795537 ], "wc_review_avg": [ 829.0, 590.4019817039913 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:efPMetgBDj8J:scholar.google.com/&scioq=Knowledge+is+reward:+Learning+optimal+exploration+by+predictive+reward+cashing&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Radboud University", "aff_unique_dep": "", "aff_unique_url": "https://www.ru.nl/", "aff_unique_abbr": "RU", "aff_campus_unique_index": "0", "aff_campus_unique": "Nijmegen", "aff_country_unique_index": "0", "aff_country_unique": "Netherlands" }, { "title": "UniFormer: Unified Transformer for Efficient Spatial-Temporal Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5991", "id": "nBU_u6DLvoK", "poster": "", "openreview": "https://openreview.net/forum?id=nBU_u6DLvoK", "slides": "https://iclr.cc/virtual/2022/poster/5991", "video": "https://iclr.cc/virtual/2022/poster/5991", "author_site": "Kunchang Li, Yali Wang, Gao Peng, Guanglu Song, Yu Liu, Hongsheng Li, Yu Qiao", "tldr": "", "abstract": "It is a challenging task to learn rich and multi-scale spatiotemporal semantics from high-dimensional videos, due to large local redundancy and complex global dependency between video frames. The recent advances in this research have been mainly driven by 3D convolutional neural networks and vision transformers. Although 3D convolution can efficiently aggregate local context to suppress local redundancy from a small 3D neighborhood, it lacks the capability to capture global dependency because of the limited receptive field. Alternatively, vision transformers can effectively capture long-range dependency by self-attention mechanism, while having the limitation on reducing local redundancy with blind similarity comparison among all the tokens in each layer. Based on these observations, we propose a novel Unified transFormer (UniFormer) which seamlessly integrates merits of 3D convolution and spatiotemporal self-attention in a concise transformer format, and achieves a preferable balance between computation and accuracy. Different from traditional transformers, our relation aggregator can tackle both spatiotemporal redundancy and dependency, by learning local and global token affinity respectively in shallow and deep layers. We conduct extensive experiments on the popular video benchmarks, e.g., Kinetics-400, Kinetics-600, and Something-Something V1&V2. With only ImageNet-1K pretraining, our UniFormer achieves 82.9%/84.8% top-1 accuracy on Kinetics-400/Kinetics-600, while requiring 10x fewer GFLOPs than other state-of-the-art methods. For Something-Something V1 and V2, our UniFormer achieves new state-of-the-art performances of 60.9% and 71.2% top-1 accuracy respectively. 
Code is available at https://github.com/Sense-X/UniFormer.", "keywords": "Spatial-Temporal Representation Learning;3D Convolution;Transformer", "primary_area": "", "supplementary_material": "", "author": "Kunchang Li;Yali Wang;Gao Peng;Guanglu Song;Yu Liu;Hongsheng Li;Yu Qiao", "authorids": "~Kunchang_Li1;~Yali_Wang1;~Gao_Peng1;~Guanglu_Song2;~Yu_Liu2;~Hongsheng_Li3;~Yu_Qiao1", "gender": "M;M;M;M;M;M;", "homepage": "https://andy1621.github.io/;;;;http://liuyu.us;http://www.ee.cuhk.edu.hk/~hsli;", "dblp": ";01/773-1;;207/4745;97/2274-15;27/7402-1;", "google_scholar": "D4tLSbsAAAAJ;https://scholar.google.com/citations?hl=en;miFIAFMAAAAJ;Bd3v08QAAAAJ;;BN2Ze-QAAAAJ;", "orcid": "0000-0001-5612-0341;;;;;;", "linkedin": "%E6%98%86%E6%98%8C-%E9%BB%8E-2a4a951b2/;;;;;;", "or_profile": "~Kunchang_Li1;~Yali_Wang1;~Gao_Peng1;~Guanglu_Song2;~Yu_Liu2;~Hongsheng_Li3;~Yu_Qiao1", "aff": "Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences;SIAT, Chinese Academy of Sciences;shanghai ai lab ;Sensetime;SenseTime;The Chinese University of Hong Kong;", "aff_domain": "siat.ac.cn;siat.ac.cn;pjlab.org.cn;sensetime.com;sensetime.com;cuhk.edu.hk;", "position": "PhD student;Associate Professor;Researcher;Computer Vision Researcher;Principal Researcher;Assistant Professor;", "bibtex": "@inproceedings{\nli2022uniformer,\ntitle={UniFormer: Unified Transformer for Efficient Spatial-Temporal Representation Learning},\nauthor={Kunchang Li and Yali Wang and Gao Peng and Guanglu Song and Yu Liu and Hongsheng Li and Yu Qiao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nBU_u6DLvoK}\n}", "github": "", "project": "", "reviewers": "4ga6;rXsf;axKi;Na5h", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;4;3", "empirical_novelty": "3;4;4;3", "wc_summary_paper": "85;68;54;78", "wc_summary_review": "40;61;10;58", "wc_main_review": "185;642;74;239", "wc_review": "310;771;138;375", "wc_reply_reviewers": "0;46;17;0", "wc_reply_authors": "646;612;265;491", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 71.25, 11.648497757221744 ], "wc_summary_review_avg": [ 42.25, 20.27775875189366 ], "wc_main_review_avg": [ 285.0, 214.52622217342102 ], "wc_review_avg": [ 398.5, 231.8410015506317 ], "wc_reply_reviewers_avg": [ 15.75, 18.793283374652763 ], "wc_reply_authors_avg": [ 503.5, 149.26235292263084 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 364, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13061863280402646662&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=nBU_u6DLvoK", "email": "siat.ac.cn;siat.ac.cn;pjlab.org.cn;sensetime.com;sensetime.com;cuhk.edu.hk;", "author_num": 7, "aff_unique_index": "0;1;2;3;3;4", "aff_unique_norm": "Chinese Academy of Sciences;Shenzhen Institute of Advanced Technology;Shanghai AI Lab;SenseTime;Chinese University of Hong Kong", "aff_unique_dep": "Shenzhen Institutes of Advanced Technology;;;;", "aff_unique_url": 
"http://www.cas.cn;http://www.siat.ac.cn;https://www.shanghaiailab.com;https://www.sensetime.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "CAS;SIAT;Shanghai AI Lab;SenseTime;CUHK", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Shenzhen;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "nCw4talHmo5", "title": "ParaDiS: Parallelly Distributable Slimmable Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "When several limited power devices are available, one of the most efficient ways to make profit of these resources, while reducing the processing latency and communication load, is to run in parallel several neural sub-networks and to fuse the result at the end of processing. However, such a combination of sub-networks must be trained specifically for each particular configuration of devices (characterized by number of devices and their capacities) which may vary over different model deployments and even within the same deployment. In this work we introduce parallelly distributable slimmable (ParaDiS) neural networks that are splittable in parallel among various device configurations without retraining. While inspired by slimmable networks allowing instant adaptation to resources on just one device, ParaDiS networks consist of several multi-device distributable configurations or switches that strongly share the parameters between them. We evaluate ParaDiS framework on MobileNet v1 and ResNet-50 architectures on ImageNet classification task. We show that ParaDiS switches achieve similar or better accuracy than the individual models, i.e., distributed models of the same structure trained individually. Moreover, we show that, as compared to universally slimmable networks that are not distributable, the accuracy of distributable ParaDiS switches either does not drop at all or drops by a maximum of 1 % only in the worst cases.", "keywords": "convolutional neural networks;efficient inference;distributed inference;parallel distribution;slimmable neural networks;flexible neural networks", "primary_area": "", "supplementary_material": "", "author": "Alexey Ozerov;Anne Lambert;Suresh Kirthi Kumaraswamy", "authorids": "~Alexey_Ozerov2;~Anne_Lambert1;~Suresh_Kirthi_Kumaraswamy1", "gender": "M;F;", "homepage": "https://sites.google.com/view/alexey-ozerov;https://www.interdigital.com/talent/?id=105;", "dblp": "82/151.html;133/5032;", "google_scholar": "LnV-0z0AAAAJ;4R1O95YAAAAJ;https://scholar.google.co.in/citations?user=QNHLFqIAAAAJ", "orcid": ";;", "linkedin": "alexey-ozerov-07687419/;annelambert;", "or_profile": "~Alexey_Ozerov2;~Anne_Lambert1;~Suresh_Kirthi_Kumaraswamy1", "aff": ";InterDigital;", "aff_domain": ";interdigital.com;", "position": ";Researcher;", "bibtex": "@misc{\nozerov2022paradis,\ntitle={ParaDiS: Parallelly Distributable Slimmable Neural Networks},\nauthor={Alexey Ozerov and Anne Lambert and Suresh Kirthi Kumaraswamy},\nyear={2022},\nurl={https://openreview.net/forum?id=nCw4talHmo5}\n}", "github": "", "project": "", "reviewers": "T9P1;eUCM;6dzy;fReP", "site": "https://openreview.net/forum?id=nCw4talHmo5", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "29;59;87;63", "wc_summary_review": "87;5;64;55", "wc_main_review": "206;378;633;200", "wc_review": "322;442;784;318", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "840;804;807;655", "reply_reviewers": 
"0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 59.5, 20.60946384552495 ], "wc_summary_review_avg": [ 52.75, 29.93639089803579 ], "wc_main_review_avg": [ 354.25, 176.09425743050227 ], "wc_review_avg": [ 466.5, 189.95986418188448 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 776.5, 71.55592218677641 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4103886947527085410&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "InterDigital", "aff_unique_dep": "", "aff_unique_url": "https://www.interdigital.com", "aff_unique_abbr": "InterDigital", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "nD9Pf-PjTbT", "title": "Convergence of Generalized Belief Propagation Algorithm on Graphs with Motifs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Belief propagation is a fundamental message-passing algorithm for numerous applications in machine learning. It is known that belief propagation algorithm is exact on tree graphs. However, belief propagation is run on loopy graphs in most applications. So, understanding the behavior of belief propagation on loopy graphs has been a major topic for researchers in different areas. In this paper, we study the convergence behavior of generalized belief propagation algorithm on graphs with motifs (triangles, loops, etc.) 
We show under a certain initialization, generalized belief propagation converges to the global optimum of the Bethe free energy for ferromagnetic Ising models on graphs with motifs.", "keywords": "Belief Propagation;Bethe energy function", "primary_area": "", "supplementary_material": "", "author": "Yitao Chen;Deepanshu Vasal", "authorids": "~Yitao_Chen1;~Deepanshu_Vasal1", "gender": "M;", "homepage": "https://yitaochen.github.io/;https://sites.google.com/view/dvasal/home", "dblp": ";", "google_scholar": "T7teB94AAAAJ;", "orcid": ";", "linkedin": "yitao-chen-aa543b75/;", "or_profile": "~Yitao_Chen1;~Deepanshu_Vasal1", "aff": "QualComm;Northwestern University", "aff_domain": "qualcomm.com;northwestern.edu", "position": "Qualcomm;Researcher", "bibtex": "@misc{\nchen2022convergence,\ntitle={Convergence of Generalized Belief Propagation Algorithm on Graphs with Motifs},\nauthor={Yitao Chen and Deepanshu Vasal},\nyear={2022},\nurl={https://openreview.net/forum?id=nD9Pf-PjTbT}\n}", "github": "", "project": "", "reviewers": "F8Ar;PMGJ;BLzF;bjzY", "site": "https://openreview.net/forum?id=nD9Pf-PjTbT", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "5;4;3;4", "correctness": "1;1;3;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "1;1;0;0", "wc_summary_paper": "35;65;43;24", "wc_summary_review": "72;34;57;34", "wc_main_review": "743;354;310;345", "wc_review": "850;453;410;403", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.0, 1.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.5, 0.5 ], "wc_summary_paper_avg": [ 41.75, 15.022899187573616 ], "wc_summary_review_avg": [ 49.25, 16.145819892467525 ], "wc_main_review_avg": [ 438.0, 176.8572870989488 ], "wc_review_avg": [ 529.0, 186.31559247685095 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:L0TsicbkqVkJ:scholar.google.com/&scioq=Convergence+of+Generalized+Belief+Propagation+Algorithm+on+Graphs+with+Motifs&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Qualcomm Incorporated;Northwestern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.qualcomm.com;https://www.northwestern.edu", "aff_unique_abbr": "Qualcomm;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "nDY6Y5x9vkA", "title": "A Two-Stage Data-Free Adversarial Patch Generation Framework", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "General adversarial patch generation (APG) methods rely on training datasets of target models and are not applicable to data-free scenarios. This article presents a two-stage APG framework that exploits a determined proxy dataset in place of an unknown training dataset. The proxy dataset selection stage calculates the proposed average patch saliency (APS) of each available dataset to select a high-APS proxy dataset that can guarantee patches' fooling abilities. 
Then, the patch generation stage applies the proposed data-free Expectation over Transformation (DF-EoT) as the APG method in case only low-APS datasets are available. Evaluation results show that the determined high-APS proxy datasets enable EoT (benchmark APG method) to generate patches of comparable fooling abilities to patches utilising training datasets, and DF-EoT can further improve the fooling abilities for both low-APS and high-APS proxy datasets. Specifically, DF-EoT enhances average targeted fooling rates (ATFR) of patches utilising a low-APS dataset from 42.71% of EoT to 78.34% on target model VGG-19 and increases ATFR from 62.57% to 84.33% with a high-APS dataset on Inception-v1.", "keywords": "Deep Learning;Computer Vision;Adversarial Attack", "primary_area": "", "supplementary_material": "/attachment/28823c80387dbf7321cff95405a702f49652066a.zip", "author": "Jiawei Liu;Hang Gao;Yunfeng hu;Xun Gong", "authorids": "~Jiawei_Liu3;~Hang_Gao4;~Yunfeng_hu1;~Xun_Gong3", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/jiaweiliu/;https://github.com/Herbert-Gao;https://dce.jlu.edu.cn/info/1182/5267.htm;https://sites.google.com/view/xungong-jlu-ai/home", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jiawei_Liu3;~Hang_Gao4;~Yunfeng_hu1;~Xun_Gong3", "aff": "Jilin University, China;Jilin University, China;Jilin University;", "aff_domain": "jlu.edu.cn;jlu.edu.cn;jlu.edu;", "position": "MS student;MS student;Full Professor;", "bibtex": "@misc{\nliu2022a,\ntitle={A Two-Stage Data-Free Adversarial Patch Generation Framework},\nauthor={Jiawei Liu and Hang Gao and Yunfeng hu and Xun Gong},\nyear={2022},\nurl={https://openreview.net/forum?id=nDY6Y5x9vkA}\n}", "github": "", "project": "", "reviewers": "tTEH;58cw;gFPs", "site": "https://openreview.net/forum?id=nDY6Y5x9vkA", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;3;4", "correctness": "2;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "78;65;66", "wc_summary_review": "13;32;12", "wc_main_review": "290;195;240", "wc_review": "381;292;318", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1359;633;418", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 69.66666666666667, 5.90668171555645 ], "wc_summary_review_avg": [ 19.0, 9.201449161228174 ], "wc_main_review_avg": [ 241.66666666666666, 38.80148908940939 ], "wc_review_avg": [ 330.3333333333333, 37.366057086910075 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 803.3333333333334, 402.60016007062006 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6546536707079772, "corr_recommendation_correctness": 0.9819805060619659, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MzQxuYN_okYJ:scholar.google.com/&scioq=A+Two-Stage+Data-Free+Adversarial+Patch+Generation+Framework&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Jilin University", "aff_unique_dep": "", "aff_unique_url": "http://www.jlu.edu.cn", "aff_unique_abbr": "JLU", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "nEfdkfAyRT8", "title": "Escaping Saddle Points in Nonconvex Minimax Optimization via Cubic-Regularized Gradient Descent-Ascent", "track": "main", "status": "Reject", "tldr": "", "abstract": "The gradient descent-ascent (GDA) algorithm has been widely applied to solve nonconvex minimax optimization problems. However, the existing GDA-type algorithms can only find first-order stationary points of the envelope function of nonconvex minimax optimization problems, which does not rule out the possibility to get stuck at suboptimal saddle points. In this paper, we develop Cubic-GDA -- the first GDA-type algorithm for escaping strict saddle points in nonconvex-strongly-concave minimax optimization. Specifically, the algorithm uses gradient ascent to estimate the second-order information of the minimax objective function, and it leverages the cubic regularization technique to efficiently escape the strict saddle points. Under standard smoothness assumptions on the objective function, we show that Cubic-GDA admits an intrinsic potential function whose value monotonically decreases in the minimax optimization process. Such a property leads to a desired global convergence of Cubic-GDA to a second-order stationary point at a sublinear rate. Moreover, we analyze the convergence rate of Cubic-GDA in the full spectrum of a gradient dominant-type nonconvex geometry. Our result shows that Cubic-GDA achieves an orderwise faster convergence rate than the standard GDA for a wide spectrum of gradient dominant geometry. Our study bridges minimax optimization with second-order optimization and may inspire new developments along this direction.", "keywords": "Minimax optimization;Gradient descent-ascent;saddle point;cubic regularization;Lojasiewicz gradient geometry", "primary_area": "", "supplementary_material": "", "author": "Ziyi Chen;Qunwei Li;Yi Zhou", "authorids": "~Ziyi_Chen2;~Qunwei_Li1;~Yi_Zhou2", "gender": "M;M;M", "homepage": ";;https://sites.google.com/site/yizhouhomepage/home", "dblp": "37/1439-2;122/5081;", "google_scholar": "zjSBVOIAAAAJ;hMEgrQ4AAAAJ;4fK8bYIAAAAJ", "orcid": ";;", "linkedin": "ziyi-chen-84616184/;;", "or_profile": "~Ziyi_Chen2;~Qunwei_Li1;~Yi_Zhou2", "aff": "University of Utah;Ant Group;University of Utah", "aff_domain": "utah.edu;antgroup.com;utah.edu", "position": "PhD student;Engineer;Assistant Professor", "bibtex": "@misc{\nchen2022escaping,\ntitle={Escaping Saddle Points in Nonconvex Minimax Optimization via Cubic-Regularized Gradient Descent-Ascent},\nauthor={Ziyi Chen and Qunwei Li and Yi Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=nEfdkfAyRT8}\n}", "github": "", "project": "", "reviewers": "ENLL;W1TD;N6EU;Xmar;c2gL", "site": "https://openreview.net/forum?id=nEfdkfAyRT8", "pdf_size": 0, "recommendation": "3;3;5;6;8", "confidence": "5;4;4;5;3", "correctness": "4;4;4;4;4", "technical_novelty": "2;2;4;4;3", "empirical_novelty": "0;1;0;3;3", "wc_summary_paper": "53;59;40;106;87", "wc_summary_review": "33;39;54;81;63", "wc_main_review": "304;469;214;301;331", "wc_review": "390;567;308;488;481", "wc_reply_reviewers": "93;0;0;0;0", "wc_reply_authors": "587;638;528;414;221", "reply_reviewers": "1;0;0;0;0", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 5.0, 1.8973665961010275 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.8944271909999159 ], 
"empirical_novelty_avg": [ 1.4, 1.3564659966250536 ], "wc_summary_paper_avg": [ 69.0, 24.041630560342615 ], "wc_summary_review_avg": [ 54.0, 17.181385275931625 ], "wc_main_review_avg": [ 323.8, 82.586681735011 ], "wc_review_avg": [ 446.8, 89.22645347653352 ], "wc_reply_reviewers_avg": [ 18.6, 37.2 ], "wc_reply_authors_avg": [ 477.6, 148.40835555992123 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5634361698190111, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15337373316592228497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Utah;Ant Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.utah.edu;https://www.antgroup.com", "aff_unique_abbr": "Utah;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "nG4DkcHDw_", "title": "Does Adversarial Robustness Really Imply Backdoor Vulnerability?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent research has revealed a trade-off between the robustness against adversarial attacks and backdoor attacks. Specifically, with the increasing adversarial robustness obtained through adversarial training, the model easily memorizes the malicious behaviors embedded in poisoned data and becomes more vulnerable to backdoor attacks. Meanwhile, some studies have demonstrated that adversarial training can somewhat mitigate the effect of poisoned data during training. This paper revisits the trade-off and raises a question \\textit{whether adversarial robustness really implies backdoor vulnerability.} Based on thorough experiments, we find that such trade-off ignores the interactions between the perturbation budget of adversarial training and the magnitude of the backdoor trigger. Indeed, an adversarially trained model is capable of achieving backdoor robustness as long as the perturbation budget surpasses the trigger magnitude, while it is vulnerable to backdoor attacks only for adversarial training with a small perturbation budget. 
To always mitigate the backdoor vulnerability, we propose an adversarial-training based detection strategy and a general pipeline against backdoor attacks, which consistently brings backdoor robustness regardless of the perturbation budget.\n", "keywords": "Adversarial training;backdoor attack", "primary_area": "", "supplementary_material": "/attachment/62359b50dddebeffc731cd1e9c24867ea1a7a87f.zip", "author": "Yinghua Gao;Dongxian Wu;Jingfeng Zhang;Shu-Tao Xia;Gang Niu;Masashi Sugiyama", "authorids": "~Yinghua_Gao1;~Dongxian_Wu1;~Jingfeng_Zhang1;~Shu-Tao_Xia1;~Gang_Niu1;~Masashi_Sugiyama1", "gender": "M;M;M;M;M;M", "homepage": ";;https://zjfheart.github.io;https://www.sigs.tsinghua.edu.cn/xst/list.htm;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/", "dblp": "275/7938.html;259/1755;227/2664.html;03/6195;26/3367-1;35/1228", "google_scholar": ";ZQzqQqwAAAAJ;NS0P1FkAAAAJ;https://scholar.google.com.hk/citations?user=koAXTXgAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ", "orcid": ";;0000-0003-3491-8074;0000-0002-8639-982X;;0000-0001-6658-6743", "linkedin": ";;;;;", "or_profile": "~Yinghua_Gao1;~Dongxian_Wu1;~Jingfeng_Zhang1;~Shu-Tao_Xia1;~Gang_Niu1;~Masashi_Sugiyama1", "aff": "Tsinghua University;The University of Tokyo;RIKEN;Shenzhen International Graduate School, Tsinghua University;RIKEN;The University of Tokyo", "aff_domain": "tsinghua.edu.cn;u-tokyo.ac.jp;riken.jp;sz.tsinghua.edu.cn;riken.jp;u-tokyo.ac.jp", "position": "PhD student;Postdoc;Postdoc;Full Professor;Research Scientist (tenured);Full Professor", "bibtex": "@misc{\ngao2022does,\ntitle={Does Adversarial Robustness Really Imply Backdoor Vulnerability?},\nauthor={Yinghua Gao and Dongxian Wu and Jingfeng Zhang and Shu-Tao Xia and Gang Niu and Masashi Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=nG4DkcHDw_}\n}", "github": "", "project": "", "reviewers": "fzki;8UQ6;Bhd4;1EYS", "site": "https://openreview.net/forum?id=nG4DkcHDw_", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "4;4;5;3", "correctness": "3;3;2;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "147;86;143;156", "wc_summary_review": "25;78;109;25", "wc_main_review": "259;524;572;343", "wc_review": "431;688;824;524", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 133.0, 27.54087870784082 ], "wc_summary_review_avg": [ 59.25, 35.960916284210555 ], "wc_main_review_avg": [ 424.5, 128.14932695882567 ], "wc_review_avg": [ 616.75, 150.94266295517647 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9671771071668731068&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;2;1", "aff_unique_norm": "Tsinghua University;University of Tokyo;RIKEN", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.tsinghua.edu.cn;https://www.u-tokyo.ac.jp;https://www.riken.jp", "aff_unique_abbr": "THU;UTokyo;RIKEN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;1;0;1;1", "aff_country_unique": "China;Japan" }, { "title": "Does your graph need a confidence boost? Convergent boosted smoothing on graphs with tabular node features", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7020", "id": "nHpzE7DqAnG", "poster": "", "openreview": "https://openreview.net/forum?id=nHpzE7DqAnG", "slides": "https://iclr.cc/virtual/2022/poster/7020", "video": "https://iclr.cc/virtual/2022/poster/7020", "author_site": "Jiuhai Chen, Jonas Mueller, Vassilis N. Ioannidis, Soji Adeshina, Yangkun Wang, Tom Goldstein, David Wipf", "tldr": "", "abstract": "Many practical modeling tasks require making predictions using tabular data composed of heterogeneous feature types (e.g., text-based, categorical, continuous, etc.). In this setting boosted decision trees and related ensembling techniques generally dominate real-world applications involving iid training/test sets. However, when there are relations between samples and the iid assumption is no longer reasonable, it remains unclear how to incorporate these dependencies within existing boosting pipelines. To this end, we propose a generalized framework for combining boosted trees and more general model ensembling techniques, with graph propagation layers that share node/sample information across edges connecting related samples. And unlike previous efforts to integrate graph-based models with boosting, our approach is anchored to a principled meta loss function such that provable convergence can be guaranteed under relatively mild assumptions. Across a variety of benchmarks involving non-iid graph data with tabular node features, our framework achieves comparable or superior performance.", "keywords": "Graph Neural Network;Boosting;Node classification;Tabular Data", "primary_area": "", "supplementary_material": "", "author": "Jiuhai Chen;Jonas Mueller;Vassilis N. Ioannidis;Soji Adeshina;Yangkun Wang;Tom Goldstein;David Wipf", "authorids": "~Jiuhai_Chen1;~Jonas_Mueller1;~Vassilis_N._Ioannidis1;~Soji_Adeshina1;~Yangkun_Wang1;~Tom_Goldstein1;~David_Wipf1", "gender": "M;M;;;;M;M", "homepage": "https://www.linkedin.com/in/jiuhai-chen-6a486715a/;;https://scholar.google.com/citations?hl=en&user=mjmiI4sAAAAJ&view_op=list_works&authuser=1;;;https://www.cs.umd.edu/~tomg/;http://www.davidwipf.com/", "dblp": ";178/3250;;298/4855;;25/8184;81/6421", "google_scholar": ";HeVcLzAAAAAJ;;O2IS5isAAAAJ;;KmSuVtgAAAAJ;YJx1WSgAAAAJ", "orcid": ";;0000-0002-8367-0733;;;;", "linkedin": ";;;sojiadeshina/;;;", "or_profile": "~Jiuhai_Chen1;~Jonas_Mueller1;~Vassilis_N._Ioannidis1;~Soji_Adeshina1;~Yangkun_Wang1;~Tom_Goldstein1;~David_Wipf1", "aff": "University of Maryland, College Park;Amazon;Amazon Web Services;Amazon;;University of Maryland, College Park;Amazon AI Research Lab", "aff_domain": "umd.edu;amazon.com;amazon.com;amazon.com;;umd.edu;amazon.com", "position": "PhD student;Scientist;Applied Scientist II;Researcher;;Associate Professor;Principal Research Scientist", "bibtex": "@inproceedings{\nchen2022does,\ntitle={Does your graph need a confidence boost? Convergent boosted smoothing on graphs with tabular node features},\nauthor={Jiuhai Chen and Jonas Mueller and Vassilis N. 
Ioannidis and Soji Adeshina and Yangkun Wang and Tom Goldstein and David Wipf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nHpzE7DqAnG}\n}", "github": "", "project": "", "reviewers": "zcK6;omBE;ahft;Mnik", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;2;3;3", "correctness": "4;3;4;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "93;40;44;86", "wc_summary_review": "31;65;34;64", "wc_main_review": "115;303;271;589", "wc_review": "239;408;349;739", "wc_reply_reviewers": "54;20;0;0", "wc_reply_authors": "604;555;204;1204", "reply_reviewers": "1;1;0;0", "reply_authors": "3;1;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.75, 23.920441049445557 ], "wc_summary_review_avg": [ 48.5, 16.03901493234544 ], "wc_main_review_avg": [ 319.5, 171.08112110925623 ], "wc_review_avg": [ 433.75, 186.38049120012533 ], "wc_reply_reviewers_avg": [ 18.5, 22.06241147291021 ], "wc_reply_authors_avg": [ 641.75, 359.409498344159 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12401387776343070829&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nHpzE7DqAnG", "email": "umd.edu;amazon.com;amazon.com;amazon.com;;umd.edu;amazon.com", "author_num": 7, "aff_unique_index": "0;1;1;1;0;1", "aff_unique_norm": "University of Maryland;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.umd.edu;https://www.amazon.com", "aff_unique_abbr": "UMD;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "nK7eZEURiJ4", "title": "Towards Understanding Distributional Reinforcement Learning: Regularization, Optimization, Acceleration and Sinkhorn Algorithm", "track": "main", "status": "Reject", "tldr": "", "abstract": "Distributional reinforcement learning~(RL) is a class of state-of-the-art algorithms that estimate the whole distribution of the total return rather than only its expectation. Despite the remarkable performance of distributional RL, a theoretical understanding of its advantages over expectation-based RL remains elusive. In this paper, we interpret distributional RL as entropy-regularized maximum likelihood estimation in the \\textit{neural Z-fitted iteration} framework and establish the connection of the resulting risk-aware regularization with maximum entropy RL. In addition, we shed light on the stability-promoting distributional loss with desirable smoothness properties in distributional RL, which can yield stable optimization and guaranteed generalization. We also analyze the acceleration behavior while optimizing distributional RL algorithms and show that an appropriate approximation to the true target distribution can speed up the convergence. From the perspective of representation, we find that distributional RL encourages state representations from the same action class, as classified by the policy, to form tighter clusters. 
Finally, we propose a class of \\textit{Sinkhorn distributional RL} algorithms that interpolate between the Wasserstein distance and maximum mean discrepancy~(MMD). Experiments on a suite of Atari games reveal the competitive performance of our algorithm relative to existing state-of-the-art distributional RL algorithms.", "keywords": "distributional reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ke Sun;Yingnan Zhao;Yi Liu;Enze Shi;Yafei Wang;Aref Sadeghi;Xiaodong Yan;Bei Jiang;Linglong Kong", "authorids": "~Ke_Sun6;~Yingnan_Zhao1;~Yi_Liu13;eshi@ualberta.ca;~Yafei_Wang1;asadegh1@ualberta.ca;yanxiaodong@sdu.edu.cn;~Bei_Jiang1;~Linglong_Kong2", "gender": "M;M;M;;;;;F;M", "homepage": "https://sites.google.com/view/kesun;;https://apps.ualberta.ca/directory/person/yliu16;;https://apps.ualberta.ca/directory/person/yafei2;;;https://www.ualberta.ca/~bei1;https://www.ualberta.ca/~lkong", "dblp": "69/476-13;;97/4626-62;;;;;190/4697;35/8525", "google_scholar": "lYdNhFQAAAAJ;NMgYY5cAAAAJ;;;6zZR3_gAAAAJ;;;https://scholar.google.ca/citations?user=MfOZ8G0AAAAJ;https://scholar.google.ca/citations?hl=en", "orcid": ";;;;;;;0000-0002-0033-839X;0000-0003-3011-9216", "linkedin": ";;;;;;;;", "or_profile": "~Ke_Sun6;~Yingnan_Zhao1;~Yi_Liu13;eshi@ualberta.ca;~Yafei_Wang1;asadegh1@ualberta.ca;yanxiaodong@sdu.edu.cn;~Bei_Jiang1;~Linglong_Kong2", "aff": "University of Alberta;;University of Alberta;;University of Alberta;;;University of Alberta;University of Alberta", "aff_domain": "ualberta.ca;;ualberta.ca;;ualberta.ca;;;ualberta.ca;ualberta.ca", "position": "PhD student;;PhD student;;Postdoc;;;Associate Professor;Associate Professor", "bibtex": "@misc{\nsun2022towards,\ntitle={Towards Understanding Distributional Reinforcement Learning: Regularization, Optimization, Acceleration and Sinkhorn Algorithm},\nauthor={Ke Sun and Yingnan Zhao and Yi Liu and Enze Shi and Yafei Wang and Aref Sadeghi and Xiaodong Yan and Bei Jiang and Linglong Kong},\nyear={2022},\nurl={https://openreview.net/forum?id=nK7eZEURiJ4}\n}", "github": "", "project": "", "reviewers": "Qz4z;r4R2;Q4X1;GhjJ", "site": "https://openreview.net/forum?id=nK7eZEURiJ4", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;5;3;4", "correctness": "1;4;2;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;0;2", "wc_summary_paper": "29;46;91;29", "wc_summary_review": "50;81;112;17", "wc_main_review": "645;449;693;320", "wc_review": "724;576;896;366", "wc_reply_reviewers": "0;0;36;0", "wc_reply_authors": "549;392;417;438", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 1.299038105676658 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 48.75, 25.36114153582208 ], "wc_summary_review_avg": [ 65.0, 35.33411948810951 ], "wc_main_review_avg": [ 526.75, 150.3435648772504 ], "wc_review_avg": [ 640.5, 194.78385456705595 ], "wc_reply_reviewers_avg": [ 9.0, 15.588457268119896 ], "wc_reply_authors_avg": [ 449.0, 59.98749869764533 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16899585936945700451&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Alberta", "aff_unique_dep": "", "aff_unique_url": "https://www.ualberta.ca", "aff_unique_abbr": "UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "title": "AlphaZero-based Proof Cost Network to Aid Game Solving", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7121", "id": "nKWjE4QF1hB", "poster": "", "openreview": "https://openreview.net/forum?id=nKWjE4QF1hB", "slides": "https://iclr.cc/virtual/2022/poster/7121", "video": "https://iclr.cc/virtual/2022/poster/7121", "author_site": "Ti-Rong Wu, Chung-Chin Shih, Ting Han Wei, Meng Yu Tsai, Wei-Yuan Hsu, I-Chen Wu", "tldr": "", "abstract": "The AlphaZero algorithm learns and plays games without hand-crafted expert knowledge. However, since its objective is to play well, we hypothesize that a better objective can be defined for the related but separate task of solving games. This paper proposes a novel approach to solving problems by modifying the training target of the AlphaZero algorithm, such that it prioritizes solving the game quickly, rather than winning. We train a Proof Cost Network (PCN), where proof cost is a heuristic that estimates the amount of work required to solve problems. This matches the general concept of the so-called proof number from proof number search, which has been shown to be well-suited for game solving. We propose two specific training targets. The first finds the shortest path to a solution, while the second estimates the proof cost. We conduct experiments on solving 15x15 Gomoku and 9x9 Killall-Go problems with both MCTS-based and FDFPN solvers. Comparisons between using AlphaZero networks and PCN as heuristics show that PCN can solve more problems.", "keywords": "Monte-Carlo Tree Search;Solving Games;AlphaZero;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/74372572d1441e09d2444d5562c08ab56db2eaa7.zip", "author": "Ti-Rong Wu;Chung-Chin Shih;Ting Han Wei;Meng-Yu Tsai;Wei-Yuan Hsu;I-Chen Wu", "authorids": "~Ti-Rong_Wu1;~Chung-Chin_Shih2;~Ting_Han_Wei1;~Meng-Yu_Tsai1;~Wei-Yuan_Hsu1;~I-Chen_Wu3", "gender": ";M;M;M;M;M", "homepage": "https://www.iis.sinica.edu.tw/pages/tirongwu;;;;;https://cgilab.nctu.edu.tw/~icwu/", "dblp": "200/8767;131/2745.html;https://dblp.uni-trier.de/pers/hd/w/Wei:Ting=Han;17/4105;;06/983", "google_scholar": "sjt4scUAAAAJ;;;;https://scholar.google.com.tw/citations?view_op=list_works;", "orcid": "0000-0002-7532-3176;;;0009-0005-4126-6141;;0000-0003-2535-0587", "linkedin": ";;;;;", "or_profile": "~Ti-Rong_Wu1;~Chung-Chin_Shih2;~Ting_Han_Wei1;~Meng-Yu_Tsai1;~Wei-Yuan_Hsu1;~I-Chen_Wu3", "aff": "National Yang Ming Chiao Tung University;National Chiao Tung University, National Chiao Tung University;University of Alberta;Independent;;Academia Sinica", "aff_domain": "nycu.edu.tw;cs.nctu.edu.tw;ualberta.ca;gmail.com;;sinica.edu.tw", "position": "Postdoc;PhD student;Postdoc;Independent Researcher;;Research Fellow", "bibtex": "@inproceedings{\nwu2022alphazerobased,\ntitle={AlphaZero-based Proof Cost Network to Aid Game Solving},\nauthor={Ti-Rong Wu and Chung-Chin Shih and Ting Han Wei and Meng-Yu Tsai and Wei-Yuan Hsu and I-Chen Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nKWjE4QF1hB}\n}", "github": "", "project": "", "reviewers": "8SpM;EgiY;b9Ze;CHG3", "pdf_size": 0, 
"recommendation": "5;5;8;8", "confidence": "3;3;4;3", "correctness": "4;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "60;97;70;77", "wc_summary_review": "18;33;90;23", "wc_main_review": "363;143;976;317", "wc_review": "441;273;1136;417", "wc_reply_reviewers": "186;101;119;28", "wc_reply_authors": "1354;350;503;193", "reply_reviewers": "2;2;2;1", "reply_authors": "4;3;3;2", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 76.0, 13.546217184144066 ], "wc_summary_review_avg": [ 41.0, 28.801041647829337 ], "wc_main_review_avg": [ 449.75, 314.71524828009206 ], "wc_review_avg": [ 566.75, 334.877869528579 ], "wc_reply_reviewers_avg": [ 108.5, 56.24277731406941 ], "wc_reply_authors_avg": [ 600.0, 448.9081197750827 ], "reply_reviewers_avg": [ 1.75, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 0.7071067811865476 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896258, "corr_recommendation_correctness": -1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11791482733960981863&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=nKWjE4QF1hB", "email": "nycu.edu.tw;cs.nctu.edu.tw;ualberta.ca;gmail.com;;sinica.edu.tw", "author_num": 6, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "National Yang Ming Chiao Tung University;National Chiao Tung University;University of Alberta;Independent;Academia Sinica", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.nycu.edu.tw;https://www.nctu.edu.tw;https://www.ualberta.ca;;https://www.sinica.edu.tw", "aff_unique_abbr": "NYCU;NCTU;UAlberta;;Academia Sinica", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;Canada;" }, { "id": "nKZvpGRdJlG", "title": "Mind Your Solver! On Adversarial Attack and Defense for Combinatorial Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Combinatorial optimization (CO) is a long-standing challenging task not only in its inherent complexity (e.g. NP-hard) but also the possible sensitivity to input conditions. In this paper, we take an initiative on developing the mechanisms for adversarial attack and defense towards combinatorial optimization solvers, whereby the solver is treated as a black-box function and the original problem's underlying graph structure (which is often available and associated with the problem instance, e.g. DAG, TSP) is attacked under a given budget. Experimental results on three real-world combinatorial optimization problems reveal the vulnerability of existing solvers to adversarial attack, including the commercial solvers like Gurobi. In particular, we present a simple yet effective defense strategy to modify the graph structure to increase the robustness of solvers, which shows its universal effectiveness across tasks and solvers. 
", "keywords": "Adversarial Attack;Combinatorial Optimization;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Han Lu;Zenan Li;Runzhong Wang;Qibing Ren;Junchi Yan;Zhigang Hua;Gan Liu;JUN ZHOU;Xiaokang Yang", "authorids": "~Han_Lu2;~Zenan_Li4;~Runzhong_Wang1;~Qibing_Ren1;~Junchi_Yan2;~Zhigang_Hua1;~Gan_Liu1;~JUN_ZHOU6;~Xiaokang_Yang1", "gender": "M;M;M;;;;M;M;M", "homepage": ";https://github.com/Emiyalzn;http://runzhong.wang;;;;;https://scholar.google.com/citations?user=mCVvloEAAAAJ&hl=en;https://icne.sjtu.edu.cn/info/1064/1078.htm", "dblp": ";;239/4351;;;;;99/3847-11;06/3071-1.html", "google_scholar": "HESzE0UAAAAJ;;uoM0g3cAAAAJ;;;;;mCVvloEAAAAJ;yDEavdMAAAAJ", "orcid": ";;0000-0002-9566-738X;;;;;0000-0001-6033-6102;0000-0003-4029-3322", "linkedin": ";;;;;;gan-liu-b870195b/;;", "or_profile": "~Han_Lu2;~Zenan_Li4;~Runzhong_Wang1;~Qibing_Ren1;~Junchi_Yan2;~Zhigang_Hua1;~Gan_Liu1;~JUN_ZHOU6;~Xiaokang_Yang1", "aff": "Shanghai Jiaotong University;Tsinghua University;Shanghai Jiaotong University;;;;Ant Group;Ant Group;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;tsinghua.edu.cn;sjtu.edu.cn;;;;antgroup.com;antgroup.com;sjtu.edu.cn", "position": "PhD student;Intern;PhD student;;;;Algorithm Developer;Researcher;Full Professor", "bibtex": "@misc{\nlu2022mind,\ntitle={Mind Your Solver! On Adversarial Attack and Defense for Combinatorial Optimization},\nauthor={Han Lu and Zenan Li and Runzhong Wang and Qibing Ren and Junchi Yan and Zhigang Hua and Gan Liu and JUN ZHOU and Xiaokang Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=nKZvpGRdJlG}\n}", "github": "", "project": "", "reviewers": "Pdsc;pi8H;CZdE;SYfi", "site": "https://openreview.net/forum?id=nKZvpGRdJlG", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;3;4;3", "correctness": "1;1;2;2", "technical_novelty": "3;2;1;3", "empirical_novelty": "1;3;2;2", "wc_summary_paper": "51;71;54;320", "wc_summary_review": "39;21;49;18", "wc_main_review": "546;373;309;29", "wc_review": "636;465;412;367", "wc_reply_reviewers": "647;0;0;0", "wc_reply_authors": "1387;788;393;527", "reply_reviewers": "3;0;0;0", "reply_authors": "4;1;1;1", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 1.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 124.0, 113.41737080359428 ], "wc_summary_review_avg": [ 31.75, 12.794041581923986 ], "wc_main_review_avg": [ 314.25, 186.11471596840482 ], "wc_review_avg": [ 470.0, 101.9239912876257 ], "wc_reply_reviewers_avg": [ 161.75, 280.1592181242659 ], "wc_reply_authors_avg": [ 773.75, 381.48877768553035 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17743484271377956810&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;2;0", "aff_unique_norm": "Shanghai Jiao Tong University;Tsinghua University;Ant Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.tsinghua.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "SJTU;THU;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": 
"China" }, { "id": "nL2lDlsrZU", "title": "SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Tabular data underpins numerous high-impact applications of machine learning from fraud detection to genomics and healthcare. Classical approaches to solving tabular problems, such as gradient boosting and random forests, are widely used by practitioners. However, recent deep learning methods have achieved a degree of performance competitive with popular techniques. We devise a hybrid deep learning approach to solving tabular data problems. Our method, SAINT, performs attention over both rows and columns, and it includes an enhanced embedding method. We also study a new contrastive self-supervised pre-training method for use when labels are scarce. SAINT consistently improves performance over previous deep learning methods, and it even performs competitively with gradient boosting methods, including XGBoost, CatBoost, and LightGBM, on average over $30$ benchmark datasets in regression, binary classification, and multi-class classification tasks.", "keywords": "Transformer;Tabular;Attention;Contrastive Pre-Training", "primary_area": "", "supplementary_material": "/attachment/0e424ccb4581a69e0fbe435e3fd13c611ac247ad.zip", "author": "Gowthami Somepalli;Avi Schwarzschild;Micah Goldblum;C. Bayan Bruss;Tom Goldstein", "authorids": "~Gowthami_Somepalli1;~Avi_Schwarzschild1;~Micah_Goldblum1;~C._Bayan_Bruss1;~Tom_Goldstein1", "gender": "F;M;;M;M", "homepage": "https://somepago.github.io/;https://cs.umd.edu/~avi1;;https://www.cbbruss.com;https://www.cs.umd.edu/~tomg/", "dblp": "286/5012;249/9334.html;241/7231;;25/8184", "google_scholar": "T2ezBDsAAAAJ;WNvQ7AcAAAAJ;pGDKzuUAAAAJ;ClqvGRQAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;", "linkedin": ";;;bayan-bruss/;", "or_profile": "~Gowthami_Somepalli1;~Avi_Schwarzschild1;~Micah_Goldblum1;~C._Bayan_Bruss1;~Tom_Goldstein1", "aff": "University of Maryland, College Park;University of Maryland, College Park;New York University;Capital One;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;nyu.edu;capitalone.com;umd.edu", "position": "PhD student;PhD student;Postdoc;Director of Applied Research;Associate Professor", "bibtex": "@misc{\nsomepalli2022saint,\ntitle={{SAINT}: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training},\nauthor={Gowthami Somepalli and Avi Schwarzschild and Micah Goldblum and C. 
Bayan Bruss and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=nL2lDlsrZU}\n}", "github": "", "project": "", "reviewers": "WEK7;Qd8Y;bykr;ufhZ", "site": "https://openreview.net/forum?id=nL2lDlsrZU", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "221;61;36;48", "wc_summary_review": "103;51;29;27", "wc_main_review": "430;311;279;569", "wc_review": "754;423;344;644", "wc_reply_reviewers": "0;0;91;0", "wc_reply_authors": "939;419;1392;729", "reply_reviewers": "0;0;2;0", "reply_authors": "2;1;4;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 91.5, 75.2877812131557 ], "wc_summary_review_avg": [ 52.5, 30.63902739970706 ], "wc_main_review_avg": [ 397.25, 114.00959389454907 ], "wc_review_avg": [ 541.25, 164.85656644489475 ], "wc_reply_reviewers_avg": [ 22.75, 39.40415587219196 ], "wc_reply_authors_avg": [ 869.75, 353.73957581814335 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 385, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12173783028299097998&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Maryland;New York University;Capital One", "aff_unique_dep": ";;", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu;https://www.capitalone.com", "aff_unique_abbr": "UMD;NYU;Capital One", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "nLb60uXd6Np", "title": "Geometric Algebra Attention Networks for Small Point Clouds", "track": "main", "status": "Reject", "tldr": "", "abstract": "Much of the success of deep learning is drawn from building architectures that properly respect underlying symmetry and structure in the data on which they operate\u2014a set of considerations that have been united under the banner of geometric deep learning. Often problems in the physical sciences deal with relatively small sets of points in two- or three-dimensional space wherein translation, rotation, and permutation equivariance are important or even vital for models to be useful in practice. In this work, we present rotation- and permutation-equivariant architectures for deep learning on these small point clouds, composed of a set of products of terms from the geometric algebra and reductions over those products using an attention mechanism. The geometric algebra provides valuable mathematical structure by which to combine vector, scalar, and other types of geometric inputs in a systematic way to account for rotation invariance or covariance, while attention yields a powerful way to impose permutation equivariance. 
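The pairing this abstract describes, invariant geometric products feeding an attention-based permutation-equivariant reduction, is easiest to see in a stripped-down case. The sketch below uses only pairwise squared distances, the simplest rotation- and translation-invariant quantity; the paper's layers use richer geometric-algebra products, so treat this as an assumed toy rather than the proposed architecture.

```python
import numpy as np

def invariant_attention(coords, feats, w=1.0, b=0.0):
    """Attention whose logits depend only on pairwise squared distances, so the
    output is invariant to rotating/translating `coords` and equivariant to
    permuting the points."""
    diff = coords[:, None, :] - coords[None, :, :]
    d2 = (diff ** 2).sum(-1)                 # rotation-invariant geometry
    att = np.exp(-(w * d2 + b))              # closer points attend more strongly
    att /= att.sum(-1, keepdims=True)        # normalize over neighbours
    return att @ feats                       # permutation-equivariant reduction

rng = np.random.default_rng(1)
x, h = rng.normal(size=(5, 3)), rng.normal(size=(5, 4))
theta = 0.3
R = np.array([[np.cos(theta), -np.sin(theta), 0.0],
              [np.sin(theta),  np.cos(theta), 0.0],
              [0.0,            0.0,           1.0]])
print(np.allclose(invariant_attention(x, h), invariant_attention(x @ R.T, h)))  # True
```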
We demonstrate the usefulness of these architectures by training models to solve sample problems relevant to physics, chemistry, and biology.\n", "keywords": "deep learning;geometric algebra;equivariance;geometric deep learning;rotation equivariance;permutation equivariance;chemistry;physics;biology;attention;point cloud", "primary_area": "", "supplementary_material": "/attachment/eb32b37286cc596b22756c6ea7a990fbb7011f2c.zip", "author": "Matthew Spellings", "authorids": "~Matthew_Spellings1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "a8MTO6YAAAAJ", "orcid": "0000-0002-4061-4299", "linkedin": "", "or_profile": "~Matthew_Spellings1", "aff": "Vector Institute", "aff_domain": "vectorinstitute.ai", "position": "Postdoc", "bibtex": "@misc{\nspellings2022geometric,\ntitle={Geometric Algebra Attention Networks for Small Point Clouds},\nauthor={Matthew Spellings},\nyear={2022},\nurl={https://openreview.net/forum?id=nLb60uXd6Np}\n}", "github": "", "project": "", "reviewers": "sEBH;6SEn;TD8X;vhG8", "site": "https://openreview.net/forum?id=nLb60uXd6Np", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;2;3;3", "correctness": "2;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;4;2", "wc_summary_paper": "78;86;67;78", "wc_summary_review": "13;54;52;41", "wc_main_review": "407;188;198;222", "wc_review": "498;328;317;341", "wc_reply_reviewers": "118;0;0;79", "wc_reply_authors": "816;375;392;451", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 77.25, 6.7592529172978875 ], "wc_summary_review_avg": [ 40.0, 16.355427233796124 ], "wc_main_review_avg": [ 253.75, 89.33749212956451 ], "wc_review_avg": [ 371.0, 73.81395532011545 ], "wc_reply_reviewers_avg": [ 49.25, 51.143792389692806 ], "wc_reply_authors_avg": [ 508.5, 179.76164774500705 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9245769141863874607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Vector Institute", "aff_unique_dep": "", "aff_unique_url": "https://vectorinstitute.ai/", "aff_unique_abbr": "Vector Institute", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "nMo44IjBHX5", "title": "Continual Learning Using Pseudo-Replay via Latent Space Sampling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper investigates continual learning in the setting of class-incremental learning (CIL). Although numerous techniques have been proposed, CIL remains a highly challenging problem due to catastrophic forgetting (CF). However, so far few existing techniques have made use of pre-trained image feature extractors. In this paper, we propose to use a recently reported strong pre-trained feature extractor called CLIP and also propose a novel yet simple pseudo-replay method to deal with CF. The proposed method is called PLS. 
Unlike the popular pseudo-replay approach that builds data generators to generate pseudo previous task data, PLS works in the latent space by sampling pseudo feature representations of previous tasks from the last layer of the pre-trained feature extractor. PLS is not only simple and efficient but also does not invade data privacy due to the fact that it works in the latent feature space. Experimental results show that the proposed method PLS outperforms state-of-the-art baselines by a large margin, where both PLS and the baselines leverage the CLIP pre-trained image feature extractor.", "keywords": "Continual learning;lifelong learning", "primary_area": "", "supplementary_material": "/attachment/11ec0721a8bd900f45d5f85de642b6104c50d709.zip", "author": "Gyuhak Kim;Sepideh Esmaeilpour;Zixuan Ke;Tatsuya Konishi;Bing Liu", "authorids": "~Gyuhak_Kim1;~Sepideh_Esmaeilpour1;~Zixuan_Ke1;~Tatsuya_Konishi2;~Bing_Liu1", "gender": ";F;M;M;M", "homepage": "https://k-gyuhak.github.io/;;https://vincent950129.github.io/;https://www.cs.uic.edu/~liub/;", "dblp": "317/0166;251/9539;196/3817;l/BingLiu1.html;185/3974.html", "google_scholar": "https://scholar.google.com/citations?hl=en;mMsSs-8AAAAJ;SZ4sFNEAAAAJ;Kt1bjZoAAAAJ;tx15SxoAAAAJ", "orcid": ";;;;0000-0002-2255-0156", "linkedin": ";sepideh-esmaeilpour-ab723686/;;;ukaznil/", "or_profile": "~Gyuhak_Kim1;~Sepideh_Esmaeilpour1;~Zixuan_Ke1;~Bing_Liu1;~Tatsuya_KONISHI1", "aff": "University of Illinois, Chicago;University of Illinois, Chicago;University of Illinois, Chicago;University of Illinois at Chicago;KDDI Research, Inc.", "aff_domain": "uic.edu;uic.edu;uic.edu;uic.edu;kddi-research.jp", "position": "PhD student;PhD student;PhD student;Full Professor;Researcher", "bibtex": "@misc{\nkim2022continual,\ntitle={Continual Learning Using Pseudo-Replay via Latent Space Sampling},\nauthor={Gyuhak Kim and Sepideh Esmaeilpour and Zixuan Ke and Tatsuya Konishi and Bing Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=nMo44IjBHX5}\n}", "github": "", "project": "", "reviewers": "DDKt;TNwZ;gzmy", "site": "https://openreview.net/forum?id=nMo44IjBHX5", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;3", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "53;76;41", "wc_summary_review": "45;19;32", "wc_main_review": "197;200;177", "wc_review": "295;295;250", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 56.666666666666664, 14.522013940527977 ], "wc_summary_review_avg": [ 32.0, 10.614455552060438 ], "wc_main_review_avg": [ 191.33333333333334, 10.208928554075703 ], "wc_review_avg": [ 280.0, 21.213203435596427 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17230323113192109708&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": 
"University of Illinois at Chicago;KDDI Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.uic.edu;https://www.kddi-research.com", "aff_unique_abbr": "UIC;KDDI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;Japan" }, { "id": "nNpDhjI2T_s", "title": "Learning to Coordinate in Multi-Agent Systems: A Coordinated Actor-Critic Algorithm and Finite-Time Guarantees", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multi-agent reinforcement learning (MARL) has attracted much research attention recently. However, unlike its single-agent counterpart, many theoretical and algorithmic aspects of MARL have not been well-understood. In this paper, we study the emergence of coordinated behavior by autonomous agents using an actor-critic (AC) algorithm. Specifically, we propose and analyze a class of coordinated actor-critic algorithms (CAC) in which individually parametrized policies have a shared part (which is jointly optimized among all agents) and a personalized part (which is only locally optimized). Such kind of partially personalized policy allows agents to learn to coordinate by leveraging peers\u2019 past experience and adapt to individual tasks. The flexibility in our design allows the proposed MARL-CAC algorithm to be used in a fully decentralized setting, where the agents can only communicate with their neighbors, as well as a federated setting, where the agents occasionally communicate with a server while optimizing their (partially personalized) local models. Theoretically, we show that under some standard regularity assumptions, the proposed MARL-CAC algorithm requires $\\mathcal{O}(\\epsilon^{-\\frac{5}{2}})$ samples to achieve an\u000f$\\epsilon$-stationary solution (defined as the solution whose squared norm of the gradient of the objective function is less than\u000f$\\epsilon$). 
To the best of our knowledge, this work provides the first finite-sample guarantee for decentralized AC algorithm with partially personalized policies.\n", "keywords": "Reinforcement Learning;Multi-Agent System;Optimization", "primary_area": "", "supplementary_material": "", "author": "Siliang Zeng;Tianyi Chen;Alfredo Garcia;Mingyi Hong", "authorids": "~Siliang_Zeng1;~Tianyi_Chen5;~Alfredo_Garcia1;~Mingyi_Hong1", "gender": "M;M;M;M", "homepage": "https://siliangzeng.github.io/index.html;https://chentianyi1991.github.io/;https://agarcia.engr.tamu.edu;http://people.ece.umn.edu/~mhong/mingyi.html", "dblp": "38/9;;;57/8053", "google_scholar": "IfqsDyYAAAAJ;kFwvv38AAAAJ;;qRnP-p0AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Siliang_Zeng1;~Tianyi_Chen5;~Alfredo_Garcia1;~Mingyi_Hong1", "aff": "University of Minnesota, Twin Cities;Rensselaer Polytechnic Institute;Texas A&M University - College Station;University of Minnesota, Minneapolis", "aff_domain": "umn.edu;rpi.edu;tamu.edu;umn.edu", "position": "PhD student;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@misc{\nzeng2022learning,\ntitle={Learning to Coordinate in Multi-Agent Systems: A Coordinated Actor-Critic Algorithm and Finite-Time Guarantees},\nauthor={Siliang Zeng and Tianyi Chen and Alfredo Garcia and Mingyi Hong},\nyear={2022},\nurl={https://openreview.net/forum?id=nNpDhjI2T_s}\n}", "github": "", "project": "", "reviewers": "2aty;WYxG;bbD4", "site": "https://openreview.net/forum?id=nNpDhjI2T_s", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;4;2", "correctness": "3;4;4", "technical_novelty": "3;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "79;58;125", "wc_summary_review": "87;41;34", "wc_main_review": "463;411;273", "wc_review": "629;510;432", "wc_reply_reviewers": "122;0;78", "wc_reply_authors": "1624;1583;1272", "reply_reviewers": "1;0;2", "reply_authors": "3;3;4", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 87.33333333333333, 27.980151695244412 ], "wc_summary_review_avg": [ 54.0, 23.50886357667394 ], "wc_main_review_avg": [ 382.3333333333333, 80.17203724214295 ], "wc_review_avg": [ 523.6666666666666, 81.00342928268873 ], "wc_reply_reviewers_avg": [ 66.66666666666667, 50.446891766380304 ], "wc_reply_authors_avg": [ 1493.0, 157.16445738991584 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14486045675400229551&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Minnesota;Rensselaer Polytechnic Institute;Texas A&M University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.minnesota.edu;https://www.rpi.edu;https://www.tamu.edu", "aff_unique_abbr": "UMN;RPI;TAMU", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "Twin Cities;;College Station;Minneapolis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "nNqA3yrZdDJ", "title": "Sample Complexity of Offline Reinforcement Learning with Deep 
ReLU Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Offline reinforcement learning (RL) leverages previously collected data for policy optimization without any further active exploration. Despite the recent interest in this problem, its theoretical foundations in neural network function approximation settings remain limited. In this paper, we study the statistical theory of offline RL with deep ReLU network function approximation. In particular, we establish the sample complexity of $\\tilde{\\mathcal{O}}\\left( \\kappa^{1 + d/\\alpha} \\cdot \\epsilon^{-2 - 2d/\\alpha} \\right)$ for offline RL with deep ReLU networks, where $\\kappa$ is a measure of distributional shift, $d$ is the dimension of the state-action space, $\\alpha$ is a (possibly fractional) smoothness parameter of the underlying Markov decision process (MDP), and $\\epsilon$ is a desired error. Notably, our sample complexity holds under two novel considerations, namely the Besov dynamic closure and the correlated structure that arises from value regression for offline RL. While the Besov dynamic closure generalizes the dynamic conditions for offline RL in the prior works, the correlated structure renders the existing analyses improper or inefficient. To our knowledge, our work is the first to provide such a comprehensive analysis for offline RL with deep ReLU network function approximation. ", "keywords": "offline reinforcement learning;deep ReLU networks;function approximation", "primary_area": "", "supplementary_material": "", "author": "Thanh Nguyen-Tang;Sunil Gupta;Hung Tran-The;Svetha Venkatesh", "authorids": "~Thanh_Nguyen-Tang1;~Sunil_Gupta2;~Hung_Tran-The1;~Svetha_Venkatesh1", "gender": "M;F;M;M", "homepage": ";https://www.deakin.edu.au/about-deakin/people/svetha-venkatesh;https://personal-sites.deakin.edu.au/~sunilg/;https://thanhnguyentang.github.io/", "dblp": "76/9697;81/1984;47/333-1;287/5102.html", "google_scholar": "https://scholar.google.com.au/citations?user=um-FS-gAAAAJ;AEkRUQcAAAAJ;https://scholar.google.com.au/citations?user=bXeL2t8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-3308-1930;0000-0002-1917-2190", "linkedin": ";;;thanhnguyentang/", "or_profile": "~Hung_Tran-The1;~Svetha_Venkatesh1;~Sunil_Kumar_Gupta1;~Thanh_Tang_Nguyen2", "aff": "Deakin University;Deakin University;Deakin University;Deakin University", "aff_domain": "deakin.edu.au;deakin.edu.au;deakin.edu.au;deakin.edu.au", "position": "Researcher;Full Professor;Associate Professor;PhD student", "bibtex": "@misc{\nnguyen-tang2022sample,\ntitle={Sample Complexity of Offline Reinforcement Learning with Deep Re{LU} Networks },\nauthor={Thanh Nguyen-Tang and Sunil Gupta and Hung Tran-The and Svetha Venkatesh},\nyear={2022},\nurl={https://openreview.net/forum?id=nNqA3yrZdDJ}\n}", "github": "", "project": "", "reviewers": "CGHQ;3Jm8;nefQ;w6PR", "site": "https://openreview.net/forum?id=nNqA3yrZdDJ", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;3;3", "correctness": "4;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;1;0;0", "wc_summary_paper": "97;50;86;52", "wc_summary_review": "20;43;6;28", "wc_main_review": "310;533;464;340", "wc_review": "427;626;556;420", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], 
"empirical_novelty_avg": [ 0.5, 0.5 ], "wc_summary_paper_avg": [ 71.25, 20.632195714465293 ], "wc_summary_review_avg": [ 24.25, 13.386093530227555 ], "wc_main_review_avg": [ 411.75, 90.73691365701173 ], "wc_review_avg": [ 507.25, 87.36525339057857 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15200171882289778659&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Deakin University", "aff_unique_dep": "", "aff_unique_url": "https://www.deakin.edu.au", "aff_unique_abbr": "Deakin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "title": "Efficient Active Search for Combinatorial Optimization Problems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6761", "id": "nO5caZwFwYu", "poster": "", "openreview": "https://openreview.net/forum?id=nO5caZwFwYu", "slides": "https://iclr.cc/virtual/2022/poster/6761", "video": "https://iclr.cc/virtual/2022/poster/6761", "author_site": "Andr\u00e9 Hottung, Yeong Dae Kwon, Kevin Tierney", "tldr": "", "abstract": "Recently numerous machine learning based methods for combinatorial optimization problems have been proposed that learn to construct solutions in a sequential decision process via reinforcement learning. While these methods can be easily combined with search strategies like sampling and beam search, it is not straightforward to integrate them into a high-level search procedure offering strong search guidance. Bello et al. (2016) propose active search, which adjusts the weights of a (trained) model with respect to a single instance at test time using reinforcement learning. While active search is simple to implement, it is not competitive with state-of-the-art methods because adjusting all model weights for each test instance is very time and memory intensive. Instead of updating all model weights, we propose and evaluate three efficient active search strategies that only update a subset of parameters during the search. The proposed methods offer a simple way to significantly improve the search performance of a given model and outperform state-of-the-art machine learning based methods on combinatorial problems, even surpassing the well-known heuristic solver LKH3 on the capacitated vehicle routing problem. 
Finally, we show that (efficient) active search enables learned models to effectively solve instances that are much larger than those seen during training.", "keywords": "heuristic search;combinatorial optimization;learning to optimize;reinforcement learning;traveling salesperson problem;vehicle routing problem;job shop scheduling problem", "primary_area": "", "supplementary_material": "", "author": "Andr\u00e9 Hottung;Yeong-Dae Kwon;Kevin Tierney", "authorids": "~Andr\u00e9_Hottung1;~Yeong-Dae_Kwon1;~Kevin_Tierney1", "gender": ";M;M", "homepage": ";http://www.tierney.de;", "dblp": ";13/7407;277/6514", "google_scholar": "zzqATFsAAAAJ;https://scholar.google.de/citations?user=G-EGfLEAAAAJ;cEKyTVUAAAAJ", "orcid": "0000-0002-7251-9093;0000-0002-5931-4907;0000-0002-7823-6860", "linkedin": ";kevinbtierney/;", "or_profile": "~Andr\u00e9_Hottung1;~Kevin_Tierney1;~Yeong_Dae_Kwon1", "aff": "Bielefeld University;Bielefeld University;Samsung SDS", "aff_domain": "uni-bielefeld.de;uni-bielefeld.de;samsung.com", "position": "PhD student;Full Professor;Engineer", "bibtex": "@inproceedings{\nhottung2022efficient,\ntitle={Efficient Active Search for Combinatorial Optimization Problems},\nauthor={Andr{\\'e} Hottung and Yeong-Dae Kwon and Kevin Tierney},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nO5caZwFwYu}\n}", "github": "", "project": "", "reviewers": "4SkE;xF8a;cAC5;JA4x", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;4", "correctness": "4;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "151;56;120;62", "wc_summary_review": "31;53;115;5", "wc_main_review": "289;378;637;175", "wc_review": "471;487;872;242", "wc_reply_reviewers": "0;61;0;7", "wc_reply_authors": "271;305;780;325", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 97.25, 39.84579651606929 ], "wc_summary_review_avg": [ 51.0, 40.66939881532551 ], "wc_main_review_avg": [ 369.75, 170.24889867485194 ], "wc_review_avg": [ 518.0, 226.19792218320663 ], "wc_reply_reviewers_avg": [ 17.0, 25.563646062328434 ], "wc_reply_authors_avg": [ 420.25, 208.59694988182355 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13404693543769371304&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nO5caZwFwYu", "email": "uni-bielefeld.de;uni-bielefeld.de;samsung.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Bielefeld University;Samsung", "aff_unique_dep": ";Samsung SDS", "aff_unique_url": "https://www.uni-bielefeld.de/;https://www.samsungsds.com", "aff_unique_abbr": "Uni Bielefeld;Samsung SDS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Germany;South Korea" }, { "id": "nRCS3BfynGQ", "title": "Symmetry-driven graph neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Exploiting symmetries and invariance in data is a powerful, yet not fully exploited, way to achieve better generalisation with 
more efficiency. In this paper, we introduce two graph network architectures that are equivariant to several types of transformations affecting the node coordinates. First, we build equivariance to any transformation in the coordinate embeddings that preserves the distance between neighbouring nodes, allowing for equivariance to the Euclidean group. Then, we introduce angle attributes to build equivariance to any angle preserving transformation - thus, to the conformal group. Thanks to their equivariance properties, the proposed models can be vastly more data efficient with respect to classical graph architectures, intrinsically equipped with a better inductive bias and better at generalising. We demonstrate these capabilities on a synthetic dataset composed of $n$-dimensional geometric objects. Additionally, we provide examples of their limitations when (the right) symmetries are not present in the data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francesco Farina;Emma Slade", "authorids": "~Francesco_Farina1;~Emma_Slade1", "gender": "M;F", "homepage": ";", "dblp": ";", "google_scholar": "_7glimkAAAAJ;https://scholar.google.co.uk/citations?user=hrCtIbsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Francesco_Farina1;~Emma_Slade1", "aff": "GlaxoSmithKline;GlaxoSmithKline", "aff_domain": "gsk.com;gsk.com", "position": "Researcher;Researcher", "bibtex": "@misc{\nfarina2022symmetrydriven,\ntitle={Symmetry-driven graph neural networks},\nauthor={Francesco Farina and Emma Slade},\nyear={2022},\nurl={https://openreview.net/forum?id=nRCS3BfynGQ}\n}", "github": "", "project": "", "reviewers": "CQdG;ECjd;W7xU;yqP4", "site": "https://openreview.net/forum?id=nRCS3BfynGQ", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "75;57;50;90", "wc_summary_review": "30;24;26;29", "wc_main_review": "450;494;185;248", "wc_review": "555;575;261;367", "wc_reply_reviewers": "0;193;21;195", "wc_reply_authors": "805;1085;350;940", "reply_reviewers": "0;1;1;1", "reply_authors": "2;3;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 68.0, 15.636495771111889 ], "wc_summary_review_avg": [ 27.25, 2.384848003542364 ], "wc_main_review_avg": [ 344.25, 130.60699636696344 ], "wc_review_avg": [ 439.5, 131.16687844116746 ], "wc_reply_reviewers_avg": [ 102.25, 92.05263440010829 ], "wc_reply_authors_avg": [ 795.0, 275.3406980451673 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9847319278346618, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5134440813612605758&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "GlaxoSmithKline", "aff_unique_dep": "", "aff_unique_url": "https://www.gsk.com", "aff_unique_abbr": "GSK", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "FairCal: Fairness Calibration for Face Verification", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6347", "id": "nRj0NcmSuxb", "poster": "", "openreview": "https://openreview.net/forum?id=nRj0NcmSuxb", "slides": "https://iclr.cc/virtual/2022/poster/6347", "video": "https://iclr.cc/virtual/2022/poster/6347", "author_site": "Tiago Salvador, Stephanie Cairns, Vikram Voleti, Noah Marshall, Adam Oberman", "tldr": "", "abstract": "Despite being widely used, face recognition models suffer from bias: the probability of a false positive (incorrect face match) strongly depends on sensitive attributes such as the ethnicity of the face. As a result, these models can disproportionately and negatively impact minority groups, particularly when used by law enforcement. The majority of bias reduction methods have several drawbacks: they use an end-to-end retraining approach, may not be feasible due to privacy issues, and often reduce accuracy. An alternative approach is post-processing methods that build fairer decision classifiers using the features of pre-trained models, thus avoiding the cost of retraining. However, they still have drawbacks: they reduce accuracy (AGENDA, FTC), or require retuning for different false positive rates (FSN). In this work, we introduce the Fairness Calibration (FairCal) method, a post-training approach that simultaneously: (i) increases model accuracy (improving the state-of-the-art), (ii) produces fairly-calibrated probabilities, (iii) significantly reduces the gap in the false positive rates, (iv) does not require knowledge of the sensitive attribute, and (v) does not require retraining, training an additional model or retuning. We apply it to the task of Face Verification, and obtain state-of-the-art results with all the above advantages.", "keywords": "face verification;bias;fairness;clustering;calibration", "primary_area": "", "supplementary_material": "/attachment/fe191ae42c33df9cd472f8bbf4d08425aa9d684f.zip", "author": "Tiago Salvador;Stephanie Cairns;Vikram Voleti;Noah Marshall;Adam M Oberman", "authorids": "~Tiago_Salvador1;~Stephanie_Cairns2;~Vikram_Voleti1;~Noah_Marshall1;~Adam_M_Oberman1", "gender": "M;F;M;;M", "homepage": "https://www.math.mcgill.ca/tsalvador/index.html;https://www.linkedin.com/in/stephanie-cairns-a336a6180/;https://voletiv.github.io;;https://www.adamoberman.net/", "dblp": "160/1251;;243/6609;;31/8186", "google_scholar": "2_dImZEAAAAJ;;PPCRqZUAAAAJ;;https://scholar.google.ca/citations?user=LPAZlL8AAAAJ", "orcid": ";;;0000-0002-7119-8507;", "linkedin": ";;vikram-voleti-45372222;;adam-oberman-527348107/", "or_profile": "~Tiago_Salvador1;~Stephanie_Cairns2;~Vikram_Voleti1;~Noah_Marshall1;~Adam_M_Oberman1", "aff": "McGill University;;Unity Technologies;McGill University;McGill University", "aff_domain": "mcgill.ca;;unity.com;mcgill.ca;mcgill.ca", "position": "Postdoc;;Intern;MS student;Full Professor", "bibtex": "@inproceedings{\nsalvador2022faircal,\ntitle={FairCal: Fairness Calibration for Face Verification},\nauthor={Tiago Salvador and Stephanie Cairns and Vikram Voleti and Noah Marshall and Adam M Oberman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nRj0NcmSuxb}\n}", "github": "", "project": "", "reviewers": "aX6s;JNHP;xwE8", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "3;5;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;0;2", "wc_summary_paper": "69;99;71", "wc_summary_review": "45;75;115", "wc_main_review": "62;276;387", "wc_review": "176;450;573", "wc_reply_reviewers": "0;103;25", 
"wc_reply_authors": "633;2615;1184", "reply_reviewers": "0;1;1", "reply_authors": "1;4;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 79.66666666666667, 13.695092389449425 ], "wc_summary_review_avg": [ 78.33333333333333, 28.674417556808756 ], "wc_main_review_avg": [ 241.66666666666666, 134.883488817407 ], "wc_review_avg": [ 399.6666666666667, 165.93640013236666 ], "wc_reply_reviewers_avg": [ 42.666666666666664, 43.86595744107522 ], "wc_reply_authors_avg": [ 1477.3333333333333, 835.3100555415868 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5000000000000001, "corr_recommendation_correctness": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14127562251110451674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=nRj0NcmSuxb", "email": "mcgill.ca;;unity.com;mcgill.ca;mcgill.ca", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "McGill University;Unity Technologies", "aff_unique_dep": ";", "aff_unique_url": "https://www.mcgill.ca;https://unity.com", "aff_unique_abbr": "McGill;Unity", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Canada;United States" }, { "id": "nT0GS37Clr", "title": "FSL: Federated Supermask Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) allows multiple clients with (private) data to collaboratively train a common machine learning model without sharing their private training data. In-the-wild deployment of FL faces two major hurdles: robustness to poisoning attacks and communication efficiency. To address these concurrently, we propose Federated Supermask Learning (FSL). FSL server trains a global subnetwork within a randomly initialized neural network by aggregating local subnetworks of all collaborating clients. FSL clients share local subnetworks in the form of rankings of network edges; more useful edges have higher ranks. By sharing integer rankings, instead of float weights, FSL restricts the space available to craft effective poisoning updates, and by sharing subnetworks, FSL reduces the communication cost of training. We show theoretically and empirically that FSL is robust by design and also significantly communication efficient; all this without compromising clients' privacy. Our experiments demonstrate the superiority of FSL in real-world FL settings; in particular, (1) FSL achieves similar performances as state-of-the-art FedAvg with significantly lower communication costs: for CIFAR10, FSL achieves same performance as Federated Averaging while reducing communication cost by $\\sim35\\%$. 
(2) FSL is substantially more robust to poisoning attacks than state-of-the-art robust aggregation algorithms.", "keywords": "Collaborative learning;robustness;poisoning attacks;communication efficiency", "primary_area": "", "supplementary_material": "/attachment/885827cbdf15f8ef99685b82783bd395f904586b.zip", "author": "Hamid Mozaffari;Virat Shejwalkar;Amir Houmansadr", "authorids": "~Hamid_Mozaffari1;~Virat_Shejwalkar1;~Amir_Houmansadr1", "gender": "M;M;M", "homepage": "https://people.cs.umass.edu/~hamid/;https://people.cs.umass.edu/~vshejwalkar/;https://www.cs.umass.edu/~amir/", "dblp": ";243/3113.html;22/1797", "google_scholar": "mNE1AeAAAAAJ;M6GAEdUAAAAJ;https://scholar.google.com.tw/citations?user=cTTFHNwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hamid_Mozaffari1;~Virat_Shejwalkar1;~Amir_Houmansadr1", "aff": "Department of Computer Science, University of Massachusetts, Amherst;University of Massachusetts at Amherst;University of Massachusetts, Amherst", "aff_domain": "cs.umass.edu;cs.umass.edu;umass.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nmozaffari2022fsl,\ntitle={{FSL}: Federated Supermask Learning},\nauthor={Hamid Mozaffari and Virat Shejwalkar and Amir Houmansadr},\nyear={2022},\nurl={https://openreview.net/forum?id=nT0GS37Clr}\n}", "github": "", "project": "", "reviewers": "bj9t;JJjz;XrNo;q5LC", "site": "https://openreview.net/forum?id=nT0GS37Clr", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;4;5", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "82;50;47;63", "wc_summary_review": "35;43;36;93", "wc_main_review": "258;1126;391;286", "wc_review": "375;1219;474;442", "wc_reply_reviewers": "111;307;569;0", "wc_reply_authors": "1249;2636;3157;615", "reply_reviewers": "1;1;3;0", "reply_authors": "2;4;7;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 60.5, 13.793114224133722 ], "wc_summary_review_avg": [ 51.75, 24.014318645341575 ], "wc_main_review_avg": [ 515.25, 356.0852250515317 ], "wc_review_avg": [ 627.5, 343.36605831095187 ], "wc_reply_reviewers_avg": [ 246.75, 216.09532040282593 ], "wc_reply_authors_avg": [ 1914.25, 1024.201975930529 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 3.5, 2.29128784747792 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18422819259599424648&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Massachusetts Amherst", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.umass.edu", "aff_unique_abbr": "UMass Amherst", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "nUoI0DKg_Ti", "title": "Learning Sampling Policy for Faster Derivative Free Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Zeroth-order (ZO, also known as derivative-free) methods, which estimate a noisy gradient based on the finite difference with two function evaluations, have 
attracted much attention recently because of their broad applications in the machine learning community. The function evaluations are normally requested at a point plus a random perturbation drawn from a (standard Gaussian) distribution. The accuracy of the noisy gradient depends heavily on how many perturbations are randomly sampled from the distribution, which intrinsically conflicts with the efficiency of ZO algorithms. Although much effort has been made to improve the efficiency of ZO algorithms, we explore a new direction, i.e., learning an optimal sampling policy based on reinforcement learning (RL) to generate perturbations instead of using a totally random strategy, which makes it possible to calculate a ZO gradient with only 2 function evaluations. Specifically, we first formulate the problem of learning a sampling policy as a Markov decision process. Then, we propose our ZO-RL algorithm, i.e., using deep deterministic policy gradient, an actor-critic RL algorithm, to learn a sampling policy which can guide the generation of perturbed vectors so that ZO gradients are as accurate as possible. Since our method only affects the generation of perturbed vectors, and is thus orthogonal to existing efforts to accelerate ZO methods such as learning a data-driven Gaussian distribution, we show how to combine our method with other acceleration techniques to further improve the efficiency of ZO algorithms. Experimental results with different ZO estimators show that our ZO-RL algorithm can effectively reduce the query complexity of ZO algorithms, especially in the later stage of the optimization process, and converge faster than existing ZO algorithms.", "keywords": "Derivative free optimization;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Zhou Zhai;Xiang Li;Bin Gu;Heng Huang", "authorids": "~Zhou_Zhai2;lxiang2@uwo.ca;~Bin_Gu1;~Heng_Huang1", "gender": ";;M;M", "homepage": ";;https://mbzuai.ac.ae/study/faculty/bin-gu/;https://www.cs.umd.edu/~heng/", "dblp": ";;29/1758-1;03/281", "google_scholar": ";;Vo8OgCgAAAAJ;4OqLaDwAAAAJ", "orcid": ";;0000-0001-6049-1815;", "linkedin": ";;;", "or_profile": "~Zhou_Zhai2;lxiang2@uwo.ca;~Bin_Gu1;~Heng_Huang1", "aff": ";;Mohamed bin Zayed University of Artificial Intelligence;University of Pittsburgh", "aff_domain": ";;mbzuai.ac.ae;pitt.edu", "position": ";;Assistant Professor;Full Professor", "bibtex": "@misc{\nzhai2022learning,\ntitle={Learning Sampling Policy for Faster Derivative Free Optimization},\nauthor={Zhou Zhai and Xiang Li and Bin Gu and Heng Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=nUoI0DKg_Ti}\n}", "github": "", "project": "", "reviewers": "U8zs;vZ7k;xmBS;thB2", "site": "https://openreview.net/forum?id=nUoI0DKg_Ti", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;3", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "75;55;32;89", "wc_summary_review": "33;22;14;26", "wc_main_review": "949;326;391;132", "wc_review": "1057;403;437;247", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 62.75, 21.47527648250425 ], "wc_summary_review_avg": [ 23.75, 6.869315832017043 ], "wc_main_review_avg": [ 449.5,
303.71903134311486 ], "wc_review_avg": [ 536.0, 309.2135184625666 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=162751371116572919&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;University of Pittsburgh", "aff_unique_dep": ";", "aff_unique_url": "https://mbzuai.ac.ae;https://www.pitt.edu", "aff_unique_abbr": "MBZUAI;Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Arab Emirates;United States" }, { "id": "nWFFfnnz-mF", "title": "Effects of Conservatism on Offline Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Conservatism, the act of underestimating an agent's expected value estimates, has demonstrated profound success in model-free, model-based, multi-task, safe and other realms of offline Reinforcement Learning (RL). Recent work, on the other hand, has noted that conservatism often hinders learning of behaviors. To that end, this paper asks the question: how does conservatism affect offline learning? The proposed answer studies conservatism in light of value function optimization, approximate objectives that upper bound underestimations, and behavior cloning as an auxiliary regularization objective. Conservative agents implicitly steer estimates away from the true value function, resulting in optimization objectives with high condition numbers. Mitigating these issues requires an upper bounding objective. These approximate upper bounds, however, impose strong geometrical assumptions on the dataset design, assumptions that are only rarely fulfilled. Driven by these theoretical observations, providing an auxiliary behavior cloning objective as variational regularization of the estimates results in accurate value estimation, well-conditioned search spaces and expressive parameterizations. 
In an empirical study of discrete and continuous control tasks, we validate our theoretical insights and demonstrate the practical effects of learning underestimated value functions.", "keywords": "Conservatism;Offline Reinforcement Learning;Optimization.", "primary_area": "", "supplementary_material": "", "author": "Karush Suri;Florian Shkurti", "authorids": "~Karush_Suri1;~Florian_Shkurti1", "gender": "M;M", "homepage": "https://karush17.github.io/;http://www.cs.toronto.edu/~florian/", "dblp": "252/3260;21/10333", "google_scholar": "https://scholar.google.co.in/citations?user=ZFCHp9gAAAAJ;https://scholar.google.ca/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Karush_Suri1;~Florian_Shkurti1", "aff": "Google;University of Toronto", "aff_domain": "google.com;cs.toronto.edu", "position": "Researcher;Assistant Professor", "bibtex": "@misc{\nsuri2022effects,\ntitle={Effects of Conservatism on Offline Learning},\nauthor={Karush Suri and Florian Shkurti},\nyear={2022},\nurl={https://openreview.net/forum?id=nWFFfnnz-mF}\n}", "github": "", "project": "", "reviewers": "uHLR;hV1s;bN9a;rMqA;goBM", "site": "https://openreview.net/forum?id=nWFFfnnz-mF", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "3;3;3;2;4", "correctness": "2;2;2;3;3", "technical_novelty": "2;2;3;3;2", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "29;107;76;89;78", "wc_summary_review": "39;5;47;79;35", "wc_main_review": "202;1083;532;705;333", "wc_review": "270;1195;655;873;446", "wc_reply_reviewers": "337;0;254;399;384", "wc_reply_authors": "1088;2800;2411;2330;2216", "reply_reviewers": "1;0;2;2;1", "reply_authors": "3;5;6;4;5", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 75.8, 25.85652722234755 ], "wc_summary_review_avg": [ 41.0, 23.731835158706122 ], "wc_main_review_avg": [ 571.0, 308.03441366185046 ], "wc_review_avg": [ 687.8, 324.23781395759505 ], "wc_reply_reviewers_avg": [ 274.8, 146.40136611384472 ], "wc_reply_authors_avg": [ 2169.0, 575.0506064686829 ], "reply_reviewers_avg": [ 1.2, 0.7483314773547883 ], "reply_authors_avg": [ 4.6, 1.0198039027185568 ], "replies_avg": [ 36, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:37IIgTpl2bsJ:scholar.google.com/&scioq=Effects+of+Conservatism+on+Offline+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Google;University of Toronto", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.utoronto.ca", "aff_unique_abbr": "Google;U of T", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Canada" }, { "id": "nWlk4jwupZ", "title": "ScheduleNet: Learn to solve multi-agent scheduling problems with reinforcement learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose ScheduleNet, an RL-based decentralized constructive scheduler for coordinating multiple agents to finish tasks with minimum completion time. 
We formulate multi-agent scheduling problems (mSPs) as an event-based Markov decision process (MDP) with an episodic reward (e.g., makespan) and derive a decentralized decision-making policy using reinforcement learning. The decision-making procedure of ScheduleNet includes: (1) representing the state of a scheduling problem with the agent-task graph, (2) extracting node embeddings for agent and task nodes by employing type-aware graph attention (TGA), and (3) computing the assignment probability with the computed node embeddings. We validate the effectiveness of ScheduleNet on two types of mSPs: the multiple traveling salesmen problem (mTSP) and the job-shop scheduling problem (JSP). We empirically show that ScheduleNet can outperform other heuristic approaches and existing deep RL approaches, particularly validating its exceptional effectiveness in solving large and practical problems. Furthermore, we demonstrate that ScheduleNet can effectively solve online vehicle routing problems where new target customers appear dynamically during the course of scheduling.", "keywords": "scheduling problems;combinatorial optimization;reinforcement learning;graph;graph neural network", "primary_area": "", "supplementary_material": "", "author": "Junyoung Park;Sanzhar Bakhtiyarov;Jinkyoo Park", "authorids": "~Junyoung_Park1;~Sanzhar_Bakhtiyarov1;~Jinkyoo_Park1", "gender": ";M;M", "homepage": ";https://github.com/bakhsanzh;http://silab.kaist.ac.kr/", "dblp": ";;156/7535", "google_scholar": ";;sH2a0nkAAAAJ", "orcid": ";;0000-0003-2620-1479", "linkedin": ";;", "or_profile": "~Junyoung_Park1;~Sanzhar_Bakhtiyarov1;~Jinkyoo_Park1", "aff": ";;Korea Advanced Institute of Science & Technology", "aff_domain": ";;kaist.ac.kr", "position": ";;Associate Professor", "bibtex": "@misc{\npark2022schedulenet,\ntitle={ScheduleNet: Learn to solve multi-agent scheduling problems with reinforcement learning},\nauthor={Junyoung Park and Sanzhar Bakhtiyarov and Jinkyoo Park},\nyear={2022},\nurl={https://openreview.net/forum?id=nWlk4jwupZ}\n}", "github": "", "project": "", "reviewers": "6uYb;7p6U;XP6V;VLpo", "site": "https://openreview.net/forum?id=nWlk4jwupZ", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "72;52;142;100", "wc_summary_review": "32;43;96;82", "wc_main_review": "301;506;302;836", "wc_review": "405;601;540;1018", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1440;2152;1297;1522", "reply_reviewers": "0;0;0;0", "reply_authors": "3;5;3;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 91.5, 33.77499074759311 ], "wc_summary_review_avg": [ 63.25, 26.508253431714433 ], "wc_main_review_avg": [ 486.25, 218.50672186456873 ], "wc_review_avg": [ 641.0, 228.924660095849 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1602.75, 327.1722596737077 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.5, 0.8660254037844386 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13623825575112623503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced
Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "nWprF5r2spe", "title": "ON THE GENERALIZATION OF WASSERSTEIN ROBUST FEDERATED LEARNING", "track": "main", "status": "Reject", "tldr": "", "abstract": "In Federated learning (FL), participating clients typically possess non-i.i.d. data, posing a significant challenge to generalization to unseen distributions. To address this, we propose a Wasserstein distributionally robust optimization scheme called WAFL. Leveraging its duality, we frame WAFL as an empirical surrogate risk minimization problem, and solve it using a novel local SGD-based algorithm with convergence guarantees. We show that the robustness of WAFL is more general than related approaches, and the generalization bound is robust to all adversarial distributions inside the Wasserstein ball (ambiguity set). Since the center location and radius of the Wasserstein ball can be suitably modified, WAFL shows its applicability not only in robustness but also in domain adaptation. Through empirical evaluation, we demonstrate that WAFL generalizes better than the vanilla FedAvg in non-i.i.d. settings, and is more robust than other related methods in distribution shift settings. Further, using benchmark datasets we show that WAFL is capable of generalizing to unseen target domains.", "keywords": "Federated Learning;Robust Optimization;Adversarial Training", "primary_area": "", "supplementary_material": "/attachment/3a7b3f8a0d44bb1eb77e30f2150b4820198ec37a.zip", "author": "Long Tan Le;Josh Nguyen;Canh T. Dinh;Nguyen Hoang Tran", "authorids": "~Long_Tan_Le1;~Josh_Nguyen1;~Canh_T._Dinh1;~Nguyen_Hoang_Tran1", "gender": "M;M;M;M", "homepage": "https://longtanle.github.io/;https://nguyenhoangtran.github.io/;https://thecanhdinh.github.io/;https://joshnguyen.net", "dblp": "322/0383;03/1312.html;267/5107.html;45/3372", "google_scholar": "https://scholar.google.com.au/citations?user=CZZTrOoAAAAJ;3W8RLcwAAAAJ;https://scholar.google.com.au/citations?user=L3bMES0AAAAJ;mmy1v8oAAAAJ", "orcid": "0000-0003-3284-1990;0000-0001-7323-9213;0000-0002-0205-7743;0000-0002-1105-005X", "linkedin": ";;;joshnguyen99/", "or_profile": "~Long_Tan_Le1;~Nguyen_Hoang_Tran1;~Canh_The_Dinh1;~Tuan_Dung_Nguyen1", "aff": "University of Sydney;University of Sydney;University of Sydney;Australian National University", "aff_domain": "sydney.edu.au;sydney.edu.au;sydney.edu.au;anu.edu.au", "position": "PhD student;Associate Professor;PhD student;MPhil student", "bibtex": "@misc{\nle2022on,\ntitle={{ON} {THE} {GENERALIZATION} {OF} {WASSERSTEIN} {ROBUST} {FEDERATED} {LEARNING}},\nauthor={Long Tan Le and Josh Nguyen and Canh T. 
Dinh and Nguyen Hoang Tran},\nyear={2022},\nurl={https://openreview.net/forum?id=nWprF5r2spe}\n}", "github": "", "project": "", "reviewers": "g9vL;1hSQ;M1m9", "site": "https://openreview.net/forum?id=nWprF5r2spe", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;3", "correctness": "3;3;4", "technical_novelty": "1;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "82;127;59", "wc_summary_review": "47;45;13", "wc_main_review": "294;255;157", "wc_review": "423;427;229", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1091;752;611", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 89.33333333333333, 28.241026106633512 ], "wc_summary_review_avg": [ 35.0, 15.57776192739723 ], "wc_main_review_avg": [ 235.33333333333334, 57.63293811316195 ], "wc_review_avg": [ 359.6666666666667, 92.4097156989218 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 818.0, 201.43981731524678 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17686714460376155192&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Sydney;Australian National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sydney.edu.au;https://www.anu.edu.au", "aff_unique_abbr": "USYD;ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "title": "Granger causal inference on DAGs identifies genomic loci regulating transcription", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6680", "id": "nZOUYEN6Wvy", "poster": "", "openreview": "https://openreview.net/forum?id=nZOUYEN6Wvy", "slides": "https://iclr.cc/virtual/2022/poster/6680", "video": "https://iclr.cc/virtual/2022/poster/6680", "author_site": "Alexander Wu, Rohit Singh, Bonnie Berger", "tldr": "", "abstract": "When a dynamical system can be modeled as a sequence of observations, Granger causality is a powerful approach for detecting predictive interactions between its variables. However, traditional Granger causal inference has limited utility in domains where the dynamics need to be represented as directed acyclic graphs (DAGs) rather than as a linear sequence, such as with cell differentiation trajectories. Here, we present GrID-Net, a framework based on graph neural networks with lagged message passing for Granger causal inference on DAG-structured systems. Our motivating application is the analysis of single-cell multimodal data to identify genomic loci that mediate the regulation of specific genes. To our knowledge, GrID-Net is the first single-cell analysis tool that accounts for the temporal lag between a genomic locus becoming accessible and its downstream effect on a target gene's expression. 
We applied GrID-Net on multimodal single-cell assays that profile chromatin accessibility (ATAC-seq) and gene expression (RNA-seq) in the same cell and show that it dramatically outperforms existing methods for inferring regulatory locus-gene links, achieving up to 71% greater agreement with independent population genetics-based estimates. By extending Granger causality to DAG-structured dynamical systems, our work unlocks new domains for causal analyses and, more specifically, opens a path towards elucidating gene regulatory interactions relevant to cellular differentiation and complex human diseases at unprecedented scale and resolution.", "keywords": "Granger causality;causal inference;graph neural networks;gene regulation;single-cell genomics;chromatin accessibility;directed acyclic graphs;single-cell multimodal", "primary_area": "", "supplementary_material": "/attachment/59e32c9e174ddc61794670360dd59ebbea7cbb9c.zip", "author": "Alexander P Wu;Rohit Singh;Bonnie Berger", "authorids": "~Alexander_P_Wu1;~Rohit_Singh1;~Bonnie_Berger1", "gender": "M;;F", "homepage": ";http://people.csail.mit.edu/rsingh/;https://people.csail.mit.edu/bab/", "dblp": ";;b/BonnieBerger", "google_scholar": ";N65pn8kAAAAJ;bYjKaowAAAAJ", "orcid": ";my-orcid?orcid=0000-0002-4084-7340;", "linkedin": "alexander-wu-57122678/;rohit-singh-0b509b2;", "or_profile": "~Alexander_P_Wu1;~Rohit_Singh1;~Bonnie_Berger1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nwu2022granger,\ntitle={Granger causal inference on {DAG}s identifies genomic loci regulating transcription},\nauthor={Alexander P Wu and Rohit Singh and Bonnie Berger},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nZOUYEN6Wvy}\n}", "github": "", "project": "", "reviewers": "A4M6;dibr;QYmT;xWaf", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;3;3;4", "correctness": "4;3;4;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;3;2;0", "wc_summary_paper": "76;61;164;143", "wc_summary_review": "233;70;111;38", "wc_main_review": "348;373;1017;189", "wc_review": "657;504;1292;370", "wc_reply_reviewers": "0;102;84;0", "wc_reply_authors": "718;745;2517;683", "reply_reviewers": "0;1;1;0", "reply_authors": "2;3;5;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 111.0, 43.46837931186301 ], "wc_summary_review_avg": [ 113.0, 73.9560680404252 ], "wc_main_review_avg": [ 481.75, 316.98215643786637 ], "wc_review_avg": [ 705.75, 353.3754200563474 ], "wc_reply_reviewers_avg": [ 46.5, 46.93346354148605 ], "wc_reply_authors_avg": [ 1165.75, 780.4541546433077 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5680288035364685822&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=nZOUYEN6Wvy", "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of 
Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "nZXmDrV5OA2", "title": "Assumption-Free Survival Analysis Under Local Smoothness Prior", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Survival analysis appears in various fields such as medicine, economics, engineering, and business. Due to the difficulty of integration that naturally arises in continuous-time modeling, previous works either made a strong assumption or discretized the time domain, thus limiting their practical usage. In this paper, we propose assumption-free survival analysis, which models the continuous-time survival function without any assumption. Our model obtains an assumption-free survival function by integrating an assumption-free hazard function using Neural Ordinary Differential Equations. Inspired by the smoothness prior from the semi-supervised learning literature, we further propose a regularizer that encourages the survival function to be locally smooth by minimizing the variation of the survival function in the covariate space. We find that this regularizer increases the predictive power of the survival function as it propagates high-quality local information to the neighborhoods of data points. Experimental results on three public benchmarks show that our approach has better predictive power and is well-calibrated compared to strong baselines. Moreover, the proposed regularizer is superior to global regularizers and insensitive to hyperparameters.", "keywords": "Survival analysis;time-to-event modeling;Neural ODE;regularization", "primary_area": "", "supplementary_material": "", "author": "Seungjae Jung;Min-Kyu Kim;Juho Lee;Young-Jin Park;Nahyeon Park;Kyung-Min Kim", "authorids": "~Seungjae_Jung1;min.kyu.kim@navercorp.com;3juho.lee@navercorp.com;~Young-Jin_Park1;nahyeon.ryu@navercorp.com;~Kyung-Min_Kim1", "gender": "M;;;M;;M", "homepage": ";;;https://young-j-park.github.io;;", "dblp": ";;;31/2521;;85/8572", "google_scholar": "wP9lL-sAAAAJ;;;ylx5pYAAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;0000-0003-2426-2198", "linkedin": "seungjae-jung-3a2b74141/;;;;;", "or_profile": "~Seungjae_Jung1;min.kyu.kim@navercorp.com;3juho.lee@navercorp.com;~Young-Jin_Park1;nahyeon.ryu@navercorp.com;~Kyung-Min_Kim1", "aff": "NAVER;;;NAVER;;NAVER", "aff_domain": "navercorp.com;;;navercorp.com;;navercorp.com", "position": "Researcher;;;Researcher;;Leader", "bibtex": "@misc{\njung2022assumptionfree,\ntitle={Assumption-Free Survival Analysis Under Local Smoothness Prior},\nauthor={Seungjae Jung and Min-Kyu Kim and Juho Lee and Young-Jin Park and Nahyeon Park and Kyung-Min Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=nZXmDrV5OA2}\n}", "github": "", "project": "", "reviewers": "vr58;hfuy;9AD5;TnKU", "site": "https://openreview.net/forum?id=nZXmDrV5OA2", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;3;5;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "89;71;67;42", "wc_summary_review": "115;115;32;51", "wc_main_review": "767;602;738;348", "wc_review": "971;788;837;441", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193
], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.25, 16.768646337734005 ], "wc_summary_review_avg": [ 78.25, 37.35890121510535 ], "wc_main_review_avg": [ 613.75, 165.59344038940674 ], "wc_review_avg": [ 759.25, 195.5714383543773 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4264014327112209, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m162oWfJDyYJ:scholar.google.com/&scioq=Assumption-Free+Survival+Analysis+Under+Local+Smoothness+Prior&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "NAVER Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.naver.com", "aff_unique_abbr": "NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "LoRA: Low-Rank Adaptation of Large Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6319", "id": "nZeVKeeFYf9", "poster": "", "openreview": "https://openreview.net/forum?id=nZeVKeeFYf9", "slides": "https://iclr.cc/virtual/2022/poster/6319", "video": "https://iclr.cc/virtual/2022/poster/6319", "author_site": "Edward Hu, yelong shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen", "tldr": "", "abstract": "An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains. As we pre-train larger models, full fine-tuning, which retrains all model parameters, becomes less feasible.\nUsing GPT-3 175B as an example -- deploying independent instances of fine-tuned models, each with 175B parameters, is prohibitively expensive. We propose Low-Rank Adaptation, or LoRA, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared to GPT-3 175B fine-tuned with Adam, LoRA can reduce the number of trainable parameters by a factor of 10,000 and the GPU memory requirement by a factor of 3. LoRA performs on-par or better than fine-tuning in model quality on RoBERTa, DeBERTa, GPT-2, and GPT-3, despite having fewer trainable parameters, a higher training throughput, and, unlike adapters, no additional inference latency. We also provide an empirical investigation into rank-deficiency in language model adaptation, which sheds light on the efficacy of LoRA. 
We release a package that facilitates the integration of LoRA with PyTorch models and provide our implementations and model checkpoints for RoBERTa, DeBERTa, and GPT-2 at https://github.com/microsoft/LoRA.", "keywords": "Transfer learning;Adaptation;Transformer;Fine-tuning;Low-rank;RoBERTa;DeBERTa;GPT-2;GPT-3", "primary_area": "", "supplementary_material": "/attachment/9f1ca59ff81f3339a6099be6c48a9e33db065654.zip", "author": "Edward J Hu;yelong shen;Phillip Wallis;Zeyuan Allen-Zhu;Yuanzhi Li;Shean Wang;Lu Wang;Weizhu Chen", "authorids": "~Edward_J_Hu1;~yelong_shen1;~Phillip_Wallis1;~Zeyuan_Allen-Zhu1;~Yuanzhi_Li1;~Shean_Wang1;luw@microsoft.com;~Weizhu_Chen1", "gender": "M;;M;;M;M;;M", "homepage": "https://edwardjhu.com;;;;;https://www.linkedin.com/in/shean-wang-18a20841/;;https://www.microsoft.com/en-us/research/people/wzchen/", "dblp": "295/8436;;241/6270.html;;73/3628;;;79/2536", "google_scholar": "2eADy_8AAAAJ;;8IqHSXYAAAAJ;;;;;LG_E-4EAAAAJ", "orcid": ";;0000-0002-5765-0528;;;;;", "linkedin": "edwardjhu/;;phillip-wallis-0aa11761/;;;;;", "or_profile": "~Edward_J_Hu1;~yelong_shen1;~Phillip_Wallis1;~Zeyuan_Allen-Zhu1;~Yuanzhi_Li1;~Shean_Wang1;luw@microsoft.com;~Weizhu_Chen1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;;Microsoft;;Carnegie Mellon University;;;Microsoft GenAI", "aff_domain": "mila.umontreal.ca;;microsoft.com;;andrew.cmu.edu;;;microsoft.com", "position": "PhD student;;Senior Applied Scientist;;Assistant Professor;;;Vice President", "bibtex": "@inproceedings{\nhu2022lora,\ntitle={Lo{RA}: Low-Rank Adaptation of Large Language Models},\nauthor={Edward J Hu and yelong shen and Phillip Wallis and Zeyuan Allen-Zhu and Yuanzhi Li and Shean Wang and Lu Wang and Weizhu Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nZeVKeeFYf9}\n}", "github": "", "project": "", "reviewers": "TD94;Vqdv;jHjA;i5pN", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;4;5;4", "correctness": "3;2;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "65;67;266;142", "wc_summary_review": "175;3;40;46", "wc_main_review": "73;278;316;248", "wc_review": "313;348;622;436", "wc_reply_reviewers": "9;592;74;0", "wc_reply_authors": "368;1826;304;320", "reply_reviewers": "1;3;1;0", "reply_authors": "1;3;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 135.0, 81.75267579718721 ], "wc_summary_review_avg": [ 66.0, 65.04998078400946 ], "wc_main_review_avg": [ 228.75, 93.0950455180081 ], "wc_review_avg": [ 429.75, 119.70040726747759 ], "wc_reply_reviewers_avg": [ 168.75, 246.02578625014087 ], "wc_reply_authors_avg": [ 704.5, 647.9265004612791 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 14155, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9664431088510823229&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "pdf": "https://openreview.net/pdf?id=nZeVKeeFYf9", "email": "mila.umontreal.ca;;microsoft.com;;andrew.cmu.edu;;;microsoft.com", "author_num": 8, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Montreal;Microsoft;Carnegie Mellon University", "aff_unique_dep": "Montreal Institute for Learning Algorithms;Microsoft Corporation;", "aff_unique_url": "https://www.mila.quebec;https://www.microsoft.com;https://www.cmu.edu", "aff_unique_abbr": "MILA;Microsoft;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;United States" }, { "id": "nZon4NT0WSw", "title": "TsmoBN: Interventional Generalization for Unseen Clients in Federated Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Generalizing federated learning (FL) models to unseen clients with non-iid data is a crucial topic that remains unsolved. In this work, we propose to tackle this problem from a novel causal perspective. Specifically, we form a training structural causal model (SCM) to explain the challenges of model generalization in a distributed learning paradigm. Based on this, we present a simple yet effective method using test-specific and momentum-tracked batch normalization (TsmoBN) to generalize FL models to testing clients. We give a causal analysis by formulating another testing SCM and demonstrate that the key factor in TsmoBN is the test-specific statistics (i.e., mean and variance) of features. Such statistics can be seen as a surrogate variable for causal intervention. In addition, by considering generalization bounds in FL, we show that our TsmoBN method can reduce divergence between training and testing feature distributions, which achieves a lower generalization gap than standard model testing. Our extensive experimental evaluations demonstrate significant improvements for unseen client generalization on three datasets with various types of distribution shifts and numbers of clients. It is worth noting that our proposed approach can be flexibly applied to different state-of-the-art federated learning algorithms and is orthogonal to existing domain generalization methods.
", "keywords": "Federated Learning;Unseen Client Generalization;Structural Causal Model", "primary_area": "", "supplementary_material": "", "author": "Meirui Jiang;Xiaofei Zhang;Michael Kamp;Xiaoxiao Li;Qi Dou", "authorids": "~Meirui_Jiang2;~Xiaofei_Zhang1;~Michael_Kamp1;~Xiaoxiao_Li1;~Qi_Dou2", "gender": "F;M;Unspecified;F;M", "homepage": ";http://michaelkamp.org;https://xxlya.github.io/;https://www.cse.cuhk.edu.hk/~qdou;https://meiruijiang.github.io/MeiruiJiang/", "dblp": ";133/7744;71/8042;165/7846;285/5480", "google_scholar": ";https://scholar.google.de/citations?user=8R5jbvQAAAAJ;sdENOQ4AAAAJ;https://scholar.google.com.hk/citations?user=iHh7IJQAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": " 0000-0002-0551-855X;0000-0001-6231-0694;;0000-0002-3416-9950;0000-0003-4228-8420", "linkedin": ";michael-kamp-29096a95/;;;", "or_profile": "~Xiaofei_Zhang1;~Michael_Kamp1;~Xiaoxiao_Li1;~Qi_Dou2;~Meirui_JIANG1", "aff": "Zhongnan University of Economics and Law;Institute for AI in Medicine IKIM;University of British Columbia;The Chinese University of Hong Kong;Department of Computer Science and Engineering, The Chinese University of Hong Kong", "aff_domain": "zuel.edu.cn;uk-essen.de;ece.ubc.ca;cuhk.edu.hk;cse.cuhk.edu.hk", "position": "Assistant Professor;Research Group Leader;Assistant Professor;Assistant Professor;PhD student", "bibtex": "@misc{\njiang2022tsmobn,\ntitle={Tsmo{BN}: Interventional Generalization for Unseen Clients in Federated Learning},\nauthor={Meirui Jiang and Xiaofei Zhang and Michael Kamp and Xiaoxiao Li and Qi Dou},\nyear={2022},\nurl={https://openreview.net/forum?id=nZon4NT0WSw}\n}", "github": "", "project": "", "reviewers": "Q35V;F5AG;b4Kq", "site": "https://openreview.net/forum?id=nZon4NT0WSw", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;4;4", "correctness": "2;2;2", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "92;65;53", "wc_summary_review": "44;62;67", "wc_main_review": "677;329;347", "wc_review": "813;456;467", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 70.0, 16.30950643030009 ], "wc_summary_review_avg": [ 57.666666666666664, 9.877021593352703 ], "wc_main_review_avg": [ 451.0, 159.97499804656977 ], "wc_review_avg": [ 578.6666666666666, 165.7595313163687 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5052440089172779668&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "Zhongnan University of Economics and Law;Institute for AI in Medicine;University of British Columbia;Chinese University of Hong Kong", "aff_unique_dep": ";AI in Medicine;;", "aff_unique_url": "http://www.zuel.edu.cn/;;https://www.ubc.ca;https://www.cuhk.edu.hk", "aff_unique_abbr": "ZUEL;IKIM;UBC;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;2;0;0",
"aff_country_unique": "China;Unknown;Canada" }, { "id": "naoQDOYsHnS", "title": "Learning Pseudometric-based Action Representations for Offline Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning is a promising approach for practical applications since it does not require interactions with real-world environments. However, existing offline RL methods only work well in environments with continuous or small discrete action spaces. In environments with large and discrete action spaces, such as recommender systems and dialogue systems, the performance of existing methods decreases drastically because they suffer from inaccurate value estimation for a large proportion of out-of-distribution (o.o.d.) actions. While recent works have demonstrated that online RL benefits from incorporating semantic information in action representations, unfortunately, they fail to learn reasonable relative distances between action representations, which is key for offline RL to reduce the influence of o.o.d. actions. This paper proposes an action representation learning framework for offline RL based on a pseudometric, which measures both the behavioral relation and the data-distributional relation between actions. We provide theoretical analysis of the continuity and the bounds of the expected Q-values using the learned action representations. Experimental results show that our methods significantly improve the performance of two typical offline RL methods in environments with large and discrete action spaces. ", "keywords": "offline reinforcement learning;representation learning;metric learning", "primary_area": "", "supplementary_material": "", "author": "Pengjie Gu;Mengchen Zhao;Chen Chen;Dong Li;Jianye Hao;Bo An", "authorids": "~Pengjie_Gu1;~Mengchen_Zhao1;~Chen_Chen3;~Dong_Li10;haojianye@huawei.com;~Bo_An2", "gender": "M;M;F;M;;M", "homepage": ";https://batmanzzmc.github.io/;;;;https://personal.ntu.edu.sg/boan/", "dblp": "226/1222;178/8719;;47/4826-16;;42/6178-1.html", "google_scholar": ";nLgORGMAAAAJ;l8_g4oAAAAAJ;;;PEEpuNwAAAAJ", "orcid": ";;;;;0000-0002-7064-7438", "linkedin": ";;;;;", "or_profile": "~Pengjie_Gu1;~Mengchen_Zhao1;~Chen_Chen3;~Dong_Li10;haojianye@huawei.com;~Bo_An2", "aff": "Nanyang Technological University;Huawei Noah's Ark Lab;Huawei Technologies Ltd.;Huawei Technologies Ltd.;;Nanyang Technological University", "aff_domain": "ntu.edu.sg;huawei.com;huawei.com;huawei.com;;ntu.edu.sg", "position": "PhD student;Research Scientist;Researcher;Principal Researcher;;Full Professor", "bibtex": "@misc{\ngu2022learning,\ntitle={Learning Pseudometric-based Action Representations for Offline Reinforcement Learning},\nauthor={Pengjie Gu and Mengchen Zhao and Chen Chen and Dong Li and Jianye Hao and Bo An},\nyear={2022},\nurl={https://openreview.net/forum?id=naoQDOYsHnS}\n}", "github": "", "project": "", "reviewers": "pJJJ;bL2q;hprr;gR99", "site": "https://openreview.net/forum?id=naoQDOYsHnS", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;2;3", "correctness": "3;3;4;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "96;153;33;169", "wc_summary_review": "19;61;38;133", "wc_main_review": "195;230;315;650", "wc_review": "310;444;386;952", "wc_reply_reviewers": "319;0;56;0", "wc_reply_authors": "928;527;746;578", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5,
0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 112.75, 53.44331108754397 ], "wc_summary_review_avg": [ 62.75, 43.19939235683761 ], "wc_main_review_avg": [ 347.5, 180.0173602739469 ], "wc_review_avg": [ 523.0, 252.20031720836514 ], "wc_reply_reviewers_avg": [ 93.75, 132.04237009384525 ], "wc_reply_authors_avg": [ 694.75, 157.16452366867023 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 1.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7326857603943144470&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Nanyang Technological University;Huawei", "aff_unique_dep": ";Noah's Ark Lab", "aff_unique_url": "https://www.ntu.edu.sg;https://www.huawei.com", "aff_unique_abbr": "NTU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "Singapore;China" }, { "title": "Optimization inspired Multi-Branch Equilibrium Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6461", "id": "nbC8iTTXIrk", "poster": "", "openreview": "https://openreview.net/forum?id=nbC8iTTXIrk", "slides": "https://iclr.cc/virtual/2022/poster/6461", "video": "https://iclr.cc/virtual/2022/poster/6461", "author_site": "Mingjie Li, Yisen Wang, Xingyu Xie, Zhouchen Lin", "tldr": "", "abstract": "Prior works have shown strong connections between some implicit models and optimization problems. However, explorations of such relationships remain limited. Most works pay attention to some common mathematical properties, such as sparsity. In this work, we propose a new type of implicit model inspired by the design of the systems' hidden objective functions, called the Multi-branch Optimization induced Equilibrium networks (MOptEqs). The model architecture is designed based on modelling the hidden objective function for the multi-resolution recognition task. Furthermore, we also propose a new strategy inspired by our understanding of the hidden objective function. In this manner, the proposed model can better utilize the hierarchical patterns for recognition tasks and retain the ability to interpret the whole structure as seeking the minima of the problem's goal. Compared with state-of-the-art models, our MOptEqs not only enjoy better explainability but are also superior to MDEQ, with lower parameter consumption and better performance on practical tasks.
Furthermore, we also conduct various experiments to demonstrate the effectiveness of our new methods and explore the applicability of the model's hidden objective function.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/5755acb9850c2c7760083a2d75aa69346cde740d.zip", "author": "Mingjie Li;Yisen Wang;Xingyu Xie;Zhouchen Lin", "authorids": "~Mingjie_Li1;~Yisen_Wang1;~Xingyu_Xie1;~Zhouchen_Lin1", "gender": "M;M;M;M", "homepage": "https://mingjieli0111.github.io/;https://yisenwang.github.io/;;https://zhouchenlin.github.io", "dblp": ";172/1346-1;174/9633;l/ZhouchenLin", "google_scholar": ";uMWPDboAAAAJ;BpFCmZMAAAAJ;https://scholar.google.com.tw/citations?user=TanjFwoAAAAJ", "orcid": "0000-0002-1588-2654;;;0000-0003-1493-7569", "linkedin": ";;;", "or_profile": "~Mingjie_Li1;~Yisen_Wang1;~Xingyu_Xie1;~Zhouchen_Lin1", "aff": "Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;Assistant Professor;PhD student;Professor", "bibtex": "@inproceedings{\nli2022optimization,\ntitle={Optimization inspired Multi-Branch Equilibrium Models},\nauthor={Mingjie Li and Yisen Wang and Xingyu Xie and Zhouchen Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nbC8iTTXIrk}\n}", "github": "", "project": "", "reviewers": "8Er5;61kJ;DkRL;2WhH", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;1;2;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "83;76;23;97", "wc_summary_review": "66;37;42;39", "wc_main_review": "255;61;176;161", "wc_review": "404;174;241;297", "wc_reply_reviewers": "0;0;17;22", "wc_reply_authors": "791;490;418;735", "reply_reviewers": "0;0;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 2.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.75, 28.030117730755254 ], "wc_summary_review_avg": [ 46.0, 11.683321445547923 ], "wc_main_review_avg": [ 163.25, 68.99411206762501 ], "wc_review_avg": [ 279.0, 84.28819608936948 ], "wc_reply_reviewers_avg": [ 9.75, 9.908960591303208 ], "wc_reply_authors_avg": [ 608.5, 157.82981340671984 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14721047002914500142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=nbC8iTTXIrk", "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Minimax Optimality (Probably) Doesn't Imply Distribution Learning for GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7087", "id": "nc0ETaieux", "poster": "", "openreview": "https://openreview.net/forum?id=nc0ETaieux", "slides":
"https://iclr.cc/virtual/2022/poster/7087", "video": "https://iclr.cc/virtual/2022/poster/7087", "author_site": "Sitan Chen, Jerry Li, Yuanzhi Li, Raghu Meka", "tldr": "", "abstract": "Arguably the most fundamental question in the theory of generative adversarial networks (GANs) is to understand when GANs can actually learn the underlying distribution. Theoretical and empirical evidence (see e.g. Arora-Risteski-Zhang '18) suggests that local optimality of the empirical training objective is insufficient, yet it does not rule out the possibility that achieving a true population minimax optimal solution might imply distribution learning. In this paper, we show that standard cryptographic assumptions imply that this stronger condition is still insufficient. Namely, we show that if local pseudorandom generators (PRGs) exist, then for a large family of natural target distributions, there are ReLU network generators of constant depth and poly size which take Gaussian random seeds so that (i) the output is far in Wasserstein distance from the target distribution, but (ii) no polynomially large Lipschitz discriminator ReLU network can detect this. This implies that even achieving a population minimax optimal solution to the Wasserstein GAN objective is likely insufficient for distribution learning. Our techniques reveal a deep connection between GANs and PRGs, which we believe will lead to further insights into the computational landscape of GANs.", "keywords": "theory of GANs;distribution learning;pseudorandom generators;cryptography", "primary_area": "", "supplementary_material": "/attachment/7036053b48423a94822be67ff3be2b1fcce7369a.zip", "author": "Sitan Chen;Jerry Li;Yuanzhi Li;Raghu Meka", "authorids": "~Sitan_Chen1;~Jerry_Li1;~Yuanzhi_Li1;~Raghu_Meka1", "gender": "M;M;M;M", "homepage": "https://sitanchen.com;https://jerryzli.github.io/;;http://raghumeka.org", "dblp": "141/7670;;73/3628;76/1906", "google_scholar": "YnJVsp4AAAAJ;4zybTq4AAAAJ;;xuDZ9-sAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Sitan_Chen1;~Jerry_Li1;~Yuanzhi_Li1;~Raghu_Meka1", "aff": "University of California, Berkeley;Microsoft;Carnegie Mellon University;University of California, Los Angeles", "aff_domain": "berkeley.edu;microsoft.com;andrew.cmu.edu;ucla.edu", "position": "Postdoc;Senior Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nchen2022minimax,\ntitle={Minimax Optimality (Probably) Doesn't Imply Distribution Learning for {GAN}s},\nauthor={Sitan Chen and Jerry Li and Yuanzhi Li and Raghu Meka},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nc0ETaieux}\n}", "github": "", "project": "", "reviewers": "yN8S;h3Jv;qEsw;DfoH;v7TX", "pdf_size": 0, "recommendation": "3;6;6;6;6", "confidence": "3;3;4;3;4", "correctness": "3;3;3;3;4", "technical_novelty": "3;4;3;3;4", "empirical_novelty": "1;3;3;0;1", "wc_summary_paper": "54;34;126;116;49", "wc_summary_review": "36;45;30;23;118", "wc_main_review": "242;185;475;202;171", "wc_review": "332;264;631;341;338", "wc_reply_reviewers": "0;0;15;73;20", "wc_reply_authors": "658;33;386;478;318", "reply_reviewers": "0;0;1;1;1", "reply_authors": "2;1;1;2;1", "recommendation_avg": [ 5.4, 1.2 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 3.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.6, 1.2 ], "wc_summary_paper_avg": [ 75.8, 37.62127057928799 ], "wc_summary_review_avg": [ 50.4, 34.563564630980984
], "wc_main_review_avg": [ 255.0, 112.5468791215465 ], "wc_review_avg": [ 381.2, 128.09277887531366 ], "wc_reply_reviewers_avg": [ 21.6, 26.911707489492375 ], "wc_reply_authors_avg": [ 374.6, 205.44157320269917 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4082482904638631, "corr_recommendation_correctness": 0.25, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5905271088691545189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nc0ETaieux", "email": "berkeley.edu;microsoft.com;andrew.cmu.edu;ucla.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of California, Berkeley;Microsoft;Carnegie Mellon University;University of California, Los Angeles", "aff_unique_dep": ";Microsoft Corporation;;", "aff_unique_url": "https://www.berkeley.edu;https://www.microsoft.com;https://www.cmu.edu;https://www.ucla.edu", "aff_unique_abbr": "UC Berkeley;Microsoft;CMU;UCLA", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Berkeley;;Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Unsupervised Disentanglement with Tensor Product Representations on the Torus", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5949", "id": "neqU3HWDgE", "poster": "", "openreview": "https://openreview.net/forum?id=neqU3HWDgE", "slides": "https://iclr.cc/virtual/2022/poster/5949", "video": "https://iclr.cc/virtual/2022/poster/5949", "author_site": "Michael Rotman, Amit Dekel, shir gur, Yaron Oz, Lior Wolf", "tldr": "", "abstract": "The current methods for learning representations with auto-encoders almost exclusively employ vectors as the latent representations. In this work, we propose to employ a tensor product structure for this purpose. This way, the obtained representations are naturally disentangled. In contrast to the conventional variational methods, which are targeted toward normally distributed features, the latent space in our representation is distributed uniformly over a set of unit circles. We argue that the torus structure of the latent space captures the generative factors effectively. We employ recent tools for measuring unsupervised disentanglement, and in an extensive set of experiments demonstrate the advantage of our method in terms of disentanglement, completeness, and informativeness.
The code for our proposed method is available at https://github.com/rotmanmi/Unsupervised-Disentanglement-Torus.", "keywords": "Variational Auto-Encoder;Disentanglement Learning", "primary_area": "", "supplementary_material": "/attachment/40d8a3d6a5f52a58c91870187a926f231723c1a0.zip", "author": "Michael Rotman;Amit Dekel;Shir Gur;Yaron Oz;Lior Wolf", "authorids": "~Michael_Rotman1;~Amit_Dekel1;~Shir_Gur1;~Yaron_Oz1;~Lior_Wolf1", "gender": ";M;;;M", "homepage": "https://rotmanmichael.com;;http://www.gurshir.com;;http://www.cs.tau.ac.il/~wolf", "dblp": "217/3007;259/2006;211/7229;;83/4103", "google_scholar": "tzlpNi8AAAAJ;mY12KaoAAAAJ;;;UbFrXTsAAAAJ", "orcid": ";;;;0000-0001-5578-8892", "linkedin": ";;;;", "or_profile": "~Michael_Rotman1;~Amit_Dekel1;~Shir_Gur1;~Yaron_Oz1;~Lior_Wolf1", "aff": "General Electric;Univrses;School of Computer Science, Tel Aviv University;Tel Aviv University, Technion;Tel Aviv University", "aff_domain": "ge.com;univrses.com;cs.tau.ac.il;tau.ac.il;tau.ac.il", "position": "Researcher;Researcher;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nrotman2022unsupervised,\ntitle={Unsupervised Disentanglement with Tensor Product Representations on the Torus},\nauthor={Michael Rotman and Amit Dekel and Shir Gur and Yaron Oz and Lior Wolf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=neqU3HWDgE}\n}", "github": "", "project": "", "reviewers": "UzjD;4Dgs;YGh8;DucY", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "5;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;4;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "27;60;106;102", "wc_summary_review": "20;33;79;50", "wc_main_review": "349;171;347;370", "wc_review": "396;264;532;522", "wc_reply_reviewers": "540;0;277;31", "wc_reply_authors": "2111;747;2023;177", "reply_reviewers": "5;0;2;1", "reply_authors": "6;1;5;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 73.75, 32.45285041410076 ], "wc_summary_review_avg": [ 45.5, 22.073740054644116 ], "wc_main_review_avg": [ 309.25, 80.32550964668697 ], "wc_review_avg": [ 428.5, 109.05388576295665 ], "wc_reply_reviewers_avg": [ 212.0, 217.666028585078 ], "wc_reply_authors_avg": [ 1264.5, 828.001660626354 ], "reply_reviewers_avg": [ 2.0, 1.8708286933869707 ], "reply_authors_avg": [ 3.25, 2.277608394786075 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6998739952495694, "corr_recommendation_correctness": 0.8551861104941366, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12503699134919857893&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=neqU3HWDgE", "email": "ge.com;univrses.com;cs.tau.ac.il;tau.ac.il;tau.ac.il", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "General Electric;Univrses;Tel Aviv University", "aff_unique_dep": ";;School of Computer Science", "aff_unique_url": "https://www.ge.com;;https://www.tau.ac.il", "aff_unique_abbr": "GE;;TAU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tel Aviv", "aff_country_unique_index": "0;2;2;2", "aff_country_unique": "United States;;Israel" }, { "title": "Surreal-GAN:Semi-Supervised Representation Learning via GAN for uncovering heterogeneous 
disease-related imaging patterns", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6944", "id": "nf3A0WZsXS5", "poster": "", "openreview": "https://openreview.net/forum?id=nf3A0WZsXS5", "slides": "https://iclr.cc/virtual/2022/poster/6944", "video": "https://iclr.cc/virtual/2022/poster/6944", "author_site": "Zhijian Yang, Junhao Wen, Christos Davatzikos", "tldr": "", "abstract": "A plethora of machine learning methods have been applied to imaging data, enabling the construction of clinically relevant imaging signatures of neurological and neuropsychiatric diseases. Oftentimes, such methods don't explicitly model the heterogeneity of disease effects, or approach it via nonlinear models that are not interpretable. Moreover, unsupervised methods may parse heterogeneity that is driven by nuisance confounding factors that affect brain structure or function, rather than heterogeneity relevant to a pathology of interest. On the other hand, semi-supervised clustering methods seek to derive a dichotomous subtype membership, ignoring the fact that disease heterogeneity extends spatially and temporally along a continuum. To address the aforementioned limitations, herein, we propose a novel method, termed Surreal-GAN (Semi-SUpeRvised ReprEsentAtion Learning via GAN). Using cross-sectional imaging data, Surreal-GAN dissects underlying disease-related heterogeneity under the principle of semi-supervised clustering (cluster mappings from normal control to patient), proposes a continuous dimensional representation, and infers the disease severity of patients at the individual level along each dimension. The model first learns a transformation function from the normal control (CN) domain to the patient (PT) domain with latent variables controlling transformation directions. An inverse mapping function together with regularization on function continuity, pattern orthogonality and monotonicity was also imposed to ensure that the transformation function captures meaningful imaging patterns with clinical significance.
We first validated the model through extensive semi-synthetic experiments, and then demonstrated its potential in capturing biologically plausible imaging patterns in Alzheimer's disease (AD).", "keywords": "Representation Learning;Disease-related imaging patterns;Alzheimer's disease;MRI;GAN", "primary_area": "", "supplementary_material": "", "author": "Zhijian Yang;Junhao Wen;Christos Davatzikos", "authorids": "~Zhijian_Yang2;junhao.wen89@gmail.com;~Christos_Davatzikos1", "gender": "M;;M", "homepage": "https://zhijian-yang.github.io;;", "dblp": ";;", "google_scholar": "CdJ2xRIAAAAJ;;", "orcid": "0000-0002-7863-827X;;", "linkedin": ";;", "or_profile": "~Zhijian_Yang2;junhao.wen89@gmail.com;~Christos_Davatzikos1", "aff": "University of Pennsylvania;;", "aff_domain": "sas.upenn.edu;;", "position": "PhD student;;", "bibtex": "@inproceedings{\nyang2022surrealgansemisupervised,\ntitle={Surreal-{GAN}:Semi-Supervised Representation Learning via {GAN} for uncovering heterogeneous disease-related imaging patterns},\nauthor={Zhijian Yang and Junhao Wen and Christos Davatzikos},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nf3A0WZsXS5}\n}", "github": "", "project": "", "reviewers": "jVuo;8j3t;QqEE;Rbz3", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "1;3;3;4", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "33;121;93;161", "wc_summary_review": "29;21;33;53", "wc_main_review": "170;322;494;291", "wc_review": "232;464;620;505", "wc_reply_reviewers": "0;13;56;0", "wc_reply_authors": "925;991;991;1172", "reply_reviewers": "0;1;1;0", "reply_authors": "2;3;3;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 1.0897247358851685 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 102.0, 46.593991028886975 ], "wc_summary_review_avg": [ 34.0, 11.789826122551595 ], "wc_main_review_avg": [ 319.25, 115.77861417377564 ], "wc_review_avg": [ 455.25, 141.00952981979623 ], "wc_reply_reviewers_avg": [ 17.25, 22.993205518152532 ], "wc_reply_authors_avg": [ 1019.75, 91.93849846500649 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=nf3A0WZsXS5", "email": "sas.upenn.edu;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "ngjR4Gw9oAp", "title": "Soft Actor-Critic with Inhibitory Networks for Faster Retraining", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reusing previously trained models is critical in deep reinforcement learning to speed up training of new agents. However, it is unclear how to acquire new skills when objectives and constraints are in conflict with previously learned skills. Moreover, when retraining, there is an intrinsic conflict between exploiting what has already been learned and exploring new skills.
In soft actor-critic (SAC) methods, a temperature parameter can be dynamically adjusted to weight the action entropy and balance the explore $\\times$ exploit trade-off. However, controlling a single coefficient can be challenging within the context of retraining, even more so when goals are contradictory. In this work, inspired by neuroscience research, we propose a novel approach using inhibitory networks to allow separate and adaptive state value evaluations, as well as distinct automatic entropy tuning. Ultimately, our approach allows for controlling inhibition to handle conflict between exploiting less risky, acquired behaviors and exploring novel ones to overcome more challenging tasks. We validate our method through experiments in OpenAI Gym environments. ", "keywords": "soft actor-critic;SAC;maximum entropy RL;inhibitory response;cognitive control", "primary_area": "", "supplementary_material": "/attachment/fce454a55bbd0fc0128bc934dbf149a76ed8097f.zip", "author": "Jaime S. Ide;Daria Micovic;Adrian P Pope;Michael John Guarino;Kevin Alcedo;David Rosenbluth", "authorids": "~Jaime_S._Ide1;~Daria_Micovic1;~Adrian_P_Pope1;~Michael_John_Guarino1;~Kevin_Alcedo1;~David_Rosenbluth1", "gender": "F;;M;;M;M", "homepage": ";https://www.primordial-labs.com;;http://google.com;;https://medicine.yale.edu/profile/jaime_ide/", "dblp": ";;;;;42/962", "google_scholar": ";;;;;6VoMxawAAAAJ", "orcid": ";;;;;0000-0002-7223-1102", "linkedin": "daria-mi%C4%87ovi%C4%87-aa531910a;;michael-g-331006116;;david-rosenbluth-353123bb/;jaime-ide-a911407/", "or_profile": "~Daria_Micovic1;~Adrian_P_Pope1;~Michael_John_Guarino1;~Kevin_Alcedo1;~David_Rosenbluth1;~Jaime_Shinsuke_Ide1", "aff": "Lockheed Martin;UC Santa Barbara;;School of Engineering and Applied Science, University of Pennsylvania;;Yale University", "aff_domain": "lmco.com;ucsb.edu;;seas.upenn.edu;;yale.edu", "position": "Researcher;Undergrad student;;MS student;;Researcher", "bibtex": "@misc{\nide2022soft,\ntitle={Soft Actor-Critic with Inhibitory Networks for Faster Retraining},\nauthor={Jaime S. 
Ide and Daria Micovic and Adrian P Pope and Michael John Guarino and Kevin Alcedo and David Rosenbluth},\nyear={2022},\nurl={https://openreview.net/forum?id=ngjR4Gw9oAp}\n}", "github": "", "project": "", "reviewers": "y7rr;tm8g;6ibM;dDqD", "site": "https://openreview.net/forum?id=ngjR4Gw9oAp", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;3;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "90;69;90;72", "wc_summary_review": "23;36;202;96", "wc_main_review": "727;625;488;880", "wc_review": "840;730;780;1048", "wc_reply_reviewers": "56;0;0;96", "wc_reply_authors": "746;482;974;585", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.25, 9.807522622966516 ], "wc_summary_review_avg": [ 89.25, 70.68017756061455 ], "wc_main_review_avg": [ 680.0, 143.26374279628465 ], "wc_review_avg": [ 849.5, 121.04028255089295 ], "wc_reply_reviewers_avg": [ 38.0, 40.54626986542659 ], "wc_reply_authors_avg": [ 696.75, 185.6736047476862 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2823374662951755278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Lockheed Martin Corporation;University of California, Santa Barbara;University of Pennsylvania;Yale University", "aff_unique_dep": ";;School of Engineering and Applied Science;", "aff_unique_url": "https://www.lockheedmartin.com;https://www.ucsb.edu;https://www.upenn.edu;https://www.yale.edu", "aff_unique_abbr": "Lockheed Martin;UCSB;UPenn;Yale", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Comparison of Hamming Errors of Representative Variable Selection Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6651", "id": "nhN-fqxmNGx", "poster": "", "openreview": "https://openreview.net/forum?id=nhN-fqxmNGx", "slides": "https://iclr.cc/virtual/2022/poster/6651", "video": "https://iclr.cc/virtual/2022/poster/6651", "author_site": "Tracy Ke, Longlin Wang", "tldr": "", "abstract": "Lasso is a celebrated method for variable selection in linear models, but it faces challenges when the covariates are moderately or strongly correlated. This motivates alternative approaches such as using a non-convex penalty, adding ridge regularization, or conducting post-Lasso thresholding. In this paper, we compare Lasso with five other methods: Elastic net, SCAD, forward selection, thresholded Lasso, and forward backward selection. We measure their performance theoretically by the expected Hamming error, assuming that the regression coefficients are ${\\it iid}$ drawn from a two-point mixture and that the Gram matrix is block-wise diagonal.
By deriving the rates of convergence of Hamming errors and the phase diagrams, we obtain useful conclusions about the pros and cons of different methods.", "keywords": "Lasso;Hamming error;phase diagram;rare and weak signals;elastic net;SCAD;thresholded Lasso;forward selection;forward backward selection", "primary_area": "", "supplementary_material": "/attachment/4f16334f29e6421dce68a951e210ea5abf0531db.zip", "author": "Tracy Ke;Longlin Wang", "authorids": "~Tracy_Ke1;~Longlin_Wang1", "gender": "M;F", "homepage": ";https://www.tracyke.net", "dblp": ";185/0521.html", "google_scholar": ";JlIhAO4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Longlin_Wang1;~Zheng_Ke1", "aff": "Statistics Department, Harvard University;Harvard University", "aff_domain": "g.harvard.edu;harvard.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nke2022a,\ntitle={A Comparison of Hamming Errors of Representative Variable Selection Methods},\nauthor={Tracy Ke and Longlin Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nhN-fqxmNGx}\n}", "github": "", "project": "", "reviewers": "darD;fogT;3QXZ;YAB8", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;3;3;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "1;3;3;4", "wc_summary_paper": "36;130;73;110", "wc_summary_review": "146;181;16;20", "wc_main_review": "173;208;99;240", "wc_review": "355;519;188;370", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 87.25, 35.967867604293694 ], "wc_summary_review_avg": [ 90.75, 73.80845141310039 ], "wc_main_review_avg": [ 180.0, 52.426138518872435 ], "wc_review_avg": [ 358.0, 117.23267462614679 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5488604301969737, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6939802812733498505&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=nhN-fqxmNGx", "email": "g.harvard.edu;harvard.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "Statistics Department", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Vision-Guided Quadrupedal Locomotion End-to-End with Cross-Modal Transformers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6919", "id": "nhnJ3oo6AB", "poster": "", "openreview": "https://openreview.net/forum?id=nhnJ3oo6AB", "slides": "https://iclr.cc/virtual/2022/poster/6919", "video": "https://iclr.cc/virtual/2022/poster/6919", "author_site": "Ruihan Yang, Minghao Zhang, Nicklas Hansen, Huazhe Xu, Xiaolong Wang", "tldr": "", "abstract": "We propose to address quadrupedal locomotion tasks using Reinforcement Learning 
(RL) with a Transformer-based model that learns to combine proprioceptive information and high-dimensional depth sensor inputs. While learning-based locomotion has made great advances using RL, most methods still rely on domain randomization for training blind agents that generalize to challenging terrains. Our key insight is that proprioceptive states only offer contact measurements for immediate reaction, whereas an agent equipped with visual sensory observations can learn to proactively maneuver through environments with obstacles and uneven terrain by anticipating changes in the environment many steps ahead. In this paper, we introduce LocoTransformer, an end-to-end RL method that leverages both proprioceptive states and visual observations for locomotion control. We evaluate our method in challenging simulated environments with different obstacles and uneven terrain. We transfer our learned policy from simulation to a real robot by running it indoors and in the wild with unseen obstacles and terrain. Our method not only significantly improves over baselines, but also achieves far better generalization performance, especially when transferred to the real robot. Our project page with videos is at https://rchalyang.github.io/LocoTransformer/.", "keywords": "Reinforcement Learning;Robotics;Locomotion Control;Multi-Modal Transformer", "primary_area": "", "supplementary_material": "", "author": "Ruihan Yang;Minghao Zhang;Nicklas Hansen;Huazhe Xu;Xiaolong Wang", "authorids": "~Ruihan_Yang2;~Minghao_Zhang1;~Nicklas_Hansen1;~Huazhe_Xu1;~Xiaolong_Wang3", "gender": "M;M;Non-Binary;M;M", "homepage": "http://rchalyang.github.io/;https://www.minghaozhang.com;https://nicklashansen.github.io;http://hxu.rocks;https://xiaolonw.github.io/", "dblp": ";137/0566;258/0744.html;164/9006;91/952-4", "google_scholar": "b-o1o7cAAAAJ;moOv1BsAAAAJ;OFtDgzwAAAAJ;t9HPFawAAAAJ;Y8O9N_0AAAAJ", "orcid": ";;0000-0001-9897-4003;;", "linkedin": ";;ncklas;;", "or_profile": "~Ruihan_Yang2;~Minghao_Zhang1;~Nicklas_Hansen1;~Huazhe_Xu1;~Xiaolong_Wang3", "aff": "University of California, San Diego;Tsinghua University;Meta;Stanford University;University of California, San Diego", "aff_domain": "ucsd.edu;tsinghua.edu.cn;fb.com;stanford.edu;ucsd.edu", "position": "PhD student;Undergrad student;Intern;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nyang2022learning,\ntitle={Learning Vision-Guided Quadrupedal Locomotion End-to-End with Cross-Modal Transformers},\nauthor={Ruihan Yang and Minghao Zhang and Nicklas Hansen and Huazhe Xu and Xiaolong Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nhnJ3oo6AB}\n}", "github": "", "project": "", "reviewers": "XXAB;Dcnh;GRoY;scjr", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;4;4", "correctness": "2;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "70;49;93;105", "wc_summary_review": "43;46;47;114", "wc_main_review": "148;502;475;615", "wc_review": "261;597;615;834", "wc_reply_reviewers": "0;7;0;0", "wc_reply_authors": "85;752;326;677", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.25, 21.52179128232592 ], "wc_summary_review_avg": [ 62.5, 29.769951293208393 ],
"wc_main_review_avg": [ 435.0, 173.82318602534014 ], "wc_review_avg": [ 576.75, 204.78571117145844 ], "wc_reply_reviewers_avg": [ 1.75, 3.031088913245535 ], "wc_reply_authors_avg": [ 460.0, 269.69149041080254 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18096638761623546135&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=nhnJ3oo6AB", "email": "ucsd.edu;tsinghua.edu.cn;fb.com;stanford.edu;ucsd.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of California, San Diego;Tsinghua University;Meta;Stanford University", "aff_unique_dep": ";;Meta Platforms, Inc.;", "aff_unique_url": "https://www.ucsd.edu;https://www.tsinghua.edu.cn;https://meta.com;https://www.stanford.edu", "aff_unique_abbr": "UCSD;THU;Meta;Stanford", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "San Diego;;Stanford", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;China" }, { "id": "niZImJIrqVt", "title": "Mean-Variance Efficient Reinforcement Learning by Expected Quadratic Utility Maximization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Risk management is critical in decision making, and mean-variance (MV) trade-off is one of the most common criteria. However, in reinforcement learning (RL) for sequential decision making under uncertainty, most of the existing methods for MV control suffer from computational difficulties owing to calculating the gradient of the variance term. In this paper, in contrast to strict MV control, we consider learning MV efficient policies that achieve Pareto efficiency regarding MV trade-off. To achieve this purpose, we train an agent to maximize the expected quadratic utility function, a common objective of risk management in finance and economics. We call our approach RL based on expected quadratic utility maximization (EQUMRL). The EQUMRL does not suffer from the computational difficulties because it does not include gradient estimation of the variance. We confirm that the maximizer of the objective in the EQUMRL directly corresponds to an MV efficient policy under a certain condition. 
We conduct experiments with benchmark settings to demonstrate the effectiveness of the EQUMRL.", "keywords": "Reinforcement learning;Mean-variance tradeoff", "primary_area": "", "supplementary_material": "/attachment/94e2c41f8ec3d209e054554c0edf9d2c59ff07ff.zip", "author": "Masahiro Kato;Kei Nakagawa;Kenshi Abe;Tetsuro Morimura", "authorids": "~Masahiro_Kato1;~Kei_Nakagawa1;~Kenshi_Abe1;~Tetsuro_Morimura1", "gender": "M;M;M;M", "homepage": "https://masakat0.github.io/;https://sites.google.com/view/keinakagawa/home;https://bakanaouji.github.io/;", "dblp": ";151/9863;254/2763;36/1501", "google_scholar": "https://scholar.google.co.jp/schhp?hl=ja;https://scholar.google.co.jp/citations?user=SDYNtbAAAAAJ;rImmohoAAAAJ;https://scholar.google.co.jp/citations?user=IgjF21EAAAAJ", "orcid": ";0000-0001-5046-8128;;", "linkedin": ";kei-nakagawa-0046979a/;;", "or_profile": "~Masahiro_Kato1;~Kei_Nakagawa1;~Kenshi_Abe1;~Tetsuro_Morimura1", "aff": "Cyberagent;Nomura Asset Management co,ltd.;CyberAgent, Inc.;CyberAgent, Inc.", "aff_domain": "cyberagent.co.jp;nomura-am.co.jp;cyberagent.co.jp;cyberagent.co.jp", "position": "Researcher;Principal Researcher;Research scientist;Researcher", "bibtex": "@misc{\nkato2022meanvariance,\ntitle={Mean-Variance Efficient Reinforcement Learning by Expected Quadratic Utility Maximization},\nauthor={Masahiro Kato and Kei Nakagawa and Kenshi Abe and Tetsuro Morimura},\nyear={2022},\nurl={https://openreview.net/forum?id=niZImJIrqVt}\n}", "github": "", "project": "", "reviewers": "TnBm;wFpo;FXvX;fgBH;MBfd", "site": "https://openreview.net/forum?id=niZImJIrqVt", "pdf_size": 0, "recommendation": "5;5;5;6;8", "confidence": "4;3;3;4;4", "correctness": "3;3;4;3;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;3;2;0;3", "wc_summary_paper": "104;58;64;57;90", "wc_summary_review": "62;55;16;29;16", "wc_main_review": "485;287;242;138;159", "wc_review": "651;400;322;224;265", "wc_reply_reviewers": "0;269;109;0;0", "wc_reply_authors": "793;818;728;437;174", "reply_reviewers": "0;2;1;0;0", "reply_authors": "1;3;2;1;1", "recommendation_avg": [ 5.8, 1.16619037896906 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 74.6, 18.969449122207003 ], "wc_summary_review_avg": [ 35.6, 19.417517864031954 ], "wc_main_review_avg": [ 262.2, 123.8860766995226 ], "wc_review_avg": [ 372.4, 151.31239209000697 ], "wc_reply_reviewers_avg": [ 75.6, 105.51322192028827 ], "wc_reply_authors_avg": [ 590.0, 248.4761557976942 ], "reply_reviewers_avg": [ 0.6, 0.7999999999999999 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.560112033611204, "corr_recommendation_correctness": 0.4900980294098034, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10829040113741111485&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "CyberAgent Inc.;Nomura Asset Management Co., Ltd.;CyberAgent", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cyberagent.co.jp;https://www.nomura-am.com;https://www.cyberagent.co.jp", "aff_unique_abbr": "CyberAgent;NAM;CyberAgent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Likelihood Training of Schr\u00f6dinger Bridge using Forward-Backward SDEs Theory", 
"status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6506", "id": "nioAdKCEdXB", "poster": "", "openreview": "https://openreview.net/forum?id=nioAdKCEdXB", "slides": "https://iclr.cc/virtual/2022/poster/6506", "video": "https://iclr.cc/virtual/2022/poster/6506", "author_site": "Tianrong Chen, Guan-Horng Liu, Evangelos Theodorou", "tldr": "", "abstract": "Schr\u00f6dinger Bridge (SB) is an entropy-regularized optimal transport problem that has received increasing attention in deep generative modeling for its mathematical flexibility compared to the Scored-based Generative Model (SGM). However, it remains unclear whether the optimization principle of SB relates to the modern training of deep generative models, which often rely on constructing log-likelihood objectives.This raises questions on the suitability of SB models as a principled alternative for generative applications. In this work, we present a novel computational framework for likelihood training of SB models grounded on Forward-Backward Stochastic Differential Equations Theory \u2013 a mathematical methodology appeared in stochastic optimal control that transforms the optimality condition of SB into a set of SDEs. Crucially, these SDEs can be used to construct the likelihood objectives for SB that, surprisingly, generalizes the ones for SGM as special cases. This leads to a new optimization principle that inherits the same SB optimality yet without losing applications of modern generative training techniques, and we show that the resulting training algorithm achieves comparable results on generating realistic images on MNIST, CelebA, and CIFAR10. Our code is available at https://github.com/ghliu/SB-FBSDE.", "keywords": "Schr\u00f6dinger Bridge;score-based generative model;optimal transport;forward-backward stochastic differential equations;stochastic optimal control", "primary_area": "", "supplementary_material": "", "author": "Tianrong Chen;Guan-Horng Liu;Evangelos Theodorou", "authorids": "~Tianrong_Chen1;~Guan-Horng_Liu1;~Evangelos_Theodorou1", "gender": "M;;M", "homepage": "https://tianrongchen.github.io/;https://ghliu.github.io;", "dblp": "227/7295;143/6907;155/9964", "google_scholar": "r9D3Fg50gMoC;2Dt0VJ4AAAAJ;", "orcid": ";;", "linkedin": "tianrong-chen-757b3216a/;;", "or_profile": "~Tianrong_Chen1;~Guan-Horng_Liu1;~Evangelos_Theodorou1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nchen2022likelihood,\ntitle={Likelihood Training of Schr\\\"odinger Bridge using Forward-Backward {SDE}s Theory},\nauthor={Tianrong Chen and Guan-Horng Liu and Evangelos Theodorou},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nioAdKCEdXB}\n}", "github": "", "project": "", "reviewers": "4oR9;dQMT;Sm2V;8tG7", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;5;4;3", "correctness": "3;3;3;3", "technical_novelty": "4;3;4;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "67;95;103;69", "wc_summary_review": "58;127;67;49", "wc_main_review": "645;848;555;432", "wc_review": "770;1070;725;550", "wc_reply_reviewers": "93;792;83;144", "wc_reply_authors": "2131;2925;695;1522", "reply_reviewers": "1;1;1;2", "reply_authors": "5;5;1;4", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], 
"correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 83.5, 15.771810295587505 ], "wc_summary_review_avg": [ 75.25, 30.548117781624452 ], "wc_main_review_avg": [ 620.0, 151.804150140897 ], "wc_review_avg": [ 778.75, 187.1621957020167 ], "wc_reply_reviewers_avg": [ 278.0, 297.65836121298526 ], "wc_reply_authors_avg": [ 1818.25, 817.3375603139746 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 1.6393596310755 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.058025885318565944, "corr_recommendation_correctness": 0.0, "gs_citation": 181, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17490002779543160036&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=nioAdKCEdXB", "email": "gatech.edu;gatech.edu;gatech.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "nj6G6ZPMuX", "title": "Reconstruction for disentanglement, Contrast for invariance", "track": "main", "status": "Withdraw", "tldr": "", "abstract": " Disentangled and invariant representation are two vital goals for representation learning and many approaches have been proposed to achieve one of them. However, those two goals are actually complementary to each other and we propose a framework to accomplish both of them together. We introduce weakly supervised signals to learn disentangled representation and use contrastive methods to enforce invariant representation. 
By experimenting on state-of-the-art datasets, the results show that our framework outperforms previous works on both tasks.\n\n\n\n", "keywords": "Representation learning;Disentanglement learning;Invariant learning", "primary_area": "", "supplementary_material": "", "author": "Jiageng Zhu;Hanchen Xie;Wael AbdAlmageed", "authorids": "~Jiageng_Zhu1;~Hanchen_Xie1;~Wael_AbdAlmageed2", "gender": "M;M;M", "homepage": ";;https://www.clemson.edu/cecas/departments/ece/faculty_staff/faculty/wabdalmageed.html", "dblp": "321/9991;280/0511;", "google_scholar": "UhGyWuYAAAAJ;yVPreH4AAAAJ;tRGH8FkAAAAJ", "orcid": "0009-0002-0162-6534;0009-0004-4474-4877;", "linkedin": "jiageng-zhu-05269214a;;wael-abdalmageed-8233536", "or_profile": "~Jiageng_Zhu1;~Hanchen_Xie1;~Wael_AbdAlmageed2", "aff": "University of Southern California;USC/ISI;USC Information Sciences Institute", "aff_domain": "usc.edu;isi.edu;isi.edu", "position": "PhD student;Graduate Research Assistant;Research Director", "bibtex": "@misc{\nzhu2022reconstruction,\ntitle={Reconstruction for disentanglement, Contrast for invariance},\nauthor={Jiageng Zhu and Hanchen Xie and Wael AbdAlmageed},\nyear={2022},\nurl={https://openreview.net/forum?id=nj6G6ZPMuX}\n}", "github": "", "project": "", "reviewers": "i9bg;b4dg;w5Q5;oijK", "site": "https://openreview.net/forum?id=nj6G6ZPMuX", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "185;158;24;111", "wc_summary_review": "75;64;29;24", "wc_main_review": "554;1243;413;347", "wc_review": "814;1465;466;482", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 119.5, 61.1657583947097 ], "wc_summary_review_avg": [ 48.0, 21.920310216782973 ], "wc_main_review_avg": [ 639.25, 356.50411989204275 ], "wc_review_avg": [ 806.75, 404.6352524187679 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pML_gnpuQcoJ:scholar.google.com/&scioq=Reconstruction+for+disentanglement,+Contrast+for+invariance&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Los Angeles;ISI;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Autonomous Reinforcement Learning: Formalism and Benchmarking", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7153", "id": "nkaba3ND7B5", "poster": "", "openreview": "https://openreview.net/forum?id=nkaba3ND7B5", "slides": "https://iclr.cc/virtual/2022/poster/7153", "video": "https://iclr.cc/virtual/2022/poster/7153", "author_site": "Archit Sharma, Kelvin Xu, Nikhil Sardana, Abhishek Gupta, Karol Hausman, Sergey Levine, Chelsea Finn", 
"tldr": "", "abstract": "Reinforcement learning (RL) provides a naturalistic framing for learning through trial and error, which is appealing both because of its simplicity and effectiveness and because of its resemblance to how humans and animals acquire skills through experience. However, real-world embodied learning, such as that performed by humans and animals, is situated in a continual, non-episodic world, whereas common benchmark tasks in RL are episodic, with the environment resetting between trials to provide the agent with multiple attempts. This discrepancy presents a major challenge when we attempt to take RL algorithms developed for episodic simulated environments and run them on real-world platforms, such as robots. In this paper, we aim to address this discrepancy by laying out a framework for Autonomous Reinforcement Learning (ARL): reinforcement learning where the agent not only learns through its own experience, but also contends with lack of human supervision to reset between trials. We introduce a simulated benchmark EARL based on this framework, containing a set of diverse and challenging simulated tasks reflective of the hurdles introduced to learning when only a minimal reliance on extrinsic intervention can be assumed. We show that standard approaches to episodic RL and existing approaches struggle as interventions are minimized, underscoring the need for developing new algorithms for reinforcement learning with a greater focus on autonomy.", "keywords": "reinforcement learning;autonomous;reset-free reinforcement learning;continual reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/b29a03fb4c6837df5c14ab66e062b49205ece33a.zip", "author": "Archit Sharma;Kelvin Xu;Nikhil Sardana;Abhishek Gupta;Karol Hausman;Sergey Levine;Chelsea Finn", "authorids": "~Archit_Sharma1;~Kelvin_Xu2;~Nikhil_Sardana1;~Abhishek_Gupta1;~Karol_Hausman2;~Sergey_Levine1;~Chelsea_Finn1", "gender": "M;Unspecified;;M;;M;F", "homepage": ";http://kelvinxu.github.io/;;https://homes.cs.washington.edu/~abhgupta/;;https://people.eecs.berkeley.edu/~svlevine/;https://ai.stanford.edu/~cbfinn/", "dblp": "220/3163.html;159/1894;;18/6404-4;;80/7594;131/1783", "google_scholar": "_0IIzxgAAAAJ;GyoKzFwAAAAJ;;1wLVDP4AAAAJ;;8R35rCwAAAAJ;vfPE6hgAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Archit_Sharma1;~Kelvin_Xu2;~Nikhil_Sardana1;~Abhishek_Gupta1;~Karol_Hausman2;~Sergey_Levine1;~Chelsea_Finn1", "aff": "Stanford University;University of California, Berkeley;;Massachusetts Institute of Technology;;Google;Google", "aff_domain": "stanford.edu;berkeley.edu;;mit.edu;;google.com;google.com", "position": "Graduate Student;PhD student;;Postdoc;;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nsharma2022autonomous,\ntitle={Autonomous Reinforcement Learning: Formalism and Benchmarking},\nauthor={Archit Sharma and Kelvin Xu and Nikhil Sardana and Abhishek Gupta and Karol Hausman and Sergey Levine and Chelsea Finn},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nkaba3ND7B5}\n}", "github": "", "project": "", "reviewers": "AL8i;omLm;sj9u;C7Cj", "pdf_size": 0, "recommendation": "3;5;8;8", "confidence": "4;4;4;4", "correctness": "2;4;4;4", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;1;3;3", "wc_summary_paper": "61;55;82;148", "wc_summary_review": "36;260;48;46", "wc_main_review": "483;49;331;238", "wc_review": "580;364;461;432", "wc_reply_reviewers": "0;0;105;0", 
"wc_reply_authors": "1340;626;271;69", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.0, 2.1213203435596424 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 86.5, 36.89512162874653 ], "wc_summary_review_avg": [ 97.5, 93.92949483522202 ], "wc_main_review_avg": [ 275.25, 157.19792460462065 ], "wc_review_avg": [ 459.25, 78.1004961571948 ], "wc_reply_reviewers_avg": [ 26.25, 45.46633369868303 ], "wc_reply_authors_avg": [ 576.5, 483.80497103688384 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8164965809277261, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9771677506162307722&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nkaba3ND7B5", "email": "stanford.edu;berkeley.edu;;mit.edu;;google.com;google.com", "author_num": 7, "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "Stanford University;University of California, Berkeley;Massachusetts Institute of Technology;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.stanford.edu;https://www.berkeley.edu;https://web.mit.edu;https://www.google.com", "aff_unique_abbr": "Stanford;UC Berkeley;MIT;Google", "aff_campus_unique_index": "0;1;3;3", "aff_campus_unique": "Stanford;Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Capturing Structural Locality in Non-parametric Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6544", "id": "nnU3IUMJmN", "poster": "", "openreview": "https://openreview.net/forum?id=nnU3IUMJmN", "slides": "https://iclr.cc/virtual/2022/poster/6544", "video": "https://iclr.cc/virtual/2022/poster/6544", "author_site": "Frank F Xu, Junxian He, Graham Neubig, Vincent Hellendoorn", "tldr": "", "abstract": "Structural locality is a ubiquitous feature of real-world datasets, wherein data points are organized into local hierarchies. Some examples include topical clusters in text or project hierarchies in source code repositories. In this paper, we explore utilizing this structural locality within non-parametric language models, which generate sequences that reference retrieved examples from an external source. We propose a simple yet effective approach for adding locality information into such models by adding learned parameters that improve the likelihood of retrieving examples from local neighborhoods. Experiments on two different domains, Java source code and Wikipedia text, demonstrate that locality features improve model efficacy over models without access to these features, with interesting differences. We also perform an analysis of how and where locality features contribute to improving performance and why the traditionally used contextual similarity metrics alone are not enough to grasp the locality structure.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/82559872eef8dd03d176c8d5ab48392be1b4929d.zip", "author": "Frank F. 
Xu;Junxian He;Graham Neubig;Vincent Josua Hellendoorn", "authorids": "~Frank_F._Xu1;~Junxian_He1;~Graham_Neubig1;~Vincent_Josua_Hellendoorn1", "gender": "M;M;M;M", "homepage": "https://frankxfz.me/;https://jxhe.github.io;http://phontron.com;http://vhellendoorn.github.io", "dblp": "190/4519;188/6127.html;03/8155;164/5751", "google_scholar": "1hXyfIkAAAAJ;BIFGeoUAAAAJ;wlosgkoAAAAJ;PfYrc5kAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Frank_F._Xu1;~Junxian_He1;~Graham_Neubig1;~Vincent_Josua_Hellendoorn1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2022capturing,\ntitle={Capturing Structural Locality in Non-parametric Language Models},\nauthor={Frank F. Xu and Junxian He and Graham Neubig and Vincent Josua Hellendoorn},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nnU3IUMJmN}\n}", "github": "", "project": "", "reviewers": "7oBt;bn3A;PnXs;gYwe", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;2;4;3", "correctness": "4;4;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;1;2;3", "wc_summary_paper": "26;168;96;75", "wc_summary_review": "49;19;22;3", "wc_main_review": "173;315;260;106", "wc_review": "248;502;378;184", "wc_reply_reviewers": "284;87;37;0", "wc_reply_authors": "1101;635;867;109", "reply_reviewers": "2;1;1;0", "reply_authors": "4;2;4;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 91.25, 51.07531204016281 ], "wc_summary_review_avg": [ 23.25, 16.528384676065595 ], "wc_main_review_avg": [ 213.5, 80.09525578959094 ], "wc_review_avg": [ 328.0, 122.38463955905577 ], "wc_reply_reviewers_avg": [ 102.0, 109.51940467332719 ], "wc_reply_authors_avg": [ 678.0, 367.5119045690901 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8938489530354720570&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nnU3IUMJmN", "email": "cmu.edu;cmu.edu;cmu.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Counterfactual Plans under Distributional Ambiguity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6806", "id": "noaG7SrPVK0", "poster": "", "openreview": "https://openreview.net/forum?id=noaG7SrPVK0", "slides": "https://iclr.cc/virtual/2022/poster/6806", "video": "https://iclr.cc/virtual/2022/poster/6806", "author_site": "Ngoc Bui, Duy Nguyen, Viet Anh Nguyen", "tldr": "", "abstract": "Counterfactual explanations are attracting significant attention due to the flourishing applications of machine learning models in consequential 
domains. A counterfactual plan consists of multiple possibilities to modify a given instance so that the model's prediction will be altered. As the predictive model can be updated subject to the future arrival of new data, a counterfactual plan may become ineffective or infeasible, with respect to the future values of the model parameters. In this work, we study the counterfactual plans under model uncertainty, in which the distribution of the model parameters is partially prescribed using only the first- and second-moment information. First, we propose an uncertainty quantification tool to compute the lower and upper bounds of the probability of feasibility for any given counterfactual plan. We then provide corrective methods to adjust the counterfactual plan to improve the feasibility measure. The numerical experiments validate our bounds and demonstrate that our correction increases the robustness of the counterfactual plans in different real-world datasets.", "keywords": "Counterfactual explanations;Robust optimization", "primary_area": "", "supplementary_material": "/attachment/b6f840ca34d2f311344041ff18b0e210ded1151e.zip", "author": "Ngoc Bui;Duy Nguyen;Viet Anh Nguyen", "authorids": "~Ngoc_Bui1;~Duy_Nguyen2;~Viet_Anh_Nguyen2", "gender": "M;M;M", "homepage": "http://ngocbh.github.io;https://duykhuongnguyen.github.io/;http://www.vietanhnguyen.net", "dblp": "312/6811;;", "google_scholar": ";y323M_cAAAAJ;3iyf-EoAAAAJ", "orcid": ";;", "linkedin": ";duy-nguyen-89272a17b/;", "or_profile": "~Ngoc_Bui1;~Duy_Nguyen2;~Viet_Anh_Nguyen2", "aff": "Hanoi University of Science and Technology;Hanoi University of Science and Technology;VinAI Research, Vietnam", "aff_domain": "hust.edu.vn;hust.edu.vn;vinai.io", "position": "MS student;Undergrad student;Research Scientist", "bibtex": "@inproceedings{\nbui2022counterfactual,\ntitle={Counterfactual Plans under Distributional Ambiguity},\nauthor={Ngoc Bui and Duy Nguyen and Viet Anh Nguyen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=noaG7SrPVK0}\n}", "github": "", "project": "", "reviewers": "A84V;pA7G;qAnf", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "3;3;4", "correctness": "3;4;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;3;3", "wc_summary_paper": "122;73;140", "wc_summary_review": "81;13;41", "wc_main_review": "624;188;1069", "wc_review": "827;274;1250", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1686;1383;1660", "reply_reviewers": "0;0;0", "reply_authors": "4;3;4", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 111.66666666666667, 28.311756490114764 ], "wc_summary_review_avg": [ 45.0, 27.9045993819418 ], "wc_main_review_avg": [ 627.0, 359.6729996353169 ], "wc_review_avg": [ 783.6666666666666, 399.6267703189509 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1576.3333333333333, 137.11876441813337 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.944911182523068, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 31, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=16318179024765381236&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=noaG7SrPVK0", "email": "hust.edu.vn;hust.edu.vn;vinai.io", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Hanoi University of Science and Technology;VinAI Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.hust.edu.vn;https://www.vin.ai", "aff_unique_abbr": "HUST;VinAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hanoi;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Vietnam" }, { "id": "np5BgCFSsbm", "title": "Neocortical cell type classification from electrophysiology recordings using deep neural networks", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Understanding the neural code requires identifying different functional units involved in the neural circuits. One way to identify these functional units is to solve a neuron type classification problem. For decades, current clamp electrophysiology recordings have provided the means to classify the neurons based on subtle differences in action potential shapes and spiking patterns. However, significant variations in neuronal type definitions, classification pipelines, and variability in the neuronal activities make unambiguous determination of neuron type challenging. Previous solutions to this electrophysiology-based cell type classification problem consisted of dimensionality reduction juxtaposed with clustering using hand-crafted action potential features. Recent discoveries have allowed genetic-based cell-type classifications, which have fewer ambiguities, but they are less practical in vivo and have even lower throughput. Leveraging the unprecedented ground truth data published in the Allen Institute Cell Types Database, which contains anatomical, genetic, and electrophysiology characterizations of neurons in the mouse neocortex, we construct a robust and efficient convolutional neural network (CNN) that successfully classifies neurons according to their genetic label or broad type (excitatory or inhibitory) solely using current-clamp electrophysiology recordings. The CNN is configured as a multiple-input single-output network consisting of three subnetworks that take in the raw time series electrophysiology recording as well as the real and imaginary components of its Fourier coefficients. Our single pipeline method is fast and streamlined while simultaneously outperforming previous methods and achieving more classification classes using only single current-clamp trace as the input. 
This end-to-end convolutional neural network-based classification method removes the need for hand-crafted features, specific knowledge, or human intervention for quick identification of the cell type with high accuracy, enabling interpretation of the experimental data in a bias-free manner and a much broader scientific context.", "keywords": "neuron type classification;convolutional neural network;electrophysiology", "primary_area": "", "supplementary_material": "", "author": "Raymond Wang;Sang Min Han;Marta Agnieszka Gajowa;Chunlei Liu", "authorids": "~Raymond_Wang1;~Sang_Min_Han1;~Marta_Agnieszka_Gajowa1;~Chunlei_Liu1", "gender": ";M;F;M", "homepage": ";https://people.eecs.berkeley.edu/~smhan/;;https://www2.eecs.berkeley.edu/Faculty/Homepages/chunleiliu.html/", "dblp": ";;;", "google_scholar": ";UNOStlYAAAAJ;;YnyRkG0AAAAJ", "orcid": ";0000-0001-8841-0220;0000-0002-6399-6883;", "linkedin": "raymond-w2/;smhan/;martagajowa/;", "or_profile": "~Raymond_Wang1;~Sang_Min_Han1;~Marta_Agnieszka_Gajowa1;~Chunlei_Liu1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "position": "Undergrad student;Postdoc;Postdoc;Professor", "bibtex": "@misc{\nwang2022neocortical,\ntitle={Neocortical cell type classification from electrophysiology recordings using deep neural networks},\nauthor={Raymond Wang and Sang Min Han and Marta Agnieszka Gajowa and Chunlei Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=np5BgCFSsbm}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=np5BgCFSsbm", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=979824368026087180&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online Continual Learning on Class Incremental Blurry Task Configuration with Anytime Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6797", "id": "nrGGfMbY_qK", "poster": "", "openreview": "https://openreview.net/forum?id=nrGGfMbY_qK", "slides": "https://iclr.cc/virtual/2022/poster/6797", "video": "https://iclr.cc/virtual/2022/poster/6797", "author_site": "Hyunseo Koh, Dahyun Kim, Jung-Woo Ha, Jonghyun Choi", "tldr": "", "abstract": 
"Despite rapid advances in continual learning, a large body of research is devoted to improving performance in the existing setups.\nWhile a handful of work do propose new continual learning setups, they still lack practicality in certain aspects.\nFor better practicality, we first propose a novel continual learning setup that is online, task-free, class-incremental, of blurry task boundaries and subject to inference queries at any moment.\nWe additionally propose a new metric to better measure the performance of the continual learning methods subject to inference queries at any moment.\nTo address the challenging setup and evaluation protocol, we propose an effective method that employs a new memory management scheme and novel learning techniques.\nOur empirical validation demonstrates that the proposed method outperforms prior arts by large margins. Code and data splits are available at https://github.com/naver-ai/i-Blurry.", "keywords": "online;continual learning;task-free continual learning;any-time inference", "primary_area": "", "supplementary_material": "/attachment/82e03b6d6fd59d09c337e250793d406fb48d085c.zip", "author": "Hyunseo Koh;Dahyun Kim;Jung-Woo Ha;Jonghyun Choi", "authorids": "~Hyunseo_Koh1;~Dahyun_Kim1;~Jung-Woo_Ha1;~Jonghyun_Choi1", "gender": "M;M;M;M", "homepage": ";;https://aidljwha.wordpress.com/;https://ppolon.github.io/", "dblp": "304/4369;196/7883;66/867-1;21/11103", "google_scholar": "Mi4cMxgAAAAJ;atD6Rs4AAAAJ;https://scholar.google.co.kr/citations?user=eGj3ay4AAAAJ;uiGWnm4AAAAJ", "orcid": "0000-0002-2576-1581;;0000-0002-7400-7681;0000-0002-7934-8434", "linkedin": "%ED%98%84%EC%84%9C-%EA%B3%A0-66298a221/;dahyun-kim-0a1711163;jung-woo-ha-b2782862?trk=hp-identity-name;jonghyun-choi-459bb615/", "or_profile": "~Hyunseo_Koh1;~Dahyun_Kim1;~Jung-Woo_Ha1;~Jonghyun_Choi1", "aff": "Gwangju Institute of Science and Technology;Gwangju Institute of Science and Technology;NAVER AI Lab;NAVER", "aff_domain": "gist.ac.kr;gist.ac.kr;navercorp.com;navercorp.com", "position": "PhD student;MS student;Head (Executive Director);AI Advisor Committee", "bibtex": "@inproceedings{\nkoh2022online,\ntitle={Online Continual Learning on Class Incremental Blurry Task Configuration with Anytime Inference},\nauthor={Hyunseo Koh and Dahyun Kim and Jung-Woo Ha and Jonghyun Choi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nrGGfMbY_qK}\n}", "github": "", "project": "", "reviewers": "QQWa;Vfw2;6BAu;F4rS", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;3;3;4", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "50;71;243;69", "wc_summary_review": "28;17;209;148", "wc_main_review": "191;243;439;335", "wc_review": "269;331;891;552", "wc_reply_reviewers": "1017;8;0;138", "wc_reply_authors": "2419;713;1897;882", "reply_reviewers": "4;1;0;3", "reply_authors": "7;2;4;3", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 108.25, 78.22843153227603 ], "wc_summary_review_avg": [ 100.5, 81.0200592446093 ], "wc_main_review_avg": [ 302.0, 94.41927769264072 ], "wc_review_avg": [ 510.75, 243.4362082764189 ], "wc_reply_reviewers_avg": [ 290.75, 422.86367484095865 ], "wc_reply_authors_avg": [ 1477.75, 707.3688482680022 ], "reply_reviewers_avg": [ 2.0, 1.5811388300841898 ], 
"reply_authors_avg": [ 4.0, 1.8708286933869707 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3665083330689157, "corr_recommendation_correctness": 0.5183210553488161, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5710319088637309523&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nrGGfMbY_qK", "email": "gist.ac.kr;gist.ac.kr;navercorp.com;navercorp.com", "author_num": 4, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Gwangju Institute of Science and Technology;NAVER Corporation", "aff_unique_dep": ";NAVER AI Lab", "aff_unique_url": "https://www.gist.ac.kr;https://www.naver.com", "aff_unique_abbr": "GIST;NAVER", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Gwangju;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "nsjkNB2oKsQ", "title": "Off-Policy Reinforcement Learning with Delayed Rewards", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study deep reinforcement learning (RL) algorithms with delayed rewards. In many real-world tasks, instant rewards are often not readily accessible or even defined immediately after the agent performs actions. In this work, we first formally define the environment with delayed rewards and discuss the challenges raised due to the non-Markovian nature of such environments. Then, we introduce a general off-policy RL framework with a new $Q$-function formulation that can handle the delayed rewards with theoretical convergence guarantees. For practical tasks with high dimensional state spaces, we further introduce the HC-decomposition rule of the $Q$-function in our framework which naturally leads to an approximation scheme that helps boost the training efficiency and stability. 
We finally conduct extensive experiments to demonstrate the superior performance of our algorithms over existing works and their variants.", "keywords": "Delayed Rewards;Off-Policy Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/c0f4e9fdd7e9929a439c23e8fda8aa1da5a7698d.zip", "author": "Beining Han;Zhizhou Ren;Zuofan Wu;Yuan Zhou;Jian Peng", "authorids": "~Beining_Han1;~Zhizhou_Ren1;~Zuofan_Wu1;~Yuan_Zhou1;~Jian_Peng1", "gender": "M;M;M;M;M", "homepage": ";;;http://yuanz.web.illinois.edu;http://jianpeng.web.engr.illinois.edu/", "dblp": "266/7819;https://dblp.uni-trier.de/pid/239/5714.html;;40/7018;29/4181-1", "google_scholar": "LVjU7xIAAAAJ;xgpMeDgAAAAJ;;https://scholar.google.com.tw/citations?user=aR34e1gAAAAJ;https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ", "orcid": ";;;;", "linkedin": "%E8%B4%9D%E5%AE%81-%E9%9F%A9-b79204207/details/experience/;;zuofan-wu-b08398213/;;", "or_profile": "~Beining_Han1;~Zhizhou_Ren1;~Zuofan_Wu1;~Yuan_Zhou1;~Jian_Peng1", "aff": "IIIS, Tsinghua University;University of Illinois, Urbana Champaign;Helixon Research;;University of Illinois, Urbana Champaign", "aff_domain": "mails.tsinghua.edu.cn;illinois.edu;helixon.com;;illinois.edu", "position": "Undergrad student;PhD student;Researcher;;Assistant Professor", "bibtex": "@misc{\nhan2022offpolicy,\ntitle={Off-Policy Reinforcement Learning with Delayed Rewards},\nauthor={Beining Han and Zhizhou Ren and Zuofan Wu and Yuan Zhou and Jian Peng},\nyear={2022},\nurl={https://openreview.net/forum?id=nsjkNB2oKsQ}\n}", "github": "", "project": "", "reviewers": "fukM;oufu;mJkz;6XaH", "site": "https://openreview.net/forum?id=nsjkNB2oKsQ", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "49;107;43;148", "wc_summary_review": "98;49;12;53", "wc_main_review": "415;390;551;248", "wc_review": "562;546;606;449", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 86.75, 43.30343519860751 ], "wc_summary_review_avg": [ 53.0, 30.504098085339287 ], "wc_main_review_avg": [ 401.0, 107.50116278440899 ], "wc_review_avg": [ 540.75, 57.34707926302786 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10466467384922349914&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Tsinghua University;University of Illinois Urbana-Champaign;Helixon Research", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://illinois.edu;", "aff_unique_abbr": "THU;UIUC;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "id": "nuWpS9FNSKn", "title": "One Objective for All Models --- Self-supervised Learning for Topic Models",
"track": "main", "status": "Reject", "tldr": "", "abstract": "Self-supervised learning has significantly improved the performance of many NLP tasks. In this paper, we highlight a key advantage of self-supervised learning - when applied to data generated by topic models, self-supervised learning can be oblivious to the specific model, and hence is less susceptible to model mis-specification. In particular, we prove that commonly used self-supervised objectives based on reconstruction or contrastive samples can both recover useful posterior information for general topic models. Empirically, we show that the same objectives can perform competitively against posterior inference using the correct model, while outperforming posterior inference using mis-specified model.", "keywords": "self-supervised learning;topic models", "primary_area": "", "supplementary_material": "/attachment/710b11709437dc6bc94513f39147810fdf0e2116.zip", "author": "Zeping Luo;Cindy Weng;Shiyou Wu;Mo Zhou;Rong Ge", "authorids": "~Zeping_Luo1;~Cindy_Weng1;~Shiyou_Wu1;~Mo_Zhou3;~Rong_Ge1", "gender": "M;;;M;M", "homepage": ";https://github.com/wengcindy;;https://mozhou7.github.io/;https://users.cs.duke.edu/~rongge/", "dblp": ";;;;89/6869-1.html", "google_scholar": ";;;j_SEFF8AAAAJ;https://scholar.google.com.tw/citations?user=MVxcjEoAAAAJ", "orcid": ";;;;", "linkedin": "zeping-luo-a78b28178/;;shiyou-tony-wu/;;", "or_profile": "~Zeping_Luo1;~Cindy_Weng1;~Shiyou_Wu1;~Mo_Zhou3;~Rong_Ge1", "aff": "Duke University;Duke University;Duke University;Duke University;Duke University", "aff_domain": "duke.edu;duke.edu;duke.edu;duke.edu;duke.edu", "position": "Undergrad student;Undergrad student;Undergrad student;PhD student;Assistant Professor", "bibtex": "@misc{\nluo2022one,\ntitle={One Objective for All Models --- Self-supervised Learning for Topic Models},\nauthor={Zeping Luo and Cindy Weng and Shiyou Wu and Mo Zhou and Rong Ge},\nyear={2022},\nurl={https://openreview.net/forum?id=nuWpS9FNSKn}\n}", "github": "", "project": "", "reviewers": "KozZ;b3P7;HZxy;HibN", "site": "https://openreview.net/forum?id=nuWpS9FNSKn", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;3;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "418;61;70;128", "wc_summary_review": "79;31;45;29", "wc_main_review": "1077;271;310;372", "wc_review": "1574;363;425;529", "wc_reply_reviewers": "1543;78;0;731", "wc_reply_authors": "1889;276;365;1010", "reply_reviewers": "4;1;0;4", "reply_authors": "5;2;1;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 169.25, 145.8995801913083 ], "wc_summary_review_avg": [ 46.0, 20.024984394500787 ], "wc_main_review_avg": [ 507.5, 330.76766770650363 ], "wc_review_avg": [ 722.75, 495.0355416533241 ], "wc_reply_reviewers_avg": [ 588.0, 620.1447411693499 ], "wc_reply_authors_avg": [ 885.0, 645.1592826581665 ], "reply_reviewers_avg": [ 2.25, 1.7853571071357126 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5706061759533658493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Duke 
University", "aff_unique_dep": "", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Diverse Client Selection for Federated Learning via Submodular Maximization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7047", "id": "nwKXyFvaUm", "poster": "", "openreview": "https://openreview.net/forum?id=nwKXyFvaUm", "slides": "https://iclr.cc/virtual/2022/poster/7047", "video": "https://iclr.cc/virtual/2022/poster/7047", "author_site": "Ravikumar Balakrishnan, Tian Li, Tianyi Zhou, Nageen Himayat, Virginia Smith, Jeff Bilmes", "tldr": "", "abstract": "In every communication round of federated learning, a random subset of clients communicate their model updates back to the server which then aggregates them all. The optimal size of this subset is not known and several studies have shown that typically random selection does not perform very well in terms of convergence, learning efficiency and fairness. We, in this paper, propose to select a small diverse subset of clients, namely those carrying representative gradient information, and we transmit only these updates to the server. Our aim is for updating via only a subset to approximate updating via aggregating all client information. We achieve this by choosing a subset that maximizes a submodular facility location function defined over gradient space. We introduce \u201cfederated averaging with diverse client selection (DivFL)\u201d. We provide a thorough analysis of its convergence in the heterogeneous setting and apply it both to synthetic and to real datasets. Empirical results show several benefits to our approach including improved learning efficiency, faster convergence and also more uniform (i.e., fair) performance across clients. 
We further show a communication-efficient version of DivFL that can still outperform baselines on the above metrics.", "keywords": "federated learning;submodularity;diversity", "primary_area": "", "supplementary_material": "", "author": "Ravikumar Balakrishnan;Tian Li;Tianyi Zhou;Nageen Himayat;Virginia Smith;Jeff Bilmes", "authorids": "~Ravikumar_Balakrishnan1;~Tian_Li1;~Tianyi_Zhou1;nageen.himayat@intel.com;~Virginia_Smith1;~Jeff_Bilmes1", "gender": "M;;M;;F;M", "homepage": ";https://litian96.github.io/;https://tianyizhou.github.io/;;;http://melodi.ee.washington.edu/people/bilmes", "dblp": ";91/7844-5;88/8205-1;;120/0921;b/JeffABilmes", "google_scholar": "zVvdiTsAAAAJ;https://scholar.google.com/citations?hl=en;OKvgizMAAAAJ;;;L9QufAsAAAAJ", "orcid": ";;0000-0001-5348-0632;;;0000-0002-7372-8778", "linkedin": "ravikumar-balakrishnan-05193819/;;tianyizhou;;;jbilmes/", "or_profile": "~Ravikumar_Balakrishnan1;~Tian_Li1;~Tianyi_Zhou1;nageen.himayat@intel.com;~Virginia_Smith1;~Jeff_Bilmes1", "aff": ";Carnegie Mellon University;University of Washington, Seattle;;Carnegie Mellon University;University of Washington, Seattle", "aff_domain": ";cmu.edu;uw.edu;;cmu.edu;uw.edu", "position": ";PhD student;PhD student;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nbalakrishnan2022diverse,\ntitle={Diverse Client Selection for Federated Learning via Submodular Maximization},\nauthor={Ravikumar Balakrishnan and Tian Li and Tianyi Zhou and Nageen Himayat and Virginia Smith and Jeff Bilmes},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nwKXyFvaUm}\n}", "github": "", "project": "", "reviewers": "RXZV;2aPT;KjQr;CFhn", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;2;3;3", "correctness": "2;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "51;219;45;104", "wc_summary_review": "40;63;47;67", "wc_main_review": "222;266;38;364", "wc_review": "313;548;130;535", "wc_reply_reviewers": "51;0;0;39", "wc_reply_authors": "846;391;188;178", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 104.75, 69.84402265047454 ], "wc_summary_review_avg": [ 54.25, 11.121488209767612 ], "wc_main_review_avg": [ 222.5, 118.27404618089295 ], "wc_review_avg": [ 381.5, 172.6477628004487 ], "wc_reply_reviewers_avg": [ 22.5, 22.89650628371062 ], "wc_reply_authors_avg": [ 400.75, 270.75023084016016 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": 0.5488604301969737, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16082870955349610333&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nwKXyFvaUm", "email": ";cmu.edu;uw.edu;;cmu.edu;uw.edu", "author_num": 6, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Carnegie Mellon University;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.washington.edu", "aff_unique_abbr": "CMU;UW", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": 
"0;0;0;0", "aff_country_unique": "United States" }, { "title": "Zero Pixel Directional Boundary by Vector Transform", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6022", "id": "nxcABL7jbQh", "poster": "", "openreview": "https://openreview.net/forum?id=nxcABL7jbQh", "slides": "https://iclr.cc/virtual/2022/poster/6022", "video": "https://iclr.cc/virtual/2022/poster/6022", "author_site": "Edoardo Mello Rella, Ajad Chhatkuli, Yun Liu, Ender Konukoglu, Luc Van Gool", "tldr": "", "abstract": "Boundaries or contours are among the primary visual cues used by human and computer vision systems. One of the key problems in boundary detection is the loss formulation, which typically leads to class imbalance and, as a consequence, to thick boundaries which require non-differential post-processing steps to be thinned.\nIn this paper, we re-interpret boundaries as 1-D surfaces and formulate a one-to-one vector transform function that allows for training of boundary prediction completely avoiding the class imbalance issue. Specifically, we define the boundary representation at any point as the unit vector pointing to the closest boundary surface.\nOur problem formulation leads to the estimation of direction as well as richer contextual information of the boundary, and, if desired, the availability of zero-pixel thin boundaries also at training time. Our method uses no hyper-parameter in the training loss and a fixed stable hyper-parameter at inference. We provide theoretical justification/discussions of the vector transform representation. We evaluate the proposed loss method using a standard architecture and show the excellent performance over other losses and representations on several datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Edoardo Mello Rella;Ajad Chhatkuli;Yun Liu;Ender Konukoglu;Luc Van Gool", "authorids": "~Edoardo_Mello_Rella1;~Ajad_Chhatkuli1;~Yun_Liu1;~Ender_Konukoglu1;~Luc_Van_Gool1", "gender": "M;M;M;;", "homepage": ";https://ajadchhatkuli.github.io;https://yun-liu.github.io/;http://www.vision.ee.ethz.ch/~kender;", "dblp": "272/1665;149/7655;50/2482-11;45/7041;61/5017", "google_scholar": "jNPBzzQAAAAJ;3BHMHU4AAAAJ;https://scholar.google.com.hk/citations?user=UB3doCoAAAAJ;https://scholar.google.ch/citations?user=OeEMrhQAAAAJ;https://scholar.google.be/citations?user=TwMib_QAAAAJ", "orcid": ";0000-0003-2051-2209;0000-0001-6143-0264;;", "linkedin": "edoardo-mello-rella-80914115b/;;yun-liu-082989145/;;", "or_profile": "~Edoardo_Mello_Rella1;~Ajad_Chhatkuli1;~Yun_Liu1;~Ender_Konukoglu1;~Luc_Van_Gool1", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;ETHZ - ETH Zurich;Swiss Federal Institute of Technology;KU Leuven", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch;kuleuven.be", "position": "PhD student;Postdoc;Postdoc;Assistant Professor;Emeritus", "bibtex": "@inproceedings{\nrella2022zero,\ntitle={Zero Pixel Directional Boundary by Vector Transform},\nauthor={Edoardo Mello Rella and Ajad Chhatkuli and Yun Liu and Ender Konukoglu and Luc Van Gool},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nxcABL7jbQh}\n}", "github": "", "project": "", "reviewers": "5b3T;GvzB;CywP", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;2;4", "wc_summary_paper": "110;116;43", "wc_summary_review": "24;55;26", "wc_main_review": 
"119;398;49", "wc_review": "253;569;118", "wc_reply_reviewers": "0;106;0", "wc_reply_authors": "441;1111;143", "reply_reviewers": "0;1;0", "reply_authors": "1;3;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 89.66666666666667, 33.089105289942324 ], "wc_summary_review_avg": [ 35.0, 14.165686240583852 ], "wc_main_review_avg": [ 188.66666666666666, 150.75439923560734 ], "wc_review_avg": [ 313.3333333333333, 188.99794237563069 ], "wc_reply_reviewers_avg": [ 35.333333333333336, 49.968879203849355 ], "wc_reply_authors_avg": [ 565.0, 404.79459811942485 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4154866420989883552&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nxcABL7jbQh", "email": "ethz.ch;ethz.ch;ethz.ch;ethz.ch;kuleuven.be", "author_num": 5, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich;Katholieke Universiteit Leuven", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch;https://www.kuleuven.be", "aff_unique_abbr": "ETH Zurich;ETHZ;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "Switzerland;Belgium" }, { "id": "nzqZufLU1v", "title": "Lidar Range Image Compression with Deep Delta Encoding", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Lidars are widely used in applications such as autonomous driving and augmented reality. However, the large volume of data produced by lidars can lead to high cost in data storage and transmission. Besides rising standards in point cloud compression from the MPEG (G-PCC and V-PCC), recent works also explore using deep networks to improve the compression rates. However, most prior work focus on the generic point cloud representation, neglecting the spatial patterns of the points from lidar range images. In this work, we leverage the range image representation and propose a novel deep delta encoding model to compress lidar data. Our deep model takes in local range image patches and predicts the next pixel value in a raster-scanning manner. The residuals between the prediction and the original value can be entropy encoded to achieve lossless compression under certain quantization rates. Evaluated on the Waymo Open Dataset and KITTI, our method demonstrates significant improvement compared to widely algorithms as well as recent deep methods based on the point cloud representation, in both the point cloud reconstruction quality and the downstream perception model performance. ", "keywords": "lidar;point cloud;range image;compression;delta encoding", "primary_area": "", "supplementary_material": "", "author": "Xuanyu Zhou;Charles R. 
Qi;Yin Zhou;Dragomir Anguelov", "authorids": "xuanyuzhou@waymo.com;~Charles_R._Qi1;~Yin_Zhou1;~Dragomir_Anguelov1", "gender": ";;M;M", "homepage": ";;;", "dblp": ";;;a/DragomirAnguelov", "google_scholar": ";;https://scholar.google.com/citations?scilu=9351241097416630746:0,18260587605580260227:0;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;;dragomiranguelov/", "or_profile": "xuanyuzhou@waymo.com;~Charles_R._Qi1;~Yin_Zhou1;~Dragomir_Anguelov1", "aff": ";;Waymo;Waymo", "aff_domain": ";;waymo.com;waymo.com", "position": ";;Researcher;Researcher", "bibtex": "@misc{\nzhou2022lidar,\ntitle={Lidar Range Image Compression with Deep Delta Encoding},\nauthor={Xuanyu Zhou and Charles R. Qi and Yin Zhou and Dragomir Anguelov},\nyear={2022},\nurl={https://openreview.net/forum?id=nzqZufLU1v}\n}", "github": "", "project": "", "reviewers": "4aD5;7cy5;4PwJ", "site": "https://openreview.net/forum?id=nzqZufLU1v", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;5;2", "correctness": "2;3;2", "technical_novelty": "2;1;2", "empirical_novelty": "1;1;2", "wc_summary_paper": "38;54;120", "wc_summary_review": "65;19;22", "wc_main_review": "680;222;190", "wc_review": "783;295;332", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 70.66666666666667, 35.490217744549774 ], "wc_summary_review_avg": [ 35.333333333333336, 21.01322334996598 ], "wc_main_review_avg": [ 364.0, 223.82731438916625 ], "wc_review_avg": [ 470.0, 221.83928116243675 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BuodndRg8PYJ:scholar.google.com/&scioq=Lidar+Range+Image+Compression+with+Deep+Delta+Encoding&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Waymo", "aff_unique_dep": "", "aff_unique_url": "https://www.waymo.com", "aff_unique_abbr": "Waymo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On Incorporating Inductive Biases into VAEs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6009", "id": "nzvbBD_3J-g", "poster": "", "openreview": "https://openreview.net/forum?id=nzvbBD_3J-g", "slides": "https://iclr.cc/virtual/2022/poster/6009", "video": "https://iclr.cc/virtual/2022/poster/6009", "author_site": "Ning Miao, Emile Mathieu, Siddharth N, Yee Whye Teh, Tom Rainforth", "tldr": "", "abstract": "We explain why directly changing the prior can be a surprisingly ineffective mechanism for incorporating inductive biases into variational auto-encoders (VAEs), and introduce a simple and effective alternative approach: Intermediary Latent Space VAEs (InteL-VAEs). 
InteL-VAEs use an intermediary set of latent variables to control the stochasticity of the encoding process, before mapping these in turn to the latent representation using a parametric function that encapsulates our desired inductive bias(es). This allows us to impose properties like sparsity or clustering on learned representations, and incorporate human knowledge into the generative model. Whereas changing the prior only indirectly encourages behavior through regularizing the encoder, InteL-VAEs are able to directly enforce desired characteristics. Moreover, they bypass the computation and encoder design issues caused by non-Gaussian priors, while allowing for additional flexibility through training of the parametric mapping function. We show that these advantages, in turn, lead to both better generative models and better representations being learned.", "keywords": "VAEs;Variational autoencoders;Variational auto-encoders;Representation learning;Inductive biases", "primary_area": "", "supplementary_material": "/attachment/3a16bf226f62e6f590d16b3c51718a223aff218f.zip", "author": "Ning Miao;Emile Mathieu;Siddharth N;Yee Whye Teh;Tom Rainforth", "authorids": "~Ning_Miao3;~Emile_Mathieu1;~Siddharth_N1;~Yee_Whye_Teh2;~Tom_Rainforth1", "gender": "M;M;M;M;M", "homepage": "http://www.ningmiao.space/;http://emilemathieu.fr;https://homepages.inf.ed.ac.uk/snaraya3/;http://www.robots.ox.ac.uk/~twgr;http://csml.stats.ox.ac.uk/people/teh/", "dblp": "230/7777;223/6084.html;67/8366;166/1198;88/2483", "google_scholar": ";g9BjTqgAAAAJ;V7D7hxMAAAAJ;https://scholar.google.co.uk/citations?user=ieLRNKMAAAAJ;https://scholar.google.co.uk/citations?user=y-nUzMwAAAAJ", "orcid": ";;0000-0003-4911-7333;;", "linkedin": ";;;;", "or_profile": "~Ning_Miao3;~Emile_Mathieu1;~Siddharth_N1;~Tom_Rainforth1;~Yee_Whye_Teh1", "aff": ";Oxford, University of Oxford;University of Edinburgh;;University of Oxford", "aff_domain": ";stats.ox.ac.uk;ed.ac.uk;ox.ac.uk;ox.ac.uk", "position": ";Postdoc;Reader (Associate Professor);Postdoc;Full Professor", "bibtex": "@inproceedings{\nmiao2022on,\ntitle={On Incorporating Inductive Biases into {VAE}s},\nauthor={Ning Miao and Emile Mathieu and Siddharth N and Yee Whye Teh and Tom Rainforth},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=nzvbBD_3J-g}\n}", "github": "", "project": "", "reviewers": "pbbP;NaAZ;9rm9;tE7T", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;4;4;4", "correctness": "4;2;4;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "102;63;45;115", "wc_summary_review": "54;118;25;65", "wc_main_review": "215;412;264;557", "wc_review": "371;593;334;737", "wc_reply_reviewers": "54;133;606;161", "wc_reply_authors": "642;1194;1148;351", "reply_reviewers": "1;1;2;1", "reply_authors": "2;3;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.25, 28.358199872347328 ], "wc_summary_review_avg": [ 65.5, 33.64892271678248 ], "wc_main_review_avg": [ 362.0, 133.91975209057102 ], "wc_review_avg": [ 508.75, 164.85504996814626 ], "wc_reply_reviewers_avg": [ 238.5, 215.7736082100867 ], "wc_reply_authors_avg": [ 833.75, 352.9691027554678 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], 
"replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15494277357139593439&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=nzvbBD_3J-g", "email": ";stats.ox.ac.uk;ed.ac.uk;ox.ac.uk;ox.ac.uk", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Oxford;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.ed.ac.uk", "aff_unique_abbr": "Oxford;Edinburgh", "aff_campus_unique_index": "0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Bridging the Gap: Providing Post-Hoc Symbolic Explanations for Sequential Decision-Making Problems with Inscrutable Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7046", "id": "o-1v9hdSult", "poster": "", "openreview": "https://openreview.net/forum?id=o-1v9hdSult", "slides": "https://iclr.cc/virtual/2022/poster/7046", "video": "https://iclr.cc/virtual/2022/poster/7046", "author_site": "Sarath Sreedharan, Utkarsh Soni, Mudit Verma, Siddharth Srivastava, Subbarao Kambhampati", "tldr": "", "abstract": "As increasingly complex AI systems are introduced into our daily lives, it becomes important for such systems to be capable of explaining the rationale for their decisions and allowing users to contest these decisions. A significant hurdle to allowing for such explanatory dialogue could be the {\\em vocabulary mismatch} between the user and the AI system. This paper introduces methods for providing contrastive explanations in terms of user-specified concepts for sequential decision-making settings where the system's model of the task may be best represented as an inscrutable model. We do this by building partial symbolic models of a local approximation of the task that can be leveraged to answer the user queries. 
We test these methods on a popular Atari game (Montezuma's Revenge) and variants of Sokoban (a well-known planning benchmark) and report the results of user studies to evaluate whether people find explanations generated in this form useful.", "keywords": "Explanations;XAI;Post-hoc explanations", "primary_area": "", "supplementary_material": "/attachment/cadc687dfa3160fd9f9298c433a8db107c10c110.zip", "author": "Sarath Sreedharan;Utkarsh Soni;Mudit Verma;Siddharth Srivastava;Subbarao Kambhampati", "authorids": "~Sarath_Sreedharan1;~Utkarsh_Soni1;~Mudit_Verma2;~Siddharth_Srivastava2;~Subbarao_Kambhampati1", "gender": ";M;M;;M", "homepage": ";https://usoni1.github.io/;https://famishedrover.github.io/;;http://rakaposhi.eas.asu.edu", "dblp": "162/5110;;192/7474;;k/SKambhampati", "google_scholar": ";3Nqzr90AAAAJ;8TtypKwAAAAJ;;yl3L07sAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Sarath_Sreedharan1;~Utkarsh_Soni1;~Mudit_Verma2;~Siddharth_Srivastava2;~Subbarao_Kambhampati1", "aff": "Arizona State University;;Arizona State University;;Arizona State University", "aff_domain": "asu.edu;;asu.edu;;asu.edu", "position": "PhD student;;PhD student;;Full Professor", "bibtex": "@inproceedings{\nsreedharan2022bridging,\ntitle={Bridging the Gap: Providing Post-Hoc Symbolic Explanations for Sequential Decision-Making Problems with Inscrutable Representations},\nauthor={Sarath Sreedharan and Utkarsh Soni and Mudit Verma and Siddharth Srivastava and Subbarao Kambhampati},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=o-1v9hdSult}\n}", "github": "", "project": "", "reviewers": "ike6;wvCE;mLyQ;C3CZ", "pdf_size": 0, "recommendation": "5;6;8;10", "confidence": "4;2;4;4", "correctness": "2;4;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;3;3;4", "wc_summary_paper": "259;40;165;204", "wc_summary_review": "87;33;51;139", "wc_main_review": "960;511;244;477", "wc_review": "1306;584;460;820", "wc_reply_reviewers": "23;0;0;0", "wc_reply_authors": "581;253;105;456", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.25, 1.920286436967152 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 167.0, 80.56984547583544 ], "wc_summary_review_avg": [ 77.5, 40.48147724577254 ], "wc_main_review_avg": [ 548.0, 259.11869866916203 ], "wc_review_avg": [ 792.5, 323.4451267216744 ], "wc_reply_reviewers_avg": [ 5.75, 9.959292143521045 ], "wc_reply_authors_avg": [ 348.75, 183.04695435871093 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3758230140014144, "corr_recommendation_correctness": 0.5888015039841447, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6902879575183490899&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=o-1v9hdSult", "email": "asu.edu;;asu.edu;;asu.edu", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Know Thyself: Transferable Visual 
Control Policies Through Robot-Awareness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6041", "id": "o0ehFykKVtr", "poster": "", "openreview": "https://openreview.net/forum?id=o0ehFykKVtr", "slides": "https://iclr.cc/virtual/2022/poster/6041", "video": "https://iclr.cc/virtual/2022/poster/6041", "author_site": "Edward Hu, Kun Huang, Oleh Rybkin, Dinesh Jayaraman", "tldr": "", "abstract": "Training visual control policies from scratch on a new robot typically requires generating large amounts of robot-specific data. How might we leverage data previously collected on another robot to reduce or even completely remove this need for robot-specific data? We propose a \"robot-aware control\" paradigm that achieves this by exploiting readily available knowledge about the robot. We then instantiate this in a robot-aware model-based RL policy by training modular dynamics models that couple a transferable, robot-aware world dynamics module with a robot-specific, potentially analytical, robot dynamics module. This also enables us to set up visual planning costs that separately consider the robot agent and the world. Our experiments on tabletop manipulation tasks with simulated and real robots demonstrate that these plug-in improvements dramatically boost the transferability of visual model-based RL policies, even permitting zero-shot transfer of visual manipulation skills onto new robots. Project website: https://www.seas.upenn.edu/~hued/rac", "keywords": "visual foresight;dynamics models;visuomotor control;video prediction;planning;transfer", "primary_area": "", "supplementary_material": "", "author": "Edward S. Hu;Kun Huang;Oleh Rybkin;Dinesh Jayaraman", "authorids": "~Edward_S._Hu1;~Kun_Huang6;~Oleh_Rybkin1;~Dinesh_Jayaraman2", "gender": "M;M;M;M", "homepage": ";http://olehrybkin.com/;https://www.seas.upenn.edu/~dineshj/;https://www.edwardshu.com", "dblp": ";217/2946;145/3870;245/4627", "google_scholar": "RYLugBwAAAAJ;https://scholar.google.com/citations?view_op=list_works;QxLpghAAAAAJ;", "orcid": ";0000-0002-5898-006X;0000-0002-6888-3095;", "linkedin": "kun-huang-620034171/;oleh-rybkin/;dinesh-jayaraman-44b31539/;", "or_profile": "~Kun_Huang6;~Oleh_Rybkin1;~Dinesh_Jayaraman2;~Edward_Shichao_Hu1", "aff": "School of Engineering and Applied Science, University of Pennsylvania;Google DeepMind;University of Pennsylvania;University of Pennsylvania", "aff_domain": "seas.upenn.edu;deepmind.com;upenn.edu;upenn.edu", "position": "MS student;Intern;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nhu2022know,\ntitle={Know Thyself: Transferable Visual Control Policies Through Robot-Awareness},\nauthor={Edward S. 
Hu and Kun Huang and Oleh Rybkin and Dinesh Jayaraman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=o0ehFykKVtr}\n}", "github": "", "project": "", "reviewers": "2sNR;U6rV;BJUg;rNxF", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "198;58;54;172", "wc_summary_review": "147;71;12;104", "wc_main_review": "358;251;174;506", "wc_review": "703;380;240;782", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 120.5, 65.1670929227321 ], "wc_summary_review_avg": [ 83.5, 49.29756586282937 ], "wc_main_review_avg": [ 322.25, 124.5960974509234 ], "wc_review_avg": [ 526.25, 223.59380022710826 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12842278673686640517&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=o0ehFykKVtr", "email": "seas.upenn.edu;deepmind.com;upenn.edu;upenn.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Pennsylvania;Google", "aff_unique_dep": "School of Engineering and Applied Science;Google DeepMind", "aff_unique_url": "https://www.upenn.edu;https://deepmind.com", "aff_unique_abbr": "UPenn;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "o1FEqIONNAa", "title": "Rank4Class: Examining Multiclass Classification through the Lens of Learning to Rank", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multiclass classification (MCC) is a classical machine learning problem which aims to classify each instance into one of a predefined set of classes. Given an instance, a classification model computes a score for each class, all of which are then used to sort the classes. The performance of a classification model is usually measured by Top-K Accuracy (e.g., K=1 or 5). In this paper, we examine MCC through the lens of learning to rank (LTR) in the deep learning setting. By viewing MCC as ranking classes for an instance, we first argue that ranking metrics from the information retrieval literature, such as Normalized Discounted Cumulative Gain (NDCG), can be more informative than the existing Top-K metrics in evaluating the performance of classification models, especially for real-world user-facing applications. We further demonstrate that the most popular MCC architecture in deep learning can be mathematically formulated equivalently as an LTR pipeline, with a specific set of choices in terms of ranking model architecture and loss function. Based on these observations, we propose several techniques, stemming from the rich LTR literature, to improve the MCC performance. 
Comprehensive empirical results on both text and image classification tasks, with diverse datasets and backbone models (e.g., BERT for text classification and ResNet for image classification) show the value of our proposed framework.", "keywords": "multiclass classification;learning to rank;neural ranking models", "primary_area": "", "supplementary_material": "", "author": "Nan Wang;Zhen Qin;Le Yan;Honglei Zhuang;Xuanhui Wang;Michael Bendersky;Marc Najork", "authorids": "~Nan_Wang6;~Zhen_Qin5;~Le_Yan1;~Honglei_Zhuang1;~Xuanhui_Wang1;~Michael_Bendersky1;~Marc_Najork1", "gender": "M;M;M;M;M;;M", "homepage": "http://www.cs.virginia.edu/~nw6a/;http://alumni.cs.ucr.edu/~zqin001/;;https://hongleizhuang.github.io/;;http://bendersky.github.io/;http://marc.najork.org/", "dblp": "84/864;;67/2358;10/9988;67/2661;80/4305;n/MarcNajork", "google_scholar": "https://scholar.google.com/citations?hl=en;Kv1yk3YAAAAJ;X_knTr4AAAAJ;FxEDj4wAAAAJ;;C9mxM5IAAAAJ;7HeAnjwAAAAJ", "orcid": ";0000-0001-6739-134X;;0000-0001-8134-1509;;0000-0002-2941-6240;0000-0003-1423-0854", "linkedin": "https://www.linkedin.com/public-profile/in/nan-nolen-wang-493341163?challengeId=AQEquDuYuK0KdAAAAXd-p60BoYifuxHUM8sbuGC1zveND5ifUDR5jduLsQ3NFivCjMxOS21SsmFG6K4n20UdyeCKLgXz2EFH-w&submissionId=b5d1bff9-5998-6116-18d7-1a300fe1552b;;;;;;najork/", "or_profile": "~Nan_Wang6;~Zhen_Qin5;~Le_Yan1;~Honglei_Zhuang1;~Xuanhui_Wang1;~Michael_Bendersky1;~Marc_Najork1", "aff": "University of Virginia;Google Deepmind;Google;Google DeepMind;Google;Google;Google Research", "aff_domain": "virginia.edu;google.com;google.com;google.com;google.com;google.com;google.com", "position": "PhD student;Researcher;Software Engineer;Research Scientist;Software Engineer;Researcher;Director, Research Engineering", "bibtex": "@misc{\nwang2022rankclass,\ntitle={Rank4Class: Examining Multiclass Classification through the Lens of Learning to Rank},\nauthor={Nan Wang and Zhen Qin and Le Yan and Honglei Zhuang and Xuanhui Wang and Michael Bendersky and Marc Najork},\nyear={2022},\nurl={https://openreview.net/forum?id=o1FEqIONNAa}\n}", "github": "", "project": "", "reviewers": "VmsX;p3x7;ijr5;yf95", "site": "https://openreview.net/forum?id=o1FEqIONNAa", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "112;65;45;45", "wc_summary_review": "23;35;37;82", "wc_main_review": "154;240;151;131", "wc_review": "289;340;233;258", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 66.75, 27.371289702898547 ], "wc_summary_review_avg": [ 44.25, 22.442983313276336 ], "wc_main_review_avg": [ 169.0, 41.934472692523514 ], "wc_review_avg": [ 280.0, 39.91866731242415 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:18E62NVzpNQJ:scholar.google.com/&scioq=Rank4Class:+Examining+Multiclass+Classification+through+the+Lens+of+Learning+to+Rank&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;2;2;2", "aff_unique_norm": "University of Virginia;DeepMind;Google", "aff_unique_dep": ";DeepMind;Google", "aff_unique_url": "https://www.virginia.edu;https://deepmind.com;https://www.google.com", "aff_unique_abbr": "UVA;DeepMind;Google", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "o2Pgj6cCPXt", "title": "A Biology-Informed Similarity Metric for Simulated Patches of Human Cell Membrane", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Complex scientific inquiries rely increasingly upon large and autonomous multiscale simulation campaigns, which fundamentally require similarity metrics to quantify \"sufficient'' changes among data and/or configurations. However, subject matter experts are often unable to articulate similarity precisely or in terms of well-formulated definitions, especially when new hypotheses are to be explored, making it challenging to design a meaningful metric. Furthermore, the key to practical usefulness of such metrics to enable autonomous simulations lies in in situ inference, which requires generalization to possibly substantial distributional shifts in unseen, future data. \n\nHere, we address these challenges in a cancer biology application and develop a meaningful similarity metric for \"patches\" --- regions of simulated human cell membrane that express interactions between certain proteins of interest and relevant lipids. In the absence of well-defined conditions for similarity, we leverage several biology-informed notions about data and the underlying simulations to impose inductive biases on our metric learning framework, resulting in a suitable similarity metric that also generalizes well to significant distributional shifts encountered during the deployment. We combine these intuitions to organize the learned metric space in a multiscale manner, which makes the metric robust to incomplete and even contradictory intuitions. Our approach delivers a metric that not only performs well on the conditions used for its development and other relevant criteria, but also learns key temporal relationships from statistical mechanics without ever being exposed to any such information during training.", "keywords": "Applications;Cancer Biology;Autonomous Multiscale;Metric Learning;Similarity Metrics", "primary_area": "", "supplementary_material": "", "author": "Harsh Bhatia;Jayaraman J. Thiagarajan;Rushil Anirudh;T.S. Jayram;Tomas Oppelstrup;Helgi I. 
Ingolfsson;Felice C Lightstone;Peer-timo Bremer", "authorids": "~Harsh_Bhatia1;~Jayaraman_J._Thiagarajan3;~Rushil_Anirudh1;~T.S._Jayram1;~Tomas_Oppelstrup1;~Helgi_I._Ingolfsson1;~Felice_C_Lightstone1;~Peer-timo_Bremer1", "gender": ";M;;M;M;;M;M", "homepage": "http://www.sci.utah.edu/~hbhatia/publications;https://rushila.com/;http://tsjayram.net/;;;https://bbs.llnl.gov/FeliceLightstone.html;;https://jjthiagarajan.com", "dblp": ";136/5391;j/TSJayram;;;;20/3591;16/7803", "google_scholar": "https://scholar.google.com/citations?hl=en;WkoIlpQAAAAJ;8P-bBdIAAAAJ;;939CnJMAAAAJ;7FDtktoAAAAJ;https://scholar.google.com/citations?hl=en;cMz65_oAAAAJ", "orcid": "0000-0001-8712-7773;0000-0002-4186-3502;;0000-0002-6366-7190;0000-0002-7613-9143;;0000-0003-4107-3831;", "linkedin": ";rushilanirudh/;;;;;pebremer/;", "or_profile": "~Harsh_Bhatia1;~Rushil_Anirudh1;~T.S._Jayram1;~Tomas_Oppelstrup1;~Helgi_I._Ingolfsson1;~Felice_C_Lightstone1;~Peer-timo_Bremer1;~Jayaraman_J._Thiagarajan2", "aff": "Lawrence Livermore National Laboratory;Lawrence Livermore National Laboratory;Lawrence Livermore National Labs;Lawrence Livermore National Labs;;Lawrence Livermore National Labs;Lawrence Livermore National Labs;Lawrence Livermore National Labs", "aff_domain": "llnl.gov;llnl.gov;llnl.gov;llnl.gov;;llnl.gov;llnl.gov;llnl.gov", "position": "Computer Scientist;Computer Scientist;Research Scientist;Staff Scientist;;Principal Researcher;Principal Researcher;Computer Scientist", "bibtex": "@misc{\nbhatia2022a,\ntitle={A Biology-Informed Similarity Metric for Simulated Patches of Human Cell Membrane},\nauthor={Harsh Bhatia and Jayaraman J. Thiagarajan and Rushil Anirudh and T.S. Jayram and Tomas Oppelstrup and Helgi I. Ingolfsson and Felice C Lightstone and Peer-timo Bremer},\nyear={2022},\nurl={https://openreview.net/forum?id=o2Pgj6cCPXt}\n}", "github": "", "project": "", "reviewers": "M1o5;bfeR;DcqX;tkUj", "site": "https://openreview.net/forum?id=o2Pgj6cCPXt", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;3;3", "correctness": "2;3;3;4", "technical_novelty": "2;4;2;3", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "41;125;246;236", "wc_summary_review": "17;118;40;64", "wc_main_review": "379;725;141;361", "wc_review": "437;968;427;661", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 162.0, 84.47188881515554 ], "wc_summary_review_avg": [ 59.75, 37.512497917360825 ], "wc_main_review_avg": [ 401.5, 208.96112078566193 ], "wc_review_avg": [ 623.25, 219.93223388125716 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18323215239250953422&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Lawrence Livermore National Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.llnl.gov", "aff_unique_abbr": "LLNL", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "o2UwRc8fbXI", "title": "Adaptive Graph Capsule Convolutional Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In recent years, many studies utilize Convolutional Neural Networks (CNNs) to deal with non-grid graph data, known as Graph Convolutional Networks (GCNs). However, there exist two main restrictions of the prevalent GCNs. First, GCNs have a latent information loss problem since they use scalar-valued neurons rather than vector-valued ones to iterate through graph convolutions. Second, GCNs are presented statically with fixed architectures during training, which would limit their representation power. To tackle these two issues, based on a GNN model (CapsGNN) which encodes node embeddings as vectors, we propose Adaptive Graph Capsule Convolutional Networks (AdaGCCN) to adaptively adjust the model architecture at runtime. Specifically, we leverage Reinforcement Learning (RL) to design an assistant module for continuously selecting the optimal modification to the model structure through the whole training process. Moreover, we determine the architecture search space through analyzing the impacts of model's depth and width. To mitigate the computation overhead brought by the assistant module, we then deploy multiple workers to compute in parallel on GPU. Evaluations show that AdaGCCN achieves SOTA accuracy results and outperforms CapsGNN almost on all datasets in both bioinformatics and social fields. We also conduct experiments to indicate the efficiency of the paralleling strategy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shangwei Wu;Yingtong Xiong;Chuliang Weng", "authorids": "~Shangwei_Wu1;~Yingtong_Xiong1;~Chuliang_Weng1", "gender": "M;F;M", "homepage": ";;https://chuliangweng.github.io", "dblp": "286/1956;;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";yingtong-xiong-b6a7461a3/;", "or_profile": "~Shangwei_Wu1;~Yingtong_Xiong1;~Chuliang_Weng1", "aff": "East China Normal University;East China Normal University;East China Normal University", "aff_domain": "ecnu.edu.cn;ecnu.edu.cn;ecnu.edu.cn", "position": "PhD student;MS student;Full Professor", "bibtex": "@misc{\nwu2022adaptive,\ntitle={Adaptive Graph Capsule Convolutional Networks},\nauthor={Shangwei Wu and Yingtong Xiong and Chuliang Weng},\nyear={2022},\nurl={https://openreview.net/forum?id=o2UwRc8fbXI}\n}", "github": "", "project": "", "reviewers": "WHPK;4fse;ww6W;AxTh", "site": "https://openreview.net/forum?id=o2UwRc8fbXI", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;4;4", "correctness": "3;2;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;0;3", "wc_summary_paper": "126;84;50;100", "wc_summary_review": "43;78;28;50", "wc_main_review": "340;958;82;119", "wc_review": "509;1120;160;269", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "688;1367;720;223", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 90.0, 27.53179979587241 ], "wc_summary_review_avg": [ 49.75, 18.14352501582865 ], "wc_main_review_avg": [ 374.75, 350.89127589611 ], "wc_review_avg": [ 514.5, 371.6856871067273 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 749.5, 407.17348882263934 ], 
"reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eAT7-YV7c1UJ:scholar.google.com/&scioq=Adaptive+Graph+Capsule+Convolutional+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "East China Normal University", "aff_unique_dep": "", "aff_unique_url": "http://www.ecnu.edu.cn", "aff_unique_abbr": "ECNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "o6dG7nVYDS", "title": "Finding lost DG: Explaining domain generalization via model complexity", "track": "main", "status": "Reject", "tldr": "", "abstract": "The domain generalization (DG) problem setting challenges a model trained on multiple known data distributions to generalise well on unseen data distributions. Due to its practical importance, a large number of methods have been proposed to address this challenge. However most of this work is empirical, as the DG problem is hard to model formally; and recent evaluations have cast doubt on existing methods\u2019 practical efficacy -- in particular compared to a well chosen empirical risk minimisation baseline. \nWe present a novel learning-theoretic generalisation bound for DG that bounds novel domain generalisation performance in terms of the model\u2019s Rademacher complexity. Based on this, we conjecture that the causal factor behind existing methods\u2019 efficacy or lack thereof is a variant of the standard empirical risk-predictor complexity tradeoff, and demonstrate that their performance variability can be explained in these terms. Algorithmically, this analysis suggests that domain generalisation should be achieved by simply performing regularised ERM with a leave-one-domain-out cross-validation objective. 
Empirical results on the DomainBed benchmark corroborate this.", "keywords": "domain generalisation;rademacher complexity", "primary_area": "", "supplementary_material": "", "author": "Da Li;Henry Gouk;Timothy Hospedales", "authorids": "~Da_Li3;~Henry_Gouk1;~Timothy_Hospedales1", "gender": "M;M;M", "homepage": "https://dali-dl.github.io/;https://www.henrygouk.com;http://homepages.inf.ed.ac.uk/thospeda/", "dblp": "43/4804-1;172/0943;32/3545", "google_scholar": "RPvaE3oAAAAJ;https://scholar.google.co.nz/citations?user=i1bzlyAAAAAJ;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ", "orcid": "0000-0002-2101-2989;;0000-0003-4867-7486", "linkedin": ";;timothyhospedales/", "or_profile": "~Da_Li3;~Henry_Gouk1;~Timothy_Hospedales1", "aff": "University of Edinburgh;University of Edinburgh;Samsung AI Research Centre", "aff_domain": "ed.ac.uk;ed.ac.uk;samsung.com", "position": "Visiting Scholar;Postdoc;Principal Researcher", "bibtex": "@misc{\nli2022finding,\ntitle={Finding lost {DG}: Explaining domain generalization via model complexity},\nauthor={Da Li and Henry Gouk and Timothy Hospedales},\nyear={2022},\nurl={https://openreview.net/forum?id=o6dG7nVYDS}\n}", "github": "", "project": "", "reviewers": "YVsq;jRak;ZKqa;LqhM", "site": "https://openreview.net/forum?id=o6dG7nVYDS", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "1;2;3;4", "wc_summary_paper": "61;83;79;54", "wc_summary_review": "13;119;63;10", "wc_main_review": "150;1535;399;456", "wc_review": "224;1737;541;520", "wc_reply_reviewers": "72;213;109;6", "wc_reply_authors": "472;748;282;515", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 69.25, 12.090802289343747 ], "wc_summary_review_avg": [ 51.25, 44.420575187631236 ], "wc_main_review_avg": [ 635.0, 532.2034385458253 ], "wc_review_avg": [ 755.5, 580.3673405697464 ], "wc_reply_reviewers_avg": [ 100.0, 74.94998332221296 ], "wc_reply_authors_avg": [ 504.25, 165.80466670151355 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=108693391993816795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Edinburgh;Samsung", "aff_unique_dep": ";AI Research", "aff_unique_url": "https://www.ed.ac.uk;https://www.samsung.com/global/researchers/samsung-ai-research-centre/", "aff_unique_abbr": "Edinburgh;SARC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;South Korea" }, { "id": "o86_622j0sb", "title": "Imperceptible Black-box Attack via Refining in Salient Region", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks are vulnerable to adversarial examples, even in the black-box setting where the attacker only has query access to the model output. Recent studies have devised successful black-box attacks with high query efficiency. 
However, such performance often comes at the cost of the imperceptibility of adversarial attacks, which is essential for attackers. To address this issue, in this paper we propose to use segmentation priors for black-box attacks such that the perturbations are limited to the salient region. We find that state-of-the-art black-box attacks equipped with segmentation priors can achieve much better imperceptibility performance with little reduction in query efficiency and success rate. We further propose the Saliency Attack, a new gradient-free black-box attack that can further improve the imperceptibility by refining perturbations in the salient region. Experimental results show that the perturbations generated by our approach are much more imperceptible than the ones generated by other attacks, and are interpretable to some extent. Furthermore, our approach is found to be more robust to detection-based defenses, which demonstrates its efficacy as well.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zeyu Dai;Shengcai Liu;Ke Tang;Qing Li", "authorids": "~Zeyu_Dai2;~Shengcai_Liu1;~Ke_Tang2;~Qing_Li5", "gender": "M;;M;M", "homepage": "https://scholar.google.com/citations?user=D-6MHNUAAAAJ&hl=en;;https://faculty.sustech.edu.cn/tangk3/;https://www4.comp.polyu.edu.hk/~csqli/", "dblp": "262/2474.html;;https://dblp.uni-trier.de/pers/hd/t/Tang:Ke.html;(2024-11-14-1812689)", "google_scholar": "D-6MHNUAAAAJ;;mzLHFbAAAAAJ;https://scholar.google.co.in/citations?user=D1LEg-YAAAAJ", "orcid": "0000-0002-1351-476X;;0000-0002-6236-2002;0000-0003-3370-471X", "linkedin": ";;;", "or_profile": "~Zeyu_Dai2;~Shengcai_Liu1;~Ke_Tang2;~Qing_Li5", "aff": "The Hong Kong Polytechnic University;;Southern University of Science and Technology;Hong Kong Polytechnic University", "aff_domain": "polyu.edu.hk;;sustech.edu.cn;polyu.edu.hk", "position": "PhD student;;Full Professor;Full Professor", "bibtex": "@misc{\ndai2022imperceptible,\ntitle={Imperceptible Black-box Attack via Refining in Salient Region},\nauthor={Zeyu Dai and Shengcai Liu and Ke Tang and Qing Li},\nyear={2022},\nurl={https://openreview.net/forum?id=o86_622j0sb}\n}", "github": "", "project": "", "reviewers": "Hs6q;zKra;3FMW;2cEz", "site": "https://openreview.net/forum?id=o86_622j0sb", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;4;2", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "86;75;72;89", "wc_summary_review": "25;92;59;28", "wc_main_review": "228;173;332;424", "wc_review": "339;340;463;541", "wc_reply_reviewers": "0;0;361;58", "wc_reply_authors": "687;550;1000;761", "reply_reviewers": "0;0;2;1", "reply_authors": "1;1;3;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.5, 7.158910531638177 ], "wc_summary_review_avg": [ 51.0, 27.15695122800054 ], "wc_main_review_avg": [ 289.25, 96.5022668127542 ], "wc_review_avg": [ 420.75, 85.80319050012068 ], "wc_reply_reviewers_avg": [ 104.75, 149.82886070447177 ], "wc_reply_authors_avg": [ 749.5, 163.23985420233626 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:EVjtfmL43K4J:scholar.google.com/&scioq=Imperceptible+Black-box+Attack+via+Refining+in+Salient+Region&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Hong Kong Polytechnic University;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.polyu.edu.hk;https://www.sustech.edu.cn", "aff_unique_abbr": "PolyU;SUSTech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "o8gZlfQNZDJ", "title": "An Integrated System Architecture for Generative Audio Modeling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We introduce a new system for data-driven audio sound model design built around two different neural network architectures, a Generative Adversarial Network(GAN) and a Recurrent Neural Network (RNN), that takes advantage of the unique characteristics of each to achieve the system objectives that neither is capable of addressing alone. The objective of the system is to generate interactively controllable sound models given (a) a range of sounds the model should be able to synthesize, and (b) a specification of the parametric controls for navigating that space of sounds. The range of sounds is defined by a dataset provided by the designer, while the means of navigation is defined by a combination of data labels and the selection of a sub-manifold from the latent space learned by the GAN. Our proposed system takes advantage of the rich latent space of GAN that consists of sounds that fill out the spaces \u201cbetween\u201d real data-like sounds. This augmented data from GAN is then used to train an RNN, that has the capability of immediate parameter response, and generation of audio over unlimited periods of time. Furthermore, we develop a self organizing map technique for \u201dsmoothing\u201d the latent space of GAN that results in perceptually smooth interpolation between audio timbres. We validate this process through user studies. 
Our system contributes advances to the state of the art for generative sound model design that include system configuration and components for improving interpolation and the expansion of audio modeling capabilities beyond musical pitch and percussive instrument sounds into the more complex space of audio textures.", "keywords": "audio textures;sound synthesis;generative adversarial network;recurrent neural network;self organizing map", "primary_area": "", "supplementary_material": "", "author": "Lonce Wyse;Purnima Kamath;Chitralekha Gupta", "authorids": "~Lonce_Wyse1;~Purnima_Kamath1;~Chitralekha_Gupta1", "gender": "M;F;F", "homepage": "https://lonce.org;;https://chitralekha18.github.io/home/", "dblp": "w/LonceLWyse;;116/5545", "google_scholar": "EQjThO4AAAAJ;;NFi7pkcAAAAJ", "orcid": "0000-0002-9200-1048;0000-0003-0351-6574;0000-0003-1350-9095", "linkedin": ";;chitralekha-gupta-04213546/", "or_profile": "~Lonce_Wyse1;~Purnima_Kamath1;~Chitralekha_Gupta2", "aff": "National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "nus.edu.sg;nus.edu;nus.edu.sg", "position": "Associate Professor;PhD student;Postdoc", "bibtex": "@misc{\nwyse2022an,\ntitle={An Integrated System Architecture for Generative Audio Modeling},\nauthor={Lonce Wyse and Purnima Kamath and Chitralekha Gupta},\nyear={2022},\nurl={https://openreview.net/forum?id=o8gZlfQNZDJ}\n}", "github": "", "project": "", "reviewers": "E25e;jqzf;Jh5X;XXCg", "site": "https://openreview.net/forum?id=o8gZlfQNZDJ", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "102;164;79;234", "wc_summary_review": "39;53;46;98", "wc_main_review": "152;363;212;882", "wc_review": "293;580;337;1214", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 144.75, 60.180457791545585 ], "wc_summary_review_avg": [ 59.0, 23.054283766797006 ], "wc_main_review_avg": [ 402.25, 287.4546703395163 ], "wc_review_avg": [ 606.0, 367.65132938696144 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ppSFrZgwQKIJ:scholar.google.com/&scioq=An+Integrated+System+Architecture+for+Generative+Audio+Modeling&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "o8iGesI9HN-", "title": "Optimized Separable Convolution: Yet Another Efficient Convolution Operator", "track": "main", "status": "Reject", "tldr": "", "abstract": "The convolution operation is the most critical component in the recent surge of deep learning research. 
Conventional 2D convolution needs O(C^{2}K^{2}) parameters to represent, where C is the channel size and K is the kernel size. This number of parameters has become very costly, considering that parameter counts have increased tremendously in recent years to meet the needs of demanding applications. Among various implementations of the convolution, separable convolution has been proven to be more efficient in reducing the model size. For example, depth separable convolution reduces the complexity to O(C\\cdot(C+K^{2})) while spatial separable convolution reduces the complexity to O(C^{2}K). However, these are ad hoc designs which cannot ensure optimal separation in general. In this research, we propose a novel and principled operator called optimized separable convolution, which optimizes the internal number of groups and the kernel sizes of general separable convolutions to achieve a complexity of O(C^{\\frac{3}{2}}K). When the restriction on the number of separated convolutions is lifted, an even lower complexity of O(C\\cdot\\log(CK^{2})) can be achieved. Experimental results demonstrate that the proposed optimized separable convolution achieves improved accuracy-#Params trade-offs over conventional, depth-wise, and depth/spatial separable convolutions.", "keywords": "Separable Convolution;Volumetric Receptive Field;Optimized", "primary_area": "", "supplementary_material": "", "author": "Tao Wei;Yonghong Tian;Yaowei Wang;Yun Liang;Chang Wen Chen", "authorids": "~Tao_Wei1;~Yonghong_Tian1;~Yaowei_Wang1;~Yun_Liang1;changwen.chen@polyu.edu.hk", "gender": "M;M;M;M;", "homepage": ";http://www.pkuml.org;https://dblp.org/pid/68/2992.html;http://eecs.pku.edu.cn/EN/People/Faculty/Detail/?ID=6191;", "dblp": ";86/5857;68/2992-1;;", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.tw/citations?user=Ltp8loUAAAAJ;", "orcid": ";0000-0002-2978-5935;0000-0002-6110-4036;;", "linkedin": ";;yaowei-wang-971ab310/;;", "or_profile": "~Tao_Wei1;~Yonghong_Tian1;~Yaowei_Wang1;~Yun_Liang1;changwen.chen@polyu.edu.hk", "aff": ";Peking University;Pengcheng Laboratory;Peking University;", "aff_domain": ";pku.edu.cn;pcl.ac.cn;pku.edu.cn;", "position": ";Full Professor;Full Professor;Associate Professor;", "bibtex": "@misc{\nwei2022optimized,\ntitle={Optimized Separable Convolution: Yet Another Efficient Convolution Operator},\nauthor={Tao Wei and Yonghong Tian and Yaowei Wang and Yun Liang and Chang Wen Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=o8iGesI9HN-}\n}", "github": "", "project": "", "reviewers": "yy2C;a8wG;bjqr;VokC", "site": "https://openreview.net/forum?id=o8iGesI9HN-", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;5;4;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "66;66;64;47", "wc_summary_review": "56;31;78;44", "wc_main_review": "246;284;624;179", "wc_review": "368;381;766;270", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 60.75, 7.980444849756184 ], "wc_summary_review_avg": [ 52.25, 17.297037318569906 ], "wc_main_review_avg": [ 
333.25, 172.02234593214916 ], "wc_review_avg": [ 446.25, 189.52885664193724 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6723771875113616306&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Peking University;Pengcheng Laboratory", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "Peking U;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "o9DnX55PEAo", "title": "Cross-Architecture Distillation Using Bidirectional CMOW Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large pretrained language models (PreLMs) are revolutionizing natural language processing across all benchmarks. However, their sheer size is prohibitive for small laboratories or deployment on mobile devices. Approaches like pruning and distillation reduce the model size but typically retain the same model architecture. In contrast, we explore distilling PreLMs into a different, more efficient architecture CMOW, which embeds each word as a matrix and uses matrix multiplication to encode sequences. We extend the CMOW architecture and its CMOW/CBOW-Hybrid variant with a bidirectional component, per-token representations for distillation during pretraining, and a two-sequence encoding scheme that facilitates downstream tasks on sentence pairs such as natural language inferencing. Our results show that the embedding-based models yield scores comparable to DistilBERT on QQP and RTE, while using only half of its parameters and providing three times faster inference speed. We match or exceed the scores of ELMo, and only fall behind more expensive models on linguistic acceptability. Still, our distilled bidirectional CMOW/CBOW-Hybrid model more than doubles the scores on linguistic acceptability compared to previous cross-architecture distillation approaches. 
Furthermore, our experiments confirm the positive effects of bidirection and the two-sequence encoding scheme.", "keywords": "natural language processing;word embedding;knowledge distillation;model compression;efficient methods;transfer learning", "primary_area": "", "supplementary_material": "/attachment/246510cf27084667b79e96b66e632a2678589bf2.zip", "author": "Lukas Paul Achatius Galke;Isabelle Cuber;Christoph Meyer;Henrik Ferdinand N\u00f6lscher;Angelina Sonderecker;Ansgar Scherp", "authorids": "~Lukas_Paul_Achatius_Galke1;isabelle.cuber@uni-ulm.de;christoph-1.meyer@uni-ulm.de;~Henrik_Ferdinand_N\u00f6lscher1;angelina.sonderecker@uni-ulm.de;~Ansgar_Scherp1", "gender": "M;;;;;M", "homepage": "http://www.lpag.de;;;;;", "dblp": "200/7830;;;;;06/2380", "google_scholar": "https://scholar.google.de/citations?user=AHGGdYQAAAAJ;;;;;", "orcid": "0000-0001-6124-1092;;;;;0000-0002-2653-9245", "linkedin": "lukas-galke-8086b0155/;;;henrik-ferdinand-n%C3%B6lscher-a30526105/;;", "or_profile": "~Lukas_Paul_Achatius_Galke1;isabelle.cuber@uni-ulm.de;christoph-1.meyer@uni-ulm.de;~Henrik_Ferdinand_N\u00f6lscher1;angelina.sonderecker@uni-ulm.de;~Ansgar_Scherp1", "aff": "Kiel University;;;Ulm University;;Ulm University", "aff_domain": "informatik.uni-kiel.de;;;uni-ulm.de;;uni-ulm.de", "position": "PhD student;;;MS student;;Full Professor", "bibtex": "@misc{\ngalke2022crossarchitecture,\ntitle={Cross-Architecture Distillation Using Bidirectional {CMOW} Embeddings},\nauthor={Lukas Paul Achatius Galke and Isabelle Cuber and Christoph Meyer and Henrik Ferdinand N{\\\"o}lscher and Angelina Sonderecker and Ansgar Scherp},\nyear={2022},\nurl={https://openreview.net/forum?id=o9DnX55PEAo}\n}", "github": "", "project": "", "reviewers": "M3tk;AXE2;mHTJ", "site": "https://openreview.net/forum?id=o9DnX55PEAo", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;3;4", "correctness": "2;3;4", "technical_novelty": "1;3;3", "empirical_novelty": "0;3;2", "wc_summary_paper": "70;45;80", "wc_summary_review": "35;108;47", "wc_main_review": "108;77;229", "wc_review": "213;230;356", "wc_reply_reviewers": "117;467;16", "wc_reply_authors": "252;852;191", "reply_reviewers": "1;3;1", "reply_authors": "1;4;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 65.0, 14.719601443879744 ], "wc_summary_review_avg": [ 63.333333333333336, 31.961782734314987 ], "wc_main_review_avg": [ 138.0, 65.57946833168646 ], "wc_review_avg": [ 266.3333333333333, 63.782616928299696 ], "wc_reply_reviewers_avg": [ 200.0, 193.2476821766995 ], "wc_reply_authors_avg": [ 431.6666666666667, 298.2620026456978 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6546536707079772, "corr_recommendation_correctness": 0.9819805060619659, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wzDLzaV8BnIJ:scholar.google.com/&scioq=Cross-Architecture+Distillation+Using+Bidirectional+CMOW+Embeddings&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Kiel University;Ulm University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-kiel.de;https://www.uni-ulm.de/", 
"aff_unique_abbr": "CAU;U Ulm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "CoordX: Accelerating Implicit Neural Representation with a Split MLP Architecture", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7142", "id": "oAy7yPmdNz", "poster": "", "openreview": "https://openreview.net/forum?id=oAy7yPmdNz", "slides": "https://iclr.cc/virtual/2022/poster/7142", "video": "https://iclr.cc/virtual/2022/poster/7142", "author_site": "Ruofan Liang, Hongyi Sun, Nandita Vijaykumar", "tldr": "", "abstract": "Implicit neural representations with multi-layer perceptrons (MLPs) have recently gained prominence for a wide variety of tasks such as novel view synthesis and 3D object representation and rendering. However, a significant challenge with these representations is that both training and inference with an MLP over a large number of input coordinates to learn and represent an image, video, or 3D object, require large amounts of computation and incur long processing times. In this work, we aim to accelerate inference and training of coordinate-based MLPs for implicit neural representations by proposing a new split MLP architecture, CoordX. With CoordX, the initial layers are split to learn each dimension of the input coordinates separately. The intermediate features are then fused by the last layers to generate the learned signal at the corresponding coordinate point. This significantly reduces the amount of computation required and leads to large speedups in training and inference, while achieving similar accuracy as the baseline MLP. This approach thus aims at first learning functions that are a decomposition of the original signal and then fusing them to generate the learned signal. Our proposed architecture can be generally used for many implicit neural representation tasks with no additional memory overheads. 
We demonstrate a speedup of up to 2.92x compared to the baseline model for image, video, and 3D shape representation and rendering tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruofan Liang;Hongyi Sun;Nandita Vijaykumar", "authorids": "~Ruofan_Liang1;~Hongyi_Sun2;nandita@cs.toronto.edu", "gender": "M;M;", "homepage": "https://nexuslrf.github.io/;https://www.linkedin.com/in/hongyi-sun/;", "dblp": "246/4635;;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ruofan_Liang1;~Hongyi_Sun2;nandita@cs.toronto.edu", "aff": "University of Toronto;Toronto University;", "aff_domain": "toronto.edu;utoronto.ca;", "position": "PhD student;Undergrad student;", "bibtex": "@inproceedings{\nliang2022coordx,\ntitle={CoordX: Accelerating Implicit Neural Representation with a Split {MLP} Architecture},\nauthor={Ruofan Liang and Hongyi Sun and Nandita Vijaykumar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oAy7yPmdNz}\n}", "github": "", "project": "", "reviewers": "z7hu;KUjc;xWSd;MGAi", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "217;53;87;134", "wc_summary_review": "207;12;260;33", "wc_main_review": "706;44;657;453", "wc_review": "1130;109;1004;620", "wc_reply_reviewers": "31;0;21;0", "wc_reply_authors": "1042;126;252;179", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 122.75, 61.54825342769687 ], "wc_summary_review_avg": [ 128.0, 107.4081002531932 ], "wc_main_review_avg": [ 465.0, 260.9262347867688 ], "wc_review_avg": [ 715.75, 397.49363202446403 ], "wc_reply_reviewers_avg": [ 13.0, 13.47219358530748 ], "wc_reply_authors_avg": [ 399.75, 373.49188411530446 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1735681538525757283&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=oAy7yPmdNz", "email": "toronto.edu;utoronto.ca;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "oC12z8lkbrU", "title": "Generate, Annotate, and Learn: Generative Models Advance Self-Training and Knowledge Distillation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semi-Supervised Learning (SSL) has seen success in many application domains, but this success often relies on the availability of task-specific unlabeled data. Knowledge distillation (KD) has enabled compressing deep networks, achieving the best results when distilling knowledge on fresh task-specific unlabeled examples. However, task-specific unlabeled data can be challenging to find, especially for NLP problems. 
We present a simple framework called \"generate, annotate, and learn (GAL)\" that uses unconditional language models to synthesize in-domain unlabeled data, helping advance SSL and KD on NLP and tabular tasks. To obtain strong task-specific generative models, we either fine-tune a large language model (LLM) on inputs from specific tasks, or prompt a LLM with a few input examples to generate more unlabeled examples. Then, we use existing classifiers to annotate generated unlabeled examples with pseudo labels, which are used as additional training data or as additional prompts. GAL improves prompt-based few-shot learning on several NLP tasks. It also yields a new state-of-the-art for 6-layer transformers on the GLUE leaderboard. Finally, self-training with GAL offers large gains on four tabular tasks from the UCI repository.", "keywords": "deep generative models;semi-supervised learning;knowledge distillation;large language models", "primary_area": "", "supplementary_material": "", "author": "Xuanli He;Islam Nassar;Jamie Ryan Kiros;Gholamreza Haffari;Mohammad Norouzi", "authorids": "~Xuanli_He2;~Islam_Nassar1;~Jamie_Ryan_Kiros2;~Gholamreza_Haffari1;~Mohammad_Norouzi1", "gender": "M;M;;M;M", "homepage": ";;;https://rezahaffari.github.io/HomePage/HomePage.html;https://norouzi.github.io/", "dblp": "182/1859;255/6650;;;https://dblp.org/pers/hd/n/Norouzi_0002:Mohammad", "google_scholar": "TU8t0iAAAAAJ;WUrsctAAAAAJ;;https://scholar.google.com.tw/citations?user=Perjx5EAAAAJ;Lncr-VoAAAAJ", "orcid": ";;;;", "linkedin": ";ihnassar/;;gholamrezahaffari/?originalSubdomain=au;", "or_profile": "~Xuanli_He2;~Islam_Nassar1;~Jamie_Ryan_Kiros2;~Gholamreza_Haffari1;~Mohammad_Norouzi1", "aff": "Monash University;Monash University;Google;Monash University;Google Brain", "aff_domain": "monash.edu.au;monash.edu;google.com;monash.edu;google.com", "position": "PhD student;PhD student;Research Scientist;Full Professor;Research Scientist", "bibtex": "@misc{\nhe2022generate,\ntitle={Generate, Annotate, and Learn: Generative Models Advance Self-Training and Knowledge Distillation},\nauthor={Xuanli He and Islam Nassar and Jamie Ryan Kiros and Gholamreza Haffari and Mohammad Norouzi},\nyear={2022},\nurl={https://openreview.net/forum?id=oC12z8lkbrU}\n}", "github": "", "project": "", "reviewers": "Xyfj;Ke74;zaeu;uF4y", "site": "https://openreview.net/forum?id=oC12z8lkbrU", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "44;109;122;75", "wc_summary_review": "42;157;21;65", "wc_main_review": "546;478;195;426", "wc_review": "632;744;338;566", "wc_reply_reviewers": "103;816;0;0", "wc_reply_authors": "834;1446;120;320", "reply_reviewers": "1;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 87.5, 30.41792234851026 ], "wc_summary_review_avg": [ 71.25, 51.895929512824026 ], "wc_main_review_avg": [ 411.25, 131.90408447049697 ], "wc_review_avg": [ 570.0, 148.2902559172382 ], "wc_reply_reviewers_avg": [ 229.75, 341.07358077107057 ], "wc_reply_authors_avg": [ 680.0, 513.2426326797103 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 
-0.9428090415820632, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13822079291884042543&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "Monash University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.monash.edu;https://www.google.com", "aff_unique_abbr": "Monash;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "Australia;United States" }, { "title": "Self-Supervision Enhanced Feature Selection with Correlated Gates", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7103", "id": "oDFvtxzPOx", "poster": "", "openreview": "https://openreview.net/forum?id=oDFvtxzPOx", "slides": "https://iclr.cc/virtual/2022/poster/7103", "video": "https://iclr.cc/virtual/2022/poster/7103", "author_site": "Changhee Lee, Fergus Imrie, Mihaela van der Schaar", "tldr": "", "abstract": "Discovering relevant input features for predicting a target variable is a key scientific question. However, in many domains, such as medicine and biology, feature selection is confounded by a scarcity of labeled samples coupled with significant correlations among features. In this paper, we propose a novel deep learning approach to feature selection that addresses both challenges simultaneously. First, we pre-train the network using unlabeled samples within a self-supervised learning framework by solving pretext tasks that require the network to learn informative representations from partial feature sets. Then, we fine-tune the pre-trained network to discover relevant features using labeled samples. During both training phases, we explicitly account for the correlation structure of the input features by generating correlated gate vectors from a multivariate Bernoulli distribution. 
Experiments on multiple real-world datasets including clinical and omics demonstrate that our model discovers relevant features that provide superior prediction performance compared to the state-of-the-art benchmarks in practical scenarios where there is often limited labeled data and high correlations among features.", "keywords": "Feature Selection;Feature Importance;Self-Supervised Learning", "primary_area": "", "supplementary_material": "/attachment/f8284301f79e18548a4d2c1d486f6825ffd71039.zip", "author": "Changhee Lee;Fergus Imrie;Mihaela van der Schaar", "authorids": "~Changhee_Lee1;~Fergus_Imrie1;~Mihaela_van_der_Schaar2", "gender": ";;F", "homepage": ";;https://www.vanderschaar-lab.com", "dblp": ";281/4466;", "google_scholar": "https://scholar.google.com/citations?hl=en;4qCGgpsAAAAJ;DZ3S--MAAAAJ", "orcid": ";0000-0002-6241-0123;", "linkedin": ";;", "or_profile": "~Changhee_Lee1;~Fergus_Imrie1;~Mihaela_van_der_Schaar2", "aff": "ChungAng University;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "cau.ac.kr;ucla.edu;ucla.edu", "position": "Assistant Professor;Postdoc;Full Professor", "bibtex": "@inproceedings{\nlee2022selfsupervision,\ntitle={Self-Supervision Enhanced Feature Selection with Correlated Gates},\nauthor={Changhee Lee and Fergus Imrie and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oDFvtxzPOx}\n}", "github": "", "project": "", "reviewers": "K8kD;gJvk;uaxu", "pdf_size": 0, "recommendation": "8;8;10", "confidence": "3;2;4", "correctness": "4;4;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "51;54;83", "wc_summary_review": "17;18;78", "wc_main_review": "121;131;378", "wc_review": "189;203;539", "wc_reply_reviewers": "0;0;15", "wc_reply_authors": "716;195;669", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 8.666666666666666, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 62.666666666666664, 14.429907214608907 ], "wc_summary_review_avg": [ 37.666666666666664, 28.522895287041877 ], "wc_main_review_avg": [ 210.0, 118.86406802169724 ], "wc_review_avg": [ 310.3333333333333, 161.79273435135735 ], "wc_reply_reviewers_avg": [ 5.0, 7.0710678118654755 ], "wc_reply_authors_avg": [ 526.6666666666666, 235.30736400621964 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12895226581972495416&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=oDFvtxzPOx", "email": "cau.ac.kr;ucla.edu;ucla.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Chungang University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "http://www.cau.ac.kr;https://www.ucla.edu", "aff_unique_abbr": "CAU;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1", "aff_country_unique": "South Korea;United States" }, { "id": "oEV21dutJ0L", "title": "Joint Self-Supervised Learning for Vision-based Reinforcement Learning", "track": "main", 
"status": "Withdraw", "tldr": "", "abstract": "Vision-based reinforcement learning requires efficient and robust representations of image-based observations, especially when the images contain distracting (task-irrelevant) elements such as shadows, clouds, and light. It becomes even more difficult if those distractions are not exposed during training. Although several recent studies have shown that representation robustness can be improved, they still suffer from relatively low performance even in simple and static backgrounds. To enhance the quality of representation, we design an RL framework that combines three different self-supervised learning methods; 1) Adversarial Representation, 2) Forward Dynamics, and 3) Inverse Dynamics. For a set of continuous control tasks on the DeepMind Control suite, our joint self-supervised RL (JS2RL) efficiently learns the task control in both simple and distracting backgrounds, and significantly improves generalization performance for unseen backgrounds. In an autonomous driving task, CARLA, our JS2RL also achieved the best performance on complex and realistic observations containing a lot of task-irrelevant information.", "keywords": "reinforcement learning;self-supervised learning;data efficiency;generalization", "primary_area": "", "supplementary_material": "/attachment/680e496b0ea4ae0b1d950216bbc71afa9225b19c.zip", "author": "Kyoungsoo Kim;Jeongsoo Ha;Yusung Kim", "authorids": "~Kyoungsoo_Kim1;~Jeongsoo_Ha1;~Yusung_Kim1", "gender": "M;M;M", "homepage": ";https://github.com/JeongsooHa;", "dblp": ";;29/5153-1.html", "google_scholar": ";;https://scholar.google.co.kr/citations?user=xGbTgpoAAAAJ", "orcid": ";;0000-0002-9306-8738", "linkedin": "http://www.linkedin.com/in/kyoungsookim;jeongsoo-ha-795944117/;", "or_profile": "~Kyoungsoo_Kim1;~Jeongsoo_Ha1;~Yusung_Kim1", "aff": "Sungkyunkwan University;SungKyunKwan University;Sung Kyun Kwan University", "aff_domain": "g.skku.edu;g.skku.edu;skku.edu", "position": "MS student;Undergrad student;Associate Professor", "bibtex": "@misc{\nkim2022joint,\ntitle={Joint Self-Supervised Learning for Vision-based Reinforcement Learning},\nauthor={Kyoungsoo Kim and Jeongsoo Ha and Yusung Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=oEV21dutJ0L}\n}", "github": "", "project": "", "reviewers": "A7MX;e6pr;mrcC;Uaov;9iLS", "site": "https://openreview.net/forum?id=oEV21dutJ0L", "pdf_size": 0, "recommendation": "1;3;5;5;6", "confidence": "4;4;4;4;4", "correctness": "2;2;3;3;3", "technical_novelty": "1;2;2;2;2", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "92;64;66;77;84", "wc_summary_review": "80;34;61;92;56", "wc_main_review": "391;395;449;420;226", "wc_review": "563;493;576;589;366", "wc_reply_reviewers": "250;68;0;68;0", "wc_reply_authors": "811;913;913;846;816", "reply_reviewers": "1;1;0;1;0", "reply_authors": "2;2;2;2;2", "recommendation_avg": [ 4.0, 1.7888543819998317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 1.8, 0.4000000000000001 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 76.6, 10.61319932913728 ], "wc_summary_review_avg": [ 64.6, 20.05592181875468 ], "wc_main_review_avg": [ 376.2, 77.91123153948986 ], "wc_review_avg": [ 517.4, 82.65252567223823 ], "wc_reply_reviewers_avg": [ 77.2, 91.59563308367927 ], "wc_reply_authors_avg": [ 859.8, 45.05729685633615 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 
], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9128709291752769, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LUg9o2YDb90J:scholar.google.com/&scioq=Joint+Self-Supervised+Learning+for+Vision-based+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Sungkyunkwan University", "aff_unique_dep": "", "aff_unique_url": "https://www.skku.edu", "aff_unique_abbr": "SKKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "oEyUP37aoU7", "title": "Secure Domain Adaptation with Multiple Sources", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multi-source unsupervised domain adaptation (MUDA) is a recently explored learning framework within UDA, where the goal is to address the challenge of annotated data scarcity in a target domain via transferring knowledge from multiple source domains with annotated data. When the source domains are distributed, data privacy and security can become a significant concern, e.g., medical domains, yet existing MUDA methods overlook this concern. We develop an algorithm to address MUDA when source domains' data cannot be shared. Our method is based on aligning the distributions of the source and target domains indirectly via internally learned distributions in an intermediate embedding space. Our theoretical analysis supports our approach and extensive empirical results demonstrate our algorithm is effective and compares favorably against existing MUDA methods.", "keywords": "unsupervised domain adaptation;multi-source domain adaptation;data privacy;source-free adaptation", "primary_area": "", "supplementary_material": "/attachment/7d4afc8ba59ac2ef11becb57cefc485bba1584da.zip", "author": "Serban Stan;Mohammad Rostami", "authorids": "~Serban_Stan1;~Mohammad_Rostami1", "gender": "M;M", "homepage": ";https://viterbi.usc.edu/directory/faculty/Rostami/Mohammad", "dblp": "203/4477;83/9890", "google_scholar": "se6-c6cAAAAJ;Uzx8nLoAAAAJ", "orcid": ";", "linkedin": "serban012/;", "or_profile": "~Serban_Stan1;~Mohammad_Rostami1", "aff": "University of Southern California;USC/ISI", "aff_domain": "usc.edu;isi.edu", "position": "PhD student;Research Scientist", "bibtex": "@misc{\nstan2022secure,\ntitle={Secure Domain Adaptation with Multiple Sources},\nauthor={Serban Stan and Mohammad Rostami},\nyear={2022},\nurl={https://openreview.net/forum?id=oEyUP37aoU7}\n}", "github": "", "project": "", "reviewers": "j3aJ;S64s;Xjvr;bDsV", "site": "https://openreview.net/forum?id=oEyUP37aoU7", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;5;5;5", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;1", "empirical_novelty": "2;2;3;1", "wc_summary_paper": "73;79;125;53", "wc_summary_review": "16;18;44;68", "wc_main_review": "297;314;586;185", "wc_review": "386;411;755;306", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 82.5, 26.358110706194402 ], "wc_summary_review_avg": [ 36.5, 21.277922830953212 ], "wc_main_review_avg": [ 345.5, 147.43218780171446 ], "wc_review_avg": [ 464.5, 172.14601360473034 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9152141243814212262&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Los Angeles;ISI", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "OBJECT DYNAMICS DISTILLATION FOR SCENE DECOMPOSITION AND REPRESENTATION", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6826", "id": "oJGDYQFKL3i", "poster": "", "openreview": "https://openreview.net/forum?id=oJGDYQFKL3i", "slides": "https://iclr.cc/virtual/2022/poster/6826", "video": "https://iclr.cc/virtual/2022/poster/6826", "author_site": "Qu Tang, Xiangyu Zhu, Zhen Lei, Zhaoxiang Zhang", "tldr": "", "abstract": "The ability to perceive scenes in terms of abstract entities is crucial for us to\nachieve higher-level intelligence. Recently, several methods have been proposed\nto learn object-centric representations of scenes with multiple objects, yet most\nof which focus on static scenes. In this paper, we work on object dynamics and\npropose Object Dynamics Distillation Network (ODDN), a framework that distillates explicit object dynamics (e.g., velocity) from sequential static representations. ODDN also builds a relation module to model object interactions. We verify\nour approach on tasks of video reasoning and video prediction, which are two important evaluations for video understanding. The results show that the reasoning\nmodel with visual representations of ODDN performs better in answering reasoning questions around physical events in a video compared to the previous state-of-the-art methods. The distilled object dynamics also could be used to predict\nfuture video frames given two input frames, involving occlusion and objects collision. 
In addition, our architecture brings better segmentation quality and higher reconstruction accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qu Tang;Xiangyu Zhu;Zhen Lei;Zhaoxiang Zhang", "authorids": "~Qu_Tang1;~Xiangyu_Zhu3;~Zhen_Lei2;~Zhaoxiang_Zhang3", "gender": "M;;M;M", "homepage": ";https://xiangyuzhu-open.github.io/homepage/;http://www.cbsr.ia.ac.cn/users/zlei/;http://zhaoxiangzhang.net", "dblp": ";19/10065-1;55/112-1.html;55/2285-1.html", "google_scholar": ";1rbNk5oAAAAJ;cuJ3QG8AAAAJ;qxWfV6cAAAAJ", "orcid": ";0000-0002-4636-9677;0000-0002-0791-189X;", "linkedin": "qu-tang-4b7989221/;;;", "or_profile": "~Qu_Tang1;~Xiangyu_Zhu3;~Zhen_Lei2;~Zhaoxiang_Zhang3", "aff": "Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ac.cn;ia.ac.cn;ia.ac.cn;ia.ac.cn", "position": "PhD student;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\ntang2022object,\ntitle={{OBJECT} {DYNAMICS} {DISTILLATION} {FOR} {SCENE} {DECOMPOSITION} {AND} {REPRESENTATION}},\nauthor={Qu Tang and Xiangyu Zhu and Zhen Lei and Zhaoxiang Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oJGDYQFKL3i}\n}", "github": "", "project": "", "reviewers": "pDGg;YU79;icxn;xJFb;dfem", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "3;3;3;2;3", "correctness": "2;3;4;3;4", "technical_novelty": "3;3;3;2;3", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "76;259;39;119;114", "wc_summary_review": "87;96;45;37;71", "wc_main_review": "310;1122;177;563;86", "wc_review": "473;1477;261;719;271", "wc_reply_reviewers": "56;32;29;18;60", "wc_reply_authors": "1039;930;543;464;107", "reply_reviewers": "1;1;1;1;1", "reply_authors": "2;2;1;1;1", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 2.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 121.4, 74.62600083080963 ], "wc_summary_review_avg": [ 67.2, 22.981731875557163 ], "wc_main_review_avg": [ 451.6, 371.74324472678717 ], "wc_review_avg": [ 640.2, 450.44440278462775 ], "wc_reply_reviewers_avg": [ 39.0, 16.24807680927192 ], "wc_reply_authors_avg": [ 616.6, 336.16579242986637 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.10206207261596575, "corr_recommendation_correctness": 0.7637626158259733, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17946989340094359545&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=oJGDYQFKL3i", "email": "ac.cn;ia.ac.cn;ia.ac.cn;ia.ac.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "", "aff_unique_url": "https://www.cas.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "oLYTo-pL0Be", "title": "Towards Scheduling Federated Deep Learning using Meta-Gradients for Inter-Hospital Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Given the abundance and ease of access of 
personal data today, individual privacy has become of paramount importance, particularly in the healthcare domain. In this work, we aim to utilise patient data extracted from multiple hospital data centres to train a machine learning model without sacrificing patient privacy. We develop a scheduling algorithm in conjunction with a student-teacher algorithm that is deployed in a federated manner. This allows a central model to learn from batches of data at each federated node. The teacher acts between data centres to update the main task (student) algorithm using the data that is stored in the various data centres. We show that the scheduler, trained using meta-gradients, can effectively organise training and, as a result, train a machine learning model on a diverse dataset without needing explicit access to the patient data. We achieve state-of-the-art performance and show how our method overcomes some of the problems faced in federated learning, such as node poisoning. We further show how the scheduler can be used as a mechanism for transfer learning, allowing different teachers to work together in training a student for state-of-the-art performance.", "keywords": "federated learning;hospital", "primary_area": "", "supplementary_material": "/attachment/bf072c31654e925202838ebc154ef9139f152b12.zip", "author": "Rasheed El-Bouri;Tingting Zhu;David A. Clifton", "authorids": "~Rasheed_El-Bouri1;~Tingting_Zhu1;~David_A._Clifton1", "gender": "M;F;M", "homepage": ";https://eng.ox.ac.uk/people/tingting-zhu/;http://www.eng.ox.ac.uk/chi", "dblp": ";29/7666-1;89/6424", "google_scholar": "biRaHFoAAAAJ;https://scholar.google.com.vn/citations?user=fjGMIl0AAAAJ;", "orcid": ";0000-0002-1552-5630;", "linkedin": "rasheed-el-bouri/;;", "or_profile": "~Rasheed_El-Bouri1;~Tingting_Zhu1;~David_A._Clifton1", "aff": ";University of Oxford;University of Oxford", "aff_domain": ";eng.ox.ac.uk;ox.ac.uk", "position": ";RAEng Research Fellow;Full Professor", "bibtex": "@misc{\nel-bouri2022towards,\ntitle={Towards Scheduling Federated Deep Learning using Meta-Gradients for Inter-Hospital Learning},\nauthor={Rasheed El-Bouri and Tingting Zhu and David A. 
Clifton},\nyear={2022},\nurl={https://openreview.net/forum?id=oLYTo-pL0Be}\n}", "github": "", "project": "", "reviewers": "sDWS;waUP;cKiw", "site": "https://openreview.net/forum?id=oLYTo-pL0Be", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;4", "correctness": "2;3;2", "technical_novelty": "2;2;3", "empirical_novelty": "2;0;2", "wc_summary_paper": "108;40;75", "wc_summary_review": "97;4;44", "wc_main_review": "694;337;382", "wc_review": "899;381;501", "wc_reply_reviewers": "189;102;0", "wc_reply_authors": "812;382;594", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 74.33333333333333, 27.764885897278397 ], "wc_summary_review_avg": [ 48.333333333333336, 38.09053542402481 ], "wc_main_review_avg": [ 471.0, 158.75137794677562 ], "wc_review_avg": [ 593.6666666666666, 221.39155860651556 ], "wc_reply_reviewers_avg": [ 97.0, 77.23988606931007 ], "wc_reply_authors_avg": [ 596.0, 175.5524612948126 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3790576399107313327&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7150", "id": "oMI9PjOb9Jl", "poster": "", "openreview": "https://openreview.net/forum?id=oMI9PjOb9Jl", "slides": "https://iclr.cc/virtual/2022/poster/7150", "video": "https://iclr.cc/virtual/2022/poster/7150", "author_site": "Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, Lei Zhang", "tldr": "", "abstract": "We present in this paper a novel query formulation using dynamic anchor boxes for DETR (DEtection TRansformer) and offer a deeper understanding of the role of queries in DETR. This new formulation directly uses box coordinates as queries in Transformer decoders and dynamically updates them layer by layer. Using box coordinates not only helps leverage explicit positional priors to improve the query-to-feature similarity and eliminate the slow training convergence issue in DETR, but also allows us to modulate the positional attention map using the box width and height information. Such a design makes it clear that queries in DETR can be implemented as performing soft ROI pooling layer by layer in a cascade manner. As a result, it leads to the best performance on the MS-COCO benchmark among the DETR-like detection models under the same setting, e.g., AP 45.7\\% using ResNet50-DC5 as the backbone trained for 50 epochs. We also conducted extensive experiments to confirm our analysis and verify the effectiveness of our methods. 
Code is available at \\url{https://github.com/IDEA-opensource/DAB-DETR}.", "keywords": "Object detection;Transformer", "primary_area": "", "supplementary_material": "", "author": "Shilong Liu;Feng Li;Hao Zhang;Xiao Yang;Xianbiao Qi;Hang Su;Jun Zhu;Lei Zhang", "authorids": "~Shilong_Liu1;~Feng_Li9;~Hao_Zhang39;~Xiao_Yang4;~Xianbiao_Qi2;~Hang_Su3;~Jun_Zhu2;~Lei_Zhang23", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://www.lsl.zone;https://fengli-ust.github.io/;https://haozhang534.github.io/;https://ml.cs.tsinghua.edu.cn/~xiaoyang/;https://www.linkedin.com/in/xianbiao-qi-39617727/;http://ml.cs.tsinghua.edu.cn/~jun;;https://www.leizhang.org/", "dblp": ";92/2954-40.html;55/2270-97;57/33851;118/3741;50/2644-1;26/5371-6;z/LeiZhang", "google_scholar": "nkSVY3MAAAAJ;https://scholar.google.com/citations?hl=zh-CN;B8hPxMQAAAAJ;bwkwp0MAAAAJ;odjSydQAAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ;fIlGZToAAAAJ", "orcid": ";;;0000-0001-9502-9962;;;;", "linkedin": ";;hao-zhang-3b09b8196/;;;;;", "or_profile": "~Shilong_Liu1;~Feng_Li9;~Hao_Zhang39;~Xiao_Yang4;~Xianbiao_Qi2;~Jun_Zhu2;~Hang_Su2;~Lei_Zhang1", "aff": "International Digital Economy Academy;Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;Tsinghua University;International Digital Economy Academy;Tsinghua University;Tsinghua University;International Digital Economy Academy", "aff_domain": "idea.edu.cn;ust.hk;ust.hk;mail.tsinghua.edu.cn;idea.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn;idea.edu.cn", "position": "Research Intern;PhD student;PhD student;PhD student;Researcher;Professor;Associate Professor;Chief Scientist", "bibtex": "@inproceedings{\nliu2022dabdetr,\ntitle={{DAB}-{DETR}: Dynamic Anchor Boxes are Better Queries for {DETR}},\nauthor={Shilong Liu and Feng Li and Hao Zhang and Xiao Yang and Xianbiao Qi and Hang Su and Jun Zhu and Lei Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oMI9PjOb9Jl}\n}", "github": "", "project": "", "reviewers": "zkuH;iJ5R;H1Mp", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "5;4;4", "correctness": "3;2;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "77;102;75", "wc_summary_review": "40;454;30", "wc_main_review": "254;589;191", "wc_review": "371;1145;296", "wc_reply_reviewers": "779;0;0", "wc_reply_authors": "3779;1254;300", "reply_reviewers": "2;0;0", "reply_authors": "7;3;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 84.66666666666667, 12.283683848458853 ], "wc_summary_review_avg": [ 174.66666666666666, 197.5606798485524 ], "wc_main_review_avg": [ 344.6666666666667, 174.6736639819778 ], "wc_review_avg": [ 604.0, 383.7681591794713 ], "wc_reply_reviewers_avg": [ 259.6666666666667, 367.2241216962136 ], "wc_reply_authors_avg": [ 1777.6666666666667, 1467.7716292242317 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.6666666666666665, 2.494438257849294 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.7559289460184544, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 1046, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=11838073149065061192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=oMI9PjOb9Jl", "email": "idea.edu.cn;ust.hk;ust.hk;mail.tsinghua.edu.cn;idea.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn;idea.edu.cn", "author_num": 8, "aff_unique_index": "0;1;1;2;0;2;2;0", "aff_unique_norm": "International Digital Economy Academy;Hong Kong University of Science and Technology;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.ust.hk;https://www.tsinghua.edu.cn", "aff_unique_abbr": ";HKUST;THU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "1;1;1;1;1", "aff_country_unique": ";China" }, { "id": "oOuPVoT1kA5", "title": "FEVERLESS: Fast and Secure Vertical Federated Learning based on XGBoost for Decentralized Labels", "track": "main", "status": "Reject", "tldr": "", "abstract": "Vertical Federated Learning (VFL) enables multiple clients to collaboratively train a global model over vertically partitioned data without revealing private local information. Tree-based models, like XGBoost and LightGBM, have been widely used in VFL to enhance the interpretation and efficiency of training. However, there is a fundamental lack of research on how to conduct VFL securely over distributed labels. This work is the first to fill this gap by designing a novel protocol, called FEVERLESS, based on XGBoost. FEVERLESS leverages secure aggregation via information masking technique and global differential privacy provided by a fairly and randomly selected noise leader to prevent private information from being leaked in the training process. Furthermore, it provides label and data privacy against honest-but-curious adversary even in the case of collusion of $n-2$ out of $n$ clients. We present a comprehensive security and efficiency analysis for our design, and the empirical experiment results demonstrate that FEVERLESS is fast and secure. 
In particular, it outperforms the solution based on additive homomorphic encryption in runtime cost and provides better accuracy than the local differential privacy approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rui Wang;O\u011fuzhan Ersoy;Hangyu Zhu;Yaochu Jin;Kaitai Liang", "authorids": "~Rui_Wang24;o.ersoy@tudelft.nl;hangyu.zhu@surrey.ac.uk;yaochu.jin@surrey.ac.uk;kaitai.liang@tudelft.nl", "gender": "M;;;;", "homepage": "https://www.tudelft.nl/en/eemcs/the-faculty/departments/intelligent-systems/cybersecurityeemcs/people/rui-wang;;;;", "dblp": ";;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Rui_Wang24;o.ersoy@tudelft.nl;hangyu.zhu@surrey.ac.uk;yaochu.jin@surrey.ac.uk;kaitai.liang@tudelft.nl", "aff": "Delft University of Technology;;;;", "aff_domain": "tudelft.nl;;;;", "position": "PhD student;;;;", "bibtex": "@misc{\nwang2022feverless,\ntitle={{FEVERLESS}: Fast and Secure Vertical Federated Learning based on {XGB}oost for Decentralized Labels},\nauthor={Rui Wang and O{\\u{g}}uzhan Ersoy and Hangyu Zhu and Yaochu Jin and Kaitai Liang},\nyear={2022},\nurl={https://openreview.net/forum?id=oOuPVoT1kA5}\n}", "github": "", "project": "", "reviewers": "NGrb;dHiP;chCc;Wj2y", "site": "https://openreview.net/forum?id=oOuPVoT1kA5", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "3;4;2;3", "correctness": "3;2;3;4", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "123;34;71;95", "wc_summary_review": "41;16;25;34", "wc_main_review": "296;291;141;125", "wc_review": "460;341;237;254", "wc_reply_reviewers": "202;0;0;0", "wc_reply_authors": "903;634;219;96", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 80.75, 32.66783586342995 ], "wc_summary_review_avg": [ 29.0, 9.40744386111339 ], "wc_main_review_avg": [ 213.25, 80.4685497570324 ], "wc_review_avg": [ 323.0, 88.38834764831844 ], "wc_reply_reviewers_avg": [ 50.5, 87.4685657822283 ], "wc_reply_authors_avg": [ 463.0, 322.903236279849 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5000000000000001, "corr_recommendation_correctness": 0.8333333333333334, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6089292486322251260&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TU Delft", "aff_country_unique_index": "0", "aff_country_unique": "Netherlands" }, { "id": "oPON8TpOQVz", "title": "Chameleon Sampling: Diverse and Pure Example Selection for Online Continual Learning with Noisy Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "AI models suffer from continuously changing data distributions and noisy labels when applied to most real-world problems. Although many solutions have addressed continual learning and label noise as separate problems, tackling both issues together is important yet underexplored. 
Here, we address the task of online continual learning with noisy labels, a more realistic, practical, and challenging continual learning setup that assumes ground-truth labels may be noisy. Specifically, we argue for the importance of both diversity and purity of examples in the episodic memory of continual learning models. To balance diversity and purity in the memory, we propose to combine a novel memory management strategy with robust learning. Concretely, we propose a metric to balance the trade-off between diversity and purity in the episodic memory with noisy labels. We then split noisy examples into multiple groups using a Gaussian mixture model and either refurbish them or apply unsupervised learning to address label noise. We validate our approach on four real-world or synthetic benchmark datasets, including two CIFARs, Clothing1M, and mini-WebVision, and demonstrate significant improvements over representative methods on this challenging task set-up.", "keywords": "Continual Learning;Robust Learning;Noisy Labels;Label Noise", "primary_area": "", "supplementary_material": "", "author": "Jihwan Bang;Hyunseo Koh;Seulki Park;Hwanjun Song;Jung-Woo Ha;Jonghyun Choi", "authorids": "~Jihwan_Bang1;~Hyunseo_Koh1;~Seulki_Park1;~Hwanjun_Song2;~Jung-Woo_Ha1;~Jonghyun_Choi1", "gender": "M;M;F;M;M;M", "homepage": "https://hwany-j.github.io/;;https://sites.google.com/view/seulkipark/home;https://songhwanjun.github.io/;https://aidljwha.wordpress.com/;https://ppolon.github.io/", "dblp": "221/4643;304/4369;166/7234;204/3381;66/867-1;21/11103", "google_scholar": "molKYzwAAAAJ;Mi4cMxgAAAAJ;6Wh4hxcAAAAJ;Ijzuc-8AAAAJ;https://scholar.google.co.kr/citations?user=eGj3ay4AAAAJ;uiGWnm4AAAAJ", "orcid": ";0000-0002-2576-1581;;0000-0002-1105-0818;0000-0002-7400-7681;0000-0002-7934-8434", "linkedin": "jihwan-bang/;%ED%98%84%EC%84%9C-%EA%B3%A0-66298a221/;seulki-park-49a775147;;jung-woo-ha-b2782862?trk=hp-identity-name;jonghyun-choi-459bb615/", "or_profile": "~Jihwan_Bang1;~Hyunseo_Koh1;~Seulki_Park1;~Hwanjun_Song2;~Jung-Woo_Ha1;~Jonghyun_Choi1", "aff": "NAVER;Gwangju Institute of Science and Technology;Seoul National University;NAVER CLOVA;NAVER AI Lab;NAVER", "aff_domain": "navercorp.com;gist.ac.kr;snu.ac.kr;navercorp.com;navercorp.com;navercorp.com", "position": "Researcher;PhD student;PhD student;Research Scientist;Head (Executive Director);AI Advisor Committee", "bibtex": "@misc{\nbang2022chameleon,\ntitle={Chameleon Sampling: Diverse and Pure Example Selection for Online Continual Learning with Noisy Labels},\nauthor={Jihwan Bang and Hyunseo Koh and Seulki Park and Hwanjun Song and Jung-Woo Ha and Jonghyun Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=oPON8TpOQVz}\n}", "github": "", "project": "", "reviewers": "m13F;dkmc;RZ2P;NE83", "site": "https://openreview.net/forum?id=oPON8TpOQVz", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;4;4;2", "correctness": "3;2;2;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "29;157;101;159", "wc_summary_review": "217;147;16;32", "wc_main_review": "74;1295;252;306", "wc_review": "320;1599;369;497", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 111.5, 53.016506863428866 ], 
"wc_summary_review_avg": [ 103.0, 82.97891298396237 ], "wc_main_review_avg": [ 481.75, 477.31246317271035 ], "wc_review_avg": [ 696.25, 525.1939522690641 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xk6TDS0owE0J:scholar.google.com/&scioq=Chameleon+Sampling:+Diverse+and+Pure+Example+Selection+for+Online+Continual+Learning+with+Noisy+Labels&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0;0", "aff_unique_norm": "NAVER Corporation;Gwangju Institute of Science and Technology;Seoul National University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.naver.com;https://www.gist.ac.kr;https://www.snu.ac.kr", "aff_unique_abbr": "NAVER;GIST;SNU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Gwangju", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "oSP1hwZB24", "title": "Dynamic Parameterized Network for CTR Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning to capture feature relations effectively and efficiently is essential in click-through rate (CTR) prediction of modern recommendation systems. Most existing CTR prediction methods model such relations either through tedious manually-designed low-order interactions or through inflexible and inefficient high-order interactions, which both require extra DNN modules for implicit interaction modeling. In this paper, we proposed a novel plug-in operation, Dynamic Parameterized Operation (DPO), to learn both explicit and implicit interaction instance-wisely. We showed that the introduction of DPO into DNN modules and Attention modules can respectively benefit two main tasks in CTR prediction, enhancing the adaptiveness of feature-based modeling and improving user behavior modeling with the instance-wise locality. Our Dynamic Parameterized Networks significantly outperforms state-of-the-art methods in the offline experiments on the public dataset and real-world production dataset, together with an online A/B test. 
Furthermore, the proposed Dynamic Parameterized Networks have been deployed in the ranking system of one of the world's largest e-commerce companies, serving the main traffic of hundreds of millions of active users.", "keywords": "Recommendation System;Feature modeling;User Behavior modeling;Dynamic Network", "primary_area": "", "supplementary_material": "", "author": "Jian Zhu;Congcong Liu;Pei Wang;Xiwei Zhao;Guangpeng Chen;Jin Jun Sheng;Changping Peng;Zhangang Lin;Jingping Shao", "authorids": "~Jian_Zhu1;~Congcong_Liu1;~Pei_Wang9;~Xiwei_Zhao1;~Guangpeng_Chen2;~Jin_Jun_Sheng1;~Changping_Peng1;~Zhangang_Lin1;~Jingping_Shao1", "gender": "M;;F;M;M;;M;M;M", "homepage": ";;https://peggy95.github.io/;;https://www.linkedin.com/in/%E5%B9%BF%E6%9C%8B-%E9%99%88-7122b3221/;https://www.linkedin.com/in/%E5%9D%87%E7%94%9F-%E9%87%91-760784220/;;;https://www.jd.com/", "dblp": ";;;;;;274/7573.html;;", "google_scholar": "2p0_yFIAAAAJ;;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;%E5%A4%95%E7%82%9C-%E8%B5%B5-6a822714a/;;;;jack-zhangang-lin-020b2237/;", "or_profile": "~Jian_Zhu1;~Congcong_Liu1;~Pei_Wang9;~Xiwei_Zhao1;~Guangpeng_Chen2;~Jin_Jun_Sheng1;~Changping_Peng1;~Zhangang_Lin1;~Jingping_Shao1", "aff": ";;;;;;;JD;JD", "aff_domain": ";;;;;;;jd.com;jd.com", "position": ";;;;;;;Researcher;Researcher", "bibtex": "@misc{\nzhu2022dynamic,\ntitle={Dynamic Parameterized Network for {CTR} Prediction},\nauthor={Jian Zhu and Congcong Liu and Pei Wang and Xiwei Zhao and Guangpeng Chen and Jin Jun Sheng and Changping Peng and Zhangang Lin and Jingping Shao},\nyear={2022},\nurl={https://openreview.net/forum?id=oSP1hwZB24}\n}", "github": "", "project": "", "reviewers": "AzFU;pP6H;2XAa", "site": "https://openreview.net/forum?id=oSP1hwZB24", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;4;4", "correctness": "3;3;4", "technical_novelty": "1;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "70;47;33", "wc_summary_review": "12;12;71", "wc_main_review": "209;103;191", "wc_review": "291;162;295", "wc_reply_reviewers": "234;164;0", "wc_reply_authors": "2183;651;1002", "reply_reviewers": "1;1;0", "reply_authors": "5;2;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 50.0, 15.253414918196734 ], "wc_summary_review_avg": [ 31.666666666666668, 27.812866726670865 ], "wc_main_review_avg": [ 167.66666666666666, 46.312945154555756 ], "wc_review_avg": [ 249.33333333333334, 61.77557949725945 ], "wc_reply_reviewers_avg": [ 132.66666666666666, 98.06573758907281 ], "wc_reply_authors_avg": [ 1278.6666666666667, 655.3189215098927 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8769630480826322631&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "JD", "aff_unique_dep": "JD", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "id": "oTQNAU_g_AZ", "title": "DAIR: Disentangled Attention Intrinsic 
Regularization for Safe and Efficient Bimanual Manipulation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We address the problem of safely solving complex bimanual robot manipulation tasks with sparse rewards. Such challenging tasks can be decomposed into sub-tasks that are accomplishable by different robots concurrently or sequentially for better efficiency. While previous reinforcement learning approaches primarily focus on modeling the compositionality of sub-tasks, two fundamental issues are largely ignored, particularly when learning cooperative strategies for two robots: (i) domination, i.e., one robot may try to solve a task by itself, leaving the other idle; (ii) conflict, i.e., one robot can interrupt another's workspace when executing different sub-tasks simultaneously, which leads to unsafe collisions. To tackle these two issues, we propose a novel technique called disentangled attention, which provides an intrinsic regularization for two robots to focus on separate sub-tasks and objects. We evaluate our method on five bimanual manipulation tasks. Experimental results show that our proposed intrinsic regularization successfully avoids domination and reduces conflicts for the policies, which leads to significantly more efficient and safer cooperative strategies than all the baselines. Our project page with videos is at https://bimanual-attention.github.io/.", "keywords": "Reinforcement Learning;Safe Robotics;Bimanual Manipulation;Attention Mechanism", "primary_area": "", "supplementary_material": "", "author": "Minghao Zhang;Pingcheng Jian;Yi Wu;Huazhe Xu;Xiaolong Wang", "authorids": "~Minghao_Zhang1;~Pingcheng_Jian1;~Yi_Wu1;~Huazhe_Xu1;~Xiaolong_Wang3", "gender": "M;M;M;M;M", "homepage": "https://www.minghaozhang.com;https://pingcheng-jian.github.io/;https://jxwuyi.weebly.com;http://hxu.rocks;https://xiaolonw.github.io/", "dblp": "137/0566;278/2436;;164/9006;91/952-4", "google_scholar": "moOv1BsAAAAJ;2m63kY0AAAAJ;dusV5HMAAAAJ;t9HPFawAAAAJ;Y8O9N_0AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Minghao_Zhang1;~Pingcheng_Jian1;~Yi_Wu1;~Huazhe_Xu1;~Xiaolong_Wang3", "aff": "Tsinghua University;Duke University;Tsinghua University;Stanford University;University of California, San Diego", "aff_domain": "tsinghua.edu.cn;duke.edu;tsinghua.edu.cn;stanford.edu;ucsd.edu", "position": "Undergrad student;PhD student;Assistant Professor;Postdoc;Assistant Professor", "bibtex": "@misc{\nzhang2022dair,\ntitle={{DAIR}: Disentangled Attention Intrinsic Regularization for Safe and Efficient Bimanual Manipulation},\nauthor={Minghao Zhang and Pingcheng Jian and Yi Wu and Huazhe Xu and Xiaolong Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=oTQNAU_g_AZ}\n}", "github": "", "project": "", "reviewers": "jKrv;RR63;a42g;v16H", "site": "https://openreview.net/forum?id=oTQNAU_g_AZ", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;4;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "52;134;106;53", "wc_summary_review": "104;96;83;21", "wc_main_review": "646;1191;739;345", "wc_review": "802;1421;928;419", "wc_reply_reviewers": "0;0;19;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;1;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], 
"wc_summary_paper_avg": [ 86.25, 35.173676236640375 ], "wc_summary_review_avg": [ 76.0, 32.626676202150904 ], "wc_main_review_avg": [ 730.25, 303.26669368725607 ], "wc_review_avg": [ 892.5, 358.10787480869504 ], "wc_reply_reviewers_avg": [ 4.75, 8.227241335952167 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3801751735832522300&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Tsinghua University;Duke University;Stanford University;University of California, San Diego", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.duke.edu;https://www.stanford.edu;https://www.ucsd.edu", "aff_unique_abbr": "THU;Duke;Stanford;UCSD", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Stanford;San Diego", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Self-ensemble Adversarial Training for Improved Robustness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6997", "id": "oU3aTsmeRQV", "poster": "", "openreview": "https://openreview.net/forum?id=oU3aTsmeRQV", "slides": "https://iclr.cc/virtual/2022/poster/6997", "video": "https://iclr.cc/virtual/2022/poster/6997", "author_site": "Hongjun Wang, Yisen Wang", "tldr": "", "abstract": "Due to numerous breakthroughs in real-world applications brought by machine intelligence, deep neural networks (DNNs) are widely employed in critical applications. However, predictions of DNNs are easily manipulated with imperceptible adversarial perturbations, which impedes the further deployment of DNNs and may result in profound security and privacy implications. By incorporating adversarial samples into the training data pool, adversarial training is the strongest principled strategy against various adversarial attacks among all sorts of defense methods. Recent works mainly focus on developing new loss functions or regularizers, attempting to find the unique optimal point in the weight space. But none of them taps the potentials of classifiers obtained from standard adversarial training, especially states on the searching trajectory of training. In this work, we are dedicated to the weight states of models through the training process and devise a simple but powerful \\emph{Self-Ensemble Adversarial Training} (SEAT) method for yielding a robust classifier by averaging weights of history models. This considerably improves the robustness of the target model against several well known adversarial attacks, even merely utilizing the naive cross-entropy loss to supervise. We also discuss the relationship between the ensemble of predictions from different adversarially trained models and the prediction of weight-ensembled models, as well as provide theoretical and empirical evidence that the proposed self-ensemble method provides a smoother loss landscape and better robustness than both individual models and the ensemble of predictions from different classifiers. We further analyze a subtle but fatal issue in the general settings for the self-ensemble model, which causes the deterioration of the weight-ensembled method in the late phases. 
", "keywords": "Adversarial Example;Adversarial Training", "primary_area": "", "supplementary_material": "/attachment/bb8a1f1f6741c848adbf090e350e2dc51fa16950.zip", "author": "Hongjun Wang;Yisen Wang", "authorids": "~Hongjun_Wang2;~Yisen_Wang1", "gender": "M;M", "homepage": "https://whj363636.github.io/;https://yisenwang.github.io/", "dblp": "65/3627-5;172/1346-1", "google_scholar": "DNi-nB0AAAAJ;uMWPDboAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Hongjun_Wang2;~Yisen_Wang1", "aff": "Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "position": "Intern;Assistant Professor", "bibtex": "@inproceedings{\nwang2022selfensemble,\ntitle={Self-ensemble Adversarial Training for Improved Robustness},\nauthor={Hongjun Wang and Yisen Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oU3aTsmeRQV}\n}", "github": "", "project": "", "reviewers": "K6Do;E4Bu;Ztw4;np74", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "4;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "92;42;62;57", "wc_summary_review": "27;60;12;21", "wc_main_review": "624;461;153;244", "wc_review": "743;563;227;322", "wc_reply_reviewers": "0;39;0;0", "wc_reply_authors": "2042;1929;1013;975", "reply_reviewers": "0;1;0;0", "reply_authors": "4;5;3;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.25, 18.15729880791744 ], "wc_summary_review_avg": [ 30.0, 18.12456896039186 ], "wc_main_review_avg": [ 370.5, 184.22879796600748 ], "wc_review_avg": [ 463.75, 202.46897910544223 ], "wc_reply_reviewers_avg": [ 9.75, 16.887495373796554 ], "wc_reply_authors_avg": [ 1489.75, 497.53862915355626 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5523117763790476247&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=oU3aTsmeRQV", "email": "pku.edu.cn;pku.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Divergence-aware Federated Self-Supervised Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5908", "id": "oVE1z8NlNe", "poster": "", "openreview": "https://openreview.net/forum?id=oVE1z8NlNe", "slides": "https://iclr.cc/virtual/2022/poster/5908", "video": "https://iclr.cc/virtual/2022/poster/5908", "author_site": "Weiming Zhuang, Yonggang Wen, Shuai Zhang", "tldr": "", "abstract": "Self-supervised learning (SSL) is capable of learning remarkable representations from centrally available data. Recent works further implement federated learning with SSL to learn from rapidly growing decentralized unlabeled images (e.g., from cameras and phones), often resulted from privacy constraints. 
Extensive attention has been paid to SSL approaches based on Siamese networks. However, such an effort has not yet revealed deep insights into various fundamental building blocks for the federated self-supervised learning (FedSSL) architecture. We aim to fill in this gap via an in-depth empirical study and propose a new method to tackle the non-independently and identically distributed (non-IID) data problem of decentralized data. Firstly, we introduce a generalized FedSSL framework that embraces existing SSL methods based on Siamese networks and provides flexibility to cater to future methods. In this framework, a server coordinates multiple clients to conduct SSL training and periodically updates local models of clients with the aggregated global model. Using the framework, our study uncovers unique insights into FedSSL: 1) stop-gradient operation, previously reported to be essential, is not always necessary in FedSSL; 2) retaining local knowledge of clients in FedSSL is particularly beneficial for non-IID data. Inspired by the insights, we then propose a new approach for model update, Federated Divergence-aware Exponential Moving Average update (FedEMA). FedEMA updates local models of clients adaptively using EMA of the global model, where the decay rate is dynamically measured by model divergence. Extensive experiments demonstrate that FedEMA outperforms existing methods by 3-4% on linear evaluation. We hope that this work will provide useful insights for future research.", "keywords": "Federated Learning;Self-supervised Learning;Unsupervised representation learning", "primary_area": "", "supplementary_material": "", "author": "Weiming Zhuang;Yonggang Wen;Shuai Zhang", "authorids": "~Weiming_Zhuang1;~Yonggang_Wen1;zhangshuai@sensetime.com", "gender": ";M;", "homepage": "https://weiming.me/;https://personal.ntu.edu.sg/ygwen/;", "dblp": "274/0724;;", "google_scholar": "lLuLAzEAAAAJ;https://scholar.google.com.tw/citations?user=byeygOkAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Weiming_Zhuang1;~Yonggang_Wen1;zhangshuai@sensetime.com", "aff": "Nanyang Technological University;Nanyang Technological University;", "aff_domain": "ntu.edu.sg;ntu.edu.sg;", "position": "PhD student;Full Professor;", "bibtex": "@inproceedings{\nzhuang2022divergenceaware,\ntitle={Divergence-aware Federated Self-Supervised Learning},\nauthor={Weiming Zhuang and Yonggang Wen and Shuai Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oVE1z8NlNe}\n}", "github": "", "project": "", "reviewers": "48jX;AcAS;wF5G;Nu54", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;2;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "17;67;301;39", "wc_summary_review": "19;73;88;96", "wc_main_review": "317;88;368;611", "wc_review": "353;228;757;746", "wc_reply_reviewers": "179;0;0;233", "wc_reply_authors": "866;858;1009;2079", "reply_reviewers": "1;0;0;1", "reply_authors": "3;2;2;3", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 106.0, 113.96929411029973 ], "wc_summary_review_avg": [ 69.0, 30.024989592004857 ], "wc_main_review_avg": [ 346.0, 185.8184597934231 ], "wc_review_avg": [ 521.0, 234.73069675694316 ], "wc_reply_reviewers_avg": [ 103.0, 
104.75447484475305 ], "wc_reply_authors_avg": [ 1203.0, 509.3147356988604 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.2480694691784169, "corr_recommendation_correctness": 0.8006407690254357, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17230003057420920178&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=oVE1z8NlNe", "email": "ntu.edu.sg;ntu.edu.sg;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "id": "oVfIKuhqfC", "title": "Non-Denoising Forward-Time Diffusions", "track": "main", "status": "Reject", "tldr": "", "abstract": "The scope of this paper is generative modeling through diffusion processes.\nAn approach falling within this paradigm is the work of Song et al. (2021), which relies on a time-reversal argument to construct a diffusion process targeting the desired data distribution.\nWe show that the time-reversal argument, common to all denoising diffusion probabilistic modeling proposals, is not necessary.\nWe obtain diffusion processes targeting the desired data distribution by taking appropriate mixtures of diffusion bridges.\nThe resulting transport is exact by construction, allows for greater flexibility in choosing the dynamics of the underlying diffusion, and can be approximated by means of a neural network via novel training objectives.\nWe develop an unifying view of the drift adjustments corresponding to our and to time-reversal approaches and make use of this representation to inspect the inner workings of diffusion-based generative models.\nFinally, we leverage on scalable simulation and inference techniques common in spatial statistics to move beyond fully factorial distributions in the underlying diffusion dynamics.\nThe methodological advances contained in this work contribute toward establishing a general framework for generative modeling based on diffusion processes.", "keywords": "deep learning;diffusion;SDE;generative modelling;DDPM", "primary_area": "", "supplementary_material": "", "author": "Stefano Peluchetti", "authorids": "~Stefano_Peluchetti1", "gender": "M", "homepage": "https://stefanopeluchetti.com", "dblp": "128/1385", "google_scholar": "w3Gi3TEAAAAJ", "orcid": "", "linkedin": "stefanopeluchetti/", "or_profile": "~Stefano_Peluchetti1", "aff": "Cogent Labs", "aff_domain": "cogent.co.jp", "position": "Principal Research Scientist", "bibtex": "@misc{\npeluchetti2022nondenoising,\ntitle={Non-Denoising Forward-Time Diffusions},\nauthor={Stefano Peluchetti},\nyear={2022},\nurl={https://openreview.net/forum?id=oVfIKuhqfC}\n}", "github": "", "project": "", "reviewers": "QAeW;e93q;oeyA;FUzA", "site": "https://openreview.net/forum?id=oVfIKuhqfC", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "3;2;4;2", "correctness": "4;3;3;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "1;2;2;1", "wc_summary_paper": "65;33;125;178", "wc_summary_review": "68;13;68;31", "wc_main_review": "1619;250;649;109", "wc_review": "1752;296;842;318", "wc_reply_reviewers": "0;0;563;0", "wc_reply_authors": "2337;645;2057;15", "reply_reviewers": "0;0;1;0", "reply_authors": "3;1;3;1", 
"recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 100.25, 55.72869548087412 ], "wc_summary_review_avg": [ 45.0, 23.86419912756345 ], "wc_main_review_avg": [ 656.75, 589.8009727865833 ], "wc_review_avg": [ 802.0, 590.4218830632889 ], "wc_reply_reviewers_avg": [ 140.75, 243.78615116531947 ], "wc_reply_authors_avg": [ 1263.5, 964.797776738732 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.46442036401282394, "corr_recommendation_correctness": 0.14002800840280097, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17302407955330866296&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Cogent Labs", "aff_unique_dep": "", "aff_unique_url": "https://www.cogentlabs.com", "aff_unique_abbr": "", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "On the Generalization of Models Trained with SGD: Information-Theoretic Bounds and Implications", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6184", "id": "oWZsQ8o5EA", "poster": "", "openreview": "https://openreview.net/forum?id=oWZsQ8o5EA", "slides": "https://iclr.cc/virtual/2022/poster/6184", "video": "https://iclr.cc/virtual/2022/poster/6184", "author_site": "Ziqiao Wang, Yongyi Mao", "tldr": "", "abstract": "This paper follows up on a recent work of Neu et al. (2021) and presents some new information-theoretic upper bounds for the generalization error of machine learning models, such as neural networks, trained with SGD. We apply these bounds to analyzing the generalization behaviour of linear and two-layer ReLU networks. Experimental study of these bounds provide some insights on the SGD training of neural networks. They also point to a new and simple regularization scheme which we show performs comparably to the current state of the art. 
", "keywords": "deep learning;generalization;information theory;learning bound;regularization", "primary_area": "", "supplementary_material": "/attachment/78bdf7df1a4f031f74b220da6a9caa3a6be4fa49.zip", "author": "Ziqiao Wang;Yongyi Mao", "authorids": "~Ziqiao_Wang1;~Yongyi_Mao2", "gender": "M;M", "homepage": "https://ziqiaowanggeothe.github.io;http://www.eecs.uottawa.ca/~yymao", "dblp": "222/9220;86/2933", "google_scholar": "iBL7APIAAAAJ;https://scholar.google.ca/citations?user=jM5l70wAAAAJ", "orcid": "0000-0003-0504-4830;0000-0001-5298-5778", "linkedin": "ziqiao-wang-987565155/?locale=en_US;", "or_profile": "~Ziqiao_Wang1;~Yongyi_Mao1", "aff": "University of Ottawa;University of Ottawa", "aff_domain": "uottawa.ca;eecs.uottawa.ca", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nwang2022on,\ntitle={On the Generalization of Models Trained with {SGD}: Information-Theoretic Bounds and Implications},\nauthor={Ziqiao Wang and Yongyi Mao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oWZsQ8o5EA}\n}", "github": "", "project": "", "reviewers": "ULcg;Xt5A;RzUv;A9tc", "pdf_size": 0, "recommendation": "5;6;8;10", "confidence": "3;3;4;3", "correctness": "4;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "85;63;209;140", "wc_summary_review": "19;26;24;16", "wc_main_review": "357;355;177;709", "wc_review": "461;444;410;865", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "856;843;534;1473", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;1;2", "recommendation_avg": [ 7.25, 1.920286436967152 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 124.25, 56.39758416811841 ], "wc_summary_review_avg": [ 21.25, 3.960744879438715 ], "wc_main_review_avg": [ 399.5, 193.05633892726755 ], "wc_review_avg": [ 545.0, 185.66232789664144 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 926.5, 340.8302363347477 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.22549380840084865, "corr_recommendation_correctness": 0.13018891098082386, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=34477169376579483&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=oWZsQ8o5EA", "email": "uottawa.ca;eecs.uottawa.ca", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Ottawa", "aff_unique_dep": "", "aff_unique_url": "https://www.uottawa.ca", "aff_unique_abbr": "U Ottawa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "oZe7Zdia1H5", "title": "Lottery Tickets can have Structural Sparsity", "track": "main", "status": "Reject", "tldr": "", "abstract": "The lottery ticket hypothesis (LTH) has shown that dense models contain highly sparse subnetworks (i.e., $\\textit{winning tickets}$) that can be trained in isolation to match full accuracy. Despite many exciting efforts being made, there is one \"commonsense\" seldomly challenged: a winning ticket is found by iterative magnitude pruning (IMP) and hence the resultant pruned subnetworks have only unstructured sparsity. 
That gap limits the appeal of winning tickets in practice, since the highly irregular sparse patterns are challenging to accelerate on hardware. Meanwhile, directly substituting structured pruning for unstructured pruning in IMP damages performance more severely and is usually unable to locate winning tickets. \n\nIn this paper, we demonstrate $\\textbf{the first positive result}$ that a structurally sparse winning ticket can be effectively found in general. The core idea is to append ``post-processing techniques\" after each round of (unstructured) IMP, to enforce the formation of structural sparsity. Specifically, we first ``re-fill\" pruned elements back in some channels deemed to be important, and then ``re-group\" non-zero elements to create flexible group-wise structural patterns. Both our identified channel- and group-wise structural subnetworks win the lottery, with substantial inference speedups readily supported by practical hardware. Extensive experiments, conducted on diverse datasets across multiple network backbones, consistently validate our proposal, showing that the hardware acceleration roadblock of LTH is now removed. Specifically, the structural winning tickets obtain up to $\\{64.93\\%, 64.84\\%, 64.84\\%\\}$ running time savings at $\\{36\\%\\sim 80\\%, 74\\%, 58\\%\\}$ sparsity on CIFAR, Tiny-ImageNet, ImageNet, while maintaining comparable accuracy. All the codes and pre-trained models will be publicly released.", "keywords": "Lottery Ticket Hypothesis;Structural Winning Tickets", "primary_area": "", "supplementary_material": "/attachment/28655fb375ee2abf8ff5f07e167b972ff57110f0.zip", "author": "Tianlong Chen;Xuxi Chen;Xiaolong Ma;Yanzhi Wang;Zhangyang Wang", "authorids": "~Tianlong_Chen1;~Xuxi_Chen1;~Xiaolong_Ma2;~Yanzhi_Wang3;~Zhangyang_Wang1", "gender": "M;Unspecified;M;M;M", "homepage": "https://tianlong-chen.github.io;;https://xiaolongma2016.com;https://web.northeastern.edu/yanzhiwang/;https://vita-group.github.io", "dblp": ";267/9662;;;119/4026", "google_scholar": "LE3ctn0AAAAJ;afsDlKYAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;pxFyKAIAAAAJ", "orcid": "0000-0001-7774-8197;;0000-0003-3753-7648;;", "linkedin": "tianlong-chen-783862167/;;xiaolong-ma-66b98910b/;;", "or_profile": "~Tianlong_Chen1;~Xuxi_Chen1;~Xiaolong_Ma2;~Yanzhi_Wang3;~Zhangyang_Wang1", "aff": "University of Texas, Austin;University of Texas at Austin;Northeastern University;Northeastern University;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;northeastern.edu;northeastern.edu;utexas.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@misc{\nchen2022lottery,\ntitle={Lottery Tickets can have Structural Sparsity},\nauthor={Tianlong Chen and Xuxi Chen and Xiaolong Ma and Yanzhi Wang and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=oZe7Zdia1H5}\n}", "github": "", "project": "", "reviewers": "YWUh;7uex;iox5;TFc4", "site": "https://openreview.net/forum?id=oZe7Zdia1H5", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "5;4;4;4", "correctness": "1;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;4;4", "wc_summary_paper": "59;52;17;34", "wc_summary_review": "19;29;16;39", "wc_main_review": "686;178;239;286", "wc_review": "764;259;272;359", "wc_reply_reviewers": "820;32;20;0", "wc_reply_authors": "2759;361;544;596", "reply_reviewers": "2;1;1;0", "reply_authors": "7;1;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], 
"confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 1.299038105676658 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 40.5, 16.347782724271816 ], "wc_summary_review_avg": [ 25.75, 9.03811374126261 ], "wc_main_review_avg": [ 347.25, 199.29046013294263 ], "wc_review_avg": [ 413.5, 205.9811884614709 ], "wc_reply_reviewers_avg": [ 218.0, 347.75278575447817 ], "wc_reply_authors_avg": [ 1065.0, 981.9182756217546 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 2.48746859276655 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.7777777777777778, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oLt8qVW9gLgJ:scholar.google.com/&scioq=Lottery+Tickets+can+have+Structural+Sparsity&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "University of Texas at Austin;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.northeastern.edu", "aff_unique_abbr": "UT Austin;NEU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Leveraging unlabeled data to predict out-of-distribution performance", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6847", "id": "o_HsiMPYh_x", "poster": "", "openreview": "https://openreview.net/forum?id=o_HsiMPYh_x", "slides": "https://iclr.cc/virtual/2022/poster/6847", "video": "https://iclr.cc/virtual/2022/poster/6847", "author_site": "Saurabh Garg, Sivaraman Balakrishnan, Zachary Lipton, Behnam Neyshabur, Hanie Sedghi", "tldr": "", "abstract": "Real-world machine learning deployments are characterized by mismatches between the source (training) and target (test) distributions\nthat may cause performance drops. In this work, we investigate methods for predicting the target domain accuracy using only labeled source data and unlabeled target data. We propose Average Thresholded Confidence (ATC), a practical method that learns a \\emph{threshold} on the model's confidence, predicting accuracy as the fraction of unlabeled examples for which model confidence exceeds that threshold. ATC outperforms previous methods across several model architectures, types of distribution shifts (e.g., due to synthetic corruptions, dataset reproduction, or novel subpopulations), and datasets (\\textsc{Wilds}-FMoW, ImageNet, \\breeds, CIFAR, and MNIST). In our experiments, ATC estimates target performance $2\\text{--}4\\times$ more accurately than prior methods. We also explore the theoretical foundations of the problem, proving that, in general, identifying the accuracy is just as hard as identifying the optimal predictor and thus, the efficacy of any method rests upon (perhaps unstated) assumptions on the nature of the shift. 
Finally, analyzing our method on some toy distributions, we provide insights concerning when it works.\n\n", "keywords": "Distribution Shift;OOD error prediction;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Saurabh Garg;Sivaraman Balakrishnan;Zachary Chase Lipton;Behnam Neyshabur;Hanie Sedghi", "authorids": "~Saurabh_Garg3;~Sivaraman_Balakrishnan1;~Zachary_Chase_Lipton1;~Behnam_Neyshabur1;~Hanie_Sedghi1", "gender": "M;M;Unspecified;M;F", "homepage": "http://saurabhgarg1996.github.io/;http://www.stat.cmu.edu/~siva/;http://zacklipton.com;https://www.neyshabur.net;https://haniesedghi.com/", "dblp": "80/208;52/10671;;131/9898;66/8332", "google_scholar": "SAnJ1hIAAAAJ;o7yFQXUAAAAJ;MN9Kfg8AAAAJ;e1ucbCYAAAAJ;_9GX96fDWAMC", "orcid": ";;;;", "linkedin": "saurabh-garg-b680b5b8/;;;;hanie-sedghi-71bb2582", "or_profile": "~Saurabh_Garg3;~Sivaraman_Balakrishnan1;~Zachary_Chase_Lipton1;~Behnam_Neyshabur1;~Hanie_Sedghi1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Google;Google Research, Brain team", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;google.com;google.com", "position": "PhD student;Assistant Professor;Assistant Professor;Research Scientist;Senior Research Scientist", "bibtex": "@inproceedings{\ngarg2022leveraging,\ntitle={Leveraging unlabeled data to predict out-of-distribution performance},\nauthor={Saurabh Garg and Sivaraman Balakrishnan and Zachary Chase Lipton and Behnam Neyshabur and Hanie Sedghi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=o_HsiMPYh_x}\n}", "github": "", "project": "", "reviewers": "t6Wr;zA7T;M2yw;uW7x;s1vP", "pdf_size": 0, "recommendation": "5;6;8;8;8", "confidence": "4;3;4;3;4", "correctness": "3;4;4;4;4", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "85;104;175;59;126", "wc_summary_review": "9;42;60;55;38", "wc_main_review": "174;135;208;355;511", "wc_review": "268;281;443;469;675", "wc_reply_reviewers": "0;0;51;28;166", "wc_reply_authors": "797;50;470;797;1177", "reply_reviewers": "0;0;1;1;2", "reply_authors": "2;1;1;2;3", "recommendation_avg": [ 7.0, 1.2649110640673518 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 109.8, 39.3517471022571 ], "wc_summary_review_avg": [ 40.8, 17.837040113202637 ], "wc_main_review_avg": [ 276.6, 138.8331372547635 ], "wc_review_avg": [ 427.2, 148.40539073766828 ], "wc_reply_reviewers_avg": [ 49.0, 61.54023074379881 ], "wc_reply_authors_avg": [ 658.2, 377.62701174571714 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 1.8, 0.7483314773547883 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7905694150420949, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5646390275734787221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=o_HsiMPYh_x", "email": "cmu.edu;cmu.edu;cmu.edu;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "oaKw-GmBZZ", "title": "Learning Time-dependent PDE Solver using Message Passing Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the main challenges in solving time-dependent partial differential equations is to develop computationally efficient solvers that are accurate and stable. Here, we introduce a general graph neural network approach to finding efficient PDE solvers through learning using message-passing models. We first introduce domain invariant features for PDE-data inspired by classical PDE solvers for an efficient physical representation. Next, we use graphs to represent PDE-data on an unstructured mesh and show that message passing graph neural networks (MPGNN) can parameterize governing equations, and as a result, efficiently learn accurate solver schemes for linear/nonlinear PDEs. We further show that the solvers are independent of the initial training geometry and can solve the same PDE on more complex domains. Lastly, we show that a recurrent graph neural network approach can find a temporal sequence of solutions to a PDE.", "keywords": "graph neural networks;partial differential equations;time-dependent PDE;message passing graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Pourya Pilva;Ahmad Zareei", "authorids": "pourya.pilva@rwth-aachen.de;~Ahmad_Zareei1", "gender": ";", "homepage": ";https://ahmadzareei.github.io/azareei/", "dblp": ";", "google_scholar": ";iaXmWTEAAAAJ", "orcid": ";", "linkedin": ";ahmad-zareei-61390210/", "or_profile": "pourya.pilva@rwth-aachen.de;~Ahmad_Zareei1", "aff": ";Harvard University", "aff_domain": ";harvard.edu", "position": ";Postdoc", "bibtex": "@misc{\npilva2022learning,\ntitle={Learning Time-dependent {PDE} Solver using Message Passing Graph Neural Networks},\nauthor={Pourya Pilva and Ahmad Zareei},\nyear={2022},\nurl={https://openreview.net/forum?id=oaKw-GmBZZ}\n}", "github": "", "project": "", "reviewers": "kLFD;Eu7u;neAM;LoDm", "site": "https://openreview.net/forum?id=oaKw-GmBZZ", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;5;3", "correctness": "3;3;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "0;2;1;0", "wc_summary_paper": "95;79;37;88", "wc_summary_review": "60;160;34;19", "wc_main_review": "277;513;357;98", "wc_review": "432;752;428;205", "wc_reply_reviewers": "94;436;110;19", "wc_reply_authors": "440;801;738;402", "reply_reviewers": "1;2;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 74.75, 22.520823697191894 ], "wc_summary_review_avg": [ 68.25, 54.96532998172575 ], "wc_main_review_avg": [ 311.25, 149.53657579334896 ], "wc_review_avg": [ 454.25, 194.91328199997045 ], "wc_reply_reviewers_avg": [ 164.75, 160.3299332626319 ], "wc_reply_authors_avg": [ 595.25, 176.18083749375242 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3244428422615251, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10826076796746966034&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6602", "id": "oapKSVM2bcj", "poster": "", "openreview": "https://openreview.net/forum?id=oapKSVM2bcj", "slides": "https://iclr.cc/virtual/2022/poster/6602", "video": "https://iclr.cc/virtual/2022/poster/6602", "tldr": "", "abstract": "Tensor computations underlie modern scientific computing and deep learning.\nA number of tensor frameworks emerged varying in execution model, hardware support, memory management, model definition, etc.\nHowever, tensor operations in all frameworks follow the same paradigm.\nRecent neural network architectures demonstrate demand for higher expressiveness of tensor operations.\nThe current paradigm is not suited to write readable, reliable, or easy-to-modify code for multidimensional tensor manipulations. \nMoreover, some commonly used operations do not provide sufficient checks and can break a tensor structure.\nThese mistakes are elusive as no tools or tests can detect them.\nIndependently, API discrepancies complicate code transfer between frameworks.\nWe propose einops notation: a uniform and generic way to manipulate tensor structure, that significantly improves code readability and flexibility by focusing on the structure of input and output tensors.\nWe implement einops notation in a Python package that efficiently supports multiple widely used frameworks and provides framework-independent minimalist API for tensor manipulations.", "keywords": "tensor manipulations;tensor transformation;einops;einstein notation;einsum", "primary_area": "", "supplementary_material": "", "author": "Alex Rogozhnikov", "authorids": "~Alex_Rogozhnikov1", "gender": "M", "homepage": "https://arogozhnikov.github.io/", "dblp": "", "google_scholar": "tQLqzbMAAAAJ", "orcid": "0000-0002-7413-9553", "linkedin": "alexrogozhnikov/", "or_profile": "~Alex_Rogozhnikov1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nrogozhnikov2022einops,\ntitle={Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation},\nauthor={Alex Rogozhnikov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oapKSVM2bcj}\n}", "github": "", "project": "", "reviewers": "p5Km;gRMH;ntk4;c6LK", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "4;5;4;4", "correctness": "2;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "60;74;88;124", "wc_summary_review": "64;115;127;46", "wc_main_review": "442;1515;173;476", "wc_review": "566;1704;388;646", "wc_reply_reviewers": "0;629;0;0", "wc_reply_authors": "522;3558;322;327", "reply_reviewers": "0;1;0;0", "reply_authors": "1;5;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.5, 23.806511714234826 ], "wc_summary_review_avg": [ 88.0, 33.87476937190864 ], "wc_main_review_avg": [ 651.5, 512.1730664531277 ], "wc_review_avg": [ 826.0, 515.4434983584525 ], "wc_reply_reviewers_avg": [ 157.25, 
272.36498949020597 ], "wc_reply_authors_avg": [ 1182.25, 1374.0088018277029 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5443310539518174, "corr_recommendation_correctness": 0.9428090415820635, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1927726956296600102&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=oapKSVM2bcj", "email": "", "author_num": 1 }, { "id": "obi9EkyVeED", "title": "FedDrop: Trajectory-weighted Dropout for Efficient Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) enables edge clients to train collaboratively while preserving individual's data privacy. As clients do not inherently share identical data distributions, they may disagree in the direction of parameter updates, resulting in high compute and communication costs in comparison to centralized learning. Recent advances in FL focus on reducing data transmission during training; yet they neglected the increase of computational cost that dwarfs the merit of reduced communication. To this end, we propose FedDrop, which introduces channel-wise weighted dropout layers between convolutions to accelerate training while minimizing their impact on convergence. Empirical results show that FedDrop can drastically reduce the amount of FLOPs required for training with a small increase in communication, and push the Pareto frontier of communication/computation trade-off further than competing FL algorithms.", "keywords": "federated learning;efficient training;dropout;stochastic model;channel selection", "primary_area": "", "supplementary_material": "", "author": "Dongping Liao;Xitong Gao;Yiren Zhao;Hao Dai;Li Li;Kafeng Wang;Kejiang Ye;Yang Wang;Cheng-zhong Xu", "authorids": "~Dongping_Liao1;~Xitong_Gao1;~Yiren_Zhao2;~Hao_Dai1;~Li_Li10;~Kafeng_Wang1;kj.ye@siat.ac.cn;yang.wang1@siat.ac.cn;~Cheng-zhong_Xu1", "gender": "M;M;M;M;M;M;;;", "homepage": "https://github.com/ldpbuaa;https://github.com/admk;https://aaronzhao.me;;https://www.fst.um.edu.mo/personal/llili/;https://dblp.org/pid/245/3352.html;;;", "dblp": "158/7101;140/2071;https://dblp.uni-trier.de/pers/hd/z/Zhao:Yiren;;53/2189-64;245/3352.html;;;", "google_scholar": "fxg96-oAAAAJ;-YIUCL8AAAAJ;lOOmgEgAAAAJ;;uLzU3OcAAAAJ;_cF9-CkAAAAJ;;;", "orcid": ";0000-0002-2063-2051;;0000-0003-1018-2162;0000-0002-2044-8289;;;;", "linkedin": ";;yiren-aaron-zhao-baa8b5116/;;;;;;", "or_profile": "~Dongping_Liao1;~Xitong_Gao1;~Yiren_Zhao2;~Hao_Dai1;~Li_Li10;~Kafeng_Wang1;kj.ye@siat.ac.cn;yang.wang1@siat.ac.cn;~Cheng-zhong_Xu1", "aff": "University of Macau;Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences;Imperial College London;;University of Macau;Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences;;;", "aff_domain": "um.edu.mo;siat.ac.cn;ic.ac.uk;;um.edu.mo;siat.ac.cn;;;", "position": "PhD student;Researcher;Assistant Professor;;Assistant Professor;PhD student;;;", "bibtex": "@misc{\nliao2022feddrop,\ntitle={FedDrop: Trajectory-weighted Dropout for Efficient Federated Learning},\nauthor={Dongping Liao and Xitong Gao and Yiren Zhao and Hao Dai and Li Li and Kafeng Wang and Kejiang Ye and Yang Wang and Cheng-zhong Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=obi9EkyVeED}\n}", "github": "", "project": "", "reviewers": 
"wCnx;A6bP;guLZ;tPTk", "site": "https://openreview.net/forum?id=obi9EkyVeED", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;3", "correctness": "2;4;4;3", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "19;78;62;46", "wc_summary_review": "33;42;64;40", "wc_main_review": "1092;280;226;153", "wc_review": "1144;400;352;239", "wc_reply_reviewers": "0;252;0;0", "wc_reply_authors": "1376;645;399;245", "reply_reviewers": "0;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 51.25, 21.787324296480282 ], "wc_summary_review_avg": [ 44.75, 11.60549438843516 ], "wc_main_review_avg": [ 437.75, 380.410551246939 ], "wc_review_avg": [ 533.75, 357.14309107135193 ], "wc_reply_reviewers_avg": [ 63.0, 109.11920087683927 ], "wc_reply_authors_avg": [ 666.25, 433.89824555994693 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1005098190534502148&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "University of Macau;Shenzhen Institute of Advanced Technology;Imperial College London;Chinese Academy of Sciences", "aff_unique_dep": ";;;Shenzhen Institutes of Advanced Technology", "aff_unique_url": "https://www.um.edu.mo;http://www.siat.cas.cn;https://www.imperial.ac.uk;http://www.cas.cn", "aff_unique_abbr": "UM;SIAT;ICL;CAS", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Macau SAR;Shenzhen;", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United Kingdom" }, { "id": "oe8U8WETg4t", "title": "Linear Backpropagation Leads to Faster Convergence", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Backpropagation is widely used for calculating gradients in deep neural networks (DNNs). Applied often along with stochastic gradient descent (SGD) or its variants, backpropagation is considered as a de-facto choice in a variety of machine learning tasks including DNN training and adversarial attack/defense. Nevertheless, unlike SGD which has been intensively studied over the past years, backpropagation is somehow overlooked. In this paper, we study the very recent method called ``linear backpropagation'' (LinBP), which modifies the standard backpropagation and can improve the transferability in black-box adversarial attack. By providing theoretical analyses on LinBP in neural-network-involved learning tasks including white-box adversarial attack and model training, we will demonstrate that, somewhat surprisingly, LinBP can lead to faster convergence in these tasks. 
We will also confirm our theoretical results with extensive experiments.", "keywords": "Convergence analysis;Backpropagation analysis", "primary_area": "", "supplementary_material": "/attachment/4130b90842c8f6b2542a20abb75018dd454e33cf.zip", "author": "Li Ziang;Yiwen Guo;Haodi Liu;Changshui Zhang", "authorids": "~Li_Ziang1;~Yiwen_Guo1;~Haodi_Liu1;~Changshui_Zhang2", "gender": "M;;M;M", "homepage": "https://github.com/lzalza;;;http://bigeye.au.tsinghua.edu.cn/english/Introduction.html", "dblp": ";;309/7001;z/ChangshuiZhang", "google_scholar": ";;;GL9M37YAAAAJ", "orcid": ";;;", "linkedin": ";;haodi-liu-8643a5167/;", "or_profile": "~Li_Ziang1;~Yiwen_Guo1;~Haodi_Liu1;~Changshui_Zhang2", "aff": "Tsinghua University;;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;tsinghua.edu.cn;mail.tsinghua.edu.cn", "position": "PhD student;;PhD student;Full Professor", "bibtex": "@misc{\nziang2022linear,\ntitle={Linear Backpropagation Leads to Faster Convergence},\nauthor={Li Ziang and Yiwen Guo and Haodi Liu and Changshui Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=oe8U8WETg4t}\n}", "github": "", "project": "", "reviewers": "wUME;TtvR;oSuV", "site": "https://openreview.net/forum?id=oe8U8WETg4t", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "5;3;4", "correctness": "2;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "56;27;50", "wc_summary_review": "12;25;49", "wc_main_review": "214;263;497", "wc_review": "282;315;596", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "65;120;39", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 44.333333333333336, 12.498888839501783 ], "wc_summary_review_avg": [ 28.666666666666668, 15.3260852434302 ], "wc_main_review_avg": [ 324.6666666666667, 123.48909083621741 ], "wc_review_avg": [ 397.6666666666667, 140.88845075291144 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 74.66666666666667, 33.76717669901087 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yB8vz8j7BQkJ:scholar.google.com/&scioq=Linear+Backpropagation+Leads+to+Faster+Convergence&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "of3y9kPkAWA", "title": "Reinforcement Learning with Predictive Consistent Representations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning informative representations from image-based observations is a fundamental problem in deep Reinforcement Learning (RL). However, data inefficiency remains a significant barrier. To this end, we investigate Predictive Consistent Representations (PCR) that enforces predictive consistency on a learned dynamic model. 
Unlike previous algorithms that simply exploit a forward dynamics model, the PCR agent is trained to predict the future state and retain consistency across the predicted state of observation and its multiple views, which is demonstrated through careful ablation experiments. We empirically show that PCR outperforms the current state-of-the-art baselines in terms of data efficiency on a series of pixel-based control tasks in the DeepMind control suite. Notably, on challenging tasks like Cheetah-run, PCR reaches a 47.4% improvement when environmental steps are limited to 100k steps.", "keywords": "Reinforcement Learning;Data Efficiency;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/2c5581c36a5a92064d5349157e47e1c3f319a48e.zip", "author": "Tao Huang;Xiao Chen;Jiachen Wang", "authorids": "~Tao_Huang6;~Xiao_Chen4;~Jiachen_Wang1", "gender": "M;M;", "homepage": "https://taohuang13.github.io/; https://123.com;https://peppacat.github.io/", "dblp": ";;", "google_scholar": "9jRIXGsAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tao_Huang6;~Xiao_Chen4;~Jiachen_Wang1", "aff": "ShanghaiTech University;Google;Shanghaitech University", "aff_domain": "shanghaitech.edu.cn;google.com;shanghaitech.edu.cn", "position": "Undergrad student;Full Professor;Undergrad student", "bibtex": "@misc{\nhuang2022reinforcement,\ntitle={Reinforcement Learning with Predictive Consistent Representations},\nauthor={Tao Huang and Xiao Chen and Jiachen Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=of3y9kPkAWA}\n}", "github": "", "project": "", "reviewers": "uVzJ;CZ6H;MAiw;AtHm;tHbZ", "site": "https://openreview.net/forum?id=of3y9kPkAWA", "pdf_size": 0, "recommendation": "1;3;5;5;5", "confidence": "4;3;4;4;4", "correctness": "2;3;3;3;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "1;2;2;2;3", "wc_summary_paper": "79;189;87;65;613", "wc_summary_review": "41;93;32;134;88", "wc_main_review": "916;262;291;897;624", "wc_review": "1036;544;410;1096;1325", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.8, 1.6 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 206.6, 207.89766713457848 ], "wc_summary_review_avg": [ 77.6, 37.26982693815467 ], "wc_main_review_avg": [ 598.0, 282.25732939996436 ], "wc_review_avg": [ 882.2, 347.21428542040144 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.25, "corr_recommendation_correctness": 0.8750000000000001, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:06e1mqJ1uXkJ:scholar.google.com/&scioq=Reinforcement+Learning+with+Predictive+Consistent+Representations&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "ShanghaiTech University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.google.com", "aff_unique_abbr": "ShanghaiTech;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "id": "ofLwshMBL_H", "title": "Continual Learning Using Task 
Conditional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conventional deep learning models have limited capacity in learning multiple tasks sequentially. The issue of forgetting the previously learned tasks in continual learning is known as catastrophic forgetting or interference. When the input data or the goal of learning changes, a continual model will learn and adapt to the new status. However, the model will not remember or recognise any revisits to the previous states. This causes performance reduction and re-training curves in dealing with periodic or irregularly reoccurring changes in the data or goals. Dynamic approaches, which assign new neuron resources to the upcoming tasks, are introduced to address this issue. However, most of the dynamic methods need task information about the upcoming tasks during the inference phase to activate the corresponding neurons. To address this issue, we introduce Task Conditional Neural Network which allows the model to identify the task information automatically. The proposed model can continually learn and embed new tasks into the model without losing the information about previously learned tasks. We evaluate the proposed model combined with the mixture of experts approach on the MNIST and CIFAR100 datasets and show how it significantly improves the continual learning process without requiring task information in advance.", "keywords": "catastrophic forgetting;continual learning", "primary_area": "", "supplementary_material": "", "author": "Honglin Li;Frieder Ganz;David J. Sharp;Payam M. Barnaghi", "authorids": "~Honglin_Li1;~Frieder_Ganz1;~David_J._Sharp1;~Payam_M._Barnaghi1", "gender": "M;M;M;M", "homepage": "https://www.imperial.ac.uk/people/honglin.li20;;;https://www.imperial.ac.uk/people/p.barnaghi", "dblp": ";;91/6655;22/4255", "google_scholar": ";u9ySZkUAAAAJ;Sbz45kEAAAAJ;D6R2cnwAAAAJ", "orcid": ";;;0000-0001-8591-9638", "linkedin": ";;;", "or_profile": "~Honglin_Li1;~Frieder_Ganz1;~David_J._Sharp1;~Payam_M._Barnaghi1", "aff": ";;Imperial College London;Imperial College London", "aff_domain": ";;ic.ac.uk;imperial.ac.uk", "position": ";;Full Professor;Full Professor", "bibtex": "@misc{\nli2022continual,\ntitle={Continual Learning Using Task Conditional Neural Networks},\nauthor={Honglin Li and Frieder Ganz and David J. Sharp and Payam M. 
Barnaghi},\nyear={2022},\nurl={https://openreview.net/forum?id=ofLwshMBL_H}\n}", "github": "", "project": "", "reviewers": "YEsG;NcCF;H38t;sgG4", "site": "https://openreview.net/forum?id=ofLwshMBL_H", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "55;46;35;60", "wc_summary_review": "22;37;8;40", "wc_main_review": "216;220;137;259", "wc_review": "293;303;180;359", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "374;241;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 49.0, 9.513148795220223 ], "wc_summary_review_avg": [ 26.75, 12.794041581923986 ], "wc_main_review_avg": [ 208.0, 44.30011286667337 ], "wc_review_avg": [ 283.75, 64.96681845373067 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 153.75, 160.77993500434064 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6073209960288813545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "PF-GNN: Differentiable particle filtering based approximation of universal graph representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5982", "id": "oh4TirnfSem", "poster": "", "openreview": "https://openreview.net/forum?id=oh4TirnfSem", "slides": "https://iclr.cc/virtual/2022/poster/5982", "video": "https://iclr.cc/virtual/2022/poster/5982", "author_site": "Mohammed Haroon Dupty, Yanfei Dong, Wee Sun Lee", "tldr": "", "abstract": "Message passing Graph Neural Networks (GNNs) are known to be limited in expressive power by the 1-WL color-refinement test for graph isomorphism. Other more expressive models either are computationally expensive or need preprocessing to extract structural features from the graph. In this work, we propose to make GNNs universal by guiding the learning process with exact isomorphism solver techniques which operate on the paradigm of $\\textit{Individualization and refinement}$ (IR), a method to artificially introduce asymmetry and further refine the coloring when 1-WL stops. Isomorphism solvers generate a search-tree of colorings whose leaves uniquely identify the graph. However, the tree grows exponentially large and needs hand-crafted pruning techniques which are not desirable from a learning perspective. We take a probabilistic view and approximate the search tree of colorings ( i.e. embeddings) by sampling multiple paths from root to leaves of the search-tree. To learn more discriminative representations, we guide the sampling process with $\\textit{particle filter}$ updates, a principled approach for sequential state estimation. 
Our algorithm is end-to-end differentiable, can be applied with any GNN as a backbone, and learns richer graph representations with only a linear increase in runtime. Experimental evaluation shows that our approach consistently outperforms leading GNN models on both synthetic benchmarks for isomorphism detection and real-world datasets.", "keywords": "Graph Neural Networks;Graph representation learning;Expressive GNN", "primary_area": "", "supplementary_material": "", "author": "Mohammed Haroon Dupty;Yanfei Dong;Wee Sun Lee", "authorids": "~Mohammed_Haroon_Dupty1;~Yanfei_Dong1;~Wee_Sun_Lee1", "gender": ";F;M", "homepage": "https://dmharoon.github.io;;http://www.comp.nus.edu.sg/~leews/", "dblp": "186/7914;;86/1498", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com.sg/citations?user=8PCrLgwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mohammed_Haroon_Dupty1;~Yanfei_Dong1;~Wee_Sun_Lee1", "aff": "National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "nus.edu;u.nus.edu;nus.edu.sg", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\ndupty2022pfgnn,\ntitle={{PF}-{GNN}: Differentiable particle filtering based approximation of universal graph representations},\nauthor={Mohammed Haroon Dupty and Yanfei Dong and Wee Sun Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oh4TirnfSem}\n}", "github": "", "project": "", "reviewers": "YGAz;tTwv;weJp;WUrz", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "2;5;5;3", "correctness": "3;3;3;4", "technical_novelty": "3;4;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "29;82;210;42", "wc_summary_review": "39;43;124;43", "wc_main_review": "268;267;447;509", "wc_review": "336;392;781;594", "wc_reply_reviewers": "146;472;318;43", "wc_reply_authors": "532;2392;1523;807", "reply_reviewers": "1;3;2;1", "reply_authors": "2;6;4;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 1.299038105676658 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.75, 71.56596607326698 ], "wc_summary_review_avg": [ 62.25, 35.688758734369 ], "wc_main_review_avg": [ 372.75, 107.50901125022033 ], "wc_review_avg": [ 525.75, 175.85843027844868 ], "wc_reply_reviewers_avg": [ 244.75, 163.907557787919 ], "wc_reply_authors_avg": [ 1313.5, 720.1279400217715 ], "reply_reviewers_avg": [ 1.75, 0.82915619758885 ], "reply_authors_avg": [ 3.5, 1.6583123951777 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.19245008972987523, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10303969868752291265&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=oh4TirnfSem", "email": "nus.edu;u.nus.edu;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "ohKxcPdAscw", "title": "Improved Generalization-Robustness Trade-off via Uncertainty Targeted Attacks", "track": "main",
"status": "Withdraw", "tldr": "", "abstract": "The deep learning models' sensitivity to small input perturbations raises security concerns and limits their use for applications where reliability is critical. While adversarial training methods aim at training more robust models, these techniques often result in a lower unperturbed (clean) test accuracy, including the most widely used Projected Gradient Descent (PGD) method. In this work, we propose uncertainty-targeted attacks (UTA), where the perturbations are obtained by maximizing the model's estimated uncertainty. We demonstrate on MNIST, Fashion-MNIST and CIFAR-10 that this approach does not drastically deteriorate the clean test accuracy relative to PGD whilst it is robust to PGD attacks. In particular, uncertainty-based attacks allow for using larger $L_\\infty$-balls around the training data points, are less prone to overfitting the attack, and yield improved generalization-robustness trade-off.", "keywords": "uncertainty estimation;adversarial-training", "primary_area": "", "supplementary_material": "", "author": "Matteo Pagliardini;Gilberto Manunza;Martin Jaggi;Tatjana Chavdarova", "authorids": "~Matteo_Pagliardini1;~Gilberto_Manunza1;~Martin_Jaggi1;~Tatjana_Chavdarova2", "gender": "M;M;M;F", "homepage": ";;https://mlo.epfl.ch;https://chavdarova.github.io", "dblp": "140/7789;;17/4402;160/6038", "google_scholar": "https://scholar.google.ch/citations?user=FXacC3oAAAAJ;;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ;", "orcid": ";;0000-0003-1579-5558;", "linkedin": ";gilberto-manunza-619379140/;;", "or_profile": "~Matteo_Pagliardini1;~Gilberto_Manunza1;~Martin_Jaggi1;~Tatjana_Chavdarova2", "aff": "Swiss Federal Institute of Technology Lausanne;;EPFL;University of California, Berkeley", "aff_domain": "epfl.ch;;epfl.ch;berkeley.edu", "position": "PhD student;;Assistant Professor;Postdoc", "bibtex": "@misc{\npagliardini2022improved,\ntitle={Improved Generalization-Robustness Trade-off via Uncertainty Targeted Attacks},\nauthor={Matteo Pagliardini and Gilberto Manunza and Martin Jaggi and Tatjana Chavdarova},\nyear={2022},\nurl={https://openreview.net/forum?id=ohKxcPdAscw}\n}", "github": "", "project": "", "reviewers": "LwLQ;XGNS;mAmo;g8mT", "site": "https://openreview.net/forum?id=ohKxcPdAscw", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;4;4", "correctness": "3;3;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "59;48;69;21", "wc_summary_review": "25;50;76;175", "wc_main_review": "522;819;535;566", "wc_review": "606;917;680;762", "wc_reply_reviewers": "0;131;256;1287", "wc_reply_authors": "594;1359;749;1858", "reply_reviewers": "0;1;1;3", "reply_authors": "1;2;1;3", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.25, 17.92170471802278 ], "wc_summary_review_avg": [ 81.5, 56.91440942327347 ], "wc_main_review_avg": [ 610.5, 121.43413852784562 ], "wc_review_avg": [ 741.25, 115.50189392386602 ], "wc_reply_reviewers_avg": [ 418.5, 509.53336495267905 ], "wc_reply_authors_avg": [ 1140.0, 503.60748604443916 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, 
"corr_recommendation_correctness": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8151331952371288671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;EPFL;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.berkeley.edu", "aff_unique_abbr": "EPFL;EPFL;UC Berkeley", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Lausanne;;Berkeley", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;United States" }, { "id": "ohYt48SDnEQ", "title": "Stochastic Variance Reduced Ensemble Adversarial Attack", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Black-box adversarial attack has attracted much attention for its practical use in deep learning applications, and it is very challenging as there is no access to the architecture and weights of the target model. Based on the hypothesis that if an example remains adversarial for multiple models, then it is more likely to transfer to other models, the ensemble-based attack methods are efficient and widely used in the black-box setting. Nevertheless, existing ensemble-based approaches simply aggregate the outputs of all models but ignore the variance of different models, leading to a rather poor local optimum. To address this issue, we propose a stochastic variance reduced ensemble attack method to boost the performance of black-box adversarial attacks. By integrating the stochastic variance reduced gradient technique into the model ensemble attack, our method can balance the gradient of different models and leads to a better local maximum, resulting in highly transferable adversarial examples. Empirical results on the standard ImageNet dataset demonstrate that our method can boost the ensemble attack performance and significantly improve the transferability of the generated adversarial examples. ", "keywords": "Ensemble-based attack;adversarial example;stochastic variance reduced model;transferability", "primary_area": "", "supplementary_material": "/attachment/c3cdb274adaaf70af8de4b3728bb36125252908e.zip", "author": "jiadong lin;Yifeng Xiong;Min Zhang;John E. Hopcroft;Kun He", "authorids": "~jiadong_lin1;xiongyf@hust.edu.cn;m_zhang@hust.edu.cn;~John_E._Hopcroft1;~Kun_He1", "gender": "M;;;M;F", "homepage": "https://dblp.org/pid/205/7870;;;http://www.cs.cornell.edu/jeh/;http://faculty.hust.edu.cn/hekun/zh_CN/more/1411001/jsjjgd/index.htm", "dblp": "205/7870;;;h/JohnEHopcroft;59/1028-1", "google_scholar": ";;;4Z6vo5QAAAAJ;YTQnGJsAAAAJ", "orcid": ";;;0000-0001-8681-6075;0000-0001-7627-4604", "linkedin": ";;;;", "or_profile": "~jiadong_lin1;xiongyf@hust.edu.cn;m_zhang@hust.edu.cn;~John_E._Hopcroft1;~Kun_He1", "aff": ";;;Department of Computer Science, Cornell University;Huazhong University of Sceince and Technology", "aff_domain": ";;;cs.cornell.edu;hust.edu.cn", "position": ";;;Full Professor;Full Professor", "bibtex": "@misc{\nlin2022stochastic,\ntitle={Stochastic Variance Reduced Ensemble Adversarial Attack},\nauthor={jiadong lin and Yifeng Xiong and Min Zhang and John E. 
Hopcroft and Kun He},\nyear={2022},\nurl={https://openreview.net/forum?id=ohYt48SDnEQ}\n}", "github": "", "project": "", "reviewers": "PxT9;B4NX;H2va;FzzK", "site": "https://openreview.net/forum?id=ohYt48SDnEQ", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "62;57;49;51", "wc_summary_review": "11;40;21;21", "wc_main_review": "239;330;140;136", "wc_review": "312;427;210;208", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 54.75, 5.11737237261468 ], "wc_summary_review_avg": [ 23.25, 10.497023387608508 ], "wc_main_review_avg": [ 211.25, 80.01679511202633 ], "wc_review_avg": [ 289.25, 89.96492372030335 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kiIJJs7HLyYJ:scholar.google.com/&scioq=Stochastic+Variance+Reduced+Ensemble+Adversarial+Attack&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Cornell University;Huazhong University of Science and Technology", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.cornell.edu;http://www.hust.edu.cn", "aff_unique_abbr": "Cornell;HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "title": "Neural Network Approximation based on Hausdorff distance of Tropical Zonotopes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5971", "id": "oiZJwC_fyS", "poster": "", "openreview": "https://openreview.net/forum?id=oiZJwC_fyS", "slides": "https://iclr.cc/virtual/2022/poster/5971", "video": "https://iclr.cc/virtual/2022/poster/5971", "author_site": "Panagiotis Misiakos, Georgios Smyrnis, George Retsinas, Petros Maragos", "tldr": "", "abstract": "In this work, we theoretically contribute to neural network approximation by providing a novel tropical geometrical viewpoint on structured neural network compression. In particular, we show that the approximation error between two neural networks with ReLU activations and one hidden layer depends on the Hausdorff distance of the tropical zonotopes of the networks. This theorem comes as a first step towards a purely geometrical interpretation of neural network approximation. Based on this theoretical contribution, we propose geometrical methods that employ the K-means algorithm to compress the fully connected parts of ReLU-activated deep neural networks. We analyze the error bounds of our algorithms theoretically based on our approximation theorem and evaluate them empirically on neural network compression. Our experiments follow a proof-of-concept strategy and indicate that our geometrical tools achieve improved performance over relevant tropical geometry techniques and can be competitive against non-tropical methods.
", "keywords": "Tropical Geometry;Zonotopes;Hausdorff Approximation;Neural Network Compression", "primary_area": "", "supplementary_material": "/attachment/e4ebfc4ae844913fb8a8941da483a5e99aebc3cb.zip", "author": "Panagiotis Misiakos;Georgios Smyrnis;George Retsinas;Petros Maragos", "authorids": "~Panagiotis_Misiakos1;~Georgios_Smyrnis1;~George_Retsinas2;~Petros_Maragos1", "gender": "M;M;M;M", "homepage": "https://acl.inf.ethz.ch/people/panosm/;;http://users.iit.demokritos.gr/~georgeretsi/;http://robotics.ntua.gr/members/maragos/", "dblp": "270/4194;255/9114;171/5669;22/4003", "google_scholar": "PlqKbB4AAAAJ;;https://scholar.google.gr/;A2XydgGCY9gC", "orcid": ";;;", "linkedin": ";;george-retsinas-9b073b88/;petros-maragos-76087b92/", "or_profile": "~Panagiotis_Misiakos1;~Georgios_Smyrnis1;~George_Retsinas2;~Petros_Maragos1", "aff": "Department of Computer Science, ETHZ - ETH Zurich;Google;National Technical University of Athens;National Technical University of Athens", "aff_domain": "inf.ethz.ch;google.com;ntua.gr;ntua.gr", "position": "PhD student;Student Researcher;Postdoc;Full Professor", "bibtex": "@inproceedings{\nmisiakos2022neural,\ntitle={Neural Network Approximation based on Hausdorff distance of Tropical Zonotopes},\nauthor={Panagiotis Misiakos and Georgios Smyrnis and George Retsinas and Petros Maragos},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oiZJwC_fyS}\n}", "github": "", "project": "", "reviewers": "FTWK;srEK;ayne;mhFZ", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "2;1;3;2", "correctness": "3;3;3;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "50;62;49;108", "wc_summary_review": "75;103;107;67", "wc_main_review": "286;101;474;110", "wc_review": "411;266;630;285", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "378;203;253;81", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 2.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.25, 24.076700355322778 ], "wc_summary_review_avg": [ 88.0, 17.291616465790582 ], "wc_main_review_avg": [ 242.75, 152.53093948442066 ], "wc_review_avg": [ 398.0, 145.07411898750237 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 228.75, 106.48562109505677 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14835536009355422877&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=oiZJwC_fyS", "email": "inf.ethz.ch;google.com;ntua.gr;ntua.gr", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "ETH Zurich;Google;National Technical University of Athens", "aff_unique_dep": "Department of Computer Science;Google;", "aff_unique_url": "https://www.ethz.ch;https://www.google.com;https://www.ntua.gr", "aff_unique_abbr": "ETHZ;Google;NTUA", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Zurich;Mountain View;", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "Switzerland;United States;Greece" }, { "id": "oiy9BAuqnDg", "title": "Information Condensing Active Learning", "track": "main", 
"status": "Withdraw", "tldr": "", "abstract": "We introduce Information Condensing Active Learning, a batch mode model agnostic Active Learning method targeted at Deep Bayesian Active Learning that focuses on acquiring labels for points which have as much information as possible about the still unacquired points. ICAL uses the Hilbert Schmidt Independence Criterion (HSIC) to measure the strength of the dependency between a candidate batch of points and the unlabeled set. We develop key optimizations that allow us to scale our method to large unlabeled sets. We show significant improvements in terms of model accuracy and negative log likelihood on several image datasets compared to state of the art batch mode AL methods for deep learning.", "keywords": "active learning", "primary_area": "", "supplementary_material": "/attachment/91af388f5694f6a1c71eb938fbcffc95fcb6c601.zip", "author": "Siddhartha Jain;Ge Liu;David Gifford", "authorids": "~Siddhartha_Jain1;~Ge_Liu2;~David_Gifford1", "gender": "M;F;M", "homepage": "https://tmfs10.github.io/;http://www.mit.edu/~geliu/;http://giffordlab.mit.edu", "dblp": "81/8212;;g/DavidKGifford", "google_scholar": "mBJIa8cAAAAJ;P6EahzcAAAAJ;", "orcid": ";0000-0001-9383-5186;", "linkedin": ";;", "or_profile": "~Siddhartha_Jain1;~Ge_Liu2;~David_Gifford1", "aff": "Amazon;Amazon AWS AI;Massachusetts Institute of Technology", "aff_domain": "amazon.com;amazon.com;mit.edu", "position": "Applied Scientist;Researcher;Full Professor", "bibtex": "@misc{\njain2022information,\ntitle={Information Condensing Active Learning},\nauthor={Siddhartha Jain and Ge Liu and David Gifford},\nyear={2022},\nurl={https://openreview.net/forum?id=oiy9BAuqnDg}\n}", "github": "", "project": "", "reviewers": "TpNn;BVRg;72MN;JDae", "site": "https://openreview.net/forum?id=oiy9BAuqnDg", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;2;3", "correctness": "2;1;1;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "4;62;89;73", "wc_summary_review": "37;40;21;22", "wc_main_review": "488;353;131;312", "wc_review": "529;455;241;407", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 1.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 57.0, 32.07023542164915 ], "wc_summary_review_avg": [ 30.0, 8.573214099741124 ], "wc_main_review_avg": [ 321.0, 127.56762912275198 ], "wc_review_avg": [ 408.0, 105.75916035975324 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.20751433915982243, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=427288786869855412&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Amazon;Massachusetts Institute of Technology", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://web.mit.edu", "aff_unique_abbr": "Amazon;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Decentralized Learning for Overparameterized Problems: A 
Multi-Agent Kernel Approximation Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6887", "id": "oj2yn1Q4Ett", "poster": "", "openreview": "https://openreview.net/forum?id=oj2yn1Q4Ett", "slides": "https://iclr.cc/virtual/2022/poster/6887", "video": "https://iclr.cc/virtual/2022/poster/6887", "author_site": "Prashant Khanduri, Haibo Yang, Mingyi Hong, Jia Liu, Hoi To Wai, Sijia Liu", "tldr": "", "abstract": "This work develops a novel framework for communication-efficient distributed learning where the models to be learned are overparameterized. We focus on a class of kernel learning problems (which includes the popular neural tangent kernel (NTK) learning as a special case) and propose a novel {\\it multi-agent kernel approximation} technique that allows the agents to distributedly estimate the full kernel function, and subsequently perform decentralized optimization, without directly exchanging any local data or parameters. The proposed framework is a significant departure from the classical consensus-based approaches, because the agents do not exchange problem parameters, and no consensus is required. We analyze the optimization and the generalization performance of the proposed framework for the $\\ell_2$ loss. We show that with $M$ agents and $N$ total samples when certain generalized inner-product kernels (resp. the random features kernel) are used, each agent needs to communicate $\\mathcal{O}\\big({N^2}/{M}\\big)$ bits (resp. $\\mathcal{O}\\big(N \\sqrt{N}/M \\big)$ real values) to achieve minimax optimal generalization performance. We validate the theoretical results on 90 UCI benchmarking datasets (with average data size $N \\approx 1000$) and show that each agent needs to share a total of $200N/M$ bits (resp. $3N/M$ real values) to closely match the performance of the centralized algorithms, and these numbers are independent of parameter and feature dimensions. 
", "keywords": "distributed optimization;over-parameterized optimization;kernel learning", "primary_area": "", "supplementary_material": "/attachment/875eb6b57775c3f8d2a374fa8f984ff55905bab6.zip", "author": "Prashant Khanduri;Haibo Yang;Mingyi Hong;Jia Liu;Hoi To Wai;Sijia Liu", "authorids": "~Prashant_Khanduri1;~Haibo_Yang1;~Mingyi_Hong1;~Jia_Liu1;~Hoi_To_Wai1;~Sijia_Liu1", "gender": "M;M;M;M;M;M", "homepage": "https://sites.google.com/view/khanduri-prashant/home?authuser=0;https://haibo-yang-osu.github.io/homepage/;http://people.ece.umn.edu/~mhong/mingyi.html;https://kevinliu-osu.github.io/index.html;http://www1.se.cuhk.edu.hk/~htwai/;https://lsjxjtu.github.io/", "dblp": "158/4888;43/7829-1;57/8053;;29/9875;128/6972-1", "google_scholar": ";eyy22VoAAAAJ;qRnP-p0AAAAJ;Ofx3dScAAAAJ;https://scholar.google.com.hk/citations?user=5-J7LeMAAAAJ;C7dO_UgAAAAJ", "orcid": ";0000-0002-3245-2728;;;;", "linkedin": "prashant-khanduri-0497894b/;;;;;", "or_profile": "~Prashant_Khanduri1;~Haibo_Yang1;~Mingyi_Hong1;~Jia_Liu1;~Hoi_To_Wai1;~Sijia_Liu1", "aff": "University of Minnesota, Minneapolis;Ohio State University;University of Minnesota, Minneapolis;The Ohio State University;The Chinese University of Hong Kong;Michigan State University", "aff_domain": "umn.edu;osu.edu;umn.edu;osu.edu;cuhk.edu.hk;msu.edu", "position": "Postdoc;PhD student;Associate Professor;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nkhanduri2022decentralized,\ntitle={Decentralized Learning for Overparameterized Problems: A Multi-Agent Kernel Approximation Approach},\nauthor={Prashant Khanduri and Haibo Yang and Mingyi Hong and Jia Liu and Hoi To Wai and Sijia Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oj2yn1Q4Ett}\n}", "github": "", "project": "", "reviewers": "gN2n;ut1q;MEfo", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "5;4;4", "correctness": "3;4;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;0", "wc_summary_paper": "40;115;105", "wc_summary_review": "70;38;232", "wc_main_review": "439;283;180", "wc_review": "549;436;517", "wc_reply_reviewers": "0;25;0", "wc_reply_authors": "1839;668;1854", "reply_reviewers": "0;1;0", "reply_authors": "3;1;3", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 86.66666666666667, 33.2498955721 ], "wc_summary_review_avg": [ 113.33333333333333, 84.92087820763251 ], "wc_main_review_avg": [ 300.6666666666667, 106.47169681291936 ], "wc_review_avg": [ 500.6666666666667, 47.55581516024676 ], "wc_reply_reviewers_avg": [ 8.333333333333334, 11.785113019775793 ], "wc_reply_authors_avg": [ 1453.6666666666667, 555.5839770507745 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "pdf": "https://openreview.net/pdf?id=oj2yn1Q4Ett", "email": "umn.edu;osu.edu;umn.edu;osu.edu;cuhk.edu.hk;msu.edu", "author_num": 6, "aff_unique_index": "0;1;0;1;2;3", "aff_unique_norm": 
"University of Minnesota;Ohio State University;Chinese University of Hong Kong;Michigan State University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.minnesota.edu;https://www.osu.edu;https://www.cuhk.edu.hk;https://www.msu.edu", "aff_unique_abbr": "UMN;OSU;CUHK;MSU", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Minneapolis;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "okmZ6-zU6Lz", "title": "Quantifying the Controllability of Coarsely Characterized Networked Dynamical Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the controllability of large-scale networked dynamical systems when complete knowledge of network structure is unavailable. In particular, we establish the power of learning community-based representations to understand the ability of a group of control nodes to steer the network to a target state. We are motivated by abundant real-world examples, ranging from power and water systems to brain networks, in which practitioners do not have access to fine-scale knowledge of the network. Rather, knowledge is limited to coarse summaries of network structure. Existing work on \"model order reduction\" starts with full knowledge of fine-scale structure and derives a coarse-scale (lower-dimensional) model that well-approximates the fine-scale system. In contrast, in this paper the controllability aspects of the coarse system are derived from coarse summaries {\\em without} knowledge of the fine-scale structure. We study under what conditions measures of controllability for the (unobserved) fine-scale system can be well approximated by measures of controllability derived from the (observed) coarse-scale system. To accomplish this, we require knowledge of some inherent parametric structure of the fine-scale system that makes this type of inverse problem feasible. To this end, we assume that the underlying fine-scale network is generated by the stochastic block model (SBM) often studied in community detection. We quantify controllability using the ``average\ncontrollability'' metric and bound the difference between the controllability of the fine-scale system and that of the coarse-scale system. 
Our analysis indicates that underlying structure is necessary both to make learning community-based representations possible and to accurately quantify the controllability of coarsely characterized networked dynamical systems.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/65fa51cda868bd15455eb1ef0ccb4d0298052f90.zip", "author": "Nafiseh Ghoroghchian;Rajasekhar Anguluri;Gautam Dasarathy;Stark Draper", "authorids": "~Nafiseh_Ghoroghchian1;~Rajasekhar_Anguluri1;~Gautam_Dasarathy1;~Stark_Draper1", "gender": "F;M;M;M", "homepage": ";;http://gautamdasarathy.com;https://www.ece.utoronto.ca/people/draper-s/", "dblp": ";;24/8356;", "google_scholar": "o0dHsvEAAAAJ;https://scholar.google.com/citations?hl=en;iSL1cKsAAAAJ;", "orcid": ";0000-0003-2537-2778;;", "linkedin": "nafisehGhoroghchian/;anguluri-rajasekhar-32292723/;;", "or_profile": "~Nafiseh_Ghoroghchian1;~Rajasekhar_Anguluri1;~Gautam_Dasarathy1;~Stark_Draper1", "aff": "Toronto University;Arizona State University;Arizona State University;Toronto University", "aff_domain": "utoronto.ca;asu.edu;asu.edu;utoronto.ca", "position": "PhD student;Postdoc;Associate Professor;Full Professor", "bibtex": "@misc{\nghoroghchian2022quantifying,\ntitle={Quantifying the Controllability of Coarsely Characterized Networked Dynamical Systems},\nauthor={Nafiseh Ghoroghchian and Rajasekhar Anguluri and Gautam Dasarathy and Stark Draper},\nyear={2022},\nurl={https://openreview.net/forum?id=okmZ6-zU6Lz}\n}", "github": "", "project": "", "reviewers": "zMsx;AEYY;SdpL", "site": "https://openreview.net/forum?id=okmZ6-zU6Lz", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "2;4;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "1;3;0", "wc_summary_paper": "69;44;25", "wc_summary_review": "62;76;45", "wc_main_review": "197;460;198", "wc_review": "328;580;268", "wc_reply_reviewers": "54;31;0", "wc_reply_authors": "1830;1475;748", "reply_reviewers": "1;1;0", "reply_authors": "4;3;1", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.247219128924647 ], "wc_summary_paper_avg": [ 46.0, 18.01850900231944 ], "wc_summary_review_avg": [ 61.0, 12.675435561221029 ], "wc_main_review_avg": [ 285.0, 123.74436014084304 ], "wc_review_avg": [ 392.0, 135.17396198972642 ], "wc_reply_reviewers_avg": [ 28.333333333333332, 22.125902367034783 ], "wc_reply_authors_avg": [ 1351.0, 450.34283236959226 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QHYbIPaDkyoJ:scholar.google.com/&scioq=Quantifying+the+Controllability+of+Coarsely+Characterized+Networked+Dynamical+Systems&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Toronto;Arizona State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://www.asu.edu", "aff_unique_abbr": "U of T;ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Canada;United States" }, { "id": "olQbo52II9", "title": "Learning to Solve Combinatorial Problems via
Efficient Exploration", "track": "main", "status": "Reject", "tldr": "", "abstract": "From logistics to the natural sciences, combinatorial optimisation on graphs underpins numerous real-world applications. Reinforcement learning (RL) has shown particular promise in this setting as it can adapt to specific problem structures and does not require pre-solved instances for these, often NP-hard, problems. However, state-of-the-art (SOTA) approaches typically suffer from severe scalability issues, primarily due to their reliance on expensive graph neural networks (GNNs) at each decision step. We introduce ECORD; a novel RL algorithm that alleviates this expense by restricting the GNN to a single pre-processing step, before entering a fast-acting exploratory phase directed by a recurrent unit. Experimentally, we demonstrate that ECORD achieves a new SOTA for RL algorithms on the Maximum Cut problem, whilst also providing orders of magnitude improvement in speed and scalability. Compared to the nearest competitor, ECORD reduces the optimality gap by up to 73% on 500 vertex graphs with a decreased wall-clock time. Moreover, ECORD retains strong performance when generalising to larger graphs with up to 10000 vertices.", "keywords": "reinforcement learning;combinatorial optimisation;graph neural network;maximum cut", "primary_area": "", "supplementary_material": "", "author": "Thomas D Barrett;Christopher William Falke Parsonson;Alexandre Laterre", "authorids": "~Thomas_D_Barrett1;~Christopher_William_Falke_Parsonson1;~Alexandre_Laterre1", "gender": "M;M;M", "homepage": ";https://cwfparsonson.github.io/;", "dblp": "248/8263;;223/4200", "google_scholar": "nJa1KGIAAAAJ;2Vw7d64AAAAJ;HrMSaicAAAAJ", "orcid": "0000-0001-6241-3028;;", "linkedin": "tom-barrett-62b180a2/;christopher-parsonson-381830a3/?originalSubdomain=uk;reinforce/", "or_profile": "~Thomas_D_Barrett1;~Christopher_William_Falke_Parsonson1;~Alexandre_Laterre1", "aff": "InstaDeep;University College London;InstaDeep", "aff_domain": "instadeep.com;ucl.ac.uk;instadeep.com", "position": "Researcher;PhD student;head of research", "bibtex": "@misc{\nbarrett2022learning,\ntitle={Learning to Solve Combinatorial Problems via Efficient Exploration},\nauthor={Thomas D Barrett and Christopher William Falke Parsonson and Alexandre Laterre},\nyear={2022},\nurl={https://openreview.net/forum?id=olQbo52II9}\n}", "github": "", "project": "", "reviewers": "B7TT;m4dC;3ifn;5Z4N", "site": "https://openreview.net/forum?id=olQbo52II9", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;3;3", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "199;94;65;65", "wc_summary_review": "104;38;27;66", "wc_main_review": "573;200;305;361", "wc_review": "876;332;397;492", "wc_reply_reviewers": "446;258;0;118", "wc_reply_authors": "2430;1389;1786;995", "reply_reviewers": "2;2;0;1", "reply_authors": "4;3;3;3", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 105.75, 55.12429137866536 ], "wc_summary_review_avg": [ 58.75, 29.74369681125734 ], "wc_main_review_avg": [ 359.75, 136.00988015581808 ], "wc_review_avg": [ 524.25, 210.90326574048112 ], "wc_reply_reviewers_avg": [ 205.5, 166.1949156863711 ], "wc_reply_authors_avg": [ 1650.0, 530.1042350330735 ], "reply_reviewers_avg": [ 1.25, 
0.82915619758885 ], "reply_authors_avg": [ 3.25, 0.4330127018922193 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.7608859102526822, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LmYAsU6NsfQJ:scholar.google.com/&scioq=Learning+to+Solve+Combinatorial+Problems+via+Efficient+Exploration&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "InstaDeep;University College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.instadeep.com;https://www.ucl.ac.uk", "aff_unique_abbr": "InstaDeep;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "on54StZqGQ_", "title": "Degradation Attacks on Certifiably Robust Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Certifiably robust neural networks employ provable run-time defenses against adversarial examples by checking if the model is locally robust at the input under evaluation. We show through examples and experiments that these defenses are inherently over-cautious. Specifically, they flag inputs for which local robustness checks fail, but yet that are not adversarial; i.e., they are classified consistently with all valid inputs within a distance of $\\epsilon$. As a result, while a norm-bounded adversary cannot change the classification of an input, it can use norm-bounded changes to degrade the utility of certifiably robust networks by forcing them to reject otherwise correctly classifiable inputs. We empirically demonstrate the efficacy of such attacks against state-of-the-art certifiable defenses.", "keywords": "adversarial examples;certified defenses;degradation attacks", "primary_area": "", "supplementary_material": "", "author": "Klas Leino;Chi Zhang;Ravi Mangal;Matt Fredrikson;Bryan Parno;Corina Pasareanu", "authorids": "~Klas_Leino1;~Chi_Zhang21;~Ravi_Mangal1;~Matt_Fredrikson1;~Bryan_Parno1;~Corina_Pasareanu1", "gender": "M;F;M;M;M;F", "homepage": "https://klas.leino.tech;;https://www.andrew.cmu.edu/user/rmangal/;https://cs.cmu.edu/~mfredrik;https://www.andrew.cmu.edu/user/bparno/;https://www.andrew.cmu.edu/user/pcorina/", "dblp": ";;143/2729;38/2612;49/6324.html;03/4368", "google_scholar": ";;5OFQ4A8AAAAJ;https://scholar.google.com.tw/citations?user=tMYCvLAAAAAJ;https://scholar.google.com.tw/citations?user=kTTBpJkAAAAJ;pwIuivQAAAAJ", "orcid": ";;;;0000-0002-9113-1684;", "linkedin": ";chizhang1997/;;;;", "or_profile": "~Klas_Leino1;~Chi_Zhang21;~Ravi_Mangal1;~Matt_Fredrikson1;~Bryan_Parno1;~Corina_Pasareanu1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;NASA Ames", "aff_domain": "cs.cmu.edu;cmu.edu;cmu.edu;cmu.edu;cmu.edu;nasa.gov", "position": "PhD student;PhD student;Postdoc;Associate Professor;Associate Professor;Researcher", "bibtex": "@misc{\nleino2022degradation,\ntitle={Degradation Attacks on Certifiably Robust Neural Networks},\nauthor={Klas Leino and Chi Zhang and Ravi Mangal and Matt Fredrikson and Bryan Parno and Corina Pasareanu},\nyear={2022},\nurl={https://openreview.net/forum?id=on54StZqGQ_}\n}", "github": "", "project": "", "reviewers": "akrY;YcnJ;rHQj;TyKj", "site": "https://openreview.net/forum?id=on54StZqGQ_", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "5;3;5;3", "correctness": "2;2;3;4", 
"technical_novelty": "2;2;4;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "55;68;65;58", "wc_summary_review": "85;139;93;53", "wc_main_review": "442;1002;283;666", "wc_review": "582;1209;441;777", "wc_reply_reviewers": "0;16;0;53", "wc_reply_authors": "960;1383;767;1271", "reply_reviewers": "0;1;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 61.5, 5.220153254455275 ], "wc_summary_review_avg": [ 92.5, 30.736785778607366 ], "wc_main_review_avg": [ 598.25, 269.9077388664504 ], "wc_review_avg": [ 752.25, 289.4368454430085 ], "wc_reply_reviewers_avg": [ 17.25, 21.649191670822262 ], "wc_reply_authors_avg": [ 1095.25, 244.81051345887906 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14629769069165190946&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "Carnegie Mellon University;NASA Ames Research Center", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://ames.nasa.gov", "aff_unique_abbr": "CMU;NASA Ames", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "onqK4xDBYji", "title": "BERMo: What can BERT learn from ELMo?", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose BERMo, an architectural modification to BERT, which makes predictions based on a hierarchy of surface, syntactic and semantic language features. We use linear combination scheme proposed in Embeddings from Language Models (ELMo) to combine the scaled internal representations from different network depths. Our approach has two-fold benefits: (1) improved gradient flow for the downstream task as every layer has a direct connection to the gradients of the loss function and (2) increased representative power as the model no longer needs to copy the features learned in the shallower layer which are necessary for the downstream task. Further, our model has a negligible parameter overhead as there is a single scalar parameter associated with each layer in the network. Experiments on the probing task from SentEval dataset show that our model performs up to $4.65\\%$ better in accuracy than the baseline with an average improvement of $2.67\\%$ on the semantic tasks. When subject to compression techniques, we find that our model enables stable pruning for compressing small datasets like SST-2, where the BERT model commonly diverges. We observe that our approach converges $1.67\\times$ and $1.15\\times$ faster than the baseline on MNLI and QQP tasks from GLUE dataset. 
Moreover, our results show that our approach can obtain better parameter efficiency for penalty-based pruning approaches on the QQP task.", "keywords": "BERT;Pruning;Faster Convergence;Stable Pruning", "primary_area": "", "supplementary_material": "", "author": "Sangamesh Kodge;Kaushik Roy", "authorids": "~Sangamesh_Kodge1;~Kaushik_Roy1", "gender": "M;M", "homepage": ";https://engineering.purdue.edu/NRL/Group", "dblp": "203/5657.html;r/KaushikRoy", "google_scholar": ";to4P8KgAAAAJ", "orcid": "0000-0001-9713-5400;", "linkedin": "sangameshkodge;", "or_profile": "~Sangamesh_Kodge1;~Kaushik_Roy1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nkodge2022bermo,\ntitle={{BERM}o: What can {BERT} learn from {ELM}o?},\nauthor={Sangamesh Kodge and Kaushik Roy},\nyear={2022},\nurl={https://openreview.net/forum?id=onqK4xDBYji}\n}", "github": "", "project": "", "reviewers": "rVJ5;etPJ;818S;x8RB", "site": "https://openreview.net/forum?id=onqK4xDBYji", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;3", "correctness": "3;3;2;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "132;81;122;62", "wc_summary_review": "28;58;55;46", "wc_main_review": "1444;430;39;148", "wc_review": "1604;569;216;256", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 99.25, 28.769558564566125 ], "wc_summary_review_avg": [ 46.75, 11.691342951089922 ], "wc_main_review_avg": [ 515.25, 554.871775728411 ], "wc_review_avg": [ 661.25, 561.1957657538054 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3472713357336614173&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "onwTC5W0XJ", "title": "Causally Focused Convolutional Networks Through Minimal Human Guidance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional Neural Networks (CNNs) are the state of the art in image classification mainly due to their ability to automatically extract features from the images and, in turn, achieve accuracy higher than any previous method. However, the flip side is that they are correlational models, which aggressively learn features that highly correlate with the labels. Such features may not be causally related to the labels as per human cognition. For example, in a subset of images, cows can be on grassland, but classifying an image as a cow based on the presence of grassland is incorrect. To marginalize out the effect of all possible contextual features, we need to gather a huge training dataset, which is not always possible.
Moreover, this prevents the model from justifying its decisions. This issue has serious implications in certain domains such as medicine, where the amount of data can be limited but the model is expected to justify its decisions. To mitigate this issue, our proposal is to focus the CNN on extracting features that are causal from a human perspective. We propose a mechanism to accept guidance from humans in the form of activation masks to modify the learning process of the CNN. The amount of additional guidance can be small, and the masks are easy to form. Through detailed analysis, we show that this method not only improves the learning of causal features but also helps in learning efficiently with less data. We demonstrate the effectiveness of our method on multiple datasets using quantitative as well as qualitative results.", "keywords": "Causal Features;Convolutional Networks;Interpretability;Minimal Guidance;Computer Vision;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Rimmon Saloman Bhosale;Mrinal Das", "authorids": "~Rimmon_Saloman_Bhosale1;~Mrinal_Das1", "gender": "M;M", "homepage": ";http://nmrinl.github.io", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "rimmon/;", "or_profile": "~Rimmon_Saloman_Bhosale1;~Mrinal_Kanti_Das1", "aff": ";Indian Institute of Technology Palakkad", "aff_domain": ";iitpkd.ac.in", "position": ";Assistant Professor", "bibtex": "@misc{\nbhosale2022causally,\ntitle={Causally Focused Convolutional Networks Through Minimal Human Guidance},\nauthor={Rimmon Saloman Bhosale and Mrinal Das},\nyear={2022},\nurl={https://openreview.net/forum?id=onwTC5W0XJ}\n}", "github": "", "project": "", "reviewers": "nnUS;teVu;HtjX", "site": "https://openreview.net/forum?id=onwTC5W0XJ", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;4;4", "correctness": "3;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "83;17;157", "wc_summary_review": "108;40;51", "wc_main_review": "535;321;505", "wc_review": "726;378;713", "wc_reply_reviewers": "0;184;101", "wc_reply_authors": "985;302;793", "reply_reviewers": "0;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.66666666666667, 57.18585683735291 ], "wc_summary_review_avg": [ 66.33333333333333, 29.80305726300948 ], "wc_main_review_avg": [ 453.6666666666667, 94.60561411577127 ], "wc_review_avg": [ 605.6666666666666, 161.07210255727782 ], "wc_reply_reviewers_avg": [ 95.0, 75.23740204623407 ], "wc_reply_authors_avg": [ 693.3333333333334, 287.6019625956834 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sop1ih3qocQJ:scholar.google.com/&scioq=Causally+Focused+Convolutional+Networks+Through+Minimal+Human+Guidance&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Indian Institute of Technology Palakkad", "aff_unique_dep": "", "aff_unique_url": "https://www.iitpkd.ac.in", "aff_unique_abbr": "IIT Palakkad", "aff_campus_unique_index": "0",
"aff_campus_unique": "Palakkad", "aff_country_unique_index": "0", "aff_country_unique": "India" }, { "id": "oopnT6Vqho", "title": "Provable Regret Bounds for Deep Online Learning and Control", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The use of deep neural networks has been highly successful in reinforcement learning and control,\nalthough few theoretical guarantees for deep learning exist for these problems. \nThere are two main challenges for deriving performance guarantees: a) control has state information and thus is inherently online and b) deep networks are non-convex predictors for which online learning cannot provide provable guarantees in general.\n\nBuilding on the linearization technique for overparameterized neural networks, we derive provable regret bounds for efficient online learning with deep neural networks. Specifically, we show that over any sequence of convex loss functions, any low-regret algorithm can be adapted to optimize the parameters of a neural network such that it competes with the best net in hindsight. As an application of these results in the online setting, we obtain provable bounds for online episodic control with deep neural network controllers.", "keywords": "online learning;deep neural networks;online control;nonstochastic control", "primary_area": "", "supplementary_material": "", "author": "Xinyi Chen;Edgar Minasyan;Jason D. Lee;Elad Hazan", "authorids": "~Xinyi_Chen1;~Edgar_Minasyan2;~Jason_D._Lee1;~Elad_Hazan1", "gender": "F;;M;M", "homepage": ";https://minasyan.github.io;https://jasondlee88.github.io/;https://www.ehazan.com", "dblp": "84/6214;238/2556;88/3262;72/739", "google_scholar": ";HjWpRCIAAAAJ;GR_DsT0AAAAJ;LnhCGNMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xinyi_Chen1;~Edgar_Minasyan2;~Jason_D._Lee1;~Elad_Hazan1", "aff": "Google DeepMind;Princeton University;Princeton University;Princeton University", "aff_domain": "google.com;princeton.edu;princeton.edu;princeton.edu", "position": "Researcher;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nchen2022provable,\ntitle={Provable Regret Bounds for Deep Online Learning and Control},\nauthor={Xinyi Chen and Edgar Minasyan and Jason D. 
Lee and Elad Hazan},\nyear={2022},\nurl={https://openreview.net/forum?id=oopnT6Vqho}\n}", "github": "", "project": "", "reviewers": "b4pY;xYk7;vkuy;TsLt", "site": "https://openreview.net/forum?id=oopnT6Vqho", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "3;3;3;2", "correctness": "4;3;4;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "56;73;68;33", "wc_summary_review": "22;65;18;34", "wc_main_review": "211;384;264;164", "wc_review": "289;522;350;231", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "396;472;208;248", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 57.5, 15.435349040433131 ], "wc_summary_review_avg": [ 34.75, 18.430613120566555 ], "wc_main_review_avg": [ 255.75, 82.06209539123407 ], "wc_review_avg": [ 348.0, 108.91510455395982 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 331.0, 107.38249391777042 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8164965809277261, "corr_recommendation_correctness": -0.23570226039551587, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7291073831914238900&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Google;Princeton University", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.princeton.edu", "aff_unique_abbr": "DeepMind;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "otOZeCahAhL", "title": "Towards Robust Domain Generalization in 2D Neural Audio Processing", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "When using two-dimensional convolutional neural networks (2D-CNNs) in image processing, it is possible to manipulate domain information using channel statistics, and instance normalization has been a promising way to obtain domain-invariant features. Although 2D image features represent spatial information, 2D audio features such as the log-Mel spectrogram represent two different kinds of information: temporal and spectral. Unlike in image processing, our analysis shows that domain-relevant information in audio features is dominant in frequency statistics rather than channel statistics. Motivated by our analysis, we introduce RFN, a plug-and-play, explicit normalization module along the frequency axis, which eliminates instance-specific domain discrepancy in audio features while relaxing undesirable loss of useful discriminative information.
Empirically, simply adding RFN to networks yields clear margins over previous domain generalization approaches on acoustic scene classification, keyword spotting, and speaker verification tasks, and improves robustness to variations in audio device, speaker ID, and genre.", "keywords": "2D audio processing;Domain generalization;Explicit normalization;Frequency-wise normalization;Domain-invariant feature", "primary_area": "", "supplementary_material": "", "author": "Byeonggeun Kim;Seunghan Yang;Jangho Kim;Hyunsin Park;Jun-Tae Lee;Simyung Chang", "authorids": "~Byeonggeun_Kim1;~Seunghan_Yang1;~Jangho_Kim1;~Hyunsin_Park2;~Jun-Tae_Lee1;~Simyung_Chang1", "gender": "M;M;M;M;M;M", "homepage": "https://sites.google.com/view/byeonggeun-kim;;;;https://sites.google.com/view/home-jtlee;", "dblp": "250/9485;250/9141;118/1777;50/9205;18/8673;206/6540", "google_scholar": "Pee89n0AAAAJ;g1-oNmAAAAAJ;https://scholar.google.co.kr/citations?user=mzFEJVIAAAAJ;mwtBKioAAAAJ;wzigGywAAAAJ;https://scholar.google.co.kr/citations?user=0-tF1dwAAAAJ", "orcid": ";;;0000-0003-3556-5792;;", "linkedin": "byeonggeun-kim-b8112a194/;;;hyunsin-park-598aa0221/;;", "or_profile": "~Byeonggeun_Kim1;~Seunghan_Yang1;~Jangho_Kim1;~Hyunsin_Park2;~Jun-Tae_Lee1;~Simyung_Chang1", "aff": "QualComm;Qualcomm AI Research;Seoul National University;Qualcomm Inc, QualComm;Qualcomm Inc, QualComm;Seoul National University", "aff_domain": "qti.qualcomm.com;qti.qualcomm.com;snu.ac.kr;qti.qualcomm.com;qti.qualcomm.com;snu.ac.kr", "position": "Researcher;Researcher;PhD student;Staff Engineer;Researcher;PhD student", "bibtex": "@misc{\nkim2022towards,\ntitle={Towards Robust Domain Generalization in 2D Neural Audio Processing},\nauthor={Byeonggeun Kim and Seunghan Yang and Jangho Kim and Hyunsin Park and Jun-Tae Lee and Simyung Chang},\nyear={2022},\nurl={https://openreview.net/forum?id=otOZeCahAhL}\n}", "github": "", "project": "", "reviewers": "CNwq;fK4s;bUJW;ESJG", "site": "https://openreview.net/forum?id=otOZeCahAhL", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;3", "correctness": "2;1;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "144;44;77;55", "wc_summary_review": "9;36;97;71", "wc_main_review": "579;361;419;710", "wc_review": "732;441;593;836", "wc_reply_reviewers": "281;375;0;0", "wc_reply_authors": "807;1923;837;474", "reply_reviewers": "1;1;0;0", "reply_authors": "2;3;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.0, 38.8136573901507 ], "wc_summary_review_avg": [ 53.25, 33.48413803579241 ], "wc_main_review_avg": [ 517.25, 136.96053263623065 ], "wc_review_avg": [ 650.5, 148.53366621746062 ], "wc_reply_reviewers_avg": [ 164.0, 167.33349933590702 ], "wc_reply_authors_avg": [ 1010.25, 545.8943922591621 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9600142976554019659&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0;2", "aff_unique_norm": "Qualcomm Incorporated;Qualcomm;Seoul National University", "aff_unique_dep": ";Qualcomm AI Research;",
"aff_unique_url": "https://www.qualcomm.com;https://www.qualcomm.com/research;https://www.snu.ac.kr", "aff_unique_abbr": "Qualcomm;QAI;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;1", "aff_country_unique": "United States;South Korea" }, { "id": "ovRQmeVFbrC", "title": "PARS: PSEUDO-LABEL AWARE ROBUST SAMPLE SELECTION FOR LEARNING WITH NOISY LABELS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Acquiring accurate labels on large-scale datasets is both time consuming and expensive. To reduce the dependency of deep learning models on learning from clean labeled data, several recent research efforts are focused on learning with noisy labels. These methods typically fall into three design categories to learn a noise robust model: sample selection approaches, noise robust loss functions, or label correction methods. In this paper, we propose PARS: Pseudo-Label Aware Robust Sample Selection, a hybrid approach that combines the best from all three worlds in a joint-training framework to achieve robustness to noisy labels. Specifically, PARS exploits all training samples using both the raw/noisy labels and estimated/refurbished pseudo-labels via self-training, divides samples into an ambiguous and a noisy subset via loss analysis, and designs label-dependent noise-aware loss functions for both sets of filtered labels. Results show that PARS significantly outperforms the state of the art on extensive studies on the noisy CIFAR-10 and CIFAR-100 datasets, particularly on challenging high-noise and low-resource settings. In particular, PARS achieved an absolute 12% improvement in test accuracy on the CIFAR-100 dataset with 90% symmetric label noise, and an absolute 27% improvement in test accuracy when only 1/5 of the noisy labels are available during training as an additional restriction. 
On a real-world noisy dataset, Clothing1M, PARS achieves competitive results to the state of the art.", "keywords": "learning with label noise", "primary_area": "", "supplementary_material": "", "author": "Arushi Goel;Yunlong Jiao;Jordan Massiah", "authorids": "~Arushi_Goel2;~Yunlong_Jiao1;~Jordan_Massiah1", "gender": "F;M;", "homepage": "https://goelarushi.github.io/;https://yunlongjiao.github.io/;", "dblp": ";164/7317;", "google_scholar": "tj08PZcAAAAJ;https://scholar.google.co.uk/citations?user=NgTM33MAAAAJ;", "orcid": ";0000-0002-0776-0550;", "linkedin": ";yunlong-jiao/;jordan-massiah-562862136/", "or_profile": "~Arushi_Goel2;~Yunlong_Jiao1;~Jordan_Massiah1", "aff": "Google Research;Amazon;Amazon", "aff_domain": "research.google.com;amazon.com;amazon.com", "position": "Student Researcher;Machine Learning Scientist;Research Engineer", "bibtex": "@misc{\ngoel2022pars,\ntitle={{PARS}: {PSEUDO}-{LABEL} {AWARE} {ROBUST} {SAMPLE} {SELECTION} {FOR} {LEARNING} {WITH} {NOISY} {LABELS}},\nauthor={Arushi Goel and Yunlong Jiao and Jordan Massiah},\nyear={2022},\nurl={https://openreview.net/forum?id=ovRQmeVFbrC}\n}", "github": "", "project": "", "reviewers": "mAAg;yGpG;NCaX", "site": "https://openreview.net/forum?id=ovRQmeVFbrC", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "3;4;4", "correctness": "3;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "99;86;93", "wc_summary_review": "46;36;57", "wc_main_review": "378;603;125", "wc_review": "523;725;275", "wc_reply_reviewers": "0;0;74", "wc_reply_authors": "926;489;678", "reply_reviewers": "0;0;1", "reply_authors": "2;1;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 92.66666666666667, 5.312459150169743 ], "wc_summary_review_avg": [ 46.333333333333336, 8.576453553512405 ], "wc_main_review_avg": [ 368.6666666666667, 195.25425020270933 ], "wc_review_avg": [ 507.6666666666667, 184.03139828723673 ], "wc_reply_reviewers_avg": [ 24.666666666666668, 34.883934538536344 ], "wc_reply_authors_avg": [ 697.6666666666666, 178.9456776665912 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9999999999999997, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17152036312075052229&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;Amazon", "aff_unique_dep": "Google Research;Amazon.com, Inc.", "aff_unique_url": "https://research.google;https://www.amazon.com", "aff_unique_abbr": "Google Research;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "oxC2IBx8OuZ", "title": "Towards Federated Learning on Time-Evolving Heterogeneous Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Learning (FL) is an emerging learning paradigm that preserves privacy by ensuring client data locality on edge devices. 
The optimization of FL is challenging in practice due to the diversity and heterogeneity of the learning system. Despite recent research efforts on improving optimization with heterogeneous data, the impact of time-evolving heterogeneous data in real-world scenarios, such as changing client data or intermittent clients joining or leaving during training, has not been well studied.\nIn this work, we propose Continual Federated Learning (CFL), a flexible framework, to capture the time-evolving heterogeneity of FL. CFL covers complex and realistic scenarios---which are challenging to evaluate in previous FL formulations---by extracting information from past local datasets and approximating the local objective functions. Theoretically, we demonstrate that CFL methods achieve a faster convergence rate than FedAvg in time-evolving scenarios, with the benefit being dependent on approximation quality. In a series of experiments, we show that the numerical findings match the convergence analysis, and CFL methods significantly outperform other SOTA FL baselines.", "keywords": "Federated Learning;Continual Learning;Convergence Analysis", "primary_area": "", "supplementary_material": "/attachment/2e24d5e4c2cf7cf9a415e4c7d578a0d01452f5ae.zip", "author": "Yongxin Guo;Tao Lin;Xiaoying Tang", "authorids": "~Yongxin_Guo1;~Tao_Lin1;~Xiaoying_Tang2", "gender": "M;M;F", "homepage": "https://gyxxyg.github.io/yongxinguo/;https://lins-lab.github.io/;https://sse.cuhk.edu.cn/en/faculty/tangxiaoying", "dblp": ";64/4492-4.html;134/9714-2", "google_scholar": "5Cl1GZwAAAAJ;QE9pa_cAAAAJ;https://scholar.google.com/citations?hl=zh-TW", "orcid": "0009-0001-8652-0722;0000-0002-3246-6935;0000-0003-3955-1195", "linkedin": ";;", "or_profile": "~Yongxin_Guo1;~Tao_Lin1;~Xiaoying_Tang2", "aff": "Chinese University of HongKong, Shenzhen;Swiss Federal Institute of Technology Lausanne;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;epfl.ch;cuhk.edu.cn", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@misc{\nguo2022towards,\ntitle={Towards Federated Learning on Time-Evolving Heterogeneous Data},\nauthor={Yongxin Guo and Tao Lin and Xiaoying Tang},\nyear={2022},\nurl={https://openreview.net/forum?id=oxC2IBx8OuZ}\n}", "github": "", "project": "", "reviewers": "87pq;XVxz;s89S;2NMQ", "site": "https://openreview.net/forum?id=oxC2IBx8OuZ", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "4;3;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "47;95;63;45", "wc_summary_review": "850;16;69;104", "wc_main_review": "850;573;417;258", "wc_review": "1747;684;549;407", "wc_reply_reviewers": "0;534;39;0", "wc_reply_authors": "2044;2517;733;638", "reply_reviewers": "0;2;1;0", "reply_authors": "4;6;1;1", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 62.5, 20.0187412191676 ], "wc_summary_review_avg": [ 259.75, 342.21804087452784 ], "wc_main_review_avg": [ 524.5, 218.44965094959525 ], "wc_review_avg": [ 846.75, 528.9075415419977 ], "wc_reply_reviewers_avg": [ 143.25, 226.1607558795292 ], "wc_reply_authors_avg": [ 1483.0, 815.5369396906556 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.0, 2.1213203435596424 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16074211013754866762&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese University of Hong Kong;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.epfl.ch", "aff_unique_abbr": "CUHK;EPFL", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Shenzhen;Lausanne", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Switzerland" }, { "id": "oxwsctgY5da", "title": "A Branch and Bound Framework for Stronger Adversarial Attacks of ReLU Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Strong adversarial attacks are important for evaluating the true robustness of deep neural networks. Most existing attacks find adversarial examples via searching in the input space, e.g., using gradient descent. In this work, we formulate an adversarial attack using a branch-and-bound (BaB) procedure on ReLU neural networks and search for adversarial examples in the activation space corresponding to binary variables in a mixed integer programming (MIP) formulation. This attack formulation can be used to tackle hard instances where none of the existing adversarial attacks can succeed. Existing attacks using this formulation rely on generic solvers which cannot exploit the structure of neural networks and also cannot utilize GPU acceleration, so they are mostly limited to small networks and easy problem instances. To improve its scalability and practicability, we propose a top-down beam-search approach to quickly identify the subspace that may contain adversarial examples. The search utilizes bound-propagation-based neural network verifiers on GPUs to rapidly evaluate a large number of search regions, which is not possible in generic MIP solvers. Moreover, we exploit the fact that good candidate adversarial examples can be easily found via gradient-based attacks, and build an adversarial candidate pool to further guide the search in activation space via diving techniques. Additionally, any candidate adversarial examples found during the process are refined using a bottom-up large neighbourhood search (LNS) guided by the candidate pool. Our adversarial attack framework, BaB-Attack, opens up a new opportunity for designing novel adversarial attacks not limited to searching the input space, and enables us to borrow techniques from integer programming theory and neural network verification to build stronger attacks. In experiments, we can successfully generate adversarial examples for hard input instances where existing strong adversarial attacks fail, and outperform off-the-shelf MIP-solver-based attacks in both success rates and efficiency. 
Our results further close the gap between the upper bound of robust accuracy obtained by attacks and the lower bound obtained by verification.", "keywords": "adversarial attack;branch and bound;adversarial robustness;deep neural network", "primary_area": "", "supplementary_material": "", "author": "Huan Zhang;Shiqi Wang;Kaidi Xu;Yihan Wang;Suman Jana;Cho-Jui Hsieh;J Zico Kolter", "authorids": "~Huan_Zhang1;~Shiqi_Wang2;~Kaidi_Xu1;~Yihan_Wang2;~Suman_Jana1;~Cho-Jui_Hsieh1;~J_Zico_Kolter1", "gender": "M;M;M;F;M;M;M", "homepage": "http://huan-zhang.com;https://shiqi-wang.github.io;https://kaidixu.com/;https://yihanwang617.github.io;http://sumanj.info;http://web.cs.ucla.edu/~chohsieh/index.html;http://www.zicokolter.com", "dblp": "23/1797-1.html;58/9145-2;195/8175;;74/28;14/2770;67/2526", "google_scholar": "LTa3GzEAAAAJ;u_MzXeMAAAAJ;lYK0wlsAAAAJ;;https://scholar.google.com.tw/citations?user=SDY9FwUAAAAJ;Wy89g4IAAAAJ;UXh1I6UAAAAJ", "orcid": ";0000-0002-6338-1432;;;;;", "linkedin": ";tcwangshiqi/;;;;;", "or_profile": "~Huan_Zhang1;~Shiqi_Wang2;~Kaidi_Xu1;~Yihan_Wang2;~Suman_Jana1;~Cho-Jui_Hsieh1;~Zico_Kolter1", "aff": "Carnegie Mellon University;Columbia University;Drexel University;University of California, Los Angeles;, Columbia University;University of California, Los Angeles;Carnegie Mellon University", "aff_domain": "cmu.edu;columbia.edu;drexel.edu;ucla.edu;cs.columbia.edu;ucla.edu;cmu.edu", "position": "Postdoc;PhD student;Assistant Professor;MS student;Associate Professor;Assistant Professor;Full Professor", "bibtex": "@misc{\nzhang2022a,\ntitle={A Branch and Bound Framework for Stronger Adversarial Attacks of Re{LU} Networks},\nauthor={Huan Zhang and Shiqi Wang and Kaidi Xu and Yihan Wang and Suman Jana and Cho-Jui Hsieh and J Zico Kolter},\nyear={2022},\nurl={https://openreview.net/forum?id=oxwsctgY5da}\n}", "github": "", "project": "", "reviewers": "aJSd;LiPk;wXK6;G18n", "site": "https://openreview.net/forum?id=oxwsctgY5da", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "2;3;3;2", "correctness": "4;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "61;78;43;60", "wc_summary_review": "28;11;28;21", "wc_main_review": "122;160;104;363", "wc_review": "211;249;175;444", "wc_reply_reviewers": "19;12;20;72", "wc_reply_authors": "543;1007;459;1679", "reply_reviewers": "1;1;1;2", "reply_authors": "2;2;2;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 60.5, 12.379418403139947 ], "wc_summary_review_avg": [ 22.0, 6.96419413859206 ], "wc_main_review_avg": [ 187.25, 103.4634597333764 ], "wc_review_avg": [ 269.75, 103.95040884960481 ], "wc_reply_reviewers_avg": [ 30.75, 24.014318645341575 ], "wc_reply_authors_avg": [ 922.0, 484.325303902243 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -1.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2786522381978800098&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;3;1;3;0", "aff_unique_norm": "Carnegie Mellon University;Columbia University;Drexel University;University of California, Los Angeles", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.cmu.edu;https://www.columbia.edu;https://www.drexel.edu;https://www.ucla.edu", "aff_unique_abbr": "CMU;Columbia;Drexel;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Topological Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6389", "id": "oxxUMeFwEHd", "poster": "", "openreview": "https://openreview.net/forum?id=oxxUMeFwEHd", "slides": "https://iclr.cc/virtual/2022/poster/6389", "video": "https://iclr.cc/virtual/2022/poster/6389", "author_site": "Max Horn, Edward De Brouwer, Michael Moor, Yves Moreau, Bastian Rieck, Karsten Borgwardt", "tldr": "", "abstract": "Graph neural networks (GNNs) are a powerful architecture for tackling graph learning tasks, yet have been shown to be oblivious to eminent substructures such as cycles. We present TOGL, a novel layer that incorporates global topological information of a graph using persistent homology. TOGL can be easily integrated into any type of GNN and is strictly more expressive (in terms the Weisfeiler\u2013Lehman graph isomorphism test) than message-passing GNNs. Augmenting GNNs with TOGL leads to improved predictive performance for graph and node classification tasks, both on synthetic data sets, which can be classified by humans using their topology but not by ordinary GNNs, and on real-world data.", "keywords": "topology;persistent homology;gnn;graph neural networks;graph classification;node classification;filtrations;topological data analysis;tda", "primary_area": "", "supplementary_material": "", "author": "Max Horn;Edward De Brouwer;Michael Moor;Yves Moreau;Bastian Rieck;Karsten Borgwardt", "authorids": "~Max_Horn1;~Edward_De_Brouwer1;~Michael_Moor1;~Yves_Moreau2;~Bastian_Rieck1;~Karsten_Borgwardt2", "gender": "M;M;;M;M;", "homepage": "https://expectationmax.github.io;https://edwarddebrouwer.xyz;;;https://bastian.rieck.me;https://www.biochem.mpg.de/borgwardt", "dblp": "https://dblp.uni-trier.de/pers/hd/h/Horn:Max;;;;119/8860;11/3733.html", "google_scholar": "60cGPvIAAAAJ;-Pm4XtAAAAAJ;;zWftTEUAAAAJ;https://scholar.google.ch/citations?user=La7zuKQAAAAJ;v3JsjMYAAAAJ", "orcid": "0000-0002-8269-9948;;;;0000-0003-4335-0302;0000-0001-7221-2393", "linkedin": ";edwarddebrouwer/;;;br-ml/;", "or_profile": "~Max_Horn1;~Edward_De_Brouwer1;~Michael_Moor1;~Yves_Moreau2;~Bastian_Rieck1;~Karsten_Borgwardt2", "aff": "Amazon Development Center Germany;KU Leuven;;University of Leuven;Helmholtz Zentrum M\u00fcnchen;ETHZ - ETH Zurich", "aff_domain": "amazon.de;kuleuven.be;;kuleuven.be;helmholtz-munich.de;ethz.ch", "position": "Researcher;PhD student;;Professor;Principal Investigator;Full Professor", "bibtex": "@inproceedings{\nhorn2022topological,\ntitle={Topological Graph Neural Networks},\nauthor={Max Horn and Edward De Brouwer and Michael Moor and Yves Moreau and Bastian Rieck and Karsten Borgwardt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=oxxUMeFwEHd}\n}", "github": "", "project": "", "reviewers": "vj7C;PtXE;CsQm;2oqu", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;3;3", "correctness": "4;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "75;30;30;62", "wc_summary_review": "27;14;63;54", "wc_main_review": "121;205;282;170", "wc_review": "223;249;375;286", "wc_reply_reviewers": "7;0;0;0", "wc_reply_authors": "443;383;36;0", 
"reply_reviewers": "1;0;0;0", "reply_authors": "2;2;1;0", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 49.25, 19.79109648301478 ], "wc_summary_review_avg": [ 39.5, 19.80530232033836 ], "wc_main_review_avg": [ 194.5, 58.670691149840735 ], "wc_review_avg": [ 283.25, 57.508151596099836 ], "wc_reply_reviewers_avg": [ 1.75, 3.031088913245535 ], "wc_reply_authors_avg": [ 215.5, 199.04333698971186 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.82915619758885 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18101743901347787747&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=oxxUMeFwEHd", "email": "amazon.de;kuleuven.be;;kuleuven.be;helmholtz-munich.de;ethz.ch", "author_num": 6, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Amazon;Katholieke Universiteit Leuven;University of Leuven;Helmholtz Zentrum M\u00fcnchen;ETH Zurich", "aff_unique_dep": "Development Center;;;;", "aff_unique_url": "https://www.amazon.de;https://www.kuleuven.be;https://www.kuleuven.be;https://www.helmholtz-muenchen.de;https://www.ethz.ch", "aff_unique_abbr": "Amazon;KU Leuven;KU Leuven;;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;2", "aff_country_unique": "Germany;Belgium;Switzerland" }, { "id": "oykI6Kmq3Xi", "title": "Fast Convergence of Optimistic Gradient Ascent in Network Zero-Sum Extensive Form Games", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "The study of learning in games has thus far focused primarily on normal form games. In contrast, our understanding of learning in extensive form games (EFG) and particularly in EFGs with many agents lags far behind, despite them being closer in nature to many real world applications. We consider the natural class of Network Zero-Sum Extensive Form Games, which combines the global zero-sum property of agent payoffs, the efficient representation of graphical games as well the expressive power of EFGs. We examine the convergence properties of Optimistic Gradient Ascent (OGA) in these games. We prove that the time-average behavior of such online learning dynamics exhibits $O(1/T)$ rate of convergence to the set of Nash equilibria. 
Moreover, we show that the day-to-day behavior also converges to Nash with rate $O(c^{-t})$ for some game-dependent constant $c > 0$.", "keywords": "extensive form games;network extensive form games;online learning;optimistic gradient descent ascent", "primary_area": "", "supplementary_material": "/attachment/b6afbceebc0b7eb95095bd9d51b2bf977089ca7a.zip", "author": "Ryann Sim;EFSTRATIOS PANTELEIMON SKOULAKIS;Lillian J Ratliff;Georgios Piliouras", "authorids": "~Ryann_Sim1;~EFSTRATIOS_PANTELEIMON_SKOULAKIS2;~Lillian_J_Ratliff1;~Georgios_Piliouras1", "gender": "M;;M;F", "homepage": ";;http://www.corelab.ntua.gr/~sskoul/;https://faculty.washington.edu/ratliffl/", "dblp": "281/7000;62/1236;183/0979.html;127/7426", "google_scholar": "https://scholar.google.com.sg/citations?user=nejjvjcAAAAJ;;Juo2Tk8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0001-8936-0229", "linkedin": "ryann-sim-06206b116/;;;", "or_profile": "~Ryann_Sim1;~Georgios_Piliouras1;~Stratis_Skoulakis2;~Lillian_Ratliff1", "aff": "Singapore University of Technology and Design;Singapore University of Technology and Design;EPFL - EPF Lausanne;University of Washington, Seattle", "aff_domain": "sutd.edu.sg;sutd.edu.sg;epfl.ch;uw.edu", "position": "PhD student;Associate Professor;Postdoc;Assistant Professor", "bibtex": "@misc{\nsim2022fast,\ntitle={Fast Convergence of Optimistic Gradient Ascent in Network Zero-Sum Extensive Form Games},\nauthor={Ryann Sim and EFSTRATIOS PANTELEIMON SKOULAKIS and Lillian J Ratliff and Georgios Piliouras},\nyear={2022},\nurl={https://openreview.net/forum?id=oykI6Kmq3Xi}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=oykI6Kmq3Xi", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=376870672142567662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Singapore University of Technology and Design;EPFL;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sutd.edu.sg;https://www.epfl.ch;https://www.washington.edu", "aff_unique_abbr": "SUTD;EPFL;UW", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Lausanne;Seattle", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "Singapore;Switzerland;United States" }, { "title": "BEiT: BERT Pre-Training of Image Transformers", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6323", "id": "p-BhZSz59o4", "poster": "", "openreview": "https://openreview.net/forum?id=p-BhZSz59o4", "slides": "https://iclr.cc/virtual/2022/poster/6323", "video": "https://iclr.cc/virtual/2022/poster/6323", "author_site": "Hangbo Bao, Li 
Dong, Songhao Piao, Furu Wei", "tldr": "", "abstract": "We introduce a self-supervised vision representation model BEiT, which stands for Bidirectional Encoder representation from Image Transformers. Following BERT developed in the natural language processing area, we propose a masked image modeling task to pretrain vision Transformers. Specifically, each image has two views in our pre-training, i.e., image patches (such as 16 x 16 pixels), and visual tokens (i.e., discrete tokens). We first ``tokenize'' the original image into visual tokens. Then we randomly mask some image patches and feed them into the backbone Transformer. The pre-training objective is to recover the original visual tokens based on the corrupted image patches. After pre-training BEiT, we directly fine-tune the model parameters on downstream tasks by appending task layers upon the pretrained encoder. Experimental results on image classification and semantic segmentation show that our model achieves competitive results with previous pre-training methods.", "keywords": "self-supervised learning;pre-training;vision Transformer", "primary_area": "", "supplementary_material": "", "author": "Hangbo Bao;Li Dong;Songhao Piao;Furu Wei", "authorids": "~Hangbo_Bao1;~Li_Dong1;~Songhao_Piao1;~Furu_Wei1", "gender": "M;M;M;M", "homepage": "https://scholar.google.com/citations?user=lXCZGqYAAAAJ&hl=en;http://dong.li;http://homepage.hit.edu.cn/piaosh;https://www.microsoft.com/en-us/research/people/fuwei/", "dblp": "199/2036;85/5090-4;96/6541;72/5870", "google_scholar": "lXCZGqYAAAAJ;wEfQgPgAAAAJ;;G-V1VpwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hangbo_Bao1;~Li_Dong1;~Songhao_Piao1;~Furu_Wei1", "aff": "Microsoft;Microsoft Research;Harbin Institute of Technology;Microsoft Research", "aff_domain": "microsoft.com;microsoft.com;hit.edu.cn;microsoft.com", "position": "Intern;Principal Researcher;Full Professor;Distinguished Scientist", "bibtex": "@inproceedings{\nbao2022beit,\ntitle={{BE}iT: {BERT} Pre-Training of Image Transformers},\nauthor={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=p-BhZSz59o4}\n}", "github": "", "project": "", "reviewers": "GYF1;pM4q;Vvct;7cfB", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "5;4;4;4", "correctness": "3;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "46;113;72;107", "wc_summary_review": "43;31;26;47", "wc_main_review": "600;351;510;196", "wc_review": "689;495;608;350", "wc_reply_reviewers": "38;0;278;0", "wc_reply_authors": "341;155;632;165", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.5, 27.189152248645048 ], "wc_summary_review_avg": [ 36.75, 8.554969316134336 ], "wc_main_review_avg": [ 414.25, 154.35733704621882 ], "wc_review_avg": [ 535.5, 127.34696698390583 ], "wc_reply_reviewers_avg": [ 79.0, 115.93532679903913 ], "wc_reply_authors_avg": [ 323.25, 192.9978950662416 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3437, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4802045854930781683&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=p-BhZSz59o4", "email": "microsoft.com;microsoft.com;hit.edu.cn;microsoft.com", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Microsoft;Harbin Institute of Technology", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;http://www.hit.edu.cn/", "aff_unique_abbr": "Microsoft;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Visual hyperacuity with moving sensor and recurrent neural computations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6105", "id": "p0rCmDEN_-", "poster": "", "openreview": "https://openreview.net/forum?id=p0rCmDEN_-", "slides": "https://iclr.cc/virtual/2022/poster/6105", "video": "https://iclr.cc/virtual/2022/poster/6105", "author_site": "Alexander Rivkind, Or Ram, Eldad Assa, Michael Kreiserman, Ehud Ahissar", "tldr": "", "abstract": "Dynamical phenomena, such as recurrent neuronal activity and perpetual motion of the eye, are typically overlooked in models of bottom-up visual perception. Recent experiments suggest that tiny inter-saccadic eye motion (\"fixational drift\") enhances visual acuity beyond the limit imposed by the density of retinal photoreceptors. Here we hypothesize that such an enhancement is enabled by recurrent neuronal computations in early visual areas. Specifically, we explore a setting involving a low-resolution dynamical sensor that moves with respect to a static scene, with drift-like tiny steps. This setting mimics a dynamical eye viewing objects in perceptually-challenging conditions. The dynamical sensory input is classified by a convolutional neural network with recurrent connectivity added to its lower layers, in analogy to recurrent connectivity in early visual areas. Applying our system to CIFAR-10 and CIFAR-100 datasets down-sampled via an 8x8 sensor, we found that (i) classification accuracy, which is drastically reduced by this down-sampling, is mostly restored to its 32x32 baseline level when using a moving sensor and recurrent connectivity, (ii) in this setting, neurons in the early layers exhibit a wide repertoire of selectivity patterns, spanning the spatiotemporal selectivity space, with neurons preferring different combinations of spatial and temporal patterning, and (iii) curved sensor trajectories improve visual acuity compared to straight trajectories, echoing recent experimental findings involving eye-tracking in challenging conditions. Our work sheds light on the possible role of recurrent connectivity in early vision as well as the roles of fixational drift and temporal-frequency selective cells in the visual system. 
It also proposes a solution for artificial image recognition in settings with limited resolution and multiple time samples, such as in edge AI applications.", "keywords": "visual system;convolutional neural networks;recurrent neural networks;active vision;active sensing;ocular drift", "primary_area": "", "supplementary_material": "/attachment/a9f19eda2ed21055fcdf20a05cd2b716c7ec47fe.zip", "author": "Alexander Rivkind;Or Ram;Eldad Assa;Michael Kreiserman;Ehud Ahissar", "authorids": "~Alexander_Rivkind1;~Or_Ram1;~Eldad_Assa1;~Michael_Kreiserman1;~Ehud_Ahissar1", "gender": "M;Not Specified;;M;", "homepage": ";https://github.com/PopGalacticHistory;;;https://www.weizmann.ac.il/neurobiology/labs/ahissar/home", "dblp": ";;;;", "google_scholar": "XkP_-nQAAAAJ;;;;", "orcid": ";;;;", "linkedin": "alexander-rivkind-00491871/;;eldad-assa-14819560;michael-kreiserman-153a4b141;", "or_profile": "~Alexander_Rivkind1;~Or_Ram1;~Eldad_Assa1;~Michael_Kreiserman1;~Ehud_Ahissar1", "aff": "Weizmann Institute, Technion;Weizmann Institute, Technion;Weizmann Institute of Science;Weizmann Institute;Weizmann Institute", "aff_domain": "weizmann.ac.il;weizmann.ac.il;weizmann.ac.il;weizmann.ac.il;weizmann.ac.il", "position": "Postdoc;MS student;Postdoc;Hardware Engineer;Full Professor", "bibtex": "@inproceedings{\nrivkind2022visual,\ntitle={Visual hyperacuity with moving sensor and recurrent neural computations},\nauthor={Alexander Rivkind and Or Ram and Eldad Assa and Michael Kreiserman and Ehud Ahissar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=p0rCmDEN_-}\n}", "github": "", "project": "", "reviewers": "RJzH;zsN3;9EUA;vR8C", "pdf_size": 0, "recommendation": "3;3;5;10", "confidence": "3;5;4;4", "correctness": "2;2;3;4", "technical_novelty": "3;2;2;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "166;49;87;165", "wc_summary_review": "65;7;152;76", "wc_main_review": "505;667;368;1208", "wc_review": "736;723;607;1449", "wc_reply_reviewers": "0;460;171;0", "wc_reply_authors": "755;1054;1442;683", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.25, 2.8613807855648994 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 116.75, 50.56864146879961 ], "wc_summary_review_avg": [ 75.0, 51.60910772334666 ], "wc_main_review_avg": [ 687.0, 318.87536750272824 ], "wc_review_avg": [ 878.75, 333.0423208842984 ], "wc_reply_reviewers_avg": [ 157.75, 187.9499600957659 ], "wc_reply_authors_avg": [ 983.5, 299.0422210992956 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9746972340815895, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18178381920059833301&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=p0rCmDEN_-", "email": "weizmann.ac.il;weizmann.ac.il;weizmann.ac.il;weizmann.ac.il;weizmann.ac.il", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Weizmann Institute of Science", "aff_unique_dep": "", "aff_unique_url": "https://www.weizmann.org.il", "aff_unique_abbr": "Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": 
"Israel" }, { "id": "p36db089HBP", "title": "SONG: Self-Organizing Neural Graphs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent years have seen a surge in research on combining deep neural networks with other methods, including decision trees and graphs. There are at least three advantages of incorporating decision trees and graphs: they are easy to interpret since they are based on sequential decisions, they can make decisions faster, and they provide a hierarchy of classes. However, one of the well-known drawbacks of decision trees, as compared to decision graphs, is that decision trees cannot reuse the decision nodes. Nevertheless, decision graphs were not commonly used in deep learning due to the lack of efficient gradient-based training techniques. In this paper, we fill this gap and provide a general paradigm based on Markov processes, which allows for efficient training of the special type of decision graphs, which we call Self-Organizing Neural Graphs (SONG). We provide an extensive theoretical study of SONG, complemented by experiments conducted on Letter, Connect4, MNIST, CIFAR, and TinyImageNet datasets, showing that our method performs on par or better than existing decision models.", "keywords": "decision graphs;self-organizing models;neural networks;Markov processes;decision trees;interpretability", "primary_area": "", "supplementary_material": "/attachment/e66e30bab51c1445d44c3412e34c5c6f90290ef6.zip", "author": "\u0141ukasz Struski;Tomasz Danel;Marek \u015amieja;Jacek Tabor;Bartosz Micha\u0142 Zieli\u0144ski", "authorids": "~\u0141ukasz_Struski1;~Tomasz_Danel1;~Marek_\u015amieja1;~Jacek_Tabor1;~Bartosz_Micha\u0142_Zieli\u0144ski1", "gender": "M;M;M;M;M", "homepage": ";;https://mareksmieja.github.io/;;https://bartoszzielinski.github.io/", "dblp": "120/7679;248/8081;81/10360;31/5172;12/3424-1", "google_scholar": "https://scholar.google.pl/citations?user=_lWKf3UAAAAJ;WZq_OCsAAAAJ;https://scholar.google.pl/citations?user=MOmnpZcAAAAJ;https://scholar.google.pl/citations?user=zSKYziUAAAAJ;https://scholar.google.pl/citations?user=AJHaOpkAAAAJ", "orcid": "0000-0003-4006-356X;0000-0001-6053-0028;0000-0003-2027-4132;0000-0001-6652-7727;0000-0002-3063-3621", "linkedin": ";;;;bartosz-zieli%C5%84ski-1b2b1ab/", "or_profile": "~\u0141ukasz_Struski1;~Tomasz_Danel1;~Marek_\u015amieja1;~Jacek_Tabor1;~Bartosz_Micha\u0142_Zieli\u0144ski1", "aff": "Jagiellonian University in Krakow;Jagiellonian University;Jagiellonian University;Jagiellonian University;Jagiellonian University", "aff_domain": "uj.edu.pl;uj.edu.pl;uj.edu.pl;uj.edu.pl;uj.edu.pl", "position": "Assistant Professor;PhD student;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nstruski2022song,\ntitle={{SONG}: Self-Organizing Neural Graphs},\nauthor={{\\L}ukasz Struski and Tomasz Danel and Marek {\\'S}mieja and Jacek Tabor and Bartosz Micha{\\l} Zieli{\\'n}ski},\nyear={2022},\nurl={https://openreview.net/forum?id=p36db089HBP}\n}", "github": "", "project": "", "reviewers": "zwoq;3WfY;X7L7;D4sB", "site": "https://openreview.net/forum?id=p36db089HBP", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "42;23;27;99", "wc_summary_review": "74;18;14;45", "wc_main_review": "496;105;68;365", "wc_review": "612;146;109;509", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 
4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 47.75, 30.425112982534674 ], "wc_summary_review_avg": [ 37.75, 24.087081599894994 ], "wc_main_review_avg": [ 258.5, 178.60641085918502 ], "wc_review_avg": [ 344.0, 219.9306708942616 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15080247437683387692&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Jagiellonian University", "aff_unique_dep": "", "aff_unique_url": "https://www.uj.edu.pl", "aff_unique_abbr": "UJ", "aff_campus_unique_index": "0", "aff_campus_unique": "Krakow;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Poland" }, { "title": "Temporal Alignment Prediction for Supervised Representation Learning and Few-Shot Sequence Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6429", "id": "p3DKPQ7uaAi", "poster": "", "openreview": "https://openreview.net/forum?id=p3DKPQ7uaAi", "slides": "https://iclr.cc/virtual/2022/poster/6429", "video": "https://iclr.cc/virtual/2022/poster/6429", "author_site": "Bing Su, Ji-Rong Wen", "tldr": "", "abstract": "Explainable distances for sequence data depend on temporal alignment to tackle sequences with different lengths and local variances. Most sequence alignment methods infer the optimal alignment by solving an optimization problem under pre-defined feasible alignment constraints, which not only is time-consuming, but also makes end-to-end sequence learning intractable. In this paper, we propose a learnable sequence distance called Temporal Alignment Prediction (TAP). TAP employs a lightweight convolutional neural network to directly predict the optimal alignment between two sequences, so that only straightforward calculations are required and no optimization is involved in inference. TAP can be applied in different distance-based machine learning tasks. For supervised sequence representation learning, we show that TAP trained with various metric learning losses achieves completive performances with much faster inference speed. For few-shot action classification, we apply TAP as the distance measure in the metric learning-based episode-training paradigm. 
This simple strategy achieves comparable results with state-of-the-art few-shot action recognition methods.", "keywords": "Temporal Alignment;Supervised Representation Learning;Few-shot Action Recognition;Alignment Prediction;Sequence Classification", "primary_area": "", "supplementary_material": "", "author": "Bing Su;Ji-Rong Wen", "authorids": "~Bing_Su1;~Ji-Rong_Wen1", "gender": "M;M", "homepage": "https://gsai.ruc.edu.cn/bingsu;https://gsai.ruc.edu.cn/english/jrwen", "dblp": "41/5270-1;w/JRWen", "google_scholar": "https://scholar.google.com.sg/citations?user=d3g2VJQAAAAJ;tbxCHJgAAAAJ", "orcid": "0000-0001-8560-1910;0000-0002-9777-9676", "linkedin": ";", "or_profile": "~Bing_Su1;~Ji-Rong_Wen1", "aff": "Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "Associate Professor;Full Professor", "bibtex": "@inproceedings{\nsu2022temporal,\ntitle={Temporal Alignment Prediction for Supervised Representation Learning and Few-Shot Sequence Classification},\nauthor={Bing Su and Ji-Rong Wen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=p3DKPQ7uaAi}\n}", "github": "", "project": "", "reviewers": "UnWN;tA1W;1HmZ", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;4", "correctness": "3;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "99;51;130", "wc_summary_review": "18;81;90", "wc_main_review": "385;287;443", "wc_review": "502;419;663", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 93.33333333333333, 32.49957264676294 ], "wc_summary_review_avg": [ 63.0, 32.03123475609393 ], "wc_main_review_avg": [ 371.6666666666667, 64.38081149604197 ], "wc_review_avg": [ 528.0, 101.29494887044795 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5648741062527144654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=p3DKPQ7uaAi", "email": "ruc.edu.cn;ruc.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "p46vOpFJkr_", "title": "ExCon: Explanation-driven Supervised Contrastive Learning for Image Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive learning has led to substantial improvements in the quality of learned embedding representations for tasks such as image classification. However, a key drawback of existing contrastive augmentation methods is that they may lead to the modification of the image content which can yield undesired alterations of its semantics. 
This can affect the performance of the model on downstream tasks. Hence, in this paper, we ask whether we can augment image data in contrastive learning such that the task-relevant semantic content of an image is preserved. For this purpose, we propose to leverage saliency-based explanation methods to create content-preserving masked augmentations for contrastive learning. Our novel explanation-driven supervised contrastive learning (ExCon) methodology critically serves the dual goals of encouraging nearby image embeddings to have similar content and explanation, which we verify through t-SNE visualizations of embeddings. To quantify the impact of ExCon's embedding methodology, we conduct experiments on CIFAR100 as well as the Tiny ImageNet dataset and demonstrate that ExCon outperforms vanilla supervised contrastive learning \\emph{both} in terms of classification accuracy and in terms of explanation quality of the model.", "keywords": "Representation Learning;Explainable Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Zhibo Zhang;Jongseong Jang;Chiheb Trabelsi;Ruiwen Li;Scott Sanner;Yeonjeong Jeong;Dongsub Shim", "authorids": "~Zhibo_Zhang2;~Jongseong_Jang1;chiheb.trabelsi@utoronto.ca;~Ruiwen_Li1;~Scott_Sanner1;~Yeonjeong_Jeong2;~Dongsub_Shim1", "gender": "M;M;;;M;;", "homepage": ";https://sites.google.com/view/jongseong-jang;;;http://d3m.mie.utoronto.ca/;;", "dblp": ";140/4388;;;88/3374;;274/1579", "google_scholar": ";https://scholar.google.co.kr/citations?user=-DJPQqgAAAAJ;;;https://scholar.google.ca/citations?user=kB8UPNIAAAAJ;;NxE-ZasAAAAJ", "orcid": ";;;;;;0009-0006-8178-0476", "linkedin": "zhibo-darren-zhang/;;;ruiwen-li-4a272b55/;;;", "or_profile": "~Zhibo_Zhang2;~Jongseong_Jang1;chiheb.trabelsi@utoronto.ca;~Ruiwen_Li1;~Scott_Sanner1;~Yeonjeong_Jeong2;~Dongsub_Shim1", "aff": "University of Toronto;LG AI Research;;Toronto University;Department of Computer Science;;LG AI Research", "aff_domain": "utoronto.ca;lgresearch.ai;;utoronto.ca;cs.toronto.edu;;lgresearch.ai", "position": "MS student;Researcher;;MS student;Cross-appointed;;Researcher", "bibtex": "@misc{\nzhang2022excon,\ntitle={ExCon: Explanation-driven Supervised Contrastive Learning for Image Classification},\nauthor={Zhibo Zhang and Jongseong Jang and Chiheb Trabelsi and Ruiwen Li and Scott Sanner and Yeonjeong Jeong and Dongsub Shim},\nyear={2022},\nurl={https://openreview.net/forum?id=p46vOpFJkr_}\n}", "github": "", "project": "", "reviewers": "s8Hk;Bn9D;7KES;ciAC", "site": "https://openreview.net/forum?id=p46vOpFJkr_", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;4", "correctness": "4;3;3;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "56;57;188;71", "wc_summary_review": "39;36;155;33", "wc_main_review": "264;616;1104;331", "wc_review": "359;709;1447;435", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 93.0, 55.16792546398677 ], "wc_summary_review_avg": [ 65.75, 51.57215818637029 ], "wc_main_review_avg": [ 578.75, 330.80309475577764 ], "wc_review_avg": [ 737.5, 429.81711226985834 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 
], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5304146056512185501&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;2;1", "aff_unique_norm": "University of Toronto;LG;Unknown Institution", "aff_unique_dep": ";LG AI Research;Department of Computer Science", "aff_unique_url": "https://www.utoronto.ca;https://www.lgaires.com;", "aff_unique_abbr": "U of T;LG AI;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Canada;South Korea;" }, { "id": "p4H9QlbJvx", "title": "Rethinking Again the Value of Network Pruning -- A Dynamical Isometry Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "Several recent works questioned the value of inheriting weight in structured neural network pruning because they empirically found training from scratch can match or even outperform finetuning a pruned model. In this paper, we present evidences that this argument is actually \\emph{inaccurate} because of using improperly small finetuning learning rates. With larger learning rates, our results consistently suggest pruning outperforms training from scratch on multiple networks (ResNets, VGG11) and datasets (MNIST, CIFAR10, ImageNet) over most pruning ratios. To deeply understand why finetuning learning rate holds such a critical role, we examine the theoretical reason behind through the lens of \\emph{dynamical isometry}, a nice property of networks that can make the gradient signals preserve norm during propagation. Our results suggest that weight removal in pruning breaks dynamical isometry, \\emph{which fundamentally answers for the performance gap between a large finetuning LR and~a small one}. Therefore, it is necessary to recover the dynamical isometry before finetuning. 
In this regard, we also present a regularization-based technique to do so, which is rather simple-to-implement yet effective in dynamical isometry recovery on modern residual convolutional neural networks.", "keywords": "neural network pruning;dynamical isometry;model compression;filter pruning", "primary_area": "", "supplementary_material": "", "author": "Huan Wang;Can Qin;Yue Bai;Yun Fu", "authorids": "~Huan_Wang3;~Can_Qin1;~Yue_Bai1;~Yun_Fu1", "gender": "M;M;M;M", "homepage": "https://huanwang.tech/;http://canqin.tech;https://yueb17.github.io/;http://www1.ece.neu.edu/~yunfu/", "dblp": "70/6155-14;214/2488;119/0848;00/5815-1", "google_scholar": "0-On0y4AAAAJ;QCik-YcAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=h-JEcQ8AAAAJ", "orcid": "0000-0001-6951-901X;;;0000-0002-5098-2853", "linkedin": "huanwang-zju/;;;furaymond/", "or_profile": "~Huan_Wang3;~Can_Qin1;~Yue_Bai1;~Yun_Fu1", "aff": "Northeastern University;Northeastern University;Northeastern University;Northeastern University", "aff_domain": "neu.edu;neu.edu;neu.edu;northeastern.edu", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\nwang2022rethinking,\ntitle={Rethinking Again the Value of Network Pruning -- A Dynamical Isometry Perspective},\nauthor={Huan Wang and Can Qin and Yue Bai and Yun Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=p4H9QlbJvx}\n}", "github": "", "project": "", "reviewers": "nWUS;YYie;VvbL;bhQQ", "site": "https://openreview.net/forum?id=p4H9QlbJvx", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;4;3;5", "correctness": "2;2;3;3", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "42;90;122;127", "wc_summary_review": "11;145;59;51", "wc_main_review": "200;723;418;212", "wc_review": "253;958;599;390", "wc_reply_reviewers": "204;566;166;0", "wc_reply_authors": "1609;3010;2207;1456", "reply_reviewers": "1;2;1;0", "reply_authors": "2;6;5;2", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.25, 33.862774546690645 ], "wc_summary_review_avg": [ 66.5, 48.833902158234295 ], "wc_main_review_avg": [ 388.25, 211.8045974477419 ], "wc_review_avg": [ 550.0, 265.8354754354655 ], "wc_reply_reviewers_avg": [ 234.0, 206.46065000382035 ], "wc_reply_authors_avg": [ 2070.5, 610.7137218042509 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.75, 1.7853571071357126 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5940885257860046, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Wn-WGScGO3oJ:scholar.google.com/&scioq=Rethinking+Again+the+Value+of+Network+Pruning+--+A+Dynamical+Isometry+Perspective&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "p7LSrQ3AADp", "title": "Beyond Faithfulness: A Framework to Characterize and Compare Saliency Methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "Saliency 
methods calculate how important each input feature is to a machine learning model\u2019s prediction, and are commonly used to understand model reasoning. \u201cFaithfulness,\u201d or how fully and accurately the saliency output reflects the underlying model, is an oft-cited desideratum for these methods. However, explanation methods must necessarily sacrifice certain information in service of user-oriented goals such as simplicity. To that end, and akin to performance metrics, we frame saliency methods as abstractions: individual tools that provide insight into specific aspects of model behavior and entail tradeoffs. Using this framing, we describe a framework of nine dimensions to characterize and compare the properties of saliency methods. We group these dimensions into three categories that map to different phases of the interpretation process: methodology, or how the saliency is calculated; sensitivity, or relationships between the saliency result and the underlying model or input; and, perceptibility, or how a user interprets the result. As we show, these dimensions give us a granular vocabulary for describing and comparing saliency methods \u2014 for instance, allowing us to develop \u201csaliency cards\u201d as a form of documentation, or helping downstream users understand tradeoffs and choose a method for a particular use case. Moreover, by situating existing saliency methods within this framework, we identify opportunities for future work, including filling gaps in the landscape and developing new evaluation metrics.\n", "keywords": "saliency methods;interpretability;faithfulness;explainability;attribution;feature importance", "primary_area": "", "supplementary_material": "", "author": "Angie Boggust;Harini Suresh;Hendrik Strobelt;John Guttag;Arvind Satyanarayan", "authorids": "~Angie_Boggust1;~Harini_Suresh1;~Hendrik_Strobelt1;~John_Guttag2;~Arvind_Satyanarayan1", "gender": "F;M;M;;F", "homepage": "https://harinisuresh.com;http://hendrik.strobelt.com;https://people.csail.mit.edu/guttag/;http://vis.csail.mit.edu;http://angieboggust.com", "dblp": ";67/7527;g/JohnVGuttag;;255/4773", "google_scholar": ";H4vEe_oAAAAJ;;;pQd1HSK5lzEC", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Harini_Suresh1;~Hendrik_Strobelt1;~John_Guttag2;~Arvind_Satyanarayan1;~Angie_Wynne_Boggust1", "aff": "Massachusetts Institute of Technology;International Business Machines;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Computer Science and Artificial Intelligence Laboratory, Electrical Engineering & Computer Science", "aff_domain": "mit.edu;ibm.com;mit.edu;mit.edu;csail.mit.edu", "position": "PhD student;Principal Researcher;Full Professor;Assistant Professor;PhD student", "bibtex": "@misc{\nboggust2022beyond,\ntitle={Beyond Faithfulness: A Framework to Characterize and Compare Saliency Methods},\nauthor={Angie Boggust and Harini Suresh and Hendrik Strobelt and John Guttag and Arvind Satyanarayan},\nyear={2022},\nurl={https://openreview.net/forum?id=p7LSrQ3AADp}\n}", "github": "", "project": "", "reviewers": "2CJF;DxGz;JNYz", "site": "https://openreview.net/forum?id=p7LSrQ3AADp", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;5;5", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;0", "wc_summary_paper": "86;58;110", "wc_summary_review": "74;31;58", "wc_main_review": "162;185;322", "wc_review": "322;274;490", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "568;658;167", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", 
"recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 84.66666666666667, 21.249836600678968 ], "wc_summary_review_avg": [ 54.333333333333336, 17.745108872274887 ], "wc_main_review_avg": [ 223.0, 70.63049388661152 ], "wc_review_avg": [ 362.0, 92.6066952223218 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 464.3333333333333, 213.4327893168141 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8029550685469661, "corr_recommendation_correctness": 0.9176629354822472, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13686386807018183154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com", "aff_unique_abbr": "MIT;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Discrepancy-Based Active Learning for Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6378", "id": "p98WJxUC3Ca", "poster": "", "openreview": "https://openreview.net/forum?id=p98WJxUC3Ca", "slides": "https://iclr.cc/virtual/2022/poster/6378", "video": "https://iclr.cc/virtual/2022/poster/6378", "author_site": "Antoine De mathelin, Fran\u00e7ois Deheeger, Mathilde MOUGEOT, Nicolas Vayatis", "tldr": "", "abstract": "The goal of the paper is to design active learning strategies which lead to domain adaptation under an assumption of Lipschitz functions. Building on previous work by Mansour et al. (2009) we adapt the concept of discrepancy distance between source and target distributions to restrict the maximization over the hypothesis class to a localized class of functions which are performing accurate labeling on the source domain. We derive generalization error bounds for such active learning strategies in terms of Rademacher average and localized discrepancy for general loss functions which satisfy a regularity condition. A practical K-medoids algorithm that can address the case of large data set is inferred from the theoretical bounds. 
Our numerical experiments show that the proposed algorithm is competitive against other state-of-the-art active learning techniques in the context of domain adaptation, in particular on large data sets of around one hundred thousand images.", "keywords": "active learning;domain adaptation;discrepancy;kmedoids;single batch;covariate shift", "primary_area": "", "supplementary_material": "", "author": "Antoine de Mathelin;Fran\u00e7ois Deheeger;Mathilde MOUGEOT;Nicolas Vayatis", "authorids": "~Antoine_de_Mathelin2;~Fran\u00e7ois_Deheeger1;~Mathilde_MOUGEOT1;~Nicolas_Vayatis1", "gender": "M;F;M;", "homepage": "https://www.linkedin.com/in/fran%C3%A7ois-deheeger-88370054/?originalSubdomain=fr;https://sites.google.com/site/mougeotmathilde/;;", "dblp": ";59/6349;00/582;", "google_scholar": ";j2cJzNAAAAAJ;;https://scholar.google.fr/citations?user=h79bffAAAAAJ", "orcid": ";0009-0009-6346-4519;;", "linkedin": ";mathilde-mougeot-bb5a8a24/?originalSubdomain=fr;;", "or_profile": "~Fran\u00e7ois_Deheeger1;~Mathilde_MOUGEOT1;~Nicolas_Vayatis1;~Antoine_De_mathelin1", "aff": ";ENSIIE;Ecole Normale Superieure Paris-Saclay;CMLA - ENS Paris Saclay, ENS Paris-Saclay", "aff_domain": ";ensiie.fr;ens-paris-saclay.fr;cmla.ens-cachan.fr", "position": ";Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nmathelin2022discrepancybased,\ntitle={Discrepancy-Based Active Learning for Domain Adaptation},\nauthor={Antoine De mathelin and Fran{\\c{c}}ois Deheeger and Mathilde MOUGEOT and Nicolas Vayatis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=p98WJxUC3Ca}\n}", "github": "", "project": "", "reviewers": "Xjwo;Nd1E;bRBm;jrGL", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;4;3", "correctness": "4;4;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "84;84;140;280", "wc_summary_review": "18;35;35;43", "wc_main_review": "349;178;468;513", "wc_review": "451;297;643;836", "wc_reply_reviewers": "0;13;20;0", "wc_reply_authors": "389;288;791;807", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 147.0, 80.11866199581718 ], "wc_summary_review_avg": [ 32.75, 9.12071817347735 ], "wc_main_review_avg": [ 377.0, 129.57816174031797 ], "wc_review_avg": [ 556.75, 202.52947316378425 ], "wc_reply_reviewers_avg": [ 8.25, 8.613216588476108 ], "wc_reply_authors_avg": [ 568.75, 233.0712069304143 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=953867872378953329&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=p98WJxUC3Ca", "email": ";ensiie.fr;ens-paris-saclay.fr;cmla.ens-cachan.fr", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "\u00c9cole Nationale Sup\u00e9rieure d'Informatique pour l'Ing\u00e9nierie et les Techniques Avanc\u00e9es;Ecole Normale Superieure Paris-Saclay;\u00c9cole Normale Sup\u00e9rieure Paris-Saclay", "aff_unique_dep": ";;CMLA", "aff_unique_url": "https://www.ensiie.fr;https://www.ensparis-saclay.fr;https://www.ens-paris-saclay.fr", "aff_unique_abbr": "ENSIIE;ENS 
Paris-Saclay;ENS Paris-Saclay", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Paris-Saclay", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "id": "pC00NfsvnSK", "title": "Improving zero-shot generalization in offline reinforcement learning using generalized similarity functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement learning (RL) agents are widely used for solving complex sequential decision making tasks, but still exhibit difficulty in generalizing to scenarios not seen during training. While prior online approaches demonstrated that using additional signals beyond the reward function can lead to better generalization capabilities in RL agents, i.e. using self-supervised learning (SSL), they struggle in the offline RL setting, i.e. learning from a static dataset. We show that the performance of online algorithms for generalization in RL can be hindered in the offline setting due to poor estimation of similarity between observations. We propose a new theoretically-motivated framework called Generalized Similarity Functions (GSF), which uses contrastive learning to train an offline RL agent to aggregate observations based on the similarity of their expected future behavior, where we quantify this similarity using generalized value functions. We show that GSF is general enough to recover existing SSL objectives while also improving zero-shot generalization performance on a complex offline RL benchmark, offline Procgen.", "keywords": "reinforcement learning;representation learning;self-supervised learning;offline RL;generalized value function;generalization", "primary_area": "", "supplementary_material": "/attachment/b0e37476e5cd8ecde791db3be2e861ff5cedbed9.zip", "author": "Bogdan Mazoure;Ilya Kostrikov;Ofir Nachum;Jonathan Tompson", "authorids": "~Bogdan_Mazoure1;~Ilya_Kostrikov1;~Ofir_Nachum1;~Jonathan_Tompson1", "gender": "M;M;M;M", "homepage": "https://bmazoure.github.io;;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;http://jonathantompson.com", "dblp": ";https://dblp.org/pers/k/Kostrikov:Ilya.html;;139/0769", "google_scholar": "https://scholar.google.ca/citations?user=NaxShlcAAAAJ;PTS2AOgAAAAJ;C-ZlBWMAAAAJ;U_Jw8DUAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Bogdan_Mazoure1;~Ilya_Kostrikov1;~Ofir_Nachum1;~Jonathan_Tompson1", "aff": "McGill University, McGill University;University of California, Berkeley;OpenAI;Google DeepMind", "aff_domain": "mail.mcgill.ca;berkeley.edu;openai.com;google.com", "position": "PhD student;Postdoc;Researcher;Researcher", "bibtex": "@misc{\nmazoure2022improving,\ntitle={Improving zero-shot generalization in offline reinforcement learning using generalized similarity functions},\nauthor={Bogdan Mazoure and Ilya Kostrikov and Ofir Nachum and Jonathan Tompson},\nyear={2022},\nurl={https://openreview.net/forum?id=pC00NfsvnSK}\n}", "github": "", "project": "", "reviewers": "f74c;8hYV;nnNd;vyyv", "site": "https://openreview.net/forum?id=pC00NfsvnSK", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "2;3;4;4", "correctness": "3;2;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "78;27;110;60", "wc_summary_review": "30;43;96;8", "wc_main_review": "348;653;156;219", "wc_review": "456;723;362;287", "wc_reply_reviewers": "0;0;0;106", "wc_reply_authors": "486;1155;278;394", "reply_reviewers": "0;0;0;1", "reply_authors": "1;3;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], 
"correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 30.028111828751403 ], "wc_summary_review_avg": [ 44.25, 32.39116391857508 ], "wc_main_review_avg": [ 344.0, 191.35438327877415 ], "wc_review_avg": [ 457.0, 164.83476575043264 ], "wc_reply_reviewers_avg": [ 26.5, 45.89934640057525 ], "wc_reply_authors_avg": [ 578.25, 341.0457264062988 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9045340337332909, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10013850786811575699&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "McGill University;University of California, Berkeley;OpenAI;Google", "aff_unique_dep": ";;;Google DeepMind", "aff_unique_url": "https://www.mcgill.ca;https://www.berkeley.edu;https://openai.com;https://deepmind.com", "aff_unique_abbr": "McGill;UC Berkeley;OpenAI;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "Canada;United States;United Kingdom" }, { "title": "Disentanglement Analysis with Partial Information Decomposition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6465", "id": "pETy-HVvGtt", "poster": "", "openreview": "https://openreview.net/forum?id=pETy-HVvGtt", "slides": "https://iclr.cc/virtual/2022/poster/6465", "video": "https://iclr.cc/virtual/2022/poster/6465", "author_site": "Seiya Tokui, Issei Sato", "tldr": "", "abstract": "We propose a framework to analyze how multivariate representations disentangle ground-truth generative factors. A quantitative analysis of disentanglement has been based on metrics designed to compare how one variable explains each generative factor. Current metrics, however, may fail to detect entanglement that involves more than two variables, e.g., representations that duplicate and rotate generative factors in high dimensional spaces. In this work, we establish a framework to analyze information sharing in a multivariate representation with Partial Information Decomposition and propose a new disentanglement metric. This framework enables us to understand disentanglement in terms of uniqueness, redundancy, and synergy. We develop an experimental protocol to assess how increasingly entangled representations are evaluated with each metric and confirm that the proposed metric correctly responds to entanglement. 
Through experiments on variational autoencoders, we find that models with similar disentanglement scores have a variety of characteristics in entanglement, for each of which a distinct strategy may be required to obtain a disentangled representation.", "keywords": "disentangled representations;variational autoencoders;deep generative models", "primary_area": "", "supplementary_material": "", "author": "Seiya Tokui;Issei Sato", "authorids": "~Seiya_Tokui1;sato@g.ecc.u-tokyo.ac.jp", "gender": "M;", "homepage": "https://www.beam2d.net/;", "dblp": "162/3213;", "google_scholar": "qOkcVXMAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Seiya_Tokui1;sato@g.ecc.u-tokyo.ac.jp", "aff": "The University of Tokyo;", "aff_domain": "u-tokyo.ac.jp;", "position": "PhD student;", "bibtex": "@inproceedings{\ntokui2022disentanglement,\ntitle={Disentanglement Analysis with Partial Information Decomposition},\nauthor={Seiya Tokui and Issei Sato},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pETy-HVvGtt}\n}", "github": "", "project": "", "reviewers": "ezka;kftq;mDLP;25MQ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "28;71;147;104", "wc_summary_review": "586;39;44;54", "wc_main_review": "67;635;402;276", "wc_review": "681;745;593;434", "wc_reply_reviewers": "0;216;166;0", "wc_reply_authors": "1279;805;1202;217", "reply_reviewers": "0;1;1;0", "reply_authors": "2;3;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 87.5, 43.660622991432454 ], "wc_summary_review_avg": [ 180.75, 234.03351789861213 ], "wc_main_review_avg": [ 345.0, 205.78751176881457 ], "wc_review_avg": [ 613.25, 116.71412725116014 ], "wc_reply_reviewers_avg": [ 95.5, 97.12234552357145 ], "wc_reply_authors_avg": [ 875.75, 420.71568487518977 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3244428422615251, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4815922564452747765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=pETy-HVvGtt", "email": "u-tokyo.ac.jp;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "IntSGD: Adaptive Floatless Compression of Stochastic Gradients", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6683", "id": "pFyXqxChZc", "poster": "", "openreview": "https://openreview.net/forum?id=pFyXqxChZc", "slides": "https://iclr.cc/virtual/2022/poster/6683", "video": "https://iclr.cc/virtual/2022/poster/6683", "author_site": "Konstantin Mishchenko, Bokun Wang, Dmitry Kovalev, Peter Richtarik", "tldr": "", "abstract": "We propose a family of adaptive integer compression operators for distributed Stochastic Gradient Descent (SGD) that do not communicate a single float. 
This is achieved by multiplying floating-point vectors with a number known to every device and then rounding to integers. In contrast to the prior work on integer compression for SwitchML by (Sapio et al., 2021), our IntSGD method is provably convergent and computationally cheaper as it estimates the scaling of vectors adaptively. Our theory shows that the iteration complexity of IntSGD matches that of SGD up to constant factors for both convex and non-convex, smooth and non-smooth functions, with and without overparameterization. Moreover, our algorithm can also be tailored for the popular all-reduce primitive and shows promising empirical performance.", "keywords": "optimization;distributed optimization;compression;theory;parallel training;switchML", "primary_area": "", "supplementary_material": "/attachment/52186daf95dd017b7b6b9878730dd61a8bf0ebfc.zip", "author": "Konstantin Mishchenko;Bokun Wang;Dmitry Kovalev;Peter Richt\u00e1rik", "authorids": "~Konstantin_Mishchenko1;~Bokun_Wang2;~Dmitry_Kovalev2;~Peter_Richt\u00e1rik1", "gender": ";M;M;M", "homepage": "https://konstmish.com/;https://bokun-wang.github.io/;https://www.dmitry-kovalev.com;https://richtarik.org", "dblp": "222/9853;207/1922;136/8468.html;62/8001", "google_scholar": "Z8Y8nhQAAAAJ;H9GqvAYAAAAJ;qHFA5z4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-1467-2994;0000-0003-4380-5848", "linkedin": ";;;richtarik/", "or_profile": "~Konstantin_Mishchenko1;~Bokun_Wang2;~Dmitry_Kovalev2;~Peter_Richtarik1", "aff": "INRIA;University of Iowa;KAUST;King Abdullah University of Science and Technology (KAUST)", "aff_domain": "inria.fr;uiowa.edu;kaust.edu.sa;kaust.edu.sa", "position": "Postdoc;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nmishchenko2022intsgd,\ntitle={Int{SGD}: Adaptive Floatless Compression of Stochastic Gradients},\nauthor={Konstantin Mishchenko and Bokun Wang and Dmitry Kovalev and Peter Richt{\\'a}rik},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pFyXqxChZc}\n}", "github": "", "project": "", "reviewers": "GF5a;7fYz;q8za", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;3;2", "correctness": "3;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "136;5;73", "wc_summary_review": "35;41;48", "wc_main_review": "251;172;220", "wc_review": "422;218;341", "wc_reply_reviewers": "29;0;36", "wc_reply_authors": "1124;11;325", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 71.33333333333333, 53.493509471295255 ], "wc_summary_review_avg": [ 41.333333333333336, 5.312459150169743 ], "wc_main_review_avg": [ 214.33333333333334, 32.49957264676294 ], "wc_review_avg": [ 327.0, 83.86894538504703 ], "wc_reply_reviewers_avg": [ 21.666666666666668, 15.584892970081281 ], "wc_reply_authors_avg": [ 486.6666666666667, 468.53981213505807 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 24, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=16969044896100418296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=pFyXqxChZc", "email": "inria.fr;uiowa.edu;kaust.edu.sa;kaust.edu.sa", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "INRIA;University of Iowa;King Abdullah University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;https://www.uiowa.edu;https://www.kaust.edu.sa", "aff_unique_abbr": "INRIA;UIowa;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "France;United States;Saudi Arabia" }, { "id": "pIjvdJ_QUYv", "title": "Practical and Private Heterogeneous Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Heterogeneous federated learning (HFL) enables clients with different computation/communication capabilities to collaboratively train their own customized models, in which the knowledge of models is shared via clients' predictions on a public dataset. However, there are two major limitations: 1) The assumption of public datasets may be unrealistic for data-critical scenarios such as Healthcare and Finance. 2) HFL is vulnerable to various privacy violations since the samples and predictions are completely exposed to adversaries. In this work, we develop PrivHFL, a general and practical framework for privacy-preserving HFL. We bypass the limitations of public datasets by designing a simple yet effective dataset expansion method. The main insight is that expanded data could provide good coverage of natural distributions, which is conducive to the sharing of model knowledge. To further tackle the privacy issue, we exploit the lightweight additive secret sharing technique to construct a series of tailored cryptographic protocols for key building blocks such as secure prediction. Our protocols implement ciphertext operations through simple vectorized computations, which are friendly with GPUs and can be processed by highly-optimized CUDA kernels. 
Extensive evaluations demonstrate that PrivHFL outperforms prior art by up to two orders of magnitude in efficiency and realizes significant accuracy gains on top of the stand-alone method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanxiao Chen;Meng Hao;Hongwei Li;Guangxiao Niu;Guowen Xu;Huawei Wang;Yuan Zhang;Tianwei Zhang", "authorids": "~Hanxiao_Chen2;~Meng_Hao1;~Hongwei_Li2;~Guangxiao_Niu1;~Guowen_Xu1;~Huawei_Wang2;~Yuan_Zhang14;~Tianwei_Zhang1", "gender": ";;M;;M;M;;M", "homepage": ";;https://faculty.uestc.edu.cn/lihongwei/zh_CN/index.htm;;https://guowen-xu.github.io/;https://github.com/greatdreams;;https://personal.ntu.edu.sg/tianwei.zhang/index.html", "dblp": "255/5224;;39/5544-1;;87/10142;;;77/7902-4", "google_scholar": ";;-o6u2gwAAAAJ;;https://scholar.google.com.hk/citations?user=MDKdG80AAAAJ;;7rWSrzsAAAAJ;9vpiYDIAAAAJ", "orcid": "0000-0003-0136-073X;;;0000-0002-8156-9869;0000-0002-9764-9345;;;", "linkedin": ";;;;guowen-xu-92b7201b1/?originalSubdomain=hk;;;", "or_profile": "~Hanxiao_Chen2;~Meng_Hao1;~Hongwei_Li2;~Guangxiao_Niu1;~Guowen_Xu1;~Huawei_Wang2;~Yuan_Zhang14;~Tianwei_Zhang1", "aff": "University of Electronic Science and Technology of China;;University of Electronic Science and Technology of China, Tsinghua University;University of Electronic Science and Technology of China;Nanyang Technological University;Beijing University of Post and Telecommunication;University of Electronic Science and Technology of China;Nanyang Technological University", "aff_domain": "uestc.edu.cn;;uestc.edu.cn;uestc.edu.cn;ntu.edu.sg;bupt.edu.cn;uestc.edu.cn;ntu.edu.sg", "position": "PhD student;;Full Professor;MS student;Postdoc;Postdoc;Associate Professor;Assistant Professor", "bibtex": "@misc{\nchen2022practical,\ntitle={Practical and Private Heterogeneous Federated Learning},\nauthor={Hanxiao Chen and Meng Hao and Hongwei Li and Guangxiao Niu and Guowen Xu and Huawei Wang and Yuan Zhang and Tianwei Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=pIjvdJ_QUYv}\n}", "github": "", "project": "", "reviewers": "kXk2;3JAx;ZGSK", "site": "https://openreview.net/forum?id=pIjvdJ_QUYv", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;3;2", "correctness": "2;3;2", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "85;66;83", "wc_summary_review": "67;47;19", "wc_main_review": "966;201;75", "wc_review": "1118;314;177", "wc_reply_reviewers": "523;36;46", "wc_reply_authors": "3035;1137;613", "reply_reviewers": "3;1;1", "reply_authors": "7;4;3", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.0, 8.524474568362947 ], "wc_summary_review_avg": [ 44.333333333333336, 19.68643074697787 ], "wc_main_review_avg": [ 414.0, 393.6978536898569 ], "wc_review_avg": [ 536.3333333333334, 415.0858010366317 ], "wc_reply_reviewers_avg": [ 201.66666666666666, 227.25365172472416 ], "wc_reply_authors_avg": [ 1595.0, 1040.4627175764958 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 4.666666666666667, 1.699673171197595 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 0, 
"gs_cited_by_link": "https://scholar.google.com/scholar?q=related:skhBeVCYqfAJ:scholar.google.com/&scioq=Practical+and+Private+Heterogeneous+Federated+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2;0;1", "aff_unique_norm": "University of Electronic Science and Technology of China;Nanyang Technological University;Beijing University of Posts and Telecommunications", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.ntu.edu.sg;http://www.bupt.edu.cn/", "aff_unique_abbr": "UESTC;NTU;BUPT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;1;0;0;1", "aff_country_unique": "China;Singapore" }, { "id": "pJAwaNEexRV", "title": "Gradient Assisted Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In distributed settings, collaborations between different entities, such as financial institutions, medical centers, and retail markets, are crucial to providing improved service and performance. However, the underlying entities may have little interest in sharing their private data, proprietary models, and objective functions. These privacy requirements have created new challenges for collaboration. In this work, we propose Gradient Assisted Learning (GAL), a new method for various entities to assist each other in supervised learning tasks without sharing data, models, and objective functions. In this framework, all participants collaboratively optimize the aggregate of local loss functions, and each participant autonomously builds its own model by iteratively fitting the gradients of the objective function. Experimental studies demonstrate that Gradient Assisted Learning can achieve performance close to centralized learning when all data, models, and objective functions are fully disclosed.", "keywords": "Multi-Organization Learning;Distributed Machine Learning;Machine Learning Applications", "primary_area": "", "supplementary_material": "", "author": "Enmao Diao;Jie Ding;Vahid Tarokh", "authorids": "~Enmao_Diao1;~Jie_Ding2;~Vahid_Tarokh1", "gender": "M;M;", "homepage": "https://diaoenmao.com/;http://jding.org;", "dblp": "226/5549;94/1825-2;", "google_scholar": "jhVVjF4AAAAJ;ZyqvoqcAAAAJ;", "orcid": "0000-0002-9151-7990;;", "linkedin": "enmaodiao/;;", "or_profile": "~Enmao_Diao1;~Jie_Ding2;~Vahid_Tarokh1", "aff": "Duke University;University of Minnesota, Minneapolis;", "aff_domain": "duke.edu;umn.edu;", "position": "PhD student;Assistant Professor;", "bibtex": "@misc{\ndiao2022gradient,\ntitle={Gradient Assisted Learning},\nauthor={Enmao Diao and Jie Ding and Vahid Tarokh},\nyear={2022},\nurl={https://openreview.net/forum?id=pJAwaNEexRV}\n}", "github": "", "project": "", "reviewers": "bj7h;2f2X;LJjS;eaJk", "site": "https://openreview.net/forum?id=pJAwaNEexRV", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;2;4;3", "correctness": "4;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "179;52;91;65", "wc_summary_review": "87;13;70;50", "wc_main_review": "494;223;413;224", "wc_review": "760;288;574;339", "wc_reply_reviewers": "656;75;0;119", "wc_reply_authors": "2499;857;907;1017", "reply_reviewers": "2;1;0;1", "reply_authors": "6;3;3;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 96.75, 
49.51956683978567 ], "wc_summary_review_avg": [ 55.0, 27.55902755904134 ], "wc_main_review_avg": [ 338.5, 118.51265755184127 ], "wc_review_avg": [ 490.25, 189.4470572481927 ], "wc_reply_reviewers_avg": [ 212.5, 259.56550233033664 ], "wc_reply_authors_avg": [ 1320.0, 683.1522524298665 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.5, 1.5 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14106789366775612982&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Duke University;University of Minnesota", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://www.minnesota.edu", "aff_unique_abbr": "Duke;UMN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "pLNLdHrZmcX", "title": "SANE: Specialization-Aware Neural Network Ensemble", "track": "main", "status": "Reject", "tldr": "", "abstract": "Real-world data is often generated by some complex distribution, which can be approximated by a composition of multiple simpler distributions. Thus, it is intuitive to divide the learning of a complex model into training several simpler models, each of which specializes in one simple distribution. Ensemble learning is one way to realize specialization, and has been widely used in practical machine learning scenarios. Many ensemble methods propose to increase the diversity of base models, which could potentially result in model specialization. However, our studies show that without explicitly enforcing specialization, pursuing diversity may not be enough to achieve satisfactory ensemble performance. In this paper, we propose SANE --- an end-to-end ensemble learning method that actively enforces model specialization, where base models are trained to specialize in sub-regions of a latent space representing the simple distribution composition, and aggregated based on their specialties. 
Experiments in several prediction tasks on both image datasets and tabular datasets demonstrate the superior performance of our proposed method over state-of-the-art ensemble methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyue Li;Kan Ren;XINYANG JIANG;Mingzhe Han;Haipeng Zhang;Dongsheng Li", "authorids": "~Ziyue_Li1;~Kan_Ren1;~XINYANG_JIANG2;~Mingzhe_Han1;~Haipeng_Zhang3;~Dongsheng_Li2", "gender": "F;M;M;M;M;M", "homepage": "https://litzy0619.github.io/;https://saying.ren;;;https://faculty.sist.shanghaitech.edu.cn/zhanghp/;http://recmind.cn", "dblp": ";28/7458;155/6316;;;254/0830-2.html", "google_scholar": "NQVzCSkAAAAJ;USnQVWgAAAAJ;JiTfWVMAAAAJ;;377DmKgAAAAJ;VNg5rA8AAAAJ", "orcid": ";;;;;0000-0003-3103-8442", "linkedin": "litzyli/;;xinyang-jiang-ab5416b0/;Mingzhe-Han/;;", "or_profile": "~Ziyue_Li1;~Kan_Ren1;~XINYANG_JIANG2;~Mingzhe_Han1;~Haipeng_Zhang3;~Dongsheng_Li2", "aff": "ShanghaiTech University;Microsoft;Microsoft;Shanghai Jiaotong University;ShanghaiTech University;Microsoft Research Asia", "aff_domain": "shanghaitech.edu.cn;microsoft.com;microsoft.com;sjtu.edu.cn;shanghaitech.edu.cn;microsoft.com", "position": "MS student;Researcher;Senior Researcher;MS student;Assistant Professor;Principal Researcher", "bibtex": "@misc{\nli2022sane,\ntitle={{SANE}: Specialization-Aware Neural Network Ensemble},\nauthor={Ziyue Li and Kan Ren and XINYANG JIANG and Mingzhe Han and Haipeng Zhang and Dongsheng Li},\nyear={2022},\nurl={https://openreview.net/forum?id=pLNLdHrZmcX}\n}", "github": "", "project": "", "reviewers": "G9Xj;vevN;y2rX;etDH", "site": "https://openreview.net/forum?id=pLNLdHrZmcX", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;4;5", "correctness": "3;4;4;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;1", "wc_summary_paper": "20;36;117;44", "wc_summary_review": "71;55;38;57", "wc_main_review": "23;186;191;388", "wc_review": "114;277;346;489", "wc_reply_reviewers": "0;77;0;188", "wc_reply_authors": "782;1012;350;1733", "reply_reviewers": "0;2;0;1", "reply_authors": "2;3;1;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 54.25, 37.24496610281717 ], "wc_summary_review_avg": [ 55.25, 11.712706775122479 ], "wc_main_review_avg": [ 197.0, 129.3387026376869 ], "wc_review_avg": [ 306.5, 134.90088954488033 ], "wc_reply_reviewers_avg": [ 66.25, 77.00121752284181 ], "wc_reply_authors_avg": [ 969.25, 500.917845060445 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.9045340337332909, "corr_recommendation_correctness": -0.30151134457776363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0UFhV28wYz0J:scholar.google.com/&scioq=SANE:+Specialization-Aware+Neural+Network+Ensemble&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;0;1", "aff_unique_norm": "ShanghaiTech University;Microsoft;Shanghai Jiao Tong University", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.microsoft.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "ShanghaiTech;Microsoft;SJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;0;0;0", 
"aff_country_unique": "China;United States" }, { "title": "Language modeling via stochastic processes", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5950", "id": "pMQwKL1yctf", "poster": "", "openreview": "https://openreview.net/forum?id=pMQwKL1yctf", "slides": "https://iclr.cc/virtual/2022/poster/5950", "video": "https://iclr.cc/virtual/2022/poster/5950", "author_site": "Rose Wang, Esin Durmus, Noah Goodman, Tatsunori Hashimoto", "tldr": "", "abstract": "Modern language models can generate high-quality short texts. However, they often meander or are incoherent when generating longer texts. These issues arise from the next-token-only language modeling objective. To address these issues, we introduce Time Control (TC), a language model that implicitly plans via a latent stochastic process. TC does this by learning a representation which maps the dynamics of how text changes in a document to the dynamics of a stochastic process of interest. Using this representation, the language model can generate text by first implicitly generating a document plan via a stochastic process, and then generating text that is consistent with this latent plan. Compared to domain-specific methods and fine-tuning GPT2 across a variety of text domains, TC improves performance on text infilling and discourse coherence. On long text generation settings, TC preserves the text structure both in terms of ordering (up to +40% better) and text length consistency (up to +17% better). Human evaluators also prefer TC's output 28.6% more than the baselines.", "keywords": "contrastive learning;language modelling;stochastic processes", "primary_area": "", "supplementary_material": "/attachment/3c726fd45d501529d53d01a306dac4372f0175aa.zip", "author": "Rose E Wang;Esin Durmus;Noah Goodman;Tatsunori Hashimoto", "authorids": "~Rose_E_Wang1;~Esin_Durmus1;~Noah_Goodman1;~Tatsunori_Hashimoto1", "gender": "F;;;M", "homepage": "https://cs.stanford.edu/~rewang;;https://cocolab.stanford.edu/;https://thashim.github.io", "dblp": "259/1500;219/6227;96/1216;", "google_scholar": "V-dlwF4AAAAJ;;OUpIbcQAAAAJ;5ygiTwsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Rose_E_Wang1;~Esin_Durmus1;~Noah_Goodman1;~Tatsunori_Hashimoto1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Postdoc;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2022language,\ntitle={Language modeling via stochastic processes},\nauthor={Rose E Wang and Esin Durmus and Noah Goodman and Tatsunori Hashimoto},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pMQwKL1yctf}\n}", "github": "", "project": "", "reviewers": "HWGg;JgDa;mT85;zz52", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "79;210;71;251", "wc_summary_review": "24;129;29;63", "wc_main_review": "1333;588;290;257", "wc_review": "1436;927;390;571", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1362;1131;122;375", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 152.75, 
79.14030262767511 ], "wc_summary_review_avg": [ 61.25, 41.89495793051952 ], "wc_main_review_avg": [ 617.0, 433.02020738067176 ], "wc_review_avg": [ 831.0, 399.16224771388386 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 747.5, 513.490262809335 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15213113550965798696&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=pMQwKL1yctf", "email": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Contrastive Clustering to Mine Pseudo Parallel Data for Unsupervised Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6424", "id": "pN1JOdrSY9", "poster": "", "openreview": "https://openreview.net/forum?id=pN1JOdrSY9", "slides": "https://iclr.cc/virtual/2022/poster/6424", "video": "https://iclr.cc/virtual/2022/poster/6424", "author_site": "Xuan-Phi Nguyen, Hongyu Gong, Yun Tang, Changhan Wang, Philipp Koehn, Shafiq Joty", "tldr": "", "abstract": "Modern unsupervised machine translation systems mostly train their models by generating synthetic parallel training data from large unlabeled monolingual corpora of different languages through various means, such as iterative back-translation. However, there may exist a small amount of actual parallel data hidden in the sea of unlabeled data, which has not been exploited. We develop a new fine-tuning objective, called Language-Agnostic Constraint for SwAV loss, or LAgSwAV, which enables a pre-trained model to extract such pseudo-parallel data from the monolingual corpora in a fully unsupervised manner. We then propose an effective strategy to utilize the obtained synthetic data to augment unsupervised machine translation. Our method achieves the state of the art in the WMT'14 English-French, WMT'16 German-English and English-Romanian bilingual unsupervised translation tasks, with 40.2, 36.8, 37.0 BLEU, respectively. 
We also achieve substantial improvements in the FLoRes low-resource English-Nepali and English-Sinhala unsupervised tasks with 5.3 and 5.4 BLEU, respectively.\n", "keywords": "machine translation;unsupervised machine translation;pseudo-parallel data;contrastive clustering;pretraining", "primary_area": "", "supplementary_material": "/attachment/e6d6548dfe8e6cd897f796567ff95bc3b925a931.zip", "author": "Xuan-Phi Nguyen;Hongyu Gong;Yun Tang;Changhan Wang;Philipp Koehn;Shafiq Joty", "authorids": "~Xuan-Phi_Nguyen1;~Hongyu_Gong1;~Yun_Tang1;changhan@fb.com;~Philipp_Koehn2;~Shafiq_Joty1", "gender": ";F;M;;M;M", "homepage": ";https://hongyugong.github.io/;;;http://www.cs.jhu.edu/~phi/;https://raihanjoty.github.io/", "dblp": ";163/7318;67/764-2;;84/4538.html;62/2078", "google_scholar": ";Jam1IpgAAAAJ;https://scholar.google.com/citations?hl=en;;OsIZgIYAAAAJ;hR249csAAAAJ", "orcid": ";;;;0000-0003-1565-064X;", "linkedin": ";;;;philipp-koehn-bbb8024/;", "or_profile": "~Xuan-Phi_Nguyen1;~Hongyu_Gong1;~Yun_Tang1;changhan@fb.com;~Philipp_Koehn2;~Shafiq_Joty1", "aff": ";Meta Facebook;Meta AI Research;;Meta;SalesForce.com", "aff_domain": ";fb.com;facebook.com;;meta.com;salesforce.com", "position": ";Researcher;Researcher;;Research Scientist;Principal Researcher", "bibtex": "@inproceedings{\nnguyen2022contrastive,\ntitle={Contrastive Clustering to Mine Pseudo Parallel Data for Unsupervised Translation},\nauthor={Xuan-Phi Nguyen and Hongyu Gong and Yun Tang and Changhan Wang and Philipp Koehn and Shafiq Joty},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pN1JOdrSY9}\n}", "github": "", "project": "", "reviewers": "NzbT;ARKs;3VMA;QiHo", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "4;3;4;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "42;128;116;71", "wc_summary_review": "34;26;69;50", "wc_main_review": "353;226;451;169", "wc_review": "429;380;636;290", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 89.25, 34.57871455100666 ], "wc_summary_review_avg": [ 44.75, 16.452583383772897 ], "wc_main_review_avg": [ 299.75, 109.82571420209385 ], "wc_review_avg": [ 433.75, 126.96530037770162 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3441833484692803239&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=pN1JOdrSY9", "email": ";fb.com;facebook.com;;meta.com;salesforce.com", "author_num": 6, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Meta;Salesforce", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.salesforce.com", "aff_unique_abbr": "Meta;Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "pP9ag2g5f0", "title": "Exploring Complicated Search Spaces with 
Interleaving-Free Sampling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The design of search space plays a crucial role in neural architecture search (NAS). Existing search spaces mostly involve short-distance connections arguably due to the increasing difficulty brought by long-distance ones. This paper systematically studies this problem in the context of super-network optimization, and reveals that the interleaved connections introduce significant noises to the amortized accuracy. Based on the observation, we propose a simple yet effective interleaving-free sampling algorithm to aid the search process, and name our method as IF-NAS. In each iteration, IF-NAS samples a sub-network that does not contain any interleaved connections. We design a difficult search space with a large number of complicatedly connected nodes, $10^{186}\\times$ larger than the DARTS space, on which IF-NAS outperforms other competitors evidently. IF-NAS also generalizes to the known (easier) search spaces including DARTS and GOLD-NAS, the design of which carries great prior knowledge. Our research sheds light on extending the macro structure which is well acknowledged as a major challenge of NAS.", "keywords": "Complicated search spaces;Interleaving-free sampling;Interleaved connections;Degradation", "primary_area": "", "supplementary_material": "", "author": "Yunjie Tian;Lingxi Xie;Jiemin Fang;Qixiang Ye;Jianbin Jiao;Qi Tian", "authorids": "~Yunjie_Tian1;~Lingxi_Xie1;~Jiemin_Fang1;~Qixiang_Ye1;~Jianbin_Jiao1;~Qi_Tian3", "gender": "M;M;M;M;M;M", "homepage": "https://sunsmarterjie.github.io/;http://lingxixie.com/;https://jaminfong.cn;http://people.ucas.ac.cn/~qxye?language=en;http://lamp.ucas.ac.cn/;https://www.qitian1987.com/index.html", "dblp": "270/0554;123/2869;233/1239;06/4335;;78/1467-1.html", "google_scholar": "https://scholar.google.com.hk/citations?user=DuetWVcAAAAJ;EEMm7hwAAAAJ;-JcFoOoAAAAJ;https://scholar.google.com.hk/citations?user=tjEfgsEAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-5103-3748;;;;;0000-0002-7252-5047", "linkedin": ";;;;;", "or_profile": "~Yunjie_Tian1;~Lingxi_Xie1;~Jiemin_Fang1;~Qixiang_Ye1;~Jianbin_Jiao1;~Qi_Tian3", "aff": "University of Chinese Academy of Sciences;Huawei Technologies Ltd.;Huazhong University of Science and Technology;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Huawei Technologies Ltd.", "aff_domain": "ucas.ac.cn;huawei.com;hust.edu.cn;ucas.ac.cn;ucas.ac.cn;huawei.com", "position": "PhD student;Researcher;PhD student;Full Professor;Full Professor;Principal Researcher", "bibtex": "@misc{\ntian2022exploring,\ntitle={Exploring Complicated Search Spaces with Interleaving-Free Sampling},\nauthor={Yunjie Tian and Lingxi Xie and Jiemin Fang and Qixiang Ye and Jianbin Jiao and Qi Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=pP9ag2g5f0}\n}", "github": "", "project": "", "reviewers": "yFWj;bMFo;TLuw;b1we", "site": "https://openreview.net/forum?id=pP9ag2g5f0", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "71;45;81;78", "wc_summary_review": "56;32;34;49", "wc_main_review": "274;119;317;176", "wc_review": "401;196;432;303", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], 
"correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 14.184057952504283 ], "wc_summary_review_avg": [ 42.75, 10.084022015049353 ], "wc_main_review_avg": [ 221.5, 78.18727517953289 ], "wc_review_avg": [ 333.0, 92.32280325033464 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.08084520834544431, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J7PRwntMmVYJ:scholar.google.com/&scioq=Exploring+Complicated+Search+Spaces+with+Interleaving-Free+Sampling&hl=en&as_sdt=0,33", "gs_version_total": 5, "aff_unique_index": "0;1;2;0;0;1", "aff_unique_norm": "University of Chinese Academy of Sciences;Huawei;Huazhong University of Science and Technology", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "http://www.ucas.ac.cn;https://www.huawei.com;http://www.hust.edu.cn", "aff_unique_abbr": "UCAS;Huawei;HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "pQ02Y-onvZA", "title": "$\\sbf{\\delta^2}$-exploration for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Effectively tackling the \\emph{exploration-exploitation dilemma} is still a major challenge in reinforcement learning.\nUncertainty-based exploration strategies developed in the bandit setting could theoretically offer a principled way to trade off exploration and exploitation, but applying them to the general reinforcement learning setting is impractical due to their requirement to represent posterior distributions over models, which is computationally intractable in generic sequential decision tasks.\n\nRecently, \\emph{Sample Average Uncertainty (SAU)} was develop as an alternative method to tackle exploration in bandit problems in a scalable way.\nWhat makes SAU particularly efficient is that it only depends on the value predictions, meaning that it does not need to rely on maintaining model posterior distributions.\nIn this work we propose \\emph{$\\delta^2$-exploration}, an exploration strategy that extends SAU from bandits to the general sequential Reinforcement Learning scenario. 
\nWe empirically study $\\delta^2$-exploration in the tabular as well as in the Deep Q-learning case, proving its strong practical advantage and wide adaptability to complex reward models such as those deployed in modern Reinforcement Learning.", "keywords": "Reinforcement learning;exploration;Q-learning;DQN", "primary_area": "", "supplementary_material": "", "author": "Rong Zhu;Mattia Rigotti", "authorids": "~Rong_Zhu4;~Mattia_Rigotti1", "gender": ";", "homepage": ";http://www.matrig.net", "dblp": ";01/9816", "google_scholar": ";TmHt7CwAAAAJ", "orcid": ";0000-0001-6466-2810", "linkedin": ";", "or_profile": "~Rong_Zhu4;~Mattia_Rigotti1", "aff": ";International Business Machines", "aff_domain": ";ibm.com", "position": ";Researcher", "bibtex": "@misc{\nzhu2022sbfdeltaexploration,\ntitle={\\${\\textbackslash}sbf\\{{\\textbackslash}delta{\\textasciicircum}2\\}\\$-exploration for Reinforcement Learning},\nauthor={Rong Zhu and Mattia Rigotti},\nyear={2022},\nurl={https://openreview.net/forum?id=pQ02Y-onvZA}\n}", "github": "", "project": "", "reviewers": "uzFB;PFBt;o8hT;zyS1", "site": "https://openreview.net/forum?id=pQ02Y-onvZA", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "2;3;3;2", "technical_novelty": "2;1;3;2", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "63;43;79;111", "wc_summary_review": "35;55;78;100", "wc_main_review": "467;339;379;187", "wc_review": "565;437;536;398", "wc_reply_reviewers": "78;0;0;4", "wc_reply_authors": "309;703;352;446", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 74.0, 24.879710609249457 ], "wc_summary_review_avg": [ 67.0, 24.38237068047322 ], "wc_main_review_avg": [ 343.0, 101.27191120937731 ], "wc_review_avg": [ 484.0, 68.68405928597988 ], "wc_reply_reviewers_avg": [ 20.5, 33.237779709240506 ], "wc_reply_authors_avg": [ 452.5, 152.87658421092485 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:82nnHfNFiJwJ:scholar.google.com/&scioq=%24%5Csbf%7B%5Cdelta%5E2%7D%24-exploration+for+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "International Business Machines Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com", "aff_unique_abbr": "IBM", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "pSbqyZRKzbw", "title": "WHICH SAMPLES SHOULD BE LEARNED FIRST\uff1aEASY OR HARD\uff1f", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "An effective weighting scheme for training samples is essential for learning tasks. Numerous weighting schemes have been proposed. Some schemes take the easy-first mode on samples, whereas some others take the hard-first mode. Naturally, an interesting yet realistic question is raised. Which samples should be learned first given a new learning task, easy or hard? To answer this question, three aspects of research are carried out. First, a high-level unified weighted loss is proposed, providing a more comprehensive view for existing schemes. 
Theoretical analysis is subsequently conducted and preliminary conclusions are obtained. Second, a flexible weighting scheme is proposed to overcome the defects of existing schemes. The three modes, namely, easy/medium/hard-first, can be flexibly switched in the proposed scheme. Third, a wide range of experiments are conducted to further compare the weighting schemes in different modes. On the basis of these works, reasonable answers are obtained. Factors including prior knowledge and data characteristics determine which samples should be learned first in a learning task.", "keywords": "Sample weighting;Priority mode;Unified weighting scheme;Focal loss;Self-paced learning", "primary_area": "", "supplementary_material": "/attachment/c9907e70eeae38e2a4992b11c8696260a5521aa8.zip", "author": "Xiaoling Zhou;Ou Wu", "authorids": "~Xiaoling_Zhou2;~Ou_Wu1", "gender": "F;M", "homepage": ";", "dblp": "127/6140.html;64/1258", "google_scholar": "https://scholar.google.com.hk/citations?user=bMAf09gAAAAJ;", "orcid": "0000-0002-7305-1779;", "linkedin": ";", "or_profile": "~Xiaoling_Zhou2;~Ou_Wu1", "aff": "Tianjin University;Tianjin University", "aff_domain": "tju.edu.cn;tju.edu.cn", "position": "MS student;Full Professor", "bibtex": "@misc{\nzhou2022which,\ntitle={{WHICH} {SAMPLES} {SHOULD} {BE} {LEARNED} {FIRST}\uff1a{EASY} {OR} {HARD}\uff1f},\nauthor={Xiaoling Zhou and Ou Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=pSbqyZRKzbw}\n}", "github": "", "project": "", "reviewers": "TLA7;3h4h;iEc4;pqKe", "site": "https://openreview.net/forum?id=pSbqyZRKzbw", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "69;71;71;58", "wc_summary_review": "5;52;22;44", "wc_main_review": "104;303;105;147", "wc_review": "178;426;198;249", "wc_reply_reviewers": "0;124;61;0", "wc_reply_authors": "259;1039;798;317", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.25, 5.402545696243577 ], "wc_summary_review_avg": [ 30.75, 18.48479104561369 ], "wc_main_review_avg": [ 164.75, 81.68345915789806 ], "wc_review_avg": [ 262.75, 97.74296649887397 ], "wc_reply_reviewers_avg": [ 46.25, 51.334077375560184 ], "wc_reply_authors_avg": [ 603.25, 327.2051153328749 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10086075689149222006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Tianjin University", "aff_unique_dep": "", "aff_unique_url": "http://www.tju.edu.cn", "aff_unique_abbr": "TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "pSy3DZV3PGJ", "title": "Safe Multi-Task Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In recent years, Multi-Task Learning (MTL) has attracted much attention due to its good performance in many applications. 
However, many existing MTL models cannot guarantee that their performance is no worse than that of their single-task counterparts on each task. Though this phenomenon has been empirically observed by some works, little work aims to handle the resulting problem, which is formally defined as negative sharing in this paper. To achieve safe multi-task learning where no negative sharing occurs, we propose a Safe Multi-Task Learning (SMTL) model, which consists of a public encoder shared by all the tasks, private encoders, gates, and private decoders. Specifically, each task has a private encoder, a gate, and a private decoder, where the gate learns how to combine the private encoder and public encoder for the downstream private decoder. To reduce the storage cost during the inference stage, a lite version of SMTL is proposed to allow the gate to choose either the public encoder or the corresponding private encoder. Moreover, we propose a variant of SMTL to place all the gates after the decoders of all the tasks. Experiments on several benchmark datasets demonstrate the effectiveness of the proposed methods.", "keywords": "multi-task learning", "primary_area": "", "supplementary_material": "/attachment/a0a6e98e7e5e2fb68c72fdcfd058f0055291d979.zip", "author": "Pengxin Guo;Feiyang Ye;Yu Zhang", "authorids": "~Pengxin_Guo1;~Feiyang_Ye4;~Yu_Zhang3", "gender": "M;M;M", "homepage": "https://pengxin-guo.github.io/;https://feiyang-ye.github.io/;http://cse.sustech.edu.cn/faculty/~zhangy/", "dblp": "258/4931;285/4704;50/671-6", "google_scholar": "v1oYGZQAAAAJ;3EX25cAAAAAJ;https://scholar.google.com.hk/citations?user=jaRS5w4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Pengxin_Guo1;~Feiyang_Ye4;~Yu_Zhang3", "aff": "Southern University of Science and Technology;A*STAR;Southern University of Science and Technology", "aff_domain": "mail.sustech.edu.cn;cfar.a-star.edu.sg;sustc.edu.cn", "position": "MS student;Intern;Associate Professor", "bibtex": "@misc{\nguo2022safe,\ntitle={Safe Multi-Task Learning},\nauthor={Pengxin Guo and Feiyang Ye and Yu Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=pSy3DZV3PGJ}\n}", "github": "", "project": "", "reviewers": "fXJq;6KBB;kYfH;Ny3C;43ho", "site": "https://openreview.net/forum?id=pSy3DZV3PGJ", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "5;5;5;4;3", "correctness": "2;2;4;3;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;1;2;1;2", "wc_summary_paper": "31;62;60;67;53", "wc_summary_review": "42;16;42;41;30", "wc_main_review": "539;296;237;218;358", "wc_review": "612;374;339;326;441", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 4.4, 0.8 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 54.6, 12.626955294131678 ], "wc_summary_review_avg": [ 34.2, 10.166612021711067 ], "wc_main_review_avg": [ 329.6, 115.5968857712006 ], "wc_review_avg": [ 418.4, 104.6949855532728 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6123724356957946, "corr_recommendation_correctness": 0.8728715609439696, "gs_citation": 6, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=7205038268241682904&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Southern University of Science and Technology;Agency for Science, Technology and Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.sustech.edu.cn;https://www.a-star.edu.sg", "aff_unique_abbr": "SUSTech;A*STAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Singapore" }, { "id": "pTJKCACq6tM", "title": "Robust Training in High Dimensions via Block Coordinate Geometric Median Descent", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Geometric median (GM) is a classical method in statistics for achieving a robust estimation of the uncorrupted data; under gross corruption, it achieves the optimal breakdown point of 0.5. However, its computational complexity makes it infeasible for robustifying stochastic gradient descent (SGD) for high-dimensional optimization problems. In this paper, we show that by applying GM to only a judiciously chosen block of coordinates at a time and using a memory mechanism, one can retain the breakdown point of 0.5 for smooth non-convex problems, with non-asymptotic convergence rates comparable to the SGD with GM.\n\nWe validate both the runtime and the robustness of our approach empirically on three neural network settings including ResNet-18 on CIFAR-10 and MLP / LeNet on Fashion-MNIST.", "keywords": "robust;optimization;efficient;geometric median;median;breakdown point", "primary_area": "", "supplementary_material": "", "author": "Anish Acharya;Abolfazl Hashemi;Prateek Jain;sujay sanghavi;Inderjit S Dhillon;ufuk topcu", "authorids": "~Anish_Acharya1;~Abolfazl_Hashemi1;~Prateek_Jain1;~sujay_sanghavi1;~Inderjit_S_Dhillon1;~ufuk_topcu1", "gender": "M;M;M;M;M;Unspecified", "homepage": "https://anishacharya.github.io/;https://abolfazlh.github.io/;http://prateekjain.org;https://sites.utexas.edu/sanghavi;http://www.cs.utexas.edu/users/inderjit/;https://autonomy.oden.utexas.edu/", "dblp": "120/7655.html;176/5595;https://dblp.uni-trier.de/pers/j/Jain_0002:Prateek.html;69/4911.html;d/InderjitSDhillon;12/6659.html", "google_scholar": "https://scholar.google.co.in/citations?hl=en;Se7mocgAAAAJ;qYhRbJoAAAAJ;O-DazBUAAAAJ;xBv5ZfkAAAAJ;jeNGFfQAAAAJ", "orcid": ";0000-0002-8421-4270;;;;0000-0003-0819-9985", "linkedin": "anish-acharya-a98a9383/;abolfazlh;;;inderjit-dhillon-a20888b0/;", "or_profile": "~Anish_Acharya1;~Abolfazl_Hashemi1;~Prateek_Jain1;~sujay_sanghavi1;~Inderjit_S_Dhillon1;~ufuk_topcu1", "aff": "Meta;Purdue University;Google;University of Texas, Austin;University of Texas, Austin;University of Texas, Austin", "aff_domain": "fb.com;purdue.edu;google.com;utexas.edu;utexas.edu;utexas.edu", "position": "Researcher;Assistant Professor;Researcher;Associate Professor;Full Professor;Full Professor", "bibtex": "@misc{\nacharya2022robust,\ntitle={Robust Training in High Dimensions via Block Coordinate Geometric Median Descent},\nauthor={Anish Acharya and Abolfazl Hashemi and Prateek Jain and sujay sanghavi and Inderjit S Dhillon and ufuk topcu},\nyear={2022},\nurl={https://openreview.net/forum?id=pTJKCACq6tM}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=pTJKCACq6tM", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", 
"wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4867031682711093930&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2;3;3;3", "aff_unique_norm": "Meta;Purdue University;Google;University of Texas at Austin", "aff_unique_dep": "Meta Platforms, Inc.;;Google;", "aff_unique_url": "https://meta.com;https://www.purdue.edu;https://www.google.com;https://www.utexas.edu", "aff_unique_abbr": "Meta;Purdue;Google;UT Austin", "aff_campus_unique_index": "1;2;2;2", "aff_campus_unique": ";Mountain View;Austin", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "pVU7Gp7Nq4k", "title": "Representation mitosis in wide neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": " Deep neural networks (DNNs) defy the classical bias-variance trade-off: adding parameters to a DNN that interpolates its training data will typically improve its generalization performance. Explaining the mechanism behind this ``benign overfitting'' in deep networks remains an outstanding challenge. Here, we study the last hidden layer representations of various state-of-the-art convolutional neural networks and find evidence for an underlying mechanism that we call \"representation mitosis\": if the last hidden representation is wide enough, its neurons tend to split into groups which carry identical information, and differ from each other only by a statistically independent noise. Like in a mitosis process, the number of such groups, or \"clones'', increases linearly with the width of the layer, but only if the width is above a critical value. 
We show that a key ingredient to activate mitosis is continuing the training process until the training error is zero ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Diego Doimo;Aldo Glielmo;Sebastian Goldt;Alessandro Laio", "authorids": "~Diego_Doimo1;~Aldo_Glielmo1;~Sebastian_Goldt1;~Alessandro_Laio1", "gender": ";M;M;M", "homepage": ";https://aldoglielmo.github.io/;https://datascience.sissa.it/research-unit/12/theory-of-neural-networks;https://people.sissa.it/~laio/", "dblp": "270/0353;239/6524;234/8941;", "google_scholar": "yu7h58MAAAAJ;ux0SMq4AAAAJ;R06wsMkAAAAJ;https://scholar.google.it/citations?user=ma-T1oEAAAAJ", "orcid": "0000-0002-1553-1504;0000-0002-4737-2878;;", "linkedin": "diego-doimo-84575b158;https://it.linkedin.com/in/aldo-glielmo-a1999764;;", "or_profile": "~Diego_Doimo1;~Aldo_Glielmo1;~Sebastian_Goldt1;~Alessandro_Laio1", "aff": ";Banca d'Italia;SISSA;SISSA/ISAS", "aff_domain": ";bancaditalia.it;sissa.it;sissa.it", "position": ";Researcher;Assistant Professor;Full Professor", "bibtex": "@misc{\ndoimo2022representation,\ntitle={Representation mitosis in wide neural networks},\nauthor={Diego Doimo and Aldo Glielmo and Sebastian Goldt and Alessandro Laio},\nyear={2022},\nurl={https://openreview.net/forum?id=pVU7Gp7Nq4k}\n}", "github": "", "project": "", "reviewers": "7hrx;vhrZ;YoAs;KFV6", "site": "https://openreview.net/forum?id=pVU7Gp7Nq4k", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;2;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;3;1;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "55;102;151;139", "wc_summary_review": "26;61;155;272", "wc_main_review": "477;193;578;1392", "wc_review": "558;356;884;1803", "wc_reply_reviewers": "0;0;0;456", "wc_reply_authors": "747;576;884;1628", "reply_reviewers": "0;0;0;1", "reply_authors": "2;2;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 111.75, 37.41239767777521 ], "wc_summary_review_avg": [ 128.5, 95.33755818144284 ], "wc_main_review_avg": [ 660.0, 445.5687376825264 ], "wc_review_avg": [ 900.25, 554.2032005501231 ], "wc_reply_reviewers_avg": [ 114.0, 197.45379206285202 ], "wc_reply_authors_avg": [ 958.75, 401.5030354804307 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17694809488536676437&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Banca d'Italia;Scuola Internazionale Superiore di Studi Avanzati", "aff_unique_dep": ";", "aff_unique_url": "https://www.bancaditalia.it;https://www.sissa.it", "aff_unique_abbr": "BDI;SISSA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "title": "An Operator Theoretic View On Pruning Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6940", "id": "pWBNOgdeURp", "poster": "", "openreview": "https://openreview.net/forum?id=pWBNOgdeURp", "slides": "https://iclr.cc/virtual/2022/poster/6940", "video": "https://iclr.cc/virtual/2022/poster/6940", "author_site": "William Redman, Maria Fonoberova, Ryan Mohr, 
Yannis Kevrekidis, Igor Mezic", "tldr": "", "abstract": "The discovery of sparse subnetworks that are able to perform as well as full models has found broad applied and theoretical interest. While many pruning methods have been developed to this end, the na\u00efve approach of removing parameters based on their magnitude has been found to be as robust as more complex, state-of-the-art algorithms. The lack of theory behind magnitude pruning's success, especially pre-convergence, and its relation to other pruning methods, such as gradient based pruning, are outstanding open questions in the field that are in need of being addressed. We make use of recent advances in dynamical systems theory, namely Koopman operator theory, to define a new class of theoretically motivated pruning algorithms. We show that these algorithms can be equivalent to magnitude and gradient based pruning, unifying these seemingly disparate methods, and find that they can be used to shed light on magnitude pruning's performance during the early part of training.", "keywords": "deep neural network pruning;Koopman operator theory", "primary_area": "", "supplementary_material": "", "author": "William T Redman;MARIA FONOBEROVA;Ryan Mohr;Yannis Kevrekidis;Igor Mezic", "authorids": "~William_T_Redman1;~MARIA_FONOBEROVA1;mohrr@aimdyn.com;~Yannis_Kevrekidis1;mezic@ucsb.edu", "gender": "M;;;M;", "homepage": "https://wredman4.wixsite.com/wtredman;;;https://engineering.jhu.edu/faculty/ioannis-kevrekidis/;", "dblp": "266/7985;00/4583;;;", "google_scholar": "-SOfw0AAAAAJ;wz5pBY4AAAAJ;;;", "orcid": ";;;;", "linkedin": ";maria-fonoberova-52506137/;;;", "or_profile": "~William_T_Redman1;~MARIA_FONOBEROVA1;mohrr@aimdyn.com;~Yannis_Kevrekidis1;mezic@ucsb.edu", "aff": "UC Santa Barbara;AIMdyn, Inc.;;Johns Hopkins University;", "aff_domain": "ucsb.edu;aimdyn.com;;jh.edu;", "position": "PhD student;Director of Research;;Full Professor;", "bibtex": "@inproceedings{\nredman2022an,\ntitle={An Operator Theoretic View On Pruning Deep Neural Networks},\nauthor={William T Redman and MARIA FONOBEROVA and Ryan Mohr and Yannis Kevrekidis and Igor Mezic},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pWBNOgdeURp}\n}", "github": "", "project": "", "reviewers": "zqcK;Q2k5;Zwfq;kuYT", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;3;2", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "44;20;70;45", "wc_summary_review": "82;21;30;30", "wc_main_review": "589;171;244;54", "wc_review": "715;212;344;129", "wc_reply_reviewers": "153;19;55;0", "wc_reply_authors": "1066;329;586;0", "reply_reviewers": "2;1;1;0", "reply_authors": "2;2;2;0", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 44.75, 17.68297203526602 ], "wc_summary_review_avg": [ 40.75, 24.097458372201828 ], "wc_main_review_avg": [ 264.5, 199.23164909220623 ], "wc_review_avg": [ 350.0, 224.24651613793245 ], "wc_reply_reviewers_avg": [ 56.75, 58.976160437926104 ], "wc_reply_authors_avg": [ 495.25, 389.51981656906753 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 16, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=8445825947992630550&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=pWBNOgdeURp", "email": "ucsb.edu;aimdyn.com;;jh.edu;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Santa Barbara;AIMdyn, Inc.;Johns Hopkins University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsb.edu;;https://www.jhu.edu", "aff_unique_abbr": "UCSB;;JHU", "aff_campus_unique_index": "0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "pXNXwaLu5MN", "title": "Domain-Invariant Representation Learning with Global and Local Consistency", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we give an analysis of the existing representation learning framework of unsupervised domain adaptation and show that the learned feature representations of the source domain samples are with discriminability, compressibility, and transferability. However, the learned feature representations of the target domain samples are only with compressibility and transferability. To address this challenge, we propose a novel framework and show from the information theory view that this framework can effectively improve the discriminability of the target domain sample representation. We also propose a method, namely domain-invariant representation learning with global and local consistency (RLGLC), under this framework. In particular, to maintain the global consistency, RLGLC proposes a new metric called asymmetrically-relaxed Wasserstein of Wasserstein distance (AR-WWD), AR-WWD can not only extract the transferability and compressibility of the feature representation of two domains, but also correlates well with human perception. To impose the local consistency structures, we propose a regularized contrastive loss, which can not only keep as much as possible predictive information contained in the feature representation of the target domain, but also alleviates the problem that semantically similar instances are undesirable pushed apart in training processing. 
Finally, we verify the effectiveness of RLGLC from both theoretical analyses on Bayes error rate and experimental validation on several benchmarks.", "keywords": "Domain adaptation;Wasserstein distance;Contrastive loss;Information theory", "primary_area": "", "supplementary_material": "", "author": "Wenwen Qiang;Jiangmeng Li;Jie Hu;Bing Su;Changwen Zheng;Hui Xiong", "authorids": "~Wenwen_Qiang1;~Jiangmeng_Li1;~Jie_Hu4;~Bing_Su1;~Changwen_Zheng1;~Hui_Xiong1", "gender": "M;M;M;M;M;M", "homepage": ";https://jiangmengli.github.io/;https://gsai.ruc.edu.cn/bingsu;http://people.ucas.ac.cn/~cwzheng;https://www.hkust-gz.edu.cn/people/hui-xiong/;", "dblp": "261/6913;293/0997;41/5270-1;81/2728;262/1686-1.html;90/5064-19", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.sg/citations?user=-kU4VLcAAAAJ;https://scholar.google.com.sg/citations?user=d3g2VJQAAAAJ;-lErK1QAAAAJ;cVDF1tkAAAAJ;DAJdHnkAAAAJ", "orcid": "0000-0002-7985-5743;0000-0002-3376-1522;0000-0001-8560-1910;0000-0002-2311-6757;0000-0001-6016-6465;0000-0002-5150-1003", "linkedin": ";jiangmeng-li-86aaa7125/;;;;%E6%9D%B0-%E8%83%A1-97093710a/", "or_profile": "~Wenwen_Qiang1;~Jiangmeng_Li1;~Bing_Su1;~Changwen_Zheng1;~Hui_Xiong1;~Jie_Hu3", "aff": "Institute of Software Chinese Academy of Sciences;Institute of Software, Chinese Academy of Sciences;Renmin University of China;Institute of Software, Chinese Academy of Sciences;Hong Kong University of Science and Technology (Guangzhou);Institute of software, Chinese Academy of Sciences", "aff_domain": "iscas.ac.cn;iscas.ac.cn;ruc.edu.cn;iscas.ac.cn;hkust.edu;ios.ac.cn", "position": "PhD student;PhD student;Associate Professor;Full Professor;Full Professor;PhD student", "bibtex": "@misc{\nqiang2022domaininvariant,\ntitle={Domain-Invariant Representation Learning with Global and Local Consistency},\nauthor={Wenwen Qiang and Jiangmeng Li and Jie Hu and Bing Su and Changwen Zheng and Hui Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=pXNXwaLu5MN}\n}", "github": "", "project": "", "reviewers": "wdUo;ZUAj;ajRB;vEVb", "site": "https://openreview.net/forum?id=pXNXwaLu5MN", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;3;4;4", "correctness": "2;2;2;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "51;59;58;150", "wc_summary_review": "35;63;39;149", "wc_main_review": "798;361;271;615", "wc_review": "884;483;368;914", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 79.5, 40.81972562377165 ], "wc_summary_review_avg": [ 71.5, 46.00815145167213 ], "wc_main_review_avg": [ 511.25, 208.13742455406717 ], "wc_review_avg": [ 662.25, 240.44996880848208 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HmbCPZqxjPwJ:scholar.google.com/&scioq=Domain-Invariant+Representation+Learning+with+Global+and+Local+Consistency&hl=en&as_sdt=0,5", "gs_version_total": 0, 
"aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "Chinese Academy of Sciences;Renmin University of China;Hong Kong University of Science and Technology", "aff_unique_dep": "Institute of Software;;", "aff_unique_url": "http://www.is.cas.cn;http://www.ruc.edu.cn;https://www.ust.hk", "aff_unique_abbr": "CAS;RUC;HKUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "pabrsHBfKU", "title": "Adapt to Adaptation: Learning to Personalize for Cross-Silo Federated Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The goal of conventional federated learning (FL) is to train a global model for a federation of clients with decentralized data, reducing the systemic privacy risk of centralized training. The distribution shift across non-IID datasets, also known as the data heterogeneity, often poses a challenge for this one-global-model-fits-all solution. In this work, we propose APPLE, a personalized cross-silo FL framework that adaptively learns how much each client can benefit from other clients\u2019 models. We also introduce a method to flexibly control the focus of training APPLE between global and local objectives. We empirically evaluate our method's convergence and generalization behavior and performed extensive experiments on two benchmark datasets and two medical imaging datasets under two non-IID settings. The results show that the proposed personalized FL framework, APPLE, achieves state-of-the-art performance compared to several other personalized FL approaches in the literature.", "keywords": "Federated learning;Personalization;Optimization", "primary_area": "", "supplementary_material": "", "author": "Jun Luo;Shandong Wu", "authorids": "~Jun_Luo4;~Shandong_Wu1", "gender": "M;M", "homepage": "https://www.junluo.me;https://www.dbmi.pitt.edu/node/52452", "dblp": "42/2501-10.html;", "google_scholar": "Feoyi2AAAAAJ;", "orcid": "0000-0002-8283-3991;", "linkedin": "jun-luo-1b3ab118b;", "or_profile": "~Jun_Luo4;~Shandong_Wu1", "aff": "University of Pittsburgh;University of Pittsburgh", "aff_domain": "pitt.edu;pitt.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nluo2022adapt,\ntitle={Adapt to Adaptation: Learning to Personalize for Cross-Silo Federated Learning},\nauthor={Jun Luo and Shandong Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=pabrsHBfKU}\n}", "github": "", "project": "", "reviewers": "nVsS;6HDh;MU6C;o6Lb", "site": "https://openreview.net/forum?id=pabrsHBfKU", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "118;134;81;60", "wc_summary_review": "58;27;141;29", "wc_main_review": "458;486;460;162", "wc_review": "634;647;682;251", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 98.25, 29.277764600460877 ], "wc_summary_review_avg": [ 63.75, 46.25675626327467 ], "wc_main_review_avg": [ 391.5, 132.9614605816287 ], "wc_review_avg": [ 553.5, 175.52848771638182 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 
], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.19245008972987526, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fvisYAIHEo0J:scholar.google.com/&scioq=Adapt+to+Adaptation:+Learning+to+Personalize+for+Cross-Silo+Federated+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pittsburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.pitt.edu", "aff_unique_abbr": "Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "pbduKpYzn9j", "title": "A Comprehensive Overhaul of Distilling Unconditional GANs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative adversarial networks (GANs) have achieved impressive results on various content generation tasks. Yet, their high demand on storage and computation impedes their deployment on resource-constrained devices. Though several GAN compression methods have been proposed to address the problem, most of them focus on conditional GANs. In this paper, we provide a comprehensive overhaul of distilling unconditional GAN, especially for the popular StyleGAN2 architecture. Our key insight is that the main challenge of unconditional GAN distillation lies in the output discrepancy issue, where the teacher and student model yield different outputs given the same input latent code. Standard knowledge distillation losses typically fail under this heterogeneous distillation scenario. We conduct thorough analysis about the reasons and effects of this discrepancy issue, and identify that the style module plays a vital role in determining semantic information of generated images. Based on this finding, we propose a novel initialization strategy for the student model, which can ensure the output consistency to the maximum extent. To further enhance the semantic consistency between the teacher and student model, we present another latent-direction-based distillation loss that preserves the semantic relations in latent space. 
Extensive experiments demonstrate that our framework achieves state-of-the-art results in StyleGAN2 distillation, outperforming the existing GAN distillation methods by a large margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guodong Xu;Yuenan Hou;Ziwei Liu;Chen Change Loy", "authorids": "~Guodong_Xu2;~Yuenan_Hou1;~Ziwei_Liu1;~Chen_Change_Loy2", "gender": "M;M;M;M", "homepage": ";https://cardwing.github.io/;https://liuziwei7.github.io/;https://www.mmlab-ntu.com/person/ccloy/index.html", "dblp": ";210/3047;05/6300-2;01/5855", "google_scholar": ";https://scholar.google.com.hk/citations?user=o9mX9sUAAAAJ;https://scholar.google.com.hk/citations?user=lc45xlcAAAAJ;https://scholar.google.co.uk/citations?user=559LF80AAAAJ", "orcid": ";0000-0002-2844-7416;;0000-0001-5345-1591", "linkedin": "%E5%9B%BD%E6%A0%8B-%E5%BE%90-854072123/;yuenan-hou-859589136/;;", "or_profile": "~Guodong_Xu2;~Yuenan_Hou1;~Ziwei_Liu1;~Chen_Change_Loy2", "aff": "The Chinese University of Hong Kong;Shanghai AI Laboratory;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ie.cuhk.edu;pjlab.org.cn;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Researcher;Assistant Professor;Full Professor", "bibtex": "@misc{\nxu2022a,\ntitle={A Comprehensive Overhaul of Distilling Unconditional {GAN}s},\nauthor={Guodong Xu and Yuenan Hou and Ziwei Liu and Chen Change Loy},\nyear={2022},\nurl={https://openreview.net/forum?id=pbduKpYzn9j}\n}", "github": "", "project": "", "reviewers": "bbKQ;RUSD;F96C;MFc1;fZmP", "site": "https://openreview.net/forum?id=pbduKpYzn9j", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;5;4;3;4", "correctness": "2;3;2;4;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "95;68;79;76;108", "wc_summary_review": "56;117;141;97;114", "wc_main_review": "273;666;948;139;276", "wc_review": "424;851;1168;312;498", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "216;801;748;142;310", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 85.2, 14.386104406683556 ], "wc_summary_review_avg": [ 105.0, 28.234730386529282 ], "wc_main_review_avg": [ 460.4, 300.8139624419053 ], "wc_review_avg": [ 650.6, 315.3243409570533 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 443.4, 276.04608310932434 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.2886751345948129, "corr_recommendation_correctness": 0.7319250547113999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UhFBsm2UqBEJ:scholar.google.com/&scioq=A+Comprehensive+Overhaul+of+Distilling+Unconditional+GANs&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Chinese University of Hong Kong;Shanghai AI Laboratory;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.shanghai-ai-lab.com;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;SAIL;NTU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "China;Singapore" }, { "id": 
"per0G3dnkYh", "title": "Marginal Tail-Adaptive Normalizing Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning the tail behavior of a distribution is a notoriously difficult problem. The number of samples from the tail is small, and deep generative models, such as normalizing flows, tend to concentrate on learning the body of the distribution. In this paper, we focus on improving the ability of normalizing flows to correctly capture the tail behavior and, thus, form more accurate models. We prove that the marginal tailedness of a triangular flow can be controlled via the tailedness of the marginals of the base distribution of the normalizing flow. This theoretical insight leads us to a novel type of triangular flows based on learnable base distributions and data-driven permutations. Since the proposed flows preserve marginal tailedness, we call them marginal tail-adaptive flows (mTAFs). An empirical analysis on synthetic data shows that mTAF improves on the robustness and efficiency of vanilla flows and\u2014motivated by our theory\u2014allows to successfully generate tail samples from the distributions. More generally, our experiments affirm that a careful choice of the base distribution is an effective way to introducing inductive biases to normalizing flows.", "keywords": "Normalizing Flows;Heavy-Tailed Data;Generative Models", "primary_area": "", "supplementary_material": "/attachment/f8fce7f6db2917fbb56ef4f30402d8aadc28c9c3.zip", "author": "Mike Laszkiewicz;Johannes Lederer;Asja Fischer", "authorids": "~Mike_Laszkiewicz1;~Johannes_Lederer1;~Asja_Fischer1", "gender": "M;;F", "homepage": ";;", "dblp": "https://dblp.uni-trier.de/pid/264/5914.html;;76/8485", "google_scholar": ";;FyZbyIUAAAAJ", "orcid": ";;0000-0002-1916-7033", "linkedin": ";;", "or_profile": "~Mike_Laszkiewicz1;~Johannes_Lederer1;~Asja_Fischer1", "aff": "Ruhr-Universt\u00e4t Bochum;;Ruhr-Universit\u00e4t Bochum", "aff_domain": "rub.de;;ruhr-uni-bochum.de", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nlaszkiewicz2022marginal,\ntitle={Marginal Tail-Adaptive Normalizing Flows},\nauthor={Mike Laszkiewicz and Johannes Lederer and Asja Fischer},\nyear={2022},\nurl={https://openreview.net/forum?id=per0G3dnkYh}\n}", "github": "", "project": "", "reviewers": "dggh;EStc;ZEHf;pWUG", "site": "https://openreview.net/forum?id=per0G3dnkYh", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;3", "correctness": "4;4;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "84;40;79;81", "wc_summary_review": "21;77;46;27", "wc_main_review": "156;524;563;266", "wc_review": "261;641;688;374", "wc_reply_reviewers": "34;78;0;0", "wc_reply_authors": "0;164;395;0", "reply_reviewers": "1;1;0;0", "reply_authors": "0;1;1;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.0, 17.98610574860495 ], "wc_summary_review_avg": [ 42.75, 21.821720830401986 ], "wc_main_review_avg": [ 377.25, 171.29415489151987 ], "wc_review_avg": [ 491.0, 178.81414932829 ], "wc_reply_reviewers_avg": [ 28.0, 32.03123475609393 ], "wc_reply_authors_avg": [ 139.75, 161.86471975078447 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, 
"corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3241792279775112520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "Ruhr-Universit\u00e4t Bochum", "aff_unique_dep": "", "aff_unique_url": "https://www.ruhr-uni-bochum.de", "aff_unique_abbr": "RUB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Vector-quantized Image Modeling with Improved VQGAN", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6405", "id": "pfNyExj7z2", "poster": "", "openreview": "https://openreview.net/forum?id=pfNyExj7z2", "slides": "https://iclr.cc/virtual/2022/poster/6405", "video": "https://iclr.cc/virtual/2022/poster/6405", "author_site": "Jiahui Yu, Xin Li, Jing Yu Koh, Han Zhang, Ruoming Pang, James Qin, Alexander Ku, Yuanzhong Xu, Jason Baldridge, Yonghui Wu", "tldr": "", "abstract": "Pretraining language models with next-token prediction on massive text corpora has delivered phenomenal zero-shot, few-shot, transfer learning and multi-tasking capabilities on both generative and discriminative language tasks. Motivated by this success, we explore a Vector-quantized Image Modeling (VIM) approach that involves pretraining a Transformer to predict rasterized image tokens autoregressively. The discrete image tokens are encoded from a learned Vision-Transformer-based VQGAN (ViT-VQGAN). We first propose multiple improvements over vanilla VQGAN from architecture to codebook learning, yielding better efficiency and reconstruction fidelity. The improved ViT-VQGAN further improves vector-quantized image modeling tasks, including unconditional, class-conditioned image generation and unsupervised representation learning. When trained on ImageNet at 256x256 resolution, we achieve Inception Score (IS) of 175.1 and Fr'echet Inception Distance (FID) of 4.17, a dramatic improvement over the vanilla VQGAN, which obtains 70.6 and 17.04 for IS and FID, respectively. Based on ViT-VQGAN and unsupervised pretraining, we further evaluate the pretrained Transformer by averaging intermediate features, similar to Image GPT (iGPT). This ImageNet-pretrained VIM-L significantly beats iGPT-L on linear-probe accuracy from 60.3% to 73.2% for a similar model size. 
VIM-L also outperforms iGPT-XL, which is trained with extra web image data and a larger model size.", "keywords": "VQGAN;Vision Transformers;Vector-quantized Image Modeling", "primary_area": "", "supplementary_material": "", "author": "Jiahui Yu;Xin Li;Jing Yu Koh;Han Zhang;Ruoming Pang;James Qin;Alexander Ku;Yuanzhong Xu;Jason Baldridge;Yonghui Wu", "authorids": "~Jiahui_Yu1;~Xin_Li41;~Jing_Yu_Koh2;~Han_Zhang1;ruoming@gmail.com;~James_Qin1;~Alexander_Ku1;~Yuanzhong_Xu1;~Jason_Baldridge1;~Yonghui_Wu1", "gender": "M;M;;M;;M;M;;;M", "homepage": "http://jiahuiyu.com/;;;https://sites.google.com/corp/view/hanzhang;;https://scholar.google.com/citations?hl=en&user=zkDuH4MAAAAJ;https://alexyku.github.io/;;;", "dblp": "185/1060;;;;;;215/4289.html;;;26/2189", "google_scholar": "-CLCMk4AAAAJ;;;cxEoVL4AAAAJ;;;Lh_ZqdcAAAAJ;KzRHnx0AAAAJ;;55FnA9wAAAAJ", "orcid": ";;;;;;;;;", "linkedin": "jiahuiyuu/;xin-li-b339b368;;;;;;;;", "or_profile": "~Jiahui_Yu1;~Xin_Li41;~Jing_Yu_Koh2;~Han_Zhang1;ruoming@gmail.com;~James_Qin1;~Alexander_Ku1;~Yuanzhong_Xu1;~Jason_Baldridge1;~Yonghui_Wu1", "aff": "Google Brain;;;Google;;Google;Google;Google;;", "aff_domain": "google.com;;;google.com;;google.com;google.com;google.com;;", "position": "Research Scientist;;;Researcher;;Software Engineer;Researcher;Software Engineer;;", "bibtex": "@inproceedings{\nyu2022vectorquantized,\ntitle={Vector-quantized Image Modeling with Improved {VQGAN}},\nauthor={Jiahui Yu and Xin Li and Jing Yu Koh and Han Zhang and Ruoming Pang and James Qin and Alexander Ku and Yuanzhong Xu and Jason Baldridge and Yonghui Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pfNyExj7z2}\n}", "github": "", "project": "", "reviewers": "mxmU;GySC;zYPH;6eTH", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;5;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "53;124;42;25", "wc_summary_review": "30;58;186;57", "wc_main_review": "251;155;314;255", "wc_review": "334;337;542;337", "wc_reply_reviewers": "32;0;129;95", "wc_reply_authors": "560;566;938;716", "reply_reviewers": "1;0;1;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.0, 37.71604433129222 ], "wc_summary_review_avg": [ 82.75, 60.660427792754646 ], "wc_main_review_avg": [ 243.75, 56.98848567912645 ], "wc_review_avg": [ 387.5, 89.20902420719554 ], "wc_reply_reviewers_avg": [ 64.0, 50.75923561284193 ], "wc_reply_authors_avg": [ 695.0, 153.58710883404245 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 575, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7624552553944725680&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=pfNyExj7z2", "email": "google.com;;;google.com;;google.com;google.com;google.com;;", "author_num": 10, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View",
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "pgKE5Q-CF2", "title": "Neuron-Enhanced Autoencoder based Collaborative filtering: Theory and Practice", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper presents a novel recommendation method called neuron-enhanced autoencoder based collaborative filtering (NE-AECF). The method uses an additional neural network to enhance the reconstruction capability of autoencoder. Different from the main neural network implemented in a layer-wise manner, the additional neural network is implemented in an element-wise manner. They are trained simultaneously to construct an enhanced autoencoder of which the activation function in the output layer is learned adaptively to approximate possibly complicated response functions in real data. We provide theoretical analysis for NE-AECF to investigate the generalization ability of autoencoder and deep learning in collaborative filtering. We prove that the element-wise neural network is able to reduce the upper bound of the prediction error for the unknown ratings, the data sparsity is not problematic but useful, and the prediction performance is closely related to the difference between the number of users and the number of items.\nNumerical results show that our NE-AECF has promising performance on a few benchmark datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jicong Fan;Rui Chen;Chris Ding", "authorids": "~Jicong_Fan2;116010018@link.cuhk.edu.cn;~Chris_Ding1", "gender": "M;;M", "homepage": "https://jicongfan.github.io/;;http://ranger.uta.edu/~chqding/", "dblp": "139/1570;;https://dblp.uni-trier.de/pers/hd/d/Ding:Chris", "google_scholar": "vdJsnhIAAAAJ;;q7FfnjgAAAAJ", "orcid": "0000-0001-9665-0355;;", "linkedin": ";;", "or_profile": "~Jicong_Fan2;116010018@link.cuhk.edu.cn;~Chris_Ding1", "aff": "The Chinese University of Hong Kong, Shenzhen;;University of Texas at Arlington", "aff_domain": "cuhk.edu.cn;;cse.uta.edu", "position": "Research Assistant Professor;;Professor", "bibtex": "@misc{\nfan2022neuronenhanced,\ntitle={Neuron-Enhanced Autoencoder based Collaborative filtering: Theory and Practice},\nauthor={Jicong Fan and Rui Chen and Chris Ding},\nyear={2022},\nurl={https://openreview.net/forum?id=pgKE5Q-CF2}\n}", "github": "", "project": "", "reviewers": "83pQ;rgcR;7kpq;WENm", "site": "https://openreview.net/forum?id=pgKE5Q-CF2", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "5;3;4;4", "correctness": "1;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "67;47;50;76", "wc_summary_review": "23;58;36;217", "wc_main_review": "279;274;135;151", "wc_review": "369;379;221;444", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 60.0, 11.979148550710939 ], "wc_summary_review_avg": [ 83.5, 78.08488970345032 ], "wc_main_review_avg": [ 209.75, 67.01259210029112 ], "wc_review_avg": [ 353.25, 81.60384488490723 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 
-0.42640143271122083, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R-POBYLsxZkJ:scholar.google.com/&scioq=Neuron-Enhanced+Autoencoder+based+Collaborative+filtering:+Theory+and+Practice&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Chinese University of Hong Kong;University of Texas at Arlington", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.uta.edu", "aff_unique_abbr": "CUHK;UTA", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Shenzhen;Arlington", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Generative Principal Component Analysis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5902", "id": "pgir5f7ekAL", "poster": "", "openreview": "https://openreview.net/forum?id=pgir5f7ekAL", "slides": "https://iclr.cc/virtual/2022/poster/5902", "video": "https://iclr.cc/virtual/2022/poster/5902", "author_site": "Zhaoqiang Liu, Jiulong Liu, Subhroshekhar Ghosh, Jun Han, Jonathan Scarlett", "tldr": "", "abstract": "In this paper, we study the problem of principal component analysis with generative modeling assumptions, adopting a general model for the observed matrix that encompasses notable special cases, including spiked matrix recovery and phase retrieval. The key assumption is that the first principal eigenvector lies near the range of an $L$-Lipschitz continuous generative model with bounded $k$-dimensional inputs. We propose a quadratic estimator, and show that it enjoys a statistical rate of order $\\sqrt{\\frac{k\\log L}{m}}$, where $m$ is the number of samples. Moreover, we provide a variant of the classic power method, which projects the calculated data onto the range of the generative model during each iteration. We show that under suitable conditions, this method converges exponentially fast to a point achieving the above-mentioned statistical rate. This rate is conjectured in~\\citep{aubin2019spiked,cocola2020nonasymptotic} to be the best possible even when we only restrict to the special case of spiked matrix models. 
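A toy sketch of the projected power iteration described above: a classic power step on the observed matrix, followed by a projection onto the range of the generative model. Here an orthogonal projection onto a fixed linear subspace stands in for the (much harder) projection onto a generator's range, and the projector is assumed to be given.

```python
import numpy as np

def projected_power_method(V, project, dim, iters=100, seed=0):
    """Power iteration with a per-step projection onto the model's range.
    `project` is assumed given; in practice it is itself computed over the
    generator's k-dimensional inputs."""
    w = np.random.default_rng(seed).normal(size=dim)
    w /= np.linalg.norm(w)
    for _ in range(iters):
        w = project(V @ w)          # power step, then projection onto the range
        w /= np.linalg.norm(w)
    return w

# Toy spiked-matrix check with a linear subspace standing in for the range.
rng = np.random.default_rng(1)
u = np.ones(20) / np.sqrt(20)                            # planted principal eigenvector
V = 5.0 * np.outer(u, u) + 0.1 * np.eye(20)              # rank-1 spike plus a noise floor
B = np.linalg.qr(np.column_stack([u, rng.normal(size=(20, 4))]))[0]
project = lambda x: B @ (B.T @ x)                        # orthogonal projector onto range(B)
print(abs(projected_power_method(V, project, dim=20) @ u))  # ~1.0: spike recovered
```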
We perform experiments on various image datasets for spiked matrix and phase retrieval models, and illustrate performance gains of our method over the classic power method and the truncated power method devised for sparse principal component analysis.", "keywords": "Principal component analysis;generative models;sparse principal component analysis;projected power methods;optimal statistical rates", "primary_area": "", "supplementary_material": "", "author": "Zhaoqiang Liu;Jiulong Liu;Subhroshekhar Ghosh;Jun Han;Jonathan Scarlett", "authorids": "~Zhaoqiang_Liu1;~Jiulong_Liu1;~Subhroshekhar_Ghosh1;~Jun_Han4;~Jonathan_Scarlett1", "gender": "M;M;;;M", "homepage": ";;https://subhro-ghosh.github.io/;;https://www.comp.nus.edu.sg/~scarlett/", "dblp": "198/1405;;;02/3721-4;78/9667", "google_scholar": "EmGrPbIAAAAJ;de0zoQ4AAAAJ;RpGHEzsAAAAJ;;https://scholar.google.co.uk/citations?user=a4D08aQAAAAJ", "orcid": ";0000-0001-7199-4581;;;", "linkedin": ";;;;", "or_profile": "~Zhaoqiang_Liu1;~Jiulong_Liu1;~Subhroshekhar_Ghosh1;~Jun_Han4;~Jonathan_Scarlett1", "aff": ";Chinese Academy of Sciences;National University of Singapore;PCG, Tencent;National University of Singapore", "aff_domain": ";ac.cn;nus.edu.sg;tencent.com;nus.edu.sg", "position": ";Associate Professor;Assistant Professor;Senior Researcher;Associate Professor", "bibtex": "@inproceedings{\nliu2022generative,\ntitle={Generative Principal Component Analysis},\nauthor={Zhaoqiang Liu and Jiulong Liu and Subhroshekhar Ghosh and Jun Han and Jonathan Scarlett},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pgir5f7ekAL}\n}", "github": "", "project": "", "reviewers": "itua;kCBb;4eD5", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "4;3;4", "technical_novelty": "2;4;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "290;100;364", "wc_summary_review": "96;130;26", "wc_main_review": "369;367;171", "wc_review": "755;597;561", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "444;1051;225", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 251.33333333333334, 111.19152645573112 ], "wc_summary_review_avg": [ 84.0, 43.29742101634538 ], "wc_main_review_avg": [ 302.3333333333333, 92.87027990099357 ], "wc_review_avg": [ 637.6666666666666, 84.25886039791634 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 573.3333333333334, 349.3940786879798 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8634676628677545132&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=pgir5f7ekAL", "email": ";ac.cn;nus.edu.sg;tencent.com;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Chinese Academy of Sciences;National University of Singapore;Tencent", "aff_unique_dep": ";;PCG", "aff_unique_url": "https://www.cas.cn;https://www.nus.edu.sg;https://www.tencent.com", "aff_unique_abbr": "CAS;NUS;Tencent",
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;Singapore" }, { "id": "pgkwZxLW8b", "title": "Efficient Image Representation Learning with Federated Sampled Softmax", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning image representations on decentralized data can bring many benefits in cases where data cannot be aggregated across data silos. Softmax cross entropy loss is highly effective and commonly used for learning image representations. Using a large number of classes has proven to be particularly beneficial for the descriptive power of such representations in centralized learning. However, doing so on decentralized data with Federated Learning is not straightforward, as the demand on computation and communication increases proportionally to the number of classes. In this work we introduce Federated Sampled Softmax, a novel resource-efficient approach for learning image representation with Federated Learning. Specifically, the FL clients sample a set of negative classes and optimize only the corresponding model parameters with respect to a sampled softmax objective that approximates the global full softmax objective. We analytically examine the loss formulation and empirically show that our method significantly reduces the number of parameters transferred to and optimized by the client devices, while performing on par with the standard full softmax method. This work creates a possibility for efficiently learning image representations on decentralized data with a large number of classes in a privacy preserving way.", "keywords": "Federated learning;sampled softmax", "primary_area": "", "supplementary_material": "", "author": "Sagar M. Waghmare;Hang Qi;Huizhong Chen;Mikhail Sirotenko;Tomer Meron", "authorids": "~Sagar_M._Waghmare1;~Hang_Qi1;~Huizhong_Chen2;~Mikhail_Sirotenko1;tomer.meron@gmail.com", "gender": "M;;M;M;", "homepage": ";;https://huizhongchen.github.io/;https://www.linkedin.com/in/mihail-sirotenko-33187913/;", "dblp": ";96/1046-1;05/10534;263/7266;", "google_scholar": "l6e9JeEAAAAJ;72jdrSUAAAAJ;WghqyVMAAAAJ;IpGXRaAAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Sagar_M._Waghmare1;~Hang_Qi1;~Huizhong_Chen2;~Mikhail_Sirotenko1;tomer.meron@gmail.com", "aff": "Google;Google;Google;Google DeepMind;", "aff_domain": "google.com;google.com;google.com;google.com;", "position": "Researcher;Researcher;Researcher;TLM;", "bibtex": "@misc{\nwaghmare2022efficient,\ntitle={Efficient Image Representation Learning with Federated Sampled Softmax},\nauthor={Sagar M. 
Waghmare and Hang Qi and Huizhong Chen and Mikhail Sirotenko and Tomer Meron},\nyear={2022},\nurl={https://openreview.net/forum?id=pgkwZxLW8b}\n}", "github": "", "project": "", "reviewers": "24vu;nvFP;UuaD;q79o", "site": "https://openreview.net/forum?id=pgkwZxLW8b", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "4;4;3;4", "correctness": "4;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "113;61;115;122", "wc_summary_review": "84;23;34;32", "wc_main_review": "545;203;196;242", "wc_review": "742;287;345;396", "wc_reply_reviewers": "80;383;0;0", "wc_reply_authors": "422;330;392;246", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 102.75, 24.33490291741473 ], "wc_summary_review_avg": [ 43.25, 23.889066536807167 ], "wc_main_review_avg": [ 296.5, 144.5380572721247 ], "wc_review_avg": [ 442.5, 177.1644716075997 ], "wc_reply_reviewers_avg": [ 115.75, 157.7155271366773 ], "wc_reply_authors_avg": [ 347.5, 67.34055241828656 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10758209079020230036&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Variational oracle guiding for reinforcement learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6904", "id": "pjqqxepwoMy", "poster": "", "openreview": "https://openreview.net/forum?id=pjqqxepwoMy", "slides": "https://iclr.cc/virtual/2022/poster/6904", "video": "https://iclr.cc/virtual/2022/poster/6904", "author_site": "Dongqi Han, Tadashi Kozuno, Xufang Luo, Zhao-Yun Chen, Kenji Doya, Yuqing Yang, Dongsheng Li", "tldr": "", "abstract": "How to make intelligent decisions is a central problem in machine learning and artificial intelligence. Despite recent successes of deep reinforcement learning (RL) in various decision making problems, an important but under-explored aspect is how to leverage oracle observations (information that is invisible during online decision making, but is available during offline training) to facilitate learning. For example, human experts will look at the replay after a poker game, in which they can check the opponents' hands to improve their ability to estimate those hands from the visible information during play. In this work, we study such problems based on Bayesian theory and derive an objective to leverage oracle observations in RL using variational methods. Our key contribution is to propose a general learning framework referred to as variational latent oracle guiding (VLOG) for DRL. VLOG features preferable properties such as robust and promising performance and the versatility to be incorporated with any value-based DRL algorithm.
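As one hedged illustration of the variational-guiding idea (an assumed prior/posterior structure, not VLOG's published objective): an oracle-informed posterior supervises an online-only prior through a Gaussian KL term, so that the prior alone can be used at decision time when oracle observations are unavailable.

```python
import torch

def oracle_guiding_kl(mu_post, logvar_post, mu_prior, logvar_prior):
    """KL(posterior || prior) for diagonal Gaussians: the posterior encoder is
    assumed to see oracle observations during offline training, the prior
    encoder only online observations; this term pulls the prior toward the
    oracle-informed posterior (an assumption-labelled sketch)."""
    var_post, var_prior = logvar_post.exp(), logvar_prior.exp()
    kl = 0.5 * (logvar_prior - logvar_post
                + (var_post + (mu_post - mu_prior) ** 2) / var_prior - 1.0)
    return kl.sum(dim=-1).mean()

# Toy usage with a batch of 8 latent Gaussians of dimension 4.
mu_p, lv_p = torch.randn(8, 4), torch.zeros(8, 4)
mu_q, lv_q = torch.randn(8, 4), torch.zeros(8, 4)
print(oracle_guiding_kl(mu_p, lv_p, mu_q, lv_q))
```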
We empirically demonstrate the effectiveness of VLOG in online and offline RL domains with tasks ranging from video games to a challenging tile-based game Mahjong. Furthermore, we publish the Mahjong environment and an offline RL dataset as a benchmark to facilitate future research on oracle guiding (https://github.com/Agony5757/mahjong).", "keywords": "variational Bayes;oracle guiding;reinforcement learning;decision making;probabilistic modeling;game;Mahjong", "primary_area": "", "supplementary_material": "/attachment/db5a510d63aee06fc1fadc9d11c429537c48581c.zip", "author": "Dongqi Han;Tadashi Kozuno;Xufang Luo;Zhao-Yun Chen;Kenji Doya;Yuqing Yang;Dongsheng Li", "authorids": "~Dongqi_Han1;~Tadashi_Kozuno1;~Xufang_Luo1;~Zhao-Yun_Chen1;~Kenji_Doya1;~Yuqing_Yang1;~Dongsheng_Li2", "gender": "M;M;F;M;M;;M", "homepage": "https://frosthan.github.io/;;;https://chenzhaoyun.com;https://groups.oist.jp/ncu;;http://recmind.cn", "dblp": ";207/8504;218/7350;291/4228;00/100;91/9064-1.html;254/0830-2.html", "google_scholar": "3V_9fRUAAAAJ;4VJmx8QAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.co.jp/citations?user=SHufeXQAAAAJ;4BtNQAEAAAAJ;VNg5rA8AAAAJ", "orcid": "0000-0002-6872-7121;;;0000-0002-5181-160X;0000-0002-2446-6820;0000-0003-3518-5212;0000-0003-3103-8442", "linkedin": ";;;;;;", "or_profile": "~Dongqi_Han1;~Tadashi_Kozuno1;~Xufang_Luo1;~Zhao-Yun_Chen1;~Kenji_Doya1;~Yuqing_Yang1;~Dongsheng_Li2", "aff": "Okinawa Institute of Science and Technology (OIST);University of Alberta;Microsoft Research;Institude of Artificial Intelligence, Hefei Comprehensive National Science Center;Okinawa Institute of Science and Technology Graduate University;Microsoft Research;Microsoft Research Asia", "aff_domain": "oist.jp;ualberta.ca;microsoft.com;iai.ustc.edu.cn;oist.jp;research.microsoft.com;microsoft.com", "position": "PhD student;Postdoc;Researcher;Researcher;Full Professor;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nhan2022variational,\ntitle={Variational oracle guiding for reinforcement learning},\nauthor={Dongqi Han and Tadashi Kozuno and Xufang Luo and Zhao-Yun Chen and Kenji Doya and Yuqing Yang and Dongsheng Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pjqqxepwoMy}\n}", "github": "", "project": "", "reviewers": "8vyx;5XWD;7M2j;Wjja", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;4;3;4", "correctness": "2;3;2;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "113;61;82;66", "wc_summary_review": "80;3;37;73", "wc_main_review": "464;573;261;381", "wc_review": "657;637;380;520", "wc_reply_reviewers": "483;83;96;36", "wc_reply_authors": "3990;1622;641;1070", "reply_reviewers": "2;1;1;1", "reply_authors": "9;4;2;3", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.5, 20.303940504246953 ], "wc_summary_review_avg": [ 48.25, 30.80077109424373 ], "wc_main_review_avg": [ 419.75, 114.17831449097504 ], "wc_review_avg": [ 548.5, 110.46379497373789 ], "wc_reply_reviewers_avg": [ 174.5, 179.50557094419102 ], "wc_reply_authors_avg": [ 1830.75, 1294.2355610552509 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.5, 2.692582403567252 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 7, 0 ], 
"corr_recommendation_confidence": -0.08084520834544431, "corr_recommendation_correctness": 0.7001400420140049, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15597792281821947388&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=pjqqxepwoMy", "email": "oist.jp;ualberta.ca;microsoft.com;iai.ustc.edu.cn;oist.jp;research.microsoft.com;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;2;3;4;2;2", "aff_unique_norm": "Okinawa Institute of Science and Technology;University of Alberta;Microsoft;Hefei Comprehensive National Science Center;Okinawa Institute of Science and Technology Graduate University", "aff_unique_dep": ";;Microsoft Research;Institude of Artificial Intelligence;", "aff_unique_url": "https://www.oist.jp;https://www.ualberta.ca;https://www.microsoft.com/en-us/research;http://www.hfcnst.ac.cn;https://www.oist.jp", "aff_unique_abbr": "OIST;UAlberta;MSR;;OIST", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hefei;Asia", "aff_country_unique_index": "0;1;2;3;0;2;3", "aff_country_unique": "Japan;Canada;United States;China" }, { "id": "pk7XtG0ln6Z", "title": "Response-based Distillation for Incremental Object Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Traditional object detection are ill-equipped for incremental learning. However, fine-tuning directly on a well-trained detection model with only new data will leads to catastrophic forgetting. Knowledge distillation is a straightforward way to mitigate catastrophic forgetting. In Incremental Object Detection (IOD), previous work mainly focuses on feature-level knowledge distillation, but the different response of detector has not been fully explored yet. In this paper, we propose a fully response-based incremental distillation method focusing on learning response from detection bounding boxes and classification predictions. Firstly, our method transferring category knowledge while equipping student model with the ability to retain localization knowledge during incremental learning. In addition, we further evaluate the qualities of all locations and provides valuable response by adaptive pseudo-label selection (APS) strategies. Finally, we elucidate that knowledge from different responses should be assigned with different importance during incremental distillation. 
Extensive experiments conducted on MS COCO demonstrate significant advantages of our method, which substantially narrows the performance gap towards full training.", "keywords": "Incremental Object Detection;Incremental Learning;Knowledge Distillation", "primary_area": "", "supplementary_material": "", "author": "Tao Feng;Mang Wang", "authorids": "~Tao_Feng1;~Mang_Wang1", "gender": "M;M", "homepage": ";", "dblp": ";133/2868", "google_scholar": ";", "orcid": "my-orcid?orcid=0000-0001-5571-8018;", "linkedin": ";", "or_profile": "~Tao_Feng1;~Mang_Wang1", "aff": "Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Algorithm Expert", "bibtex": "@misc{\nfeng2022responsebased,\ntitle={Response-based Distillation for Incremental Object Detection},\nauthor={Tao Feng and Mang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=pk7XtG0ln6Z}\n}", "github": "", "project": "", "reviewers": "xgsQ;BEWR;4JHK", "site": "https://openreview.net/forum?id=pk7XtG0ln6Z", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;0", "wc_summary_paper": "76;61;49", "wc_summary_review": "39;34;46", "wc_main_review": "272;393;204", "wc_review": "387;488;299", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 62.0, 11.045361017187261 ], "wc_summary_review_avg": [ 39.666666666666664, 4.921607686744467 ], "wc_main_review_avg": [ 289.6666666666667, 78.16364599707178 ], "wc_review_avg": [ 391.3333333333333, 77.21974416487592 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1470627051793527853&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "pkh8bwJbUbL", "title": "Fair Representation Learning through Implicit Path Alignment", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We considered a fair representation learning perspective, where optimal predictors, on top of the data representation, are ensured to be invariant with respect to different subgroups. Specifically, we formulated the problem as a bi-level optimization, where the representation is learned at the outer level, and invariant optimal group predictors are updated at the inner level. To avoid the high computational and memory cost of differentiating in the inner-level optimization, we proposed the implicit path alignment algorithm, which relies only on the solution of the inner optimization and implicit differentiation, rather than the exact optimization path. 
Moreover, the proposed bi-level objective is demonstrated to fulfill the sufficient rule, which is desirable in various practical scenarios but has not been commonly studied in fair representation learning. We further analyzed the error gap of the implicit approach and empirically validated the proposed method in both classification and regression settings. Experimental results show a consistently better trade-off between prediction performance and fairness measurement. ", "keywords": "Fairness", "primary_area": "", "supplementary_material": "/attachment/1a7f62058738549e83a2fcef3368680b9e27813c.zip", "author": "Changjian Shui;Qi CHEN;Jiaqi Li;Boyu Wang;Christian Gagn\u00e9", "authorids": "~Changjian_Shui2;~Qi_CHEN6;~Jiaqi_Li2;~Boyu_Wang3;~Christian_Gagn\u00e91", "gender": ";F;;M;M", "homepage": ";https://livreq.github.io/;;https://sites.google.com/site/borriewang/;http://vision.gel.ulaval.ca/~cgagne/english.html", "dblp": ";66/6320-15.html;;41/6565-4.html;80/5084-1", "google_scholar": ";MqLoSeoAAAAJ;;qAZM5KcAAAAJ;https://scholar.google.ca/citations?user=egixsbEAAAAJ", "orcid": ";0000-0002-7213-0221;;0000-0002-7413-4162;0000-0003-3697-4184", "linkedin": ";;;;", "or_profile": "~Changjian_Shui2;~Qi_CHEN6;~Jiaqi_Li2;~Boyu_Wang3;~Christian_Gagn\u00e91", "aff": ";Laval university;;University of Western Ontario;Universit\u00e9 Laval", "aff_domain": ";ulaval.ca;;uwo.ca;ulaval.ca", "position": ";PhD student;;Assistant Professor;Full Professor", "bibtex": "@misc{\nshui2022fair,\ntitle={Fair Representation Learning through Implicit Path Alignment},\nauthor={Changjian Shui and Qi CHEN and Jiaqi Li and Boyu Wang and Christian Gagn{\\'e}},\nyear={2022},\nurl={https://openreview.net/forum?id=pkh8bwJbUbL}\n}", "github": "", "project": "", "reviewers": "MvUn;hpjt;bVZZ;Hjp8", "site": "https://openreview.net/forum?id=pkh8bwJbUbL", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;3;4;4", "correctness": "2;3;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "45;65;109;69", "wc_summary_review": "15;26;53;23", "wc_main_review": "181;270;255;367", "wc_review": "241;361;417;459", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "740;1806;635;566", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 72.0, 23.2163735324878 ], "wc_summary_review_avg": [ 29.25, 14.289419162443238 ], "wc_main_review_avg": [ 268.25, 66.22452340334357 ], "wc_review_avg": [ 369.5, 81.93137372215847 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 936.75, 505.6715213456261 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8629565957981360081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "Laval University;University of Western Ontario;Universit\u00e9 Laval", "aff_unique_dep": ";;", "aff_unique_url": "https://www.laval.ca;https://www.uwo.ca;https://www.ulaval.ca", "aff_unique_abbr": "Laval;UWO;ULaval", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "Canada" }, { "id": "pntT0DUWqw", "title": "DisTop: Discovering a Topological representation to learn diverse and rewarding skills", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "An efficient way for a deep reinforcement learning agent to explore can be to learn a set of skills that achieves a uniform distribution of terminal states. Following this, we introduce DisTop, a new model that simultaneously learns diverse skills and focuses on improving rewarding skills. DisTop progressively builds a discrete topology of the environment using an unsupervised contrastive loss, a growing network and a goal-conditioned policy. Using this topology, a state-independent hierarchical policy can select where the agent has to keep discovering skills in the state space. In turn, the new set of visited states allows an improved learnt representation. If the agent gets overloaded by the number of skills, the agent can autonomously forget the skills unrelated to its eventual task. Our experiments emphasize that DisTop is agnostic to the ground state representation and that the agent can discover the topology of its environment whether the states are high-dimensional binary data, images, or proprioceptive inputs. We demonstrate that this paradigm is competitive on MuJoCo benchmarks with state-of-the-art algorithms on both single-task dense rewards and diverse skill discovery without rewards. By combining these two aspects, we show that DisTop outperforms a state-of-the-art hierarchical reinforcement learning algorithm when rewards are sparse. We believe DisTop opens new perspectives by showing that bottom-up skill discovery combined with representation learning can tackle different complex state spaces and reward settings when it is endowed with the ability to explicitly select the skills to improve.", "keywords": "Hierarchical reinforcement learning;Representation learning;Developmental learning;Reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/c349862933368fbea6cfaa6f41b4537def9cf6a5.zip", "author": "Arthur Aubret;Laetitia Matignon;Salima Hassas", "authorids": "~Arthur_Aubret1;~Laetitia_Matignon1;~Salima_Hassas1", "gender": ";;", "homepage": ";https://perso.liris.cnrs.fr/laetitia.matignon/;", "dblp": "247/6336;;38/2749", "google_scholar": ";;", "orcid": "0000-0003-3495-4323;;", "linkedin": ";;", "or_profile": "~Arthur_Aubret1;~Laetitia_Matignon1;~Salima_Hassas1", "aff": "Institut Pascal, Universit\u00e9 Clermont Auvergne;LIRIS, CNRS;Universit\u00e9 Lyon 1", "aff_domain": "uca.fr;liris.cnrs.fr;univ-lyon1.fr", "position": "Postdoc;Associate Professor;Full Professor", "bibtex": "@misc{\naubret2022distop,\ntitle={DisTop: Discovering a Topological representation to learn diverse and rewarding skills},\nauthor={Arthur Aubret and Laetitia Matignon and Salima Hassas},\nyear={2022},\nurl={https://openreview.net/forum?id=pntT0DUWqw}\n}", "github": "", "project": "", "reviewers": "ceuE;jgAA;3Fo4;Rfa2", "site": "https://openreview.net/forum?id=pntT0DUWqw", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "5;4;4;4", "correctness": "2;2;2;4", "technical_novelty": "1;3;3;2", "empirical_novelty": "1;2;0;3", "wc_summary_paper": "49;83;25;219", "wc_summary_review": "46;274;97;100", "wc_main_review": "717;532;586;510", "wc_review": "812;889;708;829", "wc_reply_reviewers": "211;324;63;110", "wc_reply_authors": "1607;761;739;688", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 3.25, 1.7853571071357126 ], 
"confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 94.0, 75.05331438384317 ], "wc_summary_review_avg": [ 129.25, 86.2826025337669 ], "wc_main_review_avg": [ 586.25, 80.39395188694235 ], "wc_review_avg": [ 809.5, 65.20927848090331 ], "wc_reply_reviewers_avg": [ 177.0, 100.31201323869439 ], "wc_reply_authors_avg": [ 948.75, 380.96218644374665 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7276068751089989, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10456126929391851062&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e9 Clermont Auvergne;CNRS;Universit\u00e9 Lyon 1", "aff_unique_dep": "Institut Pascal;LIRIS;", "aff_unique_url": "https://www.uca.fr;https://www.cnrs.fr;https://www.univ-lyon1.fr", "aff_unique_abbr": ";CNRS;UCBL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lyon", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "id": "pqD4hEOH2NW", "title": "Fingerprinting Multi-exit Deep Neural Network Models via Inference Time", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transforming large deep neural network (DNN) models into the multi-exit architectures can overcome the overthinking issue and distribute a large DNN model on resource-constrained scenarios (e.g. IoT frontend devices and backend servers) for inference and transmission efficiency. Nevertheless, intellectual property (IP) protection for the multi-exit models in the wild is still an unsolved challenge. Previous efforts to verify DNN model ownership mainly rely on querying the model with specific samples and checking the responses, e.g., DNN watermarking and fingerprinting. However, they are vulnerable to adversarial settings such as adversarial training and are not suitable for the IP verification for multi-exit DNN models. In this paper, we propose a novel approach to fingerprint multi-exit models via inference time rather than inference predictions. Specifically, we design an effective method to generate a set of fingerprint samples to craft the inference process with a unique and robust inference time cost as the evidence for model ownership. 
We conduct extensive experiments to prove the uniqueness and robustness of our method on three structures (ResNet-56, VGG-16, and MobileNet) and three datasets (CIFAR-10, CIFAR-100, and Tiny-ImageNet) under comprehensive adversarial settings.", "keywords": "Adversarial Machine Learning;DNN Watermarking;DNN Fingerprinting;Intellectual Property Protection;Multi-exit Models;Robustness", "primary_area": "", "supplementary_material": "", "author": "Tian Dong;Han Qiu;Tianwei Zhang;Jiwei Li;Hewu Li;Jialiang LU", "authorids": "~Tian_Dong1;~Han_Qiu3;~Tianwei_Zhang1;~Jiwei_Li1;lihewu@cernet.edu.cn;~Jialiang_LU1", "gender": "Not Specified;M;M;M;;M", "homepage": "https://chichidd.github.io/;https://qiuhan.info;https://personal.ntu.edu.sg/tianwei.zhang/index.html;https://nlp.stanford.edu/~bdlijiwei/;;http://speit.sjtu.edu.cn/index!people.html?sideNav=366&teacherId=5&colorid=3", "dblp": "25/8475.html;15/4507-1;77/7902-4;73/5746-1;;32/4850", "google_scholar": "3SpfUgIAAAAJ;https://scholar.google.fr/citations?user=6JWNv6gAAAAJ;9vpiYDIAAAAJ;PwU16JEAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Tian_Dong1;~Han_Qiu3;~Tianwei_Zhang1;~Jiwei_Li1;lihewu@cernet.edu.cn;~Jialiang_LU1", "aff": "Shanghai Jiaotong University;Tsinghua University;Nanyang Technological University;Zhejiang University;;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;tsinghua.edu.cn;ntu.edu.sg;zju.edu.cn;;sjtu.edu.cn", "position": "MS student;Assistant Professor;Assistant Professor;Assistant Professor;;Associate Professor", "bibtex": "@misc{\ndong2022fingerprinting,\ntitle={Fingerprinting Multi-exit Deep Neural Network Models via Inference Time},\nauthor={Tian Dong and Han Qiu and Tianwei Zhang and Jiwei Li and Hewu Li and Jialiang LU},\nyear={2022},\nurl={https://openreview.net/forum?id=pqD4hEOH2NW}\n}", "github": "", "project": "", "reviewers": "aZtn;ZS2L;vWLU;ntBm", "site": "https://openreview.net/forum?id=pqD4hEOH2NW", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;4", "correctness": "3;2;2;4", "technical_novelty": "2;1;3;2", "empirical_novelty": "3;2;0;2", "wc_summary_paper": "62;68;25;37", "wc_summary_review": "34;49;37;76", "wc_main_review": "344;686;465;127", "wc_review": "440;803;527;240", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 48.0, 17.649362594722792 ], "wc_summary_review_avg": [ 49.0, 16.56804152578089 ], "wc_main_review_avg": [ 405.5, 202.2158500217033 ], "wc_review_avg": [ 502.5, 202.30731573524474 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4513979188484331173&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Shanghai Jiao Tong University;Tsinghua University;Nanyang Technological University;Zhejiang University", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.sjtu.edu.cn;https://www.tsinghua.edu.cn;https://www.ntu.edu.sg;https://www.zju.edu.cn", "aff_unique_abbr": "SJTU;THU;NTU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;Singapore" }, { "id": "prGV5dvPYy", "title": "Gradient flows on the feature-Gaussian manifold", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The scarcity of labeled data is a long-standing challenge for cross-domain machine learning tasks. This paper leverages the existing dataset (i.e., source) to augment new samples that are close to the dataset of interest (i.e., target). To relieve the need to learn a metric on the feature-label space, we lift both datasets to the space of probability distributions on the feature-Gaussian manifold, and then develop a gradient flow that minimizes the maximum mean discrepancy loss. To perform the gradient flow of distributions on the curved feature-Gaussian space, we unravel the Riemannian structure of the space and compute explicitly the Riemannian gradient of the loss function induced by the optimal transport metric. For practical purposes, we also propose a discretized flow, and provide conditional results guaranteeing the global convergence of the flow to the optimum. We illustrate the results of our proposed gradient flow method in several real-world datasets.", "keywords": "gradient flow;feature-Gaussian manifold;MMD;hierarchical optimal transport", "primary_area": "", "supplementary_material": "/attachment/273512d1e5feedca02c28e922b8e8a5e8d7ce221.zip", "author": "Truyen Nguyen;Xinru Hua;Tam Le;Jose Blanchet;Viet Anh Nguyen", "authorids": "~Truyen_Nguyen1;~Xinru_Hua1;~Tam_Le2;~Jose_Blanchet1;~Viet_Anh_Nguyen2", "gender": "M;F;M;M;M", "homepage": "https://sites.google.com/site/truyennguyen3/;;https://tamle-ml.github.io/;https://web.stanford.edu/~jblanche/;http://www.vietanhnguyen.net", "dblp": "94/9501.html;;137/4218;75/5093.html;", "google_scholar": "D6LuRHsAAAAJ;6V5aaYEAAAAJ;ZyrRB_8AAAAJ;https://scholar.google.co.in/citations?user=O24CcQQAAAAJ;3iyf-EoAAAAJ", "orcid": ";;;;", "linkedin": ";;lttam;jose-blanchet;", "or_profile": "~Truyen_Nguyen1;~Xinru_Hua1;~Tam_Le2;~Jose_Blanchet1;~Viet_Anh_Nguyen2", "aff": "University of Akron;Stanford University;RIKEN AIP;Stanford University;VinAI Research, Vietnam", "aff_domain": "uakron.edu;stanford.edu;riken.jp;stanford.edu;vinai.io", "position": "Full Professor;PhD student;Research Scientist;Professor;Research Scientist", "bibtex": "@misc{\nnguyen2022gradient,\ntitle={Gradient flows on the feature-Gaussian manifold},\nauthor={Truyen Nguyen and Xinru Hua and Tam Le and Jose Blanchet and Viet Anh Nguyen},\nyear={2022},\nurl={https://openreview.net/forum?id=prGV5dvPYy}\n}", "github": "", "project": "", "reviewers": "5jgr;hdE6;XUgZ;W5aM", "site": "https://openreview.net/forum?id=prGV5dvPYy", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;1;3;1", "wc_summary_paper": "38;171;43;99", "wc_summary_review": "150;31;18;24", "wc_main_review": "762;543;175;505", "wc_review": "950;745;236;628", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 
0.82915619758885 ], "wc_summary_paper_avg": [ 87.75, 53.699976722527545 ], "wc_summary_review_avg": [ 55.75, 54.609408530032624 ], "wc_main_review_avg": [ 496.25, 209.81345881520565 ], "wc_review_avg": [ 639.75, 260.040742000172 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2NX34NCtDVAJ:scholar.google.com/&scioq=Gradient+flows+on+the+feature-Gaussian+manifold&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "University of Akron;Stanford University;RIKEN;VinAI Research", "aff_unique_dep": ";;Advanced Institute for Computational Science;", "aff_unique_url": "https://www.uakron.edu;https://www.stanford.edu;https://www.aip.riken.jp;https://www.vin.ai", "aff_unique_abbr": "UA;Stanford;RIKEN AIP;VinAI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;0;2", "aff_country_unique": "United States;Japan;Vietnam" }, { "id": "psNSQsmd4JI", "title": "Containerized Distributed Value-Based Multi-Agent Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-agent reinforcement learning tasks put a high demand on the volume of training samples. Different from its single-agent counterpart, distributed value-based multi-agent reinforcement learning faces the unique challenges of demanding data transfer, inter-process communication management, and a high requirement for exploration. We propose a containerized learning framework to solve these problems. We pack into a container several environment instances, a local learner and buffer, and a carefully designed multi-queue manager that avoids blocking. Local policies of each container are encouraged to be as diverse as possible, and only trajectories with the highest priority are sent to a global learner. In this way, we achieve a scalable, time-efficient, and diverse distributed MARL learning framework with high system throughput. To our knowledge, our method is the first to solve the challenging Google Research Football full game $\mathtt{5\_v\_5}$. 
On the StarCraft II micromanagement benchmark, our method gets 4-18$\\times$ better results compared to state-of-the-art non-distributed MARL algorithms.", "keywords": "Multi-agent reinforcement learning;Distributed reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/4fb4eab1250bf43b779c23ea34474c29737c9071.zip", "author": "Siyang Wu;Tonghan Wang;Chenghao Li;Chongjie Zhang", "authorids": "~Siyang_Wu1;~Tonghan_Wang1;~Chenghao_Li1;~Chongjie_Zhang1", "gender": "M;M;M;", "homepage": "https://nj-wusiyang.github.io/;https://tonghanwang.github.io/;;", "dblp": ";175/6039-1.html;;29/6693", "google_scholar": "O507FCgAAAAJ;-AR1yc4AAAAJ;_z3byK8AAAAJ;LjxqXycAAAAJ", "orcid": "0000-0002-1757-8197;;;", "linkedin": ";;;", "or_profile": "~Siyang_Wu1;~Tonghan_Wang1;~Chenghao_Li1;~Chongjie_Zhang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Undergrad student;MS student;PhD student;Assistant Professor", "bibtex": "@misc{\nwu2022containerized,\ntitle={Containerized Distributed Value-Based Multi-Agent Reinforcement Learning},\nauthor={Siyang Wu and Tonghan Wang and Chenghao Li and Chongjie Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=psNSQsmd4JI}\n}", "github": "", "project": "", "reviewers": "Qy4m;axQN;uCBF;HX1e", "site": "https://openreview.net/forum?id=psNSQsmd4JI", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;3;3;4", "correctness": "3;2;3;2", "technical_novelty": "3;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "36;111;58;86", "wc_summary_review": "43;119;46;33", "wc_main_review": "679;685;232;557", "wc_review": "758;915;336;676", "wc_reply_reviewers": "228;0;0;28", "wc_reply_authors": "539;701;437;804", "reply_reviewers": "1;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 72.75, 28.314086600135983 ], "wc_summary_review_avg": [ 60.25, 34.259122872601395 ], "wc_main_review_avg": [ 538.25, 184.0426241390836 ], "wc_review_avg": [ 671.25, 211.75147579178758 ], "wc_reply_reviewers_avg": [ 64.0, 95.37295214053091 ], "wc_reply_authors_avg": [ 620.25, 141.83330885232849 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1697410789313122994&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "psQ6wcNXjS1", "title": "EBM Life Cycle: MCMC Strategies for Synthesis, Defense, and Density Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work presents strategies to learn an Energy-Based Model (EBM) according to the desired length of its MCMC sampling trajectories. MCMC trajectories of different lengths correspond to models with different purposes. 
Our experiments cover three different trajectory magnitudes and learning outcomes: 1) shortrun sampling for image generation; 2) midrun sampling for classifier-agnostic adversarial defense; and 3) longrun sampling for principled modeling of image probability densities. To achieve these outcomes, we introduce three novel methods of MCMC initialization for negative samples used in Maximum Likelihood (ML) learning. With standard network architectures and an unaltered ML objective, our MCMC initialization methods alone enable significant performance gains across the three applications that we investigate. Our results include state-of-the-art FID scores for unnormalized image densities on the CIFAR-10 and ImageNet datasets; state-of-the-art adversarial defense on CIFAR-10 among purification methods and the first EBM defense on ImageNet; and scalable techniques for learning valid probability densities.", "keywords": "energy-based model;MCMC sampling;Langevin sampling;generative modeling;unsupervised learning;image synthesis;adversarial defense;density estimation", "primary_area": "", "supplementary_material": "/attachment/ff04d2a4095e803026d019d972ac2e091322ae03.zip", "author": "Mitch Hill;Jonathan Craig Mitchell;Chu Chen;Yuan Du;Mubarak Shah;Song-Chun Zhu", "authorids": "~Mitch_Hill1;~Jonathan_Craig_Mitchell1;~Chu_Chen1;~Yuan_Du1;~Mubarak_Shah3;~Song-Chun_Zhu1", "gender": "M;M;F;F;M;M", "homepage": ";http://jonathancmitchell.github.io/;https://github.com/voidflight;https://yuan-du.com/;https://www.crcv.ucf.edu/person/mubarak-shah/;https://zhusongchun.net/", "dblp": "217/3317;;;;s/MubarakShah;10/10313", "google_scholar": "ycEHnWoAAAAJ;6HTl6wIAAAAJ;;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com.tw/citations?user=p8gsO3gAAAAJ;https://scholar.google.com.tw/citations?user=Al8dyb4AAAAJ", "orcid": ";;;0000-0003-2830-6960;0000-0002-8216-1128;", "linkedin": ";jonathancmitchell/;;;mubarak-shah-b6aa68213/;", "or_profile": "~Mitch_Hill1;~Jonathan_Craig_Mitchell1;~Chu_Chen1;~Yuan_Du1;~Mubarak_Shah3;~Song-Chun_Zhu1", "aff": "University of Central Florida;University of California, Los Angeles;University of Arizona;University of Central Florida;University of Central Florida;Peking University", "aff_domain": "ucf.edu;ucla.edu;arizona.edu;ucf.edu;ucf.edu;pku.edu.cn", "position": "Assistant Professor;PhD student;Undergrad student;PhD student;Full Professor;Full Professor", "bibtex": "@misc{\nhill2022ebm,\ntitle={{EBM} Life Cycle: {MCMC} Strategies for Synthesis, Defense, and Density Modeling},\nauthor={Mitch Hill and Jonathan Craig Mitchell and Chu Chen and Yuan Du and Mubarak Shah and Song-Chun Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=psQ6wcNXjS1}\n}", "github": "", "project": "", "reviewers": "PEpN;8SkN;oMaP;gMv3", "site": "https://openreview.net/forum?id=psQ6wcNXjS1", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;1;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "80;41;83;232", "wc_summary_review": "55;49;31;45", "wc_main_review": "855;599;143;654", "wc_review": "990;689;257;931", "wc_reply_reviewers": "0;0;0;28", "wc_reply_authors": "1084;709;264;955", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 109.0, 
72.92119033586876 ], "wc_summary_review_avg": [ 45.0, 8.831760866327848 ], "wc_main_review_avg": [ 562.75, 260.4038930200545 ], "wc_review_avg": [ 716.75, 288.4045552691566 ], "wc_reply_reviewers_avg": [ 7.0, 12.12435565298214 ], "wc_reply_authors_avg": [ 753.0, 312.81863755217654 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cijIM6YtYrkJ:scholar.google.com/&scioq=EBM+Life+Cycle:+MCMC+Strategies+for+Synthesis,+Defense,+and+Density+Modeling&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "University of Central Florida;University of California, Los Angeles;University of Arizona;Peking University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ucf.edu;https://www.ucla.edu;https://www.arizona.edu;http://www.pku.edu.cn", "aff_unique_abbr": "UCF;UCLA;UA;Peking U", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "COPA: Certifying Robust Policies for Offline Reinforcement Learning against Poisoning Attacks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6853", "id": "psh0oeMSBiF", "poster": "", "openreview": "https://openreview.net/forum?id=psh0oeMSBiF", "slides": "https://iclr.cc/virtual/2022/poster/6853", "video": "https://iclr.cc/virtual/2022/poster/6853", "author_site": "Fan Wu, Linyi Li, Huan Zhang, Bhavya Kailkhura, Krishnaram Kenthapadi, DING ZHAO, Bo Li", "tldr": "", "abstract": "As reinforcement learning (RL) has achieved near human-level performance in a variety of tasks, its robustness has attracted great attention. While a vast body of research has explored test-time (evasion) attacks in RL and corresponding defenses, its robustness against training-time (poisoning) attacks remains largely unexplored. In this work, we focus on certifying the robustness of offline RL in the presence of poisoning attacks, where a subset of training trajectories could be arbitrarily manipulated. We propose the first certification framework, COPA, to certify the number of poisoning trajectories that can be tolerated regarding different certification criteria. Given the complex structure of RL, we propose two certification criteria: per-state action stability and cumulative reward bound. To further improve the certification, we propose new partition and aggregation protocols to train robust policies. We further prove that some of the proposed certification methods are theoretically tight and some are NP-Complete problems. We leverage COPA to certify three RL environments trained with different algorithms and conclude: (1) The proposed robust aggregation protocols such as temporal aggregation can significantly improve the certifications; (2) Our certifications for both per-state action stability and cumulative reward bound are efficient and tight; (3) The certifications for different training algorithms and environments are different, implying their intrinsic robustness properties. 
All experimental results are available at https://copa-leaderboard.github.io.", "keywords": "certified robustness;poisoning attacks;reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/1385e87d991dcb64b971334c8597727cd87c2d4c.zip", "author": "Fan Wu;Linyi Li;Huan Zhang;Bhavya Kailkhura;Krishnaram Kenthapadi;Ding Zhao;Bo Li", "authorids": "~Fan_Wu6;~Linyi_Li1;~Huan_Zhang1;~Bhavya_Kailkhura1;~Krishnaram_Kenthapadi1;~Ding_Zhao1;~Bo_Li19", "gender": "F;M;M;M;M;;F", "homepage": ";http://linyil.com;http://huan-zhang.com;https://people.llnl.gov/kailkhura1;https://cs.stanford.edu/people/kngk/;https://safeai-lab.github.io;http://boli.cs.illinois.edu/", "dblp": "07/6378-11;99/4340-1.html;23/1797-1.html;132/8938;29/4781;;50/3402-26", "google_scholar": "qd8WzBMAAAAJ;-b0sk-YAAAAJ;LTa3GzEAAAAJ;SQpJmOgAAAAJ;av5rGaEAAAAJ;z7tPc9IAAAAJ;K8vJkTcAAAAJ", "orcid": ";;;;0000-0003-1237-087X;;", "linkedin": ";;;;krishnaramkenthapadi/;;", "or_profile": "~Fan_Wu6;~Linyi_Li1;~Huan_Zhang1;~Bhavya_Kailkhura1;~Krishnaram_Kenthapadi1;~Ding_Zhao1;~Bo_Li19", "aff": "University of Illinois, Urbana Champaign;Microsoft Research;Carnegie Mellon University;Lawrence Livermore National Laboratory;Fiddler AI;Carnegie Mellon University;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;microsoft.com;cmu.edu;llnl.gov;fiddler.ai;cmu.edu;illinois.edu", "position": "PhD student;Research Intern;Postdoc;Research Staff;Chief Scientist;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwu2022copa,\ntitle={{COPA}: Certifying Robust Policies for Offline Reinforcement Learning against Poisoning Attacks},\nauthor={Fan Wu and Linyi Li and Huan Zhang and Bhavya Kailkhura and Krishnaram Kenthapadi and Ding Zhao and Bo Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=psh0oeMSBiF}\n}", "github": "", "project": "", "reviewers": "zvac;6rjC;xuEG;RQX2", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "2;4;4;2", "correctness": "4;3;3;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "4;2;3;3", "wc_summary_paper": "108;74;69;104", "wc_summary_review": "32;45;9;99", "wc_main_review": "359;656;230;601", "wc_review": "499;775;308;804", "wc_reply_reviewers": "0;1110;0;0", "wc_reply_authors": "770;4974;870;2191", "reply_reviewers": "0;4;0;0", "reply_authors": "2;10;2;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 88.75, 17.397916541931107 ], "wc_summary_review_avg": [ 46.25, 33.070946463625745 ], "wc_main_review_avg": [ 461.5, 174.20462106385122 ], "wc_review_avg": [ 596.5, 204.72969984836104 ], "wc_reply_reviewers_avg": [ 277.5, 480.64409910036346 ], "wc_reply_authors_avg": [ 2201.25, 1696.242225479604 ], "reply_reviewers_avg": [ 1.0, 1.7320508075688772 ], "reply_authors_avg": [ 4.5, 3.278719262151 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11901953356085311316&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=psh0oeMSBiF", "email": "illinois.edu;microsoft.com;cmu.edu;llnl.gov;fiddler.ai;cmu.edu;illinois.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;4;2;0", 
"aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft;Carnegie Mellon University;Lawrence Livermore National Laboratory;Fiddler AI", "aff_unique_dep": ";Microsoft Research;;;", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com/en-us/research;https://www.cmu.edu;https://www.llnl.gov;https://www.fiddler.ai", "aff_unique_abbr": "UIUC;MSR;CMU;LLNL;Fiddler AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "ptZfV8tJbpe", "title": "Modeling label correlations implicitly through latent label encodings for multi-label text classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-label text classification (MLTC) aims to assign a set of labels to each given document. Unlike single-label text classification methods that often focus on document representation learning, MLTC faces a key challenge of modeling label correlations due to complex label dependencies. Previous state-of-the-art works model label correlations explicitly. It lacks flexibility and is prone to introduce inductive bias that may not always hold, such as label-correlation simplification, sequencing label sets, and label-correlation overload. To address this issue, this paper uses latent label representations to model label correlations implicitly. Specifically, the proposed method concatenates a set of latent labels (instead of actual labels) to the text tokens, inputs them to BERT, then maps the contextual encodings of these latent labels to actual labels cooperatively. The correlations between labels, and between labels and the text are modeled indirectly through these latent-label encodings and their correlations. Such latent and distributed correlation modeling can impose less a priori limits and provide more flexibility. The method is conceptually simple but quite effective. It improves the state-of-the-art results on two widely used benchmark datasets by a large margin. Further experiments demonstrate that its effectiveness lies in label-correlation utilization rather than document representation. Feature study reveals the importance of using latent label embeddings. It also reveals that contrary to the other token embeddings, the embeddings of these latent labels are sensitive to tasks; sometimes pretraining them can lead to significant performance loss rather than promotion. 
This result suggests that they are more related to task information (i.e., the actual labels) than the other tokens.", "keywords": "multi-label text classification;multi-label classification;label correlation;label embedding", "primary_area": "", "supplementary_material": "/attachment/2e24e2595b2fc5c48088254f8ff7edd3a7d695a3.zip", "author": "Zhizhong Zeng;Yufen Liu;Wenpeng Gao;Baihong Li;Ting Zhang;Xinguo Yu;Zongkai Yang", "authorids": "~Zhizhong_Zeng1;~Yufen_Liu1;~Wenpeng_Gao4;~Baihong_Li1;~Ting_Zhang8;xgyu@mail.ccnu.edu.cn;zkyang@mail.ccnu.edu.cn", "gender": "M;;M;M;F;;", "homepage": "http://foaie.ccnu.edu.cn/info/1102/3358.htm;;https://github.com/GWPunk/;https://www.cnblogs.com/liberhome/;http://foaie.ccnu.edu.cn/info/1104/2422.htm;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Zhizhong_Zeng1;~Yufen_Liu1;~Wenpeng_Gao4;~Baihong_Li1;~Ting_Zhang8;xgyu@mail.ccnu.edu.cn;zkyang@mail.ccnu.edu.cn", "aff": "Central China Normal University;;Central China Normal University;Central China Normal University;Central China Normal University;;", "aff_domain": "ccnu.edu.cn;;ccnu.edu.cn;ccnu.edu.cn;ccnu.edu;;", "position": "Associate Professor;;MS student;MS student;Assistant Professor;;", "bibtex": "@misc{\nzeng2022modeling,\ntitle={Modeling label correlations implicitly through latent label encodings for multi-label text classification},\nauthor={Zhizhong Zeng and Yufen Liu and Wenpeng Gao and Baihong Li and Ting Zhang and Xinguo Yu and Zongkai Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=ptZfV8tJbpe}\n}", "github": "", "project": "", "reviewers": "fS9V;BUhV;FMut;82Sn;8DdA", "site": "https://openreview.net/forum?id=ptZfV8tJbpe", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;4;4;4;4", "correctness": "3;3;2;2;3", "technical_novelty": "2;2;1;1;3", "empirical_novelty": "1;2;2;2;3", "wc_summary_paper": "128;59;100;53;59", "wc_summary_review": "24;44;21;75;12", "wc_main_review": "376;225;315;135;264", "wc_review": "528;328;436;263;335", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 1.8, 0.7483314773547883 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 79.8, 29.376180827330156 ], "wc_summary_review_avg": [ 35.2, 22.480213522117623 ], "wc_main_review_avg": [ 263.0, 81.61127373102322 ], "wc_review_avg": [ 378.0, 93.20729585177332 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:edh48UVOhhQJ:scholar.google.com/&scioq=Modeling+label+correlations+implicitly+through+latent+label+encodings+for+multi-label+text+classification&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Central China Normal University", "aff_unique_dep": "", "aff_unique_url": "http://www.ccnu.edu.cn", "aff_unique_abbr": "CCNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "ptxGmKMLH_", "title": "A Closer Look at Prototype Classifier for Few-shot Image 
Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "The prototypical network is a prototype classifier based on meta-learning and is widely used for few-shot learning because it classifies unseen examples by constructing class-specific prototypes without adjusting hyper-parameters during meta-testing.\nInterestingly, recent research has attracted a lot of attention, showing that a linear classifier with fine-tuning, which does not use a meta-learning algorithm, performs comparably with the prototypical network.\nHowever, fine-tuning requires additional hyper-parameters when adapting a model to a new environment. In addition, although the purpose of few-shot learning is to enable the model to quickly adapt to a new environment, fine-tuning needs to be applied every time a new class appears, making fast adaptation difficult.\nIn this paper, \nwe analyze how a prototype classifier works equally well without fine-tuning and meta-learning.\nWe experimentally found that directly using the feature vector extracted using standard pre-trained models to construct a prototype classifier in meta-testing does not perform as well as the prototypical network and linear classifiers with fine-tuning and feature vectors of pre-trained models.\nThus, we derive a novel generalization bound for the prototypical network and show that focusing on the variance of the norm of a feature vector can improve performance.\nWe experimentally investigated several normalization methods for minimizing the variance of the norm and found that the same performance can be obtained by using the L2 normalization and embedding space transformation without fine-tuning or meta-learning.", "keywords": "few-shot;meta-learning;prototypical network;fine-tuning;prototypical clasifier", "primary_area": "", "supplementary_material": "", "author": "Mingcheng Hou;Issei Sato", "authorids": "~Mingcheng_Hou1;sato@g.ecc.u-tokyo.ac.jp", "gender": ";", "homepage": "https://www.ml.is.s.u-tokyo.ac.jp/members-en;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Mingcheng_Hou1;sato@g.ecc.u-tokyo.ac.jp", "aff": "The University of Tokyo, Tokyo Institute of Technology;", "aff_domain": "u-tokyo.ac.jp;", "position": "MS student;", "bibtex": "@misc{\nhou2022a,\ntitle={A Closer Look at Prototype Classifier for Few-shot Image Classification},\nauthor={Mingcheng Hou and Issei Sato},\nyear={2022},\nurl={https://openreview.net/forum?id=ptxGmKMLH_}\n}", "github": "", "project": "", "reviewers": "M6Y5;8nVE;krD7", "site": "https://openreview.net/forum?id=ptxGmKMLH_", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;4", "correctness": "4;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "191;72;116", "wc_summary_review": "47;46;48", "wc_main_review": "820;174;361", "wc_review": "1058;292;525", "wc_reply_reviewers": "354;62;264", "wc_reply_authors": "2753;609;1308", "reply_reviewers": "2;1;2", "reply_authors": "5;1;3", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 126.33333333333333, 49.12795085850371 ], "wc_summary_review_avg": [ 47.0, 0.816496580927726 ], "wc_main_review_avg": [ 451.6666666666667, 271.4090803361024 ], "wc_review_avg": [ 625.0, 320.61295461454245 ], "wc_reply_reviewers_avg": [ 226.66666666666666, 
122.0965009963658 ], "wc_reply_authors_avg": [ 1556.6666666666667, 892.7710917263295 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5355821176273735692&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "id": "pu-8VNGljir", "title": "Learning to Affiliate: Mutual Centralized Learning for Few-shot Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Few-shot learning (FSL) aims to learn a classifier that can be easily adapted to accommodate new tasks not seen during training, given only a few examples. To handle the limited-data problem in few-shot regimes, recent methods tend to collectively use a set of local features to densely represent an image instead of using a mixed global feature. They generally explore a unidirectional query-to-support paradigm in FSL, e.g., find the nearest/optimal support feature for each query feature and aggregate these local matches for a joint classification. In this paper, we propose a new method, \emph{Mutual Centralized Learning} (MCL), to fully affiliate the two disjoint sets of dense features in a bidirectional paradigm. We associate each local feature with a particle that can bidirectionally random walk in a discrete feature space by the affiliations. To estimate the class probability, we propose the features' accessibility that measures the expected number of visits to the support features of that class in a Markov process. We relate our method to learning a centrality on an affiliation network and demonstrate its capability to be plugged into existing methods by highlighting centralized local features. 
Experiments show that our method achieves the state-of-the-art on both miniImageNet and tieredImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Liu;Weifeng Zhang;Chao Xiang;Tu Zheng;Deng Cai;Zheng Yang;Xiaofei He", "authorids": "~Yang_Liu42;~Weifeng_Zhang3;~Chao_Xiang1;~Tu_Zheng1;~Deng_Cai4;~Zheng_Yang2;~Xiaofei_He2", "gender": "M;M;M;;M;M;M", "homepage": ";;;http://person.zjulearning.org.cn/tuzheng/;http://www.cad.zju.edu.cn/home/dengcai/;https://www.linkedin.com/in/zheng-yang-5455774b/;https://person.zju.edu.cn/0007101", "dblp": "51/3710-212.html;;https://dblp.dagstuhl.de/pid/154/8208.html;229/4199;c/DCai;59/5806-8;h/XiaofeiHe.html", "google_scholar": "DzQAV2gAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;;vzxDyJoAAAAJ;y8b7ARgAAAAJ;QLLFowsAAAAJ", "orcid": ";;;;;0009-0009-2840-2494;0009-0001-9107-2354", "linkedin": ";;;;;;", "or_profile": "~Yang_Liu42;~Weifeng_Zhang3;~Chao_Xiang1;~Tu_Zheng1;~Deng_Cai4;~Zheng_Yang2;~Xiaofei_He2", "aff": "Zhejiang University;;Zhejiang University;Fabu;Zhejiang University;Fabu Inc;Zhejiang University", "aff_domain": "zju.edu.cn;;zju.edu.cn;fabu.ai;zju.edu.cn;fabu.ai;zju.edu.cn", "position": "PhD student;;PhD student;Researcher;Professor;CTO;Professor", "bibtex": "@misc{\nliu2022learning,\ntitle={Learning to Affiliate: Mutual Centralized Learning for Few-shot Classification},\nauthor={Yang Liu and Weifeng Zhang and Chao Xiang and Tu Zheng and Deng Cai and Zheng Yang and Xiaofei He},\nyear={2022},\nurl={https://openreview.net/forum?id=pu-8VNGljir}\n}", "github": "", "project": "", "reviewers": "in5L;1hus;aV91;9aup", "site": "https://openreview.net/forum?id=pu-8VNGljir", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "129;36;85;81", "wc_summary_review": "62;14;61;17", "wc_main_review": "328;209;209;319", "wc_review": "519;259;355;417", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 82.75, 32.91181398829302 ], "wc_summary_review_avg": [ 38.5, 23.02715787933891 ], "wc_main_review_avg": [ 266.25, 57.33835976028613 ], "wc_review_avg": [ 387.5, 94.51322658760519 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14447790618194656650&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "Zhejiang University;Fabu;Fabu Inc", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;;", "aff_unique_abbr": "ZJU;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;2;0", "aff_country_unique": "China;;United States" }, { "title": "Target-Side Input Augmentation for Sequence to Sequence Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5922", "id": "pz1euXohm4H", "poster": "", "openreview": 
"https://openreview.net/forum?id=pz1euXohm4H", "slides": "https://iclr.cc/virtual/2022/poster/5922", "video": "https://iclr.cc/virtual/2022/poster/5922", "author_site": "Shufang Xie, Ang Lv, Yingce Xia, Lijun Wu, Tao Qin, Tie-Yan Liu, Rui Yan", "tldr": "", "abstract": "Autoregressive sequence generation, a prevalent task in machine learning and natural language processing, generates every target token conditioned on both a source input and previously generated target tokens. Previous data augmentation methods, which have been shown to be effective for the task, mainly enhance source inputs (e.g., injecting noise into the source sequence by random swapping or masking, back translation, etc.) while overlooking the target-side augmentation. In this work, we propose a target-side augmentation method for sequence generation. In training, we use the decoder output probability distributions as soft indicators, which are multiplied with target token embeddings, to build pseudo tokens. These soft pseudo tokens are then used as target tokens to enhance the training. We conduct comprehensive experiments on various sequence generation tasks, including dialog generation, machine translation, and abstractive summarization. Without using any extra labeled data or introducing additional model parameters, our method significantly outperforms strong baselines. The code is available at https://github.com/TARGET-SIDE-DATA-AUG/TSDASG.", "keywords": "Sequence Gerneration;Data Augmentation", "primary_area": "", "supplementary_material": "", "author": "Shufang Xie;Ang Lv;Yingce Xia;Lijun Wu;Tao Qin;Tie-Yan Liu;Rui Yan", "authorids": "~Shufang_Xie1;~Ang_Lv1;~Yingce_Xia1;~Lijun_Wu1;~Tao_Qin1;~Tie-Yan_Liu1;~Rui_Yan2", "gender": "M;M;M;M;M;M;M", "homepage": ";https://trestad.github.io;https://www.microsoft.com/en-us/research/people/yinxia/;https://apeterswu.github.io/;https://www.microsoft.com/en-us/research/people/taoqin/;http://member.acm.org/~tieyanliu;https://gsai.ruc.edu.cn/english/ruiyan", "dblp": "https://dblp.uni-trier.de/pid/163/2704-3;326/5506;http://dblp.uni-trier.de/pers/hd/x/Xia:Yingce;68/1284-3;14/6841;l/TieYanLiu;19/2405-1", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;GS5wRxYAAAAJ;https://scholar.google.com/citations?hl=en;Bl4SRU0AAAAJ;Nh832fgAAAAJ;eLw6g-UAAAAJ", "orcid": ";0000-0002-8027-2270;;0000-0002-3530-590X;;0000-0002-0476-8020;0000-0002-3356-6823", "linkedin": ";;;lijun-wu-59340478/;;;", "or_profile": "~Shufang_Xie1;~Ang_Lv1;~Yingce_Xia1;~Lijun_Wu1;~Tao_Qin1;~Tie-Yan_Liu1;~Rui_Yan2", "aff": "Renmin University of China;Renmin University of China;Microsoft;Microsoft Research;Microsoft Research Asia;Microsoft;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;microsoft.com;microsoft.com;microsoft.com;microsoft.com;ruc.edu.cn", "position": "PhD student;PhD student;Researcher;Researcher;Principal Researcher;Distinguished Scientist;Associate Professor", "bibtex": "@inproceedings{\nxie2022targetside,\ntitle={Target-Side Input Augmentation for Sequence to Sequence Generation},\nauthor={Shufang Xie and Ang Lv and Yingce Xia and Lijun Wu and Tao Qin and Tie-Yan Liu and Rui Yan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=pz1euXohm4H}\n}", "github": "", "project": "", "reviewers": "k7SM;ojZ7;vrEs;iLjj", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": 
"91;97;99;137", "wc_summary_review": "34;146;61;47", "wc_main_review": "314;435;215;354", "wc_review": "439;678;375;538", "wc_reply_reviewers": "479;0;108;176", "wc_reply_authors": "1366;997;429;471", "reply_reviewers": "3;0;1;1", "reply_authors": "3;2;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 106.0, 18.138357147217054 ], "wc_summary_review_avg": [ 72.0, 43.777848279695064 ], "wc_main_review_avg": [ 329.5, 79.18491017864451 ], "wc_review_avg": [ 507.5, 114.29020080479341 ], "wc_reply_reviewers_avg": [ 190.75, 177.86142780265766 ], "wc_reply_authors_avg": [ 815.75, 388.6047960331936 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14317568997077070576&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=pz1euXohm4H", "email": "ruc.edu.cn;ruc.edu.cn;microsoft.com;microsoft.com;microsoft.com;microsoft.com;ruc.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;1;1;1;0", "aff_unique_norm": "Renmin University of China;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "http://www.ruc.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "RUC;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;1;0;1;0", "aff_country_unique": "China;United States" }, { "id": "pzgENfIRBil", "title": "Self-consistent Gradient-like Eigen Decomposition in Solving Schr\u00f6dinger Equations", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Schr\u00f6dinger equation is at the heart of modern quantum mechanics. Since exact solutions of the ground state are typically intractable, standard approaches approximate Schr\u00f6dinger's equation as forms of nonlinear generalized eigenvalue problems $F(V)V = SV\\Lambda$ in which $F(V)$, the matrix to be decomposed, is a function of its own top-$k$ smallest eigenvectors $V$, leading to a ``self-consistency problem''. Traditional iterative methods heavily rely on high-quality initial guesses of $V$ generated via domain-specific heuristics methods based on quantum mechanics. In this work, we eliminate such a need for domain-specific heuristics by presenting a novel framework, Self-consistent Gradient-like Eigen Decomposition (SCGLED) that regards $F(V)$ as a special ``online data generator'', thus allows gradient-like eigendecomposition methods in streaming $k$-PCA to approach the self-consistency of the equation from scratch in an iterative way similar to online learning. With several critical numerical improvements, SCGLED is robust to initial guesses, free of quantum-mechanism-based heuristics designs, and neat in implementation. 
Our experiments show that it can not only replace traditional heuristics-based initial-guess methods with a large performance advantage (on average 25x more precise than the best baseline in similar wall time), but also find highly precise solutions independently, without any traditional iterative methods.", "keywords": "Schr\u00f6dinger Equation;Hartree-Fock;Self-consistent;Eigen-decomposition;online learning;stochastic k-PCA;Oja's Algorithm;EigenGame", "primary_area": "", "supplementary_material": "/attachment/d479a9c3d80d15ba9a9b29118655ba7e554a5978.zip", "author": "Xihan Li;Xiang Chen;Rasul Tutunov;Haitham Bou Ammar;Lei Wang;Jun Wang", "authorids": "~Xihan_Li1;~Xiang_Chen8;~Rasul_Tutunov3;~Haitham_Bou_Ammar1;~Lei_Wang2;~Jun_Wang2", "gender": "M;;;M;;M", "homepage": "https://snowkylin.github.io/;;;;;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "81/4133-1.html;;;;;w/JunWang12", "google_scholar": "2Y-QNGEAAAAJ;2cj3OTIAAAAJ;Zcov4c4AAAAJ;https://scholar.google.co.uk/citations?user=AE5suDoAAAAJ;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": "0000-0002-7000-7983;;;;;", "linkedin": ";;;;;", "or_profile": "~Xihan_Li1;~Xiang_Chen8;~Rasul_Tutunov3;~Haitham_Bou_Ammar1;~Lei_Wang2;~Jun_Wang2", "aff": "University College London;Huawei Technologies Ltd.;;Huawei R&D UK;;University College London", "aff_domain": "ucl.ac.uk;huawei.com;;huawei.com;;ucl.ac.uk", "position": "PhD student;Researcher;;Principal Researcher;;Professor", "bibtex": "@misc{\nli2022selfconsistent,\ntitle={Self-consistent Gradient-like Eigen Decomposition in Solving Schr\\\"odinger Equations},\nauthor={Xihan Li and Xiang Chen and Rasul Tutunov and Haitham Bou Ammar and Lei Wang and Jun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=pzgENfIRBil}\n}", "github": "", "project": "", "reviewers": "pQwL;9vsL;sMJ3;2D5z", "site": "https://openreview.net/forum?id=pzgENfIRBil", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "2;3;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "49;32;124;106", "wc_summary_review": "2;24;43;13", "wc_main_review": "67;163;329;156", "wc_review": "118;219;496;275", "wc_reply_reviewers": "0;15;0;0", "wc_reply_authors": "524;460;723;267", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 77.75, 38.26470305647229 ], "wc_summary_review_avg": [ 20.5, 15.14100392972672 ], "wc_main_review_avg": [ 178.75, 94.64241913645276 ], "wc_review_avg": [ 277.0, 138.39255760336246 ], "wc_reply_reviewers_avg": [ 3.75, 6.49519052838329 ], "wc_reply_authors_avg": [ 493.5, 162.80740155164938 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KUuLU5HYnRYJ:scholar.google.com/&scioq=Self-consistent+Gradient-like+Eigen+Decomposition+in+Solving+Schr%C3%B6dinger+Equations&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University College London;Huawei", "aff_unique_dep": ";Huawei 
Technologies", "aff_unique_url": "https://www.ucl.ac.uk;https://www.huawei.com", "aff_unique_abbr": "UCL;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;China" }, { "id": "q1QmAqT_4Zh", "title": "Koopman Q-learning: Offline Reinforcement Learning via Symmetries of Dynamics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Offline reinforcement learning leverages large datasets to train policies without interactions with the environment. The learned policies may then be deployed in real-world settings where interactions are costly or dangerous. Current algorithms over-fit to the training dataset and as a consequence perform poorly when deployed to out-of-distribution generalizations of the environment. We aim to address these limitations by learning a Koopman latent representation which allows us to infer symmetries of the system's underlying dynamic. The latter is then utilized to extend the otherwise static offline dataset during training; this constitutes a novel data augmentation framework which reflects the system's dynamic and is thus to be interpreted as an exploration of the environments phase space. To obtain the symmetries we employ Koopman theory in which nonlinear dynamics are represented in terms of a linear operator acting on the space of measurement functions of the system and thus symmetries of the dynamics may be inferred directly. We provide novel theoretical results on the existence and nature of symmetries relevant for control systems such as reinforcement learning settings. Moreover, we empirically evaluate our method on several benchmark offline reinforcement learning tasks and datasets including D4RL, Metaworld and Robosuite and find that by using our framework we consistently improve the state-of-the-art for Q-learning methods.", "keywords": "offline reinforcement learning;representation learning;Koopman theory;symmetries of representations;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Matthias Weissenbacher;Samarth Sinha;Animesh Garg;Yoshinobu Kawahara", "authorids": "~Matthias_Weissenbacher1;~Samarth_Sinha1;~Animesh_Garg1;~Yoshinobu_Kawahara1", "gender": "M;M;M;M", "homepage": ";https://samsinha.me;http://animesh.garg.tech;https://mls.ist.osaka-u.ac.jp/en/~kawahara/", "dblp": ";;123/5728;09/4700", "google_scholar": "xwo5JWgAAAAJ;https://scholar.google.ca/citations?user=lnCKs0AAAAAJ;zp8V7ZMAAAAJ;B8sRETUAAAAJ", "orcid": ";;0000-0003-0482-4296;0000-0001-7789-4709", "linkedin": "matthias-weissenbacher-8bb850107/;;animeshgarg/;", "or_profile": "~Matthias_Weissenbacher1;~Samarth_Sinha1;~Animesh_Garg1;~Yoshinobu_Kawahara1", "aff": ";University of Toronto;University of Toronto;Kyushu University", "aff_domain": ";cs.toronto;toronto.edu;imi.kyushu-u.ac.jp", "position": ";PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nweissenbacher2022koopman,\ntitle={Koopman Q-learning: Offline Reinforcement Learning via Symmetries of Dynamics},\nauthor={Matthias Weissenbacher and Samarth Sinha and Animesh Garg and Yoshinobu Kawahara},\nyear={2022},\nurl={https://openreview.net/forum?id=q1QmAqT_4Zh}\n}", "github": "", "project": "", "reviewers": "ohJ3;SsNL;VgB7;HCMe", "site": "https://openreview.net/forum?id=q1QmAqT_4Zh", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "47;60;84;95", 
"wc_summary_review": "32;23;12;72", "wc_main_review": "866;360;134;518", "wc_review": "945;443;230;685", "wc_reply_reviewers": "211;19;0;0", "wc_reply_authors": "1224;596;388;810", "reply_reviewers": "1;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.5, 18.980252896102307 ], "wc_summary_review_avg": [ 34.75, 22.64260364887395 ], "wc_main_review_avg": [ 469.5, 266.51219484293773 ], "wc_review_avg": [ 575.75, 267.13608423423443 ], "wc_reply_reviewers_avg": [ 57.5, 88.96207056942863 ], "wc_reply_authors_avg": [ 754.5, 309.41679010680724 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2153236057017473258&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Toronto;Kyushu University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://www.kyushu-u.ac.jp", "aff_unique_abbr": "U of T;Kyushu U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;Japan" }, { "id": "q23I9kJE3gA", "title": "Conditional set generation using Seq2seq models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conditional set generation learns a mapping from an input sequence of tokens to a set. Several popular natural language processing (NLP) tasks, such as entity typing and dialogue emotion tagging, are instances of set generation. Sequence-to-sequence models are a popular choice to model set generation but this typical approach of treating a set as a sequence does not fully leverage its key properties, namely order-invariance and cardinality. We propose a novel data augmentation approach that recovers informative orders for labels using their dependence information. Further, we jointly model the set cardinality and output by listing the set size as the first element and taking advantage of the autoregressive factorization used by seq2seq models. Our experiments in simulated settings and on three diverse NLP datasets show that our method improves over strong seq2seq baselines by about 9% on absolute F1 score. 
We will release all code and data upon acceptance.", "keywords": "natural language processing;nlp;language generation", "primary_area": "", "supplementary_material": "/attachment/8947606df2cf2110336bb268eb098e5aea454490.zip", "author": "Aman Madaan;Dheeraj Rajagopal;Antoine Bosselut;Yiming Yang", "authorids": "~Aman_Madaan1;~Dheeraj_Rajagopal1;~Antoine_Bosselut1;~Yiming_Yang1", "gender": ";M;M;F", "homepage": "https://madaan.github.io;https://dheerajrajagopal.github.io;https://atcbosselut.github.io/;http://www.cs.cmu.edu/~yiming/", "dblp": "138/1043;127/0193;184/3742;25/1666", "google_scholar": "jW9ts2cAAAAJ;NsJZccUAAAAJ;XD9hkJwAAAAJ;MlZq4XwAAAAJ", "orcid": ";;;0000-0001-8322-607X", "linkedin": "amnmadaan/;;;yiming-yang-24100924/", "or_profile": "~Aman_Madaan1;~Dheeraj_Rajagopal1;~Antoine_Bosselut1;~Yiming_Yang1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Swiss Federal Institute of Technology Lausanne;School of Computer Science, Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;epfl.ch;cs.cmu.edu", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nmadaan2022conditional,\ntitle={Conditional set generation using Seq2seq models},\nauthor={Aman Madaan and Dheeraj Rajagopal and Antoine Bosselut and Yiming Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=q23I9kJE3gA}\n}", "github": "", "project": "", "reviewers": "MDhc;NrsZ;ue1r;vizW", "site": "https://openreview.net/forum?id=q23I9kJE3gA", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "56;69;57;114", "wc_summary_review": "23;55;40;18", "wc_main_review": "373;398;180;163", "wc_review": "452;522;277;295", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "652;341;453;368", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 74.0, 23.65375234502974 ], "wc_summary_review_avg": [ 34.0, 14.611639196202457 ], "wc_main_review_avg": [ 278.5, 107.53255321064408 ], "wc_review_avg": [ 386.5, 103.69787847395915 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 453.5, 121.82877328447496 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14573663823344679815&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Carnegie Mellon University;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.epfl.ch", "aff_unique_abbr": "CMU;EPFL", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Lausanne;Pittsburgh", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Switzerland" }, { "id": "q2DCMRTvdZ-", "title": "Picking up the pieces: separately evaluating supernet training and architecture selection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Differentiable Neural Architecture Search (NAS) has emerged as a simple and efficient method for the automated design of neural 
networks. Recent research has demonstrated improvements on various aspects of the original algorithm (DARTS), but comparative evaluation of these advances remains costly and difficult. We frame supernet NAS as a two-stage search, decoupling the training of the supernet from the extraction of a final design from the supernet. We propose a set of metrics which utilize benchmark data sets to evaluate each stage of the search process independently. We demonstrate two metrics measuring separately the quality of the supernet's shared weights and the quality of the learned sampling distribution, as well as corresponding statistics approximating the reliance of the second-stage search on these components of the supernet. These metrics facilitate more robust evaluation of NAS algorithms and provide a practical method for designing complete NAS algorithms from separate supernet training and architecture selection techniques.", "keywords": "neural architecture search;automl;deep learning theory", "primary_area": "", "supplementary_material": "", "author": "Gabriel Meyer-Lee;Nick Cheney", "authorids": "~Gabriel_Meyer-Lee1;ncheney@uvm.edu", "gender": ";", "homepage": "http://www.uvm.edu/neurobotics/;", "dblp": "228/4648;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Gabriel_Meyer-Lee1;ncheney@uvm.edu", "aff": "University of Vermont;", "aff_domain": "uvm.edu;", "position": "PhD student;", "bibtex": "@misc{\nmeyer-lee2022picking,\ntitle={Picking up the pieces: separately evaluating supernet training and architecture selection},\nauthor={Gabriel Meyer-Lee and Nick Cheney},\nyear={2022},\nurl={https://openreview.net/forum?id=q2DCMRTvdZ-}\n}", "github": "", "project": "", "reviewers": "dryA;XbXs;eM7q;JdoT;SuQ3", "site": "https://openreview.net/forum?id=q2DCMRTvdZ-", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "4;5;5;4;4", "correctness": "3;4;3;3;4", "technical_novelty": "2;2;2;1;3", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "45;79;30;51;90", "wc_summary_review": "2;45;63;19;54", "wc_main_review": "394;560;136;218;524", "wc_review": "441;684;229;288;668", "wc_reply_reviewers": "0;0;0;0;96", "wc_reply_authors": "192;105;73;152;182", "reply_reviewers": "0;0;0;0;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 59.0, 22.190087877248256 ], "wc_summary_review_avg": [ 36.6, 22.703303724348135 ], "wc_main_review_avg": [ 366.4, 166.24512022913635 ], "wc_review_avg": [ 462.0, 188.00319146227278 ], "wc_reply_reviewers_avg": [ 19.2, 38.4 ], "wc_reply_authors_avg": [ 140.8, 45.43742950475962 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.6123724356957946, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:85EKDNPY7dAJ:scholar.google.com/&scioq=Picking+up+the+pieces:+separately+evaluating+supernet+training+and+architecture+selection&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Vermont", "aff_unique_dep": "", "aff_unique_url": "https://www.uvm.edu", "aff_unique_abbr": "UVM", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": 
"q2ZaVU6bEsT", "title": "CONTEXT AUGMENTATION AND FEATURE REFINEMENT NETWORK FOR TINY OBJECT DETECTION", "track": "main", "status": "Reject", "tldr": "", "abstract": "Tiny objects are hard to detect due to their low resolution and small size. The poor detection performance of tiny objects is mainly caused by the limitation of network and the imbalance of training dataset. A new feature pyramid network is proposed to combine context augmentation and feature refinement. The features from multi-scale dilated convolution are fused and injected into feature pyramid network from top to bottom to supplement context information. The channel and spatial feature refinement mechanism is introduced to suppress the conflicting formation in multi-scale feature fusion and prevent tiny objects from being submerged in the conflict information. In addition, a data enhancement method called copy-reduce-paste is proposed, which can increase the contribution of tiny objects to loss during training, ensuring a more balanced training. Experimental results show that the mean average precision of target targets on the VOC dataset of the proposed network reaches 16.9% (IOU=0.5:0.95), which is 3.9% higher than YOLOV4, 7.7% higher than CenterNet, and 5.3% higher than RefineDet.", "keywords": "Tiny Object Detection;context augmentation;feature refinement;Data Enhancement", "primary_area": "", "supplementary_material": "", "author": "Jinsheng Xiao;Tao Zhao;Yuntao Yao;Qiuze Yu;Yunhua Chen", "authorids": "~Jinsheng_Xiao1;~Tao_Zhao4;dolphin_tao@whu.edu.cn;yuhenry007@whu.edu.cn;yhchen@gdut.edu.cn", "gender": "M;M;;;", "homepage": "http://jszy.whu.edu.cn/xiaojinsheng/en/index.htm;;;;", "dblp": "28/6940;;;;", "google_scholar": ";;;;", "orcid": "0000-0002-5403-1895;;;;", "linkedin": ";https://www.linkedin.com/feed/?trk=onboarding-landing;;;", "or_profile": "~Jinsheng_Xiao1;~Tao_Zhao4;dolphin_tao@whu.edu.cn;yuhenry007@whu.edu.cn;yhchen@gdut.edu.cn", "aff": "Wuhan University;Wuhan University;;;", "aff_domain": "whu.edu.cn;whu.edu;;;", "position": "Associate Professor;MS student;;;", "bibtex": "@misc{\nxiao2022context,\ntitle={{CONTEXT} {AUGMENTATION} {AND} {FEATURE} {REFINEMENT} {NETWORK} {FOR} {TINY} {OBJECT} {DETECTION}},\nauthor={Jinsheng Xiao and Tao Zhao and Yuntao Yao and Qiuze Yu and Yunhua Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=q2ZaVU6bEsT}\n}", "github": "", "project": "", "reviewers": "VtbS;bK8X;VkyR", "site": "https://openreview.net/forum?id=q2ZaVU6bEsT", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;5", "correctness": "3;2;1", "technical_novelty": "3;1;1", "empirical_novelty": "2;2;1", "wc_summary_paper": "36;45;64", "wc_summary_review": "36;16;11", "wc_main_review": "157;476;221", "wc_review": "229;537;296", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 1.6666666666666667, 0.9428090415820634 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 48.333333333333336, 11.67142760000773 ], "wc_summary_review_avg": [ 21.0, 10.801234497346433 ], "wc_main_review_avg": [ 284.6666666666667, 137.79292031482927 ], "wc_review_avg": [ 354.0, 132.25984525420657 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 
], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14247091562107689185&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn/", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Declarative nets that are equilibrium models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7155", "id": "q4HaTeMO--y", "poster": "", "openreview": "https://openreview.net/forum?id=q4HaTeMO--y", "slides": "https://iclr.cc/virtual/2022/poster/7155", "video": "https://iclr.cc/virtual/2022/poster/7155", "author_site": "Russell Tsuchida, Suk Yee Yong, Mohammad Ali Armin, Lars Petersson, Cheng Soon Ong", "tldr": "", "abstract": "Implicit layers are computational modules that output the solution to some problem depending on the input and the layer parameters. Deep equilibrium models (DEQs) output a solution to a fixed point equation. Deep declarative networks (DDNs) solve an optimisation problem in their forward pass, an arguably more intuitive, interpretable problem than finding a fixed point. We show that solving a kernelised regularised maximum likelihood estimate as an inner problem in a DDN yields a large class of DEQ architectures. Our proof uses the exponential family in canonical form, and provides a closed-form expression for the DEQ parameters in terms of the kernel. The activation functions have interpretations in terms of the derivative of the log partition function. Building on existing literature, we interpret DEQs as fine-tuned, unrolled classical algorithms, giving an intuitive justification for why DEQ models are sensible. We use our theoretical result to devise an initialisation scheme for DEQs that allows them to solve kGLMs in their forward pass at initialisation. 
We empirically show that this initialisation scheme improves training stability and performance over random initialisation.", "keywords": "deep equilibrium models;deep declarative networks;implicit layers;kernel methods;generalised linear models", "primary_area": "", "supplementary_material": "/attachment/72066ec7a2392d6d58e86a17668039ad1f5c0639.zip", "author": "Russell Tsuchida;Suk Yee Yong;Mohammad Ali Armin;Lars Petersson;Cheng Soon Ong", "authorids": "~Russell_Tsuchida1;~Suk_Yee_Yong1;~Mohammad_Ali_Armin1;~Lars_Petersson2;~Cheng_Soon_Ong1", "gender": "M;;M;M;M", "homepage": ";https://yongsukyee.github.io/;;;https://ong-home.my", "dblp": "210/2569;;;18/5485;58/2283", "google_scholar": "pQ7EkegAAAAJ;TjkXwcUAAAAJ;qYYGjaAAAAAJ;32RHN4oAAAAJ;ofMZr0IAAAAJ", "orcid": ";0000-0002-5204-2902;;;0000-0002-2302-9733", "linkedin": ";;;lars-petersson-9013541/;cheng-soon-ong-38bbb524/", "or_profile": "~Russell_Tsuchida1;~Suk_Yee_Yong1;~Mohammad_Ali_Armin1;~Lars_Petersson2;~Cheng_Soon_Ong1", "aff": "CSIRO;CSIRO;Data61, CSIRO;CSIRO;CSIRO", "aff_domain": "csiro.au;csiro.au;data61.csiro.au;csiro.au;csiro.au", "position": "Postdoc;Postdoc;Research Scientist;Principal Research Scientist;Principal Researcher", "bibtex": "@inproceedings{\ntsuchida2022declarative,\ntitle={Declarative nets that are equilibrium models},\nauthor={Russell Tsuchida and Suk Yee Yong and Mohammad Ali Armin and Lars Petersson and Cheng Soon Ong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=q4HaTeMO--y}\n}", "github": "", "project": "", "reviewers": "JoBL;4xxf;zqw8;Ldsv", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "2;4;3;3", "correctness": "4;3;4;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "58;104;124;90", "wc_summary_review": "37;163;29;47", "wc_main_review": "254;450;337;316", "wc_review": "349;717;490;453", "wc_reply_reviewers": "0;164;0;47", "wc_reply_authors": "344;1151;1166;569", "reply_reviewers": "0;1;0;1", "reply_authors": "1;3;2;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 94.0, 24.041630560342615 ], "wc_summary_review_avg": [ 69.0, 54.644304369257 ], "wc_main_review_avg": [ 339.25, 70.84975299886374 ], "wc_review_avg": [ 502.25, 134.330515892704 ], "wc_reply_reviewers_avg": [ 52.75, 67.03497221600081 ], "wc_reply_authors_avg": [ 807.5, 359.9406201028164 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9646591638440181450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=q4HaTeMO--y", "email": "csiro.au;csiro.au;data61.csiro.au;csiro.au;csiro.au", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Commonwealth Scientific and Industrial Research Organisation;CSIRO", "aff_unique_dep": ";Data61", "aff_unique_url": "https://www.csiro.au;https://www.csiro.au", "aff_unique_abbr": "CSIRO;CSIRO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Australia" }, { "id": "q4pQkTlImdk", "title": 
"Not All Attention Is All You Need", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Dropout has shown an effective mediation to alleviate the over-fitting of neural models by forcedly blocking less helpful connections. However, common dropout has to be done crudely over all neural structures with the same dropout pattern once for all to dodge huge search space of tuning every individual structures. Thus in terms of meta-learning, we propose $AttendOut$ which is capable of performing smart unit-specific dropout for attention models. The proposed smart dropout is nearly parameter-free and makes it possible to achieve even stronger performances with a faster tuning circle even though we evaluate our proposed method on state-of-the-art pre-trained language models. Eventually, we verify the universality of our approach on extensive downstream tasks in both pre-training and fine-tuning stages.", "keywords": "dropout;meta-learning;pre-trained language model;self-attention", "primary_area": "", "supplementary_material": "/attachment/1ef149f23ab8648fb6eeb835a02fa40bd3248c61.zip", "author": "Hongqiu Wu;hai zhao;Min Zhang", "authorids": "~Hongqiu_Wu1;~hai_zhao1;~Min_Zhang9", "gender": "M;M;M", "homepage": "https://gingasan.github.io;http://bcmi.sjtu.edu.cn/~zhaohai/;https://zhangmin-nlp-ai.github.io/", "dblp": ";25/1145-1.html;83/5342-5", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=4dU5KS0AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hongqiu_Wu1;~hai_zhao1;~Min_Zhang9", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Suzhou University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;suda.edu.cn", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@misc{\nwu2022not,\ntitle={Not All Attention Is All You Need},\nauthor={Hongqiu Wu and hai zhao and Min Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=q4pQkTlImdk}\n}", "github": "", "project": "", "reviewers": "Vene;wtAZ;eXme;Ccb6", "site": "https://openreview.net/forum?id=q4pQkTlImdk", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "49;40;50;63", "wc_summary_review": "28;43;41;74", "wc_main_review": "416;327;162;249", "wc_review": "493;410;253;386", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "510;823;511;279", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 50.5, 8.200609733428363 ], "wc_summary_review_avg": [ 46.5, 16.889345754054535 ], "wc_main_review_avg": [ 288.5, 93.94280174659472 ], "wc_review_avg": [ 385.5, 86.18729604761945 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 530.75, 193.3964516220502 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16803844860410091669&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Suzhou University", "aff_unique_dep": 
";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.suda.edu.cn", "aff_unique_abbr": "SJTU;Suda", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "It Takes Four to Tango: Multiagent Self Play for Automatic Curriculum Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6991", "id": "q4tZR1Y-UIs", "poster": "", "openreview": "https://openreview.net/forum?id=q4tZR1Y-UIs", "slides": "https://iclr.cc/virtual/2022/poster/6991", "video": "https://iclr.cc/virtual/2022/poster/6991", "author_site": "Yuqing Du, Pieter Abbeel, Aditya Grover", "tldr": "", "abstract": "We are interested in training general-purpose reinforcement learning agents that can solve a wide variety of goals. Training such agents efficiently requires automatic generation of a goal curriculum. This is challenging as it requires (a) exploring goals of increasing difficulty, while ensuring that the agent (b) is exposed to a diverse set of goals in a sample efficient manner and (c) does not catastrophically forget previously solved goals. We propose Curriculum Self Play (CuSP), an automated goal generation framework that seeks to satisfy these desiderata by virtue of a multi-player game with 4 agents. We extend the asymmetric curricula learning in PAIRED (Dennis et al., 2020) to a symmetrized game that carefully balances cooperation and competition between two off-policy student learners and two regret-maximizing teachers. CuSP additionally introduces entropic goal coverage and accounts for the non-stationary nature of the students, allowing us to automatically induce a curriculum that balances progressive exploration with anti-catastrophic exploitation. We demonstrate that our method succeeds at generating an effective curricula of goals for a range of control tasks, outperforming other methods at zero-shot test-time generalization to novel out-of-distribution goals.", "keywords": "curriculum generation;unsupervised reinforcement learning;goal conditioned reinforcement learning;multi agent", "primary_area": "", "supplementary_material": "/attachment/047ac30a146793649501c5ef3f9aac75db853146.zip", "author": "Yuqing Du;Pieter Abbeel;Aditya Grover", "authorids": "~Yuqing_Du1;~Pieter_Abbeel2;~Aditya_Grover1", "gender": ";M;M", "homepage": "http://yuqingd.github.io;https://people.eecs.berkeley.edu/~pabbeel/;https://aditya-grover.github.io", "dblp": "218/5572;;162/5052", "google_scholar": ";https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;oOhnPUgAAAAJ", "orcid": ";;", "linkedin": "yuqingdu;;", "or_profile": "~Yuqing_Du1;~Pieter_Abbeel2;~Aditya_Grover1", "aff": "University of California, Berkeley;Covariant;University of California, Los Angeles", "aff_domain": "berkeley.edu;covariant.ai;ucla.edu", "position": "PhD student;Founder;Assistant Professor", "bibtex": "@inproceedings{\ndu2022it,\ntitle={It Takes Four to Tango: Multiagent Self Play for Automatic Curriculum Generation},\nauthor={Yuqing Du and Pieter Abbeel and Aditya Grover},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=q4tZR1Y-UIs}\n}", "github": "", "project": "", "reviewers": "qGDC;JLa8;GBqb;A4MW", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;3;4", "correctness": "3;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "60;70;90;51", "wc_summary_review": "33;111;53;54", "wc_main_review": "265;237;183;524", 
"wc_review": "358;418;326;629", "wc_reply_reviewers": "43;0;0;333", "wc_reply_authors": "1149;706;699;1169", "reply_reviewers": "1;0;0;4", "reply_authors": "4;3;3;5", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.75, 14.49784466739798 ], "wc_summary_review_avg": [ 62.75, 29.089302157322372 ], "wc_main_review_avg": [ 302.25, 131.37612987144962 ], "wc_review_avg": [ 432.75, 118.0198606167623 ], "wc_reply_reviewers_avg": [ 94.0, 139.0988856892822 ], "wc_reply_authors_avg": [ 930.75, 228.37291323622424 ], "reply_reviewers_avg": [ 1.25, 1.6393596310755 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": -0.2294157338705618, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12921508805700086972&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=q4tZR1Y-UIs", "email": "berkeley.edu;covariant.ai;ucla.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Berkeley;Covariant;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;;https://www.ucla.edu", "aff_unique_abbr": "UC Berkeley;;UCLA", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Berkeley;;Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States;" }, { "id": "q58E59ZPLp", "title": "Conceptron: a probabilistic deep one-class classification method", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "One-class learning through deep architectures is a particularly challenging task; in this scenario the crasis of kernel methods and deep networks can represent a viable strategy to empower already effective methods. In this contribution we present Conceptron, a probabilistic and deep one-class classification method. The proposed algorithm is a hybridization of the Nystrom version of the Import Vector Domain Description (IVDD) to deep learning layers rendering the approach highly scalable (via batch stochastic gradient optimization) and automatically learning the underlying feature space. Further we modify the cost function to allow to get a Laplace distribution of the samples probabilities. 
Experiments on MNIST, CIFAR-10 and other benchmark datasets show that Conceptron (and/or its variations) performs comparably to or better than competing state-of-the-art methods, with the additional capability of providing probabilities (through a logistic model) and avoiding any degeneracy in the training process.", "keywords": "unsupervised learning;one-class learning;anomaly detection;nystrom method;kernel methods", "primary_area": "", "supplementary_material": "", "author": "Erika Gardini;Andrea Cavalli;Sergio Decherchi", "authorids": "~Erika_Gardini1;andrea.cavalli@iit.it;~Sergio_Decherchi1", "gender": "F;;M", "homepage": "https://www.unibo.it/sitoweb/erika.gardini4/didattica;;", "dblp": ";;84/2830", "google_scholar": ";;https://scholar.google.it/citations?user=T09qQ1IAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Erika_Gardini1;andrea.cavalli@iit.it;~Sergio_Decherchi1", "aff": "University of Bologna;;Istituto Italiano di Tecnologia", "aff_domain": "unibo.it;;iit.it", "position": "PhD student;;Researcher", "bibtex": "@misc{\ngardini2022conceptron,\ntitle={Conceptron: a probabilistic deep one-class classification method},\nauthor={Erika Gardini and Andrea Cavalli and Sergio Decherchi},\nyear={2022},\nurl={https://openreview.net/forum?id=q58E59ZPLp}\n}", "github": "", "project": "", "reviewers": "MnYt;5HWW;YtcK;YxAS;p2id", "site": "https://openreview.net/forum?id=q58E59ZPLp", "pdf_size": 0, "recommendation": "3;3;3;3;5", "confidence": "5;4;4;4;4", "correctness": "4;2;2;2;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "1;1;2;1;2", "wc_summary_paper": "70;136;74;171;98", "wc_summary_review": "33;67;31;66;32", "wc_main_review": "297;1110;275;380;262", "wc_review": "400;1313;380;617;392", "wc_reply_reviewers": "0;1297;0;0;0", "wc_reply_authors": "315;1071;564;739;320", "reply_reviewers": "0;2;0;0;0", "reply_authors": "1;2;1;1;1", "recommendation_avg": [ 3.4, 0.8 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.6, 0.8 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 109.8, 38.566306538220644 ], "wc_summary_review_avg": [ 45.8, 16.9162643630324 ], "wc_main_review_avg": [ 464.8, 325.2023370149729 ], "wc_review_avg": [ 620.4, 357.2789386459829 ], "wc_reply_reviewers_avg": [ 259.4, 518.7999999999998 ], "wc_reply_authors_avg": [ 601.8, 283.57249514013165 ], "reply_reviewers_avg": [ 0.4, 0.8000000000000002 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.25, "corr_recommendation_correctness": 0.25000000000000006, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Hh6NL5jwGTEJ:scholar.google.com/&scioq=Conceptron:+a+probabilistic+deep+one-class+classification+method&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Bologna;Istituto Italiano di Tecnologia", "aff_unique_dep": ";", "aff_unique_url": "https://www.unibo.it;https://www.iit.it", "aff_unique_abbr": "Unibo;IIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "id": "q5ru7alcpfM", "title": "Unsupervised Domain Adaptation By Optimal Transportation Of Clusters Between Domains", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Unsupervised domain adaptation (UDA) aims to transfer the knowledge from a labeled source domain to an unlabeled target domain. 
Typically, to guarantee desirable knowledge transfer, aligning the distribution between the source and target domains from a global perspective is widely adopted in UDA. Recent research further points out the importance of local-level alignment and borrows from Optimal Transport (OT) theory to construct instance-pair alignment.\n However, existing OT-based algorithms are limited in resolving the class-imbalance challenge and incur a huge computation cost in large-scale training situations. \nIn this paper, we address these two issues by proposing a Clustering-based Optimal Transport (COT) algorithm, which formulates the alignment procedure as an Optimal Transport problem by capturing the fine-grained attribute alignment. Concretely, COT designs a loss derived from the discrete Kantorovich dual form to construct a mapping between clustering centers in the source and target domains, which simultaneously eliminates the negative effect brought by class imbalance and reduces the computation cost, on the basis of theoretical analysis. Finally, COT, combined with some previous UDA methods, achieves superior performance on several benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Liu;Zhipeng Zhou;Lei Shang;Baigui Sun;Hao Li;Rong Jin", "authorids": "~Yang_Liu51;~Zhipeng_Zhou4;~Lei_Shang1;~Baigui_Sun1;~Hao_Li16;~Rong_Jin1", "gender": "M;M;;M;M;M", "homepage": ";;;;;https://www.cse.msu.edu/~rongjin/", "dblp": "27/3367-5;;;186/8016;17/5705-30;j/RongJin", "google_scholar": "t1emSE0AAAAJ;Ot0PPAcAAAAJ;WO1eMcIAAAAJ;ZNhTHywAAAAJ;pHN-QIwAAAAJ;", "orcid": ";;;0000-0001-7722-4748;;", "linkedin": ";;;;%E6%98%8A-%E6%9D%8E-392547a5/detail/recent-activity/;", "or_profile": "~Yang_Liu51;~Zhipeng_Zhou4;~Lei_Shang1;~Baigui_Sun1;~Li_Hao1;~Rong_Jin3", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "Researcher at Alibaba Group;Researcher;Researcher;Researcher;Researcher;Researcher", "bibtex": "@misc{\nliu2022unsupervised,\ntitle={Unsupervised Domain Adaptation By Optimal Transportation Of Clusters Between Domains},\nauthor={Yang Liu and Zhipeng Zhou and Lei Shang and Baigui Sun and Hao Li and Rong Jin},\nyear={2022},\nurl={https://openreview.net/forum?id=q5ru7alcpfM}\n}", "github": "", "project": "", "reviewers": "DWgV;dRKj;efDP", "site": "https://openreview.net/forum?id=q5ru7alcpfM", "pdf_size": 0, "recommendation": "1;3;3", "confidence": "5;3;3", "correctness": "4;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "1;2;2", "wc_summary_paper": "37;73;51", "wc_summary_review": "11;37;71", "wc_main_review": "88;180;291", "wc_review": "136;290;413", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 53.666666666666664, 14.817407180595247 ], "wc_summary_review_avg": [ 39.666666666666664, 24.567367696917707 ], "wc_main_review_avg": [ 186.33333333333334, 82.99531445944538 ], "wc_review_avg": [ 279.6666666666667, 113.32058751857738 ], "wc_reply_reviewers_avg": [ 0, 0 ], 
"wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": -0.8660254037844387, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17381411919376270140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning to Complete Code with Sketches", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6551", "id": "q79uMSC6ZBT", "poster": "", "openreview": "https://openreview.net/forum?id=q79uMSC6ZBT", "slides": "https://iclr.cc/virtual/2022/poster/6551", "video": "https://iclr.cc/virtual/2022/poster/6551", "author_site": "Daya Guo, Alexey Svyatkovskiy, Jian Yin, Nan Duan, Marc Brockschmidt, Miltiadis Allamanis", "tldr": "", "abstract": "Code completion is usually cast as a language modelling problem, i.e., continuing an input in a left-to-right fashion. However, in practice, some parts of the completion (e.g., string literals) may be very hard to predict, whereas subsequent parts directly follow from the context.\nTo handle this, we instead consider the scenario of generating code completions with \"holes\" inserted in places where a model is uncertain. We develop Grammformer, a Transformer-based model that guides the code generation by the programming language grammar, and compare it to a variety of more standard sequence models.\n\nWe train the models on code completion for C# and Python given partial code context. 
To evaluate models, we consider both ROUGE and a new metric, RegexAcc, which measures the success of generating completions matching long outputs with as few holes as possible.\nIn our experiments, Grammformer generates 10-50% more accurate completions compared to traditional generative models and 37-50% longer sketches compared to sketch-generating baselines trained with similar techniques.", "keywords": "sketch;generative model;ml4code", "primary_area": "", "supplementary_material": "", "author": "Daya Guo;Alexey Svyatkovskiy;Jian Yin;Nan Duan;Marc Brockschmidt;Miltiadis Allamanis", "authorids": "~Daya_Guo3;~Alexey_Svyatkovskiy1;~Jian_Yin3;~Nan_Duan1;~Marc_Brockschmidt1;~Miltiadis_Allamanis1", "gender": ";M;M;M;M;", "homepage": ";https://www.microsoft.com/en-us/research/people/alsvyatk/;http://sai.sysu.edu.cn/teacher/teacher01/1385356.htm;https://nanduan.github.io/;;", "dblp": ";198/0454;95/578-1;;80/8292;", "google_scholar": ";0Oj4J4wAAAAJ;;Qaa6OxIAAAAJ;https://scholar.google.co.uk/citations?user=pF27eLMAAAAJ;", "orcid": ";0000-0001-7714-4481;;;;", "linkedin": ";;;;;", "or_profile": "~Daya_Guo3;~Alexey_Svyatkovskiy1;~Jian_Yin3;~Nan_Duan1;~Marc_Brockschmidt1;~Miltiadis_Allamanis1", "aff": ";Microsoft;SUN YAT-SEN UNIVERSITY;Microsoft Research Asia;Microsoft Research;", "aff_domain": ";microsoft.com;sysu.edu.cn;microsoft.com;microsoft.com;", "position": ";Principal Researcher;Full Professor;Principal Researcher;Researcher;", "bibtex": "@inproceedings{\nguo2022learning,\ntitle={Learning to Complete Code with Sketches},\nauthor={Daya Guo and Alexey Svyatkovskiy and Jian Yin and Nan Duan and Marc Brockschmidt and Miltiadis Allamanis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=q79uMSC6ZBT}\n}", "github": "", "project": "", "reviewers": "Arrw;suKF;Qkek;aRD4", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;5;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "52;42;94;57", "wc_summary_review": "26;91;75;53", "wc_main_review": "226;568;598;351", "wc_review": "304;701;767;461", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "277;421;1226;136", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 61.25, 19.66437133498043 ], "wc_summary_review_avg": [ 61.25, 24.416951079117148 ], "wc_main_review_avg": [ 435.75, 154.10446943550988 ], "wc_review_avg": [ 558.25, 185.77859806769993 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 515.0, 422.68250496087484 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4424270975921359577&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=q79uMSC6ZBT", "email": ";microsoft.com;sysu.edu.cn;microsoft.com;microsoft.com;", "author_num": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Microsoft;Sun Yat-sen University", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": 
"https://www.microsoft.com;http://www.sysu.edu.cn", "aff_unique_abbr": "Microsoft;SYSU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;China" }, { "title": "$\\beta$-Intact-VAE: Identifying and Estimating Causal Effects under Limited Overlap", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6819", "id": "q7n2RngwOM", "poster": "", "openreview": "https://openreview.net/forum?id=q7n2RngwOM", "slides": "https://iclr.cc/virtual/2022/poster/6819", "video": "https://iclr.cc/virtual/2022/poster/6819", "author_site": "Pengzhou Wu, Kenji Fukumizu", "tldr": "", "abstract": "As an important problem in causal inference, we discuss the identification and estimation of treatment effects (TEs) under limited overlap; that is, when subjects with certain features belong to a single treatment group. We use a latent variable to model a prognostic score which is widely used in biostatistics and sufficient for TEs; i.e., we build a generative prognostic model. We prove that the latent variable recovers a prognostic score, and the model identifies individualized treatment effects. The model is then learned as $\\beta$-Intact-VAE\u2013\u2013a new type of variational autoencoder (VAE). We derive the TE error bounds that enable representations balanced for treatment groups conditioned on individualized features. The proposed method is compared with recent methods using (semi-)synthetic datasets. ", "keywords": "VAE;variational autoencoder;balanced representation Learning;treatment effects;causal inference;identifiability;identification;CATE;ATE;weak overlap;limited overlap;Prognostic Model;Prognostic score", "primary_area": "", "supplementary_material": "/attachment/b62f5e57992d1cdabff0277e0af771194c7d3006.zip", "author": "Pengzhou Abel Wu;Kenji Fukumizu", "authorids": "~Pengzhou_Abel_Wu1;~Kenji_Fukumizu1", "gender": "M;M", "homepage": ";http://www.ism.ac.jp/~fukumizu/", "dblp": "256/1725;96/464", "google_scholar": "4IuyryIAAAAJ;", "orcid": ";0000-0002-3488-2625", "linkedin": ";", "or_profile": "~Pengzhou_Abel_Wu1;~Kenji_Fukumizu1", "aff": "The Institute of Statistical Mathematics;The Institute of Statistical Mathematics, Japan, Tokyo Institute of Technology", "aff_domain": "ism.ac.jp;ism.ac.jp", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nwu2022betaintactvae,\ntitle={\\${\\textbackslash}beta\\$-Intact-{VAE}: Identifying and Estimating Causal Effects under Limited Overlap},\nauthor={Pengzhou Abel Wu and Kenji Fukumizu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=q7n2RngwOM}\n}", "github": "", "project": "", "reviewers": "ENDL;ZUkW;sh3V;3gmW", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;3;3", "correctness": "4;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "140;95;107;64", "wc_summary_review": "29;53;116;35", "wc_main_review": "314;401;967;179", "wc_review": "483;549;1190;278", "wc_reply_reviewers": "78;510;146;0", "wc_reply_authors": "345;828;1206;201", "reply_reviewers": "1;1;1;0", "reply_authors": "3;2;5;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 101.5, 27.207535720825582 ], "wc_summary_review_avg": [ 58.25, 34.491846862700754 ], 
"wc_main_review_avg": [ 465.25, 300.290172166856 ], "wc_review_avg": [ 625.0, 341.1649161329459 ], "wc_reply_reviewers_avg": [ 183.5, 195.45523784232543 ], "wc_reply_authors_avg": [ 645.0, 398.5429713343343 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2544493232264415763&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=q7n2RngwOM", "email": "ism.ac.jp;ism.ac.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Institute of Statistical Mathematics", "aff_unique_dep": "", "aff_unique_url": "https://www.ism.ac.jp", "aff_unique_abbr": "ISM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "q9zIvzRaU94", "title": "Causal discovery from conditionally stationary time-series", "track": "main", "status": "Reject", "tldr": "", "abstract": "Causal discovery, i.e., inferring underlying cause-effect relationships from observations of a scene or system, is an inherent mechanism in human cognition, but has been shown to be highly challenging to automate. The majority of approaches in the literature aiming for this task consider constrained scenarios with fully observed variables or data from stationary time-series. \nIn this work we aim for causal discovery in a more general class of scenarios, scenes with non-stationary behavior over time. For our purposes we here regard a scene as a composition objects interacting with each other over time. Non-stationarity is modeled as stationarity conditioned on an underlying variable, a state, which can be of varying dimension, more or less hidden given observations of the scene, and also depend more or less directly on these observations.\nWe propose a probabilistic deep learning approach called State-Dependent Causal Inference (SDCI) for causal discovery in such conditionally stationary time-series data. 
Results in two different synthetic scenarios show that this method is able to recover the underlying causal dependencies with high accuracy even in cases with hidden states.", "keywords": "causal discovery;temporal inference;graph neural network;time series;non-stationary;probabilistic modelling", "primary_area": "", "supplementary_material": "/attachment/1815becd6b3721717bd63d00e0f7632c41dcfeb4.zip", "author": "Carles Balsells Rodas;Ruibo Tu;Hedvig Kjellstrom", "authorids": "~Carles_Balsells_Rodas1;~Ruibo_Tu1;~Hedvig_Kjellstrom1", "gender": "M;M;F", "homepage": ";https://www.kth.se/profile/ruibo/?l=en;https://www.kth.se/profile/hedvig", "dblp": ";223/4417;k/HedvigKjellstrom", "google_scholar": "ZHmqn_AAAAAJ;https://scholar.google.se/citations?user=auIx_r0AAAAJ;wr3CtKAAAAAJ", "orcid": ";;0000-0002-5750-9655", "linkedin": "carles-balsells-rodas-a07911150/;;hedvig-kjellstr%C3%B6m-aaa973/", "or_profile": "~Carles_Balsells_Rodas1;~Ruibo_Tu1;~Hedvig_Kjellstrom1", "aff": "Imperial College London, Imperial College London;KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology", "aff_domain": "imperial.ac.uk;kth.se;kth.se", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nrodas2022causal,\ntitle={Causal discovery from conditionally stationary time-series },\nauthor={Carles Balsells Rodas and Ruibo Tu and Hedvig Kjellstrom},\nyear={2022},\nurl={https://openreview.net/forum?id=q9zIvzRaU94}\n}", "github": "", "project": "", "reviewers": "ukPw;BArP;quAM;15XZ", "site": "https://openreview.net/forum?id=q9zIvzRaU94", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;4;3", "correctness": "2;3;3;2", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "33;67;90;55", "wc_summary_review": "23;76;22;44", "wc_main_review": "181;295;503;210", "wc_review": "237;438;615;309", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 61.25, 20.595812681222366 ], "wc_summary_review_avg": [ 41.25, 21.901769334919038 ], "wc_main_review_avg": [ 297.25, 125.96105548938529 ], "wc_review_avg": [ 399.75, 143.63038501654168 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=646150921720601296&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Imperial College London;KTH Royal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://www.kth.se", "aff_unique_abbr": "ICL;KTH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stockholm", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;Sweden" }, { "id": "qCBmozgVr9r", "title": "Few-Shot Attribute Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Semantic concepts are often defined by a combination of attributes. The use of attributes also facilitates learning of new concepts with zero or few examples. 
However, the zero-shot learning paradigm assumes that the set of attributes is known and fixed, which is a limitation if a test-time task depends on a previously irrelevant attribute. In this work we study rapid learning of attributes that were not previously labeled in the dataset. Compared to standard few-shot learning of semantic classes, learning new attributes poses a stiffer challenge. We found that directly supervising the model with a set of training attributes does not generalize well on the test attributes, whereas self-supervised pre-training brings significant improvement. We further experimented with random splits of the attribute space and found that the predictability of attributes provides an informative estimate of a model's ability to generalize.", "keywords": "Few-shot learning;transfer learning;attribute learning", "primary_area": "", "supplementary_material": "", "author": "Mengye Ren;Eleni Triantafillou;Kuan-Chieh Wang;James Lucas;Jake Snell;Xaq Pitkow;Andreas S. Tolias;Richard Zemel", "authorids": "~Mengye_Ren1;~Eleni_Triantafillou1;~Kuan-Chieh_Wang1;~James_Lucas1;~Jake_Snell1;~Xaq_Pitkow1;~Andreas_S._Tolias1;~Richard_Zemel1", "gender": ";F;;M;M;M;;M", "homepage": "http://www.cs.toronto.edu/~mren;http://www.cs.toronto.edu/~eleni/;https://wangkua1.github.io;http://www.cs.toronto.edu/~jlucas/;https://www.jakesnell.com;http://xaqlab.com;;http://www.cs.columbia.edu/~zemel", "dblp": "163/1952;183/8430;13/7562;24/2474;172/1406;116/2845;32/3057;16/6366", "google_scholar": "XcQ9WqMAAAAJ;Y5x2ZgQAAAAJ;https://scholar.google.ca/citations?user=LgMuT6IAAAAJ;https://scholar.google.ca/citations?user=AYaHBAQAAAAJ;MbXKAK8AAAAJ;;;https://scholar.google.ca/citations?user=iBeDoRAAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Mengye_Ren1;~Eleni_Triantafillou1;~Kuan-Chieh_Wang1;~James_Lucas1;~Jake_Snell1;~Xaq_Pitkow1;~Andreas_S._Tolias1;~Richard_Zemel1", "aff": "Google;Google;Stanford University;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;Baylor College of Medicine;Baylor College of Medicine;Department of Computer Science, University of Toronto", "aff_domain": "google.com;google.com;stanford.edu;cs.toronto.edu;cs.toronto.edu;bcm.edu;bcm.edu;cs.toronto.edu", "position": "Visiting Researcher;Researcher;Postdoc;PhD Candidate;Postdoc;;Professor;Full Professor", "bibtex": "@misc{\nren2022fewshot,\ntitle={Few-Shot Attribute Learning},\nauthor={Mengye Ren and Eleni Triantafillou and Kuan-Chieh Wang and James Lucas and Jake Snell and Xaq Pitkow and Andreas S.
Tolias and Richard Zemel},\nyear={2022},\nurl={https://openreview.net/forum?id=qCBmozgVr9r}\n}", "github": "", "project": "", "reviewers": "ym8e;tUmn;5A2w;7SAf", "site": "https://openreview.net/forum?id=qCBmozgVr9r", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;4;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "144;95;106;100", "wc_summary_review": "114;50;62;22", "wc_main_review": "923;546;368;385", "wc_review": "1181;691;536;507", "wc_reply_reviewers": "524;38;212;0", "wc_reply_authors": "2194;1017;497;986", "reply_reviewers": "1;1;1;0", "reply_authors": "4;2;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 111.25, 19.30511590226798 ], "wc_summary_review_avg": [ 62.0, 33.34666400106613 ], "wc_main_review_avg": [ 555.5, 223.25601895581673 ], "wc_review_avg": [ 728.75, 270.31497831233844 ], "wc_reply_reviewers_avg": [ 193.5, 206.8785875821855 ], "wc_reply_authors_avg": [ 1173.5, 624.2437424596261 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9733285267845754, "corr_recommendation_correctness": 0.6488856845230502, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;1;2;2;3;3;2", "aff_unique_norm": "Google;Stanford University;University of Toronto;Baylor College of Medicine", "aff_unique_dep": "Google;;Department of Computer Science;", "aff_unique_url": "https://www.google.com;https://www.stanford.edu;https://www.utoronto.ca;https://www.bcm.edu", "aff_unique_abbr": "Google;Stanford;U of T;BCM", "aff_campus_unique_index": "0;0;1;2;2;2", "aff_campus_unique": "Mountain View;Stanford;Toronto;", "aff_country_unique_index": "0;0;0;1;1;0;0;1", "aff_country_unique": "United States;Canada" }, { "id": "qDx6DXD3Fzt", "title": "Provably Robust Detection of Out-of-distribution Data (almost) for free", "track": "main", "status": "Reject", "tldr": "", "abstract": "The application of machine learning in safety-critical systems requires a reliable assessment of uncertainty. However, deep neural networks are known to produce highly overconfident predictions on out-of-distribution (OOD) data. Even if trained to be non-confident on OOD data, one can still adversarially manipulate OOD data so that the classifier again assigns high confidence to the manipulated samples. In this paper we propose a novel method that combines a certifiable OOD detector with a standard classifier from first principles into an OOD-aware classifier. This way we achieve the best of both worlds: certifiably adversarially robust OOD detection, even for OOD samples close to the in-distribution, without loss in either prediction accuracy or detection performance for non-manipulated OOD data.
Moreover, due to the particular construction our classifier provably avoids the asymptotic overconfidence problem of standard neural networks.", "keywords": "out-of-distribution detection;adversarial noise;provable robustness;guarantees", "primary_area": "", "supplementary_material": "/attachment/446082ed83fcdffbc56016577ec4758d56607832.zip", "author": "Alexander Meinke;Julian Bitterwolf;Matthias Hein", "authorids": "~Alexander_Meinke1;~Julian_Bitterwolf1;~Matthias_Hein2", "gender": "M;;M", "homepage": ";https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/maschinelles-lernen/team/julian-bitterwolf-msc/;https://uni-tuebingen.de/de/164260", "dblp": "249/5767;232/1887;97/1213-1", "google_scholar": "https://scholar.google.de/citations?user=PqHTP_AAAAAJ;;0ZAb3tsAAAAJ", "orcid": ";;", "linkedin": "alexander-meinke-a32904173/;;", "or_profile": "~Alexander_Meinke1;~Julian_Bitterwolf1;~Matthias_Hein2", "aff": "Max-Planck-Institute for Intelligent Systems, Max-Planck Institute;University of T\u00fcbingen;University of T\u00fcbingen", "aff_domain": "is.mpg.de;uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nmeinke2022provably,\ntitle={Provably Robust Detection of Out-of-distribution Data (almost) for free},\nauthor={Alexander Meinke and Julian Bitterwolf and Matthias Hein},\nyear={2022},\nurl={https://openreview.net/forum?id=qDx6DXD3Fzt}\n}", "github": "", "project": "", "reviewers": "guNu;urW9;fH7P;U32o;6Xmu", "site": "https://openreview.net/forum?id=qDx6DXD3Fzt", "pdf_size": 0, "recommendation": "3;5;6;6;8", "confidence": "4;4;4;4;5", "correctness": "2;3;4;2;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "2;2;4;2;3", "wc_summary_paper": "97;54;75;57;90", "wc_summary_review": "158;49;14;78;78", "wc_main_review": "568;279;261;440;923", "wc_review": "823;382;350;575;1091", "wc_reply_reviewers": "1080;0;0;285;134", "wc_reply_authors": "2961;962;806;1303;976", "reply_reviewers": "4;0;0;2;2", "reply_authors": "5;2;2;3;3", "recommendation_avg": [ 5.6, 1.624807680927192 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 74.6, 17.165080832900262 ], "wc_summary_review_avg": [ 75.4, 47.54618807012819 ], "wc_main_review_avg": [ 494.2, 242.05239102310063 ], "wc_review_avg": [ 644.2, 279.88954964414086 ], "wc_reply_reviewers_avg": [ 299.8, 404.0249497246427 ], "wc_reply_authors_avg": [ 1401.6, 796.3131544813259 ], "reply_reviewers_avg": [ 1.6, 1.4966629547095767 ], "reply_authors_avg": [ 3.0, 1.0954451150103321 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.7385489458759964, "corr_recommendation_correctness": 0.4276686017238498, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12894786482786004446&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Max-Planck-Institute for Intelligent Systems;University of T\u00fcbingen", "aff_unique_dep": "Intelligent Systems;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/", "aff_unique_abbr": "MPI-IS;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "qEGBB9YB31", "title": "Where do Models go Wrong? 
Parameter-Space Saliency Maps for Explainability", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conventional saliency maps highlight input features to which neural network predictions are highly sensitive. We take a different approach to saliency, in which we identify and analyze the network parameters, rather than inputs, which are responsible for erroneous decisions. We first verify that identified salient parameters are indeed responsible for misclassification by showing that turning these parameters off improves predictions on the associated samples, more than pruning the same number of random or least salient parameters. We further validate the link between salient parameters and network misclassification errors by observing that fine-tuning a small number of the most salient parameters on a single sample results in error correction on other samples which were misclassified for similar reasons -- nearest neighbors in the saliency space. After validating our parameter-space saliency maps, we demonstrate that samples which cause similar parameters to malfunction are semantically similar. Further, we introduce an input-space saliency counterpart which reveals how image features cause specific network components to malfunction.", "keywords": "explainability;interpretability;saliency maps;parameter saliency", "primary_area": "", "supplementary_material": "/attachment/bf16b3fa472f84d5a8035af5928d059999e1188b.zip", "author": "Roman Levin;Manli Shu;Eitan Borgnia;Furong Huang;Micah Goldblum;Tom Goldstein", "authorids": "~Roman_Levin1;~Manli_Shu1;~Eitan_Borgnia1;~Furong_Huang1;~Micah_Goldblum1;~Tom_Goldstein1", "gender": "M;F;M;F;;M", "homepage": ";https://azshue.github.io/;https://eitanborgnia.com;https://furong-huang.com;;https://www.cs.umd.edu/~tomg/", "dblp": "278/3201;263/3503;;72/8513;241/7231;25/8184", "google_scholar": "WWKiF4wAAAAJ;https://scholar.google.com/citations?hl=en;;13yyuCcAAAAJ;pGDKzuUAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;;", "linkedin": "rilevin;manli-shu-a804a8164/;;;;", "or_profile": "~Roman_Levin1;~Manli_Shu1;~Eitan_Borgnia1;~Furong_Huang1;~Micah_Goldblum1;~Tom_Goldstein1", "aff": "University of Washington, Seattle;Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park;University of Maryland;New York University;University of Maryland, College Park", "aff_domain": "uw.edu;cs.umd.edu;umd.edu;cs.umd.edu;nyu.edu;umd.edu", "position": "PhD student;PhD student;Researcher;Assistant Professor;Postdoc;Associate Professor", "bibtex": "@misc{\nlevin2022where,\ntitle={Where do Models go Wrong? 
Parameter-Space Saliency Maps for Explainability},\nauthor={Roman Levin and Manli Shu and Eitan Borgnia and Furong Huang and Micah Goldblum and Tom Goldstein},\nyear={2022},\nurl={https://openreview.net/forum?id=qEGBB9YB31}\n}", "github": "", "project": "", "reviewers": "sCeW;tkQp;3o7Z;qR4u", "site": "https://openreview.net/forum?id=qEGBB9YB31", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "5;4;4;2", "correctness": "4;2;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "39;45;63;32", "wc_summary_review": "8;84;43;41", "wc_main_review": "88;606;120;47", "wc_review": "135;735;226;120", "wc_reply_reviewers": "0;54;0;17", "wc_reply_authors": "714;2107;525;41", "reply_reviewers": "0;1;0;1", "reply_authors": "2;5;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 44.75, 11.497282287566918 ], "wc_summary_review_avg": [ 44.0, 26.953663943887108 ], "wc_main_review_avg": [ 215.25, 227.078593222699 ], "wc_review_avg": [ 304.0, 252.12199428054666 ], "wc_reply_reviewers_avg": [ 17.75, 22.049659861322123 ], "wc_reply_authors_avg": [ 846.75, 767.8881347566194 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=765476118688107912&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;2;3;2", "aff_unique_norm": "University of Washington;University of Maryland, College Park;University of Maryland;New York University", "aff_unique_dep": ";Department of Computer Science;;", "aff_unique_url": "https://www.washington.edu;https://www.umd.edu;https://www.umd.edu;https://www.nyu.edu", "aff_unique_abbr": "UW;UMD;UMD;NYU", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Seattle;College Park;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "qESp3gXBm2g", "title": "TRAKR \u2013 A reservoir-based tool for fast and accurate classification of neural time-series patterns", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neuroscience has seen a dramatic increase in the types of recording modalities and complexity of neural time-series data collected from them. The brain is a highly recurrent system producing rich, complex dynamics that result in different behaviors. Correctly distinguishing such nonlinear neural time series in real-time, especially those with non-obvious links to behavior, could be useful for a wide variety of applications. These include detecting anomalous clinical events such as seizures in epilepsy, and identifying optimal control spaces for brain machine interfaces. It remains challenging to correctly distinguish nonlinear time-series patterns because of the high intrinsic dimensionality of such data, making accurate inference of state changes (for intervention or control) difficult. Simple distance metrics, which can be computed quickly, do not yield accurate classifications.
On the other end of the spectrum of classification methods, ensembles of classifiers or deep supervised tools offer higher accuracy but are slow, data-intensive, and computationally expensive. We introduce a reservoir-based tool, state tracker (TRAKR), which offers the high accuracy of ensembles or deep supervised methods while preserving the computational benefits of simple distance metrics. After one-shot training, TRAKR can accurately, and in real time, detect deviations in test patterns. By forcing the weighted dynamics of the reservoir to fit a desired pattern directly, we avoid many rounds of expensive optimization. Then, keeping the output weights frozen, we use the error signal generated by the reservoir in response to a particular test pattern as a classification boundary. We show that, using this approach, TRAKR accurately detects changes in synthetic time series. We then compare our tool to several others, showing that it achieves the highest classification performance on a benchmark dataset\u2013sequential MNIST\u2013even when corrupted by noise. Additionally, we apply TRAKR to electrocorticography (ECoG) data from the macaque orbitofrontal cortex (OFC), a higher-order brain region involved in encoding the value of expected outcomes. We show that TRAKR can classify different behaviorally relevant epochs in the neural time series more accurately and efficiently than conventional approaches. Therefore, TRAKR can be used as a fast and accurate tool to distinguish patterns in complex nonlinear time-series data, such as neural recordings.", "keywords": "neuroscience;recurrent neural networks;reservoir networks;time series classification;neural data;electrophysiological recordings", "primary_area": "", "supplementary_material": "/attachment/49efdbc88e1fa3b24b6d387ac00c9908b75fd5d8.zip", "author": "Muhammad Furqan Afzal;Christian D Marton;Erin L. Rich;Kanaka Rajan", "authorids": "~Muhammad_Furqan_Afzal1;~Christian_D_Marton1;~Erin_L._Rich1;~Kanaka_Rajan1", "gender": "M;;F;F", "homepage": "https://furqanafzal.wixsite.com/furqanafzal;https://cdmdc.github.io/;http://labs.neuroscience.mssm.edu/project/rich-lab/;https://www.rajanlab.com/", "dblp": ";;;94/10452", "google_scholar": "btXbSigAAAAJ;;MA_UEhYAAAAJ;IC6n33kAAAAJ", "orcid": "0000-0002-8476-4539;0000-0002-3703-1295;0000-0002-7153-6027;0000-0003-2749-2917", "linkedin": ";;;rajankdr", "or_profile": "~Muhammad_Furqan_Afzal1;~Christian_D_Marton1;~Erin_L._Rich1;~Kanaka_Rajan1", "aff": "Icahn School of Medicine at Mount Sinai;;Mount Sinai School of Medicine;Icahn School of Medicine at Mount Sinai", "aff_domain": "mssm.edu;;mssm.edu;mssm.edu", "position": "PhD Candidate;;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nafzal2022trakr,\ntitle={{TRAKR} {\\textendash} A reservoir-based tool for fast and accurate classification of neural time-series patterns},\nauthor={Muhammad Furqan Afzal and Christian D Marton and Erin L.
Rich and Kanaka Rajan},\nyear={2022},\nurl={https://openreview.net/forum?id=qESp3gXBm2g}\n}", "github": "", "project": "", "reviewers": "m9nG;dzZZ;xpen", "site": "https://openreview.net/forum?id=qESp3gXBm2g", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;3;3", "correctness": "3;3;3", "technical_novelty": "2;3;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "36;110;236", "wc_summary_review": "88;78;8", "wc_main_review": "340;253;384", "wc_review": "464;441;628", "wc_reply_reviewers": "0;62;229", "wc_reply_authors": "1678;821;1714", "reply_reviewers": "0;1;1", "reply_authors": "3;2;4", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 127.33333333333333, 82.56445293762255 ], "wc_summary_review_avg": [ 58.0, 35.59026084010437 ], "wc_main_review_avg": [ 325.6666666666667, 54.43242497711165 ], "wc_review_avg": [ 511.0, 83.26263667856469 ], "wc_reply_reviewers_avg": [ 97.0, 96.70918605110202 ], "wc_reply_authors_avg": [ 1404.3333333333333, 412.74070418228547 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hKfE4XNnRhgJ:scholar.google.com/&scioq=TRAKR+%E2%80%93+A+reservoir-based+tool+for+fast+and+accurate+classification+of+neural+time-series+patterns&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Icahn School of Medicine at Mount Sinai;Mount Sinai School of Medicine", "aff_unique_dep": "School of Medicine;School of Medicine", "aff_unique_url": "https://icahn.mssm.edu;https://www.mountsinai.org", "aff_unique_abbr": "ISMMS;Mount Sinai", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "qHsuiKXkUb", "title": "High Precision Score-based Diffusion Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent advances in diffusion models bring state-of-the-art performance on image generation tasks. However, image generation at high resolution is still an arduous task, both theoretically and practically. From the theory side, the difficulty arises in estimating the high precision diffusion because the data score goes to $\\infty$ as the diffusion time $t \\rightarrow 0$. This paper resolves this difficulty by improving the previous diffusion models in three aspects. First, we propose an alternative parameterization for such unbounded data score, which theoretically enables the unbounded score estimation. Second, we provide a practical soft truncation method (ST-trick) to handle the extreme variation of the score scales. Third, we design a reciprocal variance exploding stochastic differential equation (RVESDE) to enable sampling at high precision of $t$. These three improvements are applicable to the variations of both NCSN and DDPM, and our improved versions are named HNCSN and HDDPM, respectively. The experiments show that the improvements result in state-of-the-art performance in high-resolution image generation, i.e., CelebA-HQ.
Also, our ablation study empirically illustrates that the alternative parameterization, the ST-trick, and RVESDE all contribute to the performance enhancement.", "keywords": "Diffusion Model;Score-based Model", "primary_area": "", "supplementary_material": "", "author": "Dongjun Kim;Seungjae Shin;Kyungwoo Song;Wanmo Kang;Il-chul Moon", "authorids": "~Dongjun_Kim1;~Seungjae_Shin1;~Kyungwoo_Song1;~Wanmo_Kang1;~Il-chul_Moon1", "gender": "M;M;;M;", "homepage": "https://sites.google.com/view/dongjun-kim?pli=1;https://sites.google.com/view/seungjae-shin;https://mlai.yonsei.ac.kr;https://sites.google.com/site/wanmokang/;", "dblp": "03/4394;29/551;155/4867;;", "google_scholar": "https://scholar.google.com/citations?hl=ko;https://scholar.google.com/citations?hl=en;HWxRii4AAAAJ;;", "orcid": ";;0000-0003-0082-4280;;", "linkedin": ";seungjae-shin-hoodie/;kyungwoo-song-862863155/;;", "or_profile": "~Dongjun_Kim1;~Seungjae_Shin1;~Kyungwoo_Song1;~Wanmo_Kang1;~Il-chul_Moon1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;University of Seoul;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;uos.ac.kr;kaist.ac.kr;", "position": "PhD student;PhD student;Assistant Professor;Professor;", "bibtex": "@misc{\nkim2022high,\ntitle={High Precision Score-based Diffusion Models},\nauthor={Dongjun Kim and Seungjae Shin and Kyungwoo Song and Wanmo Kang and Il-chul Moon},\nyear={2022},\nurl={https://openreview.net/forum?id=qHsuiKXkUb}\n}", "github": "", "project": "", "reviewers": "e3pu;QU8e;Veok;ekSh;sL3N", "site": "https://openreview.net/forum?id=qHsuiKXkUb", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;3;4;3;3", "correctness": "2;3;3;3;2", "technical_novelty": "2;3;2;2;3", "empirical_novelty": "2;4;4;2;3", "wc_summary_paper": "503;129;41;95;65", "wc_summary_review": "67;63;54;88;62", "wc_main_review": "781;632;1016;418;613", "wc_review": "1351;824;1111;601;740", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.8944271909999159 ], "wc_summary_paper_avg": [ 166.6, 170.76486758112745 ], "wc_summary_review_avg": [ 66.8, 11.408768557561329 ], "wc_main_review_avg": [ 692.0, 198.90399694324898 ], "wc_review_avg": [ 925.4, 270.36020417213774 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6123724356957948, "corr_recommendation_correctness": 0.6123724356957948, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:udB5fIFCp2kJ:scholar.google.com/&scioq=High+Precision+Score-based+Diffusion+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Seoul", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;http://www.useoul.edu", "aff_unique_abbr": "KAIST;UOS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "FILM: Following Instructions in Language with Modular Methods", "status":
"Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6979", "id": "qI4542Y2s1D", "poster": "", "openreview": "https://openreview.net/forum?id=qI4542Y2s1D", "slides": "https://iclr.cc/virtual/2022/poster/6979", "video": "https://iclr.cc/virtual/2022/poster/6979", "author_site": "So Yeon Min, Devendra Singh Chaplot, Pradeep K Ravikumar, Yonatan Bisk, Ruslan Salakhutdinov", "tldr": "", "abstract": "Recent methods for embodied instruction following are typically trained end-to-end using imitation learning. This often requires the use of expert trajectories and low-level language instructions. Such approaches assume that neural states will integrate multimodal semantics to perform state tracking, building spatial memory, exploration, and long-term planning. In contrast, we propose a modular method with structured representations that (1) builds a semantic map of the scene and (2) performs exploration with a semantic search policy, to achieve the natural language goal. Our modular method achieves SOTA performance (24.46 %) with a substantial (8.17 % absolute) gap from previous work while using less data by eschewing both expert trajectories and low-level instructions. Leveraging low-level language, however, can further increase our performance (26.49 %). Our findings suggest that an explicit spatial memory and a semantic search policy can provide a stronger and more general representation for state-tracking and guidance, even in the absence of expert trajectories or low-level instructions.", "keywords": "Instruction Following;Visual Language Navigation;Embodied Instruction Following;VLN;ALFRED", "primary_area": "", "supplementary_material": "", "author": "So Yeon Min;Devendra Singh Chaplot;Pradeep Kumar Ravikumar;Yonatan Bisk;Ruslan Salakhutdinov", "authorids": "~So_Yeon_Min2;~Devendra_Singh_Chaplot2;~Pradeep_Kumar_Ravikumar1;~Yonatan_Bisk1;~Ruslan_Salakhutdinov1", "gender": "F;M;M;M;M", "homepage": ";http://www.cs.cmu.edu/~pradeepr/;http://www.YonatanBisk.com;https://devendrachaplot.github.io/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "78/84;94/3594;38/9282;161/0038;", "google_scholar": "dkRTvvcAAAAJ;https://scholar.google.com.tw/citations?user=Q4DTPw4AAAAJ;bWoGh8UAAAAJ;1MSpdmQAAAAJ;", "orcid": ";;0000-0002-2111-9081;;", "linkedin": ";;yonatanbisk/;;", "or_profile": "~So_Yeon_Min2;~Pradeep_Kumar_Ravikumar1;~Yonatan_Bisk1;~Devendra_Chaplot1;~Russ_Salakhutdinov1", "aff": "Apple;School of Computer Science, Carnegie Mellon University;Meta;Meta Facebook;School of Computer Science, Carnegie Mellon University", "aff_domain": "apple.com;cs.cmu.edu;meta.com;fb.com;cs.cmu.edu", "position": "Intern;Associate Professor;Visiting Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nmin2022film,\ntitle={{FILM}: Following Instructions in Language with Modular Methods},\nauthor={So Yeon Min and Devendra Singh Chaplot and Pradeep Kumar Ravikumar and Yonatan Bisk and Ruslan Salakhutdinov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qI4542Y2s1D}\n}", "github": "", "project": "", "reviewers": "dBxN;qMNi;NHgG;aTek", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "5;4;4;3", "correctness": "3;3;3;2", "technical_novelty": "3;2;3;3", "empirical_novelty": "4;2;3;2", "wc_summary_paper": "125;296;98;406", "wc_summary_review": "66;131;107;53", "wc_main_review": "843;681;708;445", "wc_review": "1034;1108;913;904", "wc_reply_reviewers": "0;228;0;58", "wc_reply_authors": "680;886;519;502", "reply_reviewers": 
"0;1;0;1", "reply_authors": "2;3;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 231.25, 126.268315503138 ], "wc_summary_review_avg": [ 89.25, 31.275989192989563 ], "wc_main_review_avg": [ 669.25, 143.28010154937775 ], "wc_review_avg": [ 989.75, 85.41772357069696 ], "wc_reply_reviewers_avg": [ 71.5, 93.40637023244186 ], "wc_reply_authors_avg": [ 646.75, 154.61140805257548 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 173, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5571461167414719963&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=qI4542Y2s1D", "email": "apple.com;cs.cmu.edu;meta.com;fb.com;cs.cmu.edu", "author_num": 5, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "Apple;Carnegie Mellon University;Meta", "aff_unique_dep": "Apple Inc.;School of Computer Science;Meta Platforms, Inc.", "aff_unique_url": "https://www.apple.com;https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "Apple;CMU;Meta", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "qLm6hqXBIj_", "title": "Learning Representations that Support Robust Transfer of Predictors", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Ensuring generalization to unseen environments remains a challenge. Domain shift can lead to substantially degraded performance unless shifts are well-exercised within the available training environments. We introduce a simple robust estimation criterion -- transfer risk -- that is specifically geared towards optimizing transfer to new environments. Effectively, the criterion amounts to finding a representation that minimizes the risk of applying any optimal predictor trained on one environment to another. The transfer risk essentially decomposes into two terms, a direct transfer term and a weighted gradient-matching term arising from the optimality of per-environment predictors. Although inspired by IRM, we show that transfer risk serves as a better out-of-distribution generalization criterion theoretically and empirically. We further demonstrate the impact of optimizing such transfer risk on two controlled settings, each representing a different pattern of environment shift, as well as on two real-world datasets. Experimentally, the approach outperforms baselines across various out-of-distribution generalization tasks.", "keywords": "out-of-distribution generalization;representation learning", "primary_area": "", "supplementary_material": "/attachment/aaf3c39608bdb6eb71136173c60b79ec9bcfd125.zip", "author": "Yilun Xu;Tommi S. Jaakkola", "authorids": "~Yilun_Xu1;~Tommi_S._Jaakkola1", "gender": "M;", "homepage": "http://yilun-xu.com;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Yilun_Xu1;~Tommi_S._Jaakkola1", "aff": "Massachusetts Institute of Technology;", "aff_domain": "mit.edu;", "position": "PhD student;", "bibtex": "@misc{\nxu2022learning,\ntitle={Learning Representations that Support Robust Transfer of Predictors},\nauthor={Yilun Xu and Tommi S. 
Jaakkola},\nyear={2022},\nurl={https://openreview.net/forum?id=qLm6hqXBIj_}\n}", "github": "", "project": "", "reviewers": "BHGY;GgTb;bCPQ;yDJ3", "site": "https://openreview.net/forum?id=qLm6hqXBIj_", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;3;4;5", "correctness": "3;3;2;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "1;3;3;2", "wc_summary_paper": "59;158;48;87", "wc_summary_review": "44;60;70;22", "wc_main_review": "542;434;461;184", "wc_review": "645;652;579;293", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 88.0, 42.84273567362383 ], "wc_summary_review_avg": [ 49.0, 18.138357147217054 ], "wc_main_review_avg": [ 405.25, 133.77850163609997 ], "wc_review_avg": [ 542.25, 146.69590144240567 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2651583209314654095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "qLqeb9AjD2o", "title": "Confidence-aware Training of Smoothed Classifiers for Certified Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Any classifier can be \"smoothed out\" under Gaussian noise to build a new classifier that is provably robust to $\\ell_2$-adversarial perturbations, viz., by averaging its predictions over the noise, namely via randomized smoothing. Under the smoothed classifiers, the fundamental trade-off between accuracy and (adversarial) robustness has been well evidenced in the literature: i.e., increasing the robustness of a classifier for an input can be at the expense of decreased accuracy for some other inputs. In this paper, we propose a simple training method leveraging this trade-off for obtaining more robust smoothed classifiers, in particular, through a sample-wise control of robustness over the training samples. We make this control feasible by investigating the correspondence between robustness and prediction confidence of smoothed classifiers: specifically, we propose to use the \"accuracy under Gaussian noise\" as an easy-to-compute proxy of adversarial robustness for each input. We differentiate the training objective depending on this proxy to filter out samples that are unlikely to benefit from the worst-case (adversarial) objective.
Our experiments following the standard benchmarks consistently show that the proposed method, despite its simplicity, exhibits improved certified robustness upon existing state-of-the-art training methods.", "keywords": "randomized smoothing;adversarial robustness;certified defense;adversarial defense;robust training;confidence calibration", "primary_area": "", "supplementary_material": "", "author": "Jongheon Jeong;Seojin Kim;Jinwoo Shin", "authorids": "~Jongheon_Jeong1;~Seojin_Kim2;~Jinwoo_Shin1", "gender": "M;M;M", "homepage": "https://jh-jeong.github.io;https://alinlab.kaist.ac.kr/members.html;https://sites.google.com/site/mijirim/", "dblp": "241/5923;95/102;31/7062", "google_scholar": "mZB2qfcAAAAJ;;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ", "orcid": "0000-0002-4058-5774;;", "linkedin": "jongheonj/;;", "or_profile": "~Jongheon_Jeong1;~Seojin_Kim2;~Jinwoo_Shin1", "aff": "Amazon;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "amazon.com;kaist.ac.kr;kaist.ac.kr", "position": "Intern;MS student;Associate Professor", "bibtex": "@misc{\njeong2022confidenceaware,\ntitle={Confidence-aware Training of Smoothed Classifiers for Certified Robustness},\nauthor={Jongheon Jeong and Seojin Kim and Jinwoo Shin},\nyear={2022},\nurl={https://openreview.net/forum?id=qLqeb9AjD2o}\n}", "github": "", "project": "", "reviewers": "6h9p;dpjw;5emU;girT", "site": "https://openreview.net/forum?id=qLqeb9AjD2o", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;4;3", "correctness": "4;3;3;3", "technical_novelty": "3;1;3;2", "empirical_novelty": "2;1;3;2", "wc_summary_paper": "51;64;49;71", "wc_summary_review": "61;21;37;48", "wc_main_review": "537;241;257;279", "wc_review": "649;326;343;398", "wc_reply_reviewers": "117;0;145;0", "wc_reply_authors": "436;586;383;733", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 58.75, 9.12071817347735 ], "wc_summary_review_avg": [ 41.75, 14.686303142724515 ], "wc_main_review_avg": [ 328.5, 121.13112729600101 ], "wc_review_avg": [ 429.0, 129.77480495072993 ], "wc_reply_reviewers_avg": [ 65.5, 66.24386764071072 ], "wc_reply_authors_avg": [ 534.5, 136.66473575871723 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15446203807550726615&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.kaist.ac.kr", "aff_unique_abbr": "Amazon;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;South Korea" }, { "id": "qNcedShvOs4", "title": "EinSteinVI: General and Integrated Stein Variational Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stein variational inference is a technique for approximate Bayesian inference that has recently gained popularity because 
it combines the scalability of variational inference (VI) with the flexibility of non-parametric inference methods. While there has been considerable progress in developing algorithms for Stein variational inference, integration in existing probabilistic programming languages (PPLs) with an easy-to-use interface is currently lacking. EinSteinVI is a lightweight composable library that integrates the latest Stein variational inference method with the PPL NumPyro (Phan et al., 2019). EinSteinVI provides ELBO-within-Stein to support the use of custom inference programs (guides), implementations of a wide range of kernels, non-linear scaling of the repulsion force (Wang & Liu, 2019b), and second-order gradient updates using matrix-valued kernels (Wang et al., 2019b). We illustrate EinSteinVI using toy examples and show results on par with or better than existing state-of-the-art methods for real-world problems. These include Bayesian neural networks for regression and a Stein-mixture deep Markov model, which shows EinSteinVI scales to large models with more than 500,000 parameters.", "keywords": "Stein variational inference;variational inference;probabilistic programming;Pyro;deep probabilistic programming;deep learning", "primary_area": "", "supplementary_material": "", "author": "Ola R\u00f8nning;Ahmad Salim Al-Sibahi;Christophe Ley;Thomas Hamelryck", "authorids": "~Ola_R\u00f8nning1;~Ahmad_Salim_Al-Sibahi1;christophe.ley@ugent.be;~Thomas_Hamelryck1", "gender": "M;M;;M", "homepage": ";https://alsibahi.xyz;;https://thamelry.github.io", "dblp": "220/2035;166/7500.html;;18/2705", "google_scholar": "V-RLwukAAAAJ;mcVJvU8AAAAJ;;YoTlzjkAAAAJ", "orcid": ";;;0000-0003-2917-3602", "linkedin": ";;;thomas-hamelryck-41a0a64/", "or_profile": "~Ola_R\u00f8nning1;~Ahmad_Salim_Al-Sibahi1;christophe.ley@ugent.be;~Thomas_Hamelryck1", "aff": "University of Copenhagen;University of Copenhagen;;University of Copenhagen", "aff_domain": "ku.dk;ku.dk;;bio.ku.dk", "position": "PhD student;Assistant Professor;;Associate Professor", "bibtex": "@misc{\nr{\\o}nning2022einsteinvi,\ntitle={EinStein{VI}: General and Integrated Stein Variational Inference},\nauthor={Ola R{\\o}nning and Ahmad Salim Al-Sibahi and Christophe Ley and Thomas Hamelryck},\nyear={2022},\nurl={https://openreview.net/forum?id=qNcedShvOs4}\n}", "github": "", "project": "", "reviewers": "uwM1;Hnfk;KQPj;N78P", "site": "https://openreview.net/forum?id=qNcedShvOs4", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;3", "correctness": "4;3;4;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "36;55;71;61", "wc_summary_review": "22;22;18;57", "wc_main_review": "138;148;128;482", "wc_review": "196;225;217;600", "wc_reply_reviewers": "0;72;162;0", "wc_reply_authors": "252;579;864;522", "reply_reviewers": "0;1;2;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.75, 12.754901018824098 ], "wc_summary_review_avg": [ 29.75, 15.81731646013318 ], "wc_main_review_avg": [ 224.0, 149.12410938543775 ], "wc_review_avg": [ 309.5, 168.05430669875736 ], "wc_reply_reviewers_avg": [ 58.5, 66.59391864126934 ], "wc_reply_authors_avg": [ 554.25, 217.34347816302196 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [
4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6109136530628393113&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Denmark" }, { "id": "qO-PN1zjmi_", "title": "Novelty detection using ensembles with regularized disagreement", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite their excellent performance on in-distribution (ID) data, deep neural networks often confidently predict on out-of-distribution (OOD) samples that come from novel classes instead of flagging them for expert evaluation. Even though conventional OOD detection algorithms can distinguish far OOD samples, current methods that can identify near OOD samples require training with labeled data that is very similar to these near OOD samples. In turn, we develop a new ensemble-based procedure for \\emph{semi-supervised novelty detection} (SSND) that only utilizes a mixture of unlabeled ID and OOD samples to achieve good detection performance on near OOD data. It crucially relies on regularization to promote diversity on the OOD data while preserving agreement on ID data. Extensive comparisons of our approach to state-of-the-art SSND methods on standard image data sets (SVHN/CIFAR-10/CIFAR-100) and medical image data sets reveal significant gains with negligible increase in computational cost.\n", "keywords": "out-of-distribution detection;novelty detection;ensembles;ensemble diversity;outlier detection;regularization", "primary_area": "", "supplementary_material": "", "author": "Alexandru Tifrea;Eric Petru Stavarache;Fanny Yang", "authorids": "~Alexandru_Tifrea1;~Eric_Petru_Stavarache1;~Fanny_Yang1", "gender": "M;M;", "homepage": ";https://n.ethz.ch/~ericst/;http://www.fanny-yang.de", "dblp": "183/4666;;126/4852", "google_scholar": "i7T1FUsAAAAJ;;BfDKicQAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Alexandru_Tifrea1;~Eric_Petru_Stavarache1;~Fanny_Yang1", "aff": "Swiss Federal Institute of Technology;;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;;ethz.ch", "position": "PhD student;;Professor", "bibtex": "@misc{\ntifrea2022novelty,\ntitle={Novelty detection using ensembles with regularized disagreement},\nauthor={Alexandru Tifrea and Eric Petru Stavarache and Fanny Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=qO-PN1zjmi_}\n}", "github": "", "project": "", "reviewers": "QNSM;iZuG;7CPC;VFKa;Etvi", "site": "https://openreview.net/forum?id=qO-PN1zjmi_", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;4;4;4;3", "correctness": "3;3;4;3;3", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;2;3;3;2", "wc_summary_paper": "46;184;133;70;67", "wc_summary_review": "30;33;121;337;13", "wc_main_review": "270;310;768;435;341", "wc_review": "346;527;1022;842;421", "wc_reply_reviewers": "88;0;490;208;31", "wc_reply_authors": "1587;1565;1531;1403;859", "reply_reviewers": "1;0;1;1;1", "reply_authors": "3;2;3;3;2", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], 
"wc_summary_paper_avg": [ 100.0, 51.088159097779204 ], "wc_summary_review_avg": [ 106.8, 121.10887663586017 ], "wc_main_review_avg": [ 424.8, 180.03044187025705 ], "wc_review_avg": [ 631.6, 258.2793836139462 ], "wc_reply_reviewers_avg": [ 163.4, 178.08492356176592 ], "wc_reply_authors_avg": [ 1389.0, 272.5582506547912 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 2.6, 0.4898979485566356 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4564354645876385, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZnYwrAtRMRIJ:scholar.google.com/&scioq=Novelty+detection+using+ensembles+with+regularized+disagreement&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Swiss Federal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "qOcf6HgSmRH", "title": "DeeperGCN: All You Need to Train Deeper GCNs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have been drawing significant attention to the power of representation learning on graphs. Recent works developed frameworks to train very deep GNNs. Such works show impressive results in tasks like point cloud learning and protein interaction prediction. In this work, we study the performance of such deep models in large-scale graph datasets from the Open Graph Benchmark (OGB). In particular, we look at the effect of adequately choosing an aggregation function and its effect on final performance. Common choices of aggregation are mean, max, and sum. It has been shown that GNNs are sensitive to such aggregations when applied to different datasets. We systematically study this point on large-scale graphs and propose to alleviate it by introducing a novel Generalized Aggregation Function. Proposed aggregation functions extend beyond the commonly used ones. The generalized aggregation functions are fully differentiable, and thus their parameters can be learned in an end-to-end fashion. 
We show that deep residual GNNs equipped with generalized aggregation functions achieve state-of-the-art results in several benchmarks from OGB across tasks and domains.", "keywords": "Graph Neural Networks;Graph Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Guohao Li;Chenxin Xiong;Guocheng Qian;Ali Thabet;Bernard Ghanem", "authorids": "~Guohao_Li1;~Chenxin_Xiong1;~Guocheng_Qian1;~Ali_Thabet1;~Bernard_Ghanem1", "gender": "M;F;M;M;M", "homepage": "https://ghli.org/;https://www.kaust.edu.sa/en;https://guochengqian.github.io/;https://www.alithabet.com/;https://ivul.kaust.edu.sa", "dblp": "211/7175-1;;241/7000;161/1812;37/2516", "google_scholar": "J9K-D0sAAAAJ;;DUDaxg4AAAAJ;7T0CPEkAAAAJ;rVsGTeEAAAAJ", "orcid": "0000-0003-0260-5129;;0000-0002-2935-8570;;0000-0002-5534-587X", "linkedin": ";;guochengqian/;akthabet/;bernardghanem/", "or_profile": "~Guohao_Li1;~Chenxin_Xiong1;~Guocheng_Qian1;~Ali_Thabet1;~Bernard_Ghanem1", "aff": "KAUST;;KAUST;Meta;King Abdullah University of Science and Technology", "aff_domain": "kaust.edu.sa;;kaust.edu.sa;fb.com;kaust.edu.sa", "position": "PhD student;;PhD student;Applied Research Manager;Associate Professor", "bibtex": "@misc{\nli2022deepergcn,\ntitle={Deeper{GCN}: All You Need to Train Deeper {GCN}s},\nauthor={Guohao Li and Chenxin Xiong and Guocheng Qian and Ali Thabet and Bernard Ghanem},\nyear={2022},\nurl={https://openreview.net/forum?id=qOcf6HgSmRH}\n}", "github": "", "project": "", "reviewers": "8xa2;YqHT;BMUG;wMTG", "site": "https://openreview.net/forum?id=qOcf6HgSmRH", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "1;2;3;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "37;28;58;72", "wc_summary_review": "43;72;31;22", "wc_main_review": "207;283;104;228", "wc_review": "287;383;193;322", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 48.75, 17.282577932704367 ], "wc_summary_review_avg": [ 42.0, 18.854707634964804 ], "wc_main_review_avg": [ 205.5, 64.8401881551866 ], "wc_review_avg": [ 296.25, 68.80179866834878 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.899228803025897, "gs_citation": 556, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8249787359953166337&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.kaust.edu.sa;https://meta.com", "aff_unique_abbr": "KAUST;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Saudi Arabia;United States" }, { "id": "qPQRIj_Y_EW", "title": "Learning to Solve an Order Fulfillment Problem in Milliseconds with Edge-Feature-Embedded Graph Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "The order fulfillment problem is one of the fundamental combinatorial 
optimization problems in supply chain management and it is required to be solved in real-time for modern online retailing. Such a problem is computationally hard to address by exact mathematical programming methods. In this paper, we propose a machine learning method to solve it in milliseconds by formulating a tripartite graph and learning the best assignment policy through the proposed edge-feature-embedded graph attention mechanism. The edge-feature-embedded graph attention considers the high-dimensional edge features and accounts for the heterogeneous information, which are important characteristics of the studied optimization problem. The model is also size-invariant for problem instances of any scale, and it can address cases that are completely unseen during training. Experiments show that our model substantially outperforms the baseline heuristic method in optimality. The online inference time is milliseconds, which is thousands of times faster than the exact mathematical programming methods.", "keywords": "Combinatorial Optimization;Machine Learning;Modern Supply Chain Management", "primary_area": "", "supplementary_material": "", "author": "Jingwei Yang;Qingchun Hou;Xiaoqing Wang;Yang Wei;Yuming Deng;Hongyang Jia;Ning Zhang", "authorids": "~Jingwei_Yang2;~Qingchun_Hou1;~Xiaoqing_Wang1;~Yang_Wei3;~Yuming_Deng1;~Hongyang_Jia2;~Ning_Zhang11", "gender": "M;;;M;M;M;M", "homepage": ";https://person.zju.edu.cn/en/houqingchun;;;;https://www.researchgate.net/profile/Hongyang-Jia-2;http://www.ningzhang.net/", "dblp": ";249/5756;;;;02/11188;181/2597-8", "google_scholar": "ayX_-lwAAAAJ;MBVZbRMAAAAJ;;;;SQJ8vlUAAAAJ;Ho3ZGIEAAAAJ", "orcid": ";0000-0001-8334-9897;;0000-0003-1266-048X;;0000-0001-5136-1189;0000-0003-0366-4657", "linkedin": ";;https://www.linkedin.cn/in/xiaoqing-wang-20b85b85;;yuming-deng-a38780b;;", "or_profile": "~Jingwei_Yang2;~Qingchun_Hou1;~Xiaoqing_Wang1;~Yang_Wei3;~Yuming_Deng1;~Hongyang_Jia2;~Ning_Zhang11", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;;Tsinghua University;Tsinghua University", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;;mails.tsinghua.edu.cn;tsinghua.edu.cn", "position": "Researcher;Algorithm Expert;Director;Instructor;;PhD student;Associate Professor", "bibtex": "@misc{\nyang2022learning,\ntitle={Learning to Solve an Order Fulfillment Problem in Milliseconds with Edge-Feature-Embedded Graph Attention},\nauthor={Jingwei Yang and Qingchun Hou and Xiaoqing Wang and Yang Wei and Yuming Deng and Hongyang Jia and Ning Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=qPQRIj_Y_EW}\n}", "github": "", "project": "", "reviewers": "ciU3;qjBL;PyJm;pGWk", "site": "https://openreview.net/forum?id=qPQRIj_Y_EW", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "122;95;185;88", "wc_summary_review": "130;25;37;61", "wc_main_review": "617;655;385;190", "wc_review": "869;775;607;339", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 122.5, 38.252450901870326 ], "wc_summary_review_avg": [ 63.25, 40.6594085052894 ], "wc_main_review_avg": [ 461.75, 187.8741267444775 ], 
"wc_review_avg": [ 647.5, 201.32746956140883 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.19245008972987526, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eeIoTZJihzoJ:scholar.google.com/&scioq=Learning+to+Solve+an+Order+Fulfillment+Problem+in+Milliseconds+with+Edge-Feature-Embedded+Graph+Attention&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1;1", "aff_unique_norm": "Alibaba Group;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Alibaba;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "qPzR-M6HY8x", "title": "Approximating Instance-Dependent Noise via Instance-Confidence Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "Label noise in multiclass classification is a major obstacle to the deployment of learning systems. However, unlike the widely used class-conditional noise (CCN) assumption that the noisy label is independent of the input feature given the true label, label noise in real-world datasets can be aleatory and heavily dependent on individual instances. In this work, we investigate the instance-dependent noise (IDN) model and propose an efficient approximation of IDN to capture the instance-specific label corruption. Concretely, noting the fact that most columns of the IDN transition matrix have only limited influence on the class-posterior estimation, we propose a variational approximation that uses a single-scalar confidence parameter. To cope with the situation where the mapping from the instance to its confidence value could vary significantly for two adjacent instances, we suggest using instance embedding that assigns a trainable parameter to each instance. The resulting instance-confidence embedding (ICE) method not only performs well under label noise but also can effectively detect ambiguous or mislabeled instances. 
We validate its utility on various image and text classification tasks.", "keywords": "instance-dependent noise;label noise;classification;robustness;weakly supervised learning", "primary_area": "", "supplementary_material": "/attachment/00c70f6032bb11d5d379c4465be1735b2d567f04.zip", "author": "Yivan Zhang;Masashi Sugiyama", "authorids": "~Yivan_Zhang1;~Masashi_Sugiyama1", "gender": "M;M", "homepage": "https://yivan.xyz;http://www.ms.k.u-tokyo.ac.jp/sugi/", "dblp": "250/9557;35/1228", "google_scholar": "Q7S9kh4AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ", "orcid": ";0000-0001-6658-6743", "linkedin": ";", "or_profile": "~Yivan_Zhang1;~Masashi_Sugiyama1", "aff": "RIKEN AIP;The University of Tokyo", "aff_domain": "riken.jp;u-tokyo.ac.jp", "position": "Researcher;Full Professor", "bibtex": "@misc{\nzhang2022approximating,\ntitle={Approximating Instance-Dependent Noise via Instance-Confidence Embedding},\nauthor={Yivan Zhang and Masashi Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=qPzR-M6HY8x}\n}", "github": "", "project": "", "reviewers": "ZuLD;ARcm;4pht;cgg6", "site": "https://openreview.net/forum?id=qPzR-M6HY8x", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "86;76;63;46", "wc_summary_review": "15;48;61;78", "wc_main_review": "276;244;320;613", "wc_review": "377;368;444;737", "wc_reply_reviewers": "212;390;152;438", "wc_reply_authors": "507;981;783;804", "reply_reviewers": "1;1;1;1", "reply_authors": "2;3;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 67.75, 14.972892172189045 ], "wc_summary_review_avg": [ 50.5, 23.092206477510977 ], "wc_main_review_avg": [ 363.25, 146.69590144240567 ], "wc_review_avg": [ 481.5, 150.40694797781117 ], "wc_reply_reviewers_avg": [ 298.0, 119.13857477744142 ], "wc_reply_authors_avg": [ 768.75, 169.56470004101678 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3244709758197771362&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "RIKEN;University of Tokyo", "aff_unique_dep": "Advanced Institute for Computational Science;", "aff_unique_url": "https://www.aip.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "RIKEN AIP;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "qQuzhbU3Gto", "title": "An Interpretable Graph Generative Model with Heterophily", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many models for graphs fall under the framework of edge-independent dot product models. These models output the probabilities of edges existing between all pairs of nodes, and the probability of a link between two nodes increases with the dot product of vectors associated with the nodes. Recent work has shown that these models are unable to capture key structures in real-world graphs, particularly heterophilous structures, wherein links occur between dissimilar nodes. 
We propose the first edge-independent graph generative model that a) is expressive enough to capture heterophily, b) produces nonnegative embeddings, which allow link predictions to be interpreted in terms of communities, and c) optimizes effectively on real-world graphs with gradient descent on a cross-entropy loss. Our theoretical results demonstrate the expressiveness of our model in its ability to exactly reconstruct a graph using a number of clusters that is linear in the maximum degree, along with its ability to capture both heterophily and homophily in the data. Further, our experiments demonstrate the effectiveness of our model for a variety of important application tasks such as multi-label clustering and link prediction.", "keywords": "graph;network;generative;heterophily;community detection;link prediction;interpretability", "primary_area": "", "supplementary_material": "/attachment/9e5bd95fdb6531fc138ff643c179285a87fa89a0.zip", "author": "Sudhanshu Chanpuriya;Ryan Rossi;Anup Rao;Tung Mai;Nedim Lipka;Zhao Song;Cameron N Musco", "authorids": "~Sudhanshu_Chanpuriya1;~Ryan_Rossi1;~Anup_Rao1;~Tung_Mai1;lipka@adobe.com;zsong@adobe.com;~Cameron_N_Musco1", "gender": ";M;;M;;;M", "homepage": ";http://ryanrossi.com;;;;;https://people.cs.umass.edu/~cmusco/", "dblp": ";17/5085;63/6846;177/8902.html;;;149/2327", "google_scholar": ";_Dc6lbQAAAAJ;pkwXPU0AAAAJ;eUt8nlIAAAAJ;;;EeYGZCwAAAAJ", "orcid": ";0000-0001-9758-0635;;;;;", "linkedin": ";;;;;;", "or_profile": "~Sudhanshu_Chanpuriya1;~Ryan_Rossi1;~Anup_Rao1;~Tung_Mai1;lipka@adobe.com;zsong@adobe.com;~Cameron_N_Musco1", "aff": ";Adobe Research;Adobe Systems;Adobe;;;University of Massachusetts, Amherst", "aff_domain": ";adobe.com;adobe.com;adobe.com;;;umass.edu", "position": ";Senior Research Scientist;Researcher;Research Scientist;;;Assistant Professor", "bibtex": "@misc{\nchanpuriya2022an,\ntitle={An Interpretable Graph Generative Model with Heterophily},\nauthor={Sudhanshu Chanpuriya and Ryan Rossi and Anup Rao and Tung Mai and Nedim Lipka and Zhao Song and Cameron N Musco},\nyear={2022},\nurl={https://openreview.net/forum?id=qQuzhbU3Gto}\n}", "github": "", "project": "", "reviewers": "Fdty;TiK2;gNYK", "site": "https://openreview.net/forum?id=qQuzhbU3Gto", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;4", "correctness": "3;2;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "54;76;208", "wc_summary_review": "53;41;49", "wc_main_review": "386;472;449", "wc_review": "493;589;706", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "428;458;558", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 112.66666666666667, 68.00653563363515 ], "wc_summary_review_avg": [ 47.666666666666664, 4.988876515698588 ], "wc_main_review_avg": [ 435.6666666666667, 36.353205574688396 ], "wc_review_avg": [ 596.0, 87.0976463516667 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 481.3333333333333, 55.57777333511022 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.5, "gs_citation": 3, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=3799505906951667782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Adobe;University of Massachusetts Amherst", "aff_unique_dep": "Adobe Research;", "aff_unique_url": "https://research.adobe.com;https://www.umass.edu", "aff_unique_abbr": "Adobe;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "qR4qv6_113C", "title": "Exploring the Optimality of Tight-Frame Scattering Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The wavelet scattering transform creates geometric invariants and deformation stability. In multiple signal domains, it has been shown to yield more discriminative representations compared to other non-learned representations, and to outperform learned representations in certain tasks, particularly on limited labeled data and highly structured signals. The wavelet filters used in the scattering transform are typically selected to create a tight frame via a parameterized mother wavelet. In this work, we investigate if such a tight frame construction is optimal. Focusing on Morlet wavelets, we propose to learn the scales, orientations, and aspect ratios of the filters to produce problem-specific parameterizations of the scattering transform. We show that our learned versions of the scattering transform yield significant performance gains in small-sample classification settings over the standard scattering transform. Moreover, our empirical results suggest that tight-frames may not always be necessary for scattering transforms to extract effective representations.", "keywords": "Scattering Transforms;Wavelets;Few-sample learning", "primary_area": "", "supplementary_material": "", "author": "Shanel Gauthier;Benjamin Th\u00e9rien;Laurent Als\u00e8ne-Racicot;Muawiz Sajjad Chaudhary;Irina Rish;Eugene Belilovsky;Michael Eickenberg;Guy Wolf", "authorids": "~Shanel_Gauthier1;~Benjamin_Th\u00e9rien1;~Laurent_Als\u00e8ne-Racicot1;~Muawiz_Sajjad_Chaudhary1;~Irina_Rish1;~Eugene_Belilovsky1;~Michael_Eickenberg5;~Guy_Wolf1", "gender": "F;M;;Non-Binary;F;M;M;M", "homepage": ";https://bentherien.github.io/;;;http://irina-rish.com;http://eugenium.github.io;http://eickenberg.github.io;http://guywolf.org", "dblp": ";297/8891;;266/9021.html;;42/11445;117/7268;120/1308", "google_scholar": "qg09zTEAAAAJ;RbO_ULYAAAAJ;;4Z8ePskAAAAJ;Avse5gIAAAAJ;https://scholar.google.fr/citations?user=CffJDoEAAAAJ;GW0werQAAAAJ;g0k3SjcAAAAJ", "orcid": ";;;;;;;0000-0002-6740-059X", "linkedin": "gauthiershanel/;benjamintherien/;laurent-als%C3%A8ne-racicot-2368a5221/;muawizchaudhary;irina-rish-8b2162;;;", "or_profile": "~Shanel_Gauthier1;~Benjamin_Th\u00e9rien1;~Laurent_Als\u00e8ne-Racicot1;~Muawiz_Sajjad_Chaudhary1;~Irina_Rish1;~Eugene_Belilovsky1;~Michael_Eickenberg5;~Guy_Wolf1", "aff": "University of Montreal;University of Waterloo;University of Montreal;Concordia University, Montreal;University of Montreal;Concordia University, Montreal;Flatiron Institute;University of Montreal", "aff_domain": "umontreal.ca;uwaterloo.ca;umontreal.ca;concordia.ca;mila.quebec;concordia.ca;flatironinstitute.org;umontreal.ca", "position": "MS student;MS student;MS student;MS student;Professor;Assistant Professor;Researcher;Associate Professor", "bibtex": "@misc{\ngauthier2022exploring,\ntitle={Exploring the Optimality of Tight-Frame Scattering Networks},\nauthor={Shanel Gauthier and Benjamin 
Th{\\'e}rien and Laurent Als{\\`e}ne-Racicot and Muawiz Sajjad Chaudhary and Irina Rish and Eugene Belilovsky and Michael Eickenberg and Guy Wolf},\nyear={2022},\nurl={https://openreview.net/forum?id=qR4qv6_113C}\n}", "github": "", "project": "", "reviewers": "kunJ;wSP9;coP1;rTV6", "site": "https://openreview.net/forum?id=qR4qv6_113C", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;2;2", "correctness": "2;3;3;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "108;60;83;65", "wc_summary_review": "109;39;50;34", "wc_main_review": "425;218;60;79", "wc_review": "642;317;193;178", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.0, 18.801595676963167 ], "wc_summary_review_avg": [ 58.0, 30.008332176247315 ], "wc_main_review_avg": [ 195.5, 145.86723415489854 ], "wc_review_avg": [ 332.5, 186.65543120948826 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zULN7eUjdbUJ:scholar.google.com/&scioq=Exploring+the+Optimality+of+Tight-Frame+Scattering+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;0;2;3;0", "aff_unique_norm": "University of Montreal;University of Waterloo;Concordia University;Flatiron Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.umontreal.ca;https://uwaterloo.ca;https://www.concordia.ca;https://flatironinstitute.org", "aff_unique_abbr": "UM;UW;Concordia;Flatiron", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0;0;0;0;1;0", "aff_country_unique": "Canada;United States" }, { "title": "Which Shortcut Cues Will DNNs Choose? A Study from the Parameter-Space Perspective", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5892", "id": "qRDQi3ocgR3", "poster": "", "openreview": "https://openreview.net/forum?id=qRDQi3ocgR3", "slides": "https://iclr.cc/virtual/2022/poster/5892", "video": "https://iclr.cc/virtual/2022/poster/5892", "author_site": "Luca Scimeca, Seong Joon Oh, Sanghyuk Chun, Michael Poli, Sangdoo Yun", "tldr": "", "abstract": "Deep neural networks (DNNs) often rely on easy-to-learn discriminatory features, or cues, that are not necessarily essential to the problem at hand. For example, ducks in an image may be recognized based on their typical background scenery, such as lakes or streams. This phenomenon, also known as shortcut learning, is emerging as a key limitation of the current generation of machine learning models. In this work, we introduce a set of experiments to deepen our understanding of shortcut learning and its implications. We design a training setup with several shortcut cues, named WCST-ML, where each cue is equally conducive to the visual recognition problem at hand. 
Even under equal opportunities, we observe that (1) certain cues are preferred to others, (2) solutions biased to the easy-to-learn cues tend to converge to relatively flat minima on the loss surface, and (3) the solutions focusing on those preferred cues are far more abundant in the parameter space. We explain the abundance of certain cues via their Kolmogorov (descriptional) complexity: solutions corresponding to Kolmogorov-simple cues are abundant in the parameter space and are thus preferred by DNNs. Our studies are based on the synthetic dataset DSprites and the face dataset UTKFace. In our WCST-ML, we observe that the inborn bias of models leans toward simple cues, such as color and ethnicity. Our findings emphasize the importance of active human intervention to remove the inborn model biases that may cause negative societal impacts.", "keywords": "shortcut learning;shortcut bias;loss geometry;simplicity bias;flat minima;generalization;wisconsin card sorting test", "primary_area": "", "supplementary_material": "", "author": "Luca Scimeca;Seong Joon Oh;Sanghyuk Chun;Michael Poli;Sangdoo Yun", "authorids": "~Luca_Scimeca1;~Seong_Joon_Oh1;~Sanghyuk_Chun1;~Michael_Poli1;~Sangdoo_Yun1", "gender": "M;M;M;M;M", "homepage": "https://lucascimeca.com;https://seongjoonoh.com;https://sanghyukchun.github.io/home/;;https://sangdooyun.github.io/", "dblp": "223/6396;168/8835;213/1095.html;;124/3009.html", "google_scholar": "fKJvAvMAAAAJ;https://scholar.google.de/citations?user=kmXOOdsAAAAJ;https://scholar.google.co.kr/citations?user=4_uj0xcAAAAJ;RgIBwboAAAAJ;o0qtjzYAAAAJ", "orcid": "0000-0002-2821-0072;0000-0002-8985-7689;0000-0002-4533-2610;;", "linkedin": "luca-scimeca/;seong-joon-oh-32113479/;https://kr.linkedin.com/in/sanghyukchun/en;;", "or_profile": "~Luca_Scimeca1;~Seong_Joon_Oh1;~Sanghyuk_Chun1;~Michael_Poli1;~Sangdoo_Yun1", "aff": "Harvard University;NAVER;NAVER AI Lab;Stanford University;NAVER", "aff_domain": "harvard.edu;navercorp.com;navercorp.com;stanford.edu;navercorp.com", "position": "Postdoc;Research scientist;Lead research scientist;PhD student;Research Scientist", "bibtex": "@inproceedings{\nscimeca2022which,\ntitle={Which Shortcut Cues Will {DNN}s Choose? 
A Study from the Parameter-Space Perspective},\nauthor={Luca Scimeca and Seong Joon Oh and Sanghyuk Chun and Michael Poli and Sangdoo Yun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qRDQi3ocgR3}\n}", "github": "", "project": "", "reviewers": "QJQp;tkjD;s3vA", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "4;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "156;104;87", "wc_summary_review": "22;47;65", "wc_main_review": "388;425;419", "wc_review": "566;576;571", "wc_reply_reviewers": "226;0;0", "wc_reply_authors": "1380;1084;1308", "reply_reviewers": "1;0;0", "reply_authors": "2;2;2", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 115.66666666666667, 29.352266617001757 ], "wc_summary_review_avg": [ 44.666666666666664, 17.632041540584257 ], "wc_main_review_avg": [ 410.6666666666667, 16.21384867602041 ], "wc_review_avg": [ 571.0, 4.08248290463863 ], "wc_reply_reviewers_avg": [ 75.33333333333333, 106.53742169877317 ], "wc_reply_authors_avg": [ 1257.3333333333333, 126.04055784636238 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.7559289460184545, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1777708567808717925&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=qRDQi3ocgR3", "email": "harvard.edu;navercorp.com;navercorp.com;stanford.edu;navercorp.com", "author_num": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Harvard University;NAVER Corporation;Stanford University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.harvard.edu;https://www.naver.com;https://www.stanford.edu", "aff_unique_abbr": "Harvard;NAVER;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "United States;South Korea" }, { "id": "qSTEPv2uLR8", "title": "Physics Informed Convex Artificial Neural Networks (PICANNs) for Optimal Transport based Density Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimal Mass Transport (OMT) is a well-studied problem with a variety of applications in a diverse set of fields ranging from Physics to Computer Vision and in particular Statistics and Data Science. Since the original formulation of Monge in 1781, significant theoretical progress has been made on the existence, uniqueness and properties of the optimal transport maps. The actual numerical computation of the transport maps, particularly in high dimensions, remains a challenging problem. In the past decade, several neural network based algorithms have been proposed to tackle this task. In this paper, building on recent developments of input convex neural networks and physics informed neural networks for solving PDEs, we propose a new Deep Learning approach to solve the continuous OMT problem. 
Our framework is based on Brenier's theorem, which reduces the continuous OMT problem to that of solving a non-linear PDE of Monge-Ampere type whose solution is a convex function. To demonstrate the accuracy of our framework, we compare our method to several other deep-learning-based algorithms. We then focus on applications to the ubiquitous density estimation and generative modeling tasks in statistics and machine learning. Finally, as an example, we present how our framework can be incorporated with an autoencoder to estimate an effective probabilistic generative model.", "keywords": "Optimal Mass Transport;Density Estimation;Physics Informed Neural Networks;Input Convex Neural Networks;Monge Ampere Equation", "primary_area": "", "supplementary_material": "", "author": "Amanpreet Singh;Martin Bauer;Sarang Joshi", "authorids": "~Amanpreet_Singh3;~Martin_Bauer2;~Sarang_Joshi1", "gender": "M;M;M", "homepage": ";https://www.math.fsu.edu/~bauer/;https://www.bme.utah.edu/profile/?unid=u0492366", "dblp": ";62/4807-4;15/2650", "google_scholar": "d0seUs0AAAAJ;https://scholar.google.at/citations?user=2WgXNeMAAAAJ;GyqdQTEAAAAJ", "orcid": "0000-0001-5980-7608;0000-0001-7771-056X;", "linkedin": ";;", "or_profile": "~Amanpreet_Singh3;~Martin_Bauer2;~Sarang_Joshi1", "aff": "University of Utah;Florida State University;University of Utah", "aff_domain": "utah.edu;fsu.edu;utah.edu", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nsingh2022physics,\ntitle={Physics Informed Convex Artificial Neural Networks ({PICANN}s) for Optimal Transport based Density Estimation},\nauthor={Amanpreet Singh and Martin Bauer and Sarang Joshi},\nyear={2022},\nurl={https://openreview.net/forum?id=qSTEPv2uLR8}\n}", "github": "", "project": "", "reviewers": "ecZV;U1ba;44BN;U76A", "site": "https://openreview.net/forum?id=qSTEPv2uLR8", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;5;3", "correctness": "3;4;3;3", "technical_novelty": "2;4;2;4", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "101;52;23;112", "wc_summary_review": "48;110;269;94", "wc_main_review": "303;211;116;606", "wc_review": "452;373;408;812", "wc_reply_reviewers": "420;0;0;270", "wc_reply_authors": "1866;433;599;770", "reply_reviewers": "2;0;0;1", "reply_authors": "4;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 1.0 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 72.0, 36.20082871979591 ], "wc_summary_review_avg": [ 130.25, 83.27777314505954 ], "wc_main_review_avg": [ 309.0, 183.77839916595204 ], "wc_review_avg": [ 511.25, 175.8797529563878 ], "wc_reply_reviewers_avg": [ 172.5, 180.46814123273947 ], "wc_reply_authors_avg": [ 917.0, 560.7116014494438 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.24618298195866545, "corr_recommendation_correctness": -0.4714045207910316, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16698555345927396485&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Utah;Florida State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utah.edu;https://www.fsu.edu", "aff_unique_abbr": "Utah;FSU", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Few-Shot Backdoor Attacks on Visual Object Tracking", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6523", "id": "qSV5CuSaK_a", "poster": "", "openreview": "https://openreview.net/forum?id=qSV5CuSaK_a", "slides": "https://iclr.cc/virtual/2022/poster/6523", "video": "https://iclr.cc/virtual/2022/poster/6523", "author_site": "Yiming Li, Haoxiang Zhong, Xingjun Ma, Yong Jiang, Shu-Tao Xia", "tldr": "", "abstract": "Visual object tracking (VOT) has been widely adopted in mission-critical applications, such as autonomous driving and intelligent surveillance systems. In current practice, third-party resources such as datasets, backbone networks, and training platforms are frequently used to train high-performance VOT models. Whilst these resources bring certain convenience, they also introduce new security threats into VOT models. In this paper, we reveal such a threat where an adversary can easily implant hidden backdoors into VOT models by tempering with the training process. Specifically, we propose a simple yet effective few-shot backdoor attack (FSBA) that optimizes two losses alternately: 1) a \\emph{feature loss} defined in the hidden feature space, and 2) the standard \\emph{tracking loss}. We show that, once the backdoor is embedded into the target model by our FSBA, it can trick the model to lose track of specific objects even when the \\emph{trigger} only appears in one or a few frames. We examine our attack in both digital and physical-world settings and show that it can significantly degrade the performance of state-of-the-art VOT trackers. We also show that our attack is resistant to potential defenses, highlighting the vulnerability of VOT models to potential backdoor attacks.", "keywords": "Backdoor Attack;Visual Object Tracking;AI Security;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/9276c97033583e9790e0143165d7dc35d09ded58.zip", "author": "Yiming Li;Haoxiang Zhong;Xingjun Ma;Yong Jiang;Shu-Tao Xia", "authorids": "~Yiming_Li1;~Haoxiang_Zhong1;~Xingjun_Ma1;~Yong_Jiang3;~Shu-Tao_Xia1", "gender": "M;M;M;M;M", "homepage": "http://liyiming.tech;;http://xingjunma.com/;;https://www.sigs.tsinghua.edu.cn/xst/list.htm", "dblp": "l/YimingLi-4;;195/8270;74/1552-1.html;03/6195", "google_scholar": "mSW7kU8AAAAJ;VOw9qmYAAAAJ;https://scholar.google.com.au/citations?user=XQViiyYAAAAJ;;https://scholar.google.com.hk/citations?user=koAXTXgAAAAJ", "orcid": "0000-0002-2258-265X;;;;0000-0002-8639-982X", "linkedin": "yiming-li-thu/;;xingjun-ma-173532129/;;", "or_profile": "~Yiming_Li1;~Haoxiang_Zhong1;~Xingjun_Ma1;~Yong_Jiang3;~Shu-Tao_Xia1", "aff": "Tsinghua University;Tsinghua University;Deakin University;Tsinghua University;Shenzhen International Graduate School, Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;mails.tsinghua.edu.cn;deakin.edu.au;tsinghua.edu.cn;sz.tsinghua.edu.cn", "position": "PhD student;MS student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2022fewshot,\ntitle={Few-Shot Backdoor Attacks on Visual Object Tracking},\nauthor={Yiming Li and Haoxiang Zhong and Xingjun Ma and Yong Jiang and Shu-Tao Xia},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qSV5CuSaK_a}\n}", "github": "", "project": "", "reviewers": "DNKJ;Whrd;1drU", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;2;3", "correctness": "3;3;3", 
"technical_novelty": "3;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "66;46;79", "wc_summary_review": "59;37;36", "wc_main_review": "216;142;131", "wc_review": "341;225;246", "wc_reply_reviewers": "89;28;22", "wc_reply_authors": "676;974;1373", "reply_reviewers": "1;1;1", "reply_authors": "4;5;5", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 63.666666666666664, 13.572848714334887 ], "wc_summary_review_avg": [ 44.0, 10.614455552060438 ], "wc_main_review_avg": [ 163.0, 37.74475681027322 ], "wc_review_avg": [ 270.6666666666667, 50.46671070011289 ], "wc_reply_reviewers_avg": [ 46.333333333333336, 30.26916289265731 ], "wc_reply_authors_avg": [ 1007.6666666666666, 285.54314715798887 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 4.666666666666667, 0.4714045207910317 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14007756108337436&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=qSV5CuSaK_a", "email": "mails.tsinghua.edu.cn;mails.tsinghua.edu.cn;deakin.edu.au;tsinghua.edu.cn;sz.tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Tsinghua University;Deakin University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.deakin.edu.au", "aff_unique_abbr": "THU;Deakin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;Australia" }, { "id": "qTBC7E4c454", "title": "Recursive Construction of Stable Assemblies of Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Advanced applications of modern machine learning will likely involve combinations of trained networks, as are already used in spectacular systems such as DeepMind's AlphaGo. Recursively building such combinations in an effective and stable fashion while also allowing for continual refinement of the individual networks - as nature does for biological networks - will require new analysis tools. This paper takes a step in this direction by establishing contraction properties of broad classes of nonlinear recurrent networks and neural ODEs, and showing how these quantified properties allow in turn to recursively construct stable networks of networks in a systematic fashion. The results can also be used to stably combine recurrent networks and physical systems with quantified contraction properties. Similarly, they may be applied to modular computational models of cognition. 
We perform experiments with these combined networks on benchmark sequential tasks (e.g., permuted sequential MNIST) to demonstrate their capacity for processing information across a long timescale in a provably stable manner.", "keywords": "stability;deep learning;RNN;combinations;modularity;sparsity;negative feedback;sequence learning;contraction analysis", "primary_area": "", "supplementary_material": "/attachment/8a1f5d374a87912b5106221494643a1f849f2b82.zip", "author": "Leo Kozachkov;Michaela M Ennis;Jean-Jacques Slotine", "authorids": "leokoz8@mit.edu;~Michaela_M_Ennis1;~Jean-Jacques_Slotine1", "gender": ";F;M", "homepage": ";https://menace.live;http://web.mit.edu/nsl/www/index.html", "dblp": ";295/9006;22/3009", "google_scholar": ";1PVaM_AAAAAJ;TcREpMQAAAAJ", "orcid": ";0000-0001-7898-8184;", "linkedin": ";ennisthemennis/;", "or_profile": "leokoz8@mit.edu;~Michaela_M_Ennis1;~Jean-Jacques_Slotine1", "aff": ";Harvard University;Massachusetts Institute of Technology", "aff_domain": ";harvard.edu;mit.edu", "position": ";PhD student;Full Professor", "bibtex": "@misc{\nkozachkov2022recursive,\ntitle={Recursive Construction of Stable Assemblies of Recurrent Neural Networks},\nauthor={Leo Kozachkov and Michaela M Ennis and Jean-Jacques Slotine},\nyear={2022},\nurl={https://openreview.net/forum?id=qTBC7E4c454}\n}", "github": "", "project": "", "reviewers": "q4Si;RFwT;hNbM;V8Ay", "site": "https://openreview.net/forum?id=qTBC7E4c454", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;2;2;3", "correctness": "4;2;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "61;33;44;121", "wc_summary_review": "38;44;46;99", "wc_main_review": "192;483;148;406", "wc_review": "291;560;238;626", "wc_reply_reviewers": "54;226;17;175", "wc_reply_authors": "639;572;273;225", "reply_reviewers": "1;1;1;2", "reply_authors": "3;2;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 64.75, 33.973335132129726 ], "wc_summary_review_avg": [ 56.75, 24.57005290999594 ], "wc_main_review_avg": [ 307.25, 140.78596343385942 ], "wc_review_avg": [ 428.75, 166.95414789696002 ], "wc_reply_reviewers_avg": [ 118.0, 85.45466634420849 ], "wc_reply_authors_avg": [ 427.25, 180.61613300034966 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.20751433915982243, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1203337979419052260&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Harvard University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.harvard.edu;https://web.mit.edu", "aff_unique_abbr": "Harvard;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning transferable motor skills with hierarchical latent mixture policies", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6750", "id": "qTHBE7E9iej", "poster": "", "openreview": "https://openreview.net/forum?id=qTHBE7E9iej", "slides": 
"https://iclr.cc/virtual/2022/poster/6750", "video": "https://iclr.cc/virtual/2022/poster/6750", "author_site": "Dushyant Rao, Fereshteh Sadeghi, Leonard Hasenclever, Markus Wulfmeier, Martina Zambelli, Giulia Vezzani, Dhruva Tirumala, Yusuf Aytar, Josh Merel, Nicolas Heess, Raia Hadsell", "tldr": "", "abstract": "For robots operating in the real world, it is desirable to learn reusable abstract behaviours that can effectively be transferred across numerous tasks and scenarios.\nWe propose an approach to learn skills from data using a hierarchical mixture latent variable model.\nOur method exploits a multi-level hierarchy of both discrete and continuous latent variables, to model a discrete set of abstract high-level behaviours while allowing for variance in how they are executed.\nWe demonstrate in manipulation domains that the method can effectively cluster offline data into distinct, executable behaviours, while retaining the flexibility of a continuous latent variable model.\nThe resulting skills can be transferred to new tasks, unseen objects, and from state to vision-based policies, yielding significantly better sample efficiency and asymptotic performance compared to existing skill- and imitation-based methods.\nWe also perform further analysis showing how and when the skills are most beneficial: they encourage directed exploration to cover large regions of the state space relevant to the task, making them most effective in challenging sparse-reward settings.", "keywords": "Robotics;Reinforcement Learning;Hierarchical;Latent Variable Models;Skills;Transfer", "primary_area": "", "supplementary_material": "", "author": "Dushyant Rao;Fereshteh Sadeghi;Leonard Hasenclever;Markus Wulfmeier;Martina Zambelli;Giulia Vezzani;Dhruva Tirumala;Yusuf Aytar;Josh Merel;Nicolas Heess;raia hadsell", "authorids": "~Dushyant_Rao1;~Fereshteh_Sadeghi3;~Leonard_Hasenclever1;~Markus_Wulfmeier1;~Martina_Zambelli2;~Giulia_Vezzani1;~Dhruva_Tirumala1;~Yusuf_Aytar1;~Josh_Merel1;~Nicolas_Heess1;~raia_hadsell1", "gender": "M;F;M;M;F;F;M;;;F;", "homepage": ";http://homes.cs.washington.edu/~fsadeghi/;;;;;;;;http://www.raiahadsell.com;", "dblp": ";;150/1667;166/1552;190/8518;;41/5577;139/1361;76/9181;http://dblp.uni-trier.de/pers/hd/h/Hadsell:Raia;190/7697.html", "google_scholar": ";vS8b6GwAAAAJ;https://scholar.google.co.uk/citations?user=dD-3S4QAAAAJ;;;https://scholar.google.it/citations?user=Zlpuln8AAAAJ;0ncQNL8AAAAJ;https://scholar.google.co.uk/citations?user=K4OcFXUAAAAJ;79k7bGEAAAAJ;EWQnacoAAAAJ;HqKq-2YAAAAJ", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": "~Dushyant_Rao1;~Fereshteh_Sadeghi3;~Leonard_Hasenclever1;~Markus_Wulfmeier1;~Martina_Zambelli2;~Giulia_Vezzani1;~Yusuf_Aytar1;~Josh_Merel1;~Nicolas_Heess1;~raia_hadsell1;~Dhruva_TB1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Meta Reality Labs;Google DeepMind;Google DeepMind;University College London", "aff_domain": "google.com;deepmind.com;google.com;deepmind.com;deepmind.com;deepmind.com;google.com;fb.com;google.com;deepmind.com;ucl.ac.uk", "position": "Research Scientist;Researcher;Research Scientist;Research Scientist;Research Scientist;Researcher;Research Scientist;Research Scientist;Research Scientist;Research Scientist;PhD student", "bibtex": "@inproceedings{\nrao2022learning,\ntitle={Learning transferable motor skills with hierarchical latent mixture policies},\nauthor={Dushyant Rao and Fereshteh Sadeghi and Leonard Hasenclever and Markus Wulfmeier and 
Martina Zambelli and Giulia Vezzani and Dhruva Tirumala and Yusuf Aytar and Josh Merel and Nicolas Heess and raia hadsell},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qTHBE7E9iej}\n}", "github": "", "project": "", "reviewers": "cfLn;sbA9;NRgW;wAVF", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "387;133;275;82", "wc_summary_review": "213;114;142;54", "wc_main_review": "851;517;656;398", "wc_review": "1451;764;1073;534", "wc_reply_reviewers": "167;38;457;114", "wc_reply_authors": "486;711;699;861", "reply_reviewers": "1;1;1;2", "reply_authors": "2;1;2;3", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 219.25, 119.92158896545692 ], "wc_summary_review_avg": [ 130.75, 57.146194098994904 ], "wc_main_review_avg": [ 605.5, 168.60382557937407 ], "wc_review_avg": [ 955.5, 344.1151696743403 ], "wc_reply_reviewers_avg": [ 194.0, 158.61431209068115 ], "wc_reply_authors_avg": [ 689.25, 133.5821376532057 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13311269075007662914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=qTHBE7E9iej", "email": "google.com;deepmind.com;google.com;deepmind.com;deepmind.com;deepmind.com;google.com;fb.com;google.com;deepmind.com;ucl.ac.uk", "author_num": 11, "aff_unique_index": "0;0;0;0;0;0;0;1;0;0;2", "aff_unique_norm": "Google;Meta;University College London", "aff_unique_dep": "Google DeepMind;Meta Reality Labs;", "aff_unique_url": "https://deepmind.com;https://www.meta.com;https://www.ucl.ac.uk", "aff_unique_abbr": "DeepMind;MRL;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "qTTccuW4dja", "title": "AriEL: volume coding for sentence generation comparisons", "track": "main", "status": "Reject", "tldr": "", "abstract": "Saving sequences of data to a point in a continuous space makes it difficult to retrieve them via random sampling. Mapping the input to a volume makes it easier, which is the strategy followed by Variational Autoencoders. However, optimizing for prediction and for smoothness forces them to trade off between the two. We analyze the ability of standard deep learning techniques to generate sentences through latent space sampling. We compare to AriEL, an entropic coding method to construct volumes without the need for extra loss terms. We benchmark on a toy grammar to automatically evaluate the language learned and generated, and find where it is stored in the latent space. Then, we benchmark on a dataset of human dialogues, using GPT-2 inside AriEL. Our results indicate that the random access to stored information can be improved since AriEL is able to generate a wider variety of correct language by randomly sampling the latent space. 
This supports the hypothesis that encoding information into volumes leads to improved retrieval of learned information with random sampling.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/08e6d0b49e5922bf080faf0d8ed659e201f62251.zip", "author": "Luca Celotti;Simon Brodeur;Jean Rouat", "authorids": "~Luca_Celotti1;~Simon_Brodeur1;~Jean_Rouat1", "gender": "M;M;", "homepage": "https://lucehe.github.io/;https://simonbrodeur.com/;https://www.gel.usherbrooke.ca/rouat/", "dblp": ";;26/5977", "google_scholar": ";;LQVQJ50AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Luca_Celotti1;~Simon_Brodeur1;~Jean_Rouat1", "aff": "Universit\u00e9 de Sherbrooke;Menya Solutions, the AI Division of Levio;Universit\u00e9 de Sherbrooke", "aff_domain": "usherbrooke.ca;levio.ca;usherbrooke.ca", "position": "PhD student;Researcher;Full Professor", "bibtex": "@misc{\ncelotti2022ariel,\ntitle={Ari{EL}: volume coding for sentence generation comparisons},\nauthor={Luca Celotti and Simon Brodeur and Jean Rouat},\nyear={2022},\nurl={https://openreview.net/forum?id=qTTccuW4dja}\n}", "github": "", "project": "", "reviewers": "qCdr;JWsh;JLnX;SHoR", "site": "https://openreview.net/forum?id=qTTccuW4dja", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;3;2", "correctness": "3;2;2;3", "technical_novelty": "3;1;2;3", "empirical_novelty": "1;0;2;3", "wc_summary_paper": "83;37;58;38", "wc_summary_review": "190;60;12;32", "wc_main_review": "1364;308;251;131", "wc_review": "1637;405;321;201", "wc_reply_reviewers": "135;0;0;0", "wc_reply_authors": "857;405;332;153", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 54.0, 18.721645226849056 ], "wc_summary_review_avg": [ 73.5, 69.38839960685071 ], "wc_main_review_avg": [ 513.5, 495.1749690765881 ], "wc_review_avg": [ 641.0, 579.5929606197784 ], "wc_reply_reviewers_avg": [ 33.75, 58.45671475544961 ], "wc_reply_authors_avg": [ 436.75, 259.37653614002943 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:S9U3D3_K5vMJ:scholar.google.com/&scioq=AriEL:+volume+coding+for+sentence+generation+comparisons&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;1;0", "aff_unique_norm": "Universit\u00e9 de Sherbrooke;Menya Solutions", "aff_unique_dep": ";AI Division", "aff_unique_url": "https://www.usherbrooke.ca;", "aff_unique_abbr": "UdeS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada;" }, { "id": "qWhajfmKEUt", "title": "Delving into Feature Space: Improving Adversarial Robustness by Feature Spectral Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "The study of adversarial examples in deep neural networks has attracted great attention. Numerous methods have been proposed to eliminate the gap between the features of natural examples and adversarial examples. Nevertheless, every feature may play a different role in adversarial robustness. 
It is worth exploring which features are more beneficial for robustness. In this paper, we delve into this problem from the perspective of spectral analysis in feature space. We define a new metric to measure the change of features along eigenvectors under adversarial attacks. One key finding is that eigenvectors with smaller eigenvalues are more non-robust, i.e., the adversary adds more components along such directions. We attribute this phenomenon to the dominance of the top eigenvalues. To alleviate this problem, we propose a method called \\textit{Feature Spectral Regularization (FSR)} to penalize the largest eigenvalue, and as a result, the other smaller eigenvalues get increased relatively. Comprehensive experiments demonstrate that FSR effectively alleviates the dominance of larger eigenvalues and improves adversarial robustness on different datasets. Our code will be publicly available soon.", "keywords": "adversarial example;adversarial robustness;spectral signature", "primary_area": "", "supplementary_material": "", "author": "Zhen Cheng;Fei Zhu;Xu-yao Zhang;Cheng-lin Liu", "authorids": "~Zhen_Cheng3;~Fei_Zhu1;~Xu-yao_Zhang1;~Cheng-lin_Liu1", "gender": "M;M;;M", "homepage": "http://www.nlpr.ia.ac.cn/pal/People/ChengZhen.html;http://www.nlpr.ia.ac.cn/pal/People/ZhuFei.html;;http://www.nlpr.ia.ac.cn/liucl/", "dblp": ";;;24/3006-1.html", "google_scholar": "zcwWjhUAAAAJ;fjZ1CBwAAAAJ;;8r3y8IMAAAAJ", "orcid": ";;;0000-0002-6743-4175", "linkedin": ";;;", "or_profile": "~Zhen_Cheng3;~Fei_Zhu1;~Xu-yao_Zhang1;~Cheng-lin_Liu1", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;;ia.ac.cn", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@misc{\ncheng2022delving,\ntitle={Delving into Feature Space: Improving Adversarial Robustness by Feature Spectral Regularization},\nauthor={Zhen Cheng and Fei Zhu and Xu-yao Zhang and Cheng-lin Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=qWhajfmKEUt}\n}", "github": "", "project": "", "reviewers": "wNJf;kop5;gwLs", "site": "https://openreview.net/forum?id=qWhajfmKEUt", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;3", "correctness": "2;3;3", "technical_novelty": "1;2;3", "empirical_novelty": "2;3;0", "wc_summary_paper": "156;100;78", "wc_summary_review": "39;48;45", "wc_main_review": "507;223;350", "wc_review": "702;371;473", "wc_reply_reviewers": "1104;0;0", "wc_reply_authors": "3970;845;713", "reply_reviewers": "6;0;0", "reply_authors": "8;2;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 111.33333333333333, 32.836294282732666 ], "wc_summary_review_avg": [ 44.0, 3.7416573867739413 ], "wc_main_review_avg": [ 360.0, 116.1579384573722 ], "wc_review_avg": [ 515.3333333333334, 138.40600500781107 ], "wc_reply_reviewers_avg": [ 368.0, 520.430590953299 ], "wc_reply_authors_avg": [ 1842.6666666666667, 1505.2167802531153 ], "reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "reply_authors_avg": [ 4.0, 2.8284271247461903 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.9449111825230683, 
"gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z6_XkqgZLZsJ:scholar.google.com/&scioq=Delving+into+Feature+Space:+Improving+Adversarial+Robustness+by+Feature+Spectral+Regularization&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Institute of Automation", "aff_unique_url": "http://www.ia.cas.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "qXBs0jmQkqx", "title": "FIFO: Learning Fog-invariant Features for Foggy Scene Segmentation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Robust visual recognition under adverse weather conditions is of great importance in real-world applications. In this context, we propose a new method for learning semantic segmentation models robust against fog. Its key idea is to consider the fog condition of an image as its style and close the gap between images with different fog conditions in neural style spaces of a segmentation model. In particular, since the neural style of an image is in general affected by other factors as well as fog, we introduce a fog-pass filter module that learns to extract a fog-relevant factor from the style. Optimizing the fog-pass filter and the segmentation model alternately gradually closes the style gap between different fog conditions and allows to learn fog-invariant features in consequence. Our method substantially outperforms previous work on three real foggy image datasets. Moreover, it improves performance on both foggy and clear weather images, while existing methods often degrade performance on clear scenes.", "keywords": "Foggy scene understanding;Fog-invariant feature learning;Semantic segmentation", "primary_area": "", "supplementary_material": "", "author": "Sohyun Lee;Taeyoung Son;Suha Kwak", "authorids": "~Sohyun_Lee1;~Taeyoung_Son1;~Suha_Kwak3", "gender": ";M;M", "homepage": "https://sohyun-l.github.io/;https://github.com/329tyson;https://suhakwak.github.io/", "dblp": "317/6799;;65/6173", "google_scholar": "https://scholar.google.com/citations?hl=ko;;-gscDIEAAAAJ", "orcid": ";;", "linkedin": "sohyun-lee-858616233/;;", "or_profile": "~Sohyun_Lee1;~Taeyoung_Son1;~Suha_Kwak3", "aff": "POSTECH;POSTECH;POSTECH", "aff_domain": "postech.ac.kr;postech.ac.kr;postech.ac.kr", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nlee2022fifo,\ntitle={{FIFO}: Learning Fog-invariant Features for Foggy Scene Segmentation},\nauthor={Sohyun Lee and Taeyoung Son and Suha Kwak},\nyear={2022},\nurl={https://openreview.net/forum?id=qXBs0jmQkqx}\n}", "github": "", "project": "", "reviewers": "1bHL;GJup;vS5m", "site": "https://openreview.net/forum?id=qXBs0jmQkqx", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "5;4;4", "correctness": "2;3;4", "technical_novelty": "2;2;4", "empirical_novelty": "2;3;3", "wc_summary_paper": "59;65;96", "wc_summary_review": "30;25;14", "wc_main_review": "117;292;86", "wc_review": "206;382;196", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], 
"wc_summary_paper_avg": [ 73.33333333333333, 16.21384867602041 ], "wc_summary_review_avg": [ 23.0, 6.683312551921141 ], "wc_main_review_avg": [ 165.0, 90.68994799131085 ], "wc_review_avg": [ 261.3333333333333, 85.42182911228774 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8029550685469661, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10025377918833329993&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "qXa0nhTRZGV", "title": "Understanding Sharpness-Aware Minimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Sharpness-Aware Minimization (SAM) is a recent training method that relies on worst-case weight perturbations. SAM significantly improves generalization in various settings, however, existing justifications for its success do not seem conclusive. First, we analyze the implicit bias of SAM over diagonal linear networks, and prove that it always chooses a solution that enjoys better generalisation properties than standard gradient descent. We also provide a convergence proof of SAM for non-convex objectives when used with stochastic gradients and empirically discuss the convergence and generalization behavior of SAM for deep networks. Next, we discuss why SAM can be helpful in the noisy label setting where we first show that it can help to improve generalization even for linear classifiers. Then we discuss a gradient reweighting interpretation of SAM and show a further beneficial effect of combining SAM with a robust loss. Finally, we draw parallels between overfitting observed in learning with noisy labels and in adversarial training where SAM also improves generalization. 
This connection suggests that, more generally, techniques from the noisy label literature can be useful to improve robust generalization.", "keywords": "Sharpness-aware minimization;implicit bias;noisy labels;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Maksym Andriushchenko;Nicolas Flammarion", "authorids": "~Maksym_Andriushchenko1;~Nicolas_Flammarion1", "gender": "M;M", "homepage": "https://www.andriushchenko.me/;", "dblp": "200/8865;164/7417", "google_scholar": "ZNtuJYoAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Maksym_Andriushchenko1;~Nicolas_Flammarion1", "aff": "Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;epfl.ch", "position": "PhD Student;Assistant Professor", "bibtex": "@misc{\nandriushchenko2022understanding,\ntitle={Understanding Sharpness-Aware Minimization},\nauthor={Maksym Andriushchenko and Nicolas Flammarion},\nyear={2022},\nurl={https://openreview.net/forum?id=qXa0nhTRZGV}\n}", "github": "", "project": "", "reviewers": "iJV1;BAw4;nmv6;EJMz", "site": "https://openreview.net/forum?id=qXa0nhTRZGV", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "3;4;3;4", "correctness": "4;3;4;3", "technical_novelty": "2;1;3;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "75;81;16;121", "wc_summary_review": "51;104;33;47", "wc_main_review": "300;432;331;225", "wc_review": "426;617;380;393", "wc_reply_reviewers": "0;0;0;515", "wc_reply_authors": "538;956;666;1406", "reply_reviewers": "0;0;0;1", "reply_authors": "1;2;1;3", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 73.25, 37.485830656396026 ], "wc_summary_review_avg": [ 58.75, 26.966414296305693 ], "wc_main_review_avg": [ 322.0, 74.28660713749147 ], "wc_review_avg": [ 454.0, 95.59027147152581 ], "wc_reply_reviewers_avg": [ 128.75, 223.00154147449294 ], "wc_reply_authors_avg": [ 891.5, 333.42277966569713 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17739106947051142215&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "DISSECT: Disentangled Simultaneous Explanations via Concept Traversals", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6367", "id": "qY79G8jGsep", "poster": "", "openreview": "https://openreview.net/forum?id=qY79G8jGsep", "slides": "https://iclr.cc/virtual/2022/poster/6367", "video": "https://iclr.cc/virtual/2022/poster/6367", "author_site": "Asma Ghandeharioun, Been Kim, Chun-Liang Li, Brendan Jou, Brian Eoff, Rosalind Picard", "tldr": "", "abstract": "Explaining deep learning model inferences is a promising avenue for scientific understanding, improving safety, uncovering hidden biases, evaluating fairness, and 
beyond, as argued by many scholars. One of the principal benefits of counterfactual explanations is allowing users to explore \"what-if\" scenarios through what does not and cannot exist in the data, a quality that many other forms of explanation such as heatmaps and influence functions are inherently incapable of doing. However, most previous work on generative explainability cannot disentangle important concepts effectively, produces unrealistic examples, or fails to retain relevant information. We propose a novel approach, DISSECT, that jointly trains a generator, a discriminator, and a concept disentangler to overcome such challenges using little supervision. DISSECT generates Concept Traversals (CTs), defined as a sequence of generated examples with increasing degrees of concepts that influence a classifier's decision. By training a generative model from a classifier's signal, DISSECT offers a way to discover a classifier's inherent \"notion\" of distinct concepts automatically rather than rely on user-predefined concepts. We show that DISSECT produces CTs that (1) disentangle several concepts, (2) are influential to a classifier's decision and are coupled to its reasoning due to joint training, (3) are realistic, (4) preserve relevant information, and (5) are stable across similar inputs. We validate DISSECT on several challenging synthetic and realistic datasets where previous methods fall short of satisfying desirable criteria for interpretability and show that it performs consistently well. Finally, we present experiments showing applications of DISSECT for detecting potential biases of a classifier and identifying spurious artifacts that impact predictions.", "keywords": "Explainability;Interpretability;Counterfactual generation;Generative Adversarial Network;Variational Autoencoder", "primary_area": "", "supplementary_material": "", "author": "Asma Ghandeharioun;Been Kim;Chun-Liang Li;Brendan Jou;Brian Eoff;Rosalind Picard", "authorids": "~Asma_Ghandeharioun1;~Been_Kim1;~Chun-Liang_Li1;~Brendan_Jou1;~Brian_Eoff1;~Rosalind_Picard1", "gender": ";;M;M;M;F", "homepage": "https://alum.mit.edu/www/asma_gh;https://beenkim.github.io/;http://chunliangli.github.io;;;https://web.media.mit.edu/~picard/", "dblp": "124/3110;https://dblp.uni-trier.de/pers/k/Kim:Been.html;;120/8567;54/3434;", "google_scholar": "CkfQy2gAAAAJ;;https://scholar.google.com.tw/citations?user=vqHIt_sAAAAJ;k7eC8-0AAAAJ;;https://scholar.google.com/scholar?hl=en", "orcid": ";;;0000-0001-8033-0330;;", "linkedin": ";;;brendanjou/;;", "or_profile": "~Asma_Ghandeharioun1;~Been_Kim1;~Chun-Liang_Li1;~Brendan_Jou1;~Brian_Eoff1;~Rosalind_Picard1", "aff": "Google;Google DeepMind;Google;Google DeepMind;Google;Massachusetts Institute of Technology", "aff_domain": "google.com;google.com;google.com;google.com;google.com;mit.edu", "position": "Research Scientist;Research Scientist;Researcher;Research Manager;Researcher;Full Professor", "bibtex": "@inproceedings{\nghandeharioun2022dissect,\ntitle={{DISSECT}: Disentangled Simultaneous Explanations via Concept Traversals},\nauthor={Asma Ghandeharioun and Been Kim and Chun-Liang Li and Brendan Jou and Brian Eoff and Rosalind Picard},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qY79G8jGsep}\n}", "github": "", "project": "", "reviewers": "47RT;1keG;jPGJ;WoTB", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "3;3;3;3", 
"wc_summary_paper": "84;45;47;42", "wc_summary_review": "106;68;24;43", "wc_main_review": "395;325;173;43", "wc_review": "585;438;244;128", "wc_reply_reviewers": "0;15;24;0", "wc_reply_authors": "475;201;793;188", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 54.5, 17.124543789543708 ], "wc_summary_review_avg": [ 60.25, 30.678779310787448 ], "wc_main_review_avg": [ 234.0, 136.38548309845882 ], "wc_review_avg": [ 348.75, 175.7005620366651 ], "wc_reply_reviewers_avg": [ 9.75, 10.256095748383007 ], "wc_reply_authors_avg": [ 414.25, 246.88395553376895 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4475466485614086287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=qY79G8jGsep", "email": "google.com;google.com;google.com;google.com;google.com;mit.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "Google;Massachusetts Institute of Technology", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://web.mit.edu", "aff_unique_abbr": "Google;MIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "qZNw8Ao_BIC", "title": "Understanding and Improving Robustness of Vision Transformers through Patch-based Negative Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We investigate the robustness of vision transformers (ViTs) through the lens of their special patch-based architectural structure, i.e., they process an image as a sequence of image patches. We find that ViTs are surprisingly insensitive to patch-based transformations, even when the transformation largely destroys the original semantics and makes the image unrecognizable by humans. This indicates that ViTs heavily use features that survived such transformations but are generally not indicative of the semantic class to humans. Further investigations show that these features are useful but non-robust, as ViTs trained on them can achieve high in-distribution accuracy, but break down under distribution shifts. From this understanding, we ask: can training the model to rely less on these features improve ViT robustness and out-of-distribution performance? We use the images transformed with our patch-based operations as negatively augmented views and offer losses to regularize the training away from using non-robust features. This is a complementary view to existing research that mostly focuses on augmenting inputs with semantic-preserving transformations to enforce models' invariance. We show that patch-based negative augmentation consistently improves robustness of ViTs across a wide set of ImageNet based robustness benchmarks. 
Furthermore, we find that our patch-based negative augmentation is complementary to traditional (positive) data augmentation, and together they boost the performance further.", "keywords": "Vision Transformer;robustness under distributional shift;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Yao Qin;Chiyuan Zhang;Ting Chen;Balaji Lakshminarayanan;Alex Beutel;Xuezhi Wang", "authorids": "~Yao_Qin1;~Chiyuan_Zhang1;~Ting_Chen1;~Balaji_Lakshminarayanan1;~Alex_Beutel1;~Xuezhi_Wang3", "gender": ";M;M;M;;", "homepage": "https://yaoqin1.github.io;http://pluskid.org;;http://www.gatsby.ucl.ac.uk/~balaji/;;https://research.google/people/105995/", "dblp": "66/10420-1;21/8315;19/1766;71/8324;;70/4090-2", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;l_G2vr0AAAAJ;KoXUMbsAAAAJ;QYn8RbgAAAAJ;;ScLUQ-YAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Yao_Qin1;~Chiyuan_Zhang1;~Ting_Chen1;~Balaji_Lakshminarayanan1;~Alex_Beutel1;~Xuezhi_Wang3", "aff": "Google;Google;Google;Google Brain;;Google DeepMind", "aff_domain": "google.com;google.com;google.com;google.com;;google.com", "position": "Researcher;Research Scientist;Research Scientist;Research Scientist;;Research Scientist", "bibtex": "@misc{\nqin2022understanding,\ntitle={Understanding and Improving Robustness of Vision Transformers through Patch-based Negative Augmentation},\nauthor={Yao Qin and Chiyuan Zhang and Ting Chen and Balaji Lakshminarayanan and Alex Beutel and Xuezhi Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=qZNw8Ao_BIC}\n}", "github": "", "project": "", "reviewers": "FXi6;j5h2;rziK;DY4w", "site": "https://openreview.net/forum?id=qZNw8Ao_BIC", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;5;4", "correctness": "1;4;3;4", "technical_novelty": "2;3;4;4", "empirical_novelty": "2;4;0;4", "wc_summary_paper": "77;64;79;130", "wc_summary_review": "45;54;124;71", "wc_main_review": "281;308;40;658", "wc_review": "403;426;243;859", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "824;731;412;1015", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 1.6583123951777 ], "wc_summary_paper_avg": [ 87.5, 25.20416632225712 ], "wc_summary_review_avg": [ 73.5, 30.614539029683264 ], "wc_main_review_avg": [ 321.75, 220.39552513606077 ], "wc_review_avg": [ 482.75, 228.37729199725615 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 745.5, 218.07395534542863 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.16012815380508713, "corr_recommendation_correctness": 0.792593923901217, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9671462841658657921&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "qaQ8kUBYhEK", "title": "Spectral Multiplicity Entails Sample-wise Multiple Descent", "track": "main", "status": "Reject", "tldr": "", 
"abstract": "In this paper, we study the generalization risk of ridge and ridgeless linear regression. We assume that the data features follow a multivariate normal distribution and that the spectrum of the covariance matrix consists of a given set of eigenvalues of proportionally growing multiplicity. We characterize the limiting bias and variance when the dimension and the number of training samples tend to infinity proportionally. Exact formulae for the bias and variance are derived using the random matrix theory and convex Gaussian min-max theorem. Based on these formulae, we study the sample-wise multiple descent phenomenon of the generalization risk curve, i.e., with more data, the generalization risk can be non-monotone, and specifically, can increase and then decrease multiple times with more training data samples. We prove that sample-wise multiple descent occurs when the spectrum of the covariance matrix is highly ill-conditioned. We also present numerical results to confirm the values of the bias and variance predicted by our theory and illustrate the multiple descent of the generalization risk curve. Moreover, we theoretically show that the ridge estimator with optimal regularization can result in a monotone generalization risk curve and thereby eliminate multiple descent under some assumptions. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lin Chen;Song Mei", "authorids": "~Lin_Chen14;~Song_Mei1", "gender": ";M", "homepage": ";https://www.stat.berkeley.edu/~songmei/", "dblp": ";https://dblp.org/pers/hd/m/Mei:Song", "google_scholar": ";https://scholar.google.com.hk/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Lin_Chen14;~Song_Mei1", "aff": ";University of California, Berkeley", "aff_domain": ";berkeley.edu", "position": ";Assistant Professor", "bibtex": "@misc{\nchen2022spectral,\ntitle={Spectral Multiplicity Entails Sample-wise Multiple Descent},\nauthor={Lin Chen and Song Mei},\nyear={2022},\nurl={https://openreview.net/forum?id=qaQ8kUBYhEK}\n}", "github": "", "project": "", "reviewers": "6s7y;Xsqg;7ii8;RwFH", "site": "https://openreview.net/forum?id=qaQ8kUBYhEK", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;2;2;4", "correctness": "4;4;4;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "0;0;3;3", "wc_summary_paper": "47;171;66;55", "wc_summary_review": "36;54;39;34", "wc_main_review": "516;298;82;350", "wc_review": "599;523;187;439", "wc_reply_reviewers": "475;63;0;0", "wc_reply_authors": "715;444;289;290", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.0, 1.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 1.5 ], "wc_summary_paper_avg": [ 84.75, 50.25124376570196 ], "wc_summary_review_avg": [ 40.75, 7.854139036202504 ], "wc_main_review_avg": [ 311.5, 155.0443484942292 ], "wc_review_avg": [ 437.0, 155.03547981026796 ], "wc_reply_reviewers_avg": [ 134.5, 198.26308279657107 ], "wc_reply_authors_avg": [ 434.5, 173.796576491023 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.14002800840280097, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_FEA40wljrgJ:scholar.google.com/&scioq=Spectral+Multiplicity+Entails+Sample-wise+Multiple+Descent&hl=en&as_sdt=0,5", 
"gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "GPT-Critic: Offline Reinforcement Learning for End-to-End Task-Oriented Dialogue Systems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6823", "id": "qaxhBG1UUaS", "poster": "", "openreview": "https://openreview.net/forum?id=qaxhBG1UUaS", "slides": "https://iclr.cc/virtual/2022/poster/6823", "video": "https://iclr.cc/virtual/2022/poster/6823", "author_site": "Youngsoo Jang, Jongmin Lee, Kee-Eung Kim", "tldr": "", "abstract": "Training a task-oriented dialogue agent can be naturally formulated as offline reinforcement learning (RL) problem, where the agent aims to learn a conversational strategy to achieve user goals, only from a dialogue corpus. It is very challenging in terms of RL since the natural language action space is astronomical, while feasible (syntactically and semantically correct) actions are very sparse. Thus, standard RL methods easily fail and generate responses diverging from human language, even when fine-tuning a powerful pre-trained language model. In this paper, we introduce GPT-Critic, an offline RL method for task-oriented dialogue. GPT-Critic is built upon GPT-2, fine-tuning the language model through behavior cloning of the critic-guided self-generated sentences. GPT-Critic is essentially free from the issue of diverging from human language since it learns from the sentences sampled from the pre-trained language model. In the experiments, we demonstrate that our algorithm outperforms the state-of-the-art in the task-oriented dialogue benchmarks including MultiWOZ 2.0 and ConvLab.", "keywords": "task-oriented dialogue;pre-trained language model;offline reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Youngsoo Jang;Jongmin Lee;Kee-Eung Kim", "authorids": "~Youngsoo_Jang2;~Jongmin_Lee1;~Kee-Eung_Kim2", "gender": ";M;M", "homepage": "http://www.ysjang.me;https://www.jmlee.kr;http://ailab.kaist.ac.kr", "dblp": "195/0471;68/222-4.html;35/6703", "google_scholar": "6EoBBggAAAAJ;https://scholar.google.co.kr/citations?user=rFcK8EEAAAAJ;https://scholar.google.com/citations?hl=ko", "orcid": ";;", "linkedin": ";jmlee123/;", "or_profile": "~Youngsoo_Jang2;~Jongmin_Lee1;~Kee-Eung_Kim2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\njang2022gptcritic,\ntitle={{GPT}-Critic: Offline Reinforcement Learning for End-to-End Task-Oriented Dialogue Systems},\nauthor={Youngsoo Jang and Jongmin Lee and Kee-Eung Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qaxhBG1UUaS}\n}", "github": "", "project": "", "reviewers": "JTos;fa2f;u1n3;ujyM", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;5;3;3", "correctness": "3;4;3;3", "technical_novelty": "3;1;2;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "101;36;95;93", "wc_summary_review": "34;76;21;64", "wc_main_review": "256;62;478;401", "wc_review": "391;174;594;558", 
"wc_reply_reviewers": "0;0;32;155", "wc_reply_authors": "310;362;642;530", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.25, 26.290445032368698 ], "wc_summary_review_avg": [ 48.75, 22.151467220028564 ], "wc_main_review_avg": [ 299.25, 158.47929675512825 ], "wc_review_avg": [ 429.25, 166.08337514634027 ], "wc_reply_reviewers_avg": [ 46.75, 63.848942826017094 ], "wc_reply_authors_avg": [ 461.0, 132.40468269664785 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15221504271160916378&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=qaxhBG1UUaS", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "qfGcsAGhFbc", "title": "Rethinking Client Reweighting for Selfish Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most federated learning (FL) algorithms aim to learn a model which achieves optimal overall performance across all clients. However, for some clients, the model obtained by conventional federated training may perform even worse than that obtained by local training. Therefore, for a stakeholder who only cares about the performance of a few $\\textit{internal clients}$, the outcome of conventional federated learning may be unsatisfactory. To this end, we study a new $\\textit{selfish}$ variant of federated learning, in which the ultimate objective is to learn a model with optimal performance on internal clients $\\textit{alone}$ instead of all clients. We further propose Variance Reduction Selfish Learning (VaRSeL), a novel algorithm that reweights the external clients based on variance reduction for learning a model desired in this setting. Within each round of federated training, it guides the model to update towards the direction favored by the internal clients. We give a convergence analysis for both strongly-convex and non-convex cases, highlighting its fine-tune effect. Finally, we perform extensive experiments on both synthesized and real-world datasets, covering image classification, language modeling, and medical image segmentation. 
Experimental results empirically justify our theoretical results and show the advantage of VaRSeL over related FL algorithms.", "keywords": "Federated Learning;Sample Reweighting;Variance Reduction;Medical Image", "primary_area": "", "supplementary_material": "/attachment/9dc4f5ecfb509b76551e26878fabb20ea46f565a.zip", "author": "Ruichen Luo;Shoubo Hu;Lequan Yu", "authorids": "luoruichen10@outlook.com;~Shoubo_Hu1;~Lequan_Yu1", "gender": ";M;M", "homepage": ";https://amber0309.github.io/about/;https://yulequan.github.io/", "dblp": ";218/9202;165/8092", "google_scholar": ";;https://scholar.google.com.hk/citations?user=llXf3wUAAAAJ", "orcid": ";;0000-0002-9315-6527", "linkedin": ";;", "or_profile": "luoruichen10@outlook.com;~Shoubo_Hu1;~Lequan_Yu1", "aff": ";Huawei Technologies Ltd.;The University of Hong Kong", "aff_domain": ";huawei.com;hku.hk", "position": ";Researcher;Assistant Professor", "bibtex": "@misc{\nluo2022rethinking,\ntitle={Rethinking Client Reweighting for Selfish Federated Learning},\nauthor={Ruichen Luo and Shoubo Hu and Lequan Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=qfGcsAGhFbc}\n}", "github": "", "project": "", "reviewers": "nesy;1vSj;zFoE;pXQG", "site": "https://openreview.net/forum?id=qfGcsAGhFbc", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;3;3", "correctness": "2;2;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "95;28;77;87", "wc_summary_review": "16;33;35;67", "wc_main_review": "574;395;207;578", "wc_review": "685;456;319;732", "wc_reply_reviewers": "592;0;164;0", "wc_reply_authors": "961;388;638;674", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 71.75, 26.05163142684158 ], "wc_summary_review_avg": [ 37.75, 18.430613120566555 ], "wc_main_review_avg": [ 438.5, 152.72933575446467 ], "wc_review_avg": [ 548.0, 168.47106576501497 ], "wc_reply_reviewers_avg": [ 189.0, 242.11360969594418 ], "wc_reply_authors_avg": [ 665.25, 203.19617983613767 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11904454713645520306&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Huawei;University of Hong Kong", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;https://www.hku.hk", "aff_unique_abbr": "Huawei;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "qfLJBJf_DnH", "title": "Brain insights improve RNNs' accuracy and robustness for hierarchical control of continually learned autonomous motor motifs", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the problem of learning dynamics that can produce hierarchically organized continuous outputs consisting of the flexible chaining of re-usable motor \u2018motifs\u2019 from which complex behavior is generated. 
Can a motif library be efficiently and extendably learned without interference between motifs, and can these motifs be chained in arbitrary orders without first learning the corresponding motif transitions during training? This requires (i) parameter updates while learning a new motif that do not interfere with the parameters used for the previously acquired ones; and (ii) successful motif generation when starting from the network states reached at the end of any of the other motifs, even if these states were not present during training (a case of out-of-distribution generalization). We meet the first requirement by designing recurrent neural networks (RNNs) with specific architectures that segregate motif-dependent parameters (as customary in continual learning works), and try a standard method to address the second by training with random initial states. We find that these standard RNNs are very unreliable during zero-shot transfer to motif chaining. We then use insights from the motor thalamocortical circuit, featuring a specific module that shapes motif transitions. We develop a method to constrain the RNNs to function similarly to the thalamocortical circuit during motif transitions, while preserving the large expressivity afforded by gradient-based training of non-analytically tractable RNNs. We then show that this thalamocortical inductive bias not only acts in synergy with gradient-descent RNN training to improve accuracy during in-training-distribution motif production, but also leads to zero-shot transfer to new motif chains with no performance cost. Besides proposing an efficient, robust and flexible RNN architecture, our results shed new light on the function of motor preparation in the brain.", "keywords": "neuroscience;dynamical systems;thalamocortical architecture;motor preparation;continual learning;hierarchical continuous motor control;out-of-distribution generalization;robustness", "primary_area": "", "supplementary_material": "", "author": "Laureline Logiaco;G Sean Escola", "authorids": "~Laureline_Logiaco1;~G_Sean_Escola1", "gender": "F;", "homepage": "https://www.researchgate.net/profile/Laureline-Logiaco;https://seanslice.github.io/", "dblp": ";", "google_scholar": "NFQh44IAAAAJ;0kkWrRoAAAAJ", "orcid": "0000-0001-5252-7662;", "linkedin": ";", "or_profile": "~Laureline_Logiaco1;~G_Sean_Escola1", "aff": "Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu", "position": "Postdoc;Assistant Professor", "bibtex": "@misc{\nlogiaco2022brain,\ntitle={Brain insights improve {RNN}s' accuracy and robustness for hierarchical control of continually learned autonomous motor motifs},\nauthor={Laureline Logiaco and G Sean Escola},\nyear={2022},\nurl={https://openreview.net/forum?id=qfLJBJf_DnH}\n}", "github": "", "project": "", "reviewers": "W1np;MvgU;BooE;hBJi", "site": "https://openreview.net/forum?id=qfLJBJf_DnH", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;2", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "110;200;21;59", "wc_summary_review": "67;22;5;62", "wc_main_review": "857;761;77;101", "wc_review": "1034;983;103;222", "wc_reply_reviewers": "0;32;0;0", "wc_reply_authors": "1223;1172;484;308", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], 
"empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 97.5, 67.07644892210678 ], "wc_summary_review_avg": [ 39.0, 26.258332011001766 ], "wc_main_review_avg": [ 449.0, 361.69600495443683 ], "wc_review_avg": [ 585.5, 425.4694466116222 ], "wc_reply_reviewers_avg": [ 8.0, 13.856406460551018 ], "wc_reply_authors_avg": [ 796.75, 405.9528143762524 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:A3qdvt25LqoJ:scholar.google.com/&scioq=Brain+insights+improve+RNNs%27+accuracy+and+robustness+for+hierarchical+control+of+continually+learned+autonomous+motor+motifs&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "qfaNCudAnji", "title": "Deep Q-Network with Proximal Iteration", "track": "main", "status": "Reject", "tldr": "", "abstract": "We employ Proximal Iteration for value-function optimization in reinforcement learning. Proximal Iteration is a computationally efficient technique that enables us to bias the optimization procedure towards more desirable solutions. As a concrete application of Proximal Iteration in deep reinforcement learning, we endow the objective function of the Deep Q-Network (DQN) agent with a proximal term to ensure that the online-network component of DQN remains in the vicinity of the target network. The resultant agent, which we call DQN with Proximal Iteration, or DQNPro, exhibits significant improvements over the original DQN on the Atari benchmark. 
Our results accentuate the power of employing sound optimization techniques for deep reinforcement learning.", "keywords": "reinforcement learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Kavosh Asadi;Rasool Fakoor;Omer Gottesman;Michael Littman;Alex Smola", "authorids": "~Kavosh_Asadi1;~Rasool_Fakoor1;~Omer_Gottesman1;~Michael_Littman1;~Alex_Smola1", "gender": ";M;M;M;M", "homepage": "http://cs.brown.edu/~kasadiat/;http://rasoolfa.github.io;https://omergott.github.io/;http://www.cs.brown.edu/~mlittman;http://alex.smola.org", "dblp": "192/1404;123/2447;;http://dblp.uni-trier.de/pers/hd/l/Littman:Michael_L=;s/AlexanderJSmola", "google_scholar": "-2qyBJEAAAAJ;nVsOPtQAAAAJ;glNJx5zYUbsC;Jj00ksMAAAAJ;Tb0ZrYwAAAAJ", "orcid": ";;;0000-0002-5596-1840;", "linkedin": ";rasool-fakoor-695b5845/;;michael-littman-b26351/;smola", "or_profile": "~Kavosh_Asadi1;~Rasool_Fakoor1;~Omer_Gottesman1;~Michael_Littman1;~Alex_Smola1", "aff": "Amazon;Amazon Web Services;Brown University;Georgia Institute of Technology;Amazon", "aff_domain": "amazon.com;amazon.com;brown.edu;gatech.edu;amazon.com", "position": "Researcher;Researcher;Postdoc;Adjunct;Distinguished Scientist", "bibtex": "@misc{\nasadi2022deep,\ntitle={Deep Q-Network with Proximal Iteration},\nauthor={Kavosh Asadi and Rasool Fakoor and Omer Gottesman and Michael Littman and Alex Smola},\nyear={2022},\nurl={https://openreview.net/forum?id=qfaNCudAnji}\n}", "github": "", "project": "", "reviewers": "UNLa;xGvm;ULKo;rd5C", "site": "https://openreview.net/forum?id=qfaNCudAnji", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;5;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "0;3;2;0", "wc_summary_paper": "49;34;43;38", "wc_summary_review": "34;48;41;162", "wc_main_review": "128;158;239;926", "wc_review": "211;240;323;1126", "wc_reply_reviewers": "88;327;55;295", "wc_reply_authors": "133;78;190;190", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 41.0, 5.612486080160912 ], "wc_summary_review_avg": [ 71.25, 52.6278205894943 ], "wc_main_review_avg": [ 362.75, 327.71738968202465 ], "wc_review_avg": [ 475.0, 378.0958873090264 ], "wc_reply_reviewers_avg": [ 191.25, 120.84778649193373 ], "wc_reply_authors_avg": [ 147.75, 46.51007955271631 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8707837867292786113&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Amazon;Brown University;Georgia Institute of Technology", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.brown.edu;https://www.gatech.edu", "aff_unique_abbr": "Amazon;Brown;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "qgVYxyz2p7W", "title": "S2C2 - An orthogonal method for Semi-Supervised Learning on ambiguous labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": 
"Semi-Supervised Learning (SSL) can decrease the required amount of labeled image data and thus the cost for deep learning. Most SSL methods assume a clear distinction between classes, but class boundaries are often ambiguous in real-world datasets due to intra- or interobserver variability. This ambiguity of annotations must be addressed as it will otherwise limit the performance of SSL and deep learning in general due to inconsistent label information. We propose SemiSupervised Classification & Clustering (S2C2) which can extend many deep SSL algorithms. S2C2 automatically estimates the ambiguity of an image and applies the respective SSL algorithm as a classification to certainly labeled data while partitioning the ambiguous data into clusters of visual similar images. We show that S2C2 results in a 7.6% better F1-score for classifications and 7.9% lower inner distance of clusters on average across multiple SSL algorithms and datasets. Moreover, the output of S2C2 can be used to decrease the ambiguity of labels with the help of human experts. Overall, a combination of Semi-Supervised Learning with our method S2C2 leads to better handling of ambiguous labels and thus realworld datasets.", "keywords": "Semi-Supervised;Data-Centric;Clustering;Classification", "primary_area": "", "supplementary_material": "/attachment/e511090fa9fb650d6b384d775b5a2adae49a5fe6.zip", "author": "Lars Schmarje;Monty Santarossa;Simon-Martin Schr\u00f6der;Claudius Zelenka;Rainer Kiko;Jenny Stracke;Nina Volkmann;Reinhard Koch", "authorids": "~Lars_Schmarje1;~Monty_Santarossa1;sms@informatik.uni-kiel.de;~Claudius_Zelenka1;~Rainer_Kiko1;jenny.stracke@tiho-hannover.de;~Nina_Volkmann1;~Reinhard_Koch1", "gender": "M;M;;;M;;F;M", "homepage": ";;;https://www.uni-kiel.de/de/person/zelenka-claudius-46803;;;;https://www.mip.informatik.uni-kiel.de/en/team/prof.-dr.-ing.-reinhard-koch", "dblp": ";;;152/4990;;;;55/6577", "google_scholar": ";;;m5vXg2sAAAAJ;https://scholar.google.de/citations?user=5cGQZcYAAAAJ;;;https://scholar.google.de/citations?user=xgjKDqAAAAAJ", "orcid": "0000-0002-6945-5957;0000-0002-4159-1367;;0000-0002-9902-2212;;;0000-0003-2870-9954;", "linkedin": ";;;;;;;", "or_profile": "~Lars_Schmarje1;~Monty_Santarossa1;sms@informatik.uni-kiel.de;~Claudius_Zelenka1;~Rainer_Kiko1;jenny.stracke@tiho-hannover.de;~Nina_Volkmann1;~Reinhard_Koch1", "aff": "Multimedia Information Processing Group;Christian-Albrechts-Universit\u00e4t zu Kiel;;Christian-Albrechts-Universit\u00e4t Kiel;ISIR, UMR 7222;;University of Veterinary Medicine Hannover;Christian-Albrechts-Universitat Kiel", "aff_domain": "informatik.uni-kiel.de;uni-kiel.de;;uni-kiel.de;sorbonne-universite.fr;;tiho.de;uni-kiel.de", "position": "PhD student;PhD student;;Lecturer;Researcher;;Researcher;Professor", "bibtex": "@misc{\nschmarje2022sc,\ntitle={S2C2 - An orthogonal method for Semi-Supervised Learning on ambiguous labels},\nauthor={Lars Schmarje and Monty Santarossa and Simon-Martin Schr{\\\"o}der and Claudius Zelenka and Rainer Kiko and Jenny Stracke and Nina Volkmann and Reinhard Koch},\nyear={2022},\nurl={https://openreview.net/forum?id=qgVYxyz2p7W}\n}", "github": "", "project": "", "reviewers": "oxLT;veiL;7nHk;QFZw", "site": "https://openreview.net/forum?id=qgVYxyz2p7W", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;3", "correctness": "4;2;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "69;62;160;32", "wc_summary_review": "10;22;92;11", "wc_main_review": "288;415;461;319", "wc_review": 
"367;499;713;362", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "324;207;234;360", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 80.75, 47.819321408819675 ], "wc_summary_review_avg": [ 33.75, 33.95861451826325 ], "wc_main_review_avg": [ 370.75, 70.05132047292184 ], "wc_review_avg": [ 485.25, 142.50679808345987 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 281.25, 62.79878581628788 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16836224135106112416&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;1", "aff_unique_norm": "Multimedia Information Processing Group;Christian-Albrechts-Universit\u00e4t zu Kiel;Christian-Albrechts-Universit\u00e4t;Institut des Sciences de l'Ing\u00e9nierie de Robotique;University of Veterinary Medicine Hannover", "aff_unique_dep": "Department of Multimedia Information Processing;;;UMR 7222;", "aff_unique_url": ";https://www.uni-kiel.de;https://www.uni-kiel.de;https://www.isir.upmc.fr;https://www.tiho-hannover.de", "aff_unique_abbr": ";CAU;CAU;ISIR;TiHo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Kiel", "aff_country_unique_index": "1;1;2;1;1", "aff_country_unique": ";Germany;France" }, { "title": "Generative Models as a Data Source for Multiview Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6339", "id": "qhAeZjs7dCL", "poster": "", "openreview": "https://openreview.net/forum?id=qhAeZjs7dCL", "slides": "https://iclr.cc/virtual/2022/poster/6339", "video": "https://iclr.cc/virtual/2022/poster/6339", "author_site": "Ali Jahanian, Xavier Puig, Yonglong Tian, Phillip Isola", "tldr": "", "abstract": "Generative models are now capable of producing highly realistic images that look nearly indistinguishable from the data on which they are trained. This raises the question: if we have good enough generative models, do we still need datasets? We investigate this question in the setting of learning general-purpose visual representations from a black-box generative model rather than directly from data. Given an off-the-shelf image generator without any access to its training data, we train representations from the samples output by this generator. We compare several representation learning methods that can be applied to this setting, using the latent space of the generator to generate multiple \"views\" of the same semantic content. We show that for contrastive methods, this multiview data can naturally be used to identify positive pairs (nearby in latent space) and negative pairs (far apart in latent space). We find that the resulting representations rival or even outperform those learned directly from real data, but that good performance requires care in the sampling strategy applied and the training method. 
Generative models can be viewed as a compressed and organized copy of a dataset, and we envision a future where more and more \"model zoos\" proliferate while datasets become increasingly unwieldy, missing, or private. This paper suggests several techniques for dealing with visual representation learning in such a future. Code is available on our project page https://ali-design.github.io/GenRep/.", "keywords": "Generative models;GANs;Contrastive Learning;Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Ali Jahanian;Xavier Puig;Yonglong Tian;Phillip Isola", "authorids": "~Ali_Jahanian1;~Xavier_Puig1;~Yonglong_Tian1;~Phillip_Isola1", "gender": "M;M;;M", "homepage": "http://people.csail.mit.edu/jahanian/;https://people.csail.mit.edu/xavierpuig/;http://people.csail.mit.edu/yonglong/;http://web.mit.edu/phillipi/", "dblp": ";50/8429;151/6328;36/9988", "google_scholar": "nMpyjcwAAAAJ;;https://scholar.google.com.hk/citations?user=OsP7JHAAAAAJ;ROILf3EAAAAJ", "orcid": ";;;0000-0002-1411-6704", "linkedin": ";;;phillip-isola-a9955b20/", "or_profile": "~Ali_Jahanian1;~Xavier_Puig1;~Yonglong_Tian1;~Phillip_Isola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu", "position": "Research Scientist;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\njahanian2022generative,\ntitle={Generative Models as a Data Source for Multiview Representation Learning},\nauthor={Ali Jahanian and Xavier Puig and Yonglong Tian and Phillip Isola},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qhAeZjs7dCL}\n}", "github": "", "project": "", "reviewers": "P9Ys;ZFjV;ZWWA;pxGz", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;4;3;4", "wc_summary_paper": "84;72;72;64", "wc_summary_review": "321;73;101;14", "wc_main_review": "360;130;453;336", "wc_review": "765;275;626;414", "wc_reply_reviewers": "396;0;29;0", "wc_reply_authors": "1384;127;573;193", "reply_reviewers": "2;0;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 73.0, 7.14142842854285 ], "wc_summary_review_avg": [ 127.25, 116.18600389031374 ], "wc_main_review_avg": [ 319.75, 117.94569725089593 ], "wc_review_avg": [ 520.0, 188.76043017539453 ], "wc_reply_reviewers_avg": [ 106.25, 167.70565732854692 ], "wc_reply_authors_avg": [ 569.25, 500.2451274125516 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 158, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13492462163020342656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=qhAeZjs7dCL", "email": "mit.edu;mit.edu;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "CrossBeam: Learning to Search in Bottom-Up Program Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7036", "id": "qhC8mr2LEKq", "poster": "", "openreview": "https://openreview.net/forum?id=qhC8mr2LEKq", "slides": "https://iclr.cc/virtual/2022/poster/7036", "video": "https://iclr.cc/virtual/2022/poster/7036", "author_site": "Kensen Shi, Hanjun Dai, Kevin Ellis, Charles Sutton", "tldr": "", "abstract": "Many approaches to program synthesis perform a search within an enormous space of programs to find one that satisfies a given specification. Prior works have used neural models to guide combinatorial search algorithms, but such approaches still explore a huge portion of the search space and quickly become intractable as the size of the desired program increases. To tame the search space blowup, we propose training a neural model to learn a hands-on search policy for bottom-up synthesis, instead of relying on a combinatorial search algorithm. Our approach, called CrossBeam, uses the neural model to choose how to combine previously-explored programs into new programs, taking into account the search history and partial program executions. Motivated by work in structured prediction on learning to search, CrossBeam is trained on-policy using data extracted from its own bottom-up searches on training tasks. We evaluate CrossBeam in two very different domains, string manipulation and logic programming. We observe that CrossBeam learns to search efficiently, exploring much smaller portions of the program space compared to the state-of-the-art.\n", "keywords": "Program Synthesis;Bottom-Up Search", "primary_area": "", "supplementary_material": "", "author": "Kensen Shi;Hanjun Dai;Kevin Ellis;Charles Sutton", "authorids": "~Kensen_Shi1;~Hanjun_Dai1;~Kevin_Ellis1;~Charles_Sutton1", "gender": "M;M;M;M", "homepage": ";https://hanjun-dai.github.io;https://www.cs.cornell.edu/~ellisk/;http://homepages.inf.ed.ac.uk/csutton/", "dblp": "135/8307;144/7311;;59/5879", "google_scholar": "LAL4SIMAAAAJ;obpl7GQAAAAJ;L7XI6asAAAAJ;https://scholar.google.co.uk/citations?user=hYtGXD0AAAAJ", "orcid": "0000-0001-7140-7869;;;0000-0002-0041-3820", "linkedin": ";hanjun-dai;;charles-sutton-772aa126", "or_profile": "~Kensen_Shi1;~Hanjun_Dai1;~Kevin_Ellis1;~Charles_Sutton1", "aff": "Google;Google Research;Cornell University;University of Edinburgh", "aff_domain": "google.com;google.com;cornell.edu;ed.ac.uk", "position": "Software Engineer;Researcher;Assistant Professor;Professor", "bibtex": "@inproceedings{\nshi2022crossbeam,\ntitle={CrossBeam: Learning to Search in Bottom-Up Program Synthesis},\nauthor={Kensen Shi and Hanjun Dai and Kevin Ellis and Charles Sutton},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qhC8mr2LEKq}\n}", "github": "", "project": "", "reviewers": "cqKb;D2Hc;tHmt;Q8M3", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;5;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;3;0", "wc_summary_paper": "141;67;88;124", "wc_summary_review": "90;91;24;15", "wc_main_review": "417;549;220;260", "wc_review": "648;707;332;399", "wc_reply_reviewers": "0;65;11;0", "wc_reply_authors": "1064;400;365;220", "reply_reviewers": "0;1;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], 
"confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 105.0, 29.111853256019273 ], "wc_summary_review_avg": [ 55.0, 35.644073841243234 ], "wc_main_review_avg": [ 361.5, 130.92077757178194 ], "wc_review_avg": [ 521.5, 159.16108192645586 ], "wc_reply_reviewers_avg": [ 19.0, 26.93510720231126 ], "wc_reply_authors_avg": [ 512.25, 325.62276870636674 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14342383468818615250&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=qhC8mr2LEKq", "email": "google.com;google.com;cornell.edu;ed.ac.uk", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Google;Cornell University;University of Edinburgh", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.cornell.edu;https://www.ed.ac.uk", "aff_unique_abbr": "Google;Cornell;Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Can an Image Classifier Suffice For Action Recognition?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6047", "id": "qhkFX-HLuHV", "poster": "", "openreview": "https://openreview.net/forum?id=qhkFX-HLuHV", "slides": "https://iclr.cc/virtual/2022/poster/6047", "video": "https://iclr.cc/virtual/2022/poster/6047", "author_site": "Quanfu Fan, Chun-Fu (Richard) Chen, Rameswar Panda", "tldr": "", "abstract": "We explore a new perspective on video understanding by casting the video recognition problem as an image recognition task. Our approach rearranges input video frames into super images, which allow for training an image classifier directly to fulfill the task of action recognition, in exactly the same way as image classification. With such a simple idea, we show that transformer-based image classifiers alone can suffice for action recognition. In particular, our approach demonstrates strong and promising performance against SOTA methods on several public datasets including Kinetics400, Moments In Time, Something-Something V2 (SSV2), Jester and Diving48. We also experiment with the prevalent ResNet image classifiers in computer vision to further validate our idea. The results on both Kinetics400 and SSV2 are comparable to some of the best-performed CNN approaches based on spatio-temporal modeling. 
Our source codes and models are available at \\url{https://github.com/IBM/sifar-pytorch}.", "keywords": "action recognition;image classifier;super image;vision transformer", "primary_area": "", "supplementary_material": "", "author": "Quanfu Fan;Chun-Fu Chen;Rameswar Panda", "authorids": "~Quanfu_Fan1;~Chun-Fu_Chen1;~Rameswar_Panda1", "gender": "M;M;M", "homepage": ";;https://rpand002.github.io/", "dblp": "66/3950;48/915;126/0986", "google_scholar": "kCxHiwUAAAAJ;9gqd5cYAAAAJ;_ySuu6gAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Quanfu_Fan1;~Chun-Fu_Chen1;~Rameswar_Panda1", "aff": "MIT-IBM Watson AI Lab;JPMorganChase, GTAR;MIT-IBM Watson AI Lab", "aff_domain": "us.ibm.com;jpmchase.com;ibm.com", "position": "Researcher;Executive Director;Research Scientist", "bibtex": "@inproceedings{\nfan2022can,\ntitle={Can an Image Classifier Suffice For Action Recognition?},\nauthor={Quanfu Fan and Chun-Fu Chen and Rameswar Panda},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qhkFX-HLuHV}\n}", "github": "", "project": "", "reviewers": "utfq;K68r;7Wf5;JuKq", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "5;3;3;4", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "61;216;101;34", "wc_summary_review": "44;57;56;31", "wc_main_review": "311;413;237;72", "wc_review": "416;686;394;137", "wc_reply_reviewers": "0;188;17;0", "wc_reply_authors": "498;456;245;31", "reply_reviewers": "0;2;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 103.0, 69.45862077525007 ], "wc_summary_review_avg": [ 47.0, 10.559356040971437 ], "wc_main_review_avg": [ 258.25, 124.36915815426266 ], "wc_review_avg": [ 408.25, 194.2837808464721 ], "wc_reply_reviewers_avg": [ 51.25, 79.25709747398021 ], "wc_reply_authors_avg": [ 307.5, 186.2129157711677 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6764800425144348390&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=qhkFX-HLuHV", "email": "us.ibm.com;jpmchase.com;ibm.com", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;JPMorgan Chase", "aff_unique_dep": "IBM Watson AI Lab;Global Technology, Analytics, and Research (GTAR)", "aff_unique_url": "https://www.mitibmwatsonailab.org;https://www.jpmorganchase.com", "aff_unique_abbr": "MIT-IBM AI Lab;JPM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "qhqxE0z3r3y", "title": "A Decidability-Based Loss Function", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Computer vision problems often use deep learning models to extract features from images, also known as embeddings. 
\nMoreover, the loss function used during training strongly influences the quality of the generated embeddings.\nIn this work, a loss function based on the decidability index is proposed to improve the quality of embeddings for the verification routine.\nOur proposal, the D-loss, avoids some Triplet-based loss disadvantages such as the use of hard samples and tricky parameter tuning, which can lead to slow convergence.\nThe proposed approach is compared against the Softmax (cross-entropy), Triplets Soft-Hard, and the Multi Similarity losses in four different benchmarks: MNIST, Fashion-MNIST, CIFAR10 and CASIA-IrisV4.\nThe achieved results show the efficacy of the proposal when compared to other popular metrics in the literature. The D-loss computation, besides being simple, non-parametric and easy to implement, favors both the inter-class and intra-class scenarios. Our code will be available at GitHub. ", "keywords": "Metric Learning;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/4cb8d7b949c901e791a1bd260d13c3158e971c1d.zip", "author": "Pedro Silva;Gladston Moreira;Vander Freitas;Rodrigo Silva;David Menotti;Eduardo Luz", "authorids": "~Pedro_Silva2;gladston@ufop.edu.br;vander.freitas@ufop.edu.br;rodrigo.silva@ufop.edu.br;menottid@gmail.com;eduluz@ufop.edu.br", "gender": "M;;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": "https://scholar.google.com.br/citations?user=JZt9ClYAAAAJ;;;;;", "orcid": "0000-0002-5525-6121;;;;;", "linkedin": "pedro-silva-957a35125/;;;;;", "or_profile": "~Pedro_Silva2;gladston@ufop.edu.br;vander.freitas@ufop.edu.br;rodrigo.silva@ufop.edu.br;menottid@gmail.com;eduluz@ufop.edu.br", "aff": "Universidade Federal de Ouro Preto;;;;;", "aff_domain": "ufop.br;;;;;", "position": "Assistant Professor;;;;;", "bibtex": "@misc{\nsilva2022a,\ntitle={A Decidability-Based Loss Function},\nauthor={Pedro Silva and Gladston Moreira and Vander Freitas and Rodrigo Silva and David Menotti and Eduardo Luz},\nyear={2022},\nurl={https://openreview.net/forum?id=qhqxE0z3r3y}\n}", "github": "", "project": "", "reviewers": "MLru;jU1S;xbTX;r9yX", "site": "https://openreview.net/forum?id=qhqxE0z3r3y", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "4;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "49;36;51;19", "wc_summary_review": "39;17;13;36", "wc_main_review": "530;140;182;241", "wc_review": "618;193;246;296", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 38.75, 12.774486291041217 ], "wc_summary_review_avg": [ 26.25, 11.388041973930374 ], "wc_main_review_avg": [ 273.25, 152.51454848636573 ], "wc_review_avg": [ 338.25, 165.56928308113194 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kmCPKHuZ4TkJ:scholar.google.com/&scioq=A+Decidability-Based+Loss+Function&hl=en&as_sdt=0,33", "gs_version_total": 6, "aff_unique_index": "0", "aff_unique_norm": "Universidade Federal de Ouro Preto", "aff_unique_dep": 
"", "aff_unique_url": "https://www.ufop.edu.br", "aff_unique_abbr": "UFOP", "aff_country_unique_index": "0", "aff_country_unique": "Brazil" }, { "id": "qiBTPIoQ0lz", "title": "Improving OOD Generalization with Causal Invariant Transformations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In real-world applications, it is important and desirable to learn a model that performs well on out-of-distribution (OOD) data. Recently, causality has become a powerful tool to tackle the OOD generalization problem, with the core idea resting on the causal mechanism that is invariant across the domains of interest. To leverage the generally unknown causal mechanism, existing works assume the linear form of causal feature or require sufficiently many and diverse training domains, which are usually restrictive in practice. In this work, we obviate these assumptions and tackle the OOD problem without explicitly recovering the causal feature. Our approach is based on transformations that modify the non-causal feature but leave the causal part unchanged, which can be either obtained from prior knowledge or learned from the training data. Under the setting of invariant causal mechanism, we theoretically show that if all such transformations are available, then we can learn a minimax optimal model across the domains using only single domain data. Noticing that knowing a complete set of these causal invariant transformations may be impractical, we further show that it suffices to know only an appropriate subset of these transformations. Based on the theoretical findings, a regularized training procedure is proposed to improve the OOD generalization capability. Extensive experimental results on both synthetic and real datasets verify the effectiveness of the proposed algorithm, even with only a few causal invariant transformations.", "keywords": "domain generalization;minimax problem;structural model", "primary_area": "", "supplementary_material": "", "author": "Ruoyu Wang;Mingyang Yi;Shengyu Zhu;Zhitang Chen", "authorids": "~Ruoyu_Wang2;~Mingyang_Yi1;~Shengyu_Zhu1;~Zhitang_Chen1", "gender": "M;M;M;M", "homepage": ";http://mingyangyi.github.io;https://zhushyu.github.io/;", "dblp": "278/6323;;131/6555;06/10875", "google_scholar": "1mO8fMgAAAAJ;RlOZiPUAAAAJ;;", "orcid": "0000-0002-4561-2954;;;", "linkedin": ";;;", "or_profile": "~Ruoyu_Wang2;~Mingyang_Yi1;~Shengyu_Zhu1;~Zhitang_Chen1", "aff": "Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Huawei Noah's Ark Lab;Huawei Technologies Ltd.", "aff_domain": "amss.ac.cn;amss.ac.cn;huawei.com;huawei.com", "position": "PhD student;PhD student;Principal Researcher;Researcher", "bibtex": "@misc{\nwang2022improving,\ntitle={Improving {OOD} Generalization with Causal Invariant Transformations},\nauthor={Ruoyu Wang and Mingyang Yi and Shengyu Zhu and Zhitang Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=qiBTPIoQ0lz}\n}", "github": "", "project": "", "reviewers": "GEhg;Cqpm;oMtJ;2br8", "site": "https://openreview.net/forum?id=qiBTPIoQ0lz", "pdf_size": 0, "recommendation": "1;3;5;8", "confidence": "5;5;3;3", "correctness": "3;2;3;4", "technical_novelty": "1;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "77;120;113;94", "wc_summary_review": "104;72;38;47", "wc_main_review": "745;664;213;401", "wc_review": "926;856;364;542", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": 
"0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 2.5860201081971503 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 101.0, 16.80773631397161 ], "wc_summary_review_avg": [ 65.25, 25.606395685453272 ], "wc_main_review_avg": [ 505.75, 211.51758201151978 ], "wc_review_avg": [ 672.0, 229.2029668219851 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8700628401410974, "corr_recommendation_correctness": 0.6835859270246631, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PabYirMMkbEJ:scholar.google.com/&scioq=Improving+OOD+Generalization+with+Causal+Invariant+Transformations&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Chinese Academy of Sciences;Huawei", "aff_unique_dep": "Academy of Mathematics and Systems Science;Noah's Ark Lab", "aff_unique_url": "http://www.cas.cn;https://www.huawei.com", "aff_unique_abbr": "CAS;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "How unlabeled data improve generalization in self-training? A one-hidden-layer theoretical analysis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6549", "id": "qiMXBIf4NfB", "poster": "", "openreview": "https://openreview.net/forum?id=qiMXBIf4NfB", "slides": "https://iclr.cc/virtual/2022/poster/6549", "video": "https://iclr.cc/virtual/2022/poster/6549", "author_site": "shuai ZHANG, Meng Wang, Sijia Liu, Pin-Yu Chen, Jinjun Xiong", "tldr": "", "abstract": "Self-training, a semi-supervised learning algorithm, leverages a large amount of unlabeled data to improve learning when the labeled data are limited. Despite empirical successes, its theoretical characterization remains elusive. To the best of our knowledge, this work establishes the first theoretical analysis for the known iterative self-training paradigm and formally proves the benefits of unlabeled data in both training convergence and generalization ability. To make our theoretical analysis feasible, we focus on the case of one-hidden-layer neural networks. However, theoretical understanding of iterative self-training is non-trivial even for a shallow neural network. One of the key challenges is that existing neural network landscape analysis built upon supervised learning no longer holds in the (semi-supervised) self-training paradigm. We address this challenge and prove that iterative self-training converges linearly with both convergence rate and generalization accuracy improved in the order of $1/\\sqrt{M}$, where $M$ is the number of unlabeled samples. 
Extensive experiments from shallow neural networks to deep neural networks are also provided to justify the correctness of our established theoretical insights on self-training.", "keywords": "Self-training;Semi-supervised learning;Convergence analysis;Generalization analysis", "primary_area": "", "supplementary_material": "/attachment/781bd11dc934b1794750fea689c2b3276839f5d4.zip", "author": "Shuai Zhang;Meng Wang;Sijia Liu;Pin-Yu Chen;Jinjun Xiong", "authorids": "~Shuai_Zhang6;~Meng_Wang4;~Sijia_Liu1;~Pin-Yu_Chen1;~Jinjun_Xiong1", "gender": "M;F;M;M;", "homepage": "https://inchs708.github.io/shuaizhang.github.io/index.html;https://www.ecse.rpi.edu/~wang/index.html;https://lsjxjtu.github.io/;http://www.pinyuchen.com;https://www.xlab-ub.com", "dblp": "71/208-15;93/6765-3;128/6972-1;39/8969;81/1130", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;C7dO_UgAAAAJ;jxwlCUUAAAAJ;tRt1xPYAAAAJ", "orcid": "0000-0001-8280-6988;;;0000-0003-1039-8369;0000-0002-2620-4859", "linkedin": ";;;pin-yu-chen-940062a2;jinjun-xiong-314774/", "or_profile": "~Shuai_Zhang6;~Meng_Wang4;~Sijia_Liu1;~Pin-Yu_Chen1;~Jinjun_Xiong1", "aff": "Rensselaer Polytechnic Institute;Rensselaer Polytechnic Institute;Michigan State University;International Business Machines;State University of New York at Buffalo", "aff_domain": "rpi.edu;rpi.edu;msu.edu;ibm.com;buffalo.edu", "position": "Postdoc;Associate Professor;Assistant Professor;Research Staff Member;Professor", "bibtex": "@inproceedings{\nzhang2022how,\ntitle={How unlabeled data improve generalization in self-training? A one-hidden-layer theoretical analysis},\nauthor={Shuai Zhang and Meng Wang and Sijia Liu and Pin-Yu Chen and Jinjun Xiong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qiMXBIf4NfB}\n}", "github": "", "project": "", "reviewers": "uV34;7chM;LAdw;SoSg", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;3;3", "correctness": "2;3;3;4", "technical_novelty": "3;3;2;4", "empirical_novelty": "3;3;1;3", "wc_summary_paper": "69;66;83;469", "wc_summary_review": "51;10;81;61", "wc_main_review": "646;94;582;1843", "wc_review": "766;170;746;2373", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 171.75, 171.73726299204841 ], "wc_summary_review_avg": [ 50.75, 25.8879798362097 ], "wc_main_review_avg": [ 791.25, 643.6650429377069 ], "wc_review_avg": [ 1013.75, 820.4487720144384 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3684925465125024606&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "pdf": "https://openreview.net/pdf?id=qiMXBIf4NfB", "email": "rpi.edu;rpi.edu;msu.edu;ibm.com;buffalo.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "Rensselaer Polytechnic Institute;Michigan State University;International Business Machines Corporation;State University of New York at Buffalo", 
"aff_unique_dep": ";;;", "aff_unique_url": "https://www.rpi.edu;https://www.msu.edu;https://www.ibm.com;https://www.buffalo.edu", "aff_unique_abbr": "RPI;MSU;IBM;SUNY Buffalo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "qiSE7datah", "title": "Training Deep Neural Networks with Joint Quantization and Pruning of Features and Weights", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Quantization and pruning are widely used to reduce the inference costs of deep neural networks. In this work, we propose a framework to train deep neural networks using novel methods for uniform quantization and unstructured pruning on both the features and weights. We demonstrate that our method delivers an increased performance per memory footprint over existing state-of-the-art solutions. Using our framework, we empirically evaluate the prune-then-quantize paradigm and independence assumption across a wide range of computer vision tasks and observe the non-commutativity of quantization and pruning when applied to both features and weights.", "keywords": "Deep Learning;Quantization;Pruning;Feature Sparsity;Model Compression;Unstructured Feature Pruning", "primary_area": "", "supplementary_material": "/attachment/24f9e24499927e9af7f84232cd2866c8f023a5a4.zip", "author": "Xinyu Zhang;Ian Colbert;\u202aKen Kreutz-Delgado\u202c;Srinjoy Das", "authorids": "~Xinyu_Zhang7;icolbert@eng.ucsd.edu;kreutz@eng.ucsd.edu;srinjoy.das@mail.wvu.edu", "gender": "M;;;", "homepage": "https://mlzxy.github.io/;;;", "dblp": ";;;", "google_scholar": "M7hnG9oAAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xinyu_Zhang7;icolbert@eng.ucsd.edu;kreutz@eng.ucsd.edu;srinjoy.das@mail.wvu.edu", "aff": "Rutgers University;;;", "aff_domain": "rutgers.edu;;;", "position": "PhD student;;;", "bibtex": "@misc{\nzhang2022training,\ntitle={Training Deep Neural Networks with Joint Quantization and Pruning of Features and Weights},\nauthor={Xinyu Zhang and Ian Colbert and \u202aKen Kreutz-Delgado\u202c and Srinjoy Das},\nyear={2022},\nurl={https://openreview.net/forum?id=qiSE7datah}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=qiSE7datah", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IbKwXrqB-fEJ:scholar.google.com/&scioq=Training+Deep+Neural+Networks+with+Joint+Quantization+and+Pruning+of+Features+and+Weights&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", 
"aff_unique_abbr": "Rutgers", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "qiukmqxQF6", "title": "LatTe Flows: Latent Temporal Flows for Multivariate Sequence Analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Latent Temporal Flows (\\emph{LatTe-Flows}), a method for probabilistic multivariate time-series analysis tailored for high dimensional systems whose temporal dynamics are driven by variations in a lower-dimensional discriminative subspace. We perform indirect learning from hidden traits of observed sequences by assuming that the random vector representing the data is generated from an unobserved low-dimensional latent vector. \\emph{LatTe-Flows} jointly learns auto-encoder mappings to a latent space and learns the temporal distribution of lower-dimensional embeddings of input sequences. Since encoder networks retain only the essential information to generate a latent manifold, the temporal distribution transitions can be more efficiently uncovered by time conditioned Normalizing Flows. The learned latent effects can then be directly transferred into the observed space through the decoder network. We demonstrate that the proposed method significantly outperforms the state-of-the-art on multi-step forecasting benchmarks, while enjoying reduced computational complexity on several real-world datasets. We apply {\\emph{LatTe-Flows}} to a challenging sensor-signal forecasting task, using multivariate time-series measurements collected by wearable devices, an increasingly relevant health application.\n", "keywords": "time-series analysis;multivariate time series forecasting;latent space;autoregressive models", "primary_area": "", "supplementary_material": "/attachment/6278130b0170dd6ede429a42bde1b9aa0e4da938.zip", "author": "Magda Amiridi;Gregory Darnell;Sean Jewell", "authorids": "~Magda_Amiridi1;~Gregory_Darnell1;~Sean_Jewell1", "gender": "F;M;M", "homepage": ";https://www.gregdarnell.com/;https://jewellsean.github.io", "dblp": ";96/11537;164/7303", "google_scholar": ";yok1ABYAAAAJ;V4cKeL4AAAAJ", "orcid": ";0000-0003-0425-940X;", "linkedin": "magda-amiridi-1a3198197/;gregory-darnell1/;", "or_profile": "~Magda_Amiridi1;~Gregory_Darnell1;~Sean_Jewell1", "aff": "University of Virginia;Apple;Apple", "aff_domain": "virginia.edu;apple.com;apple.com", "position": "PhD student;Research Scientist;Research Scientist", "bibtex": "@misc{\namiridi2022latte,\ntitle={LatTe Flows: Latent Temporal Flows for Multivariate Sequence Analysis },\nauthor={Magda Amiridi and Gregory Darnell and Sean Jewell},\nyear={2022},\nurl={https://openreview.net/forum?id=qiukmqxQF6}\n}", "github": "", "project": "", "reviewers": "kY2T;f9pX;s1RF", "site": "https://openreview.net/forum?id=qiukmqxQF6", "pdf_size": 0, "recommendation": "1;3;5", "confidence": "5;3;4", "correctness": "2;3;3", "technical_novelty": "1;1;2", "empirical_novelty": "1;2;3", "wc_summary_paper": "38;40;71", "wc_summary_review": "16;29;27", "wc_main_review": "90;140;305", "wc_review": "144;209;403", "wc_reply_reviewers": "0;210;0", "wc_reply_authors": "423;712;736", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 49.666666666666664, 15.107025591499546 ], 
"wc_summary_review_avg": [ 24.0, 5.715476066494082 ], "wc_main_review_avg": [ 178.33333333333334, 91.86342519680446 ], "wc_review_avg": [ 252.0, 110.02121007636057 ], "wc_reply_reviewers_avg": [ 70.0, 98.99494936611666 ], "wc_reply_authors_avg": [ 623.6666666666666, 142.2306421118725 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ywl1apWhR_gJ:scholar.google.com/&scioq=LatTe+Flows:+Latent+Temporal+Flows+for+Multivariate+Sequence+Analysis&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Virginia;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "https://www.virginia.edu;https://www.apple.com", "aff_unique_abbr": "UVA;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Real-Time Neural Voice Camouflage", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6986", "id": "qj1IZ-6TInc", "poster": "", "openreview": "https://openreview.net/forum?id=qj1IZ-6TInc", "slides": "https://iclr.cc/virtual/2022/poster/6986", "video": "https://iclr.cc/virtual/2022/poster/6986", "author_site": "Mia Chiquier, Chengzhi Mao, Carl Vondrick", "tldr": "", "abstract": "Automatic speech recognition systems have created exciting possibilities for applications, however they also enable opportunities for systematic eavesdropping.We propose a method to camouflage a person's voice from these systems without inconveniencing the conversation between people in the room. Standard adversarial attacks are not effective in real-time streaming situations because the characteristics of the signal will have changed by the time the attack is executed. We introduce predictive adversarial attacks, which achieves real-time performance by forecasting the attack vector that will be the most effective in the future. Under real-time constraints, our method jams the established speech recognition system DeepSpeech 3.9x more than online projected gradient descent as measured through word error rate, and 6.6x more as measured through character error rate. We furthermore demonstrate our approach is practically effective in realistic environments with complex scene geometries. 
", "keywords": "automatic speech recognition;predictive models;privacy", "primary_area": "", "supplementary_material": "/attachment/15a7fedce4527cfd5286ff5ab74da7a6be0319c1.zip", "author": "Mia Chiquier;Chengzhi Mao;Carl Vondrick", "authorids": "~Mia_Chiquier1;~Chengzhi_Mao2;~Carl_Vondrick2", "gender": "F;M;M", "homepage": "http://www.cs.columbia.edu/~mia.chiquier/;http://www.cs.columbia.edu/~mcz/;http://www.cs.columbia.edu/~vondrick/", "dblp": ";;26/8610", "google_scholar": ";pTTEiHUAAAAJ;3MzhkFIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mia_Chiquier1;~Chengzhi_Mao2;~Carl_Vondrick2", "aff": "Columbia University;Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;columbia.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nchiquier2022realtime,\ntitle={Real-Time Neural Voice Camouflage},\nauthor={Mia Chiquier and Chengzhi Mao and Carl Vondrick},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qj1IZ-6TInc}\n}", "github": "", "project": "", "reviewers": "z8ss;j1oS;9gbp", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;4;4", "correctness": "3;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "202;27;74", "wc_summary_review": "48;33;52", "wc_main_review": "468;242;307", "wc_review": "718;302;433", "wc_reply_reviewers": "137;38;0", "wc_reply_authors": "700;232;240", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 101.0, 73.95043385042895 ], "wc_summary_review_avg": [ 44.333333333333336, 8.178562764256865 ], "wc_main_review_avg": [ 339.0, 94.99824559783546 ], "wc_review_avg": [ 484.3333333333333, 173.6669865640048 ], "wc_reply_reviewers_avg": [ 58.333333333333336, 57.74849685393455 ], "wc_reply_authors_avg": [ 390.6666666666667, 218.75607928060472 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3493386016848549350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=qj1IZ-6TInc", "email": "columbia.edu;columbia.edu;columbia.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "GradMax: Growing Neural Networks using Gradient Information", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7131", "id": "qjN4h_wwUO", "poster": "", "openreview": "https://openreview.net/forum?id=qjN4h_wwUO", "slides": "https://iclr.cc/virtual/2022/poster/7131", "video": "https://iclr.cc/virtual/2022/poster/7131", "author_site": "Utku Evci, Bart van Merrienboer, Thomas Unterthiner, Fabian Pedregosa, Max Vladymyrov", "tldr": "", "abstract": "The architecture and the parameters of neural networks are often optimized independently, which 
requires costly retraining of the parameters whenever the architecture is modified. In this work we instead focus on growing the architecture without requiring costly retraining. We present a method that adds new neurons during training without impacting what is already learned, while improving the training dynamics. We achieve the latter by maximizing the gradients of the new weights and efficiently find the optimal initialization by means of the singular value decomposition (SVD). We call this technique Gradient Maximizing Growth (GradMax) and demonstrate its effectiveness in a variety of vision tasks and architectures. We open-sourced our code at https://github.com/google-research/growneuron", "keywords": "efficient training;efficient;computer vision;architecture search", "primary_area": "", "supplementary_material": "", "author": "Utku Evci;Bart van Merrienboer;Thomas Unterthiner;Fabian Pedregosa;Max Vladymyrov", "authorids": "~Utku_Evci1;~Bart_van_Merrienboer1;~Thomas_Unterthiner1;~Fabian_Pedregosa1;~Max_Vladymyrov1", "gender": ";M;;M;M", "homepage": "http://evcu.github.io;;;http://fa.bianp.net;https://max-vladymyrov.github.io/", "dblp": "179/8146;147/5356;https://dblp.uni-trier.de/pers/u/Unterthiner:Thomas;11/9764;116/3059", "google_scholar": "8yGMMwcAAAAJ;XE9SDzgAAAAJ;https://scholar.google.at/citations?user=QCARd5gAAAAJ;https://scholar.google.fr/citations?hl=en;pQZCrqcAAAAJ", "orcid": ";;;0000-0003-4025-3953;", "linkedin": ";;;http://www.linkedin.com/in/fabianpedregosa;max-vladymyrov-5803b711/", "or_profile": "~Utku_Evci1;~Bart_van_Merrienboer1;~Thomas_Unterthiner1;~Fabian_Pedregosa1;~Max_Vladymyrov1", "aff": "Google;University of Montreal;Google;Google AI;Google Research", "aff_domain": "google.com;umontreal.ca;google.com;google.com;google.com", "position": "Researcher;PhD student;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nevci2022gradmax,\ntitle={GradMax: Growing Neural Networks using Gradient Information},\nauthor={Utku Evci and Bart van Merrienboer and Thomas Unterthiner and Fabian Pedregosa and Max Vladymyrov},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qjN4h_wwUO}\n}", "github": "", "project": "", "reviewers": "LS3D;igLv;LUg4;1m2b", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;4;3;3", "empirical_novelty": "2;4;3;2", "wc_summary_paper": "68;244;23;69", "wc_summary_review": "18;67;27;33", "wc_main_review": "358;1027;54;192", "wc_review": "444;1338;104;294", "wc_reply_reviewers": "0;0;0;83", "wc_reply_authors": "625;1011;312;611", "reply_reviewers": "0;0;0;1", "reply_authors": "2;3;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 101.0, 84.62564623091512 ], "wc_summary_review_avg": [ 36.25, 18.538810641462412 ], "wc_main_review_avg": [ 407.75, 373.374058418632 ], "wc_review_avg": [ 545.0, 473.42686869251514 ], "wc_reply_reviewers_avg": [ 20.75, 35.94005425705421 ], "wc_reply_authors_avg": [ 639.75, 248.13844422015706 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, 
"gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11971978084540378903&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=qjN4h_wwUO", "email": "google.com;umontreal.ca;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Google;University of Montreal", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://wwwumontreal.ca", "aff_unique_abbr": "Google;UM", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "qkTEaJ9orc1", "title": "MOG: Molecular Out-of-distribution Generation with Energy-based Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent advances of deep generative models opened up a new horizon for de novo drug discovery. However, a well-known problem of existing works on molecule generation is that the generated molecules highly resemble those in the training set. Models that do not require training molecules such as RL-based models circumvent this problem, but they lack information about existing molecules. In this paper, we propose Molecular Out-of-distribution Generation (MOG), a novel framework that explicitly generates OOD molecules with respect to given molecules by combining two aspects of energy-based models (EBMs): generation and out-of-distribution (OOD) detection. This can be done by introducing multiple energy pivots to Langevin dynamics in generation and increase energy instead of minimizing it. We also utilize a property predictor to provide the property gradient of molecules to the modified Langevin dynamics. To validate the ability to explore the chemical space beyond the known molecular distribution, we experiment with MOG to generate molecules of high absolute values of docking score, which is the affinity score based on a physical binding simulation between a target protein and a given molecule. Docking score is a strong proxy to drug activity unlike penalized logP or QED and requires stronger exploration as it is nonlinear to local molecular structures and has many local optima. MOG is able to generate molecules with high docking scores compared to existing methods. 
Moreover, we further show the energy-increasing strategy based on EBMs can be universally applied to existing models and enhance their resulting novelty.", "keywords": "Drug Discovery;Molecule Generation;Energy-based Models", "primary_area": "", "supplementary_material": "", "author": "Seul Lee;Dong Bok Lee;Sung Ju Hwang", "authorids": "~Seul_Lee1;~Dong_Bok_Lee1;~Sung_Ju_Hwang1", "gender": "Not Specified;;", "homepage": "https://seullee05.github.io;;", "dblp": "159/0357;;", "google_scholar": "Ek0N9YYAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Seul_Lee1;~Dong_Bok_Lee1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;;", "aff_domain": "kaist.ac.kr;;", "position": "MS student;;", "bibtex": "@misc{\nlee2022mog,\ntitle={{MOG}: Molecular Out-of-distribution Generation with Energy-based Models},\nauthor={Seul Lee and Dong Bok Lee and Sung Ju Hwang},\nyear={2022},\nurl={https://openreview.net/forum?id=qkTEaJ9orc1}\n}", "github": "", "project": "", "reviewers": "NcwF;q9Gr;u7Vm;4ecH", "site": "https://openreview.net/forum?id=qkTEaJ9orc1", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;4;4", "correctness": "4;3;2;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "152;27;143;165", "wc_summary_review": "220;20;125;53", "wc_main_review": "850;224;131;249", "wc_review": "1222;271;399;467", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 121.75, 55.260180057614726 ], "wc_summary_review_avg": [ 104.5, 76.73493337457198 ], "wc_main_review_avg": [ 363.5, 284.30133661310845 ], "wc_review_avg": [ 589.75, 371.7508406177449 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2986286151886413271&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "id": "qkpR1lriAKA", "title": "Vicinal Counting Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We tackle the task of Few-Shot Counting. Given an image containing multiple objects of a novel visual category and few exemplar bounding boxes depicting the visual category of interest, we want to count all of the instances of the desired visual category in the image. A key challenge in building an accurate few-shot visual counter is the scarcity of annotated training data due to the laborious effort needed for collecting and annotating the data. To address this challenge, we propose Vicinal Counting Networks, which learn to augment the existing training data along with learning to count. A Vicinal Counting Network consists of a generator and a counting network. 
The generator takes as input an image along with a random noise vector and generates an augmented version of the input image. The counting network learns to count the objects in the original and augmented images. The training signal for the generator comes from the counting loss of the counting network, and the generator aims to synthesize images which result in a small counting loss. Unlike GANs which are trained in an adversarial setting, Vicinal Counting Networks are trained in a cooperative setting where the generator aims to help the counting network in achieving accurate predictions on the synthesized images. We also show that our proposed data augmentation framework can be extended to other counting tasks like crowd counting. Our code and trained model will be released for research usage.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/8331e0b12158628787e1ffe489cdb8aab4bcaec1.zip", "author": "Viresh Ranjan;Minh Hoai", "authorids": "~Viresh_Ranjan1;~Minh_Hoai2", "gender": "M;M", "homepage": ";https://minhhoai.net", "dblp": "155/3191;135/4935", "google_scholar": "https://scholar.google.co.in/citations?user=E2tKQCgAAAAJ;hRV0tY4AAAAJ", "orcid": ";0000-0002-2415-6048", "linkedin": ";", "or_profile": "~Viresh_Ranjan1;~Minh_Hoai_Nguyen1", "aff": ", State University of New York at Stony Brook;State University of New York, Stony Brook", "aff_domain": "cs.stonybrook.edu;stonybrook.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nranjan2022vicinal,\ntitle={Vicinal Counting Networks},\nauthor={Viresh Ranjan and Minh Hoai},\nyear={2022},\nurl={https://openreview.net/forum?id=qkpR1lriAKA}\n}", "github": "", "project": "", "reviewers": "igPn;QnvZ;F2Qw;YHYT", "site": "https://openreview.net/forum?id=qkpR1lriAKA", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "3;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "95;34;34;47", "wc_summary_review": "5;14;44;17", "wc_main_review": "247;105;214;248", "wc_review": "347;153;292;312", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 52.5, 25.104780421266383 ], "wc_summary_review_avg": [ 20.0, 14.543039572248986 ], "wc_main_review_avg": [ 203.5, 58.4914523669912 ], "wc_review_avg": [ 276.0, 73.69192628775556 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16590606392274104282&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1", "aff_unique_norm": "State University of New York at Stony Brook;State University of New York", "aff_unique_dep": ";", "aff_unique_url": "https://www.stonybrook.edu;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;SUNY Stony Brook", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "qmf56RZbzFJ", "title": 
"State-Only Imitation Learning by Trajectory Distribution Matching", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The best performing state-only imitation learning approaches are based on adversarial imitation learning. The main drawback, however, is that adversarial training is often unstable and lacks a reliable convergence estimator. When the true environment reward is unknown and cannot be used to select the best-performing model, this can result in bad real-world policy performance. We propose a non-adversarial learning-from-observations approach, with an interpretable convergence and performance metric. \n \nOur training objective minimizes the Kulback-Leibler divergence between the policy and expert state transition trajectories which can be optimized in a non-adversarial fashion. For this, additional density models estimate the expert state transition distribution and the environment's forward and backward dynamics. We demonstrate the effectiveness of our approach on well-known continuous control environments, where our method can generalize to expert performance. We demonstrate that our method and loss are better suited to select the best-performing policy compared to objectives from adversarial methods by being competitive to or outperforming the state-of-the-art learning-from-observation approach in these environments. \n", "keywords": "Imitation Learning;Normalising Flows;Learning from Observations;Density Models", "primary_area": "", "supplementary_material": "", "author": "Damian Boborzi;Christoph-Nikolas Straehle;Jens Stefan Buchner;Lars Mikelsons", "authorids": "~Damian_Boborzi1;~Christoph-Nikolas_Straehle1;~Jens_Stefan_Buchner1;~Lars_Mikelsons1", "gender": "M;;M;", "homepage": "https://www.uni-augsburg.de/de/fakultaet/fai/informatik/prof/imech/team/damian-boborzi/;;;https://www.uni-augsburg.de/de/fakultaet/fai/informatik/prof/imech/team/lars-mikelsons/", "dblp": "305/3045;;;76/4786", "google_scholar": ";;;https://scholar.google.de/citations?user=-5UzGXkAAAAJ", "orcid": "0000-0002-9986-1964;;;0009-0005-9006-9726", "linkedin": "damian-boborzi-6b0775163;;dr-jens-buchner-000a9b100;lars-mikelsons-8a20abb3/", "or_profile": "~Damian_Boborzi1;~Christoph-Nikolas_Straehle1;~Jens_Stefan_Buchner1;~Lars_Mikelsons1", "aff": "University of Augsburg;;;University of Augsburg", "aff_domain": "uni-augsburg.de;;;uni-augsburg.de", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\nboborzi2022stateonly,\ntitle={State-Only Imitation Learning by Trajectory Distribution Matching},\nauthor={Damian Boborzi and Christoph-Nikolas Straehle and Jens Stefan Buchner and Lars Mikelsons},\nyear={2022},\nurl={https://openreview.net/forum?id=qmf56RZbzFJ}\n}", "github": "", "project": "", "reviewers": "3kBi;qMCX;Ti4B;bg1F", "site": "https://openreview.net/forum?id=qmf56RZbzFJ", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "5;5;4;4", "correctness": "1;1;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "136;131;58;124", "wc_summary_review": "54;101;47;42", "wc_main_review": "403;574;322;156", "wc_review": "593;806;427;322", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 1.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 112.25, 31.60992723813201 ], 
"wc_summary_review_avg": [ 61.0, 23.484037131634757 ], "wc_main_review_avg": [ 363.75, 150.53965424432195 ], "wc_review_avg": [ 537.0, 182.90844704386947 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15101308850840116695&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Augsburg", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-augsburg.de", "aff_unique_abbr": "UOA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Variational Inference for Discriminative Learning with Generative Modeling of Feature Incompletion", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6533", "id": "qnQN4yr6FJz", "poster": "", "openreview": "https://openreview.net/forum?id=qnQN4yr6FJz", "slides": "https://iclr.cc/virtual/2022/poster/6533", "video": "https://iclr.cc/virtual/2022/poster/6533", "author_site": "Kohei Miyaguchi, Takayuki Katsuki, Akira Koseki, Toshiya Iwamori", "tldr": "", "abstract": "We are concerned with the problem of distributional prediction with incomplete features: The goal is to estimate the distribution of target variables given feature vectors with some of the elements missing. A typical approach to this problem is to perform missing-value imputation and regression, simultaneously or sequentially, which we call the generative approach. Another approach is to perform regression after appropriately encoding missing values into the feature, which we call the discriminative approach. In comparison, the generative approach is more robust to the feature corruption while the discriminative approach is more favorable to maximize the performance of prediction. \nIn this study, we propose a hybrid method to take the best of both worlds. Our method utilizes the black-box variational inference framework so that it can be applied to a wide variety of modern machine learning models, including the variational autoencoders. 
We also confirmed the effectiveness of the proposed method empirically.\n", "keywords": "Black-box variational inference;missing values;evidence upper bound", "primary_area": "", "supplementary_material": "", "author": "Kohei Miyaguchi;Takayuki Katsuki;Akira Koseki;Toshiya Iwamori", "authorids": "~Kohei_Miyaguchi1;~Takayuki_Katsuki2;~Akira_Koseki1;~Toshiya_Iwamori1", "gender": "M;;M;M", "homepage": "https://koheimiya.github.io/about/;https://research.ibm.com/people/takayuki-katsuki;;", "dblp": "172/7749;01/10264;86/2676;244/7862.html", "google_scholar": "p78Mw3QAAAAJ;bZZ0I4UAAAAJ;;", "orcid": ";0000-0002-3670-1138;;", "linkedin": ";;ak-110061175/;", "or_profile": "~Kohei_Miyaguchi1;~Takayuki_Katsuki2;~Akira_Koseki1;~Toshiya_Iwamori1", "aff": "International Business Machines;International Business Machines;International Business Machines;International Business Machines", "aff_domain": "ibm.com;ibm.com;ibm.com;ibm.com", "position": "Researcher;Research staff member;Researcher;Researcher", "bibtex": "@inproceedings{\nmiyaguchi2022variational,\ntitle={Variational Inference for Discriminative Learning with Generative Modeling of Feature Incompletion},\nauthor={Kohei Miyaguchi and Takayuki Katsuki and Akira Koseki and Toshiya Iwamori},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qnQN4yr6FJz}\n}", "github": "", "project": "", "reviewers": "unNG;WZsz;vurp;7Sj3", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;3;4", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "1;3;3;3", "wc_summary_paper": "88;174;192;60", "wc_summary_review": "120;79;33;42", "wc_main_review": "599;452;389;423", "wc_review": "807;705;614;525", "wc_reply_reviewers": "0;0;102;257", "wc_reply_authors": "781;519;1386;385", "reply_reviewers": "0;0;1;1", "reply_authors": "1;2;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 128.5, 55.75616557834658 ], "wc_summary_review_avg": [ 68.5, 34.3693177121688 ], "wc_main_review_avg": [ 465.75, 80.09798686608796 ], "wc_review_avg": [ 662.75, 104.81501562276276 ], "wc_reply_reviewers_avg": [ 89.75, 105.15791696301329 ], "wc_reply_authors_avg": [ 767.75, 384.31196637627613 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3096867829485286682&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=qnQN4yr6FJz", "email": "ibm.com;ibm.com;ibm.com;ibm.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "International Business Machines Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "qnm-2v-baW", "title": "Poly-CAM: High resolution class activation map for convolutional neural networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The need for Explainable AI is increasing with the development of deep learning. 
The saliency maps derived from convolutional neural networks generally fail to accurately localize the image features that justify the network prediction. This is because those maps are either low-resolution as for CAM (Zhou et al., 2016), or smooth as for perturbation-based methods (Zeiler & Fergus, 2014), or correspond to a large number of widespread peaky spots as for gradient-based approaches (Sundararajan et al., 2017; Smilkov et al., 2017). In contrast, our work proposes to combine the information from earlier network layers with the one from later layers to produce a high-resolution Class Activation Map that is competitive with the previous art in terms of insertion-deletion faithfulness metrics, while outperforming it in terms of the localization precision of class-specific features.", "keywords": "XAI;explainability;saliency map;CAM;deep learning;CNN;convolutional neural network", "primary_area": "", "supplementary_material": "/attachment/37247ceebc4ed51910e56ff3f5d223baf31f3bb7.zip", "author": "Alexandre Englebert;Olivier Cornu;Christophe De Vleeschouwer", "authorids": "~Alexandre_Englebert1;~Olivier_Cornu1;~Christophe_De_Vleeschouwer1", "gender": "M;M;M", "homepage": ";http://www.international-saintluc.be/en/medecin/professor-olivier-cornu;", "dblp": ";;", "google_scholar": ";pka2cCcAAAAJ;xb3Zc3cAAAAJ", "orcid": ";;0000-0001-5049-2929", "linkedin": "alexandre-englebert-813824209;http://www.linkedin.com/in/olivier-cornu-943b1970;", "or_profile": "~Alexandre_Englebert1;~Olivier_Cornu1;~Christophe_De_Vleeschouwer1", "aff": "UCL;UCL;UCLouvain", "aff_domain": "uclouvain.be;uclouvain.be;uclouvain.be", "position": "PhD student;Full Professor;Associate Professor", "bibtex": "@misc{\nenglebert2022polycam,\ntitle={Poly-{CAM}: High resolution class activation map for convolutional neural networks},\nauthor={Alexandre Englebert and Olivier Cornu and Christophe De Vleeschouwer},\nyear={2022},\nurl={https://openreview.net/forum?id=qnm-2v-baW}\n}", "github": "", "project": "", "reviewers": "6Awa;aZq6;BXML;1byE", "site": "https://openreview.net/forum?id=qnm-2v-baW", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;4;4;5", "correctness": "4;3;4;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "50;51;67;61", "wc_summary_review": "116;27;35;66", "wc_main_review": "77;240;250;195", "wc_review": "243;318;352;322", "wc_reply_reviewers": "0;22;0;147", "wc_reply_authors": "478;102;213;483", "reply_reviewers": "0;1;0;2", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 57.25, 7.084313657652377 ], "wc_summary_review_avg": [ 61.0, 34.935655139126844 ], "wc_main_review_avg": [ 190.5, 68.72590486854284 ], "wc_review_avg": [ 308.75, 40.17072939342775 ], "wc_reply_reviewers_avg": [ 42.25, 61.14071883777619 ], "wc_reply_authors_avg": [ 319.0, 166.2092055212346 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13847822556558791509&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;1", "aff_unique_norm": "University College London;Universit\u00e9 catholique de 
Louvain", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucl.ac.uk;https://www.uclouvain.be", "aff_unique_abbr": "UCL;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;Belgium" }, { "id": "qoEa_G3pKop", "title": "ACCELERATING VARIATIONAL QUANTUM ALGORITHMS WITH MULTIPLE QUANTUM PROCESSORS", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Variational quantum algorithms (VQAs) are prime contenders that exploit near-term quantum machines to gain computational advantages over classical algorithms. As such, how to accelerate the optimization of modern VQAs has attracted great attention in past years. Here we propose a \\texttt{QU}antum \\texttt{DI}stributed \\texttt{O}ptimization scheme (abbreviated as QUDIO) to address this issue. Conceptually, QUDIO utilizes a classical central server to partition the learning problem into multiple subproblems and allocate them to a set of quantum local nodes. In the training stage, all local nodes proceed with parallel training and the classical server synchronizes optimization information among local nodes timely. In doing so, we prove a sublinear convergence rate of QUDIO in the number of global iterations under the ideal scenario. Moreover, when the imperfection of the quantum system is considered, we prove that an increased synchronization time leads to a degraded convergence rate or even incurs divergent optimization. Numerical results on standard benchmarks illustrate that QUDIO can surprisingly reach a superlinear clock-time speedup in terms of the number of local nodes. Our proposal can be readily mixed with other advanced VQAs-based techniques to narrow the gap between the state of the art and applications with the quantum advantage. 
", "keywords": "Quantum machine learning;quantum neural network", "primary_area": "", "supplementary_material": "/attachment/6f608308a35805935dd5d9cff92f1b72a43c6ac3.zip", "author": "Yuxuan Du;Yang Qian;Dacheng Tao", "authorids": "~Yuxuan_Du2;~Yang_Qian1;~Dacheng_Tao1", "gender": "M;M;", "homepage": "https://github.com/yuxuan-du/Yuxuan-Du.github.io;https://qqqyang.github.io/;", "dblp": ";;", "google_scholar": "https://scholar.google.com.au/citations?user=50sFkzIAAAAJ;8Bt-CfgAAAAJ;", "orcid": "0000-0002-1193-9756;;", "linkedin": ";;", "or_profile": "~Yuxuan_Du2;~Yang_Qian1;~Dacheng_Tao1", "aff": "JD.com;University of Sydney;", "aff_domain": "jd.com;sydney.edu.au;", "position": "Researcher;PhD student;", "bibtex": "@misc{\ndu2022accelerating,\ntitle={{ACCELERATING} {VARIATIONAL} {QUANTUM} {ALGORITHMS} {WITH} {MULTIPLE} {QUANTUM} {PROCESSORS}},\nauthor={Yuxuan Du and Yang Qian and Dacheng Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=qoEa_G3pKop}\n}", "github": "", "project": "", "reviewers": "4jPj;Ywkp;vJBc;YqKv", "site": "https://openreview.net/forum?id=qoEa_G3pKop", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "3;4;3;5", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "84;57;96;55", "wc_summary_review": "173;43;69;66", "wc_main_review": "955;377;199;637", "wc_review": "1212;477;364;758", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.0, 17.53567791675018 ], "wc_summary_review_avg": [ 87.75, 50.23631654490604 ], "wc_main_review_avg": [ 542.0, 284.8104632909402 ], "wc_review_avg": [ 702.75, 327.14780680909354 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8703882797784892, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18115742652762255630&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "JD.com;University of Sydney", "aff_unique_dep": ";", "aff_unique_url": "https://www.jd.com;https://www.sydney.edu.au", "aff_unique_abbr": "JD;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Australia" }, { "id": "qpcG27kYK6z", "title": "Concentric Spherical GNN for 3D Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning 3D representations of point clouds that generalize well to arbitrary orientations is a challenge of practical importance in problems ranging from computer vision to molecular modeling.\nThe proposed approach is based on a concentric spherical representation of 3D space, formed by nesting spatially-sampled spheres resulting from the highly regular icosahedral discretization.\nWe propose separate intra-sphere and inter-sphere convolutions over the resulting concentric spherical grid, which are combined into a convolutional framework for learning volumetric and rotationally equivariant representations over point clouds.\nWe 
demonstrate the effectiveness of our approach for 3D object classification, and towards resolving the electronic structure of atomistic systems.", "keywords": "spherical cnn;CNN;point cloud;graph convolution;rotation;equivariance;3D;molecular;volumetric", "primary_area": "", "supplementary_material": "", "author": "James S Fox;Bo Zhao;Beatriz Gonzalez Del Rio;Siva Rajamanickam;Rampi Ramprasad;Le Song", "authorids": "~James_S_Fox1;~Bo_Zhao6;brio3@gatech.edu;~Siva_Rajamanickam1;rampi.ramprasad@mse.gatech.edu;~Le_Song1", "gender": "M;;;M;;M", "homepage": ";https://b-zhao.github.io;;https://siva.science;;http://www.cc.gatech.edu/~lsong", "dblp": ";;;;;94/3481", "google_scholar": "gYa-FTQAAAAJ;ZCCrFoIAAAAJ;;o_N5F7sAAAAJ;;Xl4E0CsAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~James_S_Fox1;~Bo_Zhao6;brio3@gatech.edu;~Siva_Rajamanickam1;rampi.ramprasad@mse.gatech.edu;~Le_Song1", "aff": "University of California, Berkeley;University of California, San Diego;;Sandia National Laboratories;;College of Computing, Georgia Institute of Technology", "aff_domain": "berkeley.edu;ucsd.edu;;sandia.gov;;cc.gatech.edu", "position": "Undergrad student;PhD student;;Researcher;;Associate Professor", "bibtex": "@misc{\nfox2022concentric,\ntitle={Concentric Spherical {GNN} for 3D Representation Learning},\nauthor={James S Fox and Bo Zhao and Beatriz Gonzalez Del Rio and Siva Rajamanickam and Rampi Ramprasad and Le Song},\nyear={2022},\nurl={https://openreview.net/forum?id=qpcG27kYK6z}\n}", "github": "", "project": "", "reviewers": "vMzd;KVV8;oVKM;xhPe", "site": "https://openreview.net/forum?id=qpcG27kYK6z", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;5;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;2;0", "wc_summary_paper": "45;52;72;75", "wc_summary_review": "82;45;10;55", "wc_main_review": "293;199;717;229", "wc_review": "420;296;799;359", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "498;294;938;384", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 61.0, 12.786711852544421 ], "wc_summary_review_avg": [ 48.0, 25.777897509300484 ], "wc_main_review_avg": [ 359.5, 209.17636099712607 ], "wc_review_avg": [ 468.5, 195.78623547124042 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 528.5, 247.2301559276295 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15715060486956336768&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of California, Berkeley;University of California, San Diego;Sandia National Laboratories;Georgia Institute of Technology", "aff_unique_dep": ";;;College of Computing", "aff_unique_url": "https://www.berkeley.edu;https://www.ucsd.edu;https://www.sandia.gov;https://www.gatech.edu", "aff_unique_abbr": "UC Berkeley;UCSD;SNL;Georgia Tech", "aff_campus_unique_index": "0;1;3", "aff_campus_unique": "Berkeley;San Diego;;Atlanta", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Exploiting 
Class Activation Value for Partial-Label Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5942", "id": "qqdXHUGec9h", "poster": "", "openreview": "https://openreview.net/forum?id=qqdXHUGec9h", "slides": "https://iclr.cc/virtual/2022/poster/5942", "video": "https://iclr.cc/virtual/2022/poster/5942", "author_site": "Fei Zhang, Lei Feng, Bo Han, Tongliang Liu, Gang Niu, Tao Qin, Masashi Sugiyama", "tldr": "", "abstract": "Partial-label learning (PLL) solves the multi-class classification problem, where each training instance is assigned a set of candidate labels that include the true label. Recent advances showed that PLL can be compatible with deep neural networks, which achieved state-of-the-art performance. However, most of the existing deep PLL methods focus on designing proper training objectives under various assumptions on the collected data, which may limit their performance when the collected data cannot satisfy the adopted assumptions. In this paper, we propose to exploit the learned intrinsic representation of the model to identify the true label in the training process, which does not rely on any assumptions on the collected data. We make two key contributions. As the first contribution, we empirically show that the class activation map (CAM), a simple technique for discriminating the learning patterns of each class in images, could surprisingly be utilized to accurately select the true label from the candidate labels. Unfortunately, as CAM is confined to image inputs with convolutional neural networks, we are yet unable to directly leverage CAM to address the PLL problem with general inputs and models. Thus, as the second contribution, we propose the class activation value (CAV), which shares similar properties with CAM, while being versatile across various types of inputs and models. Building upon CAV, we propose a novel method named CAV Learning (CAVL) that selects, as the true label for model training, the class with the maximum CAV. 
Extensive experiments on various datasets demonstrate that our proposed CAVL method achieves state-of-the-art performance.", "keywords": "Partial-label Learning;Class Activation Map", "primary_area": "", "supplementary_material": "", "author": "Fei Zhang;Lei Feng;Bo Han;Tongliang Liu;Gang Niu;Tao Qin;Masashi Sugiyama", "authorids": "~Fei_Zhang3;~Lei_Feng1;~Bo_Han1;~Tongliang_Liu1;~Gang_Niu1;~Tao_Qin1;~Masashi_Sugiyama1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://lfeng1995.github.io/;https://tongliang-liu.github.io/;https://niug1984.github.io;https://www.microsoft.com/en-us/research/people/taoqin/;http://www.ms.k.u-tokyo.ac.jp/sugi/;https://bhanml.github.io/", "dblp": ";76/847-6;150/6667;26/3367-1;14/6841;35/1228;241/0472-3", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;Bl4SRU0AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;nTNjqHwAAAAJ", "orcid": ";0000-0003-2839-5799;;;;0000-0001-6658-6743;", "linkedin": "ferenas97/;;;;;;", "or_profile": "~Fei_Zhang3;~Lei_Feng1;~Tongliang_Liu1;~Gang_Niu1;~Tao_Qin1;~Masashi_Sugiyama1;~bo_han2", "aff": "Shanghai Jiaotong University;Chongqing University;University of Sydney;RIKEN;Microsoft Research Asia;The University of Tokyo;Microsoft Research", "aff_domain": "sjtu.edu.cn;cqu.edu.cn;sydney.edu.au;riken.jp;microsoft.com;u-tokyo.ac.jp;microsoft.com", "position": "MS student;Full Professor;Lecturer;Research Scientist (tenured);Principal Researcher;Full Professor;Researcher", "bibtex": "@inproceedings{\nzhang2022exploiting,\ntitle={Exploiting Class Activation Value for Partial-Label Learning},\nauthor={Fei Zhang and Lei Feng and Bo Han and Tongliang Liu and Gang Niu and Tao Qin and Masashi Sugiyama},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qqdXHUGec9h}\n}", "github": "", "project": "", "reviewers": "sUUe;1wbR;Dgv3", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "4;5;4", "correctness": "2;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "111;74;124", "wc_summary_review": "39;87;58", "wc_main_review": "365;306;276", "wc_review": "515;467;458", "wc_reply_reviewers": "278;708;27", "wc_reply_authors": "2159;2485;499", "reply_reviewers": "2;3;1", "reply_authors": "4;6;2", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 103.0, 21.18175315375634 ], "wc_summary_review_avg": [ 61.333333333333336, 19.73716179078328 ], "wc_main_review_avg": [ 315.6666666666667, 36.97146046464609 ], "wc_review_avg": [ 480.0, 25.019992006393608 ], "wc_reply_reviewers_avg": [ 337.6666666666667, 281.2002054685515 ], "wc_reply_authors_avg": [ 1714.3333333333333, 869.6149850492586 ], "reply_reviewers_avg": [ 2.0, 0.816496580927726 ], "reply_authors_avg": [ 4.0, 1.632993161855452 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.11470786693528084, "corr_recommendation_correctness": 0.9933992677987828, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10070847295089974075&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=qqdXHUGec9h", "email": "sjtu.edu.cn;cqu.edu.cn;sydney.edu.au;riken.jp;microsoft.com;u-tokyo.ac.jp;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;2;3;4;5;4", "aff_unique_norm": "Shanghai Jiao Tong University;Chongqing University;University of Sydney;RIKEN;Microsoft;University of Tokyo", "aff_unique_dep": ";;;;Research;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.cqu.edu.cn;https://www.sydney.edu.au;https://www.riken.jp;https://www.microsoft.com/en-us/research/group/asia;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "SJTU;CQU;USYD;RIKEN;MSR Asia;UTokyo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;2;0;2;3", "aff_country_unique": "China;Australia;Japan;United States" }, { "id": "qrdbsZEZPZ", "title": "Certified Robustness for Free in Differentially Private Federated Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) provides an efficient training paradigm to jointly train a global model leveraging data from distributed users.\nAs the local training data comes from different users who may not be trustworthy, several studies have shown that FL is vulnerable to poisoning attacks where adversaries add malicious data during training. On the other hand, to protect the privacy of users, FL is usually trained in a differentially private way (DPFL). Given these properties of FL, in this paper, we aim to ask: Can we leverage the innate privacy property of DPFL to provide robustness certification against poisoning attacks? Can we further improve the privacy of FL to improve such certification?\nTo this end, we first investigate both the user-level and instance-level privacy of FL, and propose novel randomization mechanisms and analysis to achieve improved differential privacy.\nWe then provide two robustness certification criteria: certified prediction and certified attack cost for DPFL on both levels. Theoretically, given different privacy properties of DPFL, we prove their certified robustness under a bounded number of adversarial users or instances. \nEmpirically, we conduct extensive experiments to verify our theories under different attacks on a range of datasets. 
We show that the global model with a tighter privacy guarantee always provides stronger robustness certification in terms of the certified attack cost, while it may exhibit tradeoffs regarding the certified prediction.\nWe believe our work will inspire future research on developing certifiably robust DPFL based on its inherent properties.", "keywords": "Certified Robustness;Differential Privacy;Federated Learning", "primary_area": "", "supplementary_material": "/attachment/ab6e969efbd19e6ccf9e42f34b614b736001af93.zip", "author": "Chulin Xie;Yunhui Long;Pin-Yu Chen;Krishnaram Kenthapadi;Bo Li", "authorids": "~Chulin_Xie1;~Yunhui_Long1;~Pin-Yu_Chen1;~Krishnaram_Kenthapadi1;~Bo_Li19", "gender": "F;Unspecified;M;M;F", "homepage": ";;http://www.pinyuchen.com;https://cs.stanford.edu/people/kngk/;http://boli.cs.illinois.edu/", "dblp": "245/4284;;39/8969;29/4781;50/3402-26", "google_scholar": "WeJnzAgAAAAJ;;jxwlCUUAAAAJ;av5rGaEAAAAJ;K8vJkTcAAAAJ", "orcid": ";;0000-0003-1039-8369;0000-0003-1237-087X;", "linkedin": ";;pin-yu-chen-940062a2;krishnaramkenthapadi/;", "or_profile": "~Chulin_Xie1;~Yunhui_Long1;~Pin-Yu_Chen1;~Krishnaram_Kenthapadi1;~Bo_Li19", "aff": "University of Illinois, Urbana Champaign;;International Business Machines;Fiddler AI;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;;ibm.com;fiddler.ai;illinois.edu", "position": "PhD student;;Research Staff Member;Chief Scientist;Assistant Professor", "bibtex": "@misc{\nxie2022certified,\ntitle={Certified Robustness for Free in Differentially Private Federated Learning},\nauthor={Chulin Xie and Yunhui Long and Pin-Yu Chen and Krishnaram Kenthapadi and Bo Li},\nyear={2022},\nurl={https://openreview.net/forum?id=qrdbsZEZPZ}\n}", "github": "", "project": "", "reviewers": "Rayz;oz1G;T4mM;u4ds", "site": "https://openreview.net/forum?id=qrdbsZEZPZ", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;3;4;2", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;2;2;3", "wc_summary_paper": "257;15;19;75", "wc_summary_review": "102;15;19;15", "wc_main_review": "546;230;208;188", "wc_review": "905;260;246;278", "wc_reply_reviewers": "0;30;428;0", "wc_reply_authors": "2070;1683;2347;801", "reply_reviewers": "0;1;1;0", "reply_authors": "4;3;4;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 91.5, 98.451764839438 ], "wc_summary_review_avg": [ 37.75, 37.13068138345969 ], "wc_main_review_avg": [ 293.0, 146.82302271782856 ], "wc_review_avg": [ 422.25, 278.9465674640934 ], "wc_reply_reviewers_avg": [ 114.5, 181.4132023861549 ], "wc_reply_authors_avg": [ 1725.25, 583.4056800374847 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.25, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7608859102526822, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9527394774180239127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;International Business Machines Corporation;Fiddler AI", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.ibm.com;https://www.fiddler.ai", "aff_unique_abbr": "UIUC;IBM;Fiddler AI", "aff_campus_unique_index":
"0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Hindsight is 20/20: Leveraging Past Traversals to Aid 3D Perception", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6048", "id": "qsZoGvFiJn1", "poster": "", "openreview": "https://openreview.net/forum?id=qsZoGvFiJn1", "slides": "https://iclr.cc/virtual/2022/poster/6048", "video": "https://iclr.cc/virtual/2022/poster/6048", "author_site": "Yurong You, Katie Luo, Xiangyu Chen, Junan Chen, Wei-Lun Chao, Wen Sun, Bharath Hariharan, Mark Campbell, Kilian Weinberger", "tldr": "", "abstract": "Self-driving cars must detect vehicles, pedestrians, and other traf\ufb01c participants accurately to operate safely. Small, far-away, or highly occluded objects are particularly challenging because there is limited information in the LiDAR point clouds for detecting them. To address this challenge, we leverage valuable information from the past: in particular, data collected in past traversals of the same scene. We posit that these past data, which are typically discarded, provide rich contextual information for disambiguating the above-mentioned challenging cases. To this end, we propose a novel end-to-end trainable Hindsight framework to extract this contextual information from past traversals and store it in an easy-to-query data structure, which can then be leveraged to aid future 3D object detection of the same scene. We show that this framework is compatible with most modern 3D detection architectures and can substantially improve their average precision on multiple autonomous driving datasets, most notably by more than 300% on the challenging cases. Our code is available at https://github.com/YurongYou/Hindsight.", "keywords": "3D object detection;perception with historical context", "primary_area": "", "supplementary_material": "", "author": "Yurong You;Katie Z Luo;Xiangyu Chen;Junan Chen;Wei-Lun Chao;Wen Sun;Bharath Hariharan;Mark Campbell;Kilian Q Weinberger", "authorids": "~Yurong_You1;~Katie_Z_Luo1;~Xiangyu_Chen1;~Junan_Chen1;~Wei-Lun_Chao1;~Wen_Sun1;~Bharath_Hariharan3;~Mark_Campbell1;~Kilian_Q_Weinberger1", "gender": "M;F;M;M;M;;M;M;M", "homepage": "http://yurongyou.com;https://www.cs.cornell.edu/~katieluo/;https://www.cs.cornell.edu/~xchen/;http://cornell-asl.org/main/index.html;https://sites.google.com/view/wei-lun-harry-chao;https://wensun.github.io;http://campbell.mae.cornell.edu;http://www.cs.cornell.edu/~kilian/;http://home.bharathh.info", "dblp": "199/1968;207/8564;;238/1143;64/8842;;;88/4801;05/8412", "google_scholar": "rdwkreIAAAAJ;qlmK27YAAAAJ;xBv-PMEAAAAJ;;PGKakWwAAAAJ;iOLC30YAAAAJ;e1iAhHQAAAAJ;jsxk8vsAAAAJ;TpglobcAAAAJ", "orcid": ";;;;0000-0003-1269-7231;;;0009-0008-9313-7239;", "linkedin": "yurong-you/;katieluo;;;;;;;", "or_profile": "~Yurong_You1;~Katie_Z_Luo1;~Xiangyu_Chen1;~Junan_Chen1;~Wei-Lun_Chao1;~Wen_Sun1;~Mark_Campbell1;~Kilian_Q_Weinberger1;~Bharath_Hariharan2", "aff": "Cornell University;Cornell University;Cornell University;Cornell University;Ohio State University;Cornell University;Cornell University;ASAPP Inc.;Cornell University", "aff_domain": "cornell.edu;cornell.edu;cornell.edu;cornell.edu;osu.edu;cornell.edu;cornell.edu;asapp.com;cornell.edu", "position": "PhD student;PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor;Full Professor;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nyou2022hindsight,\ntitle={Hindsight is 20/20: Leveraging Past 
Traversals to Aid 3D Perception},\nauthor={Yurong You and Katie Z Luo and Xiangyu Chen and Junan Chen and Wei-Lun Chao and Wen Sun and Bharath Hariharan and Mark Campbell and Kilian Q Weinberger},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qsZoGvFiJn1}\n}", "github": "", "project": "", "reviewers": "i3iC;TGjG;iXT8;9mnc", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;3;4", "correctness": "3;4;4;4", "technical_novelty": "3;4;4;4", "empirical_novelty": "3;4;4;4", "wc_summary_paper": "113;335;120;87", "wc_summary_review": "81;62;24;55", "wc_main_review": "689;799;165;369", "wc_review": "883;1196;309;511", "wc_reply_reviewers": "176;17;16;115", "wc_reply_authors": "766;400;276;311", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 163.75, 99.63276318561078 ], "wc_summary_review_avg": [ 55.5, 20.524375751773793 ], "wc_main_review_avg": [ 505.5, 252.18396063191648 ], "wc_review_avg": [ 724.75, 341.19523370058965 ], "wc_reply_reviewers_avg": [ 81.0, 68.01102851743973 ], "wc_reply_authors_avg": [ 438.25, 194.55124646221108 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15674924686724150204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=qsZoGvFiJn1", "email": "cornell.edu;cornell.edu;cornell.edu;cornell.edu;osu.edu;cornell.edu;cornell.edu;asapp.com;cornell.edu", "author_num": 9, "aff_unique_index": "0;0;0;0;1;0;0;2;0", "aff_unique_norm": "Cornell University;Ohio State University;ASAPP Inc.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cornell.edu;https://www.osu.edu;https://www.asapp.com", "aff_unique_abbr": "Cornell;OSU;ASAPP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "qvUJV2-t_c", "title": "Using a one dimensional parabolic model of the full-batch loss to estimate learning rates during training", "track": "main", "status": "Reject", "tldr": "", "abstract": "A fundamental challenge in Deep Learning is to find optimal step sizes for stochastic gradient descent automatically. In traditional optimization, line searches are a commonly used method to determine step sizes. One problem in Deep Learning is that finding appropriate step sizes on the full-batch loss is infeasibly expensive. Therefore, classical line search approaches, designed for losses without inherent noise, are usually not applicable. Recent empirical findings suggest that the full-batch loss behaves locally parabolically along noisy update step directions. Furthermore, the trend of the optimal update step size changes slowly. By exploiting these findings, this work introduces a line-search method that approximates the full-batch loss with a parabola estimated over several mini-batches. Learning rates are derived from such parabolas during training.
In the experiments conducted, our approach mostly outperforms SGD tuned with a piece-wise constant learning rate schedule and other line search approaches for Deep Learning across models, datasets, and batch sizes on validation and test accuracy.", "keywords": "Empirics based Optimization;Optimization;Line Search;SGD;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/625e1c35bc39611fb7fc24b3305c7aa9c55da634.zip", "author": "Maximus Mutschler;Kevin Alexander Laube;Andreas Zell", "authorids": "~Maximus_Mutschler1;~Kevin_Alexander_Laube1;~Andreas_Zell1", "gender": ";M;M", "homepage": "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/kognitive-systeme/the-chair/staff/maximus-mutschler/;;https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/kognitive-systeme/", "dblp": ";232/1731;05/4192", "google_scholar": ";;", "orcid": ";;", "linkedin": ";laubeke/;", "or_profile": "~Maximus_Mutschler1;~Kevin_Alexander_Laube1;~Andreas_Zell1", "aff": "University of Tuebingen;Bosch;Eberhard-Karls-Universit\u00e4t T\u00fcbingen", "aff_domain": "uni-tuebingen.de;bosch.com;uni-tuebingen.de", "position": "PhD student;Research Engineer;Full Professor", "bibtex": "@misc{\nmutschler2022using,\ntitle={Using a one dimensional parabolic model of the full-batch loss to estimate learning rates during training},\nauthor={Maximus Mutschler and Kevin Alexander Laube and Andreas Zell},\nyear={2022},\nurl={https://openreview.net/forum?id=qvUJV2-t_c}\n}", "github": "", "project": "", "reviewers": "mG5J;kiNL;rHNN;w2rC", "site": "https://openreview.net/forum?id=qvUJV2-t_c", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;2;4", "correctness": "3;2;2;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "102;91;100;135", "wc_summary_review": "45;45;28;60", "wc_main_review": "283;442;519;195", "wc_review": "430;578;647;390", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 107.0, 16.688319268278637 ], "wc_summary_review_avg": [ 44.5, 11.324751652906125 ], "wc_main_review_avg": [ 359.75, 127.63106009118627 ], "wc_review_avg": [ 511.25, 105.10322307141679 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:k-Vqsru0tJQJ:scholar.google.com/&scioq=Using+a+one+dimensional+parabolic+model+of+the+full-batch+loss+to+estimate+learning+rates+during+training&hl=en&as_sdt=0,33", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Tuebingen;Robert Bosch GmbH;Eberhard Karls University of T\u00fcbingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.bosch.com;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Bosch;Uni T\u00fcbingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "qw674L9PfQE", "title": "CLOOB: Modern Hopfield Networks with InfoLOOB Outperform CLIP", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contrastive learning with the InfoNCE objective is exceptionally successful in various self-supervised learning tasks. Recently, the CLIP model yielded impressive results on zero-shot transfer learning when using InfoNCE for learning visual representations from natural language supervision. However, InfoNCE as a lower bound on the mutual information has been shown to perform poorly for high mutual information. In contrast, the InfoLOOB upper bound (leave one out bound) works well for high mutual information but suffers from large variance and instabilities. We introduce \"Contrastive Leave One Out Boost\" (CLOOB), where modern Hopfield networks boost learning with the InfoLOOB objective. Modern Hopfield networks replace the original embeddings by retrieved embeddings in the InfoLOOB objective. The retrieved embeddings give InfoLOOB two assets. Firstly, the retrieved embeddings stabilize InfoLOOB, since they are less noisy and more similar to one another than the original embeddings. Secondly, they are enriched by correlations, since the covariance structure of embeddings is reinforced through retrievals. We compare CLOOB to CLIP after learning on the Conceptual Captions and the YFCC dataset with respect to their zero-shot transfer learning performance on other datasets. CLOOB consistently outperforms CLIP at zero-shot transfer learning across all considered architectures and datasets.", "keywords": "Deep learning;Associative memory;Hopfield networks;Contrastive learning;Multimodal learning", "primary_area": "", "supplementary_material": "", "author": "Andreas F\u00fcrst;Elisabeth Rumetshofer;Viet Thuong Tran;Hubert Ramsauer;Fei Tang;Johannes Lehner;D P Kreil;Michael K Kopp;G\u00fcnter Klambauer;Angela Bitto-Nemling;Sepp Hochreiter", "authorids": "~Andreas_F\u00fcrst1;~Elisabeth_Rumetshofer1;~Viet_Thuong_Tran1;~Hubert_Ramsauer2;fei.tang@here.com;~Johannes_Lehner1;~D_P_Kreil1;~Michael_K_Kopp1;~G\u00fcnter_Klambauer1;~Angela_Bitto-Nemling1;~Sepp_Hochreiter1", "gender": ";F;;M;;M;;M;M;;M", "homepage": ";;https://www.jku.at/en/institute-for-machine-learning/about-us/team/viet-tran-mmsc/;;;;https://www.iarai.ac.at;;http://www.bioinf.jku.at/people/klambauer/;https://www.jku.at/institut-fuer-machine-learning/ueber-uns/team/di-dr-angela-bitto-nemling;https://www.jku.at/en/institute-for-machine-learning/about-us/team/sepp-hochreiter/", "dblp": ";245/4821;;159/2020;;232/0972;;;119/4499;;h/SeppHochreiter.html", "google_scholar": ";0AUdatYAAAAJ;;;;W-kY2_oAAAAJ;;DCwTo40AAAAJ;https://scholar.google.at/citations?user=rb2AvxIAAAAJ;;https://scholar.google.at/citations?user=tvUH3WMAAAAJ", "orcid": ";;;;;;0000-0001-7538-2056;0000-0002-1385-1109;0000-0003-2861-5552;;0000-0001-7449-2528", "linkedin": ";;;;;;;michael-kopp-95931490;;;https://linkedin.com/in/sepp-hochreiter-41514846", "or_profile": "~Andreas_F\u00fcrst1;~Elisabeth_Rumetshofer1;~Viet_Thuong_Tran1;~Hubert_Ramsauer2;fei.tang@here.com;~Johannes_Lehner1;~D_P_Kreil1;~Michael_K_Kopp1;~G\u00fcnter_Klambauer1;~Angela_Bitto-Nemling1;~Sepp_Hochreiter1", "aff": ";Johannes Kepler University Linz;Johannes Kepler University Linz;Johannes Kepler University Linz;;Johannes Kepler University Linz;Institute of Advanced Research in AI (IARAI);Institute of Advanced Research in Artificial Intelligence (IARAI);;Johannes Kepler University 
Linz;Johannes Kepler University Linz", "aff_domain": ";jku.at;jku.at;jku.at;;jku.at;iarai.ac.at;iarai.ac.at;;jku.at;jku.at", "position": ";PhD student;PhD student;PhD student;;PhD student;Scientific Director;Director;;Postdoc;Full Professor", "bibtex": "@misc{\nf{\\\"u}rst2022cloob,\ntitle={{CLOOB}: Modern Hopfield Networks with Info{LOOB} Outperform {CLIP}},\nauthor={Andreas F{\\\"u}rst and Elisabeth Rumetshofer and Viet Thuong Tran and Hubert Ramsauer and Fei Tang and Johannes Lehner and D P Kreil and Michael K Kopp and G{\\\"u}nter Klambauer and Angela Bitto-Nemling and Sepp Hochreiter},\nyear={2022},\nurl={https://openreview.net/forum?id=qw674L9PfQE}\n}", "github": "", "project": "", "reviewers": "jmHN;BgtC;vr8w;z4BR", "site": "https://openreview.net/forum?id=qw674L9PfQE", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;4;3", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "84;53;73;59", "wc_summary_review": "37;41;55;88", "wc_main_review": "415;83;380;136", "wc_review": "536;177;508;283", "wc_reply_reviewers": "722;59;90;196", "wc_reply_authors": "1045;359;1456;488", "reply_reviewers": "1;1;1;2", "reply_authors": "3;1;3;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 67.25, 12.090802289343747 ], "wc_summary_review_avg": [ 55.25, 20.054612935681405 ], "wc_main_review_avg": [ 253.5, 145.74035131012963 ], "wc_review_avg": [ 376.0, 151.05793590540023 ], "wc_reply_reviewers_avg": [ 266.75, 267.70261018525764 ], "wc_reply_authors_avg": [ 837.0, 440.6557613375774 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": -0.4714045207910316, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6479027454847744406&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;0;1;2;0;0", "aff_unique_norm": "Johannes Kepler University;Institute of Advanced Research in AI;Institute of Advanced Research in Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.jku.at;;https://www.ia-rai.at", "aff_unique_abbr": "JKU;IARAI;IARAI", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Linz;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "Austria;" }, { "title": "Optimal Transport for Causal Discovery", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6780", "id": "qwBK94cP1y", "poster": "", "openreview": "https://openreview.net/forum?id=qwBK94cP1y", "slides": "https://iclr.cc/virtual/2022/poster/6780", "video": "https://iclr.cc/virtual/2022/poster/6780", "author_site": "Ruibo Tu, Kun Zhang, Hedvig Kjellstr\u00f6m, Cheng Zhang", "tldr": "", "abstract": "To determine causal relationships between two variables, approaches based on Functional Causal Models (FCMs) have been proposed by properly restricting model classes; however, their performance is sensitive to the model assumptions, which makes them difficult to use. In this paper, we provide a novel dynamical-system view of FCMs and propose a new framework for identifying causal direction in the bivariate case.
We first show the connection between FCMs and optimal transport, and then study optimal transport under the constraints of FCMs. Furthermore, by exploiting the dynamical interpretation of optimal transport under the FCM constraints, we determine the corresponding underlying dynamical process of the static cause-effect pair data. It provides a new dimension for describing static causal discovery tasks while enjoying more freedom for modeling the quantitative causal influences. In particular, we show that Additive Noise Models (ANMs) correspond to volume-preserving pressureless flows. Consequently, based on their velocity field divergence, we introduce a criterion for determining causal direction. With this criterion, we propose a novel optimal transport-based algorithm for ANMs which is robust to the choice of models and extend it to post-nonlinear models. Our method demonstrated state-of-the-art results on both synthetic and causal discovery benchmark datasets.", "keywords": "causal discovery;optimal transport;functional causal model", "primary_area": "", "supplementary_material": "/attachment/e894ce6b442b67ed4947ed85203f417026f8ba5d.zip", "author": "Ruibo Tu;Kun Zhang;Hedvig Kjellstrom;Cheng Zhang", "authorids": "~Ruibo_Tu1;~Kun_Zhang1;~Hedvig_Kjellstrom1;~Cheng_Zhang1", "gender": "M;M;F;F", "homepage": "https://www.kth.se/profile/ruibo/?l=en;http://www.andrew.cmu.edu/user/kunz1/;https://www.kth.se/profile/hedvig;http://cheng-zhang.org", "dblp": "223/4417;96/3115-1;k/HedvigKjellstrom;82/6384-5", "google_scholar": "https://scholar.google.se/citations?user=auIx_r0AAAAJ;RGoypN4AAAAJ;wr3CtKAAAAAJ;r40iAwIAAAAJ", "orcid": ";;0000-0002-5750-9655;", "linkedin": ";;hedvig-kjellstr%C3%B6m-aaa973/;", "or_profile": "~Ruibo_Tu1;~Kun_Zhang1;~Hedvig_Kjellstrom1;~Cheng_Zhang1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;Carnegie Mellon University;KTH Royal Institute of Technology;Microsoft", "aff_domain": "kth.se;cmu.edu;kth.se;microsoft.com", "position": "PhD student;Associate Professor;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\ntu2022optimal,\ntitle={Optimal Transport for Causal Discovery},\nauthor={Ruibo Tu and Kun Zhang and Hedvig Kjellstrom and Cheng Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qwBK94cP1y}\n}", "github": "", "project": "", "reviewers": "xS6T;GA77;pbTi", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;3;4", "correctness": "4;4;4", "technical_novelty": "3;4;4", "empirical_novelty": "3;3;3", "wc_summary_paper": "132;58;89", "wc_summary_review": "118;41;12", "wc_main_review": "216;317;508", "wc_review": "466;416;609", "wc_reply_reviewers": "110;54;0", "wc_reply_authors": "2011;854;893", "reply_reviewers": "2;1;0", "reply_authors": "7;4;2", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 93.0, 30.342489460600735 ], "wc_summary_review_avg": [ 57.0, 44.7288124888943 ], "wc_main_review_avg": [ 347.0, 121.08123994519822 ], "wc_review_avg": [ 497.0, 81.78426906604146 ], "wc_reply_reviewers_avg": [ 54.666666666666664, 44.90978611493144 ], "wc_reply_authors_avg": [ 1252.6666666666667, 536.4589660190444 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 4.333333333333333, 2.0548046676563256 ], 
"replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1189978967786124882&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=qwBK94cP1y", "email": "kth.se;cmu.edu;kth.se;microsoft.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "KTH Royal Institute of Technology;Carnegie Mellon University;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.kth.se;https://www.cmu.edu;https://www.microsoft.com", "aff_unique_abbr": "KTH;CMU;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Sweden;United States" }, { "title": "Random matrices in service of ML footprint: ternary random features with no performance loss", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6229", "id": "qwULHx9zld", "poster": "", "openreview": "https://openreview.net/forum?id=qwULHx9zld", "slides": "https://iclr.cc/virtual/2022/poster/6229", "video": "https://iclr.cc/virtual/2022/poster/6229", "author_site": "Hafiz Tiomoko Ali, Zhenyu Liao, Romain Couillet", "tldr": "", "abstract": "In this article, we investigate the spectral behavior of random features kernel matrices of the type ${\\bf K} = \\mathbb{E}_{{\\bf w}} \\left[\\sigma\\left({\\bf w}^{\\sf T}{\\bf x}_i\\right)\\sigma\\left({\\bf w}^{\\sf T}{\\bf x}_j\\right)\\right]_{i,j=1}^n$, with nonlinear function $\\sigma(\\cdot)$, data ${\\bf x}_1, \\ldots, {\\bf x}_n \\in \\mathbb{R}^p$, and random projection vector ${\\bf w} \\in \\mathbb{R}^p$ having i.i.d. entries. In a high-dimensional setting where the number of data $n$ and their dimension $p$ are both large and comparable, we show, under a Gaussian mixture model for the data, that the eigenspectrum of ${\\bf K}$ is independent of the distribution of the i.i.d.(zero-mean and unit-variance) entries of ${\\bf w}$, and only depends on $\\sigma(\\cdot)$ via its (generalized) Gaussian moments $\\mathbb{E}_{z\\sim \\mathcal N(0,1)}[\\sigma'(z)]$ and $\\mathbb{E}_{z\\sim \\mathcal N(0,1)}[\\sigma''(z)]$. As a result, for any kernel matrix ${\\bf K}$ of the form above, we propose a novel random features technique, called Ternary Random Features (TRFs), that (i) asymptotically yields the same limiting kernel as the original ${\\bf K}$ in a spectral sense and (ii) can be computed and stored much more efficiently, by wisely tuning (in a data-dependent manner) the function $\\sigma$ and the random vector ${\\bf w}$, both taking values in $\\{-1,0,1\\}$. The computation of the proposed random features requires no multiplication, and a factor of $b$ times less bits for storage compared to classical random features such as random Fourier features, with $b$ the number of bits to store full precision values. 
Besides, it appears in our experiments on real data that the substantial gains in computation and storage are accompanied by somewhat improved performance compared to state-of-the-art random features methods.", "keywords": "Computationally efficient methods;kernel methods;random features;random matrix theory", "primary_area": "", "supplementary_material": "/attachment/4a8c158efe92889493fd4b09b921b0d957d7dcb2.zip", "author": "Hafiz Tiomoko Ali;Zhenyu Liao;Romain Couillet", "authorids": "~Hafiz_Tiomoko_Ali1;~Zhenyu_Liao1;~Romain_Couillet1", "gender": "M;M;", "homepage": ";https://zhenyu-liao.github.io/;", "dblp": "177/9093;49/10218-1;00/2812", "google_scholar": ";https://scholar.google.fr/citations?user=SPYhJV8AAAAJ;", "orcid": ";0000-0002-1915-8559;", "linkedin": ";;", "or_profile": "~Hafiz_Tiomoko_Ali1;~Zhenyu_Liao1;~Romain_Couillet1", "aff": ";Huazhong University of Science and Technology;University of Grenoble-Alpes", "aff_domain": ";hust.edu.cn;univ-grenoble-alpes.fr", "position": ";Associate Professor;Full Professor", "bibtex": "@inproceedings{\nali2022random,\ntitle={Random matrices in service of {ML} footprint: ternary random features with no performance loss},\nauthor={Hafiz Tiomoko Ali and Zhenyu Liao and Romain Couillet},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qwULHx9zld}\n}", "github": "", "project": "", "reviewers": "c4jW;RdKc;AjvY;2gn7", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "2;3;3;2", "correctness": "3;3;4;3", "technical_novelty": "2;4;3;4", "empirical_novelty": "2;3;4;4", "wc_summary_paper": "147;78;132;231", "wc_summary_review": "38;45;54;51", "wc_main_review": "245;401;1077;294", "wc_review": "430;524;1263;576", "wc_reply_reviewers": "0;0;114;0", "wc_reply_authors": "238;586;547;443", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 147.0, 54.868023474515645 ], "wc_summary_review_avg": [ 47.0, 6.123724356957945 ], "wc_main_review_avg": [ 504.25, 335.45444921777386 ], "wc_review_avg": [ 698.25, 330.2305066162119 ], "wc_reply_reviewers_avg": [ 28.5, 49.363448015713004 ], "wc_reply_authors_avg": [ 453.5, 134.95276951585691 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5706402488540408892&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=qwULHx9zld", "email": ";hust.edu.cn;univ-grenoble-alpes.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Huazhong University of Science and Technology;University of Grenoble-Alpes", "aff_unique_dep": ";", "aff_unique_url": "http://www.hust.edu.cn;https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "HUST;UGA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;France" }, { "id": "qy4uO5c_OB", "title": "Efficient Bi-level Optimization for Non-smooth Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Bi-level optimization plays a key role in a lot of
machine learning applications. However, existing state-of-the-art bi-level optimization methods are limited to smooth or some specific non-smooth lower-level problems. Even worse, most of them depend on approximating hypergradients to update the upper-level variable, which is the inherent reason for their inefficiency. Currently, achieving a generalized and efficient optimization algorithm for bi-level problems with a non-smooth, even non-Lipschitz continuous lower-level objective is still an open question to the best of our knowledge. To address these challenging problems, in this paper, we propose a new bi-level optimization algorithm based on smoothing and penalty techniques. Specifically, we first produce a sequence of smoothed lower-level objectives with an exponentially decaying smoothing parameter for the non-smooth lower-level problem. Then, we transform the smoothed bi-level optimization into an unconstrained penalty problem by replacing the smoothed sub-problem with its first-order necessary conditions. Finally, we update the upper- and lower-level variables alternately with doubly stochastic gradients of the unconstrained penalty problem. Importantly, we provide a theoretical analysis to show that our method can converge to a stationary point of the original non-smooth bi-level problem if the lower-level problem is convex, and we give the necessary condition of the original problem if the lower-level problem is nonconvex. We compare our method with existing state-of-the-art bi-level optimization methods on three tasks, and all the experimental results demonstrate that our method is superior to the others in terms of accuracy and efficiency.", "keywords": "bilevel optimization;nonsmooth", "primary_area": "", "supplementary_material": "/attachment/8df1094942a1ff691c244204a1bacd6acabc7285.zip", "author": "Wanli Shi;Heng Huang;Bin Gu", "authorids": "~Wanli_Shi1;~Heng_Huang1;~Bin_Gu1", "gender": "M;M;M", "homepage": ";https://www.cs.umd.edu/~heng/;https://mbzuai.ac.ae/study/faculty/bin-gu/", "dblp": "245/9064;03/281;29/1758-1", "google_scholar": "Li38vbwAAAAJ;4OqLaDwAAAAJ;Vo8OgCgAAAAJ", "orcid": ";;0000-0001-6049-1815", "linkedin": ";;", "or_profile": "~Wanli_Shi1;~Heng_Huang1;~Bin_Gu1", "aff": "Nanjing University of Information Science and Technology;University of Pittsburgh;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "nuist.edu.cn;pitt.edu;mbzuai.ac.ae", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nshi2022efficient,\ntitle={Efficient Bi-level Optimization for Non-smooth Optimization},\nauthor={Wanli Shi and Heng Huang and Bin Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=qy4uO5c_OB}\n}", "github": "", "project": "", "reviewers": "2NWN;iwR9;5gvu", "site": "https://openreview.net/forum?id=qy4uO5c_OB", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "2;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "41;50;63", "wc_summary_review": "46;21;51", "wc_main_review": "522;297;206", "wc_review": "609;368;320", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 51.333333333333336, 9.030811456096044 ],
"wc_summary_review_avg": [ 39.333333333333336, 13.12334645668635 ], "wc_main_review_avg": [ 341.6666666666667, 132.8164983058288 ], "wc_review_avg": [ 432.3333333333333, 126.44981437533056 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:48Vc6mVTyl4J:scholar.google.com/&scioq=Efficient+Bi-level+Optimization+for+Non-smooth+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Nanjing University of Information Science and Technology;University of Pittsburgh;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "http://www.nuist.edu.cn;https://www.pitt.edu;https://mbzuai.ac.ae", "aff_unique_abbr": ";Pitt;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;United States;United Arab Emirates" }, { "title": "CrowdPlay: Crowdsourcing Human Demonstrations for Offline Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6086", "id": "qyTBxTztIpQ", "poster": "", "openreview": "https://openreview.net/forum?id=qyTBxTztIpQ", "slides": "https://iclr.cc/virtual/2022/poster/6086", "video": "https://iclr.cc/virtual/2022/poster/6086", "author_site": "Matthias Gerstgrasser, Rakshit Trivedi, David Parkes", "tldr": "", "abstract": "Crowdsourcing has been instrumental for driving AI advances that rely on large-scale data. At the same time, reinforcement learning has seen rapid progress through benchmark environments that strike a balance between tractability and real-world complexity, such as ALE and OpenAI Gym. In this paper, we aim to fill a gap at the intersection of these two: The use of crowdsourcing to generate large-scale human demonstration data in the support of advancing research into imitation learning and offline learning.\nTo this end, we present CrowdPlay, a complete crowdsourcing pipeline for any standard RL environment including OpenAI Gym (made available under an open-source license); a large-scale publicly available crowdsourced dataset of human gameplay demonstrations in Atari 2600 games, including multimodal behavior and human-human and human-AI multiagent data; offline learning benchmarks with extensive human data evaluation; and a detailed study of incentives, including real-time feedback to drive high quality data.\nWe hope that this will drive the improvement in design of algorithms that account for the complexity of human, behavioral data and thereby enable a step forward in direction of effective learning for real-world settings. Our code and dataset are available at https://mgerstgrasser.github.io/crowdplay/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthias Gerstgrasser;Rakshit Trivedi;David C. 
Parkes", "authorids": "~Matthias_Gerstgrasser1;~Rakshit_Trivedi1;~David_C._Parkes1", "gender": ";;M", "homepage": "https://matthias.gerstgrasser.net/;;https://parkes.seas.harvard.edu/", "dblp": "182/1338;;p/DavidCParkes.html", "google_scholar": "qEirpPYAAAAJ;;JUn8PgwAAAAJ", "orcid": ";;0000-0002-2701-3464", "linkedin": ";;", "or_profile": "~Matthias_Gerstgrasser1;~Rakshit_Trivedi1;~David_C._Parkes1", "aff": "Harvard University;;Google", "aff_domain": "harvard.edu;;deepmind.com", "position": "Postdoc;;Senior Research Scientist", "bibtex": "@inproceedings{\ngerstgrasser2022crowdplay,\ntitle={CrowdPlay: Crowdsourcing Human Demonstrations for Offline Learning},\nauthor={Matthias Gerstgrasser and Rakshit Trivedi and David C. Parkes},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=qyTBxTztIpQ}\n}", "github": "", "project": "", "reviewers": "mjxC;1Le5;YicS", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "3;4;4", "empirical_novelty": "3;3;2", "wc_summary_paper": "47;77;50", "wc_summary_review": "91;46;55", "wc_main_review": "468;187;406", "wc_review": "606;310;511", "wc_reply_reviewers": "338;0;76", "wc_reply_authors": "2415;578;1059", "reply_reviewers": "2;0;2", "reply_authors": "6;1;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 58.0, 13.490737563232042 ], "wc_summary_review_avg": [ 64.0, 19.44222209522358 ], "wc_main_review_avg": [ 353.6666666666667, 120.53860607936178 ], "wc_review_avg": [ 475.6666666666667, 123.39728071918316 ], "wc_reply_reviewers_avg": [ 138.0, 144.78489792332164 ], "wc_reply_authors_avg": [ 1350.6666666666667, 777.7936030136073 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 3.3333333333333335, 2.0548046676563256 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8800852621014155879&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=qyTBxTztIpQ", "email": "harvard.edu;;deepmind.com", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Harvard University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.harvard.edu;https://www.google.com", "aff_unique_abbr": "Harvard;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "qynB_fAt5TQ", "title": "Centroid Approximation for Bootstrap", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bootstrap is a principled and powerful frequentist statistical tool for uncertainty quantification. Unfortunately, standard bootstrap methods are computationally intensive due to the need of drawing a large i.i.d. bootstrap sample to approximate the ideal bootstrap distribution; this largely hinders their application in large-scale machine learning, especially deep learning problems. In this work, we propose an efficient method to explicitly \\emph{optimize} a small set of high quality ``centroid'' points to better approximate the ideal bootstrap distribution. 
We achieve this by minimizing a simple objective function that is asymptotically equivalent to the Wasserstein distance to the ideal bootstrap distribution. This allows us to provide an accurate estimation of uncertainty with a small number of bootstrap centroids, outperforming the naive i.i.d. sampling approach. Empirically, we show that our method can boost the performance of bootstrap in a variety of applications.", "keywords": "bootstrap;centroid approximation;uncertainty", "primary_area": "", "supplementary_material": "/attachment/cc2eb819385545ccb8191ccabc809170f8dc3929.zip", "author": "Mao Ye;qiang liu", "authorids": "~Mao_Ye11;~qiang_liu4", "gender": "M;M", "homepage": "https://lushleaf.github.io/;https://www.cs.utexas.edu/~lqiang/", "dblp": "36/2301;61/3234-1", "google_scholar": "V5gL_H0AAAAJ;https://scholar.google.com.tw/citations?user=2qDh4WUAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Mao_Ye11;~Qiang_Liu1", "aff": "University of Texas, Austin;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nye2022centroid,\ntitle={Centroid Approximation for Bootstrap},\nauthor={Mao Ye and qiang liu},\nyear={2022},\nurl={https://openreview.net/forum?id=qynB_fAt5TQ}\n}", "github": "", "project": "", "reviewers": "y4AP;Uq9L;oMBb;5nLX", "site": "https://openreview.net/forum?id=qynB_fAt5TQ", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;1", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "97;60;183;76", "wc_summary_review": "316;82;409;27", "wc_main_review": "784;424;389;353", "wc_review": "1197;566;981;456", "wc_reply_reviewers": "298;0;0;0", "wc_reply_authors": "1088;465;420;442", "reply_reviewers": "2;0;0;0", "reply_authors": "4;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 104.0, 47.460509900337144 ], "wc_summary_review_avg": [ 208.5, 158.66710434113304 ], "wc_main_review_avg": [ 487.5, 173.01517274505147 ], "wc_review_avg": [ 800.0, 301.43904856537745 ], "wc_reply_reviewers_avg": [ 74.5, 129.03778516388135 ], "wc_reply_authors_avg": [ 603.75, 280.0342612967206 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4971330963630491456&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "qynwf18DgXM", "title": "Manifold Micro-Surgery with Linearly Nearly Euclidean Metrics", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Ricci flow is a method of manifold surgery, which can trim manifolds to be more regular. However, in most cases, the Ricci flow tends to develop singularities and lead to divergence of the solution.
In this paper, we propose linearly nearly Euclidean metrics to assist manifold micro-surgery; that is, we prove the dynamical stability and convergence of such metrics under the Ricci-DeTurck flow. From the information geometry and mirror descent points of view, we give an approximation of the steepest descent gradient flow on the linearly nearly Euclidean manifold with dynamical stability. In practice, the regular shrinking or expanding of Ricci solitons with linearly nearly Euclidean metrics will provide a geometric optimization method for the solution on a manifold.", "keywords": "Geometry;Ricci flow;Neural network;Metric learning;Information geometry", "primary_area": "", "supplementary_material": "", "author": "Jun Chen;Tianxin Huang;Wenzhou Chen;Yong Liu", "authorids": "~Jun_Chen9;~Tianxin_Huang1;~Wenzhou_Chen1;~Yong_Liu11", "gender": "M;M;M;M", "homepage": ";https://tianxinhuang.github.io/;;https://person.zju.edu.cn/en/yongliu", "dblp": ";251/3784;;29/4867-7", "google_scholar": "YKc2O78AAAAJ;https://scholar.google.com.hk/citations?user=Fg7WYfcAAAAJ;0QH8EecAAAAJ;https://scholar.google.com.hk/citations?user=qYcgBbEAAAAJ", "orcid": "0000-0001-6568-8801;;;0000-0003-4822-8939", "linkedin": ";;;", "or_profile": "~Jun_Chen9;~Tianxin_Huang1;~Wenzhou_Chen1;~Yong_Liu11", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@misc{\nchen2022manifold,\ntitle={Manifold Micro-Surgery with Linearly Nearly Euclidean Metrics},\nauthor={Jun Chen and Tianxin Huang and Wenzhou Chen and Yong Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=qynwf18DgXM}\n}", "github": "", "project": "", "reviewers": "g1ct;ibvS;HjEu", "site": "https://openreview.net/forum?id=qynwf18DgXM", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;2;3", "correctness": "2;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "1;0;0", "wc_summary_paper": "58;21;52", "wc_summary_review": "31;148;12", "wc_main_review": "185;323;244", "wc_review": "274;492;308", "wc_reply_reviewers": "0;516;0", "wc_reply_authors": "176;567;201", "reply_reviewers": "0;2;0", "reply_authors": "1;3;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 0.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 43.666666666666664, 16.21384867602041 ], "wc_summary_review_avg": [ 63.666666666666664, 60.13503323539634 ], "wc_main_review_avg": [ 250.66666666666666, 56.53514148051831 ], "wc_review_avg": [ 358.0, 95.76359781601079 ], "wc_reply_reviewers_avg": [ 172.0, 243.24473272817235 ], "wc_reply_authors_avg": [ 314.6666666666667, 178.71827612816273 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YUt2CuwrVgsJ:scholar.google.com/&scioq=Manifold+Micro-Surgery+with+Linearly+Nearly+Euclidean+Metrics&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Zhejiang University",
"aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "qyzTEWWM0Pp", "title": "Multiresolution Equivariant Graph Variational Autoencoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose Multiresolution Equivariant Graph Variational Autoencoders (MGVAE), the first hierarchical generative model to learn and generate graphs in a multiresolution and equivariant manner. At each resolution level, MGVAE employs higher order message passing to encode the graph while learning to partition it into mutually exclusive clusters and coarsening into a lower resolution that eventually creates a hierarchy of latent distributions. MGVAE then constructs a hierarchical generative model to variationally decode into a hierarchy of coarsened graphs. Importantly, our proposed framework is end-to-end permutation equivariant with respect to node ordering. MGVAE achieves competitive results with several generative tasks including general graph generation, molecular generation, unsupervised molecular representation learning to predict molecular properties, link prediction on citation graphs, and graph-based image generation.", "keywords": "group equivariant model;graph neural network;hierarchical generative model;variational autoencoder;graph generation;link prediction;unsupervised representation learning", "primary_area": "", "supplementary_material": "", "author": "Truong Son Hy;Risi Kondor", "authorids": "~Truong_Son_Hy1;~Risi_Kondor1", "gender": "M;M", "homepage": "https://hytruongson.github.io/HySonLab/;http://people.cs.uchicago.edu/~risi/", "dblp": "213/7552;90/869", "google_scholar": "JiKBo6UAAAAJ;v12-jLUAAAAJ", "orcid": "0000-0002-5092-3757;", "linkedin": "truong-son-h-4a9185b6/;", "or_profile": "~Truong_Son_Hy1;~Risi_Kondor1", "aff": "University of Chicago;University of Chicago", "aff_domain": "uchicago.edu;uchicago.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nhy2022multiresolution,\ntitle={Multiresolution Equivariant Graph Variational Autoencoder},\nauthor={Truong Son Hy and Risi Kondor},\nyear={2022},\nurl={https://openreview.net/forum?id=qyzTEWWM0Pp}\n}", "github": "", "project": "", "reviewers": "PA3h;XesM;dMFu;KkPB;867y;9mH3", "site": "https://openreview.net/forum?id=qyzTEWWM0Pp", "pdf_size": 0, "recommendation": "3;5;5;5;5;6", "confidence": "4;4;4;4;3;3", "correctness": "2;3;2;4;4;3", "technical_novelty": "3;2;2;3;3;3", "empirical_novelty": "2;2;0;3;1;3", "wc_summary_paper": "22;83;116;86;142;37", "wc_summary_review": "43;16;57;14;210;77", "wc_main_review": "230;226;577;450;393;390", "wc_review": "295;325;750;550;745;504", "wc_reply_reviewers": "0;36;0;0;0;0", "wc_reply_authors": "291;330;452;559;312;230", "reply_reviewers": "0;1;0;0;0;0", "reply_authors": "1;1;1;1;1;1", "recommendation_avg": [ 4.833333333333333, 0.8975274678557505 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.8333333333333333, 1.0671873729054748 ], "wc_summary_paper_avg": [ 81.0, 41.617304093369626 ], "wc_summary_review_avg": [ 69.5, 66.58015219768326 ], "wc_main_review_avg": [ 377.6666666666667, 122.58421141765724 ], "wc_review_avg": [ 528.1666666666666, 179.31854028206033 ], "wc_reply_reviewers_avg": [ 6.0, 13.416407864998739 ], 
"wc_reply_authors_avg": [ 362.3333333333333, 110.22502236586553 ], "reply_reviewers_avg": [ 0.16666666666666666, 0.372677996249965 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5252257314388904, "corr_recommendation_correctness": 0.4548588261473422, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9948847485797288502&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0", "aff_unique_norm": "University of Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.uchicago.edu", "aff_unique_abbr": "UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "qzT7ONeJKaK", "title": "Confident Data-free Model Stealing for Black-box Adversarial Attacks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep machine learning models are increasingly deployed in the wild, subject to adversarial attacks. White-box model attacks assume to have full knowledge of the deployed target models, whereas the black-box models need to infer the target model via curated labeled dataset or sending abundant data queries and launch attacks. The challenge of black-box lies in how to acquire data for querying target models and effectively learn the substitute model using a minimum number of query data, which can be real or synthetic. In this paper, we propose an effective and confident data-free black-box attack, CODFE, which steals target model by queries of synthetically generated data. The core of our attack is a model stealing optimization consisting of two collaborating models (i) substitute model which imitates the target model and (ii) generator which generates most representative data to maximize the confidence of substitute model. We propose a novel training procedure that steers the synthesizing direction based on the confidence of substitute model and exploit a given set of synthetically generated images by multiple training iterations. We show the theoretical convergence of the proposed model stealing optimization and empirically evaluate its success rate on three datasets. Our results show that the accuracy of substitute models and attack success rate can be up to 56% and 34% higher than the state of the art data-free black-box attacks.", "keywords": "machine learning;black-box attacks;data-free model stealing", "primary_area": "", "supplementary_material": "/attachment/0600897f120d6ea31d460779c11645858d109a60.zip", "author": "Chi Hong;Jiyue Huang;Lydia Y. Chen", "authorids": "~Chi_Hong1;~Jiyue_Huang1;~Lydia_Y._Chen1", "gender": "male;F;F", "homepage": ";;https://www.lydiaychen.com/", "dblp": "202/1780;231/1059;https://dblp.uni-trier.de/pers/c/Chen:Lydia_Y=.html", "google_scholar": "zppla80AAAAJ;ICKOpU4AAAAJ;https://scholar.google.ch/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chi_Hong1;~Jiyue_Huang1;~Lydia_Y._Chen1", "aff": "Delft University of Technology;Delft University of Technology;Delft University of Technology", "aff_domain": "tudelft.nl;tudelft.nl;tudelft.nl", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nhong2022confident,\ntitle={Confident Data-free Model Stealing for Black-box Adversarial Attacks},\nauthor={Chi Hong and Jiyue Huang and Lydia Y. 
Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=qzT7ONeJKaK}\n}", "github": "", "project": "", "reviewers": "zzK9;ja4X;MMsk;oAq7", "site": "https://openreview.net/forum?id=qzT7ONeJKaK", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;2;5", "correctness": "2;2;3;3", "technical_novelty": "3;3;2;2", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "78;74;48;74", "wc_summary_review": "45;76;22;34", "wc_main_review": "810;467;131;400", "wc_review": "933;617;201;508", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.5, 11.947803145348521 ], "wc_summary_review_avg": [ 44.25, 20.054612935681405 ], "wc_main_review_avg": [ 452.0, 241.93697526422042 ], "wc_review_avg": [ 564.75, 261.6642648509727 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:f0uPhAJ5ieEJ:scholar.google.com/&scioq=Confident+Data-free+Model+Stealing+for+Black-box+Adversarial+Attacks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "id": "r4PibJdCyn", "title": "TotalRecall: A Bidirectional Candidates Generation Framework for Large Scale Recommender \\& Advertising Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recommender (RS) and Advertising/Marketing Systems (AS) play key roles in E-commerce companies like Amazon and Alibaba. RS needs to generate thousands of item candidates for each user ($u2i$), while AS needs to identify thousands or even millions of high-potential users for given items so that the merchant can advertise these items efficiently with a limited budget ($i2u$). This paper proposes an elegant bidirectional candidates generation framework that can serve both purposes at once. Besides, our framework is also superior in these aspects: $i).$ Our framework can easily incorporate many DNN-architectures of RS ($u2i$), and increase the HitRate and Recall by a large margin. $ii).$ We achieve much better results in $i2u$ candidates generation compared to strong baselines.
$iii).$ We empirically show that our framework can diversify the generated candidates, and ensure fast convergence to better results.", "keywords": "Recommender System;Advertising System;Collaborative filtering;Matrix Factorization;Contrastive Learning;Candidates Generation;Embeddings", "primary_area": "", "supplementary_material": "", "author": "Qifang Zhao;Yu Jiang;Yuqing Liu;Meng Du;Qinghui Sun;Chao Xu;huan xu;Zhongyao Wang", "authorids": "~Qifang_Zhao1;jiangyu.jiangyu@alibaba-inc.com;liuyq89@mail2.sysu.edu.cn;dmmeng.dm@alibaba-inc.com;yuyang.sqh@alibaba-inc.com;mudao.xc@alibaba-inc.com;~huan_xu1;zhongyao.wangzy@alibaba-inc.com", "gender": ";;;;;;M;", "homepage": ";;;;;;;", "dblp": ";;;;;;35/2843;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": "zhaoqf123/;;;;;;;", "or_profile": "~Qifang_Zhao1;jiangyu.jiangyu@alibaba-inc.com;liuyq89@mail2.sysu.edu.cn;dmmeng.dm@alibaba-inc.com;yuyang.sqh@alibaba-inc.com;mudao.xc@alibaba-inc.com;~huan_xu1;zhongyao.wangzy@alibaba-inc.com", "aff": "Alibaba Group;;;;;;Georgia Institute of Technology;", "aff_domain": "alibaba-inc.com;;;;;;gatech.edu;", "position": "Researcher;;;;;;Assistant Professor;", "bibtex": "@misc{\nzhao2022totalrecall,\ntitle={TotalRecall: A Bidirectional Candidates Generation Framework for Large Scale Recommender {\\textbackslash}\\& Advertising Systems},\nauthor={Qifang Zhao and Yu Jiang and Yuqing Liu and Meng Du and Qinghui Sun and Chao Xu and huan xu and Zhongyao Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=r4PibJdCyn}\n}", "github": "", "project": "", "reviewers": "qfRw;knEH;eQ3i;vmLg", "site": "https://openreview.net/forum?id=r4PibJdCyn", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "3;2;3;2", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "90;58;57;73", "wc_summary_review": "23;34;25;7", "wc_main_review": "345;1996;212;205", "wc_review": "458;2088;294;285", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "558;2601;283;117", "reply_reviewers": "0;0;0;0", "reply_authors": "1;5;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 69.5, 13.425721582097552 ], "wc_summary_review_avg": [ 22.25, 9.730750228014282 ], "wc_main_review_avg": [ 689.5, 756.3678007424695 ], "wc_review_avg": [ 781.25, 757.58873242677 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 889.75, 1000.4652355279518 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W53N12-dAmAJ:scholar.google.com/&scioq=TotalRecall:+A+Bidirectional+Candidates+Generation+Framework+for+Large+Scale+Recommender+%5C%26+Advertising+Systems&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Alibaba Group;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;https://www.gatech.edu", "aff_unique_abbr": "Alibaba;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "r5hq-Ooh_Ba", "title": "MutexMatch: Semi-supervised Learning with Mutex-based Consistency Regularization", "track": 
"main", "status": "Withdraw", "tldr": "", "abstract": "The core issue in semi-supervised learning (SSL) lies in how to effectively leverage unlabeled data, whereas most existing methods usually concentrate on the utilization of high-confidence samples yet seldom fully explore the usage of low-confidence samples. Early SSL methods mostly require low-confidence samples to optimize the same loss function as high-confidence samples, but this setting might largely challenge the low-confidence samples especially at the early training stage. In this paper, we aim to utilize low-confidence samples in a novel way, which is realized by our proposed mutex-based consistency regularization, namely MutexMatch. To be specific, the high-confidence samples are required to exactly predict \"What it is\" by conventional True-Positive Classifier, while the low-confidence samples, for a much simpler goal, are employed to predict \"What it is not\" by True-Negative Classifier with ease. In this way, we not only mitigate the pseudo-labeling errors but also make full use of the low-confidence unlabeled data in the training stage. The proposed MutexMatch achieves superior performance on multiple benchmark datasets, i.e., CIFAR-10, CIFAR-100, SVHN, and STL-10. Particularly, our method shows further superiority under few quantities of labeled data, e.g., 91.77% accuracy with only 20 labeled data on CIFAR-10.", "keywords": "Semi-supervised learning;Barely supervised Learning;Mutex-based consistency regularization", "primary_area": "", "supplementary_material": "/attachment/98020842454c0c6175cf4428aaccb02f7beecc94.zip", "author": "Yue Duan;Zhen Zhao;Lei Qi;Lei Wang;Luping Zhou;Yinghuan Shi;Yang Gao", "authorids": "~Yue_Duan1;~Zhen_Zhao4;~Lei_Qi1;~Lei_Wang13;~Luping_Zhou3;~Yinghuan_Shi3;~Yang_Gao3", "gender": "M;M;M;M;F;M;M", "homepage": "https://njuyued.github.io/;http://zhaozhen.me/;http://palm.seu.edu.cn/qilei/;https://sites.google.com/view/lei-hs-wang;https://sites.google.com/view/lupingzhou;https://cs.nju.edu.cn/shiyh/;https://cs.nju.edu.cn/gaoyang/", "dblp": "10/9994;29/1773-1;15/2464-1;w/LeiWang1;45/933;30/7184;89/4402-1", "google_scholar": "https://scholar.google.is/citations?hl=zh-CN;7mpuhO8AAAAJ;7mm8iZwAAAAJ;5ClujcoAAAAJ;https://scholar.google.com.au/citations?user=SgofT2MAAAAJ;m6BKDUMAAAAJ;https://scholar.google.com.tw/citations?user=CJwLwzQAAAAJ", "orcid": "0000-0003-4131-7146;0000-0002-0796-4078;0000-0001-7091-0702;0000-0002-0961-0441;;;", "linkedin": ";;;;;;", "or_profile": "~Yue_Duan1;~Zhen_Zhao4;~Lei_Qi1;~Lei_Wang13;~Luping_Zhou3;~Yinghuan_Shi3;~Yang_Gao3", "aff": "Nanjing University;University of Sydney;Nanjing University;University of Wollonong;University of Sydney;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;usyd.edu.au;nju.edu.cn;uow.edu.au;sydney.edu.au;nju.edu.cn;nju.edu.cn", "position": "MS student;PhD student;PhD student;Associate Professor;Associate Professor;Associate Professor;Full Professor", "bibtex": "@misc{\nduan2022mutexmatch,\ntitle={MutexMatch: Semi-supervised Learning with Mutex-based Consistency Regularization},\nauthor={Yue Duan and Zhen Zhao and Lei Qi and Lei Wang and Luping Zhou and Yinghuan Shi and Yang Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=r5hq-Ooh_Ba}\n}", "github": "", "project": "", "reviewers": "y2Kv;7zCe;3Bqn", "site": "https://openreview.net/forum?id=r5hq-Ooh_Ba", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;3;4", "correctness": "3;4;4", "technical_novelty": "4;2;3", "empirical_novelty": "3;3;0", "wc_summary_paper": 
"75;91;67", "wc_summary_review": "120;13;33", "wc_main_review": "425;161;179", "wc_review": "620;265;279", "wc_reply_reviewers": "381;498;0", "wc_reply_authors": "2527;2268;992", "reply_reviewers": "2;2;0", "reply_authors": "4;5;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 77.66666666666667, 9.977753031397176 ], "wc_summary_review_avg": [ 55.333333333333336, 46.44949467492144 ], "wc_main_review_avg": [ 255.0, 120.43255373859678 ], "wc_review_avg": [ 388.0, 164.14830692598284 ], "wc_reply_reviewers_avg": [ 293.0, 212.61702659947062 ], "wc_reply_authors_avg": [ 1929.0, 670.9431173107499 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 3.6666666666666665, 1.247219128924647 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14399257337953274610&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0;2;1;0;0", "aff_unique_norm": "Nanjing University;University of Sydney;University of Wollongong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nju.edu.cn;https://www.sydney.edu.au;https://www.uow.edu.au", "aff_unique_abbr": "Nanjing U;USYD;UOW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;0;0", "aff_country_unique": "China;Australia" }, { "title": "MaGNET: Uniform Sampling from Deep Generative Network Manifolds Without Retraining", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6820", "id": "r5qumLiYwf9", "poster": "", "openreview": "https://openreview.net/forum?id=r5qumLiYwf9", "slides": "https://iclr.cc/virtual/2022/poster/6820", "video": "https://iclr.cc/virtual/2022/poster/6820", "author_site": "Ahmed Imtiaz Humayun, Randall Balestriero, Richard Baraniuk", "tldr": "", "abstract": "Deep Generative Networks (DGNs) are extensively employed in Generative Adversarial Networks (GANs), Variational Autoencoders (VAEs), and their variants to approximate the data manifold, and data distribution on that manifold. However, training samples are often obtained based on preferences, costs, or convenience producing artifacts in the empirical data distribution e.g. the large fraction of smiling faces in the CelebA dataset or the large fraction of dark-haired individuals in FFHQ). {\\em These inconsistencies will be reproduced when sampling from the trained DGN, which has far-reaching potential implications for fairness, data augmentation, anomaly detection, domain adaptation, and beyond.} In response, we develop a differential geometry based sampler -coined MaGNET- that, given any trained DGN, produces samples that are uniformly distributed on the learned manifold. We prove theoretically and empirically that our technique produces a uniform distribution on the manifold regardless of the training set distribution. We perform a range of experiments on various datasets and DGNs. 
One of them considers the state-of-the-art StyleGAN2 trained on FFHQ dataset, where uniform sampling via MaGNET increases distribution precision \\& recall by 4.12\\% \\& 3.01\\% and decreases gender bias by 41.2\\%, without requiring labels or retraining.", "keywords": "Deep Generative Networks;Uniform Sampling;Fairness;Data Augmentation", "primary_area": "", "supplementary_material": "", "author": "Ahmed Imtiaz Humayun;Randall Balestriero;Richard Baraniuk", "authorids": "~Ahmed_Imtiaz_Humayun1;~Randall_Balestriero1;~Richard_Baraniuk1", "gender": "M;M;", "homepage": "https://imtiazhumayun.github.io;https://randallbalestriero.github.io/;http://richb.rice.edu/", "dblp": "222/1771;175/5364;32/2804", "google_scholar": "wJ2HUn4AAAAJ;S1x_xqcAAAAJ;https://scholar.google.com.tw/citations?user=N-BBA20AAAAJ", "orcid": ";;", "linkedin": ";randallbalestriero/;richard-baraniuk", "or_profile": "~Ahmed_Imtiaz_Humayun1;~Randall_Balestriero1;~Richard_Baraniuk1", "aff": "Rice University;Meta Facebook;William Marsh Rice University", "aff_domain": "rice.edu;facebook.com;rice.edu", "position": "PhD student;Postdoc;C. Sidney Burrus Professor", "bibtex": "@inproceedings{\nhumayun2022magnet,\ntitle={Ma{GNET}: Uniform Sampling from Deep Generative Network Manifolds Without Retraining},\nauthor={Ahmed Imtiaz Humayun and Randall Balestriero and Richard Baraniuk},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=r5qumLiYwf9}\n}", "github": "", "project": "", "reviewers": "TJkz;5nSv;orST", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "4;2;4", "empirical_novelty": "2;2;3", "wc_summary_paper": "80;24;168", "wc_summary_review": "66;16;52", "wc_main_review": "398;364;172", "wc_review": "544;404;392", "wc_reply_reviewers": "0;21;0", "wc_reply_authors": "944;1020;424", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 90.66666666666667, 59.269628722380986 ], "wc_summary_review_avg": [ 44.666666666666664, 21.06075866524175 ], "wc_main_review_avg": [ 311.3333333333333, 99.49651026822778 ], "wc_review_avg": [ 446.6666666666667, 68.99919484232326 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 796.0, 264.8672623535545 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11229096232217500232&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=r5qumLiYwf9", "email": "rice.edu;facebook.com;rice.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Rice University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.rice.edu;https://meta.com", "aff_unique_abbr": "Rice;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "r88Isj2alz", "title": "NODEAttack: 
Adversarial Attack on the Energy Consumption of Neural ODEs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, Neural ODE (Ordinary Differential Equation) models have been proposed, which use ordinary differential equation solving to predict the output of a neural network. Due to their low memory usage, Neural ODE models can be considered an alternative that can be deployed in resource-constrained devices (e.g., IoT devices, mobile devices). However, to deploy a Deep Learning model in resource-constrained devices, low inference energy cost is also required along with low memory cost. Unlike the memory cost, the energy consumption of Neural ODEs during inference can be adaptive because of the adaptive nature of the ODE solvers. Attackers can leverage this adaptive behaviour of Neural ODEs to attack their energy consumption. However, energy-based attack scenarios have not been explored against Neural ODEs. To show the vulnerability of Neural ODEs against adversarial energy-based attacks, we propose NODEAttack.\nThe objective of NODEAttack is to generate adversarial inputs that require more ODE solver computations, thereby increasing the inference-time energy consumption of Neural ODEs.\nOur extensive evaluation on two datasets and two popular ODE solvers shows that the samples generated through NODEAttack can consume up to 168% more energy during inference than benign test data on average. Our evaluation also shows that the attack transfers across solvers and architectures.\nAlso, we perform a case study showing the impact of the generated adversarial examples, which shows that NODEAttack-generated adversarial examples can decrease the efficiency of an object-recognition-based mobile application by 50%.
", "keywords": "Adversarial Machine Learning;Energy Consumption", "primary_area": "", "supplementary_material": "/attachment/b62f7cfc795cf0ff88e95b43652baf4bfd326ee3.zip", "author": "Mirazul Haque;Simin Chen;Wasif Arman Haque;Cong Liu;Wei Yang", "authorids": "~Mirazul_Haque1;~Simin_Chen1;wah180000@utdallas.edu;~Cong_Liu2;~Wei_Yang7", "gender": "M;;;;", "homepage": "https://www.linkedin.com/in/mirazul-haque-b4b331a6/;http://seekingdream.github.io/;;https://intra.ece.ucr.edu/~cong/;", "dblp": "272/0796;;;https://dblp.uni-trier.de/pers/l/Liu_0005:Cong.html;", "google_scholar": "1YLCVDgAAAAJ;bgCd-_YAAAAJ;;vpc4bggAAAAJ;", "orcid": ";;;;", "linkedin": "mirazul-haque-b4b331a6;;;;", "or_profile": "~Mirazul_Haque1;~Simin_Chen1;wah180000@utdallas.edu;~Cong_Liu2;~Wei_Yang7", "aff": "University of Texas at Dallas;University of Texas at Dallas ;;University of Texas, Dallas;", "aff_domain": "utdallas.edu;utdallas.edu;;utdallas.edu;", "position": "PhD student;PhD student;;Associate Professor;", "bibtex": "@misc{\nhaque2022nodeattack,\ntitle={{NODEA}ttack: Adversarial Attack on the Energy Consumption of Neural {ODE}s},\nauthor={Mirazul Haque and Simin Chen and Wasif Arman Haque and Cong Liu and Wei Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=r88Isj2alz}\n}", "github": "", "project": "", "reviewers": "byMA;BRCY;v5cj;BU3E", "site": "https://openreview.net/forum?id=r88Isj2alz", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "79;64;43;142", "wc_summary_review": "35;76;9;38", "wc_main_review": "524;217;168;211", "wc_review": "638;357;220;391", "wc_reply_reviewers": "0;0;0;15", "wc_reply_authors": "364;151;159;94", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 82.0, 36.92560087527351 ], "wc_summary_review_avg": [ 39.5, 23.90083680543424 ], "wc_main_review_avg": [ 280.0, 142.13549873272333 ], "wc_review_avg": [ 401.5, 150.80202253285597 ], "wc_reply_reviewers_avg": [ 3.75, 6.49519052838329 ], "wc_reply_authors_avg": [ 192.0, 102.41826009066938 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2838765583768619566&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Dallas", "aff_unique_dep": "", "aff_unique_url": "https://www.utdallas.edu", "aff_unique_abbr": "UT Dallas", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Dallas", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "r8S93OsHWEf", "title": "Improving Adversarial Defense with Self-supervised Test-time Fine-tuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although adversarial training and its variants currently constitute the most effective way to achieve robustness against adversarial attacks, their poor generalization limits their performance on the test samples. 
In this work, we propose to improve the generalization and robust accuracy of adversarially-trained networks via self-supervised test-time fine-tuning. To this end, we introduce a meta adversarial training method to find a good starting point for test-time fine-tuning. It incorporates the test-time fine-tuning procedure into the training phase and strengthens the correlation between the self-supervised and classification tasks. The extensive experiments on CIFAR10, STL10 and Tiny ImageNet using different self-supervised tasks show that our method consistently improves the robust accuracy under different attack strategies for both the white-box and black-box attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhichao Huang;Chen Liu;Mathieu Salzmann;Sabine S\u00fcsstrunk;Tong Zhang", "authorids": "~Zhichao_Huang1;~Chen_Liu1;~Mathieu_Salzmann1;~Sabine_S\u00fcsstrunk1;~Tong_Zhang2", "gender": "M;M;M;;M", "homepage": ";http://liuchen1993.cn/HomePage/index.html;https://people.epfl.ch/mathieu.salzmann;https://www.epfl.ch/labs/ivrl/;http://tongzhang-ml.org", "dblp": ";10/2639-27;18/4533;s/SSusstrunk;07/4227-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-TW;48PsswEAAAAJ;https://scholar.google.ch/citations?user=n-B0jr4AAAAJ;https://scholar.google.com/citations?hl=de;LurWtuYAAAAJ", "orcid": ";;;;0000-0002-5511-2558", "linkedin": ";;;;", "or_profile": "~Zhichao_Huang1;~Chen_Liu1;~Mathieu_Salzmann1;~Sabine_S\u00fcsstrunk1;~Tong_Zhang2", "aff": "Hong Kong University of Science and Technology;Swiss Federal Institute of Technology Lausanne;CSIRO;EPFL - EPF Lausanne;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;epfl.ch;data61.csiro.au;epfl.ch;ust.hk", "position": "PhD student;PhD student;Collaborator;Full Professor;Full Professor", "bibtex": "@misc{\nhuang2022improving,\ntitle={Improving Adversarial Defense with Self-supervised Test-time Fine-tuning},\nauthor={Zhichao Huang and Chen Liu and Mathieu Salzmann and Sabine S{\\\"u}sstrunk and Tong Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=r8S93OsHWEf}\n}", "github": "", "project": "", "reviewers": "itcW;tnn3;wsz9;VL4k", "site": "https://openreview.net/forum?id=r8S93OsHWEf", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "47;159;78;53", "wc_summary_review": "98;327;35;27", "wc_main_review": "429;772;287;210", "wc_review": "574;1258;400;290", "wc_reply_reviewers": "78;441;0;54", "wc_reply_authors": "760;663;392;134", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.25, 44.69549753610536 ], "wc_summary_review_avg": [ 121.75, 121.64985614459229 ], "wc_main_review_avg": [ 424.5, 215.46055323422894 ], "wc_review_avg": [ 630.5, 376.1711711442013 ], "wc_reply_reviewers_avg": [ 143.25, 174.21161700644421 ], "wc_reply_authors_avg": [ 487.25, 244.50907447372992 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=5945607651674313334&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Swiss Federal Institute of Technology Lausanne;Commonwealth Scientific and Industrial Research Organisation;EPFL", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ust.hk;https://www.epfl.ch;https://www.csiro.au;https://www.epfl.ch", "aff_unique_abbr": "HKUST;EPFL;CSIRO;EPFL", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Hong Kong SAR;Lausanne;", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "China;Switzerland;Australia" }, { "id": "r9cpyzP-DQ", "title": "Learning Efficient and Robust Ordinary Differential Equations via Diffeomorphisms", "track": "main", "status": "Reject", "tldr": "", "abstract": "Advances in differentiable numerical integrators have enabled the use of gradient descent techniques to learn ordinary differential equations (ODEs), where a flexible function approximator (often a neural network) is used to estimate the system dynamics, given as a time derivative. However, these integrators can be unsatisfactorily slow and unstable when learning systems of ODEs from long sequences. We propose to learn an ODE of interest from data by viewing its dynamics as a vector field related to another \\emph{base} vector field via a diffeomorphism (i.e., a differentiable bijection). By learning both the diffeomorphism and the dynamics of the base ODE, we provide an avenue to offload some of the complexity in modelling the dynamics directly on to learning the diffeomorphism. Consequently, by restricting the base ODE to be amenable to integration, we can speed up and improve the robustness of integrating trajectories from the learned system. We demonstrate the efficacy of our method in training and evaluating benchmark ODE systems, as well as within continuous-depth neural networks models. 
We show that our approach attains speed-ups of up to two orders of magnitude when integrating learned ODEs.", "keywords": "Differentiable ODE integrator;Neural ODEs;diffeomorphisms;differential geometry", "primary_area": "", "supplementary_material": "/attachment/b9a68e698dbd6fc9e65f1fde0c64457ba5783cea.zip", "author": "Weiming Zhi;Tin Lai;Lionel Ott;Edwin V Bonilla;Fabio Ramos", "authorids": "~Weiming_Zhi1;~Tin_Lai2;~Lionel_Ott1;~Edwin_V_Bonilla1;~Fabio_Ramos1", "gender": ";;M;M;", "homepage": "https://cs.tinyiu.com;http://www.ott.ai;http://ebonilla.github.io/;https://fabioramos.github.io/;", "dblp": ";117/2028;23/1754;22/2488;208/4705", "google_scholar": ";;https://scholar.google.com.au/citations?user=uDLRZQMAAAAJ;https://scholar.google.com.au/citations?user=T_mJiHoAAAAJ;", "orcid": ";0000-0001-6554-0575;0000-0002-9904-2408;;", "linkedin": ";;;fabio-ramos-3256b421/;", "or_profile": "~Tin_Lai2;~Lionel_Ott1;~Edwin_V_Bonilla1;~Fabio_Ramos1;~William_Zhi1", "aff": "University of Sydney;Swiss Federal Institute of Technology;CSIRO's Data61;NVIDIA;University of Sydney", "aff_domain": "sydney.edu.au;ethz.ch;data61.csiro.au;nvidia.com;sydney.edu.au", "position": "PhD student;Senior Researcher;Principal Research Scientist;Principal Research Scientist;PhD student", "bibtex": "@misc{\nzhi2022learning,\ntitle={Learning Efficient and Robust Ordinary Differential Equations via Diffeomorphisms},\nauthor={Weiming Zhi and Tin Lai and Lionel Ott and Edwin V Bonilla and Fabio Ramos},\nyear={2022},\nurl={https://openreview.net/forum?id=r9cpyzP-DQ}\n}", "github": "", "project": "", "reviewers": "HKf9;HKa2;9Lrk;CkYu", "site": "https://openreview.net/forum?id=r9cpyzP-DQ", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;4;5;3", "correctness": "2;3;4;4", "technical_novelty": "2;2;4;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "60;42;73;168", "wc_summary_review": "65;24;34;37", "wc_main_review": "529;683;482;109", "wc_review": "654;749;589;314", "wc_reply_reviewers": "0;190;0;0", "wc_reply_authors": "645;745;589;263", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.75, 48.74615369441983 ], "wc_summary_review_avg": [ 40.0, 15.215124054702938 ], "wc_main_review_avg": [ 450.75, 210.85110267674676 ], "wc_review_avg": [ 576.5, 161.88344572562076 ], "wc_reply_reviewers_avg": [ 47.5, 82.27241335952168 ], "wc_reply_authors_avg": [ 560.5, 180.62322663489323 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aLGHNI6hxZ8J:scholar.google.com/&scioq=Learning+Efficient+and+Robust+Ordinary+Differential+Equations+via+Diffeomorphisms&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Sydney;Swiss Federal Institute of Technology;CSIRO;NVIDIA", "aff_unique_dep": ";;Data61;NVIDIA Corporation", "aff_unique_url": "https://www.sydney.edu.au;https://www.ethz.ch;https://www.csiro.au;https://www.nvidia.com", "aff_unique_abbr": "USYD;ETH Zurich;CSIRO;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;0;2;0", "aff_country_unique": "Australia;Switzerland;United States" }, { "id": "rF5UoZFrsF4", "title": "VUT: Versatile UI Transformer for Multimodal Multi-Task User Interface Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "User interface modeling is inherently multimodal, which involves several distinct types of data: images, structures and language. The tasks are also diverse, including object detection, language generation and grounding. In this paper, we present VUT, a Versatile UI Transformer that takes multimodal input and simultaneously accomplishes 5 distinct tasks with the same model. Our model consists of a multimodal Transformer encoder that jointly encodes UI images and structures, and performs UI object detection when the UI structures are absent in the input. Our model also consists of an auto-regressive Transformer model that encodes the language input and decodes output, for both question-answering and command grounding with respect to the UI. Our experiments show that for most of the tasks, when trained jointly for multi-tasks, VUT has achieved accuracy either on par with or exceeding the accuracy when the model is trained for individual tasks separately.", "keywords": "User Interface Modeling;Multimodal input;Multi-task learning;Transformer;Layout Detection;Language Grounding;Image Captioning;Screen Summarization;Tappability Prediction.", "primary_area": "", "supplementary_material": "", "author": "Yang Li;Gang Li;Xin Zhou;Mostafa Dehghani;Alexey A. Gritsenko", "authorids": "~Yang_Li2;~Gang_Li13;~Xin_Zhou3;~Mostafa_Dehghani1;~Alexey_A._Gritsenko1", "gender": "M;;;M;Not Specified", "homepage": "http://yangl.org;;;http://mostafadehghani.com/;", "dblp": "37/4190-58;62/2655-21;;125/4062;30/11478", "google_scholar": "ZZdB48QAAAAJ;gmBt9v8AAAAJ;;https://scholar.google.nl/citations?user=MiHOX3QAAAAJ;https://scholar.google.nl/citations?user=zTy9cUwAAAAJ", "orcid": ";0000-0002-9490-2990;;;", "linkedin": "yang-li-127a2a41/;;xin-zhou-94b67251;;agritsenko/", "or_profile": "~Yang_Li2;~Gang_Li13;~Xin_Zhou3;~Mostafa_Dehghani1;~Alexey_Alexeevich_Gritsenko1", "aff": "Google;Google;Google;Google DeepMind;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com", "position": "Research Scientist;Software Engineer;Researcher;Research Scientist;Researcher", "bibtex": "@misc{\nli2022vut,\ntitle={{VUT}: Versatile {UI} Transformer for Multimodal Multi-Task User Interface Modeling },\nauthor={Yang Li and Gang Li and Xin Zhou and Mostafa Dehghani and Alexey A. 
Gritsenko},\nyear={2022},\nurl={https://openreview.net/forum?id=rF5UoZFrsF4}\n}", "github": "", "project": "", "reviewers": "69JK;j2ye;e7Hg", "site": "https://openreview.net/forum?id=rF5UoZFrsF4", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;3;3", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "85;48;38", "wc_summary_review": "111;16;33", "wc_main_review": "650;177;109", "wc_review": "846;241;180", "wc_reply_reviewers": "0;0;62", "wc_reply_authors": "842;349;681", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 57.0, 20.215505600075073 ], "wc_summary_review_avg": [ 53.333333333333336, 41.36289265620683 ], "wc_main_review_avg": [ 312.0, 240.60894968115102 ], "wc_review_avg": [ 422.3333333333333, 300.6108595657996 ], "wc_reply_reviewers_avg": [ 20.666666666666668, 29.227080289043965 ], "wc_reply_authors_avg": [ 624.0, 205.2624336469454 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9512345862984363897&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "On Redundancy and Diversity in Cell-based Neural Architecture Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6911", "id": "rFJWoYoxrDB", "poster": "", "openreview": "https://openreview.net/forum?id=rFJWoYoxrDB", "slides": "https://iclr.cc/virtual/2022/poster/6911", "video": "https://iclr.cc/virtual/2022/poster/6911", "author_site": "Xingchen Wan, Binxin Ru, Pedro Esperan\u00e7a, Zhenguo Li", "tldr": "", "abstract": "Searching for the architecture cells is a dominant paradigm in NAS. However, little attention has been devoted to the analysis of the cell-based search spaces even though it is highly important for the continual development of NAS. \nIn this work, we conduct an empirical post-hoc analysis of architectures from the popular cell-based search spaces and find that the existing search spaces contain a high degree of redundancy: the architecture performance is less sensitive to changes at large parts of the cells, and universally adopted design rules, like the explicit search for a reduction cell, significantly increase the complexities but have very limited impact on the performance.\nAcross architectures found by a diverse set of search strategies, we consistently find that the parts of the cells that do matter for architecture performance often follow similar and simple patterns. 
By constraining cells to include these patterns, randomly sampled architectures can match or even outperform the state of the art.\nThese findings cast doubt on our ability to discover truly novel architectures in the existing cell-based search spaces, and inspire our suggestions for improvement to guide future NAS research.\nCode is available at https://github.com/xingchenwan/cell-based-NAS-analysis.", "keywords": "NAS;machine learning architectures;AutoML", "primary_area": "", "supplementary_material": "", "author": "Xingchen Wan;Binxin Ru;Pedro M Esperan\u00e7a;Zhenguo Li", "authorids": "~Xingchen_Wan1;~Binxin_Ru1;~Pedro_M_Esperan\u00e7a1;~Zhenguo_Li1", "gender": "M;M;M;M", "homepage": "https://xingchen.one;;;http://www.ee.columbia.edu/~zgli/", "dblp": "255/7214;;;23/6479", "google_scholar": "6KkohssAAAAJ;https://scholar.google.co.uk/citations?user=4piw-XMAAAAJ;https://scholar.google.co.uk/citations?user=ralB4sUAAAAJ;XboZC1AAAAAJ", "orcid": "0000-0003-0074-0597;;;", "linkedin": ";;;", "or_profile": "~Xingchen_Wan1;~Binxin_Ru1;~Pedro_M_Esperan\u00e7a1;~Zhenguo_Li1", "aff": "University of Oxford;University of Oxford;Huawei Technologies Ltd.;Huawei Noah's Ark Lab", "aff_domain": "robots.ox.ac.uk;ox.ac.uk;huawei.com;huawei.com", "position": "PhD student;PhD student;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nwan2022on,\ntitle={On Redundancy and Diversity in Cell-based Neural Architecture Search},\nauthor={Xingchen Wan and Binxin Ru and Pedro M Esperan{\c{c}}a and Zhenguo Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rFJWoYoxrDB}\n}", "github": "", "project": "", "reviewers": "2nKR;7BWB;zTwP;eT12;1mk2", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "4;4;4;4;4", "correctness": "3;3;4;4;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "63;86;135;76;60", "wc_summary_review": "71;57;36;47;40", "wc_main_review": "355;322;274;309;254", "wc_review": "489;465;445;432;354", "wc_reply_reviewers": "1566;58;46;27;18", "wc_reply_authors": "4984;1678;1008;803;545", "reply_reviewers": "3;1;1;1;1", "reply_authors": "8;3;3;1;1", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 84.0, 27.151427218472328 ], "wc_summary_review_avg": [ 50.2, 12.607934009979589 ], "wc_main_review_avg": [ 302.8, 35.61684994493477 ], "wc_review_avg": [ 437.0, 45.75150270756142 ], "wc_reply_reviewers_avg": [ 343.0, 611.6606902523653 ], "wc_reply_authors_avg": [ 1803.6, 1633.9506234889718 ], "reply_reviewers_avg": [ 1.4, 0.8000000000000002 ], "reply_authors_avg": [ 3.2, 2.5612496949731396 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.16666666666666663, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1908527067267342822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=rFJWoYoxrDB", "email": "robots.ox.ac.uk;ox.ac.uk;huawei.com;huawei.com", "author_num": 4, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "University of Oxford;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.ox.ac.uk;https://www.huawei.com", "aff_unique_abbr": "Oxford;Huawei", "aff_campus_unique_index": "",
"aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United Kingdom;China" }, { "id": "rFUwBW8qgIZ", "title": "CrossMatch: Improving Semi-Supervised Object Detection via Multi-Scale Consistency", "track": "main", "status": "Withdraw", "tldr": "", "abstract": " We present a novel method, CrossMatch, for semi-supervised object detection. Inspired by the fact that teacher/student pseudo-labeling approaches result in a weak and sparse gradient signal due to the difficulty of confidence-thresholding, CrossMatch leverages \\textit{multi-scale feature extraction} in object detection. Specifically, we enforce consistency between different scales across the student and teacher networks. To the best of our knowledge, this is the first work to use multi-scale consistency in semi-supervised object detection. Furthermore, unlike prior work that mostly uses hard pseudo-labeling methods, CrossMatch further densifies the gradient signal by enforcing multi-scale consistency through both hard and soft labels. This combination effectively strengthens the weak supervision signal from potentially noisy pseudo-labels. We evaluate our method on MS COCO and Pascal VOC under different experiment protocols, and our method significantly improves on previous state of the arts. Specifically, CrossMatch achieves 17.33 and 21.53 mAP with only 0.5\\% and 1\\% labeled data respectively on MS COCO, outperforming other state-of-the-art methods by $\\sim$3 mAP. ", "keywords": "semi-supervised learning;object detection;multi-scale learning", "primary_area": "", "supplementary_material": "", "author": "Zhuoran Yu;Yen-Cheng Liu;Chih-Yao Ma;Zsolt Kira", "authorids": "~Zhuoran_Yu2;~Yen-Cheng_Liu1;~Chih-Yao_Ma1;~Zsolt_Kira1", "gender": "M;;M;M", "homepage": "https://www.zhuoranyu.com;https://ycliu93.github.io/;https://chihyaoma.github.io/;https://faculty.cc.gatech.edu/~zk15", "dblp": "120/3973;29/7584;198/0963;36/4127", "google_scholar": "txxhxREAAAAJ;yeAeAhsAAAAJ;HrrtgKkAAAAJ;2a5XgNAAAAAJ", "orcid": ";;;0000-0002-2626-2004", "linkedin": ";;kevin-chih-yao-ma-9b5b3063/;", "or_profile": "~Zhuoran_Yu2;~Yen-Cheng_Liu1;~Chih-Yao_Ma1;~Zsolt_Kira1", "aff": "Amazon;Georgia Institute of Technology;Meta;Georgia Tech Research Institute", "aff_domain": "amazon.com;gatech.edu;meta.com;gtri.gatech.edu", "position": "Applied Scientist Intern;PhD student;Research Scientist;Senior Research Scientist", "bibtex": "@misc{\nyu2022crossmatch,\ntitle={CrossMatch: Improving Semi-Supervised Object Detection via Multi-Scale Consistency},\nauthor={Zhuoran Yu and Yen-Cheng Liu and Chih-Yao Ma and Zsolt Kira},\nyear={2022},\nurl={https://openreview.net/forum?id=rFUwBW8qgIZ}\n}", "github": "", "project": "", "reviewers": "ybCP;5hAT;JKFs", "site": "https://openreview.net/forum?id=rFUwBW8qgIZ", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "5;5;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "79;99;36", "wc_summary_review": "19;68;66", "wc_main_review": "242;238;321", "wc_review": "340;405;423", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 71.33333333333333, 26.284765338288427 ], "wc_summary_review_avg": [ 51.0, 22.642143596988927 ], 
"wc_main_review_avg": [ 267.0, 38.21866908549625 ], "wc_review_avg": [ 389.3333333333333, 35.64952859280034 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ADBxpJ8ueDQJ:scholar.google.com/&scioq=CrossMatch:+Improving+Semi-Supervised+Object+Detection+via+Multi-Scale+Consistency&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Amazon;Georgia Institute of Technology;Meta;Georgia Tech Research Institute", "aff_unique_dep": "Amazon.com, Inc.;;Meta Platforms, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.gatech.edu;https://meta.com;https://www.gtri.gatech.edu", "aff_unique_abbr": "Amazon;Georgia Tech;Meta;GTRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Automated Self-Supervised Learning for Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6334", "id": "rFbR4Fv-D6-", "poster": "", "openreview": "https://openreview.net/forum?id=rFbR4Fv-D6-", "slides": "https://iclr.cc/virtual/2022/poster/6334", "video": "https://iclr.cc/virtual/2022/poster/6334", "author_site": "Wei Jin, Xiaorui Liu, Xiangyu Zhao, Yao Ma, Neil Shah, Jiliang Tang", "tldr": "", "abstract": "Graph self-supervised learning has gained increasing attention due to its capacity to learn expressive node representations. Many pretext tasks, or loss functions have been designed from distinct perspectives. However, we observe that different pretext tasks affect downstream tasks differently cross datasets, which suggests that searching pretext tasks is crucial for graph self-supervised learning. Different from existing works focusing on designing single pretext tasks, this work aims to investigate how to automatically leverage multiple pretext tasks effectively. Nevertheless, evaluating representations derived from multiple pretext tasks without direct access to ground truth labels makes this problem challenging. To address this obstacle, we make use of a key principle of many real-world graphs, i.e., homophily, or the principle that ``like attracts like,'' as the guidance to effectively search various self-supervised pretext tasks. We provide theoretical understanding and empirical evidence to justify the flexibility of homophily in this search task. Then we propose the AutoSSL framework which can automatically search over combinations of various self-supervised tasks. By evaluating the framework on 7 real-world datasets, our experimental results show that AutoSSL can significantly boost the performance on downstream tasks including node clustering and node classification compared with training under individual tasks. 
", "keywords": "Self-supervised learning;Graph neural networks;AutoML", "primary_area": "", "supplementary_material": "", "author": "Wei Jin;Xiaorui Liu;Xiangyu Zhao;Yao Ma;Neil Shah;Jiliang Tang", "authorids": "~Wei_Jin4;~Xiaorui_Liu1;~Xiangyu_Zhao1;~Yao_Ma3;~Neil_Shah2;~Jiliang_Tang1", "gender": ";M;M;M;M;M", "homepage": "http://www.cs.emory.edu/~wjin30/;https://sites.google.com/ncsu.edu/xiaorui/;https://zhaoxyai.github.io/;https://yaoma24.github.io/;http://nshah.net;https://www.cse.msu.edu/~tangjili/", "dblp": "66/2173-9;172/0995;08/890-1.html;212/7871.html;71/7771;64/10812", "google_scholar": "eWow24EAAAAJ;NhvN1KoAAAAJ;;wf9TTOIAAAAJ;Qut69OgAAAAJ;WtzKMWAAAAAJ", "orcid": ";0000-0001-8217-5688;0000-0003-2926-4416;;0000-0003-3261-8430;0000-0001-7125-3898", "linkedin": ";;;;;", "or_profile": "~Wei_Jin4;~Xiaorui_Liu1;~Xiangyu_Zhao1;~Yao_Ma3;~Neil_Shah2;~Jiliang_Tang1", "aff": "Michigan State University;Michigan State University;City University of Hong Kong;New Jersey Institute of Technology;Snap Inc.;Michigan State University", "aff_domain": "msu.edu;msu.edu;cityu.edu.hk;njit.edu;snap.com;msu.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\njin2022automated,\ntitle={Automated Self-Supervised Learning for Graphs},\nauthor={Wei Jin and Xiaorui Liu and Xiangyu Zhao and Yao Ma and Neil Shah and Jiliang Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rFbR4Fv-D6-}\n}", "github": "", "project": "", "reviewers": "yXJg;DNwN;3wSZ;H8HY", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "185;14;148;100", "wc_summary_review": "33;9;39;22", "wc_main_review": "221;73;54;387", "wc_review": "439;96;241;509", "wc_reply_reviewers": "0;0;13;0", "wc_reply_authors": "1221;715;952;1435", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;2;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 111.75, 63.978023570598054 ], "wc_summary_review_avg": [ 25.75, 11.431863365173676 ], "wc_main_review_avg": [ 183.75, 133.97644382502472 ], "wc_review_avg": [ 321.25, 163.01284458594054 ], "wc_reply_reviewers_avg": [ 3.25, 5.629165124598851 ], "wc_reply_authors_avg": [ 1080.75, 271.8054221313475 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8260281940315648872&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=rFbR4Fv-D6-", "email": "msu.edu;msu.edu;cityu.edu.hk;njit.edu;snap.com;msu.edu", "author_num": 6, "aff_unique_index": "0;0;1;2;3;0", "aff_unique_norm": "Michigan State University;City University of Hong Kong;New Jersey Institute of Technology;Snap Inc.", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.msu.edu;https://www.cityu.edu.hk;https://www.njit.edu;https://www.snapinc.com", "aff_unique_abbr": "MSU;CityU;NJIT;Snap", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;China" }, { "id": "rGg-Qcyplgq", "title": "Distributional Perturbation for Efficient Exploration in Distributional Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Distributional reinforcement learning aims to learn distribution of return under stochastic environments. Since the learned distribution of return contains rich information about the stochasticity of the environment, previous studies have relied on descriptive statistics, such as standard deviation, for optimism in face of uncertainty. These prior works are divided into risk-seeking or averse methods, which can be considered as having a one-sided tendency on risk. Unexpectedly, such approaches hinder convergence. In this paper, we propose a novel distributional reinforcement learning that explores by randomizing the risk criterion to reach a risk-neutral optimal policy. First, we provide a perturbed distributional Bellman optimality operator by distorting the risk measure in action selection. Second, we prove the convergence and optimality of the proposed method by using weaker contraction property. Our theoretical results support that the proposed method does not fall into biased exploration and converges to an optimal return distribution. Finally, we empirically show that our method outperforms other existing distribution-based algorithms in various environments including Atari games.", "keywords": "distributional reinforcement learning;perturbation;exploration", "primary_area": "", "supplementary_material": "", "author": "Taehyun Cho;Seungyub Han;Heesoo Lee;Kyungjae Lee;Jungwoo Lee", "authorids": "~Taehyun_Cho1;~Seungyub_Han1;~Heesoo_Lee1;~Kyungjae_Lee1;~Jungwoo_Lee1", "gender": "M;M;M;M;M", "homepage": ";;https://orcid.org/0000-0001-5525-1892;https://sites.google.com/view/kyungjaelee;https://cml.snu.ac.kr", "dblp": "274/0287;347/8731;;13/7265-1;34/516-1", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;ot1-XNAAAAAJ;;https://scholar.google.co.kr/citations?user=OZZJagIAAAAJ;j98IWfoAAAAJ", "orcid": "0000-0003-1047-9847;0009-0001-8704-8968;;0000-0003-0147-2715;0000-0002-6804-980X", "linkedin": ";;;;", "or_profile": "~Taehyun_Cho1;~Seungyub_Han1;~Heesoo_Lee1;~Kyungjae_Lee1;~Jungwoo_Lee1", "aff": "Seoul National University;Seoul National University;Seoul National University;ChungAng University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;cau.ac.kr;snu.ac.kr", "position": "MS student;PhD student;MS student;Assistant Professor;Full Professor", "bibtex": "@misc{\ncho2022distributional,\ntitle={Distributional Perturbation for Efficient Exploration in Distributional Reinforcement Learning},\nauthor={Taehyun Cho and Seungyub Han and Heesoo Lee and Kyungjae Lee and Jungwoo Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=rGg-Qcyplgq}\n}", "github": "", "project": "", "reviewers": "Ywdd;61i6;GS3Z", "site": "https://openreview.net/forum?id=rGg-Qcyplgq", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;3;3", "correctness": "4;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "63;79;204", "wc_summary_review": "21;27;48", "wc_main_review": "410;237;212", "wc_review": "494;343;464", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1520;457;299", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], 
"confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 115.33333333333333, 63.03614483417871 ], "wc_summary_review_avg": [ 32.0, 11.575836902790225 ], "wc_main_review_avg": [ 286.3333333333333, 88.03913271317982 ], "wc_review_avg": [ 433.6666666666667, 65.27037783115877 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 758.6666666666666, 542.1945120423932 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9999999999999997, "corr_recommendation_correctness": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17314559968895648684&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Seoul National University;Chungang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;http://www.cau.ac.kr", "aff_unique_abbr": "SNU;CAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "CLEVA-Compass: A Continual Learning Evaluation Assessment Compass to Promote Research Transparency and Comparability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6188", "id": "rHMaBYbkkRJ", "poster": "", "openreview": "https://openreview.net/forum?id=rHMaBYbkkRJ", "slides": "https://iclr.cc/virtual/2022/poster/6188", "video": "https://iclr.cc/virtual/2022/poster/6188", "author_site": "Martin Mundt, Steven Lang, Quentin Delfosse, Kristian Kersting", "tldr": "", "abstract": "What is the state of the art in continual machine learning? Although a natural question for predominant static benchmarks, the notion to train systems in a lifelong manner entails a plethora of additional challenges with respect to set-up and evaluation. The latter have recently sparked a growing amount of critiques on prominent algorithm-centric perspectives and evaluation protocols being too narrow, resulting in several attempts at constructing guidelines in favor of specific desiderata or arguing against the validity of prevalent assumptions. In this work, we depart from this mindset and argue that the goal of a precise formulation of desiderata is an ill-posed one, as diverse applications may always warrant distinct scenarios. Instead, we introduce the Continual Learning EValuation Assessment Compass: the CLEVA-Compass. The compass provides the visual means to both identify how approaches are practically reported and how works can simultaneously be contextualized in the broader literature landscape. In addition to promoting compact specification in the spirit of recent replication trends, it thus provides an intuitive chart to understand the priorities of individual systems, where they resemble each other, and what elements are missing towards a fair comparison. 
", "keywords": "continual learning;lifelong learning;machine learning evaluation", "primary_area": "", "supplementary_material": "/attachment/1f0fc05a83dd0837ce91e091e9707aa8a66f70d9.zip", "author": "Martin Mundt;Steven Lang;Quentin Delfosse;Kristian Kersting", "authorids": "~Martin_Mundt1;~Steven_Lang2;~Quentin_Delfosse1;~Kristian_Kersting1", "gender": "M;M;M;M", "homepage": "https://owl-ml.uni-bremen.de;https://www.steven-braun.com;https://quentindelfosse.me/;http://www.ml.informatik.tu-darmstadt.de/", "dblp": "200/8146;https://dblp.uni-trier.de/pid/10/1284;286/1466.html;40/3793", "google_scholar": "riGBurAAAAAJ;sja9tq0AAAAJ;k1E0FgIAAAAJ;QY-earAAAAAJ", "orcid": "0000-0003-1639-8255;0000-0002-5627-8058;;0000-0002-2873-9152", "linkedin": ";;quentin-delfosse-70b377150/;", "or_profile": "~Martin_Mundt1;~Steven_Lang2;~Quentin_Delfosse1;~Kristian_Kersting1", "aff": "TU Darmstadt;TU Darmstadt;CS Department, TU Darmstadt, TU Darmstadt;TU Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de;cs.tu-darmstadt.de;tu-darmstadt.de", "position": "Postdoc;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nmundt2022clevacompass,\ntitle={{CLEVA}-Compass: A Continual Learning Evaluation Assessment Compass to Promote Research Transparency and Comparability},\nauthor={Martin Mundt and Steven Lang and Quentin Delfosse and Kristian Kersting},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rHMaBYbkkRJ}\n}", "github": "", "project": "", "reviewers": "BGUM;JwDM;RMSx;FRys", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "3;4;4;5", "correctness": "2;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;0", "wc_summary_paper": "76;101;45;227", "wc_summary_review": "65;59;105;54", "wc_main_review": "215;426;389;744", "wc_review": "356;586;539;1025", "wc_reply_reviewers": "0;182;52;39", "wc_reply_authors": "926;783;818;691", "reply_reviewers": "0;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 112.25, 69.15697723295894 ], "wc_summary_review_avg": [ 70.75, 20.154093876927337 ], "wc_main_review_avg": [ 443.5, 190.9116287710102 ], "wc_review_avg": [ 626.5, 245.59570435982792 ], "wc_reply_reviewers_avg": [ 68.25, 68.40458683450986 ], "wc_reply_authors_avg": [ 804.5, 84.09667056429761 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16474577021609169344&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=rHMaBYbkkRJ", "email": "tu-darmstadt.de;tu-darmstadt.de;cs.tu-darmstadt.de;tu-darmstadt.de", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Darmstadt;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Understanding approximate and unrolled dictionary learning for 
pattern recovery", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6331", "id": "rI0LYgGeYaw", "poster": "", "openreview": "https://openreview.net/forum?id=rI0LYgGeYaw", "slides": "https://iclr.cc/virtual/2022/poster/6331", "video": "https://iclr.cc/virtual/2022/poster/6331", "author_site": "Beno\u00eet Mal\u00e9zieux, Thomas Moreau, Matthieu Kowalski", "tldr": "", "abstract": "Dictionary learning consists of finding a sparse representation from noisy data and is a common way to encode data-driven prior knowledge on signals. Alternating minimization (AM) is standard for the underlying optimization, where gradient descent steps alternate with sparse coding procedures. The major drawback of this method is its prohibitive computational cost, making it unpractical on large real-world data sets. This work studies an approximate formulation of dictionary learning based on unrolling and compares it to alternating minimization to find the best trade-off between speed and precision. We analyze the asymptotic behavior and convergence rate of gradients estimates in both methods. We show that unrolling performs better on the support of the inner problem solution and during the first iterations. Finally, we apply unrolling on pattern learning in magnetoencephalography (MEG) with the help of a stochastic algorithm and compare the performance to a state-of-the-art method.", "keywords": "Dictionary learning;bi-level optimization;unrolling;pattern learning", "primary_area": "", "supplementary_material": "/attachment/ec8d0b5a9c03560c01c265be9f8d3ed907ac1ec8.zip", "author": "Beno\u00eet Mal\u00e9zieux;Thomas Moreau;Matthieu Kowalski", "authorids": "~Beno\u00eet_Mal\u00e9zieux1;~Thomas_Moreau2;~Matthieu_Kowalski1", "gender": ";M;M", "homepage": ";http://hebergement.universite-paris-saclay.fr/mkowalski/;https://tommoral.github.io", "dblp": ";94/7055;150/2391-1", "google_scholar": ";https://scholar.google.fr/citations?user=hwrwDIkAAAAJ;https://scholar.google.fr/citations?user=HEO_PsAAAAAJ", "orcid": ";0000-0002-9981-237X;0000-0002-1523-3419", "linkedin": "benoit-malezieux-203283148/;;thomasmoreau2010", "or_profile": "~Beno\u00eet_Mal\u00e9zieux1;~Matthieu_Kowalski1;~Thomas_Martin_Moreau1", "aff": "INRIA;universite paris saclay;INRIA", "aff_domain": "inria.fr;universite-paris-saclay.fr;inria.fr", "position": "PhD student;Associate Professor;Researcher", "bibtex": "@inproceedings{\nmal{\\'e}zieux2022understanding,\ntitle={Understanding approximate and unrolled dictionary learning for pattern recovery},\nauthor={Beno{\\^\\i}t Mal{\\'e}zieux and Thomas Moreau and Matthieu Kowalski},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rI0LYgGeYaw}\n}", "github": "", "project": "", "reviewers": "njnY;5JDj;S6uo;EyuW", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;5;4;3", "correctness": "2;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "39;240;86;134", "wc_summary_review": "80;126;45;32", "wc_main_review": "276;1105;246;85", "wc_review": "395;1471;377;251", "wc_reply_reviewers": "183;0;12;0", "wc_reply_authors": "1313;1648;420;73", "reply_reviewers": "1;0;1;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 124.75, 
74.53648435497881 ], "wc_summary_review_avg": [ 70.75, 36.409991760504425 ], "wc_main_review_avg": [ 428.0, 397.5569141645005 ], "wc_review_avg": [ 623.5, 492.4395902037122 ], "wc_reply_reviewers_avg": [ 48.75, 77.6639395086291 ], "wc_reply_authors_avg": [ 863.5, 640.1314318169356 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.39605901719066966, "corr_recommendation_correctness": 0.9901475429766743, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14808406536807467476&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "pdf": "https://openreview.net/pdf?id=rI0LYgGeYaw", "email": "inria.fr;universite-paris-saclay.fr;inria.fr", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "INRIA;Universit\u00e9 Paris-Saclay", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.universite-paris-saclay.fr", "aff_unique_abbr": "INRIA;UPS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Multi-Critic Actor Learning: Teaching RL Policies to Act with Style", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6568", "id": "rJvY_5OzoI", "poster": "", "openreview": "https://openreview.net/forum?id=rJvY_5OzoI", "slides": "https://iclr.cc/virtual/2022/poster/6568", "video": "https://iclr.cc/virtual/2022/poster/6568", "author_site": "Siddharth Mysore, George Cheng, yunqi zhao, Kate Saenko, Meng Wu", "tldr": "", "abstract": "Using a single value function (critic) shared over multiple tasks in Actor-Critic multi-task reinforcement learning (MTRL) can result in negative interference between tasks, which can compromise learning performance. Multi-Critic Actor Learning (MultiCriticAL) proposes instead maintaining separate critics for each task being trained while training a single multi-task actor. Explicitly distinguishing between tasks also eliminates the need for critics to learn to do so and mitigates interference between task-value estimates. MultiCriticAL is tested in the context of multi-style learning, a special case of MTRL where agents are trained to behave with different distinct behavior styles, and yields up to 56% performance gains over the single-critic baselines and even successfully learns behavior styles in cases where single-critic approaches may simply fail to learn. 
In a simulated real-world use case, MultiCriticAL enables learning policies that smoothly transition between multiple fighting styles on an experimental build of EA\u2019s UFC game.", "keywords": "Reinforcement Learning;Multi-Style Learning;Multi-Task Learning;Actor-Critic", "primary_area": "", "supplementary_material": "/attachment/d860f607da2c6fec97bfa5a71e0350d3ae203047.zip", "author": "Siddharth Mysore;George Cheng;Yunqi Zhao;Kate Saenko;Meng Wu", "authorids": "~Siddharth_Mysore1;gecheng@ea.com;yuzhao@ea.com;~Kate_Saenko1;febmeng@gmail.com", "gender": "M;;;F;", "homepage": "http://cs-people.bu.edu/sidmys;;;http://ai.bu.edu;", "dblp": "281/7006;;;88/2754;", "google_scholar": "H196-6QAAAAJ;;;https://scholar.google.com.tw/citations?user=9xDADY4AAAAJ;", "orcid": "0000-0002-1292-1523;;;0000-0002-5704-7614;", "linkedin": ";;;;", "or_profile": "~Siddharth_Mysore1;gecheng@ea.com;yuzhao@ea.com;~Kate_Saenko1;febmeng@gmail.com", "aff": "Boston University;;;Boston University, Boston University;", "aff_domain": "bu.edu;;;bu.edu;", "position": "PhD student;;;Full Professor;", "bibtex": "@inproceedings{\nmysore2022multicritic,\ntitle={Multi-Critic Actor Learning: Teaching {RL} Policies to Act with Style},\nauthor={Siddharth Mysore and George Cheng and Yunqi Zhao and Kate Saenko and Meng Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rJvY_5OzoI}\n}", "github": "", "project": "", "reviewers": "eF9d;mdph;Y35C;Lh1z;kVWj", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "4;4;5;4;4", "correctness": "3;3;3;3;3", "technical_novelty": "3;3;2;3;2", "empirical_novelty": "2;3;2;3;3", "wc_summary_paper": "106;86;14;93;77", "wc_summary_review": "82;56;65;50;58", "wc_main_review": "509;302;119;369;304", "wc_review": "697;444;198;512;439", "wc_reply_reviewers": "0;384;0;106;41", "wc_reply_authors": "559;2242;564;571;1185", "reply_reviewers": "0;2;0;1;1", "reply_authors": "1;5;1;1;4", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 75.2, 32.0337322208949 ], "wc_summary_review_avg": [ 62.2, 10.998181667894016 ], "wc_main_review_avg": [ 320.6, 125.77853552971588 ], "wc_review_avg": [ 458.0, 160.12120409239995 ], "wc_reply_reviewers_avg": [ 106.2, 144.2087375993563 ], "wc_reply_authors_avg": [ 1024.2, 654.5958753307266 ], "reply_reviewers_avg": [ 0.8, 0.7483314773547883 ], "reply_authors_avg": [ 2.4, 1.7435595774162693 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16215883614250413203&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=rJvY_5OzoI", "email": "bu.edu;;;bu.edu;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Boston", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "rLARdZ3FxCM", "title": "Projective Manifold Gradient Layer for Deep Rotation Regression", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Regressing rotations on the SO(3) manifold using deep neural
networks is an important yet unsolved problem. The gap between Euclidean network output space and the non-Euclidean SO(3) manifold imposes a severe challenge for neural network learning in both forward and backward passes. While several works have proposed different regression-friendly rotation representations, very few works have been devoted to improving gradient backpropagation in the backward pass. In this paper, we propose a manifold-aware gradient that directly backpropagates into deep network weights. Leveraging the Riemannian gradient and a novel projective gradient, our proposed regularized projective manifold gradient (RPMG) helps networks achieve new state-of-the-art performance in a variety of rotation estimation tasks. The proposed gradient layer can also be applied to other smooth manifolds such as the unit sphere.", "keywords": "regression;rotation;manifold", "primary_area": "", "supplementary_material": "/attachment/4845fc95a17dec8be3cf7a4d8bb4f9a0f31b7660.zip", "author": "Jiayi Chen;Yingda Yin;Tolga Birdal;Baoquan Chen;Leonidas Guibas;He Wang", "authorids": "~Jiayi_Chen5;~Yingda_Yin1;~Tolga_Birdal3;~Baoquan_Chen1;~Leonidas_Guibas1;~He_Wang5", "gender": "M;M;M;M;M;M", "homepage": "https://yd-yin.github.io/;http://tolgabirdal.github.io;https://baoquanchen.info;http://geometry.stanford.edu/;https://hughw19.github.io;https://github.com/JYChen18", "dblp": "255/4832;143/7056;23/4197;g/LeonidasJGuibas;01/6368-10;42/1159-3", "google_scholar": ";_Bxd5ggAAAAJ;iHWtrEAAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ;roCAWkoAAAAJ;", "orcid": ";0000-0001-7915-7964;;;;", "linkedin": ";https://linkedin.com/in/tbirdal;baoquan/;;;", "or_profile": "~Yingda_Yin1;~Tolga_Birdal3;~Baoquan_Chen1;~Leonidas_Guibas1;~He_Wang5;~jiayi_chen3", "aff": "Peking University;Imperial College London;Peking University;Stanford University;Peking University;Peking University", "aff_domain": "pku.edu.cn;imperial.ac.uk;pku.edu.cn;stanford.edu;pku.edu.cn;pku.edu.cn", "position": "PhD student;Assistant Professor;Full Professor;Full Professor;Assistant Professor;Undergrad student", "bibtex": "@misc{\nchen2022projective,\ntitle={Projective Manifold Gradient Layer for Deep Rotation Regression},\nauthor={Jiayi Chen and Yingda Yin and Tolga Birdal and Baoquan Chen and Leonidas Guibas and He Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=rLARdZ3FxCM}\n}", "github": "", "project": "", "reviewers": "7aen;TpPa;H1gH;gdNe", "site": "https://openreview.net/forum?id=rLARdZ3FxCM", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;5;4;3", "correctness": "2;2;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "192;78;246;43", "wc_summary_review": "56;52;406;49", "wc_main_review": "1044;391;343;221", "wc_review": "1292;521;995;313", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 139.75, 82.45112188442313 ], "wc_summary_review_avg": [ 140.75, 153.16229137747973 ], "wc_main_review_avg": [ 499.75, 320.27595523235897 ], "wc_review_avg": [ 780.25, 385.20733053772483 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ],
"authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.7543365091413573, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3379264059637576430&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0;2;0;0", "aff_unique_norm": "Peking University;Imperial College London;Stanford University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.imperial.ac.uk;https://www.stanford.edu", "aff_unique_abbr": "Peking U;ICL;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;0;2;0;0", "aff_country_unique": "China;United Kingdom;United States" }, { "id": "rMbLORc8oS", "title": "SemiRetro: Semi-template framework boosts deep retrosynthesis prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Retrosynthesis brings scientific and societal benefits by inferring possible reaction routes toward novel molecules. Recently, template-based (TB) and template-free (TF) molecule graph learning methods have shown promising results to solve this problem. TB methods are more accurate using pre-encoded reaction templates, and TF methods are more scalable by decomposing retrosynthesis into subproblems, i.e., center identification and synthon completion. To combine both advantages of TB and TF, we suggest breaking a full-template into several semi-templates and embedding them into the two-step TF framework. Since many semi-templates are reduplicative, the template redundancy can be reduced while the essential chemical knowledge is still preserved to facilitate synthon completion. We call our method SemiRetro and introduce a directed relational graph attention (DRGAT) layer to extract expressive features for better center identification. Experimental results show that SemiRetro significantly outperforms both existing TB and TF methods. In scalability, SemiRetro covers 96.9\\% data using 150 semi-templates, while previous template-based GLN requires 11,647 templates to cover 93.3\\% data. In top-1 accuracy, SemiRetro exceeds template-free G2G 3.4\\% (class known) and 6.4\\% (class unknown). Besides, SemiReto has better interpretability and training efficiency than existing methods.", "keywords": "Retrosynthesis prediction;molecular graph learning", "primary_area": "", "supplementary_material": "/attachment/9a95be4058cc65b15d0b703af03fad989c5e7307.zip", "author": "Zhangyang Gao;Cheng Tan;Lirong Wu;Haitao Lin;Stan Z. 
Li", "authorids": "~Zhangyang_Gao1;~Cheng_Tan1;~Lirong_Wu1;~Haitao_Lin2;~Stan_Z._Li2", "gender": "M;M;;M;M", "homepage": ";https://chengtan9907.github.io/;;;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "275/3266;70/1533-12.html;15/10330;34/1040;l/StanZLi", "google_scholar": "4SclT-QAAAAJ;6kTV6aMAAAAJ;Tk7TrCoAAAAJ;o5A23qIAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0003-1026-6083;;;;", "linkedin": ";;;;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Zhangyang_Gao1;~Cheng_Tan1;~Lirong_Wu1;~Haitao_Lin2;~Stan_Z._Li1", "aff": "Westlake University, China;Zhejiang University & Westlake University;Westlake University;Westlake University;Westlake University", "aff_domain": "westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn", "position": "PhD student;PhD student;PhD student;PhD student;Chair Professor", "bibtex": "@misc{\ngao2022semiretro,\ntitle={SemiRetro: Semi-template framework boosts deep retrosynthesis prediction},\nauthor={Zhangyang Gao and Cheng Tan and Lirong Wu and Haitao Lin and Stan Z. Li},\nyear={2022},\nurl={https://openreview.net/forum?id=rMbLORc8oS}\n}", "github": "", "project": "", "reviewers": "XXdN;XCBM;83gD;G14H", "site": "https://openreview.net/forum?id=rMbLORc8oS", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;3;5", "correctness": "3;2;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "49;177;95;28", "wc_summary_review": "147;107;169;6", "wc_main_review": "348;689;300;155", "wc_review": "544;973;564;189", "wc_reply_reviewers": "1625;638;77;125", "wc_reply_authors": "3922;1491;660;213", "reply_reviewers": "9;2;1;1", "reply_authors": "11;3;2;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 87.25, 57.203037506761824 ], "wc_summary_review_avg": [ 107.25, 62.5394875258824 ], "wc_main_review_avg": [ 373.0, 195.78942770231492 ], "wc_review_avg": [ 567.5, 277.6044848340891 ], "wc_reply_reviewers_avg": [ 616.25, 622.5284632046955 ], "wc_reply_authors_avg": [ 1571.5, 1432.4528788061407 ], "reply_reviewers_avg": [ 3.25, 3.344772040064913 ], "reply_authors_avg": [ 4.5, 3.774917217635375 ], "replies_avg": [ 39, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.058025885318565944, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5631592216566667844&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Westlake University;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.westlake.edu.cn;http://www.zju.edu.cn", "aff_unique_abbr": "WU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "rN9tjzY9UD", "title": "Adaptive Learning of Tensor Network Structures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Tensor Networks (TN) offer a powerful framework to efficiently represent very high-dimensional objects. TN have recently shown their potential for machine learning applications and offer a unifying view of common tensor decomposition models such as Tucker, tensor train (TT) and tensor ring (TR). 
However, identifying the best tensor network structure from data for a given task is challenging. In this work, we leverage the TN formalism to develop a generic and efficient adaptive algorithm to jointly learn the structure and the parameters of a TN from data. Our method is based on a simple greedy approach starting from a rank-one tensor and successively identifying the most promising tensor network edges for small rank increments. Our algorithm can adaptively identify TN structures with a small number of parameters that effectively optimize any differentiable objective function. Experiments on tensor decomposition, tensor completion and model compression tasks demonstrate the effectiveness of the proposed algorithm. In particular, our method outperforms the state-of-the-art evolutionary topology search [Li and Sun, 2020] for tensor decomposition of images (while being orders of magnitude faster) and finds efficient tensor network structures to compress neural networks, outperforming popular TT-based approaches [Novikov et al., 2015].", "keywords": "Tensor Networks;Tensor Network Topology;Structure Learning;Tensor Completion;Tensor Decomposition", "primary_area": "", "supplementary_material": "/attachment/0bb4cd3cfeaf55b2439624211e56c009cc0988ac.zip", "author": "Meraj Hashemizadeh;Michelle Liu;Jacob Miller;Guillaume Rabusseau", "authorids": "~Meraj_Hashemizadeh1;~Michelle_Liu2;~Jacob_Miller2;~Guillaume_Rabusseau1", "gender": ";;;M", "homepage": "http://www-ens.iro.umontreal.ca/~hashemis/;;;https://www-labs.iro.umontreal.ca/~grabus/", "dblp": ";;;143/7327", "google_scholar": ";;;https://scholar.google.fr/citations?user=t2i4V4EAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Meraj_Hashemizadeh1;~Michelle_Liu2;~Jacob_Miller2;~Guillaume_Rabusseau1", "aff": ";;;Universit\u00e9 de Montr\u00e9al", "aff_domain": ";;;umontreal.ca", "position": ";;;Associate Professor", "bibtex": "@misc{\nhashemizadeh2022adaptive,\ntitle={Adaptive Learning of Tensor Network Structures},\nauthor={Meraj Hashemizadeh and Michelle Liu and Jacob Miller and Guillaume Rabusseau},\nyear={2022},\nurl={https://openreview.net/forum?id=rN9tjzY9UD}\n}", "github": "", "project": "", "reviewers": "RmTg;yrur;uoVk;7PSK", "site": "https://openreview.net/forum?id=rN9tjzY9UD", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;4", "correctness": "3;2;2;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "75;79;38;34", "wc_summary_review": "37;67;28;19", "wc_main_review": "263;445;192;207", "wc_review": "375;591;258;260", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 56.5, 20.59732992404598 ], "wc_summary_review_avg": [ 37.75, 18.046814123273947 ], "wc_main_review_avg": [ 276.75, 100.67863477421612 ], "wc_review_avg": [ 371.0, 135.55994983770097 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=557430182440163092&as_sdt=2005&sciodt=0,5&hl=en",
"gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": "", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UdeM", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "rOGm97YR22N", "title": "Mixed-Memory RNNs for Learning Long-term Dependencies in Irregularly Sampled Time Series", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recurrent neural networks (RNNs) with continuous-time hidden states are a natural fit for modeling irregularly sampled time series. These models, however, face difficulties when the input data possess long-term dependencies. We prove that similar to standard RNNs, the underlying reason for this issue is the vanishing or exploding of the gradient during training. This phenomenon is expressed by the ordinary differential equation (ODE) representation of the hidden state, regardless of the ODE solver's choice. We provide a solution by equipping arbitrary continuous-time networks with a memory compartment separated from its time-continuous state. This way, we encode a continuous-time dynamical flow within the RNN, allowing it to respond to inputs arriving at arbitrary time-lags while ensuring a constant error propagation through the memory path. We call these models Mixed-Memory-RNNs (mmRNNs). We experimentally show that Mixed-Memory-RNNs outperform recently proposed RNN-based counterparts on non-uniformly sampled data with long-term dependencies.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/de61d6d5faf6a52d8a6102db48d65c55f834068d.zip", "author": "Mathias Lechner;Ramin Hasani", "authorids": "~Mathias_Lechner1;~Ramin_Hasani1", "gender": "Unspecified;M", "homepage": "https://mlech26l.github.io/pages/;http://www.raminhasani.com", "dblp": "209/9862;190/3168", "google_scholar": "https://scholar.google.at/citations?hl=en;https://scholar.google.at/citations?user=YarJF3QAAAAJ", "orcid": ";0000-0002-9889-5222", "linkedin": ";raminhasani/", "or_profile": "~Mathias_Lechner1;~Ramin_M._Hasani1", "aff": "Institute of Science and Technology Austria;Massachusetts Institute of Technology", "aff_domain": "ist.ac.at;mit.edu", "position": "PhD student;Researcher", "bibtex": "@misc{\nlechner2022mixedmemory,\ntitle={Mixed-Memory {RNN}s for Learning Long-term Dependencies in Irregularly Sampled Time Series},\nauthor={Mathias Lechner and Ramin Hasani},\nyear={2022},\nurl={https://openreview.net/forum?id=rOGm97YR22N}\n}", "github": "", "project": "", "reviewers": "pULY;6YwU;ypgB;X1Ep;ceHF", "site": "https://openreview.net/forum?id=rOGm97YR22N", "pdf_size": 0, "recommendation": "3;5;5;8;8", "confidence": "4;4;3;3;4", "correctness": "2;3;2;4;3", "technical_novelty": "2;2;3;4;3", "empirical_novelty": "3;3;3;4;3", "wc_summary_paper": "73;60;71;76;116", "wc_summary_review": "27;25;57;54;49", "wc_main_review": "1012;789;384;233;254", "wc_review": "1112;874;512;363;419", "wc_reply_reviewers": "93;0;0;0;72", "wc_reply_authors": "1611;750;414;132;657", "reply_reviewers": "1;0;0;0;1", "reply_authors": "3;1;1;1;2", "recommendation_avg": [ 5.8, 1.9390719429665315 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 3.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 79.2, 19.177069640588993 ], "wc_summary_review_avg": [ 42.4, 13.646977687385585 ], "wc_main_review_avg": [ 534.4, 311.4511839759162 ], "wc_review_avg": [ 656.0, 
289.21064987306397 ], "wc_reply_reviewers_avg": [ 33.0, 40.95851559810243 ], "wc_reply_authors_avg": [ 712.8, 497.56824657528136 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.29475317237328164, "corr_recommendation_correctness": 0.7994108773089582, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1542230171456746965&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Institute of Science and Technology Austria;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ist.ac.at;https://web.mit.edu", "aff_unique_abbr": "IST Austria;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Austria;United States" }, { "id": "rRg0ghtqRw2", "title": "That Escalated Quickly: Compounding Complexity by Editing Levels at the Frontier of Agent Capabilities", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Reinforcement Learning (RL) has recently produced impressive results in a series of settings such as games and robotics. However, a key challenge that limits the utility of RL agents for real-world problems is the agent's limited ability to generalize to unseen variations (or levels). To train more robust agents, the field of Unsupervised Environment Design (UED) seeks to produce a curriculum by updating both the agent and the distribution over training environments. Recent advances in UED have come from promoting levels with high regret, which provides theoretical guarantees in equilibrium and empirically has been shown to produce agents capable of zero-shot transfer to unseen human-designed environments. However, current methods require either learning an environment-generating adversary, which remains a challenging optimization problem, or curating a curriculum from randomly sampled levels, which is ineffective if the search space is too large. In this paper, we instead propose to evolve a curriculum by making edits to previously selected levels. Our approach, which we call Adversarially Compounding Complexity by Editing Levels (ACCEL), produces levels at the frontier of an agent's capabilities, resulting in curricula that start simple but become increasingly complex. 
ACCEL maintains the theoretical benefits of prior works, while outperforming them empirically when transferring to complex out-of-distribution environments.", "keywords": "Reinforcement Learning;Unsupervised Environment Design", "primary_area": "", "supplementary_material": "", "author": "Jack Parker-Holder;Minqi Jiang;Michael D Dennis;Mikayel Samvelyan;Jakob Nicolaus Foerster;Edward Grefenstette;Tim Rockt\u00e4schel", "authorids": "~Jack_Parker-Holder1;~Minqi_Jiang1;~Michael_D_Dennis1;~Mikayel_Samvelyan1;~Jakob_Nicolaus_Foerster1;~Edward_Grefenstette1;~Tim_Rockt\u00e4schel1", "gender": "M;M;M;M;M;M;M", "homepage": "https://jparkerholder.github.io/;https://twitter.com/minqijiang;;https://www.samvelyan.com/;https://www.jakobfoerster.com;http://egrefen.com/;http://rockt.ai", "dblp": "237/9793.html;270/7949;;170/0101;176/5095;http://dblp.uni-trier.de/pers/hd/g/Grefenstette:Edward;43/11537", "google_scholar": ";;WXXu26AAAAAJ;2Qs19WAAAAAJ;6z4lQzMAAAAJ;https://scholar.google.co.uk/citations?user=ezllEwMAAAAJ;https://scholar.google.co.uk/citations?user=mWBY8aIAAAAJ", "orcid": ";;;0009-0001-6748-8755;;;", "linkedin": ";minqi-jiang-585a6536/;;samvelyan;;;rockt/", "or_profile": "~Jack_Parker-Holder1;~Minqi_Jiang1;~Michael_D_Dennis1;~Mikayel_Samvelyan1;~Jakob_Nicolaus_Foerster1;~Edward_Grefenstette1;~Tim_Rocktaeschel1", "aff": "University of Oxford;University College London;University of California, Berkeley;Meta (FAIR);University of Oxford, University of Oxford;Meta Facebook;Facebook AI Research", "aff_domain": "ox.ac.uk;ucl.ac.uk;berkeley.edu;fb.com;eng.ox.ac.uk;fb.com;facebook.com", "position": "PhD student;PhD;PhD student;Research Assistant;Associate Professor;Research Scientist;Manager, Research Scientist", "bibtex": "@misc{\nparker-holder2022that,\ntitle={That Escalated Quickly: Compounding Complexity by Editing Levels at the Frontier of Agent Capabilities},\nauthor={Jack Parker-Holder and Minqi Jiang and Michael D Dennis and Mikayel Samvelyan and Jakob Nicolaus Foerster and Edward Grefenstette and Tim Rockt{\\\"a}schel},\nyear={2022},\nurl={https://openreview.net/forum?id=rRg0ghtqRw2}\n}", "github": "", "project": "", "reviewers": "Sh63;MHvz;Xz9V;dd8g", "site": "https://openreview.net/forum?id=rRg0ghtqRw2", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;3;4", "correctness": "4;2;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;3;0", "wc_summary_paper": "42;74;147;22", "wc_summary_review": "47;36;91;62", "wc_main_review": "221;617;501;525", "wc_review": "310;727;739;609", "wc_reply_reviewers": "178;279;309;552", "wc_reply_authors": "787;552;588;1414", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;3", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 71.25, 47.504605039932706 ], "wc_summary_review_avg": [ 59.0, 20.65187642806338 ], "wc_main_review_avg": [ 466.0, 147.9290370414139 ], "wc_review_avg": [ 596.25, 172.89791062936533 ], "wc_reply_reviewers_avg": [ 329.5, 137.32170258192986 ], "wc_reply_authors_avg": [ 835.25, 345.92005940679417 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12681643027950480238&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;3;0;3;3", "aff_unique_norm": "University of Oxford;University College London;University of California, Berkeley;Meta", "aff_unique_dep": ";;;FAIR", "aff_unique_url": "https://www.ox.ac.uk;https://www.ucl.ac.uk;https://www.berkeley.edu;https://meta.org", "aff_unique_abbr": "Oxford;UCL;UC Berkeley;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;1;1;0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Towards Understanding Generalization via Decomposing Excess Risk Dynamics", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6570", "id": "rS9-7AuPKWK", "poster": "", "openreview": "https://openreview.net/forum?id=rS9-7AuPKWK", "slides": "https://iclr.cc/virtual/2022/poster/6570", "video": "https://iclr.cc/virtual/2022/poster/6570", "author_site": "Jiaye Teng, Jianhao Ma, Yang Yuan", "tldr": "", "abstract": "Generalization is one of the fundamental issues in machine learning. However, traditional techniques like uniform convergence may be unable to explain generalization under overparameterization \\citep{nagarajan2019uniform}. As alternative approaches, techniques based on stability analyze the training dynamics and derive algorithm-dependent generalization bounds. Unfortunately, the stability-based bounds are still far from explaining the surprising generalization in deep learning since neural networks usually suffer from unsatisfactory stability. This paper proposes a novel decomposition framework to improve the stability-based bounds via a more fine-grained analysis of the signal and noise, inspired by the observation that neural networks converge relatively slowly when fitting noise (which indicates better stability). Concretely, we decompose the excess risk dynamics and apply the stability-based bound only on the noise component. The decomposition framework performs well in both linear regimes (overparameterized linear regression) and non-linear regimes (diagonal matrix recovery). 
Experiments on neural networks verify the utility of the decomposition framework.", "keywords": "generalization;excess risk;stability;dynamics", "primary_area": "", "supplementary_material": "", "author": "Jiaye Teng;Jianhao Ma;Yang Yuan", "authorids": "~Jiaye_Teng2;~Jianhao_Ma1;~Yang_Yuan4", "gender": "M;M;M", "homepage": "http://www.tengjiaye.com;https://jianhaoma.github.io/;http://people.iiis.tsinghua.edu.cn/~yuanyang/index.html", "dblp": "266/8187;;", "google_scholar": "NGqfK2wAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0002-4385-5792;;", "linkedin": ";jianhao-ma/;", "or_profile": "~Jiaye_Teng2;~Jianhao_Ma1;~Yang_Yuan4", "aff": "Tsinghua University;University of Michigan;Tsinghua University", "aff_domain": "iiis.tsinghua.edu.cn;umich.edu;tsinghua.edu.cn", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nteng2022towards,\ntitle={Towards Understanding Generalization via Decomposing Excess Risk Dynamics},\nauthor={Jiaye Teng and Jianhao Ma and Yang Yuan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rS9-7AuPKWK}\n}", "github": "", "project": "", "reviewers": "PEVU;HZ9d;MkVE;Pc9J", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;3;2;3", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;0;3;2", "wc_summary_paper": "78;81;61;40", "wc_summary_review": "30;76;50;80", "wc_main_review": "380;278;716;433", "wc_review": "488;435;827;553", "wc_reply_reviewers": "0;47;0;0", "wc_reply_authors": "769;798;365;599", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 65.0, 16.32482771731451 ], "wc_summary_review_avg": [ 59.0, 20.322401432901575 ], "wc_main_review_avg": [ 451.75, 162.41670942363044 ], "wc_review_avg": [ 575.75, 150.95922462704954 ], "wc_reply_reviewers_avg": [ 11.75, 20.351596988934308 ], "wc_reply_authors_avg": [ 632.75, 172.26487599043514 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=577359059581675065&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=rS9-7AuPKWK", "email": "iiis.tsinghua.edu.cn;umich.edu;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tsinghua University;University of Michigan", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.umich.edu", "aff_unique_abbr": "THU;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "id": "rS9t6WH34p", "title": "Decomposing 3D Scenes into Objects via Unsupervised Volume Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present ObSuRF, a method which turns a single image of a scene into a 3D model represented as a set of Neural Radiance Fields (NeRFs), with each NeRF corresponding to a different object. 
A single forward pass of an encoder network outputs a set of latent vectors describing the objects in the scene. These vectors are used independently to condition a NeRF decoder, defining the geometry and appearance of each object. We make learning more computationally efficient by deriving a novel loss, which allows training NeRFs on RGB-D inputs without explicit ray marching. After confirming that the model performs as well as or better than the state of the art on three 2D image segmentation benchmarks, we apply it to two multi-object 3D datasets: A multiview version of CLEVR, and a novel dataset in which scenes are populated by ShapeNet models. We find that after training ObSuRF on RGB-D views of training scenes, it is capable not only of recovering the 3D geometry of a scene depicted in a single input image, but also of segmenting it into objects, despite receiving no supervision in that regard. ", "keywords": "Representation Learning;Unsupervised Object Discovery;Neural Radiance Fields", "primary_area": "", "supplementary_material": "", "author": "Karl Stelzner;Kristian Kersting;Adam R. Kosiorek", "authorids": "~Karl_Stelzner1;~Kristian_Kersting1;~Adam_R._Kosiorek1", "gender": ";M;M", "homepage": ";http://www.ml.informatik.tu-darmstadt.de/;http://akosiorek.github.io/", "dblp": "https://dblp.org/pers/s/Stelzner:Karl;40/3793;202/1842", "google_scholar": "6eyxiGIAAAAJ;QY-earAAAAAJ;https://scholar.google.se/citations?user=i7eVfzwAAAAJ", "orcid": ";0000-0002-2873-9152;", "linkedin": ";;adamkosiorek/", "or_profile": "~Karl_Stelzner1;~Kristian_Kersting1;~Adam_Roman_Kosiorek1", "aff": "TU Darmstadt;TU Darmstadt;Google DeepMind", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de;google.com", "position": "PhD student;Full Professor;Research Scientist", "bibtex": "@misc{\nstelzner2022decomposing,\ntitle={Decomposing 3D Scenes into Objects via Unsupervised Volume Segmentation},\nauthor={Karl Stelzner and Kristian Kersting and Adam R.
Kosiorek},\nyear={2022},\nurl={https://openreview.net/forum?id=rS9t6WH34p}\n}", "github": "", "project": "", "reviewers": "VrrK;bAmB;8UAh;1keq", "site": "https://openreview.net/forum?id=rS9t6WH34p", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;3;2", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "94;113;88;70", "wc_summary_review": "446;62;88;64", "wc_main_review": "178;161;261;277", "wc_review": "718;336;437;411", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "747;340;410;379", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 91.25, 15.35211711784404 ], "wc_summary_review_avg": [ 165.0, 162.5576820700886 ], "wc_main_review_avg": [ 219.25, 50.43002577829998 ], "wc_review_avg": [ 475.5, 144.83525123394512 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 469.0, 162.40843574149713 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12652830788460198402&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.tu-darmstadt.de;https://deepmind.com", "aff_unique_abbr": "TU Darmstadt;DeepMind", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darmstadt;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Germany;United Kingdom" }, { "id": "rSI-tyrv-ni", "title": "Does Entity Abstraction Help Generative Transformers Reason?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-trained language models (LMs) often struggle to reason logically or generalize in a compositional fashion. Recent work suggests that incorporating external entity knowledge can improve language models' abilities to reason and generalize. However, the effect of explicitly providing entity abstraction remains unclear, especially with recent studies suggesting that pre-trained models already encode some of that knowledge in their parameters. In this work, we study the utility of incorporating entity type abstractions into pre-trained Transformers and test these methods on three different NLP tasks requiring different forms of logical reasoning: (1) compositional language understanding with text-based relational reasoning (CLUTRR), (2) multi-hop question answering (HotpotQA), and (3) conversational question answering (CoQA). We propose and empirically explore three different ways to add such abstraction: (i) as additional input embeddings, (ii) as a separate sequence to encode, and (iii) as an auxiliary prediction task for the model. Overall, our analysis demonstrates that models with abstract entity knowledge perform slightly better than those without it. However, our experiments also show that the benefits strongly depend on the technique used and the task at hand. 
The best abstraction-aware model achieved an overall accuracy of 88.8% on CLUTRR, compared to 62.3% for the baseline model. In addition, abstraction-aware models showed improved compositional generalization in both interpolation and extrapolation settings. However, for HotpotQA and CoQA, we find that F1 scores improve by only 0.5% on average. Our results suggest that the benefits of explicit abstraction could be very significant in formally defined logical reasoning settings such as CLUTRR, but point to the notion that explicit abstraction is likely less beneficial for NLP tasks with less formal logical structure.", "keywords": "Transformers;reasoning;compositional generalization;entity type;abstraction", "primary_area": "", "supplementary_material": "", "author": "Nicolas Gontier;Siva Reddy;Christopher Pal", "authorids": "~Nicolas_Gontier1;~Siva_Reddy1;~Christopher_Pal1", "gender": "M;;M", "homepage": "http://sivareddy.in;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao;https://sites.google.com/view/nicolasag", "dblp": "64/8153;45/1217;203/9460", "google_scholar": ";https://scholar.google.ca/citations?user=1ScWJOoAAAAJ;https://scholar.google.ca/citations?user=KNaO2qUAAAAJ", "orcid": ";;", "linkedin": ";;nicolasgontier/", "or_profile": "~Siva_Reddy1;~Christopher_Pal1;~Nicolas_A._Gontier1", "aff": "Mila, McGill University;Polytechnique Montreal;Polytechnique Montreal", "aff_domain": "mila.quebec;polymtl.ca;polymtl.ca", "position": "Assistant Professor;Full Professor;PhD student", "bibtex": "@misc{\ngontier2022does,\ntitle={Does Entity Abstraction Help Generative Transformers Reason?},\nauthor={Nicolas Gontier and Siva Reddy and Christopher Pal},\nyear={2022},\nurl={https://openreview.net/forum?id=rSI-tyrv-ni}\n}", "github": "", "project": "", "reviewers": "pDcD;r8rD;oWCK;sWnL", "site": "https://openreview.net/forum?id=rSI-tyrv-ni", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;3", "correctness": "3;2;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "65;57;80;43", "wc_summary_review": "94;53;36;29", "wc_main_review": "152;112;304;82", "wc_review": "311;222;420;154", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "463;404;579;202", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 61.25, 13.386093530227555 ], "wc_summary_review_avg": [ 53.0, 25.228951623085727 ], "wc_main_review_avg": [ 162.5, 85.38588876389353 ], "wc_review_avg": [ 276.75, 99.69798142389845 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 412.0, 136.61442090789683 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14095982615363159503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "McGill University;Polytechnique Montreal", "aff_unique_dep": "Mila;", "aff_unique_url": "https://www.mcgill.ca;https://www.polymtl.ca", "aff_unique_abbr": "McGill;PolyMTL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Montreal", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Eigencurve: Optimal Learning Rate Schedule for SGD on Quadratic Objectives with Skewed Hessian Spectrums", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6626", "id": "rTAclwH46Tb", "poster": "", "openreview": "https://openreview.net/forum?id=rTAclwH46Tb", "slides": "https://iclr.cc/virtual/2022/poster/6626", "video": "https://iclr.cc/virtual/2022/poster/6626", "author_site": "Rui Pan, Haishan Ye, Tong Zhang", "tldr": "", "abstract": "Learning rate schedulers have been widely adopted in training deep neural networks. Despite their practical importance, there is a discrepancy between its practice and its theoretical analysis. For instance, it is not known what schedules of SGD achieve best convergence, even for simple problems such as optimizing quadratic objectives. In this paper, we propose Eigencurve, the first family of learning rate schedules that can achieve minimax optimal convergence rates (up to a constant) for SGD on quadratic objectives when the eigenvalue distribution of the underlying Hessian matrix is skewed. The condition is quite common in practice. Experimental results show that Eigencurve can significantly outperform step decay in image classification tasks on CIFAR-10, especially when the number of epochs is small. Moreover, the theory inspires two simple learning rate schedulers for practical applications that can approximate eigencurve.\n For some problems, the optimal shape of the proposed schedulers resembles that of cosine decay, which sheds light to the success of cosine decay for such situations. For other situations, the proposed schedulers are superior to cosine decay.", "keywords": "optimization;learning rate schedule;optimal convergence rate", "primary_area": "", "supplementary_material": "", "author": "Rui Pan;Haishan Ye;Tong Zhang", "authorids": "~Rui_Pan4;~Haishan_Ye2;~Tong_Zhang2", "gender": "M;M;M", "homepage": ";;http://tongzhang-ml.org", "dblp": "74/9957;162/0002.html;07/4227-1", "google_scholar": ";;LurWtuYAAAAJ", "orcid": "0000-0001-7217-0656;;0000-0002-5511-2558", "linkedin": ";;", "or_profile": "~Rui_Pan4;~Haishan_Ye2;~Tong_Zhang2", "aff": "Hong Kong University of Science and Technology;;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;;ust.hk", "position": "MS student;;Full Professor", "bibtex": "@inproceedings{\npan2022eigencurve,\ntitle={Eigencurve: Optimal Learning Rate Schedule for {SGD} on Quadratic Objectives with Skewed Hessian Spectrums},\nauthor={Rui Pan and Haishan Ye and Tong Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rTAclwH46Tb}\n}", "github": "", "project": "", "reviewers": "qo4b;vptL;7X3r;obgH", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;2;4;3", "empirical_novelty": "3;2;1;3", "wc_summary_paper": "113;93;224;53", "wc_summary_review": "58;61;159;51", "wc_main_review": "448;404;1116;315", "wc_review": "619;558;1499;419", "wc_reply_reviewers": "1576;37;298;45", "wc_reply_authors": "5097;1165;2624;807", "reply_reviewers": "5;1;1;1", "reply_authors": "9;3;5;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 120.75, 
63.404948545046544 ], "wc_summary_review_avg": [ 82.25, 44.45995389111419 ], "wc_main_review_avg": [ 570.75, 318.4253248408487 ], "wc_review_avg": [ 773.75, 424.95021767261164 ], "wc_reply_reviewers_avg": [ 489.0, 636.295921721961 ], "wc_reply_authors_avg": [ 2423.25, 1687.0608132192508 ], "reply_reviewers_avg": [ 2.0, 1.7320508075688772 ], "reply_authors_avg": [ 4.75, 2.680951323690902 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11376476558105496833&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=rTAclwH46Tb", "email": "ust.hk;;ust.hk", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "rUPMwMfrVvb", "title": "Improving Discriminative Visual Representation Learning via Automatic Mixup", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Mixup, a convex interpolation technique for data augmentation, has achieved great success in deep neural networks. However, the community usually confines it to supervised scenarios or applies it as a predefined augmentation strategy in various fields, grossly underestimating its capacity for modeling relationships between two classes or instances. In this paper, we decompose mixup into two sub-tasks of mixup generation and classification and formulate it for discriminative representations as class- and instance-level mixup. We first analyze and summarize the properties of instance-level mixup as local smoothness and global discrimination. Then, we improve mixup generation with these properties from two aspects: we enhance the modeling of non-linear mixup relationships between two samples and discuss learning objectives for mixup generation. Finally, we propose a general mixup training method called AMix to improve discriminative representations in various scenarios. Extensive experiments in supervised and self-supervised scenarios show that AMix consistently outperforms leading methods by a large margin.", "keywords": "representation learning;classification;mixup", "primary_area": "", "supplementary_material": "", "author": "Siyuan Li;Zicheng Liu;Di Wu;Stan Z.
Li", "authorids": "~Siyuan_Li6;~Zicheng_Liu2;~Di_Wu10;~Stan_Z._Li2", "gender": "M;M;M;M", "homepage": "https://lupin1998.github.io/;;;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "63/9705-2;l/ZichengLiu-6;;l/StanZLi", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0001-6806-2468;;;", "linkedin": "https://www.linkedin.cn/incareer/in/siyuan-li-lupin1998/;;;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Siyuan_Li6;~Zicheng_Liu2;~Di_Wu10;~Stan_Z._Li1", "aff": "Westlake University & Zhejiang University;Zhejiang University;Westlake University;Westlake University", "aff_domain": "westlake.edu.cn;zju.edu.cn;westlake.edu.cn;westlake.edu.cn", "position": "PhD student;PhD student;PhD student;Chair Professor", "bibtex": "@misc{\nli2022improving,\ntitle={Improving Discriminative Visual Representation Learning via Automatic Mixup},\nauthor={Siyuan Li and Zicheng Liu and Di Wu and Stan Z. Li},\nyear={2022},\nurl={https://openreview.net/forum?id=rUPMwMfrVvb}\n}", "github": "", "project": "", "reviewers": "LBpz;VyAr;YpNa", "site": "https://openreview.net/forum?id=rUPMwMfrVvb", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;3;3", "wc_summary_paper": "52;34;79", "wc_summary_review": "33;58;101", "wc_main_review": "360;456;183", "wc_review": "445;548;363", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 55.0, 18.49324200890693 ], "wc_summary_review_avg": [ 64.0, 28.083209693100727 ], "wc_main_review_avg": [ 333.0, 113.07519621915321 ], "wc_review_avg": [ 452.0, 75.68795588907568 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7oVnZul3eR4J:scholar.google.com/&scioq=Improving+Discriminative+Visual+Representation+Learning+via+Automatic+Mixup&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Westlake University;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.westlake.edu.cn;https://www.zju.edu.cn", "aff_unique_abbr": "WU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "In a Nutshell, the Human Asked for This: Latent Goals for Following Temporal Specifications", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7075", "id": "rUwm9wCjURV", "poster": "", "openreview": "https://openreview.net/forum?id=rUwm9wCjURV", "slides": "https://iclr.cc/virtual/2022/poster/7075", "video": "https://iclr.cc/virtual/2022/poster/7075", "author_site": "Borja G. 
Leon, Murray Shanahan, Francesco Belardinelli", "tldr": "", "abstract": "We address the problem of building agents whose goal is to learn to execute out-of-distribution (OOD) multi-task instructions expressed in temporal logic (TL) by using deep reinforcement learning (DRL). Recent works provided evidence that the agent's neural architecture is a key feature when DRL agents are learning to solve OOD tasks in TL. Yet, the studies on this topic are still in their infancy. In this work, we propose a new deep learning configuration with inductive biases that lead agents to generate latent representations of their current goal, yielding a stronger generalization performance. We use these latent-goal networks within a neuro-symbolic framework that executes multi-task formally-defined instructions and contrast the performance of the proposed neural networks against employing different state-of-the-art (SOTA) architectures when generalizing to unseen instructions in OOD environments. ", "keywords": "Deep Reinforcement Learning;Out-Of-Distribution Generalisation;Temporal Logic", "primary_area": "", "supplementary_material": "", "author": "Borja G. Le\u00f3n;Murray Shanahan;Francesco Belardinelli", "authorids": "~Borja_G._Le\u00f3n1;~Murray_Shanahan1;~Francesco_Belardinelli1", "gender": "M;M;M", "homepage": "https://www.doc.ic.ac.uk/~bg19/;https://www.doc.ic.ac.uk/~mpsha/;https://www.doc.ic.ac.uk/~fbelard/", "dblp": "259/1299;11/5268;59/2916", "google_scholar": "https://scholar.google.es/citations?user=sJiadiMAAAAJ;https://scholar.google.co.uk/citations?user=00bnGpAAAAAJ;https://scholar.google.fr/citations?user=Mr35r1EAAAAJ", "orcid": ";0000-0001-5984-2964;0000-0002-7768-1794", "linkedin": "borja-gonzalez-leon/;;", "or_profile": "~Borja_G._Le\u00f3n1;~Murray_Shanahan1;~Francesco_Belardinelli1", "aff": "Meta Facebook;Imperial College London;Imperial College London", "aff_domain": "fb.com;;imperial.ac.uk", "position": "Intern;Full Professor;Lecturer", "bibtex": "@inproceedings{\nle{\\'o}n2022in,\ntitle={In a Nutshell, the Human Asked for This: Latent Goals for Following Temporal Specifications},\nauthor={Borja G.
Le{\\'o}n and Murray Shanahan and Francesco Belardinelli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rUwm9wCjURV}\n}", "github": "", "project": "", "reviewers": "FvKn;2S8A;bGYm;Ymus", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;3;4", "correctness": "2;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;3;3;4", "wc_summary_paper": "13;179;149;123", "wc_summary_review": "34;103;40;75", "wc_main_review": "276;335;160;369", "wc_review": "323;617;349;567", "wc_reply_reviewers": "344;0;0;0", "wc_reply_authors": "1630;923;446;1243", "reply_reviewers": "1;0;0;0", "reply_authors": "4;4;1;3", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 116.0, 62.68173577685928 ], "wc_summary_review_avg": [ 63.0, 27.90161285660741 ], "wc_main_review_avg": [ 285.0, 79.47012017104291 ], "wc_review_avg": [ 464.0, 129.5414991421668 ], "wc_reply_reviewers_avg": [ 86.0, 148.95636945092346 ], "wc_reply_authors_avg": [ 1060.5, 434.20991467261547 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.49374193110101877, "corr_recommendation_correctness": 0.9945577827230725, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14969448845199870512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=rUwm9wCjURV", "email": "fb.com;;imperial.ac.uk", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Meta;Imperial College London", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.imperial.ac.uk", "aff_unique_abbr": "Meta;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "AdaAug: Learning Class- and Instance-adaptive Data Augmentation Policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6420", "id": "rWXfFogxRJN", "poster": "", "openreview": "https://openreview.net/forum?id=rWXfFogxRJN", "slides": "https://iclr.cc/virtual/2022/poster/6420", "video": "https://iclr.cc/virtual/2022/poster/6420", "author_site": "Tsz Him Cheung, Dit-Yan Yeung", "tldr": "", "abstract": "Data augmentation is an effective way to improve the generalization capability of modern deep learning models. However, the underlying augmentation methods mostly rely on handcrafted operations. Moreover, an augmentation policy useful to one dataset may not transfer well to other datasets. Therefore, Automated Data Augmentation (AutoDA) methods, like \\textit{AutoAugment} and \\textit{Population-based Augmentation}, have been proposed recently to automate the process of searching for optimal augmentation policies. However, the augmentation policies found are not adaptive to the dataset used, hindering the effectiveness of these AutoDA methods. In this paper, we propose a novel AutoDA method called \\texttt{AdaAug} to efficiently learn adaptive augmentation policies in a class-dependent and potentially instance-dependent manner. 
Our experiments show that the adaptive augmentation policies learned by our method transfer well to unseen datasets such as the Oxford Flowers, Oxford-IIIT Pets, FGVC Aircraft, and Stanford Cars datasets when compared with other AutoDA baselines. In addition, our method also achieves state-of-the-art performance on the CIFAR-10, CIFAR-100, and SVHN datasets.", "keywords": "Data Augmentation;Automated Data Augmentation", "primary_area": "", "supplementary_material": "/attachment/d21ee9d1692e0095d4a647a463a633089b02ff82.zip", "author": "Tsz-Him Cheung;Dit-Yan Yeung", "authorids": "~Tsz-Him_Cheung1;~Dit-Yan_Yeung2", "gender": "M;M", "homepage": ";https://cse.hkust.edu.hk/faculty/dyyeung/", "dblp": "295/5321;41/5668", "google_scholar": ";nEsOOx8AAAAJ", "orcid": "0000-0002-3600-2927;0000-0003-3716-8125", "linkedin": ";", "or_profile": "~Tsz-Him_Cheung1;~Dit-Yan_Yeung2", "aff": "Hong Kong University of Science and Technology;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;ust.hk", "position": "PhD student;Chair Professor", "bibtex": "@inproceedings{\ncheung2022adaaug,\ntitle={AdaAug: Learning Class- and Instance-adaptive Data Augmentation Policies},\nauthor={Tsz-Him Cheung and Dit-Yan Yeung},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rWXfFogxRJN}\n}", "github": "", "project": "", "reviewers": "xMkp;kS7y;xQTc;gmjs", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;4;5", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "86;62;64;53", "wc_summary_review": "57;70;36;33", "wc_main_review": "459;302;74;573", "wc_review": "602;434;174;659", "wc_reply_reviewers": "176;247;0;64", "wc_reply_authors": "790;1566;26;452", "reply_reviewers": "1;2;0;1", "reply_authors": "2;3;1;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 66.25, 12.132085558550928 ], "wc_summary_review_avg": [ 49.0, 15.247950681976906 ], "wc_main_review_avg": [ 352.0, 187.13230613659417 ], "wc_review_avg": [ 467.25, 188.43218276080125 ], "wc_reply_reviewers_avg": [ 121.75, 95.90196817584089 ], "wc_reply_authors_avg": [ 708.5, 564.257698219528 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14967598855528469734&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=rWXfFogxRJN", "email": "ust.hk;ust.hk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "rX3rZYP8zZF", "title": "CareGraph: A Graph-based Recommender System for Diabetes Self-Care", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we build a knowledge graph that captures key attributes of content and notifications in a digital health platform for diabetes management.
We propose a Deep Neural Network-based recommender that uses knowledge graph embeddings to recommend health nudges for maximizing engagement by combating the cold-start and sparsity problems. We use a leave-one-out approach to evaluate the model. We compare the proposed model's performance with text-similarity and Deep-and-Cross-Network-based approaches as baselines. The overall improvement in Click-Through-Rate prediction AUC for the Knowledge-Graph-based model was 11%. We also observe that our model improved the average AUC by 5% in cold-start situations. ", "keywords": "knowledge graph;knowledge graph embedding;recommendation system", "primary_area": "", "supplementary_material": "", "author": "Sirinart Tangruamsub;Karthik Kappaganthu;John O'Donovan;Anmol Madan", "authorids": "~Sirinart_Tangruamsub1;~Karthik_Kappaganthu1;jodonovan@teladochealth.com;anmol.madan@teladochealth.com", "gender": ";;;", "homepage": ";;;", "dblp": "23/7408;;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Sirinart_Tangruamsub1;~Karthik_Kappaganthu1;jodonovan@teladochealth.com;anmol.madan@teladochealth.com", "aff": "Teladoc Health;;;", "aff_domain": "teladochealth.com;;;", "position": "Applied machine learning scientist;;;", "bibtex": "@misc{\ntangruamsub2022caregraph,\ntitle={CareGraph: A Graph-based Recommender System for Diabetes Self-Care},\nauthor={Sirinart Tangruamsub and Karthik Kappaganthu and John O'Donovan and Anmol Madan},\nyear={2022},\nurl={https://openreview.net/forum?id=rX3rZYP8zZF}\n}", "github": "", "project": "", "reviewers": "qBZ3;ScTh;zvjx;r8hu", "site": "https://openreview.net/forum?id=rX3rZYP8zZF", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;5", "correctness": "3;2;2;2", "technical_novelty": "2;1;2;2", "empirical_novelty": "3;1;2;2", "wc_summary_paper": "34;39;50;60", "wc_summary_review": "35;20;52;195", "wc_main_review": "146;272;690;209", "wc_review": "215;331;792;464", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 45.75, 10.059199769365355 ], "wc_summary_review_avg": [ 75.5, 69.91602105383286 ], "wc_main_review_avg": [ 329.25, 212.98987651998863 ], "wc_review_avg": [ 450.5, 215.95427756819265 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6834145486590060977&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Teladoc Health", "aff_unique_dep": "", "aff_unique_url": "https://www.teladochealth.com", "aff_unique_abbr": "Teladoc", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "rYzcqIR5Uq-", "title": "Burst Image Restoration and Enhancement", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Modern handheld devices can acquire burst image sequences in quick succession.
However, the individual acquired frames suffer from multiple degradations and are misaligned due to camera shake and object motions. The goal of Burst Image Restoration is to effectively combine complementary cues across multiple burst frames to generate high-quality outputs. Towards this goal, we develop a novel approach by solely focusing on the effective information exchange between burst frames, such that the degradations get filtered out while the actual scene details are preserved and enhanced. Our central idea is to create a set of pseudo-burst features that combine complementary information from all the input burst frames to seamlessly exchange information. The pseudo-burst representations encode channel-wise features from the original burst images, thus making it easier for the model to learn distinctive information offered by multiple burst frames. However, the pseudo-burst cannot be successfully created unless the individual burst frames are properly aligned to discount inter-frame movements. Therefore, our approach initially extracts preprocessed features from each burst frame and matches them using an edge-boosting burst alignment module. The pseudo-burst features are then created and enriched using multi-scale contextual information. Our final step is to adaptively aggregate information from the pseudo-burst features to progressively increase resolution in multiple stages while merging the pseudo-burst features. In comparison to existing works that usually follow a late fusion scheme with single-stage upsampling, our approach performs favorably, delivering state-of-the-art performance on burst super-resolution and low-light image enhancement tasks. Our codes and models will be publicly released.", "keywords": "Burst super-resolution;multi-frame processing;feature alignment;burst image enhancement", "primary_area": "", "supplementary_material": "", "author": "Akshay Dudhane;Syed Waqas Zamir;Salman Khan;Fahad Khan;Ming-Hsuan Yang", "authorids": "~Akshay_Dudhane2;~Syed_Waqas_Zamir2;~Salman_Khan4;~Fahad_Khan1;~Ming-Hsuan_Yang1", "gender": "M;M;M;M;M", "homepage": "https://www.ival-mbzuai.com/about;https://salman-h-khan.github.io/;https://sites.google.com/view/fahadkhans/home;https://faculty.ucmerced.edu/mhyang/;", "dblp": "213/7979;32/11535-1;05/8618;79/3711.html;140/7811", "google_scholar": "BG_XEmkAAAAJ;https://scholar.google.es/citations?user=M59O9lkAAAAJ;zvaeYnUAAAAJ;p9-ohHsAAAAJ;POoai-QAAAAJ", "orcid": ";0000-0002-9502-1749;;0000-0003-4848-2304;", "linkedin": ";;;minghsuanyang/;", "or_profile": "~Akshay_Dudhane2;~Salman_Khan4;~Fahad_Khan1;~Ming-Hsuan_Yang1;~Syed_Waqas_Zamir1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Link\u00f6ping University;University of California at Merced;Inception Institute of Artificial Intelligence", "aff_domain": "mbzuai.ac.ae;anu.edu.au;liu.se;umcerced.edu;inceptioniai.org", "position": "Postdoc;Lecturer;Associate Professor;Professor;Researcher", "bibtex": "@misc{\ndudhane2022burst,\ntitle={Burst Image Restoration and Enhancement},\nauthor={Akshay Dudhane and Syed Waqas Zamir and Salman Khan and Fahad Khan and Ming-Hsuan Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=rYzcqIR5Uq-}\n}", "github": "", "project": "", "reviewers": "htZ7;8YpW;Esb6;6jCJ", "site": "https://openreview.net/forum?id=rYzcqIR5Uq-", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;4", "correctness": "3;1;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;2;2;3", "wc_summary_paper": 
"178;42;62;17", "wc_summary_review": "77;21;38;15", "wc_main_review": "571;248;165;217", "wc_review": "826;311;265;249", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 74.75, 61.70646238442129 ], "wc_summary_review_avg": [ 37.75, 24.180312239505923 ], "wc_main_review_avg": [ 300.25, 159.1059002677148 ], "wc_review_avg": [ 412.75, 239.67308463822133 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2235176340185033730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Link\u00f6ping University;University of California, Merced;Inception Institute of Artificial Intelligence", "aff_unique_dep": ";;;;", "aff_unique_url": "https://mbzuai.ac.ae;https://www.anu.edu.au;https://www.liu.se;https://www.ucmerced.edu;https://www.inceptioniai.org", "aff_unique_abbr": "MBZUAI;ANU;LiU;UC Merced;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;1;2;3;0", "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States" }, { "id": "rbFPSQHlllm", "title": "AutoMO-Mixer: An automated multi-objective multi-layer perspecton Mixer model for medical image based diagnosis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Medical image based diagnosis is one of the most challenging things which is vital to human life. Accurately identifying the patient's status through medical images plays an important role in treatment of diseases. Deep learning has achieved great success in medical image analysis. Particularly, Convolutional neural network CNN) can obtain promising performance by learning the features in a supervised way. However, since there are too many parameters to train, CNN always requires a large scale dataset to feed, while it is very difficult to collect the required amount of patient images for a particular clinical problem. Recently, MLP-Mixer (Mixer) which is developed based multiple layer perceptron (MLP) was proposed, in which the number of training parameters is greatly decreased by removing convolutions in the architecture, while it can achieve the similar performance with CNN. Furthermore, obtaining the balanced outcome between sensitivity and specificity is of great importance in patient's status identification. As such, a new automated multi-objective Mixer (AutoMO-Mixer) model was developed in this study. In AutoMO-Mixer, sensitivity and specificity were considered as the objective functions simultaneously to train the model and a Pareto-optimal Mixer model set can be obtained in the training stage. Additionally, since there are several hyperparameters to train, the Bayesian optimization was introduced. 
To obtain more reliable results in the testing stage, the final output was obtained by fusing the output probabilities of Pareto-optimal models through the evidence reasoning (ER) approach. The experimental study demonstrated that AutoMO-Mixer can obtain better performance than Mixer and CNN.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/cff1fb92e98cf3dc67de2d50facc1cfd8d79ee7b.zip", "author": "Xi Chen;Jiahuan Lv;Xuanqing Mou;Zhiguo Zhou", "authorids": "~Xi_Chen24;~Jiahuan_Lv1;xqmou@mail.xjtu.edu.cn;~Zhiguo_Zhou2", "gender": ";;;M", "homepage": "http://gr.xjtu.edu.cn/web/candy;https://mail.qq.com;;", "dblp": ";;;", "google_scholar": ";;;bjClJewAAAAJ", "orcid": ";;;", "linkedin": ";;;zhiguo-zhou-051b8baa/", "or_profile": "~Xi_Chen24;~Jiahuan_Lv1;xqmou@mail.xjtu.edu.cn;~Zhiguo_Zhou2", "aff": "Xi'an Jiaotong University;Xi'an Jiaotong University;;University of Central Missouri", "aff_domain": "xjtu.edu.cn;xjtu.edu.cn;;ucmo.edu", "position": "Associate Professor;MS student;;Assistant Professor", "bibtex": "@misc{\nchen2022automomixer,\ntitle={Auto{MO}-Mixer: An automated multi-objective multi-layer perspecton Mixer model for medical image based diagnosis},\nauthor={Xi Chen and Jiahuan Lv and Xuanqing Mou and Zhiguo Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=rbFPSQHlllm}\n}", "github": "", "project": "", "reviewers": "Bur6;SKkc;rxpX;5jFR", "site": "https://openreview.net/forum?id=rbFPSQHlllm", "pdf_size": 0, "recommendation": "1;1;3;3", "confidence": "5;4;4;4", "correctness": "1;2;2;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;1;2;2", "wc_summary_paper": "37;43;87;157", "wc_summary_review": "33;15;45;25", "wc_main_review": "778;100;638;548", "wc_review": "848;158;770;730", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 81.0, 47.93745925682754 ], "wc_summary_review_avg": [ 29.5, 10.988630487917955 ], "wc_main_review_avg": [ 516.0, 253.77549133042774 ], "wc_review_avg": [ 626.5, 273.79691378830404 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3v_k1DerMC4J:scholar.google.com/&scioq=AutoMO-Mixer:+An+automated+multi-objective+multi-layer+perspecton+Mixer+model+for+medical+image+based+diagnosis&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Xi'an Jiao Tong University;University of Central Missouri", "aff_unique_dep": ";", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.ucmo.edu", "aff_unique_abbr": "XJTU;UCM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "id": "rbPg0zkHGi", "title": "Deep Active Learning with Noise Stability", "track": "main", "status": "Reject", "tldr": "", "abstract": "Uncertainty estimation for unlabeled data is crucial to active learning.
With a deep neural network employed as the backbone model, the data selection process is highly challenging due to the potential over-confidence of the model inference. Existing methods usually resort to multi-pass model training or adversarial training to handle this challenge, resulting in complex and inefficient pipelines, which prevent deployment in practice. To address this issue, in this work we propose a novel Single-Training Multi-Inference algorithm that leverages noise stability to estimate data uncertainty. Specifically, uncertainty is measured by the degree to which the output deviates from the original observation when the model parameters are randomly perturbed by noise. We provide a theoretical analysis of using small Gaussian noise, showing that our method has a solid connection with the classical theory of variance reduction, i.e., labelling a data sample of higher uncertainty, indicated by the inverse noise stability, contributes more to reducing the variance of existing training samples. Despite its simplicity and efficiency, our method outperforms the state-of-the-art active learning baselines in image classification and semantic segmentation tasks.", "keywords": "deep learning;active learning;noise stability", "primary_area": "", "supplementary_material": "/attachment/13d54edf77d50982ee1dd0484497a868f2f53b88.zip", "author": "Xingjian Li;Pengkun Yang;Tianyang Wang;Min Xu;Dejing Dou;Cheng-zhong Xu", "authorids": "~Xingjian_Li1;~Pengkun_Yang1;~Tianyang_Wang1;~Min_Xu4;~Dejing_Dou3;~Cheng-zhong_Xu1", "gender": "M;M;M;;;", "homepage": ";;https://wangt0716.github.io/;;;", "dblp": "79/8061-2;139/0917;;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;fXy1pfcAAAAJ;QbTV0r0AAAAJ;;;", "orcid": ";;0000-0003-3184-0566;;;", "linkedin": ";;tianyang-wang-03a86a4a/;;;", "or_profile": "~Xingjian_Li1;~Pengkun_Yang1;~Tianyang_Wang1;~Min_Xu4;~Dejing_Dou3;~Cheng-zhong_Xu1", "aff": "Baidu;Tsinghua University;Austin Peay State University;;;", "aff_domain": "baidu.com;tsinghua.edu.cn;apsu.edu;;;", "position": "Senior Researcher;Assistant Professor;Assistant Professor;;;", "bibtex": "@misc{\nli2022deep,\ntitle={Deep Active Learning with Noise Stability},\nauthor={Xingjian Li and Pengkun Yang and Tianyang Wang and Min Xu and Dejing Dou and Cheng-zhong Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=rbPg0zkHGi}\n}", "github": "", "project": "", "reviewers": "pooj;owNu;9WDB", "site": "https://openreview.net/forum?id=rbPg0zkHGi", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;3;4", "correctness": "1;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;0;3", "wc_summary_paper": "49;90;90", "wc_summary_review": "307;101;57", "wc_main_review": "63;1045;172", "wc_review": "419;1236;319", "wc_reply_reviewers": "246;121;0", "wc_reply_authors": "691;1345;522", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 76.33333333333333, 19.3275853524323 ], "wc_summary_review_avg": [ 155.0, 108.97094413955799 ], "wc_main_review_avg": [ 426.6666666666667, 439.4863163082808 ], "wc_review_avg": [ 658.0, 410.7416057166192 ], "wc_reply_reviewers_avg": [ 122.33333333333333, 100.43350481233287 ], "wc_reply_authors_avg": [ 852.6666666666666,
354.9031166702009 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6546536707079772, "corr_recommendation_correctness": 0.944911182523068, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6872715215088998908&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Baidu;Tsinghua University;Austin Peay State University", "aff_unique_dep": "Baidu, Inc.;;", "aff_unique_url": "https://www.baidu.com;https://www.tsinghua.edu.cn;https://www.apsu.edu", "aff_unique_abbr": "Baidu;THU;APSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "id": "rbv-uYT1zR", "title": "Coherence-Based Document Clustering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Latent Dirichlet Allocation and Non-negative Matrix Factorization are just two widely used algorithms for extracting latent topics from large text corpora. While these algorithms differ in their modeling approach, they have in common that hyperparameter optimization is difficult and is mainly achieved by maximizing the extracted topic coherence scores via a grid search. Models using word-document embeddings can automatically detect the number of latent topics, but tend to have problems with smaller datasets and often require pre-trained embedding layers for successful topic extraction. We leverage widely used coherence scores by integrating them into a novel document-level clustering approach using keyword extraction methods. The metric by which most topic extraction methods optimize their hyperparameters is thus optimized during clustering, resulting in ultra-coherent clusters. Moreover, unlike traditional methods, the number of extracted topics or clusters does not need to be determined in advance, saving us an additional optimization step and a time- and computationally-intensive grid search.
Additionally, the number of topics is detected much more accurately than by models leveraging word-document embeddings.", "keywords": "Topic Modeling;LDA;Transformers;Coherence;Document Clustering", "primary_area": "", "supplementary_material": "", "author": "Anton Frederik Thielmann;Christoph Weisser;Thomas Kneib;Benjamin Saefken", "authorids": "~Anton_Frederik_Thielmann1;kontakt@christoph-weisser.de;tkneib@uni-goettingen.de;benjamin.saefken@uni-goettingen.de", "gender": "M;;;", "homepage": "https://www.uni-goettingen.de/de/645829.html;;;", "dblp": "307/3231;;;", "google_scholar": "https://scholar.google.de/citations?user=xgsIa7QAAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Anton_Frederik_Thielmann1;kontakt@christoph-weisser.de;tkneib@uni-goettingen.de;benjamin.saefken@uni-goettingen.de", "aff": "Georg-August Universit\u00e4t G\u00f6ttingen;;;", "aff_domain": "uni-goettingen.de;;;", "position": "PhD student;;;", "bibtex": "@misc{\nthielmann2022coherencebased,\ntitle={Coherence-Based Document Clustering},\nauthor={Anton Frederik Thielmann and Christoph Weisser and Thomas Kneib and Benjamin Saefken},\nyear={2022},\nurl={https://openreview.net/forum?id=rbv-uYT1zR}\n}", "github": "", "project": "", "reviewers": "6cSV;4jST;HKJK", "site": "https://openreview.net/forum?id=rbv-uYT1zR", "pdf_size": 0, "recommendation": "1;1;3", "confidence": "5;4;4", "correctness": "1;3;2", "technical_novelty": "1;2;2", "empirical_novelty": "1;1;1", "wc_summary_paper": "56;67;160", "wc_summary_review": "30;21;79", "wc_main_review": "455;361;224", "wc_review": "541;449;463", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.0, 0.0 ], "wc_summary_paper_avg": [ 94.33333333333333, 46.64999702274612 ], "wc_summary_review_avg": [ 43.333333333333336, 25.48637980482037 ], "wc_main_review_avg": [ 346.6666666666667, 94.84841707810531 ], "wc_review_avg": [ 484.3333333333333, 40.47495796442811 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3676903102301676631&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Georg-August Universit\u00e4t G\u00f6ttingen", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-goettingen.de", "aff_unique_abbr": "GAU", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "id": "rczz7TUKIIB", "title": "Loss meta-learning for forecasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Meta-learning of loss functions for supervised learning has been used to date for classification tasks, or as a way to enable few-shot learning. In this paper, we show how a fairly simple loss meta-learning approach can substantially improve regression results. 
Specifically, we target forecasting of time series and explore case studies grounded in real-world data, and show that meta-learned losses can benefit prediction quality both in cases that are apparently naive and in practical scenarios where the performance metric is complex, time-correlated, non-differentiable, or not known a priori.", "keywords": "meta-learning;loss function;forecasting;learning to learn", "primary_area": "", "supplementary_material": "", "author": "Alan Collet;Antonio Bazco-Nogueras;Albert Banchs;Marco Fiore", "authorids": "alan.collet@imdea.org;~Antonio_Bazco-Nogueras1;albert.banchs@imdea.org;marco.fiore@imdea.org", "gender": ";;;", "homepage": ";https://networks.imdea.org/team/imdea-networks-team/people/antonio-bazco-nogueras/;;", "dblp": ";195/5765;;", "google_scholar": ";ZWz1nF0AAAAJ;;", "orcid": ";0000-0001-7367-0898;;", "linkedin": ";antonio-bazco-nogueras/;;", "or_profile": "alan.collet@imdea.org;~Antonio_Bazco-Nogueras1;albert.banchs@imdea.org;marco.fiore@imdea.org", "aff": ";IMDEA Networks Institute;;", "aff_domain": ";imdea.org;;", "position": ";Postdoc;;", "bibtex": "@misc{\ncollet2022loss,\ntitle={Loss meta-learning for forecasting},\nauthor={Alan Collet and Antonio Bazco-Nogueras and Albert Banchs and Marco Fiore},\nyear={2022},\nurl={https://openreview.net/forum?id=rczz7TUKIIB}\n}", "github": "", "project": "", "reviewers": "m1fA;9TL4;qQ83;5PNy", "site": "https://openreview.net/forum?id=rczz7TUKIIB", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "4;4;5;3", "correctness": "2;2;1;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "1;1;1;3", "wc_summary_paper": "48;79;99;127", "wc_summary_review": "17;40;216;43", "wc_main_review": "318;452;621;470", "wc_review": "383;571;936;640", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 88.25, 28.82164984868146 ], "wc_summary_review_avg": [ 79.0, 79.73393255070265 ], "wc_main_review_avg": [ 465.25, 107.39966247619216 ], "wc_review_avg": [ 632.5, 198.87244655808908 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": 0.5940885257860046, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8076922365536551945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "IMDEA Networks Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.imdea.org/", "aff_unique_abbr": "IMDEA", "aff_country_unique_index": "0", "aff_country_unique": "Spain" }, { "id": "rdBuE6EigGl", "title": "The Importance of the Current Input in Sequence Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "The latest advances in sequence modeling are mainly based on deep learning approaches. The current state of the art involves the use of variations of the standard LSTM architecture, combined with several tricks that improve the final prediction rates of the trained neural networks.
However, in some cases, these adaptations might be too closely tuned to the particular problems being addressed. In this article, we show that a very simple idea, adding a direct connection between the input and the output that skips the recurrent module, leads to an increase in prediction accuracy in sequence modeling problems related to natural language processing. Experiments carried out on different problems show that the addition of this kind of connection to a recurrent network always improves the results, regardless of the architecture and training-specific details. When this idea is introduced into the models that lead the field, the resulting networks achieve a new state-of-the-art perplexity in language modeling problems.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/ecc8b84876130a71771440da55732198f6946033.zip", "author": "Christian Oliva;Luis F. Lago-Fernandez", "authorids": "christian.oliva@estudiante.uam.es;~Luis_F._Lago-Fernandez1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";https://scholar.google.es/citations?hl=es", "orcid": ";", "linkedin": ";", "or_profile": "christian.oliva@estudiante.uam.es;~Luis_F._Lago-Fernandez1", "aff": ";Universidad Aut\u00f3noma de Madrid", "aff_domain": ";uam.es", "position": ";Associate Professor", "bibtex": "@misc{\noliva2022the,\ntitle={The Importance of the Current Input in Sequence Modeling},\nauthor={Christian Oliva and Luis F. Lago-Fernandez},\nyear={2022},\nurl={https://openreview.net/forum?id=rdBuE6EigGl}\n}", "github": "", "project": "", "reviewers": "zNdy;m9iZ;6pt8", "site": "https://openreview.net/forum?id=rdBuE6EigGl", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "3;5;5", "correctness": "4;3;3", "technical_novelty": "2;1;3", "empirical_novelty": "2;0;2", "wc_summary_paper": "62;58;87", "wc_summary_review": "29;24;29", "wc_main_review": "101;197;637", "wc_review": "192;279;753", "wc_reply_reviewers": "4;415;0", "wc_reply_authors": "514;939;1045", "reply_reviewers": "1;1;0", "reply_authors": "1;2;2", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 69.0, 12.832251036613439 ], "wc_summary_review_avg": [ 27.333333333333332, 2.3570226039551585 ], "wc_main_review_avg": [ 311.6666666666667, 233.3599984763646 ], "wc_review_avg": [ 408.0, 246.52383251929214 ], "wc_reply_reviewers_avg": [ 139.66666666666666, 194.69691545807524 ], "wc_reply_authors_avg": [ 832.6666666666666, 229.44909868833412 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": -0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1057076845830444376&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Universidad Aut\u00f3noma de Madrid", "aff_unique_dep": "", "aff_unique_url": "https://www.uam.es", "aff_unique_abbr": "UAM", "aff_country_unique_index": "0", "aff_country_unique": "Spain" }, { "id": "reFFte7mA0F", "title": "Conditional Expectation based Value Decomposition for Scalable On-Demand Ride Pooling", "track": "main", "status": "Reject",
"tldr": "", "abstract": "Owing to the benefits for customers (lower prices), drivers (higher revenues), aggregation companies (higher revenues) and the environment (fewer vehicles), on-demand ride pooling (e.g., Uber pool, Grab Share) has become quite popular. The significant computational complexity of matching vehicles to combinations of requests has meant that traditional ride pooling approaches are myopic in that they do not consider the impact of current matches on future value for vehicles/drivers.\n\nRecently, Neural Approximate Dynamic Programming (NeurADP) has employed value decomposition with Approximate Dynamic Programming (ADP) to outperform leading approaches by considering the impact of an individual agent's (vehicle) chosen actions on the future value of that agent. However, in order to ensure scalability and facilitate city-scale ride pooling, NeurADP completely ignores the impact of other agents actions on individual agent/vehicle value. As demonstrated in our experimental results, ignoring the impact of other agents actions on individual value can have a significant impact on the overall performance when there is increased competition among vehicles for demand. Our key contribution is a novel mechanism based on computing conditional expectations through joint conditional probabilities for capturing dependencies on other agents actions without increasing the complexity of training or decision making. We show that our new approach, Conditional Expectation based Value Decomposition (CEVD) outperforms NeurADP by up to 9.76$\\% $in terms of overall requests served, which is a significant improvement on a city wide benchmark taxi dataset. ", "keywords": "Ride-Pool Matching Problem(RMP);Value Decomposition;Approximate Dynamic Programming(ADP);Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/ecfd8776903506ea7698a200d75acfe340af1119.zip", "author": "Avinandan Bose;Pradeep Varakantham", "authorids": "~Avinandan_Bose1;~Pradeep_Varakantham1", "gender": "M;M", "homepage": "https://avinandan22.github.io/;http://www.mysmu.edu.sg/faculty/pradeepv", "dblp": "305/7490;72/759", "google_scholar": "https://scholar.google.com/citations?pli=1;https://scholar.google.com.sg/citations?user=BAdQpFkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Avinandan_Bose1;~Pradeep_Varakantham1", "aff": "IIT Kanpur;Singapore Management University", "aff_domain": "iitk.ac.in;smu.edu.sg", "position": "Undergrad student;Full Professor", "bibtex": "@misc{\nbose2022conditional,\ntitle={Conditional Expectation based Value Decomposition for Scalable On-Demand Ride Pooling},\nauthor={Avinandan Bose and Pradeep Varakantham},\nyear={2022},\nurl={https://openreview.net/forum?id=reFFte7mA0F}\n}", "github": "", "project": "", "reviewers": "9FHL;Siyp;kKMH", "site": "https://openreview.net/forum?id=reFFte7mA0F", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "111;115;40", "wc_summary_review": "71;42;16", "wc_main_review": "450;353;241", "wc_review": "632;510;297", "wc_reply_reviewers": "15;0;0", "wc_reply_authors": "609;744;123", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], 
"wc_summary_paper_avg": [ 88.66666666666667, 34.451253807211266 ], "wc_summary_review_avg": [ 43.0, 22.464787260658994 ], "wc_main_review_avg": [ 348.0, 85.39711158269152 ], "wc_review_avg": [ 479.6666666666667, 138.43490247124177 ], "wc_reply_reviewers_avg": [ 5.0, 7.0710678118654755 ], "wc_reply_authors_avg": [ 492.0, 266.67958302052295 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13426406003588071553&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Indian Institute of Technology Kanpur;Singapore Management University", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitk.ac.in;https://www.smu.edu.sg", "aff_unique_abbr": "IITK;SMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Kanpur;", "aff_country_unique_index": "0;1", "aff_country_unique": "India;Singapore" }, { "id": "rhDaUTtfsqs", "title": "Curriculum Learning: A Regularization Method for Efficient and Stable Billion-Scale GPT Model Pre-Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent works have demonstrated great success in training high-capacity autoregressive language models (GPT, GPT-2, GPT-3) on a huge amount of unlabeled text corpus for text generation. Despite showing great results, autoregressive models are facing a growing training instability issue. Our study on GPT-2 models (117M and 1.5B parameters) show that larger model sizes, sequence lengths, batch sizes, and learning rates would lead to lower training stability and increasing divergence risks. To avoid divergence and achieve better generalization performance, one has to train with smaller batch sizes and learning rates, which leads to worse training efficiency and longer training time. To overcome this stability-efficiency dilemma, we present a study of a curriculum learning-based approach, which helps improves the pre-training convergence speed of autoregressive models. More importantly, we find that curriculum learning, as a regularization method, exerts a gradient variance reduction effect and enables to train autoregressive models with much larger batch sizes and learning rates without training instability, further improving the training speed. Our evaluations demonstrate that curriculum learning enables training GPT-2 models with 8x larger batch size and 4x larger learning rate, whereas the baseline approach struggles with training divergence. To achieve the same validation perplexity targets during pre-training, curriculum learning reduces the required number of tokens and wall clock time by up to 61% and 49%, respectively. 
To achieve the same or better zero-shot WikiText-103/LAMBADA evaluation results at the end of pre-training, curriculum learning reduces the required number of tokens and wall clock time by up to 54% and 70%, respectively.", "keywords": "curriculum learning;natural language processing;language model pre-training", "primary_area": "", "supplementary_material": "", "author": "Conglong Li;Minjia Zhang;Yuxiong He", "authorids": "~Conglong_Li1;~Minjia_Zhang1;~Yuxiong_He1", "gender": ";M;", "homepage": ";https://minjiazhang.github.io/;", "dblp": "158/7995;58/9033;https://dblp.org/pers/hd/h/He:Yuxiong", "google_scholar": ";https://scholar.google.com/citations?hl=en;SB3_eb0AAAAJ", "orcid": ";0000-0002-8165-166X;", "linkedin": ";minjia-zhang-05857226/;", "or_profile": "~Conglong_Li1;~Minjia_Zhang1;~Yuxiong_He1", "aff": "Microsoft;Microsoft ;Microsoft", "aff_domain": "microsoft.com;microsoft.com;microsoft.com", "position": "Researcher;Principle Researcher;Researcher", "bibtex": "@misc{\nli2022curriculum,\ntitle={Curriculum Learning: A Regularization Method for Efficient and Stable Billion-Scale {GPT} Model Pre-Training},\nauthor={Conglong Li and Minjia Zhang and Yuxiong He},\nyear={2022},\nurl={https://openreview.net/forum?id=rhDaUTtfsqs}\n}", "github": "", "project": "", "reviewers": "D2sE;W3V1;qUd8;LtMz", "site": "https://openreview.net/forum?id=rhDaUTtfsqs", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "3;5;4;3", "correctness": "4;4;2;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "53;48;63;112", "wc_summary_review": "63;29;40;13", "wc_main_review": "543;181;196;569", "wc_review": "659;258;299;694", "wc_reply_reviewers": "322;50;308;105", "wc_reply_authors": "1940;833;1316;1282", "reply_reviewers": "1;1;1;1", "reply_authors": "4;2;2;2", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 69.0, 25.406692031825003 ], "wc_summary_review_avg": [ 36.25, 18.18481509391833 ], "wc_main_review_avg": [ 372.25, 184.0562074476164 ], "wc_review_avg": [ 477.5, 199.91060502134448 ], "wc_reply_reviewers_avg": [ 196.25, 120.4333321800904 ], "wc_reply_authors_avg": [ 1342.75, 394.00467954073844 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18408936429804558057&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Corporation", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Enabling Arbitrary Translation Objectives with Adaptive Tree Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7206", "id": "rhOiUS8KQM9", "poster": "", "openreview": "https://openreview.net/forum?id=rhOiUS8KQM9", "slides": "https://iclr.cc/virtual/2022/poster/7206", "video": "https://iclr.cc/virtual/2022/poster/7206", "author_site": "Wang Ling, Wojciech Stokowiec, Domenic Donato, Chris Dyer, Lei 
Yu, Laurent Sartran, Austin Matthews", "tldr": "", "abstract": "We introduce an adaptive tree search algorithm, a deterministic variant of Monte Carlo tree search, that can find high-scoring outputs under translation models that make no assumptions about the form or structure of the search objective. This algorithm enables the exploration of new kinds of models that are unencumbered by constraints imposed to make decoding tractable, such as autoregressivity or conditional independence assumptions. When applied to autoregressive models, our algorithm has different biases than beam search has, which enables a new analysis of the role of decoding bias in autoregressive models. Empirically, we show that our adaptive tree search algorithm finds outputs with substantially better model scores compared to beam search in autoregressive models, and compared to reranking techniques in models whose scores do not decompose additively with respect to the words in the output. We also characterise the correlation of several translation model objectives with BLEU. We find that while some standard models are poorly calibrated and benefit from the beam search bias, other, often more robust, models (autoregressive models tuned to maximize expected automatic metric scores, the noisy channel model and a newly proposed objective) benefit from increasing amounts of search using our proposed decoder, whereas the beam search bias limits the improvements obtained from such objectives. Thus, we argue that as models improve, the improvements may be masked by over-reliance on beam search or reranking based methods.", "keywords": "Machine Translation;Decoding;MCTS;Beam Search", "primary_area": "", "supplementary_material": "", "author": "Wang Ling;Wojciech Stokowiec;Domenic Donato;Chris Dyer;Lei Yu;Laurent Sartran;Austin Matthews", "authorids": "~Wang_Ling1;wstokowiec@google.com;~Domenic_Donato1;~Chris_Dyer1;~Lei_Yu4;~Laurent_Sartran1;armatthe@gmail.com", "gender": "M;;M;M;F;;", "homepage": ";;;http://www.cs.cmu.edu/~cdyer/;;;", "dblp": "91/7651;;283/4175;41/6895;https://dblp.uni-trier.de/pid/01/2775-0008;;", "google_scholar": "https://scholar.google.se/citations?user=gl0PhvEAAAAJ;;VQipQCgAAAAJ;W2DsnAkAAAAJ;https://scholar.google.co.uk/citations?user=gX5JBc4AAAAJ;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Wang_Ling1;wstokowiec@google.com;~Domenic_Donato1;~Chris_Dyer1;~Lei_Yu4;~Laurent_Sartran1;armatthe@gmail.com", "aff": ";;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;", "aff_domain": ";;deepmind.com;google.com;deepmind.com;deepmind.com;", "position": ";;Researcher;Research scientist;Research Scientist;Research Engineer;", "bibtex": "@inproceedings{\nling2022enabling,\ntitle={Enabling Arbitrary Translation Objectives with Adaptive Tree Search},\nauthor={Wang Ling and Wojciech Stokowiec and Domenic Donato and Chris Dyer and Lei Yu and Laurent Sartran and Austin Matthews},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rhOiUS8KQM9}\n}", "github": "", "project": "", "reviewers": "26JG;i2pz;XcaS;9nVZ", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "67;178;106;81", "wc_summary_review": "53;44;55;26", "wc_main_review": "613;504;198;198", "wc_review": "733;726;359;305", "wc_reply_reviewers": "0;444;30;0", "wc_reply_authors": "986;1611;865;906", "reply_reviewers":
"0;1;1;0", "reply_authors": "4;3;2;4", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 108.0, 42.7609635064506 ], "wc_summary_review_avg": [ 44.5, 11.4564392373896 ], "wc_main_review_avg": [ 378.25, 184.3235945287526 ], "wc_review_avg": [ 530.75, 199.6802130908318 ], "wc_reply_reviewers_avg": [ 118.5, 188.32617980514553 ], "wc_reply_authors_avg": [ 1092.0, 302.7878795460611 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.25, 0.82915619758885 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6920257071182753282&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=rhOiUS8KQM9", "email": ";;deepmind.com;google.com;deepmind.com;deepmind.com;", "author_num": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "rl8jF3GENq", "title": "Wavelet-Packet Powered Deepfake Image Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "As neural networks become able to generate realistic artificial images, they have the potential to improve movies, music, video games and make the internet an even more creative and inspiring place.\nYet, at the same time, the latest technology potentially enables new digital ways to lie. In response, the need for a diverse and reliable method toolbox arises to identify artificial images and other content.\nPrevious work primarily relies on pixel-space CNN or the Fourier transform. To the best of our knowledge, synthesized fake image analysis and detection methods based on a multi-scale wavelet representation,\nwhich is localized in both space and frequency, have been absent thus far. \nThis paper proposes to learn a model for the detection of synthetic images based on the wavelet-packet representation of natural and GAN-generated images. We evaluate our method on FFHQ, CelebA, and LSUN source identification problems and find improved or competitive performance. 
Our forensic classifier has a small network size and can be learned efficiently.\nFurthermore, a comparison of the wavelet coefficients from these two sources of images allows an interpretation and identifies significant differences.", "keywords": "signal processing;wavelets;wavelet packets;deepfake detection", "primary_area": "", "supplementary_material": "/attachment/e82140eacd5d0729e8f8a5b971f2aa185d5d5309.zip", "author": "Moritz Wolter;Felix Blanke;Charles Tapley Hoyt;Jochen Garcke", "authorids": "~Moritz_Wolter1;~Felix_Blanke1;~Charles_Tapley_Hoyt1;~Jochen_Garcke1", "gender": "M;M;M;", "homepage": "https://www.wolter.tech/;https://github.com/felixblanke;https://cthoyt.com;", "dblp": "222/2629;;199/2168;89/6182", "google_scholar": "https://scholar.google.de/citations?user=OLvO62sAAAAJ;;PjrpzUIAAAAJ;BGZK0XMAAAAJ", "orcid": "0000-0002-1511-7768;;0000-0003-4423-4370;0000-0002-8334-3695", "linkedin": ";;cthoyt/;", "or_profile": "~Moritz_Wolter1;~Felix_Blanke1;~Charles_Tapley_Hoyt1;~Jochen_Garcke1", "aff": "Fraunhofer Gesellschaft;University of Bonn;Harvard Medical School;Fraunhofer SCAI", "aff_domain": "fraunhofer.de;uni-bonn.de;hms.harvard.edu;scai.fraunhofer.de", "position": "Researcher;MS student;Postdoc;Full Professor", "bibtex": "@misc{\nwolter2022waveletpacket,\ntitle={Wavelet-Packet Powered Deepfake Image Detection},\nauthor={Moritz Wolter and Felix Blanke and Charles Tapley Hoyt and Jochen Garcke},\nyear={2022},\nurl={https://openreview.net/forum?id=rl8jF3GENq}\n}", "github": "", "project": "", "reviewers": "feXB;sHXs;SoZC;zoMp;qN3N", "site": "https://openreview.net/forum?id=rl8jF3GENq", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;5;3;5;4", "correctness": "3;4;3;4;3", "technical_novelty": "2;1;2;3;3", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "37;67;46;49;77", "wc_summary_review": "62;32;54;32;28", "wc_main_review": "730;166;201;267;317", "wc_review": "829;265;301;348;422", "wc_reply_reviewers": "96;0;36;0;0", "wc_reply_authors": "629;408;374;236;345", "reply_reviewers": "1;0;1;0;0", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 55.2, 14.620533505997654 ], "wc_summary_review_avg": [ 41.6, 13.705473359209451 ], "wc_main_review_avg": [ 336.2, 203.70704455172876 ], "wc_review_avg": [ 433.0, 204.84628383253624 ], "wc_reply_reviewers_avg": [ 26.4, 37.48919844435194 ], "wc_reply_authors_avg": [ 398.4, 128.91020130307763 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3273268353539886, "corr_recommendation_correctness": -0.16666666666666663, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8543016414798329530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Fraunhofer Gesellschaft;University of Bonn;Harvard University;Fraunhofer Society", "aff_unique_dep": ";;Medical School;SCAI (Supercomputing and Data Analysis)", "aff_unique_url": "https://www.fraunhofer.de;https://www.uni-bonn.de/;https://hms.harvard.edu;https://www.scai.fraunhofer.de", "aff_unique_abbr": "Fraunhofer;UBonn;HMS;Fraunhofer SCAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Boston", 
"aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Graph-Enhanced Exploration for Goal-oriented Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5917", "id": "rlYiXFdSy70", "poster": "", "openreview": "https://openreview.net/forum?id=rlYiXFdSy70", "slides": "https://iclr.cc/virtual/2022/poster/5917", "video": "https://iclr.cc/virtual/2022/poster/5917", "author_site": "Jiarui Jin, Sijin Zhou, Weinan Zhang, Tong He, Yong Yu, Rasool Fakoor", "tldr": "", "abstract": "Goal-oriented Reinforcement Learning (GoRL) is a promising approach for scaling up RL techniques on sparse reward environments requiring long horizon planning. Recent works attempt to build suitable abstraction graph of the environment and enhance GoRL with classical graphical methods such as shortest path searching; however, these approaches mainly focus on either graph construction or agent exploitation, but leave the exploration lack of study. This paper proposes Graph-enhanced GoRL (G2RL), a new GoRL framework for effective exploration and efficient training based on the state-transition graph. We first introduce the optimal goals for exploration on the graph and then use them as supervised signals to train the goal generator in G2RL in a hindsight manner. Furthermore, we define relevant trajectories of a state based on its graph neighborhood and show that giving high priority to these trajectories would lead to an efficient policy learning. In addition to the theoretical results regarding optimal goal generation, our empirical results on standard discrete and continuous control benchmarks show that leveraging the state-transition graph is beneficial for GoRL to learn an effective and informative exploration strategy and outperform the state-of-the-art methods.", "keywords": "Deep Reinforcement Learning;Goal-oriented Reinforcement Learning;Graph Structure;Exploration", "primary_area": "", "supplementary_material": "", "author": "Jiarui Jin;Sijin Zhou;Weinan Zhang;Tong He;Yong Yu;Rasool Fakoor", "authorids": "~Jiarui_Jin1;~Sijin_Zhou1;~Weinan_Zhang1;~Tong_He5;~Yong_Yu1;~Rasool_Fakoor1", "gender": "M;F;M;M;;M", "homepage": "https://jinjiarui.github.io/;http://apex.sjtu.edu.cn/;http://wnzhang.net;https://hetong007.github.io/;https://apex.sjtu.edu.cn/members/yyu;http://rasoolfa.github.io", "dblp": "241/9563;;28/10261-1;02/1554-2;43/5685.html;123/2447", "google_scholar": "unCPHQEAAAAJ;;Qzss0GEAAAAJ;hV5D8GYAAAAJ;;nVsOPtQAAAAJ", "orcid": "0000-0001-6458-1586;;0000-0002-0127-2425;;0000-0003-4457-2820;", "linkedin": "jiarui-jerry-jin-ba4a84176/;;;;;rasool-fakoor-695b5845/", "or_profile": "~Jiarui_Jin1;~Sijin_Zhou1;~Weinan_Zhang1;~Tong_He5;~Yong_Yu1;~Rasool_Fakoor1", "aff": "Shanghai Jiaotong University;;Shanghai Jiaotong University;Amazon;Shanghai Jiaotong University;Amazon Web Services", "aff_domain": "sjtu.edu.cn;;sjtu.edu.cn;amazon.com;sjtu.edu.cn;amazon.com", "position": "PhD student;;Associate Professor;Researcher;Full Professor;Researcher", "bibtex": "@misc{\njin2022graphenhanced,\ntitle={Graph-Enhanced Exploration for Goal-oriented Reinforcement Learning},\nauthor={Jiarui Jin and Sijin Zhou and Weinan Zhang and Tong He and Yong Yu and Rasool Fakoor},\nyear={2022},\nurl={https://openreview.net/forum?id=rlYiXFdSy70}\n}", "github": "", "project": "", "reviewers": "EMRL;Sjwy;Vww5;6obU", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;2;3", "correctness": "3;3;4;3", "technical_novelty": "3;3;2;4", 
"empirical_novelty": "3;3;3;3", "wc_summary_paper": "125;32;147;113", "wc_summary_review": "59;104;42;91", "wc_main_review": "688;140;332;655", "wc_review": "872;276;521;859", "wc_reply_reviewers": "271;0;0;78", "wc_reply_authors": "2499;584;743;1086", "reply_reviewers": "1;0;0;1", "reply_authors": "4;2;1;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 104.25, 43.45903243285566 ], "wc_summary_review_avg": [ 74.0, 24.68805379125702 ], "wc_main_review_avg": [ 453.75, 228.38385998139185 ], "wc_review_avg": [ 632.0, 249.09134870565055 ], "wc_reply_reviewers_avg": [ 87.25, 110.76410745363319 ], "wc_reply_authors_avg": [ 1228.0, 755.9044251755641 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10054125936349127942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=rlYiXFdSy70", "email": "sjtu.edu.cn;;sjtu.edu.cn;amazon.com;sjtu.edu.cn;amazon.com", "author_num": 6, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.amazon.com", "aff_unique_abbr": "SJTU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "rmMOupN1Sqp", "title": "Don't Take It Literally: An Edit-Invariant Sequence Loss for Text Generation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural text generation models are typically trained by maximizing log-likelihood with the sequence cross entropy loss, which encourages an \\emph{exact} token-by-token match between a target sequence with a generated sequence. Such training objective is sub-optimal when the target sequence is not perfect, e.g., when the target sequence is corrupted with noises, or when only weak sequence supervision is available. To address this challenge, we propose a novel Edit-Invariant Sequence Loss (EISL), which computes the matching loss of a target $n$-gram with all $n$-grams in the generated sequence. Drawing inspirations from the classical convolutional networks (ConvNets) which capture shift-invariance in image modeling, EISL is designed to be robust to the shift of $n$-grams to tolerate various noises and edits in the target sequences. Moreover, the EISL computation is essentially a convolution operation with target $n$-grams as kernels, which is easy to implement and efficient to compute with existing libraries. To demonstrate the effectiveness of EISL, we conduct experiments on a wide range of tasks, including machine translation with noisy target sequences, unsupervised text style transfer with only weak training signals, and non-autoregressive generation with non-predefined generation order. Experimental results show our method significantly outperforms the common cross-entropy loss and other strong baselines on all the tasks. 
", "keywords": "text generation;edit invariance;natural language processing;text stytle transfer;learning with noise", "primary_area": "", "supplementary_material": "/attachment/69dd0d99051a0e3bab7631708235d6972ec21612.zip", "author": "Guangyi Liu;Zichao Yang;Tianhua Tao;Xiaodan Liang;Zhen Li;Bowen Zhou;Shuguang Cui;Zhiting Hu", "authorids": "~Guangyi_Liu1;~Zichao_Yang1;~Tianhua_Tao1;~Xiaodan_Liang2;~Zhen_Li6;~Bowen_Zhou4;~Shuguang_Cui1;~Zhiting_Hu3", "gender": ";M;M;F;;;M;M", "homepage": ";;http://www.taotianhua.com/;https://www.sysu-hcp.net/;;;https://sse.cuhk.edu.cn/en/content/1415;http://zhiting.ucsd.edu", "dblp": ";07/8707;296/1990.html;;;;48/4914;134/4031", "google_scholar": ";https://scholar.google.co.uk/citations?user=siCYLcUAAAAJ;;voxznZAAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=1o_qvR0AAAAJ;N7_xhHoAAAAJ", "orcid": ";;;;;;0000-0003-2608-775X;", "linkedin": ";;;;;;;", "or_profile": "~Guangyi_Liu1;~Zichao_Yang1;~Tianhua_Tao1;~Xiaodan_Liang2;~Zhen_Li6;~Bowen_Zhou4;~Shuguang_Cui1;~Zhiting_Hu3", "aff": ";;Tsinghua University;SUN YAT-SEN UNIVERSITY;;JD.com;Shenzhen Research Institute of Big Data;Amazon", "aff_domain": ";;tsinghua.edu.cn;sysu.edu.cn;;jd.com;sribd.cn;amazon.com", "position": ";;Undergrad student;Associate Professor;;Vice President;Vice Executive Director;Researcher", "bibtex": "@misc{\nliu2022dont,\ntitle={Don't Take It Literally: An Edit-Invariant Sequence Loss for Text Generation},\nauthor={Guangyi Liu and Zichao Yang and Tianhua Tao and Xiaodan Liang and Zhen Li and Bowen Zhou and Shuguang Cui and Zhiting Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=rmMOupN1Sqp}\n}", "github": "", "project": "", "reviewers": "2bw3;5Pst;ASST;iuYx", "site": "https://openreview.net/forum?id=rmMOupN1Sqp", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;3;4", "correctness": "2;3;3;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "3;4;2;3", "wc_summary_paper": "117;81;80;72", "wc_summary_review": "54;68;30;40", "wc_main_review": "386;215;149;188", "wc_review": "557;364;259;300", "wc_reply_reviewers": "183;0;0;0", "wc_reply_authors": "675;500;356;945", "reply_reviewers": "2;0;0;0", "reply_authors": "2;1;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 87.5, 17.38533865071371 ], "wc_summary_review_avg": [ 48.0, 14.352700094407323 ], "wc_main_review_avg": [ 234.5, 90.56075308874148 ], "wc_review_avg": [ 370.0, 114.26504277336967 ], "wc_reply_reviewers_avg": [ 45.75, 79.24132444627614 ], "wc_reply_authors_avg": [ 619.0, 219.51195867195938 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Tsinghua University;Sun Yat-sen University;JD.com;Shenzhen Research Institute of Big Data;Amazon", "aff_unique_dep": ";;;;Amazon.com, Inc.", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.sysu.edu.cn;https://www.jd.com;http://www.sribd.cn;https://www.amazon.com", "aff_unique_abbr": "THU;SYSU;JD;;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "roaUjIvWD8j", "title": "Fingerprints of Super Resolution Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Several recent studies have demonstrated that deep-learning based image generation models, such as GANs, can be uniquely identified, and possibly even reverse engineered, by the fingerprints they leave on their output images. We extend this research to a previously unstudied type of image generator: single image super-resolution (SISR) networks. Compared to previously studied models, SISR networks are a uniquely challenging class of image generation model from which to extract and analyze fingerprints, as they can often generate images that closely match the corresponding ground truth and thus likely leave little flexibility to embed signatures. We take SISR models as examples to investigate if the findings from the previous work on fingerprints of GAN-based networks are valid for general image generation models. In this paper, we present an analysis of the capabilities and limitations of model fingerprinting in this domain. We show that SISR networks with a high upscaling factor or trained using adversarial loss leave highly distinctive fingerprints, and show promising results for reverse engineering some hyperparameters of SISR networks, including scale and loss function.", "keywords": "super resolution;model attribution;model parsing;GAN;image forensics", "primary_area": "", "supplementary_material": "", "author": "Jeremy Vonderfecht;Feng Liu", "authorids": "~Jeremy_Vonderfecht1;~Feng_Liu6", "gender": ";", "homepage": "https://github.com/JeremyIV;", "dblp": ";77/1318-15", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Jeremy_Vonderfecht1;~Feng_Liu6", "aff": "Portland State University;Portland State University", "aff_domain": "pdx.edu;pdx.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nvonderfecht2022fingerprints,\ntitle={Fingerprints of Super Resolution Networks},\nauthor={Jeremy Vonderfecht and Feng Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=roaUjIvWD8j}\n}", "github": "", "project": "", "reviewers": "VGrY;wP88;qqpG;kxmx", "site": "https://openreview.net/forum?id=roaUjIvWD8j", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;4;5;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "121;65;50;79", "wc_summary_review": "45;24;4;47", "wc_main_review": "485;240;425;54", "wc_review": "651;329;479;180", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.75, 26.461056290329758 ], "wc_summary_review_avg": [ 30.0, 17.507141400011598 ], "wc_main_review_avg": [ 301.0, 168.79129124454258 ], "wc_review_avg": [ 409.75, 174.8590503805851 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=5999605161727251500&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Portland State University", "aff_unique_dep": "", "aff_unique_url": "https://www.pdx.edu", "aff_unique_abbr": "PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "roaZrQMGsd6", "title": "CARD: Certifiably Robust Machine Learning Pipeline via Domain Knowledge Integration", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The advent of ubiquitous machine learning (ML) has led to exciting revolution in computing today. However, recent studies have shown that ML, especially deep neural networks (DNNs), are vulnerable to adversarial examples, which are able to mislead DNNs with carefully crafted stealthy perturbations. So far, many defense approaches have been proposed against such adversarial attacks, both empirically and theoretically. Though effective under certain conditions, existing empirical defenses are usually found vulnerable against new attacks; existing certified defenses are only able to certify robustness against limited perturbation radius. As current pure data-driven defenses have reached a bottleneck towards certifiably robust ML, in this paper we propose a certifiably robust ML pipeline CARD, aiming to integrate exogenous information, such as domain knowledge, as logical rules with ML models to improve the certified robustness. Intuitively, domain knowledge (e.g., cat belongs to the animal category) will prevent attacks that violate these knowledge rules, and it is also challenging to construct adaptive attacks satisfying such pre-defined logical relationships. In particular, we express the domain knowledge as first-order logic rules and embed these logic rules in a probabilistic graphical model. We then prove that such a probabilistic graphical model can be mapped to a 1-layer NN for efficient training. 
We conduct extensive experiments on several high-dimensional datasets and show that our proposed CARD achieves state-of-the-art certified robustness.", "keywords": "certified robustness;knowledge rule integration", "primary_area": "", "supplementary_material": "/attachment/b6bfdcb1a80a392b0852945db5489b2a53370d94.zip", "author": "Jiawei Zhang;Linyi Li;Bo Li", "authorids": "~Jiawei_Zhang9;~Linyi_Li1;~Bo_Li19", "gender": "M;M;F", "homepage": "https://github.com/javyduck;http://linyil.com;http://boli.cs.illinois.edu/", "dblp": ";99/4340-1.html;50/3402-26", "google_scholar": "vCY9ZRcAAAAJ;-b0sk-YAAAAJ;K8vJkTcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jiawei_Zhang9;~Linyi_Li1;~Bo_Li19", "aff": "University of Illinois, Urbana Champaign;Microsoft Research;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;microsoft.com;illinois.edu", "position": "MS student;Research Intern;Assistant Professor", "bibtex": "@misc{\nzhang2022card,\ntitle={{CARD}: Certifiably Robust Machine Learning Pipeline via Domain Knowledge Integration},\nauthor={Jiawei Zhang and Linyi Li and Bo Li},\nyear={2022},\nurl={https://openreview.net/forum?id=roaZrQMGsd6}\n}", "github": "", "project": "", "reviewers": "kp9G;XDWJ;jLZo;N949", "site": "https://openreview.net/forum?id=roaZrQMGsd6", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "83;61;87;99", "wc_summary_review": "102;39;42;32", "wc_main_review": "644;367;507;234", "wc_review": "829;467;636;365", "wc_reply_reviewers": "0;233;0;0", "wc_reply_authors": "1234;742;1140;804", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 82.5, 13.738631664034086 ], "wc_summary_review_avg": [ 53.75, 28.092481200492063 ], "wc_main_review_avg": [ 438.0, 153.1780010314797 ], "wc_review_avg": [ 574.25, 176.06728117398757 ], "wc_reply_reviewers_avg": [ 58.25, 100.8919595408871 ], "wc_reply_authors_avg": [ 980.0, 210.79373804740976 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Jvbo_uRaeY4J:scholar.google.com/&scioq=CARD:+Certifiably+Robust+Machine+Learning+Pipeline+via+Domain+Knowledge+Integration&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UIUC;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "roxWnqcguNq", "title": "Constituency Tree Representation for Argument Unit Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "The extraction of arguments from sentences is usually studied by considering only the neighbourhood dependencies of words. 
Such a representation does not rely on the syntactic structure of the sentence and can lead to poor results, especially in languages where grammatical categories are scattered across the sentence. In this paper, we investigate the advantages of using a constituency tree representation of sentences for argument discourse unit (ADU) prediction. We demonstrate that the constituency structure is more powerful than simple linear dependencies between neighbouring words in the sentence. Our work is organised as follows: first, we compare different maximum depths allowed for our constituency trees; this first step allows us to choose an optimal maximum depth. Secondly, we combine this structure with graph neural networks, which have proven very successful on graph-structured data. Finally, we evaluate the benefits of adding a conditional random field to model global dependencies between labels, given local dependency rules. We improve the current best models for argument unit recognition at the token level and also present an explainability method to evaluate the suitability of our model architecture.", "keywords": "transformer;attention;bert;graph attention network;constituency parsing;deep learning", "primary_area": "", "supplementary_material": "/attachment/733de2b2447d4eba718616f0c3687b8f0966280c.zip", "author": "Samuel Guilluy;Florian M\u00e9hats;Billal Chouli", "authorids": "~Samuel_Guilluy1;~Florian_M\u00e9hats1;billal.chouli@neurochaintech.io", "gender": "M;M;", "homepage": ";https://perso.univ-rennes1.fr/florian.mehats/;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "samuel-guilluy/;;", "or_profile": "~Samuel_Guilluy1;~Florian_M\u00e9hats1;billal.chouli@neurochaintech.io", "aff": "Universit\u00e9 Rennes 1 - IRMAR;Rennes 1 University and IRMAR;", "aff_domain": "univ-rennes1.fr;univ-rennes1.fr;", "position": "PhD student;Professor on leave;", "bibtex": "@misc{\nguilluy2022constituency,\ntitle={Constituency Tree Representation for Argument Unit Recognition},\nauthor={Samuel Guilluy and Florian M{\\'e}hats and Billal Chouli},\nyear={2022},\nurl={https://openreview.net/forum?id=roxWnqcguNq}\n}", "github": "", "project": "", "reviewers": "cyhs;iNq3;ExWX;sFFn", "site": "https://openreview.net/forum?id=roxWnqcguNq", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;4", "correctness": "2;3;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "99;132;100;41", "wc_summary_review": "87;96;37;57", "wc_main_review": "370;296;475;196", "wc_review": "556;524;612;294", "wc_reply_reviewers": "70;122;0;0", "wc_reply_authors": "84;318;314;271", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 93.0, 32.8252951243397 ], "wc_summary_review_avg": [ 69.25, 23.562417108607512 ], "wc_main_review_avg": [ 334.25, 102.05972516129955 ], "wc_review_avg": [ 496.5, 121.08158406628152 ], "wc_reply_reviewers_avg": [ 48.0, 51.40038910358559 ], "wc_reply_authors_avg": [ 246.75, 95.75326365195079 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:w7mwB9yKMxgJ:scholar.google.com/&scioq=Constituency+Tree+Representation+for+Argument+Unit+Recognition&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Universit\u00e9 Rennes 1;Rennes 1 University", "aff_unique_dep": "Institut de Recherche Math\u00e9matique de Rennes (IRMAR);", "aff_unique_url": "https://www.univ-rennes1.fr;https://www.univ-rennes1.fr", "aff_unique_abbr": "UR1;UR1", "aff_campus_unique_index": "0", "aff_campus_unique": "Rennes;", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Proof Artifact Co-Training for Theorem Proving with Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6391", "id": "rpxJc9j04U", "poster": "", "openreview": "https://openreview.net/forum?id=rpxJc9j04U", "slides": "https://iclr.cc/virtual/2022/poster/6391", "video": "https://iclr.cc/virtual/2022/poster/6391", "author_site": "Jesse Han, Jason Rute, Yuhuai Wu, Edward Ayers, Stanislas Polu", "tldr": "", "abstract": "Labeled data for imitation learning of theorem proving in large libraries of formalized mathematics is scarce as such libraries require years of concentrated effort by human specialists to be built. This is particularly challenging when applying large Transformer language models to tactic prediction, because the scaling of performance with respect to model size is quickly disrupted in the data-scarce, easily-overfitted regime. We propose PACT (Proof Artifact Co-Training), a general methodology for extracting abundant self-supervised data from kernel-level proof terms for joint training alongside the usual tactic prediction objective. We apply this methodology to Lean,an interactive proof assistant which hosts some of the most sophisticated formalized mathematics to date. 
We instrument Lean with a neural theorem prover driven by a Transformer language model and show that PACT improves theorem proving success rate on a held-out suite of test theorems from 32% to 48%.", "keywords": "self-supervised learning;mathematics;reasoning;theorem proving;language modeling", "primary_area": "", "supplementary_material": "", "author": "Jesse Michael Han;Jason Rute;Yuhuai Wu;Edward Ayers;Stanislas Polu", "authorids": "~Jesse_Michael_Han1;~Jason_Rute1;~Yuhuai_Wu1;e.w.ayers@maths.cam.ac.uk;~Stanislas_Polu1", "gender": "M;M;M;;M", "homepage": "https://jesse-michael-han.github.io;https://jasonrute.github.io;http://www.cs.toronto.edu/~ywu/;;", "dblp": ";141/9655;;;", "google_scholar": ";Z-oVfDMAAAAJ;https://scholar.google.ca/citations?user=bOQGfFIAAAAJ;;", "orcid": ";0000-0002-6247-1882;;;", "linkedin": ";jason-rute;;;", "or_profile": "~Jesse_Michael_Han1;~Jason_Rute1;~Yuhuai_Wu1;e.w.ayers@maths.cam.ac.uk;~Stanislas_Polu1", "aff": "University of Pittsburgh;IBM Research;Stanford University;;OpenAI", "aff_domain": "pitt.edu;research.ibm.com;stanford.edu;;openai.com", "position": "PhD student;Postdoc;Postdoc;;Research Engineer", "bibtex": "@inproceedings{\nhan2022proof,\ntitle={Proof Artifact Co-Training for Theorem Proving with Language Models},\nauthor={Jesse Michael Han and Jason Rute and Yuhuai Wu and Edward Ayers and Stanislas Polu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rpxJc9j04U}\n}", "github": "", "project": "", "reviewers": "TVGW;K4ag;s8q7;Hmxh", "pdf_size": 0, "recommendation": "5;5;8;8", "confidence": "3;4;5;4", "correctness": "4;2;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "103;28;61;59", "wc_summary_review": "34;17;41;61", "wc_main_review": "241;1124;85;483", "wc_review": "378;1169;187;603", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "400;716;94;407", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 1.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 62.75, 26.668098919870534 ], "wc_summary_review_avg": [ 38.25, 15.769828787910159 ], "wc_main_review_avg": [ 483.25, 396.1845371793301 ], "wc_review_avg": [ 584.25, 368.31737333446546 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 404.25, 219.9254134928476 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.7071067811865476, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 130, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1388829292669107103&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=rpxJc9j04U", "email": "pitt.edu;research.ibm.com;stanford.edu;;openai.com", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Pittsburgh;IBM;Stanford University;OpenAI", "aff_unique_dep": ";IBM Research;;", "aff_unique_url": "https://www.pitt.edu;https://www.ibm.com/research;https://www.stanford.edu;https://openai.com", "aff_unique_abbr": "Pitt;IBM;Stanford;OpenAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "rq1-7_lwisw", "title": 
"Beyond Object Recognition: A New Benchmark towards Object Concept Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding objects is a central building block of artificial intelligence, especially for embodied AI. Even though object recognition excels with deep learning, current machines still struggle to learn higher-level knowledge, e.g., what attributes does an object have, what can we do with an object. In this work, we propose a challenging Object Concept Learning (OCL) task to push the envelope of object understanding. It requires machines to reason out object affordances and simultaneously give the reason: what attributes make an object possesses these affordances. To support OCL, we build a densely annotated knowledge base including extensive labels for three levels of object concept: categories, attributes, and affordances, together with their causal relations. By analyzing the causal structure of OCL, we present a strong baseline, Object Concept Reasoning Network (OCRN). It leverages causal intervention and concept instantiation to infer the three levels following their causal relations. In extensive experiments, OCRN effectively infers the object knowledge while follows the causalities well. Our data and code will be publicly available.", "keywords": "Object Concept Learning;Attributes;Affordance;Causal Inference", "primary_area": "", "supplementary_material": "/attachment/4867a101b5a6cd91f43846f904be0fe787939b3f.zip", "author": "Yong-Lu Li;Yue Xu;Xinyu Xu;Xiaohan Mao;Yuan Yao;Siqi Liu;Cewu Lu", "authorids": "~Yong-Lu_Li1;~Yue_Xu4;~Xinyu_Xu2;~Xiaohan_Mao1;~Yuan_Yao11;~Siqi_Liu4;~Cewu_Lu3", "gender": "M;M;M;M;M;F;M", "homepage": "https://dirtyharrylyl.github.io/;https://silicx.github.io;https://xuxinyu.website/;;https://github.com/yyNoBug;https://mayuoshima.github.io/;https://www.mvig.org/", "dblp": "198/9345;;59/6858;262/3946;;;", "google_scholar": "https://scholar.google.com.hk/citations?user=UExAaVgAAAAJ;N03Uc1oAAAAJ;https://scholar.google.com/citations?hl=zh-CN;-zT1NKwAAAAJ;Aj6mYj4AAAAJ;;https://scholar.google.com.tw/citations?user=QZVQEWAAAAAJ", "orcid": "0000-0003-0478-0692;0000-0001-7489-7269;;;0000-0002-5789-3554;0000-0001-9738-9136;", "linkedin": "%E6%B0%B8%E9%9C%B2-%E6%9D%8E-991b99139/;;;;;haruna-oshima-16a096333/;", "or_profile": "~Yong-Lu_Li1;~Yue_Xu4;~Xinyu_Xu2;~Xiaohan_Mao1;~Yuan_Yao11;~Siqi_Liu4;~Cewu_Lu3", "aff": "Hong Kong University of Science and Technology;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "ust.hk;sjtu.edu;sjtu.edu;sjtu.edu.cn;sjtu.edu;sjtu.edu;sjtu.edu.cn", "position": "Postdoc;MS student;Undergrad student;Undergrad student;Undergrad student;PhD student;Full Professor", "bibtex": "@misc{\nli2022beyond,\ntitle={Beyond Object Recognition: A New Benchmark towards Object Concept Learning},\nauthor={Yong-Lu Li and Yue Xu and Xinyu Xu and Xiaohan Mao and Yuan Yao and Siqi Liu and Cewu Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=rq1-7_lwisw}\n}", "github": "", "project": "", "reviewers": "xQXf;mgiJ;mY2o;XzUp", "site": "https://openreview.net/forum?id=rq1-7_lwisw", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "85;44;108;94", "wc_summary_review": "18;30;34;76", "wc_main_review": "289;348;516;422", "wc_review": "392;422;658;592", 
"wc_reply_reviewers": "0;360;0;0", "wc_reply_authors": "876;1903;1472;1276", "reply_reviewers": "0;1;0;0", "reply_authors": "2;4;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 82.75, 23.826193569263218 ], "wc_summary_review_avg": [ 39.5, 21.880356487041066 ], "wc_main_review_avg": [ 393.75, 84.8657027308441 ], "wc_review_avg": [ 516.0, 111.97321108193691 ], "wc_reply_reviewers_avg": [ 90.0, 155.88457268119896 ], "wc_reply_authors_avg": [ 1381.75, 369.73394150388737 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11599236881770341656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.sjtu.edu.cn", "aff_unique_abbr": "HKUST;SJTU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "rqcLsG8Kme9", "title": "rQdia: Regularizing Q-Value Distributions With Image Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "rQdia (pronounced \u201cArcadia\u201d) regularizes Q-value distributions with augmented images in pixel-based deep reinforcement learning. With a simple auxiliary loss, that equalizes these distributions via MSE, rQdia boosts DrQ and SAC on 9/12 and 10/12 tasks respectively in the MuJoCo Continuous Control Suite from pixels, and Data-Efficient Rainbow on 18/26 Atari Arcade environments. Gains are measured in both sample efficiency and longer-term training. Moreover, the addition of rQdia finally propels model-free continuous control from pixels over the state encoding baseline. 
Additional results, namely more random seeds, are pending.", "keywords": "deep reinforcement learning;regularization;q-value distributions;invariance;image augmentation;continuous control;Atari", "primary_area": "", "supplementary_material": "/attachment/b83a7cd5ff837c9119c319e219da9b8830ff2cbd.zip", "author": "Samuel Lerman;Jing Bi;Chenliang Xu", "authorids": "~Samuel_Lerman1;~Jing_Bi1;~Chenliang_Xu1", "gender": "M;;M", "homepage": ";;https://www.cs.rochester.edu/~cxu22/", "dblp": ";;117/4770", "google_scholar": "3xy30K0AAAAJ;;https://scholar.google.com.tw/citations?user=54HfyDIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Samuel_Lerman1;~Jing_Bi1;~Chenliang_Xu1", "aff": "University of Rochester;;University of Rochester", "aff_domain": "rochester.edu;;rochester.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@misc{\nlerman2022rqdia,\ntitle={rQdia: Regularizing Q-Value Distributions With Image Augmentation},\nauthor={Samuel Lerman and Jing Bi and Chenliang Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=rqcLsG8Kme9}\n}", "github": "", "project": "", "reviewers": "5Bt1;n4Pr;tset;dutv", "site": "https://openreview.net/forum?id=rqcLsG8Kme9", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;5;3;5", "correctness": "2;3;4;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "34;52;40;91", "wc_summary_review": "39;31;12;41", "wc_main_review": "131;16;215;160", "wc_review": "204;99;267;292", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "399;488;469;579", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 54.25, 22.18529918662356 ], "wc_summary_review_avg": [ 30.75, 11.453711188955307 ], "wc_main_review_avg": [ 130.5, 72.66532873386042 ], "wc_review_avg": [ 215.5, 74.51342160980128 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 483.75, 64.2081575814164 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3015113445777637, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10256704024493543560&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Rochester", "aff_unique_dep": "", "aff_unique_url": "https://www.rochester.edu", "aff_unique_abbr": "U of R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "rqolQhuq6Hs", "title": "Logarithmic landscape and power-law escape rate of SGD", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stochastic gradient descent (SGD) undergoes complicated multiplicative noise for the mean-square loss. We use this property of the SGD noise to derive a stochastic differential equation (SDE) with simpler additive noise by performing a random time change. In the SDE, the loss gradient is replaced by the logarithmized loss gradient. 
By using this formalism, we obtain a formula for the escape rate from a local minimum, which is determined not by the loss barrier height $\\Delta L=L(\\theta^s)-L(\\theta^*)$ between a minimum $\\theta^*$ and a saddle $\\theta^s$, but by the logarithmized loss barrier height $\\Delta\\log L=\\log[L(\\theta^s)/L(\\theta^*)]$. Our escape-rate formula strongly depends on the typical magnitude $h^*$ and the number $n$ of the outlier eigenvalues of the Hessian. This result explains the empirical fact that SGD prefers flat minima with low effective dimensions, which gives insight into the implicit biases of SGD.", "keywords": "stochastic gradient descent;noise structure;escape rate;flat minima;statistical physics", "primary_area": "", "supplementary_material": "", "author": "Takashi Mori;Liu Ziyin;Kangqiao Liu;Masahito Ueda", "authorids": "~Takashi_Mori1;~Liu_Ziyin1;~Kangqiao_Liu1;~Masahito_Ueda1", "gender": "M;;M;M", "homepage": "https://sites.google.com/view/takashimori/home;https://www.mit.edu/~ziyinl/;https://kangqiaoliu.github.io/;http://cat.phys.s.u-tokyo.ac.jp/index-e.html", "dblp": ";;280/3114;", "google_scholar": "https://scholar.google.co.jp/citations?hl=ja;NpN9oRMAAAAJ;utIJkHcAAAAJ;https://scholar.google.co.jp/citations?user=Xpjx9CwAAAAJ", "orcid": ";;0000-0002-4014-5728;0000-0002-5367-1436", "linkedin": ";;kangqiaoliu/?originalSubdomain=jp;", "or_profile": "~Takashi_Mori1;~Liu_Ziyin1;~Kangqiao_Liu1;~Masahito_Ueda1", "aff": "RIKEN;The University of Tokyo;The University of Tokyo;The University of Tokyo", "aff_domain": "riken.jp;u-tokyo.ac.jp;u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "Postdoc;PhD student;PhD student;Full Professor", "bibtex": "@misc{\nmori2022logarithmic,\ntitle={Logarithmic landscape and power-law escape rate of {SGD}},\nauthor={Takashi Mori and Liu Ziyin and Kangqiao Liu and Masahito Ueda},\nyear={2022},\nurl={https://openreview.net/forum?id=rqolQhuq6Hs}\n}", "github": "", "project": "", "reviewers": "hfnj;zZQR;bWLZ", "site": "https://openreview.net/forum?id=rqolQhuq6Hs", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "3;3;2", "correctness": "4;3;3", "technical_novelty": "2;3;4", "empirical_novelty": "0;2;3", "wc_summary_paper": "17;117;108", "wc_summary_review": "21;65;26", "wc_main_review": "185;328;227", "wc_review": "223;510;361", "wc_reply_reviewers": "134;0;40", "wc_reply_authors": "961;564;609", "reply_reviewers": "1;0;1", "reply_authors": "2;1;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 80.66666666666667, 45.16881913690265 ], "wc_summary_review_avg": [ 37.333333333333336, 19.669491322575908 ], "wc_main_review_avg": [ 246.66666666666666, 60.01296156294534 ], "wc_review_avg": [ 364.6666666666667, 117.19594228849772 ], "wc_reply_reviewers_avg": [ 58.0, 56.166419386201454 ], "wc_reply_authors_avg": [ 711.3333333333334, 177.4942878580103 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": -1.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=740983480534211910&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", 
"aff_unique_norm": "RIKEN;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "RIKEN;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Autonomous Learning of Object-Centric Abstractions for High-Level Planning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7202", "id": "rrWeE9ZDw_", "poster": "", "openreview": "https://openreview.net/forum?id=rrWeE9ZDw_", "slides": "https://iclr.cc/virtual/2022/poster/7202", "video": "https://iclr.cc/virtual/2022/poster/7202", "author_site": "Steven James, Benjamin Rosman, George D Konidaris", "tldr": "", "abstract": "We propose a method for autonomously learning an object-centric representation of a continuous and high-dimensional environment that is suitable for planning. Such representations can immediately be transferred between tasks that share the same types of objects, resulting in agents that require fewer samples to learn a model of a new task. We first demonstrate our approach on a 2D crafting domain consisting of numerous objects where the agent learns a compact, lifted representation that generalises across objects. We then apply it to a series of Minecraft tasks to learn object-centric representations and object types - directly from pixel data - that can be leveraged to solve new tasks quickly. The resulting learned representations enable the use of a task-level planner, resulting in an agent capable of transferring learned representations to form complex, long-term plans.", "keywords": "reinforcement learning;planning;multitask;transfer;objects", "primary_area": "", "supplementary_material": "/attachment/d22e396b03b9c2f0a52724f663c8d254fe53f06f.zip", "author": "Steven James;Benjamin Rosman;George Konidaris", "authorids": "~Steven_James1;~Benjamin_Rosman1;~George_Konidaris1", "gender": "M;M;M", "homepage": ";http://www.raillab.org;http://cs.brown.edu/people/gdk/", "dblp": "195/8202;45/4591;56/6762", "google_scholar": ";https://scholar.google.co.za/citations?user=pWJ0SocAAAAJ;9UERvVEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Steven_James1;~Benjamin_Rosman1;~George_Konidaris1", "aff": "University of the Witwatersrand;University of the Witwatersrand;Brown University", "aff_domain": "wits.ac.za;wits.ac.za;brown.edu", "position": "Lecturer;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\njames2022autonomous,\ntitle={Autonomous Learning of Object-Centric Abstractions for High-Level Planning},\nauthor={Steven James and Benjamin Rosman and George Konidaris},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rrWeE9ZDw_}\n}", "github": "", "project": "", "reviewers": "UVNz;PYjf;nhv5", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;3;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "71;78;197", "wc_summary_review": "41;40;189", "wc_main_review": "376;335;785", "wc_review": "488;453;1171", "wc_reply_reviewers": "0;76;26", "wc_reply_authors": "511;222;1434", "reply_reviewers": "0;1;1", "reply_authors": "1;1;2", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], 
"empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 115.33333333333333, 57.81772123569805 ], "wc_summary_review_avg": [ 90.0, 70.00476174280338 ], "wc_main_review_avg": [ 498.6666666666667, 203.1589416086714 ], "wc_review_avg": [ 704.0, 330.5278606512115 ], "wc_reply_reviewers_avg": [ 34.0, 31.538336460039655 ], "wc_reply_authors_avg": [ 722.3333333333334, 516.8702824586542 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.944911182523068, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11456250171836680945&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "pdf": "https://openreview.net/pdf?id=rrWeE9ZDw_", "email": "wits.ac.za;wits.ac.za;brown.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of the Witwatersrand;Brown University", "aff_unique_dep": ";", "aff_unique_url": "https://www.wits.ac.za;https://www.brown.edu", "aff_unique_abbr": "Wits;Brown", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "South Africa;United States" }, { "id": "rvost-n5X4G", "title": "SPP-RL: State Planning Policy Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce an algorithm for reinforcement learning, in which the actor plans for the next state provided the current state. To communicate the actor output to the environment we incorporate an inverse dynamics control model and train it using supervised learning. \nWe train the RL agent using off-policy state-of-the-art reinforcement learning algorithms: DDPG, TD3, and SAC. To guarantee that the target states are physically relevant, the overall learning procedure is formulated as a constrained optimization problem, solved via the classical Lagrangian optimization method. We benchmark the state planning RL approach using a varied set of continuous environments, including standard MuJoCo tasks, safety-gym level 0 environments, and AntPush. In SPP approach the optimal policy is being searched for in the space of state-state mappings, a considerably larger space than the traditional space of state-action mappings. 
We report that, quite surprisingly, SPP implementations attain superior performance to vanilla state-of-the-art off-policy RL algorithms in the tested environments.", "keywords": "reinforcement learning;off-policy;constrained optimization;robotics;state planning;state-state;mujoco;safety-gym;antpush", "primary_area": "", "supplementary_material": "", "author": "Jacek Cyranka;Zuzanna Opa\u0142a;Jacek P\u0142ocharczyk;Mikhail Zanka", "authorids": "~Jacek_Cyranka1;~Zuzanna_Opa\u0142a1;~Jacek_P\u0142ocharczyk1;~Mikhail_Zanka1", "gender": ";F;;", "homepage": ";https://drive.google.com/file/d/14-V0tEaBWBzfaBXY9TGu7QfHjVTNSZEy;https://www.linkedin.com/in/jacekplocharczyk;", "dblp": "142/0441;;;", "google_scholar": "-60x4zkAAAAJ;;;", "orcid": "0000-0001-5719-0616;;;0000-0002-7327-8842", "linkedin": "cyranka/;;;misha-zanka/", "or_profile": "~Jacek_Cyranka1;~Zuzanna_Opa\u0142a1;~Jacek_P\u0142ocharczyk1;~Mikhail_Zanka1", "aff": "University of Warsaw;University of Warsaw;;", "aff_domain": "mimuw.edu.pl;mimuw.edu.pl;;", "position": "Assistant Professor;MS student;;", "bibtex": "@misc{\ncyranka2022spprl,\ntitle={{SPP}-{RL}: State Planning Policy Reinforcement Learning},\nauthor={Jacek Cyranka and Zuzanna Opa{\\l}a and Jacek P{\\l}ocharczyk and Mikhail Zanka},\nyear={2022},\nurl={https://openreview.net/forum?id=rvost-n5X4G}\n}", "github": "", "project": "", "reviewers": "B8gx;F9WZ;oVKQ", "site": "https://openreview.net/forum?id=rvost-n5X4G", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "4;3;4", "correctness": "4;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "60;37;139", "wc_summary_review": "49;11;104", "wc_main_review": "745;229;251", "wc_review": "854;277;494", "wc_reply_reviewers": "148;0;44", "wc_reply_authors": "1582;777;792", "reply_reviewers": "1;0;1", "reply_authors": "3;1;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.66666666666667, 43.683202975768864 ], "wc_summary_review_avg": [ 54.666666666666664, 38.177945931591914 ], "wc_main_review_avg": [ 408.3333333333333, 238.22864833787074 ], "wc_review_avg": [ 541.6666666666666, 237.95844641916415 ], "wc_reply_reviewers_avg": [ 64.0, 62.053740150507174 ], "wc_reply_authors_avg": [ 1050.3333333333333, 375.9949763257778 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.1147078669352809, "corr_recommendation_correctness": -0.3973597071195132, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZuYWL5sJKq8J:scholar.google.com/&scioq=SPP-RL:+State+Planning+Policy+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Warsaw", "aff_unique_dep": "", "aff_unique_url": "https://www.uw.edu.pl", "aff_unique_abbr": "UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Poland" }, { "title": "Concurrent Adversarial Learning for Large-Batch Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6365", "id": "rw1mZl_ss3L", "poster": "", "openreview": 
"https://openreview.net/forum?id=rw1mZl_ss3L", "slides": "https://iclr.cc/virtual/2022/poster/6365", "video": "https://iclr.cc/virtual/2022/poster/6365", "author_site": "LIU YONG, Xiangning Chen, Minhao Cheng, Cho-Jui Hsieh, Yang You", "tldr": "", "abstract": "Large-batch training has become a commonly used technique when training neural networks with a large number of GPU/TPU processors. As batch size increases, stochastic optimizers tend to converge to sharp local minima, leading to degraded test performance. Current methods usually use extensive data augmentation to increase the batch size, but we found the performance gain with data augmentation decreases as batch size increases, and data augmentation will become insufficient after certain point. In this paper, we propose to use adversarial learning to increase the batch size in large-batch training. Despite being a natural choice for smoothing the decision surface and biasing towards a flat region, adversarial learning has not been successfully applied in large-batch training since it requires at least two sequential gradient computations at each step, which will at least double the running time compared with vanilla training even with a large number of processors. To overcome this issue, we propose a novel Concurrent Adversarial Learning (ConAdv) method that decouple the sequential gradient computations in adversarial learning by utilizing staled parameters. Experimental results demonstrate that ConAdv can successfully increase the batch size on both ResNet-50 and EfficientNet training on ImageNet while maintaining high accuracy. In particular, we show ConAdv along can achieve 75.3\\% top-1 accuracy on ImageNet ResNet-50 training with 96K batch size, and the accuracy can be further improved to 76.2\\% when combining ConAdv with data augmentation. This is the first work successfully scales ResNet-50 training batch size to 96K. 
", "keywords": "Distributed Machine Learnig;Large-Batch Training;Adversarial Learning", "primary_area": "", "supplementary_material": "", "author": "Yong Liu;Xiangning Chen;Minhao Cheng;Cho-Jui Hsieh;Yang You", "authorids": "~Yong_Liu13;~Xiangning_Chen1;~Minhao_Cheng1;~Cho-Jui_Hsieh1;~Yang_You1", "gender": "M;M;M;M;M", "homepage": "https://ai.comp.nus.edu.sg/people/yong;;https://cmhcbb.github.io/;http://web.cs.ucla.edu/~chohsieh/index.html;https://www.comp.nus.edu.sg/~youy/", "dblp": "29/4867;56/7393;174/1717;14/2770;33/8167-1.html", "google_scholar": "2ejuK8UAAAAJ;vNcBx1sAAAAJ;_LkC1yoAAAAJ;Wy89g4IAAAAJ;jF4dPZwAAAAJ", "orcid": ";;0000-0003-3965-4215;;", "linkedin": ";;;;yang-you-0b92914b/", "or_profile": "~Yong_Liu13;~Xiangning_Chen1;~Minhao_Cheng1;~Cho-Jui_Hsieh1;~Yang_You1", "aff": "National University of Singapore;University of California, Los Angeles;Hong Kong University of Science and Technology;University of California, Los Angeles;National University of Singapore", "aff_domain": "nus.edu.sg;cs.ucla.edu;ust.hk;ucla.edu;nus.edu.sg", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;Professor", "bibtex": "@inproceedings{\nliu2022concurrent,\ntitle={Concurrent Adversarial Learning for Large-Batch Training},\nauthor={Yong Liu and Xiangning Chen and Minhao Cheng and Cho-Jui Hsieh and Yang You},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rw1mZl_ss3L}\n}", "github": "", "project": "", "reviewers": "nTet;M9ZZ;Zv8z", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;5;3", "correctness": "4;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "25;64;104", "wc_summary_review": "39;20;25", "wc_main_review": "102;195;156", "wc_review": "166;279;285", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "469;595;116", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 64.33333333333333, 32.25247621845836 ], "wc_summary_review_avg": [ 28.0, 8.04155872120988 ], "wc_main_review_avg": [ 151.0, 38.13135192987524 ], "wc_review_avg": [ 243.33333333333334, 54.73775865179559 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 393.3333333333333, 202.7384741209445 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6546536707079772, "corr_recommendation_correctness": -0.7559289460184545, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11956389111854249484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=rw1mZl_ss3L", "email": "nus.edu.sg;cs.ucla.edu;ust.hk;ucla.edu;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "National University of Singapore;University of California, Los Angeles;Hong Kong University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.ucla.edu;https://www.ust.hk", "aff_unique_abbr": "NUS;UCLA;HKUST", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Los Angeles;Hong Kong SAR", 
"aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "Singapore;United States;China" }, { "title": "Unsupervised Discovery of Object Radiance Fields", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6279", "id": "rwE8SshAlxw", "poster": "", "openreview": "https://openreview.net/forum?id=rwE8SshAlxw", "slides": "https://iclr.cc/virtual/2022/poster/6279", "video": "https://iclr.cc/virtual/2022/poster/6279", "author_site": "Koven Yu, Leonidas Guibas, Jiajun Wu", "tldr": "", "abstract": "We study the problem of inferring an object-centric scene representation from a single image, aiming to derive a representation that explains the image formation process, captures the scene's 3D nature, and is learned without supervision. Most existing methods on scene decomposition lack one or more of these characteristics, due to the fundamental challenge in integrating the complex 3D-to-2D image formation process into powerful inference schemes like deep networks. In this paper, we propose unsupervised discovery of Object Radiance Fields (uORF), integrating recent progresses in neural 3D scene representations and rendering with deep inference networks for unsupervised 3D scene decomposition. Trained on multi-view RGB images without annotations, uORF learns to decompose complex scenes with diverse, textured background from a single image. We show that uORF enables novel tasks, such as scene segmentation and editing in 3D, and it performs well on these tasks and on novel view synthesis on three datasets.", "keywords": "object discovery;scene decomposition;3D scene representations;object-centric learning", "primary_area": "", "supplementary_material": "/attachment/8b2c25b6813698abdb1d7f7aed2cd5857aefe293.zip", "author": "Hong-Xing Yu;Leonidas Guibas;Jiajun Wu", "authorids": "~Hong-Xing_Yu1;~Leonidas_Guibas1;~Jiajun_Wu1", "gender": "M;M;M", "homepage": "https://kovenyu.com;http://geometry.stanford.edu/;https://jiajunwu.com", "dblp": "205/2676.html;g/LeonidasJGuibas;117/4768", "google_scholar": "kNKncZcAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ;2efgcS0AAAAJ", "orcid": ";;0000-0002-4176-343X", "linkedin": ";;jiajunwu/", "or_profile": "~Hong-Xing_Yu1;~Leonidas_Guibas1;~Jiajun_Wu1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "cs.stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nyu2022unsupervised,\ntitle={Unsupervised Discovery of Object Radiance Fields},\nauthor={Hong-Xing Yu and Leonidas Guibas and Jiajun Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rwE8SshAlxw}\n}", "github": "", "project": "", "reviewers": "BYk3;pN74;tkSe", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;4;3", "wc_summary_paper": "138;94;98", "wc_summary_review": "27;86;76", "wc_main_review": "175;556;210", "wc_review": "340;736;384", "wc_reply_reviewers": "0;110;109", "wc_reply_authors": "719;743;492", "reply_reviewers": "0;1;1", "reply_authors": "3;2;2", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 110.0, 
19.86621923433512 ], "wc_summary_review_avg": [ 63.0, 25.78113005022601 ], "wc_main_review_avg": [ 313.6666666666667, 171.95025120333173 ], "wc_review_avg": [ 486.6666666666667, 177.21800761272038 ], "wc_reply_reviewers_avg": [ 73.0, 51.62040940041707 ], "wc_reply_authors_avg": [ 651.3333333333334, 113.09091721069184 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10064360192629959715&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=rwE8SshAlxw", "email": "cs.stanford.edu;stanford.edu;stanford.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "rwEv1SklKFt", "title": "Poisoned classifiers are not only backdoored, they are fundamentally broken", "track": "main", "status": "Reject", "tldr": "", "abstract": "Under a commonly-studied backdoor poisoning attack against classification models, an attacker adds a small trigger to a subset of the training data, such that the presence of this trigger at test time causes the classifier to always predict some target class. It is often implicitly assumed that the poisoned classifier is vulnerable exclusively to the adversary who possesses the trigger. In this paper, we show empirically that this view of backdoored classifiers is incorrect. We describe a new threat model for poisoned classifiers, in which an adversary without knowledge of the original trigger seeks to control the poisoned classifier. Under this threat model, we propose a test-time, human-in-the-loop attack method to generate multiple effective alternative triggers without access to the initial backdoor and the training data. We construct these alternative triggers by first generating adversarial examples for a smoothed version of the classifier, created with a procedure called Denoised Smoothing, and then extracting colors or cropped portions of smoothed adversarial images with human interaction. We demonstrate the effectiveness of our attack through extensive experiments on high-resolution datasets: ImageNet and TrojAI. We also compare our approach to previous work on modeling trigger distributions and find that our method is more scalable and efficient in generating effective triggers. Last, we include a user study which demonstrates that our method allows users to easily determine the existence of such backdoors in existing poisoned classifiers. 
Thus, we argue that there is no such thing as a secret backdoor in poisoned classifiers: poisoning a classifier invites attacks not just from the party that possesses the trigger, but from anyone with access to the classifier.", "keywords": "Backdoor attacks;Randomized Smoothing;Trigger construction", "primary_area": "", "supplementary_material": "/attachment/9c3f0e2ab1704da7f2a22a7bb4f1a86de559b7dd.zip", "author": "Mingjie Sun;Siddhant Agarwal;J Zico Kolter", "authorids": "~Mingjie_Sun1;~Siddhant_Agarwal1;~J_Zico_Kolter1", "gender": "M;M;M", "homepage": "https://eric-mingjie.github.io/;https://agarwalsiddhant10.github.io/;http://www.zicokolter.com", "dblp": "54/3913;;67/2526", "google_scholar": "wCZbouUAAAAJ;;UXh1I6UAAAAJ", "orcid": ";;", "linkedin": ";siddhant-agarwal-688a31156/;", "or_profile": "~Mingjie_Sun1;~Siddhant_Agarwal1;~Zico_Kolter1", "aff": "Computer Science Department, Carnegie Mellon University;IIT Kharagpur;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;iitkgp.ac.in;cmu.edu", "position": "PhD student;Undergrad student;Full Professor", "bibtex": "@misc{\nsun2022poisoned,\ntitle={Poisoned classifiers are not only backdoored, they are fundamentally broken},\nauthor={Mingjie Sun and Siddhant Agarwal and J Zico Kolter},\nyear={2022},\nurl={https://openreview.net/forum?id=rwEv1SklKFt}\n}", "github": "", "project": "", "reviewers": "67bv;fLvg;9RGV;8W5u;1ehv", "site": "https://openreview.net/forum?id=rwEv1SklKFt", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;4;4;4;4", "correctness": "3;4;3;4;3", "technical_novelty": "2;1;3;4;3", "empirical_novelty": "3;2;2;3;3", "wc_summary_paper": "81;105;52;56;69", "wc_summary_review": "65;31;38;77;95", "wc_main_review": "627;148;150;764;304", "wc_review": "773;284;240;897;468", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 1.019803902718557 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 72.6, 19.147845831842286 ], "wc_summary_review_avg": [ 61.2, 23.90313786932586 ], "wc_main_review_avg": [ 398.6, 252.67971822051723 ], "wc_review_avg": [ 532.4, 261.5986238495914 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.16666666666666663, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=978520683469883776&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Indian Institute of Technology Kharagpur", "aff_unique_dep": "Computer Science Department;", "aff_unique_url": "https://www.cmu.edu;https://www.iitkgp.ac.in", "aff_unique_abbr": "CMU;IIT KGP", "aff_campus_unique_index": "1", "aff_campus_unique": ";Kharagpur", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;India" }, { "id": "rwR3N1ApI3V", "title": "Is deeper better? It depends on locality of relevant features", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "It has been recognized that a heavily overparameterized artificial neural network exhibits surprisingly good generalization performance in various machine-learning tasks. 
Recent theoretical studies have made attempts to unveil the mystery of overparameterization. In most of those previous works, overparameterization is achieved by increasing the width of the network, while the effect of the depth of the network has remained less well understood. In this work, we investigate the effect of depth within an overparameterized regime for fully connected neural networks. To gain insight into the advantage of depth, we introduce local and global labels according to abstract but simple classification rules. It turns out that the locality of a relevant feature for a given classification rule plays a key role; our experimental results suggest that deeper is better for local labels, whereas shallower is better for global labels. We also compare the results of finite networks with those of the neural tangent kernel (NTK), and find that the NTK does not correctly capture the depth dependence of the generalization performance, which indicates the importance of feature learning rather than lazy learning.", "keywords": "deep learning;generalization;overparameterization;neural tangent kernel", "primary_area": "", "supplementary_material": "", "author": "Takashi Mori;Masahito Ueda", "authorids": "~Takashi_Mori1;~Masahito_Ueda1", "gender": "M;M", "homepage": "https://sites.google.com/view/takashimori/home;http://cat.phys.s.u-tokyo.ac.jp/index-e.html", "dblp": ";", "google_scholar": "https://scholar.google.co.jp/citations?hl=ja;https://scholar.google.co.jp/citations?user=Xpjx9CwAAAAJ", "orcid": ";0000-0002-5367-1436", "linkedin": ";", "or_profile": "~Takashi_Mori1;~Masahito_Ueda1", "aff": "RIKEN;The University of Tokyo", "aff_domain": "riken.jp;u-tokyo.ac.jp", "position": "Postdoc;Full Professor", "bibtex": "@misc{\nmori2022is,\ntitle={Is deeper better? 
It depends on locality of relevant features},\nauthor={Takashi Mori and Masahito Ueda},\nyear={2022},\nurl={https://openreview.net/forum?id=rwR3N1ApI3V}\n}", "github": "", "project": "", "reviewers": "pFj3;owQT;Jmb4;HAmi", "site": "https://openreview.net/forum?id=rwR3N1ApI3V", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;3", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;1", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "82;81;95;111", "wc_summary_review": "39;58;98;17", "wc_main_review": "439;372;555;275", "wc_review": "560;511;748;403", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 92.25, 12.152674602736633 ], "wc_summary_review_avg": [ 53.0, 29.75735203273302 ], "wc_main_review_avg": [ 410.25, 101.90038027406963 ], "wc_review_avg": [ 555.5, 124.81285991435338 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=984577573730531681&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1", "aff_unique_norm": "RIKEN;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "RIKEN;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "rwSWaS_tGgG", "title": "Uncertainty Regularized Policy Learning for Offline Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent studies show promising results of using online RL methods in the offline setting.\nHowever, such a learning paradigm may suffer from an overtraining issue, that is, the performance of the policy degrades significantly as the training process continues when the dataset is not sufficiently large and diverse. 
\nIn this work, we propose an alternative approach to alleviate and avoid the overtraining issue: we explicitly take the learning stability into account in the policy learning objective, and adaptively select a good policy before the overtraining issue happens.\nTo do so, we develop an Uncertainty Regularized Policy Learning (URPL) method.\nURPL adds an uncertainty regularization term to the policy learning objective to enforce learning a more stable policy under the offline setting.\nMoreover, we further use the uncertainty regularization term as a surrogate metric indicating the potential performance of a policy.\nBased on the low-valued region of the uncertainty term, we can select a good policy with considerably good performance and low computation requirements.\nOn the standard offline RL benchmark D4RL, URPL achieves much better final performance than existing state-of-the-art baselines.", "keywords": "off-policy RL;offline Learning", "primary_area": "", "supplementary_material": "", "author": "Han Zheng;Jing Jiang;pengfei wei;Guodong Long;Xuan Song;Chengqi Zhang", "authorids": "~Han_Zheng2;~Jing_Jiang6;~pengfei_wei2;~Guodong_Long2;~Xuan_Song5;~Chengqi_Zhang1", "gender": ";F;M;M;;M", "homepage": ";https://www.uts.edu.au/staff/jing.jiang;https://pengfei-wei.com/;https://www.uts.edu.au/staff/guodong.long;;https://research.polyu.edu.hk/en/persons/chengqi-zhang", "dblp": ";68/1974-2;29/11273-1;34/10089;;71/964", "google_scholar": ";https://scholar.google.com.au/citations?hl=en;https://scholar.google.com.sg/citations?user=a94WthkAAAAJ;https://scholar.google.com.au/citations?user=Pl8m7hMAAAAJ;;https://scholar.google.com.au/citations?user=B6lBmqEAAAAJ", "orcid": ";;;0000-0003-3740-9515;;0000-0001-5715-7154", "linkedin": ";;;;;chengqi-zhang-55aa8910/", "or_profile": "~Han_Zheng2;~Jing_Jiang6;~pengfei_wei2;~Guodong_Long2;~Xuan_Song5;~Chengqi_Zhang1", "aff": ";University of Technology Sydney;AI LAB Bytedance;University of Technology Sydney;;University of Technology Sydney", "aff_domain": ";uts.edu.au;bytedance.com;uts.edu.au;;uts.edu.au", "position": ";Lecturer;Researcher;Associate Professor;;Full Professor", "bibtex": "@misc{\nzheng2022uncertainty,\ntitle={Uncertainty Regularized Policy Learning for Offline Reinforcement Learning},\nauthor={Han Zheng and Jing Jiang and pengfei wei and Guodong Long and Xuan Song and Chengqi Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=rwSWaS_tGgG}\n}", "github": "", "project": "", "reviewers": "yhLF;dawA;p48W;RiF4", "site": "https://openreview.net/forum?id=rwSWaS_tGgG", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "4;2;4;3", "technical_novelty": "2;1;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "120;91;43;88", "wc_summary_review": "40;64;12;42", "wc_main_review": "273;435;228;542", "wc_review": "433;590;283;672", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 85.5, 27.536339626028727 ], "wc_summary_review_avg": [ 39.5, 18.45941494197473 ], "wc_main_review_avg": [ 369.5, 125.8779170466369 ], "wc_review_avg": [ 494.5, 149.282450408613 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], 
"authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bgPHB6Fast0J:scholar.google.com/&scioq=Uncertainty+Regularized+Policy+Learning+for+Offline+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Technology Sydney;ByteDance", "aff_unique_dep": ";AI LAB", "aff_unique_url": "https://www.uts.edu.au;https://www.bytedance.com", "aff_unique_abbr": "UTS;Bytedance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Australia;China" }, { "id": "rxF4IN3R2ml", "title": "MQTransformer: Multi-Horizon Forecasts with Context Dependent and Feedback-Aware Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent advances in neural forecasting have produced major improvements in accuracy for probabilistic demand prediction. In this work, we propose novel improvements to the current state of the art by incorporating changes inspired by recent advances in Transformer architectures for Natural Language Processing. We develop a novel decoder-encoder attention for context-alignment, improving forecasting accuracy by allowing the network to study its own history based on the context for which it is producing a forecast. We also present a novel positional encoding that allows the neural network to learn context-dependent seasonality functions as well as arbitrary holiday distances. Finally we show that the current state of the art MQ-Forecaster (Wen et al., 2017) models display excess variability by failing to leverage previous errors in the forecast to improve accuracy. We propose a novel decoder-self attention scheme for forecasting that produces significant improvements in the excess variation of the forecast.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Carson Eisenach;Yagna Patel;Dhruv Madeka", "authorids": "~Carson_Eisenach1;~Yagna_Patel1;~Dhruv_Madeka1", "gender": "M;M;", "homepage": "https://carson.eisenach.org;;http://www.dhruvmadeka.com", "dblp": ";;", "google_scholar": "Jx269u0AAAAJ;DoeRWlgAAAAJ;hCL5ibIAAAAJ", "orcid": ";;", "linkedin": "carson-eisenach-73118854/;;", "or_profile": "~Carson_Eisenach1;~Yagna_Patel1;~Dhruv_Madeka1", "aff": "Amazon;;Amazon", "aff_domain": "amazon.com;;amazon.com", "position": "Applied Scientist;;Amazon", "bibtex": "@misc{\neisenach2022mqtransformer,\ntitle={{MQT}ransformer: Multi-Horizon Forecasts with Context Dependent and Feedback-Aware Attention},\nauthor={Carson Eisenach and Yagna Patel and Dhruv Madeka},\nyear={2022},\nurl={https://openreview.net/forum?id=rxF4IN3R2ml}\n}", "github": "", "project": "", "reviewers": "DW35;3wZQ;tUKV", "site": "https://openreview.net/forum?id=rxF4IN3R2ml", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;4", "correctness": "4;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "95;131;94", "wc_summary_review": "19;36;30", "wc_main_review": "65;295;273", "wc_review": "179;462;397", "wc_reply_reviewers": "0;37;0", "wc_reply_authors": "27;391;375", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], 
"empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 106.66666666666667, 17.21110752456745 ], "wc_summary_review_avg": [ 28.333333333333332, 7.039570693980958 ], "wc_main_review_avg": [ 211.0, 103.62753816754824 ], "wc_review_avg": [ 346.0, 121.03167629454144 ], "wc_reply_reviewers_avg": [ 12.333333333333334, 17.441967269268172 ], "wc_reply_authors_avg": [ 264.3333333333333, 167.94708161269793 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": -0.9999999999999997, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17076029758147769504&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon.com, Inc.", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Gradient Information Matters in Policy Optimization by Back-propagating through Model", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5997", "id": "rzvOQrnclO0", "poster": "", "openreview": "https://openreview.net/forum?id=rzvOQrnclO0", "slides": "https://iclr.cc/virtual/2022/poster/5997", "video": "https://iclr.cc/virtual/2022/poster/5997", "author_site": "Chongchong Li, Yue Wang, Wei Chen, Yuting Liu, Zhi-Ming Ma, Tie-Yan Liu", "tldr": "", "abstract": "Model-based reinforcement learning provides an efficient mechanism to find the optimal policy by interacting with the learned environment. In addition to treating the learned environment like a black-box simulator, a more effective way to use the model is to exploit its differentiability. Such methods require the gradient information of the learned environment model when calculating the policy gradient. However, since the error of gradient is not considered in the model learning phase, there is no guarantee for the model's accuracy. To address this problem, we first analyze the convergence rate for the policy optimization methods when the policy gradient is calculated using the learned environment model. The theoretical results show that the model gradient error matters in the policy optimization phrase. Then we propose a two-model-based learning method to control the prediction error and the gradient error. We separate the different roles of these two models at the model learning phase and coordinate them at the policy optimization phase. After proposing the method, we introduce the directional derivative projection policy optimization (DDPPO) algorithm as a practical implementation to find the optimal policy. 
Finally, we empirically demonstrate that the proposed algorithm has better sample efficiency while achieving comparable or better performance on benchmark continuous control tasks.", "keywords": "Model-based RL;Policy Optimization", "primary_area": "", "supplementary_material": "", "author": "Chongchong Li;Yue Wang;Wei Chen;Yuting Liu;Zhi-Ming Ma;Tie-Yan Liu", "authorids": "~Chongchong_Li1;~Yue_Wang15;~Wei_Chen1;~Yuting_Liu4;~Zhi-Ming_Ma1;~Tie-Yan_Liu1", "gender": ";M;F;F;;M", "homepage": ";https://scholar.google.com/citations?hl=zh-CN&user=fGv5irIAAAAJ;https://weichen-cas.github.io/;http://faculty.bjtu.edu.cn/8454/;http://homepage.amss.ac.cn/research/homePage/8eb59241e2e74d828fb84eec0efadba5/myHomePage.html;http://member.acm.org/~tieyanliu", "dblp": ";33/4822-17.html;;;;l/TieYanLiu", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;;;Nh832fgAAAAJ", "orcid": "0000-0002-8337-4627;;;;;0000-0002-0476-8020", "linkedin": ";;;;;", "or_profile": "~Chongchong_Li1;~Yue_Wang15;~Wei_Chen1;~Yuting_Liu4;~Zhi-Ming_Ma1;~Tie-Yan_Liu1", "aff": "Beijing Jiaotong University;Microsoft Research Asia; Chinese Academy of Sciences;;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Microsoft", "aff_domain": "bjtu.edu.cn;microsoft.com;ict.ac.cn;;amss.ac.cn;microsoft.com", "position": "PhD student;Researcher;Full Professor;;Full Professor;Distinguished Scientist", "bibtex": "@inproceedings{\nli2022gradient,\ntitle={Gradient Information Matters in Policy Optimization by Back-propagating through Model},\nauthor={Chongchong Li and Yue Wang and Wei Chen and Yuting Liu and Zhi-Ming Ma and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=rzvOQrnclO0}\n}", "github": "", "project": "", "reviewers": "8zAV;TLT1;9ZXt;nkhB", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;4;3", "correctness": "3;3;4;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "43;93;123;61", "wc_summary_review": "66;26;58;183", "wc_main_review": "1451;381;580;619", "wc_review": "1560;500;761;863", "wc_reply_reviewers": "103;185;248;176", "wc_reply_authors": "1686;874;979;741", "reply_reviewers": "1;2;2;1", "reply_authors": "3;3;3;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.0, 30.610455730027933 ], "wc_summary_review_avg": [ 83.25, 59.503676357011756 ], "wc_main_review_avg": [ 757.75, 410.2995095049469 ], "wc_review_avg": [ 921.0, 391.95854372624666 ], "wc_reply_reviewers_avg": [ 178.0, 51.42470223540434 ], "wc_reply_authors_avg": [ 1070.0, 365.5112857354749 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16097978500509464321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=rzvOQrnclO0", "email": "bjtu.edu.cn;microsoft.com;ict.ac.cn;;amss.ac.cn;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "Beijing Jiao Tong University;Microsoft;Chinese Academy 
of Sciences", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "http://www.bjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.cas.cn", "aff_unique_abbr": "BJTU;MSR Asia;CAS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "s-b95PMK4E6", "title": "Hierarchical Modular Framework for Long Horizon Instruction Following", "track": "main", "status": "Reject", "tldr": "", "abstract": "Robotic agents performing domestic chores using natural language directives re-quire to learn the complex task of navigating an environment and interacting with objects in it. To address such composite tasks, we propose a hierarchical modular approach to learn agents that navigate and manipulate objects in a divide-and-conquer manner for the diverse nature of the entailing tasks. Specifically, our policy operates at three levels of hierarchy. We first infer a sequence of subgoals to be executed based on language instructions by a high-level policy composition controller (PCC). We then discriminatively control the agent\u2019s navigation by a master policy by alternating between navigation policy and various independent interaction policies. Finally, we infer manipulation actions with the corresponding object masks using the appropriate interaction policy. Our hierarchical agent, named HACR (Hierarchical Approach for Compositional Reasoning), generates a human interpretable and short sequence of sub-objectives, leading to efficient interaction with an environment, and achieves the state-of-the-art performance on the challenging ALFRED benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/c11e9253cae07323a7b8ea5944098cee82e0e66b.zip", "author": "Suvaansh Bhambri;Byeonghwi Kim;Roozbeh Mottaghi;Jonghyun Choi", "authorids": "~Suvaansh_Bhambri3;~Byeonghwi_Kim1;~Roozbeh_Mottaghi1;~Jonghyun_Choi1", "gender": "M;;M;M", "homepage": "https://bhkim94.github.io/;http://roozbehm.info;https://ppolon.github.io/;", "dblp": "280/2943;36/633;21/11103;275/7440", "google_scholar": "Sr9hbXYAAAAJ;CCV58dgAAAAJ;uiGWnm4AAAAJ;UQjAvO8AAAAJ", "orcid": "0000-0003-3775-2778;;0000-0002-7934-8434;0000-0003-3941-5396", "linkedin": "byeonghwi-kim-821909167;roozbeh-mottaghi-63397aa0;jonghyun-choi-459bb615/;suvaansh-bhambri-1784bab7/", "or_profile": "~Byeonghwi_Kim1;~Roozbeh_Mottaghi1;~Jonghyun_Choi1;~SUVAANSH_BHAMBRI2", "aff": ";Allen Institute for AI;NAVER;Indian Institute of Science, Dhirubhai Ambani Institute Of Information and Communication Technology", "aff_domain": ";allenai.org;navercorp.com;iisc.ac.in", "position": ";Research Manager;AI Advisor Committee;Researcher", "bibtex": "@misc{\nbhambri2022hierarchical,\ntitle={Hierarchical Modular Framework for Long Horizon Instruction Following },\nauthor={Suvaansh Bhambri and Byeonghwi Kim and Roozbeh Mottaghi and Jonghyun Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=s-b95PMK4E6}\n}", "github": "", "project": "", "reviewers": "pj1s;wh6P;4rTh;BjKc;KXWq", "site": "https://openreview.net/forum?id=s-b95PMK4E6", "pdf_size": 0, "recommendation": "3;6;8;8;8", "confidence": "4;4;4;4;4", "correctness": "3;4;4;3;3", "technical_novelty": "2;3;3;2;2", "empirical_novelty": "2;4;3;4;3", "wc_summary_paper": "109;95;74;80;86", "wc_summary_review": "28;28;30;39;116", "wc_main_review": "349;357;398;1030;356", "wc_review": "486;480;502;1149;558", "wc_reply_reviewers": "534;0;37;43;116", "wc_reply_authors": 
"1735;543;979;2070;1637", "reply_reviewers": "2;0;2;1;1", "reply_authors": "3;1;4;4;3", "recommendation_avg": [ 6.6, 1.9595917942265424 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 88.8, 12.253978945632314 ], "wc_summary_review_avg": [ 48.2, 34.143227732597275 ], "wc_main_review_avg": [ 498.0, 266.5595618243698 ], "wc_review_avg": [ 635.0, 258.47243566771294 ], "wc_reply_reviewers_avg": [ 146.0, 197.61072845369503 ], "wc_reply_authors_avg": [ 1392.8, 552.9420946175106 ], "reply_reviewers_avg": [ 1.2, 0.7483314773547883 ], "reply_authors_avg": [ 3.0, 1.0954451150103321 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.16666666666666663, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7hofyqNpClYJ:scholar.google.com/&scioq=Hierarchical+Modular+Framework+for+Long+Horizon+Instruction+Following&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Allen Institute for AI;NAVER Corporation;Indian Institute of Science", "aff_unique_dep": ";;", "aff_unique_url": "https://allenai.org;https://www.naver.com;https://www.iisc.ac.in", "aff_unique_abbr": "AI2;NAVER;IISc", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;South Korea;India" }, { "title": "ProtoRes: Proto-Residual Network for Pose Authoring via Learned Inverse Kinematics", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6539", "id": "s03AQxehtd_", "poster": "", "openreview": "https://openreview.net/forum?id=s03AQxehtd_", "slides": "https://iclr.cc/virtual/2022/poster/6539", "video": "https://iclr.cc/virtual/2022/poster/6539", "author_site": "Boris Oreshkin, Florent Bocquelet, Felix G. Harvey, Bay Raitt, Dominic Laflamme", "tldr": "", "abstract": "Our work focuses on the development of a learnable neural representation of human pose for advanced AI assisted animation tooling. Specifically, we tackle the problem of constructing a full static human pose based on sparse and variable user inputs (e.g. locations and/or orientations of a subset of body joints). To solve this problem, we propose a novel neural architecture that combines residual connections with prototype encoding of a partially specified pose to create a new complete pose from the learned latent space. We show that our architecture outperforms a baseline based on Transformer, both in terms of accuracy and computational efficiency. Additionally, we develop a user interface to integrate our neural model in Unity, a real-time 3D development platform. Furthermore, we introduce two new datasets representing the static human pose modeling problem, based on high-quality human motion capture data, which will be released publicly along with model code.", "keywords": "inverse kinematics;deep learning;pose modeling", "primary_area": "", "supplementary_material": "/attachment/ff67a06a7330e8faefa4288da4c1ce1b2e39fbb7.zip", "author": "Boris N. Oreshkin;Florent Bocquelet;Felix G. 
Harvey;Bay Raitt;Dominic Laflamme", "authorids": "~Boris_N._Oreshkin1;~Florent_Bocquelet1;~Felix_G._Harvey1;~Bay_Raitt1;~Dominic_Laflamme1", "gender": "M;;M;M;M", "homepage": ";;;http://spiraloid.net;https://www.unity3d.com", "dblp": "33/1017;;;;", "google_scholar": "https://scholar.google.ca/citations?user=48MBCeIAAAAJ;;https://scholar.google.ca/citations?user=gWL1kqsAAAAJ;;", "orcid": ";;;;", "linkedin": "boris-oreshkin-1710061a/;fbocquelet/;felixgharvey/;;dominic-laflamme-1263b9", "or_profile": "~Boris_N._Oreshkin1;~Florent_Bocquelet1;~Felix_G._Harvey1;~Bay_Raitt1;~Dominic_Laflamme1", "aff": "Amazon;Unity Technologies;Unity Technologies;;", "aff_domain": "amazon.com;unity3d.com;unity3d.com;;", "position": "Principal Researcher;Senior Machine Learning Developer;Senior Machine Learning Developer;;", "bibtex": "@inproceedings{\noreshkin2022protores,\ntitle={ProtoRes: Proto-Residual Network for Pose Authoring via Learned Inverse Kinematics},\nauthor={Boris N. Oreshkin and Florent Bocquelet and Felix G. Harvey and Bay Raitt and Dominic Laflamme},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=s03AQxehtd_}\n}", "github": "", "project": "", "reviewers": "5Kos;J8Sg;cvbx", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;3;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;2", "wc_summary_paper": "91;80;108", "wc_summary_review": "16;55;66", "wc_main_review": "304;390;714", "wc_review": "411;525;888", "wc_reply_reviewers": "0;142;55", "wc_reply_authors": "1547;2767;2703", "reply_reviewers": "0;4;1", "reply_authors": "3;7;6", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 93.0, 11.51810169544733 ], "wc_summary_review_avg": [ 45.666666666666664, 21.452790546272116 ], "wc_main_review_avg": [ 469.3333333333333, 176.5320241643299 ], "wc_review_avg": [ 608.0, 203.38633189081315 ], "wc_reply_reviewers_avg": [ 65.66666666666667, 58.45986277400551 ], "wc_reply_authors_avg": [ 2339.0, 560.6377321110904 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.699673171197595 ], "reply_authors_avg": [ 5.333333333333333, 1.699673171197595 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4035812674200440521&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=s03AQxehtd_", "email": "amazon.com;unity3d.com;unity3d.com;;", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;Unity Technologies", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://unity.com", "aff_unique_abbr": "Amazon;Unity", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "s2UpjzX82FS", "title": "Privacy-preserving Task-Agnostic Vision Transformer for Image Processing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Distributed collaborative learning approaches such as federated and split learning have attracted significant attention lately due to their ability to train neural networks using data from multiple 
sources without sharing data. However, they are not usually suitable in applications where each client carries out different tasks with its own data. Inspired by the recent success of Vision Transformer (ViT), here we present a new distributed learning framework for image processing applications, allowing clients to learn multiple tasks with their private data. The key idea arises from a novel task-agnostic Vision Transformer that is introduced to learn the global attention independent of specific tasks. Specifically, by connecting task-specific heads and tails at client sides to a task-agnostic Transformer body at a server side, each client learns a translation from its own task to a common representation, while the Transformer body learns global attention between the features embedded in the common representation. To enable decomposition between the task-specific and common representation, we propose an alternating training strategy in which task-specific learning for the heads and tails is run on the clients by fixing the Transformer, which alternates with task-agnostic learning for the Transformer on the server by freezing the heads and tails. Experimental results on multi-task learning for various image processing tasks show that our method synergistically improves the performance of the task-specific network of each client while maintaining privacy.", "keywords": "Federated learning;Split learning;Transformer;Image processing", "primary_area": "", "supplementary_material": "", "author": "Boah Kim;Jeongsol Kim;Jong Chul Ye", "authorids": "~Boah_Kim1;~Jeongsol_Kim1;~Jong_Chul_Ye1", "gender": "F;M;M", "homepage": ";https://bispl.weebly.com/;https://bispl.weebly.com/", "dblp": "239/4299;282/3103;15/5613", "google_scholar": "1IkNuooAAAAJ;ZaVNwcQAAAAJ;HNMjoNEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Boah_Kim1;~Jeongsol_Kim1;~Jong_Chul_Ye1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;MS student;Full Professor", "bibtex": "@misc{\nkim2022privacypreserving,\ntitle={Privacy-preserving Task-Agnostic Vision Transformer for Image Processing},\nauthor={Boah Kim and Jeongsol Kim and Jong Chul Ye},\nyear={2022},\nurl={https://openreview.net/forum?id=s2UpjzX82FS}\n}", "github": "", "project": "", "reviewers": "fi2d;QjZj;kCVH;iupq", "site": "https://openreview.net/forum?id=s2UpjzX82FS", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;3;3;3", "correctness": "4;3;4;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "148;90;117;73", "wc_summary_review": "174;59;56;66", "wc_main_review": "287;239;433;81", "wc_review": "609;388;606;220", "wc_reply_reviewers": "272;0;77;0", "wc_reply_authors": "2046;1680;2579;403", "reply_reviewers": "3;0;1;0", "reply_authors": "7;4;7;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 107.0, 28.398943642325854 ], "wc_summary_review_avg": [ 88.75, 49.35268483071615 ], "wc_main_review_avg": [ 260.0, 125.6383699353028 ], "wc_review_avg": [ 455.75, 162.9637613090714 ], "wc_reply_reviewers_avg": [ 87.25, 111.20111285414369 ], "wc_reply_authors_avg": [ 1677.0, 802.005299234363 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], 
"reply_authors_avg": [ 4.75, 2.48746859276655 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": -0.6882472016116854, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h0TvemCPURkJ:scholar.google.com/&scioq=Privacy-preserving+Task-Agnostic+Vision+Transformer+for+Image+Processing&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "s3K0arSRl4d", "title": "TransDreamer: Reinforcement Learning with Transformer World Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The Dreamer agent provides various benefits of Model-Based Reinforcement Learning (MBRL) such as sample efficiency, reusable knowledge, and safe planning. However, its world model and policy networks inherit the limitations of recurrent neural networks and thus an important question is how an MBRL framework can benefit from the recent advances of transformers and what the challenges are in doing so. In this paper, we propose a transformer-based MBRL agent, called TransDreamer. We first introduce the Transformer State-Space Model, a world model that leverages a transformer for dynamics predictions. We then share this world model with a transformer-based policy network and obtain stability in training a transformer-based RL agent. In experiments, we apply the proposed model to 2D visual RL and 3D first-person visual RL tasks both requiring long-range memory access for memory-based reasoning. 
We show that the proposed model outperforms Dreamer in these complex tasks.", "keywords": "Model-Based Reinforcement Learning;Transformer World Models", "primary_area": "", "supplementary_material": "", "author": "Chang Chen;Jaesik Yoon;Yi-Fu Wu;Sungjin Ahn", "authorids": "~Chang_Chen1;~Jaesik_Yoon1;~Yi-Fu_Wu1;~Sungjin_Ahn1", "gender": ";M;M;", "homepage": "https://www.linkedin.com/in/chen-chang-bba27643/;https://jaesikyoon.com;https://www.yifuwu.com/;", "dblp": ";158/9715;256/1572;", "google_scholar": ";qboyyIAAAAAJ;Fv2A650AAAAJ;", "orcid": ";;;", "linkedin": ";jaesik-yoon-809726123/;;", "or_profile": "~Chang_Chen1;~Jaesik_Yoon1;~Yi-Fu_Wu1;~Sungjin_Ahn1", "aff": "Rutgers University;Rutgers University;Rutgers University;", "aff_domain": "rutgers.edu;rutgers.edu;rutgers.edu;", "position": "Phd student;PhD student;PhD student;", "bibtex": "@misc{\nchen2022transdreamer,\ntitle={TransDreamer: Reinforcement Learning with Transformer World Models},\nauthor={Chang Chen and Jaesik Yoon and Yi-Fu Wu and Sungjin Ahn},\nyear={2022},\nurl={https://openreview.net/forum?id=s3K0arSRl4d}\n}", "github": "", "project": "", "reviewers": "AUKA;QogR;K6Up;U3g8;exkP", "site": "https://openreview.net/forum?id=s3K0arSRl4d", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "4;4;4;5;4", "correctness": "3;2;3;3;3", "technical_novelty": "2;3;2;4;3", "empirical_novelty": "2;2;3;2;3", "wc_summary_paper": "91;65;33;87;67", "wc_summary_review": "102;37;68;70;41", "wc_main_review": "387;472;218;691;322", "wc_review": "580;574;319;848;430", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 68.6, 20.606794995826014 ], "wc_summary_review_avg": [ 63.6, 23.465719677862005 ], "wc_main_review_avg": [ 418.0, 159.76357532303788 ], "wc_review_avg": [ 550.2, 177.7868386579839 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5160468465421401, "corr_recommendation_correctness": 0.5897678246195884, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10953568104090122014&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "s3V9I71JvkD", "title": "Offline Meta-Reinforcement Learning with Online Self-Supervision", "track": "main", "status": "Reject", "tldr": "", "abstract": "Meta-reinforcement learning (RL) methods can meta-train policies that adapt to new tasks with orders of magnitude less data than standard RL, but meta-training itself is costly and time-consuming. If we can meta-train on offline data, then we can reuse the same static dataset, labeled once with rewards for different tasks, to meta-train policies that adapt to a variety of new tasks at meta-test time. 
Although this capability would make meta-RL a practical tool for real-world use, offline meta-RL presents additional challenges beyond online meta-RL or standard offline RL settings. Meta-RL learns an exploration strategy that collects data for adaptation, and also meta-trains a policy that quickly adapts to data from a new task. Since this policy was meta-trained on a fixed, offline dataset, it might behave unpredictably when adapting to data collected by the learned exploration strategy, which differs systematically from the offline data and thus induces distributional shift. We propose a hybrid offline meta-RL algorithm, which uses offline data with rewards to meta-train an adaptive policy, and then collects additional unsupervised online data, without any reward labels, to bridge this distribution shift. Since online collection does not require reward labels, this data can be much cheaper to collect. We compare our method to prior work on offline meta-RL on simulated robot locomotion and manipulation tasks and find that using additional unsupervised online data collection leads to a dramatic improvement in the adaptive capabilities of the meta-trained policies, matching the performance of fully online meta-RL on a range of challenging domains that require generalization to new tasks.", "keywords": "reinforcement learning;meta learning;meta reinforcement learning;offline reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/6538edbe744126d9238cc554579cdf8eb1f3cad3.zip", "author": "Vitchyr H. Pong;Ashvin Nair;Laura Smith;Catherine Huang;Sergey Levine", "authorids": "~Vitchyr_H._Pong1;~Ashvin_Nair1;~Laura_Smith1;~Catherine_Huang2;~Sergey_Levine1", "gender": "M;F;F;M;M", "homepage": "http://ashvin.me/;;;https://people.eecs.berkeley.edu/~svlevine/;https://people.eecs.berkeley.edu/~vitchyr/", "dblp": "182/2436;54/11024;;80/7594;181/4235", "google_scholar": "BsOkXDsAAAAJ;;;8R35rCwAAAAJ;", "orcid": ";;;;", "linkedin": ";;huang-catherine/;;vitchyr-pong", "or_profile": "~Ashvin_Nair1;~Laura_Smith1;~Catherine_Huang2;~Sergey_Levine1;~Vitchyr_Pong1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Google;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;google.com;berkeley.edu", "position": "PhD student;PhD student;Undergrad student;Research Scientist;PhD student", "bibtex": "@misc{\npong2022offline,\ntitle={Offline Meta-Reinforcement Learning with Online Self-Supervision},\nauthor={Vitchyr H. 
Pong and Ashvin Nair and Laura Smith and Catherine Huang and Sergey Levine},\nyear={2022},\nurl={https://openreview.net/forum?id=s3V9I71JvkD}\n}", "github": "", "project": "", "reviewers": "RpMh;tmzu;w6Yf;fffq", "site": "https://openreview.net/forum?id=s3V9I71JvkD", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;4;3", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;4;2", "wc_summary_paper": "74;172;205;43", "wc_summary_review": "56;26;59;28", "wc_main_review": "331;343;400;211", "wc_review": "461;541;664;282", "wc_reply_reviewers": "0;0;23;30", "wc_reply_authors": "1887;947;677;861", "reply_reviewers": "0;0;1;1", "reply_authors": "5;3;1;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 123.5, 66.94213919497942 ], "wc_summary_review_avg": [ 42.25, 15.303185942802891 ], "wc_main_review_avg": [ 321.25, 68.78362813926 ], "wc_review_avg": [ 487.0, 138.69571009948362 ], "wc_reply_reviewers_avg": [ 13.25, 13.47915056670857 ], "wc_reply_authors_avg": [ 1093.0, 468.67686096072634 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 1.479019945774904 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15951748320809875665&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "s51gCxF70pq", "title": "Learning Temporally-Consistent Representations for Data-Efficient Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep reinforcement learning (RL) agents that exist in high-dimensional state spaces, such as those composed of images, have interconnected learning burdens. Agents must learn an action-selection policy that completes their given task, which requires them to learn a representation of the state space that discerns between useful and useless information. The reward function is the only supervised feedback that RL agents receive, which causes a representation learning bottleneck that can manifest in poor sample efficiency. We present $k$-Step Latent (KSL), a new representation learning method that enforces temporal consistency of representations via a self-supervised auxiliary task wherein agents learn to recurrently predict action-conditioned representations of the state space. The state encoder learned by KSL produces low-dimensional representations that make optimization of the RL task more sample efficient. Altogether, KSL produces state-of-the-art results in both data efficiency and asymptotic performance in the popular PlaNet benchmark suite. 
Our analyses show that KSL produces encoders that generalize better to new tasks unseen during training, and its representations are more strongly tied to reward, are more invariant to perturbations in the state space, and move more smoothly through the temporal axis of the RL problem than other methods such as DrQ, RAD, CURL, and SAC-AE.", "keywords": "Reinforcement learning;representation learning;continuous control", "primary_area": "", "supplementary_material": "", "author": "Trevor McInroe;Lukas Sch\u00e4fer;Stefano V Albrecht", "authorids": "~Trevor_McInroe1;~Lukas_Sch\u00e4fer1;~Stefano_V_Albrecht1", "gender": "M;M;", "homepage": "https://trevormcinroe.github.io/;https://lukaschaefer.com/;https://agents-lab.org/stefano-albrecht/", "dblp": "304/2817;;118/3975", "google_scholar": ";-yp0O_IAAAAJ;https://scholar.google.co.uk/citations?user=ceSFqCcAAAAJ", "orcid": ";;0000-0002-8735-1465", "linkedin": ";lukas-schaefer/;", "or_profile": "~Trevor_McInroe1;~Lukas_Sch\u00e4fer1;~Stefano_V_Albrecht1", "aff": "Northwestern University;University of Edinburgh;University of Edinburgh", "aff_domain": "northwestern.edu;ed.ac.uk;ed.ac.uk", "position": "MS student;PhD student;Associate Professor", "bibtex": "@misc{\nmcinroe2022learning,\ntitle={Learning Temporally-Consistent Representations for Data-Efficient Reinforcement Learning},\nauthor={Trevor McInroe and Lukas Sch{\\\"a}fer and Stefano V Albrecht},\nyear={2022},\nurl={https://openreview.net/forum?id=s51gCxF70pq}\n}", "github": "", "project": "", "reviewers": "38iT;J6YX;Qru8;TKuY;pTps", "site": "https://openreview.net/forum?id=s51gCxF70pq", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "4;4;4;4;3", "correctness": "3;2;3;3;3", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "113;30;104;28;88", "wc_summary_review": "39;47;68;44;37", "wc_main_review": "404;502;555;218;180", "wc_review": "556;579;727;290;305", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "162;309;184;128;194", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 72.6, 36.494383129462534 ], "wc_summary_review_avg": [ 47.0, 11.081516141756055 ], "wc_main_review_avg": [ 371.8, 149.6614846912859 ], "wc_review_avg": [ 491.4, 168.9148898114077 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 195.4, 61.147690062667124 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6666666666666666, "corr_recommendation_correctness": 0.5833333333333335, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13183093127220925809&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Northwestern University;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.northwestern.edu;https://www.ed.ac.uk", "aff_unique_abbr": "NU;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "s5lIqsrOu3Z", "title": "Closed-Loop Data Transcription to an LDR via Minimaxing Rate Reduction", "track": "main", "status": "Reject", "tldr": "", "abstract": "This 
work proposes a new computational framework for automatically learning a closed-loop transcription between multi-class multi-dimensional data and a linear discriminative representation (LDR) that consists of multiple multi-dimensional linear subspaces. In particular, we argue that the optimal encoding and decoding mappings sought can be formulated as the equilibrium point of a two-player minimax game between the encoder and decoder. A natural utility function for this game is the so-called rate reduction, a simple information-theoretic measure for distances between mixtures of subspace-like Gaussians in the feature space. Our formulation avoids the expensive evaluation and minimization of approximated distances between arbitrary distributions in either the data space or the feature space. To a large extent, conceptually and computationally this new formulation unifies the benefits of Auto-Encoding and GAN and naturally extends them to the setting of learning a representation that is both discriminative and generative for complex multi-class and multi-dimensional real-world data. Our extensive experiments on many benchmark datasets demonstrate the tremendous potential of this framework: under fair comparison, the visual quality of the learned decoder and the classification performance of the encoder are competitive with, and often better than, existing methods based on GAN, VAE or a combination of both. ", "keywords": "Linear discriminative representation;Generative model", "primary_area": "", "supplementary_material": "", "author": "Xili Dai;Shengbang Tong;Mingyang Li;Ziyang Wu;Kwan Ho Ryan Chan;Pengyuan Zhai;Yaodong Yu;Michael Psenka;Xiaojun Yuan;Heung-Yeung Shum;Yi Ma", "authorids": "~Xili_Dai2;~Shengbang_Tong1;~Mingyang_Li3;~Ziyang_Wu1;~Kwan_Ho_Ryan_Chan1;~Pengyuan_Zhai1;~Yaodong_Yu4;psenka@berkeley.edu;xjyuan@uestc.edu.cn;msraharry@hotmail.com;~Yi_Ma4", "gender": "M;M;M;;M;;M;;;;M", "homepage": "https://delay-xili.github.io/;https://tsb0601.github.io/petertongsb/;;https://robinwu218.github.io/;https://ryanchankh.github.io/;;https://yaodongyu.github.io;;;;http://people.eecs.berkeley.edu/~yima/", "dblp": "170/8561;306/1406;;236/5238;267/5496;;;;;;", "google_scholar": "CtRMD1UAAAAJ;https://scholar.google.com/citations?hl=en;;9RAHYd0AAAAJ;DBXWBqcAAAAJ;;bZ9oyW8AAAAJ;;;;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ", "orcid": ";;;;;;;;;;", "linkedin": "xili-daley-dai-b87030179/;;http://www.linkedin.com/in/limy;;ryanchankh/;;;;;;", "or_profile": "~Xili_Dai2;~Shengbang_Tong1;~Mingyang_Li3;~Ziyang_Wu1;~Kwan_Ho_Ryan_Chan1;~Pengyuan_Zhai1;~Yaodong_Yu4;psenka@berkeley.edu;xjyuan@uestc.edu.cn;msraharry@hotmail.com;~Yi_Ma4", "aff": "University of Electronic Science and Technology of China,;University of California, Berkeley;Tsinghua University;University of California, Berkeley;University of Pennsylvania ;;Electrical Engineering & Computer Science Department, University of California Berkeley;;;;University of California, Berkeley", "aff_domain": "uestc.edu.cn;berkeley.edu;tsinghua.edu.cn;berkeley.edu;seas.upenn.edu;;eecs.berkeley.edu;;;;berkeley.edu", "position": "PhD student;Undergrad student;PhD student;PhD student;PhD student;;PhD student;;;;Full Professor", "bibtex": "@misc{\ndai2022closedloop,\ntitle={Closed-Loop Data Transcription to an {LDR} via Minimaxing Rate Reduction},\nauthor={Xili Dai and Shengbang Tong and Mingyang Li and Ziyang Wu and Kwan Ho Ryan Chan and Pengyuan Zhai and Yaodong Yu and Michael Psenka and Xiaojun Yuan and Heung-Yeung Shum and Yi 
Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=s5lIqsrOu3Z}\n}", "github": "", "project": "", "reviewers": "hk2J;beUC;ykKP", "site": "https://openreview.net/forum?id=s5lIqsrOu3Z", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;0", "wc_summary_paper": "77;23;29", "wc_summary_review": "42;16;12", "wc_main_review": "600;162;142", "wc_review": "719;201;183", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "676;307;413", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 43.0, 24.166091947189145 ], "wc_summary_review_avg": [ 23.333333333333332, 13.299958228840003 ], "wc_main_review_avg": [ 301.3333333333333, 211.34700271249545 ], "wc_review_avg": [ 367.6666666666667, 248.5388411406868 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 465.3333333333333, 155.1221740722096 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 11, 0 ], "corr_recommendation_confidence": 0.18898223650461363, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6037111538467233177&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1;3;1;1", "aff_unique_norm": "University of Electronic Science and Technology of China;University of California, Berkeley;Tsinghua University;University of Pennsylvania", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.berkeley.edu;https://www.tsinghua.edu.cn;https://www.upenn.edu", "aff_unique_abbr": "UESTC;UC Berkeley;THU;UPenn", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "s5yOwPJicj", "title": "Carousel Memory: Rethinking the Design of Episodic Memory for Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Continual Learning (CL) is an emerging machine learning paradigm that aims to learn from a continuous stream of tasks without forgetting knowledge learned from the previous tasks. To avoid the performance decrease caused by forgetting, prior studies exploit episodic memory (EM), which stores a subset of the past observed samples while learning from new non-i.i.d. data. Despite the promising results, since CL is often assumed to execute on mobile or IoT devices, the EM size is bounded by the small hardware memory capacity, which makes it infeasible to meet the accuracy requirements for real-world applications. Specifically, all prior CL methods discard samples overflowed from the EM and can never retrieve them for subsequent training steps, incurring loss of information that would exacerbate catastrophic forgetting. We explore a novel hierarchical EM management strategy to address the forgetting issue. In particular, in mobile and IoT devices, real-time data can be stored not just in high-speed RAMs but in internal storage devices as well, which offer significantly larger capacity than the RAMs. 
Based on this insight, we propose to exploit the abundant storage to preserve past experiences and alleviate the forgetting by allowing CL to efficiently migrate samples between memory and storage without being interfered by the slow access speed of the storage. We call it Carousel Memory (CarM). As CarM is complementary to existing CL methods, we conduct extensive evaluations of our method with seven popular CL methods and show that CarM significantly improves the accuracy of the methods across different settings by large margins in final average accuracy (up to 28.4%) while retaining the same training efficiency.", "keywords": "Continual Learning;Episodic Memory;Memory Replay;System for AI", "primary_area": "", "supplementary_material": "", "author": "Soobee Lee;Minindu Weerakoon;Jonghyun Choi;Minjia Zhang;Di Wang;Myeongjae Jeon", "authorids": "~Soobee_Lee1;~Minindu_Weerakoon1;~Jonghyun_Choi1;~Minjia_Zhang1;~Di_Wang8;~Myeongjae_Jeon2", "gender": "F;;M;M;M;M", "homepage": "https://supersoob.github.io/;https://omnia.unist.ac.kr/;https://ppolon.github.io/;https://minjiazhang.github.io/;https://diwangbruce.github.io/;https://sites.google.com/site/myeongjae/", "dblp": ";;21/11103;58/9033;;09/2790.html", "google_scholar": ";;uiGWnm4AAAAJ;https://scholar.google.com/citations?hl=en;O-d6-LIAAAAJ;q942rJgAAAAJ", "orcid": ";;0000-0002-7934-8434;0000-0002-8165-166X;;", "linkedin": ";;jonghyun-choi-459bb615/;minjia-zhang-05857226/;diwang/;myeongjae-jeon-1056271a/", "or_profile": "~Soobee_Lee1;~Minindu_Weerakoon1;~Jonghyun_Choi1;~Minjia_Zhang1;~Di_Wang8;~Myeongjae_Jeon3", "aff": "Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;NAVER;Microsoft ;Microsoft;Microsoft", "aff_domain": "unist.ac.kr;unist.ac.kr;navercorp.com;microsoft.com;microsoft.com;microsoft.com", "position": "MS student;MS student;AI Advisor Committee;Principle Researcher;Principal Research Lead;Visiting Professor", "bibtex": "@misc{\nlee2022carousel,\ntitle={Carousel Memory: Rethinking the Design of Episodic Memory for Continual Learning},\nauthor={Soobee Lee and Minindu Weerakoon and Jonghyun Choi and Minjia Zhang and Di Wang and Myeongjae Jeon},\nyear={2022},\nurl={https://openreview.net/forum?id=s5yOwPJicj}\n}", "github": "", "project": "", "reviewers": "k6W1;4nQc;rTm3;yYjj", "site": "https://openreview.net/forum?id=s5yOwPJicj", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;2;4", "wc_summary_paper": "68;54;127;127", "wc_summary_review": "77;22;93;21", "wc_main_review": "110;256;426;389", "wc_review": "255;332;646;537", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 94.0, 33.36914742692717 ], "wc_summary_review_avg": [ 53.25, 32.251937926270415 ], "wc_main_review_avg": [ 295.25, 124.24044228833057 ], "wc_review_avg": [ 442.5, 156.2921943028506 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 8, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=15082592887056802852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;2;2;2", "aff_unique_norm": "Ulsan National Institute of Science and Technology;NAVER Corporation;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.unist.ac.kr;https://www.naver.com;https://www.microsoft.com", "aff_unique_abbr": "UNIST;NAVER;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;1", "aff_country_unique": "South Korea;United States" }, { "id": "s6cyuoLbZLU", "title": "Comparing representations of biological data learned with different AI paradigms, augmenting and cropping strategies", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent advances in AI and robotics enabled automated large-scale biological image analysis. Various machine learning approaches have been successfully applied to phenotypic profiling. However, it remains unclear how they compare in terms of biological feature extraction. In this study, we propose a simple CNN architecture and implement weakly-supervised, self-supervised and unsupervised learning of image representations. We train 16 deep learning setups on the 770k image dataset under identical conditions, using different augmenting and cropping strategies. We compare the learned representations by evaluating multiple metrics for each of three downstream tasks: i) distance-based analysis of similarity of known drugs, ii) classification of drugs versus controls, iii) clustering within cell lines. We also compare training times and memory usage. We show that, among tested setups, multi-crops and random augmentations generally improve performance across tasks; that self-supervised models have competitive performance and are the fastest to train; that no single combination of augmenting and cropping strategies consistently delivered top performance for all tasks.", "keywords": "representation learning;self-supervised learning;multi-crops;augmentations;application", "primary_area": "", "supplementary_material": "/attachment/51ffde9d769f19a55bda7c187d6cf03b230997cc.zip", "author": "Andrei Dmitrenko;Mauro M. Masiero;Nicola Zamboni", "authorids": "~Andrei_Dmitrenko1;masiero@imsb.biol.ethz.ch;zamboni@imsb.biol.ethz.ch", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "andrei-dmitrenko/;;", "or_profile": "~Andrei_Dmitrenko1;masiero@imsb.biol.ethz.ch;zamboni@imsb.biol.ethz.ch", "aff": "Swiss Federal Institute of Technology;;", "aff_domain": "ethz.ch;;", "position": "PhD student;;", "bibtex": "@misc{\ndmitrenko2022comparing,\ntitle={Comparing representations of biological data learned with different {AI} paradigms, augmenting and cropping strategies},\nauthor={Andrei Dmitrenko and Mauro M. 
Masiero and Nicola Zamboni},\nyear={2022},\nurl={https://openreview.net/forum?id=s6cyuoLbZLU}\n}", "github": "", "project": "", "reviewers": "zg72;rrQ2;A2j6;uNjM", "site": "https://openreview.net/forum?id=s6cyuoLbZLU", "pdf_size": 0, "recommendation": "1;3;3;8", "confidence": "5;4;5;5", "correctness": "1;3;3;4", "technical_novelty": "1;1;2;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "56;206;80;103", "wc_summary_review": "65;206;62;36", "wc_main_review": "824;498;573;338", "wc_review": "945;910;715;477", "wc_reply_reviewers": "0;354;205;0", "wc_reply_authors": "689;334;978;316", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 3.75, 2.5860201081971503 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 111.25, 57.17243654069678 ], "wc_summary_review_avg": [ 92.25, 66.63473193463001 ], "wc_main_review_avg": [ 558.25, 175.34305660618557 ], "wc_review_avg": [ 761.75, 186.29730942769947 ], "wc_reply_reviewers_avg": [ 139.75, 149.34921325537675 ], "wc_reply_authors_avg": [ 579.25, 274.08700717108064 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.16744367165578428, "corr_recommendation_correctness": 0.8649597882660589, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17842526835859239599&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0", "aff_unique_norm": "Swiss Federal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich", "aff_country_unique_index": "0", "aff_country_unique": "Switzerland" }, { "id": "s6roE3ZocH1", "title": "Genetic Algorithm for Constrained Molecular Inverse Design", "track": "main", "status": "Reject", "tldr": "", "abstract": "A genetic algorithm is suitable for exploring large search spaces as it finds an approximate solution. Because of this advantage, the genetic algorithm is effective in exploring vast and unknown spaces such as the molecular search space. Though the algorithm is suitable for searching a vast chemical space, it is difficult to optimize pharmacological properties while maintaining the molecular substructure. To solve this issue, we introduce a genetic algorithm featuring a constrained molecular inverse design. The proposed algorithm successfully produces valid molecules for crossover and mutation. Furthermore, it optimizes specific properties while adhering to structural constraints using a two-phase optimization.
Experiments prove that our algorithm effectively finds molecules that satisfy specific properties while maintaining structural constraints.", "keywords": "Genetic Algorithm;Constrained Optimization;Molecular Inverse Design;Molecular Generation", "primary_area": "", "supplementary_material": "", "author": "Yurim Lee;Kyudam Choi;Cheongwon Kim", "authorids": "~Yurim_Lee1;~Kyudam_Choi1;~Cheongwon_Kim1", "gender": "F;F;M", "homepage": ";;http://home.sejong.ac.kr/~wikim/", "dblp": "284/0805-1.html;287/4293.html;", "google_scholar": "pPYXTLoAAAAJ;YR-xog0AAAAJ;", "orcid": "0000-0002-5012-7750;;", "linkedin": ";;", "or_profile": "~Yurim_Lee1;~Kyudam_Choi1;~Cheongwon_Kim1", "aff": "Sejong University;;Sejong University", "aff_domain": "sejong.ac.kr;;sejong.ac.kr", "position": "MS student;;Full Professor", "bibtex": "@misc{\nlee2022genetic,\ntitle={Genetic Algorithm for Constrained Molecular Inverse Design},\nauthor={Yurim Lee and Kyudam Choi and Cheongwon Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=s6roE3ZocH1}\n}", "github": "", "project": "", "reviewers": "YxXB;WQyJ;QMQj;g92F", "site": "https://openreview.net/forum?id=s6roE3ZocH1", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;5;4", "correctness": "3;2;2;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;0;1", "wc_summary_paper": "30;36;93;36", "wc_summary_review": "41;9;47;39", "wc_main_review": "802;67;169;178", "wc_review": "873;112;309;253", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 48.75, 25.66490794840301 ], "wc_summary_review_avg": [ 34.0, 14.730919862656235 ], "wc_main_review_avg": [ 304.0, 290.8066367880898 ], "wc_review_avg": [ 386.75, 289.7674714318362 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=819530199062624816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Sejong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sejong.ac.kr", "aff_unique_abbr": "Sejong", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Towards General Function Approximation in Zero-Sum Markov Games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6810", "id": "sA4qIu3zv6v", "poster": "", "openreview": "https://openreview.net/forum?id=sA4qIu3zv6v", "slides": "https://iclr.cc/virtual/2022/poster/6810", "video": "https://iclr.cc/virtual/2022/poster/6810", "author_site": "Baihe Huang, Jason Lee, Zhaoran Wang, Zhuoran Yang", "tldr": "", "abstract": "This paper considers two-player zero-sum finite-horizon Markov games with simultaneous moves. The study focuses on the challenging settings where the value function or the model is parameterized by general function classes. Provably efficient algorithms for both decoupled and coordinated settings are developed.
In the decoupled setting where the agent controls a single player and plays against an arbitrary opponent, we propose a new model-free algorithm. The sample complexity is governed by the Minimax Eluder dimension\u2014a new dimension of the function class in Markov games. As a special case, this method improves the state-of-the-art algorithm by a $\\sqrt{d}$ factor in the regret when the reward function and transition kernel are parameterized with d-dimensional linear features. In the coordinated setting where both players are controlled by the agent, we propose a model-based algorithm and a model-free algorithm. In the model-based algorithm, we prove that sample complexity can be bounded by a generalization of Witness rank to Markov games. The model-free algorithm enjoys a $\\sqrt{K}$-regret upper bound where $K$ is the number of episodes. Our algorithms are based on new techniques of alternate optimism.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/fbf79664452d7143663daadfb9e759e4a054117d.zip", "author": "Baihe Huang;Jason D. Lee;Zhaoran Wang;Zhuoran Yang", "authorids": "~Baihe_Huang1;~Jason_D._Lee1;~Zhaoran_Wang1;~Zhuoran_Yang1", "gender": ";M;Not Specified;M", "homepage": ";https://jasondlee88.github.io/;https://zhaoranwang.github.io/;https://zhuoranyang.github.io/", "dblp": "279/4131;88/3262;117/2756;", "google_scholar": "chICXXMAAAAJ;GR_DsT0AAAAJ;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Baihe_Huang1;~Jason_D._Lee1;~Zhaoran_Wang1;~Zhuoran_Yang1", "aff": "Peking University;Princeton University;;University of California, Berkeley", "aff_domain": "pku.edu.cn;princeton.edu;;berkeley.edu", "position": "Undergrad student;Assistant Professor;;Postdoc", "bibtex": "@inproceedings{\nhuang2022towards,\ntitle={Towards General Function Approximation in Zero-Sum Markov Games},\nauthor={Baihe Huang and Jason D.
Lee and Zhaoran Wang and Zhuoran Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sA4qIu3zv6v}\n}", "github": "", "project": "", "reviewers": "s7Lf;VsQZ;mNjL;cKpw", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "3;2;3;3", "correctness": "3;4;4;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "43;76;36;49", "wc_summary_review": "10;257;29;76", "wc_main_review": "144;559;177;236", "wc_review": "197;892;242;361", "wc_reply_reviewers": "4;34;0;0", "wc_reply_authors": "506;804;630;512", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 51.0, 15.149257407543116 ], "wc_summary_review_avg": [ 93.0, 97.68572055321084 ], "wc_main_review_avg": [ 279.0, 164.98333249149746 ], "wc_review_avg": [ 423.0, 277.32742381524406 ], "wc_reply_reviewers_avg": [ 9.5, 14.239030865898142 ], "wc_reply_authors_avg": [ 613.0, 120.85114811204733 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1185748425492137140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=sA4qIu3zv6v", "email": "pku.edu.cn;princeton.edu;;berkeley.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Peking University;Princeton University;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.princeton.edu;https://www.berkeley.edu", "aff_unique_abbr": "Peking U;Princeton;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "id": "sBHGzpXndG", "title": "Collaborative Three-Stream Transformers for Video Captioning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As the most critical components in a sentence, subject, predicate, and object require special attention in the video captioning task. In this paper, we design the collaborative three-stream transformers to model the interactions of objects, and the actions/relations of objects between different modalities. Specifically, it is formed by three branches of transformers used to exploit the visual-linguistic interactions of different granularities in spatio-temporal domain between videos and text, detected objects and text, and actions and text. Meanwhile, we design a cross-modality attention module to align the interactions modeled by the three branches of transformers. That is, an affinity matrix is computed to help align visual modalities by injecting the information from other interactions. In this way, the three branches of transformers can support each other to exploit the most discriminative semantic information in different modalities for accurate predictions of captions, especially for the subject, predicate, and object parts in a sentence. The whole model is trained in an end-to-end fashion. 
Extensive experiments conducted on two large-scale challenging datasets, i.e., YouCookII and ActivityNet Captions, demonstrate that the proposed method performs favorably against the state-of-the-art methods.", "keywords": "Video captioning;transformer;cross-modality", "primary_area": "", "supplementary_material": "", "author": "Hao Wang;Longyin Wen;Libo Zhang;Tiejian Luo", "authorids": "~Hao_Wang27;~Longyin_Wen1;~Libo_Zhang1;~Tiejian_Luo2", "gender": "M;M;M;M", "homepage": "https://github.com/wanghao14;;;", "dblp": ";119/1468;78/33-1.html;", "google_scholar": ";5HDWtHsAAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": "0000-0002-9274-2698;0000-0001-5525-492X;;", "linkedin": ";longyin-wen-16934689/;;tiejian-luo-18632a89/", "or_profile": "~Hao_Wang27;~Longyin_Wen1;~Libo_Zhang1;~Tiejian_Luo2", "aff": "University of Chinese Academy of Sciences;Bytedance Inc.;Institute of Software Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_domain": "ucas.ac.cn;bytedance.com;iscas.ac.cn;ucas.ac.cn", "position": "PhD student;Research Manager;Associate Professor;Full Professor", "bibtex": "@misc{\nwang2022collaborative,\ntitle={Collaborative Three-Stream Transformers for Video Captioning},\nauthor={Hao Wang and Longyin Wen and Libo Zhang and Tiejian Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=sBHGzpXndG}\n}", "github": "", "project": "", "reviewers": "juF7;sUP7;JojL;13Fm", "site": "https://openreview.net/forum?id=sBHGzpXndG", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;2;0;2", "wc_summary_paper": "58;42;68;190", "wc_summary_review": "8;101;49;47", "wc_main_review": "324;435;278;810", "wc_review": "390;578;395;1047", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 89.5, 58.76010551386034 ], "wc_summary_review_avg": [ 51.25, 33.04826016600571 ], "wc_main_review_avg": [ 461.75, 209.00523318807114 ], "wc_review_avg": [ 602.5, 267.578493156681 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17090821395824778730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Chinese Academy of Sciences;Bytedance Inc.;Chinese Academy of Sciences", "aff_unique_dep": ";;Institute of Software", "aff_unique_url": "http://www.ucas.ac.cn;https://www.bytedance.com;http://www.is.cas.cn", "aff_unique_abbr": "UCAS;Bytedance;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "sBHVNmCt3t", "title": "On The Vulnerability of Recurrent Neural Networks to Membership Inference Attacks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study the privacy implications of deploying recurrent neural networks in machine learning. 
We consider membership inference attacks (MIAs) in which an attacker aims to infer whether a given data record has been used in the training of a learning agent. Using existing MIAs that target feed-forward neural networks, we empirically demonstrate that the attack accuracy wanes for data records used earlier in the training history. Alternatively, recurrent networks are specifically designed to better remember their past experience; hence, they are likely to be more vulnerable to MIAs than their feed-forward counterparts. We develop a pair of MIA layouts for two primary applications of recurrent networks, namely, deep reinforcement learning and sequence-to-sequence tasks. We use the first attack to provide empirical evidence that recurrent networks are indeed more vulnerable to MIAs than feed-forward networks with the same performance level. We use the second attack to showcase the differences between the effects of overtraining recurrent and feed-forward networks on the accuracy of their respective MIAs. Finally, we deploy a differential privacy mechanism to resolve the privacy vulnerability that the MIAs exploit. For both attack layouts, the privacy mechanism degrades the attack accuracy from above 80\\% to 50\\%, which is equal to guessing the data membership uniformly at random, while trading off less than 10\\% utility.", "keywords": "Privacy;Recurrent Neural Networks;Membership Inference Attack;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/e54e1eddb932616508ae7024f31ec8a92f30b55d.zip", "author": "Yunhao Yang;Parham Gohari;ufuk topcu", "authorids": "~Yunhao_Yang1;~Parham_Gohari1;~ufuk_topcu1", "gender": "M;M;Unspecified", "homepage": "https://yunhaoyang234.github.io/;;https://autonomy.oden.utexas.edu/", "dblp": "264/9469.html;;12/6659.html", "google_scholar": "hEKrDSEAAAAJ;B81dQEYAAAAJ;jeNGFfQAAAAJ", "orcid": "0000-0002-7199-2508;0000-0002-8880-7436;0000-0003-0819-9985", "linkedin": ";parham-gohari-66a993ba?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3Bj5eH3vUMTWKylADMiii%2BHw%3D%3D;", "or_profile": "~Yunhao_Yang1;~Parham_Gohari1;~ufuk_topcu1", "aff": "University of Texas, Austin;University of Texas, Austin;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu", "position": "MS student;PhD student;Full Professor", "bibtex": "@misc{\nyang2022on,\ntitle={On The Vulnerability of Recurrent Neural Networks to Membership Inference Attacks},\nauthor={Yunhao Yang and Parham Gohari and ufuk topcu},\nyear={2022},\nurl={https://openreview.net/forum?id=sBHVNmCt3t}\n}", "github": "", "project": "", "reviewers": "yaNr;7sk1;gT9z;SJS3", "site": "https://openreview.net/forum?id=sBHVNmCt3t", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "3;4;4;3", "correctness": "3;4;2;2", "technical_novelty": "1;2;1;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "127;86;50;137", "wc_summary_review": "19;25;21;49", "wc_main_review": "84;265;376;686", "wc_review": "230;376;447;872", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.0, 34.61935874622752 ], "wc_summary_review_avg": [ 28.5, 12.031209415515965 ], "wc_main_review_avg": [ 352.75, 218.81656130192707 ], "wc_review_avg": [ 481.25, 
238.77957931950547 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.5222329678670935, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14618926349694432303&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "sBT5nxwt18Q", "title": "Advancing Nearest Neighbor Explanation-by-Example with Critical Classification Regions", "track": "main", "status": "Reject", "tldr": "", "abstract": "There is an increasing body of evidence suggesting that post-hoc explanation-by-example with nearest neighbors is a promising solution for the eXplainable Artificial Intelligence (XAI) problem. However, despite being thoroughly researched for decades, such post-hoc methods have never seriously explored how to enhance these explanations by highlighting specific important \"parts\" in a classification. Here, we propose the notion of Critical Classification Regions (CCRs) to do this, and several possible methods are experimentally compared to determine the best approach for this explanation strategy. CCRs supplement nearest neighbor examples by highlighting similar important \"parts\" in the image explanation. Experiments across multiple domains show that CCRs represent key features used by the CNN in both the testing and training data. Finally, a suitably-controlled user study (N=163) on ImageNet shows CCRs improve people\u2019s assessments of the correctness of a CNN\u2019s predictions for difficult classifications due to ambiguity.", "keywords": "Explainable AI;Post-hoc Nearest Neighbor Explanation-by-Example;User Study;Case-based Reasoning;Convolutional Neural Network", "primary_area": "", "supplementary_material": "/attachment/2c83db92cb143ade17e9f362025e052885dd8933.zip", "author": "Eoin M. Kenny;Eoin D. Delaney;Mark T. Keane", "authorids": "~Eoin_M._Kenny1;~Eoin_D._Delaney1;~Mark_T._Keane1", "gender": ";M;M", "homepage": "https://e-delaney.github.io/;https://people.ucd.ie/mark.keane;https://eoinkenny.github.io/", "dblp": "275/3311.html;k/MarkTKeane;", "google_scholar": "I11ceKoAAAAJ;bBozfc4AAAAJ;AzMTFY4AAAAJ", "orcid": "0000-0002-7282-8494;0000-0001-7630-9598;0000-0001-5800-2525", "linkedin": ";;", "or_profile": "~Eoin_D._Delaney1;~Mark_T._Keane1;~Eoin_Kenny1", "aff": "University College Dublin;University College Dublin;Massachusetts Institute of Technology", "aff_domain": "ucd.ie;ucd.ie;mit.edu", "position": "PhD student;Full Professor;Postdoc", "bibtex": "@misc{\nkenny2022advancing,\ntitle={Advancing Nearest Neighbor Explanation-by-Example with Critical Classification Regions},\nauthor={Eoin M. Kenny and Eoin D. Delaney and Mark T.
Keane},\nyear={2022},\nurl={https://openreview.net/forum?id=sBT5nxwt18Q}\n}", "github": "", "project": "", "reviewers": "emYe;jgiA;zNGM;sNwm", "site": "https://openreview.net/forum?id=sBT5nxwt18Q", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;1;3;4", "wc_summary_paper": "70;69;62;45", "wc_summary_review": "64;45;18;15", "wc_main_review": "510;614;451;324", "wc_review": "644;728;531;384", "wc_reply_reviewers": "158;0;0;0", "wc_reply_authors": "1505;777;763;801", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 61.5, 10.012492197250394 ], "wc_summary_review_avg": [ 35.5, 20.180436070610565 ], "wc_main_review_avg": [ 474.75, 104.78877563937847 ], "wc_review_avg": [ 571.75, 128.98134555043222 ], "wc_reply_reviewers_avg": [ 39.5, 68.41600689897065 ], "wc_reply_authors_avg": [ 961.5, 314.08398558347415 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.19245008972987526, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1610898844572875823&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University College Dublin;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucd.ie;https://web.mit.edu", "aff_unique_abbr": "UCD;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Ireland;United States" }, { "id": "sCrKKSWtFl5", "title": "Neural Photometric Stereo for Shape and Material Estimation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper addresses a challenging Photometric-Stereo problem where the object to be reconstructed has unknown, non-Lambertian, and possibly spatially-varying surface materials. This problem becomes even more challenging when the shape of the object is highly complex so that shadows cast on the surface are inevitable. To overcome these challenges, we propose a simple coordinate-based deep MLP (multilayer perceptron) neural network to parameterize both the unknown 3D shape and the unknown spatially-varying reflectance at every image pixel. This network is able to leverage the observed specularities and shadows on the surface, and recover both surface shape, normal and generic non-Lambertian reflectance via an inverse differentiable rendering process. We explicitly predict cast shadows, mitigating possible artifacts on these shadowing regions, leading to higher estimation accuracy. Our framework is entirely self-supervised, in the sense that it requires neither ground truth shape nor known svBRDF. Tests on real-world images demonstrate that our method achieves state-of-the-art accuracy in both shape recovery and material estimation. 
Thanks to the small size of the MLP-net, our method is also an order of magnitude faster than previous competing deep-learning based photometric stereo methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junxuan Li;Hongdong Li", "authorids": "~Junxuan_Li2;~Hongdong_Li1", "gender": "M;M", "homepage": "https://junxuan-li.github.io/;http://users.cecs.anu.edu.au/~hongdong/", "dblp": "207/7737;59/4859.html", "google_scholar": "b2_zvDMAAAAJ;https://scholar.google.com.tw/citations?hl=en", "orcid": "0000-0003-4375-3443;", "linkedin": "junxuan-li-335421a6/;", "or_profile": "~Junxuan_Li2;~Hongdong_Li1", "aff": "Australian National University;Australian National University", "aff_domain": "anu.edu.au;anu.edu.au", "position": "PhD student;Full Professor", "bibtex": "@misc{\nli2022neural,\ntitle={Neural Photometric Stereo for Shape and Material Estimation},\nauthor={Junxuan Li and Hongdong Li},\nyear={2022},\nurl={https://openreview.net/forum?id=sCrKKSWtFl5}\n}", "github": "", "project": "", "reviewers": "9EQK;qvcG;trQT", "site": "https://openreview.net/forum?id=sCrKKSWtFl5", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;5;5", "correctness": "2;1;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "97;108;111", "wc_summary_review": "57;67;27", "wc_main_review": "1221;244;165", "wc_review": "1375;419;303", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 2.3333333333333335, 1.247219128924647 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 105.33333333333333, 6.018490028422597 ], "wc_summary_review_avg": [ 50.333333333333336, 16.99673171197595 ], "wc_main_review_avg": [ 543.3333333333334, 480.2668239866483 ], "wc_review_avg": [ 699.0, 480.344320947658 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ThujLbmi4r0J:scholar.google.com/&scioq=Neural+Photometric+Stereo+for+Shape+and+Material+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Australian National University", "aff_unique_dep": "", "aff_unique_url": "https://www.anu.edu.au", "aff_unique_abbr": "ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "sEIl_stzQyB", "title": "Greedy-based Value Representation for Efficient Coordination in Multi-agent Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Due to the representation limitation of the joint Q value function, multi-agent reinforcement learning (MARL) methods with linear or monotonic value decomposition can not ensure the optimal consistency (i.e. the correspondence between the individual greedy actions and the maximal true Q value), leading to instability and poor coordination. Existing methods focus on addressing the representation limitation through learning the complete expressiveness, which is impractical and may deteriorate the performance in complex tasks. 
In this paper, we introduce the True-Global-Max (TGM) condition for linear and monotonic value decomposition to achieve the optimal consistency directly, where the TGM condition can be ensured under the unique stability of the optimal greedy action. Therefore, we propose the greedy-based value representation (GVR), which stabilises the optimal greedy action via inferior target shaping and destabilises non-optimal greedy actions via superior experience replay. We conduct experiments on various benchmarks, where GVR significantly outperforms state-of-the-art baselines. Experiment results demonstrate that our method can meet the optimal consistency under sufficient exploration and is more efficient than methods with complete expressiveness capability.", "keywords": "multi-agent cooperation;reinforcement learning algorithm", "primary_area": "", "supplementary_material": "/attachment/6ff8ac3c1ba6ee7ed1f3d0766341fef048a63210.zip", "author": "Lipeng Wan;Zeyang Liu;Xingyu Chen;Han Wang;Xuguang Lan", "authorids": "~Lipeng_Wan1;zeyang.liu@stu.xjtu.edu.cn;~Xingyu_Chen2;reload7@stu.xjtu.edu.cn;~Xuguang_Lan2", "gender": "M;;M;;", "homepage": "http://gr.xjtu.edu.cn/web/zeuslan/team;jsessionid=F923495DAAA043708B20337E681673E5;;;;", "dblp": "377/4923.html;;;;", "google_scholar": ";;https://scholar.google.com.hk/citations?user=LR76K-MAAAAJ;;", "orcid": ";;0000-0002-5226-963X;;", "linkedin": ";;;;", "or_profile": "~Lipeng_Wan1;zeyang.liu@stu.xjtu.edu.cn;~Xingyu_Chen2;reload7@stu.xjtu.edu.cn;~Xuguang_Lan2", "aff": "Xi'an Jiaotong University;;Xi'an Jiaotong University;;", "aff_domain": "xjtu.edu.cn;;xjtu.edu.cn;;", "position": "PhD student;;Assistant Professor;;", "bibtex": "@misc{\nwan2022greedybased,\ntitle={Greedy-based Value Representation for Efficient Coordination in Multi-agent Reinforcement Learning},\nauthor={Lipeng Wan and Zeyang Liu and Xingyu Chen and Han Wang and Xuguang Lan},\nyear={2022},\nurl={https://openreview.net/forum?id=sEIl_stzQyB}\n}", "github": "", "project": "", "reviewers": "bdoB;Nsqb;dbgG;LsGH", "site": "https://openreview.net/forum?id=sEIl_stzQyB", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;3;2", "correctness": "2;2;2;4", "technical_novelty": "3;2;4;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "233;55;95;112", "wc_summary_review": "130;56;139;45", "wc_main_review": "672;293;601;253", "wc_review": "1035;404;835;410", "wc_reply_reviewers": "0;224;176;0", "wc_reply_authors": "480;387;523;227", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 123.75, 66.38288559561116 ], "wc_summary_review_avg": [ 92.5, 42.29952718411874 ], "wc_main_review_avg": [ 454.75, 184.0195302135075 ], "wc_review_avg": [ 671.0, 273.3139220749649 ], "wc_reply_reviewers_avg": [ 100.0, 101.42977866484773 ], "wc_reply_authors_avg": [ 404.25, 113.52835548883812 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:NydSieyrRYAJ:scholar.google.com/&scioq=Greedy-based+Value+Representation+for+Efficient+Coordination+in+Multi-agent+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Xi'an Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "sHUFhv03qX_", "title": "Q-Learning Scheduler for Multi-Task Learning through the use of Histogram of Task Uncertainty", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Simultaneous training of a multi-task learning network on different domains or tasks is not always straightforward. It could lead to inferior performance or generalization compared to the corresponding single-task networks. To maximally taking advantage of the benefits of multi-task learning, an effective training scheduling method is deemed necessary. Traditional schedulers follow a heuristic or prefixed strategy, ignoring the relation of the tasks, their sample complexities, and the state of the emergent shared features. We proposed a deep Q-Learning Scheduler (QLS) that monitors the state of the tasks and the shared features using a novel histogram of task uncertainty, and through trial-and-error, learns an optimal policy for task scheduling. Extensive experiments on multi-domain and multi-task settings with various task difficulty profiles have been conducted, the proposed method is benchmarked against other schedulers, its superior performance has been demonstrated, and results are discussed.", "keywords": "Q-learning;Multi-Task Learning;MTL Scheduling;Histogram of Task Uncertainty", "primary_area": "", "supplementary_material": "", "author": "Kourosh Meshgi;Maryam Sadat Mirzaei;Satoshi Sekine", "authorids": "~Kourosh_Meshgi2;~Maryam_Sadat_Mirzaei1;~Satoshi_Sekine1", "gender": "F;M;M", "homepage": "http://www.ii.ist.i.kyoto-u.ac.jp/?page_id=6593&lang=en;https://aip.riken.jp/labs/goalorient_tech/lang_inf_access_tech/;http://kouroshmeshgi.com", "dblp": ";https://dblp.uni-trier.de/pers/hd/s/Sekine:Satoshi;https://dblp.uni-trier.de/pers/hd/m/Meshgi:Kourosh", "google_scholar": ";SWOdltsAAAAJ;WejG0Z8AAAAJ", "orcid": ";;0000-0001-7734-6104", "linkedin": ";;kouroshmeshgi", "or_profile": "~Maryam_Sadat_Mirzaei1;~Satoshi_Sekine1;~Kourosh_Meshgi1", "aff": ";RIKEN;RIKEN AIP", "aff_domain": ";riken.jp;riken.jp", "position": ";Team Leader;Researcher", "bibtex": "@misc{\nmeshgi2022qlearning,\ntitle={Q-Learning Scheduler for Multi-Task Learning through the use of Histogram of Task Uncertainty},\nauthor={Kourosh Meshgi and Maryam Sadat Mirzaei and Satoshi Sekine},\nyear={2022},\nurl={https://openreview.net/forum?id=sHUFhv03qX_}\n}", "github": "", "project": "", "reviewers": "e8PV;5zkP;MPYv;Bgje", "site": "https://openreview.net/forum?id=sHUFhv03qX_", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "3;4;3;4", "correctness": "3;1;4;4", "technical_novelty": "2;1;1;3", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "24;107;57;69", "wc_summary_review": "47;42;53;18", "wc_main_review": "80;147;138;240", "wc_review": "151;296;248;327", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 1.224744871391589 ], 
"technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 64.25, 29.67637949615822 ], "wc_summary_review_avg": [ 40.0, 13.285330255586423 ], "wc_main_review_avg": [ 151.25, 57.32963893135906 ], "wc_review_avg": [ 255.5, 66.57514551242078 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3665083330689157, "corr_recommendation_correctness": 0.6982565352753429, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9645350645545502269&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "RIKEN", "aff_unique_dep": "", "aff_unique_url": "https://www.riken.jp", "aff_unique_abbr": "RIKEN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "sJJXksSg7yi", "title": "Rotation-Equivariant Keypoint Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We show how to train a rotation-equivariant representation to extract local keypoints for image matching. Existing learning-based methods focused on extracting translation-equivariant keypoints using conventional convolutional neural networks (CNNs), but rotation-equivariant keypoint detectors have not been studied extensively. Therefore, we propose a rotation-invariant keypoint detection method using rotation-equivariant CNNs. Our rotation-equivariant representation enables us to estimate local orientations to image keypoints accurately. We propose a dense histogram alignment loss to assign an orientation to keypoints more consistently. We validate the effectiveness compared to existing keypoint detection methods. Furthermore, we check the transferability of our method on public image matching benchmarks. 
", "keywords": "Equivariant representation;Keypoint detector;Image matching", "primary_area": "", "supplementary_material": "", "author": "Jongmin Lee;Byungjin Kim;Minsu Cho", "authorids": "~Jongmin_Lee2;~Byungjin_Kim1;~Minsu_Cho1", "gender": "M;M;M", "homepage": "https://bluedream1121.github.io/;https://github.com/kbjpc123/;http://cvlab.postech.ac.kr/~mcho/", "dblp": "68/222-5;;", "google_scholar": "https://scholar.google.co.kr/citations?user=WVVqJX8AAAAJ;;5TyoF5QAAAAJ", "orcid": ";;", "linkedin": ";;minsu-cho-062b3750/", "or_profile": "~Jongmin_Lee2;~Byungjin_Kim1;~Minsu_Cho1", "aff": "POSTECH;POSTECH;POSTECH", "aff_domain": "postech.ac.kr;postech.ac.kr;postech.ac.kr", "position": "PhD student;MS student;Associate Professor", "bibtex": "@misc{\nlee2022rotationequivariant,\ntitle={Rotation-Equivariant Keypoint Detection},\nauthor={Jongmin Lee and Byungjin Kim and Minsu Cho},\nyear={2022},\nurl={https://openreview.net/forum?id=sJJXksSg7yi}\n}", "github": "", "project": "", "reviewers": "gduM;d8xK;kzBH;3XZ6", "site": "https://openreview.net/forum?id=sJJXksSg7yi", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;5;5;3", "correctness": "3;2;2;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "113;76;112;148", "wc_summary_review": "53;81;41;53", "wc_main_review": "891;524;547;237", "wc_review": "1057;681;700;438", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "54;87;82;73", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 112.25, 25.459526704163217 ], "wc_summary_review_avg": [ 57.0, 14.696938456699069 ], "wc_main_review_avg": [ 549.75, 231.80527927551606 ], "wc_review_avg": [ 719.0, 220.7996829707869 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 74.0, 12.589678312014172 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "sKiAuHhc3w", "title": "Fine-grained Software Vulnerability Detection via Information Theory and Contrastive Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Software vulnerabilities existing in a program or function of computer systems have been becoming a serious and crucial concern. In a program or function consisting of hundreds or thousands of source code statements, there are only few statements causing the corresponding vulnerabilities. Vulnerability labeling on a function or program level is usually done by experts with the assistance of machine learning tools; however, it will be much more costly and time-consuming to do that on a statement level. In this paper, to tackle this challenging problem, we propose a novel end-to-end deep learning-based approach to obtain the vulnerability-relevant code statements of a specific function. 
Inspired by previous approaches, we first leverage mutual information theory to learn a set of latent variables that can represent the relevance of the source code statements to the corresponding function's vulnerability. We then propose a novel clustered spatial contrastive learning method to further improve the representation learning and make the selection process of vulnerability-relevant code statements more robust. The experimental results on real-world datasets show the superiority of our proposed method over other state-of-the-art baselines.", "keywords": "Cybersecurity;Fine-grained vulnerability detection;Mutual information;Contrastive learning.", "primary_area": "", "supplementary_material": "", "author": "Van Nguyen;Trung Le;John C. Grundy;Dinh Phung", "authorids": "~Van_Nguyen2;~Trung_Le2;~John_C._Grundy1;~Dinh_Phung2", "gender": "M;M;M;M", "homepage": ";;https://sites.google.com/site/johncgrundy;https://research.monash.edu/en/persons/dinh-phung", "dblp": ";;g/JohnCGrundy.html;71/5859", "google_scholar": "KPpmKZ0AAAAJ;https://scholar.google.com/citations?hl=en;bbEQGY8AAAAJ;https://scholar.google.com.au/citations?user=OtA9SwIAAAAJ", "orcid": "0000-0002-5838-3409;;0000-0003-4928-7076;0000-0002-9977-8247", "linkedin": ";;jgrundy/;https://linkedin.com/in/dinh-phung-6b537a6", "or_profile": "~Van_Nguyen2;~Trung_Le2;~John_C._Grundy1;~Dinh_Phung1", "aff": "Monash University;Monash University;Monash University;Monash University", "aff_domain": "monash.edu;monash.edu;monash.edu;monash.edu", "position": "Postdoc;Assistant Professor;ARC Laureate Professor;Full Professor", "bibtex": "@misc{\nnguyen2022finegrained,\ntitle={Fine-grained Software Vulnerability Detection via Information Theory and Contrastive Learning},\nauthor={Van Nguyen and Trung Le and John C.
Grundy and Dinh Phung},\nyear={2022},\nurl={https://openreview.net/forum?id=sKiAuHhc3w}\n}", "github": "", "project": "", "reviewers": "adwL;QaHV;JLb2;54mc;Nmpg", "site": "https://openreview.net/forum?id=sKiAuHhc3w", "pdf_size": 0, "recommendation": "3;3;3;5;6", "confidence": "4;4;4;3;3", "correctness": "2;2;2;3;3", "technical_novelty": "2;2;2;2;3", "empirical_novelty": "2;2;1;3;2", "wc_summary_paper": "91;60;79;73;68", "wc_summary_review": "25;39;47;44;76", "wc_main_review": "333;471;352;396;935", "wc_review": "449;570;478;513;1079", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.0, 1.2649110640673518 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 74.2, 10.457533169921097 ], "wc_summary_review_avg": [ 46.2, 16.70209567689037 ], "wc_main_review_avg": [ 497.4, 223.89336747657353 ], "wc_review_avg": [ 617.8, 234.09861170028324 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9682458365518543, "corr_recommendation_correctness": 0.9682458365518543, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0B28-dCqmGsJ:scholar.google.com/&scioq=Fine-grained+Software+Vulnerability+Detection+via+Information+Theory+and+Contrastive+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Monash University", "aff_unique_dep": "", "aff_unique_url": "https://www.monash.edu", "aff_unique_abbr": "Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "id": "sMNvG2UMd_l", "title": "Mean-Shifted Contrastive Loss for Anomaly Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep anomaly detection methods learn representations that separate between normal and anomalous samples. It was previously shown that the most accurate anomaly detectors can be obtained when powerful externally trained feature extractors (e.g. ResNets pre-trained on ImageNet) are fine-tuned on the training data which consists of normal samples and no anomalies. Although contrastive learning is currently the state-of-the-art in self-supervised anomaly detection, we show that it achieves poor results when used to fine-tune pre-trained feature extractors. We investigate the reason for this collapse, and find that pre-trained feature initialization causes poor conditioning for standard contrastive objectives, resulting in bad optimization dynamics. Based on our analysis, we provide a modified contrastive objective named the \\textit{Mean-Shifted Contrastive Loss}. 
Our method is highly effective and achieves a new state-of-the-art anomaly detection performance on multiple benchmarks including $97.2\\%$ ROC-AUC on the CIFAR-10 dataset.", "keywords": "anomaly detection", "primary_area": "", "supplementary_material": "/attachment/f47d577e55ddd5291c85546399319044322bf941.zip", "author": "Tal Reiss;Yedid Hoshen", "authorids": "~Tal_Reiss1;~Yedid_Hoshen3", "gender": "M;M", "homepage": ";https://www.cs.huji.ac.il/~ydidh/", "dblp": "276/6114;136/0280", "google_scholar": "sgMIT6EAAAAJ;https://scholar.google.co.il/citations?user=6y1-qS4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Tal_Reiss1;~Yedid_Hoshen3", "aff": "Hebrew University of Jerusalem;Hebrew University of Jerusalem", "aff_domain": "huji.ac.il;huji.ac.il", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nreiss2022meanshifted,\ntitle={Mean-Shifted Contrastive Loss for Anomaly Detection},\nauthor={Tal Reiss and Yedid Hoshen},\nyear={2022},\nurl={https://openreview.net/forum?id=sMNvG2UMd_l}\n}", "github": "", "project": "", "reviewers": "mhvR;equT;sn5p;kcfg", "site": "https://openreview.net/forum?id=sMNvG2UMd_l", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;5;4;5", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;1;4", "wc_summary_paper": "48;149;42;97", "wc_summary_review": "42;170;38;37", "wc_main_review": "449;928;230;303", "wc_review": "539;1247;310;437", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "384;779;121;193", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 84.0, 43.16827538829875 ], "wc_summary_review_avg": [ 71.75, 56.75550634079481 ], "wc_main_review_avg": [ 477.5, 271.7853012949744 ], "wc_review_avg": [ 633.25, 363.5164198492277 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 369.25, 255.34523198994728 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3665083330689157, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10386217252747944521&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "", "aff_unique_url": "https://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "id": "sMqybmUh_u8", "title": "Provable Hierarchy-Based Meta-Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hierarchical reinforcement learning (HRL) has seen widespread interest as an approach to tractable learning of complex modular behaviors. However, existing work either assume access to expert-constructed hierarchies, or use hierarchy-learning heuristics with no provable guarantees. To address this gap, we analyze HRL in the meta-RL setting, where a learner learns latent hierarchical structure during meta-training for use in a downstream task. We consider a tabular setting where natural hierarchical structure is embedded in the transition dynamics. 
Analogous to supervised meta-learning theory, we provide \u201cdiversity conditions\u201d which, together with a tractable optimism-based algorithm, guarantee sample-efficient recovery of this natural hierarchy. Furthermore, we provide regret bounds on a learner using the recovered hierarchy to solve a meta-test task. Our bounds incorporate common notions in HRL literature such as temporal and state/action abstractions, suggesting that our setting and analysis capture important features of HRL in practice.", "keywords": "RL theory;regret bounds;hierarchical RL;meta-RL", "primary_area": "", "supplementary_material": "", "author": "Kurtland Chua;Qi Lei;Jason D. Lee", "authorids": "~Kurtland_Chua1;~Qi_Lei1;~Jason_D._Lee1", "gender": "M;F;M", "homepage": "http://kchua.github.io;https://cecilialeiqi.github.io/;https://jasondlee88.github.io/", "dblp": "220/5456;;88/3262", "google_scholar": ";kGOgaowAAAAJ;GR_DsT0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Kurtland_Chua1;~Qi_Lei1;~Jason_D._Lee1", "aff": "Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@misc{\nchua2022provable,\ntitle={Provable Hierarchy-Based Meta-Reinforcement Learning},\nauthor={Kurtland Chua and Qi Lei and Jason D. Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=sMqybmUh_u8}\n}", "github": "", "project": "", "reviewers": "ht9E;AT7G;wF2a;WLF1", "site": "https://openreview.net/forum?id=sMqybmUh_u8", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;3;2;4", "correctness": "3;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "43;66;87;59", "wc_summary_review": "74;23;47;36", "wc_main_review": "668;166;269;481", "wc_review": "785;255;403;576", "wc_reply_reviewers": "0;55;0;0", "wc_reply_authors": "665;557;205;645", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 63.75, 15.801503093060482 ], "wc_summary_review_avg": [ 45.0, 18.774983355518586 ], "wc_main_review_avg": [ 396.0, 193.80273475882635 ], "wc_review_avg": [ 504.75, 197.7022698402828 ], "wc_reply_reviewers_avg": [ 13.75, 23.81569860407206 ], "wc_reply_authors_avg": [ 518.0, 185.2214890340751 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13675452442671340777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Active Hierarchical Exploration with Stable Subgoal Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6965", "id": "sNuFKTMktcY", "poster": "", "openreview": "https://openreview.net/forum?id=sNuFKTMktcY", "slides": "https://iclr.cc/virtual/2022/poster/6965", "video": 
"https://iclr.cc/virtual/2022/poster/6965", "author_site": "Siyuan Li, Jin Zhang, Jianhao Wang, Yang Yu, Chongjie Zhang", "tldr": "", "abstract": "Goal-conditioned hierarchical reinforcement learning (GCHRL) provides a promising approach to solving long-horizon tasks. Recently, its success has been extended to more general settings by concurrently learning hierarchical policies and subgoal representations. Although GCHRL possesses superior exploration ability by decomposing tasks via subgoals, existing GCHRL methods struggle in temporally extended tasks with sparse external rewards, since the high-level policy learning relies on external rewards. As the high-level policy selects subgoals in an online learned representation space, the dynamic change of the subgoal space severely hinders effective high-level exploration. In this paper, we propose a novel regularization that contributes to both stable and efficient subgoal representation learning. Building upon the stable representation, we design measures of novelty and potential for subgoals, and develop an active hierarchical exploration strategy that seeks out new promising subgoals and states without intrinsic rewards. Experimental results show that our approach significantly outperforms state-of-the-art baselines in continuous control tasks with sparse rewards. ", "keywords": "Hierarchical Reinforcement Learning;Exploration;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/41023f25ea1e2a3a06924bc8e58888fefe5de39d.zip", "author": "Siyuan Li;Jin Zhang;Jianhao Wang;Yang Yu;Chongjie Zhang", "authorids": "~Siyuan_Li1;~Jin_Zhang6;~Jianhao_Wang1;~Yang_Yu5;~Chongjie_Zhang1", "gender": "F;M;M;;M", "homepage": ";http://group.iiis.tsinghua.edu.cn/~milab/person-zhangjin.html;http://group.iiis.tsinghua.edu.cn/~milab/;;http://www.lamda.nju.edu.cn/yuy", "dblp": "63/9705;43/6657-16;https://dblp.uni-trier.de/pid/239/5945;29/6693;46/2181-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;LjxqXycAAAAJ;PG2lDSwAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Siyuan_Li1;~Jin_Zhang6;~Jianhao_Wang1;~Chongjie_Zhang1;~Yang_Yu2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Nanjing University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;nju.edu.cn", "position": "PhD student;PhD student;PhD student;Assistant Professor;Professor", "bibtex": "@inproceedings{\nli2022active,\ntitle={Active Hierarchical Exploration with Stable Subgoal Representation Learning},\nauthor={Siyuan Li and Jin Zhang and Jianhao Wang and Yang Yu and Chongjie Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sNuFKTMktcY}\n}", "github": "", "project": "", "reviewers": "GV2u;XmcD;EgA9;yhF9", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;4;4", "correctness": "4;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "102;144;208;242", "wc_summary_review": "53;40;107;52", "wc_main_review": "286;647;665;254", "wc_review": "441;831;980;548", "wc_reply_reviewers": "0;490;62;18", "wc_reply_authors": "618;1593;1024;637", "reply_reviewers": "0;2;2;1", "reply_authors": "1;4;3;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 174.0, 
54.46099521676041 ], "wc_summary_review_avg": [ 63.0, 25.913317039699876 ], "wc_main_review_avg": [ 463.0, 193.43603593953222 ], "wc_review_avg": [ 700.0, 215.4912991282943 ], "wc_reply_reviewers_avg": [ 142.5, 201.89291716154878 ], "wc_reply_authors_avg": [ 968.0, 395.54456133285413 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16962537436246841648&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=sNuFKTMktcY", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;nju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Tsinghua University;Nanjing University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.nju.edu.cn", "aff_unique_abbr": "THU;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Responsible Disclosure of Generative Models Using Scalable Fingerprinting", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6395", "id": "sOK-zS6WHB", "poster": "", "openreview": "https://openreview.net/forum?id=sOK-zS6WHB", "slides": "https://iclr.cc/virtual/2022/poster/6395", "video": "https://iclr.cc/virtual/2022/poster/6395", "author_site": "Ning Yu, Vladislav Skripniuk, Dingfan Chen, Larry Davis, Mario Fritz", "tldr": "", "abstract": "Over the past years, deep generative models have achieved a new level of performance. Generated data has become difficult, if not impossible, to distinguish from real data. While there are plenty of use cases that benefit from this technology, there are also strong concerns about how this new technology can be misused to generate deep fakes and enable misinformation at scale. Unfortunately, current deep fake detection methods are not sustainable, as the gap between real and fake continues to close. In contrast, our work enables a responsible disclosure of such state-of-the-art generative models, which allows model inventors to fingerprint their models, so that the generated samples containing a fingerprint can be accurately detected and attributed to a source. Our technique achieves this through an efficient and scalable ad-hoc generation of a large population of models with distinct fingerprints. Our recommended operation point uses a 128-bit fingerprint, which in principle results in more than 10^{38} identifiable models. Experiments show that our method fulfills key properties of a fingerprinting mechanism and achieves effectiveness in deep fake detection and attribution. Code and models are available at https://github.com/ningyu1991/ScalableGANFingerprints.", "keywords": "Generative models;fingerprinting;responsible disclosure;deep fake detection and attribution", "primary_area": "", "supplementary_material": "", "author": "Ning Yu;Vladislav Skripniuk;Dingfan Chen;Larry S. 
Davis;Mario Fritz", "authorids": "~Ning_Yu2;~Vladislav_Skripniuk1;~Dingfan_Chen1;~Larry_S._Davis1;~Mario_Fritz1", "gender": ";M;F;M;M", "homepage": ";;https://dingfanchen.github.io/homepage/;http://www.umiacs.umd.edu/~lsd/;https://cispa.saarland/group/fritz/", "dblp": ";;248/8198;d/LarrySDavis;", "google_scholar": ";;iARn00oAAAAJ;https://scholar.google.com.tw/citations?user=lc0ARagAAAAJ;https://scholar.google.de/citations?user=4V1nNm4AAAAJ", "orcid": ";;;;", "linkedin": ";vladislav-skripniuk-8a8891143/;dingfan-chen-44174012b/;;", "or_profile": "~Ning_Yu2;~Vladislav_Skripniuk1;~Dingfan_Chen1;~Larry_S._Davis1;~Mario_Fritz1", "aff": ";;CISPA, saarland university, saarland informatics campus;Amazon;Saarland University", "aff_domain": ";;cispa.saarland;amazon.com;uni-saarland.de", "position": ";;PhD student;Amazon Sr. Principal Scientist;Full Professor", "bibtex": "@inproceedings{\nyu2022responsible,\ntitle={Responsible Disclosure of Generative Models Using Scalable Fingerprinting},\nauthor={Ning Yu and Vladislav Skripniuk and Dingfan Chen and Larry S. Davis and Mario Fritz},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sOK-zS6WHB}\n}", "github": "", "project": "", "reviewers": "RiQf;fu3Z;yqaN;GMRc;sqeb", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "4;4;4;3;4", "correctness": "2;2;4;4;3", "technical_novelty": "2;3;4;3;4", "empirical_novelty": "2;3;3;3;3", "wc_summary_paper": "112;114;78;148;137", "wc_summary_review": "121;135;62;49;14", "wc_main_review": "774;365;579;625;594", "wc_review": "1007;614;719;822;745", "wc_reply_reviewers": "0;0;0;0;11", "wc_reply_authors": "1991;955;818;866;853", "reply_reviewers": "0;0;0;0;1", "reply_authors": "3;2;2;2;2", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.0, 0.8944271909999159 ], "technical_novelty_avg": [ 3.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.8, 0.39999999999999997 ], "wc_summary_paper_avg": [ 117.8, 24.136279746472944 ], "wc_summary_review_avg": [ 76.2, 45.33166663602829 ], "wc_main_review_avg": [ 587.4, 131.00320606763788 ], "wc_review_avg": [ 781.4, 130.98488462414278 ], "wc_reply_reviewers_avg": [ 2.2, 4.4 ], "wc_reply_authors_avg": [ 1096.6, 449.47462664760064 ], "reply_reviewers_avg": [ 0.2, 0.4 ], "reply_authors_avg": [ 2.2, 0.39999999999999997 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.9128709291752769, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5724642916059035277&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=sOK-zS6WHB", "email": ";;cispa.saarland;amazon.com;uni-saarland.de", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Saarland University;Amazon", "aff_unique_dep": "CISPA;Amazon.com, Inc.", "aff_unique_url": "https://www.uni-saarland.de;https://www.amazon.com", "aff_unique_abbr": "Saarland U;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Saarland Informatics Campus;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Offline Neural Contextual Bandits: Pessimism, Optimization and Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7136", "id": "sPIFuucA3F", "poster": "", "openreview": "https://openreview.net/forum?id=sPIFuucA3F", "slides": 
"https://iclr.cc/virtual/2022/poster/7136", "video": "https://iclr.cc/virtual/2022/poster/7136", "author_site": "Thanh Nguyen-Tang, Sunil Gupta, A. Tuan Nguyen, Svetha Venkatesh", "tldr": "", "abstract": "Offline policy learning (OPL) leverages existing data collected a priori for policy optimization without any active exploration. Despite the prevalence and recent interest in this problem, its theoretical and algorithmic foundations in function approximation settings remain under-developed. In this paper, we consider this problem on the axes of distributional shift, optimization, and generalization in offline contextual bandits with neural networks. In particular, we propose a provably efficient offline contextual bandit with neural network function approximation that does not require any functional assumption on the reward. We show that our method provably generalizes over unseen contexts under a milder condition for distributional shift than the existing OPL works. Notably, unlike any other OPL method, our method learns from the offline data in an online manner using stochastic gradient descent, allowing us to leverage the benefits of online learning into an offline setting. Moreover, we show that our method is more computationally efficient and has a better dependence on the effective dimension of the neural network than an online counterpart. Finally, we demonstrate the empirical effectiveness of our method in a range of synthetic and real-world OPL problems.", "keywords": "offline policy learning;offline contextual bandits;neural network function approximation", "primary_area": "", "supplementary_material": "", "author": "Thanh Nguyen-Tang;Sunil Gupta;A. Tuan Nguyen;Svetha Venkatesh", "authorids": "~Thanh_Nguyen-Tang1;~Sunil_Gupta2;~A._Tuan_Nguyen1;~Svetha_Venkatesh1", "gender": "M;F;M;M", "homepage": "https://atuannguyen.com;https://www.deakin.edu.au/about-deakin/people/svetha-venkatesh;https://personal-sites.deakin.edu.au/~sunilg/;https://thanhnguyentang.github.io/", "dblp": ";81/1984;47/333-1;287/5102.html", "google_scholar": "V-guxukAAAAJ;AEkRUQcAAAAJ;https://scholar.google.com.au/citations?user=bXeL2t8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-3308-1930;0000-0002-1917-2190", "linkedin": "a-tuan-nguyen/;;;thanhnguyentang/", "or_profile": "~A._Tuan_Nguyen1;~Svetha_Venkatesh1;~Sunil_Kumar_Gupta1;~Thanh_Tang_Nguyen2", "aff": "University of Oxford;Deakin University;Deakin University;Deakin University", "aff_domain": "ox.ac.uk;deakin.edu.au;deakin.edu.au;deakin.edu.au", "position": "PhD student;Full Professor;Associate Professor;PhD student", "bibtex": "@inproceedings{\nnguyen-tang2022offline,\ntitle={Offline Neural Contextual Bandits: Pessimism, Optimization and Generalization},\nauthor={Thanh Nguyen-Tang and Sunil Gupta and A. 
Tuan Nguyen and Svetha Venkatesh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sPIFuucA3F}\n}", "github": "", "project": "", "reviewers": "HWn2;RvBs;5NdR;Fwh8", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "127;69;66;133", "wc_summary_review": "21;34;36;15", "wc_main_review": "393;98;437;369", "wc_review": "541;201;539;517", "wc_reply_reviewers": "0;0;86;29", "wc_reply_authors": "547;298;436;703", "reply_reviewers": "0;0;1;1", "reply_authors": "2;1;2;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 98.75, 31.339870771909702 ], "wc_summary_review_avg": [ 26.5, 8.789197915623474 ], "wc_main_review_avg": [ 324.25, 132.88223169408317 ], "wc_review_avg": [ 449.5, 143.780214216004 ], "wc_reply_reviewers_avg": [ 28.75, 35.10964967071019 ], "wc_reply_authors_avg": [ 496.0, 148.53787395812557 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11879917374324366970&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=sPIFuucA3F", "email": "ox.ac.uk;deakin.edu.au;deakin.edu.au;deakin.edu.au", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Oxford;Deakin University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.deakin.edu.au", "aff_unique_abbr": "Oxford;Deakin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;Australia" }, { "title": "Mapping conditional distributions for domain adaptation under generalized target shift", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6951", "id": "sPfB2PI87BZ", "poster": "", "openreview": "https://openreview.net/forum?id=sPfB2PI87BZ", "slides": "https://iclr.cc/virtual/2022/poster/6951", "video": "https://iclr.cc/virtual/2022/poster/6951", "author_site": "Matthieu Kirchmeyer, alain rakotomamonjy, Emmanuel de B\u00e9zenac, patrick gallinari", "tldr": "", "abstract": "We consider the problem of unsupervised domain adaptation (UDA) between a source and a target domain under conditional and label shift, a.k.a. Generalized Target Shift (GeTarS). Unlike in simpler UDA settings, few works have addressed this challenging problem. Recent approaches learn domain-invariant representations, yet they have practical limitations and rely on strong assumptions that may not hold in practice. In this paper, we explore a novel and general approach to align pretrained representations, which circumvents existing drawbacks. Instead of constraining representation invariance, it learns an optimal transport map, implemented as an NN, which maps source representations onto target ones. Our approach is flexible and scalable; it preserves the problem's structure and has strong theoretical guarantees under mild assumptions. 
In particular, our solution is unique, matches conditional distributions across domains, recovers target proportions and explicitly controls the target generalization risk. Through an exhaustive comparison on several datasets, we challenge the state-of-the-art in GeTarS.", "keywords": "Unsupervised domain adaptation;generalized target shift", "primary_area": "", "supplementary_material": "", "author": "Matthieu Kirchmeyer;Alain Rakotomamonjy;Emmanuel de Bezenac;patrick gallinari", "authorids": "~Matthieu_Kirchmeyer1;~Alain_Rakotomamonjy1;~Emmanuel_de_Bezenac2;~patrick_gallinari1", "gender": ";;M;M", "homepage": "https://mkirchmeyer.github.io;;;", "dblp": "241/9725;;;g/PatrickGallinari", "google_scholar": "oJkKtrkAAAAJ;;https://scholar.google.fr/citations?user=KvZw5gYAAAAJ;rFaxB20AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Matthieu_Kirchmeyer1;~Alain_Rakotomamonjy1;~Emmanuel_de_Bezenac2;~patrick_gallinari1", "aff": "Criteo AI Lab;;ETHZ - ETH Zurich;Sorbonne Universite", "aff_domain": "criteo.com;;ethz.ch;sorbonne-universite.fr", "position": "Researcher;;Postdoc;Full Professor", "bibtex": "@inproceedings{\nkirchmeyer2022mapping,\ntitle={Mapping conditional distributions for domain adaptation under generalized target shift},\nauthor={Matthieu Kirchmeyer and Alain Rakotomamonjy and Emmanuel de Bezenac and patrick gallinari},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sPfB2PI87BZ}\n}", "github": "", "project": "", "reviewers": "9ZXq;cdrt;FHUc", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "2;3;4", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "85;55;186", "wc_summary_review": "72;15;74", "wc_main_review": "202;344;556", "wc_review": "359;414;816", "wc_reply_reviewers": "66;0;0", "wc_reply_authors": "1854;1749;2063", "reply_reviewers": "1;0;0", "reply_authors": "4;3;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 108.66666666666667, 56.03768573221259 ], "wc_summary_review_avg": [ 53.666666666666664, 27.35365098523819 ], "wc_main_review_avg": [ 367.3333333333333, 145.45866155792243 ], "wc_review_avg": [ 529.6666666666666, 203.70948813336003 ], "wc_reply_reviewers_avg": [ 22.0, 31.11269837220809 ], "wc_reply_authors_avg": [ 1888.6666666666667, 130.51266434930963 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9819805060619659, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3768189705171035948&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=sPfB2PI87BZ", "email": "criteo.com;;ethz.ch;sorbonne-universite.fr", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Criteo;ETH Zurich;Sorbonne University", "aff_unique_dep": "Criteo AI Lab;;", "aff_unique_url": "https://www.criteo.com;https://www.ethz.ch;https://www.sorbonne-universite.fr", "aff_unique_abbr": "Criteo;ETHZ;Sorbonne", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", 
"aff_country_unique": "France;Switzerland" }, { "title": "CoBERL: Contrastive BERT for Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7114", "id": "sRZ3GhmegS", "poster": "", "openreview": "https://openreview.net/forum?id=sRZ3GhmegS", "slides": "https://iclr.cc/virtual/2022/poster/7114", "video": "https://iclr.cc/virtual/2022/poster/7114", "author_site": "Andrea Banino, Adria Puigdomenech Badia, Jacob C Walker, Tim Scholtes, Jovana Mitrovic, Charles Blundell", "tldr": "", "abstract": "Many reinforcement learning (RL) agents require a large amount of experience to solve tasks. We propose Contrastive BERT for RL (COBERL), an agent that combines a new contrastive loss and a hybrid LSTM-transformer architecture to tackle the challenge of improving data efficiency. COBERL enables efficient and robust learning from pixels across a wide variety of domains. We use bidirectional masked prediction in combination with a generalization of a recent contrastive method to learn better representations for RL, without the need of hand engineered data augmentations. We find that COBERL consistently improves data efficiency across the full Atari suite, a set of control tasks and a challenging 3D environment, and often it also increases final score performance.", "keywords": "Reinforcement Learning;Contrastive Learning;Representation Learning;Transformer;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/215de5ab9545cd4cd0521c12e9343b915630f677.zip", "author": "Andrea Banino;Adria Puigdomenech Badia;Jacob C Walker;Tim Scholtes;Jovana Mitrovic;Charles Blundell", "authorids": "~Andrea_Banino1;~Adria_Puigdomenech_Badia2;~Jacob_C_Walker1;~Tim_Scholtes1;~Jovana_Mitrovic1;~Charles_Blundell1", "gender": ";;;;;", "homepage": ";;;https://deepmind.com/;http://jovana-mitrovic.github.io;http://www.gatsby.ucl.ac.uk/~ucgtcbl/", "dblp": ";;135/1696;;176/5114;35/8396", "google_scholar": ";;0dR_wD0AAAAJ;;;https://scholar.google.co.uk/citations?user=f31mvPsAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Andrea_Banino1;~Adria_Puigdomenech_Badia2;~Jacob_C_Walker1;~Tim_Scholtes1;~Jovana_Mitrovic1;~Charles_Blundell1", "aff": ";;Google;Google DeepMind;Google DeepMind;Google DeepMind", "aff_domain": ";;google.com;deepmind.com;google.com;google.com", "position": ";;Research Scientist;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nbanino2022coberl,\ntitle={Co{BERL}: Contrastive {BERT} for Reinforcement Learning},\nauthor={Andrea Banino and Adria Puigdomenech Badia and Jacob C Walker and Tim Scholtes and Jovana Mitrovic and Charles Blundell},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sRZ3GhmegS}\n}", "github": "", "project": "", "reviewers": "YoL5;XmhC;gQY1", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "81;43;51", "wc_summary_review": "101;87;21", "wc_main_review": "455;534;68", "wc_review": "637;664;140", "wc_reply_reviewers": "119;63;37", "wc_reply_authors": "733;875;55", "reply_reviewers": "1;1;1", "reply_authors": "1;2;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], 
"empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 58.333333333333336, 16.35712552851373 ], "wc_summary_review_avg": [ 69.66666666666667, 34.883934538536344 ], "wc_main_review_avg": [ 352.3333333333333, 203.624381862509 ], "wc_review_avg": [ 480.3333333333333, 240.90431479923495 ], "wc_reply_reviewers_avg": [ 73.0, 34.215006454283575 ], "wc_reply_authors_avg": [ 554.3333333333334, 357.8093843872119 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3823279505832239744&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=sRZ3GhmegS", "email": ";;google.com;deepmind.com;google.com;google.com", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "sS0dHmaH1I", "title": "Fast Adaptive Anomaly Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "The ability to detect anomaly has long been recognized as an inherent human ability, yet to date, practical AI solutions to mimic such capability have been lacking.This lack of progress can be attributed to several factors. To begin with, the distribution of \u201cabnormalities\u201d is intractable. Anything outside of a given normal population is by definition an anomaly. This explains why a large volume of workin this area has been dedicated to modeling the normal distribution of a given task followed by detecting deviations from it. This direction is however unsatisfying as it would require modeling the normal distribution of every task that comes along, which includes tedious data collection. In this paper, we report our work aiming to handle these issues. To deal with the intractability of abnormal distribution, we leverage Energy Based Model (EBM). EBMs learn to associates low energies to correct values and higher energies to incorrect values. As its core, the EBM em-ploys Langevin Dynamics (LD) in generating these incorrect samples based on an iterative optimization procedure, alleviating the intractable problem of modeling the world of anomalies. Then, in order to avoid training an anomaly detector for every task, we utilize an adaptive sparse coding layer. Our intention is to design a plug and play feature that can be used to quickly update what is normal during inference time. Lastly, to avoid tedious data collection, this mentioned update of the sparse coding layer needs to be achievable with just a few shots. Here, we employ a meta learning scheme that simulates such a few shot setting during training. 
We support our findings with strong empirical evidence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ze Wang;Yipin Zhou;Rui Wang;Tsung-Yu Lin;Ashish Shah;Ser-Nam Lim", "authorids": "~Ze_Wang3;~Yipin_Zhou4;~Rui_Wang23;~Tsung-Yu_Lin2;~Ashish_Shah1;~Ser-Nam_Lim3", "gender": "M;M;M;M;M;Non-Binary", "homepage": ";;https://people.cs.umass.edu/~tsungyulin/;;https://sites.google.com/site/sernam;https://yipin.github.io/", "dblp": ";;37/9004;01/2068;04/6633;150/4221", "google_scholar": "80Jw_w8AAAAJ;FWpOydIAAAAJ;KyToxZoAAAAJ;KQrLwIAAAAAJ;HX0BfLYAAAAJ;9_HpgJ0AAAAJ", "orcid": ";;0000-0002-1332-646X;;;", "linkedin": ";;;ashish217/;;", "or_profile": "~Ze_Wang3;~Rui_Wang23;~Tsung-Yu_Lin2;~Ashish_Shah1;~Ser-Nam_Lim1;~Yipin_Zhou1", "aff": "Purdue University;Meta Facebook;Department of Computer Science, University of Massachusetts, Amherst;Meta Facebook;Meta Facebook;Meta", "aff_domain": "purdue.edu;fb.com;cs.umass.edu;fb.com;facebook.com;meta.com", "position": "PhD student;Researcher;PhD student;Researcher;Research Scientist Manager;Researcher", "bibtex": "@misc{\nwang2022fast,\ntitle={Fast Adaptive Anomaly Detection},\nauthor={Ze Wang and Yipin Zhou and Rui Wang and Tsung-Yu Lin and Ashish Shah and Ser-Nam Lim},\nyear={2022},\nurl={https://openreview.net/forum?id=sS0dHmaH1I}\n}", "github": "", "project": "", "reviewers": "Mzn6;d5Wi;egaq;4fuZ;rnYM", "site": "https://openreview.net/forum?id=sS0dHmaH1I", "pdf_size": 0, "recommendation": "3;5;6;8;8", "confidence": "4;2;3;4;3", "correctness": "2;3;3;4;4", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "93;89;24;64;76", "wc_summary_review": "16;139;66;16;81", "wc_main_review": "148;243;201;107;67", "wc_review": "257;471;291;187;224", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "310;775;378;97;256", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 6.0, 1.8973665961010275 ], "confidence_avg": [ 3.2, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 69.2, 24.798387044322055 ], "wc_summary_review_avg": [ 63.6, 45.880714902887036 ], "wc_main_review_avg": [ 153.2, 63.15188041539222 ], "wc_review_avg": [ 286.0, 98.72790892143924 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 363.2, 225.8241793962728 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9860132971832695, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "Purdue University;Meta;University of Massachusetts Amherst", "aff_unique_dep": ";Meta Platforms, Inc.;Department of Computer Science", "aff_unique_url": "https://www.purdue.edu;https://meta.com;https://www.umass.edu", "aff_unique_abbr": "Purdue;Meta;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "sTECq7ZjtKX", "title": "OSSuM: A Gradient-Free Approach For Pruning Neural Networks At Initialization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Pruning overparameterized neural networks to obtain memory-and-compute-efficient sparse networks is an active area of research. 
Recent works attempt to prune neural networks at initialization to design sparse networks that can be trained efficiently. In this paper, we propose One-Shot Supermasking (OSSuM), a gradient-free, compute-efficient technique to prune neurons in fully-connected networks. In theory, we frame this problem as a neuron subset selection problem, wherein we prune neurons to obtain better accuracy by optimizing on the cross-entropy loss. In our experiments, we show that OSSuM can perform similarly to gradient-based pruning techniques at initialization, prior to training. For example, OSSuM can achieve a test set accuracy of $82.4\\%$ on MNIST by pruning a 2-layer fully-connected neural network at initialization with just a single forward-pass over the training data. Further, we empirically demonstrate that OSSuM can be used to efficiently prune trained networks as well. We also propose several variants of OSSuM that can be used to prune deeper neural networks.", "keywords": "Pruning;Supermasking;Gradient-free training;Sparse networks", "primary_area": "", "supplementary_material": "/attachment/e70369b8d7209ee736c5cea781976b1aff0c30e5.zip", "author": "Vinu Sankar Sadasivan;Jayesh Malaviya;Anirban Dasgupta", "authorids": "~Vinu_Sankar_Sadasivan1;~Jayesh_Malaviya1;~Anirban_Dasgupta1", "gender": "M;M;M", "homepage": "https://vinusankars.github.io/;;https://sites.google.com/site/anirbandasgupta", "dblp": "244/8052;;54/385-1", "google_scholar": "y1IKIw0AAAAJ;bD93gSwAAAAJ;plJC8R0AAAAJ", "orcid": ";;", "linkedin": "vinusankars/;jayeshmalaviya/;", "or_profile": "~Vinu_Sankar_Sadasivan1;~Jayesh_Malaviya1;~Anirban_Dasgupta1", "aff": "University of Maryland, College Park;IIT Gandhinagar;IIT Gandhinagar", "aff_domain": "umd.edu;iitgn.ac.in;iitgn.ac.in", "position": "PhD student;PhD student;Professor", "bibtex": "@misc{\nsadasivan2022ossum,\ntitle={{OSS}uM: A Gradient-Free Approach For Pruning Neural Networks At Initialization},\nauthor={Vinu Sankar Sadasivan and Jayesh Malaviya and Anirban Dasgupta},\nyear={2022},\nurl={https://openreview.net/forum?id=sTECq7ZjtKX}\n}", "github": "", "project": "", "reviewers": "m3Ac;YjQZ;vc4j;gN1p", "site": "https://openreview.net/forum?id=sTECq7ZjtKX", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;3;2", "correctness": "3;2;2;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "0;1;2;2", "wc_summary_paper": "78;14;34;49", "wc_summary_review": "11;26;14;21", "wc_main_review": "109;67;152;223", "wc_review": "198;107;200;293", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "308;214;499;476", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 43.75, 23.34925052330374 ], "wc_summary_review_avg": [ 18.0, 5.873670062235365 ], "wc_main_review_avg": [ 137.75, 57.66877404627222 ], "wc_review_avg": [ 199.5, 65.76663287716651 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 374.25, 118.30548381203637 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:JGkKqEFzqUQJ:scholar.google.com/&scioq=OSSuM:+A+Gradient-Free+Approach+For+Pruning+Neural+Networks+At+Initialization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Maryland;Indian Institute of Technology Gandhinagar", "aff_unique_dep": ";", "aff_unique_url": "https://www/umd.edu;https://www.iitgn.ac.in", "aff_unique_abbr": "UMD;IITGN", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "College Park;Gandhinagar", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;India" }, { "title": "Graphon based Clustering and Testing of Networks: Algorithms and Theory", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6208", "id": "sTNHCrIKDQc", "poster": "", "openreview": "https://openreview.net/forum?id=sTNHCrIKDQc", "slides": "https://iclr.cc/virtual/2022/poster/6208", "video": "https://iclr.cc/virtual/2022/poster/6208", "author_site": "Mahalakshmi Sabanayagam, Leena Chennuru Vankadara, Debarghya Ghoshdastidar", "tldr": "", "abstract": "Network-valued data are encountered in a wide range of applications, and pose challenges in learning due to their complex structure and absence of vertex correspondence. Typical examples of such problems include classification or grouping of protein structures and social networks. Various methods, ranging from graph kernels to graph neural networks, have been proposed that achieve some success in graph classification problems. However, most methods have limited theoretical justification, and their applicability beyond classification remains unexplored. In this work, we propose methods for clustering multiple graphs, without vertex correspondence, that are inspired by the recent literature on estimating graphons---symmetric functions corresponding to infinite vertex limit of graphs. We propose a novel graph distance based on sorting-and-smoothing graphon estimators. Using the proposed graph distance, we present two clustering algorithms and show that they achieve state-of-the-art results. We prove the statistical consistency of both algorithms under Lipschitz assumptions on the graph degrees. 
We further study the applicability of the proposed distance for graph two-sample testing problems.", "keywords": "Clustering;Networks;Graphs;Two-sample testing;Graphon", "primary_area": "", "supplementary_material": "/attachment/679208f0f9bf5cf9289f95cf9ed33c705e8d5409.zip", "author": "Mahalakshmi Sabanayagam;Leena Chennuru Vankadara;Debarghya Ghoshdastidar", "authorids": "~Mahalakshmi_Sabanayagam1;~Leena_Chennuru_Vankadara2;~Debarghya_Ghoshdastidar1", "gender": "F;F;M", "homepage": "https://mahalakshmi-sabanayagam.github.io/;https://leenacvankadara.com;https://www.cit.tum.de/tfai/people/debarghya-ghoshdastidar/", "dblp": ";;63/10964", "google_scholar": "Pj76VVIAAAAJ;;Kp-enVQAAAAJ", "orcid": ";;0000-0003-0202-7007", "linkedin": "maha-saba/;;", "or_profile": "~Mahalakshmi_Sabanayagam1;~Leena_Chennuru_Vankadara2;~Debarghya_Ghoshdastidar1", "aff": "Department of Informatics, Technische Universit\u00e4t M\u00fcnchen;University of Tuebingen;Technical University Munich", "aff_domain": "in.tum.de;uni-tuebingen.de;tum.de", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsabanayagam2022graphon,\ntitle={Graphon based Clustering and Testing of Networks: Algorithms and Theory},\nauthor={Mahalakshmi Sabanayagam and Leena Chennuru Vankadara and Debarghya Ghoshdastidar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sTNHCrIKDQc}\n}", "github": "", "project": "", "reviewers": "1avZ;QUvS;fKkm", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;3;3", "correctness": "4;3;3", "technical_novelty": "3;4;3", "empirical_novelty": "0;3;2", "wc_summary_paper": "89;31;108", "wc_summary_review": "118;23;81", "wc_main_review": "880;253;374", "wc_review": "1087;307;563", "wc_reply_reviewers": "24;0;0", "wc_reply_authors": "1087;211;849", "reply_reviewers": "1;0;0", "reply_authors": "2;1;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 76.0, 32.751590292177674 ], "wc_summary_review_avg": [ 74.0, 39.098167049961134 ], "wc_main_review_avg": [ 502.3333333333333, 271.580968078071 ], "wc_review_avg": [ 652.3333333333334, 324.6386024831647 ], "wc_reply_reviewers_avg": [ 8.0, 11.313708498984761 ], "wc_reply_authors_avg": [ 715.6666666666666, 369.8444117313237 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": -0.9999999999999998, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11291859558104381886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=sTNHCrIKDQc", "email": "in.tum.de;uni-tuebingen.de;tum.de", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;University of Tuebingen;Technical University of Munich", "aff_unique_dep": "Department of Informatics;;", "aff_unique_url": "https://www.tum.de;https://www.uni-tuebingen.de/;https://www.tum.de", "aff_unique_abbr": "TUM;Uni T\u00fcbingen;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "sTkY-RVYBz", "title": "Counterbalancing Teacher: Regularizing Batch Normalized Models for Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Batch normalization (BN) is a ubiquitous technique for training deep neural networks that accelerates their convergence to reach higher accuracy. However, we demonstrate that BN comes with a fundamental drawback: it incentivizes the model to rely on frequent low-variance features that are highly specific to the training (in-domain) data, and thus fails to generalize to out-of-domain examples. In this work, we investigate this phenomenon by first showing that removing BN layers across a wide range of architectures leads to lower out-of-domain and corruption errors at the cost of higher in-domain error. We then propose the Counterbalancing Teacher (CT) method, which leverages a frozen copy of the same model without BN as a teacher to enforce the student network's learning of robust representations by substantially adapting its weights through a consistency loss function. This regularization signal helps CT perform well in unforeseen data shifts, even without information from the target domain as in prior works. We theoretically show in an overparameterized linear regression setting why normalization leads a model's reliance on such in-domain features, and empirically demonstrate the efficacy of CT by outperforming several methods on standard robustness benchmark datasets such as CIFAR-10-C, CIFAR-100-C, and VLCS.", "keywords": "Robust representation learning;domain generalization", "primary_area": "", "supplementary_material": "/attachment/071d3df6d3bd16b3e8128416bd23e45824f542df.zip", "author": "Saeid Asgari;Fereshte Khani;Ali Gholami;Kristy Choi;Linh Tran;Ran Zhang", "authorids": "~Saeid_Asgari1;~Fereshte_Khani1;~Ali_Gholami1;~Kristy_Choi1;~Linh_Tran1;~Ran_Zhang1", "gender": ";F;;F;F;M", "homepage": "https://asgsaeid.github.io/;https://people.stanford.edu/fereshte/;https://aligholami.github.io;http://www.kristychoi.com;http://www.linht.com;", "dblp": "201/4374.html;129/2345;;230/4218;130/8465;", "google_scholar": "SuePM1sAAAAJ;;wjcCm80AAAAJ;WetKfYoAAAAJ;https://scholar.google.co.uk/citations?user=GHIsTp8AAAAJ;", "orcid": ";;;;;", "linkedin": ";;hexpheus/;;;ranzhangprofile/", "or_profile": "~Saeid_Asgari1;~Fereshte_Khani1;~Ali_Gholami1;~Kristy_Choi1;~Linh_Tran1;~Ran_Zhang1", "aff": "Autodesk;Microsoft;;Stanford University;Autodesk;", "aff_domain": "autodesk.com;microsoft.com;;cs.stanford.edu;autodesk.com;", "position": "Research Scientist;Researcher;;PhD student;Research Scientist;", "bibtex": "@misc{\nasgari2022counterbalancing,\ntitle={Counterbalancing Teacher: Regularizing Batch Normalized Models for Robustness},\nauthor={Saeid Asgari and Fereshte Khani and Ali Gholami and Kristy Choi and Linh Tran and Ran Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=sTkY-RVYBz}\n}", "github": "", "project": "", "reviewers": "63kd;HouJ;SoaW;yprC", "site": "https://openreview.net/forum?id=sTkY-RVYBz", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;2;4", "correctness": "2;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "49;88;77;34", "wc_summary_review": "512;101;32;16", "wc_main_review": "812;477;139;179", "wc_review": "1373;666;248;229", "wc_reply_reviewers": "1091;0;63;0", "wc_reply_authors": "1653;885;287;35", "reply_reviewers": "6;0;1;0", "reply_authors": "7;4;2;1", 
"recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 62.0, 21.529050141610984 ], "wc_summary_review_avg": [ 165.25, 202.728112258759 ], "wc_main_review_avg": [ 401.75, 270.47308091564304 ], "wc_review_avg": [ 629.0, 463.6987168410109 ], "wc_reply_reviewers_avg": [ 288.5, 464.036905859868 ], "wc_reply_authors_avg": [ 715.0, 623.3634573826092 ], "reply_reviewers_avg": [ 1.75, 2.48746859276655 ], "reply_authors_avg": [ 3.5, 2.29128784747792 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.16012815380508713, "corr_recommendation_correctness": 0.9805806756909202, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hPKzcvQuUhkJ:scholar.google.com/&scioq=Counterbalancing+Teacher:+Regularizing+Batch+Normalized+Models+for+Robustness&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Autodesk;Microsoft;Stanford University", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.autodesk.com;https://www.microsoft.com;https://www.stanford.edu", "aff_unique_abbr": "Autodesk;Microsoft;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "sWbXSWzHPa", "title": "Invariant Learning with Partial Group Labels", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning invariant representations is an important requirement in training machine learning models that are driven by spurious correlations in the datasets. These spurious correlations, between input samples and the target labels, wrongly direct the neural network predictions resulting in poor performance on certain groups, especially the minority groups. Robust training against these spurious correlations requires the knowledge of group membership for every sample. Such a requirement is impractical in situations where the data labelling efforts for minority or rare groups is significantly laborious or where the individuals comprising the dataset choose to conceal sensitive information. On the other hand, the presence of such data collection efforts result in datasets that contain partially labelled group information. Recent works have tackled the fully unsupervised scenario where no labels for groups are available. Thus, we aim to fill the missing gap in the literature by tackling a more realistic setting that can leverage partially available sensitive or group information during training. First, we construct a constraint set and derive a high probability bound for the group assignment to belong to the set. Second, we propose an algorithm that optimizes for the worst-off group assignments from the constraint set. 
Through experiments on image and tabular datasets, we show improvements in the minority group\u2019s performance while preserving overall aggregate accuracy across groups.", "keywords": "Distributional Robust Optimization;Invariant Representation Learning;Semi-supervised learning;Dataset bias", "primary_area": "", "supplementary_material": "", "author": "Vishnu Suresh Lokhande;Kihyuk Sohn;Jinsung Yoon;Madeleine Udell;Chen-Yu Lee;Tomas Pfister", "authorids": "~Vishnu_Suresh_Lokhande1;~Kihyuk_Sohn1;~Jinsung_Yoon1;~Madeleine_Udell1;~Chen-Yu_Lee2;~Tomas_Pfister1", "gender": ";M;M;F;;M", "homepage": ";https://sites.google.com/site/kihyuksml/;https://sites.google.com/corp/view/jinsungyoon;https://people.orie.cornell.edu/mru8;https://chl260.github.io/;http://tomas.pfister.fi", "dblp": ";53/10771;173/5409.html;153/2166;04/656;14/8360", "google_scholar": ";VxpypngAAAAJ;kiFd6A8AAAAJ;tZ9pEDMAAAAJ;uWPUSEgAAAAJ;ahSpJOAAAAAJ", "orcid": ";;;0000-0002-3985-915X;;0009-0004-4088-8718", "linkedin": ";;jinsung-yoon-bb7751b8;;chenyulee260/;", "or_profile": "~Vishnu_Suresh_Lokhande1;~Kihyuk_Sohn1;~Jinsung_Yoon1;~Madeleine_Udell1;~Chen-Yu_Lee2;~Tomas_Pfister1", "aff": ";Google;Google;Cornell University;Google;Google", "aff_domain": ";google.com;google.com;cornell.edu;google.com;google.com", "position": ";Research Scientist;Research Scientist;Assistant Professor;Research Scientist;Head of Research @ Cloud AI", "bibtex": "@misc{\nlokhande2022invariant,\ntitle={Invariant Learning with Partial Group Labels},\nauthor={Vishnu Suresh Lokhande and Kihyuk Sohn and Jinsung Yoon and Madeleine Udell and Chen-Yu Lee and Tomas Pfister},\nyear={2022},\nurl={https://openreview.net/forum?id=sWbXSWzHPa}\n}", "github": "", "project": "", "reviewers": "sood;Jwqz;RXdi;aqwr", "site": "https://openreview.net/forum?id=sWbXSWzHPa", "pdf_size": 0, "recommendation": "3;3;6;6", "confidence": "4;5;3;3", "correctness": "2;4;4;4", "technical_novelty": "2;4;2;3", "empirical_novelty": "1;3;2;3", "wc_summary_paper": "69;47;40;38", "wc_summary_review": "33;28;16;45", "wc_main_review": "287;276;158;164", "wc_review": "389;351;214;247", "wc_reply_reviewers": "251;386;46;43", "wc_reply_authors": "889;1272;339;243", "reply_reviewers": "1;1;1;1", "reply_authors": "2;2;2;2", "recommendation_avg": [ 4.5, 1.5 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 48.5, 12.298373876248844 ], "wc_summary_review_avg": [ 30.5, 10.404326023342406 ], "wc_main_review_avg": [ 221.25, 60.41264354421184 ], "wc_review_avg": [ 300.25, 71.9839391809034 ], "wc_reply_reviewers_avg": [ 181.5, 145.08015026184665 ], "wc_reply_authors_avg": [ 685.75, 418.70716198794594 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.5773502691896258, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_YU8WOraNIcJ:scholar.google.com/&scioq=Invariant+Learning+with+Partial+Group+Labels&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Google;Cornell University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.cornell.edu", "aff_unique_abbr": "Google;Cornell", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "sWqjiqlUDso", "title": "Path-specific Causal Fair Prediction via Auxiliary Graph Structure Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Algorithm fairness has become a trending topic, and it has a great impact on social welfare. Among different fairness definitions, path-specific causal fairness is a widely adopted one with great potentials, as it distinguishes the fair and unfair effects that the sensitive attributes exert on algorithm predictions. Existing methods based on path-specific causal fairness either require graph structure as the prior knowledge or have high complexity in the calculation of path-specific effect. To tackle these challenges, we propose a novel casual graph based fair prediction framework, which integrates graph structure learning into fair prediction to ensure that unfair pathways are excluded in the causal graph. Furthermore, we generalize the proposed framework to the scenarios where sensitive attributes can be non-root nodes and affected by other variables, which is commonly observed in real-world applications but hardly addressed by existing works. We provide theoretical analysis on the generalization bound for the proposed fair prediction method, and conduct a series of experiments on real-world datasets to demonstrate that the proposed framework can provide better prediction performance and algorithm fairness trade-off. ", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f219d1b7cd1ba048025bb5084d5069f8a8230e2a.zip", "author": "Liuyi Yao;Yaliang Li;Bolin Ding;Jingren Zhou;Jinduo Liu;Mengdi Huai;Jing Gao", "authorids": "~Liuyi_Yao1;~Yaliang_Li1;~Bolin_Ding3;~Jingren_Zhou2;~Jinduo_Liu1;~Mengdi_Huai1;~Jing_Gao1", "gender": "F;M;M;M;F;F;M", "homepage": ";https://sites.google.com/site/yaliangli/;https://bolinding.github.io/;https://duo67.github.io;https://mdhuai.github.io/;https://engineering.purdue.edu/~jinggao/;", "dblp": "219/1767;https://dblp.org/pers/hd/l/Li:Yaliang;46/3522.html;174/5802;150/8482;67/4834-4;84/2644", "google_scholar": "0c5is-gAAAAJ;CCPBcdYAAAAJ;AjYkTi8AAAAJ;dJIbKxkAAAAJ;40ZYTzEAAAAJ;Ftj1h4cAAAAJ;", "orcid": ";0000-0002-4204-6096;;0000-0002-6264-0471;0000-0001-6368-5973;;", "linkedin": ";;bolin-ding-50a0119/;;;;", "or_profile": "~Liuyi_Yao1;~Yaliang_Li1;~Bolin_Ding3;~Jinduo_Liu1;~Mengdi_Huai1;~Jing_Gao2;~Jingren_Zhou1", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Beijing University Of Technology;University of Virginia, Charlottesville;Purdue University;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;bjut.edu.cn;virginia.edu;purdue.edu;alibaba-inc.com", "position": "Staff Engineer;Staff Engineer;Senior Director;Lecturer;PhD student;Associate Professor;Researcher", "bibtex": "@misc{\nyao2022pathspecific,\ntitle={Path-specific Causal Fair Prediction via Auxiliary Graph Structure Learning},\nauthor={Liuyi Yao and Yaliang Li and Bolin Ding and Jingren Zhou and Jinduo Liu and Mengdi Huai and Jing Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=sWqjiqlUDso}\n}", "github": "", "project": "", "reviewers": "nUyn;V6if;ZYeg", "site": "https://openreview.net/forum?id=sWqjiqlUDso", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "45;65;40", "wc_summary_review": "101;45;17", "wc_main_review": "392;270;448", "wc_review": "538;380;505", 
"wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 50.0, 10.801234497346433 ], "wc_summary_review_avg": [ 54.333333333333336, 34.92213560989012 ], "wc_main_review_avg": [ 370.0, 74.31464638055319 ], "wc_review_avg": [ 474.3333333333333, 68.05063474273321 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5560379831518180923&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;2;3;0", "aff_unique_norm": "Alibaba Group;Beijing University of Technology;University of Virginia;Purdue University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.alibaba.com;http://www.bjut.edu.cn;https://www.virginia.edu;https://www.purdue.edu", "aff_unique_abbr": "Alibaba;BJUT;UVA;Purdue", "aff_campus_unique_index": "1", "aff_campus_unique": ";Charlottesville", "aff_country_unique_index": "0;0;0;0;1;1;0", "aff_country_unique": "China;United States" }, { "title": "Pretraining Text Encoders with Adversarial Mixture of Training Signal Generators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6049", "id": "sX3XaHwotOg", "poster": "", "openreview": "https://openreview.net/forum?id=sX3XaHwotOg", "slides": "https://iclr.cc/virtual/2022/poster/6049", "video": "https://iclr.cc/virtual/2022/poster/6049", "author_site": "Yu Meng, Chenyan Xiong, Payal Bajaj, saurabh tiwary, Paul N Bennett, Jiawei Han, Xia Song", "tldr": "", "abstract": "We present a new framework AMOS that pretrains text encoders with an Adversarial learning curriculum via a Mixture Of Signals from multiple auxiliary generators. Following ELECTRA-style pretraining, the main encoder is trained as a discriminator to detect replaced tokens generated by auxiliary masked language models (MLMs). Different from ELECTRA which trains one MLM as the generator, we jointly train multiple MLMs of different sizes to provide training signals at various levels of difficulty. To push the discriminator to learn better with challenging replaced tokens, we learn mixture weights over the auxiliary MLMs' outputs to maximize the discriminator loss by backpropagating the gradient from the discriminator via Gumbel-Softmax. For better pretraining efficiency, we propose a way to assemble multiple MLMs into one unified auxiliary model. AMOS outperforms ELECTRA and recent state-of-the-art pretrained models by about 1 point on the GLUE benchmark for BERT base-sized models.", "keywords": "Language Model Pretraining", "primary_area": "", "supplementary_material": "", "author": "Yu Meng;Chenyan Xiong;Payal Bajaj;saurabh tiwary;Paul N. 
Bennett;Jiawei Han;Xia Song", "authorids": "~Yu_Meng1;~Chenyan_Xiong1;~Payal_Bajaj2;~saurabh_tiwary1;~Paul_N._Bennett1;~Jiawei_Han1;~Xia_Song1", "gender": "M;M;F;M;;M;M", "homepage": "https://yumeng5.github.io/;https://www.cs.cmu.edu/~cx/;https://scholar.google.com/citations?user=656vbXQAAAAJ;;https://www.microsoft.com/en-us/research/people/pauben/publications/;http://hanj.cs.illinois.edu/;", "dblp": "30/4233-1;18/10886;;;33/6188;h/JiaweiHan.html;165/6299", "google_scholar": "S2-yZKcAAAAJ;E9BaEBYAAAAJ;;;AIncPrIAAAAJ;https://scholar.google.com.tw/citations?user=Kv9AbjMAAAAJ;0aPSv9kAAAAJ", "orcid": "0000-0003-2554-2888;;;;0009-0006-7852-9651;0000-0002-3629-2696;", "linkedin": ";;;;paulnbennett/;;xiaso/", "or_profile": "~Yu_Meng1;~Chenyan_Xiong1;~Payal_Bajaj2;~saurabh_tiwary1;~Paul_N._Bennett1;~Jiawei_Han1;~Xia_Song1", "aff": "University of Illinois, Urbana Champaign;Microsoft Research;;;Microsoft;University of Illinois at Urbana-Champaign (UIUC);Microsoft", "aff_domain": "illinois.edu;research.microsoft.com;;;microsoft.com;illinois.edu;microsoft.com", "position": "PhD student;Principal Researcher;;;Researcher;Full Professor;Researcher", "bibtex": "@inproceedings{\nmeng2022pretraining,\ntitle={Pretraining Text Encoders with Adversarial Mixture of Training Signal Generators},\nauthor={Yu Meng and Chenyan Xiong and Payal Bajaj and saurabh tiwary and Paul N. Bennett and Jiawei Han and Xia Song},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=sX3XaHwotOg}\n}", "github": "", "project": "", "reviewers": "Hrj2;CBi9;ouEb;oMcQ;sJ99;UXyk", "pdf_size": 0, "recommendation": "3;6;6;8;8;8", "confidence": "5;3;4;4;5;4", "correctness": "3;3;4;3;4;3", "technical_novelty": "1;2;3;3;3;4", "empirical_novelty": "2;3;3;3;3;4", "wc_summary_paper": "103;59;109;227;87;166", "wc_summary_review": "90;29;15;141;42;66", "wc_main_review": "385;182;213;629;190;446", "wc_review": "578;270;337;997;319;678", "wc_reply_reviewers": "90;11;0;157;0;113", "wc_reply_authors": "1735;471;653;863;166;465", "reply_reviewers": "1;1;0;1;0;1", "reply_authors": "3;2;2;2;2;2", "recommendation_avg": [ 6.5, 1.8027756377319946 ], "confidence_avg": [ 4.166666666666667, 0.6871842709362768 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.5773502691896257 ], "wc_summary_paper_avg": [ 125.16666666666667, 55.71479954993008 ], "wc_summary_review_avg": [ 63.833333333333336, 42.26667981072351 ], "wc_main_review_avg": [ 340.8333333333333, 163.48844675457107 ], "wc_review_avg": [ 529.8333333333334, 255.4952163065985 ], "wc_reply_reviewers_avg": [ 61.833333333333336, 61.507226314384305 ], "wc_reply_authors_avg": [ 725.5, 498.15049600162666 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.1666666666666665, 0.3726779962499649 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.20180183819889375, "corr_recommendation_correctness": 0.19611613513818404, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9770552085778615131&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=sX3XaHwotOg", "email": "illinois.edu;research.microsoft.com;;;microsoft.com;illinois.edu;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft", "aff_unique_dep": 
";Microsoft Research", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UIUC;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "sXNVFBc-0aP", "title": "Public Data-Assisted Mirror Descent for Private Model Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we revisit the problem of effectively using public data to improve the privacy/utility trade-offs for differentially private (DP) model training. Here, public data refers to auxiliary data sets that have no privacy concerns. We consider public training data sets that are from the *same distribution* as the private training data set.\n\nFor convex losses, we show that a variant of Mirror Descent provides population risk guarantees which are independent of the dimension of the model ($p$). Specifically, we apply Mirror Descent with the loss generated by the public data as the *mirror map*, and using DP gradients of the loss generated by the private (sensitive) data. To obtain dimension independence, we require $G_Q^2 \\leq p$ public data samples, where $G_Q$ is the Gaussian width of the smallest convex set $Q$ such that the public loss functions are 1-strongly convex with respect to $\\|\\cdot\\|_Q$. Our method is also applicable to non-convex losses, as it does not rely on convexity assumptions to ensure DP guarantees. We further show that our algorithm has a natural \"noise stability\" property: If in a bounded region around the current iterate, the public loss satisfies $\\alpha_v$-strong convexity in a direction $v$, then using noisy gradients instead of the exact gradients shifts our next iterate in the direction $v$ by an amount proportional to $1/\\alpha_v$ (in contrast with DP stochastic gradient descent (DP-SGD)), where the shift is isotropic). Analogous results in prior works had to explicitly learn the geometry using the public data in the form of preconditioner matrices.\n\nWe demonstrate the empirical efficacy of our algorithm by showing privacy/utility trade-offs on linear regression, and deep learning benchmark datasets (CIFAR-10, EMNIST, and WikiText-2). 
We show that our algorithm not only significantly improves over traditional DP-SGD, which does not have access to public data, but also improves over DP-SGD on models that have been pretrained with the public data to begin with.", "keywords": "Differential Privacy;Public Data;Mirror Descent", "primary_area": "", "supplementary_material": "", "author": "Ehsan Amid;Arun Ganesh;Rajiv Mathews;Swaroop Ramaswamy;Shuang Song;Thomas Steinke;Vinith Menon Suriyakumar;Om Thakkar;Abhradeep Guha Thakurta", "authorids": "~Ehsan_Amid1;~Arun_Ganesh1;~Rajiv_Mathews1;~Swaroop_Ramaswamy1;~Shuang_Song3;~Thomas_Steinke2;~Vinith_Menon_Suriyakumar1;~Om_Thakkar1;~Abhradeep_Guha_Thakurta1", "gender": "M;M;M;;;M;M;M;M", "homepage": "https://sites.google.com/corp/view/eamid/;https://people.eecs.berkeley.edu/~arunganesh/;;;;http://www.thomas-steinke.net/;;https://athakurta.squarespace.com/;http://www.omthakkar.com/", "dblp": "142/5754;201/4732;;242/9298;86/4211-1;https://dblp.uni-trier.de/pid/73/4025-2.html;;31/8315;https://dblp.uni-trier.de/pid/166/1707", "google_scholar": "https://scholar.google.fi/citations?user=F6omR3gAAAAJ;fmwchbsAAAAJ;xFBrVYgAAAAJ;tKw5jlUAAAAJ;;kwnwhrgAAAAJ;https://scholar.google.com/citations?hl=en;1rV69hMAAAAJ;j5N3bKYAAAAJ", "orcid": ";;;;;;;;", "linkedin": "ehsan-amid-63aba754;;;;;thomas-steinke-2841248/;vsuriyakumar;;", "or_profile": "~Ehsan_Amid1;~Arun_Ganesh1;~Rajiv_Mathews1;~Swaroop_Ramaswamy1;~Shuang_Song3;~Thomas_Steinke2;~Vinith_Menon_Suriyakumar1;~Abhradeep_Guha_Thakurta1;~Om_Dipakbhai_Thakkar1", "aff": "Google DeepMind;University of California, Berkeley;Google;Google;Google;Google;Massachusetts Institute of Technology;Google;Google", "aff_domain": "google.com;berkeley.edu;google.com;google.com;google.com;google.com;mit.edu;google.com;google.com", "position": "Research Scientist;PhD student;Senior Staff Software Engineer;Software Engineer;Software Engineer;Research Scientist;PhD student;Senior Research Scientist;Researcher", "bibtex": "@misc{\namid2022public,\ntitle={Public Data-Assisted Mirror Descent for Private Model Training},\nauthor={Ehsan Amid and Arun Ganesh and Rajiv Mathews and Swaroop Ramaswamy and Shuang Song and Thomas Steinke and Vinith Menon Suriyakumar and Om Thakkar and Abhradeep Guha Thakurta},\nyear={2022},\nurl={https://openreview.net/forum?id=sXNVFBc-0aP}\n}", "github": "", "project": "", "reviewers": "mr6n;cMgL;GaSe", "site": "https://openreview.net/forum?id=sXNVFBc-0aP", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "5;4;4", "correctness": "4;4;4", "technical_novelty": "3;3;4", "empirical_novelty": "3;4;4", "wc_summary_paper": "101;76;59", "wc_summary_review": "54;37;36", "wc_main_review": "195;476;338", "wc_review": "350;589;433", "wc_reply_reviewers": "0;346;25", "wc_reply_authors": "448;1990;415", "reply_reviewers": "0;2;1", "reply_authors": "1;4;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.66666666666667, 17.249798710580816 ], "wc_summary_review_avg": [ 42.333333333333336, 8.259674462242579 ], "wc_main_review_avg": [ 336.3333333333333, 114.72382296435016 ], "wc_review_avg": [ 457.3333333333333, 99.07685008225796 ], "wc_reply_reviewers_avg": [ 123.66666666666667, 157.54435001681978 ], "wc_reply_authors_avg": [ 951.0, 734.8074577738034 ], "reply_reviewers_avg": [ 1.0, 
0.816496580927726 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": -0.7559289460184544, "corr_recommendation_correctness": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16311948829499291413&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0;0;0;2;0;0", "aff_unique_norm": "Google;University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "DeepMind;UC Berkeley;MIT", "aff_campus_unique_index": "1;2;2;2;2;2;2", "aff_campus_unique": ";Berkeley;Mountain View", "aff_country_unique_index": "0;1;1;1;1;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "sZttLyMsfzb", "title": "Optimization Variance: Exploring Generalization Properties of DNNs", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Unlike the conventional wisdom in statistical learning theory, the test error of a deep neural network (DNN) often demonstrates double descent: as the model complexity increases, it first follows a classical U-shaped curve and then shows a second descent. Through bias-variance decomposition, recent studies revealed that the bell-shaped variance is the major cause of model-wise double descent (when the DNN is widened gradually). This paper investigates epoch-wise double descent, i.e., the test error of a DNN also shows double descent as the number of training epochs increases. By extending the bias-variance analysis to epoch-wise double descent of the zero-one loss, we surprisingly find that the variance itself, without the bias, varies consistently with the test error. Inspired by this result, we propose a novel metric, optimization variance (OV), to measure the diversity of model updates caused by the stochastic gradients of random training batches drawn in the same iteration. 
OV can be estimated using samples from the \\emph{training} set only but correlates well with the (unknown) test error, and hence early stopping may be achieved without using a validation set.", "keywords": "Double Descent;Generalization;Deep Neural Networks", "primary_area": "", "supplementary_material": "/attachment/1af2679aca4a76a4317a5c8e6d6831b10dfafd0b.zip", "author": "Xiao Zhang;Dongrui Wu;Haoyi Xiong;Bo Dai", "authorids": "~Xiao_Zhang8;~Dongrui_Wu1;~Haoyi_Xiong1;~Bo_Dai2", "gender": "M;M;M;M", "homepage": "https://zhangxiao96.github.io/;https://sites.google.com/site/drwuhust/home;https://sites.google.com/site/haoyixiongshomepage/;http://daibo.info/", "dblp": ";;06/2700;64/2903-2", "google_scholar": "jgzwvDIAAAAJ;UYGzCPEAAAAJ;f_Kcie0AAAAJ;https://scholar.google.com.hk/citations?user=KNWTvgEAAAAJ", "orcid": "0000-0002-2354-9798;0000-0002-7153-9703;;0000-0003-0777-9232", "linkedin": ";;;", "or_profile": "~Xiao_Zhang8;~Dongrui_Wu1;~Haoyi_Xiong1;~Bo_Dai2", "aff": "ByteDance Inc.;Huazhong University of Science and Technology;Baidu;Nanyang Technological University", "aff_domain": "bytedance.com;hust.edu.cn;baidu.com;ntu.edu.sg", "position": "Engineer;Full Professor;Principal Researcher;Research Assistant Professor", "bibtex": "@misc{\nzhang2022optimization,\ntitle={Optimization Variance: Exploring Generalization Properties of {DNN}s},\nauthor={Xiao Zhang and Dongrui Wu and Haoyi Xiong and Bo Dai},\nyear={2022},\nurl={https://openreview.net/forum?id=sZttLyMsfzb}\n}", "github": "", "project": "", "reviewers": "2bEg;TDmC;2Yek;rrwq", "site": "https://openreview.net/forum?id=sZttLyMsfzb", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;5;5;3", "correctness": "4;4;4;2", "technical_novelty": "2;1;1;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "48;44;106;85", "wc_summary_review": "22;154;28;103", "wc_main_review": "181;46;357;308", "wc_review": "251;244;491;496", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 70.75, 25.878321042911576 ], "wc_summary_review_avg": [ 76.75, 54.84238780359586 ], "wc_main_review_avg": [ 223.0, 120.70004142501361 ], "wc_review_avg": [ 370.5, 123.03759588028367 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11895903670140720730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "ByteDance;Huazhong University of Science and Technology;Baidu;Nanyang Technological University", "aff_unique_dep": ";;Baidu, Inc.;", "aff_unique_url": "https://www.bytedance.com;http://www.hust.edu.cn;https://www.baidu.com;https://www.ntu.edu.sg", "aff_unique_abbr": "ByteDance;HUST;Baidu;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;Singapore" }, { "id": "saNgDizIODl", "title": "NUQ: Nonparametric Uncertainty Quantification for Deterministic Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": " This paper 
proposes a fast and scalable method for uncertainty quantification of machine learning models' predictions. First, we show a principled way to measure the uncertainty of predictions for a classifier based on Nadaraya-Watson's nonparametric estimate of the conditional label distribution. Importantly, the approach allows us to explicitly disentangle \\textit{aleatoric} and \\textit{epistemic} uncertainties. The resulting method works directly in the feature space. However, one can apply it to any neural network by considering an embedding of the data induced by the network. We demonstrate the strong performance of the method in uncertainty estimation tasks on a variety of real-world image datasets, such as MNIST, SVHN, CIFAR-100 and several versions of ImageNet.", "keywords": "Out-of-distribution detection;uncertainty quantification;epistemic uncertainty;aleatoric uncertainty;non-parametric models;Nadaraya-Watson estimator;Misclassification detection", "primary_area": "", "supplementary_material": "/attachment/f9cc7de8a724dbe830f1eb9697e84fe127d7d0b6.zip", "author": "Nikita Yurevich Kotelevskii;Alexander Fishkov;Kirill Fedyanin;Aleksandr Petiushko;Maxim Panov", "authorids": "~Nikita_Yurevich_Kotelevskii1;~Alexander_Fishkov1;~Kirill_Fedyanin1;~Aleksandr_Petiushko1;~Maxim_Panov1", "gender": "M;;M;M;M", "homepage": ";;;http://petiushko.info;", "dblp": "259/3057;;256/9937.html;247/6405;30/10085", "google_scholar": "D9b8bXEAAAAJ;8ojWgOIAAAAJ;JmDoBXoAAAAJ;b8d5wS-QfscC;https://scholar.google.ru/citations?user=BqDhGJQAAAAJ", "orcid": ";;;0000-0001-9692-8134;", "linkedin": "nikita-kotelevskii-b52271130/;;kirill-fedyanin-46615518/;petyushko/;", "or_profile": "~Nikita_Yurevich_Kotelevskii1;~Alexander_Fishkov1;~Kirill_Fedyanin1;~Aleksandr_Petiushko1;~Maxim_Panov1", "aff": "Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology;Nuro;Skolkovo Institute of Science and Technology", "aff_domain": "skoltech.ru;skoltech.ru;skoltech.ru;nuro.ai;skoltech.ru", "position": "PhD student;PhD student;Researcher;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nkotelevskii2022nuq,\ntitle={{NUQ}: Nonparametric Uncertainty Quantification for Deterministic Neural Networks},\nauthor={Nikita Yurevich Kotelevskii and Alexander Fishkov and Kirill Fedyanin and Aleksandr Petiushko and Maxim Panov},\nyear={2022},\nurl={https://openreview.net/forum?id=saNgDizIODl}\n}", "github": "", "project": "", "reviewers": "CWxx;S6bt;EX9M", "site": "https://openreview.net/forum?id=saNgDizIODl", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "56;101;45", "wc_summary_review": "73;42;27", "wc_main_review": "293;479;596", "wc_review": "422;622;668", "wc_reply_reviewers": "273;185;159", "wc_reply_authors": "1262;1309;1042", "reply_reviewers": "1;1;1", "reply_authors": "2;3;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 67.33333333333333, 24.225789747475496 ], "wc_summary_review_avg": [ 47.333333333333336, 19.154343864744856 ], "wc_main_review_avg": [ 456.0, 124.76377679438852 ], "wc_review_avg": [ 570.6666666666666, 106.78743163666573 ], "wc_reply_reviewers_avg": [ 205.66666666666666, 48.780688346471244 ], "wc_reply_authors_avg": [ 1204.3333333333333, 116.37964694147436 ], 
"reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9483715409946390554&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Skolkovo Institute of Science and Technology;Nuro Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.skoltech.ru;https://www.nuro.ai", "aff_unique_abbr": "Skoltech;Nuro", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Russian Federation;United States" }, { "id": "scSheedMzl", "title": "Locally Invariant Explanations: Towards Causal Explanations through Local Invariant Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Locally interpretable model agnostic explanations (LIME) method is one of the most popular methods used to explain black-box models at a per example level. Although many variants have been proposed, few provide a simple way to produce high fidelity explanations that are also stable and intuitive. In this work, we provide a novel perspective by proposing a model agnostic local explanation method inspired by the invariant risk minimization (IRM) principle -- originally proposed for (global) out-of-distribution generalization -- to provide such high fidelity explanations that are also stable and unidirectional across nearby examples. Our method is based on a game theoretic formulation where we theoretically show that our approach has a strong tendency to eliminate features where the gradient of the black-box function abruptly changes sign in the locality of the example we want to explain, while in other cases it is more careful and will choose a more conservative (feature) attribution, a behavior which can be highly desirable for recourse. Empirically, we show on tabular, image and text data that the quality of our explanations with neighborhoods formed using random perturbations are much better than LIME and in some cases even comparable to other methods that use realistic neighbors sampled from the data manifold, where the latter is a popular strategy to obtain high quality explanations. This is a desirable property given that learning a manifold to either create realistic neighbors or to project explanations is typically expensive or may even be impossible. 
Moreover, our algorithm is simple and efficient to train, and can ascertain stable input features for local decisions of a black-box without access to side information such as a (partial) causal graph as has been seen in some recent works.", "keywords": "explainable AI", "primary_area": "", "supplementary_material": "", "author": "Amit Dhurandhar;Karthikeyan Natesan Ramamurthy;Kartik Ahuja;Vijay Arya", "authorids": "~Amit_Dhurandhar1;~Karthikeyan_Natesan_Ramamurthy1;~Kartik_Ahuja1;~Vijay_Arya1", "gender": "M;;;M", "homepage": "https://researcher.watson.ibm.com/researcher/view.php?person=us-adhuran;https://nrkarthikeyan.github.io/;;", "dblp": "66/3289;58/7800;;77/1485", "google_scholar": "km9vIPEAAAAJ;mG8HuhEAAAAJ;;", "orcid": ";0000-0002-6021-5930;;", "linkedin": ";;;", "or_profile": "~Amit_Dhurandhar1;~Karthikeyan_Natesan_Ramamurthy1;~Kartik_Ahuja1;~Vijay_Arya1", "aff": "International Business Machines;International Business Machines;;IBM Research", "aff_domain": "ibm.com;ibm.com;;ibm.com", "position": "Principal Researcher;Research Staff Member;;Researcher", "bibtex": "@misc{\ndhurandhar2022locally,\ntitle={Locally Invariant Explanations: Towards Causal Explanations through Local Invariant Learning},\nauthor={Amit Dhurandhar and Karthikeyan Natesan Ramamurthy and Kartik Ahuja and Vijay Arya},\nyear={2022},\nurl={https://openreview.net/forum?id=scSheedMzl}\n}", "github": "", "project": "", "reviewers": "9WKN;388N;H918;tbk2", "site": "https://openreview.net/forum?id=scSheedMzl", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;4;2;4", "correctness": "3;4;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "65;101;60;132", "wc_summary_review": "13;54;24;48", "wc_main_review": "387;277;341;484", "wc_review": "465;432;425;664", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "963;960;446;538", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;2;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 89.5, 29.19332115399 ], "wc_summary_review_avg": [ 34.75, 16.843025262701474 ], "wc_main_review_avg": [ 372.25, 75.42338828241542 ], "wc_review_avg": [ 496.5, 97.8787515245265 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 726.75, 236.99512125780143 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XvYJExaCrUwJ:scholar.google.com/&scioq=Locally+Invariant+Explanations:+Towards+Causal+Explanations+through+Local+Invariant+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "International Business Machines Corporation;IBM", "aff_unique_dep": ";IBM Research", "aff_unique_url": "https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding and Leveraging Overparameterization in Recursive Value Estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7117", "id": 
"shbAgEsk3qM", "poster": "", "openreview": "https://openreview.net/forum?id=shbAgEsk3qM", "slides": "https://iclr.cc/virtual/2022/poster/7117", "video": "https://iclr.cc/virtual/2022/poster/7117", "author_site": "Chenjun Xiao, Bo Dai, Jincheng Mei, Oscar Ramirez, Ramki Gummadi, Chris Harris, Dale Schuurmans", "tldr": "", "abstract": "The theory of function approximation in reinforcement learning (RL) typically considers low capacity representations that incur a tradeoff between approximation error, stability and generalization. Current deep architectures, however, operate in an overparameterized regime where approximation error is not necessarily a bottleneck. To better understand the utility of deep models in RL we present an analysis of recursive value estimation using \\emph{overparameterized} linear representations that provides useful, transferable findings. First, we show that classical updates such as temporal difference (TD) learning or fitted-value-iteration (FVI) converge to \\emph{different} fixed points than residual minimization (RM) in the overparameterized linear case. We then develop a unified interpretation of overparameterized linear value estimation as minimizing the Euclidean norm of the weights subject to alternative constraints. A practical consequence is that RM can be modified by a simple alteration of the backup targets to obtain the same fixed points as FVI and TD (when they converge), while universally ensuring stability. Further, we provide an analysis of the generalization error of these methods, demonstrating per iterate bounds on the value prediction error of FVI, and fixed point bounds for TD and RM. \nGiven this understanding, we then develop new algorithmic tools for improving recursive value estimation with deep models. \nIn particular, we extract two regularizers that penalize out-of-span top-layer weights and co-linearity in top-layer features respectively. Empirically we find that these regularizers dramatically improve the stability of TD and FVI, while allowing RM to match and even sometimes surpass their generalization performance with assured stability. 
", "keywords": "Temporal Difference Learning;Residual Minimization;Value Estimation;Overparameterization", "primary_area": "", "supplementary_material": "", "author": "Chenjun Xiao;Bo Dai;Jincheng Mei;Oscar A Ramirez;Ramki Gummadi;Chris Harris;Dale Schuurmans", "authorids": "~Chenjun_Xiao1;~Bo_Dai1;~Jincheng_Mei1;~Oscar_A_Ramirez1;~Ramki_Gummadi1;~Chris_Harris1;~Dale_Schuurmans1", "gender": ";;M;M;;;", "homepage": "https://chenjun-x.github.io/;https://bo-dai.github.io/;https://jinchengmei.github.io;;;;", "dblp": "178/8641;64/2903;149/1408;145/7596;https://dblp.org/pers/hd/g/Gummadi:Ramki;;", "google_scholar": ";TIKl_foAAAAJ;;LLnrH8IAAAAJ;2P8IbqoAAAAJ;;", "orcid": "0000-0002-5493-1500;0009-0002-8070-574X;;;;;", "linkedin": ";;;oscar-ramirez-905913b9;;;", "or_profile": "~Chenjun_Xiao1;~Bo_Dai1;~Jincheng_Mei1;~Oscar_A_Ramirez1;~Ramki_Gummadi1;~Chris_Harris1;~Dale_Schuurmans1", "aff": "University of Alberta;Google Brain;Google DeepMind;Google;Google DeepMind;;", "aff_domain": "ualberta.ca;google.com;google.com;google.com;google.com;;", "position": "PhD student;Research Scientist;Research Scientist;Researcher;Staff Research Scientist;;", "bibtex": "@inproceedings{\nxiao2022understanding,\ntitle={Understanding and Leveraging Overparameterization in Recursive Value Estimation},\nauthor={Chenjun Xiao and Bo Dai and Jincheng Mei and Oscar A Ramirez and Ramki Gummadi and Chris Harris and Dale Schuurmans},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=shbAgEsk3qM}\n}", "github": "", "project": "", "reviewers": "xq3y;gT5o;KksW;PSbG", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;2;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "3;44;66;67", "wc_summary_review": "3;37;34;81", "wc_main_review": "366;433;228;780", "wc_review": "372;514;328;928", "wc_reply_reviewers": "478;98;37;109", "wc_reply_authors": "1881;1219;856;920", "reply_reviewers": "1;2;1;2", "reply_authors": "6;5;4;4", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 45.0, 25.93260495977988 ], "wc_summary_review_avg": [ 38.75, 27.788261910382232 ], "wc_main_review_avg": [ 451.75, 203.41874913586506 ], "wc_review_avg": [ 535.5, 236.80529977177454 ], "wc_reply_reviewers_avg": [ 180.5, 173.93748877110994 ], "wc_reply_authors_avg": [ 1219.0, 406.02155115215254 ], "reply_reviewers_avg": [ 1.5, 0.5 ], "reply_authors_avg": [ 4.75, 0.82915619758885 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.08362420100070908, "corr_recommendation_correctness": 0.9805806756909202, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18273521794815791929&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=shbAgEsk3qM", "email": "ualberta.ca;google.com;google.com;google.com;google.com;;", "author_num": 7, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "University of Alberta;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": "https://www.ualberta.ca;https://brain.google.com", "aff_unique_abbr": "UAlberta;Google Brain", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;2;1;2", "aff_country_unique": "Canada;United 
States;United Kingdom" }, { "id": "shdfw9sQnAP", "title": "Cronus: Robust and Heterogeneous Collaborative Learning with Black-Box Knowledge Transfer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Collaborative (federated) learning enables multiple parties to train a global model without sharing their private data, notably through repeated sharing of the parameters of their local models. Despite its advantages, this approach has many known security and privacy weaknesses, and is limited to models with the same architectures. We argue that the core reason for such security and privacy issues is the naive exchange of high-dimensional model parameters in federated learning algorithms. This increases the malleability of the trained global model to poisoning attacks and exposes the sensitive local datasets of parties to inference attacks. We propose Cronus, a robust collaborative learning framework that supports heterogeneous model architectures. The simple yet effective idea behind designing Cronus is to significantly reduce the dimensions of the exchanged information between parties. This allows us to impose a very tight bound over the error of the aggregation algorithm in presence of adversarial updates from malicious parties. We implement this through a robust knowledge transfer protocol between the local models. We evaluate prior federated learning algorithms against poisoning attacks, and we show that Cronus is the only secure method that withstands the parameter poisoning attacks. Furthermore, treating local models as black-boxes significantly reduces the information leakage about their sensitive training data. We show this using membership inference attacks.", "keywords": "Federated Learning;Poisoning attacks and defenses", "primary_area": "", "supplementary_material": "", "author": "Hongyan Chang;Virat Shejwalkar;Reza Shokri;Amir Houmansadr", "authorids": "~Hongyan_Chang1;~Virat_Shejwalkar1;~Reza_Shokri1;~Amir_Houmansadr1", "gender": "F;M;;M", "homepage": "https://www.comp.nus.edu.sg/~hongyan/;https://people.cs.umass.edu/~vshejwalkar/;;https://www.cs.umass.edu/~amir/", "dblp": "152/5447.html;243/3113.html;;22/1797", "google_scholar": "5d1AHgIAAAAJ;M6GAEdUAAAAJ;;https://scholar.google.com.tw/citations?user=cTTFHNwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hongyan_Chang1;~Virat_Shejwalkar1;~Reza_Shokri1;~Amir_Houmansadr1", "aff": "National University of Singapore;University of Massachusetts at Amherst;;University of Massachusetts, Amherst", "aff_domain": "nus.edu.sg;cs.umass.edu;;umass.edu", "position": "PhD student;PhD student;;Associate Professor", "bibtex": "@misc{\nchang2022cronus,\ntitle={Cronus: Robust and Heterogeneous Collaborative Learning with Black-Box Knowledge Transfer},\nauthor={Hongyan Chang and Virat Shejwalkar and Reza Shokri and Amir Houmansadr},\nyear={2022},\nurl={https://openreview.net/forum?id=shdfw9sQnAP}\n}", "github": "", "project": "", "reviewers": "sDvv;nfR7;nkKd;sT63", "site": "https://openreview.net/forum?id=shdfw9sQnAP", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;4;5", "correctness": "2;3;3;3", "technical_novelty": "3;2;2;1", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "109;145;59;84", "wc_summary_review": "68;18;40;29", "wc_main_review": "578;336;586;460", "wc_review": "755;499;685;573", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 
2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 99.25, 31.78344694963087 ], "wc_summary_review_avg": [ 38.75, 18.59267328815305 ], "wc_main_review_avg": [ 490.0, 101.95096860746347 ], "wc_review_avg": [ 628.0, 98.79777325425913 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 216, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7410569709267643361&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "National University of Singapore;University of Massachusetts Amherst", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.umass.edu", "aff_unique_abbr": "NUS;UMass Amherst", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Singapore;United States" }, { "title": "8-bit Optimizers via Block-wise Quantization", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6210", "id": "shpkpVXzo3h", "poster": "", "openreview": "https://openreview.net/forum?id=shpkpVXzo3h", "slides": "https://iclr.cc/virtual/2022/poster/6210", "video": "https://iclr.cc/virtual/2022/poster/6210", "author_site": "Tim Dettmers, Mike Lewis, Sam Shleifer, Luke Zettlemoyer", "tldr": "", "abstract": "Stateful optimizers maintain gradient statistics over time, e.g., the exponentially smoothed sum (SGD with momentum) or squared sum (Adam) of past gradient values. This state can be used to accelerate optimization significantly, compared to plain stochastic gradient descent, but uses memory that might otherwise be allocated to model parameters, thereby limiting the maximum size of models trained in practice. In this paper, we develop the first optimizers that use 8-bit statistics while maintaining the performance levels of using 32-bit optimizer states. To overcome the resulting computational, quantization, and stability challenges, we develop block-wise dynamic quantization. Block-wise quantization divides input tensors into smaller blocks that are independently quantized. Each block is processed in parallel across cores, yielding faster optimization and high precision quantization. To maintain stability and performance, we combine block-wise quantization with two additional changes: (1) dynamic quantization, a form of non-linear optimization that is precise for both large and small magnitude values, and (2) a stable embedding layer to reduce gradient variance that comes from the highly non-uniform distribution of input tokens in language models. As a result, our 8-bit optimizers maintain 32-bit performance with a small fraction of the memory footprint on a range of tasks, including 1.5B parameter language modeling, GLUE finetuning, ImageNet classification, WMT'14 machine translation, MoCo v2 contrastive ImageNet pretraining+finetuning, and RoBERTa pretraining, without changes to the original optimizer hyperparameters. 
We open-source our 8-bit optimizers as a drop-in replacement that only requires a two-line code change.", "keywords": "language models;pretraining;finetuning;GPU memory", "primary_area": "", "supplementary_material": "/attachment/831b28094d0b2ef84c6e475195348166159bd984.zip", "author": "Tim Dettmers;Mike Lewis;Sam Shleifer;Luke Zettlemoyer", "authorids": "~Tim_Dettmers2;~Mike_Lewis1;~Sam_Shleifer1;~Luke_Zettlemoyer1", "gender": "M;M;M;M", "homepage": "https://timdettmers.com/;;;https://www.cs.washington.edu/people/faculty/lsz/", "dblp": "172/1045;19/6214;;21/6793", "google_scholar": "lHI3w5kAAAAJ;SnQnQicAAAAJ;_d5MbfoAAAAJ;https://scholar.google.com.tw/citations?user=UjpbO6IAAAAJ", "orcid": ";;;", "linkedin": ";;https://www.linkedin.com/feed/;luke-zettlemoyer-a0109b226/", "or_profile": "~Tim_Dettmers2;~Mike_Lewis1;~Sam_Shleifer1;~Luke_Zettlemoyer1", "aff": "University of Washington;Facebook AI Research;FAIR;Meta", "aff_domain": "cs.washington.edu;fb.com;fb.com;meta.com", "position": "PhD student;Research Scientist;Research Engineer;Researcher", "bibtex": "@inproceedings{\ndettmers2022bit,\ntitle={8-bit Optimizers via Block-wise Quantization},\nauthor={Tim Dettmers and Mike Lewis and Sam Shleifer and Luke Zettlemoyer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=shpkpVXzo3h}\n}", "github": "", "project": "", "reviewers": "zhGJ;QSRR;6PcJ", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "5;5;5", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "56;77;52", "wc_summary_review": "16;102;47", "wc_main_review": "224;97;427", "wc_review": "296;276;526", "wc_reply_reviewers": "0;0;573", "wc_reply_authors": "421;129;1398", "reply_reviewers": "0;0;2", "reply_authors": "1;1;3", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 61.666666666666664, 10.96458946893235 ], "wc_summary_review_avg": [ 55.0, 35.56215216584433 ], "wc_main_review_avg": [ 249.33333333333334, 135.90764838260168 ], "wc_review_avg": [ 366.0, 113.43133018115704 ], "wc_reply_reviewers_avg": [ 191.0, 270.1147904132612 ], "wc_reply_authors_avg": [ 649.3333333333334, 542.6431198822626 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 293, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5491820601242999587&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=shpkpVXzo3h", "email": "cs.washington.edu;fb.com;fb.com;meta.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Washington;Meta", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.washington.edu;https://research.facebook.com", "aff_unique_abbr": "UW;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "What Happens after SGD Reaches Zero Loss? 
--A Mathematical Framework", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7048", "id": "siCt4xZn5Ve", "poster": "", "openreview": "https://openreview.net/forum?id=siCt4xZn5Ve", "slides": "https://iclr.cc/virtual/2022/poster/7048", "video": "https://iclr.cc/virtual/2022/poster/7048", "author_site": "Zhiyuan Li, Tianhao Wang, Sanjeev Arora", "tldr": "", "abstract": "Understanding the implicit bias of Stochastic Gradient Descent (SGD) is one of the key challenges in deep learning, especially for overparametrized models, where the local minimizers of the loss function $L$ can form a manifold. Intuitively, with a sufficiently small learning rate $\\eta$, SGD tracks Gradient Descent (GD) until it gets close to such manifold, where the gradient noise prevents further convergence. In such regime, Blanc et al. (2020) proved that SGD with label noise locally decreases a regularizer-like term, the sharpness of loss, $\\text{tr}[\\nabla^2 L]$. The current paper gives a general framework for such analysis by adapting ideas from Katzenberger (1991). It allows in principle a complete characterization for the regularization effect of SGD around such manifold---i.e., the \"implicit bias\"---using a stochastic differential equation (SDE) describing the limiting dynamics of the parameters, which is determined jointly by the loss function and the noise covariance. This yields some new results: (1) a *global* analysis of the implicit bias valid for $\\eta^{-2}$ steps, in contrast to the local analysis of Blanc et al. (2020) that is only valid for $\\eta^{-1.6}$ steps and (2) allowing *arbitrary* noise covariance. As an application, we show with arbitrary large initialization, label noise SGD can always escape the kernel regime and only requires $O(\\kappa\\ln d)$ samples for learning an $\\kappa$-sparse overparametrized linear model in $\\mathbb{R}^d$ (Woodworth et al., 2020), while GD initialized in the kernel regime requires $\\Omega(d)$ samples. This upper bound is minimax optimal and improves the previous $\\widetilde{O}(\\kappa^2)$ upper bound (HaoChen et al., 2020).", "keywords": "SGD;implicit bias;generalization;deep learning;implicit regularization;manifold", "primary_area": "", "supplementary_material": "/attachment/7bbee31b806c9688b15216dca1fa441cb9f56fcf.zip", "author": "Zhiyuan Li;Tianhao Wang;Sanjeev Arora", "authorids": "~Zhiyuan_Li2;~Tianhao_Wang1;~Sanjeev_Arora1", "gender": "M;M;", "homepage": "https://zhiyuanli.ttic.edu;https://tianhaowang.ttic.edu;http://www.cs.princeton.edu/~arora/", "dblp": "l/ZhiyuanLi;145/3288-2;a/SArora", "google_scholar": "https://scholar.google.com/citations?hl=en;m45LD1kAAAAJ;RUP4S68AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhiyuan_Li2;~Tianhao_Wang1;~Sanjeev_Arora1", "aff": "Department of Computer Science, Princeton University;Yale University;Princeton University", "aff_domain": "cs.princeton.edu;yale.edu;princeton.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nli2022what,\ntitle={What Happens after {SGD} Reaches Zero Loss? 
--A Mathematical Framework},\nauthor={Zhiyuan Li and Tianhao Wang and Sanjeev Arora},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=siCt4xZn5Ve}\n}", "github": "", "project": "", "reviewers": "6sCC;uj7w;TEtd;Msxe", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "3;3;4;5", "correctness": "4;4;4;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "2;3;0;0", "wc_summary_paper": "56;86;79;89", "wc_summary_review": "20;321;45;47", "wc_main_review": "259;65;460;536", "wc_review": "335;472;584;672", "wc_reply_reviewers": "0;21;0;9", "wc_reply_authors": "353;483;320;622", "reply_reviewers": "0;1;0;1", "reply_authors": "1;2;1;2", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 77.5, 12.932517156377562 ], "wc_summary_review_avg": [ 108.25, 123.29106820852839 ], "wc_main_review_avg": [ 330.0, 183.44072612154588 ], "wc_review_avg": [ 515.75, 126.15144668215264 ], "wc_reply_reviewers_avg": [ 7.5, 8.616843969807043 ], "wc_reply_authors_avg": [ 444.5, 119.22772328615522 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8703882797784891, "corr_recommendation_correctness": 0.0, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15308300525186977773&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=siCt4xZn5Ve", "email": "cs.princeton.edu;yale.edu;princeton.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Princeton University;Yale University", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.princeton.edu;https://www.yale.edu", "aff_unique_abbr": "Princeton;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "size4UxXVCY", "title": "Graph Tree Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the field of deep learning, various architectures have been developed. However, most studies are limited to specific tasks or datasets due to their fixed layer structure. In this paper, we do not express the structure delivering information as a network model but as a data structure called a graph tree. We propose two association models of graph tree neural networks (GTNNs) designed to solve the problems of existing networks by analyzing the structure of human neural networks. Defining the starting and ending points in a single graph is difficult, and a tree cannot express the relationship among sibling nodes. On the contrary, a graph tree (GT) can express leaf and root nodes as its starting and ending points and the relationship among sibling nodes. Instead of using fixed sequence layers, we create a GT for each data sample and train the GTNN according to the tree's structure. GTNNs perform data-driven learning in which the number of convolutions varies according to the depth of the tree. Moreover, these models can simultaneously learn various types of datasets through the recursive learning method. 
Depth-first convolution (DFC) encodes the interaction result from leaf nodes to the root node in a bottom-up approach, and depth-first deconvolution (DFD) decodes the interaction result from the root node to the leaf nodes in a top-down approach. To demonstrate the performance of these networks, we conducted two experiments. The first experiment tests whether various datasets can be processed by combining GTNN and feature extraction networks (processing various datasets). The second experiment tests whether the output of GTNN can embed information on all data contained in the GT (association). We compared the performance of existing networks that separately learned image, sound, and natural language datasets with the performance simultaneously learned by connecting these networks. As a result, these models learned without significant performance degradation, and the output vector contained all the information in the GT. ", "keywords": "Graph neural networks;Tree based convolutional neural network;Domain general learning", "primary_area": "", "supplementary_material": "", "author": "Seokjun Kim;Jaeeun Jang;Heeseok Jung;Hyeoncheol Kim", "authorids": "~Seokjun_Kim1;~Jaeeun_Jang1;~Heeseok_Jung1;~Hyeoncheol_Kim1", "gender": "M;M;M;M", "homepage": ";;https://ini.korea.ac.kr;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": "seokjun7607/;https://www.linkedin.com/mwlite/in/heeseok-jeong-0812a0174;;jaeeun0779/", "or_profile": "~Seokjun_Kim1;~Heeseok_Jung1;~Hyeoncheol_Kim1;~Jae_Eun_Jang1", "aff": "Korea University;;Korea University;Korea University", "aff_domain": "korea.ac.kr;;korea.ac.kr;korea.ac.kr", "position": "MS student;;Full Professor;MS student", "bibtex": "@misc{\nkim2022graph,\ntitle={Graph Tree Neural Networks},\nauthor={Seokjun Kim and Jaeeun Jang and Heeseok Jung and Hyeoncheol Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=size4UxXVCY}\n}", "github": "", "project": "", "reviewers": "sfjx;Svha;oPzs", "site": "https://openreview.net/forum?id=size4UxXVCY", "pdf_size": 0, "recommendation": "1;1;1", "confidence": "3;3;4", "correctness": "2;2;2", "technical_novelty": "2;2;2", "empirical_novelty": "0;2;0", "wc_summary_paper": "136;143;25", "wc_summary_review": "78;53;6", "wc_main_review": "383;339;80", "wc_review": "597;535;111", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "261;228;36", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 1.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.9428090415820634 ], "wc_summary_paper_avg": [ 101.33333333333333, 54.05141585153488 ], "wc_summary_review_avg": [ 45.666666666666664, 29.847761874031512 ], "wc_main_review_avg": [ 267.3333333333333, 133.6770569527754 ], "wc_review_avg": [ 414.3333333333333, 215.97736506917158 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 175.0, 99.20685460188726 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea University", "aff_unique_dep": "", "aff_unique_url": "https://www.korea.ac.kr", "aff_unique_abbr": "KU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South 
Korea" }, { "id": "sk63PSiUyci", "title": "AI-SARAH: Adaptive and Implicit Stochastic Recursive Gradient Methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present AI-SARAH, a practical variant of SARAH. As a variant of SARAH, this algorithm employs the stochastic recursive gradient yet adjusts step-size based on local geometry. AI-SARAH implicitly computes step-size and efficiently estimates local Lipschitz smoothness of stochastic functions. It is fully adaptive, tune-free, straightforward to implement, and computationally efficient. We provide technical insight and intuitive illustrations on its design and convergence. We conduct extensive empirical analysis and demonstrate its strong performance compared with its classical counterparts and other state-of-the-art first-order methods in solving convex machine learning problems.", "keywords": "practical variant of SARAH;adaptive step-size;tune-free algorithm;implicit approach;convex optimization in machine learning", "primary_area": "", "supplementary_material": "/attachment/ca102a58e97bee94973ed15246222eba351853c4.zip", "author": "Zheng Shi;Nicolas Loizou;Peter Richt\u00e1rik;Martin Takac", "authorids": "~Zheng_Shi2;~Nicolas_Loizou1;~Peter_Richt\u00e1rik1;~Martin_Takac3", "gender": "M;M;M;M", "homepage": "https://www.zhengqxhs.com/;https://nicolasloizou.github.io/;http://mtakac.com;https://richtarik.org", "dblp": ";173/4958;42/3759-1.html;62/8001", "google_scholar": "YurlyCoAAAAJ;https://scholar.google.co.uk/citations?user=mvDmzAQAAAAJ;qKQD-2cAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-7202-0217;;0000-0001-7455-2025;0000-0003-4380-5848", "linkedin": "zhengmartinshi/;;martintakac/;richtarik/", "or_profile": "~Zheng_Shi2;~Nicolas_Loizou1;~Martin_Takac3;~Peter_Richtarik1", "aff": "Lehigh University;Johns Hopkins University;Mohamed bin Zayed University of Artificial Intelligence;King Abdullah University of Science and Technology (KAUST)", "aff_domain": "lehigh.edu;jhu.edu;mbzuai.ac.ae;kaust.edu.sa", "position": "PhD Candidate;Assistant Professor;Associate Professor;Full Professor", "bibtex": "@misc{\nshi2022aisarah,\ntitle={{AI}-{SARAH}: Adaptive and Implicit Stochastic Recursive Gradient Methods},\nauthor={Zheng Shi and Nicolas Loizou and Peter Richt{\\'a}rik and Martin Takac},\nyear={2022},\nurl={https://openreview.net/forum?id=sk63PSiUyci}\n}", "github": "", "project": "", "reviewers": "4uKg;Y92t;CpvK;G45K", "site": "https://openreview.net/forum?id=sk63PSiUyci", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;0;3;2", "wc_summary_paper": "108;65;53;29", "wc_summary_review": "338;52;101;20", "wc_main_review": "80;184;395;521", "wc_review": "526;301;549;570", "wc_reply_reviewers": "10;178;227;87", "wc_reply_authors": "1027;984;1470;873", "reply_reviewers": "1;1;1;2", "reply_authors": "5;6;6;5", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 63.75, 28.647643882176418 ], "wc_summary_review_avg": [ 127.75, 124.76853569710595 ], "wc_main_review_avg": [ 295.0, 172.93206758724654 ], "wc_review_avg": [ 486.5, 108.22314909482166 ], "wc_reply_reviewers_avg": [ 125.5, 83.48802309313594 ], "wc_reply_authors_avg": [ 1088.5, 227.31311004867274 ], "reply_reviewers_avg": [ 1.25, 
0.4330127018922193 ], "reply_authors_avg": [ 5.5, 0.5 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2487982500957671681&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Lehigh University;Johns Hopkins University;Mohamed bin Zayed University of Artificial Intelligence;King Abdullah University of Science and Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.lehigh.edu;https://www.jhu.edu;https://mbzuai.ac.ae;https://www.kaust.edu.sa", "aff_unique_abbr": "Lehigh;JHU;MBZUAI;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "United States;United Arab Emirates;Saudi Arabia" }, { "id": "snJ1WYQOR5", "title": "Neural Plenoptic Sampling: Capture Light-field from Imaginary Eyes", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The Plenoptic function describes the light rays observed from any given position in every viewing direction. It is often parameterized as a 5-D function $L(x, y, z, \\theta, \\phi)$ for a static scene. Capturing all the plenoptic functions in the space of interest is paramount for Image-Based Rendering (IBR) and Novel View Synthesis (NVS). It encodes a complete light-field (\\ie, lumigraph) therefore allows one to freely roam in the space and view the scene from any location in any direction. However, achieving this goal by conventional light-field capture technique is expensive, requiring densely sampling the ray space using arrays of cameras or lenses. This paper proposes a much simpler solution to address this challenge by using only a small number of sparsely configured camera views as input. Specifically, we adopt a simple Multi-Layer Perceptron (MLP) network as a universal function approximator to learn the plenoptic function at every position in the space of interest. By placing virtual viewpoints (dubbed `imaginary eyes') at thousands of randomly sampled locations and leveraging multi-view geometric relationship, we train the MLP to regress the plenoptic function for the space. Our network is trained on a per-scene basis, and the training time is relatively short (in the order of tens of minutes). When the model is converged, we can freely render novel images. 
Extensive experiments demonstrate that our method well approximates the complete plenoptic function and generates high-quality results.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f7338cfa4bc81616c469a67a99b703ea33575c37.zip", "author": "Junxuan Li;Yujiao Shi;Hongdong Li", "authorids": "~Junxuan_Li2;~Yujiao_Shi1;~Hongdong_Li1", "gender": "M;F;M", "homepage": "https://junxuan-li.github.io/;https://shiyujiao.github.io/;http://users.cecs.anu.edu.au/~hongdong/", "dblp": "207/7737;159/2546;59/4859.html", "google_scholar": "b2_zvDMAAAAJ;rVsRpZEAAAAJ;https://scholar.google.com.tw/citations?hl=en", "orcid": "0000-0003-4375-3443;0000-0001-6028-9051;", "linkedin": "junxuan-li-335421a6/;yujiao-shi-053a12198/;", "or_profile": "~Junxuan_Li2;~Yujiao_Shi1;~Hongdong_Li1", "aff": "Australian National University;Australian National University;Australian National University", "aff_domain": "anu.edu.au;anu.edu.au;anu.edu.au", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nli2022neural,\ntitle={Neural Plenoptic Sampling: Capture Light-field from Imaginary Eyes},\nauthor={Junxuan Li and Yujiao Shi and Hongdong Li},\nyear={2022},\nurl={https://openreview.net/forum?id=snJ1WYQOR5}\n}", "github": "", "project": "", "reviewers": "sZgW;T8yS;xxR7;borE", "site": "https://openreview.net/forum?id=snJ1WYQOR5", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;5", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "273;122;82;70", "wc_summary_review": "40;26;40;94", "wc_main_review": "348;182;549;408", "wc_review": "661;330;671;572", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 136.75, 80.98572405060042 ], "wc_summary_review_avg": [ 50.0, 26.038433132583073 ], "wc_main_review_avg": [ 371.75, 131.6251780625576 ], "wc_review_avg": [ 558.5, 137.43816791561215 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Bf5rQ22HHYEJ:scholar.google.com/&scioq=Neural+Plenoptic+Sampling:+Capture+Light-field+from+Imaginary+Eyes&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Australian National University", "aff_unique_dep": "", "aff_unique_url": "https://www.anu.edu.au", "aff_unique_abbr": "ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Australia" }, { "title": "Neural Parameter Allocation Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6399", "id": "srtIXtySfT4", "poster": "", "openreview": "https://openreview.net/forum?id=srtIXtySfT4", "slides": "https://iclr.cc/virtual/2022/poster/6399", "video": "https://iclr.cc/virtual/2022/poster/6399", "author_site": "Bryan Plummer, Nikoli Dryden, Julius Frost, Torsten Hoefler, Kate Saenko", "tldr": "", "abstract": "Training neural networks requires increasing 
amounts of memory. Parameter sharing can reduce memory and communication costs, but existing methods assume networks have many identical layers and utilize hand-crafted sharing strategies that fail to generalize. We introduce Neural Parameter Allocation Search (NPAS), a novel task where the goal is to train a neural network given an arbitrary, fixed parameter budget. NPAS covers both low-budget regimes, which produce compact networks, as well as a novel high-budget regime, where additional capacity can be added to boost performance without increasing inference FLOPs. To address NPAS, we introduce Shapeshifter Networks (SSNs), which automatically learn where and how to share parameters in a network to support any parameter budget without requiring any changes to the architecture or loss function. NPAS and SSNs provide a complete framework for addressing generalized parameter sharing, and can also be combined with prior work for additional performance gains. We demonstrate the effectiveness of our approach using nine network architectures across four diverse tasks, including ImageNet classification and transformers.", "keywords": "efficient training methods;cross-layer parameter sharing", "primary_area": "", "supplementary_material": "", "author": "Bryan A. Plummer;Nikoli Dryden;Julius Frost;Torsten Hoefler;Kate Saenko", "authorids": "~Bryan_A._Plummer1;~Nikoli_Dryden1;~Julius_Frost1;~Torsten_Hoefler1;~Kate_Saenko1", "gender": ";M;;F;M", "homepage": "https://ndryden.com;https://github.com/juliusfrost/;;http://ai.bu.edu;http://bryanplummer.com/", "dblp": "148/1273;;16/3869;88/2754;163/2330", "google_scholar": "nRhl3Q4AAAAJ;;;https://scholar.google.com.tw/citations?user=9xDADY4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-5704-7614;", "linkedin": ";;;;", "or_profile": "~Nikoli_Dryden1;~Julius_Frost1;~Torsten_Hoefler1;~Kate_Saenko1;~Bryan_Allen_Plummer1", "aff": "Swiss Federal Institute of Technology;;Swiss Federal Institute of Technology;Boston University, Boston University;Boston University", "aff_domain": "ethz.ch;;ethz.ch;bu.edu;bu.edu", "position": "Postdoc;;Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nplummer2022neural,\ntitle={Neural Parameter Allocation Search},\nauthor={Bryan A. 
Plummer and Nikoli Dryden and Julius Frost and Torsten Hoefler and Kate Saenko},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=srtIXtySfT4}\n}", "github": "", "project": "", "reviewers": "jt4B;cCHH;9sBX;p84E", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "112;99;51;46", "wc_summary_review": "26;22;27;31", "wc_main_review": "391;116;172;206", "wc_review": "529;237;250;283", "wc_reply_reviewers": "0;15;271;44", "wc_reply_authors": "1293;817;1460;718", "reply_reviewers": "0;1;2;1", "reply_authors": "2;2;3;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 77.0, 28.922309727959142 ], "wc_summary_review_avg": [ 26.5, 3.2015621187164243 ], "wc_main_review_avg": [ 221.25, 103.13916569373635 ], "wc_review_avg": [ 324.75, 119.10998069011681 ], "wc_reply_reviewers_avg": [ 82.5, 109.97386053058244 ], "wc_reply_authors_avg": [ 1072.0, 312.14019286211766 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.6882472016116854, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15625823340904525164&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 23, "pdf": "https://openreview.net/pdf?id=srtIXtySfT4", "email": "ethz.ch;;ethz.ch;bu.edu;bu.edu", "author_num": 5, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Boston University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.bu.edu", "aff_unique_abbr": "ETH Zurich;BU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Boston", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "Switzerland;United States" }, { "id": "swRxhFpK5ds", "title": "One Timestep Is All You Need: Training Spiking Neural Networks with Ultra Low Latency", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Spiking Neural Networks (SNNs) can be energy-efficient alternatives to commonly used deep neural networks (DNNs). Through event-driven information processing, SNNs can considerably reduce the compute requirements of DNNs. However, high inference latency is a significant hindrance to their deployment. Computation over multiple timesteps increases latency and incurs memory access overhead of fetching membrane potentials, both of which lessen the energy benefits of SNNs. Hence, latency reduction is pivotal to obtaining SNNs\nwith high energy efficiency. However, reducing latency can have an adverse effect on accuracy. To obtain solutions which optimize the accuracy-energy-latency trade-off, we propose an iterative training method which starts with an SNN of T (T>1) timesteps, and reduces T every iteration of training, with neuron threshold and leak as trainable parameters. This results in a continuum of SNNs, starting from an SNN trained with T timesteps, all the way up to unit latency. We use direct input encoding (analog inputs from pixels) with the first\nconvolutional layer of the network, composed of leaky integrate-and-fire (LIF) neurons, acting as the spike generator. 
We choose T=5 as our starting point, since it is the minimum reported latency to achieve satisfactory performance on ImageNet. Training SNNs directly with 1 timestep results in convergence failure due to layerwise spike vanishing and difficulty in finding optimum thresholds. The proposed iterative\ntraining approach overcomes this by enabling the learning of suitable layerwise thresholds with backpropagation while maintaining sufficient spiking activity, starting from T timesteps up to 1. Using the proposed training algorithm, we achieve top-1 accuracy of 93.05%, 70.15% and 67.71% on CIFAR-10, CIFAR-100 and ImageNet, respectively, with VGG16, in just 1 timestep. Compared to a\n5 timestep SNN, the 1 timestep SNN achieves ~5X enhancement in efficiency, with an accuracy drop of ~1%. In addition, 1 timestep SNNs perform inference with 5X reduced latency compared to state-of-the-art SNNs, and provide 25-33X higher energy efficiency compared to DNNs, while being comparable to them in performance. The proposed method also enables training reinforcement learning\nagents on Cartpole and Atari Pong environments, which infer using 1 timestep.", "keywords": "Spiking Neural Networks;One timestep Inference;Iterative Initialization and Retraining;Ultra-high energy efficiency", "primary_area": "", "supplementary_material": "/attachment/6f7e21814e77f847a338283c516ad84439ac4c42.zip", "author": "Sayeed Shafayet Chowdhury;Nitin Rathi;Kaushik Roy", "authorids": "~Sayeed_Shafayet_Chowdhury3;~Nitin_Rathi1;~Kaushik_Roy1", "gender": "M;M;M", "homepage": ";;https://engineering.purdue.edu/NRL/Group", "dblp": ";;r/KaushikRoy", "google_scholar": "646ndV4AAAAJ;;to4P8KgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sayeed_Shafayet_Chowdhury3;~Nitin_Rathi1;~Kaushik_Roy1", "aff": "Purdue University;;Purdue University", "aff_domain": "purdue.edu;;purdue.edu", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nchowdhury2022one,\ntitle={One Timestep Is All You Need: Training Spiking Neural Networks with Ultra Low Latency},\nauthor={Sayeed Shafayet Chowdhury and Nitin Rathi and Kaushik Roy},\nyear={2022},\nurl={https://openreview.net/forum?id=swRxhFpK5ds}\n}", "github": "", "project": "", "reviewers": "pyfB;JwQ3;K6Fw", "site": "https://openreview.net/forum?id=swRxhFpK5ds", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;5;5", "correctness": "2;2;3", "technical_novelty": "2;1;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "18;61;112", "wc_summary_review": "208;43;9", "wc_main_review": "23;674;275", "wc_review": "249;778;396", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 63.666666666666664, 38.42163742245016 ], "wc_summary_review_avg": [ 86.66666666666667, 86.91119349977629 ], "wc_main_review_avg": [ 324.0, 268.01865606707304 ], "wc_review_avg": [ 474.3333333333333, 222.953408187052 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 1.0, "gs_citation": 41, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=11999826237330556049&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "swbAS4OpXW", "title": "One-Shot Generative Domain Adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work aims at transferring a Generative Adversarial Network (GAN) pre-trained on one image domain to a new domain $\\textit{referring to as few as just one target image}$. The main challenge is that, under limited supervision, it is extremely difficult to synthesize photo-realistic and highly diverse images, while acquiring representative characters of the target. Different from existing approaches that adopt the vanilla fine-tuning strategy, we import two lightweight modules to the generator and the discriminator respectively. Concretely, we introduce an $\\textit{attribute adaptor}$ into the generator yet freeze its original parameters, through which it can reuse the prior knowledge to the most extent and hence maintain the synthesis quality and diversity. We then equip the well-learned discriminator backbone with an $\\textit{attribute classifier}$ to ensure that the generator captures the appropriate characters from the reference. Furthermore, considering the poor diversity of the training data ($\\textit{i.e.}$, as few as only one image), we propose to also constrain the diversity of the generative domain in the training process, alleviating the optimization difficulty. Our approach brings appealing results under various settings, $\\textit{substantially}$ surpassing state-of-the-art alternatives, especially in terms of synthesis diversity. 
Noticeably, our method works well even with large domain gaps, and robustly converges $\\textit{within a few minutes}$ for each experiment.", "keywords": "generative domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Ceyuan Yang;Yujun Shen;Zhiyi Zhang;Yinghao Xu;Jiapeng Zhu;Zhirong Wu;Bolei Zhou", "authorids": "~Ceyuan_Yang2;~Yujun_Shen1;~Zhiyi_Zhang1;~Yinghao_Xu1;~Jiapeng_Zhu1;~Zhirong_Wu1;~Bolei_Zhou5", "gender": "M;;M;M;M;M;M", "homepage": "https://ceyuan.me/;;;https://justimyhxu.github.io/;;https://www.microsoft.com/en-us/research/people/wuzhiron/;https://boleizhou.github.io/", "dblp": "218/2676;;;232/2482;169/7704;147/5025;46/8066", "google_scholar": "Rfj4jWoAAAAJ;;;https://scholar.google.com/citations?hl=en;-ACBm-gAAAAJ;lH4zgcIAAAAJ;9D4aG8AAAAAJ", "orcid": ";;;;;;", "linkedin": ";;zhiyi-zhang-3088b0163/;;;;", "or_profile": "~Ceyuan_Yang2;~Yujun_Shen1;~Zhiyi_Zhang1;~Yinghao_Xu1;~Jiapeng_Zhu1;~Zhirong_Wu1;~Bolei_Zhou5", "aff": "The Chinese University of Hong Kong;;University of Southern California;Chinese University of Hong Kong;Hong Kong University of Science and Technology;Microsoft Research;University of California, Los Angeles", "aff_domain": "cuhk.edu.hk;;usc.edu;ie.cuhk.edu.hk;hkust.edu;microsoft.com;ucla.edu", "position": "PhD student;;MS student;PhD student;PhD student;Researcher;Assistant Professor", "bibtex": "@misc{\nyang2022oneshot,\ntitle={One-Shot Generative Domain Adaptation},\nauthor={Ceyuan Yang and Yujun Shen and Zhiyi Zhang and Yinghao Xu and Jiapeng Zhu and Zhirong Wu and Bolei Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=swbAS4OpXW}\n}", "github": "", "project": "", "reviewers": "Zh5e;CemL;NUUH", "site": "https://openreview.net/forum?id=swbAS4OpXW", "pdf_size": 0, "recommendation": "3;5;8", "confidence": "5;5;5", "correctness": "1;3;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;4", "wc_summary_paper": "96;81;77", "wc_summary_review": "16;63;54", "wc_main_review": "619;500;332", "wc_review": "731;644;463", "wc_reply_reviewers": "895;158;0", "wc_reply_authors": "2033;1279;199", "reply_reviewers": "3;1;0", "reply_authors": "5;3;1", "recommendation_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 84.66666666666667, 8.178562764256865 ], "wc_summary_review_avg": [ 44.333333333333336, 20.368821489936252 ], "wc_main_review_avg": [ 483.6666666666667, 117.73510757440008 ], "wc_review_avg": [ 612.6666666666666, 111.63133769491233 ], "wc_reply_reviewers_avg": [ 351.0, 390.03675040522353 ], "wc_reply_authors_avg": [ 1170.3333333333333, 752.6598759658235 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9538209664765321, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2135481622440516382&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;2;3;4", "aff_unique_norm": "Chinese University of Hong Kong;University of Southern California;Hong Kong University of Science and Technology;Microsoft;University of California, Los Angeles", "aff_unique_dep": ";;;Microsoft Research;", "aff_unique_url": 
"https://www.cuhk.edu.hk;https://www.usc.edu;https://www.ust.hk;https://www.microsoft.com/en-us/research;https://www.ucla.edu", "aff_unique_abbr": "CUHK;USC;HKUST;MSR;UCLA", "aff_campus_unique_index": "0;1;0;0;1", "aff_campus_unique": "Hong Kong SAR;Los Angeles;", "aff_country_unique_index": "0;1;0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Learning to Guide and to be Guided in the Architect-Builder Problem", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6532", "id": "swiyAeGzFhQ", "poster": "", "openreview": "https://openreview.net/forum?id=swiyAeGzFhQ", "slides": "https://iclr.cc/virtual/2022/poster/6532", "video": "https://iclr.cc/virtual/2022/poster/6532", "author_site": "Paul Barde, Tristan Karch, Derek Nowrouzezahrai, Cl\u00e9ment Moulin-Frier, Chris J Pal, Pierre-Yves Oudeyer", "tldr": "", "abstract": "We are interested in interactive agents that learn to coordinate, namely, a $builder$ -- which performs actions but ignores the goal of the task, i.e. has no access to rewards -- and an $architect$ which guides the builder towards the goal of the task. \nWe define and explore a formal setting where artificial agents are equipped with mechanisms that allow them to simultaneously learn a task while at the same time evolving a shared communication protocol. \nIdeally, such learning should only rely on high-level communication priors and be able to handle a large variety of tasks and meanings while deriving communication protocols that can be reused across tasks.\nThe field of Experimental Semiotics has shown the extent of human proficiency at learning from a priori unknown instructions meanings. Therefore, we take inspiration from it and present the Architect-Builder Problem (ABP): an asymmetrical setting in which an architect must learn to guide a builder towards constructing a specific structure. The architect knows the target structure but cannot act in the environment and can only send arbitrary messages to the builder. The builder on the other hand can act in the environment, but receives no rewards nor has any knowledge about the task, and must learn to solve it relying only on the messages sent by the architect. Crucially, the meaning of messages is initially not defined nor shared between the agents but must be negotiated throughout learning.\nUnder these constraints, we propose Architect-Builder Iterated Guiding (ABIG), a solution to the Architect-Builder Problem where the architect leverages a learned model of the builder to guide it while the builder uses self-imitation learning to reinforce its guided behavior. To palliate to the non-stationarity induced by the two agents concurrently learning, ABIG structures the sequence of interactions between the agents into interaction frames. We analyze the key learning mechanisms of ABIG and test it in a 2-dimensional instantiation of the ABP where tasks involve grasping cubes, placing them at a given location, or building various shapes. In this environment, ABIG results in a low-level, high-frequency, guiding communication protocol that not only enables an architect-builder pair to solve the task at hand, but that can also generalize to unseen tasks. 
", "keywords": "Social Learning;Interactive Learning;Teacher-Student Learning;Computational Experimental Semiotics;Socially Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Paul Barde;Tristan Karch;Derek Nowrouzezahrai;Cl\u00e9ment Moulin-Frier;Christopher Pal;Pierre-Yves Oudeyer", "authorids": "~Paul_Barde1;~Tristan_Karch1;~Derek_Nowrouzezahrai1;~Cl\u00e9ment_Moulin-Frier2;~Christopher_Pal1;~Pierre-Yves_Oudeyer1", "gender": "M;M;Not Specified;M;;M", "homepage": "https://pbarde.github.io/;http://tristankarch.com;https://www.cim.mcgill.ca/~derek/;http://clement-moulin-frier.github.io/;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao;http://www.pyoudeyer.com", "dblp": "246/4858;https://dblp.uni-trier.de/pers/hd/k/Karch:Tristan;30/4225;124/0220;45/1217;33/5513", "google_scholar": "FoxktlkAAAAJ;unMhjagAAAAJ;https://scholar.google.ca/citations?user=nCZ2PMcAAAAJ;rBnV60QAAAAJ;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ;https://scholar.google.fr/citations?user=gCqGj4sAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;pierreyvesoudeyer/", "or_profile": "~Paul_Barde1;~Tristan_Karch1;~Derek_Nowrouzezahrai1;~Cl\u00e9ment_Moulin-Frier2;~Christopher_Pal1;~Pierre-Yves_Oudeyer1", "aff": "INRIA;INRIA;McGill University;Inria;Polytechnique Montreal;Microsoft", "aff_domain": "inria.fr;inria.fr;mcgill.ca;inria.fr;polymtl.ca;microsoft.com", "position": "PhD student;PhD student;Full Professor;Associate Professor;Full Professor;Visiting researcher", "bibtex": "@inproceedings{\nbarde2022learning,\ntitle={Learning to Guide and to be Guided in the Architect-Builder Problem},\nauthor={Paul Barde and Tristan Karch and Derek Nowrouzezahrai and Cl{\\'e}ment Moulin-Frier and Christopher Pal and Pierre-Yves Oudeyer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=swiyAeGzFhQ}\n}", "github": "", "project": "", "reviewers": "rTG5;BHGy;FMPZ;hMeT", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;3;4;4", "correctness": "2;4;3;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "348;36;166;130", "wc_summary_review": "107;49;59;55", "wc_main_review": "388;327;363;314", "wc_review": "843;412;588;499", "wc_reply_reviewers": "0;932;0;85", "wc_reply_authors": "547;2743;315;310", "reply_reviewers": "0;3;0;1", "reply_authors": "1;6;1;2", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 170.0, 113.19893992436502 ], "wc_summary_review_avg": [ 67.5, 23.08137777516758 ], "wc_main_review_avg": [ 348.0, 29.248931604419333 ], "wc_review_avg": [ 585.5, 161.1652878258839 ], "wc_reply_reviewers_avg": [ 254.25, 392.83480943012165 ], "wc_reply_authors_avg": [ 978.75, 1023.0807336178314 ], "reply_reviewers_avg": [ 1.0, 1.224744871391589 ], "reply_authors_avg": [ 2.5, 2.0615528128088303 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.16012815380508713, "corr_recommendation_correctness": 0.7526178090063818, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14269266547395531261&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=swiyAeGzFhQ", "email": "inria.fr;inria.fr;mcgill.ca;inria.fr;polymtl.ca;microsoft.com", "author_num": 6, 
"aff_unique_index": "0;0;1;0;2;3", "aff_unique_norm": "INRIA;McGill University;Polytechnique Montreal;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://www.inria.fr;https://www.mcgill.ca;https://www.polymtl.ca;https://www.microsoft.com", "aff_unique_abbr": "INRIA;McGill;PolyMTL;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;1;0;1;2", "aff_country_unique": "France;Canada;United States" }, { "title": "Learning to Map for Active Semantic Goal Navigation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6293", "id": "swrMQttr6wN", "poster": "", "openreview": "https://openreview.net/forum?id=swrMQttr6wN", "slides": "https://iclr.cc/virtual/2022/poster/6293", "video": "https://iclr.cc/virtual/2022/poster/6293", "author_site": "Georgios Georgakis, Bernadette Bucher, Karl Schmeckpeper, Siddharth Singh, Kostas Daniilidis", "tldr": "", "abstract": "We consider the problem of object goal navigation in unseen environments. Solving this problem requires learning of contextual semantic priors, a challenging endeavour given the spatial and semantic variability of indoor environments. Current methods learn to implicitly encode these priors through goal-oriented navigation policy functions operating on spatial representations that are limited to the agent's observable areas. In this work, we propose a novel framework that actively learns to generate semantic maps outside the field of view of the agent and leverages the uncertainty over the semantic classes in the unobserved areas to decide on long term goals. We demonstrate that through this spatial prediction strategy, we are able to learn semantic priors in scenes that can be leveraged in unknown environments. Additionally, we show how different objectives can be defined by balancing exploration with exploitation during searching for semantic targets. 
Our method is validated in the visually realistic environments of the Matterport3D dataset and shows improved results on object goal navigation over competitive baselines.", "keywords": "visual navigation;semantic map;uncertainty estimation", "primary_area": "", "supplementary_material": "", "author": "Georgios Georgakis;Bernadette Bucher;Karl Schmeckpeper;Siddharth Singh;Kostas Daniilidis", "authorids": "~Georgios_Georgakis1;~Bernadette_Bucher1;~Karl_Schmeckpeper1;~Siddharth_Singh5;~Kostas_Daniilidis1", "gender": "M;F;;M;M", "homepage": "https://ggeorgak11.github.io/;http://bernadettekbucher.com;https://sites.google.com/view/karlschmeckpeper/;http://www.cis.upenn.edu/~kostas;https://singh-sid930.github.io/", "dblp": "186/8108;251/5461;245/5630;d/KostasDaniilidis;", "google_scholar": "ndbhEbYAAAAJ;VIZvaGsAAAAJ;E2kpqtkAAAAJ;dGs2BcIAAAAJ;M6tXTboAAAAJ", "orcid": ";;0000-0003-4989-2022;0000-0003-0498-0758;", "linkedin": ";bernadette-bucher-09898536/;;;", "or_profile": "~Georgios_Georgakis1;~Bernadette_Bucher1;~Karl_Schmeckpeper1;~Kostas_Daniilidis1;~siddharth_Singh4", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;Amazon", "aff_domain": "upenn.edu;upenn.edu;upenn.edu;upenn.edu;amazon.com", "position": "Postdoc;PhD student;PhD student;Full Professor;Software Development Engineer", "bibtex": "@inproceedings{\ngeorgakis2022learning,\ntitle={Learning to Map for Active Semantic Goal Navigation},\nauthor={Georgios Georgakis and Bernadette Bucher and Karl Schmeckpeper and Siddharth Singh and Kostas Daniilidis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=swrMQttr6wN}\n}", "github": "", "project": "", "reviewers": "Nmyk;qy8x;CF2f", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "2;4;3", "technical_novelty": "4;3;4", "empirical_novelty": "4;2;4", "wc_summary_paper": "95;75;163", "wc_summary_review": "36;71;66", "wc_main_review": "141;338;789", "wc_review": "272;484;1018", "wc_reply_reviewers": "158;37;0", "wc_reply_authors": "617;1358;771", "reply_reviewers": "2;1;0", "reply_authors": "3;3;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_summary_paper_avg": [ 111.0, 37.66519171153476 ], "wc_summary_review_avg": [ 57.666666666666664, 15.456030825826172 ], "wc_main_review_avg": [ 422.6666666666667, 271.234625780379 ], "wc_review_avg": [ 591.3333333333334, 313.86762956521795 ], "wc_reply_reviewers_avg": [ 65.0, 67.47345156924068 ], "wc_reply_authors_avg": [ 915.3333333333334, 319.2640843495484 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3273268353539886, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=372304590495842191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=swrMQttr6wN", "email": "upenn.edu;upenn.edu;upenn.edu;upenn.edu;amazon.com", "author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "University of Pennsylvania;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": 
"https://www.upenn.edu;https://www.amazon.com", "aff_unique_abbr": "UPenn;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "sxpUavxXE0v", "title": "Decoupled Contrastive Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Contrastive learning (CL) is one of the most successful paradigms for self-supervised learning (SSL). In a principled way, it considers two augmented ''views'' of the same image as positive to be pulled closer, and all other images negative to be pushed further apart. However, behind the impressive success of CL-based techniques, their formulation often relies on heavy-computation settings, including large sample batches, extensive training epochs, etc. We are thus motivated to tackle these issues and aim at establishing a simple, efficient, and yet competitive baseline of contrastive learning. Specifically, we identify, from theoretical and empirical studies, a noticeable negative-positive-coupling (NPC) effect in the widely used cross-entropy (infoNCE) loss, leading to unsuitable learning efficiency with respect to the batch size. Indeed the phenomenon tends to be neglected in that optimizing infoNCE loss with a small-size batch is effective in solving easier SSL tasks. By properly addressing the NPC effect, we reach a decoupled contrastive learning (DCL) objective function, significantly improving SSL efficiency. DCL can achieve competitive performance, requiring neither large batches in SimCLR, momentum encoding in MoCo, or large epochs. We demonstrate the usefulness of DCL in various benchmarks, while manifesting its robustness being much less sensitive to suboptimal hyperparameters. Notably, our approach achieves $66.9\\%$ ImageNet top-1 accuracy using batch size 256 within 200 epochs pre-training, outperforming its baseline SimCLR by $5.1\\%$. With further optimized hyperparameters, DCL can improve the accuracy to $68.2\\%$. 
We believe DCL provides a valuable baseline for future contrastive learning-based SSL studies.", "keywords": "Contrastive Learning;Unsupervised Learning;Self-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Chun-Hsiao Yeh;Cheng-Yao Hong;Yen-Chi Hsu;Tyng-Luh Liu;Yubei Chen;Yann LeCun", "authorids": "~Chun-Hsiao_Yeh1;~Cheng-Yao_Hong2;~Yen-Chi_Hsu1;~Tyng-Luh_Liu1;~Yubei_Chen1;~Yann_LeCun1", "gender": "M;M;M;;M;M", "homepage": "https://danielchyeh.github.io/;;;https://www.iis.sinica.edu.tw/pages/liutyng/index_en.html;https://redwood.berkeley.edu/people/yubei-chen/;http://yann.lecun.com", "dblp": "203/3213;251/8476;;68/2368;30/10064;l/YannLeCun", "google_scholar": "jUAAhpMAAAAJ;XVJf16kAAAAJ;https://scholar.google.com.tw/citations?user=q7TeyD4AAAAJ;https://scholar.google.com.tw/citations?user=20N2rlkAAAAJ;WeyLqFUAAAAJ;WLN3QrAAAAAJ", "orcid": ";;;0000-0002-8366-5213;;", "linkedin": "danielchyeh/;;hsu-yen-chi-9285b312a/;;yubei-chen-05998a39/;", "or_profile": "~Chun-Hsiao_Yeh1;~Cheng-Yao_Hong2;~Yen-Chi_Hsu1;~Tyng-Luh_Liu1;~Yubei_Chen1;~Yann_LeCun1", "aff": "Adobe Inc.;Academia Sinica;National Taiwan University;Academia Sinica;Facebook AI Research;New York University", "aff_domain": "adobe.com;iis.sinica.edu.tw;csie.ntu.edu.tw;sinica.edu.tw;facebook.com;nyu.edu", "position": "Research Intern;MS student;PhD student;Principal Researcher;Postdoc Researcher;Full Professor", "bibtex": "@misc{\nyeh2022decoupled,\ntitle={Decoupled Contrastive Learning},\nauthor={Chun-Hsiao Yeh and Cheng-Yao Hong and Yen-Chi Hsu and Tyng-Luh Liu and Yubei Chen and Yann LeCun},\nyear={2022},\nurl={https://openreview.net/forum?id=sxpUavxXE0v}\n}", "github": "", "project": "", "reviewers": "tUiB;GwgY;5Ly8;mzLp;sU5a", "site": "https://openreview.net/forum?id=sxpUavxXE0v", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;4;4;4;3", "correctness": "2;2;3;4;3", "technical_novelty": "3;2;3;2;3", "empirical_novelty": "3;2;3;2;3", "wc_summary_paper": "70;115;58;36;39", "wc_summary_review": "42;29;29;12;28", "wc_main_review": "496;499;288;197;141", "wc_review": "608;643;375;245;208", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "45;0;23;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;0;1;0;0", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 63.6, 28.56991424558359 ], "wc_summary_review_avg": [ 28.0, 9.528903399657276 ], "wc_main_review_avg": [ 324.2, 149.07903943881584 ], "wc_review_avg": [ 415.8, 180.321268850904 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 13.6, 18.05103875127412 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.4, 0.48989794855663565 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.8728715609439696, "gs_citation": 268, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16622669985771512775&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1;2;1;3;4", "aff_unique_norm": "Adobe;Academia Sinica;National Taiwan University;Meta;New York University", "aff_unique_dep": "Adobe Inc.;;;Facebook AI Research;", "aff_unique_url": "https://www.adobe.com;https://www.sinica.edu.tw;https://www.ntu.edu.tw;https://research.facebook.com;https://www.nyu.edu", "aff_unique_abbr": 
"Adobe;Academia Sinica;NTU;FAIR;NYU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;1;1;1;0;0", "aff_country_unique": "United States;China" }, { "id": "syzTg1vyBtL", "title": "Congested bandits: Optimal routing via short-term resets", "track": "main", "status": "Reject", "tldr": "", "abstract": "For traffic routing platforms, the choice of which route to recommend to a user depends on the congestion on these routes -- indeed, an individual's utility depends on the number of people using the recommended route at that instance. Motivated by this, we introduce the problem of Congested Bandits where each arm's reward is allowed to depend on the number of times it was played in the past $\\Delta$ timesteps. This dependence on past history of actions leads to a dynamical system where an algorithm's present choices also affect its future pay-offs, and requires an algorithm to plan for this. We study the congestion aware formulation in the multi-armed bandit (MAB) setup and in the contextual bandit setup with linear rewards. For the multi-armed setup, we propose a UCB style algorithm and show that its policy regret scales as $\\tilde{O}(\\sqrt{K \\Delta T})$. For the linear contextual bandit setup, our algorithm, based on an iterative least squares planner, achieves policy regret $\\tilde{O}(\\sqrt{dT} + \\Delta)$. From an experimental standpoint, we corroborate the no-regret properties of our algorithms via a simulation study.", "keywords": "Multi-armed bandits;linear contextual bandits;policy regret;congestion aware routing", "primary_area": "", "supplementary_material": "/attachment/100aec0823f2ed9ea0824a56dbf0252a16cbb13a.zip", "author": "Pranjal Awasthi;Kush Bhatia;Sreenivas Gollapudi;Kostas Kollias", "authorids": "~Pranjal_Awasthi3;~Kush_Bhatia3;~Sreenivas_Gollapudi2;~Kostas_Kollias1", "gender": ";M;M;M", "homepage": "https://www.cs.rutgers.edu/~pa336/;https://www.sreenivasgollapudi.com;https://research.google/people/KostasKollias/;http://people.eecs.berkeley.edu/~kush/", "dblp": "57/679;https://dblp.uni-trier.de/pers/g/Gollapudi:Sreenivas.html;79/1557;164/5807", "google_scholar": ";Ysd-WJgAAAAJ;u8bc3D8AAAAJ;X-Sd3-8AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Pranjal_Awasthi3;~Sreenivas_Gollapudi2;~Kostas_Kollias1;~kush_Bhatia2", "aff": "Rutgers University;Google;Google;University of California, Berkeley", "aff_domain": "rutgers.edu;google.com;google.com;berkeley.edu", "position": "Assistant Professor;Researcher;Research Scientist;PhD student", "bibtex": "@misc{\nawasthi2022congested,\ntitle={Congested bandits: Optimal routing via short-term resets},\nauthor={Pranjal Awasthi and Kush Bhatia and Sreenivas Gollapudi and Kostas Kollias},\nyear={2022},\nurl={https://openreview.net/forum?id=syzTg1vyBtL}\n}", "github": "", "project": "", "reviewers": "otHr;w1p9;9u2j;kNab", "site": "https://openreview.net/forum?id=syzTg1vyBtL", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "3;4;4;3", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "163;157;59;154", "wc_summary_review": "188;31;108;35", "wc_main_review": "118;173;824;222", "wc_review": "469;361;991;411", "wc_reply_reviewers": "0;0;0;90", "wc_reply_authors": "459;310;503;227", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], 
"empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 133.25, 42.990551287463155 ], "wc_summary_review_avg": [ 90.5, 64.0956316764255 ], "wc_main_review_avg": [ 334.25, 285.14064512096485 ], "wc_review_avg": [ 558.0, 252.89721232152797 ], "wc_reply_reviewers_avg": [ 22.5, 38.97114317029974 ], "wc_reply_authors_avg": [ 374.75, 111.32020256898565 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18422691580587375206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Rutgers University;Google;University of California, Berkeley", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.rutgers.edu;https://www.google.com;https://www.berkeley.edu", "aff_unique_abbr": "Rutgers;Google;UC Berkeley", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Mountain View;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "t14vYukzfvF", "title": "Unsupervised Visual Program Induction with Function Modularization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Program induction serves as one way to analog the ability of human thinking. However, existing methods could only tackle the task under simple scenarios (Fig~\\ref{fig:task_examples}(a),(b)). When it comes to complex scenes, e.g., the visual scenes, current program induction methods fail due to the huge program action space. In this paper, to the best of our knowledge, we are the first to tackle this problem. We propose a novel task named {\\it unsupervised visual program induction} in complex visual scenes that require complex primitive functions. Solving this task faces two challenges: i) modeling complex primitive functions for complex visual scenes is very difficult, and ii) employing complex functions in the unsupervised program induction suffers from a huge and heterogeneous program action space. To tackle these challenges, we propose the Self-Exploratory-Modularized-Function (SEMF) model, which can jointly model individual function selection and its parameters through a unified modular block. Moreover, a Monto-Carlo-Tree-Search (MCTS) based Self-Exploratory algorithm is proposed to explore program space with modularized function as prior. The exploratory results, in turn, guide the training of these modularized functions. 
Our extensive experiments demonstrate that the proposed SEFM model outperforms all the existing baselines in model performance, training efficiency, and model generalization ability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuguang Duan;Xin Wang;Ziwei Zhang;Wenwu Zhu", "authorids": "~Xuguang_Duan1;~Xin_Wang17;~Ziwei_Zhang1;~Wenwu_Zhu1", "gender": "M;M;;M", "homepage": ";http://mn.cs.tsinghua.edu.cn/xinwang/;;http://media.cs.tsinghua.edu.cn/en/zww", "dblp": "222/2730;10/5630-19;;97/6308-1.html", "google_scholar": "p2B9-xUAAAAJ;YPOBHYUAAAAJ;;https://scholar.google.com.tw/citations?user=7t2jzpgAAAAJ", "orcid": ";0000-0002-0351-2939;;0000-0003-2236-9290", "linkedin": ";;;", "or_profile": "~Xuguang_Duan1;~Xin_Wang17;~Ziwei_Zhang1;~Wenwu_Zhu1", "aff": "Tsinghua University;Tsinghua University;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;cs.tsinghua.edu.cn;;tsinghua.edu.cn", "position": "PHD;Assistant Professor;;Full Professor", "bibtex": "@misc{\nduan2022unsupervised,\ntitle={Unsupervised Visual Program Induction with Function Modularization},\nauthor={Xuguang Duan and Xin Wang and Ziwei Zhang and Wenwu Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=t14vYukzfvF}\n}", "github": "", "project": "", "reviewers": "3naL;fDWD;uW8s;rhyp", "site": "https://openreview.net/forum?id=t14vYukzfvF", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "4;5;4;3", "correctness": "1;2;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "57;167;104;114", "wc_summary_review": "67;39;42;41", "wc_main_review": "285;631;384;335", "wc_review": "409;837;530;490", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 110.5, 39.080046059338265 ], "wc_summary_review_avg": [ 47.25, 11.453711188955307 ], "wc_main_review_avg": [ 408.75, 133.00446421079258 ], "wc_review_avg": [ 566.5, 162.14268407794415 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.552344770738994, "corr_recommendation_correctness": 0.9813358399735743, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vKTOn_bFdmIJ:scholar.google.com/&scioq=Unsupervised+Visual+Program+Induction+with+Function+Modularization&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "t1QXzSGwr9", "title": "Image Compression and Classification Using Qubits and Quantum Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent work suggests that quantum machine learning techniques can be used for classical image classification by encoding the images in quantum states and using a quantum neural network for inference. 
However, such work has been restricted to very small input images, at most $4 \\times 4$, that are unrealistic and cannot even be accurately labeled by humans. The primary difficulties in using larger input images is that hitherto-proposed encoding schemes necessitate more qubits than are physically realizable. We propose a framework to classify larger, realistic images using quantum systems. Our approach relies on a novel encoding mechanism that embeds images in quantum states while necessitating fewer qubits than prior work. Our framework is able to classify images that are larger than previously possible, up to $16 \\times 16$ for the MNIST dataset on a personal laptop, and obtains accuracy comparable to classical neural networks with the same number of learnable parameters. We also propose a technique for further reducing the number of qubits needed to represent images that may result in an easier physical implementation at the expense of final performance. Our work enables quantum machine learning and classification on classical datasets of dimensions that were previously intractable by physically realizable quantum computers or classical simulation.\n", "keywords": "quantum machine learning;flexible representation of quantum images;quantum neural network", "primary_area": "", "supplementary_material": "/attachment/8992acbaaa5cc55dfa844a89bb02333c0c44a638.zip", "author": "Ali Mohsen;Mo Tiwari", "authorids": "ahm302@nyu.edu;~Mo_Tiwari1", "gender": ";", "homepage": ";http://www.motiwari.com/", "dblp": ";267/5421", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";motiwari", "or_profile": "ahm302@nyu.edu;~Mo_Tiwari1", "aff": ";Stanford University", "aff_domain": ";stanford.edu", "position": ";PhD student", "bibtex": "@misc{\nmohsen2022image,\ntitle={Image Compression and Classification Using Qubits and Quantum Deep Learning},\nauthor={Ali Mohsen and Mo Tiwari},\nyear={2022},\nurl={https://openreview.net/forum?id=t1QXzSGwr9}\n}", "github": "", "project": "", "reviewers": "N7sU;PiSA;MiCP;snts", "site": "https://openreview.net/forum?id=t1QXzSGwr9", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;5;4;2", "correctness": "3;4;4;2", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "58;38;64;29", "wc_summary_review": "42;16;60;33", "wc_main_review": "407;532;281;163", "wc_review": "507;586;405;225", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 47.25, 14.271912976192084 ], "wc_summary_review_avg": [ 37.75, 15.880412463157246 ], "wc_main_review_avg": [ 345.75, 137.86837019418195 ], "wc_review_avg": [ 430.75, 135.01180503941129 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": -0.4264014327112209, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9698920122666569591&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Stanford University", "aff_unique_dep": 
"", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "t2LJBsPxQM", "title": "Scaling-up Diverse Orthogonal Convolutional Networks by a Paraunitary Framework", "track": "main", "status": "Reject", "tldr": "", "abstract": "Enforcing orthogonality in neural networks is an antidote for gradient vanishing/exploding problems, sensitivity to adversarial perturbation, and bounding generalization errors. However, many previous approaches are heuristic, and the orthogonality of convolutional layers is not systematically studied. Some of these designs are not exactly orthogonal, while others only consider standard convolutional layers and propose specific classes of their realizations. We propose a theoretical framework for orthogonal convolutional layers to address this problem, establishing the equivalence between diverse orthogonal convolutional layers in the spatial domain and the paraunitary systems in the spectral domain. Since a complete factorization exists for paraunitary systems, any orthogonal convolution layer can be parameterized as convolutions of spatial filters. Our framework endows high expressive power to various convolutional layers while maintaining their exact orthogonality. Furthermore, our layers are memory and computationally efficient for deep networks compared to previous designs. Our versatile framework, for the first time, enables the study of architecture designs for deep orthogonal networks, such as choices of skip connection, initialization, stride, and dilation. Consequently, we scale up orthogonal networks to deep architectures, including ResNet and ShuffleNet, substantially increasing the performance over their shallower counterparts. 
Finally, we show how to construct residual flows, a flow-based generative model that requires strict Lipschitzness, using our orthogonal networks.", "keywords": "orthogonal convolutions;adversarial robustness;spectral analysis;paraunitary systems", "primary_area": "", "supplementary_material": "/attachment/c5aec5d873ae2d1a263ff14f4a538a2d811c15d0.zip", "author": "Jiahao Su;Wonmin Byeon;Furong Huang", "authorids": "~Jiahao_Su1;~Wonmin_Byeon1;~Furong_Huang1", "gender": "M;;F", "homepage": ";https://wonmin-byeon.github.io/;https://furong-huang.com", "dblp": ";40/10201;72/8513", "google_scholar": "z4AEqYkAAAAJ;0497CHoAAAAJ;13yyuCcAAAAJ", "orcid": ";;", "linkedin": "jiahaosu-umd/;;", "or_profile": "~Jiahao_Su1;~Wonmin_Byeon1;~Furong_Huang1", "aff": "University of Maryland, College Park;NVIDIA;University of Maryland", "aff_domain": "umd.edu;nvidia.com;cs.umd.edu", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@misc{\nsu2022scalingup,\ntitle={Scaling-up Diverse Orthogonal Convolutional Networks by a Paraunitary Framework},\nauthor={Jiahao Su and Wonmin Byeon and Furong Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=t2LJBsPxQM}\n}", "github": "", "project": "", "reviewers": "5fbR;EX1P;yYMs;rQN6", "site": "https://openreview.net/forum?id=t2LJBsPxQM", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;4;4;4", "correctness": "2;4;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "35;53;63;50", "wc_summary_review": "50;18;36;23", "wc_main_review": "573;353;233;331", "wc_review": "658;424;332;404", "wc_reply_reviewers": "654;0;0;144", "wc_reply_authors": "2608;948;638;607", "reply_reviewers": "2;0;0;2", "reply_authors": "5;3;2;3", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 50.25, 10.034316120194738 ], "wc_summary_review_avg": [ 31.75, 12.417225938187643 ], "wc_main_review_avg": [ 372.5, 124.26081441870562 ], "wc_review_avg": [ 454.5, 122.37136102863283 ], "wc_reply_reviewers_avg": [ 199.5, 268.9103010299159 ], "wc_reply_authors_avg": [ 1200.25, 823.6292779521622 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 3.25, 1.0897247358851685 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8892972917998875, "corr_recommendation_correctness": 0.8866206949335731, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3882925949659737411&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Maryland;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www/umd.edu;https://www.nvidia.com", "aff_unique_abbr": "UMD;NVIDIA", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "t2Mzgc9JEjZ", "title": "Certified Patch Robustness via Smoothed Vision Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Certified patch defenses can guarantee robustness of an image classifier to arbitrary changes within a bounded contiguous region. But, currently, this robustness comes at a cost of degraded standard accuracies and slower inference times. 
We demonstrate how using vision transformers enables significantly better certified patch robustness that is also more computationally efficient and does not incur a substantial drop in standard accuracy. These improvements stem from the inherent ability of the vision transformer to gracefully handle severely masked images.", "keywords": "adversarial robustness;patch defense;vision transformers;deep learning;computer vision;certified defense;adversarial example", "primary_area": "", "supplementary_material": "/attachment/7fb304a82e8b983f49dececb8c1e2b82cf1ed58b.zip", "author": "Hadi Salman;Saachi Jain;Eric Wong;Aleksander Madry", "authorids": "~Hadi_Salman1;~Saachi_Jain1;~Eric_Wong1;~Aleksander_Madry1", "gender": "M;F;M;M", "homepage": "https://hadisalman.com/;http://people.csail.mit.edu/saachij/;http://riceric22.github.io/;https://people.csail.mit.edu/madry/", "dblp": "192/3204;227/2617;64/1811-1.html;67/2454", "google_scholar": "Kr8JjF0AAAAJ;6hsn3EYAAAAJ;pWnTMRkAAAAJ;SupjsEUAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hadi_Salman1;~Saachi_Jain1;~Eric_Wong1;~Aleksander_Madry1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD Student;PhD student;Postdoc;Professor", "bibtex": "@misc{\nsalman2022certified,\ntitle={Certified Patch Robustness via Smoothed Vision Transformers},\nauthor={Hadi Salman and Saachi Jain and Eric Wong and Aleksander Madry},\nyear={2022},\nurl={https://openreview.net/forum?id=t2Mzgc9JEjZ}\n}", "github": "", "project": "", "reviewers": "WFJK;GVq9;nxJQ;bcr4", "site": "https://openreview.net/forum?id=t2Mzgc9JEjZ", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;3;4", "correctness": "4;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "92;39;46;30", "wc_summary_review": "32;56;12;30", "wc_main_review": "122;279;121;95", "wc_review": "246;374;179;155", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 51.75, 23.920441049445557 ], "wc_summary_review_avg": [ 32.5, 15.644487847162015 ], "wc_main_review_avg": [ 154.25, 72.83328565978607 ], "wc_review_avg": [ 238.5, 85.04263636553138 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8940759754720060702&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "t3BFUDHwEJU", "title": "Delayed Geometric Discounts: An alternative criterion for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The endeavor of 
artificial intelligence (AI) is to design autonomous agents capable of achieving complex tasks. In particular, reinforcement learning (RL) provides a theoretical framework for learning optimal behaviors. In practice, RL algorithms rely on geometric discounts to evaluate this optimality. Unfortunately, this does not cover decision processes where future returns are not exponentially less valuable.\nDepending on the problem, this limitation induces sample-inefficiency (as feedback is exponentially decayed) and requires additional curricula/exploration mechanisms (to deal with sparse, deceptive or adversarial rewards).\nIn this paper, we tackle these issues by generalizing the discounted problem formulation with a family of delayed objective functions. We investigate the underlying RL problem to derive: 1) the optimal stationary solution and 2) an approximation of the optimal non-stationary control. The devised algorithms solved hard exploration problems in tabular environments and improved sample-efficiency on classic simulated robotics benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/67a1fec92b36e423cd41df8f01cbb30b7d237c82.zip", "author": "Firas Jarboui;Ahmed Akakzia", "authorids": "~Firas_Jarboui1;~Ahmed_Akakzia1", "gender": "M;M", "homepage": ";", "dblp": ";", "google_scholar": ";U2CTuQUAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Firas_Jarboui1;~Ahmed_Akakzia1", "aff": ";ISIR, UMR 7222", "aff_domain": ";sorbonne-universite.fr", "position": ";PhD student", "bibtex": "@misc{\njarboui2022delayed,\ntitle={Delayed Geometric Discounts: An alternative criterion for Reinforcement Learning},\nauthor={Firas Jarboui and Ahmed Akakzia},\nyear={2022},\nurl={https://openreview.net/forum?id=t3BFUDHwEJU}\n}", "github": "", "project": "", "reviewers": "7SFq;hhP8;1aQD;tzWw", "site": "https://openreview.net/forum?id=t3BFUDHwEJU", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;1;3;2", "wc_summary_paper": "37;33;258;248", "wc_summary_review": "86;41;130;48", "wc_main_review": "917;530;1251;491", "wc_review": "1040;604;1639;787", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 144.0, 109.0664934798951 ], "wc_summary_review_avg": [ 76.25, 35.442735503908274 ], "wc_main_review_avg": [ 797.25, 310.4193735899871 ], "wc_review_avg": [ 1017.5, 390.79438327591146 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11769117362116005046&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Institut des Sciences de l'Ing\u00e9nierie de Robotique", "aff_unique_dep": "UMR 7222", "aff_unique_url": "https://www.isir.upmc.fr", "aff_unique_abbr": "ISIR", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "id": "t3E10H8UNz", "title": "Transferring Hierarchical 
Structure with Dual Meta Imitation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hierarchical Imitation Learning (HIL) is an effective way for robots to learn sub-skills from long-horizon unsegmented demonstrations. However, the learned hierarchical structure lacks a mechanism to transfer across multiple tasks or to new tasks, forcing robots to learn from scratch when facing a new situation. Transferring and reorganizing modular sub-skills requires fast adaptation of the whole hierarchical structure. In this work, we propose Dual Meta Imitation Learning (DMIL), a hierarchical meta imitation learning method where the high-level network and sub-skills are iteratively meta-learned with model-agnostic meta-learning. DMIL uses the likelihood of state-action pairs from each sub-skill as the supervision for the high-level network adaptation, and uses the adapted high-level network to determine a different data set for each sub-skill adaptation. We theoretically prove the convergence of the iterative training process of DMIL and establish the connection between DMIL and the Expectation-Maximization algorithm. Empirically, we achieve state-of-the-art few-shot imitation learning performance on the meta-world benchmark.", "keywords": "imitation learning;meta learning;hierarchical structure;robot learning", "primary_area": "", "supplementary_material": "/attachment/d0725b95c4c7000e694e70808b4f25dab098b79c.zip", "author": "Chongkai Gao;Yizhou Jiang;Feng Chen", "authorids": "~Chongkai_Gao1;~Yizhou_Jiang1;~Feng_Chen1", "gender": "M;M;M", "homepage": "https://chongkaigao.com/;;", "dblp": "295/8658;201/8247;21/3047-7", "google_scholar": "l_mOqY8AAAAJ;oM8ue_UAAAAJ;", "orcid": ";;0000-0003-4813-2494", "linkedin": ";;", "or_profile": "~Chongkai_Gao1;~Yizhou_Jiang1;~Feng_Chen1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;PhD student;Full Professor", "bibtex": "@misc{\ngao2022transferring,\ntitle={Transferring Hierarchical Structure with Dual Meta Imitation Learning},\nauthor={Chongkai Gao and Yizhou Jiang and Feng Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=t3E10H8UNz}\n}", "github": "", "project": "", "reviewers": "wfsx;ouLD;2AqY;ZyiJ", "site": "https://openreview.net/forum?id=t3E10H8UNz", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;2;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "69;71;58;79", "wc_summary_review": "103;48;53;56", "wc_main_review": "535;459;182;102", "wc_review": "707;578;293;237", "wc_reply_reviewers": "106;127;29;0", "wc_reply_authors": "1489;1736;1555;633", "reply_reviewers": "1;1;1;0", "reply_authors": "3;4;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.25, 7.495832175282475 ], "wc_summary_review_avg": [ 65.0, 22.124646889837585 ], "wc_main_review_avg": [ 319.5, 181.7367601780113 ], "wc_review_avg": [ 453.75, 195.18885085987878 ], "wc_reply_reviewers_avg": [ 65.5, 52.547597471245055 ], "wc_reply_authors_avg": [ 1353.25, 425.55515212484505 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.0897247358851685 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 
0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11619969074939056136&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "SOSP: Efficiently Capturing Global Correlations by Second-Order Structured Pruning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7189", "id": "t5EmXZ3ZLR", "poster": "", "openreview": "https://openreview.net/forum?id=t5EmXZ3ZLR", "slides": "https://iclr.cc/virtual/2022/poster/7189", "video": "https://iclr.cc/virtual/2022/poster/7189", "author_site": "Manuel Nonnenmacher, Thomas Pfeil, Ingo Steinwart, David Reeb", "tldr": "", "abstract": "Pruning neural networks reduces inference time and memory costs. On standard hardware, these benefits will be especially prominent if coarse-grained structures, like feature maps, are pruned. We devise two novel saliency-based methods for second-order structured pruning (SOSP) which include correlations among all structures and layers. Our main method SOSP-H employs an innovative second-order approximation, which enables saliency evaluations by fast Hessian-vector products. SOSP-H thereby scales like a first-order method despite taking into account the full Hessian. We validate SOSP-H by comparing it to our second method SOSP-I that uses a well-established Hessian approximation, and to numerous state-of-the-art methods. While SOSP-H performs on par or better in terms of accuracy, it has clear advantages in terms of scalability and efficiency. This allowed us to scale SOSP-H to large-scale vision tasks, even though it captures correlations across all layers of the network. To underscore the global nature of our pruning methods, we evaluate their performance not only by removing structures from a pretrained network, but also by detecting architectural bottlenecks. 
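The fast Hessian-vector products behind SOSP-H can be made concrete with a short sketch. The snippet below only illustrates the generic double-backpropagation HVP trick that such second-order saliencies build on; it is not the authors' SOSP code, and the tiny model, loss, and direction vector are placeholder assumptions.

```python
# Minimal sketch of a Hessian-vector product via double backprop in PyTorch.
# Scoring a structure with the full Hessian would cost O(d^2); an HVP needs
# only one extra backward pass, which is why SOSP-H scales like first order.
import torch

model = torch.nn.Linear(10, 2)                      # placeholder model
x, y = torch.randn(32, 10), torch.randint(0, 2, (32,))
loss = torch.nn.functional.cross_entropy(model(x), y)

params = list(model.parameters())
grads = torch.autograd.grad(loss, params, create_graph=True)

# Direction vector v (e.g., an indicator of the structure being scored).
v = [torch.randn_like(p) for p in params]

# H v = d/dtheta (g . v): differentiate the gradient-vector dot product.
dot = sum((g * vi).sum() for g, vi in zip(grads, v))
hvp = torch.autograd.grad(dot, params)
print([h.shape for h in hvp])
```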
We show that our algorithms allow us to systematically reveal architectural bottlenecks, which we then remove to further increase the accuracy of the networks.", "keywords": "Structured Pruning;Saliency-based Pruning;Network Compression;Hessian Approximation;Neural Architecture Search;Deep Learning;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Manuel Nonnenmacher;Thomas Pfeil;Ingo Steinwart;David Reeb", "authorids": "~Manuel_Nonnenmacher1;~Thomas_Pfeil1;~Ingo_Steinwart1;~David_Reeb2", "gender": "M;;M;M", "homepage": "https://www.isa.uni-stuttgart.de/institut/team/Nonnenmacher-00002/;;https://www.isa.uni-stuttgart.de/en/institute/team/Steinwart-00002/;https://www.bosch-ai.com/about-us/our-people/", "dblp": ";28/9749;89/3492;129/1561", "google_scholar": "O4_qGokAAAAJ;e5dO0q0AAAAJ;https://scholar.google.de/citations?user=zFuwHeAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-4436-7109;", "linkedin": ";;;", "or_profile": "~Manuel_Nonnenmacher1;~Thomas_Pfeil1;~Ingo_Steinwart1;~David_Reeb2", "aff": "Bosch;Bosch;University of Stuttgart;Robert Bosch GmbH, Bosch", "aff_domain": "bosch.com;bosch.com;uni-stuttgart.de;de.bosch.com", "position": "PhD student;Research Scientist;Full Professor;Research Scientist", "bibtex": "@inproceedings{\nnonnenmacher2022sosp,\ntitle={{SOSP}: Efficiently Capturing Global Correlations by Second-Order Structured Pruning},\nauthor={Manuel Nonnenmacher and Thomas Pfeil and Ingo Steinwart and David Reeb},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=t5EmXZ3ZLR}\n}", "github": "", "project": "", "reviewers": "q96Z;7bu1;6Vrj;3yXC", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;2;4;3", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "53;65;257;100", "wc_summary_review": "18;29;88;86", "wc_main_review": "176;282;400;265", "wc_review": "247;376;745;451", "wc_reply_reviewers": "0;0;50;0", "wc_reply_authors": "685;955;1022;605", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 118.75, 81.66509352226323 ], "wc_summary_review_avg": [ 55.25, 31.995116814914116 ], "wc_main_review_avg": [ 280.75, 79.75391839903541 ], "wc_review_avg": [ 454.75, 182.77086064249957 ], "wc_reply_reviewers_avg": [ 12.5, 21.650635094610966 ], "wc_reply_authors_avg": [ 816.75, 175.66783285507907 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7488688580406303390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=t5EmXZ3ZLR", "email": "bosch.com;bosch.com;uni-stuttgart.de;de.bosch.com", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Robert Bosch GmbH;University of Stuttgart", "aff_unique_dep": ";", "aff_unique_url": "https://www.bosch.com;https://www.uni-stuttgart.de", "aff_unique_abbr": "Bosch;USTuttgart", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0",
"aff_country_unique": "Germany" }, { "title": "Conditioning Sequence-to-sequence Networks with Learned Activations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5916", "id": "t5s-hd1bqLk", "poster": "", "openreview": "https://openreview.net/forum?id=t5s-hd1bqLk", "slides": "https://iclr.cc/virtual/2022/poster/5916", "video": "https://iclr.cc/virtual/2022/poster/5916", "author_site": "Alberto Gil Couto Pimentel Ramos, Abhinav Mehrotra, Nicholas Lane, Sourav Bhattacharya", "tldr": "", "abstract": "Conditional neural networks play an important role in a number of sequence-to-sequence modeling tasks, including personalized sound enhancement (PSE), speaker dependent automatic speech recognition (ASR), and generative modeling such as text-to-speech synthesis. In conditional neural networks, the output of a model is often influenced by a conditioning vector, in addition to the input. Common approaches of conditioning include input concatenation or modulation with the conditioning vector, which comes at a cost of increased model size. In this work, we introduce a novel approach of neural network conditioning by learning intermediate layer activations based on the conditioning vector. We systematically explore and show that learned activation functions can produce conditional models with comparable or better quality, while decreasing model sizes, thus making them ideal candidates for resource-efficient on-device deployment. As exemplary target use-cases we consider (i) the task of PSE as a pre-processing technique for improving telephony or pre-trained ASR performance under noise, and (ii) personalized ASR in single speaker scenarios. We find that conditioning via activation function learning is an effective modeling strategy, suggesting a broad applicability of the proposed technique across a number of application domains.", "keywords": "Conditional Neural Networks;Sound Enhancement;Personalized ASR", "primary_area": "", "supplementary_material": "/attachment/9f037e606072f0946fa374e369cc3154d38e33f3.zip", "author": "Alberto Gil Couto Pimentel Ramos;Abhinav Mehrotra;Nicholas Donald Lane;Sourav Bhattacharya", "authorids": "~Alberto_Gil_Couto_Pimentel_Ramos1;~Abhinav_Mehrotra1;~Nicholas_Donald_Lane1;~Sourav_Bhattacharya1", "gender": "M;M;M;M", "homepage": ";https://abhinavmehrotra.github.io/;;http://niclane.org", "dblp": ";154/4273;69/3637;03/2663.html", "google_scholar": ";https://scholar.google.co.uk/citations?user=AbeyFKwAAAAJ;EU-ESvsAAAAJ;https://scholar.google.co.uk/citations?hl=en", "orcid": ";;;0000-0002-2728-8273", "linkedin": "albertogilramos;;;niclane", "or_profile": "~Alberto_Gil_Couto_Pimentel_Ramos1;~Abhinav_Mehrotra1;~Sourav_Bhattacharya1;~Nic_Lane2", "aff": "Samsung;Samsung AI Center;Samsung AI Center;Samsung", "aff_domain": "samsung.com;samsung.com;samsung.com;samsung.com", "position": "Researcher;Researcher;Principal Researcher;Laboratory Director", "bibtex": "@inproceedings{\nramos2022conditioning,\ntitle={Conditioning Sequence-to-sequence Networks with Learned Activations},\nauthor={Alberto Gil Couto Pimentel Ramos and Abhinav Mehrotra and Nicholas Donald Lane and Sourav Bhattacharya},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=t5s-hd1bqLk}\n}", "github": "", "project": "", "reviewers": "dsMi;QdFv;gotN", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;3", "correctness": "4;3;3", "technical_novelty": "3;3;4", "empirical_novelty": "2;3;3", "wc_summary_paper": 
"120;93;140", "wc_summary_review": "35;84;14", "wc_main_review": "178;240;492", "wc_review": "333;417;646", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "515;784;855", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 117.66666666666667, 19.258475767539053 ], "wc_summary_review_avg": [ 44.333333333333336, 29.32954520994525 ], "wc_main_review_avg": [ 303.3333333333333, 135.7874155517448 ], "wc_review_avg": [ 465.3333333333333, 132.27328612468287 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 718.0, 146.43997632704898 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11832884194752972303&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=t5s-hd1bqLk", "email": "samsung.com;samsung.com;samsung.com;samsung.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung", "aff_unique_url": "https://www.samsung.com", "aff_unique_abbr": "Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "t7y6MKiyiWx", "title": "Classical and Quantum Algorithms for Orthogonal Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Orthogonal neural networks have recently been introduced as a new type of neural network imposing orthogonality on the weight matrices. They could achieve higher accuracy and avoid evanescent or explosive gradients for deep architectures. Several classical gradient descent methods have been proposed to preserve orthogonality while updating the weight matrices, but these techniques suffer from long running times and/or provide only approximate orthogonality. In this paper, we introduce a new type of neural network layer called Pyramidal Circuit, which implements an orthogonal matrix multiplication. It allows for gradient descent with perfect orthogonality with the same asymptotic running time as a standard fully connected layer. This algorithm is inspired by quantum computing and can therefore be applied on a classical computer as well as on a near term quantum computer. 
It could become the building block for quantum neural networks and faster orthogonal neural networks.", "keywords": "orthogonal neural networks;orthogonality;quantum computing;quantum machine learning;quantum deep learning;complexity;quantum computer", "primary_area": "", "supplementary_material": "", "author": "Jonas Landman;Natansh Mathur;Iordanis Kerenidis", "authorids": "~Jonas_Landman1;~Natansh_Mathur1;~Iordanis_Kerenidis1", "gender": ";M;", "homepage": ";;https://www.irif.fr/~jkeren/", "dblp": "https://dblp.uni-trier.de/pers/hd/l/Landman:Jonas;;https://dblp.uni-trier.de/pers/k/Kerenidis:Iordanis.html", "google_scholar": "itqTkSwAAAAJ;;https://scholar.google.fr/citations?hl=en", "orcid": "0000-0002-2039-5308;;", "linkedin": ";natanshmathur/;", "or_profile": "~Jonas_Landman1;~Natansh_Mathur1;~Iordanis_Kerenidis1", "aff": "University of Edinburgh, University of Edinburgh;Universit\u00e9 de Paris;Universit\u00e9 Paris Diderot", "aff_domain": "ed.ac.uk;u-paris.fr;univ-paris-diderot.fr", "position": "Postdoc;MS student;Full Professor", "bibtex": "@misc{\nlandman2022classical,\ntitle={Classical and Quantum Algorithms for Orthogonal Neural Networks},\nauthor={Jonas Landman and Natansh Mathur and Iordanis Kerenidis},\nyear={2022},\nurl={https://openreview.net/forum?id=t7y6MKiyiWx}\n}", "github": "", "project": "", "reviewers": "h1gX;ji2w;HBaw;oStG", "site": "https://openreview.net/forum?id=t7y6MKiyiWx", "pdf_size": 0, "recommendation": "1;5;6;6", "confidence": "5;2;2;4", "correctness": "3;2;4;4", "technical_novelty": "1;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "43;103;83;81", "wc_summary_review": "22;99;43;34", "wc_main_review": "516;713;78;191", "wc_review": "581;915;204;306", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 2.0615528128088303 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 77.5, 21.696773953747133 ], "wc_summary_review_avg": [ 49.5, 29.53387885124472 ], "wc_main_review_avg": [ 374.5, 253.07558159569643 ], "wc_review_avg": [ 501.5, 275.69412398526015 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.3656362120635653, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2374241776979629934&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Edinburgh;Universit\u00e9 de Paris;Universit\u00e9 Paris Diderot", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ed.ac.uk;https://www.universitedeparis.fr;https://www.univ-paris-diderot.fr", "aff_unique_abbr": "Edinburgh;UP;UPD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;France" }, { "title": "Learning Optimal Conformal Classifiers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6836", "id": "t8O-4LKFVx", "poster": "", "openreview": "https://openreview.net/forum?id=t8O-4LKFVx", "slides": "https://iclr.cc/virtual/2022/poster/6836", "video": "https://iclr.cc/virtual/2022/poster/6836", "author_site": "David 
Stutz, Krishnamurthy Dvijotham, Ali Taylan Cemgil, Arnaud Doucet", "tldr": "", "abstract": "Modern deep learning based classifiers show very high accuracy on test data but this does not provide sufficient guarantees for safe deployment, especially in high-stakes AI applications such as medical diagnosis. Usually, predictions are obtained without a reliable uncertainty estimate or a formal guarantee. Conformal prediction (CP) addresses these issues by using the classifier's predictions, e.g., its probability estimates, to predict confidence sets containing the true class with a user-specified probability. However, using CP as a separate processing step after training prevents the underlying model from adapting to the prediction of confidence sets. Thus, this paper explores strategies to differentiate through CP during training with the goal of training the model with the conformal wrapper end-to-end. In our approach, conformal training (ConfTr), we specifically \"simulate\" conformalization on mini-batches during training. Compared to standard training, ConfTr reduces the average confidence set size (inefficiency) of state-of-the-art CP methods applied after training. Moreover, it allows us to \"shape\" the confidence sets predicted at test time, which is difficult for standard CP. In experiments on several datasets, we show ConfTr can influence how inefficiency is distributed across classes, or guide the composition of confidence sets in terms of the included classes, while retaining the guarantees offered by CP.", "keywords": "conformal prediction;conformal classification;uncertainty estimation", "primary_area": "", "supplementary_material": "", "author": "David Stutz;Krishnamurthy Dj Dvijotham;Ali Taylan Cemgil;Arnaud Doucet", "authorids": "~David_Stutz1;~Krishnamurthy_Dj_Dvijotham1;~Ali_Taylan_Cemgil2;~Arnaud_Doucet2", "gender": "M;;M;M", "homepage": "http://davidstutz.de/;https://www.stats.ox.ac.uk/~doucet/;https://www.cmpe.boun.edu.tr/~cemgil/;http://dvij.github.io", "dblp": "17/9394;68/1628;41/6613;16/8758", "google_scholar": "TxEy3cwAAAAJ;W4SZGV8AAAAJ;X3ZFZ7AAAAAJ;BUtloecAAAAJ", "orcid": ";0000-0002-7662-419X;http://orcid.org/0000-0003-4463-8455;", "linkedin": "davidstutz92/;;;", "or_profile": "~David_Stutz1;~Arnaud_Doucet2;~ali_taylan_cemgil1;~Krishnamurthy_Dvijotham2", "aff": "Saarland Informatics Campus, Max-Planck Institute;University of Oxford;Bogazici University;Google Brain", "aff_domain": "mpi-inf.mpg.de;ox.ac.uk;boun.edu.tr;google.com", "position": "PhD student;Full Professor;Full Professor;research scientist ", "bibtex": "@inproceedings{\nstutz2022learning,\ntitle={Learning Optimal Conformal Classifiers},\nauthor={David Stutz and Krishnamurthy Dj Dvijotham and Ali Taylan Cemgil and Arnaud Doucet},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=t8O-4LKFVx}\n}", "github": "", "project": "", "reviewers": "1Awq;uxwS;8BvY;iXzv", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "3;3;3;4", "correctness": "3;2;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;0", "wc_summary_paper": "81;96;155;99", "wc_summary_review": "25;140;35;26", "wc_main_review": "173;593;289;262", "wc_review": "279;829;479;387", "wc_reply_reviewers": "249;15;29;63", "wc_reply_authors": "907;381;119;194", "reply_reviewers": "1;1;1;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ],
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 107.75, 28.119166061602893 ], "wc_summary_review_avg": [ 56.5, 48.36579369761237 ], "wc_main_review_avg": [ 329.25, 158.20931546530375 ], "wc_review_avg": [ 493.5, 206.2298475003073 ], "wc_reply_reviewers_avg": [ 89.0, 94.01063769595439 ], "wc_reply_authors_avg": [ 400.25, 307.73639287546087 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5366968417529245684&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=t8O-4LKFVx", "email": "mpi-inf.mpg.de;ox.ac.uk;boun.edu.tr;google.com", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Max-Planck Institute;University of Oxford;Bogazici University;Google", "aff_unique_dep": "Informatics;;;Google Brain", "aff_unique_url": "https://www.mpi-sws.org;https://www.ox.ac.uk;https://www.boun.edu.tr;https://brain.google.com", "aff_unique_abbr": "MPI-SWS;Oxford;BU;Google Brain", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Saarland;;Mountain View", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Germany;United Kingdom;T\u00fcrkiye;United States" }, { "title": "Optimal Transport for Long-Tailed Recognition with Learnable Cost Matrix", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6454", "id": "t98k9ePQQpn", "poster": "", "openreview": "https://openreview.net/forum?id=t98k9ePQQpn", "slides": "https://iclr.cc/virtual/2022/poster/6454", "video": "https://iclr.cc/virtual/2022/poster/6454", "author_site": "Hanyu Peng, Mingming Sun, Ping Li", "tldr": "", "abstract": "It is attracting attention to the long-tailed recognition problem, a burning issue that has become very popular recently. Distinctive from conventional recognition is that it posits that the allocation of the training set is supremely distorted. Predictably, it will pose challenges to the generalisation behaviour of the model. Approaches to these challenges revolve into two groups: firstly, training-aware methods, with the aim of enhancing the generalisability of the model by exploiting its potential in the training period; and secondly, post-hoc correction, liberally coupled with training-aware methods, which is intended to refine the predictions to the extent possible in the post-processing stage, offering the advantages of simplicity and effectiveness. This paper introduces an alternative direction to do the post-hoc correction, which goes beyond the statistical methods. Mathematically, we approach this issue from the perspective of optimal transport (OT), yet, choosing the exact cost matrix when applying OT is challenging and requires expert knowledge of various tasks. To overcome this limitation, we propose to employ linear mapping to learn the cost matrix without necessary configurations adaptively. 
In practice, our method combines high efficiency with excellent performance, surpassing all previous methods and achieving the best results to date.", "keywords": "Long-tailed recognition;imbalanced classification;optimal transport", "primary_area": "", "supplementary_material": "", "author": "Hanyu Peng;Mingming Sun;Ping Li", "authorids": "~Hanyu_Peng1;~Mingming_Sun1;~Ping_Li3", "gender": ";M;M", "homepage": ";;http://www.stat.rutgers.edu/home/pingli/", "dblp": ";87/8665-1.html;62/5860-1", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hanyu_Peng1;~Mingming_Sun1;~Ping_Li3", "aff": ";Baidu;LinkedIn", "aff_domain": ";baidu.com;linkedin.com", "position": ";Principal Researcher;Engineer", "bibtex": "@inproceedings{\npeng2022optimal,\ntitle={Optimal Transport for Long-Tailed Recognition with Learnable Cost Matrix},\nauthor={Hanyu Peng and Mingming Sun and Ping Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=t98k9ePQQpn}\n}", "github": "", "project": "", "reviewers": "uP2E;vjwX;fcxC", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;3;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "23;79;53", "wc_summary_review": "103;104;79", "wc_main_review": "405;402;209", "wc_review": "531;585;341", "wc_reply_reviewers": "457;205;0", "wc_reply_authors": "922;1594;361", "reply_reviewers": "2;1;0", "reply_authors": "3;4;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 51.666666666666664, 22.88133640230735 ], "wc_summary_review_avg": [ 95.33333333333333, 11.55662388223981 ], "wc_main_review_avg": [ 338.6666666666667, 91.69635882750318 ], "wc_review_avg": [ 485.6666666666667, 104.64330949574473 ], "wc_reply_reviewers_avg": [ 220.66666666666666, 186.89807085384507 ], "wc_reply_authors_avg": [ 959.0, 504.04960073389606 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14373516685114811969&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=t98k9ePQQpn", "email": ";baidu.com;linkedin.com", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu, Inc.;", "aff_unique_url": "https://www.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Feature Kernel Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7194", "id": "tBIQEvApZK5", "poster": "", "openreview": "https://openreview.net/forum?id=tBIQEvApZK5", "slides": "https://iclr.cc/virtual/2022/poster/7194", "video": "https://iclr.cc/virtual/2022/poster/7194", "author_site": "Bobby He, Mete Ozay", "tldr": "", "abstract": "Trained Neural Networks (NNs) can be viewed as data-dependent kernel machines, with predictions determined by the inner product of last-layer representations
across inputs, referred to as the feature kernel. We explore the relevance of the feature kernel for Knowledge Distillation (KD), using a mechanistic understanding of an NN\u2019s optimisation process. We extend the theoretical analysis of Allen-Zhu & Li (2020) to show that a trained NN\u2019s feature kernel is highly dependent on its parameter initialisation, which biases different initialisations of the same architecture to learn different data attributes in a multi-view data setting. This enables us to prove that KD using only pairwise feature kernel comparisons can improve NN test accuracy in such settings, with both single & ensemble teacher models, whereas standard training without KD fails to generalise. We further use our theory to motivate practical considerations for improving student generalisation when using distillation with feature kernels, which allows us to propose a novel approach: Feature Kernel Distillation (FKD). Finally, we experimentally corroborate our theory in the image classification setting, showing that FKD is amenable to ensemble distillation, can transfer knowledge across datasets, and outperforms both vanilla KD & other feature kernel based KD baselines across a range of standard architectures & datasets.", "keywords": "Knowledge distillation;Neural Network (NN) Feature learning;ensembling NNs;Deep learning fundamentals;Image classification", "primary_area": "", "supplementary_material": "", "author": "Bobby He;Mete Ozay", "authorids": "~Bobby_He1;~Mete_Ozay1", "gender": ";M", "homepage": "http://csml.stats.ox.ac.uk/people/he/;https://deepai.org/profile/mete-ozay", "dblp": "270/3685;26/5515", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Bobby_He1;~Mete_Ozay1", "aff": "University of Oxford;Samsung Research", "aff_domain": "ox.ac.uk;samsung.com", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\nhe2022feature,\ntitle={Feature Kernel Distillation},\nauthor={Bobby He and Mete Ozay},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tBIQEvApZK5}\n}", "github": "", "project": "", "reviewers": "h8ud;68WG;DKJu;NGSC", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;3", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "77;213;121;57", "wc_summary_review": "138;46;65;18", "wc_main_review": "508;109;291;94", "wc_review": "723;368;477;169", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1645;562;1905;364", "reply_reviewers": "0;0;0;0", "reply_authors": "3;1;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 117.0, 60.06662967072483 ], "wc_summary_review_avg": [ 66.75, 44.40368790990226 ], "wc_main_review_avg": [ 250.5, 167.6760269090367 ], "wc_review_avg": [ 434.25, 199.96921638092198 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1119.0, 666.0979657677991 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15393088714001021920&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": 
"https://openreview.net/pdf?id=tBIQEvApZK5", "email": "ox.ac.uk;samsung.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Oxford;Samsung", "aff_unique_dep": ";Samsung Research", "aff_unique_url": "https://www.ox.ac.uk;https://research.samsung.com", "aff_unique_abbr": "Oxford;Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;South Korea" }, { "id": "tBoSm4hUWV", "title": "WaveMix: Multi-Resolution Token Mixing for Images", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Even though vision transformers (ViTs) have provided state-of-the-art results on image classification, their requirements of large data, model size, and GPU usage have put them out of reach of most practitioners of computer vision. We present WaveMix as an alternative to self-attention mechanisms in ViT and convolutional neural networks to significantly reduce computational costs and memory footprint without compromising on image classification accuracy. WaveMix uses a multi-level two-dimensional discrete wavelet transform for mixing tokens and aggregating multi-resolution pixel information over long distances, which gives it the following advantages. Firstly, unlike the self-attention mechanism of ViT, WaveMix does not unroll the image. Thus, it has the right inductive bias to utilize the 2-D structure of an image, which reduces the demand for large training data. Additionally, the quadratic complexity with respect to sequence length is also eliminated. Secondly, due to its multi-resolution token-mixing, WaveMix also requires much fewer layers than a CNN does for comparable accuracy. Preliminary results from our experiments on supervised learning using CIFAR-10 dataset show that a four-layer WaveMix model can be 37% more accurate than a ViT with a comparable number of parameters, while consuming only 3% of the latter's GPU RAM and memory. This model also performs better than efficient transformers and models not based on attention, such as, FNet, and MLP Mixer. Scaling up the WaveMix model to achieve a top-1 accuracy of over 85% on CIFAR-10 could be done on a 16 GB GPU, while consuming only 6% of the GPU RAM used by the largest ViT which could fit in that GPU. 
Our work suggests that research on model structures that exploit the right inductive bias is far from over, and that such models can enable the training of computer vision models in settings with limited GPU resources.", "keywords": "image classification;computer vision;wavelet transform;resource efficient", "primary_area": "", "supplementary_material": "", "author": "Pranav Jeevan P;Amit Sethi", "authorids": "~Pranav_Jeevan_P1;~Amit_Sethi2", "gender": "M;M", "homepage": "https://pranavphoenix.github.io/;https://www.ee.iitb.ac.in/~asethi/", "dblp": "296/3727;", "google_scholar": "3GlJQ24AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-4110-9638;0000-0002-8634-1804", "linkedin": ";amit-sethi-590b2a3/", "or_profile": "~Pranav_Jeevan_P1;~Amit_Sethi2", "aff": "Indian Institute of Technology Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology;University of Illinois, Chicago", "aff_domain": "iitb.ac.in;uic.edu", "position": "PhD student;Adjunct Instructor", "bibtex": "@misc{\np2022wavemix,\ntitle={WaveMix: Multi-Resolution Token Mixing for Images},\nauthor={Pranav Jeevan P and Amit Sethi},\nyear={2022},\nurl={https://openreview.net/forum?id=tBoSm4hUWV}\n}", "github": "", "project": "", "reviewers": "wEp1;Pgqa;BKLA;WLDh", "site": "https://openreview.net/forum?id=tBoSm4hUWV", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;5;5", "correctness": "2;2;2;1", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "52;140;48;54", "wc_summary_review": "76;68;59;16", "wc_main_review": "155;469;418;186", "wc_review": "283;677;525;256", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "correctness_avg": [ 1.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 73.5, 38.4545185901475 ], "wc_summary_review_avg": [ 54.75, 23.166516786085904 ], "wc_main_review_avg": [ 307.0, 138.12132348048218 ], "wc_review_avg": [ 435.25, 174.5055514876246 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14329881027876082707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Indian Institute of Technology Bombay;University of Illinois at Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitb.ac.in;https://www.uic.edu", "aff_unique_abbr": "IIT Bombay;UIC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Bombay;Chicago", "aff_country_unique_index": "0;1", "aff_country_unique": "India;United States" }, { "title": "Evaluation Metrics for Graph Generative Models: Problems, Pitfalls, and Practical Solutions", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7082", "id": "tBtoZYKd9n", "poster": "", "openreview": "https://openreview.net/forum?id=tBtoZYKd9n", "slides": "https://iclr.cc/virtual/2022/poster/7082", "video": "https://iclr.cc/virtual/2022/poster/7082", "author_site": "Leslie O'Bray, Max Horn, Bastian Rieck, Karsten Borgwardt", "tldr": "", "abstract": "Graph generative models are a highly active branch of 
machine learning. Given the steady development of new models of ever-increasing complexity, it is necessary to provide a principled way to evaluate and compare them. In this paper, we enumerate the desirable criteria for such a comparison metric and provide an overview of the status quo of graph generative model comparison in use today, which predominantly relies on the maximum mean discrepancy (MMD). We perform a systematic evaluation of MMD in the context of graph generative model comparison, highlighting some of the challenges and pitfalls researchers inadvertently may encounter. After conducting a thorough analysis of the behaviour of MMD on synthetically-generated perturbed graphs as well as on recently-proposed graph generative models, we are able to provide a suitable procedure to mitigate these challenges and pitfalls. We aggregate our findings into a list of practical recommendations for researchers to use when evaluating graph generative models.", "keywords": "graph generative models;model evaluation", "primary_area": "", "supplementary_material": "", "author": "Leslie O'Bray;Max Horn;Bastian Rieck;Karsten Borgwardt", "authorids": "~Leslie_O'Bray1;~Max_Horn1;~Bastian_Rieck1;~Karsten_Borgwardt2", "gender": ";M;M;", "homepage": "https://www.leslieobray.com;https://expectationmax.github.io;https://bastian.rieck.me;https://www.biochem.mpg.de/borgwardt", "dblp": ";https://dblp.uni-trier.de/pers/hd/h/Horn:Max;119/8860;11/3733.html", "google_scholar": ";60cGPvIAAAAJ;https://scholar.google.ch/citations?user=La7zuKQAAAAJ;v3JsjMYAAAAJ", "orcid": ";0000-0002-8269-9948;0000-0003-4335-0302;0000-0001-7221-2393", "linkedin": "leslie-o-bray-722574a7/;;br-ml/;", "or_profile": "~Leslie_O'Bray1;~Max_Horn1;~Bastian_Rieck1;~Karsten_Borgwardt2", "aff": "Swiss Federal Institute of Technology;Amazon Development Center Germany;Helmholtz Zentrum M\u00fcnchen;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;amazon.de;helmholtz-munich.de;ethz.ch", "position": "PhD student;Researcher;Principal Investigator;Full Professor", "bibtex": "@inproceedings{\no'bray2022evaluation,\ntitle={Evaluation Metrics for Graph Generative Models: Problems, Pitfalls, and Practical Solutions},\nauthor={Leslie O'Bray and Max Horn and Bastian Rieck and Karsten Borgwardt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tBtoZYKd9n}\n}", "github": "", "project": "", "reviewers": "3VAJ;7gMp;d5fz;37EN", "pdf_size": 0, "recommendation": "5;8;8;8", "confidence": "3;4;4;5", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "81;38;79;90", "wc_summary_review": "74;90;158;43", "wc_main_review": "575;224;277;406", "wc_review": "730;352;514;539", "wc_reply_reviewers": "0;59;0;0", "wc_reply_authors": "803;561;499;814", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 7.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.0, 20.062402647738878 ], "wc_summary_review_avg": [ 91.25, 42.07953778263255 ], "wc_main_review_avg": [ 370.5, 135.35601205709335 ], "wc_review_avg": [ 533.75, 134.13123238082918 ], "wc_reply_reviewers_avg": [ 14.75, 25.54774941164094 ], "wc_reply_authors_avg": [ 669.25, 141.01839419026157 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], 
"replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2895320049488779805&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=tBtoZYKd9n", "email": "ethz.ch;amazon.de;helmholtz-munich.de;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Swiss Federal Institute of Technology;Amazon;Helmholtz Zentrum M\u00fcnchen;ETH Zurich", "aff_unique_dep": ";Development Center;;", "aff_unique_url": "https://www.ethz.ch;https://www.amazon.de;https://www.helmholtz-muenchen.de;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;Amazon;;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Switzerland;Germany" }, { "id": "tCx6AefvuPf", "title": "Node-Level Differentially Private Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNNs) are a popular technique for modelling graph-structured data that compute node-level predictions via aggregation of information from the local neighborhood of each node. However, this aggregation implies increased risk of revealing sensitive information, as a node can participate in the inference for multiple nodes. This implies that standard privacy preserving machine learning techniques like differentially private stochastic gradient descent (DP-SGD) \u2013 which are designed for situations where each node/data point participate in inference of one point only \u2013 either do not apply or lead to inaccurate solutions. In this work, we formally define the problem of learning 1-layer GNNs with node-level privacy, and provide a method for the problem with a strong differential privacy guarantee. Even though each node can be involved in the inference for multiple nodes, by employing a careful sensitivity analysis and a non-trivial extension of the privacy-by-amplification technique, our method is able to provide accurate solutions with solid privacy parameters. 
Empirical evaluation on standard benchmarks demonstrates that our method is indeed able to learn accurate privacy preserving GNNs, while still outperforming standard non-private methods that completely ignore graph information.", "keywords": "differential privacy;graph neural networks;node-level privacy", "primary_area": "", "supplementary_material": "", "author": "Ameya Daigavane;Gagan Madan;Aditya Sinha;Abhradeep Guha Thakurta;Gaurav Aggarwal;Prateek Jain", "authorids": "~Ameya_Daigavane1;~Gagan_Madan1;~Aditya_Sinha1;~Abhradeep_Guha_Thakurta1;~Gaurav_Aggarwal4;~Prateek_Jain1", "gender": ";M;M;M;;M", "homepage": "https://ameya98.github.io/;;https://adityaasinha28.github.io/;https://athakurta.squarespace.com/;;http://prateekjain.org", "dblp": ";177/8934.html;;31/8315;14/5218;https://dblp.uni-trier.de/pers/j/Jain_0002:Prateek.html", "google_scholar": "2hegRsIAAAAJ;_61mGn8AAAAJ;5letoXIAAAAJ;1rV69hMAAAAJ;https://scholar.google.co.in/citations?user=9XiIwDQAAAAJ;qYhRbJoAAAAJ", "orcid": "0000-0002-5116-3075;;;;;", "linkedin": ";;adityaasinha28/;;;", "or_profile": "~Ameya_Daigavane1;~Gagan_Madan1;~Aditya_Sinha1;~Abhradeep_Guha_Thakurta1;~Gaurav_Aggarwal4;~Prateek_Jain1", "aff": "Google Research;Google;Department of Computer Science;Google;Google;Google", "aff_domain": "google.com;google.com;cs.illinois.edu;google.com;google.com;google.com", "position": "Researcher;Researcher;MS student;Senior Research Scientist;Researcher;Researcher", "bibtex": "@misc{\ndaigavane2022nodelevel,\ntitle={Node-Level Differentially Private Graph Neural Networks},\nauthor={Ameya Daigavane and Gagan Madan and Aditya Sinha and Abhradeep Guha Thakurta and Gaurav Aggarwal and Prateek Jain},\nyear={2022},\nurl={https://openreview.net/forum?id=tCx6AefvuPf}\n}", "github": "", "project": "", "reviewers": "Xcpu;Uu9v;upoR;NdP7", "site": "https://openreview.net/forum?id=tCx6AefvuPf", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;3;4;3", "correctness": "3;4;1;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "46;55;100;60", "wc_summary_review": "31;106;85;24", "wc_main_review": "782;452;289;232", "wc_review": "859;613;474;316", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "998;831;763;635", "reply_reviewers": "0;0;0;0", "reply_authors": "3;2;2;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 1.224744871391589 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 65.25, 20.680606857633556 ], "wc_summary_review_avg": [ 61.5, 34.88911004883902 ], "wc_main_review_avg": [ 438.75, 213.9899238282027 ], "wc_review_avg": [ 565.5, 199.38718614795687 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 806.75, 130.93581442829154 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": -0.15713484026367722, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10231790179534332853&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Google;Unknown Institution", "aff_unique_dep": "Google Research;Department of Computer Science", "aff_unique_url": "https://research.google;", "aff_unique_abbr": "Google Research;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": 
"Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "Improved deterministic l2 robustness on CIFAR-10 and CIFAR-100", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6980", "id": "tD7eCtaSkR", "poster": "", "openreview": "https://openreview.net/forum?id=tD7eCtaSkR", "slides": "https://iclr.cc/virtual/2022/poster/6980", "video": "https://iclr.cc/virtual/2022/poster/6980", "author_site": "Sahil Singla, Surbhi Singla, Soheil Feizi", "tldr": "", "abstract": "Training convolutional neural networks (CNNs) with a strict Lipschitz constraint under the $l_{2}$ norm is useful for provable adversarial robustness, interpretable gradients and stable training. While $1$-Lipschitz CNNs can be designed by enforcing a $1$-Lipschitz constraint on each layer, training such networks requires each layer to have an orthogonal Jacobian matrix (for all inputs) to prevent the gradients from vanishing during backpropagation. A layer with this property is said to be Gradient Norm Preserving (GNP). In this work, we introduce a procedure to certify the robustness of $1$-Lipschitz CNNs by relaxing the orthogonalization of the last linear layer of the network that significantly advances the state of the art for both standard and provable robust accuracies on CIFAR-100 (gains of $4.80\\%$ and $4.71\\%$, respectively). We further boost their robustness by introducing (i) a novel Gradient Norm preserving activation function called the Householder activation function (that includes every $\\mathrm{GroupSort}$ activation) and (ii) a certificate regularization. On CIFAR-10, we achieve significant improvements over prior works in provable robust accuracy ($5.81\\%$) with only a minor drop in standard accuracy ($-0.29\\%$). Code for reproducing all experiments in the paper is available at \\url{https://github.com/singlasahil14/SOC}. 
", "keywords": "provable robustness;adversarial examples", "primary_area": "", "supplementary_material": "/attachment/efab4bcac7d8bba1bcf55e5b8e68313452e72a77.zip", "author": "Sahil Singla;Surbhi Singla;Soheil Feizi", "authorids": "~Sahil_Singla1;~Surbhi_Singla1;~Soheil_Feizi2", "gender": "M;F;M", "homepage": "https://singlasahil14.github.io/;;https://www.cs.umd.edu/~sfeizi/", "dblp": "55/8911-2;;57/2132", "google_scholar": "jjjbOI4AAAAJ;;lptAmrMAAAAJ", "orcid": ";;", "linkedin": ";surbhi-singla-a062b0110/;", "or_profile": "~Sahil_Singla1;~Surbhi_Singla1;~Soheil_Feizi2", "aff": "University of Maryland, College Park;;University of Maryland, College Park", "aff_domain": "umd.edu;;umd.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nsingla2022improved,\ntitle={Improved deterministic l2 robustness on {CIFAR}-10 and {CIFAR}-100},\nauthor={Sahil Singla and Surbhi Singla and Soheil Feizi},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tD7eCtaSkR}\n}", "github": "", "project": "", "reviewers": "zg69;FsEx;dTST;imTR", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;4;2;4", "correctness": "4;4;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;4;2;3", "wc_summary_paper": "70;73;104;181", "wc_summary_review": "25;68;57;46", "wc_main_review": "250;186;233;231", "wc_review": "345;327;394;458", "wc_reply_reviewers": "206;0;0;40", "wc_reply_authors": "902;95;646;287", "reply_reviewers": "2;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 107.0, 44.749301670528894 ], "wc_summary_review_avg": [ 49.0, 15.890248582070704 ], "wc_main_review_avg": [ 225.0, 23.695991222145572 ], "wc_review_avg": [ 381.0, 50.76908508137605 ], "wc_reply_reviewers_avg": [ 61.5, 85.01029349437631 ], "wc_reply_authors_avg": [ 482.5, 312.6855449169341 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6882472016116854, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11890020481997280688&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=tD7eCtaSkR", "email": "umd.edu;;umd.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "tDirSp3pczB", "title": "Sharp Learning Bounds for Contrastive Unsupervised Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contrastive unsupervised representation learning (CURL) encourages data representation to make semantically similar pairs closer than randomly drawn negative samples, which has been successful in various domains such as vision, language, and graphs. 
Although recent theoretical studies have attempted to explain its success by upper-bounding a downstream classification loss with the contrastive loss, these bounds are still not tight enough to explain an experimental fact: larger negative sample sizes improve classification performance. This study establishes a downstream classification loss bound with a tight intercept in the negative sample size. By regarding the contrastive loss as a downstream loss estimator, our theory not only improves the existing learning bounds substantially but also explains why downstream classification empirically improves with larger negative samples: the estimation variance of the downstream loss decays as the negative sample size grows. We verify that our theory is consistent with experiments on synthetic, vision, and language datasets.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/e15cbe233b6262d34c627bee13cf4ecb7362f7d7.zip", "author": "Han Bao;Yoshihiro Nagano;Kento Nozawa", "authorids": "~Han_Bao2;~Yoshihiro_Nagano1;~Kento_Nozawa2", "gender": "M;M;M", "homepage": "https://hermite.jp/;https://ganow.me;https://nzw0301.github.io/", "dblp": "120/1444-2;150/2847;215/4422", "google_scholar": "MqMzjeMAAAAJ;https://scholar.google.co.jp/citations?user=sp4ynWwAAAAJ;https://scholar.google.co.jp/citations?user=DSdjj8AAAAAJ", "orcid": "0000-0002-4473-2604;0000-0003-3863-9718;", "linkedin": ";;nozawa-kento-0301/", "or_profile": "~Han_Bao2;~Yoshihiro_Nagano1;~kento_nozawa1", "aff": "The University of Tokyo;The University of Tokyo;RIKEN AIP", "aff_domain": "u-tokyo.ac.jp;k.u-tokyo.ac.jp;riken.jp", "position": "PhD student;Postdoc;Research assistant", "bibtex": "@misc{\nbao2022sharp,\ntitle={Sharp Learning Bounds for Contrastive Unsupervised Representation Learning},\nauthor={Han Bao and Yoshihiro Nagano and Kento Nozawa},\nyear={2022},\nurl={https://openreview.net/forum?id=tDirSp3pczB}\n}", "github": "", "project": "", "reviewers": "fm83;d3WX;Dmou;unTX;URqh", "site": "https://openreview.net/forum?id=tDirSp3pczB", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "5;4;4;4;3", "correctness": "2;3;3;4;4", "technical_novelty": "2;1;3;3;3", "empirical_novelty": "2;0;1;3;3", "wc_summary_paper": "77;36;23;70;38", "wc_summary_review": "57;21;36;66;56", "wc_main_review": "535;306;320;282;141", "wc_review": "669;363;379;418;235", "wc_reply_reviewers": "0;41;68;24;0", "wc_reply_authors": "856;603;343;333;36", "reply_reviewers": "0;1;1;1;0", "reply_authors": "2;1;1;1;1", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.4, 0.8 ], "empirical_novelty_avg": [ 1.8, 1.1661903789690602 ], "wc_summary_paper_avg": [ 48.8, 20.932271735289508 ], "wc_summary_review_avg": [ 47.2, 16.36337373526621 ], "wc_main_review_avg": [ 316.8, 126.37626359407847 ], "wc_review_avg": [ 412.8, 142.0568900124172 ], "wc_reply_reviewers_avg": [ 26.6, 25.858074174230378 ], "wc_reply_authors_avg": [ 434.2, 276.95876949466685 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.2, 0.4000000000000001 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": 0.8728715609439696, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11946384253364083692&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of
Tokyo;RIKEN", "aff_unique_dep": ";Advanced Institute for Computational Science", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.aip.riken.jp", "aff_unique_abbr": "UTokyo;RIKEN AIP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "tDw7Mmat8co", "title": "Towards Safe Reinforcement Learning via Constraining Conditional Value-at-Risk", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Though deep reinforcement learning (DRL) has achieved substantial success, it may encounter catastrophic failures due to the intrinsic uncertainty caused by stochasticity in both environments and policies. Existing safe reinforcement learning methods are often based on transforming the optimization criterion and adopting the variance of the return as a measure of uncertainty. However, the return variance introduces a bias for penalizing both positive and negative risk equally, deviating from the purpose of safe reinforcement learning, which is to penalize negative risk only. To address this issue, we propose to use the conditional value-at-risk (CVaR) as an assessment of risk, which guarantees that the probability of reaching a catastrophic state is below a desired threshold. Furthermore, we present a novel reinforcement learning framework of CVaR-Proximal-Policy-Optimization (CPPO) which formalizes the risk-sensitive constrained optimization problem by keeping its CVaR under a given threshold. To evaluate the robustness of policies, we theoretically prove that performance degradation under observation disturbance and transition disturbance depends on the gap in the value function between the best and worst states. We also show that CPPO can generate more robust policies under disturbance. Experimental results show that CPPO achieves higher cumulative reward and exhibits stronger robustness against observation disturbance and transition disturbance on a series of continuous control tasks in MuJoCo.
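The risk measure itself is simple to state: the empirical CVaR at level alpha is the mean of the worst alpha-fraction of returns. A toy sketch of just this estimator (not CPPO's constrained policy update) follows.

```python
# Hedged sketch: empirical CVaR of a batch of returns. Unlike the variance,
# this penalizes only the bad (left) tail, which is the motivation above.
import numpy as np

def cvar(returns, alpha=0.1):
    """Average of the lowest alpha-fraction of returns."""
    k = max(1, int(np.ceil(alpha * len(returns))))
    worst = np.sort(returns)[:k]
    return worst.mean()

returns = np.random.normal(loc=10.0, scale=3.0, size=1000)  # toy rollouts
print(cvar(returns, alpha=0.1))   # well below the mean: tail-sensitive
```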
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengyang Ying;Xinning Zhou;Hang Su;Dong Yan;Jun Zhu", "authorids": "~Chengyang_Ying1;~Xinning_Zhou1;~Hang_Su3;~Dong_Yan1;~Jun_Zhu2", "gender": "M;;M;M;M", "homepage": "https://yingchengyang.github.io/;;;http://ml.cs.tsinghua.edu.cn/~jun;", "dblp": "296/2065;293/3297;20/7834;50/2644-1;26/5371-6", "google_scholar": "vM6KE18AAAAJ;lWRfV70AAAAJ;lvztRUkAAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ", "orcid": ";;0000-0003-0641-8988;;", "linkedin": "%E9%93%96%E9%98%B3-%E5%BA%94-9b682a203/;%E8%BE%9B%E5%AE%81-%E5%91%A8-5a49821a3/;;;", "or_profile": "~Chengyang_Ying1;~Xinning_Zhou1;~Dong_Yan1;~Jun_Zhu2;~Hang_Su2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Researcher;Professor;Associate Professor", "bibtex": "@misc{\nying2022towards,\ntitle={Towards Safe Reinforcement Learning via Constraining Conditional Value-at-Risk},\nauthor={Chengyang Ying and Xinning Zhou and Hang Su and Dong Yan and Jun Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=tDw7Mmat8co}\n}", "github": "", "project": "", "reviewers": "7bUY;mgKd;DmK1;NUHu", "site": "https://openreview.net/forum?id=tDw7Mmat8co", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;4;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;2;4", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "34;45;74;44", "wc_summary_review": "210;32;48;71", "wc_main_review": "221;621;900;317", "wc_review": "465;698;1022;432", "wc_reply_reviewers": "0;655;0;0", "wc_reply_authors": "211;584;895;243", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;4;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 49.25, 14.922717580923388 ], "wc_summary_review_avg": [ 90.25, 70.51373979587241 ], "wc_main_review_avg": [ 514.75, 266.974132642097 ], "wc_review_avg": [ 654.25, 235.77783504816563 ], "wc_reply_reviewers_avg": [ 163.75, 283.62331973940366 ], "wc_reply_authors_avg": [ 483.25, 279.0738029625855 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13174018823837396058&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "tFQyjbOz34", "title": "Detecting Modularity in Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "A neural network is modular to the extent that parts of its computational graph (i.e. structure) can be represented as performing some comprehensible subtask relevant to the overall task (i.e. functionality). Are modern deep neural networks modular? How can this be quantified? 
In this paper, we consider the problem of assessing the modularity exhibited by a partitioning of a network's neurons. We propose two proxies for this: importance, which reflects how crucial sets of neurons are to network performance; and coherence, which reflects how consistently their neurons associate with features of the inputs. To measure these proxies, we develop a set of statistical methods based on techniques conventionally used to interpret individual neurons. We apply the proxies to partitionings generated by spectrally clustering a graph representation of the network's neurons with edges determined either by network weights or correlations of activations. We show that these partitionings, even ones based only on weights (i.e. strictly from non-runtime analysis), reveal groups of neurons that are important and coherent. These results suggest that graph-based partitioning can reveal modularity and help us understand how deep neural networks function.", "keywords": "Modularity;clustering;interpretability;feature visualization;lesions", "primary_area": "", "supplementary_material": "/attachment/a1a77e04ff7465b24ba923e5f395f43d931f09c2.zip", "author": "Shlomi Hod;Stephen Casper;Daniel Filan;Cody Wild;Andrew Critch;Stuart Russell", "authorids": "~Shlomi_Hod1;~Stephen_Casper1;~Daniel_Filan1;~Cody_Wild1;~Andrew_Critch1;~Stuart_Russell1", "gender": ";M;M;M;M;F", "homepage": "https://shlomi.hod.xyz;https://stephencasper.com/;https://danielfilan.com/;http://acritch.com/;https://people.eecs.berkeley.edu/~russell/;https://scholar.google.com/citations?user=VcsUv5kAAAAJ&hl=en", "dblp": ";255/5295.html;;;;", "google_scholar": "s_WPt74AAAAJ;N4aglP4AAAAJ;9eoaiXMAAAAJ;F3_yOXUAAAAJ;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;VcsUv5kAAAAJ", "orcid": "0000-0002-0387-4542;0000-0003-0084-1937;;;;", "linkedin": "shlomi-hod/;;;acritch;;", "or_profile": "~Shlomi_Hod1;~Stephen_Casper1;~Daniel_Filan1;~Andrew_Critch1;~Stuart_Russell1;~Cody_Wild2", "aff": "Boston University;Massachusetts Institute of Technology;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "bu.edu;mit.edu;berkeley.edu;berkeley.edu;berkeley.edu;google.com", "position": "PhD student;Graduate Student;PhD student;Postdoc;Full Professor;Research Engineer", "bibtex": "@misc{\nhod2022detecting,\ntitle={Detecting Modularity in Deep Neural Networks},\nauthor={Shlomi Hod and Stephen Casper and Daniel Filan and Cody Wild and Andrew Critch and Stuart Russell},\nyear={2022},\nurl={https://openreview.net/forum?id=tFQyjbOz34}\n}", "github": "", "project": "", "reviewers": "ZAxP;3KTK;d1u7;hnkg", "site": "https://openreview.net/forum?id=tFQyjbOz34", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;3;2", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "223;225;100;45", "wc_summary_review": "61;138;51;14", "wc_main_review": "331;400;321;270", "wc_review": "615;763;472;329", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "451;790;733;582", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 148.25, 78.2092545674743 ], "wc_summary_review_avg": [ 66.0, 45.10543204537564 ], "wc_main_review_avg": [ 330.5, 46.31684358848301 
], "wc_review_avg": [ 544.75, 161.56171421472354 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 639.0, 132.50471689717313 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15515041428459186741&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;2;2;3", "aff_unique_norm": "Boston University;Massachusetts Institute of Technology;University of California, Berkeley;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.bu.edu;https://web.mit.edu;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "BU;MIT;UC Berkeley;Google", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning curves for continual learning in neural networks: Self-knowledge transfer and forgetting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6690", "id": "tFgdrQbbaa", "poster": "", "openreview": "https://openreview.net/forum?id=tFgdrQbbaa", "slides": "https://iclr.cc/virtual/2022/poster/6690", "video": "https://iclr.cc/virtual/2022/poster/6690", "author_site": "Ryo Karakida, Shotaro Akaho", "tldr": "", "abstract": "Sequential training from task to task is becoming one of the major objects in deep learning applications such as continual learning and transfer learning. Nevertheless, it remains unclear under what conditions the trained model's performance improves or deteriorates. To deepen our understanding of sequential training, this study provides a theoretical analysis of generalization performance in a solvable case of continual learning. We consider neural networks in the neural tangent kernel (NTK) regime that continually learn target functions from task to task, and investigate the generalization by using an established statistical mechanical analysis of kernel ridge-less regression. We first show characteristic transitions from positive to negative transfer. More similar targets above a specific critical value can achieve positive knowledge transfer for the subsequent task while catastrophic forgetting occurs even with very similar targets. Next, we investigate a variant of continual learning which supposes the same target function in multiple tasks. Even for the same target, the trained model shows some transfer and forgetting depending on the sample size of each task. We can guarantee that the generalization error monotonically decreases from task to task for equal sample sizes while unbalanced sample sizes deteriorate the generalization. We respectively refer to these improvement and deterioration as self-knowledge transfer and forgetting, and empirically confirm them in realistic training of deep neural networks as well. 
", "keywords": "continual learning;neural tangent kernel;statistical mechanics", "primary_area": "", "supplementary_material": "", "author": "Ryo Karakida;Shotaro Akaho", "authorids": "~Ryo_Karakida2;s.akaho@aist.go.jp", "gender": "M;", "homepage": "https://sites.google.com/view/ryokarakida/english;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Ryo_Karakida2;s.akaho@aist.go.jp", "aff": "AIST, National Institute of Advanced Industrial Science and Technology;", "aff_domain": "aist.go.jp;", "position": "Researcher;", "bibtex": "@inproceedings{\nkarakida2022learning,\ntitle={Learning curves for continual learning in neural networks: Self-knowledge transfer and forgetting},\nauthor={Ryo Karakida and Shotaro Akaho},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tFgdrQbbaa}\n}", "github": "", "project": "", "reviewers": "aqmt;2847;spQ7;ZHBA", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;3;4;2", "correctness": "2;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;0;3;0", "wc_summary_paper": "89;48;79;67", "wc_summary_review": "53;53;46;42", "wc_main_review": "270;404;226;388", "wc_review": "412;505;351;497", "wc_reply_reviewers": "87;57;0;21", "wc_reply_authors": "867;475;330;208", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 70.75, 15.270478054075452 ], "wc_summary_review_avg": [ 48.5, 4.716990566028302 ], "wc_main_review_avg": [ 322.0, 75.82875444051551 ], "wc_review_avg": [ 441.25, 63.58606372468735 ], "wc_reply_reviewers_avg": [ 41.25, 33.36446462930284 ], "wc_reply_authors_avg": [ 470.0, 247.9304337914166 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5976860535245280730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=tFgdrQbbaa", "email": "aist.go.jp;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "National Institute of Advanced Industrial Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.aist.go.jp", "aff_unique_abbr": "AIST", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "id": "tG8QrhMwEqS", "title": "Adaptive Activation-based Structured Pruning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pruning is a promising approach to compress complex deep learning models in order to deploy them on resource-constrained edge devices. However, many existing pruning solutions are based on unstructured pruning, which yield models that cannot efficiently run on commodity hardware, and require users to manually explore and tune the pruning process, which is time consuming and often leads to sub-optimal results. To address these limitations, this paper presents an adaptive, activation-based, structured pruning approach to automatically and efficiently generate small, accurate, and hardware-efficient models that meet user requirements. 
First, it proposes iterative structured pruning using activation-based attention feature maps to effectively identify and prune unimportant filters. Then, it proposes adaptive pruning policies for automatically meeting the pruning objectives of accuracy-critical, memory-constrained, and latency-sensitive tasks. A comprehensive evaluation shows that the proposed method can substantially outperform the state-of-the-art structured pruning works on CIFAR-10 and ImageNet datasets. For example, on ResNet-56 with CIFAR-10, without any accuracy drop, our method achieves the largest parameter reduction (79.11%), outperforming the related works by 22.81% to 66.07%, and the largest FLOPs reduction (70.13%), outperforming the related works by 14.13% to 26.53%.", "keywords": "model compression;structured pruning", "primary_area": "", "supplementary_material": "", "author": "Kaiqi Zhao;Animesh Jain;Ming Zhao", "authorids": "~Kaiqi_Zhao2;~Animesh_Jain1;~Ming_Zhao2", "gender": ";;Not Specified", "homepage": ";;https://visa.asu.edu/ming", "dblp": ";;z/MingZhao2", "google_scholar": ";;pAcF2lEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Kaiqi_Zhao2;~Animesh_Jain1;~Ming_Zhao2", "aff": ";;Arizona State University", "aff_domain": ";;asu.edu", "position": ";;Associate Professor", "bibtex": "@misc{\nzhao2022adaptive,\ntitle={Adaptive Activation-based Structured Pruning},\nauthor={Kaiqi Zhao and Animesh Jain and Ming Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=tG8QrhMwEqS}\n}", "github": "", "project": "", "reviewers": "XNxE;YdeV;EiLE", "site": "https://openreview.net/forum?id=tG8QrhMwEqS", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "5;3;3", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "71;68;21", "wc_summary_review": "159;31;24", "wc_main_review": "195;313;143", "wc_review": "425;412;188", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 53.333333333333336, 22.895899681432528 ], "wc_summary_review_avg": [ 71.33333333333333, 62.055530687352025 ], "wc_main_review_avg": [ 217.0, 71.12430433168866 ], "wc_review_avg": [ 341.6666666666667, 108.78827551206467 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18172894526539444178&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "tHx6q2dM86s", "title": "HYPOCRITE: Homoglyph Adversarial Examples for Natural Language Web Services in the Physical World", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, as Artificial Intelligence (AI) develops, many companies in various industries are trying to use AI by grafting it into their domains.\nAlso, for these companies, various cloud companies (e.g., 
Amazon, Google, IBM, and Microsoft) are providing AI services in the form of Machine-Learning-as-a-Service (MLaaS).\nHowever, although these AI services are very advanced and well-made, security vulnerabilities such as adversarial examples still exist, which can interfere with normal AI services.\nThis paper presents HYPOCRITE, an attack for hypocrisy that generates homoglyph adversarial examples for natural language web services in the physical world. Such hypocrisy can disrupt the normal AI services provided by the cloud companies.\nThe key idea of HYPOCRITE is to replace English characters with other international characters that look similar to them, in order to introduce noise into the data given to the AI engines.\nBy using this key idea, parts of a text can be appropriately replaced with subtext carrying malicious meaning through black-box attacks on natural language web services in order to cause misclassification.\nTo show the attack potential of HYPOCRITE, this paper implements a framework that generates homoglyph adversarial examples for natural language web services in the physical world and evaluates its performance under various conditions.\nThrough extensive experiments, it is shown that HYPOCRITE is more effective than other baselines in terms of both attack success rate and perturbed ratio.", "keywords": "Adversarial Examples;Homograph;Natural Language;Web Services;Physical World", "primary_area": "", "supplementary_material": "/attachment/d86869c3fd0085a1153e8ea0b40971ec40b6b00c.zip", "author": "JINYONG KIM;JEONGHYEON KIM;MOSE GU;SANGHAK OHH;GILTEUN CHOI;JAEHOON JEONG", "authorids": "~JINYONG_KIM1;jeonghyeon92@skku.edu;rna0415@g.skku.edu;sanghak@skku.edu;gilteun@pusan.ac.kr;pauljeong@skku.edu", "gender": ";;;;;", "homepage": "https://github.com/wlsdyd0930;;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~JINYONG_KIM1;jeonghyeon92@skku.edu;rna0415@g.skku.edu;sanghak@skku.edu;gilteun@pusan.ac.kr;pauljeong@skku.edu", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nkim2022hypocrite,\ntitle={{HYPOCRITE}: Homoglyph Adversarial Examples for Natural Language Web Services in the Physical World},\nauthor={JINYONG KIM and JEONGHYEON KIM and MOSE GU and SANGHAK OHH and GILTEUN CHOI and JAEHOON JEONG},\nyear={2022},\nurl={https://openreview.net/forum?id=tHx6q2dM86s}\n}", "github": "", "project": "", "reviewers": "JKRL;QX9M;y6Cv", "site": "https://openreview.net/forum?id=tHx6q2dM86s", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;3;4", "correctness": "4;3;2", "technical_novelty": "1;1;2", "empirical_novelty": "1;2;2", "wc_summary_paper": "20;40;52", "wc_summary_review": "140;20;165", "wc_main_review": "19;169;80", "wc_review": "179;229;297", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 37.333333333333336, 13.199326582148887 ], "wc_summary_review_avg": [ 108.33333333333333, 63.28945848682508 ], "wc_main_review_avg": [ 89.33333333333333, 61.5918465022405 ], "wc_review_avg": [ 235.0, 48.359762888859024 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], 
"authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8559456667271999080&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "tJCwZBHm-jW", "title": "Image2Point: 3D Point-Cloud Understanding with 2D Image Pretrained Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "3D point-clouds and 2D images are different visual representations of the physical world. While human vision can understand both representations, computer vision models designed for 2D image and 3D point-cloud understanding are quite different.\nOur paper explores the potential for transferring between these two representations by empirically investigating the feasibility of the transfer, the benefits of the transfer, and shedding light on why the transfer works.\nWe discovered that we can indeed use the same architecture and pretrained weights of a neural net model to understand both images and point-clouds. Specifically, we can transfer the pretrained image model to a point-cloud model by \\textit{inflating} 2D convolutional filters to 3D and then \\textbf{f}inetuning the \\textbf{i}mage-\\textbf{p}retrained models (FIP). \nWe discover that, surprisingly, models with minimal finetuning efforts --- only on input, output, and optionally batch normalization layers, can achieve competitive performance on 3D point-cloud classification, beating a wide range of point-cloud models that adopt task-specific architectures and use a variety of tricks. When finetuning the whole model, the performance further improves significantly. Meanwhile, we also find that FIP improves data efficiency, achieving up to 10.0 points top-1 accuracy gain on few-shot classification. 
It also speeds up the training of point-cloud models by up to 11.1x to reach a target accuracy.", "keywords": "Computer vision;Point-cloud;Cross-modality.", "primary_area": "", "supplementary_material": "/attachment/b7cee42a391d796ffd417e3c05642c97ab72d0dd.zip", "author": "Chenfeng Xu;Shijia Yang;Bohan Zhai;Bichen Wu;Xiangyu Yue;Wei Zhan;Peter Vajda;Kurt Keutzer;Masayoshi Tomizuka", "authorids": "~Chenfeng_Xu1;~Shijia_Yang1;~Bohan_Zhai1;~Bichen_Wu1;~Xiangyu_Yue1;~Wei_Zhan2;~Peter_Vajda1;~Kurt_Keutzer3;~Masayoshi_Tomizuka2", "gender": "M;F;M;M;M;;;;", "homepage": ";;;;http://xyue.io/;;https://sites.google.com/site/vajdap;;", "dblp": "65/1881;;;130/1371;207/7518;;44/5953;;", "google_scholar": "RpqvaTUAAAAJ;;TAbgR14AAAAJ;K3QJPdMAAAAJ;-xQ-C1sAAAAJ;;k8QB5VUAAAAJ;;", "orcid": "0000-0002-4941-6985;;;;;;;;", "linkedin": ";bronya-shijia-yang-762927193/;;bichenwu/;;;p%C3%A9ter-vajda-9a03aaa/;;", "or_profile": "~Chenfeng_Xu1;~Shijia_Yang1;~Bohan_Zhai1;~Bichen_Wu1;~Xiangyu_Yue1;~Wei_Zhan2;~Peter_Vajda1;~Kurt_Keutzer3;~Masayoshi_Tomizuka2", "aff": "University of California, Berkeley;University of California, Berkeley;Walmart Labs;Meta Facebook;University of California, Berkeley;;Meta;;", "aff_domain": "berkeley.edu;berkeley.edu;walmartlabs.com;fb.com;berkeley.edu;;meta.com;;", "position": "PhD student;Undergrad student;Researcher;Research Scientist;PhD student;;Researcher;;", "bibtex": "@misc{\nxu2022imagepoint,\ntitle={Image2Point: 3D Point-Cloud Understanding with 2D Image Pretrained Models},\nauthor={Chenfeng Xu and Shijia Yang and Bohan Zhai and Bichen Wu and Xiangyu Yue and Wei Zhan and Peter Vajda and Kurt Keutzer and Masayoshi Tomizuka},\nyear={2022},\nurl={https://openreview.net/forum?id=tJCwZBHm-jW}\n}", "github": "", "project": "", "reviewers": "SBBb;2iHv;KdMv;DFK1;aajt", "site": "https://openreview.net/forum?id=tJCwZBHm-jW", "pdf_size": 0, "recommendation": "6;6;6;6;6", "confidence": "5;3;4;5;3", "correctness": "4;3;4;3;3", "technical_novelty": "4;1;2;3;2", "empirical_novelty": "4;3;2;4;2", "wc_summary_paper": "53;92;57;53;69", "wc_summary_review": "39;53;65;41;54", "wc_main_review": "230;160;167;127;229", "wc_review": "322;305;289;221;352", "wc_reply_reviewers": "0;89;0;0;0", "wc_reply_authors": "342;470;747;256;1499", "reply_reviewers": "0;1;0;0;0", "reply_authors": "2;1;1;1;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.8944271909999159 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 1.019803902718557 ], "empirical_novelty_avg": [ 3.0, 0.8944271909999159 ], "wc_summary_paper_avg": [ 64.8, 14.810806865258893 ], "wc_summary_review_avg": [ 50.4, 9.499473669630333 ], "wc_main_review_avg": [ 182.6, 40.60837352074077 ], "wc_review_avg": [ 297.8, 43.70537724353835 ], "wc_reply_reviewers_avg": [ 17.8, 35.6 ], "wc_reply_authors_avg": [ 662.8, 449.8646018526019 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10948848992729179424&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1;2;0;2", "aff_unique_norm": "University of California, Berkeley;Walmart;Meta", "aff_unique_dep": ";Walmart Labs;Meta Platforms, Inc.", "aff_unique_url": "https://www.berkeley.edu;https://www.walmart.com;https://meta.com", "aff_unique_abbr": "UC Berkeley;Walmart Labs;Meta", 
"aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "tJhIY38d2TS", "title": "Local Reweighting for Adversarial Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Instances-reweighted adversarial training (IRAT) can significantly boost the robustness of trained models, where data being less/more vulnerable to the given attack are assigned smaller/larger weights during training. However, when tested on attacks different from the given attack simulated in training, the robustness may drop significantly (e.g., even worse than no reweighting). In this paper, we study this problem and propose our solution--locally reweighted adversarial training (LRAT). The rationale behind IRAT is that we do not need to pay much attention to an instance that is already safe under the attack. We argue that the safeness should be attack-dependent, so that for the same instance, its weight can change given different attacks based on the same model. Thus, if the attack simulated in training is mis-specified, the weights of IRAT are misleading. To this end, LRAT pairs each instance with its adversarial variants and performs local reweighting inside each pair, while performing no global reweighting--the rationale is to fit the instance itself if it is immune to the attack, but not to skip the pair, in order to passively defend different attacks in future. Experiments show that LRAT works better than both IRAT (i.e., global reweighting) and the standard AT (i.e., no reweighting) when trained with an attack and tested on different attacks.", "keywords": "adversarial training", "primary_area": "", "supplementary_material": "", "author": "Ruize Gao;Feng Liu;Kaiwen Zhou;Gang Niu;Bo Han;James Cheng", "authorids": "~Ruize_Gao1;~Feng_Liu2;~Kaiwen_Zhou2;~Gang_Niu1;~Bo_Han1;~James_Cheng2", "gender": "M;M;M;M;M;M", "homepage": "https://sjtubrian.github.io/;https://fengliu90.github.io/index.html;https://jnhujnhu.github.io/;https://niug1984.github.io;https://www.cse.cuhk.edu.hk/~jcheng/;https://bhanml.github.io/", "dblp": "180/4683.html;77/1318-3;215/4936;26/3367-1;06/4171;241/0472-3", "google_scholar": "https://scholar.google.ch/citations?hl=en;https://scholar.google.com/citations?hl=en;nHmlZ5QAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;;nTNjqHwAAAAJ", "orcid": ";0000-0002-5005-9129;;;;", "linkedin": ";alexfengliu;;;;", "or_profile": "~Ruize_Gao1;~Feng_Liu2;~Kaiwen_Zhou2;~Gang_Niu1;~James_Cheng2;~bo_han2", "aff": "The Chinese University of Hong Kong;University of Technology Sydney;The Chinese University of Hong Kong;RIKEN;The Chinese University of Hong Kong;Microsoft Research", "aff_domain": "cuhk.edu.hk;uts.edu.au;cuhk.edu.hk;riken.jp;cuhk.edu.hk;microsoft.com", "position": "Researcher;Assistant Professor;PhD student;Research Scientist (tenured);Associate Professor;Researcher", "bibtex": "@misc{\ngao2022local,\ntitle={Local Reweighting for Adversarial Training},\nauthor={Ruize Gao and Feng Liu and Kaiwen Zhou and Gang Niu and Bo Han and James Cheng},\nyear={2022},\nurl={https://openreview.net/forum?id=tJhIY38d2TS}\n}", "github": "", "project": "", "reviewers": "vQ9i;qiUn;AgZp;kFgS", "site": "https://openreview.net/forum?id=tJhIY38d2TS", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;4", "correctness": "3;1;2;3", "technical_novelty": "2;1;3;3", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "53;25;62;44", "wc_summary_review": "62;186;43;27", 
"wc_main_review": "212;262;223;128", "wc_review": "327;473;328;199", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 46.0, 13.693063937629153 ], "wc_summary_review_avg": [ 79.5, 62.723600024233306 ], "wc_main_review_avg": [ 206.25, 48.84861819949465 ], "wc_review_avg": [ 331.75, 96.96745588082632 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8156821074794320044&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2;0;3", "aff_unique_norm": "Chinese University of Hong Kong;University of Technology Sydney;RIKEN;Microsoft", "aff_unique_dep": ";;;Microsoft Research", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.uts.edu.au;https://www.riken.jp;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CUHK;UTS;RIKEN;MSR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0;2;0;3", "aff_country_unique": "China;Australia;Japan;United States" }, { "id": "tJtOObu7Hxk", "title": "FINDING AND FIXING SPURIOUS PATTERNS WITH EXPLANATIONS", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Machine learning models often use spurious patterns such as \"relying on the presence of a person to detect a tennis racket,\" which do not generalize. In this work, we present an end-to-end pipeline for identifying and mitigating spurious patterns for image classifiers. We start by finding patterns such as \"the model's prediction for tennis racket changes 63% of the time if we hide the people.\" Then, if a pattern is spurious, we mitigate it via a novel form of data augmentation. 
We demonstrate that this approach identifies a diverse set of spurious patterns and that it mitigates them by producing a model that is both more accurate on a distribution where the spurious pattern is not helpful and more robust to distribution shift.", "keywords": "explainability;interpretability;debugging;spurious patterns;spurious correlations;image classification", "primary_area": "", "supplementary_material": "/attachment/2d8e2a6806a9bdd89d381827b27ef6b991e3423e.zip", "author": "Gregory Plumb;Marco Tulio Ribeiro;Ameet Talwalkar", "authorids": "~Gregory_Plumb2;~Marco_Tulio_Ribeiro1;~Ameet_Talwalkar1", "gender": ";M;M", "homepage": "https://gdplumb.github.io;;http://www.cs.cmu.edu/~atalwalk/", "dblp": ";21/10105;56/5528", "google_scholar": "_f4rfHYAAAAJ;rmsIyGMAAAAJ;https://scholar.google.com.tw/citations?user=TW7U1W0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Gregory_Plumb2;~Marco_Tulio_Ribeiro1;~Ameet_Talwalkar1", "aff": "Carnegie Mellon University;Microsoft;Carnegie Mellon University", "aff_domain": "cmu.edu;microsoft.com;cmu.edu", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@misc{\nplumb2022finding,\ntitle={{FINDING} {AND} {FIXING} {SPURIOUS} {PATTERNS} {WITH} {EXPLANATIONS}},\nauthor={Gregory Plumb and Marco Tulio Ribeiro and Ameet Talwalkar},\nyear={2022},\nurl={https://openreview.net/forum?id=tJtOObu7Hxk}\n}", "github": "", "project": "", "reviewers": "A7is;2ab5;vjTa;KriC", "site": "https://openreview.net/forum?id=tJtOObu7Hxk", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;4;3;3", "correctness": "1;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "50;92;56;67", "wc_summary_review": "11;56;48;69", "wc_main_review": "158;417;398;272", "wc_review": "219;565;502;408", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 66.25, 16.068213964221414 ], "wc_summary_review_avg": [ 46.0, 21.552262062252307 ], "wc_main_review_avg": [ 311.25, 104.56427449181676 ], "wc_review_avg": [ 423.5, 130.61871994473074 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865476, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12335961958039209773&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com", "aff_unique_abbr": "CMU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "tP7AnumqyjB", "title": "Deep Semi-Supervised 3D Shape Reconstruction by Solving a Poisson Equation with Spectral Methods", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper we propose a deep learning method for unsupervised 3D implicit shape reconstruction from point clouds. 
Our goal is to approximate 3D shapes as the iso-surface of a scalar field that is the solution of a Poisson partial differential equation. We propose a neural network architecture that learns the distance field in the Fourier domain and solves the PDE by using spectral differentiation through two novel loss functions. Our experiments show that our architecture can efficiently learn the Fourier coefficients while accurately estimating the target distance field. We train our models without any ground-truth mesh, scalar distance field values, or surface normals.", "keywords": "Poisson equation;3D reconstruction;Physics informed machine learning;Semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Diego Patino;Carlos Esteves;Kostas Daniilidis", "authorids": "~Diego_Patino1;~Carlos_Esteves1;~Kostas_Daniilidis1", "gender": "M;M;M", "homepage": "http://dipaco.github.io;http://machc.github.io;http://www.cis.upenn.edu/~kostas", "dblp": "52/2647;206/6834;d/KostasDaniilidis", "google_scholar": "LVZDLrwAAAAJ;cFFrCF0AAAAJ;dGs2BcIAAAAJ", "orcid": "0000-0003-4808-8411;0000-0001-9413-1201;0000-0003-0498-0758", "linkedin": "dipaco/;machc;", "or_profile": "~Diego_Patino1;~Carlos_Esteves1;~Kostas_Daniilidis1", "aff": "University of Pennsylvania;Google;University of Pennsylvania", "aff_domain": "upenn.edu;google.com;upenn.edu", "position": "Postdoc;Research Scientist;Full Professor", "bibtex": "@misc{\npatino2022deep,\ntitle={Deep Semi-Supervised 3D Shape Reconstruction by Solving a Poisson Equation with Spectral Methods},\nauthor={Diego Patino and Carlos Esteves and Kostas Daniilidis},\nyear={2022},\nurl={https://openreview.net/forum?id=tP7AnumqyjB}\n}", "github": "", "project": "", "reviewers": "bQi2;88jS;TJrN;Diy3;BmJS", "site": "https://openreview.net/forum?id=tP7AnumqyjB", "pdf_size": 0, "recommendation": "1;3;3;3;5", "confidence": "4;4;4;5;4", "correctness": "3;2;2;2;2", "technical_novelty": "2;2;2;3;3", "empirical_novelty": "2;2;3;0;3", "wc_summary_paper": "57;90;37;164;70", "wc_summary_review": "21;81;35;122;63", "wc_main_review": "324;242;288;503;336", "wc_review": "402;413;360;789;469", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.0, 1.2649110640673518 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 83.6, 43.747457068954304 ], "wc_summary_review_avg": [ 64.4, 35.61797299117399 ], "wc_main_review_avg": [ 338.6, 88.47508123760046 ], "wc_review_avg": [ 486.6, 155.14973412803516 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.7905694150420949, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FdWVdmwub9AJ:scholar.google.com/&scioq=Deep+Semi-Supervised+3D+Shape+Reconstruction+by+Solving+a+Poisson+Equation+with+Spectral+Methods&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Pennsylvania;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.upenn.edu;https://www.google.com", "aff_unique_abbr": "UPenn;Google", "aff_campus_unique_index": "1", "aff_campus_unique": 
";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "tQ2yZj4sCnk", "title": "Divergence-Regularized Multi-Agent Actor-Critic", "track": "main", "status": "Reject", "tldr": "", "abstract": "Entropy regularization is a popular method in reinforcement learning (RL). Although it has many advantages, it alters the RL objective and makes the converged policy deviate from the optimal policy of the original Markov Decision Process. Though divergence regularization has been proposed to settle this problem, it cannot be trivially applied to cooperative multi-agent reinforcement learning (MARL). In this paper, we investigate divergence regularization in cooperative MARL and propose a novel off-policy cooperative MARL framework, divergence-regularized multi-agent actor-critic (DMAC). Mathematically, we derive the update rule of DMAC which is naturally off-policy, guarantees a monotonic policy improvement and is not biased by the regularization. DMAC is a flexible framework and can be combined with many existing MARL algorithms. We evaluate DMAC in a didactic stochastic game and StarCraft Multi-Agent Challenge and empirically show that DMAC substantially improves the performance of existing MARL algorithms. ", "keywords": "multi-agent reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Su Kefan;Zongqing Lu", "authorids": "~Su_Kefan2;~Zongqing_Lu2", "gender": "M;", "homepage": "https://github.com/sukefan;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Su_Kefan2;~Zongqing_Lu2", "aff": "Peking University, Tsinghua University;", "aff_domain": "pku.edu.cn;", "position": "PhD student;", "bibtex": "@misc{\nkefan2022divergenceregularized,\ntitle={Divergence-Regularized Multi-Agent Actor-Critic},\nauthor={Su Kefan and Zongqing Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=tQ2yZj4sCnk}\n}", "github": "", "project": "", "reviewers": "GUrq;sxWU;b5Li;nt66", "site": "https://openreview.net/forum?id=tQ2yZj4sCnk", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;3;3;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;4;2", "wc_summary_paper": "96;45;64;79", "wc_summary_review": "40;43;59;124", "wc_main_review": "579;176;222;175", "wc_review": "715;264;345;378", "wc_reply_reviewers": "0;0;71;0", "wc_reply_authors": "1229;474;329;391", "reply_reviewers": "0;0;1;0", "reply_authors": "3;2;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 71.0, 18.801595676963167 ], "wc_summary_review_avg": [ 66.5, 33.974254958718376 ], "wc_main_review_avg": [ 288.0, 169.07838418910916 ], "wc_review_avg": [ 425.5, 172.2128044019956 ], "wc_reply_reviewers_avg": [ 17.75, 30.74390183434757 ], "wc_reply_authors_avg": [ 605.75, 363.49234861273214 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2549353071483835929&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": 
"http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "tRfoq5xfU4f", "title": "Source-Free Few-Shot Domain Adaptation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep models are prone to performance degradation when there is a domain shift between the source (training) data and target (test) data. Test-time adaptation of pre-trained source models with streaming unlabelled target data is an attractive setting that protects the privacy of source data, but it has mini-batch size and class-distribution requirements on the streaming data which might not be desirable in practice. In this paper, we propose the source-free few-shot adaptation setting to address these practical challenges in deploying test-time adaptation. Specifically, we propose a constrained optimization of source model batch normalization layers by finetuning linear combination coefficients between training and support statistics. The proposed method is easy to implement and improves source model performance with as little as one labelled target sample per class. We evaluate on different multi-domain classification datasets. Experiments demonstrate that our proposed method achieves comparable or better performance than test-time adaptation, while not constrained by streaming conditions.", "keywords": "domain adaptation;few-shot learning;model finetuning", "primary_area": "", "supplementary_material": "", "author": "Wenyu Zhang;Li Shen;Chuan-Sheng Foo;Wanyue Zhang", "authorids": "~Wenyu_Zhang1;~Li_Shen8;~Chuan-Sheng_Foo1;~Wanyue_Zhang1", "gender": ";F;M;F", "homepage": "https://sites.coecis.cornell.edu/wenyuzhang/;https://sites.google.com/site/dawnlishen/home;http://ai.stanford.edu/~csfoo;https://rosettawyzhang.github.io/", "dblp": "12/53-3;91/3680-3;73/1823;260/9519", "google_scholar": "yJpx_WoAAAAJ;https://scholar.google.com.sg/citations?user=4q9vNxQAAAAJ;AgbeqGkAAAAJ;U1ngwqMAAAAJ", "orcid": ";;0000-0002-4748-5792;", "linkedin": ";;;", "or_profile": "~Wenyu_Zhang1;~Li_Shen8;~Chuan-Sheng_Foo1;~Wanyue_Zhang2", "aff": "I2R, A*STAR;Institute for Infocomm Research (I2R), A*STAR;Institute for Infocomm Research, A*STAR;Saarland Informatics Campus, Max-Planck Institute", "aff_domain": "i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;mpi-inf.mpg.de", "position": "Scientist;Scientist;Scientist;PhD student", "bibtex": "@misc{\nzhang2022sourcefree,\ntitle={Source-Free Few-Shot Domain Adaptation},\nauthor={Wenyu Zhang and Li Shen and Chuan-Sheng Foo and Wanyue Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=tRfoq5xfU4f}\n}", "github": "", "project": "", "reviewers": "U2g1;EKyc;qww1;Vwbo;RBZK;5J2h", "site": "https://openreview.net/forum?id=tRfoq5xfU4f", "pdf_size": 0, "recommendation": "3;3;5;5;5;5", "confidence": "5;5;4;4;4;5", "correctness": "4;2;3;3;3;3", "technical_novelty": "2;2;3;3;2;2", "empirical_novelty": "2;2;3;3;3;2", "wc_summary_paper": "58;32;57;98;49;83", "wc_summary_review": "61;21;36;31;45;73", "wc_main_review": "568;181;321;464;446;395", "wc_review": "687;234;414;593;540;551", "wc_reply_reviewers": "0;0;33;0;57;0", "wc_reply_authors": "502;509;283;465;1194;396", "reply_reviewers": "0;0;1;0;1;0", "reply_authors": "1;1;1;1;2;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.5773502691896257 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 
62.833333333333336, 21.767075034454116 ], "wc_summary_review_avg": [ 44.5, 17.755280904564703 ], "wc_main_review_avg": [ 395.8333333333333, 121.48994014137779 ], "wc_review_avg": [ 503.1666666666667, 144.86248268233183 ], "wc_reply_reviewers_avg": [ 15.0, 22.315913604421397 ], "wc_reply_authors_avg": [ 558.1666666666666, 294.5479115903255 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.1666666666666667, 0.37267799624996495 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7071067811865476, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xHRE2lr4Qn4J:scholar.google.com/&scioq=Source-Free+Few-Shot+Domain+Adaptation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "A*STAR;Institute for Infocomm Research;Max-Planck Institute", "aff_unique_dep": "Institute for Infocomm Research;;Informatics", "aff_unique_url": "https://www.a-star.edu.sg;https://www.i2r.a-star.edu.sg;https://www.mpi-sws.org", "aff_unique_abbr": "A*STAR;I2R;MPI-SWS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Saarland", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Singapore;Germany" }, { "title": "Towards Deepening Graph Neural Networks: A GNTK-based Optimization Perspective", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6585", "id": "tT9t_ZctZRL", "poster": "", "openreview": "https://openreview.net/forum?id=tT9t_ZctZRL", "slides": "https://iclr.cc/virtual/2022/poster/6585", "video": "https://iclr.cc/virtual/2022/poster/6585", "author_site": "Wei Huang, Yayong Li, weitao du, Richard Xu, Jie Yin, Ling Chen, Miao Zhang", "tldr": "", "abstract": "Graph convolutional networks (GCNs) and their variants have achieved great success in dealing with graph-structured data. Nevertheless, it is well known that deep GCNs suffer from the over-smoothing problem, where node representations tend to be indistinguishable as more layers are stacked up. The theoretical research to date on deep GCNs has focused primarily on expressive power rather than trainability, an optimization perspective. Compared to expressivity, trainability attempts to address a more fundamental question: Given a sufficiently expressive space of models, can we successfully find a good solution via gradient descent-based optimizers? This work fills this gap by exploiting the Graph Neural Tangent Kernel (GNTK), which governs the optimization trajectory under gradient descent for wide GCNs. We formulate the asymptotic behaviors of GNTK in the large depth, which enables us to reveal the dropping trainability of wide and deep GCNs at an exponential rate in the optimization process. Additionally, we extend our theoretical framework to analyze residual connection-based techniques, which are found to be merely able to mitigate the exponential decay of trainability mildly. Inspired by our theoretical insights on trainability, we propose Critical DropEdge, a connectivity-aware and graph-adaptive sampling method, to alleviate the exponential decay problem more fundamentally. Experimental evaluation consistently confirms using our proposed method can achieve better results compared to relevant counterparts with both infinite-width and finite-width. 
", "keywords": "Trainablity;Graph Neural Tangent Kernel;Critical DropEdge", "primary_area": "", "supplementary_material": "/attachment/8b211406b7fd41568ddaea218e2e51e50fc1dd0f.zip", "author": "Wei Huang;Yayong Li;weitao Du;Richard Xu;Jie Yin;Ling Chen;Miao Zhang", "authorids": "~Wei_Huang6;~Yayong_Li1;~weitao_Du1;~Richard_Xu1;~Jie_Yin3;~Ling_Chen5;~Miao_Zhang4", "gender": "M;M;M;F;F;M;M", "homepage": "https://weihuang05.github.io/;;;https://www.sydney.edu.au/business/about/our-people/academic-staff/jie-yin.html;https://profiles.uts.edu.au/Ling.Chen;https://sites.google.com/view/miaozhang;https://www.math.hkbu.edu.hk/people/xu-yi-da/", "dblp": "81/6685-34;;17/10015;97/3358;17/1237-6;60/7041-1.html;38/3064", "google_scholar": "RZfDh4MAAAAJ;XKWOXE8AAAAJ;;-vxkP70AAAAJ;https://scholar.google.com.au/citations?user=L5aYWQcAAAAJ;6EUV_UMAAAAJ;ykOUWa4AAAAJ", "orcid": "0000-0001-5674-7021;0000-0003-2534-1971;;;0000-0002-6468-5729;0000-0002-1262-4174;0000-0003-2080-4762", "linkedin": ";;;;;miao-zhang-71b13a177/;richard-xu-0221a943/", "or_profile": "~Wei_Huang6;~Yayong_Li1;~weitao_Du1;~Jie_Yin3;~Ling_Chen5;~Miao_Zhang4;~Richard_Yi_Da_Xu1", "aff": "RIKEN AIP;, CSIRO;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;The University of Sydney;University of Technology Sydney;Aalborg University, Aalborg University;Hong Kong Baptist University", "aff_domain": "riken.jp;data61.csiro.au;amss.ac.cn;sydney.edu.au;uts.edu.au;cs.aau.dk;hkbu.edu.hk", "position": "Postdoc;Postdoc;Postdoc;Associate Professor;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2022towards,\ntitle={Towards Deepening Graph Neural Networks: A {GNTK}-based Optimization Perspective},\nauthor={Wei Huang and Yayong Li and weitao Du and Richard Xu and Jie Yin and Ling Chen and Miao Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tT9t_ZctZRL}\n}", "github": "", "project": "", "reviewers": "UKok;KNdY;sX4L;CDt6;Bi6F", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "3;3;2;4;2", "correctness": "3;3;3;3;4", "technical_novelty": "3;3;2;3;3", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "88;75;68;115;123", "wc_summary_review": "36;37;37;133;74", "wc_main_review": "632;146;224;403;293", "wc_review": "756;258;329;651;490", "wc_reply_reviewers": "137;12;0;0;39", "wc_reply_authors": "1620;651;737;948;1130", "reply_reviewers": "1;1;0;0;1", "reply_authors": "5;2;2;3;2", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 2.8, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 93.8, 21.70161284328886 ], "wc_summary_review_avg": [ 63.4, 37.68607169764447 ], "wc_main_review_avg": [ 339.6, 168.8390949987591 ], "wc_review_avg": [ 496.8, 187.71829958744033 ], "wc_reply_reviewers_avg": [ 37.6, 51.70145065663051 ], "wc_reply_authors_avg": [ 1017.2, 344.5271542273555 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.8, 1.16619037896906 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.4909902530309828, "corr_recommendation_correctness": 0.9185586535436918, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10160869672579499592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": 
"https://openreview.net/pdf?id=tT9t_ZctZRL", "email": "riken.jp;data61.csiro.au;amss.ac.cn;sydney.edu.au;uts.edu.au;cs.aau.dk;hkbu.edu.hk", "author_num": 7, "aff_unique_index": "0;1;2;3;4;5;6", "aff_unique_norm": "RIKEN;CSIRO;Chinese Academy of Sciences;University of Sydney;University of Technology Sydney;Aalborg University;Hong Kong Baptist University", "aff_unique_dep": "Advanced Institute for Computational Science;;Academy of Mathematics and Systems Science;;;;", "aff_unique_url": "https://www.aip.riken.jp;https://www.csiro.au;http://www.cas.cn;https://www.sydney.edu.au;https://www.uts.edu.au;https://www.aau.dk;https://www.hkbu.edu.hk", "aff_unique_abbr": "RIKEN AIP;CSIRO;CAS;USYD;UTS;AAU;HKBU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Aalborg;Hong Kong SAR", "aff_country_unique_index": "0;1;2;1;1;3;2", "aff_country_unique": "Japan;Australia;China;Denmark" }, { "title": "Efficient Computation of Deep Nonlinear Infinite-Width Neural Networks that Learn Features", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5924", "id": "tUMr0Iox8XW", "poster": "", "openreview": "https://openreview.net/forum?id=tUMr0Iox8XW", "slides": "https://iclr.cc/virtual/2022/poster/5924", "video": "https://iclr.cc/virtual/2022/poster/5924", "author_site": "Greg Yang, Michael Santacroce, Edward Hu", "tldr": "", "abstract": "While a popular limit of infinite-width neural networks, the Neural Tangent Kernel (NTK) often exhibits performance gaps from finite-width neural networks on standard datasets, due to lack of feature learning. Although the feature learning *maximal update limit*, or *\u03bc-limit* (Yang and Hu, 2020) of wide networks has closed the gap for 1-hidden-layer linear models, no one has been able to demonstrate this for deep nonlinear multi-layer perceptrons (MLP) because of \u03bc-limit\u2019s computational difficulty in this setting. Here, we solve this problem by proposing a novel feature learning limit, the *\u03c0-limit*, that bypasses the computational issues. The \u03c0-limit, in short, is the limit of a form of projected gradient descent, and the \u03c0-limit of an MLP is roughly another MLP where gradients are appended to weights during training. We prove its almost sure convergence with width using the Tensor Programs technique. We evaluate it on CIFAR10 and Omniglot against NTK as well as finite networks, finding the \u03c0-limit outperform finite-width models trained normally (without projection) in both settings, closing the performance gap between finite- and infinite-width neural networks previously left by NTK. 
Code for this work is available at github.com/santacml/pilim.", "keywords": "infinite-width neural network;feature learning;maximal update parametrization;NTK", "primary_area": "", "supplementary_material": "", "author": "Greg Yang;Michael Santacroce;Edward J Hu", "authorids": "~Greg_Yang1;michael.santacroce@microsoft.com;~Edward_J_Hu1", "gender": "M;;M", "homepage": ";;https://edwardjhu.com", "dblp": "153/2097;;295/8436", "google_scholar": "Xz4RAJkAAAAJ;;2eADy_8AAAAJ", "orcid": ";;", "linkedin": ";;edwardjhu/", "or_profile": "~Greg_Yang1;michael.santacroce@microsoft.com;~Edward_J_Hu1", "aff": "Microsoft;;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al", "aff_domain": "microsoft.com;;mila.umontreal.ca", "position": "Researcher;;PhD student", "bibtex": "@inproceedings{\nyang2022efficient,\ntitle={Efficient Computation of Deep Nonlinear Infinite-Width Neural Networks that Learn Features},\nauthor={Greg Yang and Michael Santacroce and Edward J Hu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tUMr0Iox8XW}\n}", "github": "", "project": "", "reviewers": "pNHj;MBzR;emnb;JW6k", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;3;4;3", "correctness": "3;3;2;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;2;3;4", "wc_summary_paper": "155;72;67;72", "wc_summary_review": "52;84;33;65", "wc_main_review": "196;601;1228;370", "wc_review": "403;757;1328;507", "wc_reply_reviewers": "84;24;611;352", "wc_reply_authors": "711;1601;2725;731", "reply_reviewers": "1;1;2;1", "reply_authors": "2;4;6;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 91.5, 36.718523935474316 ], "wc_summary_review_avg": [ 58.5, 18.607794065928395 ], "wc_main_review_avg": [ 598.75, 390.6708173129905 ], "wc_review_avg": [ 748.75, 358.3241374789033 ], "wc_reply_reviewers_avg": [ 267.75, 233.50414878541238 ], "wc_reply_authors_avg": [ 1442.0, 823.2939936620454 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.5, 1.6583123951777 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1233508317588857803&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=tUMr0Iox8XW", "email": "microsoft.com;;mila.umontreal.ca", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;University of Montreal", "aff_unique_dep": "Microsoft Corporation;Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.microsoft.com;https://www.mila.quebec", "aff_unique_abbr": "Microsoft;MILA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Canada" }, { "title": "On the Certified Robustness for Ensemble Models and Beyond", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6874", "id": "tUa4REjGjTf", "poster": "", "openreview": "https://openreview.net/forum?id=tUa4REjGjTf", "slides": "https://iclr.cc/virtual/2022/poster/6874", "video": "https://iclr.cc/virtual/2022/poster/6874", 
"author_site": "Zhuolin Yang, Linyi Li, Xiaojun Xu, Bhavya Kailkhura, Tao Xie, Bo Li", "tldr": "", "abstract": "Recent studies show that deep neural networks (DNN) are vulnerable to adversarial examples, which aim to mislead DNNs by adding perturbations with small magnitude. To defend against such attacks, both empirical and theoretical defense approaches have been extensively studied for a single ML model. In this work, we aim to analyze and provide the certified robustness for ensemble ML models, together with the sufficient and necessary conditions of robustness for different ensemble protocols. Although ensemble models are shown more robust than a single model empirically; surprisingly, we find that in terms of the certified robustness the standard ensemble models only achieve marginal improvement compared to a single model. Thus, to explore the conditions that guarantee to provide certifiably robust ensemble ML models, we first prove that diversified gradient and large confidence margin are sufficient and necessary conditions for certifiably robust ensemble models under the model-smoothness assumption. We then provide the bounded model-smoothness analysis based on the proposed Ensemble-before-Smoothing strategy. We also prove that an ensemble model can always achieve higher certified robustness than a single base model under mild conditions. Inspired by the theoretical findings, we propose the lightweight Diversity Regularized Training (DRT) to train certifiably robust ensemble ML models. Extensive experiments show that our DRT enhanced ensembles can consistently achieve higher certified robustness than existing single and ensemble ML models, demonstrating the state-of-the-art certified $L_2$-robustness on MNIST, CIFAR-10, and ImageNet datasets.", "keywords": "robustness;ensemble;certified robustness", "primary_area": "", "supplementary_material": "/attachment/1d17f01507583bb24234968daf2daf5bcc0829f6.zip", "author": "Zhuolin Yang;Linyi Li;Xiaojun Xu;Bhavya Kailkhura;Tao Xie;Bo Li", "authorids": "~Zhuolin_Yang1;~Linyi_Li1;~Xiaojun_Xu1;~Bhavya_Kailkhura1;~Tao_Xie4;~Bo_Li19", "gender": "M;M;M;M;M;F", "homepage": "https://lucas110550.github.io/about;http://linyil.com;;https://people.llnl.gov/kailkhura1;https://taoxiease.github.io/;http://boli.cs.illinois.edu/", "dblp": ";99/4340-1.html;;132/8938;x/TaoXie;50/3402-26", "google_scholar": "BvSv-C0AAAAJ;-b0sk-YAAAAJ;rdMZZQwAAAAJ;SQpJmOgAAAAJ;DhhH9J4AAAAJ;K8vJkTcAAAAJ", "orcid": ";;;;0000-0002-6731-216X;", "linkedin": ";;;;;", "or_profile": "~Zhuolin_Yang1;~Linyi_Li1;~Xiaojun_Xu1;~Bhavya_Kailkhura1;~Tao_Xie4;~Bo_Li19", "aff": "University of Illinois at Urbana Champaign;Microsoft Research;University of Illinois, Urbana Champaign;Lawrence Livermore National Laboratory;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;microsoft.com;illinois.edu;llnl.gov;illinois.edu;illinois.edu", "position": "PhD student;Research Intern;PhD student;Research Staff;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2022on,\ntitle={On the Certified Robustness for Ensemble Models and Beyond},\nauthor={Zhuolin Yang and Linyi Li and Xiaojun Xu and Bhavya Kailkhura and Tao Xie and Bo Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tUa4REjGjTf}\n}", "github": "", "project": "", "reviewers": "VR8o;AZhe;ug5z;f6eQ;mt7k", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "3;4;4;3;4", "correctness": "4;3;3;4;4", 
"technical_novelty": "3;3;3;4;3", "empirical_novelty": "2;4;4;4;3", "wc_summary_paper": "47;95;58;60;59", "wc_summary_review": "26;61;3;51;36", "wc_main_review": "111;282;74;264;105", "wc_review": "184;438;135;375;200", "wc_reply_reviewers": "41;0;0;75;13", "wc_reply_authors": "814;1657;521;1630;171", "reply_reviewers": "1;0;0;1;1", "reply_authors": "2;3;1;3;1", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 3.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 3.4, 0.8 ], "wc_summary_paper_avg": [ 63.8, 16.28987415543779 ], "wc_summary_review_avg": [ 35.4, 20.185143051264216 ], "wc_main_review_avg": [ 167.2, 87.47891174448846 ], "wc_review_avg": [ 266.4, 118.07218131295788 ], "wc_reply_reviewers_avg": [ 25.8, 28.7986110776197 ], "wc_reply_authors_avg": [ 958.6, 595.1902552965732 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 2.0, 0.8944271909999159 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.16666666666666663, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8592223056112126154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=tUa4REjGjTf", "email": "illinois.edu;microsoft.com;illinois.edu;llnl.gov;illinois.edu;illinois.edu", "author_num": 6, "aff_unique_index": "0;1;0;2;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft;Lawrence Livermore National Laboratory", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com/en-us/research;https://www.llnl.gov", "aff_unique_abbr": "UIUC;MSR;LLNL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Natural Posterior Network: Deep Bayesian Predictive Uncertainty for Exponential Family Distributions", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6370", "id": "tV3N0DWMxCg", "poster": "", "openreview": "https://openreview.net/forum?id=tV3N0DWMxCg", "slides": "https://iclr.cc/virtual/2022/poster/6370", "video": "https://iclr.cc/virtual/2022/poster/6370", "author_site": "Bertrand Charpentier, Oliver Borchert, Daniel Z\u00fcgner, Simon Geisler, Stephan G\u00fcnnemann", "tldr": "", "abstract": "Uncertainty awareness is crucial to develop reliable machine learning models. In this work, we propose the Natural Posterior Network (NatPN) for fast and high-quality uncertainty estimation for any task where the target distribution belongs to the exponential family. Thus, NatPN finds application for both classification and general regression settings. Unlike many previous approaches, NatPN does not require out-of-distribution (OOD) data at training time. Instead, it leverages Normalizing Flows to fit a single density on a learned low-dimensional and task-dependent latent space. For any input sample, NatPN uses the predicted likelihood to perform a Bayesian update over the target distribution. Theoretically, NatPN assigns high uncertainty far away from training data. 
Empirically, our extensive experiments on calibration and OOD detection show that NatPN delivers highly competitive performance for classification, regression and count prediction tasks.", "keywords": "Uncertainty;Exponential Family;Bayesian Update;Conjugate Prior", "primary_area": "", "supplementary_material": "", "author": "Bertrand Charpentier;Oliver Borchert;Daniel Z\u00fcgner;Simon Geisler;Stephan G\u00fcnnemann", "authorids": "~Bertrand_Charpentier2;~Oliver_Borchert1;~Daniel_Z\u00fcgner1;~Simon_Geisler1;~Stephan_G\u00fcnnemann1", "gender": ";M;M;M;M", "homepage": "https://sharpenb.github.io/;;;http://www.daml.in.tum.de;https://www.in.tum.de/en/daml/team/simon-geisler/", "dblp": "222/1875;;172/6951;43/3011;237/0253", "google_scholar": "0rqI-ycAAAAJ;;;;00x9jJwAAAAJ", "orcid": ";;;;0000-0003-0867-1856", "linkedin": "bertrand-charpentier-76995ab6/;https://linkedin.com/in/borchero;;;simon-geisler-ai/", "or_profile": "~Bertrand_Charpentier2;~Oliver_Borchert1;~Daniel_Z\u00fcgner1;~Stephan_G\u00fcnnemann1;~Simon_Markus_Geisler1", "aff": "Technical University Munich;;Technical University Munich;Technical University Munich;Google", "aff_domain": "tum.de;;tum.de;tum.de;google.com", "position": "PhD student;;PhD student;Professor;Intern", "bibtex": "@inproceedings{\ncharpentier2022natural,\ntitle={Natural Posterior Network: Deep Bayesian Predictive Uncertainty for Exponential Family Distributions},\nauthor={Bertrand Charpentier and Oliver Borchert and Daniel Z{\\\"u}gner and Simon Geisler and Stephan G{\\\"u}nnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tV3N0DWMxCg}\n}", "github": "", "project": "", "reviewers": "qjTd;x9Pt;QsKU;cKhx;pzkS", "pdf_size": 0, "recommendation": "6;6;8;8;8", "confidence": "4;4;3;3;3", "correctness": "3;3;4;3;3", "technical_novelty": "2;3;4;3;3", "empirical_novelty": "2;3;4;2;3", "wc_summary_paper": "59;66;59;229;168", "wc_summary_review": "47;47;54;183;122", "wc_main_review": "863;542;322;828;656", "wc_review": "969;655;435;1240;946", "wc_reply_reviewers": "0;178;113;77;53", "wc_reply_authors": "1224;965;1191;1666;854", "reply_reviewers": "0;1;1;1;1", "reply_authors": "2;2;3;3;2", "recommendation_avg": [ 7.2, 0.9797958971132712 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.8, 0.7483314773547882 ], "wc_summary_paper_avg": [ 116.2, 69.95827327771892 ], "wc_summary_review_avg": [ 90.6, 54.15754795040115 ], "wc_main_review_avg": [ 642.2, 198.0064645409336 ], "wc_review_avg": [ 849.0, 277.74880737817756 ], "wc_reply_reviewers_avg": [ 84.2, 59.569791001815666 ], "wc_reply_authors_avg": [ 1180.0, 279.55464582081265 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 2.4, 0.4898979485566356 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8571414309595111557&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=tV3N0DWMxCg", "email": "tum.de;;tum.de;tum.de;google.com", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Technical University of Munich;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;Google", 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Germany;United States" }, { "title": "Non-Transferable Learning: A New Approach for Model Ownership Verification and Applicability Authorization", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6143", "id": "tYRrOdSnVUy", "poster": "", "openreview": "https://openreview.net/forum?id=tYRrOdSnVUy", "slides": "https://iclr.cc/virtual/2022/poster/6143", "video": "https://iclr.cc/virtual/2022/poster/6143", "author_site": "Lixu Wang, Shichao Xu, Ruiqi Xu, Xiao Wang, Qi Zhu", "tldr": "", "abstract": "As Artificial Intelligence as a Service gains popularity, protecting well-trained models as intellectual property is becoming increasingly important. There are two common types of protection methods: ownership verification and usage authorization. In this paper, we propose Non-Transferable Learning (NTL), a novel approach that captures the exclusive data representation in the learned model and restricts the model generalization ability to certain domains. This approach provides effective solutions to both model verification and authorization. Specifically: 1) For ownership verification, watermarking techniques are commonly used but are often vulnerable to sophisticated watermark removal methods. By comparison, our NTL-based ownership verification provides robust resistance to state-of-the-art watermark removal methods, as shown in extensive experiments with 6 removal approaches over the digits, CIFAR10 & STL10, and VisDA datasets. 2) For usage authorization, prior solutions focus on authorizing specific users to access the model, but authorized users can still apply the model to any data without restriction. Our NTL-based authorization approach instead provides data-centric protection, which we call applicability authorization, by significantly degrading the performance of the model on unauthorized data. Its effectiveness is also shown through experiments on aforementioned datasets. 
", "keywords": "Domain Adaptation;Transfer Learning;Societal Considerations of Representation Learning;Model Watermark", "primary_area": "", "supplementary_material": "", "author": "Lixu Wang;Shichao Xu;Ruiqi Xu;Xiao Wang;Qi Zhu", "authorids": "~Lixu_Wang1;~Shichao_Xu1;~Ruiqi_Xu1;~Xiao_Wang11;~Qi_Zhu2", "gender": ";M;M;M;", "homepage": ";;;https://wangxiao1254.github.io/;http://zhulab.ece.northwestern.edu/", "dblp": ";161/2413;;150/9413;66/5923-2.html", "google_scholar": ";S-9u3vYAAAAJ;;QbWLR8QAAAAJ;TN09YMcAAAAJ", "orcid": ";;;;", "linkedin": ";;jerry-xu-ruiqi/;;", "or_profile": "~Lixu_Wang1;~Shichao_Xu1;~Ruiqi_Xu1;~Xiao_Wang11;~Qi_Zhu2", "aff": ";Northwestern University;Northwestern University, Northwestern University;Northwestern University;Northwestern University", "aff_domain": ";northwestern.edu;u.northwestern.edu;northwestern.edu;northwestern.edu", "position": ";PhD student;Undergrad student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2022nontransferable,\ntitle={Non-Transferable Learning: A New Approach for Model Ownership Verification and Applicability Authorization},\nauthor={Lixu Wang and Shichao Xu and Ruiqi Xu and Xiao Wang and Qi Zhu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tYRrOdSnVUy}\n}", "github": "", "project": "", "reviewers": "RSPY;1wBe;FcxC", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "5;2;4", "correctness": "4;4;4", "technical_novelty": "4;3;4", "empirical_novelty": "4;0;4", "wc_summary_paper": "113;60;193", "wc_summary_review": "18;47;43", "wc_main_review": "206;205;370", "wc_review": "337;312;606", "wc_reply_reviewers": "13;0;38", "wc_reply_authors": "918;724;689", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 1.8856180831641267 ], "wc_summary_paper_avg": [ 122.0, 54.668699149208464 ], "wc_summary_review_avg": [ 36.0, 12.832251036613439 ], "wc_main_review_avg": [ 260.3333333333333, 77.54711829304526 ], "wc_review_avg": [ 418.3333333333333, 133.0922821036425 ], "wc_reply_reviewers_avg": [ 17.0, 15.769168230019828 ], "wc_reply_authors_avg": [ 777.0, 100.72073603119998 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9579671006829239762&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=tYRrOdSnVUy", "email": ";northwestern.edu;u.northwestern.edu;northwestern.edu;northwestern.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Methods for Logical Reasoning over Knowledge Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5976", "id": "tgcAoUVHRIB", "poster": "", "openreview": "https://openreview.net/forum?id=tgcAoUVHRIB", "slides": 
"https://iclr.cc/virtual/2022/poster/5976", "video": "https://iclr.cc/virtual/2022/poster/5976", "author_site": "Alfonso Amayuelas, Shuai Zhang, Xi Rao, Ce Zhang", "tldr": "", "abstract": "Reasoning is a fundamental problem for computers and deeply studied in Artificial Intelligence. In this paper, we specifically focus on answering multi-hop logical queries on Knowledge Graphs (KGs). This is a complicated task because, in real world scenarios, the graphs tend to be large and incomplete. Most previous works have been unable to create models that accept full First-Order Logical (FOL) queries, which includes negative queries, and have only been able to process a limited set of query structures. Additionally, most methods present logic operators that can only perform the logical operation they are made for. We introduce a set of models that use Neural Networks to create one-point vector embeddings to answer the queries. The versatility of neural networks allows the framework to handle FOL queries with Conjunction, Disjunction and Negation operators. We demonstrate experimentally the performance of our models through extensive experimentation on well-known benchmarking datasets. Besides having more versatile operators, the models achieve a 10% relative increase over best performing state of the art and more than 30% over the original method based on single-point vector embeddings.", "keywords": "Knowledge Graphs;Knowledge Graph Reasoning;Graph Mining;Data Mining;Machine Learning;Artificial Intelligence;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Alfonso Amayuelas;Shuai Zhang;Xi Susie Rao;Ce Zhang", "authorids": "~Alfonso_Amayuelas2;~Shuai_Zhang7;~Xi_Susie_Rao1;~Ce_Zhang1", "gender": "M;F;;M", "homepage": "https://www.amayuelas.me/;https://susierao.github.io/;;http://shuaizhang.tech/", "dblp": "281/7669;279/3727;97/919;71/208-7", "google_scholar": "https://scholar.google.dk/citations?user=QGQ2G28AAAAJ;9Bes4hAAAAAJ;;https://scholar.google.com.au/citations?user=PPjdxlcAAAAJ", "orcid": ";0000-0003-2379-1506;;0000-0002-7866-4611", "linkedin": "alfonsoamayuelas/;susie-xi-rao-a0199b15b/?original_referer=https%3A%2F%2Fwww%2Egoogle%2Ecom%2F&originalSubdomain=ch;;shuai-zhang-b3762911b/", "or_profile": "~Alfonso_Amayuelas2;~Xi_Susie_Rao1;~Ce_Zhang1;~SHUAI_Zhang5", "aff": "University of California, Santa Barbara;Swiss Federal Institute of Technology;University of Chicago;Swiss Federal Institute of Technology", "aff_domain": "ucsb.edu;ethz.ch;uchicago.edu;ethz.ch", "position": "PhD student;PhD student;Associate Professor;Postdoc", "bibtex": "@inproceedings{\namayuelas2022neural,\ntitle={Neural Methods for Logical Reasoning over Knowledge Graphs},\nauthor={Alfonso Amayuelas and Shuai Zhang and Xi Susie Rao and Ce Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tgcAoUVHRIB}\n}", "github": "", "project": "", "reviewers": "vpE2;rkaY;ukvD;zRD3", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;3;4;3", "correctness": "4;3;2;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "34;13;19;109", "wc_summary_review": "43;17;22;26", "wc_main_review": "230;141;266;135", "wc_review": "307;171;307;270", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "709;788;534;400", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 
0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 43.75, 38.440701086218496 ], "wc_summary_review_avg": [ 27.0, 9.772410142846033 ], "wc_main_review_avg": [ 193.0, 56.49336244197189 ], "wc_review_avg": [ 263.75, 55.63890275697392 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 607.75, 151.11316124017787 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.28867513459481287, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11327310359192902619&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=tgcAoUVHRIB", "email": "ucsb.edu;ethz.ch;uchicago.edu;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of California, Santa Barbara;Swiss Federal Institute of Technology;University of Chicago", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsb.edu;https://www.ethz.ch;https://www.uchicago.edu", "aff_unique_abbr": "UCSB;ETH Zurich;UChicago", "aff_campus_unique_index": "0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;Switzerland" }, { "id": "tge0BZv1Ay", "title": "PDQN - A Deep Reinforcement Learning Method for Planning with Long Delays: Optimization of Manufacturing Dispatching", "track": "main", "status": "Reject", "tldr": "", "abstract": "Scheduling is an important component in Semiconductor Manufacturing systems, where decisions must be made as to how to prioritize the use of finite machine resources to complete operations on parts in a timely manner. Traditionally, Operations Research methods have been used for simple, less complex systems. However, due to the complexity of this scheduling problem, simple dispatching rules such as Critical Ratio, and First-In-First-Out, are often used in practice in the industry for these more complex factories. This paper proposes a novel method based on Deep Reinforcement Learning for developing dynamic scheduling policies through interaction with simulated stochastic manufacturing systems. We experiment with simulated systems based on a complex Western Digital semiconductor plant. Our method builds upon DeepMind\u2019s Deep Q-network, and predictron methods to create a novel algorithm, Predictron Deep Q-network, which utilizes a predictron model as a trained planning model to create training targets for a Deep Q-Network based policy. In recent years, Deep Reinforcement Learning methods have shown state of the art performance on sequential decision-making processes in complex games such as Go. Semiconductor manufacturing systems, however, provide significant additional challenges due to complex dynamics, stochastic transitions, and long time horizons with the associated delayed rewards. In addition, dynamic decision policies need to account for uncertainties such as machine downtimes. 
Experimental results demonstrate that, in our simulated environments, the Predictron Deep Q-network outperforms the Deep Q-network, Critical Ratio, and First-In-First-Out dispatching policies on the task of minimizing lateness of parts.", "keywords": "This paper proposes the PDQN;a novel method based on integrating Deep Reinforcement Learning and abstract planning for developing dynamic scheduling for policies in systems with very delayed rewards;e.g. manufacturing.", "primary_area": "", "supplementary_material": "/attachment/fd50ab5ed01b8c187e7e78f69f345d50777899ed.zip", "author": "David C Jenkins;Ren\u00e9 Arendt S\u00f8rensen;Vikramank Singh;Philip Kaminsky;Anil Aswani;Ramakrishna Akella", "authorids": "~David_C_Jenkins1;~Ren\u00e9_Arendt_S\u00f8rensen1;~Vikramank_Singh1;~Philip_Kaminsky1;~Anil_Aswani1;~Ramakrishna_Akella2", "gender": ";M;M;M;;M", "homepage": ";;https://svikramank.github.io/;https://kaminsky.ieor.berkeley.edu/;;https://www.linkedin.com/in/ram-akella-a6a03a4/", "dblp": ";259/3417;;;08/1340;", "google_scholar": ";pHt2xfoAAAAJ;IRwawZcAAAAJ;;DzSZ5KAAAAAJ;", "orcid": ";0000-0001-6388-1202;;;;", "linkedin": "davidcjenkins1;renearendtsoerensen/;vikramanksingh/;;;", "or_profile": "~David_C_Jenkins1;~Ren\u00e9_Arendt_S\u00f8rensen1;~Vikramank_Singh1;~Philip_Kaminsky1;~Anil_Aswani1;~Ramakrishna_Akella2", "aff": ";Aarhus University;Learned Systems Group, AWS;University of California, Berkeley;University of California, Berkeley;", "aff_domain": ";au.dk;amazon.com;berkeley.edu;berkeley.edu;", "position": ";PhD student;Senior Applied Scientist;Full Professor;Associate Professor;", "bibtex": "@misc{\njenkins2022pdqn,\ntitle={{PDQN} - A Deep Reinforcement Learning Method for Planning with Long Delays: Optimization of Manufacturing Dispatching},\nauthor={David C Jenkins and Ren{\\'e} Arendt S{\\o}rensen and Vikramank Singh and Philip Kaminsky and Anil Aswani and Ramakrishna Akella},\nyear={2022},\nurl={https://openreview.net/forum?id=tge0BZv1Ay}\n}", "github": "", "project": "", "reviewers": "8ES3;QqV1;uJaW;ZLM1", "site": "https://openreview.net/forum?id=tge0BZv1Ay", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "2;3;4;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "45;81;175;80", "wc_summary_review": "21;97;26;235", "wc_main_review": "117;306;425;550", "wc_review": "183;484;626;865", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "559;534;727;1545", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 95.25, 48.27201570268223 ], "wc_summary_review_avg": [ 94.75, 86.37237695004116 ], "wc_main_review_avg": [ 349.5, 159.56895061383338 ], "wc_review_avg": [ 539.5, 246.78178619987335 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 841.25, 413.032913337424 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9Us2jhvKadAJ:scholar.google.com/&scioq=PDQN+-+A+Deep+Reinforcement+Learning+Method+for+Planning+with+Long+Delays:+Optimization+of+Manufacturing+Dispatching&hl=en&as_sdt=0,5", 
"gs_version_total": 0, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Aarhus University;Amazon;University of California, Berkeley", "aff_unique_dep": ";Learned Systems Group;", "aff_unique_url": "https://au.dk;https://aws.amazon.com;https://www.berkeley.edu", "aff_unique_abbr": "AU;AWS;UC Berkeley", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Denmark;United States" }, { "id": "tiKNfYpH8le", "title": "Pareto Navigation Gradient Descent: a First Order Algorithm for Optimization in Pareto Set", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many modern machine learning applications, such as multi-task learning, require finding optimal model parameters to trade-off multiple objective functions that may conflict with each other.\nThe notion of the Pareto set allows us to focus on the set of (often infinite number of)\nmodels that cannot be strictly improved. But it does not provide an actionable procedure for picking one or a few special models to return to practical users. In this paper, we consider \\emph{optimization in Pareto set (OPT-in-Pareto)}, the problem of finding Pareto models that optimize an extra reference criterion function within the Pareto set. This function can either encode a specific preference from the users, or represent a generic diversity measure for obtaining a set of diversified Pareto models that are representative of the whole Pareto set.\nUnfortunately, despite being a highly useful framework, efficient algorithms for OPT-in-Pareto have been largely missing, especially for large-scale, non-convex, and non-linear objectives in deep learning. A naive approach is to apply Riemannian manifold gradient descent on the Pareto set, which yields a high computational cost due to the need for eigen-calculation of Hessian matrices. We propose a first-order algorithm that approximately solves OPT-in-Pareto using only gradient information, with both high practical efficiency and theoretically guaranteed convergence property. 
Empirically, we demonstrate that our method works efficiently for a variety of challenging multi-task-related problems.", "keywords": "Pareto set;Multitask learning", "primary_area": "", "supplementary_material": "/attachment/3c386e72bd0ca3e37e1beddc25e7640792440cff.zip", "author": "Mao Ye;qiang liu", "authorids": "~Mao_Ye11;~qiang_liu4", "gender": "M;M", "homepage": "https://lushleaf.github.io/;https://www.cs.utexas.edu/~lqiang/", "dblp": "36/2301;61/3234-1", "google_scholar": "V5gL_H0AAAAJ;https://scholar.google.com.tw/citations?user=2qDh4WUAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Mao_Ye11;~Qiang_Liu1", "aff": "University of Texas, Austin;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nye2022pareto,\ntitle={Pareto Navigation Gradient Descent: a First Order Algorithm for Optimization in Pareto Set},\nauthor={Mao Ye and qiang liu},\nyear={2022},\nurl={https://openreview.net/forum?id=tiKNfYpH8le}\n}", "github": "", "project": "", "reviewers": "6MZF;ZUH7;p4mF;vNLC", "site": "https://openreview.net/forum?id=tiKNfYpH8le", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "5;3;2;3", "correctness": "2;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;0;3;2", "wc_summary_paper": "54;29;201;101", "wc_summary_review": "31;32;95;88", "wc_main_review": "440;170;431;174", "wc_review": "525;231;727;363", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "543;347;456;234", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 96.25, 65.76995894783575 ], "wc_summary_review_avg": [ 61.5, 30.103986446980738 ], "wc_main_review_avg": [ 303.75, 131.79600714740943 ], "wc_review_avg": [ 461.5, 185.3071774109141 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 395.0, 116.02801385872293 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": 0.8703882797784891, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=386686201917828984&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "tiQ5Zh2S3zV", "title": "A multi-domain splitting framework for time-varying graph structure", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Graph Signal Processing (GSP) methods are widely used to solve structured data analysis problems, assuming that the data structure is fixed. In the recent GSP community, anomaly detection on datasets with the time-varying structure is an open challenge. To address the anomaly detection problem for datasets with a spatial-temporal structure, in this work, we propose a novel graph multi-domain splitting framework, called GMDS, by integrating the time, vertex, and frequency features to locate the anomalies. 
Firstly, by introducing the discrete wavelet transform into the vertex function, we design a splitting approach for separating the graph sequences into several sub-sequences adaptively. Then, we specifically design an adjacency function in the vertex domain to generate the adjacency matrix adaptively. Finally, by applying the spectral graph wavelet transform to the learned graphs, we design a module to extract vertex features in the frequency domain. To validate the effectiveness of our framework, we apply GMDS to anomaly detection on real traffic-flow and urban datasets and compare its performance with acknowledged baselines. The experimental results show that our proposed framework outperforms all the baselines, which clearly demonstrates the validity of GMDS.", "keywords": "graph signal processing;time-varying structure;anomaly detection;multi-domain analysis;graph learning", "primary_area": "", "supplementary_material": "", "author": "Zehua Yu;Xianwei Zheng;Zhulun Yang;Xutao Li", "authorids": "~Zehua_Yu1;alex.w.zheng@hotmail.com;15zlyang3@stu.edu.cn;lixt@stu.edu.cn", "gender": "M;;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": "0000-0002-8517-1049;;;", "linkedin": ";;;", "or_profile": "~Zehua_Yu1;alex.w.zheng@hotmail.com;15zlyang3@stu.edu.cn;lixt@stu.edu.cn", "aff": "Shantou University;;;", "aff_domain": "stu.edu;;;", "position": "MS student;;;", "bibtex": "@inproceedings{\nyu2022a,\ntitle={A multi-domain splitting framework for time-varying graph structure},\nauthor={Zehua Yu and Xianwei Zheng and Zhulun Yang and Xutao Li},\nbooktitle={Submitted to The Tenth International Conference on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=tiQ5Zh2S3zV},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "atGb;N3uq;AXiN;DDjL;heAm", "site": "https://openreview.net/forum?id=tiQ5Zh2S3zV", "pdf_size": 0, "recommendation": "1;1;3;3;5", "confidence": "5;5;4;2;2", "correctness": "1;1;2;2;3", "technical_novelty": "1;1;1;2;2", "empirical_novelty": "0;0;1;1;2", "wc_summary_paper": "3;25;14;37;70", "wc_summary_review": "12;1;123;11;44", "wc_main_review": "43;1;202;194;68", "wc_review": "58;27;339;242;182", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 2.6, 1.4966629547095767 ], "confidence_avg": [ 3.6, 1.3564659966250536 ], "correctness_avg": [ 1.8, 0.7483314773547883 ], "technical_novelty_avg": [ 1.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 0.8, 0.7483314773547883 ], "wc_summary_paper_avg": [ 29.8, 23.059921942625913 ], "wc_summary_review_avg": [ 38.2, 44.79910713395971 ], "wc_main_review_avg": [ 101.6, 81.61029346840998 ], "wc_review_avg": [ 169.6, 115.65569592544934 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8669214468630108, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kY69MB2urjcJ:scholar.google.com/&scioq=A+multi-domain+splitting+framework+for+time-varying+graph+structure&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Shantou University", "aff_unique_dep": "", "aff_unique_url": "https://www.stu.edu.cn", "aff_unique_abbr": "STU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": 
"tiWbMTFS57A", "title": "A partial theory of Wide Neural Networks using WC functions and its practical implications", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We present a framework based on the theory of Polyak-\u0141ojasiewicz functions to explain the properties of convergence and generalization of overparameterized feed-forward neural networks. We introduce the class of Well-Conditioned (WC) reparameterizations, which are closed under composition and preserve the class of Polyak-\u0141ojasiewicz functions, thus enabling compositionality of the framework results which can be studied separately for each layer and in an architecture-neutral way. We show that overparameterized neural layers are WC and can therefore be composed to build easily optimizable functions. We expose a pointwise stability bound implying that overparameterization in WC models leads to a tighter convergence around a global minimizer. Our framework allows to derive quantitative estimates for the terms that govern the optimization process of neural networks. We leverage this aspect to empirically evaluate the predictions set forth by some relevant published theories concerning conditioning, training speed, and generalization of the neural networks training process. Our contribution aims to encourage the development of mixed theoretical-practical approaches, where the properties postulated by the theory can also find empirical confirmation.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/8e8e4acbc398879f9264c410ffa82d276a968d6c.zip", "author": "Dario Balboni;Davide Bacciu", "authorids": "~Dario_Balboni1;~Davide_Bacciu1", "gender": "M;M", "homepage": ";http://pages.di.unipi.it/bacciu/", "dblp": ";07/6626", "google_scholar": "zYIcJWgAAAAJ;https://scholar.google.it/citations?user=1d5n2WkAAAAJ", "orcid": ";0000-0001-5213-2468", "linkedin": ";bacciu/", "or_profile": "~Dario_Balboni1;~Davide_Bacciu1", "aff": "Scuola Normale Superiore;University of Pisa", "aff_domain": "sns.it;unipi.it", "position": "PhD student;Full Professor", "bibtex": "@misc{\nbalboni2022a,\ntitle={A partial theory of Wide Neural Networks using {WC} functions and its practical implications},\nauthor={Dario Balboni and Davide Bacciu},\nyear={2022},\nurl={https://openreview.net/forum?id=tiWbMTFS57A}\n}", "github": "", "project": "", "reviewers": "bcvt;XDc6;5idT;Qshe", "site": "https://openreview.net/forum?id=tiWbMTFS57A", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "3;3;3;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;2;4", "wc_summary_paper": "251;37;49;114", "wc_summary_review": "19;20;20;22", "wc_main_review": "193;241;554;333", "wc_review": "463;298;623;469", "wc_reply_reviewers": "76;151;157;85", "wc_reply_authors": "387;271;814;542", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 112.75, 85.02462878484093 ], "wc_summary_review_avg": [ 20.25, 1.0897247358851685 ], "wc_main_review_avg": [ 330.25, 138.63148091252577 ], "wc_review_avg": [ 463.25, 114.95732903995291 ], "wc_reply_reviewers_avg": [ 117.25, 36.948443810260805 ], "wc_reply_authors_avg": [ 503.5, 203.42136072694038 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], 
"authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gGJOB82UVzUJ:scholar.google.com/&scioq=A+partial+theory+of+Wide+Neural+Networks+using+WC+functions+and+its+practical+implications&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Scuola Normale Superiore;University of Pisa", "aff_unique_dep": ";", "aff_unique_url": "https://www.sns.it;https://www.unipi.it", "aff_unique_abbr": "SNS;UNIP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "id": "tk1eA4lvVRC", "title": "Robust Meta-learning with Sampling Noise and Label Noise via Eigen-Reptile", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent years have seen a surge of interest in meta-learning techniques for tackling the few-shot learning (FSL) problem. However, the meta-learner is prone to overfitting since there are only a few available samples with sampling noise on a clean dataset. More importantly, when handling the data sampled with noisy labels, meta-learner could be extremely sensitive to label noise on a corrupted dataset. To address these two challenges, we present Eigen-Reptile (ER) that updates the meta-parameters with the main direction of historical task-specific parameters to alleviate sampling and label noise. Specifically, the main direction is computed in a fast way for the required large-scale matrix. Furthermore, to obtain a more accurate main direction for Eigen-Reptile in the presence of label noise, we further propose Introspective Self-paced Learning (ISPL). We have theoretically and experimentally demonstrated the soundness and effectiveness of the proposed Eigen-Reptile and ISPL. 
Particularly, our experiments on different tasks show that the proposed method is able to outperform or achieve highly competitive performance compared with other gradient-based methods with or without noisy labels.", "keywords": "meta-learning;few-shot;noisy label", "primary_area": "", "supplementary_material": "/attachment/c6721b5822f5994d23cdaa70920e7085b57e8dd4.zip", "author": "Dong Chen;Lingfei Wu;Siliang Tang;Fangli Xu;Yun Xiao;Bo Long;Yueting Zhuang", "authorids": "~Dong_Chen5;~Lingfei_Wu1;~Siliang_Tang1;~Fangli_Xu1;~Yun_Xiao2;~Bo_Long1;~Yueting_Zhuang1", "gender": "M;;M;;M;M;M", "homepage": "https://anfeather.github.io;https://sites.google.com/view/teddy-lfwu/;https://person.zju.edu.cn/en/siliang;;;https://www.linkedin.com/in/bolonglinkedin/;https://person.zju.edu.cn/yzhuang", "dblp": ";27/9060;44/5693;89/10932;;96/6993.html;", "google_scholar": "yD-kDHEAAAAJ;https://scholar.google.com/citations?hl=en;8e7H3PcAAAAJ;;;;1RD7UJAAAAAJ", "orcid": "0000-0002-4859-1757;;0000-0002-7356-9711;;;;", "linkedin": ";;siliang-tang-4734272a/;;yun-xiao-75581326/;bolonglinkedin/;", "or_profile": "~Dong_Chen5;~Lingfei_Wu1;~Siliang_Tang1;~Fangli_Xu1;~Yun_Xiao2;~Bo_Long1;~Yueting_Zhuang1", "aff": "Zhejiang University;JD.COM Silicon Valley Research Center;Zhejiang University;;JD.COM Silicon Valley Research Center;Meta;Zhejiang University", "aff_domain": "zju.edu;jd.com;zju.edu.cn;;jd.com;meta.com;zju.edu.cn", "position": "PhD student;Principal Scientist;Full Professor;;Principal Scientist;Principal Researcher;Full Professor", "bibtex": "@misc{\nchen2022robust,\ntitle={Robust Meta-learning with Sampling Noise and Label Noise via Eigen-Reptile},\nauthor={Dong Chen and Lingfei Wu and Siliang Tang and Fangli Xu and Yun Xiao and Bo Long and Yueting Zhuang},\nyear={2022},\nurl={https://openreview.net/forum?id=tk1eA4lvVRC}\n}", "github": "", "project": "", "reviewers": "CUnc;YQso;X4Dj;tEPx", "site": "https://openreview.net/forum?id=tk1eA4lvVRC", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;2;3", "correctness": "4;2;2;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;2;4;0", "wc_summary_paper": "114;74;116;119", "wc_summary_review": "51;17;24;62", "wc_main_review": "719;278;434;589", "wc_review": "884;369;574;770", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 105.75, 18.417043736713012 ], "wc_summary_review_avg": [ 38.5, 18.580904176062045 ], "wc_main_review_avg": [ 505.0, 165.3949817860264 ], "wc_review_avg": [ 649.25, 196.14455766092516 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8530355739289210050&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;1;2;0", "aff_unique_norm": "Zhejiang University;JD.com;Meta", "aff_unique_dep": ";Research Center;Meta Platforms, Inc.", "aff_unique_url": "https://www.zju.edu.cn;https://www.jd.com;https://meta.com", "aff_unique_abbr": "ZJU;JD.COM;Meta", 
"aff_campus_unique_index": "1;1", "aff_campus_unique": ";Silicon Valley", "aff_country_unique_index": "0;1;0;1;1;0", "aff_country_unique": "China;United States" }, { "id": "tlkHrUlNTiL", "title": "Disentangling deep neural networks with rectified linear units using duality", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite their success deep neural networks (DNNs) are still largely considered as black boxes. The main issue is that the linear and non-linear operations are entangled in every layer, making it hard to interpret the hidden layer outputs. In this paper, we look at DNNs with rectified linear units (ReLUs), and focus on the gating property (\u2018on/off\u2019 states) of the ReLUs. We extend the recently developed dual view in which the computation is broken path-wise to show that learning in the gates is more crucial, and learning the weights given the gates is characterised analytically via the so called neural path kernel (NPK) which depends on inputs and gates. In this paper, we present novel results to show that convolution with global pooling and skip connection provide respectively rotational invariance and ensemble structure to the NPK. To address \u2018black box\u2019-ness, we propose a novel interpretable counterpart of DNNs with ReLUs namely deep linearly gated networks (DLGN): the pre- activations to the gates are generated by a deep linear network, and the gates are then applied as external masks to learn the weights in a different network. The DLGN is not an alternative architecture per se, but a disentanglement and an interpretable re-arrangement of the computations in a DNN with ReLUs. The DLGN disentangles the computations into two \u2018mathematically\u2019 interpretable linearities (i) the \u2018primal\u2019 linearity between the input and the pre-activations in the gating network and (ii) the \u2018dual\u2019 linearity in the path space in the weights network characterised by the NPK. We compare the performance of DNN, DGN and DLGN on CIFAR-10 and CIFAR-100 to show that, the DLGN recovers more than 83.5% of the performance of state-of-the-art DNNs, i.e., while entanglement in the DNNs enable their improved performance, the \u2018disentangled and interpretable\u2019 computations in the DLGN recovers most part of the performance. 
This brings us to an interesting question: \u2018Is DLGN a universal spectral approximator?\u2019", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chandra Shekar Lakshminarayanan;Amit Vikram Singh", "authorids": "~Chandra_Shekar_Lakshminarayanan2;~Amit_Vikram_Singh1", "gender": "M;M", "homepage": "https://iitpkd.ac.in/people/cnarayanan;", "dblp": "143/7535;", "google_scholar": ";", "orcid": ";", "linkedin": ";amitadvaita/", "or_profile": "~Chandra_Shekar_Lakshminarayanan2;~Amit_Vikram_Singh1", "aff": "Indian Institute of Technology, Madras;Adobe Systems", "aff_domain": "iitm.ac.in;adobe.com", "position": "Assistant Professor;Machine Learning Engineer", "bibtex": "@misc{\nlakshminarayanan2022disentangling,\ntitle={Disentangling deep neural networks with rectified linear units using duality},\nauthor={Chandra Shekar Lakshminarayanan and Amit Vikram Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=tlkHrUlNTiL}\n}", "github": "", "project": "", "reviewers": "4UFG;pMWb;hqwS;nByk", "site": "https://openreview.net/forum?id=tlkHrUlNTiL", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;2;3", "correctness": "4;4;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "3;3;3;0", "wc_summary_paper": "98;56;40;60", "wc_summary_review": "43;26;19;38", "wc_main_review": "405;113;54;312", "wc_review": "546;195;113;410", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1564;226;45;712", "reply_reviewers": "0;0;0;0", "reply_authors": "6;1;1;7", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 63.5, 21.277922830953212 ], "wc_summary_review_avg": [ 31.5, 9.5 ], "wc_main_review_avg": [ 221.0, 142.9073126190539 ], "wc_review_avg": [ 316.0, 171.45407548378662 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 636.75, 588.289628924393 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.75, 2.7726341266023544 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4301043306336016259&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Indian Institute of Technology Madras;Adobe", "aff_unique_dep": ";Adobe Systems Incorporated", "aff_unique_url": "https://www.iitm.ac.in;https://www.adobe.com", "aff_unique_abbr": "IIT Madras;Adobe", "aff_campus_unique_index": "0", "aff_campus_unique": "Madras;", "aff_country_unique_index": "0;1", "aff_country_unique": "India;United States" }, { "id": "tlkMbWBEAFb", "title": "Fully Steerable 3D Spherical Neurons", "track": "main", "status": "Reject", "tldr": "", "abstract": "Emerging from low-level vision theory, steerable filters found their counterpart in prior work on steerable convolutional neural networks equivariant to rigid transformations. In our work, we propose a steerable feed-forward learning-based approach that consists of spherical decision surfaces and operates on point clouds. Focusing on 3D geometry, we derive a 3D steerability constraint for hypersphere neurons, which are obtained by conformal embedding of Euclidean space and have recently been revisited in the context of learning representations of point sets. 
Exploiting the rotational equivariance, we show how our model parameters are fully steerable at inference time. We use a synthetic point set and real-world 3D skeleton data to show how the proposed spherical filter banks enable making equivariant and, after online optimization, invariant class predictions for known point sets in unknown orientations.", "keywords": "geometric deep learning;steerable network;conformal embedding;spherical neuron;3D shape classification", "primary_area": "", "supplementary_material": "/attachment/52f77bd50d46ea6657efb9879c0e97105d37144d.zip", "author": "Pavlo Melnyk;Michael Felsberg;M\u00e5rten Wadenb\u00e4ck", "authorids": "~Pavlo_Melnyk1;~Michael_Felsberg2;~M\u00e5rten_Wadenb\u00e4ck1", "gender": "M;;M", "homepage": "https://pavlomelnyk.com;https://liu.se/en/employee/micfe03;https://liu.se/en/employee/marwa32", "dblp": "232/3322;00/78;132/2319", "google_scholar": "RhThiI8AAAAJ;https://scholar.google.se/citations?hl=en;6WRQpCQAAAAJ", "orcid": "0000-0002-6091-861X;0000-0002-6096-3648;0000-0002-0675-2794", "linkedin": ";https://linkedin.com/in/michael-felsberg-668a202;", "or_profile": "~Pavlo_Melnyk1;~Michael_Felsberg2;~M\u00e5rten_Wadenb\u00e4ck1", "aff": "Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University", "aff_domain": "liu.se;liu.se;liu.se", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nmelnyk2022fully,\ntitle={Fully Steerable 3D Spherical Neurons},\nauthor={Pavlo Melnyk and Michael Felsberg and M{\\r{a}}rten Wadenb{\\\"a}ck},\nyear={2022},\nurl={https://openreview.net/forum?id=tlkMbWBEAFb}\n}", "github": "", "project": "", "reviewers": "31zH;CYuQ;KgA9;uUFZ;FkmY", "site": "https://openreview.net/forum?id=tlkMbWBEAFb", "pdf_size": 0, "recommendation": "5;5;5;5;8", "confidence": "2;3;2;2;4", "correctness": "3;2;3;3;3", "technical_novelty": "3;3;3;3;3", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "74;68;50;96;147", "wc_summary_review": "20;27;27;13;242", "wc_main_review": "299;660;308;736;257", "wc_review": "393;755;385;845;646", "wc_reply_reviewers": "96;137;27;0;209", "wc_reply_authors": "568;884;535;546;88", "reply_reviewers": "1;1;1;0;1", "reply_authors": "2;2;2;1;1", "recommendation_avg": [ 5.6, 1.2 ], "confidence_avg": [ 2.6, 0.8 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 87.0, 33.406586176980134 ], "wc_summary_review_avg": [ 65.8, 88.25281865187083 ], "wc_main_review_avg": [ 452.0, 203.0221662774782 ], "wc_review_avg": [ 604.8, 187.14956585576147 ], "wc_reply_reviewers_avg": [ 93.8, 75.36948984834646 ], "wc_reply_authors_avg": [ 524.2, 253.86484593184616 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.6, 0.4898979485566356 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.875, "corr_recommendation_correctness": 0.25000000000000006, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7MkQEpMn_ioJ:scholar.google.com/&scioq=Fully+Steerable+3D+Spherical+Neurons&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Link\u00f6ping University", "aff_unique_dep": "", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Sweden" }, { "id": "tm9-r3-O2lt", "title": "CONTROLLING THE MEMORABILITY OF REAL AND UNREAL 
FACE IMAGES", "track": "main", "status": "Reject", "tldr": "", "abstract": "Every day, we are bombarded with many face photographs, whether on social media, television, or smartphones. From an evolutionary perspective, faces are intended to be remembered, mainly due to survival and personal relevance. However, all these faces do not have the equal opportunity to stick in our minds. It has been shown that memorability is an intrinsic feature of an image but yet, it\u2019s largely unknown what attributes make the images more memorable. In this work, we aim to address this question by proposing a fast approach to modify and control the memorability of face images. In our proposed method, we first find a hyperplane in the latent space of StyleGAN to separate high and low memorable images. We then modify the image memorability (while keeping the identity and other facial features such as age, emotion, etc.) by moving in the positive or negative direction of this hyperplane normal vector. We further analyzed how different layers of the styleGAN augmented latent space contribute to face memorability. These analyses showed how each individual face attribute makes images more or less memorable. Most importantly, we evaluated our proposed method for both real and unreal (generated) face images. The proposed method successfully modifies and controls the memorability of real human faces as well as unreal(generated) faces. Our proposed method can be employed in photograph editing applications for social media, learning aids, or advertisement purposes.", "keywords": "Memorability;Face Memorability;Face Memorability Modification;StyleGAN;Latent Vector;Image2Style", "primary_area": "", "supplementary_material": "/attachment/02d136083dacc1fa5d2a4848eeec83eae9f042ca.zip", "author": "Mohammad Younesi;Yalda Mohsenzadeh", "authorids": "~Mohammad_Younesi1;~Yalda_Mohsenzadeh1", "gender": "M;female", "homepage": ";https://mohsenzadehlab.com/people", "dblp": ";", "google_scholar": ";xZIgSigAAAAJ", "orcid": ";0000-0001-8525-957X", "linkedin": "https://ca.linkedin.com/in/mohammad-younesi;", "or_profile": "~Mohammad_Younesi1;~Yalda_Mohsenzadeh1", "aff": ", University of Western Ontario;University of Western Ontario", "aff_domain": "csd.uwo.ca;uwo.ca", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nyounesi2022controlling,\ntitle={{CONTROLLING} {THE} {MEMORABILITY} {OF} {REAL} {AND} {UNREAL} {FACE} {IMAGES}},\nauthor={Mohammad Younesi and Yalda Mohsenzadeh},\nyear={2022},\nurl={https://openreview.net/forum?id=tm9-r3-O2lt}\n}", "github": "", "project": "", "reviewers": "8EvC;iYVp;m2GG;xi9k", "site": "https://openreview.net/forum?id=tm9-r3-O2lt", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "129;92;90;32", "wc_summary_review": "46;25;43;15", "wc_main_review": "263;145;161;192", "wc_review": "438;262;294;239", "wc_reply_reviewers": "198;87;0;54", "wc_reply_authors": "944;656;902;493", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;3;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 85.75, 34.701404870696514 ], "wc_summary_review_avg": [ 32.25, 12.794041581923986 ], "wc_main_review_avg": [ 190.25, 45.273474573971015 ], "wc_review_avg": 
[ 308.25, 77.41567993630231 ], "wc_reply_reviewers_avg": [ 84.75, 72.38568573965436 ], "wc_reply_authors_avg": [ 748.75, 184.13225545786378 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-SyhBDcnhxkJ:scholar.google.com/&scioq=CONTROLLING+THE+MEMORABILITY+OF+REAL+AND+UNREAL+FACE+IMAGES&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Western Ontario", "aff_unique_dep": "", "aff_unique_url": "https://www.uwo.ca", "aff_unique_abbr": "UWO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "tsg-Lf1MYp", "title": "Natural Attribute-based Shift Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the impressive performance of deep networks in vision, language, and healthcare, unpredictable behaviors on samples from the distribution different than the training distribution cause severe problems in deployment. For better reliability of neural-network-based classifiers, we define a new task, natural attribute-based shift (NAS) detection, to detect the samples shifted from the training distribution by some natural attribute such as age of subjects or brightness of images. Using the natural attributes present in existing datasets, we introduce benchmark datasets in vision, language, and medical for NAS detection. Further, we conduct an extensive evaluation of prior representative out-of-distribution (OOD) detection methods on NAS datasets and observe an inconsistency in their performance. To understand this, we provide an analysis on the relationship between the location of NAS samples in the feature space and the performance of distance- and confidence-based OOD detection methods. 
Based on the analysis, we split NAS samples into three categories and further suggest a simple modification to the training objective to obtain an improved OOD detection method that is capable of detecting samples from all NAS categories.", "keywords": "attribute shift;out-of-distribution detection;distribution shift", "primary_area": "", "supplementary_material": "/attachment/5e621953c03b05bc2a8d169bb3a2eacf3afa4783.zip", "author": "Jeonghoon Park;Jimin Hong;Radhika Dua;Daehoon Gwak;Jaegul Choo;Sharon Li;Edward Choi", "authorids": "~Jeonghoon_Park1;~Jimin_Hong1;~Radhika_Dua1;~Daehoon_Gwak1;~Jaegul_Choo1;~Sharon_Li1;~Edward_Choi1", "gender": ";M;;M;M;;M", "homepage": ";;;;https://sites.google.com/site/jaegulchoo/;;http://mp2893.com", "dblp": "62/4399;;;276/7016;07/2074;;41/3886", "google_scholar": "https://scholar.google.com/citations?hl=ko;M1cEEPgAAAAJ;;NyQ42l8AAAAJ;GHJYsLEAAAAJ;;GUlGIPkAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Jeonghoon_Park1;~Jimin_Hong1;~Radhika_Dua1;~Daehoon_Gwak1;~Jaegul_Choo1;~Sharon_Li1;~Edward_Choi1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;;kaist.ac.kr;kaist.ac.kr;;kaist.ac.kr", "position": "MS student;PhD student;;PhD student;Associate Professor;;Associate Professor", "bibtex": "@misc{\npark2022natural,\ntitle={Natural Attribute-based Shift Detection},\nauthor={Jeonghoon Park and Jimin Hong and Radhika Dua and Daehoon Gwak and Jaegul Choo and Sharon Li and Edward Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=tsg-Lf1MYp}\n}", "github": "", "project": "", "reviewers": "czxP;bsNT;KqWM", "site": "https://openreview.net/forum?id=tsg-Lf1MYp", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;3", "correctness": "3;4;4", "technical_novelty": "4;3;3", "empirical_novelty": "3;4;3", "wc_summary_paper": "92;185;84", "wc_summary_review": "29;219;31", "wc_main_review": "185;620;134", "wc_review": "306;1024;249", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1267;1363;472", "reply_reviewers": "0;0;0", "reply_authors": "3;3;2", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 120.33333333333333, 45.84272631023983 ], "wc_summary_review_avg": [ 93.0, 89.0991956566762 ], "wc_main_review_avg": [ 313.0, 218.07796770879904 ], "wc_review_avg": [ 526.3333333333334, 352.6720226436392 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1034.0, 399.32192526832284 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11776318036663729886&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "tvKdi-Nodsx", "title": "Relative Instance Credibility Inference for Learning with Noisy Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The existence of noisy labels usually leads to the degradation of generalization and robustness of neural networks in supervised learning. In this paper, we propose to use a simple theoretically guaranteed sample selection framework as a plug-in module to handle noisy labels. Specifically, we re-purpose a sparse linear model with incidental parameters as a unified Relative Instance Credibility Inference (RICI) framework, which will detect and remove outliers in the forward pass of each mini-batch and use the remaining instances to train the network. The credibility of instances is measured by the sparsity of incidental parameters, which can be ranked among other instances within each mini-batch to get a relatively consistent training mini-batch. The proposed RICI framework yields two variants that enjoy superior performance on the symmetric and asymmetric noise settings, respectively. We prove that our RICI can theoretically recover the clean data. Experimental results on several benchmark datasets and a real-world noisy dataset show the effectiveness of our framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yikai Wang;Xinwei Sun;Yanwei Fu", "authorids": "~Yikai_Wang1;~Xinwei_Sun1;~Yanwei_Fu2", "gender": "M;M;M", "homepage": "https://yikai-wang.github.io;https://sunxinwei0625.github.io/sunxw.github.io/;http://yanweifu.github.io", "dblp": "85/9555-2;145/6592-1;63/9065", "google_scholar": "x8HOE_cAAAAJ;;https://scholar.google.co.uk/citations?user=Vg54TcsAAAAJ", "orcid": "0000-0001-6107-5063;;0000-0002-6595-6893", "linkedin": ";;", "or_profile": "~Yikai_Wang1;~Xinwei_Sun1;~Yanwei_Fu2", "aff": "Fudan University;Fudan University;Fudan University,", "aff_domain": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "position": "PhD student;Assistant Professor;Professor", "bibtex": "@misc{\nwang2022relative,\ntitle={Relative Instance Credibility Inference for Learning with Noisy Labels},\nauthor={Yikai Wang and Xinwei Sun and Yanwei Fu},\nyear={2022},\nurl={https://openreview.net/forum?id=tvKdi-Nodsx}\n}", "github": "", "project": "", "reviewers": "t6MK;VCUC;VB72", "site": "https://openreview.net/forum?id=tvKdi-Nodsx", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;4", "correctness": "3;3;4", "technical_novelty": "2;3;2", "empirical_novelty": "3;2;2", "wc_summary_paper": "70;78;90", "wc_summary_review": "40;50;27", "wc_main_review": "337;155;231", "wc_review": "447;283;348", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 79.33333333333333, 8.219218670625303 ], "wc_summary_review_avg": [ 39.0, 9.41629792788369 ], "wc_main_review_avg": [ 241.0, 74.63689882803725 ], "wc_review_avg": [ 359.3333333333333, 67.4306227828936 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 
], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZckfaqzCFBkJ:scholar.google.com/&scioq=Relative+Instance+Credibility+Inference+for+Learning+with+Noisy+Labels&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "tvwNdOKhuF5", "title": "Superior Performance with Diversified Strategic Control in FPS Games Using General Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper offers an overall solution for first-person shooter (FPS) games to achieve superior performance using general reinforcement learning (RL). We introduce an agent in ViZDoom that can surpass previous top agents ranked in the open ViZDoom AI Competitions by a large margin. The proposed framework consists of a number of generally applicable techniques, including hindsight experience replay (HER) based navigation, hindsight proximal policy optimization (HPPO), rule-guided policy search (RGPS), prioritized fictitious self-play (PFSP), and diversified strategic control (DSC). The proposed agent outperforms existing agents by taking advantage of diversified and human-like strategies, instead of larger neural networks, more accurate frag skills, or hand-craft tricks, etc. We provide comprehensive analysis and experiments to elaborate the effect of each component in affecting the agent performance, and demonstrate that the proposed and adopted techniques are important to achieve superior performance in general end-to-end FPS games. 
The proposed methods can contribute to other games and real-world tasks which also require spatial navigation and diversified behaviors.", "keywords": "Reinforcement Learning;Hindsight Experience Replay;FPS Games", "primary_area": "", "supplementary_material": "", "author": "Shuxing Li;Jiawei Xu;Chun Yuan;peng sun;Zhuobin Zheng;Zhengyou Zhang;Lei Han", "authorids": "~Shuxing_Li1;~Jiawei_Xu1;~Chun_Yuan1;~peng_sun1;~Zhuobin_Zheng2;~Zhengyou_Zhang2;~Lei_Han1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://github.com/jiawei415;https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml;http://pengsun.github.io;;;https://www.leihan.org", "dblp": ";;;;218/5295;;75/2307-1", "google_scholar": ";;https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ;;https://scholar.google.ca/citations?user=4KQgA5IAAAAJ;1I-DKy8AAAAJ;Tz4_zi8AAAAJ", "orcid": ";;;;;;", "linkedin": "%E8%88%92%E5%85%B4-%E6%9D%8E-739678133/;;;;;;", "or_profile": "~Shuxing_Li1;~Jiawei_Xu1;~Chun_Yuan1;~peng_sun1;~Zhuobin_Zheng2;~Zhengyou_Zhang2;~Lei_Han1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;ByteDance;;Tencent AI Lab;Tencent Robotics X", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;bytedance.com;;tencent.com;tencent.com", "position": "MS student;MS student;Full Professor;Researcher;;Director;Principal Researcher", "bibtex": "@misc{\nli2022superior,\ntitle={Superior Performance with Diversified Strategic Control in {FPS} Games Using General Reinforcement Learning},\nauthor={Shuxing Li and Jiawei Xu and Chun Yuan and peng sun and Zhuobin Zheng and Zhengyou Zhang and Lei Han},\nyear={2022},\nurl={https://openreview.net/forum?id=tvwNdOKhuF5}\n}", "github": "", "project": "", "reviewers": "jKT2;ahya;73Nb;9yoB", "site": "https://openreview.net/forum?id=tvwNdOKhuF5", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;3;3", "correctness": "3;3;2;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "130;92;83;34", "wc_summary_review": "138;75;92;32", "wc_main_review": "245;786;300;177", "wc_review": "513;953;475;243", "wc_reply_reviewers": "0;0;139;185", "wc_reply_authors": "767;659;740;541", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.75, 34.200694437394105 ], "wc_summary_review_avg": [ 84.25, 37.962975383918476 ], "wc_main_review_avg": [ 377.0, 240.12184407087997 ], "wc_review_avg": [ 546.0, 256.7041098229633 ], "wc_reply_reviewers_avg": [ 81.0, 82.61658429153314 ], "wc_reply_authors_avg": [ 676.75, 87.87597794619414 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mKF8rMJfDmwJ:scholar.google.com/&scioq=Superior+Performance+with+Diversified+Strategic+Control+in+FPS+Games+Using+General+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2;2", "aff_unique_norm": "Tsinghua University;ByteDance;Tencent", "aff_unique_dep": ";;Tencent AI Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.bytedance.com;https://ai.tencent.com", "aff_unique_abbr": "THU;ByteDance;Tencent AI Lab", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "twgEkDwFTP", "title": "Understanding Overfitting in Reweighting Algorithms for Worst-group Performance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Prior work has proposed various reweighting algorithms to improve the worst-group performance of machine learning models for fairness. However, Sagawa et al. (2020) empirically found that these algorithms overfit easily in practice under the overparameterized setting, where the number of model parameters is much greater than the number of samples. In this work, we provide a theoretical backing to the empirical results above, and prove the pessimistic result that reweighting algorithms always overfit. Specifically we prove that with reweighting, an overparameterized model always converges to the same ERM interpolator that fits all training samples, and consequently its worst-group test performance will drop to the same level as ERM in the long run. That is, we cannot hope for reweighting algorithms to converge to a different interpolator than ERM with potentially better worst-group performance. Then, we analyze whether adding regularization helps fix the issue, and we prove that for regularization to work, it must be large enough to prevent the model from achieving small training error. Our results suggest that large regularization (or early stopping) and data augmentation are necessary for reweighting algorithms to achieve high worst-group test performance.", "keywords": "Reweighting algorithms;Worst-group performance;Implicit bias;Fairness", "primary_area": "", "supplementary_material": "/attachment/8db959c4c9cdede8c94c1e077f9b58b4d109fdf5.zip", "author": "Runtian Zhai;Chen Dan;J Zico Kolter;Pradeep Kumar Ravikumar", "authorids": "~Runtian_Zhai1;~Chen_Dan1;~J_Zico_Kolter1;~Pradeep_Kumar_Ravikumar1", "gender": "M;M;M;M", "homepage": "http://www.runtianzhai.com;https://chendancmu.github.io/;http://www.cs.cmu.edu/~pradeepr/;http://www.zicokolter.com", "dblp": "242/8411;156/6710;94/3594;67/2526", "google_scholar": "EXd0ES8AAAAJ;hQQFfuwAAAAJ;https://scholar.google.com.tw/citations?user=Q4DTPw4AAAAJ;UXh1I6UAAAAJ", "orcid": "0000-0003-3332-3466;;;", "linkedin": ";;;", "or_profile": "~Runtian_Zhai1;~Chen_Dan1;~Pradeep_Kumar_Ravikumar1;~Zico_Kolter1", "aff": "Carnegie Mellon University;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cs.cmu.edu;cmu.edu", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nzhai2022understanding,\ntitle={Understanding Overfitting in Reweighting Algorithms for Worst-group Performance},\nauthor={Runtian Zhai and Chen Dan and J Zico Kolter and Pradeep Kumar Ravikumar},\nyear={2022},\nurl={https://openreview.net/forum?id=twgEkDwFTP}\n}", "github": "", "project": "", "reviewers": "fZgx;wG4N;hLzT;M2hT", "site": "https://openreview.net/forum?id=twgEkDwFTP", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;3", "correctness": "3;4;2;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;1;2;2", "wc_summary_paper": "64;100;97;141", "wc_summary_review": "87;157;70;57", "wc_main_review": "319;221;555;414", "wc_review": "470;478;722;612", "wc_reply_reviewers": "0;0;57;0", "wc_reply_authors": "514;572;664;356", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], 
"confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 100.5, 27.31757675929547 ], "wc_summary_review_avg": [ 92.75, 38.589992225964494 ], "wc_main_review_avg": [ 377.25, 123.24036473493577 ], "wc_review_avg": [ 570.5, 104.08049769289154 ], "wc_reply_reviewers_avg": [ 14.25, 24.681724007856502 ], "wc_reply_authors_avg": [ 526.5, 112.0301298758508 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.19245008972987526, "corr_recommendation_correctness": -0.5443310539518174, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6584111635623215436&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Imitation Learning from Observations under Transition Model Disparity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6137", "id": "twv2QlJhXzo", "poster": "", "openreview": "https://openreview.net/forum?id=twv2QlJhXzo", "slides": "https://iclr.cc/virtual/2022/poster/6137", "video": "https://iclr.cc/virtual/2022/poster/6137", "author_site": "Tanmay Gangwani, Yuan Zhou, Jian Peng", "tldr": "", "abstract": "Learning to perform tasks by leveraging a dataset of expert observations, also known as imitation learning from observations (ILO), is an important paradigm for learning skills without access to the expert reward function or the expert actions. We consider ILO in the setting where the expert and the learner agents operate in different environments, with the source of the discrepancy being the transition dynamics model. Recent methods for scalable ILO utilize adversarial learning to match the state-transition distributions of the expert and the learner, an approach that becomes challenging when the dynamics are dissimilar. In this work, we propose an algorithm that trains an intermediary policy in the learner environment and uses it as a surrogate expert for the learner. The intermediary policy is learned such that the state transitions generated by it are close to the state transitions in the expert dataset. To derive a practical and scalable algorithm, we employ concepts from prior work on estimating the support of a probability distribution. 
Experiments using MuJoCo locomotion tasks highlight that our method compares favorably to the baselines for ILO with transition dynamics mismatch.", "keywords": "Imitation Learning;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Tanmay Gangwani;Yuan Zhou;Jian Peng", "authorids": "~Tanmay_Gangwani1;~Yuan_Zhou1;~Jian_Peng1", "gender": "M;M;M", "homepage": "https://tgangwani.github.io/;http://yuanz.web.illinois.edu;http://jianpeng.web.engr.illinois.edu/", "dblp": "177/8611;40/7018;29/4181-1", "google_scholar": "IUY5oVkAAAAJ;https://scholar.google.com.tw/citations?user=aR34e1gAAAAJ;https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tanmay_Gangwani1;~Yuan_Zhou1;~Jian_Peng1", "aff": "Amazon;;University of Illinois, Urbana Champaign", "aff_domain": "amazon.com;;illinois.edu", "position": "Researcher;;Assistant Professor", "bibtex": "@inproceedings{\ngangwani2022imitation,\ntitle={Imitation Learning from Observations under Transition Model Disparity},\nauthor={Tanmay Gangwani and Yuan Zhou and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=twv2QlJhXzo}\n}", "github": "", "project": "", "reviewers": "hbsF;E4NG;6eKr", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "3;4;4", "correctness": "4;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "170;73;116", "wc_summary_review": "69;22;162", "wc_main_review": "113;391;1425", "wc_review": "352;486;1703", "wc_reply_reviewers": "0;137;183", "wc_reply_authors": "308;480;809", "reply_reviewers": "0;1;1", "reply_authors": "1;2;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 119.66666666666667, 39.684869772860395 ], "wc_summary_review_avg": [ 84.33333333333333, 58.17406829698455 ], "wc_main_review_avg": [ 643.0, 564.4844255306489 ], "wc_review_avg": [ 847.0, 607.7504970517643 ], "wc_reply_reviewers_avg": [ 106.66666666666667, 77.72744403076403 ], "wc_reply_authors_avg": [ 532.3333333333334, 207.8530463786588 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9999999999999997, "corr_recommendation_correctness": -0.9999999999999997, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2183334358740050451&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=twv2QlJhXzo", "email": "amazon.com;;illinois.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Amazon;University of Illinois Urbana-Champaign", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://illinois.edu", "aff_unique_abbr": "Amazon;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "tx4qfdJSFvG", "title": "On the Unreasonable Effectiveness of Feature Propagation in Learning on Graphs with Missing Node Features", "track": "main", "status": "Reject", "tldr": "", "abstract": "While Graph Neural Networks 
(GNNs) have recently become the {\em de facto} standard for modeling relational data, they impose a strong assumption on the availability of the node or edge features of the graph. In many real-world applications, however, features are only partially available; for example, in social networks, age and gender are available only for a small subset of users. We present a general approach for handling missing features in graph machine learning applications that is based on minimizing the Dirichlet energy and leads to a diffusion-type differential equation on the graph. The discretization of this equation produces a simple, fast and scalable algorithm which we call Feature Propagation. We experimentally show that the proposed approach outperforms previous methods on six common node-classification benchmarks and can withstand surprisingly high rates of missing features: on average we observe only around 5% relative accuracy drop when 99% of the features are missing. Moreover, it takes only 10 seconds to run on a graph with ~2.5M nodes and ~123M edges on a single GPU.", "keywords": "graph neural networks;missing features;graphs;feature propagation", "primary_area": "", "supplementary_material": "/attachment/bb3425faae632cb3a0c200ef0c7887b169decfd6.zip", "author": "Emanuele Rossi;Henry Kenlay;Maria I. Gorinova;Benjamin Paul Chamberlain;Xiaowen Dong;Michael M. Bronstein", "authorids": "~Emanuele_Rossi1;~Henry_Kenlay1;~Maria_I._Gorinova1;~Benjamin_Paul_Chamberlain1;~Xiaowen_Dong1;~Michael_M._Bronstein1", "gender": "M;;;M;;M", "homepage": "https://www.emanuelerossi.co.uk/;;;;https://web.media.mit.edu/~xdong/;http://www.inf.usi.ch/bronstein/", "dblp": ";;;;91/9827-1;07/2668", "google_scholar": "DHlkBOYAAAAJ;3xBEuKUAAAAJ;;https://scholar.google.co.uk/citations?user=Tr8LSOEAAAAJ;_8tUq8kAAAAJ;UU3N6-UAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;mbronstein/", "or_profile": "~Emanuele_Rossi1;~Henry_Kenlay1;~Maria_I._Gorinova1;~Benjamin_Paul_Chamberlain1;~Xiaowen_Dong1;~Michael_M._Bronstein1", "aff": "Twitter;University of Oxford;;Twitter;Massachusetts Institute of Technology;Twitter", "aff_domain": "twitter.com;oxford.ac.uk;;twitter.com;mit.edu;twitter.com", "position": "Machine Learning Researcher;PhD student;;ML Researcher;Research Affiliate;Head of Graph ML", "bibtex": "@misc{\nrossi2022on,\ntitle={On the Unreasonable Effectiveness of Feature Propagation in Learning on Graphs with Missing Node Features},\nauthor={Emanuele Rossi and Henry Kenlay and Maria I. Gorinova and Benjamin Paul Chamberlain and Xiaowen Dong and Michael M. 
Bronstein},\nyear={2022},\nurl={https://openreview.net/forum?id=tx4qfdJSFvG}\n}", "github": "", "project": "", "reviewers": "bVRh;JE1t;AjnH;U1iE", "site": "https://openreview.net/forum?id=tx4qfdJSFvG", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;4;4;3", "correctness": "4;4;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "67;61;174;106", "wc_summary_review": "13;36;292;224", "wc_main_review": "194;445;608;84", "wc_review": "274;542;1074;414", "wc_reply_reviewers": "146;22;112;46", "wc_reply_authors": "391;306;1410;595", "reply_reviewers": "1;1;2;1", "reply_authors": "3;2;5;2", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 102.0, 45.0166635813895 ], "wc_summary_review_avg": [ 141.25, 119.47672367453 ], "wc_main_review_avg": [ 332.75, 205.84627152319277 ], "wc_review_avg": [ 576.0, 302.7408132379908 ], "wc_reply_reviewers_avg": [ 81.5, 49.726753362752326 ], "wc_reply_authors_avg": [ 675.5, 436.87555436302455 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=353544289563601632&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Twitter, Inc.;University of Oxford;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://twitter.com;https://www.ox.ac.uk;https://web.mit.edu", "aff_unique_abbr": "Twitter;Oxford;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Modeling Label Space Interactions in Multi-label Classification using Box Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6645", "id": "tyTH9kOxcvh", "poster": "", "openreview": "https://openreview.net/forum?id=tyTH9kOxcvh", "slides": "https://iclr.cc/virtual/2022/poster/6645", "video": "https://iclr.cc/virtual/2022/poster/6645", "author_site": "Dhruvesh Patel, Pavitra Dangati, Jay-Yoon Lee, Michael Boratko, Andrew McCallum", "tldr": "", "abstract": "Multi-label classification is a challenging structured prediction task in which a set of output class labels are predicted for each input. Real-world datasets often have natural or latent taxonomic relationships between labels, making it desirable for models to employ label representations capable of capturing such taxonomies. Most existing multi-label classification methods do not do so, resulting in label predictions that are inconsistent with the taxonomic constraints, thus failing to accurately represent the fundamentals of the problem setting. In this work, we introduce the multi-label box model (MBM), a multi-label classification method that combines the encoding power of neural networks with the inductive bias and probabilistic semantics of box embeddings (Vilnis et al., 2018). Box embeddings can be understood as trainable Venn-diagrams based on hyper-rectangles. Representing labels by boxes rather than vectors, MBM is able to capture taxonomic relations among labels. 
Furthermore, since box embeddings allow these relations to be learned by stochastic gradient descent from data, and to be read as calibrated conditional probabilities, our model is endowed with a high degree of interpretability. This interpretability also facilitates the injection of partial information about label-label relationships into model training, to further improve its consistency. We provide theoretical grounding for our method and show experimentally the model's ability to learn the true latent taxonomic structure from data. Through extensive empirical evaluations on both small and large-scale multi-label classification datasets, we show that MBM can significantly improve taxonomic consistency while preserving or surpassing the state-of-the-art predictive performance.", "keywords": "Multi-label classification;Box Embeddings;Representation Learning;Embeddings", "primary_area": "", "supplementary_material": "", "author": "Dhruvesh Patel;Pavitra Dangati;Jay-Yoon Lee;Michael Boratko;Andrew McCallum", "authorids": "~Dhruvesh_Patel1;~Pavitra_Dangati1;~Jay-Yoon_Lee1;~Michael_Boratko1;~Andrew_McCallum1", "gender": ";F;M;M;M", "homepage": "http://dhruveshp.com;;https://people.cs.umass.edu/~mboratko/;http://www.cs.umass.edu/~mccallum;https://www.cs.cmu.edu/~jaylee", "dblp": "274/7280;;222/1939;m/AndrewMcCallum;https://dblp.org/pers/l/Lee:Jay_Yoon", "google_scholar": "6F2CvwoAAAAJ;QP91qU8AAAAJ;YKZGpnkAAAAJ;yILa1y0AAAAJ;_USiaqwAAAAJ", "orcid": "0000-0003-3062-2292;;;0009-0004-5487-2848;", "linkedin": "dhruveshp/;https://linkedin.com/in/sai-pavitra-dangati;michaelboratko/;andrew-mccallum-a412;", "or_profile": "~Dhruvesh_Patel1;~Pavitra_Dangati1;~Michael_Boratko1;~Andrew_McCallum1;~Jay_Yoon_Lee1", "aff": "College of Information and Computer Science, University of Massachusetts, Amherst;;University of Massachusetts, Amherst;University of Massachusetts Amherst;Department of Computer Science, University of Massachusetts, Amherst", "aff_domain": "cics.umass.edu;;umass.edu;cs.umass.edu;cs.umass.edu", "position": "PhD student;;Postdoc;Distinguished Professor;Postdoc", "bibtex": "@inproceedings{\npatel2022modeling,\ntitle={Modeling Label Space Interactions in Multi-label Classification using Box Embeddings},\nauthor={Dhruvesh Patel and Pavitra Dangati and Jay-Yoon Lee and Michael Boratko and Andrew McCallum},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tyTH9kOxcvh}\n}", "github": "", "project": "", "reviewers": "HdXC;Eo7g;X5cP;F1ga", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;4;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;1;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "13;104;96;96", "wc_summary_review": "57;70;18;81", "wc_main_review": "485;394;396;537", "wc_review": "555;568;510;714", "wc_reply_reviewers": "0;351;106;24", "wc_reply_authors": "1491;2284;980;1120", "reply_reviewers": "0;2;1;1", "reply_authors": "3;6;3;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.25, 37.238253181372514 ], "wc_summary_review_avg": [ 56.5, 23.79600806858159 ], "wc_main_review_avg": [ 453.0, 60.848171706305195 ], "wc_review_avg": [ 586.75, 76.55512719602783 ], "wc_reply_reviewers_avg": [ 120.25, 138.89991900645586 ], "wc_reply_authors_avg": [ 1468.75, 506.3671469398464 ], 
"reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.5, 1.5 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10529771024100862700&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=tyTH9kOxcvh", "email": "cics.umass.edu;;umass.edu;cs.umass.edu;cs.umass.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Massachusetts Amherst", "aff_unique_dep": "College of Information and Computer Science", "aff_unique_url": "https://www.umass.edu", "aff_unique_abbr": "UMass Amherst", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Pessimistic Model-based Offline Reinforcement Learning under Partial Coverage", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7210", "id": "tyrJsbKAe6", "poster": "", "openreview": "https://openreview.net/forum?id=tyrJsbKAe6", "slides": "https://iclr.cc/virtual/2022/poster/7210", "video": "https://iclr.cc/virtual/2022/poster/7210", "author_site": "Masatoshi Uehara, Wen Sun", "tldr": "", "abstract": "We study model-based offline Reinforcement Learning with general function approximation without a full coverage assumption on the offline data distribution. We present an algorithm named Constrained Pessimistic Policy Optimization (CPPO) which leverages a general function class and uses a constraint over the models to encode pessimism. Under the assumption that the ground truth model belongs to our function class (i.e., realizability in the function class), CPPO has a PAC guarantee with offline data only providing partial coverage, i.e., it can learn a policy that competes against any policy covered by the offline data. We then demonstrate that this algorithmic framework can be applied to many specialized Markov Decision Processes where the additional structural assumptions can further refine the concept of partial coverage. 
Two notable examples are: (1) low-rank MDP with representation learning where the partial coverage condition is defined using a relative condition number measured by the unknown ground truth feature representation; (2) factored MDP where the partial coverage condition is defined using density-ratio based concentrability coefficients associated with individual factors.", "keywords": "Reinforcement learning Theory;Offline reinforcement learning;PAC Bounds", "primary_area": "", "supplementary_material": "/attachment/f015779503ddfbff0dfb9e9800552829795a0591.zip", "author": "Masatoshi Uehara;Wen Sun", "authorids": "~Masatoshi_Uehara1;~Wen_Sun1", "gender": "M;", "homepage": "https://www.masatoshiuehara.com/;https://wensun.github.io", "dblp": "225/6517;", "google_scholar": "https://scholar.google.co.jp/citations?user=xuLKJboAAAAJ;iOLC30YAAAAJ", "orcid": "0000-0001-9017-3105;", "linkedin": ";", "or_profile": "~Masatoshi_Uehara1;~Wen_Sun1", "aff": "Amazon;Cornell University", "aff_domain": "amazon.com;cornell.edu", "position": "Intern;Assistant Professor", "bibtex": "@inproceedings{\nuehara2022pessimistic,\ntitle={Pessimistic Model-based Offline Reinforcement Learning under Partial Coverage},\nauthor={Masatoshi Uehara and Wen Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=tyrJsbKAe6}\n}", "github": "", "project": "", "reviewers": "3dYA;Tgsy;Pn6R;gN7e", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;3;4;4", "correctness": "3;4;4;4", "technical_novelty": "2;2;3;4", "empirical_novelty": "0;0;3;0", "wc_summary_paper": "34;30;58;34", "wc_summary_review": "14;21;60;16", "wc_main_review": "233;415;215;117", "wc_review": "281;466;333;167", "wc_reply_reviewers": "166;0;0;0", "wc_reply_authors": "1543;788;516;6", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 0.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 39.0, 11.090536506409418 ], "wc_summary_review_avg": [ 27.75, 18.793283374652763 ], "wc_main_review_avg": [ 245.0, 107.61970079869205 ], "wc_review_avg": [ 311.75, 107.40431788340727 ], "wc_reply_reviewers_avg": [ 41.5, 71.88010851410841 ], "wc_reply_authors_avg": [ 713.25, 555.2438090604883 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.3244428422615251, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14098223086080640360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=tyrJsbKAe6", "email": "amazon.com;cornell.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Amazon;Cornell University", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.cornell.edu", "aff_unique_abbr": "Amazon;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "tzO3RXxzuM", "title": "Stability based Generalization Bounds for Exponential Family Langevin Dynamics", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the generalization of noisy stochastic 
mini-batch based iterative algorithms based on the notion of stability. Recent years have seen key advances in data-dependent generalization bounds for noisy iterative learning algorithms such as stochastic gradient Langevin dynamics (SGLD) based on (Mou et al., 2018; Li et al., 2020) and related approaches (Negrea et al., 2019; Haghifam et al., 2020). In this paper, we unify and substantially generalize stability based generalization bounds and make three technical advances. First, we bound the generalization error of general noisy stochastic iterative algorithms (not necessarily gradient descent) in terms of expected stability, which in turn can be bounded by the expected Le Cam Style Divergence (LSD). Such bounds have a $O(1/n)$ sample dependence unlike many existing bounds with $O(1/\\sqrt{n})$ dependence. Second, we introduce Exponential Family Langevin Dynamics (EFLD) which is a substantial generalization of SGLD and which allows exponential family noise to be used with gradient descent. We establish data-dependent expected stability based generalization bounds for general EFLD. Third, we consider an important new special case of EFLD: Noisy Sign-SGD, which extends Sign-SGD by using Bernoulli noise over $\\{-1,+1\\}$, and we establish optimization guarantees for the algorithm. Further, we present empirical results on benchmark datasets to illustrate that our bounds are non-vacuous and quantitatively much sharper than existing bounds.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/1ff88ba03af23d6f8adfbbd53ca543fcdb90c62a.zip", "author": "Arindam Banerjee;Tiancong Chen;Xinyan Li;Yingxue Zhou", "authorids": "~Arindam_Banerjee4;~Tiancong_Chen1;~Xinyan_Li1;~Yingxue_Zhou1", "gender": ";M;F;F", "homepage": "https://arindam.cs.illinois.edu/;https://sites.google.com/view/tiancong-chen;https://xinyan-li.github.io/xinyanli.github.io/;https://sites.google.com/umn.edu/zhou0877/home", "dblp": "82/4807.html;242/8507;;", "google_scholar": "RY7cuPAAAAAJ;Y97x5I8AAAAJ;Sq1bKOQAAAAJ;EEm_z9YAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Arindam_Banerjee4;~Tiancong_Chen1;~Xinyan_Li1;~Yingxue_Zhou1", "aff": "University of Illinois, Urbana Champaign;University of Minnesota, Minneapolis;University of Minnesota, Minneapolis;University of Minnesota, Minneapolis", "aff_domain": "illinois.edu;umn.edu;umn.edu;umn.edu", "position": "Professor;PhD student;PhD student;PhD student", "bibtex": "@misc{\nbanerjee2022stability,\ntitle={Stability based Generalization Bounds for Exponential Family Langevin Dynamics},\nauthor={Arindam Banerjee and Tiancong Chen and Xinyan Li and Yingxue Zhou},\nyear={2022},\nurl={https://openreview.net/forum?id=tzO3RXxzuM}\n}", "github": "", "project": "", "reviewers": "ipDu;j8ws;AX5S;4ShW", "site": "https://openreview.net/forum?id=tzO3RXxzuM", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "2;3;3;4", "correctness": "4;3;4;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "71;44;114;139", "wc_summary_review": "46;43;36;5", "wc_main_review": "294;178;212;287", "wc_review": "411;265;362;431", "wc_reply_reviewers": "76;0;0;19", "wc_reply_authors": "856;578;923;669", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.0, 
36.871398129173244 ], "wc_summary_review_avg": [ 32.5, 16.28649747490233 ], "wc_main_review_avg": [ 242.75, 49.30200300190653 ], "wc_review_avg": [ 367.25, 64.14972720129057 ], "wc_reply_reviewers_avg": [ 23.75, 31.1478329904345 ], "wc_reply_authors_avg": [ 756.5, 138.87134333619733 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4544110063909364120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Minnesota", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.minnesota.edu", "aff_unique_abbr": "UIUC;UMN", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Urbana-Champaign;Minneapolis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "tzefRCscZXZ", "title": "Adversarial Visual Robustness by Causal Intervention", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Adversarial training is the de facto most promising defense against adversarial examples. Yet, its passive nature inevitably prevents it from being immune to unknown attackers. To achieve a proactive defense, we need a more fundamental understanding of adversarial examples, beyond the popular bounded threat model. In this paper, we provide a causal viewpoint of adversarial vulnerability: the cause is the spurious correlations ubiquitously existing in learning, i.e., the confounding effect, which attackers precisely exploit. Therefore, a fundamental solution for adversarial robustness is causal intervention. As visual confounders are imperceptible in general, we propose to use the instrumental variable that achieves causal intervention without the need for confounder observation. We term our robust training method as Causal intervention by instrumental Variable (CiiV). It is a causal regularization that 1) augments the image with multiple retinotopic centers and 2) encourages the model to learn causal features rather than local confounding patterns by favoring features linearly responding to spatial interpolations. Extensive experiments on a wide spectrum of attackers and settings on CIFAR-10, CIFAR-100, and mini-ImageNet demonstrate that CiiV is robust to adaptive attacks, including the recent Auto-Attack. Besides, as a general causal regularization, it can be easily plugged into other methods to further boost their robustness. 
Codes are available in supplementary materials.", "keywords": "Adversarial Robustness;Causality;Instrumental Variable", "primary_area": "", "supplementary_material": "/attachment/0873879c015c8e4d58a804338b182c601398df02.zip", "author": "Kaihua Tang;Mingyuan Tao;Hanwang Zhang", "authorids": "~Kaihua_Tang1;juchen.tmy@alibaba-inc.com;~Hanwang_Zhang3", "gender": "M;;M", "homepage": "https://kaihuatang.github.io/;;https://mreallab.github.io/index.html", "dblp": "196/7269;;79/8116.html", "google_scholar": "https://scholar.google.com.sg/citations?user=WuO1sSkAAAAJ;;YG0DFyYAAAAJ", "orcid": ";;", "linkedin": "kaihua-tang-1b2522125/;;", "or_profile": "~Kaihua_Tang1;juchen.tmy@alibaba-inc.com;~Hanwang_Zhang3", "aff": "Nanyang Technological University;;", "aff_domain": "ntu.edu.sg;;", "position": "PhD student;;", "bibtex": "@misc{\ntang2022adversarial,\ntitle={Adversarial Visual Robustness by Causal Intervention},\nauthor={Kaihua Tang and Mingyuan Tao and Hanwang Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=tzefRCscZXZ}\n}", "github": "", "project": "", "reviewers": "AkCd;DZyj;1VLF", "site": "https://openreview.net/forum?id=tzefRCscZXZ", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "5;5;5", "correctness": "1;2;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;4;3", "wc_summary_paper": "72;123;51", "wc_summary_review": "43;114;22", "wc_main_review": "690;420;1200", "wc_review": "805;657;1273", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 82.0, 30.23243291566195 ], "wc_summary_review_avg": [ 59.666666666666664, 39.36439451529206 ], "wc_main_review_avg": [ 770.0, 323.4192325759246 ], "wc_review_avg": [ 911.6666666666666, 262.5481966336001 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14798663269676014647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_country_unique_index": "0", "aff_country_unique": "Singapore" }, { "title": "Task Affinity with Maximum Bipartite Matching in Few-Shot Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6214", "id": "u2GZOiUTbt", "poster": "", "openreview": "https://openreview.net/forum?id=u2GZOiUTbt", "slides": "https://iclr.cc/virtual/2022/poster/6214", "video": "https://iclr.cc/virtual/2022/poster/6214", "author_site": "Cat Le, Juncheng Dong, Mohammadreza Soltani, VAHID TAROKH", "tldr": "", "abstract": "We propose an asymmetric affinity score for representing the complexity of utilizing the knowledge of one task for learning another one. Our method is based on the maximum bipartite matching algorithm and utilizes the Fisher Information matrix. 
We provide theoretical analyses demonstrating that the proposed score is mathematically well-defined, and subsequently use the affinity score to propose a novel algorithm for the few-shot learning problem. In particular, using this score, we find training data labels relevant to the test data and leverage the discovered relevant data for episodically fine-tuning a few-shot model. Results on various few-shot benchmark datasets demonstrate the efficacy of the proposed approach by improving the classification accuracy over the state-of-the-art methods even when using smaller models.", "keywords": "Task Affinity;Transfer Learning;Few-Shot Learning", "primary_area": "", "supplementary_material": "", "author": "Cat Phuoc Le;Juncheng Dong;Mohammadreza Soltani;Vahid Tarokh", "authorids": "~Cat_Phuoc_Le1;~Juncheng_Dong1;~Mohammadreza_Soltani1;~Vahid_Tarokh1", "gender": "M;;M;", "homepage": "https://scholars.duke.edu/person/cat.le;;https://mrezasoltani.github.io/;", "dblp": "251/5583;;150/5633;", "google_scholar": "gSzKGdQAAAAJ;;;", "orcid": "0000-0002-9121-9395;;;", "linkedin": "catphuocle/;;mohammadreza-soltani-99bb1ba0/;", "or_profile": "~Cat_Phuoc_Le1;~Juncheng_Dong1;~Mohammadreza_Soltani1;~Vahid_Tarokh1", "aff": "Duke University;;3M;", "aff_domain": "duke.edu;;mmm.com;", "position": "PhD student;;Researcher;", "bibtex": "@inproceedings{\nle2022task,\ntitle={Task Affinity with Maximum Bipartite Matching in Few-Shot Learning},\nauthor={Cat Phuoc Le and Juncheng Dong and Mohammadreza Soltani and Vahid Tarokh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=u2GZOiUTbt}\n}", "github": "", "project": "", "reviewers": "mtpL;Y7LN;UXcu", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "3;4;3", "correctness": "2;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;2;4", "wc_summary_paper": "77;59;64", "wc_summary_review": "45;91;29", "wc_main_review": "312;383;101", "wc_review": "434;533;194", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "714;813;231", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 66.66666666666667, 7.586537784494029 ], "wc_summary_review_avg": [ 55.0, 26.280537792569366 ], "wc_main_review_avg": [ 265.3333333333333, 119.76180062477722 ], "wc_review_avg": [ 387.0, 142.33060106667153 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 586.0, 254.25577672886806 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.1147078669352809, "corr_recommendation_correctness": 0.9176629354822472, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10877103114487491040&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=u2GZOiUTbt", "email": "duke.edu;;mmm.com;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Duke University;3M Company", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://www.3m.com", "aff_unique_abbr": "Duke;3M", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United 
States" }, { "id": "u2JeVfXIQa", "title": "Adaptive Cross-Layer Attention for Image Restoration", "track": "main", "status": "Reject", "tldr": "", "abstract": "Non-local attention module has been proven to be crucial for image restoration. Conventional non-local attention processes features of each layer separately, so it risks missing correlation between features among different layers. To address this problem, we propose Cross-Layer Attention (CLA) module in this paper. Instead of \ufb01nding correlated key pixels within the same layer, each query pixel is allowed to attend to key pixels at previous layers of the network. In order to mitigate the expensive computational cost of such hierarchical attention design, only a small \ufb01xed number of keys can be selected for each query from a previous layer. We further propose a variant of CLA termed Adaptive Cross-Layer Attention (ACLA). In ACLA, the number of keys to be aggregated for each query is dynamically selected. A neural architecture search method is used to \ufb01nd the insert positions of ACLA modules to render a compact neural network with compelling performance. Extensive experiments on image restoration tasks including single image super-resolution, image denoising, image demosaicing, and image compression artifacts reduction validate the effectiveness and ef\ufb01ciency of ACLA.", "keywords": "image restoration;neural architecture search;non-local attention", "primary_area": "", "supplementary_material": "", "author": "Yancheng Wang;Yingzhen Yang;Chong Chen;Ning Xu", "authorids": "~Yancheng_Wang2;~Yingzhen_Yang1;chongchen@kuaishou.com;~Ning_Xu3", "gender": "M;M;;M", "homepage": ";http://yingzhenyang.com;;", "dblp": ";66/3838.html;;04/5856-1", "google_scholar": "https://scholar.google.com/citations?hl=en;;;dRDZBoEAAAAJ", "orcid": ";;;0000-0003-3439-3175", "linkedin": ";yingzhen-yang-9b869122;;", "or_profile": "~Yancheng_Wang2;~Yingzhen_Yang1;chongchen@kuaishou.com;~Ning_Xu3", "aff": "Arizona State University;Arizona State University;;Kuaishou US R&D Center", "aff_domain": "asu.edu;asu.edu;;kuaishou.com", "position": "PhD student;Assistant Professor;;Chief Scientist", "bibtex": "@misc{\nwang2022adaptive,\ntitle={Adaptive Cross-Layer Attention for Image Restoration},\nauthor={Yancheng Wang and Yingzhen Yang and Chong Chen and Ning Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=u2JeVfXIQa}\n}", "github": "", "project": "", "reviewers": "Xgo2;HQYo;g1nJ;6N93", "site": "https://openreview.net/forum?id=u2JeVfXIQa", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;2;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "55;64;78;166", "wc_summary_review": "5;68;9;67", "wc_main_review": "241;251;128;601", "wc_review": "301;383;215;834", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1171;1946;0;231", "reply_reviewers": "0;0;0;0", "reply_authors": "2;3;0;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 90.75, 44.21184795956849 ], "wc_summary_review_avg": [ 37.25, 30.285103598964294 ], "wc_main_review_avg": [ 305.25, 177.45193011066405 ], "wc_review_avg": [ 433.25, 238.87692961020744 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 837.0, 776.0866575325207 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 
1.118033988749895 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.18731716231633877, "corr_recommendation_correctness": -0.4714045207910316, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12948218695844171371&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Arizona State University;Kuaishou", "aff_unique_dep": ";R&D Center", "aff_unique_url": "https://www.asu.edu;", "aff_unique_abbr": "ASU;", "aff_campus_unique_index": "1", "aff_campus_unique": ";United States", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "u3IYqzOdQdl", "title": "MixtureEnsembles: Leveraging Parameter Sharing for Efficient Ensembles", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Ensembles are a very effective way of increasing both the robustness and accuracy of a learning system. Yet they are memory and compute intensive; in a naive ensemble, $n$ networks are trained independently and $n$ networks must be stored. Recently, BatchEnsemble \\citep{wen2020batchensemble} and MIMO \\citep{havasi2020training} have significantly decreased the memory footprint with classification performance that approaches that of a naive ensemble. We improve on these methods with MixtureEnsembles, which learns to factorize ensemble members with shared parameters by constructing each layer with a linear combination of templates. Then, each ensemble member is defined as a different set of linear combination weights. By modulating the number of templates available, MixtureEnsembles are uniquely flexible and allow easy scaling between the low-parameter and high-parameter regime. In the low parameter regime, MixtureEnsembles outperform BatchEnsemble on both ImageNet and CIFAR, and are competitive with MIMO. In the high-parameter regime, MixtureEnsembles outperform all baselines on CIFAR and ImageNet. This flexibility allows users to control the precise performance-memory cost trade-off without making any changes in the backbone architecture. When we additionally tune the backbone architecture width, we can outperform all baselines in the low-parameter regime with the same inference FLOP footprint. ", "keywords": "Ensembles;Robust Learning;Efficient Computing", "primary_area": "", "supplementary_material": "/attachment/d3afc0e8cb344185347190d8092fbc46ae2a57b1.zip", "author": "Piotr Teterwak;Nikoli Dryden;Dina Bashkirova;Kate Saenko;Bryan A. 
Plummer", "authorids": "~Piotr_Teterwak1;~Nikoli_Dryden1;~Dina_Bashkirova1;~Kate_Saenko1;~Bryan_A._Plummer1", "gender": "M;;F;F;M", "homepage": "https://scholar.google.com/citations?user=lUkd1AMAAAAJ&hl=en&oi=ao;https://ndryden.com;https://cs-people.bu.edu/dbash/;http://ai.bu.edu;http://bryanplummer.com/", "dblp": "247/6128;148/1273;;88/2754;163/2330", "google_scholar": "lUkd1AMAAAAJ;nRhl3Q4AAAAJ;qvUTYsUAAAAJ;https://scholar.google.com.tw/citations?user=9xDADY4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-3557-3215;0000-0002-5704-7614;", "linkedin": ";;;;", "or_profile": "~Piotr_Teterwak1;~Nikoli_Dryden1;~Dina_Bashkirova1;~Kate_Saenko1;~Bryan_Allen_Plummer1", "aff": "Boston University;Swiss Federal Institute of Technology;Boston University;Boston University, Boston University;Boston University", "aff_domain": "bu.edu;ethz.ch;bu.edu;bu.edu;bu.edu", "position": "PhD student;Postdoc;PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\nteterwak2022mixtureensembles,\ntitle={MixtureEnsembles: Leveraging Parameter Sharing for Efficient Ensembles },\nauthor={Piotr Teterwak and Nikoli Dryden and Dina Bashkirova and Kate Saenko and Bryan A. Plummer},\nyear={2022},\nurl={https://openreview.net/forum?id=u3IYqzOdQdl}\n}", "github": "", "project": "", "reviewers": "vS73;yBKL;WBCU;g5Qs", "site": "https://openreview.net/forum?id=u3IYqzOdQdl", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "4;3;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;0;3", "wc_summary_paper": "39;18;36;48", "wc_summary_review": "11;19;23;48", "wc_main_review": "293;123;295;933", "wc_review": "343;160;354;1029", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 35.25, 10.894379284750462 ], "wc_summary_review_avg": [ 25.25, 13.827056809024834 ], "wc_main_review_avg": [ 411.0, 309.3573984891908 ], "wc_review_avg": [ 471.5, 330.9671433843547 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12041101732025210398&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Boston University;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://www.ethz.ch", "aff_unique_abbr": "BU;ETH Zurich", "aff_campus_unique_index": "1", "aff_campus_unique": ";Boston", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;Switzerland" }, { "id": "u4C_qLuEpZ", "title": "Exploring General Intelligence of Program Analysis for Multiple Tasks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Artificial intelligence are gaining more attractions for program analysis and semantic understanding. 
Nowadays, the prevalent program embedding techniques usually target a single task, for example binary similarity detection, program classification, or program comment auto-completion, due to the ever-growing complexity and scale of programs. To address this, we explore a generic program embedding approach that aims at solving multiple program analysis tasks. We design models to extract features of a program, represent the program as an embedding, and use this embedding to solve various analysis tasks. Since different tasks not only require access to the features of the source code but are also highly dependent on its compilation process, traditional source code or AST-based embedding approaches are no longer applicable. Therefore, we propose a new program embedding approach that constructs a program representation based on the assembly code and simultaneously exploits the rich graph structure information present in the program. We tested our model on two tasks, program classification and binary similarity detection, and obtained accuracies of 80.35% and 45.16%, respectively.", "keywords": "GNN;program analysis", "primary_area": "", "supplementary_material": "", "author": "Yixin Guo;Pengcheng Li;Yingwei Luo;Xiaolin Wang;Zhenlin Wang", "authorids": "yixinguo@pku.edu.cn;~Pengcheng_Li2;~Yingwei_Luo1;wxl@pku.edu.cn;~Zhenlin_Wang1", "gender": ";M;M;;M", "homepage": ";;http://eecs.pku.edu.cn/EN/People/Faculty/Detail/?ID=6001;;https://pages.mtu.edu/~zlwang/", "dblp": ";;;;88/5294.html", "google_scholar": ";w_j9E10AAAAJ;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "yixinguo@pku.edu.cn;~Pengcheng_Li2;~Yingwei_Luo1;wxl@pku.edu.cn;~Zhenlin_Wang1", "aff": ";TikTok Inc.;Peking University, Tsinghua University;;Michigan Technological University", "aff_domain": ";tiktok.com;pku.edu.cn;;mtu.edu", "position": ";Researcher;Full Professor;;Full Professor", "bibtex": "@misc{\nguo2022exploring,\ntitle={Exploring General Intelligence of Program Analysis for Multiple Tasks},\nauthor={Yixin Guo and Pengcheng Li and Yingwei Luo and Xiaolin Wang and Zhenlin Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=u4C_qLuEpZ}\n}", "github": "", "project": "", "reviewers": "r5sH;Cbj6;PoNp;MnJW", "site": "https://openreview.net/forum?id=u4C_qLuEpZ", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;4;4;5", "correctness": "1;2;3;2", "technical_novelty": "1;2;1;2", "empirical_novelty": "1;0;2;3", "wc_summary_paper": "38;90;27;41", "wc_summary_review": "59;37;11;9", "wc_main_review": "311;450;371;236", "wc_review": "408;577;409;286", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 49.0, 24.238399287081645 ], "wc_summary_review_avg": [ 29.0, 20.54263858417414 ], "wc_main_review_avg": [ 342.0, 78.58434958692474 ], "wc_review_avg": [ 420.0, 103.52535921212734 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:-Z2wPzi84WoJ:scholar.google.com/&scioq=Exploring+General+Intelligence+of+Program+Analysis+for+Multiple+Tasks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "TikTok Inc.;Peking University;Michigan Technological University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tiktok.com;http://www.pku.edu.cn;https://www.mtu.edu", "aff_unique_abbr": "TikTok;Peking U;MTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "Reliable Adversarial Distillation with Unreliable Teachers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6336", "id": "u6TRGdzhfip", "poster": "", "openreview": "https://openreview.net/forum?id=u6TRGdzhfip", "slides": "https://iclr.cc/virtual/2022/poster/6336", "video": "https://iclr.cc/virtual/2022/poster/6336", "author_site": "Jianing ZHU, Jiangchao Yao, Bo Han, Jingfeng Zhang, Tongliang Liu, Gang Niu, Jingren Zhou, Jianliang Xu, Hongxia Yang", "tldr": "", "abstract": "In ordinary distillation, student networks are trained with soft labels (SLs) given by pretrained teacher networks, and students are expected to improve upon teachers since SLs are stronger supervision than the original hard labels. However, when considering adversarial robustness, teachers may become unreliable and adversarial distillation may not work: teachers are pretrained on their own adversarial data, and it is too demanding to require that teachers are also good at every adversarial data queried by students. Therefore, in this paper, we propose reliable introspective adversarial distillation (IAD) where students partially instead of fully trust their teachers. Specifically, IAD distinguishes between three cases given a query of a natural data (ND) and the corresponding adversarial data (AD): (a) if a teacher is good at AD, its SL is fully trusted; (b) if a teacher is good at ND but not AD, its SL is partially trusted and the student also takes its own SL into account; (c) otherwise, the student only relies on its own SL. 
Experiments demonstrate the effectiveness of IAD for improving upon teachers in terms of adversarial robustness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianing Zhu;Jiangchao Yao;Bo Han;Jingfeng Zhang;Tongliang Liu;Gang Niu;Jingren Zhou;Jianliang Xu;Hongxia Yang", "authorids": "~Jianing_Zhu2;~Jiangchao_Yao1;~Bo_Han1;~Jingfeng_Zhang1;~Tongliang_Liu1;~Gang_Niu1;~Jingren_Zhou1;~Jianliang_Xu1;~Hongxia_Yang2", "gender": "M;M;M;M;M;M;M;F;M", "homepage": "https://zfancy.github.io/;https://sunarker.github.io/;https://zjfheart.github.io;https://tongliang-liu.github.io/;https://niug1984.github.io;;http://www.comp.hkbu.edu.hk/~xujl;https://www4.comp.polyu.edu.hk/~hongxyang/;https://bhanml.github.io/", "dblp": "129/6807;166/5900;227/2664.html;150/6667;26/3367-1;84/2644;x/JianliangXu;;241/0472-3", "google_scholar": "82uNA3MAAAAJ;w8oDh9QAAAAJ;NS0P1FkAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;;https://scholar.google.com.tw/citations?user=LJNsBeoAAAAJ;iJlC5mMAAAAJ;nTNjqHwAAAAJ", "orcid": ";;0000-0003-3491-8074;;;;0000-0001-9404-5848;;", "linkedin": ";;;;;;;;", "or_profile": "~Jianing_Zhu2;~Jiangchao_Yao1;~Jingfeng_Zhang1;~Tongliang_Liu1;~Gang_Niu1;~Jingren_Zhou1;~Jianliang_Xu1;~Hongxia_Yang2;~bo_han2", "aff": "Hong Kong Baptist University;Alibaba Group;RIKEN;University of Sydney;RIKEN;Alibaba Group;Hong Kong Baptist University;Alibaba Group;Microsoft Research", "aff_domain": "hkbu.edu.hk;alibaba-inc.com;riken.jp;sydney.edu.au;riken.jp;alibaba-inc.com;hkbu.edu.hk;alibaba-inc.com;microsoft.com", "position": "PhD student;Researcher;Postdoc;Lecturer;Research Scientist (tenured);Researcher;Full Professor;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nzhu2022reliable,\ntitle={Reliable Adversarial Distillation with Unreliable Teachers},\nauthor={Jianing Zhu and Jiangchao Yao and Bo Han and Jingfeng Zhang and Tongliang Liu and Gang Niu and Jingren Zhou and Jianliang Xu and Hongxia Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=u6TRGdzhfip}\n}", "github": "", "project": "", "reviewers": "Q98Y;svED;2bpg;xcEE", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;4", "correctness": "4;3;3;4", "technical_novelty": "4;3;3;4", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "189;50;54;88", "wc_summary_review": "43;22;39;76", "wc_main_review": "448;127;592;328", "wc_review": "680;199;685;492", "wc_reply_reviewers": "0;0;12;0", "wc_reply_authors": "1019;518;1753;802", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;4;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.25, 56.104255631814596 ], "wc_summary_review_avg": [ 45.0, 19.557607215607945 ], "wc_main_review_avg": [ 373.75, 170.38540870626215 ], "wc_review_avg": [ 514.0, 197.80419611322708 ], "wc_reply_reviewers_avg": [ 3.0, 5.196152422706632 ], "wc_reply_authors_avg": [ 1023.0, 457.3789457331852 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 93, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=14735991802555928714&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=u6TRGdzhfip", "email": "hkbu.edu.hk;alibaba-inc.com;riken.jp;sydney.edu.au;riken.jp;alibaba-inc.com;hkbu.edu.hk;alibaba-inc.com;microsoft.com", "author_num": 9, "aff_unique_index": "0;1;2;3;2;1;0;1;4", "aff_unique_norm": "Hong Kong Baptist University;Alibaba Group;RIKEN;University of Sydney;Microsoft", "aff_unique_dep": ";;;;Microsoft Research", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.alibaba.com;https://www.riken.jp;https://www.sydney.edu.au;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "HKBU;Alibaba;RIKEN;USYD;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;2;1;0;0;0;3", "aff_country_unique": "China;Japan;Australia;United States" }, { "title": "Group equivariant neural posterior estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6752", "id": "u6s8dSporO8", "poster": "", "openreview": "https://openreview.net/forum?id=u6s8dSporO8", "slides": "https://iclr.cc/virtual/2022/poster/6752", "video": "https://iclr.cc/virtual/2022/poster/6752", "author_site": "Maximilian Dax, Stephen Green, Jonathan Gair, Michael Deistler, Bernhard Schoelkopf, Jakob Macke", "tldr": "", "abstract": "Simulation-based inference with conditional neural density estimators is a powerful approach to solving inverse problems in science. However, these methods typically treat the underlying forward model as a black box, with no way to exploit geometric properties such as equivariances. Equivariances are common in scientific models, however integrating them directly into expressive inference networks (such as normalizing flows) is not straightforward. We here describe an alternative method to incorporate equivariances under joint transformations of parameters and data. Our method---called group equivariant neural posterior estimation (GNPE)---is based on self-consistently standardizing the \"pose\" of the data while estimating the posterior over parameters. It is architecture-independent, and applies both to exact and approximate equivariances. As a real-world application, we use GNPE for amortized inference of astrophysical binary black hole systems from gravitational-wave observations. We show that GNPE achieves state-of-the-art accuracy while reducing inference times by three orders of magnitude.", "keywords": "simulation-based inference;likelihood-free inference;machine learning for science;equivariances;group transformations", "primary_area": "", "supplementary_material": "", "author": "Maximilian Dax;Stephen R Green;Jonathan Gair;Michael Deistler;Bernhard Sch\u00f6lkopf;Jakob H. 
Macke", "authorids": "~Maximilian_Dax1;~Stephen_R_Green1;~Jonathan_Gair1;~Michael_Deistler1;~Bernhard_Sch\u00f6lkopf1;~Jakob_H._Macke1", "gender": "M;M;;M;;", "homepage": ";https://www.stephenrgreen.com;;https://michaeldeistler.github.io/;;", "dblp": "249/8145;259/3011;;243/5747;;", "google_scholar": "VRBv6mEAAAAJ;https://scholar.google.de/citations?user=sqvBC1wAAAAJ;;Q24H-zYAAAAJ;;", "orcid": ";0000-0002-6987-6313;0000-0002-1671-3668;0000-0002-3573-0404;;", "linkedin": ";stephen-green-57a1b639;;;;", "or_profile": "~Maximilian_Dax1;~Stephen_R_Green1;~Jonathan_Gair1;~Michael_Deistler1;~Bernhard_Sch\u00f6lkopf1;~Jakob_H._Macke1", "aff": "Max-Planck Institute;Max Planck Institute for Gravitational Physics;Albert Einstein Institute;University of Tuebingen;;", "aff_domain": "mpg.de;aei.mpg.de;aei.mpg.de;uni-tuebingen.de;;", "position": "PhD student;Postdoc;Full Professor;PhD student;;", "bibtex": "@inproceedings{\ndax2022group,\ntitle={Group equivariant neural posterior estimation},\nauthor={Maximilian Dax and Stephen R Green and Jonathan Gair and Michael Deistler and Bernhard Sch{\\\"o}lkopf and Jakob H. Macke},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=u6s8dSporO8}\n}", "github": "", "project": "", "reviewers": "Xr5B;NKhw;W1NJ;tFyT", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;2;2;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "4;3;3;4", "wc_summary_paper": "46;41;95;155", "wc_summary_review": "40;17;74;62", "wc_main_review": "469;138;222;764", "wc_review": "555;196;391;981", "wc_reply_reviewers": "491;0;60;758", "wc_reply_authors": "1590;706;1327;2065", "reply_reviewers": "1;0;1;4", "reply_authors": "5;3;4;6", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 84.25, 45.97485725915851 ], "wc_summary_review_avg": [ 48.25, 21.775846711436962 ], "wc_main_review_avg": [ 398.25, 243.70717572529537 ], "wc_review_avg": [ 530.75, 289.3530499234456 ], "wc_reply_reviewers_avg": [ 327.25, 312.5998840370866 ], "wc_reply_authors_avg": [ 1422.0, 490.75299285893306 ], "reply_reviewers_avg": [ 1.5, 1.5 ], "reply_authors_avg": [ 4.5, 1.118033988749895 ], "replies_avg": [ 35, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.7385489458759963, "corr_recommendation_correctness": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=737567940759599258&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=u6s8dSporO8", "email": "mpg.de;aei.mpg.de;aei.mpg.de;uni-tuebingen.de;;", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.;Max Planck Institute for Gravitational Physics;Albert Einstein Institute;University of Tuebingen", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.mpg.de;https://wwwAEI.mpg.de;https://www.aei.mpg.de;https://www.uni-tuebingen.de/", "aff_unique_abbr": "MPG;AEI;AEI;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "u6sUACr7feW", "title": "DPP-TTS: Diversifying prosodic features of speech via determinantal point processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "With the rapid 
advancement in deep generative models, recent neural text-to-speech models have succeeded in synthesizing human-like speech, even in an end-to-end manner. However, synthesized samples often have a monotonous speaking style or simply follow the speaking style of their ground-truth samples. Although many methods have been proposed to increase the diversity of prosody in speech, increasing prosody variance often hurts the naturalness of the speech. Determinantal point processes (DPPs) have shown remarkable results for modeling diversity in a wide range of machine learning tasks. However, their application in speech synthesis has not been explored. To enhance the expressiveness of speech, we propose DPP-TTS: a text-to-speech model based on a determinantal point process. The extent of prosody diversity can be easily controlled by adjusting parameters in our model. We demonstrate that DPP-TTS generates more expressive samples than baselines in the side-by-side comparison test while not harming the naturalness of the speech. ", "keywords": "Text to speech synthesis;determinantal point processes;prosody modeling", "primary_area": "", "supplementary_material": "/attachment/d81ab95e8d7715bc46ae5bdeeda9fc9bbbc3aed2.zip", "author": "Seongho Joo;Kyomin Jung", "authorids": "~Seongho_Joo1;~Kyomin_Jung1", "gender": ";M", "homepage": "https://sites.google.com/view/jsh1006/%ED%99%88;http://milab.snu.ac.kr/kjung/index.html", "dblp": "359/4619.html;48/3867", "google_scholar": ";https://scholar.google.co.kr/citations?user=u3uMl4MAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Seongho_Joo1;~Kyomin_Jung1", "aff": "Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@misc{\njoo2022dpptts,\ntitle={{DPP}-{TTS}: Diversifying prosodic features of speech via determinantal point processes},\nauthor={Seongho Joo and Kyomin Jung},\nyear={2022},\nurl={https://openreview.net/forum?id=u6sUACr7feW}\n}", "github": "", "project": "", "reviewers": "kefC;Dfaw;djSa", "site": "https://openreview.net/forum?id=u6sUACr7feW", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;4;3", "correctness": "3;2;3", "technical_novelty": "3;2;4", "empirical_novelty": "2;3;2", "wc_summary_paper": "91;82;204", "wc_summary_review": "50;21;60", "wc_main_review": "242;834;516", "wc_review": "383;937;780", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "206;795;381", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 125.66666666666667, 55.51176051572816 ], "wc_summary_review_avg": [ 43.666666666666664, 16.539514973407037 ], "wc_main_review_avg": [ 530.6666666666666, 241.90539932424457 ], "wc_review_avg": [ 700.0, 233.1365837157838 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 460.6666666666667, 246.96873396354357 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.49999999999999983, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:py3lTcNzbpYJ:scholar.google.com/&scioq=DPP-TTS:+Diversifying+prosodic+features+of+speech+via+determinantal+point+processes&hl=en&as_sdt=0,33", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "u6ybkty-bL", "title": "When Complexity Is Good: Do We Need Recurrent Deep Learning For Time Series Outlier Detection?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Outlier detection is a critical part of understanding a dataset and extracting results. Outlier detection is used in different domains for various reasons; including detecting stolen credit cards, spikes of energy usage, web attacks, or in-home activity monitoring. Within this paper, we look at when it is appropriate to apply recurrent deep learning methods for time series outlier detection versus non-recurrent methods. Recurrent deep learning methods have a larger capacity for learning complex representations in time series data. We apply these methods to various synthetic and real-world datasets, including a dataset containing information about the in-home movement of people living with dementia in a clinical study cross-referenced with their recorded unplanned hospital admissions and infection episodes. We also introduce two new outlier detection methods, that can be useful in detecting contextual outliers in time series data where complex temporal relationships and local variations in the time series are important.", "keywords": "Outlier Detection;Time Series;Deep Learning;Recurrent Neural Networks", "primary_area": "", "supplementary_material": "/attachment/5e03ed9188625540b2d48d62e2225f889674eed5.zip", "author": "Alexander Capstick;Samaneh Kouchaki;Mazdak Ghajari;David J. Sharp;Payam M. Barnaghi", "authorids": "~Alexander_Capstick1;~Samaneh_Kouchaki2;m.ghajari@imperial.ac.uk;~David_J._Sharp1;~Payam_M._Barnaghi1", "gender": "M;;;M;M", "homepage": "https://alexcapstick.github.io/;https://www.surrey.ac.uk/people/samaneh-kouchaki;;;https://www.imperial.ac.uk/people/p.barnaghi", "dblp": ";;;91/6655;22/4255", "google_scholar": ";https://scholar.google.co.uk/citations?user=wBXhQ-IAAAAJ;;Sbz45kEAAAAJ;D6R2cnwAAAAJ", "orcid": ";;;;0000-0001-8591-9638", "linkedin": ";;;;", "or_profile": "~Alexander_Capstick1;~Samaneh_Kouchaki2;m.ghajari@imperial.ac.uk;~David_J._Sharp1;~Payam_M._Barnaghi1", "aff": "Imperial College London;University of Surrey;;Imperial College London;Imperial College London", "aff_domain": "imperial.ac.uk;surrey.ac.uk;;ic.ac.uk;imperial.ac.uk", "position": "PhD student;Lecturer;;Full Professor;Full Professor", "bibtex": "@misc{\ncapstick2022when,\ntitle={When Complexity Is Good: Do We Need Recurrent Deep Learning For Time Series Outlier Detection?},\nauthor={Alexander Capstick and Samaneh Kouchaki and Mazdak Ghajari and David J. Sharp and Payam M. 
Barnaghi},\nyear={2022},\nurl={https://openreview.net/forum?id=u6ybkty-bL}\n}", "github": "", "project": "", "reviewers": "cSV4;FAw1;aJuJ;HFSk", "site": "https://openreview.net/forum?id=u6ybkty-bL", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;5;5;4", "correctness": "2;2;2;3", "technical_novelty": "1;1;2;2", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "33;33;30;72", "wc_summary_review": "15;23;21;45", "wc_main_review": "94;371;100;185", "wc_review": "142;427;151;302", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "114;320;156;329", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 42.0, 17.363755354185336 ], "wc_summary_review_avg": [ 26.0, 11.357816691600547 ], "wc_main_review_avg": [ 187.5, 111.88945437350206 ], "wc_review_avg": [ 255.5, 117.6615910142303 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 229.75, 95.95930126881917 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z8UjvThiFikJ:scholar.google.com/&scioq=When+Complexity+Is+Good:+Do+We+Need+Recurrent+Deep+Learning+For+Time+Series+Outlier+Detection%3F&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Imperial College London;University of Surrey", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://www.surrey.ac.uk", "aff_unique_abbr": "ICL;Surrey", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "u7PVCewFya", "title": "Losing Less: A Loss for Differentially Private Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Differentially Private Stochastic Gradient Descent, DP-SGD, is the canonical approach to training deep neural networks with guarantees of Differential Privacy (DP). However, the modifications DP-SGD introduces to vanilla gradient descent negatively impact the accuracy of deep neural networks. In this paper, we are the first to observe that some of this performance can be recovered when training with a loss tailored to DP-SGD; we challenge cross-entropy as the de facto loss for deep learning with DP. Specifically, we introduce a loss combining three terms: the summed squared error, the focal loss, and a regularization penalty. The first term encourages learning with faster convergence. The second term emphasizes hard-to-learn examples in the later stages of training. Both are beneficial because the privacy cost of learning increases with every step of DP-SGD. The third term helps control the sensitivity of learning, decreasing the bias introduced by gradient clipping in DP-SGD. Using our loss function, we achieve new state-of-the-art tradeoffs between privacy and accuracy on MNIST, FashionMNIST, and CIFAR10. Most importantly, we improve the accuracy of DP-SGD on CIFAR10 by $4\\%$ for a DP guarantee of $\\varepsilon=3$. 
", "keywords": "Differentially Private Deep Learning;DP-SGD", "primary_area": "", "supplementary_material": "/attachment/f11a5ebf956e2147998379ae9b5a2f095c3b2962.zip", "author": "Ali Shahin Shamsabadi;Nicolas Papernot", "authorids": "~Ali_Shahin_Shamsabadi1;~Nicolas_Papernot1", "gender": "M;M", "homepage": "https://alishahin.github.io;https://www.papernot.fr", "dblp": "198/1244;162/1405", "google_scholar": "1kVnWYwAAAAJ;cGxq0cMAAAAJ", "orcid": ";", "linkedin": "ali-shahin-shamsabadi-492544259/;nicolaspapernot", "or_profile": "~Ali_Shahin_Shamsabadi1;~Nicolas_Papernot1", "aff": "Vector;Google", "aff_domain": "vectorinstitute.ai;google.com", "position": "Postdoc;Research Scientist", "bibtex": "@misc{\nshamsabadi2022losing,\ntitle={Losing Less: A Loss for Differentially Private Deep Learning},\nauthor={Ali Shahin Shamsabadi and Nicolas Papernot},\nyear={2022},\nurl={https://openreview.net/forum?id=u7PVCewFya}\n}", "github": "", "project": "", "reviewers": "WSQr;VgSd;jXWf;UK9m", "site": "https://openreview.net/forum?id=u7PVCewFya", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;5;2;3", "correctness": "3;2;4;3", "technical_novelty": "3;2;2;2", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "44;84;41;83", "wc_summary_review": "1;26;52;17", "wc_main_review": "246;242;224;235", "wc_review": "291;352;317;335", "wc_reply_reviewers": "0;127;0;0", "wc_reply_authors": "752;1612;653;672", "reply_reviewers": "0;1;0;0", "reply_authors": "2;3;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 20.530465167647808 ], "wc_summary_review_avg": [ 24.0, 18.479718612576328 ], "wc_main_review_avg": [ 236.75, 8.347903928532 ], "wc_review_avg": [ 323.75, 22.598395960775623 ], "wc_reply_reviewers_avg": [ 31.75, 54.99261314031185 ], "wc_reply_authors_avg": [ 922.25, 399.9564820077304 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8944271909999159, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=534823259749552759&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Vector Institute;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://vectorinstitute.ai/;https://www.google.com", "aff_unique_abbr": "Vector;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1", "aff_country_unique": "Canada;United States" }, { "id": "u7UxOTefG2", "title": "Uncertainty-based out-of-distribution detection requires suitable function space priors", "track": "main", "status": "Reject", "tldr": "", "abstract": "The need to avoid confident predictions on unfamiliar data has sparked interest in out-of-distribution (OOD) detection. It is widely assumed that Bayesian neural networks (BNNs) are well suited for this task, as the endowed epistemic uncertainty should lead to disagreement in predictions on outliers. In this paper, we question this assumption and show that proper Bayesian inference with function space priors induced by neural networks does not necessarily lead to good OOD detection. 
To circumvent the use of approximate inference, we start by studying the infinite-width case, where Bayesian inference can be exact due to the correspondence with Gaussian processes. Strikingly, the kernels induced under common architectural choices lead to uncertainties that do not reflect the underlying data-generating process and are therefore unsuited for OOD detection. Importantly, we find this OOD behavior to be consistent with the corresponding finite-width networks. Desirable function space properties can be encoded in the prior in weight space; however, this currently applies only to a specified subset of the domain and thus does not inherently extend to OOD data. Finally, we argue that a trade-off between generalization and OOD capabilities might render the application of BNNs for OOD detection undesirable in practice. Overall, our study discloses fundamental problems when naively using BNNs for OOD detection and opens interesting avenues for future research.", "keywords": "Bayesian Statistics;Out-of-distribution Detection;Machine Learning;Neural Networks;Epistemic Uncertainty;Gaussian Process", "primary_area": "", "supplementary_material": "/attachment/7f01364fa923c7a58530708bcf753cbbf1738135.zip", "author": "Francesco D'Angelo;Christian Henning", "authorids": "~Francesco_D'Angelo1;~Christian_Henning1", "gender": "M;M", "homepage": ";https://www.ini.uzh.ch/en/institute/people?uname=christian", "dblp": "32/10646;", "google_scholar": "Hg_3f5kAAAAJ;u6QSFrsAAAAJ", "orcid": ";", "linkedin": ";christian-henning/", "or_profile": "~Francesco_D'Angelo1;~Christian_Henning1", "aff": "ETH;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch", "position": "MS student;PhD student", "bibtex": "@misc{\nd'angelo2022uncertaintybased,\ntitle={Uncertainty-based out-of-distribution detection requires suitable function space priors},\nauthor={Francesco D'Angelo and Christian Henning},\nyear={2022},\nurl={https://openreview.net/forum?id=u7UxOTefG2}\n}", "github": "", "project": "", "reviewers": "fGuy;g5SX;HkGA", "site": "https://openreview.net/forum?id=u7UxOTefG2", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;2;4", "correctness": "3;4;3", "technical_novelty": "2;2;3", "empirical_novelty": "1;2;2", "wc_summary_paper": "124;128;83", "wc_summary_review": "21;42;61", "wc_main_review": "371;55;479", "wc_review": "516;225;623", "wc_reply_reviewers": "98;0;478", "wc_reply_authors": "574;480;1542", "reply_reviewers": "2;0;2", "reply_authors": "2;1;4", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 111.66666666666667, 20.33606539022619 ], "wc_summary_review_avg": [ 41.333333333333336, 16.33673433979046 ], "wc_main_review_avg": [ 301.6666666666667, 179.90614837248398 ], "wc_review_avg": [ 454.6666666666667, 168.17120905659078 ], "wc_reply_reviewers_avg": [ 192.0, 206.1520474471856 ], "wc_reply_authors_avg": [ 865.3333333333334, 480.0120368861135 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.8660254037844385, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 13, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=3722287678268897735&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "uB12zutkXJR", "title": "GRAPHIX: A Pre-trained Graph Edit Model for Automated Program Repair", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present GRAPHIX, a pre-trained graph edit model for automatically detecting and fixing bugs and code quality issues in Java programs. Unlike sequence-to-sequence models, GRAPHIX leverages the abstract syntax structure of code and represents the code using a multi-head graph encoder. Along with an autoregressive tree decoder, the model learns to perform graph edit actions for automated program repair. We devise a novel pre-training strategy for GRAPHIX, namely deleted sub-tree reconstruction, to enrich the model with implicit knowledge of program structures from unlabeled source code. The pre-training objective is made consistent with the bug fixing task to facilitate the downstream learning. We evaluate GRAPHIX on the Patches in The Wild Java benchmark, using both abstract and concrete code. Experimental results show that GRAPHIX significantly outperforms a wide range of baselines including CodeBERT and BART and is as competitive as other state-of-the-art pre-trained Transformer models despite using one order of magnitude fewer parameters. Further analysis demonstrates strong inductive biases of GRAPHIX in learning meaningful structural and semantic code patterns, both in abstract and concrete source code.", "keywords": "Program Repair;Graph Neural Networks;Pre-training", "primary_area": "", "supplementary_material": "/attachment/5cb2e9743898298b753c79cfe8f1a000b32dd98c.zip", "author": "Thanh V Nguyen;Srinivasan H. Sengamedu", "authorids": "~Thanh_V_Nguyen1;~Srinivasan_H._Sengamedu1", "gender": "M;", "homepage": "http://thanhng.public.iastate.edu/;", "dblp": "210/2436;38/2372", "google_scholar": "HnkaCSQAAAAJ;X9fVMRUAAAAJ", "orcid": "0000-0003-1576-5420;0000-0003-1847-8398", "linkedin": ";srinivasan-h-sengamedu", "or_profile": "~Thanh_V_Nguyen1;~Srinivasan_H._Sengamedu1", "aff": "Amazon;Amazon", "aff_domain": "amazon.com;amazon.com", "position": "Researcher;Applied Science Manager", "bibtex": "@misc{\nnguyen2022graphix,\ntitle={{GRAPHIX}: A Pre-trained Graph Edit Model for Automated Program Repair},\nauthor={Thanh V Nguyen and Srinivasan H. 
Sengamedu},\nyear={2022},\nurl={https://openreview.net/forum?id=uB12zutkXJR}\n}", "github": "", "project": "", "reviewers": "AZoW;iXHo;z3GX;V4AB", "site": "https://openreview.net/forum?id=uB12zutkXJR", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;5;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "65;52;99;109", "wc_summary_review": "55;78;87;93", "wc_main_review": "880;244;757;509", "wc_review": "1000;374;943;711", "wc_reply_reviewers": "278;0;105;35", "wc_reply_authors": "2292;767;828;452", "reply_reviewers": "1;0;1;1", "reply_authors": "4;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.25, 23.47738273317535 ], "wc_summary_review_avg": [ 78.25, 14.446020213193666 ], "wc_main_review_avg": [ 597.5, 243.94722789980622 ], "wc_review_avg": [ 757.0, 246.1960600822036 ], "wc_reply_reviewers_avg": [ 104.5, 107.06656807799529 ], "wc_reply_authors_avg": [ 1084.75, 711.4616556779431 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17477641627828217248&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon.com, Inc.", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "uEBrNNEfceE", "title": "Safe Linear-Quadratic Dual Control with Almost Sure Performance Guarantee", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper considers the linear-quadratic dual control problem where the system parameters need to be identified and the control objective needs to be optimized at the same time. Contrary to existing works on data-driven linear-quadratic regulation, which typically provide error or regret bounds that hold with a certain probability, we propose an online algorithm that guarantees the asymptotic optimality of the controller in the almost sure sense. Our dual control strategy consists of two parts: a switched controller with time-decaying exploration noise and Markov parameter inference based on the cross-correlation between the exploration noise and system output. Central to the almost sure performance guarantee is a safe switched control strategy that falls back to a known conservative but stable controller when the actual state deviates significantly from the target state. We prove that this switching strategy rules out any potential destabilizing controllers from being applied, while the performance gap between our switching strategy and the optimal linear state feedback is exponentially small. Under our dual control scheme, the parameter inference error scales as $O(T^{-1/4+\epsilon})$, while the suboptimality gap of control performance scales as $O(T^{-1/2+\epsilon})$, where $T$ is the number of time steps, and $\epsilon$ is an arbitrarily small positive number.
Simulation results on an industrial process example are provided to illustrate the effectiveness of our proposed strategy.", "keywords": "reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Yiwen Lu;Yilin Mo", "authorids": "~Yiwen_Lu1;ylmo@tsinghua.edu.cn", "gender": ";", "homepage": "https://yiwen.lu;", "dblp": ";", "google_scholar": "ZkbKoXsAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Yiwen_Lu1;ylmo@tsinghua.edu.cn", "aff": "Tsinghua University;", "aff_domain": "tsinghua.edu.cn;", "position": "PhD student;", "bibtex": "@misc{\nlu2022safe,\ntitle={Safe Linear-Quadratic Dual Control with Almost Sure Performance Guarantee},\nauthor={Yiwen Lu and Yilin Mo},\nyear={2022},\nurl={https://openreview.net/forum?id=uEBrNNEfceE}\n}", "github": "", "project": "", "reviewers": "qNrU;ZdU6;1scp;oE9i", "site": "https://openreview.net/forum?id=uEBrNNEfceE", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;5;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;0;3", "wc_summary_paper": "164;146;30;77", "wc_summary_review": "181;71;77;70", "wc_main_review": "725;467;345;402", "wc_review": "1070;684;452;549", "wc_reply_reviewers": "0;287;0;67", "wc_reply_authors": "572;717;1507;403", "reply_reviewers": "0;1;0;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 104.25, 53.77906191074738 ], "wc_summary_review_avg": [ 99.75, 46.98603515939603 ], "wc_main_review_avg": [ 484.75, 145.26936187648104 ], "wc_review_avg": [ 688.75, 235.02912053615825 ], "wc_reply_reviewers_avg": [ 88.5, 117.82296041094877 ], "wc_reply_authors_avg": [ 799.75, 423.1816247192215 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.28867513459481287, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10929676586979507949&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "uF_Wl0xSA7O", "title": "Independent Component Alignment for Multi-task Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a novel gradient-based multi-task learning (MTL) approach that balances training in multi-task systems by aligning the independent components of the training objective. In contrast to state-of-the-art MTL approaches, our method is stable and preserves the ratio of highly correlated task gradients. The method is scalable, reduces overfitting, and can seamlessly handle multi-task objectives with a large difference in gradient magnitudes. We demonstrate the effectiveness of the proposed approach on a variety of MTL problems including digit classification, multi-label image classification, camera relocalization, and scene understanding.
Our approach performs favourably compared to other gradient-based adaptive balancing methods, and its performance is backed up by theoretical analysis.", "keywords": "multi-task learning;optimization", "primary_area": "", "supplementary_material": "/attachment/3de8d80aa464bb8697f0d460808877ee09ea9c36.zip", "author": "Dmitry Senushkin;Iaroslav Melekhov;Mikhail Romanov;Anton Konushin;Juho Kannala;Arno Solin", "authorids": "~Dmitry_Senushkin1;~Iaroslav_Melekhov1;~Mikhail_Romanov1;~Anton_Konushin1;~Juho_Kannala1;~Arno_Solin1", "gender": "M;;M;M;;M", "homepage": ";https://imelekhov.com;;;http://arno.solin.fi;https://users.aalto.fi/~kannalj1/", "dblp": ";195/5718;230/4397;69/4994-1;98/11225;47/4656.html", "google_scholar": "7u6J7eUAAAAJ;BXNprrsAAAAJ;jjQDoFwAAAAJ;ZT_k-wMAAAAJ;U_fJCnAAAAAJ;c4mWQPQAAAAJ", "orcid": ";;0009-0005-0397-6070;0000-0002-6152-0021;0000-0002-0958-7886;0000-0001-5088-4041", "linkedin": "senushkin/;imelekhov/;mikhail-romanov-5b517473;;asolin/;", "or_profile": "~Dmitry_Senushkin1;~Iaroslav_Melekhov1;~Mikhail_Romanov1;~Anton_Konushin1;~Arno_Solin1;~Juho_Kannala5", "aff": "Samsung;Aalto University;Moscow Institute of Physics and Technology;Samsung;Aalto University;Aalto University", "aff_domain": "samsung.com;aalto.fi;phystech.edu;samsung.com;aalto.fi;aalto.fi", "position": "Researcher;Postdoc;Lecturer;Principal Researcher;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nsenushkin2022independent,\ntitle={Independent Component Alignment for Multi-task Learning},\nauthor={Dmitry Senushkin and Iaroslav Melekhov and Mikhail Romanov and Anton Konushin and Juho Kannala and Arno Solin},\nyear={2022},\nurl={https://openreview.net/forum?id=uF_Wl0xSA7O}\n}", "github": "", "project": "", "reviewers": "nRWJ;RbSw;PcEb", "site": "https://openreview.net/forum?id=uF_Wl0xSA7O", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "5;3;3", "correctness": "2;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;3;2", "wc_summary_paper": "90;53;133", "wc_summary_review": "41;34;23", "wc_main_review": "358;63;552", "wc_review": "489;150;708", "wc_reply_reviewers": "91;0;122", "wc_reply_authors": "778;107;1171", "reply_reviewers": "1;0;1", "reply_authors": "2;1;3", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 92.0, 32.69046751985457 ], "wc_summary_review_avg": [ 32.666666666666664, 7.408703590297623 ], "wc_main_review_avg": [ 324.3333333333333, 201.04781078694248 ], "wc_review_avg": [ 449.0, 229.55173708774237 ], "wc_reply_reviewers_avg": [ 71.0, 51.77515491687752 ], "wc_reply_authors_avg": [ 685.3333333333334, 439.2905897264614 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.7559289460184546, "corr_recommendation_correctness": 0.3273268353539886, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=971519433605712335&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;0;1;1", "aff_unique_norm": "Samsung;Aalto University;Moscow Institute of Physics and Technology", "aff_unique_dep": "Samsung;;", "aff_unique_url": "https://www.samsung.com;https://www.aalto.fi;https://www.mipt.ru/en", "aff_unique_abbr": 
"Samsung;Aalto;MIPT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;1;1", "aff_country_unique": "South Korea;Finland;Russian Federation" }, { "id": "uHq5rHHektz", "title": "Contextual Fusion For Adversarial Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Mammalian brains handle complex reasoning tasks in a gestalt manner by integrating information from regions of the brain that are specialized to individual sensory modalities. This allows for improved robustness and better generalization ability. In contrast, deep neural networks are usually designed to process one particular information stream and susceptible to various types of adversarial perturbations. While many methods exist for detecting and defending against adversarial attacks, they do not generalize across a range of attacks and negatively affect performance on clean, unperturbed data. We developed a fusion model using a combination of background and foreground features extracted in parallel from Places-CNN and Imagenet-CNN. We tested the benefits of the fusion approach on preserving adversarial robustness for human perceivable (e.g., Gaussian blur) and network perceivable (e.g., gradient-based) attacks for CIFAR-10 and MS COCO data sets. For gradient based attacks, our results show that fusion allows for significant improvements in classification without decreasing performance on unperturbed data and without need to perform adversarial retraining. Our fused model revealed improvements for Gaussian blur type perturbations as well. The increase in performance from fusion approach depended on the variability of the image contexts; larger increases were seen for classes of images with larger differences in their contexts. We also demonstrate the effect of regularization to bias the classifier decision in the presence of a known adversary. 
We propose that this biologically inspired approach to integrating information across multiple modalities provides a new way to improve adversarial robustness that can be complementary to current state-of-the-art approaches.", "keywords": "Image processing;Neuroscience;Multi-modal representations", "primary_area": "", "supplementary_material": "", "author": "Aiswarya Akumalla;Seth D Haney;Maxim Bazhenov", "authorids": "aakumall@health.ucsd.edu;~Seth_D_Haney1;mbazhenov@ucsd.edu", "gender": ";M;", "homepage": ";;", "dblp": ";;", "google_scholar": ";7xrnFTgAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "aakumall@health.ucsd.edu;~Seth_D_Haney1;mbazhenov@ucsd.edu", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nakumalla2022contextual,\ntitle={Contextual Fusion For Adversarial Robustness},\nauthor={Aiswarya Akumalla and Seth D Haney and Maxim Bazhenov},\nyear={2022},\nurl={https://openreview.net/forum?id=uHq5rHHektz}\n}", "github": "", "project": "", "reviewers": "yT4G;FAzC;GiW7;mH95", "site": "https://openreview.net/forum?id=uHq5rHHektz", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;4;4", "correctness": "2;3;2;2", "technical_novelty": "1;1;1;1", "empirical_novelty": "1;2;1;1", "wc_summary_paper": "19;78;73;81", "wc_summary_review": "26;14;11;63", "wc_main_review": "266;160;105;346", "wc_review": "311;252;189;490", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.75, 25.420218331084413 ], "wc_summary_review_avg": [ 28.5, 20.694202086574876 ], "wc_main_review_avg": [ 219.25, 93.29355551162149 ], "wc_review_avg": [ 310.5, 112.25528940767111 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11728533007683017598&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "uHv20yi8saL", "title": "Monotonic Improvement Guarantees under Non-stationarity for Decentralized PPO", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a new monotonic improvement guarantee for optimizing decentralized policies in cooperative Multi-Agent Reinforcement Learning (MARL), which holds even when the transition dynamics are non-stationary. This new analysis provides a theoretical understanding of the strong performance of two recent actor-critic methods for MARL, i.e., Independent Proximal Policy Optimization (IPPO) and Multi-Agent PPO (MAPPO), which both rely on independent ratios, i.e., computing probability ratios separately for each agent's policy. We show that, despite the non-stationarity that independent ratios cause, a monotonic improvement guarantee still arises as a result of enforcing the trust region constraint over joint policies. We also show this trust region constraint can be effectively enforced in a principled way by bounding independent ratios based on the number of agents in training, providing a theoretical foundation for proximal ratio clipping.
Moreover, we show that the surrogate objectives optimized in IPPO and MAPPO are essentially equivalent when their critics converge to a fixed point. Finally, our empirical results support the hypothesis that the strong performance of IPPO and MAPPO is a direct result of enforcing such a trust region constraint via clipping in centralized training, and that good values of the hyperparameters for this enforcement are highly sensitive to the number of agents, as predicted by our theoretical analysis. ", "keywords": "Multi-Agent Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/79b2097e003aa89fb53d24573cb92be216c0f62c.zip", "author": "Mingfei Sun;Sam Devlin;Jacob Austin Beck;Katja Hofmann;Shimon Whiteson", "authorids": "~Mingfei_Sun1;~Sam_Devlin2;~Jacob_Austin_Beck1;~Katja_Hofmann1;~Shimon_Whiteson1", "gender": "M;M;M;F;", "homepage": "https://research.manchester.ac.uk/en/persons/mingfei-sun;;http://jakebeck.com;https://www.microsoft.com/en-us/research/people/kahofman/;", "dblp": "195/7934.html;64/7502;;97/3500;https://dblp.uni-trier.de/pers/w/Whiteson:Shimon.html", "google_scholar": "2Uzgp5kAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=PrS_dHMAAAAJ;https://scholar.google.co.uk/citations?hl=en;", "orcid": ";0000-0002-7769-3090;;;", "linkedin": ";https://www.linkedin.com/pub/sam-devlin/83/810/b23;;;", "or_profile": "~Mingfei_Sun1;~Sam_Devlin2;~Jacob_Austin_Beck1;~Katja_Hofmann1;~Shimon_Whiteson1", "aff": "Department of Computer Science, University of Oxford;Microsoft Research;Department of Computer Science, University of Oxford;Microsoft;University of Oxford", "aff_domain": "cs.ox.ac.uk;microsoft.com;cs.ox.ac.uk;microsoft.com;ox.ac.uk", "position": "Postdoc;Principal Researcher;PhD student;Senior Principal Research Manager;Professor", "bibtex": "@misc{\nsun2022monotonic,\ntitle={Monotonic Improvement Guarantees under Non-stationarity for Decentralized {PPO}},\nauthor={Mingfei Sun and Sam Devlin and Jacob Austin Beck and Katja Hofmann and Shimon Whiteson},\nyear={2022},\nurl={https://openreview.net/forum?id=uHv20yi8saL}\n}", "github": "", "project": "", "reviewers": "YJEa;7r77;JmGx;ax1X", "site": "https://openreview.net/forum?id=uHv20yi8saL", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;2;3;3", "correctness": "1;4;3;3", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "40;166;55;15", "wc_summary_review": "122;28;21;16", "wc_main_review": "156;391;127;362", "wc_review": "318;585;203;393", "wc_reply_reviewers": "140;0;0;0", "wc_reply_authors": "517;614;400;560", "reply_reviewers": "3;0;0;0", "reply_authors": "4;1;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 69.0, 57.79705874869413 ], "wc_summary_review_avg": [ 46.75, 43.654180784891615 ], "wc_main_review_avg": [ 259.0, 118.39130035606502 ], "wc_review_avg": [ 374.75, 138.97549244381185 ], "wc_reply_reviewers_avg": [ 35.0, 60.6217782649107 ], "wc_reply_authors_avg": [ 522.75, 78.76349090790733 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5940885257860046, "corr_recommendation_correctness": 0.7388664511337208, "gs_citation": 13, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=5189847681520722136&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;1;0", "aff_unique_norm": "University of Oxford;Microsoft", "aff_unique_dep": "Department of Computer Science;Microsoft Research", "aff_unique_url": "https://www.ox.ac.uk;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Oxford;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Language model compression with weighted low-rank factorization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6157", "id": "uPv9Y3gmAI5", "poster": "", "openreview": "https://openreview.net/forum?id=uPv9Y3gmAI5", "slides": "https://iclr.cc/virtual/2022/poster/6157", "video": "https://iclr.cc/virtual/2022/poster/6157", "author_site": "Yen-Chang Hsu, Ting Hua, Sung-En Chang, Qian Lou, Yilin Shen, Hongxia Jin", "tldr": "", "abstract": "Factorizing a large matrix into small matrices is a popular strategy for model compression. Singular value decomposition (SVD) plays a vital role in this compression strategy, approximating a learned matrix with fewer parameters. However, SVD minimizes the squared error toward reconstructing the original matrix without gauging the importance of the parameters, potentially giving a larger reconstruction error for those who affect the task accuracy more. In other words, the optimization objective of SVD is not aligned with the trained model's task accuracy. We analyze this previously unexplored problem, make observations, and address it by introducing Fisher information to weigh the importance of parameters affecting the model prediction. This idea leads to our method: Fisher-Weighted SVD (FWSVD). Although the factorized matrices from our approach do not result in smaller reconstruction errors, we find that our resulting task accuracy is much closer to the original model's performance. We perform analysis with the transformer-based language models, showing our weighted SVD largely alleviates the mismatched optimization objectives and can maintain model performance with a higher compression rate. Our method can directly compress a task-specific model while achieving better performance than other compact model strategies requiring expensive model pre-training. 
Moreover, the evaluation of compressing an already compact model shows our method can further reduce 9% to 30% parameters with an insignificant impact on task accuracy.", "keywords": "model compression;low-rank approximation;transformer;language model", "primary_area": "", "supplementary_material": "", "author": "Yen-Chang Hsu;Ting Hua;Sungen Chang;Qian Lou;Yilin Shen;Hongxia Jin", "authorids": "~Yen-Chang_Hsu1;~Ting_Hua1;chang.sun@northeastern.edu;~Qian_Lou1;~Yilin_Shen1;~Hongxia_Jin1", "gender": "M;;;M;M;", "homepage": ";;;https://qlou.org;;", "dblp": "172/1140;;;207/3962.html;30/383;", "google_scholar": "7QWAiigAAAAJ;;;SBYgXLoAAAAJ;9PSFMzAAAAAJ;", "orcid": ";;;;;", "linkedin": "yenchanghsu/;;;;;", "or_profile": "~Yen-Chang_Hsu1;~Ting_Hua1;chang.sun@northeastern.edu;~Qian_Lou1;~Yilin_Shen1;~Hongxia_Jin1", "aff": "Samsung Research America;;;Samsung;Samsung Research America;", "aff_domain": "samsung.com;;;samsung.com;gmail.com;", "position": "Research Scientist;;;Research Scientist;Principal Researcher;", "bibtex": "@inproceedings{\nhsu2022language,\ntitle={Language model compression with weighted low-rank factorization},\nauthor={Yen-Chang Hsu and Ting Hua and Sungen Chang and Qian Lou and Yilin Shen and Hongxia Jin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uPv9Y3gmAI5}\n}", "github": "", "project": "", "reviewers": "d1T4;Kuwu;jnTC", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;4;4", "correctness": "4;3;4", "technical_novelty": "3;2;2", "empirical_novelty": "3;0;3", "wc_summary_paper": "163;104;69", "wc_summary_review": "126;30;56", "wc_main_review": "210;500;260", "wc_review": "499;634;385", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "153;943;515", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 112.0, 38.79003308411411 ], "wc_summary_review_avg": [ 70.66666666666667, 40.54078878802873 ], "wc_main_review_avg": [ 323.3333333333333, 126.57891697365017 ], "wc_review_avg": [ 506.0, 101.7742600071354 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 537.0, 322.8911065152874 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5671830850356411400&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=uPv9Y3gmAI5", "email": "samsung.com;;;samsung.com;gmail.com;", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung Research America", "aff_unique_url": "https://www.samsung.com/us/careers/research/", "aff_unique_abbr": "SRA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;South Korea" }, { "id": "uR77O7SL55h", "title": "Scalable Sinkhorn Backpropagation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Optimal transport has recently gained increasing attention in the context of deep learning. 
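As context for the Sinkhorn-based relaxation this abstract goes on to discuss: entropy-regularized optimal transport is solved by alternating row and column scalings, as in this minimal sketch (standard algorithm; variable names are ours). Unrolling these iterations for backpropagation is exactly the cost that the closed-form backward pass described below avoids:

```python
import numpy as np

def sinkhorn(C, a, b, eps=0.1, n_iters=200):
    """Entropy-regularized OT plan for cost C (m, n) and marginals a, b."""
    K = np.exp(-C / eps)                    # Gibbs kernel
    u, v = np.ones_like(a), np.ones_like(b)
    for _ in range(n_iters):                # alternating Sinkhorn projections
        u = a / (K @ v)                     # match row marginals
        v = b / (K.T @ u)                   # match column marginals
    return u[:, None] * K * v[None, :]      # transport plan
```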
A major contributing factor is the line of work on smooth relaxations that make the classical optimal transport problem differentiable. The most prominent example is entropy regularized optimal transport which can be optimized efficiently via an alternating scheme of Sinkhorn projections. We thus experienced a surge of deep learning techniques that use the Sinkhorn operator to learn matchings, permutations, sorting and ranking, or to construct a geometrically motivated loss function for generative models. The prevalent approach to training such a neural network is first-order optimization by algorithmic unrolling of the forward pass. Hence, the runtime and memory complexity of the backward pass increase linearly with the number of Sinkhorn iterations. This often makes it impractical when computational resources like GPU memory are scarce. A more efficient alternative is computing the derivative of a Sinkhorn layer via implicit differentiation. Our main contribution is deriving a simple and efficient algorithm that performs this backward pass in closed form. It is based on the Sinkhorn operator in its most general form -- with learnable cost matrices and target capacities. We further provide a theoretical analysis with error bounds for approximate inputs. Finally, we demonstrate that, for a number of applications, replacing automatic differentiation with our module often improves the stability and accuracy of the obtained gradients while drastically reducing the computation cost.", "keywords": "Optimal Transport;Implicit Differentiation", "primary_area": "", "supplementary_material": "", "author": "Marvin Eisenberger;Aysim Toker;Laura Leal-Taix\u00e9;Florian Bernard;Daniel Cremers", "authorids": "~Marvin_Eisenberger1;~Aysim_Toker1;~Laura_Leal-Taix\u00e91;~Florian_Bernard3;~Daniel_Cremers1", "gender": "M;F;F;;M", "homepage": "https://vision.in.tum.de/members/eisenber;https://dvl.in.tum.de/team/toker/;https://dvl.in.tum.de/team/lealtaixe/;https://florianbernard.net;https://vision.in.tum.de/members/cremers", "dblp": "218/6006;277/5926;47/8483;134/8112;c/DanielCremers", "google_scholar": "https://scholar.google.de/citations?user=6ZX5D5QAAAAJ;qq4LxBcAAAAJ;tT2TC-UAAAAJ;https://scholar.google.de/citations?user=9GrQ2KYAAAAJ;cXQciMEAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Marvin_Eisenberger1;~Aysim_Toker1;~Laura_Leal-Taix\u00e91;~Florian_Bernard3;~Daniel_Cremers1", "aff": "Technical University Munich;Technical University of Munich;Technical University Munich;University of Bonn;Technical University Munich", "aff_domain": "tum.de;tum.de;tum.de;uni-bonn.de;tum.de", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@misc{\neisenberger2022scalable,\ntitle={Scalable Sinkhorn Backpropagation},\nauthor={Marvin Eisenberger and Aysim Toker and Laura Leal-Taix{\\'e} and Florian Bernard and Daniel Cremers},\nyear={2022},\nurl={https://openreview.net/forum?id=uR77O7SL55h}\n}", "github": "", "project": "", "reviewers": "FxeZ;Ua6x;koT4;qN8G", "site": "https://openreview.net/forum?id=uR77O7SL55h", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;3", "correctness": "4;3;4;3", "technical_novelty": "1;2;2;3", "empirical_novelty": "0;1;2;2", "wc_summary_paper": "124;16;73;35", "wc_summary_review": "21;37;32;7", "wc_main_review": "499;570;324;209", "wc_review": "644;623;429;251", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 
1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 62.0, 41.26136207155552 ], "wc_summary_review_avg": [ 24.25, 11.519006033508273 ], "wc_main_review_avg": [ 400.5, 142.26471804351212 ], "wc_review_avg": [ 486.75, 159.84738815507745 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=366843224261974762&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Technical University of Munich;University of Bonn", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.uni-bonn.de/", "aff_unique_abbr": "TUM;UBonn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "id": "uS4AQe9Tv_R", "title": "Improving Gender Fairness of Pre-Trained Language Models without Catastrophic Forgetting", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Although pre-trained language models, such as BERT, achieve state-of-art performance in many language understanding tasks, they have been demonstrated to inherit strong gender bias from its training data. \nExisting studies addressing the gender bias issue of pre-trained models, usually recollect and build gender-neutral data on their own and conduct a second phase pre-training on the released pre-trained model with such data. However, given the limited size of the gender-neutral data and its potential distributional mismatch with the original pre-training data, catastrophic forgetting would occur during the second-phase pre-training. Forgetting on the original training data may damage the model\u2019s downstream performance to a large margin. In this work, we first empirically show that even if the gender-neutral data for second-phase pre-training comes from the original training data, catastrophic forgetting still occurs if the size of gender-neutral data is smaller than that of original training data. Then, we propose a new method, GEnder Equality Prompt (GEEP), to improve gender fairness of pre-trained models without forgetting. GEEP learns gender-related prompts to reduce gender bias, conditioned on frozen language models. Since all pre-trained parameters are frozen, forgetting on information from the original training data can be alleviated to the most extent. Then GEEP trains new embeddings of profession names as gender equality prompts conditioned on the frozen model. This makes GEEP more effective at debiasing as well. Because gender bias from previous data embedded in profession embeddings is already removed when they are re-intialized in GEEP before second-phase pre-training starts. 
Empirical results show that GEEP not only achieves state-of-the-art performance on gender debiasing in applications such as pronoun prediction and coreference resolution, but also achieves results comparable to the original pre-trained model on general downstream tasks such as GLUE, without much forgetting.", "keywords": "gender fairness;pre-trained language model;BERT", "primary_area": "", "supplementary_material": "", "author": "Zahra Fatemi;Chen Xing;Wenhao Liu;Caiming Xiong", "authorids": "~Zahra_Fatemi1;~Chen_Xing2;~Wenhao_Liu1;~Caiming_Xiong1", "gender": "F;F;;M", "homepage": "https://zarafatemi.github.io;;;http://cmxiong.com/", "dblp": "183/6196.html;;;80/7282", "google_scholar": ";tAUdLM0AAAAJ;;vaSdahkAAAAJ", "orcid": ";;;", "linkedin": "http://www.linkedin.com/in/zahra-fatemi;chen-xing-83082074/;;caiming-xiong-150a1417", "or_profile": "~Zahra_Fatemi1;~Chen_Xing2;~Wenhao_Liu1;~Caiming_Xiong1", "aff": "University of Illinois, Chicago;SalesForce.com;;Salesforce Research", "aff_domain": "uic.edu;salesforce.com;;salesforce.com", "position": "PhD student;Researcher;;Research Scientist", "bibtex": "@misc{\nfatemi2022improving,\ntitle={Improving Gender Fairness of Pre-Trained Language Models without Catastrophic Forgetting},\nauthor={Zahra Fatemi and Chen Xing and Wenhao Liu and Caiming Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=uS4AQe9Tv_R}\n}", "github": "", "project": "", "reviewers": "59Pt;oYoL;e2HC;Zt4q;hQDd", "site": "https://openreview.net/forum?id=uS4AQe9Tv_R", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "4;4;4;4;5", "correctness": "2;3;3;3;3", "technical_novelty": "1;2;2;3;3", "empirical_novelty": "1;2;2;2;3", "wc_summary_paper": "134;131;110;33;88", "wc_summary_review": "64;81;44;34;79", "wc_main_review": "211;410;283;252;319", "wc_review": "409;622;437;319;486", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "249;0;215;164;210", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;0;1;1;1", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 99.2, 37.0048645450838 ], "wc_summary_review_avg": [ 60.4, 18.7040102651811 ], "wc_main_review_avg": [ 295.0, 67.60177512462228 ], "wc_review_avg": [ 454.6, 99.78496880793219 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 167.6, 88.06043379407123 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.8, 0.4000000000000001 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5160468465421401, "corr_recommendation_correctness": 0.5897678246195884, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=608950169636288227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Illinois at Chicago;Salesforce", "aff_unique_dep": ";", "aff_unique_url": "https://www.uic.edu;https://www.salesforce.com", "aff_unique_abbr": "UIC;Salesforce", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "RISP: Rendering-Invariant State Predictor with Differentiable Simulation and Rendering for Cross-Domain Parameter Estimation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6921", "id": "uSE03demja", "poster":
"", "openreview": "https://openreview.net/forum?id=uSE03demja", "slides": "https://iclr.cc/virtual/2022/poster/6921", "video": "https://iclr.cc/virtual/2022/poster/6921", "author_site": "Pingchuan Ma, Tao Du, Joshua B Tenenbaum, Wojciech Matusik, Chuang Gan", "tldr": "", "abstract": "This work considers identifying parameters characterizing a physical system's dynamic motion directly from a video whose rendering configurations are inaccessible. Existing solutions require massive training data or lack generalizability to unknown rendering configurations. We propose a novel approach that marries domain randomization and differentiable rendering gradients to address this problem. Our core idea is to train a rendering-invariant state-prediction (RISP) network that transforms image differences into state differences independent of rendering configurations, e.g., lighting, shadows, or material reflectance. To train this predictor, we formulate a new loss on rendering variances using gradients from differentiable rendering. Moreover, we present an efficient, second-order method to compute the gradients of this loss, allowing it to be integrated seamlessly into modern deep learning frameworks. We evaluate our method in rigid-body and deformable-body simulation environments using four tasks: state estimation, system identification, imitation learning, and visuomotor control. We further demonstrate the efficacy of our approach on a real-world example: inferring the state and action sequences of a quadrotor from a video of its motion sequences. Compared with existing methods, our approach achieves significantly lower reconstruction errors and has better generalizability among unknown rendering configurations.", "keywords": "differentiable rendering;differentiable simulation;system identification", "primary_area": "", "supplementary_material": "", "author": "Pingchuan Ma;Tao Du;Joshua B. Tenenbaum;Wojciech Matusik;Chuang Gan", "authorids": "~Pingchuan_Ma3;~Tao_Du1;~Joshua_B._Tenenbaum1;~Wojciech_Matusik2;~Chuang_Gan1", "gender": "M;;;M;M", "homepage": "https://people.csail.mit.edu/pcma;https://people.iiis.tsinghua.edu.cn/~taodu/;;https://cdfg.mit.edu/wojciech;http://people.csail.mit.edu/ganchuang/", "dblp": "215/4446-2;51/3026-1;t/JoshuaBTenenbaum;;139/6993", "google_scholar": "EtCZmkwAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;PTeSCbIAAAAJ", "orcid": ";0000-0001-7337-7667;;0000-0003-0212-5643;", "linkedin": ";;;wojciech-matusik-67238126/;", "or_profile": "~Pingchuan_Ma3;~Tao_Du1;~Joshua_B._Tenenbaum1;~Wojciech_Matusik2;~Chuang_Gan1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;MIT-IBM Watson AI Lab", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;ibm.com", "position": "PhD student;Postdoc;Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nma2022risp,\ntitle={{RISP}: Rendering-Invariant State Predictor with Differentiable Simulation and Rendering for Cross-Domain Parameter Estimation},\nauthor={Pingchuan Ma and Tao Du and Joshua B. 
Tenenbaum and Wojciech Matusik and Chuang Gan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uSE03demja}\n}", "github": "", "project": "", "reviewers": "qnnY;UwVk;paRJ", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;3;4", "correctness": "4;4;3", "technical_novelty": "3;3;3", "empirical_novelty": "4;3;3", "wc_summary_paper": "174;88;63", "wc_summary_review": "30;60;44", "wc_main_review": "86;211;482", "wc_review": "290;359;589", "wc_reply_reviewers": "0;0;91", "wc_reply_authors": "86;182;611", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 108.33333333333333, 47.541794478355804 ], "wc_summary_review_avg": [ 44.666666666666664, 12.256517540566822 ], "wc_main_review_avg": [ 259.6666666666667, 165.2883003186318 ], "wc_review_avg": [ 412.6666666666667, 127.82887867075351 ], "wc_reply_reviewers_avg": [ 30.333333333333332, 42.897811391983886 ], "wc_reply_authors_avg": [ 293.0, 228.24986308867744 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17678341502190401673&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=uSE03demja", "email": "mit.edu;mit.edu;mit.edu;mit.edu;ibm.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "uUN0Huq-n_V", "title": "Polyphonic Music Composition: An Adversarial Inverse Reinforcement Learning Approach", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most recent approaches to automatic music harmony composition adopt deep supervised learning to train a model using a set of human-composed songs as training data. However, these approaches suffer from inherent limitations of the chosen deep learning models, which may lead to unpleasing harmonies. This paper explores an alternative approach to harmony composition using a combination of Deep Supervised Learning, Deep Reinforcement Learning, and Inverse Reinforcement Learning techniques. In this novel approach, our model selects the next chord in the composition (action) based on the previous notes (states), allowing us to model harmony composition as a reinforcement learning problem in which we look to maximize an overall accumulated reward. However, designing an appropriate reward function is known to be a difficult process. To overcome this problem, we propose learning a reward function from a set of human-composed tracks using Adversarial Inverse Reinforcement Learning. We start by training a Bi-axial LSTM model using supervised learning and improve upon it by tuning it using Deep Q-learning.
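A schematic of the fine-tuning signal in this pipeline, as a hedged sketch: `alpha` is a hypothetical mixing weight, and the blended reward anticipates the learned-plus-rule-based combination the abstract describes next:

```python
def combined_reward(r_airl, r_theory, alpha=0.5):
    # Blend the AIRL-learned reward with a music-theory rule reward.
    return alpha * r_airl + (1.0 - alpha) * r_theory

def q_target(reward, q_next_max, gamma=0.99):
    # Standard one-step Deep Q-learning target used when tuning the
    # supervised pre-trained Bi-axial LSTM.
    return reward + gamma * q_next_max
```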
Instead of using GANs to directly generate music similar to human compositions, we adopt GANs to learn a reward function over music trajectories from human compositions. We then combine the learned reward function with a reward based on music theory rules to improve the generation of the model trained by supervised learning. The results show improvement over a pre-trained model that receives no reinforcement learning, both on a set of objective metrics and in subjective user preference.", "keywords": "music;reinforcement learning;airl;deep learning", "primary_area": "", "supplementary_material": "", "author": "Kelvin Xavier Munguia Velez;Von-Wun Soo", "authorids": "~Kelvin_Xavier_Munguia_Velez1;vwsoo2011@gmail.com", "gender": "M;", "homepage": "https://www.linkedin.com/in/xavier-munguia/;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "xavier-munguia/;", "or_profile": "~Kelvin_Xavier_Munguia_Velez1;vwsoo2011@gmail.com", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nvelez2022polyphonic,\ntitle={Polyphonic Music Composition: An Adversarial Inverse Reinforcement Learning Approach},\nauthor={Kelvin Xavier Munguia Velez and Von-Wun Soo},\nyear={2022},\nurl={https://openreview.net/forum?id=uUN0Huq-n_V}\n}", "github": "", "project": "", "reviewers": "e8mJ;N31m;n4MP", "site": "https://openreview.net/forum?id=uUN0Huq-n_V", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;4;4", "correctness": "2;3;4", "technical_novelty": "1;3;3", "empirical_novelty": "2;4;3", "wc_summary_paper": "49;63;121", "wc_summary_review": "54;82;82", "wc_main_review": "667;471;901", "wc_review": "770;616;1104", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 77.66666666666667, 31.169785940162562 ], "wc_summary_review_avg": [ 72.66666666666667, 13.199326582148888 ], "wc_main_review_avg": [ 679.6666666666666, 175.77510884334245 ], "wc_review_avg": [ 830.0, 203.69257882079717 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.9819805060619659, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mCV9OG4kGyAJ:scholar.google.com/&scioq=Polyphonic+Music+Composition:+An+Adversarial+Inverse+Reinforcement+Learning+Approach&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "uVTp9Z-IUOC", "title": "Test-Time Adaptation to Distribution Shifts by Confidence Maximization and Input Transformation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks often exhibit poor performance on data that is unlikely under the train-time data distribution, for instance data affected by corruptions. Previous works demonstrate that test-time adaptation to data shift, for instance using entropy minimization, effectively improves performance on such shifted distributions. This paper focuses on the fully test-time adaptation setting, where only unlabeled data from the target distribution is required.
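The entropy-minimization baseline mentioned above, plus the batch-wise diversity term the abstract goes on to propose, in one minimal PyTorch sketch (the paper's actual confidence term is a non-saturating surrogate rather than plain entropy):

```python
import torch

def tta_loss(logits):
    # Confidence: mean per-sample prediction entropy (to be minimized).
    log_p = logits.log_softmax(dim=1)
    p = log_p.exp()
    confidence = -(p * log_p).sum(dim=1).mean()
    # Diversity: maximize the entropy of the batch-average prediction,
    # preventing collapse to a single trivial class.
    p_bar = p.mean(dim=0)
    diversity = (p_bar * p_bar.clamp_min(1e-8).log()).sum()
    return confidence + diversity
```

A common fully test-time recipe is to update only a small subset of parameters (e.g., normalization affines) with such a loss on each unlabeled batch.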
This allows adapting arbitrary pretrained networks. Specifically, we propose a novel loss that improves test-time adaptation by addressing both premature convergence and instability of entropy minimization. This is achieved by replacing the entropy by a non-saturating surrogate and adding a diversity regularizer based on batch-wise entropy maximization that prevents convergence to trivial collapsed solutions. Moreover, we propose to prepend an input transformation module to the network that can partially undo test-time distribution shifts. Surprisingly, this preprocessing can be learned solely using the fully test-time adaptation loss in an end-to-end fashion without any target domain labels or source domain data. We show that our approach outperforms previous work in improving the robustness of publicly available pretrained image classifiers to common corruptions on such challenging benchmarks as ImageNet-C.", "keywords": "natural corruptions;corruption robustness;distribution shift;test time adaptation;domain shift", "primary_area": "", "supplementary_material": "", "author": "Chaithanya Kumar Mummadi;Robin Hutmacher;Kilian Rambach;Evgeny Levinkov;Thomas Brox;Jan Hendrik Metzen", "authorids": "~Chaithanya_Kumar_Mummadi1;~Robin_Hutmacher1;~Kilian_Rambach1;~Evgeny_Levinkov1;~Thomas_Brox1;~Jan_Hendrik_Metzen1", "gender": "M;;;M;M;M", "homepage": ";;https://www.bosch-ai.com;;https://lmb.informatik.uni-freiburg.de/people/brox/index.en.html;http://jmetzen.github.io/", "dblp": "208/6386;;;142/2860;97/4586;93/1712", "google_scholar": "XJLtaG4AAAAJ;;;http://scholar.google.de/citations?user=B4J3SkcAAAAJ;https://scholar.google.com/citations?hl=de;https://scholar.google.de/citations?user=w047VfEAAAAJ", "orcid": "0000-0002-1173-2720;;;;0000-0002-6282-8861;", "linkedin": ";;;;;jan-hendrik-metzen-211543135/", "or_profile": "~Chaithanya_Kumar_Mummadi1;~Robin_Hutmacher1;~Kilian_Rambach1;~Evgeny_Levinkov1;~Thomas_Brox1;~Jan_Hendrik_Metzen1", "aff": "Bosch Center for Artificial Intelligence;Robert Bosch GmbH, Bosch;Robert Bosch GmbH, Bosch;;University of Freiburg;Bosch Center Artificial Intelligence", "aff_domain": "bosch.com;de.bosch.com;de.bosch.com;;uni-freiburg.de;bosch.com", "position": "Researcher;Research Engineer;Researcher;;Full Professor;Senior Expert", "bibtex": "@misc{\nmummadi2022testtime,\ntitle={Test-Time Adaptation to Distribution Shifts by Confidence Maximization and Input Transformation},\nauthor={Chaithanya Kumar Mummadi and Robin Hutmacher and Kilian Rambach and Evgeny Levinkov and Thomas Brox and Jan Hendrik Metzen},\nyear={2022},\nurl={https://openreview.net/forum?id=uVTp9Z-IUOC}\n}", "github": "", "project": "", "reviewers": "nGh4;9MXJ;e5PY;8M4e", "site": "https://openreview.net/forum?id=uVTp9Z-IUOC", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;5;4;5", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "64;131;102;48", "wc_summary_review": "48;307;38;26", "wc_main_review": "182;818;389;297", "wc_review": "294;1256;529;371", "wc_reply_reviewers": "0;466;0;0", "wc_reply_authors": "653;786;338;710", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 86.25, 32.437439787998066 ], "wc_summary_review_avg": [ 104.75, 117.02857557024267 ], 
"wc_main_review_avg": [ 421.5, 240.37938763546262 ], "wc_review_avg": [ 612.5, 381.0606907042499 ], "wc_reply_reviewers_avg": [ 116.5, 201.7839190817742 ], "wc_reply_authors_avg": [ 621.75, 170.482220480612 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9386160447509912714&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Bosch Center for Artificial Intelligence;Robert Bosch GmbH;University of Freiburg", "aff_unique_dep": "Center for Artificial Intelligence;;", "aff_unique_url": "https://www.bosch-ai.com;https://www.bosch.com;https://www.uni-freiburg.de", "aff_unique_abbr": "BCAI;Bosch;UoF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Stiffness-aware neural network for learning Hamiltonian systems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6856", "id": "uVXEKeqJbNa", "poster": "", "openreview": "https://openreview.net/forum?id=uVXEKeqJbNa", "slides": "https://iclr.cc/virtual/2022/poster/6856", "video": "https://iclr.cc/virtual/2022/poster/6856", "author_site": "SENWEI Liang, Zhongzhan Huang, Hong Zhang", "tldr": "", "abstract": "We propose stiffness-aware neural network (SANN), a new method for learning Hamiltonian dynamical systems from data. SANN identifies and splits the training data into stiff and nonstiff portions based on a stiffness-aware index, a simple, yet effective metric we introduce to quantify the stiffness of the dynamical system. This classification along with a resampling technique allows us to apply different time integration strategies such as step size adaptation to better capture the dynamical characteristics of the Hamiltonian vector fields. We evaluate SANN on complex physical systems including a three-body problem and billiard model. 
We show that SANN is more stable and can better preserve energy when compared with the state-of-the-art methods, leading to significant improvement in accuracy.", "keywords": "Hamiltonian systems;Neural network;Stiff dynamical systems;Data-driven method", "primary_area": "", "supplementary_material": "", "author": "SENWEI Liang;Zhongzhan Huang;Hong Zhang", "authorids": "~SENWEI_Liang1;~Zhongzhan_Huang1;~Hong_Zhang7", "gender": "M;M;M", "homepage": "https://leungsamwai.github.io/;https://dedekinds.github.io/;https://www.mcs.anl.gov/~hongzh/", "dblp": "230/4092;241/9753;", "google_scholar": "NLNoSBsAAAAJ;R-b68CEAAAAJ;lo_niigAAAAJ", "orcid": "0000-0002-3558-6828;;", "linkedin": ";;", "or_profile": "~SENWEI_Liang1;~Zhongzhan_Huang1;~Hong_Zhang7", "aff": "Purdue University;Sun Yat-Sen University;Argonne National Laboratory", "aff_domain": "purdue.edu;sysu.edu.cn;anl.gov", "position": "PhD student;PhD student;Assistant Computational Mathematician", "bibtex": "@inproceedings{\nliang2022stiffnessaware,\ntitle={Stiffness-aware neural network for learning Hamiltonian systems},\nauthor={SENWEI Liang and Zhongzhan Huang and Hong Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uVXEKeqJbNa}\n}", "github": "", "project": "", "reviewers": "MYXe;RqeB;MCLP;iTTU", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "94;61;73;132", "wc_summary_review": "26;29;39;101", "wc_main_review": "242;119;257;596", "wc_review": "362;209;369;829", "wc_reply_reviewers": "134;0;0;0", "wc_reply_authors": "569;297;548;541", "reply_reviewers": "2;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 90.0, 26.972207918522354 ], "wc_summary_review_avg": [ 48.75, 30.548117781624452 ], "wc_main_review_avg": [ 303.5, 177.15882704511228 ], "wc_review_avg": [ 442.25, 232.2642622100955 ], "wc_reply_reviewers_avg": [ 33.5, 58.023702053557386 ], "wc_reply_authors_avg": [ 488.75, 111.18537448783451 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3546402443678179224&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=uVXEKeqJbNa", "email": "purdue.edu;sysu.edu.cn;anl.gov", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Purdue University;Sun Yat-sen University;Argonne National Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.purdue.edu;http://www.sysu.edu.cn/;https://www.anl.gov", "aff_unique_abbr": "Purdue;SYSU;ANL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "id": "uXpTNpkXFLB", "title": "Towards Predictable Feature Attribution: Revisiting and Improving Guided BackPropagation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recently, backpropagation (BP)-based feature attribution methods have been widely adopted to interpret the internal mechanisms of
convolutional neural networks (CNNs), and are expected to be human-understandable (lucidity) and faithful to decision-making processes (fidelity). In this paper, we introduce a novel property for feature attribution: predictability, which means users can forecast the behaviors of the interpretation methods. Given the evidence that many attribution methods exhibit unexpected and harmful phenomena such as class insensitivity, predictability is critical to avoid over-trust and misuse by users. Observing that many intuitive improvements for lucidity and fidelity tend to sacrifice predictability, we propose a new visual explanation method called TR-GBP (Theoretical Refinements of Guided BackPropagation), which revisits and improves GBP from a theoretical perspective rather than solely optimizing attribution performance. Qualitative and quantitative experiments show that TR-GBP is visually sharper, gets rid of the fidelity problems in GBP, and effectively predicts its possible behaviors, so that we can easily discriminate prediction errors from interpretation errors. The code for TR-GBP is available in the supplementary material and will be open-sourced.", "keywords": "explanation;interpretation;BP-based attributions;predictability", "primary_area": "", "supplementary_material": "/attachment/cd44a79f80a8fe41541cf8e05667d3cf70236273.zip", "author": "Guanhua Zheng;Jitao Sang;Wang Haonan;Changsheng Xu", "authorids": "~Guanhua_Zheng1;~Jitao_Sang1;~Wang_Haonan2;~Changsheng_Xu1", "gender": "M;;M;M", "homepage": ";;;", "dblp": ";;;85/1301", "google_scholar": ";;;https://scholar.google.com.sg/citations?user=hI9NRDkAAAAJ", "orcid": ";;;", "linkedin": ";;%E6%B5%A9%E6%A5%A0-%E7%8E%8B-819943212/;", "or_profile": "~Guanhua_Zheng1;~Jitao_Sang1;~Wang_Haonan2;~Changsheng_Xu1", "aff": ";;Beijing jiaotong univercity;Institute of Automation, Chinese Academy of Sciences", "aff_domain": ";;bjtu.edu.cn;ia.ac.cn", "position": ";;MS student;Full Professor", "bibtex": "@misc{\nzheng2022towards,\ntitle={Towards Predictable Feature Attribution: Revisiting and Improving Guided BackPropagation},\nauthor={Guanhua Zheng and Jitao Sang and Wang Haonan and Changsheng Xu},\nyear={2022},\nurl={https://openreview.net/forum?id=uXpTNpkXFLB}\n}", "github": "", "project": "", "reviewers": "u6wg;YBzY;erbM", "site": "https://openreview.net/forum?id=uXpTNpkXFLB", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;2", "correctness": "1;2;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;1;3", "wc_summary_paper": "76;61;79", "wc_summary_review": "61;94;19", "wc_main_review": "1068;709;150", "wc_review": "1205;864;248", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 72.0, 7.874007874011811 ], "wc_summary_review_avg": [ 58.0, 30.692018506445613 ], "wc_main_review_avg": [ 642.3333333333334, 377.7250616814063 ], "wc_review_avg": [ 772.3333333333334, 396.0339491620496 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.8660254037844387, "gs_citation": 0,
"gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cLaaalLDG2wJ:scholar.google.com/&scioq=Towards+Predictable+Feature+Attribution:+Revisiting+and+Improving+Guided+BackPropagation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Beijing Jiao Tong University;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.bjtu.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "BJTU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "uY6fuowMIT", "title": "Approximate Bijective Correspondence for isolating factors of variation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Representational learning forms the backbone of most deep learning applications, and the value of a learned representation is intimately tied to its information content regarding different factors of variation. Finding good representations depends on the nature of supervision and the learning algorithm. We propose a novel algorithm that relies on a weak form of supervision where the data is partitioned into sets according to certain \\textit{inactive} factors of variation. Our key insight is that by seeking approximate correspondence between elements of different sets, we learn strong representations that exclude the inactive factors of variation and isolate the \\textit{active} factors which vary within all sets. Importantly, the information isolated is complementary to that of most other contrastive learning approaches, which isolate the inactive factors of variation. We demonstrate that the method can work in a semi-supervised scenario, and that a portion of the unsupervised data can belong to a different domain entirely. Further control over the content of the learned representations is possible by folding in data augmentation to suppress nuisance factors. 
We outperform competing baselines on the challenging problem of synthetic-to-real object pose transfer.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/a69825cbd789cf495533010715fd30dd07788af2.zip", "author": "Kieran A Murphy;Varun Jampani;Srikumar Ramalingam;Ameesh Makadia", "authorids": "~Kieran_A_Murphy1;~Varun_Jampani2;~Srikumar_Ramalingam2;~Ameesh_Makadia1", "gender": "M;M;;M", "homepage": "https://kieranamurphy.com;https://www.cs.utah.edu/~srikumar/;http://www.ameeshmakadia.com/index.html;https://varunjampani.github.io/", "dblp": "287/4780;17/4216;59/6004;124/2785", "google_scholar": "VC653zEAAAAJ;6m1ptOgAAAAJ;OT1uf7kAAAAJ;1Cv6Sf4AAAAJ", "orcid": "0000-0003-0960-6685;;;", "linkedin": ";srikumar-ramalingam-17728b22/;;", "or_profile": "~Kieran_A_Murphy1;~Srikumar_Ramalingam2;~Ameesh_Makadia1;~Varun_Jampani1", "aff": "University of Pennsylvania;Google;Google;Google Research", "aff_domain": "penn.edu;google.com;google.com;google.com", "position": "Postdoc;Research Scientist;Research Scientist;Researcher", "bibtex": "@misc{\nmurphy2022approximate,\ntitle={Approximate Bijective Correspondence for isolating factors of variation},\nauthor={Kieran A Murphy and Varun Jampani and Srikumar Ramalingam and Ameesh Makadia},\nyear={2022},\nurl={https://openreview.net/forum?id=uY6fuowMIT}\n}", "github": "", "project": "", "reviewers": "pmB1;sfzd;8W8J;Nnja", "site": "https://openreview.net/forum?id=uY6fuowMIT", "pdf_size": 0, "recommendation": "1;5;5;6", "confidence": "4;4;2;4", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;0;2", "wc_summary_paper": "33;116;102;69", "wc_summary_review": "33;105;83;39", "wc_main_review": "456;351;245;333", "wc_review": "522;572;430;441", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.920286436967152 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 80.0, 32.05464084964921 ], "wc_summary_review_avg": [ 65.0, 30.099833886584822 ], "wc_main_review_avg": [ 346.25, 74.99124948952378 ], "wc_review_avg": [ 491.25, 58.61473790779927 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.22549380840084865, "corr_recommendation_correctness": 0.9771398364036774, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Fcuu9TN3YlYJ:scholar.google.com/&scioq=Approximate+Bijective+Correspondence+for+isolating+factors+of+variation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Pennsylvania;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.upenn.edu;https://www.google.com", "aff_unique_abbr": "UPenn;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Efficiently Modeling Long Sequences with Structured State Spaces", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6959", "id": "uYLFoz1vlAC", "poster": "", "openreview": "https://openreview.net/forum?id=uYLFoz1vlAC", "slides": 
"https://iclr.cc/virtual/2022/poster/6959", "video": "https://iclr.cc/virtual/2022/poster/6959", "author_site": "Albert Gu, Karan Goel, Christopher Re", "tldr": "", "abstract": "A central goal of sequence modeling is designing a single principled model that can address sequence data across a range of modalities and tasks, particularly on long-range dependencies. Although conventional models including RNNs, CNNs, and Transformers have specialized variants for capturing long dependencies, they still struggle to scale to very long sequences of $10000$ or more steps. A promising recent approach proposed modeling sequences by simulating the fundamental state space model (SSM) \\( x'(t) = Ax(t) + Bu(t), y(t) = Cx(t) + Du(t) \\), and showed that for appropriate choices of the state matrix \\( A \\), this system could handle long-range dependencies mathematically and empirically. However, this method has prohibitive computation and memory requirements, rendering it infeasible as a general sequence modeling solution. We propose the Structured State Space sequence model (S4) based on a new parameterization for the SSM, and show that it can be computed much more efficiently than prior approaches while preserving their theoretical strengths. Our technique involves conditioning \\( A \\) with a low-rank correction, allowing it to be diagonalized stably and reducing the SSM to the well-studied computation of a Cauchy kernel. S4 achieves strong empirical results across a diverse range of established benchmarks, including (i) 91\\% accuracy on sequential CIFAR-10 with no data augmentation or auxiliary losses, on par with a larger 2-D ResNet, (ii) substantially closing the gap to Transformers on image and language modeling tasks, while performing generation $60\\times$ faster (iii) SoTA on every task from the Long Range Arena benchmark, including solving the challenging Path-X task of length 16k that all prior work fails on, while being as efficient as all competitors.", "keywords": "sequence models;state space;RNN;CNN;Long Range Arena", "primary_area": "", "supplementary_material": "/attachment/0eca97b47ae6f910c4be46fcb179d7790227b78a.zip", "author": "Albert Gu;Karan Goel;Christopher Re", "authorids": "~Albert_Gu1;~Karan_Goel1;~Christopher_Re1", "gender": "M;M;", "homepage": ";http://krandiash.github.io;", "dblp": "130/0612;175/1290;", "google_scholar": "DVCHv1kAAAAJ;;", "orcid": "0000-0002-4946-6042;;", "linkedin": ";;", "or_profile": "~Albert_Gu1;~Karan_Goel1;~Christopher_Re1", "aff": "Stanford University;Stanford University;", "aff_domain": "stanford.edu;stanford.edu;", "position": "PhD student;PhD student;", "bibtex": "@inproceedings{\ngu2022efficiently,\ntitle={Efficiently Modeling Long Sequences with Structured State Spaces},\nauthor={Albert Gu and Karan Goel and Christopher Re},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uYLFoz1vlAC}\n}", "github": "", "project": "", "reviewers": "7JCQ;Bxrv;ncZn", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "4;3;3", "correctness": "4;4;4", "technical_novelty": "4;4;3", "empirical_novelty": "4;4;3", "wc_summary_paper": "117;89;210", "wc_summary_review": "83;35;84", "wc_main_review": "536;313;547", "wc_review": "736;437;841", "wc_reply_reviewers": "42;0;85", "wc_reply_authors": "269;244;565", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], 
"technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 138.66666666666666, 51.71932800113405 ], "wc_summary_review_avg": [ 67.33333333333333, 22.866763848189994 ], "wc_main_review_avg": [ 465.3333333333333, 107.80950277637352 ], "wc_review_avg": [ 671.3333333333334, 171.15360222781044 ], "wc_reply_reviewers_avg": [ 42.333333333333336, 34.70190516703978 ], "wc_reply_authors_avg": [ 359.3333333333333, 145.78599231598199 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2281, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8624959095392391416&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=uYLFoz1vlAC", "email": "stanford.edu;stanford.edu;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "uc8UsmcInvB", "title": "Statistically Meaningful Approximation: a Theoretical Analysis for Approximating Turing Machines with Transformers", "track": "main", "status": "Reject", "tldr": "", "abstract": "A common lens to theoretically study neural net architectures is to analyze the functions they can approximate. However, constructions from approximation theory may be unrealistic and therefore less meaningful. For example, a common unrealistic trick is to encode target function values using infinite precision. To address these issues, this work proposes a formal definition of statistically meaningful (SM) approximation which requires the approximating network to exhibit good statistical learnability. We study SM approximation for two function classes: boolean circuits and Turing machines. We show that overparameterized feedforward neural nets can SM approximate boolean circuits with sample complexity depending only polynomially on the circuit size, not the size of the network. In addition, we show that transformers can SM approximate Turing machines with computation time bounded by $T$ with sample complexity polynomial in the alphabet size, state space size, and $log(T)$. 
We also introduce new tools for analyzing generalization which provide much tighter sample complexities than the typical VC-dimension or norm-based bounds, which may be of independent interest.", "keywords": "approximation theory;generalization bounds;sample complexity bounds;learning theory", "primary_area": "", "supplementary_material": "", "author": "Colin Wei;Yining Chen;Tengyu Ma", "authorids": "~Colin_Wei1;~Yining_Chen1;~Tengyu_Ma1", "gender": "M;F;M", "homepage": "https://sites.google.com/view/colinwei;;http://ai.stanford.edu/~tengyuma/", "dblp": "185/7902;;54/9061", "google_scholar": ";4a6iPeUAAAAJ;i38QlUwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Colin_Wei1;~Yining_Chen1;~Tengyu_Ma1", "aff": "Computer Science Department, Stanford University;Stanford University;Facebook AI Research", "aff_domain": "cs.stanford.edu;stanford.edu;fb.com", "position": "PhD student;PhD student;Visiting Scientist", "bibtex": "@misc{\nwei2022statistically,\ntitle={Statistically Meaningful Approximation: a Theoretical Analysis for Approximating Turing Machines with Transformers},\nauthor={Colin Wei and Yining Chen and Tengyu Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=uc8UsmcInvB}\n}", "github": "", "project": "", "reviewers": "VpYu;RNAM;eLwz;Lqbr", "site": "https://openreview.net/forum?id=uc8UsmcInvB", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;2;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;2;2;3", "empirical_novelty": "0;0;3;0", "wc_summary_paper": "32;60;81;104", "wc_summary_review": "77;15;72;9", "wc_main_review": "306;133;376;195", "wc_review": "415;208;529;308", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 0.75, 1.299038105676658 ], "wc_summary_paper_avg": [ 69.25, 26.54595072699413 ], "wc_summary_review_avg": [ 43.25, 31.371762781201824 ], "wc_main_review_avg": [ 252.5, 94.47354126950043 ], "wc_review_avg": [ 365.0, 119.68082553191218 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.13245323570650439, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gQQtINc4N8wJ:scholar.google.com/&scioq=Statistically+Meaningful+Approximation:+a+Theoretical+Analysis+for+Approximating+Turing+Machines+with+Transformers&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Stanford University;Meta", "aff_unique_dep": "Computer Science Department;Facebook AI Research", "aff_unique_url": "https://www.stanford.edu;https://research.facebook.com", "aff_unique_abbr": "Stanford;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Is Homophily a Necessity for Graph Neural Networks?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6329", "id": "ucASPPD9GKN", "poster": "", "openreview": "https://openreview.net/forum?id=ucASPPD9GKN", "slides": "https://iclr.cc/virtual/2022/poster/6329", "video": "https://iclr.cc/virtual/2022/poster/6329", 
"author_site": "Yao Ma, Xiaorui Liu, Neil Shah, Jiliang Tang", "tldr": "", "abstract": "Graph neural networks (GNNs) have shown great prowess in learning representations suitable for numerous graph-based machine learning tasks. When applied to semi-supervised node classification, GNNs are widely believed to work well due to the homophily assumption (``like attracts like''), and fail to generalize to heterophilous graphs where dissimilar nodes connect. Recent works design new architectures to overcome such heterophily-related limitations, citing poor baseline performance and new architecture improvements on a few heterophilous graph benchmark datasets as evidence for this notion. In our experiments, we empirically find that standard graph convolutional networks (GCNs) can actually achieve better performance than such carefully designed methods on some commonly used heterophilous graphs. This motivates us to reconsider whether homophily is truly necessary for good GNN performance. We find that this claim is not quite true, and in fact, GCNs can achieve strong performance on heterophilous graphs under certain conditions. Our work carefully characterizes these conditions and provides supporting theoretical understanding and empirical observations. Finally, we examine existing heterophilous graphs benchmarks and reconcile how the GCN (under)performs on them based on this understanding.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/bbde85db7c61a5770e7eb7b6c2abeea42fa8e857.zip", "author": "Yao Ma;Xiaorui Liu;Neil Shah;Jiliang Tang", "authorids": "~Yao_Ma3;~Xiaorui_Liu1;~Neil_Shah2;~Jiliang_Tang1", "gender": "M;M;M;M", "homepage": "https://yaoma24.github.io/;https://sites.google.com/ncsu.edu/xiaorui/;http://nshah.net;https://www.cse.msu.edu/~tangjili/", "dblp": "212/7871.html;172/0995;71/7771;64/10812", "google_scholar": "wf9TTOIAAAAJ;NhvN1KoAAAAJ;Qut69OgAAAAJ;WtzKMWAAAAAJ", "orcid": ";0000-0001-8217-5688;0000-0003-3261-8430;0000-0001-7125-3898", "linkedin": ";;;", "or_profile": "~Yao_Ma3;~Xiaorui_Liu1;~Neil_Shah2;~Jiliang_Tang1", "aff": "New Jersey Institute of Technology;Michigan State University;Snap Inc.;Michigan State University", "aff_domain": "njit.edu;msu.edu;snap.com;msu.edu", "position": "Assistant Professor;PhD student;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nma2022is,\ntitle={Is Homophily a Necessity for Graph Neural Networks?},\nauthor={Yao Ma and Xiaorui Liu and Neil Shah and Jiliang Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ucASPPD9GKN}\n}", "github": "", "project": "", "reviewers": "MC2o;WXGg;6L9A;YLxN", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "4;4;4;4", "correctness": "3;3;2;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "66;95;109;25", "wc_summary_review": "101;91;24;25", "wc_main_review": "894;387;329;204", "wc_review": "1061;573;462;254", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "2494;2092;1115;858", "reply_reviewers": "0;0;0;0", "reply_authors": "4;4;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 73.75, 32.135455497005175 ], "wc_summary_review_avg": [ 60.25, 35.92613950871983 ], "wc_main_review_avg": [ 453.5, 262.7798508257435 ], "wc_review_avg": [ 587.5, 296.3886806205662 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1639.75, 674.6793219745215 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 1.0 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 358, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16926917851808301251&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=ucASPPD9GKN", "email": "njit.edu;msu.edu;snap.com;msu.edu", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "New Jersey Institute of Technology;Michigan State University;Snap Inc.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.njit.edu;https://www.msu.edu;https://www.snapinc.com", "aff_unique_abbr": "NJIT;MSU;Snap", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "ue4CArRAsct", "title": "Structure by Architecture: Disentangled Representations without Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the problem of self-supervised structured representation learning using autoencoders for downstream tasks such as generative modeling. Unlike most methods which rely on matching an arbitrary, relatively unstructured, prior distribution for sampling, we propose a sampling technique that relies solely on the independence of latent variables, thereby avoiding the trade-off between reconstruction quality and generative performance inherent to VAEs. We design a novel autoencoder architecture capable of learning a structured representation without the need for aggressive regularization. Our structural decoders learn a hierarchy of latent variables, akin to structural causal models, thereby ordering the information without any additional regularization. 
We demonstrate how these models learn a representation that improves results in a variety of downstream tasks including generation, disentanglement, and extrapolation using several challenging and natural image datasets.", "keywords": "Autoencoder;Structure;Disentanglement;Generative;Hybridization", "primary_area": "", "supplementary_material": "/attachment/ee4710b3b1e46ada006dd106280ea4182fd2f2c9.zip", "author": "Felix Leeb;Giulia Lanzillotta;Yashas Annadani;Michel Besserve;Stefan Bauer;Bernhard Sch\u00f6lkopf", "authorids": "~Felix_Leeb1;~Giulia_Lanzillotta1;~Yashas_Annadani1;~Michel_Besserve1;~Stefan_Bauer1;~Bernhard_Sch\u00f6lkopf1", "gender": ";F;;M;;", "homepage": "https://ei.is.mpg.de/person/fleeb;;https://yashasannadani.com;https://computational-homeostasis.com;https://cifar.ca/bios/stefan-bauer/;", "dblp": ";;190/7411;71/511;;", "google_scholar": ";eiB9OOkAAAAJ;ExgzcVMAAAAJ;https://scholar.google.de/citations?user=Nbq6kI0AAAAJ;O-oICE8AAAAJ;", "orcid": ";0009-0008-2047-8251;;;;", "linkedin": ";giulia-lanzillotta-0aab3186/;;;;", "or_profile": "~Felix_Leeb1;~Giulia_Lanzillotta1;~Yashas_Annadani1;~Michel_Besserve1;~Stefan_Bauer1;~Bernhard_Sch\u00f6lkopf1", "aff": "Max Planck Institute for Intelligent Systems, Max-Planck Institute;Swiss Federal Institute of Technology;KTH Royal Institute of Technology;MPI for Intelligent Systems;KTH Royal Institute of Technology;", "aff_domain": "tuebingen.mpg.de;ethz.ch;kth.se;tuebingen.mpg.de;kth.se;", "position": "PhD student;MS student;PhD student;Senior research scientist;Assistant Professor;", "bibtex": "@misc{\nleeb2022structure,\ntitle={Structure by Architecture: Disentangled Representations without Regularization},\nauthor={Felix Leeb and Giulia Lanzillotta and Yashas Annadani and Michel Besserve and Stefan Bauer and Bernhard Sch{\\\"o}lkopf},\nyear={2022},\nurl={https://openreview.net/forum?id=ue4CArRAsct}\n}", "github": "", "project": "", "reviewers": "Bkes;ERAs;UHD2;EdoF", "site": "https://openreview.net/forum?id=ue4CArRAsct", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;3;2", "correctness": "2;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "74;72;46;60", "wc_summary_review": "255;53;19;28", "wc_main_review": "1024;665;344;174", "wc_review": "1353;790;409;262", "wc_reply_reviewers": "1128;344;387;0", "wc_reply_authors": "1890;996;1031;37", "reply_reviewers": "2;2;1;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.0, 11.180339887498949 ], "wc_summary_review_avg": [ 88.75, 96.78939766317383 ], "wc_main_review_avg": [ 551.75, 324.6924506359826 ], "wc_review_avg": [ 703.5, 421.5996323527809 ], "wc_reply_reviewers_avg": [ 464.75, 411.2537993745468 ], "wc_reply_authors_avg": [ 988.5, 655.7280305126509 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.6622661785325219, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11319862194753390313&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Swiss Federal Institute 
of Technology;KTH Royal Institute of Technology", "aff_unique_dep": "Intelligent Systems;;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ethz.ch;https://www.kth.se", "aff_unique_abbr": "MPI-IS;ETH Zurich;KTH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;2", "aff_country_unique": "Germany;Switzerland;Sweden" }, { "id": "uecYQBshVYV", "title": "Revisiting transposed convolutions for interpreting raw waveform sound event recognition CNNs by sonification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The majority of recent work on the interpretability of audio and speech processing deep neural networks (DNNs) interprets spectral information modelled by the first layer, relying solely on visual means of interpretation. In this work, we propose \\textit{sonification}, a method to interpret intermediate feature representations of sound event recognition (SER) convolutional neural networks (CNNs) trained on raw waveforms by mapping these representations back into the discrete-time input signal domain, highlighting substructures in the input that maximally activate a feature map as intelligible acoustic events. We use sonifications to compare supervised and self-supervised feature hierarchies and show how sonifications work synergistically with signal processing techniques and visual means of representation, aiding the interpretability of SER models.", "keywords": "convolutional neural networks;interpretability;sound event recognition;raw waveform;contrastive learning;self-supervised learning;sound classification;audioset", "primary_area": "", "supplementary_material": "/attachment/ae88451de41150518ac86ee9e2815ff02aa5804e.zip", "author": "Sarthak Yadav;Mary Ellen Foster", "authorids": "~Sarthak_Yadav1;~Mary_Ellen_Foster2", "gender": ";F", "homepage": ";http://www.maryellenfoster.uk/", "dblp": ";39/4148", "google_scholar": ";ij0tOKoAAAAJ", "orcid": ";0000-0002-1228-7657", "linkedin": ";", "or_profile": "~Sarthak_Yadav1;~Mary_Ellen_Foster2", "aff": ";University of Glasgow", "aff_domain": ";gla.ac.uk", "position": ";Senior Lecturer", "bibtex": "@misc{\nyadav2022revisiting,\ntitle={Revisiting transposed convolutions for interpreting raw waveform sound event recognition {CNN}s by sonification},\nauthor={Sarthak Yadav and Mary Ellen Foster},\nyear={2022},\nurl={https://openreview.net/forum?id=uecYQBshVYV}\n}", "github": "", "project": "", "reviewers": "PJrE;r61T;7P67;D7LC", "site": "https://openreview.net/forum?id=uecYQBshVYV", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;3;3;4", "correctness": "3;3;3;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;1;0", "wc_summary_paper": "109;58;110;88", "wc_summary_review": "38;29;21;96", "wc_main_review": "466;112;248;352", "wc_review": "613;199;379;536", "wc_reply_reviewers": "0;0;0;307", "wc_reply_authors": "1514;404;769;793", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.25, 21.111312133545844 ], "wc_summary_review_avg": [ 46.0, 29.487285395573462 ], "wc_main_review_avg": [ 294.5, 130.56320308570864 ], "wc_review_avg": [ 431.75, 158.64642290325995 ], "wc_reply_reviewers_avg": [ 76.75, 132.93489948091133 ], "wc_reply_authors_avg": [ 870.0, 402.49906832190305 ], "reply_reviewers_avg": [ 0.25, 
0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.30151134457776363, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:p9ZXlir664IJ:scholar.google.com/&scioq=Revisiting+transposed+convolutions+for+interpreting+raw+waveform+sound+event+recognition+CNNs+by+sonification&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Glasgow", "aff_unique_dep": "", "aff_unique_url": "https://www.gla.ac.uk", "aff_unique_abbr": "Glasgow", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "InfinityGAN: Towards Infinite-Pixel Image Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6857", "id": "ufGMqIM0a4b", "poster": "", "openreview": "https://openreview.net/forum?id=ufGMqIM0a4b", "slides": "https://iclr.cc/virtual/2022/poster/6857", "video": "https://iclr.cc/virtual/2022/poster/6857", "author_site": "Chieh Hubert Lin, Hsin-Ying Lee, Yen-Chi Cheng, Sergey Tulyakov, Ming-Hsuan Yang", "tldr": "", "abstract": "We present InfinityGAN, a method to generate arbitrary-sized images. The problem is associated with several key challenges. First, scaling existing models to an arbitrarily large image size is resource-constrained, both in terms of computation and availability of large-field-of-view training data. InfinityGAN trains and infers patch-by-patch seamlessly with low computational resources. Second, large images should be locally and globally consistent, avoid repetitive patterns, and look realistic. To address these, InfinityGAN takes global appearance, local structure and texture into account. With this formulation, we can generate images with spatial size and level of detail not attainable before. Experimental evaluation supports that InfinityGAN generates images with superior global structure compared to baselines and features parallelizable inference. 
Finally, we show several applications unlocked by our approach, such as fusing styles spatially, multi-modal outpainting and image inbetweening at arbitrary input and output sizes.", "keywords": "generative modeling;image synthesis;generative adversarial networks;infinite-pixel synthesis;GANs", "primary_area": "", "supplementary_material": "", "author": "Chieh Hubert Lin;Hsin-Ying Lee;Yen-Chi Cheng;Sergey Tulyakov;Ming-Hsuan Yang", "authorids": "~Chieh_Hubert_Lin1;~Hsin-Ying_Lee2;~Yen-Chi_Cheng1;~Sergey_Tulyakov1;~Ming-Hsuan_Yang1", "gender": ";M;M;M;M", "homepage": ";http://hsinyinglee.com/;https://yccyenchicheng.github.io;http://www.stulyakov.com/;https://faculty.ucmerced.edu/mhyang/", "dblp": ";149/7976-1.html;239/4170;40/6115;79/3711.html", "google_scholar": ";;wvuEiWgAAAAJ;mgzXR0sAAAAJ;p9-ohHsAAAAJ", "orcid": ";;;;0000-0003-4848-2304", "linkedin": ";;yen-chi-cheng-464457b5/;sergeytulyakov/;minghsuanyang/", "or_profile": "~Chieh_Hubert_Lin1;~Hsin-Ying_Lee2;~Yen-Chi_Cheng1;~Sergey_Tulyakov1;~Ming-Hsuan_Yang1", "aff": ";Snap Inc.;Carnegie Mellon University;;University of California at Merced", "aff_domain": ";snap.com;cmu.edu;;umcerced.edu", "position": ";Researcher;MS student;;Professor", "bibtex": "@inproceedings{\nlin2022infinitygan,\ntitle={Infinity{GAN}: Towards Infinite-Pixel Image Synthesis},\nauthor={Chieh Hubert Lin and Hsin-Ying Lee and Yen-Chi Cheng and Sergey Tulyakov and Ming-Hsuan Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ufGMqIM0a4b}\n}", "github": "", "project": "", "reviewers": "1Z86;TkQk;gucW;bTWK", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;3;4", "correctness": "3;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "90;98;105;54", "wc_summary_review": "266;142;52;27", "wc_main_review": "1078;223;327;130", "wc_review": "1434;463;484;211", "wc_reply_reviewers": "272;30;138;26", "wc_reply_authors": "3159;581;629;260", "reply_reviewers": "6;1;1;1", "reply_authors": "8;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 86.75, 19.638928178492836 ], "wc_summary_review_avg": [ 121.75, 93.6225800755352 ], "wc_main_review_avg": [ 439.5, 375.16696283121735 ], "wc_review_avg": [ 648.0, 466.33839644618587 ], "wc_reply_reviewers_avg": [ 116.5, 100.39297784207818 ], "wc_reply_authors_avg": [ 1157.25, 1164.3853260411693 ], "reply_reviewers_avg": [ 2.25, 2.165063509461097 ], "reply_authors_avg": [ 2.75, 3.031088913245535 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11409281345563394414&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=ufGMqIM0a4b", "email": ";snap.com;cmu.edu;;umcerced.edu", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Snap Inc.;Carnegie Mellon University;University of California, Merced", "aff_unique_dep": ";;", "aff_unique_url": "https://www.snapinc.com;https://www.cmu.edu;https://www.ucmerced.edu", "aff_unique_abbr": "Snap;CMU;UC Merced", "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" 
}, { "id": "ugxdsne_TlO", "title": "GCF: Generalized Causal Forest for Heterogeneous Treatment Effect Estimation Using Nonparametric Methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "Heterogeneous treatment effect (HTE) estimation with continuous treatment is essential in multiple disciplines, such as the online marketplace and pharmaceutical industry. The existing machine learning (ML) methods, like forest-based modeling, either work only for discrete treatments or make partially linear or parametric assumptions that may suffer from model misspecification. To alleviate these problems, we extend causal forest (CF) with non-parametric dose-response functions (DRFs) that can be estimated locally using kernel-based Double/Debiased ML estimators. Moreover, we propose a distance-based splitting criterion in the functional space of Partial DRFs to capture the heterogeneity for continuous treatments. We call the proposed algorithm generalized causal forest (GCF) as it generalizes the use case of CF to a much broader setup. We show the effectiveness of GCF compared to SOTA on synthetic data and proprietary real-world data sets.", "keywords": "Heterogeneous Treatment Effect;Causal Inference;Double/Debiased Machine Learning;Continuous Treatment", "primary_area": "", "supplementary_material": "/attachment/3fb2cda65a3f94c46101c8a7452a7eb7cf7044cd.zip", "author": "Shu Wan;Chen Zheng;Zhonggen Sun;Mengfan Xu;Xiaoqing Yang;Jiecheng Guo;Hongtu Zhu", "authorids": "~Shu_Wan1;~Chen_Zheng1;~Zhonggen_Sun2;~Mengfan_Xu2;~Xiaoqing_Yang1;~Jiecheng_Guo2;~Hongtu_Zhu2", "gender": "M;F;F;M;M;M;M", "homepage": "https://shu-wan.github.io;;;;https://bigkp.org;;", "dblp": "180/2216;;97/4171.html;;03/5683;;", "google_scholar": "H4mafGwAAAAJ;;;;https://scholar.google.com/citations?hl=en;;", "orcid": "0000-0003-0725-3644;;;;0000-0002-6781-2690;;", "linkedin": "shu-wan-88153493/;%E6%A2%A6%E5%87%A1-%E5%BE%90-5a2776221/;;\u6770\u6210-\u90ed-9a220b222/;;chen-zheng-ceys/;bruceuk/", "or_profile": "~Shu_Wan1;~Mengfan_Xu2;~Xiaoqing_Yang1;~Jiecheng_Guo2;~Hongtu_Zhu2;~Zheng_Chen8;~Zhonggeen_Sun1", "aff": "Arizona State University;;Didi Research;Didi Research;University of North Carolina at Chapel Hill;;", "aff_domain": "asu.edu;;didichuxing.com;didichuxing.com;unc.edu;;", "position": "PhD student;;Principal Researcher;Principal Researcher;Full Professor;;", "bibtex": "@misc{\nwan2022gcf,\ntitle={{GCF}: Generalized Causal Forest for Heterogeneous Treatment Effect Estimation Using Nonparametric Methods},\nauthor={Shu Wan and Chen Zheng and Zhonggen Sun and Mengfan Xu and Xiaoqing Yang and Jiecheng Guo and Hongtu Zhu},\nyear={2022},\nurl={https://openreview.net/forum?id=ugxdsne_TlO}\n}", "github": "", "project": "", "reviewers": "X6oe;kG5h;akBL", "site": "https://openreview.net/forum?id=ugxdsne_TlO", "pdf_size": 0, "recommendation": "1;5;5", "confidence": "4;3;2", "correctness": "1;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "0;2;3", "wc_summary_paper": "42;35;66", "wc_summary_review": "19;61;51", "wc_main_review": "254;365;260", "wc_review": "315;461;377", "wc_reply_reviewers": "0;270;0", "wc_reply_authors": "885;1800;722", "reply_reviewers": "0;1;0", "reply_authors": "3;4;2", "recommendation_avg": [ 3.6666666666666665, 1.8856180831641267 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], 
"wc_summary_paper_avg": [ 47.666666666666664, 13.27487183449325 ], "wc_summary_review_avg": [ 43.666666666666664, 17.913371790059205 ], "wc_main_review_avg": [ 293.0, 50.97057974949863 ], "wc_review_avg": [ 384.3333333333333, 59.829387056492195 ], "wc_reply_reviewers_avg": [ 90.0, 127.27922061357856 ], "wc_reply_authors_avg": [ 1135.6666666666667, 474.44447046578125 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dxHFDtXB35cJ:scholar.google.com/&scioq=GCF:+Generalized+Causal+Forest+for+Heterogeneous+Treatment+Effect+Estimation+Using+Nonparametric+Methods&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Arizona State University;Didi Research;University of North Carolina", "aff_unique_dep": ";;", "aff_unique_url": "https://www.asu.edu;https://www.didi.com;https://www.unc.edu", "aff_unique_abbr": "ASU;Didi;UNC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;China" }, { "id": "uknMhonhXo", "title": "Selective Cross-Domain Consistency Regularization for Time Series Domain Generalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Domain generalization aims to learn models robust to domain shift, with limited source domains at training and without any access to target domain samples except at test time. Current domain alignment methods seek to extract features invariant across all domains, but do not consider inter-domain relationships. In this paper, we propose a novel representation learning methodology for time series classification that selectively enforces prediction consistency between source domains estimated to be closely-related. Specifically, we view a domain shift as a form of data transformation that preserves labels but not necessarily class relationships, and we regularize the predicted class relationships to be shared only by closely-related domains instead of all domains to prevent negative transfer. We conduct comprehensive experiments on two public real-world datasets. 
The proposed method significantly improves over the baseline and achieves better or competitive performance in comparison with state-of-the-art methods.", "keywords": "time series classification;domain generalization;robustness", "primary_area": "", "supplementary_material": "", "author": "Wenyu Zhang;Chuan-Sheng Foo;Mohamed Ragab", "authorids": "~Wenyu_Zhang1;~Chuan-Sheng_Foo1;~Mohamed_Ragab1", "gender": ";M;M", "homepage": "https://sites.coecis.cornell.edu/wenyuzhang/;http://ai.stanford.edu/~csfoo;http://mohamed-ragab.netlify.app", "dblp": "12/53-3;73/1823;237/3528-2.html", "google_scholar": "yJpx_WoAAAAJ;AgbeqGkAAAAJ;nNeT_NUAAAAJ", "orcid": ";0000-0002-4748-5792;0000-0002-2138-4395", "linkedin": ";;mohamedragab1/", "or_profile": "~Wenyu_Zhang1;~Chuan-Sheng_Foo1;~Mohamed_Ragab1", "aff": "I2R, A*STAR;Institute for Infocomm Research, A*STAR;Nanyang Technological University", "aff_domain": "i2r.a-star.edu.sg;i2r.a-star.edu.sg;ntu.edu.sg", "position": "Scientist;Scientist;PhD student", "bibtex": "@misc{\nzhang2022selective,\ntitle={Selective Cross-Domain Consistency Regularization for Time Series Domain Generalization},\nauthor={Wenyu Zhang and Chuan-Sheng Foo and Mohamed Ragab},\nyear={2022},\nurl={https://openreview.net/forum?id=uknMhonhXo}\n}", "github": "", "project": "", "reviewers": "49Y5;5iv7;9Zbf;aNuK", "site": "https://openreview.net/forum?id=uknMhonhXo", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;3;3;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "93;52;103;73", "wc_summary_review": "45;13;38;60", "wc_main_review": "412;289;400;232", "wc_review": "550;354;541;365", "wc_reply_reviewers": "141;0;0;0", "wc_reply_authors": "1189;253;570;295", "reply_reviewers": "2;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 80.25, 19.562400159489634 ], "wc_summary_review_avg": [ 39.0, 16.98528775146303 ], "wc_main_review_avg": [ 333.25, 75.6087792521477 ], "wc_review_avg": [ 452.5, 93.13565375300696 ], "wc_reply_reviewers_avg": [ 35.25, 61.054790966802926 ], "wc_reply_authors_avg": [ 576.75, 373.86252486709606 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eQB-45ehOs8J:scholar.google.com/&scioq=Selective+Cross-Domain+Consistency+Regularization+for+Time+Series+Domain+Generalization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "A*STAR;Institute for Infocomm Research;Nanyang Technological University", "aff_unique_dep": "Institute for Infocomm Research;;", "aff_unique_url": "https://www.a-star.edu.sg;https://www.i2r.a-star.edu.sg;https://www.ntu.edu.sg", "aff_unique_abbr": "A*STAR;I2R;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "uoBAKAFkVKx", "title": "Hypothesis Driven Coordinate Ascent for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work develops a novel black box optimization technique for learning robust policies for 
stochastic environments. Through combining coordinate ascent with hypothesis testing, Hypothesis Driven Coordinate Ascent (HDCA) optimizes without computing or estimating gradients. The simplicity of this approach allows it to excel in a distributed setting; its implementation provides an interesting alternative to many state-of-the-art methods for common reinforcement learning environments. HDCA was evaluated on various problems from the MuJoCo physics simulator and OpenAI Gym framework, achieving equivalent or superior results to standard RL benchmarks.", "keywords": "Reinforcement Learning;Black-Box Optimization;Hypothesis Testing;Coordinate Ascent;Block Coordinate Ascent;Random Search;MDP", "primary_area": "", "supplementary_material": "/attachment/32f833c64a931787a02748397ac836e425cf94c1.zip", "author": "John Kenton Moore;Junier Oliva", "authorids": "~John_Kenton_Moore1;~Junier_Oliva1", "gender": "M;M", "homepage": "https://github.com/johnmoore98;http://lupalab.com", "dblp": ";137/8390", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~John_Kenton_Moore1;~Junier_Oliva1", "aff": "Department of Computer Science, University of North Carolina, Chapel Hill;", "aff_domain": "cs.unc.edu;", "position": "MS student;", "bibtex": "@misc{\nmoore2022hypothesis,\ntitle={Hypothesis Driven Coordinate Ascent for Reinforcement Learning},\nauthor={John Kenton Moore and Junier Oliva},\nyear={2022},\nurl={https://openreview.net/forum?id=uoBAKAFkVKx}\n}", "github": "", "project": "", "reviewers": "pTGV;Je5f;ADBA;gfVu", "site": "https://openreview.net/forum?id=uoBAKAFkVKx", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;3;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "55;33;205;153", "wc_summary_review": "133;50;49;40", "wc_main_review": "265;481;726;305", "wc_review": "453;564;980;498", "wc_reply_reviewers": "0;48;82;0", "wc_reply_authors": "176;222;66;107", "reply_reviewers": "0;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 111.5, 70.38998508310682 ], "wc_summary_review_avg": [ 68.0, 37.72929895982696 ], "wc_main_review_avg": [ 444.25, 181.83423082577164 ], "wc_review_avg": [ 623.75, 209.43540173523672 ], "wc_reply_reviewers_avg": [ 32.5, 34.65183977799736 ], "wc_reply_authors_avg": [ 142.75, 60.32153429746296 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2UpBEaqemjgJ:scholar.google.com/&scioq=Hypothesis+Driven+Coordinate+Ascent+for+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of North Carolina", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.unc.edu", "aff_unique_abbr": "UNC", "aff_campus_unique_index": "0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Strength of Minibatch Noise in SGD", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7176", "id": "uorVGbWV5sw", "poster": "", "openreview": 
"https://openreview.net/forum?id=uorVGbWV5sw", "slides": "https://iclr.cc/virtual/2022/poster/7176", "video": "https://iclr.cc/virtual/2022/poster/7176", "author_site": "Liu Ziyin, Kangqiao Liu, Takashi Mori, Masahito Ueda", "tldr": "", "abstract": "The noise in stochastic gradient descent (SGD), caused by minibatch sampling, is poorly understood despite its practical importance in deep learning. This work presents the first systematic study of the SGD noise and fluctuations close to a local minimum. We first analyze the SGD noise in linear regression in detail and then derive a general formula for approximating SGD noise in different types of minima. For application, our results (1) provide insight into the stability of training a neural network, (2) suggest that a large learning rate can help generalization by introducing an implicit regularization, (3) explain why the linear learning rate-batchsize scaling law fails at a large learning rate or at a small batchsize and (4) can provide an understanding of how discrete-time nature of SGD affects the recently discovered power-law phenomenon of SGD.", "keywords": "stochastic gradient descent;minibatch noise;discrete-time SGD;noise and fluctuation;exact solvable models", "primary_area": "", "supplementary_material": "", "author": "Liu Ziyin;Kangqiao Liu;Takashi Mori;Masahito Ueda", "authorids": "~Liu_Ziyin1;~Kangqiao_Liu1;~Takashi_Mori1;~Masahito_Ueda1", "gender": ";M;M;M", "homepage": "https://www.mit.edu/~ziyinl/;https://kangqiaoliu.github.io/;https://sites.google.com/view/takashimori/home;http://cat.phys.s.u-tokyo.ac.jp/index-e.html", "dblp": ";280/3114;;", "google_scholar": "NpN9oRMAAAAJ;utIJkHcAAAAJ;https://scholar.google.co.jp/citations?hl=ja;https://scholar.google.co.jp/citations?user=Xpjx9CwAAAAJ", "orcid": ";0000-0002-4014-5728;;0000-0002-5367-1436", "linkedin": ";kangqiaoliu/?originalSubdomain=jp;;", "or_profile": "~Liu_Ziyin1;~Kangqiao_Liu1;~Takashi_Mori1;~Masahito_Ueda1", "aff": "The University of Tokyo;The University of Tokyo;RIKEN;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp;riken.jp;u-tokyo.ac.jp", "position": "PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nziyin2022strength,\ntitle={Strength of Minibatch Noise in {SGD}},\nauthor={Liu Ziyin and Kangqiao Liu and Takashi Mori and Masahito Ueda},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uorVGbWV5sw}\n}", "github": "", "project": "", "reviewers": "oBNf;5twY;pu11;4YZA", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;3;3;3", "correctness": "3;3;4;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "1;3;0;2", "wc_summary_paper": "30;80;144;140", "wc_summary_review": "31;51;11;38", "wc_main_review": "657;181;266;177", "wc_review": "718;312;421;355", "wc_reply_reviewers": "0;0;24;0", "wc_reply_authors": "1011;338;406;146", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 98.5, 46.976057731572155 ], "wc_summary_review_avg": [ 32.75, 14.463315664120728 ], "wc_main_review_avg": [ 320.25, 197.64535790146957 ], "wc_review_avg": [ 451.5, 158.6860107255835 ], "wc_reply_reviewers_avg": [ 6.0, 10.392304845413264 ], "wc_reply_authors_avg": [ 475.25, 323.6768257073713 ], "reply_reviewers_avg": [ 
0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13135648825930069696&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=uorVGbWV5sw", "email": "u-tokyo.ac.jp;u-tokyo.ac.jp;riken.jp;u-tokyo.ac.jp", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Tokyo;RIKEN", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.riken.jp", "aff_unique_abbr": "UTokyo;RIKEN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "id": "uouGog2bW-F", "title": "Numerical Solution of Fredholm Integral Equations of the Second Kind using Neural Network Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a novel method based on a neural network with one hidden layer and the collocation method for solving linear Fredholm integral equations of the second kind. We first choose the space of polynomials as the projection space for the collocation method, then approximate the solution of an integral equation by a linear combination of polynomials in that space. The coefficients of this linear combination serve as the weights between the hidden layer and the output layer of the neural network, while the mean square error between the exact solution and the approximate solution on the training set serves as the cost function. We train the neural network by the gradient descent method with the Adam optimizer and find an optimal solution with the desired accuracy.
This method provides a stable and reliable solution with higher accuracy and saves computation compared with previous neural network approaches for solving the Fredholm integral equations of the second kind.", "keywords": "Neural network;Second kind Fredholm integral equation;Collocation NN", "primary_area": "", "supplementary_material": "", "author": "Yuzhen Liu;Lixin Shen", "authorids": "~Yuzhen_Liu1;~Lixin_Shen2", "gender": "F;M", "homepage": "https://thecollege.syr.edu/people/;https://thecollege.syr.edu/people/faculty/shen-lixin/", "dblp": ";", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Yuzhen_Liu1;~Lixin_Shen2", "aff": ";Syracuse University", "aff_domain": ";syr.edu", "position": ";Full Professor", "bibtex": "@misc{\nliu2022numerical,\ntitle={Numerical Solution of Fredholm Integral Equations of the Second Kind using Neural Network Models},\nauthor={Yuzhen Liu and Lixin Shen},\nyear={2022},\nurl={https://openreview.net/forum?id=uouGog2bW-F}\n}", "github": "", "project": "", "reviewers": "kzdP;sTTz;GB9g;dE9j;pHWE", "site": "https://openreview.net/forum?id=uouGog2bW-F", "pdf_size": 0, "recommendation": "1;1;1;1;3", "confidence": "5;4;5;4;2", "correctness": "4;3;1;2;3", "technical_novelty": "1;2;1;1;1", "empirical_novelty": "1;1;0;1;2", "wc_summary_paper": "102;39;32;37;46", "wc_summary_review": "43;29;17;7;21", "wc_main_review": "388;80;97;34;113", "wc_review": "533;148;146;78;180", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 1.4, 0.8 ], "confidence_avg": [ 4.0, 1.0954451150103321 ], "correctness_avg": [ 2.6, 1.019803902718557 ], "technical_novelty_avg": [ 1.2, 0.4000000000000001 ], "empirical_novelty_avg": [ 1.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 51.2, 25.79457307264456 ], "wc_summary_review_avg": [ 23.4, 12.09297316626478 ], "wc_main_review_avg": [ 142.4, 125.60987222348409 ], "wc_review_avg": [ 217.0, 161.46083116347444 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.912870929175277, "corr_recommendation_correctness": 0.19611613513818404, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1Jz0LYSvDNoJ:scholar.google.com/&scioq=Numerical+Solution+of+Fredholm+Integral+Equations+of+the+Second+Kind+using+Neural+Network+Models&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Syracuse University", "aff_unique_dep": "", "aff_unique_url": "https://www.syracuse.edu", "aff_unique_abbr": "Syracuse", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Knowledge Infused Decoding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6163", "id": "upnDJ7itech", "poster": "", "openreview": "https://openreview.net/forum?id=upnDJ7itech", "slides": "https://iclr.cc/virtual/2022/poster/6163", "video": "https://iclr.cc/virtual/2022/poster/6163", "author_site": "Ruibo Liu, Guoqing Zheng, Shashank Gupta, Radhika Gaonkar, CHONGYANG GAO, Soroush Vosoughi, Milad Shokouhi, Ahmed H Awadallah", "tldr": "", "abstract": "Pre-trained language models (LMs) have been shown to memorize a substantial amount of knowledge from the pre-training corpora; however, they are still limited in recalling factually correct knowledge
given a certain context. Hence, they tend to suffer from counterfactual or hallucinatory generation when used in knowledge-intensive natural language generation (NLG) tasks. Recent remedies to this problem focus on modifying either the pre-training or task fine-tuning objectives to incorporate knowledge, which normally require additional costly training or architecture modification of LMs for practical applications.\n\nWe present Knowledge Infused Decoding (KID)---a novel decoding algorithm for generative LMs, which dynamically infuses external knowledge into each step of the LM decoding. Specifically, we maintain a local knowledge memory based on the current context, interacting with a dynamically created external knowledge trie, and continuously update the local memory as a knowledge-aware constraint to guide decoding via reinforcement learning. On six diverse knowledge-intensive NLG tasks, task-agnostic LMs (e.g., GPT-2 and BART) armed with KID outperform many task-optimized state-of-the-art models, and show particularly strong performance in few-shot scenarios over seven related knowledge-infusion techniques. Human evaluation confirms KID's ability to generate more relevant and factual language for the input context when compared with multiple baselines. Finally, KID also alleviates exposure bias and provides stable generation quality when generating longer sequences.", "keywords": "natural language;decoding;reinforcement learning;knowledge integration;generation", "primary_area": "", "supplementary_material": "", "author": "Ruibo Liu;Guoqing Zheng;Shashank Gupta;Radhika Gaonkar;Chongyang Gao;Soroush Vosoughi;Milad Shokouhi;Ahmed Hassan Awadallah", "authorids": "~Ruibo_Liu1;~Guoqing_Zheng1;~Shashank_Gupta3;~Radhika_Gaonkar1;~Chongyang_Gao1;~Soroush_Vosoughi1;~Milad_Shokouhi1;~Ahmed_Hassan_Awadallah1", "gender": "M;;M;F;;;;M", "homepage": "https://www.cs.dartmouth.edu/~rbliu/;https://www.microsoft.com/en-us/research/people/zheng/;https://shashankgupta.info/;;https://gcyzsl.github.io/;https://www.cs.dartmouth.edu/~soroush/;;https://www.microsoft.com/en-us/research/people/hassanam/publications/", "dblp": ";https://dblp.org/pers/z/Zheng:Guoqing.html;;227/6273.html;259/8515;01/1709;http://dblp.uni-trier.de/pers/hd/s/Shokouhi:Milad;147/9148", "google_scholar": "5lgfeo4AAAAJ;aMhUcoMAAAAJ;U2Gz-NIAAAAJ;or203g8AAAAJ;HEAgatAAAAAJ;45DAXkwAAAAJ;SbYANgwAAAAJ;sNGk-9MAAAAJ", "orcid": ";;0000-0002-3683-3739;;0000-0002-2358-4710;0000-0002-2564-8909;;", "linkedin": ";;shashank-gupta-5182bb28/;rgaonkar/;chongyang-gao-685597116/;;;ahmed-hassan-awadallah-a355a27/", "or_profile": "~Ruibo_Liu1;~Guoqing_Zheng1;~Shashank_Gupta3;~Radhika_Gaonkar1;~Chongyang_Gao1;~Soroush_Vosoughi1;~Milad_Shokouhi1;~Ahmed_Hassan_Awadallah1", "aff": "Dartmouth College;Microsoft Research;Microsoft;Microsoft;Northwestern University;Dartmouth College;Microsoft;Microsoft Research", "aff_domain": "dartmouth.edu;microsoft.com;microsoft.com;microsoft.com;northwestern.edu;dartmouth.edu;;microsoft.com", "position": "PhD student;Researcher;Scientist;Researcher;PhD student;Assistant Professor;Principal Applied Researcher;Principal Researcher", "bibtex": "@inproceedings{\nliu2022knowledge,\ntitle={Knowledge Infused Decoding},\nauthor={Ruibo Liu and Guoqing Zheng and Shashank Gupta and Radhika Gaonkar and Chongyang Gao and Soroush Vosoughi and Milad Shokouhi and Ahmed Hassan Awadallah},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=upnDJ7itech}\n}", "github": "",
"project": "", "reviewers": "zdRT;Kmpi;RDPE;wRJi", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "101;160;186;109", "wc_summary_review": "66;38;36;81", "wc_main_review": "183;680;204;384", "wc_review": "350;878;426;574", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "496;991;443;463", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 139.0, 35.33411948810951 ], "wc_summary_review_avg": [ 55.25, 19.01808349965895 ], "wc_main_review_avg": [ 362.75, 199.12982574190136 ], "wc_review_avg": [ 557.0, 202.07671810478317 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 598.25, 227.54271577002856 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.7608859102526822, "corr_recommendation_correctness": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5121405141535448243&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=upnDJ7itech", "email": "dartmouth.edu;microsoft.com;microsoft.com;microsoft.com;northwestern.edu;dartmouth.edu;;microsoft.com", "author_num": 8, "aff_unique_index": "0;1;1;1;2;0;1;1", "aff_unique_norm": "Dartmouth College;Microsoft;Northwestern University", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://www.dartmouth.edu;https://www.microsoft.com/en-us/research;https://www.northwestern.edu", "aff_unique_abbr": "Dartmouth;MSR;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Is Importance Weighting Incompatible with Interpolating Classifiers?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7198", "id": "uqBOne3LUKy", "poster": "", "openreview": "https://openreview.net/forum?id=uqBOne3LUKy", "slides": "https://iclr.cc/virtual/2022/poster/7198", "video": "https://iclr.cc/virtual/2022/poster/7198", "author_site": "Ke Wang, Niladri Chatterji, Saminul Haque, Tatsunori Hashimoto", "tldr": "", "abstract": "Importance weighting is a classic technique to handle distribution shifts. However, prior work has presented strong empirical and theoretical evidence demonstrating that importance weights can have little to no effect on overparameterized neural networks. \\emph{Is importance weighting truly incompatible with the training of overparameterized neural networks?} Our paper answers this in the negative. We show that importance weighting fails not because of the overparameterization, but instead, as a result of using exponentially-tailed losses like the logistic or cross-entropy loss. As a remedy, we show that polynomially-tailed losses restore the effects of importance reweighting in correcting distribution shift in overparameterized models. We characterize the behavior of gradient descent on importance weighted polynomially-tailed losses with overparameterized linear models, and theoretically demonstrate the advantage of using polynomially-tailed losses in a label shift setting. 
Surprisingly, our theory shows that using weights that are obtained by exponentiating the classical unbiased importance weights can improve performance. Finally, we demonstrate the practical value of our analysis with neural network experiments on a subpopulation shift and a label shift dataset. When reweighted, our loss function can outperform reweighted cross-entropy by as much as 9\\% in test accuracy. Our loss function also gives test accuracies comparable to, or even exceeding, well-tuned state-of-the-art methods for correcting distribution shifts.", "keywords": "overparameterization;distribution shifts;importance weighting;implicit bias;generalization analysis;interpolation", "primary_area": "", "supplementary_material": "", "author": "Ke Alexander Wang;Niladri Shekhar Chatterji;Saminul Haque;Tatsunori Hashimoto", "authorids": "~Ke_Alexander_Wang1;~Niladri_Shekhar_Chatterji1;~Saminul_Haque1;~Tatsunori_Hashimoto1", "gender": "M;M;;M", "homepage": ";;;https://thashim.github.io", "dblp": "238/0269.html;203/8783;252/5821;", "google_scholar": ";;;5ygiTwsAAAAJ", "orcid": ";;;", "linkedin": ";;saminulh/;", "or_profile": "~Ke_Alexander_Wang1;~Niladri_Shekhar_Chatterji1;~Saminul_Haque1;~Tatsunori_Hashimoto1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Postdoc;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2022is,\ntitle={Is Importance Weighting Incompatible with Interpolating Classifiers?},\nauthor={Ke Alexander Wang and Niladri Shekhar Chatterji and Saminul Haque and Tatsunori Hashimoto},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uqBOne3LUKy}\n}", "github": "", "project": "", "reviewers": "AUFR;DWSu;46YL;s5yg", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;3", "correctness": "4;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "88;122;44;62", "wc_summary_review": "66;149;23;22", "wc_main_review": "275;570;224;120", "wc_review": "429;841;291;204", "wc_reply_reviewers": "63;0;0;0", "wc_reply_authors": "349;269;368;241", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;2;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.0, 29.34280150224242 ], "wc_summary_review_avg": [ 65.0, 51.647846034466916 ], "wc_main_review_avg": [ 297.25, 167.08586864244384 ], "wc_review_avg": [ 441.25, 244.34235715487398 ], "wc_reply_reviewers_avg": [ 15.75, 27.279800219209818 ], "wc_reply_authors_avg": [ 306.75, 53.11485197192966 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9271726499455306, "corr_recommendation_correctness": -0.9271726499455306, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5476028930081234281&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=uqBOne3LUKy", "email": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", 
"aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "uut_j3UrRCg", "title": "Provable hierarchical lifelong learning with a sketch-based modular architecture", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a modular architecture for lifelong learning of hierarchically structured tasks. Specifically, we prove that our architecture is theoretically able to learn tasks that can be solved by functions that are learnable given access to functions for other, previously learned tasks as subroutines. We show that some tasks that we can learn in this way are not learned by standard training methods in practice; indeed, prior work suggests that some such tasks cannot be learned by \\emph{any} efficient method without the aid of the simpler tasks. We also consider methods for identifying the tasks automatically, without relying on explicitly given indicators.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/eee42f2b6b44d8b70936796a385c62df25bbb83f.zip", "author": "Rina Panigrahy;Brendan Juba;Zihao Deng;Xin Wang;Zee Fryer", "authorids": "~Rina_Panigrahy1;~Brendan_Juba1;~Zihao_Deng1;~Xin_Wang30;~Zee_Fryer1", "gender": ";M;;M;Non-Binary", "homepage": ";http://www.cse.wustl.edu/~bjuba/;;;", "dblp": "p/RinaPanigrahy;62/6079;188/6173;;", "google_scholar": ";https://scholar.google.com.tw/citations?user=5wppdUoAAAAJ;;7BjA8ccAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;zee-fryer/", "or_profile": "~Rina_Panigrahy1;~Brendan_Juba1;~Zihao_Deng1;~Xin_Wang30;~Zee_Fryer1", "aff": "Google;Washington University in St. Louis;Washington University, St. 
Louis;Google;Google", "aff_domain": "google.com;cse.wustl.edu;wustl.edu;google.com;google.com", "position": "Research Scientist;Associate Professor;PhD student;Software Engineer;Researcher", "bibtex": "@misc{\npanigrahy2022provable,\ntitle={Provable hierarchical lifelong learning with a sketch-based modular architecture},\nauthor={Rina Panigrahy and Brendan Juba and Zihao Deng and Xin Wang and Zee Fryer},\nyear={2022},\nurl={https://openreview.net/forum?id=uut_j3UrRCg}\n}", "github": "", "project": "", "reviewers": "RKp7;kTWq;yxwW;GsU1", "site": "https://openreview.net/forum?id=uut_j3UrRCg", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;2;3;2", "correctness": "3;2;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "47;29;94;42", "wc_summary_review": "119;104;104;3", "wc_main_review": "1400;515;338;221", "wc_review": "1566;648;536;266", "wc_reply_reviewers": "477;266;0;35", "wc_reply_authors": "906;839;102;394", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 2.5, 0.5 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 53.0, 24.566236993076494 ], "wc_summary_review_avg": [ 82.5, 46.30604712129939 ], "wc_main_review_avg": [ 618.5, 463.1795008417363 ], "wc_review_avg": [ 754.0, 488.9396690799387 ], "wc_reply_reviewers_avg": [ 194.5, 192.47662195705743 ], "wc_reply_authors_avg": [ 560.25, 329.7259278552416 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1174169064203271597&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "Google;Washington University in St. Louis", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://wustl.edu", "aff_unique_abbr": "Google;WashU", "aff_campus_unique_index": "0;1;1;0;0", "aff_campus_unique": "Mountain View;St. Louis", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "uwnOHjgUrTa", "title": "DNN Quantization with Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Low-bit quantization of network weights and activations can drastically reduce the memory footprint, complexity, energy consumption and latency of Deep Neural Networks (DNNs). Many different quantization methods like min-max quantization, Statistics-Aware Weight Binning (SAWB) or Binary Weight Network (BWN) have been proposed in the past. However, they still cause a considerable accuracy drop, in particular when applied to complex learning tasks or lightweight DNN architectures. In this paper, we propose a novel training procedure that can be used to improve the performance of existing quantization methods. We call this procedure \\textit{DNN Quantization with Attention} (DQA). It relaxes the training problem, using a learnable linear combination of high, medium and low-bit quantization at the beginning, while converging to a single low-bit quantization at the end of the training. We show empirically that this relaxation effectively smooths the loss function and therefore helps convergence. 
Moreover, we conduct experiments and show that our procedure improves the performance of many state-of-the-art quantization methods on various object recognition tasks. In particular, we apply DQA with min-max, SAWB and BWN to train $2$bit quantized DNNs on the CIFAR10, CIFAR100 and ImageNet ILSVRC 2012 datasets, achieving very good accuracy compared to other counterparts.", "keywords": "Deep learning;Computer vision;Quantization", "primary_area": "", "supplementary_material": "", "author": "Ghouthi BOUKLI HACENE;Lukas Mauch;Shubhankar Chowdhury;Stefan Uhlich;Fabien Cardinaux", "authorids": "~Ghouthi_BOUKLI_HACENE2;~Lukas_Mauch1;shubhankar.chowdhury@sony.com;~Stefan_Uhlich1;~Fabien_Cardinaux1", "gender": ";M;;;M", "homepage": ";;;;https://www.linkedin.com/in/fabiencardinaux/", "dblp": ";123/9181;;19/7822;86/627", "google_scholar": "FwjpGsgAAAAJ;ivJ6Tf8AAAAJ;;https://scholar.google.de/citations?user=hja8ejYAAAAJ;https://scholar.google.co.uk/citations?user=UFl8n4gAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Ghouthi_BOUKLI_HACENE2;~Lukas_Mauch1;shubhankar.chowdhury@sony.com;~Stefan_Uhlich1;~Fabien_Cardinaux1", "aff": ";Sony Europe B.V.;;Sony Europe B.V.;Sony Europe BV", "aff_domain": ";sony.com;;sony.com;sony.com", "position": ";Researcher;;Researcher;Research Scientist", "bibtex": "@misc{\nhacene2022dnn,\ntitle={{DNN} Quantization with Attention},\nauthor={Ghouthi BOUKLI HACENE and Lukas Mauch and Shubhankar Chowdhury and Stefan Uhlich and Fabien Cardinaux},\nyear={2022},\nurl={https://openreview.net/forum?id=uwnOHjgUrTa}\n}", "github": "", "project": "", "reviewers": "1Jav;H2zp;UAi4;4bsX", "site": "https://openreview.net/forum?id=uwnOHjgUrTa", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;2;4;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;1;2", "wc_summary_paper": "62;61;115;55", "wc_summary_review": "41;74;58;14", "wc_main_review": "263;160;580;246", "wc_review": "366;295;753;315", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "107;219;142;148", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 73.25, 24.252577182641847 ], "wc_summary_review_avg": [ 46.75, 22.219079638904937 ], "wc_main_review_avg": [ 312.25, 159.44023174845174 ], "wc_review_avg": [ 432.25, 186.98579491501488 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 154.0, 40.663251222694925 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4255715318983242751&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Sony Europe", "aff_unique_dep": "", "aff_unique_url": "https://www.sony.eu", "aff_unique_abbr": "Sony Europe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Unknown" }, { "title": "A New Perspective on \"How Graph Neural Networks Go Beyond Weisfeiler-Lehman?\"", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6436", "id": "uxgg9o7bI_3", "poster": "", "openreview":
"https://openreview.net/forum?id=uxgg9o7bI_3", "slides": "https://iclr.cc/virtual/2022/poster/6436", "video": "https://iclr.cc/virtual/2022/poster/6436", "author_site": "Asiri Wijesinghe, Qing Wang", "tldr": "", "abstract": "We propose a new perspective on designing powerful Graph Neural Networks (GNNs). In a nutshell, this enables a general solution to inject structural properties of graphs into a message-passing aggregation scheme of GNNs. As a theoretical basis, we develop a new hierarchy of local isomorphism on neighborhood subgraphs. Then, we theoretically characterize how message-passing GNNs can be designed to be more expressive than the Weisfeiler Lehman test. To elaborate this characterization, we propose a novel neural model, called GraphSNN, and prove that this model is strictly more expressive than the Weisfeiler Lehman test in distinguishing graph structures. We empirically verify the strength of our model on different graph learning tasks. It is shown that our model consistently improves the state-of-the-art methods on the benchmark tasks without sacrificing computational simplicity and efficiency.", "keywords": "Graph Neural Networks;Graph Isomorphism;Weisfeiler Lehman", "primary_area": "", "supplementary_material": "/attachment/933ba75f12d12c8e12bde0edcad6fc41e3aaf1a4.zip", "author": "Asiri Wijesinghe;Qing Wang", "authorids": "~Asiri_Wijesinghe1;~Qing_Wang14", "gender": "M;F", "homepage": "https://cecs.anu.edu.au/people/asiri-wijesinghe;https://graphlabanu.github.io/website/team/", "dblp": "251/5617;97/6505-2", "google_scholar": "dV4kyHYAAAAJ;GytuLAcAAAAJ", "orcid": "0000-0003-4392-5348;", "linkedin": "asiriwijesinghe/?originalSubdomain=au;", "or_profile": "~Asiri_Wijesinghe1;~Qing_Wang14", "aff": "Australian National University;Australian National University", "aff_domain": "anu.edu.au;anu.edu.au", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nwijesinghe2022a,\ntitle={A New Perspective on ''How Graph Neural Networks Go Beyond Weisfeiler-Lehman?''},\nauthor={Asiri Wijesinghe and Qing Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uxgg9o7bI_3}\n}", "github": "", "project": "", "reviewers": "Z8WJ;HLe3;FAad;csQY", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "3;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "0;3;3;3", "wc_summary_paper": "161;76;79;66", "wc_summary_review": "25;20;38;15", "wc_main_review": "225;365;840;180", "wc_review": "411;461;957;261", "wc_reply_reviewers": "28;7;0;31", "wc_reply_authors": "664;759;373;252", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 95.5, 38.12151623427379 ], "wc_summary_review_avg": [ 24.5, 8.558621384311845 ], "wc_main_review_avg": [ 402.5, 261.64145313768614 ], "wc_review_avg": [ 522.5, 261.4321135591418 ], "wc_reply_reviewers_avg": [ 16.5, 13.275918047351754 ], "wc_reply_authors_avg": [ 512.0, 206.78128542012695 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 126, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12305452598089606699&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=uxgg9o7bI_3", "email": "anu.edu.au;anu.edu.au", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Australian National University", "aff_unique_dep": "", "aff_unique_url": "https://www.anu.edu.au", "aff_unique_abbr": "ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "title": "Learning Fast, Learning Slow: A General Continual Learning Method based on Complementary Learning System", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6559", "id": "uxxFrDwrE7Y", "poster": "", "openreview": "https://openreview.net/forum?id=uxxFrDwrE7Y", "slides": "https://iclr.cc/virtual/2022/poster/6559", "video": "https://iclr.cc/virtual/2022/poster/6559", "author_site": "Elahe Arani, Fahad Sarfraz, Bahram Zonooz", "tldr": "", "abstract": "Humans excel at continually learning from an ever-changing environment whereas it remains a challenge for deep neural networks which exhibit catastrophic forgetting. The complementary learning system (CLS) theory suggests that the interplay between rapid instance-based learning and slow structured learning in the brain is crucial for accumulating and retaining knowledge. Here, we propose CLS-ER, a novel dual memory experience replay (ER) method which maintains short-term and long-term semantic memories that interact with the episodic memory. Our method employs an effective replay mechanism whereby new knowledge is acquired while aligning the decision boundaries with the semantic memories. CLS-ER does not utilize the task boundaries or make any assumption about the distribution of the data which makes it versatile and suited for ``general continual learning''. Our approach achieves state-of-the-art performance on standard benchmarks as well as more realistic general continual learning settings.", "keywords": "Continual Learning;Catastrophic Forgetting;Complementary Learning Systems Theory;Experience Replay", "primary_area": "", "supplementary_material": "/attachment/f5c77b4325f9e09235df4d250d41fda1f1dd3c8c.zip", "author": "Elahe Arani;Fahad Sarfraz;Bahram Zonooz", "authorids": "~Elahe_Arani1;~Fahad_Sarfraz1;~Bahram_Zonooz1", "gender": "F;M;M", "homepage": "https://sites.google.com/view/elahe-arani;https://www.fahadsarfraz.com/;https://sites.google.com/view/bahramzonooz", "dblp": ";250/9424;250/9573", "google_scholar": "e_I_v6cAAAAJ;Zhx_sM4AAAAJ;", "orcid": "0000-0002-0952-7007;;", "linkedin": "elahe-arani-630870b2/;fahadsarfraz/;", "or_profile": "~Elahe_Arani1;~Fahad_Sarfraz1;~Bahram_Zonooz1", "aff": "Advanced Research Lab, NavInfo Europe;Navinfo Europe;Eindhoven University of Technology", "aff_domain": "navinfo.eu;navinfo.eu;tue.nl", "position": "Sr. AI Manager & Sr. 
Research Scientist;Researcher;Assistant Professor", "bibtex": "@inproceedings{\narani2022learning,\ntitle={Learning Fast, Learning Slow: A General Continual Learning Method based on Complementary Learning System},\nauthor={Elahe Arani and Fahad Sarfraz and Bahram Zonooz},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=uxxFrDwrE7Y}\n}", "github": "", "project": "", "reviewers": "MUyX;M4Au;8Zhu;azqK", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;4;5;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;4;2", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "48;121;40;190", "wc_summary_review": "39;44;50;78", "wc_main_review": "91;210;101;232", "wc_review": "178;375;191;500", "wc_reply_reviewers": "0;0;0;179", "wc_reply_authors": "607;654;635;671", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 99.75, 60.91951657720209 ], "wc_summary_review_avg": [ 52.75, 15.08931741332258 ], "wc_main_review_avg": [ 158.5, 63.08129675268257 ], "wc_review_avg": [ 311.0, 134.07647071727388 ], "wc_reply_reviewers_avg": [ 44.75, 77.50927363870726 ], "wc_reply_authors_avg": [ 641.75, 23.763154251908563 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.10259783520851541, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 196, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2178714881439527742&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=uxxFrDwrE7Y", "email": "navinfo.eu;navinfo.eu;tue.nl", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "NavInfo Europe;NavInfo;Eindhoven University of Technology", "aff_unique_dep": "Advanced Research Lab;;", "aff_unique_url": ";;https://www.tue.nl", "aff_unique_abbr": ";;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Unknown;Netherlands" }, { "id": "uy602F8cTrh", "title": "CausalDyna: Improving Generalization of Dyna-style Reinforcement Learning via Counterfactual-Based Data Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep reinforcement learning agents trained in real-world environments with a limited diversity of object properties to learn manipulation tasks tend to suffer overfitting and fail to generalize to unseen testing environments. To improve the agents' ability to generalize to object properties rarely seen or unseen, we propose a data-efficient reinforcement learning algorithm, CausalDyna, that exploits structural causal models (SCMs) to model the state dynamics. The learned SCM enables us to counterfactually reason what would have happened had the object had a different property value. This can help remedy limitations of real-world environments or avoid risky exploration of robots (e.g., heavy objects may damage the robot). We evaluate our algorithm in the CausalWorld robotic-manipulation environment. 
When augmented with counterfactual data, our CausalDyna outperforms the state-of-the-art model-based algorithm MBPO and the model-free algorithm SAC in both sample efficiency (by up to 17%) and generalization (by up to 30%). Code will be made publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Deyao Zhu;Li Erran Li;Mohamed Elhoseiny", "authorids": "~Deyao_Zhu1;~Li_Erran_Li1;~Mohamed_Elhoseiny1", "gender": "M;;M", "homepage": "https://tsutikgiau.github.io/;http://www.cs.columbia.edu/~lierranli/;http://www.mohamed-elhoseiny.com", "dblp": "251/6017;l/ErranLLi.html;125/2894", "google_scholar": "dENNKrsAAAAJ;GkMfzy4AAAAJ;iRBUTOAAAAAJ", "orcid": ";;0000-0001-9659-1551", "linkedin": "deyao-zhu-205774154/;;mohamed-elhoseiny-8a836215/", "or_profile": "~Deyao_Zhu1;~Li_Erran_Li1;~Mohamed_Elhoseiny1", "aff": "KAUST;Columbia University;KAUST", "aff_domain": "kaust.edu.sa;columbia.edu;kaust.edu.sa", "position": "PhD student;Adjunct Professor;Associate Professor", "bibtex": "@misc{\nzhu2022causaldyna,\ntitle={CausalDyna: Improving Generalization of Dyna-style Reinforcement Learning via Counterfactual-Based Data Augmentation},\nauthor={Deyao Zhu and Li Erran Li and Mohamed Elhoseiny},\nyear={2022},\nurl={https://openreview.net/forum?id=uy602F8cTrh}\n}", "github": "", "project": "", "reviewers": "VSvt;CWLF;cJzu;RJye", "site": "https://openreview.net/forum?id=uy602F8cTrh", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "5;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "46;127;71;48", "wc_summary_review": "42;107;32;8", "wc_main_review": "369;458;193;86", "wc_review": "457;692;296;142", "wc_reply_reviewers": "50;47;0;0", "wc_reply_authors": "742;469;204;173", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 73.0, 32.687918257362305 ], "wc_summary_review_avg": [ 47.25, 36.64270050091832 ], "wc_main_review_avg": [ 276.5, 145.56871229766375 ], "wc_review_avg": [ 396.75, 203.62388735116517 ], "wc_reply_reviewers_avg": [ 24.25, 24.27318479310039 ], "wc_reply_authors_avg": [ 397.0, 230.018477518655 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14139496818072362510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Columbia University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaust.edu.sa;https://www.columbia.edu", "aff_unique_abbr": "KAUST;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Saudi Arabia;United States" }, { "id": "uydP1ykieNv", "title": "Ensemble-in-One: Learning Ensemble within Random Gated Networks for Enhanced Adversarial Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial attacks have threatened modern deep learning systems by crafting adversarial examples with small perturbations to fool the convolutional neural networks (CNNs). 
Ensemble training methods are promising for facilitating better adversarial robustness by diversifying the vulnerabilities among the sub-models, while simultaneously maintaining accuracy comparable to standard training. Previous practices also demonstrate that enlarging the ensemble can improve the robustness. However, existing ensemble methods have poor scalability, owing to the rapid increase in complexity as more sub-models are included in the ensemble. Moreover, it is usually infeasible to train or deploy an ensemble with many sub-models, owing to the tight hardware resource budget and latency requirements. In this work, we propose Ensemble-in-One (EIO), a simple but effective method to enlarge the ensemble within a random gated network (RGN). EIO augments the original model by replacing the parameterized layers with multi-path random gated blocks (RGBs) to construct an RGN. By diversifying the vulnerability of the numerous paths through the super-net, it provides high scalability because the number of paths within an RGN increases exponentially with the network depth. Our experiments demonstrate that EIO consistently outperforms previous ensemble training methods with even less computational overhead, while simultaneously achieving better accuracy-robustness trade-offs than adversarial training. ", "keywords": "Adversarial robustness;ensemble learning;random gated network;parameter sharing", "primary_area": "", "supplementary_material": "/attachment/2eb74a4071d9cc082b1de6b3c8af336490eaa2ce.zip", "author": "Yi Cai;Xuefei Ning;Huazhong Yang;Yu Wang", "authorids": "~Yi_Cai4;~Xuefei_Ning1;~Huazhong_Yang1;~Yu_Wang3", "gender": "M;Not Specified;M;M", "homepage": "https://nicsefc.ee.tsinghua.edu.cn/people/cai-y/;https://nics-effalg.com/ningxuefei/;https://nicsefc.ee.tsinghua.edu.cn;http://web.ee.tsinghua.edu.cn/yanghuazhong/en/index.htm", "dblp": ";202/9525;w/YuWang2.html;94/1128.html", "google_scholar": ";oVslpJsAAAAJ;https://scholar.google.com.hk/citations?user=j8JGVvoAAAAJ;", "orcid": ";;0000-0001-6108-5157;0000-0003-2421-353X", "linkedin": ";;;", "or_profile": "~Yi_Cai4;~Xuefei_Ning1;~Yu_Wang3;~Huazhong_Yang2", "aff": "Tsinghua University;Huawei Technologies Ltd.;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;huawei.com;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Postdoc;Full Professor;Full Professor", "bibtex": "@misc{\ncai2022ensembleinone,\ntitle={Ensemble-in-One: Learning Ensemble within Random Gated Networks for Enhanced Adversarial Robustness},\nauthor={Yi Cai and Xuefei Ning and Huazhong Yang and Yu Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=uydP1ykieNv}\n}", "github": "", "project": "", "reviewers": "vVnY;7WHx;wUHR;a3La", "site": "https://openreview.net/forum?id=uydP1ykieNv", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;3;4", "correctness": "3;3;3;2", "technical_novelty": "3;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "53;41;17;51", "wc_summary_review": "26;26;15;70", "wc_main_review": "290;122;124;277", "wc_review": "369;189;156;398", "wc_reply_reviewers": "0;53;0;93", "wc_reply_authors": "1066;355;751;1144", "reply_reviewers": "0;1;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 40.5, 14.309088021254185 ], "wc_summary_review_avg": [ 34.25, 21.123150806638673 ], "wc_main_review_avg": [ 
203.25, 80.38462228560883 ], "wc_review_avg": [ 278.0, 106.63723552305733 ], "wc_reply_reviewers_avg": [ 36.5, 39.14396505209967 ], "wc_reply_authors_avg": [ 829.0, 310.70645310324664 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14515863979255132196&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Tsinghua University;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com", "aff_unique_abbr": "THU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "uymKrQiVuPg", "title": "Learning From Unpaired Data: A Variational Bayes Approach", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Collecting paired training data is a difficult task in practice, but unpaired samples are broadly available. Thus, current approaches aim at generating synthesized training data from the unpaired samples by exploring the relationship between the corrupted and clean data. In this work, we propose LUD-VAE, a method to learn the joint probability density function from the data sampled from marginal distributions. Our method is based on the variational inference framework and maximizes the evidence lower bound (ELBO), the lower bound of the joint probability density function. Furthermore, we show that the ELBO is computable without paired samples under the inference invariant assumption. This property provides the mathematical rationale of our approach in the unpaired setting. Finally, we apply our method to the real-world image denoising and super-resolution tasks and train the models using the synthetic data generated by the LUD-VAE. 
Experimental results on four datasets validate the advantages of our method over other learnable approaches.", "keywords": "Unpaired degradation modeling;Variational auto-encoder;Real-world image restoration", "primary_area": "", "supplementary_material": "", "author": "Dihan Zheng;Xiaowen Zhang;Kaisheng Ma;Chenglong Bao", "authorids": "~Dihan_Zheng1;~Xiaowen_Zhang2;~Kaisheng_Ma1;~Chenglong_Bao3", "gender": "M;;M;M", "homepage": "https://scholar.google.com/citations?user=hktsrwYAAAAJ&hl=zh-CN;https://www.xiaowen;http://group.iiis.tsinghua.edu.cn/~maks/index.html;https://matbc.github.io/", "dblp": "295/5394;;133/4053.html;", "google_scholar": "hktsrwYAAAAJ;;VtDpVoEAAAAJ;", "orcid": ";;0000-0001-9226-3366;", "linkedin": ";;;", "or_profile": "~Dihan_Zheng1;~Xiaowen_Zhang2;~Kaisheng_Ma1;~Chenglong_Bao3", "aff": "Tsinghua University;Hisilicon;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;hisilicon.com;;tsinghua.edu.cn", "position": "PhD student;Researcher;;Assistant Professor", "bibtex": "@misc{\nzheng2022learning,\ntitle={Learning From Unpaired Data: A Variational Bayes Approach},\nauthor={Dihan Zheng and Xiaowen Zhang and Kaisheng Ma and Chenglong Bao},\nyear={2022},\nurl={https://openreview.net/forum?id=uymKrQiVuPg}\n}", "github": "", "project": "", "reviewers": "aL9w;7QhZ;ZHPS", "site": "https://openreview.net/forum?id=uymKrQiVuPg", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "4;3;2", "correctness": "1;2;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "134;111;108", "wc_summary_review": "87;62;31", "wc_main_review": "1845;327;64", "wc_review": "2066;500;203", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 1.247219128924647 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 117.66666666666667, 11.61416759345623 ], "wc_summary_review_avg": [ 60.0, 22.90560339014597 ], "wc_main_review_avg": [ 745.3333333333334, 784.9595884856465 ], "wc_review_avg": [ 923.0, 817.2673981017474 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8660254037844385, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AfmtLXU3q48J:scholar.google.com/&scioq=Learning+From+Unpaired+Data:+A+Variational+Bayes+Approach&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tsinghua University;Huawei", "aff_unique_dep": ";Huawei Hisilicon", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com/en/flowers/hisilicon", "aff_unique_abbr": "THU;HiSilicon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "v-27phh2c8O", "title": "AARL: Automated Auxiliary Loss for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "A good state representation is crucial to reinforcement learning (RL) while an ideal representation is hard to learn only with signals from the RL objective. 
Thus, many recent works manually design auxiliary losses to improve sample efficiency and decision performance. However, handcrafted auxiliary losses rely heavily on expert knowledge, and therefore lack scalability and can be suboptimal for boosting RL performance. In this work, we introduce Automated Auxiliary loss for Reinforcement Learning (AARL), a principled approach that automatically searches for the optimal auxiliary loss function for RL. Specifically, based on the collected trajectory data, we define a general auxiliary loss space of size $4.6\\times10^{19}$ and explore the space with an efficient evolutionary search strategy. We evaluate AARL on the DeepMind Control Suite and show that the searched auxiliary losses have significantly improved RL performance in both pixel-based and state-based settings, with the largest performance gain observed in the most challenging tasks. AARL greatly outperforms state-of-the-art methods and demonstrates strong generalization ability in unseen domains and tasks. We further conduct extensive studies to shed light on the effectiveness of auxiliary losses in RL. ", "keywords": "Reinforcement learning;Representation learning;Auxiliary Loss", "primary_area": "", "supplementary_material": "", "author": "Tairan He;Yuge Zhang;Kan Ren;Che Wang;Weinan Zhang;Dongsheng Li;Yuqing Yang", "authorids": "~Tairan_He1;~Yuge_Zhang1;~Kan_Ren1;~Che_Wang1;~Weinan_Zhang1;~Dongsheng_Li2;~Yuqing_Yang1", "gender": "M;M;M;M;M;M;", "homepage": "https://tairanhe.com;;https://saying.ren;https://watchernyu.github.io/me/;http://wnzhang.net;http://recmind.cn;", "dblp": "263/2891.html;256/1146;28/7458;130/6621;28/10261-1;254/0830-2.html;91/9064-1.html", "google_scholar": "TVWH2U8AAAAJ;kCQdkrQAAAAJ;USnQVWgAAAAJ;cx_Kg8MAAAAJ;Qzss0GEAAAAJ;VNg5rA8AAAAJ;4BtNQAEAAAAJ", "orcid": ";;;;0000-0002-0127-2425;0000-0003-3103-8442;0000-0003-3518-5212", "linkedin": "tairan-he-41a904294/;;;;;;", "or_profile": "~Tairan_He1;~Yuge_Zhang1;~Kan_Ren1;~Che_Wang1;~Weinan_Zhang1;~Dongsheng_Li2;~Yuqing_Yang1", "aff": "Microsoft;Microsoft;Microsoft;New York University;Shanghai Jiaotong University;Microsoft Research Asia;Microsoft Research", "aff_domain": "microsoft.com;microsoft.com;microsoft.com;nyu.edu;sjtu.edu.cn;microsoft.com;research.microsoft.com", "position": "Intern;Research SDE;Researcher;PhD student;Associate Professor;Principal Researcher;Researcher", "bibtex": "@misc{\nhe2022aarl,\ntitle={{AARL}: Automated Auxiliary Loss for Reinforcement Learning},\nauthor={Tairan He and Yuge Zhang and Kan Ren and Che Wang and Weinan Zhang and Dongsheng Li and Yuqing Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=v-27phh2c8O}\n}", "github": "", "project": "", "reviewers": "xAxX;fPxR;Fvi1;fH26", "site": "https://openreview.net/forum?id=v-27phh2c8O", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "50;74;167;69", "wc_summary_review": "33;52;100;27", "wc_main_review": "360;427;379;287", "wc_review": "443;553;646;383", "wc_reply_reviewers": "0;235;0;0", "wc_reply_authors": "728;929;510;556", "reply_reviewers": "0;1;0;0", "reply_authors": "3;2;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 90.0, 45.348649373492925 ], "wc_summary_review_avg": [ 53.0, 
28.661821295933027 ], "wc_main_review_avg": [ 363.25, 50.34071413875652 ], "wc_review_avg": [ 506.25, 101.12708588701645 ], "wc_reply_reviewers_avg": [ 58.75, 101.75798494467153 ], "wc_reply_authors_avg": [ 680.75, 164.75644903918027 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5555555555555555, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15285773203190292169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2;0;0", "aff_unique_norm": "Microsoft;New York University;Shanghai Jiao Tong University", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.nyu.edu;https://www.sjtu.edu.cn", "aff_unique_abbr": "Microsoft;NYU;SJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;1;1;0", "aff_country_unique": "United States;China" }, { "id": "v-f7ifhKYps", "title": "Maximum Entropy Population Based Training for Zero-Shot Human-AI Coordination", "track": "main", "status": "Reject", "tldr": "", "abstract": "An AI agent should be able to coordinate with humans to solve tasks. We consider the problem of training a Reinforcement Learning (RL) agent without using any human data, i.e., in a zero-shot setting, to make it capable of collaborating with humans. Standard RL agents learn through self-play. Unfortunately, these agents only know how to collaborate with themselves and normally do not perform well with unseen partners, such as humans. How to train a robust agent in a zero-shot fashion is still an open research question. Motivated by maximum entropy RL, we derive a centralized population entropy objective to facilitate learning of a diverse population of agents, which is later used to train a robust AI agent to collaborate with unseen partners. The proposed method shows its effectiveness compared to baseline methods, including self-play PPO, the standard Population-Based Training (PBT), and trajectory diversity-based PBT, in the popular Overcooked game environment. 
We also conduct online experiments with real humans and further demonstrate the efficacy of the method in the real world.", "keywords": "Human-AI Coordination;Reinforcement Learning;Zero-Shot Human-AI Coordination;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/5625cc3bb5ce718fd6e6d4a223db03bce3b7dc6b.zip", "author": "Rui Zhao;Jinming Song;Hu Haifeng;Yang Gao;Yi Wu;Zhongqian Sun;Yang Wei", "authorids": "~Rui_Zhao1;joshuasong@tencent.com;~Hu_Haifeng1;~Yang_Gao1;~Yi_Wu1;sallensun@tencent.com;~Yang_Wei2", "gender": "M;;M;M;M;;M", "homepage": "https://ruizhaogit.github.io;;;http://yang-gao.weebly.com;https://jxwuyi.weebly.com;;", "dblp": "26/2578-11;;;89/4402-29;;;03/1094-32.html", "google_scholar": "N1yNDnQAAAAJ;;IkmKanYAAAAJ;https://scholar.google.com/citations?hl=en;dusV5HMAAAAJ;;", "orcid": ";;;;;;", "linkedin": "rui-zhao-profile/;;;yang-gao-45245348/;;;", "or_profile": "~Rui_Zhao1;joshuasong@tencent.com;~Hu_Haifeng1;~Yang_Gao1;~Yi_Wu1;sallensun@tencent.com;~Yang_Wei2", "aff": "Tencent AI Lab;;Tencent AI Platform;Tsinghua University;Tsinghua University;;Tencent AI Lab", "aff_domain": "tencent.com;;tencent.com;tsinghua.edu.cn;tsinghua.edu.cn;;tencent.com", "position": "Researcher;;Researcher;Assistant Professor;Assistant Professor;;Researcher", "bibtex": "@misc{\nzhao2022maximum,\ntitle={Maximum Entropy Population Based Training for Zero-Shot Human-{AI} Coordination},\nauthor={Rui Zhao and Jinming Song and Hu Haifeng and Yang Gao and Yi Wu and Zhongqian Sun and Yang Wei},\nyear={2022},\nurl={https://openreview.net/forum?id=v-f7ifhKYps}\n}", "github": "", "project": "", "reviewers": "mavT;PRq2;MRtf;1grX;5ykj", "site": "https://openreview.net/forum?id=v-f7ifhKYps", "pdf_size": 0, "recommendation": "3;3;5;6;6", "confidence": "5;3;4;3;3", "correctness": "3;3;2;4;3", "technical_novelty": "2;3;2;3;3", "empirical_novelty": "2;3;3;2;2", "wc_summary_paper": "112;174;44;71;64", "wc_summary_review": "41;101;56;22;67", "wc_main_review": "1231;441;555;286;664", "wc_review": "1384;716;655;379;795", "wc_reply_reviewers": "431;398;137;0;331", "wc_reply_authors": "810;446;396;227;348", "reply_reviewers": "1;1;1;0;1", "reply_authors": "1;1;2;1;1", "recommendation_avg": [ 4.6, 1.3564659966250536 ], "confidence_avg": [ 3.6, 0.8 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 93.0, 46.14758931948667 ], "wc_summary_review_avg": [ 57.4, 26.522443326360413 ], "wc_main_review_avg": [ 635.4, 323.0768329670204 ], "wc_review_avg": [ 785.8, 330.27104020788744 ], "wc_reply_reviewers_avg": [ 259.4, 164.95647910888496 ], "wc_reply_authors_avg": [ 445.4, 196.24433749792627 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.2, 0.4 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.51604684654214, "corr_recommendation_correctness": 0.23312620206007845, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15588872748076271403&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Tencent;Tsinghua University", "aff_unique_dep": "Tencent AI Lab;", "aff_unique_url": "https://ai.tencent.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Tencent AI Lab;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "NASI: Label- and 
Data-agnostic Neural Architecture Search at Initialization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6257", "id": "v-v1cpNNK_v", "poster": "", "openreview": "https://openreview.net/forum?id=v-v1cpNNK_v", "slides": "https://iclr.cc/virtual/2022/poster/6257", "video": "https://iclr.cc/virtual/2022/poster/6257", "author_site": "Yao Shu, shaofeng cai, Zhongxiang Dai, Beng Chin Ooi, Bryan Kian Hsiang Low", "tldr": "", "abstract": "Recent years have witnessed a surging interest in Neural Architecture Search (NAS). Various algorithms have been proposed to improve the search efficiency and effectiveness of NAS, i.e., to reduce the search cost and improve the generalization performance of the selected architectures, respectively. However, the search efficiency of these algorithms is severely limited by the need for model training during the search process. To overcome this limitation, we propose a novel NAS algorithm called NAS at Initialization (NASI) that exploits the capability of the Neural Tangent Kernel to characterize the performance of candidate architectures at initialization, hence allowing model training to be completely avoided, which boosts the search efficiency. Besides the improved search efficiency, NASI also achieves competitive search effectiveness on various datasets like CIFAR-10/100 and ImageNet. Further, NASI is shown to be label- and data-agnostic under mild conditions, which guarantees the transferability of architectures selected by our NASI over different datasets.", "keywords": "Neural Architecture Search;Initialization;Label- and Data-agnostic;Transferability;Neural Tangent Kernel", "primary_area": "", "supplementary_material": "/attachment/5ef5424b94cc42911969653348a81b60d3154928.zip", "author": "Yao Shu;Shaofeng Cai;Zhongxiang Dai;Beng Chin Ooi;Bryan Kian Hsiang Low", "authorids": "~Yao_Shu1;~Shaofeng_Cai1;~Zhongxiang_Dai1;~Beng_Chin_Ooi1;~Bryan_Kian_Hsiang_Low1", "gender": "M;M;M;M;M", "homepage": "https://yao.notion.site;https://solopku.github.io/;https://daizhongxiang.github.io/;http://www.comp.nus.edu.sg/~ooibc/;http://www.comp.nus.edu.sg/~lowkh", "dblp": "44/1338;228/6099;172/4968;o/BengChinOoi;97/4877", "google_scholar": "https://scholar.google.com.au/citations?hl=en;Nzr-hIoAAAAJ;1v8xOIYAAAAJ;https://scholar.google.com.tw/citations?user=9560QjYAAAAJ;https://scholar.google.com.tw/citations?user=2P-Q09UAAAAJ", "orcid": ";0000-0001-8605-076X;;0000-0003-4446-1100;", "linkedin": "yao-shu-a5640514b;;;beng-chin-ooi-34b0634/;", "or_profile": "~Yao_Shu1;~Shaofeng_Cai1;~Zhongxiang_Dai1;~Beng_Chin_Ooi1;~Bryan_Kian_Hsiang_Low1", "aff": "National University of Singapore;National University of Singapore;National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "nus.edu.sg;comp.nus.edu.sg;nus.edu.sg;comp.nus.edu.sg;nus.edu.sg", "position": "PhD student;Researcher;Postdoc;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nshu2022nasi,\ntitle={{NASI}: Label- and Data-agnostic Neural Architecture Search at Initialization},\nauthor={Yao Shu and Shaofeng Cai and Zhongxiang Dai and Beng Chin Ooi and Bryan Kian Hsiang Low},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=v-v1cpNNK_v}\n}", "github": "", "project": "", "reviewers": "DrxZ;2dLC;iQzj;PjYZ", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;4;3;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;4;3", "empirical_novelty": 
"3;3;3;3", "wc_summary_paper": "24;73;86;114", "wc_summary_review": "53;41;30;41", "wc_main_review": "104;240;347;141", "wc_review": "181;354;463;296", "wc_reply_reviewers": "0;40;203;40", "wc_reply_authors": "326;880;1311;458", "reply_reviewers": "0;1;1;1", "reply_authors": "2;2;4;2", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 74.25, 32.575872973720905 ], "wc_summary_review_avg": [ 41.25, 8.13557004763649 ], "wc_main_review_avg": [ 208.0, 94.40603794249603 ], "wc_review_avg": [ 323.5, 101.80004911590171 ], "wc_reply_reviewers_avg": [ 70.75, 78.08128777114271 ], "wc_reply_authors_avg": [ 743.75, 386.16859983691063 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4114516442966769765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=v-v1cpNNK_v", "email": "nus.edu.sg;comp.nus.edu.sg;nus.edu.sg;comp.nus.edu.sg;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "id": "v3LXWP63qOZ", "title": "Learning Minimal Representations with Model Invariance", "track": "main", "status": "Reject", "tldr": "", "abstract": "Sparsity has been identified as an important characteristic in learning neural networks that generalize well, forming the key idea in constructing minimal representations. Minimal representations are ones that only encode information required to predict well on a task and nothing more. In this paper we present a powerful approach to learning minimal representations. Our method, called ModInv or model invariance, argues for learning using multiple predictors and a single representation, creating a bottleneck architecture. Predictors' learning landscapes are diversified by training independently and with different learning rates. The common representation acts as a implicit invariance objective to avoid the different spurious correlations captured by individual predictors. This in turn leads to better generalization performance. ModInv is tested on both the Reinforcement Learning and the Self-supervised Learning settings, showcasing strong performance boosts in both. It is extremely simple to implement, does not lead to any delay in walk clock times while training, and can be applied across different problem settings. ", "keywords": "Representation Learning;Minimal Representations;Reinforcement Learning;Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Manan Tomar;Amy Zhang;Matthew E. 
Taylor", "authorids": "~Manan_Tomar1;~Amy_Zhang1;~Matthew_E._Taylor2", "gender": "M;F;M", "homepage": "https://manantomar.github.io/;;https://irll.ca", "dblp": "241/6227;43/2754;46/4287.html", "google_scholar": ";;edQgLXcAAAAJ", "orcid": ";;0000-0001-8946-0211", "linkedin": ";;", "or_profile": "~Manan_Tomar1;~Amy_Zhang2;~Matthew_Taylor1", "aff": "Microsoft;University of California, Berkeley;Washington State University, Pullman", "aff_domain": "microsoft.com;berkeley.edu;wsu.edu", "position": "Intern;Postdoc;Adjunct Professor", "bibtex": "@misc{\ntomar2022learning,\ntitle={Learning Minimal Representations with Model Invariance},\nauthor={Manan Tomar and Amy Zhang and Matthew E. Taylor},\nyear={2022},\nurl={https://openreview.net/forum?id=v3LXWP63qOZ}\n}", "github": "", "project": "", "reviewers": "LtXH;vnKu;8QAk;Q19W", "site": "https://openreview.net/forum?id=v3LXWP63qOZ", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "1;1;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "50;108;174;63", "wc_summary_review": "60;93;95;60", "wc_main_review": "336;534;726;399", "wc_review": "446;735;995;522", "wc_reply_reviewers": "49;0;439;0", "wc_reply_authors": "351;495;541;198", "reply_reviewers": "1;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.0, 1.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 98.75, 48.483889076681955 ], "wc_summary_review_avg": [ 77.0, 17.014699527173555 ], "wc_main_review_avg": [ 498.75, 149.434559255883 ], "wc_review_avg": [ 674.5, 213.21878435072273 ], "wc_reply_reviewers_avg": [ 122.0, 184.11002145456396 ], "wc_reply_authors_avg": [ 396.25, 134.21507925713863 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:M8nZGyxgf9sJ:scholar.google.com/&scioq=Learning+Minimal+Representations+with+Model+Invariance&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Microsoft;University of California, Berkeley;Washington State University", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.berkeley.edu;https://wsu.edu", "aff_unique_abbr": "Microsoft;UC Berkeley;WSU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Pullman", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Chunked Autoregressive GAN for Conditional Waveform Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6816", "id": "v3aeIsY_vVX", "poster": "", "openreview": "https://openreview.net/forum?id=v3aeIsY_vVX", "slides": "https://iclr.cc/virtual/2022/poster/6816", "video": "https://iclr.cc/virtual/2022/poster/6816", "author_site": "Max Morrison, Rithesh Kumar, Kundan Kumar, Prem Seetharaman, Aaron Courville, Yoshua Bengio", "tldr": "", "abstract": "Conditional waveform synthesis models learn a distribution of audio waveforms given conditioning such as text, mel-spectrograms, or MIDI. These systems employ deep generative models that model the waveform via either sequential (autoregressive) or parallel (non-autoregressive) sampling. 
Generative adversarial networks (GANs) have become a common choice for non-autoregressive waveform synthesis. However, state-of-the-art GAN-based models produce artifacts when performing mel-spectrogram inversion. In this paper, we demonstrate that these artifacts correspond to an inability of the generator to learn accurate pitch and periodicity. We show that simple pitch and periodicity conditioning is insufficient for reducing this error relative to using autoregression. We discuss the inductive bias that autoregression provides for learning the relationship between instantaneous frequency and phase, and show that this inductive bias holds even when autoregressively sampling large chunks of the waveform during each forward pass. Relative to prior state-of-the-art GAN-based models, our proposed model, Chunked Autoregressive GAN (CARGAN), reduces pitch error by 40-60%, reduces training time by 58%, maintains a fast inference speed suitable for real-time or interactive applications, and maintains or improves subjective quality.", "keywords": "audio generation;speech synthesis;deep learning;generative models;autoregression;generative adversarial networks", "primary_area": "", "supplementary_material": "/attachment/ec88c411918d8107aad13556f6a935a5a5d76484.zip", "author": "Max Morrison;Rithesh Kumar;Kundan Kumar;Prem Seetharaman;Aaron Courville;Yoshua Bengio", "authorids": "~Max_Morrison2;~Rithesh_Kumar1;~Kundan_Kumar1;~Prem_Seetharaman1;~Aaron_Courville3;~Yoshua_Bengio1", "gender": "M;M;M;M;;M", "homepage": "https://www.maxrmorrison.com/;;http://kundan2510.github.io;https://pseeth.github.io;;http://yoshuabengio.org", "dblp": "252/5390;192/1862;;;56/1688;56/953", "google_scholar": "DfjXyrEAAAAJ;https://scholar.google.ca/citations?user=hJjeVsQAAAAJ;;XHD-48cAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;kukA0LcAAAAJ", "orcid": "0000-0002-6082-5157;;;;;", "linkedin": "morrimax/;rithesh-kumar-b0479488/;https://ca.linkedin.com/in/kundan-kumar-50a0a361;;;yoshuabengio/?originalSubdomain=ca", "or_profile": "~Max_Morrison2;~Rithesh_Kumar1;~Kundan_Kumar1;~Prem_Seetharaman1;~Aaron_Courville3;~Yoshua_Bengio1", "aff": "Northwestern University;Descript Inc.;University of Montreal;Descript;Universit\u00e9 de Montr\u00e9al;University of Montreal", "aff_domain": "northwestern.edu;descript.com;umontreal.ca;descript.com; ;umontreal.ca", "position": "PhD student;Researcher;PhD student;Research Scientist;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nmorrison2022chunked,\ntitle={Chunked Autoregressive {GAN} for Conditional Waveform Synthesis},\nauthor={Max Morrison and Rithesh Kumar and Kundan Kumar and Prem Seetharaman and Aaron Courville and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=v3aeIsY_vVX}\n}", "github": "", "project": "", "reviewers": "Zbrk;agkP;dhJa", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;4", "correctness": "3;4;4", "technical_novelty": "2;4;3", "empirical_novelty": "3;4;4", "wc_summary_paper": "167;118;119", "wc_summary_review": "76;15;23", "wc_main_review": "640;313;132", "wc_review": "883;446;274", "wc_reply_reviewers": "137;108;24", "wc_reply_authors": "1000;783;308", "reply_reviewers": "1;1;1", "reply_authors": "2;2;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], 
"empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 134.66666666666666, 22.866763848189994 ], "wc_summary_review_avg": [ 38.0, 27.067816067549053 ], "wc_main_review_avg": [ 361.6666666666667, 210.22580452667768 ], "wc_review_avg": [ 534.3333333333334, 256.34915425818923 ], "wc_reply_reviewers_avg": [ 89.66666666666667, 47.91891299082464 ], "wc_reply_authors_avg": [ 697.0, 288.9786612652683 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12411331012561904832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=v3aeIsY_vVX", "email": "northwestern.edu;descript.com;umontreal.ca;descript.com; ;umontreal.ca", "author_num": 6, "aff_unique_index": "0;1;2;3;4;2", "aff_unique_norm": "Northwestern University;Descript Inc.;University of Montreal;Descript;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.northwestern.edu;https://www.descript.com;https://wwwumontreal.ca;https://www.descript.com;https://www.umontreal.ca", "aff_unique_abbr": "NU;;UM;Descript;UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Do Users Benefit From Interpretable Vision? A User Study, Baseline, And Dataset", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5894", "id": "v6s3HVjPerv", "poster": "", "openreview": "https://openreview.net/forum?id=v6s3HVjPerv", "slides": "https://iclr.cc/virtual/2022/poster/5894", "video": "https://iclr.cc/virtual/2022/poster/5894", "author_site": "Leon Sixt, Martin Schuessler, Oana-Iuliana Popescu, Philipp Wei\u00df, Tim Landgraf", "tldr": "", "abstract": "A variety of methods exist to explain image classification models. However, whether they provide any benefit to users over simply comparing various inputs and the model\u2019s respective predictions remains unclear. We conducted a user study (N=240) to test how such a baseline explanation technique performs against concept-based and counterfactual explanations. To this end, we contribute a synthetic dataset generator capable of biasing individual attributes and quantifying their relevance to the model. In a study, we assess if participants can identify the relevant set of attributes compared to the ground-truth. Our results show that the baseline outperformed concept-based explanations. Counterfactual explanations from an invertible neural network performed similarly as the baseline. Still, they allowed users to identify some attributes more accurately. Our results highlight the importance of measuring how well users can reason about biases of a model, rather than solely relying on technical evaluations or proxy tasks. 
We open-source our study and dataset so they can serve as a blueprint for future studies.", "keywords": "Interpretable ML;User Study;Human Subject Evaluation;Invertible Neural Networks;Convolutional Networks", "primary_area": "", "supplementary_material": "", "author": "Leon Sixt;Martin Schuessler;Oana-Iuliana Popescu;Philipp Wei\u00df;Tim Landgraf", "authorids": "~Leon_Sixt1;~Martin_Schuessler1;~Oana-Iuliana_Popescu1;philipp@itp.tu-berlin.de;~Tim_Landgraf1", "gender": "M;M;F;;", "homepage": "https://userpage.fu-berlin.de/leonsixt/;http://www.mschuessler.de;;;", "dblp": ";00/10854.html;170/8417;;04/10008", "google_scholar": "XtejLN8AAAAJ;w-qt1ooAAAAJ;;;https://scholar.google.de/citations?user=ChX0opIAAAAJ", "orcid": ";;;;0000-0003-4951-5235", "linkedin": ";schuesslerm/;oana-iuliana-popescu/;;", "or_profile": "~Leon_Sixt1;~Martin_Schuessler1;~Oana-Iuliana_Popescu1;philipp@itp.tu-berlin.de;~Tim_Landgraf1", "aff": "Freie Universit\u00e4t Berlin;TU Berlin;German Aerospace Center, Institute of Data Science;;Freie Universit\u00e4t Berlin", "aff_domain": "fu-berlin.de;tu-berlin.de;dlr.de;;fu-berlin.de", "position": "PhD student;PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nsixt2022do,\ntitle={Do Users Benefit From Interpretable Vision? A User Study, Baseline, And Dataset},\nauthor={Leon Sixt and Martin Schuessler and Oana-Iuliana Popescu and Philipp Wei{\\ss} and Tim Landgraf},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=v6s3HVjPerv}\n}", "github": "", "project": "", "reviewers": "3sBP;pBBz;rVJ7;gzkV", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;5;3;3", "correctness": "3;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "106;56;102;189", "wc_summary_review": "56;37;35;73", "wc_main_review": "266;348;130;263", "wc_review": "428;441;267;525", "wc_reply_reviewers": "0;0;0;96", "wc_reply_authors": "1058;1487;549;668", "reply_reviewers": "0;0;0;1", "reply_authors": "3;3;2;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 4.0, 1.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 113.25, 47.94462952198087 ], "wc_summary_review_avg": [ 50.25, 15.481844205391036 ], "wc_main_review_avg": [ 251.75, 78.12929988167052 ], "wc_review_avg": [ 415.25, 93.33909952426154 ], "wc_reply_reviewers_avg": [ 24.0, 41.569219381653056 ], "wc_reply_authors_avg": [ 940.5, 367.42244079533305 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8164965809277259, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17643359548454161307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=v6s3HVjPerv", "email": "fu-berlin.de;tu-berlin.de;dlr.de;;fu-berlin.de", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Freie Universit\u00e4t Berlin;Technische Universit\u00e4t Berlin;German Aerospace Center", "aff_unique_dep": ";;Institute of Data Science", "aff_unique_url": "https://www.fu-berlin.de;https://www.tu-berlin.de;https://www.dlr.de", "aff_unique_abbr": "FU Berlin;TU Berlin;DLR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berlin", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "Germany" }, { "title": "Low-Budget Active Learning via Wasserstein Distance: An Integer Programming Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6451", "id": "v8OlxjGn23S", "poster": "", "openreview": "https://openreview.net/forum?id=v8OlxjGn23S", "slides": "https://iclr.cc/virtual/2022/poster/6451", "video": "https://iclr.cc/virtual/2022/poster/6451", "author_site": "Rafid Mahmood, Sanja Fidler, Marc T Law", "tldr": "", "abstract": "Active learning is the process of training a model with limited labeled data by selecting a core subset of an unlabeled data pool to label. The large scale of data sets used in deep learning forces most sample selection strategies to employ efficient heuristics. This paper introduces an integer optimization problem for selecting a core set that minimizes the discrete Wasserstein distance from the unlabeled pool. We demonstrate that this problem can be tractably solved with a Generalized Benders Decomposition algorithm. Our strategy uses high-quality latent features that can be obtained by unsupervised learning on the unlabeled pool. Numerical results on several data sets show that our optimization approach is competitive with baselines and particularly outperforms them in the low budget regime where less than one percent of the data set is labeled. ", "keywords": "active learning;integer optimization", "primary_area": "", "supplementary_material": "", "author": "Rafid Mahmood;Sanja Fidler;Marc T Law", "authorids": "~Rafid_Mahmood1;~Sanja_Fidler1;~Marc_T_Law1", "gender": ";F;M", "homepage": "http://rafidrm.github.io;http://www.cs.toronto.edu/~fidler/;http://www.cs.toronto.edu/~law/", "dblp": "164/5832;08/6607;117/7668", "google_scholar": "https://scholar.google.ca/citations?user=NoPweUQAAAAJ;CUlqK5EAAAAJ;https://scholar.google.fr/citations?user=_7QgnUcAAAAJ", "orcid": ";;", "linkedin": ";sanja-fidler-2846a1a?trk=hp-identity-name;", "or_profile": "~Rafid_Mahmood1;~Sanja_Fidler1;~Marc_T_Law1", "aff": "NVIDIA;Department of Computer Science, University of Toronto;NVIDIA", "aff_domain": "nvidia.com;cs.toronto.edu;nvidia.com", "position": "AI Resident;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nmahmood2022lowbudget,\ntitle={Low-Budget Active Learning via Wasserstein Distance: An Integer Programming Approach},\nauthor={Rafid Mahmood and Sanja Fidler and Marc T Law},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=v8OlxjGn23S}\n}", "github": "", "project": "", "reviewers": "HRFS;9PdP;vpVq;3bUG", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;3;3", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "59;83;67;73", "wc_summary_review": "36;36;36;9", "wc_main_review": "140;252;225;93", "wc_review": "235;371;328;175", "wc_reply_reviewers": "0;0;50;0", "wc_reply_authors": "634;738;373;27", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 70.5, 8.760707733967616 ], "wc_summary_review_avg": [ 29.25, 11.691342951089922 ], "wc_main_review_avg": [ 177.5, 63.93942445784135 ], "wc_review_avg": [ 277.25, 76.81918705635982 ], "wc_reply_reviewers_avg": [ 12.5, 21.650635094610966 ], 
"wc_reply_authors_avg": [ 443.0, 274.5277763724465 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5987498715951456845&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=v8OlxjGn23S", "email": "nvidia.com;cs.toronto.edu;nvidia.com", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "NVIDIA;University of Toronto", "aff_unique_dep": "NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;https://www.utoronto.ca", "aff_unique_abbr": "NVIDIA;U of T", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Canada" }, { "id": "v9iBLdSkFiP", "title": "TADA: Taxonomy Adaptive Domain Adaptation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Traditional domain adaptation addresses the task of adapting a model to a novel target domain under limited or no additional supervision. While tackling the input domain gap, the standard domain adaptation settings assume no domain change in the output space. In semantic prediction tasks, different datasets are often labeled according to different semantic taxonomies. In many real-world settings, the target domain task requires a different taxonomy than the one imposed by the source domain. We therefore introduce the more general taxonomy adaptive domain adaptation (TADA) problem, allowing for inconsistent taxonomies between the two domains. We further propose an approach that jointly addresses the image-level and label-level domain adaptation. On the label-level, we employ a bilateral mixed sampling strategy to augment the target domain, and a relabelling method to unify and align the label spaces. We address the image-level domain gap by proposing an uncertainty-rectified contrastive learning method, leading to more domain-invariant and class discriminative features. We extensively evaluate the effectiveness of our framework under different TADA settings: open taxonomy, coarse-to-fine taxonomy, and partially-overlapping taxonomy. 
Our framework outperforms the previous state of the art by a large margin, while being capable of adapting to target taxonomies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rui Gong;Martin Danelljan;Dengxin Dai;Wenguan Wang;Danda Pani Paudel;Ajad Chhatkuli;Fisher Yu;Luc Van Gool", "authorids": "~Rui_Gong2;~Martin_Danelljan4;~Dengxin_Dai1;~Wenguan_Wang4;~Danda_Pani_Paudel1;~Ajad_Chhatkuli1;~Fisher_Yu2;~Luc_Van_Gool1", "gender": "M;M;M;M;M;M;M;", "homepage": ";https://martin-danelljan.github.io/;https://scholar.google.com/citations?user=T51W57YAAAAJ&hl=en;https://sites.google.com/view/wenguanwang/;https://people.ee.ethz.ch/~paudeld/;https://ajadchhatkuli.github.io;https://www.yf.io/;", "dblp": ";151/8848;98/8616;145/1078;;149/7655;117/6314;61/5017", "google_scholar": "4St8MdYAAAAJ;NCSSpMkAAAAJ;T51W57YAAAAJ;CqAQQkgAAAAJ;https://scholar.google.ch/citations?user=W43pvPkAAAAJ;3BHMHU4AAAAJ;-XCiamcAAAAJ;https://scholar.google.be/citations?user=TwMib_QAAAAJ", "orcid": ";;;0000-0002-0802-9567;;0000-0003-2051-2209;;", "linkedin": ";;;wenguanwang;;;;", "or_profile": "~Rui_Gong2;~Martin_Danelljan4;~Dengxin_Dai1;~Wenguan_Wang4;~Danda_Pani_Paudel1;~Ajad_Chhatkuli1;~Fisher_Yu2;~Luc_Van_Gool1", "aff": "Swiss Federal Institute of Technology;ETH Zurich;;ETH Zurich;ETHZ - ETH Zurich;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;KU Leuven", "aff_domain": "ethz.ch;vision.ee.ethz.ch;;vision.ee.ethz.ch;ethz.ch;ethz.ch;ethz.ch;kuleuven.be", "position": "PhD student;Principal Researcher;;Postdoc;Lecturer;Postdoc;Assistant Professor;Emeritus", "bibtex": "@misc{\ngong2022tada,\ntitle={{TADA}: Taxonomy Adaptive Domain Adaptation},\nauthor={Rui Gong and Martin Danelljan and Dengxin Dai and Wenguan Wang and Danda Pani Paudel and Ajad Chhatkuli and Fisher Yu and Luc Van Gool},\nyear={2022},\nurl={https://openreview.net/forum?id=v9iBLdSkFiP}\n}", "github": "", "project": "", "reviewers": "FHb6;Cunx;Lgci;F5ZY", "site": "https://openreview.net/forum?id=v9iBLdSkFiP", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;1;3;2", "empirical_novelty": "0;1;2;3", "wc_summary_paper": "65;114;134;104", "wc_summary_review": "47;20;64;52", "wc_main_review": "378;338;206;236", "wc_review": "490;472;404;392", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 104.25, 25.103535607559348 ], "wc_summary_review_avg": [ 45.75, 16.099301227071937 ], "wc_main_review_avg": [ 289.5, 70.7442577174996 ], "wc_review_avg": [ 439.5, 42.19893363581596 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9622504486493761, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6762238912976414966&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;1;0;0;2", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich;Katholieke Universiteit Leuven", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.ethz.ch;https://www.ethz.ch;https://www.kuleuven.be", "aff_unique_abbr": "ETH Zurich;ETHZ;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "Switzerland;Belgium" }, { "title": "Implicit Bias of Projected Subgradient Method Gives Provable Robust Recovery of Subspaces of Unknown Codimension", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7038", "id": "vA7doMdgi75", "poster": "", "openreview": "https://openreview.net/forum?id=vA7doMdgi75", "slides": "https://iclr.cc/virtual/2022/poster/7038", "video": "https://iclr.cc/virtual/2022/poster/7038", "author_site": "Paris Giampouras, Benjamin Haeffele, Rene Vidal", "tldr": "", "abstract": "Robust subspace recovery (RSR) is the problem of learning a subspace from sample data points corrupted by outliers. Dual Principal Component Pursuit (DPCP) is a robust subspace recovery method that aims to find a basis for the orthogonal complement of the subspace by minimizing the sum of the distances of the points to the subspaces subject to orthogonality constraints on the basis. Prior work has shown that DPCP can provably recover the correct subspace in the presence of outliers as long as the true dimension of the subspace is known. In this paper, we show that if the orthogonality constraints --adopted in previous DPCP formulations-- are relaxed and random initialization is used instead of spectral one, DPCP can provably recover a subspace of \\emph{unknown dimension}. Specifically, we propose a very simple algorithm based on running multiple instances of a projected sub-gradient descent method (PSGM), with each problem instance seeking to find one vector in the null space of the subspace. We theoretically prove that under mild conditions this approach succeeds with high probability. In particular, we show that 1) all of the problem instances will converge to a vector in the nullspace of the subspace and 2) the ensemble of problem instance solutions will be sufficiently diverse to fully span the nullspace of the subspace thus also revealing its true unknown codimension. 
We provide empirical results that corroborate our theoretical results and showcase the remarkable implicit rank regularization behavior of the PSGM algorithm that allows us to perform RSR without knowing the subspace dimension.", "keywords": "representation learning;robust subspace recovery;dual principal component pursuit;outliers;model selection", "primary_area": "", "supplementary_material": "", "author": "Paris Giampouras;Benjamin David Haeffele;Rene Vidal", "authorids": "~Paris_Giampouras1;~Benjamin_David_Haeffele1;~Rene_Vidal1", "gender": "M;;", "homepage": "https://parisgiampouras.github.io;;http://www.vision.jhu.edu", "dblp": "134/0138;;v/ReneVidal", "google_scholar": "mZCc1TEAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;rene-vidal-74844928/", "or_profile": "~Paris_Giampouras1;~Benjamin_David_Haeffele1;~Rene_Vidal1", "aff": "Johns Hopkins University;;Johns Hopkins University", "aff_domain": "jhu.edu;;jhu.edu", "position": "Postdoc;;Professor", "bibtex": "@inproceedings{\ngiampouras2022implicit,\ntitle={Implicit Bias of Projected Subgradient Method Gives Provable Robust Recovery of Subspaces of Unknown Codimension},\nauthor={Paris Giampouras and Benjamin David Haeffele and Rene Vidal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vA7doMdgi75}\n}", "github": "", "project": "", "reviewers": "1qf1;Az9P;MxBa;euxB", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;2;3", "correctness": "4;4;4;3", "technical_novelty": "2;2;3;4", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "132;57;75;63", "wc_summary_review": "45;53;20;41", "wc_main_review": "420;109;125;245", "wc_review": "597;219;220;349", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "307;593;276;371", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 81.75, 29.72688177390962 ], "wc_summary_review_avg": [ 39.75, 12.193748398257199 ], "wc_main_review_avg": [ 224.75, 124.37920847151263 ], "wc_review_avg": [ 346.25, 154.12231343968335 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 386.75, 123.90797996900764 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8703882797784892, "corr_recommendation_correctness": -0.5555555555555555, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7039279168600954241&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=vA7doMdgi75", "email": "jhu.edu;;jhu.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "vBn2OXZuQCF", "title": "How does Contrastive Pre-training Connect Disparate Domains?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Pre-training on massive unlabeled datasets greatly improves accuracy under distribution shifts.
As a first step toward understanding this, we study a popular pre-training method, contrastive learning, in the unsupervised domain adaptation (UDA) setting where we only have labeled data from a source domain and unlabeled data from a target domain. We begin by showing on 4 benchmark datasets that out-of-the-box contrastive pre-training (even without large-scale unlabeled data) is competitive with other UDA methods. Intuitions from classical UDA methods such as domain adversarial training focus on bringing the domains together in feature space to improve generalization from source to target. Surprisingly, we find that contrastive pre-training learns features that are very far apart between the source and target domains. How then does contrastive learning improve robustness to distribution shift? We develop a conceptual model for contrastive learning under domain shifts, where data augmentations form connections between classes and domains that can be far apart. We propose a new measure of connectivity ---the relative connection strengths between same and different classes across domains---that governs the success of contrastive pre-training for domain adaptation in a simple example and strongly correlates with our results on benchmark datasets.", "keywords": "pre-training;contrastive learning;robustness;out-of-distribution;domain shift", "primary_area": "", "supplementary_material": "/attachment/1a032218475b77984ddce3b304a0f00c3c2ef6f9.zip", "author": "Kendrick Shen;Robbie Matthew Jones;Ananya Kumar;Sang Michael Xie;Percy Liang", "authorids": "~Kendrick_Shen1;~Robbie_Matthew_Jones1;~Ananya_Kumar1;~Sang_Michael_Xie1;~Percy_Liang1", "gender": "M;M;M;;", "homepage": "https://kendrickshen.com;;https://ananyakumar.wordpress.com/;https://cs.stanford.edu/~eix/;https://cs.stanford.edu/~pliang/", "dblp": ";;192/0474;220/3987;04/1701", "google_scholar": "https://scholar.google.com/citations?hl=en;dXzqCT4AAAAJ;tP5IBFkAAAAJ;EBNa5IEAAAAJ;pouyVyUAAAAJ", "orcid": ";;;;", "linkedin": ";robbie-jones-96;;;", "or_profile": "~Kendrick_Shen1;~Robbie_Matthew_Jones1;~Ananya_Kumar1;~Sang_Michael_Xie1;~Percy_Liang1", "aff": "Stanford University;;Microsoft;Stanford University;Stanford University", "aff_domain": "stanford.edu;;microsoft.com;stanford.edu;stanford.edu", "position": "Undergrad student;;Intern;PhD student;Associate Professor", "bibtex": "@misc{\nshen2022how,\ntitle={How does Contrastive Pre-training Connect Disparate Domains?},\nauthor={Kendrick Shen and Robbie Matthew Jones and Ananya Kumar and Sang Michael Xie and Percy Liang},\nyear={2022},\nurl={https://openreview.net/forum?id=vBn2OXZuQCF}\n}", "github": "", "project": "", "reviewers": "W3eA;pgHb;6UNN;tPmk", "site": "https://openreview.net/forum?id=vBn2OXZuQCF", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "54;71;60;207", "wc_summary_review": "34;89;20;200", "wc_main_review": "138;548;117;496", "wc_review": "226;708;197;903", "wc_reply_reviewers": "0;739;0;0", "wc_reply_authors": "382;1477;296;516", "reply_reviewers": "0;2;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 98.0, 63.22578587886433 ], "wc_summary_review_avg": [ 85.75, 70.82504853510515 ], "wc_main_review_avg": [ 324.75, 198.24400999778027 ], 
"wc_review_avg": [ 508.5, 305.06925443249764 ], "wc_reply_reviewers_avg": [ 184.75, 319.9963866983501 ], "wc_reply_authors_avg": [ 667.75, 473.7522427387548 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1084530953398042242&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Stanford University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.stanford.edu;https://www.microsoft.com", "aff_unique_abbr": "Stanford;Microsoft", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "vDa28vlSBCP", "title": "Interactively Generating Explanations for Transformer Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transformer language models are state-of-the-art in a multitude of NLP tasks. Despite these successes, their opaqueness remains problematic. Recent methods aiming to provide interpretability and explainability to black-box models primarily focus on post-hoc explanations of (sometimes spurious) input-output correlations. Instead, we emphasize using prototype networks directly incorporated into the model architecture and hence explain the reasoning process behind the network's decisions. Moreover, while our architecture performs on par with several language models, it enables one to learn from user interactions. This not only offers a better understanding of language models but uses human capabilities to incorporate knowledge outside of the rigid range of purely data-driven approaches.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9b653f8dabc78247b1b8b252f806917194871562.zip", "author": "Patrick Schramowski;Felix Friedrich;Christopher Tauchmann;Kristian Kersting", "authorids": "~Patrick_Schramowski1;~Felix_Friedrich1;tauchmann@cs.tu-darmstadt.de;~Kristian_Kersting1", "gender": "M;;;M", "homepage": "https://ml-research.github.io/people/pschramowski/index.html;https://ml-research.github.io/people/ffriedrich/;;http://www.ml.informatik.tu-darmstadt.de/", "dblp": "217/1650;18/4626;;40/3793", "google_scholar": "GD481RkAAAAJ;RfM9ud0AAAAJ;;QY-earAAAAAJ", "orcid": "0000-0003-1231-7120;0000-0001-8387-793X;;0000-0002-2873-9152", "linkedin": ";;;", "or_profile": "~Patrick_Schramowski1;~Felix_Friedrich1;tauchmann@cs.tu-darmstadt.de;~Kristian_Kersting1", "aff": "TU Darmstadt;TU Darmstadt;;TU Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de;;tu-darmstadt.de", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@misc{\nschramowski2022interactively,\ntitle={Interactively Generating Explanations for Transformer Language Models},\nauthor={Patrick Schramowski and Felix Friedrich and Christopher Tauchmann and Kristian Kersting},\nyear={2022},\nurl={https://openreview.net/forum?id=vDa28vlSBCP}\n}", "github": "", "project": "", "reviewers": "yfzh;hWyb;QkZJ;cN7n", "site": "https://openreview.net/forum?id=vDa28vlSBCP", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "3;4;3;4", "correctness": "3;3;3;2", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "98;189;72;86", "wc_summary_review": "40;41;47;73", 
"wc_main_review": "158;304;175;414", "wc_review": "296;534;294;573", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "588;861;538;772", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 111.25, 45.82234716816676 ], "wc_summary_review_avg": [ 50.25, 13.40475661845451 ], "wc_main_review_avg": [ 262.75, 103.98407329971259 ], "wc_review_avg": [ 424.25, 129.98533571137938 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 689.75, 131.7884194457161 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3488192494792015278&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Darmstadt", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Gradient Matching for Domain Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6373", "id": "vDwBW49HmO", "poster": "", "openreview": "https://openreview.net/forum?id=vDwBW49HmO", "slides": "https://iclr.cc/virtual/2022/poster/6373", "video": "https://iclr.cc/virtual/2022/poster/6373", "author_site": "Yuge Shi, Jeffrey Seely, Philip Torr, Siddharth N, Awni Hannun, Nicolas Usunier, Gabriel Synnaeve", "tldr": "", "abstract": "Machine learning systems typically assume that the distributions of training and test sets match closely. However, a critical requirement of such systems in the real world is their ability to generalize to unseen domains. Here, we propose an _inter-domain gradient matching_ objective that targets domain generalization by maximizing the inner product between gradients from different domains. Since direct optimization of the gradient inner product can be computationally prohibitive --- it requires computation of second-order derivatives \u2013-- we derive a simpler first-order algorithm named Fish that approximates its optimization. We perform experiments on the Wilds benchmark, which captures distribution shift in the real world, as well as the DomainBed benchmark that focuses more on synthetic-to-real transfer. 
Our method produces competitive results on both benchmarks, demonstrating its effectiveness across a wide range of domain generalization tasks.", "keywords": "Domain generalization;multi-source domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Yuge Shi;Jeffrey Seely;Philip Torr;Siddharth N;Awni Hannun;Nicolas Usunier;Gabriel Synnaeve", "authorids": "~Yuge_Shi1;~Jeffrey_Seely1;~Philip_Torr1;~Siddharth_N1;~Awni_Hannun1;~Nicolas_Usunier1;~Gabriel_Synnaeve1", "gender": "F;M;;M;M;M;M", "homepage": "https://yugeten.github.io/;http://jsseely.com/;http://www.robots.ox.ac.uk/~tvg/;https://homepages.inf.ed.ac.uk/snaraya3/;https://www.awnihannun.com/;;", "dblp": "227/4684;;;67/8366;https://dblp.uni-trier.de/pers/hd/h/Hannun:Awni;79/3983;http://dblp.uni-trier.de/pers/hd/s/Synnaeve:Gabriel", "google_scholar": "https://scholar.google.co.uk/citations?user=t6B_Z7MAAAAJ;https://scholar.google.com/citations?hl=en;;V7D7hxMAAAAJ;3-mdTUAAAAAJ;tYro5N8AAAAJ;wN9rBkcAAAAJ", "orcid": ";;;0000-0003-4911-7333;;;", "linkedin": ";jeffrey-s-44a5b426/;;;;;", "or_profile": "~Yuge_Shi1;~Jeffrey_Seely1;~Philip_Torr1;~Siddharth_N1;~Awni_Hannun1;~Nicolas_Usunier1;~Gabriel_Synnaeve1", "aff": "University of Oxford;Meta Facebook;University of Oxford;University of Edinburgh;Zoom;Meta Facebook;Meta Facebook", "aff_domain": "ox.ac.uk;fb.com;ox.ac.uk;ed.ac.uk;zoom.us;fb.com;fb.com", "position": "PhD student;Researcher;Full Professor;Reader (Associate Professor);Distinguished Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nshi2022gradient,\ntitle={Gradient Matching for Domain Generalization},\nauthor={Yuge Shi and Jeffrey Seely and Philip Torr and Siddharth N and Awni Hannun and Nicolas Usunier and Gabriel Synnaeve},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vDwBW49HmO}\n}", "github": "", "project": "", "reviewers": "eig7;t4Z6;Xwsw;KYNz;b5oJ", "pdf_size": 0, "recommendation": "6;6;6;6;8", "confidence": "4;5;3;3;4", "correctness": "3;4;3;2;4", "technical_novelty": "3;3;3;2;3", "empirical_novelty": "3;2;3;2;3", "wc_summary_paper": "38;38;21;59;34", "wc_summary_review": "51;17;28;36;21", "wc_main_review": "184;386;225;159;341", "wc_review": "273;441;274;254;396", "wc_reply_reviewers": "45;358;0;24;21", "wc_reply_authors": "383;492;411;503;556", "reply_reviewers": "1;1;0;1;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 6.4, 0.7999999999999999 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.7483314773547882 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 38.0, 12.214745187681975 ], "wc_summary_review_avg": [ 30.6, 12.076423311560422 ], "wc_main_review_avg": [ 259.0, 89.03257830704445 ], "wc_review_avg": [ 327.6, 75.90678494047815 ], "wc_reply_reviewers_avg": [ 89.6, 134.95569643405202 ], "wc_reply_authors_avg": [ 469.0, 63.26768527455387 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.13363062095621223, "corr_recommendation_correctness": 0.5345224838248488, "gs_citation": 353, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2851826454893571179&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=vDwBW49HmO", "email": "ox.ac.uk;fb.com;ox.ac.uk;ed.ac.uk;zoom.us;fb.com;fb.com", "author_num": 7, 
"aff_unique_index": "0;1;0;2;3;1;1", "aff_unique_norm": "University of Oxford;Meta;University of Edinburgh;Zoom Video Communications Inc.", "aff_unique_dep": ";Meta Platforms, Inc.;;", "aff_unique_url": "https://www.ox.ac.uk;https://meta.com;https://www.ed.ac.uk;https://zoom.us", "aff_unique_abbr": "Oxford;Meta;Edinburgh;Zoom", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "vEIVxSN8Xhx", "title": "Log-Polar Space Convolution", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional neural networks use regular quadrilateral convolution kernels to extract features. Since the number of parameters increases quadratically with the size of the convolution kernel, many popular models use small convolution kernels, resulting in small local receptive fields in lower layers. This paper proposes a novel log-polar space convolution (LPSC) method, where the convolution kernel is elliptical and adaptively divides its local receptive field into different regions according to the relative directions and logarithmic distances. The local receptive field grows exponentially with the number of distance levels. Therefore, the proposed LPSC not only naturally encodes local spatial structures, but also greatly increases the single-layer receptive field while maintaining the number of parameters. We show that LPSC can be implemented with conventional convolution via log-polar space pooling and can be applied in any network architecture to replace conventional convolutions. Experiments on different tasks and datasets demonstrate the effectiveness of the proposed LPSC.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/9b16b36e2a33b31dc967be4985414159e29d7624.zip", "author": "Bing Su;Ji-Rong Wen", "authorids": "~Bing_Su1;~Ji-Rong_Wen1", "gender": "M;M", "homepage": "https://gsai.ruc.edu.cn/bingsu;https://gsai.ruc.edu.cn/english/jrwen", "dblp": "41/5270-1;w/JRWen", "google_scholar": "https://scholar.google.com.sg/citations?user=d3g2VJQAAAAJ;tbxCHJgAAAAJ", "orcid": "0000-0001-8560-1910;0000-0002-9777-9676", "linkedin": ";", "or_profile": "~Bing_Su1;~Ji-Rong_Wen1", "aff": "Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "Associate Professor;Full Professor", "bibtex": "@misc{\nsu2022logpolar,\ntitle={Log-Polar Space Convolution},\nauthor={Bing Su and Ji-Rong Wen},\nyear={2022},\nurl={https://openreview.net/forum?id=vEIVxSN8Xhx}\n}", "github": "", "project": "", "reviewers": "MHsy;2ZGD;pwSX;TRTh", "site": "https://openreview.net/forum?id=vEIVxSN8Xhx", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "96;99;102;118", "wc_summary_review": "28;53;49;37", "wc_main_review": "435;449;443;428", "wc_review": "559;601;594;583", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 103.75, 8.496322733983215 ], "wc_summary_review_avg": [ 41.75, 9.883698700385398 ], "wc_main_review_avg": [ 438.75, 7.949056547792323 ], "wc_review_avg": [ 584.25, 15.927570436196476 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZnjjIdXvS90J:scholar.google.com/&scioq=Log-Polar+Space+Convolution&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Actor-critic is implicitly biased towards high entropy optimal policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6634", "id": "vEZyTBRPP6o", "poster": "", "openreview": "https://openreview.net/forum?id=vEZyTBRPP6o", "slides": "https://iclr.cc/virtual/2022/poster/6634", "video": "https://iclr.cc/virtual/2022/poster/6634", "author_site": "Yuzheng Hu, Ziwei Ji, Matus Telgarsky", "tldr": "", "abstract": "We show that the simplest actor-critic method \u2014 a linear softmax policy updated with TD through interaction with a linear MDP, but featuring no explicit regularization or exploration \u2014 does not merely find an optimal policy, but moreover prefers high entropy optimal policies. To demonstrate the strength of this bias, the algorithm not only has no regularization, no projections, and no exploration like $\\epsilon$-greedy, but is moreover trained on a single trajectory with no resets. The key consequence of the high entropy bias is that uniform mixing assumptions on the MDP, which exist in some form in all prior work, can be dropped: the implicit regularization of the high entropy bias is enough to ensure that all chains mix and an optimal policy is reached with high probability. 
As auxiliary contributions, this work decouples concerns between the actor and critic by writing the actor update as an explicit mirror descent, provides tools to uniformly bound mixing times within KL balls of policy space, and provides a projection-free TD analysis with its own implicit bias which can be run from an unmixed starting distribution.\n", "keywords": "implicit bias;reinforcement learning;actor-critic;policy gradient;mixing time;convergence rate;mirror ascent.", "primary_area": "", "supplementary_material": "", "author": "Yuzheng Hu;Ziwei Ji;Matus Telgarsky", "authorids": "~Yuzheng_Hu1;~Ziwei_Ji1;~Matus_Telgarsky1", "gender": "M;M;M", "homepage": "https://mirnegg.github.io;https://jiziwei.github.io/;https://cims.nyu.edu/~matus/", "dblp": "231/2255.html;176/4574.html=;05/9061", "google_scholar": "cVVimVcAAAAJ;3l_6H5sAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "yuzheng-hu-a74b5823b/;ziwei-ji-b1274899/;", "or_profile": "~Yuzheng_Hu1;~Ziwei_Ji1;~Matus_Telgarsky1", "aff": "University of Illinois, Urbana Champaign;University of Illinois Urbana Champaign;Department of Computer Science, University of Illinois, Urbana Champaign", "aff_domain": "uiuc.edu;illinois.edu;cs.illinois.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nhu2022actorcritic,\ntitle={Actor-critic is implicitly biased towards high entropy optimal policies},\nauthor={Yuzheng Hu and Ziwei Ji and Matus Telgarsky},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vEZyTBRPP6o}\n}", "github": "", "project": "", "reviewers": "UirM;gsmP;3pPx", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;2", "correctness": "3;4;4", "technical_novelty": "3;4;4", "empirical_novelty": "0;4;0", "wc_summary_paper": "91;73;181", "wc_summary_review": "49;29;44", "wc_main_review": "293;285;132", "wc_review": "433;387;357", "wc_reply_reviewers": "34;405;0", "wc_reply_authors": "690;2977;337", "reply_reviewers": "1;5;0", "reply_authors": "2;7;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.3333333333333333, 1.8856180831641267 ], "wc_summary_paper_avg": [ 115.0, 47.24404724407086 ], "wc_summary_review_avg": [ 40.666666666666664, 8.498365855987974 ], "wc_main_review_avg": [ 236.66666666666666, 74.08253655364551 ], "wc_review_avg": [ 392.3333333333333, 31.25522178594945 ], "wc_reply_reviewers_avg": [ 146.33333333333334, 183.43088313827877 ], "wc_reply_authors_avg": [ 1334.6666666666667, 1170.2126112615415 ], "reply_reviewers_avg": [ 2.0, 2.160246899469287 ], "reply_authors_avg": [ 3.3333333333333335, 2.6246692913372702 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10513168637504252564&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=vEZyTBRPP6o", "email": "uiuc.edu;illinois.edu;cs.illinois.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": 
"Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "vF0Qil7nPEd", "title": "Sequence-to-sequence modeling for action identification at high temporal resolution", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Automatic action identification from video and kinematic data is an important machine learning problem with applications ranging from robotics to smart health. Most existing works focus on identifying coarse actions such as running, climbing, or cutting a vegetable, which have relatively long durations. This is an important limitation for applications that require identification of subtle motions at high temporal resolution. For example, in stroke recovery, quantifying rehabilitation dose requires differentiating motions with sub-second durations. Our goal is to bridge this gap. To this end, we introduce a large-scale, multimodal dataset, $StrokeRehab$, as a new action-recognition benchmark that includes subtle short-duration actions labeled at a high temporal resolution. These short-duration actions are called motion primitives, and consist of reaches, transports, repositions, stabilizations, and idles. The dataset consists of high-quality Inertial Measurement Unit sensors and video data of 41 stroke-impaired patients performing activities of daily living like feeding, brushing teeth, etc. We show that current state-of-the-art models based on segmentation produce noisy predictions when applied to these data, which often leads to overcounting of actions. To address this, we propose a novel approach for high-resolution action identification, inspired by speech-recognition techniques, which is based on a sequence-to-sequence model that directly predicts the sequence of actions. 
This approach outperforms current state-of-the-art methods on the $StrokeRehab$ dataset, as well as on the standard benchmark datasets: 50Salads, Breakfast, and Jigsaws.", "keywords": "Deep learning;Action recognition;Benchmark dataset;Fine-grained action recognition;Stroke rehabilitation;Seq2seq models;sequence prediction", "primary_area": "", "supplementary_material": "", "author": "Kangning Liu;Avinash Parnandi;Haresh Rengaraj Rajamohan;Aakash Kaku;Anita Venkatesan;Audre Wirtanen;Natasha Pandit;Kannan Venkataramanan;Heidi Schambra;Carlos Fernandez-Granda", "authorids": "~Kangning_Liu1;~Avinash_Parnandi1;~Haresh_Rengaraj_Rajamohan1;~Aakash_Kaku1;anitavenkatesan1190@gmail.com;awirtanen@bennington.edu;ngp238@nyu.edu;kv942@nyu.edu;heidi.schambra@nyulangone.org;~Carlos_Fernandez-Granda1", "gender": "M;;M;;;;;;;", "homepage": "https://kangning-liu.github.io/;;;https://aakashrkaku.github.io/;;;;;;https://cims.nyu.edu/~cfgranda/", "dblp": "259/1458;https://dblp.uni-trier.de/pers/hd/p/Parnandi:Avinash;;254/2931;;;;;;77/11141", "google_scholar": "F3F2qAkAAAAJ;DBLgVFQAAAAJ;;lgObq7UAAAAJ;;;;;;GX-PtukAAAAJ", "orcid": ";;;0000-0002-2631-0897;;;;;;", "linkedin": ";;hareshrajamohan/;;;;;;;", "or_profile": "~Kangning_Liu1;~Avinash_Parnandi1;~Haresh_Rengaraj_Rajamohan1;~Aakash_Kaku1;anitavenkatesan1190@gmail.com;awirtanen@bennington.edu;ngp238@nyu.edu;kv942@nyu.edu;heidi.schambra@nyulangone.org;~Carlos_Fernandez-Granda1", "aff": "Google;NYU Langone;New York University;New York University;;;;;;New York University", "aff_domain": "google.com;nyumc.org;nyu.edu;nyu.edu;;;;;;nyu.edu", "position": "Intern;Postdoc;PhD student;PhD student;;;;;;Associate Professor", "bibtex": "@misc{\nliu2022sequencetosequence,\ntitle={Sequence-to-sequence modeling for action identification at high temporal resolution},\nauthor={Kangning Liu and Avinash Parnandi and Haresh Rengaraj Rajamohan and Aakash Kaku and Anita Venkatesan and Audre Wirtanen and Natasha Pandit and Kannan Venkataramanan and Heidi Schambra and Carlos Fernandez-Granda},\nyear={2022},\nurl={https://openreview.net/forum?id=vF0Qil7nPEd}\n}", "github": "", "project": "", "reviewers": "LxF9;6Jpo;jch7;LtmM", "site": "https://openreview.net/forum?id=vF0Qil7nPEd", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;2;4;4", "correctness": "3;3;2;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "55;64;136;85", "wc_summary_review": "30;14;115;72", "wc_main_review": "336;88;1071;310", "wc_review": "421;166;1322;467", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.0, 31.392674304684526 ], "wc_summary_review_avg": [ 57.75, 39.25796097608738 ], "wc_main_review_avg": [ 451.25, 370.5653619808522 ], "wc_review_avg": [ 594.0, 435.66787809063914 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": -0.4736842105263159, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=224372762923856193&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, 
"aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Google;NYU Langone Health;New York University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://nyulangone.org;https://www.nyu.edu", "aff_unique_abbr": "Google;NYU Langone;NYU", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Dive Deeper Into Integral Pose Regression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5955", "id": "vHVcB-ak3Si", "poster": "", "openreview": "https://openreview.net/forum?id=vHVcB-ak3Si", "slides": "https://iclr.cc/virtual/2022/poster/5955", "video": "https://iclr.cc/virtual/2022/poster/5955", "author_site": "Kerui Gu, Linlin Yang, Angela Yao", "tldr": "", "abstract": "Integral pose regression combines an implicit heatmap with end-to-end training for human body and hand pose estimation. Unlike detection-based heatmap methods, which decode final joint positions from the heatmap with a non-differentiable argmax operation, integral regression methods apply a differentiable expectation operation. This paper offers a deep dive into the inference and back-propagation of integral pose regression to better understand the differences in performance and training compared to detection-based methods. For inference, we give theoretical support as to why expectation should always be better than the argmax operation, i.e. integral regression should always outperform detection. Yet, in practice, this is observed only in hard cases because the heatmap activation for regression shrinks in easy cases. We then experimentally show that activation shrinkage is one of the leading causes for integral regression's inferior performance. For back-propagation, we theoretically and empirically analyze the gradients to explain the slow training speed of integral regression. 
Based on these findings, we incorporate the supervision of a spatial prior to speed up training and improve performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kerui Gu;Linlin Yang;Angela Yao", "authorids": "~Kerui_Gu1;~Linlin_Yang1;~Angela_Yao1", "gender": "M;M;", "homepage": "https://www.comp.nus.edu.sg/~keruigu/;https://www.mu4yang.com;http://www.angelayao.com", "dblp": "315/5511;;64/8484", "google_scholar": "if-RXSEAAAAJ;https://scholar.google.com.hk/citations?user=gI55gF0AAAAJ;https://scholar.google.ch/citations?user=-LJCZMMAAAAJ", "orcid": ";0000-0001-6752-0252;", "linkedin": ";;", "or_profile": "~Kerui_Gu1;~Linlin_Yang1;~Angela_Yao1", "aff": "National University of Singapore;University of Bonn;National University of Singapore", "aff_domain": "nus.edu.sg;uni-bonn.de;nus.edu.sg", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\ngu2022dive,\ntitle={Dive Deeper Into Integral Pose Regression},\nauthor={Kerui Gu and Linlin Yang and Angela Yao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vHVcB-ak3Si}\n}", "github": "", "project": "", "reviewers": "hXfD;mXxk;oYhA", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "5;4;4", "correctness": "2;4;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;3;0", "wc_summary_paper": "81;97;47", "wc_summary_review": "20;52;119", "wc_main_review": "407;290;92", "wc_review": "508;439;258", "wc_reply_reviewers": "20;31;4", "wc_reply_authors": "1137;370;62", "reply_reviewers": "1;1;1", "reply_authors": "2;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 75.0, 20.848661028149188 ], "wc_summary_review_avg": [ 63.666666666666664, 41.24991582482994 ], "wc_main_review_avg": [ 263.0, 130.0076920801227 ], "wc_review_avg": [ 401.6666666666667, 105.42084971937741 ], "wc_reply_reviewers_avg": [ 18.333333333333332, 11.08552609887726 ], "wc_reply_authors_avg": [ 523.0, 452.0051622124095 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11125265233325106860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=vHVcB-ak3Si", "email": "nus.edu.sg;uni-bonn.de;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;University of Bonn", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.uni-bonn.de/", "aff_unique_abbr": "NUS;UBonn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;Germany" }, { "title": "Overcoming The Spectral Bias of Neural Value Approximation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6564", "id": "vIC-xLFuM6", "poster": "", "openreview": "https://openreview.net/forum?id=vIC-xLFuM6", "slides": "https://iclr.cc/virtual/2022/poster/6564", "video": "https://iclr.cc/virtual/2022/poster/6564", 
"author_site": "Ge Yang, Anurag Ajay, Pulkit Agrawal", "tldr": "", "abstract": "Value approximation using deep neural networks is at the heart of off-policy deep reinforcement learning, and is often the primary module that provides learning signals to the rest of the algorithm. While multi-layer perceptron networks are universal function approximators, recent works in neural kernel regression suggest the presence of a \\textit{spectral bias}, where fitting high-frequency components of the value function requires exponentially more gradient update steps than the low-frequency ones. In this work, we re-examine off-policy reinforcement learning through the lens of kernel regression and propose to overcome such bias via a composite neural tangent kernel. With just a single line-change, our approach, the Fourier feature networks (FFN) produce state-of-the-art performance on challenging continuous control domains with only a fraction of the compute. Faster convergence and better off-policy stability also make it possible to remove the target network without suffering catastrophic divergences, which further reduces TD(0)'s estimation bias on a few tasks. Code and analysis available at https://geyang.github.io/ffn.", "keywords": "spectral bias;neural value approximation;Q learning;reinforcement learning;neural tangent kernels;kernel regression", "primary_area": "", "supplementary_material": "/attachment/906aecf6951719108ef78b5579857b00d2aef206.zip", "author": "Ge Yang;Anurag Ajay;Pulkit Agrawal", "authorids": "~Ge_Yang1;~Anurag_Ajay1;~Pulkit_Agrawal1", "gender": "M;M;M", "homepage": "http://www.episodeyang.com;https://anuragajay.github.io/;https://people.eecs.berkeley.edu/~pulkitag/", "dblp": "48/4561-3;180/5483;149/2672", "google_scholar": "vaQcF6kAAAAJ;;UpZmJI0AAAAJ", "orcid": "0000-0001-7520-7055;;", "linkedin": ";;", "or_profile": "~Ge_Yang1;~Anurag_Ajay1;~Pulkit_Agrawal1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "Postdoc;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nyang2022overcoming,\ntitle={Overcoming The Spectral Bias of Neural Value Approximation},\nauthor={Ge Yang and Anurag Ajay and Pulkit Agrawal},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vIC-xLFuM6}\n}", "github": "", "project": "", "reviewers": "cq8o;fVzS;TezL", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;4;4", "correctness": "3;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "2;4;2", "wc_summary_paper": "57;124;126", "wc_summary_review": "33;153;84", "wc_main_review": "424;415;297", "wc_review": "514;692;507", "wc_reply_reviewers": "0;606;52", "wc_reply_authors": "1741;3108;908", "reply_reviewers": "0;3;1", "reply_authors": "4;6;3", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 102.33333333333333, 32.065904356843305 ], "wc_summary_review_avg": [ 90.0, 49.17316341257699 ], "wc_main_review_avg": [ 378.6666666666667, 57.86382481501048 ], "wc_review_avg": [ 571.0, 85.6076320585184 ], "wc_reply_reviewers_avg": [ 219.33333333333334, 274.23752883626673 ], "wc_reply_authors_avg": [ 1919.0, 906.9226354362684 ], 
"reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 4.333333333333333, 1.247219128924647 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10758981528113303592&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=vIC-xLFuM6", "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Generalization of Neural Combinatorial Solvers Through the Lens of Adversarial Robustness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7081", "id": "vJZ7dPIjip3", "poster": "", "openreview": "https://openreview.net/forum?id=vJZ7dPIjip3", "slides": "https://iclr.cc/virtual/2022/poster/7081", "video": "https://iclr.cc/virtual/2022/poster/7081", "author_site": "Simon Geisler, Johanna Sommer, Jan Schuchardt, Aleksandar Bojchevski, Stephan G\u00fcnnemann", "tldr": "", "abstract": "End-to-end (geometric) deep learning has seen first successes in approximating the solution of combinatorial optimization problems. However, generating data in the realm of NP-hard/-complete tasks brings practical and theoretical challenges, resulting in evaluation protocols that are too optimistic. Specifically, most datasets only capture a simpler subproblem and likely suffer from spurious features. We investigate these effects by studying adversarial robustness -a local generalization property- to reveal hard, model-specific instances and spurious features. For this purpose, we derive perturbation models for SAT and TSP. Unlike in other applications, where perturbation models are designed around subjective notions of imperceptibility, our perturbation models are efficient and sound, allowing us to determine the true label of perturbed samples without a solver. Surprisingly, with such perturbations, a sufficiently expressive neural solver does not suffer from the limitations of the accuracy-robustness trade-off common in supervised learning. Although such robust solvers exist, we show empirically that the assessed neural solvers do not generalize well w.r.t. 
small perturbations of the problem instance.", "keywords": "Generalization;Neural Combinatorial Optimization;Adversarial Robustness", "primary_area": "", "supplementary_material": "", "author": "Simon Geisler;Johanna Sommer;Jan Schuchardt;Aleksandar Bojchevski;Stephan G\u00fcnnemann", "authorids": "~Simon_Geisler1;~Johanna_Sommer1;~Jan_Schuchardt1;~Aleksandar_Bojchevski1;~Stephan_G\u00fcnnemann1", "gender": "F;;M;M;M", "homepage": "https://johanna-sommer.com;https://www.cs.cit.tum.de/daml/team/jan-schuchardt/;https://abojchevski.github.io/;http://www.daml.in.tum.de;https://www.in.tum.de/en/daml/team/simon-geisler/", "dblp": "https://dblp.uni-trier.de/pid/243/2320;241/5487;203/8114;43/3011;237/0253", "google_scholar": "R3p8FGsAAAAJ;O-cixlwAAAAJ;https://scholar.google.de/citations?user=F1APiN4AAAAJ;;00x9jJwAAAAJ", "orcid": ";;;;0000-0003-0867-1856", "linkedin": ";;;;simon-geisler-ai/", "or_profile": "~Johanna_Sommer1;~Jan_Schuchardt1;~Aleksandar_Bojchevski1;~Stephan_G\u00fcnnemann1;~Simon_Markus_Geisler1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Department of Informatics, Technical University Munich;CISPA Helmholtz Center for Information Security;Technical University Munich;Google", "aff_domain": "tum.de;in.tum.de;cispa.de;tum.de;google.com", "position": "PhD student;PhD student;Principal Researcher;Professor;Intern", "bibtex": "@inproceedings{\ngeisler2022generalization,\ntitle={Generalization of Neural Combinatorial Solvers Through the Lens of Adversarial Robustness},\nauthor={Simon Geisler and Johanna Sommer and Jan Schuchardt and Aleksandar Bojchevski and Stephan G{\\\"u}nnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vJZ7dPIjip3}\n}", "github": "", "project": "", "reviewers": "4nmR;7Apu;Eun2", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "3;4;2", "correctness": "3;4;4", "technical_novelty": "2;2;3", "empirical_novelty": "3;2;4", "wc_summary_paper": "115;118;38", "wc_summary_review": "106;87;21", "wc_main_review": "456;308;115", "wc_review": "677;513;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "533;61;40", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 90.33333333333333, 37.02551672683163 ], "wc_summary_review_avg": [ 71.33333333333333, 36.42648609032841 ], "wc_main_review_avg": [ 293.0, 139.61614042318556 ], "wc_review_avg": [ 454.6666666666667, 209.45060409451094 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 211.33333333333334, 227.61419600328583 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2845452549848204755&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=vJZ7dPIjip3", "email": "tum.de;in.tum.de;cispa.de;tum.de;google.com", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Technical University Munich;CISPA Helmholtz Center for Information Security;Technical University of 
Munich;Google", "aff_unique_dep": ";Department of Informatics;;;Google", "aff_unique_url": "https://www.tum.de;https://www.tum.de;https://www.cispa.de/;https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;TUM;CISPA;TUM;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Munich;Mountain View", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "Germany;United States" }, { "title": "Noisy Feature Mixup", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6227", "id": "vJb4I2ANmy", "poster": "", "openreview": "https://openreview.net/forum?id=vJb4I2ANmy", "slides": "https://iclr.cc/virtual/2022/poster/6227", "video": "https://iclr.cc/virtual/2022/poster/6227", "author_site": "Soon Hoe Lim, N. Benjamin Erichson, Francisco Utrera, Winnie Xu, Michael W Mahoney", "tldr": "", "abstract": "We introduce Noisy Feature Mixup (NFM), an inexpensive yet effective method for data augmentation that combines the best of interpolation based training and noise injection schemes. Rather than training with convex combinations of pairs of examples and their labels, we use noise-perturbed convex combinations of pairs of data points in both input and feature space. This method includes mixup and manifold mixup as special cases, but it has additional advantages, including better smoothing of decision boundaries and enabling improved model robustness. We provide theory to understand this as well as the implicit regularization effects of NFM. Our theory is supported by empirical results, demonstrating the advantage of NFM, as compared to mixup and manifold mixup. We show that residual networks and vision transformers trained with NFM have favorable trade-offs between predictive accuracy on clean data and robustness with respect to various types of data perturbation across a range of computer vision benchmark datasets.", "keywords": "Data augmentation;implicit regularization;mixup;noise injection;model robustness", "primary_area": "", "supplementary_material": "/attachment/bafce789d72783899e364a229c350267ecb0072f.zip", "author": "Soon Hoe Lim;N. Benjamin Erichson;Francisco Utrera;Winnie Xu;Michael W. Mahoney", "authorids": "~Soon_Hoe_Lim1;~N._Benjamin_Erichson1;~Francisco_Utrera1;~Winnie_Xu1;~Michael_W._Mahoney1", "gender": "M;M;;F;", "homepage": "https://shoelim.github.io/;https://www.benerichson.com/;;https://winniexu.ca;", "dblp": "268/0660;173/5153;;285/6560;", "google_scholar": "ufTqvyoAAAAJ;https://scholar.google.co.uk/citations?user=8ViYcioAAAAJ;;k4l-zNYAAAAJ;", "orcid": ";;;;", "linkedin": ";;francisco-utrera-3771382a/;https://linkedin.com/in/winnie-xu;", "or_profile": "~Soon_Hoe_Lim1;~N._Benjamin_Erichson1;~Francisco_Utrera1;~Winnie_Xu1;~Michael_W._Mahoney1", "aff": "Nordic Institute for Theoretical Physics;University of Pittsburgh;University of Pittsburgh;University of Toronto;", "aff_domain": "su.se;pitt.edu;pitt.edu;utoronto.ca;", "position": "Postdoc;Assistant Professor;PhD student;Undergrad student;", "bibtex": "@inproceedings{\nlim2022noisy,\ntitle={Noisy Feature Mixup},\nauthor={Soon Hoe Lim and N. Benjamin Erichson and Francisco Utrera and Winnie Xu and Michael W. 
Mahoney},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vJb4I2ANmy}\n}", "github": "", "project": "", "reviewers": "o7KT;46ws;gzuL;WVNT", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;3", "correctness": "4;3;4;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "42;161;107;71", "wc_summary_review": "24;30;92;74", "wc_main_review": "275;158;276;433", "wc_review": "341;349;475;578", "wc_reply_reviewers": "0;0;14;130", "wc_reply_authors": "544;243;325;577", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 95.25, 44.39805738993543 ], "wc_summary_review_avg": [ 55.0, 28.792360097775937 ], "wc_main_review_avg": [ 285.5, 97.74072846055527 ], "wc_review_avg": [ 435.75, 97.8247795806359 ], "wc_reply_reviewers_avg": [ 36.0, 54.57105459856901 ], "wc_reply_authors_avg": [ 422.25, 141.73809473814723 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6823398693894797523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=vJb4I2ANmy", "email": "su.se;pitt.edu;pitt.edu;utoronto.ca;", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Nordic Institute for Theoretical Physics;University of Pittsburgh;University of Toronto", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nordita.org;https://www.pitt.edu;https://www.utoronto.ca", "aff_unique_abbr": "Nordita;Pitt;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "Sweden;United States;Canada" }, { "id": "vKMVrqvXbXu", "title": "Effects of Data Geometry in Early Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks can approximate functions on different types of data, from images to graphs, with varied underlying structure. This underlying structure can be viewed as the geometry of the data manifold. By extending recent advances in the theoretical understanding of neural networks, we study how a randomly initialized neural network with piecewise linear activation splits the data manifold into regions where the neural network behaves as a linear function. We derive bounds on the number of linear regions and the distance to boundaries of these linear regions on the data manifold. This leads to insights into the expressivity of randomly initialized deep neural networks on non-Euclidean data sets. We empirically corroborate our theoretical results using a toy supervised learning problem. Our experiments demonstrate that the number of linear regions varies across manifolds and how our results hold upon changing neural network architectures.
We further demonstrate how the complexity of linear regions changes on the low dimensional manifold of images as training progresses, using the MetFaces dataset.", "keywords": "Deep learning;geometry;manifolds;deep learning theory", "primary_area": "", "supplementary_material": "/attachment/63c2c54d04a6b81a74fac7ee95a89a8c5bc83a3f.zip", "author": "Saket Tiwari;George Konidaris", "authorids": "~Saket_Tiwari2;~George_Konidaris1", "gender": "M;M", "homepage": ";http://cs.brown.edu/people/gdk/", "dblp": "232/1978;56/6762", "google_scholar": ";9UERvVEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Saket_Tiwari2;~George_Konidaris1", "aff": "Brown University;Brown University", "aff_domain": "brown.edu;brown.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\ntiwari2022effects,\ntitle={Effects of Data Geometry in Early Deep Learning},\nauthor={Saket Tiwari and George Konidaris},\nyear={2022},\nurl={https://openreview.net/forum?id=vKMVrqvXbXu}\n}", "github": "", "project": "", "reviewers": "WSLJ;1oFk;5YsV;gvJZ", "site": "https://openreview.net/forum?id=vKMVrqvXbXu", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;2;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "2;2;0;0", "wc_summary_paper": "107;190;85;97", "wc_summary_review": "33;65;31;67", "wc_main_review": "408;917;169;115", "wc_review": "548;1172;285;279", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "594;787;122;0", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;0", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 119.75, 41.299969733645085 ], "wc_summary_review_avg": [ 49.0, 17.029386365926403 ], "wc_main_review_avg": [ 402.25, 316.98531117387756 ], "wc_review_avg": [ 571.0, 363.5897413294275 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 375.75, 324.9372054720727 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.7071067811865476 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.8164965809277259, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10220971549971161328&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 12, "aff_unique_index": "0;0", "aff_unique_norm": "Brown University", "aff_unique_dep": "", "aff_unique_url": "https://www.brown.edu", "aff_unique_abbr": "Brown", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "vKefw-zKOft", "title": "Towards Efficient On-Chip Training of Quantum Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Quantum Neural Network (QNN) is drawing increasing research interest thanks to its potential to achieve quantum advantage on near-term Noisy Intermediate Scale Quantum (NISQ) hardware. In order to achieve scalable QNN learning, the training process needs to be offloaded to real quantum machines instead of using exponential-cost classical simulators. One common approach to obtain QNN gradients is parameter shift, whose cost scales linearly with the number of qubits. This work presents the first experimental demonstration of practical on-chip QNN training with parameter shift.
Nevertheless, we find that due to the significant quantum errors (noise) on real machines, gradients obtained from naive parameter shift have low fidelity and thus degrade the training accuracy. To this end, we further propose probabilistic gradient pruning to first identify gradients with potentially large errors and then remove them. Specifically, small gradients have larger relative errors than large ones and thus have a higher probability of being pruned. We perform extensive experiments on 5 classification tasks with 5 real quantum machines. The results demonstrate that our on-chip training achieves over 90% and 60% accuracy for 2-class and 4-class image classification tasks. The probabilistic gradient pruning brings up to 7% QNN accuracy improvements over no pruning. Overall, we successfully obtain accuracy comparable to noise-free simulation but with much better training scalability. We also open-source our PyTorch library for on-chip QNN training with parameter shift and easy deployment at this link: https://anonymous.4open.science/r/iclr-on-chip-qnn-572E .\n", "keywords": "Quantum Computing;Machine Learning;Neural Networks;Robustness;Quantum Machine Learning;Quantum Neural Networks;On-Chip;Training", "primary_area": "", "supplementary_material": "", "author": "Hanrui Wang;Zirui Li;Jiaqi Gu;Yongshan Ding;David Z. Pan;Song Han", "authorids": "~Hanrui_Wang1;suffix_array@sjtu.edu.cn;~Jiaqi_Gu3;~Yongshan_Ding1;~David_Z._Pan1;~Song_Han5", "gender": "M;;M;;M;", "homepage": "https://hanruiwang.me;;https://scopex-asu.github.io;https://www.yongshanding.com;http://users.ece.utexas.edu/~dpan/;", "dblp": "214/9819-2;;;;p/DavidZhigangPan.html;", "google_scholar": "https://scholar.google.com/citations?hl=en;;FeIV12MAAAAJ;;3aLlroEAAAAJ;", "orcid": "0000-0002-7229-4015;;;;0000-0002-5705-2501;", "linkedin": "hanrui-wang-34458217a/;;;;davidzpan/;", "or_profile": "~Hanrui_Wang1;suffix_array@sjtu.edu.cn;~Jiaqi_Gu3;~Yongshan_Ding1;~David_Z._Pan1;~Song_Han5", "aff": "Massachusetts Institute of Technology;;University of Texas, Austin;Yale University;University of Texas, Austin;", "aff_domain": "mit.edu;;utexas.edu;yale.edu;utexas.edu;", "position": "PhD student;;PhD student;Assistant Professor;Professor;", "bibtex": "@misc{\nwang2022towards,\ntitle={Towards Efficient On-Chip Training of Quantum Neural Networks},\nauthor={Hanrui Wang and Zirui Li and Jiaqi Gu and Yongshan Ding and David Z. 
Pan and Song Han},\nyear={2022},\nurl={https://openreview.net/forum?id=vKefw-zKOft}\n}", "github": "", "project": "", "reviewers": "eFXE;iQid;d99Y;RwUJ;wCDF", "site": "https://openreview.net/forum?id=vKefw-zKOft", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "5;5;4;2;5", "correctness": "2;4;3;3;2", "technical_novelty": "2;2;3;3;1", "empirical_novelty": "2;2;2;2;3", "wc_summary_paper": "99;65;32;58;90", "wc_summary_review": "27;48;31;51;14", "wc_main_review": "298;361;206;516;47", "wc_review": "424;474;269;625;151", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "167;139;22;52;142", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 4.2, 1.16619037896906 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 68.8, 23.861265683110776 ], "wc_summary_review_avg": [ 34.2, 13.731715115017497 ], "wc_main_review_avg": [ 285.6, 156.32606948298803 ], "wc_review_avg": [ 388.6, 164.43187039014063 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 104.4, 56.68368371939142 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.560112033611204, "corr_recommendation_correctness": -0.21821789023599233, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9026863870580039658&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Massachusetts Institute of Technology;University of Texas at Austin;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.utexas.edu;https://www.yale.edu", "aff_unique_abbr": "MIT;UT Austin;Yale", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "vLz0e9S-iF3", "title": "Quasi-potential theory for escape problem: Quantitative sharpness effect on SGD's escape from local minima", "track": "main", "status": "Reject", "tldr": "", "abstract": "We develop a quantitative theory on the escape problem of stochastic gradient descent (SGD) and investigate the effect of the sharpness of loss surfaces on escape. Deep learning has achieved tremendous success in various domains; however, it has opened up theoretical problems. For instance, it is still an ongoing question as to why SGD can find solutions that generalize well over non-convex loss surfaces. An approach to explain this phenomenon is the escape problem, which investigates how efficiently SGD escapes from local minima. In this paper, we develop a novel theoretical framework for the escape problem using \"quasi-potential,\" a notion defined in a fundamental theory of stochastic dynamical systems. We show that quasi-potential theory can handle the geometric property of loss surfaces and a covariance structure of gradient noise in a unified manner through an eigenvalue argument, while previous research studied them separately. Our theoretical results imply that sharpness contributes to slowing down escape, but the SGD\u2019s noise structure cancels the effect, which ends up exponentially accelerating its escape. 
We also conduct experiments to empirically validate our theory using neural networks trained with real data.", "keywords": "deep learning;learning dynamics;SGD;flat minima", "primary_area": "", "supplementary_material": "/attachment/0057eb25f7c12c66a838f1a6d81a6117d3a056cb.zip", "author": "Hikaru Ibayashi;Masaaki Imaizumi", "authorids": "~Hikaru_Ibayashi1;~Masaaki_Imaizumi1", "gender": "M;M", "homepage": "http://www-scf.usc.edu/~ibayashi/;https://sites.google.com/view/mimaizumi/home", "dblp": ";", "google_scholar": "wrzvf7oAAAAJ;https://scholar.google.co.jp/citations?user=6c0Ljd4AAAAJ", "orcid": ";", "linkedin": "hikaru-ibayashi/;masaaki-imaizumi-38600b157/", "or_profile": "~Hikaru_Ibayashi1;~Masaaki_Imaizumi1", "aff": "University of Southern California;The University of Tokyo", "aff_domain": "usc.edu;u-tokyo.ac.jp", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nibayashi2022quasipotential,\ntitle={Quasi-potential theory for escape problem: Quantitative sharpness effect on {SGD}'s escape from local minima},\nauthor={Hikaru Ibayashi and Masaaki Imaizumi},\nyear={2022},\nurl={https://openreview.net/forum?id=vLz0e9S-iF3}\n}", "github": "", "project": "", "reviewers": "NNov;5Jw7;mKRS", "site": "https://openreview.net/forum?id=vLz0e9S-iF3", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "5;4;3", "correctness": "1;3;4", "technical_novelty": "1;3;3", "empirical_novelty": "0;2;0", "wc_summary_paper": "41;35;97", "wc_summary_review": "12;42;29", "wc_main_review": "190;591;157", "wc_review": "243;668;283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "374;640;450", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.6666666666666665, 1.247219128924647 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.9428090415820634 ], "wc_summary_paper_avg": [ 57.666666666666664, 27.920522121829233 ], "wc_summary_review_avg": [ 27.666666666666668, 12.283683848458853 ], "wc_main_review_avg": [ 312.6666666666667, 197.2719499123538 ], "wc_review_avg": [ 398.0, 191.61593531506367 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 488.0, 111.86897097348606 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8660254037844387, "corr_recommendation_correctness": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Rql5V1nu3k0J:scholar.google.com/&scioq=Quasi-potential+theory+for+escape+problem:+Quantitative+sharpness+effect+on+SGD%27s+escape+from+local+minima&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Southern California;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.usc.edu;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "USC;UTokyo", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Japan" }, { "id": "vMWl7Ta1ymW", "title": "Regularizing Image Classification Neural Networks with Partial Differential Equations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Differential equations can be used to design neural networks. 
For instance, neural ordinary differential equations (neural ODEs) can be considered a continuous generalization of residual networks. In this work, we present a novel partial differential equation (PDE)-based approach for image classification, where we learn both a PDE's governing equation for image classification and its solution approximated by our neural network. In other words, the knowledge contained in the learned governing equation can be injected into the neural network that approximates the PDE solution function. Owing to recent advances in learning PDEs, the presented novel concept, called PR-Net, can be implemented. Our method shows comparable (or better) accuracy and robustness for various datasets and tasks in comparison with neural ODEs and Isometric MobileNet V3. Owing to its efficient nature, PR-Net is suitable for deployment in resource-scarce environments, e.g., as a replacement for MobileNet.", "keywords": "partial differential equations;image classification;differential equations", "primary_area": "", "supplementary_material": "/attachment/d6201a1835deb1e8a2d32a179c48b6f74b6b2dde.zip", "author": "Jungeun Kim;Seunghyun Hwang;Jeehyun Hwang;Kookjin Lee;Dongeun Lee;Noseong Park", "authorids": "~Jungeun_Kim2;~Seunghyun_Hwang1;~Jeehyun_Hwang1;~Kookjin_Lee1;~Dongeun_Lee1;~Noseong_Park1", "gender": "F;M;M;M;M;", "homepage": ";https://sites.google.com/view/johnhwang/publications;https://github.com/hwanggh96;https://scholar.google.com/citations?hl=en&user=KL89hVQAAAAJ&view_op=list_works;;", "dblp": "86/1254.html;288/0148;306/1727;122/5103;62/688;", "google_scholar": "ipQHR3wAAAAJ;hn3bN5kAAAAJ;;https://scholar.google.com/citations?hl=en;;", "orcid": "0000-0002-5341-726X;0000-0002-5900-6445;;;;", "linkedin": "jungeun-kim-383bb218a/;;;;;", "or_profile": "~Jungeun_Kim2;~Seunghyun_Hwang1;~Jeehyun_Hwang1;~Kookjin_Lee1;~Dongeun_Lee1;~Noseong_Park1", "aff": "Yonsei University;Purdue University;Yonsei Univ.;Arizona State University;East Texas A&M University;", "aff_domain": "yonsei.ac.kr;purdue.edu;yonsei.ac.kr;asu.edu;tamuc.edu;", "position": "MS student;PhD student;Undergrad student;Assistant Professor;Assistant Professor;", "bibtex": "@misc{\nkim2022regularizing,\ntitle={Regularizing Image Classification Neural Networks with Partial Differential Equations},\nauthor={Jungeun Kim and Seunghyun Hwang and Jeehyun Hwang and Kookjin Lee and Dongeun Lee and Noseong Park},\nyear={2022},\nurl={https://openreview.net/forum?id=vMWl7Ta1ymW}\n}", "github": "", "project": "", "reviewers": "oi9x;2doa;uwo2;Xi54;Z9p9", "site": "https://openreview.net/forum?id=vMWl7Ta1ymW", "pdf_size": 0, "recommendation": "3;3;3;5;5", "confidence": "2;4;4;4;3", "correctness": "2;3;3;2;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "11;76;37;101;75", "wc_summary_review": "27;103;54;44;85", "wc_main_review": "802;509;463;439;260", "wc_review": "840;688;554;584;420", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 3.8, 0.9797958971132712 ], "confidence_avg": [ 3.4, 0.8 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 60.0, 31.91238004286111 ], "wc_summary_review_avg": [ 62.6, 27.644891028904418 ], "wc_main_review_avg": [ 494.6, 175.4019384157427 ], "wc_review_avg": [ 617.2, 140.45412062307037 ], "wc_reply_reviewers_avg": [ 0, 0 
], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.10206207261596574, "corr_recommendation_correctness": -0.16666666666666663, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12396301758259928966&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Yonsei University;Purdue University;Arizona State University;East Texas A&M University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.purdue.edu;https://www.asu.edu;https://www.etam.edu", "aff_unique_abbr": "Yonsei;Purdue;ASU;ETAMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "South Korea;United States" }, { "id": "vMYCSy4VwvD", "title": "Multi-Domain Active Learning: A Comparative Study", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Multi-domain learning (MDL) refers to learning a set of models simultaneously, with each one specialized to perform a task in a certain domain. Generally, high labeling effort is required in MDL, as data need to be labeled by human experts for every domain. Active learning (AL), which reduces labeling effort by only using the most informative data, can be utilized to address the above issue. The resultant paradigm is termed multi-domain active learning (MDAL). However, currently little research has been done in MDAL, not to mention any off-the-shelf solution. To fill this gap, we present a comprehensive comparative study of 20 different MDAL algorithms, which are established by combining five representative MDL models under different information-sharing schemes and four well-used AL strategies belonging to different categories. We evaluate the algorithms on five datasets, involving textual and visual classification tasks. We find that the models which capture both domain-dependent and domain-specific information are more likely to perform well in the whole AL loops. Besides, the simplest informativebased uncertainty strategy surprisingly performs well on most datasets. 
As our off-the-shelf recommendation, the combination of Multinomial Adversarial Networks (MAN) with the best vs second best (BvSB) uncertainty strategy shows its superiority in most cases, and this combination is also robust across datasets and domains.", "keywords": "active learning;multi-domain Learning", "primary_area": "", "supplementary_material": "", "author": "Rui He;Shengcai Liu;Shan He;Ke Tang", "authorids": "~Rui_He3;~Shengcai_Liu1;s.he@cs.bham.ac.uk;~Ke_Tang2", "gender": "M;;;M", "homepage": ";;;https://faculty.sustech.edu.cn/tangk3/", "dblp": "28/6300;;;https://dblp.uni-trier.de/pers/hd/t/Tang:Ke.html", "google_scholar": "9do30b8AAAAJ;;;mzLHFbAAAAAJ", "orcid": ";;;0000-0002-6236-2002", "linkedin": ";;;", "or_profile": "~Rui_He3;~Shengcai_Liu1;s.he@cs.bham.ac.uk;~Ke_Tang2", "aff": "Southern University of Science and Technology;;;Southern University of Science and Technology", "aff_domain": "sustech.edu.cn;;;sustech.edu.cn", "position": "PhD student;;;Full Professor", "bibtex": "@misc{\nhe2022multidomain,\ntitle={Multi-Domain Active Learning: A Comparative Study},\nauthor={Rui He and Shengcai Liu and Shan He and Ke Tang},\nyear={2022},\nurl={https://openreview.net/forum?id=vMYCSy4VwvD}\n}", "github": "", "project": "", "reviewers": "q9Ff;dGt9;UKj6;nv63", "site": "https://openreview.net/forum?id=vMYCSy4VwvD", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;4", "correctness": "4;4;3;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "65;50;36;132", "wc_summary_review": "84;16;17;39", "wc_main_review": "227;60;228;132", "wc_review": "376;126;281;303", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 70.75, 36.819661866997095 ], "wc_summary_review_avg": [ 39.0, 27.55902755904134 ], "wc_main_review_avg": [ 161.75, 70.50664862266537 ], "wc_review_avg": [ 271.5, 91.06728281880382 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13039521841022823763&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Southern University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.sustech.edu.cn", "aff_unique_abbr": "SUSTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "vNDHZZa-Q92", "title": "Neural Extensions: Training Neural Networks with Set Functions", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Integrating discrete computational steps into deep learning architectures is an important consideration when learning to reason over discrete items. However, many tasks that involve discrete choices are defined via (combinatorial) set functions, and thereby pose challenges for end-to-end training. In this work, we explore a general framework to construct continuous extensions of such discrete functions that enables training via gradient methods. 
Our framework includes well-known extensions such as the Lovasz extension of submodular set functions and facilitates the design of novel continuous extensions based on problem-specific considerations, including constraints. We demonstrate the versatility of our framework on tasks ranging from combinatorial optimization to image classification. ", "keywords": "continuous extensions;algorithmic reasoning;set functions;machine learning;combinatorial optimization;image classification", "primary_area": "", "supplementary_material": "", "author": "Nikolaos Karalias;Joshua David Robinson;Andreas Loukas;Stefanie Jegelka", "authorids": "~Nikolaos_Karalias1;~Joshua_David_Robinson1;~Andreas_Loukas1;~Stefanie_Jegelka3", "gender": "M;M;M;F", "homepage": ";https://joshrobinson.mit.edu/;;http://people.csail.mit.edu/stefje/", "dblp": "267/9290;15/4759;19/10012;38/7003", "google_scholar": "CRLG9UcAAAAJ;E02doCkAAAAJ;https://scholar.google.ch/citations?user=-XGXJbQAAAAJ;gTWUZlsAAAAJ", "orcid": "0000-0002-9471-5343;;;", "linkedin": ";;;", "or_profile": "~Nikolaos_Karalias1;~Joshua_David_Robinson1;~Andreas_Loukas1;~Stefanie_Jegelka3", "aff": "Swiss Federal Institute of Technology Lausanne;Massachusetts Institute of Technology;Roche / Genentech;Massachusetts Institute of Technology", "aff_domain": "epfl.ch;mit.edu;roche.com;mit.edu", "position": "PhD student;PhD student;Principal Researcher;Associate Professor", "bibtex": "@misc{\nkaralias2022neural,\ntitle={Neural Extensions: Training Neural Networks with Set Functions},\nauthor={Nikolaos Karalias and Joshua David Robinson and Andreas Loukas and Stefanie Jegelka},\nyear={2022},\nurl={https://openreview.net/forum?id=vNDHZZa-Q92}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=vNDHZZa-Q92", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UMhcC-jPwREJ:scholar.google.com/&scioq=Neural+Extensions:+Training+Neural+Networks+with+Set+Functions&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;Massachusetts Institute of Technology;Roche", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://web.mit.edu;https://www.roche.com", "aff_unique_abbr": "EPFL;MIT;Roche", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Switzerland;United States" }, { "id": "vPK-G5HbnWg", "title": "PACE: A Parallelizable Computation Encoder for Directed Acyclic Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimization of directed acyclic graph (DAG) structures has many 
applications, such as neural architecture search (NAS) and probabilistic graphical model learning. Encoding DAGs into real vectors is a dominant component in most neural-network-based DAG optimization frameworks. Currently, most popular DAG encoders use an asynchronous message passing scheme which sequentially processes nodes according to the dependency between nodes in a DAG. That is, a node must not be processed until all its predecessors are processed. As a result, they are inherently not parallelizable. In this work, we propose a Parallelizable Attention-based Computation structure Encoder (PACE) that processes nodes simultaneously and encodes DAGs in parallel. We demonstrate the superiority of PACE through encoder-dependent optimization subroutines that search the optimal DAG structure based on the learned DAG embeddings. Experiments show that PACE not only improves the effectiveness over previous sequential DAG encoders with a significantly boosted training and inference speed, but also generates smooth latent (DAG encoding) spaces that are beneficial to downstream optimization subroutines.", "keywords": "DAG encoder;graph neural network;Transformer", "primary_area": "", "supplementary_material": "/attachment/0ccc50f3928ffcc1bd54313d09021bdb29baaf1f.zip", "author": "Zehao Dong;Muhan Zhang;Fuhai Li;Yixin Chen", "authorids": "~Zehao_Dong1;~Muhan_Zhang1;~Fuhai_Li1;~Yixin_Chen1", "gender": "M;M;M;M", "homepage": "https://www.zehaodong.com;https://muhanzhang.github.io/;https://profiles.wustl.edu/en/persons/fuhai-li;https://www.cse.wustl.edu/~yixin.chen/", "dblp": "292/7480;157/5518;;59/983", "google_scholar": ";https://scholar.google.com.hk/citations?user=OBBqkosAAAAJ;rVZfU9sAAAAJ;NByrsK0AAAAJ", "orcid": ";0000-0002-7680-6401;0000-0002-3773-146X;", "linkedin": ";jerry-muhan-zhang-a33a1777/;fuhai-li-1b05611a/;", "or_profile": "~Zehao_Dong1;~Muhan_Zhang1;~Fuhai_Li1;~Yixin_Chen1", "aff": "Washington University, St. 
Louis;Peking University;Washington University, Saint Louis;Washington University, Saint Louis", "aff_domain": "wustl.edu;pku.edu.cn;wustl.edu;wustl.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@misc{\ndong2022pace,\ntitle={{PACE}: A Parallelizable Computation Encoder for Directed Acyclic Graphs},\nauthor={Zehao Dong and Muhan Zhang and Fuhai Li and Yixin Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=vPK-G5HbnWg}\n}", "github": "", "project": "", "reviewers": "vSGS;rAFV;d1eo;1m8b", "site": "https://openreview.net/forum?id=vPK-G5HbnWg", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "5;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "121;65;80;72", "wc_summary_review": "102;64;34;23", "wc_main_review": "994;398;229;555", "wc_review": "1217;527;343;650", "wc_reply_reviewers": "652;205;0;0", "wc_reply_authors": "1086;1008;1162;941", "reply_reviewers": "1;1;0;0", "reply_authors": "2;2;2;2", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 84.5, 21.73131381210073 ], "wc_summary_review_avg": [ 55.75, 30.629846555280032 ], "wc_main_review_avg": [ 544.0, 284.23669713814223 ], "wc_review_avg": [ 684.25, 326.4103054439305 ], "wc_reply_reviewers_avg": [ 214.25, 266.2314547531903 ], "wc_reply_authors_avg": [ 1049.25, 82.88961032602337 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 1.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11354614986119464774&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Washington University in St. Louis;Peking University", "aff_unique_dep": ";", "aff_unique_url": "https://wustl.edu;http://www.pku.edu.cn", "aff_unique_abbr": "WUSTL;Peking U", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "St. Louis;;Saint Louis", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "vQ58AMOw4Il", "title": "Hermitry Ratio: Evaluating the validity of perturbation methods for explainable deep learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Perturbation methods are model-agnostic methods used to generate heatmaps to explain black-box algorithms such as deep neural networks. Perturbation methods work by perturbing the input image. However, by perturbing parts of the input image we are changing the underlying structure of the image, potentially generating out-of-distribution (OOD) data. This would violate one of the core assumptions in supervised learning, namely that the training and test data come from the same distribution. \nIn this study, we coin the term hermitry ratio to quantify the utility of perturbation methods by looking at the number of OOD samples they produce. Using this metric, we observe the utility of XAI methods (Occlusion analysis, LIME, Anchor LIME, Kernel SHAP) for image classification models ResNet50, DenseNet121 and MnasNet1.0 on three classes of the ImageNet dataset. Our results show that, to some extent, \emph{all} four perturbation methods generate OOD data regardless of architecture or image class. 
Occlusion analysis primarily produces in-distribution perturbations while LIME produces mostly OOD perturbations. ", "keywords": "deep learning;explainability;heatmaps;out-of-distribution detection", "primary_area": "", "supplementary_material": "", "author": "Gabrielle Ras;Erdi \u00c7all\u0131;Marcel van Gerven", "authorids": "~Gabrielle_Ras1;~Erdi_\u00c7all\u01311;~Marcel_van_Gerven1", "gender": "F;M;M", "homepage": "https://gabiras.com/;;http://www.artcogsys.com", "dblp": ";;", "google_scholar": "EyH9zNoAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-2206-9098", "linkedin": "gabrielle-ras-7a3a9678/;;marcel-van-gerven-8698784/", "or_profile": "~Gabrielle_Ras1;~Erdi_\u00c7all\u01311;~Marcel_van_Gerven1", "aff": "Radboud University Nijmegen;Radboud University Medical Center;Donders Institute for Brain, Cognition and Behaviour, Radboud University", "aff_domain": "ru.nl;radboudumc.nl;ru.nl", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nras2022hermitry,\ntitle={Hermitry Ratio: Evaluating the validity of perturbation methods for explainable deep learning},\nauthor={Gabrielle Ras and Erdi {\\c{C}}all{\\i} and Marcel van Gerven},\nyear={2022},\nurl={https://openreview.net/forum?id=vQ58AMOw4Il}\n}", "github": "", "project": "", "reviewers": "kCnT;VpWf;UZkz;xtip", "site": "https://openreview.net/forum?id=vQ58AMOw4Il", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;3;4;5", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "77;48;84;155", "wc_summary_review": "28;37;11;94", "wc_main_review": "232;297;273;229", "wc_review": "337;382;368;478", "wc_reply_reviewers": "0;18;0;0", "wc_reply_authors": "51;118;95;165", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 91.0, 39.338276525541886 ], "wc_summary_review_avg": [ 42.5, 31.164884084494844 ], "wc_main_review_avg": [ 257.75, 28.560243346302215 ], "wc_review_avg": [ 391.25, 52.66580959218229 ], "wc_reply_reviewers_avg": [ 4.5, 7.794228634059948 ], "wc_reply_authors_avg": [ 107.25, 41.12405014100629 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Y83arixOeu4J:scholar.google.com/&scioq=Hermitry+Ratio:+Evaluating+the+validity+of+perturbation+methods+for+explainable+deep+learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Radboud University;Radboud University Medical Center", "aff_unique_dep": ";", "aff_unique_url": "https://www.ru.nl/;https://www.radboudumc.nl", "aff_unique_abbr": "RU;RadboudUMC", "aff_campus_unique_index": "0", "aff_campus_unique": "Nijmegen;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "id": "vQmIksuciu2", "title": "EXPLAINABLE AI-BASED DYNAMIC FILTER PRUNING OF CONVOLUTIONAL NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Filter pruning is one of the most effective ways to accelerate Convolutional Neural Networks (CNNs). 
Most of the existing works are focused on the static pruning of CNN filters. In dynamic pruning of CNN filters, existing works are based on the idea of switching between different branches of a CNN or exiting early based on the difficulty of a sample. These approaches can reduce the average latency of inference, but they cannot reduce the longest-path latency of inference. In contrast, we present a novel approach to dynamic filter pruning that utilizes explainable AI along with early coarse prediction in the intermediate layers of a CNN. This coarse prediction is performed using a simple branch that is trained to perform top-k classification. The branch either predicts the output class with high confidence, in which case the rest of the computations are skipped, or it predicts the output class to be within a subset of possible output classes. After this coarse prediction, only those filters that are important for this subset of classes are utilized for further computations. The importance of each filter for each output class is obtained using explainable AI. Using this dynamic pruning architecture, we reduce not only the average inference latency but also the longest-path inference latency. Our proposed architecture for dynamic pruning can be deployed on different hardware platforms. We evaluate our approach using commonly used image classification models and datasets on CPU and GPU platforms and demonstrate speedup without significant overhead.", "keywords": "XAI;Pruning;CNN;Explainable-AI", "primary_area": "", "supplementary_material": "", "author": "Muhammad Sabih;Frank Hannig;J\u00fcrgen Teich", "authorids": "~Muhammad_Sabih1;frank.hannig@fau.de;jeurgen.teich@fau.de", "gender": "M;;", "homepage": "https://www.cs12.tf.fau.eu/person/muhammad-sabih/;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Muhammad_Sabih1;frank.hannig@fau.de;jeurgen.teich@fau.de", "aff": "Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg;;", "aff_domain": "fau.eu;;", "position": "PhD student;;", "bibtex": "@misc{\nsabih2022explainable,\ntitle={{EXPLAINABLE} {AI}-{BASED} {DYNAMIC} {FILTER} {PRUNING} {OF} {CONVOLUTIONAL} {NEURAL} {NETWORKS}},\nauthor={Muhammad Sabih and Frank Hannig and J{\"u}rgen Teich},\nyear={2022},\nurl={https://openreview.net/forum?id=vQmIksuciu2}\n}", "github": "", "project": "", "reviewers": "kGTk;PWoD;vKUc;nAuM", "site": "https://openreview.net/forum?id=vQmIksuciu2", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "4;3;5;4", "correctness": "2;3;2;3", "technical_novelty": "1;2;1;2", "empirical_novelty": "1;2;1;3", "wc_summary_paper": "41;83;166;74", "wc_summary_review": "33;32;34;52", "wc_main_review": "189;379;358;380", "wc_review": "263;494;558;506", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 91.0, 46.03802775966842 ], "wc_summary_review_avg": [ 37.75, 8.257572282456872 ], "wc_main_review_avg": [ 326.5, 79.87020721145025 ], "wc_review_avg": [ 455.25, 113.57238880995679 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7071067811865476, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oWbUiFh6Jy0J:scholar.google.com/&scioq=EXPLAINABLE+AI-BASED+DYNAMIC+FILTER+PRUNING+OF+CONVOLUTIONAL+NEURAL+NETWORKS&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg", "aff_unique_dep": "", "aff_unique_url": "https://www fau.de", "aff_unique_abbr": "FAU", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "id": "vRhkfX8G_H9", "title": "SpSC: A Fast and Provable Algorithm for Sampling-Based GNN Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neighbor sampling is a commonly used technique for training Graph Neural Networks (GNNs) on large graphs. Previous work has shown that sampling-based GNN training can be considered as Stochastic Compositional Optimization (SCO) problems and can be better solved by SCO algorithms. However, we find that SCO algorithms are impractical for training GNNs on large graphs because they need to store the moving averages of the aggregated features of all nodes in the graph. The moving averages can easily exceed the GPU memory limit and even the CPU memory limit. In this work, we propose a variant of SCO algorithms with sparse moving averages for GNN training. By storing the moving averages in the most recent iterations, our algorithm only requires a fixed size buffer, regardless of the graph size. We show that our algorithm can achieve $O(\\sqrt{1/K})$ convergence rate when the buffer size satisfies certain conditions. Our experiments validate our theoretical results and show that our algorithm outperforms the traditional Adam SGD for GNN training with a small memory overhead.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shihui Song;Peng Jiang", "authorids": "~Shihui_Song1;~Peng_Jiang4", "gender": "F;", "homepage": "https://songshsongsh.github.io/;https://homepage.divms.uiowa.edu/~penjiang/", "dblp": ";92/1104-4", "google_scholar": ";CxfXT14AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Shihui_Song1;~Peng_Jiang4", "aff": "University of Iowa;University of Iowa", "aff_domain": "iowa.edu;uiowa.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nsong2022spsc,\ntitle={Sp{SC}: A Fast and Provable Algorithm for Sampling-Based {GNN} Training},\nauthor={Shihui Song and Peng Jiang},\nyear={2022},\nurl={https://openreview.net/forum?id=vRhkfX8G_H9}\n}", "github": "", "project": "", "reviewers": "YqXv;7iri;pkB7;xK4K", "site": "https://openreview.net/forum?id=vRhkfX8G_H9", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;3;3", "correctness": "3;2;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "54;34;89;85", "wc_summary_review": "36;25;167;51", "wc_main_review": "299;265;494;219", "wc_review": "389;324;750;355", "wc_reply_reviewers": "0;0;59;0", "wc_reply_authors": "84;266;116;160", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 65.5, 22.677080940897135 ], "wc_summary_review_avg": [ 69.75, 56.90068101525675 ], "wc_main_review_avg": [ 319.25, 104.81024520532333 ], "wc_review_avg": [ 454.5, 
172.14891809128514 ], "wc_reply_reviewers_avg": [ 14.75, 25.54774941164094 ], "wc_reply_authors_avg": [ 156.5, 68.73681691786433 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Hh8prQKDQ9AJ:scholar.google.com/&scioq=SpSC:+A+Fast+and+Provable+Algorithm+for+Sampling-Based+GNN+Training&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Iowa", "aff_unique_dep": "", "aff_unique_url": "https://www.uiowa.edu", "aff_unique_abbr": "UIowa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Message Passing Neural PDE Solvers", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7134", "id": "vSix3HPYKSU", "poster": "", "openreview": "https://openreview.net/forum?id=vSix3HPYKSU", "slides": "https://iclr.cc/virtual/2022/poster/7134", "video": "https://iclr.cc/virtual/2022/poster/7134", "author_site": "Johannes Brandstetter, Daniel Worrall, Max Welling", "tldr": "", "abstract": "The numerical solution of partial differential equations (PDEs) is difficult, having led to a century of research so far. Recently, there have been pushes to build neural-numerical hybrid solvers, piggy-backing on the modern trend towards fully end-to-end learned systems. Most works so far can only generalize over a subset of the properties that a generic solver would face, including resolution, topology, geometry, boundary conditions, domain discretization regularity, dimensionality, etc. In this work, we build a solver satisfying these properties, where all the components are based on neural message passing, replacing all heuristically designed components in the computation graph with backprop-optimized neural function approximators. We show that neural message passing solvers representationally contain some classical methods, such as finite differences, finite volumes, and WENO schemes. In order to encourage stability in training autoregressive models, we put forward a method that is based on the principle of zero-stability, posing stability as a domain adaptation problem. We validate our method on various fluid-like flow problems, demonstrating fast, stable, and accurate performance across different domain topologies, discretizations, etc., in 1D and 2D. Our model outperforms state-of-the-art numerical solvers in the low-resolution regime in terms of speed and accuracy.", "keywords": "neural PDE solvers;message passing;autoregressive models;zero-stability", "primary_area": "", "supplementary_material": "", "author": "Johannes Brandstetter;Daniel E. 
Worrall;Max Welling", "authorids": "~Johannes_Brandstetter1;~Daniel_E._Worrall1;~Max_Welling1", "gender": "M;M;M", "homepage": ";https://danielewworrall.github.io/;https://staff.fnwi.uva.nl/m.welling/", "dblp": "251/8691;187/1680;16/2286", "google_scholar": "KiRvOHcAAAAJ;613GPbQAAAAJ;https://scholar.google.nl/citations?user=8200InoAAAAJ", "orcid": ";;0000-0003-1484-2121", "linkedin": ";daniel-worrall-46a43238/;", "or_profile": "~Johannes_Brandstetter1;~Daniel_E._Worrall1;~Max_Welling1", "aff": "Microsoft;Qualcomm Inc, QualComm;University of Amsterdam", "aff_domain": "microsoft.com;qti.qualcomm.com;uva.nl", "position": "Researcher;Postdoc;Full Professor", "bibtex": "@inproceedings{\nbrandstetter2022message,\ntitle={Message Passing Neural {PDE} Solvers},\nauthor={Johannes Brandstetter and Daniel E. Worrall and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vSix3HPYKSU}\n}", "github": "", "project": "", "reviewers": "6VsU;5QFf;wxXT;mh3h", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;4;4", "correctness": "3;3;4;4", "technical_novelty": "3;2;4;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "104;13;161;56", "wc_summary_review": "31;38;102;20", "wc_main_review": "141;89;575;83", "wc_review": "276;140;838;159", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "569;395;530;32", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 83.5, 55.12032293083922 ], "wc_summary_review_avg": [ 47.75, 31.971667144520318 ], "wc_main_review_avg": [ 222.0, 205.04877468543918 ], "wc_review_avg": [ 353.25, 284.674704706969 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 381.5, 211.86139336840017 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 371, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9088135830297201356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=vSix3HPYKSU", "email": "microsoft.com;qti.qualcomm.com;uva.nl", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Microsoft;Qualcomm Incorporated;University of Amsterdam", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.qualcomm.com;https://www.uva.nl", "aff_unique_abbr": "Microsoft;Qualcomm;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Netherlands" }, { "title": "A Neural Tangent Kernel Perspective of Infinite Tree Ensembles", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7111", "id": "vUH85MOXO7h", "poster": "", "openreview": "https://openreview.net/forum?id=vUH85MOXO7h", "slides": "https://iclr.cc/virtual/2022/poster/7111", "video": "https://iclr.cc/virtual/2022/poster/7111", "author_site": "Ryuichi Kanoh, Mahito Sugiyama", "tldr": "", "abstract": "In practical situations, the tree ensemble is one of the most popular models along with neural networks. A soft tree is a variant of a decision tree. 
Instead of using a greedy method for searching splitting rules, the soft tree is trained using a gradient method in which the entire splitting operation is formulated in a differentiable form. Although ensembles of such soft trees have been used increasingly in recent years, little theoretical work has been done to understand their behavior. By considering an ensemble of infinite soft trees, this paper introduces and studies the Tree Neural Tangent Kernel (TNTK), which provides new insights into the behavior of the infinite ensemble of soft trees. Using the TNTK, we theoretically identify several non-trivial properties, such as global convergence of the training, the equivalence of the oblivious tree structure, and the degeneracy of the TNTK induced by the deepening of the trees.", "keywords": "Neural Tangent Kernel;Tree Ensemble;Soft Tree", "primary_area": "", "supplementary_material": "/attachment/4dbd79f04fb1a962d127105279eb63dee09e444f.zip", "author": "Ryuichi Kanoh;Mahito Sugiyama", "authorids": "~Ryuichi_Kanoh1;~Mahito_Sugiyama1", "gender": "M;M", "homepage": ";https://mahito.nii.ac.jp/", "dblp": "287/4416;05/8421", "google_scholar": ";qLlRvTkAAAAJ", "orcid": ";0000-0001-5907-9831", "linkedin": "ryuichi-kanoh-43ab4316b/;", "or_profile": "~Ryuichi_Kanoh1;~Mahito_Sugiyama1", "aff": "NII, the Graduate University for Advanced Studies;National Institute of Informatics", "aff_domain": "nii.ac.jp;nii.ac.jp", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nkanoh2022a,\ntitle={A Neural Tangent Kernel Perspective of Infinite Tree Ensembles},\nauthor={Ryuichi Kanoh and Mahito Sugiyama},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vUH85MOXO7h}\n}", "github": "", "project": "", "reviewers": "r8ZN;3ieY;yf2B", "pdf_size": 0, "recommendation": "3;8;8", "confidence": "2;3;3", "correctness": "3;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "114;62;57", "wc_summary_review": "38;13;6", "wc_main_review": "484;168;77", "wc_review": "636;243;140", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1200;216;208", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "recommendation_avg": [ 6.333333333333333, 2.357022603955158 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 77.66666666666667, 25.77250904010361 ], "wc_summary_review_avg": [ 19.0, 13.73559851869101 ], "wc_main_review_avg": [ 243.0, 174.41521340372424 ], "wc_review_avg": [ 339.6666666666667, 213.71684278242765 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 541.3333333333334, 465.75911752273356 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6263310966381003776&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=vUH85MOXO7h", "email": "nii.ac.jp;nii.ac.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "National Institute of Informatics", "aff_unique_dep": "", "aff_unique_url": "https://www.nii.ac.jp", "aff_unique_abbr": 
"NII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "id": "vUvEyDA30k", "title": "Staircase Sign Method for Boosting Adversarial Attacks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Crafting adversarial examples for the transfer-based attack is challenging and remains a research hot spot. Currently, such attack methods are based on the hypothesis that the substitute model and the victim's model learn similar decision boundaries, and they conventionally apply Sign Method (SM) to manipulate the gradient as the resultant perturbation. Although SM is efficient, it only extracts the sign of gradient units but ignores their value difference, which inevitably leads to a serious deviation. Therefore, we propose a novel Staircase Sign Method (S$^2$M) to alleviate this issue, thus boosting transfer-based attacks. Technically, our method heuristically divides the gradient sign into several segments according to the values of the gradient units, and then assigns each segment with a staircase weight for better crafting adversarial perturbation. As a result, our adversarial examples perform better in both white-box and black-box manner without being more visible. Since S$^2$M just manipulates the resultant gradient, our method can be generally integrated into any transfer-based attacks, and the computational overhead is negligible. Extensive experiments on the ImageNet dataset demonstrate the effectiveness of our proposed methods, which significantly improve the transferability (i.e., on average, \\textbf{5.1\\%} for normally trained models and \\textbf{11.2\\%} for adversarially trained defenses). Our code is available in the supplementary material.", "keywords": "black-box;transferable perturbation;simple but effective;non-targeted&targeted attacks.", "primary_area": "", "supplementary_material": "/attachment/a4e59fb826d5e0c8f67073222891ed7c9f822384.zip", "author": "Qilong Zhang;Xiaosu Zhu;Jingkuan Song;Lianli Gao", "authorids": "~Qilong_Zhang2;~Xiaosu_Zhu1;~Jingkuan_Song3;~Lianli_Gao1", "gender": "M;M;M;F", "homepage": ";https://github.com/xiaosu-zhu;https://cfm.uestc.edu.cn/~songjingkuan/;https://lianligao.github.io/", "dblp": "22/3730;243/3461;70/10575;123/9849.html", "google_scholar": "IgPyQWYAAAAJ;2DihiQ0AAAAJ;F5Zy9V4AAAAJ;https://scholar.google.com.au/citations?user=zsm2dpYAAAAJ", "orcid": "0009-0005-2591-5762;0000-0001-7728-2518;;", "linkedin": ";;;", "or_profile": "~Qilong_Zhang2;~Xiaosu_Zhu1;~Jingkuan_Song3;~Lianli_Gao1", "aff": "University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China,;University of Electronic Science and Technology of China", "aff_domain": "uestc.edu;uestc.edu.cn;uestc.edu.cn;uestc.edu.cn", "position": "MS student;PhD student;Full Professor;Full Professor", "bibtex": "@misc{\nzhang2022staircase,\ntitle={Staircase Sign Method for Boosting Adversarial Attacks},\nauthor={Qilong Zhang and Xiaosu Zhu and Jingkuan Song and Lianli Gao},\nyear={2022},\nurl={https://openreview.net/forum?id=vUvEyDA30k}\n}", "github": "", "project": "", "reviewers": "jqB8;PEff;ZznF;MiLX", "site": "https://openreview.net/forum?id=vUvEyDA30k", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;4;4", "correctness": "3;3;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;0;2;3", "wc_summary_paper": "65;59;61;86", "wc_summary_review": "84;69;57;69", "wc_main_review": 
"546;308;178;282", "wc_review": "695;436;296;437", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 67.75, 10.755812382149477 ], "wc_summary_review_avg": [ 69.75, 9.575359001102779 ], "wc_main_review_avg": [ 328.5, 134.66532590091632 ], "wc_review_avg": [ 466.0, 144.11974188153405 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6488856845230502, "corr_recommendation_correctness": 0.13245323570650439, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8108001188264490510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Electronic Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "https://www.uestc.edu.cn", "aff_unique_abbr": "UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "vXGcHthY6v", "title": "Invariance Through Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a general approach, called invariance through inference, for improving the test-time performance of a behavior agent in deployment environments with unknown perceptual variations. Instead of producing invariant visual features through memorization, invariance through inference turns adaptation at deployment-time into an unsupervised learning problem by trying to match the distribution of latent features to the agent's prior experience without relying on paired data. Although simple, we show that this idea leads to surprising improvements on a variety of adaptation scenarios without access to task reward, including changes in camera poses from the challenging distractor control suite. ", "keywords": "invariance;representation learning;out-of-distribution;domain adaptation;generalization", "primary_area": "", "supplementary_material": "/attachment/aefefb6daeb2afce4e16c55bb913568e169d1bc6.zip", "author": "Takuma Yoneda;Ge Yang;Matthew Walter;Bradly C. Stadie", "authorids": "~Takuma_Yoneda1;~Ge_Yang1;~Matthew_Walter1;~Bradly_C._Stadie1", "gender": "M;M;M;", "homepage": "https://takuma.yoneda.xyz/;http://www.episodeyang.com;http://ttic.edu/walter;", "dblp": ";48/4561-3;50/7734;166/1368", "google_scholar": "EtYv_AIAAAAJ;vaQcF6kAAAAJ;RAiewnEAAAAJ;", "orcid": ";0000-0001-7520-7055;0000-0003-1425-6050;", "linkedin": ";;;", "or_profile": "~Takuma_Yoneda1;~Ge_Yang1;~Matthew_Walter1;~Bradly_C._Stadie1", "aff": "Toyota Technological Institute at Chicago;Massachusetts Institute of Technology;Toyota Technological Institute at Chicago;Toyota Technological Institute at Chicago", "aff_domain": "ttic.edu;mit.edu;ttic.edu;ttic.edu", "position": "PhD student;Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nyoneda2022invariance,\ntitle={Invariance Through Inference},\nauthor={Takuma Yoneda and Ge Yang and Matthew Walter and Bradly C. 
Stadie},\nyear={2022},\nurl={https://openreview.net/forum?id=vXGcHthY6v}\n}", "github": "", "project": "", "reviewers": "X3P2;HvyM;jjRF;wVDQ", "site": "https://openreview.net/forum?id=vXGcHthY6v", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;3;2", "correctness": "2;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;0", "wc_summary_paper": "91;86;657;55", "wc_summary_review": "84;80;190;138", "wc_main_review": "606;152;495;695", "wc_review": "781;318;1342;888", "wc_reply_reviewers": "0;0;0;79", "wc_reply_authors": "1958;430;2277;1224", "reply_reviewers": "0;0;0;1", "reply_authors": "4;2;4;3", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 222.25, 251.38155759721118 ], "wc_summary_review_avg": [ 123.0, 44.955533585978046 ], "wc_main_review_avg": [ 487.0, 205.98179531211005 ], "wc_review_avg": [ 832.25, 364.016740686469 ], "wc_reply_reviewers_avg": [ 19.75, 34.208003449485325 ], "wc_reply_authors_avg": [ 1472.25, 712.6515189768419 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.25, 0.82915619758885 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12753206805945199412&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Toyota Technological Institute at Chicago;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tti-chicago.org;https://web.mit.edu", "aff_unique_abbr": "TTI Chicago;MIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "v_gc2xDfXxR", "title": "PASS: Patch-Aware Self-Supervision for Vision Transformer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent self-supervised representation learning methods have shown impressive results in learning visual representations from unlabeled images. This paper aims to improve their performance further by utilizing the architectural advantages of the underlying neural network, as the current state-of-the-art visual pretext tasks for self-supervised learning do not enjoy this benefit, i.e., they are architecture-agnostic. In particular, we focus on Vision Transformers (ViTs), which have gained much attention recently as a better architectural choice, often outperforming convolutional networks for various visual tasks. The unique characteristic of ViT is that it takes a sequence of disjoint patches from an input image and processes patch-level representations internally. Inspired by this, we design a simple yet effective visual pretext task, coined Patch-Aware Self-Supervision (PASS), for learning better patch-level representations. To be specific, we enforce invariance between each patch and its neighbors, i.e., each patch treats similar neighboring patches as positive samples. Consequently, training ViTs with PASS produces more semantically meaningful patch-wise attention maps in an unsupervised manner, which can be beneficial, in particular, to downstream tasks of a dense prediction type. 
Despite the simplicity of our scheme, we demonstrate that it can significantly improve the performance of existing self-supervised learning methods for various visual tasks, including object detection and semantic segmentation.", "keywords": "Self-supervised learning;Vision Transformer;patch-level representations", "primary_area": "", "supplementary_material": "/attachment/56fa6f221472edb221cbcf2db3324f02fdc4296b.zip", "author": "Sukmin Yun;Hankook Lee;Jaehyung Kim;Jinwoo Shin", "authorids": "~Sukmin_Yun1;~Hankook_Lee1;~Jaehyung_Kim1;~Jinwoo_Shin1", "gender": ";M;M;M", "homepage": "https://sites.google.com/view/sukmin-yun;https://hankook.github.io;https://sites.google.com/view/jaehyungkim;https://sites.google.com/site/mijirim/", "dblp": "234/9078;223/4393;02/7206-1;31/7062", "google_scholar": "fQcZ_hQAAAAJ;CgqswXUAAAAJ;https://scholar.google.co.kr/citations?user=6OYOsGsAAAAJ;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ", "orcid": ";;;", "linkedin": "sukmin-yun-975b67129/;;;", "or_profile": "~Sukmin_Yun1;~Hankook_Lee1;~Jaehyung_Kim1;~Jinwoo_Shin1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nyun2022pass,\ntitle={{PASS}: Patch-Aware Self-Supervision for Vision Transformer},\nauthor={Sukmin Yun and Hankook Lee and Jaehyung Kim and Jinwoo Shin},\nyear={2022},\nurl={https://openreview.net/forum?id=v_gc2xDfXxR}\n}", "github": "", "project": "", "reviewers": "hDZU;HToY;ZtJz;6bk6;CJfW", "site": "https://openreview.net/forum?id=v_gc2xDfXxR", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;5;5;4;4", "correctness": "4;3;3;3;2", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "1;2;2;2;3", "wc_summary_paper": "116;72;81;50;111", "wc_summary_review": "82;107;16;23;31", "wc_main_review": "466;529;249;162;251", "wc_review": "664;708;346;235;393", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 4.4, 0.48989794855663565 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 86.0, 24.66576574931336 ], "wc_summary_review_avg": [ 51.8, 36.06327772124991 ], "wc_main_review_avg": [ 331.4, 140.7928975481363 ], "wc_review_avg": [ 469.2, 184.82791996881858 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.4082482904638631, "corr_recommendation_correctness": -0.790569415042095, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_6BGV-diM1AJ:scholar.google.com/&scioq=PASS:+Patch-Aware+Self-Supervision+for+Vision+Transformer&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" 
}, { "title": "Solving Inverse Problems in Medical Imaging with Score-Based Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7035", "id": "vaRCHVj0uGI", "poster": "", "openreview": "https://openreview.net/forum?id=vaRCHVj0uGI", "slides": "https://iclr.cc/virtual/2022/poster/7035", "video": "https://iclr.cc/virtual/2022/poster/7035", "author_site": "Yang Song, Liyue Shen, Lei Xing, Stefano Ermon", "tldr": "", "abstract": "Reconstructing medical images from partial measurements is an important inverse problem in Computed Tomography (CT) and Magnetic Resonance Imaging (MRI). Existing solutions based on machine learning typically train a model to directly map measurements to medical images, leveraging a training dataset of paired images and measurements. These measurements are typically synthesized from images using a fixed physical model of the measurement process, which hinders the generalization capability of models to unknown measurement processes. To address this issue, we propose a fully unsupervised technique for inverse problem solving, leveraging the recently introduced score-based generative models. Specifically, we first train a score-based generative model on medical images to capture their prior distribution. Given measurements and a physical model of the measurement process at test time, we introduce a sampling method to reconstruct an image consistent with both the prior and the observed measurements. Our method does not assume a fixed measurement process during training, and can thus be flexibly adapted to different measurement processes at test time. Empirically, we observe comparable or better performance to supervised learning techniques in several medical imaging tasks in CT and MRI, while demonstrating significantly better generalization to unknown measurement processes.", "keywords": "score-based generative modeling;inverse problems;sparse-view CT;undersampled MRI;metal artifact removal;diffusion", "primary_area": "", "supplementary_material": "", "author": "Yang Song;Liyue Shen;Lei Xing;Stefano Ermon", "authorids": "~Yang_Song1;~Liyue_Shen1;~Lei_Xing1;~Stefano_Ermon1", "gender": "M;F;M;M", "homepage": "https://yang-song.net;https://liyueshen.engin.umich.edu/;http://med.stanford.edu/xinglab.html;http://cs.stanford.edu/~ermon/", "dblp": ";159/2036;;47/8135", "google_scholar": "o_J2CroAAAAJ;Ho4qk9wAAAAJ;;", "orcid": ";0000-0001-5942-3196;;", "linkedin": ";;;", "or_profile": "~Yang_Song1;~Liyue_Shen1;~Lei_Xing1;~Stefano_Ermon1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Professor, Dept of Radiation Oncology,;Assistant Professor", "bibtex": "@inproceedings{\nsong2022solving,\ntitle={Solving Inverse Problems in Medical Imaging with Score-Based Generative Models},\nauthor={Yang Song and Liyue Shen and Lei Xing and Stefano Ermon},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vaRCHVj0uGI}\n}", "github": "", "project": "", "reviewers": "fqS4;YBf4;5fGt", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "3;3;4", "correctness": "4;3;3", "technical_novelty": "3;3;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "41;54;125", "wc_summary_review": "26;20;36", "wc_main_review": "432;195;517", "wc_review": "499;269;678", "wc_reply_reviewers": "32;16;176", "wc_reply_authors": "462;480;1061", 
"reply_reviewers": "1;1;3", "reply_authors": "2;2;5", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 73.33333333333333, 36.91732505056249 ], "wc_summary_review_avg": [ 27.333333333333332, 6.599663291074443 ], "wc_main_review_avg": [ 381.3333333333333, 136.25058613533457 ], "wc_review_avg": [ 482.0, 167.4056948453865 ], "wc_reply_reviewers_avg": [ 74.66666666666667, 71.95060033723755 ], "wc_reply_authors_avg": [ 667.6666666666666, 278.2257277024459 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 615, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16734106149627333689&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=vaRCHVj0uGI", "email": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Joint Shapley values: a measure of joint feature importance", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6484", "id": "vcUmUvQCloe", "poster": "", "openreview": "https://openreview.net/forum?id=vcUmUvQCloe", "slides": "https://iclr.cc/virtual/2022/poster/6484", "video": "https://iclr.cc/virtual/2022/poster/6484", "author_site": "Chris Harris, Richard Pymar, Colin Rowat", "tldr": "", "abstract": "The Shapley value is one of the most widely used measures of feature importance partly as it measures a feature's average effect on a model's prediction. We introduce joint Shapley values, which directly extend Shapley's axioms and intuitions: joint Shapley values measure a set of features' average effect on a model's prediction. We prove the uniqueness of joint Shapley values, for any order of explanation. Results for games show that joint Shapley values present different insights from existing interaction indices, which assess the effect of a feature within a set of features. The joint Shapley values seem to provide sensible results in ML attribution problems. 
With binary features, we present a presence-adjusted global value that is more consistent with local intuitions than the usual approach.", "keywords": "explainable AI;Shapley value;interaction index;cooperative game theory", "primary_area": "", "supplementary_material": "/attachment/c2c971b23f3cf858b09d283af3ced660356cd2fb.zip", "author": "Chris Harris;Richard Pymar;Colin Rowat", "authorids": "chrisharriscjh@gmail.com;~Richard_Pymar1;~Colin_Rowat1", "gender": ";M;M", "homepage": ";https://www.bbk.ac.uk/our-staff/profile/9112681/richard-pymar;http://socscistaff.bham.ac.uk/rowat/", "dblp": ";;51/9935", "google_scholar": ";;https://scholar.google.co.uk/citations?user=7gSp2_gAAAAJ", "orcid": ";;", "linkedin": ";;colinrowat", "or_profile": "chrisharriscjh@gmail.com;~Richard_Pymar1;~Colin_Rowat1", "aff": ";Birkbeck;Birmingham University", "aff_domain": ";bbk.ac.uk;bham.ac.uk", "position": ";Lecturer;Lecturer", "bibtex": "@inproceedings{\nharris2022joint,\ntitle={Joint Shapley values: a measure of joint feature importance},\nauthor={Chris Harris and Richard Pymar and Colin Rowat},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vcUmUvQCloe}\n}", "github": "", "project": "", "reviewers": "VFxY;QfeR;Djge", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "3;2;3", "correctness": "3;3;4", "technical_novelty": "3;3;4", "empirical_novelty": "2;3;4", "wc_summary_paper": "93;47;15", "wc_summary_review": "32;26;22", "wc_main_review": "489;90;98", "wc_review": "614;163;135", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "747;90;173", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 51.666666666666664, 32.013885876114585 ], "wc_summary_review_avg": [ 26.666666666666668, 4.109609335312651 ], "wc_main_review_avg": [ 225.66666666666666, 186.23342580989294 ], "wc_review_avg": [ 304.0, 219.50094912475132 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 336.6666666666667, 292.12135986416484 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5, "corr_recommendation_correctness": 0.5, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4894614344420722159&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=vcUmUvQCloe", "email": ";bbk.ac.uk;bham.ac.uk", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Birkbeck, University of London;University of Birmingham", "aff_unique_dep": ";", "aff_unique_url": "https://www.bbk.ac.uk;https://www.birmingham.ac.uk", "aff_unique_abbr": "Birkbeck;Birmingham", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "vdKncX1WclT", "title": "Red Alarm for Pre-trained Models: Universal Vulnerability to Neuron-Level Backdoor Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The pre-training-then-fine-tuning paradigm has been widely used in deep learning. 
Due to the huge computation cost for pre-training, practitioners usually download pre-trained models from the Internet and fine-tune them on downstream datasets while the downloaded models may suffer backdoor attacks. Different from previous attacks aiming at a target task, we show that a backdoored pre-trained model can behave maliciously in various downstream tasks without foreknowing task information. Attackers can restrict the output representations of trigger-embedded samples to arbitrary predefined values through additional training, namely Neuron-level Backdoor Attack (NeuBA). Since fine-tuning has little effect on model parameters, the fine-tuned model will retain the backdoor functionality and predict a specific label for the samples embedded with the same trigger. To provoke multiple labels in a specific task, attackers can introduce several triggers with contrastive predefined values. In the experiments of both natural language processing (NLP) and computer vision (CV), we show that NeuBA can well control the predictions for trigger-embedded instances with different trigger designs. Our findings sound a red alarm for the wide use of pre-trained models. Finally, we apply several defense methods to NeuBA and find that model pruning is a promising technique to resist NeuBA by omitting backdoored neurons.", "keywords": "Pre-trained models;Backdoor attacks", "primary_area": "", "supplementary_material": "", "author": "Zhengyan Zhang;Guangxuan Xiao;Yongwei Li;Tian Lv;Fanchao Qi;Zhiyuan Liu;Yasheng Wang;Xin Jiang;Maosong Sun", "authorids": "~Zhengyan_Zhang1;~Guangxuan_Xiao1;~Yongwei_Li1;~Tian_Lv1;~Fanchao_Qi1;~Zhiyuan_Liu1;~Yasheng_Wang1;~Xin_Jiang1;~Maosong_Sun1", "gender": "M;;M;;M;M;M;M;M", "homepage": ";;https://imagasaikou.cn;https://github.com/halo2718;;http://nlp.csai.tsinghua.edu.cn/~lzy;;;https://www.cs.tsinghua.edu.cn/csen/info/1312/4394.htm", "dblp": ";;;;228/5500;53/3245-1;57/8493;42/4142-2;95/3291-1", "google_scholar": ";;;;https://scholar.google.com/citations?hl=zh-CN;dT0v5u0AAAAJ;x-UYeJ4AAAAJ;DUfcez0AAAAJ;https://scholar.google.com.tw/citations?user=zIgT0HMAAAAJ", "orcid": ";;;;0000-0002-4400-4033;0000-0002-7709-2543;;0000-0002-9117-8247;", "linkedin": ";;;;%E5%87%A1%E8%B6%85-%E5%B2%82-885770a4/;;;xin-jiang-9577b76/;", "or_profile": "~Zhengyan_Zhang1;~Guangxuan_Xiao1;~Yongwei_Li1;~Tian_Lv1;~Fanchao_Qi1;~Zhiyuan_Liu1;~Yasheng_Wang1;~Xin_Jiang1;~Maosong_Sun1", "aff": "Tsinghua University;;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;;Noah\u2019s Ark Lab, Huawei Technologies;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;;huawei.com;tsinghua.edu.cn", "position": "PhD student;;Undergrad student;Undergrad student;PhD student;Associate Professor;;Principal Researcher;Full Professor", "bibtex": "@misc{\nzhang2022red,\ntitle={Red Alarm for Pre-trained Models: Universal Vulnerability to Neuron-Level Backdoor Attacks},\nauthor={Zhengyan Zhang and Guangxuan Xiao and Yongwei Li and Tian Lv and Fanchao Qi and Zhiyuan Liu and Yasheng Wang and Xin Jiang and Maosong Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=vdKncX1WclT}\n}", "github": "", "project": "", "reviewers": "6y76;CuwA;Zyx7;a3nV", "site": "https://openreview.net/forum?id=vdKncX1WclT", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;4", "correctness": "2;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "58;50;26;107", "wc_summary_review": 
"25;21;41;34", "wc_main_review": "396;297;371;279", "wc_review": "479;368;438;420", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "837;660;813;373", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 60.25, 29.448047473474364 ], "wc_summary_review_avg": [ 30.25, 7.790218225441442 ], "wc_main_review_avg": [ 335.75, 48.976397376695644 ], "wc_review_avg": [ 426.25, 39.85207020971433 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 670.75, 184.82745331795275 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12165482719658222154&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Tsinghua University;Huawei", "aff_unique_dep": ";Noah\u2019s Ark Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com", "aff_unique_abbr": "THU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "vdP_emhLjAt", "title": "Predicting Unreliable Predictions by Shattering a Neural Network", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Generalization error bounds measure the deviation of performance on unseen test data from performance on training data. However, by providing one scalar per model, they are input-agnostic. What if one wants to predict error for a specific test sample? To answer this, we propose the novel paradigm of input-conditioned generalization error bounds. For piecewise linear neural networks, given a weighting function that relates the errors of different input activation regions together, we obtain a bound on each region's generalization error that scales inversely with the density of training samples. That is, more densely supported regions are more reliable. 
As the bound is input-conditioned, it is to our knowledge the first generalization error bound applicable to the problems of detecting out-of-distribution and misclassified in-distribution samples for neural networks; we find that it performs competitively in both cases when tested on image classification tasks.", "keywords": "generalization;expressivity", "primary_area": "", "supplementary_material": "/attachment/81370c74e27f53a5bf52035f64e74e6ad75ebbe1.zip", "author": "Xu Ji;Razvan Pascanu;R Devon Hjelm;Andrea Vedaldi;Balaji Lakshminarayanan;Yoshua Bengio", "authorids": "~Xu_Ji1;~Razvan_Pascanu1;~R_Devon_Hjelm1;~Andrea_Vedaldi1;~Balaji_Lakshminarayanan1;~Yoshua_Bengio1", "gender": ";M;M;M;M;M", "homepage": ";https://razp.info;;https://www.robots.ox.ac.uk/~vedaldi/;http://www.gatsby.ucl.ac.uk/~balaji/;http://yoshuabengio.org", "dblp": ";65/8368.html;195/5928;99/2825;71/8324;56/953", "google_scholar": "https://scholar.google.co.uk/citations?user=HQzOgKYAAAAJ;https://scholar.google.ca/citations?user=eSPY8LwAAAAJ;https://scholar.google.ca/citations?user=68c5HfwAAAAJ;bRT7t28AAAAJ;QYn8RbgAAAAJ;kukA0LcAAAAJ", "orcid": ";;;0000-0003-1374-2858;;", "linkedin": ";;;;;yoshuabengio/?originalSubdomain=ca", "or_profile": "~Xu_Ji1;~Razvan_Pascanu1;~R_Devon_Hjelm1;~Andrea_Vedaldi1;~Balaji_Lakshminarayanan1;~Yoshua_Bengio1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Google DeepMind;Microsoft;Meta;Google Brain;University of Montreal", "aff_domain": "mila.umontreal.ca;google.com;microsoft.com;meta.com;google.com;umontreal.ca", "position": "Postdoc;Research Scientist;Researcher;Researcher;Research Scientist;Full Professor", "bibtex": "@misc{\nji2022predicting,\ntitle={Predicting Unreliable Predictions by Shattering a Neural Network},\nauthor={Xu Ji and Razvan Pascanu and R Devon Hjelm and Andrea Vedaldi and Balaji Lakshminarayanan and Yoshua Bengio},\nyear={2022},\nurl={https://openreview.net/forum?id=vdP_emhLjAt}\n}", "github": "", "project": "", "reviewers": "Rg2a;PGfc;Kjvq;DWBA", "site": "https://openreview.net/forum?id=vdP_emhLjAt", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "4;2;3;3", "correctness": "1;2;3;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "52;78;69;92", "wc_summary_review": "47;20;38;26", "wc_main_review": "431;142;681;580", "wc_review": "530;240;788;698", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "326;77;271;218", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 72.75, 14.515078366994786 ], "wc_summary_review_avg": [ 32.75, 10.473180032826706 ], "wc_main_review_avg": [ 458.5, 203.2221690662709 ], "wc_review_avg": [ 564.0, 208.72469906553943 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 223.0, 92.53918089112308 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.39605901719066966, "corr_recommendation_correctness": 0.8021806287494232, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17294432422099456374&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;1;0", "aff_unique_norm": "University of 
Montreal;Google;Microsoft;Meta", "aff_unique_dep": "Montreal Institute for Learning Algorithms;Google DeepMind;Microsoft Corporation;Meta Platforms, Inc.", "aff_unique_url": "https://www.umontreal.ca;https://deepmind.com;https://www.microsoft.com;https://meta.com", "aff_unique_abbr": "UM;DeepMind;Microsoft;Meta", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Montreal;;Mountain View", "aff_country_unique_index": "0;1;2;2;2;0", "aff_country_unique": "Canada;United Kingdom;United States" }, { "id": "vdbidlOkeF0", "title": "Scaling Densities For Improved Density Ratio Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Estimating the discrepancy between two densities ($p$ and $q$) is central to machine learning. Most frequently used methods for the quantification of this discrepancy capture it as a function of the ratio of the densities $p/q$. In practice, closed-form expressions for these densities or their ratio are rarely available. As such, estimating density ratios accurately using only samples from $p$ and $q$ is of high significance and has led to a flurry of recent work in this direction. Among these, binary classification based density ratio estimators have shown great promise and have been extremely successful in specialized domains. However, estimating the density ratio using a binary classifier, when the samples from the densities are well separated, remains challenging. In this work, we first show that the state-of-the-art solutions for such well-separated cases have limited applicability, may suffer from theoretical inconsistencies or lack formal guarantees and therefore perform poorly in the general case. We then present an alternative framework for density ratio estimation that is motivated by the scaled-Bregman divergence. Our proposal is to scale the densities $p$ and $q$ by another density $m$ and estimate $\\log p/q$ as $\\log p/m - \\log q/m$. We show that if the scaling measures are constructed such that they overlap with $p$ and $q$, then a single multi-class logistic regression can be trained to accurately recover $p/m$ and $q/m$ on samples from $p, q$ and $m$. We formally justify our method with the scaled-Bregman theorem and show that it does not suffer from the issues that plague the existing solutions.", "keywords": "density ratio estimation;scaled bregman divergence;mutual information estimation;representation learning", "primary_area": "", "supplementary_material": "/attachment/b9551994b8885109ca07142df4df32c57e347cfb.zip", "author": "Akash Srivastava;Seungwook Han;Benjamin Rhodes;Kai Xu;Michael U. 
Gutmann", "authorids": "~Akash_Srivastava1;~Seungwook_Han1;~Benjamin_Rhodes1;~Kai_Xu4;~Michael_U._Gutmann1", "gender": "M;;M;M;", "homepage": "http://akashgit.github.io;;https://benrhodes26.github.io/;https://xuk.ai;", "dblp": "24/9528;119/3428;228/9176;;", "google_scholar": "https://scholar.google.co.uk/citations?user=2h6SZeEAAAAJ;B6tpjKkAAAAJ;DNKDL-oAAAAJ;https://scholar.google.ca/citations?user=kf3C60wAAAAJ;", "orcid": ";;;;", "linkedin": "https://uk.linkedin.com/in/akash-srivastava-aa97361b;;;;", "or_profile": "~Akash_Srivastava1;~Seungwook_Han1;~Benjamin_Rhodes1;~Kai_Xu4;~Michael_U._Gutmann1", "aff": "MIT-IBM Watson AI Research Lab;MIT-IBM Watson AI Lab;University of Edinburgh;Hazy;", "aff_domain": "ibm.com;ibm.com;ed.ac.uk;hazy.com;", "position": "Research Scientist;Researcher;PhD student;Researcher;", "bibtex": "@misc{\nsrivastava2022scaling,\ntitle={Scaling Densities For Improved Density Ratio Estimation},\nauthor={Akash Srivastava and Seungwook Han and Benjamin Rhodes and Kai Xu and Michael U. Gutmann},\nyear={2022},\nurl={https://openreview.net/forum?id=vdbidlOkeF0}\n}", "github": "", "project": "", "reviewers": "RSpk;rhLq;Cfja;u7f6", "site": "https://openreview.net/forum?id=vdbidlOkeF0", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;4", "correctness": "2;3;4;3", "technical_novelty": "2;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "53;72;64;148", "wc_summary_review": "26;55;51;67", "wc_main_review": "483;210;207;304", "wc_review": "562;337;322;519", "wc_reply_reviewers": "0;0;0;39", "wc_reply_authors": "1386;464;627;678", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 84.25, 37.41907935799597 ], "wc_summary_review_avg": [ 49.75, 14.922717580923388 ], "wc_main_review_avg": [ 301.0, 112.08255885729947 ], "wc_review_avg": [ 435.0, 106.72160043777454 ], "wc_reply_reviewers_avg": [ 9.75, 16.887495373796554 ], "wc_reply_authors_avg": [ 788.75, 353.76501734908726 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eGoJnoNDNewJ:scholar.google.com/&scioq=Scaling+Densities+For+Improved+Density+Ratio+Estimation&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;University of Edinburgh;Hazy", "aff_unique_dep": "MIT-IBM Watson AI Research Lab;;", "aff_unique_url": "https://www.mitibmwatsonailab.org;https://www.ed.ac.uk;", "aff_unique_abbr": "MIT-IBM AI Lab;Edinburgh;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;United Kingdom;" }, { "title": "Superclass-Conditional Gaussian Mixture Model For Learning Fine-Grained Embeddings", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6114", "id": "vds4SNooOe", "poster": "", "openreview": "https://openreview.net/forum?id=vds4SNooOe", "slides": "https://iclr.cc/virtual/2022/poster/6114", "video": "https://iclr.cc/virtual/2022/poster/6114", "author_site": "Jingchao Ni, 
Wei Cheng, Zhengzhang Chen, Takayoshi Asakura, Tomoya Soma, Sho Kato, Haifeng Chen", "tldr": "", "abstract": "Learning fine-grained embeddings is essential for extending the generalizability of models pre-trained on \"coarse\" labels (e.g., animals). It is crucial to fields for which fine-grained labeling (e.g., breeds of animals) is expensive, but fine-grained prediction is desirable, such as medicine. The dilemma necessitates adaptation of a \"coarsely\" pre-trained model to new tasks with a few \"finer-grained\" training labels. However, coarsely supervised pre-training tends to suppress intra-class variation, which is vital for cross-granularity adaptation. In this paper, we develop a training framework underlain by a novel superclass-conditional Gaussian mixture model (SCGM). SCGM imitates the generative process of samples from hierarchies of classes through latent variable modeling of the fine-grained subclasses. The framework is agnostic to the encoders and only adds a few distribution related parameters, thus is efficient, and flexible to different domains. The model parameters are learned end-to-end by maximum-likelihood estimation via a principled Expectation-Maximization algorithm. Extensive experiments on benchmark datasets and a real-life medical dataset indicate the effectiveness of our method.", "keywords": "Deep learning;represenation learning;generative model", "primary_area": "", "supplementary_material": "/attachment/7a9cd85f955393d2fa93666f4430918796480e9e.zip", "author": "Jingchao Ni;Wei Cheng;Zhengzhang Chen;Takayoshi Asakura;Tomoya Soma;Sho Kato;Haifeng Chen", "authorids": "~Jingchao_Ni1;~Wei_Cheng1;zchen@nec-labs.com;takayoshi.asakura@nec.com;tomoya-s@nec.com;kato@renascience.co.jp;~Haifeng_Chen1", "gender": "M;M;;;;;", "homepage": ";https://chengw07.github.io/;;;;;https://haifengchen.gitlab.io/intro/", "dblp": "151/3208;89/2506-2.html;;;;;08/57-1.html", "google_scholar": "rH9MTZMAAAAJ;PRrGVmoAAAAJ;;;;;QzakB68AAAAJ", "orcid": ";;;;;;", "linkedin": "jingchao-ni-930a3871/;wei-cheng-ml/;;;;;", "or_profile": "~Jingchao_Ni1;~Wei_Cheng1;zchen@nec-labs.com;takayoshi.asakura@nec.com;tomoya-s@nec.com;kato@renascience.co.jp;~Haifeng_Chen1", "aff": "NEC-Labs;NEC-Labs;;;;;NEC-Labs", "aff_domain": "nec-labs.com;nec-labs.com;;;;;nec-labs.com", "position": "Researcher;Principal Researcher;;;;;Researcher", "bibtex": "@inproceedings{\nni2022superclassconditional,\ntitle={Superclass-Conditional Gaussian Mixture Model For Learning Fine-Grained Embeddings},\nauthor={Jingchao Ni and Wei Cheng and Zhengzhang Chen and Takayoshi Asakura and Tomoya Soma and Sho Kato and Haifeng Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vds4SNooOe}\n}", "github": "", "project": "", "reviewers": "w8nz;FxzL;s1rd", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "2;4;4", "correctness": "4;3;4", "technical_novelty": "3;3;3", "empirical_novelty": "3;3;4", "wc_summary_paper": "91;118;44", "wc_summary_review": "36;1219;21", "wc_main_review": "101;341;177", "wc_review": "228;1678;242", "wc_reply_reviewers": "13;140;40", "wc_reply_authors": "644;4871;855", "reply_reviewers": "1;1;1", "reply_authors": "2;13;3", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": 
[ 84.33333333333333, 30.57595278791634 ], "wc_summary_review_avg": [ 425.3333333333333, 561.2404911344948 ], "wc_main_review_avg": [ 206.33333333333334, 100.15099711047425 ], "wc_review_avg": [ 716.0, 680.2607343266747 ], "wc_reply_reviewers_avg": [ 64.33333333333333, 54.62803513052819 ], "wc_reply_authors_avg": [ 2123.3333333333335, 1944.8023607097514 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 6.0, 4.96655480858378 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.49999999999999983, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16398991451441380752&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=vds4SNooOe", "email": "nec-labs.com;nec-labs.com;;;;;nec-labs.com", "author_num": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "NEC Laboratories", "aff_unique_dep": "", "aff_unique_url": "https://www.nec-labs.com", "aff_unique_abbr": "NEC-Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Towards Continual Knowledge Learning of Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6187", "id": "vfsRB5MImo9", "poster": "", "openreview": "https://openreview.net/forum?id=vfsRB5MImo9", "slides": "https://iclr.cc/virtual/2022/poster/6187", "video": "https://iclr.cc/virtual/2022/poster/6187", "author_site": "Joel Jang, Seonghyeon Ye, Sohee Yang, Joongbo Shin, Janghoon Han, GyeongHun kim, Stanley Jungkyu Choi, Minjoon Seo", "tldr": "", "abstract": "Large Language Models (LMs) are known to encode world knowledge in their parameters as they pretrain on a vast amount of web corpus, which is often utilized for performing knowledge-dependent downstream tasks such as question answering, fact-checking, and open dialogue. In real-world scenarios, the world knowledge stored in the LMs can quickly become outdated as the world changes, but it is non-trivial to avoid catastrophic forgetting and reliably acquire new knowledge while preserving invariant knowledge. To push the community towards better maintenance of ever-changing LMs, we formulate a new continual learning (CL) problem called Continual Knowledge Learning (CKL). We construct a new benchmark and metric to quantify the retention of time-invariant world knowledge, the update of outdated knowledge, and the acquisition of new knowledge. We adopt applicable recent methods from literature to create several strong baselines. Through extensive experiments, we find that CKL exhibits unique challenges that are not addressed in previous CL setups, where parameter expansion is necessary to reliably retain and learn knowledge simultaneously. 
By highlighting the critical causes of knowledge forgetting, we show that CKL is a challenging and important problem that helps us better understand and train ever-changing LMs.", "keywords": "continual learning;knowledge acquisition;catastrophic forgetting;large language models;pretraining;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Joel Jang;Seonghyeon Ye;Sohee Yang;Joongbo Shin;Janghoon Han;Gyeonghun KIM;Stanley Jungkyu Choi;Minjoon Seo", "authorids": "~Joel_Jang1;~Seonghyeon_Ye1;~Sohee_Yang1;~Joongbo_Shin1;~Janghoon_Han1;~Gyeonghun_KIM1;~Stanley_Jungkyu_Choi2;~Minjoon_Seo1", "gender": "M;M;F;M;M;M;M;M", "homepage": "https://joeljang.github.io/;https://vano1205.github.io/;https://soheeyang.github.io;https://joongbo.github.io/;https://hanjanghoon.github.io/resume/;;https://sites.google.com/view/stanleyjkchoi/;https://seominjoon.github.io", "dblp": ";301/8927;236/5847;207/7602;https://dblp.org/rec/conf/naacl/HanHKKS21;;218/6600.html;149/1367", "google_scholar": "xL-7eFEAAAAJ;https://scholar.google.co.kr/citations?user=JfGGjBoAAAAJ;jh547hEAAAAJ;xzJSvJcAAAAJ;https://scholar.google.co.kr/citations?hl=ko;;https://scholar.google.co.kr/citations?user=-oztA_QAAAAJ;zYze5fIAAAAJ", "orcid": ";;;;;;;", "linkedin": "joel-jang-1289331a5/;;;;;https://kr.linkedin.com/in/%EA%B2%BD%ED%9B%88-%EA%B9%80-12aa48101;https://kr.linkedin.com/in/stanley-jungkyu-choi-19656256;minjoon-seo/", "or_profile": "~Joel_Jang1;~Seonghyeon_Ye1;~Sohee_Yang1;~Joongbo_Shin1;~Janghoon_Han1;~Gyeonghun_KIM1;~Stanley_Jungkyu_Choi2;~Minjoon_Seo1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;LG AI Research;LG AI Research;LG AI Research.;Language Lab, LG AI Research;Meta", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;fb.com", "position": "MS student;MS student;MS student;Researcher;Researcher;Researcher;Lab Leader;Research Scientist", "bibtex": "@inproceedings{\njang2022towards,\ntitle={Towards Continual Knowledge Learning of Language Models},\nauthor={Joel Jang and Seonghyeon Ye and Sohee Yang and Joongbo Shin and Janghoon Han and Gyeonghun KIM and Stanley Jungkyu Choi and Minjoon Seo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vfsRB5MImo9}\n}", "github": "", "project": "", "reviewers": "U9Hk;Hkbx;ELX1;Rjyt", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "5;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "92;66;129;83", "wc_summary_review": "57;60;22;70", "wc_main_review": "380;327;133;338", "wc_review": "529;453;284;491", "wc_reply_reviewers": "0;0;19;0", "wc_reply_authors": "980;1043;383;245", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 92.5, 23.04886114323222 ], "wc_summary_review_avg": [ 52.25, 18.115946014492316 ], "wc_main_review_avg": [ 294.5, 95.31657778162202 ], "wc_review_avg": [ 439.25, 93.57450240316537 ], "wc_reply_reviewers_avg": [ 4.75, 8.227241335952167 ], "wc_reply_authors_avg": [ 662.75, 352.85009210711564 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 
1.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.8892972917998875, "corr_recommendation_correctness": 0.0, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16520354933313015755&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=vfsRB5MImo9", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;fb.com", "author_num": 8, "aff_unique_index": "0;0;0;1;1;1;1;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;LG;Meta", "aff_unique_dep": ";LG AI Research;Meta Platforms, Inc.", "aff_unique_url": "https://www.kaist.ac.kr;https://www.lgaires.com;https://meta.com", "aff_unique_abbr": "KAIST;LG AI;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;1", "aff_country_unique": "South Korea;United States" }, { "title": "Value Function Spaces: Skill-Centric State Abstractions for Long-Horizon Reasoning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6739", "id": "vgqS1vkkCbE", "poster": "", "openreview": "https://openreview.net/forum?id=vgqS1vkkCbE", "slides": "https://iclr.cc/virtual/2022/poster/6739", "video": "https://iclr.cc/virtual/2022/poster/6739", "author_site": "Dhruv Shah, Peng Xu, Yao Lu, Ted Xiao, Alexander Toshev, Sergey Levine, brian ichter", "tldr": "", "abstract": "Reinforcement learning can train policies that effectively perform complex tasks. However, for long-horizon tasks, the performance of these methods degrades with horizon, often necessitating reasoning over and chaining lower-level skills. Hierarchical reinforcement learning aims to enable this by providing a bank of low-level skills as action abstractions. Hierarchies can further improve on this by abstracting the state space as well. We posit that a suitable state abstraction should depend on the capabilities of the available lower-level policies. We propose Value Function Spaces: a simple approach that produces such a representation by using the value functions corresponding to each lower-level skill. These value functions capture the affordances of the scene, thus forming a representation that compactly abstracts task-relevant information and robustly ignores distractors.
Empirical evaluations for maze-solving and robotic manipulation tasks demonstrate that our approach improves long-horizon performance and enables better zero-shot generalization than alternative model-free and model-based methods.", "keywords": "hierarchical reinforcement learning;planning;representation learning;robotics", "primary_area": "", "supplementary_material": "", "author": "Dhruv Shah;Peng Xu;Yao Lu;Ted Xiao;Alexander T Toshev;Sergey Levine;brian ichter", "authorids": "~Dhruv_Shah1;~Peng_Xu9;~Yao_Lu13;~Ted_Xiao1;~Alexander_T_Toshev1;~Sergey_Levine1;~brian_ichter1", "gender": "M;M;;M;M;;M", "homepage": "http://cs.berkeley.edu/~shah;;;https://www.tedxiao.me;https://people.eecs.berkeley.edu/~svlevine/;;http://alex.toshev.tech", "dblp": ";;26/5662-6;198/0598;80/7594;;96/2687", "google_scholar": ";460NWeQAAAAJ;OI7zFmwAAAAJ;;8R35rCwAAAAJ;-w5DuHgAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;", "linkedin": ";;;;;;alexander-toshev-9270726/", "or_profile": "~Dhruv_Shah1;~Peng_Xu9;~Yao_Lu13;~Ted_Xiao1;~Sergey_Levine1;~brian_ichter1;~Alexander_Toshev1", "aff": "UC Berkeley;Google;Google;;Google;Google;Research, Google", "aff_domain": "berkeley.edu;google.com;google.com;;google.com;google.com;research.google.com", "position": "PhD student;Researcher;Researcher;;Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\nshah2022value,\ntitle={Value Function Spaces: Skill-Centric State Abstractions for Long-Horizon Reasoning},\nauthor={Dhruv Shah and Alexander T Toshev and Sergey Levine and brian ichter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vgqS1vkkCbE}\n}", "github": "", "project": "", "reviewers": "8FMV;78KW;WuH2;aM2S", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;3;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "86;148;137;75", "wc_summary_review": "76;111;84;62", "wc_main_review": "402;607;345;514", "wc_review": "564;866;566;651", "wc_reply_reviewers": "243;0;0;0", "wc_reply_authors": "1301;703;231;1409", "reply_reviewers": "2;0;0;0", "reply_authors": "3;3;2;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 111.5, 31.48412298286233 ], "wc_summary_review_avg": [ 83.25, 17.851820635442202 ], "wc_main_review_avg": [ 467.0, 101.14099070110002 ], "wc_review_avg": [ 661.75, 123.04140563241303 ], "wc_reply_reviewers_avg": [ 60.75, 105.2220865598093 ], "wc_reply_authors_avg": [ 911.0, 475.8592228800446 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=162761348614547589&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=vgqS1vkkCbE", "email": "berkeley.edu;google.com;google.com;;google.com;google.com;research.google.com", "author_num": 7, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;1;1;1;1;1", "aff_campus_unique": 
"Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6230", "id": "vh-0sUt8HlG", "poster": "", "openreview": "https://openreview.net/forum?id=vh-0sUt8HlG", "slides": "https://iclr.cc/virtual/2022/poster/6230", "video": "https://iclr.cc/virtual/2022/poster/6230", "author_site": "Sachin Mehta, Mohammad Rastegari", "tldr": "", "abstract": "Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision trans-formers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters. 
\n\nOur source code is open-source and available at: https://github.com/apple/ml-cvnets", "keywords": "Vision transformer;Mobile;Edge Devices;Transformer;CNN;Efficient Network;Detection;Segmentation;ImageNet", "primary_area": "", "supplementary_material": "", "author": "Sachin Mehta;Mohammad Rastegari", "authorids": "~Sachin_Mehta1;~Mohammad_Rastegari2", "gender": "M;M", "homepage": "https://sacmehta.github.io/;https://mrastegari.github.io/", "dblp": "34/11140;31/5228", "google_scholar": "https://scholar.google.co.in/citations?user=cnRJ0GUAAAAJ;N4-2Z_cAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Sachin_Mehta1;~Mohammad_Rastegari2", "aff": "Apple;Department of Computer Science, University of Washington", "aff_domain": "apple.com;cs.washington.edu", "position": "Researcher;Assistant Professor", "bibtex": "@inproceedings{\nmehta2022mobilevit,\ntitle={MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer},\nauthor={Sachin Mehta and Mohammad Rastegari},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vh-0sUt8HlG}\n}", "github": "", "project": "", "reviewers": "XdK7;cGXj;GY8P", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;0", "wc_summary_paper": "70;54;54", "wc_summary_review": "130;82;24", "wc_main_review": "241;296;457", "wc_review": "441;432;535", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], "wc_summary_paper_avg": [ 59.333333333333336, 7.542472332656507 ], "wc_summary_review_avg": [ 78.66666666666667, 43.33846123505335 ], "wc_main_review_avg": [ 331.3333333333333, 91.65272621271133 ], "wc_review_avg": [ 469.3333333333333, 46.57848812011351 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1975, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5434557493125510443&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=vh-0sUt8HlG", "email": "apple.com;cs.washington.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Apple;University of Washington", "aff_unique_dep": "Apple Inc.;Department of Computer Science", "aff_unique_url": "https://www.apple.com;https://www.washington.edu", "aff_unique_abbr": "Apple;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "vi9nRayoeaS", "title": "ASAP DML: Deep Metric Learning with Alternating Sets of Alternating Proxies", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep metric learning (DML) aims to minimize empirical expected loss of the pairwise intra-/inter- class proximity violations in the embedding image. We relate DML to feasibility problem of finite chance constraints. 
We show that minimizer of proxy-based DML satisfies certain chance constraints, and that the worst case generalization performance of the proxy-based methods can be characterized by the radius of the smallest ball around a class proxy to cover the entire domain of the corresponding class samples, suggesting multiple proxies per class helps performance. To provide a scalable algorithm as well as exploiting more proxies, we consider the chance constraints implied by the minimizers of proxy-based DML instances and reformulate DML as finding a feasible point in intersection of such constraints, resulting in a problem to be approximately solved by alternating projections. Simply put, we repeatedly train a regularized proxy-based loss and re-initialize the proxies with the embeddings of the deliberately selected new samples. We apply our method with the well-accepted losses and evaluate on four popular benchmark datasets for image retrieval. Outperforming state-of-the-art, the proposed approach consistently improves the performance of the applied losses.", "keywords": "Deep Metric Learning;Alternating Projections;Generalization", "primary_area": "", "supplementary_material": "", "author": "Yeti Z. G\u00fcrb\u00fcz;O\u011ful Can;A. Ayd\u0131n Alatan", "authorids": "~Yeti_Z._G\u00fcrb\u00fcz1;~O\u011ful_Can1;~A._Ayd\u0131n_Alatan1", "gender": "M;M;M", "homepage": ";http://users.metu.edu.tr/alatan;https://yetigurbuz.github.io/", "dblp": "189/2881;;https://dblp.org/pers/g/Gurbuz:Yeti_Ziya", "google_scholar": ";h6mCaBoAAAAJ;https://scholar.google.com.tr/citations?user=t3YY0FYAAAAJ", "orcid": ";;0000-0002-0244-7899", "linkedin": ";;", "or_profile": "~O\u011ful_Can1;~Aydin_Alatan2;~Yeti_Ziya_Gurbuz1", "aff": "METU;;METU", "aff_domain": "metu.edu.tr;;metu.edu.tr", "position": "PhD student;;PhD student", "bibtex": "@misc{\ng{\\\"u}rb{\\\"u}z2022asap,\ntitle={{ASAP} {DML}: Deep Metric Learning with Alternating Sets of Alternating Proxies},\nauthor={Yeti Z. G{\\\"u}rb{\\\"u}z and O{\\u{g}}ul Can and A. 
Ayd{\\i}n Alatan},\nyear={2022},\nurl={https://openreview.net/forum?id=vi9nRayoeaS}\n}", "github": "", "project": "", "reviewers": "Ni5u;WF8a;a84G", "site": "https://openreview.net/forum?id=vi9nRayoeaS", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;3", "correctness": "3;4;3", "technical_novelty": "2;4;2", "empirical_novelty": "1;4;2", "wc_summary_paper": "57;62;39", "wc_summary_review": "41;29;90", "wc_main_review": "382;198;313", "wc_review": "480;289;442", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1433;1119;791", "reply_reviewers": "0;0;0", "reply_authors": "3;2;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_summary_paper_avg": [ 52.666666666666664, 9.877021593352703 ], "wc_summary_review_avg": [ 53.333333333333336, 26.386023236217735 ], "wc_main_review_avg": [ 297.6666666666667, 75.8961278473561 ], "wc_review_avg": [ 403.6666666666667, 82.5523402992846 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1114.3333333333333, 262.1161744129669 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1BQznn4JLAQJ:scholar.google.com/&scioq=ASAP+DML:+Deep+Metric+Learning+with+Alternating+Sets+of+Alternating+Proxies&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Middle East Technical University", "aff_unique_dep": "", "aff_unique_url": "https://www.metu.edu.tr", "aff_unique_abbr": "METU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "T\u00fcrkiye" }, { "id": "viWF5cyz6i", "title": "An Efficient and Reliable Tolerance-Based Algorithm for Principal Component Analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Principal component analysis (PCA) is an important method for dimensionality reduction in data science and machine learning. But, it is expensive for large matrices when only a few principal components are needed. Existing fast PCA algorithms typically assume the user will supply the number of components needed, but in practice, they may not know this number beforehand. Thus, it is important to have fast PCA algorithms depending on a tolerance. For $m\\times n$ matrices where a few principal components explain most of the variance in the data, we develop one such algorithm that runs in $O(mnl)$ time, where $l\\ll \\min(m,n)$ is a small multiple of the number of principal components. 
We provide approximation error bounds that are within a constant factor of optimal and demonstrate the algorithm's utility with data from a variety of applications.", "keywords": "principal component analysis;dimensionality reduction;data compression", "primary_area": "", "supplementary_material": "", "author": "Michael Yeh;Ming Gu", "authorids": "~Michael_Yeh1;~Ming_Gu1", "gender": ";M", "homepage": ";http://math.berkeley.edu/~mgu", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": ";", "or_profile": "~Michael_Yeh1;~Ming_Gu1", "aff": "University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nyeh2022an,\ntitle={An Efficient and Reliable Tolerance-Based Algorithm for Principal Component Analysis},\nauthor={Michael Yeh and Ming Gu},\nyear={2022},\nurl={https://openreview.net/forum?id=viWF5cyz6i}\n}", "github": "", "project": "", "reviewers": "oomW;ps4E;ELuj", "site": "https://openreview.net/forum?id=viWF5cyz6i", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;3;3", "correctness": "3;4;2", "technical_novelty": "2;2;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "32;50;61", "wc_summary_review": "28;13;54", "wc_main_review": "171;136;438", "wc_review": "231;199;553", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 47.666666666666664, 11.953614051360738 ], "wc_summary_review_avg": [ 31.666666666666668, 16.937794687883336 ], "wc_main_review_avg": [ 248.33333333333334, 134.87360338067967 ], "wc_review_avg": [ 327.6666666666667, 159.86939113608403 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9214614842883318401&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "vjaGQ4cftD", "title": "Referring Self-supervised Learning on 3D Point Cloud", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "After observing a type of object, we humans could easily recognize similar objects on an unseen scene. However, such generalization ability for the neural network remains not fully explored in current research. In this paper, we study a new problem named Referring Self-supervised Learning (RSL) on 3D scene understanding: Given the 3D synthetic models with labels and the unlabeled 3D real scene scans, our goal is to distinguish the identical semantic objects on an unseen scene according to the referring synthetic 3D models.
Unlike current tasks, the purpose of RSL is to study how to transfer the neural network's knowledge from the 3D models to unseen 3D scenes, where the main challenge is solving the cross-scene -domain and -task gap between the referring synthetic model and real unseen scene. To this end, we propose a simple yet effective self-supervised framework to perform two alignment operations. First, physical alignment aims to make the referring models match the scene with data processing techniques, and then convex-hull regularized feature alignment introduces learnable prototypes to project the point features of referring models to a convex hull space, where the feature acts as a convex combination of the learned prototypes (for both referring model and real scene) and this regularization eases the alignment. Experiments show that our method achieves the average mAP of 55.32% on the ScanNet dataset by referring only to the synthetic models from the ModelNet dataset. Furthermore, it can be regarded as a pretext task to improve the performance of the downstream tasks in 3D scene understanding.", "keywords": "3D point cloud;self-supervised learning", "primary_area": "", "supplementary_material": "/attachment/528d7bea3d7ad6efb35899af17910aad141ee471.zip", "author": "Runnan Chen;Xinge ZHU;Nenglun Chen;Dawei Wang;Wei Li;Yuexin Ma;Ruigang Yang;Wenping Wang", "authorids": "~Runnan_Chen1;~Xinge_ZHU2;~Nenglun_Chen1;dawei@connect.hku.hk;~Wei_Li28;mayuexin@shanghaitech.edu.cn;ruigang.yang@inceptio.ai;~Wenping_Wang1", "gender": "M;;M;;M;;;M", "homepage": "https://scholar.google.com.hk/citations?hl=en&user=Uq2DuzkAAAAJ&view_op=list_works&sortby=pubdate;;https://scholar.google.com/citations?user=UhjTC7AAAAAJ;;;;;https://engineering.tamu.edu/cse/profiles/Wang-Wenping.html", "dblp": "232/1849;;230/7699.html;;64/6025-111;;;", "google_scholar": "https://scholar.google.com.hk/citations?hl=en;;UhjTC7AAAAAJ;;i8jP6q8AAAAJ;;;28shvv0AAAAJ", "orcid": ";;;;0000-0002-0059-3745;;;0000-0002-2284-3952", "linkedin": ";;;;;;;", "or_profile": "~Runnan_Chen1;~Xinge_ZHU2;~Nenglun_Chen1;dawei@connect.hku.hk;~Wei_Li28;mayuexin@shanghaitech.edu.cn;ruigang.yang@inceptio.ai;~Wenping_Wang1", "aff": "Inceptio;;The University of Hong Kong;;Inceptio;;;Texas A&M University - College Station", "aff_domain": "inceptio.ai;;hku.hk;;inceptio.ai;;;tamu.edu", "position": "Intern;;PhD student;;Researcher;;;Full Professor", "bibtex": "@misc{\nchen2022referring,\ntitle={Referring Self-supervised Learning on 3D Point Cloud},\nauthor={Runnan Chen and Xinge ZHU and Nenglun Chen and Dawei Wang and Wei Li and Yuexin Ma and Ruigang Yang and Wenping Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=vjaGQ4cftD}\n}", "github": "", "project": "", "reviewers": "Xwo4;387M;gUNk;DL8r;Xn1t", "site": "https://openreview.net/forum?id=vjaGQ4cftD", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "3;3;3;4;4", "correctness": "2;3;2;2;3", "technical_novelty": "2;2;3;2;3", "empirical_novelty": "1;2;2;3;0", "wc_summary_paper": "116;198;110;103;73", "wc_summary_review": "71;91;52;102;114", "wc_main_review": "377;762;520;585;461", "wc_review": "564;1051;682;790;648", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.6, 1.019803902718557 ], 
"wc_summary_paper_avg": [ 120.0, 41.708512320628266 ], "wc_summary_review_avg": [ 86.0, 22.117866081518805 ], "wc_main_review_avg": [ 541.0, 130.01076878474336 ], "wc_review_avg": [ 747.0, 168.39239887833418 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.408248290463863, "corr_recommendation_correctness": 0.408248290463863, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:g4KVEYV--zYJ:scholar.google.com/&scioq=Referring+Self-supervised+Learning+on+3D+Point+Cloud&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Inceptio;University of Hong Kong;Texas A&M University", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.hku.hk;https://www.tamu.edu", "aff_unique_abbr": ";HKU;TAMU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;College Station", "aff_country_unique_index": "1;2", "aff_country_unique": ";China;United States" }, { "id": "vkZtFD0zga8", "title": "Uncertainty-Aware Deep Video Compression with Ensembles", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning-based video compression is a challenging task and many previous state-of-the-art learning-based video codecs use optical flows to exploit the temporal correlation between successive frames and then compress the residual error. Although these two-stage models are end-to-end optimized, errors in the intermediate errors are propagated to later stages and would harm the overall performance. In this work, we investigate the inherent uncertainty in these intermediate predictions and present an ensemble-based video compression model to capture the predictive uncertainty. We also propose an ensemble-aware loss to encourage the diversity between ensemble members and investigate the benefit of incorporating adversarial training in the video compression task. 
Experimental results on 1080p sequences show that our model can effectively save bits by more than 20% compared to DVC Pro.", "keywords": "Video compression;uncertainty;ensemble learning", "primary_area": "", "supplementary_material": "", "author": "Wufei Ma;Jiahao Li;Bin Li;Yan Lu", "authorids": "~Wufei_Ma1;~Jiahao_Li3;~Bin_Li10;~Yan_Lu7", "gender": "M;;M;M", "homepage": "https://wufeim.github.io;;;https://www.microsoft.com/en-us/research/people/yanlu/", "dblp": "243/2814;35/4180;89/6764-12;15/4830-1", "google_scholar": "mYkvHdIAAAAJ;AcOcw0AAAAAJ;rxZn4g0AAAAJ;djk5l-4AAAAJ", "orcid": ";;;0000-0001-5383-6424", "linkedin": "wufei-ma-256352133/;;;", "or_profile": "~Wufei_Ma1;~Jiahao_Li3;~Bin_Li10;~Yan_Lu7", "aff": "Purdue University;Microsoft Research Asia;Microsoft;Microsoft Research Asia", "aff_domain": "purdue.edu;microsoft.com;microsoft.com;microsoft.com", "position": "PhD student;Researcher;Researcher;Partner Research Manager", "bibtex": "@misc{\nma2022uncertaintyaware,\ntitle={Uncertainty-Aware Deep Video Compression with Ensembles},\nauthor={Wufei Ma and Jiahao Li and Bin Li and Yan Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=vkZtFD0zga8}\n}", "github": "", "project": "", "reviewers": "Fyqz;3SEn;BZ5c;ZZDv", "site": "https://openreview.net/forum?id=vkZtFD0zga8", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "36;56;77;79", "wc_summary_review": "36;28;39;67", "wc_main_review": "182;142;205;258", "wc_review": "254;226;321;404", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "728;1032;1180;1406", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;2;3", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 62.0, 17.507141400011598 ], "wc_summary_review_avg": [ 42.5, 14.705441169852742 ], "wc_main_review_avg": [ 196.75, 41.936708263763386 ], "wc_review_avg": [ 301.25, 68.63444834775026 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1086.5, 246.12750760530608 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7212372096125108703&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Purdue University;Microsoft", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.purdue.edu;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Purdue;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;China" }, { "title": "EXACT: Scalable Graph Neural Networks Training via Extreme Activation Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6021", "id": "vkaMaq95_rX", "poster": "", "openreview": "https://openreview.net/forum?id=vkaMaq95_rX", "slides": "https://iclr.cc/virtual/2022/poster/6021", "video": "https://iclr.cc/virtual/2022/poster/6021", "author_site": "Zirui Liu, Kaixiong Zhou, Fan Yang, Li Li, Rui Chen, Xia Hu", "tldr": "", "abstract": "Training Graph Neural 
Networks (GNNs) on large graphs is a fundamental challenge due to the high memory usage, which is mainly occupied by activations (e.g., node embeddings). Previous works usually focus on reducing the number of nodes retained in memory.\nIn parallel, unlike what has been developed for other types of neural networks, training with compressed activation maps is less explored for GNNs. This extension is notoriously difficult to implement due to the lack of necessary tools in common graph learning packages. To unleash the potential of this direction, we provide an optimized GPU implementation that supports training GNNs with compressed activations. Based on the implementation, we propose a memory-efficient framework called ``EXACT'', which for the first time demonstrates the potential and evaluates the feasibility of training GNNs with compressed activations. We systematically analyze the trade-off among memory saving, time overhead, and accuracy drop. In practice, EXACT can reduce the memory footprint of activations by up to $32\\times$ with $0.2$-$0.5\\%$ accuracy drop and $10$-$25\\%$ time overhead across different models and datasets. We implement EXACT as an extension for Pytorch Geometric and Pytorch. In practice, for Pytorch Geometric, EXACT can trim down the hardware requirement of training a three-layer full-batch GraphSAGE on \\textit{ogbn-products} from a 48GB GPU to a 12GB GPU.", "keywords": "graph neural networks;scalable GNN training;quantization;random projection", "primary_area": "", "supplementary_material": "/attachment/a8524b9b7ac5abbb8b31acf8cfa85ec2eed5365d.zip", "author": "Zirui Liu;Kaixiong Zhou;Fan Yang;Li Li;Rui Chen;Xia Hu", "authorids": "~Zirui_Liu1;~Kaixiong_Zhou1;~Fan_Yang27;~Li_Li11;~Rui_Chen4;~Xia_Hu4", "gender": "M;M;M;M;;", "homepage": "https://zirui-ray-liu.github.io/;https://kaixiong-zhou.github.io/;https://yangfan.sites.wfu.edu/;;;", "dblp": "196/8629-1.html;178/7315;;53/2189-35;;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;zMspIjIAAAAJ;RXFeW-8AAAAJ;FPcI7HkAAAAJ;;", "orcid": ";0000-0001-5226-8736;0000-0003-3442-754X;0000-0002-3365-8904;;", "linkedin": ";;;li-li-b8a08664/;;", "or_profile": "~Zirui_Liu1;~Kaixiong_Zhou1;~Fan_Yang27;~Li_Li11;~Rui_Chen4;~Xia_Hu4", "aff": "Rice University;Rice University;Rice University;Samsung;;", "aff_domain": "rice.edu;rice.edu;rice.edu;samsung.com;;", "position": "PhD student;PhD student;PhD student;Researcher;;", "bibtex": "@inproceedings{\nliu2022exact,\ntitle={{EXACT}: Scalable Graph Neural Networks Training via Extreme Activation Compression},\nauthor={Zirui Liu and Kaixiong Zhou and Fan Yang and Li Li and Rui Chen and Xia Hu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vkaMaq95_rX}\n}", "github": "", "project": "", "reviewers": "2GtZ;eNSi;FTNh", "pdf_size": 0, "recommendation": "3;6;8", "confidence": "5;3;3", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "77;64;58", "wc_summary_review": "93;44;16", "wc_main_review": "472;351;99", "wc_review": "642;459;173", "wc_reply_reviewers": "878;83;0", "wc_reply_authors": "6262;1431;656", "reply_reviewers": "3;1;0", "reply_authors": "13;3;1", "recommendation_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [
2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 66.33333333333333, 7.93025150224688 ], "wc_summary_review_avg": [ 51.0, 31.822423959633664 ], "wc_main_review_avg": [ 307.3333333333333, 155.37553074907115 ], "wc_review_avg": [ 424.6666666666667, 193.0014392577308 ], "wc_reply_reviewers_avg": [ 320.3333333333333, 395.78304943772196 ], "wc_reply_authors_avg": [ 2783.0, 2480.2872145513043 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 5.666666666666667, 5.2493385826745405 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9176629354822472, "corr_recommendation_correctness": 0.8029550685469661, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15989763488694802334&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=vkaMaq95_rX", "email": "rice.edu;rice.edu;rice.edu;samsung.com;;", "author_num": 6, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Rice University;Samsung", "aff_unique_dep": ";Samsung", "aff_unique_url": "https://www.rice.edu;https://www.samsung.com", "aff_unique_abbr": "Rice;Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;South Korea" }, { "id": "vnENCLwVBET", "title": "OUMG: Objective and Universal Metric for Text Generation with Guiding Ability", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing evaluation metrics for text generation rely on comparing candidate sentences to reference sentences. Some text generation tasks, such as story generation and poetry generation, have no fixed optimal answer and cannot match a corresponding reference for each sentence. Therefore, there is a lack of an objective and universal evaluation metric. To this end, we propose OUMG, a general metric that does not depend on reference standards. We train a discriminator to distinguish between human-generated and machine-generated text, which is used to score the sentences generated by the model. These scores reflect how similar the sentences are to human-generated texts. The capability of the discriminator can be measured by its accuracy, so it avoids the subjectivity of human judgments. Furthermore, the trained discriminator can also guide the text generation process to improve model performance. Experiments on poetry generation demonstrate that OUMG can objectively evaluate text generation models without reference standards. 
After combining the discriminator with the generation model, the original model can produce significantly higher-quality results.", "keywords": "evaluation metric;text generation;objective", "primary_area": "", "supplementary_material": "", "author": "Hanxu Liu;Nianmin Yao", "authorids": "~Hanxu_Liu1;~Nianmin_Yao1", "gender": "M;", "homepage": ";https://github.com/liu-hanxv", "dblp": "18/1894;", "google_scholar": "https://scholar.google.com.hk/citations?user=ztMJF3gAAAAJ;", "orcid": "0000-0001-9705-6649;", "linkedin": ";", "or_profile": "~Nianmin_Yao1;~Liu_Hanxu1", "aff": "Dalian University of Technology;Dalian University of Technology", "aff_domain": "dlut.edu.cn;dlut.edu.cn", "position": "Full Professor;MS student", "bibtex": "@misc{\nliu2022oumg,\ntitle={{OUMG}: Objective and Universal Metric for Text Generation with Guiding Ability},\nauthor={Hanxu Liu and Nianmin Yao},\nyear={2022},\nurl={https://openreview.net/forum?id=vnENCLwVBET}\n}", "github": "", "project": "", "reviewers": "27Bu;BR3n;pGB6;Nb33;xKWL", "site": "https://openreview.net/forum?id=vnENCLwVBET", "pdf_size": 0, "recommendation": "1;1;3;3;3", "confidence": "4;4;5;4;3", "correctness": "1;1;1;2;2", "technical_novelty": "1;1;1;2;2", "empirical_novelty": "1;1;1;2;2", "wc_summary_paper": "19;48;113;97;310", "wc_summary_review": "32;39;92;35;52", "wc_main_review": "235;361;678;464;505", "wc_review": "286;448;883;596;867", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "98;98;98;98;98", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 2.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 1.4, 0.4898979485566356 ], "technical_novelty_avg": [ 1.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 117.4, 102.00901920908758 ], "wc_summary_review_avg": [ 50.0, 22.081666603768838 ], "wc_main_review_avg": [ 448.6, 147.87913984061444 ], "wc_review_avg": [ 616.0, 233.15831531386567 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 98.0, 0.0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iziVf2uD2mkJ:scholar.google.com/&scioq=OUMG:+Objective+and+Universal+Metric+for+Text+Generation+with+Guiding+Ability&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Dalian University of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.dlut.edu.cn/", "aff_unique_abbr": "DUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "vnF5gDNvcKX", "title": "Variance Reduced Domain Randomization for Policy Gradient", "track": "main", "status": "Reject", "tldr": "", "abstract": "By introducing randomness on environment parameters that fundamentally affect the dynamics, domain randomization (DR) imposes diversity on the policy trained by deep reinforcement learning, and thus improves its generalization capability. The randomization of environments, however, introduces another source of variability for the estimate of policy gradients, in addition to the already high variance due to trajectory sampling.
Therefore, with standard state-dependent baselines, the policy gradient methods may still suffer high variance, causing low sample efficiency during the training of DR. In this paper, we theoretically derive a bias-free and state/environment-dependent optimal baseline for DR, and analytically show its ability to achieve further variance reduction over the standard constant and state-dependent baselines for DR. We further propose a variance reduced domain randomization (VRDR) approach for policy gradient methods, to strike a tradeoff between the variance reduction and computational complexity in practice. By dividing the entire space of environments into some subspaces and estimating the state/subspace-dependent baseline, VRDR enjoys a theoretical guarantee of faster convergence than the state-dependent baseline. We conduct empirical evaluations on six robot control tasks with randomized dynamics. The results demonstrate that VRDR can consistently accelerate the convergence of policy training in all tasks, and achieve even higher rewards in some specific tasks. ", "keywords": "Reinforcement learning;generalization;variance reduction", "primary_area": "", "supplementary_material": "", "author": "Yuankun Jiang;Chenglin Li;Wenrui Dai;Junni Zou;Hongkai Xiong", "authorids": "~Yuankun_Jiang1;~Chenglin_Li2;~Wenrui_Dai1;~Junni_Zou1;~Hongkai_Xiong1", "gender": "M;M;;F;M", "homepage": "http://min.sjtu.edu.cn/;https://min.sjtu.edu.cn/En/FacultyShow/4?Vid=17;;http://www.cs.sjtu.edu.cn/~zou-jn;http://min.sjtu.edu.cn", "dblp": ";;16/5135.html;91/4613;21/3569", "google_scholar": ";ltW2JMcAAAAJ;Xg8MhyAAAAAJ;https://scholar.google.com/citations?hl=zh-CN;bB16iN4AAAAJ", "orcid": ";;;;0000-0003-4552-0029", "linkedin": ";;;;", "or_profile": "~Yuankun_Jiang1;~Chenglin_Li2;~Wenrui_Dai1;~Junni_Zou1;~Hongkai_Xiong1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Associate Professor;Associate Professor;Full Professor;Full Professor", "bibtex": "@misc{\njiang2022variance,\ntitle={Variance Reduced Domain Randomization for Policy Gradient},\nauthor={Yuankun Jiang and Chenglin Li and Wenrui Dai and Junni Zou and Hongkai Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=vnF5gDNvcKX}\n}", "github": "", "project": "", "reviewers": "uerm;kFGn;LpEU;ePpB", "site": "https://openreview.net/forum?id=vnF5gDNvcKX", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;5;4;2", "correctness": "3;4;4;3", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "77;52;18;79", "wc_summary_review": "14;13;33;38", "wc_main_review": "660;359;181;190", "wc_review": "751;424;232;307", "wc_reply_reviewers": "174;85;0;206", "wc_reply_authors": "2753;1389;497;1499", "reply_reviewers": "1;3;0;1", "reply_authors": "5;4;2;4", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 56.5, 24.642443060703215 ], "wc_summary_review_avg": [ 24.5, 11.146748404803978 ], "wc_main_review_avg": [ 347.5, 193.85368193562897 ], "wc_review_avg": [ 428.5, 198.3689743886377 ], "wc_reply_reviewers_avg": [ 116.25, 80.43747571872206 ], "wc_reply_authors_avg": [ 1534.5, 803.675774177622 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 
], "reply_authors_avg": [ 3.75, 1.0897247358851685 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hejfKxTDrNkJ:scholar.google.com/&scioq=Variance+Reduced+Domain+Randomization+for+Policy+Gradient&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "vnOHGQY4FP1", "title": "Rethinking Temperature in Graph Contrastive Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Because it does not rely on scarce human-labeled information, self-supervised learning, especially contrastive learning, has attracted much attention from researchers. It has begun to show its strong advantages on both IID data (independent and identically distributed data, such as images and texts) and Non-IID data (such as nodes in graphs). Recently, researchers have begun to explore the interpretability of contrastive learning and have proposed some metrics for measuring the quality of learned representations of IID data, such as alignment, uniformity, and semantic closeness. It is important to understand the relationships among node representations, which helps in designing algorithms with stronger interpretability. However, few studies focus on evaluating good node representations in graph contrastive learning. In this paper, we investigate and discuss what a good representation should be for a general loss (InfoNCE) in graph contrastive learning. By theoretical analysis, we argue that global uniformity and local separation are both necessary for learning quality. We find that these two new metrics can be regulated by the temperature coefficient in the InfoNCE loss. Based on this characteristic, we develop a simple but effective algorithm, GLATE, to dynamically adjust the temperature value in the training phase. GLATE outperforms state-of-the-art graph contrastive learning algorithms by 2.8 and 0.9 percent on average on transductive and inductive learning tasks, respectively. 
The code is available at: https://github.com/anonymousICLR22/GLATE.", "keywords": "self-supervised learning;graph contrastive learning;uniformity", "primary_area": "", "supplementary_material": "", "author": "Ziyang Liu;Hao Feng;Chaokun Wang", "authorids": "~Ziyang_Liu3;~Hao_Feng3;~Chaokun_Wang1", "gender": "M;M;M", "homepage": ";https://www.fhao.top/;https://wangchaokun.github.io/index.html", "dblp": "85/5485-4;46/4184-7;13/1672", "google_scholar": ";LGw723sAAAAJ;-7DZAIsAAAAJ", "orcid": "0009-0007-4238-1533;0000-0002-2912-909X;0000-0002-2986-2574", "linkedin": ";;", "or_profile": "~Ziyang_Liu3;~Hao_Feng3;~Chaokun_Wang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nliu2022rethinking,\ntitle={Rethinking Temperature in Graph Contrastive Learning},\nauthor={Ziyang Liu and Hao Feng and Chaokun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=vnOHGQY4FP1}\n}", "github": "", "project": "", "reviewers": "xNkv;e7hL;eDkW;aKmA", "site": "https://openreview.net/forum?id=vnOHGQY4FP1", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;4;4;4", "correctness": "2;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "37;38;69;128", "wc_summary_review": "23;52;37;21", "wc_main_review": "305;334;227;258", "wc_review": "365;424;333;407", "wc_reply_reviewers": "0;0;109;0", "wc_reply_authors": "1498;690;1737;365", "reply_reviewers": "0;0;1;0", "reply_authors": "4;2;5;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.0, 36.952672433803755 ], "wc_summary_review_avg": [ 33.25, 12.457427503300993 ], "wc_main_review_avg": [ 281.0, 41.32190702278877 ], "wc_review_avg": [ 382.25, 35.632674611934476 ], "wc_reply_reviewers_avg": [ 27.25, 47.198384506251905 ], "wc_reply_authors_avg": [ 1072.5, 563.3544621284187 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.5811388300841898 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.8006407690254357, "corr_recommendation_correctness": 0.9198662110077999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5129968190563541923&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "voEpzgY8gsT", "title": "Additive Poisson Process: Learning Intensity of Higher-Order Interaction in Poisson Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present the Additive Poisson Process (APP), a novel framework that can model the higher-order interaction effects of the intensity functions in Poisson processes using projections into lower-dimensional space. Our model combines the techniques in information geometry to model higher-order interactions on a statistical manifold and in generalized additive models to use lower-dimensional projections to overcome the effects from the curse of dimensionality. 
Our approach solves a convex optimization problem by minimizing the KL divergence from a sample distribution in lower-dimensional projections to the distribution modeled by an intensity function in the Poisson process. Our empirical results show that our model is able to use samples observed in the lower dimensional space to estimate the higher-order intensity function with extremely sparse observations.", "keywords": "Poisson Process;Log-Linear Model;Energy-Based Model;Generalized Additive Models;Information Geometry", "primary_area": "", "supplementary_material": "/attachment/f513335aaa6aa0faaada45b6a5ad05b66622807b.zip", "author": "Simon Luo;Feng Zhou;lamiae azizi;Mahito Sugiyama", "authorids": "~Simon_Luo1;~Feng_Zhou9;~lamiae_azizi1;~Mahito_Sugiyama1", "gender": ";;;M", "homepage": ";;https://www.maths.usyd.edu.au/u/CMIG/;https://mahito.nii.ac.jp/", "dblp": "199/2628;;;05/8421", "google_scholar": ";;https://scholar.google.fr/citations?user=fThAv-cAAAAJ;qLlRvTkAAAAJ", "orcid": ";;0000-0001-9894-2618;0000-0001-5907-9831", "linkedin": ";;;", "or_profile": "~Simon_Luo1;~Feng_Zhou9;~lamiae_azizi1;~Mahito_Sugiyama1", "aff": "University of Sydney;;;National Institute of Informatics", "aff_domain": "sydney.edu.au;;;nii.ac.jp", "position": "Postdoc;;;Associate Professor", "bibtex": "@misc{\nluo2022additive,\ntitle={Additive Poisson Process: Learning Intensity of Higher-Order Interaction in Poisson Processes},\nauthor={Simon Luo and Feng Zhou and lamiae azizi and Mahito Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=voEpzgY8gsT}\n}", "github": "", "project": "", "reviewers": "wbmT;cFZz;s5rK;AyVi", "site": "https://openreview.net/forum?id=voEpzgY8gsT", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;3", "correctness": "2;3;4;2", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "19;51;75;158", "wc_summary_review": "31;22;67;49", "wc_main_review": "310;362;618;297", "wc_review": "360;435;760;504", "wc_reply_reviewers": "0;295;0;227", "wc_reply_authors": "850;1432;1014;1492", "reply_reviewers": "0;1;0;1", "reply_authors": "2;3;2;3", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 75.75, 51.47511534712671 ], "wc_summary_review_avg": [ 42.25, 17.282577932704367 ], "wc_main_review_avg": [ 396.75, 130.03340916856715 ], "wc_review_avg": [ 514.75, 150.47487331777356 ], "wc_reply_reviewers_avg": [ 130.5, 132.6960813287265 ], "wc_reply_authors_avg": [ 1197.0, 272.09740902845806 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.30151134457776363, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9347708017325503881&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Sydney;National Institute of Informatics", "aff_unique_dep": ";", "aff_unique_url": "https://www.sydney.edu.au;https://www.nii.ac.jp/", "aff_unique_abbr": "USYD;NII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Australia;Japan" }, { "id": "vpiOnyOBTzQ", "title": "Disentangled generative models for robust dynamical system prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep 
neural networks have become of increasing interest in dynamical system prediction, but out-of-distribution generalization and long-term stability still remain challenging. In this work, we treat the domain parameters of dynamical systems as factors of variation of the data generating process. By leveraging ideas from supervised disentanglement and causal factorization, we aim to separate the domain parameters from the dynamics in the latent space of generative models. In our experiments, we model dynamics both in phase space and in video sequences and conduct rigorous OOD evaluations. Results indicate that disentangled models adapt better to domain parameter spaces that were not present in the training data while, at the same time, providing better long-term predictions in video sequences.", "keywords": "disentanglement;dynamical systems;prediction;generative models;robustness;out-of-distribution;distribution shift;causal inference", "primary_area": "", "supplementary_material": "/attachment/4db027caedd1280703de26ef804c867db3e1e2b7.zip", "author": "Stathi Fotiadis;Shunlong Hu;Mario Lino Valencia;Chris D Cantwell;Anil Anthony Bharath", "authorids": "~Stathi_Fotiadis1;shunlong.hu20@imperial.ac.uk;~Mario_Lino_Valencia1;~Chris_D_Cantwell1;~Anil_Anthony_Bharath2", "gender": "M;;M;M;", "homepage": "https://www.linkedin.com/in/stathifotiadis/;;;http://www.imperial.ac.uk/people/c.cantwell;", "dblp": ";;;;", "google_scholar": "ZHZczW8AAAAJ;;;https://scholar.google.co.uk/citations?user=gBCrORQAAAAJ;", "orcid": ";;;0000-0002-2448-3540;", "linkedin": ";;mario-lino-valencia-b004ba17;chrisdcantwell/;", "or_profile": "~Stathi_Fotiadis1;shunlong.hu20@imperial.ac.uk;~Mario_Lino_Valencia1;~Chris_D_Cantwell1;~Anil_Anthony_Bharath2", "aff": "Imperial College London;;Imperial College London;Imperial College London;", "aff_domain": "imperial.ac.uk;;imperial.ac.uk;imperial.ac.uk;", "position": "PhD student;;PhD student;Associate Professor;", "bibtex": "@misc{\nfotiadis2022disentangled,\ntitle={Disentangled generative models for robust dynamical system prediction},\nauthor={Stathi Fotiadis and Shunlong Hu and Mario Lino Valencia and Chris D Cantwell and Anil Anthony Bharath},\nyear={2022},\nurl={https://openreview.net/forum?id=vpiOnyOBTzQ}\n}", "github": "", "project": "", "reviewers": "z3gj;2NUz;HYc5;yFXa;dcJ9", "site": "https://openreview.net/forum?id=vpiOnyOBTzQ", "pdf_size": 0, "recommendation": "1;3;3;5;6", "confidence": "4;3;4;3;4", "correctness": "3;3;2;3;3", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;2;2;2;2", "wc_summary_paper": "57;37;40;108;93", "wc_summary_review": "116;29;29;56;164", "wc_main_review": "1546;355;686;174;848", "wc_review": "1719;421;755;338;1105", "wc_reply_reviewers": "441;32;0;144;236", "wc_reply_authors": "2084;670;1080;443;1177", "reply_reviewers": "1;1;0;1;1", "reply_authors": "4;1;2;1;2", "recommendation_avg": [ 3.6, 1.7435595774162693 ], "confidence_avg": [ 3.6, 0.4898979485566356 ], "correctness_avg": [ 2.8, 0.39999999999999997 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 67.0, 28.58671019897183 ], "wc_summary_review_avg": [ 78.8, 53.146589730668516 ], "wc_main_review_avg": [ 721.8, 475.6344815086476 ], "wc_review_avg": [ 867.6, 504.74690687511895 ], "wc_reply_reviewers_avg": [ 170.6, 159.00641496493154 ], "wc_reply_authors_avg": [ 1090.8, 564.0409205013409 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 2.0, 1.0954451150103321 ], "replies_avg": [ 21, 0 ],
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.1873171623163388, "corr_recommendation_correctness": 0.17206180040292135, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12826637773343668107&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Mind the Gap: Domain Gap Control for Single Shot Domain Adaptation for Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6775", "id": "vqGi8Kp0wM", "poster": "", "openreview": "https://openreview.net/forum?id=vqGi8Kp0wM", "slides": "https://iclr.cc/virtual/2022/poster/6775", "video": "https://iclr.cc/virtual/2022/poster/6775", "author_site": "Peihao Zhu, Rameen Abdal, John Femiani, Peter Wonka", "tldr": "", "abstract": "We present a new method for one shot domain adaptation. The input to our method is a trained GAN that can produce images in domain A and a single reference image I_B from domain B. The proposed algorithm can translate any output of the trained GAN from domain A to domain B. There are two main advantages of our method compared to the current state of the art: First, our solution achieves higher visual quality, e.g. by noticeably reducing overfitting. Second, our solution allows for more degrees of freedom to control the domain gap, i.e. what aspects of image I_B are used to define the domain B. Technically, we realize the new method by building on a pre-trained StyleGAN generator as the GAN and a pre-trained CLIP model for representing the domain gap. We propose several new regularizers for controlling the domain gap to optimize the weights of the pre-trained StyleGAN generator to output images in domain B instead of domain A. The regularizers prevent the optimization from taking on too many attributes of the single reference image. 
Our results show significant visual improvements over the state of the art as well as multiple applications that highlight improved control.", "keywords": "GAN;StyleGAN;Clip;Domain Adaptation;Style Transfer;Single Shot", "primary_area": "", "supplementary_material": "/attachment/7622d6fc37313f193ff1062891b66ec77c6cc639.zip", "author": "Peihao Zhu;Rameen Abdal;John Femiani;Peter Wonka", "authorids": "~Peihao_Zhu1;~Rameen_Abdal1;~John_Femiani1;~Peter_Wonka1", "gender": "M;M;M;M", "homepage": ";https://rameenabdal.github.io/;http://miamioh.edu/cec/academics/departments/cse/about/faculty-and-staff/femiani-john/index.html;http://peterwonka.net", "dblp": "255/9066;239/4322;79/2994;98/5522", "google_scholar": "Gn8URq0AAAAJ;https://scholar.google.co.in/citations?user=kEQimk0AAAAJ;BRJQl5cAAAAJ;https://scholar.google.com.tw/citations?user=0EKXSXgAAAAJ", "orcid": "0000-0002-7122-1551;;0000-0002-0924-6686;0000-0003-0627-9746", "linkedin": ";;john-femiani-5593894b/;", "or_profile": "~Peihao_Zhu1;~Rameen_Abdal1;~John_Femiani1;~Peter_Wonka1", "aff": "KAUST;KAUST;Miami University;KAUST", "aff_domain": "kaust.edu.sa;kaust.edu.sa;miamioh.edu;kaust.edu.sa", "position": "PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nzhu2022mind,\ntitle={Mind the Gap: Domain Gap Control for Single Shot Domain Adaptation for Generative Adversarial Networks},\nauthor={Peihao Zhu and Rameen Abdal and John Femiani and Peter Wonka},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vqGi8Kp0wM}\n}", "github": "", "project": "", "reviewers": "d3yx;Zd5v;eZmY", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;3", "correctness": "2;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;4", "wc_summary_paper": "41;42;34", "wc_summary_review": "19;31;39", "wc_main_review": "262;226;323", "wc_review": "322;299;396", "wc_reply_reviewers": "807;0;26", "wc_reply_authors": "1910;506;696", "reply_reviewers": "3;0;1", "reply_authors": "5;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 39.0, 3.559026084010437 ], "wc_summary_review_avg": [ 29.666666666666668, 8.219218670625303 ], "wc_main_review_avg": [ 270.3333333333333, 40.03609482565562 ], "wc_review_avg": [ 339.0, 41.38437708443449 ], "wc_reply_reviewers_avg": [ 277.6666666666667, 374.4456643567335 ], "wc_reply_authors_avg": [ 1037.3333333333333, 621.9246113655756 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12451250368858284023&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=vqGi8Kp0wM", "email": "kaust.edu.sa;kaust.edu.sa;miamioh.edu;kaust.edu.sa", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Miami University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaust.edu.sa;https://www.miamioh.edu", "aff_unique_abbr": "KAUST;MU", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Saudi Arabia;United States" }, { "id": "vr39r4Rjt3z", "title": "Designing Less Forgetful Networks for Continual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks usually excel in learning a single task. Their weights are plastic and help them to learn quickly, but these weights are also known to be unstable. Hence, they may experience catastrophic forgetting and lose the ability to solve past tasks when assimilating information to solve a new task. Existing methods have mostly attempted to address this problem through external constraints. Replay shows the backbone network externally stored memories; regularisation imposes additional learning objectives; and dynamic architecture often introduces more parameters to host new knowledge. In contrast, we look for internal means to create less forgetful networks. This paper demonstrates that two simple architectural modifications -- Masked Highway Connection and Layer-Wise Normalisation -- can drastically reduce the forgetfulness in a backbone network. When naively employed to sequentially learn over multiple tasks, our modified backbones were as competitive as those unmodified backbones with continual learning techniques applied. Furthermore, our proposed architectural modifications were compatible with most if not all continual learning archetypes and therefore helped those respective techniques in achieving new state of the art.", "keywords": "Continual Learning", "primary_area": "", "supplementary_material": "", "author": "Nicholas I-Hsien Kuo;Mehrtash Harandi;Nicolas Fourrier;Gabriela Ferraro;Christian Walder;Hanna Suominen", "authorids": "~Nicholas_I-Hsien_Kuo1;~Mehrtash_Harandi2;~Nicolas_Fourrier1;~Gabriela_Ferraro1;~Christian_Walder1;~Hanna_Suominen2", "gender": "M;;;;;F", "homepage": "https://www.unsw.edu.au/staff/nic-kuo;;;;;https://comp.anu.edu.au/people/hanna-suominen/", "dblp": ";;;93/7168;;28/3067", "google_scholar": "5vXLTysAAAAJ;;;;;o4qymo4AAAAJ", "orcid": ";;;;;0000-0002-4195-1641", "linkedin": ";;;;;hanna-suominen-b476507/?originalSubdomain=au", "or_profile": "~Nicholas_I-Hsien_Kuo1;~Mehrtash_Harandi2;~Nicolas_Fourrier1;~Gabriela_Ferraro1;~Christian_Walder1;~Hanna_Suominen2", "aff": "University of New South Wales;;;;;Australian National University", "aff_domain": "unsw.edu.au;;;;;anu.edu.au", "position": "Research Fellow;;;;;Associate Professor", "bibtex": "@misc{\nkuo2022designing,\ntitle={Designing Less Forgetful Networks for Continual Learning},\nauthor={Nicholas I-Hsien Kuo and Mehrtash Harandi and Nicolas Fourrier and Gabriela Ferraro and Christian Walder and Hanna Suominen},\nyear={2022},\nurl={https://openreview.net/forum?id=vr39r4Rjt3z}\n}", "github": "", "project": "", "reviewers": "h7Ku;Homk;pQdx;tojU;zxFC", "site": "https://openreview.net/forum?id=vr39r4Rjt3z", "pdf_size": 0, "recommendation": "5;5;6;8;8", "confidence": "4;5;4;3;3", "correctness": "3;3;2;3;4", "technical_novelty": "2;3;3;3;4", "empirical_novelty": "3;3;2;3;4", "wc_summary_paper": "60;51;51;69;43", "wc_summary_review": "42;60;44;45;23", "wc_main_review": "67;269;164;303;301", "wc_review": "169;380;259;417;367", "wc_reply_reviewers": "0;0;31;24;10", "wc_reply_authors": "461;2339;1594;748;258", "reply_reviewers": "0;0;1;1;1", "reply_authors": "1;6;4;3;1", "recommendation_avg": [ 6.4, 1.3564659966250536 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], 
"technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 54.8, 8.908422980528034 ], "wc_summary_review_avg": [ 42.8, 11.7881296226331 ], "wc_main_review_avg": [ 220.8, 92.07909643344684 ], "wc_review_avg": [ 318.4, 91.35337979516684 ], "wc_reply_reviewers_avg": [ 13.0, 12.58570617804182 ], "wc_reply_authors_avg": [ 1080.0, 776.961517708567 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 3.0, 1.8973665961010275 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.9063269671749656, "corr_recommendation_correctness": 0.4662524041201569, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DNqHaxgzSHcJ:scholar.google.com/&scioq=Designing+Less+Forgetful+Networks+for+Continual+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of New South Wales;Australian National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.unsw.edu.au;https://www.anu.edu.au", "aff_unique_abbr": "UNSW;ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "id": "vr4Wo33bd1", "title": "Semi-supervised Long-tailed Recognition using Alternate Sampling", "track": "main", "status": "Reject", "tldr": "", "abstract": "Main challenges in long-tailed recognition come from the imbalanced data distribution and sample scarcity in its tail classes. While techniques have been proposed to achieve a more balanced training loss and to improve tail classes data variations with synthesized samples, we resort to leverage readily available unlabeled data to boost recognition accuracy. The idea leads to a new recognition setting, namely semi-supervised long-tailed recognition. We argue this setting better resembles the real-world data collection and annotation process and hence can help close the gap to real-world scenarios. To address the semi-supervised long-tailed recognition problem, we present an alternate sampling framework combining the intuitions from successful methods in these two research areas. The classifier and feature embedding are learned separately and updated iteratively. The class-balanced sampling strategy has been implemented to train the classifier in a way not affected by the pseudo labels' quality on the unlabeled data. A consistency loss has been introduced to limit the impact from unlabeled data while leveraging them to update the feature embedding. 
We demonstrate significant accuracy improvements over other competitive methods on two datasets.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/3152bda4495dbcab14f00a9740556484c9ef8991.zip", "author": "Bo Liu;Haoxiang Li;Hao Kang;Nuno Vasconcelos;Gang Hua", "authorids": "~Bo_Liu16;~Haoxiang_Li1;hao.kang@bianlifeng.com;~Nuno_Vasconcelos1;~Gang_Hua3", "gender": "M;M;;M;M", "homepage": "http://www.svcl.ucsd.edu/people/liubo/;https://resume.haoxiang.org;;http://www.svcl.ucsd.edu/~nuno/;http://www.ganghua.org", "dblp": ";;;78/4806;75/5209.html", "google_scholar": ";Fu6aoXAAAAAJ;;Fykyo9gAAAAJ;7SgUlggAAAAJ", "orcid": ";;;0000-0002-9024-4302;0000-0001-9522-6157", "linkedin": ";haoxiangli/;;;ganghua/", "or_profile": "~Bo_Liu16;~Haoxiang_Li1;hao.kang@bianlifeng.com;~Nuno_Vasconcelos1;~Gang_Hua3", "aff": "Wormpex AI Research;Wormpex AI Research;;University of California, San Diego;Wormpex AI Research", "aff_domain": "bianlifeng.com;wormpexai.com;;ucsd.edu;bianlifeng.com", "position": "Researcher;Principal Researcher;;Professor;Chief Scientist and Managing Director", "bibtex": "@misc{\nliu2022semisupervised,\ntitle={Semi-supervised Long-tailed Recognition using Alternate Sampling},\nauthor={Bo Liu and Haoxiang Li and Hao Kang and Nuno Vasconcelos and Gang Hua},\nyear={2022},\nurl={https://openreview.net/forum?id=vr4Wo33bd1}\n}", "github": "", "project": "", "reviewers": "Y7tQ;minL;PFkd;mLKc;6A7n", "site": "https://openreview.net/forum?id=vr4Wo33bd1", "pdf_size": 0, "recommendation": "1;5;5;5;5", "confidence": "5;3;5;5;4", "correctness": "3;3;2;3;4", "technical_novelty": "2;3;2;2;3", "empirical_novelty": "1;3;2;2;2", "wc_summary_paper": "23;21;89;55;75", "wc_summary_review": "39;43;50;31;22", "wc_main_review": "332;398;327;280;294", "wc_review": "394;462;466;366;391", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 1.6000000000000003 ], "confidence_avg": [ 4.4, 0.7999999999999999 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 52.6, 27.229395880188015 ], "wc_summary_review_avg": [ 37.0, 9.695359714832659 ], "wc_main_review_avg": [ 326.2, 40.892053017670804 ], "wc_review_avg": [ 415.8, 40.55810646467609 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.37499999999999994, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3146115032091924551&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Wormpex AI Research;University of California, San Diego", "aff_unique_dep": "AI Research;", "aff_unique_url": ";https://www.ucsd.edu", "aff_unique_abbr": "Wormpex AI;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sample Efficient Deep Reinforcement Learning via Uncertainty Estimation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6905", "id": "vrW3tvDfOJQ", "poster": "", "openreview": "https://openreview.net/forum?id=vrW3tvDfOJQ", "slides": "https://iclr.cc/virtual/2022/poster/6905", "video": 
"https://iclr.cc/virtual/2022/poster/6905", "author_site": "Vincent Mai, Kaustubh Mani, Liam Paull", "tldr": "", "abstract": "In model-free deep reinforcement learning (RL) algorithms, using noisy value estimates to supervise policy evaluation and optimization is detrimental to the sample efficiency. As this noise is heteroscedastic, its effects can be mitigated using uncertainty-based weights in the optimization process. Previous methods rely on sampled ensembles, which do not capture all aspects of uncertainty. We provide a systematic analysis of the sources of uncertainty in the noisy supervision that occurs in RL, and introduce inverse-variance RL, a Bayesian framework which combines probabilistic ensembles and Batch Inverse Variance weighting. We propose a method whereby two complementary uncertainty estimation methods account for both the Q-value and the environment stochasticity to better mitigate the negative impacts of noisy supervision. Our results show significant improvement in terms of sample efficiency on discrete and continuous control tasks.", "keywords": "Deep reinforcement learning;uncertainty estimation;inverse-variance;heteroscedastic", "primary_area": "", "supplementary_material": "/attachment/14bed71546a875f6a3d6d55814d9818b4816aa8f.zip", "author": "Vincent Mai;Kaustubh Mani;Liam Paull", "authorids": "~Vincent_Mai1;~Kaustubh_Mani1;~Liam_Paull1", "gender": "M;M;", "homepage": "https://mila.quebec/en/person/vincent-mai/;;", "dblp": "229/0382;;", "google_scholar": ";MnPjDIgAAAAJ;", "orcid": "0000-0003-2823-504X;;", "linkedin": "mai-vincent/;;", "or_profile": "~Vincent_Mai1;~Kaustubh_Mani1;~Liam_Paull1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;;", "aff_domain": "mila.umontreal.ca;;", "position": "PhD student;;", "bibtex": "@inproceedings{\nmai2022sample,\ntitle={Sample Efficient Deep Reinforcement Learning via Uncertainty Estimation},\nauthor={Vincent Mai and Kaustubh Mani and Liam Paull},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vrW3tvDfOJQ}\n}", "github": "", "project": "", "reviewers": "Ncaq;gusa;9Z1v;HGRr", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "4;4;4;5", "correctness": "3;3;4;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "3;4;2;4", "wc_summary_paper": "107;116;80;49", "wc_summary_review": "111;55;91;9", "wc_main_review": "647;917;383;116", "wc_review": "865;1088;554;174", "wc_reply_reviewers": "147;80;178;0", "wc_reply_authors": "443;796;936;113", "reply_reviewers": "1;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 88.0, 26.124700955226263 ], "wc_summary_review_avg": [ 66.5, 38.79110722833263 ], "wc_main_review_avg": [ 515.75, 298.18230581307137 ], "wc_review_avg": [ 670.25, 343.5915998682156 ], "wc_reply_reviewers_avg": [ 101.25, 68.34974396440707 ], "wc_reply_authors_avg": [ 572.0, 320.15386925664353 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 53, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=8416439116779187759&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=vrW3tvDfOJQ", "email": "mila.umontreal.ca;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Montreal", "aff_unique_dep": "Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "vruwp11pWnO", "title": "Improving and Assessing Anomaly Detectors for Large-Scale Settings", "track": "main", "status": "Reject", "tldr": "", "abstract": "Detecting out-of-distribution examples is important for safety-critical machine learning applications such as detecting novel biological phenomena and self-driving cars. However, existing research mainly focuses on simple small-scale settings. To set the stage for more realistic out-of-distribution detection, we depart from small-scale settings and explore large-scale multiclass and multi-label settings with high-resolution images and thousands of classes. To make future work in real-world settings possible, we create new benchmarks for three large-scale settings. To test ImageNet multiclass anomaly detectors, we introduce a new dataset of anomalous species. We leverage ImageNet-22K to evaluate PASCAL VOC and COCO multilabel anomaly detectors. Third, we introduce a new benchmark for anomaly segmentation by introducing a segmentation benchmark with road anomalies. We conduct extensive experiments in these more realistic settings for out-of-distribution detection and find that a surprisingly simple detector based on the maximum logit outperforms prior methods in all the large-scale multi-class, multi-label, and segmentation tasks, establishing a simple new baseline for future work.", "keywords": "anomaly;ood;distribution shift;out-of-distribution", "primary_area": "", "supplementary_material": "", "author": "Dan Hendrycks;Steven Basart;Mantas Mazeika;Andy Zou;Joseph Kwon;Mohammadreza Mostajabi;Jacob Steinhardt", "authorids": "~Dan_Hendrycks1;~Steven_Basart1;~Mantas_Mazeika3;~Andy_Zou1;joseph.kwon@yale.edu;~Mohammadreza_Mostajabi1;~Jacob_Steinhardt1", "gender": ";M;M;;;M;", "homepage": ";http://stevenbas.art;https://github.com/mmazeika;;;;", "dblp": "182/2504;245/2547;215/4447;274/2362;;119/2482;35/10625", "google_scholar": ";MzKvJhAAAAAJ;;;;;", "orcid": ";;;;;;", "linkedin": ";xksteven/;;andy-zou-09ba3616a/;;;", "or_profile": "~Dan_Hendrycks1;~Steven_Basart1;~Mantas_Mazeika3;~Andy_Zou1;joseph.kwon@yale.edu;~Mohammadreza_Mostajabi1;~Jacob_Steinhardt1", "aff": "UC Berkeley;Center for AI Safety ;University of Illinois, Urbana-Champaign;University of California, Berkeley;;;University of California, Berkeley", "aff_domain": "berkeley.edu;safe.ai;uiuc.edu;berkeley.edu;;;berkeley.edu", "position": "PhD student;Researcher;PhD student;MS student;;;Assistant Professor", "bibtex": "@misc{\nhendrycks2022improving,\ntitle={Improving and Assessing Anomaly Detectors for Large-Scale Settings},\nauthor={Dan Hendrycks and Steven Basart and Mantas Mazeika and Andy Zou and Joseph Kwon and Mohammadreza Mostajabi and Jacob Steinhardt},\nyear={2022},\nurl={https://openreview.net/forum?id=vruwp11pWnO}\n}", "github": "", "project": "", "reviewers": "9GvF;Tkux;osoG;qdA5;xNrS", "site": "https://openreview.net/forum?id=vruwp11pWnO", "pdf_size": 0, "recommendation": "3;5;5;5;6", "confidence": "5;3;4;4;4", 
"correctness": "2;3;4;3;3", "technical_novelty": "1;2;3;2;3", "empirical_novelty": "2;4;3;3;3", "wc_summary_paper": "24;75;84;83;44", "wc_summary_review": "46;14;118;25;48", "wc_main_review": "267;111;280;339;204", "wc_review": "337;200;482;447;296", "wc_reply_reviewers": "0;0;0;990;0", "wc_reply_authors": "734;303;707;1573;440", "reply_reviewers": "0;0;0;4;0", "reply_authors": "1;1;2;5;1", "recommendation_avg": [ 4.8, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.2, 0.7483314773547882 ], "empirical_novelty_avg": [ 3.0, 0.6324555320336759 ], "wc_summary_paper_avg": [ 62.0, 23.924882444852262 ], "wc_summary_review_avg": [ 50.2, 36.23478991245844 ], "wc_main_review_avg": [ 240.2, 77.54585740063746 ], "wc_review_avg": [ 352.4, 102.3613208199269 ], "wc_reply_reviewers_avg": [ 198.0, 396.0 ], "wc_reply_authors_avg": [ 751.4, 441.66349181248836 ], "reply_reviewers_avg": [ 0.8, 1.6 ], "reply_authors_avg": [ 2.0, 1.5491933384829668 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6454972243679028, "corr_recommendation_correctness": 0.6454972243679028, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17399196019464714026&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "University of California, Berkeley;Center for AI Safety;University of Illinois", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.centerforaisafety.org;https://illinois.edu", "aff_unique_abbr": "UC Berkeley;;UIUC", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Berkeley;;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "vsEi1UMa7TC", "title": "Exploiting Knowledge Distillation for Few-Shot Image Generation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Few-shot image generation, which trains generative models on limited examples, is of practical importance. The existing pipeline is first pretraining a source model (which contains a generator and a discriminator) on a large-scale dataset and finetuning it on a target domain with limited samples. The main challenge is that the few-shot model easily becomes overfitting. It can be attributed to two aspects: the lack of sample diversity for the generator and the failure of fidelity discrimination for the discriminator. In this paper, we treat the diversity and fidelity in the source model as a kind of knowledge and propose to improve the generation results via exploring knowledge distillation. The source model trained on the large-scale dataset is regarded as teacher model and the target model (student) is learned by introducing momentum relation distillation module to produce diverse samples and source discrimination distillation to ensure the fidelity discrimination. With the momentum relation distillation and source discrimination distillation modules, the proposed method outperforms the state-of-the-art of by a large margin, i.e., 10% for FFHQ to Sketches, while achieving better diversity. 
", "keywords": "few-show image generation;knowledge distillation", "primary_area": "", "supplementary_material": "", "author": "Xingzhong Hou;Boxiao Liu;Fang Wan;Haihang You", "authorids": "~Xingzhong_Hou1;~Boxiao_Liu1;~Fang_Wan1;~Haihang_You1", "gender": "M;M;M;", "homepage": "https://github.com/Ace-Pegasus;;https://people.ucas.ac.cn/~wanfang?language=en;", "dblp": ";188/2274;;89/4494", "google_scholar": ";-zEM0ycAAAAJ;https://scholar.google.com.hk/citations?user=0IKavloAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-9792-1361;0000-0002-8083-9257;", "linkedin": ";;;", "or_profile": "~Xingzhong_Hou1;~Boxiao_Liu1;~Fang_Wan1;~Haihang_You1", "aff": "State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences;State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;ict.ac.cn;ucas.ac.cn;ict.ac.cn", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\nhou2022exploiting,\ntitle={Exploiting Knowledge Distillation for Few-Shot Image Generation},\nauthor={Xingzhong Hou and Boxiao Liu and Fang Wan and Haihang You},\nyear={2022},\nurl={https://openreview.net/forum?id=vsEi1UMa7TC}\n}", "github": "", "project": "", "reviewers": "YBA1;pNem;gk3H", "site": "https://openreview.net/forum?id=vsEi1UMa7TC", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;4", "correctness": "3;3;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "98;75;108", "wc_summary_review": "20;19;44", "wc_main_review": "286;401;495", "wc_review": "404;495;647", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 93.66666666666667, 13.816254517375137 ], "wc_summary_review_avg": [ 27.666666666666668, 11.55662388223981 ], "wc_main_review_avg": [ 394.0, 85.46734269103415 ], "wc_review_avg": [ 515.3333333333334, 100.24082113701095 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17420760062602895779&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_unique_dep": "Institute of Computing Technology;", "aff_unique_url": "http://www.cas.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "CAS;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "vtDzHJOsmfJ", "title": "Non-convex Optimization for Learning a Fair Predictor under Equalized Loss Fairness Constraint", "track": "main", "status": "Reject", "tldr": "", "abstract": "Supervised learning models have been increasingly used in various domains such as lending, college admission, natural language processing, face 
recognition, etc. These models may inherit pre-existing biases from training datasets and exhibit discrimination against protected social groups. Various fairness notions have been introduced to address fairness issues. In general, finding a fair predictor leads to a constrained optimization problem, and depending on the fairness notion, it may be non-convex. In this work, we focus on Equalized Loss ($\\textsf{EL}$), a fairness notion that requires the prediction error/loss to be equalized across different demographic groups. Imposing this constraint on the learning process leads to a non-convex optimization problem even if the loss function is convex. We introduce algorithms that can leverage off-the-shelf convex programming tools and efficiently find the $\\textit{global}$ optimum of this non-convex problem. In particular, we first propose the $\\mathtt{ELminimizer}$ algorithm, which finds the optimal $\\textsf{EL}$ fair predictor by reducing the non-convex optimization problem to a sequence of convex constrained optimizations. We then propose a simple algorithm that is computationally more efficient than $\\mathtt{ELminimizer}$ and finds a sub-optimal $\\textsf{EL}$ fair predictor using $\\textit{unconstrained}$ convex programming tools. Experiments on real-world data show the effectiveness of our algorithms. ", "keywords": "Non-convex Optimization;Fairness;Supervised Learning", "primary_area": "", "supplementary_material": "/attachment/014530fc088608e319f3eb5912b3739045776d91.zip", "author": "Mohammad Mahdi Khalili;Xueru Zhang;Mahed Abroshan;Iman Vakilinia", "authorids": "~Mohammad_Mahdi_Khalili3;~Xueru_Zhang2;~Mahed_Abroshan1;~Iman_Vakilinia1", "gender": "M;F;M;M", "homepage": "https://Khalilimahdi.github.io;https://xueruzhang.github.io/;;https://iman.domains.unf.edu/", "dblp": "159/2163.html;;;136/2276.html", "google_scholar": "hSgnKecAAAAJ;PNBO_a4AAAAJ;tYSPRRwAAAAJ;GD7bikwAAAAJ", "orcid": "0000-0002-4223-3254;;;", "linkedin": "mohammad-mahdi-khalili-aa4241127;;mahed-abroshan/;iman-vakilinia-0a794770/", "or_profile": "~Mohammad_Mahdi_Khalili3;~Xueru_Zhang2;~Mahed_Abroshan1;~Iman_Vakilinia1", "aff": "University of Delaware;Ohio State University;Alan Turing Institute;University of North Florida", "aff_domain": "udel.edu;osu.edu;turing.ac.uk;unf.edu", "position": "Assistant Professor;Assistant Professor;Postdoc;Assistant Professor", "bibtex": "@misc{\nkhalili2022nonconvex,\ntitle={Non-convex Optimization for Learning a Fair Predictor under Equalized Loss Fairness Constraint},\nauthor={Mohammad Mahdi Khalili and Xueru Zhang and Mahed Abroshan and Iman Vakilinia},\nyear={2022},\nurl={https://openreview.net/forum?id=vtDzHJOsmfJ}\n}", "github": "", "project": "", "reviewers": "toHL;2B6v;Cpty;YrTG", "site": "https://openreview.net/forum?id=vtDzHJOsmfJ", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;3;4;3", "correctness": "3;4;4;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "1;2;4;2", "wc_summary_paper": "56;80;49;104", "wc_summary_review": "48;31;49;94", "wc_main_review": "328;219;155;373", "wc_review": "432;330;253;571", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.0897247358851685 ], "wc_summary_paper_avg": [ 72.25, 21.637640814099857 ], 
"wc_summary_review_avg": [ 55.5, 23.350588857671234 ], "wc_main_review_avg": [ 268.75, 86.30288233888831 ], "wc_review_avg": [ 396.5, 119.08505363814554 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4061811972299616, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OElWQmDXr28J:scholar.google.com/&scioq=Non-convex+Optimization+for+Learning+a+Fair+Predictor+under+Equalized+Loss+Fairness+Constraint&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Delaware;Ohio State University;Alan Turing Institute;University of North Florida", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.udel.edu;https://www.osu.edu;https://www.turing.ac.uk;https://www.unf.edu", "aff_unique_abbr": "UD;OSU;ATI;UNF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "vtLbsGUyYx", "title": "AutoCoG: A Unified Data-Modal Co-Search Framework for Graph Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural architecture search (NAS) has demonstrated success in discovering promising architectures for vision or language modeling tasks, and it has recently been introduced to searching for graph neural networks (GNNs) as well. Despite the preliminary success, we argue that for GNNs, NAS has to be customized further, due to the topological complicacy of GNN input data (graph) as well as the notorious training instability. Besides optimizing the GNN model architecture, we propose to simultaneously optimize the input graph topology, via a set of parameterized data augmentation operators. That yields AutoCoG, the first unified data-model co-search NAS framework for GNNs. By defining a highly flexible data-model co-search space, AutoCoG is gracefully formulated as a principled bi-level optimization, that can be end-to-end solved by the differential search methods. Experiments demonstrate that AutoCoG produces state-of-the-art performance at standard benchmarks including Cora, PubMed, and Citeseer, outperforming both state-of-the-art hand-crafted GNNs as well as recent GNN-NAS methods. AutoCoG can also scale to searching deeper GCNs in larger-scale datasets. Our method consistently achieves state-of-the-art (SOTA) results on Cora, Citeseer, Pubmed, and ogbn-arxiv. 
Specifically, we achieve gains of up to 2.04% for Cora, 2.54% for Citeseer, 2.08% for Pubmed, and finally 0.83% for ogbn-arxiv on our benchmarks.", "keywords": "GCN;NAS", "primary_area": "", "supplementary_material": "/attachment/363c4a5fa7587c0e36e32754f93144e0d8879f1f.zip", "author": "Duc N.M Hoang;Kaixiong Zhou;Tianlong Chen;Xia Hu;Zhangyang Wang", "authorids": "~Duc_N.M_Hoang1;~Kaixiong_Zhou1;~Tianlong_Chen1;~Xia_Hu4;~Zhangyang_Wang1", "gender": "M;M;M;M;M", "homepage": ";https://kaixiong-zhou.github.io/;https://tianlong-chen.github.io;https://vita-group.github.io;https://cs.rice.edu/~xh37/index.html", "dblp": ";178/7315;;119/4026;256/9406.html", "google_scholar": "v7S4UNcAAAAJ;zMspIjIAAAAJ;LE3ctn0AAAAJ;pxFyKAIAAAAJ;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": ";0000-0001-5226-8736;0000-0001-7774-8197;;", "linkedin": ";;tianlong-chen-783862167/;;", "or_profile": "~Duc_N.M_Hoang1;~Kaixiong_Zhou1;~Tianlong_Chen1;~Zhangyang_Wang1;~Xia_Hu2", "aff": "University of Texas, Austin;Rice University;University of Texas, Austin;University of Texas, Austin;Rice University", "aff_domain": "utexas.edu;rice.edu;utexas.edu;utexas.edu;rice.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@misc{\nhoang2022autocog,\ntitle={AutoCoG: A Unified Data-Modal Co-Search Framework for Graph Neural Networks},\nauthor={Duc N.M Hoang and Kaixiong Zhou and Tianlong Chen and Xia Hu and Zhangyang Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=vtLbsGUyYx}\n}", "github": "", "project": "", "reviewers": "52YC;13tu;vKFJ;bKZu", "site": "https://openreview.net/forum?id=vtLbsGUyYx", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;5;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;3;2;2", "wc_summary_paper": "52;13;43;53", "wc_summary_review": "70;32;18;36", "wc_main_review": "272;134;281;214", "wc_review": "394;179;342;303", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 40.25, 16.20763708873073 ], "wc_summary_review_avg": [ 39.0, 19.1049731745428 ], "wc_main_review_avg": [ 225.25, 58.62326756502063 ], "wc_review_avg": [ 304.5, 79.32370389738492 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ry9BbzHts74J:scholar.google.com/&scioq=AutoCoG:+A+Unified+Data-Modal+Co-Search+Framework+for+Graph+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "University of Texas at Austin;Rice University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.rice.edu", "aff_unique_abbr": "UT Austin;Rice", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "vuw072gfi3W", "title": "A Permutation-Invariant Representation of Neural Networks with Neuron Embeddings", "track": "main", "status": 
"Withdraw", "tldr": "", "abstract": "Neural networks are traditionally represented in terms of their weights. A key property of this representation is that there are multiple representations of a network which can be obtained by permuting the order of the neurons. These representations are generally not compatible and attempting to transfer part of a network without the preceding layers is usually destructive to any learned relationships. This paper proposes a method by which a neural network is represented in terms of an embedding of the neurons rather than explicit weights. In addition to reducing the number of free parameters, this encoding is agnostic to the ordering of neurons, bypassing a key problem for weight-based representations. This allows us to transplant individual neurons and layers into another network and still maintain their functionality. This is particularly important for tasks like transfer learning and neuroevolution. We show through experiments on the MNIST and CIFAR10 datasets that this method is capable of representing networks which achieve identical performance to direct weight representation, and that transfer done this way preserves much of the performance between two networks that are distant in parameter space.", "keywords": "neural networks;neural network representation;neuroevolution;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Ryan Zhou;Christian Muise;Ting Hu", "authorids": "~Ryan_Zhou1;~Christian_Muise1;~Ting_Hu1", "gender": ";M;F", "homepage": "https://ryanz8.github.io/;http://www.haz.ca/;https://sites.google.com/site/tinghushomepage/research?authuser=0", "dblp": ";70/1862;", "google_scholar": ";HUzuGj8AAAAJ;https://scholar.google.ca/citations?user=ikLwqLwAAAAJ", "orcid": ";;", "linkedin": ";christianmuise/;", "or_profile": "~Ryan_Zhou1;~Christian_Muise1;~Ting_Hu1", "aff": "Cornell University;Queens University;Queens University", "aff_domain": "cornell.edu;queensu.ca;queensu.ca", "position": "Undergrad student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nzhou2022a,\ntitle={A Permutation-Invariant Representation of Neural Networks with Neuron Embeddings},\nauthor={Ryan Zhou and Christian Muise and Ting Hu},\nyear={2022},\nurl={https://openreview.net/forum?id=vuw072gfi3W}\n}", "github": "", "project": "", "reviewers": "NQBd;gimQ;aBDo;bc4H", "site": "https://openreview.net/forum?id=vuw072gfi3W", "pdf_size": 0, "recommendation": "1;3;5;6", "confidence": "5;4;3;3", "correctness": "2;3;3;3", "technical_novelty": "1;2;2;4", "empirical_novelty": "1;1;2;0", "wc_summary_paper": "50;86;99;117", "wc_summary_review": "122;66;37;87", "wc_main_review": "446;354;430;184", "wc_review": "618;506;566;388", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "291;662;461;227", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.75, 1.920286436967152 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 1.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 88.0, 24.545875417267155 ], "wc_summary_review_avg": [ 78.0, 30.99193443462347 ], "wc_main_review_avg": [ 353.5, 103.84965093826749 ], "wc_review_avg": [ 519.5, 85.6431550096095 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 410.25, 168.63774043789843 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], 
"corr_recommendation_confidence": -0.9813358399735743, "corr_recommendation_correctness": 0.8268106308031117, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9948333579329057823&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Cornell University;Queens University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.queensu.ca", "aff_unique_abbr": "Cornell;Queen's U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Continual Normalization: Rethinking Batch Normalization for Online Continual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5912", "id": "vwLLQ-HwqhZ", "poster": "", "openreview": "https://openreview.net/forum?id=vwLLQ-HwqhZ", "slides": "https://iclr.cc/virtual/2022/poster/5912", "video": "https://iclr.cc/virtual/2022/poster/5912", "author_site": "Quang Pham, Chenghao Liu, Steven HOI", "tldr": "", "abstract": "Existing continual learning methods use Batch Normalization (BN) to facilitate training and improve generalization across tasks. However, the non-i.i.d and non-stationary nature of continual learning data, especially in the online setting, amplify the discrepancy between training and testing in BN and hinder the performance of older tasks. In this work, we study the cross-task normalization effect of BN in online continual learning where BN normalizes the testing data using moments biased towards the current task, resulting in higher catastrophic forgetting. This limitation motivates us to propose a simple yet effective method that we call Continual Normalization (CN) to facilitate training similar to BN while mitigating its negative effect. Extensive experiments on different continual learning algorithms and online scenarios show that CN is a direct replacement for BN and can provide substantial performance improvements. 
Our implementation will be made publicly available upon acceptance.", "keywords": "Continual Learning;Batch Normalization", "primary_area": "", "supplementary_material": "/attachment/fe4928c18bdb976a8e37480f0491f9b68c5d10a5.zip", "author": "Quang Pham;Chenghao Liu;Steven HOI", "authorids": "~Quang_Pham1;~Chenghao_Liu1;~Steven_HOI1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/quangpham93;;https://www.smu.edu.sg/faculty/profile/110831/Steven-HOI", "dblp": "81/8316;;h/StevenCHHoi", "google_scholar": "https://scholar.google.com.sg/citations?user=WC7Bu_kAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=JoLjflYAAAAJ", "orcid": ";;", "linkedin": ";chenghao-liu-40a62a56/;", "or_profile": "~Quang_Pham1;~Chenghao_Liu1;~Steven_HOI1", "aff": "Singapore Management University;Salesforce AI Research;Singapore Management University", "aff_domain": "smu.edu.sg;salesforce.com;", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@inproceedings{\npham2022continual,\ntitle={Continual Normalization: Rethinking Batch Normalization for Online Continual Learning},\nauthor={Quang Pham and Chenghao Liu and Steven HOI},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vwLLQ-HwqhZ}\n}", "github": "", "project": "", "reviewers": "9jXz;9JHA;8iRv;BLEs", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;4;4;4", "correctness": "3;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "66;80;59;102", "wc_summary_review": "32;71;17;131", "wc_main_review": "195;176;241;737", "wc_review": "293;327;317;970", "wc_reply_reviewers": "0;21;25;462", "wc_reply_authors": "205;189;826;1926", "reply_reviewers": "0;1;1;2", "reply_authors": "2;2;3;4", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 76.75, 16.422164899914993 ], "wc_summary_review_avg": [ 62.75, 44.05890942817355 ], "wc_main_review_avg": [ 337.25, 232.00255925312547 ], "wc_review_avg": [ 476.75, 285.0459392799694 ], "wc_reply_reviewers_avg": [ 127.0, 193.64529428829402 ], "wc_reply_authors_avg": [ 786.5, 706.2522566335629 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5393032746032394321&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=vwLLQ-HwqhZ", "email": "smu.edu.sg;salesforce.com;", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Singapore Management University;Salesforce", "aff_unique_dep": ";Salesforce AI Research", "aff_unique_url": "https://www.smu.edu.sg;https://www.salesforce.com", "aff_unique_abbr": "SMU;Salesforce AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;United States" }, { "title": "Long Expressive Memory for Sequence Modeling", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6411", "id": "vwj6aUeocyf", "poster": "", "openreview": "https://openreview.net/forum?id=vwj6aUeocyf", "slides": 
"https://iclr.cc/virtual/2022/poster/6411", "video": "https://iclr.cc/virtual/2022/poster/6411", "author_site": "T. Konstantin Rusch, Siddhartha Mishra, N. Benjamin Erichson, Michael W Mahoney", "tldr": "", "abstract": "We propose a novel method called Long Expressive Memory (LEM) for learning long-term sequential dependencies. LEM is gradient-based, it can efficiently process sequential tasks with very long-term dependencies, and it is sufficiently expressive to be able to learn complicated input-output maps. To derive LEM, we consider a system of multiscale ordinary differential equations, as well as a suitable time-discretization of this system. For LEM, we derive rigorous bounds to show the mitigation of the exploding and vanishing gradients problem, a well-known challenge for gradient-based recurrent sequential learning methods. We also prove that LEM can approximate a large class of dynamical systems to high accuracy. Our empirical results, ranging from image and time-series classification through dynamical systems prediction to speech recognition and language modeling, demonstrate that LEM outperforms state-of-the-art recurrent neural networks, gated recurrent units, and long short-term memory models.", "keywords": "sequence modeling;long-term dependencies;multiscale ordinary differential equations;dynamical systems", "primary_area": "", "supplementary_material": "/attachment/218cbc7f66a519c33253a5cb7fb586b057c9b1a5.zip", "author": "T. Konstantin Rusch;Siddhartha Mishra;N. Benjamin Erichson;Michael W. Mahoney", "authorids": "~T._Konstantin_Rusch1;~Siddhartha_Mishra1;~N._Benjamin_Erichson1;~Michael_W._Mahoney1", "gender": ";M;M;", "homepage": "https://konstantinrusch.com;http://www.sam.math.ethz.ch/;https://www.benerichson.com/;", "dblp": "266/1519;07/2856.html;173/5153;", "google_scholar": "9LajlSsAAAAJ;FmEqyNcAAAAJ;https://scholar.google.co.uk/citations?user=8ViYcioAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~T._Konstantin_Rusch1;~Siddhartha_Mishra1;~N._Benjamin_Erichson1;~Michael_W._Mahoney1", "aff": "Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;University of Pittsburgh;", "aff_domain": "ethz.ch;ethz.ch;pitt.edu;", "position": "PhD student;Full Professor;Assistant Professor;", "bibtex": "@inproceedings{\nrusch2022long,\ntitle={Long Expressive Memory for Sequence Modeling},\nauthor={T. Konstantin Rusch and Siddhartha Mishra and N. Benjamin Erichson and Michael W. 
Mahoney},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=vwj6aUeocyf}\n}", "github": "", "project": "", "reviewers": "Cggm;AvfW;M6L8;4Tp1", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;4", "correctness": "4;4;3;4", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "162;67;138;128", "wc_summary_review": "111;97;44;29", "wc_main_review": "451;574;540;496", "wc_review": "724;738;722;653", "wc_reply_reviewers": "0;157;139;0", "wc_reply_authors": "835;1728;872;825", "reply_reviewers": "0;1;2;0", "reply_authors": "2;3;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 123.75, 35.01696017646306 ], "wc_summary_review_avg": [ 70.25, 34.52082704687128 ], "wc_main_review_avg": [ 515.25, 46.267564232408 ], "wc_review_avg": [ 709.25, 33.05582399517519 ], "wc_reply_reviewers_avg": [ 74.0, 74.27314454094427 ], "wc_reply_authors_avg": [ 1065.0, 383.183376466151 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10849000047191483143&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=vwj6aUeocyf", "email": "ethz.ch;ethz.ch;pitt.edu;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;University of Pittsburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.pitt.edu", "aff_unique_abbr": "ETH Zurich;Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;United States" }, { "id": "vxlAHR9AyZ6", "title": "$\\alpha$-Weighted Federated Adversarial Training", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Adversarial Training (FAT) helps us address data privacy and governance issues while maintaining model robustness to adversarial attacks. However, the inner-maximization optimization of Adversarial Training can exacerbate the data heterogeneity among local clients, which triggers the pain points of Federated Learning. As a result, the straightforward combination of the two paradigms suffers from performance deterioration, as observed in previous works. In this paper, we introduce an $\\alpha$-Weighted Federated Adversarial Training ($\\alpha$-WFAT) method to overcome this problem, which relaxes the inner-maximization of Adversarial Training into a lower bound friendly to Federated Learning. We present a theoretical analysis of this $\\alpha$-weighted mechanism and its effect on the convergence of FAT. 
Empirically, we conduct extensive experiments to comprehensively understand the characteristics of $\\alpha$-WFAT, and the results on three benchmark datasets demonstrate that $\\alpha$-WFAT significantly outperforms FAT under different adversarial learning methods and federated optimization methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianing Zhu;Jiangchao Yao;Tongliang Liu;Kunyang Jia;Jingren Zhou;Bo Han;Hongxia Yang", "authorids": "~Jianing_Zhu2;~Jiangchao_Yao1;~Tongliang_Liu1;kunyang.jky@alibaba-inc.com;~Jingren_Zhou1;~Bo_Han1;~Hongxia_Yang2", "gender": "M;M;M;;M;;F", "homepage": "https://zfancy.github.io/;https://sunarker.github.io/;https://tongliang-liu.github.io/;;;;https://www4.comp.polyu.edu.hk/~hongxyang/", "dblp": "129/6807;166/5900;150/6667;;84/2644;;", "google_scholar": "82uNA3MAAAAJ;w8oDh9QAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;;;;iJlC5mMAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Jianing_Zhu2;~Jiangchao_Yao1;~Tongliang_Liu1;kunyang.jky@alibaba-inc.com;~Jingren_Zhou1;~Bo_Han1;~Hongxia_Yang2", "aff": "Hong Kong Baptist University;Alibaba Group;University of Sydney;;Alibaba Group;;Alibaba Group", "aff_domain": "hkbu.edu.hk;alibaba-inc.com;sydney.edu.au;;alibaba-inc.com;;alibaba-inc.com", "position": "PhD student;Researcher;Lecturer;;Researcher;;Principal Researcher", "bibtex": "@misc{\nzhu2022alphaweighted,\ntitle={\\${\\textbackslash}alpha\\$-Weighted Federated Adversarial Training},\nauthor={Jianing Zhu and Jiangchao Yao and Tongliang Liu and Kunyang Jia and Jingren Zhou and Bo Han and Hongxia Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=vxlAHR9AyZ6}\n}", "github": "", "project": "", "reviewers": "m6tB;4ioa;4Hkz;1hs3", "site": "https://openreview.net/forum?id=vxlAHR9AyZ6", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "4;3;5;3", "correctness": "3;2;3;4", "technical_novelty": "2;3;2;4", "empirical_novelty": "2;4;2;4", "wc_summary_paper": "77;63;64;104", "wc_summary_review": "31;32;20;15", "wc_main_review": "162;214;360;157", "wc_review": "270;309;444;276", "wc_reply_reviewers": "0;0;260;0", "wc_reply_authors": "944;979;1908;134", "reply_reviewers": "0;0;1;0", "reply_authors": "4;4;6;1", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 1.0 ], "wc_summary_paper_avg": [ 77.0, 16.537835408541227 ], "wc_summary_review_avg": [ 24.5, 7.22841614740048 ], "wc_main_review_avg": [ 223.25, 82.0468616096923 ], "wc_review_avg": [ 324.75, 70.43214819952604 ], "wc_reply_reviewers_avg": [ 65.0, 112.58330249197702 ], "wc_reply_authors_avg": [ 991.25, 628.0308013943265 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.75, 1.7853571071357126 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rki9xFGwqS0J:scholar.google.com/&scioq=%24%5Calpha%24-Weighted+Federated+Adversarial+Training&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Hong Kong Baptist University;Alibaba Group;University of Sydney", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.alibaba.com;https://www.sydney.edu.au", "aff_unique_abbr": 
"HKBU;Alibaba;USYD", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;Australia" }, { "id": "vyn49BUAkoD", "title": "Bayesian Active Learning with Fully Bayesian Gaussian Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "The bias-variance trade-off is a well-known problem in machine learning that only gets more pronounced the less available data there is. When data is scarce, such as in metamodeling, active learning, and Bayesian optimization, neglecting this trade-off can cause inefficient and non-optimal querying, leading to unnecessary data labeling. In this paper, we focus on metamodeling with active learning and the canonical Gaussian Process (GP). We recognize that, for the GP, the bias-variance trade-off regulation is made by optimization of the two hyperparameters: the length scale and noise-term. Considering that the optimal mode of the joint posterior of the hyperparameters is equivalent to the optimal bias-variance trade-off, we approximate this joint posterior and utilize it to design two new acquisition functions. The first one is a mode-seeking Bayesian variant of Query-by-Committee (B-QBC), and the second is simultaneously mode-seeking and minimizing the predictive variance through a Query by Mixture Gaussian Processes (QB-MGP) formulation. Across seven simulators, we empirically show that B-QBC outperforms the benchmark functions, whereas QB-MGP is the most robust acquisition function and achieves the best accuracy with the fewest iterations. We generally show that incorporating the bias-variance trade-off in the acquisition functions mitigates unnecessary and expensive data labeling.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/95ede0968dcd0fea649be8fb4adee154c8e26841.zip", "author": "Christoffer Riis;Francisco Antunes;Frederik Boe H\u00fcttel;Carlos Lima Azevedo;Francisco C. Pereira", "authorids": "~Christoffer_Riis1;~Francisco_Antunes1;~Frederik_Boe_H\u00fcttel1;~Carlos_Lima_Azevedo1;~Francisco_C._Pereira1", "gender": "M;;M;M;M", "homepage": ";;;http://azevedo.info;http://mlsm.man.dtu.dk", "dblp": "263/7663;;https://dblp.uni-trier.de/pid/295/8780;;96/3066", "google_scholar": "https://scholar.google.com/citations?hl=en;;zs_r5EcAAAAJ;;k7pIUY0AAAAJ", "orcid": "0000-0002-4540-6691;;0000-0003-4603-3708;;0000-0001-5457-9909", "linkedin": "christoffer-riis/;;frederik-boe-huttel/;;", "or_profile": "~Christoffer_Riis1;~Francisco_Antunes1;~Frederik_Boe_H\u00fcttel1;~Carlos_Lima_Azevedo1;~Francisco_C._Pereira1", "aff": "Technical University of Denmark;;Technical University of Denmark;Technical University of Denmark;", "aff_domain": "dtu.dk;;dtu.dk;dtu.dk;", "position": "PhD student;;PhD student;Associate Professor;", "bibtex": "@misc{\nriis2022bayesian,\ntitle={Bayesian Active Learning with Fully Bayesian Gaussian Processes},\nauthor={Christoffer Riis and Francisco Antunes and Frederik Boe H{\\\"u}ttel and Carlos Lima Azevedo and Francisco C. 
Pereira},\nyear={2022},\nurl={https://openreview.net/forum?id=vyn49BUAkoD}\n}", "github": "", "project": "", "reviewers": "hDWW;uC7q;H5XZ;ssFN", "site": "https://openreview.net/forum?id=vyn49BUAkoD", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;4;4", "correctness": "2;3;3;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "105;110;22;52", "wc_summary_review": "197;45;28;20", "wc_main_review": "2148;399;99;330", "wc_review": "2450;554;149;402", "wc_reply_reviewers": "11;24;0;0", "wc_reply_authors": "373;190;141;224", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 72.25, 36.85359548266627 ], "wc_summary_review_avg": [ 72.5, 72.44480657714533 ], "wc_main_review_avg": [ 744.0, 818.1781590827269 ], "wc_review_avg": [ 888.75, 912.9231553093612 ], "wc_reply_reviewers_avg": [ 8.75, 9.883698700385398 ], "wc_reply_authors_avg": [ 232.0, 86.58810541870055 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.9733285267845754, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7248161076733979181&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technical University of Denmark", "aff_unique_dep": "", "aff_unique_url": "https://www.tek.dk", "aff_unique_abbr": "DTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Denmark" }, { "title": "On the Limitations of Multimodal VAEs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6996", "id": "w-CPUXXrAj", "poster": "", "openreview": "https://openreview.net/forum?id=w-CPUXXrAj", "slides": "https://iclr.cc/virtual/2022/poster/6996", "video": "https://iclr.cc/virtual/2022/poster/6996", "author_site": "Imant Daunhawer, Thomas Sutter, Kieran Chin-Cheong, Emanuele Palumbo, Julia E Vogt", "tldr": "", "abstract": "Multimodal variational autoencoders (VAEs) have shown promise as efficient generative models for weakly-supervised data. Yet, despite their advantage of weak supervision, they exhibit a gap in generative quality compared to unimodal VAEs, which are completely unsupervised. In an attempt to explain this gap, we uncover a fundamental limitation that applies to a large family of mixture-based multimodal VAEs. We prove that the sub-sampling of modalities enforces an undesirable upper bound on the multimodal ELBO and thereby limits the generative quality of the respective models. Empirically, we showcase the generative quality gap on both synthetic and real data and present the tradeoffs between different variants of multimodal VAEs. We find that none of the existing approaches fulfills all desired criteria of an effective multimodal generative model when applied on more complex datasets than those used in previous benchmarks. 
In summary, we identify, formalize, and validate fundamental limitations of VAE-based approaches for modeling weakly-supervised data and discuss implications for real-world applications.", "keywords": "multimodal learning;variational autoencoder;variational information bottleneck;information theory", "primary_area": "", "supplementary_material": "/attachment/fadf88e19ce516ceaee7a0d4fbf8ec860aa9855f.zip", "author": "Imant Daunhawer;Thomas M. Sutter;Kieran Chin-Cheong;Emanuele Palumbo;Julia E Vogt", "authorids": "~Imant_Daunhawer2;~Thomas_M._Sutter1;~Kieran_Chin-Cheong1;palumboe@student.ethz.ch;~Julia_E_Vogt1", "gender": ";;M;;F", "homepage": "https://mds.inf.ethz.ch/team/detail/imant-daunhawer/;;https://mds.inf.ethz.ch/team/detail/kieran-chin-cheong/;;http://mds.inf.ethz.ch", "dblp": "259/0541;;;;13/8412", "google_scholar": ";;;;UoeV-8kAAAAJ", "orcid": ";;;;", "linkedin": ";;;;julia-vogt-50b53895", "or_profile": "~Imant_Daunhawer2;~Thomas_M._Sutter1;~Kieran_Chin-Cheong1;palumboe@student.ethz.ch;~Julia_E_Vogt1", "aff": "Swiss Federal Institute of Technology;;Swiss Federal Institute of Technology;;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;;ethz.ch;;ethz.ch", "position": "PhD student;;Researcher;;Assistant Professor", "bibtex": "@inproceedings{\ndaunhawer2022on,\ntitle={On the Limitations of Multimodal {VAE}s},\nauthor={Imant Daunhawer and Thomas M. Sutter and Kieran Chin-Cheong and Emanuele Palumbo and Julia E Vogt},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=w-CPUXXrAj}\n}", "github": "", "project": "", "reviewers": "B728;tUe8;UGYa;wNh5", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;5;3", "correctness": "3;4;3;3", "technical_novelty": "4;2;3;3", "empirical_novelty": "3;2;4;3", "wc_summary_paper": "57;101;43;72", "wc_summary_review": "65;45;41;50", "wc_main_review": "360;191;286;153", "wc_review": "482;337;370;275", "wc_reply_reviewers": "0;22;20;0", "wc_reply_authors": "858;618;582;184", "reply_reviewers": "0;1;1;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 68.25, 21.510172012329424 ], "wc_summary_review_avg": [ 50.25, 9.093266739736606 ], "wc_main_review_avg": [ 247.5, 81.02623032080415 ], "wc_review_avg": [ 366.0, 75.15650337795127 ], "wc_reply_reviewers_avg": [ 10.5, 10.523782589924593 ], "wc_reply_authors_avg": [ 560.5, 241.88168595410443 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14829914268364500497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=w-CPUXXrAj", "email": "ethz.ch;;ethz.ch;;ethz.ch", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Swiss Federal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "On Covariate Shift of Latent Confounders in Imitation and 
Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7100", "id": "w01vBAcewNX", "poster": "", "openreview": "https://openreview.net/forum?id=w01vBAcewNX", "slides": "https://iclr.cc/virtual/2022/poster/7100", "video": "https://iclr.cc/virtual/2022/poster/7100", "author_site": "Guy Tennenholtz, Assaf Hallak, Gal Dalal, Shie Mannor, Gal Chechik, Uri Shalit", "tldr": "", "abstract": "We consider the problem of using expert data with unobserved confounders for imitation and reinforcement learning. We begin by defining the problem of learning from confounded expert data in a contextual MDP setup. We analyze the limitations of learning from such data with and without external reward and propose an adjustment of standard imitation learning algorithms to fit this setup. In addition, we discuss the problem of distribution shift between the expert data and the online environment when partial observability is present in the data. We prove possibility and impossibility results for imitation learning under arbitrary distribution shift of the missing covariates. When additional external reward is provided, we propose a sampling procedure that addresses the unknown shift and prove convergence to an optimal solution. Finally, we validate our claims empirically on challenging assistive healthcare and recommender system simulation tasks.", "keywords": "imitation learning;reinforcement learning;expert data;hidden confounding;causal inference;covariate shift", "primary_area": "", "supplementary_material": "/attachment/8f27cd29fd1ddb0fd2d5f4b112ff2501a9da086e.zip", "author": "Guy Tennenholtz;Assaf Hallak;Gal Dalal;Shie Mannor;Gal Chechik;Uri Shalit", "authorids": "~Guy_Tennenholtz2;~Assaf_Hallak1;~Gal_Dalal2;~Shie_Mannor2;~Gal_Chechik1;~Uri_Shalit1", "gender": ";M;M;;M;M", "homepage": "https://guytenn.com;;https://shie.net.technion.ac.il;https://chechiklab.biu.ac.il/~gal/;;", "dblp": ";117/9126;20/1669;c/GalChechik;87/7049;166/1605", "google_scholar": "https://scholar.google.co.il/citations?user=pldrn8IAAAAJ;;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ;Wk2gAZUAAAAJ;https://scholar.google.co.il/citations?user=aeGDj-IAAAAJ;https://scholar.google.co.il/citations?user=NfJiSMMAAAAJ", "orcid": ";0000-0001-7915-9206;;0000-0001-9164-5303;0000-0002-4026-2692;0000-0002-3166-4251", "linkedin": ";;;;;galdalal/", "or_profile": "~Guy_Tennenholtz2;~Assaf_Hallak1;~Shie_Mannor2;~Gal_Chechik1;~Uri_Shalit1;~Gal_Dalal1", "aff": "Technion, Technion;NVIDIA;Technion - Israel Institute of Technology, Technion;NVIDIA;Technion;NVIDIA", "aff_domain": "technion.ac.il;nvidia.com;technion.il;nvidia.com;technion.ac.il;nvidia.com", "position": "PhD student;Senior research scientist;Full Professor;Principal Researcher;Associate Professor;Senior Research Scientist", "bibtex": "@inproceedings{\ntennenholtz2022on,\ntitle={On Covariate Shift of Latent Confounders in Imitation and Reinforcement Learning},\nauthor={Guy Tennenholtz and Assaf Hallak and Gal Dalal and Shie Mannor and Gal Chechik and Uri Shalit},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=w01vBAcewNX}\n}", "github": "", "project": "", "reviewers": "KtAT;JYzA;BBsG;26tA", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "86;80;181;61", "wc_summary_review": "70;79;58;30", "wc_main_review": "956;487;192;187", 
"wc_review": "1112;646;431;278", "wc_reply_reviewers": "505;0;31;15", "wc_reply_authors": "1143;509;472;305", "reply_reviewers": "2;0;1;1", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 102.0, 46.534933114811714 ], "wc_summary_review_avg": [ 59.25, 18.45772196128222 ], "wc_main_review_avg": [ 455.5, 313.45533972162605 ], "wc_review_avg": [ 616.75, 314.39734016050454 ], "wc_reply_reviewers_avg": [ 137.75, 212.31506658737152 ], "wc_reply_authors_avg": [ 607.25, 318.7196063940843 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14767942777580756959&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=w01vBAcewNX", "email": "technion.ac.il;nvidia.com;technion.il;nvidia.com;technion.ac.il;nvidia.com", "author_num": 6, "aff_unique_index": "0;1;0;1;0;1", "aff_unique_norm": "Technion - Israel Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.nvidia.com", "aff_unique_abbr": "Technion;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;1", "aff_country_unique": "Israel;United States" }, { "title": "Neural Collapse Under MSE Loss: Proximity to and Dynamics on the Central Path", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6352", "id": "w1UbdvWH_R3", "poster": "", "openreview": "https://openreview.net/forum?id=w1UbdvWH_R3", "slides": "https://iclr.cc/virtual/2022/poster/6352", "video": "https://iclr.cc/virtual/2022/poster/6352", "author_site": "X.Y. Han, Vardan Papyan, David Donoho", "tldr": "", "abstract": "The recently discovered Neural Collapse (NC) phenomenon occurs pervasively in today's deep net training paradigm of driving cross-entropy (CE) loss towards zero. During NC, last-layer features collapse to their class-means, both classifiers and class-means collapse to the same Simplex Equiangular Tight Frame, and classifier behavior collapses to the nearest-class-mean decision rule. Recent works demonstrated that deep nets trained with mean squared error (MSE) loss perform comparably to those trained with CE. As a preliminary, we empirically establish that NC emerges in such MSE-trained deep nets as well through experiments on three canonical networks and five benchmark datasets. We provide, in a Google Colab notebook, PyTorch code for reproducing MSE-NC and CE-NC: https://colab.research.google.com/github/neuralcollapse/neuralcollapse/blob/main/neuralcollapse.ipynb. The analytically-tractable MSE loss offers more mathematical opportunities than the hard-to-analyze CE loss, inspiring us to leverage MSE loss towards the theoretical investigation of NC. We develop three main contributions: (I) We show a new decomposition of the MSE loss into (A) terms directly interpretable through the lens of NC and which assume the last-layer classifier is exactly the least-squares classifier; and (B) a term capturing the deviation from this least-squares classifier. 
(II) We exhibit experiments on canonical datasets and networks demonstrating that term-(B) is negligible during training. This motivates us to introduce a new theoretical construct: the central path, where the linear classifier stays MSE-optimal for feature activations throughout the dynamics. (III) By studying renormalized gradient flow along the central path, we derive exact dynamics that predict NC.", "keywords": "neural collapse;deep learning theory;deep learning;inductive bias;equiangular tight frame;ETF;nearest class center;mean squared error loss;MSE loss;invariance;renormalization;gradient flow;dynamics;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "X.Y. Han;Vardan Papyan;David L. Donoho", "authorids": "~X.Y._Han1;~Vardan_Papyan1;~David_L._Donoho1", "gender": ";M;M", "homepage": "http://xyhan.me;https://sites.google.com/view/vardan-papyan;https://statistics.stanford.edu/people/david-donoho", "dblp": ";173/9783;d/DavidLDonoho.html", "google_scholar": "VAJtX-0AAAAJ;https://scholar.google.co.il/citations?user=VrE-Gd4AAAAJ;https://scholar.google.com/scholar?hl=en", "orcid": "0000-0002-4605-4990;;0000-0003-1830-710X", "linkedin": ";;", "or_profile": "~X.Y._Han1;~Vardan_Papyan1;~David_L._Donoho1", "aff": "Cornell University;University of Toronto;Stanford University", "aff_domain": "cornell.edu;toronto.edu;stanford.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhan2022neural,\ntitle={Neural Collapse Under {MSE} Loss: Proximity to and Dynamics on the Central Path},\nauthor={X.Y. Han and Vardan Papyan and David L. Donoho},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=w1UbdvWH_R3}\n}", "github": "", "project": "", "reviewers": "pHXk;Jnw3;9pr9;pKnQ", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;4;3", "empirical_novelty": "3;3;4;0", "wc_summary_paper": "53;106;60;132", "wc_summary_review": "36;26;24;236", "wc_main_review": "242;863;254;526", "wc_review": "331;995;338;894", "wc_reply_reviewers": "0;105;23;354", "wc_reply_authors": "476;1999;1056;1937", "reply_reviewers": "0;2;1;1", "reply_authors": "1;4;2;3", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 87.75, 32.66783586342995 ], "wc_summary_review_avg": [ 80.5, 89.89299194041769 ], "wc_main_review_avg": [ 471.25, 253.0902753959543 ], "wc_review_avg": [ 639.5, 307.0932268872109 ], "wc_reply_reviewers_avg": [ 120.5, 140.34689166490293 ], "wc_reply_authors_avg": [ 1367.0, 635.3986937348865 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18382376857691471152&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=w1UbdvWH_R3", "email": "cornell.edu;toronto.edu;stanford.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Cornell University;University of Toronto;Stanford University", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.cornell.edu;https://www.utoronto.ca;https://www.stanford.edu", "aff_unique_abbr": "Cornell;U of T;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "ViDT: An Efficient and Effective Fully Transformer-based Object Detector", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6181", "id": "w4cXZDDib1H", "poster": "", "openreview": "https://openreview.net/forum?id=w4cXZDDib1H", "slides": "https://iclr.cc/virtual/2022/poster/6181", "video": "https://iclr.cc/virtual/2022/poster/6181", "author_site": "Hwanjun Song, Deqing Sun, Sanghyuk Chun, Varun Jampani, Dongyoon Han, Byeongho Heo, Wonjae Kim, Ming-Hsuan Yang", "tldr": "", "abstract": "Transformers are transforming the landscape of computer vision, especially for recognition tasks. Detection transformers are the first fully end-to-end learning systems for object detection, while vision transformers are the first fully transformer-based architecture for image classification. In this paper, we integrate Vision and Detection Transformers (ViDT) to build an effective and efficient object detector. ViDT introduces a reconfigured attention module to extend the recent Swin Transformer to be a standalone object detector, followed by a computationally efficient transformer decoder that exploits multi-scale features and auxiliary techniques essential to boost the detection performance without much increase in computational load. Extensive evaluation results on the Microsoft COCO benchmark dataset demonstrate that ViDT obtains the best AP and latency trade-off among existing fully transformer-based object detectors, and achieves 49.2AP owing to its high scalability for large models. 
We release the code and trained models at https://github.com/naver-ai/vidt.", "keywords": "object detection;vision transformer;detection transformer", "primary_area": "", "supplementary_material": "", "author": "Hwanjun Song;Deqing Sun;Sanghyuk Chun;Varun Jampani;Dongyoon Han;Byeongho Heo;Wonjae Kim;Ming-Hsuan Yang", "authorids": "~Hwanjun_Song2;~Deqing_Sun2;~Sanghyuk_Chun1;~Varun_Jampani2;~Dongyoon_Han1;~Byeongho_Heo1;~Wonjae_Kim1;~Ming-Hsuan_Yang1", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://songhwanjun.github.io/;https://deqings.github.io/;https://sanghyukchun.github.io/home/;https://dongyoonhan.github.io/;https://sites.google.com/view/byeongho-heo/home;https://wonjae.kim;https://faculty.ucmerced.edu/mhyang/;https://varunjampani.github.io/", "dblp": "204/3381;69/4250;213/1095.html;151/8876;142/2705;158/3433;79/3711.html;124/2785", "google_scholar": "Ijzuc-8AAAAJ;t4rgICIAAAAJ;https://scholar.google.co.kr/citations?user=4_uj0xcAAAAJ;jcP7m1QAAAAJ;https://scholar.google.co.kr/citations?user=4_7rLDIAAAAJ;https://scholar.google.co.kr/citations?user=UpZ41EwAAAAJ;p9-ohHsAAAAJ;1Cv6Sf4AAAAJ", "orcid": "0000-0002-1105-0818;;0000-0002-4533-2610;0000-0002-9130-8195;;0000-0002-6616-7685;0000-0003-4848-2304;", "linkedin": ";;https://kr.linkedin.com/in/sanghyukchun/en;https://linkedin.com/in/dongyoon-han-04961a120/en;byeongho-heo-1a7756122/;;minghsuanyang/;", "or_profile": "~Hwanjun_Song2;~Deqing_Sun2;~Sanghyuk_Chun1;~Dongyoon_Han1;~Byeongho_Heo1;~Wonjae_Kim1;~Ming-Hsuan_Yang1;~Varun_Jampani1", "aff": "NAVER CLOVA;Google DeepMind;NAVER AI Lab;NAVER;NAVER AI Lab;NAVER;University of California at Merced;Google Research", "aff_domain": "navercorp.com;google.com;navercorp.com;navercorp.com;navercorp.com;navercorp.com;umcerced.edu;google.com", "position": "Research Scientist;Research Scientist;Lead research scientist;Research Scientist;Researcher;Research Scientist;Professor;Researcher", "bibtex": "@inproceedings{\nsong2022vidt,\ntitle={Vi{DT}: An Efficient and Effective Fully Transformer-based Object Detector},\nauthor={Hwanjun Song and Deqing Sun and Sanghyuk Chun and Varun Jampani and Dongyoon Han and Byeongho Heo and Wonjae Kim and Ming-Hsuan Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=w4cXZDDib1H}\n}", "github": "", "project": "", "reviewers": "cEv4;vxqH;JKxV", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;4;5", "correctness": "4;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;4", "wc_summary_paper": "103;33;59", "wc_summary_review": "99;34;48", "wc_main_review": "384;308;447", "wc_review": "586;375;554", "wc_reply_reviewers": "101;130;0", "wc_reply_authors": "2554;2612;199", "reply_reviewers": "1;3;0", "reply_authors": "7;7;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_summary_paper_avg": [ 65.0, 28.890598240027266 ], "wc_summary_review_avg": [ 60.333333333333336, 27.932458220182166 ], "wc_main_review_avg": [ 379.6666666666667, 56.82917873377686 ], "wc_review_avg": [ 505.0, 92.84754529155128 ], "wc_reply_reviewers_avg": [ 77.0, 55.719535772174794 ], "wc_reply_authors_avg": [ 1788.3333333333333, 1124.0777948562495 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 5.0, 2.8284271247461903 ], 
"replies_avg": [ 24, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.944911182523068, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1253153783722573136&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=w4cXZDDib1H", "email": "navercorp.com;google.com;navercorp.com;navercorp.com;navercorp.com;navercorp.com;umcerced.edu;google.com", "author_num": 8, "aff_unique_index": "0;1;0;0;0;0;2;1", "aff_unique_norm": "NAVER Corporation;Google;University of California, Merced", "aff_unique_dep": "CLOVA;Google DeepMind;", "aff_unique_url": "https://www.naver.com;https://deepmind.com;https://www.ucmerced.edu", "aff_unique_abbr": "NAVER;DeepMind;UC Merced", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Merced;Mountain View", "aff_country_unique_index": "0;1;0;0;0;0;2;2", "aff_country_unique": "South Korea;United Kingdom;United States" }, { "title": "Spanning Tree-based Graph Generation for Molecules", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6001", "id": "w60btE_8T2m", "poster": "", "openreview": "https://openreview.net/forum?id=w60btE_8T2m", "slides": "https://iclr.cc/virtual/2022/poster/6001", "video": "https://iclr.cc/virtual/2022/poster/6001", "author_site": "Sungsoo Ahn, Binghong Chen, Tianzhe Wang, Le Song", "tldr": "", "abstract": "In this paper, we explore the problem of generating molecules using deep neural networks, which has recently gained much interest in chemistry. To this end, we propose a spanning tree-based graph generation (STGG) framework based on formulating molecular graph generation as a construction of a spanning tree and the residual edges. Such a formulation exploits the sparsity of molecular graphs and allows using compact tree-constructive operations to define the molecular graph connectivity. Based on the intermediate graph structure of the construction process, our framework can constrain its generation to molecular graphs that satisfy the chemical valence rules. We also newly design a Transformer architecture with tree-based relative positional encodings for realizing the tree construction procedure. Experiments on QM9, ZINC250k, and MOSES benchmarks verify the effectiveness of the proposed framework in metrics such as validity, Frechet ChemNet distance, and fragment similarity. 
We also demonstrate the usefulness of STGG in maximizing penalized LogP value of molecules.", "keywords": "molecule generation;tree generation;graph generation;deep generative model;de novo drug design", "primary_area": "", "supplementary_material": "/attachment/8391d3e3f9a797ffc025777a45388ccf498bb2c9.zip", "author": "Sungsoo Ahn;Binghong Chen;Tianzhe Wang;Le Song", "authorids": "~Sungsoo_Ahn1;~Binghong_Chen1;~Tianzhe_Wang1;~Le_Song1", "gender": "M;M;M;M", "homepage": "https://sungsooahn.super.site/;http://binghongchen.net/;https://sites.google.com/view/tianzhe-wang/home;http://www.cc.gatech.edu/~lsong", "dblp": "90/5164;192/2022;243/6770;94/3481", "google_scholar": "XTenHs0AAAAJ;6Px5HxsAAAAJ;;Xl4E0CsAAAAJ", "orcid": ";;;", "linkedin": ";binghong-chen-91b697181/;;", "or_profile": "~Sungsoo_Ahn1;~Binghong_Chen1;~Tianzhe_Wang1;~Le_Song1", "aff": "Pohang University of Science and Technology;Georgia Institute of Technology;Georgia Institute of Technology;College of Computing, Georgia Institute of Technology", "aff_domain": "postech.ac.kr;gatech.edu;gatech.edu;cc.gatech.edu", "position": "Assistant Professor;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nahn2022spanning,\ntitle={Spanning Tree-based Graph Generation for Molecules},\nauthor={Sungsoo Ahn and Binghong Chen and Tianzhe Wang and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=w60btE_8T2m}\n}", "github": "", "project": "", "reviewers": "xNX3;nQAs;4r4T;PBPv", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;4;4;4", "correctness": "3;3;4;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;0;2;4", "wc_summary_paper": "82;88;94;77", "wc_summary_review": "21;35;177;5", "wc_main_review": "270;131;505;230", "wc_review": "373;254;776;312", "wc_reply_reviewers": "0;12;1057;0", "wc_reply_authors": "452;612;1501;555", "reply_reviewers": "0;1;4;0", "reply_authors": "1;1;5;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 1.479019945774904 ], "wc_summary_paper_avg": [ 85.25, 6.378675411086537 ], "wc_summary_review_avg": [ 59.5, 68.66403716648185 ], "wc_main_review_avg": [ 284.0, 137.26070085789306 ], "wc_review_avg": [ 428.75, 204.85284352432114 ], "wc_reply_reviewers_avg": [ 267.25, 455.98869229400856 ], "wc_reply_authors_avg": [ 780.0, 420.2005473580443 ], "reply_reviewers_avg": [ 1.25, 1.6393596310755 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7204714873234813748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=w60btE_8T2m", "email": "postech.ac.kr;gatech.edu;gatech.edu;cc.gatech.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Pohang University of Science and Technology;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.postech.ac.kr;https://www.gatech.edu", "aff_unique_abbr": "POSTECH;Georgia Tech", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Pohang;;Atlanta", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "South Korea;United States" }, { "id": "w7Nb5dSMM-", "title": "Evolutionary 
perspective on model fine-tuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Be it in natural language generation or in image generation, massive performance gains have been achieved in recent years. While a substantial part of these advances can be attributed to improvements in machine learning architectures, an important role has also been played by the ever-increasing parameter counts of machine learning models, which have made from-scratch retraining of the models prohibitively expensive for a large number of users.\nIn response, model fine-tuning - starting with an already good model and further training it on data relevant to a new, related problem - has gained in popularity. This fine-tuning is formally similar to the natural evolution of genetic codes in response to a shifting environment. \nHere, we formalize this similarity within the framework of the Fisher Geometric model and extreme value theory, and present a set of tricks used by naturally evolving organisms to accelerate their adaptation that are applicable to model fine-tuning.", "keywords": "Evolutionary algorithms;stochastic gradient descent;fine-tuning", "primary_area": "", "supplementary_material": "", "author": "Andrei Kucharavy;Ljiljana Dolamic;Rachid Guerraoui", "authorids": "~Andrei_Kucharavy1;ljiljana.dolamic@ar.admin.ch;~Rachid_Guerraoui1", "gender": "M;;M", "homepage": ";;https://lpdwww.epfl.ch/rachid/", "dblp": "267/1787;;g/RachidGuerraoui", "google_scholar": "uHXvGxgAAAAJ;;", "orcid": "0000-0003-0429-8644;;", "linkedin": "andrei-kucharavy/;;", "or_profile": "~Andrei_Kucharavy1;ljiljana.dolamic@ar.admin.ch;~Rachid_Guerraoui1", "aff": "Swiss Federal Institute of Technology Lausanne;;", "aff_domain": "epfl.ch;;", "position": "Postdoc;;", "bibtex": "@misc{\nkucharavy2022evolutionary,\ntitle={Evolutionary perspective on model fine-tuning},\nauthor={Andrei Kucharavy and Ljiljana Dolamic and Rachid Guerraoui},\nyear={2022},\nurl={https://openreview.net/forum?id=w7Nb5dSMM-}\n}", "github": "", "project": "", "reviewers": "5E1f;DYJE;UoCs;DWHG", "site": "https://openreview.net/forum?id=w7Nb5dSMM-", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "5;4;4;4", "correctness": "2;2;3;2", "technical_novelty": "3;2;2;3", "empirical_novelty": "1;0;2;3", "wc_summary_paper": "65;117;47;31", "wc_summary_review": "55;6;29;22", "wc_main_review": "775;465;278;192", "wc_review": "895;588;354;245", "wc_reply_reviewers": "316;33;23;51", "wc_reply_authors": "753;739;679;754", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 65.0, 32.341923257592455 ], "wc_summary_review_avg": [ 28.0, 17.67766952966369 ], "wc_main_review_avg": [ 427.5, 223.59170378169222 ], "wc_review_avg": [ 520.5, 249.2132620869122 ], "wc_reply_reviewers_avg": [ 105.75, 121.80183701406149 ], "wc_reply_authors_avg": [ 731.25, 30.74390183434757 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m0o29mdabG8J:scholar.google.com/&scioq=Evolutionary+perspective+on+model+fine-tuning&hl=en&as_sdt=0,5", "gs_version_total": 0, 
"aff_unique_index": "0", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0", "aff_country_unique": "Switzerland" }, { "id": "w8HXzn2FyKm", "title": "Finite-Time Error Bounds for Distributed Linear Stochastic Approximation", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper considers a novel multi-agent linear stochastic approximation algorithm driven by Markovian noise and general consensus-type interaction, in which each agent evolves according to its local stochastic approximation process which depends on the information from its neighbors. The interconnection structure among the agents is described by a time-varying directed graph. While the convergence of consensus-based stochastic approximation algorithms when the interconnection among the agents is described by doubly stochastic matrices (at least in expectation) has been studied, less is known about the case when the interconnection matrix is simply stochastic. For any uniformly strongly connected graph sequences whose associated interaction matrices are stochastic, the paper derives finite-time bounds on the mean-square error, defined as the deviation of the output of the algorithm from the unique equilibrium point of the associated ordinary differential equation. For the case of interconnection matrices being stochastic, the equilibrium point can be any unspecified convex combination of the local equilibria of all the agents in the absence of communication. Both the cases with constant and time-varying step-sizes are considered. In the case when the convex combination is required to be a straight average and interaction between any pair of neighboring agents may be uni-directional, so that doubly stochastic matrices cannot be implemented in a distributed manner, the paper proposes a push-type distributed stochastic approximation algorithm and provides its finite-time bounds for the performance by leveraging the analysis for the consensus-type algorithm with stochastic matrices.", "keywords": "Stochastic Approximation;Distributed Stochastic Approximation;Finite-time Analysis", "primary_area": "", "supplementary_material": "/attachment/0cd7b2335a2d3bc8a37d755e9c7b98f76d5b429f.zip", "author": "Yixuan Lin;Ji Liu;Vijay Gupta", "authorids": "~Yixuan_Lin1;~Ji_Liu4;~Vijay_Gupta1", "gender": "F;M;", "homepage": "https://www.linkedin.com/in/yixuan-lin-23b7b3192/;https://sites.google.com/site/jiliucontrol;", "dblp": ";51/4433-1;", "google_scholar": ";4T2B-e8AAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yixuan_Lin1;~Ji_Liu4;~Vijay_Gupta1", "aff": "State University of New York, Stony Brook;State University of New York, Stony Brook;", "aff_domain": "stonybrook.edu;stonybrook.edu;", "position": "PhD student;Assistant Professor;", "bibtex": "@misc{\nlin2022finitetime,\ntitle={Finite-Time Error Bounds for Distributed Linear Stochastic Approximation},\nauthor={Yixuan Lin and Ji Liu and Vijay Gupta},\nyear={2022},\nurl={https://openreview.net/forum?id=w8HXzn2FyKm}\n}", "github": "", "project": "", "reviewers": "HPzY;5MZ3;fcYJ;RnJ8;dShT", "site": "https://openreview.net/forum?id=w8HXzn2FyKm", "pdf_size": 0, "recommendation": "5;5;5;5;5", "confidence": "5;4;3;4;3", "correctness": "3;3;3;3;4", "technical_novelty": "4;2;3;3;3", "empirical_novelty": "4;2;1;0;3", "wc_summary_paper": "107;80;22;51;49", 
"wc_summary_review": "41;72;27;61;41", "wc_main_review": "131;640;403;647;308", "wc_review": "279;792;452;759;398", "wc_reply_reviewers": "0;0;186;105;0", "wc_reply_authors": "34;942;1026;1283;377", "reply_reviewers": "0;0;2;2;0", "reply_authors": "1;2;3;3;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 61.8, 29.11631844859511 ], "wc_summary_review_avg": [ 48.4, 16.01998751560063 ], "wc_main_review_avg": [ 425.8, 198.047873000444 ], "wc_review_avg": [ 536.0, 203.67326775991003 ], "wc_reply_reviewers_avg": [ 58.2, 75.7427224226856 ], "wc_reply_authors_avg": [ 732.4, 457.6822478532459 ], "reply_reviewers_avg": [ 0.8, 0.9797958971132713 ], "reply_authors_avg": [ 2.0, 0.8944271909999159 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4955512030612030773&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0", "aff_unique_norm": "State University of New York", "aff_unique_dep": "", "aff_unique_url": "https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "wClmeg9u7G", "title": "Distributed Methods with Compressed Communication for Solving Variational Inequalities, with Theoretical Guarantees", "track": "main", "status": "Reject", "tldr": "", "abstract": "Variational inequalities in general and saddle point problems in particular are increasingly relevant in machine learning applications, including adversarial learning, GANs, transport and robust optimization. With increasing data and problem sizes necessary to train high performing models across these and other applications, it is necessary to rely on parallel and distributed computing. However, in distributed training, communication among the compute nodes is a key bottleneck during training, and this problem is exacerbated for high dimensional and over-parameterized models models. Due to these considerations, it is important to equip existing methods with strategies that would allow to reduce the volume of transmitted information during training while obtaining a model of comparable quality. In this paper, we present the first theoretically grounded distributed methods for solving variational inequalities and saddle point problems using compressed communication: MASHA1 and MASHA2. Our theory and methods allow for the use of both unbiased (such as Rand$k$; MASHA1) and contractive (such as Top$k$; MASHA2) compressors. 
We empirically validate our conclusions using two experimental setups: a standard bilinear min-max problem, and large-scale distributed adversarial training of transformers.", "keywords": "convex optimization;saddle point problem;minmax problem;distributed optimization;quantization;compression", "primary_area": "", "supplementary_material": "/attachment/1092b6897055e7529bc8909457401f68b65e316f.zip", "author": "Aleksandr Beznosikov;Peter Richt\u00e1rik;Michael Diskin;Max Ryabinin;Alexander Gasnikov", "authorids": "~Aleksandr_Beznosikov1;~Peter_Richt\u00e1rik1;~Michael_Diskin1;~Max_Ryabinin1;~Alexander_Gasnikov1", "gender": ";;M;Not Specified;", "homepage": ";;;https://mryab.github.io/;", "dblp": ";;295/8914.html;276/0192;", "google_scholar": ";;LRKQhcYAAAAJ;930PERsAAAAJ;", "orcid": ";;0000-0001-8902-513X;;", "linkedin": ";;https://www.linkedin.com/m/in/yhn112/;;", "or_profile": "~Aleksandr_Beznosikov1;~Peter_Richt\u00e1rik1;~Michael_Diskin1;~Max_Ryabinin1;~Alexander_Gasnikov1", "aff": ";;Yandex;Yandex;", "aff_domain": ";;yandex-team.ru;yandex-team.ru;", "position": ";;Researcher;Research Scientist;", "bibtex": "@misc{\nbeznosikov2022distributed,\ntitle={Distributed Methods with Compressed Communication for Solving Variational Inequalities, with Theoretical Guarantees},\nauthor={Aleksandr Beznosikov and Peter Richt{\\'a}rik and Michael Diskin and Max Ryabinin and Alexander Gasnikov},\nyear={2022},\nurl={https://openreview.net/forum?id=wClmeg9u7G}\n}", "github": "", "project": "", "reviewers": "oTML;V1y7;n8kw;RfLr", "site": "https://openreview.net/forum?id=wClmeg9u7G", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;2;3", "correctness": "2;2;3;2", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "32;58;45;76", "wc_summary_review": "34;108;13;56", "wc_main_review": "413;232;336;309", "wc_review": "479;398;394;441", "wc_reply_reviewers": "107;0;0;0", "wc_reply_authors": "2350;1199;912;942", "reply_reviewers": "2;0;0;0", "reply_authors": "7;3;7;4", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 52.75, 16.269219403523945 ], "wc_summary_review_avg": [ 52.75, 35.336772631353874 ], "wc_main_review_avg": [ 322.5, 64.70123646422842 ], "wc_review_avg": [ 428.0, 34.734708865916815 ], "wc_reply_reviewers_avg": [ 26.75, 46.332359102467464 ], "wc_reply_authors_avg": [ 1350.75, 587.6024910600703 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 5.25, 1.7853571071357126 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8944271909999159, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13783045168805230130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0;0", "aff_unique_norm": "Yandex", "aff_unique_dep": "", "aff_unique_url": "https://yandex.com", "aff_unique_abbr": "Yandex", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Russian Federation" }, { "title": "D-CODE: Discovering Closed-form ODEs from Observed Trajectories", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7157", "id": "wENMvIsxNN", "poster": "", "openreview": "https://openreview.net/forum?id=wENMvIsxNN", 
"slides": "https://iclr.cc/virtual/2022/poster/7157", "video": "https://iclr.cc/virtual/2022/poster/7157", "author_site": "Zhaozhi Qian, Krzysztof Kacprzyk, Mihaela van der Schaar", "tldr": "", "abstract": "For centuries, scientists have manually designed closed-form ordinary differential equations (ODEs) to model dynamical systems. An automated tool to distill closed-form ODEs from observed trajectories would accelerate the modeling process. Traditionally, symbolic regression is used to uncover a closed-form prediction function $a=f(b)$ with label-feature pairs $(a_i, b_i)$ as training examples. However, an ODE models the time derivative $\\dot{x}(t)$ of a dynamical system, e.g. $\\dot{x}(t) = f(x(t),t)$, and the \"label\" $\\dot{x}(t)$ is usually *not* observed. The existing ways to bridge this gap only perform well for a narrow range of settings with low measurement noise, frequent sampling, and non-chaotic dynamics. In this work, we propose the Discovery of Closed-form ODE framework (D-CODE), which advances symbolic regression beyond the paradigm of supervised learning. D-CODE leverages a novel objective function based on the variational formulation of ODEs to bypass the unobserved time derivative. For formal justification, we prove that this objective is a valid proxy for the estimation error of the true (but unknown) ODE. In the experiments, D-CODE successfully discovered the governing equations of a diverse range of dynamical systems under challenging measurement settings with high noise and infrequent sampling.", "keywords": "Symbolic Regression;Ordinary Differential Equation", "primary_area": "", "supplementary_material": "", "author": "Zhaozhi Qian;Krzysztof Kacprzyk;Mihaela van der Schaar", "authorids": "~Zhaozhi_Qian1;~Krzysztof_Kacprzyk1;~Mihaela_van_der_Schaar2", "gender": ";;F", "homepage": ";;https://www.vanderschaar-lab.com", "dblp": "194/2443;;", "google_scholar": "PuTDB5gAAAAJ;;DZ3S--MAAAAJ", "orcid": "0000-0002-4561-0342;;", "linkedin": ";;", "or_profile": "~Zhaozhi_Qian1;~Krzysztof_Kacprzyk1;~Mihaela_van_der_Schaar2", "aff": "University of Cambridge;;University of California, Los Angeles", "aff_domain": "cam.ac.uk;;ucla.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nqian2022dcode,\ntitle={D-{CODE}: Discovering Closed-form {ODE}s from Observed Trajectories},\nauthor={Zhaozhi Qian and Krzysztof Kacprzyk and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wENMvIsxNN}\n}", "github": "", "project": "", "reviewers": "79Ft;ZddY;vaG3;EBvJ", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;3;3", "correctness": "3;4;4;4", "technical_novelty": "3;2;4;3", "empirical_novelty": "4;2;3;3", "wc_summary_paper": "100;101;95;110", "wc_summary_review": "40;39;81;34", "wc_main_review": "215;296;376;385", "wc_review": "355;436;552;529", "wc_reply_reviewers": "159;291;134;0", "wc_reply_authors": "554;1005;427;362", "reply_reviewers": "2;2;1;0", "reply_authors": "3;3;2;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 101.5, 5.408326913195984 ], "wc_summary_review_avg": [ 48.5, 18.9010581714358 ], "wc_main_review_avg": [ 318.0, 68.82223477917583 ], "wc_review_avg": [ 468.0, 78.3741028656788 ], "wc_reply_reviewers_avg": [ 146.0, 103.26422420180185 ], 
"wc_reply_authors_avg": [ 587.0, 251.01693169983574 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9525332226345236042&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=wENMvIsxNN", "email": "cam.ac.uk;;ucla.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;UCLA", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Cambridge;Los Angeles", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "wGLGG4918oc", "title": "CubeTR: Learning to Solve the Rubik's Cube using Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Since its first appearance, transformers have been successfully used in wide ranging domains from computer vision to natural language processing. Application of transformers in Reinforcement Learning by reformulating it as a sequence modelling problem was proposed only recently. Compared to other commonly explored reinforcement learning problems, the Rubik's cube poses a unique set of challenges. The Rubik\u2019s cube has a single solved state for quintillions of possible configurations which leads to extremely sparse rewards. The proposed model CubeTR attends to longer sequences of actions and addresses the problem of sparse rewards. CubeTR learns how to solve the Rubik's cube from arbitrary starting states without any human prior, and after move regularisation, the lengths of solutions generated by it are expected to be very close to those given by algorithms used by expert human solvers. 
CubeTR provides insights into the generalisability of learning algorithms to higher dimensional cubes and the applicability of transformers in other relevant sparse reward scenarios.", "keywords": "Machine Learning;Transformers;Reinforcement Learning;Rubik's Cube", "primary_area": "", "supplementary_material": "", "author": "Mustafa Ebrahim Chasmai", "authorids": "~Mustafa_Ebrahim_Chasmai1", "gender": "M", "homepage": "https://www.google.com/", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Mustafa_Ebrahim_Chasmai1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nchasmai2022cubetr,\ntitle={Cube{TR}: Learning to Solve the Rubik's Cube using Transformers},\nauthor={Mustafa Ebrahim Chasmai},\nyear={2022},\nurl={https://openreview.net/forum?id=wGLGG4918oc}\n}", "github": "", "project": "", "reviewers": "P1h8;zAmG;LpAm;4fCN", "site": "https://openreview.net/forum?id=wGLGG4918oc", "pdf_size": 0, "recommendation": "1;1;1;1", "confidence": "4;5;5;3", "correctness": "1;1;3;1", "technical_novelty": "2;1;1;3", "empirical_novelty": "1;1;1;2", "wc_summary_paper": "86;49;83;104", "wc_summary_review": "168;17;30;24", "wc_main_review": "1055;84;323;574", "wc_review": "1309;150;436;702", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 1.0, 0.0 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 1.5, 0.8660254037844386 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.5, 19.880895352071043 ], "wc_summary_review_avg": [ 59.75, 62.66727614951842 ], "wc_main_review_avg": [ 509.0, 359.7089100925914 ], "wc_review_avg": [ 649.25, 428.01248521509274 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7423824479523210631&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "wIK1fWFXvU9", "title": "Understanding the Interaction of Adversarial Training with Noisy Labels", "track": "main", "status": "Reject", "tldr": "", "abstract": "Noisy labels (NL) and adversarial examples both undermine trained models, but interestingly they have hitherto been studied independently. A recent adversarial training (AT) study showed that the number of projected gradient descent (PGD) steps to successfully attack a point (i.e., find an adversarial example in its proximity) is an effective measure of the robustness of this point. Given that natural data are clean, this measure reveals an intrinsic geometric property---how far a point is from its nearest class boundary. Based on this breakthrough, in this paper, we investigate how AT interacts with NL. Firstly, we find that if a point is too close to its noisy-class boundary (e.g., one step is enough to attack it), this point is likely to be mislabeled, which suggests adopting the number of PGD steps as a new criterion for sample selection to correct NL. Secondly, we confirm that AT with strong smoothing effects suffers less from NL (without NL corrections) than standard training, which suggests that AT itself is an NL correction. 
Hence, AT with NL is helpful for improving even the natural accuracy, which again illustrates the superiority of AT as a general-purpose robust learning criterion.", "keywords": "noisy labels;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Jianing Zhu;Jingfeng Zhang;Bo Han;Tongliang Liu;Gang Niu;Hongxia Yang;Mohan Kankanhalli;Masashi Sugiyama", "authorids": "~Jianing_Zhu2;~Jingfeng_Zhang1;~Bo_Han1;~Tongliang_Liu1;~Gang_Niu1;~Hongxia_Yang2;~Mohan_Kankanhalli1;~Masashi_Sugiyama1", "gender": "M;M;M;M;F;M;M;M", "homepage": "https://zfancy.github.io/;https://zjfheart.github.io;https://tongliang-liu.github.io/;https://niug1984.github.io;https://www4.comp.polyu.edu.hk/~hongxyang/;https://www.comp.nus.edu.sg/~mohan;http://www.ms.k.u-tokyo.ac.jp/sugi/;https://bhanml.github.io/", "dblp": "129/6807;227/2664.html;150/6667;26/3367-1;;09/3613.html;35/1228;241/0472-3", "google_scholar": "82uNA3MAAAAJ;NS0P1FkAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;iJlC5mMAAAAJ;6Lx_eowAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;nTNjqHwAAAAJ", "orcid": ";0000-0003-3491-8074;;;;0000-0002-4846-2015;0000-0001-6658-6743;", "linkedin": ";;;;;mohan-kankanhalli-583417221;;", "or_profile": "~Jianing_Zhu2;~Jingfeng_Zhang1;~Tongliang_Liu1;~Gang_Niu1;~Hongxia_Yang2;~Mohan_Kankanhalli1;~Masashi_Sugiyama1;~bo_han2", "aff": "Hong Kong Baptist University;RIKEN;University of Sydney;RIKEN;Alibaba Group;National University of Singapore;The University of Tokyo;Microsoft Research", "aff_domain": "hkbu.edu.hk;riken.jp;sydney.edu.au;riken.jp;alibaba-inc.com;nus.edu.sg;u-tokyo.ac.jp;microsoft.com", "position": "PhD student;Postdoc;Lecturer;Research Scientist (tenured);Principal Researcher;Full Professor;Full Professor;Researcher", "bibtex": "@misc{\nzhu2022understanding,\ntitle={Understanding the Interaction of Adversarial Training with Noisy Labels},\nauthor={Jianing Zhu and Jingfeng Zhang and Bo Han and Tongliang Liu and Gang Niu and Hongxia Yang and Mohan Kankanhalli and Masashi Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=wIK1fWFXvU9}\n}", "github": "", "project": "", "reviewers": "FKyS;sbre;cfM4;BbnD", "site": "https://openreview.net/forum?id=wIK1fWFXvU9", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;4;3;3", "correctness": "3;2;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "63;81;389;74", "wc_summary_review": "34;64;100;27", "wc_main_review": "790;334;482;128", "wc_review": "887;479;971;229", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 151.75, 137.12653827760693 ], "wc_summary_review_avg": [ 56.25, 28.830322578840494 ], "wc_main_review_avg": [ 433.5, 241.18198523106986 ], "wc_review_avg": [ 641.5, 302.2428659207691 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 32, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12000663150291213796&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;1;3;4;5;6", "aff_unique_norm": "Hong Kong Baptist University;RIKEN;University of Sydney;Alibaba Group;National University of Singapore;University of Tokyo;Microsoft", "aff_unique_dep": ";;;;;;Microsoft Research", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.riken.jp;https://www.sydney.edu.au;https://www.alibaba.com;https://www.nus.edu.sg;https://www.u-tokyo.ac.jp;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "HKBU;RIKEN;USYD;Alibaba;NUS;UTokyo;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;2;1;0;3;1;4", "aff_country_unique": "China;Japan;Australia;Singapore;United States" }, { "title": "Expressiveness and Approximation Properties of Graph Neural Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6804", "id": "wIzUeM3TAU", "poster": "", "openreview": "https://openreview.net/forum?id=wIzUeM3TAU", "slides": "https://iclr.cc/virtual/2022/poster/6804", "video": "https://iclr.cc/virtual/2022/poster/6804", "author_site": "Floris Geerts, Juan L. Reutter", "tldr": "", "abstract": "Characterizing the separation power of graph neural networks (GNNs) provides an understanding of their limitations for graph learning tasks. Results regarding separation power are, however, usually geared at specific GNNs architectures, and tools for understanding arbitrary GNN architectures are generally lacking. We provide an elegant way to easily obtain bounds on the separation power of GNNs in terms of the Weisfeiler-Leman (WL) tests, which have become the yardstick to measure the separation power of GNNs. The crux is to view GNNs as expressions in a procedural tensor language describing the computations in the layers of the GNNs. Then, by a simple analysis of the obtained expressions, in terms of the number of indexes used and the nesting depth of summations, bounds on the separation power in terms of the WL-tests readily follow. We use tensor language to define Higher-Order Message-Passing Neural Networks (or k-MPNNs), a natural extension of MPNNs. Furthermore, the tensor language point of view allows for the derivation of universality results for classes of GNNs in a natural way. Our approach provides a toolbox with which GNN architecture designers can analyze the separation power of their GNNs, without needing to know the intricacies of the WL-tests. 
We also provide insights into what is needed to boost the separation power of GNNs.", "keywords": "Graph Neural Networks;Colour Refinement;Weisfeiler-Leman;Separation Power;Universality", "primary_area": "", "supplementary_material": "", "author": "Floris Geerts;Juan L Reutter", "authorids": "~Floris_Geerts1;~Juan_L_Reutter1", "gender": "M;M", "homepage": "https://www.uantwerpen.be/en/staff/floris-geerts/;http://jreutter.sitios.ing.uc.cl/", "dblp": "g/FlorisGeerts.html;36/27", "google_scholar": "SGay8u4AAAAJ;", "orcid": "0000-0002-8967-2473;", "linkedin": "florisgeerts/;", "or_profile": "~Floris_Geerts1;~Juan_L_Reutter1", "aff": "University of Antwerp;Pontificia Universidad Cat\u00f3lica", "aff_domain": "uantwerp.be;uc.cl", "position": "Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\ngeerts2022expressiveness,\ntitle={Expressiveness and Approximation Properties of Graph Neural Networks},\nauthor={Floris Geerts and Juan L Reutter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wIzUeM3TAU}\n}", "github": "", "project": "", "reviewers": "KRNm;QRwU;tPHz;2fbv", "pdf_size": 0, "recommendation": "8;8;8;10", "confidence": "4;4;2;4", "correctness": "4;4;4;4", "technical_novelty": "3;3;3;4", "empirical_novelty": "0;0;0;0", "wc_summary_paper": "105;252;98;137", "wc_summary_review": "53;58;41;17", "wc_main_review": "818;370;594;265", "wc_review": "976;680;733;419", "wc_reply_reviewers": "46;178;0;6", "wc_reply_authors": "2381;1303;398;399", "reply_reviewers": "1;2;0;1", "reply_authors": "9;7;1;2", "recommendation_avg": [ 8.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 148.0, 61.818282085480185 ], "wc_summary_review_avg": [ 42.25, 15.833114033569013 ], "wc_main_review_avg": [ 511.75, 213.03330138736527 ], "wc_review_avg": [ 702.0, 197.86990675693968 ], "wc_reply_reviewers_avg": [ 57.5, 71.78265807282425 ], "wc_reply_authors_avg": [ 1120.25, 816.2007642118451 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 4.75, 3.344772040064913 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1241457886049608228&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=wIzUeM3TAU", "email": "uantwerp.be;uc.cl", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Antwerp;Pontificia Universidad Cat\u00f3lica", "aff_unique_dep": ";", "aff_unique_url": "https://www.uantwerp.be;https://www.puc.cl", "aff_unique_abbr": "UA;PUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Belgium;Chile" }, { "id": "wMXYbJB-gX", "title": "Towards Understanding Label Smoothing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Label smoothing regularization (LSR) is a prevalent component for training deep neural networks and can improve the generalization of models effectively. Although it achieves empirical success, the theoretical understanding of the power of label smoothing, especially of its influence on optimization, is still limited. 
In this work, we, for the first time, theoretically analyze the convergence behaviors of stochastic gradient descent with label smoothing in deep learning. Our analysis indicates that an appropriate LSR can speed up the convergence by reducing the variance of the gradient, which provides a theoretical interpretation of the effectiveness of LSR. In addition, the analysis implies that LSR may slow down the convergence at the end of optimization. Therefore, a novel algorithm, namely Two-Stage LAbel smoothing (TSLA), is proposed to further improve the convergence. Through extensive analysis and experiments on benchmark data sets, the effectiveness of TSLA is verified both theoretically and empirically.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Xu;Yuanhong Xu;Qi Qian;Hao Li;Rong Jin", "authorids": "~Yi_Xu8;~Yuanhong_Xu1;~Qi_Qian1;~Hao_Li16;~Rong_Jin1", "gender": "M;;M;M;M", "homepage": ";http://qi-qian.com;;https://www.cse.msu.edu/~rongjin/;https://yxu71.github.io", "dblp": "223/4687;05/2084-1;17/5705-30;j/RongJin;14/5580", "google_scholar": ";Rp_40_gAAAAJ;pHN-QIwAAAAJ;;D4jEMqEAAAAJ", "orcid": ";;;;0009-0000-9900-6143", "linkedin": "%E6%B8%8A%E9%B8%BF-%E5%BE%90-37a542113/;;%E6%98%8A-%E6%9D%8E-392547a5/detail/recent-activity/;;", "or_profile": "~Yuanhong_Xu1;~Qi_Qian1;~Li_Hao1;~Rong_Jin3;~YI_XU3", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Dalian University of Technology", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;dlut.edu.cn", "position": "Researcher;Researcher;Researcher;Researcher;Associate Professor", "bibtex": "@misc{\nxu2022towards,\ntitle={Towards Understanding Label Smoothing},\nauthor={Yi Xu and Yuanhong Xu and Qi Qian and Hao Li and Rong Jin},\nyear={2022},\nurl={https://openreview.net/forum?id=wMXYbJB-gX}\n}", "github": "", "project": "", "reviewers": "AcSL;3bTK;CCDQ;u1nj", "site": "https://openreview.net/forum?id=wMXYbJB-gX", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;3;3;4", "correctness": "3;4;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "57;54;36;112", "wc_summary_review": "47;32;39;19", "wc_main_review": "288;192;271;157", "wc_review": "392;278;346;288", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "427;709;795;320", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 64.75, 28.43743131859838 ], "wc_summary_review_avg": [ 34.25, 10.280442597476044 ], "wc_main_review_avg": [ 227.0, 54.27246078813821 ], "wc_review_avg": [ 326.0, 46.10856753359401 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 562.75, 195.3744801656552 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5768893699166304300&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Alibaba Group;Dalian University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;http://www.dlut.edu.cn/", "aff_unique_abbr": "Alibaba;DUT", "aff_campus_unique_index": 
"", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Theoretical Analysis on Feature Learning in Neural Networks: Emergence from Inputs and Advantage over Fixed Features", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6008", "id": "wMpS-Z_AI_E", "poster": "", "openreview": "https://openreview.net/forum?id=wMpS-Z_AI_E", "slides": "https://iclr.cc/virtual/2022/poster/6008", "video": "https://iclr.cc/virtual/2022/poster/6008", "author_site": "Zhenmei Shi, Junyi Wei, Yingyu Liang", "tldr": "", "abstract": "An important characteristic of neural networks is their ability to learn representations of the input data with effective features for prediction, which is believed to be a key factor to their superior empirical performance. To better understand the source and benefit of feature learning in neural networks, we consider learning problems motivated by practical data, where the labels are determined by a set of class relevant patterns and the inputs are generated from these along with some background patterns. We prove that neural networks trained by gradient descent can succeed on these problems. The success relies on the emergence and improvement of effective features, which are learned among exponentially many candidates efficiently by exploiting the data (in particular, the structure of the input distribution). In contrast, no linear models on data-independent features of polynomial sizes can learn to as good errors. Furthermore, if the specific input structure is removed, then no polynomial algorithm in the Statistical Query model can learn even weakly. These results provide theoretical evidence showing that feature learning in neural networks depends strongly on the input structure and leads to the superior performance. Our preliminary experimental results on synthetic and real data also provide positive support. 
", "keywords": "neural networks;feature learning;provable advantage;theoretical analysis", "primary_area": "", "supplementary_material": "/attachment/7b6dda91b863c9135c86ffb20ad33460c9a47fca.zip", "author": "Zhenmei Shi;Junyi Wei;Yingyu Liang", "authorids": "~Zhenmei_Shi1;~Junyi_Wei1;~Yingyu_Liang1", "gender": "M;F;", "homepage": "http://zhmeishi.github.io/;;", "dblp": "246/5216;166/6146;", "google_scholar": "0oeNnzMAAAAJ;Kb1GL40AAAAJ;", "orcid": ";;", "linkedin": "zhenmei-shi-56408a113/;Junyi-Jenny-Wei-04ba979b/;", "or_profile": "~Zhenmei_Shi1;~Junyi_Wei1;~Yingyu_Liang1", "aff": "University of Wisconsin - Madison;University of Wisconsin, Madison;", "aff_domain": "wisc.edu;wisc.edu;", "position": "PhD student;PhD student;", "bibtex": "@inproceedings{\nshi2022a,\ntitle={A Theoretical Analysis on Feature Learning in Neural Networks: Emergence from Inputs and Advantage over Fixed Features},\nauthor={Zhenmei Shi and Junyi Wei and Yingyu Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wMpS-Z_AI_E}\n}", "github": "", "project": "", "reviewers": "bp8c;zbTh;uM4G;Jr9n;Mhvy", "pdf_size": 0, "recommendation": "5;6;6;6;8", "confidence": "4;3;3;2;3", "correctness": "4;3;4;3;3", "technical_novelty": "2;3;3;3;3", "empirical_novelty": "3;2;2;2;2", "wc_summary_paper": "72;107;74;114;155", "wc_summary_review": "53;94;47;50;85", "wc_main_review": "121;140;336;406;367", "wc_review": "246;341;457;570;607", "wc_reply_reviewers": "0;0;0;396;0", "wc_reply_authors": "930;1060;1200;2595;780", "reply_reviewers": "0;0;0;3;0", "reply_authors": "2;2;2;5;1", "recommendation_avg": [ 6.2, 0.9797958971132712 ], "confidence_avg": [ 3.0, 0.6324555320336759 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 104.4, 30.440762145517972 ], "wc_summary_review_avg": [ 65.8, 19.6509541753066 ], "wc_main_review_avg": [ 274.0, 119.40016750407011 ], "wc_review_avg": [ 444.2, 135.96087672562282 ], "wc_reply_reviewers_avg": [ 79.2, 158.4 ], "wc_reply_authors_avg": [ 1313.0, 655.9085302082906 ], "reply_reviewers_avg": [ 0.6, 1.2 ], "reply_authors_avg": [ 2.4, 1.3564659966250536 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3227486121839514, "corr_recommendation_correctness": -0.5833333333333334, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16774781733542804499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=wMpS-Z_AI_E", "email": "wisc.edu;wisc.edu;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "wNsNT56zDkG", "title": "Adversarial Rademacher Complexity of Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks are vulnerable to adversarial attacks. Adversarial training is one of the most effective algorithms to increase the model's robustness. However, the trained models cannot generalize well to the adversarial examples on the test set. 
In this paper, we study the generalization of adversarial training through the lens of adversarial Rademacher complexity. Current analyses of adversarial Rademacher complexity only extend up to two-layer neural networks. In adversarial settings, one major difficulty of generalizing these results to deep neural networks is that we cannot peel off the layers as in the classical analysis for standard training. We provide a method to overcome this issue and provide upper bounds on the adversarial Rademacher complexity of deep neural networks. Similar to the existing bounds of standard Rademacher complexity of neural nets, our bound also includes the product of weight norms. We provide experiments to show that the adversarially trained weight norms are larger than the standard trained weight norms, thus providing an explanation for the poor generalization performance of adversarial training.", "keywords": "Adversarial Robustness;Generalization;Rademacher complexity", "primary_area": "", "supplementary_material": "", "author": "Jiancong Xiao;Yanbo Fan;Ruoyu Sun;Zhi-Quan Luo", "authorids": "~Jiancong_Xiao1;~Yanbo_Fan1;~Ruoyu_Sun1;~Zhi-Quan_Luo1", "gender": "M;M;;M", "homepage": "https://jiancongxiao.github.io;https://sites.google.com/site/yanbofan0124/;https://ruoyus.github.io/;", "dblp": "330/4306;181/4574;30/9879-1;", "google_scholar": "_vGY3joAAAAJ;OlOqHyUAAAAJ;PsfzbCMAAAAJ;dW3gcXoAAAAJ", "orcid": ";0000-0002-8530-485X;;", "linkedin": ";;;", "or_profile": "~Jiancong_Xiao1;~Yanbo_Fan1;~Ruoyu_Sun1;~Zhi-Quan_Luo1", "aff": "The Chinese University of Hong Kong, Shenzhen;Tencent AI Lab;University of Illinois, Urbana-Champaign;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;tencent.com;uiuc.edu;cuhk.edu.cn", "position": "PhD student;Associate Professor;Assistant Professor;Full Professor", "bibtex": "@misc{\nxiao2022adversarial,\ntitle={Adversarial Rademacher Complexity of Deep Neural Networks},\nauthor={Jiancong Xiao and Yanbo Fan and Ruoyu Sun and Zhi-Quan Luo},\nyear={2022},\nurl={https://openreview.net/forum?id=wNsNT56zDkG}\n}", "github": "", "project": "", "reviewers": "DjdD;Vmod;dFnk;Ma5T", "site": "https://openreview.net/forum?id=wNsNT56zDkG", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;3;4", "correctness": "3;3;4;4", "technical_novelty": "3;3;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "58;73;67;99", "wc_summary_review": "20;32;10;19", "wc_main_review": "210;804;232;112", "wc_review": "288;909;309;230", "wc_reply_reviewers": "0;40;112;0", "wc_reply_authors": "644;1464;1718;491", "reply_reviewers": "0;1;1;0", "reply_authors": "1;3;3;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 74.25, 15.2540978100968 ], "wc_summary_review_avg": [ 20.25, 7.8222439235810075 ], "wc_main_review_avg": [ 339.5, 271.9572576711274 ], "wc_review_avg": [ 434.0, 275.7634856176575 ], "wc_reply_reviewers_avg": [ 38.0, 45.73838650411709 ], "wc_reply_authors_avg": [ 1079.25, 522.3779163594112 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=347335824069506534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;0", 
"aff_unique_norm": "Chinese University of Hong Kong;Tencent;University of Illinois", "aff_unique_dep": ";Tencent AI Lab;", "aff_unique_url": "https://www.cuhk.edu.cn;https://ai.tencent.com;https://illinois.edu", "aff_unique_abbr": "CUHK;Tencent AI Lab;UIUC", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Shenzhen;;Urbana-Champaign", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "wQ7RCayXUSl", "title": "Why so pessimistic? Estimating uncertainties for offline RL through ensembles, and why their independence matters.", "track": "main", "status": "Reject", "tldr": "", "abstract": "In order to achieve strong performance in offline reinforcement learning (RL), it is necessary to act conservatively with respect to confident lower-bounds on anticipated values of actions. Thus, a valuable approach would be to obtain high quality uncertainty estimates on action values. In current supervised learning literature, state-of-the-art approaches to uncertainty estimation and calibration rely on ensembling methods. In this work, we aim to transfer the success of ensembles from supervised learning to the setting of batch RL. We propose, MSG, a model-free dynamic programming based offline RL method that trains an ensemble of independent Q-functions, and updates a policy to act conservatively with respect to the uncertainties derived from the ensemble. Theoretically, by referring to the literature on infinite-width neural networks, we demonstrate the crucial dependence of the quality of uncertainty on the manner in which ensembling is performed, a phenomenon that arises due to the dynamic programming nature of RL and overlooked by existing offline RL methods. Our theoretical predictions are corroborated by pedagogical examples on toy MDPs, as well as empirical comparisons in benchmark continuous control domains. In the more challenging domains of the D4RL offline RL benchmark, MSG significantly surpasses highly well-tuned state-of-the-art methods in batch RL. Motivated by the success of MSG, we investigate whether efficient approximations to ensembles can be as effective. We demonstrate that while efficient variants outperform current state-of-the-art, they do not match MSG with deep ensembles. We hope our work engenders increased focus into deep network uncertainty estimation techniques directed for reinforcement learning.", "keywords": "offline reinforcement learning;batch reinforcement learning;ensembles;uncertainty estimation.", "primary_area": "", "supplementary_material": "/attachment/9668b04c28ba9e89bddd3fb440778a7b08b73e93.zip", "author": "Seyed Kamyar Seyed Ghasemipour;Shixiang Shane Gu;Ofir Nachum", "authorids": "~Seyed_Kamyar_Seyed_Ghasemipour1;~Shixiang_Shane_Gu1;~Ofir_Nachum1", "gender": "M;M;M", "homepage": "http://www.cs.utoronto.ca/~kamyar/;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;https://sites.google.com/view/gugurus/home", "dblp": "238/2555;;121/0550", "google_scholar": "LHvso9QAAAAJ;C-ZlBWMAAAAJ;B8wslVsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Seyed_Kamyar_Seyed_Ghasemipour1;~Ofir_Nachum1;~Shixiang_Gu1", "aff": "Google DeepMind Robotics;OpenAI;Google", "aff_domain": "google.com;openai.com;google.com", "position": "Student Researcher;Researcher;Senior Research Scientist", "bibtex": "@misc{\nghasemipour2022why,\ntitle={Why so pessimistic? 
Estimating uncertainties for offline {RL} through ensembles, and why their independence matters.},\nauthor={Seyed Kamyar Seyed Ghasemipour and Shixiang Shane Gu and Ofir Nachum},\nyear={2022},\nurl={https://openreview.net/forum?id=wQ7RCayXUSl}\n}", "github": "", "project": "", "reviewers": "FzHF;XyHA;s2kZ;QFka;dnQ3", "site": "https://openreview.net/forum?id=wQ7RCayXUSl", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "2;4;4;2;5", "correctness": "2;2;3;3;2", "technical_novelty": "2;1;2;2;2", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "16;80;75;25;61", "wc_summary_review": "27;74;60;38;47", "wc_main_review": "182;619;530;69;381", "wc_review": "225;773;665;132;489", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "230;1831;696;55;427", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;3;1;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.4, 1.2 ], "correctness_avg": [ 2.4, 0.4898979485566356 ], "technical_novelty_avg": [ 1.8, 0.4000000000000001 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 51.4, 26.14268540146555 ], "wc_summary_review_avg": [ 49.2, 16.460862674841803 ], "wc_main_review_avg": [ 356.2, 206.25944826843693 ], "wc_review_avg": [ 456.8, 246.4113633743379 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 647.8, 628.809478300065 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.4, 0.8000000000000002 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.2721655269759087, "corr_recommendation_correctness": 0.6666666666666667, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6972415736332431556&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;OpenAI", "aff_unique_dep": "DeepMind Robotics;", "aff_unique_url": "https://deepmind.com;https://openai.com", "aff_unique_abbr": "DeepMind;OpenAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "wQDdEFPy6vi", "title": "Local Learning Matters: Rethinking Data Heterogeneity in Federated Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Federated learning (FL) is a promising strategy for performing privacy-preserving, distributed learning with a network of clients (i.e., edge devices). However, the data distribution among clients is often non-IID in nature, making efficient optimization difficult. To alleviate this issue, many FL algorithms focus on mitigating the effects of data heterogeneity across clients by introducing a variety of proximal terms, some incurring considerable compute and/or memory overheads, to restrain local updates with respect to the global model. Instead, we consider rethinking solutions to data heterogeneity in FL with a focus on local learning generality rather than proximal restriction. Inspired by findings from generalization literature, we employ second-order information to better understand algorithm effectiveness in FL, and find that in many cases standard regularization methods are surprisingly strong performers in mitigating data heterogeneity effects. Armed with key insights from our analysis, we propose a simple and effective method, FedAlign, to overcome data heterogeneity and the pitfalls of previous methods. 
FedAlign achieves comparable accuracy with state-of-the-art FL methods across a variety of settings while minimizing computation and memory overhead. ", "keywords": "Federated Learning", "primary_area": "", "supplementary_material": "", "author": "Matias Mendieta;Taojiannan Yang;Pu Wang;Minwoo Lee;Zhengming Ding;Chen Chen", "authorids": "~Matias_Mendieta1;~Taojiannan_Yang1;~Pu_Wang1;minwoo.lee@uncc.edu;~Zhengming_Ding5;~Chen_Chen18", "gender": ";M;M;;M;M", "homepage": "https://sites.google.com/view/matiasmendieta;;https://webpages.charlotte.edu/pwang13;;http://www.cs.tulane.edu/~zding1/;https://www.crcv.ucf.edu/chenchen/", "dblp": "254/1788;249/8103;15/4476-1;;122/3547;65/4423-1", "google_scholar": "iO5zyPwAAAAJ;Z_--q5UAAAAJ;0buJlAUAAAAJ;;TKbyRRsAAAAJ;TuEwcZ0AAAAJ", "orcid": "0000-0002-5497-6207;;;;0000-0002-6994-5278;0000-0003-3957-7061", "linkedin": "matias-mendieta/;;;;;dennychen/", "or_profile": "~Matias_Mendieta1;~Taojiannan_Yang1;~Pu_Wang1;minwoo.lee@uncc.edu;~Zhengming_Ding5;~Chen_Chen18", "aff": "University of Central Florida;University of Central Florida;University of North Carolina at Charlotte;;Tulane University;University of Central Florida", "aff_domain": "ucf.edu;ucf.edu;uncc.edu;;tulane.edu;ucf.edu", "position": "PhD student;PhD student;Associate Professor;;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nmendieta2022local,\ntitle={Local Learning Matters: Rethinking Data Heterogeneity in Federated Learning},\nauthor={Matias Mendieta and Taojiannan Yang and Pu Wang and Minwoo Lee and Zhengming Ding and Chen Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=wQDdEFPy6vi}\n}", "github": "", "project": "", "reviewers": "oZgq;DvxK;F1P4", "site": "https://openreview.net/forum?id=wQDdEFPy6vi", "pdf_size": 0, "recommendation": "3;5;5", "confidence": "3;3;4", "correctness": "3;4;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;2;3", "wc_summary_paper": "99;60;119", "wc_summary_review": "29;62;49", "wc_main_review": "464;81;262", "wc_review": "592;203;430", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 92.66666666666667, 24.499433100017278 ], "wc_summary_review_avg": [ 46.666666666666664, 13.572848714334887 ], "wc_main_review_avg": [ 269.0, 156.4374209281995 ], "wc_review_avg": [ 408.3333333333333, 159.54588333419602 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.49999999999999983, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 225, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17308444446907301376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Central Florida;University of North Carolina at Charlotte;Tulane University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucf.edu;https://www.uncc.edu;https://www.tulane.edu", "aff_unique_abbr": "UCF;UNCC;Tulane", "aff_campus_unique_index": "1", "aff_campus_unique": ";Charlotte", "aff_country_unique_index": "0;0;0;0;0", 
"aff_country_unique": "United States" }, { "id": "wQStfB93RZZ", "title": "Asynchronous Multi-Agent Actor-Critic with Macro-Actions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many realistic multi-agent problems naturally require agents to be capable of performing asynchronously without waiting for other agents to terminate (e.g., multi-robot domains). Such problems can be modeled as Macro-Action Decentralized Partially Observable Markov Decision Processes (MacDec-POMDPs). Current policy gradient methods are not applicable to the asynchronous actions in MacDec-POMDPs, as these methods assume that agents synchronously reason about action selection at every time-step. To allow asynchronous learning and decision-making, we formulate a set of asynchronous multi-agent actor-critic methods that allow agents to directly optimize asynchronous (macro-action-based) policies in three standard training paradigms: decentralized learning, centralized learning, and centralized training for decentralized execution. Empirical results in various domains show high-quality solutions can be learned for large domains when using our methods.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/f0c84c16ef8505bd9ff9df9dfe103dbc44ecd984.zip", "author": "Yuchen Xiao;Weihao Tan;Christopher Amato", "authorids": "~Yuchen_Xiao1;~Weihao_Tan1;~Christopher_Amato1", "gender": ";M;M", "homepage": "https://ycx424.wixsite.com/xiaoyc;https://weihaotan.github.io/;http://www.ccs.neu.edu/home/camato/index.html", "dblp": ";238/0151;10/3254", "google_scholar": "q_ka-B0AAAAJ;https://scholar.google.com/citations?hl=zh-CN;-8-sD-sAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yuchen_Xiao1;~Weihao_Tan1;~Christopher_Amato1", "aff": "Northeastern University;Northeastern University;Northeastern University", "aff_domain": "northeastern.edu;neu.edu;neu.edu", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@misc{\nxiao2022asynchronous,\ntitle={Asynchronous Multi-Agent Actor-Critic with Macro-Actions},\nauthor={Yuchen Xiao and Weihao Tan and Christopher Amato},\nyear={2022},\nurl={https://openreview.net/forum?id=wQStfB93RZZ}\n}", "github": "", "project": "", "reviewers": "Exb6;fiq5;7V3B;jJtb", "site": "https://openreview.net/forum?id=wQStfB93RZZ", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;4;3", "correctness": "3;3;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "355;94;90;44", "wc_summary_review": "124;52;68;39", "wc_main_review": "950;282;367;92", "wc_review": "1429;428;525;175", "wc_reply_reviewers": "179;28;20;0", "wc_reply_authors": "1246;727;691;484", "reply_reviewers": "1;1;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 145.75, 122.39766133386699 ], "wc_summary_review_avg": [ 70.75, 32.41431011143072 ], "wc_main_review_avg": [ 422.75, 320.27595523235897 ], "wc_review_avg": [ 639.25, 473.52738833144593 ], "wc_reply_reviewers_avg": [ 56.75, 71.31400633816614 ], "wc_reply_authors_avg": [ 787.0, 280.7605741552756 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.0, 
"gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15499800055710245397&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Context-Aware Sparse Deep Coordination Graphs", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6580", "id": "wQfgfb8VKTn", "poster": "", "openreview": "https://openreview.net/forum?id=wQfgfb8VKTn", "slides": "https://iclr.cc/virtual/2022/poster/6580", "video": "https://iclr.cc/virtual/2022/poster/6580", "author_site": "Tonghan Wang, Liang Zeng, Weijun Dong, Qianlan Yang, Yang Yu, Chongjie Zhang", "tldr": "", "abstract": "Learning sparse coordination graphs adaptive to the coordination dynamics among agents is a long-standing problem in cooperative multi-agent learning. This paper studies this problem and proposes a novel method using the variance of payoff functions to construct context-aware sparse coordination topologies. We theoretically consolidate our method by proving that the smaller the variance of payoff functions is, the less likely action selection will change after removing the corresponding edge. Moreover, we propose to learn action representations to effectively reduce the influence of payoff functions' estimation errors on graph construction. To empirically evaluate our method, we present the Multi-Agent COordination (MACO) benchmark by collecting classic coordination problems in the literature, increasing their difficulty, and classifying them into different types. 
We carry out a case study and experiments on the MACO and StarCraft II micromanagement benchmark to demonstrate the dynamics of sparse graph learning, the influence of graph sparseness, and the learning performance of our method.", "keywords": "Multi-agent reinforcement learning;Sparse coordination graphs;Deep coordination graphs", "primary_area": "", "supplementary_material": "/attachment/4ae39108a78e9b258a9cdfc44a1ad7ccf281cf63.zip", "author": "Tonghan Wang;Liang Zeng;Weijun Dong;Qianlan Yang;Yang Yu;Chongjie Zhang", "authorids": "~Tonghan_Wang1;~Liang_Zeng1;~Weijun_Dong1;~Qianlan_Yang1;~Yang_Yu5;~Chongjie_Zhang1", "gender": "M;M;M;M;;M", "homepage": "https://tonghanwang.github.io/;https://github.com/zlpure;https://github.com/dwjshift;https://github.com/yanQval;;http://www.lamda.nju.edu.cn/yuy", "dblp": "175/6039-1.html;09/2922-2;;294/4952;29/6693;46/2181-1", "google_scholar": "-AR1yc4AAAAJ;yG8fNjIAAAAJ;;iV5nuc4AAAAJ;LjxqXycAAAAJ;PG2lDSwAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Tonghan_Wang1;~Liang_Zeng1;~Weijun_Dong1;~Qianlan_Yang1;~Chongjie_Zhang1;~Yang_Yu2", "aff": "Tsinghua University;Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University;Institute for Interdisciplinary Information Sciences, Tsinghua University;Tsinghua University;Tsinghua University;Nanjing University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;nju.edu.cn", "position": "MS student;PhD student;Undergrad student;Undergrad student;Assistant Professor;Professor", "bibtex": "@inproceedings{\nwang2022contextaware,\ntitle={Context-Aware Sparse Deep Coordination Graphs},\nauthor={Tonghan Wang and Liang Zeng and Weijun Dong and Qianlan Yang and Yang Yu and Chongjie Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wQfgfb8VKTn}\n}", "github": "", "project": "", "reviewers": "Y1rR;7Rh6;gGhv;pXtp", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "3;3;4;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;4;4", "empirical_novelty": "3;3;4;3", "wc_summary_paper": "54;59;109;124", "wc_summary_review": "116;74;95;21", "wc_main_review": "315;340;687;247", "wc_review": "485;473;891;392", "wc_reply_reviewers": "44;11;265;0", "wc_reply_authors": "721;917;902;219", "reply_reviewers": "1;1;2;0", "reply_authors": "2;3;3;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 86.5, 30.516389039334257 ], "wc_summary_review_avg": [ 76.5, 35.316426772820606 ], "wc_main_review_avg": [ 397.25, 170.71375896511682 ], "wc_review_avg": [ 560.25, 194.27992047558595 ], "wc_reply_reviewers_avg": [ 80.0, 108.03008840133381 ], "wc_reply_authors_avg": [ 689.75, 282.52201241673185 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17498858288824989874&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=wQfgfb8VKTn", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;nju.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;1", 
"aff_unique_norm": "Tsinghua University;Nanjing University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.nju.edu.cn", "aff_unique_abbr": "THU;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "iLQR-VAE : control-based learning of input-driven dynamics with applications to neural data", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6079", "id": "wRODLDHaAiW", "poster": "", "openreview": "https://openreview.net/forum?id=wRODLDHaAiW", "slides": "https://iclr.cc/virtual/2022/poster/6079", "video": "https://iclr.cc/virtual/2022/poster/6079", "author_site": "Marine Schimel, Ta-Chu Kao, Kristopher Jensen, Guillaume Hennequin", "tldr": "", "abstract": "Understanding how neural dynamics give rise to behaviour is one of the most fundamental questions in systems neuroscience. To achieve this, a common approach is to record neural populations in behaving animals, and model these data as emanating from a latent dynamical system whose state trajectories can then be related back to behavioural observations via some form of decoding. As recordings are typically performed in localized circuits that form only a part of the wider implicated network, it is important to simultaneously learn the local dynamics and infer any unobserved external input that might drive them. Here, we introduce iLQR-VAE, a novel control-based approach to variational inference in nonlinear dynamical systems, capable of learning both latent dynamics, initial conditions, and ongoing external inputs. As in recent deep learning approaches, our method is based on an input-driven sequential variational autoencoder (VAE). The main novelty lies in the use of the powerful iterative linear quadratic regulator algorithm (iLQR) in the recognition model. Optimization of the standard evidence lower-bound requires differentiating through iLQR solutions, which is made possible by recent advances in differentiable control. Importantly, having the recognition model be implicitly defined by the generative model greatly reduces the number of free parameters and allows for flexible, high-quality inference. This makes it possible for instance to evaluate the model on a single long trial after training on smaller chunks. We demonstrate the effectiveness of iLQR-VAE on a range of synthetic systems, with autonomous as well as input-driven dynamics. We further apply it to neural and behavioural recordings in non-human primates performing two different reaching tasks, and show that iLQR-VAE yields high-quality kinematic reconstructions from the neural data. 
", "keywords": "neuroscience;latent variable models;RNN;VAE;motor control;control theory;dynamical systems", "primary_area": "", "supplementary_material": "/attachment/d56eef08c847ab2b7a4c4bdea44fe6464442601c.zip", "author": "Marine Schimel;Ta-Chu Kao;Kristopher T Jensen;Guillaume Hennequin", "authorids": "~Marine_Schimel1;~Ta-Chu_Kao1;~Kristopher_T_Jensen1;~Guillaume_Hennequin1", "gender": "F;;;M", "homepage": ";https://tachukao.github.io;https://krisjensen.github.io/;https://cbl-cambridge.org", "dblp": ";202/1972;267/5296;56/10432", "google_scholar": ";_JlEGXQAAAAJ;https://scholar.google.com/citations?hl=en;-NkKYYcAAAAJ", "orcid": "0000-0002-6937-011X;;;", "linkedin": ";;;", "or_profile": "~Marine_Schimel1;~Ta-Chu_Kao1;~Kristopher_T_Jensen1;~Guillaume_Hennequin1", "aff": "University of Cambridge;University College London, University of London;University of Cambridge;University of Cambridge", "aff_domain": "cam.ac.uk;ucl.ac.uk;cam.ac.uk;cam.ac.uk", "position": "PhD student;Postdoc;PhD student;Associate Professor", "bibtex": "@inproceedings{\nschimel2022ilqrvae,\ntitle={i{LQR}-{VAE} : control-based learning of input-driven dynamics with applications to neural data},\nauthor={Marine Schimel and Ta-Chu Kao and Kristopher T Jensen and Guillaume Hennequin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wRODLDHaAiW}\n}", "github": "", "project": "", "reviewers": "YcP3;gF5H;Ve7m", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;4;4", "correctness": "4;4;4", "technical_novelty": "3;4;4", "empirical_novelty": "2;4;3", "wc_summary_paper": "76;69;108", "wc_summary_review": "38;48;46", "wc_main_review": "505;169;473", "wc_review": "619;286;627", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 84.33333333333333, 16.97710877099579 ], "wc_summary_review_avg": [ 44.0, 4.320493798938574 ], "wc_main_review_avg": [ 382.3333333333333, 151.41407537683617 ], "wc_review_avg": [ 510.6666666666667, 158.89689179534705 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2325916628342675916&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=wRODLDHaAiW", "email": "cam.ac.uk;ucl.ac.uk;cam.ac.uk;cam.ac.uk", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Cambridge;University College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "Cambridge;UCL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Graph Neural Networks with Learnable Structural and Positional Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6463", "id": "wTTjnvGphYj", "poster": "", "openreview": 
"https://openreview.net/forum?id=wTTjnvGphYj", "slides": "https://iclr.cc/virtual/2022/poster/6463", "video": "https://iclr.cc/virtual/2022/poster/6463", "author_site": "Vijay Prakash Dwivedi, Anh Tuan Luu, Thomas Laurent, Yoshua Bengio, Xavier Bresson", "tldr": "", "abstract": "Graph neural networks (GNNs) have become the standard learning architectures for graphs. GNNs have been applied to numerous domains ranging from quantum chemistry, recommender systems to knowledge graphs and natural language processing. A major issue with arbitrary graphs is the absence of canonical positional information of nodes, which decreases the representation power of GNNs to distinguish e.g. isomorphic nodes and other graph symmetries. An approach to tackle this issue is to introduce Positional Encoding (PE) of nodes, and inject it into the input layer, like in Transformers. Possible graph PE are Laplacian eigenvectors. In this work, we propose to decouple structural and positional representations to make easy for the network to learn these two essential properties. We introduce a novel generic architecture which we call \\texttt{LSPE} (Learnable Structural and Positional Encodings). We investigate several sparse and fully-connected (Transformer-like) GNNs, and observe a performance increase for molecular datasets, from $1.79\\%$ up to $64.14\\%$ when considering learnable PE for both GNN classes.", "keywords": "graph neural networks;graph representation learning;transformers;positional encoding", "primary_area": "", "supplementary_material": "", "author": "Vijay Prakash Dwivedi;Anh Tuan Luu;Thomas Laurent;Yoshua Bengio;Xavier Bresson", "authorids": "~Vijay_Prakash_Dwivedi1;~Anh_Tuan_Luu2;~Thomas_Laurent1;~Yoshua_Bengio1;~Xavier_Bresson6", "gender": "M;M;M;M;M", "homepage": "https://vijaydwivedi.com.np;https://tuanluu.github.io/;http://thomaslaurent.lmu.build/homepage.html;http://yoshuabengio.org;https://www.comp.nus.edu.sg/cs/people/xaviercs/", "dblp": "243/1717;81/8329.html;47/8889-1;56/953;95/378", "google_scholar": "8MS7iC0AAAAJ;https://scholar.google.com.sg/citations?hl=en;_Ag_9uAAAAAJ;kukA0LcAAAAJ;https://scholar.google.com.sg/citations?hl=en", "orcid": ";;;;", "linkedin": "vijay321/;;;yoshuabengio/?originalSubdomain=ca;", "or_profile": "~Vijay_Prakash_Dwivedi1;~Anh_Tuan_Luu2;~Thomas_Laurent1;~Yoshua_Bengio1;~Xavier_Bresson6", "aff": "Nanyang Technological University;Nanyang Technological University;Loyola Marymount University;University of Montreal;National University of Singapore", "aff_domain": "ntu.edu.sg;ntu.edu.sg;lmu.edu;umontreal.ca;nus.edu.sg", "position": "PhD student;Assistant Professor;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\ndwivedi2022graph,\ntitle={Graph Neural Networks with Learnable Structural and Positional Representations},\nauthor={Vijay Prakash Dwivedi and Anh Tuan Luu and Thomas Laurent and Yoshua Bengio and Xavier Bresson},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wTTjnvGphYj}\n}", "github": "", "project": "", "reviewers": "E8yv;uyjX;ENFN;2bUV;vVMd", "pdf_size": 0, "recommendation": "5;5;6;8;8", "confidence": "4;4;3;4;4", "correctness": "3;3;3;4;3", "technical_novelty": "2;2;3;3;3", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "27;52;157;23;32", "wc_summary_review": "41;30;113;15;31", "wc_main_review": "216;382;478;132;495", "wc_review": "284;464;748;170;558", "wc_reply_reviewers": "0;0;66;0;0", "wc_reply_authors": "1147;2022;1316;669;1169", 
"reply_reviewers": "0;0;1;0;0", "reply_authors": "3;4;4;1;2", "recommendation_avg": [ 6.4, 1.3564659966250536 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 58.2, 50.39603158979882 ], "wc_summary_review_avg": [ 46.0, 34.51376536977674 ], "wc_main_review_avg": [ 340.6, 143.85492692292468 ], "wc_review_avg": [ 444.8, 203.21653476033882 ], "wc_reply_reviewers_avg": [ 13.2, 26.399999999999995 ], "wc_reply_authors_avg": [ 1264.6, 436.80778381342975 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 2.8, 1.16619037896906 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.14744195615489716, "corr_recommendation_correctness": 0.5897678246195885, "gs_citation": 448, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6297596382755615056&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=wTTjnvGphYj", "email": "ntu.edu.sg;ntu.edu.sg;lmu.edu;umontreal.ca;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "Nanyang Technological University;Loyola Marymount University;University of Montreal;National University of Singapore", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.lmu.edu;https://wwwumontreal.ca;https://www.nus.edu.sg", "aff_unique_abbr": "NTU;LMU;UM;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0", "aff_country_unique": "Singapore;United States;Canada" }, { "id": "wVFkD13GpeX", "title": "ReGVD: Revisiting Graph Neural Networks for Vulnerability Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Identifying vulnerabilities in the source code is essential to protect the software systems from cyber security attacks. It, however, is also a challenging step that requires specialized expertise in security and code representation. Inspired by the successful applications of pre-trained programming language models such as CodeBERT and graph neural networks (GNNs) for natural language processing, we propose ReGVD, a general and novel graph neural network-based model for vulnerability detection. In particular, ReGVD views a given source code as a flat sequence of tokens and then examines two effective methods of utilizing unique tokens and indexes respectively to construct a single graph as an input, wherein node features are initialized only by the embedding layer of a pre-trained PL model. Next, ReGVD leverages a practical advantage of residual connection among GNN layers and explores a beneficial mixture of graph-level sum and max poolings to return a graph embedding for the given source code. 
Experimental results demonstrate that ReGVD outperforms the existing state-of-the-art models and obtains the highest accuracy on the real-world benchmark dataset from CodeXGLUE for vulnerability detection.", "keywords": "Vulnerability Detection;Cyber Security;Graph Neural Networks;Programming Language", "primary_area": "", "supplementary_material": "", "author": "Van-Anh Nguyen;Dai Quoc Nguyen;Van Nguyen;Trung Le;Quan Hung Tran;Dinh Phung", "authorids": "~Van-Anh_Nguyen1;~Dai_Quoc_Nguyen1;~Van_Nguyen2;~Trung_Le2;~Quan_Hung_Tran1;~Dinh_Phung2", "gender": "F;;M;M;M;M", "homepage": ";;;;;https://research.monash.edu/en/persons/dinh-phung", "dblp": ";35/9125;;;151/8700;71/5859", "google_scholar": "I5kuXKsAAAAJ;AmB7MhUAAAAJ;KPpmKZ0AAAAJ;https://scholar.google.com/citations?hl=en;ehs5ImcAAAAJ;https://scholar.google.com.au/citations?user=OtA9SwIAAAAJ", "orcid": ";;0000-0002-5838-3409;;;0000-0002-9977-8247", "linkedin": ";;;;;https://linkedin.com/in/dinh-phung-6b537a6", "or_profile": "~Van-Anh_Nguyen1;~Dai_Quoc_Nguyen1;~Van_Nguyen2;~Trung_Le2;~Quan_Hung_Tran1;~Dinh_Phung1", "aff": "Sejong University;Oracle;Monash University;Monash University;Adobe Systems;Monash University", "aff_domain": "sju.kr.ac;oracle.com;monash.edu;monash.edu;adobe.com;monash.edu", "position": "MS student;Researcher;Postdoc;Assistant Professor;Research Scientist;Full Professor", "bibtex": "@misc{\nnguyen2022regvd,\ntitle={Re{GVD}: Revisiting Graph Neural Networks for Vulnerability Detection},\nauthor={Van-Anh Nguyen and Dai Quoc Nguyen and Van Nguyen and Trung Le and Quan Hung Tran and Dinh Phung},\nyear={2022},\nurl={https://openreview.net/forum?id=wVFkD13GpeX}\n}", "github": "", "project": "", "reviewers": "D55e;f2hJ;DCXz;LyCz", "site": "https://openreview.net/forum?id=wVFkD13GpeX", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;3;5", "correctness": "3;3;3;2", "technical_novelty": "1;2;1;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "82;85;73;84", "wc_summary_review": "18;235;27;74", "wc_main_review": "232;387;166;335", "wc_review": "332;707;266;493", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.0, 4.743416490252569 ], "wc_summary_review_avg": [ 88.5, 87.21381771256203 ], "wc_main_review_avg": [ 280.0, 86.27572080255256 ], "wc_review_avg": [ 449.5, 170.05660822208586 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1459118643333817514&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;2;3;2", "aff_unique_norm": "Sejong University;Oracle Corporation;Monash University;Adobe", "aff_unique_dep": ";;;Adobe Systems Incorporated", "aff_unique_url": "https://www.sejong.ac.kr;https://www.oracle.com;https://www.monash.edu;https://www.adobe.com", "aff_unique_abbr": "Sejong;Oracle;Monash;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;1;2", "aff_country_unique": "South Korea;United 
States;Australia" }, { "id": "wX4Z5X5vpm", "title": "Plan Your Target and Learn Your Skills: State-Only Imitation Learning via Decoupled Policy Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "State-only imitation learning (SOIL) enables agents to learn from massive demonstrations without explicit action or reward information.\nHowever, previous methods attempt to learn the implicit state-to-action mapping policy directly from state-only data, which results in ambiguity and inefficiency.\nIn this paper, we overcome this issue by introducing hyper-policy as sets of policies that share the same state transition to characterize the optimality in SOIL. Accordingly, we propose Decoupled Policy Optimization (DPO) via explicitly decoupling the state-to-action mapping policy as a state transition predictor and an inverse dynamics model. Intuitively, we teach the agent to plan the target to go and then learn its own skills to reach. \nExperiments on standard benchmarks and a real-world driving dataset demonstrate the effectiveness of DPO and its potential of bridging the gap between reality and simulations of reinforcement learning.", "keywords": "reinforcement learning;imitation learning", "primary_area": "", "supplementary_material": "/attachment/e1dfd54fc9c4cc1b444797bfe4eca4ddbb586545.zip", "author": "Minghuan Liu;Zhengbang Zhu;Yuzheng Zhuang;Weinan Zhang;Jian Shen;Jianye HAO;Yong Yu;Jun Wang", "authorids": "~Minghuan_Liu1;~Zhengbang_Zhu1;~Yuzheng_Zhuang1;~Weinan_Zhang1;~Jian_Shen2;~Jianye_HAO1;~Yong_Yu1;~Jun_Wang2", "gender": "M;M;F;M;M;M;;M", "homepage": "http://minghuanliu.com;https://github.com/zbzhu99;;http://wnzhang.net;;http://www.icdai.org/jianye.html;https://apex.sjtu.edu.cn/members/yyu;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "249/7554;277/0869;;28/10261-1;95/5846-3.html;21/7664.html;43/5685.html;w/JunWang12", "google_scholar": ";;https://scholar.google.com/citations?hl=en;Qzss0GEAAAAJ;;;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;;0000-0002-0127-2425;;0000-0002-0422-8235;0000-0003-4457-2820;", "linkedin": ";;;;;;;", "or_profile": "~Minghuan_Liu1;~Zhengbang_Zhu1;~Yuzheng_Zhuang1;~Weinan_Zhang1;~Jian_Shen2;~Jianye_HAO1;~Yong_Yu1;~Jun_Wang2", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Huawei Technologies Ltd.;Shanghai Jiaotong University;Shanghai Jiaotong University;Tianjin University;Shanghai Jiaotong University;University College London", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;huawei.com;sjtu.edu.cn;sjtu.edu.cn;tju.edu.cn;sjtu.edu.cn;ucl.ac.uk", "position": "PhD student;PhD student;Research Engineer;Associate Professor;PhD student;Associate Professor;Full Professor;Professor", "bibtex": "@misc{\nliu2022plan,\ntitle={Plan Your Target and Learn Your Skills: State-Only Imitation Learning via Decoupled Policy Optimization},\nauthor={Minghuan Liu and Zhengbang Zhu and Yuzheng Zhuang and Weinan Zhang and Jian Shen and Jianye HAO and Yong Yu and Jun Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=wX4Z5X5vpm}\n}", "github": "", "project": "", "reviewers": "xbM5;5w49;c87a", "site": "https://openreview.net/forum?id=wX4Z5X5vpm", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;3", "correctness": "3;3;3", "technical_novelty": "4;3;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "90;163;31", "wc_summary_review": "131;138;37", "wc_main_review": "433;607;174", "wc_review": "654;908;242", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", 
"reply_authors": "0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 94.66666666666667, 53.98971095392981 ], "wc_summary_review_avg": [ 102.0, 46.05069670120819 ], "wc_main_review_avg": [ 404.6666666666667, 177.90321963234828 ], "wc_review_avg": [ 601.3333333333334, 274.4319385364774 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17916567407029306285&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;0;0;2;0;3", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei;Tianjin University;University College London", "aff_unique_dep": ";Huawei Technologies;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com;http://www.tju.edu.cn;https://www.ucl.ac.uk", "aff_unique_abbr": "SJTU;Huawei;TJU;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;1", "aff_country_unique": "China;United Kingdom" }, { "id": "wYqLTy4wkor", "title": "Grounding Aleatoric Uncertainty in Unsupervised Environment Design", "track": "main", "status": "Reject", "tldr": "", "abstract": "In reinforcement learning (RL), adaptive curricula have proven highly effective for learning policies that generalize well under a wide variety of changes to the environment. Recently, the framework of Unsupervised Environment Design (UED) generalized notions of curricula for RL in terms of generating entire environments, leading to the development of new methods with robust minimax-regret properties. However, in partially-observable or stochastic settings (those featuring aleatoric uncertainty), optimal policies may depend on the ground-truth distribution over the aleatoric features of the environment. Such settings are potentially problematic for curriculum learning, which necessarily shifts the environment distribution used during training with respect to the fixed ground-truth distribution in the intended deployment environment. We formalize this phenomenon as curriculum-induced covariate shift, and describe how, when the distribution shift occurs over such aleatoric environment parameters, it can lead to learning suboptimal policies. We then propose a method which, given black box access to a simulator, corrects this resultant bias by aligning the advantage estimates to the ground-truth distribution over aleatoric parameters. 
This approach leads to a minimax-regret UED method, SAMPLR, with Bayes-optimal guarantees.", "keywords": "reinforcement learning;curriculum learning;generalization;environment design;procedural content generation", "primary_area": "", "supplementary_material": "/attachment/524a0f088234220bde71b28ce9cee05e0639b19e.zip", "author": "Minqi Jiang;Michael D Dennis;Jack Parker-Holder;Andrei Lupu;Heinrich Kuttler;Edward Grefenstette;Tim Rockt\u00e4schel;Jakob Nicolaus Foerster", "authorids": "~Minqi_Jiang1;~Michael_D_Dennis1;~Jack_Parker-Holder1;~Andrei_Lupu1;~Heinrich_Kuttler1;~Edward_Grefenstette1;~Tim_Rockt\u00e4schel1;~Jakob_Nicolaus_Foerster1", "gender": "M;M;M;M;;M;M;M", "homepage": "https://twitter.com/minqijiang;;https://jparkerholder.github.io/;;;http://egrefen.com/;https://www.jakobfoerster.com;http://rockt.ai", "dblp": "270/7949;;237/9793.html;218/7027;;http://dblp.uni-trier.de/pers/hd/g/Grefenstette:Edward;176/5095;43/11537", "google_scholar": ";WXXu26AAAAAJ;;I6aB-YUAAAAJ;;https://scholar.google.co.uk/citations?user=ezllEwMAAAAJ;6z4lQzMAAAAJ;https://scholar.google.co.uk/citations?user=mWBY8aIAAAAJ", "orcid": ";;;;;;;", "linkedin": "minqi-jiang-585a6536/;;;lupu-andrei;;;;rockt/", "or_profile": "~Minqi_Jiang1;~Michael_D_Dennis1;~Jack_Parker-Holder1;~Andrei_Lupu1;~Heinrich_Kuttler1;~Edward_Grefenstette1;~Jakob_Nicolaus_Foerster1;~Tim_Rocktaeschel1", "aff": "University College London;University of California, Berkeley;University of Oxford;Meta AI;Meta Facebook;Meta Facebook;University of Oxford, University of Oxford;Facebook AI Research", "aff_domain": "ucl.ac.uk;berkeley.edu;ox.ac.uk;meta.com;fb.com;fb.com;eng.ox.ac.uk;facebook.com", "position": "PhD;PhD student;PhD student;Researcher;Research Engineer;Research Scientist;Associate Professor;Manager, Research Scientist", "bibtex": "@misc{\njiang2022grounding,\ntitle={Grounding Aleatoric Uncertainty in Unsupervised Environment Design},\nauthor={Minqi Jiang and Michael D Dennis and Jack Parker-Holder and Andrei Lupu and Heinrich Kuttler and Edward Grefenstette and Tim Rockt{\\\"a}schel and Jakob Nicolaus Foerster},\nyear={2022},\nurl={https://openreview.net/forum?id=wYqLTy4wkor}\n}", "github": "", "project": "", "reviewers": "ZRvM;R1Z8;AAp9;UQqT", "site": "https://openreview.net/forum?id=wYqLTy4wkor", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;3;3;2", "correctness": "4;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "103;58;118;38", "wc_summary_review": "27;35;71;39", "wc_main_review": "691;438;422;349", "wc_review": "821;531;611;426", "wc_reply_reviewers": "0;0;0;344", "wc_reply_authors": "1294;634;612;711", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 79.25, 32.47595264191645 ], "wc_summary_review_avg": [ 43.0, 16.73320053068151 ], "wc_main_review_avg": [ 475.0, 129.141395377315 ], "wc_review_avg": [ 597.25, 144.8868092684769 ], "wc_reply_reviewers_avg": [ 86.0, 148.95636945092346 ], "wc_reply_authors_avg": [ 812.75, 280.27073964293885 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 19, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=6786564016084218608&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;3;3;3;2;3", "aff_unique_norm": "University College London;University of California, Berkeley;University of Oxford;Meta", "aff_unique_dep": ";;;Meta AI", "aff_unique_url": "https://www.ucl.ac.uk;https://www.berkeley.edu;https://www.ox.ac.uk;https://meta.com", "aff_unique_abbr": "UCL;UC Berkeley;Oxford;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;1;1;1;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Differentiable Scaffolding Tree for Molecule Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6701", "id": "w_drCosT76", "poster": "", "openreview": "https://openreview.net/forum?id=w_drCosT76", "slides": "https://iclr.cc/virtual/2022/poster/6701", "video": "https://iclr.cc/virtual/2022/poster/6701", "author_site": "Tianfan Fu, Wenhao Gao, Cao Xiao, Jacob Yasonik, Connor Coley, Jimeng Sun", "tldr": "", "abstract": "The structural design of functional molecules, also called molecular optimization, is an essential chemical science and engineering task with important applications, such as drug discovery. Deep generative models and combinatorial optimization methods achieve initial success but still struggle with directly modeling discrete chemical structures and often heavily rely on brute-force enumeration. The challenge comes from the discrete and non-differentiable nature of molecule structures. To address this, we propose differentiable scaffolding tree (DST) that utilizes a learned knowledge network to convert discrete chemical structures to locally differentiable ones. DST enables a gradient-based optimization on a chemical graph structure by back-propagating the derivatives from the target properties through a graph neural network (GNN). Our empirical studies show the gradient-based molecular optimizations are both effective and sample efficient (in terms of oracle calling number). Furthermore, the learned graph parameters can also provide an explanation that helps domain experts understand the model output. The code repository (including processed data, trained model, demonstration, molecules with the highest property) is available at https://github.com/futianfan/DST.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/97ea77ff88a0fbd25c7dd11ed38efde8cc5e389c.zip", "author": "Tianfan Fu;Wenhao Gao;Cao Xiao;Jacob Yasonik;Connor W. 
Coley;Jimeng Sun", "authorids": "~Tianfan_Fu1;~Wenhao_Gao1;~Cao_Xiao2;jyasonik@mit.edu;~Connor_W._Coley1;~Jimeng_Sun3", "gender": "M;M;F;;;", "homepage": "https://futianfan.github.io/;https://wenhao-gao.github.io;https://sites.google.com/view/danicaxiao/home;;;http://sunlab.org", "dblp": ";177/0968;170/1833;;;", "google_scholar": "KPQ49w4AAAAJ;s4eywrUAAAAJ;ahaV25EAAAAJ;;;9jmmp5sAAAAJ", "orcid": ";0000-0002-6506-8044;;;;0000-0003-1512-6426", "linkedin": ";;caoxiao/;;;jimengsun/", "or_profile": "~Tianfan_Fu1;~Wenhao_Gao1;~Cao_Xiao2;jyasonik@mit.edu;~Connor_W._Coley1;~Jimeng_Sun3", "aff": ";Massachusetts Institute of Technology;Amplitude;;;Georgia Institute of Technology", "aff_domain": ";mit.edu;amplitude.com;;;gatech.edu", "position": ";PhD student;Senior Director of Data Science and Machine Learning;;;Associate Professor", "bibtex": "@inproceedings{\nfu2022differentiable,\ntitle={Differentiable Scaffolding Tree for Molecule Optimization},\nauthor={Tianfan Fu and Wenhao Gao and Cao Xiao and Jacob Yasonik and Connor W. Coley and Jimeng Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=w_drCosT76}\n}", "github": "", "project": "", "reviewers": "BjiD;JXBo;5uXz;xvty", "pdf_size": 0, "recommendation": "5;6;8;10", "confidence": "5;5;4;4", "correctness": "3;4;4;4", "technical_novelty": "4;4;3;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "83;50;75;160", "wc_summary_review": "83;38;27;80", "wc_main_review": "1339;322;125;152", "wc_review": "1505;410;227;392", "wc_reply_reviewers": "1836;189;0;0", "wc_reply_authors": "1834;763;201;351", "reply_reviewers": "5;1;0;0", "reply_authors": "5;2;1;1", "recommendation_avg": [ 7.25, 1.920286436967152 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 92.0, 41.10352782912922 ], "wc_summary_review_avg": [ 57.0, 24.829418035870273 ], "wc_main_review_avg": [ 484.5, 499.0924263099972 ], "wc_review_avg": [ 633.5, 508.19017109739536 ], "wc_reply_reviewers_avg": [ 506.25, 771.5991106137953 ], "wc_reply_authors_avg": [ 787.25, 638.4114562725202 ], "reply_reviewers_avg": [ 1.5, 2.0615528128088303 ], "reply_authors_avg": [ 2.25, 1.6393596310755 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.911322376865767, "corr_recommendation_correctness": 0.676481425202546, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3960144121819053092&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=w_drCosT76", "email": ";mit.edu;amplitude.com;;;gatech.edu", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Amplitude;Georgia Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://amplitude.com;https://www.gatech.edu", "aff_unique_abbr": "MIT;Amplitude;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fairness Guarantees under Demographic Shift", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6666", "id": "wbPObLm6ueA", "poster": "", "openreview": "https://openreview.net/forum?id=wbPObLm6ueA", "slides": "https://iclr.cc/virtual/2022/poster/6666", "video": 
"https://iclr.cc/virtual/2022/poster/6666", "author_site": "Stephen Giguere, Blossom Metevier, Bruno Silva, Yuriy Brun, Philip Thomas, Scott Niekum", "tldr": "", "abstract": "Recent studies have demonstrated that using machine learning for social applications can lead to injustice in the form of racist, sexist, and otherwise unfair and discriminatory outcomes. To address this challenge, recent machine learning algorithms have been designed to limit the likelihood such unfair behaviors will occur. However, these approaches typically assume the data used for training is representative of what will be encountered once the model is deployed, thus limiting their usefulness. In particular, if certain subgroups of the population become more or less probable after the model is deployed (a phenomenon we call demographic shift), the fair-ness assurances provided by prior algorithms are often invalid. We consider the impact of demographic shift and present a class of algorithms, called Shifty algorithms, that provide high-confidence behavioral guarantees that hold under demographic shift. Shifty is the first technique of its kind and demonstrates an effective strategy for designing algorithms to overcome the challenges demographic shift poses. We evaluate Shifty-ttest, an implementation of Shifty based on Student\u2019s \ud835\udc61-test, and, using a real-world data set of university entrance exams and subsequent student success, show that the models output by our algorithm avoid unfair bias under demo-graphic shift, unlike existing methods. Our experiments demonstrate that our algorithm\u2019s high-confidence fairness guarantees are valid in practice and that our algorithm is an effective tool for training models that are fair when demographic shift occurs.", "keywords": "Fairness and Bias in Artificial Intelligence;Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Stephen Giguere;Blossom Metevier;Bruno Castro da Silva;Yuriy Brun;Philip S. Thomas;Scott Niekum", "authorids": "~Stephen_Giguere1;~Blossom_Metevier1;~Bruno_Castro_da_Silva1;~Yuriy_Brun1;~Philip_S._Thomas1;~Scott_Niekum1", "gender": "M;F;M;M;M;M", "homepage": ";https://bmetevier.github.io/;https://people.cs.umass.edu/~bsilva/;https://people.cs.umass.edu/~brun/;http://psthomas.com;https://people.cs.umass.edu/~sniekum/index.php", "dblp": "14/8174;234/4694;75/3139;51/1518.html;46/11107;62/8399", "google_scholar": ";https://scholar.google.com/citations?hl=en;eskJDVUAAAAJ;https://scholar.google.com.tw/citations?user=YVfr3wwAAAAJ;e8Gzgo4AAAAJ;4wXYfSUAAAAJ", "orcid": ";;;0000-0003-3027-7986;;", "linkedin": ";blossom-metevier-461034342/;;;;", "or_profile": "~Stephen_Giguere1;~Blossom_Metevier1;~Bruno_Castro_da_Silva1;~Yuriy_Brun1;~Philip_S._Thomas1;~Scott_Niekum1", "aff": "University of Texas, Austin;University of Massachusetts, Amherst;University of Massachusetts, Amherst;University of Massachusetts Amherst;College of Information and Computer Science, University of Massachusetts, Amherst;University of Texas, Austin", "aff_domain": "utexas.edu;umass.edu;umass.edu;umass.edu;cics.umass.edu;utexas.edu", "position": "Postdoc;PhD student;Assistant Professor;Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\ngiguere2022fairness,\ntitle={Fairness Guarantees under Demographic Shift},\nauthor={Stephen Giguere and Blossom Metevier and Yuriy Brun and Philip S. 
Thomas and Scott Niekum and Bruno Castro da Silva},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wbPObLm6ueA}\n}", "github": "", "project": "", "reviewers": "fKm7;8AcS;D4Vj;3Aub", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "4;3;4;5", "correctness": "3;4;3;4", "technical_novelty": "4;3;3;3", "empirical_novelty": "4;3;3;3", "wc_summary_paper": "65;283;146;71", "wc_summary_review": "62;141;28;90", "wc_main_review": "365;314;252;743", "wc_review": "492;738;426;904", "wc_reply_reviewers": "154;54;45;0", "wc_reply_authors": "763;787;721;695", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 141.25, 87.84183228963293 ], "wc_summary_review_avg": [ 80.25, 41.37858745776612 ], "wc_main_review_avg": [ 418.5, 191.5757030523443 ], "wc_review_avg": [ 640.0, 191.70289512680813 ], "wc_reply_reviewers_avg": [ 63.25, 56.24666656789538 ], "wc_reply_authors_avg": [ 741.5, 35.759614091877445 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.6488856845230502, "corr_recommendation_correctness": 0.6882472016116854, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14449811951269754794&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=wbPObLm6ueA", "email": "utexas.edu;umass.edu;umass.edu;umass.edu;cics.umass.edu;utexas.edu", "author_num": 6, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "University of Texas at Austin;University of Massachusetts Amherst", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.umass.edu", "aff_unique_abbr": "UT Austin;UMass Amherst", "aff_campus_unique_index": "0;1;1;1;1;0", "aff_campus_unique": "Austin;Amherst", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "wfRZkDvxOqj", "title": "Multi-Task Neural Processes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural processes have recently emerged as a class of powerful neural latent variable models that combine the strengths of neural networks and stochastic processes. As they can encode contextual data in the network's function space, they offer a new way to model task relatedness in multi-task learning. To study this potential, we develop multi-task neural processes, a new variant of neural processes for multi-task learning. In particular, we propose to explore transferable knowledge from related tasks in the function space to provide inductive bias for improving each individual task. To do so, we derive the function priors in a hierarchical Bayesian inference framework, which enables each task to incorporate the shared knowledge provided by related tasks into the context of its prediction function. Our multi-task neural processes methodologically expand the scope of vanilla neural processes and provide a new way of exploring task relatedness in function spaces for multi-task learning. The proposed multi-task neural processes are capable of learning multiple tasks with limited labeled data and in the presence of domain shift. 
We perform extensive experimental evaluations on several benchmarks for the multi-task regression and classification tasks. The results demonstrate the effectiveness of multi-task neural processes in transferring useful knowledge among tasks for multi-task learning and superior performance in multi-task classification and brain image segmentation.", "keywords": "Multi-task learning;Neural processes;Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Jiayi Shen;Xiantong Zhen;Marcel Worring;Ling Shao", "authorids": "~Jiayi_Shen3;~Xiantong_Zhen1;~Marcel_Worring2;~Ling_Shao1", "gender": "F;M;M;M", "homepage": "https://autumn9999.github.io/;;https://staff.fnwi.uva.nl/m.worring/;", "dblp": ";78/10651;35/4613;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.ca/citations?user=DnBb3e0AAAAJ;pdu8f3sAAAAJ;z84rLjoAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jiayi_Shen3;~Xiantong_Zhen1;~Marcel_Worring2;~Ling_Shao1", "aff": "University of Amsterdam;Inception Institute of Artificial Intelligence;University of Amsterdam;Terminus Group", "aff_domain": "uva.nl;inceptioniai.org;uva.nl;terminusgroup.com", "position": "PhD student;Senior Scientist;Full Professor;Chief Scientist", "bibtex": "@misc{\nshen2022multitask,\ntitle={Multi-Task Neural Processes},\nauthor={Jiayi Shen and Xiantong Zhen and Marcel Worring and Ling Shao},\nyear={2022},\nurl={https://openreview.net/forum?id=wfRZkDvxOqj}\n}", "github": "", "project": "", "reviewers": "Ef49;H8nA;3xsW;fDzq", "site": "https://openreview.net/forum?id=wfRZkDvxOqj", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;3", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "1;2;4;3", "wc_summary_paper": "114;12;59;51", "wc_summary_review": "20;37;30;61", "wc_main_review": "353;76;110;295", "wc_review": "487;125;199;407", "wc_reply_reviewers": "0;0;491;116", "wc_reply_authors": "777;259;783;616", "reply_reviewers": "0;0;2;1", "reply_authors": "1;1;2;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 59.0, 36.39368077015569 ], "wc_summary_review_avg": [ 37.0, 15.116216457830975 ], "wc_main_review_avg": [ 208.5, 117.92052408296021 ], "wc_review_avg": [ 304.5, 147.61690282620077 ], "wc_reply_reviewers_avg": [ 151.75, 201.5097702345968 ], "wc_reply_authors_avg": [ 608.75, 212.74911868207587 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8216536148909876755&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Amsterdam;Inception Institute of Artificial Intelligence;Terminus Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uva.nl;https://www.inceptioniai.org;", "aff_unique_abbr": "UvA;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Netherlands;United Arab Emirates;" }, { "title": "Pareto Policy Adaptation", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/6886", "id": "wfZGut6e09", "poster": "", "openreview": "https://openreview.net/forum?id=wfZGut6e09", "slides": "https://iclr.cc/virtual/2022/poster/6886", "video": "https://iclr.cc/virtual/2022/poster/6886", "author_site": "Panagiotis Kyriakis, Jyotirmoy Deshmukh, Paul Bogdan", "tldr": "", "abstract": "We present a policy gradient method for Multi-Objective Reinforcement Learning under unknown, linear preferences. By enforcing Pareto stationarity, a first-order condition for Pareto optimality, we are able to design a simple policy gradient algorithm that approximates the Pareto front and infers the unknown preferences. Our method relies on a projected gradient descent solver that identifies common ascent directions for all objectives. Leveraging the solution of that solver, we introduce Pareto Policy Adaptation (PPA), a loss function that adapts the policy to be optimal with respect to any distribution over preferences. PPA uses implicit differentiation to back-propagate the loss gradient bypassing the operations of the projected gradient descent solver. Our approach is straightforward, easy to implement and can be used with all existing policy gradient and actor-critic methods. We evaluate our method in a series of reinforcement learning tasks", "keywords": "multi-objective reinforcement learning;policy gradient;pareto optimality;policy adaptation", "primary_area": "", "supplementary_material": "/attachment/413b411f4c4ec43d9502b5a6c43f9b246a0f7ba8.zip", "author": "Panagiotis Kyriakis;Jyotirmoy Deshmukh;Paul Bogdan", "authorids": "~Panagiotis_Kyriakis1;~Jyotirmoy_Deshmukh2;~Paul_Bogdan1", "gender": "M;M;M", "homepage": ";https://cps.usc.edu/;https://jdeshmukh.github.io", "dblp": "225/3791;05/5539;42/160", "google_scholar": "https://scholar.google.com/citations?hl=en;Xw_v8-gAAAAJ;https://scholar.google.com.tw/citations?user=CwFX74MAAAAJ", "orcid": ";0000-0003-2118-0816;0000-0002-8815-464X", "linkedin": ";paul-bogdan-4b098a6/;jdeshmukh/", "or_profile": "~Panagiotis_Kyriakis1;~Paul_Bogdan1;~Jyotirmoy_Deshmukh1", "aff": "University of Southern California;University of Southern California;University of Southern California", "aff_domain": "usc.edu;usc.edu;usc.edu", "position": "PhD student;Jack Munushian Early Career Chair associate professor;Assistant Professor", "bibtex": "@inproceedings{\nkyriakis2022pareto,\ntitle={Pareto Policy Adaptation},\nauthor={Panagiotis Kyriakis and Jyotirmoy Deshmukh and Paul Bogdan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wfZGut6e09}\n}", "github": "", "project": "", "reviewers": "hSRE;T2hX;1Zn3", "pdf_size": 0, "recommendation": "5;6;8", "confidence": "4;3;2", "correctness": "3;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "43;65;93", "wc_summary_review": "41;21;2", "wc_main_review": "762;296;131", "wc_review": "846;382;226", "wc_reply_reviewers": "144;20;0", "wc_reply_authors": "1447;966;224", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "recommendation_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.0, 20.46134567096374 ], "wc_summary_review_avg": [ 21.333333333333332, 15.923427883328248 ], "wc_main_review_avg": [ 
396.3333333333333, 267.1957251820387 ], "wc_review_avg": [ 484.6666666666667, 263.3189869509772 ], "wc_reply_reviewers_avg": [ 54.666666666666664, 63.693711533313 ], "wc_reply_authors_avg": [ 879.0, 503.06328296414824 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9819805060619659, "corr_recommendation_correctness": 0.944911182523068, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1086520601314764517&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=wfZGut6e09", "email": "usc.edu;usc.edu;usc.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "wgR0BQfG5vi", "title": "Adaptive Label Smoothing with Self-Knowledge", "track": "main", "status": "Reject", "tldr": "", "abstract": "Overconfidence has been shown to impair generalization and calibration of a neural network. Previous studies remedy this issue by adding a regularization term to a loss function, preventing a model from producing a peaked distribution. Label smoothing smoothes target labels with a predefined prior label distribution; as a result, a model is trained to maximize the likelihood of predicting the soft label. Nonetheless, the amount of smoothing is the same for all samples and remains fixed in training. In other words, label smoothing does not reflect the change in probability distribution mapped by a model over the course of training. To address this issue, we propose a regularization scheme that brings a dynamic nature into the smoothing parameter by taking the model's probability distribution into account, thereby varying the parameter per instance. A model in training self-regulates the extent of smoothing on the fly during forward propagation. Furthermore, inspired by recent work in bridging label smoothing and knowledge distillation, our work utilizes self-knowledge as a prior label distribution in softening target labels, and presents theoretical support for the regularization effect by knowledge distillation.
Our regularizer is validated comprehensively on various datasets in machine translation and outperforms strong baselines not only in model performance but also in model calibration by a large margin.", "keywords": "Regularization;Model Calibration;Adaptive Label Smoothing;Self-Knowledge Distillation;Overconfidence;Natural Language Generation", "primary_area": "", "supplementary_material": "/attachment/5b68bbd7cad585bbc4cb8dfb5ed98542f3feb4a0.zip", "author": "Dongkyu Lee;Ka Chun Cheung;Nevin Zhang", "authorids": "~Dongkyu_Lee2;~Ka_Chun_Cheung1;~Nevin_Zhang1", "gender": "M;M;M", "homepage": ";;https://cse.hkust.edu.hk/~lzhang/teach/courses.html", "dblp": "93/5664;165/1089;https://dblp.uni-trier.de/pid/z/NevinLianwenZhang.html", "google_scholar": "BeJul-gAAAAJ;NvbCXToAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Dongkyu_Lee2;~Ka_Chun_Cheung1;~Nevin_Zhang1", "aff": "Hong Kong University of Science and Technology;NVIDIA;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;nvidia.com;ust.hk", "position": "PhD student;Senior Manager, Solution Architect;Full Professor", "bibtex": "@misc{\nlee2022adaptive,\ntitle={Adaptive Label Smoothing with Self-Knowledge},\nauthor={Dongkyu Lee and Ka Chun Cheung and Nevin Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=wgR0BQfG5vi}\n}", "github": "", "project": "", "reviewers": "PF2S;xcg8;WMdY;528k", "site": "https://openreview.net/forum?id=wgR0BQfG5vi", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;4;4", "correctness": "4;3;4;4", "technical_novelty": "3;3;4;2", "empirical_novelty": "3;3;4;2", "wc_summary_paper": "141;103;87;95", "wc_summary_review": "26;58;66;62", "wc_main_review": "247;248;159;228", "wc_review": "414;409;312;385", "wc_reply_reviewers": "0;7;0;0", "wc_reply_authors": "470;639;268;1043", "reply_reviewers": "0;1;0;0", "reply_authors": "2;2;2;3", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 106.5, 20.706279240848655 ], "wc_summary_review_avg": [ 53.0, 15.84297951775486 ], "wc_main_review_avg": [ 220.5, 36.39024594585752 ], "wc_review_avg": [ 380.0, 40.761501444377636 ], "wc_reply_reviewers_avg": [ 1.75, 3.031088913245535 ], "wc_reply_authors_avg": [ 605.0, 284.95350497932117 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:H6Sraji63FEJ:scholar.google.com/&scioq=Adaptive+Label+Smoothing+with+Self-Knowledge&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.ust.hk;https://www.nvidia.com", "aff_unique_abbr": "HKUST;NVIDIA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "id": "wk5-XVtitD", "title": "Language Model Pre-training Improves Generalization in Policy Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Language model (LM) pre-training has proven useful for a wide variety of language processing tasks, 
including tasks that require nontrivial planning and reasoning capabilities. Can these capabilities be leveraged for more general machine learning problems? We investigate the effectiveness of LM pretraining to scaffold learning and generalization in autonomous decision-making. We use a pre-trained GPT-2 LM to initialize an interactive policy, which we fine-tune via imitation learning to perform interactive tasks in a simulated household environment featuring partial observability, large action spaces, and long time horizons. To leverage pre-training, we first encode observations, goals, and history information as templated English strings, and train the policy to predict the next action. We find that this form of pre-training enables generalization in policy learning: for test tasks involving novel goals or environment states, initializing policies with language models improves task completion rates by nearly 20%. Additional experiments explore the role of language-based encodings in these results; we find that it is possible to train a simple adapter layer that maps from observations and action histories to LM embeddings, and thus that language modeling provides an effective initializer even for tasks with no language as input or output. Together, these results suggest that language modeling induces representations that are useful for modeling not just language, but natural goals and plans; these representations can aid learning and generalization even outside of language processing.", "keywords": "Large Language model;Imitation learning;Interactive tasks;policy learning", "primary_area": "", "supplementary_material": "/attachment/0312222badd60ac63d8486a2fb35bed6f4b79c98.zip", "author": "Shuang Li;Xavier Puig;Yilun Du;Ekin Aky\u00fcrek;Antonio Torralba;Jacob Andreas;Igor Mordatch", "authorids": "~Shuang_Li5;~Xavier_Puig1;~Yilun_Du1;~Ekin_Aky\u00fcrek1;~Antonio_Torralba1;~Jacob_Andreas1;~Igor_Mordatch4", "gender": ";M;;;M;M;", "homepage": ";https://people.csail.mit.edu/xavierpuig/;https://yilundu.github.io;;http://web.mit.edu/torralba/www//;http://web.mit.edu/jda/www;", "dblp": ";50/8429;204/4379;;t/AntonioBTorralba;97/8154;", "google_scholar": ";;;;https://scholar.google.com.tw/citations?user=8cxDHS4AAAAJ;dnZ8udEAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Shuang_Li5;~Xavier_Puig1;~Yilun_Du1;~Ekin_Aky\u00fcrek1;~Antonio_Torralba1;~Jacob_Andreas1;~Igor_Mordatch4", "aff": ";Massachusetts Institute of Technology;Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Microsoft;", "aff_domain": ";mit.edu;mit.edu;;mit.edu;microsoft.com;", "position": ";PhD student;PhD student;;Full Professor;Researcher;", "bibtex": "@misc{\nli2022language,\ntitle={Language Model Pre-training Improves Generalization in Policy Learning},\nauthor={Shuang Li and Xavier Puig and Yilun Du and Ekin Aky{\\\"u}rek and Antonio Torralba and Jacob Andreas and Igor Mordatch},\nyear={2022},\nurl={https://openreview.net/forum?id=wk5-XVtitD}\n}", "github": "", "project": "", "reviewers": "KsBt;9PU7;NUGX;g58W", "site": "https://openreview.net/forum?id=wk5-XVtitD", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "5;4;4;5", "correctness": "3;2;2;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "95;90;54;298", "wc_summary_review": "60;11;37;16", "wc_main_review": "1036;242;182;132", "wc_review": "1191;343;273;446", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", 
"recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 134.25, 95.85503377496667 ], "wc_summary_review_avg": [ 31.0, 19.3778223750761 ], "wc_main_review_avg": [ 398.0, 370.4024837929681 ], "wc_review_avg": [ 563.25, 367.6182632840757 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:K8FaycEhaE8J:scholar.google.com/&scioq=Language+Model+Pre-training+Improves+Generalization+in+Policy+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com", "aff_unique_abbr": "MIT;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding and Improving Graph Injection Attack by Promoting Unnoticeability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6046", "id": "wkMG8cdvh7-", "poster": "", "openreview": "https://openreview.net/forum?id=wkMG8cdvh7-", "slides": "https://iclr.cc/virtual/2022/poster/6046", "video": "https://iclr.cc/virtual/2022/poster/6046", "author_site": "Yongqiang Chen, Han Yang, Yonggang Zhang, MA KAILI, Tongliang Liu, Bo Han, James Cheng", "tldr": "", "abstract": "Recently Graph Injection Attack (GIA) emerges as a practical attack scenario on Graph Neural Networks (GNNs), where the adversary can merely inject few malicious nodes instead of modifying existing nodes or edges, i.e., Graph Modification Attack (GMA). Although GIA has achieved promising results, little is known about why it is successful and whether there is any pitfall behind the success. To understand the power of GIA, we compare it with GMA and find that GIA can be provably more harmful than GMA due to its relatively high flexibility. However, the high flexibility will also lead to great damage to the homophily distribution of the original graph, i.e., similarity among neighbors. Consequently, the threats of GIA can be easily alleviated or even prevented by homophily-based defenses designed to recover the original homophily. To mitigate the issue, we introduce a novel constraint \u2013 homophily unnoticeability that enforces GIA to preserve the homophily, and propose Harmonious Adversarial Objective (HAO) to instantiate it. Extensive experiments verify that GIA with HAO can break homophily-based defenses and outperform previous GIA attacks by a significant margin. 
We believe our methods can serve as a basis for a more reliable evaluation of the robustness of GNNs.", "keywords": "Graph Neural Networks;Adversarial Attacks;Node Classification", "primary_area": "", "supplementary_material": "/attachment/3b3e4c1335f5458952b6fc4f996c9af64ff1cf0e.zip", "author": "Yongqiang Chen;Han Yang;Yonggang Zhang;MA KAILI;Tongliang Liu;Bo Han;James Cheng", "authorids": "~Yongqiang_Chen1;~Han_Yang1;~Yonggang_Zhang1;~MA_KAILI1;~Tongliang_Liu1;~Bo_Han1;~James_Cheng2", "gender": ";M;M;F;M;M;M", "homepage": "https://lfhase.win;https://yanghan.me;https://yonggangzhangben.github.io/index.html;;https://tongliang-liu.github.io/;https://www.cse.cuhk.edu.hk/~jcheng/;https://bhanml.github.io/", "dblp": "76/5774-2;42/1222-2;27/6859-3;200/0854-1.html;150/6667;06/4171;241/0472-3", "google_scholar": "huQ_Ig8AAAAJ;zGiPkdsAAAAJ;XSbEr98AAAAJ;;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;;nTNjqHwAAAAJ", "orcid": ";;0000-0002-4080-7592;;;;", "linkedin": ";;;;;;", "or_profile": "~Yongqiang_Chen1;~Han_Yang1;~Yonggang_Zhang1;~MA_KAILI1;~Tongliang_Liu1;~James_Cheng2;~bo_han2", "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong;Department of Computer Science and Engineering, The Chinese University of Hong Kong;University of Science and Technology of China;Department of Computer Science and Engineering, The Chinese University of Hong Kong;University of Sydney;The Chinese University of Hong Kong;Microsoft Research", "aff_domain": "cse.cuhk.edu.hk;cse.cuhk.edu.hk;ustc.edu.cn;cse.cuhk.edu.hk;sydney.edu.au;cuhk.edu.hk;microsoft.com", "position": "PhD student;PhD student;PhD student;PhD student;Lecturer;Associate Professor;Researcher", "bibtex": "@inproceedings{\nchen2022understanding,\ntitle={Understanding and Improving Graph Injection Attack by Promoting Unnoticeability},\nauthor={Yongqiang Chen and Han Yang and Yonggang Zhang and MA KAILI and Tongliang Liu and Bo Han and James Cheng},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wkMG8cdvh7-}\n}", "github": "", "project": "", "reviewers": "iqJd;1USU;nMcc;tbGc", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;3;3;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "73;98;82;83", "wc_summary_review": "34;56;27;91", "wc_main_review": "211;561;203;319", "wc_review": "318;715;312;493", "wc_reply_reviewers": "0;0;0;45", "wc_reply_authors": "1511;1799;509;1459", "reply_reviewers": "0;0;0;1", "reply_authors": "3;4;1;4", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 84.0, 8.972179222463181 ], "wc_summary_review_avg": [ 52.0, 24.92990172463582 ], "wc_main_review_avg": [ 323.5, 144.57091685397862 ], "wc_review_avg": [ 459.5, 164.4544009748599 ], "wc_reply_reviewers_avg": [ 11.25, 19.48557158514987 ], "wc_reply_authors_avg": [ 1319.5, 485.5314099005336 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 3.0, 1.224744871391589 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11546054136768832920&as_sdt=5,33&sciodt=0,33&hl=en",
"gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=wkMG8cdvh7-", "email": "cse.cuhk.edu.hk;cse.cuhk.edu.hk;ustc.edu.cn;cse.cuhk.edu.hk;sydney.edu.au;cuhk.edu.hk;microsoft.com", "author_num": 7, "aff_unique_index": "0;0;1;0;2;0;3", "aff_unique_norm": "Chinese University of Hong Kong;University of Science and Technology of China;University of Sydney;Microsoft", "aff_unique_dep": "Department of Computer Science and Engineering;;;Microsoft Research", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.ustc.edu.cn;https://www.sydney.edu.au;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CUHK;USTC;USYD;MSR", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;1;0;2", "aff_country_unique": "China;Australia;United States" }, { "id": "wmQCFqV9r8L", "title": "SpaceMAP: Visualizing Any Data in 2-dimension by Space Expansion", "track": "main", "status": "Reject", "tldr": "", "abstract": "Dimensionality reduction (DR) and visualization of high-dimensional data is of theoretical and practical value in machine learning and related fields. In theory, there exists an intriguing, non-intuitive discrepancy between the geometry of high-dimensional space and low-dimensional space. Based on this discrepancy, we propose a novel DR and visualization method called Space-based Manifold Approximation and Projection (SpaceMAP). Our method establishes a quantitative space transformation to address the ``crowding problem\" in DR; with the proposed equivalent extended distance (EED) and function distortion (FD) theory, we are able to match the capacity of high-dimensional and low-dimensional space, in a principled manner. To handle complex high-dimensional data with different manifold properties, SpaceMAP makes distinctions between the near field, middle field, and far field of data distribution in a data-specific, hierarchical manner. We evaluated SpaceMAP on a range of artificial and real datasets with different manifold properties, and demonstrated its excellent performance in comparison with classical and state-of-the-art DR methods. 
In addition, the concept of space distortion provides a generic framework for understanding nonlinear DR methods such as t-distributed Stochastic Neighbor Embedding (tSNE) and Uniform Manifold Approximation and Projection (UMAP).", "keywords": "dimensionality reduction;data embedding;manifold learning", "primary_area": "", "supplementary_material": "", "author": "Xinrui Zu;Qian Tao", "authorids": "~Xinrui_Zu1;~Qian_Tao1", "gender": "M;F", "homepage": "https://zuxinrui.github.io;https://www.tudelft.nl/tnw/over-faculteit/afdelingen/imphys/people/qian-tao", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?hl=en;djCHmzsAAAAJ", "orcid": ";", "linkedin": "zuxinrui/;", "or_profile": "~Xinrui_Zu1;~Qian_Tao1", "aff": "Delft University of Technology;Delft University of Technology", "aff_domain": "tudelft.nl;tudelft.nl", "position": "Researcher;Assistant Professor", "bibtex": "@misc{\nzu2022spacemap,\ntitle={Space{MAP}: Visualizing Any Data in 2-dimension by Space Expansion},\nauthor={Xinrui Zu and Qian Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=wmQCFqV9r8L}\n}", "github": "", "project": "", "reviewers": "HbVB;mVWW;qZBd;gVke;29Bs", "site": "https://openreview.net/forum?id=wmQCFqV9r8L", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "4;4;4;4;3", "correctness": "2;3;3;4;2", "technical_novelty": "3;3;3;3;3", "empirical_novelty": "2;3;4;2;2", "wc_summary_paper": "114;168;30;169;139", "wc_summary_review": "29;89;19;97;48", "wc_main_review": "275;538;438;1000;341", "wc_review": "418;795;487;1266;528", "wc_reply_reviewers": "0;141;114;430;108", "wc_reply_authors": "296;604;439;694;357", "reply_reviewers": "0;1;1;1;1", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.6, 0.8 ], "wc_summary_paper_avg": [ 124.0, 51.22889809472775 ], "wc_summary_review_avg": [ 56.4, 31.404458282224834 ], "wc_main_review_avg": [ 518.4, 256.7104205130754 ], "wc_review_avg": [ 698.8, 311.07902532957763 ], "wc_reply_reviewers_avg": [ 158.6, 143.99388875921088 ], "wc_reply_authors_avg": [ 478.0, 149.5446421641377 ], "reply_reviewers_avg": [ 0.8, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.408248290463863, "corr_recommendation_correctness": 0.3273268353539886, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DWEASdyndNsJ:scholar.google.com/&scioq=SpaceMAP:+Visualizing+Any+Data+in+2-dimension+by+Space+Expansion&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Learning Super-Features for Image Retrieval", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6552", "id": "wogsFPHwftY", "poster": "", "openreview": "https://openreview.net/forum?id=wogsFPHwftY", "slides": "https://iclr.cc/virtual/2022/poster/6552", "video": "https://iclr.cc/virtual/2022/poster/6552", "author_site": "Philippe Weinzaepfel, Thomas Lucas, Diane Larlus, Yannis Kalantidis", "tldr": "", "abstract": "Methods that combine local and global features have 
recently shown excellent performance on multiple challenging deep image retrieval benchmarks, but their use of local features raises at least two issues. First, these local features simply boil down to the localized map activations of a neural network, and hence can be extremely redundant. Second, they are typically trained with a global loss that only acts on top of an aggregation of local features; by contrast, testing is based on local feature matching, which creates a discrepancy between training and testing. In this paper, we propose a novel architecture for deep image retrieval, based solely on mid-level features that we call Super-features. These Super-features are constructed by an iterative attention module and constitute an ordered set in which each element focuses on a localized and discriminant image pattern. For training, they require only image labels. A contrastive loss operates directly at the level of Super-features and focuses on those that match across images. A second complementary loss encourages diversity. Experiments on common landmark retrieval benchmarks validate that Super-features substantially outperform state-of-the-art methods when using the same number of features, and only require a significantly smaller memory footprint to match their performance. Code and models are available at: https://github.com/naver/FIRe.", "keywords": "image retrieval;landmark retrieval;mid-level features", "primary_area": "", "supplementary_material": "", "author": "Philippe Weinzaepfel;Thomas Lucas;Diane Larlus;Yannis Kalantidis", "authorids": "~Philippe_Weinzaepfel1;~Thomas_Lucas1;~Diane_Larlus1;~Yannis_Kalantidis2", "gender": "M;M;F;M", "homepage": "https://europe.naverlabs.com/people_user/Philippe-Weinzaepfel/;https://scholar.google.com/citations?user=tIVcDHUAAAAJ&hl=fr;https://dlarlus.github.io/;https://www.skamalas.com/", "dblp": "29/9989;;48/4033;33/8693", "google_scholar": "https://scholar.google.fr/citations?user=LSxIJ5cAAAAJ;tIVcDHUAAAAJ;https://scholar.google.fr/citations?user=nI2oJqkAAAAJ;QJZQgN8AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Philippe_Weinzaepfel1;~Thomas_Lucas1;~Diane_Larlus1;~Yannis_Kalantidis2", "aff": "Naver Labs Europe;Naver Labs Europe;NAVER LABS Europe;Naver Labs Europe", "aff_domain": "naverlabs.com;naverlabs.com;naverlabs.com;naverlabs.com", "position": "Research Scientist;Research scientist;Principal Researcher;Research Scientist", "bibtex": "@inproceedings{\nweinzaepfel2022learning,\ntitle={Learning Super-Features for Image Retrieval},\nauthor={Philippe Weinzaepfel and Thomas Lucas and Diane Larlus and Yannis Kalantidis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wogsFPHwftY}\n}", "github": "", "project": "", "reviewers": "1P44;9CXB;hp4Y;uoYN", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;4;4;5", "correctness": "3;4;4;4", "technical_novelty": "2;4;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "137;98;104;124", "wc_summary_review": "74;305;54;90", "wc_main_review": "367;44;281;917", "wc_review": "578;447;439;1131", "wc_reply_reviewers": "52;0;200;434", "wc_reply_authors": "1466;67;660;1615", "reply_reviewers": "1;0;2;2", "reply_authors": "2;1;2;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 115.75, 
15.594470173750693 ], "wc_summary_review_avg": [ 130.75, 101.40851788681265 ], "wc_main_review_avg": [ 402.25, 319.8651082878531 ], "wc_review_avg": [ 648.75, 283.84359689801 ], "wc_reply_reviewers_avg": [ 171.5, 168.3827485225253 ], "wc_reply_authors_avg": [ 952.0, 626.9477649692994 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 1.0, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18354886281666747980&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=wogsFPHwftY", "email": "naverlabs.com;naverlabs.com;naverlabs.com;naverlabs.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "NAVER LABS", "aff_unique_dep": "", "aff_unique_url": "https://labs.naver.com", "aff_unique_abbr": "NLE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Unknown;France" }, { "title": "A Conditional Point Diffusion-Refinement Paradigm for 3D Point Cloud Completion", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7026", "id": "wqD6TfbYkrn", "poster": "", "openreview": "https://openreview.net/forum?id=wqD6TfbYkrn", "slides": "https://iclr.cc/virtual/2022/poster/7026", "video": "https://iclr.cc/virtual/2022/poster/7026", "author_site": "Zhaoyang Lyu, Zhifeng Kong, Xudong XU, Liang Pan, Dahua Lin", "tldr": "", "abstract": "3D point clouds are an important data format that captures 3D information for real-world objects. Since 3D point clouds scanned in the real world are often incomplete, it is important to recover the complete point cloud for many downstream applications. Most existing point cloud completion methods use the Chamfer Distance (CD) loss for training. The CD loss estimates correspondences between two point clouds by searching nearest neighbors, which does not capture the overall point distribution on the generated shape, and therefore likely leads to non-uniform point cloud generation. To tackle this problem, we propose a novel Point Diffusion-Refinement (PDR) paradigm for point cloud completion. PDR consists of a Conditional Generation Network (CGNet) and a ReFinement Network (RFNet). The CGNet uses a conditional generative model called the denoising diffusion probabilistic model (DDPM) to generate a coarse completion conditioned on the partial observation. DDPM establishes a one-to-one pointwise mapping between the generated point cloud and the uniform ground truth, and then optimizes the mean squared error loss to realize uniform generation. The RFNet refines the coarse output of the CGNet and further improves the quality of the completed point cloud. In terms of the architecture, we develop a novel dual-path architecture for both networks. The architecture can (1) effectively and efficiently extract multi-level features from partially observed point clouds to guide completion, and (2) accurately manipulate spatial locations of 3D points to obtain smooth surfaces and sharp details. Extensive experimental results on various benchmark datasets show that our PDR paradigm outperforms previous state-of-the-art methods for point cloud completion.
In addition, with the help of the RFNet, we can accelerate the iterative generation process of the DDPM by up to 50 times without much performance drop.", "keywords": "Point Cloud Completion;Denoising Diffusion Pobabilistic Model;Conditional Generation", "primary_area": "", "supplementary_material": "", "author": "Zhaoyang Lyu;Zhifeng Kong;Xudong XU;Liang Pan;Dahua Lin", "authorids": "~Zhaoyang_Lyu1;~Zhifeng_Kong1;~Xudong_XU1;~Liang_Pan2;~Dahua_Lin1", "gender": "M;M;M;M;M", "homepage": ";https://cseweb.ucsd.edu/~z4kong;https://sheldontsui.github.io;https://scholar.google.com/citations?user=lSDISOcAAAAJ&hl=en;http://dahua.site", "dblp": "241/6250;206/7097;210/2741;90/343;53/6088", "google_scholar": "https://scholar.google.com.tw/citations?user=gkXFhbwAAAAJ;jAOD1dsAAAAJ;https://scholar.google.com.hk/citations?user=D8VMkA8AAAAJ;lSDISOcAAAAJ;GMzzRRUAAAAJ", "orcid": ";;;;", "linkedin": ";zhifeng-kong-745605103/;;;", "or_profile": "~Zhaoyang_Lyu1;~Zhifeng_Kong1;~Xudong_XU1;~Liang_Pan2;~Dahua_Lin1", "aff": "The Chinese University of Hong Kong;University of California, San Diego;The Chinese University of Hong Kong;Nanyang Technological University;The Chinese University of Hong Kong", "aff_domain": "cuhk.edu.hk;ucsd.edu;ie.cuhk.edu;ntu.eud.sg;cuhk.edu.hk", "position": "PhD student;PhD student;PhD student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nlyu2022a,\ntitle={A Conditional Point Diffusion-Refinement Paradigm for 3D Point Cloud Completion},\nauthor={Zhaoyang Lyu and Zhifeng Kong and Xudong XU and Liang Pan and Dahua Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wqD6TfbYkrn}\n}", "github": "", "project": "", "reviewers": "bEQt;qA73;prns", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "5;4;5", "correctness": "3;4;4", "technical_novelty": "3;4;4", "empirical_novelty": "3;3;3", "wc_summary_paper": "61;109;120", "wc_summary_review": "58;25;70", "wc_main_review": "167;274;541", "wc_review": "286;408;731", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "776;448;454", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 96.66666666666667, 25.61683474245447 ], "wc_summary_review_avg": [ 51.0, 19.026297590440446 ], "wc_main_review_avg": [ 327.3333333333333, 157.27329362892127 ], "wc_review_avg": [ 475.0, 187.74628269733242 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 559.3333333333334, 153.22604942444423 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 153, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4241075093947761257&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=wqD6TfbYkrn", "email": "cuhk.edu.hk;ucsd.edu;ie.cuhk.edu;ntu.eud.sg;cuhk.edu.hk", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Chinese University of Hong Kong;University of California, San Diego;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.cuhk.edu.hk;https://www.ucsd.edu;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;UCSD;NTU", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Hong Kong SAR;San Diego;", "aff_country_unique_index": "0;1;0;2;0", "aff_country_unique": "China;United States;Singapore" }, { "id": "wronZ3Mx_d", "title": "Transfer Learning for Bayesian HPO with End-to-End Meta-Features", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hyperparameter optimization (HPO) is a crucial component of deploying machine learning models, however, it remains an open problem due to the resource-constrained number of possible hyperparameter evaluations. As a result, prior work focus on exploring the direction of transfer learning for tackling the sample inefficiency of HPO. In contrast to existing approaches, we propose a novel Deep Kernel Gaussian Process surrogate with Landmark Meta-features (DKLM) that can be jointly meta-trained on a set of source tasks and then transferred efficiently on a new (unseen) target task. We design DKLM to capture the similarity between hyperparameter configurations with an end-to-end meta-feature network that embeds the set of evaluated configurations and their respective performance. As a result, our novel DKLM can learn contextualized dataset-specific similarity representations for hyperparameter configurations. We experimentally validate the performance of DKLM in a wide range of HPO meta-datasets from OpenML and demonstrate the empirical superiority of our method against a series of state-of-the-art baselines.", "keywords": "meta-learning;hyperparameter optimization;meta-features;deep kernel learning;Bayesian optimization;transfer learning", "primary_area": "", "supplementary_material": "/attachment/5d85e6066643af04ab119a7a0b830d6791bbf0d1.zip", "author": "Hadi Samer Jomaa;Sebastian Pineda Arango;Lars Schmidt-Thieme;Josif Grabocka", "authorids": "~Hadi_Samer_Jomaa1;~Sebastian_Pineda_Arango1;~Lars_Schmidt-Thieme1;~Josif_Grabocka1", "gender": "M;M;M;M", "homepage": "https://www.ismll.uni-hildesheim.de/personen/hsjomaa.html;;https://www.ismll.uni-hildesheim.de/personen/lst_en.html;https://www.utn.de/departments/department-engineering/machine-learning-lab/", "dblp": ";271/4257;s/LarsSchmidtThieme;117/4936", "google_scholar": "QLSZWNkAAAAJ;8UI_0B0AAAAJ;https://scholar.google.de/citations?user=l3taTdYAAAAJ;KRy27XcAAAAJ", "orcid": ";;0000-0001-5729-6023;", "linkedin": "hadisamerjomaa/;sebaspine/;;", "or_profile": "~Hadi_Samer_Jomaa1;~Sebastian_Pineda_Arango1;~Lars_Schmidt-Thieme1;~Josif_Grabocka1", "aff": "University of Hildesheim;Universit\u00e4t Freiburg;University of Hildesheim;Universit\u00e4t Freiburg", "aff_domain": "uni-hildesheim.de;uni-freiburg.de;uni-hildesheim.de;uni-freiburg.de", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@misc{\njomaa2022transfer,\ntitle={Transfer Learning for Bayesian {HPO} with End-to-End Meta-Features},\nauthor={Hadi Samer Jomaa and Sebastian Pineda Arango and Lars Schmidt-Thieme and Josif Grabocka},\nyear={2022},\nurl={https://openreview.net/forum?id=wronZ3Mx_d}\n}", "github": "", "project": "", "reviewers": "1rkp;ZdAZ;5B4g;bzFz;kJb9", "site": "https://openreview.net/forum?id=wronZ3Mx_d", "pdf_size": 0, "recommendation": "5;5;6;6;8", "confidence": "3;3;3;3;3", "correctness": "3;3;4;3;4", "technical_novelty": "3;3;2;3;3", "empirical_novelty": "3;2;2;3;3", "wc_summary_paper": "98;69;112;114;90", "wc_summary_review": "155;81;38;36;37", "wc_main_review": "1937;305;205;123;202", 
"wc_review": "2190;455;355;273;329", "wc_reply_reviewers": "703;273;713;0;192", "wc_reply_authors": "867;568;386;137;35", "reply_reviewers": "3;1;2;0;1", "reply_authors": "2;2;1;1;1", "recommendation_avg": [ 6.0, 1.0954451150103321 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 2.8, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 96.6, 16.414627622946554 ], "wc_summary_review_avg": [ 69.4, 46.07211738134031 ], "wc_main_review_avg": [ 554.4, 693.7081807215479 ], "wc_review_avg": [ 720.4, 737.1620174697011 ], "wc_reply_reviewers_avg": [ 376.2, 285.0750076734192 ], "wc_reply_authors_avg": [ 398.6, 299.63083953425087 ], "reply_reviewers_avg": [ 1.4, 1.019803902718557 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.74535599249993, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ef7lx8i9myIJ:scholar.google.com/&scioq=Transfer+Learning+for+Bayesian+HPO+with+End-to-End+Meta-Features&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Hildesheim;University of Freiburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-hildesheim.de/;https://www.uni-freiburg.de", "aff_unique_abbr": ";Uni Freiburg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "wsJodhkuqs", "title": "Coordinated Attacks Against Federated Learning: A Multi-Agent Reinforcement Learning Approach", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a model-based multi-agent reinforcement learning attack framework against federated learning systems. Our framework first approximates the distribution of the clients' aggregated data through cooperative multi-agent coordination. It then learns an attack policy through multi-agent reinforcement learning. Depending on the availability of the server's federated learning configurations, we introduce algorithms for both white-box attacks and black-box attacks. Our attack methods are capable of handling scenarios when the clients' data is independent and identically distributed and when the data is independent but not necessarily identically distributed. We further derive an upper bound on the attacker's performance loss due to inaccurate distribution estimation. Experimental results on real-world datasets demonstrate that the proposed attack framework achieves strong performance even if the server deploys advanced defense mechanisms. 
Our work sheds light on how to attack federated learning systems through multi-agent coordination.", "keywords": "Federated learning;adversarial attackers;multi-agent reinforcement learning;model-based reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Wen Shen;Henger Li;Zizhan Zheng", "authorids": "~Wen_Shen1;~Henger_Li1;~Zizhan_Zheng1", "gender": "M;M;M", "homepage": "http://wshen.net;;https://www.cs.tulane.edu/~zzheng3/", "dblp": "55/8186-1;241/9384;23/286", "google_scholar": "https://scholar.google.ca/citations?user=kSveYFUAAAAJ;https://scholar.google.com/citations?hl=en;B1v2AUYAAAAJ", "orcid": ";;", "linkedin": ";hengerli30/;", "or_profile": "~Wen_Shen1;~Henger_Li1;~Zizhan_Zheng1", "aff": ";Tulane University;Tulane University", "aff_domain": ";tulane.edu;tulane.edu", "position": ";PhD student;Assistant Professor", "bibtex": "@misc{\nshen2022coordinated,\ntitle={Coordinated Attacks Against Federated Learning: A Multi-Agent Reinforcement Learning Approach},\nauthor={Wen Shen and Henger Li and Zizhan Zheng},\nyear={2022},\nurl={https://openreview.net/forum?id=wsJodhkuqs}\n}", "github": "", "project": "", "reviewers": "MKE5;PzTn;nurU;WhVX", "site": "https://openreview.net/forum?id=wsJodhkuqs", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "3;3;3;3", "correctness": "4;3;4;3", "technical_novelty": "1;3;3;3", "empirical_novelty": "2;3;3;4", "wc_summary_paper": "60;63;61;128", "wc_summary_review": "25;119;33;60", "wc_main_review": "87;484;245;333", "wc_review": "172;666;339;521", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 78.0, 28.88771365130858 ], "wc_summary_review_avg": [ 59.25, 36.85359548266627 ], "wc_main_review_avg": [ 287.25, 143.77825809210515 ], "wc_review_avg": [ 424.5, 186.2129157711677 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.40824829046386296, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8439231343199022440&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Tulane University", "aff_unique_dep": "", "aff_unique_url": "https://www.tulane.edu", "aff_unique_abbr": "Tulane", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "wsuQ2h6KZXQ", "title": "Image-to-Image MLP-mixer for Image Reconstruction", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural networks are highly effective tools for image reconstruction problems such as denoising and compressive sensing. Neural networks for image reconstruction tasks to date are almost exclusively convolutional networks. The most popular architecture is the U-net, a convolutional network with a multi-resolution architecture. In this work, we show that a simple network based on the multi-layer perceptron (MLP)-mixer enables state-of-the-art image reconstruction performance without convolutions and without a multi-resolution architecture.
Similar to the original MLP-mixer, the image-to-image MLP-mixer is based exclusively on MLPs operating on linearly-transformed image patches. Contrary to the MLP-mixer, we incorporate structure by retaining the relative positions of the image patches. This imposes an inductive bias towards natural images, which enables the image-to-image MLP-mixer to learn to denoise images based on relatively few examples. The image-to-image MLP-mixer requires fewer parameters than the U-net to achieve the same denoising performance, and its parameters scale linearly in the image resolution instead of quadratically as for the original MLP-mixer. If trained on a moderate number of examples for denoising, the image-to-image MLP-mixer outperforms the U-net by a slight margin. It also outperforms the vision transformer tailored for image reconstruction and classical un-trained methods such as BM3D.", "keywords": "MLP-mixer;image reconstruction;denoising;compressive sensing", "primary_area": "", "supplementary_material": "", "author": "Youssef Mansour;Kang Lin;Reinhard Heckel", "authorids": "~Youssef_Mansour1;ka.lin@tum.de;~Reinhard_Heckel1", "gender": "M;;M", "homepage": "https://www.ce.cit.tum.de/mli/people/youssef-mansour/yo;;", "dblp": "305/1711;;81/9668", "google_scholar": "qA-U3n0AAAAJ;;ZWV0I7cAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Youssef_Mansour1;ka.lin@tum.de;~Reinhard_Heckel1", "aff": "Technische Universit\u00e4t M\u00fcnchen;;Technical University Munich", "aff_domain": "tum.de;;tum.de", "position": "PhD student;;Assistant Professor", "bibtex": "@misc{\nmansour2022imagetoimage,\ntitle={Image-to-Image {MLP}-mixer for Image Reconstruction},\nauthor={Youssef Mansour and Kang Lin and Reinhard Heckel},\nyear={2022},\nurl={https://openreview.net/forum?id=wsuQ2h6KZXQ}\n}", "github": "", "project": "", "reviewers": "JibE;vwax;E1ux;wbCs", "site": "https://openreview.net/forum?id=wsuQ2h6KZXQ", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;3;4", "correctness": "2;2;4;3", "technical_novelty": "1;2;2;2", "empirical_novelty": "2;2;2;1", "wc_summary_paper": "71;85;48;18", "wc_summary_review": "62;650;59;52", "wc_main_review": "681;406;145;62", "wc_review": "814;1141;252;132", "wc_reply_reviewers": "687;24;0;0", "wc_reply_authors": "1129;481;355;380", "reply_reviewers": "1;1;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 55.5, 25.362373705944798 ], "wc_summary_review_avg": [ 205.75, 256.5135230353363 ], "wc_main_review_avg": [ 323.5, 242.31023502939368 ], "wc_review_avg": [ 584.75, 411.6050139393348 ], "wc_reply_reviewers_avg": [ 177.75, 294.1788359144825 ], "wc_reply_authors_avg": [ 586.25, 316.8874997534614 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.9045340337332909, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17449623033601795702&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Technical University of Munich", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.tum.de", "aff_unique_abbr":
"TUM;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "wu5yYUutDGW", "title": "Boundary-aware Pre-training for Video Scene Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Self-supervised learning has drawn attention through its effectiveness in learning in-domain representations with no ground-truth annotations; in particular, it is shown that properly designed pretext tasks (e.g., contrastive prediction task) bring significant performance gains for a downstream task (e.g., classification task). Inspired from this, we tackle video scene segmentation, which is a task of temporally localizing scene boundaries in a video, with a self-supervised learning framework where we mainly focus on designing effective pretext tasks. In our framework, we discover a pseudo-boundary from a sequence of shots by splitting it into two continuous, non-overlapping sub-sequences and leverage the pseudo-boundary to facilitate the pre-training. Based on this, we introduce three novel boundary-aware pretext tasks: 1) Shot-Scene Matching (SSM), 2) Contextual Group Matching (CGM) and 3) Pseudo-boundary Prediction (PP); SSM and CGM guide the model to maximize intra-scene similarity and inter-scene discrimination while PP encourages the model to identify transitional moments. Through comprehensive analysis, we empirically show that pre-training and transferring contextual representation are both critical to improving the video scene segmentation performance. Lastly, we achieve the new state-of-the-art on the MovieNet-SSeg benchmark. The code will be released.", "keywords": "Temporal Segmentation;Video Scene segmentation;Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Jonghwan Mun;Minchul Shin;Gunsoo Han;Sangho Lee;Seongsu Ha;Joonseok Lee;Eun-Sol Kim", "authorids": "~Jonghwan_Mun1;~Minchul_Shin1;~Gunsoo_Han1;~Sangho_Lee5;~Seongsu_Ha1;~Joonseok_Lee1;~Eun-Sol_Kim1", "gender": "M;M;M;M;M;M;F", "homepage": "http://cvlab.postech.ac.kr/~jonghwan;https://github.com/nashory;https://github.com/robinsongh381;http://sangho.ai;https://seongsuha.github.io/;http://www.joonseok.net;", "dblp": "192/2107;117/7772;;;;77/1319.html;52/10086", "google_scholar": ";https://scholar.google.co.kr/citations?user=52NtRk8AAAAJ;;;dmvMjvcAAAAJ;https://scholar.google.co.kr/citations?user=M-MfqpMAAAAJ;JhZBnfYAAAAJ", "orcid": ";0000-0002-0638-2017;;;;;", "linkedin": ";minchul-shin-16656213b/;;;;joonseoklee;", "or_profile": "~Jonghwan_Mun1;~Minchul_Shin1;~Gunsoo_Han1;~Sangho_Lee5;~Seongsu_Ha1;~Joonseok_Lee1;~Eun-Sol_Kim1", "aff": "Kakao Brain;Kakao Brain;Kakao Brain;;Seoul National University;Google Research;Hanyang University", "aff_domain": "kakobrain.com;kakaobrain.com;kakaobrain.com;;snu.ac.kr;google.com;hanyang.ac.kr", "position": "Research Scientist;Research Scientist;Researcher;;MS student;Research Scientist;Assistant Professor", "bibtex": "@misc{\nmun2022boundaryaware,\ntitle={Boundary-aware Pre-training for Video Scene Segmentation},\nauthor={Jonghwan Mun and Minchul Shin and Gunsoo Han and Sangho Lee and Seongsu Ha and Joonseok Lee and Eun-Sol Kim},\nyear={2022},\nurl={https://openreview.net/forum?id=wu5yYUutDGW}\n}", "github": "", "project": "", "reviewers": "pa9p;3Yuj;DwYy", "site": "https://openreview.net/forum?id=wu5yYUutDGW", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "5;4;4", "correctness": "3;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;2;3", 
"wc_summary_paper": "44;68;91", "wc_summary_review": "36;26;56", "wc_main_review": "304;174;225", "wc_review": "384;268;372", "wc_reply_reviewers": "169;83;202", "wc_reply_authors": "2289;790;1685", "reply_reviewers": "1;1;1", "reply_authors": "4;1;3", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.66666666666667, 19.189117286165672 ], "wc_summary_review_avg": [ 39.333333333333336, 12.47219128924647 ], "wc_main_review_avg": [ 234.33333333333334, 53.48104544810453 ], "wc_review_avg": [ 341.3333333333333, 52.085399958998956 ], "wc_reply_reviewers_avg": [ 151.33333333333334, 50.16195991209098 ], "wc_reply_authors_avg": [ 1588.0, 615.7959618791493 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9999999999999998, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-940JEgLsvoJ:scholar.google.com/&scioq=Boundary-aware+Pre-training+for+Video+Scene+Segmentation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2;3", "aff_unique_norm": "Kakao Brain;Seoul National University;Google;Hanyang University", "aff_unique_dep": ";;Google Research;", "aff_unique_url": "https://brain.kakao.com;https://www.snu.ac.kr;https://research.google;https://www.hanyang.ac.kr", "aff_unique_abbr": "Kakao Brain;SNU;Google Research;HYU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "TAMP-S2GCNets: Coupling Time-Aware Multipersistence Knowledge Representation with Spatio-Supra Graph Convolutional Networks for Time-Series Forecasting", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7162", "id": "wv6g8fWLX2q", "poster": "", "openreview": "https://openreview.net/forum?id=wv6g8fWLX2q", "slides": "https://iclr.cc/virtual/2022/poster/7162", "video": "https://iclr.cc/virtual/2022/poster/7162", "author_site": "Yuzhou Chen, Ignacio Segovia-Dominguez, Baris Coskunuzer, Yulia Gel", "tldr": "", "abstract": "Graph Neural Networks (GNNs) are proven to be a powerful machinery for learning complex dependencies in multivariate spatio-temporal processes. However, most existing GNNs have inherently static architectures, and as a result, do not explicitly account for time dependencies of the encoded knowledge and are limited in their ability to simultaneously infer latent time-conditioned relations among entities. We postulate that such hidden time-conditioned properties may be captured by the tools of multipersistence, i.e, a emerging machinery in topological data analysis which allows us to quantify dynamics of the data shape along multiple geometric dimensions. \n We make the first step toward integrating the two rising research directions, that is, time-aware deep learning and multipersistence, and propose a new model, Time-Aware Multipersistence Spatio-Supra Graph Convolutional Network (TAMP-S2GCNets). We summarize inherent time-conditioned topological properties of the data as time-aware multipersistence Euler-Poincar\\'e surface and prove its stability. 
We then construct a supragraph convolution module which simultaneously accounts for the extracted intra- and inter- spatio-temporal dependencies in the data. Our extensive experiments on highway traffic flow, Ethereum token prices, and COVID-19 hospitalizations demonstrate that TAMP-S2GCNets outperforms the state-of-the-art tools in multivariate time series forecasting tasks.", "keywords": "topological data analysis;multipersistence;graph convolutional networks;supragraph diffusion;multivariate time series forecasting", "primary_area": "", "supplementary_material": "/attachment/d9d5823ad1434b6aef134edf896bb18ab77aead9.zip", "author": "Yuzhou Chen;Ignacio Segovia-Dominguez;Baris Coskunuzer;Yulia Gel", "authorids": "~Yuzhou_Chen1;~Ignacio_Segovia-Dominguez1;~Baris_Coskunuzer1;~Yulia_Gel1", "gender": ";;M;", "homepage": ";;https://personal.utdallas.edu/~bxc190014/;", "dblp": ";;287/4893;", "google_scholar": ";;n49tHqQAAAAJ;", "orcid": ";;0000-0001-7462-8819;", "linkedin": ";;baris-coskunuzer-2ba327169/;", "or_profile": "~Yuzhou_Chen1;~Ignacio_Segovia-Dominguez1;~Baris_Coskunuzer1;~Yulia_Gel1", "aff": ";;University of Texas, Dallas;", "aff_domain": ";;utdallas.edu;", "position": ";;Full Professor;", "bibtex": "@inproceedings{\nchen2022tampsgcnets,\ntitle={{TAMP}-S2{GCN}ets: Coupling Time-Aware Multipersistence Knowledge Representation with Spatio-Supra Graph Convolutional Networks for Time-Series Forecasting},\nauthor={Yuzhou Chen and Ignacio Segovia-Dominguez and Baris Coskunuzer and Yulia Gel},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wv6g8fWLX2q}\n}", "github": "", "project": "", "reviewers": "rDH8;8UH6;T4LP", "pdf_size": 0, "recommendation": "8;8;8", "confidence": "3;4;4", "correctness": "4;3;4", "technical_novelty": "3;4;4", "empirical_novelty": "4;4;4", "wc_summary_paper": "109;41;42", "wc_summary_review": "101;15;30", "wc_main_review": "302;167;211", "wc_review": "512;223;283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 4.0, 0.0 ], "wc_summary_paper_avg": [ 64.0, 31.822423959633664 ], "wc_summary_review_avg": [ 48.666666666666664, 37.5085175512028 ], "wc_main_review_avg": [ 226.66666666666666, 56.21585383343583 ], "wc_review_avg": [ 339.3333333333333, 124.52665935007207 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17404260762347894686&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=wv6g8fWLX2q", "email": ";;utdallas.edu;", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Texas at Dallas", "aff_unique_dep": "", "aff_unique_url": "https://www.utdallas.edu", "aff_unique_abbr": "UT Dallas", "aff_campus_unique_index": "0", "aff_campus_unique": "Dallas", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "ww6-vH7LgV", "title": "FastEnsemble: Benchmarking and Accelerating Ensemble-based Uncertainty 
Estimation for Image-to-Image Translation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Estimating prediction uncertainty and confidence of deep learning models is crucial for mission-critical machine learning applications, such as biomedical imaging for diagnostics or therapy, and self-driving cars. However, making robust uncertainty estimation is complicated given the variety of learning objectives, data modalities, and types of data corruption. Previous studies often addressed such a challenge by restricting datasets to standard ones like CIFAR or ImageNet. While convenient, it is doubtful whether the same conclusion holds for real-life datasets, in which more complicated image generation tasks are involved. This paper presents a different perspective to evaluate how confidence and uncertainty estimators behave under distribution shifts, focusing on the biomedical imaging domain. Specifically, we test a series of pair-wise cell imaging datasets using a new metric to compare existing models. In addition, we introduce FastEnsemble, a fast ensemble method which requires less than $8\\%$ of the full-ensemble training time to generate a new ensemble member. Our experiments show that the proposed fast ensemble method is able to substantially improve the speed vs. quality trade-off.", "keywords": "uncertainty estimation;confidence calibration;biomedical imaging", "primary_area": "", "supplementary_material": "", "author": "Xuanqing Liu;Sara Imboden;Marie Payne;Neil Lin;Cho-Jui Hsieh", "authorids": "~Xuanqing_Liu1;imbodens@student.ethz.ch;mpayne6@g.ucla.edu;neillin@g.ucla.edu;~Cho-Jui_Hsieh1", "gender": "M;;;;M", "homepage": ";;;;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": "205/2594;;;;14/2770", "google_scholar": ";;;;Wy89g4IAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Xuanqing_Liu1;imbodens@student.ethz.ch;mpayne6@g.ucla.edu;neillin@g.ucla.edu;~Cho-Jui_Hsieh1", "aff": "Amazon;;;;University of California, Los Angeles", "aff_domain": "amazon.com;;;;ucla.edu", "position": "Researcher;;;;Assistant Professor", "bibtex": "@misc{\nliu2022fastensemble,\ntitle={FastEnsemble: Benchmarking and Accelerating Ensemble-based Uncertainty Estimation for Image-to-Image Translation},\nauthor={Xuanqing Liu and Sara Imboden and Marie Payne and Neil Lin and Cho-Jui Hsieh},\nyear={2022},\nurl={https://openreview.net/forum?id=ww6-vH7LgV}\n}", "github": "", "project": "", "reviewers": "vgxk;SwFr;4uy9;CJNd", "site": "https://openreview.net/forum?id=ww6-vH7LgV", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "2;4;3;5", "correctness": "3;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "38;65;67;64", "wc_summary_review": "211;50;38;53", "wc_main_review": "211;742;307;254", "wc_review": "460;857;412;371", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 58.5, 11.884864324004713 ], "wc_summary_review_avg": [ 88.0, 71.23552484540281 ], "wc_main_review_avg": [ 378.5, 212.60350420442273 ], "wc_review_avg": [ 525.0, 194.2511261228619 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ],
"authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.6024640760767093, "corr_recommendation_correctness": -0.5555555555555555, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:--P3ax2_5ocJ:scholar.google.com/&scioq=FastEnsemble:+Benchmarking+and+Accelerating+Ensemble-based+Uncertainty+Estimation+for+Image-to-Image+Translation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Amazon;University of California, Los Angeles", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.ucla.edu", "aff_unique_abbr": "Amazon;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning to Remember Patterns: Pattern Matching Memory Networks for Traffic Forecasting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6957", "id": "wwDg3bbYBIq", "poster": "", "openreview": "https://openreview.net/forum?id=wwDg3bbYBIq", "slides": "https://iclr.cc/virtual/2022/poster/6957", "video": "https://iclr.cc/virtual/2022/poster/6957", "author_site": "Hyunwook Lee, Seungmin Jin, Hyeshin Chu, Hongkyu Lim, Sungahn Ko", "tldr": "", "abstract": "Traffic forecasting is a challenging problem due to complex road networks and sudden speed changes caused by various events on roads. Several models have been proposed to solve this challenging problem, with a focus on learning the spatio-temporal dependencies of roads. In this work, we propose a new perspective for converting the forecasting problem into a pattern-matching task, assuming that large traffic data can be represented by a set of patterns. To evaluate the validity of this new perspective, we design a novel traffic forecasting model called Pattern-Matching Memory Networks (PM-MemNet), which learns to match input data to representative patterns with a key-value memory structure. We first extract and cluster representative traffic patterns that serve as keys in the memory. Then, by matching the extracted keys and inputs, PM-MemNet acquires the necessary information on existing traffic patterns from the memory and uses it for forecasting. To model the spatio-temporal correlation of traffic, we proposed a novel memory architecture, GCMem, which integrates attention and graph convolution. The experimental results indicate that PM-MemNet is more accurate than state-of-the-art models, such as Graph WaveNet, with higher responsiveness. 
We also present a qualitative analysis describing how PM-MemNet works and achieves higher accuracy when road speed changes rapidly.", "keywords": "Traffic Forecasting;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Hyunwook Lee;Seungmin Jin;Hyeshin Chu;Hongkyu Lim;Sungahn Ko", "authorids": "~Hyunwook_Lee1;skyjin@unist.ac.kr;hyeshinchu@unist.ac.kr;limhongkyu1219@unist.ac.kr;~Sungahn_Ko1", "gender": "M;;;;M", "homepage": ";;;;https://sites.google.com/view/haiv/", "dblp": ";;;;16/9189", "google_scholar": "GTWj-V4AAAAJ;;;;gKnZiVcAAAAJ", "orcid": "0000-0002-5506-7347;;;;", "linkedin": "hyunwook-lee-2b15ba283;;;;", "or_profile": "~Hyunwook_Lee1;skyjin@unist.ac.kr;hyeshinchu@unist.ac.kr;limhongkyu1219@unist.ac.kr;~Sungahn_Ko1", "aff": "Ulsan National Institute of Science and Technology;;;;Ulsan National Institute of Science and Technology", "aff_domain": "unist.ac.kr;;;;unist.ac.kr", "position": "PhD student;;;;Associate Professor", "bibtex": "@inproceedings{\nlee2022learning,\ntitle={Learning to Remember Patterns: Pattern Matching Memory Networks for Traffic Forecasting},\nauthor={Hyunwook Lee and Seungmin Jin and Hyeshin Chu and Hongkyu Lim and Sungahn Ko},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=wwDg3bbYBIq}\n}", "github": "", "project": "", "reviewers": "mucH;xDZJ;Cncx;L94p", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;5;2", "correctness": "3;2;4;4", "technical_novelty": "3;2;4;3", "empirical_novelty": "2;2;4;3", "wc_summary_paper": "58;30;54;50", "wc_summary_review": "22;24;3;29", "wc_main_review": "288;332;209;68", "wc_review": "368;386;266;147", "wc_reply_reviewers": "0;0;9;0", "wc_reply_authors": "817;1397;325;117", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 48.0, 10.770329614269007 ], "wc_summary_review_avg": [ 19.5, 9.86154146165801 ], "wc_main_review_avg": [ 224.25, 100.40013695209782 ], "wc_review_avg": [ 291.75, 95.27952298369257 ], "wc_reply_reviewers_avg": [ 2.25, 3.897114317029974 ], "wc_reply_authors_avg": [ 664.0, 493.66689173976414 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.22075539284417395, "corr_recommendation_correctness": 0.7543365091413573, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11909851991809109995&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=wwDg3bbYBIq", "email": "unist.ac.kr;;;;unist.ac.kr", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Ulsan National Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.unist.ac.kr", "aff_unique_abbr": "UNIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "wwIBobGFj2V", "title": "RoQNN: Noise-Aware Training for Robust Quantum Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Quantum Neural Network (QNN) is a promising application towards quantum advantage on near-term quantum hardware. 
However, due to the large quantum noises (errors), the performance of QNN models degrades severely on real quantum devices. For example, the accuracy gap between noise-free simulation and noisy results on IBMQ-Yorktown for MNIST-4 classification is over 60%. Existing noise mitigation methods are general ones without leveraging unique characteristics of QNN and are only applicable to inference; on the other hand, existing QNN work does not consider noise effects. To this end, we present RoQNN, a QNN-specific framework to perform noise-aware optimizations in both training and inference stages to improve robustness. \nWe analytically deduce and experimentally observe that the effect of quantum noise on the QNN measurement outcome is a linear map of the noise-free outcome with a scaling and a shift factor. Motivated by that, we propose post-measurement normalization to mitigate the feature distribution differences between noise-free and noisy scenarios. Furthermore, to improve the robustness against noise, we propose noise injection to the training process by inserting quantum error gates to QNN according to realistic noise models of quantum hardware. Finally, post-measurement quantization is introduced to quantize the measurement outcomes to discrete values, achieving a denoising effect. Extensive experiments on 8 classification tasks using 6 quantum devices demonstrate that RoQNN improves accuracy by up to 43% and 22% on average, and achieves over 94% 2-class, 80% 4-class, and 34% 10-class classification accuracy on real quantum computers. We also open-source our PyTorch library for construction and noise-aware training of QNN at this link: https://anonymous.4open.science/r/iclr-roqnn-DE27/ .", "keywords": "Quantum Computing;Machine Learning;Neural Networks;Robustness;Quantum Machine Learning;Quantum Neural Networks", "primary_area": "", "supplementary_material": "/attachment/3d6124b85c7d2accab3f338e0202f6a52d42353c.zip", "author": "Hanrui Wang;Jiaqi Gu;Yongshan Ding;Zirui Li;Frederic Chong;David Z. Pan;Song Han", "authorids": "~Hanrui_Wang1;~Jiaqi_Gu3;~Yongshan_Ding1;suffix_array@sjtu.edu.cn;~Frederic_Chong1;~David_Z._Pan1;~Song_Han5", "gender": "M;M;;;M;M;", "homepage": "https://hanruiwang.me;https://scopex-asu.github.io;https://www.yongshanding.com;;http://people.cs.uchicago.edu/~ftchong/;http://users.ece.utexas.edu/~dpan/;", "dblp": "214/9819-2;;;;;p/DavidZhigangPan.html;", "google_scholar": "https://scholar.google.com/citations?hl=en;FeIV12MAAAAJ;;;;3aLlroEAAAAJ;", "orcid": "0000-0002-7229-4015;;;;;0000-0002-5705-2501;", "linkedin": "hanrui-wang-34458217a/;;;;;davidzpan/;", "or_profile": "~Hanrui_Wang1;~Jiaqi_Gu3;~Yongshan_Ding1;suffix_array@sjtu.edu.cn;~Frederic_Chong1;~David_Z._Pan1;~Song_Han5", "aff": "Massachusetts Institute of Technology;University of Texas, Austin;Yale University;;University of Chicago;University of Texas, Austin;", "aff_domain": "mit.edu;utexas.edu;yale.edu;;uchicago.edu;utexas.edu;", "position": "PhD student;PhD student;Assistant Professor;;Full Professor;Professor;", "bibtex": "@misc{\nwang2022roqnn,\ntitle={Ro{QNN}: Noise-Aware Training for Robust Quantum Neural Networks},\nauthor={Hanrui Wang and Jiaqi Gu and Yongshan Ding and Zirui Li and Frederic Chong and David Z.
Pan and Song Han},\nyear={2022},\nurl={https://openreview.net/forum?id=wwIBobGFj2V}\n}", "github": "", "project": "", "reviewers": "35rW;d4fC;Drde;ERRr", "site": "https://openreview.net/forum?id=wwIBobGFj2V", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "5;4;5;3", "correctness": "4;2;3;4", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "65;114;125;73", "wc_summary_review": "24;32;30;28", "wc_main_review": "223;179;480;260", "wc_review": "312;325;635;361", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "559;73;81;37", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 94.25, 25.703842125254347 ], "wc_summary_review_avg": [ 28.5, 2.958039891549808 ], "wc_main_review_avg": [ 285.5, 115.89758409906567 ], "wc_review_avg": [ 408.25, 132.1388947282366 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 187.5, 215.12496368390165 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.6998739952495694, "corr_recommendation_correctness": 0.47886115464444223, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16191022626703480543&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Massachusetts Institute of Technology;University of Texas at Austin;Yale University;University of Chicago", "aff_unique_dep": ";;;", "aff_unique_url": "https://web.mit.edu;https://www.utexas.edu;https://www.yale.edu;https://www.uchicago.edu", "aff_unique_abbr": "MIT;UT Austin;Yale;UChicago", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "wwVb95CkrFm", "title": "Neuro-Symbolic Ontology-Mediated Query Answering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recently, low-dimensional vector space representations of Knowledge Graphs (KGs) have been applied to find answers to logical queries over incomplete KGs. However, the current methods only focus on inductive reasoning, i.e. answering such queries by predicting facts based on patterns learned from the data, and lack the ability of deductive reasoning, the task of computing logical entailments using expert domain knowledge. To address this shortcoming, we investigate how existing embedding models for query answering over incomplete KGs can be adapted to incorporate domain knowledge in the form of ontologies. We propose two novel datasets, based on LUBM and NELL KGs, as well as various training strategies to integrate domain knowledge into prominent representatives of embedding models for query answering. Our strategies involve (1) different ontology-driven data augmentation techniques and (2) adaptation of the loss function using query-rewriting methods. 
The achieved improvements in the settings that require both inductive and deductive reasoning range from 20% to 50% in HITS@3.", "keywords": "knowledge graph embeddings;ontologies;logical query answering;reasoning", "primary_area": "", "supplementary_material": "", "author": "Medina Andresel;Daria Stepanova;Trung-Kien Tran;Csaba Domokos;Pasquale Minervini", "authorids": "~Medina_Andresel1;~Daria_Stepanova1;~Trung-Kien_Tran1;~Csaba_Domokos3;~Pasquale_Minervini1", "gender": ";;M;M;M", "homepage": ";;;https://sites.google.com/site/cdomokosres/;https://www.neuralnoise.com", "dblp": "179/0969;89/2024-1;73/11442;08/1147;58/10142", "google_scholar": ";;KqUMncQAAAAJ;ouFnqFgAAAAJ;https://scholar.google.it/citations?user=9sk6CSgAAAA", "orcid": ";;;;0000-0002-8442-602X", "linkedin": ";;;;pasquale-mauro-minervini-47a08324/", "or_profile": "~Medina_Andresel1;~Daria_Stepanova1;~Trung-Kien_Tran1;~Csaba_Domokos3;~Pasquale_Minervini1", "aff": "TU Wien Vienna University of Technology;Robert Bosch GmbH, Bosch;Bosch;Bosch Center for Artificial intelligence;University College London, University of London", "aff_domain": "tuwien.ac.at;de.bosch.com;bosch.com;bosch.com;ucl.ac.uk", "position": "PhD student;Research scientist;Researcher;Research Scientist;Postdoc", "bibtex": "@misc{\nandresel2022neurosymbolic,\ntitle={Neuro-Symbolic Ontology-Mediated Query Answering},\nauthor={Medina Andresel and Daria Stepanova and Trung-Kien Tran and Csaba Domokos and Pasquale Minervini},\nyear={2022},\nurl={https://openreview.net/forum?id=wwVb95CkrFm}\n}", "github": "", "project": "", "reviewers": "cmg7;3TPL;YwJs", "site": "https://openreview.net/forum?id=wwVb95CkrFm", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;4;4", "correctness": "2;2;4", "technical_novelty": "1;3;3", "empirical_novelty": "1;2;4", "wc_summary_paper": "80;132;82", "wc_summary_review": "13;70;52", "wc_main_review": "597;280;124", "wc_review": "690;482;258", "wc_reply_reviewers": "260;0;0", "wc_reply_authors": "1126;182;58", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.9428090415820634 ], "technical_novelty_avg": [ 2.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_summary_paper_avg": [ 98.0, 24.055491403558285 ], "wc_summary_review_avg": [ 45.0, 23.790754506740637 ], "wc_main_review_avg": [ 333.6666666666667, 196.79487346529692 ], "wc_review_avg": [ 476.6666666666667, 176.40357769110642 ], "wc_reply_reviewers_avg": [ 86.66666666666667, 122.56517540566823 ], "wc_reply_authors_avg": [ 455.3333333333333, 476.92720152614027 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.944911182523068, "corr_recommendation_correctness": 0.7559289460184546, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4110263811024695004&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Vienna University of Technology;Robert Bosch GmbH;Bosch Center for Artificial Intelligence;University College London", "aff_unique_dep": ";;Center for Artificial Intelligence;", "aff_unique_url": "https://www.tuwien.ac.at;https://www.bosch.com;https://www.bosch-ai.com;https://www.ucl.ac.uk",
"aff_unique_abbr": "TU Wien;Bosch;BCAI;UCL", "aff_campus_unique_index": "0", "aff_campus_unique": "Vienna;", "aff_country_unique_index": "0;1;1;1;2", "aff_country_unique": "Austria;Germany;United Kingdom" }, { "id": "wxVpa5z4DU1", "title": "Accuracy-Privacy Trade-off in Deep Ensemble: A Membership Inference Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep ensemble learning has been shown to improve accuracy by training multiple neural networks and fusing their outputs. Ensemble learning has also been used to defend against membership inference attacks that undermine privacy. In this paper, we empirically demonstrate a trade-off between these two goals, namely accuracy and privacy (in terms of membership inference attacks), in deep ensembles. Using a wide range of datasets and model architectures, we show that the effectiveness of membership inference attacks also increases when ensembling improves accuracy. To better understand this trade-off, we study the impact of various factors such as prediction confidence and agreement between models that constitute the ensemble. Finally, we evaluate defenses against membership inference attacks based on regularization and differential privacy. We show that while these defenses can mitigate the effectiveness of the membership inference attack, they simultaneously degrade ensemble accuracy. We illustrate similar trade-off in more advanced and state-of-the-art ensembling techniques, such as snapshot ensembles and diversified ensemble networks. The source code is available in supplementary materials.", "keywords": "Ensemble Learning;Deep Ensemble;Membership Inference", "primary_area": "", "supplementary_material": "/attachment/1e37253bd58c4bd292012f27694bf029fd3c8f52.zip", "author": "Shahbaz Rezaei;Zubair Shafiq;Xin Liu", "authorids": "~Shahbaz_Rezaei1;~Zubair_Shafiq2;~Xin_Liu6", "gender": "M;Not Specified;F", "homepage": "http://www.shrezaei.com/;https://web.cs.ucdavis.edu/~zubair/;https://xinliu.engineering.ucdavis.edu/", "dblp": ";;76/1820-2", "google_scholar": "dSQ7ka8AAAAJ;Q3FvLzUAAAAJ;4MV5BkQAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Shahbaz_Rezaei1;~Zubair_Shafiq2;~Xin_Liu6", "aff": "University of California, Davis;Michigan State University;University of California, Davis", "aff_domain": "ucdavis.edu;msu.edu;ucdavis.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nrezaei2022accuracyprivacy,\ntitle={Accuracy-Privacy Trade-off in Deep Ensemble: A Membership Inference Perspective},\nauthor={Shahbaz Rezaei and Zubair Shafiq and Xin Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=wxVpa5z4DU1}\n}", "github": "", "project": "", "reviewers": "McDt;YtVz;H6K7;G7t1", "site": "https://openreview.net/forum?id=wxVpa5z4DU1", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;3;4", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "52;90;24;31", "wc_summary_review": "40;91;61;7", "wc_main_review": "139;856;225;397", "wc_review": "231;1037;310;435", "wc_reply_reviewers": "18;280;18;10", "wc_reply_authors": "121;1329;379;522", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 49.25, 25.68438241422207 ], "wc_summary_review_avg": [ 49.75, 30.621683493890405 ], 
"wc_main_review_avg": [ 404.25, 276.8658294192333 ], "wc_review_avg": [ 503.25, 316.627837531699 ], "wc_reply_reviewers_avg": [ 81.5, 114.65055603877374 ], "wc_reply_authors_avg": [ 587.75, 451.44400261826496 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2522169389821928748&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Davis;Michigan State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucdavis.edu;https://www.msu.edu", "aff_unique_abbr": "UC Davis;MSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Davis;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "wzJnpBhRILm", "title": "Extreme normalization: approximating full-data batch normalization with single examples", "track": "main", "status": "Reject", "tldr": "", "abstract": "While batch normalization has been successful in speeding up the training of neural networks, it is not well understood. We cast batch normalization as an approximation of the limiting case where the entire dataset is normalized jointly, and explore other ways to approximate the gradient from this limiting case. We demonstrate an approximation that removes the need to keep more than one example in memory at any given time, at the cost of a small factor increase in the training step computation, as well as a fully per-example training procedure, which removes the extra computation at the cost of a small drop in the final model accuracy. We further use our insights to improve batch renormalization for very small minibatches. 
Unlike previously proposed methods, our normalization does not change the function class of the inference model, and performs well in the absence of identity shortcuts.", "keywords": "batch normalization;optimization", "primary_area": "", "supplementary_material": "", "author": "Sergey Ioffe", "authorids": "~Sergey_Ioffe3", "gender": "M", "homepage": "", "dblp": "93/2096", "google_scholar": "S5zOyIkAAAAJ", "orcid": "", "linkedin": "sergey-ioffe-1758821/", "or_profile": "~Sergey_Ioffe3", "aff": "Google", "aff_domain": "google.com", "position": "Researcher", "bibtex": "@misc{\nioffe2022extreme,\ntitle={Extreme normalization: approximating full-data batch normalization with single examples},\nauthor={Sergey Ioffe},\nyear={2022},\nurl={https://openreview.net/forum?id=wzJnpBhRILm}\n}", "github": "", "project": "", "reviewers": "CZjS;cEEb;mQAF;bk8j", "site": "https://openreview.net/forum?id=wzJnpBhRILm", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;3;4;2", "correctness": "2;3;4;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "46;52;59;93", "wc_summary_review": "51;29;92;37", "wc_main_review": "725;194;92;63", "wc_review": "822;275;243;193", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 62.5, 18.200274723201296 ], "wc_summary_review_avg": [ 52.25, 24.262883175748097 ], "wc_main_review_avg": [ 268.5, 268.01352577808456 ], "wc_review_avg": [ 383.25, 254.9925244002263 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.5443310539518174, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:t6wouGe-_KEJ:scholar.google.com/&scioq=Extreme+normalization:+approximating+full-data+batch+normalization+with+single+examples&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "x-YLAN2wJI", "title": "ESCo: Towards Provably Effective and Scalable Contrastive Representation Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "InfoNCE-based contrastive learning models (e.g., MoCo, SimCLR, etc.) have shown inspiring power in unsupervised representation learning by maximizing a tight lower bound of the mutual information of two views' representations. However, its quadratic complexity makes it hard to scale to larger batch sizes, and some recent research suggests that it may exploit superfluous information that is useless for downstream prediction tasks. In this paper, we propose ESCo (Effective and Scalable Contrastive), a new contrastive framework which is essentially an instantiation of the Information Bottleneck principle under self-supervised learning settings.
Specifically, ESCo targets a new objective that seeks to maximize the similarity between the representations of positive pairs and minimize the pair-wise kernel potential of negative pairs, with a provable guarantee of effective representations that preserve task-relevant information and discard the irrelevant one. Furthermore, to escape from the quadratic time complexity and memory cost, we propose to leverage Random Features to achieve accurate approximation with linear scalability. We show that the vanilla InfoNCE objective is a degenerate case of ESCo, which implies that ESCo can potentially boost existing InfoNCE-based models. To verify our method, we conduct extensive experiments on both synthetic and real-world datasets, showing its superior performance over the InfoNCE-based baselines in (unsupervised) representation learning tasks for images and graphs.", "keywords": "Contrastive Learning;Unsupervised Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Hengrui Zhang;Qitian Wu;Shaofeng Zhang;Junchi Yan;David Wipf;Philip S. Yu", "authorids": "~Hengrui_Zhang1;~Qitian_Wu1;~Shaofeng_Zhang1;~Junchi_Yan2;~David_Wipf1;~Philip_S._Yu1", "gender": "M;;M;;M;M", "homepage": "https://hengruizhang98.github.io;;https://sherrylone.github.io;;http://www.davidwipf.com/;https://cs.uic.edu/profiles/philip-yu/", "dblp": ";;132/2540;;81/6421;y/PhilipSYu", "google_scholar": "iwffiD0AAAAJ;;VoVVJIgAAAAJ;;YJx1WSgAAAAJ;D0lL1r0AAAAJ", "orcid": "0009-0006-1330-0899;;;;;0000-0002-3491-5968", "linkedin": ";;;;;", "or_profile": "~Hengrui_Zhang1;~Qitian_Wu1;~Shaofeng_Zhang1;~Junchi_Yan2;~David_Wipf1;~Philip_S._Yu1", "aff": "University of Illinois, Chicago;;Shanghai Jiaotong University;;Amazon AI Research Lab;University of Illinois Chicago", "aff_domain": "uic.edu;;sjtu.edu.cn;;amazon.com;uic.edu", "position": "PhD student;;PhD student;;Principal Research Scientist;Full Professor", "bibtex": "@misc{\nzhang2022esco,\ntitle={{ESC}o: Towards Provably Effective and Scalable Contrastive Representation Learning},\nauthor={Hengrui Zhang and Qitian Wu and Shaofeng Zhang and Junchi Yan and David Wipf and Philip S.
Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=x-YLAN2wJI}\n}", "github": "", "project": "", "reviewers": "QZEi;LjFu;nQi1;Gvzf", "site": "https://openreview.net/forum?id=x-YLAN2wJI", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;3;2", "correctness": "3;3;3;4", "technical_novelty": "3;1;1;2", "empirical_novelty": "2;1;1;2", "wc_summary_paper": "61;58;121;89", "wc_summary_review": "31;16;19;62", "wc_main_review": "354;139;222;322", "wc_review": "446;213;362;473", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 82.25, 25.43005112067217 ], "wc_summary_review_avg": [ 32.0, 18.207141456033124 ], "wc_main_review_avg": [ 259.25, 84.79792155471736 ], "wc_review_avg": [ 373.5, 101.30276402941827 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2527195483992010682&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Illinois at Chicago;Shanghai Jiao Tong University;Amazon", "aff_unique_dep": ";;Amazon AI Research Lab", "aff_unique_url": "https://www.uic.edu;https://www.sjtu.edu.cn;https://www.amazon.com", "aff_unique_abbr": "UIC;SJTU;Amazon AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "x3F9PuOUKZc", "title": "Subpixel object segmentation using wavelets and multiresolution analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a novel deep learning framework for fast prediction of boundaries of two-dimensional simply connected domains using wavelets and Multiresolution Analysis (MRA). The boundaries are modelled as (piecewise) smooth closed curves using wavelets and the so-called Pyramid Algorithm. Our network architecture is a hybrid analog of the U-Net, where the down-sampling path is a two-dimensional encoder with learnable filters, and the upsampling path is a one-dimensional decoder, which builds curves up from low to high resolution levels. Any wavelet basis induced by an MRA can be used. This flexibility allows for incorporation of priors on the smoothness of curves. The effectiveness of the proposed method is demonstrated by delineating boundaries of simply connected domains (organs) in medical images using Daubechies wavelets and comparing performance with a U-Net baseline.
Our model demonstrates up to 5x faster inference speed compared to the U-Net, while maintaining similar performance in terms of Dice score and Hausdorff distance.", "keywords": "Segmentation;wavelets;contour prediction;multiresolution analysis", "primary_area": "", "supplementary_material": "", "author": "Ray Sheombarsing;Nikita Moriakov;Jan-jakob Sonke;Jonas Teuwen", "authorids": "~Ray_Sheombarsing2;~Nikita_Moriakov1;~Jan-jakob_Sonke1;~Jonas_Teuwen1", "gender": ";;;M", "homepage": ";https://www.aiforoncology.nl/people/nikita-moriakov/;;https://aiforoncology.nl", "dblp": ";218/6930;20/4093;213/7835", "google_scholar": ";iTVHO5oAAAAJ;https://scholar.google.com/citations?hl=nl;Jz3tRZMAAAAJ", "orcid": ";;0000-0001-5155-5274;0000-0002-1825-1428", "linkedin": ";;;jonasteuwen/", "or_profile": "~Ray_Sheombarsing2;~Nikita_Moriakov1;~Jan-jakob_Sonke1;~Jonas_Teuwen1", "aff": ";Radboud University Medical Center;University of Amsterdam;Radboud University Medical Center", "aff_domain": ";radboudumc.nl;uva.nl;radboudumc.nl", "position": ";Postdoc;Full Professor;Researcher", "bibtex": "@misc{\nsheombarsing2022subpixel,\ntitle={Subpixel object segmentation using wavelets and multiresolution analysis},\nauthor={Ray Sheombarsing and Nikita Moriakov and Jan-jakob Sonke and Jonas Teuwen},\nyear={2022},\nurl={https://openreview.net/forum?id=x3F9PuOUKZc}\n}", "github": "", "project": "", "reviewers": "MBzp;xHYe;ZJwW;w2LA", "site": "https://openreview.net/forum?id=x3F9PuOUKZc", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;5;3", "correctness": "2;2;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "34;101;51;80", "wc_summary_review": "14;62;24;40", "wc_main_review": "104;183;81;285", "wc_review": "152;346;156;405", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "638;706;209;760", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 66.5, 25.83118270617898 ], "wc_summary_review_avg": [ 35.0, 18.138357147217054 ], "wc_main_review_avg": [ 163.25, 79.8259826121796 ], "wc_review_avg": [ 264.75, 112.70619991819439 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 578.25, 217.5251422249851 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14241069778105297331&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Radboud University Medical Center;University of Amsterdam", "aff_unique_dep": ";", "aff_unique_url": "https://www.radboudumc.nl;https://www.uva.nl", "aff_unique_abbr": "RadboudUMC;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "id": "x4NvCoi2Wnb", "title": "MIKE - Multi-task Implicit Knowledge Embeddings by Autoencoding through a Shared Input Space", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this work, we introduce a method of learning Multi-task Implicit Knowledge Embeddings (MIKE) from a set of source (or \"teacher\") networks by autoencoding 
through a shared input space. MIKE uses an autoencoder to produce a reconstruction of a given input space optimized to induce the same activations in the source networks. This results in an encoder that takes inputs in the same format as the source networks and maps them to a latent semantic space which represents patterns in the data that are salient to the source networks. We present the results of our first experiments that use 11 segmentation tasks derived from the COCO dataset, which demonstrate the basic feasibility of MIKE.", "keywords": "Representation Learning;Joint Embedding;Multi-task Learning;Knowledge Transfer;Distillation;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Ryan Anthony Dellana;William Severa;Felix Wang;Esteban J Guillen;Jaimie Murdock", "authorids": "~Ryan_Anthony_Dellana1;~William_Severa1;felwang@sandia.gov;ejguill@sandia.gov;jmmurdo@sandia.gov", "gender": ";M;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": "ryan-dellana/;;;;", "or_profile": "~Ryan_Anthony_Dellana1;~William_Severa1;felwang@sandia.gov;ejguill@sandia.gov;jmmurdo@sandia.gov", "aff": "Sandia National Laboratories;Sandia National Laboratories;;;", "aff_domain": "sandia.gov;sandia.gov;;;", "position": "Postdoc;Staff;;;", "bibtex": "@misc{\ndellana2022mike,\ntitle={{MIKE} - Multi-task Implicit Knowledge Embeddings by Autoencoding through a Shared Input Space},\nauthor={Ryan Anthony Dellana and William Severa and Felix Wang and Esteban J Guillen and Jaimie Murdock},\nyear={2022},\nurl={https://openreview.net/forum?id=x4NvCoi2Wnb}\n}", "github": "", "project": "", "reviewers": "tXme;4jro;fuGi;wgZg", "site": "https://openreview.net/forum?id=x4NvCoi2Wnb", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "3;4;4;4", "correctness": "2;2;2;2", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;3;1;2", "wc_summary_paper": "94;125;158;111", "wc_summary_review": "8;26;68;9", "wc_main_review": "305;149;201;117", "wc_review": "407;300;427;237", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "382;401;565;378", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 122.0, 23.50531854708632 ], "wc_summary_review_avg": [ 27.75, 24.31434761617099 ], "wc_main_review_avg": [ 193.0, 71.27411872482185 ], "wc_review_avg": [ 342.75, 77.84078301250572 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 431.5, 77.56448929761608 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gdbyRejON8UJ:scholar.google.com/&scioq=MIKE+-+Multi-task+Implicit+Knowledge+Embeddings+by+Autoencoding+through+a+Shared+Input+Space&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Sandia National Laboratories", "aff_unique_dep": "", "aff_unique_url": "https://www.sandia.gov", "aff_unique_abbr": "SNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "x4tkHYGpTdq", "title": "DSEE: Dually Sparsity-embedded 
Efficient Tuning of Pre-trained Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Gigantic pre-trained models have become central to natural language processing (NLP), serving as the starting point for fine-tuning towards a range of downstream tasks. However, two pain points persist for this paradigm: (a) as the pre-trained models grow bigger (e.g., $175$B parameters for GPT-3), even the fine-tuning process can be time-consuming and computationally expensive; (b) the fine-tuned model has the same size as its starting point by default, which is neither sensible due to its more specialized functionality, nor practical since many fine-tuned models will be deployed in resource-constrained environments. To address these pain points, we propose a framework for resource- and parameter-efficient fine-tuning by leveraging the sparsity prior in both weight updates and the final model weights. Our proposed framework, dubbed $\\textbf{D}$ually $\\textbf{S}$parsity-$\\textbf{E}$mbedded $\\textbf{E}$fficient Tuning (DSEE), aims to achieve two key objectives: (i) $\\textit{parameter efficient fine-tuning}$ - by enforcing sparsity-aware weight updates on top of the pre-trained weights; and (ii) $\\textit{resource-efficient inference}$ - by encouraging a sparse weight structure towards the final fine-tuned model. We leverage sparsity in these two directions by exploiting both unstructured and structural sparse patterns in pre-trained language models via magnitude-based pruning and $\\ell_1$ sparse regularization. Extensive experiments and in-depth investigations, with diverse network backbones (i.e., BERT, GPT-2, and DeBERTa) on dozens of datasets, consistently demonstrate highly impressive parameter-/training-/inference-efficiency, while maintaining competitive downstream transfer performance. 
For instance, our DSEE-BERT obtains about $35\\%$ inference FLOPs savings with $<0.1\\%$ trainable parameters and comparable performance to conventional fine-tuning.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/158b75bab4397905a08e7b5e40eef6fcf0337349.zip", "author": "Xuxi Chen;Tianlong Chen;Yu Cheng;Weizhu Chen;Zhangyang Wang;Ahmed Hassan Awadallah", "authorids": "~Xuxi_Chen1;~Tianlong_Chen1;~Yu_Cheng1;~Weizhu_Chen1;~Zhangyang_Wang1;~Ahmed_Hassan_Awadallah1", "gender": "Unspecified;M;M;M;M;M", "homepage": ";https://tianlong-chen.github.io;https://ych133.github.io;https://www.microsoft.com/en-us/research/people/wzchen/;https://vita-group.github.io;https://www.microsoft.com/en-us/research/people/hassanam/publications/", "dblp": "267/9662;;96/3060-1.html;79/2536;119/4026;147/9148", "google_scholar": "afsDlKYAAAAJ;LE3ctn0AAAAJ;https://scholar.google.com/citations?hl=en;LG_E-4EAAAAJ;pxFyKAIAAAAJ;sNGk-9MAAAAJ", "orcid": ";0000-0001-7774-8197;;;;", "linkedin": ";tianlong-chen-783862167/;chengyu05/;;;ahmed-hassan-awadallah-a355a27/", "or_profile": "~Xuxi_Chen1;~Tianlong_Chen1;~Yu_Cheng1;~Weizhu_Chen1;~Zhangyang_Wang1;~Ahmed_Hassan_Awadallah1", "aff": "University of Texas at Austin;University of Texas, Austin;Microsoft Research;Microsoft GenAI;University of Texas, Austin;Microsoft Research", "aff_domain": "utexas.edu;utexas.edu;microsoft.com;microsoft.com;utexas.edu;microsoft.com", "position": "PhD student;PhD student;Principal Researcher;Vice President;Assistant Professor;Principal Researcher", "bibtex": "@misc{\nchen2022dsee,\ntitle={{DSEE}: Dually Sparsity-embedded Efficient Tuning of Pre-trained Language Models},\nauthor={Xuxi Chen and Tianlong Chen and Yu Cheng and Weizhu Chen and Zhangyang Wang and Ahmed Hassan Awadallah},\nyear={2022},\nurl={https://openreview.net/forum?id=x4tkHYGpTdq}\n}", "github": "", "project": "", "reviewers": "UaPn;uLTd;KvuY;q6NF;gnKh", "site": "https://openreview.net/forum?id=x4tkHYGpTdq", "pdf_size": 0, "recommendation": "3;3;5;5;5", "confidence": "5;3;4;4;4", "correctness": "2;3;3;2;3", "technical_novelty": "1;2;2;3;2", "empirical_novelty": "1;2;3;2;3", "wc_summary_paper": "36;71;72;111;54", "wc_summary_review": "163;74;20;11;85", "wc_main_review": "681;395;538;346;165", "wc_review": "880;540;630;468;304", "wc_reply_reviewers": "150;194;168;109;0", "wc_reply_authors": "1127;903;460;460;363", "reply_reviewers": "1;1;1;1;0", "reply_authors": "2;2;1;1;1", "recommendation_avg": [ 4.2, 0.9797958971132712 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "correctness_avg": [ 2.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.2, 0.7483314773547882 ], "wc_summary_paper_avg": [ 68.8, 24.862823652996457 ], "wc_summary_review_avg": [ 70.6, 54.5329258338483 ], "wc_main_review_avg": [ 425.0, 174.9548513188474 ], "wc_review_avg": [ 564.4, 190.57974708766932 ], "wc_reply_reviewers_avg": [ 124.2, 67.98941094023392 ], "wc_reply_authors_avg": [ 662.6, 298.4336442159295 ], "reply_reviewers_avg": [ 0.8, 0.4 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.16666666666666663, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16871599540645728205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1;1;0;1", "aff_unique_norm": "University of Texas at Austin;Microsoft", "aff_unique_dep": ";Microsoft Research", 
"aff_unique_url": "https://www.utexas.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UT Austin;MSR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "x8l2miKNqPb", "title": "Generate Triggers in Neural Relation Extraction", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "In the relation extraction task, the relationship between two entities is determined by some specific words in their source text. These words are called relation triggers, which are the evidence to explain the relationship; other words are called ir-relevant words. The current relationship extraction neural network model aims at identifying the relation type between two entities mentioned in source text by encoding the text and entities. However, these models cannot output the relation triggers, but only gives the result of relation classification. Although models can generate weights for every single word through the improvement of attention mechanism, the weights will be affected by irrelevant words essentially, which are not required by the relation extraction task. In order to output re-lation triggers accurately, we propose a novel training frame-work for Relation Extraction (RE) that reduces the negative effect of irrelevant words on them in the encoding stage. In specific, we leverage Evolutive Mask based Point Network (EMPN) as a decoder to generate relation triggers and encode these words again. For an ordered output in relation triggers, we utilize order loss to constrain the output order in them. Ex-tensive experiment results demonstrate that the effectiveness of our proposed model achieves state-of-the-art performance on three RE benchmark datasets.", "keywords": "relation triggers\uff0cevolutive mask\uff0c pointer network", "primary_area": "", "supplementary_material": "/attachment/6ac4187dbe1a4d87d739f658a0fdd6190f2439a7.zip", "author": "Liu Yujiang", "authorids": "~Liu_Yujiang1", "gender": "M", "homepage": "", "dblp": "92/9609", "google_scholar": "jJpDEwwAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Liu_Yujiang1", "aff": ", Chinese Academy of Sciences", "aff_domain": "ucas.ac.cn", "position": "PhD student", "bibtex": "@misc{\nyujiang2022generate,\ntitle={Generate Triggers in Neural Relation Extraction},\nauthor={Liu Yujiang},\nyear={2022},\nurl={https://openreview.net/forum?id=x8l2miKNqPb}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=x8l2miKNqPb", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:7EgYOcelaw4J:scholar.google.com/&scioq=Generate+Triggers+in+Neural+Relation+Extraction&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "", "aff_unique_url": "http://www.cas.cn", "aff_unique_abbr": "CAS", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Dropout Q-Functions for Doubly Efficient Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6233", "id": "xCVJMsPv3RT", "poster": "", "openreview": "https://openreview.net/forum?id=xCVJMsPv3RT", "slides": "https://iclr.cc/virtual/2022/poster/6233", "video": "https://iclr.cc/virtual/2022/poster/6233", "author_site": "Takuya Hiraoka, Takahisa Imagawa, Taisei Hashimoto, Takashi Onishi, Yoshimasa Tsuruoka", "tldr": "", "abstract": "Randomized ensembled double Q-learning (REDQ) (Chen et al., 2021b) has recently achieved state-of-the-art sample efficiency on continuous-action reinforcement learning benchmarks. This superior sample efficiency is made possible by using a large Q-function ensemble. However, REDQ is much less computationally efficient than non-ensemble counterparts such as Soft Actor-Critic (SAC) (Haarnoja et al., 2018a). To make REDQ more computationally efficient, we propose a method of improving computational efficiency called DroQ, which is a variant of REDQ that uses a small ensemble of dropout Q-functions. Our dropout Q-functions are simple Q-functions equipped with dropout connection and layer normalization. Despite its simplicity of implementation, our experimental results indicate that DroQ is doubly (sample and computationally) efficient. It achieved comparable sample efficiency with REDQ, much better computational efficiency than REDQ, and comparable computational efficiency with that of SAC.", "keywords": "Reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/f092f9beff8d577d4e346d244650718a5959f424.zip", "author": "Takuya Hiraoka;Takahisa Imagawa;Taisei Hashimoto;Takashi Onishi;Yoshimasa Tsuruoka", "authorids": "~Takuya_Hiraoka1;~Takahisa_Imagawa1;~Taisei_Hashimoto1;~Takashi_Onishi1;~Yoshimasa_Tsuruoka1", "gender": "M;M;M;;M", "homepage": "https://takuyahiraoka.github.io/;;https://www.logos.t.u-tokyo.ac.jp/members-en.html;;https://www.logos.t.u-tokyo.ac.jp/~tsuruoka/", "dblp": "31/977.html;;;64/9012;18/3787", "google_scholar": "https://scholar.google.co.jp/citations?user=L3vFMj0AAAAJ;https://scholar.google.co.jp/citations?user=MgEUW3EAAAAJ;;;J2CkFngAAAAJ", "orcid": ";;;;", "linkedin": "https://linkedin.com/in/takuya-hiraoka-33a62a167;;;;", "or_profile": "~Takuya_Hiraoka1;~Takahisa_Imagawa1;~Taisei_Hashimoto1;~Takashi_Onishi1;~Yoshimasa_Tsuruoka1", "aff": "NEC;AIST;;NEC;The University of Tokyo", "aff_domain": "nec.com;aist.go.jp;;nec.com;u-tokyo.ac.jp", "position": "Researcher;Postdoc;;Researcher;Full Professor", "bibtex": "@inproceedings{\nhiraoka2022dropout,\ntitle={Dropout Q-Functions for Doubly Efficient Reinforcement Learning},\nauthor={Takuya Hiraoka and Takahisa Imagawa and Taisei Hashimoto and Takashi Onishi and Yoshimasa Tsuruoka},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xCVJMsPv3RT}\n}", "github": "", "project": "", "reviewers": "1PuF;RMuR;fiFc", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "4;3;4", "correctness": "3;3;4", "technical_novelty": "2;3;3", "empirical_novelty": "3;3;3", 
"wc_summary_paper": "45;90;39", "wc_summary_review": "206;18;38", "wc_main_review": "733;182;240", "wc_review": "984;290;317", "wc_reply_reviewers": "187;26;66", "wc_reply_authors": "1587;873;487", "reply_reviewers": "1;1;1", "reply_authors": "3;3;2", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 58.0, 22.759613353482084 ], "wc_summary_review_avg": [ 87.33333333333333, 84.30631978419859 ], "wc_main_review_avg": [ 385.0, 247.2097624825255 ], "wc_review_avg": [ 530.3333333333334, 320.980096302282 ], "wc_reply_reviewers_avg": [ 93.0, 68.44462481938714 ], "wc_reply_authors_avg": [ 982.3333333333334, 455.67922440633123 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 130, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=207538077714334096&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=xCVJMsPv3RT", "email": "nec.com;aist.go.jp;;nec.com;u-tokyo.ac.jp", "author_num": 5, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "NEC Corporation;Advanced Industrial Science and Technology;University of Tokyo", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nec.com;https://www.aist.go.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "NEC;AIST;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "id": "xD3RiCCfsY", "title": "On Learning to Solve Cardinality Constrained Combinatorial Optimization in One-Shot: A Re-parameterization Approach via Gumbel-Sinkhorn-TopK", "track": "main", "status": "Reject", "tldr": "", "abstract": "Cardinality constrained combinatorial optimization requires selecting an optimal subset of $k$ elements, and it will be appealing to design data-driven algorithms that perform TopK selection over a probability distribution predicted by a neural network. However, the existing differentiable TopK operator suffers from an unbounded gap between the soft prediction and the discrete solution, leading to inaccurate estimation of the combinatorial objective score. In this paper, we present a self-supervised learning pipeline for cardinality constrained combinatorial optimization, which incorporates with Gumbel-Sinkhorn-TopK (GS-TopK) for near-discrete TopK predictions and the re-parameterization trick resolving the non-differentiable challenge. Theoretically, we characterize a bounded gap between the Maximum-A-Posteriori (MAP) inference and our proposed method, resolving the divergence issue in the previous differentiable TopK operator and also providing a more accurate estimation of the objective score given a provable tightened bound to the discrete decision variables. 
Experiments on max covering and discrete clustering problems show that our method outperforms the state-of-the-art Gurobi solver and the novel one-shot learning method Erdos Goes Neural.\n", "keywords": "Combinatorial Problems;Gumbel Re-parameterization;Optimal Transport;Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Runzhong Wang;Li Shen;Yiting Chen;Junchi Yan;Xiaokang Yang;Dacheng Tao", "authorids": "~Runzhong_Wang1;~Li_Shen1;~Yiting_Chen1;~Junchi_Yan2;~Xiaokang_Yang1;~Dacheng_Tao1", "gender": "M;M;M;;M;", "homepage": "http://runzhong.wang;https://sites.google.com/site/mathshenli/home;https://ytchen981.github.io/;;https://icne.sjtu.edu.cn/info/1064/1078.htm;", "dblp": "239/4351;91/3680-8;135/6971;;06/3071-1.html;", "google_scholar": "uoM0g3cAAAAJ;yVhgENIAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;yDEavdMAAAAJ;", "orcid": "0000-0002-9566-738X;;;;0000-0003-4029-3322;", "linkedin": ";;;;;", "or_profile": "~Runzhong_Wang1;~Li_Shen1;~Yiting_Chen1;~Junchi_Yan2;~Xiaokang_Yang1;~Dacheng_Tao1", "aff": "Shanghai Jiaotong University;JD Explore Academy;Shanghai Jiaotong University;;Shanghai Jiaotong University;", "aff_domain": "sjtu.edu.cn;jd.com;sjtu.edu.cn;;sjtu.edu.cn;", "position": "PhD student;Researcher;PhD student;;Full Professor;", "bibtex": "@misc{\nwang2022on,\ntitle={On Learning to Solve Cardinality Constrained Combinatorial Optimization in One-Shot: A Re-parameterization Approach via Gumbel-Sinkhorn-TopK},\nauthor={Runzhong Wang and Li Shen and Yiting Chen and Junchi Yan and Xiaokang Yang and Dacheng Tao},\nyear={2022},\nurl={https://openreview.net/forum?id=xD3RiCCfsY}\n}", "github": "", "project": "", "reviewers": "mjiu;J7pw;fJUe;BoAG", "site": "https://openreview.net/forum?id=xD3RiCCfsY", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;3;4", "correctness": "3;3;3;3", "technical_novelty": "3;3;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "87;36;61;60", "wc_summary_review": "32;133;59;60", "wc_main_review": "819;595;314;468", "wc_review": "938;764;434;588", "wc_reply_reviewers": "0;896;0;151", "wc_reply_authors": "1371;3395;508;1166", "reply_reviewers": "0;4;0;1", "reply_authors": "2;6;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 61.0, 18.041618552668716 ], "wc_summary_review_avg": [ 71.0, 37.51666296460814 ], "wc_main_review_avg": [ 549.0, 184.93377192930447 ], "wc_review_avg": [ 681.0, 188.80942773071476 ], "wc_reply_reviewers_avg": [ 261.75, 371.33702683680764 ], "wc_reply_authors_avg": [ 1610.0, 1078.759240980118 ], "reply_reviewers_avg": [ 1.25, 1.6393596310755 ], "reply_authors_avg": [ 2.75, 1.920286436967152 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:inmIzsf9cFwJ:scholar.google.com/&scioq=On+Learning+to+Solve+Cardinality+Constrained+Combinatorial+Optimization+in+One-Shot:+A+Re-parameterization+Approach+via+Gumbel-Sinkhorn-TopK&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;JD", "aff_unique_dep": ";JD Explore Academy", "aff_unique_url": "https://www.sjtu.edu.cn;", "aff_unique_abbr": "SJTU;", "aff_campus_unique_index": "", "aff_campus_unique": "",
"aff_country_unique_index": "0;0;0", "aff_country_unique": "China;" }, { "title": "On the approximation properties of recurrent encoder-decoder architectures", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7137", "id": "xDIvIqQ3DXD", "poster": "", "openreview": "https://openreview.net/forum?id=xDIvIqQ3DXD", "slides": "https://iclr.cc/virtual/2022/poster/7137", "video": "https://iclr.cc/virtual/2022/poster/7137", "author_site": "Zhong Li, Haotian Jiang, Qianxiao Li", "tldr": "", "abstract": "Encoder-decoder architectures have recently gained popularity in sequence to sequence modelling, featuring in state-of-the-art models such as transformers. However, a mathematical understanding of their working principles still remains limited. In this paper, we study the approximation properties of recurrent encoder-decoder architectures. Prior work established theoretical results for RNNs in the linear setting, where approximation capabilities can be related to smoothness and memory of target temporal relationships. Here, we uncover that the encoder and decoder together form a particular \u201ctemporal product structure\u201d which determines the approximation efficiency. Moreover, the encoder-decoder architecture generalises RNNs with the capability to learn time-inhomogeneous relationships. Our results provide the theoretical understanding of approximation properties of the recurrent encoder-decoder architecture, which precisely characterises, in the considered setting, the types of temporal relationships that can be efficiently learned.", "keywords": "encoder-decoder;recurrent neural networks;approximation;temporal product", "primary_area": "", "supplementary_material": "", "author": "Zhong Li;Haotian Jiang;Qianxiao Li", "authorids": "~Zhong_Li2;~Haotian_Jiang1;~Qianxiao_Li1", "gender": "M;M;M", "homepage": "https://www.microsoft.com/en-us/research/people/lzhong/;;https://blog.nus.edu.sg/qianxiaoli/", "dblp": ";;172/0930.html", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com/citations?hl=en;https://scholar.google.com.sg/citations?user=zLgReYoAAAAJ", "orcid": ";;0000-0002-3903-3737", "linkedin": ";%E6%98%8A%E5%A4%A9-%E5%A7%9C-307951110/;", "or_profile": "~Zhong_Li2;~Haotian_Jiang1;~Qianxiao_Li1", "aff": "Peking University;;National University of Singapore", "aff_domain": "pku.edu.cn;;nus.edu.sg", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nli2022on,\ntitle={On the approximation properties of recurrent encoder-decoder architectures},\nauthor={Zhong Li and Haotian Jiang and Qianxiao Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xDIvIqQ3DXD}\n}", "github": "", "project": "", "reviewers": "e39c;WtE5;Qn54", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "2;3;3", "correctness": "3;4;3", "technical_novelty": "2;4;3", "empirical_novelty": "2;0;3", "wc_summary_paper": "81;23;268", "wc_summary_review": "84;21;66", "wc_main_review": "172;100;479", "wc_review": "337;144;813", "wc_reply_reviewers": "119;0;0", "wc_reply_authors": "751;203;1338", "reply_reviewers": "1;0;0", "reply_authors": "2;1;2", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 1.247219128924647 ], 
"wc_summary_paper_avg": [ 124.0, 104.54026337572843 ], "wc_summary_review_avg": [ 57.0, 26.49528259898354 ], "wc_main_review_avg": [ 250.33333333333334, 164.341784772535 ], "wc_review_avg": [ 431.3333333333333, 281.1456720081049 ], "wc_reply_reviewers_avg": [ 39.666666666666664, 56.09713797413277 ], "wc_reply_authors_avg": [ 764.0, 463.4529821531702 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.9999999999999998, "corr_recommendation_correctness": 0.49999999999999983, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13725527534758167519&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=xDIvIqQ3DXD", "email": "pku.edu.cn;;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Peking University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "Peking U;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Singapore" }, { "title": "Sample Selection with Uncertainty of Losses for Learning with Noisy Labels", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7175", "id": "xENf4QUL4LW", "poster": "", "openreview": "https://openreview.net/forum?id=xENf4QUL4LW", "slides": "https://iclr.cc/virtual/2022/poster/7175", "video": "https://iclr.cc/virtual/2022/poster/7175", "author_site": "Xiaobo Xia, Tongliang Liu, Bo Han, Mingming Gong, Jun Yu, Gang Niu, Masashi Sugiyama", "tldr": "", "abstract": "In learning with noisy labels, the sample selection approach is very popular, which regards small-loss data as correctly labeled data during training. However, losses are generated on-the-\ufb02y based on the model being trained with noisy labels, and thus large-loss data are likely but not certain to be incorrect. There are actually two possibilities of a large-loss data point: (a) it is mislabeled, and then its loss decreases slower than other data, since deep neural networks learn patterns \ufb01rst; (b) it belongs to an underrepresented group of data and has not been selected yet. In this paper, we incorporate the uncertainty of losses by adopting interval estimation instead of point estimation of losses, where lower bounds of the con\ufb01dence intervals of losses derived from distribution-free concentration inequalities, but not losses themselves, are used for sample selection. In this way, we also give large-loss but less selected data a try; then, we can better distinguish between the cases (a) and (b) by seeing if the losses effectively decrease with the uncertainty after the try. As a result, we can better explore underrepresented data that are correctly labeled but seem to be mislabeled at \ufb01rst glance. 
Experiments demonstrate that the proposed method is superior to baselines and robust to a broad range of label noise types.", "keywords": "Learning with noisy labels;Sample selection;Uncertainty", "primary_area": "", "supplementary_material": "/attachment/971d5ae740f63bbe9d8adfd2be565cc003f7fc0a.zip", "author": "Xiaobo Xia;Tongliang Liu;Bo Han;Mingming Gong;Jun Yu;Gang Niu;Masashi Sugiyama", "authorids": "~Xiaobo_Xia1;~Tongliang_Liu1;~Bo_Han1;~Mingming_Gong1;~Jun_Yu3;~Gang_Niu1;~Masashi_Sugiyama1", "gender": "M;M;M;M;M;M;M", "homepage": "https://xiaoboxia.github.io/;https://tongliang-liu.github.io/;https://mingming-gong.github.io/;https://faculty.ustc.edu.cn/yujun_AI/en/index.htm;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/;https://bhanml.github.io/", "dblp": "242/8072;150/6667;98/8479;50/5754-1.html;26/3367-1;35/1228;241/0472-3", "google_scholar": "jRsugY0AAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;efZyqyQAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;nTNjqHwAAAAJ", "orcid": ";;0000-0001-7147-5589;0000-0002-3197-8103;;0000-0001-6658-6743;", "linkedin": ";;;;;;", "or_profile": "~Xiaobo_Xia1;~Tongliang_Liu1;~Mingming_Gong1;~Jun_Yu3;~Gang_Niu1;~Masashi_Sugiyama1;~bo_han2", "aff": "The University of Sydney;University of Sydney;University of Melbourne;University of Science and Technology of China;RIKEN;The University of Tokyo;Microsoft Research", "aff_domain": "sydney.edu.au;sydney.edu.au;unimelb.edu.au;ustc.edu.cn;riken.jp;u-tokyo.ac.jp;microsoft.com", "position": "PhD student;Lecturer;Assistant Professor;Associate Professor;Research Scientist (tenured);Full Professor;Researcher", "bibtex": "@inproceedings{\nxia2022sample,\ntitle={Sample Selection with Uncertainty of Losses for Learning with Noisy Labels},\nauthor={Xiaobo Xia and Tongliang Liu and Bo Han and Mingming Gong and Jun Yu and Gang Niu and Masashi Sugiyama},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xENf4QUL4LW}\n}", "github": "", "project": "", "reviewers": "jAuy;7WR9;LJ1a;5G3L;45Q2", "pdf_size": 0, "recommendation": "5;6;6;8;8", "confidence": "5;4;4;3;2", "correctness": "3;3;3;4;4", "technical_novelty": "3;4;3;3;4", "empirical_novelty": "3;3;3;3;4", "wc_summary_paper": "29;33;110;51;128", "wc_summary_review": "72;38;66;77;23", "wc_main_review": "90;282;385;75;66", "wc_review": "191;353;561;203;217", "wc_reply_reviewers": "0;22;12;0;25", "wc_reply_authors": "457;224;414;125;189", "reply_reviewers": "0;1;1;0;1", "reply_authors": "3;1;1;1;1", "recommendation_avg": [ 6.6, 1.2 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 3.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.2, 0.39999999999999997 ], "wc_summary_paper_avg": [ 70.2, 40.92627517866731 ], "wc_summary_review_avg": [ 55.2, 21.008569680013917 ], "wc_main_review_avg": [ 179.6, 130.0378406464826 ], "wc_review_avg": [ 305.0, 140.70110163037103 ], "wc_reply_reviewers_avg": [ 11.8, 10.552724766618335 ], "wc_reply_authors_avg": [ 281.8, 130.1620528418325 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.8000000000000002 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.9478946531678893, "corr_recommendation_correctness": 0.9525793444156803, "gs_citation": 159, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9085351633773763677&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=xENf4QUL4LW", "email": "sydney.edu.au;sydney.edu.au;unimelb.edu.au;ustc.edu.cn;riken.jp;u-tokyo.ac.jp;microsoft.com", "author_num": 7, "aff_unique_index": "0;0;1;2;3;4;5", "aff_unique_norm": "University of Sydney;University of Melbourne;University of Science and Technology of China;RIKEN;University of Tokyo;Microsoft", "aff_unique_dep": ";;;;;Microsoft Research", "aff_unique_url": "https://www.sydney.edu.au;https://www.unimelb.edu.au;http://www.ustc.edu.cn;https://www.riken.jp;https://www.u-tokyo.ac.jp;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "USYD;UniMelb;USTC;RIKEN;UTokyo;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2;2;3", "aff_country_unique": "Australia;China;Japan;United States" }, { "id": "xEaJvbVKeT", "title": "Open-Set Representation Learning through Combinatorial Embedding", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Visual recognition tasks are often limited to dealing with a small subset of classes simply because the labels for the remaining classes are unavailable. We are interested in identifying novel concepts in a dataset through representation learning based on the examples in both labeled and unlabeled classes, and extending the horizon of recognition to both known and novel classes. To address this challenging task, we propose a combinatorial learning approach, which naturally clusters the examples in unseen classes using the compositional knowledge given by multiple supervised meta-classifiers on heterogeneous label spaces. We also introduce a metric learning strategy to estimate pairwise pseudo-labels for improving representations of unlabeled examples, which preserves semantic relations across known and novel classes effectively. The proposed algorithm discovers novel concepts via a joint optimization of enhancing the discrimitiveness of unseen classes as well as learning the representations of known classes generalizable to novel ones. 
Our extensive experiments demonstrate remarkable performance gains by the proposed approach in multiple image retrieval and novel class discovery benchmarks.", "keywords": "Open-Set;Combinatorial Learning;Retrieval;Novel Class Discovery", "primary_area": "", "supplementary_material": "/attachment/de6f4a0448c661431bf0bcff17109ed58e6bf048.zip", "author": "Geeho Kim;Junoh Kang;Bohyung Han", "authorids": "~Geeho_Kim1;~Junoh_Kang2;~Bohyung_Han1", "gender": "M;M;Not Specified", "homepage": "https://cv.snu.ac.kr/;https://junoh-kang.github.io/;http://cvlab.snu.ac.kr/~bhhan", "dblp": "255/6972;355/1822;73/4880.html", "google_scholar": "jDEWojYAAAAJ;TLGqhucAAAAJ;9aaeCToAAAAJ", "orcid": ";;", "linkedin": ";junohkang;", "or_profile": "~Geeho_Kim1;~Junoh_Kang2;~Bohyung_Han1", "aff": "Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;Full Professor", "bibtex": "@misc{\nkim2022openset,\ntitle={Open-Set Representation Learning through Combinatorial Embedding},\nauthor={Geeho Kim and Junoh Kang and Bohyung Han},\nyear={2022},\nurl={https://openreview.net/forum?id=xEaJvbVKeT}\n}", "github": "", "project": "", "reviewers": "92oM;K1dC;ZpcM;v3UH", "site": "https://openreview.net/forum?id=xEaJvbVKeT", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;5;5;3", "correctness": "3;4;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "55;70;97;130", "wc_summary_review": "29;65;55;77", "wc_main_review": "116;296;321;393", "wc_review": "200;431;473;600", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 88.0, 28.53944638566067 ], "wc_summary_review_avg": [ 56.5, 17.684739183827393 ], "wc_main_review_avg": [ 281.5, 101.97180982997213 ], "wc_review_avg": [ 426.0, 144.5562174380611 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14068364093983108851&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Quantitative Performance Assessment of CNN Units via Topological Entropy Calculation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7129", "id": "xFOyMwWPkz", "poster": "", "openreview": "https://openreview.net/forum?id=xFOyMwWPkz", "slides": "https://iclr.cc/virtual/2022/poster/7129", "video": "https://iclr.cc/virtual/2022/poster/7129", "author_site": "Yang Zhao, Hao Zhang", "tldr": "", "abstract": " Identifying the status of individual network units is critical for understanding the mechanism of convolutional neural networks (CNNs). 
However, it is still challenging to reliably give a general indication of unit status, especially for units in different network models. To this end, we propose a novel method for quantitatively clarifying the status of a single unit in a CNN using algebraic topological tools. Unit status is indicated via the calculation of a topology-based entropy, called feature entropy, which measures the degree of chaos of the global spatial pattern hidden in the unit for a category. In this way, feature entropy can provide an accurate indication of status for units in different networks and under diverse situations such as weight rescaling. Further, we show that feature entropy decreases as the layer goes deeper and closely tracks the loss during training. We show that investigating the feature entropy of units on training data alone can discriminate between networks with different generalization ability, from the viewpoint of the effectiveness of their feature representations.\n", "keywords": "interpretation of neural network units;computational topology;convolutional neural networks;entropy", "primary_area": "", "supplementary_material": "", "author": "Yang Zhao;Hao Zhang", "authorids": "~Yang_Zhao11;~Hao_Zhang37", "gender": "M;M", "homepage": ";http://ee.tsinghua.edu.cn", "dblp": "50/2082-16;", "google_scholar": "KF9ag1sAAAAJ;", "orcid": "0000-0001-5883-2799;", "linkedin": ";", "or_profile": "~Yang_Zhao11;~Hao_Zhang37", "aff": "Tsinghua University;", "aff_domain": "tsinghua.edu.cn;", "position": "PhD student;", "bibtex": "@inproceedings{\nzhao2022quantitative,\ntitle={Quantitative Performance Assessment of {CNN} Units via Topological Entropy Calculation},\nauthor={Yang Zhao and Hao Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xFOyMwWPkz}\n}", "github": "", "project": "", "reviewers": "CZcz;qJ7D;F2EM;LHvg", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "3;4;4;4", "correctness": "4;3;3;3", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;2;0", "wc_summary_paper": "93;65;182;87", "wc_summary_review": "33;117;52;69", "wc_main_review": "245;518;423;203", "wc_review": "371;700;657;359", "wc_reply_reviewers": "0;248;0;22", "wc_reply_authors": "1555;1998;1201;1420", "reply_reviewers": "0;1;0;1", "reply_authors": "3;3;2;3", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 106.75, 44.6787141712919 ], "wc_summary_review_avg": [ 67.75, 31.155858197135252 ], "wc_main_review_avg": [ 347.25, 128.60477246198914 ], "wc_review_avg": [ 521.75, 157.542652954684 ], "wc_reply_reviewers_avg": [ 67.5, 104.59804013460291 ], "wc_reply_authors_avg": [ 1543.5, 291.2305787516139 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.75, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.6622661785325219, "corr_recommendation_correctness": -0.6622661785325219, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1435924268107503891&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=xFOyMwWPkz", "email": "tsinghua.edu.cn;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "",
"aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "xGZcxaYbJBF", "title": "A Multi-Task Learning Algorithm for Non-personalized Recommendations", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "In this paper, we introduce a multi-task learning (MTL) algorithm for recommending non-personalized videos to watch next on industrial video sharing platforms. Personalized recommendations have been studied for decades, while researches on non-personalized solutions are very rare to be seen, which still remain a huge portion in industry. As an indispensable part in recommender system, non-personalized video recommender system also faces several real-world challenges, including maintaining high relevance between source item and target items, as well as achieving multiple competing ranking objectives. To solve these, we largely extended model-based collaborative filtering algorithm by adding related candidate generation stage, Two-tower DNN structure and a multi-task learning mechanism. Compared with typical baseline solutions, our proposed algorithm can capture both linear and non-linear relationships from user-item interactions, and live experiments demonstrate that it can significantly advance the state of the art on recommendation quality.", "keywords": "Recommendation and Ranking;Non-personalized Recommendations;Multitask Learning;collaborative filtering;Two-tower DNN", "primary_area": "", "supplementary_material": "", "author": "Jiawei Zhang", "authorids": "~Jiawei_Zhang8", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "jiawzhan/", "or_profile": "~Jiawei_Zhang8", "aff": "Peking University", "aff_domain": "pku.edu.cn", "position": "MS student", "bibtex": "@misc{\nzhang2022a,\ntitle={A Multi-Task Learning Algorithm for Non-personalized Recommendations},\nauthor={Jiawei Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=xGZcxaYbJBF}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=xGZcxaYbJBF", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TKogbMm491AJ:scholar.google.com/&scioq=A+Multi-Task+Learning+Algorithm+for+Non-personalized+Recommendations&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "id": "xIAxm1b4pWc", "title": "Improving Sentiment Classification Using 0-Shot Generated Labels for Custom 
Transformer Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present an approach to improve sentiment classification for transformers (based on BERT and DistilBERT) using additional embeddings to represent emotion inputs. We used HuggingFace's 0-shot prediction pipeline to generate probabilities of whether emotions apply to a given sample. We generated 0-shot probabilities for 1.6 million samples from a sentiment classification dataset and a smaller sentiment airline dataset using 63 emotions. Then we added custom tokens to BERT's embeddings and tokenizers representing various levels of emotion for each predicted emotion. Finally, depending on the probability of each emotion, the respective custom token representing that level was prepended to the text input of the model to process and train for classification. We additionally test direct classification layer addition of emotion inputs and an ensemble of BERT and DistilBERT models both using emotion inputs achieving a modest increase in sentiment prediction accuracy. Our results show modest improvement in all cases over the original model for both BERT and DistilBERT tested with added emotion inputs generated from 0-shot pretrained models.", "keywords": "Sentiment Classification;Transformer;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Ryan Bluteau;Robin Gras", "authorids": "~Ryan_Bluteau1;rgras@uwindsor.ca", "gender": "M;", "homepage": "https://www.researchgate.net/profile/Ryan-Bluteau;", "dblp": ";", "google_scholar": ";", "orcid": "0000-0001-9777-0711;", "linkedin": ";", "or_profile": "~Ryan_Bluteau1;rgras@uwindsor.ca", "aff": "University of Windsor;", "aff_domain": "uwindsor.ca;", "position": "PhD student;", "bibtex": "@misc{\nbluteau2022improving,\ntitle={Improving Sentiment Classification Using 0-Shot Generated Labels for Custom Transformer Embeddings},\nauthor={Ryan Bluteau and Robin Gras},\nyear={2022},\nurl={https://openreview.net/forum?id=xIAxm1b4pWc}\n}", "github": "", "project": "", "reviewers": "WaPR;ey1C;J9AG", "site": "https://openreview.net/forum?id=xIAxm1b4pWc", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;3;4", "correctness": "3;3;1", "technical_novelty": "1;1;1", "empirical_novelty": "2;2;0", "wc_summary_paper": "62;40;48", "wc_summary_review": "38;29;63", "wc_main_review": "475;141;138", "wc_review": "575;210;249", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 50.0, 9.092121131323903 ], "wc_summary_review_avg": [ 43.333333333333336, 14.383632673594278 ], "wc_main_review_avg": [ 251.33333333333334, 158.16095880111783 ], "wc_review_avg": [ 344.6666666666667, 163.6466382857351 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:P-xbTTDUV8wJ:scholar.google.com/&scioq=Improving+Sentiment+Classification+Using+0-Shot+Generated+Labels+for+Custom+Transformer+Embeddings&hl=en&as_sdt=0,5", "gs_version_total": 
0, "aff_unique_index": "0", "aff_unique_norm": "University of Windsor", "aff_unique_dep": "", "aff_unique_url": "https://www.uwindsor.ca", "aff_unique_abbr": "UWindsor", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "title": "Hierarchical Few-Shot Imitation with Skill Transition Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6735", "id": "xKZ4K0lTj_", "poster": "", "openreview": "https://openreview.net/forum?id=xKZ4K0lTj_", "slides": "https://iclr.cc/virtual/2022/poster/6735", "video": "https://iclr.cc/virtual/2022/poster/6735", "author_site": "Kourosh Hakhamaneshi, Ruihan Zhao, Albert Zhan, Pieter Abbeel, Michael Laskin", "tldr": "", "abstract": "A desirable property of autonomous agents is the ability to both solve long-horizon problems and generalize to unseen tasks. Recent advances in data-driven skill learning have shown that extracting behavioral priors from offline data can enable agents to solve challenging long-horizon tasks with reinforcement learning. However, generalization to tasks unseen during behavioral prior training remains an outstanding challenge. To this end, we present Few-shot Imitation with Skill Transition Models (FIST), an algorithm that extracts skills from offline data and utilizes them to generalize to unseen tasks given a few downstream demonstrations. FIST learns an inverse skill dynamics model, a distance function, and utilizes a semi-parametric approach for imitation. We show that FIST is capable of generalizing to new tasks and substantially outperforms prior baselines in navigation experiments requiring traversing unseen parts of a large maze and 7-DoF robotic arm experiments requiring manipulating previously unseen objects in a kitchen.", "keywords": "behavioral priors;skill extraction;imitation learning;few-shot learning", "primary_area": "", "supplementary_material": "/attachment/5c4744a72b39f3ec513ad38da22c69182f7bc1f2.zip", "author": "Kourosh Hakhamaneshi;Ruihan Zhao;Albert Zhan;Pieter Abbeel;Michael Laskin", "authorids": "~Kourosh_Hakhamaneshi1;~Ruihan_Zhao1;~Albert_Zhan1;~Pieter_Abbeel2;~Michael_Laskin1", "gender": "M;M;M;M;M", "homepage": "https://kouroshhakha.github.io/;https://philipzrh.com;https://albertzhan.github.io;https://people.eecs.berkeley.edu/~pabbeel/;http://mishalaskin.com", "dblp": ";236/4741-1;258/1288;;", "google_scholar": ";;P356Id4AAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;DOGDnwsAAAAJ", "orcid": ";;;;", "linkedin": ";;;;mishalaskin", "or_profile": "~Kourosh_Hakhamaneshi1;~Ruihan_Zhao1;~Albert_Zhan1;~Pieter_Abbeel2;~Michael_Laskin1", "aff": "University of California, Berkeley;University of Texas at Austin;University of California, Berkeley;Covariant;Google DeepMind", "aff_domain": "berkeley.edu;utexas.edu;berkeley.edu;covariant.ai;deepmind.com", "position": "PhD student;PhD student;Undergrad student;Founder;Researcher", "bibtex": "@inproceedings{\nhakhamaneshi2022hierarchical,\ntitle={Hierarchical Few-Shot Imitation with Skill Transition Models},\nauthor={Kourosh Hakhamaneshi and Ruihan Zhao and Albert Zhan and Pieter Abbeel and Michael Laskin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xKZ4K0lTj_}\n}", "github": "", "project": "", "reviewers": "JEuK;s3dr;enSE;LjxN", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;5;5", "correctness": "3;4;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "232;214;117;129", 
"wc_summary_review": "33;39;97;33", "wc_main_review": "321;183;515;195", "wc_review": "586;436;729;357", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 173.0, 50.5816172141619 ], "wc_summary_review_avg": [ 50.5, 26.95830113341714 ], "wc_main_review_avg": [ 303.5, 133.5393200521854 ], "wc_review_avg": [ 527.0, 142.71124692889484 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11314236649785473138&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=xKZ4K0lTj_", "email": "berkeley.edu;utexas.edu;berkeley.edu;covariant.ai;deepmind.com", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "University of California, Berkeley;University of Texas at Austin;Covariant;Google", "aff_unique_dep": ";;;Google DeepMind", "aff_unique_url": "https://www.berkeley.edu;https://www.utexas.edu;;https://deepmind.com", "aff_unique_abbr": "UC Berkeley;UT Austin;;DeepMind", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Berkeley;Austin;", "aff_country_unique_index": "0;0;0;2", "aff_country_unique": "United States;;United Kingdom" }, { "title": "Energy-Based Learning for Cooperative Games, with Applications to Valuation Problems in Machine Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6807", "id": "xLfAgCroImw", "poster": "", "openreview": "https://openreview.net/forum?id=xLfAgCroImw", "slides": "https://iclr.cc/virtual/2022/poster/6807", "video": "https://iclr.cc/virtual/2022/poster/6807", "author_site": "Yatao Bian, Yu Rong, Tingyang Xu, Jiaxiang Wu, Andreas Krause, Junzhou Huang", "tldr": "", "abstract": "Valuation problems, such as feature interpretation, data valuation and model valuation for ensembles, become increasingly more important in many machine learning applications. Such problems are commonly solved by well-known game-theoretic criteria, such as Shapley value or Banzhaf value. In this work, we present a novel energy-based treatment for cooperative games, with a theoretical justification by the maximum entropy framework. Surprisingly, by conducting variational inference of the energy-based model, we recover various game-theoretic valuation criteria through conducting one-step fixed point iteration for maximizing the mean-field ELBO objective. This observation also verifies the rationality of existing criteria, as they are all attempting to decouple the correlations among the players through the mean-field approach. By running fixed point iteration for multiple steps, we achieve a trajectory of the valuations, among which we define the valuation with the best conceivable decoupling error as the Variational Index. We prove that under uniform initializations, these variational valuations all satisfy a set of game-theoretic axioms. 
We experimentally demonstrate that the proposed Variational Index enjoys lower decoupling error and better valuation performance on certain synthetic and real-world valuation problems. ", "keywords": "Valuation problems;Shapley value;Model interpretation;Data valuation;Energy-based learning;Attribution-based feature interpretation;Model valuation for ensembles;Feature attributions", "primary_area": "", "supplementary_material": "/attachment/4f9cff66320116f1d78f4b52ce50e0d5ac90b01c.zip", "author": "Yatao Bian;Yu Rong;Tingyang Xu;Jiaxiang Wu;Andreas Krause;Junzhou Huang", "authorids": "~Yatao_Bian1;~Yu_Rong1;~Tingyang_Xu1;~Jiaxiang_Wu1;~Andreas_Krause1;~Junzhou_Huang2", "gender": "M;M;M;M;M;M", "homepage": "https://royrong.me/;;;https://las.inf.ethz.ch/krausea;http://ranger.uta.edu/~huang/;https://yataobian.com", "dblp": "24/10036-1;157/0940;119/6799-1.html;87/1831-1.html;22/1170.html;222/2694", "google_scholar": "https://scholar.google.com.hk/citations?user=itezhEMAAAAJ;6gIs5YMAAAAJ;https://scholar.google.com.hk/citations?user=puazh38AAAAJ;https://scholar.google.ch/citations?user=eDHv58AAAAAJ;https://scholar.google.com.tw/citations?user=X7KrguAAAAAJ;oZBTlBkAAAAJ", "orcid": "0000-0001-7387-302X;0009-0002-0106-8376;;0000-0001-7260-9673;0000-0002-9548-1227;0000-0002-2368-4084", "linkedin": ";;;krausea/;;", "or_profile": "~Yu_Rong1;~Tingyang_Xu1;~Jiaxiang_Wu1;~Andreas_Krause1;~Junzhou_Huang2;~An_Bian1", "aff": "Tencent AI Lab;Tencent AI Lab;Tencent AI Lab;ETH Zurich;University of Texas, Arlington;Tencent AI Lab", "aff_domain": "tencent.com;tencent.com;tencent.com;ethz.ch;uta.edu;tencent.com", "position": "Senior Researcher;Researcher;Researcher;Full Professor;Full Professor;Senior researcher ", "bibtex": "@inproceedings{\nbian2022energybased,\ntitle={Energy-Based Learning for Cooperative Games, with Applications to Valuation Problems in Machine Learning},\nauthor={Yatao Bian and Yu Rong and Tingyang Xu and Jiaxiang Wu and Andreas Krause and Junzhou Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xLfAgCroImw}\n}", "github": "", "project": "", "reviewers": "5RYD;LgG5;Jn7f;28es", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;3;3;4", "correctness": "4;4;4;4", "technical_novelty": "2;4;4;4", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "313;113;79;401", "wc_summary_review": "46;21;31;75", "wc_main_review": "373;474;305;1365", "wc_review": "732;608;415;1841", "wc_reply_reviewers": "0;0;0;655", "wc_reply_authors": "1877;1351;898;4084", "reply_reviewers": "0;0;0;1", "reply_authors": "5;2;3;7", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 226.5, 134.6950258918272 ], "wc_summary_review_avg": [ 43.25, 20.376150274278995 ], "wc_main_review_avg": [ 629.25, 429.020031583608 ], "wc_review_avg": [ 899.0, 555.470521270031 ], "wc_reply_reviewers_avg": [ 163.75, 283.62331973940366 ], "wc_reply_authors_avg": [ 2052.5, 1222.9845665420312 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 4.25, 1.920286436967152 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17417040937518634325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf":
"https://openreview.net/pdf?id=xLfAgCroImw", "email": "tencent.com;tencent.com;tencent.com;ethz.ch;uta.edu;tencent.com", "author_num": 6, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Tencent;ETH Zurich;University of Texas at Arlington", "aff_unique_dep": "Tencent AI Lab;;", "aff_unique_url": "https://ai.tencent.com;https://www.ethz.ch;https://www.uta.edu", "aff_unique_abbr": "Tencent AI Lab;ETHZ;UTA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Arlington", "aff_country_unique_index": "0;0;0;1;2;0", "aff_country_unique": "China;Switzerland;United States" }, { "title": "NodePiece: Compositional and Parameter-Efficient Representations of Large Knowledge Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6306", "id": "xMJWUKJnFSw", "poster": "", "openreview": "https://openreview.net/forum?id=xMJWUKJnFSw", "slides": "https://iclr.cc/virtual/2022/poster/6306", "video": "https://iclr.cc/virtual/2022/poster/6306", "author_site": "Mikhail Galkin, Etienne Denis, Jiapeng Wu, William Hamilton", "tldr": "", "abstract": "Conventional representation learning algorithms for knowledge graphs (KG) map each entity to a unique embedding vector. \nSuch a shallow lookup results in a linear growth of memory consumption for storing the embedding matrix and incurs high computational costs of working with real-world KGs.\nDrawing parallels with subword tokenization commonly used in NLP, we explore the landscape of more parameter-efficient node embedding strategies with possibly sublinear memory requirements. \nTo this end, we propose NodePiece, an anchor-based approach to learn a fixed-size entity vocabulary. \nIn NodePiece, a vocabulary of subword/sub-entity units is constructed from anchor nodes in a graph with known relation types. Given such a fixed-size vocabulary, it is possible to bootstrap an encoding and embedding for any entity, including those unseen during training.\nExperiments show that NodePiece performs competitively in node classification, link prediction, and relation prediction tasks retaining less than 10% of explicit nodes in a graph as anchors and often having 10x fewer parameters. To this end, we show that a NodePiece-enabled model outperforms existing shallow models on a large OGB WikiKG 2 graph having 70x fewer parameters.\n", "keywords": "knowledge graphs;graph representation learning;tokenization;link prediction;node classification", "primary_area": "", "supplementary_material": "/attachment/53d96e699eba373dd4e613264db035ae53de6037.zip", "author": "Mikhail Galkin;Etienne Denis;Jiapeng Wu;William L. Hamilton", "authorids": "~Mikhail_Galkin1;~Etienne_Denis1;~Jiapeng_Wu1;~William_L._Hamilton1", "gender": "M;M;M;", "homepage": "https://migalkin.github.io/;;;", "dblp": "160/8154;;;137/3314", "google_scholar": "yfYRbG4AAAAJ;lAf6xCoAAAAJ;ZG7HjHIAAAAJ;", "orcid": ";;;", "linkedin": ";egdenis;;", "or_profile": "~Mikhail_Galkin1;~Etienne_Denis1;~Jiapeng_Wu1;~William_L._Hamilton1", "aff": "Mila & McGill University;McGill University;Layer 6;McGill University", "aff_domain": "mila.quebec;mcgill.ca;layer6.ai;mcgill.ca", "position": "Postdoc;MS student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\ngalkin2022nodepiece,\ntitle={NodePiece: Compositional and Parameter-Efficient Representations of Large Knowledge Graphs},\nauthor={Mikhail Galkin and Etienne Denis and Jiapeng Wu and William L. 
Hamilton},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xMJWUKJnFSw}\n}", "github": "", "project": "", "reviewers": "CpaB;X7aq;2qcD;KuBz", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;4;4;3", "correctness": "4;4;3;4", "technical_novelty": "2;3;3;4", "empirical_novelty": "0;3;3;4", "wc_summary_paper": "138;148;43;264", "wc_summary_review": "66;55;19;35", "wc_main_review": "277;239;332;163", "wc_review": "481;442;394;462", "wc_reply_reviewers": "123;0;10;32", "wc_reply_authors": "1939;559;1256;250", "reply_reviewers": "1;0;1;1", "reply_authors": "5;2;2;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 1.5 ], "wc_summary_paper_avg": [ 148.25, 78.39124632253272 ], "wc_summary_review_avg": [ 43.75, 18.102140757380052 ], "wc_main_review_avg": [ 252.75, 61.466962671015395 ], "wc_review_avg": [ 444.75, 32.383444844549814 ], "wc_reply_reviewers_avg": [ 41.25, 48.59719642119286 ], "wc_reply_authors_avg": [ 1001.0, 652.7315681043779 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": -0.5555555555555555, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4956010200873018529&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=xMJWUKJnFSw", "email": "mila.quebec;mcgill.ca;layer6.ai;mcgill.ca", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "McGill University;Layer 6", "aff_unique_dep": "Mila;", "aff_unique_url": "https://www.mcgill.ca;", "aff_unique_abbr": "McGill;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada;" }, { "title": "Language-biased image classification: evaluation based on semantic representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5990", "id": "xNO7OEIcJc6", "poster": "", "openreview": "https://openreview.net/forum?id=xNO7OEIcJc6", "slides": "https://iclr.cc/virtual/2022/poster/5990", "video": "https://iclr.cc/virtual/2022/poster/5990", "author_site": "Yoann Lemesle, Masataka Sawayama, Guillermo Valle-Perez, Maxime Adolphe, H\u00e9l\u00e8ne Sauz\u00e9on, Pierre-Yves Oudeyer", "tldr": "", "abstract": "Humans show language-biased image recognition for a word-embedded image, known as picture-word interference. Such interference depends on hierarchical semantic categories and reflects that human language processing highly interacts with visual processing. Similar to humans, recent artificial models jointly trained on texts and images, e.g., OpenAI CLIP, show language-biased image classification. Exploring whether the bias leads to interference similar to that observed in humans can contribute to understanding how much the model acquires hierarchical semantic representations from joint learning of language and vision. The present study introduces methodological tools from the cognitive science literature to assess the biases of artificial models.
Specifically, we introduce a benchmark task to test whether words superimposed on images can distort the image classification across different category levels and, if they can, whether the perturbation is due to the shared semantic representation between language and vision. Our dataset is a set of word-embedded images and consists of a mixture of natural image datasets and hierarchical word labels with superordinate/basic category levels. Using this benchmark test, we evaluate the CLIP model. We show that presenting words distorts the image classification by the model across different category levels, but the effect does not depend on the semantic relationship between images and embedded words. This suggests that the semantic word representation in the CLIP visual processing is not shared with the image representation, although the word representation strongly dominates for word-embedded images.", "keywords": "interpretation of learned representations;language and visual processing;language-biased image classification;cognitive science", "primary_area": "", "supplementary_material": "/attachment/f77fde7c56861bfd764f13cb7d229809229f0327.zip", "author": "Yoann Lemesle;Masataka Sawayama;Guillermo Valle-Perez;Maxime Adolphe;H\u00e9l\u00e8ne Sauz\u00e9on;Pierre-Yves Oudeyer", "authorids": "~Yoann_Lemesle1;~Masataka_Sawayama1;~Guillermo_Valle-Perez1;~Maxime_Adolphe1;~H\u00e9l\u00e8ne_Sauz\u00e9on1;~Pierre-Yves_Oudeyer1", "gender": "M;M;M;M;F;M", "homepage": ";https://www.mswym.com/;http://guillefix.me;https://flowers.inria.fr/team/;;http://www.pyoudeyer.com", "dblp": ";165/9982;;;;33/5513", "google_scholar": ";https://scholar.google.co.jp/citations?user=AagXxCYAAAAJ;;;;https://scholar.google.fr/citations?user=gCqGj4sAAAAJ", "orcid": ";;;;0000-0001-5781-9891;", "linkedin": "yoann-lemesle-146104222/;;;;;pierreyvesoudeyer/", "or_profile": "~Yoann_Lemesle1;~Masataka_Sawayama1;~Guillermo_Valle-Perez1;~Maxime_Adolphe1;~H\u00e9l\u00e8ne_Sauz\u00e9on1;~Pierre-Yves_Oudeyer1", "aff": "Universit\u00e9 Paris-Dauphine;INRIA;;INRIA;University of Bordeaux;Microsoft", "aff_domain": "dauphine.fr;inria.fr;;inria.fr;u-bordeaux.fr;microsoft.com", "position": "MS student;Postdoc;;PhD student;Full Professor;Visiting researcher", "bibtex": "@inproceedings{\nlemesle2022languagebiased,\ntitle={Language-biased image classification: evaluation based on semantic representations},\nauthor={Yoann Lemesle and Masataka Sawayama and Guillermo Valle-Perez and Maxime Adolphe and H{\\'e}l{\\`e}ne Sauz{\\'e}on and Pierre-Yves Oudeyer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xNO7OEIcJc6}\n}", "github": "", "project": "", "reviewers": "P83Y;WKSS;gSM4;9yiw", "pdf_size": 0, "recommendation": "3;6;6;8", "confidence": "4;3;3;4", "correctness": "1;3;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "112;145;98;98", "wc_summary_review": "133;86;53;27", "wc_main_review": "399;476;196;123", "wc_review": "644;707;347;248", "wc_reply_reviewers": "54;0;0;33", "wc_reply_authors": "658;1118;510;260", "reply_reviewers": "1;0;0;2", "reply_authors": "2;3;2;2", "recommendation_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 113.25, 19.201236939322424 ], "wc_summary_review_avg": [ 74.75, 39.60034722069997 ], "wc_main_review_avg": [ 298.5,
143.9730877629566 ], "wc_review_avg": [ 486.5, 193.5 ], "wc_reply_reviewers_avg": [ 21.75, 22.982330169066845 ], "wc_reply_authors_avg": [ 636.5, 312.2735179293947 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.14002800840280097, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7894245425840424018&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=xNO7OEIcJc6", "email": "dauphine.fr;inria.fr;;inria.fr;u-bordeaux.fr;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Universit\u00e9 Paris-Dauphine;INRIA;University of Bordeaux;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://www.univ-paris-dauphine.fr;https://www.inria.fr;https://www.u-bordeaux.fr;https://www.microsoft.com", "aff_unique_abbr": "UPD;INRIA;UBordeaux;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "France;United States" }, { "title": "Post hoc Explanations may be Ineffective for Detecting Unknown Spurious Correlation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6565", "id": "xNOVfCCvDpM", "poster": "", "openreview": "https://openreview.net/forum?id=xNOVfCCvDpM", "slides": "https://iclr.cc/virtual/2022/poster/6565", "video": "https://iclr.cc/virtual/2022/poster/6565", "author_site": "Julius Adebayo, Michael Muelly, Harold Abelson, Been Kim", "tldr": "", "abstract": "We investigate whether three types of post hoc model explanations\u2013feature attribution, concept activation, and training point ranking\u2013are effective for detecting a model\u2019s reliance on spurious signals in the training data. Specifically, we consider the scenario where the spurious signal to be detected is unknown, at test-time, to the user of the explanation method. We design an empirical methodology that uses semi-synthetic datasets along with pre-specified spurious artifacts to obtain models that verifiably rely on these spurious training signals. We then provide a suite of metrics that assess an explanation method\u2019s reliability for spurious signal detection under various conditions. We find that the post hoc explanation methods tested are ineffective when the spurious artifact is unknown at test-time especially for non-visible artifacts like a background blur. Further, we find that feature attribution methods are susceptible to erroneously indicating dependence on spurious signals even when the model being explained does not rely on spurious artifacts. 
This finding casts doubt on the utility of these approaches, in the hands of a practitioner, for detecting a model\u2019s reliance on spurious signals.", "keywords": "explanations;feature attributions;spurious correlation;interpretability;training point ranking", "primary_area": "", "supplementary_material": "/attachment/ff0c2e3f25a18eba79cd77b4f76ad0931a71c342.zip", "author": "Julius Adebayo;Michael Muelly;Harold Abelson;Been Kim", "authorids": "~Julius_Adebayo1;~Michael_Muelly1;~Harold_Abelson1;~Been_Kim1", "gender": "M;M;M;", "homepage": "https://juliusadebayo.com/;;http://groups.csail.mit.edu/mac/users/hal/hal.html/;https://beenkim.github.io/", "dblp": "146/1271;228/6895;;https://dblp.uni-trier.de/pers/k/Kim:Been.html", "google_scholar": "y1bnRg4AAAAJ;F2SAhnQAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Julius_Adebayo1;~Michael_Muelly1;~Harold_Abelson1;~Been_Kim1", "aff": "Massachusetts Institute of Technology;Stanford University;Massachusetts Institute of Technology;Google DeepMind", "aff_domain": "mit.edu;stanford.edu;;google.com", "position": "PhD Student;Assistant Professor;Full Professor;Research Scientist", "bibtex": "@inproceedings{\nadebayo2022post,\ntitle={Post hoc Explanations may be Ineffective for Detecting Unknown Spurious Correlation},\nauthor={Julius Adebayo and Michael Muelly and Harold Abelson and Been Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xNOVfCCvDpM}\n}", "github": "", "project": "", "reviewers": "uWNE;Mwip;Ng3n", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;4", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "50;167;67", "wc_summary_review": "63;109;20", "wc_main_review": "190;553;600", "wc_review": "303;829;687", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 94.66666666666667, 51.61610429141493 ], "wc_summary_review_avg": [ 64.0, 36.34097778908359 ], "wc_main_review_avg": [ 447.6666666666667, 183.2054099152703 ], "wc_review_avg": [ 606.3333333333334, 222.18510801181571 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5714357889917083968&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=xNOVfCCvDpM", "email": "mit.edu;stanford.edu;;google.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Massachusetts Institute of Technology;Stanford University;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://web.mit.edu;https://www.stanford.edu;https://deepmind.com", "aff_unique_abbr": "MIT;Stanford;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "xO4xryFQltO", "title": "A new perspective on probabilistic image modeling", "track": "main", "status": "Desk Reject", 
"tldr": "", "abstract": "We present the Deep Convolutional Gaussian Mixture Model (DCGMM), a new probabilistic approach for image modeling capable of density estimation, sampling and tractable inference. DCGMM instances exhibit a CNN-like layered structure, in which the principal building blocks are convolutional Gaussian Mixture (cGMM) layers. A key innovation w.r.t. related models lile sum-produdct networks (SPNs) and probabilistic circuits (PCs) is that each cGMM layer optimizes an independent loss function and therefore has an independent probabilistic interpretation. This modular approach permits intervening transformation layers to harness the full spectrum of \n(potentially non-invertible) mappings available to CNNs, e.g., max-pooling or (half-)convolutions. DCGMM sampling and inference are realized by a deep chain of hierarchical priors, where samples generated by each cGMM layer parameterize sampling in the next-lower cGMM layer. For sampling through non-invertible transformation layers, we introduce a new gradient-based sharpening technique that exploits redundancy (overlap) in, e.g., half-convolutions. The basic quantities forward-transported through a DCGMM instance are the posterior probabilities of cGMM layers, which ensures numerical stability and facilitates the selection of learning rates.\nDCGMMs can be trained end-to-end by SGD from random initial conditions, much like CNNs. We experimentally show that DCGMMs compare favorably to several recent PC and SPN models in terms of inference, classification and sampling, the latter particularly for challenging datasets such as SVHN. A public TF2 implementation is provided as well.", "keywords": "deep mixture models;sum-product networks;probabilistic circuits;image modeling", "primary_area": "", "supplementary_material": "", "author": "Alexander Gepperth", "authorids": "~Alexander_Gepperth1", "gender": "M", "homepage": "http://www.gepperth.net", "dblp": "05/11166", "google_scholar": "QR2zb3IAAAAJ", "orcid": "0000-0003-2216-7808", "linkedin": "", "or_profile": "~Alexander_Gepperth1", "aff": "HAW Fulda", "aff_domain": "informatik.hs-fulda.de", "position": "Full Professor", "bibtex": "@misc{\ngepperth2022a,\ntitle={A new perspective on probabilistic image modeling},\nauthor={Alexander Gepperth},\nyear={2022},\nurl={https://openreview.net/forum?id=xO4xryFQltO}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=xO4xryFQltO", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16801336098229418616&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff_unique_index": "0", "aff_unique_norm": "Fulda University of Applied 
Sciences", "aff_unique_dep": "", "aff_unique_url": "https://www.haw-fulda.de", "aff_unique_abbr": "HAW Fulda", "aff_campus_unique_index": "0", "aff_campus_unique": "Fulda", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "id": "xOHuV8s7Yl", "title": "Two Instances of Interpretable Neural Network for Universal Approximations", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes two bottom-up interpretable neural network (NN) constructions for universal approximation, namely Triangularly-constructed NN (TNN) and Semi-Quantized Activation NN (SQANN). The notable properties are (1) resistance to catastrophic forgetting (2) existence of proof for arbitrarily high accuracies on training dataset (3) for an input x, users can identify specific samples of training data whose activation \"fingerprints\" are similar to that of x's activations. Users can also identify samples that are out of distribution.", "keywords": "Explainable Artificial Intelligence;Neural Network;Universal Approximation;Universal Approximator", "primary_area": "", "supplementary_material": "/attachment/5f6bb9ba9e5902c5d9b98791e145715f59cc21c0.zip", "author": "Erico Tjoa;Cuntai Guan", "authorids": "~Erico_Tjoa1;~Cuntai_Guan1", "gender": "M;M", "homepage": ";https://personal.ntu.edu.sg/ctguan/index.html", "dblp": ";95/7006", "google_scholar": "hh9WwAMAAAAJ;https://scholar.google.com.tw/citations?user=sg4vxPoAAAAJ", "orcid": ";0000-0002-0872-3276", "linkedin": ";", "or_profile": "~Erico_Tjoa1;~Cuntai_Guan1", "aff": "Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Full Professor", "bibtex": "@misc{\ntjoa2022two,\ntitle={Two Instances of Interpretable Neural Network for Universal Approximations},\nauthor={Erico Tjoa and Cuntai Guan},\nyear={2022},\nurl={https://openreview.net/forum?id=xOHuV8s7Yl}\n}", "github": "", "project": "", "reviewers": "KGST;v56a;Xw97;EKmc", "site": "https://openreview.net/forum?id=xOHuV8s7Yl", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "5;4;3;4", "correctness": "2;2;3;2", "technical_novelty": "1;2;1;2", "empirical_novelty": "1;2;0;2", "wc_summary_paper": "57;106;74;27", "wc_summary_review": "85;61;80;16", "wc_main_review": "379;478;500;174", "wc_review": "521;645;654;217", "wc_reply_reviewers": "71;444;0;0", "wc_reply_authors": "804;497;572;272", "reply_reviewers": "1;1;0;0", "reply_authors": "2;2;1;1", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.5, 0.5 ], "empirical_novelty_avg": [ 1.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 66.0, 28.574464124459098 ], "wc_summary_review_avg": [ 60.5, 27.207535720825582 ], "wc_main_review_avg": [ 382.75, 128.85141636784596 ], "wc_review_avg": [ 509.25, 176.72630675708697 ], "wc_reply_reviewers_avg": [ 128.75, 184.3032487505307 ], "wc_reply_authors_avg": [ 536.25, 189.95838360019806 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15076069501419662587&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": 
"https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "id": "xOeWOPFXrTh", "title": "Learning Higher-Order Dynamics in Video-Based Cardiac Measurement", "track": "main", "status": "Reject", "tldr": "", "abstract": "Computer vision methods typically optimize for first-order dynamics (e.g., optical flow). However, in many cases the properties of interest are subtle variations in higher-order changes, such as acceleration. This is true in the cardiac pulse, where the second derivative can be used as an indicator of blood pressure and arterial disease. Recent developments in camera-based vital sign measurement have shown that cardiac measurements can be recovered with impressive accuracy from videos; however, the majority of research has focused on extracting summary statistics such as heart rate. Less emphasis has been put on the accuracy of waveform morphology that is necessary for many clinically impactful scenarios. In this work, we provide evidence that higher-order dynamics are better estimated by neural models when explicitly optimized for in the loss function. Furthermore, adding second-derivative inputs also improves performance when estimating second-order dynamics. By incorporating the second derivative of both the input frames and the target vital sign signals into the training procedure, our model is better able to estimate left ventricle ejection time intervals.", "keywords": "Computer Vision;Dynamic Systems;Deep Learning", "primary_area": "", "supplementary_material": "/attachment/abae0c18d4ac836fd01486ac36aafdec3c009583.zip", "author": "Brian L. Hill;Xin Liu;Daniel McDuff", "authorids": "~Brian_L._Hill1;~Xin_Liu8;~Daniel_McDuff1", "gender": "M;M;M", "homepage": "https://www.brianlhill.info;https://homes.cs.washington.edu/~xliu0/;http://alumni.media.mit.edu/~djmcduff/", "dblp": ";76/1820-61;63/9606", "google_scholar": "UnyYursAAAAJ;p9F83HoAAAAJ;m7Jr-b4AAAAJ", "orcid": "0000-0002-6881-5770;;", "linkedin": "brianhill11/;;", "or_profile": "~Brian_L._Hill1;~Xin_Liu8;~Daniel_McDuff1", "aff": ";Department of Computer Science, University of Washington;Microsoft", "aff_domain": ";cs.washington.edu;microsoft.com", "position": ";PhD student;Principal Researcer", "bibtex": "@misc{\nhill2022learning,\ntitle={Learning Higher-Order Dynamics in Video-Based Cardiac Measurement},\nauthor={Brian L. 
Hill and Xin Liu and Daniel McDuff},\nyear={2022},\nurl={https://openreview.net/forum?id=xOeWOPFXrTh}\n}", "github": "", "project": "", "reviewers": "me2S;eB6A;W7ES;qH24;Z3zg", "site": "https://openreview.net/forum?id=xOeWOPFXrTh", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "3;5;2;4;4", "correctness": "3;4;3;3;3", "technical_novelty": "2;2;2;1;3", "empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "66;87;80;98;109", "wc_summary_review": "36;23;26;35;120", "wc_main_review": "305;284;241;242;557", "wc_review": "407;394;347;375;786", "wc_reply_reviewers": "0;178;89;54;208", "wc_reply_authors": "636;656;530;559;1021", "reply_reviewers": "0;1;1;1;3", "reply_authors": "1;1;2;1;3", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.6, 1.019803902718557 ], "correctness_avg": [ 3.2, 0.39999999999999997 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 88.0, 14.7648230602334 ], "wc_summary_review_avg": [ 48.0, 36.3483149540663 ], "wc_main_review_avg": [ 325.8, 118.19204710977807 ], "wc_review_avg": [ 461.8, 163.35164523199637 ], "wc_reply_reviewers_avg": [ 105.8, 77.22279456222755 ], "wc_reply_authors_avg": [ 680.4, 176.59739522427844 ], "reply_reviewers_avg": [ 1.2, 0.9797958971132713 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.19611613513818404, "corr_recommendation_correctness": -0.5833333333333335, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8245829134309196463&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Washington;Microsoft", "aff_unique_dep": "Department of Computer Science;Microsoft Corporation", "aff_unique_url": "https://www.washington.edu;https://www.microsoft.com", "aff_unique_abbr": "UW;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Cross-Domain Imitation Learning via Optimal Transport", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6198", "id": "xP3cPq2hQC", "poster": "", "openreview": "https://openreview.net/forum?id=xP3cPq2hQC", "slides": "https://iclr.cc/virtual/2022/poster/6198", "video": "https://iclr.cc/virtual/2022/poster/6198", "author_site": "Arnaud Fickinger, samuel cohen, Stuart Russell, Brandon Amos", "tldr": "", "abstract": "Cross-domain imitation learning studies how to leverage expert demonstrations of one agent to train an imitation agent with a different embodiment or morphology. Comparing trajectories and stationary distributions between the expert and imitation agents is challenging because they live on different systems that may not even have the same dimensionality. We propose Gromov-Wasserstein Imitation Learning (GWIL), a method for cross-domain imitation that uses the Gromov-Wasserstein distance to align and compare states between the different spaces of the agents. Our theory formally characterizes the scenarios where GWIL preserves optimality, revealing its possibilities and limitations. 
We demonstrate the effectiveness of GWIL in non-trivial continuous control domains ranging from simple rigid transformation of the expert domain to arbitrary transformation of the state-action space.", "keywords": "optimal transportation;imitation learning;cross-domain imitation learning;gromov-Wasserstein", "primary_area": "", "supplementary_material": "", "author": "Arnaud Fickinger;Samuel Cohen;Stuart Russell;Brandon Amos", "authorids": "~Arnaud_Fickinger1;~Samuel_Cohen1;~Stuart_Russell1;~Brandon_Amos1", "gender": ";M;M;", "homepage": "https://www.linkedin.com/in/arnaudfickinger/;;https://people.eecs.berkeley.edu/~russell/;http://bamos.github.io", "dblp": "236/4896;;;133/4801.html", "google_scholar": ";CmdjfTsAAAAJ;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;d8gdZR4AAAAJ", "orcid": ";;;", "linkedin": ";;;bdamos", "or_profile": "~Arnaud_Fickinger1;~Samuel_Cohen1;~Stuart_Russell1;~Brandon_Amos1", "aff": "University of California, Berkeley;University College London;University of California, Berkeley;Meta", "aff_domain": "berkeley.edu;ucl.ac.uk;berkeley.edu;meta.com", "position": "PhD student;PhD student;Full Professor;Research Scientist", "bibtex": "@inproceedings{\nfickinger2022crossdomain,\ntitle={Cross-Domain Imitation Learning via Optimal Transport},\nauthor={Arnaud Fickinger and Samuel Cohen and Stuart Russell and Brandon Amos},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xP3cPq2hQC}\n}", "github": "", "project": "", "reviewers": "ZsKi;tiAF;pWm6;KcRn", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "4;3;3;4", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "121;62;69;174", "wc_summary_review": "71;49;32;98", "wc_main_review": "371;93;90;413", "wc_review": "563;204;191;685", "wc_reply_reviewers": "686;11;9;13", "wc_reply_authors": "1412;597;619;803", "reply_reviewers": "2;1;1;1", "reply_authors": "4;2;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.5, 0.5 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 106.5, 45.14698218042929 ], "wc_summary_review_avg": [ 62.5, 24.72347063015223 ], "wc_main_review_avg": [ 241.75, 150.98571952340393 ], "wc_review_avg": [ 410.75, 217.61706619656465 ], "wc_reply_reviewers_avg": [ 179.75, 292.28699509215255 ], "wc_reply_authors_avg": [ 857.75, 329.8419128916154 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=792462069574183965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=xP3cPq2hQC", "email": "berkeley.edu;ucl.ac.uk;berkeley.edu;meta.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of California, Berkeley;University College London;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.berkeley.edu;https://www.ucl.ac.uk;https://meta.com", "aff_unique_abbr": "UC Berkeley;UCL;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Pre-training 
Molecular Graph Representation with 3D Geometry", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6888", "id": "xQUe1pOKPam", "poster": "", "openreview": "https://openreview.net/forum?id=xQUe1pOKPam", "slides": "https://iclr.cc/virtual/2022/poster/6888", "video": "https://iclr.cc/virtual/2022/poster/6888", "author_site": "Shengchao Liu, Hanchen Wang, Weiyang Liu, Joan Lasenby, Hongyu Guo, Jian Tang", "tldr": "", "abstract": "Molecular graph representation learning is a fundamental problem in modern drug and material discovery. Molecular graphs are typically modeled by their 2D topological structures, but it has been recently discovered that 3D geometric information plays a more vital role in predicting molecular functionalities. However, the lack of 3D information in real-world scenarios has significantly impeded the learning of geometric graph representation. To cope with this challenge, we propose the Graph Multi-View Pre-training (GraphMVP) framework where self-supervised learning (SSL) is performed by leveraging the correspondence and consistency between 2D topological structures and 3D geometric views. GraphMVP effectively learns a 2D molecular graph encoder that is enhanced by richer and more discriminative 3D geometry. We further provide theoretical insights to justify the effectiveness of GraphMVP. Finally, comprehensive experiments show that GraphMVP can consistently outperform existing graph SSL methods. Code is available on GitHub: https://github.com/chao1224/GraphMVP.", "keywords": "Pre-training;SSL;Molecule;3D Geometry;2D representation", "primary_area": "", "supplementary_material": "/attachment/f1e6db0523eab0b32adda5f1e122f12f16a7e831.zip", "author": "Shengchao Liu;Hanchen Wang;Weiyang Liu;Joan Lasenby;Hongyu Guo;Jian Tang", "authorids": "~Shengchao_Liu1;~Hanchen_Wang1;~Weiyang_Liu1;~Joan_Lasenby1;~Hongyu_Guo1;~Jian_Tang1", "gender": "M;M;M;;M;", "homepage": "https://chao1224.github.io/;https://www.hanchenw.com/;http://wyliu.com/;;https://hongyuharryguo.github.io/;http://www.jian-tang.com", "dblp": ";;137/1532;;;181/2667-5", "google_scholar": "F1ws3XUAAAAJ;Yu_0vEEAAAAJ;DMjROf0AAAAJ;;https://scholar.google.ca/citations?user=bZUqlakAAAAJ;https://scholar.google.ca/citations?user=1ir6WUEAAAAJ", "orcid": "0000-0003-2030-2367;0000-0002-1691-024X;;;;", "linkedin": ";hanchenwang/;;;harry-h-y-guo-a582087/;", "or_profile": "~Shengchao_Liu1;~Hanchen_Wang1;~Weiyang_Liu1;~Joan_Lasenby1;~Hongyu_Guo1;~Jian_Tang1", "aff": "MILA-UdeM;University of Cambridge;University of Cambridge;;National Research Council Canada;Mila, HEC Montreal", "aff_domain": "mila.quebec;cam.ac.uk;cam.ac.uk;;nrc-cnrc.gc.ca;hec.ca", "position": "PhD student;PhD student;Researcher;;Senior Research Officer;Assistant Professor", "bibtex": "@inproceedings{\nliu2022pretraining,\ntitle={Pre-training Molecular Graph Representation with 3D Geometry},\nauthor={Shengchao Liu and Hanchen Wang and Weiyang Liu and Joan Lasenby and Hongyu Guo and Jian Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xQUe1pOKPam}\n}", "github": "", "project": "", "reviewers": "1JZt;zY9H;K6sw;A6B1", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;3", "correctness": "3;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "106;19;50;86", "wc_summary_review": "57;21;45;22", "wc_main_review": "376;73;322;250", "wc_review": "539;113;417;358", "wc_reply_reviewers": "585;0;23;0", 
"wc_reply_authors": "1790;384;462;363", "reply_reviewers": "4;0;1;0", "reply_authors": "5;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 65.25, 33.40190862810088 ], "wc_summary_review_avg": [ 36.25, 15.35211711784404 ], "wc_main_review_avg": [ 255.25, 114.32273395961103 ], "wc_review_avg": [ 356.75, 155.12958292988478 ], "wc_reply_reviewers_avg": [ 152.0, 250.16894291658187 ], "wc_reply_authors_avg": [ 749.75, 601.7201903709065 ], "reply_reviewers_avg": [ 1.25, 1.6393596310755 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 1.0, "gs_citation": 443, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12269574784453036678&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=xQUe1pOKPam", "email": "mila.quebec;cam.ac.uk;cam.ac.uk;;nrc-cnrc.gc.ca;hec.ca", "author_num": 6, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Mila;University of Cambridge;National Research Council Canada;HEC Montreal", "aff_unique_dep": "Montreal Institute for Learning Algorithms;;;HEC Business School", "aff_unique_url": "https://mila.quebec;https://www.cam.ac.uk;https://www.nrc-cnrc.gc.ca;https://www.hec.ca", "aff_unique_abbr": "MILA;Cambridge;NRC-CNRC;HEC", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Cambridge;Montreal", "aff_country_unique_index": "0;1;1;0;0", "aff_country_unique": "Canada;United Kingdom" }, { "id": "xREjEGUoY4c", "title": "Robot Intent Recognition Method Based on State Grid Business Office", "track": "main", "status": "Desk Reject", "tldr": "", "abstract": "Artificial intelligence is currently in an era of change, not only changing the artificial intelligence technology itself, but also changing human society. It has become more and more common to use artificial intelligence as the core human-computer interaction technology to replace manpower. Intention recognition is an important part of the human-machine dialogue system, and deep learning technology is gradually being applied to the task of intent recognition. However, intent recognition based on deep learning often has problems such as low recognition accuracy and slow recognition speed. In response to these problems, this paper designs a BERT fine-tuning to improve the network structure based on the pre-training model and proposes new continuous pre-training goals. 
In addition, a method based on multi-teacher model compression is proposed to compress the pre-trained model, which reduces the time consumption of model inference.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lanfang Dong;Zhao Pu Hu;Hanchao Liu", "authorids": "~Lanfang_Dong1;~Zhao_Pu_Hu1;~Hanchao_Liu1", "gender": ";M;M", "homepage": "http://staff.ustc.edu.cn/~lfdong/;https://yu25977571.jz.fkw.com/?fromJzAllSite=true;", "dblp": "26/5393;;", "google_scholar": ";;hXmNE9wAAAAJ", "orcid": "0000-0002-0267-9905;;", "linkedin": ";;", "or_profile": "~Lanfang_Dong1;~Zhao_Pu_Hu1;~Hanchao_Liu1", "aff": "University of Science and Technology of China, Tsinghua University;University of Science and Technology of China, Tsinghua University;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn", "position": "Lecturer;Undergrad student;PhD student", "bibtex": "@misc{\ndong2022robot,\ntitle={Robot Intent Recognition Method Based on State Grid Business Office},\nauthor={Lanfang Dong and Zhao Pu Hu and Hanchao Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=xREjEGUoY4c}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=xREjEGUoY4c", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tJIiguzLiYwJ:scholar.google.com/&scioq=Robot+Intent+Recognition+Method+Based+on+State+Grid+Business+Office&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn/", "aff_unique_abbr": "USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "xRK8xgFuiu", "title": "Causal Discovery via Cholesky Factorization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Discovering the causal relationship via recovering the directed acyclic graph (DAG) structure from the observed data is a challenging combinatorial problem. This paper proposes an extremely fast, easy to implement, and high-performance DAG structure recovering algorithm. The algorithm is based on the Cholesky factorization of the covariance/precision matrix. The time complexity of the algorithm is $\\mathcal{O}(p^2n + p^3)$, where $p$ and $n$ are the numbers of nodes and samples, respectively. Under proper assumptions, we show that our algorithm takes $\\mathcal{O}(\\log(p/\\epsilon))$ samples to exactly recover the DAG structure with probability at least $1-\\epsilon$.
In both time and sample complexities, our algorithm is better than previous algorithms. On synthetic and real-world data sets, our algorithm is significantly faster than previous methods and achieves state-of-the-art performance.", "keywords": "DAG Structure Learning;Causal Discovery", "primary_area": "", "supplementary_material": "/attachment/cbd851ce8b417c23cc03631b6869850a05d8caee.zip", "author": "Xu Li;YUNFENG CAI;Mingming Sun;Ping Li", "authorids": "~Xu_Li3;~YUNFENG_CAI1;~Mingming_Sun1;~Ping_Li3", "gender": ";M;M;M", "homepage": ";https://www.bimsa.cn/detail/yfcai.html;;http://www.stat.rutgers.edu/home/pingli/", "dblp": ";133/8201;87/8665-1.html;62/5860-1", "google_scholar": ";https://scholar.google.com/citations?hl=en;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xu_Li3;~YUNFENG_CAI1;~Mingming_Sun1;~Ping_Li3", "aff": ";Baidu Research;Baidu;LinkedIn", "aff_domain": ";baidu.com;baidu.com;linkedin.com", "position": ";Researcher;Principal Researcher;Engineer", "bibtex": "@misc{\nli2022causal,\ntitle={Causal Discovery via Cholesky Factorization},\nauthor={Xu Li and YUNFENG CAI and Mingming Sun and Ping Li},\nyear={2022},\nurl={https://openreview.net/forum?id=xRK8xgFuiu}\n}", "github": "", "project": "", "reviewers": "Ag69;tZir;SYfm;WtL4;2cwa", "site": "https://openreview.net/forum?id=xRK8xgFuiu", "pdf_size": 0, "recommendation": "3;3;3;6;6", "confidence": "4;5;5;3;2", "correctness": "1;3;4;3;4", "technical_novelty": "3;2;3;3;2", "empirical_novelty": "0;0;3;3;2", "wc_summary_paper": "61;87;59;38;60", "wc_summary_review": "48;19;79;16;24", "wc_main_review": "305;365;226;115;88", "wc_review": "414;471;364;169;172", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 4.2, 1.469693845669907 ], "confidence_avg": [ 3.8, 1.16619037896906 ], "correctness_avg": [ 3.0, 1.0954451150103321 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 1.6, 1.3564659966250538 ], "wc_summary_paper_avg": [ 61.0, 15.556349186104045 ], "wc_summary_review_avg": [ 37.2, 23.743630724891254 ], "wc_main_review_avg": [ 219.8, 106.52211038089699 ], "wc_review_avg": [ 318.0, 125.10635475466464 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9101820546182062, "corr_recommendation_correctness": 0.37267799624996495, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8mzvzlAFtY4J:scholar.google.com/&scioq=Causal+Discovery+via+Cholesky+Factorization&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Baidu;LinkedIn Corporation", "aff_unique_dep": "Baidu Research;", "aff_unique_url": "https://research.baidu.com;https://www.linkedin.com", "aff_unique_abbr": "Baidu;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "Sound and Complete Neural Network Repair with Minimality and Locality Guarantees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6686", "id": "xS8AMYiEav3", "poster": "", "openreview": "https://openreview.net/forum?id=xS8AMYiEav3", "slides": "https://iclr.cc/virtual/2022/poster/6686", "video": "https://iclr.cc/virtual/2022/poster/6686", "author_site": "Feisi Fu, Wenchao Li", "tldr": "",
"abstract": "We present a novel methodology for repairing neural networks that use ReLU activation functions. Unlike existing methods that rely on modifying the weights of a neural network which can induce a global change in the function space, our approach applies only a localized change in the function space while still guaranteeing the removal of the buggy behavior. By leveraging the piecewise linear nature of ReLU networks, our approach can efficiently construct a patch network tailored to the linear region where the buggy input resides, which when combined with the original network, provably corrects the behavior on the buggy input. Our method is both sound and complete -- the repaired network is guaranteed to fix the buggy input, and a patch is guaranteed to be found for any buggy input. Moreover, our approach preserves the continuous piecewise linear nature of ReLU networks, automatically generalizes the repair to all the points including other undetected buggy inputs inside the repair region, is minimal in terms of changes in the function space, and guarantees that outputs on inputs away from the repair region are unaltered. On several benchmarks, we show that our approach significantly outperforms existing methods in terms of locality and limiting negative side effects.", "keywords": "Neural Network Repair", "primary_area": "", "supplementary_material": "", "author": "Feisi Fu;Wenchao Li", "authorids": "~Feisi_Fu1;~Wenchao_Li1", "gender": "M;", "homepage": "https://fufeisi.github.io;http://sites.bu.edu/depend/", "dblp": ";23/5721-1", "google_scholar": "9wJGIOoAAAAJ;zwA5eokAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Feisi_Fu1;~Wenchao_Li1", "aff": "Boston University;Boston University", "aff_domain": "bu.edu;bu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nfu2022sound,\ntitle={Sound and Complete Neural Network Repair with Minimality and Locality Guarantees},\nauthor={Feisi Fu and Wenchao Li},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xS8AMYiEav3}\n}", "github": "", "project": "", "reviewers": "DRaE;Jhi1;Bt2t;fkMv", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "4;3;4;5", "correctness": "2;4;4;3", "technical_novelty": "2;4;3;4", "empirical_novelty": "2;3;2;4", "wc_summary_paper": "64;62;86;94", "wc_summary_review": "35;14;24;51", "wc_main_review": "433;306;316;319", "wc_review": "532;382;426;464", "wc_reply_reviewers": "0;270;0;158", "wc_reply_authors": "1001;1125;176;1338", "reply_reviewers": "0;2;0;1", "reply_authors": "2;2;1;2", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.25, 0.82915619758885 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 76.5, 13.811227316933133 ], "wc_summary_review_avg": [ 31.0, 13.729530217745982 ], "wc_main_review_avg": [ 343.5, 51.89653167601858 ], "wc_review_avg": [ 451.0, 55.036351623268054 ], "wc_reply_reviewers_avg": [ 107.0, 114.09206808538444 ], "wc_reply_authors_avg": [ 910.0, 440.58086658410394 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5443310539518174, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 33, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=862436873685923655&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=xS8AMYiEav3", "email": "bu.edu;bu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "xUdEO_yE-GV", "title": "Localized Persistent Homologies for more Effective Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Persistent Homologies have been successfully used to increase the performance of deep networks trained to detect curvilinear structures and to improve the topological quality of the results. However, existing methods are very global and ignore the location of topological features. In this paper, we introduce an approach that relies on a new filtration function to account for location during network training. We demonstrate experimentally on 2D images of roads and 3D image stacks of neural processes that networks trained in this manner are better at recovering the topology of the curvilinear structures they extract.", "keywords": "Delineation;Persistent Homology;Topology;Aerial Images;Microscopy scans", "primary_area": "", "supplementary_material": "", "author": "Doruk Oner;Ad\u00e9lie Garin;Mateusz Kozinski;Kathryn Hess;Pascal Fua", "authorids": "~Doruk_Oner1;~Ad\u00e9lie_Garin1;~Mateusz_Kozinski1;kathryn.hess@epfl.ch;~Pascal_Fua1", "gender": "M;;;;M", "homepage": ";https://www.epfl.ch/labs/hessbellwald-lab/members/adelie-garin/;;;https://people.epfl.ch/pascal.fua/bio?lang=en", "dblp": "217/1719;;;;f/PFua", "google_scholar": "https://scholar.google.com.tr/citations?user=ESA2CsAAAAAJ;;;;https://scholar.google.com/citations?view_op=list_works", "orcid": "0000-0002-9403-4628;0000-0002-3223-6320;;;", "linkedin": "doruk-oner/;;;;pascal-fua-epfl/?lipi=urn%3Ali%3Apage%3Ad_flagship3_search_srp_top%3BOz8ffqlCTcmui5v37AilTQ%3D%3D&licu=urn%3Ali%3Acontrol%3Ad_flagship3_search_srp_top-search_srp_result&lici=IhLn%2B0y4Rj23iI9XNMDNwA%3D%3D", "or_profile": "~Doruk_Oner1;~Ad\u00e9lie_Garin1;~Mateusz_Kozinski1;kathryn.hess@epfl.ch;~Pascal_Fua1", "aff": "Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology Lausanne;;;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;epfl.ch;;;epfl.ch", "position": "PhD student;PhD student;;;Full Professor", "bibtex": "@misc{\noner2022localized,\ntitle={Localized Persistent Homologies for more Effective Deep Learning},\nauthor={Doruk Oner and Ad{\\'e}lie Garin and Mateusz Kozinski and Kathryn Hess and Pascal Fua},\nyear={2022},\nurl={https://openreview.net/forum?id=xUdEO_yE-GV}\n}", "github": "", "project": "", "reviewers": "mHrP;ne7r;Emfj;jz5f", "site": "https://openreview.net/forum?id=xUdEO_yE-GV", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;4;4;4", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "66;133;88;59", "wc_summary_review": "241;22;34;46", "wc_main_review": "1142;322;659;167", "wc_review": "1449;477;781;272", "wc_reply_reviewers": "484;56;9;7", "wc_reply_authors": "1340;821;776;251", "reply_reviewers": "3;1;1;1", "reply_authors": "5;3;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 
0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 86.5, 28.90069203323685 ], "wc_summary_review_avg": [ 85.75, 90.03436843783601 ], "wc_main_review_avg": [ 572.5, 373.82917221640156 ], "wc_review_avg": [ 744.75, 445.1024460728114 ], "wc_reply_reviewers_avg": [ 139.0, 200.14869472469712 ], "wc_reply_authors_avg": [ 797.0, 385.3511385736391 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.5, 1.6583123951777 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.8006407690254357, "corr_recommendation_correctness": 0.5547001962252291, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9265844338426802753&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "id": "xVGrCe5fCXY", "title": "Denoising Diffusion Gamma Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative diffusion processes are an emerging and effective tool for image and speech generation. In the existing methods, the underlying noise distribution of the diffusion process is Gaussian noise. However, fitting distributions with more degrees of freedom could improve the performance of such generative models. In this work, we investigate other types of noise distribution for the diffusion process. Specifically, we introduce the Denoising Diffusion Gamma Model (DDGM) and show that noise from Gamma distribution provides improved results for image and speech generation. Our approach preserves the ability to efficiently sample state in the training diffusion process while using Gamma noise. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/fde7fdc69736e3769d93f5ca9b2f50fc16a3b348.zip", "author": "Eliya Nachmani;Robin San Roman;Lior Wolf", "authorids": "~Eliya_Nachmani1;~Robin_San_Roman1;~Lior_Wolf1", "gender": "M;M;M", "homepage": ";;http://www.cs.tau.ac.il/~wolf", "dblp": "183/6370;289/7209;83/4103", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;UbFrXTsAAAAJ", "orcid": ";;0000-0001-5578-8892", "linkedin": ";;", "or_profile": "~Eliya_Nachmani1;~Robin_San_Roman1;~Lior_Wolf1", "aff": "Meta Facebook;Ecole Normale Sup\u00e9rieure de Paris;Tel Aviv University", "aff_domain": "facebook.com;ens.fr;tau.ac.il", "position": "Researcher;Intern;Full Professor", "bibtex": "@misc{\nnachmani2022denoising,\ntitle={Denoising Diffusion Gamma Models},\nauthor={Eliya Nachmani and Robin San Roman and Lior Wolf},\nyear={2022},\nurl={https://openreview.net/forum?id=xVGrCe5fCXY}\n}", "github": "", "project": "", "reviewers": "YK5T;nekw;y4Qp;C6QK", "site": "https://openreview.net/forum?id=xVGrCe5fCXY", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;4;4", "correctness": "3;3;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "31;101;48;85", "wc_summary_review": "17;51;27;196", "wc_main_review": "152;338;205;1090", "wc_review": "200;490;280;1371", "wc_reply_reviewers": "70;0;0;0", "wc_reply_authors": "390;488;356;746", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 66.25, 27.99441908666797 ], "wc_summary_review_avg": [ 72.75, 72.22317841247366 ], "wc_main_review_avg": [ 446.25, 377.795166062246 ], "wc_review_avg": [ 585.25, 465.85157239189397 ], "wc_reply_reviewers_avg": [ 17.5, 30.31088913245535 ], "wc_reply_authors_avg": [ 495.0, 152.8037957643723 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16843723024026939941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Meta;Ecole Normale Sup\u00e9rieure de Paris;Tel Aviv University", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.ens.fr;https://www.tau.ac.il", "aff_unique_abbr": "Meta;ENS Paris;TAU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;France;Israel" }, { "id": "xVlPHwnNKv", "title": "Fast Deterministic Stackelberg Actor-Critic", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most advanced Actor-Critic (AC) approaches update the actor and critic concurrently through (stochastic) Gradient Descents (GD), which may be trapped into bad local optimality due to the instability of these simultaneous updating schemes. \nStackelberg AC learning scheme alleviates these limitations by adding a compensated indirect gradient terms to the GD. However, the indirect gradient terms are time-consuming to calculate, and the convergence rate is also relatively slow. 
To alleviate these challenges, we find that in the Deterministic Policy Gradient family, by removing the terms that contain Hessian matrices and adopting the block diagonal approximation technique to approximate the remaining inverse matrices, we can construct an approximated Stackelberg AC learning scheme that is easy to compute and fast to converge. Experiments reveal that ours outperforms SOTAs in terms of average returns with acceptable training time.", "keywords": "Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Runsheng Yu;Xinrun Wang;James Kwok", "authorids": "~Runsheng_Yu2;~Xinrun_Wang1;~James_Kwok1", "gender": "Not Specified;M;", "homepage": "https://www.linkedin.com/in/runsheng-yu-560696127/;https://rainwangphy.github.io/;", "dblp": "210/2646.html?q=runsheng%20yu;199/6413;", "google_scholar": ";ROANfPUAAAAJ;", "orcid": "0000-0003-0053-1234;;", "linkedin": ";;", "or_profile": "~Runsheng_Yu2;~Xinrun_Wang1;~James_Kwok1", "aff": "Hong Kong University of Science and Technology;Nanyang Technological University;", "aff_domain": "ust.hk;ntu.edu.sg;", "position": "PhD student;Postdoc;", "bibtex": "@misc{\nyu2022fast,\ntitle={Fast Deterministic Stackelberg Actor-Critic},\nauthor={Runsheng Yu and Xinrun Wang and James Kwok},\nyear={2022},\nurl={https://openreview.net/forum?id=xVlPHwnNKv}\n}", "github": "", "project": "", "reviewers": "jvHe;6tEQ;WR2J;7evM", "site": "https://openreview.net/forum?id=xVlPHwnNKv", "pdf_size": 0, "recommendation": "3;3;5;8", "confidence": "4;4;3;2", "correctness": "3;2;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;2;4", "wc_summary_paper": "100;94;54;58", "wc_summary_review": "41;91;37;19", "wc_main_review": "500;782;134;84", "wc_review": "641;967;225;161", "wc_reply_reviewers": "183;215;0;0", "wc_reply_authors": "1802;1887;347;159", "reply_reviewers": "1;1;0;0", "reply_authors": "3;4;1;1", "recommendation_avg": [ 4.75, 2.0463381929681126 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 76.5, 20.657928260113597 ], "wc_summary_review_avg": [ 47.0, 26.720778431774775 ], "wc_main_review_avg": [ 375.0, 284.62080036427415 ], "wc_review_avg": [ 498.5, 327.3022303620921 ], "wc_reply_reviewers_avg": [ 99.5, 100.14115038284712 ], "wc_reply_authors_avg": [ 1048.75, 799.086470602525 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9945577827230725, "corr_recommendation_correctness": 0.8638684255813602, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:V6s4G3Q69SQJ:scholar.google.com/&scioq=Fast+Deterministic+Stackelberg+Actor-Critic&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.ntu.edu.sg", "aff_unique_abbr": "HKUST;NTU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Singapore" }, { "id": "xWRX16GCugt", "title": "Sequoia: A Software Framework to Unify Continual Learning Research", "track": "main", "status": "Reject", "tldr": "", "abstract": "The field of Continual Learning (CL) seeks to
develop algorithms that accumulate knowledge and skills over time through interaction with non-stationary environments. In practice, a plethora of evaluation procedures (settings) and algorithmic solutions (methods) exist, each with their own potentially disjoint set of assumptions. This variety makes measuring progress in CL difficult. We propose a taxonomy of settings, where each setting is described as a set of assumptions. A tree-shaped hierarchy emerges from this view, where more general settings become the parents of those with more restrictive assumptions. This makes it possible to use inheritance to share and reuse research, as developing a method for a given setting also makes it directly applicable to any of its children. We instantiate this idea as a publicly available software framework called Sequoia, which features a wide variety of settings from both the Continual Supervised Learning (CSL) and Continual Reinforcement Learning (CRL) domains. Sequoia also includes a growing suite of methods which are easy to extend and customize, in addition to more specialized methods from external libraries. We hope that this new paradigm and its first implementation can help unify and accelerate research in CL. You can help us grow the tree by visiting (this GitHub URL).", "keywords": "Continual Learning;Reinforcement Learning;Software Engineering;Deep learning", "primary_area": "", "supplementary_material": "/attachment/a855b0c373de8f5478a725c27f124f7baaa59762.zip", "author": "Fabrice Normandin;Oleksiy Ostapenko;Pau Rodriguez;Florian Golemo;Ryan Lindeborg;Matthew Riemer;Lucas Cecchi;Timothee LESORT;Khimya Khetarpal;David Vazquez;Laurent Charlin;Irina Rish;Massimo Caccia", "authorids": "~Fabrice_Normandin1;~Oleksiy_Ostapenko1;~Pau_Rodriguez2;~Florian_Golemo1;~Ryan_Lindeborg1;~Matthew_Riemer1;~Lucas_Cecchi1;~Timothee_LESORT1;~Khimya_Khetarpal1;~David_Vazquez1;~Laurent_Charlin1;~Irina_Rish1;~Massimo_Caccia1", "gender": "Not Specified;M;;M;;M;;M;F;M;M;F;", "homepage": "https://www.github.com/lebrice;;;https://fgolemo.github.io/;;;https://github.com/Lucasc-99;;https://kkhetarpal.github.io/;http://www.david-vazquez.com;http://www.cs.toronto.edu/~lcharlin/;http://irina-rish.com;", "dblp": ";;;08/8643;;166/1499;;;186/3048;94/8653;48/5717;;43/6338.html", "google_scholar": "https://scholar.google.ca/citations?user=SZIdARAAAAAJ;mqLVUGgAAAAJ;;https://scholar.google.de/citations?user=qvRf9xsAAAAJ;;PK7UzAwAAAAJ;;5NttkuoAAAAJ;https://scholar.google.ca/citations?user=VLOUhF0AAAAJ;1jHvtfsAAAAJ;Cul0g2YAAAAJ;Avse5gIAAAAJ;WaE4GicAAAAJ", "orcid": ";;;0000-0001-9238-7764;;;;;;0000-0002-2845-8158;0000-0002-6545-9459;;", "linkedin": "fabricenormandin/;;;;;;;https://fr.linkedin.com/in/timoth\u00e9e-lesort-128039aa;;https://www.linkedin.com/company/david-vazquez/;;irina-rish-8b2162;", "or_profile": "~Fabrice_Normandin1;~Oleksiy_Ostapenko1;~Pau_Rodriguez2;~Florian_Golemo1;~Ryan_Lindeborg1;~Matthew_Riemer1;~Lucas_Cecchi1;~Timothee_LESORT1;~Khimya_Khetarpal1;~David_Vazquez1;~Laurent_Charlin1;~Irina_Rish1;~Massimo_Caccia1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;University of Montreal;;Mila;;International Business Machines;Pennsylvania State University;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;McGill University;ServiceNow research;Mila - Quebec Artificial Intelligence Institute;University of Montreal;University of Montreal", "aff_domain":
"mila.umontreal.ca;umontreal.ca;;mila.quebec;;ibm.com;psu.edu;mila.umontreal.ca;mcgill.ca;servicenow.com;mila.quebec;mila.quebec;umontreal.ca", "position": "MS student;PhD student;;Postdoc;;Researcher;Undergrad student;Postdoc;PhD student;Researcher;Principal Researcher;Professor;PhD student", "bibtex": "@misc{\nnormandin2022sequoia,\ntitle={Sequoia: A Software Framework to Unify Continual Learning Research},\nauthor={Fabrice Normandin and Oleksiy Ostapenko and Pau Rodriguez and Florian Golemo and Ryan Lindeborg and Matthew Riemer and Lucas Cecchi and Timothee LESORT and Khimya Khetarpal and David Vazquez and Laurent Charlin and Irina Rish and Massimo Caccia},\nyear={2022},\nurl={https://openreview.net/forum?id=xWRX16GCugt}\n}", "github": "", "project": "", "reviewers": "SrGM;pX2k;YwgU;D4MM", "site": "https://openreview.net/forum?id=xWRX16GCugt", "pdf_size": 0, "recommendation": "1;3;5;5", "confidence": "4;5;4;4", "correctness": "4;3;3;3", "technical_novelty": "1;2;1;1", "empirical_novelty": "1;1;2;3", "wc_summary_paper": "36;46;48;33", "wc_summary_review": "10;97;21;191", "wc_main_review": "197;389;68;312", "wc_review": "243;532;137;536", "wc_reply_reviewers": "0;179;0;161", "wc_reply_authors": "368;515;95;262", "reply_reviewers": "0;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.5, 1.6583123951777 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 40.75, 6.378675411086537 ], "wc_summary_review_avg": [ 79.75, 72.44092420724628 ], "wc_main_review_avg": [ 241.5, 121.25283501840276 ], "wc_review_avg": [ 362.0, 176.04118836226937 ], "wc_reply_reviewers_avg": [ 85.0, 85.23790236743277 ], "wc_reply_authors_avg": [ 310.0, 153.23021895174594 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 13, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": -0.8703882797784891, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8673934302931269584&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;3;0;4;5;6;0;0", "aff_unique_norm": "University of Montreal;Mila;International Business Machines Corporation;Pennsylvania State University;McGill University;ServiceNow;Quebec Artificial Intelligence Institute", "aff_unique_dep": "Montreal Institute for Learning Algorithms;Quebec Artificial Intelligence Institute;;;;research;Artificial Intelligence", "aff_unique_url": "https://www.umontreal.ca;https://mila.quebec;https://www.ibm.com;https://www.psu.edu;https://www.mcgill.ca;https://www.servicenow.com;https://mila.quebec", "aff_unique_abbr": "UM;Mila;IBM;PSU;McGill;ServiceNow;Mila", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0;0;1;1;0;0;1;0;0;0", "aff_country_unique": "Canada;United States" }, { "title": "Robust and Scalable SDE Learning: A Functional Perspective", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6813", "id": "xZ6H7wydGl", "poster": "", "openreview": "https://openreview.net/forum?id=xZ6H7wydGl", "slides": "https://iclr.cc/virtual/2022/poster/6813", "video": "https://iclr.cc/virtual/2022/poster/6813", "author_site": "Scott Cameron, Tyron Cameron, Arnu Pretorius, S Roberts", "tldr": "", "abstract": "Stochastic differential equations provide a rich 
class of flexible generative\nmodels, capable of describing a wide range of spatio-temporal processes. A host\nof recent work looks to learn data-representing SDEs, using neural networks and\nother flexible function approximators. Despite these advances, learning remains\ncomputationally expensive due to the sequential nature of SDE integrators. In\nthis work, we propose an importance-sampling estimator for probabilities of\nobservations of SDEs for the purposes of learning. Crucially, the approach we\nsuggest does not rely on such integrators. The proposed method produces\nlower-variance gradient estimates compared to algorithms based on SDE\nintegrators and has the added advantage of being embarrassingly parallelizable.\nThis facilitates the effective use of large-scale parallel hardware for massive\ndecreases in computation time.\n", "keywords": "SDE Learning;Parallelization;Importance Sampling", "primary_area": "", "supplementary_material": "/attachment/f2767b17918a44b2576fe830a79535c7d8c4a5a5.zip", "author": "Scott Alexander Cameron;Tyron Luke Cameron;Arnu Pretorius;Stephen J. Roberts", "authorids": "~Scott_Alexander_Cameron1;~Tyron_Luke_Cameron1;~Arnu_Pretorius1;~Stephen_J._Roberts1", "gender": "M;M;M;M", "homepage": ";;;http://www.robots.ox.ac.uk/~sjrob", "dblp": ";;188/4368;64/1485", "google_scholar": ";;zZ6ydrAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-3830-6673;;;0000-0002-9305-9268", "linkedin": "scott-cameron-93a215bb/;tyron-cameron-82ba7716b/;arnupretorius/;", "or_profile": "~Scott_Alexander_Cameron1;~Tyron_Luke_Cameron1;~Arnu_Pretorius1;~Stephen_J._Roberts1", "aff": "University of Oxford;Discovery Insure;InstaDeep;University of Oxford", "aff_domain": "ox.ac.uk;discovery.co.za;instadeep.com;ox.ac.uk", "position": "PhD student;Actuarial analyst;Researcher;Full Professor", "bibtex": "@inproceedings{\ncameron2022robust,\ntitle={Robust and Scalable {SDE} Learning: A Functional Perspective},\nauthor={Scott Alexander Cameron and Tyron Luke Cameron and Arnu Pretorius and Stephen J. 
Roberts},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xZ6H7wydGl}\n}", "github": "", "project": "", "reviewers": "k19Y;68Rx;p6RL", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "2;3;2", "correctness": "2;3;3", "technical_novelty": "3;3;3", "empirical_novelty": "2;1;2", "wc_summary_paper": "76;99;57", "wc_summary_review": "60;28;38", "wc_main_review": "266;427;493", "wc_review": "402;554;588", "wc_reply_reviewers": "0;0;26", "wc_reply_authors": "602;320;201", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 77.33333333333333, 17.172329163188344 ], "wc_summary_review_avg": [ 42.0, 13.366625103842281 ], "wc_main_review_avg": [ 395.3333333333333, 95.33916066106076 ], "wc_review_avg": [ 514.6666666666666, 80.86751854456494 ], "wc_reply_reviewers_avg": [ 8.666666666666666, 12.256517540566826 ], "wc_reply_authors_avg": [ 374.3333333333333, 168.1553514528224 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18263869225654973120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=xZ6H7wydGl", "email": "ox.ac.uk;discovery.co.za;instadeep.com;ox.ac.uk", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Oxford;Discovery Insure;InstaDeep", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://www.discovery.co.za;https://www.instadeep.com", "aff_unique_abbr": "Oxford;;InstaDeep", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;South Africa" }, { "id": "x_PopzVOmYj", "title": "Tr-NAS: Memory-Efficient Neural Architecture Search with Transferred Blocks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural Architecture Search (NAS) is one of the most rapidly growing research fields in machine learning due to its ability to discover high-performance architectures automatically. Although conventional NAS algorithms focus on improving search efficiency (e.g., high performance with less search time), they often incur a large memory footprint and high power consumption. To remedy this problem, we propose a new paradigm for NAS that effectively reduces the use of memory while maintaining high performance. The proposed algorithm is motivated by our observation that manually designed and NAS-based architectures share similar low-level representations, regardless of the difference in the network's topology. Reflecting this, we propose a new architectural paradigm for NAS, called $\\textbf{Transfer-NAS}$, that replaces the first several cells in the generated architecture with conventional (hand-crafted) pre-trained blocks. As the replaced pre-trained blocks are kept frozen during training, the memory footprint can be significantly reduced.
We demonstrate the effectiveness of the proposed method by incorporating it into Regularized Evolution and Differentiable ARchiTecture Search with Perturbation-based architecture selection (DARTS+PT) on NAS-Bench-201 and DARTS search spaces. Extensive experiments show that Transfer-NAS significantly decreases the memory usage by up to $\\textbf{50\\%}$ while achieving higher/comparable performance compared to the baselines. Furthermore, the proposed method is $\\textbf{1.98$\\times$}$ faster in terms of search time when incorporated into DARTS+PT on NAS-Bench-201 compared to the conventional method.", "keywords": "Neural Architecture Search;Memory-Efficient NAS", "primary_area": "", "supplementary_material": "", "author": "Linh-Tam Tran;A F M Shahab Uddin;Sung-Ho Bae", "authorids": "~Linh-Tam_Tran1;~A_F_M_Shahab_Uddin1;~Sung-Ho_Bae1", "gender": "M;M;M", "homepage": ";;https://sites.google.com/a/khu.ac.kr/mlvc/", "dblp": "303/4900;;76/2068", "google_scholar": "GT4_9cgAAAAJ;Ckkj9gQAAAAJ;https://scholar.google.co.kr/citations?user=EULut5oAAAAJ", "orcid": "0000-0002-9699-1747;0000-0003-1074-0515;", "linkedin": ";;", "or_profile": "~Linh-Tam_Tran1;~A_F_M_Shahab_Uddin1;~Sung-Ho_Bae1", "aff": "Kyung Hee University;KyungHee University;Kyung Hee University", "aff_domain": "khu.ac.kr;khu.ac.kr;khu.ac.kr", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@misc{\ntran2022trnas,\ntitle={Tr-{NAS}: Memory-Efficient Neural Architecture Search with Transferred Blocks},\nauthor={Linh-Tam Tran and A F M Shahab Uddin and Sung-Ho Bae},\nyear={2022},\nurl={https://openreview.net/forum?id=x_PopzVOmYj}\n}", "github": "", "project": "", "reviewers": "hjqm;vHKN;iRcm", "site": "https://openreview.net/forum?id=x_PopzVOmYj", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;5;5", "correctness": "3;2;3", "technical_novelty": "2;1;2", "empirical_novelty": "2;2;2", "wc_summary_paper": "77;91;30", "wc_summary_review": "41;38;87", "wc_main_review": "1305;206;232", "wc_review": "1423;335;349", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 66.0, 26.08958923913266 ], "wc_summary_review_avg": [ 55.333333333333336, 22.425184255405547 ], "wc_main_review_avg": [ 581.0, 512.0553355514096 ], "wc_review_avg": [ 702.3333333333334, 509.62033798592546 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:g8UJnGeo4asJ:scholar.google.com/&scioq=Tr-NAS:+Memory-Efficient+Neural+Architecture+Search+with+Transferred+Blocks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Kyung Hee University", "aff_unique_dep": "", "aff_unique_url": "http://www.khu.ac.kr", "aff_unique_abbr": "KHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Effective Model Sparsification by Scheduled Grow-and-Prune
Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6315", "id": "xa6otUDdP2W", "poster": "", "openreview": "https://openreview.net/forum?id=xa6otUDdP2W", "slides": "https://iclr.cc/virtual/2022/poster/6315", "video": "https://iclr.cc/virtual/2022/poster/6315", "author_site": "Xiaolong Ma, Minghai Qin, Fei Sun, Zejiang Hou, Kun Yuan, Yi Xu, Yanzhi Wang, Yen-Kuang Chen, Rong Jin, Yuan Xie", "tldr": "", "abstract": "Deep neural networks (DNNs) are effective in solving many real-world problems. Larger DNN models usually exhibit better quality (e.g., accuracy) but their excessive computation results in long inference time. Model sparsification can reduce the computation and memory cost while maintaining model quality. Most existing sparsification algorithms unidirectionally remove weights, while others randomly or greedily explore a small subset of weights in each layer for pruning. The limitations of these algorithms reduce the level of achievable sparsity. In addition, many algorithms still require pre-trained dense models and thus suffer from large memory footprint. In this paper, we propose a novel scheduled grow-and-prune (GaP) methodology without having to pre-train a dense model. It addresses the shortcomings of the previous works by repeatedly growing a subset of layers to dense and then pruning them back to sparse after some training. Experiments show that the models pruned using the proposed methods match or beat the quality of the highly optimized dense models at 80% sparsity on a variety of tasks, such as image classification, objective detection, 3D object part segmentation, and translation. They also outperform other state-of-the-art (SOTA) methods for model sparsification. As an example, a 90% non-uniform sparse ResNet-50 model obtained via GaP achieves 77.9% top-1 accuracy on ImageNet, improving the previous SOTA results by 1.5%. 
Code available at: https://github.com/boone891214/GaP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaolong Ma;Minghai Qin;Fei Sun;Zejiang Hou;Kun Yuan;Yi Xu;Yanzhi Wang;Yen-Kuang Chen;Rong Jin;Yuan Xie", "authorids": "~Xiaolong_Ma2;~Minghai_Qin1;~Fei_Sun2;~Zejiang_Hou1;~Kun_Yuan4;~Yi_Xu8;~Yanzhi_Wang3;~Yen-Kuang_Chen2;~Rong_Jin1;~Yuan_Xie9", "gender": "M;M;M;;;;M;M;;", "homepage": "https://xiaolongma2016.com;https://sites.google.com/site/minghaiqin/home;https://feisun.org/;https://sites.google.com/view/zejianghou;;;https://web.northeastern.edu/yanzhiwang/;https://sites.google.com/site/ykchen/;;", "dblp": ";;;206/8398;;;;https://dblp.org/pers/c/Chen:Yen=Kuang.html;;", "google_scholar": "https://scholar.google.com/citations?hl=en;MSgWKbYAAAAJ;RNj18KkAAAAJ;W2ljsTsAAAAJ;;;https://scholar.google.com/citations?hl=en;HgJIF0MAAAAJ;;dK2ZuDcAAAAJ", "orcid": "0000-0003-3753-7648;;;;;;;0000-0003-4546-9497;;", "linkedin": "xiaolong-ma-66b98910b/;;fei-sun-39a644/;;;;;;;", "or_profile": "~Xiaolong_Ma2;~Minghai_Qin1;~Fei_Sun2;~Zejiang_Hou1;~Kun_Yuan4;~Yi_Xu8;~Yanzhi_Wang3;~Yen-Kuang_Chen2;~Rong_Jin1;~Yuan_Xie9", "aff": "Northeastern University;Western Digital Corporation;Alibaba Group;Princeton University;;;Northeastern University;Alibaba Group;;", "aff_domain": "northeastern.edu;wdc.com;alibaba-inc.com;princeton.edu;;;northeastern.edu;alibaba-inc.com;;", "position": "PhD student;senior technologist;Research scientist;PhD student;;;Associate Professor;Researcher;;", "bibtex": "@inproceedings{\nma2022effective,\ntitle={Effective Model Sparsification by Scheduled Grow-and-Prune Methods},\nauthor={Xiaolong Ma and Minghai Qin and Fei Sun and Zejiang Hou and Kun Yuan and Yi Xu and Yanzhi Wang and Yen-Kuang Chen and Rong Jin and Yuan Xie},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xa6otUDdP2W}\n}", "github": "", "project": "", "reviewers": "Zewc;xnEk;XG8a;Saw3", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;3;4;5", "correctness": "4;3;4;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "62;137;119;183", "wc_summary_review": "55;22;52;123", "wc_main_review": "168;440;181;665", "wc_review": "285;599;352;971", "wc_reply_reviewers": "36;187;0;279", "wc_reply_authors": "1396;2125;779;1308", "reply_reviewers": "1;2;0;1", "reply_authors": "4;5;2;3", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 125.25, 43.338060639581 ], "wc_summary_review_avg": [ 63.0, 36.966200778549045 ], "wc_main_review_avg": [ 363.5, 205.11033616080883 ], "wc_review_avg": [ 551.75, 268.8209208748456 ], "wc_reply_reviewers_avg": [ 125.5, 113.0320750937538 ], "wc_reply_authors_avg": [ 1402.0, 479.5127735524884 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 3.5, 1.118033988749895 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 10, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14488112763252453275&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=xa6otUDdP2W", "email": 
"northeastern.edu;wdc.com;alibaba-inc.com;princeton.edu;;;northeastern.edu;alibaba-inc.com;;", "author_num": 10, "aff_unique_index": "0;1;2;3;0;2", "aff_unique_norm": "Northeastern University;Western Digital Corporation;Alibaba Group;Princeton University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.northeastern.edu;https://www.westerndigital.com;https://www.alibaba.com;https://www.princeton.edu", "aff_unique_abbr": "NEU;WDC;Alibaba;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;1", "aff_country_unique": "United States;China" }, { "id": "xaTensJtCP5", "title": "Semi-Empirical Objective Functions for Neural MCMC Proposal Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Current objective functions used for training neural MCMC proposal distributions implicitly rely on architectural restrictions to yield sensible optimization results, which hampers the development of highly expressive neural MCMC proposal architectures. In this work, we introduce and demonstrate a semi-empirical procedure for determining approximate objective functions suitable for optimizing arbitrarily parameterized proposal distributions in MCMC methods. Our proposed Ab Initio objective functions consist of the weighted combination of functions following constraints on their global optima and transformation invariances that we argue should be upheld by general measures of MCMC efficiency for use in proposal optimization. Our experimental results demonstrate that Ab Initio objective functions maintain favorable performance and preferable optimization behavior compared to existing objective functions for neural MCMC optimization. We find that Ab Initio objective functions are sufficiently robust to enable the confident optimization of neural proposal distributions parameterized by deep generative networks extending beyond the regimes of traditional MCMC schemes.", "keywords": "Markov Chain Monte Carlo;Neural MCMC;Generative Models;Deep Generative Models;Normalizing Flows", "primary_area": "", "supplementary_material": "/attachment/15cbbd29b4fce55c5a3ef2507fe4a0759129a992.zip", "author": "Chris Cannella;Vahid Tarokh", "authorids": "~Chris_Cannella1;~Vahid_Tarokh1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "http://scholar.google.com/citations?user=T5vA9UIAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Chris_Cannella1;~Vahid_Tarokh1", "aff": "Duke University;", "aff_domain": "duke.edu;", "position": "PhD student;", "bibtex": "@misc{\ncannella2022semiempirical,\ntitle={Semi-Empirical Objective Functions for Neural {MCMC} Proposal Optimization},\nauthor={Chris Cannella and Vahid Tarokh},\nyear={2022},\nurl={https://openreview.net/forum?id=xaTensJtCP5}\n}", "github": "", "project": "", "reviewers": "cPU7;Etgp;qDJ6;pwUX", "site": "https://openreview.net/forum?id=xaTensJtCP5", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "4;3;4;3", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "54;87;360;44", "wc_summary_review": "37;113;29;84", "wc_main_review": "160;249;423;504", "wc_review": "251;449;812;632", "wc_reply_reviewers": "0;126;315;350", "wc_reply_authors": "960;1204;1834;1662", "reply_reviewers": "0;1;1;4", "reply_authors": "3;3;4;5", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], 
"empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 136.25, 130.15831706041686 ], "wc_summary_review_avg": [ 65.75, 34.43381332353418 ], "wc_main_review_avg": [ 334.0, 136.31030775403596 ], "wc_review_avg": [ 536.0, 208.6779815888586 ], "wc_reply_reviewers_avg": [ 197.75, 142.46117892254017 ], "wc_reply_authors_avg": [ 1415.0, 349.3264948440069 ], "reply_reviewers_avg": [ 1.5, 1.5 ], "reply_authors_avg": [ 3.75, 0.82915619758885 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7001400420140049, "corr_recommendation_correctness": 0.7276068751089989, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12207600245302446323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Duke University", "aff_unique_dep": "", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "xbu1tzbjvd", "title": "Analyzing Populations of Neural Networks via Dynamical Model Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "A core challenge in the interpretation of deep neural networks is identifying commonalities between the underlying algorithms implemented by distinct networks trained for the same task. Motivated by this problem, we introduce \\textsc{Dynamo}, an algorithm that constructs low-dimensional manifolds where each point corresponds to a neural network model, and two points are nearby if the corresponding neural networks enact similar high-level computational processes. \\textsc{Dynamo} takes as input a collection of pre-trained neural networks and outputs a \\emph{meta-model} that emulates the dynamics of the hidden states as well as the outputs of any model in the collection. The specific model to be emulated is determined by a \\emph{model embedding vector} that the meta-model takes as input; these model embedding vectors constitute a manifold corresponding to the given population of models. We apply \\textsc{Dynamo} to both RNNs and CNNs, and find that the resulting model embedding manifolds enable novel applications: clustering of neural networks on the basis of their high-level computational processes in a manner that is less sensitive to reparameterization; model averaging of several neural networks trained on the same task to arrive at a new, operable neural network with similar task performance; and semi-supervised learning via optimization on the model embedding manifold. 
Using a fixed-point analysis of meta-models trained on populations of RNNs, we gain new insights into how similarities of the topology of RNN dynamics correspond to similarities of their high-level computational processes.", "keywords": "dynamics;RNNs;model averaging;model clustering;CNNs;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Jordan Cotler;Kai Sheng Tai;Felipe Hernandez;Blake Elias;David Sussillo", "authorids": "~Jordan_Cotler1;~Kai_Sheng_Tai2;felipehb@stanford.edu;blakeelias@gmail.com;~David_Sussillo1", "gender": "M;;;;", "homepage": ";https://kaishengtai.github.io/;;;", "dblp": ";160/8934.html;;;56/9314", "google_scholar": "4FM97JgAAAAJ;AMwvwPYAAAAJ;;;ebBgMSkAAAAJ", "orcid": ";;;;", "linkedin": ";;;;david-sussillo-736a1290", "or_profile": "~Jordan_Cotler1;~Kai_Sheng_Tai2;felipehb@stanford.edu;blakeelias@gmail.com;~David_Sussillo1", "aff": "Harvard University;Meta;;;", "aff_domain": "harvard.edu;meta.com;;;", "position": "Postdoc;Researcher;;;", "bibtex": "@misc{\ncotler2022analyzing,\ntitle={Analyzing Populations of Neural Networks via Dynamical Model Embedding},\nauthor={Jordan Cotler and Kai Sheng Tai and Felipe Hernandez and Blake Elias and David Sussillo},\nyear={2022},\nurl={https://openreview.net/forum?id=xbu1tzbjvd}\n}", "github": "", "project": "", "reviewers": "dswE;k7ES;9dWJ;p41a", "site": "https://openreview.net/forum?id=xbu1tzbjvd", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "3;3;2;3", "technical_novelty": "3;2;4;2", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "89;51;62;183", "wc_summary_review": "60;41;67;120", "wc_main_review": "269;582;360;614", "wc_review": "418;674;489;917", "wc_reply_reviewers": "81;0;189;0", "wc_reply_authors": "676;1643;808;931", "reply_reviewers": "1;0;1;0", "reply_authors": "2;3;1;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 96.25, 51.958517107400205 ], "wc_summary_review_avg": [ 72.0, 29.300170647967224 ], "wc_main_review_avg": [ 456.25, 145.79501877636287 ], "wc_review_avg": [ 624.5, 193.00841950547132 ], "wc_reply_reviewers_avg": [ 67.5, 77.5515957282634 ], "wc_reply_authors_avg": [ 1014.5, 373.90139074360235 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8539086720753480387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Harvard University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.harvard.edu;https://meta.com", "aff_unique_abbr": "Harvard;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "xbx7Hxjbd79", "title": "COLA: Consistent Learning with Opponent-Learning Awareness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimization problems with multiple, interdependent losses, such as Generative Adversarial Networks (GANs) or multi-agent RL, are commonly formalized as differentiable games. 
\nLearning with Opponent-Learning Awareness (LOLA) introduced opponent shaping to this setting. More specifically, LOLA introduced an augmented learning rule that accounts for the agent's influence on the anticipated learning step of the other agents. However, the original LOLA formulation is inconsistent because LOLA models other agents as naive learners rather than LOLA agents. \nIn previous work, this inconsistency was stated to be the root cause of LOLA's failure to preserve stable fixed points (SFPs). We provide a counterexample by investigating cases where Higher-Order LOLA (HOLA) converges. \nFurthermore, we show that, contrary to claims made, Competitive Gradient Descent (CGD) does not solve the consistency problem.\nNext, we propose a new method called Consistent LOLA (COLA), which learns update functions that are consistent under mutual opponent shaping. Lastly, we empirically compare the performance and consistency of HOLA, LOLA, and COLA on a set of general-sum learning games.", "keywords": "Differentiable games;multi-agent reinforcement learning;general-sum games;lola", "primary_area": "", "supplementary_material": "/attachment/76744d2c4545b5a0b6efe92c6a44efb125b476e2.zip", "author": "Timon Willi;Johannes Treutlein;Alistair Letcher;Jakob Nicolaus Foerster", "authorids": "~Timon_Willi1;~Johannes_Treutlein1;~Alistair_Letcher1;~Jakob_Nicolaus_Foerster1", "gender": ";;M;M", "homepage": "https://www.timonwilli.com;;https://aletcher.github.io;https://www.jakobfoerster.com", "dblp": "243/3437;;;176/5095", "google_scholar": "Dn-udzAAAAAJ;;o28w0mwAAAAJ;6z4lQzMAAAAJ", "orcid": "0000-0003-4405-5700;;;", "linkedin": ";;;", "or_profile": "~Timon_Willi1;~Johannes_Treutlein1;~Alistair_Letcher1;~Jakob_Nicolaus_Foerster1", "aff": "University of Oxford, University of Oxford;;;University of Oxford, University of Oxford", "aff_domain": "eng.ox.ac.uk;;;eng.ox.ac.uk", "position": "PhD student;;;Associate Professor", "bibtex": "@misc{\nwilli2022cola,\ntitle={{COLA}: Consistent Learning with Opponent-Learning Awareness},\nauthor={Timon Willi and Johannes Treutlein and Alistair Letcher and Jakob Nicolaus Foerster},\nyear={2022},\nurl={https://openreview.net/forum?id=xbx7Hxjbd79}\n}", "github": "", "project": "", "reviewers": "ZQJ8;hj7q;QYEL;bkzQ", "site": "https://openreview.net/forum?id=xbx7Hxjbd79", "pdf_size": 0, "recommendation": "3;3;6;8", "confidence": "2;3;3;3", "correctness": "1;2;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "47;98;98;70", "wc_summary_review": "39;49;32;50", "wc_main_review": "296;339;271;92", "wc_review": "382;486;401;212", "wc_reply_reviewers": "0;406;0;0", "wc_reply_authors": "673;1497;563;119", "reply_reviewers": "0;2;0;0", "reply_authors": "3;6;2;1", "recommendation_avg": [ 5.0, 2.1213203435596424 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.25, 21.358546298847212 ], "wc_summary_review_avg": [ 42.5, 7.433034373659253 ], "wc_main_review_avg": [ 249.5, 94.1289009815795 ], "wc_review_avg": [ 370.25, 99.40416238769883 ], "wc_reply_reviewers_avg": [ 101.5, 175.80315696824104 ], "wc_reply_authors_avg": [ 713.0, 497.89356292284 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 3.0, 1.8708286933869707 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5443310539518174, "corr_recommendation_correctness": 
0.9486832980505139, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14450342073245803366&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "xdNcdoHdBER", "title": "Sneakoscope: Revisiting Unsupervised Out-of-Distribution Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The problem of detecting out-of-distribution (OOD) examples in neural networks has been widely studied in the literature, with state-of-the-art techniques being supervised in that they require fine-tuning on OOD data to achieve high-quality OOD detection. But supervised OOD detection methods also have a disadvantage in that they require expensive training on OOD data, curating the OOD dataset so that it is distinguishable from the in-distribution data, and significant hyper-parameter tuning. In this work, we propose a unified evaluation suite, Sneakoscope, to revisit the problem with in-depth exploration of unsupervised OOD detection. Our surprising discovery shows that (1) model architectures play a significant role in unsupervised OOD detection performance; (2) unsupervised approaches applied on large-scale pre-trained models can achieve competitive performance compared to their supervised counterparts; and (3) unsupervised OOD detection based on Mahalanobis Distance with the support of a pre-trained model consistently outperforms other unsupervised methods by a large margin and compares favorably with results from state-of-the-art supervised OOD detection methods reported in the literature. 
We thus provide new baselines for unsupervised OOD detection methods.", "keywords": "OOD Detection;Unsupervised Approaches;Model Confidence Calibration;Hidden Representation Analysis", "primary_area": "", "supplementary_material": "", "author": "Tianji Cong;Atul Prakash", "authorids": "~Tianji_Cong1;~Atul_Prakash1", "gender": "M;", "homepage": ";https://www.eecs.umich.edu/~aprakash", "dblp": ";p/AtulPrakash", "google_scholar": "3ToRqNMAAAAJ;kIkHa2IAAAAJ", "orcid": ";0000-0002-4907-3687", "linkedin": ";atul-prakash-8729a44/", "or_profile": "~Tianji_Cong1;~Atul_Prakash1", "aff": "University of Michigan;University of Michigan", "aff_domain": "umich.edu;umich.edu", "position": "PhD student;Professor", "bibtex": "@misc{\ncong2022sneakoscope,\ntitle={Sneakoscope: Revisiting Unsupervised Out-of-Distribution Detection},\nauthor={Tianji Cong and Atul Prakash},\nyear={2022},\nurl={https://openreview.net/forum?id=xdNcdoHdBER}\n}", "github": "", "project": "", "reviewers": "5kZ2;yrU5;s8vF;XDPo", "site": "https://openreview.net/forum?id=xdNcdoHdBER", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;5;5;4", "correctness": "3;2;4;2", "technical_novelty": "1;1;1;1", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "50;76;44;84", "wc_summary_review": "25;45;71;131", "wc_main_review": "297;954;926;376", "wc_review": "372;1075;1041;591", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "45;268;264;116", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.5, 0.5 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 1.0, 0.0 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 63.5, 16.874537030686206 ], "wc_summary_review_avg": [ 68.0, 39.8622628559895 ], "wc_main_review_avg": [ 638.25, 303.201562495974 ], "wc_review_avg": [ 769.75, 298.71003916842164 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 173.25, 96.09728143917496 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": -0.17407765595569782, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h0oMY5NRGv4J:scholar.google.com/&scioq=Sneakoscope:+Revisiting+Unsupervised+Out-of-Distribution+Detection&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "xf0B7-7MRo6", "title": "AIR-Net: Adaptive and Implicit Regularization Neural Network for matrix completion", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conventionally, the matrix completion (MC) model aims to recover a matrix from partially observed elements. Accurate recovery necessarily requires a regularization that properly encodes priors of the unknown matrix/signal. However, encoding the priors accurately for complex natural signals is difficult, and even then, the model might not generalize well outside the particular matrix type. This work combines adaptive and implicit low-rank regularization that captures the prior dynamically according to the current recovered matrix. 
Furthermore, we aim to answer the question: how does adaptive regularization affect implicit regularization? We utilize neural networks to represent Adaptive and Implicit Regularization and name the proposed model \textit{AIR-Net}. Theoretical analyses show that the adaptive part of the AIR-Net enhances implicit regularization. In addition, the adaptive regularizer vanishes at the end of training and thus avoids saturation issues. Numerical experiments on various data demonstrate the effectiveness of AIR-Net, especially when the locations of missing elements are not randomly chosen. With complete flexibility to select neural networks for matrix representation, AIR-Net can be extended to solve more general inverse problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhemin Li;Hongxia Wang", "authorids": "~Zhemin_Li1;~Hongxia_Wang1", "gender": "M;F", "homepage": ";https://www.nudt.edu.cn/xysz/wlxy/index.htm", "dblp": "190/0513;", "google_scholar": ";", "orcid": "0000-0001-8390-108X;", "linkedin": ";", "or_profile": "~Zhemin_Li1;~Hongxia_Wang1", "aff": "National University of Defense Technology;National University of Defense Technology", "aff_domain": "nudt.edu.cn;nudt.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@misc{\nli2022airnet,\ntitle={{AIR}-Net: Adaptive and Implicit Regularization Neural Network for matrix completion},\nauthor={Zhemin Li and Hongxia Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=xf0B7-7MRo6}\n}", "github": "", "project": "", "reviewers": "Untt;MBKQ;B6ib;Bmxi", "site": "https://openreview.net/forum?id=xf0B7-7MRo6", "pdf_size": 0, "recommendation": "3;5;5;5", "confidence": "4;4;3;3", "correctness": "3;4;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "68;78;114;81", "wc_summary_review": "44;29;38;47", "wc_main_review": "324;377;512;389", "wc_review": "436;484;664;517", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.25, 17.282577932704367 ], "wc_summary_review_avg": [ 39.5, 6.87386354243376 ], "wc_main_review_avg": [ 400.5, 68.86399639869879 ], "wc_review_avg": [ 525.25, 85.12747793750265 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0", "aff_unique_norm": "National University of Defense Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.nudt.edu.cn/", "aff_unique_abbr": "NUDT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "xiXOrugVHs", "title": "Long Document Summarization with Top-Down and Bottom-Up Representation Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Text summarization aims to condense long documents and retain key information. Critical to the success of a summarization model is the faithful inference of latent representations of words or tokens in the source documents. 
Most recent models infer the latent representations with a transformer encoder, which is purely bottom-up. Also, self-attention-based inference models face the challenge of quadratic complexity with respect to sequence length. We propose a principled inference framework to improve summarization models on these two aspects. Our framework assumes a hierarchical latent structure of a document where the top level captures the long-range dependency at a coarser time scale and the bottom token level preserves the details. Critically, this hierarchical structure enables token representations to be updated in both a bottom-up and top-down manner. In the bottom-up pass, token representations are inferred with local self-attention to leverage its efficiency. Top-down correction is then applied to allow tokens to capture long-range dependency. We demonstrate the effectiveness of the proposed framework on a diverse set of summarization datasets, including narrative, conversational, scientific documents and news. Our model achieves (1) competitive or better performance on short documents with higher memory and compute efficiency, compared to full attention transformers, and (2) state-of-the-art performance on a wide range of long document summarization benchmarks, compared to recent efficient transformers. We also show that our model can summarize an entire book and achieve competitive performance using $0.27\\%$ of the parameters (464M vs. 175B) and much less training data, compared to a recent GPT-3-based model. These results indicate the general applicability and benefits of the proposed framework. ", "keywords": "top-down inference;bottom-up inference;long document summarization", "primary_area": "", "supplementary_material": "", "author": "Bo Pang;Erik Nijkamp;Wojciech Maciej Kryscinski;Silvio Savarese;Yingbo Zhou;Caiming Xiong", "authorids": "~Bo_Pang1;~Erik_Nijkamp2;~Wojciech_Maciej_Kryscinski1;~Silvio_Savarese1;~Yingbo_Zhou1;~Caiming_Xiong1", "gender": "M;M;M;M;;M", "homepage": ";https://eriknijkamp.com/;;;;http://cmxiong.com/", "dblp": "16/6344;;;50/3578;72/8614;80/7282", "google_scholar": "s9fNEVEAAAAJ;;;ImpbxLsAAAAJ;H_6RQ7oAAAAJ;vaSdahkAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;yingbozhou/;caiming-xiong-150a1417", "or_profile": "~Bo_Pang1;~Erik_Nijkamp2;~Wojciech_Maciej_Kryscinski1;~Silvio_Savarese1;~Yingbo_Zhou1;~Caiming_Xiong1", "aff": "University of California, Los Angeles;University of California, Los Angeles;;Stanford University;Salesforce Research;Salesforce Research", "aff_domain": "ucla.edu;ucla.edu;;stanford.edu;salesforce.com;salesforce.com", "position": "PhD student;PhD student;;Adjunct Professor;Research Scientist;Research Scientist", "bibtex": "@misc{\npang2022long,\ntitle={Long Document Summarization with Top-Down and Bottom-Up Representation Inference},\nauthor={Bo Pang and Erik Nijkamp and Wojciech Maciej Kryscinski and Silvio Savarese and Yingbo Zhou and Caiming Xiong},\nyear={2022},\nurl={https://openreview.net/forum?id=xiXOrugVHs}\n}", "github": "", "project": "", "reviewers": "fnCT;Kugi;VzGN", "site": "https://openreview.net/forum?id=xiXOrugVHs", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;3", "correctness": "3;3;4", "technical_novelty": "2;2;2", "empirical_novelty": "3;2;3", "wc_summary_paper": "113;53;75", "wc_summary_review": "37;63;35", "wc_main_review": "190;245;259", "wc_review": "340;361;369", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "641;668;415", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 5.333333333333333, 
0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 80.33333333333333, 24.78350706058814 ], "wc_summary_review_avg": [ 45.0, 12.754084313139327 ], "wc_main_review_avg": [ 231.33333333333334, 29.78067979225607 ], "wc_review_avg": [ 356.6666666666667, 12.229290885229426 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 574.6666666666666, 113.43818679008503 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8nac2fWvJrsJ:scholar.google.com/&scioq=Long+Document+Summarization+with+Top-Down+and+Bottom-Up+Representation+Inference&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;2", "aff_unique_norm": "University of California, Los Angeles;Stanford University;Salesforce", "aff_unique_dep": ";;Salesforce Research", "aff_unique_url": "https://www.ucla.edu;https://www.stanford.edu;https://research.salesforce.com", "aff_unique_abbr": "UCLA;Stanford;Salesforce", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Los Angeles;Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Neural Network Priors Revisited", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6955", "id": "xkjqJYqRJy", "poster": "", "openreview": "https://openreview.net/forum?id=xkjqJYqRJy", "slides": "https://iclr.cc/virtual/2022/poster/6955", "video": "https://iclr.cc/virtual/2022/poster/6955", "author_site": "Vincent Fortuin, Adri\u00e0 Garriga-Alonso, Sebastian Ober, Florian Wenzel, Gunnar Ratsch, Richard E Turner, Mark van der Wilk, Laurence Aitchison", "tldr": "", "abstract": "Isotropic Gaussian priors are the de facto standard for modern Bayesian neural network inference. However, it is unclear whether these priors accurately reflect our true beliefs about the weight distributions or give optimal performance. To find better priors, we study summary statistics of neural network weights in networks trained using stochastic gradient descent (SGD). We find that convolutional neural network (CNN) and ResNet weights display strong spatial correlations, while fully connected networks (FCNNs) display heavy-tailed weight distributions. We show that building these observations into priors can lead to improved performance on a variety of image classification datasets. Surprisingly, these priors mitigate the cold posterior effect in FCNNs, but slightly increase the cold posterior effect in ResNets.", "keywords": "Bayesian deep learning;Bayesian neural networks;Priors", "primary_area": "", "supplementary_material": "", "author": "Vincent Fortuin;Adri\u00e0 Garriga-Alonso;Sebastian W. 
Ober;Florian Wenzel;Gunnar Ratsch;Richard E Turner;Mark van der Wilk;Laurence Aitchison", "authorids": "~Vincent_Fortuin1;~Adri\u00e0_Garriga-Alonso1;~Sebastian_W._Ober1;~Florian_Wenzel1;~Gunnar_Ratsch1;~Richard_E_Turner1;~Mark_van_der_Wilk1;~Laurence_Aitchison1", "gender": "M;;;M;M;M;M;", "homepage": "https://fortuin.github.io/;;;;http://bmi.inf.ethz.ch;https://rich-turner-group.github.io/;https://mvdw.uk;http://www.gatsby.ucl.ac.uk/~laurence/", "dblp": "218/7489;;;04/9709;https://dblp.uni-trier.de/pers/hd/r/R=auml=tsch:Gunnar;40/5352;142/2927;155/1918.html", "google_scholar": "https://scholar.google.ch/citations?user=XBlrYTIAAAAJ;;;;https://scholar.google.ch/citations?user=tQuQ1FwAAAAJ;https://scholar.google.co.uk/citations?user=DgLEyZgAAAAJ;PKcjcT4AAAAJ;", "orcid": "0000-0002-0640-2671;;;;0000-0001-5486-8532;;0000-0001-7947-6682;", "linkedin": "vincent-fortuin-42426b134/;;;;;;;", "or_profile": "~Vincent_Fortuin1;~Adri\u00e0_Garriga-Alonso1;~Sebastian_W._Ober1;~Florian_Wenzel1;~Gunnar_Ratsch1;~Richard_E_Turner1;~Mark_van_der_Wilk1;~Laurence_Aitchison1", "aff": "University of Cambridge;;;Amazon;Swiss Federal Institute of Technology;University of Cambridge;Imperial College London;University of Bristol", "aff_domain": "cam.ac.uk;;;amazon.com;ethz.ch;cam.ac.uk;imperial.ac.uk;bristol.ac.uk", "position": "Researcher;;;Researcher;Professor;Professor;Lecturer (Assistant Professor);Assistant Professor", "bibtex": "@inproceedings{\nfortuin2022bayesian,\ntitle={Bayesian Neural Network Priors Revisited},\nauthor={Vincent Fortuin and Adri{\\`a} Garriga-Alonso and Sebastian W. Ober and Florian Wenzel and Gunnar Ratsch and Richard E Turner and Mark van der Wilk and Laurence Aitchison},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xkjqJYqRJy}\n}", "github": "", "project": "", "reviewers": "CSpQ;yXaK;Ergj;sapj", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;5;4", "correctness": "1;3;2;4", "technical_novelty": "2;3;2;1", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "56;86;52;64", "wc_summary_review": "55;69;124;28", "wc_main_review": "111;297;145;619", "wc_review": "222;452;321;711", "wc_reply_reviewers": "708;0;275;0", "wc_reply_authors": "776;313;616;753", "reply_reviewers": "1;0;1;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 1.118033988749895 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 64.5, 13.143439428094915 ], "wc_summary_review_avg": [ 69.0, 35.00714212842859 ], "wc_main_review_avg": [ 293.0, 200.8233054204616 ], "wc_review_avg": [ 426.5, 183.3991548508335 ], "wc_reply_reviewers_avg": [ 245.75, 289.5327054064877 ], "wc_reply_authors_avg": [ 614.5, 184.50541997459044 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": 0.16012815380508713, "corr_recommendation_correctness": 0.8682431421244592, "gs_citation": 177, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4553297460189369768&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=xkjqJYqRJy", "email": "cam.ac.uk;;;amazon.com;ethz.ch;cam.ac.uk;imperial.ac.uk;bristol.ac.uk", "author_num": 8, "aff_unique_index": "0;1;2;0;3;4", "aff_unique_norm": "University of Cambridge;Amazon;Swiss Federal 
Institute of Technology;Imperial College London;University of Bristol", "aff_unique_dep": ";Amazon.com, Inc.;;;", "aff_unique_url": "https://www.cam.ac.uk;https://www.amazon.com;https://www.ethz.ch;https://www.imperial.ac.uk;https://www.bristol.ac.uk", "aff_unique_abbr": "Cambridge;Amazon;ETH Zurich;ICL;Bristol", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;2;0;0;0", "aff_country_unique": "United Kingdom;United States;Switzerland" }, { "title": "VICReg: Variance-Invariance-Covariance Regularization for Self-Supervised Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6481", "id": "xm6YD62D1Ub", "poster": "", "openreview": "https://openreview.net/forum?id=xm6YD62D1Ub", "slides": "https://iclr.cc/virtual/2022/poster/6481", "video": "https://iclr.cc/virtual/2022/poster/6481", "author_site": "Adrien Bardes, Jean Ponce, Yann LeCun", "tldr": "", "abstract": "Recent self-supervised methods for image representation learning maximize the agreement between embedding vectors produced by encoders fed with different views of the same image. The main challenge is to prevent a collapse in which the encoders produce constant or non-informative vectors. We introduce VICReg (Variance-Invariance-Covariance Regularization), a method that explicitly avoids the collapse problem with two regularization terms applied to both embeddings separately: (1) a term that maintains the variance of each embedding dimension above a threshold, (2) a term that decorrelates each pair of variables. Unlike most other approaches to the same problem, VICReg does not require techniques such as weight sharing between the branches, batch normalization, feature-wise normalization, output quantization, stop gradient, memory banks, etc., and achieves results on par with the state of the art on several downstream tasks. 
In addition, we show that our variance regularization term stabilizes the training of other methods and leads to performance improvements.", "keywords": "self-supervised learning;representation learning;computer vision", "primary_area": "", "supplementary_material": "", "author": "Adrien Bardes;Jean Ponce;Yann LeCun", "authorids": "~Adrien_Bardes1;~Jean_Ponce1;~Yann_LeCun1", "gender": "M;M;M", "homepage": ";http://www.di.ens.fr/~ponce/;http://yann.lecun.com", "dblp": "292/3848.html;p/JeanPonce;l/YannLeCun", "google_scholar": "SvRU8F8AAAAJ;https://scholar.google.com.tw/citations?user=vC2vywcAAAAJ;WLN3QrAAAAAJ", "orcid": ";;", "linkedin": "adrien-bardes-48a080129/;;", "or_profile": "~Adrien_Bardes1;~Jean_Ponce1;~Yann_LeCun1", "aff": "INRIA;INRIA;New York University", "aff_domain": "inria.fr;inria.fr;nyu.edu", "position": "PhD student;Research director;Full Professor", "bibtex": "@inproceedings{\nbardes2022vicreg,\ntitle={{VICR}eg: Variance-Invariance-Covariance Regularization for Self-Supervised Learning},\nauthor={Adrien Bardes and Jean Ponce and Yann LeCun},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xm6YD62D1Ub}\n}", "github": "", "project": "", "reviewers": "dMLo;8cmN;QN96;KA3G", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;4;3", "correctness": "3;4;3;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "2;3;3;3", "wc_summary_paper": "69;16;132;122", "wc_summary_review": "17;12;54;38", "wc_main_review": "305;314;281;214", "wc_review": "391;342;467;374", "wc_reply_reviewers": "0;0;0;51", "wc_reply_authors": "618;771;572;500", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 84.75, 46.35393726534996 ], "wc_summary_review_avg": [ 30.25, 16.82817577754642 ], "wc_main_review_avg": [ 278.5, 39.14396505209967 ], "wc_review_avg": [ 393.5, 45.93745748297352 ], "wc_reply_reviewers_avg": [ 12.75, 22.083647796503186 ], "wc_reply_authors_avg": [ 615.25, 99.27077868134208 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1327, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14326519942504966909&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "pdf": "https://openreview.net/pdf?id=xm6YD62D1Ub", "email": "inria.fr;inria.fr;nyu.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "INRIA;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.nyu.edu", "aff_unique_abbr": "INRIA;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "France;United States" }, { "id": "xmrtP-ADzk", "title": "Self-Supervised Learning for Binary Networks by Joint Classifier Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite the great success of self-supervised learning with large floating point networks, such networks are not readily deployable to edge devices.\nTo accelerate deployment of models to edge devices for various downstream tasks by unsupervised representation 
learning, we propose a self-supervised learning method for binary networks.\nIn particular, we propose to use a randomly initialized classifier attached to a pretrained floating point feature extractor as targets and jointly train it with a binary network.\nFor better training of the binary network, we propose a feature similarity loss, a dynamic balancing scheme of loss terms, and modified multi-stage training.\nWe call our method BSSL.\nOur empirical validations show that BSSL outperforms self-supervised learning baselines for binary networks in various downstream tasks and outperforms supervised pretraining in certain tasks.", "keywords": "binary networks;unsupervised representation learning", "primary_area": "", "supplementary_material": "", "author": "Dahyun Kim;Jonghyun Choi", "authorids": "~Dahyun_Kim1;~Jonghyun_Choi1", "gender": "M;M", "homepage": ";https://ppolon.github.io/", "dblp": "196/7883;21/11103", "google_scholar": "atD6Rs4AAAAJ;uiGWnm4AAAAJ", "orcid": ";0000-0002-7934-8434", "linkedin": "dahyun-kim-0a1711163;jonghyun-choi-459bb615/", "or_profile": "~Dahyun_Kim1;~Jonghyun_Choi1", "aff": "Gwangju Institute of Science and Technology;NAVER", "aff_domain": "gist.ac.kr;navercorp.com", "position": "MS student;AI Advisor Committee", "bibtex": "@misc{\nkim2022selfsupervised,\ntitle={Self-Supervised Learning for Binary Networks by Joint Classifier Training},\nauthor={Dahyun Kim and Jonghyun Choi},\nyear={2022},\nurl={https://openreview.net/forum?id=xmrtP-ADzk}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=xmrtP-ADzk", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8189482079097982660&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Gwangju Institute of Science and Technology;NAVER Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.gist.ac.kr;https://www.naver.com", "aff_unique_abbr": "GIST;NAVER", "aff_campus_unique_index": "0", "aff_campus_unique": "Gwangju;", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Neural Contextual Bandits with Deep Representation and Shallow Exploration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6858", "id": "xnYACQquaGV", "poster": "", "openreview": "https://openreview.net/forum?id=xnYACQquaGV", "slides": "https://iclr.cc/virtual/2022/poster/6858", "video": "https://iclr.cc/virtual/2022/poster/6858", "author_site": "Pan Xu, Zheng Wen, Handong Zhao, Quanquan Gu", "tldr": "", "abstract": "We study neural contextual bandits, a general class of contextual bandits, where each context-action pair 
is associated with a raw feature vector, but the specific reward generating function is unknown. We propose a novel learning algorithm that transforms the raw feature vector using the last hidden layer of a deep ReLU neural network (deep representation learning), and uses an upper confidence bound (UCB) approach to explore in the last linear layer (shallow exploration). We prove that under standard assumptions, our proposed algorithm achieves $\\tilde{O}(\\sqrt{T})$ finite-time regret, where $T$ is the learning time horizon. Compared with existing neural contextual bandit algorithms, our approach is computationally much more efficient since it only needs to explore in the last layer of the deep neural network.", "keywords": "neural network;deep representation learning", "primary_area": "", "supplementary_material": "/attachment/1468959f4251fd26e9ee75f2a76ef524622386c9.zip", "author": "Pan Xu;Zheng Wen;Handong Zhao;Quanquan Gu", "authorids": "~Pan_Xu1;~Zheng_Wen1;~Handong_Zhao3;~Quanquan_Gu1", "gender": "M;M;M;", "homepage": "https://panxulab.github.io/;http://zheng-wen.com/;http://web.cs.ucla.edu/~qgu/;https://hdzhao.github.io/", "dblp": "11/9718-2;;50/4597;79/8522", "google_scholar": "UkYBx6YAAAAJ;kK3qvd8AAAAJ;GU9HgNAAAAAJ;0f-YOFgAAAAJ", "orcid": "0000-0002-2559-8622;;;", "linkedin": "pan-xu-0931a2a6/;;;", "or_profile": "~Pan_Xu1;~Zheng_Wen1;~Quanquan_Gu1;~Handong_Zhao1", "aff": "California Institute of Technology;Google DeepMind;University of California, Los Angeles;Adobe Systems", "aff_domain": "caltech.edu;google.com;cs.ucla.edu;adobe.com", "position": "Postdoc;Research Scientist;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nxu2022neural,\ntitle={Neural Contextual Bandits with Deep Representation and Shallow Exploration},\nauthor={Pan Xu and Zheng Wen and Handong Zhao and Quanquan Gu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xnYACQquaGV}\n}", "github": "", "project": "", "reviewers": "hPGC;16Sn;hExw;pfBo", "pdf_size": 0, "recommendation": "3;6;8;8", "confidence": "4;4;4;3", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;2", "wc_summary_paper": "67;164;166;93", "wc_summary_review": "56;47;20;44", "wc_main_review": "808;159;312;218", "wc_review": "931;370;498;355", "wc_reply_reviewers": "760;0;141;0", "wc_reply_authors": "1469;434;482;302", "reply_reviewers": "2;0;1;0", "reply_authors": "4;2;2;1", "recommendation_avg": [ 6.25, 2.0463381929681126 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 122.5, 43.48850422812907 ], "wc_summary_review_avg": [ 41.75, 13.311179511974137 ], "wc_main_review_avg": [ 374.25, 256.3009705404956 ], "wc_review_avg": [ 538.5, 233.32434506497603 ], "wc_reply_reviewers_avg": [ 225.25, 314.0584141525267 ], "wc_reply_authors_avg": [ 671.75, 464.98729821361786 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.49374193110101877, "corr_recommendation_correctness": 0.0, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3287287562228379419&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=xnYACQquaGV", "email": 
"caltech.edu;google.com;cs.ucla.edu;adobe.com", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "California Institute of Technology;Google;University of California, Los Angeles;Adobe", "aff_unique_dep": ";Google DeepMind;;Adobe Systems Incorporated", "aff_unique_url": "https://www.caltech.edu;https://deepmind.com;https://www.ucla.edu;https://www.adobe.com", "aff_unique_abbr": "Caltech;DeepMind;UCLA;Adobe", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Pasadena;;Los Angeles", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "xo_5lb5ond", "title": "LEAN: graph-based pruning for convolutional neural networks by extracting longest chains", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network pruning techniques can substantially reduce the computational cost of applying convolutional neural networks (CNNs). Common pruning methods determine which convolutional filters to remove by ranking the filters individually, i.e., without taking into account their interdependence. In this paper, we advocate the viewpoint that pruning should consider the interdependence between series of consecutive operators. We propose the LongEst-chAiN (LEAN) method that prunes CNNs by using graph-based algorithms to select relevant chains of convolutions. A CNN is interpreted as a graph, with the operator norm of each operator as distance metric for the edges. LEAN pruning iteratively extracts the highest value path from the graph to keep. In our experiments, we test LEAN pruning on several image-to-image tasks, including the well-known CamVid dataset, and a real-world X-ray CT dataset. Results indicate that LEAN pruning can result in networks with similar accuracy but 3--20x fewer convolutional filters than networks pruned with methods that rank filters individually.", "keywords": "Pruning;Sparsity;Compression;Graph pruning", "primary_area": "", "supplementary_material": "/attachment/3c8410a37c31555697cf2b2b17b2a388c8a8f25b.zip", "author": "Richard Arnoud Schoonhoven;Allard Hendriksen;Daniel Pelt;Joost Batenburg", "authorids": "~Richard_Arnoud_Schoonhoven1;~Allard_Hendriksen1;~Daniel_Pelt1;~Joost_Batenburg1", "gender": "M;;;M", "homepage": ";https://www.cwi.nl/people/allard-hendriksen;;https://www.universiteitleiden.nl/en/staffmembers/joost-batenburg", "dblp": ";;140/0228;30/6502", "google_scholar": "A1_UXqsAAAAJ;;https://scholar.google.nl/citations?user=54V8aTMAAAAJ;https://scholar.google.com.tw/citations?user=6WpX7xUAAAAJ", "orcid": "0000-0003-3659-929X;0000-0002-3355-9551;;", "linkedin": "richard-schoonhoven-081a04107/;;;", "or_profile": "~Richard_Arnoud_Schoonhoven1;~Allard_Hendriksen1;~Daniel_Pelt1;~Joost_Batenburg1", "aff": "Centrum voor Wiskunde en Informatica;;Leiden University;Leiden University, Leiden University", "aff_domain": "cwi.nl;;leidenuniv.nl;liacs.leidenuniv.nl", "position": "PhD student;;Assistant Professor;Full Professor", "bibtex": "@misc{\nschoonhoven2022lean,\ntitle={{LEAN}: graph-based pruning for convolutional neural networks by extracting longest chains},\nauthor={Richard Arnoud Schoonhoven and Allard Hendriksen and Daniel Pelt and Joost Batenburg},\nyear={2022},\nurl={https://openreview.net/forum?id=xo_5lb5ond}\n}", "github": "", "project": "", "reviewers": "rCV2;ami3;xFHB;wyEq;XPch", "site": "https://openreview.net/forum?id=xo_5lb5ond", "pdf_size": 0, "recommendation": "3;5;5;5;5", "confidence": "4;3;4;3;3", "correctness": "3;3;3;3;3", "technical_novelty": "2;3;2;3;3", 
"empirical_novelty": "2;2;2;3;3", "wc_summary_paper": "162;24;52;28;65", "wc_summary_review": "76;57;33;16;26", "wc_main_review": "373;16;360;124;144", "wc_review": "611;97;445;168;235", "wc_reply_reviewers": "0;0;172;68;0", "wc_reply_authors": "801;381;811;439;610", "reply_reviewers": "0;0;1;1;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 4.6, 0.7999999999999999 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6, 0.4898979485566356 ], "empirical_novelty_avg": [ 2.4, 0.4898979485566356 ], "wc_summary_paper_avg": [ 66.2, 50.24101909794426 ], "wc_summary_review_avg": [ 41.6, 21.877842672439165 ], "wc_main_review_avg": [ 203.4, 140.1707530121744 ], "wc_review_avg": [ 311.2, 189.7244317424617 ], "wc_reply_reviewers_avg": [ 48.0, 67.36171019206684 ], "wc_reply_authors_avg": [ 608.4, 178.0736926106717 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.6123724356957948, "corr_recommendation_correctness": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11396526845377879204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Centrum voor Wiskunde en Informatica;Leiden University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cwi.nl/;https://www.leidenuniv.nl", "aff_unique_abbr": "CWI;LU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Leiden", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "id": "xp2D-1PtLc5", "title": "ClsVC: Learning Speech Representations with two different classification tasks.", "track": "main", "status": "Reject", "tldr": "", "abstract": "Voice conversion(VC) aims to convert one speaker's voice to generate a new speech as it is said by another speaker. Previous works focus on learning latent representation by applying two different encoders to learn content information and timbre information from the input speech respectively. However, whether they apply a bottleneck network or vector quantify technology, it is very difficult to perfectly separate the speaker and the content information from a speech signal. In this paper, we propose a novel voice conversion framework, 'ClsVC', to address this problem. It uses only one encoder to get both timbre and content information by dividing the latent space. Besides, some constraints are proposed to ensure the different part of latent space only contains separating content and timbre information respectively. We have shown the necessity to set these constraints, and we also experimentally prove that even if we change the division proportion of latent space, the content and timbre information will be always well separated. Experiments on the VCTK dataset show ClsVC is a state-of-the-art framework in terms of the naturalness and similarity of converted speech. 
", "keywords": "voice conversion;gradient reversal;adversarial learning;speech synthesis", "primary_area": "", "supplementary_material": "/attachment/7e442d5d42687678e188472d8d7955a01648118d.zip", "author": "Huaizhen Tang;xulong Zhang;Jianzong Wang;ning Cheng;Jing Xiao", "authorids": "~Huaizhen_Tang1;zhangxulong066@pingan.com.cn;~Jianzong_Wang2;chengning211@pingan.com.cn;~Jing_Xiao3", "gender": "M;;M;;M", "homepage": ";;https://largeaudiomodel.com/author/jianzong-wang/;;http://www.cs.cmu.edu/~jxiao/", "dblp": ";;70/8380;;67/4008-6.html", "google_scholar": ";;https://scholar.google.co.uk/citations?user=noi4qcUAAAAJ;;mcBd8KUAAAAJ", "orcid": "0000-0002-0414-8497;;0000-0002-9237-4231;;0000-0001-9615-4749", "linkedin": ";;;;jing-xiao-8653051/", "or_profile": "~Huaizhen_Tang1;zhangxulong066@pingan.com.cn;~Jianzong_Wang2;chengning211@pingan.com.cn;~Jing_Xiao3", "aff": "University of Science and Technology of China;;Pingan Technology;;Pingan Group", "aff_domain": "ustc.edu.cn;;pingan.com.cn;;pingan.com.cn", "position": "MS student;;Researcher;;Chief Scientist", "bibtex": "@misc{\nhuaizhen2022clsvc,\ntitle={Cls{VC}: Learning Speech Representations with two different classification tasks.},\nauthor={Huaizhen Tang and xulong Zhang and Jianzong Wang and ning Cheng and Jing Xiao},\nyear={2022},\nurl={https://openreview.net/forum?id=xp2D-1PtLc5}\n}", "github": "", "project": "", "reviewers": "fLnd;n7Fo;XqM2;DgA7", "site": "https://openreview.net/forum?id=xp2D-1PtLc5", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;4;4;3", "correctness": "2;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "67;79;70;52", "wc_summary_review": "45;60;76;109", "wc_main_review": "456;484;517;234", "wc_review": "568;623;663;395", "wc_reply_reviewers": "0;58;0;0", "wc_reply_authors": "0;0;400;0", "reply_reviewers": "0;1;0;0", "reply_authors": "0;0;1;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 67.0, 9.72111104761179 ], "wc_summary_review_avg": [ 72.5, 23.753947040439407 ], "wc_main_review_avg": [ 422.75, 111.09314785350175 ], "wc_review_avg": [ 562.25, 102.2823909575837 ], "wc_reply_reviewers_avg": [ 14.5, 25.11473670974872 ], "wc_reply_authors_avg": [ 100.0, 173.20508075688772 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14747833917239151150&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Science and Technology of China;PingAn Technology;Ping An Group", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.pingan.com;https://www.pingan.com.cn", "aff_unique_abbr": "USTC;;Ping An", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "xqt9fZmCTsP", "title": "Mining Multi-Label Samples from Single Positive Labels", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Conditional generative adversarial networks (cGANs) have shown superior results in class-conditional generation tasks. 
In order to simultaneously control multiple conditions, cGANs require multi-label training datasets, where multiple labels can be assigned to each data instance. Nevertheless, the tremendous annotation cost limits the accessibility of multi-label datasets in the real world. Hence, we explore the practical setting called the single positive setting, where each data instance is annotated by only one positive label with no explicit negative labels. To generate multi-label data in the single positive setting, we propose a novel sampling approach called single-to-multi-label (S2M) sampling, based on the Markov chain Monte Carlo method. As a widely applicable \u201cadd-on\u201d method, our proposed S2M sampling enables existing unconditional and conditional GANs to draw high-quality multi-label data with a minimal annotation cost. Extensive experiments on real image datasets (e.g., CIFAR-10 and CelebA) verify the effectiveness and correctness of our method, even when compared to a model trained with fully annotated datasets.", "keywords": "GANs;MCMC;sampling;conditional generation;single positive label;multi-label", "primary_area": "", "supplementary_material": "/attachment/33713e3be57431a14c00061947dce43e2dae3a80.zip", "author": "Youngin Cho;Daejin Kim;Jaegul Choo", "authorids": "~Youngin_Cho1;~Daejin_Kim1;~Jaegul_Choo1", "gender": "M;;M", "homepage": "https://github.com/choyi0521;https://github.com/kiddj;https://sites.google.com/site/jaegulchoo/", "dblp": ";;07/2074", "google_scholar": ";;GHJYsLEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Youngin_Cho1;~Daejin_Kim1;~Jaegul_Choo1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "MS student;MS student;Associate Professor", "bibtex": "@misc{\ncho2022mining,\ntitle={Mining Multi-Label Samples from Single Positive Labels},\nauthor={Youngin Cho and Daejin Kim and Jaegul Choo},\nyear={2022},\nurl={https://openreview.net/forum?id=xqt9fZmCTsP}\n}", "github": "", "project": "", "reviewers": "otto;C3av;AVVg;VQ65", "site": "https://openreview.net/forum?id=xqt9fZmCTsP", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;3;4;3", "correctness": "4;3;4;3", "technical_novelty": "4;2;3;3", "empirical_novelty": "2;2;0;2", "wc_summary_paper": "36;62;53;96", "wc_summary_review": "89;51;16;61", "wc_main_review": "297;304;195;279", "wc_review": "422;417;264;436", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 61.75, 21.867498713844707 ], "wc_summary_review_avg": [ 54.25, 26.10914590713377 ], "wc_main_review_avg": [ 268.75, 43.54523510098436 ], "wc_review_avg": [ 384.75, 70.06202609117153 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7411788920839631932&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0",
"aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "id": "xs-tJn58XKv", "title": "Learning Stable Classifiers by Transferring Unstable Features", "track": "main", "status": "Reject", "tldr": "", "abstract": "While unbiased machine learning models are essential for many applications, bias is a human-defined concept that can vary across tasks. Given only input-label pairs, algorithms may lack sufficient information to distinguish stable (causal) features from unstable (spurious) features. However, related tasks often share similar biases -- an observation we may leverage to develop stable classifiers in the transfer setting. In this work, we explicitly inform the target classifier about unstable features in the source tasks. Specifically, we derive a representation that encodes the unstable features by contrasting different data environments in the source task. We achieve robustness by clustering data of the target task according to this representation and minimizing the worst-case risk across these clusters. We evaluate our method on both text and image classifications. Empirical results demonstrate that our algorithm is able to maintain robustness on the target task, outperforming the best baseline by 22.9% in absolute accuracy across 12 transfer settings. Our code and data will be publicly available.", "keywords": "transfer learning;spurious correlation;invariant learning", "primary_area": "", "supplementary_material": "/attachment/e36c0c691f95aa11e230e32f8d820858bd3526da.zip", "author": "Yujia Bao;Shiyu Chang;Regina Barzilay", "authorids": "~Yujia_Bao1;~Shiyu_Chang2;~Regina_Barzilay1", "gender": "M;Unspecified;female", "homepage": "https://people.csail.mit.edu/yujia/;http://people.csail.mit.edu/chang87/;https://www.regina.csail.mit.edu/", "dblp": "214/4122;28/9988;b/ReginaBarzilay", "google_scholar": "https://scholar.google.com/citations?authorid=Ee4Peu4AAAAJ;r21asW4AAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yujia_Bao1;~Shiyu_Chang2;~Regina_Barzilay1", "aff": "Massachusetts Institute of Technology;University of California, Santa Barbara;Massachusetts Institute of Technology", "aff_domain": "mit.edu;ucsb.edu;mit.edu", "position": "PhD student;Assistant Professor;Professor", "bibtex": "@misc{\nbao2022learning,\ntitle={Learning Stable Classifiers by Transferring Unstable Features},\nauthor={Yujia Bao and Shiyu Chang and Regina Barzilay},\nyear={2022},\nurl={https://openreview.net/forum?id=xs-tJn58XKv}\n}", "github": "", "project": "", "reviewers": "RJhJ;J8M5;tPxR;L5Kn;hKdn", "site": "https://openreview.net/forum?id=xs-tJn58XKv", "pdf_size": 0, "recommendation": "3;3;6;6;6", "confidence": "4;4;5;4;4", "correctness": "4;4;4;3;3", "technical_novelty": "2;2;2;2;2", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "55;35;247;159;231", "wc_summary_review": "46;26;156;4;202", "wc_main_review": "208;386;321;433;784", "wc_review": "309;447;724;596;1217", "wc_reply_reviewers": "0;405;187;55;271", "wc_reply_authors": "642;1608;504;605;407", "reply_reviewers": "0;1;2;1;1", "reply_authors": "1;3;2;1;1", "recommendation_avg": [ 4.8, 1.469693845669907 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 3.6, 0.4898979485566356 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], 
"wc_summary_paper_avg": [ 145.4, 87.40160181598505 ], "wc_summary_review_avg": [ 86.8, 77.81619368743243 ], "wc_main_review_avg": [ 426.4, 194.09337958827962 ], "wc_review_avg": [ 658.6, 312.09011519110953 ], "wc_reply_reviewers_avg": [ 183.6, 146.22940880684706 ], "wc_reply_authors_avg": [ 753.2, 435.19347421577913 ], "reply_reviewers_avg": [ 1.0, 0.6324555320336759 ], "reply_authors_avg": [ 1.6, 0.8 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4082482904638631, "corr_recommendation_correctness": -0.6666666666666666, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13001665395610981653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of California, Santa Barbara", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.ucsb.edu", "aff_unique_abbr": "MIT;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "xspalMXAB0M", "title": "A Boosting Approach to Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study efficient algorithms for reinforcement learning in Markov decision processes, whose complexity is independent of the number of states. This formulation succinctly captures large scale problems, but is also known to be computationally hard in its general form.\n Previous approaches attempt to circumvent the computational hardness by assuming structure in either transition function or the value function, or by relaxing the solution guarantee to a local optimality condition.\n\n We consider the methodology of boosting, borrowed from supervised learning, for converting weak learners into an effective policy. The notion of weak learning we study is that of sampled-based approximate optimization of linear functions over policies. Under this assumption of weak learnability, we give an efficient algorithm that is capable of improving the accuracy of such weak learning methods iteratively. We prove sample complexity and running time bounds on our method, that are polynomial in the natural parameters of the problem: approximation guarantee, discount factor, distribution mismatch and number of actions. In particular, our bound does not explicitly depend on the number of states.\n\n A technical difficulty in applying previous boosting results, is that the value function over policy space is not convex. 
We show how to use a non-convex variant of the Frank-Wolfe method, coupled with recent advances in gradient boosting that allow incorporating a weak learner with a multiplicative approximation guarantee, to overcome the non-convexity and attain global optimality guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/18fa10b3e44c8cd78fe153531e10dc508854b19c.zip", "author": "Nataly Brukhim;Elad Hazan;Karan Singh", "authorids": "~Nataly_Brukhim1;~Elad_Hazan1;~Karan_Singh1", "gender": ";M;M", "homepage": "https://www.cs.princeton.edu/~nbrukhim/;https://www.ehazan.com;https://i-am-karan-singh.github.io/", "dblp": "215/3691;72/739;00/505", "google_scholar": "https://scholar.google.com/citations?hl=en;LnhCGNMAAAAJ;PZJIgZUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Nataly_Brukhim1;~Elad_Hazan1;~Karan_Singh1", "aff": "Princeton University;Princeton University;Microsoft Research", "aff_domain": "princeton.edu;princeton.edu;microsoft.com", "position": "PhD student;Full Professor;Postdoc", "bibtex": "@misc{\nbrukhim2022a,\ntitle={A Boosting Approach to Reinforcement Learning},\nauthor={Nataly Brukhim and Elad Hazan and Karan Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=xspalMXAB0M}\n}", "github": "", "project": "", "reviewers": "aFPg;Zawz;LN8M;8LJa;Kp4W", "site": "https://openreview.net/forum?id=xspalMXAB0M", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;3;3;3;4", "correctness": "3;3;3;4;4", "technical_novelty": "2;3;3;3;4", "empirical_novelty": "1;1;3;0;0", "wc_summary_paper": "47;43;32;70;81", "wc_summary_review": "33;15;78;37;13", "wc_main_review": "180;717;131;646;138", "wc_review": "260;775;241;753;232", "wc_reply_reviewers": "0;79;0;0;0", "wc_reply_authors": "371;263;162;227;141", "reply_reviewers": "0;1;0;0;0", "reply_authors": "1;1;1;1;1", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.4, 0.4898979485566356 ], "correctness_avg": [ 3.4, 0.4898979485566356 ], "technical_novelty_avg": [ 3.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 1.0, 1.0954451150103321 ], "wc_summary_paper_avg": [ 54.6, 18.095303258028036 ], "wc_summary_review_avg": [ 35.2, 23.412817002659036 ], "wc_main_review_avg": [ 362.4, 262.04625545884073 ], "wc_review_avg": [ 452.2, 254.83908648400072 ], "wc_reply_reviewers_avg": [ 15.8, 31.6 ], "wc_reply_authors_avg": [ 232.8, 81.83495585628431 ], "reply_reviewers_avg": [ 0.2, 0.4000000000000001 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.372677996249965, "corr_recommendation_correctness": 0.74535599249993, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17707409746157365618&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff_unique_index": "0;0;1", "aff_unique_norm": "Princeton University;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.princeton.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Princeton;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "xtZXWpXVbiK", "title": "Flow-based Recurrent Belief State Learning for POMDPs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Partially Observable Markov Decision Process (POMDP) provides a principled and generic framework to model real-world sequential decision-making processes yet remains unsolved, especially for high-dimensional
continuous spaces and unknown models. The main challenge lies in how to accurately obtain the belief state, which is the probability distribution over the unobservable environment states given historical information. Accurately calculating this belief state is a precondition for obtaining an optimal policy of POMDPs. Recent advances in deep learning techniques show great potential to learn good belief states, but they assume the belief states follow certain types of simple distributions such as a diagonal Gaussian, which imposes strong restrictions on precisely capturing the real belief states. In this paper, we introduce the \\textbf{F}l\\textbf{O}w-based \\textbf{R}ecurrent \\textbf{BE}lief \\textbf{S}tate model (FORBES), which incorporates normalizing flows into the variational inference to learn general continuous belief states for POMDPs. Furthermore, we show that the learned belief states can be plugged into downstream RL algorithms to improve performance. In experiments, we show that our methods successfully capture the complex belief states that enable multi-modal predictions as well as high-quality reconstructions, and results on challenging visual-motor control tasks show that our method achieves superior performance and sample efficiency.", "keywords": "Partially Observable Markov Decision Process;POMDP;Model-based Reinforcement Learning;Visual Control Task", "primary_area": "", "supplementary_material": "/attachment/34f23e771a3a9334aa4c75941a2f0e068c3c0753.zip", "author": "Xiaoyu Chen;Yao Mu;Ping Luo;Shengbo Eben Li;Jianyu Chen", "authorids": "~Xiaoyu_Chen4;~Yao_Mu1;~Ping_Luo2;~Shengbo_Eben_Li2;~Jianyu_Chen1", "gender": ";M;M;M;", "homepage": "https://github.com/Cospui;https://yaomarkmu.github.io/;http://www.idlab-tsinghua.com/thulab/labweb/dpeople.html?11;http://people.iiis.tsinghua.edu.cn/~jychen/;http://luoping.me/", "dblp": ";260/0674;;;54/4989-2.html", "google_scholar": ";;Dxiw1K8AAAAJ;;https://scholar.google.com.hk/citations?hl=en", "orcid": ";;;;0000-0002-6685-7950", "linkedin": ";;;;", "or_profile": "~Xiaoyu_Chen4;~Yao_Mu1;~Shengbo_Eben_Li2;~Jianyu_Chen1;~Luo_Ping2", "aff": "Tsinghua University;The University of Hong Kong;Tsinghua University;Tsinghua University;The University of Hong Kong", "aff_domain": "tsinghua.edu.cn;hku.hk;tsinghua.edu.cn;tsinghua.edu.cn;hku.hk", "position": "Graduate student;PhD student;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nchen2022flowbased,\ntitle={Flow-based Recurrent Belief State Learning for {POMDP}s},\nauthor={Xiaoyu Chen and Yao Mu and Ping Luo and Shengbo Eben Li and Jianyu Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=xtZXWpXVbiK}\n}", "github": "", "project": "", "reviewers": "8rin;DDeH;9het;CAct", "site": "https://openreview.net/forum?id=xtZXWpXVbiK", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "4;4;2;4", "correctness": "4;3;3;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "124;37;91;80", "wc_summary_review": "200;13;72;39", "wc_main_review": "761;729;603;369", "wc_review": "1085;779;766;488", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.0, 31.10466202999158 ], "wc_summary_review_avg": [ 81.0, 71.81573643707902
], "wc_main_review_avg": [ 615.5, 154.0868261727783 ], "wc_review_avg": [ 779.5, 211.23742566126865 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5528357059377876910&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "Tsinghua University;University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.hku.hk", "aff_unique_abbr": "THU;HKU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "xw04RdwI2kS", "title": "Inverse Contextual Bandits: Learning How Behavior Evolves over Time", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding a decision-maker's priorities by observing their behavior is critical for transparency and accountability in decision processes\u2014such as in healthcare. Though conventional approaches to policy learning almost invariably assume stationarity in behavior, this is hardly true in practice: Medical practice is constantly evolving as clinical professionals fine-tune their knowledge over time. For instance, as the medical community's understanding of organ transplantations has progressed over the years, a pertinent question is: How have actual organ allocation policies been evolving? To give an answer, we desire a policy learning method that provides interpretable representations of decision-making, in particular capturing an agent's non-stationary knowledge of the world, as well as operating in an offline manner. First, we model the evolving behavior of decision-makers in terms of contextual bandits, and formalize the problem of Inverse Contextual Bandits (\"ICB''). Second, we propose two concrete algorithms as solutions, learning parametric and non-parametric representations of an agent's behavior. 
Finally, using both real and simulated data for liver transplantations, we illustrate the applicability and explainability of our method, and benchmark and validate the accuracy of our algorithms.", "keywords": "inverse contextual bandits;understanding decision-making", "primary_area": "", "supplementary_material": "/attachment/0b7f158e2e37edeec0c26bb5427d810b12890e84.zip", "author": "Alihan H\u00fcy\u00fck;Daniel Jarrett;Mihaela van der Schaar", "authorids": "~Alihan_H\u00fcy\u00fck1;~Daniel_Jarrett1;~Mihaela_van_der_Schaar2", "gender": ";;F", "homepage": ";https://danieljarrett.github.io;https://www.vanderschaar-lab.com", "dblp": "227/2296;230/8183;", "google_scholar": "EMq6KwMAAAAJ;Pczk-PQAAAAJ;DZ3S--MAAAAJ", "orcid": ";0000-0002-2204-6515;", "linkedin": ";danjarrett/;", "or_profile": "~Alihan_H\u00fcy\u00fck1;~Daniel_Jarrett1;~Mihaela_van_der_Schaar2", "aff": "University of Cambridge;University of Cambridge;University of California, Los Angeles", "aff_domain": "cam.ac.uk;cam.ac.uk;ucla.edu", "position": "PhD student;Ph.D.;Full Professor", "bibtex": "@misc{\nh{\\\"u}y{\\\"u}k2022inverse,\ntitle={Inverse Contextual Bandits: Learning How Behavior Evolves over Time},\nauthor={Alihan H{\\\"u}y{\\\"u}k and Daniel Jarrett and Mihaela van der Schaar},\nyear={2022},\nurl={https://openreview.net/forum?id=xw04RdwI2kS}\n}", "github": "", "project": "", "reviewers": "79eV;G5nF;pmos;NLig", "site": "https://openreview.net/forum?id=xw04RdwI2kS", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;2;3", "correctness": "4;3;4;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "25;121;80;119", "wc_summary_review": "69;82;55;22", "wc_main_review": "767;590;103;99", "wc_review": "861;793;238;240", "wc_reply_reviewers": "0;504;0;0", "wc_reply_authors": "4352;4118;135;823", "reply_reviewers": "0;1;0;0", "reply_authors": "11;12;3;4", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 86.25, 38.957508903932755 ], "wc_summary_review_avg": [ 57.0, 22.34949663862701 ], "wc_main_review_avg": [ 389.75, 295.4567438729399 ], "wc_review_avg": [ 533.0, 294.98220285298567 ], "wc_reply_reviewers_avg": [ 126.0, 218.23840175367854 ], "wc_reply_authors_avg": [ 2357.0, 1895.4937351518734 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 7.5, 4.031128874149275 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.9045340337332909, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14503123552543798761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;UCLA", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Cambridge;Los Angeles", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "xwAw8QZkpWZ", "title": "SAFER: Data-Efficient and Safe Reinforcement Learning Through Skill Acquisition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Though many reinforcement learning (RL) problems involve learning policies in settings with hard-to-specify
safety constraints and sparse rewards, current methods struggle to rapidly and safely acquire successful policies. Behavioral priors, which extract useful policy primitives for learning from offline datasets, have recently shown considerable promise at accelerating RL in more complex problems. However, we discover that current behavioral priors may not be well-equipped for safe policy learning, and in some settings, may promote unsafe behavior, due to their tendency to ignore data from undesirable behaviors. To overcome these issues, we propose SAFEty skill pRiors (SAFER), a behavioral prior learning algorithm that accelerates policy learning on complex control tasks under safety constraints. Through principled contrastive training on safe and unsafe data, SAFER learns to extract a safety variable from offline data that encodes safety requirements, as well as the safe primitive skills over abstract actions in different scenarios. In the inference stage, SAFER composes a safe and successful policy from the safety skills according to the inferred safety variable and abstract action. We demonstrate its effectiveness on several complex safety-critical robotic grasping tasks inspired by the game Operation, in which SAFER not only outperforms baseline methods in learning successful policies but also enforces safety more effectively.", "keywords": "safety;reinforcement learning;behavioral priors;skill primitives", "primary_area": "", "supplementary_material": "/attachment/e748cc6491f547972a518a47019a42e598a1aca4.zip", "author": "Dylan Z Slack;Yinlam Chow;Bo Dai;Nevan Wichers", "authorids": "~Dylan_Z_Slack1;~Yinlam_Chow1;~Bo_Dai1;~Nevan_Wichers1", "gender": "M;M;;M", "homepage": "https://dylanslacks.website;;https://bo-dai.github.io/;", "dblp": "https://dblp.org/pers/s/Slack:Dylan.html;146/7869;64/2903;", "google_scholar": "pyhz-gUAAAAJ;;TIKl_foAAAAJ;", "orcid": ";;0009-0002-8070-574X;", "linkedin": ";;;nevanwichers", "or_profile": "~Dylan_Z_Slack1;~Yinlam_Chow1;~Bo_Dai1;~Nevan_Wichers1", "aff": "University of California, Irvine;Google Research;Google Brain;Google", "aff_domain": "uci.edu;google.com;google.com;google.com", "position": "PhD student;Research Scientist;Research Scientist;Software Engineer", "bibtex": "@misc{\nslack2022safer,\ntitle={{SAFER}: Data-Efficient and Safe Reinforcement Learning Through Skill Acquisition},\nauthor={Dylan Z Slack and Yinlam Chow and Bo Dai and Nevan Wichers},\nyear={2022},\nurl={https://openreview.net/forum?id=xwAw8QZkpWZ}\n}", "github": "", "project": "", "reviewers": "dr7N;31VF;GszC;LYgh", "site": "https://openreview.net/forum?id=xwAw8QZkpWZ", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "4;4;3;4", "correctness": "2;4;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;4", "wc_summary_paper": "66;85;450;192", "wc_summary_review": "54;19;90;38", "wc_main_review": "690;148;257;840", "wc_review": "810;252;797;1070", "wc_reply_reviewers": "874;0;0;122", "wc_reply_authors": "2142;960;624;1149", "reply_reviewers": "2;0;0;2", "reply_authors": "4;2;1;3", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 198.25, 153.07902370997797 ], "wc_summary_review_avg": [ 50.25, 26.080404521402652 ], "wc_main_review_avg": [ 483.75, 288.7891748317447 ], "wc_review_avg": [ 732.25, 297.8895558759991 ],
"wc_reply_reviewers_avg": [ 249.0, 364.2650134174294 ], "wc_reply_authors_avg": [ 1218.75, 565.2288806315544 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 2.5, 1.118033988749895 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.16012815380508713, "corr_recommendation_correctness": 0.39223227027636803, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AwCNsP5yhNMJ:scholar.google.com/&scioq=SAFER:+Data-Efficient+and+Safe+Reinforcement+Learning+Through+Skill+Acquisition&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of California, Irvine;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.uci.edu;https://research.google", "aff_unique_abbr": "UCI;Google Research", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Irvine;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "xxU6qGx-2ew", "title": "Gaussian Differential Privacy Transformation: from identification to application", "track": "main", "status": "Reject", "tldr": "", "abstract": "Gaussian differential privacy (GDP) is a single-parameter family of privacy notions that provides coherent guarantees to avoid the exposure of individuals from machine learning models. Relative to traditional $(\\epsilon,\\delta)$-differential privacy (DP), GDP is more interpretable and tightens the bounds given by standard DP composition theorems. In this paper, we start with an exact privacy profile characterization of $(\\epsilon,\\delta)$-DP and then define an efficient, tractable, and visualizable tool, called the Gaussian differential privacy transformation (GDPT). With theoretical property of the GDPT, we develop an easy-to-verify criterion to characterize and identify GDP algorithms. Based on our criterion, an algorithm is GDP if and only if an asymptotic condition on its privacy profile is met. By development of numerical properties of the GDPT, we give a method to narrow down possible values of an optimal privacy measurement $\\mu$ with an arbitrarily small and quantifiable margin of error. As applications of our newly developed tools, we revisit some established \\ed-DP algorithms and find that their utility can be improved. We additionally make a comparison between two single-parameter families of privacy notions, $\\epsilon$-DP and $\\mu$-GDP. 
Lastly, we use the GDPT to examine the effect of subsampling under the GDP framework.", "keywords": "differential privacy;gaussian differential privacy;privacy profile", "primary_area": "", "supplementary_material": "", "author": "Yi Liu;Ke Sun;Bei Jiang;Linglong Kong", "authorids": "~Yi_Liu13;~Ke_Sun6;~Bei_Jiang1;~Linglong_Kong2", "gender": "M;M;F;M", "homepage": "https://apps.ualberta.ca/directory/person/yliu16;https://sites.google.com/view/kesun;https://www.ualberta.ca/~bei1;https://www.ualberta.ca/~lkong", "dblp": "97/4626-62;69/476-13;190/4697;35/8525", "google_scholar": ";lYdNhFQAAAAJ;https://scholar.google.ca/citations?user=MfOZ8G0AAAAJ;https://scholar.google.ca/citations?hl=en", "orcid": ";;0000-0002-0033-839X;0000-0003-3011-9216", "linkedin": ";;;", "or_profile": "~Yi_Liu13;~Ke_Sun6;~Bei_Jiang1;~Linglong_Kong2", "aff": "University of Alberta;University of Alberta;University of Alberta;University of Alberta", "aff_domain": "ualberta.ca;ualberta.ca;ualberta.ca;ualberta.ca", "position": "PhD student;PhD student;Associate Professor;Associate Professor", "bibtex": "@misc{\nliu2022gaussian,\ntitle={Gaussian Differential Privacy Transformation: from identification to application},\nauthor={Yi Liu and Ke Sun and Bei Jiang and Linglong Kong},\nyear={2022},\nurl={https://openreview.net/forum?id=xxU6qGx-2ew}\n}", "github": "", "project": "", "reviewers": "ZW49;H1fA;RPM2;qUya", "site": "https://openreview.net/forum?id=xxU6qGx-2ew", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "5;3;4;3", "correctness": "4;4;3;4", "technical_novelty": "1;2;2;4", "empirical_novelty": "0;2;0;0", "wc_summary_paper": "22;371;86;108", "wc_summary_review": "28;52;51;36", "wc_main_review": "162;631;944;289", "wc_review": "212;1054;1081;433", "wc_reply_reviewers": "0;50;78;186", "wc_reply_authors": "1050;1451;1264;1151", "reply_reviewers": "0;1;1;1", "reply_authors": "2;3;2;3", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 1.0897247358851685 ], "empirical_novelty_avg": [ 0.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 146.75, 133.2692293817294 ], "wc_summary_review_avg": [ 41.75, 10.158124826955023 ], "wc_main_review_avg": [ 506.5, 305.3248270285271 ], "wc_review_avg": [ 695.0, 380.72627962881677 ], "wc_reply_reviewers_avg": [ 78.5, 68.06430782723056 ], "wc_reply_authors_avg": [ 1229.0, 148.85731423077604 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.5, 0.5 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.899228803025897, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:v0qdKGycdzEJ:scholar.google.com/&scioq=Gaussian+Differential+Privacy+Transformation:+from+identification+to+application&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Alberta", "aff_unique_dep": "", "aff_unique_url": "https://www.ualberta.ca", "aff_unique_abbr": "UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "id": "xxyTjJFzy3C", "title": "Contrastive Learning of 3D Shape Descriptor with Dynamic Adversarial Views", "track": "main", "status": "Reject", "tldr": "", "abstract": "View-based deep learning models have shown the capability to learn 3D shape descriptors with superior 
performance on 3D shape recognition, classification, and retrieval. Most popular techniques often leverage the class label to train deep neural networks under supervision to learn to extract 3D deep representations by aggregating information from a static and pre-selected set of different views used for all shapes. Those approaches, however, often face challenges posed by the requirement of a large amount of annotated training data and the lack of a mechanism for the adaptive selection of shape-instance-dependent views towards the learning of more informative 3D shape representations. This paper addresses those two challenging issues by introducing the concept of adversarial views and developing a new mechanism to generate views for adversarial training of a self-supervised contrastive model for 3D shape descriptors, denoted as CoLAV. In particular, compared to recent advances in multi-view approaches, our proposed CoLAV gains advantages by leveraging contrastive learning techniques for self-supervised learning of 3D shape representations without the need for labeled data. In addition, CoLAV introduces a novel mechanism for the dynamic generation of shape-instance-dependent adversarial views as positive pairs to adversarially train robust contrastive learning models towards the learning of more informative 3D shape representations. Comprehensive experimental results on 3D shape classification demonstrate that the 3D shape descriptor learned by CoLAV exhibits superior performance for 3D shape recognition over other state-of-the-art techniques, even though CoLAV is completely self-trained with unlabeled 3D datasets (e.g., ModelNet40).", "keywords": "Dynamic multi-views", "primary_area": "", "supplementary_material": "", "author": "Shuaihang Yuan;Yi Fang", "authorids": "~Shuaihang_Yuan1;~Yi_Fang2", "gender": "M;M", "homepage": ";http://mmvc.engineering.nyu.edu/", "dblp": "257/3707;96/361-6", "google_scholar": "s2YA4rEAAAAJ;j-cyhzwAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Shuaihang_Yuan1;~Yi_Fang2", "aff": "New York University;New York University", "aff_domain": "nyu.edu;nyu.edu", "position": "PhD student;Associate Professor", "bibtex": "@misc{\nyuan2022contrastive,\ntitle={Contrastive Learning of 3D Shape Descriptor with Dynamic Adversarial Views},\nauthor={Shuaihang Yuan and Yi Fang},\nyear={2022},\nurl={https://openreview.net/forum?id=xxyTjJFzy3C}\n}", "github": "", "project": "", "reviewers": "6v1k;aAcK;GNMf;s4iu", "site": "https://openreview.net/forum?id=xxyTjJFzy3C", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;4;3;4", "correctness": "2;3;3;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "125;60;59;71", "wc_summary_review": "9;27;37;70", "wc_main_review": "272;708;322;359", "wc_review": "406;795;418;500", "wc_reply_reviewers": "0;0;20;53", "wc_reply_authors": "547;1296;182;577", "reply_reviewers": "0;0;1;1", "reply_authors": "1;2;2;2", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 78.75, 27.11434122378783 ], "wc_summary_review_avg": [ 35.75, 22.174027599874588 ], "wc_main_review_avg": [ 415.25, 171.81585345945234 ], "wc_review_avg": [ 529.75, 157.35687941745667 ], "wc_reply_reviewers_avg": [ 18.25, 21.660736367907717 ], "wc_reply_authors_avg": [ 650.5, 403.8183378698892 ], "reply_reviewers_avg": [ 0.5, 0.5 ],
"reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.8660254037844386, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qW0HzC4hxtMJ:scholar.google.com/&scioq=Contrastive+Learning+of+3D+Shape+Descriptor+with+Dynamic+Adversarial+Views&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Communication-Efficient Actor-Critic Methods for Homogeneous Markov Games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6817", "id": "xy_2w3J3kH", "poster": "", "openreview": "https://openreview.net/forum?id=xy_2w3J3kH", "slides": "https://iclr.cc/virtual/2022/poster/6817", "video": "https://iclr.cc/virtual/2022/poster/6817", "author_site": "Dingyang Chen, Yile Li, Qi Zhang", "tldr": "", "abstract": "Recent success in cooperative multi-agent reinforcement learning (MARL) relies on centralized training and policy sharing. Centralized training eliminates the issue of non-stationarity MARL yet induces large communication costs, and policy sharing is empirically crucial to efficient learning in certain tasks yet lacks theoretical justification. In this paper, we formally characterize a subclass of cooperative Markov games where agents exhibit a certain form of homogeneity such that policy sharing provably incurs no suboptimality. This enables us to develop the first consensus-based decentralized actor-critic method where the consensus update is applied to both the actors and the critics while ensuring convergence. 
We also develop practical algorithms based on our decentralized actor-critic method to reduce the communication cost during training, while still yielding policies comparable with centralized training.", "keywords": "multi-agent reinforcement learning;multi-agent communication", "primary_area": "", "supplementary_material": "", "author": "Dingyang Chen;Yile Li;Qi Zhang", "authorids": "~Dingyang_Chen1;351614132@qq.com;~Qi_Zhang12", "gender": "M;;M", "homepage": "https://dchen48.github.io/;;https://qizhg.github.io/", "dblp": "212/7542-1.html;;https://dblp.uni-trier.de/pers/hd/z/Zhang_0038:Qi", "google_scholar": "vSdOGREAAAAJ;;wJNQVS0AAAAJ", "orcid": ";;", "linkedin": "dingyang-chen-97512712a/;;", "or_profile": "~Dingyang_Chen1;351614132@qq.com;~Qi_Zhang12", "aff": "University of South Carolina;;University of South Carolina", "aff_domain": "sc.edu;;sc.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nchen2022communicationefficient,\ntitle={Communication-Efficient Actor-Critic Methods for Homogeneous Markov Games},\nauthor={Dingyang Chen and Yile Li and Qi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=xy_2w3J3kH}\n}", "github": "", "project": "", "reviewers": "HWuo;eGTR;WXRa;smsi", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;5;3", "correctness": "2;3;3;4", "technical_novelty": "3;3;2;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "64;140;74;45", "wc_summary_review": "71;62;28;47", "wc_main_review": "828;1038;348;171", "wc_review": "963;1240;450;263", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.75, 35.75874019033668 ], "wc_summary_review_avg": [ 52.0, 16.294170736800325 ], "wc_main_review_avg": [ 596.25, 350.4699523496986 ], "wc_review_avg": [ 729.0, 390.7921442403877 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13990146209747042133&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "pdf": "https://openreview.net/pdf?id=xy_2w3J3kH", "email": "sc.edu;;sc.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of South Carolina", "aff_unique_dep": "", "aff_unique_url": "https://www.sc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "xzeGP-PtPMI", "title": "Sequential Communication in Multi-Agent Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Coordination is one of the essential problems in multi-agent reinforcement learning. Communication provides an alternative for agents to obtain information about others so that better-coordinated behavior can be learned. Some existing work communicates predicted future trajectories with others, hoping to get clues about what others would do for better coordination.
However, circular dependencies can inevitably occur when agents are treated equally so that it is impossible to coordinate decision-making. In this paper, we propose a novel communication scheme, Sequential Communication (SeqComm). In more detail, we treat agents unequally (the upper-level agents make decisions prior to the lower-level ones) and have two communication phases. In the negotiation phase, agents share observations with others and obtain their intentions by modeling the environment dynamics. Agents determine the priority of decision-making by comparing the values of their intentions. In the launching phase, the upper-level agents take the lead in making decisions and share their actions with the lower-level agents. Empirically, we show that SeqComm improves the performance in a variety of multi-agent cooperative scenarios, compared to existing methods.", "keywords": "multi-agent communication;multi-agent reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ziluo Ding;Weixin Hong;Liwen Zhu;Tiejun Huang;Zongqing Lu", "authorids": "~Ziluo_Ding1;~Weixin_Hong1;~Liwen_Zhu1;~Tiejun_Huang1;~Zongqing_Lu2", "gender": "M;M;F;M;", "homepage": ";;http://www.liwenzhu-pku.cn/;https://idm.pku.edu.cn/~tjhuang/;", "dblp": "267/2359;;;h/TiejunHuang;", "google_scholar": ";;;https://scholar.google.com.tw/citations?user=knvEK4AAAAAJ;", "orcid": ";;;0000-0002-4234-6099;", "linkedin": "ziluo/;Hong-weixin;;;", "or_profile": "~Ziluo_Ding1;~Weixin_Hong1;~Liwen_Zhu1;~Tiejun_Huang1;~Zongqing_Lu2", "aff": "Peking University;Peking University, Tsinghua University;Peking University;Institute of Computing Technology, Chinese Academy of Sciences;", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;ict.ac.cn;", "position": "PhD student;Undergrad student;MS student;Postdoc;", "bibtex": "@misc{\nding2022sequential,\ntitle={Sequential Communication in Multi-Agent Reinforcement Learning },\nauthor={Ziluo Ding and Weixin Hong and Liwen Zhu and Tiejun Huang and Zongqing Lu},\nyear={2022},\nurl={https://openreview.net/forum?id=xzeGP-PtPMI}\n}", "github": "", "project": "", "reviewers": "MJQv;Scgd;ELmn;qUm9", "site": "https://openreview.net/forum?id=xzeGP-PtPMI", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;3;4", "correctness": "2;2;2;2", "technical_novelty": "1;3;3;2", "empirical_novelty": "2;0;2;0", "wc_summary_paper": "66;128;115;15", "wc_summary_review": "44;157;74;44", "wc_main_review": "205;1332;357;518", "wc_review": "315;1617;546;577", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 1.0, 1.0 ], "wc_summary_paper_avg": [ 81.0, 44.5701693961331 ], "wc_summary_review_avg": [ 79.75, 46.25135133160976 ], "wc_main_review_avg": [ 603.0, 435.1970817916866 ], "wc_review_avg": [ 763.75, 502.91717757499595 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16875432262542350488&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Peking University;Chinese Academy of Sciences",
"aff_unique_dep": ";Institute of Computing Technology", "aff_unique_url": "http://www.pku.edu.cn;http://www.ict.ac.cn", "aff_unique_abbr": "Peking U;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "y-yL78_sZcr", "title": "Gradient Imbalance and solution in Online Continual learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Most existing techniques for online continual learning are based on experience-replay. In this approach, a memory buffer is used to save some data from past tasks for dealing with catastrophic forgetting. In training, a small batch of data from the data stream of the current task and some sampled data from a memory buffer are used jointly to update or train the current model. In this paper, we study the experience replay-based approach from a new angle, gradient imbalance. We first investigate and analyze this phenomenon experimentally from two perspectives: imbalance of samples introduced by experience replay and sequence of classes introduced by incremental learning. To our knowledge, this problem has not been studied before and it significantly limits the performance of online continual learning. Based on observations from experiments and theoretical analysis, a new learning strategy and a new loss are proposed to deal with the problem. Empirical evaluation shows that GAD helps improve the online CL performance by more than 11% in accuracy.", "keywords": "Online continual learning;Lifelong learning;Gradient imbalance", "primary_area": "", "supplementary_material": "/attachment/4e1f251286c9dd051482c2a2204c500b7b78538d.zip", "author": "Yiduo Guo;Dongyan Zhao;Bing Liu", "authorids": "~Yiduo_Guo2;~Dongyan_Zhao2;~Bing_Liu1", "gender": "M;M;M", "homepage": "https://www.wict.pku.edu.cn/zhaodongyan/en/;https://www.cs.uic.edu/~liub/;https://github.com/gydpku", "dblp": "63/1870;l/BingLiu1.html;196/5954.html", "google_scholar": "lhR8-68AAAAJ;Kt1bjZoAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;", "linkedin": ";;", "or_profile": "~Dongyan_Zhao2;~Bing_Liu1;~Yiduo_GUO1", "aff": "Peking University;University of Illinois at Chicago;Peking University", "aff_domain": "pku.edu.cn;uic.edu;pku.edu.cn", "position": "Full Professor;Full Professor;PhD student", "bibtex": "@misc{\nguo2022gradient,\ntitle={Gradient Imbalance and solution in Online Continual learning},\nauthor={Yiduo Guo and Dongyan Zhao and Bing Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=y-yL78_sZcr}\n}", "github": "", "project": "", "reviewers": "pUgk;yS6i;Qfi7;pmdp", "site": "https://openreview.net/forum?id=y-yL78_sZcr", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "3;4;4;4", "correctness": "3;1;2;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;0", "wc_summary_paper": "124;92;173;40", "wc_summary_review": "43;97;38;61", "wc_main_review": "189;485;344;522", "wc_review": "356;674;555;623", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 1.0897247358851685 ], "wc_summary_paper_avg": [ 107.25, 48.37031631072925 ], "wc_summary_review_avg": [ 59.75, 23.14492384951828 ], "wc_main_review_avg": [ 385.0, 131.21166106714753 ], "wc_review_avg": [ 552.0, 
120.77872329181163 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DhF9nYoZWMgJ:scholar.google.com/&scioq=Gradient+Imbalance+and+solution+in+Online+Continual+learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Peking University;University of Illinois at Chicago", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.uic.edu", "aff_unique_abbr": "Peking U;UIC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "On the Learning and Learnability of Quasimetrics", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7182", "id": "y0VvIg25yk", "poster": "", "openreview": "https://openreview.net/forum?id=y0VvIg25yk", "slides": "https://iclr.cc/virtual/2022/poster/7182", "video": "https://iclr.cc/virtual/2022/poster/7182", "author_site": "Tongzhou Wang, Phillip Isola", "tldr": "", "abstract": "Our world is full of asymmetries. Gravity and wind can make reaching a place easier than coming back. Social artifacts such as genealogy charts and citation graphs are inherently directed. In reinforcement learning and control, optimal goal-reaching strategies are rarely reversible (symmetrical). Distance functions supported on these asymmetrical structures are called quasimetrics. Despite their common appearance, little research has been done on the learning of quasimetrics. Our theoretical analysis reveals that a common class of learning algorithms, including unconstrained multilayer perceptrons (MLPs), provably fails to learn a quasimetric consistent with training data. In contrast, our proposed Poisson Quasimetric Embedding (PQE) is the first quasimetric learning formulation that both is learnable with gradient-based optimization and enjoys strong performance guarantees. 
Experiments on random graphs, social graphs, and offline Q-learning demonstrate its effectiveness over many common baselines.", "keywords": "embedding learning;quasimetric learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Tongzhou Wang;Phillip Isola", "authorids": "~Tongzhou_Wang1;~Phillip_Isola1", "gender": "M;M", "homepage": "https://www.tongzhouwang.info/;http://web.mit.edu/phillipi/", "dblp": "201/8645;36/9988", "google_scholar": "14HASnUAAAAJ;ROILf3EAAAAJ", "orcid": ";0000-0002-1411-6704", "linkedin": ";phillip-isola-a9955b20/", "or_profile": "~Tongzhou_Wang1;~Phillip_Isola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2022on,\ntitle={On the Learning and Learnability of Quasimetrics},\nauthor={Tongzhou Wang and Phillip Isola},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=y0VvIg25yk}\n}", "github": "", "project": "", "reviewers": "Lnh9;RT4G;WT3X;kCMM", "pdf_size": 0, "recommendation": "5;6;8;8", "confidence": "3;3;2;3", "correctness": "3;3;4;3", "technical_novelty": "2;3;3;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "44;118;66;83", "wc_summary_review": "19;40;28;46", "wc_main_review": "272;68;182;206", "wc_review": "335;226;276;335", "wc_reply_reviewers": "0;0;0;23", "wc_reply_authors": "353;114;379;377", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.75, 1.299038105676658 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.75, 27.040478915877213 ], "wc_summary_review_avg": [ 33.25, 10.473180032826706 ], "wc_main_review_avg": [ 182.0, 73.60706487831179 ], "wc_review_avg": [ 293.0, 45.56862956025779 ], "wc_reply_reviewers_avg": [ 5.75, 9.959292143521045 ], "wc_reply_authors_avg": [ 305.75, 111.1786287916882 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.5555555555555555, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12948966148228061148&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=y0VvIg25yk", "email": "mit.edu;mit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Certified Robustness for Deep Equilibrium Models via Interval Bound Propagation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6631", "id": "y1PXylgrXZ", "poster": "", "openreview": "https://openreview.net/forum?id=y1PXylgrXZ", "slides": "https://iclr.cc/virtual/2022/poster/6631", "video": "https://iclr.cc/virtual/2022/poster/6631", "author_site": "Colin Wei, Zico Kolter", "tldr": "", "abstract": "Deep equilibrium layers (DEQs) have demonstrated promising performance and are competitive with standard explicit models on many benchmarks. 
However, little is known about certifying robustness for these models. Inspired by interval bound propagation (IBP), we propose the IBP-MonDEQ layer, a DEQ layer whose robustness can be verified by computing upper and lower interval bounds on the output. Our key insights are that these interval bounds can be obtained as the fixed-point solution to an IBP-inspired equilibrium equation, and furthermore, that this solution always exists and is unique when the layer obeys a certain parameterization. This fixed point can be interpreted as the result of applying IBP to an infinitely deep, weight-tied neural network, which may be of independent interest, as IBP bounds are typically unstable for deeper networks. Our empirical comparison reveals that models with IBP-MonDEQ layers can achieve comparable $\\ell_{\\infty}$ certified robustness to similarly-sized fully explicit networks.", "keywords": "deep equilibrium models;certified robustness;interval bound propagation", "primary_area": "", "supplementary_material": "/attachment/d5776f528fcd160048e57495a056e0c451cb8a41.zip", "author": "Colin Wei;J Zico Kolter", "authorids": "~Colin_Wei1;~J_Zico_Kolter1", "gender": "M;M", "homepage": "https://sites.google.com/view/colinwei;http://www.zicokolter.com", "dblp": "185/7902;67/2526", "google_scholar": ";UXh1I6UAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Colin_Wei1;~Zico_Kolter1", "aff": "Computer Science Department, Stanford University;Carnegie Mellon University", "aff_domain": "cs.stanford.edu;cmu.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nwei2022certified,\ntitle={Certified Robustness for Deep Equilibrium Models via Interval Bound Propagation},\nauthor={Colin Wei and J Zico Kolter},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=y1PXylgrXZ}\n}", "github": "", "project": "", "reviewers": "7ZJs;KViU;wCmR;jEVJ", "pdf_size": 0, "recommendation": "3;5;6;8", "confidence": "5;3;3;3", "correctness": "2;4;3;3", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "110;28;59;109", "wc_summary_review": "66;178;58;64", "wc_main_review": "460;148;344;431", "wc_review": "636;354;461;604", "wc_reply_reviewers": "590;0;108;0", "wc_reply_authors": "1564;172;424;672", "reply_reviewers": "2;0;1;0", "reply_authors": "3;1;1;1", "recommendation_avg": [ 5.5, 1.8027756377319946 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 76.5, 34.77427209878016 ], "wc_summary_review_avg": [ 91.5, 50.02749244165652 ], "wc_main_review_avg": [ 345.75, 121.89006317169583 ], "wc_review_avg": [ 513.75, 113.34984561083442 ], "wc_reply_reviewers_avg": [ 174.5, 243.9072569646094 ], "wc_reply_authors_avg": [ 708.0, 524.8771284786565 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.8006407690254357, "corr_recommendation_correctness": 0.39223227027636803, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5246265785150819730&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=y1PXylgrXZ", "email": "cs.stanford.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;Carnegie Mellon University", 
"aff_unique_dep": "Computer Science Department;", "aff_unique_url": "https://www.stanford.edu;https://www.cmu.edu", "aff_unique_abbr": "Stanford;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "y1faDxZ_-0a", "title": "SSFL: Tackling Label Deficiency in Federated Learning via Personalized Self-Supervision", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Learning (FL) is transforming the ML training ecosystem from a centralized over-the-cloud setting to distributed training over edge devices in order to strengthen data privacy, reduce data migration costs, and break regulatory restrictions. An essential, but rarely studied, challenge in FL is label deficiency at the edge. This problem is even more pronounced in FL, compared to centralized training, due to the fact that FL users are often reluctant to label their private data and edge devices do not provide an ideal interface to assist with annotation. Addressing label deficiency is also further complicated in FL, due to the heterogeneous nature of the data at edge devices and the need for developing personalized models for each user. We propose a self-supervised and personalized federated learning framework, named SSFL, and a series of algorithms under this framework which work towards addressing these challenges. First, under the SSFL framework, we analyze the compatibility of various centralized self-supervised learning methods in FL setting and demonstrate that SimSiam networks performs the best with the standard FedAvg algorithm. Moreover, to address the data heterogeneity at the edge devices in this framework, we have innovated a series of algorithms that broaden existing supervised personalization algorithms into the setting of self-supervised learning including perFedAvg, Ditto, and local fine-tuning, among others. We further propose a novel personalized federated self-supervised learning algorithm, Per-SSFL, which balances personalization and consensus by carefully regulating the distance between the local and global representations of data. To provide a comprehensive comparative analysis of all proposed algorithms, we also develop a distributed training system and related evaluation protocol for SSFL. Using this training system, we conduct experiments on a synthetic non-I.I.D. dataset based on CIFAR-10, and an intrinsically non-I.I.D. dataset GLD-23K. Our findings show that the gap of evaluation accuracy between supervised learning and unsupervised learning in FL is both small and reasonable. The performance comparison indicates that representation regularization-based personalization method is able to outperform other variants. 
Ablation studies on SSFL are also conducted to understand the role of batch size, non-I.I.D.ness, and the evaluation protocol.", "keywords": "federated learning", "primary_area": "", "supplementary_material": "/attachment/f7b10a6e657e9d04f5404dd5cddd16f60df21f5c.zip", "author": "Chaoyang He;Zhengyu Yang;Erum Mushtaq;Sunwoo Lee;Mahdi Soltanolkotabi;Salman Avestimehr", "authorids": "~Chaoyang_He1;~Zhengyu_Yang2;~Erum_Mushtaq1;~Sunwoo_Lee1;~Mahdi_Soltanolkotabi1;~Salman_Avestimehr1", "gender": "M;;;M;M;", "homepage": "http://chaoyanghe.com;https://zhengyuyang.com;https://scholar.google.com/citations?user=C5IpcRYAAAAJ&hl=en;https://sites.google.com/view/sunwoolee;http://www-bcf.usc.edu/~soltanol/;", "dblp": "222/6721-1.html;159/1188-3;;56/7811-1;75/6691;", "google_scholar": "2z2camUAAAAJ;;;WA9KNNcAAAAJ;narJyMAAAAAJ;", "orcid": ";;;0000-0001-6334-3068;;", "linkedin": ";;;sunwoo-lee-90a7308a;;", "or_profile": "~Chaoyang_He1;~Zhengyu_Yang2;~Erum_Mushtaq1;~Sunwoo_Lee1;~Mahdi_Soltanolkotabi1;~Salman_Avestimehr1", "aff": "University of Southern California;Meta;University of Southern California;University of Southern California;University of Southern California;", "aff_domain": "usc.edu;meta.com;usc.edu;usc.edu;usc.edu;", "position": "PhD student;Researcher;PhD student;Postdoc;Associate Professor;", "bibtex": "@misc{\nhe2022ssfl,\ntitle={{SSFL}: Tackling Label Deficiency in Federated Learning via Personalized Self-Supervision},\nauthor={Chaoyang He and Zhengyu Yang and Erum Mushtaq and Sunwoo Lee and Mahdi Soltanolkotabi and Salman Avestimehr},\nyear={2022},\nurl={https://openreview.net/forum?id=y1faDxZ_-0a}\n}", "github": "", "project": "", "reviewers": "bsjD;zbEP;7Cc7;TsoV", "site": "https://openreview.net/forum?id=y1faDxZ_-0a", "pdf_size": 0, "recommendation": "1;3;3;5", "confidence": "5;3;4;3", "correctness": "1;3;3;3", "technical_novelty": "1;3;2;2", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "68;52;89;48", "wc_summary_review": "7;20;61;30", "wc_main_review": "7;85;160;370", "wc_review": "82;157;310;448", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "55;365;433;452", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 64.25, 16.13032857693854 ], "wc_summary_review_avg": [ 29.5, 19.93113142799475 ], "wc_main_review_avg": [ 155.5, 135.14159241329074 ], "wc_review_avg": [ 249.25, 141.13358034146233 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 326.25, 159.91149896114413 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.8528028654224418, "corr_recommendation_correctness": 0.816496580927726, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12753390816701392128&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Southern California;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.usc.edu;https://meta.com", "aff_unique_abbr": "USC;Meta", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "y3niPR1CJf6", "title": "Wasserstein 
Weisfeiler-Lehman Subtree Distance for Graph-Structured Data", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Defining a valid graph distance is a challenging task in graph machine learning because we need to consider the theoretical validity of the distance, its computational complexity, and its effectiveness as a distance between graphs. Addressing the shortcomings of the popular Weisfeiler-Lehman (WL) test for the graph isomorphism problem, this paper proposes a novel distance between graph structures. More specifically, we first analyze the WL algorithm from a geometric point of view and argue that discriminating nodes based only on the consistency of categorical labels does not fully capture important structural information. Therefore, instead of using such categorical labels, we define a node distance between WL subtrees with tree edit distance and propose an efficient calculation algorithm. We then apply the proposed node distance to define a graph Wasserstein distance on the tree edit embedding space, exploiting the Optimal Transport framework. To summarize, these two distances are proposed at the node level and the graph level, respectively. Numerical experiments on graph classification tasks show that the proposed graph Wasserstein distance performs as well as or better than conventional methods.", "keywords": "graph distance;graph machine learning;optimal transport", "primary_area": "", "supplementary_material": "/attachment/0a4df53c527b291e287cc8e009c35b73ead3a47f.zip", "author": "Zhongxi Fang;Jianming Huang;Hiroyuki Kasai", "authorids": "~Zhongxi_Fang1;koukenmei@toki.waseda.jp;hiroyuki.kasai@waseda.jp", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": "https://scholar.google.com/citations?hl=ja;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhongxi_Fang1;koukenmei@toki.waseda.jp;hiroyuki.kasai@waseda.jp", "aff": "Waseda University;;", "aff_domain": "waseda.jp;;", "position": "MS student;;", "bibtex": "@misc{\nfang2022wasserstein,\ntitle={Wasserstein Weisfeiler-Lehman Subtree Distance for Graph-Structured Data},\nauthor={Zhongxi Fang and Jianming Huang and Hiroyuki Kasai},\nyear={2022},\nurl={https://openreview.net/forum?id=y3niPR1CJf6}\n}", "github": "", "project": "", "reviewers": "Ueyb;nCtr;Wfdg;v1dg", "site": "https://openreview.net/forum?id=y3niPR1CJf6", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;4;4;4", "correctness": "2;3;2;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;1;2;1", "wc_summary_paper": "34;78;35;79", "wc_summary_review": "45;107;64;90", "wc_main_review": "233;952;943;344", "wc_review": "312;1137;1042;513", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 0.5 ], "wc_summary_paper_avg": [ 56.5, 22.005681084665387 ], "wc_summary_review_avg": [ 76.5, 23.77498685593748 ], "wc_main_review_avg": [ 618.0, 331.8440898976506 ], "wc_review_avg": [ 751.0, 347.5061150541095 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:3wRKVL9EHjsJ:scholar.google.com/&scioq=Wasserstein+Weisfeiler-Lehman+Subtree+Distance+for+Graph-Structured+Data&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Waseda University", "aff_unique_dep": "", "aff_unique_url": "https://www.waseda.jp/top", "aff_unique_abbr": "Waseda", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "id": "y7tKDxxTo8T", "title": "Zero-Shot Recommender Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Performance of recommender systems (RecSys) relies heavily on the amount of training data available. This poses a chicken-and-egg problem for early-stage products, whose amount of data, in turn, relies on the performance of their RecSys. In this paper, we explore the possibility of zero-shot learning in RecSys, to enable generalization from an old dataset to an entirely new dataset. We develop an algorithm, dubbed ZEro-Shot Recommenders (ZESRec), that is trained on an old dataset and generalize to a new one where there are neither overlapping users nor overlapping items, a setting that contrasts typical cross-domain RecSys that has either overlapping users or items. Different from previous methods that use categorical item indices (i.e., item ID), ZESRec uses items' generic features, such as natural-language descriptions, product images, and videos, as their continuous indices, and therefore naturally generalizes to any unseen items. In terms of users, ZESRec builds upon recent advances on sequential RecSys to represent users using their interactions with items, thereby generalizing to unseen users as well. We study three pairs of real-world RecSys datasets and demonstrate that ZESRec can successfully enable recommendations in such a zero-shot setting, opening up new opportunities for resolving the chicken-and-egg problem for data-scarce startups or early-stage products.", "keywords": "Zero Shot Learning;Recommender Systems;Neural Networks;Bayesian", "primary_area": "", "supplementary_material": "", "author": "HAO DING;Yifei Ma;Anoop Deoras;Bernie Wang;Hao Wang", "authorids": "~HAO_DING3;~Yifei_Ma1;~Anoop_Deoras1;~Bernie_Wang1;~Hao_Wang3", "gender": "M;;M;M;M", "homepage": "https://nightlyjourney.github.io/;http://yma.io;;http://web.mit.edu/~ywang02/www/;http://www.wanghao.in", "dblp": ";;55/8761;43/8355-1;w/HaoWang-14", "google_scholar": "82CV8akAAAAJ;ZVMcyxYAAAAJ;QF_rhCIAAAAJ;IKUm624AAAAJ;NrOA9QoAAAAJ", "orcid": ";;;0000-0002-0291-7184;", "linkedin": "hao-ding-949a6a10a/;yifei-ma-48503620;anoopdeoras/;;", "or_profile": "~HAO_DING3;~Yifei_Ma1;~Anoop_Deoras1;~Bernie_Wang1;~Hao_Wang4", "aff": "Amazon;Amazon;Amazon;Amazon;Rutgers University", "aff_domain": "amazon.com;amazon.com;amazon.com;amazon.com;cs.rutgers.edu", "position": "Applied Scientist;Applied Scientist;Principal Researcher;Principal Researcher;Assistant Professor", "bibtex": "@misc{\nding2022zeroshot,\ntitle={Zero-Shot Recommender Systems},\nauthor={HAO DING and Yifei Ma and Anoop Deoras and Bernie Wang and Hao Wang},\nyear={2022},\nurl={https://openreview.net/forum?id=y7tKDxxTo8T}\n}", "github": "", "project": "", "reviewers": "hJB8;zQRZ;cG8S", "site": "https://openreview.net/forum?id=y7tKDxxTo8T", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;4", "correctness": "3;4;4", "technical_novelty": "1;2;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "44;58;133", "wc_summary_review": "36;103;55", "wc_main_review": "1047;72;373", "wc_review": "1127;233;561", 
"wc_reply_reviewers": "0;0;0", "wc_reply_authors": "865;307;958", "reply_reviewers": "0;0;0", "reply_authors": "3;2;3", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 78.33333333333333, 39.075425639254256 ], "wc_summary_review_avg": [ 64.66666666666667, 28.193773938387334 ], "wc_main_review_avg": [ 497.3333333333333, 407.635730633232 ], "wc_review_avg": [ 640.3333333333334, 369.2599205016555 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 710.0, 287.4821733603668 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9449111825230683, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8490738736333315164&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Amazon;Rutgers University", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.rutgers.edu", "aff_unique_abbr": "Amazon;Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "y8zhHLm7FsP", "title": "Ensemble Kalman Filter (EnKF) for Reinforcement Learning (RL)", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper is concerned with representing and learning the optimal control law for the linear quadratic Gaussian (LQG) optimal control problem. In recent years, there is a growing interest in re-visiting this classical problem, in part due to the successes of reinforcement learning (RL). The main question of this body of research (and also of our paper) is to approximate the optimal control law without explicitly solving the Riccati equation. For this purpose, a novel simulation-based algorithm, namely an ensemble Kalman filter (EnKF), is introduced in this paper. The algorithm is used to obtain formulae for optimal control, expressed entirely in terms of the EnKF particles. For the general partially observed LQG problem, the proposed EnKF is combined with a standard EnKF (for the estimation problem) to obtain the optimal control input based on the use of the separation principle. The theoretical results and algorithms are illustrated with numerical experiments. 
", "keywords": "decision and control;control theory;reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/5d534c28b669f8e50e39a74bbbb8810b54992528.zip", "author": "Anant A Joshi;Amirhossein Taghvaei;Prashant G Mehta", "authorids": "~Anant_A_Joshi1;~Amirhossein_Taghvaei1;~Prashant_G_Mehta1", "gender": "M;M;M", "homepage": "https://anantjoshi97.github.io/website/;https://amirtag.github.io/;http://mehta.mechse.illinois.edu/", "dblp": ";158/4926;", "google_scholar": ";l96zhjwAAAAJ;wv9XU7gAAAAJ", "orcid": ";;", "linkedin": ";;mehtapg/", "or_profile": "~Anant_A_Joshi1;~Amirhossein_Taghvaei1;~Prashant_G_Mehta1", "aff": "University of Illinois, Urbana Champaign;University of Washington, Seattle;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;uw.edu;illinois.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\njoshi2022ensemble,\ntitle={Ensemble Kalman Filter (En{KF}) for Reinforcement Learning ({RL})},\nauthor={Anant A Joshi and Amirhossein Taghvaei and Prashant G Mehta},\nyear={2022},\nurl={https://openreview.net/forum?id=y8zhHLm7FsP}\n}", "github": "", "project": "", "reviewers": "faGe;HWCL;7yfG", "site": "https://openreview.net/forum?id=y8zhHLm7FsP", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "5;4;5", "correctness": "4;3;3", "technical_novelty": "1;2;2", "empirical_novelty": "1;1;0", "wc_summary_paper": "67;35;38", "wc_summary_review": "21;47;42", "wc_main_review": "279;611;475", "wc_review": "367;693;555", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "591;639;809", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 0.6666666666666666, 0.4714045207910317 ], "wc_summary_paper_avg": [ 46.666666666666664, 14.429907214608907 ], "wc_summary_review_avg": [ 36.666666666666664, 11.2644968324772 ], "wc_main_review_avg": [ 455.0, 136.2742333189465 ], "wc_review_avg": [ 538.3333333333334, 133.6097135524044 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 679.6666666666666, 93.5283676515075 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:azesfbnfNoAJ:scholar.google.com/&scioq=Ensemble+Kalman+Filter+(EnKF)+for+Reinforcement+Learning+(RL)&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.washington.edu", "aff_unique_abbr": "UIUC;UW", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Urbana-Champaign;Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "yBYVUDj7yF", "title": "The Power of Contrast for Feature Learning: A Theoretical Analysis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Contrastive learning has achieved state-of-the-art performance in various self-supervised learning tasks and even outperforms its supervised counterpart. 
Despite its empirical success, the theoretical understanding of why contrastive learning works is still limited. In this paper, (i) we provably show that contrastive learning outperforms autoencoder, a classical unsupervised learning method, on both feature recovery and downstream tasks; (ii) we also illustrate the role of labeled data in supervised contrastive learning. This provides theoretical support for recent findings that contrastive learning with labels improves the performance of learned representations in the in-domain downstream task, but it can harm the performance in transfer learning. We verify our theory with numerical experiments.", "keywords": "contrastive learning;self-supervised learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Wenlong Ji;Zhun Deng;Ryumei Nakada;James Zou;Linjun Zhang", "authorids": "~Wenlong_Ji1;~Zhun_Deng1;~Ryumei_Nakada1;~James_Zou1;~Linjun_Zhang1", "gender": "M;M;;;M", "homepage": "https://wenlong2000.github.io/;https://www.zhundeng.org/;https://statistics.rutgers.edu/people-pages/faculty/people/135-graduate-students/581-ryumei-nakada;;", "dblp": ";204/4353;;;", "google_scholar": ";nkmi-moAAAAJ;;23ZXZvEAAAAJ;TUAzs3sAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Wenlong_Ji1;~Zhun_Deng1;~Ryumei_Nakada1;~James_Zou1;~Linjun_Zhang1", "aff": "Peking University;Harvard University;Rutgers University;Stanford University;Rutgers University", "aff_domain": "pku.edu.cn;harvard.edu;rutgers.edu;stanford.edu;rutgers.edu", "position": "Undergrad student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\nji2022the,\ntitle={The Power of Contrast for Feature Learning: A Theoretical Analysis},\nauthor={Wenlong Ji and Zhun Deng and Ryumei Nakada and James Zou and Linjun Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=yBYVUDj7yF}\n}", "github": "", "project": "", "reviewers": "R3zT;wN4s;TTCS", "site": "https://openreview.net/forum?id=yBYVUDj7yF", "pdf_size": 0, "recommendation": "5;6;6", "confidence": "4;4;3", "correctness": "3;4;2", "technical_novelty": "3;4;3", "empirical_novelty": "3;2;2", "wc_summary_paper": "45;119;147", "wc_summary_review": "3;30;37", "wc_main_review": "291;531;260", "wc_review": "339;680;444", "wc_reply_reviewers": "0;0;256", "wc_reply_authors": "681;1146;1142", "reply_reviewers": "0;0;1", "reply_authors": "2;2;2", "recommendation_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 103.66666666666667, 43.02970550161313 ], "wc_summary_review_avg": [ 23.333333333333332, 14.65908895153068 ], "wc_main_review_avg": [ 360.6666666666667, 121.1069316302287 ], "wc_review_avg": [ 487.6666666666667, 142.5957767802243 ], "wc_reply_reviewers_avg": [ 85.33333333333333, 120.67955732250411 ], "wc_reply_authors_avg": [ 989.6666666666666, 218.26640195463486 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7282881274216336372&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;2", 
"aff_unique_norm": "Peking University;Harvard University;Rutgers University;Stanford University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.harvard.edu;https://www.rutgers.edu;https://www.stanford.edu", "aff_unique_abbr": "Peking U;Harvard;Rutgers;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "yCS5dckx_vj", "title": "Towards Demystifying Representation Learning with Non-contrastive Self-supervision", "track": "main", "status": "Reject", "tldr": "", "abstract": "Non-contrastive methods of self-supervised learning (such as BYOL and SimSiam) learn representations by minimizing the distance between two views of the same image. These approaches have achieved remarkable performance in practice, but it is not well understood 1) why these methods do not collapse to the trivial solutions and 2) how the representation is learned. Tian et al made an initial attempt on the first question and proposed DirectPred that sets the predictor directly. In our work, we analyze a generalized version of DirectPred, called DirectSet($\\alpha$). We show that in a simple linear network, DirectSet($\\alpha$) provably learns a desirable projection matrix and also reduces the sample complexity on downstream tasks. Our analysis suggests that weight decay acts as an implicit threshold that discard the features with high variance under augmentation, and keep the features with low variance. Inspired by our theory, we simplify DirectPred by removing the expensive eigen-decomposition step. On CIFAR-10, CIFAR-100, STL-10 and ImageNet, DirectCopy, our simpler and more computationally efficient algorithm rivals or even outperforms DirectPred.", "keywords": "self-supervised learning;representation learning;non-contrastive methods;DirectPred;theoretical analysis", "primary_area": "", "supplementary_material": "", "author": "Xiang Wang;Xinlei Chen;Simon Shaolei Du;Yuandong Tian", "authorids": "~Xiang_Wang1;~Xinlei_Chen1;~Simon_Shaolei_Du1;~Yuandong_Tian1", "gender": "M;M;M;M", "homepage": "https://users.cs.duke.edu/~xwang/;http://xinleic.xyz;http://simonshaoleidu.com;http://yuandong-tian.com", "dblp": ";;176/5602;t/YuandongTian", "google_scholar": "dHjYcrgAAAAJ;bSU7LYoAAAAJ;OttawxUAAAAJ;0mgEF28AAAAJ", "orcid": ";;;0000-0003-4202-4847", "linkedin": ";;;yuandongtian", "or_profile": "~Xiang_Wang1;~Xinlei_Chen1;~Simon_Shaolei_Du1;~Yuandong_Tian1", "aff": "Duke University;Meta;Meta Facebook;Meta AI (FAIR)", "aff_domain": "duke.edu;meta.com;fb.com;meta.com", "position": "PhD student;Researcher;Visiting Professor;Research Scientist", "bibtex": "@misc{\nwang2022towards,\ntitle={Towards Demystifying Representation Learning with Non-contrastive Self-supervision},\nauthor={Xiang Wang and Xinlei Chen and Simon Shaolei Du and Yuandong Tian},\nyear={2022},\nurl={https://openreview.net/forum?id=yCS5dckx_vj}\n}", "github": "", "project": "", "reviewers": "G6yG;5xNt;racS;6aGh", "site": "https://openreview.net/forum?id=yCS5dckx_vj", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;3;3", "correctness": "4;4;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "88;93;122;91", "wc_summary_review": "54;41;67;37", "wc_main_review": "411;195;357;206", "wc_review": "553;329;546;334", "wc_reply_reviewers": "0;0;0;13", "wc_reply_authors": "462;145;124;156", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.0, 
1.224744871391589 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 98.5, 13.683932183404009 ], "wc_summary_review_avg": [ 49.75, 11.776565713313877 ], "wc_main_review_avg": [ 292.25, 93.79598872020061 ], "wc_review_avg": [ 440.5, 109.0424229371303 ], "wc_reply_reviewers_avg": [ 3.25, 5.629165124598851 ], "wc_reply_authors_avg": [ 221.75, 139.1840059058511 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15999601502455557447&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Duke University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.duke.edu;https://meta.com", "aff_unique_abbr": "Duke;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "yGNzJk_tYr4", "title": "Causally Estimating the Sensitivity of Neural NLP Models to Spurious Features", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent work finds modern natural language processing (NLP) models relying on spurious features for prediction. Mitigating such effects is thus important. Despite this need, there is no quantitative measure to evaluate or compare the effects of different forms of spurious features in NLP. We address this gap in the literature by quantifying model sensitivity to spurious features with a causal estimand, dubbed CENT, which draws on the concept of {\\it average treatment effect} from the causality literature. By conducting simulations with four prominent NLP models --- TextRNN, BERT, RoBERTa and XLNet --- we rank the models against their sensitivity to artificial injections of eight spurious features. We further hypothesize and validate that models that are more sensitive to a spurious feature will be less robust against perturbations with this feature during inference. Conversely, data augmentation with this feature improves robustness to similar perturbations. We find statistically significant inverse correlations between sensitivity and robustness, providing empirical support for our hypothesis. 
Our findings contribute to the interpretation of models and their robustness.", "keywords": "spurious feature;causality;robustness;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Yunxiang Zhang;Liangming Pan;Samson Tan;Min-Yen Kan", "authorids": "~Yunxiang_Zhang2;~Liangming_Pan1;~Samson_Tan1;~Min-Yen_Kan1", "gender": "M;M;;M", "homepage": "https://yunx-z.github.io/;https://liangmingpan.bio;https://samsontmr.github.io;https://www.comp.nus.edu.sg/~kanmy/", "dblp": "160/6176-2.html;186/9707;241/8934.html;k/MinYenKan", "google_scholar": "pbvWlJwAAAAJ;JcjjOTUAAAAJ;;https://scholar.google.com.tw/citations?user=aNVcd3EAAAAJ", "orcid": ";;;", "linkedin": "%E4%BA%91%E7%BF%94-%E5%BC%A0-a97859196/;;;", "or_profile": "~Yunxiang_Zhang2;~Liangming_Pan1;~Samson_Tan1;~Min-Yen_Kan1", "aff": "Peking University;National University of Singapore;Salesforce Research Asia;National University of Singapore", "aff_domain": "pku.edu.cn;u.nus.edu;salesforce.com;nus.edu.sg", "position": "Undergrad student;PhD student;PhD student;Associate Professor", "bibtex": "@misc{\nzhang2022causally,\ntitle={Causally Estimating the Sensitivity of Neural {NLP} Models to Spurious Features},\nauthor={Yunxiang Zhang and Liangming Pan and Samson Tan and Min-Yen Kan},\nyear={2022},\nurl={https://openreview.net/forum?id=yGNzJk_tYr4}\n}", "github": "", "project": "", "reviewers": "Yt74;9s2Q;J7A1;pYye", "site": "https://openreview.net/forum?id=yGNzJk_tYr4", "pdf_size": 0, "recommendation": "1;3;3;6", "confidence": "4;4;4;5", "correctness": "3;2;3;4", "technical_novelty": "1;2;2;2", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "194;120;117;180", "wc_summary_review": "56;35;34;42", "wc_main_review": "655;398;309;666", "wc_review": "905;553;460;888", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.25, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 152.75, 34.62206666275137 ], "wc_summary_review_avg": [ 41.75, 8.78564169540279 ], "wc_main_review_avg": [ 507.0, 156.7402309555527 ], "wc_review_avg": [ 701.5, 197.84400420533345 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.8892972917998875, "corr_recommendation_correctness": 0.5940885257860046, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1471759215219768668&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Peking University;National University of Singapore;Salesforce Research", "aff_unique_dep": ";;Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.nus.edu.sg;https://research.salesforce.com", "aff_unique_abbr": "Peking U;NUS;Salesforce Research Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;Singapore" }, { "id": "yJF-89OH94U", "title": "DICE: A Simple Sparsification Method for Out-of-distribution Detection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Detecting out-of-distribution (OOD) inputs is a central challenge for safely deploying machine learning models in the real 
world. Previous methods commonly rely on an OOD score derived from the overparameterized weight space, while largely overlooking the role of sparsification. In this paper, we reveal the important insight that reliance on unimportant weights and units can directly contribute to the brittleness of OOD detection. To mitigate this issue, we propose a sparsification-based OOD detection framework termed DICE. Our key idea is to rank weights based on a measure of contribution, and selectively use the most salient weights to derive the output for OOD detection. We provide both empirical and theoretical insights, characterizing and explaining the mechanism by which DICE improves OOD detection. By pruning away noisy signals, DICE provably reduces the output variance for OOD data, resulting in a sharper output distribution and stronger separability from ID data. DICE establishes state-of-the-art performance, reducing the FPR95 by up to 24.69% compared to the previous best method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiyou Sun;Sharon Li", "authorids": "~Yiyou_Sun1;~Sharon_Li1", "gender": "M;F", "homepage": "https://sunyiyou.github.io/;http://pages.cs.wisc.edu/~sharonli/", "dblp": "211/5630;144/6087-1", "google_scholar": "IKqlQo4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";liyixuan", "or_profile": "~Yiyou_Sun1;~Yixuan_Li1", "aff": "University of Wisconsin, Madison;Cornell University", "aff_domain": "wisc.edu;cornell.edu", "position": "PhD student;Graduate Student", "bibtex": "@misc{\nsun2022dice,\ntitle={{DICE}: A Simple Sparsification Method for Out-of-distribution Detection},\nauthor={Yiyou Sun and Sharon Li},\nyear={2022},\nurl={https://openreview.net/forum?id=yJF-89OH94U}\n}", "github": "", "project": "", "reviewers": "wqYf;X9EW;tj5N", "site": "https://openreview.net/forum?id=yJF-89OH94U", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;3;4", "correctness": "3;4;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "76;33;80", "wc_summary_review": "41;18;31", "wc_main_review": "484;329;129", "wc_review": "601;380;240", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 63.0, 21.275964529643932 ], "wc_summary_review_avg": [ 30.0, 9.41629792788369 ], "wc_main_review_avg": [ 314.0, 145.3157481715821 ], "wc_review_avg": [ 407.0, 148.60910694391063 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.18898223650461363, "corr_recommendation_correctness": 0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:P9SNHMVS65IJ:scholar.google.com/&scioq=DICE:+A+Simple+Sparsification+Method+for+Out-of-distribution+Detection&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.cornell.edu", "aff_unique_abbr": "UW;Cornell", 
"aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Imbedding Deep Neural Networks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6530", "id": "yKIAXjkJc2F", "poster": "", "openreview": "https://openreview.net/forum?id=yKIAXjkJc2F", "slides": "https://iclr.cc/virtual/2022/poster/6530", "video": "https://iclr.cc/virtual/2022/poster/6530", "author_site": "Andrew Corbett, Dmitry Kangin", "tldr": "", "abstract": "Continuous-depth neural networks, such as Neural ODEs, have refashioned the understanding of residual neural networks in terms of non-linear vector-valued optimal control problems. The common solution is to use the adjoint sensitivity method to replicate a forward-backward pass optimisation problem. We propose a new approach which explicates the network's `depth' as a fundamental variable, thus reducing the problem to a system of forward-facing initial value problems. This new method is based on the principal of `Invariant Imbedding' for which we prove a general solution, applicable to all non-linear, vector-valued optimal control problems with both running and terminal loss.\nOur new architectures provide a tangible tool for inspecting the theoretical--and to a great extent unexplained--properties of network depth. They also constitute a resource of discrete implementations of Neural ODEs comparable to classes of imbedded residual neural networks. Through a series of experiments, we show the competitive performance of the proposed architectures for supervised learning and time series prediction. ", "keywords": "Neural ODEs;Optimal Control;Deep Neural Networks;Invariant Imbedding", "primary_area": "", "supplementary_material": "/attachment/b7662cb006cd7b6ccc9cc25ba55205d37a43d8f2.zip", "author": "Andrew Corbett;Dmitry Kangin", "authorids": "~Andrew_Corbett1;~Dmitry_Kangin1", "gender": "M;M", "homepage": "https://www.digilab.co.uk;", "dblp": ";134/0570", "google_scholar": ";https://scholar.google.co.uk/citations?user=vv-leaMAAAAJ", "orcid": ";", "linkedin": ";dmitry-kangin-34bab097/", "or_profile": "~Andrew_Corbett1;~Dmitry_Kangin1", "aff": "University of Exeter;Etcembly Ltd (UK)", "aff_domain": "exeter.ac.uk;etcembly.io", "position": "Research Fellow;Researcher", "bibtex": "@inproceedings{\ncorbett2022imbedding,\ntitle={Imbedding Deep Neural Networks},\nauthor={Andrew Corbett and Dmitry Kangin},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=yKIAXjkJc2F}\n}", "github": "", "project": "", "reviewers": "nSYs;4Mn5;Ynkf;W69g", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;5;3;3", "correctness": "3;4;4;3", "technical_novelty": "3;4;3;3", "empirical_novelty": "3;0;3;3", "wc_summary_paper": "76;42;61;73", "wc_summary_review": "23;63;76;78", "wc_main_review": "209;302;579;225", "wc_review": "308;407;716;376", "wc_reply_reviewers": "42;21;59;14", "wc_reply_authors": "681;1045;800;321", "reply_reviewers": "1;1;1;1", "reply_authors": "1;2;1;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 3.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 63.0, 13.360389215887388 ], "wc_summary_review_avg": [ 60.0, 22.124646889837585 ], "wc_main_review_avg": [ 328.75, 148.6983103468227 ], "wc_review_avg": [ 451.75, 
156.71052134429263 ], "wc_reply_reviewers_avg": [ 34.0, 17.7341478509682 ], "wc_reply_authors_avg": [ 711.75, 260.9955698857741 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.17407765595569782, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10680544455244654489&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=yKIAXjkJc2F", "email": "exeter.ac.uk;etcembly.io", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Exeter;Etcembly Ltd", "aff_unique_dep": ";", "aff_unique_url": "https://www.exeter.ac.uk;", "aff_unique_abbr": "Exeter;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "yK_jcv_aLX", "title": "Action-Sufficient State Representation Learning for Control with Structural Constraints", "track": "main", "status": "Reject", "tldr": "", "abstract": "Perceived signals in real-world scenarios are usually high-dimensional and noisy, and finding and using their representation that contains essential and sufficient information required by downstream decision-making tasks will help improve computational efficiency and generalization ability in the tasks. In this paper, we focus on partially observable environments and propose to learn a minimal set of state representations that capture sufficient information for decision-making, termed \\textit{Action-Sufficient state Representations} (ASRs). We build a generative environment model for the structural relationships among variables in the system and present a principled way to characterize ASRs based on structural constraints and the goal of maximizing cumulative reward in policy learning. We then develop a structured sequential Variational Auto-Encoder to estimate the environment model and extract ASRs. Our empirical results on CarRacing and VizDoom demonstrate a clear advantage of learning and using ASRs for policy learning. 
Moreover, the estimated environment model and ASRs allow learning behaviors from imagined outcomes in the compact latent space to improve sample efficiency.", "keywords": "Representation learning in RL;Minimal sufficient state representations;Graphical model;World model", "primary_area": "", "supplementary_material": "/attachment/21e48a04480bee9de9fa1cc4d4eb3aa83eead046.zip", "author": "Biwei Huang;Chaochao Lu;Liu Leqi;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;Clark Glymour;Bernhard Sch\u00f6lkopf;Kun Zhang", "authorids": "~Biwei_Huang1;~Chaochao_Lu1;~Liu_Leqi1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1;~Clark_Glymour1;~Bernhard_Sch\u00f6lkopf1;~Kun_Zhang1", "gender": "F;;F;;male;;M", "homepage": ";https://causallu.com/;https://leqiliu.github.io/;;;;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "165/3288;142/2790;174/0364;;;;96/3115-1", "google_scholar": ";C_Qxt0IAAAAJ;zmbW4iUAAAAJ;;;;RGoypN4AAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Biwei_Huang1;~Chaochao_Lu1;~Liu_Leqi1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1;~Clark_Glymour1;~Bernhard_Sch\u00f6lkopf1;~Kun_Zhang1", "aff": "Carnegie Mellon University;University of Cambridge;Carnegie Mellon University;;Carnegie Mellon University;;Carnegie Mellon University", "aff_domain": "cmu.edu;cam.ac.uk;cmu.edu;;andrew.cmu.edu;;cmu.edu", "position": "PhD student;PhD student;PhD student;;Full Professor;;Associate Professor", "bibtex": "@misc{\nhuang2022actionsufficient,\ntitle={Action-Sufficient State Representation Learning for Control with Structural Constraints},\nauthor={Biwei Huang and Chaochao Lu and Liu Leqi and Jos{\\'e} Miguel Hern{\\'a}ndez-Lobato and Clark Glymour and Bernhard Sch{\\\"o}lkopf and Kun Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=yK_jcv_aLX}\n}", "github": "", "project": "", "reviewers": "2HkD;vpGY;hBsj;83Y3", "site": "https://openreview.net/forum?id=yK_jcv_aLX", "pdf_size": 0, "recommendation": "5;5;5;8", "confidence": "3;2;5;4", "correctness": "3;3;3;4", "technical_novelty": "3;3;2;4", "empirical_novelty": "3;2;2;4", "wc_summary_paper": "78;90;57;112", "wc_summary_review": "26;78;91;71", "wc_main_review": "539;285;483;515", "wc_review": "643;453;631;698", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.75, 1.299038105676658 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 84.25, 19.904459299363044 ], "wc_summary_review_avg": [ 66.5, 24.45914961727002 ], "wc_main_review_avg": [ 455.5, 100.42285596416784 ], "wc_review_avg": [ 606.25, 92.01460481901772 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.2581988897471611, "corr_recommendation_correctness": 1.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7828312335122233004&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Carnegie Mellon University;University of Cambridge", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.cam.ac.uk", "aff_unique_abbr": "CMU;Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;0;0;0", 
"aff_country_unique": "United States;United Kingdom" }, { "id": "yOBqNg-CqB0", "title": "Re-evaluating Word Mover's Distance", "track": "main", "status": "Reject", "tldr": "", "abstract": "The word mover's distance (WMD) is a fundamental technique for measuring the similarity of two documents. As the crux of WMD, it can take advantage of the underlying geometry of the word space by employing an optimal transport formulation. The original study on WMD reported that WMD outperforms classical baselines such as bag-of-words (BOW) and TF-IDF by significant margins in various datasets. In this paper, we point out that the evaluation in the original study could be misleading. We re-evaluate the performances of WMD and the classical baselines and find that the classical baselines are competitive with WMD if we employ an appropriate preprocessing, i.e., L1 normalization. In addition, We introduce an analogy between WMD and L1-normalized BOW and find that not only the performance of WMD but also the distance values resemble those of BOW in high dimensional spaces.", "keywords": "optimal transport;word mover's distance", "primary_area": "", "supplementary_material": "/attachment/e4affe68f11370529c5fd8a5d9d99b837a3ce6e5.zip", "author": "Ryoma Sato;Makoto Yamada;Hisashi Kashima", "authorids": "~Ryoma_Sato1;~Makoto_Yamada3;~Hisashi_Kashima2", "gender": "M;M;M", "homepage": "https://joisino.net/en/;https://groups.oist.jp/mlds;https://hkashima.github.io/index_e.html", "dblp": "227/2014;56/4937;27/4448", "google_scholar": "https://scholar.google.co.jp/citations?user=S4kMic4AAAAJ;1cKNu1gAAAAJ;bkTB0t8AAAAJ", "orcid": ";;0000-0002-2770-0184", "linkedin": ";;", "or_profile": "~Ryoma_Sato1;~Makoto_Yamada3;~Hisashi_Kashima2", "aff": "Kyoto University;Kyoto University;Kyoto University", "aff_domain": "kyoto-u.ac.jp;kyoto-u.ac.jp;kyoto-u.ac.jp", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nsato2022reevaluating,\ntitle={Re-evaluating Word Mover's Distance},\nauthor={Ryoma Sato and Makoto Yamada and Hisashi Kashima},\nyear={2022},\nurl={https://openreview.net/forum?id=yOBqNg-CqB0}\n}", "github": "", "project": "", "reviewers": "sFeb;id2C;5Jna;M3dN", "site": "https://openreview.net/forum?id=yOBqNg-CqB0", "pdf_size": 0, "recommendation": "3;3;8;8", "confidence": "4;4;4;4", "correctness": "4;2;3;3", "technical_novelty": "1;1;4;3", "empirical_novelty": "3;1;3;3", "wc_summary_paper": "90;32;46;50", "wc_summary_review": "41;100;32;23", "wc_main_review": "186;528;275;40", "wc_review": "317;660;353;113", "wc_reply_reviewers": "0;405;16;0", "wc_reply_authors": "328;762;701;26", "reply_reviewers": "0;2;1;0", "reply_authors": "1;2;1;1", "recommendation_avg": [ 5.5, 2.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.25, 1.299038105676658 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 54.5, 21.558061137310098 ], "wc_summary_review_avg": [ 49.0, 30.124740662784138 ], "wc_main_review_avg": [ 257.25, 177.4082509355188 ], "wc_review_avg": [ 360.75, 195.51518483227844 ], "wc_reply_reviewers_avg": [ 105.25, 173.1839701011615 ], "wc_reply_authors_avg": [ 454.25, 297.8811633856696 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 34, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=3123355047455627011&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Kyoto University", "aff_unique_dep": "", "aff_unique_url": "https://www.kyoto-u.ac.jp", "aff_unique_abbr": "Kyoto U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "id": "yQ7Nm-56FWU", "title": "Adversarial Training with Rectified Rejection", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Adversarial training (AT) is one of the most effective strategies for promoting model robustness, whereas even the state-of-the-art adversarially trained models struggle to exceed 65% robust test accuracy on CIFAR-10 without additional data, which is far from practical. A natural way to improve beyond this accuracy bottleneck is to introduce a rejection option, where confidence is a commonly used certainty proxy. However, the vanilla confidence can overestimate the model certainty if the input is wrongly classified. To this end, we propose to use true confidence (T-Con) (i.e., predicted probability of the true class) as a certainty oracle, and learn to predict T-Con by rectifying confidence. Intriguingly, we prove that under mild conditions, a rectified confidence (R-Con) rejector and a confidence rejector can be coupled to distinguish any wrongly classified input from correctly classified ones. We also quantify that training R-Con to be aligned with T-Con could be an easier task than learning robust classifiers. In our experiments, we evaluate our rectified rejection (RR) module on CIFAR-10, CIFAR-10-C, and CIFAR-100 under several attacks, and demonstrate that the RR module is well compatible with different AT frameworks on improving robustness, with little extra computation.", "keywords": "Adversarial Training;Rectified Rejection;Coupling Strategy", "primary_area": "", "supplementary_material": "/attachment/5d711bc75de5ffc54fbc0f59963c27a068aaa986.zip", "author": "Tianyu Pang;Huishuai Zhang;Di He;Yinpeng Dong;Hang Su;Wei Chen;Jun Zhu;Tie-Yan Liu", "authorids": "~Tianyu_Pang1;~Huishuai_Zhang3;~Di_He1;~Yinpeng_Dong2;~Hang_Su3;~Wei_Chen1;~Jun_Zhu2;~Tie-Yan_Liu1", "gender": "M;M;M;F;M;M;M;M", "homepage": "https://p2333.github.io/;https://dihe-pku.github.io/;https://dongyp13.github.io;https://weichen-cas.github.io/;http://ml.cs.tsinghua.edu.cn/~jun;http://member.acm.org/~tieyanliu;;https://huishuai-git.github.io", "dblp": "202/2550;74/184;183/0980;;50/2644-1;l/TieYanLiu;26/5371-6;144/7537", "google_scholar": "wYDbtFsAAAAJ;https://scholar.google.co.jp/citations?user=orVoz4IAAAAJ;6_4ad84AAAAJ;https://scholar.google.com/citations?hl=en;axsP38wAAAAJ;Nh832fgAAAAJ;dxN1_X0AAAAJ;w1srHyIAAAAJ", "orcid": "0000-0003-0639-6176;;;;;0000-0002-0476-8020;;", "linkedin": "%E5%A4%A9%E5%AE%87-%E5%BA%9E-b3999017a/;;;;;;;", "or_profile": "~Tianyu_Pang1;~Di_He1;~Yinpeng_Dong2;~Wei_Chen1;~Jun_Zhu2;~Tie-Yan_Liu1;~Hang_Su2;~Huishuai_Zhang2", "aff": "Tsinghua University;Microsoft;Tsinghua University; Chinese Academy of Sciences;Tsinghua University;Microsoft;Tsinghua University;Microsoft Research Asia", "aff_domain": "tsinghua.edu.cn;microsoft.com;tsinghua.edu.cn;ict.ac.cn;mail.tsinghua.edu.cn;microsoft.com;tsinghua.edu.cn;microsoft.com", "position": "PhD student;Senior Researcher;PhD student;Full Professor;Professor;Distinguished Scientist;Associate Professor;Researcher", "bibtex": "@misc{\npang2022adversarial,\ntitle={Adversarial Training with Rectified 
Rejection},\nauthor={Tianyu Pang and Huishuai Zhang and Di He and Yinpeng Dong and Hang Su and Wei Chen and Jun Zhu and Tie-Yan Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=yQ7Nm-56FWU}\n}", "github": "", "project": "", "reviewers": "rLi6;VEEQ;JEia;DzYd", "site": "https://openreview.net/forum?id=yQ7Nm-56FWU", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;4;4;2", "correctness": "2;2;3;3", "technical_novelty": "3;2;3;3", "empirical_novelty": "3;0;3;2", "wc_summary_paper": "109;39;163;58", "wc_summary_review": "72;50;46;10", "wc_main_review": "733;614;522;46", "wc_review": "914;703;731;114", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 92.25, 48.204641892664235 ], "wc_summary_review_avg": [ 44.5, 22.242976419535225 ], "wc_main_review_avg": [ 478.75, 260.80584253424996 ], "wc_review_avg": [ 615.5, 300.6663433109865 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.8388704928078611, "corr_recommendation_correctness": 0.9622504486493761, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6096349414799064407&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2;0;1;0;1", "aff_unique_norm": "Tsinghua University;Microsoft;Chinese Academy of Sciences", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com;https://www.cas.cn", "aff_unique_abbr": "THU;Microsoft;CAS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "yRYtnKAZqxU", "title": "Interrogating Paradigms in Self-supervised Graph Representation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph contrastive learning (GCL) is a newly popular paradigm for self-supervised graph representation learning and offers an alternative to reconstruction-based methods. However, it is not well understood what conditions a task must satisfy such that a given paradigm is better suited. In this paper, we investigate the role of dataset properties and augmentation strategies in the success of GCL and reconstruction-based approaches. Using the recent population augmentation graph-based analysis of self-supervised learning, we show theoretically that the success of GCL with popular augmentations is bounded by the graph edit distance between different classes. Next, we introduce a synthetic data generation process that systematically controls the amount of style vs. content in each sample, i.e., information that is irrelevant vs. relevant to the downstream task, to elucidate how graph representation learning methods perform under different dataset conditions. We empirically show that reconstruction approaches perform better when the style vs. content ratio is low and GCL with popular augmentations benefits from moderate style. 
Our results provide a general, systematic framework for analyzing different graph representation learning methods and demonstrate when a given approach is expected to perform well.", "keywords": "Graph Neural Networks;Contrastive Learning;Self-supervised Learning", "primary_area": "", "supplementary_material": "/attachment/0552a7ab806abac2a339fc407f84f6a7a64e2ace.zip", "author": "Puja Trivedi;Mark Heimann;Danai Koutra;Jayaraman J. Thiagarajan", "authorids": "~Puja_Trivedi1;~Mark_Heimann1;~Danai_Koutra1;~Jayaraman_J._Thiagarajan3", "gender": "F;M;F;M", "homepage": "https://pujacomputes.github.io/;https://markheimann.github.io;http://web.eecs.umich.edu/~dkoutra/;https://jjthiagarajan.com", "dblp": "274/2080;215/4357;91/9987;16/7803", "google_scholar": "1y9cR50AAAAJ;EXeTcRUAAAAJ;https://scholar.google.com.tw/citations?user=bDrA1-8AAAAJ;cMz65_oAAAAJ", "orcid": "0000-0003-1874-8992;;0000-0002-3206-8179;", "linkedin": ";;;", "or_profile": "~Puja_Trivedi1;~Mark_Heimann1;~Danai_Koutra1;~Jayaraman_J._Thiagarajan2", "aff": "University of Michigan;Lawrence Livermore National Laboratory;Amazon;Lawrence Livermore National Labs", "aff_domain": "umich.edu;llnl.gov;amazon.com;llnl.gov", "position": "PhD student;Postdoc;Scholar;Computer Scientist", "bibtex": "@misc{\ntrivedi2022interrogating,\ntitle={Interrogating Paradigms in Self-supervised Graph Representation Learning},\nauthor={Puja Trivedi and Mark Heimann and Danai Koutra and Jayaraman J. Thiagarajan},\nyear={2022},\nurl={https://openreview.net/forum?id=yRYtnKAZqxU}\n}", "github": "", "project": "", "reviewers": "MoyX;UyEA;97Kb;UMWA", "site": "https://openreview.net/forum?id=yRYtnKAZqxU", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;4", "correctness": "2;1;2;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "23;21;62;113", "wc_summary_review": "22;16;19;24", "wc_main_review": "174;380;326;293", "wc_review": "219;417;407;430", "wc_reply_reviewers": "52;75;20;35", "wc_reply_authors": "650;558;546;1072", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.25, 1.0897247358851685 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 54.75, 37.392345473371954 ], "wc_summary_review_avg": [ 20.25, 3.031088913245535 ], "wc_main_review_avg": [ 293.25, 75.52938170010397 ], "wc_review_avg": [ 368.25, 86.554534832093 ], "wc_reply_reviewers_avg": [ 45.5, 20.45116133621756 ], "wc_reply_authors_avg": [ 706.5, 214.82260123180708 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=226650568256835368&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Michigan;Lawrence Livermore National Laboratory;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.umich.edu;https://www.llnl.gov;https://www.amazon.com", "aff_unique_abbr": "UM;LLNL;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "QDrop: Randomly Dropping Quantization for Extremely Low-bit Post-Training Quantization", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2022/poster/5893", "id": "ySQH0oDyp7", "poster": "", "openreview": "https://openreview.net/forum?id=ySQH0oDyp7", "slides": "https://iclr.cc/virtual/2022/poster/5893", "video": "https://iclr.cc/virtual/2022/poster/5893", "author_site": "Xiuying Wei, Ruihao Gong, Yuhang Li, Xianglong Liu, fengwei yu", "tldr": "", "abstract": "Recently, post-training quantization (PTQ) has driven much attention to produce efficient neural networks without long-time retraining. Despite the low cost, current PTQ works always fail under the extremely low-bit setting. In this study, we pioneeringly confirm that properly incorporating activation quantization into the PTQ reconstruction benefits the final accuracy. To deeply understand the inherent reason, a theoretical framework is established, which inspires us that the flatness of the optimized low-bit model on calibration and test data is crucial. Based on the conclusion, a simple yet effective approach dubbed as \\textsc{QDrop} is proposed, which randomly drops the quantization of activations during reconstruction. Extensive experiments on various tasks including computer vision (image classification, object detection) and natural language processing (text classification and question answering) prove its superiority. With \\textsc{QDrop}, the limit of PTQ is pushed to the 2-bit activation for the first time and the accuracy boost can be up to 51.49\\%. Without bells and whistles, \\textsc{QDrop} establishes a new state of the art for PTQ.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiuying Wei;Ruihao Gong;Yuhang Li;Xianglong Liu;Fengwei Yu", "authorids": "~Xiuying_Wei1;~Ruihao_Gong1;~Yuhang_Li1;~Xianglong_Liu3;~Fengwei_Yu1", "gender": "F;M;M;;M", "homepage": "https://wimh966.github.io/;https://xhplus.github.io;;;https://forwil.xyz", "dblp": "315/9021;247/1172;;;188/5764", "google_scholar": ";8i7Z15kAAAAJ;3UzXL-AAAAAJ;;qzWfLRIAAAAJ", "orcid": ";0000-0002-6024-7086;;;", "linkedin": "%E7%A7%80%E9%A2%96-%E9%AD%8F-6b1277221/;;;;", "or_profile": "~Xiuying_Wei1;~Ruihao_Gong1;~Yuhang_Li1;~Xianglong_Liu3;~Fengwei_Yu1", "aff": "Beihang University;SenseTime;Yale University;;", "aff_domain": "buaa.edu.cn;sensetime.com;yale.edu;;", "position": "MS student;Principal Researcher;PhD student;;", "bibtex": "@inproceedings{\nwei2022qdrop,\ntitle={{QD}rop: Randomly Dropping Quantization for Extremely Low-bit Post-Training Quantization},\nauthor={Xiuying Wei and Ruihao Gong and Yuhang Li and Xianglong Liu and Fengwei Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ySQH0oDyp7}\n}", "github": "", "project": "", "reviewers": "gKCS;Y2p4;5Zj1;iK8W", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "3;4;5;4", "correctness": "3;2;4;4", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;3;4;3", "wc_summary_paper": "57;50;153;108", "wc_summary_review": "17;8;73;68", "wc_main_review": "160;288;644;271", "wc_review": "234;346;870;447", "wc_reply_reviewers": "0;0;110;0", "wc_reply_authors": "744;1227;2204;572", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;6;1", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 92.0, 41.731283229730664 ], "wc_summary_review_avg": [ 41.5, 29.227555491350966 ], "wc_main_review_avg": 
[ 340.75, 181.85072862103138 ], "wc_review_avg": [ 474.25, 240.5871723513122 ], "wc_reply_reviewers_avg": [ 27.5, 47.63139720814412 ], "wc_reply_authors_avg": [ 1186.75, 634.5003447595597 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.920286436967152 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11176064726665113074&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ySQH0oDyp7", "email": "buaa.edu.cn;sensetime.com;yale.edu;;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Beihang University;SenseTime;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.sensetime.com;https://www.yale.edu", "aff_unique_abbr": "BUAA;SenseTime;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "id": "yV4_fWe4nM", "title": "Deep Fair Discriminative Clustering", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep clustering has the potential to learn a strong representation and hence achieve better clustering performance than traditional clustering methods such as $k$-means and spectral clustering. However, this strong representation learning ability may make the clustering unfair by discovering surrogates for protected information, as our experiments empirically show. This work studies a general notion of group-level fairness for both binary and multi-state protected status variables (PSVs). We begin by formulating the group-level fairness problem as an integer linear program whose totally unimodular constraint matrix means it can be efficiently solved via linear programming. We then show how to inject this solver into a discriminative deep clustering backbone and hence propose a refinement learning algorithm to combine the clustering goal with the fairness objective to learn fair clusters adaptively. Experimental results on real-world datasets demonstrate that our model consistently outperforms state-of-the-art fair clustering algorithms. 
Furthermore, our framework shows promising results for novel fair clustering tasks including flexible fairness constraints, multi-state PSVs, and predictive clustering.", "keywords": "Clustering;Deep learning;Fairness", "primary_area": "", "supplementary_material": "/attachment/3467b45ff023a6522108f8cc9f6d818eb26d0b1f.zip", "author": "Hongjing Zhang;Ian Davidson", "authorids": "~Hongjing_Zhang1;~Ian_Davidson1", "gender": ";M", "homepage": "https://hongjingz.github.io/;http://www.cs.ucdavis.edu/~davidson/", "dblp": "54/7712;81/2360", "google_scholar": "W73FYBsAAAAJ;https://scholar.google.com.tw/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Hongjing_Zhang1;~Ian_Davidson1", "aff": "University of California, Davis;University of California, Davis", "aff_domain": "ucdavis.edu;ucdavis.edu", "position": "PhD student;Full Professor", "bibtex": "@misc{\nzhang2022deep,\ntitle={Deep Fair Discriminative Clustering},\nauthor={Hongjing Zhang and Ian Davidson},\nyear={2022},\nurl={https://openreview.net/forum?id=yV4_fWe4nM}\n}", "github": "", "project": "", "reviewers": "py3W;tDB1;Um5H;5Bwe", "site": "https://openreview.net/forum?id=yV4_fWe4nM", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "54;63;32;137", "wc_summary_review": "13;3;52;50", "wc_main_review": "190;239;241;922", "wc_review": "257;305;325;1109", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "666;452;489;1475", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 71.5, 39.46200704475128 ], "wc_summary_review_avg": [ 29.5, 21.80022935659164 ], "wc_main_review_avg": [ 398.0, 303.22021700407777 ], "wc_review_avg": [ 499.0, 353.04957159016635 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 770.5, 414.706221318176 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.9271726499455306, "corr_recommendation_correctness": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=901442158305014398&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Davis", "aff_unique_dep": "", "aff_unique_url": "https://www.ucdavis.edu", "aff_unique_abbr": "UC Davis", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "yWpo7kKaDM", "title": "Multimodal Dialogue State Tracking", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Designed for tracking user goals in dialogues, a dialogue state tracker is an essential component in a dialogue system. However, the research of dialogue state tracking has largely been limited to unimodality, in which slots and slot values are limited by knowledge domains (e.g. restaurant domain with slots of restaurant name and price range) and are defined by specific database schema. In this paper, we propose to extend the definition of dialogue state tracking to multimodality. 
Specifically, we introduce a novel dialogue state tracking task to track the information of visual objects that are mentioned in video-grounded dialogues. Each new dialogue utterance may introduce a new video segment, new visual objects, or new object attributes and a state tracker is required to update these information slots accordingly. Secondly, to facilitate research of this task, we developed DVD-DST, a synthetic video-grounded dialogue benchmark with annotations of multimodal dialogue states. Thirdly, we designed a novel baseline, Video-Dialogue Transformer Network (VDTN), for this task. VDTN combines both object-level features and segment-level features and learns contextual dependencies between videos and dialogues to generate multimodal dialogue states. We optimized VDTN for a state generation task as well as a self-supervised video understanding task which recovers video segment or object representations. Finally, we trained VDTN to use the decoded states in a response prediction task. Together with comprehensive ablation and qualitative analysis, we discovered interesting insights towards building more capable multimodal dialogue systems. ", "keywords": "dialogue state tracking;multimodal;video-grounded dialogue;video-dialogue transformer network;synthetic benchmark", "primary_area": "", "supplementary_material": "", "author": "Hung Le;Nancy F. Chen;Steven Hoi", "authorids": "~Hung_Le2;~Nancy_F._Chen1;~Steven_Hoi2", "gender": ";M;M", "homepage": "http://alum.mit.edu/www/nancychen;http://stevenhoi.com;https://sites.google.com/view/henryle2018/home", "dblp": "84/8761;;", "google_scholar": "https://scholar.google.com.sg/citations?user=K3Z9UiAAAAAJ;JoLjflYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-0872-5877;;", "linkedin": "nancy-chen-4644865/?originalSubdomain=sg;;hungle2012", "or_profile": "~Nancy_F._Chen1;~Steven_Hoi2;~Henry_Le1", "aff": "I2R, A*STAR;Singapore Management University;A*STAR", "aff_domain": "i2r.a-star.edu.sg;smu.edu.sg;a-star.edu.sg", "position": "Principal Researcher;Associate Professor;Research Scholar", "bibtex": "@misc{\nle2022multimodal,\ntitle={Multimodal Dialogue State Tracking},\nauthor={Hung Le and Nancy F. 
Chen and Steven Hoi},\nyear={2022},\nurl={https://openreview.net/forum?id=yWpo7kKaDM}\n}", "github": "", "project": "", "reviewers": "Sqbw;SmCp;NrrC;BP8R", "site": "https://openreview.net/forum?id=yWpo7kKaDM", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;3", "correctness": "4;3;2;3", "technical_novelty": "2;3;4;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "51;94;86;49", "wc_summary_review": "58;194;102;58", "wc_main_review": "224;248;446;202", "wc_review": "333;536;634;309", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "736;1121;1484;1152", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;3;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 70.0, 20.211382931407737 ], "wc_summary_review_avg": [ 103.0, 55.52476924760696 ], "wc_main_review_avg": [ 280.0, 97.21111047611791 ], "wc_review_avg": [ 453.0, 136.73514544549255 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1123.25, 265.01639100251896 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": -0.6488856845230502, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7060798931582540618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "A*STAR;Singapore Management University;Agency for Science, Technology and Research", "aff_unique_dep": "Institute for Infocomm Research;;", "aff_unique_url": "https://www.a-star.edu.sg;https://www.smu.edu.sg;https://www.a-star.edu.sg", "aff_unique_abbr": "A*STAR;SMU;A*STAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "id": "yXBb-0cPSKO", "title": "Regularized-OFU: an efficient algorithm for general contextual bandit with optimization oracles", "track": "main", "status": "Reject", "tldr": "", "abstract": "In contextual bandit, one major challenge is to develop theoretically solid and empirically efficient algorithms for general function classes. We present a novel algorithm called \\emph{regularized optimism in face of uncertainty (ROFU)} for general contextual bandit problems. It exploits an optimization oracle to calculate the well-founded upper confidence bound (UCB). Theoretically, for general function classes under very mild assumptions, it achieves a near-optimal regret bound $\\Tilde{O}(\\sqrt{T})$. Practically, one great advantage of ROFU is that the optimization oracle can be efficiently implemented with low computational cost. Thus, we can easily extend ROFU for contextual bandits with deep neural networks as the function class, which outperforms strong baselines including the UCB and Thompson sampling variants. 
", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/bfbac3ea7974db8738f6b631baaa05f635b3e46d.zip", "author": "Yichi Zhou;Shihong Song;Huishuai Zhang;Jun Zhu;Wei Chen;Tie-Yan Liu", "authorids": "~Yichi_Zhou2;~Shihong_Song1;~Huishuai_Zhang3;~Jun_Zhu2;~Wei_Chen1;~Tie-Yan_Liu1", "gender": ";F;M;F;M;M", "homepage": "https://www.microsoft.com/en-us/research/people/yiczho/;;http://ml.cs.tsinghua.edu.cn/~jun;https://weichen-cas.github.io/;http://member.acm.org/~tieyanliu;https://huishuai-git.github.io", "dblp": "203/4453;213/8111;50/2644-1;;l/TieYanLiu;144/7537", "google_scholar": ";;axsP38wAAAAJ;https://scholar.google.com/citations?hl=en;Nh832fgAAAAJ;w1srHyIAAAAJ", "orcid": ";;;;0000-0002-0476-8020;", "linkedin": ";;;;;", "or_profile": "~Yichi_Zhou2;~Shihong_Song1;~Jun_Zhu2;~Wei_Chen1;~Tie-Yan_Liu1;~Huishuai_Zhang2", "aff": "Microsoft;Tsinghua University;Tsinghua University; Chinese Academy of Sciences;Microsoft;Microsoft Research Asia", "aff_domain": "microsoft.com;tsinghua.edu.cn;mail.tsinghua.edu.cn;ict.ac.cn;microsoft.com;microsoft.com", "position": "Microsoft research;MS student;Professor;Full Professor;Distinguished Scientist;Researcher", "bibtex": "@misc{\nzhou2022regularizedofu,\ntitle={Regularized-{OFU}: an efficient algorithm for general contextual bandit with optimization oracles},\nauthor={Yichi Zhou and Shihong Song and Huishuai Zhang and Jun Zhu and Wei Chen and Tie-Yan Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=yXBb-0cPSKO}\n}", "github": "", "project": "", "reviewers": "aMkn;xPt8;sNy5;e8Nb", "site": "https://openreview.net/forum?id=yXBb-0cPSKO", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "2;3;3;3", "technical_novelty": "2;1;2;3", "empirical_novelty": "3;2;3;2", "wc_summary_paper": "48;113;47;66", "wc_summary_review": "31;46;38;9", "wc_main_review": "255;654;279;249", "wc_review": "334;813;364;324", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 68.5, 26.781523481684157 ], "wc_summary_review_avg": [ 31.0, 13.765899897936205 ], "wc_main_review_avg": [ 359.25, 170.543799359578 ], "wc_review_avg": [ 458.75, 205.0553278995696 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OHYVq5x7_0MJ:scholar.google.com/&scioq=Regularized-OFU:+an+efficient+algorithm+for+general+contextual+bandit+with+optimization+oracles&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;0;0", "aff_unique_norm": "Microsoft;Tsinghua University;Chinese Academy of Sciences", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.tsinghua.edu.cn;https://www.cas.cn", "aff_unique_abbr": "Microsoft;THU;CAS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;1;0;1", "aff_country_unique": "United States;China" }, { "title": "Variational autoencoders in the presence of low-dimensional data: landscape and implicit 
bias", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6885", "id": "y_op4lLLaWL", "poster": "", "openreview": "https://openreview.net/forum?id=y_op4lLLaWL", "slides": "https://iclr.cc/virtual/2022/poster/6885", "video": "https://iclr.cc/virtual/2022/poster/6885", "author_site": "Frederic Koehler, Viraj Mehta, Chenghui Zhou, Andrej Risteski", "tldr": "", "abstract": "Variational Autoencoders (VAEs) are one of the most commonly used generative models, particularly for image data. A prominent difficulty in training VAEs is data that is supported on a lower dimensional manifold. Recent work by Dai and Wipf (2020) proposes a two-stage training algorithm for VAEs, based on a conjecture that in standard VAE training the generator will converge to a solution with 0 variance which is correctly supported on the ground truth manifold. They gave partial support for this conjecture by showing that some optima of the VAE loss do satisfy this property, but did not analyze the training dynamics. In this paper, we show that for linear encoders/decoders, the conjecture is true\u2014that is the VAE training does recover a generator with support equal to the ground truth manifold\u2014and does so due to an implicit bias of gradient descent rather than merely the VAE loss itself. In the nonlinear case, we show that VAE training frequently learns a higher-dimensional manifold which is a superset of the ground truth manifold.", "keywords": "variational autoencoders;encoder;optima;stability;low-dimensional manifold", "primary_area": "", "supplementary_material": "", "author": "Frederic Koehler;Viraj Mehta;Chenghui Zhou;Andrej Risteski", "authorids": "~Frederic_Koehler1;~Viraj_Mehta1;~Chenghui_Zhou1;~Andrej_Risteski2", "gender": ";M;F;M", "homepage": "https://frkoehle.github.io/;http://virajm.com;;", "dblp": "132/1904;https://dblp.org/pers/m/Mehta:Viraj.html;https://dblp.uni-trier.de/pers/hd/z/Zhou:Chenghui;63/11143", "google_scholar": ";4pHjHBkAAAAJ;bl_OT0MAAAAJ;", "orcid": ";0000-0002-2021-9718;;", "linkedin": ";virajrmehta/;chenghui-zhou-35616289;", "or_profile": "~Frederic_Koehler1;~Viraj_Mehta1;~Chenghui_Zhou1;~Andrej_Risteski2", "aff": "University of California, Berkeley;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "berkeley.edu;cmu.edu;andrew.cmu.edu;cmu.edu", "position": "Postdoc;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkoehler2022variational,\ntitle={Variational autoencoders in the presence of low-dimensional data: landscape and implicit bias},\nauthor={Frederic Koehler and Viraj Mehta and Chenghui Zhou and Andrej Risteski},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=y_op4lLLaWL}\n}", "github": "", "project": "", "reviewers": "KKdc;ayh8;MVZZ;U2NF", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;4;3", "empirical_novelty": "3;2;2;3", "wc_summary_paper": "56;95;104;66", "wc_summary_review": "64;222;16;64", "wc_main_review": "89;525;253;539", "wc_review": "209;842;373;669", "wc_reply_reviewers": "129;645;0;0", "wc_reply_authors": "300;1749;160;543", "reply_reviewers": "1;2;0;0", "reply_authors": "1;4;1;2", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.82915619758885 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 80.25, 
19.828956099603428 ], "wc_summary_review_avg": [ 91.5, 77.85081887815953 ], "wc_main_review_avg": [ 351.5, 189.6490179252189 ], "wc_review_avg": [ 523.25, 247.06919577316796 ], "wc_reply_reviewers_avg": [ 193.5, 265.9403128523391 ], "wc_reply_authors_avg": [ 688.0, 627.708929361372 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12373917463845389421&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=y_op4lLLaWL", "email": "berkeley.edu;cmu.edu;andrew.cmu.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu", "aff_unique_abbr": "UC Berkeley;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "y_tIL5vki1l", "title": "LatentKeypointGAN: Controlling GANs via Latent Keypoints", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative adversarial networks (GANs) have attained photo-realistic quality in image generation. However, how to best control the image content remains an open challenge. We introduce LatentKeypointGAN, a two-stage GAN which is trained end-to-end on the classical GAN objective with internal conditioning on a set of space keypoints. These keypoints have associated appearance embeddings that respectively control the position and style of the generated objects and their parts. A major difficulty that we address with suitable network architectures and training schemes is disentangling the image into spatial and appearance factors without domain knowledge and supervision signals. We demonstrate that LatentKeypointGAN provides an interpretable latent space that can be used to re-arrange the generated images by re-positioning and exchanging keypoint embeddings, such as generating portraits by combining the eyes, nose, and mouth from different images. 
In addition, the explicit generation of keypoints and matching images enables a new, GAN-based method for unsupervised keypoint detection.", "keywords": "Part Disentanglement;Unsupervised Learning", "primary_area": "", "supplementary_material": "/attachment/3744eb92136cd6aada77e37ad70a8aea0db4a5b5.zip", "author": "Xingzhe He;Bastian Wandt;Helge Rhodin", "authorids": "~Xingzhe_He1;~Bastian_Wandt2;~Helge_Rhodin5", "gender": "M;M;", "homepage": "https://xingzhehe.github.io/;http://bastianwandt.de;", "dblp": "258/0493;;", "google_scholar": "25tDZpwAAAAJ;z4aXEBYAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xingzhe_He1;~Bastian_Wandt2;~Helge_Rhodin5", "aff": "University of British Columbia;University of British Columbia;", "aff_domain": "cs.ubc.ca;ubc.ca;", "position": "PhD student;Postdoc;", "bibtex": "@misc{\nhe2022latentkeypointgan,\ntitle={LatentKeypoint{GAN}: Controlling {GAN}s via Latent Keypoints},\nauthor={Xingzhe He and Bastian Wandt and Helge Rhodin},\nyear={2022},\nurl={https://openreview.net/forum?id=y_tIL5vki1l}\n}", "github": "", "project": "", "reviewers": "X57n;TTKS;Jmzu;kpkc", "site": "https://openreview.net/forum?id=y_tIL5vki1l", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "4;3;2;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "51;83;56;61", "wc_summary_review": "58;27;107;28", "wc_main_review": "195;319;1012;119", "wc_review": "304;429;1175;208", "wc_reply_reviewers": "89;176;151;48", "wc_reply_authors": "267;2241;1803;255", "reply_reviewers": "1;1;1;1", "reply_authors": "1;4;4;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 62.75, 12.214233500306108 ], "wc_summary_review_avg": [ 55.0, 32.50384592629001 ], "wc_main_review_avg": [ 411.25, 354.11324106844694 ], "wc_review_avg": [ 529.0, 381.11087625519167 ], "wc_reply_reviewers_avg": [ 116.0, 50.44303718056636 ], "wc_reply_authors_avg": [ 1141.5, 894.0239090762618 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.7071067811865475, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4961718794807720257&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of British Columbia", "aff_unique_dep": "", "aff_unique_url": "https://www.ubc.ca", "aff_unique_abbr": "UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "ybsh6zEzIKA", "title": "Intrusion-Free Graph Mixup", "track": "main", "status": "Reject", "tldr": "", "abstract": " We present a simple yet effective interpolation-based regularization technique to improve the generalization of Graph Neural Networks (GNNs). We leverage the recent advances in the Mixup regularizer for vision and text, where random sample pairs and their labels are interpolated to create synthetic samples for training. Unlike images or natural sentences, which embrace a grid or linear sequence format, graphs have arbitrary structure and topology, which play a vital role in the semantic information of a graph. 
Consequently, even simply deleting or adding one edge from a graph can dramatically change its semantic meaning. This makes interpolating graph inputs very challenging because mixing random graph pairs may naturally create graphs with identical structure but with different labels, causing the manifold intrusion issue. To cope with this obstacle, we propose the first input mixing schema for Mixup on graphs. We theoretically prove that our mixing strategy can recover the source graphs from the mixed graph, and that the mixed graphs are guaranteed to be manifold intrusion free. We also empirically show that our method can effectively regularize graph classification learning, resulting in superior predictive accuracy over popular graph augmentation baselines. ", "keywords": "graph augmentation;Mixup;graph classification;graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Hongyu Guo;Yongyi Mao", "authorids": "~Hongyu_Guo1;~Yongyi_Mao2", "gender": "M;M", "homepage": "https://hongyuharryguo.github.io/;http://www.eecs.uottawa.ca/~yymao", "dblp": ";86/2933", "google_scholar": "https://scholar.google.ca/citations?user=bZUqlakAAAAJ;https://scholar.google.ca/citations?user=jM5l70wAAAAJ", "orcid": ";0000-0001-5298-5778", "linkedin": "harry-h-y-guo-a582087/;", "or_profile": "~Hongyu_Guo1;~Yongyi_Mao1", "aff": "National Research Council Canada;University of Ottawa", "aff_domain": "nrc-cnrc.gc.ca;eecs.uottawa.ca", "position": "Senior Research Officer;Full Professor", "bibtex": "@misc{\nguo2022intrusionfree,\ntitle={Intrusion-Free Graph Mixup},\nauthor={Hongyu Guo and Yongyi Mao},\nyear={2022},\nurl={https://openreview.net/forum?id=ybsh6zEzIKA}\n}", "github": "", "project": "", "reviewers": "n1Dk;q8bs;7hBS;d3Ri", "site": "https://openreview.net/forum?id=ybsh6zEzIKA", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "5;4;4;4", "correctness": "2;3;2;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "105;44;42;202", "wc_summary_review": "30;16;52;45", "wc_main_review": "302;189;404;102", "wc_review": "437;249;498;349", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "794;921;1159;32", "reply_reviewers": "0;0;0;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.82915619758885 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 98.25, 65.03220356100506 ], "wc_summary_review_avg": [ 35.75, 13.899190623917638 ], "wc_main_review_avg": [ 249.25, 114.0644006690957 ], "wc_review_avg": [ 383.25, 93.87857849371176 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 726.5, 421.8332016330626 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.7276068751089989, "corr_recommendation_correctness": 0.8866206949335731, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3426656929932943468&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "National Research Council Canada;University of Ottawa", "aff_unique_dep": ";", "aff_unique_url": "https://www.nrc-cnrc.gc.ca;https://www.uottawa.ca", "aff_unique_abbr": "NRC-CNRC;U Ottawa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": 
"Canada" }, { "title": "Image BERT Pre-training with Online Tokenizer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6156", "id": "ydopy-e6Dg", "poster": "", "openreview": "https://openreview.net/forum?id=ydopy-e6Dg", "slides": "https://iclr.cc/virtual/2022/poster/6156", "video": "https://iclr.cc/virtual/2022/poster/6156", "author_site": "Jinghao Zhou, Chen Wei, Huiyu Wang, Wei Shen, Cihang Xie, Alan Yuille, Tao Kong", "tldr": "", "abstract": "The success of language Transformers is primarily attributed to the pretext task of masked language modeling (MLM), where texts are first tokenized into semantically meaningful pieces.\nIn this work, we study masked image modeling (MIM) and indicate the necessity and challenges of using a semantically meaningful visual tokenizer.\nWe present a self-supervised framework iBOT that can perform masked prediction with an online tokenizer. \nSpecifically, we perform self-distillation on masked patch tokens and take the teacher network as the online tokenizer, along with self-distillation on the class token to acquire visual semantics.\nThe online tokenizer is jointly learnable with the MIM objective and dispenses with a multi-stage training pipeline where the tokenizer needs to be pre-trained beforehand.\nWe show the prominence of iBOT by achieving an 82.3% linear probing accuracy and an 87.8% fine-tuning accuracy evaluated on ImageNet-1K.\nBeyond the state-of-the-art image classification results, we underline emerging local semantic patterns, which helps the models to obtain strong robustness against common corruptions and achieve leading results on dense downstream tasks, e.g., object detection, instance segmentation, and semantic segmentation.", "keywords": "online tokenizer;masked image modeling;vision transformer", "primary_area": "", "supplementary_material": "", "author": "Jinghao Zhou;Chen Wei;Huiyu Wang;Wei Shen;Cihang Xie;Alan Yuille;Tao Kong", "authorids": "~Jinghao_Zhou1;~Chen_Wei2;~Huiyu_Wang1;~Wei_Shen2;~Cihang_Xie3;~Alan_Yuille1;~Tao_Kong3", "gender": "M;;;M;M;M;M", "homepage": "https://shallowtoil.github.io/;https://weichen582.github.io/;http://csrhddlam.github.io/;https://shenwei1231.github.io/;;http://www.taokong.org;https://cihangxie.github.io/", "dblp": "69/5505;181/2831-5;;71/3692-2;y/AlanLYuille;01/2492;175/3366", "google_scholar": "AoDJADEAAAAJ;https://scholar.google.com/citations?hl=en;SnmuYloAAAAJ;Ae2kRCEAAAAJ;;kSUXLPkAAAAJ;X3vVZPcAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Jinghao_Zhou1;~Chen_Wei2;~Huiyu_Wang1;~Wei_Shen2;~Alan_Yuille1;~Tao_Kong3;~cihang_xie1", "aff": "University of Oxford, University of Oxford;Johns Hopkins University;Johns Hopkins University;Shanghai Jiaotong University;Johns Hopkins University;Bytedance;University of California, Santa Cruz", "aff_domain": "eng.ox.ac.uk;jhu.edu;jhu.edu;sjtu.edu.cn;johnshopkins.edu;bytedance.com;ucsc.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhou2022image,\ntitle={Image {BERT} Pre-training with Online Tokenizer},\nauthor={Jinghao Zhou and Chen Wei and Huiyu Wang and Wei Shen and Cihang Xie and Alan Yuille and Tao Kong},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ydopy-e6Dg}\n}", "github": "", "project": "", "reviewers": "WgwF;s5wd;RoXr", "pdf_size": 0, "recommendation": "6;6;8", "confidence": "4;4;4", "correctness": "3;3;4", 
"technical_novelty": "2;2;3", "empirical_novelty": "3;3;3", "wc_summary_paper": "71;93;152", "wc_summary_review": "50;41;26", "wc_main_review": "207;343;134", "wc_review": "328;477;312", "wc_reply_reviewers": "35;59;0", "wc_reply_authors": "545;694;406", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 105.33333333333333, 34.19876540981495 ], "wc_summary_review_avg": [ 39.0, 9.899494936611665 ], "wc_main_review_avg": [ 228.0, 86.60638929470889 ], "wc_review_avg": [ 372.3333333333333, 74.29819797425925 ], "wc_reply_reviewers_avg": [ 31.333333333333332, 24.225789747475496 ], "wc_reply_authors_avg": [ 548.3333333333334, 117.59913076020399 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.9999999999999998, "gs_citation": 1044, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6668235945473015803&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=ydopy-e6Dg", "email": "eng.ox.ac.uk;jhu.edu;jhu.edu;sjtu.edu.cn;johnshopkins.edu;bytedance.com;ucsc.edu", "author_num": 7, "aff_unique_index": "0;1;1;2;1;3;4", "aff_unique_norm": "University of Oxford;Johns Hopkins University;Shanghai Jiao Tong University;ByteDance;University of California, Santa Cruz", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ox.ac.uk;https://www.jhu.edu;https://www.sjtu.edu.cn;https://www.bytedance.com;https://www.ucsc.edu", "aff_unique_abbr": "Oxford;JHU;SJTU;Bytedance;UCSC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;1;1;2;1;2;1", "aff_country_unique": "United Kingdom;United States;China" }, { "title": "Finding Biological Plausibility for Adversarially Robust Features via Metameric Tasks", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6482", "id": "yeP_zx9vqNm", "poster": "", "openreview": "https://openreview.net/forum?id=yeP_zx9vqNm", "slides": "https://iclr.cc/virtual/2022/poster/6482", "video": "https://iclr.cc/virtual/2022/poster/6482", "author_site": "Anne Harrington, Arturo Deza", "tldr": "", "abstract": "Recent work suggests that feature constraints in the training datasets of deep neural networks (DNNs) drive robustness to adversarial noise (Ilyas et al., 2019). The representations learned by such adversarially robust networks have also been shown to be more human perceptually-aligned than non-robust networks via image manipulations (Santurkar et al., 2019, Engstrom et al., 2019). Despite appearing closer to human visual perception, it is unclear if the constraints in robust DNN representations match biological constraints found in human vision. Human vision seems to rely on texture-based/summary statistic representations in the periphery, which have been shown to explain phenomena such as crowding (Balas et al., 2009) and performance on visual search tasks (Rosenholtz et al., 2012). 
To understand how adversarially robust optimizations/representations compare to human vision, we performed a psychophysics experiment using a metamer task similar to Freeman \\& Simoncelli, 2011, Wallis et al., 2016 and Deza et al., 2019 where we evaluated how well human observers could distinguish between images synthesized to match adversarially robust representations compared to non-robust representations and a texture synthesis model of peripheral vision (Texforms a la Long et al., 2018). We found that the discriminability of robust representation and texture model images decreased to near chance performance as stimuli were presented farther in the periphery. Moreover, performance on robust and texture-model images showed similar trends within participants, while performance on non-robust representations changed minimally across the visual field. These results together suggest that (1) adversarially robust representations capture peripheral computation better than non-robust representations and (2) robust representations capture peripheral computation similar to current state-of-the-art texture peripheral vision models. More broadly, our findings support the idea that localized texture summary statistic representations may drive human invariance to adversarial perturbations and that the incorporation of such representations in DNNs could give rise to useful properties like adversarial robustness.", "keywords": "Peripheral Computation;Adversarial Robustness;Perceptual Invariance;Metamerism;Texture;Psychophysics", "primary_area": "", "supplementary_material": "/attachment/1811a2ca899eae0183454e1177acfcdadf2f587e.zip", "author": "Anne Harrington;Arturo Deza", "authorids": "annekh@mit.edu;~Arturo_Deza1", "gender": ";M", "homepage": ";http://arturodeza.wikidot.com/", "dblp": ";160/8606", "google_scholar": ";KZLsTmQAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "annekh@mit.edu;~Arturo_Deza1", "aff": ";Massachusetts Institute of Technology", "aff_domain": ";mit.edu", "position": ";Postdoc", "bibtex": "@inproceedings{\nharrington2022finding,\ntitle={Finding Biological Plausibility for Adversarially Robust Features via Metameric Tasks},\nauthor={Anne Harrington and Arturo Deza},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=yeP_zx9vqNm}\n}", "github": "", "project": "", "reviewers": "5yZa;teJp;U8eW;s6dV", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "4;4;3;4", "correctness": "3;3;3;2", "technical_novelty": "3;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "85;150;163;206", "wc_summary_review": "35;73;102;153", "wc_main_review": "443;520;910;917", "wc_review": "563;743;1175;1276", "wc_reply_reviewers": "0;306;222;140", "wc_reply_authors": "772;949;1419;1344", "reply_reviewers": "0;2;1;2", "reply_authors": "1;2;2;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.0 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 151.0, 43.376260788592646 ], "wc_summary_review_avg": [ 90.75, 43.083494519363214 ], "wc_main_review_avg": [ 697.5, 217.72287431503378 ], "wc_review_avg": [ 939.25, 295.4051243631363 ], "wc_reply_reviewers_avg": [ 167.0, 112.87603820120549 ], "wc_reply_authors_avg": [ 1121.0, 269.2201701210368 ], "reply_reviewers_avg": [ 1.25, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], 
"corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16970067038193259370&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=yeP_zx9vqNm", "email": ";mit.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "OntoProtein: Protein Pretraining With Gene Ontology Embedding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6505", "id": "yfe1VMYAXa4", "poster": "", "openreview": "https://openreview.net/forum?id=yfe1VMYAXa4", "slides": "https://iclr.cc/virtual/2022/poster/6505", "video": "https://iclr.cc/virtual/2022/poster/6505", "author_site": "Ningyu Zhang, Zhen Bi, Xiaozhuan Liang, Siyuan Cheng, Haosen Hong, Shumin Deng, Qiang Zhang, Jiazhang Lian, Huajun Chen", "tldr": "", "abstract": "Self-supervised protein language models have proved their effectiveness in learning the proteins representations. With the increasing computational power, current protein language models pre-trained with millions of diverse sequences can advance the parameter scale from million-level to billion-level and achieve remarkable improvement. However, those prevailing approaches rarely consider incorporating knowledge graphs (KGs), which can provide rich structured knowledge facts for better protein representations. We argue that informative biology knowledge in KGs can enhance protein representation with external knowledge. In this work, we propose OntoProtein, the first general framework that makes use of structure in GO (Gene Ontology) into protein pre-training models. We construct a novel large-scale knowledge graph that consists of GO and its related proteins, and gene annotation texts or protein sequences describe all nodes in the graph. We propose novel contrastive learning with knowledge-aware negative sampling to jointly optimize the knowledge graph and protein embedding during pre-training. 
Experimental results show that OntoProtein can surpass state-of-the-art methods with pre-trained protein language models in TAPE benchmark and yield better performance compared with baselines in protein-protein interaction and protein function prediction.", "keywords": "pre-trained language model;knowledge graph;protein representation", "primary_area": "", "supplementary_material": "/attachment/1e384ee55d65d84e735e381af28923b6721171dc.zip", "author": "Ningyu Zhang;Zhen Bi;Xiaozhuan Liang;Siyuan Cheng;Haosen Hong;Shumin Deng;Qiang Zhang;Jiazhang Lian;Huajun Chen", "authorids": "~Ningyu_Zhang1;~Zhen_Bi1;~Xiaozhuan_Liang1;~Siyuan_Cheng2;~Haosen_Hong1;~Shumin_Deng1;~Qiang_Zhang6;~Jiazhang_Lian1;~Huajun_Chen1", "gender": "M;M;M;M;F;;M;M;M", "homepage": "https://person.zju.edu.cn/en/ningyu;;https://github.com/cheng-simian;https://github.com/Azai-yx;https://231sm.github.io/;https://qiangairesearcher.github.io;https://person.zju.edu.cn/en/lianlab;;https://www.researchgate.net/profile/Bi-Zhen-2/research", "dblp": "139/4181-1.html;295/8804;;;213/1853;72/3527-26;;94/5089;279/8441", "google_scholar": "xQDOPvsAAAAJ;https://scholar.google.com.hk/citations?user=Rmt2jcYAAAAJ;;;3am3hL4AAAAJ;https://scholar.google.com/citations?hl=zh-CN;49aU78YAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-1970-0678;;;;;;;;", "linkedin": "ningyuzhang/;;;;;;;;", "or_profile": "~Ningyu_Zhang1;~Xiaozhuan_Liang1;~Siyuan_Cheng2;~Haosen_Hong1;~Shumin_Deng1;~Qiang_Zhang6;~Jiazhang_Lian1;~Huajun_Chen1;~Bi_Zhen1", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "Associate Professor;MS student;MS student;MS student;PhD student;Principal Researcher;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nzhang2022ontoprotein,\ntitle={OntoProtein: Protein Pretraining With Gene Ontology Embedding},\nauthor={Ningyu Zhang and Zhen Bi and Xiaozhuan Liang and Siyuan Cheng and Haosen Hong and Shumin Deng and Qiang Zhang and Jiazhang Lian and Huajun Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=yfe1VMYAXa4}\n}", "github": "", "project": "", "reviewers": "XNXb;GdWW;fXpi", "pdf_size": 0, "recommendation": "6;6;6", "confidence": "3;4;4", "correctness": "4;3;3", "technical_novelty": "4;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "61;75;67", "wc_summary_review": "218;213;110", "wc_main_review": "7;652;754", "wc_review": "286;940;931", "wc_reply_reviewers": "0;90;0", "wc_reply_authors": "127;743;375", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 67.66666666666667, 5.734883511361751 ], "wc_summary_review_avg": [ 180.33333333333334, 49.775049528408864 ], "wc_main_review_avg": [ 471.0, 330.7294967189954 ], "wc_review_avg": [ 719.0, 306.1992815145065 ], "wc_reply_reviewers_avg": [ 30.0, 42.42640687119285 ], "wc_reply_authors_avg": [ 415.0, 253.06652616785703 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], 
"reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 9, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17820920484975929118&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=yfe1VMYAXa4", "email": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "ygGMP1zkiD1", "title": "Debiasing Pretrained Text Encoders by Paying Attention to Paying Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent studies in fair Representation Learning have observed a strong inclination for natural language processing (NLP) models to exhibit discriminatory stereotypes across gender, religion, race and many such social constructs. In comparison to the progress made in reducing bias from static word embeddings, fairness in sentence-level text encoders received little consideration despite their wider applicability in contemporary NLP tasks. In this paper, we propose a debiasing method for pre-trained text encoders that both reduces social stereotypes, and inflicts next to no semantic offset. Unlike previous studies that directly manipulate the embeddings, we suggest to dive deeper into the operation of these encoders, and pay more attention to the way they pay attention to different social groups. We find that the attention mechanism is the root of all stereotypes. Then, we work on model debiasing by redistributing the attention scores of a text encoder such that it forgets any preference to historically advantaged groups, and attends to all social classes with the same intensity. 
Our experiments confirm that we successfully reduce bias with little damage to semantic representation.", "keywords": "Fairness;Pretrained Text Encoders;Self-Attention;Knowledge Distillation;Social Biases;Debiasing", "primary_area": "", "supplementary_material": "/attachment/71f450ed95730f20e74f96091f40430f6888827f.zip", "author": "Yacine GACI;Boualem Benatallah;Fabio Casati;Khalid Benabdeslem", "authorids": "~Yacine_GACI1;~Boualem_Benatallah2;fabio.casati@gmail.com;khalid.benabdeslem@univ-lyon1.fr", "gender": "M;M;;", "homepage": ";https://scholar.google.com/citations?user=WYmChp0AAAAJ&hl=en;;", "dblp": "287/6851;b/BoualemBenatallah.html;;", "google_scholar": "iRIVXygAAAAJ;WYmChp0AAAAJ;;", "orcid": ";0000-0002-8805-1130;;", "linkedin": "yacine-gaci-4951b2111/;;;", "or_profile": "~Yacine_GACI1;~Boualem_Benatallah2;fabio.casati@gmail.com;khalid.benabdeslem@univ-lyon1.fr", "aff": "University of Lyon;University of New South Wales;;", "aff_domain": "univ-lyon1.fr;unsw.edu.au;;", "position": "PhD student;Full Professor;;", "bibtex": "@misc{\ngaci2022debiasing,\ntitle={Debiasing Pretrained Text Encoders by Paying Attention to Paying Attention},\nauthor={Yacine GACI and Boualem Benatallah and Fabio Casati and Khalid Benabdeslem},\nyear={2022},\nurl={https://openreview.net/forum?id=ygGMP1zkiD1}\n}", "github": "", "project": "", "reviewers": "7L6Q;CGM2;KjSU", "site": "https://openreview.net/forum?id=ygGMP1zkiD1", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "3;4;4", "correctness": "2;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "28;272;75", "wc_summary_review": "41;53;24", "wc_main_review": "182;323;352", "wc_review": "251;648;451", "wc_reply_reviewers": "0;22;0", "wc_reply_authors": "972;616;1187", "reply_reviewers": "0;1;0", "reply_authors": "2;1;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 125.0, 105.7008356952142 ], "wc_summary_review_avg": [ 39.333333333333336, 11.897712198383164 ], "wc_main_review_avg": [ 285.6666666666667, 74.25332016519904 ], "wc_review_avg": [ 450.0, 162.07611380665156 ], "wc_reply_reviewers_avg": [ 7.333333333333333, 10.370899457402697 ], "wc_reply_authors_avg": [ 925.0, 235.46691204215225 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 1.0, "corr_recommendation_correctness": 1.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5939036739585977143&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1", "aff_unique_norm": "University of Lyon;University of New South Wales", "aff_unique_dep": ";", "aff_unique_url": "https://www.universite-lyon.fr;https://www.unsw.edu.au", "aff_unique_abbr": "UCBL;UNSW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "France;Australia" }, { "title": "Geometry-Consistent Neural Shape Representation with Implicit Displacement Fields", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6338", "id": "yhCp5RcZD7", "poster": "", "openreview": 
"https://openreview.net/forum?id=yhCp5RcZD7", "slides": "https://iclr.cc/virtual/2022/poster/6338", "video": "https://iclr.cc/virtual/2022/poster/6338", "author_site": "Wang Yifan, Lukas Rahmann, Olga Sorkine-hornung", "tldr": "", "abstract": "We present implicit displacement fields, a novel representation for detailed 3D geometry. Inspired by a classic surface deformation technique, displacement mapping, our method represents a complex surface as a smooth base surface plus a displacement along the base's normal directions, resulting in a frequency-based shape decomposition, where the high-frequency signal is constrained geometrically by the low-frequency signal. Importantly, this disentanglement is unsupervised thanks to a tailored architectural design that has an innate frequency hierarchy by construction. We explore implicit displacement field surface reconstruction and detail transfer\nand demonstrate superior representational power, training stability, and generalizability.", "keywords": "implicit functions;shape reconstruction;shape representation;object reconstruction", "primary_area": "", "supplementary_material": "/attachment/db4dd196277d0511907b7f22dc08f56e105d8dfd.zip", "author": "Wang Yifan;Lukas Rahmann;Olga Sorkine-hornung", "authorids": "~Wang_Yifan1;~Lukas_Rahmann1;~Olga_Sorkine-hornung1", "gender": "F;;F", "homepage": "https://yifita.github.io/;;https://igl.ethz.ch/people/sorkine/", "dblp": "47/6959-11;217/2089;s/OlgaSorkine.html", "google_scholar": "4zyT8SYAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-2275-7288;;0000-0002-8089-3974", "linkedin": ";;", "or_profile": "~Wang_Yifan1;~Lukas_Rahmann1;~Olga_Sorkine-hornung1", "aff": "Department of Computer Science, ETHZ - ETH Zurich;;ETH Zurich", "aff_domain": "inf.ethz.ch;;ethz.ch", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nyifan2022geometryconsistent,\ntitle={Geometry-Consistent Neural Shape Representation with Implicit Displacement Fields},\nauthor={Wang Yifan and Lukas Rahmann and Olga Sorkine-hornung},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=yhCp5RcZD7}\n}", "github": "", "project": "", "reviewers": "G54X;Eb6T;GmTU;WHEF", "pdf_size": 0, "recommendation": "5;6;6;10", "confidence": "4;3;4;4", "correctness": "4;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "78;68;76;146", "wc_summary_review": "59;60;47;95", "wc_main_review": "162;129;150;384", "wc_review": "299;257;273;625", "wc_reply_reviewers": "0;0;0;16", "wc_reply_authors": "370;426;493;99", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.75, 1.920286436967152 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 92.0, 31.400636936215164 ], "wc_summary_review_avg": [ 65.25, 17.92170471802278 ], "wc_main_review_avg": [ 206.25, 103.30143997060254 ], "wc_review_avg": [ 363.5, 151.7193132069876 ], "wc_reply_reviewers_avg": [ 4.0, 6.928203230275509 ], "wc_reply_authors_avg": [ 347.0, 149.65794332410158 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.22549380840084865, "corr_recommendation_correctness": 0.0, "gs_citation": 42, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=1893838131986981154&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=yhCp5RcZD7", "email": "inf.ethz.ch;;ethz.ch", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "yhjfOvBvvmz", "title": "Weakly-Supervised Learning of Disentangled and Interpretable Skills for Hierarchical Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hierarchical reinforcement learning (RL) usually requires task-agnostic and interpretable skills that can be applicable to various downstream tasks. While many recent works have been proposed to learn such skills for a policy in unsupervised manner, the learned skills are still uninterpretable. To alleviate this, we propose a novel WEakly-supervised learning approach for learning Disentangled and Interpretable Skills (WEDIS) from the continuous latent representations of trajectories. We accomplish this by extending a trajectory variational autoencoder (VAE) to impose an inductive bias with weak labels, which explicitly enforces the trajectory representations to be disentangled into factors of interest that we intend the model to learn. Given the latent representations as skills, a skill-based policy network is trained to generate similar trajectories to the learned decoder of the trajectory VAE. Additionally, we propose to train a policy network with single-step transitions and perform the trajectory-level behaviors at test time with the knowledge on the skills, which simplifies the exploration problem in the training. 
With a sample-efficient planning strategy based on the skills, we demonstrate that our method is effective in solving the hierarchical RL problems in experiments on several challenging navigation tasks with a long horizon and sparse rewards.", "keywords": "Reinforcement learning;Variational autoencoder;Disentangled representation learning", "primary_area": "", "supplementary_material": "", "author": "Wonil Song;Sangryul Jeon;Hyesong Choi;Kwanghoon Sohn;Dongbo Min", "authorids": "~Wonil_Song1;~Sangryul_Jeon1;~Hyesong_Choi1;~Kwanghoon_Sohn2;~Dongbo_Min3", "gender": "M;;F;M;M", "homepage": "http://diml.yonsei.ac.kr/;https://sr-jeon.github.io/;;https://diml.yonsei.ac.kr;http://cvl.ewha.ac.kr", "dblp": ";195/6099;275/3868;21/2373;44/1149", "google_scholar": ";MIO6n6AAAAAJ;Ll3vLUsAAAAJ;zEtk0QsAAAAJ;3REUPXYAAAAJ", "orcid": ";;0000-0003-4440-0164;;", "linkedin": ";;;;", "or_profile": "~Wonil_Song1;~Sangryul_Jeon1;~Hyesong_Choi1;~Kwanghoon_Sohn2;~Dongbo_Min3", "aff": "Yonsei Univ.;Yonsei Univ.;Ewha Womans University;Yonsei University;Ewha Womans University", "aff_domain": "yonsei.ac.kr;ee.yonsei.ac.kr;ewha.ac.kr;yonsei.ac.kr;ewha.ac.kr", "position": "PhD student;PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@misc{\nsong2022weaklysupervised,\ntitle={Weakly-Supervised Learning of Disentangled and Interpretable Skills for Hierarchical Reinforcement Learning},\nauthor={Wonil Song and Sangryul Jeon and Hyesong Choi and Kwanghoon Sohn and Dongbo Min},\nyear={2022},\nurl={https://openreview.net/forum?id=yhjfOvBvvmz}\n}", "github": "", "project": "", "reviewers": "FYox;BoDo;CPCV;onQA", "site": "https://openreview.net/forum?id=yhjfOvBvvmz", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "3;4;3;4", "correctness": "3;2;3;3", "technical_novelty": "2;2;3;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "90;76;85;90", "wc_summary_review": "40;53;39;30", "wc_main_review": "252;625;404;490", "wc_review": "382;754;528;610", "wc_reply_reviewers": "0;146;107;0", "wc_reply_authors": "349;1246;728;706", "reply_reviewers": "0;1;1;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 85.25, 5.717298313014636 ], "wc_summary_review_avg": [ 40.5, 8.200609733428363 ], "wc_main_review_avg": [ 442.75, 135.40194791804143 ], "wc_review_avg": [ 568.5, 134.68017671506078 ], "wc_reply_reviewers_avg": [ 63.25, 64.735519616359 ], "wc_reply_authors_avg": [ 757.25, 319.77599581582103 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9256334287951243418&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "Yonsei University;Ewha Womans University", "aff_unique_dep": ";", "aff_unique_url": "https://www.yonsei.ac.kr;http://www.ewha.ac.kr", "aff_unique_abbr": "Yonsei;Ewha", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "FP-DETR: Detection Transformer Advanced by Fully Pre-training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6043", 
"id": "yjMQuLLcGWK", "poster": "", "openreview": "https://openreview.net/forum?id=yjMQuLLcGWK", "slides": "https://iclr.cc/virtual/2022/poster/6043", "video": "https://iclr.cc/virtual/2022/poster/6043", "author_site": "Wen Wang, Yang Cao, Jing Zhang, Dacheng Tao", "tldr": "", "abstract": "Large-scale pre-training has proven to be effective for visual representation learning on downstream tasks, especially for improving robustness and generalization. However, the recently developed detection transformers only employ pre-training on its backbone while leaving the key component, i.e., a 12-layer transformer, being trained from scratch, which prevents the model from above benefits. This separated training paradigm is mainly caused by the discrepancy between the upstream and downstream tasks. To mitigate the issue, we propose FP-DETR, a new method that Fully Pre-Trains an encoder-only transformer and smoothly fine-tunes it for object detection via a task adapter. Inspired by the success of textual prompts in NLP, we treat query positional embeddings as visual prompts to help the model attend to the target area (prompting) and recognize the object. To this end, we propose the task adapter which leverages self-attention to model the contextual relation between object query embedding. Experiments on the challenging COCO dataset demonstrate that our FP-DETR achieves competitive performance. Moreover, it enjoys better robustness to common corruptions and generalization to small-size datasets than state-of-the-art detection transformers. Code will be made publicly available at $\\url{https://github.com/encounter1997/FP-DETR}$.", "keywords": "Object Detection;Detection Transformer;Pre-training;Visual Prompt", "primary_area": "", "supplementary_material": "", "author": "Wen Wang;Yang Cao;Jing Zhang;Dacheng Tao", "authorids": "~Wen_Wang7;~Yang_Cao5;~Jing_Zhang17;~Dacheng_Tao1", "gender": "M;M;M;", "homepage": ";;;", "dblp": "29/4680-9;25/7045-10;05/3499-37.html;", "google_scholar": "1ks0R04AAAAJ;K7rTHNcAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;0000-0001-6595-7661;", "linkedin": ";;;", "or_profile": "~Wen_Wang7;~Yang_Cao5;~Jing_Zhang17;~Dacheng_Tao1", "aff": "University of Science and Technology of China;University of Science and Technology of China;The University of Sydney;", "aff_domain": "ustc.edu.cn;ustc.edu.cn;sydney.edu.au;", "position": "MS student;Associate Professor;Research Fellow;", "bibtex": "@inproceedings{\nwang2022fpdetr,\ntitle={{FP}-{DETR}: Detection Transformer Advanced by Fully Pre-training},\nauthor={Wen Wang and Yang Cao and Jing Zhang and Dacheng Tao},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=yjMQuLLcGWK}\n}", "github": "", "project": "", "reviewers": "Jj3q;s2Cu;JhDk;MjUh", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;4;5", "correctness": "3;3;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "76;114;72;59", "wc_summary_review": "52;76;45;52", "wc_main_review": "333;277;143;162", "wc_review": "461;467;260;273", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "963;791;720;2196", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;4", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 80.25, 
20.474068965401088 ], "wc_summary_review_avg": [ 56.25, 11.755317945508747 ], "wc_main_review_avg": [ 228.75, 79.06445155694182 ], "wc_review_avg": [ 365.25, 98.87966171058636 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1167.5, 600.3417776566945 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5535525640134212787&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "pdf": "https://openreview.net/pdf?id=yjMQuLLcGWK", "email": "ustc.edu.cn;ustc.edu.cn;sydney.edu.au;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Science and Technology of China;University of Sydney", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "USTC;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;Australia" }, { "id": "yjsA8Uin-Y", "title": "A Good Representation Detects Noisy Labels", "track": "main", "status": "Reject", "tldr": "", "abstract": "Label noise is pervasive in real-world datasets, which encodes wrong correlation patterns and impairs the generalization of deep neural networks (DNNs). It is critical to find efficient ways to detect the corrupted patterns. Current methods primarily focus on designing robust training techniques to prevent DNNs from memorizing corrupted patterns. This approach has two outstanding caveats: 1) applying this approach to each individual dataset would often require customized training processes; 2) as long as the model is trained with noisy supervisions, overfitting to corrupted patterns is often hard to avoid, leading to performance drop in detection. In this paper, given good representations, we propose a universally applicable and training-free solution to detect noisy labels. Intuitively, good representations help define \"neighbors\" of each training instance, and closer instances are more likely to share the same clean label. Based on the neighborhood information, we propose two methods: the first one uses \"local voting\" via checking the noisy label consensuses of nearby representations. The second one is a ranking-based approach that scores each instance and filters out a guaranteed number of instances that are likely to be corrupted, again using only representations. Given good (but possibly imperfect) representations that are commonly available in practice, we theoretically analyze how they affect the local voting and provide guidelines for tuning neighborhood size. We also prove the worst-case error bound for the ranking-based method. 
Experiments with both synthetic and real-world label noise demonstrate that our training-free solutions consistently and significantly improve over most of the training-based baselines.", "keywords": "noisy labels;sample selection;training-free;representation;noise detection", "primary_area": "", "supplementary_material": "/attachment/70a889b143ebe50399e65462bafcc07a0aee19b9.zip", "author": "Zhaowei Zhu;Zihao Dong;Hao Cheng;Yang Liu", "authorids": "~Zhaowei_Zhu1;~Zihao_Dong1;~Hao_Cheng5;~Yang_Liu3", "gender": "M;M;M;M", "homepage": "https://www.zzw.ai;;https://haochenglouis.github.io;http://www.yliuu.com", "dblp": "202/1712;;;51/3710-18", "google_scholar": "YS8pSQoAAAAJ;;ftlVqVIAAAAJ;jKrIVCIAAAAJ", "orcid": "0000-0003-3894-5862;;0000-0001-8864-7818;0000-0001-8420-6011", "linkedin": ";dong-zihao-a13741212/;;", "or_profile": "~Zhaowei_Zhu1;~Zihao_Dong1;~Hao_Cheng5;~Yang_Liu3", "aff": "University of California, Santa Cruz;University of California, Santa Cruz;Tencent Youtu Lab;University of California, Santa Cruz", "aff_domain": "ucsc.edu;ucsc.edu;tencent.com;ucsc.edu", "position": "PhD student;Undergrad student;Researcher;Assistant Professor", "bibtex": "@misc{\nzhu2022a,\ntitle={A Good Representation Detects Noisy Labels},\nauthor={Zhaowei Zhu and Zihao Dong and Hao Cheng and Yang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=yjsA8Uin-Y}\n}", "github": "", "project": "", "reviewers": "1co7;fWAV;fRLd;39dB", "site": "https://openreview.net/forum?id=yjsA8Uin-Y", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "113;79;69;103", "wc_summary_review": "177;37;29;69", "wc_main_review": "413;242;151;164", "wc_review": "703;358;249;336", "wc_reply_reviewers": "148;68;0;0", "wc_reply_authors": "1159;952;685;517", "reply_reviewers": "2;1;0;0", "reply_authors": "4;2;1;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 91.0, 17.72004514666935 ], "wc_summary_review_avg": [ 78.0, 59.08468498688979 ], "wc_main_review_avg": [ 242.5, 104.40905133176912 ], "wc_review_avg": [ 411.5, 173.16249593950764 ], "wc_reply_reviewers_avg": [ 54.0, 60.959002616512684 ], "wc_reply_authors_avg": [ 828.25, 246.02172160197563 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2275158578714411505&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of California, Santa Cruz;Tencent", "aff_unique_dep": ";Youtu Lab", "aff_unique_url": "https://www.ucsc.edu;https://www.tencent.com", "aff_unique_abbr": "UCSC;Tencent", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Cruz;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "yjxVspo7gXt", "title": "Scaling Fair Learning to Hundreds of Intersectional Groups", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bias mitigation algorithms aim to reduce the performance disparity between different protected groups.
Existing techniques focus on settings where there is a small number of protected groups arising from a single protected attribute, such as skin color, gender or age. In real-world applications, however, there are multiple protected attributes yielding a large number of intersectional protected groups. These intersectional groups are particularly prone to severe underrepresentation in datasets. We conduct the first thorough empirical analysis of how existing bias mitigation methods scale to this setting, using large-scale datasets including the ImageNet People Subtree and CelebA. We find that as more protected attributes are introduced to a task, it becomes more important to leverage the protected attribute labels during training to promote fairness. We also find that the use of knowledge distillation, in conjunction with group-specific models, can help scale existing fair learning methods to hundreds of protected intersectional groups and reduce bias. We show on ImageNet's People Subtree that combining these insights can further reduce the bias amplification of fair learning algorithms by 15% ---a surprising reduction given that the dataset has 196 protected groups but fewer than 10% of the training dataset has protected attribute labels.", "keywords": "machine fairness;intersectional fairness;bias mitigation;fair learning;knowledge distillation", "primary_area": "", "supplementary_material": "/attachment/ca871456aed0e8ded6dc9f19fe114e3db5996057.zip", "author": "Eric Zhao;De-An Huang;Hao Liu;Zhiding Yu;Anqi Liu;Olga Russakovsky;Anima Anandkumar", "authorids": "~Eric_Zhao1;~De-An_Huang1;~Hao_Liu2;~Zhiding_Yu1;~Anqi_Liu2;~Olga_Russakovsky1;~Anima_Anandkumar1", "gender": "M;M;M;;F;F;", "homepage": "https://eric-zhao.com;http://ai.stanford.edu/~dahuang/;;;https://anqiliu-ai.github.io/;http://cs.princeton.edu/~olgarus;", "dblp": "294/8327.html;119/0335;09/3214-15;;;52/6883;", "google_scholar": "6OfjaHQAAAAJ;HEY3UzgAAAAJ;aXo4NLcAAAAJ;;Q8yp6zQAAAAJ;TB5OwW8AAAAJ;", "orcid": ";;;;0000-0002-0468-5698;0000-0001-5272-3241;", "linkedin": ";;;;;;", "or_profile": "~Eric_Zhao1;~De-An_Huang1;~Hao_Liu2;~Zhiding_Yu1;~Anqi_Liu2;~Olga_Russakovsky1;~Anima_Anandkumar1", "aff": "University of California, Berkeley;NVIDIA;California Institute of Technology;;University of Illinois, Chicago;Princeton University;", "aff_domain": "berkeley.edu;nvidia.com;caltech.edu;;uic.edu;princeton.edu;", "position": "PhD student;Research Scientist;PhD student;;PhD student;Assistant Professor;", "bibtex": "@misc{\nzhao2022scaling,\ntitle={Scaling Fair Learning to Hundreds of Intersectional Groups},\nauthor={Eric Zhao and De-An Huang and Hao Liu and Zhiding Yu and Anqi Liu and Olga Russakovsky and Anima Anandkumar},\nyear={2022},\nurl={https://openreview.net/forum?id=yjxVspo7gXt}\n}", "github": "", "project": "", "reviewers": "CCHj;F8SZ;zw8E;JEnE", "site": "https://openreview.net/forum?id=yjxVspo7gXt", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "3;4;3;3", "correctness": "3;2;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "70;79;118;33", "wc_summary_review": "32;93;33;15", "wc_main_review": "214;302;230;326", "wc_review": "316;474;381;374", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "247;188;417;430", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 
0.82915619758885 ], "wc_summary_paper_avg": [ 75.0, 30.224162519414826 ], "wc_summary_review_avg": [ 43.25, 29.600464523382062 ], "wc_main_review_avg": [ 268.0, 47.11687595755899 ], "wc_review_avg": [ 386.25, 56.596709268295804 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 320.5, 105.1914920514012 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": 0.7071067811865475, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12464678797004818986&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of California, Berkeley;NVIDIA;California Institute of Technology;University of Illinois at Chicago;Princeton University", "aff_unique_dep": ";NVIDIA Corporation;;;", "aff_unique_url": "https://www.berkeley.edu;https://www.nvidia.com;https://www.caltech.edu;https://www.uic.edu;https://www.princeton.edu", "aff_unique_abbr": "UC Berkeley;NVIDIA;Caltech;UIC;Princeton", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "Berkeley;;Pasadena;Chicago", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "youe3QQepVB", "title": "Generative Modeling for Multitask Visual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative modeling has recently shown great promise in computer vision, but it has mostly focused on synthesizing visually realistic images. In this paper, motivated by multi-task learning of shareable feature representations, we consider a novel problem of learning a shared generative model that is useful across various visual perception tasks. Correspondingly, we propose a general multi-task oriented generative modeling (MGM) framework, by coupling a discriminative multi-task network with a generative network. While it is challenging to synthesize both RGB images and pixel-level annotations in multi-task scenarios, our framework enables us to use synthesized images paired with only weak annotations (i.e., image-level scene labels) to facilitate multiple visual tasks. 
Experimental evaluation on challenging multi-task benchmarks, including NYUv2 and Taskonomy, demonstrates that our MGM framework improves the performance of all the tasks by large margins, especially in the low-data regimes, and our model consistently outperforms state-of-the-art multi-task approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhipeng Bao;Yu-Xiong Wang;Martial Hebert", "authorids": "~Zhipeng_Bao1;~Yu-Xiong_Wang1;~Martial_Hebert1", "gender": "M;;M", "homepage": "https://zpbao.github.io/;https://yxw.cs.illinois.edu/;http://www.cs.cmu.edu/~hebert/", "dblp": "244/8798;35/10700;h/MartialHebert", "google_scholar": "TwYdLuYAAAAJ;T_Q-xDkAAAAJ;https://scholar.google.com.tw/citations?user=0ytii2EAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhipeng_Bao1;~Yu-Xiong_Wang1;~Martial_Hebert1", "aff": "Carnegie Mellon University;Department of Computer Science, University of Illinois Urbana-Champaign;Carnegie Mellon University", "aff_domain": "cmu.edu;cs.illinois.edu;cmu.edu", "position": "PhD student;Assistant Professor;Professor", "bibtex": "@misc{\nbao2022generative,\ntitle={Generative Modeling for Multitask Visual Learning},\nauthor={Zhipeng Bao and Yu-Xiong Wang and Martial Hebert},\nyear={2022},\nurl={https://openreview.net/forum?id=youe3QQepVB}\n}", "github": "", "project": "", "reviewers": "X2gh;Ytpa;zrdP", "site": "https://openreview.net/forum?id=youe3QQepVB", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;4", "correctness": "3;3;4", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;3", "wc_summary_paper": "117;68;135", "wc_summary_review": "208;89;31", "wc_main_review": "579;331;281", "wc_review": "904;488;447", "wc_reply_reviewers": "292;0;0", "wc_reply_authors": "2559;1164;699", "reply_reviewers": "1;0;0", "reply_authors": "4;2;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 106.66666666666667, 28.311756490114764 ], "wc_summary_review_avg": [ 109.33333333333333, 73.6764699359451 ], "wc_main_review_avg": [ 397.0, 130.30221282336944 ], "wc_review_avg": [ 613.0, 206.44773349849754 ], "wc_reply_reviewers_avg": [ 97.33333333333333, 137.65012007098127 ], "wc_reply_authors_avg": [ 1474.0, 790.3480246068816 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.9999999999999997, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7184993398931022761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;University of Illinois Urbana-Champaign", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.cmu.edu;https://illinois.edu", "aff_unique_abbr": "CMU;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "yphXO883gqN", "title": "The Deep Generative Decoder: using MAP estimates of representations", "track": "main", "status": "Withdraw", "tldr": "", 
"abstract": "A deep generative model is characterized by a representation space, its distribution, and a neural network mapping the representation to a distribution over vectors in feature space. Common methods such as variational autoencoders (VAEs) apply variational inference for training the neural network, but optimizing these models is often non-trivial. The encoder adds to the complexity of the model and introduces an amortization gap and the quality of the variational approximation is usually unknown. Additionally, the balance of the loss terms of the objective function heavily influences performance. Therefore, we argue that it is worthwhile to investigate a much simpler approximation which finds representations and their distribution by maximizing the model likelihood via back-propagation. In this approach, there is no encoder, and we therefore call it a Deep Generative Decoder (DGD). Using the CIFAR10 data set, we show that the DGD is easier and faster to optimize than the VAE, achieves more consistent low reconstruction errors of test data, and alleviates the problem of balancing the reconstruction and distribution loss terms. Although the model in its simple form cannot compete with state-of-the-art image generation approaches, it obtains better image generation scores than the variational approach on the CIFAR10 data. We demonstrate on MNIST data how the use of a Gaussian mixture with priors can lead to a clear separation of classes in a 2D representation space, and how the DGD can be used with labels to obtain a supervised representation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Viktoria Schuster;Anders Krogh", "authorids": "~Viktoria_Schuster1;~Anders_Krogh2", "gender": "F;M", "homepage": ";", "dblp": ";k/AndersKrogh", "google_scholar": "zV6sROUAAAAJ;-vGMjmwAAAAJ", "orcid": "0000-0002-6016-8337;0000-0002-5147-6282", "linkedin": "viktoria-schuster-0bbaa9156/;", "or_profile": "~Viktoria_Schuster1;~Anders_Krogh2", "aff": "University of Copenhagen;University of Copenhagen", "aff_domain": "ku.dk;ku.dk", "position": "PhD student;Full Professor", "bibtex": "@misc{\nschuster2022the,\ntitle={The Deep Generative Decoder: using {MAP} estimates of representations},\nauthor={Viktoria Schuster and Anders Krogh},\nyear={2022},\nurl={https://openreview.net/forum?id=yphXO883gqN}\n}", "github": "", "project": "", "reviewers": "z3KZ;rdPJ;mLy7;zvj7", "site": "https://openreview.net/forum?id=yphXO883gqN", "pdf_size": 0, "recommendation": "3;3;3;6", "confidence": "5;4;4;4", "correctness": "3;4;3;1", "technical_novelty": "1;1;1;3", "empirical_novelty": "1;2;2;3", "wc_summary_paper": "95;48;107;105", "wc_summary_review": "38;25;53;66", "wc_main_review": "382;132;982;296", "wc_review": "515;205;1142;467", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 1.0897247358851685 ], "technical_novelty_avg": [ 1.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 88.75, 23.962209831315644 ], "wc_summary_review_avg": [ 45.5, 15.435349040433131 ], "wc_main_review_avg": [ 448.0, 321.1199152964512 ], "wc_review_avg": [ 582.25, 344.0358811228852 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 
], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.9271726499455307, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RahemWP5NzoJ:scholar.google.com/&scioq=The+Deep+Generative+Decoder:+using+MAP+estimates+of+representations&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Denmark" }, { "id": "yql6px0bcT", "title": "Decentralized Cross-Entropy Method for Model-Based Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Cross-Entropy Method (CEM) is a popular approach to planning in model-based reinforcement learning.\nIt has so far always taken a \\textit{centralized} approach where the sampling distribution is updated \\textit{centrally} based on the result of a top-$k$ operation applied to \\textit{all samples}.\nWe show that such a \\textit{centralized} approach makes CEM vulnerable to local optima and impair its sample efficiency, even in a one-dimensional multi-modal optimization task.\nIn this paper, we propose \\textbf{Decent}ralized \\textbf{CEM (DecentCEM)} where an ensemble of CEM instances run independently from one another and each performs a local improvement of its own sampling distribution.\nIn the exemplar optimization task, the proposed decentralized approach DecentCEM finds the global optimum much more consistently than the existing CEM approaches that use either a single Gaussian distribution or a mixture of Gaussians.\nFurther, we extend the decentralized approach to sequential decision-making problems where we show in 13 continuous control benchmark environments that it matches or outperforms the state-of-the-art CEM algorithms in most cases, under the same budget of the total number of samples for planning.", "keywords": "Reinforcement Learning;Cross-Entropy Method;Planning;Model-Based RL", "primary_area": "", "supplementary_material": "/attachment/c12f16b73d4cf6746eea1abe449774162c00b1cb.zip", "author": "Zichen Zhang;Jun Jin;Martin Jagersand;Jun Luo;Dale Schuurmans", "authorids": "~Zichen_Zhang1;~Jun_Jin1;~Martin_Jagersand1;~Jun_Luo1;~Dale_Schuurmans1", "gender": ";;M;;", "homepage": ";;http://www.ualberta.ca/science/about-us/contact-us/faculty-directory/martin-jagersand;;", "dblp": "200/8127;78/8436.html;;42/2501;", "google_scholar": "https://scholar.google.ca/citations?user=nSh2eD4AAAAJ;a6grwUcAAAAJ;;;", "orcid": ";0000-0003-4413-8565;;;", "linkedin": ";;;;", "or_profile": "~Zichen_Zhang1;~Jun_Jin1;~Martin_Jagersand1;~Jun_Luo1;~Dale_Schuurmans1", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd. 
Canada;University of Alberta;Huawei Technologies Ltd.;", "aff_domain": "huawei.com;huawei.com;;huawei.com;", "position": "Intern;Researcher;Full Professor;Researcher;", "bibtex": "@misc{\nzhang2022decentralized,\ntitle={Decentralized Cross-Entropy Method for Model-Based Reinforcement Learning},\nauthor={Zichen Zhang and Jun Jin and Martin Jagersand and Jun Luo and Dale Schuurmans},\nyear={2022},\nurl={https://openreview.net/forum?id=yql6px0bcT}\n}", "github": "", "project": "", "reviewers": "niPd;hpTg;WTkw", "site": "https://openreview.net/forum?id=yql6px0bcT", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "4;4;4", "correctness": "4;4;4", "technical_novelty": "1;2;3", "empirical_novelty": "1;3;2", "wc_summary_paper": "75;83;81", "wc_summary_review": "35;128;79", "wc_main_review": "332;548;330", "wc_review": "442;759;490", "wc_reply_reviewers": "85;40;0", "wc_reply_authors": "1229;1471;996", "reply_reviewers": "1;1;0", "reply_authors": "3;3;2", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 79.66666666666667, 3.39934634239519 ], "wc_summary_review_avg": [ 80.66666666666667, 37.985377303495206 ], "wc_main_review_avg": [ 403.3333333333333, 102.2980395163509 ], "wc_review_avg": [ 563.6666666666666, 139.50467933211257 ], "wc_reply_reviewers_avg": [ 41.666666666666664, 34.721111093332766 ], "wc_reply_authors_avg": [ 1232.0, 193.92954046938456 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ye2XUaCIBsgJ:scholar.google.com/&scioq=Decentralized+Cross-Entropy+Method+for+Model-Based+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Huawei;University of Alberta", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;https://www.ualberta.ca", "aff_unique_abbr": "Huawei;UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "China;Canada" }, { "id": "yrD7B9N_54F", "title": "Few-shot graph link prediction with domain adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Real-world link prediction problems often deal with data coming from multiple imbalanced domains. Similar problems in computer vision are often referred to as Few-Shot Learning (FSL) problems. However, for graph link prediction, this problem has rarely been addressed and explored. In this work, we propose an adversarial-training-based modification to the current state-of-the-art link prediction method to solve this problem. We introduce a domain discriminator on pairs of graph-level embeddings. We then use the discriminator to improve the model in an adversarial way, such that the graph embeddings generated by the model are domain agnostic. We test our proposal on 3 benchmark datasets. Our results demonstrate that, when domain differences exist, our method creates better graph embeddings that are more evenly distributed across domains and generates better prediction outcomes.
", "keywords": "link prediction;few-shot learning;domain adaptation", "primary_area": "", "supplementary_material": "/attachment/ce726ac4b48d73682170969e9ac5e34e281921c0.zip", "author": "Hao Zhu;Mahashweta Das;Mangesh Bendre;Fei Wang;Hao Yang;Soha Hassoun", "authorids": "~Hao_Zhu10;mahdas@visa.com;mbendre@visa.com;feiwang@visa.com;~Hao_Yang8;~Soha_Hassoun1", "gender": "M;;;;;F", "homepage": "https://zhuhao.org;;;;;http://www.cs.tufts.edu/~soha/", "dblp": ";;;;;82/450", "google_scholar": "mQpT66MAAAAJ;;;;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=tR5MNfkAAAAJ", "orcid": "0000-0002-3386-6076;;;;;0000-0001-9477-2199", "linkedin": ";;;;;sohahassoun/", "or_profile": "~Hao_Zhu10;mahdas@visa.com;mbendre@visa.com;feiwang@visa.com;~Hao_Yang8;~Soha_Hassoun1", "aff": "Tufts University;;;;Visa Research;Tufts University", "aff_domain": "tufts.edu;;;;visa.com;tufts.edu", "position": "PhD student;;;;Vice President;Full Professor", "bibtex": "@misc{\nzhu2022fewshot,\ntitle={Few-shot graph link prediction with domain adaptation},\nauthor={Hao Zhu and Mahashweta Das and Mangesh Bendre and Fei Wang and Hao Yang and Soha Hassoun},\nyear={2022},\nurl={https://openreview.net/forum?id=yrD7B9N_54F}\n}", "github": "", "project": "", "reviewers": "1WRx;XgHj;YcHW;Vi6R", "site": "https://openreview.net/forum?id=yrD7B9N_54F", "pdf_size": 0, "recommendation": "3;5;5;8", "confidence": "5;4;3;5", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;4", "empirical_novelty": "2;2;2;4", "wc_summary_paper": "43;74;49;78", "wc_summary_review": "10;41;32;16", "wc_main_review": "158;135;213;37", "wc_review": "211;250;294;131", "wc_reply_reviewers": "47;50;0;0", "wc_reply_authors": "719;635;649;21", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 5.25, 1.7853571071357126 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 2.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 61.0, 15.215124054702938 ], "wc_summary_review_avg": [ 24.75, 12.356678356257397 ], "wc_main_review_avg": [ 135.75, 63.66857545131664 ], "wc_review_avg": [ 221.5, 59.935381870811504 ], "wc_reply_reviewers_avg": [ 24.25, 24.27318479310039 ], "wc_reply_authors_avg": [ 506.0, 281.8173167142147 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.1266600992762247, "corr_recommendation_correctness": 0.8892972917998875, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xESP4VDkCx0J:scholar.google.com/&scioq=Few-shot+graph+link+prediction+with+domain+adaptation&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tufts University;Visa Inc.", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.tufts.edu;https://www.visa.com/", "aff_unique_abbr": "Tufts;Visa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "yrbF6ekqQ9w", "title": "Robust fine-tuning of zero-shot models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Large pre-trained models such as CLIP offer consistent accuracy across a range of data distributions when performing zero-shot inference (i.e., without fine-tuning on a specific dataset). 
Although existing fine-tuning approaches substantially improve accuracy in-distribution, they also reduce out-of-distribution robustness. We address this tension by introducing a simple and effective method for improving robustness: ensembling the weights of the zero-shot and fine-tuned models (WiSE-FT). Compared to standard fine-tuning, WiSE-FT provides large accuracy improvements out-of-distribution, while matching or improving in-distribution accuracy. On ImageNet (in-distribution) and five derived distribution shifts, WiSE-FT improves out-of-distribution accuracy by 2 to 10 percentage points (pp) while increasing in-distribution accuracy by nearly 1 pp relative to standard fine-tuning. WiSE-FT achieves similarly large robustness improvements (2 to 15 pp) on a diverse set of six further distribution shifts, and in-distribution accuracy gains of 0.8 to 3.3 pp compared to standard fine-tuning on seven commonly used transfer learning datasets. These improvements come at no additional computational cost during fine-tuning or inference.", "keywords": "robustness;zero-shot;fine-tuning;CLIP;distribution shift", "primary_area": "", "supplementary_material": "/attachment/bfc8302d5fec12469abfc7a7ef36dd5005fc9ee2.zip", "author": "Mitchell Wortsman;Gabriel Ilharco;Jong Wook Kim;Mike Li;Hanna Hajishirzi;Ali Farhadi;Hongseok Namkoong;Ludwig Schmidt", "authorids": "~Mitchell_Wortsman1;~Gabriel_Ilharco1;~Jong_Wook_Kim2;~Mike_Li3;~Hanna_Hajishirzi1;~Ali_Farhadi3;~Hongseok_Namkoong2;~Ludwig_Schmidt1", "gender": "M;M;M;;;M;M;M", "homepage": "https://mitchellnw.github.io/;http://gabrielilharco.com/;https://jongwook.kim;;;https://homes.cs.washington.edu/~ali/;https://hsnamkoong.github.io;http://people.csail.mit.edu/ludwigs/", "dblp": "232/2273;249/2616;;;;37/5826;191/6680;141/2720", "google_scholar": "fzRnjFgAAAAJ;https://scholar.google.com/citations?hl=en;;;;jeOFRDsAAAAJ;dyXX1EgAAAAJ;SWMKy70AAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;ludwig-schmidt-87ba3612/", "or_profile": "~Mitchell_Wortsman1;~Gabriel_Ilharco1;~Jong_Wook_Kim2;~Mike_Li3;~Hanna_Hajishirzi1;~Ali_Farhadi3;~Hongseok_Namkoong2;~Ludwig_Schmidt1", "aff": "University of Washington, Seattle;Department of Computer Science, University of Washington;OpenAI;;;University of Washington;Columbia University;Allen Institute for Artificial Intelligence", "aff_domain": "uw.edu;cs.washington.edu;openai.com;;;cs.uw.edu;columbia.edu;allenai.org", "position": "PhD student;PhD student;Member of Technical Staff;;;Full Professor;Assistant Professor;Researcher", "bibtex": "@misc{\nwortsman2022robust,\ntitle={Robust fine-tuning of zero-shot models},\nauthor={Mitchell Wortsman and Gabriel Ilharco and Jong Wook Kim and Mike Li and Hanna Hajishirzi and Ali Farhadi and Hongseok Namkoong and Ludwig Schmidt},\nyear={2022},\nurl={https://openreview.net/forum?id=yrbF6ekqQ9w}\n}", "github": "", "project": "", "reviewers": "wZD2;VwHa;xwCM", "site": "https://openreview.net/forum?id=yrbF6ekqQ9w", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;4;3", "correctness": "3;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "1;3;4", "wc_summary_paper": "88;53;117", "wc_summary_review": "12;42;5", "wc_main_review": "307;131;240", "wc_review": "407;226;362", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.3333333333333335, 
0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_summary_paper_avg": [ 86.0, 26.166135875720485 ], "wc_summary_review_avg": [ 19.666666666666668, 16.048537489614297 ], "wc_main_review_avg": [ 226.0, 72.5304533741977 ], "wc_review_avg": [ 331.6666666666667, 76.9429803310362 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.9819805060619659, "corr_recommendation_correctness": 0.0, "gs_citation": 786, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12206160479867816283&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;1;0;2;3", "aff_unique_norm": "University of Washington;OpenAI;Columbia University;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.washington.edu;https://openai.com;https://www.columbia.edu;https://allenai.org", "aff_unique_abbr": "UW;OpenAI;Columbia;AI2", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "ys-bh0Eer_", "title": "Block Contextual MDPs for Continual Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In reinforcement learning (RL), when defining a Markov Decision Process (MDP), the environment dynamics is implicitly assumed to be stationary. This assumption of stationarity, while simplifying, can be unrealistic in many scenarios. In the continual reinforcement learning scenario, the sequence of tasks is another source of nonstationarity. In this work, we propose to examine this continual reinforcement learning setting through the block contextual MDP (BC-MDP) framework, which enables us to relax the assumption of stationarity. This framework challenges RL algorithms to handle both nonstationarity and rich observation settings and, by additionally leveraging smoothness properties, enables us to study generalization bounds for this setting. 
Finally, we take inspiration from adaptive control to propose a novel algorithm that addresses the challenges introduced by this more realistic BC-MDP setting, allows for zero-shot adaptation at evaluation time, and achieves strong performance on several nonstationary environments.", "keywords": "Reinforcement Learning;MDP;Block Contextual MDP;Continual Learning", "primary_area": "", "supplementary_material": "", "author": "Shagun Sodhani;Franziska Meier;Joelle Pineau;Amy Zhang", "authorids": "~Shagun_Sodhani1;~Franziska_Meier2;~Joelle_Pineau1;~Amy_Zhang1", "gender": "M;;F;", "homepage": "https://shagunsodhani.com;;http://www.cs.mcgill.ca/~jpineau;", "dblp": "http://dblp.uni-trier.de/pers/hd/s/Sodhani:Shagun;;p/JoellePineau;", "google_scholar": "ixp-vqMAAAAJ;;https://scholar.google.ca/citations?user=CEt6_mMAAAAJ;", "orcid": ";;;", "linkedin": "shagun-sodhani-b2239879;;;", "or_profile": "~Shagun_Sodhani1;~Franziska_Meier2;~Joelle_Pineau1;~Amy_Zhang1", "aff": "Meta Facebook;;Meta Facebook;", "aff_domain": "fb.com;;fb.com;", "position": "Researcher;;Researcher Manager;", "bibtex": "@misc{\nsodhani2022block,\ntitle={Block Contextual {MDP}s for Continual Learning},\nauthor={Shagun Sodhani and Franziska Meier and Joelle Pineau and Amy Zhang},\nyear={2022},\nurl={https://openreview.net/forum?id=ys-bh0Eer_}\n}", "github": "", "project": "", "reviewers": "8QEN;coxs;zpMJ;5i2t", "site": "https://openreview.net/forum?id=ys-bh0Eer_", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "3;3;1;3", "technical_novelty": "2;2;2;3", "empirical_novelty": "3;2;1;3", "wc_summary_paper": "124;72;129;72", "wc_summary_review": "7;92;5;16", "wc_main_review": "1208;1048;471;346", "wc_review": "1339;1212;605;434", "wc_reply_reviewers": "82;117;0;115", "wc_reply_authors": "1893;1497;1689;644", "reply_reviewers": "1;1;0;1", "reply_authors": "3;2;3;1", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 99.25, 27.307279249313726 ], "wc_summary_review_avg": [ 30.0, 36.0347054934545 ], "wc_main_review_avg": [ 768.25, 366.84218337045155 ], "wc_review_avg": [ 897.5, 385.428657471133 ], "wc_reply_reviewers_avg": [ 78.5, 47.40516849458506 ], "wc_reply_authors_avg": [ 1430.75, 475.32429719087577 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16222963243214111203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "yulAchHedcT", "title": "Faster Neural Net Inference via Forests of Sparse Oblique Decision Trees", "track": "main", "status": "Withdraw", "tldr": "", "abstract": " It is widely established that large neural nets can be considerably compressed by techniques such as pruning, quantization or low-rank factorization. 
We show that neural nets can be further compressed by replacing some of their layers with a special type of decision forest. This consists of sparse oblique trees, trained with the Tree Alternating Optimization (TAO) algorithm, using a teacher-student approach. We find we can replace the fully-connected and some convolutional layers of standard architectures with a decision forest containing very few, shallow trees so that the prediction accuracy is preserved or improved, but the number of parameters and especially the inference time is greatly reduced. For example, replacing the last 7 layers of VGG16 with a single tree reduces the inference FLOPs by 7440$\\times$ with a marginal increase in the test error, and a boosted ensemble of nine trees can match the network's performance while still reducing the FLOPs 6289$\\times$. The idea is orthogonal to other compression approaches, which can still be applied to the parts of the net not replaced by a forest.", "keywords": "neural network compression;tree-based compression;decision trees;decision forests", "primary_area": "", "supplementary_material": "", "author": "Yerlan Idelbayev;Arman Zharmagambetov;Magzhan Gabidolla;Miguel A. Carreira-Perpinan", "authorids": "~Yerlan_Idelbayev1;~Arman_Zharmagambetov1;~Magzhan_Gabidolla1;~Miguel_A._Carreira-Perpinan1", "gender": "M;M;;", "homepage": "http://graduatestudent.ucmerced.edu/yidelbayev/;https://arman-z.github.io/;;http://faculty.ucmerced.edu/mcarreira-perpinan/", "dblp": "203/8094;252/5004;;23/5257", "google_scholar": "nAaroNMAAAAJ;D6QocXMAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0003-3297-9375", "linkedin": ";;;miguel-a-carreira-perpinan", "or_profile": "~Yerlan_Idelbayev1;~Arman_Zharmagambetov1;~Magzhan_Gabidolla1;~Miguel_A._Carreira-Perpinan1", "aff": "Amazon Web Services;University of California at Merced;;University of California, Merced", "aff_domain": "amazon.com;ucmerced.edu;;ucmerced.edu", "position": "Researcher;PhD student;;Full Professor", "bibtex": "@misc{\nidelbayev2022faster,\ntitle={Faster Neural Net Inference via Forests of Sparse Oblique Decision Trees},\nauthor={Yerlan Idelbayev and Arman Zharmagambetov and Magzhan Gabidolla and Miguel A.
Carreira-Perpinan},\nyear={2022},\nurl={https://openreview.net/forum?id=yulAchHedcT}\n}", "github": "", "project": "", "reviewers": "Mfgf;RoQa;BVTH;J1XV", "site": "https://openreview.net/forum?id=yulAchHedcT", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "3;4;3;5", "correctness": "3;4;4;4", "technical_novelty": "2;2;3;2", "empirical_novelty": "3;3;3;2", "wc_summary_paper": "119;83;72;38", "wc_summary_review": "72;68;79;11", "wc_main_review": "211;1277;78;156", "wc_review": "402;1428;229;205", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 78.0, 28.905016865589268 ], "wc_summary_review_avg": [ 57.5, 27.13392710243027 ], "wc_main_review_avg": [ 430.5, 491.0063645208685 ], "wc_review_avg": [ 566.0, 503.44562765009687 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.7608859102526822, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13643342197216389960&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;University of California, Merced", "aff_unique_dep": "Amazon Web Services;", "aff_unique_url": "https://aws.amazon.com;https://www.ucmerced.edu", "aff_unique_abbr": "AWS;UC Merced", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "yuv0mwPOlz3", "title": "Active Learning over Multiple Domains in Natural Language Tasks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Studies of active learning traditionally assume the target and source data stem from a single domain. However, in realistic applications, practitioners often require active learning with multiple sources of out-of-distribution data, where it is unclear a priori which data sources will help or hurt the target domain. We survey a wide variety of techniques in active learning (AL), domain shift detection (DS), and multi-domain sampling to examine this challenging setting for question answering and sentiment analysis. We ask (1) what family of methods are effective for this task? And, (2) what properties of selected examples and domains achieve strong results? Among 18 acquisition functions from 4 families of methods, we find H-Divergence methods, and particularly our proposed variant DAL-E, yield effective results, averaging 2-3% improvements over the random baseline. We also show the importance of a diverse allocation of domains, as well as the room for improvement of existing methods on both domain and example selection.
Our findings yield the first comprehensive analysis of both existing and novel methods for practitioners faced with multi-domain active learning for natural language tasks.", "keywords": "natural language processing;active learning;domain shift detection", "primary_area": "", "supplementary_material": "", "author": "Shayne Longpre;Julia Rachel Reisler;Edward Greg Huang;Yi Lu;Andrew Frank;Nikhil Ramesh;Chris DuBois", "authorids": "~Shayne_Longpre1;~Julia_Rachel_Reisler1;~Edward_Greg_Huang1;~Yi_Lu6;~Andrew_Frank1;nikhilr@apple.com;~Chris_DuBois1", "gender": "M;F;;M;M;;", "homepage": "https://www.shaynelongpre.com;;;;;;", "dblp": "190/7024;;;;;;https://dblp.uni-trier.de/pers/hd/d/DuBois:Chris", "google_scholar": "ADd_YfkAAAAJ;;;https://scholar.google.com/citations?hl=en;3t5AFu4AAAAJ;;", "orcid": ";0000-0001-5378-0635;;;;;", "linkedin": "shayne-redford-longpre/;julia-reisler-b48471149;;yi-lu-b14636a3/;;;", "or_profile": "~Shayne_Longpre1;~Julia_Rachel_Reisler1;~Edward_Greg_Huang1;~Yi_Lu6;~Andrew_Frank1;nikhilr@apple.com;~Chris_DuBois1", "aff": "Research, Google;Apple;;Forethought;Apple;;", "aff_domain": "research.google.com;apple.com;;forethought.ai;apple.com;;", "position": "Intern;Researcher;;Principal Researcher;Researcher;;", "bibtex": "@misc{\nlongpre2022active,\ntitle={Active Learning over Multiple Domains in Natural Language Tasks},\nauthor={Shayne Longpre and Julia Rachel Reisler and Edward Greg Huang and Yi Lu and Andrew Frank and Nikhil Ramesh and Chris DuBois},\nyear={2022},\nurl={https://openreview.net/forum?id=yuv0mwPOlz3}\n}", "github": "", "project": "", "reviewers": "aTPg;EJMS;cB6j;mKxw", "site": "https://openreview.net/forum?id=yuv0mwPOlz3", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "2;2;3;4", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "1;2;3;3", "wc_summary_paper": "124;88;144;55", "wc_summary_review": "124;52;27;47", "wc_main_review": "124;755;199;202", "wc_review": "372;895;370;304", "wc_reply_reviewers": "0;431;0;9", "wc_reply_authors": "599;1964;779;347", "reply_reviewers": "0;1;0;1", "reply_authors": "1;3;2;2", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 102.75, 34.098203765007916 ], "wc_summary_review_avg": [ 62.5, 36.718523935474316 ], "wc_main_review_avg": [ 320.0, 253.08397815744877 ], "wc_review_avg": [ 485.25, 238.1463573099534 ], "wc_reply_reviewers_avg": [ 110.0, 185.36585446084723 ], "wc_reply_authors_avg": [ 922.25, 620.7186862822804 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.7608859102526822, "corr_recommendation_correctness": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12112306420796363235&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Google;Apple;Forethought", "aff_unique_dep": "Google Research;Apple Inc.;", "aff_unique_url": "https://research.google;https://www.apple.com;https://www.forethought.com", "aff_unique_abbr": "Google;Apple;", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "ywEx0OiJflS", "title": "Multi-Class Classification from Single-Class Data with 
Confidences", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Can we learn a multi-class classifier from only \\emph{data from a single class}? We show that without any assumptions on the loss functions, models, and optimizers, we can successfully learn a multi-class classifier from only data from a single class with a rigorous consistency guarantee when \\emph{confidences} (i.e., the class-posterior probabilities) are available. Specifically, we propose an empirical risk minimization framework that is loss-/model-/optimizer-independent. Instead of constructing a boundary between the given class and all the other classes, our method can conduct discriminative classification between all the classes even if no data from the other classes are given. We further theoretically and experimentally show that our method can be Bayes-consistent with a simple modification even if the provided confidences are highly noisy. Then, we provide an extension of our method for the case where data from a subset of all the classes are available. Experimental results demonstrate the effectiveness of our methods.", "keywords": "Weakly supervised learning;unbiased risk estimator;empirical risk minimization", "primary_area": "", "supplementary_material": "/attachment/35ff055e01a74df74374482f6ac1cc66eb680858.zip", "author": "Yuzhou Cao;Lei Feng;Senlin Shu;Yitian Xu;Gang Niu;Masashi Sugiyama", "authorids": "~Yuzhou_Cao1;~Lei_Feng1;~Senlin_Shu1;~Yitian_Xu1;~Gang_Niu1;~Masashi_Sugiyama1", "gender": "M;M;M;M;M;M", "homepage": "https://yzcao-nkg.github.io/;https://lfeng1995.github.io/;;;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/", "dblp": "256/5052;76/847-6;https://dblp.uni-trier.de/pid/257/5650.html;07/2647.html;26/3367-1;35/1228", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;;;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ", "orcid": ";0000-0003-2839-5799;;0000-0001-7577-4420;;0000-0001-6658-6743", "linkedin": ";;;;;", "or_profile": "~Yuzhou_Cao1;~Lei_Feng1;~Senlin_Shu1;~Yitian_Xu1;~Gang_Niu1;~Masashi_Sugiyama1", "aff": "Nanyang Technological University;Chongqing University;;China Agricultural University;RIKEN;The University of Tokyo", "aff_domain": "ntu.edu;cqu.edu.cn;;cau.edu.cn;riken.jp;u-tokyo.ac.jp", "position": "PhD student;Full Professor;;Full Professor;Research Scientist (tenured);Full Professor", "bibtex": "@misc{\ncao2022multiclass,\ntitle={Multi-Class Classification from Single-Class Data with Confidences},\nauthor={Yuzhou Cao and Lei Feng and Senlin Shu and Yitian Xu and Gang Niu and Masashi Sugiyama},\nyear={2022},\nurl={https://openreview.net/forum?id=ywEx0OiJflS}\n}", "github": "", "project": "", "reviewers": "J1Px;svsE;HA4f;NyVL", "site": "https://openreview.net/forum?id=ywEx0OiJflS", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;5;5;2", "correctness": "2;4;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "1;1;1;4", "wc_summary_paper": "112;110;107;148", "wc_summary_review": "13;136;23;55", "wc_main_review": "366;418;47;239", "wc_review": "491;664;177;442", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "correctness_avg": [ 3.5, 0.8660254037844386 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 1.75, 
1.299038105676658 ], "wc_summary_paper_avg": [ 119.25, 16.69393602479655 ], "wc_summary_review_avg": [ 56.75, 48.31342980994001 ], "wc_main_review_avg": [ 267.5, 142.99038429209148 ], "wc_review_avg": [ 443.5, 174.57161854093007 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3746343246326776, "corr_recommendation_correctness": 0.9271726499455306, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=58895210900837709&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Nanyang Technological University;Chongqing University;China Agricultural University;RIKEN;University of Tokyo", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.cqu.edu.cn;http://www.cau.edu.cn/;https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "NTU;CQU;CAU;RIKEN;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2;2", "aff_country_unique": "Singapore;China;Japan" }, { "id": "yx_uIzoHJv", "title": "Effect of Pressure for Compositionality on Language Emergence", "track": "main", "status": "Reject", "tldr": "", "abstract": "Humans can use natural languages compositionally, where complicated concepts are expressed using expressions grounded in simpler concepts. Hence, it has been argued that compositionality increases the ability to generalize. This behavior is acquired during natural language learning. Natural languages contain a large number of compositional phrases that function as examples of how to construct compositional expressions for human learners. However, in language emergence, neural agents do not have access to such compositional language expressions. This can be circumvented by optimizing a suitably devised metric of compositionality, which does not require supervised examples. In this paper, we present a learning environment where agents are pressured to make their emerging languages compositional by incorporating a metric of topological similarity into the loss function. We observe that when this pressure is carefully adjusted, agents can achieve higher generalization. The optimal level of this pressure is highly dependent on the agent architecture, input, and structure of the message space. However, we find no simple correlation between high compositionality and generalization. The advantage offered by compositional pressure is situational. We observe instances where moderately compositional languages generalize as well as some highly compositional ones. 
", "keywords": "Language Emergence;Neural Language Emergence;Compositionality", "primary_area": "", "supplementary_material": "", "author": "Mihira Kasun Vithanage;Rukshan Darshana Wijesinghe;Alex Xavier;Dumindu Tissera;Sanath Jayasena;Subha Fernando", "authorids": "~Mihira_Kasun_Vithanage1;~Rukshan_Darshana_Wijesinghe1;~Alex_Xavier1;~Dumindu_Tissera1;~Sanath_Jayasena1;~Subha_Fernando1", "gender": "M;M;M;M;M;F", "homepage": ";;;;;", "dblp": ";;;234/1107;51/8829.html;", "google_scholar": "fWBWCckAAAAJ;QzeO0OMAAAAJ;OONR58IAAAAJ;y6RyQxYAAAAJ;Yoewoo0AAAAJ;RVX5L28AAAAJ", "orcid": ";0000-0002-4003-6337;;0000-0002-7461-0165;0000-0001-5097-8769;0000-0002-6795-5392", "linkedin": ";;;dumindu-tissera/;sanath-jayasena-274a8011/;subha-fernando-b23a4a38/?originalSubdomain=lk", "or_profile": "~Mihira_Kasun_Vithanage1;~Rukshan_Darshana_Wijesinghe1;~Alex_Xavier1;~Dumindu_Tissera1;~Sanath_Jayasena1;~Subha_Fernando1", "aff": "University of Moratuwa;University of Moratuwa;University of Moratuwa;University of Moratuwa, Sri Lanka;;University of Moratuwa", "aff_domain": "uom.lk;ac.lk;mrt.ac.lk;uom.lk;;uom.lk", "position": "PhD student;PhD student;PhD student;PhD student;;Lecturer", "bibtex": "@inproceedings{\nvithanage2022effect,\ntitle={Effect of Pressure for Compositionality on Language Emergence},\nauthor={Mihira Kasun Vithanage and Rukshan Darshana Wijesinghe and Alex Xavier and Dumindu Tissera and Sanath Jayasena and Subha Fernando},\nbooktitle={Submitted to The Tenth International Conference on Learning Representations },\nyear={2022},\nurl={https://openreview.net/forum?id=yx_uIzoHJv},\nnote={under review}\n}", "github": "", "project": "", "reviewers": "zB6g;B3Jo;wKT2;WEY2", "site": "https://openreview.net/forum?id=yx_uIzoHJv", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "2;4;4;4", "correctness": "3;2;4;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;0;2;2", "wc_summary_paper": "134;53;63;94", "wc_summary_review": "42;68;43;246", "wc_main_review": "313;687;362;1199", "wc_review": "489;808;468;1539", "wc_reply_reviewers": "105;0;0;1715", "wc_reply_authors": "1488;2203;1342;2468", "reply_reviewers": "2;0;0;5", "reply_authors": "3;4;4;5", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.7071067811865476 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 86.0, 31.567388235329194 ], "wc_summary_review_avg": [ 99.75, 85.07753816372451 ], "wc_main_review_avg": [ 640.25, 353.1652410699558 ], "wc_review_avg": [ 826.0, 433.1356600419781 ], "wc_reply_reviewers_avg": [ 455.0, 728.7231984779955 ], "wc_reply_authors_avg": [ 1875.25, 472.5173938597393 ], "reply_reviewers_avg": [ 1.75, 2.0463381929681126 ], "reply_authors_avg": [ 4.0, 0.7071067811865476 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2A4hZ5Nlj2sJ:scholar.google.com/&scioq=Effect+of+Pressure+for+Compositionality+on+Language+Emergence&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Moratuwa", "aff_unique_dep": "", "aff_unique_url": "https://www.mrt.ac.lk", "aff_unique_abbr": "UoM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Sri Lanka" }, { "id": "yzDTTtlIlMr", "title": 
"Momentum Doesn't Change The Implicit Bias", "track": "main", "status": "Reject", "tldr": "", "abstract": "The momentum acceleration technique is widely adopted in many optimization algorithms. However, a theoretical understanding of how momentum affects the generalization performance of these algorithms is still lacking. In this paper, we answer this question by analyzing the implicit bias of momentum-based optimization. We prove that both SGD with momentum and Adam converge to the $L_2$ max-margin solution for exponential-tailed loss, which is the same as vanilla gradient descent. \nThis means that these optimizers with momentum acceleration still converge to a model with low complexity, which provides guarantees on their generalization. Technically, to overcome the difficulty caused by error accumulation when analyzing momentum, we construct new Lyapunov functions as a tool to analyze the gap between the model parameters and the max-margin solution.", "keywords": "Momentum-based Optimizers;Convergence Analysis;Implicit Bias", "primary_area": "", "supplementary_material": "", "author": "Bohan Wang;Qi Meng;Huishuai Zhang;Ruoyu Sun;Wei Chen;Zhi-Ming Ma", "authorids": "~Bohan_Wang1;~Qi_Meng1;~Huishuai_Zhang3;~Ruoyu_Sun1;~Wei_Chen1;~Zhi-Ming_Ma1", "gender": "M;F;;F;;M", "homepage": "https://bhwangfy.github.io/;;https://ruoyus.github.io/;https://weichen-cas.github.io/;http://homepage.amss.ac.cn/research/homePage/8eb59241e2e74d828fb84eec0efadba5/myHomePage.html;https://huishuai-git.github.io", "dblp": "202/1184;;30/9879-1;;;144/7537", "google_scholar": "LfkHCEUAAAAJ;t-z3K34AAAAJ;PsfzbCMAAAAJ;https://scholar.google.com/citations?hl=en;;w1srHyIAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Bohan_Wang1;~Qi_Meng1;~Ruoyu_Sun1;~Wei_Chen1;~Zhi-Ming_Ma1;~Huishuai_Zhang2", "aff": "Microsoft Research Asia, University of Science and Technology of China;Microsoft;University of Illinois, Urbana-Champaign; Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Microsoft Research Asia", "aff_domain": "ustc.edu.cn;microsoft.com;uiuc.edu;ict.ac.cn;amss.ac.cn;microsoft.com", "position": "PhD student;associate researcher;Assistant Professor;Full Professor;Full Professor;Researcher", "bibtex": "@misc{\nwang2022momentum,\ntitle={Momentum Doesn't Change The Implicit Bias},\nauthor={Bohan Wang and Qi Meng and Huishuai Zhang and Ruoyu Sun and Wei Chen and Zhi-Ming Ma},\nyear={2022},\nurl={https://openreview.net/forum?id=yzDTTtlIlMr}\n}", "github": "", "project": "", "reviewers": "4RNH;g2J5;D63g;5GXX", "site": "https://openreview.net/forum?id=yzDTTtlIlMr", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;4;3;4", "correctness": "4;4;4;4", "technical_novelty": "3;2;3;4", "empirical_novelty": "1;0;0;0", "wc_summary_paper": "44;59;66;42", "wc_summary_review": "33;108;24;40", "wc_main_review": "283;1139;355;248", "wc_review": "360;1306;445;330", "wc_reply_reviewers": "0;114;0;19", "wc_reply_authors": "987;1379;599;520", "reply_reviewers": "0;1;0;1", "reply_authors": "3;4;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 4.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 0.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 52.75, 10.084022015049353 ], "wc_summary_review_avg": [ 51.25, 33.25187964611926 ], "wc_main_review_avg": [ 506.25, 367.34954403129456 ], "wc_review_avg": [ 610.25, 
403.89997214656995 ], "wc_reply_reviewers_avg": [ 33.25, 47.261903262564445 ], "wc_reply_authors_avg": [ 871.25, 342.3100750781373 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.25, 1.299038105676658 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9293855235642568936&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;2;0", "aff_unique_norm": "Microsoft;University of Illinois;Chinese Academy of Sciences", "aff_unique_dep": "Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/microsoft-research-asia;https://illinois.edu;https://www.cas.cn", "aff_unique_abbr": "MSRA;UIUC;CAS", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Asia;;Urbana-Champaign", "aff_country_unique_index": "0;1;1;0;0;0", "aff_country_unique": "China;United States" }, { "id": "yztpblfGkZ-", "title": "Graph Convolutional Networks via Adaptive Filter Banks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph convolutional networks have been a powerful tool in representation learning of networked data. However, most architectures of message passing graph convolutional networks (MPGCNs) are limited as they employ a single message passing strategy and typically focus on low-frequency information, especially when graph features or signals are heterogeneous in different dimensions. Moreover, existing spectral graph convolutional operators lack a proper sharing scheme between filters, which may result in overfitting due to their numerous parameters. In this paper, we present a novel graph convolution operator, termed BankGCN, which extends the capabilities of MPGCNs beyond single `low-pass' features and simplifies spectral methods with a carefully designed sharing scheme between filters. BankGCN decomposes multi-channel signals on arbitrary graphs into subspaces and shares adaptive filters to represent information in each subspace. The filters of all subspaces differ in frequency response and together form a filter bank. The filter bank and the signal decomposition permit the adaptive capture of diverse spectral characteristics of graph data for target applications with a compact architecture. We finally show through extensive experiments that BankGCN achieves excellent performance on a collection of benchmark graph datasets. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xing Gao;Wenrui Dai;Chenglin Li;Junni Zou;Hongkai Xiong;Pascal Frossard", "authorids": "~Xing_Gao3;~Wenrui_Dai1;~Chenglin_Li2;~Junni_Zou1;~Hongkai_Xiong1;~Pascal_Frossard1", "gender": ";;M;F;M;", "homepage": ";;https://min.sjtu.edu.cn/En/FacultyShow/4?Vid=17;http://www.cs.sjtu.edu.cn/~zou-jn;http://min.sjtu.edu.cn;", "dblp": ";16/5135.html;;91/4613;21/3569;", "google_scholar": ";Xg8MhyAAAAAJ;ltW2JMcAAAAJ;https://scholar.google.com/citations?hl=zh-CN;bB16iN4AAAAJ;", "orcid": ";;;;0000-0003-4552-0029;", "linkedin": ";;;;;", "or_profile": "~Xing_Gao3;~Wenrui_Dai1;~Chenglin_Li2;~Junni_Zou1;~Hongkai_Xiong1;~Pascal_Frossard1", "aff": ";Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;", "aff_domain": ";sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;", "position": ";Associate Professor;Associate Professor;Full Professor;Full Professor;", "bibtex": "@misc{\ngao2022graph,\ntitle={Graph Convolutional Networks via Adaptive Filter Banks},\nauthor={Xing Gao and Wenrui Dai and Chenglin Li and Junni Zou and Hongkai Xiong and Pascal Frossard},\nyear={2022},\nurl={https://openreview.net/forum?id=yztpblfGkZ-}\n}", "github": "", "project": "", "reviewers": "rmns;1kMb;nrop;yK19;84WF", "site": "https://openreview.net/forum?id=yztpblfGkZ-", "pdf_size": 0, "recommendation": "3;3;5;5;6", "confidence": "5;4;3;3;4", "correctness": "3;1;3;2;4", "technical_novelty": "2;1;2;2;3", "empirical_novelty": "3;2;3;3;2", "wc_summary_paper": "77;82;60;57;21", "wc_summary_review": "54;32;26;52;11", "wc_main_review": "410;458;220;805;371", "wc_review": "541;572;306;914;403", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "618;528;386;780;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;2;1;1;0", "recommendation_avg": [ 4.4, 1.2 ], "confidence_avg": [ 3.8, 0.7483314773547882 ], "correctness_avg": [ 2.6, 1.019803902718557 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 59.4, 21.453204888780604 ], "wc_summary_review_avg": [ 35.0, 16.2234398325386 ], "wc_main_review_avg": [ 452.8, 193.28259104223534 ], "wc_review_avg": [ 547.2, 206.9622187743454 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 462.4, 264.2102193330152 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.6324555320336759 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.5790660241435861, "corr_recommendation_correctness": 0.6210344279375827, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Q6ULYTYADbUJ:scholar.google.com/&scioq=Graph+Convolutional+Networks+via+Adaptive+Filter+Banks&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "z-5BjnU3-OQ", "title": "HyperCGAN: Text-to-Image Synthesis with HyperNet-Modulated Conditional Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present HyperCGAN: a conceptually simple and general approach for text-to-image synthesis that uses hypernetworks to condition a GAN model on text. 
In our setting, the generator and the discriminator weights are controlled by their corresponding hypernetworks, which modulate weight parameters based on the provided text query. We explore different mechanisms to modulate the layers depending on the underlying architecture of a target network and the structure of the conditioning variable. Our method enjoys high flexibility, and we test it in two scenarios: traditional image generation (on top of StyleGAN2) and continuous image generation (on top of INR-GAN). To the best of our knowledge, our work is the first to explore text-controllable continuous image generation. In both cases, hypernetwork-based conditioning achieves state-of-the-art performance in terms of modern text-to-image evaluation measures and human studies on the CUB $256^2$, COCO $256^2$, and ArtEmis $256^2$ datasets.", "keywords": "gan;generative modelling;text-to-image;text2image;hypernetworks", "primary_area": "", "supplementary_material": "", "author": "Kilichbek Haydarov;Aashiq Muhamed;Jovana Lazarevic;Ivan Skorokhodov;Mohamed Elhoseiny", "authorids": "~Kilichbek_Haydarov2;~Aashiq_Muhamed1;~Jovana_Lazarevic1;~Ivan_Skorokhodov1;~Mohamed_Elhoseiny1", "gender": "M;M;F;M;M", "homepage": "https://kilichbek.github.io/webpage/;https://github.com/aashiqmuhamed;;https://universome.github.io/;http://www.mohamed-elhoseiny.com", "dblp": "259/1409;294/0107;;223/0010;125/2894", "google_scholar": "IW4UWrMAAAAJ;GbVC5NYAAAAJ;;https://scholar.google.com/citations?hl=en;iRBUTOAAAAAJ", "orcid": "0000-0002-3062-2228;;;0000-0002-7611-9310;0000-0001-9659-1551", "linkedin": "kilichbek-haydarov/;aashiq-muhamed-52169421/;jovana-lazarevi%C4%87-a940831bb/;ivan-skorokhodov;mohamed-elhoseiny-8a836215/", "or_profile": "~Kilichbek_Haydarov2;~Aashiq_Muhamed1;~Jovana_Lazarevic1;~Ivan_Skorokhodov1;~Mohamed_Elhoseiny1", "aff": "King Abdullah University of Science and Technology;Amazon;;KAUST;KAUST", "aff_domain": "kaust.edu.sa;amazon.com;;kaust.edu.sa;kaust.edu.sa", "position": "PhD student;Researcher;;PhD student;Associate Professor", "bibtex": "@misc{\nhaydarov2022hypercgan,\ntitle={Hyper{CGAN}: Text-to-Image Synthesis with HyperNet-Modulated Conditional Generative Adversarial Networks},\nauthor={Kilichbek Haydarov and Aashiq Muhamed and Jovana Lazarevic and Ivan Skorokhodov and Mohamed Elhoseiny},\nyear={2022},\nurl={https://openreview.net/forum?id=z-5BjnU3-OQ}\n}", "github": "", "project": "", "reviewers": "rpKE;yoBb;p56q;Jcwi", "site": "https://openreview.net/forum?id=z-5BjnU3-OQ", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "4;4;4;3", "correctness": "3;3;3;3", "technical_novelty": "3;2;2;3", "empirical_novelty": "3;2;2;2", "wc_summary_paper": "91;92;37;55", "wc_summary_review": "187;30;34;56", "wc_main_review": "732;330;235;347", "wc_review": "1010;452;306;458", "wc_reply_reviewers": "110;0;0;0", "wc_reply_authors": "2901;1147;1247;816", "reply_reviewers": "2;0;0;0", "reply_authors": "8;4;3;3", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 68.75, 23.62599204266352 ], "wc_summary_review_avg": [ 76.75, 64.41806811757087 ], "wc_main_review_avg": [ 411.0, 190.18017772628144 ], "wc_review_avg": [ 556.5, 268.80987705067685 ], "wc_reply_reviewers_avg": [ 27.5, 47.63139720814412 ], "wc_reply_authors_avg": [ 1527.75, 808.7327664315327 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 
], "reply_authors_avg": [ 4.5, 2.0615528128088303 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4658659924907490707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.kaust.edu.sa;https://www.amazon.com", "aff_unique_abbr": "KAUST;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Saudi Arabia;United States" }, { "title": "Autoregressive Quantile Flows for Predictive Uncertainty Estimation", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6671", "id": "z1-I6rOKv1S", "poster": "", "openreview": "https://openreview.net/forum?id=z1-I6rOKv1S", "slides": "https://iclr.cc/virtual/2022/poster/6671", "video": "https://iclr.cc/virtual/2022/poster/6671", "author_site": "Phillip Si, Allan Bishop, Volodymyr Kuleshov", "tldr": "", "abstract": "Numerous applications of machine learning involve representing probability distributions over high-dimensional data. We propose autoregressive quantile flows, a flexible class of normalizing flow models trained using a novel objective based on proper scoring rules. Our objective does not require calculating computationally expensive determinants of Jacobians during training and supports new types of neural architectures, such as neural autoregressive flows from which it is easy to sample. \n We leverage these models in quantile flow regression, an approach that parameterizes predictive conditional distributions with flows, resulting in improved probabilistic predictions on tasks such as time series forecasting and object detection.\n Our novel objective functions and neural flow parameterizations also yield improvements on popular generation and density estimation tasks, and represent a step beyond maximum likelihood learning of flows.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Phillip Si;Allan Bishop;Volodymyr Kuleshov", "authorids": "~Phillip_Si1;~Allan_Bishop1;~Volodymyr_Kuleshov1", "gender": "M;M;", "homepage": "https://sites.google.com/view/psi6;https://www.linkedin.com/in/allan-bishop-20615b175/;https://www.cs.cornell.edu/~kuleshov/", "dblp": ";;81/8612", "google_scholar": ";;RY_t8XAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Phillip_Si1;~Allan_Bishop1;~Volodymyr_Kuleshov1", "aff": "Cornell University;;Cornell University", "aff_domain": "cornell.edu;;cornell.edu", "position": "Undergrad student;;Assistant Professor", "bibtex": "@inproceedings{\nsi2022autoregressive,\ntitle={Autoregressive Quantile Flows for Predictive Uncertainty Estimation},\nauthor={Phillip Si and Volodymyr Kuleshov and Allan Bishop},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=z1-I6rOKv1S}\n}", "github": "", "project": "", "reviewers": "AxZh;85oZ;UtzJ", "pdf_size": 0, "recommendation": "6;8;8", "confidence": "4;4;4", "correctness": "3;3;3", "technical_novelty": "2;4;4", "empirical_novelty": "3;3;3", "wc_summary_paper": "72;89;46", "wc_summary_review": "58;63;11", "wc_main_review": "144;249;171", "wc_review": "274;401;228", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "494;162;0", "reply_reviewers": 
"0;0;0", "reply_authors": "2;2;0", "recommendation_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.3333333333333335, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 69.0, 17.682382946499793 ], "wc_summary_review_avg": [ 44.0, 23.423634787681152 ], "wc_main_review_avg": [ 188.0, 44.51965857910413 ], "wc_review_avg": [ 301.0, 73.16192087873765 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 218.66666666666666, 205.61668760638622 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=175577611210967958&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=z1-I6rOKv1S", "email": "cornell.edu;;cornell.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "z2B0JJeNdvT", "title": "Distributed Zeroth-Order Optimization: Convergence Rates That Match Centralized Counterpart", "track": "main", "status": "Reject", "tldr": "", "abstract": "Zeroth-order optimization has become increasingly important in complex optimization and machine learning when cost functions cannot be described in closed analytical form. The key idea of zeroth-order optimization lies in a learner's ability to build gradient estimates from queries sent to the cost function, and then traditional gradient descent algorithms can be executed with gradients replaced by the estimates. For optimization of large-scale multi-agent systems with decentralized data and costs, zeroth-order optimization can continue to be utilized to develop scalable and distributed zeroth-order algorithms. It is important to understand the trend in performance when transitioning from centralized to distributed zeroth-order algorithms in terms of convergence rates, especially for multi-agent systems with time-varying communication networks. In this paper, we establish a series of convergence rates for distributed zeroth-order subgradient algorithms under both one-point and two-point zeroth-order oracles. Apart from the additional node-to-node communication cost in distributed algorithms, the established convergence rates are shown to match their centralized counterparts. 
We also propose a multi-stage distributed zeroth-order algorithm that better utilizes the learning rates, reduces the computational complexity, and attains even faster convergence rates for compact decision sets.", "keywords": "Zeroth-order optimization;distributed optimization", "primary_area": "", "supplementary_material": "/attachment/1c06a6d7f8cb03146b0319b74152c596223406ef.zip", "author": "Deming Yuan;Lei Wang;Alexandre Proutiere;Guodong Shi", "authorids": "~Deming_Yuan1;~Lei_Wang26;~Alexandre_Proutiere1;~Guodong_Shi1", "gender": "M;M;M;M", "homepage": ";;https://people.kth.se/~alepro/;", "dblp": "87/8689;;p/AlexandreProutiere;https://dblp.org/pers/hd/s/Shi:Guodong.html", "google_scholar": "https://scholar.google.com.hk/citations?user=J-Ghvi8AAAAJ;https://scholar.google.fi/citations?user=b2LyJzsAAAAJ;g5sya5cAAAAJ;https://scholar.google.com.tw/citations?user=gD553TwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Deming_Yuan1;~Lei_Wang26;~Alexandre_Proutiere1;~Guodong_Shi1", "aff": ";Zhejiang University;KTH Royal Institute of Technology, Stockholm, Sweden;The University of Sydney", "aff_domain": ";zju.edu.cn;kth.se;sydney.edu.au", "position": ";Researcher;Full Professor;Assistant Professor", "bibtex": "@misc{\nyuan2022distributed,\ntitle={Distributed Zeroth-Order Optimization: Convergence Rates That Match Centralized Counterpart},\nauthor={Deming Yuan and Lei Wang and Alexandre Proutiere and Guodong Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=z2B0JJeNdvT}\n}", "github": "", "project": "", "reviewers": "5m6d;fRi4;JpXi", "site": "https://openreview.net/forum?id=z2B0JJeNdvT", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "4;4;5", "correctness": "3;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "14;37;56", "wc_summary_review": "29;13;35", "wc_main_review": "260;123;241", "wc_review": "303;173;332", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "179;147;111", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 35.666666666666664, 17.172329163188344 ], "wc_summary_review_avg": [ 25.666666666666668, 9.285592184789413 ], "wc_main_review_avg": [ 208.0, 60.602530200204235 ], "wc_review_avg": [ 269.3333333333333, 69.13915115346892 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 145.66666666666666, 27.776888874666213 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.7559289460184544, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8055969223792647797&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Zhejiang University;KTH Royal Institute of Technology;University of Sydney", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.kth.se;https://www.sydney.edu.au", "aff_unique_abbr": "ZJU;KTH;USYD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stockholm", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;Sweden;Australia" }, { "id": "z2zmSDKONK", "title": "Exploring the Robustness of Distributional Reinforcement Learning against Noisy State 
Observations", "track": "main", "status": "Reject", "tldr": "", "abstract": "In real scenarios, the state observations that an agent receives may contain measurement errors or adversarial noise, misleading the agent into taking suboptimal actions or even collapsing during training. In this paper, we study the training robustness of distributional Reinforcement Learning~(RL), a class of state-of-the-art methods that estimate the whole distribution, as opposed to only the expectation, of the total return. Firstly, we propose the State-Noisy Markov Decision Process~(SN-MDP) in the tabular case to incorporate both random and adversarial state observation noises, in which the contraction of both expectation-based and distributional Bellman operators is derived. Beyond the SN-MDP, with function approximation, we theoretically characterize the bounded gradient norm of the histogram-based distributional loss, which accounts for the better training robustness of distributional RL. We also provide stricter convergence conditions for Temporal-Difference~(TD) learning under more flexible state noises, as well as a sensitivity analysis that leverages the influence function. Finally, extensive experiments on a suite of games show that distributional RL enjoys better training robustness compared with its expectation-based counterpart across various state observation noises.", "keywords": "distributional reinforcement learning;robustness", "primary_area": "", "supplementary_material": "/attachment/b2526dfc4951b70b3a1400cf2b198fb7f4a6d361.zip", "author": "Ke Sun;Yi Liu;Yingnan Zhao;Hengshuai Yao;SHANGLING JUI;Linglong Kong", "authorids": "~Ke_Sun6;~Yi_Liu13;~Yingnan_Zhao1;~Hengshuai_Yao2;~SHANGLING_JUI1;~Linglong_Kong2", "gender": "M;M;M;M;M;M", "homepage": "https://sites.google.com/view/kesun;https://apps.ualberta.ca/directory/person/yliu16;;;https://www.ualberta.ca/~lkong;https://hengshuaiyao.github.io/", "dblp": "69/476-13;97/4626-62;;;35/8525;25/4960", "google_scholar": "lYdNhFQAAAAJ;;NMgYY5cAAAAJ;;https://scholar.google.ca/citations?hl=en;R_wcnUgAAAAJ", "orcid": ";;;0000-0002-1047-4264;0000-0003-3011-9216;", "linkedin": ";;;;;", "or_profile": "~Ke_Sun6;~Yi_Liu13;~Yingnan_Zhao1;~SHANGLING_JUI1;~Linglong_Kong2;~hengshuai_yao1", "aff": "University of Alberta;University of Alberta;;Huawei Technologies Ltd.;University of Alberta;Huawei Technologies Ltd.", "aff_domain": "ualberta.ca;ualberta.ca;;huawei.com;ualberta.ca;huawei.com", "position": "PhD student;PhD student;;Principal Researcher;Associate Professor;Principal Researcher", "bibtex": "@misc{\nsun2022exploring,\ntitle={Exploring the Robustness of Distributional Reinforcement Learning against Noisy State Observations},\nauthor={Ke Sun and Yi Liu and Yingnan Zhao and Hengshuai Yao and SHANGLING JUI and Linglong Kong},\nyear={2022},\nurl={https://openreview.net/forum?id=z2zmSDKONK}\n}", "github": "", "project": "", "reviewers": "DaN6;crGD;aTVu", "site": "https://openreview.net/forum?id=z2zmSDKONK", "pdf_size": 0, "recommendation": "3;5;6", "confidence": "5;2;4", "correctness": "2;4;3", "technical_novelty": "2;4;3", "empirical_novelty": "1;2;3", "wc_summary_paper": "34;126;100", "wc_summary_review": "44;17;57", "wc_main_review": "276;144;243", "wc_review": "354;287;400", "wc_reply_reviewers": "291;0;168", "wc_reply_authors": "790;308;408", "reply_reviewers": "1;0;1", "reply_authors": "3;2;2", "recommendation_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 3.0, 0.816496580927726 ], 
"technical_novelty_avg": [ 3.0, 0.816496580927726 ], "empirical_novelty_avg": [ 2.0, 0.816496580927726 ], "wc_summary_paper_avg": [ 86.66666666666667, 38.724095283886946 ], "wc_summary_review_avg": [ 39.333333333333336, 16.659998666133067 ], "wc_main_review_avg": [ 221.0, 56.089214649520635 ], "wc_review_avg": [ 347.0, 46.396838972786355 ], "wc_reply_reviewers_avg": [ 153.0, 119.27279656317278 ], "wc_reply_authors_avg": [ 502.0, 207.69849943287184 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.49999999999999994, "corr_recommendation_correctness": 0.6546536707079772, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4722363274369912213&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "University of Alberta;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.ualberta.ca;https://www.huawei.com", "aff_unique_abbr": "UAlberta;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "Canada;China" }, { "id": "z3Tf4kdOE5D", "title": "FedDiscrete: A Secure Federated Learning Algorithm Against Weight Poisoning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated learning (FL) is a privacy-aware collaborative learning paradigm that allows multiple parties to jointly train a machine learning model without sharing their private data. However, recent studies have shown that FL is vulnerable to weight poisoning attacks. In this paper, we propose a probabilistic discretization mechanism on the client side, which transforms the client's model weight into a vector that can only have two different values but still guarantees that the server obtains an unbiased estimation of the client's model weight. 
We theoretically analyze the utility, robustness, and convergence of our proposed discretization mechanism and empirically verify its superior robustness against various weight-based attacks under the cross-device FL setting.", "keywords": "federated learning;weight poisoning defense", "primary_area": "", "supplementary_material": "/attachment/51ff743e2c5f88a478e3d4debf260973eea5144e.zip", "author": "Yutong Dai;Xingjun Ma;Lichao Sun", "authorids": "~Yutong_Dai2;~Xingjun_Ma1;~Lichao_Sun1", "gender": "M;M;M", "homepage": "https://roth.rbind.io;http://xingjunma.com/;https://lichao-sun.github.io/", "dblp": ";195/8270;121/0780-1.html", "google_scholar": ";https://scholar.google.com.au/citations?user=XQViiyYAAAAJ;WhGUE7AAAAAJ", "orcid": "0000-0003-4212-2017;;", "linkedin": ";xingjun-ma-173532129/;lichao-sun-b273a290/", "or_profile": "~Yutong_Dai2;~Xingjun_Ma1;~Lichao_Sun1", "aff": "Lehigh University;Deakin University;Lehigh University", "aff_domain": "lehigh.edu;deakin.edu.au;lehigh.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@misc{\ndai2022feddiscrete,\ntitle={FedDiscrete: A Secure Federated Learning Algorithm Against Weight Poisoning},\nauthor={Yutong Dai and Xingjun Ma and Lichao Sun},\nyear={2022},\nurl={https://openreview.net/forum?id=z3Tf4kdOE5D}\n}", "github": "", "project": "", "reviewers": "kHbi;GZ4V;zPLT;QJJA", "site": "https://openreview.net/forum?id=z3Tf4kdOE5D", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;5;3;4", "correctness": "2;3;3;2", "technical_novelty": "1;2;3;2", "empirical_novelty": "1;2;3;2", "wc_summary_paper": "13;51;52;98", "wc_summary_review": "21;110;53;20", "wc_main_review": "116;279;421;227", "wc_review": "150;440;526;345", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 53.5, 30.120590963658067 ], "wc_summary_review_avg": [ 51.0, 36.55817282086182 ], "wc_main_review_avg": [ 260.75, 109.6639753975753 ], "wc_review_avg": [ 365.25, 139.7951626487841 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7071067811865475, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yxTzIL0p9hYJ:scholar.google.com/&scioq=FedDiscrete:+A+Secure+Federated+Learning+Algorithm+Against+Weight+Poisoning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Lehigh University;Deakin University", "aff_unique_dep": ";", "aff_unique_url": "https://www.lehigh.edu;https://www.deakin.edu.au", "aff_unique_abbr": "Lehigh;Deakin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Australia" }, { "id": "z7DAilcTx7", "title": "A Distributional Robustness Perspective on Adversarial Training with the $\\infty$-Wasserstein Distance", "track": "main", "status": "Reject", "tldr": "", "abstract": "While ML tools are becoming increasingly used in industrial applications, adversarial examples remain a critical flaw of neural networks. 
These imperceptible perturbations of natural inputs are, on average, misclassified by most state-of-the-art classifiers. By slightly modifying each data point, the attacker creates a new distribution of inputs for the classifier. In this work, we consider the distribution of adversarial examples as a tiny shift of the original distribution. We thus propose to address the problem of adversarial training (AT) within the framework of distributional robustness optimization (DRO). We show a formal connection between our formulation and optimal transport by relaxing AT into a DRO problem with an $\\infty$-Wasserstein constraint. This connection motivates using an entropic regularizer---a standard tool in optimal transport---for our problem. We then prove the existence and uniqueness of an optimal regularized distribution of adversarial examples against a class of classifiers (e.g., a given architecture) that we eventually use to robustly train a classifier. Using these theoretical insights, we propose to use Langevin Monte Carlo to sample from this optimal distribution of adversarial examples and train robust classifiers outperforming the standard baseline and providing speed-ups of $\\times 200$ for MNIST and $\\times 8$ for CIFAR-10.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chiara Regniez;Gauthier Gidel;Hugo Berard", "authorids": "~Chiara_Regniez1;~Gauthier_Gidel1;~Hugo_Berard2", "gender": "F;M;M", "homepage": ";https://gauthiergidel.github.io/;", "dblp": ";188/6326;205/3145", "google_scholar": ";https://scholar.google.fr/citations?user=bDrXQPUAAAAJ;P5d_140AAAAJ", "orcid": ";;", "linkedin": "chiara-r%C3%A9gniez-6a8375170/;;", "or_profile": "~Chiara_Regniez1;~Gauthier_Gidel1;~Hugo_Berard2", "aff": "Columbia University;Mila - Quebec Artificial Intelligence Institute;University of Montreal, University of Montreal", "aff_domain": "columbia.edu;mila.quebec;iro.umontreal.ca", "position": "PhD student;Assistant Professor;PhD student", "bibtex": "@misc{\nregniez2022a,\ntitle={A Distributional Robustness Perspective on Adversarial Training with the \\${\\textbackslash}infty\\$-Wasserstein Distance},\nauthor={Chiara Regniez and Gauthier Gidel and Hugo Berard},\nyear={2022},\nurl={https://openreview.net/forum?id=z7DAilcTx7}\n}", "github": "", "project": "", "reviewers": "mcjf;CrMc;qU23", "site": "https://openreview.net/forum?id=z7DAilcTx7", "pdf_size": 0, "recommendation": "5;5;5", "confidence": "4;4;4", "correctness": "4;3;3", "technical_novelty": "1;3;2", "empirical_novelty": "1;1;2", "wc_summary_paper": "52;81;35", "wc_summary_review": "34;127;38", "wc_main_review": "220;162;284", "wc_review": "306;370;357", "wc_reply_reviewers": "27;219;159", "wc_reply_authors": "431;360;567", "reply_reviewers": "1;2;1", "reply_authors": "1;4;2", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.4714045207910317 ], "wc_summary_paper_avg": [ 56.0, 18.991226044325487 ], "wc_summary_review_avg": [ 66.33333333333333, 42.92888175679502 ], "wc_main_review_avg": [ 222.0, 49.82636517614612 ], "wc_review_avg": [ 344.3333333333333, 27.620443314488796 ], "wc_reply_reviewers_avg": [ 135.0, 80.19975062305369 ], "wc_reply_authors_avg": [ 452.6666666666667, 85.88493594468254 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 
1.247219128924647 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FoTnQWHzKAkJ:scholar.google.com/&scioq=A+Distributional+Robustness+Perspective+on+Adversarial+Training+with+the+%24%5Cinfty%24-Wasserstein+Distance&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;1;2", "aff_unique_norm": "Columbia University;Quebec Artificial Intelligence Institute;University of Montreal", "aff_unique_dep": ";Artificial Intelligence;", "aff_unique_url": "https://www.columbia.edu;https://mila.quebec;https://www.umontreal.ca", "aff_unique_abbr": "Columbia;Mila;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Extending the WILDS Benchmark for Unsupervised Adaptation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6963", "id": "z7p2V6KROOV", "poster": "", "openreview": "https://openreview.net/forum?id=z7p2V6KROOV", "slides": "https://iclr.cc/virtual/2022/poster/6963", "video": "https://iclr.cc/virtual/2022/poster/6963", "author_site": "Shiori Sagawa, Pang Wei Koh, Tony Lee, Irena Gao, Sang Michael Xie, Kendrick Shen, Ananya Kumar, Weihua Hu, Michihiro Yasunaga, Henrik Marklund, Sara Beery, Etienne David, Ian Stavness, Wei Guo, Jure Leskovec, Kate Saenko, Tatsunori Hashimoto, Sergey Levine, Chelsea Finn, Percy Liang", "tldr": "", "abstract": "Machine learning systems deployed in the wild are often trained on a source distribution but deployed on a different target distribution. Unlabeled data can be a powerful point of leverage for mitigating these distribution shifts, as it is frequently much more available than labeled data and can often be obtained from distributions beyond the source distribution as well. However, existing distribution shift benchmarks with unlabeled data do not reflect the breadth of scenarios that arise in real-world applications. In this work, we present the WILDS 2.0 update, which extends 8 of the 10 datasets in the WILDS benchmark of distribution shifts to include curated unlabeled data that would be realistically obtainable in deployment. These datasets span a wide range of applications (from histology to wildlife conservation), tasks (classification, regression, and detection), and modalities (photos, satellite images, microscope slides, text, molecular graphs). The update maintains consistency with the original WILDS benchmark by using identical labeled training, validation, and test sets, as well as identical evaluation metrics. We systematically benchmark state-of-the-art methods that use unlabeled data, including domain-invariant, self-training, and self-supervised methods, and show that their success on WILDS is limited. To facilitate method development, we provide an open-source package that automates data loading and contains the model architectures and methods used in this paper. 
Code and leaderboards are available at https://wilds.stanford.edu.", "keywords": "distribution shifts;adaptation;unlabeled data", "primary_area": "", "supplementary_material": "", "author": "Shiori Sagawa;Pang Wei Koh;Tony Lee;Irena Gao;Sang Michael Xie;Kendrick Shen;Ananya Kumar;Weihua Hu;Michihiro Yasunaga;Henrik Marklund;Sara Beery;Etienne David;Ian Stavness;Wei Guo;Jure Leskovec;Kate Saenko;Tatsunori Hashimoto;Sergey Levine;Chelsea Finn;Percy Liang", "authorids": "~Shiori_Sagawa1;~Pang_Wei_Koh1;tonyhlee@stanford.edu;~Irena_Gao1;~Sang_Michael_Xie1;~Kendrick_Shen1;~Ananya_Kumar1;~Weihua_Hu1;~Michihiro_Yasunaga1;~Henrik_Marklund2;~Sara_Beery1;etienne.david@inrae.fr;~Ian_Stavness1;guowei@g.ecc.u-tokyo.ac.jp;~Jure_Leskovec1;~Kate_Saenko1;~Tatsunori_Hashimoto1;~Sergey_Levine1;~Chelsea_Finn1;~Percy_Liang1", "gender": "Unspecified;M;;;;M;M;M;;;F;;M;;;F;M;M;F;", "homepage": "https://cs.stanford.edu/~ssagawa/;http://cs.stanford.edu/~pangwei;;https://i-gao.github.io;https://cs.stanford.edu/~eix/;https://kendrickshen.com;https://ananyakumar.wordpress.com/;http://web.stanford.edu/~weihuahu/;;;https://beerys.github.io/;;https://www.cs.usask.ca/faculty/stavness/;;http://cs.stanford.edu/~jure/;http://ai.bu.edu;https://thashim.github.io;https://people.eecs.berkeley.edu/~svlevine/;https://ai.stanford.edu/~cbfinn/;https://cs.stanford.edu/~pliang/", "dblp": "248/7578;10/10453;;193/1492;220/3987;;192/0474;42/1232;202/1809;234/7535.html;191/2643;;66/5086;;l/JureLeskovec;88/2754;;80/7594;131/1783;04/1701", "google_scholar": "9EnJFEEAAAAJ;Nn990CkAAAAJ;;;EBNa5IEAAAAJ;https://scholar.google.com/citations?hl=en;tP5IBFkAAAAJ;wAFMjfkAAAAJ;SieJYoEAAAAJ;;https://scholar.google.com/citations?hl=en;;https://scholar.google.ca/citations?user=4ONh3jcAAAAJ;;Q_kKkIUAAAAJ;https://scholar.google.com.tw/citations?user=9xDADY4AAAAJ;5ygiTwsAAAAJ;8R35rCwAAAAJ;vfPE6hgAAAAJ;pouyVyUAAAAJ", "orcid": ";;;;;;;;;;;;0000-0002-2044-2565;;0000-0002-5411-923X;0000-0002-5704-7614;;;;", "linkedin": ";;;;;;;weihua-hu-a8284228/;;;;;;;leskovec/;;;;;", "or_profile": "~Shiori_Sagawa1;~Pang_Wei_Koh1;tonyhlee@stanford.edu;~Irena_Gao1;~Sang_Michael_Xie1;~Kendrick_Shen1;~Ananya_Kumar1;~Weihua_Hu1;~Michihiro_Yasunaga1;~Henrik_Marklund2;~Sara_Beery1;etienne.david@inrae.fr;~Ian_Stavness1;guowei@g.ecc.u-tokyo.ac.jp;~Jure_Leskovec1;~Kate_Saenko1;~Tatsunori_Hashimoto1;~Sergey_Levine1;~Chelsea_Finn1;~Percy_Liang1", "aff": "Stanford University;Stanford University;;Stanford University;Stanford University;Stanford University;Microsoft;Stanford University;Stanford University;Harvard University;California Institute of Technology;;University of Saskatchewan;;Kumo.AI;Boston University, Boston University;Stanford University;Google;Google;Stanford University", "aff_domain": "stanford.edu;stanford.edu;;stanford.edu;stanford.edu;stanford.edu;microsoft.com;stanford.edu;stanford.edu;harvard.edu;caltech.edu;;usask.ca;;kumo.ai;bu.edu;stanford.edu;google.com;google.com;stanford.edu", "position": "PhD student;PhD student;;Undergrad student;PhD student;Undergrad student;Intern;PhD student;PhD student;Researcher;PhD student;;Associate Professor;;Chief Scientist;Full Professor;Assistant Professor;Research Scientist;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nsagawa2022extending,\ntitle={Extending the {WILDS} Benchmark for Unsupervised Adaptation},\nauthor={Shiori Sagawa and Pang Wei Koh and Tony Lee and Irena Gao and Sang Michael Xie and Kendrick Shen and Ananya Kumar and Weihua Hu and Michihiro Yasunaga and Henrik Marklund and Sara Beery and Etienne 
David and Ian Stavness and Wei Guo and Jure Leskovec and Kate Saenko and Tatsunori Hashimoto and Sergey Levine and Chelsea Finn and Percy Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=z7p2V6KROOV}\n}", "github": "", "project": "", "reviewers": "3NYa;NvLs;RpU9;Dxvt", "pdf_size": 0, "recommendation": "6;8;8;8", "confidence": "4;3;4;4", "correctness": "4;4;3;4", "technical_novelty": "2;2;3;1", "empirical_novelty": "3;3;3;4", "wc_summary_paper": "117;64;64;81", "wc_summary_review": "42;56;129;38", "wc_main_review": "383;363;402;208", "wc_review": "542;483;595;327", "wc_reply_reviewers": "51;45;146;15", "wc_reply_authors": "515;818;1936;1187", "reply_reviewers": "1;1;2;1", "reply_authors": "2;2;4;3", "recommendation_avg": [ 7.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 81.5, 21.639085008382402 ], "wc_summary_review_avg": [ 66.25, 36.84002578717882 ], "wc_main_review_avg": [ 339.0, 76.8797762743883 ], "wc_review_avg": [ 486.75, 100.38021468397046 ], "wc_reply_reviewers_avg": [ 64.25, 49.12929370548695 ], "wc_reply_authors_avg": [ 1114.0, 530.902533427747 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 0.82915619758885 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 20, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 143, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11714444526170009099&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "pdf": "https://openreview.net/pdf?id=z7p2V6KROOV", "email": "stanford.edu;stanford.edu;;stanford.edu;stanford.edu;stanford.edu;microsoft.com;stanford.edu;stanford.edu;harvard.edu;caltech.edu;;usask.ca;;kumo.ai;bu.edu;stanford.edu;google.com;google.com;stanford.edu", "author_num": 20, "aff_unique_index": "0;0;0;0;0;1;0;0;2;3;4;5;6;0;7;7;0", "aff_unique_norm": "Stanford University;Microsoft;Harvard University;California Institute of Technology;University of Saskatchewan;Kumo.AI;Boston University;Google", "aff_unique_dep": ";Microsoft Corporation;;;;;;Google", "aff_unique_url": "https://www.stanford.edu;https://www.microsoft.com;https://www.harvard.edu;https://www.caltech.edu;https://www.usask.ca;https://www.kumo.ai;https://www.bu.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Microsoft;Harvard;Caltech;U of S;Kumo.AI;BU;Google", "aff_campus_unique_index": "0;0;0;0;0;0;0;2;3;0;4;4;0", "aff_campus_unique": "Stanford;;Pasadena;Boston;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;1;0;0;0;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "z8Bz7m6T-xJ", "title": "Overcoming Label Ambiguity with Multi-label Iterated Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transfer learning from ImageNet pre-trained models has become essential for many computer vision tasks. Recent studies have shown that ImageNet includes label ambiguity, where images with multiple object classes present are assigned a single label. This ambiguity biases models towards a single prediction, which could result in the suppression of classes that tend to co-occur in the data. Recent approaches have explored either fixing the evaluation datasets or using costly procedures to relabel the training data. 
In this work, we propose multi-label iterated learning (MILe) to incorporate the inductive biases of multi-label learning from single labels using the framework of iterated learning. MILe is a simple, yet effective procedure that alternates training a teacher and a student network with binary predictions to build a multi-label description of the images. Experiments on ImageNet show that MILe achieves higher accuracy and ReaL score than when using the standard training procedure, even when fine-tuning from self-supervised weights. We also show that MILe is effective for real-world large-scale noisy data such as WebVision. Furthermore, MILe improves performance in class incremental settings such as IIRC and is robust to distribution shifts.", "keywords": "supervised learning;multi-label learning;label ambiguity;label noise", "primary_area": "", "supplementary_material": "/attachment/c2a6a51878858892b9a95547ef9978f2a325c156.zip", "author": "sai rajeswar mudumba;Pau Rodriguez;Soumye Singhal;David Vazquez;Aaron Courville", "authorids": "~sai_rajeswar_mudumba1;~Pau_Rodriguez2;~Soumye_Singhal1;~David_Vazquez1;~Aaron_Courville3", "gender": "M;M;M;;", "homepage": "https://sairajeswar.com/;;http://www.david-vazquez.com;;https://prlz77.github.io", "dblp": "159/2116;245/4872;94/8653;56/1688;190/7735", "google_scholar": "https://scholar.google.ca/citations?user=h-sqIigAAAAJ;https://scholar.google.com/citations?hl=en;1jHvtfsAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;https://scholar.google.es/citations?user=IwBx73wAAAAJ", "orcid": ";;0000-0002-2845-8158;;0000-0002-1689-8084", "linkedin": "sairajeswar/;;https://www.linkedin.com/company/david-vazquez/;;", "or_profile": "~sai_rajeswar_mudumba1;~Soumye_Singhal1;~David_Vazquez1;~Aaron_Courville3;~Pau_Rodriguez_Lopez1", "aff": "University of Montreal;Mila, University of Montreal;ServiceNow research;Universit\u00e9 de Montr\u00e9al;Element AI", "aff_domain": "umontreal.ca;umontreal.ca;servicenow.com; ;elementai.com", "position": "PhD student;MS student;Researcher;Assistant Professor;Researcher", "bibtex": "@misc{\nmudumba2022overcoming,\ntitle={Overcoming Label Ambiguity with Multi-label Iterated Learning},\nauthor={sai rajeswar mudumba and Pau Rodriguez and Soumye Singhal and David Vazquez and Aaron Courville},\nyear={2022},\nurl={https://openreview.net/forum?id=z8Bz7m6T-xJ}\n}", "github": "", "project": "", "reviewers": "pEZt;ydzU;t1r6;ovTw", "site": "https://openreview.net/forum?id=z8Bz7m6T-xJ", "pdf_size": 0, "recommendation": "5;5;5;5", "confidence": "4;4;4;5", "correctness": "3;3;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "101;119;93;72", "wc_summary_review": "37;39;48;14", "wc_main_review": "168;372;163;138", "wc_review": "306;530;304;224", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 96.25, 16.872685026396955 ], "wc_summary_review_avg": [ 34.5, 12.539936203984453 ], "wc_main_review_avg": [ 210.25, 94.07543515711208 ], "wc_review_avg": [ 341.0, 114.02192771568107 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, 
"corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qufoCBiENcQJ:scholar.google.com/&scioq=Overcoming+Label+Ambiguity+with+Multi-label+Iterated+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "University of Montreal;ServiceNow;Universit\u00e9 de Montr\u00e9al;Element AI", "aff_unique_dep": ";research;;", "aff_unique_url": "https://wwwumontreal.ca;https://www.servicenow.com;https://www.umontreal.ca;https://www.elementai.com", "aff_unique_abbr": "UM;ServiceNow;UdeM;Element AI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Canada;United States" }, { "id": "z8j0bPU4DIw", "title": "Evolution Strategies as an Alternate Learning method for Hierarchical Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper investigates the performance of Scalable Evolution Strategies (S-ES) as a Hierarchical Reinforcement Learning (HRL) approach. S-ES, named for its excellent scalability across many processors, was popularised by OpenAI when they showed its performance to be comparable to the state-of-the-art policy gradient methods. However, to date, S-ES has not been tested in conjunction with HRL methods, which empower temporal abstraction thus allowing agents to tackle more challenging problems. In this work, we introduce a novel method that merges S-ES and HRL, which allows S-ES to be applied to difficult problems such as simultaneous robot locomotion and navigation. We show that S-ES needed no (methodological or hyperparameter) modifications for it to be used in a hierarchical context and that its indifference to delayed rewards leads to it having competitive performance with state-of-the-art gradient-based HRL methods. 
This leads to a novel HRL method that achieves state-of-the-art performance, and is also comparatively simple and highly scalable.\n", "keywords": "Hierarchical reinforcement learning;evolution strategies;reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/cee2f85a22d980f6f2fe69a8dc847ec2c99e5ef0.zip", "author": "Sasha Abramowitz", "authorids": "~Sasha_Abramowitz1", "gender": "M", "homepage": "", "dblp": "", "google_scholar": "https://scholar.google.com/citations?hl=en", "orcid": "", "linkedin": "sasha-abramowitz-693761148/", "or_profile": "~Sasha_Abramowitz1", "aff": "University of Cape Town", "aff_domain": "myuct.ac.za", "position": "MS student", "bibtex": "@misc{\nabramowitz2022evolution,\ntitle={Evolution Strategies as an Alternate Learning method for Hierarchical Reinforcement Learning},\nauthor={Sasha Abramowitz},\nyear={2022},\nurl={https://openreview.net/forum?id=z8j0bPU4DIw}\n}", "github": "", "project": "", "reviewers": "up7h;BgoU;xRrU;zZsS", "site": "https://openreview.net/forum?id=z8j0bPU4DIw", "pdf_size": 0, "recommendation": "3;3;5;5", "confidence": "4;4;4;4", "correctness": "3;2;3;3", "technical_novelty": "1;2;3;2", "empirical_novelty": "2;2;2;0", "wc_summary_paper": "67;85;47;64", "wc_summary_review": "70;31;16;38", "wc_main_review": "330;308;253;208", "wc_review": "467;424;316;310", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 65.75, 13.47915056670857 ], "wc_summary_review_avg": [ 38.75, 19.715159142142372 ], "wc_main_review_avg": [ 274.75, 47.662222986344226 ], "wc_review_avg": [ 379.25, 68.00505495917197 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sMaFKUwbZ3oJ:scholar.google.com/&scioq=Evolution+Strategies+as+an+Alternate+Learning+method+for+Hierarchical+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Cape Town", "aff_unique_dep": "", "aff_unique_url": "https://www.uct.ac.za", "aff_unique_abbr": "UCT", "aff_country_unique_index": "0", "aff_country_unique": "South Africa" }, { "id": "z8xVlqWwRrK", "title": "EVaDE : Event-Based Variational Thompson Sampling for Model-Based Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Posterior Sampling for Reinforcement Learning (PSRL) is a well-known algorithm that augments model-based reinforcement learning (MBRL) algorithms with Thompson sampling. PSRL maintains posterior distributions of the environment transition dynamics and the reward function to procure posterior samples that are used to generate data for training the controller. Maintaining posterior distributions over all possible transition and reward functions for tasks with high dimensional state and action spaces is intractable. Recent works show that dropout used in conjunction with neural networks induces variational distributions that can approximate these posteriors.
In this paper, we propose Event-based Variational Distributions for Exploration (EVaDE), variational distributions that are useful for MBRL, especially when the underlying domain is object-based. We leverage the general domain knowledge of object-based domains to design three types of event-based convolutional layers to direct exploration, namely the noisy event interaction layer, the noisy event weighting layer, and the noisy event translation layer. These layers rely on Gaussian dropouts and are inserted between the layers of the deep neural network model to help facilitate variational Thompson sampling. We empirically show the effectiveness of EVaDE-equipped Simulated Policy Learning (SimPLe) on a randomly selected suite of Atari games, where the number of agent-environment interactions is limited to 100K.", "keywords": "Model-based Reinforcement Learning;Thompson sampling;Exploration", "primary_area": "", "supplementary_material": "/attachment/1f0a1d35bb65db9f9ed15cefa003bda1aa54a139.zip", "author": "Siddharth Aravindan;Dixant Mittal;Wee Sun Lee", "authorids": "~Siddharth_Aravindan1;~Dixant_Mittal1;~Wee_Sun_Lee1", "gender": ";M;M", "homepage": "https://www.comp.nus.edu.sg/~a-sid/;https://www.dixantmittal.com;http://www.comp.nus.edu.sg/~leews/", "dblp": ";258/5802;86/1498", "google_scholar": "A7AojWAAAAAJ;1AoSTvUAAAAJ;https://scholar.google.com.sg/citations?user=8PCrLgwAAAAJ", "orcid": ";0000-0001-5738-3979;", "linkedin": "siddharth-aravindan/;dixantmittal/;", "or_profile": "~Siddharth_Aravindan1;~Dixant_Mittal1;~Wee_Sun_Lee1", "aff": "National University of Singapore;Sea AI Lab;National University of Singapore", "aff_domain": "nus.edu.sg;sea.com;nus.edu.sg", "position": "PhD student;Intern;Full Professor", "bibtex": "@misc{\naravindan2022evade,\ntitle={{EV}a{DE} : Event-Based Variational Thompson Sampling for Model-Based Reinforcement Learning},\nauthor={Siddharth Aravindan and Dixant Mittal and Wee Sun Lee},\nyear={2022},\nurl={https://openreview.net/forum?id=z8xVlqWwRrK}\n}", "github": "", "project": "", "reviewers": "gXzj;trzP;XiQT;9oaA", "site": "https://openreview.net/forum?id=z8xVlqWwRrK", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "3;3;3;3", "correctness": "4;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "20;79;142;26", "wc_summary_review": "30;29;47;55", "wc_main_review": "380;119;443;382", "wc_review": "430;227;632;463", "wc_reply_reviewers": "0;62;12;0", "wc_reply_authors": "636;701;853;302", "reply_reviewers": "0;1;1;0", "reply_authors": "1;2;2;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.0 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 66.75, 49.139469879110415 ], "wc_summary_review_avg": [ 40.25, 11.121488209767612 ], "wc_main_review_avg": [ 331.0, 124.989999599968 ], "wc_review_avg": [ 438.0, 143.91490541288627 ], "wc_reply_reviewers_avg": [ 18.5, 25.588083163847973 ], "wc_reply_authors_avg": [ 623.0, 201.3665811399697 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 0, "gs_cited_by_link":
"https://scholar.google.com/scholar?q=related:cqkEVq4inJYJ:scholar.google.com/&scioq=EVaDE+:+Event-Based+Variational+Thompson+Sampling+for+Model-Based+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;Sea AI Lab", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;", "aff_unique_abbr": "NUS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore;" }, { "id": "zAyZFRptzvh", "title": "Auditing AI models for Verified Deployment under Semantic Specifications", "track": "main", "status": "Reject", "tldr": "", "abstract": "Auditing trained deep learning (DL) models prior to deployment is vital for preventing unintended consequences. One of the biggest challenges in auditing is in understanding how we can obtain human-interpretable specifications that are directly useful to the end-user. We address this challenge through a sequence of semantically-aligned unit tests, where each unit test verifies whether a predefined specification (e.g., accuracy over 95\\%) is satisfied with respect to controlled and semantically aligned variations in the input space (e.g., in face recognition, the angle relative to the camera). We perform these unit tests by directly verifying the semantically aligned variations in an interpretable latent space of a generative model by building a bridge with the DL model. Our framework, AuditAI, bridges the gap between interpretable formal verification and scalability. With evaluations on four different datasets, covering images of chest X-rays, human faces, ImageNet classes, and towers, we show how AuditAI allows us to obtain controlled variations for verification and certified training. 
We address the limitations of the standard approach of verifying using only pixel-space perturbations.", "keywords": "auditing deep learning;verification;interpretability", "primary_area": "", "supplementary_material": "", "author": "Homanga Bharadhwaj;De-An Huang;Chaowei Xiao;Anima Anandkumar;Animesh Garg", "authorids": "~Homanga_Bharadhwaj1;~De-An_Huang1;~Chaowei_Xiao2;~Anima_Anandkumar1;~Animesh_Garg1", "gender": "M;M;M;F;M", "homepage": "https://homangab.github.io/;http://ai.stanford.edu/~dahuang/;http://animesh.garg.tech;http://tensorlab.cms.caltech.edu/users/anima/;https://xiaocw11.github.io/", "dblp": "223/5842;119/0335;123/5728;;150/3317", "google_scholar": "https://scholar.google.ca/citations?user=wwW4HRQAAAAJ;HEY3UzgAAAAJ;zp8V7ZMAAAAJ;bEcLezcAAAAJ;Juoqtj8AAAAJ", "orcid": ";;0000-0003-0482-4296;;0000-0002-7043-4926", "linkedin": ";;animeshgarg/;anima-anandkumar-35171b1/;", "or_profile": "~Homanga_Bharadhwaj1;~De-An_Huang1;~Animesh_Garg1;~anima_anandkumar1;~chaowei_xiao1", "aff": "Meta Facebook;NVIDIA;University of Toronto;California Institute of Technology;Arizona State University", "aff_domain": "facebook.com;nvidia.com;toronto.edu;caltech.edu;asu.edu", "position": "Visiting Researcher;Research Scientist;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@misc{\nbharadhwaj2022auditing,\ntitle={Auditing {AI} models for Verified Deployment under Semantic Specifications},\nauthor={Homanga Bharadhwaj and De-An Huang and Chaowei Xiao and Anima Anandkumar and Animesh Garg},\nyear={2022},\nurl={https://openreview.net/forum?id=zAyZFRptzvh}\n}", "github": "", "project": "", "reviewers": "Qgvd;1Tmi;rsGL;PLLS", "site": "https://openreview.net/forum?id=zAyZFRptzvh", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "2;5;4;4", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;4", "empirical_novelty": "2;4;3;3", "wc_summary_paper": "34;63;81;27", "wc_summary_review": "32;127;34;21", "wc_main_review": "156;566;395;210", "wc_review": "222;756;510;258", "wc_reply_reviewers": "0;0;0;125", "wc_reply_authors": "648;439;495;428", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 3.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 51.25, 21.84462176372024 ], "wc_summary_review_avg": [ 53.5, 42.72294465506796 ], "wc_main_review_avg": [ 331.75, 161.69782775287985 ], "wc_review_avg": [ 436.5, 215.26437234247567 ], "wc_reply_reviewers_avg": [ 31.25, 54.12658773652741 ], "wc_reply_authors_avg": [ 502.5, 87.7624635023425 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.4736842105263159, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11233761064196738000&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Meta;NVIDIA;University of Toronto;California Institute of Technology;Arizona State University", "aff_unique_dep": "Meta Platforms, Inc.;NVIDIA Corporation;;;", "aff_unique_url": "https://meta.com;https://www.nvidia.com;https://www.utoronto.ca;https://www.caltech.edu;https://www.asu.edu", "aff_unique_abbr": "Meta;NVIDIA;U of T;Caltech;ASU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", 
"aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Canada" }, { "title": "Learning Generalizable Representations for Reinforcement Learning via Adaptive Meta-learner of Behavioral Similarities", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6007", "id": "zBOI9LFpESK", "poster": "", "openreview": "https://openreview.net/forum?id=zBOI9LFpESK", "slides": "https://iclr.cc/virtual/2022/poster/6007", "video": "https://iclr.cc/virtual/2022/poster/6007", "author_site": "Jianda Chen, Sinno Pan", "tldr": "", "abstract": "How to learn an effective reinforcement learning-based model for control tasks from high-level visual observations is a practical and challenging problem. A key to solving this problem is to learn low-dimensional state representations from observations, from which an effective policy can be learned. In order to boost the learning of state encoding, recent works are focused on capturing behavioral similarities between state representations or applying data augmentation on visual observations. In this paper, we propose a novel meta-learner-based framework for representation learning regarding behavioral similarities for reinforcement learning. Specifically, our framework encodes the high-dimensional observations into two decomposed embeddings regarding reward and dynamics in a Markov Decision Process (MDP). A pair of meta-learners are developed, one of which quantifies the reward similarity and the other quantifies dynamics similarity over the correspondingly decomposed embeddings. The meta-learners are self-learned to update the state embeddings by approximating two disjoint terms in on-policy bisimulation metric. To incorporate the reward and dynamics terms, we further develop a strategy to adaptively balance their impacts based on different tasks or environments. 
We empirically demonstrate that our proposed framework outperforms state-of-the-art baselines on several benchmarks, including the conventional DM Control Suite, the Distracting DM Control Suite, and the self-driving task CARLA.", "keywords": "deep reinforcement learning;deep learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Jianda Chen;Sinno Pan", "authorids": "~Jianda_Chen1;~Sinno_Pan1", "gender": ";M", "homepage": ";http://www.cse.cuhk.edu.hk/~sinnopan/", "dblp": "176/6660;80/5412", "google_scholar": "jEOSgcUAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Jianda_Chen1;~Sinno_Pan1", "aff": "Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nchen2022learning,\ntitle={Learning Generalizable Representations for Reinforcement Learning via Adaptive Meta-learner of Behavioral Similarities},\nauthor={Jianda Chen and Sinno Pan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zBOI9LFpESK}\n}", "github": "", "project": "", "reviewers": "Np3q;bfCF;rJ9b;vhfp", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "3;3;5;4", "correctness": "2;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "154;73;55;112", "wc_summary_review": "39;39;4;71", "wc_main_review": "421;230;1282;451", "wc_review": "614;342;1341;634", "wc_reply_reviewers": "281;50;467;249", "wc_reply_authors": "1173;1118;2185;980", "reply_reviewers": "1;1;5;3", "reply_authors": "3;3;7;3", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 98.5, 38.095275297600885 ], "wc_summary_review_avg": [ 38.25, 23.699947257325277 ], "wc_main_review_avg": [ 596.0, 405.0314802580165 ], "wc_review_avg": [ 732.75, 369.6304742577376 ], "wc_reply_reviewers_avg": [ 261.75, 147.90093813089896 ], "wc_reply_authors_avg": [ 1364.0, 479.19046317722143 ], "reply_reviewers_avg": [ 2.5, 1.6583123951777 ], "reply_authors_avg": [ 4.0, 1.7320508075688772 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.5222329678670935, "corr_recommendation_correctness": 1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3185167988129804114&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=zBOI9LFpESK", "email": "ntu.edu.sg;ntu.edu.sg", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "id": "zBVjxKB6g84", "title": "Understanding Clipping for Federated Learning: Convergence and Client-Level Differential Privacy", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Providing privacy protection has been one of the primary motivations of Federated Learning (FL). Recently, there has been a line of work on incorporating the formal privacy notion of differential privacy into FL.
To guarantee client-level differential privacy in FL algorithms, the clients' transmitted model updates have to be clipped before adding privacy noise. Such a clipping operation is substantially different from its gradient-clipping counterpart in centralized differentially private SGD and has not been well understood. In this paper, we first empirically demonstrate that the clipped FedAvg can perform surprisingly well even with substantial data heterogeneity when training neural networks, which is partly because the clients' updates become {\\it similar} for several popular deep architectures. Based on this key observation, we provide the convergence analysis of a differentially private (DP) FedAvg algorithm and highlight the relationship between clipping bias and the distribution of the clients' updates.\n\nTo the best of our knowledge, this is the first work that rigorously investigates theoretical and empirical issues regarding the clipping operation in FL algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinwei Zhang;Xiangyi Chen;Mingyi Hong;Steven Wu;Jinfeng Yi", "authorids": "~Xinwei_Zhang1;~Xiangyi_Chen1;~Mingyi_Hong1;~Steven_Wu1;~Jinfeng_Yi1", "gender": "M;M;M;M;M", "homepage": "https://564612540.github.io/;;http://people.ece.umn.edu/~mhong/mingyi.html;http://jinfengyi.net/;https://zstevenwu.com/", "dblp": "55/9870-1.html;02/445;57/8053;117/4898;137/8350", "google_scholar": "uq46meMAAAAJ;M0ki5ZgAAAAJ;qRnP-p0AAAAJ;lZxRZ84AAAAJ;MbF6rTEAAAAJ", "orcid": "0000-0001-7967-7150;;;;", "linkedin": ";;;https://www.linkedin.com/nhome/?trk=;zstevenwu/", "or_profile": "~Xinwei_Zhang1;~Xiangyi_Chen1;~Mingyi_Hong1;~Jinfeng_Yi1;~Zhiwei_Steven_Wu1", "aff": "University of Minnesota;University of Minnesota, Minneapolis;University of Minnesota, Minneapolis;JD AI Research;Carnegie Mellon University", "aff_domain": "umn.edu;umn.edu;umn.edu;jd.com;cmu.edu", "position": "MS student;PhD student;Associate Professor;Senior Director;Assistant Professor", "bibtex": "@misc{\nzhang2022understanding,\ntitle={Understanding Clipping for Federated Learning: Convergence and Client-Level Differential Privacy},\nauthor={Xinwei Zhang and Xiangyi Chen and Mingyi Hong and Steven Wu and Jinfeng Yi},\nyear={2022},\nurl={https://openreview.net/forum?id=zBVjxKB6g84}\n}", "github": "", "project": "", "reviewers": "f5vs;YjaF;RbhK;Gv7C", "site": "https://openreview.net/forum?id=zBVjxKB6g84", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "4;4;4;3", "correctness": "4;3;2;4", "technical_novelty": "2;2;2;2", "empirical_novelty": "3;1;2;4", "wc_summary_paper": "50;119;228;77", "wc_summary_review": "10;115;58;39", "wc_main_review": "340;433;812;163", "wc_review": "400;667;1098;279", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 118.5, 67.83251432757008 ], "wc_summary_review_avg": [ 55.5, 38.36991008589934 ], "wc_main_review_avg": [ 437.0, 237.23722304899795 ], "wc_review_avg": [ 611.0, 314.265015552161 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -1.0,
"corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 132, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4917010174884492209&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Minnesota;JD;Carnegie Mellon University", "aff_unique_dep": ";JD AI Research;", "aff_unique_url": "https://www.minnesota.edu;https://www.jd.com;https://www.cmu.edu", "aff_unique_abbr": "UMN;JD AI;CMU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;China" }, { "id": "zBhwgP7kt4", "title": "Dynamic Least-Squares Regression", "track": "main", "status": "Reject", "tldr": "", "abstract": "In large-scale supervised learning, after a model is trained with an initial dataset, a common challenge is how to exploit new incremental data without re-training the model from scratch. Motivated by this problem, we revisit the canonical problem of dynamic least-squares regression (LSR), where the goal is to learn a linear model over incremental training data. In this setup, data and labels $(\\mathbf{A}^{(t)}, \\mathbf{b}^{(t)}) \\in \\mathbb{R}^{t \\times d}\\times \\mathbb{R}^t$ evolve in an online fashion ($t\\gg d$), and the goal is to efficiently maintain an (approximate) solution of $\\min_{\\mathbf{x}^{(t)}} \\| \\mathbf{A}^{(t)} \\mathbf{x}^{(t)} - \\mathbf{b}^{(t)} \\|_2$ for all $t\\in [T]$. Our main result is a dynamic data structure which maintains an arbitrarily small constant approximate solution to dynamic LSR with amortized update time $O(d^{1+o(1)})$, almost matching the running time of the static (sketching-based) solution. By contrast, for exact (or $1/\\mathrm{poly}(n)$-accuracy) solutions, we show a separation between the models, namely, that dynamic LSR requires $\\Omega(d^{2-o(1)})$ amortized update time under the OMv Conjecture (Henzinger et al., STOC'15). 
Our data structure is fast, conceptually simple, and easy to implement, and our experiments demonstrate its practicality on both synthetic and real-world datasets.", "keywords": "Least squares regression;dynamic algorithm", "primary_area": "", "supplementary_material": "/attachment/b9762f760ce02ea1dedf056b573219edb7c7efa2.zip", "author": "Binghui Peng;Shunhua Jiang;OMRI WEINSTEIN", "authorids": "~Binghui_Peng1;~Shunhua_Jiang1;~OMRI_WEINSTEIN1", "gender": "M;;M", "homepage": "http://www.cs.columbia.edu/~binghuip/;https://www.cs.columbia.edu/~jiangsh/;https://omriweinstein.huji.ac.il/", "dblp": "210/2619;198/0655;85/9060.html", "google_scholar": "twlFI3sAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Binghui_Peng1;~Shunhua_Jiang1;~OMRI_WEINSTEIN1", "aff": "Columbia University;Columbia University;The Hebrew University", "aff_domain": "columbia.edu;columbia.edu;ee.columbia.edu", "position": "PhD student;PhD student;Associate Professor ", "bibtex": "@misc{\npeng2022dynamic,\ntitle={Dynamic Least-Squares Regression},\nauthor={Binghui Peng and Shunhua Jiang and OMRI WEINSTEIN},\nyear={2022},\nurl={https://openreview.net/forum?id=zBhwgP7kt4}\n}", "github": "", "project": "", "reviewers": "Q1Nm;X5Ds;yBkT;aakT", "site": "https://openreview.net/forum?id=zBhwgP7kt4", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "4;4;3;4", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;2;3;1", "wc_summary_paper": "194;106;232;62", "wc_summary_review": "30;114;51;197", "wc_main_review": "176;1115;197;302", "wc_review": "400;1335;480;561", "wc_reply_reviewers": "21;419;0;10", "wc_reply_authors": "88;811;257;441", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.5, 1.118033988749895 ], "wc_summary_paper_avg": [ 148.5, 67.69601169936084 ], "wc_summary_review_avg": [ 98.0, 64.98076638513892 ], "wc_main_review_avg": [ 447.5, 388.3262159576662 ], "wc_review_avg": [ 694.0, 374.4335722127491 ], "wc_reply_reviewers_avg": [ 112.5, 177.11366406915081 ], "wc_reply_authors_avg": [ 399.25, 268.5110565693711 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:x25jAxIUoeoJ:scholar.google.com/&scioq=Dynamic+Least-Squares+Regression&hl=en&as_sdt=0,14", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Columbia University;Hebrew University of Jerusalem", "aff_unique_dep": ";", "aff_unique_url": "https://www.columbia.edu;https://www.huji.ac.il", "aff_unique_abbr": "Columbia;HUJI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Israel" }, { "id": "zFlFjoyOW-z", "title": "Interest-based Item Representation Framework for Recommendation with Multi-Interests Capsule Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "Item representation plays an important role in recommendation applications such as e-commerce, news, and video. It has been used by retrieval and ranking models to capture user-item relationships based on user behaviors.
In recommendation systems, user interaction behaviors imply single or multiple user interests, not only the items that appear in the sequences. Existing representation learning methods mainly focus on optimizing item-based mechanisms between user interaction sequences and candidate items (especially attention mechanisms and sequential modeling). However, item representations learned by these methods lack a modeling mechanism that reflects user interests. That is, these methods may be less effective and capture user interests only indirectly. We propose a framework that learns interest-based item representations directly by introducing a user Multi-Interests Capsule Network (MICN). To make the framework model-agnostic, the user Multi-Interests Capsule Network is designed as an auxiliary task to jointly learn item-based item representations and interest-based item representations. Hence, the generic framework can easily be used to improve existing recommendation models without model redesign. The proposed approach is evaluated on multiple types of benchmarks. Furthermore, we investigate several settings: various deep neural networks, different lengths of behavior sequences, and different joint-learning ratios of interest-based item representations. Experiments show a great enhancement in the performance of various recommendation models and validate our approach. We expect the framework to be widely used in recommendation systems.", "keywords": "Feature Representation;Recommendation System;Dynamic Routing of Capsule", "primary_area": "", "supplementary_material": "", "author": "Yanpeng Xie;Tong Zhang;Heng Zhang;Zhendong Qu", "authorids": "~Yanpeng_Xie2;xiaocao.zt@alibaba-inc.com;heng.zhangh1@alibaba-inc.com;zhendong.quzd@alibaba-inc.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yanpeng_Xie2;xiaocao.zt@alibaba-inc.com;heng.zhangh1@alibaba-inc.com;zhendong.quzd@alibaba-inc.com", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxie2022interestbased,\ntitle={Interest-based Item Representation Framework for Recommendation with Multi-Interests Capsule Network},\nauthor={Yanpeng Xie and Tong Zhang and Heng Zhang and Zhendong Qu},\nyear={2022},\nurl={https://openreview.net/forum?id=zFlFjoyOW-z}\n}", "github": "", "project": "", "reviewers": "KquS;LCWN;wjxV;P2Vg", "site": "https://openreview.net/forum?id=zFlFjoyOW-z", "pdf_size": 0, "recommendation": "1;3;3;3", "confidence": "4;5;4;3", "correctness": "2;1;3;3", "technical_novelty": "2;2;1;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "61;66;69;25", "wc_summary_review": "41;21;60;2", "wc_main_review": "360;150;249;93", "wc_review": "462;237;378;120", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 1.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 55.25, 17.69710428290459 ], "wc_summary_review_avg": [ 31.0, 21.691011963483863 ], "wc_main_review_avg": [ 213.0, 101.58001772002208 ], "wc_review_avg": [ 299.25, 131.04841662530686 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0,
"corr_recommendation_correctness": 0.17407765595569782, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cYxYEJrjODYJ:scholar.google.com/&scioq=Interest-based+Item+Representation+Framework+for+Recommendation+with+Multi-Interests+Capsule+Network&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "zFyCvjXof60", "title": "Hypergraph Convolutional Networks via Equivalency between Hypergraphs and Undirected Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "As a powerful tool for modeling the complex relationships, hypergraphs are gaining popularity from the graph learning community. However, commonly used algorithms in deep hypergraph learning were not specifically designed for hypergraphs with edge-dependent vertex weights (EDVWs). To fill this gap, we build the equivalency condition between EDVW-hypergraphs and undirected simple graphs, which enables utilizing existing undirected graph neural networks as subroutines to learn high-order interactions induced by EDVWs of hypergraphs. Specifically, we define a generalized hypergraph with vertex weights by proposing a unified random walk framework, under which we present the equivalency condition between generalized hypergraphs and undigraphs. Guided by the equivalency results, we propose a Generalized Hypergraph Convolutional Network (GHCN) architecture for deep hypergraph learning. Furthermore, to improve the long-range interactions and alleviate the over-smoothing issue, we further propose the Simple Hypergraph Spectral Convolution (SHSC) model by constructing the Discounted Markov Diffusion Kernel from our random walk framework. Extensive experiments from various domains including social network analysis, visual objective classification, and protein fold classification demonstrate that the proposed approaches outperform state-of-the-art spectral methods with a large margin.", "keywords": "hypergraph learning;equivalency of hypergraph;graph neural networks", "primary_area": "", "supplementary_material": "/attachment/2134deae47ff6538886328a469b414e9d2cf1b30.zip", "author": "Jiying Zhang;Fuyang Li;Xi Xiao;Tingyang Xu;Yu Rong;Junzhou Huang;Yatao Bian", "authorids": "~Jiying_Zhang1;~Fuyang_Li1;~Xi_Xiao1;~Tingyang_Xu1;~Yu_Rong1;~Junzhou_Huang2;~Yatao_Bian1", "gender": "M;M;M;M;M;M;M", "homepage": "https://youjibiying.github.io/;https://guyuwuyu.github.io/;https://www.sigs.tsinghua.edu.cn/xx_en/main.htm;;https://royrong.me/;http://ranger.uta.edu/~huang/;https://yataobian.com", "dblp": "287/9432;;;157/0940;24/10036-1;22/1170.html;222/2694", "google_scholar": "j90eZ0MAAAAJ;;;6gIs5YMAAAAJ;https://scholar.google.com.hk/citations?user=itezhEMAAAAJ;https://scholar.google.com.tw/citations?user=X7KrguAAAAAJ;oZBTlBkAAAAJ", "orcid": ";;;0009-0002-0106-8376;0000-0001-7387-302X;0000-0002-9548-1227;0000-0002-2368-4084", "linkedin": ";;;;;;", "or_profile": "~Jiying_Zhang1;~Fuyang_Li1;~Xi_Xiao1;~Tingyang_Xu1;~Yu_Rong1;~Junzhou_Huang2;~An_Bian1", "aff": "Tencent AI Lab;Tencent AI Lab;Shenzhen International Graduate School, Tsinghua University;Tencent AI Lab;Tencent AI Lab;University of Texas, Arlington;Tencent AI Lab", "aff_domain": "tencent.com;tencent.com;tsinghua.edu.cn;tencent.com;tencent.com;uta.edu;tencent.com", "position": "Internship;Intership;Associate Professor;Researcher;Senior Researcher;Full Professor;Senior researcher ", "bibtex": "@misc{\nzhang2022,\ntitle={ Hypergraph Convolutional Networks via Equivalency between Hypergraphs and Undirected Graphs},\nauthor={Jiying Zhang and Fuyang Li and Xi 
Xiao and Tingyang Xu and Yu Rong and Junzhou Huang and Yatao Bian},\nyear={2022},\nurl={https://openreview.net/forum?id=zFyCvjXof60}\n}", "github": "", "project": "", "reviewers": "RmER;zaDF;NwNk;NQ7e", "site": "https://openreview.net/forum?id=zFyCvjXof60", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "5;3;3;3", "correctness": "4;3;4;4", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;3;4", "wc_summary_paper": "99;43;93;105", "wc_summary_review": "43;66;36;28", "wc_main_review": "275;265;167;124", "wc_review": "417;374;296;257", "wc_reply_reviewers": "593;499;68;0", "wc_reply_authors": "2559;2998;769;143", "reply_reviewers": "5;2;1;0", "reply_authors": "10;6;2;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 85.0, 24.61706725018234 ], "wc_summary_review_avg": [ 43.25, 14.16642156650719 ], "wc_main_review_avg": [ 207.75, 64.17700133225297 ], "wc_review_avg": [ 336.0, 62.94044804416314 ], "wc_reply_reviewers_avg": [ 290.0, 259.265308130494 ], "wc_reply_authors_avg": [ 1617.25, 1192.2986989425092 ], "reply_reviewers_avg": [ 2.0, 1.8708286933869707 ], "reply_authors_avg": [ 4.75, 3.5619517121937516 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.4714045207910316, "corr_recommendation_correctness": 0.4714045207910316, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13007469259545261713&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0;0;2;0", "aff_unique_norm": "Tencent;Tsinghua University;University of Texas at Arlington", "aff_unique_dep": "Tencent AI Lab;Shenzhen International Graduate School;", "aff_unique_url": "https://ai.tencent.com;https://www.tsinghua.edu.cn;https://www.uta.edu", "aff_unique_abbr": "Tencent AI Lab;THU;UTA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Shenzhen;Arlington", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "zHZ1mvMUMW8", "title": "Succinct Compression: Near-Optimal and Lossless Compression of Deep Neural Networks during Inference Runtime", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent advances in Deep Neural Networks (DNN) compression (e.g., pruning, quantization, etc.) significantly reduce the amount of storage space required, making models easier to deploy on low-cost devices. However, those techniques do not keep the compressed representation during inference runtime, which incurs significant overheads in terms of both performance and space consumption. We introduce \u201cSuccinct Compression\u201d, a three-stage framework to enable DNN inference with near-optimal compression and much better performance during inference runtime. The key insight of our method is to leverage the concept of \\textit{Succinct Data Structures}, which support fast queries directly on the compressed representation without decompression. Our method first transforms DNN models into our proposed formulations, in either an Element-wise or Block-wise manner, so that \\textit{Succinct Data Structures} can be applied. Then, our method compresses the transformed DNN models using \\textit{Succinct Data Structures}. Finally, our method exploits our specialized execution pipelines for different model formulations to retrieve relevant data for DNN inference.
Our experimental results show that, our method keeps near-optimal compression, and achieves at least 8.7X/11.5X speedup on AlexNet/VGG-16 inference, compared with Huffman Coding. We also experimentally show that our method is quite synergistic with Pruning and Quantization. \n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yicun Duan;Xiangjun Peng", "authorids": "scyyd3@nottingham.edu.cn;~Xiangjun_Peng1", "gender": ";M", "homepage": ";https://shiangjun.com", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "scyyd3@nottingham.edu.cn;~Xiangjun_Peng1", "aff": ";The Chinese University of Hong Kong", "aff_domain": ";cuhk.edu.hk", "position": ";MPhil student", "bibtex": "@misc{\nduan2022succinct,\ntitle={Succinct Compression: Near-Optimal and Lossless Compression of Deep Neural Networks during Inference Runtime},\nauthor={Yicun Duan and Xiangjun Peng},\nyear={2022},\nurl={https://openreview.net/forum?id=zHZ1mvMUMW8}\n}", "github": "", "project": "", "reviewers": "QdUC;R8Un;G6tm;eHWE", "site": "https://openreview.net/forum?id=zHZ1mvMUMW8", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "4;3;4;3", "correctness": "3;2;3;2", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "59;93;68;48", "wc_summary_review": "27;58;347;20", "wc_main_review": "404;364;89;619", "wc_review": "490;515;504;687", "wc_reply_reviewers": "21;30;30;165", "wc_reply_authors": "582;295;98;584", "reply_reviewers": "1;1;1;1", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 67.0, 16.59819267269783 ], "wc_summary_review_avg": [ 113.0, 135.85470179570524 ], "wc_main_review_avg": [ 369.0, 188.51392521508856 ], "wc_review_avg": [ 549.0, 80.1654539062806 ], "wc_reply_reviewers_avg": [ 61.5, 59.86860613042532 ], "wc_reply_authors_avg": [ 389.75, 205.41954020978628 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3N_Lwvf3rRwJ:scholar.google.com/&scioq=Succinct+Compression:+Near-Optimal+and+Lossless+Compression+of+Deep+Neural+Networks+during+Inference+Runtime&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Frame Averaging for Invariant and Equivariant Network Design", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6189", "id": "zIUyj55nXR", "poster": "", "openreview": "https://openreview.net/forum?id=zIUyj55nXR", "slides": "https://iclr.cc/virtual/2022/poster/6189", "video": "https://iclr.cc/virtual/2022/poster/6189", "author_site": "Omri Puny, Matan Atzmon, Edward Smith, Ishan Misra, Aditya Grover, Heli Ben-Hamu, Yaron Lipman", "tldr": "", "abstract": "Many machine learning tasks involve learning functions that are known to be invariant or equivariant to certain symmetries of the input data. 
However, it is often challenging to design neural network architectures that respect these symmetries while being expressive and computationally efficient; examples include Euclidean motion invariant/equivariant graph or point cloud neural networks. \nWe introduce Frame Averaging (FA), a highly general-purpose and systematic framework for adapting known (backbone) architectures to become invariant or equivariant to new symmetry types. Our framework builds on the well-known group averaging operator that guarantees invariance or equivariance but is intractable. In contrast, we observe that for many important classes of symmetries, this operator can be replaced with an averaging operator over a small subset of the group elements, called a frame. We show that averaging over a frame guarantees exact invariance or equivariance while often being much simpler to compute than averaging over the entire group. Furthermore, we prove that FA-based models have maximal expressive power in a broad setting and in general preserve the expressive power of their backbone architectures. Using frame averaging, we propose a new class of universal Graph Neural Networks (GNNs), universal Euclidean motion invariant point cloud networks, and Euclidean motion invariant Message Passing (MP) GNNs. We demonstrate the practical effectiveness of FA on several applications including point cloud normal estimation, beyond $2$-WL graph separation, and $n$-body dynamics prediction, achieving state-of-the-art results in all of these benchmarks.", "keywords": "Invariant and equivariant neural network;expressive power", "primary_area": "", "supplementary_material": "", "author": "Omri Puny;Matan Atzmon;Edward J. Smith;Ishan Misra;Aditya Grover;Heli Ben-Hamu;Yaron Lipman", "authorids": "~Omri_Puny1;~Matan_Atzmon1;~Edward_J._Smith1;~Ishan_Misra2;~Aditya_Grover1;~Heli_Ben-Hamu1;~Yaron_Lipman1", "gender": "M;M;M;;M;;", "homepage": "https://omri1348.github.io/;https://matanatz.github.io/;https://edwardsmith1884.github.io/;;https://aditya-grover.github.io;;", "dblp": "267/5465;217/2968;45/6211;;162/5052;;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;BXNft08AAAAJ;FUUlY5wAAAAJ;;oOhnPUgAAAAJ;;", "orcid": ";;;;;;", "linkedin": "omri-puny-0917771b2/;;edward-james-smith-721754b2/;;;;", "or_profile": "~Omri_Puny1;~Matan_Atzmon1;~Edward_J._Smith1;~Ishan_Misra2;~Aditya_Grover1;~Heli_Ben-Hamu1;~Yaron_Lipman1", "aff": "Weizmann Institute of Science;Weizmann Institute;McGill University;;University of California, Los Angeles;;", "aff_domain": "weizmann.ac.il;weizmann.ac.il;mcgill.ca;;ucla.edu;;", "position": "PhD student;PhD student;PhD student;;Assistant Professor;;", "bibtex": "@inproceedings{\npuny2022frame,\ntitle={Frame Averaging for Invariant and Equivariant Network Design},\nauthor={Omri Puny and Matan Atzmon and Edward J.
Smith and Ishan Misra and Aditya Grover and Heli Ben-Hamu and Yaron Lipman},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zIUyj55nXR}\n}", "github": "", "project": "", "reviewers": "5HAS;BhKC;57mq;CToH", "pdf_size": 0, "recommendation": "8;8;8;8", "confidence": "2;5;4;4", "correctness": "3;4;4;2", "technical_novelty": "4;4;4;4", "empirical_novelty": "3;4;4;3", "wc_summary_paper": "60;33;118;118", "wc_summary_review": "200;105;56;63", "wc_main_review": "233;485;398;583", "wc_review": "493;623;572;764", "wc_reply_reviewers": "32;304;40;18", "wc_reply_authors": "667;1795;721;966", "reply_reviewers": "1;3;1;1", "reply_authors": "1;4;1;2", "recommendation_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.75, 1.0897247358851685 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 4.0, 0.0 ], "empirical_novelty_avg": [ 3.5, 0.5 ], "wc_summary_paper_avg": [ 82.25, 37.002533697032156 ], "wc_summary_review_avg": [ 106.0, 57.4151547938347 ], "wc_main_review_avg": [ 424.75, 128.60477246198914 ], "wc_review_avg": [ 613.0, 98.71929902506399 ], "wc_reply_reviewers_avg": [ 98.5, 118.90647585392479 ], "wc_reply_authors_avg": [ 1037.25, 451.7634198338772 ], "reply_reviewers_avg": [ 1.5, 0.8660254037844386 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 152, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8452303343954527617&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=zIUyj55nXR", "email": "weizmann.ac.il;weizmann.ac.il;mcgill.ca;;ucla.edu;;", "author_num": 7, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Weizmann Institute of Science;McGill University;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.weizmann.org.il;https://www.mcgill.ca;https://www.ucla.edu", "aff_unique_abbr": "Weizmann;McGill;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "Israel;Canada;United States" }, { "id": "zKbMQ2NY1y", "title": "Aug-ILA: More Transferable Intermediate Level Attacks with Augmented References", "track": "main", "status": "Reject", "tldr": "", "abstract": "An intriguing property of deep neural networks is that adversarial attacks can transfer across different models. Existing methods such as the Intermediate Level Attack (ILA) further improve black-box transferability by fine-tuning a reference adversarial attack, so as to maximize the perturbation on a pre-specified layer of the source model. In this paper, we revisit ILA and evaluate the effect of applying augmentation to the images before passing them to ILA. We start by looking into the effect of common image augmentation techniques and exploring novel augmentation with the aid of adversarial perturbations. Based on the observations, we propose Aug-ILA, an improved method that enhances the transferability of an existing attack under the ILA framework. Specifically, Aug-ILA has three main characteristics: typical image augmentation such as random cropping and resizing applied to all ILA inputs, reverse adversarial update on the clean image, and interpolation between two attacks on the reference image. 
Our experimental results show that Aug-ILA outperforms ILA and its subsequent variants, as well as state-of-the-art transfer-based attacks, by achieving $96.99\\%$ and $87.84\\%$ average attack success rates with perturbation budgets $0.05$ and $0.03$, respectively, on nine undefended models.", "keywords": "adversarial examples;adversarial transferability;intermediate feature;image augmentation", "primary_area": "", "supplementary_material": "/attachment/1027e59cec25f0f68c0ddc1e6972247a903b7768.zip", "author": "Chiu Wai Yan;Dit-Yan Yeung", "authorids": "~Chiu_Wai_Yan1;~Dit-Yan_Yeung2", "gender": "M;M", "homepage": ";https://cse.hkust.edu.hk/faculty/dyyeung/", "dblp": "350/3858;41/5668", "google_scholar": ";nEsOOx8AAAAJ", "orcid": "0000-0002-7277-5580;0000-0003-3716-8125", "linkedin": ";", "or_profile": "~Chiu_Wai_Yan1;~Dit-Yan_Yeung2", "aff": "Hong Kong University of Science and Technology;Hong Kong University of Science and Technology", "aff_domain": "hkust.edu;ust.hk", "position": "MS student;Chair Professor", "bibtex": "@misc{\nyan2022augila,\ntitle={Aug-{ILA}: More Transferable Intermediate Level Attacks with Augmented References},\nauthor={Chiu Wai Yan and Dit-Yan Yeung},\nyear={2022},\nurl={https://openreview.net/forum?id=zKbMQ2NY1y}\n}", "github": "", "project": "", "reviewers": "TcRw;2dLg;uNtx;JiJd", "site": "https://openreview.net/forum?id=zKbMQ2NY1y", "pdf_size": 0, "recommendation": "3;5;6;6", "confidence": "4;3;3;4", "correctness": "2;3;3;3", "technical_novelty": "3;2;3;2", "empirical_novelty": "2;3;3;2", "wc_summary_paper": "62;102;52;74", "wc_summary_review": "40;346;18;36", "wc_main_review": "419;758;218;184", "wc_review": "521;1206;288;294", "wc_reply_reviewers": "236;263;0;98", "wc_reply_authors": "1098;1531;323;418", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;1;1", "recommendation_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 72.5, 18.728320800328042 ], "wc_summary_review_avg": [ 110.0, 136.50641010589942 ], "wc_main_review_avg": [ 394.75, 228.1418144488204 ], "wc_review_avg": [ 577.25, 374.96224809972534 ], "wc_reply_reviewers_avg": [ 149.25, 106.4973591221867 ], "wc_reply_authors_avg": [ 842.5, 497.3411806798226 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.40824829046386296, "corr_recommendation_correctness": 0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5pgyaJcaZaQJ:scholar.google.com/&scioq=Aug-ILA:+More+Transferable+Intermediate+Level+Attacks+with+Augmented+References&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "zLb9oSWy933", "title": "Fast Finite Width Neural Tangent Kernel", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Neural Tangent Kernel (NTK), defined as the outer product of the neural network (NN) Jacobians, $\\Theta_\\theta(x_1, x_2) = \\left[\\partial f(\\theta, x_1)\\big/\\partial \\theta\\right] \\left[\\partial f(\\theta, 
x_2)\\big/\\partial \\theta\\right]^T$, has emerged as a central object of study in deep learning. In the infinite width limit, the NTK can sometimes be computed analytically and is useful for understanding training and generalization of NN architectures. At finite widths, the NTK is also used to better initialize NNs, compare the conditioning across models, perform architecture search, and do meta-learning. Unfortunately, the finite-width NTK is notoriously expensive to compute, which severely limits its practical utility. \n\nWe perform the first in-depth analysis of the compute and memory requirements for NTK computation in finite width networks. \nLeveraging the structure of neural networks, we further propose two novel algorithms that change the exponent of the compute and memory requirements of the finite width NTK, dramatically improving efficiency.\n\nWe open-source (https://github.com/iclr2022anon/fast_finite_width_ntk) our two algorithms as general-purpose JAX function transformations that apply to any differentiable computation (convolutions, attention, recurrence, etc.) and introduce no new hyper-parameters.\n", "keywords": "Neural Tangent Kernel;NTK;Finite Width;Fast;Algorithm;JAX;Jacobian;Software", "primary_area": "", "supplementary_material": "/attachment/ee533ee353e58eca71ca54f2510b528549bb65e0.zip", "author": "Roman Novak;Jascha Sohl-Dickstein;Samuel Stern Schoenholz", "authorids": "~Roman_Novak2;~Jascha_Sohl-Dickstein2;~Samuel_Stern_Schoenholz1", "gender": "M;M;M", "homepage": "https://github.com/romanngg;https://samschoenholz.wordpress.com/;http://sohldickstein.com", "dblp": "https://dblp.org/pers/n/Novak:Roman.html;190/7108;51/7117", "google_scholar": "LWvgl-8AAAAJ;mk-zQBsAAAAJ;-3zYIjQAAAAJ", "orcid": ";;", "linkedin": "romanovak;samuel-schoenholz-379830a0;", "or_profile": "~Roman_Novak2;~Samuel_Stern_Schoenholz1;~Jascha_Sohl-Dickstein1", "aff": "Google Brain;Google;Google", "aff_domain": "google.com;google.com;google.com", "position": "Research Scientist;Research Scientist;Research Scientist", "bibtex": "@misc{\nnovak2022fast,\ntitle={Fast Finite Width Neural Tangent Kernel},\nauthor={Roman Novak and Jascha Sohl-Dickstein and Samuel Stern Schoenholz},\nyear={2022},\nurl={https://openreview.net/forum?id=zLb9oSWy933}\n}", "github": "", "project": "", "reviewers": "uyYg;Ye97;qdx1;tZAo", "site": "https://openreview.net/forum?id=zLb9oSWy933", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "5;4;3;4", "correctness": "4;3;3;3", "technical_novelty": "1;3;2;1", "empirical_novelty": "2;4;4;3", "wc_summary_paper": "25;50;86;67", "wc_summary_review": "36;62;53;49", "wc_main_review": "165;135;488;162", "wc_review": "226;247;627;278", "wc_reply_reviewers": "0;44;372;110", "wc_reply_authors": "1092;809;2546;3205", "reply_reviewers": "0;1;3;1", "reply_authors": "3;2;5;7", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 1.75, 0.82915619758885 ], "empirical_novelty_avg": [ 3.25, 0.82915619758885 ], "wc_summary_paper_avg": [ 57.0, 22.438805672316875 ], "wc_summary_review_avg": [ 50.0, 9.354143466934854 ], "wc_main_review_avg": [ 237.5, 145.09738109283708 ], "wc_review_avg": [ 344.5, 164.14703774360353 ], "wc_reply_reviewers_avg": [ 131.5, 144.26624691867465 ], "wc_reply_authors_avg": [ 1913.0, 995.3403940361308 ], "reply_reviewers_avg": [ 1.25, 1.0897247358851685 ], "reply_authors_avg": [ 4.25, 1.920286436967152 ], "replies_avg": [ 29, 0 ], 
"authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.816496580927726, "corr_recommendation_correctness": -1.0, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2891750348147928089&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Equivariant Transformers for Neural Network based Molecular Potentials", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6416", "id": "zNHzqZ9wrRB", "poster": "", "openreview": "https://openreview.net/forum?id=zNHzqZ9wrRB", "slides": "https://iclr.cc/virtual/2022/poster/6416", "video": "https://iclr.cc/virtual/2022/poster/6416", "author_site": "Philipp Th\u00f6lke, Gianni De Fabritiis", "tldr": "", "abstract": "The prediction of quantum mechanical properties is historically plagued by a trade-off between accuracy and speed. Machine learning potentials have previously shown great success in this domain, reaching increasingly better accuracy while maintaining computational efficiency comparable with classical force fields. In this work we propose TorchMD-NET, a novel equivariant Transformer (ET) architecture, outperforming state-of-the-art on MD17, ANI-1, and many QM9 targets in both accuracy and computational efficiency. Through an extensive attention weight analysis, we gain valuable insights into the black box predictor and show differences in the learned representation of conformers versus conformations sampled from molecular dynamics or normal modes. 
Furthermore, we highlight the importance of datasets including off-equilibrium conformations for the evaluation of molecular potentials.", "keywords": "Molecular Modeling;Quantum Chemistry;Attention;Transformers", "primary_area": "", "supplementary_material": "", "author": "Philipp Th\u00f6lke;Gianni De Fabritiis", "authorids": "~Philipp_Th\u00f6lke1;~Gianni_De_Fabritiis1", "gender": "M;M", "homepage": "https://github.com/PhilippThoelke;https://www.compscience.org", "dblp": ";29/605", "google_scholar": "cLA3Bz8AAAAJ;-_kX4kMAAAAJ", "orcid": "0000-0002-3208-2501;", "linkedin": ";gdefabritiis/", "or_profile": "~Philipp_Th\u00f6lke1;~Gianni_De_Fabritiis1", "aff": "Universit\u00e4t Osnabr\u00fcck;Universitat Pompeu Fabra", "aff_domain": "uni-osnabrueck.de;upf.edu", "position": "MS student;Full Professor", "bibtex": "@inproceedings{\nth{\\\"o}lke2022equivariant,\ntitle={Equivariant Transformers for Neural Network based Molecular Potentials},\nauthor={Philipp Th{\\\"o}lke and Gianni De Fabritiis},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zNHzqZ9wrRB}\n}", "github": "", "project": "", "reviewers": "AQ9R;9RJX;ZUVc;RnuV", "pdf_size": 0, "recommendation": "6;6;8;8", "confidence": "5;3;4;4", "correctness": "3;4;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;0;3;3", "wc_summary_paper": "32;78;81;55", "wc_summary_review": "57;75;36;185", "wc_main_review": "387;213;184;53", "wc_review": "476;366;301;293", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "339;159;150;308", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 7.0, 1.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 61.5, 19.78004044485248 ], "wc_summary_review_avg": [ 88.25, 57.53857401778393 ], "wc_main_review_avg": [ 209.25, 119.01759323730252 ], "wc_review_avg": [ 359.0, 73.24274708119569 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 239.0, 85.2672269984195 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17571003781324968336&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "pdf": "https://openreview.net/pdf?id=zNHzqZ9wrRB", "email": "uni-osnabrueck.de;upf.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Osnabr\u00fcck;Universitat Pompeu Fabra", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-osnabrueck.de;https://www.upf.edu/", "aff_unique_abbr": "UOS;UPF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;Spain" }, { "title": "Learning to Annotate Part Segmentation with Gradient Matching", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6015", "id": "zNR43c03lRy", "poster": "", "openreview": "https://openreview.net/forum?id=zNR43c03lRy", "slides": "https://iclr.cc/virtual/2022/poster/6015", "video": "https://iclr.cc/virtual/2022/poster/6015", "author_site": "Yu Yang, Xiaotian Cheng, Hakan Bilen, Xiangyang Ji", "tldr": "", "abstract": "The success of state-of-the-art deep neural networks heavily relies on the presence of large-scale labelled 
datasets, which are extremely expensive and time-consuming to annotate. This paper focuses on tackling semi-supervised part segmentation tasks by generating high-quality images with a pre-trained GAN and labelling the generated images with an automatic annotator. In particular, we formulate the annotator learning as a learning-to-learn problem. Given a pre-trained GAN, the annotator learns to label object parts in a set of randomly generated images such that a part segmentation model trained on these synthetic images with their predicted labels obtains low segmentation error on a small validation set of manually labelled images. We further reduce this nested-loop optimization problem to a simple gradient matching problem and efficiently solve it with an iterative algorithm. We show that our method can learn annotators from a broad range of labelled images including real images, generated images, and even analytically rendered images. Our method is evaluated with semi-supervised part segmentation tasks and significantly outperforms other semi-supervised competitors when the number of labelled examples is extremely limited.", "keywords": "semi-supervised learning;part segmentation;semantic segmentation;generative models;gradient matching", "primary_area": "", "supplementary_material": "", "author": "Yu Yang;Xiaotian Cheng;Hakan Bilen;Xiangyang Ji", "authorids": "~Yu_Yang6;~Xiaotian_Cheng1;~Hakan_Bilen1;~Xiangyang_Ji1", "gender": "M;M;M;", "homepage": ";https://github.com/greatwallet;http://homepages.inf.ed.ac.uk/hbilen/;", "dblp": "16/4505-11;281/6648;97/2993;", "google_scholar": "GrpZ-akAAAAJ;https://scholar.google.com/citations?hl=en;PtBtfawAAAAJ;", "orcid": ";;0000-0002-6947-6918;", "linkedin": ";;;", "or_profile": "~Yu_Yang6;~Xiaotian_Cheng1;~Hakan_Bilen1;~Xiangyang_Ji1", "aff": "Tsinghua University;Tsinghua University;University of Edinburgh;", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;ed.ac.uk;", "position": "PhD student;PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nyang2022learning,\ntitle={Learning to Annotate Part Segmentation with Gradient Matching},\nauthor={Yu Yang and Xiaotian Cheng and Hakan Bilen and Xiangyang Ji},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zNR43c03lRy}\n}", "github": "", "project": "", "reviewers": "muji;y2iP;318j;Z2yc", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "3;4;4;3", "correctness": "4;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "0;2;3;3", "wc_summary_paper": "60;108;222;99", "wc_summary_review": "27;93;154;91", "wc_main_review": "264;491;576;217", "wc_review": "351;692;952;407", "wc_reply_reviewers": "43;465;18;32", "wc_reply_authors": "618;935;2096;551", "reply_reviewers": "1;2;1;1", "reply_authors": "1;3;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.224744871391589 ], "wc_summary_paper_avg": [ 122.25, 60.35053852286656 ], "wc_summary_review_avg": [ 91.25, 44.91311055805421 ], "wc_main_review_avg": [ 387.0, 150.47092742453606 ], "wc_review_avg": [ 600.5, 240.63301934688846 ], "wc_reply_reviewers_avg": [ 139.5, 188.13625381621694 ], "wc_reply_authors_avg": [ 1050.0, 621.0809126031809 ], "reply_reviewers_avg": [ 1.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ],
"corr_recommendation_confidence": -0.5773502691896257, "corr_recommendation_correctness": -0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16141754978886952440&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=zNR43c03lRy", "email": "tsinghua.edu.cn;tsinghua.edu.cn;ed.ac.uk;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Tsinghua University;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ed.ac.uk", "aff_unique_abbr": "THU;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United Kingdom" }, { "id": "zNlkpFBT9aD", "title": "Automatic Portrait Video Matting via Context Motion Network", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Automatic portrait video matting is an under-constrained problem. Most state-of-the-art methods only exploit the semantic information and process each frame individually. Their performance is compromised due to the lack of temporal information between the frames. To solve this problem, we explore the optical flow between video frames for the automatic portrait video matting. Specifically, we propose the context motion network to leverage semantic information and motion information. To capture the motion information, we estimate the optical flow and design a context-motion updating operator to integrate features between frames recurrently. Our experiments show that our network outperforms state-of-the-art matting methods significantly on the Video240K SD dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiqi Hou", "authorids": "~Qiqi_Hou1", "gender": "M", "homepage": "https://hqqxyy.github.io/", "dblp": "168/2419", "google_scholar": "Kt9KfRQAAAAJ", "orcid": "", "linkedin": "hou-qiqi-511096b9/", "or_profile": "~Qiqi_Hou1", "aff": "Portland State University", "aff_domain": "pdx.edu", "position": "PhD student", "bibtex": "@misc{\nhou2022automatic,\ntitle={Automatic Portrait Video Matting via Context Motion Network},\nauthor={Qiqi Hou},\nyear={2022},\nurl={https://openreview.net/forum?id=zNlkpFBT9aD}\n}", "github": "", "project": "", "reviewers": "ktXS;jpmd;kf1t;MPxk;82pQ", "site": "https://openreview.net/forum?id=zNlkpFBT9aD", "pdf_size": 0, "recommendation": "3;5;5;6;6", "confidence": "4;4;5;4;2", "correctness": "3;2;4;3;3", "technical_novelty": "2;2;2;3;2", "empirical_novelty": "2;2;3;3;3", "wc_summary_paper": "73;115;53;58;112", "wc_summary_review": "14;52;36;66;71", "wc_main_review": "246;538;274;324;411", "wc_review": "333;705;363;448;594", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 5.0, 1.0954451150103321 ], "confidence_avg": [ 3.8, 0.9797958971132712 ], "correctness_avg": [ 3.0, 0.6324555320336759 ], "technical_novelty_avg": [ 2.2, 0.39999999999999997 ], "empirical_novelty_avg": [ 2.6, 0.4898979485566356 ], "wc_summary_paper_avg": [ 82.2, 26.40757467091592 ], "wc_summary_review_avg": [ 47.8, 20.826905675111703 ], "wc_main_review_avg": [ 358.6, 105.79527399652595 ], "wc_review_avg": [ 488.6, 141.11924036076724 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_recommendation_confidence": 
-0.372677996249965, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6718955637317173498&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0", "aff_unique_norm": "Portland State University", "aff_unique_dep": "", "aff_unique_url": "https://www.pdx.edu", "aff_unique_abbr": "PSU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "zPLQSnfd14w", "title": "Two Regimes of Generalization for Non-Linear Metric Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "A common approach to metric learning is to seek an embedding of the input data that behaves well with respect to the labels. While generalization bounds for linear embeddings are known, the non-linear case is not well understood. In this work we fill this gap by providing uniform generalization guarantees for the case where the metric is induced by a neural network type embedding of the data. Specifically, we discover and analyze two regimes of behavior of the networks, which are roughly related to the sparsity of the last layer. The bounds corresponding to the first regime are based on the spectral and $(2,1)$-norms of the weight matrices, while the second regime bounds use the $(2,\\infty)$-norm at the last layer, and are significantly stronger when the last layer is dense. In addition, we empirically evaluate the behavior of the bounds for networks trained with SGD on the MNIST and 20newsgroups datasets. In particular, we demonstrate that both regimes occur naturally on realistic data.", "keywords": "metric learning;guarantees;sparsity", "primary_area": "", "supplementary_material": "/attachment/f812a48baf4df314a1123b42be2ec570538da231.zip", "author": "Mark Kozdoba;Shie Mannor", "authorids": "~Mark_Kozdoba1;~Shie_Mannor2", "gender": ";M", "homepage": "https://www.linkedin.com/in/mark-kozdoba-5b6bb835/;https://shie.net.technion.ac.il", "dblp": "161/9885;20/1669", "google_scholar": "PHE-SswAAAAJ;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ", "orcid": "0000-0002-8451-023X;", "linkedin": "mark-kozdoba-5b6bb835/;", "or_profile": "~Mark_Kozdoba1;~Shie_Mannor2", "aff": "Technion;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;technion.il", "position": "Principal Researcher;Full Professor", "bibtex": "@misc{\nkozdoba2022two,\ntitle={Two Regimes of Generalization for Non-Linear Metric Learning},\nauthor={Mark Kozdoba and Shie Mannor},\nyear={2022},\nurl={https://openreview.net/forum?id=zPLQSnfd14w}\n}", "github": "", "project": "", "reviewers": "Whaa;nCbx;zFNY;w4wy", "site": "https://openreview.net/forum?id=zPLQSnfd14w", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "3;4;3;3", "correctness": "3;2;4;4", "technical_novelty": "3;1;3;3", "empirical_novelty": "0;2;2;2", "wc_summary_paper": "33;13;64;101", "wc_summary_review": "23;32;19;54", "wc_main_review": "880;137;161;380", "wc_review": "936;182;244;535", "wc_reply_reviewers": "45;0;0;46", "wc_reply_authors": "623;478;812;874", "reply_reviewers": "1;0;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 0.8660254037844386 ], "empirical_novelty_avg": [ 1.5, 0.8660254037844386 ], "wc_summary_paper_avg": [ 52.75, 33.25939716831921 ], "wc_summary_review_avg": [ 32.0, 13.546217184144066 ], "wc_main_review_avg": [ 389.5, 298.6004855990693
], "wc_review_avg": [ 474.25, 298.04729071071927 ], "wc_reply_reviewers_avg": [ 22.75, 22.75274708689041 ], "wc_reply_authors_avg": [ 696.75, 156.5173712403834 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": -0.5555555555555555, "corr_recommendation_correctness": 0.8703882797784892, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=236873114661596912&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "ConFeSS: A Framework for Single Source Cross-Domain Few-Shot Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/5975", "id": "zRJu6mU2BaE", "poster": "", "openreview": "https://openreview.net/forum?id=zRJu6mU2BaE", "slides": "https://iclr.cc/virtual/2022/poster/5975", "video": "https://iclr.cc/virtual/2022/poster/5975", "author_site": "Debasmit Das, Sungrack Yun, Fatih Porikli", "tldr": "", "abstract": "Most current few-shot learning methods train a model from abundantly labeled base category data and then transfer and adapt the model to sparsely labeled novel category data. These methods mostly generalize well on novel categories from the same domain as the base categories but perform poorly for distant domain categories. In this paper, we propose a framework for few-shot learning coined as ConFeSS (Contrastive Learning and Feature Selection System) that tackles large domain shift between base and novel categories. The first step of our framework trains a feature extracting backbone with the contrastive loss on the base category data. Since the contrastive loss does not use supervision, the features can generalize better to distant target domains. For the second step, we train a masking module to select relevant features that are more suited to target domain classification. Finally, a classifier is fine-tuned along with the backbone such that the backbone produces features similar to the relevant ones. To evaluate our framework, we tested it on a recently introduced cross-domain few-shot learning benchmark. Experimental results demonstrate that our framework outperforms all meta-learning approaches and produces competitive results against recent cross-domain methods. 
Additional analyses are also performed to better understand our framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Debasmit Das;Sungrack Yun;Fatih Porikli", "authorids": "~Debasmit_Das2;~Sungrack_Yun1;~Fatih_Porikli2", "gender": "M;M;M", "homepage": "https://sites.google.com/site/debasmitbunadas/;;https://www.porikli.com", "dblp": "155/4271;67/8053;p/FatihMuratPorikli", "google_scholar": "0tP8MuMAAAAJ;;https://scholar.google.com.tw/citations?user=VpB8NZ8AAAAJ", "orcid": ";;0000-0002-1520-4466", "linkedin": ";;fatih-porikli-a95643/", "or_profile": "~Debasmit_Das2;~Sungrack_Yun1;~Fatih_Porikli2", "aff": "Qualcomm Inc.;Qualcomm;QualComm", "aff_domain": "qti.qualcomm.com;qualcomm.com;qualcomm.com", "position": "Researcher;Researcher;Senior Director", "bibtex": "@inproceedings{\ndas2022confess,\ntitle={ConFe{SS}: A Framework for Single Source Cross-Domain Few-Shot Learning},\nauthor={Debasmit Das and Sungrack Yun and Fatih Porikli},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zRJu6mU2BaE}\n}", "github": "", "project": "", "reviewers": "H3UB;nRRt;yTZ7;MRXL", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;4;5;4", "correctness": "3;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "40;103;80;69", "wc_summary_review": "67;60;16;41", "wc_main_review": "195;379;112;204", "wc_review": "302;542;208;314", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "843;703;279;485", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 73.0, 22.66053838724932 ], "wc_summary_review_avg": [ 46.0, 19.761072845369505 ], "wc_main_review_avg": [ 222.5, 97.21239632886332 ], "wc_review_avg": [ 341.5, 122.81999022960392 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 577.5, 214.41723344917963 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8729706362153101263&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "pdf": "https://openreview.net/pdf?id=zRJu6mU2BaE", "email": "qti.qualcomm.com;qualcomm.com;qualcomm.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Qualcomm Incorporated", "aff_unique_dep": "", "aff_unique_url": "https://www.qualcomm.com", "aff_unique_abbr": "Qualcomm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "zRb7IWkTZAU", "title": "Zero-Shot Reward Specification via Grounded Natural Language", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reward signals in reinforcement learning can be expensive to obtain in many tasks and often require access to direct state. The alternatives to reward signals are usually demonstrations or goal images, which can be labor-intensive to collect. Goal text description is a low-effort way of communicating the desired task. 
So far, though, goal text conditioned policies have been trained with reward signals that have access to state or with labelled expert demonstrations. We devise a model that leverages CLIP to ground objects in a scene described by the goal text paired with spatial relationship rules to provide an off-the-shelf reward signal on only raw pixels to learn a set of robotic manipulation tasks. We distill the policies learned with this reward signal on several tasks to produce one goal text conditioned policy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Parsa Mahmoudieh;Sayna Ebrahimi;Deepak Pathak;Trevor Darrell", "authorids": "~Parsa_Mahmoudieh1;~Sayna_Ebrahimi1;~Deepak_Pathak1;~Trevor_Darrell2", "gender": "M;F;M;M", "homepage": "https://people.eecs.berkeley.edu/~parsa.m/;https://saynaebrahimi.github.io/;https://www.cs.cmu.edu/~dpathak/;https://people.eecs.berkeley.edu/~trevor/", "dblp": "164/8264;207/7584;155/9860;d/TrevorDarrell", "google_scholar": "FyQAwaEAAAAJ;wRyjJfMAAAAJ;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ", "orcid": ";;;", "linkedin": ";saynaebrahimi/;pathak22/;", "or_profile": "~Parsa_Mahmoudieh1;~Sayna_Ebrahimi1;~Deepak_Pathak1;~trevor_darrell1", "aff": "University of California, Berkeley;Google;Carnegie Mellon University;Electrical Engineering & Computer Science Department", "aff_domain": "berkeley.edu;google.com;cmu.edu;eecs.berkeley.edu", "position": "PhD student;Research Scientist;Assistant Professor;Professor", "bibtex": "@misc{\nmahmoudieh2022zeroshot,\ntitle={Zero-Shot Reward Specification via Grounded Natural Language},\nauthor={Parsa Mahmoudieh and Sayna Ebrahimi and Deepak Pathak and Trevor Darrell},\nyear={2022},\nurl={https://openreview.net/forum?id=zRb7IWkTZAU}\n}", "github": "", "project": "", "reviewers": "PiKW;fEgi;GnSJ;b31Y", "site": "https://openreview.net/forum?id=zRb7IWkTZAU", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "4;4;4;2", "correctness": "3;3;3;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "71;130;127;99", "wc_summary_review": "18;49;53;51", "wc_main_review": "307;637;595;184", "wc_review": "396;816;775;334", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "350;827;485;149", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 106.75, 23.920441049445557 ], "wc_summary_review_avg": [ 42.75, 14.359230480774379 ], "wc_main_review_avg": [ 430.75, 190.86431698984492 ], "wc_review_avg": [ 580.25, 216.84830527352526 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 452.75, 246.94369297473463 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.7777777777777777, "corr_recommendation_correctness": 0.0, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7105001314229366579&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of California, Berkeley;Google;Carnegie Mellon University;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";Google;;Electrical Engineering & Computer Science", "aff_unique_url": 
"https://www.berkeley.edu;https://www.google.com;https://www.cmu.edu;", "aff_unique_abbr": "UC Berkeley;Google;CMU;", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Berkeley;Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "zU2v47WF0Ku", "title": "Implicit Bias of Linear Equivariant Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Group equivariant convolutional neural networks (G-CNNs) are generalizations of convolutional neural networks (CNNs) which excel in a wide range of scientific and technical applications by explicitly encoding particular group symmetries, such as rotations and permutations, in their architectures. Although the success of G-CNNs is driven by the explicit symmetry bias of their convolutional architecture, a recent line of work has proposed that the implicit bias of training algorithms on a particular parameterization (or architecture) is key to understanding generalization for overparameterized neural nets. In this context, we show that $L$-layer full-width linear G-CNNs trained via gradient descent in a binary classification task converge to solutions with low-rank Fourier matrix coefficients, regularized by the $2/L$-Schatten matrix norm. Our work strictly generalizes previous analysis on the implicit bias of linear CNNs to linear G-CNNs over all finite groups, including the challenging setting of non-commutative symmetry groups (such as permutations). We validate our theorems via experiments on a variety of groups and empirically explore more realistic nonlinear networks, which locally capture similar regularization patterns. Finally, we provide intuitive interpretations of our Fourier-space implicit regularization results in real space via uncertainty principles.", "keywords": "Implicit bias;equivariance;deep learning;linear networks;convolution;CNN", "primary_area": "", "supplementary_material": "/attachment/df096dad88ee80305fcdd67c62ebacdb6a280ebe.zip", "author": "Hannah Lawrence;Kristian Georgiev;Andrew Dienes;Bobak Kiani", "authorids": "~Hannah_Lawrence1;~Kristian_Georgiev1;adienes@mit.edu;~Bobak_Kiani1", "gender": "F;;;", "homepage": "https://hannahlawrence.github.io/;;;", "dblp": "251/5474;;;232/4086", "google_scholar": ";;;", "orcid": ";;;", "linkedin": "hannah-lawrence-417b5a130/;;;bobak-kiani", "or_profile": "~Hannah_Lawrence1;~Kristian_Georgiev1;adienes@mit.edu;~Bobak_Kiani1", "aff": "Massachusetts Institute of Technology;;;Massachusetts Institute of Technology", "aff_domain": "mit.edu;;;mit.edu", "position": "PhD student;;;PhD student", "bibtex": "@misc{\nlawrence2022implicit,\ntitle={Implicit Bias of Linear Equivariant Networks},\nauthor={Hannah Lawrence and Kristian Georgiev and Andrew Dienes and Bobak Kiani},\nyear={2022},\nurl={https://openreview.net/forum?id=zU2v47WF0Ku}\n}", "github": "", "project": "", "reviewers": "k7aV;HWqt;nEDA;Mfmn", "site": "https://openreview.net/forum?id=zU2v47WF0Ku", "pdf_size": 0, "recommendation": "5;6;6;6", "confidence": "4;3;2;2", "correctness": "4;3;4;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "1;2;2;2", "wc_summary_paper": "89;292;60;95", "wc_summary_review": "127;73;142;49", "wc_main_review": "354;322;306;177", "wc_review": "570;687;508;321", "wc_reply_reviewers": "0;55;64;37", "wc_reply_authors": "716;553;1025;883", "reply_reviewers": "0;1;1;1", "reply_authors": "1;1;2;2", "recommendation_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "correctness_avg": [ 3.5, 0.5 ], 
"technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 1.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 134.0, 92.17646120349815 ], "wc_summary_review_avg": [ 97.75, 38.08789177678386 ], "wc_main_review_avg": [ 289.75, 67.35122493318143 ], "wc_review_avg": [ 521.5, 132.4056267686536 ], "wc_reply_reviewers_avg": [ 39.0, 24.525496936861444 ], "wc_reply_authors_avg": [ 794.25, 177.0923134977913 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -0.8703882797784891, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5414336386133292832&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On the relation between statistical learning and perceptual distances", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6765", "id": "zXM0b4hi5_B", "poster": "", "openreview": "https://openreview.net/forum?id=zXM0b4hi5_B", "slides": "https://iclr.cc/virtual/2022/poster/6765", "video": "https://iclr.cc/virtual/2022/poster/6765", "author_site": "Alexander Hepburn, Valero Laparra, Raul Santos-Rodriguez, Johannes Ball\u00e9, Jesus Malo", "tldr": "", "abstract": "It has been demonstrated many times that the behavior of the human visual system is connected to the statistics of natural images. Since machine learning relies on the statistics of training data as well, the above connection has interesting implications when using perceptual distances (which mimic the behavior of the human visual system) as a loss function. In this paper, we aim to unravel the non-trivial relationships between the probability distribution of the data, perceptual distances, and unsupervised machine learning. To this end, we show that perceptual sensitivity is correlated with the probability of an image in its close neighborhood. We also explore the relation between distances induced by autoencoders and the probability distribution of the training data, as well as how these induced distances are correlated with human perception. Finally, we find perceptual distances do not always lead to noticeable gains in performance over Euclidean distance in common image processing tasks, except when data is scarce and the perceptual distance provides regularization. 
We propose this may be due to a double-counting effect of the image statistics, once in the perceptual distance and once in the training procedure.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Hepburn;Valero Laparra;Raul Santos-Rodriguez;Johannes Ball\u00e9;Jesus Malo", "authorids": "~Alexander_Hepburn2;~Valero_Laparra1;~Raul_Santos-Rodriguez1;~Johannes_Ball\u00e91;~Jesus_Malo1", "gender": "M;;;Non-Binary;M", "homepage": ";https://www.uv.es/lapeva/;https://www.bristol.ac.uk/people/person/Raul-Santos-Rodriguez-1d708791-ea39-4078-89e6-c5c81b8c1a22/;https://balle.io;http://isp.uv.es/excathedra.html", "dblp": ";;24/7253;84/4973;31/4807", "google_scholar": "CzZTCN4AAAAJ;dNt_xikAAAAJ;U_ldrLcAAAAJ;uKDe38UAAAAJ;https://scholar.google.es/citations?user=0pgrklEAAAAJ", "orcid": ";;0000-0001-9576-3905;0000-0003-0769-8985;0000-0002-5684-8591", "linkedin": ";;;;", "or_profile": "~Alexander_Hepburn2;~Valero_Laparra1;~Raul_Santos-Rodriguez1;~Johannes_Ball\u00e91;~Jesus_Malo1", "aff": "University of Bristol;Universitat de Val\u00e8ncia;University of Bristol;Google;Universitat de Valencia", "aff_domain": "bristol.ac.uk;uv.es;bristol.ac.uk;google.com;uv.es", "position": "Postdoc;Postdoc;Associate Professor;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nhepburn2022on,\ntitle={On the relation between statistical learning and perceptual distances},\nauthor={Alexander Hepburn and Valero Laparra and Raul Santos-Rodriguez and Johannes Ball{\\'e} and Jesus Malo},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zXM0b4hi5_B}\n}", "github": "", "project": "", "reviewers": "CZqT;2c14;z8xf;Rmvg", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;4;5;2", "correctness": "4;3;2;4", "technical_novelty": "1;3;2;4", "empirical_novelty": "2;3;2;3", "wc_summary_paper": "51;95;12;61", "wc_summary_review": "15;100;5;32", "wc_main_review": "273;700;340;258", "wc_review": "339;895;357;351", "wc_reply_reviewers": "432;0;102;0", "wc_reply_authors": "696;824;1139;345", "reply_reviewers": "1;0;1;0", "reply_authors": "2;2;3;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "correctness_avg": [ 3.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.5, 1.118033988749895 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 54.75, 29.583568074186047 ], "wc_summary_review_avg": [ 38.0, 37.074249823833256 ], "wc_main_review_avg": [ 392.75, 180.05745610776577 ], "wc_review_avg": [ 485.5, 236.5137416726563 ], "wc_reply_reviewers_avg": [ 133.5, 177.29847715082045 ], "wc_reply_authors_avg": [ 751.0, 284.4881368352642 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 2.0, 0.7071067811865476 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.9428090415820632, "corr_recommendation_correctness": 0.5222329678670935, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=883558073858578374&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=zXM0b4hi5_B", "email": "bristol.ac.uk;uv.es;bristol.ac.uk;google.com;uv.es", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "University of Bristol;Universitat de Val\u00e8ncia;Google;University of Valencia", "aff_unique_dep": ";;Google;", "aff_unique_url": "https://www.bristol.ac.uk;https://www.uv.es;https://www.google.com;https://www.uv.es", "aff_unique_abbr": 
"Bristol;UV;Google;UV", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;2;1", "aff_country_unique": "United Kingdom;Spain;United States" }, { "id": "zXne1klXIQ", "title": "Improving Out-of-Distribution Robustness via Selective Augmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning algorithms typically assume that training and test examples are drawn from the same distribution. However, distribution shifts is a common problem in real-world applications and can cause models to perform dramatically worse at test time. In this paper, we specifically consider the problems of domain shifts and subpopulation shifts, where learning invariant representations by aligning domain-specific representations or balancing the risks across domains with regularizers are popular solutions. However, designing regularizers that are suitable for diverse real-world datasets is challenging. Instead, we shed new light on addressing distribution shifts by directly eliminating domain-related spurious correlations with augmentation, leading to a simple technique based on mixup, called LISA (Learning Invariant Representations via Selective Augmentation). LISA selectively interpolates samples either with the same labels but different domains or with the same domain but different labels. Empirically, we study the effectiveness of LISA on nine benchmarks ranging from subpopulation shifts to domain shifts. The results indicate that LISA consistently outperforms other state-of-the-art methods with superior invariant representations. The empirical findings are further strengthened by our theoretical analysis.", "keywords": "out-of-distribution robustness;distribution shifts;selective data augmentation", "primary_area": "", "supplementary_material": "", "author": "Huaxiu Yao;Yu Wang;Sai Li;Linjun Zhang;Weixin Liang;James Zou;Chelsea Finn", "authorids": "~Huaxiu_Yao1;~Yu_Wang24;saili@ruc.edu.cn;~Linjun_Zhang1;~Weixin_Liang1;~James_Zou1;~Chelsea_Finn1", "gender": "M;M;;M;;;F", "homepage": "http://huaxiuyao.mystrikingly.com;https://wangyu-ustc.github.io/;;;https://ai.stanford.edu/~wxliang/;;https://ai.stanford.edu/~cbfinn/", "dblp": "197/1635;;;;231/1803;;131/1783", "google_scholar": "A20BZnQAAAAJ;https://scholar.google.com/citations?hl=en;;TUAzs3sAAAAJ;7z9P1jYAAAAJ;23ZXZvEAAAAJ;vfPE6hgAAAAJ", "orcid": ";;;;;;", "linkedin": "huaxiuyao/;;;;weixin-liang-2562aa154/;;", "or_profile": "~Huaxiu_Yao1;~Yu_Wang24;saili@ruc.edu.cn;~Linjun_Zhang1;~Weixin_Liang1;~James_Zou1;~Chelsea_Finn1", "aff": "Computer Science Department, Stanford University;University of Science and Technology of China;;Rutgers University;Stanford University;Stanford University;Google", "aff_domain": "cs.stanford.edu;ustc.edu.cn;;rutgers.edu;stanford.edu;stanford.edu;google.com", "position": "Postdoc;Undergrad student;;Assistant Professor;PhD student;Assistant Professor;Research Scientist", "bibtex": "@misc{\nyao2022improving,\ntitle={Improving Out-of-Distribution Robustness via Selective Augmentation},\nauthor={Huaxiu Yao and Yu Wang and Sai Li and Linjun Zhang and Weixin Liang and James Zou and Chelsea Finn},\nyear={2022},\nurl={https://openreview.net/forum?id=zXne1klXIQ}\n}", "github": "", "project": "", "reviewers": "4YgJ;VX2N;iP5p", "site": "https://openreview.net/forum?id=zXne1klXIQ", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "3;4;4", "correctness": "2;3;4", "technical_novelty": "3;2;2", "empirical_novelty": "3;3;3", "wc_summary_paper": "99;154;43", 
"wc_summary_review": "95;35;48", "wc_main_review": "726;340;139", "wc_review": "920;529;230", "wc_reply_reviewers": "386;277;0", "wc_reply_authors": "1889;1350;331", "reply_reviewers": "1;2;0", "reply_authors": "4;5;1", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 98.66666666666667, 45.31617322276403 ], "wc_summary_review_avg": [ 59.333333333333336, 25.77250904010361 ], "wc_main_review_avg": [ 401.6666666666667, 243.57659073801725 ], "wc_review_avg": [ 559.6666666666666, 282.5247285145536 ], "wc_reply_reviewers_avg": [ 221.0, 162.48281960461748 ], "wc_reply_authors_avg": [ 1190.0, 646.0345707983952 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.3333333333333335, 1.699673171197595 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": 0.4999999999999999, "corr_recommendation_correctness": 0.8660254037844385, "gs_citation": 235, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4894079975600009568&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "Stanford University;University of Science and Technology of China;Rutgers University;Google", "aff_unique_dep": "Computer Science Department;;;Google", "aff_unique_url": "https://www.stanford.edu;http://www.ustc.edu.cn;https://www.rutgers.edu;https://www.google.com", "aff_unique_abbr": "Stanford;USTC;Rutgers;Google", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "id": "z_gX7gZe2cV", "title": "Membership Inference Attack in Face of Data Transformations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Membership inference attacks (MIAs) on machine learning models, which try to infer whether an example is in the training dataset of a target model, are widely studied in recent years as data privacy attracts increasing attention. One unignorable problem in the traditional MIA threat model is that it assumes the attacker can obtain exactly the same example as in the training dataset. In reality, however, the attacker is more likely to collect only a transformed version of the original example. For instance, the attacker may download a down-scaled image from a website, while the smaller image has the same content as the original image used for model training. Generally, after transformations that would not affect its semantics, a transformed training member should still be treated the same as the original one regarding privacy leakage. In this paper, we propose extending the concept of MIAs into more realistic scenarios by considering data transformations and derive two MIAs for transformed examples: one follows the efficient loss-thresholding ideas, and the other tries to approximately reverse the transformations. We demonstrated the effectiveness of our attacks by extensive evaluations on multiple common data transformations and comparison with other state-of-the-art attacks. Moreover, we also studied the coverage difference between our two attacks to show their limitations and advantages. 
", "keywords": "Membership inference attack;Data transformation;Data privacy", "primary_area": "", "supplementary_material": "/attachment/7e2aed8480e7fd88e2283e2867626b86e18330c3.zip", "author": "Jiyu Chen;Yiwen Guo;Hao Chen", "authorids": "~Jiyu_Chen2;~Yiwen_Guo1;~Hao_Chen5", "gender": "M;;", "homepage": ";;https://www.cs.ucdavis.edu/~hchen/", "dblp": ";;86/475-3", "google_scholar": "3_ruMwoAAAAJ;;1Aa3qxIAAAAJ", "orcid": "0000-0002-0144-6376;;0000-0002-4072-0710", "linkedin": "jiyu-chen/;;", "or_profile": "~Jiyu_Chen2;~Yiwen_Guo1;~Hao_Chen5", "aff": "University of California, Davis;;University of California, Davis", "aff_domain": "ucdavis.edu;;ucdavis.edu", "position": "PhD student;;Full Professor", "bibtex": "@misc{\nchen2022membership,\ntitle={Membership Inference Attack in Face of Data Transformations},\nauthor={Jiyu Chen and Yiwen Guo and Hao Chen},\nyear={2022},\nurl={https://openreview.net/forum?id=z_gX7gZe2cV}\n}", "github": "", "project": "", "reviewers": "Bxi3;prYk;jwwC;zKwj", "site": "https://openreview.net/forum?id=z_gX7gZe2cV", "pdf_size": 0, "recommendation": "3;3;3;3", "confidence": "3;4;4;5", "correctness": "2;1;3;3", "technical_novelty": "2;2;2;2", "empirical_novelty": "2;2;2;2", "wc_summary_paper": "105;71;93;88", "wc_summary_review": "51;102;46;43", "wc_main_review": "406;786;282;357", "wc_review": "562;959;421;488", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "349;393;157;203", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 2.25, 0.82915619758885 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 89.25, 12.214233500306108 ], "wc_summary_review_avg": [ 60.5, 24.12985702402731 ], "wc_main_review_avg": [ 457.75, 194.5923623886611 ], "wc_review_avg": [ 607.5, 208.97667333939452 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 275.5, 98.11600277222875 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6878702450107836117&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Davis", "aff_unique_dep": "", "aff_unique_url": "https://www.ucdavis.edu", "aff_unique_abbr": "UC Davis", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "zaALYtvbRlH", "title": "SpanDrop: Simple and Effective Counterfactual Learning for Long Sequences", "track": "main", "status": "Reject", "tldr": "", "abstract": "Distilling supervision signal from a long sequence to make predictions is a challenging task in machine learning, especially when not all elements in the input sequence contribute equally to the desired output. In this paper, we propose SpanDrop, a simple and effective data augmentation technique that helps models identify the true supervision signal in a long sequence with very few examples. By directly manipulating the input sequence, SpanDrop randomly ablates parts of the sequence at a time and ask the model to perform the same task to emulate counterfactual learning and achieve input attribution. 
Based on theoretical analysis of its properties, we also propose a variant of SpanDrop based on the beta-Bernoulli distribution, which yields diverse augmented sequences while providing a learning objective that is more consistent with the original dataset. We demonstrate the effectiveness of SpanDrop on a set of carefully designed toy tasks, as well as various natural language processing tasks that require reasoning over long sequences to arrive at the correct answer, and show that it helps models improve performance both when data is scarce and abundant.", "keywords": "sequential data;sample efficiency;data augmentation", "primary_area": "", "supplementary_material": "/attachment/116a831580f17516c1370119908ef3bf2f8c8cea.zip", "author": "Peng Qi;Guangtao Wang;Jing Huang", "authorids": "~Peng_Qi1;~Guangtao_Wang1;~Jing_Huang3", "gender": ";M;F", "homepage": "https://qipeng.me;https://dblp.org/pers/hd/w/Wang:Guangtao;", "dblp": "59/9474-3.html;26/11029;14/4834-19", "google_scholar": "quJME0oAAAAJ;ga5aRGkAAAAJ;ocPXoIkAAAAJ", "orcid": ";;0000-0001-8769-9130", "linkedin": ";;jing-huang-935b0216/", "or_profile": "~Peng_Qi1;~Guangtao_Wang1;~Jing_Huang3", "aff": "JD AI Research;;Amazon", "aff_domain": "jd.com;;amazon.com", "position": "Researcher;;Principal Researcher", "bibtex": "@misc{\nqi2022spandrop,\ntitle={SpanDrop: Simple and Effective Counterfactual Learning for Long Sequences},\nauthor={Peng Qi and Guangtao Wang and Jing Huang},\nyear={2022},\nurl={https://openreview.net/forum?id=zaALYtvbRlH}\n}", "github": "", "project": "", "reviewers": "ykje;qHWi;DeC3;EEvq", "site": "https://openreview.net/forum?id=zaALYtvbRlH", "pdf_size": 0, "recommendation": "3;3;3;8", "confidence": "3;4;4;5", "correctness": "3;4;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;1;2;3", "wc_summary_paper": "76;121;55;64", "wc_summary_review": "59;34;10;37", "wc_main_review": "189;66;291;218", "wc_review": "324;221;356;319", "wc_reply_reviewers": "4;0;0;110", "wc_reply_authors": "624;265;453;964", "reply_reviewers": "1;0;0;2", "reply_authors": "1;1;1;2", "recommendation_avg": [ 4.25, 2.165063509461097 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 0.7071067811865476 ], "wc_summary_paper_avg": [ 79.0, 25.367301787931645 ], "wc_summary_review_avg": [ 35.0, 17.363755354185336 ], "wc_main_review_avg": [ 191.0, 81.17573529078749 ], "wc_review_avg": [ 305.0, 50.532167972490555 ], "wc_reply_reviewers_avg": [ 28.5, 47.08237462150778 ], "wc_reply_authors_avg": [ 576.5, 257.24356162982974 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.816496580927726, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_mm4vG-SsC8J:scholar.google.com/&scioq=SpanDrop:+Simple+and+Effective+Counterfactual+Learning+for+Long+Sequences&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "JD;Amazon", "aff_unique_dep": "JD AI Research;Amazon.com, Inc.", "aff_unique_url": "https://www.jd.com;https://www.amazon.com", "aff_unique_abbr": "JD AI;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "zbZL1s-pBF", "title": "Training-Free Robust Multimodal 
Learning via Sample-Wise Jacobian Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multimodal fusion emerges as an appealing technique to improve model performances on many tasks. Nevertheless, the robustness of such fusion methods is rarely addressed in the existing literature. In this paper, we are the first to propose a training-free robust late-fusion method by exploiting a conditional independence assumption and Jacobian regularization. Our key idea is to minimize the Frobenius norm of a Jacobian matrix, where the resulting optimization problem is relaxed to a tractable Sylvester equation. Furthermore, we provide a theoretical error bound of our method and some insights about the function of the extra modality. Several numerical experiments on AV-MNIST, RAVDESS, and VGGsound demonstrate the efficacy of our method under both adversarial attacks and random corruptions.", "keywords": "Jacobian Regularization;Robust Multimodal Learning", "primary_area": "", "supplementary_material": "", "author": "Zhengqi Gao;Sucheng Ren;Zihui Xue;Siting Li;Hang Zhao", "authorids": "~Zhengqi_Gao1;~Sucheng_Ren1;~Zihui_Xue1;~Siting_Li1;~Hang_Zhao1", "gender": "M;M;F;;M", "homepage": "http://zhengqigao.github.io/;https://oliverren.netlify.com/;https://zihuixue.github.io;;http://www.mit.edu/~hangzhao/", "dblp": "256/9403;270/9042;256/9549;;", "google_scholar": "igvvVY4AAAAJ;Hbf-SoAAAAAJ;JCV9BQ0AAAAJ;;DmahiOYAAAAJ", "orcid": ";;;;", "linkedin": "zhengqi-gao-729b51146/;;;;", "or_profile": "~Zhengqi_Gao1;~Sucheng_Ren1;~Zihui_Xue1;~Siting_Li1;~Hang_Zhao1", "aff": "Massachusetts Institute of Technology;South China University of Technology, Tsinghua University;University of Texas, Austin;;Tsinghua University", "aff_domain": "mit.edu;scut.edu.cn;utexas.edu;;tsinghua.edu.cn", "position": "PhD student;MS student;PhD student;;Assistant Professor", "bibtex": "@misc{\ngao2022trainingfree,\ntitle={Training-Free Robust Multimodal Learning via Sample-Wise Jacobian Regularization},\nauthor={Zhengqi Gao and Sucheng Ren and Zihui Xue and Siting Li and Hang Zhao},\nyear={2022},\nurl={https://openreview.net/forum?id=zbZL1s-pBF}\n}", "github": "", "project": "", "reviewers": "dv5k;oUrr;BBZL", "site": "https://openreview.net/forum?id=zbZL1s-pBF", "pdf_size": 0, "recommendation": "5;5;6", "confidence": "4;3;3", "correctness": "4;3;3", "technical_novelty": "3;2;3", "empirical_novelty": "3;2;3", "wc_summary_paper": "83;57;113", "wc_summary_review": "14;32;44", "wc_main_review": "292;173;320", "wc_review": "389;262;477", "wc_reply_reviewers": "149;0;0", "wc_reply_authors": "1117;703;1292", "reply_reviewers": "1;0;0", "reply_authors": "3;1;2", "recommendation_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 3.3333333333333335, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 84.33333333333333, 22.88133640230735 ], "wc_summary_review_avg": [ 30.0, 12.328828005937952 ], "wc_main_review_avg": [ 261.6666666666667, 63.73033465748909 ], "wc_review_avg": [ 376.0, 88.25342297422048 ], "wc_reply_reviewers_avg": [ 49.666666666666664, 70.23927359786371 ], "wc_reply_authors_avg": [ 1037.3333333333333, 246.96873396354357 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], 
"corr_recommendation_confidence": -0.4999999999999999, "corr_recommendation_correctness": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10592779566744796589&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Massachusetts Institute of Technology;South China University of Technology;University of Texas at Austin;Tsinghua University", "aff_unique_dep": ";;;", "aff_unique_url": "https://web.mit.edu;https://www.scut.edu.cn;https://www.utexas.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MIT;SCUT;UT Austin;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;China" }, { "id": "zc0YnpS90ug", "title": "On Exploring Node-feature and Graph-structure Diversities for Node Drop Graph Pooling", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Graph pooling is essential in learning effective graph-level representations. One mainstream type of graph pooling is node drop pooling, which preserves the nodes in graphs with top-k calculated significance scores. However, despite being commonly adopted, current node drop pooling methods generally ignore node diversity from the perspectives of node features and graph structures. Therefore, they still obtain graph-level representations suboptimally. To address the issue mentioned above, we propose a novel plug-and-play scheme, termed MID, using a \\textbf{M}ultidimensional score space with two score operations, \\textit{i.e.}, fl\\textbf{I}pscore and \\textbf{D}ropscore, to explore the node-feature and graph-structure diversities in graphs. Specifically, the multidimensional score space depicts the significance of nodes through multiple criteria; the flipsscore encourages the maintenance of dissimilar features, thus preserving the node-feature diversity; and the dropscore forces the model to notice diverse graph structures instead of being stuck in significant local structures. What is more, we evaluate our proposed MID by applying it to a variety of popular node drop pooling methods, including TopKPool, SAGPool, GSAPool, and ASAP. Extensive experiments on seventeen real-world graph classification datasets demonstrate that our proposed scheme efficiently and consistently brings over 2.8\\% improvements in average when using different backbone models and datasets. 
The datasets include FRANKENSTEIN, IMDB-B, IMDB-M, REDDIT-B, COLLAB from the social domain and D\\&D, PROTEINS, NCI1, MUTAG, PTC-MR, NCI109, ENZYMES, MUTAGENICITY, HIV, BBBP, TOXCAST, TOX21 from the biochemical domain.\\footnote{Code will be made publicly available at~\\url{http://github.com/xxx/xxx}.}", "keywords": "Graph Neural Networks;Graph Pooling;Graph Classification", "primary_area": "", "supplementary_material": "/attachment/a1e57ab0be646e3855697ecd7a8e3f0fe9aa018a.zip", "author": "Chuang Liu;Yibing Zhan;Baosheng Yu;Liu Liu;Bo Du;Wenbin Hu;Tongliang Liu", "authorids": "~Chuang_Liu2;~Yibing_Zhan2;~Baosheng_Yu1;~Liu_Liu8;~Bo_Du1;~Wenbin_Hu1;~Tongliang_Liu1", "gender": "M;;F;M;M;M;M", "homepage": "https://liuchuang0059.github.io/;https://dr.ntu.edu.sg/cris/rp/rp02563;;;https://cs.whu.edu.cn/info/1019/2886.htm;https://tongliang-liu.github.io/;", "dblp": "52/1800-8;178/8725;74/7037-14;70/6443-1.html;19/3768-1;150/6667;142/8486", "google_scholar": "hQzjzekAAAAJ;fjzIdMQAAAAJ;FvGjCqEAAAAJ;Shy1gnMAAAAJ;K3l1qnoAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-2377-2567;;;;;;", "linkedin": ";;;;;;", "or_profile": "~Chuang_Liu2;~Baosheng_Yu1;~Liu_Liu8;~Bo_Du1;~Wenbin_Hu1;~Tongliang_Liu1;~Yibing_Zhan1", "aff": "Wuhan University;The University of Sydney;University of Sydney;Wuhan University;Wuhan University;University of Sydney;JD Explore Academy", "aff_domain": "whu.edu;sydney.edu.au;sydney.edu.au;whu.edu.cn;whu.edu.cn;sydney.edu.au;jd.com", "position": "PhD student;Research Fellow;Postdoc;Full Professor;Full Professor;Lecturer;Researcher", "bibtex": "@misc{\nliu2022on,\ntitle={On Exploring Node-feature and Graph-structure Diversities for Node Drop Graph Pooling},\nauthor={Chuang Liu and Yibing Zhan and Baosheng Yu and Liu Liu and Bo Du and Wenbin Hu and Tongliang Liu},\nyear={2022},\nurl={https://openreview.net/forum?id=zc0YnpS90ug}\n}", "github": "", "project": "", "reviewers": "4RGR;BS4A;pPLX;U2Ym", "site": "https://openreview.net/forum?id=zc0YnpS90ug", "pdf_size": 0, "recommendation": "3;3;3;5", "confidence": "4;5;4;4", "correctness": "2;3;2;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "85;103;74;98", "wc_summary_review": "44;72;38;8", "wc_main_review": "505;592;231;609", "wc_review": "634;767;343;715", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 3.5, 0.8660254037844386 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 90.0, 11.335784048754634 ], "wc_summary_review_avg": [ 40.5, 22.73213584333861 ], "wc_main_review_avg": [ 484.25, 151.4420268617665 ], "wc_review_avg": [ 614.75, 163.89688069026818 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 7, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13162503979506047426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff_unique_index": "0;1;1;0;0;1;2", "aff_unique_norm": "Wuhan University;University of Sydney;JD", "aff_unique_dep": ";;JD Explore Academy", "aff_unique_url": 
"http://www.whu.edu.cn/;https://www.sydney.edu.au;", "aff_unique_abbr": "WHU;USYD;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;1", "aff_country_unique": "China;Australia;" }, { "id": "zdpZyJ7xu4", "title": "FOCUS: Familiar Objects in Common And Uncommon Settings", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Standard training datasets for deep learning often contain objects in common settings (e.g., \"a horse on grass\" or \"a ship in water\") since they are usually collected by randomly scraping the web. Uncommon and rare settings (e.g., \"a plane on water\", \"a car in snowy weather\") are thus severely under-represented in the training data. This can lead to an undesirable bias in model predictions towards common settings and create a false sense of accuracy. In this paper, we introduce FOCUS (Familiar Objects in Common and Uncommon Settings), a dataset for stress-testing the generalization power of deep image classifiers. By leveraging the power of modern search engines, we deliberately gather data containing objects in common and uncommon settings in a wide range of locations, weather conditions, and time of day. We present a detailed analysis of the performance of various popular image classifiers on our dataset and demonstrate a clear drop in performance when classifying images in uncommon settings. By analyzing deep features of these models, we show that such errors can be due to the use of spurious features in model predictions. We believe that our dataset will aid researchers in understanding the inability of deep models to generalize well to uncommon settings and drive future work on improving their distributional robustness.", "keywords": "focus;dataset;focus dataset;generalization;distributional robustness;robustness", "primary_area": "", "supplementary_material": "", "author": "Priyatham Kattakinda;Soheil Feizi", "authorids": "~Priyatham_Kattakinda1;~Soheil_Feizi2", "gender": "M;M", "homepage": "https://priyathamkat.com/;https://www.cs.umd.edu/~sfeizi/", "dblp": ";57/2132", "google_scholar": "D9ebp-YAAAAJ;lptAmrMAAAAJ", "orcid": ";", "linkedin": "priyathamkat/;", "or_profile": "~Priyatham_Kattakinda1;~Soheil_Feizi2", "aff": "University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu", "position": "PhD student;Assistant Professor", "bibtex": "@misc{\nkattakinda2022focus,\ntitle={{FOCUS}: Familiar Objects in Common And Uncommon Settings},\nauthor={Priyatham Kattakinda and Soheil Feizi},\nyear={2022},\nurl={https://openreview.net/forum?id=zdpZyJ7xu4}\n}", "github": "", "project": "", "reviewers": "dCkJ;bAdY;9mNL", "site": "https://openreview.net/forum?id=zdpZyJ7xu4", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;4;4", "correctness": "4;4;3", "technical_novelty": "2;2;1", "empirical_novelty": "2;2;3", "wc_summary_paper": "89;73;195", "wc_summary_review": "61;43;51", "wc_main_review": "863;135;600", "wc_review": "1013;251;846", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 119.0, 54.135632135098106 ], "wc_summary_review_avg": [ 51.666666666666664, 7.363574011458174 ], "wc_main_review_avg": [ 
532.6666666666666, 300.9942782771054 ], "wc_review_avg": [ 703.3333333333334, 327.03346753641114 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2485805129814216346&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www.umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "zeGpMIt6Pfq", "title": "BioLCNet: Reward-modulated Locally Connected Spiking Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent studies have shown that convolutional neural networks (CNNs) are not the only feasible solution for image classification. Furthermore, weight sharing and backpropagation used in CNNs do not correspond to the mechanisms present in the primate visual system. To propose a more biologically plausible solution, we designed a locally connected spiking neural network (SNN) trained using spike-timing-dependent plasticity (STDP) and its reward-modulated variant (R-STDP) learning rules. The use of spiking neurons and local connections along with reinforcement learning (RL) led us to the nomenclature BioLCNet for our proposed architecture. Our network consists of a rate-coded input layer followed by a locally connected hidden layer and a decoding output layer. A spike population-based voting scheme is adopted for decoding in the output layer. 
We used the MNIST dataset to obtain image classification accuracy and to assess the robustness of our rewarding system to varying target responses.", "keywords": "Spiking Neural Networks;Locally Connected Networks;Spike-timing-dependent plasticity;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/2e7a516419527caa12a0ff07c8eebd9f6eea9ad4.zip", "author": "Hafez Ghaemi;Erfan Mirzaei;Mahbod Nouri;Saeed Reza Kheradpisheh", "authorids": "~Hafez_Ghaemi1;~Erfan_Mirzaei1;~Mahbod_Nouri1;~Saeed_Reza_Kheradpisheh1", "gender": "Not Specified;;M;M", "homepage": "https://hafezgh.github.io/;;https://mahbodnouri.com/;http://en.sbu.ac.ir/Pages/Profiles.aspx?proffID=397183", "dblp": "282/0049;;;135/7980", "google_scholar": "JCLX6oYAAAAJ;;;ZisNRVMAAAAJ", "orcid": "0000-0001-6326-5258;;0000-0001-7906-1110;0000-0001-6168-4379", "linkedin": "hafez-ghaemi-618b8287/;;mahbodnr;saeed-reza-kheradpisheh-7a0b18155/", "or_profile": "~Hafez_Ghaemi1;~Erfan_Mirzaei1;~Mahbod_Nouri1;~Saeed_Reza_Kheradpisheh1", "aff": "Politecnico di Torino;;University of Edinburgh;Shahid Beheshti University", "aff_domain": "polito.it;;ed.ac.uk;sbu.ac.ir", "position": "MS student;;MS student;Assistant Professor", "bibtex": "@misc{\nghaemi2022biolcnet,\ntitle={Bio{LCN}et: Reward-modulated Locally Connected Spiking Neural Networks},\nauthor={Hafez Ghaemi and Erfan Mirzaei and Mahbod Nouri and Saeed Reza Kheradpisheh},\nyear={2022},\nurl={https://openreview.net/forum?id=zeGpMIt6Pfq}\n}", "github": "", "project": "", "reviewers": "Urvj;BWum;b4gF", "site": "https://openreview.net/forum?id=zeGpMIt6Pfq", "pdf_size": 0, "recommendation": "3;3;6", "confidence": "4;4;3", "correctness": "2;3;3", "technical_novelty": "2;2;3", "empirical_novelty": "2;2;2", "wc_summary_paper": "83;52;105", "wc_summary_review": "78;48;107", "wc_main_review": "296;704;847", "wc_review": "457;804;1059", "wc_reply_reviewers": "338;137;140", "wc_reply_authors": "806;1107;726", "reply_reviewers": "1;1;1", "reply_authors": "2;2;1", "recommendation_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.3333333333333335, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.0, 0.0 ], "wc_summary_paper_avg": [ 80.0, 21.740898478827106 ], "wc_summary_review_avg": [ 77.66666666666667, 24.087802353519553 ], "wc_main_review_avg": [ 615.6666666666666, 233.45568220875575 ], "wc_review_avg": [ 773.3333333333334, 246.72026444718497 ], "wc_reply_reviewers_avg": [ 205.0, 94.05317644821997 ], "wc_reply_authors_avg": [ 879.6666666666666, 164.03319447260938 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": -1.0, "corr_recommendation_correctness": 0.5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10423793964201130191&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2", "aff_unique_norm": "Politecnico di Torino;University of Edinburgh;Shahid Beheshti University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polito.it;https://www.ed.ac.uk;https://www.sbu.ac.ir", "aff_unique_abbr": "Polito;Edinburgh;SBU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Italy;United Kingdom;Iran" }, { "id": "zfKQn4zN6sB", "title": "$\\ell_\\infty$-Robustness and Beyond: 
Unleashing Efficient Adversarial Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Neural networks are vulnerable to adversarial attacks: adding well-crafted, imperceptible perturbations to their input can modify their output. Adversarial training is one of the most effective approaches in training robust models against such attacks. However, it is much slower than vanilla training of neural networks since it needs to construct adversarial examples for the entire training data at every iteration, which has hampered its effectiveness. Recently, Fast Adversarial Training was proposed, which can obtain robust models within minutes. However, the reasons behind its success are not fully understood, and more importantly, it can only train robust models for $\\ell_\\infty$-bounded attacks as it uses FGSM during training. In this paper, by leveraging the theory of coreset selection, we show how selecting a small subset of training data provides a more principled approach towards reducing the time complexity of robust training. Unlike Fast Adversarial Training, our approach can be adapted to a wide variety of training objectives, including TRADES, $\\ell_p$-PGD, and Perceptual Adversarial Training. Our experimental results indicate that using coreset selection, one can train robust models 2-3 times faster while maintaining the clean and robust accuracy almost intact.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hadi Mohaghegh Dolatabadi;Sarah Monazam Erfani;Christopher Leckie", "authorids": "~Hadi_Mohaghegh_Dolatabadi2;~Sarah_Monazam_Erfani1;~Christopher_Leckie1", "gender": "M;;", "homepage": "https://hmdolatabadi.github.io/;https://people.eng.unimelb.edu.au/smonazam/;", "dblp": "250/0584;136/0170;73/1139", "google_scholar": "0ePftHMAAAAJ;https://scholar.google.com.au/citations?user=Jq9ocx4AAAAJ;https://scholar.google.com.au/citations?user=wUsI0cAAAAAJ", "orcid": "0000-0001-9418-1487;;", "linkedin": "hadi-mohaghegh-dolatabadi/;;", "or_profile": "~Hadi_Mohaghegh_Dolatabadi2;~Sarah_Monazam_Erfani1;~Christopher_Leckie1", "aff": "Amazon;University of Melbourne;The University of Melbourne", "aff_domain": "amazon.com;unimelb.edu;unimelb.edu.au", "position": "Applied Scientist Intern;Assistant Professor;Full Professor", "bibtex": "@misc{\ndolatabadi2022ellinftyrobustness,\ntitle={\\${\\textbackslash}ell\\_{\\textbackslash}infty\\$-Robustness and Beyond: Unleashing Efficient Adversarial Training},\nauthor={Hadi Mohaghegh Dolatabadi and Sarah Monazam Erfani and Christopher Leckie},\nyear={2022},\nurl={https://openreview.net/forum?id=zfKQn4zN6sB}\n}", "github": "", "project": "", "reviewers": "nbUE;TjuH;Ef11;gRXi", "site": "https://openreview.net/forum?id=zfKQn4zN6sB", "pdf_size": 0, "recommendation": "3;3;5;6", "confidence": "5;5;5;5", "correctness": "3;3;3;4", "technical_novelty": "2;2;2;3", "empirical_novelty": "2;0;2;4", "wc_summary_paper": "63;35;60;81", "wc_summary_review": "30;29;42;154", "wc_main_review": "316;283;498;706", "wc_review": "409;347;600;941", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 4.25, 1.299038105676658 ], "confidence_avg": [ 5.0, 0.0 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.0, 1.4142135623730951 ], "wc_summary_paper_avg": [ 59.75, 16.391689967785506 ], "wc_summary_review_avg": [ 63.75, 52.356351095163234 ], "wc_main_review_avg": [ 
450.75, 168.5843631538821 ], "wc_review_avg": [ 574.25, 231.36483635159428 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.7777777777777777, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;University of Melbourne", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.unimelb.edu.au", "aff_unique_abbr": "Amazon;UniMelb", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Australia" }, { "title": "How Much Can CLIP Benefit Vision-and-Language Tasks?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6845", "id": "zf_Ll3HZWgy", "poster": "", "openreview": "https://openreview.net/forum?id=zf_Ll3HZWgy", "slides": "https://iclr.cc/virtual/2022/poster/6845", "video": "https://iclr.cc/virtual/2022/poster/6845", "author_site": "Sheng Shen, Liunian Li, Hao Tan, Mohit Bansal, Anna Rohrbach, Kai-Wei Chang, Zhewei Yao, Kurt Keutzer", "tldr": "", "abstract": "Most existing Vision-and-Language (V&L) models rely on pre-trained visual encoders, using a relatively small set of manually-annotated data (as compared to web-crawled data), to perceive the visual world. However, it has been observed that large-scale pretraining usually can result in better generalization performance, e.g., CLIP (Contrastive Language-Image Pre-training), trained on a massive amount of image-caption pairs, has shown a strong zero-shot capability on various vision tasks. To further study the advantage brought by CLIP, we propose to use CLIP as the visual encoder in various V&L models in two typical scenarios: 1) plugging CLIP into task-specific fine-tuning; 2) combining CLIP with V&L pre-training and transferring to downstream tasks. We show that CLIP significantly outperforms widely-used visual encoders trained with in-domain annotated data, such as BottomUp-TopDown. 
We achieve competitive or better results on diverse V&L tasks, while establishing new state-of-the-art results on Visual Question Answering, Visual Entailment, and V&L Navigation tasks.\n", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/6be978f711b9dc6cc4892422e52cf8626d5a7ef7.zip", "author": "Sheng Shen;Liunian Harold Li;Hao Tan;Mohit Bansal;Anna Rohrbach;Kai-Wei Chang;Zhewei Yao;Kurt Keutzer", "authorids": "~Sheng_Shen2;~Liunian_Harold_Li1;~Hao_Tan1;~Mohit_Bansal2;~Anna_Rohrbach1;~Kai-Wei_Chang1;~Zhewei_Yao1;~Kurt_Keutzer3", "gender": "M;M;M;M;F;M;M;M", "homepage": "https://sincerass.github.io;;http://www.cs.unc.edu/~airsplay/;https://www.cs.unc.edu/~mbansal/;https://anna-rohrbach.net/;http://kwchang.net;;https://people.eecs.berkeley.edu/~keutzer/", "dblp": "138/5764-1.html;236/6323;94/877-2;32/5243.html;152/5114;18/2428;195/2887;k/KurtKeutzer.html", "google_scholar": "https://scholar.google.com/citations?hl=en;ntbhn9UAAAAJ;OV1Y3FUAAAAJ;DN8QtscAAAAJ;https://scholar.google.de/citations?user=GHpxNQIAAAAJ;fqDBtzYAAAAJ;gpSeMjYAAAAJ;ID9QePIAAAAJ", "orcid": ";;;;0000-0003-1161-6006;0000-0001-5365-0072;;0000-0003-3868-8501", "linkedin": "sheng-s-ab198a174/;;hao-tan-23677180/;;;kai-wei-chang-41239040;;kurtkeutzer/", "or_profile": "~Sheng_Shen2;~Liunian_Harold_Li1;~Hao_Tan1;~Mohit_Bansal2;~Anna_Rohrbach1;~Kai-Wei_Chang1;~Zhewei_Yao1;~Kurt_Keutzer1", "aff": "University of California, Berkeley;University of California, Los Angeles;Adobe Systems;University of North Carolina at Chapel Hill;University of California, Berkeley;University of California, Los Angeles;Microsoft;University of California, Berkeley", "aff_domain": "berkeley.edu;cs.ucla.edu;adobe.com;unc.edu;berkeley.edu;ucla.edu;microsoft.com;berkeley.edu", "position": "PhD student;PhD student;Research Scientist;Full Professor;Research Scientist;Assistant Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nshen2022how,\ntitle={How Much Can {CLIP} Benefit Vision-and-Language Tasks?},\nauthor={Sheng Shen and Liunian Harold Li and Hao Tan and Mohit Bansal and Anna Rohrbach and Kai-Wei Chang and Zhewei Yao and Kurt Keutzer},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zf_Ll3HZWgy}\n}", "github": "", "project": "", "reviewers": "jbK4;ZXD1;Rnvh;EUMT", "pdf_size": 0, "recommendation": "5;6;6;8", "confidence": "5;4;4;4", "correctness": "4;3;3;4", "technical_novelty": "1;3;2;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "79;54;86;86", "wc_summary_review": "38;70;105;53", "wc_main_review": "243;327;310;195", "wc_review": "360;451;501;334", "wc_reply_reviewers": "418;202;0;0", "wc_reply_authors": "771;614;616;122", "reply_reviewers": "1;1;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.25, 0.82915619758885 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 76.25, 13.160072188251856 ], "wc_summary_review_avg": [ 66.5, 24.944939366532843 ], "wc_main_review_avg": [ 268.75, 52.9073482608985 ], "wc_review_avg": [ 411.5, 67.5074070010099 ], "wc_reply_reviewers_avg": [ 155.0, 172.7917822120022 ], "wc_reply_authors_avg": [ 530.75, 244.43544648843383 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.6622661785325219, "corr_recommendation_correctness": 
0.2294157338705618, "gs_citation": 494, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6434466912782408523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=zf_Ll3HZWgy", "email": "berkeley.edu;cs.ucla.edu;adobe.com;unc.edu;berkeley.edu;ucla.edu;microsoft.com;berkeley.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;0;1;4;0", "aff_unique_norm": "University of California, Berkeley;University of California, Los Angeles;Adobe;University of North Carolina;Microsoft", "aff_unique_dep": ";;Adobe Systems Incorporated;;Microsoft Corporation", "aff_unique_url": "https://www.berkeley.edu;https://www.ucla.edu;https://www.adobe.com;https://www.unc.edu;https://www.microsoft.com", "aff_unique_abbr": "UC Berkeley;UCLA;Adobe;UNC;Microsoft", "aff_campus_unique_index": "0;1;3;0;1;0", "aff_campus_unique": "Berkeley;Los Angeles;;Chapel Hill", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "zfmB5vgfaCt", "title": "TransSlowDown: Efficiency Attacks on Neural Machine Translation Systems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural machine translation (NMT) systems have received massive attention from academia and industry. Despite a rich set of work focusing on improving NMT systems\u2019 accuracy, the less explored topic of efficiency is also important to NMT systems because of the real-time demand of translation applications. In this paper, we observe an inherent property of the NMT system, that is, NMT systems\u2019 efficiency is related to the output length instead of the input length. Such property results in a new attack surface of the NMT system\u2014an adversary can slightly changing inputs to incur a significant amount of redundant computations in NMT systems. Such abuse of NMT systems\u2019 computational resources is analogous to denial-of-service attacks. Abuse of NMT systems\u2019 computing resources will affect the service quality (e.g., prolong response to users\u2019 translation requests) and even make the translation service unavailable (e.g., running out of resources such as batteries of mobile devices). To further the understanding of such efficiency-oriented threats and raise the community\u2019s concern on the efficiency robustness of NMT systems, we propose a new attack approach, TranSlowDown, to test the efficiency robustness of NMT systems. To demonstrate the effectiveness of TranSlowDown, we conduct a systematic evaluation on three public-available NMT systems: Google T5, Facebook Fairseq, and Helsinki-NLP translator. The experimental results show that TranSlowDown increases NMT systems\u2019 response latency up to 1232%and 1056% on Intel CPU and Nvidia GPU respectively by inserting only three characters into existing input sentences. Our results also show that the adversarial examples generated byTranSlowDowncan consume more than 30 times battery power than the original benign example. 
Such results suggest that further research is required for protecting NMT systems against efficiency-oriented threats.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/cfc8d864c8a6c2f1777f4745f8c5fd4b763f160c.zip", "author": "Simin Chen;Mirazul Haque;Zihe Song;Cong Liu;Wei Yang", "authorids": "~Simin_Chen1;~Mirazul_Haque1;~Zihe_Song1;~Cong_Liu2;~Wei_Yang7", "gender": ";M;F;;", "homepage": "http://seekingdream.github.io/;https://www.linkedin.com/in/mirazul-haque-b4b331a6/;https://zihesong.github.io/;https://intra.ece.ucr.edu/~cong/;", "dblp": ";272/0796;;https://dblp.uni-trier.de/pers/l/Liu_0005:Cong.html;", "google_scholar": "bgCd-_YAAAAJ;1YLCVDgAAAAJ;https://scholar.google.com/citations?;vpc4bggAAAAJ;", "orcid": ";;;;", "linkedin": ";mirazul-haque-b4b331a6;;;", "or_profile": "~Simin_Chen1;~Mirazul_Haque1;~Zihe_Song1;~Cong_Liu2;~Wei_Yang7", "aff": "University of Texas at Dallas ;University of Texas at Dallas;University of Texas, Dallas;University of Texas, Dallas;", "aff_domain": "utdallas.edu;utdallas.edu;utdallas.edu;utdallas.edu;", "position": "PhD student;PhD student;PhD student;Associate Professor;", "bibtex": "@misc{\nchen2022transslowdown,\ntitle={TransSlowDown: Efficiency Attacks on Neural Machine Translation Systems},\nauthor={Simin Chen and Mirazul Haque and Zihe Song and Cong Liu and Wei Yang},\nyear={2022},\nurl={https://openreview.net/forum?id=zfmB5vgfaCt}\n}", "github": "", "project": "", "reviewers": "mDAN;Rkum;zCy9;cSUu", "site": "https://openreview.net/forum?id=zfmB5vgfaCt", "pdf_size": 0, "recommendation": "3;5;5;6", "confidence": "4;4;5;4", "correctness": "3;3;2;3", "technical_novelty": "3;2;3;4", "empirical_novelty": "3;2;3;3", "wc_summary_paper": "77;72;72;90", "wc_summary_review": "23;32;42;34", "wc_main_review": "178;310;204;310", "wc_review": "278;414;318;434", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "393;463;225;137", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "recommendation_avg": [ 4.75, 1.0897247358851685 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "correctness_avg": [ 2.75, 0.4330127018922193 ], "technical_novelty_avg": [ 3.0, 0.7071067811865476 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 77.75, 7.361215932167728 ], "wc_summary_review_avg": [ 32.75, 6.7592529172978875 ], "wc_main_review_avg": [ 250.5, 60.205896721168436 ], "wc_review_avg": [ 361.0, 64.95382975621992 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 304.5, 129.74108832594246 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.13245323570650439, "corr_recommendation_correctness": -0.13245323570650439, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6757347947866172942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Dallas", "aff_unique_dep": "", "aff_unique_url": "https://www.utdallas.edu", "aff_unique_abbr": "UT Dallas", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Dallas", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "zhynF6JnC4q", "title": "Adaptive Q-learning for Interaction-Limited Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conventional reinforcement learning (RL) needs an environment to collect fresh data, which is impractical when an online interaction is 
costly.\nOffline RL provides an alternative solution by directly learning from the logged dataset. However, it usually yields unsatisfactory performance due to a pessimistic update scheme and/or the low quality of logged datasets.\nMoreover, how to evaluate the policy under the offline setting is also a challenging problem.\nIn this paper, we propose a unified framework called Adaptive Q-learning for effectively taking advantage of offline and online learning.\nSpecifically, we explicitly consider the difference between the online and offline data and apply an adaptive update scheme accordingly, i.e., a pessimistic update strategy for the offline dataset and a greedy or non-pessimistic update scheme for the online dataset.\nWhen combining both, we can apply very limited online exploration steps to achieve expert performance even when the offline dataset is poor, e.g., a random dataset.\nSuch a framework provides a unified way to mix offline and online RL and gain the best of both worlds. \nTo understand our framework better, we then provide an initialization following our framework's setting.\nExtensive experiments are done to verify the effectiveness of our proposed method.", "keywords": "reinforcement learning;offline reinforcement learning;online-to-offline;limited interactions", "primary_area": "", "supplementary_material": "", "author": "Han Zheng;Xufang Luo;pengfei wei;Xuan Song;Dongsheng Li;Jing Jiang", "authorids": "~Han_Zheng2;~Xufang_Luo1;~pengfei_wei2;~Xuan_Song5;~Dongsheng_Li2;~Jing_Jiang6", "gender": ";F;M;;M;F", "homepage": ";;https://pengfei-wei.com/;;http://recmind.cn;https://www.uts.edu.au/staff/jing.jiang", "dblp": ";218/7350;29/11273-1;;254/0830-2.html;68/1974-2", "google_scholar": ";;https://scholar.google.com.sg/citations?user=a94WthkAAAAJ;;VNg5rA8AAAAJ;https://scholar.google.com.au/citations?hl=en", "orcid": ";;;;0000-0003-3103-8442;", "linkedin": ";;;;;", "or_profile": "~Han_Zheng2;~Xufang_Luo1;~pengfei_wei2;~Xuan_Song5;~Dongsheng_Li2;~Jing_Jiang6", "aff": ";Microsoft Research;AI LAB Bytedance;;Microsoft Research Asia;University of Technology Sydney", "aff_domain": ";microsoft.com;bytedance.com;;microsoft.com;uts.edu.au", "position": ";Researcher;Researcher;;Principal Researcher;Lecturer", "bibtex": "@misc{\nzheng2022adaptive,\ntitle={Adaptive Q-learning for Interaction-Limited Reinforcement Learning},\nauthor={Han Zheng and Xufang Luo and pengfei wei and Xuan Song and Dongsheng Li and Jing Jiang},\nyear={2022},\nurl={https://openreview.net/forum?id=zhynF6JnC4q}\n}", "github": "", "project": "", "reviewers": "irZQ;VmYw;qMWG;tsK8", "site": "https://openreview.net/forum?id=zhynF6JnC4q", "pdf_size": 0, "recommendation": "3;6;6;6", "confidence": "4;4;3;4", "correctness": "3;3;4;3", "technical_novelty": "2;3;2;2", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "18;72;100;97", "wc_summary_review": "2;71;84;49", "wc_main_review": "34;913;294;929", "wc_review": "54;1056;478;1075", "wc_reply_reviewers": "0;0;0;403", "wc_reply_authors": "362;1358;369;2203", "reply_reviewers": "0;0;0;1", "reply_authors": "1;2;1;4", "recommendation_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "correctness_avg": [ 3.25, 0.4330127018922193 ], "technical_novelty_avg": [ 2.25, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 71.75, 32.88141572377929 ], "wc_summary_review_avg": [ 51.5, 31.196954979612993 ], "wc_main_review_avg": [ 542.5, 389.5436432545139 ], "wc_review_avg": [ 665.75, 426.98616780874767 ], 
"wc_reply_reviewers_avg": [ 100.75, 174.50411886256438 ], "wc_reply_authors_avg": [ 1073.0, 767.994466125896 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14232224216315253857&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Microsoft;ByteDance;University of Technology Sydney", "aff_unique_dep": "Microsoft Research;AI LAB;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.bytedance.com;https://www.uts.edu.au", "aff_unique_abbr": "MSR;Bytedance;UTS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "United States;China;Australia" }, { "title": "Generalized rectifier wavelet covariance models for texture synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6762", "id": "ziRLU3Y2PN_", "poster": "", "openreview": "https://openreview.net/forum?id=ziRLU3Y2PN_", "slides": "https://iclr.cc/virtual/2022/poster/6762", "video": "https://iclr.cc/virtual/2022/poster/6762", "author_site": "Antoine Brochard, Sixin Zhang, St\u00e9phane Mallat", "tldr": "", "abstract": "State-of-the-art maximum entropy models for texture synthesis are built from statistics relying on image representations defined by convolutional neural networks (CNN). Such representations capture rich structures in texture images, outperforming wavelet-based representations in this regard. However, conversely to neural networks, wavelets offer meaningful representations, as they are known to detect structures at multiple scales (e.g. edges) in images. In this work, we propose a family of statistics built upon non-linear wavelet based representations, that can be viewed as a particular instance of a one-layer CNN, using a generalized rectifier non-linearity. These statistics significantly improve the visual quality of previous classical wavelet-based models, and allow one to produce syntheses of similar quality to state-of-the-art models, on both gray-scale and color textures. We further provide insights on memorization effects in these models. 
\n", "keywords": "texture synthesis;generative models;wavelets", "primary_area": "", "supplementary_material": "/attachment/7e21ac9cd6b43093299dc7d2417cf78f7adc2ae2.zip", "author": "Antoine Brochard;Sixin Zhang;St\u00e9phane Mallat", "authorids": "~Antoine_Brochard1;~Sixin_Zhang2;~St\u00e9phane_Mallat1", "gender": "M;M;M", "homepage": ";https://www.irit.fr/~Sixin.Zhang/;https://www.di.ens.fr/~mallat/", "dblp": ";116/3004;61/3978", "google_scholar": ";-cL9xWMAAAAJ;https://scholar.google.com.tw/citations?user=g_YTmSgAAAAJ", "orcid": "0000-0002-2809-4956;;", "linkedin": "antoine-brochard-813232128/;;", "or_profile": "~Antoine_Brochard1;~Sixin_Zhang2;~St\u00e9phane_Mallat1", "aff": ";Universtite Toulouse;", "aff_domain": ";irit.fr;", "position": ";Assistant Professor;", "bibtex": "@inproceedings{\nbrochard2022generalized,\ntitle={Generalized rectifier wavelet covariance models for texture synthesis},\nauthor={Antoine Brochard and Sixin Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=ziRLU3Y2PN_}\n}", "github": "", "project": "", "reviewers": "xfp4;Q8m3;Gwxo;bssY", "pdf_size": 0, "recommendation": "3;8;8;8", "confidence": "4;5;3;5", "correctness": "3;4;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;3", "wc_summary_paper": "61;69;120;35", "wc_summary_review": "43;41;48;34", "wc_main_review": "266;309;140;487", "wc_review": "370;419;308;556", "wc_reply_reviewers": "539;148;0;216", "wc_reply_authors": "1125;453;88;409", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;2", "recommendation_avg": [ 6.75, 2.165063509461097 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 71.25, 30.825111516424396 ], "wc_summary_review_avg": [ 41.5, 5.024937810560445 ], "wc_main_review_avg": [ 300.5, 124.30305708227775 ], "wc_review_avg": [ 413.25, 91.32188948986985 ], "wc_reply_reviewers_avg": [ 225.75, 196.99539969248013 ], "wc_reply_authors_avg": [ 518.75, 377.3104656645506 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.17407765595569782, "corr_recommendation_correctness": 1.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1160036386380312390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=ziRLU3Y2PN_", "email": ";irit.fr;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Universit\u00e9 Toulouse", "aff_unique_dep": "", "aff_unique_url": "https://www.univ-toulouse.fr", "aff_unique_abbr": "UT", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "id": "znpOLJUYGcA", "title": "Automatic Integration for Neural Temporal Point Process", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Integration lies at the heart of the temporal point process. Due to the intrinsic mathematical difficulty of symbolic integration, neural temporal point process models either constrain the intensity function to an integrable functional form or apply certain numerical methods. However, the former type of model has limited expressive power, and the latter type of model suffers additional numerical errors and high computational costs. 
In this paper, we introduce automatic integration with neural point process models, a new paradigm for efficient, closed-form nonparametric inference of temporal point processes characterized by any intensity function. We test the model against a variety of synthetic temporal point process datasets and show that the model can better capture inter-event intensity changes than state-of-the-art methods. We also identify certain model settings that would lead the MLE estimator for the temporal point process to be inconsistent.", "keywords": "point process", "primary_area": "", "supplementary_material": "", "author": "Zihao Zhou;Rose Yu", "authorids": "~Zihao_Zhou1;~Rose_Yu1", "gender": "M;F", "homepage": "http://zzhou.info;http://roseyu.com", "dblp": ";164/7314", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Zihao_Zhou1;~Rose_Yu1", "aff": "University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu", "position": "MS student;Assistant Professor", "bibtex": "@misc{\nzhou2022automatic,\ntitle={Automatic Integration for Neural Temporal Point Process},\nauthor={Zihao Zhou and Rose Yu},\nyear={2022},\nurl={https://openreview.net/forum?id=znpOLJUYGcA}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=znpOLJUYGcA", "pdf_size": 0, "recommendation": "", "confidence": "", "correctness": "", "technical_novelty": "", "empirical_novelty": "", "wc_summary_paper": "", "wc_summary_review": "", "wc_main_review": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "recommendation_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "correctness_avg": [ 0, 0 ], "technical_novelty_avg": [ 0, 0 ], "empirical_novelty_avg": [ 0, 0 ], "wc_summary_paper_avg": [ 0, 0 ], "wc_summary_review_avg": [ 0, 0 ], "wc_main_review_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_recommendation_confidence": 0, "corr_recommendation_correctness": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BkmTF9TtJzgJ:scholar.google.com/&scioq=Automatic+Integration+for+Neural+Temporal+Point+Process&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "zokEN0xOb0Q", "title": "Differential Privacy with Manifold Data Dependency", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we study dataset processing mechanisms generated by linear queries in the presence of manifold data dependency. Specifically, the input data are assumed to lie in an affine manifold as prior knowledge known to adversaries. First of all, we show that such manifold data dependency may have a significant impact on the privacy levels compared to the case with the manifold constraint being absent. We establish necessary and sufficient conditions on the possibility of achieving differential privacy via structured noise injection mechanisms where non-i.i.d. Gaussian or Laplace noises are calibrated into the dataset. 
Next, in light of these conditions, procedures are developed by which a prescribed privacy budget can be tightly reached with a matching noise level. Finally, we show that the framework has immediate applications in differentially private cloud-based control, where the manifold data dependency arises naturally from the system dynamics, and the proposed theories and procedures become effective tools in evaluating privacy levels and in the design of provably useful algorithms.", "keywords": "Differential privacy;data correlation", "primary_area": "", "supplementary_material": "/attachment/b3d638bf56c907fde089ee99071160d5bf046a25.zip", "author": "Lei Wang;Deming Yuan;Guodong Shi", "authorids": "~Lei_Wang26;~Deming_Yuan1;~Guodong_Shi1", "gender": "M;M;M", "homepage": ";;", "dblp": ";87/8689;https://dblp.org/pers/hd/s/Shi:Guodong.html", "google_scholar": "https://scholar.google.fi/citations?user=b2LyJzsAAAAJ;https://scholar.google.com.hk/citations?user=J-Ghvi8AAAAJ;https://scholar.google.com.tw/citations?user=gD553TwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lei_Wang26;~Deming_Yuan1;~Guodong_Shi1", "aff": "Zhejiang University;;The University of Sydney", "aff_domain": "zju.edu.cn;;sydney.edu.au", "position": "Researcher;;Assistant Professor", "bibtex": "@misc{\nwang2022differential,\ntitle={Differential Privacy with Manifold Data Dependency},\nauthor={Lei Wang and Deming Yuan and Guodong Shi},\nyear={2022},\nurl={https://openreview.net/forum?id=zokEN0xOb0Q}\n}", "github": "", "project": "", "reviewers": "R4Hw;DL38;zR2R", "site": "https://openreview.net/forum?id=zokEN0xOb0Q", "pdf_size": 0, "recommendation": "3;6;6", "confidence": "5;2;4", "correctness": "2;4;4", "technical_novelty": "2;3;3", "empirical_novelty": "2;3;3", "wc_summary_paper": "40;27;38", "wc_summary_review": "7;63;27", "wc_main_review": "453;263;47", "wc_review": "500;353;112", "wc_reply_reviewers": "912;35;0", "wc_reply_authors": "1585;499;134", "reply_reviewers": "4;1;0", "reply_authors": "5;1;1", "recommendation_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "correctness_avg": [ 3.3333333333333335, 0.9428090415820634 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 35.0, 5.715476066494082 ], "wc_summary_review_avg": [ 32.333333333333336, 23.170862929310356 ], "wc_main_review_avg": [ 254.33333333333334, 165.86205781378158 ], "wc_review_avg": [ 321.6666666666667, 159.94235072536463 ], "wc_reply_reviewers_avg": [ 315.6666666666667, 421.91336656817225 ], "wc_reply_authors_avg": [ 739.3333333333334, 616.2631111970131 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.699673171197595 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.7559289460184545, "corr_recommendation_correctness": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6HrVSLcle2wJ:scholar.google.com/&scioq=Differential+Privacy+with+Manifold+Data+Dependency&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "Zhejiang University;University of Sydney", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "ZJU;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Australia" }, { "id": 
"zou-Ry64vqx", "title": "FedMorph: Communication Efficient Federated Learning via Morphing Neural Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "The two fundamental bottlenecks in Federated Learning (FL) are communication and computation on heterogeneous edge networks, restricting both model capacity and user participation. \nTo address these issues, we present FedMorph, an approach to automatically morph the global neural network to a sub-network to reduce both the communication and local computation overloads. \nFedMorph distills a fresh sub-network from the original one at the beginning of each communication round while keeps its `knowledge' as similar as the aggregated model from local clients in a federated average (FedAvg) like way.\nThe network morphing process considers the constraints, e.g., model size or computation flops, as an extra regularizer to the objective function.\nTo make the objective function solvable, we relax the model with the concept of soft-mask.\nWe empirically show that FedMorph, without any other tricks, reduces communication and computation overloads and increases the generalization accuracy.\nE.g., it provides an $85\\times$ reduction in server-to-client communication and $18\\times$ reduction in local device computation on the MNIST dataset with ResNet8 as the training network.\nWith benchmark compression approaches, e.g., TopK sparsification, FedMorph collectively provides an $847\\times$ reduction in upload communication.", "keywords": "Federated Learning;Communication efficient", "primary_area": "", "supplementary_material": "", "author": "Guoqing Ma;Chuanting Zhang;Basem Shihada", "authorids": "~Guoqing_Ma1;~Chuanting_Zhang1;~Basem_Shihada1", "gender": "M;;M", "homepage": ";https://chuanting.github.io;https://www.kaust.edu.sa/en/study/faculty/basem-shihada", "dblp": "19/8378-2;193/6367.html;", "google_scholar": "nvBmyYsAAAAJ;FCF65-UAAAAJ;https://scholar.google.com.tw/citations?user=337BjqEAAAAJ", "orcid": ";0000-0002-6685-4071;", "linkedin": ";chuanting/;", "or_profile": "~Guoqing_Ma1;~Chuanting_Zhang1;~Basem_Shihada1", "aff": ";KAUST;King Abdullah University of Science and Technology", "aff_domain": ";kaust.edu.sa;kaust.edu.sa", "position": ";Postdoc;Associate Professor", "bibtex": "@misc{\nma2022fedmorph,\ntitle={FedMorph: Communication Efficient Federated Learning via Morphing Neural Network},\nauthor={Guoqing Ma and Chuanting Zhang and Basem Shihada},\nyear={2022},\nurl={https://openreview.net/forum?id=zou-Ry64vqx}\n}", "github": "", "project": "", "reviewers": "2oB9;nDBv;7R7p", "site": "https://openreview.net/forum?id=zou-Ry64vqx", "pdf_size": 0, "recommendation": "3;3;3", "confidence": "4;3;3", "correctness": "3;2;3", "technical_novelty": "2;2;2", "empirical_novelty": "2;0;2", "wc_summary_paper": "111;42;32", "wc_summary_review": "255;17;35", "wc_main_review": "521;222;112", "wc_review": "887;281;179", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "180;129;141", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "recommendation_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "correctness_avg": [ 2.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.0, 0.0 ], "empirical_novelty_avg": [ 1.3333333333333333, 0.9428090415820634 ], "wc_summary_paper_avg": [ 61.666666666666664, 35.122009560324926 ], "wc_summary_review_avg": [ 102.33333333333333, 108.20145819514428 ], "wc_main_review_avg": [ 285.0, 172.813965485046 ], "wc_review_avg": [ 449.0, 312.499599999744 ], 
"wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 150.0, 21.77154105707724 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TKhjqt_Ew-MJ:scholar.google.com/&scioq=FedMorph:+Communication+Efficient+Federated+Learning+via+Morphing+Neural+Network&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "King Abdullah University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaust.edu.sa", "aff_unique_abbr": "KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Saudi Arabia" }, { "title": "Supervision Exists Everywhere: A Data Efficient Contrastive Language-Image Pre-training Paradigm", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6406", "id": "zq1iJkNk3uN", "poster": "", "openreview": "https://openreview.net/forum?id=zq1iJkNk3uN", "slides": "https://iclr.cc/virtual/2022/poster/6406", "video": "https://iclr.cc/virtual/2022/poster/6406", "author_site": "Yangguang Li, Feng Liang, Lichen Zhao, Yufeng Cui, Wanli Ouyang, Jing Shao, fengwei yu, Junjie Yan", "tldr": "", "abstract": "Recently, large-scale Contrastive Language-Image Pre-training (CLIP) has attracted unprecedented attention for its impressive zero-shot recognition ability and excellent transferability to downstream tasks. However, CLIP is quite data-hungry and requires 400M image-text pairs for pre-training, thereby restricting its adoption. This work proposes a novel training paradigm, Data efficient CLIP (DeCLIP), to alleviate this limitation. We demonstrate that by carefully utilizing the widespread supervision among the image-text pairs, our De-CLIP can learn generic visual features more efficiently. Instead of using the single image-text contrastive supervision, we fully exploit data potential through the use of (1) self-supervision within each modality; (2) multi-view supervision across modalities; (3) nearest-neighbor supervision from other similar pairs. Benefiting from intrinsic supervision, our DeCLIP-ResNet50 can achieve 60.4% zero-shot top1 accuracy on ImageNet, which is 0.8% above the CLIP-ResNet50 while using 7.1\u00d7fewer data. Our DeCLIP-ResNet50 outperforms its counterpart in 8 out of 11 visual datasets when transferred to downstream tasks. 
Moreover, scaling up the model and computation also works well in our framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yangguang Li;Feng Liang;Lichen Zhao;Yufeng Cui;Wanli Ouyang;Jing Shao;Fengwei Yu;Junjie Yan", "authorids": "~Yangguang_Li1;~Feng_Liang3;~Lichen_Zhao1;~Yufeng_Cui2;~Wanli_Ouyang1;~Jing_Shao3;~Fengwei_Yu1;~Junjie_Yan4", "gender": "M;M;M;M;;F;M;M", "homepage": "https://yg256li.github.io/;https://jeff-liangf.github.io/;https://zlccccc.github.io/;https://slothercui.github.io/;;https://amandajshao.github.io/;https://forwil.xyz;https://yan-junjie.github.io/", "dblp": "132/4829-1.html;;78/9722;;;;188/5764;115/9656", "google_scholar": "a7AMvgkAAAAJ;ecTFCUMAAAAJ;;;;VU5ObUwAAAAJ;qzWfLRIAAAAJ;rEYarG0AAAAJ", "orcid": ";;;;;;;", "linkedin": ";feng-liang-854a30150/;;;;;;", "or_profile": "~Yangguang_Li1;~Feng_Liang3;~Lichen_Zhao1;~Yufeng_Cui2;~Wanli_Ouyang1;~Jing_Shao3;~Fengwei_Yu1;~Junjie_Yan4", "aff": "SenseTime;University of Texas, Austin;SenseTime;Beihang University;;SenseTime Group Limited;;", "aff_domain": "sensetime.com;utexas.edu;sensetime.com;buaa.edu.cn;;sensetime.com;;", "position": "Researcher;PhD student;Researcher;MS student;;Researcher;;", "bibtex": "@inproceedings{\nli2022supervision,\ntitle={Supervision Exists Everywhere: A Data Efficient Contrastive Language-Image Pre-training Paradigm},\nauthor={Yangguang Li and Feng Liang and Lichen Zhao and Yufeng Cui and Wanli Ouyang and Jing Shao and Fengwei Yu and Junjie Yan},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zq1iJkNk3uN}\n}", "github": "", "project": "", "reviewers": "nUjZ;x3bM;a3Wz;kziT", "pdf_size": 0, "recommendation": "6;6;6;8", "confidence": "5;3;4;3", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;2;3", "wc_summary_paper": "126;104;47;55", "wc_summary_review": "329;23;102;36", "wc_main_review": "1039;377;263;160", "wc_review": "1494;504;412;251", "wc_reply_reviewers": "118;0;0;0", "wc_reply_authors": "1354;505;1096;311", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;2;1", "recommendation_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.75, 0.4330127018922193 ], "wc_summary_paper_avg": [ 83.0, 33.05298776207682 ], "wc_summary_review_avg": [ 122.5, 122.92782435234099 ], "wc_main_review_avg": [ 459.75, 343.12488615662954 ], "wc_review_avg": [ 665.25, 486.9719576115241 ], "wc_reply_reviewers_avg": [ 29.5, 51.09549882328188 ], "wc_reply_authors_avg": [ 816.5, 424.1429593898736 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_recommendation_confidence": -0.5222329678670935, "corr_recommendation_correctness": 0.3333333333333333, "gs_citation": 535, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5003089118769672378&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=zq1iJkNk3uN", "email": "sensetime.com;utexas.edu;sensetime.com;buaa.edu.cn;;sensetime.com;;", "author_num": 8, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "SenseTime;University of Texas at Austin;Beihang University;SenseTime Group Limited", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sensetime.com;https://www.utexas.edu;http://www.buaa.edu.cn/;https://www.sensetime.com", 
"aff_unique_abbr": "SenseTime;UT Austin;BUAA;SenseTime", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "On the benefits of maximum likelihood estimation for Regression and Forecasting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/7096", "id": "zrW-LVXj2k1", "poster": "", "openreview": "https://openreview.net/forum?id=zrW-LVXj2k1", "slides": "https://iclr.cc/virtual/2022/poster/7096", "video": "https://iclr.cc/virtual/2022/poster/7096", "author_site": "Pranjal Awasthi, Abhimanyu Das, Rajat Sen, Ananda Suresh", "tldr": "", "abstract": "We advocate for a practical Maximum Likelihood Estimation (MLE) approach towards designing loss functions for regression and forecasting, as an alternative to the typical approach of direct empirical risk minimization on a specific target metric. The MLE approach is better suited to capture inductive biases such as prior domain knowledge in datasets, and can output post-hoc estimators at inference time that can optimize different types of target metrics. We present theoretical results to demonstrate that our approach is competitive with any estimator for the target metric under some general conditions. In two example practical settings, Poisson and Pareto regression, we show that our competitive results can be used to prove that the MLE approach has better excess risk bounds than directly minimizing the target metric. We also demonstrate empirically that our method instantiated with a well-designed general purpose mixture likelihood family can obtain superior performance for a variety of tasks across time-series forecasting and regression datasets with different data distributions.", "keywords": "Forecasting;Time-Series;Regression;MLE", "primary_area": "", "supplementary_material": "/attachment/c8e44acbe52e791f118f85a1f7b8db40d92c1ced.zip", "author": "Pranjal Awasthi;Abhimanyu Das;Rajat Sen;Ananda Theertha Suresh", "authorids": "~Pranjal_Awasthi3;~Abhimanyu_Das2;~Rajat_Sen1;~Ananda_Theertha_Suresh1", "gender": ";M;M;M", "homepage": "https://www.cs.rutgers.edu/~pa336/;https://sites.google.com/site/abhidas/;http://rajatsen91.github.io;https://theertha.info", "dblp": "57/679;83/6359;http://dblp.uni-trier.de/pers/hd/s/Sen:Rajat;119/3884", "google_scholar": ";;YzsCLBoAAAAJ;K6ef57QAAAAJ", "orcid": ";;;", "linkedin": ";;rajat-sen-a8702417/;", "or_profile": "~Pranjal_Awasthi3;~Abhimanyu_Das2;~Rajat_Sen1;~Ananda_Theertha_Suresh1", "aff": "Rutgers University;Research, Google;Google;Google", "aff_domain": "rutgers.edu;research.google.com;google.com;google.com", "position": "Assistant Professor;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nawasthi2022on,\ntitle={On the benefits of maximum likelihood estimation for Regression and Forecasting},\nauthor={Pranjal Awasthi and Abhimanyu Das and Rajat Sen and Ananda Theertha Suresh},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zrW-LVXj2k1}\n}", "github": "", "project": "", "reviewers": "1pkV;Fcze;BdKr", "pdf_size": 0, "recommendation": "5;5;8", "confidence": "2;3;3", "correctness": "3;3;3", "technical_novelty": "2;3;3", "empirical_novelty": "3;4;3", "wc_summary_paper": "33;114;94", "wc_summary_review": "17;48;28", "wc_main_review": "376;594;315", "wc_review": "426;756;437", "wc_reply_reviewers": "17;155;0", "wc_reply_authors": "544;1021;537", "reply_reviewers": "1;1;0", 
"reply_authors": "2;2;1", "recommendation_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.6666666666666665, 0.4714045207910317 ], "empirical_novelty_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_summary_paper_avg": [ 80.33333333333333, 34.451253807211266 ], "wc_summary_review_avg": [ 31.0, 12.832251036613439 ], "wc_main_review_avg": [ 428.3333333333333, 119.76180062477722 ], "wc_review_avg": [ 539.6666666666666, 153.03666953453418 ], "wc_reply_reviewers_avg": [ 57.333333333333336, 69.4086129781856 ], "wc_reply_authors_avg": [ 700.6666666666666, 226.5278986987892 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.5, "corr_recommendation_correctness": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6196591755940399139&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=zrW-LVXj2k1", "email": "rutgers.edu;research.google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Rutgers University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.rutgers.edu;https://research.google", "aff_unique_abbr": "Rutgers;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "zrdUVVAvcP2", "title": "GrASP: Gradient-Based Affordance Selection for Planning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Planning with a learned model is arguably a key component of intelligence. There are several challenges in realizing such a component in large-scale reinforcement learning (RL) problems. One such challenge is dealing effectively with continuous action spaces when using tree-search planning (e.g., it is not feasible to consider every action even at just the root node of the tree). In this paper we present a method for \\emph{selecting} affordances useful for planning---for learning which small number of actions/options from a continuous space of actions/options to consider in the tree-expansion process during planning. We consider affordances that are goal-and-state-conditional mappings to actions/options as well as unconditional affordances that simply select actions/options available in all states. Our selection method is gradient based: we compute gradients through the planning procedure to update the parameters of the function that represents affordances. Our empirical work shows that it is feasible to learn to select both primitive-action and option affordances, and that simultaneously learning to select affordances and planning with a learned value-equivalent model can outperform model-free RL. 
", "keywords": "reinforcement learning;affordances", "primary_area": "", "supplementary_material": "", "author": "Vivek Veeriah;Zeyu Zheng;Richard Lewis;Satinder Singh", "authorids": "~Vivek_Veeriah2;~Zeyu_Zheng1;~Richard_Lewis1;~Satinder_Singh2", "gender": "M;M;M;", "homepage": ";http://www-personal.umich.edu/~zeyu/;;", "dblp": "162/0205;48/7883;12/590;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": "vivkjv/;;;", "or_profile": "~Vivek_Veeriah2;~Zeyu_Zheng1;~Richard_Lewis1;~Satinder_Baveja2", "aff": "University of Michigan;University of Michigan;University of Michigan - Ann Arbor;Google DeepMind", "aff_domain": "umich.edu;umich.edu;umich.edu;google.com", "position": "PhD student;PhD student;Full Professor;Research Scientist", "bibtex": "@misc{\nveeriah2022grasp,\ntitle={Gr{ASP}: Gradient-Based Affordance Selection for Planning},\nauthor={Vivek Veeriah and Zeyu Zheng and Richard Lewis and Satinder Singh},\nyear={2022},\nurl={https://openreview.net/forum?id=zrdUVVAvcP2}\n}", "github": "", "project": "", "reviewers": "pYew;ohYy;LdeK;VDJ5", "site": "https://openreview.net/forum?id=zrdUVVAvcP2", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;2;3;3", "correctness": "3;3;3;3", "technical_novelty": "2;3;3;3", "empirical_novelty": "2;2;3;3", "wc_summary_paper": "76;126;76;112", "wc_summary_review": "42;57;57;23", "wc_main_review": "359;390;288;251", "wc_review": "477;573;421;386", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "correctness_avg": [ 3.0, 0.0 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.5, 0.5 ], "wc_summary_paper_avg": [ 97.5, 22.06241147291021 ], "wc_summary_review_avg": [ 44.75, 13.970952007647869 ], "wc_main_review_avg": [ 322.0, 55.204166509422095 ], "wc_review_avg": [ 464.25, 70.68017756061455 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16743329590449735231&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Michigan;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.umich.edu;https://deepmind.com", "aff_unique_abbr": "UM;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "zuDmDfeoB_1", "title": "How Does the Task Landscape Affect MAML Performance?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model-Agnostic Meta-Learning (MAML) has become increasingly popular for training models that can quickly adapt to new tasks via one or few stochastic gradient descent steps. However, the MAML objective is significantly more difficult to optimize compared to standard non-adaptive learning (NAL), and little is understood about how much MAML improves over NAL in terms of the fast adaptability of their solutions in various scenarios. We analytically address this issue in a linear regression setting consisting of a mixture of easy and hard tasks, where hardness is related to the rate that gradient descent converges on the task. 
Specifically, we prove that in order for MAML to achieve substantial gain over NAL, (i) there must be some discrepancy in hardness among the tasks, and (ii) the optimal solutions of the hard tasks must be closely packed with the center far from the center of the easy tasks' optimal solutions. We also give numerical and analytical results suggesting that these insights apply to two-layer neural networks. Finally, we provide few-shot image classification experiments that support our insights for when MAML should be used and emphasize the importance of training MAML on hard tasks in practice.", "keywords": "Meta-learning;MAML;multi-task linear regression;two-layer neural networks", "primary_area": "", "supplementary_material": "/attachment/6d4476a1a127ceb2b06307836e4b86fdc2ad55f1.zip", "author": "Liam Collins;Aryan Mokhtari;Sanjay Shakkottai", "authorids": "~Liam_Collins1;~Aryan_Mokhtari3;~Sanjay_Shakkottai1", "gender": ";M;M", "homepage": "https://liamc2196.github.io/;https://sites.utexas.edu/mokhtari/;https://sites.google.com/view/sanjay-shakkottai/", "dblp": "170/1157;140/7407;61/4596", "google_scholar": "MRLe02cAAAAJ;glcep6EAAAAJ;", "orcid": "0009-0006-3139-3339;;", "linkedin": ";;", "or_profile": "~Liam_Collins1;~Aryan_Mokhtari3;~Sanjay_Shakkottai1", "aff": "University of Texas, Austin;University of Texas, Austin;University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@misc{\ncollins2022how,\ntitle={How Does the Task Landscape Affect {MAML} Performance?},\nauthor={Liam Collins and Aryan Mokhtari and Sanjay Shakkottai},\nyear={2022},\nurl={https://openreview.net/forum?id=zuDmDfeoB_1}\n}", "github": "", "project": "", "reviewers": "EaUw;Y5kt;oNs8;g4W9", "site": "https://openreview.net/forum?id=zuDmDfeoB_1", "pdf_size": 0, "recommendation": "5;5;5;6", "confidence": "3;3;4;3", "correctness": "3;4;3;4", "technical_novelty": "2;3;2;3", "empirical_novelty": "2;3;2;2", "wc_summary_paper": "64;84;130;74", "wc_summary_review": "139;23;21;42", "wc_main_review": "899;132;578;261", "wc_review": "1102;239;729;377", "wc_reply_reviewers": "117;0;307;26", "wc_reply_authors": "849;81;1025;270", "reply_reviewers": "1;0;1;1", "reply_authors": "2;1;2;1", "recommendation_avg": [ 5.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 88.0, 25.25866188063018 ], "wc_summary_review_avg": [ 56.25, 48.47357527560764 ], "wc_main_review_avg": [ 467.5, 297.32347704142035 ], "wc_review_avg": [ 611.75, 334.7173247682289 ], "wc_reply_reviewers_avg": [ 112.5, 120.4045264929853 ], "wc_reply_authors_avg": [ 556.25, 391.5452560049732 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": -0.3333333333333333, "corr_recommendation_correctness": 0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6400364935583608654&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, {
"title": "Self-Joint Supervised Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6953", "id": "zuqcmNVK4c2", "poster": "", "openreview": "https://openreview.net/forum?id=zuqcmNVK4c2", "slides": "https://iclr.cc/virtual/2022/poster/6953", "video": "https://iclr.cc/virtual/2022/poster/6953", "author_site": "Navid Kardan, Mubarak Shah, Mitchell Hill", "tldr": "", "abstract": "Supervised learning is a fundamental framework used to train machine learning systems. A supervised learning problem is often formulated using an i.i.d. assumption that restricts model attention to a single relevant signal at a time when predicting. This contrasts with the human ability to actively use related samples as reference when making decisions. We hypothesize that the restriction to a single signal for each prediction in the standard i.i.d. framework contributes to well-known drawbacks of supervised learning: making overconfident predictions and vulnerability to overfitting, adversarial attacks, and out-of-distribution data. To address these limitations, we propose a new supervised learning paradigm called self-joint learning that generalizes the standard approach by modeling the joint conditional distribution of two observed samples, where each sample is an image and its label. Rather than assuming samples are independent, our models explicitly learn the sample-to-sample relation of conditional independence. Our framework can naturally incorporate auxiliary unlabeled data to further improve the performance. Experiments on benchmark image datasets show our method offers significant improvement over standard supervised learning in terms of accuracy, robustness against adversarial attacks, out-of-distribution detection, and overconfidence mitigation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Navid Kardan;Mubarak Shah;Mitch Hill", "authorids": "~Navid_Kardan1;~Mubarak_Shah3;~Mitch_Hill1", "gender": ";M;M", "homepage": ";https://www.crcv.ucf.edu/person/mubarak-shah/;", "dblp": ";s/MubarakShah;217/3317", "google_scholar": ";https://scholar.google.com.tw/citations?user=p8gsO3gAAAAJ;ycEHnWoAAAAJ", "orcid": ";0000-0002-8216-1128;", "linkedin": ";mubarak-shah-b6aa68213/;", "or_profile": "~Navid_Kardan1;~Mubarak_Shah3;~Mitch_Hill1", "aff": ";University of Central Florida;University of Central Florida", "aff_domain": ";ucf.edu;ucf.edu", "position": ";Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nkardan2022selfjoint,\ntitle={Self-Joint Supervised Learning},\nauthor={Navid Kardan and Mubarak Shah and Mitch Hill},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zuqcmNVK4c2}\n}", "github": "", "project": "", "reviewers": "Ky9n;sxB6;PZCK", "pdf_size": 0, "recommendation": "5;8;8", "confidence": "3;4;5", "correctness": "3;4;4", "technical_novelty": "2;2;4", "empirical_novelty": "3;4;4", "wc_summary_paper": "85;107;49", "wc_summary_review": "38;27;22", "wc_main_review": "327;388;162", "wc_review": "450;522;233", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "recommendation_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 3.6666666666666665, 0.4714045207910317 ], "technical_novelty_avg": [ 2.6666666666666665, 0.9428090415820634 ], "empirical_novelty_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_summary_paper_avg": [ 80.33333333333333, 
23.907228102721476 ], "wc_summary_review_avg": [ 29.0, 6.683312551921141 ], "wc_main_review_avg": [ 292.3333333333333, 95.46494411853786 ], "wc_review_avg": [ 401.6666666666667, 122.83412482784344 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.8660254037844387, "corr_recommendation_correctness": 1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8230417750736491672&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "pdf": "https://openreview.net/pdf?id=zuqcmNVK4c2", "email": ";ucf.edu;ucf.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Central Florida", "aff_unique_dep": "", "aff_unique_url": "https://www.ucf.edu", "aff_unique_abbr": "UCF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "zxEfpcmTDnF", "title": "Learning and controlling the source-filter representation of speech with a variational autoencoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding and controlling latent representations in deep generative models is a challenging yet important problem for analyzing, transforming and generating various types of data. In speech processing, inspiring from the anatomical mechanisms of phonation, the source-filter model considers that speech signals are produced from a few independent and physically meaningful continuous latent factors, among which the fundamental frequency and the formants are of primary importance. In this work, we show that the source-filter model of speech production naturally arises in the latent space of a variational autoencoder (VAE) trained in an unsupervised fashion on a dataset of natural speech signals. Using speech signals generated with an artificial speech synthesizer, we experimentally demonstrate that the fundamental frequency and formant frequencies are encoded in orthogonal subspaces of the VAE latent space and we develop a weakly-supervised method to accurately and independently control these speech factors of variation within the learned latent subspaces. 
Without requiring additional information such as text or human-labeled data, we propose a deep generative model of speech spectrograms that is conditioned on the fundamental frequency and formant frequencies, and which is applied to the transformation of speech signals.", "keywords": "Deep generative models;variational autoencoder (VAE);speech processing;source-filter model of speech production;representation learning", "primary_area": "", "supplementary_material": "", "author": "Samir Alain Sadok;Simon Leglaive;Laurent Girin;Xavier Alameda-Pineda;Renaud S\u00e9guier", "authorids": "~Samir_Alain_Sadok1;~Simon_Leglaive1;laurent.girin@grenoble-inp.fr;~Xavier_Alameda-Pineda1;renaud.seguier@centralesupelec.fr", "gender": "M;;;M;", "homepage": "https://samsad35.github.io/;;;http://xavirema.eu;", "dblp": ";;;22/10486;", "google_scholar": "1ruIV3sAAAAJ;;;https://scholar.google.fr/citations?user=ukI2bz8AAAAJ;", "orcid": ";;;0000-0002-5354-1084;", "linkedin": "samir-s-9aab79195/;;;xavier-alameda-pineda-4a47271a/;", "or_profile": "~Samir_Alain_Sadok1;~Simon_Leglaive1;laurent.girin@grenoble-inp.fr;~Xavier_Alameda-Pineda1;renaud.seguier@centralesupelec.fr", "aff": ";;;INRIA;", "aff_domain": ";;;inria.fr;", "position": ";;;Researcher;", "bibtex": "@misc{\nsadok2022learning,\ntitle={Learning and controlling the source-filter representation of speech with a variational autoencoder},\nauthor={Samir Alain Sadok and Simon Leglaive and Laurent Girin and Xavier Alameda-Pineda and Renaud S{\\'e}guier},\nyear={2022},\nurl={https://openreview.net/forum?id=zxEfpcmTDnF}\n}", "github": "", "project": "", "reviewers": "e7Cb;kWcG;vTAw;pLaG", "site": "https://openreview.net/forum?id=zxEfpcmTDnF", "pdf_size": 0, "recommendation": "5;5;6;6", "confidence": "4;4;4;4", "correctness": "4;4;4;3", "technical_novelty": "2;2;3;3", "empirical_novelty": "2;2;2;3", "wc_summary_paper": "74;169;130;170", "wc_summary_review": "42;51;180;37", "wc_main_review": "274;640;109;222", "wc_review": "390;860;419;429", "wc_reply_reviewers": "292;638;74;169", "wc_reply_authors": "710;2527;1097;862", "reply_reviewers": "1;1;1;1", "reply_authors": "1;5;2;2", "recommendation_avg": [ 5.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.5, 0.5 ], "empirical_novelty_avg": [ 2.25, 0.4330127018922193 ], "wc_summary_paper_avg": [ 135.75, 39.130391002390965 ], "wc_summary_review_avg": [ 77.5, 59.39065583069444 ], "wc_main_review_avg": [ 311.25, 198.95649650111957 ], "wc_review_avg": [ 524.5, 194.22988956388767 ], "wc_reply_reviewers_avg": [ 293.25, 213.51975903882993 ], "wc_reply_authors_avg": [ 1299.0, 722.2669174204229 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.5, 1.5 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": -0.5773502691896257, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2561528419215373381&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 27, "aff_unique_index": "0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "id": "zxm7rzEPaj", "title": "Treatment effect estimation with confounder balanced instrumental variable regression", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper considers the challenge of estimating treatment effects from observational data in the presence 
of unmeasured confounders. A popular way to address this challenge is to utilize an instrumental variable (IV) for two-stage regression, i.e., 2SLS and variants, but they need to assume the additive separability of noise and are limited to the linear setting. Recently, many nonlinear IV regression variants were proposed by regressing the treatment with IVs and confounders in the first stage, leading to confounding bias between the predicted treatment and outcome in the second stage. In this paper, we propose a Confounder Balanced IV Regression (CB-IV) algorithm to jointly remove the bias from the unmeasured confounders with IV regression and reduce the bias from the observed confounders by balancing for treatment effect estimation. Specifically, CB-IV algorithm consists of three main modules: (1) treatment regression: regressing the treatment with IVs and confounders like previous nonlinear IV methods for removing the confounding from unmeasured confounders; (2) confounder balancing: learning a balanced representation of confounders to eliminate the bias induced by the observed confounders (3) outcome regression: regressing the outcome with the predicted treatment and the balanced confounders representation for treatment effect estimation. To the best of our knowledge, this is the first work to combine confounder balancing in IV regression for treatment effect estimation. Moreover, we theoretically prove that CB-IV algorithm is also effective even without the additive separability assumption on noise. Extensive experiments demonstrate that the CB-IV algorithm outperforms the state-of-the-art methods, including IV regression and confounder balancing methods, for treatment effect estimation.", "keywords": "treatment effect;unmeasured confounders;instrumental variable;confounder balance;representaion learning", "primary_area": "", "supplementary_material": "/attachment/53f3429d337ff8940abb614fea1960af7fbc1a7d.zip", "author": "Anpeng Wu;Kun Kuang;Fei Wu", "authorids": "~Anpeng_Wu1;~Kun_Kuang1;~Fei_Wu2", "gender": "M;M;", "homepage": "https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ&hl=zh-CN&oi=sra;http://kunkuang.github.io;https://person.zju.edu.cn/wufei", "dblp": "267/5637;194/4245;84/3254-1", "google_scholar": "https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ;XJLn4MYAAAAJ", "orcid": "0000-0003-3898-7122;0009-0000-7528-8131;", "linkedin": ";;", "or_profile": "~Anpeng_Wu1;~Kun_Kuang1;~Fei_Wu2", "aff": "Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@misc{\nwu2022treatment,\ntitle={Treatment effect estimation with confounder balanced instrumental variable regression},\nauthor={Anpeng Wu and Kun Kuang and Fei Wu},\nyear={2022},\nurl={https://openreview.net/forum?id=zxm7rzEPaj}\n}", "github": "", "project": "", "reviewers": "a5qb;uVdE;D2wS;DjHx", "site": "https://openreview.net/forum?id=zxm7rzEPaj", "pdf_size": 0, "recommendation": "6;6;6;6", "confidence": "3;4;3;3", "correctness": "4;3;4;4", "technical_novelty": "2;3;3;3", "empirical_novelty": "3;3;3;0", "wc_summary_paper": "44;111;77;63", "wc_summary_review": "54;77;49;42", "wc_main_review": "145;385;308;217", "wc_review": "243;573;434;322", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "616;967;621;631", "reply_reviewers": "0;0;0;0", "reply_authors": "1;3;1;1", "recommendation_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 
0.4330127018922193 ], "correctness_avg": [ 3.75, 0.4330127018922193 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 2.25, 1.299038105676658 ], "wc_summary_paper_avg": [ 73.75, 24.488517717493643 ], "wc_summary_review_avg": [ 55.5, 13.124404748406688 ], "wc_main_review_avg": [ 263.75, 90.75619813544417 ], "wc_review_avg": [ 393.0, 124.11889461318933 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 708.75, 149.19848357138218 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.0, "corr_recommendation_correctness": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:saWvF2sDl6cJ:scholar.google.com/&scioq=Treatment+effect+estimation+with+confounder+balanced+instrumental+variable+regression&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "zyrhwrd9EYs", "title": "To Impute or Not To Impute? Missing Data in Treatment Effect Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Missing data is a systemic problem in practical scenarios that causes noise and bias when estimating treatment effects. This makes treatment effect estimation from data with missingness a particularly tricky endeavour. A key reason for this is that standard assumptions on missingness are rendered insufficient due to the presence of an additional variable, treatment, besides the individual and the outcome. Having a treatment variable introduces additional complexity with respect to why some variables are missing that is overlooked by previous work. In our work we identify a new missingness mechanism, which we term mixed confounded missingness (MCM), where some missingness determines treatment selection and other missingness is determined by treatment selection. Given MCM, we show that naively imputing all data leads to poor performing treatment effects models, as the act of imputation effectively removes information necessary to provide unbiased estimates. However, no imputation at all also leads to biased estimates, as missingness determined by treatment divides the population in distinct subpopulations, where estimates across these populations will be biased. Our solution is selective imputation, where we use insights from MCM to inform precisely which variables should be imputed and which should not. 
We empirically demonstrate how various learners benefit from selective imputation compared to other solutions for missing data.", "keywords": "Causality", "primary_area": "", "supplementary_material": "/attachment/2286409ebc9d68455ea56d25f368914408a2828a.zip", "author": "Jeroen Berrevoets;Fergus Imrie;Trent Kyono;James Jordon;Mihaela van der Schaar", "authorids": "~Jeroen_Berrevoets1;~Fergus_Imrie1;~Trent_Kyono1;~James_Jordon1;~Mihaela_van_der_Schaar2", "gender": ";;M;;F", "homepage": "https://jeroenbe.github.io;;;;https://www.vanderschaar-lab.com", "dblp": "236/4591;281/4466;https://dblp.uni-trier.de/pers/hd/k/Kyono:Trent;215/4296;", "google_scholar": "https://scholar.google.be/citations?user=Bq1dFNQAAAAJ;4qCGgpsAAAAJ;vJxuKwgAAAAJ;;DZ3S--MAAAAJ", "orcid": ";0000-0002-6241-0123;;;", "linkedin": ";;;;", "or_profile": "~Jeroen_Berrevoets1;~Fergus_Imrie1;~Trent_Kyono1;~James_Jordon1;~Mihaela_van_der_Schaar2", "aff": "University of Cambridge;University of California, Los Angeles;;Alan Turing Institute;University of California, Los Angeles", "aff_domain": "cam.ac.uk;ucla.edu;;turing.ac.uk;ucla.edu", "position": "PhD student;Postdoc;;Postdoc;Full Professor", "bibtex": "@misc{\nberrevoets2022to,\ntitle={To Impute or Not To Impute? Missing Data in Treatment Effect Estimation},\nauthor={Jeroen Berrevoets and Fergus Imrie and Trent Kyono and James Jordon and Mihaela van der Schaar},\nyear={2022},\nurl={https://openreview.net/forum?id=zyrhwrd9EYs}\n}", "github": "", "project": "", "reviewers": "8bft;efJq;ZWmF", "site": "https://openreview.net/forum?id=zyrhwrd9EYs", "pdf_size": 0, "recommendation": "3;3;5", "confidence": "4;3;5", "correctness": "1;3;2", "technical_novelty": "1;2;3", "empirical_novelty": "1;2;2", "wc_summary_paper": "76;85;57", "wc_summary_review": "8;88;102", "wc_main_review": "277;214;316", "wc_review": "361;387;475", "wc_reply_reviewers": "642;0;0", "wc_reply_authors": "1661;1437;416", "reply_reviewers": "2;0;0", "reply_authors": "3;3;1", "recommendation_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "correctness_avg": [ 2.0, 0.816496580927726 ], "technical_novelty_avg": [ 2.0, 0.816496580927726 ], "empirical_novelty_avg": [ 1.6666666666666667, 0.4714045207910317 ], "wc_summary_paper_avg": [ 72.66666666666667, 11.671427600007732 ], "wc_summary_review_avg": [ 66.0, 41.40853374205209 ], "wc_main_review_avg": [ 269.0, 42.02380277890139 ], "wc_review_avg": [ 407.6666666666667, 48.780688346471244 ], "wc_reply_reviewers_avg": [ 214.0, 302.64170234784234 ], "wc_reply_authors_avg": [ 1171.3333333333333, 541.8734989726744 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": 0.8660254037844387, "corr_recommendation_correctness": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12312409550453998112&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles;Alan Turing Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu;https://www.turing.ac.uk", "aff_unique_abbr": "Cambridge;UCLA;ATI", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Cambridge;Los Angeles;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Revisiting Design 
Choices in Offline Model Based Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6699", "id": "zz9hXVhf40", "poster": "", "openreview": "https://openreview.net/forum?id=zz9hXVhf40", "slides": "https://iclr.cc/virtual/2022/poster/6699", "video": "https://iclr.cc/virtual/2022/poster/6699", "author_site": "Cong Lu, Philip Ball, Jack Parker-Holder, Michael Osborne, S Roberts", "tldr": "", "abstract": "Offline reinforcement learning enables agents to leverage large pre-collected datasets of environment transitions to learn control policies, circumventing the need for potentially expensive or unsafe online data collection. Significant progress has been made recently in offline model-based reinforcement learning, approaches which leverage a learned dynamics model. This typically involves constructing a probabilistic model, and using the model uncertainty to penalize rewards where there is insufficient data, solving for a pessimistic MDP that lower bounds the true MDP. Existing methods, however, exhibit a breakdown between theory and practice, whereby pessimistic return ought to be bounded by the total variation distance of the model from the true dynamics, but is instead implemented through a penalty based on estimated model uncertainty. This has spawned a variety of uncertainty heuristics, with little to no comparison between differing approaches. In this paper, we compare these heuristics, and design novel protocols to investigate their interaction with other hyperparameters, such as the number of models, or imaginary rollout horizon. Using these insights, we show that selecting these key hyperparameters using Bayesian Optimization produces superior configurations that are vastly different to those currently used in existing hand-tuned state-of-the-art methods, and result in drastically stronger performance.", "keywords": "Model-Based Reinforcement Learning;Offline Reinforcement Learning;Uncertainty Quantification", "primary_area": "", "supplementary_material": "/attachment/22c20ba5d91e95042595781655a6800dd25b1632.zip", "author": "Cong Lu;Philip Ball;Jack Parker-Holder;Michael Osborne;Stephen J. Roberts", "authorids": "~Cong_Lu1;~Philip_Ball1;~Jack_Parker-Holder1;~Michael_Osborne1;~Stephen_J._Roberts1", "gender": "M;M;M;M;M", "homepage": "https://conglu.co.uk;https://philipjball.github.io/;https://jparkerholder.github.io/;https://www.robots.ox.ac.uk/~mosb/;http://www.robots.ox.ac.uk/~sjrob", "dblp": ";244/1972;237/9793.html;59/6403;64/1485", "google_scholar": "yMGBji4AAAAJ;5Cm8L90AAAAJ;;https://scholar.google.co.uk/citations?user=iTNcAakAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-5564-838X;;;0000-0003-1959-012X;0000-0002-9305-9268", "linkedin": "cong-lu-530b74104/;;;;", "or_profile": "~Cong_Lu1;~Philip_Ball1;~Jack_Parker-Holder1;~Michael_Osborne1;~Stephen_J._Roberts1", "aff": "University of Oxford;University of California, Berkeley;University of Oxford;University of Oxford;University of Oxford", "aff_domain": "ox.ac.uk;berkeley.edu;ox.ac.uk;oxford.ac.uk;ox.ac.uk", "position": "PhD student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlu2022revisiting,\ntitle={Revisiting Design Choices in Offline Model Based Reinforcement Learning},\nauthor={Cong Lu and Philip Ball and Jack Parker-Holder and Michael Osborne and Stephen J. 
Roberts},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zz9hXVhf40}\n}", "github": "", "project": "", "reviewers": "eM3K;TWSZ;DXYY;PM7Z;uPzV", "pdf_size": 0, "recommendation": "6;6;6;8;8", "confidence": "4;4;4;4;3", "correctness": "4;2;3;4;4", "technical_novelty": "2;2;3;3;2", "empirical_novelty": "3;3;3;3;3", "wc_summary_paper": "77;151;35;74;98", "wc_summary_review": "13;111;59;44;44", "wc_main_review": "174;514;485;695;272", "wc_review": "264;776;579;813;414", "wc_reply_reviewers": "34;400;27;451;51", "wc_reply_authors": "215;827;1126;1602;370", "reply_reviewers": "1;1;1;3;1", "reply_authors": "2;2;2;4;2", "recommendation_avg": [ 6.8, 0.9797958971132712 ], "confidence_avg": [ 3.8, 0.39999999999999997 ], "correctness_avg": [ 3.4, 0.8 ], "technical_novelty_avg": [ 2.4, 0.4898979485566356 ], "empirical_novelty_avg": [ 3.0, 0.0 ], "wc_summary_paper_avg": [ 87.0, 37.92097045171708 ], "wc_summary_review_avg": [ 54.2, 32.1085658353032 ], "wc_main_review_avg": [ 428.0, 184.81666591517117 ], "wc_review_avg": [ 569.2, 209.53987687311454 ], "wc_reply_reviewers_avg": [ 192.6, 191.00429314546832 ], "wc_reply_authors_avg": [ 828.0, 504.66107438557214 ], "reply_reviewers_avg": [ 1.4, 0.8 ], "reply_authors_avg": [ 2.4, 0.8 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 5, 0 ], "corr_recommendation_confidence": -0.6123724356957946, "corr_recommendation_correctness": 0.6123724356957946, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2361431656124498570&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "pdf": "https://openreview.net/pdf?id=zz9hXVhf40", "email": "ox.ac.uk;berkeley.edu;ox.ac.uk;oxford.ac.uk;ox.ac.uk", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Oxford;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.berkeley.edu", "aff_unique_abbr": "Oxford;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "zz_qjE6N1OF", "title": "P4O: Efficient Deep Reinforcement Learning with Predictive Processing Proximal Policy Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Advances in reinforcement learning (RL) often rely on massive compute resources and remain notoriously sample inefficient. In contrast, the human brain is able to efficiently learn effective control strategies using limited resources. This raises the question whether insights from neuroscience can be used to improve current RL methods. Predictive processing is a popular theoretical framework which maintains that the human brain is actively seeking to minimize surprise. We show that recurrent neural networks which predict their own sensory states can be leveraged to minimise surprise, yielding substantial gains in cumulative reward. Specifically, we present the Predictive Processing Proximal Policy Optimization (P4O) agent; an actor-critic reinforcement learning agent that applies predictive processing to a recurrent variant of the PPO algorithm by integrating a world model in its hidden state. P4O significantly outperforms a baseline recurrent variant of the PPO algorithm on multiple Atari games using a single GPU. 
It also outperforms other state-of-the-art agents given the same wall-clock time and exceeds human gamer performance on Seaquest, which is a particularly challenging environment in the Atari domain. Altogether, our work underscores how insights from the field of neuroscience may support the development of more capable and efficient artificial agents.", "keywords": "reinforcement learning;predictive processing", "primary_area": "", "supplementary_material": "/attachment/b5f6a0d1eddd9218c977f95902e4ed96cc156edd.zip", "author": "Burcu K\u00fc\u00e7\u00fcko\u011flu;Walraaf Borkent;Bodo Rueckauer;Nasir Ahmad;Umut G\u00fc\u00e7l\u00fc;Marcel van Gerven", "authorids": "~Burcu_K\u00fc\u00e7\u00fcko\u011flu1;~Walraaf_Borkent1;~Bodo_Rueckauer1;~Nasir_Ahmad1;~Umut_G\u00fc\u00e7l\u00fc1;~Marcel_van_Gerven1", "gender": ";;M;;M;M", "homepage": ";https://linkedin.com/in/walraaf;https://www.ru.nl/english/people/ruckauer-b/;;https://neuralcoding.nl;http://www.artcogsys.com", "dblp": ";;;;02/8743;", "google_scholar": ";;psk4UJEAAAAJ;;LM9WFngAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-1628-707X;;0000-0003-4753-159X;0000-0002-2206-9098", "linkedin": "burcu-k%C3%BC%C3%A7%C3%BCko%C4%9Flu-94b104114;https://linkedin.com/in/walraaf;;;umutguclu;marcel-van-gerven-8698784/", "or_profile": "~Burcu_K\u00fc\u00e7\u00fcko\u011flu1;~Walraaf_Borkent1;~Bodo_Rueckauer1;~Nasir_Ahmad1;~Umut_G\u00fc\u00e7l\u00fc1;~Marcel_van_Gerven1", "aff": "Radboud University Nijmegen;;Radboud University Nijmegen;;Radboud University Nijmegen;Donders Institute for Brain, Cognition and Behaviour, Radboud University", "aff_domain": "ru.nl;;ru.nl;;ru.nl;ru.nl", "position": "PhD student;;Postdoc;;Assistant Professor;Full Professor", "bibtex": "@misc{\nk{\\\"u}{\\c{c}}{\\\"u}ko{\\u{g}}lu2022po,\ntitle={P4O: Efficient Deep Reinforcement Learning with Predictive Processing Proximal Policy Optimization},\nauthor={Burcu K{\\\"u}{\\c{c}}{\\\"u}ko{\\u{g}}lu and Walraaf Borkent and Bodo Rueckauer and Nasir Ahmad and Umut G{\\\"u}{\\c{c}}l{\\\"u} and Marcel van Gerven},\nyear={2022},\nurl={https://openreview.net/forum?id=zz_qjE6N1OF}\n}", "github": "", "project": "", "reviewers": "zL7m;pvUE;8gLf;bJWq;km3X", "site": "https://openreview.net/forum?id=zz_qjE6N1OF", "pdf_size": 0, "recommendation": "1;3;3;3;3", "confidence": "4;5;4;4;4", "correctness": "3;2;4;2;3", "technical_novelty": "1;2;3;2;2", "empirical_novelty": "1;2;3;1;2", "wc_summary_paper": "44;65;61;73;37", "wc_summary_review": "17;85;22;40;45", "wc_main_review": "107;530;307;351;228", "wc_review": "168;680;390;464;310", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "recommendation_avg": [ 2.6, 0.8000000000000002 ], "confidence_avg": [ 4.2, 0.39999999999999997 ], "correctness_avg": [ 2.8, 0.7483314773547882 ], "technical_novelty_avg": [ 2.0, 0.6324555320336759 ], "empirical_novelty_avg": [ 1.8, 0.7483314773547883 ], "wc_summary_paper_avg": [ 56.0, 13.416407864998739 ], "wc_summary_review_avg": [ 41.8, 24.028316628511455 ], "wc_main_review_avg": [ 304.6, 139.91225821921392 ], "wc_review_avg": [ 402.4, 169.99482345059806 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_recommendation_confidence": 0.24999999999999994, "corr_recommendation_correctness": -0.13363062095621214, "gs_citation": 3, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=10487334843117960391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Radboud University", "aff_unique_dep": "", "aff_unique_url": "https://www.ru.nl/", "aff_unique_abbr": "RU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nijmegen;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Netherlands" }, { "title": "A Theory of Tournament Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2022/poster/6952", "id": "zzk231Ms1Ih", "poster": "", "openreview": "https://openreview.net/forum?id=zzk231Ms1Ih", "slides": "https://iclr.cc/virtual/2022/poster/6952", "video": "https://iclr.cc/virtual/2022/poster/6952", "author_site": "Arun Rajkumar, Vishnu Veerathu, Abdul Mir", "tldr": "", "abstract": "Real-world tournaments are almost always intransitive. Recent works have noted that parametric models which assume $d$ dimensional node representations can effectively model intransitive tournaments. However, nothing is known about the structure of the class of tournaments that arise out of any fixed $d$ dimensional representations. In this work, we develop a novel theory for understanding parametric tournament representations. Our first contribution is to structurally characterize the class of tournaments that arise out of $d$ dimensional representations. We do this by showing that these tournament classes have forbidden configurations that must necessarily be a union of flip classes, a novel way to partition the set of all tournaments. We further characterize rank $2$ tournaments completely by showing that the associated forbidden flip class contains just $2$ tournaments. Specifically, we show that the rank $2$ tournaments are equivalent to locally transitive tournaments. This insight allows us to show that the minimum feedback arc set problem on this tournament class can be solved using the standard Quicksort procedure. We also exhibit specific forbidden configurations for rank $4$ tournaments. For a general rank $d$ tournament class, we show that the flip class associated with a coned-doubly regular tournament of size $\\mathcal{O}(\\sqrt{d})$ must be a forbidden configuration. To answer a dual question, using a celebrated result of Froster, we show a lower bound of $\\Theta(\\sqrt{n})$ on the minimum dimension needed to represent all tournaments on $n$ nodes. For any given tournament, we show a novel upper bound on the smallest representation dimension that depends on the least size of the number of unique nodes in any feedback arc set of the flip class associated with a tournament. We show how our results also shed light on the upper bound of sign-rank of matrices. 
", "keywords": "tournament;skew-symmetric;pairwise ranking", "primary_area": "", "supplementary_material": "", "author": "Arun Rajkumar;Vishnu Veerathu;Abdul Bakey Mir", "authorids": "~Arun_Rajkumar4;~Vishnu_Veerathu1;cs20d400@smail.iitm.ac.in", "gender": "M;M;", "homepage": ";https://sites.google.com/view/vishnuveerathu/home;", "dblp": "32/11350;;", "google_scholar": ";6SMwj7oAAAAJ;", "orcid": ";;", "linkedin": ";vishnu-veerathu-3a61b814b/?originalSubdomain=in;", "or_profile": "~Arun_Rajkumar4;~Vishnu_Veerathu1;cs20d400@smail.iitm.ac.in", "aff": "Indian Institute of Technology Madras;;", "aff_domain": "iitm.ac.in;;", "position": "Assistant Professor;;", "bibtex": "@inproceedings{\nrajkumar2022a,\ntitle={A Theory of Tournament Representations},\nauthor={Arun Rajkumar and Vishnu Veerathu and Abdul Bakey Mir},\nbooktitle={International Conference on Learning Representations},\nyear={2022},\nurl={https://openreview.net/forum?id=zzk231Ms1Ih}\n}", "github": "", "project": "", "reviewers": "YSHR;AhwX;UVYb;MMVk", "pdf_size": 0, "recommendation": "5;5;6;8", "confidence": "4;3;3;4", "correctness": "3;4;3;4", "technical_novelty": "3;2;3;3", "empirical_novelty": "2;0;0;1", "wc_summary_paper": "102;47;75;160", "wc_summary_review": "32;43;88;23", "wc_main_review": "251;377;281;160", "wc_review": "385;467;444;343", "wc_reply_reviewers": "76;0;0;0", "wc_reply_authors": "859;467;327;395", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "recommendation_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "correctness_avg": [ 3.5, 0.5 ], "technical_novelty_avg": [ 2.75, 0.4330127018922193 ], "empirical_novelty_avg": [ 0.75, 0.82915619758885 ], "wc_summary_paper_avg": [ 96.0, 41.75523919222593 ], "wc_summary_review_avg": [ 46.5, 24.984995497297973 ], "wc_main_review_avg": [ 267.25, 77.46087722198865 ], "wc_review_avg": [ 409.75, 48.782040752719645 ], "wc_reply_reviewers_avg": [ 19.0, 32.90896534380867 ], "wc_reply_authors_avg": [ 512.0, 206.36617939962935 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_recommendation_confidence": 0.40824829046386296, "corr_recommendation_correctness": 0.40824829046386296, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15894440574409757834&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "pdf": "https://openreview.net/pdf?id=zzk231Ms1Ih", "email": "iitm.ac.in;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Indian Institute of Technology Madras", "aff_unique_dep": "", "aff_unique_url": "https://www.iitm.ac.in", "aff_unique_abbr": "IIT Madras", "aff_campus_unique_index": "0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0", "aff_country_unique": "India" } ]